diff --git a/.gitignore b/.gitignore
index 6a183cd6b2403505a3bd2e2a0ce959a357443325..224bd2f3a9cf305cc4205f30d7742928de5f8b99 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,9 @@
 #OS X specific files.
 .DS_store
 
+# Nested build directory
+/build
+
 #==============================================================================#
 # Explicit files to ignore (only matches one).
 #==============================================================================#
@@ -62,8 +65,9 @@ tools/polly
 tools/avrlit
 # Sphinx build tree, if building in-source dir.
 docs/_build
-# VSCode config files.
+# VS2017 and VSCode config files.
 .vscode
+.vs
 
 #==============================================================================#
 # Files created in tree by the Go bindings.
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c2c9fe0a68b25ca4c42f69c57947ea81949bce9a..b51bc421fdbf60a5d6dbb2ed23e3ebe322aac420 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,17 +56,20 @@ endif()
 
 # This should only apply if you are both on an Apple host, and targeting Apple.
 if(CMAKE_HOST_APPLE AND APPLE)
-  if(NOT CMAKE_XCRUN)
-    find_program(CMAKE_XCRUN NAMES xcrun)
-  endif()
-  if(CMAKE_XCRUN)
-    execute_process(COMMAND ${CMAKE_XCRUN} -find libtool
-      OUTPUT_VARIABLE CMAKE_LIBTOOL
-      OUTPUT_STRIP_TRAILING_WHITESPACE)
-  endif()
+  # if CMAKE_LIBTOOL is not set, try and find it with xcrun or find_program
+  if(NOT CMAKE_LIBTOOL)
+    if(NOT CMAKE_XCRUN)
+      find_program(CMAKE_XCRUN NAMES xcrun)
+    endif()
+    if(CMAKE_XCRUN)
+      execute_process(COMMAND ${CMAKE_XCRUN} -find libtool
+        OUTPUT_VARIABLE CMAKE_LIBTOOL
+        OUTPUT_STRIP_TRAILING_WHITESPACE)
+    endif()
 
-  if(NOT CMAKE_LIBTOOL OR NOT EXISTS CMAKE_LIBTOOL)
-    find_program(CMAKE_LIBTOOL NAMES libtool)
+    if(NOT CMAKE_LIBTOOL OR NOT EXISTS "${CMAKE_LIBTOOL}")  # xcrun gave no usable path: fall back to a PATH search
+      find_program(CMAKE_LIBTOOL NAMES libtool)
+    endif()
   endif()
 
   get_property(languages GLOBAL PROPERTY ENABLED_LANGUAGES)
@@ -132,18 +135,6 @@ foreach(proj ${LLVM_ENABLE_PROJECTS})
   endif()
 endforeach()
 
-# The following only works with the Ninja generator in CMake >= 3.0.
-set(LLVM_PARALLEL_COMPILE_JOBS "" CACHE STRING
-  "Define the maximum number of concurrent compilation jobs.")
-if(LLVM_PARALLEL_COMPILE_JOBS)
-  if(NOT CMAKE_MAKE_PROGRAM MATCHES "ninja")
-    message(WARNING "Job pooling is only available with Ninja generators.")
-  else()
-    set_property(GLOBAL APPEND PROPERTY JOB_POOLS compile_job_pool=${LLVM_PARALLEL_COMPILE_JOBS})
-    set(CMAKE_JOB_POOL_COMPILE compile_job_pool)
-  endif()
-endif()
-
 # Build llvm with ccache if the package is present
 set(LLVM_CCACHE_BUILD OFF CACHE BOOL "Set to ON for a ccache enabled build")
 if(LLVM_CCACHE_BUILD)
@@ -183,16 +174,7 @@ if(LLVM_BUILD_GLOBAL_ISEL)
   add_definitions(-DLLVM_BUILD_GLOBAL_ISEL)
 endif()
 
-set(LLVM_PARALLEL_LINK_JOBS "" CACHE STRING
-  "Define the maximum number of concurrent link jobs.")
-if(LLVM_PARALLEL_LINK_JOBS)
-  if(NOT CMAKE_MAKE_PROGRAM MATCHES "ninja")
-    message(WARNING "Job pooling is only available with Ninja generators.")
-  else()
-    set_property(GLOBAL APPEND PROPERTY JOB_POOLS link_job_pool=${LLVM_PARALLEL_LINK_JOBS})
-    set(CMAKE_JOB_POOL_LINK link_job_pool)
-  endif()
-endif()
+option(LLVM_ENABLE_DAGISEL_COV "Debug: Prints tablegen patterns that were used for selecting" OFF)
 
 # Add path for custom modules
 set(CMAKE_MODULE_PATH
@@ -385,8 +367,6 @@ set(LLVM_TARGETS_TO_BUILD
    ${LLVM_EXPERIMENTAL_TARGETS_TO_BUILD})
 list(REMOVE_DUPLICATES LLVM_TARGETS_TO_BUILD)
 
-include(AddLLVMDefinitions)
-
 option(LLVM_ENABLE_PIC "Build Position-Independent Code" ON)
 option(LLVM_ENABLE_WARNINGS "Enable compiler warnings." ON)
 option(LLVM_ENABLE_MODULES "Compile with C++ modules enabled." OFF)
@@ -414,9 +394,6 @@ option(LLVM_ENABLE_EXPENSIVE_CHECKS "Enable expensive checks" OFF)
 set(LLVM_ABI_BREAKING_CHECKS "WITH_ASSERTS" CACHE STRING
   "Enable abi-breaking checks.  Can be WITH_ASSERTS, FORCE_ON or FORCE_OFF.")
 
-option(LLVM_DISABLE_ABI_BREAKING_CHECKS_ENFORCING
-  "Disable abi-breaking checks mismatch detection at link-tim." OFF)
-
 option(LLVM_FORCE_USE_OLD_HOST_TOOLCHAIN
        "Set to ON to force using an old, unsupported host toolchain." OFF)
 
@@ -506,6 +483,10 @@ option(LLVM_INCLUDE_UTILS "Generate build targets for the LLVM utils." ON)
 option(LLVM_BUILD_UTILS
   "Build LLVM utility binaries. If OFF, just generate build targets." ON)
 
+option(LLVM_INCLUDE_RUNTIMES "Generate build targets for the LLVM runtimes." ON)
+option(LLVM_BUILD_RUNTIMES
+  "Build the LLVM runtimes. If OFF, just generate build targets." ON)
+
 option(LLVM_BUILD_RUNTIME
   "Build the LLVM runtime libraries." ON)
 option(LLVM_BUILD_EXAMPLES
@@ -641,7 +622,7 @@ endif (LLVM_USE_OPROFILE)
 
 message(STATUS "Constructing LLVMBuild project information")
 execute_process(
-  COMMAND ${PYTHON_EXECUTABLE} ${LLVMBUILDTOOL}
+  COMMAND ${PYTHON_EXECUTABLE} -B ${LLVMBUILDTOOL}
             --native-target "${LLVM_NATIVE_ARCH}"
             --enable-targets "${LLVM_TARGETS_TO_BUILD}"
             --enable-optional-components "${LLVMOPTIONALCOMPONENTS}"
@@ -737,6 +718,30 @@ configure_file(
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/Support/DataTypes.h.cmake
   ${LLVM_INCLUDE_DIR}/llvm/Support/DataTypes.h)
 
+# Add target for generating source rpm package.
+set(LLVM_SRPM_USER_BINARY_SPECFILE ${CMAKE_CURRENT_SOURCE_DIR}/llvm.spec.in
+    CACHE FILEPATH ".spec file to use for srpm generation")
+set(LLVM_SRPM_BINARY_SPECFILE ${CMAKE_CURRENT_BINARY_DIR}/llvm.spec)
+set(LLVM_SRPM_DIR "${CMAKE_CURRENT_BINARY_DIR}/srpm")
+
+# SVN_REVISION and GIT_COMMIT get set by the call to add_version_info_from_vcs.
+# DUMMY_VAR contains a version string which we don't care about.
+add_version_info_from_vcs(DUMMY_VAR)
+if ( SVN_REVISION )
+  set(LLVM_RPM_SPEC_REVISION "r${SVN_REVISION}")
+elseif ( GIT_COMMIT )
+  set (LLVM_RPM_SPEC_REVISION "g${GIT_COMMIT}")
+endif()
+
+configure_file(
+  ${LLVM_SRPM_USER_BINARY_SPECFILE}
+  ${LLVM_SRPM_BINARY_SPECFILE} @ONLY)
+
+add_custom_target(srpm  # packs a source tarball into ${LLVM_SRPM_DIR}/SOURCES, then runs rpmbuild -bs on the configured spec
+  COMMAND cpack -G TGZ --config CPackSourceConfig.cmake -B ${LLVM_SRPM_DIR}/SOURCES
+  COMMAND rpmbuild -bs --define "_topdir ${LLVM_SRPM_DIR}" ${LLVM_SRPM_BINARY_SPECFILE} VERBATIM)
+
+
 # They are not referenced. See set_output_directory().
 set( CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/bin )
 set( CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LLVM_BINARY_DIR}/lib${LLVM_LIBDIR_SUFFIX} )
@@ -861,7 +866,9 @@ if( LLVM_INCLUDE_TOOLS )
   add_subdirectory(tools)
 endif()
 
-add_subdirectory(runtimes)
+if( LLVM_INCLUDE_RUNTIMES )
+  add_subdirectory(runtimes)
+endif()
 
 if( LLVM_INCLUDE_EXAMPLES )
   add_subdirectory(examples)
diff --git a/CODE_OWNERS.TXT b/CODE_OWNERS.TXT
index 3625ebf099f1bc433c077cc2db958c51f25727a3..ec4561d991693c7ebffc07fa693f793d98739104 100644
--- a/CODE_OWNERS.TXT
+++ b/CODE_OWNERS.TXT
@@ -5,12 +5,9 @@ what goes in or not.
 
 The list is sorted by surname and formatted to allow easy grepping and
 beautification by scripts. The fields are: name (N), email (E), web-address
-(W), PGP key ID and fingerprint (P), description (D), and snail-mail address
-(S). Each entry should contain at least the (N), (E) and (D) fields.
-
-N: Joe Abbey
-E: jabbey@arxan.com
-D: LLVM Bitcode (lib/Bitcode/* include/llvm/Bitcode/*)
+(W), PGP key ID and fingerprint (P), description (D), snail-mail address
+(S) and IRC handle (I). Each entry should contain at least the (N), (E) and
+(D) fields.
 
 N: Justin Bogner
 E: mail@justinbogner.com
@@ -21,6 +18,11 @@ N: Alex Bradbury
 E: asb@lowrisc.org
 D: RISC-V backend (lib/Target/RISCV/*)
 
+N: Matthias Braun
+E: matze@braunis.de
+I: MatzeB
+D: Instruction Scheduling
+
 N: Chandler Carruth
 E: chandlerc@gmail.com
 E: chandlerc@google.com
@@ -34,6 +36,10 @@ N: Eric Christopher
 E: echristo@gmail.com
 D: Debug Information, inline assembly
 
+N: Andrey Churbanov
+E: andrey.churbanov@intel.com
+D: OpenMP runtime library
+
 N: Greg Clayton
 E: gclayton@apple.com
 D: LLDB
@@ -48,7 +54,7 @@ D: libc++
 
 N: Peter Collingbourne
 E: peter@pcc.me.uk
-D: llgo, libLTO (lib/LTO/* tools/lto/*)
+D: llgo, libLTO (lib/LTO/* tools/lto/*), LLVM Bitcode (lib/Bitcode/* include/llvm/Bitcode/*)
 
 N: Quentin Colombet
 E: qcolombet@apple.com
@@ -96,7 +102,7 @@ D: MCJIT, RuntimeDyld and JIT event listeners, Orcish Warchief
 
 N: Teresa Johnson
 E: tejohnson@google.com
-D: Gold plugin (tools/gold/*)
+D: Gold plugin (tools/gold/*) and IR Linker
 
 N: Galina Kistanova
 E: gkistanova@gmail.com
@@ -132,7 +138,7 @@ E: david.majnemer@gmail.com
 D: IR Constant Folder, InstCombine
 
 N: Dylan McKay
-E: dylanmckay34@gmail.com
+E: me@dylanmckay.io
 D: AVR Backend
 
 N: Tim Northover
@@ -180,9 +186,8 @@ E: alexei.starovoitov@gmail.com
 D: BPF backend
 
 N: Tom Stellard
-E: thomas.stellard@amd.com
-E: mesa-dev@lists.freedesktop.org
-D: Release manager for the 3.5 and 3.6 branches, R600 Backend, libclc
+E: tstellar@redhat.com
+D: Stable release management (x.y.[1-9] releases), AMDGPU Backend, libclc
 
 N: Evgeniy Stepanov
 E: eugenis@google.com
@@ -192,18 +197,10 @@ N: Craig Topper
 E: craig.topper@gmail.com
 D: X86 Backend
 
-N: Andrew Trick
-E: atrick@apple.com
-D: Instruction Scheduling
-
 N: Ulrich Weigand
 E: uweigand@de.ibm.com
 D: SystemZ Backend
 
-N: Teresa Johnson
-E: tejohnson@google.com
-D: IR Linker
-
 N: Hans Wennborg
 E: hans@chromium.org
 D: Release management (x.y.0 releases)
@@ -211,7 +208,3 @@ D: Release management (x.y.0 releases)
 N: whitequark
 E: whitequark@whitequark.org
 D: OCaml bindings
-
-N: Andrey Churbanov
-E: andrey.churbanov@intel.com
-D: OpenMP runtime library
diff --git a/CREDITS.TXT b/CREDITS.TXT
index c354900a6e92d1a6c0ad7bafacb99db5bf77b9cd..15d822a680911f07fe48f33f9e733099e8edc396 100644
--- a/CREDITS.TXT
+++ b/CREDITS.TXT
@@ -457,6 +457,10 @@ N: Adam Treat
 E: manyoso@yahoo.com
 D: C++ bugs filed, and C++ front-end bug fixes.
 
+N: Andrew Trick
+E: atrick@apple.com
+D: Instruction Scheduling, ...
+
 N: Lauro Ramos Venancio
 E: lauro.venancio@indt.org.br
 D: ARM backend improvements
diff --git a/bindings/go/llvm/DIBuilderBindings.cpp b/bindings/go/llvm/DIBuilderBindings.cpp
index 42aa819c7961e91b0b28247d6b2a2b47a19db6ed..53e223d67b4e8e9636c7a0be8533a7ccdf73c154 100644
--- a/bindings/go/llvm/DIBuilderBindings.cpp
+++ b/bindings/go/llvm/DIBuilderBindings.cpp
@@ -119,7 +119,8 @@ LLVMMetadataRef LLVMDIBuilderCreatePointerType(LLVMDIBuilderRef Dref,
                                                const char *Name) {
   DIBuilder *D = unwrap(Dref);
   return wrap(D->createPointerType(unwrap<DIType>(PointeeType), SizeInBits,
-                                   AlignInBits, Name));
+                                   AlignInBits, /* DWARFAddressSpace */ None,
+                                   Name));
 }
 
 LLVMMetadataRef
diff --git a/bindings/go/llvm/IRBindings.cpp b/bindings/go/llvm/IRBindings.cpp
index 20cc05043f28620c5826a09d90dc430aea33e41c..4bfa1bbaf0cc8ac81972947abbc97350e2ec3267 100644
--- a/bindings/go/llvm/IRBindings.cpp
+++ b/bindings/go/llvm/IRBindings.cpp
@@ -14,6 +14,7 @@
 #include "IRBindings.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
@@ -71,6 +72,18 @@ void LLVMSetCurrentDebugLocation2(LLVMBuilderRef Bref, unsigned Line,
                     InlinedAt ? unwrap<MDNode>(InlinedAt) : nullptr));
 }
 
+LLVMDebugLocMetadata LLVMGetCurrentDebugLocation2(LLVMBuilderRef Bref) {
+  const auto& Loc = unwrap(Bref)->getCurrentDebugLocation();
+  const auto* InlinedAt = Loc.getInlinedAt();
+  const LLVMDebugLocMetadata md{
+    Loc.getLine(),
+    Loc.getCol(),
+    wrap(Loc.getScope()),
+    InlinedAt == nullptr ? nullptr : wrap(InlinedAt->getRawInlinedAt()),
+  };
+  return md;
+}
+
 void LLVMSetSubprogram(LLVMValueRef Func, LLVMMetadataRef SP) {
   unwrap<Function>(Func)->setSubprogram(unwrap<DISubprogram>(SP));
 }
diff --git a/bindings/go/llvm/IRBindings.h b/bindings/go/llvm/IRBindings.h
index 21147712ed5b4a026f4a5282193b625070249131..f4f490391d4f1107558ee889af0628a93cb39b66 100644
--- a/bindings/go/llvm/IRBindings.h
+++ b/bindings/go/llvm/IRBindings.h
@@ -27,6 +27,12 @@ extern "C" {
 #endif
 
 typedef struct LLVMOpaqueMetadata *LLVMMetadataRef;
+struct LLVMDebugLocMetadata{
+    unsigned Line;
+    unsigned Col;
+    LLVMMetadataRef Scope;
+    LLVMMetadataRef InlinedAt;
+};
 
 LLVMMetadataRef LLVMConstantAsMetadata(LLVMValueRef Val);
 
@@ -46,6 +52,8 @@ void LLVMSetCurrentDebugLocation2(LLVMBuilderRef Bref, unsigned Line,
                                   unsigned Col, LLVMMetadataRef Scope,
                                   LLVMMetadataRef InlinedAt);
 
+struct LLVMDebugLocMetadata LLVMGetCurrentDebugLocation2(LLVMBuilderRef Bref);
+
 void LLVMSetSubprogram(LLVMValueRef Fn, LLVMMetadataRef SP);
 
 #ifdef __cplusplus
diff --git a/bindings/go/llvm/ir.go b/bindings/go/llvm/ir.go
index b263c07c512d8e69953f53a7042625d86e8230f4..fe191beb38132ef501e74178d9acf682a218cd90 100644
--- a/bindings/go/llvm/ir.go
+++ b/bindings/go/llvm/ir.go
@@ -1226,9 +1226,23 @@ func (b Builder) InsertWithName(instr Value, name string) {
 func (b Builder) Dispose() { C.LLVMDisposeBuilder(b.C) }
 
 // Metadata
+type DebugLoc struct {
+	Line, Col      uint
+	Scope          Metadata
+	InlinedAt      Metadata
+}
 func (b Builder) SetCurrentDebugLocation(line, col uint, scope, inlinedAt Metadata) {
 	C.LLVMSetCurrentDebugLocation2(b.C, C.unsigned(line), C.unsigned(col), scope.C, inlinedAt.C)
 }
+// GetCurrentDebugLocation returns the builder's current debug location. Only call it after a location has been set with SetCurrentDebugLocation().
+func (b Builder) GetCurrentDebugLocation() (loc DebugLoc) {
+	md := C.LLVMGetCurrentDebugLocation2(b.C)
+	loc.Line = uint(md.Line)
+	loc.Col = uint(md.Col)
+	loc.Scope = Metadata{C: md.Scope}
+	loc.InlinedAt = Metadata{C: md.InlinedAt}
+	return
+}
 func (b Builder) SetInstDebugLocation(v Value) { C.LLVMSetInstDebugLocation(b.C, v.C) }
 func (b Builder) InsertDeclare(module Module, storage Value, md Value) Value {
 	f := module.NamedFunction("llvm.dbg.declare")
diff --git a/bindings/go/llvm/ir_test.go b/bindings/go/llvm/ir_test.go
index 13e113957b4d029b3672f822dd77b9f50eb83b3f..c823615a4293c8a2e2997b9fdbaf0a444e0081a3 100644
--- a/bindings/go/llvm/ir_test.go
+++ b/bindings/go/llvm/ir_test.go
@@ -95,3 +95,42 @@ func TestAttributes(t *testing.T) {
 		testAttribute(t, name)
 	}
 }
+
+func TestDebugLoc(t *testing.T) {
+	mod := NewModule("")
+	defer mod.Dispose()
+
+	ctx := mod.Context()
+
+	b := ctx.NewBuilder()
+	defer b.Dispose()
+
+	d := NewDIBuilder(mod)
+	defer func() {
+		d.Destroy()
+	}()
+	file := d.CreateFile("dummy_file", "dummy_dir")
+	voidInfo := d.CreateBasicType(DIBasicType{Name: "void"})
+	typeInfo := d.CreateSubroutineType(DISubroutineType{file, []Metadata{voidInfo}})
+	scope := d.CreateFunction(file, DIFunction{
+		Name:         "foo",
+		LinkageName:  "foo",
+		Line:         10,
+		ScopeLine:    10,
+		Type:         typeInfo,
+		File:         file,
+		IsDefinition: true,
+	})
+
+	b.SetCurrentDebugLocation(10, 20, scope, Metadata{})
+	loc := b.GetCurrentDebugLocation()
+	if loc.Line != 10 {
+		t.Errorf("Got line %d, though wanted 10", loc.Line)
+	}
+	if loc.Col != 20 {
+		t.Errorf("Got column %d, though wanted 20", loc.Col)
+	}
+	if loc.Scope.C != scope.C {
+		t.Errorf("Got metadata %v as scope, though wanted %v", loc.Scope.C, scope.C)
+	}
+}
diff --git a/bindings/go/llvm/transforms_pmbuilder.go b/bindings/go/llvm/transforms_pmbuilder.go
index 3d79d6e2f3273939b8ac7577538fa1073b023957..b164e58812b1553cf4995f7085b32ec0d44977b4 100644
--- a/bindings/go/llvm/transforms_pmbuilder.go
+++ b/bindings/go/llvm/transforms_pmbuilder.go
@@ -43,6 +43,26 @@ func (pmb PassManagerBuilder) PopulateFunc(pm PassManager) {
 	C.LLVMPassManagerBuilderPopulateFunctionPassManager(pmb.C, pm.C)
 }
 
+func (pmb PassManagerBuilder) PopulateLTOPassManager(pm PassManager, internalize bool, runInliner bool) {
+	C.LLVMPassManagerBuilderPopulateLTOPassManager(pmb.C, pm.C, boolToLLVMBool(internalize), boolToLLVMBool(runInliner))
+}
+
 func (pmb PassManagerBuilder) Dispose() {
 	C.LLVMPassManagerBuilderDispose(pmb.C)
 }
+
+func (pmb PassManagerBuilder) SetDisableUnitAtATime(val bool) {
+	C.LLVMPassManagerBuilderSetDisableUnitAtATime(pmb.C, boolToLLVMBool(val))
+}
+
+func (pmb PassManagerBuilder) SetDisableUnrollLoops(val bool) {
+	C.LLVMPassManagerBuilderSetDisableUnrollLoops(pmb.C, boolToLLVMBool(val))
+}
+
+func (pmb PassManagerBuilder) SetDisableSimplifyLibCalls(val bool) {
+	C.LLVMPassManagerBuilderSetDisableSimplifyLibCalls(pmb.C, boolToLLVMBool(val))
+}
+
+func (pmb PassManagerBuilder) UseInlinerWithThreshold(threshold uint) {
+	C.LLVMPassManagerBuilderUseInlinerWithThreshold(pmb.C, C.uint(threshold))
+}
diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake
index a96a722f34d6bbbcd65eaf5a5982a76dd23bb65f..0331d0fa10abf161af11ffdfbf401681febcf6d8 100755
--- a/cmake/config-ix.cmake
+++ b/cmake/config-ix.cmake
@@ -46,7 +46,6 @@ endfunction()
 check_include_file(dirent.h HAVE_DIRENT_H)
 check_include_file(dlfcn.h HAVE_DLFCN_H)
 check_include_file(errno.h HAVE_ERRNO_H)
-check_include_file(execinfo.h HAVE_EXECINFO_H)
 check_include_file(fcntl.h HAVE_FCNTL_H)
 check_include_file(inttypes.h HAVE_INTTYPES_H)
 check_include_file(link.h HAVE_LINK_H)
@@ -88,6 +87,15 @@ if(APPLE)
     HAVE_CRASHREPORTER_INFO)
 endif()
 
+if(CMAKE_SYSTEM_NAME STREQUAL "Linux")  # bare name: if() dereferences it once, safely even when empty
+  check_include_file(linux/magic.h HAVE_LINUX_MAGIC_H)
+  if(NOT HAVE_LINUX_MAGIC_H)
+    # Older kernels use split header files instead of linux/magic.h.
+    check_include_file(linux/nfs_fs.h HAVE_LINUX_NFS_FS_H)
+    check_include_file(linux/smb.h HAVE_LINUX_SMB_H)
+  endif()
+endif()
+
 # library checks
 if( NOT PURE_WINDOWS )
   check_library_exists(pthread pthread_create "" HAVE_LIBPTHREAD)
@@ -115,7 +123,7 @@ if(HAVE_LIBPTHREAD)
   set(CMAKE_THREAD_PREFER_PTHREAD TRUE)
   set(THREADS_HAVE_PTHREAD_ARG Off)
   find_package(Threads REQUIRED)
-  set(PTHREAD_LIB ${CMAKE_THREAD_LIBS_INIT})
+  set(LLVM_PTHREAD_LIB ${CMAKE_THREAD_LIBS_INIT})
 endif()
 
 # Don't look for these libraries on Windows. Also don't look for them if we're
@@ -156,7 +164,9 @@ endif()
 
 # function checks
 check_symbol_exists(arc4random "stdlib.h" HAVE_DECL_ARC4RANDOM)
-check_symbol_exists(backtrace "execinfo.h" HAVE_BACKTRACE)
+find_package(Backtrace)
+set(HAVE_BACKTRACE ${Backtrace_FOUND})
+set(BACKTRACE_HEADER ${Backtrace_HEADER})
 check_symbol_exists(_Unwind_Backtrace "unwind.h" HAVE__UNWIND_BACKTRACE)
 check_symbol_exists(getpagesize unistd.h HAVE_GETPAGESIZE)
 check_symbol_exists(sysconf unistd.h HAVE_SYSCONF)
@@ -227,6 +237,7 @@ if( HAVE_DLFCN_H )
     list(APPEND CMAKE_REQUIRED_LIBRARIES dl)
   endif()
   check_symbol_exists(dlopen dlfcn.h HAVE_DLOPEN)
+  check_symbol_exists(dladdr dlfcn.h HAVE_DLADDR)
   if( HAVE_LIBDL )
     list(REMOVE_ITEM CMAKE_REQUIRED_LIBRARIES dl)
   endif()
@@ -234,7 +245,15 @@ endif()
 
 check_symbol_exists(__GLIBC__ stdio.h LLVM_USING_GLIBC)
 if( LLVM_USING_GLIBC )
-  add_llvm_definitions( -D_GNU_SOURCE )
+  add_definitions( -D_GNU_SOURCE )
+endif()
+# This check requires _GNU_SOURCE
+if(HAVE_LIBPTHREAD)
+  check_library_exists(pthread pthread_getname_np "" HAVE_PTHREAD_GETNAME_NP)
+  check_library_exists(pthread pthread_setname_np "" HAVE_PTHREAD_SETNAME_NP)
+elseif(PTHREAD_IN_LIBC)
+  check_library_exists(c pthread_getname_np "" HAVE_PTHREAD_GETNAME_NP)
+  check_library_exists(c pthread_setname_np "" HAVE_PTHREAD_SETNAME_NP)
 endif()
 
 set(headers "sys/types.h")
@@ -545,6 +564,9 @@ set(LLVM_BINUTILS_INCDIR "" CACHE PATH
 	"PATH to binutils/include containing plugin-api.h for gold plugin.")
 
 if(CMAKE_HOST_APPLE AND APPLE)
+  if(NOT CMAKE_XCRUN)
+    find_program(CMAKE_XCRUN NAMES xcrun)
+  endif()
   if(CMAKE_XCRUN)
     execute_process(COMMAND ${CMAKE_XCRUN} -find ld
       OUTPUT_VARIABLE LD64_EXECUTABLE
diff --git a/cmake/modules/AddLLVM.cmake b/cmake/modules/AddLLVM.cmake
index b3c7746c480ade18bbd1f688ca5f323711fc00aa..7f7608cff33d3e0c683272656fc4bd9d5fa6592f 100644
--- a/cmake/modules/AddLLVM.cmake
+++ b/cmake/modules/AddLLVM.cmake
@@ -718,11 +718,11 @@ macro(add_llvm_executable name)
   if(NOT ARG_IGNORE_EXTERNALIZE_DEBUGINFO)
     llvm_externalize_debuginfo(${name})
   endif()
-  if (PTHREAD_LIB)
+  if (LLVM_PTHREAD_LIB)
     # libpthreads overrides some standard library symbols, so main
     # executable must be linked with it in order to provide consistent
     # API for all shared libaries loaded by this executable.
-    target_link_libraries(${name} ${PTHREAD_LIB})
+    target_link_libraries(${name} ${LLVM_PTHREAD_LIB})
   endif()
 endmacro(add_llvm_executable name)
 
@@ -1027,7 +1027,7 @@ function(add_unittest test_suite test_name)
   # libpthreads overrides some standard library symbols, so main
   # executable must be linked with it in order to provide consistent
   # API for all shared libaries loaded by this executable.
-  target_link_libraries(${test_name} gtest_main gtest ${PTHREAD_LIB})
+  target_link_libraries(${test_name} gtest_main gtest ${LLVM_PTHREAD_LIB})
 
   add_dependencies(${test_suite} ${test_name})
   get_target_property(test_suite_folder ${test_suite} FOLDER)
@@ -1387,7 +1387,11 @@ function(llvm_externalize_debuginfo name)
   endif()
 
   if(NOT LLVM_EXTERNALIZE_DEBUGINFO_SKIP_STRIP)
-    set(strip_command COMMAND xcrun strip -Sxl $<TARGET_FILE:${name}>)
+    if(APPLE)
+      set(strip_command COMMAND xcrun strip -Sxl $<TARGET_FILE:${name}>)
+    else()
+      set(strip_command COMMAND ${CMAKE_STRIP} -gx $<TARGET_FILE:${name}>)  # toolchain's strip, not the first one in PATH
+    endif()
   endif()
 
   if(APPLE)
@@ -1403,7 +1407,11 @@ function(llvm_externalize_debuginfo name)
       ${strip_command}
       )
   else()
-    message(FATAL_ERROR "LLVM_EXTERNALIZE_DEBUGINFO isn't implemented for non-darwin platforms!")
+    add_custom_command(TARGET ${name} POST_BUILD  # split debug info into <file>.debug, strip, then link the two
+      COMMAND ${CMAKE_OBJCOPY} --only-keep-debug $<TARGET_FILE:${name}> $<TARGET_FILE:${name}>.debug
+      ${strip_command} -R .gnu_debuglink  # NOTE: if strip_command is empty, -R is appended to the objcopy command above
+      COMMAND ${CMAKE_OBJCOPY} --add-gnu-debuglink=$<TARGET_FILE:${name}>.debug $<TARGET_FILE:${name}>
+      )
   endif()
 endfunction()
 
diff --git a/cmake/modules/AddSphinxTarget.cmake b/cmake/modules/AddSphinxTarget.cmake
index 3456b536e80acc80523a2b733dc86ea9babe1d7b..cfc7f38e9e7776e81f79993b713584de935fbb1a 100644
--- a/cmake/modules/AddSphinxTarget.cmake
+++ b/cmake/modules/AddSphinxTarget.cmake
@@ -48,10 +48,15 @@ function (add_sphinx_target builder project)
     # Handle installation
     if (NOT LLVM_INSTALL_TOOLCHAIN_ONLY)
       if (builder STREQUAL man)
+        if (CMAKE_INSTALL_MANDIR)
+          set(INSTALL_MANDIR ${CMAKE_INSTALL_MANDIR}/)
+        else()
+          set(INSTALL_MANDIR share/man/)
+        endif()
         # FIXME: We might not ship all the tools that these man pages describe
         install(DIRECTORY "${SPHINX_BUILD_DIR}/" # Slash indicates contents of
                 COMPONENT "${project}-sphinx-man"
-                DESTINATION share/man/man1)
+                DESTINATION ${INSTALL_MANDIR}man1)
 
       elseif (builder STREQUAL html)
         string(TOUPPER "${project}" project_upper)
diff --git a/cmake/modules/HandleLLVMOptions.cmake b/cmake/modules/HandleLLVMOptions.cmake
index dee9bdc6e3ecddfbd9ea6536c1e46a4f83a4bdee..dd44476bc996426a7b5a7832e6eade5f75ec81bb 100644
--- a/cmake/modules/HandleLLVMOptions.cmake
+++ b/cmake/modules/HandleLLVMOptions.cmake
@@ -8,12 +8,41 @@ string(TOUPPER "${CMAKE_BUILD_TYPE}" uppercase_CMAKE_BUILD_TYPE)
 
 include(CheckCompilerVersion)
 include(HandleLLVMStdlib)
-include(AddLLVMDefinitions)
 include(CheckCCompilerFlag)
 include(CheckCXXCompilerFlag)
 
+if(CMAKE_LINKER MATCHES "lld-link.exe" OR (WIN32 AND LLVM_USE_LINKER STREQUAL "lld"))
+  set(LINKER_IS_LLD_LINK TRUE)
+else()
+  set(LINKER_IS_LLD_LINK FALSE)
+endif()
+
+# Ninja Job Pool support
+# Caps concurrent compile jobs via a job pool; only honoured by Ninja generators (CMake >= 3.0).
+set(LLVM_PARALLEL_COMPILE_JOBS "" CACHE STRING
+  "Define the maximum number of concurrent compilation jobs.")
+if(LLVM_PARALLEL_COMPILE_JOBS)
+  if(NOT CMAKE_GENERATOR MATCHES "Ninja")  # test the generator, not the build tool's file name
+    message(WARNING "Job pooling is only available with Ninja generators.")
+  else()
+    set_property(GLOBAL APPEND PROPERTY JOB_POOLS compile_job_pool=${LLVM_PARALLEL_COMPILE_JOBS})
+    set(CMAKE_JOB_POOL_COMPILE compile_job_pool)
+  endif()
+endif()
 
-if (CMAKE_LINKER MATCHES "lld-link.exe")
+set(LLVM_PARALLEL_LINK_JOBS "" CACHE STRING
+  "Define the maximum number of concurrent link jobs.")
+if(LLVM_PARALLEL_LINK_JOBS)
+  if(NOT CMAKE_GENERATOR MATCHES "Ninja")  # test the generator, not the build tool's file name
+    message(WARNING "Job pooling is only available with Ninja generators.")
+  else()
+    set_property(GLOBAL APPEND PROPERTY JOB_POOLS link_job_pool=${LLVM_PARALLEL_LINK_JOBS})
+    set(CMAKE_JOB_POOL_LINK link_job_pool)
+  endif()
+endif()
+
+
+if (LINKER_IS_LLD_LINK)
   # Pass /MANIFEST:NO so that CMake doesn't run mt.exe on our binaries.  Adding
   # manifests with mt.exe breaks LLD's symbol tables and takes as much time as
   # the link. See PR24476.
@@ -223,10 +252,10 @@ if( MSVC_IDE )
     "Number of parallel compiler jobs. 0 means use all processors. Default is 0.")
   if( NOT LLVM_COMPILER_JOBS STREQUAL "1" )
     if( LLVM_COMPILER_JOBS STREQUAL "0" )
-      add_llvm_definitions( /MP )
+      add_definitions( /MP )
     else()
       message(STATUS "Number of parallel compiler jobs set to " ${LLVM_COMPILER_JOBS})
-      add_llvm_definitions( /MP${LLVM_COMPILER_JOBS} )
+      add_definitions( /MP${LLVM_COMPILER_JOBS} )
     endif()
   else()
     message(STATUS "Parallel compilation disabled")
@@ -255,17 +284,17 @@ if( MSVC )
   if( CMAKE_CXX_COMPILER_VERSION VERSION_LESS 19.0 )
     # For MSVC 2013, disable iterator null pointer checking in debug mode,
     # especially so std::equal(nullptr, nullptr, nullptr) will not assert.
-    add_llvm_definitions("-D_DEBUG_POINTER_IMPL=")
+    add_definitions("-D_DEBUG_POINTER_IMPL=")
   endif()
   
   include(ChooseMSVCCRT)
 
   if( MSVC11 )
-    add_llvm_definitions(-D_VARIADIC_MAX=10)
+    add_definitions(-D_VARIADIC_MAX=10)
   endif()
   
   # Add definitions that make MSVC much less annoying.
-  add_llvm_definitions(
+  add_definitions(
     # For some reason MS wants to deprecate a bunch of standard functions...
     -D_CRT_SECURE_NO_DEPRECATE
     -D_CRT_SECURE_NO_WARNINGS
@@ -276,94 +305,15 @@ if( MSVC )
     )
 
   # Tell MSVC to use the Unicode version of the Win32 APIs instead of ANSI.
-  add_llvm_definitions(
+  add_definitions(
     -DUNICODE
     -D_UNICODE
   )
 
-  set(msvc_warning_flags
-    # Disabled warnings.
-    -wd4141 # Suppress ''modifier' : used more than once' (because of __forceinline combined with inline)
-    -wd4146 # Suppress 'unary minus operator applied to unsigned type, result still unsigned'
-    -wd4180 # Suppress 'qualifier applied to function type has no meaning; ignored'
-    -wd4244 # Suppress ''argument' : conversion from 'type1' to 'type2', possible loss of data'
-    -wd4258 # Suppress ''var' : definition from the for loop is ignored; the definition from the enclosing scope is used'
-    -wd4267 # Suppress ''var' : conversion from 'size_t' to 'type', possible loss of data'
-    -wd4291 # Suppress ''declaration' : no matching operator delete found; memory will not be freed if initialization throws an exception'
-    -wd4345 # Suppress 'behavior change: an object of POD type constructed with an initializer of the form () will be default-initialized'
-    -wd4351 # Suppress 'new behavior: elements of array 'array' will be default initialized'
-    -wd4355 # Suppress ''this' : used in base member initializer list'
-    -wd4456 # Suppress 'declaration of 'var' hides local variable'
-    -wd4457 # Suppress 'declaration of 'var' hides function parameter'
-    -wd4458 # Suppress 'declaration of 'var' hides class member'
-    -wd4459 # Suppress 'declaration of 'var' hides global declaration'
-    -wd4503 # Suppress ''identifier' : decorated name length exceeded, name was truncated'
-    -wd4624 # Suppress ''derived class' : destructor could not be generated because a base class destructor is inaccessible'
-    -wd4722 # Suppress 'function' : destructor never returns, potential memory leak
-    -wd4800 # Suppress ''type' : forcing value to bool 'true' or 'false' (performance warning)'
-    -wd4100 # Suppress 'unreferenced formal parameter'
-    -wd4127 # Suppress 'conditional expression is constant'
-    -wd4512 # Suppress 'assignment operator could not be generated'
-    -wd4505 # Suppress 'unreferenced local function has been removed'
-    -wd4610 # Suppress '<class> can never be instantiated'
-    -wd4510 # Suppress 'default constructor could not be generated'
-    -wd4702 # Suppress 'unreachable code'
-    -wd4245 # Suppress 'signed/unsigned mismatch'
-    -wd4706 # Suppress 'assignment within conditional expression'
-    -wd4310 # Suppress 'cast truncates constant value'
-    -wd4701 # Suppress 'potentially uninitialized local variable'
-    -wd4703 # Suppress 'potentially uninitialized local pointer variable'
-    -wd4389 # Suppress 'signed/unsigned mismatch'
-    -wd4611 # Suppress 'interaction between '_setjmp' and C++ object destruction is non-portable'
-    -wd4805 # Suppress 'unsafe mix of type <type> and type <type> in operation'
-    -wd4204 # Suppress 'nonstandard extension used : non-constant aggregate initializer'
-    -wd4577 # Suppress 'noexcept used with no exception handling mode specified; termination on exception is not guaranteed'
-    -wd4091 # Suppress 'typedef: ignored on left of '' when no variable is declared'
-        # C4592 is disabled because of false positives in Visual Studio 2015
-        # Update 1. Re-evaluate the usefulness of this diagnostic with Update 2.
-    -wd4592 # Suppress ''var': symbol will be dynamically initialized (implementation limitation)
-    -wd4319 # Suppress ''operator' : zero extending 'type' to 'type' of greater size'
-
-	# Ideally, we'd like this warning to be enabled, but MSVC 2013 doesn't
-	# support the 'aligned' attribute in the way that clang sources requires (for
-	# any code that uses the LLVM_ALIGNAS macro), so this is must be disabled to
-	# avoid unwanted alignment warnings.
-	# When we switch to requiring a version of MSVC that supports the 'alignas'
-	# specifier (MSVC 2015?) this warning can be re-enabled.
-    -wd4324 # Suppress 'structure was padded due to __declspec(align())'
-
-    # Promoted warnings.
-    -w14062 # Promote 'enumerator in switch of enum is not handled' to level 1 warning.
-
-    # Promoted warnings to errors.
-    -we4238 # Promote 'nonstandard extension used : class rvalue used as lvalue' to error.
-    )
-
-  # Enable warnings
-  if (LLVM_ENABLE_WARNINGS)
-    # Put /W4 in front of all the -we flags. cl.exe doesn't care, but for
-    # clang-cl having /W4 after the -we flags will re-enable the warnings
-    # disabled by -we.
-    set(msvc_warning_flags "/W4 ${msvc_warning_flags}")
-    # CMake appends /W3 by default, and having /W3 followed by /W4 will result in 
-    # cl : Command line warning D9025 : overriding '/W3' with '/W4'.  Since this is
-    # a command line warning and not a compiler warning, it cannot be suppressed except
-    # by fixing the command line.
-    string(REGEX REPLACE " /W[0-4]" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
-    string(REGEX REPLACE " /W[0-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
-
-    if (LLVM_ENABLE_PEDANTIC)
-      # No MSVC equivalent available
-    endif (LLVM_ENABLE_PEDANTIC)
-  endif (LLVM_ENABLE_WARNINGS)
   if (LLVM_ENABLE_WERROR)
-    append("/WX" msvc_warning_flags)
+    append("/WX" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
   endif (LLVM_ENABLE_WERROR)
 
-  foreach(flag ${msvc_warning_flags})
-    append("${flag}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
-  endforeach(flag)
-
   append("/Zc:inline" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
 
   # /Zc:strictStrings is incompatible with VS12's (Visual Studio 2013's)
@@ -383,11 +333,13 @@ if( MSVC )
   # "Enforce type conversion rules".
   append("/Zc:rvalueCast" CMAKE_CXX_FLAGS)
 
-  if (CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+  if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NOT LLVM_ENABLE_LTO)
     # clang-cl and cl by default produce non-deterministic binaries because
     # link.exe /incremental requires a timestamp in the .obj file.  clang-cl
     # has the flag /Brepro to force deterministic binaries. We want to pass that
-    # whenever you're building with clang unless you're passing /incremental.
+    # whenever you're building with clang unless you're passing /incremental
+    # or using LTO (/Brepro with LTO would result in a warning about the flag
+    # being unused, because we're not generating object files).
     # This checks CMAKE_CXX_COMPILER_ID in addition to check_cxx_compiler_flag()
     # because cl.exe does not emit an error on flags it doesn't understand,
     # letting check_cxx_compiler_flag() claim it understands all flags.
@@ -411,63 +363,6 @@ if( MSVC )
   endif()
 
 elseif( LLVM_COMPILER_IS_GCC_COMPATIBLE )
-  if (LLVM_ENABLE_WARNINGS)
-    append("-Wall -W -Wno-unused-parameter -Wwrite-strings" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
-    append("-Wcast-qual" CMAKE_CXX_FLAGS)
-
-    # Turn off missing field initializer warnings for gcc to avoid noise from
-    # false positives with empty {}. Turn them on otherwise (they're off by
-    # default for clang).
-    check_cxx_compiler_flag("-Wmissing-field-initializers" CXX_SUPPORTS_MISSING_FIELD_INITIALIZERS_FLAG)
-    if (CXX_SUPPORTS_MISSING_FIELD_INITIALIZERS_FLAG)
-      if (CMAKE_COMPILER_IS_GNUCXX)
-        append("-Wno-missing-field-initializers" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
-      else()
-        append("-Wmissing-field-initializers" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
-      endif()
-    endif()
-
-    append_if(LLVM_ENABLE_PEDANTIC "-pedantic" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
-    append_if(LLVM_ENABLE_PEDANTIC "-Wno-long-long" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
-    add_flag_if_supported("-Wcovered-switch-default" COVERED_SWITCH_DEFAULT_FLAG)
-    append_if(USE_NO_UNINITIALIZED "-Wno-uninitialized" CMAKE_CXX_FLAGS)
-    append_if(USE_NO_MAYBE_UNINITIALIZED "-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS)
-
-    # Check if -Wnon-virtual-dtor warns even though the class is marked final.
-    # If it does, don't add it. So it won't be added on clang 3.4 and older.
-    # This also catches cases when -Wnon-virtual-dtor isn't supported by
-    # the compiler at all.  This flag is not activated for gcc since it will
-    # incorrectly identify a protected non-virtual base when there is a friend
-    # declaration.
-    if (NOT CMAKE_COMPILER_IS_GNUCXX)
-      set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
-      set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -std=c++11 -Werror=non-virtual-dtor")
-      CHECK_CXX_SOURCE_COMPILES("class base {public: virtual void anchor();protected: ~base();};
-                                 class derived final : public base { public: ~derived();};
-                                 int main() { return 0; }"
-                                CXX_WONT_WARN_ON_FINAL_NONVIRTUALDTOR)
-      set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
-      append_if(CXX_WONT_WARN_ON_FINAL_NONVIRTUALDTOR
-                "-Wnon-virtual-dtor" CMAKE_CXX_FLAGS)
-    endif()
-
-    # Enable -Wdelete-non-virtual-dtor if available.
-    add_flag_if_supported("-Wdelete-non-virtual-dtor" DELETE_NON_VIRTUAL_DTOR_FLAG)
-
-    # Check if -Wcomment is OK with an // comment ending with '\' if the next
-    # line is also a // comment.
-    set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
-    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror -Wcomment")
-    CHECK_C_SOURCE_COMPILES("// \\\\\\n//\\nint main() {return 0;}"
-                            C_WCOMMENT_ALLOWS_LINE_WRAP)
-    set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
-    if (NOT C_WCOMMENT_ALLOWS_LINE_WRAP)
-      append("-Wno-comment" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
-    endif()
-
-    # Enable -Wstring-conversion to catch misuse of string literals.
-    add_flag_if_supported("-Wstring-conversion" STRING_CONVERSION_FLAG)
-  endif (LLVM_ENABLE_WARNINGS)
   append_if(LLVM_ENABLE_WERROR "-Werror" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
   add_flag_if_supported("-Werror=date-time" WERROR_DATE_TIME)
   if (LLVM_ENABLE_CXX1Y)
@@ -521,6 +416,151 @@ elseif( LLVM_COMPILER_IS_GCC_COMPATIBLE )
   endif(LLVM_ENABLE_MODULES)
 endif( MSVC )
 
+if (MSVC AND NOT CLANG_CL)
+  set(msvc_warning_flags
+    # Disabled warnings.
+    -wd4141 # Suppress ''modifier' : used more than once' (because of __forceinline combined with inline)
+    -wd4146 # Suppress 'unary minus operator applied to unsigned type, result still unsigned'
+    -wd4180 # Suppress 'qualifier applied to function type has no meaning; ignored'
+    -wd4244 # Suppress ''argument' : conversion from 'type1' to 'type2', possible loss of data'
+    -wd4258 # Suppress ''var' : definition from the for loop is ignored; the definition from the enclosing scope is used'
+    -wd4267 # Suppress ''var' : conversion from 'size_t' to 'type', possible loss of data'
+    -wd4291 # Suppress ''declaration' : no matching operator delete found; memory will not be freed if initialization throws an exception'
+    -wd4345 # Suppress 'behavior change: an object of POD type constructed with an initializer of the form () will be default-initialized'
+    -wd4351 # Suppress 'new behavior: elements of array 'array' will be default initialized'
+    -wd4355 # Suppress ''this' : used in base member initializer list'
+    -wd4456 # Suppress 'declaration of 'var' hides local variable'
+    -wd4457 # Suppress 'declaration of 'var' hides function parameter'
+    -wd4458 # Suppress 'declaration of 'var' hides class member'
+    -wd4459 # Suppress 'declaration of 'var' hides global declaration'
+    -wd4503 # Suppress ''identifier' : decorated name length exceeded, name was truncated'
+    -wd4624 # Suppress ''derived class' : destructor could not be generated because a base class destructor is inaccessible'
+    -wd4722 # Suppress 'function' : destructor never returns, potential memory leak
+    -wd4800 # Suppress ''type' : forcing value to bool 'true' or 'false' (performance warning)'
+    -wd4100 # Suppress 'unreferenced formal parameter'
+    -wd4127 # Suppress 'conditional expression is constant'
+    -wd4512 # Suppress 'assignment operator could not be generated'
+    -wd4505 # Suppress 'unreferenced local function has been removed'
+    -wd4610 # Suppress '<class> can never be instantiated'
+    -wd4510 # Suppress 'default constructor could not be generated'
+    -wd4702 # Suppress 'unreachable code'
+    -wd4245 # Suppress 'signed/unsigned mismatch'
+    -wd4706 # Suppress 'assignment within conditional expression'
+    -wd4310 # Suppress 'cast truncates constant value'
+    -wd4701 # Suppress 'potentially uninitialized local variable'
+    -wd4703 # Suppress 'potentially uninitialized local pointer variable'
+    -wd4389 # Suppress 'signed/unsigned mismatch'
+    -wd4611 # Suppress 'interaction between '_setjmp' and C++ object destruction is non-portable'
+    -wd4805 # Suppress 'unsafe mix of type <type> and type <type> in operation'
+    -wd4204 # Suppress 'nonstandard extension used : non-constant aggregate initializer'
+    -wd4577 # Suppress 'noexcept used with no exception handling mode specified; termination on exception is not guaranteed'
+    -wd4091 # Suppress 'typedef: ignored on left of '' when no variable is declared'
+        # C4592 is disabled because of false positives in Visual Studio 2015
+        # Update 1. Re-evaluate the usefulness of this diagnostic with Update 2.
+    -wd4592 # Suppress ''var': symbol will be dynamically initialized (implementation limitation)
+    -wd4319 # Suppress ''operator' : zero extending 'type' to 'type' of greater size'
+
+    # Ideally, we'd like this warning to be enabled, but MSVC 2013 doesn't
+    # support the 'aligned' attribute in the way that clang sources require (for
+    # any code that uses the LLVM_ALIGNAS macro), so this must be disabled to
+    # avoid unwanted alignment warnings.
+    # When we switch to requiring a version of MSVC that supports the 'alignas'
+    # specifier (MSVC 2015?) this warning can be re-enabled.
+    -wd4324 # Suppress 'structure was padded due to __declspec(align())'
+
+    # Promoted warnings.
+    -w14062 # Promote 'enumerator in switch of enum is not handled' to level 1 warning.
+
+    # Promoted warnings to errors.
+    -we4238 # Promote 'nonstandard extension used : class rvalue used as lvalue' to error.
+    )
+
+  # Enable warnings
+  if (LLVM_ENABLE_WARNINGS)
+    # Put /W4 in front of all the -we flags. cl.exe doesn't care, but for
+    # clang-cl having /W4 after the -we flags will re-enable the warnings
+    # disabled by -we.
+    set(msvc_warning_flags "/W4 ${msvc_warning_flags}")
+    # CMake appends /W3 by default, and having /W3 followed by /W4 will result in
+    # cl : Command line warning D9025 : overriding '/W3' with '/W4'.  Since this is
+    # a command line warning and not a compiler warning, it cannot be suppressed except
+    # by fixing the command line.
+    string(REGEX REPLACE " /W[0-4]" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
+    string(REGEX REPLACE " /W[0-4]" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
+
+    if (LLVM_ENABLE_PEDANTIC)
+      # No MSVC equivalent available
+    endif (LLVM_ENABLE_PEDANTIC)
+  endif (LLVM_ENABLE_WARNINGS)
+
+  foreach(flag ${msvc_warning_flags})
+    append("${flag}" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+  endforeach(flag)
+endif (MSVC AND NOT CLANG_CL)
+
+if (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL))
+  append("-Wall -W -Wno-unused-parameter -Wwrite-strings" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+  append("-Wcast-qual" CMAKE_CXX_FLAGS)
+
+  # Turn off missing field initializer warnings for gcc to avoid noise from
+  # false positives with empty {}. Turn them on otherwise (they're off by
+  # default for clang).
+  check_cxx_compiler_flag("-Wmissing-field-initializers" CXX_SUPPORTS_MISSING_FIELD_INITIALIZERS_FLAG)
+  if (CXX_SUPPORTS_MISSING_FIELD_INITIALIZERS_FLAG)
+    if (CMAKE_COMPILER_IS_GNUCXX)
+      append("-Wno-missing-field-initializers" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+    else()
+      append("-Wmissing-field-initializers" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+    endif()
+  endif()
+
+  if (LLVM_ENABLE_PEDANTIC AND LLVM_COMPILER_IS_GCC_COMPATIBLE)
+    append("-pedantic" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+    append("-Wno-long-long" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+  endif()
+
+  add_flag_if_supported("-Wcovered-switch-default" COVERED_SWITCH_DEFAULT_FLAG)
+  append_if(USE_NO_UNINITIALIZED "-Wno-uninitialized" CMAKE_CXX_FLAGS)
+  append_if(USE_NO_MAYBE_UNINITIALIZED "-Wno-maybe-uninitialized" CMAKE_CXX_FLAGS)
+
+  # Check if -Wnon-virtual-dtor warns even though the class is marked final.
+  # If it does, don't add it. So it won't be added on clang 3.4 and older.
+  # This also catches cases when -Wnon-virtual-dtor isn't supported by
+  # the compiler at all.  This flag is not activated for gcc since it will
+  # incorrectly identify a protected non-virtual base when there is a friend
+  # declaration. Don't activate this in general on Windows as this warning has
+  # too many false positives on COM-style classes, which are destroyed with
+  # Release() (PR32286).
+  if (NOT CMAKE_COMPILER_IS_GNUCXX AND NOT WIN32)
+    set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+    set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -std=c++11 -Werror=non-virtual-dtor")
+    CHECK_CXX_SOURCE_COMPILES("class base {public: virtual void anchor();protected: ~base();};
+                               class derived final : public base { public: ~derived();};
+                               int main() { return 0; }"
+                              CXX_WONT_WARN_ON_FINAL_NONVIRTUALDTOR)
+    set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
+    append_if(CXX_WONT_WARN_ON_FINAL_NONVIRTUALDTOR
+              "-Wnon-virtual-dtor" CMAKE_CXX_FLAGS)
+  endif()
+
+  # Enable -Wdelete-non-virtual-dtor if available.
+  add_flag_if_supported("-Wdelete-non-virtual-dtor" DELETE_NON_VIRTUAL_DTOR_FLAG)
+
+  # Check if -Wcomment is OK with an // comment ending with '\' if the next
+  # line is also a // comment.
+  set(OLD_CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS})
+  set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -Werror -Wcomment")
+  CHECK_C_SOURCE_COMPILES("// \\\\\\n//\\nint main() {return 0;}"
+                          C_WCOMMENT_ALLOWS_LINE_WRAP)
+  set(CMAKE_REQUIRED_FLAGS ${OLD_CMAKE_REQUIRED_FLAGS})
+  if (NOT C_WCOMMENT_ALLOWS_LINE_WRAP)
+    append("-Wno-comment" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+  endif()
+
+  # Enable -Wstring-conversion to catch misuse of string literals.
+  add_flag_if_supported("-Wstring-conversion" STRING_CONVERSION_FLAG)
+endif (LLVM_ENABLE_WARNINGS AND (LLVM_COMPILER_IS_GCC_COMPATIBLE OR CLANG_CL))
+
 macro(append_common_sanitizer_flags)
   if (NOT MSVC)
     # Append -fno-omit-frame-pointer and turn on debug info to get better
@@ -537,7 +577,7 @@ macro(append_common_sanitizer_flags)
   elseif (CLANG_CL)
     # Keep frame pointers around.
     append("/Oy-" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
-    if (CMAKE_LINKER MATCHES "lld-link.exe")
+    if (LINKER_IS_LLD_LINK)
       # Use DWARF debug info with LLD.
       append("-gdwarf" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
     else()
@@ -565,8 +605,11 @@ if(LLVM_USE_SANITIZER)
       append_common_sanitizer_flags()
       append("-fsanitize=undefined -fno-sanitize=vptr,function -fno-sanitize-recover=all"
               CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
-      append("-fsanitize-blacklist=${CMAKE_SOURCE_DIR}/utils/sanitizers/ubsan_blacklist.txt"
-	      CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+      set(BLACKLIST_FILE "${CMAKE_SOURCE_DIR}/utils/sanitizers/ubsan_blacklist.txt")
+      if (EXISTS "${BLACKLIST_FILE}")
+        append("-fsanitize-blacklist=${BLACKLIST_FILE}"
+	              CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
+      endif()
     elseif (LLVM_USE_SANITIZER STREQUAL "Thread")
       append_common_sanitizer_flags()
       append("-fsanitize=thread" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
@@ -602,9 +645,9 @@ if(LLVM_USE_SPLIT_DWARF)
   add_definitions("-gsplit-dwarf")
 endif()
 
-add_llvm_definitions( -D__STDC_CONSTANT_MACROS )
-add_llvm_definitions( -D__STDC_FORMAT_MACROS )
-add_llvm_definitions( -D__STDC_LIMIT_MACROS )
+add_definitions( -D__STDC_CONSTANT_MACROS )
+add_definitions( -D__STDC_FORMAT_MACROS )
+add_definitions( -D__STDC_LIMIT_MACROS )
 
 # clang doesn't print colored diagnostics when invoked from Ninja
 if (UNIX AND
@@ -672,20 +715,38 @@ append_if(LLVM_BUILD_INSTRUMENTED_COVERAGE "-fprofile-instr-generate='${LLVM_PRO
 
 set(LLVM_ENABLE_LTO OFF CACHE STRING "Build LLVM with LTO. May be specified as Thin or Full to use a particular kind of LTO")
 string(TOUPPER "${LLVM_ENABLE_LTO}" uppercase_LLVM_ENABLE_LTO)
+if(LLVM_ENABLE_LTO AND LLVM_ON_WIN32 AND NOT LINKER_IS_LLD_LINK)
+  message(FATAL_ERROR "When compiling for Windows, LLVM_ENABLE_LTO requires using lld as the linker (point CMAKE_LINKER at lld-link.exe)")
+endif()
 if(uppercase_LLVM_ENABLE_LTO STREQUAL "THIN")
-  append("-flto=thin" CMAKE_CXX_FLAGS CMAKE_C_FLAGS
-                      CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
-  # On darwin, enable the lto cache. This improves initial build time a little
-  # since we re-link a lot of the same objects, and significantly improves
-  # incremental build time.
-  append_if(APPLE "-Wl,-cache_path_lto,${PROJECT_BINARY_DIR}/lto.cache"
-            CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
+  append("-flto=thin" CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
+  if(NOT LINKER_IS_LLD_LINK)
+    append("-flto=thin" CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
+  endif()
+  # If the linker supports it, enable the lto cache. This improves initial build
+  # time a little since we re-link a lot of the same objects, and significantly
+  # improves incremental build time.
+  # FIXME: We should move all this logic into the clang driver.
+  if(APPLE)
+    append("-Wl,-cache_path_lto,${PROJECT_BINARY_DIR}/lto.cache"
+           CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
+  elseif(UNIX AND LLVM_USE_LINKER STREQUAL "lld")
+    append("-Wl,--thinlto-cache-dir=${PROJECT_BINARY_DIR}/lto.cache"
+           CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
+  elseif(LLVM_USE_LINKER STREQUAL "gold")
+    append("-Wl,--plugin-opt,cache-dir=${PROJECT_BINARY_DIR}/lto.cache"
+           CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
+  endif()
 elseif(uppercase_LLVM_ENABLE_LTO STREQUAL "FULL")
-  append("-flto=full" CMAKE_CXX_FLAGS CMAKE_C_FLAGS
-                 CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
+  append("-flto=full" CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
+  if(NOT LINKER_IS_LLD_LINK)
+    append("-flto=full" CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
+  endif()
 elseif(LLVM_ENABLE_LTO)
-  append("-flto" CMAKE_CXX_FLAGS CMAKE_C_FLAGS
-                 CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
+  append("-flto" CMAKE_CXX_FLAGS CMAKE_C_FLAGS)
+  if(NOT LINKER_IS_LLD_LINK)
+    append("-flto" CMAKE_EXE_LINKER_FLAGS CMAKE_SHARED_LINKER_FLAGS)
+  endif()
 endif()
 
 # This option makes utils/extract_symbols.py be used to determine the list of
@@ -712,3 +773,16 @@ if(WIN32 OR CYGWIN)
 else()
   set(LLVM_ENABLE_PLUGINS ON)
 endif()
+
+function(get_compile_definitions)
+  # Joins COMPILE_DEFINITIONS of ${CMAKE_SOURCE_DIR} into LLVM_DEFINITIONS as "-DA -DB".
+  get_directory_property(top_dir_definitions DIRECTORY ${CMAKE_SOURCE_DIR} COMPILE_DEFINITIONS)
+  set(result "") # reset: function scope inherits any `result` from the caller
+  set(separator "")
+  foreach(definition ${top_dir_definitions})
+    string(APPEND result "${separator}-D${definition}")
+    set(separator " ")
+  endforeach()
+  set(LLVM_DEFINITIONS "${result}" PARENT_SCOPE)
+endfunction()
+get_compile_definitions()
diff --git a/cmake/modules/LLVM-Config.cmake b/cmake/modules/LLVM-Config.cmake
index 725178ab57b171ed8f3d737f21d2418251335aca..52330151065b69c0d7e7c1b51f0874061ae029c7 100644
--- a/cmake/modules/LLVM-Config.cmake
+++ b/cmake/modules/LLVM-Config.cmake
@@ -8,27 +8,61 @@ function(link_system_libs target)
   message(AUTHOR_WARNING "link_system_libs no longer needed")
 endfunction()
 
-
+# is_llvm_target_library(
+#   library
+#     Name of the LLVM library to check
+#   return_var
+#     Output variable name
+#   ALL_TARGETS;INCLUDED_TARGETS;OMITTED_TARGETS
+#     ALL_TARGETS - default looks at the full list of known targets
+#     INCLUDED_TARGETS - looks only at targets being configured
+#     OMITTED_TARGETS - looks only at targets that are not being configured
+# )
 function(is_llvm_target_library library return_var)
+  cmake_parse_arguments(ARG "ALL_TARGETS;INCLUDED_TARGETS;OMITTED_TARGETS" "" "" ${ARGN})
   # Sets variable `return_var' to ON if `library' corresponds to a
   # LLVM supported target. To OFF if it doesn't.
   set(${return_var} OFF PARENT_SCOPE)
   string(TOUPPER "${library}" capitalized_lib)
-  string(TOUPPER "${LLVM_ALL_TARGETS}" targets)
+  if(ARG_INCLUDED_TARGETS)
+    string(TOUPPER "${LLVM_TARGETS_TO_BUILD}" targets)
+  elseif(ARG_OMITTED_TARGETS)
+    set(omitted_targets ${LLVM_ALL_TARGETS})
+    list(REMOVE_ITEM omitted_targets ${LLVM_TARGETS_TO_BUILD})
+    string(TOUPPER "${omitted_targets}" targets)
+  else()
+    string(TOUPPER "${LLVM_ALL_TARGETS}" targets)
+  endif()
   foreach(t ${targets})
     if( capitalized_lib STREQUAL t OR
-        capitalized_lib STREQUAL "LLVM${t}" OR
-        capitalized_lib STREQUAL "LLVM${t}CODEGEN" OR
-        capitalized_lib STREQUAL "LLVM${t}ASMPARSER" OR
-        capitalized_lib STREQUAL "LLVM${t}ASMPRINTER" OR
-        capitalized_lib STREQUAL "LLVM${t}DISASSEMBLER" OR
-        capitalized_lib STREQUAL "LLVM${t}INFO" )
+        capitalized_lib STREQUAL "${t}" OR
+        capitalized_lib STREQUAL "${t}DESC" OR
+        capitalized_lib STREQUAL "${t}CODEGEN" OR
+        capitalized_lib STREQUAL "${t}ASMPARSER" OR
+        capitalized_lib STREQUAL "${t}ASMPRINTER" OR
+        capitalized_lib STREQUAL "${t}DISASSEMBLER" OR
+        capitalized_lib STREQUAL "${t}INFO" OR
+        capitalized_lib STREQUAL "${t}UTILS" )
       set(${return_var} ON PARENT_SCOPE)
       break()
     endif()
   endforeach()
 endfunction(is_llvm_target_library)
 
+function(is_llvm_target_specifier library return_var)
+  # Sets `return_var` to ON for the target-group keywords matched below.
+  # NOTE(review): an ON from is_llvm_target_library is stored in this function's
+  # scope only and is not re-exported to the caller -- confirm this is intended.
+  is_llvm_target_library(${library} ${return_var} ${ARGN})
+  if(${return_var})
+    return()
+  endif()
+  string(TOUPPER "${library}" upper_name)
+  set(group_keywords "ALLTARGETS(ASMPARSERS|DESCS|DISASSEMBLERS|INFOS)|NATIVE(CODEGEN)?")
+  if(upper_name MATCHES "^(${group_keywords})$")
+    set(${return_var} ON PARENT_SCOPE)
+  endif()
+endfunction()
 
 macro(llvm_config executable)
   cmake_parse_arguments(ARG "USE_SHARED" "" "" ${ARGN})
@@ -93,6 +127,21 @@ function(llvm_map_components_to_libnames out_libs)
   endif()
   string(TOUPPER "${LLVM_AVAILABLE_LIBS}" capitalized_libs)
 
+  get_property(LLVM_TARGETS_CONFIGURED GLOBAL PROPERTY LLVM_TARGETS_CONFIGURED)
+
+  # Generally in our build system we avoid order-dependence. Unfortunately since
+  # not all targets create the same set of libraries we actually need to ensure
+  # that all build targets associated with a target are added before we can
+  # process target dependencies.
+  if(NOT LLVM_TARGETS_CONFIGURED)
+    foreach(c ${link_components})
+      is_llvm_target_specifier(${c} iltl_result ALL_TARGETS)
+      if(iltl_result)
+        message(FATAL_ERROR "Specified target library before target registration is complete.")
+      endif()
+    endforeach()
+  endif()
+
   # Expand some keywords:
   list(FIND LLVM_TARGETS_TO_BUILD "${LLVM_NATIVE_ARCH}" have_native_backend)
   list(FIND link_components "engine" engine_required)
@@ -141,6 +190,12 @@ function(llvm_map_components_to_libnames out_libs)
       if( TARGET LLVM${c}Disassembler )
         list(APPEND expanded_components "LLVM${c}Disassembler")
       endif()
+      if( TARGET LLVM${c}Info )
+        list(APPEND expanded_components "LLVM${c}Info")
+      endif()
+      if( TARGET LLVM${c}Utils )
+        list(APPEND expanded_components "LLVM${c}Utils")
+      endif()
     elseif( c STREQUAL "native" )
       # already processed
     elseif( c STREQUAL "nativecodegen" )
@@ -198,9 +253,16 @@ function(llvm_map_components_to_libnames out_libs)
       list(FIND capitalized_libs LLVM${capitalized} lib_idx)
       if( lib_idx LESS 0 )
         # The component is unknown. Maybe is an omitted target?
-        is_llvm_target_library(${c} iltl_result)
-        if( NOT iltl_result )
-          message(FATAL_ERROR "Library `${c}' not found in list of llvm libraries.")
+        is_llvm_target_library(${c} iltl_result OMITTED_TARGETS)
+        if(iltl_result)
+          # A missing library to a directly referenced omitted target would be bad.
+          message(FATAL_ERROR "Library '${c}' is a direct reference to a target library for an omitted target.")
+        else()
+          # If it is not an omitted target we should assume it is a component
+          # that hasn't yet been processed by CMake. Missing components will
+          # cause errors later in the configuration, so we can safely assume
+          # that this is valid here.
+          list(APPEND expanded_components LLVM${c})
         endif()
       else( lib_idx LESS 0 )
         list(GET LLVM_AVAILABLE_LIBS ${lib_idx} canonical_lib)
diff --git a/cmake/modules/LLVMConfig.cmake.in b/cmake/modules/LLVMConfig.cmake.in
index c30c92b66d8af9bf535c6651ff2ebe8be4b7fdff..7a8eb3674720adf05dabca3ba05774d6c05f9b18 100644
--- a/cmake/modules/LLVMConfig.cmake.in
+++ b/cmake/modules/LLVMConfig.cmake.in
@@ -45,6 +45,10 @@ set(LLVM_ENABLE_PIC @LLVM_ENABLE_PIC@)
 
 set(LLVM_BUILD_32_BITS @LLVM_BUILD_32_BITS@)
 
+if (NOT "@LLVM_PTHREAD_LIB@" STREQUAL "")
+  set(LLVM_PTHREAD_LIB "@LLVM_PTHREAD_LIB@")
+endif()
+
 set(LLVM_ENABLE_PLUGINS @LLVM_ENABLE_PLUGINS@)
 set(LLVM_EXPORT_SYMBOLS_FOR_PLUGINS @LLVM_EXPORT_SYMBOLS_FOR_PLUGINS@)
 set(LLVM_PLUGIN_EXT @LLVM_PLUGIN_EXT@)
@@ -75,4 +79,5 @@ if(NOT TARGET LLVMSupport)
   @llvm_config_include_buildtree_only_exports@
 endif()
 
+set_property(GLOBAL PROPERTY LLVM_TARGETS_CONFIGURED On)
 include(${LLVM_CMAKE_DIR}/LLVM-Config.cmake)
diff --git a/cmake/modules/TableGen.cmake b/cmake/modules/TableGen.cmake
index 9682002c2abd93e76e968163be0f9ee84e412698..da0858e54d441d6ca892dd91d1708486c21a14db 100644
--- a/cmake/modules/TableGen.cmake
+++ b/cmake/modules/TableGen.cmake
@@ -23,6 +23,13 @@ function(tablegen project ofn)
     set(LLVM_TARGET_DEFINITIONS_ABSOLUTE
       ${CMAKE_CURRENT_SOURCE_DIR}/${LLVM_TARGET_DEFINITIONS})
   endif()
+  if (LLVM_ENABLE_DAGISEL_COV)
+    list(FIND ARGN "-gen-dag-isel" idx)
+    if( NOT idx EQUAL -1 )
+      list(APPEND LLVM_TABLEGEN_FLAGS "-instrument-coverage")
+    endif()
+  endif()
+
   add_custom_command(OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${ofn}.tmp
     # Generate tablegen output in a temporary file.
     COMMAND ${${project}_TABLEGEN_EXE} ${ARGN} -I ${CMAKE_CURRENT_SOURCE_DIR}
@@ -92,7 +99,7 @@ macro(add_tablegen target project)
     set(LLVM_ENABLE_OBJLIB ON)
   endif()
 
-  add_llvm_utility(${target} ${ARGN})
+  add_llvm_executable(${target} DISABLE_LLVM_LINK_LLVM_DYLIB ${ARGN})
   set(LLVM_LINK_COMPONENTS ${${target}_OLD_LLVM_LINK_COMPONENTS})
 
   set(${project}_TABLEGEN "${target}" CACHE
diff --git a/cmake/modules/VersionFromVCS.cmake b/cmake/modules/VersionFromVCS.cmake
index 8d56b66fa4781afd0dfbbdfb1bfdbf5688983fc1..e92540991a1092d7d15daa496456753857680156 100644
--- a/cmake/modules/VersionFromVCS.cmake
+++ b/cmake/modules/VersionFromVCS.cmake
@@ -28,10 +28,11 @@ function(add_version_info_from_vcs VERS)
   elseif( EXISTS ${SOURCE_DIR}/.git )
     set(result "${result}git")
     # Try to get a ref-id
-    if( EXISTS ${SOURCE_DIR}/.git/svn )
-      find_program(git_executable NAMES git git.exe git.cmd)
-      if( git_executable )
-        set(is_git_svn_rev_exact false)
+    find_program(git_executable NAMES git git.exe git.cmd)
+
+    if( git_executable )
+      if( EXISTS ${SOURCE_DIR}/.git/svn )
+        # Get the repository URL
         execute_process(COMMAND
           ${git_executable} svn info
           WORKING_DIRECTORY ${SOURCE_DIR}
@@ -43,42 +44,37 @@ function(add_version_info_from_vcs VERS)
           if(svn_url)
             set(LLVM_REPOSITORY ${CMAKE_MATCH_1} PARENT_SCOPE)
           endif()
-
-          string(REGEX REPLACE "^(.*\n)?Revision: ([^\n]+).*"
-            "\\2" git_svn_rev_number "${git_output}")
-          set(SVN_REVISION ${git_svn_rev_number} PARENT_SCOPE)
-          set(git_svn_rev "-svn-${git_svn_rev}")
-
-          # Determine if the HEAD points directly at a subversion revision.
-          execute_process(COMMAND ${git_executable} svn find-rev HEAD
-            WORKING_DIRECTORY ${SOURCE_DIR}
-            TIMEOUT 5
-            RESULT_VARIABLE git_result
-            OUTPUT_VARIABLE git_output)
-          if( git_result EQUAL 0 )
-            string(STRIP "${git_output}" git_head_svn_rev_number)
-            if( git_head_svn_rev_number EQUAL git_svn_rev_number )
-              set(is_git_svn_rev_exact true)
-            endif()
-          endif()
-        else()
-          set(git_svn_rev "")
         endif()
-        execute_process(COMMAND
-          ${git_executable} rev-parse --short HEAD
+
+        # Get the svn revision number for this git commit if one exists.
+        execute_process(COMMAND ${git_executable} svn find-rev HEAD
           WORKING_DIRECTORY ${SOURCE_DIR}
           TIMEOUT 5
           RESULT_VARIABLE git_result
-          OUTPUT_VARIABLE git_output)
-
-        if( git_result EQUAL 0 AND NOT is_git_svn_rev_exact )
-          string(STRIP "${git_output}" git_ref_id)
-          set(GIT_COMMIT ${git_ref_id} PARENT_SCOPE)
-          set(result "${result}${git_svn_rev}-${git_ref_id}")
+          OUTPUT_VARIABLE git_head_svn_rev_number
+          OUTPUT_STRIP_TRAILING_WHITESPACE)
+        if( git_result EQUAL 0 AND git_head_svn_rev_number )
+          set(SVN_REVISION ${git_head_svn_rev_number} PARENT_SCOPE)
+          set(git_svn_rev "-svn-${git_head_svn_rev_number}")
         else()
-          set(result "${result}${git_svn_rev}")
+          set(git_svn_rev "")
         endif()
+      endif()
+
+      # Get the git ref id
+      execute_process(COMMAND
+        ${git_executable} rev-parse --short HEAD
+        WORKING_DIRECTORY ${SOURCE_DIR}
+        TIMEOUT 5
+        RESULT_VARIABLE git_result
+        OUTPUT_VARIABLE git_ref_id
+        OUTPUT_STRIP_TRAILING_WHITESPACE)
 
+      if( git_result EQUAL 0 )
+        set(GIT_COMMIT ${git_ref_id} PARENT_SCOPE)
+        set(result "${result}${git_svn_rev}-${git_ref_id}")
+      else()
+        set(result "${result}${git_svn_rev}")
       endif()
     endif()
   endif()
diff --git a/cmake/platforms/iOS.cmake b/cmake/platforms/iOS.cmake
index 99692fd6d2aaedd406db763aa7724ee475d36020..15c7aae12c702bfdbcdce50460bf8df3f3273366 100644
--- a/cmake/platforms/iOS.cmake
+++ b/cmake/platforms/iOS.cmake
@@ -4,6 +4,7 @@ SET(CMAKE_SYSTEM_NAME Darwin)
 SET(CMAKE_SYSTEM_VERSION 13)
 SET(CMAKE_CXX_COMPILER_WORKS True)
 SET(CMAKE_C_COMPILER_WORKS True)
+SET(IOS True)
 
 if(NOT CMAKE_OSX_SYSROOT)
   execute_process(COMMAND xcodebuild -version -sdk iphoneos Path
diff --git a/docs/AMDGPUUsage.rst b/docs/AMDGPUUsage.rst
index 2c1a22762bce0556ceb051a400b85da50b3ec77c..5ff0f207f227b8f42c4a971e519f37e42d4b52a1 100644
--- a/docs/AMDGPUUsage.rst
+++ b/docs/AMDGPUUsage.rst
@@ -19,20 +19,73 @@ Address Spaces
 
 The AMDGPU back-end uses the following address space mapping:
 
-   ============= ============================================
-   Address Space Memory Space
-   ============= ============================================
-   0             Private
-   1             Global
-   2             Constant
-   3             Local
-   4             Generic (Flat)
-   5             Region
-   ============= ============================================
+   ================== =================== ==============
+   LLVM Address Space DWARF Address Space Memory Space
+   ================== =================== ==============
+   0                  1                   Private
+   1                  N/A                 Global
+   2                  N/A                 Constant
+   3                  2                   Local
+   4                  N/A                 Generic (Flat)
+   5                  N/A                 Region
+   ================== =================== ==============
 
 The terminology in the table, aside from the region memory space, is from the
 OpenCL standard.
 
+LLVM Address Space is used throughout LLVM (for example, in LLVM IR). DWARF
+Address Space is emitted in DWARF, and is used by tools such as debuggers,
+profilers and others.
+
+Trap Handler ABI
+----------------
+The OS element of the target triple controls the trap handler behavior.
+
+HSA OS
+^^^^^^
+For code objects generated by the AMDGPU back-end for the HSA OS, the runtime
+installs a trap handler that supports the s_trap instruction with the following
+usage:
+
+ +--------------+-------------+-------------------+----------------------------+
+ |Usage         |Code Sequence|Trap Handler Inputs|Description                 |
+ +==============+=============+===================+============================+
+ |reserved      |s_trap 0x00  |                   |Reserved by hardware.       |
+ +--------------+-------------+-------------------+----------------------------+
+ |HSA debugtrap |s_trap 0x01  |SGPR0-1: queue_ptr |Reserved for HSA debugtrap  |
+ |(arg)         |             |VGPR0: arg         |intrinsic (not implemented).|
+ +--------------+-------------+-------------------+----------------------------+
+ |llvm.trap     |s_trap 0x02  |SGPR0-1: queue_ptr |Causes dispatch to be       |
+ |              |             |                   |terminated and its          |
+ |              |             |                   |associated queue put into   |
+ |              |             |                   |the error state.            |
+ +--------------+-------------+-------------------+----------------------------+
+ |llvm.debugtrap| s_trap 0x03 |SGPR0-1: queue_ptr |If debugger not installed   |
+ |              |             |                   |handled same as llvm.trap.  |
+ +--------------+-------------+-------------------+----------------------------+
+ |debugger      |s_trap 0x07  |                   |Reserved for debugger       |
+ |breakpoint    |             |                   |breakpoints.                |
+ +--------------+-------------+-------------------+----------------------------+
+ |debugger      |s_trap 0x08  |                   |Reserved for debugger.      |
+ +--------------+-------------+-------------------+----------------------------+
+ |debugger      |s_trap 0xfe  |                   |Reserved for debugger.      |
+ +--------------+-------------+-------------------+----------------------------+
+ |debugger      |s_trap 0xff  |                   |Reserved for debugger.      |
+ +--------------+-------------+-------------------+----------------------------+
+
+Non-HSA OS
+^^^^^^^^^^
+For code objects generated by the AMDGPU back-end for a non-HSA OS, the runtime
+does not install a trap handler. The llvm.trap and llvm.debugtrap instructions
+are handled as follows:
+
+   =============== ============= ===============================================
+   Usage           Code Sequence Description
+   =============== ============= ===============================================
+   llvm.trap       s_endpgm      Causes wavefront to be terminated.
+   llvm.debugtrap  s_nop         No operation. Compiler warning generated that
+                                 there is no trap handler installed.
+   =============== ============= ===============================================
 
 Assembler
 =========
diff --git a/docs/AliasAnalysis.rst b/docs/AliasAnalysis.rst
index 02b749ffb9181e106c560a62f4f21921836ae5e0..e201333f30070fc821ef34132e1d70cfbca98deb 100644
--- a/docs/AliasAnalysis.rst
+++ b/docs/AliasAnalysis.rst
@@ -136,7 +136,7 @@ be overlapping in some way, but do not start at the same address.
 
 The ``MustAlias`` response may only be returned if the two memory objects are
 guaranteed to always start at exactly the same location. A ``MustAlias``
-response implies that the pointers compare equal.
+response does not imply that the pointers compare equal.
 
 The ``getModRefInfo`` methods
 -----------------------------
diff --git a/docs/BitCodeFormat.rst b/docs/BitCodeFormat.rst
index 3c9aa1010704ce5b5b26254f81eb1067a3703ef9..a9a123595f7f5aa983201c33b1d5c1cb2525f95b 100644
--- a/docs/BitCodeFormat.rst
+++ b/docs/BitCodeFormat.rst
@@ -839,16 +839,6 @@ fields are
 * *unnamed_addr*: If present, an encoding of the
   :ref:`unnamed_addr<bcunnamedaddr>` attribute of this alias
 
-MODULE_CODE_PURGEVALS Record
-^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-``[PURGEVALS, numvals]``
-
-The ``PURGEVALS`` record (code 10) resets the module-level value list to the
-size given by the single operand value. Module-level value list items are added
-by ``GLOBALVAR``, ``FUNCTION``, and ``ALIAS`` records.  After a ``PURGEVALS``
-record is seen, new value indices will start from the given *numvals* value.
-
 .. _MODULE_CODE_GCNAME:
 
 MODULE_CODE_GCNAME Record
diff --git a/docs/BranchWeightMetadata.rst b/docs/BranchWeightMetadata.rst
index 9e61d232d74b57a3367b1d93a1fdb5951331b96a..b941d0d1505064139d9875bc0fcc263e60c7cf22 100644
--- a/docs/BranchWeightMetadata.rst
+++ b/docs/BranchWeightMetadata.rst
@@ -123,11 +123,11 @@ To allow comparing different functions during inter-procedural analysis and
 optimization, ``MD_prof`` nodes can also be assigned to a function definition.
 The first operand is a string indicating the name of the associated counter.
 
-Currently, one counter is supported: "function_entry_count". This is a 64-bit
-counter that indicates the number of times that this function was invoked (in
-the case of instrumentation-based profiles). In the case of sampling-based
-profiles, this counter is an approximation of how many times the function was
-invoked.
+Currently, one counter is supported: "function_entry_count". The second operand
+is a 64-bit counter that indicates the number of times that this function was
+invoked (in the case of instrumentation-based profiles). In the case of
+sampling-based profiles, this operand is an approximation of how many times
+the function was invoked.
 
 For example, in the code below, the instrumentation for function foo()
 indicates that it was called 2,590 times at runtime.
@@ -138,3 +138,13 @@ indicates that it was called 2,590 times at runtime.
     ret i32 0
   }
   !1 = !{!"function_entry_count", i64 2590}
+
+If "function_entry_count" has more than 2 operands, the later operands are
+the GUIDs of the functions that need to be imported by ThinLTO. This is only
+set by sampling based profile. It is needed because the sampling based profile
+was collected on a binary that had already imported and inlined these functions,
+and we need to ensure the IR matches in the ThinLTO backends for profile
+annotation. The reason why we cannot annotate this on the callsite is that it
+can only go down 1 level in the call chain. For the cases where
+foo_in_a_cc()->bar_in_b_cc()->baz_in_c_cc(), we will need to go down 2 levels
+in the call chain to import both bar_in_b_cc and baz_in_c_cc.
diff --git a/docs/CodeGenerator.rst b/docs/CodeGenerator.rst
index 6e5a54a592ceef4c79ea7a4b20f1441f590718cf..106fc8456f616f05f7f51170fe3b59cc3ca0861a 100644
--- a/docs/CodeGenerator.rst
+++ b/docs/CodeGenerator.rst
@@ -1005,7 +1005,7 @@ The TableGen DAG instruction selector generator reads the instruction patterns
 in the ``.td`` file and automatically builds parts of the pattern matching code
 for your target.  It has the following strengths:
 
-* At compiler-compiler time, it analyzes your instruction patterns and tells you
+* At compiler-compile time, it analyzes your instruction patterns and tells you
   if your patterns make sense or not.
 
 * It can handle arbitrary constraints on operands for the pattern match.  In
@@ -1026,7 +1026,7 @@ for your target.  It has the following strengths:
 
 * Targets can define their own (and rely on built-in) "pattern fragments".
   Pattern fragments are chunks of reusable patterns that get inlined into your
-  patterns during compiler-compiler time.  For example, the integer "``(not
+  patterns during compiler-compile time.  For example, the integer "``(not
   x)``" operation is actually defined as a pattern fragment that expands as
   "``(xor x, -1)``", since the SelectionDAG does not have a native '``not``'
   operation.  Targets can define their own short-hand fragments as they see fit.
diff --git a/docs/CommandGuide/FileCheck.rst b/docs/CommandGuide/FileCheck.rst
index 413b6f41b0cc25484f5861a636e54fb18e8b7979..8830c394b212fc343a1e6ae9cd87ccb9531dbf4a 100644
--- a/docs/CommandGuide/FileCheck.rst
+++ b/docs/CommandGuide/FileCheck.rst
@@ -77,6 +77,15 @@ OPTIONS
   -verify``. With this option FileCheck will verify that input does not contain
   warnings not covered by any ``CHECK:`` patterns.
 
+.. option:: --enable-var-scope
+
+  Enables scope for regex variables.
+
+  Variables with names that start with ``$`` are considered global and
+  remain set throughout the file.
+
+  All other variables get undefined after each encountered ``CHECK-LABEL``.
+
 .. option:: -version
 
  Show the version number of this program.
@@ -344,6 +353,9 @@ matched by the directive cannot also be matched by any other check present in
 other unique identifiers. Conceptually, the presence of ``CHECK-LABEL`` divides
 the input stream into separate blocks, each of which is processed independently,
 preventing a ``CHECK:`` directive in one block matching a line in another block.
+If ``--enable-var-scope`` is in effect, all local variables are cleared at the
+beginning of the block.
+
 For example,
 
 .. code-block:: llvm
@@ -436,6 +448,13 @@ were defined on. For example:
 Can be useful if you want the operands of ``op`` to be the same register,
 and don't care exactly which register it is.
 
+If ``--enable-var-scope`` is in effect, variables with names that
+start with ``$`` are considered to be global. All other variables are
+local.  All local variables get undefined at the beginning of each
+CHECK-LABEL block. Global variables are not affected by CHECK-LABEL.
+This makes it easier to ensure that individual tests are not affected
+by variables set in preceding tests.
+
 FileCheck Expressions
 ~~~~~~~~~~~~~~~~~~~~~
 
diff --git a/docs/CommandGuide/lit.rst b/docs/CommandGuide/lit.rst
index 2e9054b1ce90836d6495e26298dd8c79e220d8dc..b8299d44d48ec552d8ff90769431baec236f8f0e 100644
--- a/docs/CommandGuide/lit.rst
+++ b/docs/CommandGuide/lit.rst
@@ -56,7 +56,7 @@ GENERAL OPTIONS
  Search for :file:`{NAME}.cfg` and :file:`{NAME}.site.cfg` when searching for
  test suites, instead of :file:`lit.cfg` and :file:`lit.site.cfg`.
 
-.. option:: -D NAME, -D NAME=VALUE, --param NAME, --param NAME=VALUE
+.. option:: -D NAME[=VALUE], --param NAME[=VALUE]
 
  Add a user defined parameter ``NAME`` with the given ``VALUE`` (or the empty
  string if not given).  The meaning and use of these parameters is test suite
@@ -379,7 +379,7 @@ PRE-DEFINED SUBSTITUTIONS
 ~~~~~~~~~~~~~~~~~~~~~~~~~~
 
 :program:`lit` provides various patterns that can be used with the RUN command.
-These are defined in TestRunner.py.
+These are defined in TestRunner.py. The base set of substitutions are:
 
  ========== ==============
   Macro      Substitution
@@ -391,17 +391,13 @@ These are defined in TestRunner.py.
  %t         temporary file name unique to the test
  %T         temporary directory unique to the test
  %%         %
- %/s        same as %s but replace all / with \\
- %/S        same as %S but replace all / with \\
- %/p        same as %p but replace all / with \\
- %/t        same as %t but replace all / with \\
- %/T        same as %T but replace all / with \\
  ========== ==============
 
-Further substitution patterns might be defined by each test module.
-See the modules :ref:`local-configuration-files`.
+Other substitutions are provided that are variations on this base set and
+further substitution patterns can be defined by each test module. See the
+modules :ref:`local-configuration-files`.
 
-More information on the testing infrastucture can be found in the
+More detailed information on substitutions can be found in the
 :doc:`../TestingGuide`.
 
 TEST RUN OUTPUT FORMAT
diff --git a/docs/CommandGuide/llvm-cov.rst b/docs/CommandGuide/llvm-cov.rst
index 4c0354c0d608fe8f1c7164efee4c52d8ddd213c0..ea2e625bc4d27e675cbb1702b1cf1ee631767e68 100644
--- a/docs/CommandGuide/llvm-cov.rst
+++ b/docs/CommandGuide/llvm-cov.rst
@@ -322,6 +322,10 @@ OPTIONS
  universal binary or to use an architecture that does not match a
  non-universal binary.
 
+.. option:: -show-functions
+
+ Show coverage summaries for each function.
+
 .. program:: llvm-cov export
 
 .. _llvm-cov-export:
diff --git a/docs/CommandGuide/llvm-profdata.rst b/docs/CommandGuide/llvm-profdata.rst
index bae0ff7d4ce07bae4b18f453f1188cb584261a5b..f7aa8309485b1be2eede390db1e52899cb99be57 100644
--- a/docs/CommandGuide/llvm-profdata.rst
+++ b/docs/CommandGuide/llvm-profdata.rst
@@ -196,6 +196,10 @@ OPTIONS
 
  Specify that the input profile is a sample-based profile.
 
+.. option:: -memop-sizes
+
+ Show the profiled sizes of the memory intrinsic calls for shown functions.
+
 EXIT STATUS
 -----------
 
diff --git a/docs/Coroutines.rst b/docs/Coroutines.rst
index 0e7cde7aa38ba9457f90477128eae0d2b8fb51c4..f7a38577fe8eb603944593c6c84907267ad97730 100644
--- a/docs/Coroutines.rst
+++ b/docs/Coroutines.rst
@@ -89,7 +89,7 @@ and 6 after which the coroutine will be destroyed.
 
 The LLVM IR for this coroutine looks like this:
 
-.. code-block:: none
+.. code-block:: llvm
 
   define i8* @f(i32 %n) {
   entry:
@@ -110,7 +110,7 @@ The LLVM IR for this coroutine looks like this:
     call void @free(i8* %mem)
     br label %suspend
   suspend:
-    call void @llvm.coro.end(i8* %hdl, i1 false)
+    %unused = call i1 @llvm.coro.end(i8* %hdl, i1 false)
     ret i8* %hdl
   }
 
@@ -156,7 +156,7 @@ We also store addresses of the resume and destroy functions so that the
 when its identity cannot be determined statically at compile time. For our 
 example, the coroutine frame will be:
 
-.. code-block:: text
+.. code-block:: llvm
 
   %f.frame = type { void (%f.frame*)*, void (%f.frame*)*, i32 }
 
@@ -164,7 +164,7 @@ After resume and destroy parts are outlined, function `f` will contain only the
 code responsible for creation and initialization of the coroutine frame and 
 execution of the coroutine until a suspend point is reached:
 
-.. code-block:: none
+.. code-block:: llvm
 
   define i8* @f(i32 %n) {
   entry:
@@ -224,7 +224,7 @@ In the entry block, we will call `coro.alloc`_ intrinsic that will return `true`
 when dynamic allocation is required, and `false` if dynamic allocation is 
 elided.
 
-.. code-block:: none
+.. code-block:: llvm
 
   entry:
     %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
@@ -242,7 +242,7 @@ In the cleanup block, we will make freeing the coroutine frame conditional on
 `coro.free`_ intrinsic. If allocation is elided, `coro.free`_ returns `null`
 thus skipping the deallocation code:
 
-.. code-block:: text
+.. code-block:: llvm
 
   cleanup:
     %mem = call i8* @llvm.coro.free(token %id, i8* %hdl)
@@ -286,7 +286,7 @@ Let's consider the coroutine that has more than one suspend point:
 Matching LLVM code would look like (with the rest of the code remaining the same
 as the code in the previous section):
 
-.. code-block:: text
+.. code-block:: llvm
 
   loop:
     %n.addr = phi i32 [ %n, %entry ], [ %inc, %loop.resume ]
@@ -383,17 +383,17 @@ point when coroutine should be ready for resumption (namely, when a resume index
 should be stored in the coroutine frame, so that it can be resumed at the 
 correct resume point):
 
-.. code-block:: text
+.. code-block:: llvm
 
   if.true:
     %save1 = call token @llvm.coro.save(i8* %hdl)
-    call void async_op1(i8* %hdl)
+    call void @async_op1(i8* %hdl)
     %suspend1 = call i1 @llvm.coro.suspend(token %save1, i1 false)
     switch i8 %suspend1, label %suspend [i8 0, label %resume1
                                          i8 1, label %cleanup]
   if.false:
     %save2 = call token @llvm.coro.save(i8* %hdl)
-    call void async_op2(i8* %hdl)
+    call void @async_op2(i8* %hdl)
     %suspend2 = call i1 @llvm.coro.suspend(token %save2, i1 false)
     switch i8 %suspend1, label %suspend [i8 0, label %resume2
                                          i8 1, label %cleanup]
@@ -411,7 +411,7 @@ be used to communicate with the coroutine. This distinguished alloca is called
 The following coroutine designates a 32 bit integer `promise` and uses it to
 store the current value produced by a coroutine.
 
-.. code-block:: text
+.. code-block:: llvm
 
   define i8* @f(i32 %n) {
   entry:
@@ -440,7 +440,7 @@ store the current value produced by a coroutine.
     call void @free(i8* %mem)
     br label %suspend
   suspend:
-    call void @llvm.coro.end(i8* %hdl, i1 false)
+    %unused = call i1 @llvm.coro.end(i8* %hdl, i1 false)
     ret i8* %hdl
   }
 
@@ -692,7 +692,7 @@ a coroutine user are responsible to makes sure there is no data races.
 Example:
 """"""""
 
-.. code-block:: text
+.. code-block:: llvm
 
   define i8* @f(i32 %n) {
   entry:
@@ -812,7 +812,7 @@ pointer that was returned by prior `coro.begin` call.
 Example (custom deallocation function):
 """""""""""""""""""""""""""""""""""""""
 
-.. code-block:: text
+.. code-block:: llvm
 
   cleanup:
     %mem = call i8* @llvm.coro.free(token %id, i8* %frame)
@@ -827,7 +827,7 @@ Example (custom deallocation function):
 Example (standard deallocation functions):
 """"""""""""""""""""""""""""""""""""""""""
 
-.. code-block:: text
+.. code-block:: llvm
 
   cleanup:
     %mem = call i8* @llvm.coro.free(token %id, i8* %frame)
@@ -864,7 +864,7 @@ when possible.
 Example:
 """"""""
 
-.. code-block:: text
+.. code-block:: llvm
 
   entry:
     %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
@@ -955,41 +955,90 @@ A frontend should emit exactly one `coro.id` intrinsic per coroutine.
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 ::
 
-  declare void @llvm.coro.end(i8* <handle>, i1 <unwind>)
+  declare i1 @llvm.coro.end(i8* <handle>, i1 <unwind>)
 
 Overview:
 """""""""
 
 The '``llvm.coro.end``' marks the point where execution of the resume part of 
-the coroutine should end and control returns back to the caller.
+the coroutine should end and control should return to the caller.
 
 
 Arguments:
 """"""""""
 
-The first argument should refer to the coroutine handle of the enclosing coroutine.
+The first argument should refer to the coroutine handle of the enclosing
+coroutine. A frontend is allowed to supply null as the first parameter, in this
+case `coro-early` pass will replace the null with an appropriate coroutine 
+handle value.
 
 The second argument should be `true` if this coro.end is in the block that is 
-part of the unwind sequence leaving the coroutine body due to exception prior to
-the first reaching any suspend points, and `false` otherwise.
+part of the unwind sequence leaving the coroutine body due to an exception and 
+`false` otherwise.
 
 Semantics:
 """"""""""
-The `coro.end`_ intrinsic is a no-op during an initial invocation of the 
-coroutine. When the coroutine resumes, the intrinsic marks the point when 
-coroutine need to return control back to the caller.
+The purpose of this intrinsic is to allow frontends to mark the cleanup and
+other code that is only relevant during the initial invocation of the coroutine
+and should not be present in resume and destroy parts. 
 
-This intrinsic is removed by the CoroSplit pass when a coroutine is split into
-the start, resume and destroy parts. In start part, the intrinsic is removed,
-in resume and destroy parts, it is replaced with `ret void` instructions and
+This intrinsic is lowered when a coroutine is split into
+the start, resume and destroy parts. In the start part, it is a no-op,
+in resume and destroy parts, it is replaced with `ret void` instruction and
 the rest of the block containing `coro.end` instruction is discarded.
-
 In landing pads it is replaced with an appropriate instruction to unwind to 
-caller.
+caller. The handling of coro.end differs depending on whether the target is 
+using landingpad or WinEH exception model.
+
+For landingpad based exception model, it is expected that frontend uses the 
+`coro.end`_ intrinsic as follows:
+
+.. code-block:: llvm
+
+    ehcleanup:
+      %InResumePart = call i1 @llvm.coro.end(i8* null, i1 true)
+      br i1 %InResumePart, label %eh.resume, label %cleanup.cont
 
-A frontend is allowed to supply null as the first parameter, in this case 
-`coro-early` pass will replace the null with an appropriate coroutine handle
-value.
+    cleanup.cont:
+      ; rest of the cleanup
+
+    eh.resume:
+      %exn = load i8*, i8** %exn.slot, align 8
+      %sel = load i32, i32* %ehselector.slot, align 4
+      %lpad.val = insertvalue { i8*, i32 } undef, i8* %exn, 0
+      %lpad.val29 = insertvalue { i8*, i32 } %lpad.val, i32 %sel, 1
+      resume { i8*, i32 } %lpad.val29
+
+The `CoroSplit` pass replaces `coro.end` with ``True`` in the resume functions,
+thus leading to immediate unwind to the caller, whereas in start function it
+is replaced with ``False``, thus allowing to proceed to the rest of the cleanup
+code that is only needed during initial invocation of the coroutine.
+
+For Windows Exception handling model, a frontend should attach a funclet bundle
+referring to an enclosing cleanuppad as follows:
+
+.. code-block:: llvm
+
+    ehcleanup: 
+      %tok = cleanuppad within none []
+      %unused = call i1 @llvm.coro.end(i8* null, i1 true) [ "funclet"(token %tok) ]
+      cleanupret from %tok unwind label %RestOfTheCleanup
+
+The `CoroSplit` pass, if the funclet bundle is present, will insert 
+``cleanupret from %tok unwind to caller`` before
+the `coro.end`_ intrinsic and will remove the rest of the block.
+
+The following table summarizes the handling of `coro.end`_ intrinsic.
+
++--------------------------+-------------------+-------------------------------+
+|                          | In Start Function | In Resume/Destroy Functions   |
++--------------------------+-------------------+-------------------------------+
+|unwind=false              | nothing           |``ret void``                   |
++------------+-------------+-------------------+-------------------------------+
+|            | WinEH       | nothing           |``cleanupret unwind to caller``|
+|unwind=true +-------------+-------------------+-------------------------------+
+|            | Landingpad  | nothing           | nothing                       |
++------------+-------------+-------------------+-------------------------------+
 
 .. _coro.suspend:
 .. _suspend points:
@@ -1025,7 +1074,7 @@ basic blocks.
 Example (normal suspend point):
 """""""""""""""""""""""""""""""
 
-.. code-block:: text
+.. code-block:: llvm
 
     %0 = call i8 @llvm.coro.suspend(token none, i1 false)
     switch i8 %0, label %suspend [i8 0, label %resume
@@ -1034,7 +1083,7 @@ Example (normal suspend point):
 Example (final suspend point):
 """"""""""""""""""""""""""""""
 
-.. code-block:: text
+.. code-block:: llvm
 
   while.end:
     %s.final = call i8 @llvm.coro.suspend(token none, i1 true)
@@ -1095,10 +1144,10 @@ In such a case, a coroutine should be ready for resumption prior to a call to
 a different thread possibly prior to `async_op` call returning control back
 to the coroutine:
 
-.. code-block:: text
+.. code-block:: llvm
 
     %save1 = call token @llvm.coro.save(i8* %hdl)
-    call void async_op1(i8* %hdl)
+    call void @async_op1(i8* %hdl)
     %suspend1 = call i1 @llvm.coro.suspend(token %save1, i1 false)
     switch i8 %suspend1, label %suspend [i8 0, label %resume1
                                          i8 1, label %cleanup]
diff --git a/docs/DeveloperPolicy.rst b/docs/DeveloperPolicy.rst
index 9ec6fb84636f451900ea4ff731533c5ec09b5bec..97e0572343798a35191efd2fd3f1be4d72cbf74c 100644
--- a/docs/DeveloperPolicy.rst
+++ b/docs/DeveloperPolicy.rst
@@ -62,7 +62,7 @@ way to see what other people are interested in and watching the flow of the
 project as a whole.
 
 We recommend that active developers register an email account with `LLVM
-Bugzilla <http://llvm.org/bugs/>`_ and preferably subscribe to the `llvm-bugs
+Bugzilla <https://bugs.llvm.org/>`_ and preferably subscribe to the `llvm-bugs
 <http://lists.llvm.org/mailman/listinfo/llvm-bugs>`_ email list to keep track
 of bugs and enhancements occurring in LLVM.  We really appreciate people who are
 proactive at catching incoming bugs in their components and dealing with them
@@ -261,7 +261,7 @@ the future that the change is responsible for.  For example:
 * The changes should not cause performance or correctness regressions in code
   compiled by LLVM on all applicable targets.
 
-* You are expected to address any `Bugzilla bugs <http://llvm.org/bugs/>`_ that
+* You are expected to address any `Bugzilla bugs <https://bugs.llvm.org/>`_ that
   result from your change.
 
 We prefer for this to be handled before submission but understand that it isn't
diff --git a/docs/Extensions.rst b/docs/Extensions.rst
index 2b12123cdf6889ba18e0e5a976490281d65b7abc..14fea30204b4cb235a1175f6889daffc4687aa47 100644
--- a/docs/Extensions.rst
+++ b/docs/Extensions.rst
@@ -204,9 +204,49 @@ For example, the following code creates two sections named ``.text``.
 The unique number is not present in the resulting object at all. It is just used
 in the assembler to differentiate the sections.
 
+The 'o' flag is mapped to SHF_LINK_ORDER. If it is present, a symbol
+must be given that identifies the section to be placed in the
+.sh_link.
+
+.. code-block:: gas
+
+        .section .foo,"a",@progbits
+        .Ltmp:
+        .section .bar,"ao",@progbits,.Ltmp
+
+which is equivalent to just
+
+.. code-block:: gas
+
+        .section .foo,"a",@progbits
+        .section .bar,"ao",@progbits,.foo
+
+
 Target Specific Behaviour
 =========================
 
+X86
+---
+
+Relocations
+^^^^^^^^^^^
+
+``@ABS8`` can be applied to symbols which appear as immediate operands to
+instructions that have an 8-bit immediate form for that operand. It causes
+the assembler to use the 8-bit form and an 8-bit relocation (e.g. ``R_386_8``
+or ``R_X86_64_8``) for the symbol.
+
+For example:
+
+.. code-block:: gas
+
+  cmpq $foo@ABS8, %rdi
+
+This causes the assembler to select the form of the 64-bit ``cmpq`` instruction
+that takes an 8-bit immediate operand that is sign extended to 64 bits, as
+opposed to ``cmpq $foo, %rdi`` which takes a 32-bit immediate operand. This
+is also not the same as ``cmpb $foo, %dil``, which is an 8-bit comparison.
+
 Windows on ARM
 --------------
 
diff --git a/docs/FaultMaps.rst b/docs/FaultMaps.rst
index 4ecdd86d7693c4bb31b38630176200c6cd757389..d63ff5a84394808d50299b8b2785121b9e18bbc2 100644
--- a/docs/FaultMaps.rst
+++ b/docs/FaultMaps.rst
@@ -47,12 +47,18 @@ The format of this section is
     uint32 : NumFaultingPCs
     uint32 : Reserved (expected to be 0)
     FunctionFaultInfo[NumFaultingPCs] {
-      uint32  : FaultKind = FaultMaps::FaultingLoad (only legal value currently)
+      uint32  : FaultKind
       uint32  : FaultingPCOffset
       uint32  : HandlerPCOffset
     }
   }
 
+FaultKind describes the reason for the expected fault. Currently three kinds
+of faults are supported:
+
+  1. ``FaultMaps::FaultingLoad`` - fault due to load from memory.
+  2. ``FaultMaps::FaultingLoadStore`` - fault due to instruction load and store.
+  3. ``FaultMaps::FaultingStore`` - fault due to store to memory.
 
 The ``ImplicitNullChecks`` pass
 ===============================
diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst
index 969976cbccc8f690088f6c6313829fb1aa9555f2..a88860310f642937a553d512bfa3f28e6b5af044 100644
--- a/docs/GettingStarted.rst
+++ b/docs/GettingStarted.rst
@@ -58,6 +58,12 @@ Here's the short story for getting up and running quickly with LLVM:
    * ``cd llvm/tools``
    * ``svn co http://llvm.org/svn/llvm-project/lld/trunk lld``
 
+#. Checkout Polly Loop Optimizer **[Optional]**:
+
+   * ``cd where-you-want-llvm-to-live``
+   * ``cd llvm/tools``
+   * ``svn co http://llvm.org/svn/llvm-project/polly/trunk polly``
+
 #. Checkout Compiler-RT (required to build the sanitizers) **[Optional]**:
 
    * ``cd where-you-want-llvm-to-live``
@@ -727,8 +733,8 @@ Or a combination of multiple projects:
   % mkdir clang-build && cd clang-build
   % cmake -GNinja ../llvm-project/llvm -DLLVM_ENABLE_PROJECTS="clang;libcxx;libcxxabi"
 
-A helper script is provided in `llvm/utils/git-svn/git-llvm`. After you add it
-to your path, you can push committed changes upstream with `git llvm push`.
+A helper script is provided in ``llvm/utils/git-svn/git-llvm``. After you add it
+to your path, you can push committed changes upstream with ``git llvm push``.
 
 .. code-block:: console
 
@@ -737,10 +743,22 @@ to your path, you can push committed changes upstream with `git llvm push`.
 
 While this is using SVN under the hood, it does not require any interaction from
 you with git-svn.
-After a few minutes, `git pull` should get back the changes as they were
-committed. Note that a current limitation is that `git` does not directly record
-file rename, and thus it is propagated to SVN as a combination of delete-add
-instead of a file rename.
+After a few minutes, ``git pull`` should get back the changes as they were
+committed. Note that a current limitation is that ``git`` does not directly
+record file rename, and thus it is propagated to SVN as a combination of
+delete-add instead of a file rename.
+
+The SVN revision of each monorepo commit can be found in the commit notes.  git
+does not fetch notes by default. The following commands will fetch the notes and
+configure git to fetch future notes. Use ``git notes show $commit`` to look up
+the SVN revision of a git commit. The notes show up in ``git log``, and
+searching the log is currently the recommended way to look up the git commit
+for a given SVN revision.
+
+.. code-block:: console
+
+  % git config --add remote.origin.fetch +refs/notes/commits:refs/notes/commits
+  % git fetch
 
 If you are using `arc` to interact with Phabricator, you need to manually put it
 at the root of the checkout:
@@ -799,7 +817,8 @@ used by people developing LLVM.
 +-------------------------+----------------------------------------------------+
 | LLVM_ENABLE_SPHINX      | Build sphinx-based documentation from the source   |
 |                         | code. This is disabled by default because it is    |
-|                         | slow and generates a lot of output.                |
+|                         | slow and generates a lot of output. Sphinx version |
+|                         | 1.5 or later recommended.                          |
 +-------------------------+----------------------------------------------------+
 | LLVM_BUILD_LLVM_DYLIB   | Generate libLLVM.so. This library contains a       |
 |                         | default set of LLVM components that can be         |
@@ -1144,7 +1163,7 @@ the `Command Guide <CommandGuide/index.html>`_.
 ``llc``
 
   ``llc`` is the LLVM backend compiler, which translates LLVM bitcode to a
-  native code assembly file or to C code (with the ``-march=c`` option).
+  native code assembly file.
 
 ``opt``
 
diff --git a/docs/HowToAddABuilder.rst b/docs/HowToAddABuilder.rst
index 9e06a3276470b9a3a36996a683d68f9b784a91a6..fcc2293de052e3b9ad455d33fd29ed00561533d2 100644
--- a/docs/HowToAddABuilder.rst
+++ b/docs/HowToAddABuilder.rst
@@ -6,9 +6,19 @@ Introduction
 ============
 
 This document contains information about adding a build configuration and
-buildslave to private slave builder to LLVM Buildbot Infrastructure
-`<http://lab.llvm.org:8011>`_.
+buildslave to private slave builder to LLVM Buildbot Infrastructure.
 
+Buildmasters
+============
+
+There are two buildmasters running.
+
+* The main buildmaster at `<http://lab.llvm.org:8011>`_. All builders attached
+  to this machine will notify commit authors every time they break the build.
+* The staging buildbot at `<http://lab.llvm.org:8014>`_. All builders attached
+  to this machine will be completely silent by default when the build is broken.
+  Builders for experimental backends should generally be attached to this
+  buildmaster.
 
 Steps To Add Builder To LLVM Buildbot
 =====================================
@@ -73,6 +83,11 @@ Here are the steps you can follow to do so:
    * slaves are added to ``buildbot/osuosl/master/config/slaves.py``
    * builders are added to ``buildbot/osuosl/master/config/builders.py``
 
+   It is possible to whitelist email addresses to unconditionally receive notifications
+   on build failure; for this you'll need to add an ``InformativeMailNotifier`` to
+   ``buildbot/osuosl/master/config/status.py``. This is particularly useful for the
+   staging buildmaster which is silent otherwise.
+
 #. Send the buildslave access name and the access password directly to
    `Galina Kistanova <mailto:gkistanova@gmail.com>`_, and wait till she
    will let you know that your changes are applied and buildmaster is
diff --git a/docs/HowToSubmitABug.rst b/docs/HowToSubmitABug.rst
index 9f997d2757dd967854599329ca30c73935f62c7b..25cb2c8c80d3681495c6ae789d969dea81443d37 100644
--- a/docs/HowToSubmitABug.rst
+++ b/docs/HowToSubmitABug.rst
@@ -19,7 +19,7 @@ section to narrow down the bug so that the person who fixes it will be able
 to find the problem more easily.
 
 Once you have a reduced test-case, go to `the LLVM Bug Tracking System
-<http://llvm.org/bugs/enter_bug.cgi>`_ and fill out the form with the
+<https://bugs.llvm.org/enter_bug.cgi>`_ and fill out the form with the
 necessary details (note that you don't need to pick a category, just use
 the "new-bugs" category if you're not sure).  The bug description should
 contain the following information:
diff --git a/docs/HowToUseAttributes.rst b/docs/HowToUseAttributes.rst
index 66c44c01f631cb633cd22cb5a4919f5262ee2650..1d05e238587406f4d2aca5729ddc70fc4de5ad79 100644
--- a/docs/HowToUseAttributes.rst
+++ b/docs/HowToUseAttributes.rst
@@ -38,36 +38,35 @@ Because attributes are no longer represented as a bit mask, you will need to
 convert any code which does treat them as a bit mask to use the new query
 methods on the Attribute class.
 
-``AttributeSet``
-================
+``AttributeList``
+=================
 
-The ``AttributeSet`` class replaces the old ``AttributeList`` class.  The
-``AttributeSet`` stores a collection of Attribute objects for each kind of
-object that may have an attribute associated with it: the function as a
-whole, the return type, or the function's parameters.  A function's attributes
-are at index ``AttributeSet::FunctionIndex``; the return type's attributes are
-at index ``AttributeSet::ReturnIndex``; and the function's parameters'
-attributes are at indices 1, ..., n (where 'n' is the number of parameters).
-Most methods on the ``AttributeSet`` class take an index parameter.
+The ``AttributeList`` stores a collection of Attribute objects for each kind of
+object that may have an attribute associated with it: the function as a whole,
+the return type, or the function's parameters.  A function's attributes are at
+index ``AttributeList::FunctionIndex``; the return type's attributes are at
+index ``AttributeList::ReturnIndex``; and the function's parameters' attributes
+are at indices 1, ..., n (where 'n' is the number of parameters).  Most methods
+on the ``AttributeList`` class take an index parameter.
 
-An ``AttributeSet`` is also a uniqued and immutable object.  You create an
-``AttributeSet`` through the ``AttributeSet::get`` methods.  You can add and
-remove attributes, which result in the creation of a new ``AttributeSet``.
+An ``AttributeList`` is also a uniqued and immutable object.  You create an
+``AttributeList`` through the ``AttributeList::get`` methods.  You can add and
+remove attributes, which result in the creation of a new ``AttributeList``.
 
-An ``AttributeSet`` object is designed to be passed around by value.
+An ``AttributeList`` object is designed to be passed around by value.
 
-Note: It is advised that you do *not* use the ``AttributeSet`` "introspection"
+Note: It is advised that you do *not* use the ``AttributeList`` "introspection"
 methods (e.g. ``Raw``, ``getRawPointer``, etc.).  These methods break
 encapsulation, and may be removed in a future release (i.e. LLVM 4.0).
 
 ``AttrBuilder``
 ===============
 
-Lastly, we have a "builder" class to help create the ``AttributeSet`` object
+Lastly, we have a "builder" class to help create the ``AttributeList`` object
 without having to create several different intermediate uniqued
-``AttributeSet`` objects.  The ``AttrBuilder`` class allows you to add and
+``AttributeList`` objects.  The ``AttrBuilder`` class allows you to add and
 remove attributes at will.  The attributes won't be uniqued until you call the
-appropriate ``AttributeSet::get`` method.
+appropriate ``AttributeList::get`` method.
 
 An ``AttrBuilder`` object is *not* designed to be passed around by value.  It
 should be passed by reference.
diff --git a/docs/LLVMBuild.rst b/docs/LLVMBuild.rst
index a93dcf644084d2eb9ac4af5ece264865a99f9e81..622780aee3124f806e30968119bf726363fdbfef 100644
--- a/docs/LLVMBuild.rst
+++ b/docs/LLVMBuild.rst
@@ -54,7 +54,7 @@ handled by another build system (See: :doc:`CMake <CMake>`).
 The build system implementation will load the relevant contents of the
 LLVMBuild files and use that to drive the actual project build.
 Typically, the build system will only need to load this information at
-"configure" time, and use it to generative native information. Build
+"configure" time, and use it to generate native information. Build
 systems will also handle automatically reconfiguring their information
 when the contents of the ``LLVMBuild.txt`` files change.
 
diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index 47d96290e6eba52135e2b19b89a8752ece9bbca5..363847af0a8a877311fe5bae506076d906c827cd 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -195,7 +195,7 @@ linkage:
 ``private``
     Global values with "``private``" linkage are only directly
     accessible by objects in the current module. In particular, linking
-    code into a module with an private global value may cause the
+    code into a module with a private global value may cause the
     private to be renamed as necessary to avoid collisions. Because the
     symbol is private to the module, all references can be updated. This
     doesn't show up in any symbol table in the object file.
@@ -1474,8 +1474,10 @@ example:
     any mutable state (e.g. memory, control registers, etc) visible to
     caller functions. It does not write through any pointer arguments
     (including ``byval`` arguments) and never changes any state visible
-    to callers. This means that it cannot unwind exceptions by calling
-    the ``C++`` exception throwing methods.
+    to callers. This means that while it cannot unwind exceptions by calling
+    the ``C++`` exception throwing methods (since they write to memory), there may
+    be non-``C++`` mechanisms that throw exceptions without writing to LLVM
+    visible memory.
 
     On an argument, this attribute indicates that the function does not
     dereference that pointer argument, even though it may read or write the
@@ -1487,9 +1489,10 @@ example:
     caller functions. It may dereference pointer arguments and read
     state that may be set in the caller. A readonly function always
     returns the same value (or unwinds an exception identically) when
-    called with the same set of arguments and global state. It cannot
-    unwind an exception by calling the ``C++`` exception throwing
-    methods.
+    called with the same set of arguments and global state.  This means that
+    while it cannot unwind exceptions by calling the ``C++`` exception throwing
+    methods (since they write to memory), there may be non-``C++`` mechanisms
+    that throw exceptions without writing to LLVM visible memory.
 
     On an argument, this attribute indicates that the function does not write
     through this pointer argument, even though it may write to the memory that
@@ -1809,6 +1812,9 @@ as follows:
     must be a multiple of 8-bits. If omitted, the natural stack
     alignment defaults to "unspecified", which does not prevent any
     alignment promotions.
+``A<address space>``
+    Specifies the address space of objects created by '``alloca``'.
+    Defaults to the default address space of 0.
 ``p[n]:<size>:<abi>:<pref>``
     This specifies the *size* of a pointer and its ``<abi>`` and
     ``<pref>``\erred alignments for address space ``n``. All sizes are in
@@ -2191,6 +2197,10 @@ otherwise unsafe floating point transformations.
    Allow Reciprocal - Allow optimizations to use the reciprocal of an
    argument rather than perform division.
 
+``contract``
+   Allow floating-point contraction (e.g. fusing a multiply followed by an
+   addition into a fused multiply-and-add).
+
 ``fast``
    Fast - Allow algebraically equivalent transformations that may
    dramatically change results in floating point (e.g. reassociate). This
@@ -3199,6 +3209,22 @@ resulting assembly string is parsed by LLVM's integrated assembler unless it is
 disabled -- even when emitting a ``.s`` file -- and thus must contain assembly
 syntax known to LLVM.
 
+LLVM also supports a few more substitutions useful for writing inline assembly:
+
+- ``${:uid}``: Expands to a decimal integer unique to this inline assembly blob.
+  This substitution is useful when declaring a local label. Many standard
+  compiler optimizations, such as inlining, may duplicate an inline asm blob.
+  Adding a blob-unique identifier ensures that the two labels will not conflict
+  during assembly. This is used to implement `GCC's %= special format
+  string <https://gcc.gnu.org/onlinedocs/gcc/Extended-Asm.html>`_.
+- ``${:comment}``: Expands to the comment character of the current target's
+  assembly dialect. This is usually ``#``, but many targets use other strings,
+  such as ``;``, ``//``, or ``!``.
+- ``${:private}``: Expands to the assembler private label prefix. Labels with
+  this prefix will not appear in the symbol table of the assembled object.
+  Typically the prefix is ``L``, but targets may use other strings. ``.L`` is
+  relatively popular.
+
 LLVM's support for inline asm is modeled closely on the requirements of Clang's
 GCC-compatible inline-asm support. Thus, the feature-set and the constraint and
 modifier codes listed here are similar or identical to those in GCC's inline asm
@@ -3987,7 +4013,9 @@ DICompileUnit
 ``retainedTypes:``, ``subprograms:``, ``globals:``, ``imports:`` and ``macros:``
 fields are tuples containing the debug info to be emitted along with the compile
 unit, regardless of code optimizations (some nodes are only emitted if there are
-references to them from instructions).
+references to them from instructions). The ``debugInfoForProfiling:`` field is a
+boolean indicating whether or not line-table discriminators are updated to
+provide more-accurate debug info for profiling results.
 
 .. code-block:: text
 
@@ -4345,24 +4373,42 @@ parameter, and it will be included in the ``variables:`` field of its
 DIExpression
 """"""""""""
 
-``DIExpression`` nodes represent DWARF expression sequences. They are used in
-:ref:`debug intrinsics<dbg_intrinsics>` (such as ``llvm.dbg.declare``) to
-describe how the referenced LLVM variable relates to the source language
-variable.
+``DIExpression`` nodes represent expressions that are inspired by the DWARF
+expression language. They are used in :ref:`debug intrinsics<dbg_intrinsics>`
+(such as ``llvm.dbg.declare`` and ``llvm.dbg.value``) to describe how the
+referenced LLVM variable relates to the source language variable.
 
 The current supported vocabulary is limited:
 
 - ``DW_OP_deref`` dereferences the working expression.
 - ``DW_OP_plus, 93`` adds ``93`` to the working expression.
-- ``DW_OP_bit_piece, 16, 8`` specifies the offset and size (``16`` and ``8``
-  here, respectively) of the variable piece from the working expression.
+- ``DW_OP_LLVM_fragment, 16, 8`` specifies the offset and size (``16`` and ``8``
+  here, respectively) of the variable fragment from the working expression. Note
+  that contrary to DW_OP_bit_piece, the offset is describing the location
+  within the described source variable.
+- ``DW_OP_swap`` swaps the top two stack entries.
+- ``DW_OP_xderef`` provides an extended dereference mechanism. The entry at the
+  top of the stack is treated as an address. The second stack entry is treated
+  as an address space identifier.
+- ``DW_OP_stack_value`` marks a constant value.
+
+DIExpression nodes that contain a ``DW_OP_stack_value`` operator are standalone
+location descriptions that describe constant values. This form is used to
+describe global constants that have been optimized away. All other expressions
+are modifiers to another location: A debug intrinsic ties a location and a
+DIExpression together. Contrary to DWARF expressions, a DIExpression always
+describes the *value* of a source variable and never its *address*. In DWARF
+terminology, a DIExpression can always be considered an implicit location
+description regardless of whether it contains a ``DW_OP_stack_value`` or not.
 
 .. code-block:: text
 
     !0 = !DIExpression(DW_OP_deref)
     !1 = !DIExpression(DW_OP_plus, 3)
     !2 = !DIExpression(DW_OP_bit_piece, 3, 7)
-    !3 = !DIExpression(DW_OP_deref, DW_OP_plus, 3, DW_OP_bit_piece, 3, 7)
+    !3 = !DIExpression(DW_OP_deref, DW_OP_plus, 3, DW_OP_LLVM_fragment, 3, 7)
+    !4 = !DIExpression(DW_OP_constu, 2, DW_OP_swap, DW_OP_xderef)
+    !5 = !DIExpression(DW_OP_constu, 42, DW_OP_stack_value)
 
 DIObjCProperty
 """"""""""""""
@@ -4415,37 +4461,156 @@ appear in the included source file.
 ^^^^^^^^^^^^^^^^^^^
 
 In LLVM IR, memory does not have types, so LLVM's own type system is not
-suitable for doing TBAA. Instead, metadata is added to the IR to
-describe a type system of a higher level language. This can be used to
-implement typical C/C++ TBAA, but it can also be used to implement
-custom alias analysis behavior for other languages.
+suitable for doing type based alias analysis (TBAA). Instead, metadata is
+added to the IR to describe a type system of a higher level language. This
+can be used to implement C/C++ strict type aliasing rules, but it can also
+be used to implement custom alias analysis behavior for other languages.
+
+This description of LLVM's TBAA system is broken into two parts:
+:ref:`Semantics<tbaa_node_semantics>` talks about high level issues, and
+:ref:`Representation<tbaa_node_representation>` talks about the metadata
+encoding of various entities.
+
+It is always possible to trace any TBAA node to a "root" TBAA node (details
+in the :ref:`Representation<tbaa_node_representation>` section).  TBAA
+nodes with different roots have an unknown aliasing relationship, and LLVM
+conservatively infers ``MayAlias`` between them.  The rules mentioned in
+this section only pertain to TBAA nodes living under the same root.
+
+.. _tbaa_node_semantics:
+
+Semantics
+"""""""""
+
+The TBAA metadata system, referred to as "struct path TBAA" (not to be
+confused with ``tbaa.struct``), consists of the following high level
+concepts: *Type Descriptors*, further subdivided into scalar type
+descriptors and struct type descriptors; and *Access Tags*.
+
+**Type descriptors** describe the type system of the higher level language
+being compiled.  **Scalar type descriptors** describe types that do not
+contain other types.  Each scalar type has a parent type, which must also
+be a scalar type or the TBAA root.  Via this parent relation, scalar types
+within a TBAA root form a tree.  **Struct type descriptors** denote types
+that contain a sequence of other type descriptors, at known offsets.  These
+contained type descriptors can either be struct type descriptors themselves
+or scalar type descriptors.
+
+**Access tags** are metadata nodes attached to load and store instructions.
+Access tags use type descriptors to describe the *location* being accessed
+in terms of the type system of the higher level language.  Access tags are
+tuples consisting of a base type, an access type and an offset.  The base
+type is a scalar type descriptor or a struct type descriptor, the access
+type is a scalar type descriptor, and the offset is a constant integer.
+
+The access tag ``(BaseTy, AccessTy, Offset)`` can describe one of two
+things:
+
+ * If ``BaseTy`` is a struct type, the tag describes a memory access (load
+   or store) of a value of type ``AccessTy`` contained in the struct type
+   ``BaseTy`` at offset ``Offset``.
+
+ * If ``BaseTy`` is a scalar type, ``Offset`` must be 0 and ``BaseTy`` and
+   ``AccessTy`` must be the same; and the access tag describes a scalar
+   access with scalar type ``AccessTy``.
+
+We first define an ``ImmediateParent`` relation on ``(BaseTy, Offset)``
+tuples this way:
+
+ * If ``BaseTy`` is a scalar type then ``ImmediateParent(BaseTy, 0)`` is
+   ``(ParentTy, 0)`` where ``ParentTy`` is the parent of the scalar type as
+   described in the TBAA metadata.  ``ImmediateParent(BaseTy, Offset)`` is
+   undefined if ``Offset`` is non-zero.
+
+ * If ``BaseTy`` is a struct type then ``ImmediateParent(BaseTy, Offset)``
+   is ``(NewTy, NewOffset)`` where ``NewTy`` is the type contained in
+   ``BaseTy`` at offset ``Offset`` and ``NewOffset`` is ``Offset`` adjusted
+   to be relative within that inner type.
+
+A memory access with an access tag ``(BaseTy1, AccessTy1, Offset1)``
+aliases a memory access with an access tag ``(BaseTy2, AccessTy2,
+Offset2)`` if either ``(BaseTy1, Offset1)`` is reachable from ``(BaseTy2,
+Offset2)`` via the ``ImmediateParent`` relation or vice versa.
+
+As a concrete example, the type descriptor graph for the following program
 
-The current metadata format is very simple. TBAA metadata nodes have up
-to three fields, e.g.:
+.. code-block:: c
 
-.. code-block:: llvm
+    struct Inner {
+      int i;    // offset 0
+      float f;  // offset 4
+    };
+    
+    struct Outer {
+      float f;  // offset 0
+      double d; // offset 4
+      struct Inner inner_a;  // offset 12
+    };
+    
+    void f(struct Outer* outer, struct Inner* inner, float* f, int* i, char* c) {
+      outer->f = 0;            // tag0: (OuterStructTy, FloatScalarTy, 0)
+      outer->inner_a.i = 0;    // tag1: (OuterStructTy, IntScalarTy, 12)
+      outer->inner_a.f = 0.0;  // tag2: (OuterStructTy, FloatScalarTy, 16)
+      *f = 0.0;                // tag3: (FloatScalarTy, FloatScalarTy, 0)
+    }
 
-    !0 = !{ !"an example type tree" }
-    !1 = !{ !"int", !0 }
-    !2 = !{ !"float", !0 }
-    !3 = !{ !"const float", !2, i64 1 }
-
-The first field is an identity field. It can be any value, usually a
-metadata string, which uniquely identifies the type. The most important
-name in the tree is the name of the root node. Two trees with different
-root node names are entirely disjoint, even if they have leaves with
-common names.
-
-The second field identifies the type's parent node in the tree, or is
-null or omitted for a root node. A type is considered to alias all of
-its descendants and all of its ancestors in the tree. Also, a type is
-considered to alias all types in other trees, so that bitcode produced
-from multiple front-ends is handled conservatively.
-
-If the third field is present, it's an integer which if equal to 1
-indicates that the type is "constant" (meaning
+is (note that in C and C++, ``char`` can be used to access any arbitrary
+type):
+
+.. code-block:: text
+
+    Root = "TBAA Root"
+    CharScalarTy = ("char", Root, 0)
+    FloatScalarTy = ("float", CharScalarTy, 0)
+    DoubleScalarTy = ("double", CharScalarTy, 0)
+    IntScalarTy = ("int", CharScalarTy, 0)
+    InnerStructTy = {"Inner", (IntScalarTy, 0), (FloatScalarTy, 4)}
+    OuterStructTy = {"Outer", (FloatScalarTy, 0), (DoubleScalarTy, 4),
+                     (InnerStructTy, 12)}
+
+
+with (e.g.) ``ImmediateParent(OuterStructTy, 12)`` = ``(InnerStructTy,
+0)``, ``ImmediateParent(InnerStructTy, 0)`` = ``(IntScalarTy, 0)``, and
+``ImmediateParent(IntScalarTy, 0)`` = ``(CharScalarTy, 0)``.
+
+.. _tbaa_node_representation:
+
+Representation
+""""""""""""""
+
+The root node of a TBAA type hierarchy is an ``MDNode`` with 0 operands or
+with exactly one ``MDString`` operand.
+
+Scalar type descriptors are represented as ``MDNode`` s with two
+operands.  The first operand is an ``MDString`` denoting the name of the
+scalar type.  LLVM does not assign meaning to the value of this operand, it
+only cares about it being an ``MDString``.  The second operand is an
+``MDNode`` which points to the parent for said scalar type descriptor,
+which is either another scalar type descriptor or the TBAA root.  Scalar
+type descriptors can have an optional third argument, but that must be the
+constant integer zero.
+
+Struct type descriptors are represented as ``MDNode`` s with an odd number
+of operands greater than 1.  The first operand is an ``MDString`` denoting
+the name of the struct type.  Like in scalar type descriptors the actual
+value of this name operand is irrelevant to LLVM.  After the name operand,
+the struct type descriptors have a sequence of alternating ``MDNode`` and
+``ConstantInt`` operands.  With N starting from 1, the 2N - 1 th operand,
+an ``MDNode``, denotes a contained field, and the 2N th operand, a
+``ConstantInt``, is the offset of the said contained field.  The offsets
+must be in non-decreasing order.
+
+Access tags are represented as ``MDNode`` s with either 3 or 4 operands.
+The first operand is an ``MDNode`` pointing to the node representing the
+base type.  The second operand is an ``MDNode`` pointing to the node
+representing the access type.  The third operand is a ``ConstantInt`` that
+states the offset of the access.  If a fourth field is present, it must be
+a ``ConstantInt`` valued at 0 or 1.  If it is 1 then the access tag states
+that the location being accessed is "constant" (meaning
 ``pointsToConstantMemory`` should return true; see `other useful
-AliasAnalysis methods <AliasAnalysis.html#OtherItfs>`_).
+AliasAnalysis methods <AliasAnalysis.html#OtherItfs>`_).  The TBAA root of
+the access type and the base type of an access tag must be the same, and
+that is the TBAA root of the access tag.
 
 '``tbaa.struct``' Metadata
 ^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -4956,11 +5121,48 @@ Examples:
    !0 = !{!"magic ptr"}
    !1 = !{!"other ptr"}
 
+The invariant.group metadata must be dropped when replacing one pointer by
+another based on aliasing information. This is because invariant.group is tied
+to the SSA value of the pointer operand.
+
+.. code-block:: llvm
+  
+  %v = load i8, i8* %x, !invariant.group !0
+  ; if %x mustalias %y then we can replace the above instruction with
+  %v = load i8, i8* %y
+
+
 '``type``' Metadata
 ^^^^^^^^^^^^^^^^^^^
 
 See :doc:`TypeMetadata`.
 
+'``associated``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The ``associated`` metadata may be attached to a global object
+declaration with a single argument that references another global object.
+
+This metadata prevents discarding of the global object in linker GC
+unless the referenced object is also discarded. The linker support for
+this feature is spotty. For best compatibility, globals carrying this
+metadata may also:
+
+- Be in a comdat with the referenced global.
+- Be in @llvm.compiler.used.
+- Have an explicit section with a name which is a valid C identifier.
+
+It does not have any effect on non-ELF targets.
+
+Example:
+
+.. code-block:: llvm
+
+    $a = comdat any
+    @a = global i32 1, comdat $a
+    @b = internal global i32 2, comdat $a, section "abc", !associated !0
+    !0 = !{i32* @a}
+
 
 Module Flags Metadata
 =====================
@@ -6184,7 +6386,9 @@ The value produced is the unsigned integer quotient of the two operands.
 Note that unsigned integer division and signed integer division are
 distinct operations; for signed integer division, use '``sdiv``'.
 
-Division by zero leads to undefined behavior.
+Division by zero is undefined behavior. For vectors, if any element
+of the divisor is zero, the operation has undefined behavior.
+
 
 If the ``exact`` keyword is present, the result value of the ``udiv`` is
 a :ref:`poison value <poisonvalues>` if %op1 is not a multiple of %op2 (as
@@ -6229,9 +6433,10 @@ rounded towards zero.
 Note that signed integer division and unsigned integer division are
 distinct operations; for unsigned integer division, use '``udiv``'.
 
-Division by zero leads to undefined behavior. Overflow also leads to
-undefined behavior; this is a rare case, but can occur, for example, by
-doing a 32-bit division of -2147483648 by -1.
+Division by zero is undefined behavior. For vectors, if any element
+of the divisor is zero, the operation has undefined behavior.
+Overflow also leads to undefined behavior; this is a rare case, but can
+occur, for example, by doing a 32-bit division of -2147483648 by -1.
 
 If the ``exact`` keyword is present, the result value of the ``sdiv`` is
 a :ref:`poison value <poisonvalues>` if the result would be rounded.
@@ -6314,8 +6519,10 @@ remainder.
 
 Note that unsigned integer remainder and signed integer remainder are
 distinct operations; for signed integer remainder, use '``srem``'.
-
-Taking the remainder of a division by zero leads to undefined behavior.
+ 
+Taking the remainder of a division by zero is undefined behavior.
+For vectors, if any element of the divisor is zero, the operation has 
+undefined behavior.
 
 Example:
 """"""""
@@ -6365,7 +6572,9 @@ operation <http://en.wikipedia.org/wiki/Modulo_operation>`_.
 Note that signed integer remainder and unsigned integer remainder are
 distinct operations; for unsigned integer remainder, use '``urem``'.
 
-Taking the remainder of a division by zero leads to undefined behavior.
+Taking the remainder of a division by zero is undefined behavior.
+For vectors, if any element of the divisor is zero, the operation has 
+undefined behavior.
 Overflow also leads to undefined behavior; this is a rare case, but can
 occur, for example, by taking the remainder of a 32-bit division of
 -2147483648 by -1. (The remainder doesn't actually overflow, but this
@@ -6997,7 +7206,7 @@ Syntax:
 
 ::
 
-      <result> = alloca [inalloca] <type> [, <ty> <NumElements>] [, align <alignment>]     ; yields type*:result
+      <result> = alloca [inalloca] <type> [, <ty> <NumElements>] [, align <alignment>] [, addrspace(<num>)]     ; yields type addrspace(num)*:result
 
 Overview:
 """""""""
@@ -7005,7 +7214,7 @@ Overview:
 The '``alloca``' instruction allocates memory on the stack frame of the
 currently executing function, to be automatically released when this
 function returns to its caller. The object is always allocated in the
-generic address space (address space zero).
+address space for allocas indicated in the datalayout.
 
 Arguments:
 """"""""""
@@ -7594,8 +7803,10 @@ offsets implied by the indices to the base address with infinitely
 precise signed arithmetic are not an *in bounds* address of that
 allocated object. The *in bounds* addresses for an allocated object are
 all the addresses that point into the object, plus the address one byte
-past the end. In cases where the base is a vector of pointers the
-``inbounds`` keyword applies to each of the computations element-wise.
+past the end. The only *in bounds* address for a null pointer in the
+default address-space is the null pointer itself. In cases where the
+base is a vector of pointers the ``inbounds`` keyword applies to each
+of the computations element-wise.
 
 If the ``inbounds`` keyword is not present, the offsets are added to the
 base address with silently-wrapping two's complement arithmetic. If the
@@ -9598,7 +9809,7 @@ Semantics:
       compile-time-known constant value.
 
       The return value type of :ref:`llvm.get.dynamic.area.offset <int_get_dynamic_area_offset>`
-      must match the target's generic address space's (address space 0) pointer type.
+      must match the target's default address space's (address space 0) pointer type.
 
 '``llvm.prefetch``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -10075,21 +10286,20 @@ all types however.
 Overview:
 """""""""
 
-The '``llvm.sqrt``' intrinsics return the sqrt of the specified operand,
+The '``llvm.sqrt``' intrinsics return the square root of the specified value,
 returning the same value as the libm '``sqrt``' functions would, but without
 trapping or setting ``errno``.
 
 Arguments:
 """"""""""
 
-The argument and return value are floating point numbers of the same
-type.
+The argument and return value are floating point numbers of the same type.
 
 Semantics:
 """"""""""
 
-This function returns the sqrt of the specified operand if it is a
-nonnegative floating point number.
+This function returns the square root of the operand if it is a nonnegative
+floating point number.
 
 '``llvm.powi.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
@@ -10155,8 +10365,7 @@ The '``llvm.sin.*``' intrinsics return the sine of the operand.
 Arguments:
 """"""""""
 
-The argument and return value are floating point numbers of the same
-type.
+The argument and return value are floating point numbers of the same type.
 
 Semantics:
 """"""""""
@@ -10191,8 +10400,7 @@ The '``llvm.cos.*``' intrinsics return the cosine of the operand.
 Arguments:
 """"""""""
 
-The argument and return value are floating point numbers of the same
-type.
+The argument and return value are floating point numbers of the same type.
 
 Semantics:
 """"""""""
@@ -10259,13 +10467,13 @@ all types however.
 Overview:
 """""""""
 
-The '``llvm.exp.*``' intrinsics perform the exp function.
+The '``llvm.exp.*``' intrinsics compute the base-e exponential of the specified
+value.
 
 Arguments:
 """"""""""
 
-The argument and return value are floating point numbers of the same
-type.
+The argument and return value are floating point numbers of the same type.
 
 Semantics:
 """"""""""
@@ -10294,13 +10502,13 @@ all types however.
 Overview:
 """""""""
 
-The '``llvm.exp2.*``' intrinsics perform the exp2 function.
+The '``llvm.exp2.*``' intrinsics compute the base-2 exponential of the
+specified value.
 
 Arguments:
 """"""""""
 
-The argument and return value are floating point numbers of the same
-type.
+The argument and return value are floating point numbers of the same type.
 
 Semantics:
 """"""""""
@@ -10329,13 +10537,13 @@ all types however.
 Overview:
 """""""""
 
-The '``llvm.log.*``' intrinsics perform the log function.
+The '``llvm.log.*``' intrinsics compute the base-e logarithm of the specified
+value.
 
 Arguments:
 """"""""""
 
-The argument and return value are floating point numbers of the same
-type.
+The argument and return value are floating point numbers of the same type.
 
 Semantics:
 """"""""""
@@ -10364,13 +10572,13 @@ all types however.
 Overview:
 """""""""
 
-The '``llvm.log10.*``' intrinsics perform the log10 function.
+The '``llvm.log10.*``' intrinsics compute the base-10 logarithm of the
+specified value.
 
 Arguments:
 """"""""""
 
-The argument and return value are floating point numbers of the same
-type.
+The argument and return value are floating point numbers of the same type.
 
 Semantics:
 """"""""""
@@ -10399,13 +10607,13 @@ all types however.
 Overview:
 """""""""
 
-The '``llvm.log2.*``' intrinsics perform the log2 function.
+The '``llvm.log2.*``' intrinsics compute the base-2 logarithm of the specified
+value.
 
 Arguments:
 """"""""""
 
-The argument and return value are floating point numbers of the same
-type.
+The argument and return value are floating point numbers of the same type.
 
 Semantics:
 """"""""""
@@ -12579,8 +12787,8 @@ Syntax:
 
 ::
 
-      declare i32 @llvm.objectsize.i32(i8* <object>, i1 <min>)
-      declare i64 @llvm.objectsize.i64(i8* <object>, i1 <min>)
+      declare i32 @llvm.objectsize.i32(i8* <object>, i1 <min>, i1 <nullunknown>)
+      declare i64 @llvm.objectsize.i64(i8* <object>, i1 <min>, i1 <nullunknown>)
 
 Overview:
 """""""""
@@ -12595,11 +12803,16 @@ other object.
 Arguments:
 """"""""""
 
-The ``llvm.objectsize`` intrinsic takes two arguments. The first
-argument is a pointer to or into the ``object``. The second argument is
-a boolean and determines whether ``llvm.objectsize`` returns 0 (if true)
-or -1 (if false) when the object size is unknown. The second argument
-only accepts constants.
+The ``llvm.objectsize`` intrinsic takes three arguments. The first argument is
+a pointer to or into the ``object``. The second argument determines whether
+``llvm.objectsize`` returns 0 (if true) or -1 (if false) when the object size
+is unknown. The third argument controls how ``llvm.objectsize`` acts when
+``null`` is used as its pointer argument. If it's true and the pointer is in
+address space 0, ``null`` is treated as an opaque value with an unknown number
+of bytes. Otherwise, ``llvm.objectsize`` reports 0 bytes available when given
+``null``.
+
+The second and third arguments only accept constants.
 
 Semantics:
 """"""""""
@@ -12684,6 +12897,33 @@ sufficient overall improvement in code quality. For this reason,
 that the optimizer can otherwise deduce or facts that are of little use to the
 optimizer.
 
+.. _int_ssa_copy:
+
+'``llvm.ssa_copy``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare type @llvm.ssa_copy(type %operand) returned(1) readnone
+
+Arguments:
+""""""""""
+
+The first argument is an operand which is used as the returned value.
+
+Overview:
+""""""""""
+
+The ``llvm.ssa_copy`` intrinsic can be used to attach information to
+operations by copying them and giving them new names.  For example,
+the PredicateInfo utility uses it to build Extended SSA form, and
+attach various forms of information to operands that dominate specific
+uses.  It is not meant for general use, only for building temporary
+renaming forms that require value splits at certain points.
+
 .. _type.test:
 
 '``llvm.type.test``' Intrinsic
diff --git a/docs/Lexicon.rst b/docs/Lexicon.rst
index de929bec1b0e984b2ca2910e9875892b4ef3051e..5d16091e27e5ceed8c7f9c9b4a525b8ba09cc28a 100644
--- a/docs/Lexicon.rst
+++ b/docs/Lexicon.rst
@@ -182,7 +182,7 @@ P
 
 **PR**
     Problem report. A bug filed on `the LLVM Bug Tracking System
-    <http://llvm.org/bugs/enter_bug.cgi>`_.
+    <https://bugs.llvm.org/enter_bug.cgi>`_.
 
 **PRE**
     Partial Redundancy Elimination
diff --git a/docs/OptBisect.rst b/docs/OptBisect.rst
index e9f1c2541c9c0a94d9d44be1ea3e0ae5da8c6655..5a216d419a6414fd4c12ed6246440bc1a2ba207a 100644
--- a/docs/OptBisect.rst
+++ b/docs/OptBisect.rst
@@ -60,11 +60,14 @@ like this:
   clang -O2 -mllvm -opt-bisect-limit=256 my_file.c
 
 The -opt-bisect-limit option may also be applied to link-time optimizations by
-using a prefix to indicate that this is a plug-in option for the linker.  The
+using a prefix to indicate that this is a plug-in option for the linker. The
 following syntax will set a bisect limit for LTO transformations:
 
 ::
 
+  # When using lld, or ld64 (macOS)
+  clang -flto -Wl,-mllvm,-opt-bisect-limit=256 my_file.o my_other_file.o
+  # When using Gold
   clang -flto -Wl,-plugin-opt,-opt-bisect-limit=256 my_file.o my_other_file.o
 
 LTO passes are run by a library instance invoked by the linker. Therefore any
@@ -186,12 +189,5 @@ Adding Finer Granularity
 
 Once the pass in which an incorrect transformation is performed has been
 determined, it may be useful to perform further analysis in order to determine
-which specific transformation is causing the problem.  Ideally all passes
-would be instrumented to allow skipping of individual transformations.  This
-functionality is available through the OptBisect object but it is impractical
-to proactively instrument every existing pass.  It is hoped that as developers
-find that they need a pass to be instrumented they will add the instrumentation
-and contribute it back to the LLVM source base.
-
-Helper functions will be added to simplify this level of instrumentation, but
-this work is not yet completed.  For more information, contact Andy Kaylor.
+which specific transformation is causing the problem.  Debug counters
+can be used for this purpose.
diff --git a/docs/ProgrammersManual.rst b/docs/ProgrammersManual.rst
index 1c96046bf40a7b41c266626cd5abcbd4fcace7d5..4fb67e1e6d5f8f4e091b6719526f6eafe00f0183 100644
--- a/docs/ProgrammersManual.rst
+++ b/docs/ProgrammersManual.rst
@@ -32,7 +32,7 @@ to know when working in the LLVM infrastructure, and the second describes the
 Core LLVM classes.  In the future this manual will be extended with information
 describing how to use extension libraries, such as dominator information, CFG
 traversal routines, and useful utilities like the ``InstVisitor`` (`doxygen
-<http://llvm.org/doxygen/InstVisitor_8h-source.html>`__) template.
+<http://llvm.org/doxygen/InstVisitor_8h_source.html>`__) template.
 
 .. _general:
 
@@ -108,7 +108,7 @@ they don't have some drawbacks (primarily stemming from the fact that
 ``dynamic_cast<>`` only works on classes that have a v-table).  Because they are
 used so often, you must know what they do and how they work.  All of these
 templates are defined in the ``llvm/Support/Casting.h`` (`doxygen
-<http://llvm.org/doxygen/Casting_8h-source.html>`__) file (note that you very
+<http://llvm.org/doxygen/Casting_8h_source.html>`__) file (note that you very
 rarely have to include this file directly).
 
 ``isa<>``:
@@ -225,7 +225,7 @@ and clients can call it using any one of:
 Similarly, APIs which need to return a string may return a ``StringRef``
 instance, which can be used directly or converted to an ``std::string`` using
 the ``str`` member function.  See ``llvm/ADT/StringRef.h`` (`doxygen
-<http://llvm.org/doxygen/classllvm_1_1StringRef_8h-source.html>`__) for more
+<http://llvm.org/doxygen/StringRef_8h_source.html>`__) for more
 information.
 
 You should rarely use the ``StringRef`` class directly, because it contains
@@ -482,7 +482,7 @@ that inherits from the ErrorInfo utility, E.g.:
     }
   };
 
-  char FileExists::ID; // This should be declared in the C++ file.
+  char BadFileFormat::ID; // This should be declared in the C++ file.
 
   Error printFormattedFile(StringRef Path) {
     if (<check for valid format>)
@@ -564,18 +564,18 @@ the boolean conversion operator):
 
 .. code-block:: c++
 
-  if (auto Err = canFail(...))
+  if (auto Err = mayFail(...))
     return Err; // Failure value - move error to caller.
 
   // Safe to continue: Err was checked.
 
-In contrast, the following code will always cause an abort, even if ``canFail``
+In contrast, the following code will always cause an abort, even if ``mayFail``
 returns a success value:
 
 .. code-block:: c++
 
-    canFail();
-    // Program will always abort here, even if canFail() returns Success, since
+    mayFail();
+    // Program will always abort here, even if mayFail() returns Success, since
     // the value is not checked.
 
 Failure values are considered checked once a handler for the error type has
@@ -633,6 +633,12 @@ exiting with an error code, the :ref:`ExitOnError <err_exitonerr>` utility
 may be a better choice than handleErrors, as it simplifies control flow when
 calling fallible functions.
 
+In situations where it is known that a particular call to a fallible function
+will always succeed (for example, a call to a function that can only fail on a
+subset of inputs with an input that is known to be safe) the
+:ref:`cantFail <err_cantfail>` functions can be used to remove the error type,
+simplifying control flow.
+
 StringError
 """""""""""
 
@@ -765,6 +771,43 @@ mapping can also be supplied from ``Error`` values to exit codes using the
 Use ``ExitOnError`` in your tool code where possible as it can greatly improve
 readability.
 
+.. _err_cantfail:
+
+Using cantFail to simplify safe callsites
+"""""""""""""""""""""""""""""""""""""""""
+
+Some functions may only fail for a subset of their inputs. For such functions,
+call-sites using known-safe inputs can assume that the result will be a success
+value.
+
+The cantFail functions encapsulate this by wrapping an assertion that their
+argument is a success value and, in the case of Expected<T>, unwrapping the
+T value from the Expected<T> argument:
+
+.. code-block:: c++
+
+  Error mayFail(int X);
+  Expected<int> mayFail2(int X);
+
+  void foo() {
+    cantFail(mayFail(KnownSafeValue));
+    int Y = cantFail(mayFail2(KnownSafeValue));
+    ...
+  }
+
+Like the ExitOnError utility, cantFail simplifies control flow. Their treatment
+of error cases is very different however: Where ExitOnError is guaranteed to
+terminate the program on an error input, cantFail simply asserts that the result
+is success. In debug builds this will result in an assertion failure if an error
+is encountered. In release builds the behavior of cantFail for failure values is
+undefined. As such, care must be taken in the use of cantFail: clients must be
+certain that a cantFail wrapped call really can not fail under any
+circumstances.
+
+Use of the cantFail functions should be rare in library code, but they are
+likely to be of more use in tool and unit-test code where inputs and/or
+mocked-up classes or functions may be known to be safe.
+
 Fallible constructors
 """""""""""""""""""""
 
@@ -931,7 +974,7 @@ The ``function_ref`` class template
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
 The ``function_ref``
-(`doxygen <http://llvm.org/docs/doxygen/html/classllvm_1_1function__ref_3_01Ret_07Params_8_8_8_08_4.html>`__) class
+(`doxygen <http://llvm.org/doxygen/classllvm_1_1function__ref_3_01Ret_07Params_8_8_8_08_4.html>`__) class
 template represents a reference to a callable object, templated over the type
 of the callable. This is a good choice for passing a callback to a function,
 if you don't need to hold onto the callback after the function returns. In this
@@ -981,7 +1024,7 @@ you don't want them to always be noisy.  A standard compromise is to comment
 them out, allowing you to enable them if you need them in the future.
 
 The ``llvm/Support/Debug.h`` (`doxygen
-<http://llvm.org/doxygen/Debug_8h-source.html>`__) file provides a macro named
+<http://llvm.org/doxygen/Debug_8h_source.html>`__) file provides a macro named
 ``DEBUG()`` that is a much nicer solution to this problem.  Basically, you can
 put arbitrary code into the argument of the ``DEBUG`` macro, and it is only
 executed if '``opt``' (or any other tool) is run with the '``-debug``' command
@@ -1078,7 +1121,7 @@ The ``Statistic`` class & ``-stats`` option
 -------------------------------------------
 
 The ``llvm/ADT/Statistic.h`` (`doxygen
-<http://llvm.org/doxygen/Statistic_8h-source.html>`__) file provides a class
+<http://llvm.org/doxygen/Statistic_8h_source.html>`__) file provides a class
 named ``Statistic`` that is used as a unified way to keep track of what the LLVM
 compiler is doing and how effective various optimizations are.  It is useful to
 see what optimizations are contributing to making a particular program run
@@ -1094,23 +1137,23 @@ uniform manner with the rest of the passes being executed.
 There are many examples of ``Statistic`` uses, but the basics of using it are as
 follows:
 
-#. Define your statistic like this:
+Define your statistic like this:
 
-  .. code-block:: c++
+.. code-block:: c++
 
-    #define DEBUG_TYPE "mypassname"   // This goes before any #includes.
-    STATISTIC(NumXForms, "The # of times I did stuff");
+  #define DEBUG_TYPE "mypassname"   // This goes before any #includes.
+  STATISTIC(NumXForms, "The # of times I did stuff");
 
-  The ``STATISTIC`` macro defines a static variable, whose name is specified by
-  the first argument.  The pass name is taken from the ``DEBUG_TYPE`` macro, and
-  the description is taken from the second argument.  The variable defined
-  ("NumXForms" in this case) acts like an unsigned integer.
+The ``STATISTIC`` macro defines a static variable, whose name is specified by
+the first argument.  The pass name is taken from the ``DEBUG_TYPE`` macro, and
+the description is taken from the second argument.  The variable defined
+("NumXForms" in this case) acts like an unsigned integer.
 
-#. Whenever you make a transformation, bump the counter:
+Whenever you make a transformation, bump the counter:
 
-  .. code-block:: c++
+.. code-block:: c++
 
-    ++NumXForms;   // I did stuff!
+  ++NumXForms;   // I did stuff!
 
 That's all you have to do.  To get '``opt``' to print out the statistics
 gathered, use the '``-stats``' option:
@@ -1158,6 +1201,71 @@ Obviously, with so many optimizations, having a unified framework for this stuff
 is very nice.  Making your pass fit well into the framework makes it more
 maintainable and useful.
 
+.. _DebugCounters:
+
+Adding debug counters to aid in debugging your code
+---------------------------------------------------
+
+Sometimes, when writing new passes, or trying to track down bugs, it
+is useful to be able to control whether certain things in your pass
+happen or not.  For example, there are times the minimization tooling
+can only easily give you large testcases.  You would like to narrow
+your bug down to a specific transformation happening or not happening,
+automatically, using bisection.  This is where debug counters help.
+They provide a framework for making parts of your code only execute a
+certain number of times.
+
+The ``llvm/Support/DebugCounter.h`` (`doxygen
+<http://llvm.org/doxygen/DebugCounter_8h_source.html>`__) file
+provides a class named ``DebugCounter`` that can be used to create
+command line counter options that control execution of parts of your code.
+
+Define your DebugCounter like this:
+
+.. code-block:: c++
+
+  DEBUG_COUNTER(DeleteAnInstruction, "passname-delete-instruction",
+                "Controls which instructions get deleted");
+
+The ``DEBUG_COUNTER`` macro defines a static variable, whose name
+is specified by the first argument.  The name of the counter
+(which is used on the command line) is specified by the second
+argument, and the description used in the help is specified by the
+third argument.
+
+Guard whatever code you want to control with a call to ``DebugCounter::shouldExecute``:
+
+.. code-block:: c++
+
+  if (DebugCounter::shouldExecute(DeleteAnInstruction))
+    I->eraseFromParent();
+
+That's all you have to do.  Now, using opt, you can control when this code triggers using
+the '``--debug-counter``' option.  There are two counters provided, ``skip`` and ``count``.
+``skip`` is the number of times to skip execution of the codepath.  ``count`` is the number
+of times, once we are done skipping, to execute the codepath.
+
+.. code-block:: none
+
+  $ opt --debug-counter=passname-delete-instruction-skip=1,passname-delete-instruction-count=2 -passname
+
+This will skip the above code the first time we hit it, then execute it twice, then skip the rest of the executions.
+
+So if executed on the following code:
+
+.. code-block:: llvm
+
+  %1 = add i32 %a, %b
+  %2 = add i32 %a, %b
+  %3 = add i32 %a, %b
+  %4 = add i32 %a, %b
+
+It would delete instructions ``%2`` and ``%3``.
+
+A utility is provided in ``utils/bisect-skip-count`` to binary search
+skip and count arguments. It can be used to automatically minimize the
+skip and count for a debug-counter variable.
+
 .. _ViewGraph:
 
 Viewing graphs while debugging code
@@ -2257,18 +2365,12 @@ of a ``BasicBlock`` and the number of ``Instruction``\ s it contains:
 
 .. code-block:: c++
 
-  // func is a pointer to a Function instance
-  for (Function::iterator i = func->begin(), e = func->end(); i != e; ++i)
+  Function &Func = ...
+  for (BasicBlock &BB : Func)
     // Print out the name of the basic block if it has one, and then the
     // number of instructions that it contains
-    errs() << "Basic block (name=" << i->getName() << ") has "
-               << i->size() << " instructions.\n";
-
-Note that i can be used as if it were a pointer for the purposes of invoking
-member functions of the ``Instruction`` class.  This is because the indirection
-operator is overloaded for the iterator classes.  In the above code, the
-expression ``i->size()`` is exactly equivalent to ``(*i).size()`` just like
-you'd expect.
+    errs() << "Basic block (name=" << BB.getName() << ") has "
+               << BB.size() << " instructions.\n";
 
 .. _iterate_basicblock:
 
@@ -2281,17 +2383,17 @@ a code snippet that prints out each instruction in a ``BasicBlock``:
 
 .. code-block:: c++
 
-  // blk is a pointer to a BasicBlock instance
-  for (BasicBlock::iterator i = blk->begin(), e = blk->end(); i != e; ++i)
+  BasicBlock& BB = ...
+  for (Instruction &I : BB)
      // The next statement works since operator<<(ostream&,...)
      // is overloaded for Instruction&
-     errs() << *i << "\n";
+     errs() << I << "\n";
 
 
 However, this isn't really the best way to print out the contents of a
 ``BasicBlock``!  Since the ostream operators are overloaded for virtually
 anything you'll care about, you could have just invoked the print routine on the
-basic block itself: ``errs() << *blk << "\n";``.
+basic block itself: ``errs() << BB << "\n";``.
 
 .. _iterate_insiter:
 
@@ -2425,13 +2527,13 @@ method):
       OurFunctionPass(): callCounter(0) { }
 
       virtual runOnFunction(Function& F) {
-        for (Function::iterator b = F.begin(), be = F.end(); b != be; ++b) {
-          for (BasicBlock::iterator i = b->begin(), ie = b->end(); i != ie; ++i) {
-            if (CallInst* callInst = dyn_cast<CallInst>(&*i)) {
+        for (BasicBlock &B : F) {
+          for (Instruction &I: B) {
+            if (auto *CI = dyn_cast<CallInst>(&I)) {
               // We know we've encountered a call instruction, so we
               // need to determine if it's a call to the
               // function pointed to by m_func or not.
-              if (callInst->getCalledFunction() == targetFunc)
+              if (CI->getCalledFunction() == targetFunc)
                 ++callCounter;
             }
           }
@@ -2524,12 +2626,11 @@ iterate over all predecessors of BB:
   #include "llvm/IR/CFG.h"
   BasicBlock *BB = ...;
 
-  for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
-    BasicBlock *Pred = *PI;
+  for (BasicBlock *Pred : predecessors(BB)) {
     // ...
   }
 
-Similarly, to iterate over successors use ``succ_iterator/succ_begin/succ_end``.
+Similarly, to iterate over successors use ``successors``.
 
 .. _simplechanges:
 
@@ -2554,7 +2655,7 @@ For example, an ``AllocaInst`` only *requires* a (const-ptr-to) ``Type``.  Thus:
 
 .. code-block:: c++
 
-  AllocaInst* ai = new AllocaInst(Type::Int32Ty);
+  auto *ai = new AllocaInst(Type::Int32Ty);
 
 will create an ``AllocaInst`` instance that represents the allocation of one
 integer in the current stack frame, at run time.  Each ``Instruction`` subclass
@@ -2579,7 +2680,7 @@ intending to use it within the same ``Function``.  I might do:
 
 .. code-block:: c++
 
-  AllocaInst* pa = new AllocaInst(Type::Int32Ty, 0, "indexLoc");
+  auto *pa = new AllocaInst(Type::Int32Ty, 0, "indexLoc");
 
 where ``indexLoc`` is now the logical name of the instruction's execution value,
 which is a pointer to an integer on the run time stack.
@@ -2599,7 +2700,7 @@ sequence of instructions that form a ``BasicBlock``:
 
       BasicBlock *pb = ...;
       Instruction *pi = ...;
-      Instruction *newInst = new Instruction(...);
+      auto *newInst = new Instruction(...);
 
       pb->getInstList().insert(pi, newInst); // Inserts newInst before pi in pb
 
@@ -2611,7 +2712,7 @@ sequence of instructions that form a ``BasicBlock``:
   .. code-block:: c++
 
     BasicBlock *pb = ...;
-    Instruction *newInst = new Instruction(...);
+    auto *newInst = new Instruction(...);
 
     pb->getInstList().push_back(newInst); // Appends newInst to pb
 
@@ -2620,7 +2721,7 @@ sequence of instructions that form a ``BasicBlock``:
   .. code-block:: c++
 
     BasicBlock *pb = ...;
-    Instruction *newInst = new Instruction(..., pb);
+    auto *newInst = new Instruction(..., pb);
 
   which is much cleaner, especially if you are creating long instruction
   streams.
@@ -2635,7 +2736,7 @@ sequence of instructions that form a ``BasicBlock``:
   .. code-block:: c++
 
     Instruction *pi = ...;
-    Instruction *newInst = new Instruction(...);
+    auto *newInst = new Instruction(...);
 
     pi->getParent()->getInstList().insert(pi, newInst);
 
@@ -2651,7 +2752,7 @@ sequence of instructions that form a ``BasicBlock``:
   .. code-block:: c++
 
     Instruction* pi = ...;
-    Instruction* newInst = new Instruction(..., pi);
+    auto *newInst = new Instruction(..., pi);
 
   which is much cleaner, especially if you're creating a lot of instructions and
   adding them to ``BasicBlock``\ s.
@@ -2718,7 +2819,7 @@ Replacing individual instructions
 """""""""""""""""""""""""""""""""
 
 Including "`llvm/Transforms/Utils/BasicBlockUtils.h
-<http://llvm.org/doxygen/BasicBlockUtils_8h-source.html>`_" permits use of two
+<http://llvm.org/doxygen/BasicBlockUtils_8h_source.html>`_" permits use of two
 very useful replace functions: ``ReplaceInstWithValue`` and
 ``ReplaceInstWithInst``.
 
@@ -2814,7 +2915,7 @@ is easier to read and write than the equivalent
   FunctionType *ft = FunctionType::get(Type::Int8Ty, params, false);
 
 See the `class comment
-<http://llvm.org/doxygen/TypeBuilder_8h-source.html#l00001>`_ for more details.
+<http://llvm.org/doxygen/TypeBuilder_8h_source.html#l00001>`_ for more details.
 
 .. _threading:
 
@@ -2903,7 +3004,7 @@ Another way is to only call ``getPointerToFunction()`` from the
 
 When the JIT is configured to compile lazily (using
 ``ExecutionEngine::DisableLazyCompilation(false)``), there is currently a `race
-condition <http://llvm.org/bugs/show_bug.cgi?id=5184>`_ in updating call sites
+condition <https://bugs.llvm.org/show_bug.cgi?id=5184>`_ in updating call sites
 after a function is lazily-jitted.  It's still possible to use the lazy JIT in a
 threaded program if you ensure that only one thread at a time can call any
 particular lazy stub and that the JIT lock guards any IR access, but we suggest
@@ -3235,7 +3336,7 @@ The Core LLVM Class Hierarchy Reference
 
 ``#include "llvm/IR/Type.h"``
 
-header source: `Type.h <http://llvm.org/doxygen/Type_8h-source.html>`_
+header source: `Type.h <http://llvm.org/doxygen/Type_8h_source.html>`_
 
 doxygen info: `Type Clases <http://llvm.org/doxygen/classllvm_1_1Type.html>`_
 
@@ -3339,7 +3440,7 @@ The ``Module`` class
 
 ``#include "llvm/IR/Module.h"``
 
-header source: `Module.h <http://llvm.org/doxygen/Module_8h-source.html>`_
+header source: `Module.h <http://llvm.org/doxygen/Module_8h_source.html>`_
 
 doxygen info: `Module Class <http://llvm.org/doxygen/classllvm_1_1Module.html>`_
 
@@ -3426,7 +3527,7 @@ The ``Value`` class
 
 ``#include "llvm/IR/Value.h"``
 
-header source: `Value.h <http://llvm.org/doxygen/Value_8h-source.html>`_
+header source: `Value.h <http://llvm.org/doxygen/Value_8h_source.html>`_
 
 doxygen info: `Value Class <http://llvm.org/doxygen/classllvm_1_1Value.html>`_
 
@@ -3517,7 +3618,7 @@ The ``User`` class
 
 ``#include "llvm/IR/User.h"``
 
-header source: `User.h <http://llvm.org/doxygen/User_8h-source.html>`_
+header source: `User.h <http://llvm.org/doxygen/User_8h_source.html>`_
 
 doxygen info: `User Class <http://llvm.org/doxygen/classllvm_1_1User.html>`_
 
@@ -3564,7 +3665,7 @@ The ``Instruction`` class
 ``#include "llvm/IR/Instruction.h"``
 
 header source: `Instruction.h
-<http://llvm.org/doxygen/Instruction_8h-source.html>`_
+<http://llvm.org/doxygen/Instruction_8h_source.html>`_
 
 doxygen info: `Instruction Class
 <http://llvm.org/doxygen/classllvm_1_1Instruction.html>`_
@@ -3712,7 +3813,7 @@ The ``GlobalValue`` class
 ``#include "llvm/IR/GlobalValue.h"``
 
 header source: `GlobalValue.h
-<http://llvm.org/doxygen/GlobalValue_8h-source.html>`_
+<http://llvm.org/doxygen/GlobalValue_8h_source.html>`_
 
 doxygen info: `GlobalValue Class
 <http://llvm.org/doxygen/classllvm_1_1GlobalValue.html>`_
@@ -3770,7 +3871,7 @@ The ``Function`` class
 
 ``#include "llvm/IR/Function.h"``
 
-header source: `Function.h <http://llvm.org/doxygen/Function_8h-source.html>`_
+header source: `Function.h <http://llvm.org/doxygen/Function_8h_source.html>`_
 
 doxygen info: `Function Class
 <http://llvm.org/doxygen/classllvm_1_1Function.html>`_
@@ -3879,7 +3980,7 @@ The ``GlobalVariable`` class
 ``#include "llvm/IR/GlobalVariable.h"``
 
 header source: `GlobalVariable.h
-<http://llvm.org/doxygen/GlobalVariable_8h-source.html>`_
+<http://llvm.org/doxygen/GlobalVariable_8h_source.html>`_
 
 doxygen info: `GlobalVariable Class
 <http://llvm.org/doxygen/classllvm_1_1GlobalVariable.html>`_
@@ -3937,7 +4038,7 @@ The ``BasicBlock`` class
 ``#include "llvm/IR/BasicBlock.h"``
 
 header source: `BasicBlock.h
-<http://llvm.org/doxygen/BasicBlock_8h-source.html>`_
+<http://llvm.org/doxygen/BasicBlock_8h_source.html>`_
 
 doxygen info: `BasicBlock Class
 <http://llvm.org/doxygen/classllvm_1_1BasicBlock.html>`_
diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst
index cebc59001b27935930608a49139123ba1ad85ed6..dbffb53d5a51911c5a8b08e8d0733d5579f6fdd4 100644
--- a/docs/ReleaseNotes.rst
+++ b/docs/ReleaseNotes.rst
@@ -6,9 +6,9 @@ LLVM 5.0.0 Release Notes
     :local:
 
 .. warning::
-   These are in-progress notes for the upcoming LLVM 5 release.  You may
-   prefer the `LLVM 4 Release Notes <http://llvm.org/releases/4.0.0/docs
-   /ReleaseNotes.html>`_.
+   These are in-progress notes for the upcoming LLVM 5 release.
+   Release notes for previous releases can be found on
+   `the Download Page <http://releases.llvm.org/download.html>`_.
 
 
 Introduction
diff --git a/docs/ScudoHardenedAllocator.rst b/docs/ScudoHardenedAllocator.rst
index a22051cca063009312ef876d219ed9fb8a6df4ec..e00c8324e55aba8a13c02cfe5d3f193d98cafb80 100644
--- a/docs/ScudoHardenedAllocator.rst
+++ b/docs/ScudoHardenedAllocator.rst
@@ -13,6 +13,13 @@ The Scudo Hardened Allocator is a user-mode allocator based on LLVM Sanitizer's
 CombinedAllocator, which aims at providing additional mitigations against heap
 based vulnerabilities, while maintaining good performance.
 
+Currently, the allocator supports (was tested on) the following architectures:
+
+- i386 (& i686) (32-bit);
+- x86_64 (64-bit);
+- armhf (32-bit);
+- AArch64 (64-bit).
+
 The name "Scudo" has been retained from the initial implementation (Escudo
 meaning Shield in Spanish and Portuguese).
 
@@ -31,29 +38,25 @@ header is accessed, and the process terminated.
 The following information is stored in the header:
 
 - the 16-bit checksum;
-- the user requested size for that chunk, which is necessary for reallocation
-  purposes;
+- the number of unused bytes in that chunk, which is necessary for computing the
+  size of the chunk;
 - the state of the chunk (available, allocated or quarantined);
 - the allocation type (malloc, new, new[] or memalign), to detect potential
   mismatches in the allocation APIs used;
-- whether or not the chunk is offseted (ie: if the chunk beginning is different
-  than the backend allocation beginning, which is most often the case with some
-  aligned allocations);
-- the associated offset;
-- a 16-bit salt.
+- the offset of the chunk, which is the distance in bytes from the beginning of
+  the returned chunk to the beginning of the backend allocation;
+- an 8-bit salt.
 
-On x64, which is currently the only architecture supported, the header fits
-within 16-bytes, which works nicely with the minimum alignment requirements.
+This header fits within 8 bytes, on all platforms supported.
 
-The checksum is computed as a CRC32 (requiring the SSE 4.2 instruction set)
-of the global secret, the chunk pointer itself, and the 16 bytes of header with
+The checksum is computed as a CRC32 (made faster with hardware support)
+of the global secret, the chunk pointer itself, and the 8 bytes of header with
 the checksum field zeroed out.
 
-The header is atomically loaded and stored to prevent races (this requires
-platform support such as the cmpxchg16b instruction). This is important as two
-consecutive chunks could belong to different threads. We also want to avoid
-any type of double fetches of information located in the header, and use local
-copies of the header for this purpose.
+The header is atomically loaded and stored to prevent races. This is important
+as two consecutive chunks could belong to different threads. We also want to
+avoid any type of double fetches of information located in the header, and use
+local copies of the header for this purpose.
 
 Delayed Freelist
 -----------------
@@ -94,9 +97,9 @@ You may also build Scudo like this:
 .. code::
 
   cd $LLVM/projects/compiler-rt/lib
-  clang++ -fPIC -std=c++11 -msse4.2 -mcx16 -O2 -I. scudo/*.cpp \
+  clang++ -fPIC -std=c++11 -msse4.2 -O2 -I. scudo/*.cpp \
     $(\ls sanitizer_common/*.{cc,S} | grep -v "sanitizer_termination\|sanitizer_common_nolibc") \
-    -shared -o scudo-allocator.so -lpthread
+    -shared -o scudo-allocator.so -pthread
 
 and then use it with existing binaries as follows:
 
@@ -136,29 +139,29 @@ Or using the function:
 
 The following options are available:
 
-+-----------------------------+---------+------------------------------------------------+
-| Option                      | Default | Description                                    |
-+-----------------------------+---------+------------------------------------------------+
-| QuarantineSizeMb            | 64      | The size (in Mb) of quarantine used to delay   |
-|                             |         | the actual deallocation of chunks. Lower value |
-|                             |         | may reduce memory usage but decrease the       |
-|                             |         | effectiveness of the mitigation; a negative    |
-|                             |         | value will fallback to a default of 64Mb.      |
-+-----------------------------+---------+------------------------------------------------+
-| ThreadLocalQuarantineSizeKb | 1024    | The size (in Kb) of per-thread cache use to    |
-|                             |         | offload the global quarantine. Lower value may |
-|                             |         | reduce memory usage but might increase         |
-|                             |         | contention on the global quarantine.           |
-+-----------------------------+---------+------------------------------------------------+
-| DeallocationTypeMismatch    | true    | Whether or not we report errors on             |
-|                             |         | malloc/delete, new/free, new/delete[], etc.    |
-+-----------------------------+---------+------------------------------------------------+
-| DeleteSizeMismatch          | true    | Whether or not we report errors on mismatch    |
-|                             |         | between sizes of new and delete.               |
-+-----------------------------+---------+------------------------------------------------+
-| ZeroContents                | false   | Whether or not we zero chunk contents on       |
-|                             |         | allocation and deallocation.                   |
-+-----------------------------+---------+------------------------------------------------+
++-----------------------------+----------------+----------------+------------------------------------------------+
+| Option                      | 64-bit default | 32-bit default | Description                                    |
++-----------------------------+----------------+----------------+------------------------------------------------+
+| QuarantineSizeMb            | 64             | 16             | The size (in Mb) of quarantine used to delay   |
+|                             |                |                | the actual deallocation of chunks. Lower value |
+|                             |                |                | may reduce memory usage but decrease the       |
+|                             |                |                | effectiveness of the mitigation; a negative    |
+|                             |                |                | value will fall back to a default of 64Mb.     |
++-----------------------------+----------------+----------------+------------------------------------------------+
+| ThreadLocalQuarantineSizeKb | 1024           | 256            | The size (in Kb) of per-thread cache used to   |
+|                             |                |                | offload the global quarantine. Lower value may |
+|                             |                |                | reduce memory usage but might increase         |
+|                             |                |                | contention on the global quarantine.           |
++-----------------------------+----------------+----------------+------------------------------------------------+
+| DeallocationTypeMismatch    | true           | true           | Whether or not we report errors on             |
+|                             |                |                | malloc/delete, new/free, new/delete[], etc.    |
++-----------------------------+----------------+----------------+------------------------------------------------+
+| DeleteSizeMismatch          | true           | true           | Whether or not we report errors on mismatch    |
+|                             |                |                | between sizes of new and delete.               |
++-----------------------------+----------------+----------------+------------------------------------------------+
+| ZeroContents                | false          | false          | Whether or not we zero chunk contents on       |
+|                             |                |                | allocation and deallocation.                   |
++-----------------------------+----------------+----------------+------------------------------------------------+
 
 Allocator related common Sanitizer options can also be passed through Scudo
 options, such as ``allocator_may_return_null``. A detailed list including those
diff --git a/docs/Statepoints.rst b/docs/Statepoints.rst
index 29b1be37a893cce8934c52d1546d94f36e9100fd..7f2b20544812f6e3da79e11c2984097b9c7d93e5 100644
--- a/docs/Statepoints.rst
+++ b/docs/Statepoints.rst
@@ -831,7 +831,7 @@ Bugs and Enhancements
 
 Currently known bugs and enhancements under consideration can be
 tracked by performing a `bugzilla search
-<http://llvm.org/bugs/buglist.cgi?cmdtype=runnamed&namedcmd=Statepoint%20Bugs&list_id=64342>`_
+<https://bugs.llvm.org/buglist.cgi?cmdtype=runnamed&namedcmd=Statepoint%20Bugs&list_id=64342>`_
 for [Statepoint] in the summary field. When filing new bugs, please
 use this tag so that interested parties see the newly filed bug.  As
 with most LLVM features, design discussions take place on `llvm-dev
diff --git a/docs/TableGen/BackEnds.rst b/docs/TableGen/BackEnds.rst
index fdab266fa31ce31c39e741f394a3cf59be5353af..993134386f76965299b5f64e269e19e687e881bc 100644
--- a/docs/TableGen/BackEnds.rst
+++ b/docs/TableGen/BackEnds.rst
@@ -228,6 +228,12 @@ CTags
 format. A helper script, utils/TableGen/tdtags, provides an easier-to-use
 interface; run 'tdtags -H' for documentation.
 
+X86EVEX2VEX
+-----------
+
+**Purpose**: This X86 specific tablegen backend emits tables that map EVEX
+encoded instructions to their VEX encoded identical instruction.
+
 Clang BackEnds
 ==============
 
diff --git a/docs/TestingGuide.rst b/docs/TestingGuide.rst
index 99616770d8e9ee37fd83362bee2a62d9b303dd42..a27da0de4d0e29de083c2bb470aaa7aedc841c0a 100644
--- a/docs/TestingGuide.rst
+++ b/docs/TestingGuide.rst
@@ -468,6 +468,25 @@ RUN lines:
 
    Expands to the path separator, i.e. ``:`` (or ``;`` on Windows).
 
+``%/s, %/S, %/t, %/T:``
+
+  Act like the corresponding substitution above but replace any ``\``
+  character with a ``/``. This is useful to normalize path separators.
+
+   Example: ``%s:  C:\Desktop Files/foo_test.s.tmp``
+   
+   Example: ``%/s: C:/Desktop Files/foo_test.s.tmp``
+
+``%:s, %:S, %:t, %:T:``
+
+  Act like the corresponding substitution above but remove colons at
+  the beginning of Windows paths. This is useful to allow concatenation
+  of absolute paths on Windows to produce a legal path.
+
+   Example: ``%s:  C:\Desktop Files\foo_test.s.tmp``
+
+   Example: ``%:s: C\Desktop Files\foo_test.s.tmp``
+
 
 **LLVM-specific substitutions:**
 
diff --git a/docs/XRay.rst b/docs/XRay.rst
index 222cc8f2e049c0e898af2c97a3ca6181e8c7f7e6..d650319e99220f66674a970fde1b8516e1e9eb2b 100644
--- a/docs/XRay.rst
+++ b/docs/XRay.rst
@@ -28,8 +28,9 @@ XRay consists of three main parts:
 - A runtime library for enabling/disabling tracing at runtime.
 - A suite of tools for analysing the traces.
 
-  **NOTE:** As of the time of this writing, XRay is only available for x86_64
-  and arm7 32-bit (no-thumb) Linux.
+  **NOTE:** As of February 27, 2017, XRay is only available for the following
+  architectures running Linux: x86_64, arm7 (no thumb), aarch64, powerpc64le,
+  mips, mipsel, mips64, mips64el.
 
 The compiler-inserted instrumentation points come in the form of nop-sleds in
 the final generated binary, and an ELF section named ``xray_instr_map`` which
@@ -84,7 +85,10 @@ GCC-style attributes or C++11-style attributes.
 
 When linking a binary, you can either manually link in the `XRay Runtime
 Library`_ or use ``clang`` to link it in automatically with the
-``-fxray-instrument`` flag.
+``-fxray-instrument`` flag. Alternatively, you can statically link-in the XRay
+runtime library from compiler-rt -- those archive files will take the name of
+``libclang_rt.xray-{arch}`` where ``{arch}`` is the mnemonic supported by clang
+(x86_64, arm7, etc.).
 
 LLVM Function Attribute
 -----------------------
@@ -135,7 +139,7 @@ variable, where we list down the options and their defaults below.
 +-------------------+-----------------+---------------+------------------------+
 | Option            | Type            | Default       | Description            |
 +===================+=================+===============+========================+
-| patch_premain     | ``bool``        | ``true``      | Whether to patch       |
+| patch_premain     | ``bool``        | ``false``     | Whether to patch       |
 |                   |                 |               | instrumentation points |
 |                   |                 |               | before main.           |
 +-------------------+-----------------+---------------+------------------------+
@@ -146,6 +150,11 @@ variable, where we list down the options and their defaults below.
 | xray_logfile_base | ``const char*`` | ``xray-log.`` | Filename base for the  |
 |                   |                 |               | XRay logfile.          |
 +-------------------+-----------------+---------------+------------------------+
+| xray_fdr_log      | ``bool``        | ``false``     | Whether to install the |
+|                   |                 |               | Flight Data Recorder   |
+|                   |                 |               | (FDR) mode.            |
++-------------------+-----------------+---------------+------------------------+
+
 
 If you choose to not use the default logging implementation that comes with the
 XRay runtime and/or control when/how the XRay instrumentation runs, you may use
@@ -175,6 +184,64 @@ thread-safety of operations to be performed by the XRay runtime library:
   XRay cannot guarantee that all threads that have ever gotten a copy of the
   pointer will not invoke the function.
 
+Flight Data Recorder Mode
+-------------------------
+
+XRay supports a logging mode which allows the application to only capture a
+fixed amount of memory's worth of events. Flight Data Recorder (FDR) mode works
+very much like a plane's "black box" which keeps recording data to memory in a
+fixed-size circular queue of buffers, and makes the data available
+programmatically until the buffers are finalized and flushed. To use FDR mode
+on your application, you may set the ``xray_fdr_log`` option to ``true`` in the
+``XRAY_OPTIONS`` environment variable (while also optionally setting the
+``xray_naive_log`` to ``false``).
+
+When FDR mode is on, it will keep writing and recycling memory buffers until
+the logging implementation is finalized -- at which point it can be flushed and
+re-initialised later. To do this programmatically, we follow the workflow
+provided below:
+
+.. code-block:: c++
+
+  // Patch the sleds, if we haven't yet.
+  auto patch_status = __xray_patch();
+
+  // Maybe handle the patch_status errors.
+
+  // When we want to flush the log, we need to finalize it first, to give
+  // threads a chance to return buffers to the queue.
+  auto finalize_status = __xray_log_finalize();
+  if (finalize_status != XRAY_LOG_FINALIZED) {
+    // maybe retry, or bail out.
+  }
+
+  // At this point, we are sure that the log is finalized, so we may try
+  // flushing the log.
+  auto flush_status = __xray_log_flushLog();
+  if (flush_status != XRAY_LOG_FLUSHED) {
+    // maybe retry, or bail out.
+  }
+
+The default settings for the FDR mode implementation will create logs named
+similarly to the naive log implementation, but will have a different log
+format. All the trace analysis tools (and the trace reading library) will
+support all versions of the FDR mode format as we add more functionality and
+record types in the future.
+
+  **NOTE:** We do not however promise perpetual support for when we update the
+  log versions we support going forward. Deprecation of the formats will be
+  announced and discussed on the developers mailing list.
+
+XRay allows for replacing the default FDR mode logging implementation using the
+following API:
+
+- ``__xray_set_log_impl(...)``: This function takes a struct of type
+  ``XRayLogImpl``, which is defined in ``xray/xray_log_interface.h``, part of
+  the XRay compiler-rt installation.
+- ``__xray_log_init(...)``: This function allows for initializing and
+  re-initializing an installed logging implementation. See
+  ``xray/xray_log_interface.h`` for details, part of the XRay compiler-rt
+  installation.
 
 Trace Analysis Tools
 --------------------
@@ -185,7 +252,26 @@ supports the following subcommands:
 
 - ``extract``: Extract the instrumentation map from a binary, and return it as
   YAML.
-
+- ``account``: Performs basic function call accounting statistics with various
+  options for sorting, and output formats (supports CSV, YAML, and
+  console-friendly TEXT).
+- ``convert``: Converts an XRay log file from one format to another. Currently
+  only converts to YAML.
+- ``graph``: Generates a DOT graph of the function call relationships between
+  functions found in an XRay trace.
+
+These subcommands use various library components found as part of the XRay
+libraries, distributed with the LLVM distribution. These are:
+
+- ``llvm/XRay/Trace.h`` : A trace reading library for conveniently loading
+  an XRay trace of supported forms, into a convenient in-memory representation.
+  All the analysis tools that deal with traces use this implementation.
+- ``llvm/XRay/Graph.h`` : A semi-generic graph type used by the graph
+  subcommand to conveniently represent a function call graph with statistics
+  associated with edges and vertices.
+- ``llvm/XRay/InstrumentationMap.h``: A convenient tool for analyzing the
+  instrumentation map in XRay-instrumented object files and binaries. The
+  ``extract`` subcommand uses this particular library.
 
 Future Work
 ===========
@@ -193,38 +279,19 @@ Future Work
 There are a number of ongoing efforts for expanding the toolset building around
 the XRay instrumentation system.
 
-Flight Data Recorder Mode
--------------------------
-
-The `XRay whitepaper`_ mentions a mode for when events are kept in memory, and
-have the traces be dumped on demand through a triggering API. This work is
-currently ongoing.
-
 Trace Analysis
 --------------
 
-There are a few more subcommands making its way to the ``llvm-xray`` tool, that
-are currently under review:
-
-- ``convert``: Turns an XRay trace from one format to another. Currently
-  supporting conversion from the binary XRay log to YAML.
-- ``account``: Do function call accounting based on data in the XRay log.
-
 We have more subcommands and modes that we're thinking of developing, in the
 following forms:
 
 - ``stack``: Reconstruct the function call stacks in a timeline.
-- ``convert``: Converting from one version of the XRay log to another (higher)
-  version, and converting to other trace formats (i.e. Chrome Trace Viewer,
-  pprof, etc.).
-- ``graph``: Generate a function call graph with relative timings and distributions.
 
 More Platforms
 --------------
 
-Since XRay is only currently available in x86_64 and arm7 32-bit (no-thumb)
-running Linux, we're looking to supporting more platforms (architectures and
-operating systems).
+We're looking forward to contributions to port XRay to more architectures and
+operating systems.
 
 .. References...
 
diff --git a/docs/XRayExample.rst b/docs/XRayExample.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5dfb0bfaf29893f878442120de72487bfb57dad8
--- /dev/null
+++ b/docs/XRayExample.rst
@@ -0,0 +1,273 @@
+===================
+Debugging with XRay
+===================
+
+This document shows an example of how you would go about analyzing applications
+built with XRay instrumentation. Here we will attempt to debug ``llc``
+compiling some sample LLVM IR generated by Clang.
+
+.. contents::
+  :local:
+
+Building with XRay
+------------------
+
+To debug an application with XRay instrumentation, we need to build it with a
+Clang that supports the ``-fxray-instrument`` option. See `XRay <XRay.html>`_
+for more technical details of how XRay works for background information.
+
+In our example, we need to add ``-fxray-instrument`` to the list of flags
+passed to Clang when building a binary. Note that we need to link with Clang as
+well to get the XRay runtime linked in appropriately. For building ``llc`` with
+XRay, we do something similar below for our LLVM build:
+
+::
+
+  $ mkdir -p llvm-build && cd llvm-build
+  # Assume that the LLVM sources are at ../llvm
+  $ cmake -GNinja ../llvm -DCMAKE_BUILD_TYPE=Release \
+      -DCMAKE_C_FLAGS_RELEASE="-fxray-instrument" -DCMAKE_CXX_FLAGS="-fxray-instrument"
+  # Once this finishes, we should build llc
+  $ ninja llc
+
+
+To verify that we have an XRay instrumented binary, we can use ``objdump`` to
+look for the ``xray_instr_map`` section.
+
+::
+
+  $ objdump -h -j xray_instr_map ./bin/llc
+  ./bin/llc:     file format elf64-x86-64
+  
+  Sections:
+  Idx Name          Size      VMA               LMA               File off  Algn
+   14 xray_instr_map 00002fc0  00000000041516c6  00000000041516c6  03d516c6  2**0
+                    CONTENTS, ALLOC, LOAD, READONLY, DATA
+
+Getting Traces
+--------------
+
+By default, XRay does not write out the trace files or patch the application
+before main starts. If we just run ``llc`` it should just work like a normally
+built binary. However, if we want to get a full trace of the application's
+operations (of the functions we do end up instrumenting with XRay) then we need
+to enable XRay at application start. To do this, XRay checks the
+``XRAY_OPTIONS`` environment variable.
+
+::
+
+  # The following doesn't create an XRay trace by default.
+  $ ./bin/llc input.ll
+
+  # We need to set the XRAY_OPTIONS to enable some features.
+  $ XRAY_OPTIONS="patch_premain=true" ./bin/llc input.ll
+  ==69819==XRay: Log file in 'xray-log.llc.m35qPB'
+
+At this point we now have an XRay trace we can start analysing.
+
+The ``llvm-xray`` Tool
+----------------------
+
+Having a trace then allows us to do basic accounting of the functions that were
+instrumented, and how much time we're spending in parts of the code. To make
+sense of this data, we use the ``llvm-xray`` tool which has a few subcommands
+to help us understand our trace.
+
+One of the simplest things we can do is to get an accounting of the functions
+that have been instrumented. We can see an example accounting with ``llvm-xray
+account``:
+
+::
+
+  $ llvm-xray account xray-log.llc.m35qPB -top=10 -sort=sum -sortorder=dsc -instr_map ./bin/llc
+  Functions with latencies: 29
+     funcid      count [      min,       med,       90p,       99p,       max]       sum  function
+        187        360 [ 0.000000,  0.000001,  0.000014,  0.000032,  0.000075]  0.001596  LLLexer.cpp:446:0: llvm::LLLexer::LexIdentifier()
+         85        130 [ 0.000000,  0.000000,  0.000018,  0.000023,  0.000156]  0.000799  X86ISelDAGToDAG.cpp:1984:0: (anonymous namespace)::X86DAGToDAGISel::Select(llvm::SDNode*)
+        138        130 [ 0.000000,  0.000000,  0.000017,  0.000155,  0.000155]  0.000774  SelectionDAGISel.cpp:2963:0: llvm::SelectionDAGISel::SelectCodeCommon(llvm::SDNode*, unsigned char const*, unsigned int)
+        188        103 [ 0.000000,  0.000000,  0.000003,  0.000123,  0.000214]  0.000737  LLParser.cpp:2692:0: llvm::LLParser::ParseValID(llvm::ValID&, llvm::LLParser::PerFunctionState*)
+         88          1 [ 0.000562,  0.000562,  0.000562,  0.000562,  0.000562]  0.000562  X86ISelLowering.cpp:83:0: llvm::X86TargetLowering::X86TargetLowering(llvm::X86TargetMachine const&, llvm::X86Subtarget const&)
+        125        102 [ 0.000001,  0.000003,  0.000010,  0.000017,  0.000049]  0.000471  Verifier.cpp:3714:0: (anonymous namespace)::Verifier::visitInstruction(llvm::Instruction&)
+         90          8 [ 0.000023,  0.000035,  0.000106,  0.000106,  0.000106]  0.000342  X86ISelLowering.cpp:3363:0: llvm::X86TargetLowering::LowerCall(llvm::TargetLowering::CallLoweringInfo&, llvm::SmallVectorImpl<llvm::SDValue>&) const
+        124         32 [ 0.000003,  0.000007,  0.000016,  0.000041,  0.000041]  0.000310  Verifier.cpp:1967:0: (anonymous namespace)::Verifier::visitFunction(llvm::Function const&)
+        123          1 [ 0.000302,  0.000302,  0.000302,  0.000302,  0.000302]  0.000302  LLVMContextImpl.cpp:54:0: llvm::LLVMContextImpl::~LLVMContextImpl()
+        139         46 [ 0.000000,  0.000002,  0.000006,  0.000008,  0.000019]  0.000138  TargetLowering.cpp:506:0: llvm::TargetLowering::SimplifyDemandedBits(llvm::SDValue, llvm::APInt const&, llvm::APInt&, llvm::APInt&, llvm::TargetLowering::TargetLoweringOpt&, unsigned int, bool) const
+
+This shows us that for our input file, ``llc`` spent the most cumulative time
+in the lexer (a total of 1 millisecond). If we wanted for example to work with
+this data in a spreadsheet, we can output the results as CSV using the
+``-format=csv`` option to the command for further analysis.
+
+If we want to get a textual representation of the raw trace we can use the
+``llvm-xray convert`` tool to get YAML output. The first few lines of that
+output for an example trace would look like the following:
+
+::
+
+  $ llvm-xray convert -f yaml -symbolize -instr_map=./bin/llc xray-log.llc.m35qPB
+  ---
+  header:          
+    version:         1
+    type:            0
+    constant-tsc:    true
+    nonstop-tsc:     true
+    cycle-frequency: 2601000000
+  records:         
+    - { type: 0, func-id: 110, function: __cxx_global_var_init.8, cpu: 37, thread: 69819, kind: function-enter, tsc: 5434426023268520 }
+    - { type: 0, func-id: 110, function: __cxx_global_var_init.8, cpu: 37, thread: 69819, kind: function-exit, tsc: 5434426023523052 }
+    - { type: 0, func-id: 164, function: __cxx_global_var_init, cpu: 37, thread: 69819, kind: function-enter, tsc: 5434426029925386 }
+    - { type: 0, func-id: 164, function: __cxx_global_var_init, cpu: 37, thread: 69819, kind: function-exit, tsc: 5434426030031128 }
+    - { type: 0, func-id: 142, function: '(anonymous namespace)::CommandLineParser::ParseCommandLineOptions(int, char const* const*, llvm::StringRef, llvm::raw_ostream*)', cpu: 37, thread: 69819, kind: function-enter, tsc: 5434426046951388 }
+    - { type: 0, func-id: 142, function: '(anonymous namespace)::CommandLineParser::ParseCommandLineOptions(int, char const* const*, llvm::StringRef, llvm::raw_ostream*)', cpu: 37, thread: 69819, kind: function-exit, tsc: 5434426047282020 }
+    - { type: 0, func-id: 187, function: 'llvm::LLLexer::LexIdentifier()', cpu: 37, thread: 69819, kind: function-enter, tsc: 5434426047857332 }
+    - { type: 0, func-id: 187, function: 'llvm::LLLexer::LexIdentifier()', cpu: 37, thread: 69819, kind: function-exit, tsc: 5434426047984152 }
+    - { type: 0, func-id: 187, function: 'llvm::LLLexer::LexIdentifier()', cpu: 37, thread: 69819, kind: function-enter, tsc: 5434426048036584 }
+    - { type: 0, func-id: 187, function: 'llvm::LLLexer::LexIdentifier()', cpu: 37, thread: 69819, kind: function-exit, tsc: 5434426048042292 }
+    - { type: 0, func-id: 187, function: 'llvm::LLLexer::LexIdentifier()', cpu: 37, thread: 69819, kind: function-enter, tsc: 5434426048055056 }
+    - { type: 0, func-id: 187, function: 'llvm::LLLexer::LexIdentifier()', cpu: 37, thread: 69819, kind: function-exit, tsc: 5434426048067316 }
+
+Controlling Fidelity
+--------------------
+
+So far in our examples, we haven't been getting full coverage of the functions
+we have in the binary. To get that, we need to modify the compiler flags so
+that we can instrument more (if not all) the functions we have in the binary.
+We have two options for doing that, and we explore both of these below.
+
+Instruction Threshold
+`````````````````````
+
+The first "blunt" way of doing this is by setting the minimum threshold for
+function bodies to 1. We can do that with the
+``-fxray-instruction-threshold=N`` flag when building our binary. We rebuild
+``llc`` with this option and observe the results:
+
+::
+
+  $ rm CMakeCache.txt
+  $ cmake -GNinja ../llvm -DCMAKE_BUILD_TYPE=Release \
+      -DCMAKE_C_FLAGS_RELEASE="-fxray-instrument -fxray-instruction-threshold=1" \
+      -DCMAKE_CXX_FLAGS="-fxray-instrument -fxray-instruction-threshold=1"
+  $ ninja llc
+  $ XRAY_OPTIONS="patch_premain=true" ./bin/llc input.ll
+  ==69819==XRay: Log file in 'xray-log.llc.5rqxkU'
+
+  $ llvm-xray account xray-log.llc.5rqxkU -top=10 -sort=sum -sortorder=dsc -instr_map ./bin/llc
+  Functions with latencies: 36652
+   funcid      count [      min,       med,       90p,       99p,       max]       sum  function    
+       75          1 [ 0.672368,  0.672368,  0.672368,  0.672368,  0.672368]  0.672368  llc.cpp:271:0: main
+       78          1 [ 0.626455,  0.626455,  0.626455,  0.626455,  0.626455]  0.626455  llc.cpp:381:0: compileModule(char**, llvm::LLVMContext&)
+   139617          1 [ 0.472618,  0.472618,  0.472618,  0.472618,  0.472618]  0.472618  LegacyPassManager.cpp:1723:0: llvm::legacy::PassManager::run(llvm::Module&)
+   139610          1 [ 0.472618,  0.472618,  0.472618,  0.472618,  0.472618]  0.472618  LegacyPassManager.cpp:1681:0: llvm::legacy::PassManagerImpl::run(llvm::Module&)
+   139612          1 [ 0.470948,  0.470948,  0.470948,  0.470948,  0.470948]  0.470948  LegacyPassManager.cpp:1564:0: (anonymous namespace)::MPPassManager::runOnModule(llvm::Module&)
+   139607          2 [ 0.147345,  0.315994,  0.315994,  0.315994,  0.315994]  0.463340  LegacyPassManager.cpp:1530:0: llvm::FPPassManager::runOnModule(llvm::Module&)
+   139605         21 [ 0.000002,  0.000002,  0.102593,  0.213336,  0.213336]  0.463331  LegacyPassManager.cpp:1491:0: llvm::FPPassManager::runOnFunction(llvm::Function&)
+   139563      26096 [ 0.000002,  0.000002,  0.000037,  0.000063,  0.000215]  0.225708  LegacyPassManager.cpp:1083:0: llvm::PMDataManager::findAnalysisPass(void const*, bool)
+   108055        188 [ 0.000002,  0.000120,  0.001375,  0.004523,  0.062624]  0.159279  MachineFunctionPass.cpp:38:0: llvm::MachineFunctionPass::runOnFunction(llvm::Function&)
+    62635         22 [ 0.000041,  0.000046,  0.000050,  0.126744,  0.126744]  0.127715  X86TargetMachine.cpp:242:0: llvm::X86TargetMachine::getSubtargetImpl(llvm::Function const&) const
+
+
+Instrumentation Attributes
+``````````````````````````
+
+The other way is to use configuration files for selecting which functions
+should always be instrumented by the compiler. This gives us a way of ensuring
+that certain functions are either always or never instrumented without having to
+add the attribute to the source.
+
+To use this feature, you can define one file for the functions to always
+instrument, and another for functions to never instrument. The format of these
+files is exactly the same as the SanitizerLists files that control similar
+things for the sanitizer implementations. For example, we can have two
+different files like below:
+
+::
+
+  # always-instrument.txt
+  # always instrument functions that match the following filters:
+  fun:main
+
+  # never-instrument.txt
+  # never instrument functions that match the following filters:
+  fun:__cxx_*
+
+Given the above two files we can re-build by providing those two files as
+arguments to clang as ``-fxray-always-instrument=always-instrument.txt`` or
+``-fxray-never-instrument=never-instrument.txt``.
+
+Further Exploration
+-------------------
+
+The ``llvm-xray`` tool has a few other subcommands that are in various stages
+of being developed. One interesting subcommand that can highlight a few
+interesting things is the ``graph`` subcommand. Given for example the following
+toy program that we build with XRay instrumentation, we can see how the
+generated graph may be a helpful indicator of where time is being spent for the
+application.
+
+.. code-block:: c++
+
+  // sample.cc
+  #include <iostream>
+  #include <thread>
+
+  [[clang::xray_always_instrument]] void f() {
+    std::cerr << '.';
+  }
+
+  [[clang::xray_always_instrument]] void g() {
+    for (int i = 0; i < 1 << 10; ++i) {
+      std::cerr << '-';
+    }
+  }
+
+  int main(int argc, char* argv[]) {
+    std::thread t1([] {
+      for (int i = 0; i < 1 << 10; ++i)
+        f();
+    });
+    std::thread t2([] {
+      g();
+    });
+    t1.join();
+    t2.join();
+    std::cerr << '\n';
+  }
+
+We then build the above with XRay instrumentation:
+
+::
+
+  $ clang++ -o sample -O3 sample.cc -std=c++11 -fxray-instrument -fxray-instruction-threshold=1
+  $ XRAY_OPTIONS="patch_premain=true" ./sample
+
+We can then explore the graph rendering of the trace generated by this sample
+application. We assume you have the graphviz tools available in your system,
+including both ``unflatten`` and ``dot``. If you prefer rendering or exploring
+the graph using another tool, then that should be feasible as well. ``llvm-xray
+graph`` will create DOT format graphs which should be usable in most graph
+rendering applications. One example invocation of the ``llvm-xray graph``
+command should yield some interesting insights to the workings of C++
+applications:
+
+::
+
+  $ llvm-xray graph xray-log.sample.* -m sample -color-edges=sum -edge-label=sum \
+      | unflatten -f -l10 | dot -Tsvg -o sample.svg
+
+Next Steps
+----------
+
+If you have some interesting analyses you'd like to implement as part of the
+llvm-xray tool, please feel free to propose them on the llvm-dev@ mailing list.
+The following are some ideas to inspire you in getting involved and potentially
+making things better.
+
+  - Implement a query/filtering library that allows for finding patterns in the
+    XRay traces.
+  - A conversion from the XRay trace onto something that can be visualised
+    better by other tools (like the Chrome trace viewer for example).
+  - Collecting function call stacks and how often they're encountered in the
+    XRay trace.
+
+
diff --git a/docs/conf.py b/docs/conf.py
index cd9142cefdf2da2eef22639beb9a0a98d309b5b9..e7c18da48ebef9ca14ba5d4526bdfe7cffbc0fd5 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -251,3 +251,7 @@ for name in os.listdir(command_guide_path):
 
 # FIXME: Define intersphinx configuration.
 intersphinx_mapping = {}
+
+# Pygments lexers are sometimes out of date (when parsing LLVM for example) or
+# wrong. Suppress the warning so the build doesn't abort.
+suppress_warnings = [ 'misc.highlighting_failure' ]
diff --git a/docs/index.rst b/docs/index.rst
index 341a9c16325b9ab390e79eb8770d9cb931347b05..fe47eb1bcb7f7b38343412429548d84ae98999a5 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -274,6 +274,7 @@ For API clients and LLVM developers.
    Coroutines
    GlobalISel
    XRay
+   XRayExample
    PDB/index
 
 :doc:`WritingAnLLVMPass`
@@ -399,6 +400,9 @@ For API clients and LLVM developers.
 :doc:`XRay`
   High-level documentation of how to use XRay in LLVM.
 
+:doc:`XRayExample`
+  An example of how to debug an application with XRay.
+
 :doc:`The Microsoft PDB File Format <PDB/index>`
   A detailed description of the Microsoft PDB (Program Database) file format.
 
diff --git a/docs/tutorial/BuildingAJIT1.rst b/docs/tutorial/BuildingAJIT1.rst
index 80957ee620f0fb66ee33ebed6b9834ce23c504ce..625cbbba1a5cc8c59b85ce264b38d1370fe61b52 100644
--- a/docs/tutorial/BuildingAJIT1.rst
+++ b/docs/tutorial/BuildingAJIT1.rst
@@ -125,14 +125,12 @@ usual include guards and #includes [2]_, we get to the definition of our class:
 
   class KaleidoscopeJIT {
   private:
-
     std::unique_ptr<TargetMachine> TM;
     const DataLayout DL;
     ObjectLinkingLayer<> ObjectLayer;
     IRCompileLayer<decltype(ObjectLayer)> CompileLayer;
 
   public:
-
     typedef decltype(CompileLayer)::ModuleSetHandleT ModuleHandleT;
 
 Our class begins with four members: A TargetMachine, TM, which will be used
@@ -152,16 +150,16 @@ compiling it, and passing the resulting in-memory object files down to the
 object linking layer below.
 
 That's it for member variables, after that we have a single typedef:
-ModuleHandle. This is the handle type that will be returned from our JIT's
+ModuleHandleT. This is the handle type that will be returned from our JIT's
 addModule method, and can be passed to the removeModule method to remove a
 module. The IRCompileLayer class already provides a convenient handle type
-(IRCompileLayer::ModuleSetHandleT), so we just alias our ModuleHandle to this.
+(IRCompileLayer::ModuleSetHandleT), so we just alias our ModuleHandleT to this.
 
 .. code-block:: c++
 
   KaleidoscopeJIT()
       : TM(EngineBuilder().selectTarget()), DL(TM->createDataLayout()),
-    CompileLayer(ObjectLayer, SimpleCompiler(*TM)) {
+        CompileLayer(ObjectLayer, SimpleCompiler(*TM)) {
     llvm::sys::DynamicLibrary::LoadLibraryPermanently(nullptr);
   }
 
@@ -200,7 +198,7 @@ available for execution.
           return JITSymbol(nullptr);
         });
 
-    // Build a singlton module set to hold our module.
+    // Build a singleton module set to hold our module.
     std::vector<std::unique_ptr<Module>> Ms;
     Ms.push_back(std::move(M));
 
@@ -259,16 +257,16 @@ were linked into a single, ever-growing logical dylib. To implement this our
 first lambda (the one defining findSymbolInLogicalDylib) will just search for
 JIT'd code by calling the CompileLayer's findSymbol method. If we don't find a
 symbol in the JIT itself we'll fall back to our second lambda, which implements
-findSymbol. This will use the RTDyldMemoyrManager::getSymbolAddressInProcess
+findSymbol. This will use the RTDyldMemoryManager::getSymbolAddressInProcess
 method to search for the symbol within the program itself. If we can't find a
-symbol definition via either of these paths the JIT will refuse to accept our
+symbol definition via either of these paths, the JIT will refuse to accept our
 module, returning a "symbol not found" error.
 
-Now that we've built our symbol resolver we're ready to add our module to the
+Now that we've built our symbol resolver, we're ready to add our module to the
 JIT. We do this by calling the CompileLayer's addModuleSet method [4]_. Since
 we only have a single Module and addModuleSet expects a collection, we will
 create a vector of modules and add our module as the only member. Since we
-have already typedef'd our ModuleHandle type to be the same as the
+have already typedef'd our ModuleHandleT type to be the same as the
 CompileLayer's handle type, we can return the handle from addModuleSet
 directly from our addModule method.
 
@@ -304,7 +302,7 @@ treated as a duplicate definition when the next top-level expression is
 entered. It is generally good to free any module that you know you won't need
 to call further, just to free up the resources dedicated to it. However, you
 don't strictly need to do this: All resources will be cleaned up when your
-JIT class is destructed, if the haven't been freed before then.
+JIT class is destructed, if they haven't been freed before then.
 
 This brings us to the end of Chapter 1 of Building a JIT. You now have a basic
 but fully functioning JIT stack that you can use to take LLVM IR and make it
diff --git a/docs/tutorial/LangImpl02.rst b/docs/tutorial/LangImpl02.rst
index ac8d2d79874325b50e762e4c278df3b6440d18f1..4be447eb5ba35da9c675e0729ad066b1523dbfd8 100644
--- a/docs/tutorial/LangImpl02.rst
+++ b/docs/tutorial/LangImpl02.rst
@@ -119,6 +119,8 @@ way to talk about functions themselves:
     public:
       PrototypeAST(const std::string &name, std::vector<std::string> Args)
         : Name(name), Args(std::move(Args)) {}
+
+      const std::string &getName() const { return Name; }
     };
 
     /// FunctionAST - This class represents a function definition itself.
diff --git a/docs/tutorial/LangImpl03.rst b/docs/tutorial/LangImpl03.rst
index 2bb3a300026e0ea1c29fc56f8a87f250dff10d53..1dfe10175c747a0c41011c0f9fe1b4c6414b740b 100644
--- a/docs/tutorial/LangImpl03.rst
+++ b/docs/tutorial/LangImpl03.rst
@@ -122,7 +122,7 @@ First we'll do numeric literals:
 .. code-block:: c++
 
     Value *NumberExprAST::codegen() {
-      return ConstantFP::get(LLVMContext, APFloat(Val));
+      return ConstantFP::get(TheContext, APFloat(Val));
     }
 
 In the LLVM IR, numeric constants are represented with the
@@ -171,7 +171,7 @@ variables <LangImpl7.html#user-defined-local-variables>`_.
       case '<':
         L = Builder.CreateFCmpULT(L, R, "cmptmp");
         // Convert bool 0/1 to double 0.0 or 1.0
-        return Builder.CreateUIToFP(L, Type::getDoubleTy(LLVMContext),
+        return Builder.CreateUIToFP(L, Type::getDoubleTy(TheContext),
                                     "booltmp");
       default:
         return LogErrorV("invalid binary operator");
@@ -270,9 +270,9 @@ with:
     Function *PrototypeAST::codegen() {
       // Make the function type:  double(double,double) etc.
       std::vector<Type*> Doubles(Args.size(),
-                                 Type::getDoubleTy(LLVMContext));
+                                 Type::getDoubleTy(TheContext));
       FunctionType *FT =
-        FunctionType::get(Type::getDoubleTy(LLVMContext), Doubles, false);
+        FunctionType::get(Type::getDoubleTy(TheContext), Doubles, false);
 
       Function *F =
         Function::Create(FT, Function::ExternalLinkage, Name, TheModule);
@@ -346,7 +346,7 @@ assert that the function is empty (i.e. has no body yet) before we start.
 .. code-block:: c++
 
   // Create a new basic block to start insertion into.
-  BasicBlock *BB = BasicBlock::Create(LLVMContext, "entry", TheFunction);
+  BasicBlock *BB = BasicBlock::Create(TheContext, "entry", TheFunction);
   Builder.SetInsertPoint(BB);
 
   // Record the function arguments in the NamedValues map.
@@ -533,7 +533,8 @@ This shows an extern for the libm "cos" function, and a call to it.
       ret double %calltmp
     }
 
-When you quit the current demo, it dumps out the IR for the entire
+When you quit the current demo (by sending an EOF via CTRL+D on Linux
+or CTRL+Z and ENTER on Windows), it dumps out the IR for the entire
 module generated. Here you can see the big picture with all the
 functions referencing each other.
 
diff --git a/docs/tutorial/LangImpl04.rst b/docs/tutorial/LangImpl04.rst
index 513bf8f4ab45ea02b110df118c58d90ebfbc72b9..16d7164ae15ee244b3e352118b1e87af348baeb3 100644
--- a/docs/tutorial/LangImpl04.rst
+++ b/docs/tutorial/LangImpl04.rst
@@ -131,33 +131,29 @@ for us:
 
     void InitializeModuleAndPassManager(void) {
       // Open a new module.
-      Context LLVMContext;
-      TheModule = llvm::make_unique<Module>("my cool jit", LLVMContext);
-      TheModule->setDataLayout(TheJIT->getTargetMachine().createDataLayout());
+      TheModule = llvm::make_unique<Module>("my cool jit", TheContext);
 
       // Create a new pass manager attached to it.
       TheFPM = llvm::make_unique<FunctionPassManager>(TheModule.get());
 
-      // Provide basic AliasAnalysis support for GVN.
-      TheFPM.add(createBasicAliasAnalysisPass());
       // Do simple "peephole" optimizations and bit-twiddling optzns.
-      TheFPM.add(createInstructionCombiningPass());
+      TheFPM->add(createInstructionCombiningPass());
       // Reassociate expressions.
-      TheFPM.add(createReassociatePass());
+      TheFPM->add(createReassociatePass());
       // Eliminate Common SubExpressions.
-      TheFPM.add(createGVNPass());
+      TheFPM->add(createGVNPass());
       // Simplify the control flow graph (deleting unreachable blocks, etc).
-      TheFPM.add(createCFGSimplificationPass());
+      TheFPM->add(createCFGSimplificationPass());
 
-      TheFPM.doInitialization();
+      TheFPM->doInitialization();
     }
 
 This code initializes the global module ``TheModule``, and the function pass
 manager ``TheFPM``, which is attached to ``TheModule``. Once the pass manager is
 set up, we use a series of "add" calls to add a bunch of LLVM passes.
 
-In this case, we choose to add five passes: one analysis pass (alias analysis),
-and four optimization passes. The passes we choose here are a pretty standard set
+In this case, we choose to add four optimization passes.
+The passes we choose here are a pretty standard set
 of "cleanup" optimizations that are useful for a wide variety of code. I won't
 delve into what they do but, believe me, they are a good starting place :).
 
@@ -227,8 +223,10 @@ expressions they type in. For example, if they type in "1 + 2;", we
 should evaluate and print out 3. If they define a function, they should
 be able to call it from the command line.
 
-In order to do this, we first declare and initialize the JIT. This is
-done by adding a global variable ``TheJIT``, and initializing it in
+In order to do this, we first prepare the environment to create code for
+the current native target and declare and initialize the JIT. This is
+done by calling some ``InitializeNativeTarget*`` functions and
+adding a global variable ``TheJIT``, and initializing it in
 ``main``:
 
 .. code-block:: c++
@@ -236,7 +234,21 @@ done by adding a global variable ``TheJIT``, and initializing it in
     static std::unique_ptr<KaleidoscopeJIT> TheJIT;
     ...
     int main() {
-      ..
+      InitializeNativeTarget();
+      InitializeNativeTargetAsmPrinter();
+      InitializeNativeTargetAsmParser();
+
+      // Install standard binary operators.
+      // 1 is lowest precedence.
+      BinopPrecedence['<'] = 10;
+      BinopPrecedence['+'] = 20;
+      BinopPrecedence['-'] = 20;
+      BinopPrecedence['*'] = 40; // highest.
+
+      // Prime the first token.
+      fprintf(stderr, "ready> ");
+      getNextToken();
+
       TheJIT = llvm::make_unique<KaleidoscopeJIT>();
 
       // Run the main "interpreter loop" now.
@@ -245,9 +257,24 @@ done by adding a global variable ``TheJIT``, and initializing it in
       return 0;
     }
 
+We also need to set up the data layout for the JIT:
+
+.. code-block:: c++
+
+    void InitializeModuleAndPassManager(void) {
+      // Open a new module.
+      TheModule = llvm::make_unique<Module>("my cool jit", TheContext);
+      TheModule->setDataLayout(TheJIT->getTargetMachine().createDataLayout());
+
+      // Create a new pass manager attached to it.
+      TheFPM = llvm::make_unique<FunctionPassManager>(TheModule.get());
+      ...
+
 The KaleidoscopeJIT class is a simple JIT built specifically for these
-tutorials. In later chapters we will look at how it works and extend it with
-new features, but for now we will take it as given. Its API is very simple::
+tutorials, available inside the LLVM source code
+at ``llvm-src/examples/Kaleidoscope/include/KaleidoscopeJIT.h``.
+In later chapters we will look at how it works and extend it with
+new features, but for now we will take it as given. Its API is very simple:
 ``addModule`` adds an LLVM IR module to the JIT, making its functions
 available for execution; ``removeModule`` removes a module, freeing any
 memory associated with the code in that module; and ``findSymbol`` allows us
@@ -554,7 +581,10 @@ most recent to the oldest, to find the newest definition. If no definition is
 found inside the JIT, it falls back to calling "``dlsym("sin")``" on the
 Kaleidoscope process itself. Since "``sin``" is defined within the JIT's
 address space, it simply patches up calls in the module to call the libm
-version of ``sin`` directly.
+version of ``sin`` directly. In some cases this goes even further:
+as ``sin`` and ``cos`` are names of standard math functions, the constant folder
+will directly evaluate the function calls to the correct result when called
+with constants, as in the "``sin(1.0)``" above.
 
 In the future we'll see how tweaking this symbol resolution rule can be used to
 enable all sorts of useful features, from security (restricting the set of
@@ -567,12 +597,21 @@ if we add:
 
 .. code-block:: c++
 
+    #ifdef LLVM_ON_WIN32
+    #define DLLEXPORT __declspec(dllexport)
+    #else
+    #define DLLEXPORT
+    #endif
+
     /// putchard - putchar that takes a double and returns 0.
-    extern "C" double putchard(double X) {
+    extern "C" DLLEXPORT double putchard(double X) {
       fputc((char)X, stderr);
       return 0;
     }
 
+Note that for Windows we need to actually export the functions because
+the dynamic symbol loader will use GetProcAddress to find the symbols.
+
 Now we can produce simple output to the console by using things like:
 "``extern putchard(x); putchard(120);``", which prints a lowercase 'x'
 on the console (120 is the ASCII code for 'x'). Similar code could be
diff --git a/docs/tutorial/LangImpl05.rst b/docs/tutorial/LangImpl05.rst
index ae0935d9ba1f976a7812b4654954f7b9c2e31806..dcf45bcbf8d20c2d9fdee20c256f8ad1347f20c4 100644
--- a/docs/tutorial/LangImpl05.rst
+++ b/docs/tutorial/LangImpl05.rst
@@ -103,7 +103,8 @@ To represent the new expression we add a new AST node for it:
       IfExprAST(std::unique_ptr<ExprAST> Cond, std::unique_ptr<ExprAST> Then,
                 std::unique_ptr<ExprAST> Else)
         : Cond(std::move(Cond)), Then(std::move(Then)), Else(std::move(Else)) {}
-      virtual Value *codegen();
+
+      Value *codegen() override;
     };
 
 The AST node just has pointers to the various subexpressions.
@@ -290,9 +291,9 @@ for ``IfExprAST``:
       if (!CondV)
         return nullptr;
 
-      // Convert condition to a bool by comparing equal to 0.0.
+      // Convert condition to a bool by comparing non-equal to 0.0.
       CondV = Builder.CreateFCmpONE(
-          CondV, ConstantFP::get(LLVMContext, APFloat(0.0)), "ifcond");
+          CondV, ConstantFP::get(TheContext, APFloat(0.0)), "ifcond");
 
 This code is straightforward and similar to what we saw before. We emit
 the expression for the condition, then compare that value to zero to get
@@ -305,9 +306,9 @@ a truth value as a 1-bit (bool) value.
       // Create blocks for the then and else cases.  Insert the 'then' block at the
       // end of the function.
       BasicBlock *ThenBB =
-          BasicBlock::Create(LLVMContext, "then", TheFunction);
-      BasicBlock *ElseBB = BasicBlock::Create(LLVMContext, "else");
-      BasicBlock *MergeBB = BasicBlock::Create(LLVMContext, "ifcont");
+          BasicBlock::Create(TheContext, "then", TheFunction);
+      BasicBlock *ElseBB = BasicBlock::Create(TheContext, "else");
+      BasicBlock *MergeBB = BasicBlock::Create(TheContext, "ifcont");
 
       Builder.CreateCondBr(CondV, ThenBB, ElseBB);
 
@@ -400,7 +401,7 @@ code:
       TheFunction->getBasicBlockList().push_back(MergeBB);
       Builder.SetInsertPoint(MergeBB);
       PHINode *PN =
-        Builder.CreatePHI(Type::getDoubleTy(LLVMContext), 2, "iftmp");
+        Builder.CreatePHI(Type::getDoubleTy(TheContext), 2, "iftmp");
 
       PN->addIncoming(ThenV, ThenBB);
       PN->addIncoming(ElseV, ElseBB);
@@ -433,7 +434,7 @@ something more aggressive, a 'for' expression:
 
 ::
 
-     extern putchard(char)
+     extern putchard(char);
      def printstar(n)
        for i = 1, i < n, 1.0 in
          putchard(42);  # ascii 42 = '*'
@@ -500,7 +501,8 @@ variable name and the constituent expressions in the node.
                  std::unique_ptr<ExprAST> Body)
         : VarName(VarName), Start(std::move(Start)), End(std::move(End)),
           Step(std::move(Step)), Body(std::move(Body)) {}
-      virtual Value *codegen();
+
+      Value *codegen() override;
     };
 
 Parser Extensions for the 'for' Loop
@@ -561,6 +563,27 @@ value to null in the AST node:
                                            std::move(Body));
     }
 
+And again we hook it up as a primary expression:
+
+.. code-block:: c++
+
+    static std::unique_ptr<ExprAST> ParsePrimary() {
+      switch (CurTok) {
+      default:
+        return LogError("unknown token when expecting an expression");
+      case tok_identifier:
+        return ParseIdentifierExpr();
+      case tok_number:
+        return ParseNumberExpr();
+      case '(':
+        return ParseParenExpr();
+      case tok_if:
+        return ParseIfExpr();
+      case tok_for:
+        return ParseForExpr();
+      }
+    }
+
 LLVM IR for the 'for' Loop
 --------------------------
 
@@ -610,7 +633,8 @@ expression for the loop value:
     Value *ForExprAST::codegen() {
       // Emit the start code first, without 'variable' in scope.
       Value *StartVal = Start->codegen();
-      if (StartVal == 0) return 0;
+      if (!StartVal)
+        return nullptr;
 
 With this out of the way, the next step is to set up the LLVM basic
 block for the start of the loop body. In the case above, the whole loop
@@ -625,7 +649,7 @@ expression).
       Function *TheFunction = Builder.GetInsertBlock()->getParent();
       BasicBlock *PreheaderBB = Builder.GetInsertBlock();
       BasicBlock *LoopBB =
-          BasicBlock::Create(LLVMContext, "loop", TheFunction);
+          BasicBlock::Create(TheContext, "loop", TheFunction);
 
       // Insert an explicit fall through from the current block to the LoopBB.
       Builder.CreateBr(LoopBB);
@@ -642,7 +666,7 @@ the two blocks.
       Builder.SetInsertPoint(LoopBB);
 
       // Start the PHI node with an entry for Start.
-      PHINode *Variable = Builder.CreatePHI(Type::getDoubleTy(LLVMContext),
+      PHINode *Variable = Builder.CreatePHI(Type::getDoubleTy(TheContext),
                                             2, VarName.c_str());
       Variable->addIncoming(StartVal, PreheaderBB);
 
@@ -693,7 +717,7 @@ table.
           return nullptr;
       } else {
         // If not specified, use 1.0.
-        StepVal = ConstantFP::get(LLVMContext, APFloat(1.0));
+        StepVal = ConstantFP::get(TheContext, APFloat(1.0));
       }
 
       Value *NextVar = Builder.CreateFAdd(Variable, StepVal, "nextvar");
@@ -710,9 +734,9 @@ iteration of the loop.
       if (!EndCond)
         return nullptr;
 
-      // Convert condition to a bool by comparing equal to 0.0.
+      // Convert condition to a bool by comparing non-equal to 0.0.
       EndCond = Builder.CreateFCmpONE(
-          EndCond, ConstantFP::get(LLVMContext, APFloat(0.0)), "loopcond");
+          EndCond, ConstantFP::get(TheContext, APFloat(0.0)), "loopcond");
 
 Finally, we evaluate the exit value of the loop, to determine whether
 the loop should exit. This mirrors the condition evaluation for the
@@ -723,7 +747,7 @@ if/then/else statement.
       // Create the "after loop" block and insert it.
       BasicBlock *LoopEndBB = Builder.GetInsertBlock();
       BasicBlock *AfterBB =
-          BasicBlock::Create(LLVMContext, "afterloop", TheFunction);
+          BasicBlock::Create(TheContext, "afterloop", TheFunction);
 
       // Insert the conditional branch into the end of LoopEndBB.
       Builder.CreateCondBr(EndCond, LoopBB, AfterBB);
@@ -751,7 +775,7 @@ insertion position to it.
         NamedValues.erase(VarName);
 
       // for expr always returns 0.0.
-      return Constant::getNullValue(Type::getDoubleTy(LLVMContext));
+      return Constant::getNullValue(Type::getDoubleTy(TheContext));
     }
 
 The final code handles various cleanups: now that we have the "NextVar"
@@ -772,7 +796,7 @@ Full Code Listing
 =================
 
 Here is the complete code listing for our running example, enhanced with
-the if/then/else and for expressions.. To build this example, use:
+the if/then/else and for expressions. To build this example, use:
 
 .. code-block:: bash
 
diff --git a/docs/tutorial/LangImpl06.rst b/docs/tutorial/LangImpl06.rst
index f6d2bd943ef7d6984136d9419ced4f07718a2700..c1035bce8559359a5c969a54a4effe7e1f879c79 100644
--- a/docs/tutorial/LangImpl06.rst
+++ b/docs/tutorial/LangImpl06.rst
@@ -31,7 +31,7 @@ User-defined Operators: the Idea
 ================================
 
 The "operator overloading" that we will add to Kaleidoscope is more
-general than languages like C++. In C++, you are only allowed to
+general than in languages like C++. In C++, you are only allowed to
 redefine existing operators: you can't programmatically change the
 grammar, introduce new operators, change precedence levels, etc. In this
 chapter, we will add this capability to Kaleidoscope, which will let the
@@ -41,8 +41,8 @@ The point of going into user-defined operators in a tutorial like this
 is to show the power and flexibility of using a hand-written parser.
 Thus far, the parser we have been implementing uses recursive descent
 for most parts of the grammar and operator precedence parsing for the
-expressions. See `Chapter 2 <LangImpl2.html>`_ for details. Without
-using operator precedence parsing, it would be very difficult to allow
+expressions. See `Chapter 2 <LangImpl2.html>`_ for details. By
+using operator precedence parsing, it is very easy to allow
 the programmer to introduce new operators into the grammar: the grammar
 is dynamically extensible as the JIT runs.
 
@@ -143,17 +143,18 @@ this:
       : Name(name), Args(std::move(Args)), IsOperator(IsOperator),
         Precedence(Prec) {}
 
+      Function *codegen();
+      const std::string &getName() const { return Name; }
+
       bool isUnaryOp() const { return IsOperator && Args.size() == 1; }
       bool isBinaryOp() const { return IsOperator && Args.size() == 2; }
 
       char getOperatorName() const {
         assert(isUnaryOp() || isBinaryOp());
-        return Name[Name.size()-1];
+        return Name[Name.size() - 1];
       }
 
       unsigned getBinaryPrecedence() const { return Precedence; }
-
-      Function *codegen();
     };
 
 Basically, in addition to knowing a name for the prototype, we now keep
@@ -194,7 +195,7 @@ user-defined operator, we need to parse it:
         // Read the precedence if present.
         if (CurTok == tok_number) {
           if (NumVal < 1 || NumVal > 100)
-            return LogErrorP("Invalid precedecnce: must be 1..100");
+            return LogErrorP("Invalid precedence: must be 1..100");
           BinaryPrecedence = (unsigned)NumVal;
           getNextToken();
         }
@@ -225,7 +226,7 @@ This is all fairly straightforward parsing code, and we have already
 seen a lot of similar code in the past. One interesting part about the
 code above is the couple lines that set up ``FnName`` for binary
 operators. This builds names like "binary@" for a newly defined "@"
-operator. This then takes advantage of the fact that symbol names in the
+operator. It then takes advantage of the fact that symbol names in the
 LLVM symbol table are allowed to have any character in them, including
 embedded nul characters.
 
@@ -251,7 +252,7 @@ default case for our existing binary operator node:
       case '<':
         L = Builder.CreateFCmpULT(L, R, "cmptmp");
         // Convert bool 0/1 to double 0.0 or 1.0
-        return Builder.CreateUIToFP(L, Type::getDoubleTy(LLVMContext),
+        return Builder.CreateUIToFP(L, Type::getDoubleTy(TheContext),
                                     "booltmp");
       default:
         break;
@@ -259,7 +260,7 @@ default case for our existing binary operator node:
 
       // If it wasn't a builtin binary operator, it must be a user defined one. Emit
       // a call to it.
-      Function *F = TheModule->getFunction(std::string("binary") + Op);
+      Function *F = getFunction(std::string("binary") + Op);
       assert(F && "binary operator not found!");
 
       Value *Ops[2] = { L, R };
@@ -277,22 +278,21 @@ The final piece of code we are missing, is a bit of top-level magic:
 .. code-block:: c++
 
     Function *FunctionAST::codegen() {
-      NamedValues.clear();
-
-      Function *TheFunction = Proto->codegen();
+      // Transfer ownership of the prototype to the FunctionProtos map, but keep a
+      // reference to it for use below.
+      auto &P = *Proto;
+      FunctionProtos[Proto->getName()] = std::move(Proto);
+      Function *TheFunction = getFunction(P.getName());
       if (!TheFunction)
         return nullptr;
 
       // If this is an operator, install it.
-      if (Proto->isBinaryOp())
-        BinopPrecedence[Proto->getOperatorName()] = Proto->getBinaryPrecedence();
+      if (P.isBinaryOp())
+        BinopPrecedence[P.getOperatorName()] = P.getBinaryPrecedence();
 
       // Create a new basic block to start insertion into.
-      BasicBlock *BB = BasicBlock::Create(LLVMContext, "entry", TheFunction);
-      Builder.SetInsertPoint(BB);
-
-      if (Value *RetVal = Body->codegen()) {
-        ...
+      BasicBlock *BB = BasicBlock::Create(TheContext, "entry", TheFunction);
+      ...
 
 Basically, before codegening a function, if it is a user-defined
 operator, we register it in the precedence table. This allows the binary
@@ -323,7 +323,8 @@ that, we need an AST node:
     public:
       UnaryExprAST(char Opcode, std::unique_ptr<ExprAST> Operand)
         : Opcode(Opcode), Operand(std::move(Operand)) {}
-      virtual Value *codegen();
+
+      Value *codegen() override;
     };
 
 This AST node is very simple and obvious by now. It directly mirrors the
@@ -345,7 +346,7 @@ simple: we'll add a new function to do it:
       int Opc = CurTok;
       getNextToken();
       if (auto Operand = ParseUnary())
-        return llvm::unique_ptr<UnaryExprAST>(Opc, std::move(Operand));
+        return llvm::make_unique<UnaryExprAST>(Opc, std::move(Operand));
       return nullptr;
     }
 
@@ -433,7 +434,7 @@ unary operators. It looks like this:
       if (!OperandV)
         return nullptr;
 
-      Function *F = TheModule->getFunction(std::string("unary")+Opcode);
+      Function *F = getFunction(std::string("unary") + Opcode);
       if (!F)
         return LogErrorV("Unknown unary operator");
 
@@ -461,7 +462,7 @@ newline):
     declare double @printd(double)
 
     ready> def binary : 1 (x y) 0;  # Low-precedence operator that ignores operands.
-    ..
+    ...
     ready> printd(123) : printd(456) : printd(789);
     123.000000
     456.000000
@@ -518,10 +519,9 @@ denser the character:
 
 ::
 
-    ready>
-
-    extern putchard(char)
-    def printdensity(d)
+    ready> extern putchard(char);
+    ...
+    ready> def printdensity(d)
       if d > 8 then
         putchard(32)  # ' '
       else if d > 4 then
@@ -538,9 +538,9 @@ denser the character:
     Evaluated to 0.000000
 
 Based on these simple primitive operations, we can start to define more
-interesting things. For example, here's a little function that solves
-for the number of iterations it takes a function in the complex plane to
-converge:
+interesting things. For example, here's a little function that determines
+the number of iterations it takes for a certain function in the complex
+plane to diverge:
 
 ::
 
@@ -742,7 +742,7 @@ Full Code Listing
 =================
 
 Here is the complete code listing for our running example, enhanced with
-the if/then/else and for expressions.. To build this example, use:
+the support for user-defined operators. To build this example, use:
 
 .. code-block:: bash
 
diff --git a/docs/tutorial/LangImpl07.rst b/docs/tutorial/LangImpl07.rst
index 4d86ecad38aaa342fc0f67dba6c31788842f0c4e..582645f449b4156cb6e20e81bcb012d0d03b7a06 100644
--- a/docs/tutorial/LangImpl07.rst
+++ b/docs/tutorial/LangImpl07.rst
@@ -327,7 +327,7 @@ to update:
 
     static std::map<std::string, AllocaInst*> NamedValues;
 
-Also, since we will need to create these alloca's, we'll use a helper
+Also, since we will need to create these allocas, we'll use a helper
 function that ensures that the allocas are created in the entry block of
 the function:
 
@@ -339,7 +339,7 @@ the function:
                                               const std::string &VarName) {
       IRBuilder<> TmpB(&TheFunction->getEntryBlock(),
                      TheFunction->getEntryBlock().begin());
-      return TmpB.CreateAlloca(Type::getDoubleTy(LLVMContext), 0,
+      return TmpB.CreateAlloca(Type::getDoubleTy(TheContext), 0,
                                VarName.c_str());
     }
 
@@ -348,7 +348,7 @@ the first instruction (.begin()) of the entry block. It then creates an
 alloca with the expected name and returns it. Because all values in
 Kaleidoscope are doubles, there is no need to pass in a type to use.
 
-With this in place, the first functionality change we want to make is to
+With this in place, the first functionality change we want to make concerns
 variable references. In our new scheme, variables live on the stack, so
 code generating a reference to them actually needs to produce a load
 from the stack slot:
@@ -377,7 +377,7 @@ the unabridged code):
       // Create an alloca for the variable in the entry block.
       AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
 
-        // Emit the start code first, without 'variable' in scope.
+      // Emit the start code first, without 'variable' in scope.
       Value *StartVal = Start->codegen();
       if (!StartVal)
         return nullptr;
@@ -408,21 +408,25 @@ them. The code for this is also pretty simple:
 
 .. code-block:: c++
 
-    /// CreateArgumentAllocas - Create an alloca for each argument and register the
-    /// argument in the symbol table so that references to it will succeed.
-    void PrototypeAST::CreateArgumentAllocas(Function *F) {
-      Function::arg_iterator AI = F->arg_begin();
-      for (unsigned Idx = 0, e = Args.size(); Idx != e; ++Idx, ++AI) {
+    Function *FunctionAST::codegen() {
+      ...
+      Builder.SetInsertPoint(BB);
+
+      // Record the function arguments in the NamedValues map.
+      NamedValues.clear();
+      for (auto &Arg : TheFunction->args()) {
         // Create an alloca for this variable.
-        AllocaInst *Alloca = CreateEntryBlockAlloca(F, Args[Idx]);
+        AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, Arg.getName());
 
         // Store the initial value into the alloca.
-        Builder.CreateStore(AI, Alloca);
+        Builder.CreateStore(&Arg, Alloca);
 
         // Add arguments to variable symbol table.
-        NamedValues[Args[Idx]] = Alloca;
+        NamedValues[Arg.getName()] = Alloca;
       }
-    }
+
+      if (Value *RetVal = Body->codegen()) {
+        ...
 
 For each argument, we make an alloca, store the input value to the
 function into the alloca, and register the alloca as the memory location
@@ -434,15 +438,13 @@ get good codegen once again:
 
 .. code-block:: c++
 
-        // Set up the optimizer pipeline.  Start with registering info about how the
-        // target lays out data structures.
-        OurFPM.add(new DataLayout(*TheExecutionEngine->getDataLayout()));
         // Promote allocas to registers.
-        OurFPM.add(createPromoteMemoryToRegisterPass());
+        TheFPM->add(createPromoteMemoryToRegisterPass());
         // Do simple "peephole" optimizations and bit-twiddling optzns.
-        OurFPM.add(createInstructionCombiningPass());
+        TheFPM->add(createInstructionCombiningPass());
         // Reassociate expressions.
-        OurFPM.add(createReassociatePass());
+        TheFPM->add(createReassociatePass());
+        ...
 
 It is interesting to see what the code looks like before and after the
 mem2reg optimization runs. For example, this is the before/after code
@@ -454,7 +456,7 @@ for our recursive fib function. Before the optimization:
     entry:
       %x1 = alloca double
       store double %x, double* %x1
-      %x2 = load double* %x1
+      %x2 = load double, double* %x1
       %cmptmp = fcmp ult double %x2, 3.000000e+00
       %booltmp = uitofp i1 %cmptmp to double
       %ifcond = fcmp one double %booltmp, 0.000000e+00
@@ -464,10 +466,10 @@ for our recursive fib function. Before the optimization:
       br label %ifcont
 
     else:       ; preds = %entry
-      %x3 = load double* %x1
+      %x3 = load double, double* %x1
       %subtmp = fsub double %x3, 1.000000e+00
       %calltmp = call double @fib(double %subtmp)
-      %x4 = load double* %x1
+      %x4 = load double, double* %x1
       %subtmp5 = fsub double %x4, 2.000000e+00
       %calltmp6 = call double @fib(double %subtmp5)
       %addtmp = fadd double %calltmp, %calltmp6
@@ -677,10 +679,10 @@ var/in, it looks like this:
 
     public:
       VarExprAST(std::vector<std::pair<std::string, std::unique_ptr<ExprAST>>> VarNames,
-                 std::unique_ptr<ExprAST> body)
-      : VarNames(std::move(VarNames)), Body(std::move(Body)) {}
+                 std::unique_ptr<ExprAST> Body)
+        : VarNames(std::move(VarNames)), Body(std::move(Body)) {}
 
-      virtual Value *codegen();
+      Value *codegen() override;
     };
 
 var/in allows a list of names to be defined all at once, and each name
@@ -812,7 +814,7 @@ previous value that we replace in OldBindings.
           if (!InitVal)
             return nullptr;
         } else { // If not specified, use 0.0.
-          InitVal = ConstantFP::get(LLVMContext, APFloat(0.0));
+          InitVal = ConstantFP::get(TheContext, APFloat(0.0));
         }
 
         AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, VarName);
diff --git a/docs/tutorial/LangImpl09.rst b/docs/tutorial/LangImpl09.rst
index 0053960756d29489c25f8e329a49ed5d39714006..fe5a95a5769ec7210b7fc3b8e684dcc069cc8b35 100644
--- a/docs/tutorial/LangImpl09.rst
+++ b/docs/tutorial/LangImpl09.rst
@@ -18,7 +18,7 @@ Source level debugging uses formatted data that helps a debugger
 translate from binary and the state of the machine back to the
 source that the programmer wrote. In LLVM we generally use a format
 called `DWARF <http://dwarfstd.org>`_. DWARF is a compact encoding
-that represents types, source locations, and variable locations. 
+that represents types, source locations, and variable locations.
 
 The short summary of this chapter is that we'll go through the
 various things you have to add to a programming language to
@@ -94,14 +94,14 @@ Then we're going to remove the command line code wherever it exists:
          return;
   @@ -1184,7 +1183,6 @@ int main() {
      BinopPrecedence['*'] = 40; // highest.
- 
+
      // Prime the first token.
   -  fprintf(stderr, "ready> ");
      getNextToken();
- 
+
 Lastly we're going to disable all of the optimization passes and the JIT so
 that the only thing that happens after we're done parsing and generating
-code is that the llvm IR goes to standard error:
+code is that the LLVM IR goes to standard error:
 
 .. code-block:: udiff
 
@@ -140,7 +140,7 @@ code is that the llvm IR goes to standard error:
   -
   +  #endif
      OurFPM.doInitialization();
- 
+
      // Set the global so the code gen can use this.
 
 This relatively small set of changes get us to the point that we can compile
@@ -166,8 +166,8 @@ DWARF Emission Setup
 
 Similar to the ``IRBuilder`` class we have a
 `DIBuilder <http://llvm.org/doxygen/classllvm_1_1DIBuilder.html>`_ class
-that helps in constructing debug metadata for an llvm IR file. It
-corresponds 1:1 similarly to ``IRBuilder`` and llvm IR, but with nicer names.
+that helps in constructing debug metadata for an LLVM IR file. It
+corresponds 1:1 similarly to ``IRBuilder`` and LLVM IR, but with nicer names.
 Using it does require that you be more familiar with DWARF terminology than
 you needed to be with ``IRBuilder`` and ``Instruction`` names, but if you
 read through the general documentation on the
@@ -194,7 +194,7 @@ expressions:
   } KSDbgInfo;
 
   DIType *DebugInfo::getDoubleTy() {
-    if (DblTy.isValid())
+    if (DblTy)
       return DblTy;
 
     DblTy = DBuilder->createBasicType("double", 64, 64, dwarf::DW_ATE_float);
@@ -214,7 +214,7 @@ There are a couple of things to note here. First, while we're producing a
 compile unit for a language called Kaleidoscope we used the language
 constant for C. This is because a debugger wouldn't necessarily understand
 the calling conventions or default ABI for a language it doesn't recognize
-and we follow the C ABI in our llvm code generation so it's the closest
+and we follow the C ABI in our LLVM code generation so it's the closest
 thing to accurate. This ensures we can actually call functions from the
 debugger and have them execute. Secondly, you'll see the "fib.ks" in the
 call to ``createCompileUnit``. This is a default hard coded value since
@@ -259,10 +259,11 @@ information) and construct our function definition:
   unsigned LineNo = 0;
   unsigned ScopeLine = 0;
   DISubprogram *SP = DBuilder->createFunction(
-      FContext, Name, StringRef(), Unit, LineNo,
-      CreateFunctionType(Args.size(), Unit), false /* internal linkage */,
-      true /* definition */, ScopeLine, DINode::FlagPrototyped, false);
-  F->setSubprogram(SP);
+      FContext, P.getName(), StringRef(), Unit, LineNo,
+      CreateFunctionType(TheFunction->arg_size(), Unit),
+      false /* internal linkage */, true /* definition */, ScopeLine,
+      DINode::FlagPrototyped, false);
+  TheFunction->setSubprogram(SP);
 
 and we now have an DISubprogram that contains a reference to all of our
 metadata for the function.
@@ -326,10 +327,9 @@ that we pass down through when we create a new expression:
 
 giving us locations for each of our expressions and variables.
 
-From this we can make sure to tell ``DIBuilder`` when we're at a new source
-location so it can use that when we generate the rest of our code and make
-sure that each instruction has source location information. We do this
-by constructing another small function:
+To make sure that every instruction gets proper source location information,
+we have to tell ``Builder`` whenever we're at a new source location.
+We use a small helper function for this:
 
 .. code-block:: c++
 
@@ -343,40 +343,23 @@ by constructing another small function:
         DebugLoc::get(AST->getLine(), AST->getCol(), Scope));
   }
 
-that both tells the main ``IRBuilder`` where we are, but also what scope
-we're in. Since we've just created a function above we can either be in
-the main file scope (like when we created our function), or now we can be
-in the function scope we just created. To represent this we create a stack
-of scopes:
+This tells the main ``IRBuilder`` not only where we are, but also what scope
+we're in. The scope can either be on compile-unit level or be the nearest
+enclosing lexical block like the current function.
+To represent this we create a stack of scopes:
 
 .. code-block:: c++
 
    std::vector<DIScope *> LexicalBlocks;
-   std::map<const PrototypeAST *, DIScope *> FnScopeMap;
-
-and keep a map of each function to the scope that it represents (an
-DISubprogram is also an DIScope).
-
-Then we make sure to:
-
-.. code-block:: c++
-
-   KSDbgInfo.emitLocation(this);
 
-emit the location every time we start to generate code for a new AST, and
-also:
+and push the scope (function) to the top of the stack when we start
+generating the code for each function:
 
 .. code-block:: c++
 
-  KSDbgInfo.FnScopeMap[this] = SP;
-
-store the scope (function) when we create it and use it:
-
-  KSDbgInfo.LexicalBlocks.push_back(&KSDbgInfo.FnScopeMap[Proto]);
-
-when we start generating the code for each function.
+  KSDbgInfo.LexicalBlocks.push_back(SP);
 
-also, don't forget to pop the scope back off of your scope stack at the
+Also, we must not forget to pop the scope back off of the scope stack at the
 end of the code generation for the function:
 
 .. code-block:: c++
@@ -385,6 +368,13 @@ end of the code generation for the function:
   // unconditionally.
   KSDbgInfo.LexicalBlocks.pop_back();
 
+Then we make sure to emit the location every time we start to generate code
+for a new AST object:
+
+.. code-block:: c++
+
+   KSDbgInfo.emitLocation(this);
+
 Variables
 =========
 
@@ -392,25 +382,37 @@ Now that we have functions, we need to be able to print out the variables
 we have in scope. Let's get our function arguments set up so we can get
 decent backtraces and see how our functions are being called. It isn't
 a lot of code, and we generally handle it when we're creating the
-argument allocas in ``PrototypeAST::CreateArgumentAllocas``.
+argument allocas in ``FunctionAST::codegen``.
 
 .. code-block:: c++
 
-  DIScope *Scope = KSDbgInfo.LexicalBlocks.back();
-  DIFile *Unit = DBuilder->createFile(KSDbgInfo.TheCU.getFilename(),
-                                      KSDbgInfo.TheCU.getDirectory());
-  DILocalVariable D = DBuilder->createParameterVariable(
-      Scope, Args[Idx], Idx + 1, Unit, Line, KSDbgInfo.getDoubleTy(), true);
+    // Record the function arguments in the NamedValues map.
+    NamedValues.clear();
+    unsigned ArgIdx = 0;
+    for (auto &Arg : TheFunction->args()) {
+      // Create an alloca for this variable.
+      AllocaInst *Alloca = CreateEntryBlockAlloca(TheFunction, Arg.getName());
+
+      // Create a debug descriptor for the variable.
+      DILocalVariable *D = DBuilder->createParameterVariable(
+          SP, Arg.getName(), ++ArgIdx, Unit, LineNo, KSDbgInfo.getDoubleTy(),
+          true);
+
+      DBuilder->insertDeclare(Alloca, D, DBuilder->createExpression(),
+                              DebugLoc::get(LineNo, 0, SP),
+                              Builder.GetInsertBlock());
+
+      // Store the initial value into the alloca.
+      Builder.CreateStore(&Arg, Alloca);
+
+      // Add arguments to variable symbol table.
+      NamedValues[Arg.getName()] = Alloca;
+    }
 
-  DBuilder->insertDeclare(Alloca, D, DBuilder->createExpression(),
-                          DebugLoc::get(Line, 0, Scope),
-                          Builder.GetInsertBlock());
 
-Here we're doing a few things. First, we're grabbing our current scope
-for the variable so we can say what range of code our variable is valid
-through. Second, we're creating the variable, giving it the scope,
+Here we're first creating the variable, giving it the scope (``SP``),
 the name, source location, type, and since it's an argument, the argument
-index. Third, we create an ``lvm.dbg.declare`` call to indicate at the IR
+index. Next, we create an ``llvm.dbg.declare`` call to indicate at the IR
 level that we've got a variable in an alloca (and it gives a starting
 location for the variable), and setting a source location for the
 beginning of the scope on the declare.
@@ -420,7 +422,7 @@ assumptions based on how code and debug information was generated for them
 in the past. In this case we need to do a little bit of a hack to avoid
 generating line information for the function prologue so that the debugger
 knows to skip over those instructions when setting a breakpoint. So in
-``FunctionAST::CodeGen`` we add a couple of lines:
+``FunctionAST::codegen`` we add some more lines:
 
 .. code-block:: c++
 
@@ -434,7 +436,7 @@ body of the function:
 
 .. code-block:: c++
 
-  KSDbgInfo.emitLocation(Body);
+  KSDbgInfo.emitLocation(Body.get());
 
 With this we have enough debug information to set breakpoints in functions,
 print out argument variables, and call functions. Not too bad for just a
diff --git a/docs/tutorial/OCamlLangImpl5.rst b/docs/tutorial/OCamlLangImpl5.rst
index 3a135b23337339ce6665c6e41a11b7b95253be5f..6e17de4b2bde8d6d5fca60bb681ac8fef184e630 100644
--- a/docs/tutorial/OCamlLangImpl5.rst
+++ b/docs/tutorial/OCamlLangImpl5.rst
@@ -103,19 +103,7 @@ Parser Extensions for If/Then/Else
 
 Now that we have the relevant tokens coming from the lexer and we have
 the AST node to build, our parsing logic is relatively straightforward.
-First we define a new parsing function:
-
-.. code-block:: ocaml
-
-    let rec parse_primary = parser
-      ...
-      (* ifexpr ::= 'if' expr 'then' expr 'else' expr *)
-      | [< 'Token.If; c=parse_expr;
-           'Token.Then ?? "expected 'then'"; t=parse_expr;
-           'Token.Else ?? "expected 'else'"; e=parse_expr >] ->
-          Ast.If (c, t, e)
-
-Next we hook it up as a primary expression:
+We add a new case for parsing an if-expression as a primary expression:
 
 .. code-block:: ocaml
 
diff --git a/examples/BrainF/BrainF.cpp b/examples/BrainF/BrainF.cpp
index 91d813a6c3bb4d4b8188cc675ad61d404b5a917e..8af34d04701e32ef713e0af0509321fc0112c39e 100644
--- a/examples/BrainF/BrainF.cpp
+++ b/examples/BrainF/BrainF.cpp
@@ -74,18 +74,18 @@ void BrainF::header(LLVMContext& C) {
 
   //declare i32 @getchar()
   getchar_func = cast<Function>(module->
-    getOrInsertFunction("getchar", IntegerType::getInt32Ty(C), NULL));
+    getOrInsertFunction("getchar", IntegerType::getInt32Ty(C)));
 
   //declare i32 @putchar(i32)
   putchar_func = cast<Function>(module->
     getOrInsertFunction("putchar", IntegerType::getInt32Ty(C),
-                        IntegerType::getInt32Ty(C), NULL));
+                        IntegerType::getInt32Ty(C)));
 
   //Function header
 
   //define void @brainf()
   brainf_func = cast<Function>(module->
-    getOrInsertFunction("brainf", Type::getVoidTy(C), NULL));
+    getOrInsertFunction("brainf", Type::getVoidTy(C)));
 
   builder = new IRBuilder<>(BasicBlock::Create(C, label, brainf_func));
 
@@ -156,7 +156,7 @@ void BrainF::header(LLVMContext& C) {
     //declare i32 @puts(i8 *)
     Function *puts_func = cast<Function>(module->
       getOrInsertFunction("puts", IntegerType::getInt32Ty(C),
-                      PointerType::getUnqual(IntegerType::getInt8Ty(C)), NULL));
+                      PointerType::getUnqual(IntegerType::getInt8Ty(C))));
 
     //brainf.aberror:
     aberrorbb = BasicBlock::Create(C, label, brainf_func);
diff --git a/examples/BrainF/BrainFDriver.cpp b/examples/BrainF/BrainFDriver.cpp
index d704506d2442733d823bfa9e4d8cac0d63e92226..65f8033a7e27ecd44b0f04331fd074edd41c69f6 100644
--- a/examples/BrainF/BrainFDriver.cpp
+++ b/examples/BrainF/BrainFDriver.cpp
@@ -77,7 +77,7 @@ void addMainFunction(Module *mod) {
     getOrInsertFunction("main", IntegerType::getInt32Ty(mod->getContext()),
                         IntegerType::getInt32Ty(mod->getContext()),
                         PointerType::getUnqual(PointerType::getUnqual(
-                          IntegerType::getInt8Ty(mod->getContext()))), NULL));
+                          IntegerType::getInt8Ty(mod->getContext())))));
   {
     Function::arg_iterator args = main_func->arg_begin();
     Value *arg_0 = &*args++;
diff --git a/examples/Fibonacci/fibonacci.cpp b/examples/Fibonacci/fibonacci.cpp
index 16e52bf04099053916b76ba2add86b9381dcff87..662cb01dd37e9de96651d5dc71627d13047ed030 100644
--- a/examples/Fibonacci/fibonacci.cpp
+++ b/examples/Fibonacci/fibonacci.cpp
@@ -54,8 +54,7 @@ static Function *CreateFibFunction(Module *M, LLVMContext &Context) {
   // to return an int and take an int parameter.
   Function *FibF =
     cast<Function>(M->getOrInsertFunction("fib", Type::getInt32Ty(Context),
-                                          Type::getInt32Ty(Context),
-                                          nullptr));
+                                          Type::getInt32Ty(Context)));
 
   // Add a basic block to the function.
   BasicBlock *BB = BasicBlock::Create(Context, "EntryBlock", FibF);
diff --git a/examples/HowToUseJIT/HowToUseJIT.cpp b/examples/HowToUseJIT/HowToUseJIT.cpp
index 0050d27b45d7fbbbca446fab128c0a14c9d4a7d3..f141fa5a7f541037a42643e309fa32e005f32ae3 100644
--- a/examples/HowToUseJIT/HowToUseJIT.cpp
+++ b/examples/HowToUseJIT/HowToUseJIT.cpp
@@ -69,11 +69,9 @@ int main() {
 
   // Create the add1 function entry and insert this entry into module M.  The
   // function will have a return type of "int" and take an argument of "int".
-  // The '0' terminates the list of argument types.
   Function *Add1F =
     cast<Function>(M->getOrInsertFunction("add1", Type::getInt32Ty(Context),
-                                          Type::getInt32Ty(Context),
-                                          nullptr));
+                                          Type::getInt32Ty(Context)));
 
   // Add a basic block to the function. As before, it automatically inserts
   // because of the last argument.
@@ -102,8 +100,7 @@ int main() {
   // Now we're going to create function `foo', which returns an int and takes no
   // arguments.
   Function *FooF =
-    cast<Function>(M->getOrInsertFunction("foo", Type::getInt32Ty(Context),
-                                          nullptr));
+    cast<Function>(M->getOrInsertFunction("foo", Type::getInt32Ty(Context)));
 
   // Add a basic block to the FooF function.
   BB = BasicBlock::Create(Context, "EntryBlock", FooF);
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h
index 22716b2b5215514f9cd4622f95e2226ad24ba791..a14fd1dc20eca72df000180fdde53587bc923b15 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter1/KaleidoscopeJIT.h
@@ -22,7 +22,7 @@
 #include "llvm/ExecutionEngine/Orc/CompileUtils.h"
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
-#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/Support/DynamicLibrary.h"
@@ -40,7 +40,7 @@ class KaleidoscopeJIT {
 private:
   std::unique_ptr<TargetMachine> TM;
   const DataLayout DL;
-  ObjectLinkingLayer<> ObjectLayer;
+  RTDyldObjectLinkingLayer<> ObjectLayer;
   IRCompileLayer<decltype(ObjectLayer)> CompileLayer;
 
 public:
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h
index 91d903029a00a9059f2091e48abb91a9f40032eb..2039be4571a59956b7225518bd98dcdde1c5644c 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter2/KaleidoscopeJIT.h
@@ -23,7 +23,7 @@
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "llvm/ExecutionEngine/Orc/IRTransformLayer.h"
 #include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
-#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Mangler.h"
@@ -44,7 +44,7 @@ class KaleidoscopeJIT {
 private:
   std::unique_ptr<TargetMachine> TM;
   const DataLayout DL;
-  ObjectLinkingLayer<> ObjectLayer;
+  RTDyldObjectLinkingLayer<> ObjectLayer;
   IRCompileLayer<decltype(ObjectLayer)> CompileLayer;
 
   typedef std::function<std::unique_ptr<Module>(std::unique_ptr<Module>)>
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h
index eefe6a551fa5264a6155e3c20b670f01e698a7d1..d22d41855072ca62bb03e9dea03f574e1f720d79 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter3/KaleidoscopeJIT.h
@@ -24,7 +24,7 @@
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "llvm/ExecutionEngine/Orc/IRTransformLayer.h"
 #include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
-#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Mangler.h"
@@ -46,7 +46,7 @@ class KaleidoscopeJIT {
 private:
   std::unique_ptr<TargetMachine> TM;
   const DataLayout DL;
-  ObjectLinkingLayer<> ObjectLayer;
+  RTDyldObjectLinkingLayer<> ObjectLayer;
   IRCompileLayer<decltype(ObjectLayer)> CompileLayer;
 
   typedef std::function<std::unique_ptr<Module>(std::unique_ptr<Module>)>
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h
index 527d4be09f0f97f8e4763f1a4b4cf2f2d2db660e..e0a78410f7134953450ab3978ec6a439b3a942ee 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter4/KaleidoscopeJIT.h
@@ -24,7 +24,7 @@
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "llvm/ExecutionEngine/Orc/IRTransformLayer.h"
 #include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
-#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Mangler.h"
@@ -73,7 +73,7 @@ class KaleidoscopeJIT {
 private:
   std::unique_ptr<TargetMachine> TM;
   const DataLayout DL;
-  ObjectLinkingLayer<> ObjectLayer;
+  RTDyldObjectLinkingLayer<> ObjectLayer;
   IRCompileLayer<decltype(ObjectLayer)> CompileLayer;
 
   typedef std::function<std::unique_ptr<Module>(std::unique_ptr<Module>)>
diff --git a/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h b/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h
index d1ef3c9549ffa59214e93fd057a8a34e353cf8f7..70a896fe8f007b908826aaa8ca351133803e47fe 100644
--- a/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/BuildingAJIT/Chapter5/KaleidoscopeJIT.h
@@ -26,7 +26,7 @@
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "llvm/ExecutionEngine/Orc/IRTransformLayer.h"
 #include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
-#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/LegacyPassManager.h"
@@ -79,7 +79,7 @@ class KaleidoscopeJIT {
 private:
   std::unique_ptr<TargetMachine> TM;
   const DataLayout DL;
-  ObjectLinkingLayer<> ObjectLayer;
+  RTDyldObjectLinkingLayer<> ObjectLayer;
   IRCompileLayer<decltype(ObjectLayer)> CompileLayer;
 
   typedef std::function<std::unique_ptr<Module>(std::unique_ptr<Module>)>
diff --git a/examples/Kaleidoscope/Chapter2/toy.cpp b/examples/Kaleidoscope/Chapter2/toy.cpp
index 8357c5b63fb70bc99426a7b820a953373c183775..4dc917e3f06f3293bac74f00c35913c877e3c9b2 100644
--- a/examples/Kaleidoscope/Chapter2/toy.cpp
+++ b/examples/Kaleidoscope/Chapter2/toy.cpp
@@ -140,6 +140,8 @@ class PrototypeAST {
 public:
   PrototypeAST(const std::string &Name, std::vector<std::string> Args)
       : Name(Name), Args(std::move(Args)) {}
+
+  const std::string &getName() const { return Name; }
 };
 
 /// FunctionAST - This class represents a function definition itself.
diff --git a/examples/Kaleidoscope/Chapter4/toy.cpp b/examples/Kaleidoscope/Chapter4/toy.cpp
index 3bd077b6e3886f0b3aed16e1e87af45bc2306f64..cf7d6c2bee04e5b0b532d19c2c9d57b0058f63d4 100644
--- a/examples/Kaleidoscope/Chapter4/toy.cpp
+++ b/examples/Kaleidoscope/Chapter4/toy.cpp
@@ -650,14 +650,20 @@ static void MainLoop() {
 // "Library" functions that can be "extern'd" from user code.
 //===----------------------------------------------------------------------===//
 
+#ifdef LLVM_ON_WIN32
+#define DLLEXPORT __declspec(dllexport)
+#else
+#define DLLEXPORT
+#endif
+
 /// putchard - putchar that takes a double and returns 0.
-extern "C" double putchard(double X) {
+extern "C" DLLEXPORT double putchard(double X) {
   fputc((char)X, stderr);
   return 0;
 }
 
 /// printd - printf that takes a double prints it as "%f\n", returning 0.
-extern "C" double printd(double X) {
+extern "C" DLLEXPORT double printd(double X) {
   fprintf(stderr, "%f\n", X);
   return 0;
 }
diff --git a/examples/Kaleidoscope/Chapter5/toy.cpp b/examples/Kaleidoscope/Chapter5/toy.cpp
index 795f49c847ea71a421b7895efd4fca976a5b9760..6852973bae40020d0983055ee043cc6763327709 100644
--- a/examples/Kaleidoscope/Chapter5/toy.cpp
+++ b/examples/Kaleidoscope/Chapter5/toy.cpp
@@ -622,7 +622,7 @@ Value *IfExprAST::codegen() {
   if (!CondV)
     return nullptr;
 
-  // Convert condition to a bool by comparing equal to 0.0.
+  // Convert condition to a bool by comparing non-equal to 0.0.
   CondV = Builder.CreateFCmpONE(
       CondV, ConstantFP::get(TheContext, APFloat(0.0)), "ifcond");
 
@@ -736,7 +736,7 @@ Value *ForExprAST::codegen() {
   if (!EndCond)
     return nullptr;
 
-  // Convert condition to a bool by comparing equal to 0.0.
+  // Convert condition to a bool by comparing non-equal to 0.0.
   EndCond = Builder.CreateFCmpONE(
       EndCond, ConstantFP::get(TheContext, APFloat(0.0)), "loopcond");
 
@@ -924,14 +924,20 @@ static void MainLoop() {
 // "Library" functions that can be "extern'd" from user code.
 //===----------------------------------------------------------------------===//
 
+#ifdef LLVM_ON_WIN32
+#define DLLEXPORT __declspec(dllexport)
+#else
+#define DLLEXPORT
+#endif
+
 /// putchard - putchar that takes a double and returns 0.
-extern "C" double putchard(double X) {
+extern "C" DLLEXPORT double putchard(double X) {
   fputc((char)X, stderr);
   return 0;
 }
 
 /// printd - printf that takes a double prints it as "%f\n", returning 0.
-extern "C" double printd(double X) {
+extern "C" DLLEXPORT double printd(double X) {
   fprintf(stderr, "%f\n", X);
   return 0;
 }
diff --git a/examples/Kaleidoscope/Chapter6/toy.cpp b/examples/Kaleidoscope/Chapter6/toy.cpp
index 19e25d37dcdedcc062e2b206de188137ea638b2a..1e0ddca29b61f197555e25b971fd6d4ee4f80af0 100644
--- a/examples/Kaleidoscope/Chapter6/toy.cpp
+++ b/examples/Kaleidoscope/Chapter6/toy.cpp
@@ -567,7 +567,7 @@ static std::unique_ptr<PrototypeAST> ParsePrototype() {
     // Read the precedence if present.
     if (CurTok == tok_number) {
       if (NumVal < 1 || NumVal > 100)
-        return LogErrorP("Invalid precedecnce: must be 1..100");
+        return LogErrorP("Invalid precedence: must be 1..100");
       BinaryPrecedence = (unsigned)NumVal;
       getNextToken();
     }
@@ -734,7 +734,7 @@ Value *IfExprAST::codegen() {
   if (!CondV)
     return nullptr;
 
-  // Convert condition to a bool by comparing equal to 0.0.
+  // Convert condition to a bool by comparing non-equal to 0.0.
   CondV = Builder.CreateFCmpONE(
       CondV, ConstantFP::get(TheContext, APFloat(0.0)), "ifcond");
 
@@ -848,7 +848,7 @@ Value *ForExprAST::codegen() {
   if (!EndCond)
     return nullptr;
 
-  // Convert condition to a bool by comparing equal to 0.0.
+  // Convert condition to a bool by comparing non-equal to 0.0.
   EndCond = Builder.CreateFCmpONE(
       EndCond, ConstantFP::get(TheContext, APFloat(0.0)), "loopcond");
 
@@ -1043,14 +1043,20 @@ static void MainLoop() {
 // "Library" functions that can be "extern'd" from user code.
 //===----------------------------------------------------------------------===//
 
+#ifdef LLVM_ON_WIN32
+#define DLLEXPORT __declspec(dllexport)
+#else
+#define DLLEXPORT
+#endif
+
 /// putchard - putchar that takes a double and returns 0.
-extern "C" double putchard(double X) {
+extern "C" DLLEXPORT double putchard(double X) {
   fputc((char)X, stderr);
   return 0;
 }
 
 /// printd - printf that takes a double prints it as "%f\n", returning 0.
-extern "C" double printd(double X) {
+extern "C" DLLEXPORT double printd(double X) {
   fprintf(stderr, "%f\n", X);
   return 0;
 }
diff --git a/examples/Kaleidoscope/Chapter7/CMakeLists.txt b/examples/Kaleidoscope/Chapter7/CMakeLists.txt
index e67d7928efe79539522bb8e955de6c522469aa31..69e78be6a620757daaf4bb13227a892b04cca4e4 100644
--- a/examples/Kaleidoscope/Chapter7/CMakeLists.txt
+++ b/examples/Kaleidoscope/Chapter7/CMakeLists.txt
@@ -7,6 +7,7 @@ set(LLVM_LINK_COMPONENTS
   RuntimeDyld
   ScalarOpts
   Support
+  TransformUtils
   native
   )
 
diff --git a/examples/Kaleidoscope/Chapter7/toy.cpp b/examples/Kaleidoscope/Chapter7/toy.cpp
index 7e723ba0397bd1cae4e737977ec2d7b057c71fdb..2f8cb682a847141dfc6fa348a25af61107b71d17 100644
--- a/examples/Kaleidoscope/Chapter7/toy.cpp
+++ b/examples/Kaleidoscope/Chapter7/toy.cpp
@@ -639,7 +639,7 @@ static std::unique_ptr<PrototypeAST> ParsePrototype() {
     // Read the precedence if present.
     if (CurTok == tok_number) {
       if (NumVal < 1 || NumVal > 100)
-        return LogErrorP("Invalid precedecnce: must be 1..100");
+        return LogErrorP("Invalid precedence: must be 1..100");
       BinaryPrecedence = (unsigned)NumVal;
       getNextToken();
     }
@@ -840,7 +840,7 @@ Value *IfExprAST::codegen() {
   if (!CondV)
     return nullptr;
 
-  // Convert condition to a bool by comparing equal to 0.0.
+  // Convert condition to a bool by comparing non-equal to 0.0.
   CondV = Builder.CreateFCmpONE(
       CondV, ConstantFP::get(TheContext, APFloat(0.0)), "ifcond");
 
@@ -963,7 +963,7 @@ Value *ForExprAST::codegen() {
   Value *NextVar = Builder.CreateFAdd(CurVar, StepVal, "nextvar");
   Builder.CreateStore(NextVar, Alloca);
 
-  // Convert condition to a bool by comparing equal to 0.0.
+  // Convert condition to a bool by comparing non-equal to 0.0.
   EndCond = Builder.CreateFCmpONE(
       EndCond, ConstantFP::get(TheContext, APFloat(0.0)), "loopcond");
 
@@ -1115,6 +1115,8 @@ static void InitializeModuleAndPassManager() {
   // Create a new pass manager attached to it.
   TheFPM = llvm::make_unique<legacy::FunctionPassManager>(TheModule.get());
 
+  // Promote allocas to registers.
+  TheFPM->add(createPromoteMemoryToRegisterPass());
   // Do simple "peephole" optimizations and bit-twiddling optzns.
   TheFPM->add(createInstructionCombiningPass());
   // Reassociate expressions.
@@ -1210,14 +1212,20 @@ static void MainLoop() {
 // "Library" functions that can be "extern'd" from user code.
 //===----------------------------------------------------------------------===//
 
+#ifdef LLVM_ON_WIN32
+#define DLLEXPORT __declspec(dllexport)
+#else
+#define DLLEXPORT
+#endif
+
 /// putchard - putchar that takes a double and returns 0.
-extern "C" double putchard(double X) {
+extern "C" DLLEXPORT double putchard(double X) {
   fputc((char)X, stderr);
   return 0;
 }
 
 /// printd - printf that takes a double prints it as "%f\n", returning 0.
-extern "C" double printd(double X) {
+extern "C" DLLEXPORT double printd(double X) {
   fprintf(stderr, "%f\n", X);
   return 0;
 }
diff --git a/examples/Kaleidoscope/Chapter8/toy.cpp b/examples/Kaleidoscope/Chapter8/toy.cpp
index 354380adfc4a8c6bb7cda763d5483bdf57885d96..cdf650973b86022fc65162ad053ca3345312ee1f 100644
--- a/examples/Kaleidoscope/Chapter8/toy.cpp
+++ b/examples/Kaleidoscope/Chapter8/toy.cpp
@@ -642,7 +642,7 @@ static std::unique_ptr<PrototypeAST> ParsePrototype() {
     // Read the precedence if present.
     if (CurTok == tok_number) {
       if (NumVal < 1 || NumVal > 100)
-        return LogErrorP("Invalid precedecnce: must be 1..100");
+        return LogErrorP("Invalid precedence: must be 1..100");
       BinaryPrecedence = (unsigned)NumVal;
       getNextToken();
     }
@@ -841,7 +841,7 @@ Value *IfExprAST::codegen() {
   if (!CondV)
     return nullptr;
 
-  // Convert condition to a bool by comparing equal to 0.0.
+  // Convert condition to a bool by comparing non-equal to 0.0.
   CondV = Builder.CreateFCmpONE(
       CondV, ConstantFP::get(TheContext, APFloat(0.0)), "ifcond");
 
@@ -964,7 +964,7 @@ Value *ForExprAST::codegen() {
   Value *NextVar = Builder.CreateFAdd(CurVar, StepVal, "nextvar");
   Builder.CreateStore(NextVar, Alloca);
 
-  // Convert condition to a bool by comparing equal to 0.0.
+  // Convert condition to a bool by comparing non-equal to 0.0.
   EndCond = Builder.CreateFCmpONE(
       EndCond, ConstantFP::get(TheContext, APFloat(0.0)), "loopcond");
 
@@ -1173,14 +1173,20 @@ static void MainLoop() {
 // "Library" functions that can be "extern'd" from user code.
 //===----------------------------------------------------------------------===//
 
+#ifdef LLVM_ON_WIN32
+#define DLLEXPORT __declspec(dllexport)
+#else
+#define DLLEXPORT
+#endif
+
 /// putchard - putchar that takes a double and returns 0.
-extern "C" double putchard(double X) {
+extern "C" DLLEXPORT double putchard(double X) {
   fputc((char)X, stderr);
   return 0;
 }
 
 /// printd - printf that takes a double prints it as "%f\n", returning 0.
-extern "C" double printd(double X) {
+extern "C" DLLEXPORT double printd(double X) {
   fprintf(stderr, "%f\n", X);
   return 0;
 }
diff --git a/examples/Kaleidoscope/Chapter9/toy.cpp b/examples/Kaleidoscope/Chapter9/toy.cpp
index aa609933fc9d147bcec5d086f177c5257aa0cec7..1b13e45ec4601f6b3677e1772fc033ef0640b3bb 100644
--- a/examples/Kaleidoscope/Chapter9/toy.cpp
+++ b/examples/Kaleidoscope/Chapter9/toy.cpp
@@ -756,7 +756,7 @@ static std::unique_ptr<PrototypeAST> ParsePrototype() {
     // Read the precedence if present.
     if (CurTok == tok_number) {
       if (NumVal < 1 || NumVal > 100)
-        return LogErrorP("Invalid precedecnce: must be 1..100");
+        return LogErrorP("Invalid precedence: must be 1..100");
       BinaryPrecedence = (unsigned)NumVal;
       getNextToken();
     }
@@ -1004,7 +1004,7 @@ Value *IfExprAST::codegen() {
   if (!CondV)
     return nullptr;
 
-  // Convert condition to a bool by comparing equal to 0.0.
+  // Convert condition to a bool by comparing non-equal to 0.0.
   CondV = Builder.CreateFCmpONE(
       CondV, ConstantFP::get(TheContext, APFloat(0.0)), "ifcond");
 
@@ -1129,7 +1129,7 @@ Value *ForExprAST::codegen() {
   Value *NextVar = Builder.CreateFAdd(CurVar, StepVal, "nextvar");
   Builder.CreateStore(NextVar, Alloca);
 
-  // Convert condition to a bool by comparing equal to 0.0.
+  // Convert condition to a bool by comparing non-equal to 0.0.
   EndCond = Builder.CreateFCmpONE(
       EndCond, ConstantFP::get(TheContext, APFloat(0.0)), "loopcond");
 
@@ -1379,14 +1379,20 @@ static void MainLoop() {
 // "Library" functions that can be "extern'd" from user code.
 //===----------------------------------------------------------------------===//
 
+#ifdef LLVM_ON_WIN32
+#define DLLEXPORT __declspec(dllexport)
+#else
+#define DLLEXPORT
+#endif
+
 /// putchard - putchar that takes a double and returns 0.
-extern "C" double putchard(double X) {
+extern "C" DLLEXPORT double putchard(double X) {
   fputc((char)X, stderr);
   return 0;
 }
 
 /// printd - printf that takes a double prints it as "%f\n", returning 0.
-extern "C" double printd(double X) {
+extern "C" DLLEXPORT double printd(double X) {
   fprintf(stderr, "%f\n", X);
   return 0;
 }
diff --git a/examples/Kaleidoscope/include/KaleidoscopeJIT.h b/examples/Kaleidoscope/include/KaleidoscopeJIT.h
index 6130107bdd942e28e26f1242e3be172672d00601..1dca39deba3c39e052796b2b61d1c3efa3f81af7 100644
--- a/examples/Kaleidoscope/include/KaleidoscopeJIT.h
+++ b/examples/Kaleidoscope/include/KaleidoscopeJIT.h
@@ -24,7 +24,7 @@
 #include "llvm/ExecutionEngine/Orc/CompileUtils.h"
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
-#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/Support/DynamicLibrary.h"
@@ -40,7 +40,7 @@ namespace orc {
 
 class KaleidoscopeJIT {
 public:
-  typedef ObjectLinkingLayer<> ObjLayerT;
+  typedef RTDyldObjectLinkingLayer<> ObjLayerT;
   typedef IRCompileLayer<ObjLayerT> CompileLayerT;
   typedef CompileLayerT::ModuleSetHandleT ModuleHandleT;
 
@@ -97,17 +97,40 @@ private:
   }
 
   JITSymbol findMangledSymbol(const std::string &Name) {
+#ifdef LLVM_ON_WIN32
+    // RTDyldObjectLinkingLayer's symbol lookup uses SymbolRef::SF_Exported to
+    // decide whether a symbol is visible when IRCompileLayer::findSymbolIn is
+    // called with ExportedSymbolsOnly set to true.
+    //
+    // But for Windows COFF objects, this flag is currently never set.
+    // For a potential solution see: https://reviews.llvm.org/rL258665
+    // For now, we allow non-exported symbols on Windows as a workaround.
+    const bool ExportedSymbolsOnly = false;
+#else
+    const bool ExportedSymbolsOnly = true;
+#endif
+
     // Search modules in reverse order: from last added to first added.
     // This is the opposite of the usual search order for dlsym, but makes more
     // sense in a REPL where we want to bind to the newest available definition.
     for (auto H : make_range(ModuleHandles.rbegin(), ModuleHandles.rend()))
-      if (auto Sym = CompileLayer.findSymbolIn(H, Name, true))
+      if (auto Sym = CompileLayer.findSymbolIn(H, Name, ExportedSymbolsOnly))
         return Sym;
 
     // If we can't find the symbol in the JIT, try looking in the host process.
     if (auto SymAddr = RTDyldMemoryManager::getSymbolAddressInProcess(Name))
       return JITSymbol(SymAddr, JITSymbolFlags::Exported);
 
+#ifdef LLVM_ON_WIN32
+    // For Windows retry without "_" at beginning, as RTDyldMemoryManager uses
+    // GetProcAddress and standard libraries like msvcrt.dll use names
+    // with and without "_" (for example "_itoa" but "sin").
+    if (Name.length() > 2 && Name[0] == '_')
+      if (auto SymAddr =
+              RTDyldMemoryManager::getSymbolAddressInProcess(Name.substr(1)))
+        return JITSymbol(SymAddr, JITSymbolFlags::Exported);
+#endif
+
     return nullptr;
   }
 
diff --git a/examples/ParallelJIT/CMakeLists.txt b/examples/ParallelJIT/CMakeLists.txt
index e85b470f5036b8c5b9ade0b707e8b5cdd79475f0..deeee072b33caa3896a10b9550d1bb8ad1875979 100644
--- a/examples/ParallelJIT/CMakeLists.txt
+++ b/examples/ParallelJIT/CMakeLists.txt
@@ -11,4 +11,4 @@ add_llvm_example(ParallelJIT
   ParallelJIT.cpp
   )
 
-target_link_libraries(ParallelJIT ${PTHREAD_LIB})
+target_link_libraries(ParallelJIT ${LLVM_PTHREAD_LIB})
diff --git a/examples/ParallelJIT/ParallelJIT.cpp b/examples/ParallelJIT/ParallelJIT.cpp
index 6fb8bd61982b5a141d305c2827d6badc88e9fb5a..f1932d2471cb88dcf190c290d0df9a94732d4ba0 100644
--- a/examples/ParallelJIT/ParallelJIT.cpp
+++ b/examples/ParallelJIT/ParallelJIT.cpp
@@ -54,8 +54,7 @@ static Function* createAdd1(Module *M) {
   Function *Add1F =
     cast<Function>(M->getOrInsertFunction("add1",
                                           Type::getInt32Ty(M->getContext()),
-                                          Type::getInt32Ty(M->getContext()),
-                                          nullptr));
+                                          Type::getInt32Ty(M->getContext())));
 
   // Add a basic block to the function. As before, it automatically inserts
   // because of the last argument.
@@ -85,8 +84,7 @@ static Function *CreateFibFunction(Module *M) {
   Function *FibF = 
     cast<Function>(M->getOrInsertFunction("fib",
                                           Type::getInt32Ty(M->getContext()),
-                                          Type::getInt32Ty(M->getContext()),
-                                          nullptr));
+                                          Type::getInt32Ty(M->getContext())));
 
   // Add a basic block to the function.
   BasicBlock *BB = BasicBlock::Create(M->getContext(), "EntryBlock", FibF);
diff --git a/include/llvm-c/Transforms/Scalar.h b/include/llvm-c/Transforms/Scalar.h
index 8991e0904849cd84851eb538147acc4218638464..b9612b9cec0443dc1761b68096bd4e3bbea46654 100644
--- a/include/llvm-c/Transforms/Scalar.h
+++ b/include/llvm-c/Transforms/Scalar.h
@@ -44,6 +44,9 @@ void LLVMAddAlignmentFromAssumptionsPass(LLVMPassManagerRef PM);
 /** See llvm::createCFGSimplificationPass function. */
 void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM);
 
+/** See llvm::createLateCFGSimplificationPass function. */
+void LLVMAddLateCFGSimplificationPass(LLVMPassManagerRef PM);
+
 /** See llvm::createDeadStoreEliminationPass function. */
 void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM);
 
diff --git a/include/llvm-c/lto.h b/include/llvm-c/lto.h
index c3af74cdedabed52bdd45dd4fe03d334d9beedfc..8d45b783204172e8e43c4845c499fe1ea8124dbc 100644
--- a/include/llvm-c/lto.h
+++ b/include/llvm-c/lto.h
@@ -551,7 +551,7 @@ lto_codegen_set_should_embed_uselists(lto_code_gen_t cg,
                                       lto_bool_t ShouldEmbedUselists);
 
 /**
- * @}
+ * @} // endgroup LLVMCLTO
  * @defgroup LLVMCTLTO ThinLTO
  * @ingroup LLVMC
  *
@@ -668,75 +668,6 @@ const char *thinlto_module_get_object_file(thinlto_code_gen_t cg,
 extern lto_bool_t thinlto_codegen_set_pic_model(thinlto_code_gen_t cg,
                                                 lto_codegen_model);
 
-/**
- * @}
- * @defgroup LLVMCTLTO_CACHING ThinLTO Cache Control
- * @ingroup LLVMCTLTO
- *
- * These entry points control the ThinLTO cache. The cache is intended to
- * support incremental build, and thus needs to be persistent accross build.
- * The client enabled the cache by supplying a path to an existing directory.
- * The code generator will use this to store objects files that may be reused
- * during a subsequent build.
- * To avoid filling the disk space, a few knobs are provided:
- *  - The pruning interval limit the frequency at which the garbage collector
- *    will try to scan the cache directory to prune it from expired entries.
- *    Setting to -1 disable the pruning (default).
- *  - The pruning expiration time indicates to the garbage collector how old an
- *    entry needs to be to be removed.
- *  - Finally, the garbage collector can be instructed to prune the cache till
- *    the occupied space goes below a threshold.
- * @{
- */
-
-/**
- * Sets the path to a directory to use as a cache storage for incremental build.
- * Setting this activates caching.
- *
- * \since LTO_API_VERSION=18
- */
-extern void thinlto_codegen_set_cache_dir(thinlto_code_gen_t cg,
-                                          const char *cache_dir);
-
-/**
- * Sets the cache pruning interval (in seconds). A negative value disable the
- * pruning. An unspecified default value will be applied, and a value of 0 will
- * be ignored.
- *
- * \since LTO_API_VERSION=18
- */
-extern void thinlto_codegen_set_cache_pruning_interval(thinlto_code_gen_t cg,
-                                                       int interval);
-
-/**
- * Sets the maximum cache size that can be persistent across build, in terms of
- * percentage of the available space on the the disk. Set to 100 to indicate
- * no limit, 50 to indicate that the cache size will not be left over half the
- * available space. A value over 100 will be reduced to 100, a value of 0 will
- * be ignored. An unspecified default value will be applied.
- *
- * The formula looks like:
- *  AvailableSpace = FreeSpace + ExistingCacheSize
- *  NewCacheSize = AvailableSpace * P/100
- *
- * \since LTO_API_VERSION=18
- */
-extern void thinlto_codegen_set_final_cache_size_relative_to_available_space(
-    thinlto_code_gen_t cg, unsigned percentage);
-
-/**
- * Sets the expiration (in seconds) for an entry in the cache. An unspecified
- * default value will be applied. A value of 0 will be ignored.
- *
- * \since LTO_API_VERSION=18
- */
-extern void thinlto_codegen_set_cache_entry_expiration(thinlto_code_gen_t cg,
-                                                       unsigned expiration);
-
-/**
- * @}
- */
-
 /**
  * Sets the path to a directory to use as a storage for temporary bitcode files.
  * The intention is to make the bitcode files available for debugging at various
@@ -820,12 +751,77 @@ extern void thinlto_codegen_add_cross_referenced_symbol(thinlto_code_gen_t cg,
                                                         const char *name,
                                                         int length);
 
-#ifdef __cplusplus
-}
-#endif
+/**
+ * @} // endgroup LLVMCTLTO
+ * @defgroup LLVMCTLTO_CACHING ThinLTO Cache Control
+ * @ingroup LLVMCTLTO
+ *
+ * These entry points control the ThinLTO cache. The cache is intended to
+ * support incremental builds, and thus needs to be persistent across builds.
+ * The client enables the cache by supplying a path to an existing directory.
+ * The code generator will use this to store object files that may be reused
+ * during a subsequent build.
+ * To avoid filling the disk space, a few knobs are provided:
+ *  - The pruning interval limits the frequency at which the garbage collector
+ *    will try to scan the cache directory to prune it from expired entries.
+ *    Setting to -1 disables the pruning (default).
+ *  - The pruning expiration time indicates to the garbage collector how old an
+ *    entry needs to be to be removed.
+ *  - Finally, the garbage collector can be instructed to prune the cache till
+ *    the occupied space goes below a threshold.
+ * @{
+ */
+
+/**
+ * Sets the path to a directory to use as a cache storage for incremental build.
+ * Setting this activates caching.
+ *
+ * \since LTO_API_VERSION=18
+ */
+extern void thinlto_codegen_set_cache_dir(thinlto_code_gen_t cg,
+                                          const char *cache_dir);
+
+/**
+ * Sets the cache pruning interval (in seconds). A negative value disables the
+ * pruning. An unspecified default value will be applied, and a value of 0 will
+ * be ignored.
+ *
+ * \since LTO_API_VERSION=18
+ */
+extern void thinlto_codegen_set_cache_pruning_interval(thinlto_code_gen_t cg,
+                                                       int interval);
+
+/**
+ * Sets the maximum cache size that can be persistent across builds, in terms of
+ * percentage of the available space on the disk. Set to 100 to indicate
+ * no limit, 50 to indicate that the cache size will not be left over half the
+ * available space. A value over 100 will be reduced to 100, a value of 0 will
+ * be ignored. An unspecified default value will be applied.
+ *
+ * The formula looks like:
+ *  AvailableSpace = FreeSpace + ExistingCacheSize
+ *  NewCacheSize = AvailableSpace * P/100
+ *
+ * \since LTO_API_VERSION=18
+ */
+extern void thinlto_codegen_set_final_cache_size_relative_to_available_space(
+    thinlto_code_gen_t cg, unsigned percentage);
 
 /**
- * @}
+ * Sets the expiration (in seconds) for an entry in the cache. An unspecified
+ * default value will be applied. A value of 0 will be ignored.
+ *
+ * \since LTO_API_VERSION=18
  */
+extern void thinlto_codegen_set_cache_entry_expiration(thinlto_code_gen_t cg,
+                                                       unsigned expiration);
+
+/**
+ * @} // endgroup LLVMCTLTO_CACHING
+ */
+
+#ifdef __cplusplus
+}
+#endif
 
 #endif /* LLVM_C_LTO_H */
diff --git a/include/llvm/ADT/APFloat.h b/include/llvm/ADT/APFloat.h
index c9a39ae6b047f93a93e0e33806ccd3e713491fbe..e7e5036e69307b7ceb5d8afdab5662bde70093bf 100644
--- a/include/llvm/ADT/APFloat.h
+++ b/include/llvm/ADT/APFloat.h
@@ -18,6 +18,7 @@
 #define LLVM_ADT_APFLOAT_H
 
 #include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <memory>
 
@@ -139,22 +140,25 @@ enum lostFraction { // Example of truncated bits:
 // implementation classes. This struct should not define any non-static data
 // members.
 struct APFloatBase {
+  // TODO remove this and use APInt typedef directly.
+  typedef APInt::WordType integerPart;
+
   /// A signed type to represent a floating point numbers unbiased exponent.
   typedef signed short ExponentType;
 
   /// \name Floating Point Semantics.
   /// @{
 
-  static const fltSemantics &IEEEhalf();
-  static const fltSemantics &IEEEsingle();
-  static const fltSemantics &IEEEdouble();
-  static const fltSemantics &IEEEquad();
-  static const fltSemantics &PPCDoubleDouble();
-  static const fltSemantics &x87DoubleExtended();
+  static const fltSemantics &IEEEhalf() LLVM_READNONE;
+  static const fltSemantics &IEEEsingle() LLVM_READNONE;
+  static const fltSemantics &IEEEdouble() LLVM_READNONE;
+  static const fltSemantics &IEEEquad() LLVM_READNONE;
+  static const fltSemantics &PPCDoubleDouble() LLVM_READNONE;
+  static const fltSemantics &x87DoubleExtended() LLVM_READNONE;
 
   /// A Pseudo fltsemantic used to construct APFloats that cannot conflict with
   /// anything real.
-  static const fltSemantics &Bogus();
+  static const fltSemantics &Bogus() LLVM_READNONE;
 
   /// @}
 
@@ -273,8 +277,8 @@ public:
   /// @{
 
   opStatus convert(const fltSemantics &, roundingMode, bool *);
-  opStatus convertToInteger(integerPart *, unsigned int, bool, roundingMode,
-                            bool *) const;
+  opStatus convertToInteger(MutableArrayRef<integerPart>, unsigned int, bool,
+                            roundingMode, bool *) const;
   opStatus convertFromAPInt(const APInt &, bool, roundingMode);
   opStatus convertFromSignExtendedInteger(const integerPart *, unsigned int,
                                           bool, roundingMode);
@@ -361,7 +365,7 @@ public:
   /// Returns true if and only if the number has the largest possible finite
   /// magnitude in the current semantics.
   bool isLargest() const;
-  
+
   /// Returns true if and only if the number is an exact integer.
   bool isInteger() const;
 
@@ -495,8 +499,9 @@ private:
   opStatus addOrSubtract(const IEEEFloat &, roundingMode, bool subtract);
   opStatus handleOverflow(roundingMode);
   bool roundAwayFromZero(roundingMode, lostFraction, unsigned int) const;
-  opStatus convertToSignExtendedInteger(integerPart *, unsigned int, bool,
-                                        roundingMode, bool *) const;
+  opStatus convertToSignExtendedInteger(MutableArrayRef<integerPart>,
+                                        unsigned int, bool, roundingMode,
+                                        bool *) const;
   opStatus convertFromUnsignedParts(const integerPart *, unsigned int,
                                     roundingMode);
   opStatus convertFromHexadecimalString(StringRef, roundingMode);
@@ -625,8 +630,8 @@ public:
   opStatus convertFromString(StringRef, roundingMode);
   opStatus next(bool nextDown);
 
-  opStatus convertToInteger(integerPart *Input, unsigned int Width,
-                            bool IsSigned, roundingMode RM,
+  opStatus convertToInteger(MutableArrayRef<integerPart> Input,
+                            unsigned int Width, bool IsSigned, roundingMode RM,
                             bool *IsExact) const;
   opStatus convertFromAPInt(const APInt &Input, bool IsSigned, roundingMode RM);
   opStatus convertFromSignExtendedInteger(const integerPart *Input,
@@ -1055,8 +1060,8 @@ public:
 
   opStatus convert(const fltSemantics &ToSemantics, roundingMode RM,
                    bool *losesInfo);
-  opStatus convertToInteger(integerPart *Input, unsigned int Width,
-                            bool IsSigned, roundingMode RM,
+  opStatus convertToInteger(MutableArrayRef<integerPart> Input,
+                            unsigned int Width, bool IsSigned, roundingMode RM,
                             bool *IsExact) const {
     APFLOAT_DISPATCH_ON_SEMANTICS(
         convertToInteger(Input, Width, IsSigned, RM, IsExact));
diff --git a/include/llvm/ADT/APInt.h b/include/llvm/ADT/APInt.h
index 2c0713da256cdc2594cd55d83928a5c256864b2f..045df3c908756aa90498af34fda37c614a1cf71d 100644
--- a/include/llvm/ADT/APInt.h
+++ b/include/llvm/ADT/APInt.h
@@ -32,14 +32,6 @@ class raw_ostream;
 template <typename T> class SmallVectorImpl;
 template <typename T> class ArrayRef;
 
-// An unsigned host type used as a single part of a multi-part
-// bignum.
-typedef uint64_t integerPart;
-
-const unsigned int host_char_bit = 8;
-const unsigned int integerPartWidth =
-    host_char_bit * static_cast<unsigned int>(sizeof(integerPart));
-
 class APInt;
 
 inline APInt operator-(APInt);
@@ -75,6 +67,18 @@ inline APInt operator-(APInt);
 ///     uses in its IR. This simplifies its use for LLVM.
 ///
 class LLVM_NODISCARD APInt {
+public:
+  typedef uint64_t WordType;
+
+  /// This enum is used to hold the constants we needed for APInt.
+  enum : unsigned {
+    /// Byte size of a word.
+    APINT_WORD_SIZE = sizeof(WordType),
+    /// Bits in a word.
+    APINT_BITS_PER_WORD = APINT_WORD_SIZE * CHAR_BIT
+  };
+
+private:
   unsigned BitWidth; ///< The number of bits in this APInt.
 
   /// This union is used to store the integer value. When the
@@ -84,15 +88,6 @@ class LLVM_NODISCARD APInt {
     uint64_t *pVal; ///< Used to store the >64 bits integer value.
   };
 
-  /// This enum is used to hold the constants we needed for APInt.
-  enum {
-    /// Bits in a word
-    APINT_BITS_PER_WORD =
-        static_cast<unsigned int>(sizeof(uint64_t)) * CHAR_BIT,
-    /// Byte size of a word
-    APINT_WORD_SIZE = static_cast<unsigned int>(sizeof(uint64_t))
-  };
-
   friend struct DenseMapAPIntKeyInfo;
 
   /// \brief Fast internal constructor
@@ -147,7 +142,7 @@ class LLVM_NODISCARD APInt {
       return *this;
 
     // Mask out the high bits.
-    uint64_t mask = ~uint64_t(0ULL) >> (APINT_BITS_PER_WORD - wordBits);
+    uint64_t mask = UINT64_MAX >> (APINT_BITS_PER_WORD - wordBits);
     if (isSingleWord())
       VAL &= mask;
     else
@@ -196,32 +191,38 @@ class LLVM_NODISCARD APInt {
   /// out-of-line slow case for shl
   APInt shlSlowCase(unsigned shiftAmt) const;
 
-  /// out-of-line slow case for operator&
-  APInt AndSlowCase(const APInt &RHS) const;
-
-  /// out-of-line slow case for operator|
-  APInt OrSlowCase(const APInt &RHS) const;
-
-  /// out-of-line slow case for operator^
-  APInt XorSlowCase(const APInt &RHS) const;
-
   /// out-of-line slow case for operator=
   APInt &AssignSlowCase(const APInt &RHS);
 
   /// out-of-line slow case for operator==
-  bool EqualSlowCase(const APInt &RHS) const;
+  bool EqualSlowCase(const APInt &RHS) const LLVM_READONLY;
 
   /// out-of-line slow case for operator==
-  bool EqualSlowCase(uint64_t Val) const;
+  bool EqualSlowCase(uint64_t Val) const LLVM_READONLY;
 
   /// out-of-line slow case for countLeadingZeros
-  unsigned countLeadingZerosSlowCase() const;
+  unsigned countLeadingZerosSlowCase() const LLVM_READONLY;
 
   /// out-of-line slow case for countTrailingOnes
-  unsigned countTrailingOnesSlowCase() const;
+  unsigned countTrailingOnesSlowCase() const LLVM_READONLY;
 
   /// out-of-line slow case for countPopulation
-  unsigned countPopulationSlowCase() const;
+  unsigned countPopulationSlowCase() const LLVM_READONLY;
+
+  /// out-of-line slow case for setBits.
+  void setBitsSlowCase(unsigned loBit, unsigned hiBit);
+
+  /// out-of-line slow case for flipAllBits.
+  void flipAllBitsSlowCase();
+
+  /// out-of-line slow case for operator&=.
+  APInt& AndAssignSlowCase(const APInt& RHS);
+
+  /// out-of-line slow case for operator|=.
+  APInt& OrAssignSlowCase(const APInt& RHS);
+
+  /// out-of-line slow case for operator^=.
+  APInt& XorAssignSlowCase(const APInt& RHS);
 
 public:
   /// \name Constructors
@@ -238,13 +239,14 @@ public:
   /// \param val the initial value of the APInt
   /// \param isSigned how to treat signedness of val
   APInt(unsigned numBits, uint64_t val, bool isSigned = false)
-      : BitWidth(numBits), VAL(0) {
+      : BitWidth(numBits) {
     assert(BitWidth && "bitwidth too small");
-    if (isSingleWord())
+    if (isSingleWord()) {
       VAL = val;
-    else
+      clearUnusedBits();
+    } else {
       initSlowCase(val, isSigned);
-    clearUnusedBits();
+    }
   }
 
   /// \brief Construct an APInt of numBits width, initialized as bigVal[].
@@ -280,7 +282,7 @@ public:
 
   /// Simply makes *this a copy of that.
   /// @brief Copy Constructor.
-  APInt(const APInt &that) : BitWidth(that.BitWidth), VAL(0) {
+  APInt(const APInt &that) : BitWidth(that.BitWidth) {
     if (isSingleWord())
       VAL = that.VAL;
     else
@@ -341,7 +343,7 @@ public:
   /// This checks to see if the value has all bits of the APInt are set or not.
   bool isAllOnesValue() const {
     if (isSingleWord())
-      return VAL == ~integerPart(0) >> (APINT_BITS_PER_WORD - BitWidth);
+      return VAL == UINT64_MAX >> (APINT_BITS_PER_WORD - BitWidth);
     return countPopulationSlowCase() == BitWidth;
   }
 
@@ -406,7 +408,7 @@ public:
 
   /// If this value is smaller than the specified limit, return it, otherwise
   /// return the limit value.  This causes the value to saturate to the limit.
-  uint64_t getLimitedValue(uint64_t Limit = ~0ULL) const {
+  uint64_t getLimitedValue(uint64_t Limit = UINT64_MAX) const {
     return (getActiveBits() > 64 || getZExtValue() > Limit) ? Limit
                                                             : getZExtValue();
   }
@@ -418,6 +420,36 @@ public:
   /// width without remainder.
   bool isSplat(unsigned SplatSizeInBits) const;
 
+  /// \returns true if this APInt value is a sequence of \p numBits ones
+  /// starting at the least significant bit with the remainder zero.
+  bool isMask(unsigned numBits) const {
+    assert(numBits != 0 && "numBits must be non-zero");
+    assert(numBits <= BitWidth && "numBits out of range");
+    if (isSingleWord())
+      return VAL == (UINT64_MAX >> (APINT_BITS_PER_WORD - numBits));
+    unsigned Ones = countTrailingOnes();
+    return (numBits == Ones) && ((Ones + countLeadingZeros()) == BitWidth);
+  }
+
+  /// \returns true if this APInt is a non-empty sequence of ones starting at
+  /// the least significant bit with the remainder zero.
+  /// Ex. isMask(0x0000FFFFU) == true.
+  bool isMask() const {
+    if (isSingleWord())
+      return isMask_64(VAL);
+    unsigned Ones = countTrailingOnes();
+    return (Ones > 0) && ((Ones + countLeadingZeros()) == BitWidth);
+  }
+
+  /// \brief Return true if this APInt value contains a sequence of ones with
+  /// the remainder zero.
+  bool isShiftedMask() const {
+    if (isSingleWord())
+      return isShiftedMask_64(VAL);
+    unsigned Ones = countPopulation();
+    return (Ones + countTrailingZeros() + countLeadingZeros()) == BitWidth;
+  }
+
   /// @}
   /// \name Value Generators
   /// @{
@@ -501,12 +533,26 @@ public:
   ///
   /// \returns An APInt value with the requested bits set.
   static APInt getBitsSet(unsigned numBits, unsigned loBit, unsigned hiBit) {
-    assert(hiBit <= numBits && "hiBit out of range");
-    assert(loBit < numBits && "loBit out of range");
-    if (hiBit < loBit)
-      return getLowBitsSet(numBits, hiBit) |
-             getHighBitsSet(numBits, numBits - loBit);
-    return getLowBitsSet(numBits, hiBit - loBit).shl(loBit);
+    APInt Res(numBits, 0);
+    Res.setBits(loBit, hiBit);
+    return Res;
+  }
+
+  /// \brief Get a value with upper bits starting at loBit set.
+  ///
+  /// Constructs an APInt value that has a contiguous range of bits set. The
+  /// bits from loBit (inclusive) to numBits (exclusive) will be set. All other
+  /// bits will be zero. For example, with parameters(32, 12) you would get
+  /// 0xFFFFF000.
+  ///
+  /// \param numBits the intended bit width of the result
+  /// \param loBit the index of the lowest bit to set.
+  ///
+  /// \returns An APInt value with the requested bits set.
+  static APInt getBitsSetFrom(unsigned numBits, unsigned loBit) {
+    APInt Res(numBits, 0);
+    Res.setBitsFrom(loBit);
+    return Res;
   }
 
   /// \brief Get a value with high bits set
@@ -516,15 +562,9 @@ public:
   /// \param numBits the bitwidth of the result
   /// \param hiBitsSet the number of high-order bits set in the result.
   static APInt getHighBitsSet(unsigned numBits, unsigned hiBitsSet) {
-    assert(hiBitsSet <= numBits && "Too many bits to set!");
-    // Handle a degenerate case, to avoid shifting by word size
-    if (hiBitsSet == 0)
-      return APInt(numBits, 0);
-    unsigned shiftAmt = numBits - hiBitsSet;
-    // For small values, return quickly
-    if (numBits <= APINT_BITS_PER_WORD)
-      return APInt(numBits, ~0ULL << shiftAmt);
-    return getAllOnesValue(numBits).shl(shiftAmt);
+    APInt Res(numBits, 0);
+    Res.setHighBits(hiBitsSet);
+    return Res;
   }
 
   /// \brief Get a value with low bits set
@@ -534,16 +574,9 @@ public:
   /// \param numBits the bitwidth of the result
   /// \param loBitsSet the number of low-order bits set in the result.
   static APInt getLowBitsSet(unsigned numBits, unsigned loBitsSet) {
-    assert(loBitsSet <= numBits && "Too many bits to set!");
-    // Handle a degenerate case, to avoid shifting by word size
-    if (loBitsSet == 0)
-      return APInt(numBits, 0);
-    if (loBitsSet == APINT_BITS_PER_WORD)
-      return APInt(numBits, UINT64_MAX);
-    // For small values, return quickly.
-    if (loBitsSet <= APINT_BITS_PER_WORD)
-      return APInt(numBits, UINT64_MAX >> (APINT_BITS_PER_WORD - loBitsSet));
-    return getAllOnesValue(numBits).lshr(numBits - loBitsSet);
+    APInt Res(numBits, 0);
+    Res.setLowBits(loBitsSet);
+    return Res;
   }
 
   /// \brief Return a value containing V broadcasted over NewLen bits.
@@ -613,30 +646,13 @@ public:
   /// \returns *this decremented by one.
   APInt &operator--();
 
-  /// \brief Unary bitwise complement operator.
-  ///
-  /// Performs a bitwise complement operation on this APInt.
-  ///
-  /// \returns an APInt that is the bitwise complement of *this
-  APInt operator~() const {
-    APInt Result(*this);
-    Result.flipAllBits();
-    return Result;
-  }
-
   /// \brief Logical negation operator.
   ///
   /// Performs logical negation operation on this APInt.
   ///
   /// \returns true if *this is zero, false otherwise.
   bool operator!() const {
-    if (isSingleWord())
-      return !VAL;
-
-    for (unsigned i = 0; i != getNumWords(); ++i)
-      if (pVal[i])
-        return false;
-    return true;
+    return *this == 0;
   }
 
   /// @}
@@ -688,7 +704,16 @@ public:
   /// than 64, the value is zero filled in the unspecified high order bits.
   ///
   /// \returns *this after assignment of RHS value.
-  APInt &operator=(uint64_t RHS);
+  APInt &operator=(uint64_t RHS) {
+    if (isSingleWord()) {
+      VAL = RHS;
+      clearUnusedBits();
+    } else {
+      pVal[0] = RHS;
+      memset(pVal+1, 0, (getNumWords() - 1) * APINT_WORD_SIZE);
+    }
+    return *this;
+  }
 
   /// \brief Bitwise AND assignment operator.
   ///
@@ -696,7 +721,29 @@ public:
   /// assigned to *this.
   ///
   /// \returns *this after ANDing with RHS.
-  APInt &operator&=(const APInt &RHS);
+  APInt &operator&=(const APInt &RHS) {
+    assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+    if (isSingleWord()) {
+      VAL &= RHS.VAL;
+      return *this;
+    }
+    return AndAssignSlowCase(RHS);
+  }
+
+  /// \brief Bitwise AND assignment operator.
+  ///
+  /// Performs a bitwise AND operation on this APInt and RHS. RHS is
+  /// logically zero-extended or truncated to match the bit-width of
+  /// the LHS.
+  APInt &operator&=(uint64_t RHS) {
+    if (isSingleWord()) {
+      VAL &= RHS;
+      return *this;
+    }
+    pVal[0] &= RHS;
+    memset(pVal+1, 0, (getNumWords() - 1) * APINT_WORD_SIZE);
+    return *this;
+  }
 
   /// \brief Bitwise OR assignment operator.
   ///
@@ -704,7 +751,14 @@ public:
   /// assigned *this;
   ///
   /// \returns *this after ORing with RHS.
-  APInt &operator|=(const APInt &RHS);
+  APInt &operator|=(const APInt &RHS) {
+    assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+    if (isSingleWord()) {
+      VAL |= RHS.VAL;
+      return *this;
+    }
+    return OrAssignSlowCase(RHS);
+  }
 
   /// \brief Bitwise OR assignment operator.
   ///
@@ -727,7 +781,29 @@ public:
   /// assigned to *this.
   ///
   /// \returns *this after XORing with RHS.
-  APInt &operator^=(const APInt &RHS);
+  APInt &operator^=(const APInt &RHS) {
+    assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
+    if (isSingleWord()) {
+      VAL ^= RHS.VAL;
+      return *this;
+    }
+    return XorAssignSlowCase(RHS);
+  }
+
+  /// \brief Bitwise XOR assignment operator.
+  ///
+  /// Performs a bitwise XOR operation on this APInt and RHS. RHS is
+  /// logically zero-extended or truncated to match the bit-width of
+  /// the LHS.
+  APInt &operator^=(uint64_t RHS) {
+    if (isSingleWord()) {
+      VAL ^= RHS;
+      clearUnusedBits();
+    } else {
+      pVal[0] ^= RHS;
+    }
+    return *this;
+  }
 
   /// \brief Multiplication assignment operator.
   ///
@@ -766,59 +842,6 @@ public:
   /// \name Binary Operators
   /// @{
 
-  /// \brief Bitwise AND operator.
-  ///
-  /// Performs a bitwise AND operation on *this and RHS.
-  ///
-  /// \returns An APInt value representing the bitwise AND of *this and RHS.
-  APInt operator&(const APInt &RHS) const {
-    assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
-    if (isSingleWord())
-      return APInt(getBitWidth(), VAL & RHS.VAL);
-    return AndSlowCase(RHS);
-  }
-  APInt And(const APInt &RHS) const { return this->operator&(RHS); }
-
-  /// \brief Bitwise OR operator.
-  ///
-  /// Performs a bitwise OR operation on *this and RHS.
-  ///
-  /// \returns An APInt value representing the bitwise OR of *this and RHS.
-  APInt operator|(const APInt &RHS) const {
-    assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
-    if (isSingleWord())
-      return APInt(getBitWidth(), VAL | RHS.VAL);
-    return OrSlowCase(RHS);
-  }
-
-  /// \brief Bitwise OR function.
-  ///
-  /// Performs a bitwise or on *this and RHS. This is implemented by simply
-  /// calling operator|.
-  ///
-  /// \returns An APInt value representing the bitwise OR of *this and RHS.
-  APInt Or(const APInt &RHS) const { return this->operator|(RHS); }
-
-  /// \brief Bitwise XOR operator.
-  ///
-  /// Performs a bitwise XOR operation on *this and RHS.
-  ///
-  /// \returns An APInt value representing the bitwise XOR of *this and RHS.
-  APInt operator^(const APInt &RHS) const {
-    assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
-    if (isSingleWord())
-      return APInt(BitWidth, VAL ^ RHS.VAL);
-    return XorSlowCase(RHS);
-  }
-
-  /// \brief Bitwise XOR function.
-  ///
-  /// Performs a bitwise XOR operation on *this and RHS. This is implemented
-  /// through the usage of operator^.
-  ///
-  /// \returns An APInt value representing the bitwise XOR of *this and RHS.
-  APInt Xor(const APInt &RHS) const { return this->operator^(RHS); }
-
   /// \brief Multiplication operator.
   ///
   /// Multiplies this APInt by RHS and returns the result.
@@ -1012,7 +1035,7 @@ public:
   /// the validity of the less-than relationship.
   ///
   /// \returns true if *this < RHS when both are considered unsigned.
-  bool ult(const APInt &RHS) const;
+  bool ult(const APInt &RHS) const LLVM_READONLY;
 
   /// \brief Unsigned less than comparison
   ///
@@ -1030,7 +1053,7 @@ public:
   /// validity of the less-than relationship.
   ///
   /// \returns true if *this < RHS when both are considered signed.
-  bool slt(const APInt &RHS) const;
+  bool slt(const APInt &RHS) const LLVM_READONLY;
 
   /// \brief Signed less than comparison
   ///
@@ -1144,7 +1167,11 @@ public:
 
   /// This operation tests if there are any pairs of corresponding bits
   /// between this APInt and RHS that are both set.
-  bool intersects(const APInt &RHS) const { return (*this & RHS) != 0; }
+  bool intersects(const APInt &RHS) const {
+    APInt temp(*this);
+    temp &= RHS;
+    return temp != 0;
+  }
 
   /// @}
   /// \name Resizing Operators
@@ -1203,11 +1230,9 @@ public:
   void setAllBits() {
     if (isSingleWord())
       VAL = UINT64_MAX;
-    else {
+    else
       // Set all the bits in all the words.
-      for (unsigned i = 0; i < getNumWords(); ++i)
-        pVal[i] = UINT64_MAX;
-    }
+      memset(pVal, -1, getNumWords() * APINT_WORD_SIZE);
     // Clear the unused ones
     clearUnusedBits();
   }
@@ -1217,6 +1242,49 @@ public:
   /// Set the given bit to 1 whose position is given as "bitPosition".
   void setBit(unsigned bitPosition);
 
+  /// Set the sign bit to 1.
+  void setSignBit() {
+    setBit(BitWidth - 1);
+  }
+
+  /// Set the bits from loBit (inclusive) to hiBit (exclusive) to 1.
+  void setBits(unsigned loBit, unsigned hiBit) {
+    assert(hiBit <= BitWidth && "hiBit out of range");
+    assert(loBit <= BitWidth && "loBit out of range");
+    if (loBit == hiBit)
+      return;
+    if (loBit > hiBit) {
+      setLowBits(hiBit);
+      setHighBits(BitWidth - loBit);
+      return;
+    }
+    if (loBit < APINT_BITS_PER_WORD && hiBit <= APINT_BITS_PER_WORD) {
+      uint64_t mask = UINT64_MAX >> (APINT_BITS_PER_WORD - (hiBit - loBit));
+      mask <<= loBit;
+      if (isSingleWord())
+        VAL |= mask;
+      else
+        pVal[0] |= mask;
+    } else {
+      setBitsSlowCase(loBit, hiBit);
+    }
+  }
+
+  /// Set the top bits starting from loBit.
+  void setBitsFrom(unsigned loBit) {
+    return setBits(loBit, BitWidth);
+  }
+
+  /// Set the bottom loBits bits.
+  void setLowBits(unsigned loBits) {
+    return setBits(0, loBits);
+  }
+
+  /// Set the top hiBits bits.
+  void setHighBits(unsigned hiBits) {
+    return setBits(BitWidth - hiBits, BitWidth);
+  }
+
   /// \brief Set every bit to 0.
   void clearAllBits() {
     if (isSingleWord())
@@ -1232,13 +1300,12 @@ public:
 
   /// \brief Toggle every bit to its opposite value.
   void flipAllBits() {
-    if (isSingleWord())
+    if (isSingleWord()) {
       VAL ^= UINT64_MAX;
-    else {
-      for (unsigned i = 0; i < getNumWords(); ++i)
-        pVal[i] ^= UINT64_MAX;
+      clearUnusedBits();
+    } else {
+      flipAllBitsSlowCase();
     }
-    clearUnusedBits();
   }
 
   /// \brief Toggles a given bit to its opposite value.
@@ -1247,6 +1314,12 @@ public:
   /// as "bitPosition".
   void flipBit(unsigned bitPosition);
 
+  /// Insert the bits from a smaller APInt starting at bitPosition.
+  void insertBits(const APInt &SubBits, unsigned bitPosition);
+
+  /// Return an APInt with the extracted bits [bitPosition,bitPosition+numBits).
+  APInt extractBits(unsigned numBits, unsigned bitPosition) const;
+
   /// @}
   /// \name Value Characterization Functions
   /// @{
@@ -1356,7 +1429,7 @@ public:
   ///
   /// \returns 0 if the high order bit is not set, otherwise returns the number
   /// of 1 bits from the most significant to the least
-  unsigned countLeadingOnes() const;
+  unsigned countLeadingOnes() const LLVM_READONLY;
 
   /// Computes the number of leading bits of this APInt that are equal to its
   /// sign bit.
@@ -1372,7 +1445,7 @@ public:
   ///
   /// \returns BitWidth if the value is zero, otherwise returns the number of
   /// zeros from the least significant bit to the first one bit.
-  unsigned countTrailingZeros() const;
+  unsigned countTrailingZeros() const LLVM_READONLY;
 
   /// \brief Count the number of trailing one bits.
   ///
@@ -1589,46 +1662,46 @@ public:
 
   /// Sets the least significant part of a bignum to the input value, and zeroes
   /// out higher parts.
-  static void tcSet(integerPart *, integerPart, unsigned int);
+  static void tcSet(WordType *, WordType, unsigned);
 
   /// Assign one bignum to another.
-  static void tcAssign(integerPart *, const integerPart *, unsigned int);
+  static void tcAssign(WordType *, const WordType *, unsigned);
 
   /// Returns true if a bignum is zero, false otherwise.
-  static bool tcIsZero(const integerPart *, unsigned int);
+  static bool tcIsZero(const WordType *, unsigned);
 
   /// Extract the given bit of a bignum; returns 0 or 1.  Zero-based.
-  static int tcExtractBit(const integerPart *, unsigned int bit);
+  static int tcExtractBit(const WordType *, unsigned bit);
 
   /// Copy the bit vector of width srcBITS from SRC, starting at bit srcLSB, to
   /// DST, of dstCOUNT parts, such that the bit srcLSB becomes the least
   /// significant bit of DST.  All high bits above srcBITS in DST are
   /// zero-filled.
-  static void tcExtract(integerPart *, unsigned int dstCount,
-                        const integerPart *, unsigned int srcBits,
-                        unsigned int srcLSB);
+  static void tcExtract(WordType *, unsigned dstCount,
+                        const WordType *, unsigned srcBits,
+                        unsigned srcLSB);
 
   /// Set the given bit of a bignum.  Zero-based.
-  static void tcSetBit(integerPart *, unsigned int bit);
+  static void tcSetBit(WordType *, unsigned bit);
 
   /// Clear the given bit of a bignum.  Zero-based.
-  static void tcClearBit(integerPart *, unsigned int bit);
+  static void tcClearBit(WordType *, unsigned bit);
 
   /// Returns the bit number of the least or most significant set bit of a
   /// number.  If the input number has no bits set -1U is returned.
-  static unsigned int tcLSB(const integerPart *, unsigned int);
-  static unsigned int tcMSB(const integerPart *parts, unsigned int n);
+  static unsigned tcLSB(const WordType *, unsigned n);
+  static unsigned tcMSB(const WordType *parts, unsigned n);
 
   /// Negate a bignum in-place.
-  static void tcNegate(integerPart *, unsigned int);
+  static void tcNegate(WordType *, unsigned);
 
   /// DST += RHS + CARRY where CARRY is zero or one.  Returns the carry flag.
-  static integerPart tcAdd(integerPart *, const integerPart *,
-                           integerPart carry, unsigned);
+  static WordType tcAdd(WordType *, const WordType *,
+                        WordType carry, unsigned);
 
   /// DST -= RHS + CARRY where CARRY is zero or one. Returns the carry flag.
-  static integerPart tcSubtract(integerPart *, const integerPart *,
-                                integerPart carry, unsigned);
+  static WordType tcSubtract(WordType *, const WordType *,
+                             WordType carry, unsigned);
 
   /// DST += SRC * MULTIPLIER + PART   if add is true
   /// DST  = SRC * MULTIPLIER + PART   if add is false
@@ -1640,23 +1713,23 @@ public:
   /// Otherwise DST is filled with the least significant DSTPARTS parts of the
   /// result, and if all of the omitted higher parts were zero return zero,
   /// otherwise overflow occurred and return one.
-  static int tcMultiplyPart(integerPart *dst, const integerPart *src,
-                            integerPart multiplier, integerPart carry,
-                            unsigned int srcParts, unsigned int dstParts,
+  static int tcMultiplyPart(WordType *dst, const WordType *src,
+                            WordType multiplier, WordType carry,
+                            unsigned srcParts, unsigned dstParts,
                             bool add);
 
   /// DST = LHS * RHS, where DST has the same width as the operands and is
   /// filled with the least significant parts of the result.  Returns one if
   /// overflow occurred, otherwise zero.  DST must be disjoint from both
   /// operands.
-  static int tcMultiply(integerPart *, const integerPart *, const integerPart *,
+  static int tcMultiply(WordType *, const WordType *, const WordType *,
                         unsigned);
 
   /// DST = LHS * RHS, where DST has width the sum of the widths of the
   /// operands.  No overflow occurs.  DST must be disjoint from both
   /// operands. Returns the number of parts required to hold the result.
-  static unsigned int tcFullMultiply(integerPart *, const integerPart *,
-                                     const integerPart *, unsigned, unsigned);
+  static unsigned tcFullMultiply(WordType *, const WordType *,
+                                 const WordType *, unsigned, unsigned);
 
   /// If RHS is zero LHS and REMAINDER are left unchanged, return one.
   /// Otherwise set LHS to LHS / RHS with the fractional part discarded, set
@@ -1667,38 +1740,35 @@ public:
   /// SCRATCH is a bignum of the same size as the operands and result for use by
   /// the routine; its contents need not be initialized and are destroyed.  LHS,
   /// REMAINDER and SCRATCH must be distinct.
-  static int tcDivide(integerPart *lhs, const integerPart *rhs,
-                      integerPart *remainder, integerPart *scratch,
-                      unsigned int parts);
+  static int tcDivide(WordType *lhs, const WordType *rhs,
+                      WordType *remainder, WordType *scratch,
+                      unsigned parts);
 
   /// Shift a bignum left COUNT bits.  Shifted in bits are zero.  There are no
   /// restrictions on COUNT.
-  static void tcShiftLeft(integerPart *, unsigned int parts,
-                          unsigned int count);
+  static void tcShiftLeft(WordType *, unsigned parts, unsigned count);
 
   /// Shift a bignum right COUNT bits.  Shifted in bits are zero.  There are no
   /// restrictions on COUNT.
-  static void tcShiftRight(integerPart *, unsigned int parts,
-                           unsigned int count);
+  static void tcShiftRight(WordType *, unsigned parts, unsigned count);
 
   /// The obvious AND, OR and XOR and complement operations.
-  static void tcAnd(integerPart *, const integerPart *, unsigned int);
-  static void tcOr(integerPart *, const integerPart *, unsigned int);
-  static void tcXor(integerPart *, const integerPart *, unsigned int);
-  static void tcComplement(integerPart *, unsigned int);
+  static void tcAnd(WordType *, const WordType *, unsigned);
+  static void tcOr(WordType *, const WordType *, unsigned);
+  static void tcXor(WordType *, const WordType *, unsigned);
+  static void tcComplement(WordType *, unsigned);
 
   /// Comparison (unsigned) of two bignums.
-  static int tcCompare(const integerPart *, const integerPart *, unsigned int);
+  static int tcCompare(const WordType *, const WordType *, unsigned);
 
   /// Increment a bignum in-place.  Return the carry flag.
-  static integerPart tcIncrement(integerPart *, unsigned int);
+  static WordType tcIncrement(WordType *, unsigned);
 
   /// Decrement a bignum in-place.  Return the borrow flag.
-  static integerPart tcDecrement(integerPart *, unsigned int);
+  static WordType tcDecrement(WordType *, unsigned);
 
   /// Set the least significant BITS and clear the rest.
-  static void tcSetLeastSignificantBits(integerPart *, unsigned int,
-                                        unsigned int bits);
+  static void tcSetLeastSignificantBits(WordType *, unsigned, unsigned bits);
 
   /// \brief debug method
   void dump() const;
@@ -1723,6 +1793,74 @@ inline bool operator==(uint64_t V1, const APInt &V2) { return V2 == V1; }
 
 inline bool operator!=(uint64_t V1, const APInt &V2) { return V2 != V1; }
 
+/// \brief Unary bitwise complement operator.
+///
+/// \returns an APInt that is the bitwise complement of \p v.
+inline APInt operator~(APInt v) {
+  v.flipAllBits();
+  return v;
+}
+
+inline APInt operator&(APInt a, const APInt &b) {
+  a &= b;
+  return a;
+}
+
+inline APInt operator&(const APInt &a, APInt &&b) {
+  b &= a;
+  return std::move(b);
+}
+
+inline APInt operator&(APInt a, uint64_t RHS) {
+  a &= RHS;
+  return a;
+}
+
+inline APInt operator&(uint64_t LHS, APInt b) {
+  b &= LHS;
+  return b;
+}
+
+inline APInt operator|(APInt a, const APInt &b) {
+  a |= b;
+  return a;
+}
+
+inline APInt operator|(const APInt &a, APInt &&b) {
+  b |= a;
+  return std::move(b);
+}
+
+inline APInt operator|(APInt a, uint64_t RHS) {
+  a |= RHS;
+  return a;
+}
+
+inline APInt operator|(uint64_t LHS, APInt b) {
+  b |= LHS;
+  return b;
+}
+
+inline APInt operator^(APInt a, const APInt &b) {
+  a ^= b;
+  return a;
+}
+
+inline APInt operator^(const APInt &a, APInt &&b) {
+  b ^= a;
+  return std::move(b);
+}
+
+inline APInt operator^(APInt a, uint64_t RHS) {
+  a ^= RHS;
+  return a;
+}
+
+inline APInt operator^(uint64_t LHS, APInt b) {
+  b ^= LHS;
+  return b;
+}
+
 inline raw_ostream &operator<<(raw_ostream &OS, const APInt &I) {
   I.print(OS, true);
   return OS;
@@ -1799,47 +1937,13 @@ inline const APInt &umax(const APInt &A, const APInt &B) {
   return A.ugt(B) ? A : B;
 }
 
-/// \brief Check if the specified APInt has a N-bits unsigned integer value.
-inline bool isIntN(unsigned N, const APInt &APIVal) { return APIVal.isIntN(N); }
-
-/// \brief Check if the specified APInt has a N-bits signed integer value.
-inline bool isSignedIntN(unsigned N, const APInt &APIVal) {
-  return APIVal.isSignedIntN(N);
-}
-
-/// \returns true if the argument APInt value is a sequence of ones starting at
-/// the least significant bit with the remainder zero.
-inline bool isMask(unsigned numBits, const APInt &APIVal) {
-  return numBits <= APIVal.getBitWidth() &&
-         APIVal == APInt::getLowBitsSet(APIVal.getBitWidth(), numBits);
-}
-
-/// \returns true if the argument is a non-empty sequence of ones starting at
-/// the least significant bit with the remainder zero (32 bit version).
-/// Ex. isMask(0x0000FFFFU) == true.
-inline bool isMask(const APInt &Value) {
-  return (Value != 0) && ((Value + 1) & Value) == 0;
-}
-
-/// \brief Return true if the argument APInt value contains a sequence of ones
-/// with the remainder zero.
-inline bool isShiftedMask(unsigned numBits, const APInt &APIVal) {
-  return isMask(numBits, (APIVal - APInt(numBits, 1)) | APIVal);
-}
-
-/// \brief Returns a byte-swapped representation of the specified APInt Value.
-inline APInt byteSwap(const APInt &APIVal) { return APIVal.byteSwap(); }
-
-/// \brief Returns the floor log base 2 of the specified APInt value.
-inline unsigned logBase2(const APInt &APIVal) { return APIVal.logBase2(); }
-
 /// \brief Compute GCD of two APInt values.
 ///
 /// This function returns the greatest common divisor of the two APInt values
 /// using Euclid's algorithm.
 ///
-/// \returns the greatest common divisor of Val1 and Val2
-APInt GreatestCommonDivisor(const APInt &Val1, const APInt &Val2);
+/// \returns the greatest common divisor of A and B.
+APInt GreatestCommonDivisor(APInt A, APInt B);
 
 /// \brief Converts the given APInt to a double value.
 ///
@@ -1879,83 +1983,6 @@ inline APInt RoundFloatToAPInt(float Float, unsigned width) {
   return RoundDoubleToAPInt(double(Float), width);
 }
 
-/// \brief Arithmetic right-shift function.
-///
-/// Arithmetic right-shift the APInt by shiftAmt.
-inline APInt ashr(const APInt &LHS, unsigned shiftAmt) {
-  return LHS.ashr(shiftAmt);
-}
-
-/// \brief Logical right-shift function.
-///
-/// Logical right-shift the APInt by shiftAmt.
-inline APInt lshr(const APInt &LHS, unsigned shiftAmt) {
-  return LHS.lshr(shiftAmt);
-}
-
-/// \brief Left-shift function.
-///
-/// Left-shift the APInt by shiftAmt.
-inline APInt shl(const APInt &LHS, unsigned shiftAmt) {
-  return LHS.shl(shiftAmt);
-}
-
-/// \brief Signed division function for APInt.
-///
-/// Signed divide APInt LHS by APInt RHS.
-inline APInt sdiv(const APInt &LHS, const APInt &RHS) { return LHS.sdiv(RHS); }
-
-/// \brief Unsigned division function for APInt.
-///
-/// Unsigned divide APInt LHS by APInt RHS.
-inline APInt udiv(const APInt &LHS, const APInt &RHS) { return LHS.udiv(RHS); }
-
-/// \brief Function for signed remainder operation.
-///
-/// Signed remainder operation on APInt.
-inline APInt srem(const APInt &LHS, const APInt &RHS) { return LHS.srem(RHS); }
-
-/// \brief Function for unsigned remainder operation.
-///
-/// Unsigned remainder operation on APInt.
-inline APInt urem(const APInt &LHS, const APInt &RHS) { return LHS.urem(RHS); }
-
-/// \brief Function for multiplication operation.
-///
-/// Performs multiplication on APInt values.
-inline APInt mul(const APInt &LHS, const APInt &RHS) { return LHS * RHS; }
-
-/// \brief Function for addition operation.
-///
-/// Performs addition on APInt values.
-inline APInt add(const APInt &LHS, const APInt &RHS) { return LHS + RHS; }
-
-/// \brief Function for subtraction operation.
-///
-/// Performs subtraction on APInt values.
-inline APInt sub(const APInt &LHS, const APInt &RHS) { return LHS - RHS; }
-
-/// \brief Bitwise AND function for APInt.
-///
-/// Performs bitwise AND operation on APInt LHS and
-/// APInt RHS.
-inline APInt And(const APInt &LHS, const APInt &RHS) { return LHS & RHS; }
-
-/// \brief Bitwise OR function for APInt.
-///
-/// Performs bitwise OR operation on APInt LHS and APInt RHS.
-inline APInt Or(const APInt &LHS, const APInt &RHS) { return LHS | RHS; }
-
-/// \brief Bitwise XOR function for APInt.
-///
-/// Performs bitwise XOR operation on APInt.
-inline APInt Xor(const APInt &LHS, const APInt &RHS) { return LHS ^ RHS; }
-
-/// \brief Bitwise complement function.
-///
-/// Performs a bitwise complement operation on APInt.
-inline APInt Not(const APInt &APIVal) { return ~APIVal; }
-
 } // End of APIntOps namespace
 
 // See friend declaration above. This additional declaration is required in
diff --git a/include/llvm/ADT/ArrayRef.h b/include/llvm/ADT/ArrayRef.h
index b3fe31f4a806dcdc05aa58606fd2485112c5910c..6b35d0aec8b2b2f90756ac5177bd464843515870 100644
--- a/include/llvm/ADT/ArrayRef.h
+++ b/include/llvm/ADT/ArrayRef.h
@@ -487,6 +487,18 @@ namespace llvm {
     return ArrayRef<T>(Arr);
   }
 
+  /// Construct a MutableArrayRef from a single element.
+  template<typename T>
+  MutableArrayRef<T> makeMutableArrayRef(T &OneElt) {
+    return OneElt;
+  }
+
+  /// Construct a MutableArrayRef from a pointer and length.
+  template<typename T>
+  MutableArrayRef<T> makeMutableArrayRef(T *data, size_t length) {
+    return MutableArrayRef<T>(data, length);
+  }
+
   /// @}
   /// @name ArrayRef Comparison Operators
   /// @{
diff --git a/include/llvm/ADT/BitVector.h b/include/llvm/ADT/BitVector.h
index cb318199ec778b852c4eb27a62e93a67cd66923d..8240d01ae977c7dc75179f997cf5eadb954ca590 100644
--- a/include/llvm/ADT/BitVector.h
+++ b/include/llvm/ADT/BitVector.h
@@ -161,6 +161,17 @@ public:
     return -1;
   }
 
+  /// find_first_unset - Returns the index of the first unset bit, -1 if all
+  /// of the bits are set.
+  int find_first_unset() const {
+    for (unsigned i = 0; i < NumBitWords(size()); ++i)
+      if (Bits[i] != ~0UL) {
+        unsigned Result = i * BITWORD_SIZE + countTrailingOnes(Bits[i]);
+        return Result < size() ? Result : -1;
+      }
+    return -1;
+  }
+
   /// find_next - Returns the index of the next set bit following the
   /// "Prev" bit. Returns -1 if the next set bit is not found.
   int find_next(unsigned Prev) const {
@@ -184,6 +195,30 @@ public:
     return -1;
   }
 
+  /// find_next_unset - Returns the index of the next unset bit following the
+  /// "Prev" bit.  Returns -1 if all remaining bits are set.
+  int find_next_unset(unsigned Prev) const {
+    ++Prev;
+    if (Prev >= Size)
+      return -1;
+
+    unsigned WordPos = Prev / BITWORD_SIZE;
+    unsigned BitPos = Prev % BITWORD_SIZE;
+    BitWord Copy = Bits[WordPos];
+    // Mask in previous bits. Shift a BitWord, not an int, so large BitPos is safe.
+    BitWord Mask = (BitWord(1) << BitPos) - 1;
+    Copy |= Mask;
+
+    if (Copy != ~0UL)
+      return next_unset_in_word(WordPos, Copy);
+
+    // Check subsequent words.
+    for (unsigned i = WordPos + 1; i < NumBitWords(size()); ++i)
+      if (Bits[i] != ~0UL)
+        return next_unset_in_word(i, Bits[i]);
+    return -1;
+  }
+
   /// clear - Clear all bits.
   void clear() {
     Size = 0;
@@ -503,6 +538,11 @@ public:
   }
 
 private:
+  int next_unset_in_word(int WordIndex, BitWord Word) const {
+    unsigned Result = WordIndex * BITWORD_SIZE + countTrailingOnes(Word);
+    return Result < size() ? Result : -1;
+  }
+
   unsigned NumBitWords(unsigned S) const {
     return (S + BITWORD_SIZE-1) / BITWORD_SIZE;
   }
diff --git a/include/llvm/ADT/BreadthFirstIterator.h b/include/llvm/ADT/BreadthFirstIterator.h
new file mode 100644
index 0000000000000000000000000000000000000000..eaeecb6e057ffecf0e506193a205a9e3e51f6c56
--- /dev/null
+++ b/include/llvm/ADT/BreadthFirstIterator.h
@@ -0,0 +1,164 @@
+//===- llvm/ADT/BreadthFirstIterator.h - Breadth First iterator -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file builds on the ADT/GraphTraits.h file to build a generic breadth
+// first graph iterator.  This file exposes the following functions/types:
+//
+// bf_begin/bf_end/bf_iterator
+//   * Normal breadth-first iteration - visit a graph level-by-level.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ADT_BREADTHFIRSTITERATOR_H
+#define LLVM_ADT_BREADTHFIRSTITERATOR_H
+
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/iterator_range.h"
+#include <iterator>
+#include <queue>
+#include <set>
+#include <utility>
+
+namespace llvm {
+
+// bf_iterator_storage - A private class which is used to figure out where to
+// store the visited set. We only provide a non-external variant for now.
+template <class SetType> class bf_iterator_storage {
+public:
+  SetType Visited;
+};
+
+// The visited state for the iteration is a simple set.
+template <typename NodeRef, unsigned SmallSize = 8>
+using bf_iterator_default_set = SmallPtrSet<NodeRef, SmallSize>;
+
+// Generic Breadth first search iterator.
+template <class GraphT,
+          class SetType =
+              bf_iterator_default_set<typename GraphTraits<GraphT>::NodeRef>,
+          class GT = GraphTraits<GraphT>>
+class bf_iterator
+    : public std::iterator<std::forward_iterator_tag, typename GT::NodeRef>,
+      public bf_iterator_storage<SetType> {
+  typedef std::iterator<std::forward_iterator_tag, typename GT::NodeRef> super;
+
+  typedef typename GT::NodeRef NodeRef;
+  typedef typename GT::ChildIteratorType ChildItTy;
+
+  // First element is the node reference, second is the next child to visit.
+  typedef std::pair<NodeRef, Optional<ChildItTy>> QueueElement;
+
+  // Visit queue - used to maintain BFS ordering.
+  // Optional<> because we need markers for levels.
+  std::queue<Optional<QueueElement>> VisitQueue;
+
+  // Current level.
+  unsigned Level;
+
+private:
+  inline bf_iterator(NodeRef Node) {
+    this->Visited.insert(Node);
+    Level = 0;
+
+    // Also, insert a dummy node as marker.
+    VisitQueue.push(QueueElement(Node, None));
+    VisitQueue.push(None);
+  }
+
+  inline bf_iterator() = default;
+
+  inline void toNext() {
+    Optional<QueueElement> Head = VisitQueue.front();
+    QueueElement H = Head.getValue();
+    NodeRef Node = H.first;
+    Optional<ChildItTy> &ChildIt = H.second;
+
+    if (!ChildIt)
+      ChildIt.emplace(GT::child_begin(Node));
+    while (*ChildIt != GT::child_end(Node)) {
+      NodeRef Next = *(*ChildIt)++;
+
+      // Enqueue the child only if it has not been visited yet.
+      if (this->Visited.insert(Next).second)
+        VisitQueue.push(QueueElement(Next, None));
+    }
+    VisitQueue.pop();
+
+    // Go to the next element skipping markers if needed.
+    if (!VisitQueue.empty()) {
+      Head = VisitQueue.front();
+      if (Head != None)
+        return;
+      Level += 1;
+      VisitQueue.pop();
+
+      // Don't push another marker if this is the last
+      // element.
+      if (!VisitQueue.empty())
+        VisitQueue.push(None);
+    }
+  }
+
+public:
+  typedef typename super::pointer pointer;
+
+  // Provide static begin and end methods as our public "constructors"
+  static bf_iterator begin(const GraphT &G) {
+    return bf_iterator(GT::getEntryNode(G));
+  }
+
+  static bf_iterator end(const GraphT &G) { return bf_iterator(); }
+
+  bool operator==(const bf_iterator &RHS) const {
+    return VisitQueue == RHS.VisitQueue;
+  }
+
+  bool operator!=(const bf_iterator &RHS) const { return !(*this == RHS); }
+
+  const NodeRef &operator*() const { return VisitQueue.front()->first; }
+
+  // This is a nonstandard operator-> that dereferences the pointer an extra
+  // time so that you can actually call methods on the node, because the
+  // contained type is a pointer.
+  NodeRef operator->() const { return **this; }
+
+  bf_iterator &operator++() { // Pre-increment
+    toNext();
+    return *this;
+  }
+
+  bf_iterator operator++(int) { // Post-increment
+    bf_iterator ItCopy = *this;
+    ++*this;
+    return ItCopy;
+  }
+
+  unsigned getLevel() const { return Level; }
+};
+
+// Provide global constructors that automatically figure out correct types.
+template <class T> bf_iterator<T> bf_begin(const T &G) {
+  return bf_iterator<T>::begin(G);
+}
+
+template <class T> bf_iterator<T> bf_end(const T &G) {
+  return bf_iterator<T>::end(G);
+}
+
+// Provide an accessor method to use them in range-based patterns.
+template <class T> iterator_range<bf_iterator<T>> breadth_first(const T &G) {
+  return make_range(bf_begin(G), bf_end(G));
+}
+
+} // end namespace llvm
+
+#endif // LLVM_ADT_BREADTHFIRSTITERATOR_H
diff --git a/include/llvm/ADT/DenseMap.h b/include/llvm/ADT/DenseMap.h
index 0b4b09d4b7330b1bb174eca1e0034743d2f9a79f..fd8d3bf368a8866f52ceaa41da6e98b51b94a3b9 100644
--- a/include/llvm/ADT/DenseMap.h
+++ b/include/llvm/ADT/DenseMap.h
@@ -53,6 +53,9 @@ class DenseMapIterator;
 template <typename DerivedT, typename KeyT, typename ValueT, typename KeyInfoT,
           typename BucketT>
 class DenseMapBase : public DebugEpochBase {
+  template <typename T>
+  using const_arg_type_t = typename const_pointer_or_const_ref<T>::type;
+
 public:
   typedef unsigned size_type;
   typedef KeyT key_type;
@@ -119,18 +122,18 @@ public:
   }
 
   /// Return 1 if the specified key is in the map, 0 otherwise.
-  size_type count(const KeyT &Val) const {
+  size_type count(const_arg_type_t<KeyT> Val) const {
     const BucketT *TheBucket;
     return LookupBucketFor(Val, TheBucket) ? 1 : 0;
   }
 
-  iterator find(const KeyT &Val) {
+  iterator find(const_arg_type_t<KeyT> Val) {
     BucketT *TheBucket;
     if (LookupBucketFor(Val, TheBucket))
       return iterator(TheBucket, getBucketsEnd(), *this, true);
     return end();
   }
-  const_iterator find(const KeyT &Val) const {
+  const_iterator find(const_arg_type_t<KeyT> Val) const {
     const BucketT *TheBucket;
     if (LookupBucketFor(Val, TheBucket))
       return const_iterator(TheBucket, getBucketsEnd(), *this, true);
@@ -159,7 +162,7 @@ public:
 
   /// lookup - Return the entry for the specified key, or a default
   /// constructed value if no such entry exists.
-  ValueT lookup(const KeyT &Val) const {
+  ValueT lookup(const_arg_type_t<KeyT> Val) const {
     const BucketT *TheBucket;
     if (LookupBucketFor(Val, TheBucket))
       return TheBucket->getSecond();
@@ -389,6 +392,8 @@ protected:
     return KeyInfoT::getHashValue(Val);
   }
   static const KeyT getEmptyKey() {
+    static_assert(std::is_base_of<DenseMapBase, DerivedT>::value,
+                  "Must pass the derived type to this template!");
     return KeyInfoT::getEmptyKey();
   }
   static const KeyT getTombstoneKey() {
diff --git a/include/llvm/ADT/DenseMapInfo.h b/include/llvm/ADT/DenseMapInfo.h
index a844ebcccf5b89306186097295bb4d9b5f4b86a4..bb973ac65063428ed5ea8e7e2059a413961b885c 100644
--- a/include/llvm/ADT/DenseMapInfo.h
+++ b/include/llvm/ADT/DenseMapInfo.h
@@ -60,6 +60,16 @@ template<> struct DenseMapInfo<char> {
   }
 };
 
+// Provide DenseMapInfo for unsigned shorts.
+template <> struct DenseMapInfo<unsigned short> {
+  static inline unsigned short getEmptyKey() { return 0xFFFF; }
+  static inline unsigned short getTombstoneKey() { return 0xFFFF - 1; }
+  static unsigned getHashValue(const unsigned short &Val) { return Val * 37U; }
+  static bool isEqual(const unsigned short &LHS, const unsigned short &RHS) {
+    return LHS == RHS;
+  }
+};
+
 // Provide DenseMapInfo for unsigned ints.
 template<> struct DenseMapInfo<unsigned> {
   static inline unsigned getEmptyKey() { return ~0U; }
@@ -95,6 +105,14 @@ template<> struct DenseMapInfo<unsigned long long> {
   }
 };
 
+// Provide DenseMapInfo for shorts.
+template <> struct DenseMapInfo<short> {
+  static inline short getEmptyKey() { return 0x7FFF; }
+  static inline short getTombstoneKey() { return -0x7FFF - 1; }
+  static unsigned getHashValue(const short &Val) { return Val * 37U; }
+  static bool isEqual(const short &LHS, const short &RHS) { return LHS == RHS; }
+};
+
 // Provide DenseMapInfo for ints.
 template<> struct DenseMapInfo<int> {
   static inline int getEmptyKey() { return 0x7fffffff; }
diff --git a/include/llvm/ADT/DenseSet.h b/include/llvm/ADT/DenseSet.h
index b1345f7da7388deed693d74c9e48703fcb13d4f1..fcf304c3ecc41060846c2f327515a04536a25917 100644
--- a/include/llvm/ADT/DenseSet.h
+++ b/include/llvm/ADT/DenseSet.h
@@ -48,6 +48,8 @@ class DenseSetImpl {
   static_assert(sizeof(typename MapTy::value_type) == sizeof(ValueT),
                 "DenseMap buckets unexpectedly large!");
   MapTy TheMap;
+  template <typename T>
+  using const_arg_type_t = typename const_pointer_or_const_ref<T>::type;
 
 public:
   typedef ValueT key_type;
@@ -78,7 +80,7 @@ public:
   }
 
   /// Return 1 if the specified key is in the set, 0 otherwise.
-  size_type count(const ValueT &V) const {
+  size_type count(const_arg_type_t<ValueT> V) const {
     return TheMap.count(V);
   }
 
@@ -154,8 +156,8 @@ public:
   const_iterator begin() const { return ConstIterator(TheMap.begin()); }
   const_iterator end() const { return ConstIterator(TheMap.end()); }
 
-  iterator find(const ValueT &V) { return Iterator(TheMap.find(V)); }
-  const_iterator find(const ValueT &V) const {
+  iterator find(const_arg_type_t<ValueT> V) { return Iterator(TheMap.find(V)); }
+  const_iterator find(const_arg_type_t<ValueT> V) const {
     return ConstIterator(TheMap.find(V));
   }
 
diff --git a/include/llvm/ADT/DepthFirstIterator.h b/include/llvm/ADT/DepthFirstIterator.h
index c54573204588ea3ce99c9f5f1a846a37e783e7c9..b020d48cb3f082d4366da13498ea8a84fcb80baa 100644
--- a/include/llvm/ADT/DepthFirstIterator.h
+++ b/include/llvm/ADT/DepthFirstIterator.h
@@ -135,7 +135,7 @@ private:
         }
       }
       this->Visited.completed(Node);
-      
+
       // Oops, ran out of successors... go up a level on the stack.
       VisitStack.pop_back();
     } while (!VisitStack.empty());
diff --git a/include/llvm/ADT/GraphTraits.h b/include/llvm/ADT/GraphTraits.h
index 29bbcb010eeef3266fbff411d9acc2948cb09f2a..2c88c4271b4895d6d305c25f5f430fb984ab7f79 100644
--- a/include/llvm/ADT/GraphTraits.h
+++ b/include/llvm/ADT/GraphTraits.h
@@ -18,6 +18,8 @@
 #ifndef LLVM_ADT_GRAPHTRAITS_H
 #define LLVM_ADT_GRAPHTRAITS_H
 
+#include "llvm/ADT/iterator_range.h"
+
 namespace llvm {
 
 // GraphTraits - This class should be specialized by different graph types...
@@ -86,6 +88,33 @@ struct Inverse {
 // inverse falls back to the original graph.
 template <class T> struct GraphTraits<Inverse<Inverse<T>>> : GraphTraits<T> {};
 
+// Provide iterator ranges for the graph traits nodes and children
+template <class GraphType>
+iterator_range<typename GraphTraits<GraphType>::nodes_iterator>
+nodes(const GraphType &G) {
+  return make_range(GraphTraits<GraphType>::nodes_begin(G),
+                    GraphTraits<GraphType>::nodes_end(G));
+}
+template <class GraphType>
+iterator_range<typename GraphTraits<Inverse<GraphType>>::nodes_iterator>
+inverse_nodes(const GraphType &G) {
+  return make_range(GraphTraits<Inverse<GraphType>>::nodes_begin(G),
+                    GraphTraits<Inverse<GraphType>>::nodes_end(G));
+}
+
+template <class GraphType>
+iterator_range<typename GraphTraits<GraphType>::ChildIteratorType>
+children(const typename GraphTraits<GraphType>::NodeRef &G) {
+  return make_range(GraphTraits<GraphType>::child_begin(G),
+                    GraphTraits<GraphType>::child_end(G));
+}
+
+template <class GraphType>
+iterator_range<typename GraphTraits<Inverse<GraphType>>::ChildIteratorType>
+inverse_children(const typename GraphTraits<GraphType>::NodeRef &G) {
+  return make_range(GraphTraits<Inverse<GraphType>>::child_begin(G),
+                    GraphTraits<Inverse<GraphType>>::child_end(G));
+}
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/ADT/PointerUnion.h b/include/llvm/ADT/PointerUnion.h
index a8ac18645f3ab75db17d6d219f829ca982597535..9eb15524c0f3059918d08e37e5596e3ea9e53f31 100644
--- a/include/llvm/ADT/PointerUnion.h
+++ b/include/llvm/ADT/PointerUnion.h
@@ -31,7 +31,7 @@ template <typename T> struct PointerUnionTypeSelectorReturn {
 /// Get a type based on whether two types are the same or not.
 ///
 /// For:
-/// 
+///
 /// \code
 ///   typedef typename PointerUnionTypeSelector<T1, T2, EQ, NE>::Return Ret;
 /// \endcode
@@ -190,17 +190,17 @@ public:
 };
 
 template <typename PT1, typename PT2>
-static bool operator==(PointerUnion<PT1, PT2> lhs, PointerUnion<PT1, PT2> rhs) {
+bool operator==(PointerUnion<PT1, PT2> lhs, PointerUnion<PT1, PT2> rhs) {
   return lhs.getOpaqueValue() == rhs.getOpaqueValue();
 }
 
 template <typename PT1, typename PT2>
-static bool operator!=(PointerUnion<PT1, PT2> lhs, PointerUnion<PT1, PT2> rhs) {
+bool operator!=(PointerUnion<PT1, PT2> lhs, PointerUnion<PT1, PT2> rhs) {
   return lhs.getOpaqueValue() != rhs.getOpaqueValue();
 }
 
 template <typename PT1, typename PT2>
-static bool operator<(PointerUnion<PT1, PT2> lhs, PointerUnion<PT1, PT2> rhs) {
+bool operator<(PointerUnion<PT1, PT2> lhs, PointerUnion<PT1, PT2> rhs) {
   return lhs.getOpaqueValue() < rhs.getOpaqueValue();
 }
 
diff --git a/include/llvm/ADT/PostOrderIterator.h b/include/llvm/ADT/PostOrderIterator.h
index e519b5c07964ae104124c889c4a7a51dd99a4996..8fc08eb252eb214c6e041747f8cb454a3d5cf49b 100644
--- a/include/llvm/ADT/PostOrderIterator.h
+++ b/include/llvm/ADT/PostOrderIterator.h
@@ -268,6 +268,10 @@ inverse_post_order_ext(const T &G, SetType &S) {
 // with a postorder iterator to build the data structures).  The moral of this
 // story is: Don't create more ReversePostOrderTraversal classes than necessary.
 //
+// Because it does the traversal in its constructor, it won't invalidate when
+// BasicBlocks are removed, *but* it may contain erased blocks. Some places
+// rely on this behavior (e.g. GVN).
+//
 // This class should be used like this:
 // {
 //   ReversePostOrderTraversal<Function*> RPOT(FuncPtr); // Expensive to create
diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h
index ec121e0d55cd44bf3275959fca61837ce42712c7..15945adbe589a4634e2dbf2738866467db2b9a94 100644
--- a/include/llvm/ADT/STLExtras.h
+++ b/include/llvm/ADT/STLExtras.h
@@ -23,11 +23,13 @@
 #include <cstdlib> // for qsort
 #include <functional>
 #include <iterator>
+#include <limits>
 #include <memory>
 #include <tuple>
 #include <utility> // for std::pair
 
 #include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/iterator.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Support/Compiler.h"
@@ -44,6 +46,10 @@ namespace detail {
 template <typename RangeT>
 using IterOfRange = decltype(std::begin(std::declval<RangeT &>()));
 
+template <typename RangeT>
+using ValueOfRange = typename std::remove_reference<decltype(
+    *std::begin(std::declval<RangeT &>()))>::type;
+
 } // End detail namespace
 
 //===----------------------------------------------------------------------===//
@@ -123,7 +129,7 @@ inline void deleter(T *Ptr) {
 //===----------------------------------------------------------------------===//
 
 // mapped_iterator - This is a simple iterator adapter that causes a function to
-// be dereferenced whenever operator* is invoked on the iterator.
+// be applied whenever operator* is invoked on the iterator.
 //
 template <class RootIt, class UnaryFunc>
 class mapped_iterator {
@@ -134,9 +140,8 @@ public:
           iterator_category;
   typedef typename std::iterator_traits<RootIt>::difference_type
           difference_type;
-  typedef typename std::result_of<
-            UnaryFunc(decltype(*std::declval<RootIt>()))>
-          ::type value_type;
+  typedef decltype(std::declval<UnaryFunc>()(*std::declval<RootIt>()))
+          value_type;
 
   typedef void pointer;
   //typedef typename UnaryFunc::result_type *pointer;
@@ -356,65 +361,126 @@ template <size_t... I> struct index_sequence;
 template <class... Ts> struct index_sequence_for;
 
 namespace detail {
-template <typename... Iters> class zip_first {
-public:
-  typedef std::input_iterator_tag iterator_category;
-  typedef std::tuple<decltype(*std::declval<Iters>())...> value_type;
+using std::declval;
+
+// We have to alias this since inlining the actual type at the usage site
+// in the parameter list of iterator_facade_base<> below ICEs MSVC 2017.
+template<typename... Iters> struct ZipTupleType {
+  typedef std::tuple<decltype(*declval<Iters>())...> type;
+};
+
+template <typename ZipType, typename... Iters>
+using zip_traits = iterator_facade_base<
+    ZipType, typename std::common_type<std::bidirectional_iterator_tag,
+                                       typename std::iterator_traits<
+                                           Iters>::iterator_category...>::type,
+    // ^ TODO: Implement random access methods.
+    typename ZipTupleType<Iters...>::type,
+    typename std::iterator_traits<typename std::tuple_element<
+        0, std::tuple<Iters...>>::type>::difference_type,
+    // ^ FIXME: This follows boost::make_zip_iterator's assumption that all
+    // inner iterators have the same difference_type. It would fail if, for
+    // instance, the second field's difference_type were non-numeric while the
+    // first is.
+    typename ZipTupleType<Iters...>::type *,
+    typename ZipTupleType<Iters...>::type>;
+
+template <typename ZipType, typename... Iters>
+struct zip_common : public zip_traits<ZipType, Iters...> {
+  using Base = zip_traits<ZipType, Iters...>;
+  using value_type = typename Base::value_type;
+
   std::tuple<Iters...> iterators;
 
-private:
-  template <size_t... Ns> value_type deres(index_sequence<Ns...>) {
+protected:
+  template <size_t... Ns> value_type deref(index_sequence<Ns...>) const {
     return value_type(*std::get<Ns>(iterators)...);
   }
 
-  template <size_t... Ns> decltype(iterators) tup_inc(index_sequence<Ns...>) {
+  template <size_t... Ns>
+  decltype(iterators) tup_inc(index_sequence<Ns...>) const {
     return std::tuple<Iters...>(std::next(std::get<Ns>(iterators))...);
   }
 
+  template <size_t... Ns>
+  decltype(iterators) tup_dec(index_sequence<Ns...>) const {
+    return std::tuple<Iters...>(std::prev(std::get<Ns>(iterators))...);
+  }
+
 public:
-  value_type operator*() { return deres(index_sequence_for<Iters...>{}); }
+  zip_common(Iters &&... ts) : iterators(std::forward<Iters>(ts)...) {}
+
+  value_type operator*() { return deref(index_sequence_for<Iters...>{}); }
 
-  void operator++() { iterators = tup_inc(index_sequence_for<Iters...>{}); }
+  const value_type operator*() const {
+    return deref(index_sequence_for<Iters...>{});
+  }
 
-  bool operator!=(const zip_first<Iters...> &other) const {
-    return std::get<0>(iterators) != std::get<0>(other.iterators);
+  ZipType &operator++() {
+    iterators = tup_inc(index_sequence_for<Iters...>{});
+    return static_cast<ZipType &>(*this); // CRTP downcast to the derived type
+  }
-  zip_first(Iters &&... ts) : iterators(std::forward<Iters>(ts)...) {}
+
+  ZipType &operator--() {
+    static_assert(Base::IsBidirectional,
+                  "All inner iterators must be at least bidirectional.");
+    iterators = tup_dec(index_sequence_for<Iters...>{});
+    return static_cast<ZipType &>(*this); // CRTP downcast to the derived type
+  }
+};
+
+template <typename... Iters>
+struct zip_first : public zip_common<zip_first<Iters...>, Iters...> {
+  using Base = zip_common<zip_first<Iters...>, Iters...>;
+
+  bool operator==(const zip_first<Iters...> &other) const {
+    return std::get<0>(this->iterators) == std::get<0>(other.iterators);
+  }
+
+  zip_first(Iters &&... ts) : Base(std::forward<Iters>(ts)...) {}
 };
 
-template <typename... Iters> class zip_shortest : public zip_first<Iters...> {
+template <typename... Iters>
+class zip_shortest : public zip_common<zip_shortest<Iters...>, Iters...> {
   template <size_t... Ns>
-  bool test(const zip_first<Iters...> &other, index_sequence<Ns...>) const {
+  bool test(const zip_shortest<Iters...> &other, index_sequence<Ns...>) const {
     return all_of(std::initializer_list<bool>{std::get<Ns>(this->iterators) !=
                                               std::get<Ns>(other.iterators)...},
                   identity<bool>{});
   }
 
 public:
-  bool operator!=(const zip_first<Iters...> &other) const {
-    return test(other, index_sequence_for<Iters...>{});
+  using Base = zip_common<zip_shortest<Iters...>, Iters...>;
+
+  bool operator==(const zip_shortest<Iters...> &other) const {
+    return !test(other, index_sequence_for<Iters...>{});
   }
-  zip_shortest(Iters &&... ts)
-      : zip_first<Iters...>(std::forward<Iters>(ts)...) {}
+
+  zip_shortest(Iters &&... ts) : Base(std::forward<Iters>(ts)...) {}
 };
 
 template <template <typename...> class ItType, typename... Args> class zippy {
 public:
-  typedef ItType<decltype(std::begin(std::declval<Args>()))...> iterator;
+  using iterator = ItType<decltype(std::begin(std::declval<Args>()))...>;
+  using iterator_category = typename iterator::iterator_category;
+  using value_type = typename iterator::value_type;
+  using difference_type = typename iterator::difference_type;
+  using pointer = typename iterator::pointer;
+  using reference = typename iterator::reference;
 
 private:
   std::tuple<Args...> ts;
 
-  template <size_t... Ns> iterator begin_impl(index_sequence<Ns...>) {
+  template <size_t... Ns> iterator begin_impl(index_sequence<Ns...>) const {
     return iterator(std::begin(std::get<Ns>(ts))...);
   }
-  template <size_t... Ns> iterator end_impl(index_sequence<Ns...>) {
+  template <size_t... Ns> iterator end_impl(index_sequence<Ns...>) const {
     return iterator(std::end(std::get<Ns>(ts))...);
   }
 
 public:
-  iterator begin() { return begin_impl(index_sequence_for<Args...>{}); }
-  iterator end() { return end_impl(index_sequence_for<Args...>{}); }
+  iterator begin() const { return begin_impl(index_sequence_for<Args...>{}); }
+  iterator end() const { return end_impl(index_sequence_for<Args...>{}); }
   zippy(Args &&... ts_) : ts(std::forward<Args>(ts_)...) {}
 };
 } // End detail namespace
@@ -777,6 +843,13 @@ auto remove_if(R &&Range, UnaryPredicate P) -> decltype(std::begin(Range)) {
   return std::remove_if(std::begin(Range), std::end(Range), P);
 }
 
+/// Provide wrappers to std::copy_if which take ranges instead of having to
+/// pass begin/end explicitly.
+template <typename R, typename OutputIt, typename UnaryPredicate>
+OutputIt copy_if(R &&Range, OutputIt Out, UnaryPredicate P) {
+  return std::copy_if(std::begin(Range), std::end(Range), Out, P);
+}
+
 /// Wrapper function around std::find to detect if an element exists
 /// in a container.
 template <typename R, typename E>
@@ -815,6 +888,15 @@ auto partition(R &&Range, UnaryPredicate P) -> decltype(std::begin(Range)) {
   return std::partition(std::begin(Range), std::end(Range), P);
 }
 
+/// \brief Given a range of type R, iterate the entire range and return a
+/// SmallVector with elements of the range.  This is useful, for example,
+/// when you want to iterate a range and then sort the results.
+template <unsigned Size, typename R>
+SmallVector<typename std::remove_const<detail::ValueOfRange<R>>::type, Size>
+to_vector(R &&Range) {
+  return {std::begin(Range), std::end(Range)};
+}
+
 /// Provide a container algorithm similar to C++ Library Fundamentals v2's
 /// `erase_if` which is equivalent to:
 ///
@@ -909,47 +991,85 @@ template <typename T> struct deref {
 };
 
 namespace detail {
-template <typename R> class enumerator_impl {
-public:
-  template <typename X> struct result_pair {
-    result_pair(std::size_t Index, X Value) : Index(Index), Value(Value) {}
+template <typename R> class enumerator_iter;
 
-    const std::size_t Index;
-    X Value;
-  };
+template <typename R> struct result_pair {
+  friend class enumerator_iter<R>;
+
+  result_pair() : Index(-1) {}
+  result_pair(std::size_t Index, IterOfRange<R> Iter)
+      : Index(Index), Iter(Iter) {}
 
-  class iterator {
-    typedef
-        typename std::iterator_traits<IterOfRange<R>>::reference iter_reference;
-    typedef result_pair<iter_reference> result_type;
+  result_pair<R> &operator=(const result_pair<R> &Other) {
+    Index = Other.Index;
+    Iter = Other.Iter;
+    return *this;
+  }
 
-  public:
-    iterator(IterOfRange<R> &&Iter, std::size_t Index)
-        : Iter(Iter), Index(Index) {}
+  std::size_t index() const { return Index; }
+  const ValueOfRange<R> &value() const { return *Iter; }
+  ValueOfRange<R> &value() { return *Iter; }
 
-    result_type operator*() const { return result_type(Index, *Iter); }
+private:
+  std::size_t Index;
+  IterOfRange<R> Iter;
+};
 
-    iterator &operator++() {
-      ++Iter;
-      ++Index;
-      return *this;
-    }
+template <typename R>
+class enumerator_iter
+    : public iterator_facade_base<
+          enumerator_iter<R>, std::forward_iterator_tag, result_pair<R>,
+          typename std::iterator_traits<IterOfRange<R>>::difference_type,
+          typename std::iterator_traits<IterOfRange<R>>::pointer,
+          typename std::iterator_traits<IterOfRange<R>>::reference> {
+  using result_type = result_pair<R>;
 
-    bool operator!=(const iterator &RHS) const { return Iter != RHS.Iter; }
+public:
+  explicit enumerator_iter(IterOfRange<R> EndIter)
+    : Result(std::numeric_limits<size_t>::max(), EndIter) { }
 
-  private:
-    IterOfRange<R> Iter;
-    std::size_t Index;
-  };
+  enumerator_iter(std::size_t Index, IterOfRange<R> Iter)
+      : Result(Index, Iter) {}
+
+  result_type &operator*() { return Result; }
+  const result_type &operator*() const { return Result; }
 
+  enumerator_iter<R> &operator++() {
+    assert(Result.Index != std::numeric_limits<size_t>::max());
+    ++Result.Iter;
+    ++Result.Index;
+    return *this;
+  }
+
+  bool operator==(const enumerator_iter<R> &RHS) const {
+    // Don't compare indices here, only iterators.  It's possible for an end
+    // iterator to have different indices depending on whether it was created
+    // by calling std::end() versus incrementing a valid iterator.
+    return Result.Iter == RHS.Result.Iter;
+  }
+
+  enumerator_iter<R> &operator=(const enumerator_iter<R> &Other) {
+    Result = Other.Result;
+    return *this;
+  }
+
+private:
+  result_type Result;
+};
+
+template <typename R> class enumerator {
 public:
-  explicit enumerator_impl(R &&Range) : Range(std::forward<R>(Range)) {}
+  explicit enumerator(R &&Range) : TheRange(std::forward<R>(Range)) {}
 
-  iterator begin() { return iterator(std::begin(Range), 0); }
-  iterator end() { return iterator(std::end(Range), std::size_t(-1)); }
+  enumerator_iter<R> begin() {
+    return enumerator_iter<R>(0, std::begin(TheRange));
+  }
+  enumerator_iter<R> end() {
+    return enumerator_iter<R>(std::end(TheRange));
+  }
 
 private:
-  R Range;
+  R TheRange;
 };
 }
 
@@ -968,8 +1088,8 @@ private:
 ///   Item 2 - C
 ///   Item 3 - D
 ///
-template <typename R> detail::enumerator_impl<R> enumerate(R &&Range) {
-  return detail::enumerator_impl<R>(std::forward<R>(Range));
+template <typename R> detail::enumerator<R> enumerate(R &&TheRange) {
+  return detail::enumerator<R>(std::forward<R>(TheRange));
 }
 
 namespace detail {
diff --git a/include/llvm/ADT/ScopedHashTable.h b/include/llvm/ADT/ScopedHashTable.h
index ad805b0991c144a9dcec85197363ac1ceb2fbe7b..d52128e294a32690ba6dfd8908ccfa481222663a 100644
--- a/include/llvm/ADT/ScopedHashTable.h
+++ b/include/llvm/ADT/ScopedHashTable.h
@@ -182,8 +182,8 @@ public:
     return TopLevelMap.count(Key);
   }
 
-  V lookup(const K &Key) {
-    typename DenseMap<K, ValTy*, KInfo>::iterator I = TopLevelMap.find(Key);
+  V lookup(const K &Key) const {
+    auto I = TopLevelMap.find(Key);
     if (I != TopLevelMap.end())
       return I->second->getValue();
 
diff --git a/include/llvm/ADT/SetVector.h b/include/llvm/ADT/SetVector.h
index 4dc18bc52178fa18f0a26b7019653cab2bf5c26a..13378aa3a04efc3b0ce8914ab5412bd2eb8e0b4b 100644
--- a/include/llvm/ADT/SetVector.h
+++ b/include/llvm/ADT/SetVector.h
@@ -119,6 +119,12 @@ public:
     return vector_.rend();
   }
 
+  /// \brief Return the first element of the SetVector.
+  const T &front() const {
+    assert(!empty() && "Cannot call front() on empty SetVector!");
+    return vector_.front();
+  }
+
   /// \brief Return the last element of the SetVector.
   const T &back() const {
     assert(!empty() && "Cannot call back() on empty SetVector!");
@@ -232,7 +238,7 @@ public:
   bool operator!=(const SetVector &that) const {
     return vector_ != that.vector_;
   }
-  
+
   /// \brief Compute This := This u S, return whether 'This' changed.
   /// TODO: We should be able to use set_union from SetOperations.h, but
   ///       SetVector interface is inconsistent with DenseSet.
diff --git a/include/llvm/ADT/SmallBitVector.h b/include/llvm/ADT/SmallBitVector.h
index bb99e0cf221f78c186a8287ee9ec89edfaab5f1d..edb37da38da1bec88e97163ba8e5b445e9d22ab9 100644
--- a/include/llvm/ADT/SmallBitVector.h
+++ b/include/llvm/ADT/SmallBitVector.h
@@ -216,6 +216,18 @@ public:
     return getPointer()->find_first();
   }
 
+  /// Returns the index of the lowest bit that is clear, or -1 when every bit
+  /// in the vector is set.
+  int find_first_unset() const {
+    if (!isSmall())
+      return getPointer()->find_first_unset();
+
+    // A fully populated small vector has no clear bit to report.
+    if (count() == getSmallSize())
+      return -1;
+    return countTrailingOnes(getSmallBits());
+  }
+
   /// Returns the index of the next set bit following the "Prev" bit.
   /// Returns -1 if the next set bit is not found.
   int find_next(unsigned Prev) const {
@@ -230,6 +242,23 @@ public:
     return getPointer()->find_next(Prev);
   }
 
+  /// Returns the index of the next unset bit following the "Prev" bit.
+  /// Returns -1 if the next unset bit is not found.
+  int find_next_unset(unsigned Prev) const {
+    if (isSmall()) {
+      // Bail out before shifting when no valid index follows Prev.
+      if (Prev + 1 >= getSmallSize())
+        return -1;
+      // Mask in bits [0, Prev] and the unused bits past the logical size.
+      uintptr_t Bits = getSmallBits() | ((uintptr_t(1) << (Prev + 1)) - 1);
+      Bits |= ~uintptr_t(0) << getSmallSize();
+      if (Bits == ~uintptr_t(0))
+        return -1;
+      return countTrailingOnes(Bits);
+    }
+    return getPointer()->find_next_unset(Prev);
+  }
+
   /// Clear all bits.
   void clear() {
     if (!isSmall())
diff --git a/include/llvm/ADT/SmallPtrSet.h b/include/llvm/ADT/SmallPtrSet.h
index 49feb9da897a2e096093e0847b35d53175d98dcd..196ab6338047cce727796ae40b0fcf4fc9d96394 100644
--- a/include/llvm/ADT/SmallPtrSet.h
+++ b/include/llvm/ADT/SmallPtrSet.h
@@ -18,6 +18,7 @@
 #include "llvm/Config/abi-breaking.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/PointerLikeTypeTraits.h"
+#include "llvm/Support/type_traits.h"
 #include <cassert>
 #include <cstddef>
 #include <cstring>
@@ -166,8 +167,8 @@ protected:
     const void *const *P = find_imp(Ptr);
     if (P == EndPointer())
       return false;
-    
-    const void ** Loc = const_cast<const void **>(P);
+
+    const void **Loc = const_cast<const void **>(P);
     assert(*Loc == Ptr && "broken find!");
     *Loc = getTombstoneMarker();
     NumTombstones++;
@@ -193,7 +194,7 @@ protected:
       return Bucket;
     return EndPointer();
   }
-  
+
 private:
   bool isSmall() const { return CurArray == SmallArray; }
 
@@ -259,11 +260,10 @@ protected:
   }
 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
   void RetreatIfNotValid() {
-    --Bucket;
-    assert(Bucket <= End);
+    assert(Bucket >= End);
     while (Bucket != End &&
-           (*Bucket == SmallPtrSetImplBase::getEmptyMarker() ||
-            *Bucket == SmallPtrSetImplBase::getTombstoneMarker())) {
+           (Bucket[-1] == SmallPtrSetImplBase::getEmptyMarker() ||
+            Bucket[-1] == SmallPtrSetImplBase::getTombstoneMarker())) {
       --Bucket;
     }
   }
@@ -288,6 +288,12 @@ public:
   // Most methods provided by baseclass.
 
   const PtrTy operator*() const {
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS
+    if (ReverseIterate<bool>::value) {
+      assert(Bucket > End);
+      return PtrTraits::getFromVoidPointer(const_cast<void *>(Bucket[-1]));
+    }
+#endif
     assert(Bucket < End);
     return PtrTraits::getFromVoidPointer(const_cast<void*>(*Bucket));
   }
@@ -295,6 +301,7 @@ public:
   inline SmallPtrSetIterator& operator++() {          // Preincrement
 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
     if (ReverseIterate<bool>::value) {
+      --Bucket;
       RetreatIfNotValid();
       return *this;
     }
@@ -343,7 +350,9 @@ struct RoundUpToPowerOfTwo {
 /// to avoid encoding a particular small size in the interface boundary.
 template <typename PtrType>
 class SmallPtrSetImpl : public SmallPtrSetImplBase {
+  using ConstPtrType = typename add_const_past_pointer<PtrType>::type;
   typedef PointerLikeTypeTraits<PtrType> PtrTraits;
+  typedef PointerLikeTypeTraits<ConstPtrType> ConstPtrTraits;
 
 protected:
   // Constructors that forward to the base.
@@ -367,7 +376,7 @@ public:
   /// the element equal to Ptr.
   std::pair<iterator, bool> insert(PtrType Ptr) {
     auto p = insert_imp(PtrTraits::getAsVoidPointer(Ptr));
-    return std::make_pair(iterator(p.first, EndPointer()), p.second);
+    return std::make_pair(makeIterator(p.first), p.second);
   }
 
   /// erase - If the set contains the specified pointer, remove it and return
@@ -375,14 +384,10 @@ public:
   bool erase(PtrType Ptr) {
     return erase_imp(PtrTraits::getAsVoidPointer(Ptr));
   }
-
   /// count - Return 1 if the specified pointer is in the set, 0 otherwise.
-  size_type count(PtrType Ptr) const {
-    return find(Ptr) != endPtr() ? 1 : 0;
-  }
-  iterator find(PtrType Ptr) const {
-    auto *P = find_imp(PtrTraits::getAsVoidPointer(Ptr));
-    return iterator(P, EndPointer());
+  size_type count(ConstPtrType Ptr) const { return find(Ptr) != end() ? 1 : 0; }
+  iterator find(ConstPtrType Ptr) const {
+    return makeIterator(find_imp(ConstPtrTraits::getAsVoidPointer(Ptr)));
   }
 
   template <typename IterT>
@@ -395,25 +400,23 @@ public:
     insert(IL.begin(), IL.end());
   }
 
-  inline iterator begin() const {
+  iterator begin() const {
 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
     if (ReverseIterate<bool>::value)
-      return endPtr();
+      return makeIterator(EndPointer() - 1);
 #endif
-    return iterator(CurArray, EndPointer());
+    return makeIterator(CurArray);
   }
-  inline iterator end() const {
+  iterator end() const { return makeIterator(EndPointer()); }
+
+private:
+  /// Create an iterator that dereferences to same place as the given pointer.
+  iterator makeIterator(const void *const *P) const {
 #if LLVM_ENABLE_ABI_BREAKING_CHECKS
     if (ReverseIterate<bool>::value)
-      return iterator(CurArray, CurArray);
+      return iterator(P == EndPointer() ? CurArray : P + 1, CurArray);
 #endif
-    return endPtr();
-  }
-
-private:
-  inline iterator endPtr() const {
-    const void *const *End = EndPointer();
-    return iterator(End, End);
+    return iterator(P, EndPointer());
   }
 };
 
diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h
index 488748a5f60548af3118c8e44c0f961b0f487b16..8214782bfe800e3a3c00605d5dfeecd0a0aeb6a5 100644
--- a/include/llvm/ADT/StringExtras.h
+++ b/include/llvm/ADT/StringExtras.h
@@ -234,6 +234,13 @@ inline std::string join(IteratorT Begin, IteratorT End, StringRef Separator) {
   return detail::join_impl(Begin, End, Separator, tag());
 }
 
+/// Joins the strings in the range [R.begin(), R.end()), adding Separator
+/// between the elements.
+template <typename Range>
+inline std::string join(Range &&R, StringRef Separator) {
+  return join(R.begin(), R.end(), Separator);
+}
+
 /// Joins the strings in the parameter pack \p Items, adding \p Separator
 /// between the elements.  All arguments must be implicitly convertible to
 /// std::string, or there should be an overload of std::string::operator+=()
diff --git a/include/llvm/ADT/StringMap.h b/include/llvm/ADT/StringMap.h
index 24e3ecf71b13534353db9ecd05cc6760e0a65eb1..c36fda7d690652b4f513aa9a253c07a9afc0daf2 100644
--- a/include/llvm/ADT/StringMap.h
+++ b/include/llvm/ADT/StringMap.h
@@ -15,13 +15,13 @@
 #define LLVM_ADT_STRINGMAP_H
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/PointerLikeTypeTraits.h"
 #include <cassert>
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
-#include <utility>
 #include <initializer_list>
 #include <new>
 #include <utility>
@@ -32,6 +32,7 @@ namespace llvm {
   class StringMapConstIterator;
   template<typename ValueT>
   class StringMapIterator;
+  template <typename ValueT> class StringMapKeyIterator;
   template<typename ValueTy>
   class StringMapEntry;
 
@@ -312,6 +313,11 @@ public:
     return const_iterator(TheTable+NumBuckets, true);
   }
 
+  llvm::iterator_range<StringMapKeyIterator<ValueTy>> keys() const {
+    return make_range(StringMapKeyIterator<ValueTy>(begin()),
+                      StringMapKeyIterator<ValueTy>(end()));
+  }
+
   iterator find(StringRef Key) {
     int Bucket = FindKey(Key);
     if (Bucket == -1) return end();
@@ -444,42 +450,39 @@ public:
   }
 };
 
-template <typename ValueTy> class StringMapConstIterator {
+template <typename DerivedTy, typename ValueTy>
+class StringMapIterBase
+    : public iterator_facade_base<DerivedTy, std::forward_iterator_tag,
+                                  ValueTy> {
 protected:
   StringMapEntryBase **Ptr = nullptr;
 
 public:
-  typedef StringMapEntry<ValueTy> value_type;
-
-  StringMapConstIterator() = default;
+  StringMapIterBase() = default;
 
-  explicit StringMapConstIterator(StringMapEntryBase **Bucket,
-                                  bool NoAdvance = false)
-  : Ptr(Bucket) {
+  explicit StringMapIterBase(StringMapEntryBase **Bucket,
+                             bool NoAdvance = false)
+      : Ptr(Bucket) {
     if (!NoAdvance) AdvancePastEmptyBuckets();
   }
 
-  const value_type &operator*() const {
-    return *static_cast<StringMapEntry<ValueTy>*>(*Ptr);
-  }
-  const value_type *operator->() const {
-    return static_cast<StringMapEntry<ValueTy>*>(*Ptr);
+  DerivedTy &operator=(const DerivedTy &Other) {
+    Ptr = Other.Ptr;
+    return static_cast<DerivedTy &>(*this);
   }
 
-  bool operator==(const StringMapConstIterator &RHS) const {
-    return Ptr == RHS.Ptr;
-  }
-  bool operator!=(const StringMapConstIterator &RHS) const {
-    return Ptr != RHS.Ptr;
-  }
+  bool operator==(const DerivedTy &RHS) const { return Ptr == RHS.Ptr; }
 
-  inline StringMapConstIterator& operator++() {   // Preincrement
+  DerivedTy &operator++() { // Preincrement
     ++Ptr;
     AdvancePastEmptyBuckets();
-    return *this;
+    return static_cast<DerivedTy &>(*this);
   }
-  StringMapConstIterator operator++(int) {        // Postincrement
-    StringMapConstIterator tmp = *this; ++*this; return tmp;
+
+  DerivedTy operator++(int) { // Post-increment
+    DerivedTy Tmp(Ptr);
+    ++*this;
+    return Tmp;
   }
 
 private:
@@ -489,22 +492,67 @@ private:
   }
 };
 
-template<typename ValueTy>
-class StringMapIterator : public StringMapConstIterator<ValueTy> {
+template <typename ValueTy>
+class StringMapConstIterator
+    : public StringMapIterBase<StringMapConstIterator<ValueTy>,
+                               const StringMapEntry<ValueTy>> {
+  using base = StringMapIterBase<StringMapConstIterator<ValueTy>,
+                                 const StringMapEntry<ValueTy>>;
+
 public:
-  StringMapIterator() = default;
+  StringMapConstIterator() = default;
+  explicit StringMapConstIterator(StringMapEntryBase **Bucket,
+                                  bool NoAdvance = false)
+      : base(Bucket, NoAdvance) {}
+
+  const StringMapEntry<ValueTy> &operator*() const {
+    return *static_cast<const StringMapEntry<ValueTy> *>(*this->Ptr);
+  }
+};
+
+template <typename ValueTy>
+class StringMapIterator : public StringMapIterBase<StringMapIterator<ValueTy>,
+                                                   StringMapEntry<ValueTy>> {
+  using base =
+      StringMapIterBase<StringMapIterator<ValueTy>, StringMapEntry<ValueTy>>;
 
+public:
+  StringMapIterator() = default;
   explicit StringMapIterator(StringMapEntryBase **Bucket,
                              bool NoAdvance = false)
-    : StringMapConstIterator<ValueTy>(Bucket, NoAdvance) {
-  }
+      : base(Bucket, NoAdvance) {}
 
   StringMapEntry<ValueTy> &operator*() const {
-    return *static_cast<StringMapEntry<ValueTy>*>(*this->Ptr);
+    return *static_cast<StringMapEntry<ValueTy> *>(*this->Ptr);
+  }
+
+  operator StringMapConstIterator<ValueTy>() const {
+    return StringMapConstIterator<ValueTy>(this->Ptr, true);
   }
-  StringMapEntry<ValueTy> *operator->() const {
-    return static_cast<StringMapEntry<ValueTy>*>(*this->Ptr);
+};
+
+template <typename ValueTy>
+class StringMapKeyIterator
+    : public iterator_adaptor_base<StringMapKeyIterator<ValueTy>,
+                                   StringMapConstIterator<ValueTy>,
+                                   std::forward_iterator_tag, StringRef> {
+  using base = iterator_adaptor_base<StringMapKeyIterator<ValueTy>,
+                                     StringMapConstIterator<ValueTy>,
+                                     std::forward_iterator_tag, StringRef>;
+
+public:
+  StringMapKeyIterator() = default;
+
+  explicit StringMapKeyIterator(StringMapConstIterator<ValueTy> Iter)
+      : base(std::move(Iter)) {}
+
+  StringRef &operator*() {
+    Key = this->wrapped()->getKey();
+    return Key;
   }
+
+private:
+  StringRef Key;
 };
 
 } // end namespace llvm
diff --git a/include/llvm/ADT/StringRef.h b/include/llvm/ADT/StringRef.h
index d80a848c44a13b32751289bcc516316e173acd2e..ce48f6d3bad32b78473c8645de06788ba60988b0 100644
--- a/include/llvm/ADT/StringRef.h
+++ b/include/llvm/ADT/StringRef.h
@@ -557,6 +557,14 @@ namespace llvm {
     /// string is well-formed in the given radix.
     bool getAsInteger(unsigned Radix, APInt &Result) const;
 
+    /// Parse the current string as an IEEE double-precision floating
+    /// point value.  The string must be a well-formed double.
+    ///
+    /// If \p AllowInexact is false, the function will fail if the string
+    /// cannot be represented exactly.  Otherwise, the function only fails
+    /// in case of an overflow or underflow.
+    bool getAsDouble(double &Result, bool AllowInexact = true) const;
+
     /// @}
     /// @name String Operations
     /// @{
@@ -600,7 +608,7 @@ namespace llvm {
       return drop_back(size() - N);
     }
 
-    /// Return a StringRef equal to 'this' but with only the first \p N
+    /// Return a StringRef equal to 'this' but with only the last \p N
     /// elements remaining.  If \p N is greater than the length of the
     /// string, the entire string is returned.
     LLVM_NODISCARD
diff --git a/include/llvm/ADT/Triple.h b/include/llvm/ADT/Triple.h
index 0325ff1ceb089074cfe47ae610326122e9922988..e271075b7e2add9e6c17b86e51a8c55eebc05711 100644
--- a/include/llvm/ADT/Triple.h
+++ b/include/llvm/ADT/Triple.h
@@ -110,6 +110,7 @@ public:
     ARMSubArch_v7m,
     ARMSubArch_v7s,
     ARMSubArch_v7k,
+    ARMSubArch_v7ve,
     ARMSubArch_v6,
     ARMSubArch_v6m,
     ARMSubArch_v6k,
@@ -559,7 +560,8 @@ public:
 
   /// Tests whether the OS uses glibc.
   bool isOSGlibc() const {
-    return getOS() == Triple::Linux || getOS() == Triple::KFreeBSD;
+    return (getOS() == Triple::Linux || getOS() == Triple::KFreeBSD) &&
+           !isAndroid();
   }
 
   /// Tests whether the OS uses the ELF binary format.
@@ -598,6 +600,19 @@ public:
   /// Tests whether the target is Android
   bool isAndroid() const { return getEnvironment() == Triple::Android; }
 
+  bool isAndroidVersionLT(unsigned Major) const {
+    assert(isAndroid() && "Not an Android triple!");
+
+    unsigned Env[3];
+    getEnvironmentVersion(Env[0], Env[1], Env[2]);
+
+    // 64-bit targets did not exist before API level 21 (Lollipop).
+    if (isArch64Bit() && Env[0] < 21)
+      Env[0] = 21;
+
+    return Env[0] < Major;
+  }
+
   /// Tests whether the environment is musl-libc
   bool isMusl() const {
     return getEnvironment() == Triple::Musl ||
diff --git a/include/llvm/ADT/ilist_iterator.h b/include/llvm/ADT/ilist_iterator.h
index ef532d2cf1729005a380efe4025f39c41310d13c..c848d1a134f19cf04204476d4cbd858583c73ed7 100644
--- a/include/llvm/ADT/ilist_iterator.h
+++ b/include/llvm/ADT/ilist_iterator.h
@@ -102,10 +102,23 @@ public:
     return *this;
   }
 
-  /// Convert from an iterator to its reverse.
+  /// Explicit conversion between forward/reverse iterators.
   ///
-  /// TODO: Roll this into the implicit constructor once we're sure that no one
-  /// is relying on the std::reverse_iterator off-by-one semantics.
+  /// Translate between forward and reverse iterators without changing range
+  /// boundaries.  The resulting iterator will dereference (and have a handle)
+  /// to the previous node, which is somewhat unexpected; but converting the
+  /// two endpoints in a range will give the same range in reverse.
+  ///
+  /// This matches std::reverse_iterator conversions.
+  explicit ilist_iterator(
+      const ilist_iterator<OptionsT, !IsReverse, IsConst> &RHS)
+      : ilist_iterator(++RHS.getReverse()) {}
+
+  /// Get a reverse iterator to the same node.
+  ///
+  /// Gives a reverse iterator that will dereference (and have a handle) to the
+  /// same node.  Converting the endpoint iterators in a range will give a
+  /// different range; for range operations, use the explicit conversions.
   ilist_iterator<OptionsT, !IsReverse, IsConst> getReverse() const {
     if (NodePtr)
       return ilist_iterator<OptionsT, !IsReverse, IsConst>(*NodePtr);
diff --git a/include/llvm/ADT/iterator.h b/include/llvm/ADT/iterator.h
index 6470e09db86cc6d86e2feeec23d6b44659de5419..28dcdf9613ef2479885f344f53f63bed18640825 100644
--- a/include/llvm/ADT/iterator.h
+++ b/include/llvm/ADT/iterator.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_ADT_ITERATOR_H
 #define LLVM_ADT_ITERATOR_H
 
+#include "llvm/ADT/iterator_range.h"
 #include <cstddef>
 #include <iterator>
 #include <type_traits>
@@ -91,6 +92,8 @@ protected:
 
 public:
   DerivedT operator+(DifferenceTypeT n) const {
+    static_assert(std::is_base_of<iterator_facade_base, DerivedT>::value,
+                  "Must pass the derived type to this template!");
     static_assert(
         IsRandomAccess,
         "The '+' operator is only defined for random access iterators.");
@@ -114,6 +117,8 @@ public:
   }
 
   DerivedT &operator++() {
+    static_assert(std::is_base_of<iterator_facade_base, DerivedT>::value,
+                  "Must pass the derived type to this template!");
     return static_cast<DerivedT *>(this)->operator+=(1);
   }
   DerivedT operator++(int) {
@@ -160,9 +165,15 @@ public:
     return !static_cast<const DerivedT *>(this)->operator<(RHS);
   }
 
+  PointerT operator->() { return &static_cast<DerivedT *>(this)->operator*(); }
   PointerT operator->() const {
     return &static_cast<const DerivedT *>(this)->operator*();
   }
+  ReferenceProxy operator[](DifferenceTypeT n) {
+    static_assert(IsRandomAccess,
+                  "Subscripting is only defined for random access iterators.");
+    return ReferenceProxy(static_cast<DerivedT *>(this)->operator+(n));
+  }
   ReferenceProxy operator[](DifferenceTypeT n) const {
     static_assert(IsRandomAccess,
                   "Subscripting is only defined for random access iterators.");
@@ -202,7 +213,10 @@ protected:
 
   iterator_adaptor_base() = default;
 
-  explicit iterator_adaptor_base(WrappedIteratorT u) : I(std::move(u)) {}
+  explicit iterator_adaptor_base(WrappedIteratorT u) : I(std::move(u)) {
+    static_assert(std::is_base_of<iterator_adaptor_base, DerivedT>::value,
+                  "Must pass the derived type to this template!");
+  }
 
   const WrappedIteratorT &wrapped() const { return I; }
 
@@ -283,6 +297,15 @@ struct pointee_iterator
   T &operator*() const { return **this->I; }
 };
 
+template <typename RangeT, typename WrappedIteratorT =
+                               decltype(std::begin(std::declval<RangeT>()))>
+iterator_range<pointee_iterator<WrappedIteratorT>>
+make_pointee_range(RangeT &&Range) {
+  using PointeeIteratorT = pointee_iterator<WrappedIteratorT>;
+  return make_range(PointeeIteratorT(std::begin(std::forward<RangeT>(Range))),
+                    PointeeIteratorT(std::end(std::forward<RangeT>(Range))));
+}
+
 template <typename WrappedIteratorT,
           typename T = decltype(&*std::declval<WrappedIteratorT>())>
 class pointer_iterator
@@ -300,6 +323,15 @@ public:
   const T &operator*() const { return Ptr = &*this->I; }
 };
 
+template <typename RangeT, typename WrappedIteratorT =
+                               decltype(std::begin(std::declval<RangeT>()))>
+iterator_range<pointer_iterator<WrappedIteratorT>>
+make_pointer_range(RangeT &&Range) {
+  using PointerIteratorT = pointer_iterator<WrappedIteratorT>;
+  return make_range(PointerIteratorT(std::begin(std::forward<RangeT>(Range))),
+                    PointerIteratorT(std::end(std::forward<RangeT>(Range))));
+}
+
 } // end namespace llvm
 
 #endif // LLVM_ADT_ITERATOR_H
diff --git a/include/llvm/Analysis/AliasAnalysis.h b/include/llvm/Analysis/AliasAnalysis.h
index 484df2f55d9b3880344446c6a186e2fb819ade73..1b8b9751faa19cba5e22f104166ca5722908ac18 100644
--- a/include/llvm/Analysis/AliasAnalysis.h
+++ b/include/llvm/Analysis/AliasAnalysis.h
@@ -524,6 +524,14 @@ public:
   /// Check whether or not an instruction may read or write the specified
   /// memory location.
   ///
+  /// Note explicitly that getModRefInfo considers the effects of reading and
+  /// writing the memory location, and not the effect of ordering relative to
+  /// other instructions.  Thus, a volatile load is considered to be Ref,
+  /// because it does not actually write memory, it just can't be reordered
+  /// relative to other volatiles (or removed).  Atomic ordered loads/stores are
+  /// considered ModRef ATM because conservatively, the visible effect appears
+  /// as if memory was written, not just an ordering constraint.
+  ///
   /// An instruction that doesn't read or write memory may be trivially LICM'd
   /// for example.
   ///
diff --git a/include/llvm/Analysis/AliasSetTracker.h b/include/llvm/Analysis/AliasSetTracker.h
index 5d11b22c6eed93133af27e79bb680a393fcfbb19..eac97501c759c73be37a6deb8f03baeb1b5278fb 100644
--- a/include/llvm/Analysis/AliasSetTracker.h
+++ b/include/llvm/Analysis/AliasSetTracker.h
@@ -121,7 +121,10 @@ class AliasSet : public ilist_node<AliasSet> {
   AliasSet *Forward;
 
   /// All instructions without a specific address in this alias set.
-  std::vector<AssertingVH<Instruction> > UnknownInsts;
+  /// In rare cases this vector can have null'ed out WeakVH
+  /// instances (can happen if some other loop pass deletes an
+  /// instruction in this list).
+  std::vector<WeakVH> UnknownInsts;
 
   /// Number of nodes pointing to this AliasSet plus the number of AliasSets
   /// forwarding to it.
@@ -171,7 +174,7 @@ class AliasSet : public ilist_node<AliasSet> {
 
   Instruction *getUnknownInst(unsigned i) const {
     assert(i < UnknownInsts.size());
-    return UnknownInsts[i];
+    return cast_or_null<Instruction>(UnknownInsts[i]);
   }
 
 public:
diff --git a/include/llvm/Analysis/AssumptionCache.h b/include/llvm/Analysis/AssumptionCache.h
index 72c67955a950f166f0430a4361b1dd173729e270..f833f417c7dd2e6e9ad473a316c6932fe56ceda6 100644
--- a/include/llvm/Analysis/AssumptionCache.h
+++ b/include/llvm/Analysis/AssumptionCache.h
@@ -31,11 +31,10 @@ namespace llvm {
 /// \brief A cache of @llvm.assume calls within a function.
 ///
 /// This cache provides fast lookup of assumptions within a function by caching
-/// them and amortizing the cost of scanning for them across all queries. The
-/// cache is also conservatively self-updating so that it will never return
-/// incorrect results about a function even as the function is being mutated.
-/// However, flushing the cache and rebuilding it (or explicitly updating it)
-/// may allow it to discover new assumptions.
+/// them and amortizing the cost of scanning for them across all queries. Passes
+/// that create new assumptions are required to call registerAssumption() to
+/// register any new @llvm.assume calls that they create. Deletions of
+/// @llvm.assume calls do not require special handling.
 class AssumptionCache {
   /// \brief The function for which this cache is handling assumptions.
   ///
@@ -203,7 +202,10 @@ public:
   AssumptionCacheTracker();
   ~AssumptionCacheTracker() override;
 
-  void releaseMemory() override { AssumptionCaches.shrink_and_clear(); }
+  void releaseMemory() override {
+    verifyAnalysis();
+    AssumptionCaches.shrink_and_clear();
+  }
 
   void verifyAnalysis() const override;
   bool doFinalization(Module &) override {
diff --git a/include/llvm/Analysis/BasicAliasAnalysis.h b/include/llvm/Analysis/BasicAliasAnalysis.h
index addfffa01061f61569f184f8b89dc0f15f7b21fa..14e4bded264a9dbd52faafae1f88c7f4be2f2b42 100644
--- a/include/llvm/Analysis/BasicAliasAnalysis.h
+++ b/include/llvm/Analysis/BasicAliasAnalysis.h
@@ -233,6 +233,24 @@ FunctionPass *createBasicAAWrapperPass();
 /// populated to the best of our ability for a particular function when inside
 /// of a \c ModulePass or a \c CallGraphSCCPass.
 BasicAAResult createLegacyPMBasicAAResult(Pass &P, Function &F);
+
+/// This class is a functor to be used in legacy module or SCC passes for
+/// computing AA results for a function. We store the results in fields so that
+/// they live long enough to be queried, but we re-use them each time.
+class LegacyAARGetter {
+  Pass &P;
+  Optional<BasicAAResult> BAR;
+  Optional<AAResults> AAR;
+
+public:
+  LegacyAARGetter(Pass &P) : P(P) {}
+  AAResults &operator()(Function &F) {
+    BAR.emplace(createLegacyPMBasicAAResult(P, F));
+    AAR.emplace(createLegacyPMAAResults(P, F, *BAR));
+    return *AAR;
+  }
+};
+
 }
 
 #endif
diff --git a/include/llvm/Analysis/BranchProbabilityInfo.h b/include/llvm/Analysis/BranchProbabilityInfo.h
index 14b7a7f529f79cba15203324163f03314b830105..6a876679543d4026ab4b163957bb6aaf037d944b 100644
--- a/include/llvm/Analysis/BranchProbabilityInfo.h
+++ b/include/llvm/Analysis/BranchProbabilityInfo.h
@@ -164,6 +164,8 @@ private:
   /// \brief Track the set of blocks that always lead to a cold call.
   SmallPtrSet<const BasicBlock *, 16> PostDominatedByColdCall;
 
+  void updatePostDominatedByUnreachable(const BasicBlock *BB);
+  void updatePostDominatedByColdCall(const BasicBlock *BB);
   bool calcUnreachableHeuristics(const BasicBlock *BB);
   bool calcMetadataWeights(const BasicBlock *BB);
   bool calcColdCallHeuristics(const BasicBlock *BB);
diff --git a/include/llvm/Analysis/CFGPrinter.h b/include/llvm/Analysis/CFGPrinter.h
index efaa9d6df8ea9dadfe79050ce450b2b9be9cc53c..5786769cc500ab9d622781b120c142e2a633bda6 100644
--- a/include/llvm/Analysis/CFGPrinter.h
+++ b/include/llvm/Analysis/CFGPrinter.h
@@ -140,8 +140,7 @@ struct DOTGraphTraits<const Function*> : public DefaultDOTGraphTraits {
 
       std::string Str;
       raw_string_ostream OS(Str);
-      SwitchInst::ConstCaseIt Case =
-          SwitchInst::ConstCaseIt::fromSuccessorIndex(SI, SuccNo);
+      auto Case = *SwitchInst::ConstCaseIt::fromSuccessorIndex(SI, SuccNo);
       OS << Case.getCaseValue()->getValue();
       return OS.str();
     }
diff --git a/include/llvm/Analysis/CGSCCPassManager.h b/include/llvm/Analysis/CGSCCPassManager.h
index 3832e23e854a57d5c4eaa8abae6fe2662332a723..398bbfb0c4132752e478899c4ab5d311eee66a0d 100644
--- a/include/llvm/Analysis/CGSCCPassManager.h
+++ b/include/llvm/Analysis/CGSCCPassManager.h
@@ -191,8 +191,8 @@ CGSCCAnalysisManagerModuleProxy::run(Module &M, ModuleAnalysisManager &AM);
 // template.
 extern template class InnerAnalysisManagerProxy<CGSCCAnalysisManager, Module>;
 
-extern template class OuterAnalysisManagerProxy<ModuleAnalysisManager,
-                                                LazyCallGraph::SCC>;
+extern template class OuterAnalysisManagerProxy<
+    ModuleAnalysisManager, LazyCallGraph::SCC, LazyCallGraph &>;
 /// A proxy from a \c ModuleAnalysisManager to an \c SCC.
 typedef OuterAnalysisManagerProxy<ModuleAnalysisManager, LazyCallGraph::SCC,
                                   LazyCallGraph &>
@@ -334,6 +334,7 @@ public:
                             InvalidSCCSet, nullptr,   nullptr};
 
     PreservedAnalyses PA = PreservedAnalyses::all();
+    CG.buildRefSCCs();
     for (auto RCI = CG.postorder_ref_scc_begin(),
               RCE = CG.postorder_ref_scc_end();
          RCI != RCE;) {
diff --git a/include/llvm/Analysis/ConstantFolding.h b/include/llvm/Analysis/ConstantFolding.h
index 517842c8b0dcb332fd975ed4c9013e1dabf47c7c..ff6ca1959153abbcf90febe6b3c1d2c88fd0f154 100644
--- a/include/llvm/Analysis/ConstantFolding.h
+++ b/include/llvm/Analysis/ConstantFolding.h
@@ -100,6 +100,12 @@ Constant *ConstantFoldExtractValueInstruction(Constant *Agg,
 /// successful; if not, null is returned.
 Constant *ConstantFoldExtractElementInstruction(Constant *Val, Constant *Idx);
 
+/// \brief Attempt to constant fold a shufflevector instruction with the
+/// specified operands and indices.  The constant result is returned if
+/// successful; if not, null is returned.
+Constant *ConstantFoldShuffleVectorInstruction(Constant *V1, Constant *V2,
+                                               Constant *Mask);
+
 /// ConstantFoldLoadFromConstPtr - Return the value that a load from C would
 /// produce if it is constant and determinable.  If this is not determinable,
 /// return null.
diff --git a/include/llvm/Analysis/IndirectCallSiteVisitor.h b/include/llvm/Analysis/IndirectCallSiteVisitor.h
index 71a8cb886321676b9e2919ca59cdaa8f05b1a4d0..3c40cc0235cc0b28a62952c5ccc342615eb3ff21 100644
--- a/include/llvm/Analysis/IndirectCallSiteVisitor.h
+++ b/include/llvm/Analysis/IndirectCallSiteVisitor.h
@@ -21,16 +21,8 @@ struct PGOIndirectCallSiteVisitor
   PGOIndirectCallSiteVisitor() {}
 
   void visitCallSite(CallSite CS) {
-    if (CS.getCalledFunction() || !CS.getCalledValue())
-      return;
-    Instruction *I = CS.getInstruction();
-    if (CallInst *CI = dyn_cast<CallInst>(I)) {
-      if (CI->isInlineAsm())
-        return;
-    }
-    if (isa<Constant>(CS.getCalledValue()))
-      return;
-    IndirectCallInsts.push_back(I);
+    if (CS.isIndirectCall())
+      IndirectCallInsts.push_back(CS.getInstruction());
   }
 };
 
diff --git a/include/llvm/Analysis/InstructionSimplify.h b/include/llvm/Analysis/InstructionSimplify.h
index 47d6118313cb11fe79ed2570fe88b4b2a44d8637..b829e995db055fff064d57ea6e60e7d1e83f5c26 100644
--- a/include/llvm/Analysis/InstructionSimplify.h
+++ b/include/llvm/Analysis/InstructionSimplify.h
@@ -42,6 +42,7 @@ namespace llvm {
   class Instruction;
   class DataLayout;
   class FastMathFlags;
+  class OptimizationRemarkEmitter;
   class TargetLibraryInfo;
   class Type;
   class Value;
@@ -246,6 +247,14 @@ namespace llvm {
                           AssumptionCache *AC = nullptr,
                           const Instruction *CxtI = nullptr);
 
+  /// Given operands for a ShuffleVectorInst, fold the result or return null.
+  Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask,
+                                   Type *RetTy, const DataLayout &DL,
+                                   const TargetLibraryInfo *TLI = nullptr,
+                                   const DominatorTree *DT = nullptr,
+                                   AssumptionCache *AC = nullptr,
+                                   const Instruction *CxtI = nullptr);
+
   //=== Helper functions for higher up the class hierarchy.
 
 
@@ -296,7 +305,8 @@ namespace llvm {
   Value *SimplifyInstruction(Instruction *I, const DataLayout &DL,
                              const TargetLibraryInfo *TLI = nullptr,
                              const DominatorTree *DT = nullptr,
-                             AssumptionCache *AC = nullptr);
+                             AssumptionCache *AC = nullptr,
+                             OptimizationRemarkEmitter *ORE = nullptr);
 
   /// Replace all uses of 'I' with 'SimpleV' and simplify the uses recursively.
   ///
diff --git a/include/llvm/Analysis/LazyBlockFrequencyInfo.h b/include/llvm/Analysis/LazyBlockFrequencyInfo.h
index 5a02b9dce46317fda16935f69e7cefa14e482851..71ce0842f6a994f833f36520da9eae4ed01aba85 100644
--- a/include/llvm/Analysis/LazyBlockFrequencyInfo.h
+++ b/include/llvm/Analysis/LazyBlockFrequencyInfo.h
@@ -9,7 +9,7 @@
 //
 // This is an alternative analysis pass to BlockFrequencyInfoWrapperPass.  The
 // difference is that with this pass the block frequencies are not computed when
-// the analysis pass is executed but rather when the BFI results is explicitly
+// the analysis pass is executed but rather when the BFI result is explicitly
 // requested by the analysis client.
 //
 //===----------------------------------------------------------------------===//
@@ -27,10 +27,58 @@ class BranchProbabilityInfo;
 class Function;
 class LoopInfo;
 
+/// Wraps a BFI to allow lazy computation of the block frequencies.
+///
+/// A pass that only conditionally uses BFI can unconditionally require the
+/// analysis without paying for the overhead if BFI doesn't end up being used.
+template <typename FunctionT, typename BranchProbabilityInfoPassT,
+          typename LoopInfoT, typename BlockFrequencyInfoT>
+class LazyBlockFrequencyInfo {
+public:
+  LazyBlockFrequencyInfo()
+      : Calculated(false), F(nullptr), BPIPass(nullptr), LI(nullptr) {}
+
+  /// Set up the per-function input.
+  void setAnalysis(const FunctionT *F, BranchProbabilityInfoPassT *BPIPass,
+                   const LoopInfoT *LI) {
+    this->F = F;
+    this->BPIPass = BPIPass;
+    this->LI = LI;
+  }
+
+  /// Retrieve the BFI with the block frequencies computed.
+  BlockFrequencyInfoT &getCalculated() {
+    if (!Calculated) {
+      assert(F && BPIPass && LI && "call setAnalysis");
+      BFI.calculate(
+          *F, BPIPassTrait<BranchProbabilityInfoPassT>::getBPI(BPIPass), *LI);
+      Calculated = true;
+    }
+    return BFI;
+  }
+
+  const BlockFrequencyInfoT &getCalculated() const {
+    return const_cast<LazyBlockFrequencyInfo *>(this)->getCalculated();
+  }
+
+  void releaseMemory() {
+    BFI.releaseMemory();
+    Calculated = false;
+    setAnalysis(nullptr, nullptr, nullptr);
+  }
+
+private:
+  BlockFrequencyInfoT BFI;
+  bool Calculated;
+  const FunctionT *F;
+  BranchProbabilityInfoPassT *BPIPass;
+  const LoopInfoT *LI;
+};
+
 /// \brief This is an alternative analysis pass to
 /// BlockFrequencyInfoWrapperPass.  The difference is that with this pass the
 /// block frequencies are not computed when the analysis pass is executed but
-/// rather when the BFI results is explicitly requested by the analysis client.
+/// rather when the BFI result is explicitly requested by the analysis client.
 ///
 /// There are some additional requirements for any client pass that wants to use
 /// the analysis:
@@ -49,54 +97,12 @@ class LoopInfo;
 ///
 /// Note that it is expected that we wouldn't need this functionality for the
 /// new PM since with the new PM, analyses are executed on demand.
-class LazyBlockFrequencyInfoPass : public FunctionPass {
-
-  /// Wraps a BFI to allow lazy computation of the block frequencies.
-  ///
-  /// A pass that only conditionally uses BFI can uncondtionally require the
-  /// analysis without paying for the overhead if BFI doesn't end up being used.
-  class LazyBlockFrequencyInfo {
-  public:
-    LazyBlockFrequencyInfo()
-        : Calculated(false), F(nullptr), BPIPass(nullptr), LI(nullptr) {}
-
-    /// Set up the per-function input.
-    void setAnalysis(const Function *F, LazyBranchProbabilityInfoPass *BPIPass,
-                     const LoopInfo *LI) {
-      this->F = F;
-      this->BPIPass = BPIPass;
-      this->LI = LI;
-    }
 
-    /// Retrieve the BFI with the block frequencies computed.
-    BlockFrequencyInfo &getCalculated() {
-      if (!Calculated) {
-        assert(F && BPIPass && LI && "call setAnalysis");
-        BFI.calculate(*F, BPIPass->getBPI(), *LI);
-        Calculated = true;
-      }
-      return BFI;
-    }
-
-    const BlockFrequencyInfo &getCalculated() const {
-      return const_cast<LazyBlockFrequencyInfo *>(this)->getCalculated();
-    }
-
-    void releaseMemory() {
-      BFI.releaseMemory();
-      Calculated = false;
-      setAnalysis(nullptr, nullptr, nullptr);
-    }
-
-  private:
-    BlockFrequencyInfo BFI;
-    bool Calculated;
-    const Function *F;
-    LazyBranchProbabilityInfoPass *BPIPass;
-    const LoopInfo *LI;
-  };
-
-  LazyBlockFrequencyInfo LBFI;
+class LazyBlockFrequencyInfoPass : public FunctionPass {
+private:
+  LazyBlockFrequencyInfo<Function, LazyBranchProbabilityInfoPass, LoopInfo,
+                         BlockFrequencyInfo>
+      LBFI;
 
 public:
   static char ID;
diff --git a/include/llvm/Analysis/LazyBranchProbabilityInfo.h b/include/llvm/Analysis/LazyBranchProbabilityInfo.h
index c76fa1e819ae9e2ae69777ad22ea36815d3fc11b..067d7ebfd1f53808be3ddd9561a2b7ac68e04e7f 100644
--- a/include/llvm/Analysis/LazyBranchProbabilityInfo.h
+++ b/include/llvm/Analysis/LazyBranchProbabilityInfo.h
@@ -105,5 +105,17 @@ public:
 
 /// \brief Helper for client passes to initialize dependent passes for LBPI.
 void initializeLazyBPIPassPass(PassRegistry &Registry);
+
+/// \brief Simple trait class that provides a mapping between BPI passes and the
+/// corresponding BPInfo.
+template <typename PassT> struct BPIPassTrait {
+  static PassT &getBPI(PassT *P) { return *P; }
+};
+
+template <> struct BPIPassTrait<LazyBranchProbabilityInfoPass> {
+  static BranchProbabilityInfo &getBPI(LazyBranchProbabilityInfoPass *P) {
+    return P->getBPI();
+  }
+};
 }
 #endif
diff --git a/include/llvm/Analysis/LazyCallGraph.h b/include/llvm/Analysis/LazyCallGraph.h
index bca0aebe2eefa8a1f90fd9eed191e38e88ef1acc..ad7f5c80549fc0bb6195e299defb96f22f56b441 100644
--- a/include/llvm/Analysis/LazyCallGraph.h
+++ b/include/llvm/Analysis/LazyCallGraph.h
@@ -106,6 +106,7 @@ class raw_ostream;
 class LazyCallGraph {
 public:
   class Node;
+  class EdgeSequence;
   class SCC;
   class RefSCC;
   class edge_iterator;
@@ -121,16 +122,6 @@ public:
   /// inherently reference edges, and so the reference graph forms a superset
   /// of the formal call graph.
   ///
-  /// Furthermore, edges also may point to raw \c Function objects when those
-  /// functions have not been scanned and incorporated into the graph (yet).
-  /// This is one of the primary ways in which the graph can be lazy. When
-  /// functions are scanned and fully incorporated into the graph, all of the
-  /// edges referencing them are updated to point to the graph \c Node objects
-  /// instead of to the raw \c Function objects. This class even provides
-  /// methods to trigger this scan on-demand by attempting to get the target
-  /// node of the graph and providing a reference back to the graph in order to
-  /// lazily build it if necessary.
-  ///
   /// All of these forms of edges are fundamentally represented as outgoing
   /// edges. The edges are stored in the source node and point at the target
   /// node. This allows the edge structure itself to be a very compact data
@@ -141,7 +132,6 @@ public:
     enum Kind : bool { Ref = false, Call = true };
 
     Edge();
-    explicit Edge(Function &F, Kind K);
     explicit Edge(Node &N, Kind K);
 
     /// Test whether the edge is null.
@@ -158,197 +148,251 @@ public:
     /// This requires that the edge is not null.
     bool isCall() const;
 
-    /// Get the function referenced by this edge.
-    ///
-    /// This requires that the edge is not null, but will succeed whether we
-    /// have built a graph node for the function yet or not.
-    Function &getFunction() const;
-
-    /// Get the call graph node referenced by this edge if one exists.
+    /// Get the call graph node referenced by this edge.
     ///
-    /// This requires that the edge is not null. If we have built a graph node
-    /// for the function this edge points to, this will return that node,
-    /// otherwise it will return null.
-    Node *getNode() const;
+    /// This requires that the edge is not null.
+    Node &getNode() const;
 
-    /// Get the call graph node for this edge, building it if necessary.
+    /// Get the function referenced by this edge.
     ///
-    /// This requires that the edge is not null. If we have not yet built
-    /// a graph node for the function this edge points to, this will first ask
-    /// the graph to build that node, inserting it into all the relevant
-    /// structures.
-    Node &getNode(LazyCallGraph &G);
+    /// This requires that the edge is not null.
+    Function &getFunction() const;
 
   private:
-    friend class LazyCallGraph::Node;
+    friend class LazyCallGraph::EdgeSequence;
     friend class LazyCallGraph::RefSCC;
 
-    PointerIntPair<PointerUnion<Function *, Node *>, 1, Kind> Value;
+    PointerIntPair<Node *, 1, Kind> Value;
 
     void setKind(Kind K) { Value.setInt(K); }
   };
 
-  typedef SmallVector<Edge, 4> EdgeVectorT;
-  typedef SmallVectorImpl<Edge> EdgeVectorImplT;
-
-  /// A node in the call graph.
+  /// The edge sequence object.
   ///
-  /// This represents a single node. It's primary roles are to cache the list of
-  /// callees, de-duplicate and provide fast testing of whether a function is
-  /// a callee, and facilitate iteration of child nodes in the graph.
-  class Node {
+  /// This typically exists entirely within the node but is exposed as
+  /// a separate type because a node doesn't initially have edges. An explicit
+  /// population step is required to produce this sequence at first and it is
+  /// then cached in the node. It is also used to represent edges entering the
+  /// graph from outside the module to model the graph's roots.
+  ///
+  /// The sequence itself is both iterable and indexable. The indexes remain
+  /// stable even as the sequence mutates (including removal).
+  class EdgeSequence {
     friend class LazyCallGraph;
-    friend class LazyCallGraph::SCC;
+    friend class LazyCallGraph::Node;
     friend class LazyCallGraph::RefSCC;
 
-    LazyCallGraph *G;
-    Function &F;
+    typedef SmallVector<Edge, 4> VectorT;
+    typedef SmallVectorImpl<Edge> VectorImplT;
 
-    // We provide for the DFS numbering and Tarjan walk lowlink numbers to be
-    // stored directly within the node. These are both '-1' when nodes are part
-    // of an SCC (or RefSCC), or '0' when not yet reached in a DFS walk.
-    int DFSNumber;
-    int LowLink;
+  public:
+    /// An iterator used for the edges to both entry nodes and child nodes.
+    class iterator
+        : public iterator_adaptor_base<iterator, VectorImplT::iterator,
+                                       std::forward_iterator_tag> {
+      friend class LazyCallGraph;
+      friend class LazyCallGraph::Node;
+
+      VectorImplT::iterator E;
+
+      // Build the iterator for a specific position in the edge list.
+      iterator(VectorImplT::iterator BaseI, VectorImplT::iterator E)
+          : iterator_adaptor_base(BaseI), E(E) {
+        while (I != E && !*I)
+          ++I;
+      }
 
-    mutable EdgeVectorT Edges;
-    DenseMap<Function *, int> EdgeIndexMap;
+    public:
+      iterator() {}
 
-    /// Basic constructor implements the scanning of F into Edges and
-    /// EdgeIndexMap.
-    Node(LazyCallGraph &G, Function &F);
+      using iterator_adaptor_base::operator++;
+      iterator &operator++() {
+        do {
+          ++I;
+        } while (I != E && !*I);
+        return *this;
+      }
+    };
 
-    /// Internal helper to insert an edge to a function.
-    void insertEdgeInternal(Function &ChildF, Edge::Kind EK);
+    /// An iterator over specifically call edges.
+    ///
+    /// This has the same iteration properties as the \c iterator, but
+    /// restricts itself to edges which represent actual calls.
+    class call_iterator
+        : public iterator_adaptor_base<call_iterator, VectorImplT::iterator,
+                                       std::forward_iterator_tag> {
+      friend class LazyCallGraph;
+      friend class LazyCallGraph::Node;
+
+      VectorImplT::iterator E;
+
+      /// Advance the iterator to the next valid, call edge.
+      void advanceToNextEdge() {
+        while (I != E && (!*I || !I->isCall()))
+          ++I;
+      }
 
-    /// Internal helper to insert an edge to a node.
-    void insertEdgeInternal(Node &ChildN, Edge::Kind EK);
+      // Build the iterator for a specific position in the edge list.
+      call_iterator(VectorImplT::iterator BaseI, VectorImplT::iterator E)
+          : iterator_adaptor_base(BaseI), E(E) {
+        advanceToNextEdge();
+      }
 
-    /// Internal helper to change an edge kind.
-    void setEdgeKind(Function &ChildF, Edge::Kind EK);
+    public:
+      call_iterator() {}
 
-    /// Internal helper to remove the edge to the given function.
-    void removeEdgeInternal(Function &ChildF);
+      using iterator_adaptor_base::operator++;
+      call_iterator &operator++() {
+        ++I;
+        advanceToNextEdge();
+        return *this;
+      }
+    };
 
-    void clear() {
-      Edges.clear();
-      EdgeIndexMap.clear();
-    }
+    iterator begin() { return iterator(Edges.begin(), Edges.end()); }
+    iterator end() { return iterator(Edges.end(), Edges.end()); }
 
-    /// Print the name of this node's function.
-    friend raw_ostream &operator<<(raw_ostream &OS, const Node &N) {
-      return OS << N.F.getName();
+    Edge &operator[](int i) { return Edges[i]; }
+    Edge &operator[](Node &N) {
+      assert(EdgeIndexMap.find(&N) != EdgeIndexMap.end() && "No such edge!");
+      return Edges[EdgeIndexMap.find(&N)->second];
     }
-
-    /// Dump the name of this node's function to stderr.
-    void dump() const;
-
-  public:
-    LazyCallGraph &getGraph() const { return *G; }
-
-    Function &getFunction() const { return F; }
-
-    edge_iterator begin() const {
-      return edge_iterator(Edges.begin(), Edges.end());
+    Edge *lookup(Node &N) {
+      auto EI = EdgeIndexMap.find(&N);
+      return EI != EdgeIndexMap.end() ? &Edges[EI->second] : nullptr;
     }
-    edge_iterator end() const { return edge_iterator(Edges.end(), Edges.end()); }
 
-    const Edge &operator[](int i) const { return Edges[i]; }
-    const Edge &operator[](Function &F) const {
-      assert(EdgeIndexMap.find(&F) != EdgeIndexMap.end() && "No such edge!");
-      return Edges[EdgeIndexMap.find(&F)->second];
+    call_iterator call_begin() {
+      return call_iterator(Edges.begin(), Edges.end());
     }
-    const Edge &operator[](Node &N) const { return (*this)[N.getFunction()]; }
+    call_iterator call_end() { return call_iterator(Edges.end(), Edges.end()); }
 
-    const Edge *lookup(Function &F) const {
-      auto EI = EdgeIndexMap.find(&F);
-      return EI != EdgeIndexMap.end() ? &Edges[EI->second] : nullptr;
+    iterator_range<call_iterator> calls() {
+      return make_range(call_begin(), call_end());
     }
 
-    call_edge_iterator call_begin() const {
-      return call_edge_iterator(Edges.begin(), Edges.end());
-    }
-    call_edge_iterator call_end() const {
-      return call_edge_iterator(Edges.end(), Edges.end());
-    }
+    bool empty() {
+      for (auto &E : Edges)
+        if (E)
+          return false;
 
-    iterator_range<call_edge_iterator> calls() const {
-      return make_range(call_begin(), call_end());
+      return true;
     }
 
-    /// Equality is defined as address equality.
-    bool operator==(const Node &N) const { return this == &N; }
-    bool operator!=(const Node &N) const { return !operator==(N); }
-  };
+  private:
+    VectorT Edges;
+    DenseMap<Node *, int> EdgeIndexMap;
 
-  /// A lazy iterator used for both the entry nodes and child nodes.
-  ///
-  /// When this iterator is dereferenced, if not yet available, a function will
-  /// be scanned for "calls" or uses of functions and its child information
-  /// will be constructed. All of these results are accumulated and cached in
-  /// the graph.
-  class edge_iterator
-      : public iterator_adaptor_base<edge_iterator, EdgeVectorImplT::iterator,
-                                     std::forward_iterator_tag> {
-    friend class LazyCallGraph;
-    friend class LazyCallGraph::Node;
+    EdgeSequence() = default;
 
-    EdgeVectorImplT::iterator E;
+    /// Internal helper to insert an edge to a node.
+    void insertEdgeInternal(Node &ChildN, Edge::Kind EK);
 
-    // Build the iterator for a specific position in the edge list.
-    edge_iterator(EdgeVectorImplT::iterator BaseI,
-                  EdgeVectorImplT::iterator E)
-        : iterator_adaptor_base(BaseI), E(E) {
-      while (I != E && !*I)
-        ++I;
-    }
+    /// Internal helper to change an edge kind.
+    void setEdgeKind(Node &ChildN, Edge::Kind EK);
 
-  public:
-    edge_iterator() {}
+    /// Internal helper to remove the edge to the given node.
+    bool removeEdgeInternal(Node &ChildN);
 
-    using iterator_adaptor_base::operator++;
-    edge_iterator &operator++() {
-      do {
-        ++I;
-      } while (I != E && !*I);
-      return *this;
-    }
+    /// Internal helper to replace an edge key with a new one.
+    ///
+    /// This should be used when the function for a particular node in the
+    /// graph gets replaced and we are updating all of the edges to that node
+    /// to use the new function as the key.
+    void replaceEdgeKey(Function &OldTarget, Function &NewTarget);
   };
 
-  /// A lazy iterator over specifically call edges.
+  /// A node in the call graph.
+  ///
+  /// This represents a single node. Its primary roles are to cache the list of
+  /// callees, de-duplicate and provide fast testing of whether a function is
+  /// a callee, and facilitate iteration of child nodes in the graph.
   ///
-  /// This has the same iteration properties as the \c edge_iterator, but
-  /// restricts itself to edges which represent actual calls.
-  class call_edge_iterator
-      : public iterator_adaptor_base<call_edge_iterator,
-                                     EdgeVectorImplT::iterator,
-                                     std::forward_iterator_tag> {
+  /// The node works much like an optional in order to lazily populate the
+  /// edges of each node. Until populated, there are no edges. Once populated,
+  /// you can access the edges by dereferencing the node or using the `->`
+  /// operator as if the node was an `Optional<EdgeSequence>`.
+  class Node {
     friend class LazyCallGraph;
-    friend class LazyCallGraph::Node;
+    friend class LazyCallGraph::RefSCC;
 
-    EdgeVectorImplT::iterator E;
+  public:
+    LazyCallGraph &getGraph() const { return *G; }
 
-    /// Advance the iterator to the next valid, call edge.
-    void advanceToNextEdge() {
-      while (I != E && (!*I || !I->isCall()))
-        ++I;
+    Function &getFunction() const { return *F; }
+
+    StringRef getName() const { return F->getName(); }
+
+    /// Equality is defined as address equality.
+    bool operator==(const Node &N) const { return this == &N; }
+    bool operator!=(const Node &N) const { return !operator==(N); }
+
+    /// Tests whether the node has been populated with edges.
+    operator bool() const { return Edges.hasValue(); }
+
+    // We allow accessing the edges by dereferencing or using the arrow
+    // operator, essentially wrapping the internal optional.
+    EdgeSequence &operator*() const {
+      // Rip const off because the node itself isn't changing here.
+      return const_cast<EdgeSequence &>(*Edges);
     }
+    EdgeSequence *operator->() const { return &**this; }
 
-    // Build the iterator for a specific position in the edge list.
-    call_edge_iterator(EdgeVectorImplT::iterator BaseI,
-                       EdgeVectorImplT::iterator E)
-        : iterator_adaptor_base(BaseI), E(E) {
-      advanceToNextEdge();
+    /// Populate the edges of this node if necessary.
+    ///
+    /// The first time this is called it will populate the edges for this node
+    /// in the graph. It does this by scanning the underlying function, so once
+    /// this is done, any changes to that function must be explicitly reflected
+    /// in updates to the graph.
+    ///
+    /// \returns the populated \c EdgeSequence to simplify walking it.
+    ///
+    /// This will not update or re-scan anything if called repeatedly. Instead,
+    /// the edge sequence is cached and returned immediately on subsequent
+    /// calls.
+    EdgeSequence &populate() {
+      if (Edges)
+        return *Edges;
+
+      return populateSlow();
     }
 
-  public:
-    call_edge_iterator() {}
+  private:
+    LazyCallGraph *G;
+    Function *F;
 
-    using iterator_adaptor_base::operator++;
-    call_edge_iterator &operator++() {
-      ++I;
-      advanceToNextEdge();
-      return *this;
+    // We provide for the DFS numbering and Tarjan walk lowlink numbers to be
+    // stored directly within the node. These are both '-1' when nodes are part
+    // of an SCC (or RefSCC), or '0' when not yet reached in a DFS walk.
+    int DFSNumber;
+    int LowLink;
+
+    Optional<EdgeSequence> Edges;
+
+    /// Basic constructor: records the graph and function only. The edges are
+    /// not scanned here; see \c populate() for the lazy scan.
+    Node(LazyCallGraph &G, Function &F)
+        : G(&G), F(&F), DFSNumber(0), LowLink(0) {}
+
+    /// Implementation of the scan when populating.
+    EdgeSequence &populateSlow();
+
+    /// Internal helper to directly replace the function with a new one.
+    ///
+    /// This is used to facilitate transformations which need to replace the
+    /// formal Function object but directly move the body and users from one to
+    /// the other.
+    void replaceFunction(Function &NewF);
+
+    void clear() { Edges.reset(); }
+
+    /// Print the name of this node's function.
+    friend raw_ostream &operator<<(raw_ostream &OS, const Node &N) {
+      return OS << N.F->getName();
     }
+
+    /// Dump the name of this node's function to stderr.
+    void dump() const;
   };
 
   /// An SCC of the call graph.
@@ -789,19 +833,26 @@ public:
     /// already existing edges.
     void insertTrivialRefEdge(Node &SourceN, Node &TargetN);
 
+    /// Directly replace a node's function with a new function.
+    ///
+    /// This should be used when moving the body and users of a function to
+    /// a new formal function object but not otherwise changing the call graph
+    /// structure in any way.
+    ///
+    /// It requires that the old function in the provided node have zero uses
+    /// and the new function must have calls and references to it establishing
+    /// an equivalent graph.
+    void replaceNodeFunction(Node &N, Function &NewF);
+
     ///@}
   };
 
   /// A post-order depth-first RefSCC iterator over the call graph.
   ///
-  /// This iterator triggers the Tarjan DFS-based formation of the RefSCC (and
-  /// SCC) DAG for the call graph, walking it lazily in depth-first post-order.
-  /// That is, it always visits RefSCCs for the target of a reference edge
-  /// prior to visiting the RefSCC for a source of the edge (when they are in
-  /// different RefSCCs).
-  ///
-  /// When forming each RefSCC, the call edges within it are used to form SCCs
-  /// within it, so iterating this also controls the lazy formation of SCCs.
+  /// This iterator walks the cached post-order sequence of RefSCCs. However,
+  /// it trades stability for flexibility. It is restricted to a forward
+  /// iterator but will survive mutations which insert new RefSCCs and continue
+  /// to point to the same RefSCC even if it moves in the post-order sequence.
   class postorder_ref_scc_iterator
       : public iterator_facade_base<postorder_ref_scc_iterator,
                                     std::forward_iterator_tag, RefSCC> {
@@ -825,12 +876,9 @@ public:
     /// populating it if necessary.
     static RefSCC *getRC(LazyCallGraph &G, int Index) {
       if (Index == (int)G.PostOrderRefSCCs.size())
-        if (!G.buildNextRefSCCInPostOrder())
-          // We're at the end.
-          return nullptr;
+        // We're at the end.
+        return nullptr;
 
-      assert(Index < (int)G.PostOrderRefSCCs.size() &&
-             "Built the next post-order RefSCC without growing list!");
       return G.PostOrderRefSCCs[Index];
     }
 
@@ -859,17 +907,21 @@ public:
   LazyCallGraph(LazyCallGraph &&G);
   LazyCallGraph &operator=(LazyCallGraph &&RHS);
 
-  edge_iterator begin() {
-    return edge_iterator(EntryEdges.begin(), EntryEdges.end());
-  }
-  edge_iterator end() {
-    return edge_iterator(EntryEdges.end(), EntryEdges.end());
-  }
+  EdgeSequence::iterator begin() { return EntryEdges.begin(); }
+  EdgeSequence::iterator end() { return EntryEdges.end(); }
+
+  void buildRefSCCs();
 
   postorder_ref_scc_iterator postorder_ref_scc_begin() {
+    if (!EntryEdges.empty())
+      assert(!PostOrderRefSCCs.empty() &&
+             "Must form RefSCCs before iterating them!");
     return postorder_ref_scc_iterator(*this);
   }
   postorder_ref_scc_iterator postorder_ref_scc_end() {
+    if (!EntryEdges.empty())
+      assert(!PostOrderRefSCCs.empty() &&
+             "Must form RefSCCs before iterating them!");
     return postorder_ref_scc_iterator(*this,
                                       postorder_ref_scc_iterator::IsAtEndT());
   }
@@ -920,19 +972,19 @@ public:
   /// below.
 
   /// Update the call graph after inserting a new edge.
-  void insertEdge(Node &Caller, Function &Callee, Edge::Kind EK);
+  void insertEdge(Node &SourceN, Node &TargetN, Edge::Kind EK);
 
   /// Update the call graph after inserting a new edge.
-  void insertEdge(Function &Caller, Function &Callee, Edge::Kind EK) {
-    return insertEdge(get(Caller), Callee, EK);
+  void insertEdge(Function &Source, Function &Target, Edge::Kind EK) {
+    return insertEdge(get(Source), get(Target), EK);
   }
 
   /// Update the call graph after deleting an edge.
-  void removeEdge(Node &Caller, Function &Callee);
+  void removeEdge(Node &SourceN, Node &TargetN);
 
   /// Update the call graph after deleting an edge.
-  void removeEdge(Function &Caller, Function &Callee) {
-    return removeEdge(get(Caller), Callee);
+  void removeEdge(Function &Source, Function &Target) {
+    return removeEdge(get(Source), get(Target));
   }
 
   ///@}
@@ -1013,14 +1065,11 @@ private:
   /// Maps function->node for fast lookup.
   DenseMap<const Function *, Node *> NodeMap;
 
-  /// The entry nodes to the graph.
+  /// The entry edges into the graph.
   ///
-  /// These nodes are reachable through "external" means. Put another way, they
+  /// These edges are from "external" sources. Put another way, they
   /// escape at the module scope.
-  EdgeVectorT EntryEdges;
-
-  /// Map of the entry nodes in the graph to their indices in \c EntryEdges.
-  DenseMap<Function *, int> EntryIndexMap;
+  EdgeSequence EntryEdges;
 
   /// Allocator that holds all the call graph SCCs.
   SpecificBumpPtrAllocator<SCC> SCCBPA;
@@ -1045,18 +1094,6 @@ private:
   /// These are all of the RefSCCs which have no children.
   SmallVector<RefSCC *, 4> LeafRefSCCs;
 
-  /// Stack of nodes in the DFS walk.
-  SmallVector<std::pair<Node *, edge_iterator>, 4> DFSStack;
-
-  /// Set of entry nodes not-yet-processed into RefSCCs.
-  SmallVector<Function *, 4> RefSCCEntryNodes;
-
-  /// Stack of nodes the DFS has walked but not yet put into a RefSCC.
-  SmallVector<Node *, 4> PendingRefSCCStack;
-
-  /// Counter for the next DFS number to assign.
-  int NextDFSNumber;
-
   /// Helper to insert a new function, with an already looked-up entry in
   /// the NodeMap.
   Node &insertInto(Function &F, Node *&MappedN);
@@ -1078,6 +1115,23 @@ private:
     return new (RefSCCBPA.Allocate()) RefSCC(std::forward<Ts>(Args)...);
   }
 
+  /// Common logic for building SCCs from a sequence of roots.
+  ///
+  /// This is a very generic implementation of the depth-first walk and SCC
+  /// formation algorithm. It uses a generic sequence of roots and generic
+  /// callbacks for each step. This is designed to be used to implement both
+  /// the RefSCC formation and SCC formation with shared logic.
+  ///
+  /// Currently this is a relatively naive implementation of Tarjan's DFS
+  /// algorithm to form the SCCs.
+  ///
+  /// FIXME: We should consider newer variants such as Nuutila.
+  template <typename RootsT, typename GetBeginT, typename GetEndT,
+            typename GetNodeT, typename FormSCCCallbackT>
+  static void buildGenericSCCs(RootsT &&Roots, GetBeginT &&GetBegin,
+                               GetEndT &&GetEnd, GetNodeT &&GetNode,
+                               FormSCCCallbackT &&FormSCC);
+
   /// Build the SCCs for a RefSCC out of a list of nodes.
   void buildSCCs(RefSCC &RC, node_stack_range Nodes);
 
@@ -1098,22 +1152,12 @@ private:
            "Index does not point back at RC!");
     return IndexIt->second;
   }
-
-  /// Builds the next node in the post-order RefSCC walk of the call graph and
-  /// appends it to the \c PostOrderRefSCCs vector.
-  ///
-  /// Returns true if a new RefSCC was successfully constructed, and false if
-  /// there are no more RefSCCs to build in the graph.
-  bool buildNextRefSCCInPostOrder();
 };
 
 inline LazyCallGraph::Edge::Edge() : Value() {}
-inline LazyCallGraph::Edge::Edge(Function &F, Kind K) : Value(&F, K) {}
 inline LazyCallGraph::Edge::Edge(Node &N, Kind K) : Value(&N, K) {}
 
-inline LazyCallGraph::Edge::operator bool() const {
-  return !Value.getPointer().isNull();
-}
+inline LazyCallGraph::Edge::operator bool() const { return Value.getPointer(); }
 
 inline LazyCallGraph::Edge::Kind LazyCallGraph::Edge::getKind() const {
   assert(*this && "Queried a null edge!");
@@ -1125,51 +1169,32 @@ inline bool LazyCallGraph::Edge::isCall() const {
   return getKind() == Call;
 }
 
-inline Function &LazyCallGraph::Edge::getFunction() const {
+inline LazyCallGraph::Node &LazyCallGraph::Edge::getNode() const {
   assert(*this && "Queried a null edge!");
-  auto P = Value.getPointer();
-  if (auto *F = P.dyn_cast<Function *>())
-    return *F;
-
-  return P.get<Node *>()->getFunction();
+  return *Value.getPointer();
 }
 
-inline LazyCallGraph::Node *LazyCallGraph::Edge::getNode() const {
-  assert(*this && "Queried a null edge!");
-  auto P = Value.getPointer();
-  if (auto *N = P.dyn_cast<Node *>())
-    return N;
-
-  return nullptr;
-}
-
-inline LazyCallGraph::Node &LazyCallGraph::Edge::getNode(LazyCallGraph &G) {
+inline Function &LazyCallGraph::Edge::getFunction() const {
   assert(*this && "Queried a null edge!");
-  auto P = Value.getPointer();
-  if (auto *N = P.dyn_cast<Node *>())
-    return *N;
-
-  Node &N = G.get(*P.get<Function *>());
-  Value.setPointer(&N);
-  return N;
+  return getNode().getFunction();
 }
 
 // Provide GraphTraits specializations for call graphs.
 template <> struct GraphTraits<LazyCallGraph::Node *> {
   typedef LazyCallGraph::Node *NodeRef;
-  typedef LazyCallGraph::edge_iterator ChildIteratorType;
+  typedef LazyCallGraph::EdgeSequence::iterator ChildIteratorType;
 
   static NodeRef getEntryNode(NodeRef N) { return N; }
-  static ChildIteratorType child_begin(NodeRef N) { return N->begin(); }
-  static ChildIteratorType child_end(NodeRef N) { return N->end(); }
+  static ChildIteratorType child_begin(NodeRef N) { return (*N)->begin(); }
+  static ChildIteratorType child_end(NodeRef N) { return (*N)->end(); }
 };
 template <> struct GraphTraits<LazyCallGraph *> {
   typedef LazyCallGraph::Node *NodeRef;
-  typedef LazyCallGraph::edge_iterator ChildIteratorType;
+  typedef LazyCallGraph::EdgeSequence::iterator ChildIteratorType;
 
   static NodeRef getEntryNode(NodeRef N) { return N; }
-  static ChildIteratorType child_begin(NodeRef N) { return N->begin(); }
-  static ChildIteratorType child_end(NodeRef N) { return N->end(); }
+  static ChildIteratorType child_begin(NodeRef N) { return (*N)->begin(); }
+  static ChildIteratorType child_end(NodeRef N) { return (*N)->end(); }
 };
 
 /// An analysis pass which computes the call graph for a module.
diff --git a/include/llvm/Analysis/LazyValueInfo.h b/include/llvm/Analysis/LazyValueInfo.h
index ef0762079d924b62aac99c9e9c46df1734f84aab..49e088e533dc175db576d0ea2cab1139b5d43085 100644
--- a/include/llvm/Analysis/LazyValueInfo.h
+++ b/include/llvm/Analysis/LazyValueInfo.h
@@ -32,6 +32,7 @@ namespace llvm {
 class LazyValueInfo {
   friend class LazyValueInfoWrapperPass;
   AssumptionCache *AC = nullptr;
+  const DataLayout *DL = nullptr;
   class TargetLibraryInfo *TLI = nullptr;
   DominatorTree *DT = nullptr;
   void *PImpl = nullptr;
@@ -40,16 +41,17 @@ class LazyValueInfo {
 public:
   ~LazyValueInfo();
   LazyValueInfo() {}
-  LazyValueInfo(AssumptionCache *AC_, TargetLibraryInfo *TLI_,
+  LazyValueInfo(AssumptionCache *AC_, const DataLayout *DL_, TargetLibraryInfo *TLI_,
                 DominatorTree *DT_)
-      : AC(AC_), TLI(TLI_), DT(DT_) {}
+      : AC(AC_), DL(DL_), TLI(TLI_), DT(DT_) {}
   LazyValueInfo(LazyValueInfo &&Arg)
-      : AC(Arg.AC), TLI(Arg.TLI), DT(Arg.DT), PImpl(Arg.PImpl) {
+      : AC(Arg.AC), DL(Arg.DL), TLI(Arg.TLI), DT(Arg.DT), PImpl(Arg.PImpl) {
     Arg.PImpl = nullptr;
   }
   LazyValueInfo &operator=(LazyValueInfo &&Arg) {
     releaseMemory();
     AC = Arg.AC;
+    DL = Arg.DL;
     TLI = Arg.TLI;
     DT = Arg.DT;
     PImpl = Arg.PImpl;
@@ -98,6 +100,9 @@ public:
   /// Inform the analysis cache that we have erased a block.
   void eraseBlock(BasicBlock *BB);
 
+  /// Print the \c LazyValueInfoCache.
+  void printCache(Function &F, raw_ostream &OS);
+
   // For old PM pass. Delete once LazyValueInfoWrapperPass is gone.
   void releaseMemory();
 
diff --git a/include/llvm/Analysis/Loads.h b/include/llvm/Analysis/Loads.h
index e167f36219d2c2d438ee417808d8ec6ef2d26c03..a59c1f88e229f7f17c1013fa483783c54c786a2c 100644
--- a/include/llvm/Analysis/Loads.h
+++ b/include/llvm/Analysis/Loads.h
@@ -85,8 +85,37 @@ Value *FindAvailableLoadedValue(LoadInst *Load,
                                 BasicBlock::iterator &ScanFrom,
                                 unsigned MaxInstsToScan = DefMaxInstsToScan,
                                 AliasAnalysis *AA = nullptr,
-                                bool *IsLoadCSE = nullptr);
+                                bool *IsLoadCSE = nullptr,
+                                unsigned *NumScanedInst = nullptr);
 
+/// Scan backwards to see if we have the value of the given pointer available
+/// locally within a small number of instructions.
+///
+/// You can use this function to scan across multiple blocks: after you call
+/// this function, if ScanFrom points at the beginning of the block, it's safe
+/// to continue scanning the predecessors.
+///
+/// \param Ptr The pointer we want the load and store to originate from.
+/// \param AccessTy The access type of the pointer.
+/// \param AtLeastAtomic Whether the result must be at least an atomic
+/// load/store. If false, we can return an atomic or non-atomic load or store.
+/// If true, we need to return an atomic load or store.
+/// \param ScanBB The basic block to scan.
+/// \param [in,out] ScanFrom The location to start scanning from. When this
+/// function returns, it points at the last instruction scanned.
+/// \param MaxInstsToScan The maximum number of instructions to scan. If this
+/// is zero, the whole block will be scanned.
+/// \param AA Optional pointer to alias analysis, to make the scan more
+/// precise.
+/// \param [out] IsLoad Whether the returned value is a load from the same
+/// location in memory, as opposed to the value operand of a store.
+///
+/// \returns The found value, or nullptr if no value is found.
+Value *FindAvailablePtrLoadStore(Value *Ptr, Type *AccessTy, bool AtLeastAtomic,
+                                 BasicBlock *ScanBB,
+                                 BasicBlock::iterator &ScanFrom,
+                                 unsigned MaxInstsToScan, AliasAnalysis *AA,
+                                 bool *IsLoad, unsigned *NumScanedInst);
 }
 
 #endif
diff --git a/include/llvm/Analysis/LoopAccessAnalysis.h b/include/llvm/Analysis/LoopAccessAnalysis.h
index 9eee08c101cce84ad2c44a3db13dabd12fabe6b8..2568903c57f3365ff35cb4719a02dae9e48c668e 100644
--- a/include/llvm/Analysis/LoopAccessAnalysis.h
+++ b/include/llvm/Analysis/LoopAccessAnalysis.h
@@ -38,39 +38,6 @@ class SCEVUnionPredicate;
 class LoopAccessInfo;
 class OptimizationRemarkEmitter;
 
-/// Optimization analysis message produced during vectorization. Messages inform
-/// the user why vectorization did not occur.
-class LoopAccessReport {
-  std::string Message;
-  const Instruction *Instr;
-
-protected:
-  LoopAccessReport(const Twine &Message, const Instruction *I)
-      : Message(Message.str()), Instr(I) {}
-
-public:
-  LoopAccessReport(const Instruction *I = nullptr) : Instr(I) {}
-
-  template <typename A> LoopAccessReport &operator<<(const A &Value) {
-    raw_string_ostream Out(Message);
-    Out << Value;
-    return *this;
-  }
-
-  const Instruction *getInstr() const { return Instr; }
-
-  std::string &str() { return Message; }
-  const std::string &str() const { return Message; }
-  operator Twine() { return Message; }
-
-  /// \brief Emit an analysis note for \p PassName with the debug location from
-  /// the instruction in \p Message if available.  Otherwise use the location of
-  /// \p TheLoop.
-  static void emitAnalysis(const LoopAccessReport &Message, const Loop *TheLoop,
-                           const char *PassName,
-                           OptimizationRemarkEmitter &ORE);
-};
-
 /// \brief Collection of parameters shared beetween the Loop Vectorizer and the
 /// Loop Access Analysis.
 struct VectorizerParams {
@@ -126,7 +93,7 @@ struct VectorizerParams {
 class MemoryDepChecker {
 public:
   typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
-  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
+  typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
   /// \brief Set of potential dependent memory accesses.
   typedef EquivalenceClasses<MemAccessInfo> DepCandidates;
 
@@ -221,7 +188,7 @@ public:
   /// \brief Check whether the dependencies between the accesses are safe.
   ///
   /// Only checks sets with elements in \p CheckDeps.
-  bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoSet &CheckDeps,
+  bool areDepsSafe(DepCandidates &AccessSets, MemAccessInfoList &CheckDeps,
                    const ValueToValueMap &Strides);
 
   /// \brief No memory dependence was encountered that would inhibit
@@ -690,11 +657,6 @@ int64_t getPtrStride(PredicatedScalarEvolution &PSE, Value *Ptr, const Loop *Lp,
                      const ValueToValueMap &StridesMap = ValueToValueMap(),
                      bool Assume = false, bool ShouldCheckWrap = true);
 
-/// \brief Saves the sorted memory accesses in vector argument 'Sorted' after
-/// sorting the jumbled memory accesses.
-void sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
-                     ScalarEvolution &SE, SmallVectorImpl<Value *> &Sorted);
-
 /// \brief Returns true if the memory operations \p A and \p B are consecutive.
 /// This is a simple API that does not depend on the analysis pass. 
 bool isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
diff --git a/include/llvm/Analysis/LoopAnalysisManager.h b/include/llvm/Analysis/LoopAnalysisManager.h
index 640c086be2306370c42adcd275807ca4e91d236d..17da516889b0058e8f71d3500e48ea1ac9742ad6 100644
--- a/include/llvm/Analysis/LoopAnalysisManager.h
+++ b/include/llvm/Analysis/LoopAnalysisManager.h
@@ -141,7 +141,8 @@ LoopAnalysisManagerFunctionProxy::run(Function &F, FunctionAnalysisManager &AM);
 // template.
 extern template class InnerAnalysisManagerProxy<LoopAnalysisManager, Function>;
 
-extern template class OuterAnalysisManagerProxy<FunctionAnalysisManager, Loop>;
+extern template class OuterAnalysisManagerProxy<FunctionAnalysisManager, Loop,
+                                                LoopStandardAnalysisResults &>;
 /// A proxy from a \c FunctionAnalysisManager to a \c Loop.
 typedef OuterAnalysisManagerProxy<FunctionAnalysisManager, Loop,
                                   LoopStandardAnalysisResults &>
diff --git a/include/llvm/Analysis/LoopInfo.h b/include/llvm/Analysis/LoopInfo.h
index c49ed78403143f60bc56652ae4344045d6d24f18..996794b660a9e8771fc9afd862b5186fad7433b4 100644
--- a/include/llvm/Analysis/LoopInfo.h
+++ b/include/llvm/Analysis/LoopInfo.h
@@ -478,7 +478,8 @@ public:
 
   /// Return all unique successor blocks of this loop.
   /// These are the blocks _outside of the current loop_ which are branched to.
-  /// This assumes that loop exits are in canonical form.
+  /// This assumes that loop exits are in canonical form, i.e. all exits are
+  /// dedicated exits.
   void getUniqueExitBlocks(SmallVectorImpl<BasicBlock *> &ExitBlocks) const;
 
   /// If getUniqueExitBlocks would return exactly one block, return that block.
diff --git a/include/llvm/Analysis/LoopInfoImpl.h b/include/llvm/Analysis/LoopInfoImpl.h
index 0febe7b1a8bd1e50da6df94d4f659490efdf7193..761f8721b54fda6ba14b0d4fff1849545c4a0824 100644
--- a/include/llvm/Analysis/LoopInfoImpl.h
+++ b/include/llvm/Analysis/LoopInfoImpl.h
@@ -577,10 +577,9 @@ bool compareVectors(std::vector<T> &BB1, std::vector<T> &BB2) {
 }
 
 template <class BlockT, class LoopT>
-static void
-addInnerLoopsToHeadersMap(DenseMap<BlockT *, const LoopT *> &LoopHeaders,
-                          const LoopInfoBase<BlockT, LoopT> &LI,
-                          const LoopT &L) {
+void addInnerLoopsToHeadersMap(DenseMap<BlockT *, const LoopT *> &LoopHeaders,
+                               const LoopInfoBase<BlockT, LoopT> &LI,
+                               const LoopT &L) {
   LoopHeaders[L.getHeader()] = &L;
   for (LoopT *SL : L)
     addInnerLoopsToHeadersMap(LoopHeaders, LI, *SL);
diff --git a/include/llvm/Analysis/MemoryBuiltins.h b/include/llvm/Analysis/MemoryBuiltins.h
index b58f07e6947577ff5be5106cf2a46c6dd3f9fbb9..c5514316f75f043159e1f3a3bbae409706a12c90 100644
--- a/include/llvm/Analysis/MemoryBuiltins.h
+++ b/include/llvm/Analysis/MemoryBuiltins.h
@@ -32,12 +32,6 @@ class TargetLibraryInfo;
 class Type;
 class Value;
 
-enum class ObjSizeMode {
-  Exact = 0,
-  Min = 1,
-  Max = 2
-};
-
 /// \brief Tests if a value is a call or invoke to a library function that
 /// allocates or reallocates memory (either malloc, calloc, realloc, or strdup
 /// like).
@@ -129,17 +123,36 @@ static inline CallInst *isFreeCall(Value *I, const TargetLibraryInfo *TLI) {
 //  Utility functions to compute size of objects.
 //
 
+/// Various options to control the behavior of getObjectSize.
+struct ObjectSizeOpts {
+  /// Controls how we handle conditional statements with unknown conditions.
+  enum class Mode : uint8_t {
+    /// Fail to evaluate an unknown condition.
+    Exact,
+    /// Evaluate all branches of an unknown condition. If all evaluations
+    /// succeed, pick the minimum size.
+    Min,
+    /// Same as Min, except we pick the maximum size of all of the branches.
+    Max
+  };
+
+  /// How we want to evaluate this object's size.
+  Mode EvalMode = Mode::Exact;
+  /// Whether to round the result up to the alignment of allocas, byval
+  /// arguments, and global variables.
+  bool RoundToAlign = false;
+  /// If this is true, null pointers in address space 0 will be treated as
+  /// though they can't be evaluated. Otherwise, null is always considered to
+  /// point to a 0 byte region of memory.
+  bool NullIsUnknownSize = false;
+};
+
 /// \brief Compute the size of the object pointed by Ptr. Returns true and the
 /// object size in Size if successful, and false otherwise. In this context, by
 /// object we mean the region of memory starting at Ptr to the end of the
 /// underlying object pointed to by Ptr.
-/// If RoundToAlign is true, then Size is rounded up to the aligment of allocas,
-/// byval arguments, and global variables.
-/// If Mode is Min or Max the size will be evaluated even if it depends on
-/// a condition and corresponding value will be returned (min or max).
 bool getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout &DL,
-                   const TargetLibraryInfo *TLI, bool RoundToAlign = false,
-                   ObjSizeMode Mode = ObjSizeMode::Exact);
+                   const TargetLibraryInfo *TLI, ObjectSizeOpts Opts = {});
 
 /// Try to turn a call to @llvm.objectsize into an integer value of the given
 /// Type. Returns null on failure.
@@ -160,8 +173,7 @@ class ObjectSizeOffsetVisitor
 
   const DataLayout &DL;
   const TargetLibraryInfo *TLI;
-  bool RoundToAlign;
-  ObjSizeMode Mode;
+  ObjectSizeOpts Options;
   unsigned IntTyBits;
   APInt Zero;
   SmallPtrSet<Instruction *, 8> SeenInsts;
@@ -174,8 +186,7 @@ class ObjectSizeOffsetVisitor
 
 public:
   ObjectSizeOffsetVisitor(const DataLayout &DL, const TargetLibraryInfo *TLI,
-                          LLVMContext &Context, bool RoundToAlign = false,
-                          ObjSizeMode Mode = ObjSizeMode::Exact);
+                          LLVMContext &Context, ObjectSizeOpts Options = {});
 
   SizeOffsetType compute(Value *V);
 
diff --git a/include/llvm/Transforms/Utils/MemorySSA.h b/include/llvm/Analysis/MemorySSA.h
similarity index 82%
rename from include/llvm/Transforms/Utils/MemorySSA.h
rename to include/llvm/Analysis/MemorySSA.h
index bad2a3b708cca18822dd7c9e81fd4433a8b9a8d2..db31ae9f4f109196e70da6cf6556c0aa1bb8ac96 100644
--- a/include/llvm/Transforms/Utils/MemorySSA.h
+++ b/include/llvm/Analysis/MemorySSA.h
@@ -6,71 +6,71 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// \file
-// \brief This file exposes an interface to building/using memory SSA to
-// walk memory instructions using a use/def graph.
-//
-// Memory SSA class builds an SSA form that links together memory access
-// instructions such as loads, stores, atomics, and calls. Additionally, it does
-// a trivial form of "heap versioning" Every time the memory state changes in
-// the program, we generate a new heap version. It generates MemoryDef/Uses/Phis
-// that are overlayed on top of the existing instructions.
-//
-// As a trivial example,
-// define i32 @main() #0 {
-// entry:
-//   %call = call noalias i8* @_Znwm(i64 4) #2
-//   %0 = bitcast i8* %call to i32*
-//   %call1 = call noalias i8* @_Znwm(i64 4) #2
-//   %1 = bitcast i8* %call1 to i32*
-//   store i32 5, i32* %0, align 4
-//   store i32 7, i32* %1, align 4
-//   %2 = load i32* %0, align 4
-//   %3 = load i32* %1, align 4
-//   %add = add nsw i32 %2, %3
-//   ret i32 %add
-// }
-//
-// Will become
-// define i32 @main() #0 {
-// entry:
-//   ; 1 = MemoryDef(0)
-//   %call = call noalias i8* @_Znwm(i64 4) #3
-//   %2 = bitcast i8* %call to i32*
-//   ; 2 = MemoryDef(1)
-//   %call1 = call noalias i8* @_Znwm(i64 4) #3
-//   %4 = bitcast i8* %call1 to i32*
-//   ; 3 = MemoryDef(2)
-//   store i32 5, i32* %2, align 4
-//   ; 4 = MemoryDef(3)
-//   store i32 7, i32* %4, align 4
-//   ; MemoryUse(3)
-//   %7 = load i32* %2, align 4
-//   ; MemoryUse(4)
-//   %8 = load i32* %4, align 4
-//   %add = add nsw i32 %7, %8
-//   ret i32 %add
-// }
-//
-// Given this form, all the stores that could ever effect the load at %8 can be
-// gotten by using the MemoryUse associated with it, and walking from use to def
-// until you hit the top of the function.
-//
-// Each def also has a list of users associated with it, so you can walk from
-// both def to users, and users to defs. Note that we disambiguate MemoryUses,
-// but not the RHS of MemoryDefs. You can see this above at %7, which would
-// otherwise be a MemoryUse(4). Being disambiguated means that for a given
-// store, all the MemoryUses on its use lists are may-aliases of that store (but
-// the MemoryDefs on its use list may not be).
-//
-// MemoryDefs are not disambiguated because it would require multiple reaching
-// definitions, which would require multiple phis, and multiple memoryaccesses
-// per instruction.
+///
+/// \file
+/// \brief This file exposes an interface to building/using memory SSA to
+/// walk memory instructions using a use/def graph.
+///
+/// Memory SSA class builds an SSA form that links together memory access
+/// instructions such as loads, stores, atomics, and calls. Additionally, it
+/// does a trivial form of "heap versioning": every time the memory state
+/// changes in the program, we generate a new heap version. It generates
+/// MemoryDef/Uses/Phis that are overlaid on top of the existing instructions.
+///
+/// As a trivial example,
+/// define i32 @main() #0 {
+/// entry:
+///   %call = call noalias i8* @_Znwm(i64 4) #2
+///   %0 = bitcast i8* %call to i32*
+///   %call1 = call noalias i8* @_Znwm(i64 4) #2
+///   %1 = bitcast i8* %call1 to i32*
+///   store i32 5, i32* %0, align 4
+///   store i32 7, i32* %1, align 4
+///   %2 = load i32* %0, align 4
+///   %3 = load i32* %1, align 4
+///   %add = add nsw i32 %2, %3
+///   ret i32 %add
+/// }
+///
+/// Will become
+/// define i32 @main() #0 {
+/// entry:
+///   ; 1 = MemoryDef(0)
+///   %call = call noalias i8* @_Znwm(i64 4) #3
+///   %2 = bitcast i8* %call to i32*
+///   ; 2 = MemoryDef(1)
+///   %call1 = call noalias i8* @_Znwm(i64 4) #3
+///   %4 = bitcast i8* %call1 to i32*
+///   ; 3 = MemoryDef(2)
+///   store i32 5, i32* %2, align 4
+///   ; 4 = MemoryDef(3)
+///   store i32 7, i32* %4, align 4
+///   ; MemoryUse(3)
+///   %7 = load i32* %2, align 4
+///   ; MemoryUse(4)
+///   %8 = load i32* %4, align 4
+///   %add = add nsw i32 %7, %8
+///   ret i32 %add
+/// }
+///
+/// Given this form, all the stores that could ever affect the load at %8 can be
+/// gotten by using the MemoryUse associated with it, and walking from use to
+/// def until you hit the top of the function.
+///
+/// Each def also has a list of users associated with it, so you can walk from
+/// both def to users, and users to defs. Note that we disambiguate MemoryUses,
+/// but not the RHS of MemoryDefs. You can see this above at %7, which would
+/// otherwise be a MemoryUse(4). Being disambiguated means that for a given
+/// store, all the MemoryUses on its use lists are may-aliases of that store
+/// (but the MemoryDefs on its use list may not be).
+///
+/// MemoryDefs are not disambiguated because it would require multiple reaching
+/// definitions, which would require multiple phis, and multiple memoryaccesses
+/// per instruction.
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TRANSFORMS_UTILS_MEMORYSSA_H
-#define LLVM_TRANSFORMS_UTILS_MEMORYSSA_H
+#ifndef LLVM_ANALYSIS_MEMORYSSA_H
+#define LLVM_ANALYSIS_MEMORYSSA_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/GraphTraits.h"
@@ -138,7 +138,6 @@ public:
 
   // Methods for support type inquiry through isa, cast, and
   // dyn_cast
-  static inline bool classof(const MemoryAccess *) { return true; }
   static inline bool classof(const Value *V) {
     unsigned ID = V->getValueID();
     return ID == MemoryUseVal || ID == MemoryPhiVal || ID == MemoryDefVal;
@@ -243,11 +242,21 @@ public:
   /// \brief Get the access that produces the memory state used by this Use.
   MemoryAccess *getDefiningAccess() const { return getOperand(0); }
 
-  static inline bool classof(const MemoryUseOrDef *) { return true; }
   static inline bool classof(const Value *MA) {
     return MA->getValueID() == MemoryUseVal || MA->getValueID() == MemoryDefVal;
   }
 
+  // Sadly, these have to be public because they are needed in some of the
+  // iterators.
+  virtual bool isOptimized() const = 0;
+  virtual MemoryAccess *getOptimized() const = 0;
+  virtual void setOptimized(MemoryAccess *) = 0;
+
+  /// \brief Reset the ID of what this MemoryUse was optimized to, causing it to
+  /// be rewalked by the walker if necessary.
+  /// This really should only be called by tests.
+  virtual void resetOptimized() = 0;
+
 protected:
   friend class MemorySSA;
   friend class MemorySSAUpdater;
@@ -256,8 +265,13 @@ protected:
       : MemoryAccess(C, Vty, BB, 1), MemoryInst(MI) {
     setDefiningAccess(DMA);
   }
-
-  void setDefiningAccess(MemoryAccess *DMA) { setOperand(0, DMA); }
+  void setDefiningAccess(MemoryAccess *DMA, bool Optimized = false) {
+    if (!Optimized) {
+      setOperand(0, DMA);
+      return;
+    }
+    setOptimized(DMA);
+  }
 
 private:
   Instruction *MemoryInst;
@@ -284,27 +298,27 @@ public:
   void *operator new(size_t s) { return User::operator new(s, 1); }
   void *operator new(size_t, unsigned) = delete;
 
-  static inline bool classof(const MemoryUse *) { return true; }
   static inline bool classof(const Value *MA) {
     return MA->getValueID() == MemoryUseVal;
   }
 
   void print(raw_ostream &OS) const override;
 
-  void setDefiningAccess(MemoryAccess *DMA, bool Optimized = false) {
-    if (Optimized)
-      OptimizedID = DMA->getID();
-    MemoryUseOrDef::setDefiningAccess(DMA);
+  virtual void setOptimized(MemoryAccess *DMA) override {
+    OptimizedID = DMA->getID();
+    setOperand(0, DMA);
   }
 
-  bool isOptimized() const {
+  virtual bool isOptimized() const override {
     return getDefiningAccess() && OptimizedID == getDefiningAccess()->getID();
   }
 
-  /// \brief Reset the ID of what this MemoryUse was optimized to, causing it to
-  /// be rewalked by the walker if necessary.
-  /// This really should only be called by tests.
-  void resetOptimized() { OptimizedID = INVALID_MEMORYACCESS_ID; }
+  virtual MemoryAccess *getOptimized() const override {
+    return getDefiningAccess();
+  }
+  virtual void resetOptimized() override {
+    OptimizedID = INVALID_MEMORYACCESS_ID;
+  }
 
 protected:
   friend class MemorySSA;
@@ -336,17 +350,30 @@ public:
 
   MemoryDef(LLVMContext &C, MemoryAccess *DMA, Instruction *MI, BasicBlock *BB,
             unsigned Ver)
-      : MemoryUseOrDef(C, DMA, MemoryDefVal, MI, BB), ID(Ver) {}
+      : MemoryUseOrDef(C, DMA, MemoryDefVal, MI, BB), ID(Ver),
+        Optimized(nullptr), OptimizedID(INVALID_MEMORYACCESS_ID) {}
 
   // allocate space for exactly one operand
   void *operator new(size_t s) { return User::operator new(s, 1); }
   void *operator new(size_t, unsigned) = delete;
 
-  static inline bool classof(const MemoryDef *) { return true; }
   static inline bool classof(const Value *MA) {
     return MA->getValueID() == MemoryDefVal;
   }
 
+  virtual void setOptimized(MemoryAccess *MA) override {
+    Optimized = MA;
+    OptimizedID = getDefiningAccess()->getID();
+  }
+  virtual MemoryAccess *getOptimized() const override { return Optimized; }
+  virtual bool isOptimized() const override {
+    return getOptimized() && getDefiningAccess() &&
+           OptimizedID == getDefiningAccess()->getID();
+  }
+  virtual void resetOptimized() override {
+    OptimizedID = INVALID_MEMORYACCESS_ID;
+  }
+
   void print(raw_ostream &OS) const override;
 
 protected:
@@ -356,6 +383,8 @@ protected:
 
 private:
   const unsigned ID;
+  MemoryAccess *Optimized;
+  unsigned int OptimizedID;
 };
 
 template <>
@@ -501,7 +530,6 @@ public:
     return getIncomingValue(Idx);
   }
 
-  static inline bool classof(const MemoryPhi *) { return true; }
   static inline bool classof(const Value *V) {
     return V->getValueID() == MemoryPhiVal;
   }
@@ -598,55 +626,6 @@ public:
     return getWritableBlockDefs(BB);
   }
 
-  /// \brief Create an empty MemoryPhi in MemorySSA for a given basic block.
-  /// Only one MemoryPhi for a block exists at a time, so this function will
-  /// assert if you try to create one where it already exists.
-  MemoryPhi *createMemoryPhi(BasicBlock *BB);
-
-  enum InsertionPlace { Beginning, End };
-
-  /// \brief Create a MemoryAccess in MemorySSA at a specified point in a block,
-  /// with a specified clobbering definition.
-  ///
-  /// Returns the new MemoryAccess.
-  /// This should be called when a memory instruction is created that is being
-  /// used to replace an existing memory instruction. It will *not* create PHI
-  /// nodes, or verify the clobbering definition. The insertion place is used
-  /// solely to determine where in the memoryssa access lists the instruction
-  /// will be placed. The caller is expected to keep ordering the same as
-  /// instructions.
-  /// It will return the new MemoryAccess.
-  /// Note: If a MemoryAccess already exists for I, this function will make it
-  /// inaccessible and it *must* have removeMemoryAccess called on it.
-  MemoryAccess *createMemoryAccessInBB(Instruction *I, MemoryAccess *Definition,
-                                       const BasicBlock *BB,
-                                       InsertionPlace Point);
-
-  /// \brief Create a MemoryAccess in MemorySSA before or after an existing
-  /// MemoryAccess.
-  ///
-  /// Returns the new MemoryAccess.
-  /// This should be called when a memory instruction is created that is being
-  /// used to replace an existing memory instruction. It will *not* create PHI
-  /// nodes, or verify the clobbering definition.
-  ///
-  /// Note: If a MemoryAccess already exists for I, this function will make it
-  /// inaccessible and it *must* have removeMemoryAccess called on it.
-  MemoryUseOrDef *createMemoryAccessBefore(Instruction *I,
-                                           MemoryAccess *Definition,
-                                           MemoryUseOrDef *InsertPt);
-  MemoryUseOrDef *createMemoryAccessAfter(Instruction *I,
-                                          MemoryAccess *Definition,
-                                          MemoryAccess *InsertPt);
-
-  /// \brief Remove a MemoryAccess from MemorySSA, including updating all
-  /// definitions and uses.
-  /// This should be called when a memory instruction that has a MemoryAccess
-  /// associated with it is erased from the program.  For example, if a store or
-  /// load is simply erased (not replaced), removeMemoryAccess should be called
-  /// on the MemoryAccess for that store/load.
-  void removeMemoryAccess(MemoryAccess *);
-
   /// \brief Given two memory accesses in the same basic block, determine
   /// whether MemoryAccess \p A dominates MemoryAccess \p B.
   bool locallyDominates(const MemoryAccess *A, const MemoryAccess *B) const;
@@ -663,6 +642,10 @@ public:
   /// all uses, uses appear in the right places).  This is used by unit tests.
   void verifyMemorySSA() const;
 
+  /// Used in various insertion functions to specify whether we are talking
+  /// about the beginning or end of a block.
+  enum InsertionPlace { Beginning, End };
+
 protected:
   // Used by Memory SSA annotater, dumpers, and wrapper pass
   friend class MemorySSAAnnotatedWriter;
@@ -685,11 +668,24 @@ protected:
     return It == PerBlockDefs.end() ? nullptr : It->second.get();
   }
 
-  // This is used by the updater to perform the internal memoryssa machinations
-  // for moves.  It does not always leave the IR in a correct state, and relies
-  // on the updater to fixup what it breaks, so it is not public.
+  // These are used by the updater to perform various internal MemorySSA
+  // machinations.  They do not always leave the IR in a correct state, and
+  // rely on the updater to fix up what they break, so they are not public.
+
   void moveTo(MemoryUseOrDef *What, BasicBlock *BB, AccessList::iterator Where);
   void moveTo(MemoryUseOrDef *What, BasicBlock *BB, InsertionPlace Point);
+  // Rename the dominator tree branch rooted at BB.
+  void renamePass(BasicBlock *BB, MemoryAccess *IncomingVal,
+                  SmallPtrSetImpl<BasicBlock *> &Visited) {
+    renamePass(DT->getNode(BB), IncomingVal, Visited, true, true);
+  }
+  void removeFromLookups(MemoryAccess *);
+  void removeFromLists(MemoryAccess *, bool ShouldDelete = true);
+  void insertIntoListsForBlock(MemoryAccess *, const BasicBlock *,
+                               InsertionPlace);
+  void insertIntoListsBefore(MemoryAccess *, const BasicBlock *,
+                             AccessList::iterator);
+  MemoryUseOrDef *createDefinedAccess(Instruction *, MemoryAccess *);
 
 private:
   class CachingWalker;
@@ -705,27 +701,21 @@ private:
 
   void
   determineInsertionPoint(const SmallPtrSetImpl<BasicBlock *> &DefiningBlocks);
-  void computeDomLevels(DenseMap<DomTreeNode *, unsigned> &DomLevels);
   void markUnreachableAsLiveOnEntry(BasicBlock *BB);
   bool dominatesUse(const MemoryAccess *, const MemoryAccess *) const;
+  MemoryPhi *createMemoryPhi(BasicBlock *BB);
   MemoryUseOrDef *createNewAccess(Instruction *);
-  MemoryUseOrDef *createDefinedAccess(Instruction *, MemoryAccess *);
   MemoryAccess *findDominatingDef(BasicBlock *, enum InsertionPlace);
-  void removeFromLookups(MemoryAccess *);
-  void removeFromLists(MemoryAccess *, bool ShouldDelete = true);
-
   void placePHINodes(const SmallPtrSetImpl<BasicBlock *> &,
                      const DenseMap<const BasicBlock *, unsigned int> &);
-  MemoryAccess *renameBlock(BasicBlock *, MemoryAccess *);
+  MemoryAccess *renameBlock(BasicBlock *, MemoryAccess *, bool);
+  void renameSuccessorPhis(BasicBlock *, MemoryAccess *, bool);
   void renamePass(DomTreeNode *, MemoryAccess *IncomingVal,
-                  SmallPtrSet<BasicBlock *, 16> &Visited);
+                  SmallPtrSetImpl<BasicBlock *> &Visited,
+                  bool SkipVisited = false, bool RenameAllUses = false);
   AccessList *getOrCreateAccessList(const BasicBlock *);
   DefsList *getOrCreateDefsList(const BasicBlock *);
   void renumberBlock(const BasicBlock *) const;
-  void insertIntoListsForBlock(MemoryAccess *, const BasicBlock *,
-                               InsertionPlace);
-  void insertIntoListsBefore(MemoryAccess *, const BasicBlock *,
-                             AccessList::iterator);
   AliasAnalysis *AA;
   DominatorTree *DT;
   Function &F;
@@ -753,6 +743,16 @@ private:
   unsigned NextID;
 };
 
+// Internal MemorySSA utils, for use by MemorySSA classes and walkers
+class MemorySSAUtil {
+protected:
+  friend class MemorySSAWalker;
+  friend class GVNHoist;
+  // This function should not be used by new passes.
+  static bool defClobbersUseOrDef(MemoryDef *MD, const MemoryUseOrDef *MU,
+                                  AliasAnalysis &AA);
+};
+
 // This pass does eager building and then printing of MemorySSA. It is used by
 // the tests to be able to build, dump, and verify Memory SSA.
 class MemorySSAPrinterLegacyPass : public FunctionPass {
@@ -1087,10 +1087,69 @@ inline upward_defs_iterator upward_defs_begin(const MemoryAccessPair &Pair) {
 
 inline upward_defs_iterator upward_defs_end() { return upward_defs_iterator(); }
 
-// Return true when MD may alias MU, return false otherwise.
-bool defClobbersUseOrDef(MemoryDef *MD, const MemoryUseOrDef *MU,
-                         AliasAnalysis &AA);
+inline iterator_range<upward_defs_iterator>
+upward_defs(const MemoryAccessPair &Pair) {
+  return make_range(upward_defs_begin(Pair), upward_defs_end());
+}
+
+/// Walks the defining accesses of MemoryDefs. Stops after we hit something that
+/// has no defining use (e.g. a MemoryPhi or liveOnEntry). Note that, when
+/// comparing against a null def_chain_iterator, this will compare equal only
+/// after walking said Phi/liveOnEntry.
+///
+/// The UseOptimizedChain flag specifies whether to walk the clobbering
+/// access chain, or all the accesses.
+///
+/// Normally, MemoryDefs are all just def/use linked together, so a def_chain on
+/// a MemoryDef will walk all MemoryDefs above it in the program until it hits
+/// a phi node.  The optimized chain walks the clobbering access of a store.
+/// So if you are just trying to find, given a store, what the next
+/// thing that would clobber the same memory is, you want the optimized chain.
+template <class T, bool UseOptimizedChain = false>
+struct def_chain_iterator
+    : public iterator_facade_base<def_chain_iterator<T, UseOptimizedChain>,
+                                  std::forward_iterator_tag, MemoryAccess *> {
+  def_chain_iterator() : MA(nullptr) {}
+  def_chain_iterator(T MA) : MA(MA) {}
+
+  T operator*() const { return MA; }
+
+  def_chain_iterator &operator++() {
+    // N.B. liveOnEntry has a null defining access.
+    if (auto *MUD = dyn_cast<MemoryUseOrDef>(MA)) {
+      if (UseOptimizedChain && MUD->isOptimized())
+        MA = MUD->getOptimized();
+      else
+        MA = MUD->getDefiningAccess();
+    } else {
+      MA = nullptr;
+    }
+
+    return *this;
+  }
+
+  bool operator==(const def_chain_iterator &O) const { return MA == O.MA; }
+
+private:
+  T MA;
+};
+
+template <class T>
+inline iterator_range<def_chain_iterator<T>>
+def_chain(T MA, MemoryAccess *UpTo = nullptr) {
+#ifdef EXPENSIVE_CHECKS
+  assert((!UpTo || find(def_chain(MA), UpTo) != def_chain_iterator<T>()) &&
+         "UpTo isn't in the def chain!");
+#endif
+  return make_range(def_chain_iterator<T>(MA), def_chain_iterator<T>(UpTo));
+}
+
+template <class T>
+inline iterator_range<def_chain_iterator<T, true>> optimized_def_chain(T MA) {
+  return make_range(def_chain_iterator<T, true>(MA),
+                    def_chain_iterator<T, true>(nullptr));
+}
 
 } // end namespace llvm
 
-#endif // LLVM_TRANSFORMS_UTILS_MEMORYSSA_H
+#endif // LLVM_ANALYSIS_MEMORYSSA_H
diff --git a/include/llvm/Analysis/MemorySSAUpdater.h b/include/llvm/Analysis/MemorySSAUpdater.h
new file mode 100644
index 0000000000000000000000000000000000000000..d30eeeaa95b6a54a91efa45d0a7322271b75a013
--- /dev/null
+++ b/include/llvm/Analysis/MemorySSAUpdater.h
@@ -0,0 +1,153 @@
+//===- MemorySSAUpdater.h - Memory SSA Updater-------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file
+// \brief An automatic updater for MemorySSA that handles arbitrary insertion,
+// deletion, and moves.  It performs phi insertion where necessary, and
+// automatically updates the MemorySSA IR to be correct.
+// While updating loads or removing instructions is often easy enough to not
+// need this, updating stores should generally not be attempted outside this
+// API.
+//
+// Basic API usage:
+// Create the memory access you want for the instruction (this is mainly so
+// we know where it is, without having to duplicate the entire set of create
+// functions MemorySSA supports).
+// Call insertDef or insertUse depending on whether it's a MemoryUse or a
+// MemoryDef.
+// That's it.
+//
+// For moving, first move the instruction itself using the normal SSA
+// instruction moving API, then just call moveBefore, moveAfter, or
+// moveToPlace with the right arguments.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_MEMORYSSAUPDATER_H
+#define LLVM_ANALYSIS_MEMORYSSAUPDATER_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/OperandTraits.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Analysis/MemorySSA.h"
+
+namespace llvm {
+
+class Function;
+class Instruction;
+class MemoryAccess;
+class LLVMContext;
+class raw_ostream;
+
+class MemorySSAUpdater {
+private:
+  MemorySSA *MSSA;
+  SmallVector<MemoryPhi *, 8> InsertedPHIs;
+  SmallPtrSet<BasicBlock *, 8> VisitedBlocks;
+
+public:
+  MemorySSAUpdater(MemorySSA *MSSA) : MSSA(MSSA) {}
+  /// Insert a definition into the MemorySSA IR.  RenameUses will rename any use
+  /// below the new def block (and any inserted phis).  RenameUses should be set
+  /// to true if the definition may cause new aliases for loads below it.  This
+  /// is not the case for hoisting or sinking or other forms of code *movement*.
+  /// It *is* the case for straight code insertion.
+  /// For example:
+  /// store a
+  /// if (foo) { }
+  /// load a
+  ///
+  /// Moving the store into the if block, and calling insertDef, does not
+  /// require RenameUses.
+  /// However, changing it to:
+  /// store a
+  /// if (foo) { store b }
+  /// load a
+  /// Where a mayalias b, *does* require RenameUses be set to true.
+  void insertDef(MemoryDef *Def, bool RenameUses = false);
+  void insertUse(MemoryUse *Use);
+  void moveBefore(MemoryUseOrDef *What, MemoryUseOrDef *Where);
+  void moveAfter(MemoryUseOrDef *What, MemoryUseOrDef *Where);
+  void moveToPlace(MemoryUseOrDef *What, BasicBlock *BB,
+                   MemorySSA::InsertionPlace Where);
+
+  // The below are utility functions. Other than creation of accesses to pass
+  // to insertDef, and removeMemoryAccess to remove accesses, you should
+  // generally not attempt to update MemorySSA yourself. It is very non-trivial
+  // to get the edge cases right, and the above calls already operate in
+  // near-optimal time bounds.
+
+  /// \brief Create a MemoryAccess in MemorySSA at a specified point in a block,
+  /// with a specified clobbering definition.
+  ///
+  /// Returns the new MemoryAccess.
+  /// This should be called when a memory instruction is created that is being
+  /// used to replace an existing memory instruction. It will *not* create PHI
+  /// nodes, or verify the clobbering definition. The insertion place is used
+  /// solely to determine where in the memoryssa access lists the instruction
+  /// will be placed. The caller is expected to keep ordering the same as
+  /// instructions.
+  /// It will return the new MemoryAccess.
+  /// Note: If a MemoryAccess already exists for I, this function will make it
+  /// inaccessible and it *must* have removeMemoryAccess called on it.
+  MemoryAccess *createMemoryAccessInBB(Instruction *I, MemoryAccess *Definition,
+                                       const BasicBlock *BB,
+                                       MemorySSA::InsertionPlace Point);
+
+  /// \brief Create a MemoryAccess in MemorySSA before or after an existing
+  /// MemoryAccess.
+  ///
+  /// Returns the new MemoryAccess.
+  /// This should be called when a memory instruction is created that is being
+  /// used to replace an existing memory instruction. It will *not* create PHI
+  /// nodes, or verify the clobbering definition.
+  ///
+  /// Note: If a MemoryAccess already exists for I, this function will make it
+  /// inaccessible and it *must* have removeMemoryAccess called on it.
+  MemoryUseOrDef *createMemoryAccessBefore(Instruction *I,
+                                           MemoryAccess *Definition,
+                                           MemoryUseOrDef *InsertPt);
+  MemoryUseOrDef *createMemoryAccessAfter(Instruction *I,
+                                          MemoryAccess *Definition,
+                                          MemoryAccess *InsertPt);
+
+  /// \brief Remove a MemoryAccess from MemorySSA, including updating all
+  /// definitions and uses.
+  /// This should be called when a memory instruction that has a MemoryAccess
+  /// associated with it is erased from the program.  For example, if a store or
+  /// load is simply erased (not replaced), removeMemoryAccess should be called
+  /// on the MemoryAccess for that store/load.
+  void removeMemoryAccess(MemoryAccess *);
+
+private:
+  // Move What before Where in the MemorySSA IR.
+  template <class WhereType>
+  void moveTo(MemoryUseOrDef *What, BasicBlock *BB, WhereType Where);
+  MemoryAccess *getPreviousDef(MemoryAccess *);
+  MemoryAccess *getPreviousDefInBlock(MemoryAccess *);
+  MemoryAccess *getPreviousDefFromEnd(BasicBlock *);
+  MemoryAccess *getPreviousDefRecursive(BasicBlock *);
+  MemoryAccess *recursePhi(MemoryAccess *Phi);
+  template <class RangeType>
+  MemoryAccess *tryRemoveTrivialPhi(MemoryPhi *Phi, RangeType &Operands);
+  void fixupDefs(const SmallVectorImpl<MemoryAccess *> &);
+};
+} // end namespace llvm
+
+#endif // LLVM_ANALYSIS_MEMORYSSAUPDATER_H
diff --git a/include/llvm/Analysis/ObjectUtils.h b/include/llvm/Analysis/ObjectUtils.h
new file mode 100644
index 0000000000000000000000000000000000000000..2ad3b17170095bb66874335d13b4fd5feb133293
--- /dev/null
+++ b/include/llvm/Analysis/ObjectUtils.h
@@ -0,0 +1,42 @@
+//===- Analysis/ObjectUtils.h - analysis utils for object files -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_OBJECT_UTILS_H
+#define LLVM_ANALYSIS_OBJECT_UTILS_H
+
+#include "llvm/IR/GlobalVariable.h"
+
+namespace llvm {
+
+/// True if GV can be left out of the object symbol table. This is the case
+/// for linkonce_odr values whose address is not significant. While legal, it is
+/// not normally profitable to omit them from the .o symbol table. Using this
+/// analysis makes sense when the information can be passed down to the linker
+/// or we are in LTO.
+inline bool canBeOmittedFromSymbolTable(const GlobalValue *GV) {
+  if (!GV->hasLinkOnceODRLinkage())
+    return false;
+
+  // We assume that anyone who sets global unnamed_addr on a non-constant knows
+  // what they're doing.
+  if (GV->hasGlobalUnnamedAddr())
+    return true;
+
+  // If it is a non constant variable, it needs to be uniqued across shared
+  // objects.
+  if (auto *Var = dyn_cast<GlobalVariable>(GV))
+    if (!Var->isConstant())
+      return false;
+
+  return GV->hasAtLeastLocalUnnamedAddr();
+}
+
+}
+
+#endif
diff --git a/include/llvm/Analysis/OptimizationDiagnosticInfo.h b/include/llvm/Analysis/OptimizationDiagnosticInfo.h
index 426fbb548e680c244226a5ec2d20d95f53ccdc87..edd9140a3493b7a7a2266ce54f1657f3ec4ecb41 100644
--- a/include/llvm/Analysis/OptimizationDiagnosticInfo.h
+++ b/include/llvm/Analysis/OptimizationDiagnosticInfo.h
@@ -38,7 +38,7 @@ class Value;
 /// enabled in the LLVM context.
 class OptimizationRemarkEmitter {
 public:
-  OptimizationRemarkEmitter(Function *F, BlockFrequencyInfo *BFI)
+  OptimizationRemarkEmitter(const Function *F, BlockFrequencyInfo *BFI)
       : F(F), BFI(BFI) {}
 
   /// \brief This variant can be used to generate ORE on demand (without the
@@ -52,7 +52,7 @@ public:
   /// operation since BFI and all its required analyses are computed.  This is
   /// for example useful for CGSCC passes that can't use function analyses
   /// passes in the old PM.
-  OptimizationRemarkEmitter(Function *F);
+  OptimizationRemarkEmitter(const Function *F);
 
   OptimizationRemarkEmitter(OptimizationRemarkEmitter &&Arg)
       : F(Arg.F), BFI(Arg.BFI) {}
@@ -74,134 +74,6 @@ public:
   /// emit* APIs.
   void emit(DiagnosticInfoOptimizationBase &OptDiag);
 
-  /// Emit an optimization-applied message.
-  ///
-  /// \p PassName is the name of the pass emitting the message. If -Rpass= is
-  /// given and \p PassName matches the regular expression in -Rpass, then the
-  /// remark will be emitted. \p Fn is the function triggering the remark, \p
-  /// DLoc is the debug location where the diagnostic is generated. \p V is the
-  /// IR Value that identifies the code region. \p Msg is the message string to
-  /// use.
-  void emitOptimizationRemark(const char *PassName, const DebugLoc &DLoc,
-                              const Value *V, const Twine &Msg);
-
-  /// \brief Same as above but derives the IR Value for the code region and the
-  /// debug location from the Loop parameter \p L.
-  void emitOptimizationRemark(const char *PassName, Loop *L, const Twine &Msg);
-
-  /// \brief Same as above but derives the debug location and the code region
-  /// from the debug location and the basic block of \p Inst, respectively.
-  void emitOptimizationRemark(const char *PassName, Instruction *Inst,
-                              const Twine &Msg) {
-    emitOptimizationRemark(PassName, Inst->getDebugLoc(), Inst->getParent(),
-                           Msg);
-  }
-
-  /// Emit an optimization-missed message.
-  ///
-  /// \p PassName is the name of the pass emitting the message. If
-  /// -Rpass-missed= is given and the name matches the regular expression in
-  /// -Rpass, then the remark will be emitted.  \p DLoc is the debug location
-  /// where the diagnostic is generated. \p V is the IR Value that identifies
-  /// the code region. \p Msg is the message string to use.  If \p IsVerbose is
-  /// true, the message is considered verbose and will only be emitted when
-  /// verbose output is turned on.
-  void emitOptimizationRemarkMissed(const char *PassName, const DebugLoc &DLoc,
-                                    const Value *V, const Twine &Msg,
-                                    bool IsVerbose = false);
-
-  /// \brief Same as above but derives the IR Value for the code region and the
-  /// debug location from the Loop parameter \p L.
-  void emitOptimizationRemarkMissed(const char *PassName, Loop *L,
-                                    const Twine &Msg, bool IsVerbose = false);
-
-  /// \brief Same as above but derives the debug location and the code region
-  /// from the debug location and the basic block of \p Inst, respectively.
-  void emitOptimizationRemarkMissed(const char *PassName, Instruction *Inst,
-                                    const Twine &Msg, bool IsVerbose = false) {
-    emitOptimizationRemarkMissed(PassName, Inst->getDebugLoc(),
-                                 Inst->getParent(), Msg, IsVerbose);
-  }
-
-  /// Emit an optimization analysis remark message.
-  ///
-  /// \p PassName is the name of the pass emitting the message. If
-  /// -Rpass-analysis= is given and \p PassName matches the regular expression
-  /// in -Rpass, then the remark will be emitted. \p DLoc is the debug location
-  /// where the diagnostic is generated. \p V is the IR Value that identifies
-  /// the code region. \p Msg is the message string to use. If \p IsVerbose is
-  /// true, the message is considered verbose and will only be emitted when
-  /// verbose output is turned on.
-  void emitOptimizationRemarkAnalysis(const char *PassName,
-                                      const DebugLoc &DLoc, const Value *V,
-                                      const Twine &Msg, bool IsVerbose = false);
-
-  /// \brief Same as above but derives the IR Value for the code region and the
-  /// debug location from the Loop parameter \p L.
-  void emitOptimizationRemarkAnalysis(const char *PassName, Loop *L,
-                                      const Twine &Msg, bool IsVerbose = false);
-
-  /// \brief Same as above but derives the debug location and the code region
-  /// from the debug location and the basic block of \p Inst, respectively.
-  void emitOptimizationRemarkAnalysis(const char *PassName, Instruction *Inst,
-                                      const Twine &Msg,
-                                      bool IsVerbose = false) {
-    emitOptimizationRemarkAnalysis(PassName, Inst->getDebugLoc(),
-                                   Inst->getParent(), Msg, IsVerbose);
-  }
-
-  /// \brief This variant allows specifying what should be emitted for missed
-  /// and analysis remarks in one call.
-  ///
-  /// \p PassName is the name of the pass emitting the message. If
-  /// -Rpass-missed= is given and \p PassName matches the regular expression, \p
-  /// MsgForMissedRemark is emitted.
-  ///
-  /// If -Rpass-analysis= is given and \p PassName matches the regular
-  /// expression, \p MsgForAnalysisRemark is emitted.
-  ///
-  /// The debug location and the code region is derived from \p Inst. If \p
-  /// IsVerbose is true, the message is considered verbose and will only be
-  /// emitted when verbose output is turned on.
-  void emitOptimizationRemarkMissedAndAnalysis(
-      const char *PassName, Instruction *Inst, const Twine &MsgForMissedRemark,
-      const Twine &MsgForAnalysisRemark, bool IsVerbose = false) {
-    emitOptimizationRemarkAnalysis(PassName, Inst, MsgForAnalysisRemark,
-                                   IsVerbose);
-    emitOptimizationRemarkMissed(PassName, Inst, MsgForMissedRemark, IsVerbose);
-  }
-
-  /// \brief Emit an optimization analysis remark related to floating-point
-  /// non-commutativity.
-  ///
-  /// \p PassName is the name of the pass emitting the message. If
-  /// -Rpass-analysis= is given and \p PassName matches the regular expression
-  /// in -Rpass, then the remark will be emitted. \p Fn is the function
-  /// triggering the remark, \p DLoc is the debug location where the diagnostic
-  /// is generated.\p V is the IR Value that identifies the code region.  \p Msg
-  /// is the message string to use.
-  void emitOptimizationRemarkAnalysisFPCommute(const char *PassName,
-                                               const DebugLoc &DLoc,
-                                               const Value *V,
-                                               const Twine &Msg);
-
-  /// \brief Emit an optimization analysis remark related to pointer aliasing.
-  ///
-  /// \p PassName is the name of the pass emitting the message. If
-  /// -Rpass-analysis= is given and \p PassName matches the regular expression
-  /// in -Rpass, then the remark will be emitted. \p Fn is the function
-  /// triggering the remark, \p DLoc is the debug location where the diagnostic
-  /// is generated.\p V is the IR Value that identifies the code region.  \p Msg
-  /// is the message string to use.
-  void emitOptimizationRemarkAnalysisAliasing(const char *PassName,
-                                              const DebugLoc &DLoc,
-                                              const Value *V, const Twine &Msg);
-
-  /// \brief Same as above but derives the IR Value for the code region and the
-  /// debug location from the Loop parameter \p L.
-  void emitOptimizationRemarkAnalysisAliasing(const char *PassName, Loop *L,
-                                              const Twine &Msg);
-
   /// \brief Whether we allow for extra compile-time budget to perform more
   /// analysis to produce fewer false positives.
   ///
@@ -216,7 +88,7 @@ public:
   }
 
 private:
-  Function *F;
+  const Function *F;
 
   BlockFrequencyInfo *BFI;
 
diff --git a/include/llvm/Analysis/ProfileSummaryInfo.h b/include/llvm/Analysis/ProfileSummaryInfo.h
index 80f57fde6fa0729d0d03302d88249b1c24760efe..1aec35c3e677e6a975504eb38145810fcf8c49e9 100644
--- a/include/llvm/Analysis/ProfileSummaryInfo.h
+++ b/include/llvm/Analysis/ProfileSummaryInfo.h
@@ -49,16 +49,22 @@ private:
   void computeThresholds();
   // Count thresholds to answer isHotCount and isColdCount queries.
   Optional<uint64_t> HotCountThreshold, ColdCountThreshold;
-  bool extractProfTotalWeight(const Instruction *TI, uint64_t &TotalCount);
 
 public:
   ProfileSummaryInfo(Module &M) : M(M) {}
   ProfileSummaryInfo(ProfileSummaryInfo &&Arg)
       : M(Arg.M), Summary(std::move(Arg.Summary)) {}
+  /// Returns the profile count for \p CallInst.
+  static Optional<uint64_t> getProfileCount(const Instruction *CallInst,
+                                            BlockFrequencyInfo *BFI);
   /// \brief Returns true if \p F has hot function entry.
   bool isFunctionEntryHot(const Function *F);
+  /// Returns true if \p F has hot function entry or hot call edge.
+  bool isFunctionHotInCallGraph(const Function *F);
   /// \brief Returns true if \p F has cold function entry.
   bool isFunctionEntryCold(const Function *F);
+  /// Returns true if \p F has cold function entry or cold call edge.
+  bool isFunctionColdInCallGraph(const Function *F);
   /// \brief Returns true if \p F is a hot function.
   bool isHotCount(uint64_t C);
   /// \brief Returns true if count \p C is considered cold.
diff --git a/include/llvm/Analysis/PtrUseVisitor.h b/include/llvm/Analysis/PtrUseVisitor.h
index 6e61fc3be38456fe77b0d077ca37e6c8adb3c5ba..2fe7c672526604412b6b9cbcd6757455f9fc4e2a 100644
--- a/include/llvm/Analysis/PtrUseVisitor.h
+++ b/include/llvm/Analysis/PtrUseVisitor.h
@@ -196,7 +196,10 @@ class PtrUseVisitor : protected InstVisitor<DerivedT>,
   typedef InstVisitor<DerivedT> Base;
 
 public:
-  PtrUseVisitor(const DataLayout &DL) : PtrUseVisitorBase(DL) {}
+  PtrUseVisitor(const DataLayout &DL) : PtrUseVisitorBase(DL) {
+    static_assert(std::is_base_of<PtrUseVisitor, DerivedT>::value,
+                  "Must pass the derived type to this template!");
+  }
 
   /// \brief Recursively visit the uses of the given pointer.
   /// \returns An info struct about the pointer. See \c PtrInfo for details.
diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h
index 73955323ba34486c9722c43b55f6343f4f540c7a..9a50de540f2b1b4d6e205844a714e19b231c1d4d 100644
--- a/include/llvm/Analysis/ScalarEvolution.h
+++ b/include/llvm/Analysis/ScalarEvolution.h
@@ -543,6 +543,12 @@ private:
   /// predicate by splitting it into a set of independent predicates.
   bool ProvingSplitPredicate;
 
+  /// Memoized values for the GetMinTrailingZeros method.
+  DenseMap<const SCEV *, uint32_t> MinTrailingZerosCache;
+
+  /// Private helper method for the GetMinTrailingZeros method
+  uint32_t GetMinTrailingZerosImpl(const SCEV *S);
+
   /// Information about the number of loop iterations for which a loop exit's
   /// branch condition evaluates to the not-taken path.  This is a temporary
   /// pair of exact and max expressions that are eventually summarized in
@@ -970,6 +976,20 @@ private:
                              const SCEV *RHS, const SCEV *FoundLHS,
                              const SCEV *FoundRHS);
 
+  /// Test whether the condition described by Pred, LHS, and RHS is true
+  /// whenever the condition described by Pred, FoundLHS, and FoundRHS is
+  /// true. Here LHS is an operation that includes FoundLHS as one of its
+  /// arguments.
+  bool isImpliedViaOperations(ICmpInst::Predicate Pred,
+                              const SCEV *LHS, const SCEV *RHS,
+                              const SCEV *FoundLHS, const SCEV *FoundRHS,
+                              unsigned Depth = 0);
+
+  /// Test whether the condition described by Pred, LHS, and RHS is true.
+  /// Use only simple non-recursive types of checks, such as range analysis etc.
+  bool isKnownViaSimpleReasoning(ICmpInst::Predicate Pred,
+                                 const SCEV *LHS, const SCEV *RHS);
+
   /// Test whether the condition described by Pred, LHS, and RHS is true
   /// whenever the condition described by Pred, FoundLHS, and FoundRHS is
   /// true.
@@ -1117,6 +1137,9 @@ public:
   /// return true. For pointer types, this is the pointer-sized integer type.
   Type *getEffectiveSCEVType(Type *Ty) const;
 
+  // Returns a wider type among {Ty1, Ty2}.
+  Type *getWiderType(Type *Ty1, Type *Ty2) const;
+
   /// Return true if the SCEV is a scAddRecExpr or it contains
   /// scAddRecExpr. The result will be cached in HasRecMap.
   ///
@@ -1140,7 +1163,8 @@ public:
   const SCEV *getSignExtendExpr(const SCEV *Op, Type *Ty);
   const SCEV *getAnyExtendExpr(const SCEV *Op, Type *Ty);
   const SCEV *getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
-                         SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap);
+                         SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap,
+                         unsigned Depth = 0);
   const SCEV *getAddExpr(const SCEV *LHS, const SCEV *RHS,
                          SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap) {
     SmallVector<const SCEV *, 2> Ops = {LHS, RHS};
@@ -1289,7 +1313,7 @@ public:
   ///
   /// Implemented in terms of the \c getSmallConstantTripCount overload with
   /// the single exiting block passed to it. See that routine for details.
-  unsigned getSmallConstantTripCount(Loop *L);
+  unsigned getSmallConstantTripCount(const Loop *L);
 
   /// Returns the maximum trip count of this loop as a normal unsigned
   /// value. Returns 0 if the trip count is unknown or not constant. This
@@ -1298,12 +1322,12 @@ public:
   /// before taking the branch. For loops with multiple exits, it may not be
   /// the number times that the loop header executes if the loop exits
   /// prematurely via another branch.
-  unsigned getSmallConstantTripCount(Loop *L, BasicBlock *ExitingBlock);
+  unsigned getSmallConstantTripCount(const Loop *L, BasicBlock *ExitingBlock);
 
   /// Returns the upper bound of the loop trip count as a normal unsigned
   /// value.
   /// Returns 0 if the trip count is unknown or not constant.
-  unsigned getSmallConstantMaxTripCount(Loop *L);
+  unsigned getSmallConstantMaxTripCount(const Loop *L);
 
   /// Returns the largest constant divisor of the trip count of the
   /// loop if it is a single-exit loop and we can compute a small maximum for
@@ -1311,7 +1335,7 @@ public:
   ///
   /// Implemented in terms of the \c getSmallConstantTripMultiple overload with
   /// the single exiting block passed to it. See that routine for details.
-  unsigned getSmallConstantTripMultiple(Loop *L);
+  unsigned getSmallConstantTripMultiple(const Loop *L);
 
   /// Returns the largest constant divisor of the trip count of this loop as a
   /// normal unsigned value, if possible. This means that the actual trip
@@ -1319,12 +1343,13 @@ public:
   /// count could very well be zero as well!). As explained in the comments
   /// for getSmallConstantTripCount, this assumes that control exits the loop
   /// via ExitingBlock.
-  unsigned getSmallConstantTripMultiple(Loop *L, BasicBlock *ExitingBlock);
+  unsigned getSmallConstantTripMultiple(const Loop *L,
+                                        BasicBlock *ExitingBlock);
 
   /// Get the expression for the number of loop iterations for which this loop
   /// is guaranteed not to exit via ExitingBlock. Otherwise return
   /// SCEVCouldNotCompute.
-  const SCEV *getExitCount(Loop *L, BasicBlock *ExitingBlock);
+  const SCEV *getExitCount(const Loop *L, BasicBlock *ExitingBlock);
 
   /// If the specified loop has a predictable backedge-taken count, return it,
   /// otherwise return a SCEVCouldNotCompute object. The backedge-taken count
@@ -1613,6 +1638,10 @@ private:
   bool doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride, bool IsSigned,
                           bool NoWrap);
 
+  /// Get add expr already created or create a new one
+  const SCEV *getOrCreateAddExpr(SmallVectorImpl<const SCEV *> &Ops,
+                                 SCEV::NoWrapFlags Flags);
+
 private:
   FoldingSet<SCEV> UniqueSCEVs;
   FoldingSet<SCEVPredicate> UniquePreds;
diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h
index f947c46268ce1e952cde2afe42c13ed2a6e105f4..67196687d55610cde207e4cc4c209e0ea92c534a 100644
--- a/include/llvm/Analysis/TargetTransformInfo.h
+++ b/include/llvm/Analysis/TargetTransformInfo.h
@@ -44,23 +44,26 @@ class Value;
 
 /// \brief Information about a load/store intrinsic defined by the target.
 struct MemIntrinsicInfo {
-  MemIntrinsicInfo()
-      : ReadMem(false), WriteMem(false), IsSimple(false), MatchingId(0),
-        NumMemRefs(0), PtrVal(nullptr) {}
-  bool ReadMem;
-  bool WriteMem;
-  /// True only if this memory operation is non-volatile, non-atomic, and
-  /// unordered.  (See LoadInst/StoreInst for details on each)
-  bool IsSimple;
-  // Same Id is set by the target for corresponding load/store intrinsics.
-  unsigned short MatchingId;
-  int NumMemRefs;
-
   /// This is the pointer that the intrinsic is loading from or storing to.
   /// If this is non-null, then analysis/optimization passes can assume that
   /// this intrinsic is functionally equivalent to a load/store from this
   /// pointer.
-  Value *PtrVal;
+  Value *PtrVal = nullptr;
+
+  // Ordering for atomic operations.
+  AtomicOrdering Ordering = AtomicOrdering::NotAtomic;
+
+  // Same Id is set by the target for corresponding load/store intrinsics.
+  unsigned short MatchingId = 0;
+
+  bool ReadMem = false;
+  bool WriteMem = false;
+  bool IsVolatile = false;
+
+  bool isUnordered() const {
+    return (Ordering == AtomicOrdering::NotAtomic ||
+            Ordering == AtomicOrdering::Unordered) && !IsVolatile;
+  }
 };
 
 /// \brief This pass provides access to the codegen interfaces that are needed
@@ -434,6 +437,11 @@ public:
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                             unsigned VF) const;
 
+  /// If target has efficient vector element load/store instructions, it can
+  /// return true here so that insertion/extraction costs are not added to
+  /// the scalarization cost of a load/store.
+  bool supportsEfficientVectorElementLoadStore() const;
+
   /// \brief Don't restrict interleaved unrolling to small loops.
   bool enableAggressiveInterleaving(bool LoopHasReductions) const;
 
@@ -523,6 +531,12 @@ public:
   /// \return The width of the largest scalar or vector register type.
   unsigned getRegisterBitWidth(bool Vector) const;
 
+  /// \return True if \p I should be considered for address type promotion.
+  /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is
+  /// profitable without finding other extensions fed by the same input.
+  bool shouldConsiderAddressTypePromotion(
+      const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const;
+
   /// \return The size of a cache line in bytes.
   unsigned getCacheLineSize() const;
 
@@ -563,8 +577,10 @@ public:
                      Type *SubTp = nullptr) const;
 
   /// \return The expected cost of cast instructions, such as bitcast, trunc,
-  /// zext, etc.
-  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) const;
+  /// zext, etc. If there is an existing instruction that holds Opcode, it
+  /// may be passed in the 'I' parameter.
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                       const Instruction *I = nullptr) const;
 
   /// \return The expected cost of a sign- or zero-extended vector extract. Use
   /// -1 to indicate that there is no information about the index value.
@@ -575,9 +591,11 @@ public:
   /// Phi, Ret, Br.
   int getCFInstrCost(unsigned Opcode) const;
 
-  /// \returns The expected cost of compare and select instructions.
+  /// \returns The expected cost of compare and select instructions. If there
+  /// is an existing instruction that holds Opcode, it may be passed in the
+  /// 'I' parameter.
   int getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                         Type *CondTy = nullptr) const;
+                 Type *CondTy = nullptr, const Instruction *I = nullptr) const;
 
   /// \return The expected cost of vector Insert and Extract.
   /// Use -1 to indicate that there is no information on the index value.
@@ -585,7 +603,7 @@ public:
 
   /// \return The cost of Load and Store instructions.
   int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                      unsigned AddressSpace) const;
+                      unsigned AddressSpace, const Instruction *I = nullptr) const;
 
   /// \return The cost of masked Load and Store instructions.
   int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
@@ -628,13 +646,19 @@ public:
   ///  ((v0+v2), (v1+v3), undef, undef)
   int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm) const;
 
-  /// \returns The cost of Intrinsic instructions. Types analysis only.
+  /// \returns The cost of Intrinsic instructions. Analyses the real arguments.
+  /// Three cases are handled: 1. scalar instruction 2. vector instruction
+  /// 3. scalar instruction which is to be vectorized with VF.
   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                            ArrayRef<Type *> Tys, FastMathFlags FMF) const;
+                            ArrayRef<Value *> Args, FastMathFlags FMF,
+                            unsigned VF = 1) const;
 
-  /// \returns The cost of Intrinsic instructions. Analyses the real arguments.
+  /// \returns The cost of Intrinsic instructions. Types analysis only.
+  /// If ScalarizationCostPassed is UINT_MAX, the cost of scalarizing the
+  /// arguments and the return value will be computed based on types.
   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                            ArrayRef<Value *> Args, FastMathFlags FMF) const;
+                            ArrayRef<Type *> Tys, FastMathFlags FMF,
+                            unsigned ScalarizationCostPassed = UINT_MAX) const;
 
   /// \returns The cost of Call instructions.
   int getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type *> Tys) const;
@@ -771,6 +795,7 @@ public:
   getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) = 0;
   virtual unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                                     unsigned VF) = 0;
+  virtual bool supportsEfficientVectorElementLoadStore() = 0;
   virtual bool enableAggressiveInterleaving(bool LoopHasReductions) = 0;
   virtual bool enableInterleavedAccessVectorization() = 0;
   virtual bool isFPVectorizationPotentiallyUnsafe() = 0;
@@ -791,6 +816,8 @@ public:
                             Type *Ty) = 0;
   virtual unsigned getNumberOfRegisters(bool Vector) = 0;
   virtual unsigned getRegisterBitWidth(bool Vector) = 0;
+  virtual bool shouldConsiderAddressTypePromotion(
+      const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0;
   virtual unsigned getCacheLineSize() = 0;
   virtual unsigned getPrefetchDistance() = 0;
   virtual unsigned getMinPrefetchStride() = 0;
@@ -804,16 +831,17 @@ public:
                          ArrayRef<const Value *> Args) = 0;
   virtual int getShuffleCost(ShuffleKind Kind, Type *Tp, int Index,
                              Type *SubTp) = 0;
-  virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) = 0;
+  virtual int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                               const Instruction *I) = 0;
   virtual int getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                        VectorType *VecTy, unsigned Index) = 0;
   virtual int getCFInstrCost(unsigned Opcode) = 0;
   virtual int getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                                 Type *CondTy) = 0;
+                                Type *CondTy, const Instruction *I) = 0;
   virtual int getVectorInstrCost(unsigned Opcode, Type *Val,
                                  unsigned Index) = 0;
   virtual int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                              unsigned AddressSpace) = 0;
+                              unsigned AddressSpace, const Instruction *I) = 0;
   virtual int getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
                                     unsigned Alignment,
                                     unsigned AddressSpace) = 0;
@@ -828,11 +856,10 @@ public:
   virtual int getReductionCost(unsigned Opcode, Type *Ty,
                                bool IsPairwiseForm) = 0;
   virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                    ArrayRef<Type *> Tys,
-                                    FastMathFlags FMF) = 0;
+                      ArrayRef<Type *> Tys, FastMathFlags FMF,
+                      unsigned ScalarizationCostPassed) = 0;
   virtual int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                    ArrayRef<Value *> Args,
-                                    FastMathFlags FMF) = 0;
+         ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) = 0;
   virtual int getCallInstrCost(Function *F, Type *RetTy,
                                ArrayRef<Type *> Tys) = 0;
   virtual unsigned getNumberOfParts(Type *Tp) = 0;
@@ -975,6 +1002,10 @@ public:
     return Impl.getOperandsScalarizationOverhead(Args, VF);
   }
 
+  bool supportsEfficientVectorElementLoadStore() override {
+    return Impl.supportsEfficientVectorElementLoadStore();
+  }
+
   bool enableAggressiveInterleaving(bool LoopHasReductions) override {
     return Impl.enableAggressiveInterleaving(LoopHasReductions);
   }
@@ -1018,7 +1049,11 @@ public:
   unsigned getRegisterBitWidth(bool Vector) override {
     return Impl.getRegisterBitWidth(Vector);
   }
-
+  bool shouldConsiderAddressTypePromotion(
+      const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override {
+    return Impl.shouldConsiderAddressTypePromotion(
+        I, AllowPromotionWithoutCommonHeader);
+  }
   unsigned getCacheLineSize() override {
     return Impl.getCacheLineSize();
   }
@@ -1045,8 +1080,9 @@ public:
                      Type *SubTp) override {
     return Impl.getShuffleCost(Kind, Tp, Index, SubTp);
   }
-  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) override {
-    return Impl.getCastInstrCost(Opcode, Dst, Src);
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                       const Instruction *I) override {
+    return Impl.getCastInstrCost(Opcode, Dst, Src, I);
   }
   int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
                                unsigned Index) override {
@@ -1055,15 +1091,16 @@ public:
   int getCFInstrCost(unsigned Opcode) override {
     return Impl.getCFInstrCost(Opcode);
   }
-  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) override {
-    return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy);
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                         const Instruction *I) override {
+    return Impl.getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
   }
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) override {
     return Impl.getVectorInstrCost(Opcode, Val, Index);
   }
   int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                      unsigned AddressSpace) override {
-    return Impl.getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+                      unsigned AddressSpace, const Instruction *I) override {
+    return Impl.getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
   }
   int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                             unsigned AddressSpace) override {
@@ -1086,13 +1123,13 @@ public:
     return Impl.getReductionCost(Opcode, Ty, IsPairwiseForm);
   }
   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, ArrayRef<Type *> Tys,
-                            FastMathFlags FMF) override {
-    return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
+               FastMathFlags FMF, unsigned ScalarizationCostPassed) override {
+    return Impl.getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
+                                      ScalarizationCostPassed);
   }
   int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                            ArrayRef<Value *> Args,
-                            FastMathFlags FMF) override {
-    return Impl.getIntrinsicInstrCost(ID, RetTy, Args, FMF);
+       ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) override {
+    return Impl.getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
   }
   int getCallInstrCost(Function *F, Type *RetTy,
                        ArrayRef<Type *> Tys) override {
diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h
index df503449cf98d260c71c16ff3e6885db8443ca0f..9ab6b7445ab86cd157b9393852d0219833cb5162 100644
--- a/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ b/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -262,6 +262,8 @@ public:
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                             unsigned VF) { return 0; }
 
+  bool supportsEfficientVectorElementLoadStore() { return false; }
+
   bool enableAggressiveInterleaving(bool LoopHasReductions) { return false; }
 
   bool enableInterleavedAccessVectorization() { return false; }
@@ -303,6 +305,13 @@ public:
 
   unsigned getRegisterBitWidth(bool Vector) { return 32; }
 
+  bool
+  shouldConsiderAddressTypePromotion(const Instruction &I,
+                                     bool &AllowPromotionWithoutCommonHeader) {
+    AllowPromotionWithoutCommonHeader = false;
+    return false;
+  }
+
   unsigned getCacheLineSize() { return 0; }
 
   unsigned getPrefetchDistance() { return 0; }
@@ -327,7 +336,8 @@ public:
     return 1;
   }
 
-  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) { return 1; }
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                            const Instruction *I) { return 1; }
 
   unsigned getExtractWithExtendCost(unsigned Opcode, Type *Dst,
                                     VectorType *VecTy, unsigned Index) {
@@ -336,7 +346,8 @@ public:
 
   unsigned getCFInstrCost(unsigned Opcode) { return 1; }
 
-  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                              const Instruction *I) {
     return 1;
   }
 
@@ -345,7 +356,7 @@ public:
   }
 
   unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                           unsigned AddressSpace) {
+                           unsigned AddressSpace, const Instruction *I) {
     return 1;
   }
 
@@ -369,11 +380,12 @@ public:
   }
 
   unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                 ArrayRef<Type *> Tys, FastMathFlags FMF) {
+                                 ArrayRef<Type *> Tys, FastMathFlags FMF,
+                                 unsigned ScalarizationCostPassed) {
     return 1;
   }
   unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                 ArrayRef<Value *> Args, FastMathFlags FMF) {
+            ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
     return 1;
   }
 
diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h
index 5121c8b0c176a040b7365744df42ce4c54d07bfe..e3c2f3bed2277537c90fb51a5240eb5f66d78991 100644
--- a/include/llvm/Analysis/ValueTracking.h
+++ b/include/llvm/Analysis/ValueTracking.h
@@ -31,6 +31,7 @@ template <typename T> class ArrayRef;
   class Instruction;
   class Loop;
   class LoopInfo;
+  class OptimizationRemarkEmitter;
   class MDNode;
   class StringRef;
   class TargetLibraryInfo;
@@ -52,7 +53,8 @@ template <typename T> class ArrayRef;
                         const DataLayout &DL, unsigned Depth = 0,
                         AssumptionCache *AC = nullptr,
                         const Instruction *CxtI = nullptr,
-                        const DominatorTree *DT = nullptr);
+                        const DominatorTree *DT = nullptr,
+                        OptimizationRemarkEmitter *ORE = nullptr);
   /// Compute known bits from the range metadata.
   /// \p KnownZero the set of bits that are known to be zero
   /// \p KnownOne the set of bits that are known to be one
@@ -86,8 +88,10 @@ template <typename T> class ArrayRef;
 
   /// Return true if the given value is known to be non-zero when defined. For
   /// vectors, return true if every element is known to be non-zero when
-  /// defined. Supports values with integer or pointer type and vectors of
-  /// integers.
+  /// defined. For pointers, if the context instruction and dominator tree are
+  /// specified, perform context-sensitive analysis and return true if the
+  /// pointer couldn't possibly be null at the specified instruction.
+  /// Supports values with integer or pointer type and vectors of integers.
   bool isKnownNonZero(const Value *V, const DataLayout &DL, unsigned Depth = 0,
                       AssumptionCache *AC = nullptr,
                       const Instruction *CxtI = nullptr,
diff --git a/include/llvm/Analysis/VectorUtils.h b/include/llvm/Analysis/VectorUtils.h
index eaa068b89c774c7d06e5648fd3d1125a816e9163..6315e8408f056bf6b10276c6f3f395815ae2d974 100644
--- a/include/llvm/Analysis/VectorUtils.h
+++ b/include/llvm/Analysis/VectorUtils.h
@@ -1,4 +1,4 @@
-//===- llvm/Transforms/Utils/VectorUtils.h - Vector utilities -*- C++ -*-=====//
+//===- llvm/Analysis/VectorUtils.h - Vector utilities -----------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,11 +11,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_TRANSFORMS_UTILS_VECTORUTILS_H
-#define LLVM_TRANSFORMS_UTILS_VECTORUTILS_H
+#ifndef LLVM_ANALYSIS_VECTORUTILS_H
+#define LLVM_ANALYSIS_VECTORUTILS_H
 
 #include "llvm/ADT/MapVector.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/IR/IRBuilder.h"
 
 namespace llvm {
 
@@ -123,6 +124,58 @@ computeMinimumValueSizes(ArrayRef<BasicBlock*> Blocks,
 /// This function always sets a (possibly null) value for each K in Kinds.
 Instruction *propagateMetadata(Instruction *I, ArrayRef<Value *> VL);
 
+/// \brief Create an interleave shuffle mask.
+///
+/// This function creates a shuffle mask for interleaving \p NumVecs vectors of
+/// vectorization factor \p VF into a single wide vector. The mask is of the
+/// form:
+///
+///   <0, VF, VF * 2, ..., VF * (NumVecs - 1), 1, VF + 1, VF * 2 + 1, ...>
+///
+/// For example, the mask for VF = 4 and NumVecs = 2 is:
+///
+///   <0, 4, 1, 5, 2, 6, 3, 7>.
+Constant *createInterleaveMask(IRBuilder<> &Builder, unsigned VF,
+                               unsigned NumVecs);
+
+/// \brief Create a stride shuffle mask.
+///
+/// This function creates a shuffle mask whose elements begin at \p Start and
+/// are incremented by \p Stride. The mask can be used to deinterleave an
+/// interleaved vector into separate vectors of vectorization factor \p VF. The
+/// mask is of the form:
+///
+///   <Start, Start + Stride, ..., Start + Stride * (VF - 1)>
+///
+/// For example, the mask for Start = 0, Stride = 2, and VF = 4 is:
+///
+///   <0, 2, 4, 6>
+Constant *createStrideMask(IRBuilder<> &Builder, unsigned Start,
+                           unsigned Stride, unsigned VF);
+
+/// \brief Create a sequential shuffle mask.
+///
+/// This function creates a shuffle mask whose elements are sequential and begin
+/// at \p Start.  The mask contains \p NumInts integers and is padded with \p
+/// NumUndefs undef values. The mask is of the form:
+///
+///   <Start, Start + 1, ... Start + NumInts - 1, undef_1, ... undef_NumUndefs>
+///
+/// For example, the mask for Start = 0, NumInts = 4, and NumUndefs = 4 is:
+///
+///   <0, 1, 2, 3, undef, undef, undef, undef>
+Constant *createSequentialMask(IRBuilder<> &Builder, unsigned Start,
+                               unsigned NumInts, unsigned NumUndefs);
+
+/// \brief Concatenate a list of vectors.
+///
+/// This function generates code that concatenates the vectors in \p Vecs into a
+/// single large vector. The number of vectors should be greater than one, and
+/// their element types should be the same. The number of elements in the
+/// vectors should also be the same; however, if the last vector has fewer
+/// elements, it will be padded with undefs.
+Value *concatenateVectors(IRBuilder<> &Builder, ArrayRef<Value *> Vecs);
+
 } // llvm namespace
 
 #endif
diff --git a/include/llvm/Bitcode/BitcodeWriter.h b/include/llvm/Bitcode/BitcodeWriter.h
index 4f72f98bbf9c5a12bd5af778070e7ecc7a4b30d3..271cb2d81bbb20bf8f8e08b21539deaa1437756c 100644
--- a/include/llvm/Bitcode/BitcodeWriter.h
+++ b/include/llvm/Bitcode/BitcodeWriter.h
@@ -43,9 +43,16 @@ namespace llvm {
     ///
     /// \p GenerateHash enables hashing the Module and including the hash in the
     /// bitcode (currently for use in ThinLTO incremental build).
+    ///
+    /// If \p ModHash is non-null, when GenerateHash is true, the resulting
+    /// hash is written into ModHash. When GenerateHash is false, that value
+    /// is used as the hash instead of computing from the generated bitcode.
+    /// Can be used to produce the same module hash for a minimized bitcode
+    /// used just for the thin link as in the regular full bitcode that will
+    /// be used in the backend.
     void writeModule(const Module *M, bool ShouldPreserveUseListOrder = false,
                      const ModuleSummaryIndex *Index = nullptr,
-                     bool GenerateHash = false);
+                     bool GenerateHash = false, ModuleHash *ModHash = nullptr);
   };
 
   /// \brief Write the specified module to the specified raw output stream.
@@ -62,10 +69,18 @@ namespace llvm {
   ///
   /// \p GenerateHash enables hashing the Module and including the hash in the
   /// bitcode (currently for use in ThinLTO incremental build).
+  ///
+  /// If \p ModHash is non-null, when GenerateHash is true, the resulting
+  /// hash is written into ModHash. When GenerateHash is false, that value
+  /// is used as the hash instead of computing from the generated bitcode.
+  /// Can be used to produce the same module hash for a minimized bitcode
+  /// used just for the thin link as in the regular full bitcode that will
+  /// be used in the backend.
   void WriteBitcodeToFile(const Module *M, raw_ostream &Out,
                           bool ShouldPreserveUseListOrder = false,
                           const ModuleSummaryIndex *Index = nullptr,
-                          bool GenerateHash = false);
+                          bool GenerateHash = false,
+                          ModuleHash *ModHash = nullptr);
 
   /// Write the specified module summary index to the given raw output stream,
   /// where it will be written in a new bitcode block. This is used when
diff --git a/include/llvm/Bitcode/LLVMBitCodes.h b/include/llvm/Bitcode/LLVMBitCodes.h
index c996c38261c0d055f17731cc9ee86c4e6afcfe30..e2d2fbb0f449aae2051a88eb02264e2eadb90d4d 100644
--- a/include/llvm/Bitcode/LLVMBitCodes.h
+++ b/include/llvm/Bitcode/LLVMBitCodes.h
@@ -92,9 +92,6 @@ enum ModuleCodes {
   // ALIAS: [alias type, aliasee val#, linkage, visibility]
   MODULE_CODE_ALIAS_OLD = 9,
 
-  // MODULE_CODE_PURGEVALS: [numvals]
-  MODULE_CODE_PURGEVALS = 10,
-
   MODULE_CODE_GCNAME = 11, // GCNAME: [strchr x N]
   MODULE_CODE_COMDAT = 12, // COMDAT: [selection_kind, name]
 
@@ -213,8 +210,28 @@ enum GlobalValueSummarySymtabCodes {
   FS_COMBINED_ORIGINAL_NAME = 9,
   // VERSION of the summary, bumped when adding flags for instance.
   FS_VERSION = 10,
-  // The list of llvm.type.test type identifiers used by the following function.
+  // The list of llvm.type.test type identifiers used by the following function
+  // that are used other than by an llvm.assume.
+  // [n x typeid]
   FS_TYPE_TESTS = 11,
+  // The list of virtual calls made by this function using
+  // llvm.assume(llvm.type.test) intrinsics that do not have all constant
+  // integer arguments.
+  // [n x (typeid, offset)]
+  FS_TYPE_TEST_ASSUME_VCALLS = 12,
+  // The list of virtual calls made by this function using
+  // llvm.type.checked.load intrinsics that do not have all constant integer
+  // arguments.
+  // [n x (typeid, offset)]
+  FS_TYPE_CHECKED_LOAD_VCALLS = 13,
+  // Identifies a virtual call made by this function using an
+  // llvm.assume(llvm.type.test) intrinsic with all constant integer arguments.
+  // [typeid, offset, n x arg]
+  FS_TYPE_TEST_ASSUME_CONST_VCALL = 14,
+  // Identifies a virtual call made by this function using an
+  // llvm.type.checked.load intrinsic with all constant integer arguments.
+  // [typeid, offset, n x arg]
+  FS_TYPE_CHECKED_LOAD_CONST_VCALL = 15,
 };
 
 enum MetadataCodes {
diff --git a/include/llvm/CodeGen/Analysis.h b/include/llvm/CodeGen/Analysis.h
index f20185c4499adaaffffcea811510e450ea6c7b4b..ba88f1f78fb836717ef05493d97a10f43d7b989c 100644
--- a/include/llvm/CodeGen/Analysis.h
+++ b/include/llvm/CodeGen/Analysis.h
@@ -123,13 +123,6 @@ bool returnTypeIsEligibleForTailCall(const Function *F, const Instruction *I,
                                      const ReturnInst *Ret,
                                      const TargetLoweringBase &TLI);
 
-// True if GV can be left out of the object symbol table. This is the case
-// for linkonce_odr values whose address is not significant. While legal, it is
-// not normally profitable to omit them from the .o symbol table. Using this
-// analysis makes sense when the information can be passed down to the linker
-// or we are in LTO.
-bool canBeOmittedFromSymbolTable(const GlobalValue *GV);
-
 DenseMap<const MachineBasicBlock *, int>
 getFuncletMembership(const MachineFunction &MF);
 
diff --git a/include/llvm/CodeGen/AsmPrinter.h b/include/llvm/CodeGen/AsmPrinter.h
index 4daca0347b77bdaed937e16214e6a674e7a08d13..772043fa3ce3632fb92d24119eaee93c283363db 100644
--- a/include/llvm/CodeGen/AsmPrinter.h
+++ b/include/llvm/CodeGen/AsmPrinter.h
@@ -1,4 +1,4 @@
-//===-- llvm/CodeGen/AsmPrinter.h - AsmPrinter Framework --------*- C++ -*-===//
+//===- llvm/CodeGen/AsmPrinter.h - AsmPrinter Framework ---------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -17,35 +17,43 @@
 #define LLVM_CODEGEN_ASMPRINTER_H
 
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/DwarfStringPoolEntry.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/IR/InlineAsm.h"
-#include "llvm/Support/DataTypes.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SourceMgr.h"
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
 
 namespace llvm {
+
 class AsmPrinterHandler;
 class BlockAddress;
-class ByteStreamer;
-class GCStrategy;
 class Constant;
 class ConstantArray;
+class DataLayout;
 class DIE;
 class DIEAbbrev;
+class DwarfDebug;
 class GCMetadataPrinter;
 class GlobalIndirectSymbol;
 class GlobalValue;
 class GlobalVariable;
+class GCStrategy;
 class MachineBasicBlock;
+class MachineConstantPoolValue;
 class MachineFunction;
 class MachineInstr;
-class MachineLocation;
-class MachineLoopInfo;
-class MachineLoop;
-class MachineConstantPoolValue;
 class MachineJumpTableInfo;
+class MachineLoopInfo;
 class MachineModuleInfo;
+class MachineOptimizationRemarkEmitter;
 class MCAsmInfo;
 class MCCFIInstruction;
 class MCContext;
@@ -57,10 +65,7 @@ class MCSubtargetInfo;
 class MCSymbol;
 class MCTargetOptions;
 class MDNode;
-class DwarfDebug;
-class Mangler;
 class TargetLoweringObjectFile;
-class DataLayout;
 class TargetMachine;
 
 /// This class is intended to be used as a driving class for all asm writers.
@@ -84,20 +89,23 @@ public:
   std::unique_ptr<MCStreamer> OutStreamer;
 
   /// The current machine function.
-  const MachineFunction *MF;
+  const MachineFunction *MF = nullptr;
 
   /// This is a pointer to the current MachineModuleInfo.
-  MachineModuleInfo *MMI;
+  MachineModuleInfo *MMI = nullptr;
+
+  /// Optimization remark emitter; null until one is assigned.
+  MachineOptimizationRemarkEmitter *ORE = nullptr;
 
   /// The symbol for the current function. This is recalculated at the beginning
   /// of each call to runOnMachineFunction().
   ///
-  MCSymbol *CurrentFnSym;
+  MCSymbol *CurrentFnSym = nullptr;
 
   /// The symbol used to represent the start of the current function for the
   /// purpose of calculating its size (e.g. using the .size directive). By
   /// default, this is equal to CurrentFnSym.
-  MCSymbol *CurrentFnSymForSize;
+  MCSymbol *CurrentFnSymForSize = nullptr;
 
   /// Map global GOT equivalent MCSymbols to GlobalVariables and keep track of
   /// its number of uses by other globals.
@@ -105,12 +113,12 @@ public:
   MapVector<const MCSymbol *, GOTEquivUsePair> GlobalGOTEquivs;
 
 private:
-  MCSymbol *CurrentFnBegin;
-  MCSymbol *CurrentFnEnd;
-  MCSymbol *CurExceptionSym;
+  MCSymbol *CurrentFnBegin = nullptr;
+  MCSymbol *CurrentFnEnd = nullptr;
+  MCSymbol *CurExceptionSym = nullptr;
 
   // The garbage collection metadata printer table.
-  void *GCMetadataPrinters; // Really a DenseMap.
+  void *GCMetadataPrinters = nullptr; // Really a DenseMap.
 
   /// Emit comments in assembly output if this is true.
   ///
@@ -118,7 +126,7 @@ private:
   static char ID;
 
   /// If VerboseAsm is set, a pointer to the loop info for this function.
-  MachineLoopInfo *LI;
+  MachineLoopInfo *LI = nullptr;
 
   struct HandlerInfo {
     AsmPrinterHandler *Handler;
@@ -126,6 +134,7 @@ private:
     const char *TimerDescription;
     const char *TimerGroupName;
     const char *TimerGroupDescription;
+
     HandlerInfo(AsmPrinterHandler *Handler, const char *TimerName,
                 const char *TimerDescription, const char *TimerGroupName,
                 const char *TimerGroupDescription)
@@ -137,11 +146,24 @@ private:
   /// maintains ownership of the emitters.
   SmallVector<HandlerInfo, 1> Handlers;
 
+public:
+  struct SrcMgrDiagInfo {
+    SourceMgr SrcMgr;
+    std::vector<const MDNode *> LocInfos;
+    LLVMContext::InlineAsmDiagHandlerTy DiagHandler;
+    void *DiagContext;
+  };
+
+private:
+  /// Structure for generating diagnostics for inline assembly. Only initialised
+  /// when necessary.
+  mutable std::unique_ptr<SrcMgrDiagInfo> DiagInfo;
+
   /// If the target supports dwarf debug info, this pointer is non-null.
-  DwarfDebug *DD;
+  DwarfDebug *DD = nullptr;
 
   /// If the current module uses dwarf CFI annotations strictly for debugging.
-  bool isCFIMoveForDebugging;
+  bool isCFIMoveForDebugging = false;
 
 protected:
   explicit AsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer);
@@ -200,6 +222,7 @@ public:
     FUNCTION_ENTER = 0,
     FUNCTION_EXIT = 1,
     TAIL_CALL = 2,
+    LOG_ARGS_ENTER = 3,
   };
 
   // The table will contain these structs that point to the sled, the function
@@ -381,7 +404,7 @@ public:
   //===------------------------------------------------------------------===//
   // Symbol Lowering Routines.
   //===------------------------------------------------------------------===//
-public:
+
   MCSymbol *createTempSymbol(const Twine &Name) const;
 
   /// Return the MCSymbol for a private symbol with global value name as its
@@ -407,7 +430,7 @@ public:
   //===------------------------------------------------------------------===//
   // Emission Helper Routines.
   //===------------------------------------------------------------------===//
-public:
+
   /// This is just convenient handler for printing offsets.
   void printOffset(int64_t Offset, raw_ostream &OS) const;
 
@@ -484,7 +507,7 @@ public:
   ///
   /// \p Value - The value to emit.
   /// \p Size - The size of the integer (in bytes) to emit.
-  virtual void EmitDebugValue(const MCExpr *Value, unsigned Size) const;
+  virtual void EmitDebugThreadLocal(const MCExpr *Value, unsigned Size) const;
 
   //===------------------------------------------------------------------===//
   // Dwarf Lowering Routines
@@ -511,7 +534,7 @@ public:
   //===------------------------------------------------------------------===//
   // Inline Asm Support
   //===------------------------------------------------------------------===//
-public:
+
   // These are hooks that targets can override to implement inline asm
   // support.  These should probably be moved out of AsmPrinter someday.
 
@@ -555,9 +578,9 @@ public:
 private:
   /// Private state for PrintSpecial()
   // Assign a unique ID to this machine instruction.
-  mutable const MachineInstr *LastMI;
-  mutable unsigned LastFn;
-  mutable unsigned Counter;
+  mutable const MachineInstr *LastMI = nullptr;
+  mutable unsigned LastFn = 0;
+  mutable unsigned Counter = ~0U;
 
   /// This method emits the header for the current function.
   virtual void EmitFunctionHeader();
@@ -596,6 +619,7 @@ private:
   void emitGlobalIndirectSymbol(Module &M,
                                 const GlobalIndirectSymbol& GIS);
 };
-}
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_ASMPRINTER_H
diff --git a/include/llvm/CodeGen/BasicTTIImpl.h b/include/llvm/CodeGen/BasicTTIImpl.h
index ba774b5d1607b98122a92d270f38367f3c8a49ae..e30e947f787f18ad27f89f0cc6855098d211d541 100644
--- a/include/llvm/CodeGen/BasicTTIImpl.h
+++ b/include/llvm/CodeGen/BasicTTIImpl.h
@@ -306,16 +306,46 @@ public:
     return Cost;
   }
 
-  /// Estimate the overhead of scalarizing an instructions unique operands.
+  /// Estimate the overhead of scalarizing an instruction's unique
+  /// non-constant operands. The types of the arguments are ordinarily
+  /// scalar, in which case the costs are multiplied with VF.
   unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
                                             unsigned VF) {
     unsigned Cost = 0;
     SmallPtrSet<const Value*, 4> UniqueOperands;
     for (const Value *A : Args) {
-      if (UniqueOperands.insert(A).second)
-        Cost += getScalarizationOverhead(VectorType::get(A->getType(), VF),
-                                         false, true);
+      if (!isa<Constant>(A) && UniqueOperands.insert(A).second) {
+        Type *VecTy = nullptr;
+        if (A->getType()->isVectorTy()) {
+          VecTy = A->getType();
+          // If A is a vector operand, VF should be 1 or correspond to A.
+          assert ((VF == 1 || VF == VecTy->getVectorNumElements()) &&
+                  "Vector argument does not match VF");
+        }
+        else
+          VecTy = VectorType::get(A->getType(), VF);
+
+        Cost += getScalarizationOverhead(VecTy, false, true);
+      }
     }
+
+    return Cost;
+  }
+
+  unsigned getScalarizationOverhead(Type *VecTy, ArrayRef<const Value *> Args) {
+    assert (VecTy->isVectorTy());
+    
+    unsigned Cost = 0;
+
+    Cost += getScalarizationOverhead(VecTy, true, false);
+    if (!Args.empty())
+      Cost += getOperandsScalarizationOverhead(Args,
+                                               VecTy->getVectorNumElements());
+    else
+      // When no information on arguments is provided, we add the cost
+      // associated with one argument as a heuristic.
+      Cost += getScalarizationOverhead(VecTy, false, true);
+
     return Cost;
   }
 
@@ -361,15 +391,7 @@ public:
                           ->getArithmeticInstrCost(Opcode, Ty->getScalarType());
       // Return the cost of multiple scalar invocation plus the cost of
       // inserting and extracting the values.
-      unsigned TotCost = getScalarizationOverhead(Ty, true, false) + Num * Cost;
-      if (!Args.empty())
-        TotCost += getOperandsScalarizationOverhead(Args, Num);
-      else
-        // When no information on arguments is provided, we add the cost
-        // associated with one argument as a heuristic.
-        TotCost += getScalarizationOverhead(Ty, false, true);
-
-      return TotCost;
+      return getScalarizationOverhead(Ty, Args) + Num * Cost;
     }
 
     // We don't know anything about this scalar instruction.
@@ -385,7 +407,8 @@ public:
     return 1;
   }
 
-  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+  unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                            const Instruction *I = nullptr) {
     const TargetLoweringBase *TLI = getTLI();
     int ISD = TLI->InstructionOpcodeToISD(Opcode);
     assert(ISD && "Invalid opcode");
@@ -414,6 +437,18 @@ public:
                                  Dst->getPointerAddressSpace()))
       return 0;
 
+    // If this is a zext/sext of a load, return 0 if the corresponding
+    // extending load exists on target.
+    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
+        I && isa<LoadInst>(I->getOperand(0))) {
+        EVT ExtVT = EVT::getEVT(Dst);
+        EVT LoadVT = EVT::getEVT(Src);
+        unsigned LType =
+          ((Opcode == Instruction::ZExt) ? ISD::ZEXTLOAD : ISD::SEXTLOAD);
+        if (TLI->isLoadExtLegal(LType, ExtVT, LoadVT))
+          return 0;
+    }
+
     // If the cast is marked as legal (or promote) then assume low cost.
     if (SrcLT.first == DstLT.first &&
         TLI->isOperationLegalOrPromote(ISD, DstLT.second))
@@ -471,14 +506,14 @@ public:
                                          Src->getVectorNumElements() / 2);
         T *TTI = static_cast<T *>(this);
         return TTI->getVectorSplitCost() +
-               (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc));
+               (2 * TTI->getCastInstrCost(Opcode, SplitDst, SplitSrc, I));
       }
 
       // In other cases where the source or destination are illegal, assume
       // the operation will get scalarized.
       unsigned Num = Dst->getVectorNumElements();
       unsigned Cost = static_cast<T *>(this)->getCastInstrCost(
-          Opcode, Dst->getScalarType(), Src->getScalarType());
+          Opcode, Dst->getScalarType(), Src->getScalarType(), I);
 
       // Return the cost of multiple scalar invocation plus the cost of
       // inserting and extracting the values.
@@ -512,7 +547,8 @@ public:
     return 0;
   }
 
-  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+  unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                              const Instruction *I) {
     const TargetLoweringBase *TLI = getTLI();
     int ISD = TLI->InstructionOpcodeToISD(Opcode);
     assert(ISD && "Invalid opcode");
@@ -540,7 +576,7 @@ public:
       if (CondTy)
         CondTy = CondTy->getScalarType();
       unsigned Cost = static_cast<T *>(this)->getCmpSelInstrCost(
-          Opcode, ValTy->getScalarType(), CondTy);
+          Opcode, ValTy->getScalarType(), CondTy, I);
 
       // Return the cost of multiple scalar invocation plus the cost of
       // inserting and extracting the values.
@@ -559,7 +595,7 @@ public:
   }
 
   unsigned getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                           unsigned AddressSpace) {
+                       unsigned AddressSpace, const Instruction *I = nullptr) {
     assert(!Src->isVoidTy() && "Invalid type");
     std::pair<unsigned, MVT> LT = getTLI()->getTypeLegalizationCost(DL, Src);
 
@@ -705,18 +741,42 @@ public:
     return Cost;
   }
 
-  /// Get intrinsic cost based on arguments  
+  /// Get intrinsic cost based on arguments.
   unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                 ArrayRef<Value *> Args, FastMathFlags FMF) {
+                                 ArrayRef<Value *> Args, FastMathFlags FMF,
+                                 unsigned VF = 1) {
+    unsigned RetVF = (RetTy->isVectorTy() ? RetTy->getVectorNumElements() : 1);
+    assert ((RetVF == 1 || VF == 1) && "VF > 1 and RetVF is a vector type");
+
     switch (IID) {
     default: {
+      // Assume that we need to scalarize this intrinsic.
       SmallVector<Type *, 4> Types;
-      for (Value *Op : Args)
-        Types.push_back(Op->getType());
-      return static_cast<T *>(this)->getIntrinsicInstrCost(IID, RetTy, Types,
-                                                           FMF);
+      for (Value *Op : Args) {
+        Type *OpTy = Op->getType();
+        assert (VF == 1 || !OpTy->isVectorTy());
+        Types.push_back(VF == 1 ? OpTy : VectorType::get(OpTy, VF));
+      }
+
+      if (VF > 1 && !RetTy->isVoidTy())
+        RetTy = VectorType::get(RetTy, VF);
+
+      // Compute the scalarization overhead based on Args for a vector
+      // intrinsic. A vectorizer will pass a scalar RetTy and VF > 1, while
+      // CostModel will pass a vector RetTy and VF is 1.
+      unsigned ScalarizationCost = UINT_MAX;
+      if (RetVF > 1 || VF > 1) {
+        ScalarizationCost = 0;
+        if (!RetTy->isVoidTy())
+          ScalarizationCost += getScalarizationOverhead(RetTy, true, false);
+        ScalarizationCost += getOperandsScalarizationOverhead(Args, VF);
+      }
+
+      return static_cast<T *>(this)->
+        getIntrinsicInstrCost(IID, RetTy, Types, FMF, ScalarizationCost);
     }
     case Intrinsic::masked_scatter: {
+      assert (VF == 1 && "Can't vectorize types here.");
       Value *Mask = Args[3];
       bool VarMask = !isa<Constant>(Mask);
       unsigned Alignment = cast<ConstantInt>(Args[2])->getZExtValue();
@@ -727,6 +787,7 @@ public:
                                                        Alignment);
     }
     case Intrinsic::masked_gather: {
+      assert (VF == 1 && "Can't vectorize types here.");
       Value *Mask = Args[2];
       bool VarMask = !isa<Constant>(Mask);
       unsigned Alignment = cast<ConstantInt>(Args[1])->getZExtValue();
@@ -738,19 +799,23 @@ public:
     }
   }
   
-  /// Get intrinsic cost based on argument types
+  /// Get intrinsic cost based on argument types.
+  /// If ScalarizationCostPassed is UINT_MAX, the cost of scalarizing the
+  /// arguments and the return value will be computed based on types.
   unsigned getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                 ArrayRef<Type *> Tys, FastMathFlags FMF) {
+                          ArrayRef<Type *> Tys, FastMathFlags FMF,
+                          unsigned ScalarizationCostPassed = UINT_MAX) {
     SmallVector<unsigned, 2> ISDs;
     unsigned SingleCallCost = 10; // Library call cost. Make it expensive.
     switch (IID) {
     default: {
       // Assume that we need to scalarize this intrinsic.
-      unsigned ScalarizationCost = 0;
+      unsigned ScalarizationCost = ScalarizationCostPassed;
       unsigned ScalarCalls = 1;
       Type *ScalarRetTy = RetTy;
       if (RetTy->isVectorTy()) {
-        ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
+        if (ScalarizationCostPassed == UINT_MAX)
+          ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
         ScalarCalls = std::max(ScalarCalls, RetTy->getVectorNumElements());
         ScalarRetTy = RetTy->getScalarType();
       }
@@ -758,7 +823,8 @@ public:
       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
         Type *Ty = Tys[i];
         if (Ty->isVectorTy()) {
-          ScalarizationCost += getScalarizationOverhead(Ty, false, true);
+          if (ScalarizationCostPassed == UINT_MAX)
+            ScalarizationCost += getScalarizationOverhead(Ty, false, true);
           ScalarCalls = std::max(ScalarCalls, Ty->getVectorNumElements());
           Ty = Ty->getScalarType();
         }
@@ -906,7 +972,8 @@ public:
     // this will emit a costly libcall, adding call overhead and spills. Make it
     // very expensive.
     if (RetTy->isVectorTy()) {
-      unsigned ScalarizationCost = getScalarizationOverhead(RetTy, true, false);
+      unsigned ScalarizationCost = ((ScalarizationCostPassed != UINT_MAX) ?
+         ScalarizationCostPassed : getScalarizationOverhead(RetTy, true, false));
       unsigned ScalarCalls = RetTy->getVectorNumElements();
       SmallVector<Type *, 4> ScalarTys;
       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
@@ -919,7 +986,8 @@ public:
           IID, RetTy->getScalarType(), ScalarTys, FMF);
       for (unsigned i = 0, ie = Tys.size(); i != ie; ++i) {
         if (Tys[i]->isVectorTy()) {
-          ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
+          if (ScalarizationCostPassed == UINT_MAX)
+            ScalarizationCost += getScalarizationOverhead(Tys[i], false, true);
           ScalarCalls = std::max(ScalarCalls, Tys[i]->getVectorNumElements());
         }
       }
diff --git a/include/llvm/CodeGen/CallingConvLower.h b/include/llvm/CodeGen/CallingConvLower.h
index bfbd22823eb822fe6d3a47794bf74bbfdf7811c5..50e464ebb9b8044f10eef6b14c30f2e30ab41a0a 100644
--- a/include/llvm/CodeGen/CallingConvLower.h
+++ b/include/llvm/CodeGen/CallingConvLower.h
@@ -183,11 +183,6 @@ typedef bool CCCustomFn(unsigned &ValNo, MVT &ValVT,
                         MVT &LocVT, CCValAssign::LocInfo &LocInfo,
                         ISD::ArgFlagsTy &ArgFlags, CCState &State);
 
-/// ParmContext - This enum tracks whether calling convention lowering is in
-/// the context of prologue or call generation. Not all backends make use of
-/// this information.
-typedef enum { Unknown, Prologue, Call } ParmContext;
-
 /// CCState - This class holds information needed while lowering arguments and
 /// return values.  It captures which registers are already assigned and which
 /// stack slots are used.  It provides accessors to allocate these values.
@@ -256,9 +251,6 @@ private:
   // during argument analysis.
   unsigned InRegsParamsProcessed;
 
-protected:
-  ParmContext CallOrPrologue;
-
 public:
   CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
           SmallVectorImpl<CCValAssign> &locs, LLVMContext &C);
@@ -510,8 +502,6 @@ public:
     InRegsParamsProcessed = 0;
   }
 
-  ParmContext getCallOrPrologue() const { return CallOrPrologue; }
-
   // Get list of pending assignments
   SmallVectorImpl<llvm::CCValAssign> &getPendingLocs() {
     return PendingLocs;
diff --git a/include/llvm/CodeGen/CommandFlags.h b/include/llvm/CodeGen/CommandFlags.h
index 9d5943fa55247639c50c32861414673255e9d82c..317a5d3f54c8a5c9da4ce8408d1dbb0508146594 100644
--- a/include/llvm/CodeGen/CommandFlags.h
+++ b/include/llvm/CodeGen/CommandFlags.h
@@ -118,11 +118,6 @@ FileType("filetype", cl::init(TargetMachine::CGFT_AssemblyFile),
              clEnumValN(TargetMachine::CGFT_Null, "null",
                         "Emit nothing, for performance testing")));
 
-cl::opt<bool>
-EnableFPMAD("enable-fp-mad",
-            cl::desc("Enable less precise MAD instructions to be generated"),
-            cl::init(false));
-
 cl::opt<bool>
 DisableFPElim("disable-fp-elim",
               cl::desc("Disable frame pointer elimination optimization"),
@@ -283,7 +278,6 @@ DebuggerTuningOpt("debugger-tune",
 // a TargetOptions object with CodeGen flags and returns it.
 static inline TargetOptions InitTargetOptionsFromCodeGenFlags() {
   TargetOptions Options;
-  Options.LessPreciseFPMADOption = EnableFPMAD;
   Options.AllowFPOpFusion = FuseFPOps;
   Options.UnsafeFPMath = EnableUnsafeFPMath;
   Options.NoInfsFPMath = EnableNoInfsFPMath;
@@ -352,28 +346,28 @@ static inline void setFunctionAttributes(StringRef CPU, StringRef Features,
                                          Module &M) {
   for (auto &F : M) {
     auto &Ctx = F.getContext();
-    AttributeSet Attrs = F.getAttributes(), NewAttrs;
+    AttributeList Attrs = F.getAttributes(), NewAttrs;
 
     if (!CPU.empty())
-      NewAttrs = NewAttrs.addAttribute(Ctx, AttributeSet::FunctionIndex,
+      NewAttrs = NewAttrs.addAttribute(Ctx, AttributeList::FunctionIndex,
                                        "target-cpu", CPU);
 
     if (!Features.empty())
-      NewAttrs = NewAttrs.addAttribute(Ctx, AttributeSet::FunctionIndex,
+      NewAttrs = NewAttrs.addAttribute(Ctx, AttributeList::FunctionIndex,
                                        "target-features", Features);
 
     if (DisableFPElim.getNumOccurrences() > 0)
-      NewAttrs = NewAttrs.addAttribute(Ctx, AttributeSet::FunctionIndex,
+      NewAttrs = NewAttrs.addAttribute(Ctx, AttributeList::FunctionIndex,
                                        "no-frame-pointer-elim",
                                        DisableFPElim ? "true" : "false");
 
     if (DisableTailCalls.getNumOccurrences() > 0)
-      NewAttrs = NewAttrs.addAttribute(Ctx, AttributeSet::FunctionIndex,
+      NewAttrs = NewAttrs.addAttribute(Ctx, AttributeList::FunctionIndex,
                                        "disable-tail-calls",
                                        toStringRef(DisableTailCalls));
 
     if (StackRealign)
-      NewAttrs = NewAttrs.addAttribute(Ctx, AttributeSet::FunctionIndex,
+      NewAttrs = NewAttrs.addAttribute(Ctx, AttributeList::FunctionIndex,
                                        "stackrealign");
 
     if (TrapFuncName.getNumOccurrences() > 0)
@@ -383,12 +377,12 @@ static inline void setFunctionAttributes(StringRef CPU, StringRef Features,
             if (const auto *F = Call->getCalledFunction())
               if (F->getIntrinsicID() == Intrinsic::debugtrap ||
                   F->getIntrinsicID() == Intrinsic::trap)
-                Call->addAttribute(llvm::AttributeSet::FunctionIndex,
-                                   Attribute::get(Ctx, "trap-func-name",
-                                                  TrapFuncName));
+                Call->addAttribute(
+                    llvm::AttributeList::FunctionIndex,
+                    Attribute::get(Ctx, "trap-func-name", TrapFuncName));
 
     // Let NewAttrs override Attrs.
-    NewAttrs = Attrs.addAttributes(Ctx, AttributeSet::FunctionIndex, NewAttrs);
+    NewAttrs = Attrs.addAttributes(Ctx, AttributeList::FunctionIndex, NewAttrs);
     F.setAttributes(NewAttrs);
   }
 }
diff --git a/include/llvm/CodeGen/ExecutionDepsFix.h b/include/llvm/CodeGen/ExecutionDepsFix.h
new file mode 100644
index 0000000000000000000000000000000000000000..1d5b9684e105503d80e64add6cc889a36a42a455
--- /dev/null
+++ b/include/llvm/CodeGen/ExecutionDepsFix.h
@@ -0,0 +1,220 @@
+//===- llvm/CodeGen/ExecutionDepsFix.h - Execution Dependency Fix -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file Execution Dependency Fix pass.
+///
+/// Some X86 SSE instructions like mov, and, or, xor are available in different
+/// variants for different operand types. These variant instructions are
+/// equivalent, but on Nehalem and newer cpus there is extra latency
+/// transferring data between integer and floating point domains.  ARM cores
+/// have similar issues when they are configured with both VFP and NEON
+/// pipelines.
+///
+/// This pass changes the variant instructions to minimize domain crossings.
+//
+//===----------------------------------------------------------------------===//
+
+
+#ifndef LLVM_CODEGEN_EXECUTIONDEPSFIX_H
+#define LLVM_CODEGEN_EXECUTIONDEPSFIX_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/Support/Allocator.h"
+#include <vector>
+
+namespace llvm {
+
+/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track
+/// of execution domains.
+///
+/// An open DomainValue represents a set of instructions that can still switch
+/// execution domain. Multiple registers may refer to the same open
+/// DomainValue - they will eventually be collapsed to the same execution
+/// domain.
+///
+/// A collapsed DomainValue represents a single register that has been forced
+/// into one or more execution domains. There is a separate collapsed
+/// DomainValue for each register, but it may contain multiple execution
+/// domains. A register value is initially created in a single execution
+/// domain, but if we were forced to pay the penalty of a domain crossing, we
+/// keep track of the fact that the register is now available in multiple
+/// domains.
+struct DomainValue {
+  // Basic reference counting.
+  unsigned Refs;
+
+  // Bitmask of available domains. For an open DomainValue, it is the still
+  // possible domains for collapsing. For a collapsed DomainValue it is the
+  // domains where the register is available for free.
+  unsigned AvailableDomains;
+
+  // Pointer to the next DomainValue in a chain.  When two DomainValues are
+  // merged, Victim.Next is set to point to Victor, so old DomainValue
+  // references can be updated by following the chain.
+  DomainValue *Next;
+
+  // Twiddleable instructions using or defining these registers.
+  SmallVector<MachineInstr*, 8> Instrs;
+
+  // A collapsed DomainValue has no instructions to twiddle - it simply keeps
+  // track of the domains where the registers are already available.
+  bool isCollapsed() const { return Instrs.empty(); }
+
+  // Is domain available?
+  bool hasDomain(unsigned domain) const {
+    assert(domain <
+               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
+           "undefined behavior");
+    return AvailableDomains & (1u << domain);
+  }
+
+  // Mark domain as available.
+  void addDomain(unsigned domain) {
+    AvailableDomains |= 1u << domain;
+  }
+
+  // Restrict to a single domain available.
+  void setSingleDomain(unsigned domain) {
+    AvailableDomains = 1u << domain;
+  }
+
+  // Return bitmask of domains that are available and in mask.
+  unsigned getCommonDomains(unsigned mask) const {
+    return AvailableDomains & mask;
+  }
+
+  // First domain available.
+  unsigned getFirstDomain() const {
+    return countTrailingZeros(AvailableDomains);
+  }
+
+  DomainValue() : Refs(0) { clear(); }
+
+  // Reset domains, Next link and instructions; Refs is left untouched.
+  void clear() {
+    AvailableDomains = 0;
+    Next = nullptr;
+    Instrs.clear();
+  }
+};
+
+/// Information about a live register.
+struct LiveReg {
+  /// Value currently in this register, or NULL when no value is being tracked.
+  /// This counts as a DomainValue reference.
+  DomainValue *Value;
+
+  /// Instruction that defined this register, relative to the beginning of the
+  /// current basic block.  When a LiveReg is used to represent a live-out
+  /// register, this value is relative to the end of the basic block, so it
+  /// will be a negative number.
+  int Def;
+};
+
+class ExecutionDepsFix : public MachineFunctionPass {
+  SpecificBumpPtrAllocator<DomainValue> Allocator;
+  SmallVector<DomainValue*,16> Avail;
+
+  const TargetRegisterClass *const RC;
+  MachineFunction *MF;
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+  RegisterClassInfo RegClassInfo;
+  std::vector<SmallVector<int, 1>> AliasMap;
+  const unsigned NumRegs;
+  LiveReg *LiveRegs;
+  struct MBBInfo {
+    // Keeps clearance and domain information for all registers. Note that this
+    // is different from the usual notion of liveness. The CPU
+    // doesn't care whether or not we consider a register killed.
+    LiveReg *OutRegs;
+
+    // Whether we have gotten to this block in primary processing yet.
+    bool PrimaryCompleted;
+
+    // The number of predecessors for which primary processing has completed
+    unsigned IncomingProcessed;
+
+    // The value of `IncomingProcessed` at the start of primary processing
+    unsigned PrimaryIncoming;
+
+    // The number of predecessors for which all processing steps are done.
+    unsigned IncomingCompleted;
+
+    MBBInfo()
+        : OutRegs(nullptr), PrimaryCompleted(false), IncomingProcessed(0),
+          PrimaryIncoming(0), IncomingCompleted(0) {}
+  };
+  typedef DenseMap<MachineBasicBlock *, MBBInfo> MBBInfoMap;
+  MBBInfoMap MBBInfos;
+
+  /// List of undefined register reads in this block in forward order.
+  std::vector<std::pair<MachineInstr*, unsigned> > UndefReads;
+
+  /// Storage for register unit liveness.
+  LivePhysRegs LiveRegSet;
+
+  /// Current instruction number.
+  /// The first instruction in each basic block is 0.
+  int CurInstr;
+public:
+  ExecutionDepsFix(char &PassID, const TargetRegisterClass &RC)
+    : MachineFunctionPass(PassID), RC(&RC), NumRegs(RC.getNumRegs()) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  MachineFunctionProperties getRequiredProperties() const override {
+    return MachineFunctionProperties().set(
+        MachineFunctionProperties::Property::NoVRegs);
+  }
+
+private:
+  iterator_range<SmallVectorImpl<int>::const_iterator>
+  regIndices(unsigned Reg) const;
+  // DomainValue allocation.
+  DomainValue *alloc(int domain = -1);
+  DomainValue *retain(DomainValue *DV) {
+    if (DV) ++DV->Refs;
+    return DV;
+  }
+  void release(DomainValue*);
+  DomainValue *resolve(DomainValue*&);
+
+  // LiveRegs manipulations.
+  void setLiveReg(int rx, DomainValue *DV);
+  void kill(int rx);
+  void force(int rx, unsigned domain);
+  void collapse(DomainValue *dv, unsigned domain);
+  bool merge(DomainValue *A, DomainValue *B);
+
+  void enterBasicBlock(MachineBasicBlock*);
+  void leaveBasicBlock(MachineBasicBlock*);
+  bool isBlockDone(MachineBasicBlock *);
+  void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass);
+  bool visitInstr(MachineInstr *);
+  void processDefs(MachineInstr *, bool breakDependency, bool Kill);
+  void visitSoftInstr(MachineInstr*, unsigned mask);
+  void visitHardInstr(MachineInstr*, unsigned domain);
+  bool pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
+                                unsigned Pref);
+  bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref);
+  void processUndefReads(MachineBasicBlock*);
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/include/llvm/CodeGen/FastISel.h b/include/llvm/CodeGen/FastISel.h
index cdaea250c33be35bd9ab04798841f8bc6cf3642a..79c96283e7337fe2448322832e41c380c940adca 100644
--- a/include/llvm/CodeGen/FastISel.h
+++ b/include/llvm/CodeGen/FastISel.h
@@ -1,4 +1,4 @@
-//===-- FastISel.h - Definition of the FastISel class ---*- C++ -*---------===//
+//===- FastISel.h - Definition of the FastISel class ------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -16,10 +16,21 @@
 #define LLVM_CODEGEN_FASTISEL_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/Target/TargetLowering.h"
+#include <algorithm>
+#include <cstdint>
+#include <utility>
+#include <vector>
 
 namespace llvm {
 
@@ -30,57 +41,31 @@ class MachineConstantPool;
 /// quickly.
 class FastISel {
 public:
-  struct ArgListEntry {
-    Value *Val;
-    Type *Ty;
-    bool IsSExt : 1;
-    bool IsZExt : 1;
-    bool IsInReg : 1;
-    bool IsSRet : 1;
-    bool IsNest : 1;
-    bool IsByVal : 1;
-    bool IsInAlloca : 1;
-    bool IsReturned : 1;
-    bool IsSwiftSelf : 1;
-    bool IsSwiftError : 1;
-    uint16_t Alignment;
-
-    ArgListEntry()
-        : Val(nullptr), Ty(nullptr), IsSExt(false), IsZExt(false),
-          IsInReg(false), IsSRet(false), IsNest(false), IsByVal(false),
-          IsInAlloca(false), IsReturned(false), IsSwiftSelf(false),
-          IsSwiftError(false), Alignment(0) {}
-
-    /// \brief Set CallLoweringInfo attribute flags based on a call instruction
-    /// and called function attributes.
-    void setAttributes(ImmutableCallSite *CS, unsigned AttrIdx);
-  };
-  typedef std::vector<ArgListEntry> ArgListTy;
-
+  typedef TargetLoweringBase::ArgListEntry ArgListEntry;
+  typedef TargetLoweringBase::ArgListTy ArgListTy;
   struct CallLoweringInfo {
-    Type *RetTy;
+    Type *RetTy = nullptr;
     bool RetSExt : 1;
     bool RetZExt : 1;
     bool IsVarArg : 1;
     bool IsInReg : 1;
     bool DoesNotReturn : 1;
     bool IsReturnValueUsed : 1;
+    bool IsPatchPoint : 1;
 
     // \brief IsTailCall Should be modified by implementations of FastLowerCall
     // that perform tail call conversions.
-    bool IsTailCall;
+    bool IsTailCall = false;
 
-    unsigned NumFixedArgs;
-    CallingConv::ID CallConv;
-    const Value *Callee;
-    MCSymbol *Symbol;
+    unsigned NumFixedArgs = -1;
+    CallingConv::ID CallConv = CallingConv::C;
+    const Value *Callee = nullptr;
+    MCSymbol *Symbol = nullptr;
     ArgListTy Args;
-    ImmutableCallSite *CS;
-    MachineInstr *Call;
-    unsigned ResultReg;
-    unsigned NumResultRegs;
-
-    bool IsPatchPoint;
+    ImmutableCallSite *CS = nullptr;
+    MachineInstr *Call = nullptr;
+    unsigned ResultReg = 0;
+    unsigned NumResultRegs = 0;
 
     SmallVector<Value *, 16> OutVals;
     SmallVector<ISD::ArgFlagsTy, 16> OutFlags;
@@ -89,11 +74,8 @@ public:
     SmallVector<unsigned, 4> InRegs;
 
     CallLoweringInfo()
-        : RetTy(nullptr), RetSExt(false), RetZExt(false), IsVarArg(false),
-          IsInReg(false), DoesNotReturn(false), IsReturnValueUsed(true),
-          IsTailCall(false), NumFixedArgs(-1), CallConv(CallingConv::C),
-          Callee(nullptr), Symbol(nullptr), CS(nullptr), Call(nullptr),
-          ResultReg(0), NumResultRegs(0), IsPatchPoint(false) {}
+        : RetSExt(false), RetZExt(false), IsVarArg(false), IsInReg(false),
+          DoesNotReturn(false), IsReturnValueUsed(true), IsPatchPoint(false) {}
 
     CallLoweringInfo &setCallee(Type *ResultTy, FunctionType *FuncTy,
                                 const Value *Target, ArgListTy &&ArgsList,
@@ -510,7 +492,6 @@ protected:
     }
   }
 
-
   bool lowerCall(const CallInst *I);
   /// \brief Select and emit code for a binary operator instruction, which has
   /// an opcode which directly corresponds to the given ISD opcode.
@@ -567,4 +548,4 @@ private:
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_CODEGEN_FASTISEL_H
diff --git a/include/llvm/CodeGen/FaultMaps.h b/include/llvm/CodeGen/FaultMaps.h
index 9b5a3e1ba0500fa6085a4d86b8f296a1d644646b..0f0005b83c543f8404d9cb1f27670d9d82bfc39d 100644
--- a/include/llvm/CodeGen/FaultMaps.h
+++ b/include/llvm/CodeGen/FaultMaps.h
@@ -1,4 +1,4 @@
-//===------------------- FaultMaps.h - The "FaultMaps" section --*- C++ -*-===//
+//===- FaultMaps.h - The "FaultMaps" section --------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,26 +12,31 @@
 
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/Endian.h"
-#include "llvm/Support/Format.h"
-
-#include <vector>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
 #include <map>
+#include <vector>
 
 namespace llvm {
 
 class AsmPrinter;
 class MCExpr;
-class MCSymbol;
-class MCStreamer;
+class raw_ostream;
 
 class FaultMaps {
 public:
-  enum FaultKind { FaultingLoad = 1, FaultKindMax };
-
-  static const char *faultTypeToString(FaultKind);
+  enum FaultKind {
+    FaultingLoad = 1,
+    FaultingLoadStore,
+    FaultingStore,
+    FaultKindMax
+  };
 
   explicit FaultMaps(AsmPrinter &AP);
 
+  static const char *faultTypeToString(FaultKind);
+
   void recordFaultingOp(FaultKind FaultTy, const MCSymbol *HandlerLabel);
   void serializeToFaultMapSection();
 
@@ -39,13 +44,11 @@ private:
   static const char *WFMP;
 
   struct FaultInfo {
-    FaultKind Kind;
-    const MCExpr *FaultingOffsetExpr;
-    const MCExpr *HandlerOffsetExpr;
+    FaultKind Kind = FaultKindMax;
+    const MCExpr *FaultingOffsetExpr = nullptr;
+    const MCExpr *HandlerOffsetExpr = nullptr;
 
-    FaultInfo()
-        : Kind(FaultKindMax), FaultingOffsetExpr(nullptr),
-          HandlerOffsetExpr(nullptr) {}
+    FaultInfo() = default;
 
     explicit FaultInfo(FaultMaps::FaultKind Kind, const MCExpr *FaultingOffset,
                        const MCExpr *HandlerOffset)
@@ -153,11 +156,11 @@ public:
 
     static const size_t FunctionInfoHeaderSize = FunctionFaultInfosOffset;
 
-    const uint8_t *P;
-    const uint8_t *E;
+    const uint8_t *P = nullptr;
+    const uint8_t *E = nullptr;
 
   public:
-    FunctionInfoAccessor() : P(nullptr), E(nullptr) {}
+    FunctionInfoAccessor() = default;
 
     explicit FunctionInfoAccessor(const uint8_t *P, const uint8_t *E)
         : P(P), E(E) {}
@@ -214,6 +217,6 @@ raw_ostream &operator<<(raw_ostream &OS,
 
 raw_ostream &operator<<(raw_ostream &OS, const FaultMapParser &);
 
-} // namespace llvm
+} // end namespace llvm
 
-#endif
+#endif // LLVM_CODEGEN_FAULTMAPS_H
diff --git a/include/llvm/CodeGen/GCStrategy.h b/include/llvm/CodeGen/GCStrategy.h
index 3088a86a32607bcc7465a6815436cea7b8d84e21..5b1fafea25b57d7ff6b931f931fe9ff259dfce48 100644
--- a/include/llvm/CodeGen/GCStrategy.h
+++ b/include/llvm/CodeGen/GCStrategy.h
@@ -1,4 +1,4 @@
-//===-- llvm/CodeGen/GCStrategy.h - Garbage collection ----------*- C++ -*-===//
+//===- llvm/CodeGen/GCStrategy.h - Garbage collection -----------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -47,19 +47,20 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_IR_GCSTRATEGY_H
-#define LLVM_IR_GCSTRATEGY_H
+#ifndef LLVM_CODEGEN_GCSTRATEGY_H
+#define LLVM_CODEGEN_GCSTRATEGY_H
 
+#include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Registry.h"
 #include <string>
 
 namespace llvm {
+
+class Type;
+
 namespace GC {
+
 /// PointKind - Used to indicate whether the address of the call instruction
 /// or the address after the call instruction is listed in the stackmap.  For
 /// most runtimes, PostCall safepoints are appropriate.
@@ -68,7 +69,8 @@ enum PointKind {
   PreCall, ///< Instr is a call instruction.
   PostCall ///< Instr is the return address of a call.
 };
-}
+
+} // end namespace GC
 
 /// GCStrategy describes a garbage collector algorithm's code generation
 /// requirements, and provides overridable hooks for those needs which cannot
@@ -77,24 +79,25 @@ enum PointKind {
 /// be immutable.
 class GCStrategy {
 private:
-  std::string Name;
   friend class GCModuleInfo;
 
+  std::string Name;
+
 protected:
-  bool UseStatepoints; /// Uses gc.statepoints as opposed to gc.roots,
-                       /// if set, none of the other options can be
-                       /// anything but their default values.
+  bool UseStatepoints = false; ///< Uses gc.statepoints as opposed to gc.roots,
+                               ///< if set, none of the other options can be
+                               ///< anything but their default values.
 
-  unsigned NeededSafePoints; ///< Bitmask of required safe points.
-  bool CustomReadBarriers;   ///< Default is to insert loads.
-  bool CustomWriteBarriers;  ///< Default is to insert stores.
-  bool CustomRoots;          ///< Default is to pass through to backend.
-  bool InitRoots;            ///< If set, roots are nulled during lowering.
-  bool UsesMetadata;         ///< If set, backend must emit metadata tables.
+  unsigned NeededSafePoints = 0;    ///< Bitmask of required safe points.
+  bool CustomReadBarriers = false;  ///< Default is to insert loads.
+  bool CustomWriteBarriers = false; ///< Default is to insert stores.
+  bool CustomRoots = false;      ///< Default is to pass through to backend.
+  bool InitRoots= true;          ///< If set, roots are nulled during lowering.
+  bool UsesMetadata = false;     ///< If set, backend must emit metadata tables.
 
 public:
   GCStrategy();
-  virtual ~GCStrategy() {}
+  virtual ~GCStrategy() = default;
 
   /// Return the name of the GC strategy.  This is the value of the collector
   /// name string specified on functions which use this strategy.
@@ -172,6 +175,7 @@ public:
 /// register your GCMetadataPrinter subclass with the
 /// GCMetadataPrinterRegistery as well.
 typedef Registry<GCStrategy> GCRegistry;
-}
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_GCSTRATEGY_H
diff --git a/include/llvm/CodeGen/GlobalISel/CallLowering.h b/include/llvm/CodeGen/GlobalISel/CallLowering.h
index 8eec32cece275d784af1ec8cea8e8576ffb7da3c..3e9a9d514cb8727e84c77141a1640575bec1bada 100644
--- a/include/llvm/CodeGen/GlobalISel/CallLowering.h
+++ b/include/llvm/CodeGen/GlobalISel/CallLowering.h
@@ -70,6 +70,17 @@ public:
                                       uint64_t Size, MachinePointerInfo &MPO,
                                       CCValAssign &VA) = 0;
 
+    /// Handle custom values, which may be passed into one or more of \p VAs.
+    /// \return The number of \p VAs that have been assigned after the first
+    ///         one, and which should therefore be skipped from further
+    ///         processing.
+    virtual unsigned assignCustomValue(const ArgInfo &Arg,
+                                       ArrayRef<CCValAssign> VAs) {
+      // This is not a pure virtual method because not all targets need to worry
+      // about custom values.
+      llvm_unreachable("Custom values not supported");
+    }
+
     unsigned extendRegister(unsigned ValReg, CCValAssign &VA);
 
     virtual bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
@@ -145,6 +156,8 @@ public:
   /// This hook must be implemented to lower the given call instruction,
   /// including argument and return value marshalling.
   ///
+  /// \p CallConv is the calling convention to be used for the call.
+  ///
   /// \p Callee is the destination of the call. It should be either a register,
   /// globaladdress, or externalsymbol.
   ///
@@ -160,14 +173,16 @@ public:
   /// needs to be passed.
   ///
   /// \return true if the lowering succeeded, false otherwise.
-  virtual bool lowerCall(MachineIRBuilder &MIRBuilder,
+  virtual bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
                          const MachineOperand &Callee, const ArgInfo &OrigRet,
                          ArrayRef<ArgInfo> OrigArgs) const {
     return false;
   }
 
-  /// This hook must be implemented to lower the given call instruction,
-  /// including argument and return value marshalling.
+  /// Lower the given call instruction, including argument and return value
+  /// marshalling.
+  ///
+  /// \p CI is the call/invoke instruction.
   ///
   /// \p ResReg is a register where the call's return value should be stored (or
   /// 0 if there is no return value).
@@ -181,9 +196,9 @@ public:
   /// range of an immediate jump.
   ///
   /// \return true if the lowering succeeded, false otherwise.
-  virtual bool lowerCall(MachineIRBuilder &MIRBuilder, const CallInst &CI,
-                         unsigned ResReg, ArrayRef<unsigned> ArgRegs,
-                         std::function<unsigned()> GetCalleeReg) const;
+  bool lowerCall(MachineIRBuilder &MIRBuilder, ImmutableCallSite CS,
+                 unsigned ResReg, ArrayRef<unsigned> ArgRegs,
+                 std::function<unsigned()> GetCalleeReg) const;
 };
 } // End namespace llvm.
 
diff --git a/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/include/llvm/CodeGen/GlobalISel/IRTranslator.h
index 25e8588c4373da0bbf7fa81454e4311fd52ee121..31ffdc0e2e78c2c117145fa027dfcd67d3a51759 100644
--- a/include/llvm/CodeGen/GlobalISel/IRTranslator.h
+++ b/include/llvm/CodeGen/GlobalISel/IRTranslator.h
@@ -34,6 +34,7 @@ class Instruction;
 class MachineBasicBlock;
 class MachineFunction;
 class MachineInstr;
+class OptimizationRemarkEmitter;
 class MachineRegisterInfo;
 class TargetPassConfig;
 
@@ -55,15 +56,6 @@ private:
   /// Mapping of the values of the current LLVM IR function
   /// to the related virtual registers.
   ValueToVReg ValToVReg;
-  // Constants are special because when we encounter one,
-  // we do not know at first where to insert the definition since
-  // this depends on all its uses.
-  // Thus, we will insert the sequences to materialize them when
-  // we know all their users.
-  // In the meantime, just keep it in a set.
-  // Note: Constants that end up as immediate in the related instructions,
-  // do not appear in that map.
-  SmallSetVector<const Constant *, 8> Constants;
 
   // N.b. it's not completely obvious that this will be sufficient for every
   // LLVM IR construct (with "invoke" being the obvious candidate to mess up our
@@ -142,6 +134,8 @@ private:
   bool translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
                                MachineIRBuilder &MIRBuilder);
 
+  bool translateInlineAsm(const CallInst &CI, MachineIRBuilder &MIRBuilder);
+
   /// Translate call instruction.
   /// \pre \p U is a call instruction.
   bool translateCall(const User &U, MachineIRBuilder &MIRBuilder);
@@ -155,11 +149,6 @@ private:
   bool translateCast(unsigned Opcode, const User &U,
                      MachineIRBuilder &MIRBuilder);
 
-  /// Translate static alloca instruction (i.e. one  of constant size and in the
-  /// first basic block).
-  bool translateStaticAlloca(const AllocaInst &Inst,
-                             MachineIRBuilder &MIRBuilder);
-
   /// Translate a phi instruction.
   bool translatePHI(const User &U, MachineIRBuilder &MIRBuilder);
 
@@ -202,12 +191,16 @@ private:
 
   bool translateGetElementPtr(const User &U, MachineIRBuilder &MIRBuilder);
 
+  bool translateAlloca(const User &U, MachineIRBuilder &MIRBuilder);
+
   /// Translate return (ret) instruction.
   /// The target needs to implement CallLowering::lowerReturn for
   /// this to succeed.
   /// \pre \p U is a return instruction.
   bool translateRet(const User &U, MachineIRBuilder &MIRBuilder);
 
+  bool translateFSub(const User &U, MachineIRBuilder &MIRBuilder);
+
   bool translateAdd(const User &U, MachineIRBuilder &MIRBuilder) {
     return translateBinaryOp(TargetOpcode::G_ADD, U, MIRBuilder);
   }
@@ -239,9 +232,6 @@ private:
   bool translateSRem(const User &U, MachineIRBuilder &MIRBuilder) {
     return translateBinaryOp(TargetOpcode::G_SREM, U, MIRBuilder);
   }
-  bool translateAlloca(const User &U, MachineIRBuilder &MIRBuilder) {
-    return translateStaticAlloca(cast<AllocaInst>(U), MIRBuilder);
-  }
   bool translateIntToPtr(const User &U, MachineIRBuilder &MIRBuilder) {
     return translateCast(TargetOpcode::G_INTTOPTR, U, MIRBuilder);
   }
@@ -293,9 +283,6 @@ private:
   bool translateFAdd(const User &U, MachineIRBuilder &MIRBuilder) {
     return translateBinaryOp(TargetOpcode::G_FADD, U, MIRBuilder);
   }
-  bool translateFSub(const User &U, MachineIRBuilder &MIRBuilder) {
-    return translateBinaryOp(TargetOpcode::G_FSUB, U, MIRBuilder);
-  }
   bool translateFMul(const User &U, MachineIRBuilder &MIRBuilder) {
     return translateBinaryOp(TargetOpcode::G_FMUL, U, MIRBuilder);
   }
@@ -306,6 +293,14 @@ private:
     return translateBinaryOp(TargetOpcode::G_FREM, U, MIRBuilder);
   }
 
+  bool translateVAArg(const User &U, MachineIRBuilder &MIRBuilder);
+
+  bool translateInsertElement(const User &U, MachineIRBuilder &MIRBuilder);
+
+  bool translateExtractElement(const User &U, MachineIRBuilder &MIRBuilder);
+
+  bool translateShuffleVector(const User &U, MachineIRBuilder &MIRBuilder);
+
   // Stubs to keep the compiler happy while we implement the rest of the
   // translation.
   bool translateResume(const User &U, MachineIRBuilder &MIRBuilder) {
@@ -344,18 +339,6 @@ private:
   bool translateUserOp2(const User &U, MachineIRBuilder &MIRBuilder) {
     return false;
   }
-  bool translateVAArg(const User &U, MachineIRBuilder &MIRBuilder) {
-    return false;
-  }
-  bool translateExtractElement(const User &U, MachineIRBuilder &MIRBuilder) {
-    return false;
-  }
-  bool translateInsertElement(const User &U, MachineIRBuilder &MIRBuilder) {
-    return false;
-  }
-  bool translateShuffleVector(const User &U, MachineIRBuilder &MIRBuilder) {
-    return false;
-  }
 
   /// @}
 
@@ -380,6 +363,9 @@ private:
   /// Current target configuration. Controls how the pass handles errors.
   const TargetPassConfig *TPC;
 
+  /// Current optimization remark emitter. Used to report failures.
+  std::unique_ptr<OptimizationRemarkEmitter> ORE;
+
   // * Insert all the code needed to materialize the constants
   // at the proper place. E.g., Entry block or dominator block
   // of each constant depending on how fancy we want to be.
@@ -401,8 +387,8 @@ private:
 
   /// Get the MachineBasicBlock that represents \p BB. Specifically, the block
   /// returned will be the head of the translated block (suitable for branch
-  /// destinations). If such basic block does not exist, it is created.
-  MachineBasicBlock &getOrCreateBB(const BasicBlock &BB);
+  /// destinations).
+  MachineBasicBlock &getMBB(const BasicBlock &BB);
 
   /// Record \p NewPred as a Machine predecessor to `Edge.second`, corresponding
   /// to `Edge.first` at the IR level. This is used when IRTranslation creates
@@ -418,7 +404,7 @@ private:
     auto RemappedEdge = MachinePreds.find(Edge);
     if (RemappedEdge != MachinePreds.end())
       return RemappedEdge->second;
-    return SmallVector<MachineBasicBlock *, 4>(1, &getOrCreateBB(*Edge.first));
+    return SmallVector<MachineBasicBlock *, 4>(1, &getMBB(*Edge.first));
   }
 
 public:
@@ -433,13 +419,13 @@ public:
   //   CallLowering = MF.subtarget.getCallLowering()
   //   F = MF.getParent()
   //   MIRBuilder.reset(MF)
-  //   MIRBuilder.getOrCreateBB(F.getEntryBB())
+  //   getMBB(F.getEntryBB())
   //   CallLowering->translateArguments(MIRBuilder, F, ValToVReg)
   //   for each bb in F
-  //     MIRBuilder.getOrCreateBB(bb)
+  //     getMBB(bb)
   //     for each inst in bb
   //       if (!translate(MIRBuilder, inst, ValToVReg, ConstantToSequence))
-  //         report_fatal_error(“Don’t know how to translate input");
+  //         report_fatal_error("Don't know how to translate input");
   //   finalize()
   bool runOnMachineFunction(MachineFunction &MF) override;
 };
diff --git a/include/llvm/CodeGen/GlobalISel/InstructionSelector.h b/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
index 63b4f7b9507f1493090a259adfc3a0aff7f7c51a..d8096aeb215ada2ad247d1130abe98c301a0c2fe 100644
--- a/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
+++ b/include/llvm/CodeGen/GlobalISel/InstructionSelector.h
@@ -16,8 +16,13 @@
 #ifndef LLVM_CODEGEN_GLOBALISEL_INSTRUCTIONSELECTOR_H
 #define LLVM_CODEGEN_GLOBALISEL_INSTRUCTIONSELECTOR_H
 
+#include "llvm/ADT/Optional.h"
+#include <cstdint>
+
 namespace llvm {
 class MachineInstr;
+class MachineOperand;
+class MachineRegisterInfo;
 class RegisterBankInfo;
 class TargetInstrInfo;
 class TargetRegisterInfo;
@@ -56,6 +61,14 @@ protected:
                                         const TargetInstrInfo &TII,
                                         const TargetRegisterInfo &TRI,
                                         const RegisterBankInfo &RBI) const;
+
+  Optional<int64_t> getConstantVRegVal(unsigned VReg,
+                                       const MachineRegisterInfo &MRI) const;
+
+  bool isOperandImmEqual(const MachineOperand &MO, int64_t Value,
+                         const MachineRegisterInfo &MRI) const;
+
+  bool isObviouslySafeToFold(MachineInstr &MI) const;
 };
 
 } // End namespace llvm.
diff --git a/include/llvm/CodeGen/GlobalISel/Legalizer.h b/include/llvm/CodeGen/GlobalISel/Legalizer.h
index 8284ab6dac65e083241287fae13e74aeb8d9ad20..bed7230cc013bef042dc552722d432e7e02d13b0 100644
--- a/include/llvm/CodeGen/GlobalISel/Legalizer.h
+++ b/include/llvm/CodeGen/GlobalISel/Legalizer.h
@@ -58,6 +58,9 @@ public:
   bool combineExtracts(MachineInstr &MI, MachineRegisterInfo &MRI,
                        const TargetInstrInfo &TII);
 
+  bool combineMerges(MachineInstr &MI, MachineRegisterInfo &MRI,
+                     const TargetInstrInfo &TII);
+
   bool runOnMachineFunction(MachineFunction &MF) override;
 };
 } // End namespace llvm.
diff --git a/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h b/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
index 56c444ca46befa761360dd396cbbfe23230dcd7d..8fecafdc08d0e9fdf1eba436716413feda1f7b50 100644
--- a/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
+++ b/include/llvm/CodeGen/GlobalISel/LegalizerHelper.h
@@ -55,11 +55,7 @@ public:
   ///
   /// Considered as an opaque blob, the legal code will use and define the same
   /// registers as \p MI.
-  LegalizeResult legalizeInstrStep(MachineInstr &MI,
-                                   const LegalizerInfo &LegalizerInfo);
-
-  LegalizeResult legalizeInstr(MachineInstr &MI,
-                               const LegalizerInfo &LegalizerInfo);
+  LegalizeResult legalizeInstrStep(MachineInstr &MI);
 
   /// Legalize an instruction by emiting a runtime library call instead.
   LegalizeResult libcall(MachineInstr &MI);
@@ -87,6 +83,10 @@ public:
   LegalizeResult moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
                                     LLT WideTy);
 
+  /// Expose MIRBuilder so clients can set their own RecordInsertInstruction
+  /// functions
+  MachineIRBuilder MIRBuilder;
+
 private:
 
   /// Helper function to split a wide generic register into bitwise blocks with
@@ -95,8 +95,8 @@ private:
   void extractParts(unsigned Reg, LLT Ty, int NumParts,
                     SmallVectorImpl<unsigned> &Ops);
 
-  MachineIRBuilder MIRBuilder;
   MachineRegisterInfo &MRI;
+  const LegalizerInfo &LI;
 };
 
 } // End namespace llvm.
diff --git a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
index 4a9e535090ce6ad77f0eeb897928a6a1ee86ac5e..30d67eb4992333428f6d3b2ded3aa752ad3bfcc3 100644
--- a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
+++ b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
@@ -25,6 +25,7 @@
 namespace llvm {
 class LLVMContext;
 class MachineInstr;
+class MachineIRBuilder;
 class MachineRegisterInfo;
 class Type;
 class VectorType;
@@ -96,6 +97,7 @@ public:
   };
 
   LegalizerInfo();
+  virtual ~LegalizerInfo() = default;
 
   /// Compute any ancillary tables needed to quickly decide how an operation
   /// should be handled. This must be called after all "set*Action"methods but
@@ -186,6 +188,10 @@ public:
 
   bool isLegal(const MachineInstr &MI, const MachineRegisterInfo &MRI) const;
 
+  virtual bool legalizeCustom(MachineInstr &MI,
+                              MachineRegisterInfo &MRI,
+                              MachineIRBuilder &MIRBuilder) const;
+
 private:
   static const int FirstOp = TargetOpcode::PRE_ISEL_GENERIC_OPCODE_START;
   static const int LastOp = TargetOpcode::PRE_ISEL_GENERIC_OPCODE_END;
diff --git a/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h b/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
index 353aab5e12b7a051b30842211b35c481c92d145c..472f50576d966ca8149255ab8410a19c6cac13f7 100644
--- a/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
+++ b/include/llvm/CodeGen/GlobalISel/MachineIRBuilder.h
@@ -229,6 +229,22 @@ public:
   MachineInstrBuilder buildGEP(unsigned Res, unsigned Op0,
                                unsigned Op1);
 
+  /// Build and insert \p Res<def> = G_PTR_MASK \p Op0, \p NumBits
+  ///
+  /// G_PTR_MASK clears the low bits of a pointer operand without destroying its
+  /// pointer properties. This has the effect of rounding the address *down* to
+  /// a specified alignment in bits.
+  ///
+  /// \pre setBasicBlock or setMI must have been called.
+  /// \pre \p Res and \p Op0 must be generic virtual registers with pointer
+  ///      type.
+  /// \pre \p NumBits must be an integer representing the number of low bits to
+  ///      be cleared in \p Op0.
+  ///
+  /// \return a MachineInstrBuilder for the newly created instruction.
+  MachineInstrBuilder buildPtrMask(unsigned Res, unsigned Op0,
+                                   uint32_t NumBits);
+
   /// Build and insert \p Res<def>, \p CarryOut<def> = G_UADDE \p Op0,
   /// \p Op1, \p CarryIn
   ///
@@ -246,6 +262,19 @@ public:
   MachineInstrBuilder buildUAdde(unsigned Res, unsigned CarryOut, unsigned Op0,
                                  unsigned Op1, unsigned CarryIn);
 
+  /// Build and insert \p Res<def> = G_AND \p Op0, \p Op1
+  ///
+  /// G_AND sets \p Res to the bitwise and of integer parameters \p Op0 and \p
+  /// Op1.
+  ///
+  /// \pre setBasicBlock or setMI must have been called.
+  /// \pre \p Res, \p Op0 and \p Op1 must be generic virtual registers
+  ///      with the same (scalar or vector) type.
+  ///
+  /// \return a MachineInstrBuilder for the newly created instruction.
+  MachineInstrBuilder buildAnd(unsigned Res, unsigned Op0,
+                               unsigned Op1);
+
   /// Build and insert \p Res<def> = G_ANYEXT \p Op0
   ///
   /// G_ANYEXT produces a register of the specified width, with bits 0 to
@@ -299,6 +328,19 @@ public:
   /// \return The newly created instruction.
   MachineInstrBuilder buildSExtOrTrunc(unsigned Res, unsigned Op);
 
+  /// Build and insert \p Res<def> = G_ZEXT \p Op, \p Res = G_TRUNC \p Op, or
+  /// \p Res = COPY \p Op depending on the differing sizes of \p Res and \p Op.
+  ///
+  /// \pre setBasicBlock or setMI must have been called.
+  /// \pre \p Res must be a generic virtual register with scalar or vector type.
+  /// \pre \p Op must be a generic virtual register with scalar or vector type.
+  ///
+  /// \return The newly created instruction.
+  MachineInstrBuilder buildZExtOrTrunc(unsigned Res, unsigned Op);
+
+  /// Build and insert an appropriate cast between two registers of equal size.
+  MachineInstrBuilder buildCast(unsigned Dst, unsigned Src);
+
   /// Build and insert G_BR \p Dest
   ///
   /// G_BR is an unconditional branch to \p Dest.
@@ -398,19 +440,16 @@ public:
   MachineInstrBuilder buildStore(unsigned Val, unsigned Addr,
                                  MachineMemOperand &MMO);
 
-  /// Build and insert `Res0<def>, ... = G_EXTRACT Src, Idx0, ...`.
-  ///
-  /// If \p Res[i] has size N bits, G_EXTRACT sets \p Res[i] to bits `[Idxs[i],
-  /// Idxs[i] + N)` of \p Src.
+  /// Build and insert `Res<def> = G_EXTRACT Src, Index`.
   ///
   /// \pre setBasicBlock or setMI must have been called.
-  /// \pre Indices must be in ascending order of bit position.
-  /// \pre Each member of \p Results and \p Src must be a generic
-  ///      virtual register.
+  /// \pre \p Res and \p Src must be generic virtual registers.
   ///
   /// \return a MachineInstrBuilder for the newly created instruction.
-  MachineInstrBuilder buildExtract(ArrayRef<unsigned> Results,
-                                   ArrayRef<uint64_t> Indices, unsigned Src);
+  MachineInstrBuilder buildExtract(unsigned Res, unsigned Src, uint64_t Index);
+
+  /// Build and insert \p Dst<def> = IMPLICIT_DEF.
+  MachineInstrBuilder buildUndef(unsigned Dst);
 
   /// Build and insert \p Res<def> = G_SEQUENCE \p Op0, \p Idx0...
   ///
@@ -429,6 +468,31 @@ public:
                                     ArrayRef<unsigned> Ops,
                                     ArrayRef<uint64_t> Indices);
 
+  /// Build and insert \p Res<def> = G_MERGE_VALUES \p Op0, ...
+  ///
+  /// G_MERGE_VALUES combines the input elements contiguously into a larger
+  /// register.
+  ///
+  /// \pre setBasicBlock or setMI must have been called.
+  /// \pre The entire register \p Res (and no more) must be covered by the input
+  ///      registers.
+  /// \pre The type of all \p Ops registers must be identical.
+  ///
+  /// \return a MachineInstrBuilder for the newly created instruction.
+  MachineInstrBuilder buildMerge(unsigned Res, ArrayRef<unsigned> Ops);
+
+  /// Build and insert \p Res0<def>, ... = G_UNMERGE_VALUES \p Op
+  ///
+  /// G_UNMERGE_VALUES splits contiguous bits of \p Op into multiple registers.
+  ///
+  /// \pre setBasicBlock or setMI must have been called.
+  /// \pre The entire register \p Op (and no more) must be covered by the output
+  ///      registers.
+  /// \pre The type of all \p Res registers must be identical.
+  ///
+  /// \return a MachineInstrBuilder for the newly created instruction.
+  MachineInstrBuilder buildUnmerge(ArrayRef<unsigned> Res, unsigned Op);
+
   void addUsesWithIndices(MachineInstrBuilder MIB) {}
 
   template <typename... ArgTys>
@@ -447,14 +511,8 @@ public:
     return MIB;
   }
 
-  template <typename... ArgTys>
   MachineInstrBuilder buildInsert(unsigned Res, unsigned Src,
-                                  unsigned Op, unsigned Index, ArgTys... Args) {
-    MachineInstrBuilder MIB =
-        buildInstr(TargetOpcode::G_INSERT).addDef(Res).addUse(Src);
-    addUsesWithIndices(MIB, Op, Index, Args...);
-    return MIB;
-  }
+                                  unsigned Op, unsigned Index);
 
   /// Build and insert either a G_INTRINSIC (if \p HasSideEffects is false) or
   /// G_INTRINSIC_W_SIDE_EFFECTS instruction. Its first operand will be the
@@ -536,6 +594,30 @@ public:
   /// \return a MachineInstrBuilder for the newly created instruction.
   MachineInstrBuilder buildSelect(unsigned Res, unsigned Tst,
                                   unsigned Op0, unsigned Op1);
+
+  /// Build and insert \p Res<def> = G_INSERT_VECTOR_ELT \p Val,
+  /// \p Elt, \p Idx
+  ///
+  /// \pre setBasicBlock or setMI must have been called.
+  /// \pre \p Res and \p Val must be a generic virtual register
+  ///      with the same vector type.
+  /// \pre \p Elt and \p Idx must be a generic virtual register
+  ///      with scalar type.
+  ///
+  /// \return The newly created instruction.
+  MachineInstrBuilder buildInsertVectorElement(unsigned Res, unsigned Val,
+                                               unsigned Elt, unsigned Idx);
+
+  /// Build and insert \p Res<def> = G_EXTRACT_VECTOR_ELT \p Val, \p Idx
+  ///
+  /// \pre setBasicBlock or setMI must have been called.
+  /// \pre \p Res must be a generic virtual register with scalar type.
+  /// \pre \p Val must be a generic virtual register with vector type.
+  /// \pre \p Idx must be a generic virtual register with scalar type.
+  ///
+  /// \return The newly created instruction.
+  MachineInstrBuilder buildExtractVectorElement(unsigned Res, unsigned Val,
+                                                unsigned Idx);
 };
 
 } // End namespace llvm.
diff --git a/include/llvm/CodeGen/GlobalISel/RegBankSelect.h b/include/llvm/CodeGen/GlobalISel/RegBankSelect.h
index b331533cd7fb0288a659667d2f9873cbcaa425c5..daa8dcf2061b7d944fa764134763ed6ed6b3143b 100644
--- a/include/llvm/CodeGen/GlobalISel/RegBankSelect.h
+++ b/include/llvm/CodeGen/GlobalISel/RegBankSelect.h
@@ -67,6 +67,7 @@
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
 
 namespace llvm {
 // Forward declarations.
@@ -484,6 +485,9 @@ private:
   /// This is required for non-fast mode.
   MachineBranchProbabilityInfo *MBPI;
 
+  /// Current optimization remark emitter. Used to report failures.
+  std::unique_ptr<MachineOptimizationRemarkEmitter> MORE;
+
   /// Helper class used for every code morphing.
   MachineIRBuilder MIRBuilder;
 
diff --git a/include/llvm/CodeGen/GlobalISel/Utils.h b/include/llvm/CodeGen/GlobalISel/Utils.h
index f5d5f5cdf0cd81e05ad0c94933f7fe9b2c7aec15..52bf965a3cb3f953e4a27783ce388e59f208b083 100644
--- a/include/llvm/CodeGen/GlobalISel/Utils.h
+++ b/include/llvm/CodeGen/GlobalISel/Utils.h
@@ -15,15 +15,21 @@
 #ifndef LLVM_CODEGEN_GLOBALISEL_UTILS_H
 #define LLVM_CODEGEN_GLOBALISEL_UTILS_H
 
+#include "llvm/ADT/StringRef.h"
+
 namespace llvm {
 
 class MachineFunction;
 class MachineInstr;
+class MachineOptimizationRemarkEmitter;
+class MachineOptimizationRemarkMissed;
 class MachineRegisterInfo;
 class MCInstrDesc;
 class RegisterBankInfo;
 class TargetInstrInfo;
+class TargetPassConfig;
 class TargetRegisterInfo;
+class Twine;
 
 /// Try to constrain Reg so that it is usable by argument OpIdx of the
 /// provided MCInstrDesc \p II. If this fails, create a new virtual
@@ -39,5 +45,20 @@ unsigned constrainOperandRegClass(const MachineFunction &MF,
                                   MachineInstr &InsertPt, const MCInstrDesc &II,
                                   unsigned Reg, unsigned OpIdx);
 
+/// Check whether an instruction \p MI is dead: it only defines dead virtual
+/// registers, and doesn't have other side effects.
+bool isTriviallyDead(const MachineInstr &MI, const MachineRegisterInfo &MRI);
+
+/// Report an ISel error as a missed optimization remark to the LLVMContext's
+/// diagnostic stream.  Set the FailedISel MachineFunction property.
+void reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC,
+                        MachineOptimizationRemarkEmitter &MORE,
+                        MachineOptimizationRemarkMissed &R);
+
+void reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC,
+                        MachineOptimizationRemarkEmitter &MORE,
+                        const char *PassName, StringRef Msg,
+                        const MachineInstr &MI);
+
 } // End namespace llvm.
 #endif
diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h
index f943e48a329f8b6e62291cbd4cd41b877d3aae91..ee3fd0bdda2a997aa8ce33bfaef49d13d387f103 100644
--- a/include/llvm/CodeGen/ISDOpcodes.h
+++ b/include/llvm/CodeGen/ISDOpcodes.h
@@ -339,6 +339,12 @@ namespace ISD {
     /// Bitwise operators - logical and, logical or, logical xor.
     AND, OR, XOR,
 
+    /// ABS - Determine the unsigned absolute value of a signed integer value of
+    /// the same bitwidth.
+    /// Note: A value of INT_MIN will return INT_MIN, no saturation or overflow
+    /// is performed.
+    ABS,
+
     /// Shift and rotation operations.  After legalization, the type of the
     /// shift amount is known to be TLI.getShiftAmountTy().  Before legalization
     /// the shift amount can be any type, but care must be taken to ensure it is
@@ -808,10 +814,11 @@ namespace ISD {
     PRE_INC,
     PRE_DEC,
     POST_INC,
-    POST_DEC,
-    LAST_INDEXED_MODE
+    POST_DEC
   };
 
+  static const int LAST_INDEXED_MODE = POST_DEC + 1;
+
   //===--------------------------------------------------------------------===//
   /// LoadExtType enum - This enum defines the three variants of LOADEXT
   /// (load with extension).
@@ -826,10 +833,11 @@ namespace ISD {
     NON_EXTLOAD = 0,
     EXTLOAD,
     SEXTLOAD,
-    ZEXTLOAD,
-    LAST_LOADEXT_TYPE
+    ZEXTLOAD
   };
 
+  static const int LAST_LOADEXT_TYPE = ZEXTLOAD + 1;
+
   NodeType getExtForLoadExtType(bool IsFP, LoadExtType);
 
   //===--------------------------------------------------------------------===//
diff --git a/include/llvm/CodeGen/LazyMachineBlockFrequencyInfo.h b/include/llvm/CodeGen/LazyMachineBlockFrequencyInfo.h
new file mode 100644
index 0000000000000000000000000000000000000000..848ee1dc0dc6013ca49999e40afc1339d0e0a042
--- /dev/null
+++ b/include/llvm/CodeGen/LazyMachineBlockFrequencyInfo.h
@@ -0,0 +1,76 @@
+///===- LazyMachineBlockFrequencyInfo.h - Lazy Block Frequency -*- C++ -*--===//
+///
+///                     The LLVM Compiler Infrastructure
+///
+/// This file is distributed under the University of Illinois Open Source
+/// License. See LICENSE.TXT for details.
+///
+///===---------------------------------------------------------------------===//
+/// \file
+/// This is an alternative analysis pass to MachineBlockFrequencyInfo.  The
+/// difference is that with this pass the block frequencies are not computed
+/// when the analysis pass is executed but rather when the BFI result is
+/// explicitly requested by the analysis client.
+///
+///===---------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_LAZYMACHINEBLOCKFREQUENCYINFO_H
+#define LLVM_ANALYSIS_LAZYMACHINEBLOCKFREQUENCYINFO_H
+
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+
+namespace llvm {
+/// \brief This is an alternative analysis pass to MachineBlockFrequencyInfo.
+/// The difference is that with this pass, the block frequencies are not
+/// computed when the analysis pass is executed but rather when the BFI result
+/// is explicitly requested by the analysis client.
+///
+/// This works by querying whether MBFI is available and otherwise
+/// generating MBFI on the fly.  In this case the passes required for (LI, DT)
+/// are also queried before being computed on the fly.
+///
+/// Note that it is expected that we wouldn't need this functionality for the
+/// new PM since with the new PM, analyses are executed on demand.
+
+class LazyMachineBlockFrequencyInfoPass : public MachineFunctionPass {
+private:
+  /// If generated on the fly, this owns the instance.
+  mutable std::unique_ptr<MachineBlockFrequencyInfo> OwnedMBFI;
+
+  /// If generated on the fly, this owns the instance.
+  mutable std::unique_ptr<MachineLoopInfo> OwnedMLI;
+
+  /// If generated on the fly, this owns the instance.
+  mutable std::unique_ptr<MachineDominatorTree> OwnedMDT;
+
+  /// The function.
+  MachineFunction *MF = nullptr;
+
+  /// \brief Calculate MBFI and all other analyses that are not available and
+  /// required by BFI.
+  MachineBlockFrequencyInfo &calculateIfNotAvailable() const;
+
+public:
+  static char ID;
+
+  LazyMachineBlockFrequencyInfoPass();
+
+  /// \brief Compute and return the block frequencies.
+  MachineBlockFrequencyInfo &getBFI() { return calculateIfNotAvailable(); }
+
+  /// \brief Compute and return the block frequencies.
+  const MachineBlockFrequencyInfo &getBFI() const {
+    return calculateIfNotAvailable();
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+  void releaseMemory() override;
+  void print(raw_ostream &OS, const Module *M) const override;
+};
+}
+#endif
diff --git a/include/llvm/CodeGen/LexicalScopes.h b/include/llvm/CodeGen/LexicalScopes.h
index 7d7e48af2a0ff3bf5e17f357c465abbd9b936f6a..6c35832f963c7868153d71a70701225f921465d5 100644
--- a/include/llvm/CodeGen/LexicalScopes.h
+++ b/include/llvm/CodeGen/LexicalScopes.h
@@ -1,4 +1,4 @@
-//===- LexicalScopes.cpp - Collecting lexical scope info -*- C++ -*--------===//
+//===- LexicalScopes.cpp - Collecting lexical scope info --------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -19,19 +19,18 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/DebugInfoMetadata.h"
-#include "llvm/IR/ValueHandle.h"
+#include <cassert>
 #include <unordered_map>
 #include <utility>
+
 namespace llvm {
 
-class MachineInstr;
 class MachineBasicBlock;
 class MachineFunction;
+class MachineInstr;
 
 //===----------------------------------------------------------------------===//
 /// InsnRange - This is used to track range of instructions with identical
@@ -43,13 +42,15 @@ typedef std::pair<const MachineInstr *, const MachineInstr *> InsnRange;
 /// LexicalScope - This class is used to track scope information.
 ///
 class LexicalScope {
-
 public:
   LexicalScope(LexicalScope *P, const DILocalScope *D, const DILocation *I,
                bool A)
-      : Parent(P), Desc(D), InlinedAtLocation(I), AbstractScope(A),
-        LastInsn(nullptr), FirstInsn(nullptr), DFSIn(0), DFSOut(0) {
-    assert((!D || D->isResolved()) && "Expected resolved node");
+      : Parent(P), Desc(D), InlinedAtLocation(I), AbstractScope(A) {
+    assert(D);
+    assert(D->getSubprogram()->getUnit()->getEmissionKind() !=
+           DICompileUnit::NoDebug &&
+           "Don't build lexical scopes for non-debug locations");
+    assert(D->isResolved() && "Expected resolved node");
     assert((!I || I->isResolved()) && "Expected resolved node");
     if (Parent)
       Parent->addChild(this);
@@ -127,10 +128,10 @@ private:
                                                // Contents not owned.
   SmallVector<InsnRange, 4> Ranges;
 
-  const MachineInstr *LastInsn;  // Last instruction of this scope.
-  const MachineInstr *FirstInsn; // First instruction of this scope.
-  unsigned DFSIn, DFSOut;        // In & Out Depth use to determine
-                                 // scope nesting.
+  const MachineInstr *LastInsn = nullptr;  // Last instruction of this scope.
+  const MachineInstr *FirstInsn = nullptr; // First instruction of this scope.
+  unsigned DFSIn = 0; // In & Out Depth use to determine scope nesting.
+  unsigned DFSOut = 0;
 };
 
 //===----------------------------------------------------------------------===//
@@ -139,7 +140,7 @@ private:
 ///
 class LexicalScopes {
 public:
-  LexicalScopes() : MF(nullptr), CurrentFnLexicalScope(nullptr) {}
+  LexicalScopes() = default;
 
   /// initialize - Scan machine function and constuct lexical scope nest, resets
   /// the instance if necessary.
@@ -225,8 +226,7 @@ private:
   assignInstructionRanges(SmallVectorImpl<InsnRange> &MIRanges,
                           DenseMap<const MachineInstr *, LexicalScope *> &M);
 
-private:
-  const MachineFunction *MF;
+  const MachineFunction *MF = nullptr;
 
   /// LexicalScopeMap - Tracks the scopes in the current function.
   // Use an unordered_map to ensure value pointer validity over insertion.
@@ -249,9 +249,9 @@ private:
 
   /// CurrentFnLexicalScope - Top level scope for the current function.
   ///
-  LexicalScope *CurrentFnLexicalScope;
+  LexicalScope *CurrentFnLexicalScope = nullptr;
 };
 
-} // end llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_CODEGEN_LEXICALSCOPES_H
diff --git a/include/llvm/CodeGen/LiveInterval.h b/include/llvm/CodeGen/LiveInterval.h
index a86706223261d99e996357bdd3f513e1970091a5..b792cba4b78a5e2dfddda60bbfe8154feb98e73f 100644
--- a/include/llvm/CodeGen/LiveInterval.h
+++ b/include/llvm/CodeGen/LiveInterval.h
@@ -227,15 +227,22 @@ namespace llvm {
     LiveRange(const LiveRange &Other, BumpPtrAllocator &Allocator) {
       assert(Other.segmentSet == nullptr &&
              "Copying of LiveRanges with active SegmentSets is not supported");
+      assign(Other, Allocator);
+    }
+
+    /// Copies value numbers and live segments from \p Other into this range.
+    void assign(const LiveRange &Other, BumpPtrAllocator &Allocator) {
+      if (this == &Other)
+        return;
 
+      assert(Other.segmentSet == nullptr &&
+             "Copying of LiveRanges with active SegmentSets is not supported");
       // Duplicate valnos.
-      for (const VNInfo *VNI : Other.valnos) {
+      for (const VNInfo *VNI : Other.valnos)
         createValueCopy(VNI, Allocator);
-      }
       // Now we can copy segments and remap their valnos.
-      for (const Segment &S : Other.segments) {
+      for (const Segment &S : Other.segments)
         segments.push_back(Segment(S.start, S.end, valnos[S.valno->id]));
-      }
     }
 
     /// advanceTo - Advance the specified iterator to point to the Segment
@@ -767,6 +774,19 @@ namespace llvm {
                                const MachineRegisterInfo &MRI,
                                const SlotIndexes &Indexes) const;
 
+    /// Refines the subranges to support \p LaneMask. This may only be called
+    /// for LI.hasSubrange()==true. Subregister ranges are split or created
+    /// until \p LaneMask can be matched exactly. \p Mod is executed on the
+    /// matching subranges.
+    ///
+    /// Example:
+    ///    Given an interval with subranges with lanemasks L0F00, L00F0 and
+    ///    L000F, refining for mask L0018. Will split the L00F0 lane into
+    ///    L00E0 and L0010 and the L000F lane into L0007 and L0008. The Mod
+    ///    function will be applied to the L0010 and L0008 subranges.
+    void refineSubRanges(BumpPtrAllocator &Allocator, LaneBitmask LaneMask,
+                         std::function<void(LiveInterval::SubRange&)> Mod);
+
     bool operator<(const LiveInterval& other) const {
       const SlotIndex &thisIndex = beginIndex();
       const SlotIndex &otherIndex = other.beginIndex();
diff --git a/include/llvm/CodeGen/LiveIntervalUnion.h b/include/llvm/CodeGen/LiveIntervalUnion.h
index f0f1637dc92d0622bb1398749d01564ed082badc..57e3deb038af3b029111c6e1a1beb243aa32a650 100644
--- a/include/llvm/CodeGen/LiveIntervalUnion.h
+++ b/include/llvm/CodeGen/LiveIntervalUnion.h
@@ -1,4 +1,4 @@
-//===-- LiveIntervalUnion.h - Live interval union data struct --*- C++ -*--===//
+//===- LiveIntervalUnion.h - Live interval union data struct ---*- C++ -*--===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -18,7 +18,11 @@
 #define LLVM_CODEGEN_LIVEINTERVALUNION_H
 
 #include "llvm/ADT/IntervalMap.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include <cassert>
+#include <limits>
 
 namespace llvm {
 
@@ -30,13 +34,6 @@ template <unsigned Element> class SparseBitVector;
 typedef SparseBitVector<128> LiveVirtRegBitSet;
 #endif
 
-/// Compare a live virtual register segment to a LiveIntervalUnion segment.
-inline bool
-overlap(const LiveInterval::Segment &VRSeg,
-        const IntervalMap<SlotIndex, LiveInterval*>::const_iterator &LUSeg) {
-  return VRSeg.start < LUSeg.stop() && LUSeg.start() < VRSeg.end;
-}
-
 /// Union of live intervals that are strong candidates for coalescing into a
 /// single register (either physical or virtual depending on the context).  We
 /// expect the constituent live intervals to be disjoint, although we may
@@ -53,29 +50,34 @@ public:
   // to reach the current segment's containing virtual register.
   typedef LiveSegments::iterator SegmentIter;
 
+  /// Const version of SegmentIter.
+  typedef LiveSegments::const_iterator ConstSegmentIter;
+
   // LiveIntervalUnions share an external allocator.
   typedef LiveSegments::Allocator Allocator;
 
-  class Query;
-
 private:
-  unsigned Tag;           // unique tag for current contents.
+  unsigned Tag = 0;       // unique tag for current contents.
   LiveSegments Segments;  // union of virtual reg segments
 
 public:
-  explicit LiveIntervalUnion(Allocator &a) : Tag(0), Segments(a) {}
+  explicit LiveIntervalUnion(Allocator &a) : Segments(a) {}
 
   // Iterate over all segments in the union of live virtual registers ordered
   // by their starting position.
   SegmentIter begin() { return Segments.begin(); }
   SegmentIter end() { return Segments.end(); }
   SegmentIter find(SlotIndex x) { return Segments.find(x); }
+  ConstSegmentIter begin() const { return Segments.begin(); }
+  ConstSegmentIter end() const { return Segments.end(); }
+  ConstSegmentIter find(SlotIndex x) const { return Segments.find(x); }
+
   bool empty() const { return Segments.empty(); }
   SlotIndex startIndex() const { return Segments.start(); }
 
   // Provide public access to the underlying map to allow overlap iteration.
   typedef LiveSegments Map;
-  const Map &getMap() { return Segments; }
+  const Map &getMap() const { return Segments; }
 
   /// getTag - Return an opaque tag representing the current state of the union.
   unsigned getTag() const { return Tag; }
@@ -85,15 +87,9 @@ public:
 
   // Add a live virtual register to this union and merge its segments.
   void unify(LiveInterval &VirtReg, const LiveRange &Range);
-  void unify(LiveInterval &VirtReg) {
-    unify(VirtReg, VirtReg);
-  }
 
   // Remove a live virtual register's segments from this union.
   void extract(LiveInterval &VirtReg, const LiveRange &Range);
-  void extract(LiveInterval &VirtReg) {
-    extract(VirtReg, VirtReg);
-  }
 
   // Remove all inserted virtual registers.
   void clear() { Segments.clear(); ++Tag; }
@@ -109,52 +105,42 @@ public:
   /// Query interferences between a single live virtual register and a live
   /// interval union.
   class Query {
-    LiveIntervalUnion *LiveUnion;
-    LiveInterval *VirtReg;
-    LiveInterval::iterator VirtRegI; // current position in VirtReg
-    SegmentIter LiveUnionI;          // current position in LiveUnion
+    const LiveIntervalUnion *LiveUnion = nullptr;
+    const LiveRange *LR = nullptr;
+    LiveRange::const_iterator LRI;  ///< current position in LR
+    ConstSegmentIter LiveUnionI;    ///< current position in LiveUnion
     SmallVector<LiveInterval*,4> InterferingVRegs;
-    bool CheckedFirstInterference;
-    bool SeenAllInterferences;
-    bool SeenUnspillableVReg;
-    unsigned Tag, UserTag;
-
-  public:
-    Query(): LiveUnion(), VirtReg(), Tag(0), UserTag(0) {}
-
-    Query(LiveInterval *VReg, LiveIntervalUnion *LIU):
-      LiveUnion(LIU), VirtReg(VReg), CheckedFirstInterference(false),
-      SeenAllInterferences(false), SeenUnspillableVReg(false)
-    {}
-
-    void clear() {
-      LiveUnion = nullptr;
-      VirtReg = nullptr;
+    bool CheckedFirstInterference = false;
+    bool SeenAllInterferences = false;
+    unsigned Tag = 0;
+    unsigned UserTag = 0;
+
+    void reset(unsigned NewUserTag, const LiveRange &NewLR,
+               const LiveIntervalUnion &NewLiveUnion) {
+      LiveUnion = &NewLiveUnion;
+      LR = &NewLR;
       InterferingVRegs.clear();
       CheckedFirstInterference = false;
       SeenAllInterferences = false;
-      SeenUnspillableVReg = false;
-      Tag = 0;
-      UserTag = 0;
+      Tag = NewLiveUnion.getTag();
+      UserTag = NewUserTag;
     }
 
-    void init(unsigned UTag, LiveInterval *VReg, LiveIntervalUnion *LIU) {
-      assert(VReg && LIU && "Invalid arguments");
-      if (UserTag == UTag && VirtReg == VReg &&
-          LiveUnion == LIU && !LIU->changedSince(Tag)) {
+  public:
+    Query() = default;
+    Query(const LiveRange &LR, const LiveIntervalUnion &LIU):
+      LiveUnion(&LIU), LR(&LR) {}
+    Query(const Query &) = delete;
+    Query &operator=(const Query &) = delete;
+
+    void init(unsigned NewUserTag, const LiveRange &NewLR,
+              const LiveIntervalUnion &NewLiveUnion) {
+      if (UserTag == NewUserTag && LR == &NewLR && LiveUnion == &NewLiveUnion &&
+          !NewLiveUnion.changedSince(Tag)) {
         // Retain cached results, e.g. firstInterference.
         return;
       }
-      clear();
-      LiveUnion = LIU;
-      VirtReg = VReg;
-      Tag = LIU->getTag();
-      UserTag = UTag;
-    }
-
-    LiveInterval &virtReg() const {
-      assert(VirtReg && "uninitialized");
-      return *VirtReg;
+      reset(NewUserTag, NewLR, NewLiveUnion);
     }
 
     // Does this live virtual register interfere with the union?
@@ -162,7 +148,8 @@ public:
 
     // Count the virtual registers in this union that interfere with this
     // query's live virtual register, up to maxInterferingRegs.
-    unsigned collectInterferingVRegs(unsigned MaxInterferingRegs = UINT_MAX);
+    unsigned collectInterferingVRegs(
+        unsigned MaxInterferingRegs = std::numeric_limits<unsigned>::max());
 
     // Was this virtual register visited during collectInterferingVRegs?
     bool isSeenInterference(LiveInterval *VReg) const;
@@ -170,25 +157,19 @@ public:
     // Did collectInterferingVRegs collect all interferences?
     bool seenAllInterferences() const { return SeenAllInterferences; }
 
-    // Did collectInterferingVRegs encounter an unspillable vreg?
-    bool seenUnspillableVReg() const { return SeenUnspillableVReg; }
-
     // Vector generated by collectInterferingVRegs.
     const SmallVectorImpl<LiveInterval*> &interferingVRegs() const {
       return InterferingVRegs;
     }
-
-  private:
-    Query(const Query&) = delete;
-    void operator=(const Query&) = delete;
   };
 
   // Array of LiveIntervalUnions.
   class Array {
-    unsigned Size;
-    LiveIntervalUnion *LIUs;
+    unsigned Size = 0;
+    LiveIntervalUnion *LIUs = nullptr;
+
   public:
-    Array() : Size(0), LIUs(nullptr) {}
+    Array() = default;
     ~Array() { clear(); }
 
     // Initialize the array to have Size entries.
@@ -213,4 +194,4 @@ public:
 
 } // end namespace llvm
 
-#endif // !defined(LLVM_CODEGEN_LIVEINTERVALUNION_H)
+#endif // LLVM_CODEGEN_LIVEINTERVALUNION_H
diff --git a/include/llvm/CodeGen/LiveRegMatrix.h b/include/llvm/CodeGen/LiveRegMatrix.h
index e169058ca5634eeeb22ff395e42383b0822413ef..fa6827f6b1f9b1e1671be40d201b745f821ffa5e 100644
--- a/include/llvm/CodeGen/LiveRegMatrix.h
+++ b/include/llvm/CodeGen/LiveRegMatrix.h
@@ -1,4 +1,4 @@
-//===-- LiveRegMatrix.h - Track register interference ---------*- C++ -*---===//
+//===- LiveRegMatrix.h - Track register interference ----------*- C++ -*---===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -27,11 +27,14 @@
 #include "llvm/ADT/BitVector.h"
 #include "llvm/CodeGen/LiveIntervalUnion.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include <memory>
 
 namespace llvm {
 
+class AnalysisUsage;
 class LiveInterval;
-class LiveIntervalAnalysis;
+class LiveIntervals;
+class MachineFunction;
 class TargetRegisterInfo;
 class VirtRegMap;
 
@@ -41,7 +44,7 @@ class LiveRegMatrix : public MachineFunctionPass {
   VirtRegMap *VRM;
 
   // UserTag changes whenever virtual registers have been modified.
-  unsigned UserTag;
+  unsigned UserTag = 0;
 
   // The matrix is represented as a LiveIntervalUnion per register unit.
   LiveIntervalUnion::Allocator LIUAlloc;
@@ -51,16 +54,18 @@ class LiveRegMatrix : public MachineFunctionPass {
   std::unique_ptr<LiveIntervalUnion::Query[]> Queries;
 
   // Cached register mask interference info.
-  unsigned RegMaskTag;
-  unsigned RegMaskVirtReg;
+  unsigned RegMaskTag = 0;
+  unsigned RegMaskVirtReg = 0;
   BitVector RegMaskUsable;
 
   // MachineFunctionPass boilerplate.
-  void getAnalysisUsage(AnalysisUsage&) const override;
-  bool runOnMachineFunction(MachineFunction&) override;
+  void getAnalysisUsage(AnalysisUsage &) const override;
+  bool runOnMachineFunction(MachineFunction &) override;
   void releaseMemory() override;
+
 public:
   static char ID;
+
   LiveRegMatrix();
 
   //===--------------------------------------------------------------------===//
@@ -136,7 +141,7 @@ public:
   /// Use MCRegUnitIterator to enumerate all regunits in the desired PhysReg.
   /// This returns a reference to an internal Query data structure that is only
   /// valid until the next query() call.
-  LiveIntervalUnion::Query &query(LiveInterval &VirtReg, unsigned RegUnit);
+  LiveIntervalUnion::Query &query(const LiveRange &LR, unsigned RegUnit);
 
   /// Directly access the live interval unions per regunit.
   /// This returns an array indexed by the regunit number.
diff --git a/include/llvm/CodeGen/LiveRegUnits.h b/include/llvm/CodeGen/LiveRegUnits.h
index 33f4a4a0337d9f5fd5baa4e8f472685e792fad76..5de76c8b87bf75bf2c29a6bfb437af4d48601824 100644
--- a/include/llvm/CodeGen/LiveRegUnits.h
+++ b/include/llvm/CodeGen/LiveRegUnits.h
@@ -17,6 +17,9 @@
 
 #include "llvm/ADT/BitVector.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include <cstdint>
 
 namespace llvm {
 
@@ -25,12 +28,12 @@ class MachineBasicBlock;
 
 /// A set of register units used to track register liveness.
 class LiveRegUnits {
-  const TargetRegisterInfo *TRI;
+  const TargetRegisterInfo *TRI = nullptr;
   BitVector Units;
 
 public:
   /// Constructs a new empty LiveRegUnits set.
-  LiveRegUnits() : TRI(nullptr) {}
+  LiveRegUnits() = default;
 
   /// Constructs and initialize an empty LiveRegUnits set.
   LiveRegUnits(const TargetRegisterInfo &TRI) {
@@ -120,6 +123,6 @@ public:
   }
 };
 
-} // namespace llvm
+} // end namespace llvm
 
-#endif
+#endif // LLVM_CODEGEN_LIVEREGUNITS_H
diff --git a/include/llvm/CodeGen/LowLevelType.h b/include/llvm/CodeGen/LowLevelType.h
index b8885c3a95fd70d355fdf42a9592ee72b74b8d9e..a3c5c9329f53a4df7bb98d025fb977c2089b32aa 100644
--- a/include/llvm/CodeGen/LowLevelType.h
+++ b/include/llvm/CodeGen/LowLevelType.h
@@ -1,4 +1,4 @@
-//== llvm/CodeGen/GlobalISel/LowLevelType.h -------------------- -*- C++ -*-==//
+//== llvm/CodeGen/LowLevelType.h ------------------------------- -*- C++ -*-==//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,197 +10,23 @@
 /// Implement a low-level type suitable for MachineInstr level instruction
 /// selection.
 ///
-/// For a type attached to a MachineInstr, we only care about 2 details: total
-/// size and the number of vector lanes (if any). Accordingly, there are 4
-/// possible valid type-kinds:
-///
-///    * `sN` for scalars and aggregates
-///    * `<N x sM>` for vectors, which must have at least 2 elements.
-///    * `pN` for pointers
-///
-/// Other information required for correct selection is expected to be carried
-/// by the opcode, or non-type flags. For example the distinction between G_ADD
-/// and G_FADD for int/float or fast-math flags.
+/// This provides the CodeGen aspects of LowLevelType, such as Type conversion.
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_CODEGEN_GLOBALISEL_LOWLEVELTYPE_H
-#define LLVM_CODEGEN_GLOBALISEL_LOWLEVELTYPE_H
+#ifndef LLVM_CODEGEN_LOWLEVELTYPE_H
+#define LLVM_CODEGEN_LOWLEVELTYPE_H
 
-#include <cassert>
-#include "llvm/ADT/DenseMapInfo.h"
-#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
 
 namespace llvm {
 
 class DataLayout;
-class LLVMContext;
 class Type;
-class raw_ostream;
-
-class LLT {
-public:
-  enum TypeKind : uint16_t {
-    Invalid,
-    Scalar,
-    Pointer,
-    Vector,
-  };
-
-  /// Get a low-level scalar or aggregate "bag of bits".
-  static LLT scalar(unsigned SizeInBits) {
-    assert(SizeInBits > 0 && "invalid scalar size");
-    return LLT{Scalar, 1, SizeInBits};
-  }
-
-  /// Get a low-level pointer in the given address space (defaulting to 0).
-  static LLT pointer(uint16_t AddressSpace, unsigned SizeInBits) {
-    return LLT{Pointer, AddressSpace, SizeInBits};
-  }
-
-  /// Get a low-level vector of some number of elements and element width.
-  /// \p NumElements must be at least 2.
-  static LLT vector(uint16_t NumElements, unsigned ScalarSizeInBits) {
-    assert(NumElements > 1 && "invalid number of vector elements");
-    return LLT{Vector, NumElements, ScalarSizeInBits};
-  }
-
-  /// Get a low-level vector of some number of elements and element type.
-  static LLT vector(uint16_t NumElements, LLT ScalarTy) {
-    assert(NumElements > 1 && "invalid number of vector elements");
-    assert(ScalarTy.isScalar() && "invalid vector element type");
-    return LLT{Vector, NumElements, ScalarTy.getSizeInBits()};
-  }
-
-  explicit LLT(TypeKind Kind, uint16_t NumElements, unsigned SizeInBits)
-    : SizeInBits(SizeInBits), ElementsOrAddrSpace(NumElements), Kind(Kind) {
-    assert((Kind != Vector || ElementsOrAddrSpace > 1) &&
-           "invalid number of vector elements");
-  }
-
-  explicit LLT() : SizeInBits(0), ElementsOrAddrSpace(0), Kind(Invalid) {}
-
-  /// Construct a low-level type based on an LLVM type.
-  explicit LLT(Type &Ty, const DataLayout &DL);
-
-  explicit LLT(MVT VT);
-
-  bool isValid() const { return Kind != Invalid; }
-
-  bool isScalar() const { return Kind == Scalar; }
-
-  bool isPointer() const { return Kind == Pointer; }
-
-  bool isVector() const { return Kind == Vector; }
-
-  /// Returns the number of elements in a vector LLT. Must only be called on
-  /// vector types.
-  uint16_t getNumElements() const {
-    assert(isVector() && "cannot get number of elements on scalar/aggregate");
-    return ElementsOrAddrSpace;
-  }
-
-  /// Returns the total size of the type. Must only be called on sized types.
-  unsigned getSizeInBits() const {
-    if (isPointer() || isScalar())
-      return SizeInBits;
-    return SizeInBits * ElementsOrAddrSpace;
-  }
-
-  unsigned getScalarSizeInBits() const {
-    return SizeInBits;
-  }
-
-  unsigned getAddressSpace() const {
-    assert(isPointer() && "cannot get address space of non-pointer type");
-    return ElementsOrAddrSpace;
-  }
-
-  /// Returns the vector's element type. Only valid for vector types.
-  LLT getElementType() const {
-    assert(isVector() && "cannot get element type of scalar/aggregate");
-    return scalar(SizeInBits);
-  }
-
-  /// Get a low-level type with half the size of the original, by halving the
-  /// size of the scalar type involved. For example `s32` will become `s16`,
-  /// `<2 x s32>` will become `<2 x s16>`.
-  LLT halfScalarSize() const {
-    assert(!isPointer() && getScalarSizeInBits() > 1 &&
-           getScalarSizeInBits() % 2 == 0 && "cannot half size of this type");
-    return LLT{Kind, ElementsOrAddrSpace, SizeInBits / 2};
-  }
-
-  /// Get a low-level type with twice the size of the original, by doubling the
-  /// size of the scalar type involved. For example `s32` will become `s64`,
-  /// `<2 x s32>` will become `<2 x s64>`.
-  LLT doubleScalarSize() const {
-    assert(!isPointer() && "cannot change size of this type");
-    return LLT{Kind, ElementsOrAddrSpace, SizeInBits * 2};
-  }
-
-  /// Get a low-level type with half the size of the original, by halving the
-  /// number of vector elements of the scalar type involved. The source must be
-  /// a vector type with an even number of elements. For example `<4 x s32>`
-  /// will become `<2 x s32>`, `<2 x s32>` will become `s32`.
-  LLT halfElements() const {
-    assert(isVector() && ElementsOrAddrSpace % 2 == 0 &&
-           "cannot half odd vector");
-    if (ElementsOrAddrSpace == 2)
-      return scalar(SizeInBits);
-
-    return LLT{Vector, static_cast<uint16_t>(ElementsOrAddrSpace / 2),
-               SizeInBits};
-  }
-
-  /// Get a low-level type with twice the size of the original, by doubling the
-  /// number of vector elements of the scalar type involved. The source must be
-  /// a vector type. For example `<2 x s32>` will become `<4 x s32>`. Doubling
-  /// the number of elements in sN produces <2 x sN>.
-  LLT doubleElements() const {
-    assert(!isPointer() && "cannot double elements in pointer");
-    return LLT{Vector, static_cast<uint16_t>(ElementsOrAddrSpace * 2),
-               SizeInBits};
-  }
-
-  void print(raw_ostream &OS) const;
-
-  bool operator==(const LLT &RHS) const {
-    return Kind == RHS.Kind && SizeInBits == RHS.SizeInBits &&
-           ElementsOrAddrSpace == RHS.ElementsOrAddrSpace;
-  }
-
-  bool operator!=(const LLT &RHS) const { return !(*this == RHS); }
-
-  friend struct DenseMapInfo<LLT>;
-private:
-  unsigned SizeInBits;
-  uint16_t ElementsOrAddrSpace;
-  TypeKind Kind;
-};
-
-inline raw_ostream& operator<<(raw_ostream &OS, const LLT &Ty) {
-  Ty.print(OS);
-  return OS;
-}
 
-template<> struct DenseMapInfo<LLT> {
-  static inline LLT getEmptyKey() {
-    return LLT{LLT::Invalid, 0, -1u};
-  }
-  static inline LLT getTombstoneKey() {
-    return LLT{LLT::Invalid, 0, -2u};
-  }
-  static inline unsigned getHashValue(const LLT &Ty) {
-    uint64_t Val = ((uint64_t)Ty.SizeInBits << 32) |
-                   ((uint64_t)Ty.ElementsOrAddrSpace << 16) | (uint64_t)Ty.Kind;
-    return DenseMapInfo<uint64_t>::getHashValue(Val);
-  }
-  static bool isEqual(const LLT &LHS, const LLT &RHS) {
-    return LHS == RHS;
-  }
-};
+/// Construct a low-level type based on an LLVM type.
+LLT getLLTForType(Type &Ty, const DataLayout &DL);
 
 }
 
-#endif
+#endif // LLVM_CODEGEN_LOWLEVELTYPE_H
diff --git a/include/llvm/CodeGen/MIRYamlMapping.h b/include/llvm/CodeGen/MIRYamlMapping.h
index 05d0429ab63a13549a8bfbe7e8993d0df1ba057e..38cf8aa165a457104f9b522218470727096bae4b 100644
--- a/include/llvm/CodeGen/MIRYamlMapping.h
+++ b/include/llvm/CodeGen/MIRYamlMapping.h
@@ -381,6 +381,7 @@ struct MachineFunction {
   StringRef Name;
   unsigned Alignment = 0;
   bool ExposesReturnsTwice = false;
+  bool NoVRegs = false; // mapOptional leaves it untouched when key is absent.
   // GISel MachineFunctionProperties.
   bool Legalized = false;
   bool RegBankSelected = false;
@@ -405,6 +406,7 @@ template <> struct MappingTraits<MachineFunction> {
     YamlIO.mapRequired("name", MF.Name);
     YamlIO.mapOptional("alignment", MF.Alignment);
     YamlIO.mapOptional("exposesReturnsTwice", MF.ExposesReturnsTwice);
+    YamlIO.mapOptional("noVRegs", MF.NoVRegs);
     YamlIO.mapOptional("legalized", MF.Legalized);
     YamlIO.mapOptional("regBankSelected", MF.RegBankSelected);
     YamlIO.mapOptional("selected", MF.Selected);
diff --git a/include/llvm/CodeGen/MachineBasicBlock.h b/include/llvm/CodeGen/MachineBasicBlock.h
index f3f5e324d76af7b85fbd7c55b3e4a43aa5434c32..18d40564856d56ca31aa29fbae648789a741ad72 100644
--- a/include/llvm/CodeGen/MachineBasicBlock.h
+++ b/include/llvm/CodeGen/MachineBasicBlock.h
@@ -128,7 +128,7 @@ public:
   /// to an LLVM basic block.
   const BasicBlock *getBasicBlock() const { return BB; }
 
-  /// Return the name of the corresponding LLVM basic block, or "(null)".
+  /// Return the name of the corresponding LLVM basic block, or an empty string.
   StringRef getName() const;
 
   /// Return a formatted string to identify this block and its parent function.
@@ -455,10 +455,19 @@ public:
   /// other block.
   bool isLayoutSuccessor(const MachineBasicBlock *MBB) const;
 
-  /// Return true if the block can implicitly transfer control to the block
-  /// after it by falling off the end of it.  This should return false if it can
-  /// reach the block after it, but it uses an explicit branch to do so (e.g., a
-  /// table jump).  True is a conservative answer.
+
+  /// Return the fallthrough block if the block can implicitly
+  /// transfer control to the block after it by falling off the end of
+  /// it.  This should return null if it can reach the block after
+  /// it, but it uses an explicit branch to do so (e.g., a table
+  /// jump).  A non-null return is a conservative answer.
+  MachineBasicBlock *getFallThrough();
+
+  /// Return true if the block can implicitly transfer control to the
+  /// block after it by falling off the end of it.  This should return
+  /// false if it can reach the block after it, but it uses an
+  /// explicit branch to do so (e.g., a table jump).  True is a
+  /// conservative answer.
   bool canFallThrough();
 
   /// Returns a pointer to the first instruction in this block that is not a
@@ -664,6 +673,10 @@ public:
     return findDebugLoc(MBBI.getInstrIterator());
   }
 
+  /// Find and return the merged DebugLoc of the branch instructions of the
+  /// block. Return UnknownLoc if there is none.
+  DebugLoc findBranchDebugLoc();
+
   /// Possible outcome of a register liveness query to computeRegisterLiveness()
   enum LivenessQueryResult {
     LQR_Live,   ///< Register is known to be (at least partially) live.
diff --git a/include/llvm/CodeGen/MachineBlockFrequencyInfo.h b/include/llvm/CodeGen/MachineBlockFrequencyInfo.h
index acd84088f1e52447cadea5164ccdc0ce6025e4b9..cd1c204981ed86760ad2d377dfb83dc7c5e086b5 100644
--- a/include/llvm/CodeGen/MachineBlockFrequencyInfo.h
+++ b/include/llvm/CodeGen/MachineBlockFrequencyInfo.h
@@ -23,6 +23,7 @@ namespace llvm {
 
 class MachineBasicBlock;
 class MachineBranchProbabilityInfo;
+class MachineLoopInfo;
 template <class BlockT> class BlockFrequencyInfoImpl;
 
 /// MachineBlockFrequencyInfo pass uses BlockFrequencyInfoImpl implementation
@@ -42,6 +43,11 @@ public:
 
   bool runOnMachineFunction(MachineFunction &F) override;
 
+  /// calculate - compute block frequency info for the given function.
+  void calculate(const MachineFunction &F,
+                 const MachineBranchProbabilityInfo &MBPI,
+                 const MachineLoopInfo &MLI);
+
   void releaseMemory() override;
 
   /// getblockFreq - Return block frequency. Return 0 if we don't have the
@@ -56,7 +62,7 @@ public:
 
   const MachineFunction *getFunction() const;
   const MachineBranchProbabilityInfo *getMBPI() const;
-  void view(bool isSimple = true) const;
+  void view(const Twine &Name, bool isSimple = true) const;
 
   // Print the block frequency Freq to OS using the current functions entry
   // frequency to convert freq into a relative decimal form.
diff --git a/include/llvm/CodeGen/MachineDominators.h b/include/llvm/CodeGen/MachineDominators.h
index 21ecef587aa5a6ffd872cc8f56504c39089e46a2..30b6cfdd1c36d78a0c35a7744a59ae928b2789f1 100644
--- a/include/llvm/CodeGen/MachineDominators.h
+++ b/include/llvm/CodeGen/MachineDominators.h
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/Support/GenericDomTree.h"
 #include "llvm/Support/GenericDomTreeConstruction.h"
+#include <memory>
 
 namespace llvm {
 
@@ -60,7 +61,7 @@ class MachineDominatorTree : public MachineFunctionPass {
   mutable SmallSet<MachineBasicBlock *, 32> NewBBs;
 
   /// The DominatorTreeBase that is used to compute a normal dominator tree
-  DominatorTreeBase<MachineBasicBlock>* DT;
+  std::unique_ptr<DominatorTreeBase<MachineBasicBlock>> DT;
 
   /// \brief Apply all the recorded critical edges to the DT.
   /// This updates the underlying DT information in a way that uses
@@ -74,9 +75,9 @@ public:
 
   MachineDominatorTree();
 
-  ~MachineDominatorTree() override;
-
   DominatorTreeBase<MachineBasicBlock> &getBase() {
+    if (!DT)
+      DT.reset(new DominatorTreeBase<MachineBasicBlock>(false));
     applySplitCriticalEdges();
     return *DT;
   }
@@ -244,21 +245,6 @@ public:
     CriticalEdgesToSplit.push_back({FromBB, ToBB, NewBB});
   }
 
-  /// \brief Returns *false* if the other dominator tree matches this dominator
-  /// tree.
-  inline bool compare(const MachineDominatorTree &Other) const {
-    const MachineDomTreeNode *R = getRootNode();
-    const MachineDomTreeNode *OtherR = Other.getRootNode();
-
-    if (!R || !OtherR || R->getBlock() != OtherR->getBlock())
-      return true;
-
-    if (DT->compare(*Other.DT))
-      return true;
-
-    return false;
-  }
-
   /// \brief Verify the correctness of the domtree by re-computing it.
   ///
   /// This should only be used for debugging as it aborts the program if the
diff --git a/include/llvm/CodeGen/MachineFrameInfo.h b/include/llvm/CodeGen/MachineFrameInfo.h
index 4600c2c0f10cb128bbbd293a7be37d917d67dfd0..5c9728b0a51ed128c3c078d44816b716b912b008 100644
--- a/include/llvm/CodeGen/MachineFrameInfo.h
+++ b/include/llvm/CodeGen/MachineFrameInfo.h
@@ -559,8 +559,7 @@ public:
     return Objects[ObjectIdx+NumFixedObjects].isAliased;
   }
 
-  /// isImmutableObjectIndex - Returns true if the specified index corresponds
-  /// to an immutable object.
+  /// Returns true if the specified index corresponds to an immutable object.
   bool isImmutableObjectIndex(int ObjectIdx) const {
     // Tail calling functions can clobber their function arguments.
     if (HasTailCall)
@@ -570,6 +569,13 @@ public:
     return Objects[ObjectIdx+NumFixedObjects].isImmutable;
   }
 
+  /// Sets whether the object at \p ObjectIdx is immutable.
+  void setIsImmutableObjectIndex(int ObjectIdx, bool Immutable) {
+    assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() &&
+           "Invalid Object Idx!");
+    Objects[ObjectIdx+NumFixedObjects].isImmutable = Immutable;
+  }
+
   /// Returns true if the specified index corresponds to a spill slot.
   bool isSpillSlotObjectIndex(int ObjectIdx) const {
     assert(unsigned(ObjectIdx+NumFixedObjects) < Objects.size() &&
diff --git a/include/llvm/CodeGen/MachineFunction.h b/include/llvm/CodeGen/MachineFunction.h
index 0c21b3254631ea5d647c19019f6e4db10ca13ece..5859a4e61fdd57a8e41f9de0138886a337ae0dbb 100644
--- a/include/llvm/CodeGen/MachineFunction.h
+++ b/include/llvm/CodeGen/MachineFunction.h
@@ -475,9 +475,8 @@ public:
 
   /// getBlockNumbered - MachineBasicBlocks are automatically numbered when they
   /// are inserted into the machine function.  The block number for a machine
-  /// basic block can be found by using the MBB::getBlockNumber method, this
-  /// method provides the inverse mapping.
-  ///
+  /// basic block can be found by using the MBB::getNumber method, this method
+  /// provides the inverse mapping.
   MachineBasicBlock *getBlockNumbered(unsigned N) const {
     assert(N < MBBNumbering.size() && "Illegal block number");
     assert(MBBNumbering[N] && "Block was removed from the machine function!");
diff --git a/include/llvm/CodeGen/MachineFunctionInitializer.h b/include/llvm/CodeGen/MachineFunctionInitializer.h
index ff4c29cc014d02f7cbf33e11b043ddee81f02f0b..c644c9783e2fefa5fae145c17246878f4b4aac38 100644
--- a/include/llvm/CodeGen/MachineFunctionInitializer.h
+++ b/include/llvm/CodeGen/MachineFunctionInitializer.h
@@ -1,4 +1,4 @@
-//===- MachineFunctionInitalizer.h - machine function initializer ---------===//
+//===- MachineFunctionInitializer.h - machine function initializer --------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
diff --git a/include/llvm/CodeGen/MachineInstr.h b/include/llvm/CodeGen/MachineInstr.h
index 828aef590aa34f5e5fcbbfc1b4927f36d53dc868..e7e728c1be28b7f0e08613d230ad003b68af4a76 100644
--- a/include/llvm/CodeGen/MachineInstr.h
+++ b/include/llvm/CodeGen/MachineInstr.h
@@ -1108,6 +1108,18 @@ public:
   /// the instruction's location and its intended destination.
   bool isSafeToMove(AliasAnalysis *AA, bool &SawStore) const;
 
+  /// Returns true if this instruction's memory access aliases the memory
+  /// access of Other.
+  ///
+  /// Assumes any physical registers used to compute addresses
+  /// have the same value for both instructions.  Returns false if neither
+  /// instruction writes to memory.
+  ///
+  /// @param AA Optional alias analysis, used to compare memory operands.
+  /// @param Other MachineInstr to check aliasing against.
+  /// @param UseTBAA Whether to pass TBAA information to alias analysis.
+  bool mayAlias(AliasAnalysis *AA, MachineInstr &Other, bool UseTBAA);
+
   /// Return true if this instruction may have an ordered
   /// or volatile memory reference, or if the information describing the memory
   /// reference is not available. Return false if it is known to have no
@@ -1146,14 +1158,21 @@ public:
   /// instruction to this instruction.
   void copyImplicitOps(MachineFunction &MF, const MachineInstr &MI);
 
-  //
-  // Debugging support
-  //
-  void print(raw_ostream &OS, bool SkipOpers = false,
+  /// Debugging support
+  /// @{
+  /// Print this MI to \p OS.
+  /// Only print the defs and the opcode if \p SkipOpers is true.
+  /// Otherwise, also print operands if \p SkipDebugLoc is true.
+  /// Otherwise, also print the debug loc, with a terminating newline.
+  /// \p TII is used to print the opcode name.  If it's not present, but the
+  /// MI is in a function, the opcode will be printed using the function's TII.
+  void print(raw_ostream &OS, bool SkipOpers = false, bool SkipDebugLoc = false,
              const TargetInstrInfo *TII = nullptr) const;
   void print(raw_ostream &OS, ModuleSlotTracker &MST, bool SkipOpers = false,
+             bool SkipDebugLoc = false,
              const TargetInstrInfo *TII = nullptr) const;
   void dump() const;
+  /// @}
 
   //===--------------------------------------------------------------------===//
   // Accessors used to build up machine instructions.
diff --git a/include/llvm/CodeGen/MachineInstrBuilder.h b/include/llvm/CodeGen/MachineInstrBuilder.h
index 233a467e17e6b85f17c9d4f8359a0f31b51152ef..ef4226d30fe3627b61c78b8ea431f6a7fdfc26ef 100644
--- a/include/llvm/CodeGen/MachineInstrBuilder.h
+++ b/include/llvm/CodeGen/MachineInstrBuilder.h
@@ -1,4 +1,4 @@
-//===-- CodeGen/MachineInstBuilder.h - Simplify creation of MIs -*- C++ -*-===//
+//===- CodeGen/MachineInstrBuilder.h - Simplify creation of MIs --*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -19,9 +19,18 @@
 #ifndef LLVM_CODEGEN_MACHINEINSTRBUILDER_H
 #define LLVM_CODEGEN_MACHINEINSTRBUILDER_H
 
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Intrinsics.h"
 #include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
 
 namespace llvm {
 
@@ -29,6 +38,7 @@ class MCInstrDesc;
 class MDNode;
 
 namespace RegState {
+
   enum {
     Define         = 0x2,
     Implicit       = 0x4,
@@ -42,13 +52,15 @@ namespace RegState {
     ImplicitDefine = Implicit | Define,
     ImplicitKill   = Implicit | Kill
   };
-}
+
+} // end namespace RegState
 
 class MachineInstrBuilder {
-  MachineFunction *MF;
-  MachineInstr *MI;
+  MachineFunction *MF = nullptr;
+  MachineInstr *MI = nullptr;
+
 public:
-  MachineInstrBuilder() : MF(nullptr), MI(nullptr) {}
+  MachineInstrBuilder() = default;
 
   /// Create a MachineInstrBuilder for manipulating an existing instruction.
   /// F must be the machine function that was used to allocate I.
@@ -518,6 +530,6 @@ public:
   }
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_CODEGEN_MACHINEINSTRBUILDER_H
diff --git a/include/llvm/CodeGen/MachineInstrBundleIterator.h b/include/llvm/CodeGen/MachineInstrBundleIterator.h
index 2d77cfcae20f85f8f04f2af11c904f1411a8195e..3104185385eaf8a5fed02b44d593d9a57ce40f6a 100644
--- a/include/llvm/CodeGen/MachineInstrBundleIterator.h
+++ b/include/llvm/CodeGen/MachineInstrBundleIterator.h
@@ -153,6 +153,18 @@ public:
       : MII(I.getInstrIterator()) {}
   MachineInstrBundleIterator() : MII(nullptr) {}
 
+  /// Explicit conversion between forward/reverse iterators.
+  ///
+  /// Translate between forward and reverse iterators without changing range
+  /// boundaries.  The resulting iterator will dereference (and have a handle)
+  /// to the previous node, which is somewhat unexpected; but converting the
+  /// two endpoints in a range will give the same range in reverse.
+  ///
+  /// This matches std::reverse_iterator conversions.
+  explicit MachineInstrBundleIterator(
+      const MachineInstrBundleIterator<Ty, !IsReverse> &I)
+      : MachineInstrBundleIterator(++I.getReverse()) {}
+
   /// Get the bundle iterator for the given instruction's bundle.
   static MachineInstrBundleIterator getAtBundleBegin(instr_iterator MI) {
     return MachineInstrBundleIteratorHelper<IsReverse>::getBundleBegin(MI);
@@ -258,6 +270,11 @@ public:
 
   nonconst_iterator getNonConstIterator() const { return MII.getNonConst(); }
 
+  /// Get a reverse iterator to the same node.
+  ///
+  /// Gives a reverse iterator that will dereference (and have a handle) to the
+  /// same node.  Converting the endpoint iterators in a range will give a
+  /// different range; for range operations, use the explicit conversions.
   reverse_iterator getReverse() const { return MII.getReverse(); }
 };
 
diff --git a/include/llvm/CodeGen/MachineModuleInfoImpls.h b/include/llvm/CodeGen/MachineModuleInfoImpls.h
index f9fa6999073fc93cfe608ad57609ca915a36ebdd..f28a79c5b5cca3c98b9c78dfd94b3ae267ade0a8 100644
--- a/include/llvm/CodeGen/MachineModuleInfoImpls.h
+++ b/include/llvm/CodeGen/MachineModuleInfoImpls.h
@@ -15,7 +15,9 @@
 #ifndef LLVM_CODEGEN_MACHINEMODULEINFOIMPLS_H
 #define LLVM_CODEGEN_MACHINEMODULEINFOIMPLS_H
 
+#include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/Support/Wasm.h"
 
 namespace llvm {
 class MCSymbol;
@@ -75,6 +77,33 @@ public:
   SymbolListTy GetGVStubList() { return getSortedStubs(GVStubs); }
 };
 
+/// MachineModuleInfoWasm - MachineModuleInfoImpl implementation for Wasm
+/// targets: records CodeGen-defined globals and the stack-pointer global.
+class MachineModuleInfoWasm : public MachineModuleInfoImpl {
+  /// WebAssembly global variables defined by CodeGen, in insertion order.
+  std::vector<wasm::Global> Globals;
+
+  /// The global that is the stack pointer; -1U means none has been set yet.
+  unsigned StackPointerGlobal;
+
+  virtual void anchor(); // Out of line virtual method.
+public:
+  MachineModuleInfoWasm(const MachineModuleInfo &)
+    : StackPointerGlobal(-1U) {}
+
+  void addGlobal(const wasm::Global &G) { Globals.push_back(G); }
+  const std::vector<wasm::Global> &getGlobals() const { return Globals; }
+
+  bool hasStackPointerGlobal() const {
+    return StackPointerGlobal != -1U;
+  }
+  unsigned getStackPointerGlobal() const {
+    assert(hasStackPointerGlobal() && "Stack ptr global hasn't been set");
+    return StackPointerGlobal;
+  }
+  void setStackPointerGlobal(unsigned Global) { StackPointerGlobal = Global; }
+};
+
 } // end namespace llvm
 
 #endif
diff --git a/include/llvm/CodeGen/MachineOperand.h b/include/llvm/CodeGen/MachineOperand.h
index 3fc05f7784a699784ac66edc74bc8fff9f70224c..81b43126adeba7899e61c19d3f99af1e65ede466 100644
--- a/include/llvm/CodeGen/MachineOperand.h
+++ b/include/llvm/CodeGen/MachineOperand.h
@@ -65,6 +65,7 @@ public:
     MO_CFIIndex,          ///< MCCFIInstruction index.
     MO_IntrinsicID,       ///< Intrinsic ID for ISel
     MO_Predicate,         ///< Generic predicate for ISel
+    MO_Placeholder,       ///< Placeholder for GlobalISel ComplexPattern result.
   };
 
 private:
@@ -355,7 +356,7 @@ public:
   void setReg(unsigned Reg);
 
   void setSubReg(unsigned subReg) {
-    assert(isReg() && "Wrong MachineOperand accessor");
+    assert(isReg() && "Wrong MachineOperand mutator");
     SubReg_TargetFlags = subReg;
     assert(SubReg_TargetFlags == subReg && "SubReg out of range");
   }
@@ -378,38 +379,38 @@ public:
   void setIsDef(bool Val = true);
 
   void setImplicit(bool Val = true) {
-    assert(isReg() && "Wrong MachineOperand accessor");
+    assert(isReg() && "Wrong MachineOperand mutator");
     IsImp = Val;
   }
 
   void setIsKill(bool Val = true) {
-    assert(isReg() && !IsDef && "Wrong MachineOperand accessor");
+    assert(isReg() && !IsDef && "Wrong MachineOperand mutator");
     assert((!Val || !isDebug()) && "Marking a debug operation as kill");
     IsKill = Val;
   }
 
   void setIsDead(bool Val = true) {
-    assert(isReg() && IsDef && "Wrong MachineOperand accessor");
+    assert(isReg() && IsDef && "Wrong MachineOperand mutator");
     IsDead = Val;
   }
 
   void setIsUndef(bool Val = true) {
-    assert(isReg() && "Wrong MachineOperand accessor");
+    assert(isReg() && "Wrong MachineOperand mutator");
     IsUndef = Val;
   }
 
   void setIsInternalRead(bool Val = true) {
-    assert(isReg() && "Wrong MachineOperand accessor");
+    assert(isReg() && "Wrong MachineOperand mutator");
     IsInternalRead = Val;
   }
 
   void setIsEarlyClobber(bool Val = true) {
-    assert(isReg() && IsDef && "Wrong MachineOperand accessor");
+    assert(isReg() && IsDef && "Wrong MachineOperand mutator");
     IsEarlyClobber = Val;
   }
 
   void setIsDebug(bool Val = true) {
-    assert(isReg() && !IsDef && "Wrong MachineOperand accessor");
+    assert(isReg() && !IsDef && "Wrong MachineOperand mutator");
     IsDebug = Val;
   }
 
@@ -538,19 +539,19 @@ public:
   void setOffset(int64_t Offset) {
     assert((isGlobal() || isSymbol() || isMCSymbol() || isCPI() ||
             isTargetIndex() || isBlockAddress()) &&
-           "Wrong MachineOperand accessor");
+           "Wrong MachineOperand mutator");
     SmallContents.OffsetLo = unsigned(Offset);
     Contents.OffsetedInfo.OffsetHi = int(Offset >> 32);
   }
 
   void setIndex(int Idx) {
     assert((isFI() || isCPI() || isTargetIndex() || isJTI()) &&
-           "Wrong MachineOperand accessor");
+           "Wrong MachineOperand mutator");
     Contents.OffsetedInfo.Val.Index = Idx;
   }
 
   void setMBB(MachineBasicBlock *MBB) {
-    assert(isMBB() && "Wrong MachineOperand accessor");
+    assert(isMBB() && "Wrong MachineOperand mutator");
     Contents.MBB = MBB;
   }
 
@@ -767,6 +768,11 @@ public:
     return Op;
   }
 
+  /// Build an operand whose kind is MO_Placeholder.
+  static MachineOperand CreatePlaceholder() {
+    return MachineOperand(MachineOperand::MO_Placeholder);
+  }
+
   friend class MachineInstr;
   friend class MachineRegisterInfo;
 private:
diff --git a/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h b/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h
index fd48347d51ae232221879fc663884bd8433a8aa9..da8fdcdf5a33d27fd40cd9bff6efcd0512e2d7d9 100644
--- a/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h
+++ b/include/llvm/CodeGen/MachineOptimizationRemarkEmitter.h
@@ -22,18 +22,26 @@
 namespace llvm {
 class MachineBasicBlock;
 class MachineBlockFrequencyInfo;
+class MachineInstr;
 
 /// \brief Common features for diagnostics dealing with optimization remarks
 /// that are used by machine passes.
 class DiagnosticInfoMIROptimization : public DiagnosticInfoOptimizationBase {
 public:
   DiagnosticInfoMIROptimization(enum DiagnosticKind Kind, const char *PassName,
-                                StringRef RemarkName, const DebugLoc &DLoc,
-                                MachineBasicBlock *MBB)
+                                StringRef RemarkName,
+                                const DiagnosticLocation &Loc,
+                                const MachineBasicBlock *MBB)
       : DiagnosticInfoOptimizationBase(Kind, DS_Remark, PassName, RemarkName,
-                                       *MBB->getParent()->getFunction(), DLoc),
+                                       *MBB->getParent()->getFunction(), Loc),
         MBB(MBB) {}
 
+  /// MI-specific kinds of diagnostic Arguments.
+  struct MachineArgument : public DiagnosticInfoOptimizationBase::Argument {
+    /// Print an entire MachineInstr.
+    MachineArgument(StringRef Key, const MachineInstr &MI);
+  };
+
   static bool classof(const DiagnosticInfo *DI) {
     return DI->getKind() >= DK_FirstMachineRemark &&
            DI->getKind() <= DK_LastMachineRemark;
@@ -42,7 +50,7 @@ public:
   const MachineBasicBlock *getBlock() const { return MBB; }
 
 private:
-  MachineBasicBlock *MBB;
+  const MachineBasicBlock *MBB;
 };
 
 /// Diagnostic information for applied optimization remarks.
@@ -51,12 +59,13 @@ public:
   /// \p PassName is the name of the pass emitting this diagnostic. If this name
   /// matches the regular expression given in -Rpass=, then the diagnostic will
   /// be emitted.  \p RemarkName is a textual identifier for the remark.  \p
-  /// DLoc is the debug location and \p MBB is the block that the optimization
+  /// Loc is the debug location and \p MBB is the block that the optimization
   /// operates in.
   MachineOptimizationRemark(const char *PassName, StringRef RemarkName,
-                            const DebugLoc &DLoc, MachineBasicBlock *MBB)
+                            const DiagnosticLocation &Loc,
+                            const MachineBasicBlock *MBB)
       : DiagnosticInfoMIROptimization(DK_MachineOptimizationRemark, PassName,
-                                      RemarkName, DLoc, MBB) {}
+                                      RemarkName, Loc, MBB) {}
 
   static bool classof(const DiagnosticInfo *DI) {
     return DI->getKind() == DK_MachineOptimizationRemark;
@@ -74,12 +83,13 @@ public:
   /// \p PassName is the name of the pass emitting this diagnostic. If this name
   /// matches the regular expression given in -Rpass-missed=, then the
   /// diagnostic will be emitted.  \p RemarkName is a textual identifier for the
-  /// remark.  \p DLoc is the debug location and \p MBB is the block that the
+  /// remark.  \p Loc is the debug location and \p MBB is the block that the
   /// optimization operates in.
   MachineOptimizationRemarkMissed(const char *PassName, StringRef RemarkName,
-                                  const DebugLoc &DLoc, MachineBasicBlock *MBB)
+                                  const DiagnosticLocation &Loc,
+                                  const MachineBasicBlock *MBB)
       : DiagnosticInfoMIROptimization(DK_MachineOptimizationRemarkMissed,
-                                      PassName, RemarkName, DLoc, MBB) {}
+                                      PassName, RemarkName, Loc, MBB) {}
 
   static bool classof(const DiagnosticInfo *DI) {
     return DI->getKind() == DK_MachineOptimizationRemarkMissed;
@@ -97,13 +107,13 @@ public:
   /// \p PassName is the name of the pass emitting this diagnostic. If this name
   /// matches the regular expression given in -Rpass-analysis=, then the
   /// diagnostic will be emitted.  \p RemarkName is a textual identifier for the
-  /// remark.  \p DLoc is the debug location and \p MBB is the block that the
+  /// remark.  \p Loc is the debug location and \p MBB is the block that the
   /// optimization operates in.
   MachineOptimizationRemarkAnalysis(const char *PassName, StringRef RemarkName,
-                                    const DebugLoc &DLoc,
-                                    MachineBasicBlock *MBB)
+                                    const DiagnosticLocation &Loc,
+                                    const MachineBasicBlock *MBB)
       : DiagnosticInfoMIROptimization(DK_MachineOptimizationRemarkAnalysis,
-                                      PassName, RemarkName, DLoc, MBB) {}
+                                      PassName, RemarkName, Loc, MBB) {}
 
   static bool classof(const DiagnosticInfo *DI) {
     return DI->getKind() == DK_MachineOptimizationRemarkAnalysis;
@@ -115,6 +125,11 @@ public:
   }
 };
 
+/// Extend llvm::ore:: with MI-specific helper names.
+namespace ore {
+using MNV = DiagnosticInfoMIROptimization::MachineArgument;
+} // end namespace ore
+
 /// The optimization diagnostic interface.
 ///
 /// It allows reporting when optimizations are performed and when they are not
diff --git a/include/llvm/CodeGen/MachineRegisterInfo.h b/include/llvm/CodeGen/MachineRegisterInfo.h
index c599caf7535d1cdcaeb1af7316fe46961abc6c2f..6e5c6473ff4a49d9c829ce109078a7e3932f7772 100644
--- a/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -1,4 +1,4 @@
-//===-- llvm/CodeGen/MachineRegisterInfo.h ----------------------*- C++ -*-===//
+//===- llvm/CodeGen/MachineRegisterInfo.h -----------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,19 +15,29 @@
 #define LLVM_CODEGEN_MACHINEREGISTERINFO_H
 
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/IndexedMap.h"
-#include "llvm/ADT/PointerUnion.h"
 #include "llvm/ADT/iterator_range.h"
-// PointerUnion needs to have access to the full RegisterBank type.
+#include "llvm/ADT/PointerUnion.h"
 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
 #include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/LaneBitmask.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <vector>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <utility>
 
 namespace llvm {
+
 class PSetIterator;
 
 /// Convenient type to represent either a register class or a register bank.
@@ -41,15 +51,16 @@ class MachineRegisterInfo {
 public:
   class Delegate {
     virtual void anchor();
+
   public:
-    virtual void MRI_NoteNewVirtualRegister(unsigned Reg) = 0;
+    virtual ~Delegate() = default;
 
-    virtual ~Delegate() {}
+    virtual void MRI_NoteNewVirtualRegister(unsigned Reg) = 0;
   };
 
 private:
   MachineFunction *MF;
-  Delegate *TheDelegate;
+  Delegate *TheDelegate = nullptr;
 
   /// True if subregister liveness is tracked.
   const bool TracksSubRegLiveness;
@@ -62,6 +73,15 @@ private:
              VirtReg2IndexFunctor>
       VRegInfo;
 
+  /// The flag is true upon \p UpdatedCSRs initialization
+  /// and false otherwise.
+  bool IsUpdatedCSRsInitialized;
+
+  /// Contains the updated callee saved register list.
+  /// As opposed to the static list defined in register info,
+  /// all registers that were disabled are removed from the list.
+  SmallVector<MCPhysReg, 16> UpdatedCSRs;
+
   /// RegAllocHints - This vector records register allocation hints for virtual
   /// registers. For each virtual register, it keeps a register and hint type
   /// pair making up the allocation hint. Hint type is target specific except
@@ -113,12 +133,12 @@ private:
   /// Live in values are typically arguments in registers.  LiveIn values are
   /// allowed to have virtual registers associated with them, stored in the
   /// second element.
-  std::vector<std::pair<unsigned, unsigned> > LiveIns;
+  std::vector<std::pair<unsigned, unsigned>> LiveIns;
 
-  MachineRegisterInfo(const MachineRegisterInfo&) = delete;
-  void operator=(const MachineRegisterInfo&) = delete;
 public:
   explicit MachineRegisterInfo(MachineFunction *MF);
+  MachineRegisterInfo(const MachineRegisterInfo &) = delete;
+  MachineRegisterInfo &operator=(const MachineRegisterInfo &) = delete;
 
   const TargetRegisterInfo *getTargetRegisterInfo() const {
     return MF->getSubtarget().getRegisterInfo();
@@ -196,6 +216,23 @@ public:
   // Register Info
   //===--------------------------------------------------------------------===//
 
+  /// Returns true if the updated CSR list was initialized and false otherwise.
+  bool isUpdatedCSRsInitialized() const { return IsUpdatedCSRsInitialized; }
+
+  /// Disables the register from the list of CSRs.
+  /// I.e. the register will not appear as part of the CSR mask.
+  /// \see UpdatedCalleeSavedRegs.
+  void disableCalleeSavedRegister(unsigned Reg);
+
+  /// Returns list of callee saved registers.
+  /// The function returns the updated CSR list (after taking into account
+  /// registers that are disabled from the CSR list).
+  const MCPhysReg *getCalleeSavedRegs() const;
+
+  /// Sets the updated Callee Saved Registers list.
+  /// Notice that it will override any previously disabled/saved CSRs.
+  void setCalleeSavedRegs(ArrayRef<MCPhysReg> CSRs);
+
   // Strictly for use by MachineInstr.cpp.
   void addRegOperandToUseList(MachineOperand *MO);
 
@@ -227,8 +264,6 @@ public:
   template<bool, bool, bool, bool, bool, bool>
     friend class defusechain_instr_iterator;
 
-
-
   /// reg_iterator/reg_begin/reg_end - Walk all defs and uses of the specified
   /// register.
   typedef defusechain_iterator<true,true,false,true,false,false>
@@ -727,8 +762,6 @@ public:
 
   const BitVector &getUsedPhysRegsMask() const { return UsedPhysRegMask; }
 
-  void setUsedPhysRegMask(BitVector &Mask) { UsedPhysRegMask = Mask; }
-
   //===--------------------------------------------------------------------===//
   // Reserved Register Info
   //===--------------------------------------------------------------------===//
@@ -800,7 +833,7 @@ public:
 
   // Iteration support for the live-ins set.  It's kept in sorted order
   // by register number.
-  typedef std::vector<std::pair<unsigned,unsigned> >::const_iterator
+  typedef std::vector<std::pair<unsigned,unsigned>>::const_iterator
   livein_iterator;
   livein_iterator livein_begin() const { return LiveIns.begin(); }
   livein_iterator livein_end()   const { return LiveIns.end(); }
@@ -836,7 +869,10 @@ public:
            bool ByOperand, bool ByInstr, bool ByBundle>
   class defusechain_iterator
     : public std::iterator<std::forward_iterator_tag, MachineInstr, ptrdiff_t> {
-    MachineOperand *Op;
+    friend class MachineRegisterInfo;
+
+    MachineOperand *Op = nullptr;
+
     explicit defusechain_iterator(MachineOperand *op) : Op(op) {
       // If the first node isn't one we're interested in, advance to one that
       // we are interested in.
@@ -847,7 +883,6 @@ public:
           advance();
       }
     }
-    friend class MachineRegisterInfo;
 
     void advance() {
       assert(Op && "Cannot increment end iterator!");
@@ -868,13 +903,14 @@ public:
           Op = getNextOperandForReg(Op);
       }
     }
+
   public:
     typedef std::iterator<std::forward_iterator_tag,
                           MachineInstr, ptrdiff_t>::reference reference;
     typedef std::iterator<std::forward_iterator_tag,
                           MachineInstr, ptrdiff_t>::pointer pointer;
 
-    defusechain_iterator() : Op(nullptr) {}
+    defusechain_iterator() = default;
 
     bool operator==(const defusechain_iterator &x) const {
       return Op == x.Op;
@@ -939,7 +975,10 @@ public:
            bool ByOperand, bool ByInstr, bool ByBundle>
   class defusechain_instr_iterator
     : public std::iterator<std::forward_iterator_tag, MachineInstr, ptrdiff_t> {
-    MachineOperand *Op;
+    friend class MachineRegisterInfo;
+
+    MachineOperand *Op = nullptr;
+
     explicit defusechain_instr_iterator(MachineOperand *op) : Op(op) {
       // If the first node isn't one we're interested in, advance to one that
       // we are interested in.
@@ -950,7 +989,6 @@ public:
           advance();
       }
     }
-    friend class MachineRegisterInfo;
 
     void advance() {
       assert(Op && "Cannot increment end iterator!");
@@ -971,13 +1009,14 @@ public:
           Op = getNextOperandForReg(Op);
       }
     }
+
   public:
     typedef std::iterator<std::forward_iterator_tag,
                           MachineInstr, ptrdiff_t>::reference reference;
     typedef std::iterator<std::forward_iterator_tag,
                           MachineInstr, ptrdiff_t>::pointer pointer;
 
-    defusechain_instr_iterator() : Op(nullptr) {}
+    defusechain_instr_iterator() = default;
 
     bool operator==(const defusechain_instr_iterator &x) const {
       return Op == x.Op;
@@ -1029,10 +1068,12 @@ public:
 /// register. If Reg is physical, it must be a register unit (from
 /// MCRegUnitIterator).
 class PSetIterator {
-  const int *PSet;
-  unsigned Weight;
+  const int *PSet = nullptr;
+  unsigned Weight = 0;
+
 public:
-  PSetIterator(): PSet(nullptr), Weight(0) {}
+  PSetIterator() = default;
+
   PSetIterator(unsigned RegUnit, const MachineRegisterInfo *MRI) {
     const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
     if (TargetRegisterInfo::isVirtualRegister(RegUnit)) {
@@ -1047,6 +1088,7 @@ public:
     if (*PSet == -1)
       PSet = nullptr;
   }
+
   bool isValid() const { return PSet; }
 
   unsigned getWeight() const { return Weight; }
@@ -1066,6 +1108,6 @@ getPressureSets(unsigned RegUnit) const {
   return PSetIterator(RegUnit, this);
 }
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_CODEGEN_MACHINEREGISTERINFO_H
diff --git a/include/llvm/CodeGen/MachineScheduler.h b/include/llvm/CodeGen/MachineScheduler.h
index 81b8741fea27f9180385788fa3e9a14267a828aa..6b2a16e1d36e672e1213d812644f703164079ef8 100644
--- a/include/llvm/CodeGen/MachineScheduler.h
+++ b/include/llvm/CodeGen/MachineScheduler.h
@@ -1,4 +1,4 @@
-//==- MachineScheduler.h - MachineInstr Scheduling Pass ----------*- C++ -*-==//
+//===- MachineScheduler.h - MachineInstr Scheduling Pass --------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -112,12 +112,12 @@ class ScheduleHazardRecognizer;
 /// MachineSchedContext provides enough context from the MachineScheduler pass
 /// for the target to instantiate a scheduler.
 struct MachineSchedContext {
-  MachineFunction *MF;
-  const MachineLoopInfo *MLI;
-  const MachineDominatorTree *MDT;
-  const TargetPassConfig *PassConfig;
-  AliasAnalysis *AA;
-  LiveIntervals *LIS;
+  MachineFunction *MF = nullptr;
+  const MachineLoopInfo *MLI = nullptr;
+  const MachineDominatorTree *MDT = nullptr;
+  const TargetPassConfig *PassConfig = nullptr;
+  AliasAnalysis *AA = nullptr;
+  LiveIntervals *LIS = nullptr;
 
   RegisterClassInfo *RegClassInfo;
 
@@ -165,22 +165,21 @@ class ScheduleDAGMI;
 /// before building the DAG.
 struct MachineSchedPolicy {
   // Allow the scheduler to disable register pressure tracking.
-  bool ShouldTrackPressure;
+  bool ShouldTrackPressure = false;
   /// Track LaneMasks to allow reordering of independent subregister writes
   /// of the same vreg. \sa MachineSchedStrategy::shouldTrackLaneMasks()
-  bool ShouldTrackLaneMasks;
+  bool ShouldTrackLaneMasks = false;
 
   // Allow the scheduler to force top-down or bottom-up scheduling. If neither
   // is true, the scheduler runs in both directions and converges.
-  bool OnlyTopDown;
-  bool OnlyBottomUp;
+  bool OnlyTopDown = false;
+  bool OnlyBottomUp = false;
 
   // Disable heuristic that tries to fetch nodes from long dependency chains
   // first.
-  bool DisableLatencyHeuristic;
+  bool DisableLatencyHeuristic = false;
 
-  MachineSchedPolicy(): ShouldTrackPressure(false), ShouldTrackLaneMasks(false),
-    OnlyTopDown(false), OnlyBottomUp(false), DisableLatencyHeuristic(false) {}
+  MachineSchedPolicy() = default;
 };
 
 /// MachineSchedStrategy - Interface to the scheduling algorithm used by
@@ -232,6 +231,7 @@ public:
   /// When all predecessor dependencies have been resolved, free this node for
   /// top-down scheduling.
   virtual void releaseTopNode(SUnit *SU) = 0;
+
   /// When all successor dependencies have been resolved, free this node for
   /// bottom-up scheduling.
   virtual void releaseBottomNode(SUnit *SU) = 0;
@@ -261,24 +261,20 @@ protected:
   MachineBasicBlock::iterator CurrentBottom;
 
   /// Record the next node in a scheduled cluster.
-  const SUnit *NextClusterPred;
-  const SUnit *NextClusterSucc;
+  const SUnit *NextClusterPred = nullptr;
+  const SUnit *NextClusterSucc = nullptr;
 
 #ifndef NDEBUG
   /// The number of instructions scheduled so far. Used to cut off the
   /// scheduler at the point determined by misched-cutoff.
-  unsigned NumInstrsScheduled;
+  unsigned NumInstrsScheduled = 0;
 #endif
+
 public:
   ScheduleDAGMI(MachineSchedContext *C, std::unique_ptr<MachineSchedStrategy> S,
                 bool RemoveKillFlags)
       : ScheduleDAGInstrs(*C->MF, C->MLI, RemoveKillFlags), AA(C->AA),
-        LIS(C->LIS), SchedImpl(std::move(S)), Topo(SUnits, &ExitSU),
-        NextClusterPred(nullptr), NextClusterSucc(nullptr) {
-#ifndef NDEBUG
-    NumInstrsScheduled = 0;
-#endif
-  }
+        LIS(C->LIS), SchedImpl(std::move(S)), Topo(SUnits, &ExitSU) {}
 
   // Provide a vtable anchor
   ~ScheduleDAGMI() override;
@@ -375,7 +371,7 @@ protected:
 
   /// Information about DAG subtrees. If DFSResult is NULL, then SchedulerTrees
   /// will be empty.
-  SchedDFSResult *DFSResult;
+  SchedDFSResult *DFSResult = nullptr;
   BitVector ScheduledTrees;
 
   MachineBasicBlock::iterator LiveRegionEnd;
@@ -389,8 +385,8 @@ protected:
   PressureDiffs SUPressureDiffs;
 
   /// Register pressure in this region computed by initRegPressure.
-  bool ShouldTrackPressure;
-  bool ShouldTrackLaneMasks;
+  bool ShouldTrackPressure = false;
+  bool ShouldTrackLaneMasks = false;
   IntervalPressure RegPressure;
   RegPressureTracker RPTracker;
 
@@ -409,16 +405,14 @@ protected:
 
   /// True if disconnected subregister components are already renamed.
   /// The renaming is only done on demand if lane masks are tracked.
-  bool DisconnectedComponentsRenamed;
+  bool DisconnectedComponentsRenamed = false;
 
 public:
   ScheduleDAGMILive(MachineSchedContext *C,
                     std::unique_ptr<MachineSchedStrategy> S)
       : ScheduleDAGMI(C, std::move(S), /*RemoveKillFlags=*/false),
-        RegClassInfo(C->RegClassInfo), DFSResult(nullptr),
-        ShouldTrackPressure(false), ShouldTrackLaneMasks(false),
-        RPTracker(RegPressure), TopRPTracker(TopPressure),
-        BotRPTracker(BotPressure), DisconnectedComponentsRenamed(false) {}
+        RegClassInfo(C->RegClassInfo), RPTracker(RegPressure),
+        TopRPTracker(TopPressure), BotRPTracker(BotPressure) {}
 
   ~ScheduleDAGMILive() override;
 
@@ -573,6 +567,8 @@ struct SchedRemainder {
   // Unscheduled resources
   SmallVector<unsigned, 16> RemainingCounts;
 
+  SchedRemainder() { reset(); }
+
   void reset() {
     CriticalPath = 0;
     CyclicCritPath = 0;
@@ -581,8 +577,6 @@ struct SchedRemainder {
     RemainingCounts.clear();
   }
 
-  SchedRemainder() { reset(); }
-
   void init(ScheduleDAGMI *DAG, const TargetSchedModel *SchedModel);
 };
 
@@ -598,14 +592,14 @@ public:
     LogMaxQID = 2
   };
 
-  ScheduleDAGMI *DAG;
-  const TargetSchedModel *SchedModel;
-  SchedRemainder *Rem;
+  ScheduleDAGMI *DAG = nullptr;
+  const TargetSchedModel *SchedModel = nullptr;
+  SchedRemainder *Rem = nullptr;
 
   ReadyQueue Available;
   ReadyQueue Pending;
 
-  ScheduleHazardRecognizer *HazardRec;
+  ScheduleHazardRecognizer *HazardRec = nullptr;
 
 private:
   /// True if the pending Q should be checked/updated before scheduling another
@@ -665,9 +659,7 @@ public:
   /// Pending queues extend the ready queues with the same ID and the
   /// PendingFlag set.
   SchedBoundary(unsigned ID, const Twine &Name):
-    DAG(nullptr), SchedModel(nullptr), Rem(nullptr), Available(ID, Name+".A"),
-    Pending(ID << LogMaxQID, Name+".P"),
-    HazardRec(nullptr) {
+    Available(ID, Name+".A"), Pending(ID << LogMaxQID, Name+".P") {
     reset();
   }
 
@@ -781,11 +773,11 @@ public:
 
   /// Policy for scheduling the next instruction in the candidate's zone.
   struct CandPolicy {
-    bool ReduceLatency;
-    unsigned ReduceResIdx;
-    unsigned DemandResIdx;
+    bool ReduceLatency = false;
+    unsigned ReduceResIdx = 0;
+    unsigned DemandResIdx = 0;
 
-    CandPolicy(): ReduceLatency(false), ReduceResIdx(0), DemandResIdx(0) {}
+    CandPolicy() = default;
 
     bool operator==(const CandPolicy &RHS) const {
       return ReduceLatency == RHS.ReduceLatency &&
@@ -800,12 +792,12 @@ public:
   /// Status of an instruction's critical resource consumption.
   struct SchedResourceDelta {
     // Count critical resources in the scheduled region required by SU.
-    unsigned CritResources;
+    unsigned CritResources = 0;
 
     // Count critical resources from another region consumed by SU.
-    unsigned DemandedResources;
+    unsigned DemandedResources = 0;
 
-    SchedResourceDelta(): CritResources(0), DemandedResources(0) {}
+    SchedResourceDelta() = default;
 
     bool operator==(const SchedResourceDelta &RHS) const {
       return CritResources == RHS.CritResources
@@ -866,13 +858,12 @@ public:
 
 protected:
   const MachineSchedContext *Context;
-  const TargetSchedModel *SchedModel;
-  const TargetRegisterInfo *TRI;
+  const TargetSchedModel *SchedModel = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
 
   SchedRemainder Rem;
 
-  GenericSchedulerBase(const MachineSchedContext *C):
-    Context(C), SchedModel(nullptr), TRI(nullptr) {}
+  GenericSchedulerBase(const MachineSchedContext *C) : Context(C) {}
 
   void setPolicy(CandPolicy &Policy, bool IsPostRA, SchedBoundary &CurrZone,
                  SchedBoundary *OtherZone);
@@ -887,7 +878,7 @@ protected:
 class GenericScheduler : public GenericSchedulerBase {
 public:
   GenericScheduler(const MachineSchedContext *C):
-    GenericSchedulerBase(C), DAG(nullptr), Top(SchedBoundary::TopQID, "TopQ"),
+    GenericSchedulerBase(C), Top(SchedBoundary::TopQID, "TopQ"),
     Bot(SchedBoundary::BotQID, "BotQ") {}
 
   void initPolicy(MachineBasicBlock::iterator Begin,
@@ -929,7 +920,7 @@ public:
   void registerRoots() override;
 
 protected:
-  ScheduleDAGMILive *DAG;
+  ScheduleDAGMILive *DAG = nullptr;
 
   MachineSchedPolicy RegionPolicy;
 
@@ -1032,9 +1023,6 @@ std::unique_ptr<ScheduleDAGMutation>
 createStoreClusterDAGMutation(const TargetInstrInfo *TII,
                               const TargetRegisterInfo *TRI);
 
-std::unique_ptr<ScheduleDAGMutation>
-createMacroFusionDAGMutation(const TargetInstrInfo *TII);
-
 std::unique_ptr<ScheduleDAGMutation>
 createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
                                const TargetRegisterInfo *TRI);
diff --git a/include/llvm/CodeGen/MachineTraceMetrics.h b/include/llvm/CodeGen/MachineTraceMetrics.h
index 06db17abaed91b57287d5ce90e5e696c745666e2..284f8c1976076351a62f734af7b3967214ef9aa1 100644
--- a/include/llvm/CodeGen/MachineTraceMetrics.h
+++ b/include/llvm/CodeGen/MachineTraceMetrics.h
@@ -49,54 +49,59 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/TargetSchedule.h"
 
 namespace llvm {
 
-class InstrItineraryData;
+class AnalysisUsage;
 class MachineBasicBlock;
+class MachineFunction;
 class MachineInstr;
 class MachineLoop;
 class MachineLoopInfo;
 class MachineRegisterInfo;
+struct MCSchedClassDesc;
+class raw_ostream;
 class TargetInstrInfo;
 class TargetRegisterInfo;
-class raw_ostream;
 
 class MachineTraceMetrics : public MachineFunctionPass {
-  const MachineFunction *MF;
-  const TargetInstrInfo *TII;
-  const TargetRegisterInfo *TRI;
-  const MachineRegisterInfo *MRI;
-  const MachineLoopInfo *Loops;
+  const MachineFunction *MF = nullptr;
+  const TargetInstrInfo *TII = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
+  const MachineRegisterInfo *MRI = nullptr;
+  const MachineLoopInfo *Loops = nullptr;
   TargetSchedModel SchedModel;
 
 public:
+  friend class Ensemble;
+  friend class Trace;
+
   class Ensemble;
-  class Trace;
+
   static char ID;
+
   MachineTraceMetrics();
+
   void getAnalysisUsage(AnalysisUsage&) const override;
   bool runOnMachineFunction(MachineFunction&) override;
   void releaseMemory() override;
   void verifyAnalysis() const override;
 
-  friend class Ensemble;
-  friend class Trace;
-
   /// Per-basic block information that doesn't depend on the trace through the
   /// block.
   struct FixedBlockInfo {
     /// The number of non-trivial instructions in the block.
     /// Doesn't count PHI and COPY instructions that are likely to be removed.
-    unsigned InstrCount;
+    unsigned InstrCount = ~0u;
 
     /// True when the block contains calls.
-    bool HasCalls;
+    bool HasCalls = false;
 
-    FixedBlockInfo() : InstrCount(~0u), HasCalls(false) {}
+    FixedBlockInfo() = default;
 
     /// Returns true when resource information for this block has been computed.
     bool hasResources() const { return InstrCount != ~0u; }
@@ -134,11 +139,11 @@ public:
   struct TraceBlockInfo {
     /// Trace predecessor, or NULL for the first block in the trace.
     /// Valid when hasValidDepth().
-    const MachineBasicBlock *Pred;
+    const MachineBasicBlock *Pred = nullptr;
 
     /// Trace successor, or NULL for the last block in the trace.
     /// Valid when hasValidHeight().
-    const MachineBasicBlock *Succ;
+    const MachineBasicBlock *Succ = nullptr;
 
     /// The block number of the head of the trace. (When hasValidDepth()).
     unsigned Head;
@@ -148,16 +153,13 @@ public:
 
     /// Accumulated number of instructions in the trace above this block.
     /// Does not include instructions in this block.
-    unsigned InstrDepth;
+    unsigned InstrDepth = ~0u;
 
     /// Accumulated number of instructions in the trace below this block.
     /// Includes instructions in this block.
-    unsigned InstrHeight;
+    unsigned InstrHeight = ~0u;
 
-    TraceBlockInfo() :
-      Pred(nullptr), Succ(nullptr),
-      InstrDepth(~0u), InstrHeight(~0u),
-      HasValidInstrDepths(false), HasValidInstrHeights(false) {}
+    TraceBlockInfo() = default;
 
     /// Returns true if the depth resources have been computed from the trace
     /// above this block.
@@ -199,10 +201,10 @@ public:
     // itinerary data.
 
     /// Instruction depths have been computed. This implies hasValidDepth().
-    bool HasValidInstrDepths;
+    bool HasValidInstrDepths = false;
 
     /// Instruction heights have been computed. This implies hasValidHeight().
-    bool HasValidInstrHeights;
+    bool HasValidInstrHeights = false;
 
     /// Critical path length. This is the number of cycles in the longest data
     /// dependency chain through the trace. This is only valid when both
@@ -242,6 +244,7 @@ public:
 
   public:
     explicit Trace(Ensemble &te, TraceBlockInfo &tbi) : TE(te), TBI(tbi) {}
+
     void print(raw_ostream&) const;
 
     /// Compute the total number of instructions in the trace.
@@ -300,11 +303,12 @@ public:
   /// strategy, for example 'minimum resource height'. There is one trace for
   /// every block in the function.
   class Ensemble {
+    friend class Trace;
+
     SmallVector<TraceBlockInfo, 4> BlockInfo;
     DenseMap<const MachineInstr*, InstrCycles> Cycles;
     SmallVector<unsigned, 0> ProcResourceDepths;
     SmallVector<unsigned, 0> ProcResourceHeights;
-    friend class Trace;
 
     void computeTrace(const MachineBasicBlock*);
     void computeDepthResources(const MachineBasicBlock*);
@@ -317,9 +321,11 @@ public:
 
   protected:
     MachineTraceMetrics &MTM;
+
+    explicit Ensemble(MachineTraceMetrics*);
+
     virtual const MachineBasicBlock *pickTracePred(const MachineBasicBlock*) =0;
     virtual const MachineBasicBlock *pickTraceSucc(const MachineBasicBlock*) =0;
-    explicit Ensemble(MachineTraceMetrics*);
     const MachineLoop *getLoopFor(const MachineBasicBlock*) const;
     const TraceBlockInfo *getDepthResources(const MachineBasicBlock*) const;
     const TraceBlockInfo *getHeightResources(const MachineBasicBlock*) const;
@@ -328,7 +334,8 @@ public:
 
   public:
     virtual ~Ensemble();
-    virtual const char *getName() const =0;
+
+    virtual const char *getName() const = 0;
     void print(raw_ostream&) const;
     void invalidate(const MachineBasicBlock *MBB);
     void verify() const;
@@ -394,6 +401,7 @@ inline raw_ostream &operator<<(raw_ostream &OS,
   En.print(OS);
   return OS;
 }
+
 } // end namespace llvm
 
-#endif
+#endif // LLVM_CODEGEN_MACHINETRACEMETRICS_H
diff --git a/include/llvm/CodeGen/MachineValueType.h b/include/llvm/CodeGen/MachineValueType.h
index de7064f07c3e54bf505f7c452a2a71b3a475287a..e4744fd5e260b877c35f6054ebbbcb6eef99b1a2 100644
--- a/include/llvm/CodeGen/MachineValueType.h
+++ b/include/llvm/CodeGen/MachineValueType.h
@@ -23,14 +23,13 @@ namespace llvm {
 
   class Type;
 
-  /// MVT - Machine Value Type. Every type that is supported natively by some
+  /// Machine Value Type. Every type that is supported natively by some
   /// processor targeted by LLVM occurs here. This means that any legal value
   /// type can be represented by an MVT.
 class MVT {
   public:
     enum SimpleValueType : int8_t {
-      // INVALID_SIMPLE_VALUE_TYPE - Simple value types less than zero are
-      // considered extended value types.
+      // Simple value types less than zero are considered extended value types.
       INVALID_SIMPLE_VALUE_TYPE = -1,
 
       // If you change this numbering, you must change the values in
@@ -141,37 +140,37 @@ class MVT {
       // This value must be a multiple of 32.
       MAX_ALLOWED_VALUETYPE = 96,
 
-      // Token - A value of type llvm::TokenTy
+      // A value of type llvm::TokenTy
       token          = 120,
 
-      // Metadata - This is MDNode or MDString.
+      // This is MDNode or MDString.
       Metadata       = 121,
 
-      // iPTRAny - An int value the size of the pointer of the current
+      // An int value the size of the pointer of the current
       // target to any address space. This must only be used internal to
       // tblgen. Other than for overloading, we treat iPTRAny the same as iPTR.
       iPTRAny        = 122,
 
-      // vAny - A vector with any length and element size. This is used
+      // A vector with any length and element size. This is used
       // for intrinsics that have overloadings based on vector types.
       // This is only for tblgen's consumption!
       vAny           = 123,
 
-      // fAny - Any floating-point or vector floating-point value. This is used
+      // Any floating-point or vector floating-point value. This is used
       // for intrinsics that have overloadings based on floating-point types.
       // This is only for tblgen's consumption!
       fAny           = 124,
 
-      // iAny - An integer or vector integer value of any bit width. This is
+      // An integer or vector integer value of any bit width. This is
       // used for intrinsics that have overloadings based on integer bit widths.
       // This is only for tblgen's consumption!
       iAny           = 125,
 
-      // iPTR - An int value the size of the pointer of the current
+      // An int value the size of the pointer of the current
       // target.  This should only be used internal to tblgen!
       iPTR           = 126,
 
-      // Any - Any type. This is used for intrinsics that have overloadings.
+      // Any type. This is used for intrinsics that have overloadings.
       // This is only for tblgen's consumption!
       Any            = 127
     };
@@ -188,13 +187,13 @@ class MVT {
     bool operator>=(const MVT& S) const { return SimpleTy >= S.SimpleTy; }
     bool operator<=(const MVT& S) const { return SimpleTy <= S.SimpleTy; }
 
-    /// isValid - Return true if this is a valid simple valuetype.
+    /// Return true if this is a valid simple valuetype.
     bool isValid() const {
       return (SimpleTy >= MVT::FIRST_VALUETYPE &&
               SimpleTy < MVT::LAST_VALUETYPE);
     }
 
-    /// isFloatingPoint - Return true if this is a FP, or a vector FP type.
+    /// Return true if this is a FP or a vector FP type.
     bool isFloatingPoint() const {
       return ((SimpleTy >= MVT::FIRST_FP_VALUETYPE &&
                SimpleTy <= MVT::LAST_FP_VALUETYPE) ||
@@ -202,7 +201,7 @@ class MVT {
                SimpleTy <= MVT::LAST_FP_VECTOR_VALUETYPE));
     }
 
-    /// isInteger - Return true if this is an integer, or a vector integer type.
+    /// Return true if this is an integer or a vector integer type.
     bool isInteger() const {
       return ((SimpleTy >= MVT::FIRST_INTEGER_VALUETYPE &&
                SimpleTy <= MVT::LAST_INTEGER_VALUETYPE) ||
@@ -210,41 +209,40 @@ class MVT {
                SimpleTy <= MVT::LAST_INTEGER_VECTOR_VALUETYPE));
     }
 
-    /// isScalarInteger - Return true if this is an integer, not including
-    /// vectors.
+    /// Return true if this is an integer, not including vectors.
     bool isScalarInteger() const {
       return (SimpleTy >= MVT::FIRST_INTEGER_VALUETYPE &&
               SimpleTy <= MVT::LAST_INTEGER_VALUETYPE);
     }
 
-    /// isVector - Return true if this is a vector value type.
+    /// Return true if this is a vector value type.
     bool isVector() const {
       return (SimpleTy >= MVT::FIRST_VECTOR_VALUETYPE &&
               SimpleTy <= MVT::LAST_VECTOR_VALUETYPE);
     }
 
-    /// is16BitVector - Return true if this is a 16-bit vector type.
+    /// Return true if this is a 16-bit vector type.
     bool is16BitVector() const {
       return (SimpleTy == MVT::v2i8  || SimpleTy == MVT::v1i16 ||
               SimpleTy == MVT::v16i1);
     }
 
-    /// is32BitVector - Return true if this is a 32-bit vector type.
+    /// Return true if this is a 32-bit vector type.
     bool is32BitVector() const {
-      return (SimpleTy == MVT::v4i8  || SimpleTy == MVT::v2i16 ||
-              SimpleTy == MVT::v1i32 || SimpleTy == MVT::v2f16 ||
-              SimpleTy == MVT::v1f32);
+      return (SimpleTy == MVT::v32i1 || SimpleTy == MVT::v4i8  ||
+              SimpleTy == MVT::v2i16 || SimpleTy == MVT::v1i32 ||
+              SimpleTy == MVT::v2f16 || SimpleTy == MVT::v1f32);
     }
 
-    /// is64BitVector - Return true if this is a 64-bit vector type.
+    /// Return true if this is a 64-bit vector type.
     bool is64BitVector() const {
-      return (SimpleTy == MVT::v8i8  || SimpleTy == MVT::v4i16 ||
-              SimpleTy == MVT::v2i32 || SimpleTy == MVT::v1i64 ||
-              SimpleTy == MVT::v4f16 || SimpleTy == MVT::v2f32 ||
-              SimpleTy == MVT::v1f64);
+      return (SimpleTy == MVT::v64i1 || SimpleTy == MVT::v8i8  ||
+              SimpleTy == MVT::v4i16 || SimpleTy == MVT::v2i32 ||
+              SimpleTy == MVT::v1i64 || SimpleTy == MVT::v4f16 ||
+              SimpleTy == MVT::v2f32 || SimpleTy == MVT::v1f64);
     }
 
-    /// is128BitVector - Return true if this is a 128-bit vector type.
+    /// Return true if this is a 128-bit vector type.
     bool is128BitVector() const {
       return (SimpleTy == MVT::v16i8  || SimpleTy == MVT::v8i16 ||
               SimpleTy == MVT::v4i32  || SimpleTy == MVT::v2i64 ||
@@ -252,14 +250,14 @@ class MVT {
               SimpleTy == MVT::v4f32  || SimpleTy == MVT::v2f64);
     }
 
-    /// is256BitVector - Return true if this is a 256-bit vector type.
+    /// Return true if this is a 256-bit vector type.
     bool is256BitVector() const {
       return (SimpleTy == MVT::v8f32 || SimpleTy == MVT::v4f64  ||
               SimpleTy == MVT::v32i8 || SimpleTy == MVT::v16i16 ||
               SimpleTy == MVT::v8i32 || SimpleTy == MVT::v4i64);
     }
 
-    /// is512BitVector - Return true if this is a 512-bit vector type.
+    /// Return true if this is a 512-bit vector type.
     bool is512BitVector() const {
       return (SimpleTy == MVT::v16f32 || SimpleTy == MVT::v8f64  ||
               SimpleTy == MVT::v512i1 || SimpleTy == MVT::v64i8  ||
@@ -267,34 +265,34 @@ class MVT {
               SimpleTy == MVT::v8i64);
     }
 
-    /// is1024BitVector - Return true if this is a 1024-bit vector type.
+    /// Return true if this is a 1024-bit vector type.
     bool is1024BitVector() const {
       return (SimpleTy == MVT::v1024i1 || SimpleTy == MVT::v128i8 ||
               SimpleTy == MVT::v64i16  || SimpleTy == MVT::v32i32 ||
               SimpleTy == MVT::v16i64);
     }
 
-    /// is2048BitVector - Return true if this is a 1024-bit vector type.
+    /// Return true if this is a 2048-bit vector type.
     bool is2048BitVector() const {
       return (SimpleTy == MVT::v256i8 || SimpleTy == MVT::v128i16 ||
               SimpleTy == MVT::v64i32 || SimpleTy == MVT::v32i64);
     }
 
-    /// isOverloaded - Return true if this is an overloaded type for TableGen.
+    /// Return true if this is an overloaded type for TableGen.
     bool isOverloaded() const {
       return (SimpleTy==MVT::Any  ||
               SimpleTy==MVT::iAny || SimpleTy==MVT::fAny ||
               SimpleTy==MVT::vAny || SimpleTy==MVT::iPTRAny);
     }
 
-    /// isPow2VectorType - Returns true if the given vector is a power of 2.
+    /// Returns true if the given vector's element count is a power of 2.
     bool isPow2VectorType() const {
       unsigned NElts = getVectorNumElements();
       return !(NElts & (NElts - 1));
     }
 
-    /// getPow2VectorType - Widens the length of the given vector MVT up to
-    /// the nearest power of 2 and returns that type.
+    /// Widens the length of the given vector MVT up to the nearest power of 2
+    /// and returns that type.
     MVT getPow2VectorType() const {
       if (isPow2VectorType())
         return *this;
@@ -304,8 +302,7 @@ class MVT {
       return MVT::getVectorVT(getVectorElementType(), Pow2NElts);
     }
 
-    /// getScalarType - If this is a vector type, return the element type,
-    /// otherwise return this.
+    /// If this is a vector, return the element type, otherwise return this.
     MVT getScalarType() const {
       return isVector() ? getVectorElementType() : *this;
     }
@@ -516,14 +513,14 @@ class MVT {
       return getScalarType().getSizeInBits();
     }
 
-    /// getStoreSize - Return the number of bytes overwritten by a store
-    /// of the specified value type.
+    /// Return the number of bytes overwritten by a store of the specified value
+    /// type.
     unsigned getStoreSize() const {
       return (getSizeInBits() + 7) / 8;
     }
 
-    /// getStoreSizeInBits - Return the number of bits overwritten by a store
-    /// of the specified value type.
+    /// Return the number of bits overwritten by a store of the specified value
+    /// type.
     unsigned getStoreSizeInBits() const {
       return getStoreSize() * 8;
     }
diff --git a/include/llvm/CodeGen/PBQP/Solution.h b/include/llvm/CodeGen/PBQP/Solution.h
index bd74805a239728356c10e2f049cc1d4b10d7b57b..d96b5eac45200de1f7e93e3f7d5de9c53b0f0706 100644
--- a/include/llvm/CodeGen/PBQP/Solution.h
+++ b/include/llvm/CodeGen/PBQP/Solution.h
@@ -1,4 +1,4 @@
-//===-- Solution.h ------- PBQP Solution ------------------------*- C++ -*-===//
+//===- Solution.h - PBQP Solution -------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -14,8 +14,8 @@
 #ifndef LLVM_CODEGEN_PBQP_SOLUTION_H
 #define LLVM_CODEGEN_PBQP_SOLUTION_H
 
-#include "Graph.h"
-#include "Math.h"
+#include "llvm/CodeGen/PBQP/Graph.h"
+#include <cassert>
 #include <map>
 
 namespace llvm {
@@ -26,17 +26,17 @@ namespace PBQP {
   /// To get the selection for each node in the problem use the getSelection method.
   class Solution {
   private:
-
     typedef std::map<GraphBase::NodeId, unsigned> SelectionsMap;
     SelectionsMap selections;
 
-    unsigned r0Reductions, r1Reductions, r2Reductions, rNReductions;
+    unsigned r0Reductions = 0;
+    unsigned r1Reductions = 0;
+    unsigned r2Reductions = 0;
+    unsigned rNReductions = 0;
 
   public:
-
     /// \brief Initialise an empty solution.
-    Solution()
-      : r0Reductions(0), r1Reductions(0), r2Reductions(0), rNReductions(0) {}
+    Solution() = default;
 
     /// \brief Set the selection for a given node.
     /// @param nodeId Node id.
@@ -53,10 +53,9 @@ namespace PBQP {
       assert(sItr != selections.end() && "No selection for node.");
       return sItr->second;
     }
-
   };
 
-} // namespace PBQP
-} // namespace llvm
+} // end namespace PBQP
+} // end namespace llvm
 
 #endif // LLVM_CODEGEN_PBQP_SOLUTION_H
diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h
index 2fff94c03f8896d0d68c7d6b69b688856c17cafd..42299b5294108362e8e506260b7e82ef59e0df78 100644
--- a/include/llvm/CodeGen/Passes.h
+++ b/include/llvm/CodeGen/Passes.h
@@ -81,6 +81,9 @@ namespace llvm {
 /// MachineDominanaceFrontier - This pass is a machine dominators analysis pass.
   extern char &MachineDominanceFrontierID;
 
+  /// MachineRegionInfo - This pass computes SESE regions for machine functions.
+  extern char &MachineRegionInfoPassID;
+
   /// EdgeBundles analysis - Bundle machine CFG edges.
   extern char &EdgeBundlesID;
 
@@ -286,6 +289,9 @@ namespace llvm {
   /// the target platform.
   extern char &XRayInstrumentationID;
 
+  /// This pass inserts FEntry calls.
+  extern char &FEntryInserterID;
+
   /// \brief This pass implements the "patchable-function" attribute.
   extern char &PatchableFunctionID;
 
@@ -320,14 +326,6 @@ namespace llvm {
   /// ExpandISelPseudos - This pass expands pseudo-instructions.
   extern char &ExpandISelPseudosID;
 
-  /// createExecutionDependencyFixPass - This pass fixes execution time
-  /// problems with dependent instructions, such as switching execution
-  /// domains to match.
-  ///
-  /// The pass will examine instructions using and defining registers in RC.
-  ///
-  FunctionPass *createExecutionDependencyFixPass(const TargetRegisterClass *RC);
-
   /// UnpackMachineBundles - This pass unpack machine instruction bundles.
   extern char &UnpackMachineBundlesID;
 
@@ -399,6 +397,14 @@ namespace llvm {
 
   /// This pass frees the memory occupied by the MachineFunction.
   FunctionPass *createFreeMachineFunctionPass();
+
+  /// This pass combines basic blocks guarded by the same branch.
+  extern char &BranchCoalescingID;
+
+  /// This pass performs outlining on machine instructions directly before
+  /// printing assembly.
+  ModulePass *createMachineOutlinerPass();
+
 } // End llvm namespace
 
 /// Target machine pass initializer for passes with dependencies. Use with
@@ -415,7 +421,7 @@ namespace llvm {
   Registry.registerPass(*PI, true);                                            \
   return PI;                                                                   \
   }                                                                            \
-  LLVM_DEFINE_ONCE_FLAG(Initialize##passName##PassFlag);                       \
+  static llvm::once_flag Initialize##passName##PassFlag;                       \
   void llvm::initialize##passName##Pass(PassRegistry &Registry) {              \
     llvm::call_once(Initialize##passName##PassFlag,                            \
                     initialize##passName##PassOnce, std::ref(Registry));       \
diff --git a/include/llvm/CodeGen/RegAllocPBQP.h b/include/llvm/CodeGen/RegAllocPBQP.h
index d458bd95f68067aa243f85d97e09a909dad2fa43..8872a5dc54a1d66686e672a0499cce2c2325eee9 100644
--- a/include/llvm/CodeGen/RegAllocPBQP.h
+++ b/include/llvm/CodeGen/RegAllocPBQP.h
@@ -1,4 +1,4 @@
-//===-- RegAllocPBQP.h ------------------------------------------*- C++ -*-===//
+//===- RegAllocPBQP.h -------------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -16,15 +16,28 @@
 #ifndef LLVM_CODEGEN_REGALLOCPBQP_H
 #define LLVM_CODEGEN_REGALLOCPBQP_H
 
-#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Hashing.h"
 #include "llvm/CodeGen/PBQP/CostAllocator.h"
+#include "llvm/CodeGen/PBQP/Graph.h"
+#include "llvm/CodeGen/PBQP/Math.h"
 #include "llvm/CodeGen/PBQP/ReductionRules.h"
-#include "llvm/CodeGen/PBQPRAConstraint.h"
+#include "llvm/CodeGen/PBQP/Solution.h"
 #include "llvm/Support/ErrorHandling.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <limits>
+#include <memory>
 #include <set>
+#include <vector>
 
 namespace llvm {
 
+class FunctionPass;
+class LiveIntervals;
+class MachineBlockFrequencyInfo;
+class MachineFunction;
 class raw_ostream;
 
 namespace PBQP {
@@ -37,15 +50,10 @@ inline unsigned getSpillOptionIdx() { return 0; }
 ///
 /// Keeps track of the number of infinities in each row and column.
 class MatrixMetadata {
-private:
-  MatrixMetadata(const MatrixMetadata&);
-  void operator=(const MatrixMetadata&);
 public:
   MatrixMetadata(const Matrix& M)
-    : WorstRow(0), WorstCol(0),
-      UnsafeRows(new bool[M.getRows() - 1]()),
+    : UnsafeRows(new bool[M.getRows() - 1]()),
       UnsafeCols(new bool[M.getCols() - 1]()) {
-
     unsigned* ColCounts = new unsigned[M.getCols() - 1]();
 
     for (unsigned i = 1; i < M.getRows(); ++i) {
@@ -66,13 +74,17 @@ public:
     delete[] ColCounts;
   }
 
+  MatrixMetadata(const MatrixMetadata &) = delete;
+  MatrixMetadata &operator=(const MatrixMetadata &) = delete;
+
   unsigned getWorstRow() const { return WorstRow; }
   unsigned getWorstCol() const { return WorstCol; }
   const bool* getUnsafeRows() const { return UnsafeRows.get(); }
   const bool* getUnsafeCols() const { return UnsafeCols.get(); }
 
 private:
-  unsigned WorstRow, WorstCol;
+  unsigned WorstRow = 0;
+  unsigned WorstCol = 0;
   std::unique_ptr<bool[]> UnsafeRows;
   std::unique_ptr<bool[]> UnsafeCols;
 };
@@ -80,17 +92,16 @@ private:
 /// \brief Holds a vector of the allowed physical regs for a vreg.
 class AllowedRegVector {
   friend hash_code hash_value(const AllowedRegVector &);
-public:
 
-  AllowedRegVector() : NumOpts(0), Opts(nullptr) {}
+public:
+  AllowedRegVector() = default;
+  AllowedRegVector(AllowedRegVector &&) = default;
 
   AllowedRegVector(const std::vector<unsigned> &OptVec)
     : NumOpts(OptVec.size()), Opts(new unsigned[NumOpts]) {
     std::copy(OptVec.begin(), OptVec.end(), Opts.get());
   }
 
-  AllowedRegVector(AllowedRegVector &&) = default;
-
   unsigned size() const { return NumOpts; }
   unsigned operator[](size_t I) const { return Opts[I]; }
 
@@ -105,7 +116,7 @@ public:
   }
 
 private:
-  unsigned NumOpts;
+  unsigned NumOpts = 0;
   std::unique_ptr<unsigned[]> Opts;
 };
 
@@ -120,8 +131,8 @@ inline hash_code hash_value(const AllowedRegVector &OptRegs) {
 class GraphMetadata {
 private:
   typedef ValuePool<AllowedRegVector> AllowedRegVecPool;
-public:
 
+public:
   typedef AllowedRegVecPool::PoolRef AllowedRegVecRef;
 
   GraphMetadata(MachineFunction &MF,
@@ -168,13 +179,7 @@ public:
     OptimallyReducible
   } ReductionState;
 
-  NodeMetadata()
-    : RS(Unprocessed), NumOpts(0), DeniedOpts(0), OptUnsafeEdges(nullptr),
-      VReg(0)
-#ifndef NDEBUG
-      , everConservativelyAllocatable(false)
-#endif
-      {}
+  NodeMetadata() = default;
 
   NodeMetadata(const NodeMetadata &Other)
     : RS(Other.RS), NumOpts(Other.NumOpts), DeniedOpts(Other.DeniedOpts),
@@ -190,9 +195,8 @@ public:
     }
   }
 
-  NodeMetadata(NodeMetadata &&Other) = default;
-
-  NodeMetadata& operator=(NodeMetadata &&Other) = default;
+  NodeMetadata(NodeMetadata &&) = default;
+  NodeMetadata& operator=(NodeMetadata &&) = default;
 
   void setVReg(unsigned VReg) { this->VReg = VReg; }
   unsigned getVReg() const { return VReg; }
@@ -249,21 +253,22 @@ public:
 #endif
 
 private:
-  ReductionState RS;
-  unsigned NumOpts;
-  unsigned DeniedOpts;
+  ReductionState RS = Unprocessed;
+  unsigned NumOpts = 0;
+  unsigned DeniedOpts = 0;
   std::unique_ptr<unsigned[]> OptUnsafeEdges;
-  unsigned VReg;
+  unsigned VReg = 0;
   GraphMetadata::AllowedRegVecRef AllowedRegs;
 
 #ifndef NDEBUG
-  bool everConservativelyAllocatable;
+  bool everConservativelyAllocatable = false;
 #endif
 };
 
 class RegAllocSolverImpl {
 private:
   typedef MDMatrix<MatrixMetadata> RAMatrix;
+
 public:
   typedef PBQP::Vector RawVector;
   typedef PBQP::Matrix RawMatrix;
@@ -296,6 +301,7 @@ public:
            "PBQP Graph should not contain single or zero-option nodes");
     G.getNodeMetadata(NId).setup(G.getNodeCosts(NId));
   }
+
   void handleRemoveNode(NodeId NId) {}
   void handleSetNodeCosts(NodeId NId, const Vector& newCosts) {}
 
@@ -342,7 +348,6 @@ public:
   }
 
 private:
-
   void promote(NodeId NId, NodeMetadata& NMd) {
     if (G.getNodeDegree(NId) == 3) {
       // This node is becoming optimally reducible.
@@ -474,6 +479,7 @@ private:
   class SpillCostComparator {
   public:
     SpillCostComparator(const Graph& G) : G(G) {}
+
     bool operator()(NodeId N1Id, NodeId N2Id) {
       PBQPNum N1SC = G.getNodeCosts(N1Id)[0];
       PBQPNum N2SC = G.getNodeCosts(N2Id)[0];
@@ -481,6 +487,7 @@ private:
         return G.getNodeDegree(N1Id) < G.getNodeDegree(N2Id);
       return N1SC < N2SC;
     }
+
   private:
     const Graph& G;
   };
@@ -495,6 +502,7 @@ private:
 class PBQPRAGraph : public PBQP::Graph<RegAllocSolverImpl> {
 private:
   typedef PBQP::Graph<RegAllocSolverImpl> BaseT;
+
 public:
   PBQPRAGraph(GraphMetadata Metadata) : BaseT(std::move(Metadata)) {}
 
@@ -517,13 +525,13 @@ inline Solution solve(PBQPRAGraph& G) {
   return RegAllocSolver.solve();
 }
 
-} // namespace RegAlloc
-} // namespace PBQP
+} // end namespace RegAlloc
+} // end namespace PBQP
 
 /// @brief Create a PBQP register allocator instance.
 FunctionPass *
 createPBQPRegisterAllocator(char *customPassID = nullptr);
 
-} // namespace llvm
+} // end namespace llvm
 
-#endif /* LLVM_CODEGEN_REGALLOCPBQP_H */
+#endif // LLVM_CODEGEN_REGALLOCPBQP_H
diff --git a/include/llvm/CodeGen/RegisterClassInfo.h b/include/llvm/CodeGen/RegisterClassInfo.h
index d784dfbda7ecbc5c36b7c722b4db6c6085d13e46..355c9f9b2f1e6777070e99ce01f9173f85bf72bf 100644
--- a/include/llvm/CodeGen/RegisterClassInfo.h
+++ b/include/llvm/CodeGen/RegisterClassInfo.h
@@ -1,4 +1,4 @@
-//===-- RegisterClassInfo.h - Dynamic Register Class Info -*- C++ -*-------===//
+//===- RegisterClassInfo.h - Dynamic Register Class Info --------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -19,22 +19,25 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <memory>
 
 namespace llvm {
 
 class RegisterClassInfo {
   struct RCInfo {
-    unsigned Tag;
-    unsigned NumRegs;
-    bool ProperSubClass;
-    uint8_t MinCost;
-    uint16_t LastCostChange;
+    unsigned Tag = 0;
+    unsigned NumRegs = 0;
+    bool ProperSubClass = false;
+    uint8_t MinCost = 0;
+    uint16_t LastCostChange = 0;
     std::unique_ptr<MCPhysReg[]> Order;
 
-    RCInfo()
-      : Tag(0), NumRegs(0), ProperSubClass(false), MinCost(0),
-        LastCostChange(0) {}
+    RCInfo() = default;
 
     operator ArrayRef<MCPhysReg>() const {
       return makeArrayRef(Order.get(), NumRegs);
@@ -46,17 +49,18 @@ class RegisterClassInfo {
 
   // Tag changes whenever cached information needs to be recomputed. An RCInfo
   // entry is valid when its tag matches.
-  unsigned Tag;
+  unsigned Tag = 0;
 
-  const MachineFunction *MF;
-  const TargetRegisterInfo *TRI;
+  const MachineFunction *MF = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
 
   // Callee saved registers of last MF. Assumed to be valid until the next
   // runOnFunction() call.
-  const MCPhysReg *CalleeSaved;
+  // Used only to determine if an update was made to CalleeSavedAliases.
+  const MCPhysReg *CalleeSavedRegs = nullptr;
 
-  // Map register number to CalleeSaved index + 1;
-  SmallVector<uint8_t, 4> CSRNum;
+  // Map register alias to the callee saved Register.
+  SmallVector<MCPhysReg, 4> CalleeSavedAliases;
 
   // Reserved registers in the current MF.
   BitVector Reserved;
@@ -105,11 +109,11 @@ public:
   }
 
   /// getLastCalleeSavedAlias - Returns the last callee saved register that
-  /// overlaps PhysReg, or 0 if Reg doesn't overlap a CSR.
+  /// overlaps PhysReg, or 0 if PhysReg doesn't overlap a callee saved register.
   unsigned getLastCalleeSavedAlias(unsigned PhysReg) const {
     assert(TargetRegisterInfo::isPhysicalRegister(PhysReg));
-    if (unsigned N = CSRNum[PhysReg])
-      return CalleeSaved[N-1];
+    if (PhysReg < CalleeSavedAliases.size())
+      return CalleeSavedAliases[PhysReg];
     return 0;
   }
 
@@ -140,6 +144,7 @@ public:
 protected:
   unsigned computePSetLimit(unsigned Idx) const;
 };
+
 } // end namespace llvm
 
-#endif
+#endif // LLVM_CODEGEN_REGISTERCLASSINFO_H
diff --git a/include/llvm/CodeGen/RegisterPressure.h b/include/llvm/CodeGen/RegisterPressure.h
index 82440069959b6d5ae46644aefba9dc81543764f7..a3ea41d5236e3be47d39ba9d2d82b26617552cf6 100644
--- a/include/llvm/CodeGen/RegisterPressure.h
+++ b/include/llvm/CodeGen/RegisterPressure.h
@@ -1,4 +1,4 @@
-//===-- RegisterPressure.h - Dynamic Register Pressure -*- C++ -*-------===//
+//===- RegisterPressure.h - Dynamic Register Pressure -----------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,16 +15,25 @@
 #ifndef LLVM_CODEGEN_REGISTERPRESSURE_H
 #define LLVM_CODEGEN_REGISTERPRESSURE_H
 
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/SparseSet.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/MC/LaneBitmask.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <limits>
+#include <vector>
 
 namespace llvm {
 
 class LiveIntervals;
-class LiveRange;
-class RegisterClassInfo;
 class MachineInstr;
+class RegisterClassInfo;
 
 struct RegisterMaskPair {
   unsigned RegUnit; ///< Virtual register or register unit.
@@ -91,12 +100,13 @@ struct RegionPressure : RegisterPressure {
 /// higher level assert that pressure is consistent within a region. We also
 /// effectively ignore dead defs which don't affect heuristics much.
 class PressureChange {
-  uint16_t PSetID; // ID+1. 0=Invalid.
-  int16_t  UnitInc;
+  uint16_t PSetID = 0; // ID+1. 0=Invalid.
+  int16_t UnitInc = 0;
+
 public:
-  PressureChange(): PSetID(0), UnitInc(0) {}
-  PressureChange(unsigned id): PSetID(id+1), UnitInc(0) {
-    assert(id < UINT16_MAX && "PSetID overflow.");
+  PressureChange() = default;
+  PressureChange(unsigned id): PSetID(id + 1) {
+    assert(id < std::numeric_limits<uint16_t>::max() && "PSetID overflow.");
   }
 
   bool isValid() const { return PSetID > 0; }
@@ -105,8 +115,11 @@ public:
     assert(isValid() && "invalid PressureChange");
     return PSetID - 1;
   }
+
   // If PSetID is invalid, return UINT16_MAX to give it lowest priority.
-  unsigned getPSetOrMax() const { return (PSetID - 1) & UINT16_MAX; }
+  unsigned getPSetOrMax() const {
+    return (PSetID - 1) & std::numeric_limits<uint16_t>::max();
+  }
 
   int getUnitInc() const { return UnitInc; }
 
@@ -182,11 +195,12 @@ public:
 
 /// Array of PressureDiffs.
 class PressureDiffs {
-  PressureDiff *PDiffArray;
-  unsigned Size;
-  unsigned Max;
+  PressureDiff *PDiffArray = nullptr;
+  unsigned Size = 0;
+  unsigned Max = 0;
+
 public:
-  PressureDiffs(): PDiffArray(nullptr), Size(0), Max(0) {}
+  PressureDiffs() = default;
   ~PressureDiffs() { free(PDiffArray); }
 
   void clear() { Size = 0; }
@@ -200,6 +214,7 @@ public:
   const PressureDiff &operator[](unsigned Idx) const {
     return const_cast<PressureDiffs*>(this)->operator[](Idx);
   }
+
   /// \brief Record pressure difference induced by the given operand list to
   /// node with index \p Idx.
   void addInstruction(unsigned Idx, const RegisterOperands &RegOpers,
@@ -225,7 +240,7 @@ struct RegPressureDelta {
   PressureChange CriticalMax;
   PressureChange CurrentMax;
 
-  RegPressureDelta() {}
+  RegPressureDelta() = default;
 
   bool operator==(const RegPressureDelta &RHS) const {
     return Excess == RHS.Excess && CriticalMax == RHS.CriticalMax
@@ -264,6 +279,7 @@ private:
     assert(Reg < NumRegUnits);
     return Reg;
   }
+
   unsigned getRegFromSparseIndex(unsigned SparseIndex) const {
     if (SparseIndex >= NumRegUnits)
       return TargetRegisterInfo::index2VirtReg(SparseIndex-NumRegUnits);
@@ -338,14 +354,14 @@ public:
 /// tracking. Changing direction has the side effect of closing region, and
 /// traversing past TopIdx or BottomIdx reopens it.
 class RegPressureTracker {
-  const MachineFunction     *MF;
-  const TargetRegisterInfo  *TRI;
-  const RegisterClassInfo   *RCI;
+  const MachineFunction *MF = nullptr;
+  const TargetRegisterInfo *TRI = nullptr;
+  const RegisterClassInfo *RCI = nullptr;
   const MachineRegisterInfo *MRI;
-  const LiveIntervals       *LIS;
+  const LiveIntervals *LIS = nullptr;
 
   /// We currently only allow pressure tracking within a block.
-  const MachineBasicBlock *MBB;
+  const MachineBasicBlock *MBB = nullptr;
 
   /// Track the max pressure within the region traversed so far.
   RegisterPressure &P;
@@ -355,10 +371,10 @@ class RegPressureTracker {
   bool RequireIntervals;
 
   /// True if UntiedDefs will be populated.
-  bool TrackUntiedDefs;
+  bool TrackUntiedDefs = false;
 
   /// True if lanemasks should be tracked.
-  bool TrackLaneMasks;
+  bool TrackLaneMasks = false;
 
   /// Register pressure corresponds to liveness before this instruction
   /// iterator. It may point to the end of the block or a DebugValue rather than
@@ -377,13 +393,8 @@ class RegPressureTracker {
   std::vector<unsigned> LiveThruPressure;
 
 public:
-  RegPressureTracker(IntervalPressure &rp) :
-    MF(nullptr), TRI(nullptr), RCI(nullptr), LIS(nullptr), MBB(nullptr), P(rp),
-    RequireIntervals(true), TrackUntiedDefs(false), TrackLaneMasks(false) {}
-
-  RegPressureTracker(RegionPressure &rp) :
-    MF(nullptr), TRI(nullptr), RCI(nullptr), LIS(nullptr), MBB(nullptr), P(rp),
-    RequireIntervals(false), TrackUntiedDefs(false), TrackLaneMasks(false) {}
+  RegPressureTracker(IntervalPressure &rp) : P(rp), RequireIntervals(true) {}
+  RegPressureTracker(RegionPressure &rp) : P(rp), RequireIntervals(false) {}
 
   void reset();
 
@@ -555,6 +566,7 @@ protected:
 
 void dumpRegSetPressure(ArrayRef<unsigned> SetPressure,
                         const TargetRegisterInfo *TRI);
+
 } // end namespace llvm
 
-#endif
+#endif // LLVM_CODEGEN_REGISTERPRESSURE_H
diff --git a/include/llvm/CodeGen/RegisterScavenging.h b/include/llvm/CodeGen/RegisterScavenging.h
index 53859968db686ae6c6ede42b3b84b5b4b354811b..1f939e72e1396bf9f473ecc08be019fa47b14008 100644
--- a/include/llvm/CodeGen/RegisterScavenging.h
+++ b/include/llvm/CodeGen/RegisterScavenging.h
@@ -1,4 +1,4 @@
-//===-- RegisterScavenging.h - Machine register scavenging ------*- C++ -*-===//
+//===- RegisterScavenging.h - Machine register scavenging -------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -19,41 +19,43 @@
 #define LLVM_CODEGEN_REGISTERSCAVENGING_H
 
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/LiveRegUnits.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/MC/LaneBitmask.h"
 
 namespace llvm {
 
-class MachineRegisterInfo;
-class TargetRegisterInfo;
+class MachineInstr;
 class TargetInstrInfo;
 class TargetRegisterClass;
+class TargetRegisterInfo;
 
 class RegScavenger {
   const TargetRegisterInfo *TRI;
   const TargetInstrInfo *TII;
   MachineRegisterInfo* MRI;
-  MachineBasicBlock *MBB;
+  MachineBasicBlock *MBB = nullptr;
   MachineBasicBlock::iterator MBBI;
-  unsigned NumRegUnits;
+  unsigned NumRegUnits = 0;
 
   /// True if RegScavenger is currently tracking the liveness of registers.
-  bool Tracking;
+  bool Tracking = false;
 
   /// Information on scavenged registers (held in a spill slot).
   struct ScavengedInfo {
-    ScavengedInfo(int FI = -1) : FrameIndex(FI), Reg(0), Restore(nullptr) {}
+    ScavengedInfo(int FI = -1) : FrameIndex(FI) {}
 
     /// A spill slot used for scavenging a register post register allocation.
     int FrameIndex;
 
     /// If non-zero, the specific register is currently being
     /// scavenged. That is, it is spilled to this scavenging stack slot.
-    unsigned Reg;
+    unsigned Reg = 0;
 
     /// The instruction that restores the scavenged register from stack.
-    const MachineInstr *Restore;
+    const MachineInstr *Restore = nullptr;
   };
 
   /// A vector of information on scavenged registers.
@@ -67,8 +69,7 @@ class RegScavenger {
   BitVector TmpRegUnits;
 
 public:
-  RegScavenger()
-    : MBB(nullptr), NumRegUnits(0), Tracking(false) {}
+  RegScavenger() = default;
 
   /// Start tracking liveness from the begin of basic block \p MBB.
   void enterBasicBlock(MachineBasicBlock &MBB);
@@ -163,6 +164,7 @@ public:
 
   /// Tell the scavenger a register is used.
   void setRegUsed(unsigned Reg, LaneBitmask LaneMask = LaneBitmask::getAll());
+
 private:
   /// Returns true if a register is reserved. It is never "unused".
   bool isReserved(unsigned Reg) const { return MRI->isReserved(Reg); }
@@ -202,6 +204,6 @@ private:
   void setLiveInsUsed(const MachineBasicBlock &MBB);
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_CODEGEN_REGISTERSCAVENGING_H
diff --git a/include/llvm/CodeGen/ScheduleDAG.h b/include/llvm/CodeGen/ScheduleDAG.h
index ed4e0bc8a4a1aeed567845c927bceba3cc19e9e3..99afd8c5c9ab9ba9010639a0dd986352d8885ec3 100644
--- a/include/llvm/CodeGen/ScheduleDAG.h
+++ b/include/llvm/CodeGen/ScheduleDAG.h
@@ -1,4 +1,4 @@
-//===------- llvm/CodeGen/ScheduleDAG.h - Common Base Class------*- C++ -*-===//
+//===- llvm/CodeGen/ScheduleDAG.h - Common Base Class -----------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,9 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements the ScheduleDAG class, which is used as the common
-// base class for instruction schedulers. This encapsulates the scheduling DAG,
-// which is shared between SelectionDAG and MachineInstr scheduling.
+/// \file Implements the ScheduleDAG class, which is used as the common base
+/// class for instruction schedulers. This encapsulates the scheduling DAG,
+/// which is shared between SelectionDAG and MachineInstr scheduling.
 //
 //===----------------------------------------------------------------------===//
 
@@ -18,33 +18,38 @@
 
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/iterator.h"
 #include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetLowering.h"
+#include <cassert>
+#include <cstddef>
+#include <iterator>
+#include <string>
+#include <vector>
 
 namespace llvm {
-  class SUnit;
-  class MachineConstantPool;
-  class MachineFunction;
-  class MachineRegisterInfo;
-  class MachineInstr;
-  struct MCSchedClassDesc;
-  class TargetRegisterInfo;
-  class ScheduleDAG;
-  class SDNode;
-  class TargetInstrInfo;
-  class MCInstrDesc;
-  class TargetMachine;
-  class TargetRegisterClass;
-  template<class Graph> class GraphWriter;
-
-  /// SDep - Scheduling dependency. This represents one direction of an
-  /// edge in the scheduling DAG.
+
+template<class Graph> class GraphWriter;
+class MachineFunction;
+class MachineRegisterInfo;
+class MCInstrDesc;
+struct MCSchedClassDesc;
+class ScheduleDAG;
+class SDNode;
+class SUnit;
+class TargetInstrInfo;
+class TargetMachine;
+class TargetRegisterClass;
+class TargetRegisterInfo;
+
+  /// Scheduling dependency. This represents one direction of an edge in the
+  /// scheduling DAG.
   class SDep {
   public:
-    /// Kind - These are the different kinds of scheduling dependencies.
+    /// These are the different kinds of scheduling dependencies.
     enum Kind {
       Data,        ///< Regular data dependence (aka true-dependence).
       Anti,        ///< A register anti-dependedence (aka WAR).
@@ -71,33 +76,32 @@ namespace llvm {
     };
 
   private:
-    /// Dep - A pointer to the depending/depended-on SUnit, and an enum
+    /// \brief A pointer to the depending/depended-on SUnit, and an enum
     /// indicating the kind of the dependency.
     PointerIntPair<SUnit *, 2, Kind> Dep;
 
-    /// Contents - A union discriminated by the dependence kind.
+    /// A union discriminated by the dependence kind.
     union {
-      /// Reg - For Data, Anti, and Output dependencies, the associated
-      /// register. For Data dependencies that don't currently have a register
-      /// assigned, this is set to zero.
+      /// For Data, Anti, and Output dependencies, the associated register. For
+      /// Data dependencies that don't currently have a register assigned, this
+      /// is set to zero.
       unsigned Reg;
 
-      /// Order - Additional information about Order dependencies.
+      /// Additional information about Order dependencies.
       unsigned OrdKind; // enum OrderKind
     } Contents;
 
-    /// Latency - The time associated with this edge. Often this is just
-    /// the value of the Latency field of the predecessor, however advanced
-    /// models may provide additional information about specific edges.
+    /// The time associated with this edge. Often this is just the value of the
+    /// Latency field of the predecessor, however advanced models may provide
+    /// additional information about specific edges.
     unsigned Latency;
 
   public:
-    /// SDep - Construct a null SDep. This is only for use by container
-    /// classes which require default constructors. SUnits may not
-    /// have null SDep edges.
+    /// Constructs a null SDep. This is only for use by container classes which
+    /// require default constructors. SUnits may not have null SDep edges.
     SDep() : Dep(nullptr, Data) {}
 
-    /// SDep - Construct an SDep with the specified values.
+    /// Constructs an SDep with the specified values.
     SDep(SUnit *S, Kind kind, unsigned Reg)
       : Dep(S, kind), Contents() {
       switch (kind) {
@@ -116,12 +120,13 @@ namespace llvm {
         break;
       }
     }
+
     SDep(SUnit *S, OrderKind kind)
       : Dep(S, Order), Contents(), Latency(0) {
       Contents.OrdKind = kind;
     }
 
-    /// Return true if the specified SDep is equivalent except for latency.
+    /// Returns true if the specified SDep is equivalent except for latency.
     bool overlaps(const SDep &Other) const;
 
     bool operator==(const SDep &Other) const {
@@ -132,100 +137,95 @@ namespace llvm {
       return !operator==(Other);
     }
 
-    /// getLatency - Return the latency value for this edge, which roughly
-    /// means the minimum number of cycles that must elapse between the
-    /// predecessor and the successor, given that they have this edge
-    /// between them.
+    /// \brief Returns the latency value for this edge, which roughly means the
+    /// minimum number of cycles that must elapse between the predecessor and
+    /// the successor, given that they have this edge between them.
     unsigned getLatency() const {
       return Latency;
     }
 
-    /// setLatency - Set the latency for this edge.
+    /// Sets the latency for this edge.
     void setLatency(unsigned Lat) {
       Latency = Lat;
     }
 
-    //// getSUnit - Return the SUnit to which this edge points.
+    /// Returns the SUnit to which this edge points.
     SUnit *getSUnit() const;
 
-    //// setSUnit - Assign the SUnit to which this edge points.
+    /// Assigns the SUnit to which this edge points.
     void setSUnit(SUnit *SU);
 
-    /// getKind - Return an enum value representing the kind of the dependence.
+    /// Returns an enum value representing the kind of the dependence.
     Kind getKind() const;
 
-    /// isCtrl - Shorthand for getKind() != SDep::Data.
+    /// Shorthand for getKind() != SDep::Data.
     bool isCtrl() const {
       return getKind() != Data;
     }
 
-    /// isNormalMemory - Test if this is an Order dependence between two
-    /// memory accesses where both sides of the dependence access memory
-    /// in non-volatile and fully modeled ways.
+    /// \brief Tests if this is an Order dependence between two memory accesses
+    /// where both sides of the dependence access memory in non-volatile and
+    /// fully modeled ways.
     bool isNormalMemory() const {
       return getKind() == Order && (Contents.OrdKind == MayAliasMem
                                     || Contents.OrdKind == MustAliasMem);
     }
 
-    /// isBarrier - Test if this is an Order dependence that is marked
-    /// as a barrier.
+    /// Tests if this is an Order dependence that is marked as a barrier.
     bool isBarrier() const {
       return getKind() == Order && Contents.OrdKind == Barrier;
     }
 
-    /// isNormalMemoryOrBarrier - Test if this is could be any kind of memory
-    /// dependence.
+    /// Tests if this could be any kind of memory dependence.
     bool isNormalMemoryOrBarrier() const {
       return (isNormalMemory() || isBarrier());
     }
 
-    /// isMustAlias - Test if this is an Order dependence that is marked
-    /// as "must alias", meaning that the SUnits at either end of the edge
-    /// have a memory dependence on a known memory location.
+    /// \brief Tests if this is an Order dependence that is marked as
+    /// "must alias", meaning that the SUnits at either end of the edge have a
+    /// memory dependence on a known memory location.
     bool isMustAlias() const {
       return getKind() == Order && Contents.OrdKind == MustAliasMem;
     }
 
-    /// isWeak - Test if this a weak dependence. Weak dependencies are
-    /// considered DAG edges for height computation and other heuristics, but do
-    /// not force ordering. Breaking a weak edge may require the scheduler to
-    /// compensate, for example by inserting a copy.
+    /// Tests if this is a weak dependence. Weak dependencies are considered DAG
+    /// edges for height computation and other heuristics, but do not force
+    /// ordering. Breaking a weak edge may require the scheduler to compensate,
+    /// for example by inserting a copy.
     bool isWeak() const {
       return getKind() == Order && Contents.OrdKind >= Weak;
     }
 
-    /// isArtificial - Test if this is an Order dependence that is marked
-    /// as "artificial", meaning it isn't necessary for correctness.
+    /// \brief Tests if this is an Order dependence that is marked as
+    /// "artificial", meaning it isn't necessary for correctness.
     bool isArtificial() const {
       return getKind() == Order && Contents.OrdKind == Artificial;
     }
 
-    /// isCluster - Test if this is an Order dependence that is marked
-    /// as "cluster", meaning it is artificial and wants to be adjacent.
+    /// \brief Tests if this is an Order dependence that is marked as "cluster",
+    /// meaning it is artificial and wants to be adjacent.
     bool isCluster() const {
       return getKind() == Order && Contents.OrdKind == Cluster;
     }
 
-    /// isAssignedRegDep - Test if this is a Data dependence that is
-    /// associated with a register.
+    /// Tests if this is a Data dependence that is associated with a register.
     bool isAssignedRegDep() const {
       return getKind() == Data && Contents.Reg != 0;
     }
 
-    /// getReg - Return the register associated with this edge. This is
-    /// only valid on Data, Anti, and Output edges. On Data edges, this
-    /// value may be zero, meaning there is no associated register.
+    /// Returns the register associated with this edge. This is only valid on
+    /// Data, Anti, and Output edges. On Data edges, this value may be zero,
+    /// meaning there is no associated register.
     unsigned getReg() const {
       assert((getKind() == Data || getKind() == Anti || getKind() == Output) &&
              "getReg called on non-register dependence edge!");
       return Contents.Reg;
     }
 
-    /// setReg - Assign the associated register for this edge. This is
-    /// only valid on Data, Anti, and Output edges. On Anti and Output
-    /// edges, this value must not be zero. On Data edges, the value may
-    /// be zero, which would mean that no specific register is associated
-    /// with this edge.
+    /// Assigns the associated register for this edge. This is only valid on
+    /// Data, Anti, and Output edges. On Anti and Output edges, this value must
+    /// not be zero. On Data edges, the value may be zero, which would mean that
+    /// no specific register is associated with this edge.
     void setReg(unsigned Reg) {
       assert((getKind() == Data || getKind() == Anti || getKind() == Output) &&
              "setReg called on non-register dependence edge!");
@@ -240,115 +240,101 @@ namespace llvm {
   template <>
   struct isPodLike<SDep> { static const bool value = true; };
 
-  /// SUnit - Scheduling unit. This is a node in the scheduling DAG.
+  /// Scheduling unit. This is a node in the scheduling DAG.
   class SUnit {
   private:
     enum : unsigned { BoundaryID = ~0u };
 
-    SDNode *Node;                       // Representative node.
-    MachineInstr *Instr;                // Alternatively, a MachineInstr.
+    SDNode *Node = nullptr;        ///< Representative node.
+    MachineInstr *Instr = nullptr; ///< Alternatively, a MachineInstr.
+
   public:
-    SUnit *OrigNode;                    // If not this, the node from which
-                                        // this node was cloned.
-                                        // (SD scheduling only)
+    SUnit *OrigNode = nullptr; ///< If not this, the node from which this node 
+                               /// was cloned. (SD scheduling only)
 
-    const MCSchedClassDesc *SchedClass; // NULL or resolved SchedClass.
+    const MCSchedClassDesc *SchedClass =
+        nullptr; ///< nullptr or resolved SchedClass.
 
-    // Preds/Succs - The SUnits before/after us in the graph.
-    SmallVector<SDep, 4> Preds;  // All sunit predecessors.
-    SmallVector<SDep, 4> Succs;  // All sunit successors.
+    SmallVector<SDep, 4> Preds;  ///< All sunit predecessors.
+    SmallVector<SDep, 4> Succs;  ///< All sunit successors.
 
     typedef SmallVectorImpl<SDep>::iterator pred_iterator;
     typedef SmallVectorImpl<SDep>::iterator succ_iterator;
     typedef SmallVectorImpl<SDep>::const_iterator const_pred_iterator;
     typedef SmallVectorImpl<SDep>::const_iterator const_succ_iterator;
 
-    unsigned NodeNum;                   // Entry # of node in the node vector.
-    unsigned NodeQueueId;               // Queue id of node.
-    unsigned NumPreds;                  // # of SDep::Data preds.
-    unsigned NumSuccs;                  // # of SDep::Data sucss.
-    unsigned NumPredsLeft;              // # of preds not scheduled.
-    unsigned NumSuccsLeft;              // # of succs not scheduled.
-    unsigned WeakPredsLeft;             // # of weak preds not scheduled.
-    unsigned WeakSuccsLeft;             // # of weak succs not scheduled.
-    unsigned short NumRegDefsLeft;      // # of reg defs with no scheduled use.
-    unsigned short Latency;             // Node latency.
-    bool isVRegCycle      : 1;          // May use and def the same vreg.
-    bool isCall           : 1;          // Is a function call.
-    bool isCallOp         : 1;          // Is a function call operand.
-    bool isTwoAddress     : 1;          // Is a two-address instruction.
-    bool isCommutable     : 1;          // Is a commutable instruction.
-    bool hasPhysRegUses   : 1;          // Has physreg uses.
-    bool hasPhysRegDefs   : 1;          // Has physreg defs that are being used.
-    bool hasPhysRegClobbers : 1;        // Has any physreg defs, used or not.
-    bool isPending        : 1;          // True once pending.
-    bool isAvailable      : 1;          // True once available.
-    bool isScheduled      : 1;          // True once scheduled.
-    bool isScheduleHigh   : 1;          // True if preferable to schedule high.
-    bool isScheduleLow    : 1;          // True if preferable to schedule low.
-    bool isCloned         : 1;          // True if this node has been cloned.
-    bool isUnbuffered     : 1;          // Uses an unbuffered resource.
-    bool hasReservedResource : 1;       // Uses a reserved resource.
-    Sched::Preference SchedulingPref;   // Scheduling preference.
+    unsigned NodeNum = BoundaryID;     ///< Entry # of node in the node vector.
+    unsigned NodeQueueId = 0;          ///< Queue id of node.
+    unsigned NumPreds = 0;             ///< # of SDep::Data preds.
+    unsigned NumSuccs = 0;             ///< # of SDep::Data succs.
+    unsigned NumPredsLeft = 0;         ///< # of preds not scheduled.
+    unsigned NumSuccsLeft = 0;         ///< # of succs not scheduled.
+    unsigned WeakPredsLeft = 0;        ///< # of weak preds not scheduled.
+    unsigned WeakSuccsLeft = 0;        ///< # of weak succs not scheduled.
+    unsigned short NumRegDefsLeft = 0; ///< # of reg defs with no scheduled use.
+    unsigned short Latency = 0;        ///< Node latency.
+    bool isVRegCycle      : 1;         ///< May use and def the same vreg.
+    bool isCall           : 1;         ///< Is a function call.
+    bool isCallOp         : 1;         ///< Is a function call operand.
+    bool isTwoAddress     : 1;         ///< Is a two-address instruction.
+    bool isCommutable     : 1;         ///< Is a commutable instruction.
+    bool hasPhysRegUses   : 1;         ///< Has physreg uses.
+    bool hasPhysRegDefs   : 1;         ///< Has physreg defs that are being used.
+    bool hasPhysRegClobbers : 1;       ///< Has any physreg defs, used or not.
+    bool isPending        : 1;         ///< True once pending.
+    bool isAvailable      : 1;         ///< True once available.
+    bool isScheduled      : 1;         ///< True once scheduled.
+    bool isScheduleHigh   : 1;         ///< True if preferable to schedule high.
+    bool isScheduleLow    : 1;         ///< True if preferable to schedule low.
+    bool isCloned         : 1;         ///< True if this node has been cloned.
+    bool isUnbuffered     : 1;         ///< Uses an unbuffered resource.
+    bool hasReservedResource : 1;      ///< Uses a reserved resource.
+    Sched::Preference SchedulingPref = Sched::None; ///< Scheduling preference.
 
   private:
-    bool isDepthCurrent   : 1;          // True if Depth is current.
-    bool isHeightCurrent  : 1;          // True if Height is current.
-    unsigned Depth;                     // Node depth.
-    unsigned Height;                    // Node height.
+    bool isDepthCurrent   : 1;         ///< True if Depth is current.
+    bool isHeightCurrent  : 1;         ///< True if Height is current.
+    unsigned Depth = 0;                ///< Node depth.
+    unsigned Height = 0;               ///< Node height.
+
   public:
-    unsigned TopReadyCycle; // Cycle relative to start when node is ready.
-    unsigned BotReadyCycle; // Cycle relative to end when node is ready.
+    unsigned TopReadyCycle = 0; ///< Cycle relative to start when node is ready.
+    unsigned BotReadyCycle = 0; ///< Cycle relative to end when node is ready.
 
-    const TargetRegisterClass *CopyDstRC; // Is a special copy node if not null.
-    const TargetRegisterClass *CopySrcRC;
+    const TargetRegisterClass *CopyDstRC =
+        nullptr; ///< Is a special copy node if != nullptr.
+    const TargetRegisterClass *CopySrcRC = nullptr;
 
-    /// SUnit - Construct an SUnit for pre-regalloc scheduling to represent
-    /// an SDNode and any nodes flagged to it.
+    /// \brief Constructs an SUnit for pre-regalloc scheduling to represent an
+    /// SDNode and any nodes flagged to it.
     SUnit(SDNode *node, unsigned nodenum)
-      : Node(node), Instr(nullptr), OrigNode(nullptr), SchedClass(nullptr),
-        NodeNum(nodenum), NodeQueueId(0), NumPreds(0), NumSuccs(0),
-        NumPredsLeft(0), NumSuccsLeft(0), WeakPredsLeft(0), WeakSuccsLeft(0),
-        NumRegDefsLeft(0), Latency(0), isVRegCycle(false), isCall(false),
+      : Node(node), NodeNum(nodenum), isVRegCycle(false), isCall(false),
         isCallOp(false), isTwoAddress(false), isCommutable(false),
         hasPhysRegUses(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
         isPending(false), isAvailable(false), isScheduled(false),
         isScheduleHigh(false), isScheduleLow(false), isCloned(false),
-        isUnbuffered(false), hasReservedResource(false),
-        SchedulingPref(Sched::None), isDepthCurrent(false),
-        isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0),
-        BotReadyCycle(0), CopyDstRC(nullptr), CopySrcRC(nullptr) {}
+        isUnbuffered(false), hasReservedResource(false), isDepthCurrent(false),
+        isHeightCurrent(false) {}
 
-    /// SUnit - Construct an SUnit for post-regalloc scheduling to represent
-    /// a MachineInstr.
+    /// \brief Constructs an SUnit for post-regalloc scheduling to represent a
+    /// MachineInstr.
     SUnit(MachineInstr *instr, unsigned nodenum)
-      : Node(nullptr), Instr(instr), OrigNode(nullptr), SchedClass(nullptr),
-        NodeNum(nodenum), NodeQueueId(0), NumPreds(0), NumSuccs(0),
-        NumPredsLeft(0), NumSuccsLeft(0), WeakPredsLeft(0), WeakSuccsLeft(0),
-        NumRegDefsLeft(0), Latency(0), isVRegCycle(false), isCall(false),
+      : Instr(instr), NodeNum(nodenum), isVRegCycle(false), isCall(false),
         isCallOp(false), isTwoAddress(false), isCommutable(false),
         hasPhysRegUses(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
         isPending(false), isAvailable(false), isScheduled(false),
         isScheduleHigh(false), isScheduleLow(false), isCloned(false),
-        isUnbuffered(false), hasReservedResource(false),
-        SchedulingPref(Sched::None), isDepthCurrent(false),
-        isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0),
-        BotReadyCycle(0), CopyDstRC(nullptr), CopySrcRC(nullptr) {}
+        isUnbuffered(false), hasReservedResource(false), isDepthCurrent(false),
+        isHeightCurrent(false) {}
 
-    /// SUnit - Construct a placeholder SUnit.
+    /// \brief Constructs a placeholder SUnit.
     SUnit()
-      : Node(nullptr), Instr(nullptr), OrigNode(nullptr), SchedClass(nullptr),
-        NodeNum(BoundaryID), NodeQueueId(0), NumPreds(0), NumSuccs(0),
-        NumPredsLeft(0), NumSuccsLeft(0), WeakPredsLeft(0), WeakSuccsLeft(0),
-        NumRegDefsLeft(0), Latency(0), isVRegCycle(false), isCall(false),
-        isCallOp(false), isTwoAddress(false), isCommutable(false),
-        hasPhysRegUses(false), hasPhysRegDefs(false), hasPhysRegClobbers(false),
-        isPending(false), isAvailable(false), isScheduled(false),
-        isScheduleHigh(false), isScheduleLow(false), isCloned(false),
-        isUnbuffered(false), hasReservedResource(false),
-        SchedulingPref(Sched::None), isDepthCurrent(false),
-        isHeightCurrent(false), Depth(0), Height(0), TopReadyCycle(0),
-        BotReadyCycle(0), CopyDstRC(nullptr), CopySrcRC(nullptr) {}
+      : isVRegCycle(false), isCall(false), isCallOp(false), isTwoAddress(false),
+        isCommutable(false), hasPhysRegUses(false), hasPhysRegDefs(false),
+        hasPhysRegClobbers(false), isPending(false), isAvailable(false),
+        isScheduled(false), isScheduleHigh(false), isScheduleLow(false),
+        isCloned(false), isUnbuffered(false), hasReservedResource(false),
+        isDepthCurrent(false), isHeightCurrent(false) {}
 
     /// \brief Boundary nodes are placeholders for the boundary of the
     /// scheduling region.
@@ -359,46 +345,44 @@ namespace llvm {
     /// an assoicative data structure keyed on node ID.
     bool isBoundaryNode() const { return NodeNum == BoundaryID; }
 
-    /// setNode - Assign the representative SDNode for this SUnit.
-    /// This may be used during pre-regalloc scheduling.
+    /// Assigns the representative SDNode for this SUnit. This may be used
+    /// during pre-regalloc scheduling.
     void setNode(SDNode *N) {
       assert(!Instr && "Setting SDNode of SUnit with MachineInstr!");
       Node = N;
     }
 
-    /// getNode - Return the representative SDNode for this SUnit.
-    /// This may be used during pre-regalloc scheduling.
+    /// Returns the representative SDNode for this SUnit. This may be used
+    /// during pre-regalloc scheduling.
     SDNode *getNode() const {
       assert(!Instr && "Reading SDNode of SUnit with MachineInstr!");
       return Node;
     }
 
-    /// isInstr - Return true if this SUnit refers to a machine instruction as
+    /// \brief Returns true if this SUnit refers to a machine instruction as
     /// opposed to an SDNode.
     bool isInstr() const { return Instr; }
 
-    /// setInstr - Assign the instruction for the SUnit.
-    /// This may be used during post-regalloc scheduling.
+    /// Assigns the instruction for the SUnit. This may be used during
+    /// post-regalloc scheduling.
     void setInstr(MachineInstr *MI) {
       assert(!Node && "Setting MachineInstr of SUnit with SDNode!");
       Instr = MI;
     }
 
-    /// getInstr - Return the representative MachineInstr for this SUnit.
-    /// This may be used during post-regalloc scheduling.
+    /// Returns the representative MachineInstr for this SUnit. This may be used
+    /// during post-regalloc scheduling.
     MachineInstr *getInstr() const {
       assert(!Node && "Reading MachineInstr of SUnit with SDNode!");
       return Instr;
     }
 
-    /// addPred - This adds the specified edge as a pred of the current node if
-    /// not already.  It also adds the current node as a successor of the
-    /// specified node.
+    /// Adds the specified edge as a pred of the current node if not already.
+    /// It also adds the current node as a successor of the specified node.
     bool addPred(const SDep &D, bool Required = true);
 
-    /// addPredBarrier - This adds a barrier edge to SU by calling
-    /// addPred(), with latency 0 generally or latency 1 for a store
-    /// followed by a load.
+    /// \brief Adds a barrier edge to SU by calling addPred(), with latency 0
+    /// generally or latency 1 for a store followed by a load.
     bool addPredBarrier(SUnit *SU) {
       SDep Dep(SU, SDep::Barrier);
       unsigned TrueMemOrderLatency =
@@ -407,20 +391,19 @@ namespace llvm {
       return addPred(Dep);
     }
 
-    /// removePred - This removes the specified edge as a pred of the current
-    /// node if it exists.  It also removes the current node as a successor of
-    /// the specified node.
+    /// Removes the specified edge as a pred of the current node if it exists.
+    /// It also removes the current node as a successor of the specified node.
     void removePred(const SDep &D);
 
-    /// getDepth - Return the depth of this node, which is the length of the
-    /// maximum path up to any node which has no predecessors.
+    /// Returns the depth of this node, which is the length of the maximum path
+    /// up to any node which has no predecessors.
     unsigned getDepth() const {
       if (!isDepthCurrent)
         const_cast<SUnit *>(this)->ComputeDepth();
       return Depth;
     }
 
-    /// getHeight - Return the height of this node, which is the length of the
+    /// \brief Returns the height of this node, which is the length of the
     /// maximum path down to any node which has no successors.
     unsigned getHeight() const {
       if (!isHeightCurrent)
@@ -428,38 +411,36 @@ namespace llvm {
       return Height;
     }
 
-    /// setDepthToAtLeast - If NewDepth is greater than this node's
-    /// depth value, set it to be the new depth value. This also
-    /// recursively marks successor nodes dirty.
+    /// \brief If NewDepth is greater than this node's depth value, sets it to
+    /// be the new depth value. This also recursively marks successor nodes
+    /// dirty.
     void setDepthToAtLeast(unsigned NewDepth);
 
-    /// setDepthToAtLeast - If NewDepth is greater than this node's
-    /// depth value, set it to be the new height value. This also
-    /// recursively marks predecessor nodes dirty.
+    /// \brief If NewHeight is greater than this node's height value, sets it to
+    /// be the new height value. This also recursively marks predecessor nodes
+    /// dirty.
     void setHeightToAtLeast(unsigned NewHeight);
 
-    /// setDepthDirty - Set a flag in this node to indicate that its
-    /// stored Depth value will require recomputation the next time
-    /// getDepth() is called.
+    /// \brief Sets a flag in this node to indicate that its stored Depth value
+    /// will require recomputation the next time getDepth() is called.
     void setDepthDirty();
 
-    /// setHeightDirty - Set a flag in this node to indicate that its
-    /// stored Height value will require recomputation the next time
-    /// getHeight() is called.
+    /// \brief Sets a flag in this node to indicate that its stored Height value
+    /// will require recomputation the next time getHeight() is called.
     void setHeightDirty();
 
-    /// isPred - Test if node N is a predecessor of this node.
-    bool isPred(SUnit *N) {
-      for (unsigned i = 0, e = (unsigned)Preds.size(); i != e; ++i)
-        if (Preds[i].getSUnit() == N)
+    /// Tests if node N is a predecessor of this node.
+    bool isPred(const SUnit *N) const {
+      for (const SDep &Pred : Preds)
+        if (Pred.getSUnit() == N)
           return true;
       return false;
     }
 
-    /// isSucc - Test if node N is a successor of this node.
-    bool isSucc(SUnit *N) {
-      for (unsigned i = 0, e = (unsigned)Succs.size(); i != e; ++i)
-        if (Succs[i].getSUnit() == N)
+    /// Tests if node N is a successor of this node.
+    bool isSucc(const SUnit *N) const {
+      for (const SDep &Succ : Succs)
+        if (Succ.getSUnit() == N)
           return true;
       return false;
     }
@@ -471,7 +452,7 @@ namespace llvm {
       return NumSuccsLeft == 0;
     }
 
-    /// \brief Order this node's predecessor edges such that the critical path
+    /// \brief Orders this node's predecessor edges such that the critical path
     /// edge occurs first.
     void biasCriticalPath();
 
@@ -484,7 +465,7 @@ namespace llvm {
     void ComputeHeight();
   };
 
-  /// Return true if the specified SDep is equivalent except for latency.
+  /// Returns true if the specified SDep is equivalent except for latency.
   inline bool SDep::overlaps(const SDep &Other) const {
     if (Dep != Other.Dep)
       return false;
@@ -499,31 +480,33 @@ namespace llvm {
     llvm_unreachable("Invalid dependency kind!");
   }
 
-  //// getSUnit - Return the SUnit to which this edge points.
+  /// Returns the SUnit to which this edge points.
   inline SUnit *SDep::getSUnit() const { return Dep.getPointer(); }
 
-  //// setSUnit - Assign the SUnit to which this edge points.
+  /// Assigns the SUnit to which this edge points.
   inline void SDep::setSUnit(SUnit *SU) { Dep.setPointer(SU); }
 
-  /// getKind - Return an enum value representing the kind of the dependence.
+  /// Returns an enum value representing the kind of the dependence.
   inline SDep::Kind SDep::getKind() const { return Dep.getInt(); }
 
   //===--------------------------------------------------------------------===//
-  /// SchedulingPriorityQueue - This interface is used to plug different
-  /// priorities computation algorithms into the list scheduler. It implements
-  /// the interface of a standard priority queue, where nodes are inserted in
-  /// arbitrary order and returned in priority order.  The computation of the
-  /// priority and the representation of the queue are totally up to the
-  /// implementation to decide.
-  ///
+
+  /// \brief This interface is used to plug different priorities computation
+  /// algorithms into the list scheduler. It implements the interface of a
+  /// standard priority queue, where nodes are inserted in arbitrary order and
+  /// returned in priority order.  The computation of the priority and the
+  /// representation of the queue are totally up to the implementation to
+  /// decide.
   class SchedulingPriorityQueue {
     virtual void anchor();
-    unsigned CurCycle;
+
+    unsigned CurCycle = 0;
     bool HasReadyFilter;
+
   public:
-    SchedulingPriorityQueue(bool rf = false):
-      CurCycle(0), HasReadyFilter(rf) {}
-    virtual ~SchedulingPriorityQueue() {}
+    SchedulingPriorityQueue(bool rf = false) :  HasReadyFilter(rf) {}
+
+    virtual ~SchedulingPriorityQueue() = default;
 
     virtual bool isBottomUp() const = 0;
 
@@ -542,6 +525,7 @@ namespace llvm {
       assert(!HasReadyFilter && "The ready filter must override isReady()");
       return true;
     }
+
     virtual void push(SUnit *U) = 0;
 
     void push_all(const std::vector<SUnit *> &Nodes) {
@@ -556,10 +540,9 @@ namespace llvm {
 
     virtual void dump(ScheduleDAG *) const {}
 
-    /// scheduledNode - As each node is scheduled, this method is invoked.  This
-    /// allows the priority function to adjust the priority of related
-    /// unscheduled nodes, for example.
-    ///
+    /// As each node is scheduled, this method is invoked.  This allows the
+    /// priority function to adjust the priority of related unscheduled nodes,
+    /// for example.
     virtual void scheduledNode(SUnit *) {}
 
     virtual void unscheduledNode(SUnit *) {}
@@ -575,14 +558,14 @@ namespace llvm {
 
   class ScheduleDAG {
   public:
-    const TargetMachine &TM;              // Target processor
-    const TargetInstrInfo *TII;           // Target instruction information
-    const TargetRegisterInfo *TRI;        // Target processor register info
-    MachineFunction &MF;                  // Machine function
-    MachineRegisterInfo &MRI;             // Virtual/real register map
-    std::vector<SUnit> SUnits;            // The scheduling units.
-    SUnit EntrySU;                        // Special node for the region entry.
-    SUnit ExitSU;                         // Special node for the region exit.
+    const TargetMachine &TM;            ///< Target processor
+    const TargetInstrInfo *TII;         ///< Target instruction information
+    const TargetRegisterInfo *TRI;      ///< Target processor register info
+    MachineFunction &MF;                ///< Machine function
+    MachineRegisterInfo &MRI;           ///< Virtual/real register map
+    std::vector<SUnit> SUnits;          ///< The scheduling units.
+    SUnit EntrySU;                      ///< Special node for the region entry.
+    SUnit ExitSU;                       ///< Special node for the region exit.
 
 #ifdef NDEBUG
     static const bool StressSched = false;
@@ -594,43 +577,39 @@ namespace llvm {
 
     virtual ~ScheduleDAG();
 
-    /// clearDAG - clear the DAG state (between regions).
+    /// Clears the DAG state (between regions).
     void clearDAG();
 
-    /// getInstrDesc - Return the MCInstrDesc of this SUnit.
-    /// Return NULL for SDNodes without a machine opcode.
+    /// Returns the MCInstrDesc of this SUnit.
+    /// Returns NULL for SDNodes without a machine opcode.
     const MCInstrDesc *getInstrDesc(const SUnit *SU) const {
       if (SU->isInstr()) return &SU->getInstr()->getDesc();
       return getNodeDesc(SU->getNode());
     }
 
-    /// viewGraph - Pop up a GraphViz/gv window with the ScheduleDAG rendered
-    /// using 'dot'.
-    ///
+    /// Pops up a GraphViz/gv window with the ScheduleDAG rendered using 'dot'.
     virtual void viewGraph(const Twine &Name, const Twine &Title);
     virtual void viewGraph();
 
     virtual void dumpNode(const SUnit *SU) const = 0;
 
-    /// getGraphNodeLabel - Return a label for an SUnit node in a visualization
-    /// of the ScheduleDAG.
+    /// Returns a label for an SUnit node in a visualization of the ScheduleDAG.
     virtual std::string getGraphNodeLabel(const SUnit *SU) const = 0;
 
-    /// getDAGLabel - Return a label for the region of code covered by the DAG.
+    /// Returns a label for the region of code covered by the DAG.
     virtual std::string getDAGName() const = 0;
 
-    /// addCustomGraphFeatures - Add custom features for a visualization of
-    /// the ScheduleDAG.
+    /// Adds custom features for a visualization of the ScheduleDAG.
     virtual void addCustomGraphFeatures(GraphWriter<ScheduleDAG*> &) const {}
 
 #ifndef NDEBUG
-    /// VerifyScheduledDAG - Verify that all SUnits were scheduled and that
-    /// their state is consistent. Return the number of scheduled SUnits.
+    /// \brief Verifies that all SUnits were scheduled and that their state is
+    /// consistent. Returns the number of scheduled SUnits.
     unsigned VerifyScheduledDAG(bool isBottomUp);
 #endif
 
   private:
-    // Return the MCInstrDesc of this SDNode or NULL.
+    /// Returns the MCInstrDesc of this SDNode or NULL.
     const MCInstrDesc *getNodeDesc(const SDNode *Node) const;
   };
 
@@ -640,6 +619,7 @@ namespace llvm {
     unsigned Operand;
 
     SUnitIterator(SUnit *N, unsigned Op) : Node(N), Operand(Op) {}
+
   public:
     bool operator==(const SUnitIterator& x) const {
       return Operand == x.Operand;
@@ -666,7 +646,8 @@ namespace llvm {
 
     unsigned getOperand() const { return Operand; }
     const SUnit *getNode() const { return Node; }
-    /// isCtrlDep - Test if this is not an SDep::Data dependence.
+
+    /// Tests if this is not an SDep::Data dependence.
     bool isCtrlDep() const {
       return getSDep().isCtrl();
     }
@@ -700,56 +681,61 @@ namespace llvm {
     }
   };
 
-  /// ScheduleDAGTopologicalSort is a class that computes a topological
-  /// ordering for SUnits and provides methods for dynamically updating
-  /// the ordering as new edges are added.
+  /// This class can compute a topological ordering for SUnits and provides
+  /// methods for dynamically updating the ordering as new edges are added.
   ///
   /// This allows a very fast implementation of IsReachable, for example.
-  ///
   class ScheduleDAGTopologicalSort {
-    /// SUnits - A reference to the ScheduleDAG's SUnits.
+    /// A reference to the ScheduleDAG's SUnits.
     std::vector<SUnit> &SUnits;
     SUnit *ExitSU;
 
-    /// Index2Node - Maps topological index to the node number.
+    /// Maps topological index to the node number.
     std::vector<int> Index2Node;
-    /// Node2Index - Maps the node number to its topological index.
+    /// Maps the node number to its topological index.
     std::vector<int> Node2Index;
-    /// Visited - a set of nodes visited during a DFS traversal.
+    /// A set of nodes visited during a DFS traversal.
     BitVector Visited;
 
-    /// DFS - make a DFS traversal and mark all nodes affected by the
-    /// edge insertion. These nodes will later get new topological indexes
-    /// by means of the Shift method.
+    /// Makes a DFS traversal and marks all nodes affected by the edge
+    /// insertion. These nodes will later get new topological indexes by means
+    /// of the Shift method.
     void DFS(const SUnit *SU, int UpperBound, bool& HasLoop);
 
-    /// Shift - reassign topological indexes for the nodes in the DAG
-    /// to preserve the topological ordering.
+    /// \brief Reassigns topological indexes for the nodes in the DAG to
+    /// preserve the topological ordering.
     void Shift(BitVector& Visited, int LowerBound, int UpperBound);
 
-    /// Allocate - assign the topological index to the node n.
+    /// Assigns the topological index to the node n.
     void Allocate(int n, int index);
 
   public:
     ScheduleDAGTopologicalSort(std::vector<SUnit> &SUnits, SUnit *ExitSU);
 
-    /// InitDAGTopologicalSorting - create the initial topological
-    /// ordering from the DAG to be scheduled.
+    /// Creates the initial topological ordering from the DAG to be scheduled.
     void InitDAGTopologicalSorting();
 
-    /// IsReachable - Checks if SU is reachable from TargetSU.
+    /// Returns an array of SUs that are both in the successor
+    /// subtree of StartSU and in the predecessor subtree of TargetSU.
+    /// StartSU and TargetSU are not in the array.
+    /// Success is false if TargetSU is not in the successor subtree of
+    /// StartSU, else it is true.
+    std::vector<int> GetSubGraph(const SUnit &StartSU, const SUnit &TargetSU,
+                                 bool &Success);
+
+    /// Checks if \p SU is reachable from \p TargetSU.
     bool IsReachable(const SUnit *SU, const SUnit *TargetSU);
 
-    /// WillCreateCycle - Return true if addPred(TargetSU, SU) creates a cycle.
+    /// Returns true if addPred(TargetSU, SU) creates a cycle.
     bool WillCreateCycle(SUnit *TargetSU, SUnit *SU);
 
-    /// AddPred - Updates the topological ordering to accommodate an edge
-    /// to be added from SUnit X to SUnit Y.
+    /// \brief Updates the topological ordering to accommodate an edge to be
+    /// added from SUnit \p X to SUnit \p Y.
     void AddPred(SUnit *Y, SUnit *X);
 
-    /// RemovePred - Updates the topological ordering to accommodate an
-    /// an edge to be removed from the specified node N from the predecessors
-    /// of the current node M.
+    /// \brief Updates the topological ordering to accommodate an edge to be
+    /// removed from the specified node \p N from the predecessors of the
+    /// current node \p M.
     void RemovePred(SUnit *M, SUnit *N);
 
     typedef std::vector<int>::iterator iterator;
@@ -766,6 +752,7 @@ namespace llvm {
     reverse_iterator rend() { return Index2Node.rend(); }
     const_reverse_iterator rend() const { return Index2Node.rend(); }
   };
-}
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_SCHEDULEDAG_H
diff --git a/include/llvm/CodeGen/ScheduleDAGMutation.h b/include/llvm/CodeGen/ScheduleDAGMutation.h
index 02fe2294815c4bef8178ccac43a32805fe07ce60..5c236427e0b8cb76c9194e80acc37a93f0ca299a 100644
--- a/include/llvm/CodeGen/ScheduleDAGMutation.h
+++ b/include/llvm/CodeGen/ScheduleDAGMutation.h
@@ -1,4 +1,4 @@
-//==- ScheduleDAGMutation.h - MachineInstr Scheduling ------------*- C++ -*-==//
+//===- ScheduleDAGMutation.h - MachineInstr Scheduling ----------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -16,16 +16,19 @@
 #define LLVM_CODEGEN_SCHEDULEDAGMUTATION_H
 
 namespace llvm {
-  class ScheduleDAGInstrs;
 
-  /// Mutate the DAG as a postpass after normal DAG building.
-  class ScheduleDAGMutation {
-    virtual void anchor();
-  public:
-    virtual ~ScheduleDAGMutation() {}
+class ScheduleDAGInstrs;
 
-    virtual void apply(ScheduleDAGInstrs *DAG) = 0;
-  };
-}
+/// Mutate the DAG as a postpass after normal DAG building.
+class ScheduleDAGMutation {
+  virtual void anchor();
 
-#endif
+public:
+  virtual ~ScheduleDAGMutation() = default;
+
+  virtual void apply(ScheduleDAGInstrs *DAG) = 0;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_SCHEDULEDAGMUTATION_H
diff --git a/include/llvm/CodeGen/ScheduleDFS.h b/include/llvm/CodeGen/ScheduleDFS.h
index b2108ad3bedb17c32ad204269ac08f12220579fb..c2013661cfff8d89b5ff163a7041961eb3964d17 100644
--- a/include/llvm/CodeGen/ScheduleDFS.h
+++ b/include/llvm/CodeGen/ScheduleDFS.h
@@ -14,16 +14,16 @@
 #ifndef LLVM_CODEGEN_SCHEDULEDFS_H
 #define LLVM_CODEGEN_SCHEDULEDFS_H
 
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
-#include "llvm/Support/DataTypes.h"
 #include <vector>
+#include <cassert>
+#include <cstdint>
 
 namespace llvm {
 
 class raw_ostream;
-class IntEqClasses;
-class ScheduleDAGInstrs;
-class SUnit;
 
 /// \brief Represent the ILP of the subDAG rooted at a DAG node.
 ///
@@ -75,18 +75,18 @@ class SchedDFSResult {
   /// interior node. Finally, it is set to a representative subtree ID during
   /// finalization.
   struct NodeData {
-    unsigned InstrCount;
-    unsigned SubtreeID;
+    unsigned InstrCount = 0;
+    unsigned SubtreeID = InvalidSubtreeID;
 
-    NodeData(): InstrCount(0), SubtreeID(InvalidSubtreeID) {}
+    NodeData() = default;
   };
 
   /// \brief Per-Subtree data computed during DFS.
   struct TreeData {
-    unsigned ParentTreeID;
-    unsigned SubInstrCount;
+    unsigned ParentTreeID = InvalidSubtreeID;
+    unsigned SubInstrCount = 0;
 
-    TreeData(): ParentTreeID(InvalidSubtreeID), SubInstrCount(0) {}
+    TreeData() = default;
   };
 
   /// \brief Record a connection between subtrees and the connection level.
@@ -107,7 +107,7 @@ class SchedDFSResult {
 
   // For each subtree discovered during DFS, record its connections to other
   // subtrees.
-  std::vector<SmallVector<Connection, 4> > SubtreeConnections;
+  std::vector<SmallVector<Connection, 4>> SubtreeConnections;
 
   /// Cache the current connection level of each subtree.
   /// This mutable array is updated during scheduling.
@@ -189,6 +189,6 @@ public:
 
 raw_ostream &operator<<(raw_ostream &OS, const ILPValue &Val);
 
-} // namespace llvm
+} // end namespace llvm
 
-#endif
+#endif // LLVM_CODEGEN_SCHEDULEDFS_H
diff --git a/include/llvm/CodeGen/ScheduleHazardRecognizer.h b/include/llvm/CodeGen/ScheduleHazardRecognizer.h
index 214be2794ba37bc576bba272f270cb82d8602cfc..ace4a2d836ca81d2fd72dc668033d16b3955d226 100644
--- a/include/llvm/CodeGen/ScheduleHazardRecognizer.h
+++ b/include/llvm/CodeGen/ScheduleHazardRecognizer.h
@@ -29,10 +29,10 @@ protected:
   /// state. Important to restore the state after backtracking. Additionally,
   /// MaxLookAhead=0 identifies a fake recognizer, allowing the client to
   /// bypass virtual calls. Currently the PostRA scheduler ignores it.
-  unsigned MaxLookAhead;
+  unsigned MaxLookAhead = 0;
 
 public:
-  ScheduleHazardRecognizer(): MaxLookAhead(0) {}
+  ScheduleHazardRecognizer() = default;
   virtual ~ScheduleHazardRecognizer();
 
   enum HazardType {
@@ -117,6 +117,6 @@ public:
   }
 };
 
-}
+} // end namespace llvm
 
-#endif
+#endif // LLVM_CODEGEN_SCHEDULEHAZARDRECOGNIZER_H
diff --git a/include/llvm/CodeGen/ScoreboardHazardRecognizer.h b/include/llvm/CodeGen/ScoreboardHazardRecognizer.h
index e0c30fe4d82a6255a04e9a44d63363d07b78f974..466ab532030c7b2a421ed9234dc974481a7bb5cc 100644
--- a/include/llvm/CodeGen/ScoreboardHazardRecognizer.h
+++ b/include/llvm/CodeGen/ScoreboardHazardRecognizer.h
@@ -17,8 +17,8 @@
 #define LLVM_CODEGEN_SCOREBOARDHAZARDRECOGNIZER_H
 
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
-#include "llvm/Support/DataTypes.h"
 #include <cassert>
+#include <cstddef>
 #include <cstring>
 
 namespace llvm {
@@ -38,21 +38,25 @@ class ScoreboardHazardRecognizer : public ScheduleHazardRecognizer {
   // bottom-up scheduler, then the scoreboard cycles are the inverse of the
   // scheduler's cycles.
   class Scoreboard {
-    unsigned *Data;
+    unsigned *Data = nullptr;
 
     // The maximum number of cycles monitored by the Scoreboard. This
     // value is determined based on the target itineraries to ensure
     // that all hazards can be tracked.
-    size_t Depth;
+    size_t Depth = 0;
+
     // Indices into the Scoreboard that represent the current cycle.
-    size_t Head;
+    size_t Head = 0;
+
   public:
-    Scoreboard():Data(nullptr), Depth(0), Head(0) { }
+    Scoreboard() = default;
+
     ~Scoreboard() {
       delete[] Data;
     }
 
     size_t getDepth() const { return Depth; }
+
     unsigned& operator[](size_t idx) const {
       // Depth is expected to be a power-of-2.
       assert(Depth && !(Depth & (Depth - 1)) &&
@@ -93,10 +97,10 @@ class ScoreboardHazardRecognizer : public ScheduleHazardRecognizer {
   const ScheduleDAG *DAG;
 
   /// IssueWidth - Max issue per cycle. 0=Unknown.
-  unsigned IssueWidth;
+  unsigned IssueWidth = 0;
 
   /// IssueCount - Count instructions issued in this cycle.
-  unsigned IssueCount;
+  unsigned IssueCount = 0;
 
   Scoreboard ReservedScoreboard;
   Scoreboard RequiredScoreboard;
@@ -119,6 +123,6 @@ public:
   void RecedeCycle() override;
 };
 
-}
+} // end namespace llvm
 
-#endif //!LLVM_CODEGEN_SCOREBOARDHAZARDRECOGNIZER_H
+#endif // LLVM_CODEGEN_SCOREBOARDHAZARDRECOGNIZER_H
diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h
index 5a291c79c9363266db8089cc428a658899920351..6f0509543e7d88d3204ca9e28348930c6a20b8dd 100644
--- a/include/llvm/CodeGen/SelectionDAG.h
+++ b/include/llvm/CodeGen/SelectionDAG.h
@@ -36,6 +36,7 @@ namespace llvm {
 class MachineConstantPoolValue;
 class MachineFunction;
 class MDNode;
+class OptimizationRemarkEmitter;
 class SDDbgValue;
 class TargetLowering;
 class SelectionDAGTargetInfo;
@@ -171,6 +172,10 @@ class SelectionDAG {
   LLVMContext *Context;
   CodeGenOpt::Level OptLevel;
 
+  /// The function-level optimization remark emitter.  Used to emit remarks
+  /// whenever manipulating the DAG.
+  OptimizationRemarkEmitter *ORE;
+
   /// The starting token.
   SDNode EntryNode;
 
@@ -318,7 +323,7 @@ public:
   ~SelectionDAG();
 
   /// Prepare this SelectionDAG to process code in the given MachineFunction.
-  void init(MachineFunction &mf);
+  void init(MachineFunction &NewMF, OptimizationRemarkEmitter &NewORE);
 
   /// Clear state and free memory necessary to make this
   /// SelectionDAG ready to process a new block.
@@ -331,6 +336,7 @@ public:
   const TargetLowering &getTargetLoweringInfo() const { return *TLI; }
   const SelectionDAGTargetInfo &getSelectionDAGInfo() const { return *TSI; }
   LLVMContext *getContext() const {return Context; }
+  OptimizationRemarkEmitter &getORE() const { return *ORE; }
 
   /// Pop up a GraphViz/gv window with the DAG rendered using 'dot'.
   void viewGraph(const std::string &Title);
@@ -480,6 +486,13 @@ public:
                       bool isTarget = false, bool isOpaque = false);
   SDValue getConstant(const APInt &Val, const SDLoc &DL, EVT VT,
                       bool isTarget = false, bool isOpaque = false);
+
+  SDValue getAllOnesConstant(const SDLoc &DL, EVT VT, bool IsTarget = false,
+                             bool IsOpaque = false) {
+    return getConstant(APInt::getAllOnesValue(VT.getScalarSizeInBits()), DL,
+                       VT, IsTarget, IsOpaque);
+  }
+
   SDValue getConstant(const ConstantInt &Val, const SDLoc &DL, EVT VT,
                       bool isTarget = false, bool isOpaque = false);
   SDValue getIntPtrConstant(uint64_t Val, const SDLoc &DL,
@@ -733,6 +746,9 @@ public:
     return getNode(ISD::CALLSEQ_END, DL, NodeTys, Ops);
   }
 
+  /// Return true if the result of this operation is always undefined.
+  bool isUndef(unsigned Opcode, ArrayRef<SDValue> Ops);
+
   /// Return an UNDEF node. UNDEF does not have a useful SDLoc.
   SDValue getUNDEF(EVT VT) {
     return getNode(ISD::UNDEF, SDLoc(), VT);
@@ -1274,6 +1290,19 @@ public:
   void computeKnownBits(SDValue Op, APInt &KnownZero, APInt &KnownOne,
                         const APInt &DemandedElts, unsigned Depth = 0) const;
 
+  /// Used to represent the possible overflow behavior of an operation.
+  /// Never: the operation cannot overflow.
+  /// Sometime: the operation may or may not overflow.
+  /// Always: the operation will always overflow.
+  enum OverflowKind {
+    OFK_Never,
+    OFK_Sometime,
+    OFK_Always,
+  };
+
+  /// Determine if the result of the addition of two nodes can overflow.
+  OverflowKind computeOverflowKind(SDValue N0, SDValue N1) const;
+
   /// Test if the given value is known to have exactly one bit set. This differs
   /// from computeKnownBits in that it doesn't necessarily determine which bit
   /// is set.
@@ -1288,6 +1317,17 @@ public:
   /// target nodes to be understood.
   unsigned ComputeNumSignBits(SDValue Op, unsigned Depth = 0) const;
 
+  /// Return the number of times the sign bit of the register is replicated into
+  /// the other bits. We know that at least 1 bit is always equal to the sign
+  /// bit (itself), but other cases can give us information. For example,
+  /// immediately after an "SRA X, 2", we know that the top 3 bits are all equal
+  /// to each other, so we return 3. The DemandedElts argument allows
+  /// us to only collect the minimum sign bits of the requested vector elements.
+  /// Targets can implement the ComputeNumSignBitsForTarget method in the
+  /// TargetLowering class to allow target nodes to be understood.
+  unsigned ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
+                              unsigned Depth = 0) const;
+
   /// Return true if the specified operand is an ISD::ADD with a ConstantSDNode
   /// on the right-hand side, or if it is an ISD::OR with a ConstantSDNode that
   /// is guaranteed to have the same semantics as an ADD. This handles the
diff --git a/include/llvm/CodeGen/SelectionDAGISel.h b/include/llvm/CodeGen/SelectionDAGISel.h
index c204ce2be069c7ab442f9ee47c78fb415d91ac8f..591b2f773344d7811c1cf924d3cedea9b0fdff69 100644
--- a/include/llvm/CodeGen/SelectionDAGISel.h
+++ b/include/llvm/CodeGen/SelectionDAGISel.h
@@ -20,6 +20,7 @@
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <memory>
 
 namespace llvm {
   class FastISel;
@@ -29,6 +30,7 @@ namespace llvm {
   class MachineBasicBlock;
   class MachineFunction;
   class MachineInstr;
+  class OptimizationRemarkEmitter;
   class TargetLowering;
   class TargetLibraryInfo;
   class FunctionLoweringInfo;
@@ -53,6 +55,12 @@ public:
   CodeGenOpt::Level OptLevel;
   const TargetInstrInfo *TII;
   const TargetLowering *TLI;
+  bool FastISelFailed;
+  SmallPtrSet<const Instruction *, 4> ElidedArgCopyInstrs;
+
+  /// Current optimization remark emitter.
+  /// Used to report things like combines and FastISel failures.
+  std::unique_ptr<OptimizationRemarkEmitter> ORE;
 
   static char ID;
 
@@ -151,7 +159,9 @@ public:
     OPC_MorphNodeTo,
     // Space-optimized forms that implicitly encode number of result VTs.
     OPC_MorphNodeTo0, OPC_MorphNodeTo1, OPC_MorphNodeTo2,
-    OPC_CompleteMatch
+    OPC_CompleteMatch,
+    // Contains offset in table for pattern being selected
+    OPC_Coverage
   };
 
   enum {
@@ -213,6 +223,15 @@ protected:
   void SelectInlineAsmMemoryOperands(std::vector<SDValue> &Ops,
                                      const SDLoc &DL);
 
+  /// getPatternForIndex - Return the tablegen-selected ISEL pattern for \p index.
+  virtual StringRef getPatternForIndex(unsigned index) {
+    llvm_unreachable("Tblgen should generate the implementation of this!");
+  }
+
+  /// getIncludePathForIndex - Get the .td source location for \p index.
+  virtual StringRef getIncludePathForIndex(unsigned index) {
+    llvm_unreachable("Tblgen should generate the implementation of this!");
+  }
 public:
   // Calls to these predicates are generated by tblgen.
   bool CheckAndMask(SDValue LHS, ConstantSDNode *RHS,
diff --git a/include/llvm/CodeGen/SelectionDAGNodes.h b/include/llvm/CodeGen/SelectionDAGNodes.h
index a512c44e4bafaa505cef19ac9f4f58a58cd1ee8e..81cc0b39cf873cf4bda066cf029571e9d213ea53 100644
--- a/include/llvm/CodeGen/SelectionDAGNodes.h
+++ b/include/llvm/CodeGen/SelectionDAGNodes.h
@@ -1,4 +1,4 @@
-//===-- llvm/CodeGen/SelectionDAGNodes.h - SelectionDAG Nodes ---*- C++ -*-===//
+//===- llvm/CodeGen/SelectionDAGNodes.h - SelectionDAG Nodes ----*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -118,11 +118,11 @@ namespace ISD {
 class SDValue {
   friend struct DenseMapInfo<SDValue>;
 
-  SDNode *Node;       // The node defining the value we are using.
-  unsigned ResNo;     // Which return value of the node we are using.
+  SDNode *Node = nullptr; // The node defining the value we are using.
+  unsigned ResNo = 0;     // Which return value of the node we are using.
 
 public:
-  SDValue() : Node(nullptr), ResNo(0) {}
+  SDValue() = default;
   SDValue(SDNode *node, unsigned resno);
 
   /// get the index which selects a specific result in the SDNode
@@ -250,16 +250,16 @@ class SDUse {
   /// Val - The value being used.
   SDValue Val;
   /// User - The user of this value.
-  SDNode *User;
+  SDNode *User = nullptr;
   /// Prev, Next - Pointers to the uses list of the SDNode referred by
   /// this operand.
-  SDUse **Prev, *Next;
-
-  SDUse(const SDUse &U) = delete;
-  void operator=(const SDUse &U) = delete;
+  SDUse **Prev = nullptr;
+  SDUse *Next = nullptr;
 
 public:
-  SDUse() : User(nullptr), Prev(nullptr), Next(nullptr) {}
+  SDUse() = default;
+  SDUse(const SDUse &U) = delete;
+  SDUse &operator=(const SDUse &) = delete;
 
   /// Normally SDUse will just implicitly convert to an SDValue that it holds.
   operator const SDValue&() const { return Val; }
@@ -350,20 +350,15 @@ private:
   bool NoSignedZeros : 1;
   bool AllowReciprocal : 1;
   bool VectorReduction : 1;
+  bool AllowContract : 1;
 
 public:
   /// Default constructor turns off all optimization flags.
-  SDNodeFlags() {
-    NoUnsignedWrap = false;
-    NoSignedWrap = false;
-    Exact = false;
-    UnsafeAlgebra = false;
-    NoNaNs = false;
-    NoInfs = false;
-    NoSignedZeros = false;
-    AllowReciprocal = false;
-    VectorReduction = false;
-  }
+  SDNodeFlags()
+      : NoUnsignedWrap(false), NoSignedWrap(false), Exact(false),
+        UnsafeAlgebra(false), NoNaNs(false), NoInfs(false),
+        NoSignedZeros(false), AllowReciprocal(false), VectorReduction(false),
+        AllowContract(false) {}
 
   // These are mutators for each flag.
   void setNoUnsignedWrap(bool b) { NoUnsignedWrap = b; }
@@ -375,6 +370,7 @@ public:
   void setNoSignedZeros(bool b) { NoSignedZeros = b; }
   void setAllowReciprocal(bool b) { AllowReciprocal = b; }
   void setVectorReduction(bool b) { VectorReduction = b; }
+  void setAllowContract(bool b) { AllowContract = b; }
 
   // These are accessors for each flag.
   bool hasNoUnsignedWrap() const { return NoUnsignedWrap; }
@@ -386,6 +382,7 @@ public:
   bool hasNoSignedZeros() const { return NoSignedZeros; }
   bool hasAllowReciprocal() const { return AllowReciprocal; }
   bool hasVectorReduction() const { return VectorReduction; }
+  bool hasAllowContract() const { return AllowContract; }
 
   /// Clear any flags in this flag set that aren't also set in Flags.
   void intersectWith(const SDNodeFlags *Flags) {
@@ -397,6 +394,8 @@ public:
     NoInfs &= Flags->NoInfs;
     NoSignedZeros &= Flags->NoSignedZeros;
     AllowReciprocal &= Flags->AllowReciprocal;
+    VectorReduction &= Flags->VectorReduction;
+    AllowContract &= Flags->AllowContract;
   }
 };
 
@@ -446,6 +445,7 @@ protected:
 
   class LSBaseSDNodeBitfields {
     friend class LSBaseSDNode;
+
     uint16_t : NumMemSDNodeBits;
 
     uint16_t AddressingMode : 3; // enum ISD::MemIndexedMode
@@ -493,21 +493,26 @@ protected:
   static_assert(sizeof(StoreSDNodeBitfields) <= 2, "field too wide");
 
 private:
+  friend class SelectionDAG;
+  // TODO: unfriend HandleSDNode once we fix its operand handling.
+  friend class HandleSDNode;
+
   /// Unique id per SDNode in the DAG.
-  int NodeId;
+  int NodeId = -1;
 
   /// The values that are used by this operation.
-  SDUse *OperandList;
+  SDUse *OperandList = nullptr;
 
   /// The types of the values this node defines.  SDNode's may
   /// define multiple values simultaneously.
   const EVT *ValueList;
 
   /// List of uses for this SDNode.
-  SDUse *UseList;
+  SDUse *UseList = nullptr;
 
   /// The number of entries in the Operand/Value list.
-  unsigned short NumOperands, NumValues;
+  unsigned short NumOperands = 0;
+  unsigned short NumValues;
 
   // The ordering of the SDNodes. It roughly corresponds to the ordering of the
   // original LLVM instructions.
@@ -522,10 +527,6 @@ private:
   /// Return a pointer to the specified value type.
   static const EVT *getValueTypeList(EVT VT);
 
-  friend class SelectionDAG;
-  // TODO: unfriend HandleSDNode once we fix its operand handling.
-  friend class HandleSDNode;
-
 public:
   /// Unique and persistent id per SDNode in the DAG.
   /// Used for debug printing.
@@ -616,10 +617,10 @@ public:
   /// operands that use a specific SDNode.
   class use_iterator
     : public std::iterator<std::forward_iterator_tag, SDUse, ptrdiff_t> {
-    SDUse *Op;
-
     friend class SDNode;
 
+    SDUse *Op = nullptr;
+
     explicit use_iterator(SDUse *op) : Op(op) {}
 
   public:
@@ -628,8 +629,8 @@ public:
     typedef std::iterator<std::forward_iterator_tag,
                           SDUse, ptrdiff_t>::pointer pointer;
 
+    use_iterator() = default;
     use_iterator(const use_iterator &I) : Op(I.Op) {}
-    use_iterator() : Op(nullptr) {}
 
     bool operator==(const use_iterator &x) const {
       return Op == x.Op;
@@ -737,6 +738,10 @@ public:
     return false;
   }
 
+  /// Return true if all the users of N are contained in Nodes.
+  /// NOTE: Requires at least one match, but doesn't require them all.
+  static bool areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N);
+
   /// Return the number of values used by this operation.
   unsigned getNumOperands() const { return NumOperands; }
 
@@ -896,9 +901,8 @@ protected:
   /// SDNodes are created without any operands, and never own the operand
   /// storage. To add operands, see SelectionDAG::createOperands.
   SDNode(unsigned Opc, unsigned Order, DebugLoc dl, SDVTList VTs)
-      : NodeType(Opc), NodeId(-1), OperandList(nullptr), ValueList(VTs.VTs),
-        UseList(nullptr), NumOperands(0), NumValues(VTs.NumVTs), IROrder(Order),
-        debugLoc(std::move(dl)) {
+      : NodeType(Opc), ValueList(VTs.VTs), NumValues(VTs.NumVTs),
+        IROrder(Order), debugLoc(std::move(dl)) {
     memset(&RawSDNodeBits, 0, sizeof(RawSDNodeBits));
     assert(debugLoc.hasTrivialDestructor() && "Expected trivial destructor");
     assert(NumValues == VTs.NumVTs &&
@@ -1366,10 +1370,10 @@ public:
 };
 
 class ConstantSDNode : public SDNode {
-  const ConstantInt *Value;
-
   friend class SelectionDAG;
 
+  const ConstantInt *Value;
+
   ConstantSDNode(bool isTarget, bool isOpaque, const ConstantInt *val,
                  const DebugLoc &DL, EVT VT)
       : SDNode(isTarget ? ISD::TargetConstant : ISD::Constant, 0, DL,
@@ -1401,10 +1405,10 @@ uint64_t SDNode::getConstantOperandVal(unsigned Num) const {
 }
 
 class ConstantFPSDNode : public SDNode {
-  const ConstantFP *Value;
-
   friend class SelectionDAG;
 
+  const ConstantFP *Value;
+
   ConstantFPSDNode(bool isTarget, const ConstantFP *val, const DebugLoc &DL,
                    EVT VT)
       : SDNode(isTarget ? ISD::TargetConstantFP : ISD::ConstantFP, 0, DL,
@@ -1475,10 +1479,12 @@ ConstantSDNode *isConstOrConstSplat(SDValue V);
 ConstantFPSDNode *isConstOrConstSplatFP(SDValue V);
 
 class GlobalAddressSDNode : public SDNode {
+  friend class SelectionDAG;
+
   const GlobalValue *TheGlobal;
   int64_t Offset;
   unsigned char TargetFlags;
-  friend class SelectionDAG;
+
   GlobalAddressSDNode(unsigned Opc, unsigned Order, const DebugLoc &DL,
                       const GlobalValue *GA, EVT VT, int64_t o,
                       unsigned char TargetFlags);
@@ -1499,10 +1505,10 @@ public:
 };
 
 class FrameIndexSDNode : public SDNode {
-  int FI;
-
   friend class SelectionDAG;
 
+  int FI;
+
   FrameIndexSDNode(int fi, EVT VT, bool isTarg)
     : SDNode(isTarg ? ISD::TargetFrameIndex : ISD::FrameIndex,
       0, DebugLoc(), getSDVTList(VT)), FI(fi) {
@@ -1518,11 +1524,11 @@ public:
 };
 
 class JumpTableSDNode : public SDNode {
+  friend class SelectionDAG;
+
   int JTI;
   unsigned char TargetFlags;
 
-  friend class SelectionDAG;
-
   JumpTableSDNode(int jti, EVT VT, bool isTarg, unsigned char TF)
     : SDNode(isTarg ? ISD::TargetJumpTable : ISD::JumpTable,
       0, DebugLoc(), getSDVTList(VT)), JTI(jti), TargetFlags(TF) {
@@ -1539,6 +1545,8 @@ public:
 };
 
 class ConstantPoolSDNode : public SDNode {
+  friend class SelectionDAG;
+
   union {
     const Constant *ConstVal;
     MachineConstantPoolValue *MachineCPVal;
@@ -1547,8 +1555,6 @@ class ConstantPoolSDNode : public SDNode {
   unsigned Alignment;  // Minimum alignment requirement of CP (not log2 value).
   unsigned char TargetFlags;
 
-  friend class SelectionDAG;
-
   ConstantPoolSDNode(bool isTarget, const Constant *c, EVT VT, int o,
                      unsigned Align, unsigned char TF)
     : SDNode(isTarget ? ISD::TargetConstantPool : ISD::ConstantPool, 0,
@@ -1602,12 +1608,12 @@ public:
 
 /// Completely target-dependent object reference.
 class TargetIndexSDNode : public SDNode {
+  friend class SelectionDAG;
+
   unsigned char TargetFlags;
   int Index;
   int64_t Offset;
 
-  friend class SelectionDAG;
-
 public:
   TargetIndexSDNode(int Idx, EVT VT, int64_t Ofs, unsigned char TF)
     : SDNode(ISD::TargetIndex, 0, DebugLoc(), getSDVTList(VT)),
@@ -1623,10 +1629,10 @@ public:
 };
 
 class BasicBlockSDNode : public SDNode {
-  MachineBasicBlock *MBB;
-
   friend class SelectionDAG;
 
+  MachineBasicBlock *MBB;
+
   /// Debug info is meaningful and potentially useful here, but we create
   /// blocks out of order when they're jumped to, which makes it a bit
   /// harder.  Let's see if we need it first.
@@ -1644,10 +1650,10 @@ public:
 
 /// A "pseudo-class" with methods for operating on BUILD_VECTORs.
 class BuildVectorSDNode : public SDNode {
+public:
   // These are constructed as SDNodes and then cast to BuildVectorSDNodes.
   explicit BuildVectorSDNode() = delete;
 
-public:
   /// Check if this is a constant splat, and if so, find the
   /// smallest element size that splats the vector.  If MinSplatBits is
   /// nonzero, the element size must be at least that large.  Note that the
@@ -1704,10 +1710,10 @@ public:
 /// in the LLVM IR representation.
 ///
 class SrcValueSDNode : public SDNode {
-  const Value *V;
-
   friend class SelectionDAG;
 
+  const Value *V;
+
   /// Create a SrcValue for a general value.
   explicit SrcValueSDNode(const Value *v)
     : SDNode(ISD::SRCVALUE, 0, DebugLoc(), getSDVTList(MVT::Other)), V(v) {}
@@ -1722,10 +1728,10 @@ public:
 };
 
 class MDNodeSDNode : public SDNode {
-  const MDNode *MD;
-
   friend class SelectionDAG;
 
+  const MDNode *MD;
+
   explicit MDNodeSDNode(const MDNode *md)
   : SDNode(ISD::MDNODE_SDNODE, 0, DebugLoc(), getSDVTList(MVT::Other)), MD(md)
   {}
@@ -1739,10 +1745,10 @@ public:
 };
 
 class RegisterSDNode : public SDNode {
-  unsigned Reg;
-
   friend class SelectionDAG;
 
+  unsigned Reg;
+
   RegisterSDNode(unsigned reg, EVT VT)
     : SDNode(ISD::Register, 0, DebugLoc(), getSDVTList(VT)), Reg(reg) {}
 
@@ -1755,11 +1761,11 @@ public:
 };
 
 class RegisterMaskSDNode : public SDNode {
+  friend class SelectionDAG;
+
   // The memory for RegMask is not owned by the node.
   const uint32_t *RegMask;
 
-  friend class SelectionDAG;
-
   RegisterMaskSDNode(const uint32_t *mask)
     : SDNode(ISD::RegisterMask, 0, DebugLoc(), getSDVTList(MVT::Untyped)),
       RegMask(mask) {}
@@ -1773,12 +1779,12 @@ public:
 };
 
 class BlockAddressSDNode : public SDNode {
+  friend class SelectionDAG;
+
   const BlockAddress *BA;
   int64_t Offset;
   unsigned char TargetFlags;
 
-  friend class SelectionDAG;
-
   BlockAddressSDNode(unsigned NodeTy, EVT VT, const BlockAddress *ba,
                      int64_t o, unsigned char Flags)
     : SDNode(NodeTy, 0, DebugLoc(), getSDVTList(VT)),
@@ -1797,10 +1803,10 @@ public:
 };
 
 class EHLabelSDNode : public SDNode {
-  MCSymbol *Label;
-
   friend class SelectionDAG;
 
+  MCSymbol *Label;
+
   EHLabelSDNode(unsigned Order, const DebugLoc &dl, MCSymbol *L)
       : SDNode(ISD::EH_LABEL, Order, dl, getSDVTList(MVT::Other)), Label(L) {}
 
@@ -1813,11 +1819,11 @@ public:
 };
 
 class ExternalSymbolSDNode : public SDNode {
+  friend class SelectionDAG;
+
   const char *Symbol;
   unsigned char TargetFlags;
 
-  friend class SelectionDAG;
-
   ExternalSymbolSDNode(bool isTarget, const char *Sym, unsigned char TF, EVT VT)
     : SDNode(isTarget ? ISD::TargetExternalSymbol : ISD::ExternalSymbol,
              0, DebugLoc(), getSDVTList(VT)), Symbol(Sym), TargetFlags(TF) {}
@@ -1833,9 +1839,10 @@ public:
 };
 
 class MCSymbolSDNode : public SDNode {
+  friend class SelectionDAG;
+
   MCSymbol *Symbol;
 
-  friend class SelectionDAG;
   MCSymbolSDNode(MCSymbol *Symbol, EVT VT)
       : SDNode(ISD::MCSymbol, 0, DebugLoc(), getSDVTList(VT)), Symbol(Symbol) {}
 
@@ -1848,10 +1855,10 @@ public:
 };
 
 class CondCodeSDNode : public SDNode {
-  ISD::CondCode Condition;
-
   friend class SelectionDAG;
 
+  ISD::CondCode Condition;
+
   explicit CondCodeSDNode(ISD::CondCode Cond)
     : SDNode(ISD::CONDCODE, 0, DebugLoc(), getSDVTList(MVT::Other)),
       Condition(Cond) {}
@@ -1867,10 +1874,10 @@ public:
 /// This class is used to represent EVT's, which are used
 /// to parameterize some operations.
 class VTSDNode : public SDNode {
-  EVT ValueType;
-
   friend class SelectionDAG;
 
+  EVT ValueType;
+
   explicit VTSDNode(EVT VT)
     : SDNode(ISD::VALUETYPE, 0, DebugLoc(), getSDVTList(MVT::Other)),
       ValueType(VT) {}
@@ -1999,6 +2006,7 @@ public:
 class MaskedLoadSDNode : public MaskedLoadStoreSDNode {
 public:
   friend class SelectionDAG;
+
   MaskedLoadSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
                    ISD::LoadExtType ETy, bool IsExpanding, EVT MemVT,
                    MachineMemOperand *MMO)
@@ -2118,11 +2126,11 @@ private:
   friend class SelectionDAG;
 
   MachineSDNode(unsigned Opc, unsigned Order, const DebugLoc &DL, SDVTList VTs)
-      : SDNode(Opc, Order, DL, VTs), MemRefs(nullptr), MemRefsEnd(nullptr) {}
+      : SDNode(Opc, Order, DL, VTs) {}
 
   /// Memory reference descriptions for this instruction.
-  mmo_iterator MemRefs;
-  mmo_iterator MemRefsEnd;
+  mmo_iterator MemRefs = nullptr;
+  mmo_iterator MemRefsEnd = nullptr;
 
 public:
   mmo_iterator memoperands_begin() const { return MemRefs; }
@@ -2188,9 +2196,11 @@ template <> struct GraphTraits<SDNode*> {
   typedef SDNodeIterator ChildIteratorType;
 
   static NodeRef getEntryNode(SDNode *N) { return N; }
+
   static ChildIteratorType child_begin(NodeRef N) {
     return SDNodeIterator::begin(N);
   }
+
   static ChildIteratorType child_end(NodeRef N) {
     return SDNodeIterator::end(N);
   }
diff --git a/include/llvm/CodeGen/SlotIndexes.h b/include/llvm/CodeGen/SlotIndexes.h
index 2ac3b3d86cb6677c44002d009b37864bfa152bfa..14fc3a499a082dc9bb707c2d769692d01c7ef753 100644
--- a/include/llvm/CodeGen/SlotIndexes.h
+++ b/include/llvm/CodeGen/SlotIndexes.h
@@ -602,19 +602,15 @@ namespace llvm {
       return newIndex;
     }
 
-    /// Remove the given machine instruction from the mapping.
-    void removeMachineInstrFromMaps(MachineInstr &MI) {
-      // remove index -> MachineInstr and
-      // MachineInstr -> index mappings
-      Mi2IndexMap::iterator mi2iItr = mi2iMap.find(&MI);
-      if (mi2iItr != mi2iMap.end()) {
-        IndexListEntry *miEntry(mi2iItr->second.listEntry());
-        assert(miEntry->getInstr() == &MI && "Instruction indexes broken.");
-        // FIXME: Eventually we want to actually delete these indexes.
-        miEntry->setInstr(nullptr);
-        mi2iMap.erase(mi2iItr);
-      }
-    }
+    /// Removes machine instruction (bundle) \p MI from the mapping.
+    /// This should be called before MachineInstr::eraseFromParent() is used to
+    /// remove a whole bundle or an unbundled instruction.
+    void removeMachineInstrFromMaps(MachineInstr &MI);
+
+    /// Removes a single machine instruction \p MI from the mapping.
+    /// This should be called before MachineInstr::eraseFromBundle() is used to
+    /// remove a single instruction (out of a bundle).
+    void removeSingleMachineInstrFromMaps(MachineInstr &MI);
 
     /// ReplaceMachineInstrInMaps - Replacing a machine instr with a new one in
     /// maps used by register allocator. \returns the index where the new
diff --git a/include/llvm/CodeGen/StackMaps.h b/include/llvm/CodeGen/StackMaps.h
index 7b55b796863583f07f033e85f2bf24c9db50bb15..a18936feea7b0c56ac7ac1a8c808520dfee815bc 100644
--- a/include/llvm/CodeGen/StackMaps.h
+++ b/include/llvm/CodeGen/StackMaps.h
@@ -1,4 +1,4 @@
-//===------------------- StackMaps.h - StackMaps ----------------*- C++ -*-===//
+//===- StackMaps.h - StackMaps ----------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -13,7 +13,11 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/MC/MCSymbol.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/Support/Debug.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
 #include <vector>
 
 namespace llvm {
@@ -21,6 +25,9 @@ namespace llvm {
 class AsmPrinter;
 class MCExpr;
 class MCStreamer;
+class MCSymbol;
+class raw_ostream;
+class TargetRegisterInfo;
 
 /// \brief MI-level stackmap operands.
 ///
@@ -189,21 +196,22 @@ public:
       Constant,
       ConstantIndex
     };
-    LocationType Type;
-    unsigned Size;
-    unsigned Reg;
-    int64_t Offset;
-    Location() : Type(Unprocessed), Size(0), Reg(0), Offset(0) {}
+    LocationType Type = Unprocessed;
+    unsigned Size = 0;
+    unsigned Reg = 0;
+    int64_t Offset = 0;
+
+    Location() = default;
     Location(LocationType Type, unsigned Size, unsigned Reg, int64_t Offset)
         : Type(Type), Size(Size), Reg(Reg), Offset(Offset) {}
   };
 
   struct LiveOutReg {
-    unsigned short Reg;
-    unsigned short DwarfRegNum;
-    unsigned short Size;
+    unsigned short Reg = 0;
+    unsigned short DwarfRegNum = 0;
+    unsigned short Size = 0;
 
-    LiveOutReg() : Reg(0), DwarfRegNum(0), Size(0) {}
+    LiveOutReg() = default;
     LiveOutReg(unsigned short Reg, unsigned short DwarfRegNum,
                unsigned short Size)
         : Reg(Reg), DwarfRegNum(DwarfRegNum), Size(Size) {}
@@ -245,18 +253,20 @@ private:
   typedef MapVector<uint64_t, uint64_t> ConstantPool;
 
   struct FunctionInfo {
-    uint64_t StackSize;
-    uint64_t RecordCount;
-    FunctionInfo() : StackSize(0), RecordCount(1) {}
-    explicit FunctionInfo(uint64_t StackSize) : StackSize(StackSize), RecordCount(1) {}
+    uint64_t StackSize = 0;
+    uint64_t RecordCount = 1;
+
+    FunctionInfo() = default;
+    explicit FunctionInfo(uint64_t StackSize) : StackSize(StackSize) {}
   };
 
   struct CallsiteInfo {
-    const MCExpr *CSOffsetExpr;
-    uint64_t ID;
+    const MCExpr *CSOffsetExpr = nullptr;
+    uint64_t ID = 0;
     LocationVec Locations;
     LiveOutVec LiveOuts;
-    CallsiteInfo() : CSOffsetExpr(nullptr), ID(0) {}
+
+    CallsiteInfo() = default;
     CallsiteInfo(const MCExpr *CSOffsetExpr, uint64_t ID,
                  LocationVec &&Locations, LiveOutVec &&LiveOuts)
         : CSOffsetExpr(CSOffsetExpr), ID(ID), Locations(std::move(Locations)),
@@ -309,6 +319,7 @@ private:
   void print(raw_ostream &OS);
   void debug() { print(dbgs()); }
 };
-}
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_STACKMAPS_H
diff --git a/include/llvm/CodeGen/StackProtector.h b/include/llvm/CodeGen/StackProtector.h
index 1b3c0eb4a4d0ac2815086dd77d21e377eefd6dca..0655f19a323e49a83d5fd387074bf03240cc71c3 100644
--- a/include/llvm/CodeGen/StackProtector.h
+++ b/include/llvm/CodeGen/StackProtector.h
@@ -1,4 +1,4 @@
-//===-- StackProtector.h - Stack Protector Insertion ----------------------===//
+//===- StackProtector.h - Stack Protector Insertion -------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -23,8 +23,10 @@
 #include "llvm/IR/ValueMap.h"
 #include "llvm/Pass.h"
 #include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
+
 class Function;
 class Module;
 class PHINode;
@@ -48,11 +50,11 @@ public:
   typedef ValueMap<const AllocaInst *, SSPLayoutKind> SSPLayoutMap;
 
 private:
-  const TargetMachine *TM;
+  const TargetMachine *TM = nullptr;
 
   /// TLI - Keep a pointer of a TargetLowering to consult for determining
   /// target type sizes.
-  const TargetLoweringBase *TLI;
+  const TargetLoweringBase *TLI = nullptr;
   const Triple Trip;
 
   Function *F;
@@ -67,7 +69,7 @@ private:
 
   /// \brief The minimum size of buffers that will receive stack smashing
   /// protection when -fstack-protection is used.
-  unsigned SSPBufferSize;
+  unsigned SSPBufferSize = 0;
 
   /// VisitedPHIs - The set of PHI nodes visited when determining
   /// if a variable's reference has been taken.  This set
@@ -111,12 +113,13 @@ private:
 
 public:
   static char ID; // Pass identification, replacement for typeid.
-  StackProtector()
-      : FunctionPass(ID), TM(nullptr), TLI(nullptr), SSPBufferSize(0) {
+
+  StackProtector() : FunctionPass(ID) {
     initializeStackProtectorPass(*PassRegistry::getPassRegistry());
   }
+
   StackProtector(const TargetMachine *TM)
-      : FunctionPass(ID), TM(TM), TLI(nullptr), Trip(TM->getTargetTriple()),
+      : FunctionPass(ID), TM(TM), Trip(TM->getTargetTriple()),
         SSPBufferSize(8) {
     initializeStackProtectorPass(*PassRegistry::getPassRegistry());
   }
@@ -134,6 +137,7 @@ public:
 
   bool runOnFunction(Function &Fn) override;
 };
+
 } // end namespace llvm
 
 #endif // LLVM_CODEGEN_STACKPROTECTOR_H
diff --git a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
index cc71fa3918a1332278380dbb6cb898a7aca20dab..adf2b3ea1c9b3f79883330bf30f1725cda33da8d 100644
--- a/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
+++ b/include/llvm/CodeGen/TargetLoweringObjectFileImpl.h
@@ -1,4 +1,4 @@
-//==-- llvm/CodeGen/TargetLoweringObjectFileImpl.h - Object Info -*- C++ -*-==//
+//==- llvm/CodeGen/TargetLoweringObjectFileImpl.h - Object Info --*- C++ -*-==//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,24 +15,22 @@
 #ifndef LLVM_CODEGEN_TARGETLOWERINGOBJECTFILEIMPL_H
 #define LLVM_CODEGEN_TARGETLOWERINGOBJECTFILEIMPL_H
 
+#include "llvm/IR/Module.h"
 #include "llvm/MC/MCExpr.h"
-#include "llvm/MC/SectionKind.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
 
 namespace llvm {
-  class MachineModuleInfo;
-  class Mangler;
-  class MCAsmInfo;
-  class MCSection;
-  class MCSectionMachO;
-  class MCSymbol;
-  class MCContext;
-  class GlobalValue;
-  class TargetMachine;
 
+class GlobalValue;
+class MachineModuleInfo;
+class Mangler;
+class MCContext;
+class MCSection;
+class MCSymbol;
+class TargetMachine;
 
 class TargetLoweringObjectFileELF : public TargetLoweringObjectFile {
-  bool UseInitArray;
+  bool UseInitArray = false;
   mutable unsigned NextUniqueID = 1;  // ID 0 is reserved for execute-only sections
 
 protected:
@@ -40,9 +38,8 @@ protected:
       MCSymbolRefExpr::VK_None;
 
 public:
-  TargetLoweringObjectFileELF() : UseInitArray(false) {}
-
-  ~TargetLoweringObjectFileELF() override {}
+  TargetLoweringObjectFileELF() = default;
+  ~TargetLoweringObjectFileELF() override = default;
 
   void emitPersonalityValue(MCStreamer &Streamer, const DataLayout &TM,
                             const MCSymbol *Sym) const override;
@@ -89,12 +86,10 @@ public:
                                        const TargetMachine &TM) const override;
 };
 
-
-
 class TargetLoweringObjectFileMachO : public TargetLoweringObjectFile {
 public:
-  ~TargetLoweringObjectFileMachO() override {}
   TargetLoweringObjectFileMachO();
+  ~TargetLoweringObjectFileMachO() override = default;
 
   void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
 
@@ -135,13 +130,11 @@ public:
                          const TargetMachine &TM) const override;
 };
 
-
-
 class TargetLoweringObjectFileCOFF : public TargetLoweringObjectFile {
   mutable unsigned NextUniqueID = 0;
 
 public:
-  ~TargetLoweringObjectFileCOFF() override {}
+  ~TargetLoweringObjectFileCOFF() override = default;
 
   void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
   MCSection *getExplicitSectionGlobal(const GlobalObject *GO, SectionKind Kind,
@@ -171,6 +164,29 @@ public:
                                 const GlobalValue *GV) const override;
 };
 
+class TargetLoweringObjectFileWasm : public TargetLoweringObjectFile {
+  mutable unsigned NextUniqueID = 0;
+
+public:
+  TargetLoweringObjectFileWasm() = default;
+  ~TargetLoweringObjectFileWasm() override = default;
+
+  MCSection *getExplicitSectionGlobal(const GlobalObject *GO, SectionKind Kind,
+                                      const TargetMachine &TM) const override;
+
+  MCSection *SelectSectionForGlobal(const GlobalObject *GO, SectionKind Kind,
+                                    const TargetMachine &TM) const override;
+
+  bool shouldPutJumpTableInFunctionSection(bool UsesLabelDifference,
+                                           const Function &F) const override;
+
+  void InitializeWasm();
+
+  const MCExpr *lowerRelativeReference(const GlobalValue *LHS,
+                                       const GlobalValue *RHS,
+                                       const TargetMachine &TM) const override;
+};
+
 } // end namespace llvm
 
-#endif
+#endif // LLVM_CODEGEN_TARGETLOWERINGOBJECTFILEIMPL_H
diff --git a/include/llvm/CodeGen/TargetPassConfig.h b/include/llvm/CodeGen/TargetPassConfig.h
index 2287f9aca4bffe9e22049ca84699aab1d6797353..f0c826dc1d457cf6637809fc2863b7e52e9f0ad1 100644
--- a/include/llvm/CodeGen/TargetPassConfig.h
+++ b/include/llvm/CodeGen/TargetPassConfig.h
@@ -115,6 +115,10 @@ protected:
   /// Default setting for -enable-tail-merge on this target.
   bool EnableTailMerge;
 
+  /// Require processing of functions such that callees are generated before
+  /// callers.
+  bool RequireCodeGenSCCOrder;
+
 public:
   TargetPassConfig(TargetMachine *tm, PassManagerBase &pm);
   // Dummy constructor.
@@ -162,6 +166,11 @@ public:
   bool getEnableTailMerge() const { return EnableTailMerge; }
   void setEnableTailMerge(bool Enable) { setOpt(EnableTailMerge, Enable); }
 
+  bool requiresCodeGenSCCOrder() const { return RequireCodeGenSCCOrder; }
+  void setRequiresCodeGenSCCOrder(bool Enable = true) {
+    setOpt(RequireCodeGenSCCOrder, Enable);
+  }
+
   /// Allow the target to override a specific pass without overriding the pass
   /// pipeline. When passes are added to the standard pipeline at the
   /// point where StandardID is expected, add TargetID in its place.
@@ -286,6 +295,10 @@ public:
   /// verification is enabled.
   void addVerifyPass(const std::string &Banner);
 
+  /// Check whether or not GlobalISel should be enabled by default.
+  /// Fallback/abort behavior is controlled via other methods.
+  virtual bool isGlobalISelEnabled() const;
+
   /// Check whether or not GlobalISel should abort on error.
   /// When this is disable, GlobalISel will fall back on SDISel instead of
   /// erroring out.
diff --git a/include/llvm/CodeGen/TargetSchedule.h b/include/llvm/CodeGen/TargetSchedule.h
index 81054aba066f6d14e4fbbbf89ef85d5f78a33e45..0c5a84e0e3b8f25dfb26cc15416cc818ff77c913 100644
--- a/include/llvm/CodeGen/TargetSchedule.h
+++ b/include/llvm/CodeGen/TargetSchedule.h
@@ -1,4 +1,4 @@
-//===-- llvm/CodeGen/TargetSchedule.h - Sched Machine Model -----*- C++ -*-===//
+//===- llvm/CodeGen/TargetSchedule.h - Sched Machine Model ------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -23,10 +23,8 @@
 
 namespace llvm {
 
-class TargetRegisterInfo;
-class TargetSubtargetInfo;
-class TargetInstrInfo;
 class MachineInstr;
+class TargetInstrInfo;
 
 /// Provide an instruction scheduling machine model to CodeGen passes.
 class TargetSchedModel {
@@ -34,8 +32,8 @@ class TargetSchedModel {
   // processor.
   MCSchedModel SchedModel;
   InstrItineraryData InstrItins;
-  const TargetSubtargetInfo *STI;
-  const TargetInstrInfo *TII;
+  const TargetSubtargetInfo *STI = nullptr;
+  const TargetInstrInfo *TII = nullptr;
 
   SmallVector<unsigned, 16> ResourceFactors;
   unsigned MicroOpFactor; // Multiply to normalize microops to resource units.
@@ -44,7 +42,7 @@ class TargetSchedModel {
   unsigned computeInstrLatency(const MCSchedClassDesc &SCDesc) const;
 
 public:
-  TargetSchedModel(): SchedModel(MCSchedModel::GetDefaultSchedModel()), STI(nullptr), TII(nullptr) {}
+  TargetSchedModel() : SchedModel(MCSchedModel::GetDefaultSchedModel()) {}
 
   /// \brief Initialize the machine model for instruction scheduling.
   ///
@@ -93,6 +91,13 @@ public:
   /// \brief Maximum number of micro-ops that may be scheduled per cycle.
   unsigned getIssueWidth() const { return SchedModel.IssueWidth; }
 
+  /// \brief Return true if a new group must begin.
+  bool mustBeginGroup(const MachineInstr *MI,
+                      const MCSchedClassDesc *SC = nullptr) const;
+  /// \brief Return true if the current group must end.
+  bool mustEndGroup(const MachineInstr *MI,
+                    const MCSchedClassDesc *SC = nullptr) const;
+
   /// \brief Return the number of issue slots required for this MI.
   unsigned getNumMicroOps(const MachineInstr *MI,
                           const MCSchedClassDesc *SC = nullptr) const;
@@ -178,6 +183,7 @@ public:
                                bool UseDefaultDefLatency = true) const;
   unsigned computeInstrLatency(unsigned Opcode) const;
 
+
   /// \brief Output dependency latency of a pair of defs of the same register.
   ///
   /// This is typically one cycle.
@@ -185,6 +191,6 @@ public:
                                 const MachineInstr *DepMI) const;
 };
 
-} // namespace llvm
+} // end namespace llvm
 
-#endif
+#endif // LLVM_CODEGEN_TARGETSCHEDULE_H
diff --git a/include/llvm/CodeGen/ValueTypes.h b/include/llvm/CodeGen/ValueTypes.h
index 2699fa28f0f18f6253919c7467ae401769e9114a..0a3063663cef81fab397e4691880a229d6fb57d5 100644
--- a/include/llvm/CodeGen/ValueTypes.h
+++ b/include/llvm/CodeGen/ValueTypes.h
@@ -25,9 +25,9 @@ namespace llvm {
   class LLVMContext;
   class Type;
 
-  /// EVT - Extended Value Type.  Capable of holding value types which are not
-  /// native for any processor (such as the i12345 type), as well as the types
-  /// a MVT can represent.
+  /// Extended Value Type. Capable of holding value types which are not native
+  /// for any processor (such as the i12345 type), as well as the types an MVT
+  /// can represent.
   struct EVT {
   private:
     MVT V;
@@ -49,15 +49,15 @@ namespace llvm {
       return false;
     }
 
-    /// getFloatingPointVT - Returns the EVT that represents a floating point
-    /// type with the given number of bits.  There are two floating point types
-    /// with 128 bits - this returns f128 rather than ppcf128.
+    /// Returns the EVT that represents a floating-point type with the given
+    /// number of bits. There are two floating-point types with 128 bits - this
+    /// returns f128 rather than ppcf128.
     static EVT getFloatingPointVT(unsigned BitWidth) {
       return MVT::getFloatingPointVT(BitWidth);
     }
 
-    /// getIntegerVT - Returns the EVT that represents an integer with the given
-    /// number of bits.
+    /// Returns the EVT that represents an integer with the given number of
+    /// bits.
     static EVT getIntegerVT(LLVMContext &Context, unsigned BitWidth) {
       MVT M = MVT::getIntegerVT(BitWidth);
       if (M.SimpleTy >= 0)
@@ -65,8 +65,8 @@ namespace llvm {
       return getExtendedIntegerVT(Context, BitWidth);
     }
 
-    /// getVectorVT - Returns the EVT that represents a vector NumElements in
-    /// length, where each element is of type VT.
+    /// Returns the EVT that represents a vector NumElements in length, where
+    /// each element is of type VT.
     static EVT getVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements) {
       MVT M = MVT::getVectorVT(VT.V, NumElements);
       if (M.SimpleTy >= 0)
@@ -74,9 +74,9 @@ namespace llvm {
       return getExtendedVectorVT(Context, VT, NumElements);
     }
 
-    /// changeVectorElementTypeToInteger - Return a vector with the same number
-    /// of elements as this vector, but with the element type converted to an
-    /// integer type with the same bitwidth.
+    /// Return a vector with the same number of elements as this vector, but
+    /// with the element type converted to an integer type with the same
+    /// bitwidth.
     EVT changeVectorElementTypeToInteger() const {
       if (!isSimple())
         return changeExtendedVectorElementTypeToInteger();
@@ -102,140 +102,136 @@ namespace llvm {
       return changeExtendedTypeToInteger();
     }
 
-    /// isSimple - Test if the given EVT is simple (as opposed to being
-    /// extended).
+    /// Test if the given EVT is simple (as opposed to being extended).
     bool isSimple() const {
       return V.SimpleTy >= 0;
     }
 
-    /// isExtended - Test if the given EVT is extended (as opposed to
-    /// being simple).
+    /// Test if the given EVT is extended (as opposed to being simple).
     bool isExtended() const {
       return !isSimple();
     }
 
-    /// isFloatingPoint - Return true if this is a FP, or a vector FP type.
+    /// Return true if this is a FP or a vector FP type.
     bool isFloatingPoint() const {
       return isSimple() ? V.isFloatingPoint() : isExtendedFloatingPoint();
     }
 
-    /// isInteger - Return true if this is an integer, or a vector integer type.
+    /// Return true if this is an integer or a vector integer type.
     bool isInteger() const {
       return isSimple() ? V.isInteger() : isExtendedInteger();
     }
 
-    /// isScalarInteger - Return true if this is an integer, but not a vector.
+    /// Return true if this is an integer, but not a vector.
     bool isScalarInteger() const {
       return isSimple() ? V.isScalarInteger() : isExtendedScalarInteger();
     }
 
-    /// isVector - Return true if this is a vector value type.
+    /// Return true if this is a vector value type.
     bool isVector() const {
       return isSimple() ? V.isVector() : isExtendedVector();
     }
 
-    /// is16BitVector - Return true if this is a 16-bit vector type.
+    /// Return true if this is a 16-bit vector type.
     bool is16BitVector() const {
       return isSimple() ? V.is16BitVector() : isExtended16BitVector();
     }
 
-    /// is32BitVector - Return true if this is a 32-bit vector type.
+    /// Return true if this is a 32-bit vector type.
     bool is32BitVector() const {
       return isSimple() ? V.is32BitVector() : isExtended32BitVector();
     }
 
-    /// is64BitVector - Return true if this is a 64-bit vector type.
+    /// Return true if this is a 64-bit vector type.
     bool is64BitVector() const {
       return isSimple() ? V.is64BitVector() : isExtended64BitVector();
     }
 
-    /// is128BitVector - Return true if this is a 128-bit vector type.
+    /// Return true if this is a 128-bit vector type.
     bool is128BitVector() const {
       return isSimple() ? V.is128BitVector() : isExtended128BitVector();
     }
 
-    /// is256BitVector - Return true if this is a 256-bit vector type.
+    /// Return true if this is a 256-bit vector type.
     bool is256BitVector() const {
       return isSimple() ? V.is256BitVector() : isExtended256BitVector();
     }
 
-    /// is512BitVector - Return true if this is a 512-bit vector type.
+    /// Return true if this is a 512-bit vector type.
     bool is512BitVector() const {
       return isSimple() ? V.is512BitVector() : isExtended512BitVector();
     }
 
-    /// is1024BitVector - Return true if this is a 1024-bit vector type.
+    /// Return true if this is a 1024-bit vector type.
     bool is1024BitVector() const {
       return isSimple() ? V.is1024BitVector() : isExtended1024BitVector();
     }
 
-    /// is2048BitVector - Return true if this is a 2048-bit vector type.
+    /// Return true if this is a 2048-bit vector type.
     bool is2048BitVector() const {
       return isSimple() ? V.is2048BitVector() : isExtended2048BitVector();
     }
 
-    /// isOverloaded - Return true if this is an overloaded type for TableGen.
+    /// Return true if this is an overloaded type for TableGen.
     bool isOverloaded() const {
       return (V==MVT::iAny || V==MVT::fAny || V==MVT::vAny || V==MVT::iPTRAny);
     }
 
-    /// isByteSized - Return true if the bit size is a multiple of 8.
+    /// Return true if the bit size is a multiple of 8.
     bool isByteSized() const {
       return (getSizeInBits() & 7) == 0;
     }
 
-    /// isRound - Return true if the size is a power-of-two number of bytes.
+    /// Return true if the size is a power-of-two number of bytes.
     bool isRound() const {
       unsigned BitSize = getSizeInBits();
       return BitSize >= 8 && !(BitSize & (BitSize - 1));
     }
 
-    /// bitsEq - Return true if this has the same number of bits as VT.
+    /// Return true if this has the same number of bits as VT.
     bool bitsEq(EVT VT) const {
       if (EVT::operator==(VT)) return true;
       return getSizeInBits() == VT.getSizeInBits();
     }
 
-    /// bitsGT - Return true if this has more bits than VT.
+    /// Return true if this has more bits than VT.
     bool bitsGT(EVT VT) const {
       if (EVT::operator==(VT)) return false;
       return getSizeInBits() > VT.getSizeInBits();
     }
 
-    /// bitsGE - Return true if this has no less bits than VT.
+    /// Return true if this has no fewer bits than VT.
     bool bitsGE(EVT VT) const {
       if (EVT::operator==(VT)) return true;
       return getSizeInBits() >= VT.getSizeInBits();
     }
 
-    /// bitsLT - Return true if this has less bits than VT.
+    /// Return true if this has fewer bits than VT.
     bool bitsLT(EVT VT) const {
       if (EVT::operator==(VT)) return false;
       return getSizeInBits() < VT.getSizeInBits();
     }
 
-    /// bitsLE - Return true if this has no more bits than VT.
+    /// Return true if this has no more bits than VT.
     bool bitsLE(EVT VT) const {
       if (EVT::operator==(VT)) return true;
       return getSizeInBits() <= VT.getSizeInBits();
     }
 
 
-    /// getSimpleVT - Return the SimpleValueType held in the specified
-    /// simple EVT.
+    /// Return the SimpleValueType held in the specified simple EVT.
     MVT getSimpleVT() const {
       assert(isSimple() && "Expected a SimpleValueType!");
       return V;
     }
 
-    /// getScalarType - If this is a vector type, return the element type,
-    /// otherwise return this.
+    /// If this is a vector type, return the element type, otherwise return
+    /// this.
     EVT getScalarType() const {
       return isVector() ? getVectorElementType() : *this;
     }
 
-    /// getVectorElementType - Given a vector type, return the type of
-    /// each element.
+    /// Given a vector type, return the type of each element.
     EVT getVectorElementType() const {
       assert(isVector() && "Invalid vector type!");
       if (isSimple())
@@ -243,8 +239,7 @@ namespace llvm {
       return getExtendedVectorElementType();
     }
 
-    /// getVectorNumElements - Given a vector type, return the number of
-    /// elements it contains.
+    /// Given a vector type, return the number of elements it contains.
     unsigned getVectorNumElements() const {
       assert(isVector() && "Invalid vector type!");
       if (isSimple())
@@ -252,7 +247,7 @@ namespace llvm {
       return getExtendedVectorNumElements();
     }
 
-    /// getSizeInBits - Return the size of the specified value type in bits.
+    /// Return the size of the specified value type in bits.
     unsigned getSizeInBits() const {
       if (isSimple())
         return V.getSizeInBits();
@@ -263,21 +258,21 @@ namespace llvm {
       return getScalarType().getSizeInBits();
     }
 
-    /// getStoreSize - Return the number of bytes overwritten by a store
-    /// of the specified value type.
+    /// Return the number of bytes overwritten by a store of the specified value
+    /// type.
     unsigned getStoreSize() const {
       return (getSizeInBits() + 7) / 8;
     }
 
-    /// getStoreSizeInBits - Return the number of bits overwritten by a store
-    /// of the specified value type.
+    /// Return the number of bits overwritten by a store of the specified value
+    /// type.
     unsigned getStoreSizeInBits() const {
       return getStoreSize() * 8;
     }
 
-    /// getRoundIntegerType - Rounds the bit-width of the given integer EVT up
-    /// to the nearest power of two (and at least to eight), and returns the
-    /// integer EVT with that number of bits.
+    /// Rounds the bit-width of the given integer EVT up to the nearest power of
+    /// two (and at least to eight), and returns the integer EVT with that
+    /// number of bits.
     EVT getRoundIntegerType(LLVMContext &Context) const {
       assert(isInteger() && !isVector() && "Invalid integer type!");
       unsigned BitWidth = getSizeInBits();
@@ -286,10 +281,9 @@ namespace llvm {
       return getIntegerVT(Context, 1 << Log2_32_Ceil(BitWidth));
     }
 
-    /// getHalfSizedIntegerVT - Finds the smallest simple value type that is
-    /// greater than or equal to half the width of this EVT. If no simple
-    /// value type can be found, an extended integer value type of half the
-    /// size (rounded up) is returned.
+    /// Finds the smallest simple value type that is greater than or equal to
+    /// half the width of this EVT. If no simple value type can be found, an
+    /// extended integer value type of half the size (rounded up) is returned.
     EVT getHalfSizedIntegerVT(LLVMContext &Context) const {
       assert(isInteger() && !isVector() && "Invalid integer type!");
       unsigned EVTSize = getSizeInBits();
@@ -302,7 +296,7 @@ namespace llvm {
       return getIntegerVT(Context, (EVTSize + 1) / 2);
     }
 
-    /// \brief Return a VT for an integer vector type with the size of the
+    /// Return a VT for an integer vector type with the size of the
     /// elements doubled. The typed returned may be an extended type.
     EVT widenIntegerVectorElementType(LLVMContext &Context) const {
       EVT EltVT = getVectorElementType();
@@ -310,14 +304,14 @@ namespace llvm {
       return EVT::getVectorVT(Context, EltVT, getVectorNumElements());
     }
 
-    /// isPow2VectorType - Returns true if the given vector is a power of 2.
+    /// Returns true if the given vector's element count is a power of 2.
     bool isPow2VectorType() const {
       unsigned NElts = getVectorNumElements();
       return !(NElts & (NElts - 1));
     }
 
-    /// getPow2VectorType - Widens the length of the given vector EVT up to
-    /// the nearest power of 2 and returns that type.
+    /// Widens the length of the given vector EVT up to the nearest power of 2
+    /// and returns that type.
     EVT getPow2VectorType(LLVMContext &Context) const {
       if (!isPow2VectorType()) {
         unsigned NElts = getVectorNumElements();
@@ -329,16 +323,15 @@ namespace llvm {
       }
     }
 
-    /// getEVTString - This function returns value type as a string,
-    /// e.g. "i32".
+    /// This function returns value type as a string, e.g. "i32".
     std::string getEVTString() const;
 
-    /// getTypeForEVT - This method returns an LLVM type corresponding to the
-    /// specified EVT.  For integer types, this returns an unsigned type.  Note
-    /// that this will abort for types that cannot be represented.
+    /// This method returns an LLVM type corresponding to the specified EVT.
+    /// For integer types, this returns an unsigned type. Note that this will
+    /// abort for types that cannot be represented.
     Type *getTypeForEVT(LLVMContext &Context) const;
 
-    /// getEVT - Return the value type corresponding to the specified type.
+    /// Return the value type corresponding to the specified type.
     /// This returns all pointers as iPTR.  If HandleUnknown is true, unknown
     /// types are returned as Other, otherwise they are invalid.
     static EVT getEVT(Type *Ty, bool HandleUnknown = false);
@@ -350,8 +343,8 @@ namespace llvm {
         return (intptr_t)(LLVMTy);
     }
 
-    /// compareRawBits - A meaningless but well-behaved order, useful for
-    /// constructing containers.
+    /// A meaningless but well-behaved order, useful for constructing
+    /// containers.
     struct compareRawBits {
       bool operator()(EVT L, EVT R) const {
         if (L.V.SimpleTy == R.V.SimpleTy)
diff --git a/include/llvm/Config/abi-breaking.h.cmake b/include/llvm/Config/abi-breaking.h.cmake
index e5697f79e93d5c971ef46233a4d08a1e8e81f338..4ce487b8f5f3c67e6e579961809e2789c9286d11 100644
--- a/include/llvm/Config/abi-breaking.h.cmake
+++ b/include/llvm/Config/abi-breaking.h.cmake
@@ -15,9 +15,8 @@
 /* Define to enable checks that alter the LLVM C++ ABI */
 #cmakedefine01 LLVM_ENABLE_ABI_BREAKING_CHECKS
 
-/* Define to disable the link-time checking of mismatch for
-   LLVM_ENABLE_ABI_BREAKING_CHECKS */
-#cmakedefine01 LLVM_DISABLE_ABI_BREAKING_CHECKS_ENFORCING
+/* Allow selectively disabling link-time mismatch checking so that header-only
+   ADT content from LLVM can be used without linking libSupport. */
 #if !LLVM_DISABLE_ABI_BREAKING_CHECKS_ENFORCING
 
 // ABI_BREAKING_CHECKS protection: provides link-time failure when clients build
diff --git a/include/llvm/Config/config.h.cmake b/include/llvm/Config/config.h.cmake
index 30a31ecd27f967b8b88b8f6b12a959fda12c5121..a3c919d39804f8d6b537692a81092cc1ac947a07 100644
--- a/include/llvm/Config/config.h.cmake
+++ b/include/llvm/Config/config.h.cmake
@@ -16,6 +16,8 @@
 /* Define to 1 if you have the `backtrace' function. */
 #cmakedefine HAVE_BACKTRACE ${HAVE_BACKTRACE}
 
+#define BACKTRACE_HEADER <${BACKTRACE_HEADER}>
+
 /* Define to 1 if you have the <CrashReporterClient.h> header file. */
 #cmakedefine HAVE_CRASHREPORTERCLIENT_H
 
@@ -51,6 +53,9 @@
 /* Define if dlopen() is available on this platform. */
 #cmakedefine HAVE_DLOPEN ${HAVE_DLOPEN}
 
+/* Define if dladdr() is available on this platform. */
+#cmakedefine HAVE_DLADDR ${HAVE_DLADDR}
+
 /* Define to 1 if you have the <errno.h> header file. */
 #cmakedefine HAVE_ERRNO_H ${HAVE_ERRNO_H}
 
diff --git a/include/llvm/DebugInfo/CodeView/CVRecord.h b/include/llvm/DebugInfo/CodeView/CVRecord.h
index a327d450db5508f6b3215b0f26fa6dacb3c6c2cd..487f3b6446fa5697bb4e6e96d759cf8187b37a03 100644
--- a/include/llvm/DebugInfo/CodeView/CVRecord.h
+++ b/include/llvm/DebugInfo/CodeView/CVRecord.h
@@ -14,8 +14,8 @@
 #include "llvm/ADT/Optional.h"
 #include "llvm/DebugInfo/CodeView/CodeViewError.h"
 #include "llvm/DebugInfo/CodeView/RecordSerialization.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include <cstdint>
@@ -48,15 +48,13 @@ public:
 
 } // end namespace codeview
 
-namespace msf {
-
 template <typename Kind>
 struct VarStreamArrayExtractor<codeview::CVRecord<Kind>> {
-  Error operator()(ReadableStreamRef Stream, uint32_t &Len,
+  Error operator()(BinaryStreamRef Stream, uint32_t &Len,
                    codeview::CVRecord<Kind> &Item) const {
     using namespace codeview;
     const RecordPrefix *Prefix = nullptr;
-    StreamReader Reader(Stream);
+    BinaryStreamReader Reader(Stream);
     uint32_t Offset = Reader.getOffset();
 
     if (auto EC = Reader.readObject(Prefix))
@@ -76,8 +74,6 @@ struct VarStreamArrayExtractor<codeview::CVRecord<Kind>> {
   }
 };
 
-} // end namespace msf
-
 } // end namespace llvm
 
 #endif // LLVM_DEBUGINFO_CODEVIEW_RECORDITERATOR_H
diff --git a/include/llvm/DebugInfo/CodeView/CVTypeDumper.h b/include/llvm/DebugInfo/CodeView/CVTypeDumper.h
index e1dd6a10b5a16dc112b910d9a3dd43027d7e8dc9..02f14ea2107b2c8486845ef7cb2782375cbc1d75 100644
--- a/include/llvm/DebugInfo/CodeView/CVTypeDumper.h
+++ b/include/llvm/DebugInfo/CodeView/CVTypeDumper.h
@@ -22,10 +22,14 @@ namespace llvm {
 
 namespace codeview {
 
+class TypeServerHandler;
+
 /// Dumper for CodeView type streams found in COFF object files and PDB files.
 class CVTypeDumper {
 public:
-  explicit CVTypeDumper(TypeDatabase &TypeDB) : TypeDB(TypeDB) {}
+  explicit CVTypeDumper(TypeDatabase &TypeDB,
+                        TypeServerHandler *Handler = nullptr)
+      : TypeDB(TypeDB), Handler(Handler) {}
 
   /// Dumps one type record.  Returns false if there was a type parsing error,
   /// and true otherwise.  This should be called in order, since the dumper
@@ -48,6 +52,7 @@ public:
 
 private:
   TypeDatabase &TypeDB;
+  TypeServerHandler *Handler;
 };
 
 } // end namespace codeview
diff --git a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h
index d1b0363a4133d8dfc7c3e08fc8bfd38bc2a39317..e9012db7602d28018474a36126187dfc6e83dae2 100644
--- a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h
+++ b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h
@@ -10,9 +10,10 @@
 #ifndef LLVM_DEBUGINFO_CODEVIEW_CVTYPEVISITOR_H
 #define LLVM_DEBUGINFO_CODEVIEW_CVTYPEVISITOR_H
 
-#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/TinyPtrVector.h"
 #include "llvm/DebugInfo/CodeView/CVRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeServerHandler.h"
 #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
 #include "llvm/Support/Error.h"
 
@@ -23,18 +24,23 @@ class CVTypeVisitor {
 public:
   explicit CVTypeVisitor(TypeVisitorCallbacks &Callbacks);
 
+  void addTypeServerHandler(TypeServerHandler &Handler);
+
   Error visitTypeRecord(CVType &Record);
   Error visitMemberRecord(CVMemberRecord &Record);
 
   /// Visits the type records in Data. Sets the error flag on parse failures.
   Error visitTypeStream(const CVTypeArray &Types);
+  Error visitTypeStream(CVTypeRange Types);
 
   Error visitFieldListMemberStream(ArrayRef<uint8_t> FieldList);
-  Error visitFieldListMemberStream(msf::StreamReader Reader);
+  Error visitFieldListMemberStream(BinaryStreamReader Reader);
 
 private:
   /// The interface to the class that gets notified of each visitation.
   TypeVisitorCallbacks &Callbacks;
+
+  TinyPtrVector<TypeServerHandler *> Handlers;
 };
 
 } // end namespace codeview
diff --git a/include/llvm/DebugInfo/CodeView/CodeView.h b/include/llvm/DebugInfo/CodeView/CodeView.h
index e21cfa3d030ae5111d7757c6bf0515cef26b5539..2791c9dc374651b8c1289cecdb1cc323f36b1dcd 100644
--- a/include/llvm/DebugInfo/CodeView/CodeView.h
+++ b/include/llvm/DebugInfo/CodeView/CodeView.h
@@ -275,6 +275,12 @@ enum class MethodOptions : uint16_t {
 };
 CV_DEFINE_ENUM_CLASS_FLAGS_OPERATORS(MethodOptions)
 
+/// Equivalent to CV_LABEL_TYPE_e.
+enum class LabelType : uint16_t {
+  Near = 0x0,
+  Far  = 0x4,
+};
+
 /// Equivalent to CV_modifier_t.
 /// TODO: Add flag for _Atomic modifier
 enum class ModifierOptions : uint16_t {
diff --git a/include/llvm/DebugInfo/CodeView/CodeViewError.h b/include/llvm/DebugInfo/CodeView/CodeViewError.h
index 0556fd0e19f26add6f46127b0e955ef7aa573a56..586a720ce6e4027d7f307e0b3dff3b88462d6442 100644
--- a/include/llvm/DebugInfo/CodeView/CodeViewError.h
+++ b/include/llvm/DebugInfo/CodeView/CodeViewError.h
@@ -21,6 +21,7 @@ enum class cv_error_code {
   insufficient_buffer,
   operation_unsupported,
   corrupt_record,
+  no_records,
   unknown_member_record,
 };
 
diff --git a/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h b/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h
index 5a036b9d5b6cdf4423a04be61a67ee3c34b4744a..b3976826a316c7f85a128c05e9a58927c28d5f9a 100644
--- a/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h
+++ b/include/llvm/DebugInfo/CodeView/CodeViewRecordIO.h
@@ -17,8 +17,8 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/DebugInfo/CodeView/CodeViewError.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
 #include "llvm/Support/Error.h"
 #include <cassert>
 #include <cstdint>
@@ -33,8 +33,8 @@ class CodeViewRecordIO {
   }
 
 public:
-  explicit CodeViewRecordIO(msf::StreamReader &Reader) : Reader(&Reader) {}
-  explicit CodeViewRecordIO(msf::StreamWriter &Writer) : Writer(&Writer) {}
+  explicit CodeViewRecordIO(BinaryStreamReader &Reader) : Reader(&Reader) {}
+  explicit CodeViewRecordIO(BinaryStreamWriter &Writer) : Writer(&Writer) {}
 
   Error beginRecord(Optional<uint32_t> MaxLength);
   Error endRecord();
@@ -160,8 +160,8 @@ private:
 
   SmallVector<RecordLimit, 2> Limits;
 
-  msf::StreamReader *Reader = nullptr;
-  msf::StreamWriter *Writer = nullptr;
+  BinaryStreamReader *Reader = nullptr;
+  BinaryStreamWriter *Writer = nullptr;
 };
 
 } // end namespace codeview
diff --git a/include/llvm/DebugInfo/CodeView/Formatters.h b/include/llvm/DebugInfo/CodeView/Formatters.h
new file mode 100644
index 0000000000000000000000000000000000000000..37a91098a8b65e3e625b88e21e26adce1a02a11e
--- /dev/null
+++ b/include/llvm/DebugInfo/CodeView/Formatters.h
@@ -0,0 +1,40 @@
+//===- Formatters.h ---------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_CODEVIEW_FORMATTERS_H
+#define LLVM_DEBUGINFO_CODEVIEW_FORMATTERS_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/FormatAdapters.h"
+
+namespace llvm {
+namespace codeview {
+namespace detail {
+class GuidAdapter final : public llvm::FormatAdapter<ArrayRef<uint8_t>> {
+  ArrayRef<uint8_t> Guid; // Presumably the GUID bytes given to the ctors.
+
+public:
+  explicit GuidAdapter(ArrayRef<uint8_t> Guid);
+  explicit GuidAdapter(StringRef Guid);
+  void format(llvm::raw_ostream &Stream, StringRef Style);
+};
+} // end namespace detail
+
+inline detail::GuidAdapter fmt_guid(StringRef Item) {
+  return detail::GuidAdapter(Item);
+}
+
+inline detail::GuidAdapter fmt_guid(ArrayRef<uint8_t> Item) {
+  return detail::GuidAdapter(Item);
+}
+} // end namespace codeview
+} // end namespace llvm
+
+#endif
diff --git a/include/llvm/DebugInfo/CodeView/ModuleSubstream.h b/include/llvm/DebugInfo/CodeView/ModuleSubstream.h
index 8860ae42fc0913028b6f777a7229d20b4a7869b5..a1c5c93cc3f8e14c2d0045e20f2297484793a6af 100644
--- a/include/llvm/DebugInfo/CodeView/ModuleSubstream.h
+++ b/include/llvm/DebugInfo/CodeView/ModuleSubstream.h
@@ -11,8 +11,8 @@
 #define LLVM_DEBUGINFO_CODEVIEW_MODULESUBSTREAM_H
 
 #include "llvm/DebugInfo/CodeView/CodeView.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 
@@ -59,23 +59,22 @@ struct ColumnNumberEntry {
 class ModuleSubstream {
 public:
   ModuleSubstream();
-  ModuleSubstream(ModuleSubstreamKind Kind, msf::ReadableStreamRef Data);
-  static Error initialize(msf::ReadableStreamRef Stream, ModuleSubstream &Info);
+  ModuleSubstream(ModuleSubstreamKind Kind, BinaryStreamRef Data);
+  static Error initialize(BinaryStreamRef Stream, ModuleSubstream &Info);
   uint32_t getRecordLength() const;
   ModuleSubstreamKind getSubstreamKind() const;
-  msf::ReadableStreamRef getRecordData() const;
+  BinaryStreamRef getRecordData() const;
 
 private:
   ModuleSubstreamKind Kind;
-  msf::ReadableStreamRef Data;
+  BinaryStreamRef Data;
 };
 
-typedef msf::VarStreamArray<ModuleSubstream> ModuleSubstreamArray;
+typedef VarStreamArray<ModuleSubstream> ModuleSubstreamArray;
 } // namespace codeview
 
-namespace msf {
 template <> struct VarStreamArrayExtractor<codeview::ModuleSubstream> {
-  Error operator()(ReadableStreamRef Stream, uint32_t &Length,
+  Error operator()(BinaryStreamRef Stream, uint32_t &Length,
                    codeview::ModuleSubstream &Info) const {
     if (auto EC = codeview::ModuleSubstream::initialize(Stream, Info))
       return EC;
@@ -83,7 +82,6 @@ template <> struct VarStreamArrayExtractor<codeview::ModuleSubstream> {
     return Error::success();
   }
 };
-} // namespace msf
 } // namespace llvm
 
 #endif // LLVM_DEBUGINFO_CODEVIEW_MODULESUBSTREAM_H
diff --git a/include/llvm/DebugInfo/CodeView/ModuleSubstreamVisitor.h b/include/llvm/DebugInfo/CodeView/ModuleSubstreamVisitor.h
index f9927d660933395e3915cbec68c170466a07f008..1a40654a3f3379eb1e2a1da77d899bfb8e946091 100644
--- a/include/llvm/DebugInfo/CodeView/ModuleSubstreamVisitor.h
+++ b/include/llvm/DebugInfo/CodeView/ModuleSubstreamVisitor.h
@@ -15,9 +15,9 @@
 #include "llvm/DebugInfo/CodeView/CodeViewError.h"
 #include "llvm/DebugInfo/CodeView/Line.h"
 #include "llvm/DebugInfo/CodeView/ModuleSubstream.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include <cstdint>
@@ -28,8 +28,8 @@ namespace codeview {
 
 struct LineColumnEntry {
   support::ulittle32_t NameIndex;
-  msf::FixedStreamArray<LineNumberEntry> LineNumbers;
-  msf::FixedStreamArray<ColumnNumberEntry> Columns;
+  FixedStreamArray<LineNumberEntry> LineNumbers;
+  FixedStreamArray<ColumnNumberEntry> Columns;
 };
 
 struct FileChecksumEntry {
@@ -38,49 +38,47 @@ struct FileChecksumEntry {
   ArrayRef<uint8_t> Checksum; // The bytes of the checksum.
 };
 
-typedef msf::VarStreamArray<LineColumnEntry> LineInfoArray;
-typedef msf::VarStreamArray<FileChecksumEntry> FileChecksumArray;
+typedef VarStreamArray<LineColumnEntry> LineInfoArray;
+typedef VarStreamArray<FileChecksumEntry> FileChecksumArray;
 
 class IModuleSubstreamVisitor {
 public:
   virtual ~IModuleSubstreamVisitor() = default;
 
   virtual Error visitUnknown(ModuleSubstreamKind Kind,
-                             msf::ReadableStreamRef Data) = 0;
-  virtual Error visitSymbols(msf::ReadableStreamRef Data);
-  virtual Error visitLines(msf::ReadableStreamRef Data,
+                             BinaryStreamRef Data) = 0;
+  virtual Error visitSymbols(BinaryStreamRef Data);
+  virtual Error visitLines(BinaryStreamRef Data,
                            const LineSubstreamHeader *Header,
                            const LineInfoArray &Lines);
-  virtual Error visitStringTable(msf::ReadableStreamRef Data);
-  virtual Error visitFileChecksums(msf::ReadableStreamRef Data,
+  virtual Error visitStringTable(BinaryStreamRef Data);
+  virtual Error visitFileChecksums(BinaryStreamRef Data,
                                    const FileChecksumArray &Checksums);
-  virtual Error visitFrameData(msf::ReadableStreamRef Data);
-  virtual Error visitInlineeLines(msf::ReadableStreamRef Data);
-  virtual Error visitCrossScopeImports(msf::ReadableStreamRef Data);
-  virtual Error visitCrossScopeExports(msf::ReadableStreamRef Data);
-  virtual Error visitILLines(msf::ReadableStreamRef Data);
-  virtual Error visitFuncMDTokenMap(msf::ReadableStreamRef Data);
-  virtual Error visitTypeMDTokenMap(msf::ReadableStreamRef Data);
-  virtual Error visitMergedAssemblyInput(msf::ReadableStreamRef Data);
-  virtual Error visitCoffSymbolRVA(msf::ReadableStreamRef Data);
+  virtual Error visitFrameData(BinaryStreamRef Data);
+  virtual Error visitInlineeLines(BinaryStreamRef Data);
+  virtual Error visitCrossScopeImports(BinaryStreamRef Data);
+  virtual Error visitCrossScopeExports(BinaryStreamRef Data);
+  virtual Error visitILLines(BinaryStreamRef Data);
+  virtual Error visitFuncMDTokenMap(BinaryStreamRef Data);
+  virtual Error visitTypeMDTokenMap(BinaryStreamRef Data);
+  virtual Error visitMergedAssemblyInput(BinaryStreamRef Data);
+  virtual Error visitCoffSymbolRVA(BinaryStreamRef Data);
 };
 
 Error visitModuleSubstream(const ModuleSubstream &R,
                            IModuleSubstreamVisitor &V);
 } // end namespace codeview
 
-namespace msf {
-
 template <> class VarStreamArrayExtractor<codeview::LineColumnEntry> {
 public:
   VarStreamArrayExtractor(const codeview::LineSubstreamHeader *Header)
       : Header(Header) {}
 
-  Error operator()(ReadableStreamRef Stream, uint32_t &Len,
+  Error operator()(BinaryStreamRef Stream, uint32_t &Len,
                    codeview::LineColumnEntry &Item) const {
     using namespace codeview;
     const LineFileBlockHeader *BlockHeader;
-    StreamReader Reader(Stream);
+    BinaryStreamReader Reader(Stream);
     if (auto EC = Reader.readObject(BlockHeader))
       return EC;
     bool HasColumn = Header->Flags & LineFlags::HaveColumns;
@@ -113,11 +111,11 @@ private:
 
 template <> class VarStreamArrayExtractor<codeview::FileChecksumEntry> {
 public:
-  Error operator()(ReadableStreamRef Stream, uint32_t &Len,
+  Error operator()(BinaryStreamRef Stream, uint32_t &Len,
                    codeview::FileChecksumEntry &Item) const {
     using namespace codeview;
     const FileChecksum *Header;
-    StreamReader Reader(Stream);
+    BinaryStreamReader Reader(Stream);
     if (auto EC = Reader.readObject(Header))
       return EC;
     Item.FileNameOffset = Header->FileNameOffset;
@@ -129,8 +127,6 @@ public:
   }
 };
 
-} // end namespace msf
-
 } // end namespace llvm
 
 #endif // LLVM_DEBUGINFO_CODEVIEW_MODULESUBSTREAMVISITOR_H
diff --git a/include/llvm/DebugInfo/CodeView/RecordSerialization.h b/include/llvm/DebugInfo/CodeView/RecordSerialization.h
index 97b6f561bb973becfea6ca6fab52320aa54eb25a..58449c2c7565b9f8905d880d26d282afdc0502ed 100644
--- a/include/llvm/DebugInfo/CodeView/RecordSerialization.h
+++ b/include/llvm/DebugInfo/CodeView/RecordSerialization.h
@@ -15,7 +15,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/CodeViewError.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
+#include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include <cinttypes>
@@ -41,37 +41,37 @@ struct RecordPrefix {
 StringRef getBytesAsCharacters(ArrayRef<uint8_t> LeafData);
 StringRef getBytesAsCString(ArrayRef<uint8_t> LeafData);
 
-inline Error consume(msf::StreamReader &Reader) { return Error::success(); }
+inline Error consume(BinaryStreamReader &Reader) { return Error::success(); }
 
 /// Decodes a numeric "leaf" value. These are integer literals encountered in
 /// the type stream. If the value is positive and less than LF_NUMERIC (1 <<
 /// 15), it is emitted directly in Data. Otherwise, it has a tag like LF_CHAR
 /// that indicates the bitwidth and sign of the numeric data.
-Error consume(msf::StreamReader &Reader, APSInt &Num);
+Error consume(BinaryStreamReader &Reader, APSInt &Num);
 
 /// Decodes a numeric leaf value that is known to be a particular type.
-Error consume_numeric(msf::StreamReader &Reader, uint64_t &Value);
+Error consume_numeric(BinaryStreamReader &Reader, uint64_t &Value);
 
 /// Decodes signed and unsigned fixed-length integers.
-Error consume(msf::StreamReader &Reader, uint32_t &Item);
-Error consume(msf::StreamReader &Reader, int32_t &Item);
+Error consume(BinaryStreamReader &Reader, uint32_t &Item);
+Error consume(BinaryStreamReader &Reader, int32_t &Item);
 
 /// Decodes a null terminated string.
-Error consume(msf::StreamReader &Reader, StringRef &Item);
+Error consume(BinaryStreamReader &Reader, StringRef &Item);
 
 Error consume(StringRef &Data, APSInt &Num);
 Error consume(StringRef &Data, uint32_t &Item);
 
 /// Decodes an arbitrary object whose layout matches that of the underlying
 /// byte sequence, and returns a pointer to the object.
-template <typename T> Error consume(msf::StreamReader &Reader, T *&Item) {
+template <typename T> Error consume(BinaryStreamReader &Reader, T *&Item) {
   return Reader.readObject(Item);
 }
 
 template <typename T, typename U> struct serialize_conditional_impl {
   serialize_conditional_impl(T &Item, U Func) : Item(Item), Func(Func) {}
 
-  Error deserialize(msf::StreamReader &Reader) const {
+  Error deserialize(BinaryStreamReader &Reader) const {
     if (!Func())
       return Error::success();
     return consume(Reader, Item);
@@ -89,7 +89,7 @@ serialize_conditional_impl<T, U> serialize_conditional(T &Item, U Func) {
 template <typename T, typename U> struct serialize_array_impl {
   serialize_array_impl(ArrayRef<T> &Item, U Func) : Item(Item), Func(Func) {}
 
-  Error deserialize(msf::StreamReader &Reader) const {
+  Error deserialize(BinaryStreamReader &Reader) const {
     return Reader.readArray(Item, Func());
   }
 
@@ -100,7 +100,7 @@ template <typename T, typename U> struct serialize_array_impl {
 template <typename T> struct serialize_vector_tail_impl {
   serialize_vector_tail_impl(std::vector<T> &Item) : Item(Item) {}
 
-  Error deserialize(msf::StreamReader &Reader) const {
+  Error deserialize(BinaryStreamReader &Reader) const {
     T Field;
     // Stop when we run out of bytes or we hit record padding bytes.
     while (!Reader.empty() && Reader.peek() < LF_PAD0) {
@@ -118,14 +118,14 @@ struct serialize_null_term_string_array_impl {
   serialize_null_term_string_array_impl(std::vector<StringRef> &Item)
       : Item(Item) {}
 
-  Error deserialize(msf::StreamReader &Reader) const {
+  Error deserialize(BinaryStreamReader &Reader) const {
     if (Reader.empty())
       return make_error<CodeViewError>(cv_error_code::insufficient_buffer,
                                        "Null terminated string is empty!");
 
     while (Reader.peek() != 0) {
       StringRef Field;
-      if (auto EC = Reader.readZeroString(Field))
+      if (auto EC = Reader.readCString(Field))
         return EC;
       Item.push_back(Field);
     }
@@ -138,7 +138,7 @@ struct serialize_null_term_string_array_impl {
 template <typename T> struct serialize_arrayref_tail_impl {
   serialize_arrayref_tail_impl(ArrayRef<T> &Item) : Item(Item) {}
 
-  Error deserialize(msf::StreamReader &Reader) const {
+  Error deserialize(BinaryStreamReader &Reader) const {
     uint32_t Count = Reader.bytesRemaining() / sizeof(T);
     return Reader.readArray(Item, Count);
   }
@@ -149,7 +149,7 @@ template <typename T> struct serialize_arrayref_tail_impl {
 template <typename T> struct serialize_numeric_impl {
   serialize_numeric_impl(T &Item) : Item(Item) {}
 
-  Error deserialize(msf::StreamReader &Reader) const {
+  Error deserialize(BinaryStreamReader &Reader) const {
     return consume_numeric(Reader, Item);
   }
 
@@ -201,42 +201,42 @@ template <typename T> serialize_numeric_impl<T> serialize_numeric(T &Item) {
 #define CV_NUMERIC_FIELD(I) serialize_numeric(I)
 
 template <typename T, typename U>
-Error consume(msf::StreamReader &Reader,
+Error consume(BinaryStreamReader &Reader,
               const serialize_conditional_impl<T, U> &Item) {
   return Item.deserialize(Reader);
 }
 
 template <typename T, typename U>
-Error consume(msf::StreamReader &Reader,
+Error consume(BinaryStreamReader &Reader,
               const serialize_array_impl<T, U> &Item) {
   return Item.deserialize(Reader);
 }
 
-inline Error consume(msf::StreamReader &Reader,
+inline Error consume(BinaryStreamReader &Reader,
                      const serialize_null_term_string_array_impl &Item) {
   return Item.deserialize(Reader);
 }
 
 template <typename T>
-Error consume(msf::StreamReader &Reader,
+Error consume(BinaryStreamReader &Reader,
               const serialize_vector_tail_impl<T> &Item) {
   return Item.deserialize(Reader);
 }
 
 template <typename T>
-Error consume(msf::StreamReader &Reader,
+Error consume(BinaryStreamReader &Reader,
               const serialize_arrayref_tail_impl<T> &Item) {
   return Item.deserialize(Reader);
 }
 
 template <typename T>
-Error consume(msf::StreamReader &Reader,
+Error consume(BinaryStreamReader &Reader,
               const serialize_numeric_impl<T> &Item) {
   return Item.deserialize(Reader);
 }
 
 template <typename T, typename U, typename... Args>
-Error consume(msf::StreamReader &Reader, T &&X, U &&Y, Args &&... Rest) {
+Error consume(BinaryStreamReader &Reader, T &&X, U &&Y, Args &&... Rest) {
   if (auto EC = consume(Reader, X))
     return EC;
   return consume(Reader, Y, std::forward<Args>(Rest)...);
diff --git a/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h b/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h
index 13c2bb14ecf599153df90aae4bbb4602e0b0f1ca..c1a5152930fff26458cd4e925f6a6edb3331d121 100644
--- a/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h
+++ b/include/llvm/DebugInfo/CodeView/SymbolDeserializer.h
@@ -15,8 +15,8 @@
 #include "llvm/DebugInfo/CodeView/SymbolRecordMapping.h"
 #include "llvm/DebugInfo/CodeView/SymbolVisitorCallbacks.h"
 #include "llvm/DebugInfo/CodeView/SymbolVisitorDelegate.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Error.h"
 
 namespace llvm {
@@ -25,10 +25,11 @@ class SymbolVisitorDelegate;
 class SymbolDeserializer : public SymbolVisitorCallbacks {
   struct MappingInfo {
     explicit MappingInfo(ArrayRef<uint8_t> RecordData)
-        : Stream(RecordData), Reader(Stream), Mapping(Reader) {}
+        : Stream(RecordData, llvm::support::little), Reader(Stream),
+          Mapping(Reader) {}
 
-    msf::ByteStream Stream;
-    msf::StreamReader Reader;
+    BinaryByteStream Stream;
+    BinaryStreamReader Reader;
     SymbolRecordMapping Mapping;
   };
 
diff --git a/include/llvm/DebugInfo/CodeView/SymbolRecord.h b/include/llvm/DebugInfo/CodeView/SymbolRecord.h
index 2c3648ad28a9caaa96e4a4de45f9f51d526a6fa3..c5a5549bf818ae74e007cb20d822bd67b42a1e7f 100644
--- a/include/llvm/DebugInfo/CodeView/SymbolRecord.h
+++ b/include/llvm/DebugInfo/CodeView/SymbolRecord.h
@@ -13,13 +13,13 @@
 #include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/Optional.h"
-#include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/DebugInfo/CodeView/CVRecord.h"
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/RecordSerialization.h"
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
+#include "llvm/Support/BinaryStreamArray.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include <cstddef>
@@ -938,7 +938,7 @@ public:
 };
 
 typedef CVRecord<SymbolKind> CVSymbol;
-typedef msf::VarStreamArray<CVSymbol> CVSymbolArray;
+typedef VarStreamArray<CVSymbol> CVSymbolArray;
 
 } // end namespace codeview
 } // end namespace llvm
diff --git a/include/llvm/DebugInfo/CodeView/SymbolRecordMapping.h b/include/llvm/DebugInfo/CodeView/SymbolRecordMapping.h
index 1bd14ed1347abde51f42277ab399b000db11febb..0a1837a0d935f7e78273f6cc20076839b48520b1 100644
--- a/include/llvm/DebugInfo/CodeView/SymbolRecordMapping.h
+++ b/include/llvm/DebugInfo/CodeView/SymbolRecordMapping.h
@@ -14,16 +14,14 @@
 #include "llvm/DebugInfo/CodeView/SymbolVisitorCallbacks.h"
 
 namespace llvm {
-namespace msf {
-class StreamReader;
-class StreamWriter;
-}
+class BinaryStreamReader;
+class BinaryStreamWriter;
 
 namespace codeview {
 class SymbolRecordMapping : public SymbolVisitorCallbacks {
 public:
-  explicit SymbolRecordMapping(msf::StreamReader &Reader) : IO(Reader) {}
-  explicit SymbolRecordMapping(msf::StreamWriter &Writer) : IO(Writer) {}
+  explicit SymbolRecordMapping(BinaryStreamReader &Reader) : IO(Reader) {}
+  explicit SymbolRecordMapping(BinaryStreamWriter &Writer) : IO(Writer) {}
 
   Error visitSymbolBegin(CVSymbol &Record) override;
   Error visitSymbolEnd(CVSymbol &Record) override;
diff --git a/include/llvm/DebugInfo/CodeView/SymbolSerializer.h b/include/llvm/DebugInfo/CodeView/SymbolSerializer.h
index 4eb914e7ae6bd46e94165d2cdde37a864879ca9f..f2e99bd8332605daa18373d497cb97d0db4648eb 100644
--- a/include/llvm/DebugInfo/CodeView/SymbolSerializer.h
+++ b/include/llvm/DebugInfo/CodeView/SymbolSerializer.h
@@ -12,8 +12,6 @@
 
 #include "llvm/DebugInfo/CodeView/SymbolRecordMapping.h"
 #include "llvm/DebugInfo/CodeView/SymbolVisitorCallbacks.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
 
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
@@ -21,14 +19,19 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamWriter.h"
 #include "llvm/Support/Error.h"
 
 namespace llvm {
+class BinaryStreamWriter;
 namespace codeview {
 
 class SymbolSerializer : public SymbolVisitorCallbacks {
-  uint32_t RecordStart = 0;
-  msf::StreamWriter &Writer;
+  BumpPtrAllocator &Storage;
+  std::vector<uint8_t> RecordBuffer;
+  MutableBinaryByteStream Stream;
+  BinaryStreamWriter Writer;
   SymbolRecordMapping Mapping;
   Optional<SymbolKind> CurrentSymbol;
 
@@ -42,40 +45,10 @@ class SymbolSerializer : public SymbolVisitorCallbacks {
   }
 
 public:
-  explicit SymbolSerializer(msf::StreamWriter &Writer)
-      : Writer(Writer), Mapping(Writer) {}
+  explicit SymbolSerializer(BumpPtrAllocator &Storage);
 
-  virtual Error visitSymbolBegin(CVSymbol &Record) override {
-    assert(!CurrentSymbol.hasValue() && "Already in a symbol mapping!");
-
-    RecordStart = Writer.getOffset();
-    if (auto EC = writeRecordPrefix(Record.kind()))
-      return EC;
-
-    CurrentSymbol = Record.kind();
-    if (auto EC = Mapping.visitSymbolBegin(Record))
-      return EC;
-
-    return Error::success();
-  }
-
-  virtual Error visitSymbolEnd(CVSymbol &Record) override {
-    assert(CurrentSymbol.hasValue() && "Not in a symbol mapping!");
-
-    if (auto EC = Mapping.visitSymbolEnd(Record))
-      return EC;
-
-    uint32_t RecordEnd = Writer.getOffset();
-    Writer.setOffset(RecordStart);
-    uint16_t Length = RecordEnd - Writer.getOffset() - 2;
-    if (auto EC = Writer.writeInteger(Length))
-      return EC;
-
-    Writer.setOffset(RecordEnd);
-    CurrentSymbol.reset();
-
-    return Error::success();
-  }
+  virtual Error visitSymbolBegin(CVSymbol &Record) override;
+  virtual Error visitSymbolEnd(CVSymbol &Record) override;
 
 #define SYMBOL_RECORD(EnumName, EnumVal, Name)                                 \
   virtual Error visitKnownRecord(CVSymbol &CVR, Name &Record) override {       \
diff --git a/include/llvm/DebugInfo/CodeView/SymbolVisitorDelegate.h b/include/llvm/DebugInfo/CodeView/SymbolVisitorDelegate.h
index 2b468a289fd8207fb1a00cb8619b856886d96db5..2bef3f61adfcccecfca550db2d418460dfdd1374 100644
--- a/include/llvm/DebugInfo/CodeView/SymbolVisitorDelegate.h
+++ b/include/llvm/DebugInfo/CodeView/SymbolVisitorDelegate.h
@@ -15,9 +15,7 @@
 
 namespace llvm {
 
-namespace msf {
-class StreamReader;
-} // end namespace msf
+class BinaryStreamReader;
 
 namespace codeview {
 
@@ -25,7 +23,7 @@ class SymbolVisitorDelegate {
 public:
   virtual ~SymbolVisitorDelegate() = default;
 
-  virtual uint32_t getRecordOffset(msf::StreamReader Reader) = 0;
+  virtual uint32_t getRecordOffset(BinaryStreamReader Reader) = 0;
   virtual StringRef getFileNameForFileOffset(uint32_t FileOffset) = 0;
   virtual StringRef getStringTable() = 0;
 };
diff --git a/include/llvm/DebugInfo/CodeView/TypeDatabase.h b/include/llvm/DebugInfo/CodeView/TypeDatabase.h
index c00d3b79420156183689b9127d580ffb7815e247..54ad862cfa7e582edba734f59451bafa00993481 100644
--- a/include/llvm/DebugInfo/CodeView/TypeDatabase.h
+++ b/include/llvm/DebugInfo/CodeView/TypeDatabase.h
@@ -34,6 +34,8 @@ public:
 
   StringRef getTypeName(TypeIndex Index) const;
 
+  const CVType &getTypeRecord(TypeIndex Index) const;
+
   bool containsTypeIndex(TypeIndex Index) const;
 
   uint32_t size() const;
diff --git a/include/llvm/DebugInfo/CodeView/TypeDeserializer.h b/include/llvm/DebugInfo/CodeView/TypeDeserializer.h
index dc5eaf82845bbcc25e8347f87c8140e1d58a9bb5..0e34437891702351d32abc4d166ecbebd5cfd4dc 100644
--- a/include/llvm/DebugInfo/CodeView/TypeDeserializer.h
+++ b/include/llvm/DebugInfo/CodeView/TypeDeserializer.h
@@ -16,8 +16,8 @@
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeRecordMapping.h"
 #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Error.h"
 #include <cassert>
 #include <cstdint>
@@ -29,10 +29,11 @@ namespace codeview {
 class TypeDeserializer : public TypeVisitorCallbacks {
   struct MappingInfo {
     explicit MappingInfo(ArrayRef<uint8_t> RecordData)
-        : Stream(RecordData), Reader(Stream), Mapping(Reader) {}
+        : Stream(RecordData, llvm::support::little), Reader(Stream),
+          Mapping(Reader) {}
 
-    msf::ByteStream Stream;
-    msf::StreamReader Reader;
+    BinaryByteStream Stream;
+    BinaryStreamReader Reader;
     TypeRecordMapping Mapping;
   };
 
@@ -72,16 +73,16 @@ private:
 
 class FieldListDeserializer : public TypeVisitorCallbacks {
   struct MappingInfo {
-    explicit MappingInfo(msf::StreamReader &R)
+    explicit MappingInfo(BinaryStreamReader &R)
         : Reader(R), Mapping(Reader), StartOffset(0) {}
 
-    msf::StreamReader &Reader;
+    BinaryStreamReader &Reader;
     TypeRecordMapping Mapping;
     uint32_t StartOffset;
   };
 
 public:
-  explicit FieldListDeserializer(msf::StreamReader &Reader) : Mapping(Reader) {
+  explicit FieldListDeserializer(BinaryStreamReader &Reader) : Mapping(Reader) {
     CVType FieldList;
     FieldList.Type = TypeLeafKind::LF_FIELDLIST;
     consumeError(Mapping.Mapping.visitTypeBegin(FieldList));
diff --git a/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h b/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h
index a466e42981582c24c5667a5429903f94253d26bf..00bb09137e488d4c494c0ed9e6522e86929888e6 100644
--- a/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h
+++ b/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h
@@ -28,8 +28,16 @@ public:
   TypeDumpVisitor(TypeDatabase &TypeDB, ScopedPrinter *W, bool PrintRecordBytes)
       : W(W), PrintRecordBytes(PrintRecordBytes), TypeDB(TypeDB) {}
 
+  /// When dumping types from an IPI stream in a PDB, a type index may refer to
+  /// a type or an item ID. The dumper will look up the "name" of the index in
+  /// the item database if appropriate. If ItemDB is null, it will use TypeDB,
+  /// which is correct when dumping types from an object file (/Z7).
+  void setItemDB(TypeDatabase &DB) { ItemDB = &DB; }
+
   void printTypeIndex(StringRef FieldName, TypeIndex TI) const;
 
+  void printItemIndex(StringRef FieldName, TypeIndex TI) const;
+
   /// Action to take on unknown types. By default, they are ignored.
   Error visitUnknownType(CVType &Record) override;
   Error visitUnknownMember(CVMemberRecord &Record) override;
@@ -54,11 +62,17 @@ private:
   void printMemberAttributes(MemberAccess Access, MethodKind Kind,
                              MethodOptions Options);
 
+  /// Get the database of indices for the stream that we are dumping. If ItemDB
+  /// is set, then we must be dumping an item (IPI) stream. This will also
+  /// always get the appropriate DB for printing item names.
+  TypeDatabase &getSourceDB() const { return ItemDB ? *ItemDB : TypeDB; }
+
   ScopedPrinter *W;
 
   bool PrintRecordBytes = false;
 
   TypeDatabase &TypeDB;
+  TypeDatabase *ItemDB = nullptr;
 };
 
 } // end namespace codeview
diff --git a/include/llvm/DebugInfo/CodeView/TypeRecord.h b/include/llvm/DebugInfo/CodeView/TypeRecord.h
index 4f1c047815d29508b98c7b94fbfd777ccf5c9aef..1f10872c8768040b20a87265cdbde2a8ed317059 100644
--- a/include/llvm/DebugInfo/CodeView/TypeRecord.h
+++ b/include/llvm/DebugInfo/CodeView/TypeRecord.h
@@ -18,7 +18,7 @@
 #include "llvm/DebugInfo/CodeView/CVRecord.h"
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
+#include "llvm/Support/BinaryStreamArray.h"
 #include "llvm/Support/Endian.h"
 #include <algorithm>
 #include <cstdint>
@@ -26,9 +26,7 @@
 
 namespace llvm {
 
-namespace msf {
-class StreamReader;
-} // end namespace msf
+class BinaryStreamReader;
 
 namespace codeview {
 
@@ -42,7 +40,8 @@ struct CVMemberRecord {
   TypeLeafKind Kind;
   ArrayRef<uint8_t> Data;
 };
-typedef msf::VarStreamArray<CVType> CVTypeArray;
+typedef VarStreamArray<CVType> CVTypeArray;
+typedef iterator_range<CVTypeArray::Iterator> CVTypeRange;
 
 /// Equvalent to CV_fldattr_t in cvinfo.h.
 struct MemberAttributes {
@@ -106,10 +105,6 @@ public:
                     PointerToMemberRepresentation Representation)
       : ContainingType(ContainingType), Representation(Representation) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   TypeIndex getContainingType() const { return ContainingType; }
   PointerToMemberRepresentation getRepresentation() const {
     return Representation;
@@ -139,10 +134,6 @@ public:
       : TypeRecord(TypeRecordKind::Modifier), ModifiedType(ModifiedType),
         Modifiers(Modifiers) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   TypeIndex getModifiedType() const { return ModifiedType; }
   ModifierOptions getModifiers() const { return Modifiers; }
 
@@ -161,10 +152,6 @@ public:
         CallConv(CallConv), Options(Options), ParameterCount(ParameterCount),
         ArgumentList(ArgumentList) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   TypeIndex getReturnType() const { return ReturnType; }
   CallingConvention getCallConv() const { return CallConv; }
   FunctionOptions getOptions() const { return Options; }
@@ -193,10 +180,6 @@ public:
         ArgumentList(ArgumentList),
         ThisPointerAdjustment(ThisPointerAdjustment) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   TypeIndex getReturnType() const { return ReturnType; }
   TypeIndex getClassType() const { return ClassType; }
   TypeIndex getThisType() const { return ThisType; }
@@ -216,6 +199,16 @@ public:
   int32_t ThisPointerAdjustment;
 };
 
+// LF_LABEL
+class LabelRecord : public TypeRecord {
+public:
+  explicit LabelRecord(TypeRecordKind Kind) : TypeRecord(Kind) {} // Mode unset.
+
+  LabelRecord(LabelType Mode) : TypeRecord(TypeRecordKind::Label), Mode(Mode) {}
+
+  LabelType Mode; // Not initialized by the TypeRecordKind-only constructor.
+};
+
 // LF_MFUNC_ID
 class MemberFuncIdRecord : public TypeRecord {
 public:
@@ -225,10 +218,6 @@ public:
       : TypeRecord(TypeRecordKind::MemberFuncId), ClassType(ClassType),
         FunctionType(FunctionType), Name(Name) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   TypeIndex getClassType() const { return ClassType; }
   TypeIndex getFunctionType() const { return FunctionType; }
   StringRef getName() const { return Name; }
@@ -237,17 +226,26 @@ public:
   StringRef Name;
 };
 
-// LF_ARGLIST, LF_SUBSTR_LIST
+// LF_ARGLIST
 class ArgListRecord : public TypeRecord {
 public:
   explicit ArgListRecord(TypeRecordKind Kind) : TypeRecord(Kind) {}
 
   ArgListRecord(TypeRecordKind Kind, ArrayRef<TypeIndex> Indices)
-      : TypeRecord(Kind), StringIndices(Indices) {}
+      : TypeRecord(Kind), ArgIndices(Indices) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
+  ArrayRef<TypeIndex> getIndices() const { return ArgIndices; }
+
+  std::vector<TypeIndex> ArgIndices;
+};
+
+// LF_SUBSTR_LIST
+class StringListRecord : public TypeRecord {
+public:
+  explicit StringListRecord(TypeRecordKind Kind) : TypeRecord(Kind) {}
+
+  StringListRecord(TypeRecordKind Kind, ArrayRef<TypeIndex> Indices)
+      : TypeRecord(Kind), StringIndices(Indices) {}
 
   ArrayRef<TypeIndex> getIndices() const { return StringIndices; }
 
@@ -290,10 +288,6 @@ public:
       : TypeRecord(TypeRecordKind::Pointer), ReferentType(ReferentType),
         Attrs(Attrs), MemberInfo(Member) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   TypeIndex getReferentType() const { return ReferentType; }
 
   PointerKind getPointerKind() const {
@@ -356,10 +350,6 @@ public:
   NestedTypeRecord(TypeIndex Type, StringRef Name)
       : TypeRecord(TypeRecordKind::NestedType), Type(Type), Name(Name) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   TypeIndex getNestedType() const { return Type; }
   StringRef getName() const { return Name; }
 
@@ -374,10 +364,6 @@ public:
   explicit FieldListRecord(ArrayRef<uint8_t> Data)
       : TypeRecord(TypeRecordKind::FieldList), Data(Data) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap) { return false; }
-
   ArrayRef<uint8_t> Data;
 };
 
@@ -390,10 +376,6 @@ public:
       : TypeRecord(TypeRecordKind::Array), ElementType(ElementType),
         IndexType(IndexType), Size(Size), Name(Name) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   TypeIndex getElementType() const { return ElementType; }
   TypeIndex getIndexType() const { return IndexType; }
   uint64_t getSize() const { return Size; }
@@ -414,10 +396,6 @@ protected:
         FieldList(FieldList), Name(Name), UniqueName(UniqueName) {}
 
 public:
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   static const int HfaKindShift = 11;
   static const int HfaKindMask = 0x1800;
   static const int WinRTKindShift = 14;
@@ -451,10 +429,6 @@ public:
       : TagRecord(Kind, MemberCount, Options, FieldList, Name, UniqueName),
         DerivationList(DerivationList), VTableShape(VTableShape), Size(Size) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   HfaKind getHfa() const {
     uint16_t Value = static_cast<uint16_t>(Options);
     Value = (Value & HfaKindMask) >> HfaKindShift;
@@ -506,9 +480,6 @@ public:
                   UniqueName),
         UnderlyingType(UnderlyingType) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   TypeIndex getUnderlyingType() const { return UnderlyingType; }
   TypeIndex UnderlyingType;
 };
@@ -521,10 +492,6 @@ public:
       : TypeRecord(TypeRecordKind::BitField), Type(Type), BitSize(BitSize),
         BitOffset(BitOffset) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   TypeIndex getType() const { return Type; }
   uint8_t getBitOffset() const { return BitOffset; }
   uint8_t getBitSize() const { return BitSize; }
@@ -542,10 +509,6 @@ public:
   explicit VFTableShapeRecord(std::vector<VFTableSlotKind> Slots)
       : TypeRecord(TypeRecordKind::VFTableShape), Slots(std::move(Slots)) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   ArrayRef<VFTableSlotKind> getSlots() const {
     if (!SlotsRef.empty())
       return SlotsRef;
@@ -565,10 +528,6 @@ public:
       : TypeRecord(TypeRecordKind::TypeServer2), Guid(Guid), Age(Age),
         Name(Name) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   StringRef getGuid() const { return Guid; }
 
   uint32_t getAge() const { return Age; }
@@ -587,10 +546,6 @@ public:
   StringIdRecord(TypeIndex Id, StringRef String)
       : TypeRecord(TypeRecordKind::StringId), Id(Id), String(String) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   TypeIndex getId() const { return Id; }
 
   StringRef getString() const { return String; }
@@ -606,10 +561,6 @@ public:
       : TypeRecord(TypeRecordKind::FuncId), ParentScope(ParentScope),
         FunctionType(FunctionType), Name(Name) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   TypeIndex getParentScope() const { return ParentScope; }
 
   TypeIndex getFunctionType() const { return FunctionType; }
@@ -629,10 +580,6 @@ public:
       : TypeRecord(TypeRecordKind::UdtSourceLine), UDT(UDT),
         SourceFile(SourceFile), LineNumber(LineNumber) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   TypeIndex getUDT() const { return UDT; }
   TypeIndex getSourceFile() const { return SourceFile; }
   uint32_t getLineNumber() const { return LineNumber; }
@@ -651,8 +598,6 @@ public:
       : TypeRecord(TypeRecordKind::UdtSourceLine), UDT(UDT),
         SourceFile(SourceFile), LineNumber(LineNumber), Module(Module) {}
 
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   TypeIndex getUDT() const { return UDT; }
   TypeIndex getSourceFile() const { return SourceFile; }
   uint32_t getLineNumber() const { return LineNumber; }
@@ -672,10 +617,6 @@ public:
       : TypeRecord(TypeRecordKind::BuildInfo),
         ArgIndices(ArgIndices.begin(), ArgIndices.end()) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   ArrayRef<TypeIndex> getArgs() const { return ArgIndices; }
   SmallVector<TypeIndex, 4> ArgIndices;
 };
@@ -693,10 +634,6 @@ public:
     MethodNames.insert(MethodNames.end(), Methods.begin(), Methods.end());
   }
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   TypeIndex getCompleteClass() const { return CompleteClass; }
   TypeIndex getOverriddenVTable() const { return OverriddenVFTable; }
   uint32_t getVFPtrOffset() const { return VFPtrOffset; }
@@ -725,10 +662,6 @@ public:
       : TypeRecord(TypeRecordKind::OneMethod), Type(Type),
         Attrs(Access, MK, Options), VFTableOffset(VFTableOffset), Name(Name) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   TypeIndex getType() const { return Type; }
   MethodKind getMethodKind() const { return Attrs.getMethodKind(); }
   MethodOptions getOptions() const { return Attrs.getFlags(); }
@@ -754,10 +687,6 @@ public:
   MethodOverloadListRecord(ArrayRef<OneMethodRecord> Methods)
       : TypeRecord(TypeRecordKind::MethodOverloadList), Methods(Methods) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   ArrayRef<OneMethodRecord> getMethods() const { return Methods; }
   std::vector<OneMethodRecord> Methods;
 };
@@ -771,10 +700,6 @@ public:
       : TypeRecord(TypeRecordKind::OverloadedMethod),
         NumOverloads(NumOverloads), MethodList(MethodList), Name(Name) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   uint16_t getNumOverloads() const { return NumOverloads; }
   TypeIndex getMethodList() const { return MethodList; }
   StringRef getName() const { return Name; }
@@ -796,10 +721,6 @@ public:
       : TypeRecord(TypeRecordKind::DataMember), Attrs(Access), Type(Type),
         FieldOffset(Offset), Name(Name) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   MemberAccess getAccess() const { return Attrs.getAccess(); }
   TypeIndex getType() const { return Type; }
   uint64_t getFieldOffset() const { return FieldOffset; }
@@ -822,10 +743,6 @@ public:
       : TypeRecord(TypeRecordKind::StaticDataMember), Attrs(Access), Type(Type),
         Name(Name) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   MemberAccess getAccess() const { return Attrs.getAccess(); }
   TypeIndex getType() const { return Type; }
   StringRef getName() const { return Name; }
@@ -846,10 +763,6 @@ public:
       : TypeRecord(TypeRecordKind::Enumerator), Attrs(Access),
         Value(std::move(Value)), Name(Name) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   MemberAccess getAccess() const { return Attrs.getAccess(); }
   APSInt getValue() const { return Value; }
   StringRef getName() const { return Name; }
@@ -866,10 +779,6 @@ public:
   VFPtrRecord(TypeIndex Type)
       : TypeRecord(TypeRecordKind::VFPtr), Type(Type) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   TypeIndex getType() const { return Type; }
 
   TypeIndex Type;
@@ -886,10 +795,6 @@ public:
       : TypeRecord(TypeRecordKind::BaseClass), Attrs(Access), Type(Type),
         Offset(Offset) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   MemberAccess getAccess() const { return Attrs.getAccess(); }
   TypeIndex getBaseType() const { return Type; }
   uint64_t getBaseOffset() const { return Offset; }
@@ -914,10 +819,6 @@ public:
       : TypeRecord(Kind), Attrs(Access), BaseType(BaseType),
         VBPtrType(VBPtrType), VBPtrOffset(Offset), VTableIndex(Index) {}
 
-  /// Rewrite member type indices with IndexMap. Returns false if a type index
-  /// is not in the map.
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   MemberAccess getAccess() const { return Attrs.getAccess(); }
   TypeIndex getBaseType() const { return BaseType; }
   TypeIndex getVBPtrType() const { return VBPtrType; }
@@ -942,8 +843,6 @@ public:
 
   TypeIndex getContinuationIndex() const { return ContinuationIndex; }
 
-  bool remapTypeIndices(ArrayRef<TypeIndex> IndexMap);
-
   TypeIndex ContinuationIndex;
 };
 
diff --git a/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h b/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h
index fe470a72abbb15e80ac727bf070c0f1d07964779..924ca0470fad4760ccde7be410bdd1c434169bb1 100644
--- a/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h
+++ b/include/llvm/DebugInfo/CodeView/TypeRecordMapping.h
@@ -16,15 +16,14 @@
 #include "llvm/Support/Error.h"
 
 namespace llvm {
-namespace msf {
-class StreamReader;
-class StreamWriter;
-}
+class BinaryStreamReader;
+class BinaryStreamWriter;
+
 namespace codeview {
 class TypeRecordMapping : public TypeVisitorCallbacks {
 public:
-  explicit TypeRecordMapping(msf::StreamReader &Reader) : IO(Reader) {}
-  explicit TypeRecordMapping(msf::StreamWriter &Writer) : IO(Writer) {}
+  explicit TypeRecordMapping(BinaryStreamReader &Reader) : IO(Reader) {}
+  explicit TypeRecordMapping(BinaryStreamWriter &Writer) : IO(Writer) {}
 
   Error visitTypeBegin(CVType &Record) override;
   Error visitTypeEnd(CVType &Record) override;
diff --git a/include/llvm/DebugInfo/CodeView/TypeRecords.def b/include/llvm/DebugInfo/CodeView/TypeRecords.def
index c98dbac21a7a008ad7dc373cc01ea5c0c7b40355..8c193bb13cb7e87dcf2ee00af434e8f65640e4c1 100644
--- a/include/llvm/DebugInfo/CodeView/TypeRecords.def
+++ b/include/llvm/DebugInfo/CodeView/TypeRecords.def
@@ -41,6 +41,7 @@ TYPE_RECORD(LF_POINTER, 0x1002, Pointer)
 TYPE_RECORD(LF_MODIFIER, 0x1001, Modifier)
 TYPE_RECORD(LF_PROCEDURE, 0x1008, Procedure)
 TYPE_RECORD(LF_MFUNCTION, 0x1009, MemberFunction)
+TYPE_RECORD(LF_LABEL, 0x000e, Label)
 TYPE_RECORD(LF_ARGLIST, 0x1201, ArgList)
 
 TYPE_RECORD(LF_FIELDLIST, 0x1203, FieldList)
@@ -79,9 +80,7 @@ MEMBER_RECORD(LF_INDEX, 0x1404, ListContinuation)
 TYPE_RECORD(LF_FUNC_ID, 0x1601, FuncId)
 TYPE_RECORD(LF_MFUNC_ID, 0x1602, MemberFuncId)
 TYPE_RECORD(LF_BUILDINFO, 0x1603, BuildInfo)
-// FIXME: We reuse the structure of ArgListRecord for substring lists, but it
-// makes for confusing dumper output.
-TYPE_RECORD_ALIAS(LF_SUBSTR_LIST, 0x1604, StringList, ArgList)
+TYPE_RECORD(LF_SUBSTR_LIST, 0x1604, StringList)
 TYPE_RECORD(LF_STRING_ID, 0x1605, StringId)
 TYPE_RECORD(LF_UDT_SRC_LINE, 0x1606, UdtSourceLine)
 TYPE_RECORD(LF_UDT_MOD_SRC_LINE, 0x1607, UdtModSourceLine)
@@ -103,7 +102,6 @@ CV_TYPE(LF_MFUNCTION_16t, 0x0009)
 CV_TYPE(LF_COBOL0_16t, 0x000b)
 CV_TYPE(LF_COBOL1, 0x000c)
 CV_TYPE(LF_BARRAY_16t, 0x000d)
-CV_TYPE(LF_LABEL, 0x000e)
 CV_TYPE(LF_NULLLEAF, 0x000f) // LF_NULL
 CV_TYPE(LF_NOTTRAN, 0x0010)
 CV_TYPE(LF_DIMARRAY_16t, 0x0011)
diff --git a/include/llvm/DebugInfo/CodeView/TypeSerializer.h b/include/llvm/DebugInfo/CodeView/TypeSerializer.h
index e0592219463842567258df7bebe8f6a9ac0d0134..1f4873c4f96938fb1616c98b0ccd15b2a2da7939 100644
--- a/include/llvm/DebugInfo/CodeView/TypeSerializer.h
+++ b/include/llvm/DebugInfo/CodeView/TypeSerializer.h
@@ -12,8 +12,8 @@
 
 #include "llvm/DebugInfo/CodeView/TypeRecordMapping.h"
 #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamWriter.h"
 
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
@@ -56,8 +56,8 @@ class TypeSerializer : public TypeVisitorCallbacks {
   Optional<TypeLeafKind> TypeKind;
   Optional<TypeLeafKind> MemberKind;
   std::vector<uint8_t> RecordBuffer;
-  msf::MutableByteStream Stream;
-  msf::StreamWriter Writer;
+  MutableBinaryByteStream Stream;
+  BinaryStreamWriter Writer;
   TypeRecordMapping Mapping;
 
   RecordList SeenRecords;
diff --git a/include/llvm/DebugInfo/CodeView/TypeServerHandler.h b/include/llvm/DebugInfo/CodeView/TypeServerHandler.h
new file mode 100644
index 0000000000000000000000000000000000000000..35f06eaf6eb4005ac0c515f08a45052a71b85206
--- /dev/null
+++ b/include/llvm/DebugInfo/CodeView/TypeServerHandler.h
@@ -0,0 +1,36 @@
+//===- TypeServerHandler.h --------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_CODEVIEW_TYPESERVERHANDLER_H
+#define LLVM_DEBUGINFO_CODEVIEW_TYPESERVERHANDLER_H
+
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+namespace codeview {
+class TypeVisitorCallbacks;
+
+class TypeServerHandler {
+public:
+  virtual ~TypeServerHandler() {}
+
+  /// Handle a TypeServer record.  Returning true means the record will not
+  /// be processed by the top-level visitor; returning false means it will.
+  /// Returning an Error makes the top-level visitor fail.  This default
+  /// implementation ignores its arguments and always returns false.
+  virtual Expected<bool> handle(TypeServer2Record &TS,
+                                TypeVisitorCallbacks &Callbacks) {
+    return false;
+  }
+};
+}
+}
+
+#endif
diff --git a/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h b/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h
index af396c79d074199cc4289edde0eae174295c117b..2246f197e78436382c469be03bf7ffb962e64783 100644
--- a/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h
+++ b/include/llvm/DebugInfo/CodeView/TypeStreamMerger.h
@@ -13,12 +13,17 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/DebugInfo/CodeView/TypeTableBuilder.h"
+#include "llvm/Support/Error.h"
 
 namespace llvm {
 namespace codeview {
 
+class TypeServerHandler;
+
 /// Merges one type stream into another. Returns true on success.
-bool mergeTypeStreams(TypeTableBuilder &DestStream, const CVTypeArray &Types);
+Error mergeTypeStreams(TypeTableBuilder &DestIdStream,
+                       TypeTableBuilder &DestTypeStream,
+                       TypeServerHandler *Handler, const CVTypeArray &Types);
 
 } // end namespace codeview
 } // end namespace llvm
diff --git a/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h b/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h
index 4e6d81ece318b0990d4628db5ef417f20433944d..102bee4b0801e1a316b05ef7bd9c46fe2b145af3 100644
--- a/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h
+++ b/include/llvm/DebugInfo/CodeView/TypeTableBuilder.h
@@ -121,6 +121,12 @@ public:
     }
     return Index;
   }
+
+  /// Stop building the record, discarding any visitTypeEnd error.
+  void reset() {
+    if (auto EC = TempSerializer.visitTypeEnd(Type))
+      consumeError(std::move(EC));
+  }
 };
 
 } // end namespace codeview
diff --git a/include/llvm/DebugInfo/DIContext.h b/include/llvm/DebugInfo/DIContext.h
index 804419c517dfc66aa1ac3c54f2711db7b0504128..e3386a8dcd24c69fa3cb8dceb35ae4326bd8e0c3 100644
--- a/include/llvm/DebugInfo/DIContext.h
+++ b/include/llvm/DebugInfo/DIContext.h
@@ -1,4 +1,4 @@
-//===-- DIContext.h ---------------------------------------------*- C++ -*-===//
+//===- DIContext.h ----------------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -32,26 +32,28 @@ class raw_ostream;
 struct DILineInfo {
   std::string FileName;
   std::string FunctionName;
-  uint32_t Line;
-  uint32_t Column;
+  uint32_t Line = 0;
+  uint32_t Column = 0;
+  uint32_t StartLine = 0;
 
   // DWARF-specific.
-  uint32_t Discriminator;
+  uint32_t Discriminator = 0;
 
-  DILineInfo()
-      : FileName("<invalid>"), FunctionName("<invalid>"), Line(0), Column(0),
-        Discriminator(0) {}
+  DILineInfo() : FileName("<invalid>"), FunctionName("<invalid>") {}
 
   bool operator==(const DILineInfo &RHS) const {
     return Line == RHS.Line && Column == RHS.Column &&
-           FileName == RHS.FileName && FunctionName == RHS.FunctionName;
+           FileName == RHS.FileName && FunctionName == RHS.FunctionName &&
+           StartLine == RHS.StartLine && Discriminator == RHS.Discriminator;
   }
   bool operator!=(const DILineInfo &RHS) const {
     return !(*this == RHS);
   }
   bool operator<(const DILineInfo &RHS) const {
-    return std::tie(FileName, FunctionName, Line, Column) <
-           std::tie(RHS.FileName, RHS.FunctionName, RHS.Line, RHS.Column);
+    return std::tie(FileName, FunctionName, Line, Column, StartLine,
+                    Discriminator) <
+           std::tie(RHS.FileName, RHS.FunctionName, RHS.Line, RHS.Column,
+                    RHS.StartLine, RHS.Discriminator);
   }
 };
 
@@ -86,10 +88,10 @@ public:
 /// DIGlobal - container for description of a global variable.
 struct DIGlobal {
   std::string Name;
-  uint64_t Start;
-  uint64_t Size;
+  uint64_t Start = 0;
+  uint64_t Size = 0;
 
-  DIGlobal() : Name("<invalid>"), Start(0), Size(0) {}
+  DIGlobal() : Name("<invalid>") {}
 };
 
 /// A DINameKind is passed to name search methods to specify a
@@ -175,8 +177,8 @@ private:
 /// on the fly.
 class LoadedObjectInfo {
 protected:
-  LoadedObjectInfo(const LoadedObjectInfo &) = default;
   LoadedObjectInfo() = default;
+  LoadedObjectInfo(const LoadedObjectInfo &) = default;
 
 public:
   virtual ~LoadedObjectInfo() = default;
diff --git a/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h b/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h
index b10e849902cebf9b6ae1ff33d2cfb16870d91073..7324f6e3eb387fc1bc67527f3cc74ffa355479e6 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h
@@ -1,4 +1,4 @@
-//===-- DWARFAbbreviationDeclaration.h --------------------------*- C++ -*-===//
+//===- DWARFAbbreviationDeclaration.h ---------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +7,22 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFABBREVIATIONDECLARATION_H
-#define LLVM_LIB_DEBUGINFO_DWARFABBREVIATIONDECLARATION_H
+#ifndef LLVM_DEBUGINFO_DWARFABBREVIATIONDECLARATION_H
+#define LLVM_DEBUGINFO_DWARFABBREVIATIONDECLARATION_H
 
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/DataExtractor.h"
 #include "llvm/Support/Dwarf.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
 namespace llvm {
 
-class DWARFUnit;
 class DWARFFormValue;
+class DWARFUnit;
 class raw_ostream;
 
 class DWARFAbbreviationDeclaration {
@@ -25,6 +30,7 @@ public:
   struct AttributeSpec {
     AttributeSpec(dwarf::Attribute A, dwarf::Form F, Optional<int64_t> V)
         : Attr(A), Form(F), ByteSizeOrValue(V) {}
+
     dwarf::Attribute Attr;
     dwarf::Form Form;
     /// The following field is used for ByteSize for non-implicit_const
@@ -41,9 +47,11 @@ public:
     /// * Form == DW_FORM_implicit_const:
     ///     ByteSizeOrValue contains value for the implicit_const attribute.
     Optional<int64_t> ByteSizeOrValue;
+
     bool isImplicitConst() const {
       return Form == dwarf::DW_FORM_implicit_const;
     }
+
     /// Get the fixed byte size of this Form if possible. This function might
     /// use the DWARFUnit to calculate the size of the Form, like for
     /// DW_AT_address and DW_AT_ref_addr, so this isn't just an accessor for
@@ -118,16 +126,16 @@ private:
   /// abbreviation declaration.
   struct FixedSizeInfo {
     /// The fixed byte size for fixed size forms.
-    uint16_t NumBytes;
+    uint16_t NumBytes = 0;
     /// Number of DW_FORM_address forms in this abbrevation declaration.
-    uint8_t NumAddrs;
+    uint8_t NumAddrs = 0;
     /// Number of DW_FORM_ref_addr forms in this abbrevation declaration.
-    uint8_t NumRefAddrs;
+    uint8_t NumRefAddrs = 0;
     /// Number of 4 byte in DWARF32 and 8 byte in DWARF64 forms.
-    uint8_t NumDwarfOffsets;
-    /// Constructor
-    FixedSizeInfo()
-        : NumBytes(0), NumAddrs(0), NumRefAddrs(0), NumDwarfOffsets(0) {}
+    uint8_t NumDwarfOffsets = 0;
+
+    FixedSizeInfo() = default;
+
     /// Calculate the fixed size in bytes given a DWARFUnit.
     ///
     /// \param U the DWARFUnit to use when determing the byte size.
@@ -147,6 +155,6 @@ private:
   Optional<FixedSizeInfo> FixedAttributeSize;
 };
 
-}
+} // end namespace llvm
 
-#endif
+#endif // LLVM_DEBUGINFO_DWARFABBREVIATIONDECLARATION_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h b/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h
index 63343728fa99a6a10705802a8502371628c4bb45..f95a013d7552381e14254050fe40ef11f09f6617 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h
@@ -1,4 +1,4 @@
-//===--- DWARFAcceleratorTable.h --------------------------------*- C++ -*-===//
+//===- DWARFAcceleratorTable.h ----------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,19 +7,21 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFACCELERATORTABLE_H
-#define LLVM_LIB_DEBUGINFO_DWARFACCELERATORTABLE_H
+#ifndef LLVM_DEBUGINFO_DWARFACCELERATORTABLE_H
+#define LLVM_DEBUGINFO_DWARFACCELERATORTABLE_H
 
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
 #include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
+#include "llvm/Support/DataExtractor.h"
 #include "llvm/Support/Dwarf.h"
 #include <cstdint>
+#include <utility>
 
 namespace llvm {
 
-class DWARFAcceleratorTable {
+class raw_ostream;
 
+class DWARFAcceleratorTable {
   struct Header {
     uint32_t Magic;
     uint16_t Version;
@@ -41,6 +43,7 @@ class DWARFAcceleratorTable {
   DataExtractor AccelSection;
   DataExtractor StringSection;
   const RelocAddrMap& Relocs;
+
 public:
   DWARFAcceleratorTable(DataExtractor AccelSection, DataExtractor StringSection,
                         const RelocAddrMap &Relocs)
@@ -50,6 +53,6 @@ public:
   void dump(raw_ostream &OS) const;
 };
 
-}
+} // end namespace llvm
 
-#endif
+#endif // LLVM_DEBUGINFO_DWARFACCELERATORTABLE_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFAttribute.h b/include/llvm/DebugInfo/DWARF/DWARFAttribute.h
index 0974f827db55afe6ca27bc6b871767983ee2f59a..5919aaddea409857d8d904c869d293c96bb8dc32 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFAttribute.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFAttribute.h
@@ -1,4 +1,4 @@
-//===-- DWARFAttribute.h ----------------------------------------*- C++ -*-===//
+//===- DWARFAttribute.h -----------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,11 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFATTRIBUTE_H
-#define LLVM_LIB_DEBUGINFO_DWARFATTRIBUTE_H
+#ifndef LLVM_DEBUGINFO_DWARFATTRIBUTE_H
+#define LLVM_DEBUGINFO_DWARFATTRIBUTE_H
 
-#include "llvm/Support/Dwarf.h"
 #include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/Support/Dwarf.h"
+#include <cstdint>
 
 namespace llvm {
 
@@ -23,17 +24,16 @@ namespace llvm {
 /// attributes in a DWARFDie.
 struct DWARFAttribute {
   /// The debug info/types offset for this attribute.
-  uint32_t Offset;
+  uint32_t Offset = 0;
   /// The debug info/types section byte size of the data for this attribute.
-  uint32_t ByteSize;
+  uint32_t ByteSize = 0;
   /// The attribute enumeration of this attribute.
   dwarf::Attribute Attr;
   /// The form and value for this attribute.
   DWARFFormValue Value;
   
   DWARFAttribute(uint32_t O, dwarf::Attribute A = dwarf::Attribute(0),
-                 dwarf::Form F = dwarf::Form(0)) :
-      Offset(0), ByteSize(0), Attr(A), Value(F) {}
+                 dwarf::Form F = dwarf::Form(0)) : Attr(A), Value(F) {}
   
   bool isValid() const {
     return Offset != 0 && Attr != dwarf::Attribute(0);
@@ -51,6 +51,6 @@ struct DWARFAttribute {
   }
 };
 
-}
+} // end namespace llvm
 
-#endif
+#endif // LLVM_DEBUGINFO_DWARFATTRIBUTE_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h b/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h
index bba3abe6e9e96ea30974004ad8075ad04649b330..b2a4d247ccc6be27f5a6b9c3fe7fbcea66e79cda 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFCompileUnit.h
@@ -1,4 +1,4 @@
-//===-- DWARFCompileUnit.h --------------------------------------*- C++ -*-===//
+//===- DWARFCompileUnit.h ---------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,10 +7,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFCOMPILEUNIT_H
-#define LLVM_LIB_DEBUGINFO_DWARFCOMPILEUNIT_H
+#ifndef LLVM_DEBUGINFO_DWARFCOMPILEUNIT_H
+#define LLVM_DEBUGINFO_DWARFCOMPILEUNIT_H
 
 #include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
 
 namespace llvm {
 
@@ -23,12 +24,15 @@ public:
                    const DWARFUnitIndex::Entry *Entry)
       : DWARFUnit(Context, Section, DA, RS, SS, SOS, AOS, LS, LE, IsDWO,
                   UnitSection, Entry) {}
-  void dump(raw_ostream &OS);
-  static const DWARFSectionKind Section = DW_SECT_INFO;
+
   // VTable anchor.
   ~DWARFCompileUnit() override;
+
+  void dump(raw_ostream &OS);
+
+  static const DWARFSectionKind Section = DW_SECT_INFO;
 };
 
-}
+} // end namespace llvm
 
-#endif
+#endif // LLVM_DEBUGINFO_DWARFCOMPILEUNIT_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h
index e6c26efbdaba14e46e10738942d8a5921fae20b1..f941cdd1060a58782ce768eb7d73ba56bc02f1b4 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFContext.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h
@@ -1,4 +1,4 @@
-//===-- DWARFContext.h ------------------------------------------*- C++ -*-===//
+//===- DWARFContext.h -------------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===/
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFCONTEXT_H
-#define LLVM_LIB_DEBUGINFO_DWARFCONTEXT_H
+#ifndef LLVM_DEBUGINFO_DWARF_DWARFCONTEXT_H
+#define LLVM_DEBUGINFO_DWARF_DWARFCONTEXT_H
 
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/iterator_range.h"
@@ -31,6 +31,7 @@
 #include "llvm/DebugInfo/DWARF/DWARFUnit.h"
 #include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Host.h"
 #include <cstdint>
 #include <deque>
 #include <map>
@@ -39,6 +40,9 @@
 
 namespace llvm {
 
+class MemoryBuffer;
+class raw_ostream;
+
 // In place of applying the relocations to the data we've read from disk we use
 // a separate mapping table to the side and checking that at locations in the
 // dwarf where we expect relocated values. This adds a bit of complexity to the
@@ -328,20 +332,26 @@ public:
 
   // Sections for DWARF5 split dwarf proposal.
   const DWARFSection &getInfoDWOSection() override { return InfoDWOSection; }
+
   const TypeSectionMap &getTypesDWOSections() override {
     return TypesDWOSections;
   }
+
   StringRef getAbbrevDWOSection() override { return AbbrevDWOSection; }
   const DWARFSection &getLineDWOSection() override { return LineDWOSection; }
   const DWARFSection &getLocDWOSection() override { return LocDWOSection; }
   StringRef getStringDWOSection() override { return StringDWOSection; }
+
   StringRef getStringOffsetDWOSection() override {
     return StringOffsetDWOSection;
   }
+
   StringRef getRangeDWOSection() override { return RangeDWOSection; }
+
   StringRef getAddrSection() override {
     return AddrSection;
   }
+
   StringRef getCUIndexSection() override { return CUIndexSection; }
   StringRef getGdbIndexSection() override { return GdbIndexSection; }
   StringRef getTUIndexSection() override { return TUIndexSection; }
@@ -349,4 +359,4 @@ public:
 
 } // end namespace llvm
 
-#endif // LLVM_LIB_DEBUGINFO_DWARFCONTEXT_H
+#endif // LLVM_DEBUGINFO_DWARF_DWARFCONTEXT_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h b/include/llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h
index f732deef548c8787392cfb6086a57869d0b3ad9e..9f86fe5083896795f90a52d46c61054d569e9b6c 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h
@@ -1,4 +1,4 @@
-//===-- DWARFDebugAbbrev.h --------------------------------------*- C++ -*-===//
+//===- DWARFDebugAbbrev.h ---------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,10 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGABBREV_H
-#define LLVM_LIB_DEBUGINFO_DWARFDEBUGABBREV_H
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGABBREV_H
+#define LLVM_DEBUGINFO_DWARFDEBUGABBREV_H
 
 #include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
+#include "llvm/Support/DataExtractor.h"
+#include <cstdint>
 #include <map>
 #include <vector>
 
@@ -76,6 +78,6 @@ private:
   void clear();
 };
 
-}
+} // end namespace llvm
 
-#endif
+#endif // LLVM_DEBUGINFO_DWARFDEBUGABBREV_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h b/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h
index 5a602392add81472dfa6cfc1ecb86e22bc468066..40eb7e9a88364affaa99d13c2b50218a04bd799e 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h
@@ -1,4 +1,4 @@
-//===-- DWARFDebugArangeSet.h -----------------------------------*- C++ -*-===//
+//===- DWARFDebugArangeSet.h ------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,11 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGARANGESET_H
-#define LLVM_LIB_DEBUGINFO_DWARFDEBUGARANGESET_H
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGARANGESET_H
+#define LLVM_DEBUGINFO_DWARFDEBUGARANGESET_H
 
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/Support/DataExtractor.h"
+#include <cstdint>
 #include <vector>
 
 namespace llvm {
@@ -40,6 +41,7 @@ public:
   struct Descriptor {
     uint64_t Address;
     uint64_t Length;
+
     uint64_t getEndAddress() const { return Address + Length; }
   };
 
@@ -53,6 +55,7 @@ private:
 
 public:
   DWARFDebugArangeSet() { clear(); }
+
   void clear();
   bool extract(DataExtractor data, uint32_t *offset_ptr);
   void dump(raw_ostream &OS) const;
@@ -67,6 +70,6 @@ public:
   }
 };
 
-}
+} // end namespace llvm
 
-#endif
+#endif // LLVM_DEBUGINFO_DWARFDEBUGARANGESET_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h b/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h
index 791f010a8892ba38de4c95d45f66ee49d1178cfb..c06771d6afb4397ca7628099b2112c11649152ba 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugAranges.h
@@ -1,4 +1,4 @@
-//===-- DWARFDebugAranges.h -------------------------------------*- C++ -*-===//
+//===- DWARFDebugAranges.h --------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,11 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGARANGES_H
-#define LLVM_LIB_DEBUGINFO_DWARFDEBUGARANGES_H
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGARANGES_H
+#define LLVM_DEBUGINFO_DWARFDEBUGARANGES_H
 
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/Support/DataExtractor.h"
+#include <cstdint>
 #include <vector>
 
 namespace llvm {
@@ -42,6 +43,7 @@ private:
       else
         Length = HighPC - LowPC;
     }
+
     uint64_t HighPC() const {
       if (Length)
         return LowPC + Length;
@@ -51,6 +53,7 @@ private:
     bool containsAddress(uint64_t Address) const {
       return LowPC <= Address && Address < HighPC();
     }
+
     bool operator<(const Range &other) const {
       return LowPC < other.LowPC;
     }
@@ -73,7 +76,6 @@ private:
     }
   };
 
-
   typedef std::vector<Range>              RangeColl;
   typedef RangeColl::const_iterator       RangeCollIterator;
 
@@ -82,6 +84,6 @@ private:
   DenseSet<uint32_t> ParsedCUOffsets;
 };
 
-}
+} // end namespace llvm
 
-#endif
+#endif // LLVM_DEBUGINFO_DWARFDEBUGARANGES_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h b/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h
index cd76c909ddaef873bfee6fb10704f07f1bd83f66..e0a779bb81823503aeab256d3a401e0a86618d9e 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugFrame.h
@@ -1,4 +1,4 @@
-//===-- DWARFDebugFrame.h - Parsing of .debug_frame -------------*- C++ -*-===//
+//===- DWARFDebugFrame.h - Parsing of .debug_frame --------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,23 +7,24 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGFRAME_H
-#define LLVM_LIB_DEBUGINFO_DWARFDEBUGFRAME_H
+#ifndef LLVM_DEBUGINFO_DWARF_DWARFDEBUGFRAME_H
+#define LLVM_DEBUGINFO_DWARF_DWARFDEBUGFRAME_H
 
 #include "llvm/Support/DataExtractor.h"
-#include "llvm/Support/raw_ostream.h"
 #include <memory>
 #include <vector>
 
 namespace llvm {
 
 class FrameEntry;
+class raw_ostream;
 
 /// \brief A parsed .debug_frame or .eh_frame section
 ///
 class DWARFDebugFrame {
   // True if this is parsing an eh_frame section.
   bool IsEH;
+
 public:
   DWARFDebugFrame(bool IsEH);
   ~DWARFDebugFrame();
@@ -39,7 +40,6 @@ private:
   std::vector<std::unique_ptr<FrameEntry>> Entries;
 };
 
+} // end namespace llvm
 
-} // namespace llvm
-
-#endif
+#endif // LLVM_DEBUGINFO_DWARF_DWARFDEBUGFRAME_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h b/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h
index f36f470980b1a9e27b51932435e99924d32fdd08..fc2423a2708b8d2237c45b5ae36a2f256ef3a3a8 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h
@@ -1,4 +1,4 @@
-//===-- DWARFDebugInfoEntry.h -----------------------------------*- C++ -*-===//
+//===- DWARFDebugInfoEntry.h ------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,43 +7,37 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGINFOENTRY_H
-#define LLVM_LIB_DEBUGINFO_DWARFDEBUGINFOENTRY_H
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGINFOENTRY_H
+#define LLVM_DEBUGINFO_DWARFDEBUGINFOENTRY_H
 
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/DebugInfo/DIContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
-#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
-#include "llvm/Support/DataTypes.h"
 #include "llvm/Support/Dwarf.h"
+#include <cstdint>
 
 namespace llvm {
 
-class DWARFDebugAranges;
-class DWARFCompileUnit;
+class DataExtractor;
 class DWARFUnit;
-class DWARFContext;
-class DWARFFormValue;
-struct DWARFDebugInfoEntryInlinedChain;
 
 /// DWARFDebugInfoEntry - A DIE with only the minimum required data.
 class DWARFDebugInfoEntry {
   /// Offset within the .debug_info of the start of this entry.
-  uint32_t Offset;
+  uint32_t Offset = 0;
 
   /// The integer depth of this DIE within the compile unit DIEs where the
   /// compile/type unit DIE has a depth of zero.
-  uint32_t Depth;
+  uint32_t Depth = 0;
+
+  const DWARFAbbreviationDeclaration *AbbrevDecl = nullptr;
 
-  const DWARFAbbreviationDeclaration *AbbrevDecl;
 public:
-  DWARFDebugInfoEntry()
-    : Offset(0), Depth(0), AbbrevDecl(nullptr) {}
+  DWARFDebugInfoEntry() = default;
 
   /// Extracts a debug info entry, which is a child of a given unit,
   /// starting at a given offset. If DIE can't be extracted, returns false and
   /// doesn't change OffsetPtr.
   bool extractFast(const DWARFUnit &U, uint32_t *OffsetPtr);
+
   /// High performance extraction should use this call.
   bool extractFast(const DWARFUnit &U, uint32_t *OffsetPtr,
                    const DataExtractor &DebugInfoData,
@@ -52,15 +46,18 @@ public:
 
   uint32_t getOffset() const { return Offset; }
   uint32_t getDepth() const { return Depth; }
+
   dwarf::Tag getTag() const {
     return AbbrevDecl ? AbbrevDecl->getTag() : dwarf::DW_TAG_null;
   }
+
   bool hasChildren() const { return AbbrevDecl && AbbrevDecl->hasChildren(); }
+
   const DWARFAbbreviationDeclaration *getAbbreviationDeclarationPtr() const {
     return AbbrevDecl;
   }
 };
 
-}
+} // end namespace llvm
 
-#endif
+#endif // LLVM_DEBUGINFO_DWARFDEBUGINFOENTRY_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h b/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
index 878f1c76ebf6742ae8a3ff714603ec4e4a25b815..e5bb24707b638402869892acb40c5b8bf3142ba3 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
@@ -1,4 +1,4 @@
-//===-- DWARFDebugLine.h ----------------------------------------*- C++ -*-===//
+//===- DWARFDebugLine.h -----------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGLINE_H
-#define LLVM_LIB_DEBUGINFO_DWARFDEBUGLINE_H
+#ifndef LLVM_DEBUGINFO_DWARFDEBUGLINE_H
+#define LLVM_DEBUGINFO_DWARFDEBUGLINE_H
 
 #include "llvm/DebugInfo/DIContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
 #include "llvm/Support/DataExtractor.h"
+#include <cstdint>
 #include <map>
 #include <string>
 #include <vector>
@@ -24,13 +25,14 @@ class raw_ostream;
 class DWARFDebugLine {
 public:
   DWARFDebugLine(const RelocAddrMap* LineInfoRelocMap) : RelocMap(LineInfoRelocMap) {}
+
   struct FileNameEntry {
-    FileNameEntry() : Name(nullptr), DirIdx(0), ModTime(0), Length(0) {}
+    FileNameEntry() = default;
 
-    const char *Name;
-    uint64_t DirIdx;
-    uint64_t ModTime;
-    uint64_t Length;
+    const char *Name = nullptr;
+    uint64_t DirIdx = 0;
+    uint64_t ModTime = 0;
+    uint64_t Length = 0;
   };
 
   struct Prologue {
@@ -64,9 +66,11 @@ public:
     std::vector<FileNameEntry> FileNames;
 
     bool IsDWARF64;
+
     uint32_t sizeofTotalLength() const {
       return IsDWARF64 ? 12 : 4;
     }
+
     uint32_t sizeofPrologueLength() const {
       return IsDWARF64 ? 8 : 4;
     }
@@ -76,10 +80,12 @@ public:
       return PrologueLength + sizeofTotalLength() + sizeof(Version) +
              sizeofPrologueLength();
     }
+
     // Length of the line table data in bytes (not including the prologue).
     uint32_t getStatementTableLength() const {
       return TotalLength + sizeofTotalLength() - getLength();
     }
+
     int32_t getMaxLineIncrementForSpecialOpcode() const {
       return LineBase + (int8_t)LineRange - 1;
     }
@@ -146,6 +152,8 @@ public:
   // compilation unit may consist of multiple sequences, which are not
   // guaranteed to be in the order of ascending instruction address.
   struct Sequence {
+    Sequence();
+
     // Sequence describes instructions at address range [LowPC, HighPC)
     // and is described by line table rows [FirstRowIndex, LastRowIndex).
     uint64_t LowPC;
@@ -154,15 +162,16 @@ public:
     unsigned LastRowIndex;
     bool Empty;
 
-    Sequence();
     void reset();
 
     static bool orderByLowPC(const Sequence& LHS, const Sequence& RHS) {
       return LHS.LowPC < RHS.LowPC;
     }
+
     bool isValid() const {
       return !Empty && (LowPC < HighPC) && (FirstRowIndex < LastRowIndex);
     }
+
     bool containsPC(uint64_t pc) const {
       return (LowPC <= pc && pc < HighPC);
     }
@@ -177,6 +186,7 @@ public:
     void appendRow(const DWARFDebugLine::Row &R) {
       Rows.push_back(R);
     }
+
     void appendSequence(const DWARFDebugLine::Sequence &S) {
       Sequences.push_back(S);
     }
@@ -249,6 +259,7 @@ private:
   const RelocAddrMap *RelocMap;
   LineTableMapTy LineTableMap;
 };
-}
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_DWARFDEBUGLINE_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h b/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
index bd44c2e5aab9b9a43b240af06f817a13b470f79d..6d4cd8d1b5a3c0842deba7248e06eb60c9034f50 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugLoc.h
@@ -1,4 +1,4 @@
-//===-- DWARFDebugLoc.h -----------------------------------------*- C++ -*-===//
+//===- DWARFDebugLoc.h ------------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGLOC_H
-#define LLVM_LIB_DEBUGINFO_DWARFDEBUGLOC_H
+#ifndef LLVM_DEBUGINFO_DWARF_DWARFDEBUGLOC_H
+#define LLVM_DEBUGINFO_DWARF_DWARFDEBUGLOC_H
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
 #include "llvm/Support/DataExtractor.h"
+#include <cstdint>
 
 namespace llvm {
 
@@ -49,8 +50,10 @@ class DWARFDebugLoc {
 
 public:
   DWARFDebugLoc(const RelocAddrMap &LocRelocMap) : RelocMap(LocRelocMap) {}
+
   /// Print the location lists found within the debug_loc section.
   void dump(raw_ostream &OS) const;
+
   /// Parse the debug_loc section accessible via the 'data' parameter using the
   /// specified address size to interpret the address ranges.
   void parse(DataExtractor data, unsigned AddressSize);
@@ -76,6 +79,7 @@ public:
   void parse(DataExtractor data);
   void dump(raw_ostream &OS) const;
 };
-}
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_DWARF_DWARFDEBUGLOC_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h b/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h
index 5a0352dacdb9ee46c4aa63217d9d537988f8981d..85d98b45afcd5f9078119de23e109efd4f7296ff 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugMacro.h
@@ -1,4 +1,4 @@
-//===-- DWARFDebugMacro.h ---------------------------------------*- C++ -*-===//
+//===- DWARFDebugMacro.h ----------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -50,6 +50,7 @@ public:
 
   /// Print the macro list found within the debug_macinfo section.
   void dump(raw_ostream &OS) const;
+
   /// Parse the debug_macinfo section accessible via the 'data' parameter.
   void parse(DataExtractor data);
 };
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h b/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h
index 2b23837e32d69ad2197ad6d5b01cb76af4817795..9d36bb7ad211c5440d5d0ee08f17a781fe756cec 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugPubTable.h
@@ -1,4 +1,4 @@
-//===-- DWARFDebugPubTable.h ------------------------------------*- C++ -*-===//
+//===- DWARFDebugPubTable.h -------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,13 +7,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGPUBTABLE_H
-#define LLVM_LIB_DEBUGINFO_DWARFDEBUGPUBTABLE_H
+#ifndef LLVM_DEBUGINFO_DWARF_DWARFDEBUGPUBTABLE_H
+#define LLVM_DEBUGINFO_DWARF_DWARFDEBUGPUBTABLE_H
 
-#include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/Support/DataExtractor.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Dwarf.h"
+#include <cstdint>
 #include <vector>
 
 namespace llvm {
@@ -28,7 +28,7 @@ public:
     uint32_t SecOffset;
 
     /// An entry of the various gnu_pub* debug sections.
-    llvm::dwarf::PubIndexEntryDescriptor Descriptor;
+    dwarf::PubIndexEntryDescriptor Descriptor;
 
     /// The name of the object as given by the DW_AT_name attribute of the
     /// referenced DIE.
@@ -68,10 +68,12 @@ private:
 
 public:
   DWARFDebugPubTable(StringRef Data, bool LittleEndian, bool GnuStyle);
+
   void dump(StringRef Name, raw_ostream &OS) const;
 
   ArrayRef<Set> getData() { return Sets; }
 };
-}
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_DWARF_DWARFDEBUGPUBTABLE_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
index c930bd603d4d570b91ae31dd8b0e786e17751329..018a049a3ed8184361fb8132fbda37fe94a3e5b9 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
@@ -1,4 +1,4 @@
-//===-- DWARFDebugRangeList.h -----------------------------------*- C++ -*-===//
+//===- DWARFDebugRangeList.h ------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,10 +7,13 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFDEBUGRANGELIST_H
-#define LLVM_LIB_DEBUGINFO_DWARFDEBUGRANGELIST_H
+#ifndef LLVM_DEBUGINFO_DWARF_DWARFDEBUGRANGELIST_H
+#define LLVM_DEBUGINFO_DWARF_DWARFDEBUGRANGELIST_H
 
 #include "llvm/Support/DataExtractor.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
 #include <vector>
 
 namespace llvm {
@@ -34,12 +37,14 @@ public:
     // address past the end of the address range. The ending address must
     // be greater than or equal to the beginning address.
     uint64_t EndAddress;
+
     // The end of any given range list is marked by an end of list entry,
     // which consists of a 0 for the beginning address offset
     // and a 0 for the ending address offset.
     bool isEndOfListEntry() const {
       return (StartAddress == 0) && (EndAddress == 0);
     }
+
     // A base address selection entry consists of:
     // 1. The value of the largest representable address offset
     // (for example, 0xffffffff when the size of an address is 32 bits).
@@ -63,6 +68,7 @@ private:
 
 public:
   DWARFDebugRangeList() { clear(); }
+
   void clear();
   void dump(raw_ostream &OS) const;
   bool extract(DataExtractor data, uint32_t *offset_ptr);
@@ -74,6 +80,6 @@ public:
   DWARFAddressRangesVector getAbsoluteRanges(uint64_t BaseAddress) const;
 };
 
-}  // namespace llvm
+} // end namespace llvm
 
-#endif  // LLVM_DEBUGINFO_DWARFDEBUGRANGELIST_H
+#endif // LLVM_DEBUGINFO_DWARF_DWARFDEBUGRANGELIST_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDie.h b/include/llvm/DebugInfo/DWARF/DWARFDie.h
index 07651dd872ab2ac11f6011f940daeb17190595a0..33e24fe3adc909a0f829fcd7098065f6a82974a5 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDie.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDie.h
@@ -1,4 +1,4 @@
-//===-- DWARFDie.h --------------------------------------------------------===//
+//===- DWARFDie.h -----------------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,20 +7,25 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFDIE_H
-#define LLVM_LIB_DEBUGINFO_DWARFDIE_H
+#ifndef LLVM_DEBUGINFO_DWARFDIE_H
+#define LLVM_DEBUGINFO_DWARFDIE_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/iterator.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/Optional.h"
+#include "llvm/DebugInfo/DIContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFAttribute.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
+#include "llvm/Support/Dwarf.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
 
 namespace llvm {
     
 class DWARFUnit;
-class DWARFDebugInfoEntry;
 class raw_ostream;
   
 //===----------------------------------------------------------------------===//
@@ -36,10 +41,11 @@ class raw_ostream;
 /// also simplifies the attribute extraction calls by not having to specify the
 /// DWARFUnit for each call.
 class DWARFDie {
-  DWARFUnit *U;
-  const DWARFDebugInfoEntry *Die;
+  DWARFUnit *U = nullptr;
+  const DWARFDebugInfoEntry *Die = nullptr;
+
 public:
-  DWARFDie() : U(nullptr), Die(nullptr) {}
+  DWARFDie() = default;
   DWARFDie(DWARFUnit *Unit, const DWARFDebugInfoEntry * D) : U(Unit), Die(D) {}
   
   bool isValid() const { return U && Die; }
@@ -47,7 +53,6 @@ public:
   const DWARFDebugInfoEntry *getDebugInfoEntry() const { return Die; }
   DWARFUnit *getDwarfUnit() const { return U; }
 
-
   /// Get the abbreviation declaration for this DIE.
   ///
   /// \returns the abbreviation declaration or NULL for null tags.
@@ -80,6 +85,7 @@ public:
   bool isNULL() const {
     return getAbbreviationDeclarationPtr() == nullptr;
   }
+
   /// Returns true if DIE represents a subprogram (not inlined).
   bool isSubprogramDIE() const;
 
@@ -140,14 +146,6 @@ public:
   /// exist in this DIE.
   Optional<DWARFFormValue> find(ArrayRef<dwarf::Attribute> Attrs) const;
 
-  /// Extract an attribute value from this DIE and recurse into any
-  /// DW_AT_specification or DW_AT_abstract_origin referenced DIEs.
-  ///
-  /// \param Attr the attribute to extract.
-  /// \returns an optional DWARFFormValue that will have the form value if the
-  /// attribute was successfully extracted.
-  Optional<DWARFFormValue> findRecursively(dwarf::Attribute Attr) const;
-
   /// Extract the first value of any attribute in Attrs from this DIE and
   /// recurse into any DW_AT_specification or DW_AT_abstract_origin referenced
   /// DIEs.
@@ -233,6 +231,12 @@ public:
   /// references if necessary. Returns null if no name is found.
   const char *getName(DINameKind Kind) const;
   
+  /// Returns the declaration line (start line) for a DIE, assuming it specifies
+  /// a subprogram. This may be fetched from specification or abstract origin
+  /// for this subprogram by resolving DW_AT_specification or
+  /// DW_AT_abstract_origin references if necessary.
+  uint64_t getDeclLine() const;
+
   /// Retrieves values of DW_AT_call_file, DW_AT_call_line and DW_AT_call_column
   /// from DIE (or zeroes if they are missing). This function looks for
   /// DW_AT_call attributes in this DIE only, it will not resolve the attribute
@@ -283,16 +287,17 @@ class DWARFDie::attribute_iterator :
   /// error will be set if the Err member variable is non-NULL and the iterator
   /// will be set to the end value so iteration stops.
   void updateForIndex(const DWARFAbbreviationDeclaration &AbbrDecl, uint32_t I);
+
 public:
   attribute_iterator() = delete;
   explicit attribute_iterator(DWARFDie D, bool End);
+
   attribute_iterator &operator++();
   explicit operator bool() const { return AttrValue.isValid(); }
   const DWARFAttribute &operator*() const { return AttrValue; }
   bool operator==(const attribute_iterator &X) const { return Index == X.Index; }
 };
 
-  
 inline bool operator==(const DWARFDie &LHS, const DWARFDie &RHS) {
   return LHS.getDebugInfoEntry() == RHS.getDebugInfoEntry() &&
       LHS.getDwarfUnit() == RHS.getDwarfUnit();
@@ -312,16 +317,19 @@ class DWARFDie::iterator : public iterator_facade_base<iterator,
   }
 public:
   iterator() = default;
+
   explicit iterator(DWARFDie D) : Die(D) {
     // If we start out with only a Null DIE then invalidate.
     skipNull();
   }
+
   iterator &operator++() {
     Die = Die.getSibling();
     // Don't include the NULL die when iterating.
     skipNull();
     return *this;
   }
+
   explicit operator bool() const { return Die.isValid(); }
   const DWARFDie &operator*() const { return Die; }
   bool operator==(const iterator &X) const { return Die == X.Die; }
@@ -343,4 +351,4 @@ inline iterator_range<DWARFDie::iterator> DWARFDie::children() const {
 
 } // end namespace llvm
 
-#endif  // LLVM_LIB_DEBUGINFO_DWARFDIE_H
+#endif // LLVM_DEBUGINFO_DWARFDIE_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
index 878dcab5801d12846dc39beb9f24866f6a75aa20..c8d7a0c1ac7a3014f53979af387d3e0b0c2f833e 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFFormValue.h
@@ -1,4 +1,4 @@
-//===-- DWARFFormValue.h ----------------------------------------*- C++ -*-===//
+//===- DWARFFormValue.h -----------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,13 +11,14 @@
 #define LLVM_DEBUGINFO_DWARFFORMVALUE_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/Support/DataExtractor.h"
 #include "llvm/Support/Dwarf.h"
+#include <cstdint>
 
 namespace llvm {
 
-template <typename T> class ArrayRef;
 class DWARFUnit;
 class raw_ostream;
 
@@ -38,7 +39,7 @@ public:
 
 private:
   struct ValueType {
-    ValueType() : data(nullptr) {
+    ValueType() {
       uval = 0;
     }
 
@@ -47,24 +48,27 @@ private:
       int64_t sval;
       const char* cstr;
     };
-    const uint8_t* data;
+    const uint8_t* data = nullptr;
   };
 
   dwarf::Form Form; // Form for this value.
   ValueType Value; // Contains all data for the form.
-  const DWARFUnit *U; // Remember the DWARFUnit at extract time.
+  const DWARFUnit *U = nullptr; // Remember the DWARFUnit at extract time.
 
 public:
-  DWARFFormValue(dwarf::Form F = dwarf::Form(0)) : Form(F), U(nullptr) {}
+  DWARFFormValue(dwarf::Form F = dwarf::Form(0)) : Form(F) {}
+
   dwarf::Form getForm() const { return Form; }
   void setForm(dwarf::Form F) { Form = F; }
   void setUValue(uint64_t V) { Value.uval = V; }
   void setSValue(int64_t V) { Value.sval = V; }
   void setPValue(const char *V) { Value.cstr = V; }
+
   void setBlockValue(const ArrayRef<uint8_t> &Data) {
     Value.data = Data.data();
     setUValue(Data.size());
   }
+
   bool isFormClass(FormClass FC) const;
   const DWARFUnit *getUnit() const { return U; }
   void dump(raw_ostream &OS) const;
@@ -77,6 +81,7 @@ public:
   /// \returns whether the extraction succeeded.
   bool extractValue(const DataExtractor &Data, uint32_t *OffsetPtr,
                     const DWARFUnit *U);
+
   bool isInlinedCStr() const {
     return Value.data != nullptr && Value.data == (const uint8_t*)Value.cstr;
   }
@@ -92,6 +97,7 @@ public:
   Optional<ArrayRef<uint8_t>> getAsBlock() const;
   Optional<uint64_t> getAsCStringOffset() const;
   Optional<uint64_t> getAsReferenceUVal() const;
+
   /// Get the fixed byte size for a given form.
   ///
   /// If the form always has a fixed valid byte size that doesn't depend on a
@@ -110,6 +116,7 @@ public:
   /// and was needed to calculate the byte size.
   static Optional<uint8_t> getFixedByteSize(dwarf::Form Form,
                                             const DWARFUnit *U = nullptr);
+
   /// Get the fixed byte size for a given form.
   ///
   /// If the form has a fixed byte size given a valid DWARF version and address
@@ -138,6 +145,7 @@ public:
   /// \returns true on success, false if the form was not skipped.
   bool skipValue(DataExtractor debug_info_data, uint32_t *offset_ptr,
                  const DWARFUnit *U) const;
+
   /// Skip a form in \p debug_info_data at offset specified by \p offset_ptr.
   ///
   /// Skips the bytes for this form in the debug info and updates the offset.
@@ -150,6 +158,7 @@ public:
   /// \returns true on success, false if the form was not skipped.
   static bool skipValue(dwarf::Form form, DataExtractor debug_info_data,
                         uint32_t *offset_ptr, const DWARFUnit *U);
+
   /// Skip a form in \p debug_info_data at offset specified by \p offset_ptr.
   ///
   /// Skips the bytes for this form in the debug info and updates the offset.
@@ -170,6 +179,7 @@ private:
 };
 
 namespace dwarf {
+
   /// Take an optional DWARFFormValue and try to extract a string value from it.
   ///
   /// \param V and optional DWARFFormValue to attempt to extract the value from.
@@ -316,6 +326,6 @@ namespace dwarf {
 
 } // end namespace dwarf
 
-}
+} // end namespace llvm
 
-#endif
+#endif // LLVM_DEBUGINFO_DWARFFORMVALUE_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h b/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h
index 66041be9656616b345889c8a1955890194278dfc..7a52218663b9d3a9a13a4bc91be1a4011d58a11e 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFGdbIndex.h
@@ -1,4 +1,4 @@
-//===-- DWARFGdbIndex.h -----------------------------------------*- C++ -*-===//
+//===- DWARFGdbIndex.h ------------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,14 +7,19 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFGDBINDEX_H
-#define LLVM_LIB_DEBUGINFO_DWARFGDBINDEX_H
+#ifndef LLVM_DEBUGINFO_DWARF_DWARFGDBINDEX_H
+#define LLVM_DEBUGINFO_DWARF_DWARFGDBINDEX_H
 
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/DataExtractor.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+#include <utility>
 
 namespace llvm {
+
+class raw_ostream;
+
 class DWARFGdbIndex {
   uint32_t Version;
 
@@ -63,6 +68,7 @@ public:
   bool HasContent = false;
   bool HasError = false;
 };
-}
 
-#endif // LLVM_LIB_DEBUGINFO_DWARFGDBINDEX_H
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_DWARF_DWARFGDBINDEX_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h b/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
index d7fe3032e505489594a0903d178147673e36f02b..af01bddeed153db52083c4fe6f549d47a363ae78 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
@@ -1,4 +1,4 @@
-//===-- DWARFRelocMap.h -----------------------------------------*- C++ -*-===//
+//===- DWARFRelocMap.h ------------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,16 +7,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFRELOCMAP_H
-#define LLVM_LIB_DEBUGINFO_DWARFRELOCMAP_H
+#ifndef LLVM_DEBUGINFO_DWARF_DWARFRELOCMAP_H
+#define LLVM_DEBUGINFO_DWARF_DWARFRELOCMAP_H
 
 #include "llvm/ADT/DenseMap.h"
+#include <cstdint>
+#include <utility>
 
 namespace llvm {
 
-typedef DenseMap<uint64_t, std::pair<uint8_t, int64_t> > RelocAddrMap;
+typedef DenseMap<uint64_t, std::pair<uint8_t, int64_t>> RelocAddrMap;
 
-} // namespace llvm
-
-#endif
+} // end namespace llvm
 
+#endif // LLVM_DEBUGINFO_DWARF_DWARFRELOCMAP_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFSection.h b/include/llvm/DebugInfo/DWARF/DWARFSection.h
index 3e27b529e97bfb048fabde52f4d6c9275f5143cc..2b8a53a4c93ea661d00470d0a74db3dccc598b10 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFSection.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFSection.h
@@ -1,4 +1,4 @@
-//===-- DWARFSection.h ------------------------------------------*- C++ -*-===//
+//===- DWARFSection.h -------------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,11 +7,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFSECTION_H
-#define LLVM_LIB_DEBUGINFO_DWARFSECTION_H
+#ifndef LLVM_DEBUGINFO_DWARF_DWARFSECTION_H
+#define LLVM_DEBUGINFO_DWARF_DWARFSECTION_H
 
-#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
 
 namespace llvm {
 
@@ -20,6 +20,6 @@ struct DWARFSection {
   RelocAddrMap Relocs;
 };
 
-}
+} // end namespace llvm
 
-#endif
+#endif // LLVM_DEBUGINFO_DWARF_DWARFSECTION_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h b/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h
index 4f1e1292a1f1b5aa53b345e04b431d75c0b27967..703316005887cc241a912d86640e2f61e6480239 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFTypeUnit.h
@@ -1,4 +1,4 @@
-//===-- DWARFTypeUnit.h -----------------------------------------*- C++ -*-===//
+//===- DWARFTypeUnit.h ------------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +7,27 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFTYPEUNIT_H
-#define LLVM_LIB_DEBUGINFO_DWARFTYPEUNIT_H
+#ifndef LLVM_DEBUGINFO_DWARF_DWARFTYPEUNIT_H
+#define LLVM_DEBUGINFO_DWARF_DWARFTYPEUNIT_H
 
+#include "llvm/ADT/StringRef.h"
 #include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
+#include "llvm/Support/DataExtractor.h"
+#include <cstdint>
 
 namespace llvm {
 
+class DWARFContext;
+class DWARFDebugAbbrev;
+struct DWARFSection;
+class raw_ostream;
+
 class DWARFTypeUnit : public DWARFUnit {
 private:
   uint64_t TypeHash;
   uint32_t TypeOffset;
+
 public:
   DWARFTypeUnit(DWARFContext &Context, const DWARFSection &Section,
                 const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
@@ -26,9 +36,11 @@ public:
                 const DWARFUnitIndex::Entry *Entry)
       : DWARFUnit(Context, Section, DA, RS, SS, SOS, AOS, LS, LE, IsDWO,
                   UnitSection, Entry) {}
+
   uint32_t getHeaderSize() const override {
     return DWARFUnit::getHeaderSize() + 12;
   }
+
   void dump(raw_ostream &OS, bool Brief = false);
   static const DWARFSectionKind Section = DW_SECT_TYPES;
 
@@ -36,7 +48,6 @@ protected:
   bool extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) override;
 };
 
-}
-
-#endif
+} // end namespace llvm
 
+#endif // LLVM_DEBUGINFO_DWARF_DWARFTYPEUNIT_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnit.h b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
index db7b59be90c2aeb8bda9fd6b24e0a0262b0bf0e0..40eb4434bd61e2918b3c4fb56f446074c7d89427 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFUnit.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFUnit.h
@@ -1,4 +1,4 @@
-//===-- DWARFUnit.h ---------------------------------------------*- C++ -*-===//
+//===- DWARFUnit.h ----------------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,32 +7,37 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFUNIT_H
-#define LLVM_LIB_DEBUGINFO_DWARFUNIT_H
+#ifndef LLVM_DEBUGINFO_DWARF_DWARFUNIT_H
+#define LLVM_DEBUGINFO_DWARF_DWARFUNIT_H
 
-#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/iterator_range.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
 #include "llvm/DebugInfo/DWARF/DWARFDie.h"
 #include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
 #include "llvm/DebugInfo/DWARF/DWARFSection.h"
 #include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Dwarf.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
 #include <vector>
 
 namespace llvm {
 
-namespace object {
-class ObjectFile;
-}
-
+class DWARFAbbreviationDeclarationSet;
 class DWARFContext;
 class DWARFDebugAbbrev;
 class DWARFUnit;
-class StringRef;
-class raw_ostream;
 
 /// Base class for all DWARFUnitSection classes. This provides the
 /// functionality common to all unit types.
@@ -47,12 +52,12 @@ public:
                 DWARFUnitIndex *Index = nullptr);
 
 protected:
+  ~DWARFUnitSectionBase() = default;
+
   virtual void parseImpl(DWARFContext &Context, const DWARFSection &Section,
                          const DWARFDebugAbbrev *DA, StringRef RS, StringRef SS,
                          StringRef SOS, StringRef AOS, StringRef LS,
                          bool isLittleEndian, bool isDWO) = 0;
-
-  ~DWARFUnitSectionBase() = default;
 };
 
 const DWARFUnitIndex &getDWARFUnitIndex(DWARFContext &Context,
@@ -65,7 +70,7 @@ class DWARFUnitSection final : public SmallVector<std::unique_ptr<UnitType>, 1>,
   bool Parsed = false;
 
 public:
-  typedef llvm::SmallVectorImpl<std::unique_ptr<UnitType>> UnitVector;
+  typedef SmallVectorImpl<std::unique_ptr<UnitType>> UnitVector;
   typedef typename UnitVector::iterator iterator;
   typedef llvm::iterator_range<typename UnitVector::iterator> iterator_range;
 
@@ -122,8 +127,9 @@ class DWARFUnit {
 
   uint32_t Offset;
   uint32_t Length;
-  uint16_t Version;
   const DWARFAbbreviationDeclarationSet *Abbrevs;
+  uint16_t Version;
+  uint8_t UnitType;
   uint8_t AddrSize;
   uint64_t BaseAddr;
   // The compile unit debug information entry items.
@@ -134,9 +140,11 @@ class DWARFUnit {
   class DWOHolder {
     object::OwningBinary<object::ObjectFile> DWOFile;
     std::unique_ptr<DWARFContext> DWOContext;
-    DWARFUnit *DWOU;
+    DWARFUnit *DWOU = nullptr;
+
   public:
     DWOHolder(StringRef DWOPath);
+
     DWARFUnit *getUnit() const { return DWOU; }
   };
   std::unique_ptr<DWOHolder> DWO;
@@ -151,8 +159,9 @@ class DWARFUnit {
 
 protected:
   virtual bool extractImpl(DataExtractor debug_info, uint32_t *offset_ptr);
+
   /// Size in bytes of the unit header.
-  virtual uint32_t getHeaderSize() const { return 11; }
+  virtual uint32_t getHeaderSize() const { return Version <= 4 ? 11 : 12; }
 
 public:
   DWARFUnit(DWARFContext &Context, const DWARFSection &Section,
@@ -168,10 +177,12 @@ public:
   StringRef getLineSection() const { return LineSection; }
   StringRef getStringSection() const { return StringSection; }
   StringRef getStringOffsetSection() const { return StringOffsetSection; }
+
   void setAddrOffsetSection(StringRef AOS, uint32_t Base) {
     AddrOffsetSection = AOS;
     AddrOffsetSectionBase = Base;
   }
+
   void setRangesSection(StringRef RS, uint32_t Base) {
     RangeSection = RS;
     RangeSectionBase = Base;
@@ -184,6 +195,7 @@ public:
   DataExtractor getDebugInfoExtractor() const {
     return DataExtractor(InfoSection.Data, isLittleEndian, AddrSize);
   }
+
   DataExtractor getStringExtractor() const {
     return DataExtractor(StringSection, false, 0);
   }
@@ -202,23 +214,30 @@ public:
   uint32_t getNextUnitOffset() const { return Offset + Length + 4; }
   uint32_t getLength() const { return Length; }
   uint16_t getVersion() const { return Version; }
+
   dwarf::DwarfFormat getFormat() const {
     return dwarf::DwarfFormat::DWARF32; // FIXME: Support DWARF64.
   }
+
   const DWARFAbbreviationDeclarationSet *getAbbreviations() const {
     return Abbrevs;
   }
+
+  uint8_t getUnitType() const { return UnitType; }
   uint8_t getAddressByteSize() const { return AddrSize; }
+
   uint8_t getRefAddrByteSize() const {
     if (Version == 2)
       return AddrSize;
     return getDwarfOffsetByteSize();
   }
+
   uint8_t getDwarfOffsetByteSize() const {
     if (getFormat() == dwarf::DwarfFormat::DWARF64)
       return 8;
     return 4;
   }
+
   uint64_t getBaseAddress() const { return BaseAddr; }
 
   void setBaseAddress(uint64_t base_addr) {
@@ -308,9 +327,11 @@ private:
   /// extractDIEsIfNeeded - Parses a compile unit and indexes its DIEs if it
   /// hasn't already been done. Returns the number of DIEs parsed at this call.
   size_t extractDIEsIfNeeded(bool CUDieOnly);
+
   /// extractDIEsToVector - Appends all parsed DIEs to a vector.
   void extractDIEsToVector(bool AppendCUDie, bool AppendNonCUDIEs,
                            std::vector<DWARFDebugInfoEntry> &DIEs) const;
+
   /// clearDIEs - Clear parsed DIEs to keep memory usage low.
   void clearDIEs(bool KeepCUDie);
 
@@ -324,6 +345,6 @@ private:
   DWARFDie getSubprogramForAddress(uint64_t Address);
 };
 
-}
+} // end namespace llvm
 
-#endif
+#endif // LLVM_DEBUGINFO_DWARF_DWARFUNIT_H
diff --git a/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h b/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h
index 9f051cd7081c65078fe33596ef1f62e5d58e2a4a..8e2ce023695bf31a3e04033178b99ca29a337556 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFUnitIndex.h
@@ -1,4 +1,4 @@
-//===-- DWARFUnitIndex.h --------------------------------------------------===//
+//===- DWARFUnitIndex.h -----------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +7,19 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_DEBUGINFO_DWARFUNITINDEX_H
-#define LLVM_LIB_DEBUGINFO_DWARFUNITINDEX_H
+#ifndef LLVM_DEBUGINFO_DWARF_DWARFUNITINDEX_H
+#define LLVM_DEBUGINFO_DWARF_DWARFUNITINDEX_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/DataExtractor.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
 #include <cstdint>
+#include <memory>
 
 namespace llvm {
 
+class raw_ostream;
+
 enum DWARFSectionKind {
   DW_SECT_INFO = 1,
   DW_SECT_TYPES,
@@ -57,9 +59,11 @@ public:
   public:
     const SectionContribution *getOffset(DWARFSectionKind Sec) const;
     const SectionContribution *getOffset() const;
+
     const SectionContribution *getOffsets() const {
       return Contributions.get();
     }
+
     uint64_t getSignature() const { return Signature; }
   };
 
@@ -72,21 +76,26 @@ private:
   std::unique_ptr<Entry[]> Rows;
 
   static StringRef getColumnHeader(DWARFSectionKind DS);
+
   bool parseImpl(DataExtractor IndexData);
 
 public:
-  bool parse(DataExtractor IndexData);
   DWARFUnitIndex(DWARFSectionKind InfoColumnKind)
       : InfoColumnKind(InfoColumnKind) {}
+
+  bool parse(DataExtractor IndexData);
   void dump(raw_ostream &OS) const;
   const Entry *getFromOffset(uint32_t Offset) const;
+
   ArrayRef<DWARFSectionKind> getColumnKinds() const {
     return makeArrayRef(ColumnKinds.get(), Header.NumColumns);
   }
+
   ArrayRef<Entry> getRows() const {
     return makeArrayRef(Rows.get(), Header.NumBuckets);
   }
 };
-}
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_DWARF_DWARFUNITINDEX_H
diff --git a/include/llvm/DebugInfo/MSF/ByteStream.h b/include/llvm/DebugInfo/MSF/ByteStream.h
deleted file mode 100644
index 547844be5e5d46ff3d8503f002d84cb2511bab22..0000000000000000000000000000000000000000
--- a/include/llvm/DebugInfo/MSF/ByteStream.h
+++ /dev/null
@@ -1,169 +0,0 @@
-//===- ByteStream.h - Reads stream data from a byte sequence ----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_DEBUGINFO_MSF_BYTESTREAM_H
-#define LLVM_DEBUGINFO_MSF_BYTESTREAM_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/DebugInfo/MSF/MSFError.h"
-#include "llvm/DebugInfo/MSF/StreamInterface.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/FileOutputBuffer.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include <algorithm>
-#include <cstdint>
-#include <cstring>
-#include <memory>
-
-namespace llvm {
-namespace msf {
-
-class ByteStream : public ReadableStream {
-public:
-  ByteStream() = default;
-  explicit ByteStream(ArrayRef<uint8_t> Data) : Data(Data) {}
-  explicit ByteStream(StringRef Data)
-      : Data(Data.bytes_begin(), Data.bytes_end()) {}
-
-  Error readBytes(uint32_t Offset, uint32_t Size,
-                  ArrayRef<uint8_t> &Buffer) const override {
-    if (Offset > Data.size())
-      return make_error<MSFError>(msf_error_code::insufficient_buffer);
-    if (Data.size() < Size + Offset)
-      return make_error<MSFError>(msf_error_code::insufficient_buffer);
-    Buffer = Data.slice(Offset, Size);
-    return Error::success();
-  }
-
-  Error readLongestContiguousChunk(uint32_t Offset,
-                                   ArrayRef<uint8_t> &Buffer) const override {
-    if (Offset >= Data.size())
-      return make_error<MSFError>(msf_error_code::insufficient_buffer);
-    Buffer = Data.slice(Offset);
-    return Error::success();
-  }
-
-  uint32_t getLength() const override { return Data.size(); }
-
-  ArrayRef<uint8_t> data() const { return Data; }
-
-  StringRef str() const {
-    const char *CharData = reinterpret_cast<const char *>(Data.data());
-    return StringRef(CharData, Data.size());
-  }
-
-protected:
-  ArrayRef<uint8_t> Data;
-};
-
-// MemoryBufferByteStream behaves like a read-only ByteStream, but has its data
-// backed by an llvm::MemoryBuffer.  It also owns the underlying MemoryBuffer.
-class MemoryBufferByteStream : public ByteStream {
-public:
-  explicit MemoryBufferByteStream(std::unique_ptr<MemoryBuffer> Buffer)
-      : ByteStream(ArrayRef<uint8_t>(Buffer->getBuffer().bytes_begin(),
-                                     Buffer->getBuffer().bytes_end())),
-        MemBuffer(std::move(Buffer)) {}
-
-  std::unique_ptr<MemoryBuffer> MemBuffer;
-};
-
-class MutableByteStream : public WritableStream {
-public:
-  MutableByteStream() = default;
-  explicit MutableByteStream(MutableArrayRef<uint8_t> Data)
-      : Data(Data), ImmutableStream(Data) {}
-
-  Error readBytes(uint32_t Offset, uint32_t Size,
-                  ArrayRef<uint8_t> &Buffer) const override {
-    return ImmutableStream.readBytes(Offset, Size, Buffer);
-  }
-
-  Error readLongestContiguousChunk(uint32_t Offset,
-                                   ArrayRef<uint8_t> &Buffer) const override {
-    return ImmutableStream.readLongestContiguousChunk(Offset, Buffer);
-  }
-
-  uint32_t getLength() const override { return ImmutableStream.getLength(); }
-
-  Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Buffer) const override {
-    if (Buffer.empty())
-      return Error::success();
-
-    if (Data.size() < Buffer.size())
-      return make_error<MSFError>(msf_error_code::insufficient_buffer);
-    if (Offset > Buffer.size() - Data.size())
-      return make_error<MSFError>(msf_error_code::insufficient_buffer);
-
-    uint8_t *DataPtr = const_cast<uint8_t *>(Data.data());
-    ::memcpy(DataPtr + Offset, Buffer.data(), Buffer.size());
-    return Error::success();
-  }
-
-  Error commit() const override { return Error::success(); }
-
-  MutableArrayRef<uint8_t> data() const { return Data; }
-
-private:
-  MutableArrayRef<uint8_t> Data;
-  ByteStream ImmutableStream;
-};
-
-// A simple adapter that acts like a ByteStream but holds ownership over
-// and underlying FileOutputBuffer.
-class FileBufferByteStream : public WritableStream {
-private:
-  class StreamImpl : public MutableByteStream {
-  public:
-    StreamImpl(std::unique_ptr<FileOutputBuffer> Buffer)
-        : MutableByteStream(MutableArrayRef<uint8_t>(Buffer->getBufferStart(),
-                                                     Buffer->getBufferEnd())),
-          FileBuffer(std::move(Buffer)) {}
-
-    Error commit() const override {
-      if (FileBuffer->commit())
-        return llvm::make_error<MSFError>(msf_error_code::not_writable);
-      return Error::success();
-    }
-
-  private:
-    std::unique_ptr<FileOutputBuffer> FileBuffer;
-  };
-
-public:
-  explicit FileBufferByteStream(std::unique_ptr<FileOutputBuffer> Buffer)
-      : Impl(std::move(Buffer)) {}
-
-  Error readBytes(uint32_t Offset, uint32_t Size,
-                  ArrayRef<uint8_t> &Buffer) const override {
-    return Impl.readBytes(Offset, Size, Buffer);
-  }
-
-  Error readLongestContiguousChunk(uint32_t Offset,
-                                   ArrayRef<uint8_t> &Buffer) const override {
-    return Impl.readLongestContiguousChunk(Offset, Buffer);
-  }
-
-  uint32_t getLength() const override { return Impl.getLength(); }
-
-  Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Data) const override {
-    return Impl.writeBytes(Offset, Data);
-  }
-
-  Error commit() const override { return Impl.commit(); }
-
-private:
-  StreamImpl Impl;
-};
-
-} // end namespace msf
-} // end namespace llvm
-
-#endif // LLVM_DEBUGINFO_MSF_BYTESTREAM_H
diff --git a/include/llvm/DebugInfo/MSF/MappedBlockStream.h b/include/llvm/DebugInfo/MSF/MappedBlockStream.h
index fff4e9cecef5ecc0fee167e8f4b0cfd65fd377e7..c91f6f725c80685398ac51a425f8e62bf754d3e0 100644
--- a/include/llvm/DebugInfo/MSF/MappedBlockStream.h
+++ b/include/llvm/DebugInfo/MSF/MappedBlockStream.h
@@ -15,8 +15,9 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/DebugInfo/MSF/MSFStreamLayout.h"
-#include "llvm/DebugInfo/MSF/StreamInterface.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/BinaryStream.h"
+#include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include <cstdint>
@@ -37,29 +39,33 @@ struct MSFLayout;
 /// the MSF.  MappedBlockStream provides methods for reading from and writing
 /// to one of these streams transparently, as if it were a contiguous sequence
 /// of bytes.
-class MappedBlockStream : public ReadableStream {
+class MappedBlockStream : public BinaryStream {
   friend class WritableMappedBlockStream;
 public:
   static std::unique_ptr<MappedBlockStream>
   createStream(uint32_t BlockSize, uint32_t NumBlocks,
-               const MSFStreamLayout &Layout, const ReadableStream &MsfData);
+               const MSFStreamLayout &Layout, BinaryStreamRef MsfData);
 
   static std::unique_ptr<MappedBlockStream>
-  createIndexedStream(const MSFLayout &Layout, const ReadableStream &MsfData,
+  createIndexedStream(const MSFLayout &Layout, BinaryStreamRef MsfData,
                       uint32_t StreamIndex);
 
   static std::unique_ptr<MappedBlockStream>
-  createFpmStream(const MSFLayout &Layout, const ReadableStream &MsfData);
+  createFpmStream(const MSFLayout &Layout, BinaryStreamRef MsfData);
 
   static std::unique_ptr<MappedBlockStream>
-  createDirectoryStream(const MSFLayout &Layout, const ReadableStream &MsfData);
+  createDirectoryStream(const MSFLayout &Layout, BinaryStreamRef MsfData);
+
+  llvm::support::endianness getEndian() const override {
+    return llvm::support::little;
+  }
 
   Error readBytes(uint32_t Offset, uint32_t Size,
-                  ArrayRef<uint8_t> &Buffer) const override;
+                  ArrayRef<uint8_t> &Buffer) override;
   Error readLongestContiguousChunk(uint32_t Offset,
-                                   ArrayRef<uint8_t> &Buffer) const override;
+                                   ArrayRef<uint8_t> &Buffer) override;
 
-  uint32_t getLength() const override;
+  uint32_t getLength() override;
 
   uint32_t getNumBytesCopied() const;
 
@@ -74,51 +80,56 @@ public:
 protected:
   MappedBlockStream(uint32_t BlockSize, uint32_t NumBlocks,
                     const MSFStreamLayout &StreamLayout,
-                    const ReadableStream &MsfData);
+                    BinaryStreamRef MsfData);
 
 private:
   const MSFStreamLayout &getStreamLayout() const { return StreamLayout; }
   void fixCacheAfterWrite(uint32_t Offset, ArrayRef<uint8_t> Data) const;
 
-  Error readBytes(uint32_t Offset, MutableArrayRef<uint8_t> Buffer) const;
+  Error readBytes(uint32_t Offset, MutableArrayRef<uint8_t> Buffer);
   bool tryReadContiguously(uint32_t Offset, uint32_t Size,
-                           ArrayRef<uint8_t> &Buffer) const;
+                           ArrayRef<uint8_t> &Buffer);
 
   const uint32_t BlockSize;
   const uint32_t NumBlocks;
   const MSFStreamLayout StreamLayout;
-  const ReadableStream &MsfData;
+  BinaryStreamRef MsfData;
 
   typedef MutableArrayRef<uint8_t> CacheEntry;
-  mutable llvm::BumpPtrAllocator Pool;
-  mutable DenseMap<uint32_t, std::vector<CacheEntry>> CacheMap;
+  llvm::BumpPtrAllocator Pool;
+  DenseMap<uint32_t, std::vector<CacheEntry>> CacheMap;
 };
 
-class WritableMappedBlockStream : public WritableStream {
+class WritableMappedBlockStream : public WritableBinaryStream {
 public:
   static std::unique_ptr<WritableMappedBlockStream>
   createStream(uint32_t BlockSize, uint32_t NumBlocks,
-               const MSFStreamLayout &Layout, const WritableStream &MsfData);
+               const MSFStreamLayout &Layout, WritableBinaryStreamRef MsfData);
 
   static std::unique_ptr<WritableMappedBlockStream>
-  createIndexedStream(const MSFLayout &Layout, const WritableStream &MsfData,
+  createIndexedStream(const MSFLayout &Layout, WritableBinaryStreamRef MsfData,
                       uint32_t StreamIndex);
 
   static std::unique_ptr<WritableMappedBlockStream>
-  createDirectoryStream(const MSFLayout &Layout, const WritableStream &MsfData);
+  createDirectoryStream(const MSFLayout &Layout,
+                        WritableBinaryStreamRef MsfData);
 
   static std::unique_ptr<WritableMappedBlockStream>
-  createFpmStream(const MSFLayout &Layout, const WritableStream &MsfData);
+  createFpmStream(const MSFLayout &Layout, WritableBinaryStreamRef MsfData);
+
+  llvm::support::endianness getEndian() const override {
+    return llvm::support::little;
+  }
 
   Error readBytes(uint32_t Offset, uint32_t Size,
-                  ArrayRef<uint8_t> &Buffer) const override;
+                  ArrayRef<uint8_t> &Buffer) override;
   Error readLongestContiguousChunk(uint32_t Offset,
-                                   ArrayRef<uint8_t> &Buffer) const override;
-  uint32_t getLength() const override;
+                                   ArrayRef<uint8_t> &Buffer) override;
+  uint32_t getLength() override;
 
-  Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Buffer) const override;
+  Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Buffer) override;
 
-  Error commit() const override;
+  Error commit() override;
 
   const MSFStreamLayout &getStreamLayout() const {
     return ReadInterface.getStreamLayout();
@@ -130,12 +141,12 @@ public:
 protected:
   WritableMappedBlockStream(uint32_t BlockSize, uint32_t NumBlocks,
                             const MSFStreamLayout &StreamLayout,
-                            const WritableStream &MsfData);
+                            WritableBinaryStreamRef MsfData);
 
 private:
   MappedBlockStream ReadInterface;
 
-  const WritableStream &WriteInterface;
+  WritableBinaryStreamRef WriteInterface;
 };
 
 } // end namespace pdb
diff --git a/include/llvm/DebugInfo/MSF/StreamInterface.h b/include/llvm/DebugInfo/MSF/StreamInterface.h
deleted file mode 100644
index 09782d8e3b303f65262e9dbfab66cbf1526fdeeb..0000000000000000000000000000000000000000
--- a/include/llvm/DebugInfo/MSF/StreamInterface.h
+++ /dev/null
@@ -1,53 +0,0 @@
-//===- StreamInterface.h - Base interface for a stream of data --*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_DEBUGINFO_MSF_STREAMINTERFACE_H
-#define LLVM_DEBUGINFO_MSF_STREAMINTERFACE_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/Support/Error.h"
-#include <cstdint>
-
-namespace llvm {
-namespace msf {
-
-class ReadableStream {
-public:
-  virtual ~ReadableStream() = default;
-
-  // Given an offset into the stream and a number of bytes, attempt to read
-  // the bytes and set the output ArrayRef to point to a reference into the
-  // stream, without copying any data.
-  virtual Error readBytes(uint32_t Offset, uint32_t Size,
-                          ArrayRef<uint8_t> &Buffer) const = 0;
-
-  // Given an offset into the stream, read as much as possible without copying
-  // any data.
-  virtual Error readLongestContiguousChunk(uint32_t Offset,
-                                           ArrayRef<uint8_t> &Buffer) const = 0;
-
-  virtual uint32_t getLength() const = 0;
-};
-
-class WritableStream : public ReadableStream {
-public:
-  ~WritableStream() override = default;
-
-  // Attempt to write the given bytes into the stream at the desired offset.
-  // This will always necessitate a copy.  Cannot shrink or grow the stream,
-  // only writes into existing allocated space.
-  virtual Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Data) const = 0;
-
-  virtual Error commit() const = 0;
-};
-
-} // end namespace msf
-} // end namespace llvm
-
-#endif // LLVM_DEBUGINFO_MSF_STREAMINTERFACE_H
diff --git a/include/llvm/DebugInfo/MSF/StreamReader.h b/include/llvm/DebugInfo/MSF/StreamReader.h
deleted file mode 100644
index fc2ca78dc18f493ef2682b220e46f91f1f8997a8..0000000000000000000000000000000000000000
--- a/include/llvm/DebugInfo/MSF/StreamReader.h
+++ /dev/null
@@ -1,121 +0,0 @@
-//===- StreamReader.h - Reads bytes and objects from a stream ---*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_DEBUGINFO_MSF_STREAMREADER_H
-#define LLVM_DEBUGINFO_MSF_STREAMREADER_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/DebugInfo/MSF/MSFError.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamInterface.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
-#include "llvm/Support/Endian.h"
-#include "llvm/Support/Error.h"
-
-#include <string>
-
-namespace llvm {
-namespace msf {
-
-class StreamReader {
-public:
-  StreamReader(ReadableStreamRef Stream);
-
-  Error readLongestContiguousChunk(ArrayRef<uint8_t> &Buffer);
-  Error readBytes(ArrayRef<uint8_t> &Buffer, uint32_t Size);
-  Error readInteger(uint8_t &Dest);
-  Error readInteger(uint16_t &Dest);
-  Error readInteger(uint32_t &Dest);
-  Error readInteger(uint64_t &Dest);
-  Error readInteger(int8_t &Dest);
-  Error readInteger(int16_t &Dest);
-  Error readInteger(int32_t &Dest);
-  Error readInteger(int64_t &Dest);
-  Error readZeroString(StringRef &Dest);
-  Error readFixedString(StringRef &Dest, uint32_t Length);
-  Error readStreamRef(ReadableStreamRef &Ref);
-  Error readStreamRef(ReadableStreamRef &Ref, uint32_t Length);
-
-  template <typename T> Error readEnum(T &Dest) {
-    typename std::underlying_type<T>::type N;
-    if (auto EC = readInteger(N))
-      return EC;
-    Dest = static_cast<T>(N);
-    return Error::success();
-  }
-
-  template <typename T> Error readObject(const T *&Dest) {
-    ArrayRef<uint8_t> Buffer;
-    if (auto EC = readBytes(Buffer, sizeof(T)))
-      return EC;
-    Dest = reinterpret_cast<const T *>(Buffer.data());
-    return Error::success();
-  }
-
-  template <typename T>
-  Error readArray(ArrayRef<T> &Array, uint32_t NumElements) {
-    ArrayRef<uint8_t> Bytes;
-    if (NumElements == 0) {
-      Array = ArrayRef<T>();
-      return Error::success();
-    }
-
-    if (NumElements > UINT32_MAX / sizeof(T))
-      return make_error<MSFError>(msf_error_code::insufficient_buffer);
-
-    if (auto EC = readBytes(Bytes, NumElements * sizeof(T)))
-      return EC;
-    Array = ArrayRef<T>(reinterpret_cast<const T *>(Bytes.data()), NumElements);
-    return Error::success();
-  }
-
-  template <typename T, typename U>
-  Error readArray(VarStreamArray<T, U> &Array, uint32_t Size) {
-    ReadableStreamRef S;
-    if (auto EC = readStreamRef(S, Size))
-      return EC;
-    Array = VarStreamArray<T, U>(S, Array.getExtractor());
-    return Error::success();
-  }
-
-  template <typename T>
-  Error readArray(FixedStreamArray<T> &Array, uint32_t NumItems) {
-    if (NumItems == 0) {
-      Array = FixedStreamArray<T>();
-      return Error::success();
-    }
-    uint32_t Length = NumItems * sizeof(T);
-    if (Length / sizeof(T) != NumItems)
-      return make_error<MSFError>(msf_error_code::invalid_format);
-    if (Offset + Length > Stream.getLength())
-      return make_error<MSFError>(msf_error_code::insufficient_buffer);
-    ReadableStreamRef View = Stream.slice(Offset, Length);
-    Array = FixedStreamArray<T>(View);
-    Offset += Length;
-    return Error::success();
-  }
-
-  bool empty() const { return bytesRemaining() == 0; }
-  void setOffset(uint32_t Off) { Offset = Off; }
-  uint32_t getOffset() const { return Offset; }
-  uint32_t getLength() const { return Stream.getLength(); }
-  uint32_t bytesRemaining() const { return getLength() - getOffset(); }
-
-  Error skip(uint32_t Amount);
-
-  uint8_t peek() const;
-
-private:
-  ReadableStreamRef Stream;
-  uint32_t Offset;
-};
-} // namespace msf
-} // namespace llvm
-
-#endif // LLVM_DEBUGINFO_MSF_STREAMREADER_H
diff --git a/include/llvm/DebugInfo/MSF/StreamRef.h b/include/llvm/DebugInfo/MSF/StreamRef.h
deleted file mode 100644
index eee71e53a39b003d38e6d8a5824b715de3b9eec4..0000000000000000000000000000000000000000
--- a/include/llvm/DebugInfo/MSF/StreamRef.h
+++ /dev/null
@@ -1,135 +0,0 @@
-//===- StreamRef.h - A copyable reference to a stream -----------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_DEBUGINFO_MSF_STREAMREF_H
-#define LLVM_DEBUGINFO_MSF_STREAMREF_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/DebugInfo/MSF/MSFError.h"
-#include "llvm/DebugInfo/MSF/StreamInterface.h"
-#include "llvm/Support/Error.h"
-#include <algorithm>
-#include <cstdint>
-
-namespace llvm {
-namespace msf {
-
-template <class StreamType, class RefType> class StreamRefBase {
-public:
-  StreamRefBase() : Stream(nullptr), ViewOffset(0), Length(0) {}
-  StreamRefBase(const StreamType &Stream, uint32_t Offset, uint32_t Length)
-      : Stream(&Stream), ViewOffset(Offset), Length(Length) {}
-
-  uint32_t getLength() const { return Length; }
-  const StreamType *getStream() const { return Stream; }
-
-  RefType drop_front(uint32_t N) const {
-    if (!Stream)
-      return RefType();
-
-    N = std::min(N, Length);
-    return RefType(*Stream, ViewOffset + N, Length - N);
-  }
-
-  RefType keep_front(uint32_t N) const {
-    if (!Stream)
-      return RefType();
-    N = std::min(N, Length);
-    return RefType(*Stream, ViewOffset, N);
-  }
-
-  RefType slice(uint32_t Offset, uint32_t Len) const {
-    return drop_front(Offset).keep_front(Len);
-  }
-
-  bool operator==(const RefType &Other) const {
-    if (Stream != Other.Stream)
-      return false;
-    if (ViewOffset != Other.ViewOffset)
-      return false;
-    if (Length != Other.Length)
-      return false;
-    return true;
-  }
-
-protected:
-  const StreamType *Stream;
-  uint32_t ViewOffset;
-  uint32_t Length;
-};
-
-class ReadableStreamRef
-    : public StreamRefBase<ReadableStream, ReadableStreamRef> {
-public:
-  ReadableStreamRef() = default;
-  ReadableStreamRef(const ReadableStream &Stream)
-      : StreamRefBase(Stream, 0, Stream.getLength()) {}
-  ReadableStreamRef(const ReadableStream &Stream, uint32_t Offset,
-                    uint32_t Length)
-      : StreamRefBase(Stream, Offset, Length) {}
-
-  // Use StreamRef.slice() instead.
-  ReadableStreamRef(const ReadableStreamRef &S, uint32_t Offset,
-                    uint32_t Length) = delete;
-
-  Error readBytes(uint32_t Offset, uint32_t Size,
-                  ArrayRef<uint8_t> &Buffer) const {
-    if (ViewOffset + Offset < Offset)
-      return make_error<MSFError>(msf_error_code::insufficient_buffer);
-    if (Size + Offset > Length)
-      return make_error<MSFError>(msf_error_code::insufficient_buffer);
-    return Stream->readBytes(ViewOffset + Offset, Size, Buffer);
-  }
-
-  // Given an offset into the stream, read as much as possible without copying
-  // any data.
-  Error readLongestContiguousChunk(uint32_t Offset,
-                                   ArrayRef<uint8_t> &Buffer) const {
-    if (Offset >= Length)
-      return make_error<MSFError>(msf_error_code::insufficient_buffer);
-
-    if (auto EC = Stream->readLongestContiguousChunk(Offset, Buffer))
-      return EC;
-    // This StreamRef might refer to a smaller window over a larger stream.  In
-    // that case we will have read out more bytes than we should return, because
-    // we should not read past the end of the current view.
-    uint32_t MaxLength = Length - Offset;
-    if (Buffer.size() > MaxLength)
-      Buffer = Buffer.slice(0, MaxLength);
-    return Error::success();
-  }
-};
-
-class WritableStreamRef
-    : public StreamRefBase<WritableStream, WritableStreamRef> {
-public:
-  WritableStreamRef() = default;
-  WritableStreamRef(const WritableStream &Stream)
-      : StreamRefBase(Stream, 0, Stream.getLength()) {}
-  WritableStreamRef(const WritableStream &Stream, uint32_t Offset,
-                    uint32_t Length)
-      : StreamRefBase(Stream, Offset, Length) {}
-
-  // Use StreamRef.slice() instead.
-  WritableStreamRef(const WritableStreamRef &S, uint32_t Offset,
-                    uint32_t Length) = delete;
-
-  Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Data) const {
-    if (Data.size() + Offset > Length)
-      return make_error<MSFError>(msf_error_code::insufficient_buffer);
-    return Stream->writeBytes(ViewOffset + Offset, Data);
-  }
-
-  Error commit() const { return Stream->commit(); }
-};
-
-} // end namespace msf
-} // end namespace llvm
-
-#endif // LLVM_DEBUGINFO_MSF_STREAMREF_H
diff --git a/include/llvm/DebugInfo/MSF/StreamWriter.h b/include/llvm/DebugInfo/MSF/StreamWriter.h
deleted file mode 100644
index 2bb14434dd83c026e532be406bbc0ddca73f8283..0000000000000000000000000000000000000000
--- a/include/llvm/DebugInfo/MSF/StreamWriter.h
+++ /dev/null
@@ -1,92 +0,0 @@
-//===- StreamWriter.h - Writes bytes and objects to a stream ----*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_DEBUGINFO_MSF_STREAMWRITER_H
-#define LLVM_DEBUGINFO_MSF_STREAMWRITER_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/DebugInfo/MSF/MSFError.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
-#include "llvm/Support/Error.h"
-#include <cstdint>
-#include <type_traits>
-
-namespace llvm {
-namespace msf {
-
-class StreamWriter {
-public:
-  StreamWriter() = default;
-  explicit StreamWriter(WritableStreamRef Stream);
-
-  Error writeBytes(ArrayRef<uint8_t> Buffer);
-  Error writeInteger(uint8_t Int);
-  Error writeInteger(uint16_t Dest);
-  Error writeInteger(uint32_t Dest);
-  Error writeInteger(uint64_t Dest);
-  Error writeInteger(int8_t Int);
-  Error writeInteger(int16_t Dest);
-  Error writeInteger(int32_t Dest);
-  Error writeInteger(int64_t Dest);
-  Error writeZeroString(StringRef Str);
-  Error writeFixedString(StringRef Str);
-  Error writeStreamRef(ReadableStreamRef Ref);
-  Error writeStreamRef(ReadableStreamRef Ref, uint32_t Size);
-
-  template <typename T> Error writeEnum(T Num) {
-    return writeInteger(
-        static_cast<typename std::underlying_type<T>::type>(Num));
-  }
-
-  template <typename T> Error writeObject(const T &Obj) {
-    static_assert(!std::is_pointer<T>::value,
-                  "writeObject should not be used with pointers, to write "
-                  "the pointed-to value dereference the pointer before calling "
-                  "writeObject");
-    return writeBytes(
-        ArrayRef<uint8_t>(reinterpret_cast<const uint8_t *>(&Obj), sizeof(T)));
-  }
-
-  template <typename T> Error writeArray(ArrayRef<T> Array) {
-    if (Array.empty())
-      return Error::success();
-
-    if (Array.size() > UINT32_MAX / sizeof(T))
-      return make_error<MSFError>(msf_error_code::insufficient_buffer);
-
-    return writeBytes(
-        ArrayRef<uint8_t>(reinterpret_cast<const uint8_t *>(Array.data()),
-                          Array.size() * sizeof(T)));
-  }
-
-  template <typename T, typename U>
-  Error writeArray(VarStreamArray<T, U> Array) {
-    return writeStreamRef(Array.getUnderlyingStream());
-  }
-
-  template <typename T> Error writeArray(FixedStreamArray<T> Array) {
-    return writeStreamRef(Array.getUnderlyingStream());
-  }
-
-  void setOffset(uint32_t Off) { Offset = Off; }
-  uint32_t getOffset() const { return Offset; }
-  uint32_t getLength() const { return Stream.getLength(); }
-  uint32_t bytesRemaining() const { return getLength() - getOffset(); }
-
-private:
-  WritableStreamRef Stream;
-  uint32_t Offset = 0;
-};
-
-} // end namespace msf
-} // end namespace llvm
-
-#endif // LLVM_DEBUGINFO_MSF_STREAMWRITER_H
diff --git a/include/llvm/DebugInfo/PDB/DIA/DIASession.h b/include/llvm/DebugInfo/PDB/DIA/DIASession.h
index 3f5818631e7bc1c9c1c56a868ab18fdf9da3b7a6..350442556bef88b176fbd45ee3b738bee6ae89f6 100644
--- a/include/llvm/DebugInfo/PDB/DIA/DIASession.h
+++ b/include/llvm/DebugInfo/PDB/DIA/DIASession.h
@@ -31,7 +31,7 @@ public:
 
   uint64_t getLoadAddress() const override;
   void setLoadAddress(uint64_t Address) override;
-  std::unique_ptr<PDBSymbolExe> getGlobalScope() const override;
+  std::unique_ptr<PDBSymbolExe> getGlobalScope() override;
   std::unique_ptr<PDBSymbol> getSymbolById(uint32_t SymbolId) const override;
 
   std::unique_ptr<PDBSymbol>
diff --git a/include/llvm/DebugInfo/PDB/IPDBSession.h b/include/llvm/DebugInfo/PDB/IPDBSession.h
index 15e97ac198e55114c218b796426f0bbb140d00ee..696736a907a6f260edec5d357e22e0d6a3eb801a 100644
--- a/include/llvm/DebugInfo/PDB/IPDBSession.h
+++ b/include/llvm/DebugInfo/PDB/IPDBSession.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_DEBUGINFO_PDB_IPDBSESSION_H
 #define LLVM_DEBUGINFO_PDB_IPDBSESSION_H
 
+#include "PDBSymbol.h"
 #include "PDBTypes.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Casting.h"
@@ -28,7 +29,7 @@ public:
 
   virtual uint64_t getLoadAddress() const = 0;
   virtual void setLoadAddress(uint64_t Address) = 0;
-  virtual std::unique_ptr<PDBSymbolExe> getGlobalScope() const = 0;
+  virtual std::unique_ptr<PDBSymbolExe> getGlobalScope() = 0;
   virtual std::unique_ptr<PDBSymbol> getSymbolById(uint32_t SymbolId) const = 0;
 
   template <typename T>
diff --git a/include/llvm/DebugInfo/PDB/Native/DbiStream.h b/include/llvm/DebugInfo/PDB/Native/DbiStream.h
index 06e4515ba474de08d72fca8f9c2ab1c28f2b8fd3..f49f5aaefacadda7784f9c170afe51d22425255c 100644
--- a/include/llvm/DebugInfo/PDB/Native/DbiStream.h
+++ b/include/llvm/DebugInfo/PDB/Native/DbiStream.h
@@ -12,13 +12,13 @@
 
 #include "llvm/DebugInfo/CodeView/ModuleSubstream.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
 #include "llvm/DebugInfo/PDB/Native/ModInfo.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/DebugInfo/PDB/Native/StringTable.h"
 #include "llvm/DebugInfo/PDB/PDBTypes.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 
@@ -70,11 +70,11 @@ public:
 
   Expected<StringRef> getFileNameForIndex(uint32_t Index) const;
 
-  msf::FixedStreamArray<object::coff_section> getSectionHeaders();
+  FixedStreamArray<object::coff_section> getSectionHeaders();
 
-  msf::FixedStreamArray<object::FpoData> getFpoRecords();
+  FixedStreamArray<object::FpoData> getFpoRecords();
 
-  msf::FixedStreamArray<SecMapEntry> getSectionMap() const;
+  FixedStreamArray<SecMapEntry> getSectionMap() const;
   void visitSectionContributions(ISectionContribVisitor &Visitor) const;
 
 private:
@@ -91,28 +91,28 @@ private:
   std::vector<ModuleInfoEx> ModuleInfos;
   StringTable ECNames;
 
-  msf::ReadableStreamRef ModInfoSubstream;
-  msf::ReadableStreamRef SecContrSubstream;
-  msf::ReadableStreamRef SecMapSubstream;
-  msf::ReadableStreamRef FileInfoSubstream;
-  msf::ReadableStreamRef TypeServerMapSubstream;
-  msf::ReadableStreamRef ECSubstream;
+  BinaryStreamRef ModInfoSubstream;
+  BinaryStreamRef SecContrSubstream;
+  BinaryStreamRef SecMapSubstream;
+  BinaryStreamRef FileInfoSubstream;
+  BinaryStreamRef TypeServerMapSubstream;
+  BinaryStreamRef ECSubstream;
 
-  msf::ReadableStreamRef NamesBuffer;
+  BinaryStreamRef NamesBuffer;
 
-  msf::FixedStreamArray<support::ulittle16_t> DbgStreams;
+  FixedStreamArray<support::ulittle16_t> DbgStreams;
 
   PdbRaw_DbiSecContribVer SectionContribVersion;
-  msf::FixedStreamArray<SectionContrib> SectionContribs;
-  msf::FixedStreamArray<SectionContrib2> SectionContribs2;
-  msf::FixedStreamArray<SecMapEntry> SectionMap;
-  msf::FixedStreamArray<support::little32_t> FileNameOffsets;
+  FixedStreamArray<SectionContrib> SectionContribs;
+  FixedStreamArray<SectionContrib2> SectionContribs2;
+  FixedStreamArray<SecMapEntry> SectionMap;
+  FixedStreamArray<support::little32_t> FileNameOffsets;
 
   std::unique_ptr<msf::MappedBlockStream> SectionHeaderStream;
-  msf::FixedStreamArray<object::coff_section> SectionHeaders;
+  FixedStreamArray<object::coff_section> SectionHeaders;
 
   std::unique_ptr<msf::MappedBlockStream> FpoStream;
-  msf::FixedStreamArray<object::FpoData> FpoRecords;
+  FixedStreamArray<object::FpoData> FpoRecords;
 
   const DbiStreamHeader *Header;
 };
diff --git a/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h b/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h
index 05af1ef06ed9415c71fd81a5dae4ce86a483b9f9..16426bd93847a685e5eb34b0ba32433377919028 100644
--- a/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h
+++ b/include/llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h
@@ -14,11 +14,11 @@
 #include "llvm/ADT/StringSet.h"
 #include "llvm/Support/Error.h"
 
-#include "llvm/DebugInfo/MSF/ByteStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
 #include "llvm/DebugInfo/PDB/PDBTypes.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Endian.h"
 
 namespace llvm {
@@ -31,11 +31,13 @@ struct coff_section;
 namespace pdb {
 class DbiStream;
 struct DbiStreamHeader;
+class ModInfoBuilder;
 class PDBFile;
 
 class DbiStreamBuilder {
 public:
   DbiStreamBuilder(msf::MSFBuilder &Msf);
+  ~DbiStreamBuilder();
 
   DbiStreamBuilder(const DbiStreamBuilder &) = delete;
   DbiStreamBuilder &operator=(const DbiStreamBuilder &) = delete;
@@ -55,12 +57,12 @@ public:
 
   uint32_t calculateSerializedLength() const;
 
-  Error addModuleInfo(StringRef ObjFile, StringRef Module);
+  Expected<ModInfoBuilder &> addModuleInfo(StringRef ModuleName);
   Error addModuleSourceFile(StringRef Module, StringRef File);
 
   Error finalizeMsfLayout();
 
-  Error commit(const msf::MSFLayout &Layout, const msf::WritableStream &Buffer);
+  Error commit(const msf::MSFLayout &Layout, WritableBinaryStreamRef MsfBuffer);
 
   // A helper function to create Section Contributions from COFF input
   // section headers.
@@ -88,12 +90,6 @@ private:
   Error generateModiSubstream();
   Error generateFileInfoSubstream();
 
-  struct ModuleInfo {
-    std::vector<StringRef> SourceFiles;
-    StringRef Obj;
-    StringRef Mod;
-  };
-
   msf::MSFBuilder &Msf;
   BumpPtrAllocator &Allocator;
 
@@ -107,14 +103,13 @@ private:
 
   const DbiStreamHeader *Header;
 
-  StringMap<std::unique_ptr<ModuleInfo>> ModuleInfos;
-  std::vector<ModuleInfo *> ModuleInfoList;
+  StringMap<std::unique_ptr<ModInfoBuilder>> ModiMap;
+  std::vector<ModInfoBuilder *> ModiList;
 
   StringMap<uint32_t> SourceFileNames;
 
-  msf::WritableStreamRef NamesBuffer;
-  msf::MutableByteStream ModInfoBuffer;
-  msf::MutableByteStream FileInfoBuffer;
+  WritableBinaryStreamRef NamesBuffer;
+  MutableBinaryByteStream FileInfoBuffer;
   ArrayRef<SectionContrib> SectionContribs;
   ArrayRef<SecMapEntry> SectionMap;
   llvm::SmallVector<DebugStream, (int)DbgHeaderType::Max> DbgStreams;
diff --git a/include/llvm/DebugInfo/PDB/Native/Formatters.h b/include/llvm/DebugInfo/PDB/Native/Formatters.h
new file mode 100644
index 0000000000000000000000000000000000000000..183f0ad8307e39b05d0dcdfa3b7d265e5d18cb9f
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/Native/Formatters.h
@@ -0,0 +1,52 @@
+//===- Formatters.h ---------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_NATIVE_FORMATTERS_H
+#define LLVM_DEBUGINFO_PDB_NATIVE_FORMATTERS_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/CodeView/Formatters.h"
+#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/FormatProviders.h"
+
+#define FORMAT_CASE(Value, Name)                                               \
+  case Value:                                                                  \
+    Stream << Name;                                                            \
+    break;
+
+namespace llvm {
+template <> struct format_provider<pdb::PDB_UniqueId> {
+  static void format(const pdb::PDB_UniqueId &V, llvm::raw_ostream &Stream,
+                     StringRef Style) {
+    codeview::fmt_guid(V.Guid).format(Stream, Style);
+  }
+};
+
+template <> struct format_provider<pdb::PdbRaw_ImplVer> {
+  static void format(const pdb::PdbRaw_ImplVer &V, llvm::raw_ostream &Stream,
+                     StringRef Style) {
+    switch (V) { // Style is not consulted; a short version name is written.
+      FORMAT_CASE(pdb::PdbRaw_ImplVer::PdbImplVC110, "VC110")
+      FORMAT_CASE(pdb::PdbRaw_ImplVer::PdbImplVC140, "VC140")
+      FORMAT_CASE(pdb::PdbRaw_ImplVer::PdbImplVC2, "VC2")
+      FORMAT_CASE(pdb::PdbRaw_ImplVer::PdbImplVC4, "VC4")
+      FORMAT_CASE(pdb::PdbRaw_ImplVer::PdbImplVC41, "VC41")
+      FORMAT_CASE(pdb::PdbRaw_ImplVer::PdbImplVC50, "VC50")
+      FORMAT_CASE(pdb::PdbRaw_ImplVer::PdbImplVC70, "VC70")
+      FORMAT_CASE(pdb::PdbRaw_ImplVer::PdbImplVC70Dep, "VC70Dep")
+      FORMAT_CASE(pdb::PdbRaw_ImplVer::PdbImplVC80, "VC80")
+      FORMAT_CASE(pdb::PdbRaw_ImplVer::PdbImplVC98, "VC98")
+    } // No default: a value matching none of the cases writes nothing.
+  }
+};
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_NATIVE_FORMATTERS_H
diff --git a/include/llvm/DebugInfo/PDB/Native/GlobalsStream.h b/include/llvm/DebugInfo/PDB/Native/GlobalsStream.h
index 06596cb020c8d8d16426582ad89a7a64515d51f7..dcea3d3be0ab802fd9295a4dbd3322c28f5870fb 100644
--- a/include/llvm/DebugInfo/PDB/Native/GlobalsStream.h
+++ b/include/llvm/DebugInfo/PDB/Native/GlobalsStream.h
@@ -11,10 +11,10 @@
 #define LLVM_DEBUGINFO_PDB_RAW_GLOBALS_STREAM_H
 
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/DebugInfo/PDB/PDBTypes.h"
+#include "llvm/Support/BinaryStreamArray.h"
 #include "llvm/Support/Error.h"
 
 namespace llvm {
@@ -27,15 +27,15 @@ public:
   explicit GlobalsStream(std::unique_ptr<msf::MappedBlockStream> Stream);
   ~GlobalsStream();
   Error commit();
-  msf::FixedStreamArray<support::ulittle32_t> getHashBuckets() const {
+  FixedStreamArray<support::ulittle32_t> getHashBuckets() const {
     return HashBuckets;
   }
   uint32_t getNumBuckets() const { return NumBuckets; }
   Error reload();
 
 private:
-  msf::FixedStreamArray<support::ulittle32_t> HashBuckets;
-  msf::FixedStreamArray<PSHashRecord> HashRecords;
+  FixedStreamArray<support::ulittle32_t> HashBuckets;
+  FixedStreamArray<PSHashRecord> HashRecords;
   uint32_t NumBuckets;
   std::unique_ptr<msf::MappedBlockStream> Stream;
 };
diff --git a/include/llvm/DebugInfo/PDB/Native/HashTable.h b/include/llvm/DebugInfo/PDB/Native/HashTable.h
index 793df32858dad28e6431163261a0db8851f1b638..46eefa968e523949d1e8f3f10f9c245d510d4977 100644
--- a/include/llvm/DebugInfo/PDB/Native/HashTable.h
+++ b/include/llvm/DebugInfo/PDB/Native/HashTable.h
@@ -14,9 +14,9 @@
 #include "llvm/ADT/SparseBitVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/iterator.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/MathExtras.h"
@@ -42,10 +42,10 @@ public:
   HashTable();
   explicit HashTable(uint32_t Capacity);
 
-  Error load(msf::StreamReader &Stream);
+  Error load(BinaryStreamReader &Stream);
 
   uint32_t calculateSerializedLength() const;
-  Error commit(msf::StreamWriter &Writer) const;
+  Error commit(BinaryStreamWriter &Writer) const;
 
   void clear();
 
@@ -71,9 +71,9 @@ private:
   static uint32_t maxLoad(uint32_t capacity);
   void grow();
 
-  static Error readSparseBitVector(msf::StreamReader &Stream,
+  static Error readSparseBitVector(BinaryStreamReader &Stream,
                                    SparseBitVector<> &V);
-  static Error writeSparseBitVector(msf::StreamWriter &Writer,
+  static Error writeSparseBitVector(BinaryStreamWriter &Writer,
                                     SparseBitVector<> &Vec);
 };
 
diff --git a/include/llvm/DebugInfo/PDB/Native/InfoStream.h b/include/llvm/DebugInfo/PDB/Native/InfoStream.h
index 0b59d9e789dbbfeebe83541eae54cb5da7c988df..1c38c2b6194fc1f7ae81677ed0cb95e6b6cce422 100644
--- a/include/llvm/DebugInfo/PDB/Native/InfoStream.h
+++ b/include/llvm/DebugInfo/PDB/Native/InfoStream.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_DEBUGINFO_PDB_RAW_PDBINFOSTREAM_H
 #define LLVM_DEBUGINFO_PDB_RAW_PDBINFOSTREAM_H
 
+#include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
 #include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h"
@@ -32,10 +33,16 @@ public:
 
   Error reload();
 
+  uint32_t getStreamSize() const;
+
   PdbRaw_ImplVer getVersion() const;
   uint32_t getSignature() const;
   uint32_t getAge() const;
   PDB_UniqueId getGuid() const;
+  uint32_t getNamedStreamMapByteSize() const;
+
+  PdbRaw_Features getFeatures() const;
+  ArrayRef<PdbRaw_FeatureSig> getFeatureSignatures() const;
 
   const NamedStreamMap &getNamedStreams() const;
 
@@ -63,6 +70,11 @@ private:
   // universally unique.
   PDB_UniqueId Guid;
 
+  std::vector<PdbRaw_FeatureSig> FeatureSignatures;
+  PdbRaw_Features Features = PdbFeatureNone;
+
+  uint32_t NamedStreamMapByteSize = 0;
+
   NamedStreamMap NamedStreams;
 };
 }
diff --git a/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h b/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h
index 1b182bc65aa1d5feeaba0dffe4995d6febd10dbd..90c28a90d25233c80f2b37132c127694a8f681ae 100644
--- a/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h
+++ b/include/llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h
@@ -19,9 +19,10 @@
 #include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 namespace llvm {
+class WritableBinaryStreamRef;
+
 namespace msf {
 class MSFBuilder;
-class StreamWriter;
 }
 namespace pdb {
 class PDBFile;
@@ -37,17 +38,19 @@ public:
   void setSignature(uint32_t S);
   void setAge(uint32_t A);
   void setGuid(PDB_UniqueId G);
+  void addFeature(PdbRaw_FeatureSig Sig);
 
   uint32_t finalize();
 
   Error finalizeMsfLayout();
 
   Error commit(const msf::MSFLayout &Layout,
-               const msf::WritableStream &Buffer) const;
+               WritableBinaryStreamRef Buffer) const;
 
 private:
   msf::MSFBuilder &Msf;
 
+  std::vector<PdbRaw_FeatureSig> Features;
   PdbRaw_ImplVer Ver;
   uint32_t Sig;
   uint32_t Age;
diff --git a/include/llvm/DebugInfo/PDB/Native/ModInfo.h b/include/llvm/DebugInfo/PDB/Native/ModInfo.h
index d81d4c20ed1a6b5f76b9ee0759261af5bf22d9e8..d26d0d6184496951ac0ff2e2b7af4c5e78c28d0e 100644
--- a/include/llvm/DebugInfo/PDB/Native/ModInfo.h
+++ b/include/llvm/DebugInfo/PDB/Native/ModInfo.h
@@ -11,9 +11,9 @@
 #define LLVM_DEBUGINFO_PDB_RAW_MODINFO_H
 
 #include "llvm/ADT/StringRef.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Error.h"
 #include <cstdint>
 #include <vector>
@@ -30,7 +30,7 @@ public:
   ModInfo(const ModInfo &Info);
   ~ModInfo();
 
-  static Error initialize(msf::ReadableStreamRef Stream, ModInfo &Info);
+  static Error initialize(BinaryStreamRef Stream, ModInfo &Info);
 
   bool hasECInfo() const;
   uint16_t getTypeServerIndex() const;
@@ -63,10 +63,8 @@ struct ModuleInfoEx {
 
 } // end namespace pdb
 
-namespace msf {
-
 template <> struct VarStreamArrayExtractor<pdb::ModInfo> {
-  Error operator()(ReadableStreamRef Stream, uint32_t &Length,
+  Error operator()(BinaryStreamRef Stream, uint32_t &Length,
                    pdb::ModInfo &Info) const {
     if (auto EC = pdb::ModInfo::initialize(Stream, Info))
       return EC;
@@ -75,8 +73,6 @@ template <> struct VarStreamArrayExtractor<pdb::ModInfo> {
   }
 };
 
-} // end namespace msf
-
 } // end namespace llvm
 
 #endif // LLVM_DEBUGINFO_PDB_RAW_MODINFO_H
diff --git a/include/llvm/DebugInfo/PDB/Native/ModInfoBuilder.h b/include/llvm/DebugInfo/PDB/Native/ModInfoBuilder.h
new file mode 100644
index 0000000000000000000000000000000000000000..605fd2483c3b8ad411a9835dabe930972339fad9
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/Native/ModInfoBuilder.h
@@ -0,0 +1,83 @@
+//===- ModInfoBuilder.h - PDB module information ----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_RAW_MODINFOBUILDER_H
+#define LLVM_DEBUGINFO_PDB_RAW_MODINFOBUILDER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamRef.h"
+#include "llvm/Support/Error.h"
+#include <cstdint>
+#include <string>
+#include <vector>
+
+namespace llvm {
+class BinaryStreamWriter;
+
+namespace msf {
+class MSFBuilder;
+struct MSFLayout;
+}
+namespace pdb {
+
+/// Collects the module name, object file name, source file paths and symbol
+/// records for one module. DbiStreamBuilder is declared a friend so that it
+/// can reach the private members of this class.
+class ModInfoBuilder {
+  friend class DbiStreamBuilder;
+
+public:
+  ModInfoBuilder(StringRef ModuleName, uint32_t ModIndex, msf::MSFBuilder &Msf);
+
+  // Copying is disabled; the builder holds a reference to the MSFBuilder.
+  ModInfoBuilder(const ModInfoBuilder &) = delete;
+  ModInfoBuilder &operator=(const ModInfoBuilder &) = delete;
+
+  void setObjFileName(StringRef Name);
+  void addSymbol(codeview::CVSymbol Symbol);
+
+  uint16_t getStreamIndex() const;
+  StringRef getModuleName() const { return ModuleName; }
+  StringRef getObjFileName() const { return ObjFileName; }
+
+  /// Returns a non-owning view over the internal list of source file paths;
+  /// the view refers to storage owned by this builder.
+  ArrayRef<std::string> source_files() const {
+    return makeArrayRef(SourceFiles);
+  }
+
+  uint32_t calculateSerializedLength() const;
+
+  void finalize();
+  Error finalizeMsfLayout();
+
+  Error commit(BinaryStreamWriter &ModiWriter, const msf::MSFLayout &MsfLayout,
+               WritableBinaryStreamRef MsfBuffer);
+
+private:
+  // Private; accessible to DbiStreamBuilder through the friend declaration.
+  void addSourceFile(StringRef Path);
+  msf::MSFBuilder &MSF;
+
+  uint32_t SymbolByteSize = 0;
+  std::string ModuleName;
+  std::string ObjFileName;
+  std::vector<std::string> SourceFiles;
+  std::vector<codeview::CVSymbol> Symbols;
+  ModuleInfoHeader Layout;
+};
+
+} // end namespace pdb
+
+} // end namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_RAW_MODINFOBUILDER_H
diff --git a/include/llvm/DebugInfo/PDB/Native/ModStream.h b/include/llvm/DebugInfo/PDB/Native/ModStream.h
index d5e7a6830d8d6cadb00e8fb7fccae9933a5648fc..d65e195dbb95b080f5183f6b4d32d7c75d905c2d 100644
--- a/include/llvm/DebugInfo/PDB/Native/ModStream.h
+++ b/include/llvm/DebugInfo/PDB/Native/ModStream.h
@@ -15,8 +15,8 @@
 #include "llvm/DebugInfo/CodeView/ModuleSubstream.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Error.h"
 
 namespace llvm {
@@ -50,9 +50,9 @@ private:
   std::unique_ptr<msf::MappedBlockStream> Stream;
 
   codeview::CVSymbolArray SymbolsSubstream;
-  msf::ReadableStreamRef LinesSubstream;
-  msf::ReadableStreamRef C13LinesSubstream;
-  msf::ReadableStreamRef GlobalRefsSubstream;
+  BinaryStreamRef LinesSubstream;
+  BinaryStreamRef C13LinesSubstream;
+  BinaryStreamRef GlobalRefsSubstream;
 
   codeview::ModuleSubstreamArray LineInfo;
 };
diff --git a/include/llvm/DebugInfo/PDB/Native/NamedStreamMap.h b/include/llvm/DebugInfo/PDB/Native/NamedStreamMap.h
index c5cf76e774002ff8b69e28633c87860b842c3095..d4206503e7dca5e706e10895b896f749227cb0dd 100644
--- a/include/llvm/DebugInfo/PDB/Native/NamedStreamMap.h
+++ b/include/llvm/DebugInfo/PDB/Native/NamedStreamMap.h
@@ -17,10 +17,9 @@
 #include <cstdint>
 
 namespace llvm {
-namespace msf {
-class StreamReader;
-class StreamWriter;
-}
+class BinaryStreamReader;
+class BinaryStreamWriter;
+
 namespace pdb {
 class NamedStreamMapBuilder;
 class NamedStreamMap {
@@ -33,10 +32,11 @@ class NamedStreamMap {
 public:
   NamedStreamMap();
 
-  Error load(msf::StreamReader &Stream);
-  Error commit(msf::StreamWriter &Writer) const;
+  Error load(BinaryStreamReader &Stream);
+  Error commit(BinaryStreamWriter &Writer) const;
   uint32_t finalize();
 
+  uint32_t size() const;
   bool get(StringRef Stream, uint32_t &StreamNo) const;
   void set(StringRef Stream, uint32_t StreamNo);
   void remove(StringRef Stream);
diff --git a/include/llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h b/include/llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h
new file mode 100644
index 0000000000000000000000000000000000000000..8eeaf3e0ea49b9d326814044a35dee19a3300bfe
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h
@@ -0,0 +1,35 @@
+//===- NativeCompilandSymbol.h - native impl for compiland syms -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVECOMPILANDSYMBOL_H
+#define LLVM_DEBUGINFO_PDB_NATIVE_NATIVECOMPILANDSYMBOL_H
+
+#include "llvm/DebugInfo/PDB/Native/ModInfo.h"
+#include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
+
+namespace llvm {
+namespace pdb {
+
+class NativeCompilandSymbol : public NativeRawSymbol {
+public:
+  NativeCompilandSymbol(NativeSession &Session, const ModuleInfoEx &MI);
+  PDB_SymType getSymTag() const override;
+  bool isEditAndContinueEnabled() const override;
+  uint32_t getLexicalParentId() const override;
+  std::string getLibraryName() const override;
+  std::string getName() const override;
+
+private:
+  ModuleInfoEx Module; // Held by value, not as a reference.
+};
+
+} // namespace pdb
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_NATIVE_NATIVECOMPILANDSYMBOL_H
diff --git a/include/llvm/DebugInfo/PDB/Native/NativeEnumModules.h b/include/llvm/DebugInfo/PDB/Native/NativeEnumModules.h
new file mode 100644
index 0000000000000000000000000000000000000000..60a55ee50cc48bac5d2cdc2e8230a81b691e54e8
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/Native/NativeEnumModules.h
@@ -0,0 +1,41 @@
+//==- NativeEnumModules.h - Native Module Enumerator impl --------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMMODULES_H
+#define LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMMODULES_H
+
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/Native/ModInfo.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+namespace llvm {
+namespace pdb {
+
+class NativeSession;
+
+class NativeEnumModules : public IPDBEnumChildren<PDBSymbol> {
+public:
+  explicit NativeEnumModules(NativeSession &Session,
+                             ArrayRef<ModuleInfoEx> Modules,
+                             uint32_t Index = 0);
+
+  uint32_t getChildCount() const override;
+  std::unique_ptr<PDBSymbol> getChildAtIndex(uint32_t Index) const override;
+  std::unique_ptr<PDBSymbol> getNext() override;
+  void reset() override;
+  NativeEnumModules *clone() const override;
+
+private:
+  NativeSession &Session;
+  ArrayRef<ModuleInfoEx> Modules; // Non-owning view of the module list.
+  uint32_t Index; // NOTE(review): presumably the getNext() cursor - confirm.
+};
+} // namespace pdb
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_NATIVE_NATIVEENUMMODULES_H
diff --git a/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h b/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h
new file mode 100644
index 0000000000000000000000000000000000000000..9516810539b6b617ddedf54ec01fb8fddf931f0d
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/Native/NativeExeSymbol.h
@@ -0,0 +1,39 @@
+//===- NativeExeSymbol.h - native impl for PDBSymbolExe ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVEEXESYMBOL_H
+#define LLVM_DEBUGINFO_PDB_NATIVE_NATIVEEXESYMBOL_H
+
+#include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+
+namespace llvm {
+namespace pdb {
+
+class NativeExeSymbol : public NativeRawSymbol {
+public:
+  NativeExeSymbol(NativeSession &Session);
+
+  std::unique_ptr<IPDBEnumSymbols>
+  findChildren(PDB_SymType Type) const override;
+
+  uint32_t getAge() const override;
+  std::string getSymbolsFileName() const override;
+  PDB_UniqueId getGuid() const override;
+  bool hasCTypes() const override;
+  bool hasPrivateSymbols() const override;
+
+private:
+  PDBFile &File; // Reference member; the PDBFile is not owned here.
+};
+
+} // namespace pdb
+} // namespace llvm
+
+#endif // LLVM_DEBUGINFO_PDB_NATIVE_NATIVEEXESYMBOL_H
diff --git a/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h b/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h
new file mode 100644
index 0000000000000000000000000000000000000000..655bed9ac17c1815939d344f5b6263cc013d688b
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/Native/NativeRawSymbol.h
@@ -0,0 +1,207 @@
+//===- NativeRawSymbol.h - Native implementation of IPDBRawSymbol - C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_NATIVE_NATIVERAWSYMBOL_H
+#define LLVM_DEBUGINFO_PDB_NATIVE_NATIVERAWSYMBOL_H
+
+#include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+
+namespace llvm {
+namespace pdb {
+
+class NativeSession;
+/// Native implementation of IPDBRawSymbol.
+class NativeRawSymbol : public IPDBRawSymbol {
+public:
+  explicit NativeRawSymbol(NativeSession &PDBSession);
+
+  void dump(raw_ostream &OS, int Indent) const override;
+
+  std::unique_ptr<IPDBEnumSymbols>
+    findChildren(PDB_SymType Type) const override;
+  std::unique_ptr<IPDBEnumSymbols>
+    findChildren(PDB_SymType Type, StringRef Name,
+      PDB_NameSearchFlags Flags) const override;
+  std::unique_ptr<IPDBEnumSymbols>
+    findChildrenByRVA(PDB_SymType Type, StringRef Name, PDB_NameSearchFlags Flags,
+      uint32_t RVA) const override;
+  std::unique_ptr<IPDBEnumSymbols>
+    findInlineFramesByRVA(uint32_t RVA) const override;
+
+  void getDataBytes(llvm::SmallVector<uint8_t, 32> &Bytes) const override;
+  void getFrontEndVersion(VersionInfo &Version) const override;
+  void getBackEndVersion(VersionInfo &Version) const override;
+  PDB_MemberAccess getAccess() const override;
+  uint32_t getAddressOffset() const override;
+  uint32_t getAddressSection() const override;
+  uint32_t getAge() const override;
+  uint32_t getArrayIndexTypeId() const override;
+  uint32_t getBaseDataOffset() const override;
+  uint32_t getBaseDataSlot() const override;
+  uint32_t getBaseSymbolId() const override;
+  PDB_BuiltinType getBuiltinType() const override;
+  uint32_t getBitPosition() const override;
+  PDB_CallingConv getCallingConvention() const override;
+  uint32_t getClassParentId() const override;
+  std::string getCompilerName() const override;
+  uint32_t getCount() const override;
+  uint32_t getCountLiveRanges() const override;
+  PDB_Lang getLanguage() const override;
+  uint32_t getLexicalParentId() const override;
+  std::string getLibraryName() const override;
+  uint32_t getLiveRangeStartAddressOffset() const override;
+  uint32_t getLiveRangeStartAddressSection() const override;
+  uint32_t getLiveRangeStartRelativeVirtualAddress() const override;
+  codeview::RegisterId getLocalBasePointerRegisterId() const override;
+  uint32_t getLowerBoundId() const override;
+  uint32_t getMemorySpaceKind() const override;
+  std::string getName() const override;
+  uint32_t getNumberOfAcceleratorPointerTags() const override;
+  uint32_t getNumberOfColumns() const override;
+  uint32_t getNumberOfModifiers() const override;
+  uint32_t getNumberOfRegisterIndices() const override;
+  uint32_t getNumberOfRows() const override;
+  std::string getObjectFileName() const override;
+  uint32_t getOemId() const override;
+  uint32_t getOemSymbolId() const override;
+  uint32_t getOffsetInUdt() const override;
+  PDB_Cpu getPlatform() const override;
+  uint32_t getRank() const override;
+  codeview::RegisterId getRegisterId() const override;
+  uint32_t getRegisterType() const override;
+  uint32_t getRelativeVirtualAddress() const override;
+  uint32_t getSamplerSlot() const override;
+  uint32_t getSignature() const override;
+  uint32_t getSizeInUdt() const override;
+  uint32_t getSlot() const override;
+  std::string getSourceFileName() const override;
+  uint32_t getStride() const override;
+  uint32_t getSubTypeId() const override;
+  std::string getSymbolsFileName() const override;
+  uint32_t getSymIndexId() const override;
+  uint32_t getTargetOffset() const override;
+  uint32_t getTargetRelativeVirtualAddress() const override;
+  uint64_t getTargetVirtualAddress() const override;
+  uint32_t getTargetSection() const override;
+  uint32_t getTextureSlot() const override;
+  uint32_t getTimeStamp() const override;
+  uint32_t getToken() const override;
+  uint32_t getTypeId() const override;
+  uint32_t getUavSlot() const override;
+  std::string getUndecoratedName() const override;
+  uint32_t getUnmodifiedTypeId() const override;
+  uint32_t getUpperBoundId() const override;
+  Variant getValue() const override;
+  uint32_t getVirtualBaseDispIndex() const override;
+  uint32_t getVirtualBaseOffset() const override;
+  uint32_t getVirtualTableShapeId() const override;
+  PDB_DataKind getDataKind() const override;
+  PDB_SymType getSymTag() const override;
+  PDB_UniqueId getGuid() const override;
+  int32_t getOffset() const override;
+  int32_t getThisAdjust() const override;
+  int32_t getVirtualBasePointerOffset() const override;
+  PDB_LocType getLocationType() const override;
+  PDB_Machine getMachineType() const override;
+  codeview::ThunkOrdinal getThunkOrdinal() const override;
+  uint64_t getLength() const override;
+  uint64_t getLiveRangeLength() const override;
+  uint64_t getVirtualAddress() const override;
+  PDB_UdtType getUdtKind() const override;
+  bool hasConstructor() const override;
+  bool hasCustomCallingConvention() const override;
+  bool hasFarReturn() const override;
+  bool isCode() const override;
+  bool isCompilerGenerated() const override;
+  bool isConstType() const override;
+  bool isEditAndContinueEnabled() const override;
+  bool isFunction() const override;
+  bool getAddressTaken() const override;
+  bool getNoStackOrdering() const override;
+  bool hasAlloca() const override;
+  bool hasAssignmentOperator() const override;
+  bool hasCTypes() const override;
+  bool hasCastOperator() const override;
+  bool hasDebugInfo() const override;
+  bool hasEH() const override;
+  bool hasEHa() const override;
+  bool hasInlAsm() const override;
+  bool hasInlineAttribute() const override;
+  bool hasInterruptReturn() const override;
+  bool hasFramePointer() const override;
+  bool hasLongJump() const override;
+  bool hasManagedCode() const override;
+  bool hasNestedTypes() const override;
+  bool hasNoInlineAttribute() const override;
+  bool hasNoReturnAttribute() const override;
+  bool hasOptimizedCodeDebugInfo() const override;
+  bool hasOverloadedOperator() const override;
+  bool hasSEH() const override;
+  bool hasSecurityChecks() const override;
+  bool hasSetJump() const override;
+  bool hasStrictGSCheck() const override;
+  bool isAcceleratorGroupSharedLocal() const override;
+  bool isAcceleratorPointerTagLiveRange() const override;
+  bool isAcceleratorStubFunction() const override;
+  bool isAggregated() const override;
+  bool isIntroVirtualFunction() const override;
+  bool isCVTCIL() const override;
+  bool isConstructorVirtualBase() const override;
+  bool isCxxReturnUdt() const override;
+  bool isDataAligned() const override;
+  bool isHLSLData() const override;
+  bool isHotpatchable() const override;
+  bool isIndirectVirtualBaseClass() const override;
+  bool isInterfaceUdt() const override;
+  bool isIntrinsic() const override;
+  bool isLTCG() const override;
+  bool isLocationControlFlowDependent() const override;
+  bool isMSILNetmodule() const override;
+  bool isMatrixRowMajor() const override;
+  bool isManagedCode() const override;
+  bool isMSILCode() const override;
+  bool isMultipleInheritance() const override;
+  bool isNaked() const override;
+  bool isNested() const override;
+  bool isOptimizedAway() const override;
+  bool isPacked() const override;
+  bool isPointerBasedOnSymbolValue() const override;
+  bool isPointerToDataMember() const override;
+  bool isPointerToMemberFunction() const override;
+  bool isPureVirtual() const override;
+  bool isRValueReference() const override;
+  bool isRefUdt() const override;
+  bool isReference() const override;
+  bool isRestrictedType() const override;
+  bool isReturnValue() const override;
+  bool isSafeBuffers() const override;
+  bool isScoped() const override;
+  bool isSdl() const override;
+  bool isSingleInheritance() const override;
+  bool isSplitted() const override;
+  bool isStatic() const override;
+  bool hasPrivateSymbols() const override;
+  bool isUnalignedType() const override;
+  bool isUnreached() const override;
+  bool isValueUdt() const override;
+  bool isVirtual() const override;
+  bool isVirtualBaseClass() const override;
+  bool isVirtualInheritance() const override;
+  bool isVolatileType() const override;
+  bool wasInlined() const override;
+  std::string getUnused() const override;
+
+protected:
+  NativeSession &Session; // NOTE(review): presumably the ctor's PDBSession
+};
+
+} // namespace pdb
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/Native/NativeSession.h b/include/llvm/DebugInfo/PDB/Native/NativeSession.h
index e6da266f796d545a7c96f8b64f4b6ffa5c88e314..bbe207738e02115ce599ebef63051ecdf9faebac 100644
--- a/include/llvm/DebugInfo/PDB/Native/NativeSession.h
+++ b/include/llvm/DebugInfo/PDB/Native/NativeSession.h
@@ -32,7 +32,7 @@ public:
 
   uint64_t getLoadAddress() const override;
   void setLoadAddress(uint64_t Address) override;
-  std::unique_ptr<PDBSymbolExe> getGlobalScope() const override;
+  std::unique_ptr<PDBSymbolExe> getGlobalScope() override;
   std::unique_ptr<PDBSymbol> getSymbolById(uint32_t SymbolId) const override;
 
   std::unique_ptr<PDBSymbol>
diff --git a/include/llvm/DebugInfo/PDB/Native/PDBFile.h b/include/llvm/DebugInfo/PDB/Native/PDBFile.h
index 9fb0ccc87f1a05c6824154f2891280e900d4464f..fbca62d6e9d936e7f4fc59c43d9d4602a8fd7979 100644
--- a/include/llvm/DebugInfo/PDB/Native/PDBFile.h
+++ b/include/llvm/DebugInfo/PDB/Native/PDBFile.h
@@ -13,9 +13,8 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/DebugInfo/MSF/IMSFFile.h"
 #include "llvm/DebugInfo/MSF/MSFCommon.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamInterface.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/MathExtras.h"
@@ -24,6 +23,8 @@
 
 namespace llvm {
 
+class BinaryStream;
+
 namespace msf {
 class MappedBlockStream;
 }
@@ -42,10 +43,13 @@ class PDBFile : public msf::IMSFFile {
   friend PDBFileBuilder;
 
 public:
-  PDBFile(std::unique_ptr<msf::ReadableStream> PdbFileBuffer,
+  PDBFile(StringRef Path, std::unique_ptr<BinaryStream> PdbFileBuffer,
           BumpPtrAllocator &Allocator);
   ~PDBFile() override;
 
+  StringRef getFileDirectory() const;
+  StringRef getFilePath() const;
+
   uint32_t getFreeBlockMapBlock() const;
   uint32_t getUnknown1() const;
 
@@ -77,7 +81,7 @@ public:
   }
 
   const msf::MSFLayout &getMsfLayout() const { return ContainerLayout; }
-  const msf::ReadableStream &getMsfBuffer() const { return *Buffer; }
+  BinaryStreamRef getMsfBuffer() const { return *Buffer; }
 
   ArrayRef<support::ulittle32_t> getDirectoryBlockArray() const;
 
@@ -107,12 +111,13 @@ public:
 private:
   Expected<std::unique_ptr<msf::MappedBlockStream>>
   safelyCreateIndexedStream(const msf::MSFLayout &Layout,
-                            const msf::ReadableStream &MsfData,
+                            BinaryStreamRef MsfData,
                             uint32_t StreamIndex) const;
 
+  std::string FilePath;
   BumpPtrAllocator &Allocator;
 
-  std::unique_ptr<msf::ReadableStream> Buffer;
+  std::unique_ptr<BinaryStream> Buffer;
 
   std::vector<uint32_t> FpmPages;
   msf::MSFLayout ContainerLayout;
diff --git a/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h b/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h
new file mode 100644
index 0000000000000000000000000000000000000000..d965e1008e95aad65a32e373f6804413bbb8ebae
--- /dev/null
+++ b/include/llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h
@@ -0,0 +1,48 @@
+//===- PDBTypeServerHandler.h -----------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_PDB_PDBTYPESERVERHANDLER_H
+#define LLVM_DEBUGINFO_PDB_PDBTYPESERVERHANDLER_H
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeServerHandler.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
+
+#include <memory>
+#include <string>
+
+namespace llvm {
+namespace pdb {
+class NativeSession;
+/// TypeServerHandler whose handle() receives codeview::TypeServer2Record.
+class PDBTypeServerHandler : public codeview::TypeServerHandler {
+public:
+  explicit PDBTypeServerHandler(bool RevisitAlways = false);
+
+  void addSearchPath(StringRef Path);
+  Expected<bool> handle(codeview::TypeServer2Record &TS,
+                        codeview::TypeVisitorCallbacks &Callbacks) override;
+
+private:
+  Expected<bool> handleInternal(PDBFile &File,
+                                codeview::TypeVisitorCallbacks &Callbacks);
+
+  bool RevisitAlways; // NOTE(review): presumably set from the ctor argument
+  std::unique_ptr<NativeSession> Session;
+  SmallVector<SmallString<64>, 4> SearchPaths;
+};
+} // namespace pdb
+} // namespace llvm
+
+#endif
diff --git a/include/llvm/DebugInfo/PDB/Native/PublicsStream.h b/include/llvm/DebugInfo/PDB/Native/PublicsStream.h
index 9adf04d11c0f9c39aacc63eea1890e23eed3b4e5..4a541edd6a7b464457594b04827742a0228b34f5 100644
--- a/include/llvm/DebugInfo/PDB/Native/PublicsStream.h
+++ b/include/llvm/DebugInfo/PDB/Native/PublicsStream.h
@@ -12,11 +12,10 @@
 
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/DebugInfo/PDB/PDBTypes.h"
-
+#include "llvm/Support/BinaryStreamArray.h"
 #include "llvm/Support/Error.h"
 
 namespace llvm {
@@ -38,16 +37,16 @@ public:
   uint32_t getNumBuckets() const { return NumBuckets; }
   iterator_range<codeview::CVSymbolArray::Iterator>
   getSymbols(bool *HadError) const;
-  msf::FixedStreamArray<support::ulittle32_t> getHashBuckets() const {
+  FixedStreamArray<support::ulittle32_t> getHashBuckets() const {
     return HashBuckets;
   }
-  msf::FixedStreamArray<support::ulittle32_t> getAddressMap() const {
+  FixedStreamArray<support::ulittle32_t> getAddressMap() const {
     return AddressMap;
   }
-  msf::FixedStreamArray<support::ulittle32_t> getThunkMap() const {
+  FixedStreamArray<support::ulittle32_t> getThunkMap() const {
     return ThunkMap;
   }
-  msf::FixedStreamArray<SectionOffset> getSectionOffsets() const {
+  FixedStreamArray<SectionOffset> getSectionOffsets() const {
     return SectionOffsets;
   }
 
@@ -59,11 +58,11 @@ private:
   std::unique_ptr<msf::MappedBlockStream> Stream;
   uint32_t NumBuckets = 0;
   ArrayRef<uint8_t> Bitmap;
-  msf::FixedStreamArray<PSHashRecord> HashRecords;
-  msf::FixedStreamArray<support::ulittle32_t> HashBuckets;
-  msf::FixedStreamArray<support::ulittle32_t> AddressMap;
-  msf::FixedStreamArray<support::ulittle32_t> ThunkMap;
-  msf::FixedStreamArray<SectionOffset> SectionOffsets;
+  FixedStreamArray<PSHashRecord> HashRecords;
+  FixedStreamArray<support::ulittle32_t> HashBuckets;
+  FixedStreamArray<support::ulittle32_t> AddressMap;
+  FixedStreamArray<support::ulittle32_t> ThunkMap;
+  FixedStreamArray<SectionOffset> SectionOffsets;
 
   const HeaderInfo *Header;
   const GSIHashHeader *HashHdr;
diff --git a/include/llvm/DebugInfo/PDB/Native/RawConstants.h b/include/llvm/DebugInfo/PDB/Native/RawConstants.h
index af114ff524916c8e1b801282ce5ef5cae31dc25c..f5d4df8feb2ed2307771b4e2c46380a64267ee75 100644
--- a/include/llvm/DebugInfo/PDB/Native/RawConstants.h
+++ b/include/llvm/DebugInfo/PDB/Native/RawConstants.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_DEBUGINFO_PDB_RAW_PDBRAWCONSTANTS_H
 #define LLVM_DEBUGINFO_PDB_RAW_PDBRAWCONSTANTS_H
 
+#include "llvm/ADT/BitmaskEnum.h"
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 
 #include <cstdint>
@@ -32,6 +33,21 @@ enum PdbRaw_ImplVer : uint32_t {
   PdbImplVC140 = 20140508,
 };
 
+enum class PdbRaw_FeatureSig : uint32_t {
+  VC110 = PdbImplVC110,
+  VC140 = PdbImplVC140,          // 20140508, as in PdbRaw_ImplVer
+  NoTypeMerge = 0x4D544F4E,      // bytes "NOTM" when stored little-endian
+  MinimalDebugInfo = 0x494E494D, // bytes "MINI" when stored little-endian
+};
+
+enum PdbRaw_Features : uint32_t {
+  PdbFeatureNone = 0x0,             // no bits set
+  PdbFeatureContainsIdStream = 0x1, // bit 0
+  PdbFeatureMinimalDebugInfo = 0x2, // bit 1
+  PdbFeatureNoTypeMerging = 0x4,    // bit 2
+  LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ PdbFeatureNoTypeMerging)
+};
+
 enum PdbRaw_DbiVer : uint32_t {
   PdbDbiVC41 = 930803,
   PdbDbiV50 = 19960307,
diff --git a/include/llvm/DebugInfo/PDB/Native/RawError.h b/include/llvm/DebugInfo/PDB/Native/RawError.h
index f96b8066bbe56dce7c1a4bf3a8592aaa1a66a895..3624a7682e3876dcd269550e7939bee0d15f238a 100644
--- a/include/llvm/DebugInfo/PDB/Native/RawError.h
+++ b/include/llvm/DebugInfo/PDB/Native/RawError.h
@@ -28,6 +28,7 @@ enum class raw_error_code {
   duplicate_entry,
   no_entry,
   not_writable,
+  stream_too_long,
   invalid_tpi_hash,
 };
 
diff --git a/include/llvm/DebugInfo/PDB/Native/RawTypes.h b/include/llvm/DebugInfo/PDB/Native/RawTypes.h
index 5f0a824b4dd616e2f0b70f42e1a93205b0eb54da..1b2631efce70e7d9d43cc172cb9572c38f814bd3 100644
--- a/include/llvm/DebugInfo/PDB/Native/RawTypes.h
+++ b/include/llvm/DebugInfo/PDB/Native/RawTypes.h
@@ -266,6 +266,10 @@ struct PDB_UniqueId {
   uint8_t Guid[16];
 };
 
+inline bool operator==(const PDB_UniqueId &LHS, const PDB_UniqueId &RHS) {
+  return 0 == ::memcmp(LHS.Guid, RHS.Guid, sizeof(LHS.Guid)); // all 16 bytes
+}
+
 // The header preceeding the global TPI stream.
 // This corresponds to `HDR` in PDB/dbi/tpi.h.
 struct TpiStreamHeader {
diff --git a/include/llvm/DebugInfo/PDB/Native/StringTable.h b/include/llvm/DebugInfo/PDB/Native/StringTable.h
index bf26b4b62c623670418b0c755afca0123015f618..dd5e30e6182708e0089def42e0917adca5073540 100644
--- a/include/llvm/DebugInfo/PDB/Native/StringTable.h
+++ b/include/llvm/DebugInfo/PDB/Native/StringTable.h
@@ -12,24 +12,25 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include <cstdint>
 #include <vector>
 
 namespace llvm {
-namespace msf {
-class StreamReader;
-}
+class BinaryStreamReader;
+
 namespace pdb {
 
 class StringTable {
 public:
   StringTable();
 
-  Error load(msf::StreamReader &Stream);
+  Error load(BinaryStreamReader &Stream);
+
+  uint32_t getByteSize() const;
 
   uint32_t getNameCount() const { return NameCount; }
   uint32_t getHashVersion() const { return HashVersion; }
@@ -38,14 +39,15 @@ public:
   StringRef getStringForID(uint32_t ID) const;
   uint32_t getIDForString(StringRef Str) const;
 
-  msf::FixedStreamArray<support::ulittle32_t> name_ids() const;
+  FixedStreamArray<support::ulittle32_t> name_ids() const;
 
 private:
-  msf::ReadableStreamRef NamesBuffer;
-  msf::FixedStreamArray<support::ulittle32_t> IDs;
-  uint32_t Signature;
-  uint32_t HashVersion;
-  uint32_t NameCount;
+  BinaryStreamRef NamesBuffer;
+  FixedStreamArray<support::ulittle32_t> IDs;
+  uint32_t ByteSize = 0;
+  uint32_t Signature = 0;
+  uint32_t HashVersion = 0;
+  uint32_t NameCount = 0;
 };
 
 } // end namespace pdb
diff --git a/include/llvm/DebugInfo/PDB/Native/StringTableBuilder.h b/include/llvm/DebugInfo/PDB/Native/StringTableBuilder.h
index d6a20831b1e3c9efcf3269fff4683b32d1ffaa92..dd0f40b1978d8afc871c1e4506dbf310d0b86116 100644
--- a/include/llvm/DebugInfo/PDB/Native/StringTableBuilder.h
+++ b/include/llvm/DebugInfo/PDB/Native/StringTableBuilder.h
@@ -20,9 +20,8 @@
 #include <vector>
 
 namespace llvm {
-namespace msf {
-class StreamWriter;
-}
+class BinaryStreamWriter;
+
 namespace pdb {
 
 class StringTableBuilder {
@@ -32,7 +31,7 @@ public:
   uint32_t insert(StringRef S);
 
   uint32_t finalize();
-  Error commit(msf::StreamWriter &Writer) const;
+  Error commit(BinaryStreamWriter &Writer) const;
 
 private:
   DenseMap<StringRef, uint32_t> Strings;
diff --git a/include/llvm/DebugInfo/PDB/Native/TpiHashing.h b/include/llvm/DebugInfo/PDB/Native/TpiHashing.h
index 72d25190065ca3b61afaf3b8f2c3c48a00933d79..dd2698c354a20125cc8e3421f1117ec9e4154dba 100644
--- a/include/llvm/DebugInfo/PDB/Native/TpiHashing.h
+++ b/include/llvm/DebugInfo/PDB/Native/TpiHashing.h
@@ -15,8 +15,8 @@
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/Support/BinaryStreamArray.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include <cstdint>
@@ -57,7 +57,7 @@ private:
 
 class TpiHashVerifier : public codeview::TypeVisitorCallbacks {
 public:
-  TpiHashVerifier(msf::FixedStreamArray<support::ulittle32_t> &HashValues,
+  TpiHashVerifier(FixedStreamArray<support::ulittle32_t> &HashValues,
                   uint32_t NumHashBuckets)
       : HashValues(HashValues), NumHashBuckets(NumHashBuckets) {}
 
@@ -83,7 +83,7 @@ private:
             utohexstr(codeview::TypeIndex::FirstNonSimpleIndex + Index));
   }
 
-  msf::FixedStreamArray<support::ulittle32_t> HashValues;
+  FixedStreamArray<support::ulittle32_t> HashValues;
   codeview::CVType RawRecord;
   uint32_t NumHashBuckets;
   uint32_t Index = -1;
diff --git a/include/llvm/DebugInfo/PDB/Native/TpiStream.h b/include/llvm/DebugInfo/PDB/Native/TpiStream.h
index b9368a7ef25a3f2386879b074e182264a3a0008a..62dde0ef08b767765f1e3fe2be9865d0e043bbb5 100644
--- a/include/llvm/DebugInfo/PDB/Native/TpiStream.h
+++ b/include/llvm/DebugInfo/PDB/Native/TpiStream.h
@@ -11,11 +11,11 @@
 #define LLVM_DEBUGINFO_PDB_RAW_PDBTPISTREAM_H
 
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
 #include "llvm/DebugInfo/PDB/Native/HashTable.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/DebugInfo/PDB/PDBTypes.h"
+#include "llvm/Support/BinaryStreamArray.h"
 #include "llvm/Support/raw_ostream.h"
 
 #include "llvm/Support/Error.h"
@@ -46,11 +46,11 @@ public:
 
   uint32_t getHashKeySize() const;
   uint32_t NumHashBuckets() const;
-  msf::FixedStreamArray<support::ulittle32_t> getHashValues() const;
-  msf::FixedStreamArray<TypeIndexOffset> getTypeIndexOffsets() const;
+  FixedStreamArray<support::ulittle32_t> getHashValues() const;
+  FixedStreamArray<TypeIndexOffset> getTypeIndexOffsets() const;
   HashTable &getHashAdjusters();
 
-  iterator_range<codeview::CVTypeArray::Iterator> types(bool *HadError) const;
+  codeview::CVTypeRange types(bool *HadError) const;
 
   Error commit();
 
@@ -62,9 +62,9 @@ private:
 
   codeview::CVTypeArray TypeRecords;
 
-  std::unique_ptr<msf::ReadableStream> HashStream;
-  msf::FixedStreamArray<support::ulittle32_t> HashValues;
-  msf::FixedStreamArray<TypeIndexOffset> TypeIndexOffsets;
+  std::unique_ptr<BinaryStream> HashStream;
+  FixedStreamArray<support::ulittle32_t> HashValues;
+  FixedStreamArray<TypeIndexOffset> TypeIndexOffsets;
   HashTable HashAdjusters;
 
   const TpiStreamHeader *Header;
diff --git a/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h b/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h
index 74145d4df9f250714c3de3a7c55741040bd0755a..a29ed0b610d364c5a106b8ed41fc1431ed91c2e8 100644
--- a/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h
+++ b/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h
@@ -12,31 +12,33 @@
 
 #include "llvm/ADT/Optional.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
-#include "llvm/DebugInfo/MSF/SequencedItemStream.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryItemStream.h"
+#include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Error.h"
 
 #include <vector>
 
 namespace llvm {
-namespace codeview {
-class TypeRecord;
-}
-namespace msf {
-class ByteStream;
-class MSFBuilder;
-struct MSFLayout;
-class ReadableStreamRef;
-class WritableStream;
+class BinaryByteStream;
+class WritableBinaryStreamRef;
 
-template <> struct SequencedItemTraits<llvm::codeview::CVType> {
+template <> struct BinaryItemTraits<llvm::codeview::CVType> {
   static size_t length(const codeview::CVType &Item) { return Item.length(); }
   static ArrayRef<uint8_t> bytes(const codeview::CVType &Item) {
     return Item.data();
   }
 };
+
+namespace codeview {
+class TypeRecord;
+}
+namespace msf {
+class MSFBuilder;
+struct MSFLayout;
 }
 namespace pdb {
 class PDBFile;
@@ -52,26 +54,30 @@ public:
   TpiStreamBuilder &operator=(const TpiStreamBuilder &) = delete;
 
   void setVersionHeader(PdbRaw_TpiVer Version);
-  void addTypeRecord(const codeview::CVType &Record);
+  void addTypeRecord(ArrayRef<uint8_t> Type, Optional<uint32_t> Hash);
 
   Error finalizeMsfLayout();
 
-  Error commit(const msf::MSFLayout &Layout, const msf::WritableStream &Buffer);
+  Error commit(const msf::MSFLayout &Layout, WritableBinaryStreamRef Buffer);
 
-  uint32_t calculateSerializedLength() const;
+  uint32_t calculateSerializedLength();
 
 private:
   uint32_t calculateHashBufferSize() const;
+  uint32_t calculateIndexOffsetSize() const;
   Error finalize();
 
   msf::MSFBuilder &Msf;
   BumpPtrAllocator &Allocator;
 
+  size_t TypeRecordBytes = 0;
+
   Optional<PdbRaw_TpiVer> VerHeader;
-  std::vector<codeview::CVType> TypeRecords;
-  msf::SequencedItemStream<codeview::CVType> TypeRecordStream;
+  std::vector<ArrayRef<uint8_t>> TypeRecords;
+  std::vector<uint32_t> TypeHashes;
+  std::vector<TypeIndexOffset> TypeIndexOffsets;
   uint32_t HashStreamIndex = kInvalidStreamIndex;
-  std::unique_ptr<msf::ByteStream> HashValueStream;
+  std::unique_ptr<BinaryByteStream> HashValueStream;
 
   const TpiStreamHeader *Header;
   uint32_t Idx;
diff --git a/include/llvm/DebugInfo/PDB/PDBExtras.h b/include/llvm/DebugInfo/PDB/PDBExtras.h
index 5a7422d9e9e4a529a8a46bf195cec74504fa6ba8..fc5787556a6d17693df017590aa715517e2c2ece 100644
--- a/include/llvm/DebugInfo/PDB/PDBExtras.h
+++ b/include/llvm/DebugInfo/PDB/PDBExtras.h
@@ -30,8 +30,8 @@ raw_ostream &operator<<(raw_ostream &OS, const PDB_Checksum &Checksum);
 raw_ostream &operator<<(raw_ostream &OS, const PDB_Lang &Lang);
 raw_ostream &operator<<(raw_ostream &OS, const PDB_SymType &Tag);
 raw_ostream &operator<<(raw_ostream &OS, const PDB_MemberAccess &Access);
+raw_ostream &operator<<(raw_ostream &OS, const PDB_UniqueId &Guid);
 raw_ostream &operator<<(raw_ostream &OS, const PDB_UdtType &Type);
-raw_ostream &operator<<(raw_ostream &OS, const PDB_UniqueId &Id);
 raw_ostream &operator<<(raw_ostream &OS, const PDB_Machine &Machine);
 
 raw_ostream &operator<<(raw_ostream &OS, const Variant &Value);
diff --git a/include/llvm/DebugInfo/PDB/PDBSymDumper.h b/include/llvm/DebugInfo/PDB/PDBSymDumper.h
index 095c33cfe8b5892e7b9b1fdbfd6b44a80f14926d..c976935c48e027c890439e77b3ac49284d94cd25 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymDumper.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymDumper.h
@@ -54,6 +54,22 @@ public:
   virtual void dump(const PDBSymbolUnknown &Symbol);
   virtual void dump(const PDBSymbolUsingNamespace &Symbol);
 
+  virtual void dumpRight(const PDBSymbolTypeArray &Symbol) {}
+  virtual void dumpRight(const PDBSymbolTypeBaseClass &Symbol) {}
+  virtual void dumpRight(const PDBSymbolTypeBuiltin &Symbol) {}
+  virtual void dumpRight(const PDBSymbolTypeCustom &Symbol) {}
+  virtual void dumpRight(const PDBSymbolTypeDimension &Symbol) {}
+  virtual void dumpRight(const PDBSymbolTypeEnum &Symbol) {}
+  virtual void dumpRight(const PDBSymbolTypeFriend &Symbol) {}
+  virtual void dumpRight(const PDBSymbolTypeFunctionArg &Symbol) {}
+  virtual void dumpRight(const PDBSymbolTypeFunctionSig &Symbol) {}
+  virtual void dumpRight(const PDBSymbolTypeManaged &Symbol) {}
+  virtual void dumpRight(const PDBSymbolTypePointer &Symbol) {}
+  virtual void dumpRight(const PDBSymbolTypeTypedef &Symbol) {}
+  virtual void dumpRight(const PDBSymbolTypeUDT &Symbol) {}
+  virtual void dumpRight(const PDBSymbolTypeVTable &Symbol) {}
+  virtual void dumpRight(const PDBSymbolTypeVTableShape &Symbol) {}
+
 private:
   bool RequireImpl;
 };
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbol.h b/include/llvm/DebugInfo/PDB/PDBSymbol.h
index bf51188065407cb0614b63df57c64ceb446d6f65..652f2136105650d00bc931b9b6eec83b3d4b70b4 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbol.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbol.h
@@ -22,6 +22,23 @@
     return RawSymbol->MethodName();                                            \
   }
 
+#define FORWARD_CONCRETE_SYMBOL_ID_METHOD_WITH_NAME(ConcreteType, PrivateName, \
+                                                    PublicName)                \
+  auto PublicName##Id() const->decltype(RawSymbol->PrivateName##Id()) {        \
+    return RawSymbol->PrivateName##Id();                                       \
+  }                                                                            \
+  std::unique_ptr<ConcreteType> PublicName() const {                           \
+    uint32_t Id = PublicName##Id();                                            \
+    return getConcreteSymbolByIdHelper<ConcreteType>(Id);                      \
+  }
+
+#define FORWARD_SYMBOL_ID_METHOD_WITH_NAME(PrivateName, PublicName)            \
+  FORWARD_CONCRETE_SYMBOL_ID_METHOD_WITH_NAME(PDBSymbol, PrivateName,          \
+                                              PublicName)
+
+#define FORWARD_SYMBOL_ID_METHOD(MethodName)                                   \
+  FORWARD_SYMBOL_ID_METHOD_WITH_NAME(MethodName, MethodName)
+
 namespace llvm {
 
 class StringRef;
@@ -29,6 +46,7 @@ class raw_ostream;
 
 namespace pdb {
 class IPDBRawSymbol;
+class IPDBSession;
 
 #define DECLARE_PDB_SYMBOL_CONCRETE_TYPE(TagValue)                             \
   static const PDB_SymType Tag = TagValue;                                     \
@@ -56,7 +74,14 @@ public:
   /// unknown properties, but individual implementations of PDBSymbol may
   /// override the behavior to only dump known fields.
   virtual void dump(PDBSymDumper &Dumper) const = 0;
+
+  /// For certain PDBSymbolTypes, dumps additional information for the type that
+  /// normally goes on the right side of the symbol.
+  virtual void dumpRight(PDBSymDumper &Dumper) const {}
+
   void defaultDump(raw_ostream &OS, int Indent) const;
+  void dumpProperties() const;
+  void dumpChildStats() const;
 
   PDB_SymType getSymTag() const;
   uint32_t getSymIndexId() const;
@@ -66,6 +91,14 @@ public:
     return Enumerator->getNext();
   }
 
+  template <typename T> T *cast() { return llvm::dyn_cast<T>(this); }
+  /// Both overloads forward to llvm::dyn_cast, so a mismatch yields nullptr.
+  template <typename T> const T *cast() const {
+    return llvm::dyn_cast<T>(this);
+  }
+
+  std::unique_ptr<PDBSymbol> clone() const;
+
   template <typename T>
   std::unique_ptr<ConcreteSymbolEnumerator<T>> findAllChildren() const {
     auto BaseIter = RawSymbol->findChildren(T::Tag);
@@ -91,6 +124,20 @@ public:
   std::unique_ptr<IPDBEnumSymbols> getChildStats(TagStats &Stats) const;
 
 protected:
+  std::unique_ptr<PDBSymbol> getSymbolByIdHelper(uint32_t Id) const;
+  /// Typed getSymbolByIdHelper: nullptr if lookup fails or type mismatches.
+  template <typename ConcreteType>
+  std::unique_ptr<ConcreteType> getConcreteSymbolByIdHelper(uint32_t Id) const {
+    auto Sym = getSymbolByIdHelper(Id);
+    if (!Sym)
+      return nullptr;
+    ConcreteType *Result = Sym->cast<ConcreteType>();
+    if (!Result)
+      return nullptr; // Sym still owns the symbol and frees it on return.
+    Sym.release(); // Result points at the same object; it takes ownership.
+    return std::unique_ptr<ConcreteType>(Result);
+  }
+
   const IPDBSession &Session;
   const std::unique_ptr<IPDBRawSymbol> RawSymbol;
 };
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolBlock.h b/include/llvm/DebugInfo/PDB/PDBSymbolBlock.h
index d0ff62ca7c3f50a20f5082a32da84c17bb8d62d3..d81da1eaa0237c5d6458899feb406f1f1c1a0e1e 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolBlock.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolBlock.h
@@ -30,7 +30,7 @@ public:
   FORWARD_SYMBOL_METHOD(getAddressOffset)
   FORWARD_SYMBOL_METHOD(getAddressSection)
   FORWARD_SYMBOL_METHOD(getLength)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(getLocationType)
   FORWARD_SYMBOL_METHOD(getName)
   FORWARD_SYMBOL_METHOD(getRelativeVirtualAddress)
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolCompiland.h b/include/llvm/DebugInfo/PDB/PDBSymbolCompiland.h
index f1983b3f7bf5b689712acf1e9aefa0a95c46ceda..26788017cf32e7e690d04b20263ff6f7ada22f7b 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolCompiland.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolCompiland.h
@@ -29,7 +29,7 @@ public:
   void dump(PDBSymDumper &Dumper) const override;
 
   FORWARD_SYMBOL_METHOD(isEditAndContinueEnabled)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(getLibraryName)
   FORWARD_SYMBOL_METHOD(getName)
 
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h b/include/llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h
index bb4a78f68e2f7de925ca18cf6498f7195caeafc0..dba50c42cf818da9c41879374e1e555332e5476e 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolCompilandDetails.h
@@ -46,7 +46,7 @@ public:
   FORWARD_SYMBOL_METHOD(isLTCG)
   FORWARD_SYMBOL_METHOD(isMSILNetmodule)
   FORWARD_SYMBOL_METHOD(getLanguage)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(getPlatform)
   FORWARD_SYMBOL_METHOD(getSourceFileName)
 };
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h b/include/llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h
index a71a0ba2df589f19006563d71aebb579940f7e20..7868f0459086b68744d8884a9a7ca0a4faa00e4b 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolCompilandEnv.h
@@ -26,7 +26,7 @@ public:
 
   void dump(PDBSymDumper &Dumper) const override;
 
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(getName)
   std::string getValue() const;
 };
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolData.h b/include/llvm/DebugInfo/PDB/PDBSymbolData.h
index 36f32ab51c1108b675de3c6625677b3907bcf782..ad4285df4d44e9d3625e8626cfe587b3d42b1664 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolData.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolData.h
@@ -26,8 +26,6 @@ public:
 
   DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Data)
 
-  std::unique_ptr<PDBSymbol> getType() const;
-
   void dump(PDBSymDumper &Dumper) const override;
 
   FORWARD_SYMBOL_METHOD(getAccess)
@@ -35,14 +33,14 @@ public:
   FORWARD_SYMBOL_METHOD(getAddressSection)
   FORWARD_SYMBOL_METHOD(getAddressTaken)
   FORWARD_SYMBOL_METHOD(getBitPosition)
-  FORWARD_SYMBOL_METHOD(getClassParentId)
+  FORWARD_SYMBOL_ID_METHOD(getClassParent)
   FORWARD_SYMBOL_METHOD(isCompilerGenerated)
   FORWARD_SYMBOL_METHOD(isConstType)
   FORWARD_SYMBOL_METHOD(getDataKind)
   FORWARD_SYMBOL_METHOD(isAggregated)
   FORWARD_SYMBOL_METHOD(isSplitted)
   FORWARD_SYMBOL_METHOD(getLength)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(getLocationType)
   FORWARD_SYMBOL_METHOD(getName)
   FORWARD_SYMBOL_METHOD(getOffset)
@@ -50,7 +48,7 @@ public:
   FORWARD_SYMBOL_METHOD(getRelativeVirtualAddress)
   FORWARD_SYMBOL_METHOD(getSlot)
   FORWARD_SYMBOL_METHOD(getToken)
-  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_ID_METHOD(getType)
   FORWARD_SYMBOL_METHOD(isUnalignedType)
   FORWARD_SYMBOL_METHOD(getValue)
   FORWARD_SYMBOL_METHOD(getVirtualAddress)
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h b/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h
index 7170bcbe846c63a3a849a85106860dfc3b7f6c83..5686f8716a0cc820aaff78630c5a11d5d714f2a6 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolFunc.h
@@ -11,6 +11,7 @@
 #define LLVM_DEBUGINFO_PDB_PDBSYMBOLFUNC_H
 
 #include "PDBSymbol.h"
+#include "PDBSymbolTypeFunctionSig.h"
 #include "PDBTypes.h"
 
 namespace llvm {
@@ -26,8 +27,6 @@ public:
 
   void dump(PDBSymDumper &Dumper) const override;
 
-  std::unique_ptr<PDBSymbolTypeFunctionSig> getSignature() const;
-  std::unique_ptr<PDBSymbolTypeUDT> getClassParent() const;
   std::unique_ptr<IPDBEnumChildren<PDBSymbolData>> getArguments() const;
 
   DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::Function)
@@ -35,7 +34,7 @@ public:
   FORWARD_SYMBOL_METHOD(getAccess)
   FORWARD_SYMBOL_METHOD(getAddressOffset)
   FORWARD_SYMBOL_METHOD(getAddressSection)
-  FORWARD_SYMBOL_METHOD(getClassParentId)
+  FORWARD_SYMBOL_ID_METHOD(getClassParent)
   FORWARD_SYMBOL_METHOD(isCompilerGenerated)
   FORWARD_SYMBOL_METHOD(isConstType)
   FORWARD_SYMBOL_METHOD(hasCustomCallingConvention)
@@ -54,7 +53,7 @@ public:
   FORWARD_SYMBOL_METHOD(isNaked)
   FORWARD_SYMBOL_METHOD(isStatic)
   FORWARD_SYMBOL_METHOD(getLength)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(getLocalBasePointerRegisterId)
   FORWARD_SYMBOL_METHOD(getLocationType)
   FORWARD_SYMBOL_METHOD(getName)
@@ -67,7 +66,8 @@ public:
   FORWARD_SYMBOL_METHOD(isPureVirtual)
   FORWARD_SYMBOL_METHOD(getRelativeVirtualAddress)
   FORWARD_SYMBOL_METHOD(getToken)
-  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_CONCRETE_SYMBOL_ID_METHOD_WITH_NAME(PDBSymbolTypeFunctionSig, getType,
+                                              getSignature)
   FORWARD_SYMBOL_METHOD(isUnalignedType)
   FORWARD_SYMBOL_METHOD(getUndecoratedName)
   FORWARD_SYMBOL_METHOD(isVirtual)
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h b/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h
index 464389503bef26e74de6709d38e14f321ecdb21c..3341bd9b30fd903b3ce2b2912c381f0daf040555 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugEnd.h
@@ -34,7 +34,7 @@ public:
   FORWARD_SYMBOL_METHOD(hasFarReturn)
   FORWARD_SYMBOL_METHOD(hasInterruptReturn)
   FORWARD_SYMBOL_METHOD(isStatic)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(getLocationType)
   FORWARD_SYMBOL_METHOD(hasNoInlineAttribute)
   FORWARD_SYMBOL_METHOD(hasNoReturnAttribute)
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h b/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h
index c2e3dd39be6cf21c2764cf1a3573d10d28889cba..6729838597c88adb6edb427ddc511ec8579eb2a9 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolFuncDebugStart.h
@@ -33,7 +33,7 @@ public:
   FORWARD_SYMBOL_METHOD(hasFarReturn)
   FORWARD_SYMBOL_METHOD(hasInterruptReturn)
   FORWARD_SYMBOL_METHOD(isStatic)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(getLocationType)
   FORWARD_SYMBOL_METHOD(hasNoInlineAttribute)
   FORWARD_SYMBOL_METHOD(hasNoReturnAttribute)
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolLabel.h b/include/llvm/DebugInfo/PDB/PDBSymbolLabel.h
index 3aeae10b47bcd44060183df972067e41fd907d2d..c2b1c28c929eafca9df1ce8d761f9090831e85d5 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolLabel.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolLabel.h
@@ -32,7 +32,7 @@ public:
   FORWARD_SYMBOL_METHOD(hasCustomCallingConvention)
   FORWARD_SYMBOL_METHOD(hasFarReturn)
   FORWARD_SYMBOL_METHOD(hasInterruptReturn)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(getLocationType)
   FORWARD_SYMBOL_METHOD(getName)
   FORWARD_SYMBOL_METHOD(hasNoInlineAttribute)
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h b/include/llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h
index be0734445973ff2fca92a9a66eceeb2dbc62046a..c9e6ee67c575b3f8172da475ec6b08199158ba67 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolPublicSymbol.h
@@ -32,7 +32,7 @@ public:
   FORWARD_SYMBOL_METHOD(isCode)
   FORWARD_SYMBOL_METHOD(isFunction)
   FORWARD_SYMBOL_METHOD(getLength)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(getLocationType)
   FORWARD_SYMBOL_METHOD(isManagedCode)
   FORWARD_SYMBOL_METHOD(isMSILCode)
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolThunk.h b/include/llvm/DebugInfo/PDB/PDBSymbolThunk.h
index 63f7a09fc881d5213dc17e9b0c61977795458320..614fad86caa840ec339a8caedc9556e31b6d40bc 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolThunk.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolThunk.h
@@ -30,12 +30,12 @@ public:
   FORWARD_SYMBOL_METHOD(getAccess)
   FORWARD_SYMBOL_METHOD(getAddressOffset)
   FORWARD_SYMBOL_METHOD(getAddressSection)
-  FORWARD_SYMBOL_METHOD(getClassParentId)
+  FORWARD_SYMBOL_ID_METHOD(getClassParent)
   FORWARD_SYMBOL_METHOD(isConstType)
   FORWARD_SYMBOL_METHOD(isIntroVirtualFunction)
   FORWARD_SYMBOL_METHOD(isStatic)
   FORWARD_SYMBOL_METHOD(getLength)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(getName)
   FORWARD_SYMBOL_METHOD(isPureVirtual)
   FORWARD_SYMBOL_METHOD(getRelativeVirtualAddress)
@@ -44,7 +44,7 @@ public:
   FORWARD_SYMBOL_METHOD(getTargetVirtualAddress)
   FORWARD_SYMBOL_METHOD(getTargetSection)
   FORWARD_SYMBOL_METHOD(getThunkOrdinal)
-  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_ID_METHOD(getType)
   FORWARD_SYMBOL_METHOD(isUnalignedType)
   FORWARD_SYMBOL_METHOD(isVirtual)
   FORWARD_SYMBOL_METHOD(getVirtualAddress)
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeArray.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeArray.h
index 57db03661fb7ebc0ab964e0c20e00234cbc0c827..39b7d3b300ea07a16cf7ec6ddadc16fefff99bd4 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolTypeArray.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeArray.h
@@ -25,17 +25,16 @@ public:
 
   DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::ArrayType)
 
-  std::unique_ptr<PDBSymbol> getElementType() const;
-
   void dump(PDBSymDumper &Dumper) const override;
+  void dumpRight(PDBSymDumper &Dumper) const override;
 
-  FORWARD_SYMBOL_METHOD(getArrayIndexTypeId)
+  FORWARD_SYMBOL_ID_METHOD(getArrayIndexType)
   FORWARD_SYMBOL_METHOD(isConstType)
   FORWARD_SYMBOL_METHOD(getCount)
   FORWARD_SYMBOL_METHOD(getLength)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(getRank)
-  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_ID_METHOD_WITH_NAME(getType, getElementType)
   FORWARD_SYMBOL_METHOD(isUnalignedType)
   FORWARD_SYMBOL_METHOD(isVolatileType)
 };
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h
index aaa3ab7988d71a6c20902dd3c845122ce6738c00..0924efb8aa9c004f2d78db377a2f746e8ecca8ad 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeBaseClass.h
@@ -28,7 +28,7 @@ public:
   void dump(PDBSymDumper &Dumper) const override;
 
   FORWARD_SYMBOL_METHOD(getAccess)
-  FORWARD_SYMBOL_METHOD(getClassParentId)
+  FORWARD_SYMBOL_ID_METHOD(getClassParent)
   FORWARD_SYMBOL_METHOD(hasConstructor)
   FORWARD_SYMBOL_METHOD(isConstType)
   FORWARD_SYMBOL_METHOD(hasAssignmentOperator)
@@ -36,14 +36,14 @@ public:
   FORWARD_SYMBOL_METHOD(hasNestedTypes)
   FORWARD_SYMBOL_METHOD(isIndirectVirtualBaseClass)
   FORWARD_SYMBOL_METHOD(getLength)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(getName)
   FORWARD_SYMBOL_METHOD(isNested)
   FORWARD_SYMBOL_METHOD(getOffset)
   FORWARD_SYMBOL_METHOD(hasOverloadedOperator)
   FORWARD_SYMBOL_METHOD(isPacked)
   FORWARD_SYMBOL_METHOD(isScoped)
-  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_ID_METHOD(getType)
   FORWARD_SYMBOL_METHOD(getUdtKind)
   FORWARD_SYMBOL_METHOD(isUnalignedType)
 
@@ -51,7 +51,7 @@ public:
   FORWARD_SYMBOL_METHOD(getVirtualBaseDispIndex)
   FORWARD_SYMBOL_METHOD(getVirtualBasePointerOffset)
   // FORWARD_SYMBOL_METHOD(getVirtualBaseTableType)
-  FORWARD_SYMBOL_METHOD(getVirtualTableShapeId)
+  FORWARD_SYMBOL_ID_METHOD(getVirtualTableShape)
   FORWARD_SYMBOL_METHOD(isVolatileType)
 };
 
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h
index c8f59f1f140a1a5420278d1ba7b778953cad1ca0..5b1863c42a04bbd59528c1689d356322172313c8 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeBuiltin.h
@@ -30,7 +30,7 @@ public:
   FORWARD_SYMBOL_METHOD(getBuiltinType)
   FORWARD_SYMBOL_METHOD(isConstType)
   FORWARD_SYMBOL_METHOD(getLength)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(isUnalignedType)
   FORWARD_SYMBOL_METHOD(isVolatileType)
 };
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h
index ade2887bac14be53059487f70e59886b19f0c088..c5ae3c51162c6797d2ab35277e9ff4cb8cfcc1e1 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h
@@ -11,6 +11,7 @@
 #define LLVM_DEBUGINFO_PDB_PDBSYMBOLTYPEENUM_H
 
 #include "PDBSymbol.h"
+#include "PDBSymbolTypeBuiltin.h"
 #include "PDBTypes.h"
 
 namespace llvm {
@@ -27,25 +28,22 @@ public:
 
   void dump(PDBSymDumper &Dumper) const override;
 
-  std::unique_ptr<PDBSymbolTypeUDT> getClassParent() const;
-  std::unique_ptr<PDBSymbolTypeBuiltin> getUnderlyingType() const;
-
   FORWARD_SYMBOL_METHOD(getBuiltinType)
-  FORWARD_SYMBOL_METHOD(getClassParentId)
-  FORWARD_SYMBOL_METHOD(getUnmodifiedTypeId)
+  FORWARD_SYMBOL_ID_METHOD(getClassParent)
   FORWARD_SYMBOL_METHOD(hasConstructor)
   FORWARD_SYMBOL_METHOD(isConstType)
   FORWARD_SYMBOL_METHOD(hasAssignmentOperator)
   FORWARD_SYMBOL_METHOD(hasCastOperator)
   FORWARD_SYMBOL_METHOD(hasNestedTypes)
   FORWARD_SYMBOL_METHOD(getLength)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(getName)
   FORWARD_SYMBOL_METHOD(isNested)
   FORWARD_SYMBOL_METHOD(hasOverloadedOperator)
   FORWARD_SYMBOL_METHOD(isPacked)
   FORWARD_SYMBOL_METHOD(isScoped)
-  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_CONCRETE_SYMBOL_ID_METHOD_WITH_NAME(PDBSymbolTypeBuiltin, getType,
+                                              getUnderlyingType)
   FORWARD_SYMBOL_METHOD(isUnalignedType)
   FORWARD_SYMBOL_METHOD(isVolatileType)
 };
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h
index 196d149ed2a2027acd3a8fc75e8eef17e3eb6efd..24c13128111f7cc1dcd2d397b84d906fb89347e4 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeFriend.h
@@ -27,9 +27,9 @@ public:
 
   void dump(PDBSymDumper &Dumper) const override;
 
-  FORWARD_SYMBOL_METHOD(getClassParentId)
+  FORWARD_SYMBOL_ID_METHOD(getClassParent)
   FORWARD_SYMBOL_METHOD(getName)
-  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_ID_METHOD(getType)
 };
 
 } // namespace llvm
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h
index 5561341d7e777d064723f1ba59cdaeb142b76db0..3855999c473f5eb5a845a5e799928f6b02622845 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionArg.h
@@ -27,9 +27,9 @@ public:
 
   void dump(PDBSymDumper &Dumper) const override;
 
-  FORWARD_SYMBOL_METHOD(getClassParentId)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
-  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_ID_METHOD(getClassParent)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
+  FORWARD_SYMBOL_ID_METHOD(getType)
 };
 
 } // namespace llvm
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h
index 516011ff8b3dc1f3daa4de1a77dbbc36f3cfaab1..8de54e70701ddef5345ff8196866b41e00ac81af 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeFunctionSig.h
@@ -25,22 +25,21 @@ public:
 
   DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::FunctionSig)
 
-  std::unique_ptr<PDBSymbol> getReturnType() const;
   std::unique_ptr<IPDBEnumSymbols> getArguments() const;
-  std::unique_ptr<PDBSymbol> getClassParent() const;
 
   void dump(PDBSymDumper &Dumper) const override;
+  void dumpRight(PDBSymDumper &Dumper) const override;
   void dumpArgList(raw_ostream &OS) const;
 
   FORWARD_SYMBOL_METHOD(getCallingConvention)
-  FORWARD_SYMBOL_METHOD(getClassParentId)
-  FORWARD_SYMBOL_METHOD(getUnmodifiedTypeId)
+  FORWARD_SYMBOL_ID_METHOD(getClassParent)
+  FORWARD_SYMBOL_ID_METHOD(getUnmodifiedType)
   FORWARD_SYMBOL_METHOD(isConstType)
   FORWARD_SYMBOL_METHOD(getCount)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   // FORWARD_SYMBOL_METHOD(getObjectPointerType)
   FORWARD_SYMBOL_METHOD(getThisAdjust)
-  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_ID_METHOD_WITH_NAME(getType, getReturnType)
   FORWARD_SYMBOL_METHOD(isUnalignedType)
   FORWARD_SYMBOL_METHOD(isVolatileType)
 };
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h
index 7a57272adb791e1420ba83e9d1f0e5a70712062d..c502d4e77afee464a1e871823a378bc331a75cd3 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypePointer.h
@@ -25,15 +25,14 @@ public:
 
   DECLARE_PDB_SYMBOL_CONCRETE_TYPE(PDB_SymType::PointerType)
 
-  std::unique_ptr<PDBSymbol> getPointeeType() const;
-
   void dump(PDBSymDumper &Dumper) const override;
+  void dumpRight(PDBSymDumper &Dumper) const override;
 
   FORWARD_SYMBOL_METHOD(isConstType)
   FORWARD_SYMBOL_METHOD(getLength)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(isReference)
-  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_ID_METHOD_WITH_NAME(getType, getPointeeType)
   FORWARD_SYMBOL_METHOD(isUnalignedType)
   FORWARD_SYMBOL_METHOD(isVolatileType)
 };
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h
index 5ed4f8d21d9029c2cc797d0361084aa44839e5d0..16c1d1b88c6d080676a7c80788a7ad7f656ceab8 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h
@@ -28,24 +28,24 @@ public:
   void dump(PDBSymDumper &Dumper) const override;
 
   FORWARD_SYMBOL_METHOD(getBuiltinType)
-  FORWARD_SYMBOL_METHOD(getClassParentId)
+  FORWARD_SYMBOL_ID_METHOD(getClassParent)
   FORWARD_SYMBOL_METHOD(hasConstructor)
   FORWARD_SYMBOL_METHOD(isConstType)
   FORWARD_SYMBOL_METHOD(hasAssignmentOperator)
   FORWARD_SYMBOL_METHOD(hasCastOperator)
   FORWARD_SYMBOL_METHOD(hasNestedTypes)
   FORWARD_SYMBOL_METHOD(getLength)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(getName)
   FORWARD_SYMBOL_METHOD(isNested)
   FORWARD_SYMBOL_METHOD(hasOverloadedOperator)
   FORWARD_SYMBOL_METHOD(isPacked)
   FORWARD_SYMBOL_METHOD(isReference)
   FORWARD_SYMBOL_METHOD(isScoped)
-  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_ID_METHOD(getType)
   FORWARD_SYMBOL_METHOD(getUdtKind)
   FORWARD_SYMBOL_METHOD(isUnalignedType)
-  FORWARD_SYMBOL_METHOD(getVirtualTableShapeId)
+  FORWARD_SYMBOL_ID_METHOD(getVirtualTableShape)
   FORWARD_SYMBOL_METHOD(isVolatileType)
 };
 
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h
index 1874dfef34f7fcd1bfd91f5d21cca5f984251716..47a4525a47bac91f232c3526dcc7d217d9e586ba 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h
@@ -27,15 +27,15 @@ public:
 
   void dump(PDBSymDumper &Dumper) const override;
 
-  FORWARD_SYMBOL_METHOD(getClassParentId)
-  FORWARD_SYMBOL_METHOD(getUnmodifiedTypeId)
+  FORWARD_SYMBOL_ID_METHOD(getClassParent)
+  FORWARD_SYMBOL_ID_METHOD(getUnmodifiedType)
   FORWARD_SYMBOL_METHOD(hasConstructor)
   FORWARD_SYMBOL_METHOD(isConstType)
   FORWARD_SYMBOL_METHOD(hasAssignmentOperator)
   FORWARD_SYMBOL_METHOD(hasCastOperator)
   FORWARD_SYMBOL_METHOD(hasNestedTypes)
   FORWARD_SYMBOL_METHOD(getLength)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(getName)
   FORWARD_SYMBOL_METHOD(isNested)
   FORWARD_SYMBOL_METHOD(hasOverloadedOperator)
@@ -43,7 +43,7 @@ public:
   FORWARD_SYMBOL_METHOD(isScoped)
   FORWARD_SYMBOL_METHOD(getUdtKind)
   FORWARD_SYMBOL_METHOD(isUnalignedType)
-  FORWARD_SYMBOL_METHOD(getVirtualTableShapeId)
+  FORWARD_SYMBOL_ID_METHOD(getVirtualTableShape)
   FORWARD_SYMBOL_METHOD(isVolatileType)
 };
 }
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h
index baf7ab79d60e6cbd19534048186143747499e913..17612ff460643ea08802486ac4326e2130ebeda7 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h
@@ -27,10 +27,10 @@ public:
 
   void dump(PDBSymDumper &Dumper) const override;
 
-  FORWARD_SYMBOL_METHOD(getClassParentId)
+  FORWARD_SYMBOL_ID_METHOD(getClassParent)
   FORWARD_SYMBOL_METHOD(isConstType)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
-  FORWARD_SYMBOL_METHOD(getTypeId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
+  FORWARD_SYMBOL_ID_METHOD(getType)
   FORWARD_SYMBOL_METHOD(isUnalignedType)
   FORWARD_SYMBOL_METHOD(isVolatileType)
 };
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h b/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h
index 431fc1ac8625711d29c06a20b92c7ee5efde2979..8acaabea5bb889849332b85f99089a2605105cd3 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolTypeVTableShape.h
@@ -29,7 +29,7 @@ public:
 
   FORWARD_SYMBOL_METHOD(isConstType)
   FORWARD_SYMBOL_METHOD(getCount)
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(isUnalignedType)
   FORWARD_SYMBOL_METHOD(isVolatileType)
 };
diff --git a/include/llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h b/include/llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h
index a273fe159c12680e73b8f73e55bed2398858b05f..70fbd5b84c34e0c2525cba310b5d090bca4e0294 100644
--- a/include/llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h
+++ b/include/llvm/DebugInfo/PDB/PDBSymbolUsingNamespace.h
@@ -27,7 +27,7 @@ public:
 
   void dump(PDBSymDumper &Dumper) const override;
 
-  FORWARD_SYMBOL_METHOD(getLexicalParentId)
+  FORWARD_SYMBOL_ID_METHOD(getLexicalParent)
   FORWARD_SYMBOL_METHOD(getName)
 };
 
diff --git a/include/llvm/DebugInfo/Symbolize/DIPrinter.h b/include/llvm/DebugInfo/Symbolize/DIPrinter.h
index 49f86eae01cf03273760f428185ef322168a8877..ab82be3706d8391c0b24ec5da3d547f67548ef00 100644
--- a/include/llvm/DebugInfo/Symbolize/DIPrinter.h
+++ b/include/llvm/DebugInfo/Symbolize/DIPrinter.h
@@ -29,15 +29,18 @@ class DIPrinter {
   bool PrintFunctionNames;
   bool PrintPretty;
   int PrintSourceContext;
+  bool Verbose;
 
   void print(const DILineInfo &Info, bool Inlined);
   void printContext(const std::string &FileName, int64_t Line);
 
 public:
   DIPrinter(raw_ostream &OS, bool PrintFunctionNames = true,
-            bool PrintPretty = false, int PrintSourceContext = 0)
+            bool PrintPretty = false, int PrintSourceContext = 0,
+            bool Verbose = false)
       : OS(OS), PrintFunctionNames(PrintFunctionNames),
-        PrintPretty(PrintPretty), PrintSourceContext(PrintSourceContext) {}
+        PrintPretty(PrintPretty), PrintSourceContext(PrintSourceContext),
+        Verbose(Verbose) {}
 
   DIPrinter &operator<<(const DILineInfo &Info);
   DIPrinter &operator<<(const DIInliningInfo &Info);
diff --git a/include/llvm/ExecutionEngine/Orc/OrcError.h b/include/llvm/ExecutionEngine/Orc/OrcError.h
index 34ce2c174c4bd6b7ac1fccdc9b7ebe973ba4eaf4..2fe4a5ee0588998a51ef21087638fb7cadcb4fb9 100644
--- a/include/llvm/ExecutionEngine/Orc/OrcError.h
+++ b/include/llvm/ExecutionEngine/Orc/OrcError.h
@@ -33,7 +33,7 @@ enum class OrcErrorCode : int {
   UnknownRPCFunction
 };
 
-Error orcError(OrcErrorCode ErrCode);
+std::error_code orcError(OrcErrorCode ErrCode);
 
 class RPCFunctionNotSupported : public ErrorInfo<RPCFunctionNotSupported> {
 public:
diff --git a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h
index 506330fe3a5e9e49caeb079d9bdea043d291c52f..a61ff102be0b072c9643d5c95cf95f495022b7f2 100644
--- a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h
+++ b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetServer.h
@@ -132,7 +132,7 @@ private:
     Error setProtections(void *block, unsigned Flags) {
       auto I = Allocs.find(block);
       if (I == Allocs.end())
-        return orcError(OrcErrorCode::RemoteMProtectAddrUnrecognized);
+        return errorCodeToError(orcError(OrcErrorCode::RemoteMProtectAddrUnrecognized));
       return errorCodeToError(
           sys::Memory::protectMappedMemory(I->second, Flags));
     }
@@ -198,7 +198,8 @@ private:
   Error handleCreateRemoteAllocator(ResourceIdMgr::ResourceId Id) {
     auto I = Allocators.find(Id);
     if (I != Allocators.end())
-      return orcError(OrcErrorCode::RemoteAllocatorIdAlreadyInUse);
+      return errorCodeToError(
+               orcError(OrcErrorCode::RemoteAllocatorIdAlreadyInUse));
     DEBUG(dbgs() << "  Created allocator " << Id << "\n");
     Allocators[Id] = Allocator();
     return Error::success();
@@ -207,7 +208,8 @@ private:
   Error handleCreateIndirectStubsOwner(ResourceIdMgr::ResourceId Id) {
     auto I = IndirectStubsOwners.find(Id);
     if (I != IndirectStubsOwners.end())
-      return orcError(OrcErrorCode::RemoteIndirectStubsOwnerIdAlreadyInUse);
+      return errorCodeToError(
+               orcError(OrcErrorCode::RemoteIndirectStubsOwnerIdAlreadyInUse));
     DEBUG(dbgs() << "  Create indirect stubs owner " << Id << "\n");
     IndirectStubsOwners[Id] = ISBlockOwnerList();
     return Error::success();
@@ -224,7 +226,8 @@ private:
   Error handleDestroyRemoteAllocator(ResourceIdMgr::ResourceId Id) {
     auto I = Allocators.find(Id);
     if (I == Allocators.end())
-      return orcError(OrcErrorCode::RemoteAllocatorDoesNotExist);
+      return errorCodeToError(
+               orcError(OrcErrorCode::RemoteAllocatorDoesNotExist));
     Allocators.erase(I);
     DEBUG(dbgs() << "  Destroyed allocator " << Id << "\n");
     return Error::success();
@@ -233,7 +236,8 @@ private:
   Error handleDestroyIndirectStubsOwner(ResourceIdMgr::ResourceId Id) {
     auto I = IndirectStubsOwners.find(Id);
     if (I == IndirectStubsOwners.end())
-      return orcError(OrcErrorCode::RemoteIndirectStubsOwnerDoesNotExist);
+      return errorCodeToError(
+               orcError(OrcErrorCode::RemoteIndirectStubsOwnerDoesNotExist));
     IndirectStubsOwners.erase(I);
     return Error::success();
   }
@@ -246,7 +250,8 @@ private:
 
     auto StubOwnerItr = IndirectStubsOwners.find(Id);
     if (StubOwnerItr == IndirectStubsOwners.end())
-      return orcError(OrcErrorCode::RemoteIndirectStubsOwnerDoesNotExist);
+      return errorCodeToError(
+               orcError(OrcErrorCode::RemoteIndirectStubsOwnerDoesNotExist));
 
     typename TargetT::IndirectStubsInfo IS;
     if (auto Err =
@@ -361,7 +366,8 @@ private:
                                               uint64_t Size, uint32_t Align) {
     auto I = Allocators.find(Id);
     if (I == Allocators.end())
-      return orcError(OrcErrorCode::RemoteAllocatorDoesNotExist);
+      return errorCodeToError(
+               orcError(OrcErrorCode::RemoteAllocatorDoesNotExist));
     auto &Allocator = I->second;
     void *LocalAllocAddr = nullptr;
     if (auto Err = Allocator.allocate(LocalAllocAddr, Size, Align))
@@ -380,7 +386,8 @@ private:
                              JITTargetAddress Addr, uint32_t Flags) {
     auto I = Allocators.find(Id);
     if (I == Allocators.end())
-      return orcError(OrcErrorCode::RemoteAllocatorDoesNotExist);
+      return errorCodeToError(
+               orcError(OrcErrorCode::RemoteAllocatorDoesNotExist));
     auto &Allocator = I->second;
     void *LocalAddr = reinterpret_cast<void *>(static_cast<uintptr_t>(Addr));
     DEBUG(dbgs() << "  Allocator " << Id << " set permissions on " << LocalAddr
diff --git a/include/llvm/ExecutionEngine/Orc/RPCUtils.h b/include/llvm/ExecutionEngine/Orc/RPCUtils.h
index c0ea94897be5609d65e8d34720f86ecab42288ee..fe7e1ba6ff78bb6c3274989b180726dabc353891 100644
--- a/include/llvm/ExecutionEngine/Orc/RPCUtils.h
+++ b/include/llvm/ExecutionEngine/Orc/RPCUtils.h
@@ -26,23 +26,8 @@
 #include "llvm/ExecutionEngine/Orc/OrcError.h"
 #include "llvm/ExecutionEngine/Orc/RPCSerialization.h"
 
-#ifdef _MSC_VER
-// concrt.h depends on eh.h for __uncaught_exception declaration
-// even if we disable exceptions.
-#include <eh.h>
-
-// Disable warnings from ppltasks.h transitively included by <future>.
-#pragma warning(push)
-#pragma warning(disable : 4530)
-#pragma warning(disable : 4062)
-#endif
-
 #include <future>
 
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
 namespace llvm {
 namespace orc {
 namespace rpc {
@@ -82,16 +67,6 @@ std::mutex Function<DerivedFunc, RetT(ArgTs...)>::NameMutex;
 template <typename DerivedFunc, typename RetT, typename... ArgTs>
 std::string Function<DerivedFunc, RetT(ArgTs...)>::Name;
 
-/// Provides a typedef for a tuple containing the decayed argument types.
-template <typename T> class FunctionArgsTuple;
-
-template <typename RetT, typename... ArgTs>
-class FunctionArgsTuple<RetT(ArgTs...)> {
-public:
-  using Type = std::tuple<typename std::decay<
-      typename std::remove_reference<ArgTs>::type>::type...>;
-};
-
 /// Allocates RPC function ids during autonegotiation.
 /// Specializations of this class must provide four members:
 ///
@@ -196,6 +171,16 @@ public:
 
 #endif // _MSC_VER
 
+/// Maps a function type to a std::tuple of its decayed argument types.
+template <typename T> class FunctionArgsTuple;
+
+template <typename RetT, typename... ArgTs>
+class FunctionArgsTuple<RetT(ArgTs...)> {
+public:
+  // std::decay strips references itself, so no remove_reference is needed.
+  using Type = std::tuple<typename std::decay<ArgTs>::type...>;
+};
+
 // ResultTraits provides typedefs and utilities specific to the return type
 // of functions.
 template <typename RetT> class ResultTraits {
@@ -304,8 +289,8 @@ static Error respond(ChannelT &C, const FunctionIdT &ResponseId,
 // the handler ran.
 template <typename WireRetT, typename ChannelT, typename FunctionIdT,
           typename SequenceNumberT>
-static Error respond(ChannelT &C, const FunctionIdT &ResponseId,
-                     SequenceNumberT SeqNo, Error Err) {
+Error respond(ChannelT &C, const FunctionIdT &ResponseId, SequenceNumberT SeqNo,
+              Error Err) {
   if (Err)
     return Err;
   if (auto Err2 = C.startSendMessage(ResponseId, SeqNo))
@@ -339,6 +324,29 @@ public:
   using Type = Error;
 };
 
+// Traits for async handlers: Type is the handler signature minus its leading
+// response function; ResultType is the argument type that function accepts.
+template <typename FnT> class AsyncHandlerTraits;
+
+template <typename ResultT, typename... ArgTs>
+class AsyncHandlerTraits<Error(std::function<Error(Expected<ResultT>)>, ArgTs...)> {
+public:
+  using Type = Error(ArgTs...);
+  using ResultType = Expected<ResultT>;
+};
+
+template <typename... ArgTs>
+class AsyncHandlerTraits<Error(std::function<Error(Error)>, ArgTs...)> {
+public:
+  using Type = Error(ArgTs...);
+  using ResultType = Error;
+};
+
+template <typename ResponseHandlerT, typename... ArgTs>
+class AsyncHandlerTraits<Error(ResponseHandlerT, ArgTs...)> :
+    public AsyncHandlerTraits<Error(typename std::decay<ResponseHandlerT>::type,
+                                    ArgTs...)> {};
+
 // This template class provides utilities related to RPC function handlers.
 // The base case applies to non-function types (the template class is
 // specialized for function types) and inherits from the appropriate
@@ -358,15 +366,20 @@ public:
   // Return type of the handler.
   using ReturnType = RetT;
 
-  // A std::tuple wrapping the handler arguments.
-  using ArgStorage = typename FunctionArgsTuple<RetT(ArgTs...)>::Type;
-
   // Call the given handler with the given arguments.
-  template <typename HandlerT>
+  template <typename HandlerT, typename... TArgTs>
   static typename WrappedHandlerReturn<RetT>::Type
-  unpackAndRun(HandlerT &Handler, ArgStorage &Args) {
+  unpackAndRun(HandlerT &Handler, std::tuple<TArgTs...> &Args) {
     return unpackAndRunHelper(Handler, Args,
-                              llvm::index_sequence_for<ArgTs...>());
+                              llvm::index_sequence_for<TArgTs...>());
+  }
+
+  // Call the given async handler with the given responder and arguments.
+  template <typename HandlerT, typename ResponderT, typename... TArgTs>
+  static Error unpackAndRunAsync(HandlerT &Handler, ResponderT &Responder,
+                                 std::tuple<TArgTs...> &Args) {
+    return unpackAndRunAsyncHelper(Handler, Responder, Args,
+                                   llvm::index_sequence_for<TArgTs...>());
   }
 
   // Call the given handler with the given arguments.
@@ -379,11 +392,11 @@ public:
     return Error::success();
   }
 
-  template <typename HandlerT>
+  template <typename HandlerT, typename... TArgTs>
   static typename std::enable_if<
       !std::is_void<typename HandlerTraits<HandlerT>::ReturnType>::value,
       typename HandlerTraits<HandlerT>::ReturnType>::type
-  run(HandlerT &Handler, ArgTs... Args) {
+  run(HandlerT &Handler, TArgTs... Args) {
     return Handler(std::move(Args)...);
   }
 
@@ -408,15 +421,31 @@ private:
         C, std::get<Indexes>(Args)...);
   }
 
-  template <typename HandlerT, size_t... Indexes>
+  template <typename HandlerT, typename ArgTuple, size_t... Indexes>
   static typename WrappedHandlerReturn<
       typename HandlerTraits<HandlerT>::ReturnType>::Type
-  unpackAndRunHelper(HandlerT &Handler, ArgStorage &Args,
+  unpackAndRunHelper(HandlerT &Handler, ArgTuple &Args,
                      llvm::index_sequence<Indexes...>) {
     return run(Handler, std::move(std::get<Indexes>(Args))...);
   }
+
+
+  template <typename HandlerT, typename ResponderT, typename ArgTuple,
+            size_t... Indexes>
+  static typename WrappedHandlerReturn<
+      typename HandlerTraits<HandlerT>::ReturnType>::Type
+  unpackAndRunAsyncHelper(HandlerT &Handler, ResponderT &Responder,
+                          ArgTuple &Args,
+                          llvm::index_sequence<Indexes...>) {
+    return run(Handler, Responder, std::move(std::get<Indexes>(Args))...);
+  }
 };
 
+// Handler traits for free functions.
+template <typename RetT, typename... ArgTs>
+class HandlerTraits<RetT(*)(ArgTs...)>
+  : public HandlerTraits<RetT(ArgTs...)> {};
+
 // Handler traits for class methods (especially call operators for lambdas).
 template <typename Class, typename RetT, typename... ArgTs>
 class HandlerTraits<RetT (Class::*)(ArgTs...)>
@@ -471,7 +500,7 @@ public:
 
   // Create an error instance representing an abandoned response.
   static Error createAbandonedResponseError() {
-    return orcError(OrcErrorCode::RPCResponseAbandoned);
+    return errorCodeToError(orcError(OrcErrorCode::RPCResponseAbandoned));
   }
 };
 
@@ -493,7 +522,7 @@ public:
       return Err;
     if (auto Err = C.endReceiveMessage())
       return Err;
-    return Handler(Result);
+    return Handler(std::move(Result));
   }
 
   // Abandon this response by calling the handler with an 'abandoned response'
@@ -758,8 +787,7 @@ public:
     auto NegotiateId = FnIdAllocator.getNegotiateId();
     RemoteFunctionIds[OrcRPCNegotiate::getPrototype()] = NegotiateId;
     Handlers[NegotiateId] = wrapHandler<OrcRPCNegotiate>(
-        [this](const std::string &Name) { return handleNegotiate(Name); },
-        LaunchPolicy());
+        [this](const std::string &Name) { return handleNegotiate(Name); });
   }
 
 
@@ -789,7 +817,8 @@ public:
       // This isn't a channel error so we don't want to abandon other pending
       // responses, but we still need to run the user handler with an error to
       // let them know the call failed.
-      if (auto Err = Handler(orcError(OrcErrorCode::UnknownRPCFunction)))
+      if (auto Err = Handler(errorCodeToError(
+                               orcError(OrcErrorCode::UnknownRPCFunction))))
         report_fatal_error(std::move(Err));
       return FnIdOrErr.takeError();
     }
@@ -856,7 +885,7 @@ public:
       return I->second(C, SeqNo);
 
     // else: No handler found. Report error to client?
-    return orcError(OrcErrorCode::UnexpectedRPCCall);
+    return errorCodeToError(orcError(OrcErrorCode::UnexpectedRPCCall));
   }
 
   /// Helper for handling setter procedures - this method returns a functor that
@@ -914,9 +943,6 @@ public:
   }
 
 protected:
-  // The LaunchPolicy type allows a launch policy to be specified when adding
-  // a function handler. See addHandlerImpl.
-  using LaunchPolicy = std::function<Error(std::function<Error()>)>;
 
   FunctionIdT getInvalidFunctionId() const {
     return FnIdAllocator.getInvalidId();
@@ -925,7 +951,7 @@ protected:
   /// Add the given handler to the handler map and make it available for
   /// autonegotiation and execution.
   template <typename Func, typename HandlerT>
-  void addHandlerImpl(HandlerT Handler, LaunchPolicy Launch) {
+  void addHandlerImpl(HandlerT Handler) {
 
     static_assert(detail::RPCArgTypeCheck<
                       CanDeserializeCheck, typename Func::Type,
@@ -934,8 +960,22 @@ protected:
 
     FunctionIdT NewFnId = FnIdAllocator.template allocate<Func>();
     LocalFunctionIds[Func::getPrototype()] = NewFnId;
-    Handlers[NewFnId] =
-        wrapHandler<Func>(std::move(Handler), std::move(Launch));
+    Handlers[NewFnId] = wrapHandler<Func>(std::move(Handler));
+  }
+
+  template <typename Func, typename HandlerT>
+  void addAsyncHandlerImpl(HandlerT Handler) {
+
+    static_assert(detail::RPCArgTypeCheck<
+                      CanDeserializeCheck, typename Func::Type,
+                      typename detail::AsyncHandlerTraits<
+                        typename detail::HandlerTraits<HandlerT>::Type
+                      >::Type>::value,
+                  "");
+
+    FunctionIdT NewFnId = FnIdAllocator.template allocate<Func>();
+    LocalFunctionIds[Func::getPrototype()] = NewFnId;
+    Handlers[NewFnId] = wrapAsyncHandler<Func>(std::move(Handler));
   }
 
   Error handleResponse(SequenceNumberT SeqNo) {
@@ -955,7 +995,7 @@ protected:
         // Unlock the pending results map to prevent recursive lock.
         Lock.unlock();
         abandonPendingResponses();
-        return orcError(OrcErrorCode::UnexpectedRPCResponse);
+        return errorCodeToError(orcError(OrcErrorCode::UnexpectedRPCResponse));
       }
     }
 
@@ -1017,12 +1057,15 @@ protected:
   // Wrap the given user handler in the necessary argument-deserialization code,
   // result-serialization code, and call to the launch policy (if present).
   template <typename Func, typename HandlerT>
-  WrappedHandlerFn wrapHandler(HandlerT Handler, LaunchPolicy Launch) {
-    return [this, Handler, Launch](ChannelT &Channel,
-                                   SequenceNumberT SeqNo) mutable -> Error {
+  WrappedHandlerFn wrapHandler(HandlerT Handler) {
+    return [this, Handler](ChannelT &Channel,
+                           SequenceNumberT SeqNo) mutable -> Error {
       // Start by deserializing the arguments.
-      auto Args = std::make_shared<
-          typename detail::HandlerTraits<HandlerT>::ArgStorage>();
+      using ArgsTuple =
+          typename detail::FunctionArgsTuple<
+            typename detail::HandlerTraits<HandlerT>::Type>::Type;
+      auto Args = std::make_shared<ArgsTuple>();
+
       if (auto Err =
               detail::HandlerTraits<typename Func::Type>::deserializeArgs(
                   Channel, *Args))
@@ -1037,22 +1080,49 @@ protected:
       if (auto Err = Channel.endReceiveMessage())
         return Err;
 
-      // Build the handler/responder.
-      auto Responder = [this, Handler, Args, &Channel,
-                        SeqNo]() mutable -> Error {
-        using HTraits = detail::HandlerTraits<HandlerT>;
-        using FuncReturn = typename Func::ReturnType;
-        return detail::respond<FuncReturn>(
-            Channel, ResponseId, SeqNo, HTraits::unpackAndRun(Handler, *Args));
-      };
-
-      // If there is an explicit launch policy then use it to launch the
-      // handler.
-      if (Launch)
-        return Launch(std::move(Responder));
-
-      // Otherwise run the handler on the listener thread.
-      return Responder();
+      using HTraits = detail::HandlerTraits<HandlerT>;
+      using FuncReturn = typename Func::ReturnType;
+      return detail::respond<FuncReturn>(Channel, ResponseId, SeqNo,
+                                         HTraits::unpackAndRun(Handler, *Args));
+    };
+  }
+
+  // Wrap the given async user handler in the necessary argument-deserialization
+  // code, and hand it a responder that sends its result via detail::respond.
+  template <typename Func, typename HandlerT>
+  WrappedHandlerFn wrapAsyncHandler(HandlerT Handler) {
+    return [this, Handler](ChannelT &Channel,
+                           SequenceNumberT SeqNo) mutable -> Error {
+      // Start by deserializing the arguments.
+      using AHTraits = detail::AsyncHandlerTraits<
+                         typename detail::HandlerTraits<HandlerT>::Type>;
+      using ArgsTuple =
+          typename detail::FunctionArgsTuple<typename AHTraits::Type>::Type;
+      auto Args = std::make_shared<ArgsTuple>();
+
+      if (auto Err =
+              detail::HandlerTraits<typename Func::Type>::deserializeArgs(
+                  Channel, *Args))
+        return Err;
+
+      // GCC 4.7 and 4.8 incorrectly issue a -Wunused-but-set-variable warning
+      // for Args. Void cast Args to work around this for now.
+      // FIXME: Remove this workaround once we can assume a working GCC version.
+      (void)Args;
+
+      // End receive message, unlocking the channel for reading.
+      if (auto Err = Channel.endReceiveMessage())
+        return Err;
+
+      using HTraits = detail::HandlerTraits<HandlerT>;
+      using FuncReturn = typename Func::ReturnType;
+      auto Responder =
+        [this, SeqNo](typename AHTraits::ResultType RetVal) -> Error {
+          return detail::respond<FuncReturn>(C, ResponseId, SeqNo,
+                                             std::move(RetVal));
+        };
+
+      return HTraits::unpackAndRunAsync(Handler, Responder, *Args);
     };
   }
 
@@ -1092,40 +1162,31 @@ public:
   MultiThreadedRPCEndpoint(ChannelT &C, bool LazyAutoNegotiation)
       : BaseClass(C, LazyAutoNegotiation) {}
 
-  /// The LaunchPolicy type allows a launch policy to be specified when adding
-  /// a function handler. See addHandler.
-  using LaunchPolicy = typename BaseClass::LaunchPolicy;
-
   /// Add a handler for the given RPC function.
   /// This installs the given handler functor for the given RPC Function, and
   /// makes the RPC function available for negotiation/calling from the remote.
-  ///
-  /// The optional LaunchPolicy argument can be used to control how the handler
-  /// is run when called:
-  ///
-  /// * If no LaunchPolicy is given, the handler code will be run on the RPC
-  ///   handler thread that is reading from the channel. This handler cannot
-  ///   make blocking RPC calls (since it would be blocking the thread used to
-  ///   get the result), but can make non-blocking calls.
-  ///
-  /// * If a LaunchPolicy is given, the user's handler will be wrapped in a
-  ///   call to serialize and send the result, and the resulting functor (with
-  ///   type 'Error()' will be passed to the LaunchPolicy. The user can then
-  ///   choose to add the wrapped handler to a work queue, spawn a new thread,
-  ///   or anything else.
   template <typename Func, typename HandlerT>
-  void addHandler(HandlerT Handler, LaunchPolicy Launch = LaunchPolicy()) {
-    return this->template addHandlerImpl<Func>(std::move(Handler),
-                                               std::move(Launch));
+  void addHandler(HandlerT Handler) {
+    return this->template addHandlerImpl<Func>(std::move(Handler));
   }
 
   /// Add a class-method as a handler.
   template <typename Func, typename ClassT, typename RetT, typename... ArgTs>
-  void addHandler(ClassT &Object, RetT (ClassT::*Method)(ArgTs...),
-                  LaunchPolicy Launch = LaunchPolicy()) {
+  void addHandler(ClassT &Object, RetT (ClassT::*Method)(ArgTs...)) {
     addHandler<Func>(
-      detail::MemberFnWrapper<ClassT, RetT, ArgTs...>(Object, Method),
-      Launch);
+      detail::MemberFnWrapper<ClassT, RetT, ArgTs...>(Object, Method));
+  }
+
+  template <typename Func, typename HandlerT>
+  void addAsyncHandler(HandlerT Handler) {
+    return this->template addAsyncHandlerImpl<Func>(std::move(Handler));
+  }
+
+  /// Add a class-method as an async handler.
+  template <typename Func, typename ClassT, typename RetT, typename... ArgTs>
+  void addAsyncHandler(ClassT &Object, RetT (ClassT::*Method)(ArgTs...)) {
+    addAsyncHandler<Func>(
+      detail::MemberFnWrapper<ClassT, RetT, ArgTs...>(Object, Method));
   }
 
   /// Return type for non-blocking call primitives.
@@ -1215,16 +1276,13 @@ private:
         SingleThreadedRPCEndpoint<ChannelT, FunctionIdT, SequenceNumberT>,
         ChannelT, FunctionIdT, SequenceNumberT>;
 
-  using LaunchPolicy = typename BaseClass::LaunchPolicy;
-
 public:
   SingleThreadedRPCEndpoint(ChannelT &C, bool LazyAutoNegotiation)
       : BaseClass(C, LazyAutoNegotiation) {}
 
   template <typename Func, typename HandlerT>
   void addHandler(HandlerT Handler) {
-    return this->template addHandlerImpl<Func>(std::move(Handler),
-                                               LaunchPolicy());
+    return this->template addHandlerImpl<Func>(std::move(Handler));
   }
 
   template <typename Func, typename ClassT, typename RetT, typename... ArgTs>
@@ -1233,6 +1291,18 @@ public:
         detail::MemberFnWrapper<ClassT, RetT, ArgTs...>(Object, Method));
   }
 
+  template <typename Func, typename HandlerT>
+  void addAsyncHandler(HandlerT Handler) {
+    return this->template addAsyncHandlerImpl<Func>(std::move(Handler));
+  }
+
+  /// Add a class-method as an async handler.
+  template <typename Func, typename ClassT, typename RetT, typename... ArgTs>
+  void addAsyncHandler(ClassT &Object, RetT (ClassT::*Method)(ArgTs...)) {
+    addAsyncHandler<Func>(
+      detail::MemberFnWrapper<ClassT, RetT, ArgTs...>(Object, Method));
+  }
+
   template <typename Func, typename... ArgTs,
             typename AltRetT = typename Func::ReturnType>
   typename detail::ResultTraits<AltRetT>::ErrorReturnType
diff --git a/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h
similarity index 96%
rename from include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h
rename to include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h
index 0588d2228598143cce18eab7c7bd4a56e3ce681a..babcc7f26aab5e4268b0b5cac4b5da8c58ddb080 100644
--- a/include/llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h
+++ b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h
@@ -1,4 +1,4 @@
-//===- ObjectLinkingLayer.h - Add object files to a JIT process -*- C++ -*-===//
+//===-- RTDyldObjectLinkingLayer.h - RTDyld-based jit linking  --*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Contains the definition for the object layer of the JIT.
+// Contains the definition for an RTDyld-based, in-process object linking layer.
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_EXECUTIONENGINE_ORC_OBJECTLINKINGLAYER_H
-#define LLVM_EXECUTIONENGINE_ORC_OBJECTLINKINGLAYER_H
+#ifndef LLVM_EXECUTIONENGINE_ORC_RTDYLDOBJECTLINKINGLAYER_H
+#define LLVM_EXECUTIONENGINE_ORC_RTDYLDOBJECTLINKINGLAYER_H
 
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringMap.h"
@@ -35,7 +35,7 @@
 namespace llvm {
 namespace orc {
 
-class ObjectLinkingLayerBase {
+class RTDyldObjectLinkingLayerBase {
 protected:
   /// @brief Holds a set of objects to be allocated/linked as a unit in the JIT.
   ///
@@ -87,7 +87,7 @@ public:
 class DoNothingOnNotifyLoaded {
 public:
   template <typename ObjSetT, typename LoadResult>
-  void operator()(ObjectLinkingLayerBase::ObjSetHandleT, const ObjSetT &,
+  void operator()(RTDyldObjectLinkingLayerBase::ObjSetHandleT, const ObjSetT &,
                   const LoadResult &) {}
 };
 
@@ -98,7 +98,7 @@ public:
 /// symbols queried. All objects added to this layer can see each other's
 /// symbols.
 template <typename NotifyLoadedFtor = DoNothingOnNotifyLoaded>
-class ObjectLinkingLayer : public ObjectLinkingLayerBase {
+class RTDyldObjectLinkingLayer : public RTDyldObjectLinkingLayerBase {
 public:
   /// @brief Functor for receiving finalization notifications.
   typedef std::function<void(ObjSetHandleT)> NotifyFinalizedFtor;
@@ -227,7 +227,7 @@ public:
 
   /// @brief Construct an ObjectLinkingLayer with the given NotifyLoaded,
   ///        and NotifyFinalized functors.
-  ObjectLinkingLayer(
+  RTDyldObjectLinkingLayer(
       NotifyLoadedFtor NotifyLoaded = NotifyLoadedFtor(),
       NotifyFinalizedFtor NotifyFinalized = NotifyFinalizedFtor())
       : NotifyLoaded(std::move(NotifyLoaded)),
@@ -359,4 +359,4 @@ private:
 } // end namespace orc
 } // end namespace llvm
 
-#endif // LLVM_EXECUTIONENGINE_ORC_OBJECTLINKINGLAYER_H
+#endif // LLVM_EXECUTIONENGINE_ORC_RTDYLDOBJECTLINKINGLAYER_H
diff --git a/include/llvm/ExecutionEngine/Orc/RawByteChannel.h b/include/llvm/ExecutionEngine/Orc/RawByteChannel.h
index ce01c663afecb6651d850b5011298d056ce5d665..39753edaefc5dd45e372b6533aa10f3c9d80c0fe 100644
--- a/include/llvm/ExecutionEngine/Orc/RawByteChannel.h
+++ b/include/llvm/ExecutionEngine/Orc/RawByteChannel.h
@@ -142,10 +142,12 @@ public:
   }
 };
 
-template <typename ChannelT>
-class SerializationTraits<ChannelT, std::string, const char *,
-                          typename std::enable_if<std::is_base_of<
-                              RawByteChannel, ChannelT>::value>::type> {
+template <typename ChannelT, typename T>
+class SerializationTraits<ChannelT, std::string, T,
+                          typename std::enable_if<
+                            std::is_base_of<RawByteChannel, ChannelT>::value &&
+                            (std::is_same<T, const char*>::value ||
+                             std::is_same<T, char*>::value)>::type> {
 public:
   static Error serialize(RawByteChannel &C, const char *S) {
     return SerializationTraits<ChannelT, std::string, StringRef>::serialize(C,
diff --git a/include/llvm/IR/Argument.h b/include/llvm/IR/Argument.h
index d8b280a66f1847fd433988dfa232395c6d2f4489..6fc1dd2f285a190316d39f2ba29eae5b1a097186 100644
--- a/include/llvm/IR/Argument.h
+++ b/include/llvm/IR/Argument.h
@@ -21,127 +21,110 @@
 
 namespace llvm {
 
-template <typename NodeTy> class SymbolTableListTraits;
-
-/// \brief LLVM Argument representation
-///
 /// This class represents an incoming formal argument to a Function. A formal
 /// argument, since it is ``formal'', does not contain an actual value but
 /// instead represents the type, argument number, and attributes of an argument
 /// for a specific function. When used in the body of said function, the
 /// argument of course represents the value of the actual argument that the
 /// function was called with.
-class Argument : public Value, public ilist_node<Argument> {
+class Argument : public Value {
   virtual void anchor();
   Function *Parent;
+  unsigned ArgNo;
 
-  friend class SymbolTableListTraits<Argument>;
+  friend class Function;
   void setParent(Function *parent);
 
 public:
-  /// \brief Constructor.
-  ///
-  /// If \p F is specified, the argument is inserted at the end of the argument
-  /// list for \p F.
-  explicit Argument(Type *Ty, const Twine &Name = "", Function *F = nullptr);
+  /// Argument constructor.
+  explicit Argument(Type *Ty, const Twine &Name = "", Function *F = nullptr,
+                    unsigned ArgNo = 0);
 
   inline const Function *getParent() const { return Parent; }
   inline       Function *getParent()       { return Parent; }
 
-  /// \brief Return the index of this formal argument in its containing
-  /// function.
+  /// Return the index of this formal argument in its containing function.
   ///
   /// For example in "void foo(int a, float b)" a is 0 and b is 1.
-  unsigned getArgNo() const;
+  unsigned getArgNo() const {
+    assert(Parent && "can't get number of unparented arg");
+    return ArgNo;
+  }
 
-  /// \brief Return true if this argument has the nonnull attribute on it in
-  /// its containing function. Also returns true if at least one byte is known
-  /// to be dereferenceable and the pointer is in addrspace(0).
+  /// Return true if this argument has the nonnull attribute. Also returns true
+  /// if at least one byte is known to be dereferenceable and the pointer is in
+  /// addrspace(0).
   bool hasNonNullAttr() const;
 
-  /// \brief If this argument has the dereferenceable attribute on it in its
-  /// containing function, return the number of bytes known to be
-  /// dereferenceable. Otherwise, zero is returned.
+  /// If this argument has the dereferenceable attribute, return the number of
+  /// bytes known to be dereferenceable. Otherwise, zero is returned.
   uint64_t getDereferenceableBytes() const;
 
-  /// \brief If this argument has the dereferenceable_or_null attribute on
-  /// it in its containing function, return the number of bytes known to be
-  /// dereferenceable. Otherwise, zero is returned.
+  /// If this argument has the dereferenceable_or_null attribute, return the
+  /// number of bytes known to be dereferenceable. Otherwise, zero is returned.
   uint64_t getDereferenceableOrNullBytes() const;
 
-  /// \brief Return true if this argument has the byval attribute on it in its
-  /// containing function.
+  /// Return true if this argument has the byval attribute.
   bool hasByValAttr() const;
 
-  /// \brief Return true if this argument has the swiftself attribute.
+  /// Return true if this argument has the swiftself attribute.
   bool hasSwiftSelfAttr() const;
 
-  /// \brief Return true if this argument has the swifterror attribute.
+  /// Return true if this argument has the swifterror attribute.
   bool hasSwiftErrorAttr() const;
 
-  /// \brief Return true if this argument has the byval attribute or inalloca
-  /// attribute on it in its containing function.  These attributes both
-  /// represent arguments being passed by value.
+  /// Return true if this argument has the byval attribute or inalloca
+  /// attribute. These attributes represent arguments being passed by value.
   bool hasByValOrInAllocaAttr() const;
 
-  /// \brief If this is a byval or inalloca argument, return its alignment.
+  /// If this is a byval or inalloca argument, return its alignment.
   unsigned getParamAlignment() const;
 
-  /// \brief Return true if this argument has the nest attribute on it in its
-  /// containing function.
+  /// Return true if this argument has the nest attribute.
   bool hasNestAttr() const;
 
-  /// \brief Return true if this argument has the noalias attribute on it in its
-  /// containing function.
+  /// Return true if this argument has the noalias attribute.
   bool hasNoAliasAttr() const;
 
-  /// \brief Return true if this argument has the nocapture attribute on it in
-  /// its containing function.
+  /// Return true if this argument has the nocapture attribute.
   bool hasNoCaptureAttr() const;
 
-  /// \brief Return true if this argument has the sret attribute on it in its
-  /// containing function.
+  /// Return true if this argument has the sret attribute.
   bool hasStructRetAttr() const;
 
-  /// \brief Return true if this argument has the returned attribute on it in
-  /// its containing function.
+  /// Return true if this argument has the returned attribute.
   bool hasReturnedAttr() const;
 
-  /// \brief Return true if this argument has the readonly or readnone attribute
-  /// on it in its containing function.
+  /// Return true if this argument has the readonly or readnone attribute.
   bool onlyReadsMemory() const;
 
-  /// \brief Return true if this argument has the inalloca attribute on it in
-  /// its containing function.
+  /// Return true if this argument has the inalloca attribute.
   bool hasInAllocaAttr() const;
 
-  /// \brief Return true if this argument has the zext attribute on it in its
-  /// containing function.
+  /// Return true if this argument has the zext attribute.
   bool hasZExtAttr() const;
 
-  /// \brief Return true if this argument has the sext attribute on it in its
-  /// containing function.
+  /// Return true if this argument has the sext attribute.
   bool hasSExtAttr() const;
 
-  /// \brief Add a Attribute to an argument.
-  void addAttr(AttributeSet AS);
+  /// Add attributes to an argument.
+  void addAttr(AttributeList AS);
 
   void addAttr(Attribute::AttrKind Kind) {
-    addAttr(AttributeSet::get(getContext(), getArgNo() + 1, Kind));
+    addAttr(AttributeList::get(getContext(), getArgNo() + 1, Kind));
   }
 
-  /// \brief Remove a Attribute from an argument.
-  void removeAttr(AttributeSet AS);
+  /// Remove attributes from an argument.
+  void removeAttr(AttributeList AS);
 
   void removeAttr(Attribute::AttrKind Kind) {
-    removeAttr(AttributeSet::get(getContext(), getArgNo() + 1, Kind));
+    removeAttr(AttributeList::get(getContext(), getArgNo() + 1, Kind));
   }
 
-  /// \brief Checks if an argument has a given attribute.
+  /// Check if an argument has a given attribute.
   bool hasAttribute(Attribute::AttrKind Kind) const;
 
-  /// \brief Method for support type inquiry through isa, cast, and
-  /// dyn_cast.
+  /// Method for support type inquiry through isa, cast, and dyn_cast.
   static inline bool classof(const Value *V) {
     return V->getValueID() == ArgumentVal;
   }
diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h
index 15783858dd322d2747f5b6e8f152261de0e24b8a..457682b9b2e72facddeaee1842d8c644823942e8 100644
--- a/include/llvm/IR/Attributes.h
+++ b/include/llvm/IR/Attributes.h
@@ -1,4 +1,4 @@
-//===-- llvm/Attributes.h - Container for Attributes ------------*- C++ -*-===//
+//===- llvm/Attributes.h - Container for Attributes -------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -18,22 +18,24 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/Optional.h"
-#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/PointerLikeTypeTraits.h"
 #include "llvm-c/Types.h"
 #include <bitset>
 #include <cassert>
+#include <cstdint>
 #include <map>
 #include <string>
+#include <utility>
 
 namespace llvm {
 
 class AttrBuilder;
 class AttributeImpl;
-class AttributeSetImpl;
+class AttributeListImpl;
 class AttributeSetNode;
-class Constant;
 template<typename T> struct DenseMapInfo;
 class Function;
 class LLVMContext;
@@ -73,11 +75,12 @@ public:
   };
 
 private:
-  AttributeImpl *pImpl;
+  AttributeImpl *pImpl = nullptr;
+
   Attribute(AttributeImpl *A) : pImpl(A) {}
 
 public:
-  Attribute() : pImpl(nullptr) {}
+  Attribute() = default;
 
   //===--------------------------------------------------------------------===//
   // Attribute Construction
@@ -192,15 +195,93 @@ inline Attribute unwrap(LLVMAttributeRef Attr) {
   return Attribute::fromRawPointer(Attr);
 }
 
+//===----------------------------------------------------------------------===//
+/// \class
+/// This class holds the attributes for a particular argument, parameter,
+/// function, or return value. It is an immutable value type that is cheap to
+/// copy. Adding and removing enum attributes is intended to be fast, but adding
+/// and removing string or integer attributes involves a FoldingSet lookup.
+class AttributeSet {
+  // TODO: Extract AvailableAttrs from AttributeSetNode and store them here.
+  // This will allow an efficient implementation of addAttribute and
+  // removeAttribute for enum attrs.
+
+  /// Private implementation pointer.
+  AttributeSetNode *SetNode = nullptr;
+
+  friend AttributeListImpl;
+  template <typename Ty> friend struct DenseMapInfo;
+
+private:
+  AttributeSet(AttributeSetNode *ASN) : SetNode(ASN) {}
+
+public:
+  /// AttributeSet is a trivially copyable value type.
+  AttributeSet() = default;
+  AttributeSet(const AttributeSet &) = default;
+  ~AttributeSet() = default;
+
+  static AttributeSet get(LLVMContext &C, const AttrBuilder &B);
+  static AttributeSet get(LLVMContext &C, ArrayRef<Attribute> Attrs);
+
+  bool operator==(const AttributeSet &O) const { return SetNode == O.SetNode; }
+  bool operator!=(const AttributeSet &O) const { return !(*this == O); }
+
+  unsigned getNumAttributes() const;
+
+  bool hasAttributes() const { return SetNode != nullptr; }
+
+  bool hasAttribute(Attribute::AttrKind Kind) const;
+  bool hasAttribute(StringRef Kind) const;
+
+  Attribute getAttribute(Attribute::AttrKind Kind) const;
+  Attribute getAttribute(StringRef Kind) const;
+
+  unsigned getAlignment() const;
+  unsigned getStackAlignment() const;
+  uint64_t getDereferenceableBytes() const;
+  uint64_t getDereferenceableOrNullBytes() const;
+  std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const;
+  std::string getAsString(bool InAttrGrp) const;
+
+  typedef const Attribute *iterator;
+  iterator begin() const;
+  iterator end() const;
+};
+
+//===----------------------------------------------------------------------===//
+/// \class
+/// \brief Provide DenseMapInfo for AttributeSet.
+template <> struct DenseMapInfo<AttributeSet> {
+  static inline AttributeSet getEmptyKey() {
+    uintptr_t Val = static_cast<uintptr_t>(-1);
+    Val <<= PointerLikeTypeTraits<void *>::NumLowBitsAvailable;
+    return AttributeSet(reinterpret_cast<AttributeSetNode *>(Val));
+  }
+
+  static inline AttributeSet getTombstoneKey() {
+    uintptr_t Val = static_cast<uintptr_t>(-2);
+    Val <<= PointerLikeTypeTraits<void *>::NumLowBitsAvailable;
+    return AttributeSet(reinterpret_cast<AttributeSetNode *>(Val));
+  }
+
+  static unsigned getHashValue(AttributeSet AS) {
+    return (unsigned((uintptr_t)AS.SetNode) >> 4) ^
+           (unsigned((uintptr_t)AS.SetNode) >> 9);
+  }
+
+  static bool isEqual(AttributeSet LHS, AttributeSet RHS) { return LHS == RHS; }
+};
+
 //===----------------------------------------------------------------------===//
 /// \class
 /// \brief This class holds the attributes for a function, its return value, and
 /// its parameters. You access the attributes for each of them via an index into
-/// the AttributeSet object. The function attributes are at index
-/// `AttributeSet::FunctionIndex', the return value is at index
-/// `AttributeSet::ReturnIndex', and the attributes for the parameters start at
+/// the AttributeList object. The function attributes are at index
+/// `AttributeList::FunctionIndex', the return value is at index
+/// `AttributeList::ReturnIndex', and the attributes for the parameters start at
 /// index `1'.
-class AttributeSet {
+class AttributeList {
 public:
   enum AttrIndex : unsigned {
     ReturnIndex = 0U,
@@ -209,113 +290,136 @@ public:
 
 private:
   friend class AttrBuilder;
-  friend class AttributeSetImpl;
+  friend class AttributeListImpl;
+  friend class AttributeSet;
   friend class AttributeSetNode;
+
   template <typename Ty> friend struct DenseMapInfo;
 
   /// \brief The attributes that we are managing. This can be null to represent
   /// the empty attributes list.
-  AttributeSetImpl *pImpl;
-
-  /// \brief The attributes for the specified index are returned.
-  AttributeSetNode *getAttributes(unsigned Index) const;
-
-  /// \brief Create an AttributeSet with the specified parameters in it.
-  static AttributeSet get(LLVMContext &C,
-                          ArrayRef<std::pair<unsigned, Attribute> > Attrs);
-  static AttributeSet get(LLVMContext &C,
-                          ArrayRef<std::pair<unsigned,
-                                             AttributeSetNode*> > Attrs);
+  AttributeListImpl *pImpl = nullptr;
 
-  static AttributeSet getImpl(LLVMContext &C,
-                              ArrayRef<std::pair<unsigned,
-                                                 AttributeSetNode*> > Attrs);
+public:
+  /// \brief Create an AttributeList with the specified parameters in it.
+  static AttributeList get(LLVMContext &C,
+                           ArrayRef<std::pair<unsigned, Attribute>> Attrs);
+  static AttributeList
+  get(LLVMContext &C, ArrayRef<std::pair<unsigned, AttributeSet>> Attrs);
+
+  /// \brief Create an AttributeList from an array of AttributeSets. The
+  /// index of each set is implied by its position in the array \p Attrs:
+  ///   0      : Return attributes
+  /// 1 to n-1 : Argument attributes
+  ///   n      : Function attributes
+  /// Any element that has no entries should be left null.
+  static AttributeList get(LLVMContext &C, ArrayRef<AttributeSet> Attrs);
+
+  static AttributeList
+  getImpl(LLVMContext &C,
+          ArrayRef<std::pair<unsigned, AttributeSet>> Attrs);
 
-  explicit AttributeSet(AttributeSetImpl *LI) : pImpl(LI) {}
+private:
+  explicit AttributeList(AttributeListImpl *LI) : pImpl(LI) {}
 
 public:
-  AttributeSet() : pImpl(nullptr) {}
+  AttributeList() = default;
 
   //===--------------------------------------------------------------------===//
-  // AttributeSet Construction and Mutation
+  // AttributeList Construction and Mutation
   //===--------------------------------------------------------------------===//
 
-  /// \brief Return an AttributeSet with the specified parameters in it.
-  static AttributeSet get(LLVMContext &C, ArrayRef<AttributeSet> Attrs);
-  static AttributeSet get(LLVMContext &C, unsigned Index,
-                          ArrayRef<Attribute::AttrKind> Kinds);
-  static AttributeSet get(LLVMContext &C, unsigned Index,
-                          ArrayRef<StringRef> Kind);
-  static AttributeSet get(LLVMContext &C, unsigned Index, const AttrBuilder &B);
+  /// \brief Return an AttributeList with the specified parameters in it.
+  static AttributeList get(LLVMContext &C, ArrayRef<AttributeList> Attrs);
+  static AttributeList get(LLVMContext &C, unsigned Index,
+                           ArrayRef<Attribute::AttrKind> Kinds);
+  static AttributeList get(LLVMContext &C, unsigned Index,
+                           ArrayRef<StringRef> Kind);
+  static AttributeList get(LLVMContext &C, unsigned Index,
+                           const AttrBuilder &B);
 
   /// \brief Add an attribute to the attribute set at the given index. Because
   /// attribute sets are immutable, this returns a new set.
-  AttributeSet addAttribute(LLVMContext &C, unsigned Index,
-                            Attribute::AttrKind Kind) const;
+  AttributeList addAttribute(LLVMContext &C, unsigned Index,
+                             Attribute::AttrKind Kind) const;
 
   /// \brief Add an attribute to the attribute set at the given index. Because
   /// attribute sets are immutable, this returns a new set.
-  AttributeSet addAttribute(LLVMContext &C, unsigned Index, StringRef Kind,
-                            StringRef Value = StringRef()) const;
+  AttributeList addAttribute(LLVMContext &C, unsigned Index, StringRef Kind,
+                             StringRef Value = StringRef()) const;
 
   /// Add an attribute to the attribute set at the given indices. Because
   /// attribute sets are immutable, this returns a new set.
-  AttributeSet addAttribute(LLVMContext &C, ArrayRef<unsigned> Indices,
-                            Attribute A) const;
+  AttributeList addAttribute(LLVMContext &C, ArrayRef<unsigned> Indices,
+                             Attribute A) const;
 
   /// \brief Add attributes to the attribute set at the given index. Because
   /// attribute sets are immutable, this returns a new set.
-  AttributeSet addAttributes(LLVMContext &C, unsigned Index,
-                             AttributeSet Attrs) const;
+  AttributeList addAttributes(LLVMContext &C, unsigned Index,
+                              AttributeList Attrs) const;
+
+  AttributeList addAttributes(LLVMContext &C, unsigned Index,
+                              AttributeSet AS) const;
+
+  AttributeList addAttributes(LLVMContext &C, unsigned Index,
+                              const AttrBuilder &B) const;
 
   /// \brief Remove the specified attribute at the specified index from this
   /// attribute list. Because attribute lists are immutable, this returns the
   /// new list.
-  AttributeSet removeAttribute(LLVMContext &C, unsigned Index,
-                               Attribute::AttrKind Kind) const;
+  AttributeList removeAttribute(LLVMContext &C, unsigned Index,
+                                Attribute::AttrKind Kind) const;
 
   /// \brief Remove the specified attribute at the specified index from this
   /// attribute list. Because attribute lists are immutable, this returns the
   /// new list.
-  AttributeSet removeAttribute(LLVMContext &C, unsigned Index,
-                               StringRef Kind) const;
+  AttributeList removeAttribute(LLVMContext &C, unsigned Index,
+                                StringRef Kind) const;
 
   /// \brief Remove the specified attributes at the specified index from this
   /// attribute list. Because attribute lists are immutable, this returns the
   /// new list.
-  AttributeSet removeAttributes(LLVMContext &C, unsigned Index,
-                                AttributeSet Attrs) const;
+  AttributeList removeAttributes(LLVMContext &C, unsigned Index,
+                                 AttributeList Attrs) const;
 
   /// \brief Remove the specified attributes at the specified index from this
   /// attribute list. Because attribute lists are immutable, this returns the
   /// new list.
-  AttributeSet removeAttributes(LLVMContext &C, unsigned Index,
-                                const AttrBuilder &Attrs) const;
+  AttributeList removeAttributes(LLVMContext &C, unsigned Index,
+                                 const AttrBuilder &Attrs) const;
+
+  /// \brief Remove all attributes at the specified index from this
+  /// attribute list. Because attribute lists are immutable, this returns the
+  /// new list.
+  AttributeList removeAttributes(LLVMContext &C, unsigned Index) const;
 
   /// \brief Add the dereferenceable attribute to the attribute set at the given
   /// index. Because attribute sets are immutable, this returns a new set.
-  AttributeSet addDereferenceableAttr(LLVMContext &C, unsigned Index,
-                                      uint64_t Bytes) const;
+  AttributeList addDereferenceableAttr(LLVMContext &C, unsigned Index,
+                                       uint64_t Bytes) const;
 
   /// \brief Add the dereferenceable_or_null attribute to the attribute set at
   /// the given index. Because attribute sets are immutable, this returns a new
   /// set.
-  AttributeSet addDereferenceableOrNullAttr(LLVMContext &C, unsigned Index,
-                                            uint64_t Bytes) const;
+  AttributeList addDereferenceableOrNullAttr(LLVMContext &C, unsigned Index,
+                                             uint64_t Bytes) const;
 
   /// Add the allocsize attribute to the attribute set at the given index.
   /// Because attribute sets are immutable, this returns a new set.
-  AttributeSet addAllocSizeAttr(LLVMContext &C, unsigned Index,
-                                unsigned ElemSizeArg,
-                                const Optional<unsigned> &NumElemsArg);
+  AttributeList addAllocSizeAttr(LLVMContext &C, unsigned Index,
+                                 unsigned ElemSizeArg,
+                                 const Optional<unsigned> &NumElemsArg);
 
   //===--------------------------------------------------------------------===//
-  // AttributeSet Accessors
+  // AttributeList Accessors
   //===--------------------------------------------------------------------===//
 
   /// \brief Retrieve the LLVM context.
   LLVMContext &getContext() const;
 
+  /// \brief The attributes for the specified index are returned.
+  AttributeSet getAttributes(unsigned Index) const;
+
   /// \brief The attributes for the specified index are returned.
   AttributeSet getParamAttributes(unsigned Index) const;
 
@@ -334,11 +438,11 @@ public:
   /// \brief Return true if attribute exists at the given index.
   bool hasAttributes(unsigned Index) const;
 
-  /// \brief Equivalent to hasAttribute(AttributeSet::FunctionIndex, Kind) but
+  /// \brief Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but
   /// may be faster.
   bool hasFnAttribute(Attribute::AttrKind Kind) const;
 
-  /// \brief Equivalent to hasAttribute(AttributeSet::FunctionIndex, Kind) but
+  /// \brief Equivalent to hasAttribute(AttributeList::FunctionIndex, Kind) but
   /// may be faster.
   bool hasFnAttribute(StringRef Kind) const;
 
@@ -380,15 +484,11 @@ public:
   iterator end(unsigned Slot) const;
 
   /// operator==/!= - Provide equality predicates.
-  bool operator==(const AttributeSet &RHS) const {
-    return pImpl == RHS.pImpl;
-  }
-  bool operator!=(const AttributeSet &RHS) const {
-    return pImpl != RHS.pImpl;
-  }
+  bool operator==(const AttributeList &RHS) const { return pImpl == RHS.pImpl; }
+  bool operator!=(const AttributeList &RHS) const { return pImpl != RHS.pImpl; }
 
   //===--------------------------------------------------------------------===//
-  // AttributeSet Introspection
+  // AttributeList Introspection
   //===--------------------------------------------------------------------===//
 
   /// \brief Return a raw pointer that uniquely identifies this attribute list.
@@ -410,30 +510,35 @@ public:
   unsigned getSlotIndex(unsigned Slot) const;
 
   /// \brief Return the attributes at the given slot.
-  AttributeSet getSlotAttributes(unsigned Slot) const;
+  AttributeList getSlotAttributes(unsigned Slot) const;
 
   void dump() const;
 };
 
 //===----------------------------------------------------------------------===//
 /// \class
-/// \brief Provide DenseMapInfo for AttributeSet.
-template<> struct DenseMapInfo<AttributeSet> {
-  static inline AttributeSet getEmptyKey() {
+/// \brief Provide DenseMapInfo for AttributeList.
+template <> struct DenseMapInfo<AttributeList> {
+  static inline AttributeList getEmptyKey() {
     uintptr_t Val = static_cast<uintptr_t>(-1);
     Val <<= PointerLikeTypeTraits<void*>::NumLowBitsAvailable;
-    return AttributeSet(reinterpret_cast<AttributeSetImpl*>(Val));
+    return AttributeList(reinterpret_cast<AttributeListImpl *>(Val));
   }
-  static inline AttributeSet getTombstoneKey() {
+
+  static inline AttributeList getTombstoneKey() {
     uintptr_t Val = static_cast<uintptr_t>(-2);
     Val <<= PointerLikeTypeTraits<void*>::NumLowBitsAvailable;
-    return AttributeSet(reinterpret_cast<AttributeSetImpl*>(Val));
+    return AttributeList(reinterpret_cast<AttributeListImpl *>(Val));
   }
-  static unsigned getHashValue(AttributeSet AS) {
+
+  static unsigned getHashValue(AttributeList AS) {
     return (unsigned((uintptr_t)AS.pImpl) >> 4) ^
            (unsigned((uintptr_t)AS.pImpl) >> 9);
   }
-  static bool isEqual(AttributeSet LHS, AttributeSet RHS) { return LHS == RHS; }
+
+  static bool isEqual(AttributeList LHS, AttributeList RHS) {
+    return LHS == RHS;
+  }
 };
 
 //===----------------------------------------------------------------------===//
@@ -445,22 +550,19 @@ template<> struct DenseMapInfo<AttributeSet> {
 class AttrBuilder {
   std::bitset<Attribute::EndAttrKinds> Attrs;
   std::map<std::string, std::string> TargetDepAttrs;
-  uint64_t Alignment;
-  uint64_t StackAlignment;
-  uint64_t DerefBytes;
-  uint64_t DerefOrNullBytes;
-  uint64_t AllocSizeArgs;
+  uint64_t Alignment = 0;
+  uint64_t StackAlignment = 0;
+  uint64_t DerefBytes = 0;
+  uint64_t DerefOrNullBytes = 0;
+  uint64_t AllocSizeArgs = 0;
 
 public:
-  AttrBuilder()
-      : Attrs(0), Alignment(0), StackAlignment(0), DerefBytes(0),
-        DerefOrNullBytes(0), AllocSizeArgs(0) {}
-  AttrBuilder(const Attribute &A)
-      : Attrs(0), Alignment(0), StackAlignment(0), DerefBytes(0),
-        DerefOrNullBytes(0), AllocSizeArgs(0) {
+  AttrBuilder() = default;
+  AttrBuilder(const Attribute &A) {
     addAttribute(A);
   }
-  AttrBuilder(AttributeSet AS, unsigned Idx);
+  AttrBuilder(AttributeList AS, unsigned Idx);
+  AttrBuilder(AttributeSet AS);
 
   void clear();
 
@@ -477,7 +579,7 @@ public:
   AttrBuilder &removeAttribute(Attribute::AttrKind Val);
 
   /// \brief Remove the attributes from the builder.
-  AttrBuilder &removeAttributes(AttributeSet A, uint64_t Index);
+  AttrBuilder &removeAttributes(AttributeList A, uint64_t WithoutIndex);
 
   /// \brief Remove the target-dependent attribute to the builder.
   AttrBuilder &removeAttribute(StringRef A);
@@ -507,7 +609,7 @@ public:
 
   /// \brief Return true if the builder has any attribute that's in the
   /// specified attribute.
-  bool hasAttributes(AttributeSet A, uint64_t Index) const;
+  bool hasAttributes(AttributeList A, uint64_t Index) const;
 
   /// \brief Return true if the builder has an alignment attribute.
   bool hasAlignmentAttr() const;
@@ -562,8 +664,8 @@ public:
   typedef std::pair<std::string, std::string>                td_type;
   typedef std::map<std::string, std::string>::iterator       td_iterator;
   typedef std::map<std::string, std::string>::const_iterator td_const_iterator;
-  typedef llvm::iterator_range<td_iterator>                  td_range;
-  typedef llvm::iterator_range<td_const_iterator>            td_const_range;
+  typedef iterator_range<td_iterator>                        td_range;
+  typedef iterator_range<td_const_iterator>                  td_const_range;
 
   td_iterator td_begin()             { return TargetDepAttrs.begin(); }
   td_iterator td_end()               { return TargetDepAttrs.end(); }
@@ -600,4 +702,4 @@ void mergeAttributesForInlining(Function &Caller, const Function &Callee);
 
 } // end llvm namespace
 
-#endif
+#endif // LLVM_IR_ATTRIBUTES_H
diff --git a/include/llvm/IR/BasicBlock.h b/include/llvm/IR/BasicBlock.h
index 41dc9a0ae18d023ce98afc5d22f6d4d7345edadd..bd210e1abf31dd476c3b85d3a67a415424df9555 100644
--- a/include/llvm/IR/BasicBlock.h
+++ b/include/llvm/IR/BasicBlock.h
@@ -104,32 +104,36 @@ public:
   /// or nullptr it the function does not have a module.
   ///
   /// Note: this is undefined behavior if the block does not have a parent.
-  Module *getModule();
-  const Module *getModule() const {
-    return const_cast<BasicBlock *>(this)->getModule();
+  const Module *getModule() const;
+  Module *getModule() {
+    return const_cast<Module *>(
+                            static_cast<const BasicBlock *>(this)->getModule());
   }
 
   /// \brief Returns the terminator instruction if the block is well formed or
   /// null if the block is not well formed.
-  TerminatorInst *getTerminator() LLVM_READONLY;
-  const TerminatorInst *getTerminator() const {
-    return const_cast<BasicBlock *>(this)->getTerminator();
+  const TerminatorInst *getTerminator() const LLVM_READONLY;
+  TerminatorInst *getTerminator() {
+    return const_cast<TerminatorInst *>(
+                        static_cast<const BasicBlock *>(this)->getTerminator());
   }
 
   /// \brief Returns the call instruction calling @llvm.experimental.deoptimize
   /// prior to the terminating return instruction of this basic block, if such a
   /// call is present.  Otherwise, returns null.
-  CallInst *getTerminatingDeoptimizeCall();
-  const CallInst *getTerminatingDeoptimizeCall() const {
-    return const_cast<BasicBlock *>(this)->getTerminatingDeoptimizeCall();
+  const CallInst *getTerminatingDeoptimizeCall() const;
+  CallInst *getTerminatingDeoptimizeCall() {
+    return const_cast<CallInst *>(
+         static_cast<const BasicBlock *>(this)->getTerminatingDeoptimizeCall());
   }
 
   /// \brief Returns the call instruction marked 'musttail' prior to the
   /// terminating return instruction of this basic block, if such a call is
   /// present.  Otherwise, returns null.
-  CallInst *getTerminatingMustTailCall();
-  const CallInst *getTerminatingMustTailCall() const {
-    return const_cast<BasicBlock *>(this)->getTerminatingMustTailCall();
+  const CallInst *getTerminatingMustTailCall() const;
+  CallInst *getTerminatingMustTailCall() {
+    return const_cast<CallInst *>(
+           static_cast<const BasicBlock *>(this)->getTerminatingMustTailCall());
   }
 
   /// \brief Returns a pointer to the first instruction in this block that is
@@ -138,32 +142,36 @@ public:
   /// When adding instructions to the beginning of the basic block, they should
   /// be added before the returned value, not before the first instruction,
   /// which might be PHI. Returns 0 is there's no non-PHI instruction.
-  Instruction* getFirstNonPHI();
-  const Instruction* getFirstNonPHI() const {
-    return const_cast<BasicBlock*>(this)->getFirstNonPHI();
+  const Instruction* getFirstNonPHI() const;
+  Instruction* getFirstNonPHI() {
+    return const_cast<Instruction *>(
+                       static_cast<const BasicBlock *>(this)->getFirstNonPHI());
   }
 
   /// \brief Returns a pointer to the first instruction in this block that is not
   /// a PHINode or a debug intrinsic.
-  Instruction* getFirstNonPHIOrDbg();
-  const Instruction* getFirstNonPHIOrDbg() const {
-    return const_cast<BasicBlock*>(this)->getFirstNonPHIOrDbg();
+  const Instruction* getFirstNonPHIOrDbg() const;
+  Instruction* getFirstNonPHIOrDbg() {
+    return const_cast<Instruction *>(
+                  static_cast<const BasicBlock *>(this)->getFirstNonPHIOrDbg());
   }
 
   /// \brief Returns a pointer to the first instruction in this block that is not
   /// a PHINode, a debug intrinsic, or a lifetime intrinsic.
-  Instruction* getFirstNonPHIOrDbgOrLifetime();
-  const Instruction* getFirstNonPHIOrDbgOrLifetime() const {
-    return const_cast<BasicBlock*>(this)->getFirstNonPHIOrDbgOrLifetime();
+  const Instruction* getFirstNonPHIOrDbgOrLifetime() const;
+  Instruction* getFirstNonPHIOrDbgOrLifetime() {
+    return const_cast<Instruction *>(
+        static_cast<const BasicBlock *>(this)->getFirstNonPHIOrDbgOrLifetime());
   }
 
   /// \brief Returns an iterator to the first instruction in this block that is
   /// suitable for inserting a non-PHI instruction.
   ///
   /// In particular, it skips all PHIs and LandingPad instructions.
-  iterator getFirstInsertionPt();
-  const_iterator getFirstInsertionPt() const {
-    return const_cast<BasicBlock*>(this)->getFirstInsertionPt();
+  const_iterator getFirstInsertionPt() const;
+  iterator getFirstInsertionPt() {
+    return static_cast<const BasicBlock *>(this)
+                                          ->getFirstInsertionPt().getNonConst();
   }
 
   /// \brief Unlink 'this' from the containing function, but do not delete it.
@@ -192,9 +200,10 @@ public:
 
   /// \brief Return the predecessor of this block if it has a single predecessor
   /// block. Otherwise return a null pointer.
-  BasicBlock *getSinglePredecessor();
-  const BasicBlock *getSinglePredecessor() const {
-    return const_cast<BasicBlock*>(this)->getSinglePredecessor();
+  const BasicBlock *getSinglePredecessor() const;
+  BasicBlock *getSinglePredecessor() {
+    return const_cast<BasicBlock *>(
+                 static_cast<const BasicBlock *>(this)->getSinglePredecessor());
   }
 
   /// \brief Return the predecessor of this block if it has a unique predecessor
@@ -203,27 +212,30 @@ public:
   /// Note that unique predecessor doesn't mean single edge, there can be
   /// multiple edges from the unique predecessor to this block (for example a
   /// switch statement with multiple cases having the same destination).
-  BasicBlock *getUniquePredecessor();
-  const BasicBlock *getUniquePredecessor() const {
-    return const_cast<BasicBlock*>(this)->getUniquePredecessor();
+  const BasicBlock *getUniquePredecessor() const;
+  BasicBlock *getUniquePredecessor() {
+    return const_cast<BasicBlock *>(
+                 static_cast<const BasicBlock *>(this)->getUniquePredecessor());
   }
 
   /// \brief Return the successor of this block if it has a single successor.
   /// Otherwise return a null pointer.
   ///
   /// This method is analogous to getSinglePredecessor above.
-  BasicBlock *getSingleSuccessor();
-  const BasicBlock *getSingleSuccessor() const {
-    return const_cast<BasicBlock*>(this)->getSingleSuccessor();
+  const BasicBlock *getSingleSuccessor() const;
+  BasicBlock *getSingleSuccessor() {
+    return const_cast<BasicBlock *>(
+                   static_cast<const BasicBlock *>(this)->getSingleSuccessor());
   }
 
   /// \brief Return the successor of this block if it has a unique successor.
   /// Otherwise return a null pointer.
   ///
   /// This method is analogous to getUniquePredecessor above.
-  BasicBlock *getUniqueSuccessor();
-  const BasicBlock *getUniqueSuccessor() const {
-    return const_cast<BasicBlock*>(this)->getUniqueSuccessor();
+  const BasicBlock *getUniqueSuccessor() const;
+  BasicBlock *getUniqueSuccessor() {
+    return const_cast<BasicBlock *>(
+                   static_cast<const BasicBlock *>(this)->getUniqueSuccessor());
   }
 
   //===--------------------------------------------------------------------===//
@@ -325,8 +337,11 @@ public:
   bool isLandingPad() const;
 
   /// \brief Return the landingpad instruction associated with the landing pad.
-  LandingPadInst *getLandingPadInst();
   const LandingPadInst *getLandingPadInst() const;
+  LandingPadInst *getLandingPadInst() {
+    return const_cast<LandingPadInst *>(
+                    static_cast<const BasicBlock *>(this)->getLandingPadInst());
+  }
 
 private:
   /// \brief Increment the internal refcount of the number of BlockAddresses
diff --git a/include/llvm/IR/CallSite.h b/include/llvm/IR/CallSite.h
index b02c89474146fe049e7a19227315edf9dcada94e..6a465709cc51d62877c6b33a60658bae2685419f 100644
--- a/include/llvm/IR/CallSite.h
+++ b/include/llvm/IR/CallSite.h
@@ -65,11 +65,9 @@ protected:
   explicit CallSiteBase(ValTy *II) { *this = get(II); }
 
 private:
-  /// CallSiteBase::get - This static method is sort of like a constructor.  It
-  /// will create an appropriate call site for a Call or Invoke instruction, but
-  /// it can also create a null initialized CallSiteBase object for something
-  /// which is NOT a call site.
-  ///
+  /// This static method is like a constructor. It will create an appropriate
+  /// call site for a Call or Invoke instruction, but it can also create a null
+  /// initialized CallSiteBase object for something which is NOT a call site.
   static CallSiteBase get(ValTy *V) {
     if (InstrTy *II = dyn_cast<InstrTy>(V)) {
       if (II->getOpcode() == Instruction::Call)
@@ -81,38 +79,47 @@ private:
   }
 
 public:
-  /// isCall - true if a CallInst is enclosed.
-  /// Note that !isCall() does not mean it is an InvokeInst enclosed,
-  /// it also could signify a NULL Instruction pointer.
+  /// Return true if a CallInst is enclosed. Note that !isCall() does not mean
+  /// an InvokeInst is enclosed. It may also signify a NULL instruction pointer.
   bool isCall() const { return I.getInt(); }
 
-  /// isInvoke - true if a InvokeInst is enclosed.
-  ///
+  /// Return true if an InvokeInst is enclosed.
   bool isInvoke() const { return getInstruction() && !I.getInt(); }
 
   InstrTy *getInstruction() const { return I.getPointer(); }
   InstrTy *operator->() const { return I.getPointer(); }
   explicit operator bool() const { return I.getPointer(); }
 
-  /// Get the basic block containing the call site
+  /// Get the basic block containing the call site.
   BBTy* getParent() const { return getInstruction()->getParent(); }
 
-  /// getCalledValue - Return the pointer to function that is being called.
-  ///
+  /// Return the pointer to the function that is being called.
   ValTy *getCalledValue() const {
     assert(getInstruction() && "Not a call or invoke instruction!");
     return *getCallee();
   }
 
-  /// getCalledFunction - Return the function being called if this is a direct
-  /// call, otherwise return null (if it's an indirect call).
-  ///
+  /// Return the function being called if this is a direct call, otherwise
+  /// return null (if it's an indirect call).
   FunTy *getCalledFunction() const {
     return dyn_cast<FunTy>(getCalledValue());
   }
 
-  /// setCalledFunction - Set the callee to the specified value.
-  ///
+  /// Return true if the callsite is an indirect call.
+  bool isIndirectCall() const {
+    const Value *V = getCalledValue(); // const: ValTy may be const-qualified.
+    if (!V)
+      return false;
+    if (isa<FunTy>(V) || isa<Constant>(V)) // Direct call or constant callee.
+      return false;
+    if (const CallInst *CI = dyn_cast<CallInst>(getInstruction())) {
+      if (CI->isInlineAsm()) // Inline asm is not counted as an indirect call.
+        return false;
+    }
+    return true;
+  }
+
+  /// Set the callee to the specified value.
   void setCalledFunction(Value *V) {
     assert(getInstruction() && "Not a call or invoke instruction!");
     *getCallee() = V;
@@ -129,8 +136,7 @@ public:
     return static_cast<Intrinsic::ID>(0);
   }
 
-  /// isCallee - Determine whether the passed iterator points to the
-  /// callee operand's Use.
+  /// Determine whether the passed iterator points to the callee operand's Use.
   bool isCallee(Value::const_user_iterator UI) const {
     return isCallee(&UI.getUse());
   }
@@ -138,24 +144,23 @@ public:
   /// Determine whether this Use is the callee operand's Use.
   bool isCallee(const Use *U) const { return getCallee() == U; }
 
-  /// \brief Determine whether the passed iterator points to an argument
-  /// operand.
+  /// Determine whether the passed iterator points to an argument operand.
   bool isArgOperand(Value::const_user_iterator UI) const {
     return isArgOperand(&UI.getUse());
   }
 
-  /// \brief Determine whether the passed use points to an argument operand.
+  /// Determine whether the passed use points to an argument operand.
   bool isArgOperand(const Use *U) const {
     assert(getInstruction() == U->getUser());
     return arg_begin() <= U && U < arg_end();
   }
 
-  /// \brief Determine whether the passed iterator points to a bundle operand.
+  /// Determine whether the passed iterator points to a bundle operand.
   bool isBundleOperand(Value::const_user_iterator UI) const {
     return isBundleOperand(&UI.getUse());
   }
 
-  /// \brief Determine whether the passed use points to a bundle operand.
+  /// Determine whether the passed use points to a bundle operand.
   bool isBundleOperand(const Use *U) const {
     assert(getInstruction() == U->getUser());
     if (!hasOperandBundles())
@@ -165,12 +170,12 @@ public:
            OperandNo < getBundleOperandsEndIndex();
   }
 
-  /// \brief Determine whether the passed iterator points to a data operand.
+  /// Determine whether the passed iterator points to a data operand.
   bool isDataOperand(Value::const_user_iterator UI) const {
     return isDataOperand(&UI.getUse());
   }
 
-  /// \brief Determine whether the passed use points to a data operand.
+  /// Determine whether the passed use points to a data operand.
   bool isDataOperand(const Use *U) const {
     return data_operands_begin() <= U && U < data_operands_end();
   }
@@ -200,8 +205,8 @@ public:
     return U - arg_begin();
   }
 
-  /// arg_iterator - The type of iterator to use when looping over actual
-  /// arguments at this call site.
+  /// The type of iterator to use when looping over actual arguments at this
+  /// call site.
   typedef IterTy arg_iterator;
 
   iterator_range<IterTy> args() const {
@@ -210,8 +215,7 @@ public:
   bool arg_empty() const { return arg_end() == arg_begin(); }
   unsigned arg_size() const { return unsigned(arg_end() - arg_begin()); }
 
-  /// Given a value use iterator, returns the data operand that corresponds to
-  /// it.
+  /// Given a value use iterator, return the data operand corresponding to it.
   /// Iterator must actually correspond to a data operand.
   unsigned getDataOperandNo(Value::const_user_iterator UI) const {
     return getDataOperandNo(&UI.getUse());
@@ -253,21 +257,19 @@ public:
     return std::distance(data_operands_begin(), data_operands_end());
   }
 
-  /// getType - Return the type of the instruction that generated this call site
-  ///
+  /// Return the type of the instruction that generated this call site.
   Type *getType() const { return (*this)->getType(); }
 
-  /// getCaller - Return the caller function for this call site
-  ///
+  /// Return the caller function for this call site.
   FunTy *getCaller() const { return (*this)->getParent()->getParent(); }
 
-  /// \brief Tests if this call site must be tail call optimized.  Only a
-  /// CallInst can be tail call optimized.
+  /// Tests if this call site must be tail call optimized. Only a CallInst can
+  /// be tail call optimized.
   bool isMustTailCall() const {
     return isCall() && cast<CallInst>(getInstruction())->isMustTailCall();
   }
 
-  /// \brief Tests if this call site is marked as a tail call.
+  /// Tests if this call site is marked as a tail call.
   bool isTailCall() const {
     return isCall() && cast<CallInst>(getInstruction())->isTailCall();
   }
@@ -303,11 +305,11 @@ public:
     return false;
   }
 
-  /// getCallingConv/setCallingConv - get or set the calling convention of the
-  /// call.
+  /// Get the calling convention of the call.
   CallingConv::ID getCallingConv() const {
     CALLSITE_DELEGATE_GETTER(getCallingConv());
   }
+  /// Set the calling convention of the call.
   void setCallingConv(CallingConv::ID CC) {
     CALLSITE_DELEGATE_SETTER(setCallingConv(CC));
   }
@@ -320,12 +322,12 @@ public:
     CALLSITE_DELEGATE_SETTER(mutateFunctionType(Ty));
   }
 
-  /// getAttributes/setAttributes - get or set the parameter attributes of
-  /// the call.
-  AttributeSet getAttributes() const {
+  /// Get the parameter attributes of the call.
+  AttributeList getAttributes() const {
     CALLSITE_DELEGATE_GETTER(getAttributes());
   }
-  void setAttributes(AttributeSet PAL) {
+  /// Set the parameter attributes of the call.
+  void setAttributes(AttributeList PAL) {
     CALLSITE_DELEGATE_SETTER(setAttributes(PAL));
   }
 
@@ -345,17 +347,17 @@ public:
     CALLSITE_DELEGATE_SETTER(removeAttribute(i, Kind));
   }
 
-  /// \brief Return true if this function has the given attribute.
+  /// Return true if this function has the given attribute.
   bool hasFnAttr(Attribute::AttrKind Kind) const {
     CALLSITE_DELEGATE_GETTER(hasFnAttr(Kind));
   }
 
-  /// \brief Return true if this function has the given attribute.
+  /// Return true if this function has the given attribute.
   bool hasFnAttr(StringRef Kind) const {
     CALLSITE_DELEGATE_GETTER(hasFnAttr(Kind));
   }
 
-  /// \brief Return true if the call or the callee has the given attribute.
+  /// Return true if the call or the callee has the given attribute.
   bool paramHasAttr(unsigned i, Attribute::AttrKind Kind) const {
     CALLSITE_DELEGATE_GETTER(paramHasAttr(i, Kind));
   }
@@ -368,8 +370,8 @@ public:
     CALLSITE_DELEGATE_GETTER(getAttribute(i, Kind));
   }
 
-  /// \brief Return true if the data operand at index \p i directly or
-  /// indirectly has the attribute \p A.
+  /// Return true if the data operand at index \p i directly or indirectly has
+  /// the attribute \p A.
   ///
   /// Normal call or invoke arguments have per operand attributes, as specified
   /// in the attribute set attached to this instruction, while operand bundle
@@ -379,37 +381,36 @@ public:
     CALLSITE_DELEGATE_GETTER(dataOperandHasImpliedAttr(i, Kind));
   }
 
-  /// @brief Extract the alignment for a call or parameter (0=unknown).
+  /// Extract the alignment for a call or parameter (0=unknown).
   uint16_t getParamAlignment(uint16_t i) const {
     CALLSITE_DELEGATE_GETTER(getParamAlignment(i));
   }
 
-  /// @brief Extract the number of dereferenceable bytes for a call or
-  /// parameter (0=unknown).
+  /// Extract the number of dereferenceable bytes for a call or parameter
+  /// (0=unknown).
   uint64_t getDereferenceableBytes(uint16_t i) const {
     CALLSITE_DELEGATE_GETTER(getDereferenceableBytes(i));
   }
 
-  /// @brief Extract the number of dereferenceable_or_null bytes for a call or
+  /// Extract the number of dereferenceable_or_null bytes for a call or
   /// parameter (0=unknown).
   uint64_t getDereferenceableOrNullBytes(uint16_t i) const {
     CALLSITE_DELEGATE_GETTER(getDereferenceableOrNullBytes(i));
   }
 
-  /// @brief Determine if the parameter or return value is marked with NoAlias
+  /// Determine if the parameter or return value is marked with NoAlias
   /// attribute.
   /// @param n The parameter to check. 1 is the first parameter, 0 is the return
   bool doesNotAlias(unsigned n) const {
     CALLSITE_DELEGATE_GETTER(doesNotAlias(n));
   }
 
-  /// \brief Return true if the call should not be treated as a call to a
-  /// builtin.
+  /// Return true if the call should not be treated as a call to a builtin.
   bool isNoBuiltin() const {
     CALLSITE_DELEGATE_GETTER(isNoBuiltin());
   }
 
-  /// @brief Return true if the call should not be inlined.
+  /// Return true if the call should not be inlined.
   bool isNoInline() const {
     CALLSITE_DELEGATE_GETTER(isNoInline());
   }
@@ -417,7 +418,7 @@ public:
     CALLSITE_DELEGATE_SETTER(setIsNoInline(Value));
   }
 
-  /// @brief Determine if the call does not access memory.
+  /// Determine if the call does not access memory.
   bool doesNotAccessMemory() const {
     CALLSITE_DELEGATE_GETTER(doesNotAccessMemory());
   }
@@ -425,7 +426,7 @@ public:
     CALLSITE_DELEGATE_SETTER(setDoesNotAccessMemory());
   }
 
-  /// @brief Determine if the call does not access or only reads memory.
+  /// Determine if the call does not access or only reads memory.
   bool onlyReadsMemory() const {
     CALLSITE_DELEGATE_GETTER(onlyReadsMemory());
   }
@@ -433,7 +434,7 @@ public:
     CALLSITE_DELEGATE_SETTER(setOnlyReadsMemory());
   }
 
-  /// @brief Determine if the call does not access or only writes memory.
+  /// Determine if the call does not access or only writes memory.
   bool doesNotReadMemory() const {
     CALLSITE_DELEGATE_GETTER(doesNotReadMemory());
   }
@@ -441,7 +442,7 @@ public:
     CALLSITE_DELEGATE_SETTER(setDoesNotReadMemory());
   }
 
-  /// @brief Determine if the call can access memmory only using pointers based
+  /// Determine if the call can access memory only using pointers based
   /// on its arguments.
   bool onlyAccessesArgMemory() const {
     CALLSITE_DELEGATE_GETTER(onlyAccessesArgMemory());
@@ -450,7 +451,7 @@ public:
     CALLSITE_DELEGATE_SETTER(setOnlyAccessesArgMemory());
   }
 
-  /// @brief Determine if the call cannot return.
+  /// Determine if the call cannot return.
   bool doesNotReturn() const {
     CALLSITE_DELEGATE_GETTER(doesNotReturn());
   }
@@ -458,7 +459,7 @@ public:
     CALLSITE_DELEGATE_SETTER(setDoesNotReturn());
   }
 
-  /// @brief Determine if the call cannot unwind.
+  /// Determine if the call cannot unwind.
   bool doesNotThrow() const {
     CALLSITE_DELEGATE_GETTER(doesNotThrow());
   }
@@ -466,7 +467,7 @@ public:
     CALLSITE_DELEGATE_SETTER(setDoesNotThrow());
   }
 
-  /// @brief Determine if the call can be duplicated.
+  /// Determine if the call cannot be duplicated.
   bool cannotDuplicate() const {
     CALLSITE_DELEGATE_GETTER(cannotDuplicate());
   }
@@ -474,7 +475,7 @@ public:
     CALLSITE_DELEGATE_GETTER(setCannotDuplicate());
   }
 
-  /// @brief Determine if the call is convergent.
+  /// Determine if the call is convergent.
   bool isConvergent() const {
     CALLSITE_DELEGATE_GETTER(isConvergent());
   }
@@ -546,29 +547,29 @@ public:
       cast<InvokeInst>(II)->getOperandBundlesAsDefs(Defs);
   }
 
-  /// @brief Determine whether this data operand is not captured.
+  /// Determine whether this data operand is not captured.
   bool doesNotCapture(unsigned OpNo) const {
     return dataOperandHasImpliedAttr(OpNo + 1, Attribute::NoCapture);
   }
 
-  /// @brief Determine whether this argument is passed by value.
+  /// Determine whether this argument is passed by value.
   bool isByValArgument(unsigned ArgNo) const {
     return paramHasAttr(ArgNo + 1, Attribute::ByVal);
   }
 
-  /// @brief Determine whether this argument is passed in an alloca.
+  /// Determine whether this argument is passed in an alloca.
   bool isInAllocaArgument(unsigned ArgNo) const {
     return paramHasAttr(ArgNo + 1, Attribute::InAlloca);
   }
 
-  /// @brief Determine whether this argument is passed by value or in an alloca.
+  /// Determine whether this argument is passed by value or in an alloca.
   bool isByValOrInAllocaArgument(unsigned ArgNo) const {
     return paramHasAttr(ArgNo + 1, Attribute::ByVal) ||
            paramHasAttr(ArgNo + 1, Attribute::InAlloca);
   }
 
-  /// @brief Determine if there are is an inalloca argument.  Only the last
-  /// argument can have the inalloca attribute.
+  /// Determine if there is an inalloca argument. Only the last argument can
+  /// have the inalloca attribute.
   bool hasInAllocaArgument() const {
     return paramHasAttr(arg_size(), Attribute::InAlloca);
   }
@@ -582,7 +583,12 @@ public:
            dataOperandHasImpliedAttr(OpNo + 1, Attribute::ReadNone);
   }
 
-  /// @brief Return true if the return value is known to be not null.
+  bool doesNotReadMemory(unsigned OpNo) const {
+    return dataOperandHasImpliedAttr(OpNo + 1, Attribute::WriteOnly) ||
+           dataOperandHasImpliedAttr(OpNo + 1, Attribute::ReadNone);
+  }
+
+  /// Return true if the return value is known to be not null.
   /// This may be because it has the nonnull attribute, or because at least
   /// one byte is dereferenceable and the pointer is in addrspace(0).
   bool isReturnNonNull() const {
@@ -595,8 +601,8 @@ public:
     return false;
   }
 
-  /// hasArgument - Returns true if this CallSite passes the given Value* as an
-  /// argument to the called function.
+  /// Returns true if this CallSite passes the given Value* as an argument to
+  /// the called function.
   bool hasArgument(const Value *Arg) const {
     for (arg_iterator AI = this->arg_begin(), E = this->arg_end(); AI != E;
          ++AI)
@@ -661,7 +667,7 @@ template <> struct DenseMapInfo<CallSite> {
   }
 };
 
-/// ImmutableCallSite - establish a view to a call site for examination
+/// Establish a view to a call site for examination.
 class ImmutableCallSite : public CallSiteBase<> {
 public:
   ImmutableCallSite() = default;
diff --git a/include/llvm/IR/Comdat.h b/include/llvm/IR/Comdat.h
index f4a391c31ae2248ef265131f6fa0ae3ef6dd6665..fa87093ca50ac44bb9357db48c41f666805a9c5a 100644
--- a/include/llvm/IR/Comdat.h
+++ b/include/llvm/IR/Comdat.h
@@ -1,4 +1,4 @@
-//===-- llvm/IR/Comdat.h - Comdat definitions -------------------*- C++ -*-===//
+//===- llvm/IR/Comdat.h - Comdat definitions --------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -51,8 +51,8 @@ private:
   Comdat();
 
   // Points to the map in Module.
-  StringMapEntry<Comdat> *Name;
-  SelectionKind SK;
+  StringMapEntry<Comdat> *Name = nullptr;
+  SelectionKind SK = Any;
 };
 
 inline raw_ostream &operator<<(raw_ostream &OS, const Comdat &C) {
diff --git a/include/llvm/IR/Constant.h b/include/llvm/IR/Constant.h
index 99c970ebb63358942d7ba497cb39f8aa0232c702..3b3694e7e60d0c2ba96f5b203c77abf5de1d88a7 100644
--- a/include/llvm/IR/Constant.h
+++ b/include/llvm/IR/Constant.h
@@ -152,12 +152,13 @@ public:
   /// hanging off of the globals.
   void removeDeadConstantUsers() const;
 
-  Constant *stripPointerCasts() {
+  const Constant *stripPointerCasts() const {
     return cast<Constant>(Value::stripPointerCasts());
   }
 
-  const Constant *stripPointerCasts() const {
-    return const_cast<Constant*>(this)->stripPointerCasts();
+  Constant *stripPointerCasts() {
+    return const_cast<Constant*>(
+                      static_cast<const Constant *>(this)->stripPointerCasts());
   }
 };
 
diff --git a/include/llvm/IR/ConstantRange.h b/include/llvm/IR/ConstantRange.h
index 27a9b136444896467a2d72a7112b5a6e0d137cab..17c39a6ef9b564479b26983fdd0d4de623c201be 100644
--- a/include/llvm/IR/ConstantRange.h
+++ b/include/llvm/IR/ConstantRange.h
@@ -184,6 +184,10 @@ public:
   ///
   APInt getSetSize() const;
 
+  /// Compare set size of this range with the range CR.
+  ///
+  bool isSizeStrictlySmallerThanOf(const ConstantRange &CR) const;
+
   /// Return the largest unsigned value contained in the ConstantRange.
   ///
   APInt getUnsignedMax() const;
diff --git a/include/llvm/IR/DIBuilder.h b/include/llvm/IR/DIBuilder.h
index 48cb7fe5df6f3019316bd38dac42145fa4f932d9..69bd5c847a8d06e051ac840fc03201bfdd9b0d42 100644
--- a/include/llvm/IR/DIBuilder.h
+++ b/include/llvm/IR/DIBuilder.h
@@ -105,13 +105,17 @@ namespace llvm {
     ///                      out into.
     /// \param Kind          The kind of debug information to generate.
     /// \param DWOId         The DWOId if this is a split skeleton compile unit.
+    /// \param SplitDebugInlining    Whether to emit inline debug info.
+    /// \param DebugInfoForProfiling Whether to emit extra debug info for
+    ///                              profile collection.
     DICompileUnit *
     createCompileUnit(unsigned Lang, DIFile *File, StringRef Producer,
                       bool isOptimized, StringRef Flags, unsigned RV,
                       StringRef SplitName = StringRef(),
                       DICompileUnit::DebugEmissionKind Kind =
                           DICompileUnit::DebugEmissionKind::FullDebug,
-                      uint64_t DWOId = 0, bool SplitDebugInlining = true);
+                      uint64_t DWOId = 0, bool SplitDebugInlining = true,
+                      bool DebugInfoForProfiling = false);
 
     /// Create a file descriptor to hold debugging information for a file.
     /// \param Filename  File name.
@@ -164,12 +168,15 @@ namespace llvm {
     DIDerivedType *createQualifiedType(unsigned Tag, DIType *FromTy);
 
     /// Create debugging information entry for a pointer.
-    /// \param PointeeTy   Type pointed by this pointer.
-    /// \param SizeInBits  Size.
-    /// \param AlignInBits Alignment. (optional)
-    /// \param Name        Pointer type name. (optional)
+    /// \param PointeeTy         Type pointed by this pointer.
+    /// \param SizeInBits        Size.
+    /// \param AlignInBits       Alignment. (optional)
+    /// \param DWARFAddressSpace DWARF address space. (optional)
+    /// \param Name              Pointer type name. (optional)
     DIDerivedType *createPointerType(DIType *PointeeTy, uint64_t SizeInBits,
                                      uint32_t AlignInBits = 0,
+                                     Optional<unsigned> DWARFAddressSpace =
+                                         None,
                                      StringRef Name = "");
 
     /// Create debugging information entry for a pointer to member.
@@ -186,7 +193,9 @@ namespace llvm {
     /// style reference or rvalue reference type.
     DIDerivedType *createReferenceType(unsigned Tag, DIType *RTy,
                                        uint64_t SizeInBits = 0,
-                                       uint32_t AlignInBits = 0);
+                                       uint32_t AlignInBits = 0,
+                                       Optional<unsigned> DWARFAddressSpace =
+                                           None);
 
     /// Create debugging information entry for a typedef.
     /// \param Ty          Original type.
@@ -431,13 +440,6 @@ namespace llvm {
                          DINode::DIFlags Flags = DINode::FlagZero,
                          unsigned CC = 0);
 
-    /// Create an external type reference.
-    /// \param Tag              Dwarf TAG.
-    /// \param File             File in which the type is defined.
-    /// \param UniqueIdentifier A unique identifier for the type.
-    DICompositeType *createExternalTypeRef(unsigned Tag, DIFile *File,
-                                           StringRef UniqueIdentifier);
-
     /// Create a new DIType* with "artificial" flag set.
     DIType *createArtificialType(DIType *Ty);
 
diff --git a/include/llvm/IR/DataLayout.h b/include/llvm/IR/DataLayout.h
index 6f37669f976897d672391e626f328d18fe0e4d9c..1930d48577d4fc9ece1bfabc1d2dda0e7264b78b 100644
--- a/include/llvm/IR/DataLayout.h
+++ b/include/llvm/IR/DataLayout.h
@@ -104,6 +104,7 @@ private:
   /// Defaults to false.
   bool BigEndian;
 
+  unsigned AllocaAddrSpace;
   unsigned StackNaturalAlign;
 
   enum ManglingModeT {
@@ -118,8 +119,19 @@ private:
 
   SmallVector<unsigned char, 8> LegalIntWidths;
 
-  /// \brief Primitive type alignment data.
-  SmallVector<LayoutAlignElem, 16> Alignments;
+  /// \brief Primitive type alignment data. This is sorted by type and bit
+  /// width during construction.
+  typedef SmallVector<LayoutAlignElem, 16> AlignmentsTy;
+  AlignmentsTy Alignments;
+
+  AlignmentsTy::const_iterator
+  findAlignmentLowerBound(AlignTypeEnum AlignType, uint32_t BitWidth) const {
+    return const_cast<DataLayout *>(this)->findAlignmentLowerBound(AlignType,
+                                                                   BitWidth);
+  }
+
+  AlignmentsTy::iterator
+  findAlignmentLowerBound(AlignTypeEnum AlignType, uint32_t BitWidth);
 
   /// \brief The string representation used to create this DataLayout
   std::string StringRepresentation;
@@ -134,14 +146,6 @@ private:
 
   PointersTy::iterator findPointerLowerBound(uint32_t AddressSpace);
 
-  /// This member is a signal that a requested alignment type and bit width were
-  /// not found in the SmallVector.
-  static const LayoutAlignElem InvalidAlignmentElem;
-
-  /// This member is a signal that a requested pointer type and bit width were
-  /// not found in the DenseSet.
-  static const PointerAlignElem InvalidPointerElem;
-
   // The StructType -> StructLayout map.
   mutable void *LayoutMap;
 
@@ -159,22 +163,6 @@ private:
   /// Internal helper method that returns requested alignment for type.
   unsigned getAlignment(Type *Ty, bool abi_or_pref) const;
 
-  /// \brief Valid alignment predicate.
-  ///
-  /// Predicate that tests a LayoutAlignElem reference returned by get() against
-  /// InvalidAlignmentElem.
-  bool validAlignment(const LayoutAlignElem &align) const {
-    return &align != &InvalidAlignmentElem;
-  }
-
-  /// \brief Valid pointer predicate.
-  ///
-  /// Predicate that tests a PointerAlignElem reference returned by get()
-  /// against \c InvalidPointerElem.
-  bool validPointer(const PointerAlignElem &align) const {
-    return &align != &InvalidPointerElem;
-  }
-
   /// Parses a target data specification string. Assert if the string is
   /// malformed.
   void parseSpecifier(StringRef LayoutDescription);
@@ -199,6 +187,7 @@ public:
     clear();
     StringRepresentation = DL.StringRepresentation;
     BigEndian = DL.isBigEndian();
+    AllocaAddrSpace = DL.AllocaAddrSpace;
     StackNaturalAlign = DL.StackNaturalAlign;
     ManglingMode = DL.ManglingMode;
     LegalIntWidths = DL.LegalIntWidths;
@@ -254,6 +243,7 @@ public:
   }
 
   unsigned getStackAlignment() const { return StackNaturalAlign; }
+  unsigned getAllocaAddrSpace() const { return AllocaAddrSpace; }
 
   bool hasMicrosoftFastStdCallMangling() const {
     return ManglingMode == MM_WinCOFFX86;
diff --git a/include/llvm/IR/DebugInfoFlags.def b/include/llvm/IR/DebugInfoFlags.def
index 87f3dc9dbdd326a41da329996c0b9e8805172e39..7ea6346998fe54e55d56663df5aca3d6a89d1ae6 100644
--- a/include/llvm/IR/DebugInfoFlags.def
+++ b/include/llvm/IR/DebugInfoFlags.def
@@ -34,7 +34,8 @@ HANDLE_DI_FLAG((1 << 11), Vector)
 HANDLE_DI_FLAG((1 << 12), StaticMember)
 HANDLE_DI_FLAG((1 << 13), LValueReference)
 HANDLE_DI_FLAG((1 << 14), RValueReference)
-HANDLE_DI_FLAG((1 << 15), ExternalTypeRef)
+// 15 was formerly ExternalTypeRef, but this was never used.
+HANDLE_DI_FLAG((1 << 15), Reserved)
 HANDLE_DI_FLAG((1 << 16), SingleInheritance)
 HANDLE_DI_FLAG((2 << 16), MultipleInheritance)
 HANDLE_DI_FLAG((3 << 16), VirtualInheritance)
diff --git a/include/llvm/IR/DebugInfoMetadata.h b/include/llvm/IR/DebugInfoMetadata.h
index 187855225c50594c7e516a39a4212b1e757df49c..8a924b40143aa732d65c6b33a05b5bcd82bd71ee 100644
--- a/include/llvm/IR/DebugInfoMetadata.h
+++ b/include/llvm/IR/DebugInfoMetadata.h
@@ -629,7 +629,6 @@ public:
   bool isStaticMember() const { return getFlags() & FlagStaticMember; }
   bool isLValueReference() const { return getFlags() & FlagLValueReference; }
   bool isRValueReference() const { return getFlags() & FlagRValueReference; }
-  bool isExternalTypeRef() const { return getFlags() & FlagExternalTypeRef; }
 
   static bool classof(const Metadata *MD) {
     switch (MD->getMetadataID()) {
@@ -710,37 +709,45 @@ class DIDerivedType : public DIType {
   friend class LLVMContextImpl;
   friend class MDNode;
 
+  /// \brief The DWARF address space of the memory pointed to or referenced by a
+  /// pointer or reference type respectively.
+  Optional<unsigned> DWARFAddressSpace;
+
   DIDerivedType(LLVMContext &C, StorageType Storage, unsigned Tag,
                 unsigned Line, uint64_t SizeInBits, uint32_t AlignInBits,
-                uint64_t OffsetInBits, DIFlags Flags, ArrayRef<Metadata *> Ops)
+                uint64_t OffsetInBits, Optional<unsigned> DWARFAddressSpace,
+                DIFlags Flags, ArrayRef<Metadata *> Ops)
       : DIType(C, DIDerivedTypeKind, Storage, Tag, Line, SizeInBits,
-               AlignInBits, OffsetInBits, Flags, Ops) {}
+               AlignInBits, OffsetInBits, Flags, Ops),
+        DWARFAddressSpace(DWARFAddressSpace) {}
   ~DIDerivedType() = default;
 
   static DIDerivedType *getImpl(LLVMContext &Context, unsigned Tag,
                                 StringRef Name, DIFile *File, unsigned Line,
                                 DIScopeRef Scope, DITypeRef BaseType,
                                 uint64_t SizeInBits, uint32_t AlignInBits,
-                                uint64_t OffsetInBits, DIFlags Flags,
-                                Metadata *ExtraData, StorageType Storage,
-                                bool ShouldCreate = true) {
+                                uint64_t OffsetInBits,
+                                Optional<unsigned> DWARFAddressSpace,
+                                DIFlags Flags, Metadata *ExtraData,
+                                StorageType Storage, bool ShouldCreate = true) {
     return getImpl(Context, Tag, getCanonicalMDString(Context, Name), File,
                    Line, Scope, BaseType, SizeInBits, AlignInBits, OffsetInBits,
-                   Flags, ExtraData, Storage, ShouldCreate);
+                   DWARFAddressSpace, Flags, ExtraData, Storage, ShouldCreate);
   }
   static DIDerivedType *getImpl(LLVMContext &Context, unsigned Tag,
                                 MDString *Name, Metadata *File, unsigned Line,
                                 Metadata *Scope, Metadata *BaseType,
                                 uint64_t SizeInBits, uint32_t AlignInBits,
-                                uint64_t OffsetInBits, DIFlags Flags,
-                                Metadata *ExtraData, StorageType Storage,
-                                bool ShouldCreate = true);
+                                uint64_t OffsetInBits,
+                                Optional<unsigned> DWARFAddressSpace,
+                                DIFlags Flags, Metadata *ExtraData,
+                                StorageType Storage, bool ShouldCreate = true);
 
   TempDIDerivedType cloneImpl() const {
     return getTemporary(getContext(), getTag(), getName(), getFile(), getLine(),
                         getScope(), getBaseType(), getSizeInBits(),
-                        getAlignInBits(), getOffsetInBits(), getFlags(),
-                        getExtraData());
+                        getAlignInBits(), getOffsetInBits(),
+                        getDWARFAddressSpace(), getFlags(), getExtraData());
   }
 
 public:
@@ -748,24 +755,32 @@ public:
                     (unsigned Tag, MDString *Name, Metadata *File,
                      unsigned Line, Metadata *Scope, Metadata *BaseType,
                      uint64_t SizeInBits, uint32_t AlignInBits,
-                     uint64_t OffsetInBits, DIFlags Flags,
+                     uint64_t OffsetInBits,
+                     Optional<unsigned> DWARFAddressSpace, DIFlags Flags,
                      Metadata *ExtraData = nullptr),
                     (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
-                     AlignInBits, OffsetInBits, Flags, ExtraData))
+                     AlignInBits, OffsetInBits, DWARFAddressSpace, Flags,
+                     ExtraData))
   DEFINE_MDNODE_GET(DIDerivedType,
                     (unsigned Tag, StringRef Name, DIFile *File, unsigned Line,
                      DIScopeRef Scope, DITypeRef BaseType, uint64_t SizeInBits,
                      uint32_t AlignInBits, uint64_t OffsetInBits,
-                     DIFlags Flags, Metadata *ExtraData = nullptr),
+                     Optional<unsigned> DWARFAddressSpace, DIFlags Flags,
+                     Metadata *ExtraData = nullptr),
                     (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
-                     AlignInBits, OffsetInBits, Flags, ExtraData))
+                     AlignInBits, OffsetInBits, DWARFAddressSpace, Flags,
+                     ExtraData))
 
   TempDIDerivedType clone() const { return cloneImpl(); }
 
-  //// Get the base type this is derived from.
+  /// Get the base type this is derived from.
   DITypeRef getBaseType() const { return DITypeRef(getRawBaseType()); }
   Metadata *getRawBaseType() const { return getOperand(3); }
 
+  /// \returns The DWARF address space of the memory pointed to or referenced by
+  /// a pointer or reference type respectively.
+  Optional<unsigned> getDWARFAddressSpace() const { return DWARFAddressSpace; }
+
   /// Get extra data associated with this derived type.
   ///
   /// Class type for pointer-to-members, objective-c property node for ivars,
@@ -1044,15 +1059,17 @@ private:
   unsigned EmissionKind;
   uint64_t DWOId;
   bool SplitDebugInlining;
+  bool DebugInfoForProfiling;
 
   DICompileUnit(LLVMContext &C, StorageType Storage, unsigned SourceLanguage,
                 bool IsOptimized, unsigned RuntimeVersion,
                 unsigned EmissionKind, uint64_t DWOId, bool SplitDebugInlining,
-                ArrayRef<Metadata *> Ops)
+                bool DebugInfoForProfiling, ArrayRef<Metadata *> Ops)
       : DIScope(C, DICompileUnitKind, Storage, dwarf::DW_TAG_compile_unit, Ops),
         SourceLanguage(SourceLanguage), IsOptimized(IsOptimized),
         RuntimeVersion(RuntimeVersion), EmissionKind(EmissionKind),
-        DWOId(DWOId), SplitDebugInlining(SplitDebugInlining) {
+        DWOId(DWOId), SplitDebugInlining(SplitDebugInlining),
+        DebugInfoForProfiling(DebugInfoForProfiling) {
     assert(Storage != Uniqued);
   }
   ~DICompileUnit() = default;
@@ -1065,15 +1082,16 @@ private:
           DIScopeArray RetainedTypes,
           DIGlobalVariableExpressionArray GlobalVariables,
           DIImportedEntityArray ImportedEntities, DIMacroNodeArray Macros,
-          uint64_t DWOId, bool SplitDebugInlining, StorageType Storage,
-          bool ShouldCreate = true) {
+          uint64_t DWOId, bool SplitDebugInlining, bool DebugInfoForProfiling,
+          StorageType Storage, bool ShouldCreate = true) {
     return getImpl(Context, SourceLanguage, File,
                    getCanonicalMDString(Context, Producer), IsOptimized,
                    getCanonicalMDString(Context, Flags), RuntimeVersion,
                    getCanonicalMDString(Context, SplitDebugFilename),
                    EmissionKind, EnumTypes.get(), RetainedTypes.get(),
                    GlobalVariables.get(), ImportedEntities.get(), Macros.get(),
-                   DWOId, SplitDebugInlining, Storage, ShouldCreate);
+                   DWOId, SplitDebugInlining, DebugInfoForProfiling, Storage,
+                   ShouldCreate);
   }
   static DICompileUnit *
   getImpl(LLVMContext &Context, unsigned SourceLanguage, Metadata *File,
@@ -1082,7 +1100,8 @@ private:
           unsigned EmissionKind, Metadata *EnumTypes, Metadata *RetainedTypes,
           Metadata *GlobalVariables, Metadata *ImportedEntities,
           Metadata *Macros, uint64_t DWOId, bool SplitDebugInlining,
-          StorageType Storage, bool ShouldCreate = true);
+          bool DebugInfoForProfiling, StorageType Storage,
+          bool ShouldCreate = true);
 
   TempDICompileUnit cloneImpl() const {
     return getTemporary(getContext(), getSourceLanguage(), getFile(),
@@ -1090,7 +1109,8 @@ private:
                         getRuntimeVersion(), getSplitDebugFilename(),
                         getEmissionKind(), getEnumTypes(), getRetainedTypes(),
                         getGlobalVariables(), getImportedEntities(),
-                        getMacros(), DWOId, getSplitDebugInlining());
+                        getMacros(), DWOId, getSplitDebugInlining(),
+                        getDebugInfoForProfiling());
   }
 
 public:
@@ -1105,10 +1125,11 @@ public:
        DICompositeTypeArray EnumTypes, DIScopeArray RetainedTypes,
        DIGlobalVariableExpressionArray GlobalVariables,
        DIImportedEntityArray ImportedEntities, DIMacroNodeArray Macros,
-       uint64_t DWOId, bool SplitDebugInlining),
+       uint64_t DWOId, bool SplitDebugInlining, bool DebugInfoForProfiling),
       (SourceLanguage, File, Producer, IsOptimized, Flags, RuntimeVersion,
        SplitDebugFilename, EmissionKind, EnumTypes, RetainedTypes,
-       GlobalVariables, ImportedEntities, Macros, DWOId, SplitDebugInlining))
+       GlobalVariables, ImportedEntities, Macros, DWOId, SplitDebugInlining,
+       DebugInfoForProfiling))
   DEFINE_MDNODE_GET_DISTINCT_TEMPORARY(
       DICompileUnit,
       (unsigned SourceLanguage, Metadata *File, MDString *Producer,
@@ -1116,10 +1137,11 @@ public:
        MDString *SplitDebugFilename, unsigned EmissionKind, Metadata *EnumTypes,
        Metadata *RetainedTypes, Metadata *GlobalVariables,
        Metadata *ImportedEntities, Metadata *Macros, uint64_t DWOId,
-       bool SplitDebugInlining),
+       bool SplitDebugInlining, bool DebugInfoForProfiling),
       (SourceLanguage, File, Producer, IsOptimized, Flags, RuntimeVersion,
        SplitDebugFilename, EmissionKind, EnumTypes, RetainedTypes,
-       GlobalVariables, ImportedEntities, Macros, DWOId, SplitDebugInlining))
+       GlobalVariables, ImportedEntities, Macros, DWOId, SplitDebugInlining,
+       DebugInfoForProfiling))
 
   TempDICompileUnit clone() const { return cloneImpl(); }
 
@@ -1129,6 +1151,7 @@ public:
   DebugEmissionKind getEmissionKind() const {
     return (DebugEmissionKind)EmissionKind;
   }
+  bool getDebugInfoForProfiling() const { return DebugInfoForProfiling; }
   StringRef getProducer() const { return getStringOperand(1); }
   StringRef getFlags() const { return getStringOperand(2); }
   StringRef getSplitDebugFilename() const { return getStringOperand(3); }
@@ -1246,6 +1269,28 @@ class DILocation : public MDNode {
                    static_cast<Metadata *>(InlinedAt), Storage, ShouldCreate);
   }
 
+  /// With a given unsigned int \p U, use up to 13 bits to represent it.
+  /// old_bit 1~5  --> new_bit 1~5
+  /// old_bit 6~12 --> new_bit 7~13
+  /// new_bit_6 is 0 if higher bits (7~13) are all 0
+  static unsigned getPrefixEncodingFromUnsigned(unsigned U) {
+    U &= 0xfff;
+    return U > 0x1f ? (((U & 0xfe0) << 1) | (U & 0x1f) | 0x20) : U;
+  }
+
+  /// Reverse transformation of getPrefixEncodingFromUnsigned.
+  static unsigned getUnsignedFromPrefixEncoding(unsigned U) {
+    return (U & 0x20) ? (((U >> 1) & 0xfe0) | (U & 0x1f)) : (U & 0x1f);
+  }
+
+  /// Returns the next component stored in discriminator.
+  static unsigned getNextComponentInDiscriminator(unsigned D) {
+    if ((D & 1) == 0)
+      return D >> ((D & 0x40) ? 14 : 7);
+    else
+      return D >> 1;
+  }
+
   TempDILocation cloneImpl() const {
     // Get the raw scope/inlinedAt since it is possible to invoke this on
     // a DILocation containing temporary metadata.
@@ -1307,10 +1352,48 @@ public:
   ///
   /// DWARF discriminators distinguish identical file locations between
   /// instructions that are on different basic blocks.
+  ///
+  /// There are 3 components stored in discriminator, from lower bits:
+  ///
+  /// Base discriminator: assigned by AddDiscriminators pass to identify IRs
+  ///                     that are defined by the same source line, but
+  ///                     different basic blocks.
+  /// Duplication factor: assigned by optimizations that will scale down
+  ///                     the execution frequency of the original IR.
+  /// Copy Identifier: assigned by optimizations that clone the IR.
+  ///                  Each copy of the IR will be assigned an identifier.
+  ///
+  /// Encoding:
+  ///
+  /// The above 3 components are encoded into a 32bit unsigned integer in
+  /// order. If the lowest bit is 1, the current component is empty, and the
+  /// next component will start in the next bit. Otherwise, the current
+  /// component is non-empty, and its content starts in the next bit. The
+  /// length of each component is either 5 bits or 12 bits: if the 7th bit
+  /// is 0, the bit 2~6 (5 bits) are used to represent the component; if the
+  /// 7th bit is 1, the bit 2~6 (5 bits) and 8~14 (7 bits) are combined to
+  /// represent the component.
+
   inline unsigned getDiscriminator() const;
 
   /// Returns a new DILocation with updated \p Discriminator.
-  inline DILocation *cloneWithDiscriminator(unsigned Discriminator) const;
+  inline const DILocation *cloneWithDiscriminator(unsigned Discriminator) const;
+
+  /// Returns a new DILocation with updated base discriminator \p BD.
+  inline const DILocation *setBaseDiscriminator(unsigned BD) const;
+
+  /// Returns the duplication factor stored in the discriminator.
+  inline unsigned getDuplicationFactor() const;
+
+  /// Returns the copy identifier stored in the discriminator.
+  inline unsigned getCopyIdentifier() const;
+
+  /// Returns the base discriminator stored in the discriminator.
+  inline unsigned getBaseDiscriminator() const;
+
+  /// Returns a new DILocation with duplication factor \p DF encoded in the
+  /// discriminator.
+  inline const DILocation *cloneWithDuplicationFactor(unsigned DF) const;
 
   /// When two instructions are combined into a single instruction we also
   /// need to combine the original locations into a single location.
@@ -1333,6 +1416,30 @@ public:
     return nullptr;
   }
 
+  /// Returns the base discriminator for a given encoded discriminator \p D.
+  static unsigned getBaseDiscriminatorFromDiscriminator(unsigned D) {
+    if ((D & 1) == 0) // Low bit clear: decode the bits above the flag.
+      return getUnsignedFromPrefixEncoding(D >> 1);
+    else // Low bit set: the base discriminator component is empty.
+      return 0;
+  }
+
+  /// Returns the duplication factor for a given encoded discriminator \p D.
+  static unsigned getDuplicationFactorFromDiscriminator(unsigned D) {
+    D = getNextComponentInDiscriminator(D);
+    if (D == 0 || (D & 1)) // Nothing left, or component flagged empty.
+      return 1; // Default factor when none is encoded.
+    else
+      return getUnsignedFromPrefixEncoding(D >> 1);
+  }
+
+  /// Returns the copy identifier for a given encoded discriminator \p D.
+  static unsigned getCopyIdentifierFromDiscriminator(unsigned D) {
+    return getUnsignedFromPrefixEncoding(getNextComponentInDiscriminator(
+        getNextComponentInDiscriminator(D)));
+  }
+
+
   Metadata *getRawScope() const { return getOperand(0); }
   Metadata *getRawInlinedAt() const {
     if (getNumOperands() == 2)
@@ -1343,6 +1450,7 @@ public:
   static bool classof(const Metadata *MD) {
     return MD->getMetadataID() == DILocationKind;
   }
+
 };
 
 /// Subprogram description.
@@ -1676,7 +1784,8 @@ unsigned DILocation::getDiscriminator() const {
   return 0;
 }
 
-DILocation *DILocation::cloneWithDiscriminator(unsigned Discriminator) const {
+const DILocation *
+DILocation::cloneWithDiscriminator(unsigned Discriminator) const {
   DIScope *Scope = getScope();
   // Skip all parent DILexicalBlockFile that already have a discriminator
   // assigned. We do not want to have nested DILexicalBlockFiles that have
@@ -1692,6 +1801,42 @@ DILocation *DILocation::cloneWithDiscriminator(unsigned Discriminator) const {
                          getInlinedAt());
 }
 
+unsigned DILocation::getBaseDiscriminator() const {
+  return getBaseDiscriminatorFromDiscriminator(getDiscriminator());
+}
+
+unsigned DILocation::getDuplicationFactor() const {
+  return getDuplicationFactorFromDiscriminator(getDiscriminator());
+}
+
+unsigned DILocation::getCopyIdentifier() const {
+  return getCopyIdentifierFromDiscriminator(getDiscriminator());
+}
+
+const DILocation *DILocation::setBaseDiscriminator(unsigned D) const {
+  if (D == 0)
+    return this;
+  else
+    return cloneWithDiscriminator(getPrefixEncodingFromUnsigned(D) << 1);
+}
+
+const DILocation *DILocation::cloneWithDuplicationFactor(unsigned DF) const {
+  DF *= getDuplicationFactor(); // Compose with the factor already encoded.
+  if (DF <= 1)
+    return this;
+  // Values up to 0x1f take a 7-bit slot in the encoding, larger ones 14 bits.
+  unsigned BD = getBaseDiscriminator();
+  unsigned CI = getCopyIdentifier() << (DF > 0x1f ? 14 : 7);
+  unsigned D = CI | (getPrefixEncodingFromUnsigned(DF) << 1);
+  // Prepend the base discriminator; a lone 1 bit marks it as empty.
+  if (BD == 0)
+    D = (D << 1) | 1;
+  else
+    D = (D << (BD > 0x1f ? 14 : 7)) | (getPrefixEncodingFromUnsigned(BD) << 1);
+
+  return cloneWithDiscriminator(D);
+}
+
 class DINamespace : public DIScope {
   friend class LLVMContextImpl;
   friend class MDNode;
@@ -1918,7 +2063,7 @@ protected:
   DIVariable(LLVMContext &C, unsigned ID, StorageType Storage, unsigned Line,
              ArrayRef<Metadata *> Ops, uint32_t AlignInBits = 0)
       : DINode(C, ID, Storage, dwarf::DW_TAG_variable, Ops), Line(Line),
-	      AlignInBits(AlignInBits) {}
+        AlignInBits(AlignInBits) {}
   ~DIVariable() = default;
 
 public:
@@ -2108,7 +2253,7 @@ public:
 
   /// Retrieve the details of this fragment expression.
   static Optional<FragmentInfo> getFragmentInfo(expr_op_iterator Start,
-						expr_op_iterator End);
+                                                expr_op_iterator End);
 
   /// Retrieve the details of this fragment expression.
   Optional<FragmentInfo> getFragmentInfo() const {
diff --git a/include/llvm/IR/DiagnosticInfo.h b/include/llvm/IR/DiagnosticInfo.h
index 8361c577f9ea08fd8626df50f81c04037ab859d7..458c3cf29b0d1d027432e07e60c0f364cec2efab 100644
--- a/include/llvm/IR/DiagnosticInfo.h
+++ b/include/llvm/IR/DiagnosticInfo.h
@@ -347,19 +347,34 @@ private:
   const Twine &Msg;
 };
 
-/// Common features for diagnostics with an associated DebugLoc
-class DiagnosticInfoWithDebugLocBase : public DiagnosticInfo {
+class DiagnosticLocation {
+  StringRef Filename;
+  unsigned Line = 0;
+  unsigned Column = 0;
 public:
-  /// \p Fn is the function where the diagnostic is being emitted. \p DLoc is
+  DiagnosticLocation() {}
+  DiagnosticLocation(const DebugLoc &DL);
+  DiagnosticLocation(const DISubprogram *SP);
+
+  bool isValid() const { return !Filename.empty(); }
+  StringRef getFilename() const { return Filename; }
+  unsigned getLine() const { return Line; }
+  unsigned getColumn() const { return Column; }
+};
+
+/// Common features for diagnostics with an associated location.
+class DiagnosticInfoWithLocationBase : public DiagnosticInfo {
+public:
+  /// \p Fn is the function where the diagnostic is being emitted. \p Loc is
   /// the location information to use in the diagnostic.
-  DiagnosticInfoWithDebugLocBase(enum DiagnosticKind Kind,
+  DiagnosticInfoWithLocationBase(enum DiagnosticKind Kind,
                                  enum DiagnosticSeverity Severity,
                                  const Function &Fn,
-                                 const DebugLoc &DLoc)
-      : DiagnosticInfo(Kind, Severity), Fn(Fn), DLoc(DLoc) {}
+                                 const DiagnosticLocation &Loc)
+      : DiagnosticInfo(Kind, Severity), Fn(Fn), Loc(Loc) {}
 
   /// Return true if location information is available for this diagnostic.
-  bool isLocationAvailable() const;
+  bool isLocationAvailable() const { return Loc.isValid(); }
 
   /// Return a string with the location information for this diagnostic
   /// in the format "file:line:col". If location information is not available,
@@ -371,19 +386,19 @@ public:
   void getLocation(StringRef *Filename, unsigned *Line, unsigned *Column) const;
 
   const Function &getFunction() const { return Fn; }
-  const DebugLoc &getDebugLoc() const { return DLoc; }
+  DiagnosticLocation getLocation() const { return Loc; }
 
 private:
   /// Function where this diagnostic is triggered.
   const Function &Fn;
 
   /// Debug location where this diagnostic is triggered.
-  DebugLoc DLoc;
+  DiagnosticLocation Loc;
 };
 
 /// \brief Common features for diagnostics dealing with optimization remarks
 /// that are used by both IR and MIR passes.
-class DiagnosticInfoOptimizationBase : public DiagnosticInfoWithDebugLocBase {
+class DiagnosticInfoOptimizationBase : public DiagnosticInfoWithLocationBase {
 public:
   /// \brief Used to set IsVerbose via the stream interface.
   struct setIsVerbose {};
@@ -400,26 +415,28 @@ public:
     StringRef Key;
     std::string Val;
     // If set, the debug location corresponding to the value.
-    DebugLoc DLoc;
+    DiagnosticLocation Loc;
 
     explicit Argument(StringRef Str = "") : Key("String"), Val(Str) {}
-    Argument(StringRef Key, Value *V);
-    Argument(StringRef Key, Type *T);
+    Argument(StringRef Key, const Value *V);
+    Argument(StringRef Key, const Type *T);
     Argument(StringRef Key, int N);
     Argument(StringRef Key, unsigned N);
     Argument(StringRef Key, bool B) : Key(Key), Val(B ? "true" : "false") {}
   };
 
   /// \p PassName is the name of the pass emitting this diagnostic. \p
-  /// RemarkName is a textual identifier for the remark.  \p Fn is the function
-  /// where the diagnostic is being emitted. \p DLoc is the location information
-  /// to use in the diagnostic. If line table information is available, the
-  /// diagnostic will include the source code location.
+  /// RemarkName is a textual identifier for the remark (single-word,
+  /// camel-case). \p Fn is the function where the diagnostic is being emitted.
+  /// \p Loc is the location information to use in the diagnostic. If line table
+  /// information is available, the diagnostic will include the source code
+  /// location.
   DiagnosticInfoOptimizationBase(enum DiagnosticKind Kind,
                                  enum DiagnosticSeverity Severity,
                                  const char *PassName, StringRef RemarkName,
-                                 const Function &Fn, const DebugLoc &DLoc)
-      : DiagnosticInfoWithDebugLocBase(Kind, Severity, Fn, DLoc),
+                                 const Function &Fn,
+                                 const DiagnosticLocation &Loc)
+      : DiagnosticInfoWithLocationBase(Kind, Severity, Fn, Loc),
         PassName(PassName), RemarkName(RemarkName) {}
 
   DiagnosticInfoOptimizationBase &operator<<(StringRef S);
@@ -472,8 +489,9 @@ protected:
   /// be emitted.
   const char *PassName;
 
-  /// Textual identifier for the remark.  Can be used by external tools reading
-  /// the YAML output file for optimization remarks to identify the remark.
+  /// Textual identifier for the remark (single-word, camel-case). Can be used
+  /// by external tools reading the YAML output file for optimization remarks to
+  /// identify the remark.
   StringRef RemarkName;
 
   /// If profile information is available, this is the number of times the
@@ -499,19 +517,21 @@ protected:
 class DiagnosticInfoIROptimization : public DiagnosticInfoOptimizationBase {
 public:
   /// \p PassName is the name of the pass emitting this diagnostic. \p
-  /// RemarkName is a textual identifier for the remark.  \p Fn is the function
-  /// where the diagnostic is being emitted. \p DLoc is the location information
-  /// to use in the diagnostic. If line table information is available, the
-  /// diagnostic will include the source code location. \p CodeRegion is IR
-  /// value (currently basic block) that the optimization operates on.  This is
-  /// currently used to provide run-time hotness information with PGO.
+  /// RemarkName is a textual identifier for the remark (single-word,
+  /// camel-case). \p Fn is the function where the diagnostic is being emitted.
+  /// \p Loc is the location information to use in the diagnostic. If line table
+  /// information is available, the diagnostic will include the source code
+  /// location. \p CodeRegion is the IR value (currently basic block) that the
+  /// optimization operates on. This is currently used to provide run-time
+  /// hotness information with PGO.
   DiagnosticInfoIROptimization(enum DiagnosticKind Kind,
                                enum DiagnosticSeverity Severity,
                                const char *PassName, StringRef RemarkName,
-                               const Function &Fn, const DebugLoc &DLoc,
-                               Value *CodeRegion = nullptr)
+                               const Function &Fn,
+                               const DiagnosticLocation &Loc,
+                               const Value *CodeRegion = nullptr)
       : DiagnosticInfoOptimizationBase(Kind, Severity, PassName, RemarkName, Fn,
-                                       DLoc),
+                                       Loc),
         CodeRegion(CodeRegion) {}
 
   /// \brief This is ctor variant allows a pass to build an optimization remark
@@ -525,7 +545,7 @@ public:
                                const DiagnosticInfoIROptimization &Orig)
       : DiagnosticInfoOptimizationBase(
             (DiagnosticKind)Orig.getKind(), Orig.getSeverity(), PassName,
-            Orig.RemarkName, Orig.getFunction(), Orig.getDebugLoc()),
+            Orig.RemarkName, Orig.getFunction(), Orig.getLocation()),
         CodeRegion(Orig.getCodeRegion()) {
     *this << Prepend;
     std::copy(Orig.Args.begin(), Orig.Args.end(), std::back_inserter(Args));
@@ -533,7 +553,7 @@ public:
 
   /// Legacy interface.
   /// \p PassName is the name of the pass emitting this diagnostic.
-  /// \p Fn is the function where the diagnostic is being emitted. \p DLoc is
+  /// \p Fn is the function where the diagnostic is being emitted. \p Loc is
   /// the location information to use in the diagnostic. If line table
   /// information is available, the diagnostic will include the source code
   /// location. \p Msg is the message to show. Note that this class does not
@@ -542,14 +562,12 @@ public:
   DiagnosticInfoIROptimization(enum DiagnosticKind Kind,
                                enum DiagnosticSeverity Severity,
                                const char *PassName, const Function &Fn,
-                               const DebugLoc &DLoc, const Twine &Msg,
-                               Optional<uint64_t> Hotness = None)
-      : DiagnosticInfoOptimizationBase(Kind, Severity, PassName, "", Fn, DLoc) {
-    setHotness(Hotness);
+                               const DiagnosticLocation &Loc, const Twine &Msg)
+      : DiagnosticInfoOptimizationBase(Kind, Severity, PassName, "", Fn, Loc) {
     *this << Msg.str();
   }
 
-  Value *getCodeRegion() const { return CodeRegion; }
+  const Value *getCodeRegion() const { return CodeRegion; }
 
   static bool classof(const DiagnosticInfo *DI) {
     return DI->getKind() >= DK_FirstRemark && DI->getKind() <= DK_LastRemark;
@@ -558,38 +576,30 @@ public:
 private:
   /// The IR value (currently basic block) that the optimization operates on.
   /// This is currently used to provide run-time hotness information with PGO.
-  Value *CodeRegion;
+  const Value *CodeRegion;
 };
 
 /// Diagnostic information for applied optimization remarks.
 class OptimizationRemark : public DiagnosticInfoIROptimization {
 public:
-  /// \p PassName is the name of the pass emitting this diagnostic. If
-  /// this name matches the regular expression given in -Rpass=, then the
-  /// diagnostic will be emitted. \p Fn is the function where the diagnostic
-  /// is being emitted. \p DLoc is the location information to use in the
-  /// diagnostic. If line table information is available, the diagnostic
-  /// will include the source code location. \p Msg is the message to show.
-  /// Note that this class does not copy this message, so this reference
-  /// must be valid for the whole life time of the diagnostic.
-  OptimizationRemark(const char *PassName, const Function &Fn,
-                     const DebugLoc &DLoc, const Twine &Msg,
-                     Optional<uint64_t> Hotness = None)
-      : DiagnosticInfoIROptimization(DK_OptimizationRemark, DS_Remark, PassName,
-                                     Fn, DLoc, Msg, Hotness) {}
-
   /// \p PassName is the name of the pass emitting this diagnostic. If this name
   /// matches the regular expression given in -Rpass=, then the diagnostic will
-  /// be emitted.  \p RemarkName is a textual identifier for the remark.  \p
-  /// DLoc is the debug location and \p CodeRegion is the region that the
-  /// optimization operates on (currently on block is supported).
+  /// be emitted. \p RemarkName is a textual identifier for the remark (single-
+  /// word, camel-case). \p Loc is the debug location and \p CodeRegion is the
+  /// region that the optimization operates on (currently only block is
+  /// supported).
   OptimizationRemark(const char *PassName, StringRef RemarkName,
-                     const DebugLoc &DLoc, Value *CodeRegion);
+                     const DiagnosticLocation &Loc, const Value *CodeRegion);
 
-  /// Same as above but the debug location and code region is derived from \p
+  /// Same as above, but the debug location and code region are derived from \p
   /// Instr.
   OptimizationRemark(const char *PassName, StringRef RemarkName,
-                     Instruction *Inst);
+                     const Instruction *Inst);
+
+  /// Same as above, but the debug location and code region are derived from \p
+  /// Func.
+  OptimizationRemark(const char *PassName, StringRef RemarkName,
+                     const Function *Func);
 
   static bool classof(const DiagnosticInfo *DI) {
     return DI->getKind() == DK_OptimizationRemark;
@@ -599,37 +609,45 @@ public:
 
   /// \see DiagnosticInfoOptimizationBase::isEnabled.
   bool isEnabled() const override { return isEnabled(getPassName()); }
-};
 
-/// Diagnostic information for missed-optimization remarks.
-class OptimizationRemarkMissed : public DiagnosticInfoIROptimization {
-public:
+private:
+  /// This is deprecated now and only used by the function API below.
   /// \p PassName is the name of the pass emitting this diagnostic. If
-  /// this name matches the regular expression given in -Rpass-missed=, then the
+  /// this name matches the regular expression given in -Rpass=, then the
   /// diagnostic will be emitted. \p Fn is the function where the diagnostic
-  /// is being emitted. \p DLoc is the location information to use in the
+  /// is being emitted. \p Loc is the location information to use in the
   /// diagnostic. If line table information is available, the diagnostic
   /// will include the source code location. \p Msg is the message to show.
   /// Note that this class does not copy this message, so this reference
   /// must be valid for the whole life time of the diagnostic.
-  OptimizationRemarkMissed(const char *PassName, const Function &Fn,
-                           const DebugLoc &DLoc, const Twine &Msg,
-                           Optional<uint64_t> Hotness = None)
-      : DiagnosticInfoIROptimization(DK_OptimizationRemarkMissed, DS_Remark,
-                                     PassName, Fn, DLoc, Msg, Hotness) {}
+  OptimizationRemark(const char *PassName, const Function &Fn,
+                     const DiagnosticLocation &Loc, const Twine &Msg)
+      : DiagnosticInfoIROptimization(DK_OptimizationRemark, DS_Remark, PassName,
+                                     Fn, Loc, Msg) {}
+
+  friend void emitOptimizationRemark(LLVMContext &Ctx, const char *PassName,
+                                     const Function &Fn,
+                                     const DiagnosticLocation &Loc,
+                                     const Twine &Msg);
+};
 
+/// Diagnostic information for missed-optimization remarks.
+class OptimizationRemarkMissed : public DiagnosticInfoIROptimization {
+public:
   /// \p PassName is the name of the pass emitting this diagnostic. If this name
   /// matches the regular expression given in -Rpass-missed=, then the
-  /// diagnostic will be emitted.  \p RemarkName is a textual identifier for the
-  /// remark.  \p DLoc is the debug location and \p CodeRegion is the region
-  /// that the optimization operates on (currently on block is supported).
+  /// diagnostic will be emitted. \p RemarkName is a textual identifier for the
+  /// remark (single-word, camel-case). \p Loc is the debug location and \p
+  /// CodeRegion is the region that the optimization operates on (currently only
+  /// block is supported).
   OptimizationRemarkMissed(const char *PassName, StringRef RemarkName,
-                           const DebugLoc &DLoc, Value *CodeRegion);
+                           const DiagnosticLocation &Loc,
+                           const Value *CodeRegion);
 
   /// \brief Same as above but \p Inst is used to derive code region and debug
   /// location.
   OptimizationRemarkMissed(const char *PassName, StringRef RemarkName,
-                           Instruction *Inst);
+                           const Instruction *Inst);
 
   static bool classof(const DiagnosticInfo *DI) {
     return DI->getKind() == DK_OptimizationRemarkMissed;
@@ -639,32 +657,41 @@ public:
 
   /// \see DiagnosticInfoOptimizationBase::isEnabled.
   bool isEnabled() const override { return isEnabled(getPassName()); }
+
+private:
+  /// This is deprecated now and only used by the function API below.
+  /// \p PassName is the name of the pass emitting this diagnostic. If
+  /// this name matches the regular expression given in -Rpass-missed=, then the
+  /// diagnostic will be emitted. \p Fn is the function where the diagnostic
+  /// is being emitted. \p Loc is the location information to use in the
+  /// diagnostic. If line table information is available, the diagnostic
+  /// will include the source code location. \p Msg is the message to show.
+  /// Note that this class does not copy this message, so this reference
+  /// must be valid for the whole life time of the diagnostic.
+  OptimizationRemarkMissed(const char *PassName, const Function &Fn,
+                           const DiagnosticLocation &Loc, const Twine &Msg)
+      : DiagnosticInfoIROptimization(DK_OptimizationRemarkMissed, DS_Remark,
+                                     PassName, Fn, Loc, Msg) {}
+
+  friend void emitOptimizationRemarkMissed(LLVMContext &Ctx,
+                                           const char *PassName,
+                                           const Function &Fn,
+                                           const DiagnosticLocation &Loc,
+                                           const Twine &Msg);
 };
 
 /// Diagnostic information for optimization analysis remarks.
 class OptimizationRemarkAnalysis : public DiagnosticInfoIROptimization {
 public:
-  /// \p PassName is the name of the pass emitting this diagnostic. If
-  /// this name matches the regular expression given in -Rpass-analysis=, then
-  /// the diagnostic will be emitted. \p Fn is the function where the diagnostic
-  /// is being emitted. \p DLoc is the location information to use in the
-  /// diagnostic. If line table information is available, the diagnostic will
-  /// include the source code location. \p Msg is the message to show. Note that
-  /// this class does not copy this message, so this reference must be valid for
-  /// the whole life time of the diagnostic.
-  OptimizationRemarkAnalysis(const char *PassName, const Function &Fn,
-                             const DebugLoc &DLoc, const Twine &Msg,
-                             Optional<uint64_t> Hotness = None)
-      : DiagnosticInfoIROptimization(DK_OptimizationRemarkAnalysis, DS_Remark,
-                                     PassName, Fn, DLoc, Msg, Hotness) {}
-
   /// \p PassName is the name of the pass emitting this diagnostic. If this name
   /// matches the regular expression given in -Rpass-analysis=, then the
-  /// diagnostic will be emitted.  \p RemarkName is a textual identifier for the
-  /// remark.  \p DLoc is the debug location and \p CodeRegion is the region
-  /// that the optimization operates on (currently on block is supported).
+  /// diagnostic will be emitted. \p RemarkName is a textual identifier for the
+  /// remark (single-word, camel-case). \p Loc is the debug location and \p
+  /// CodeRegion is the region that the optimization operates on (currently only
+  /// block is supported).
   OptimizationRemarkAnalysis(const char *PassName, StringRef RemarkName,
-                             const DebugLoc &DLoc, Value *CodeRegion);
+                             const DiagnosticLocation &Loc,
+                             const Value *CodeRegion);
 
   /// \brief This is ctor variant allows a pass to build an optimization remark
   /// from an existing remark.
@@ -680,7 +707,7 @@ public:
   /// \brief Same as above but \p Inst is used to derive code region and debug
   /// location.
   OptimizationRemarkAnalysis(const char *PassName, StringRef RemarkName,
-                             Instruction *Inst);
+                             const Instruction *Inst);
 
   static bool classof(const DiagnosticInfo *DI) {
     return DI->getKind() == DK_OptimizationRemarkAnalysis;
@@ -699,24 +726,65 @@ public:
 
 protected:
   OptimizationRemarkAnalysis(enum DiagnosticKind Kind, const char *PassName,
-                             const Function &Fn, const DebugLoc &DLoc,
-                             const Twine &Msg, Optional<uint64_t> Hotness)
-      : DiagnosticInfoIROptimization(Kind, DS_Remark, PassName, Fn, DLoc, Msg,
-                                     Hotness) {}
+                             const Function &Fn, const DiagnosticLocation &Loc,
+                             const Twine &Msg)
+      : DiagnosticInfoIROptimization(Kind, DS_Remark, PassName, Fn, Loc, Msg) {}
 
   OptimizationRemarkAnalysis(enum DiagnosticKind Kind, const char *PassName,
-                             StringRef RemarkName, const DebugLoc &DLoc,
-                             Value *CodeRegion);
+                             StringRef RemarkName,
+                             const DiagnosticLocation &Loc,
+                             const Value *CodeRegion);
+
+private:
+  /// This is deprecated now and only used by the function API below.
+  /// \p PassName is the name of the pass emitting this diagnostic. If
+  /// this name matches the regular expression given in -Rpass-analysis=, then
+  /// the diagnostic will be emitted. \p Fn is the function where the diagnostic
+  /// is being emitted. \p Loc is the location information to use in the
+  /// diagnostic. If line table information is available, the diagnostic will
+  /// include the source code location. \p Msg is the message to show. Note that
+  /// this class does not copy this message, so this reference must be valid for
+  /// the whole life time of the diagnostic.
+  OptimizationRemarkAnalysis(const char *PassName, const Function &Fn,
+                             const DiagnosticLocation &Loc, const Twine &Msg)
+      : DiagnosticInfoIROptimization(DK_OptimizationRemarkAnalysis, DS_Remark,
+                                     PassName, Fn, Loc, Msg) {}
+
+  friend void emitOptimizationRemarkAnalysis(LLVMContext &Ctx,
+                                             const char *PassName,
+                                             const Function &Fn,
+                                             const DiagnosticLocation &Loc,
+                                             const Twine &Msg);
 };
 
 /// Diagnostic information for optimization analysis remarks related to
 /// floating-point non-commutativity.
 class OptimizationRemarkAnalysisFPCommute : public OptimizationRemarkAnalysis {
 public:
+  /// \p PassName is the name of the pass emitting this diagnostic. If this name
+  /// matches the regular expression given in -Rpass-analysis=, then the
+  /// diagnostic will be emitted. \p RemarkName is a textual identifier for the
+  /// remark (single-word, camel-case). \p Loc is the debug location and \p
+  /// CodeRegion is the region that the optimization operates on (currently only
+  /// block is supported). The front-end will append its own message related to
+  /// options that address floating-point non-commutativity.
+  OptimizationRemarkAnalysisFPCommute(const char *PassName,
+                                      StringRef RemarkName,
+                                      const DiagnosticLocation &Loc,
+                                      const Value *CodeRegion)
+      : OptimizationRemarkAnalysis(DK_OptimizationRemarkAnalysisFPCommute,
+                                   PassName, RemarkName, Loc, CodeRegion) {}
+
+  static bool classof(const DiagnosticInfo *DI) {
+    return DI->getKind() == DK_OptimizationRemarkAnalysisFPCommute;
+  }
+
+private:
+  /// This is deprecated now and only used by the function API below.
   /// \p PassName is the name of the pass emitting this diagnostic. If
   /// this name matches the regular expression given in -Rpass-analysis=, then
   /// the diagnostic will be emitted. \p Fn is the function where the diagnostic
-  /// is being emitted. \p DLoc is the location information to use in the
+  /// is being emitted. \p Loc is the location information to use in the
   /// diagnostic. If line table information is available, the diagnostic will
   /// include the source code location. \p Msg is the message to show. The
   /// front-end will append its own message related to options that address
@@ -724,37 +792,42 @@ public:
   /// message, so this reference must be valid for the whole life time of the
   /// diagnostic.
   OptimizationRemarkAnalysisFPCommute(const char *PassName, const Function &Fn,
-                                      const DebugLoc &DLoc, const Twine &Msg,
-                                      Optional<uint64_t> Hotness = None)
+                                      const DiagnosticLocation &Loc,
+                                      const Twine &Msg)
       : OptimizationRemarkAnalysis(DK_OptimizationRemarkAnalysisFPCommute,
-                                   PassName, Fn, DLoc, Msg, Hotness) {}
+                                   PassName, Fn, Loc, Msg) {}
+  friend void emitOptimizationRemarkAnalysisFPCommute(
+      LLVMContext &Ctx, const char *PassName, const Function &Fn,
+      const DiagnosticLocation &Loc, const Twine &Msg);
+};
 
+/// Diagnostic information for optimization analysis remarks related to
+/// pointer aliasing.
+class OptimizationRemarkAnalysisAliasing : public OptimizationRemarkAnalysis {
+public:
   /// \p PassName is the name of the pass emitting this diagnostic. If this name
   /// matches the regular expression given in -Rpass-analysis=, then the
-  /// diagnostic will be emitted.  \p RemarkName is a textual identifier for the
-  /// remark.  \p DLoc is the debug location and \p CodeRegion is the region
-  /// that the optimization operates on (currently on block is supported). The
-  /// front-end will append its own message related to options that address
-  /// floating-point non-commutativity.
-  OptimizationRemarkAnalysisFPCommute(const char *PassName,
-                                      StringRef RemarkName,
-                                      const DebugLoc &DLoc, Value *CodeRegion)
-      : OptimizationRemarkAnalysis(DK_OptimizationRemarkAnalysisFPCommute,
-                                   PassName, RemarkName, DLoc, CodeRegion) {}
+  /// diagnostic will be emitted. \p RemarkName is a textual identifier for the
+  /// remark (single-word, camel-case). \p Loc is the debug location and \p
+  /// CodeRegion is the region that the optimization operates on (currently only
+  /// block is supported). The front-end will append its own message related to
+  /// options that address pointer aliasing legality.
+  OptimizationRemarkAnalysisAliasing(const char *PassName, StringRef RemarkName,
+                                     const DiagnosticLocation &Loc,
+                                     const Value *CodeRegion)
+      : OptimizationRemarkAnalysis(DK_OptimizationRemarkAnalysisAliasing,
+                                   PassName, RemarkName, Loc, CodeRegion) {}
 
   static bool classof(const DiagnosticInfo *DI) {
-    return DI->getKind() == DK_OptimizationRemarkAnalysisFPCommute;
+    return DI->getKind() == DK_OptimizationRemarkAnalysisAliasing;
   }
-};
 
-/// Diagnostic information for optimization analysis remarks related to
-/// pointer aliasing.
-class OptimizationRemarkAnalysisAliasing : public OptimizationRemarkAnalysis {
-public:
+private:
+  /// This is deprecated now and only used by the function API below.
   /// \p PassName is the name of the pass emitting this diagnostic. If
   /// this name matches the regular expression given in -Rpass-analysis=, then
   /// the diagnostic will be emitted. \p Fn is the function where the diagnostic
-  /// is being emitted. \p DLoc is the location information to use in the
+  /// is being emitted. \p Loc is the location information to use in the
   /// diagnostic. If line table information is available, the diagnostic will
   /// include the source code location. \p Msg is the message to show. The
   /// front-end will append its own message related to options that address
@@ -762,26 +835,14 @@ public:
   /// message, so this reference must be valid for the whole life time of the
   /// diagnostic.
   OptimizationRemarkAnalysisAliasing(const char *PassName, const Function &Fn,
-                                     const DebugLoc &DLoc, const Twine &Msg,
-                                     Optional<uint64_t> Hotness = None)
+                                     const DiagnosticLocation &Loc,
+                                     const Twine &Msg)
       : OptimizationRemarkAnalysis(DK_OptimizationRemarkAnalysisAliasing,
-                                   PassName, Fn, DLoc, Msg, Hotness) {}
+                                   PassName, Fn, Loc, Msg) {}
 
-  /// \p PassName is the name of the pass emitting this diagnostic. If this name
-  /// matches the regular expression given in -Rpass-analysis=, then the
-  /// diagnostic will be emitted.  \p RemarkName is a textual identifier for the
-  /// remark.  \p DLoc is the debug location and \p CodeRegion is the region
-  /// that the optimization operates on (currently on block is supported). The
-  /// front-end will append its own message related to options that address
-  /// pointer aliasing legality.
-  OptimizationRemarkAnalysisAliasing(const char *PassName, StringRef RemarkName,
-                                     const DebugLoc &DLoc, Value *CodeRegion)
-      : OptimizationRemarkAnalysis(DK_OptimizationRemarkAnalysisAliasing,
-                                   PassName, RemarkName, DLoc, CodeRegion) {}
-
-  static bool classof(const DiagnosticInfo *DI) {
-    return DI->getKind() == DK_OptimizationRemarkAnalysisAliasing;
-  }
+  friend void emitOptimizationRemarkAnalysisAliasing(
+      LLVMContext &Ctx, const char *PassName, const Function &Fn,
+      const DiagnosticLocation &Loc, const Twine &Msg);
 };
 
 /// Diagnostic information for machine IR parser.
@@ -824,72 +885,97 @@ public:
 // Create wrappers for C Binding types (see CBindingWrapping.h).
 DEFINE_SIMPLE_CONVERSION_FUNCTIONS(DiagnosticInfo, LLVMDiagnosticInfoRef)
 
-/// Emit an optimization-applied message. \p PassName is the name of the pass
-/// emitting the message. If -Rpass= is given and \p PassName matches the
-/// regular expression in -Rpass, then the remark will be emitted. \p Fn is
-/// the function triggering the remark, \p DLoc is the debug location where
-/// the diagnostic is generated. \p Msg is the message string to use.
+/// \brief Legacy interface to emit an optimization-applied message.  Use
+/// (Machine)OptimizationRemarkEmitter instead.
+///
+/// \p PassName is the name of the pass emitting the message. If -Rpass= is
+/// given and \p PassName matches the regular expression in -Rpass, then the
+/// remark will be emitted. \p Fn is the function triggering the remark, \p Loc
+/// is the debug location where the diagnostic is generated. \p Msg is the
+/// message string to use.
 void emitOptimizationRemark(LLVMContext &Ctx, const char *PassName,
-                            const Function &Fn, const DebugLoc &DLoc,
+                            const Function &Fn, const DiagnosticLocation &Loc,
                             const Twine &Msg);
 
-/// Emit an optimization-missed message. \p PassName is the name of the
-/// pass emitting the message. If -Rpass-missed= is given and \p PassName
-/// matches the regular expression in -Rpass, then the remark will be
-/// emitted. \p Fn is the function triggering the remark, \p DLoc is the
-/// debug location where the diagnostic is generated. \p Msg is the
+/// \brief Legacy interface to emit an optimization-missed message.  Use
+/// (Machine)OptimizationRemarkEmitter instead.
+///
+/// \p PassName is the name of the pass emitting the message. If -Rpass-missed=
+/// is given and \p PassName matches the regular expression in -Rpass, then the
+/// remark will be emitted. \p Fn is the function triggering the remark, \p Loc
+/// is the debug location where the diagnostic is generated. \p Msg is the
 /// message string to use.
 void emitOptimizationRemarkMissed(LLVMContext &Ctx, const char *PassName,
-                                  const Function &Fn, const DebugLoc &DLoc,
+                                  const Function &Fn,
+                                  const DiagnosticLocation &Loc,
                                   const Twine &Msg);
 
-/// Emit an optimization analysis remark message. \p PassName is the name of
-/// the pass emitting the message. If -Rpass-analysis= is given and \p
-/// PassName matches the regular expression in -Rpass, then the remark will be
-/// emitted. \p Fn is the function triggering the remark, \p DLoc is the debug
-/// location where the diagnostic is generated. \p Msg is the message string
-/// to use.
+/// \brief Legacy interface to emit an optimization analysis remark message.
+/// Use (Machine)OptimizationRemarkEmitter instead.
+///
+/// \p PassName is the name of the pass emitting the message. If
+/// -Rpass-analysis= is given and \p PassName matches the regular expression in
+/// -Rpass, then the remark will be emitted. \p Fn is the function triggering
+/// the remark, \p Loc is the debug location where the diagnostic is
+/// generated. \p Msg is the message string to use.
 void emitOptimizationRemarkAnalysis(LLVMContext &Ctx, const char *PassName,
-                                    const Function &Fn, const DebugLoc &DLoc,
+                                    const Function &Fn,
+                                    const DiagnosticLocation &Loc,
                                     const Twine &Msg);
 
-/// Emit an optimization analysis remark related to messages about
-/// floating-point non-commutativity. \p PassName is the name of the pass
-/// emitting the message. If -Rpass-analysis= is given and \p PassName matches
-/// the regular expression in -Rpass, then the remark will be emitted. \p Fn is
-/// the function triggering the remark, \p DLoc is the debug location where the
-/// diagnostic is generated. \p Msg is the message string to use.
+/// \brief Legacy interface to emit an optimization analysis remark related to
+/// messages about floating-point non-commutativity.  Use
+/// (Machine)OptimizationRemarkEmitter instead.
+///
+/// \p PassName is the name of the pass emitting the message. If
+/// -Rpass-analysis= is given and \p PassName matches the regular expression in
+/// -Rpass, then the remark will be emitted. \p Fn is the function triggering
+/// the remark, \p Loc is the debug location where the diagnostic is
+/// generated. \p Msg is the message string to use.
 void emitOptimizationRemarkAnalysisFPCommute(LLVMContext &Ctx,
                                              const char *PassName,
                                              const Function &Fn,
-                                             const DebugLoc &DLoc,
+                                             const DiagnosticLocation &Loc,
                                              const Twine &Msg);
 
-/// Emit an optimization analysis remark related to messages about
-/// pointer aliasing. \p PassName is the name of the pass emitting the message.
+/// \brief Legacy interface to emit an optimization analysis remark related to
+/// messages about pointer aliasing.  Use (Machine)OptimizationRemarkEmitter
+/// instead.
+///
+/// \p PassName is the name of the pass emitting the message.
 /// If -Rpass-analysis= is given and \p PassName matches the regular expression
 /// in -Rpass, then the remark will be emitted. \p Fn is the function triggering
-/// the remark, \p DLoc is the debug location where the diagnostic is generated.
+/// the remark, \p Loc is the debug location where the diagnostic is generated.
 /// \p Msg is the message string to use.
 void emitOptimizationRemarkAnalysisAliasing(LLVMContext &Ctx,
                                             const char *PassName,
                                             const Function &Fn,
-                                            const DebugLoc &DLoc,
+                                            const DiagnosticLocation &Loc,
                                             const Twine &Msg);
 
 /// Diagnostic information for optimization failures.
 class DiagnosticInfoOptimizationFailure : public DiagnosticInfoIROptimization {
 public:
-  /// \p Fn is the function where the diagnostic is being emitted. \p DLoc is
+  /// \p Fn is the function where the diagnostic is being emitted. \p Loc is
   /// the location information to use in the diagnostic. If line table
   /// information is available, the diagnostic will include the source code
   /// location. \p Msg is the message to show. Note that this class does not
   /// copy this message, so this reference must be valid for the whole life time
   /// of the diagnostic.
-  DiagnosticInfoOptimizationFailure(const Function &Fn, const DebugLoc &DLoc,
+  DiagnosticInfoOptimizationFailure(const Function &Fn,
+                                    const DiagnosticLocation &Loc,
                                     const Twine &Msg)
       : DiagnosticInfoIROptimization(DK_OptimizationFailure, DS_Warning,
-                                     nullptr, Fn, DLoc, Msg) {}
+                                     nullptr, Fn, Loc, Msg) {}
+
+  /// \p PassName is the name of the pass emitting this diagnostic.  \p
+  /// RemarkName is a textual identifier for the remark (single-word,
+  /// camel-case).  \p Loc is the debug location and \p CodeRegion is the
+  /// region that the optimization operates on (currently basic block is
+  /// supported).
+  DiagnosticInfoOptimizationFailure(const char *PassName, StringRef RemarkName,
+                                    const DiagnosticLocation &Loc,
+                                    const Value *CodeRegion);
 
   static bool classof(const DiagnosticInfo *DI) {
     return DI->getKind() == DK_OptimizationFailure;
@@ -900,22 +986,22 @@ public:
 };
 
 /// Diagnostic information for unsupported feature in backend.
-class DiagnosticInfoUnsupported
-    : public DiagnosticInfoWithDebugLocBase {
+class DiagnosticInfoUnsupported : public DiagnosticInfoWithLocationBase {
 private:
   Twine Msg;
 
 public:
-  /// \p Fn is the function where the diagnostic is being emitted. \p DLoc is
+  /// \p Fn is the function where the diagnostic is being emitted. \p Loc is
   /// the location information to use in the diagnostic. If line table
   /// information is available, the diagnostic will include the source code
   /// location. \p Msg is the message to show. Note that this class does not
   /// copy this message, so this reference must be valid for the whole life time
   /// of the diagnostic.
-  DiagnosticInfoUnsupported(const Function &Fn, const Twine &Msg,
-                            DebugLoc DLoc = DebugLoc(),
-                            DiagnosticSeverity Severity = DS_Error)
-      : DiagnosticInfoWithDebugLocBase(DK_Unsupported, Severity, Fn, DLoc),
+  DiagnosticInfoUnsupported(
+      const Function &Fn, const Twine &Msg,
+      const DiagnosticLocation &Loc = DiagnosticLocation(),
+      DiagnosticSeverity Severity = DS_Error)
+      : DiagnosticInfoWithLocationBase(DK_Unsupported, Severity, Fn, Loc),
         Msg(Msg) {}
 
   static bool classof(const DiagnosticInfo *DI) {
@@ -926,19 +1012,6 @@ public:
 
   void print(DiagnosticPrinter &DP) const override;
 };
-
-/// Emit a warning when loop vectorization is specified but fails. \p Fn is the
-/// function triggering the warning, \p DLoc is the debug location where the
-/// diagnostic is generated. \p Msg is the message string to use.
-void emitLoopVectorizeWarning(LLVMContext &Ctx, const Function &Fn,
-                              const DebugLoc &DLoc, const Twine &Msg);
-
-/// Emit a warning when loop interleaving is specified but fails. \p Fn is the
-/// function triggering the warning, \p DLoc is the debug location where the
-/// diagnostic is generated. \p Msg is the message string to use.
-void emitLoopInterleaveWarning(LLVMContext &Ctx, const Function &Fn,
-                               const DebugLoc &DLoc, const Twine &Msg);
-
 } // end namespace llvm
 
 #endif // LLVM_IR_DIAGNOSTICINFO_H
diff --git a/include/llvm/IR/Dominators.h b/include/llvm/IR/Dominators.h
index e39ef64e699b340a9c2adc84a207c24d6e638058..cae03d33a7eedc02bbb68cd83bea14b18b7dfcdc 100644
--- a/include/llvm/IR/Dominators.h
+++ b/include/llvm/IR/Dominators.h
@@ -16,17 +16,21 @@
 #define LLVM_IR_DOMINATORS_H
 
 #include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/GraphTraits.h"
-#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Hashing.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/GenericDomTree.h"
+#include <utility>
 
 namespace llvm {
 
 class Function;
-class BasicBlock;
+class Instruction;
+class Module;
 class raw_ostream;
 
 extern template class DomTreeNodeBase<BasicBlock>;
@@ -43,24 +47,37 @@ typedef DomTreeNodeBase<BasicBlock> DomTreeNode;
 class BasicBlockEdge {
   const BasicBlock *Start;
   const BasicBlock *End;
+
 public:
   BasicBlockEdge(const BasicBlock *Start_, const BasicBlock *End_) :
-    Start(Start_), End(End_) { }
+    Start(Start_), End(End_) {}
+
+  BasicBlockEdge(const std::pair<BasicBlock *, BasicBlock *> &Pair)
+      : Start(Pair.first), End(Pair.second) {}
+
+  BasicBlockEdge(const std::pair<const BasicBlock *, const BasicBlock *> &Pair)
+      : Start(Pair.first), End(Pair.second) {}
+
   const BasicBlock *getStart() const {
     return Start;
   }
+
   const BasicBlock *getEnd() const {
     return End;
   }
+
   bool isSingleEdge() const;
 };
 
 template <> struct DenseMapInfo<BasicBlockEdge> {
-  static unsigned getHashValue(const BasicBlockEdge *V);
   typedef DenseMapInfo<const BasicBlock *> BBInfo;
+
+  static unsigned getHashValue(const BasicBlockEdge *V);
+
   static inline BasicBlockEdge getEmptyKey() {
     return BasicBlockEdge(BBInfo::getEmptyKey(), BBInfo::getEmptyKey());
   }
+
   static inline BasicBlockEdge getTombstoneKey() {
     return BasicBlockEdge(BBInfo::getTombstoneKey(), BBInfo::getTombstoneKey());
   }
@@ -69,6 +86,7 @@ template <> struct DenseMapInfo<BasicBlockEdge> {
     return hash_combine(BBInfo::getHashValue(Edge.getStart()),
                         BBInfo::getHashValue(Edge.getEnd()));
   }
+
   static bool isEqual(const BasicBlockEdge &LHS, const BasicBlockEdge &RHS) {
     return BBInfo::isEqual(LHS.getStart(), RHS.getStart()) &&
            BBInfo::isEqual(LHS.getEnd(), RHS.getEnd());
@@ -111,14 +129,8 @@ public:
   inline bool compare(const DominatorTree &Other) const {
     const DomTreeNode *R = getRootNode();
     const DomTreeNode *OtherR = Other.getRootNode();
-
-    if (!R || !OtherR || R->getBlock() != OtherR->getBlock())
-      return true;
-
-    if (Base::compare(Other))
-      return true;
-
-    return false;
+    return !R || !OtherR || R->getBlock() != OtherR->getBlock() ||
+           Base::compare(Other);
   }
 
   // Ensure base-class overloads are visible.
@@ -209,6 +221,7 @@ class DominatorTreePrinterPass
 
 public:
   explicit DominatorTreePrinterPass(raw_ostream &OS);
+
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
 };
 
@@ -244,6 +257,6 @@ public:
   void print(raw_ostream &OS, const Module *M = nullptr) const override;
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_DOMINATORS_H
diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h
index 1854d413c627e3c32f8f6590c27c050aee83ed7c..787361ffca039d488731395abecf44a3c6334562 100644
--- a/include/llvm/IR/Function.h
+++ b/include/llvm/IR/Function.h
@@ -18,6 +18,7 @@
 #ifndef LLVM_IR_FUNCTION_H
 #define LLVM_IR_FUNCTION_H
 
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/ilist_node.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/StringRef.h"
@@ -47,23 +48,23 @@ class DISubprogram;
 
 class Function : public GlobalObject, public ilist_node<Function> {
 public:
-  typedef SymbolTableList<Argument> ArgumentListType;
   typedef SymbolTableList<BasicBlock> BasicBlockListType;
 
   // BasicBlock iterators...
   typedef BasicBlockListType::iterator iterator;
   typedef BasicBlockListType::const_iterator const_iterator;
 
-  typedef ArgumentListType::iterator arg_iterator;
-  typedef ArgumentListType::const_iterator const_arg_iterator;
+  typedef Argument *arg_iterator;
+  typedef const Argument *const_arg_iterator;
 
 private:
   // Important things that make up a function!
   BasicBlockListType  BasicBlocks;        ///< The basic blocks
-  mutable ArgumentListType ArgumentList;  ///< The formal arguments
+  mutable Argument *Arguments;            ///< The formal arguments
+  size_t NumArgs;
   std::unique_ptr<ValueSymbolTable>
       SymTab;                             ///< Symbol table of args/instructions
-  AttributeSet AttributeSets;             ///< Parameter attributes
+  AttributeList AttributeSets;            ///< Parameter attributes
 
   /*
    * Value::SubclassData
@@ -102,6 +103,8 @@ private:
 
   void BuildLazyArguments() const;
 
+  void clearArguments();
+
   /// Function ctor - If the (optional) Module argument is specified, the
   /// function is automatically inserted into the end of the function list for
   /// the module.
@@ -121,10 +124,12 @@ public:
 
   // Provide fast operand accessors.
   DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
-  /// Returns the type of the ret val.
-  Type *getReturnType() const;
   /// Returns the FunctionType for me.
-  FunctionType *getFunctionType() const;
+  FunctionType *getFunctionType() const {
+    return cast<FunctionType>(getValueType());
+  }
+  /// Returns the type of the ret val.
+  Type *getReturnType() const { return getFunctionType()->getReturnType(); }
 
   /// getContext - Return a reference to the LLVMContext associated with this
   /// function.
@@ -132,10 +137,16 @@ public:
 
   /// isVarArg - Return true if this function takes a variable number of
   /// arguments.
-  bool isVarArg() const;
+  bool isVarArg() const { return getFunctionType()->isVarArg(); }
 
-  bool isMaterializable() const;
-  void setIsMaterializable(bool V);
+  bool isMaterializable() const {
+    return getGlobalObjectSubClassData() & (1 << IsMaterializableBit);
+  }
+  void setIsMaterializable(bool V) {
+    unsigned Mask = 1 << IsMaterializableBit;
+    setGlobalObjectSubClassData((~Mask & getGlobalObjectSubClassData()) |
+                                (V ? Mask : 0u));
+  }
 
   /// getIntrinsicID - This method returns the ID number of the specified
   /// function, or Intrinsic::not_intrinsic if the function is not an
@@ -173,42 +184,45 @@ public:
   }
 
   /// @brief Return the attribute list for this Function.
-  AttributeSet getAttributes() const { return AttributeSets; }
+  AttributeList getAttributes() const { return AttributeSets; }
 
   /// @brief Set the attribute list for this Function.
-  void setAttributes(AttributeSet Attrs) { AttributeSets = Attrs; }
+  void setAttributes(AttributeList Attrs) { AttributeSets = Attrs; }
 
   /// @brief Add function attributes to this function.
   void addFnAttr(Attribute::AttrKind Kind) {
-    addAttribute(AttributeSet::FunctionIndex, Kind);
+    addAttribute(AttributeList::FunctionIndex, Kind);
   }
 
   /// @brief Add function attributes to this function.
   void addFnAttr(StringRef Kind, StringRef Val = StringRef()) {
-    addAttribute(AttributeSet::FunctionIndex,
+    addAttribute(AttributeList::FunctionIndex,
                  Attribute::get(getContext(), Kind, Val));
   }
 
   void addFnAttr(Attribute Attr) {
-    addAttribute(AttributeSet::FunctionIndex, Attr);
+    addAttribute(AttributeList::FunctionIndex, Attr);
   }
 
   /// @brief Remove function attributes from this function.
   void removeFnAttr(Attribute::AttrKind Kind) {
-    removeAttribute(AttributeSet::FunctionIndex, Kind);
+    removeAttribute(AttributeList::FunctionIndex, Kind);
   }
 
   /// @brief Remove function attribute from this function.
   void removeFnAttr(StringRef Kind) {
     setAttributes(AttributeSets.removeAttribute(
-        getContext(), AttributeSet::FunctionIndex, Kind));
+        getContext(), AttributeList::FunctionIndex, Kind));
   }
 
   /// \brief Set the entry count for this function.
   ///
   /// Entry count is the number of times this function was executed based on
-  /// pgo data.
-  void setEntryCount(uint64_t Count);
+  /// pgo data. \p Imports points to a set of GUIDs that needs to be imported
+  /// by the function for sample PGO, to enable the same inlines as the
+  /// profiled optimized binary.
+  void setEntryCount(uint64_t Count,
+                     const DenseSet<GlobalValue::GUID> *Imports = nullptr);
 
   /// \brief Get the entry count for this function.
   ///
@@ -216,6 +230,10 @@ public:
   /// pgo data.
   Optional<uint64_t> getEntryCount() const;
 
+  /// Returns the set of GUIDs that needs to be imported to the function for
+  /// sample PGO, to enable the same inlines as the profiled optimized binary.
+  DenseSet<GlobalValue::GUID> getImportGUIDs() const;
+
   /// Set the section prefix for this function.
   void setSectionPrefix(StringRef Prefix);
 
@@ -232,17 +250,17 @@ public:
 
   /// @brief Return the attribute for the given attribute kind.
   Attribute getFnAttribute(Attribute::AttrKind Kind) const {
-    return getAttribute(AttributeSet::FunctionIndex, Kind);
+    return getAttribute(AttributeList::FunctionIndex, Kind);
   }
   Attribute getFnAttribute(StringRef Kind) const {
-    return getAttribute(AttributeSet::FunctionIndex, Kind);
+    return getAttribute(AttributeList::FunctionIndex, Kind);
   }
 
   /// \brief Return the stack alignment for the function.
   unsigned getFnStackAlignment() const {
     if (!hasFnAttribute(Attribute::StackAlignment))
       return 0;
-    return AttributeSets.getStackAlignment(AttributeSet::FunctionIndex);
+    return AttributeSets.getStackAlignment(AttributeList::FunctionIndex);
   }
 
   /// hasGC/getGC/setGC/clearGC - The name of the garbage collection algorithm
@@ -261,7 +279,7 @@ public:
   void addAttribute(unsigned i, Attribute Attr);
 
   /// @brief adds the attributes to the list of attributes.
-  void addAttributes(unsigned i, AttributeSet Attrs);
+  void addAttributes(unsigned i, AttributeList Attrs);
 
   /// @brief removes the attribute from the list of attributes.
   void removeAttribute(unsigned i, Attribute::AttrKind Kind);
@@ -270,7 +288,7 @@ public:
   void removeAttribute(unsigned i, StringRef Kind);
 
   /// @brief removes the attributes from the list of attributes.
-  void removeAttributes(unsigned i, AttributeSet Attrs);
+  void removeAttributes(unsigned i, AttributeList Attrs);
 
   /// @brief check if an attributes is in the list of attributes.
   bool hasAttribute(unsigned i, Attribute::AttrKind Kind) const {
@@ -496,19 +514,6 @@ public:
   /// Get the underlying elements of the Function... the basic block list is
   /// empty for external functions.
   ///
-  const ArgumentListType &getArgumentList() const {
-    CheckLazyArguments();
-    return ArgumentList;
-  }
-  ArgumentListType &getArgumentList() {
-    CheckLazyArguments();
-    return ArgumentList;
-  }
-
-  static ArgumentListType Function::*getSublistAccess(Argument*) {
-    return &Function::ArgumentList;
-  }
-
   const BasicBlockListType &getBasicBlockList() const { return BasicBlocks; }
         BasicBlockListType &getBasicBlockList()       { return BasicBlocks; }
 
@@ -549,20 +554,20 @@ public:
 
   arg_iterator arg_begin() {
     CheckLazyArguments();
-    return ArgumentList.begin();
+    return Arguments;
   }
   const_arg_iterator arg_begin() const {
     CheckLazyArguments();
-    return ArgumentList.begin();
+    return Arguments;
   }
 
   arg_iterator arg_end() {
     CheckLazyArguments();
-    return ArgumentList.end();
+    return Arguments + NumArgs;
   }
   const_arg_iterator arg_end() const {
     CheckLazyArguments();
-    return ArgumentList.end();
+    return Arguments + NumArgs;
   }
 
   iterator_range<arg_iterator> args() {
@@ -574,8 +579,8 @@ public:
 
 /// @}
 
-  size_t arg_size() const;
-  bool arg_empty() const;
+  size_t arg_size() const { return NumArgs; }
+  bool arg_empty() const { return arg_size() == 0; }
 
   /// \brief Check whether this function has a personality function.
   bool hasPersonalityFn() const {
@@ -671,6 +676,9 @@ public:
   /// to \a DISubprogram.
   DISubprogram *getSubprogram() const;
 
+  /// Returns true if we should emit debug info for profiling.
+  bool isDebugInfoForProfiling() const;
+
 private:
   void allocHungoffUselist();
   template<int Idx> void setHungoffOperand(Constant *C);
diff --git a/include/llvm/IR/GlobalIndirectSymbol.h b/include/llvm/IR/GlobalIndirectSymbol.h
index 671309e85d196ae8c1f972372f9ec44a83accef3..212703af7101910fd599526f6f644f0d688568ab 100644
--- a/include/llvm/IR/GlobalIndirectSymbol.h
+++ b/include/llvm/IR/GlobalIndirectSymbol.h
@@ -48,27 +48,31 @@ public:
     setOperand(0, Symbol);
   }
   const Constant *getIndirectSymbol() const {
-    return const_cast<GlobalIndirectSymbol *>(this)->getIndirectSymbol();
+    return getOperand(0);
   }
   Constant *getIndirectSymbol() {
-    return getOperand(0);
+    return const_cast<Constant *>(
+          static_cast<const GlobalIndirectSymbol *>(this)->getIndirectSymbol());
   }
 
   const GlobalObject *getBaseObject() const {
-    return const_cast<GlobalIndirectSymbol *>(this)->getBaseObject();
+    return dyn_cast<GlobalObject>(getIndirectSymbol()->stripInBoundsOffsets());
   }
   GlobalObject *getBaseObject() {
-    return dyn_cast<GlobalObject>(getIndirectSymbol()->stripInBoundsOffsets());
+    return const_cast<GlobalObject *>(
+              static_cast<const GlobalIndirectSymbol *>(this)->getBaseObject());
   }
 
   const GlobalObject *getBaseObject(const DataLayout &DL, APInt &Offset) const {
-    return const_cast<GlobalIndirectSymbol *>(this)->getBaseObject(DL, Offset);
-  }
-  GlobalObject *getBaseObject(const DataLayout &DL, APInt &Offset) {
     return dyn_cast<GlobalObject>(
         getIndirectSymbol()->stripAndAccumulateInBoundsConstantOffsets(DL,
                                                                        Offset));
   }
+  GlobalObject *getBaseObject(const DataLayout &DL, APInt &Offset) {
+    return const_cast<GlobalObject *>(
+                                 static_cast<const GlobalIndirectSymbol *>(this)
+                                   ->getBaseObject(DL, Offset));
+  }
 
   // Methods for support type inquiry through isa, cast, and dyn_cast:
   static inline bool classof(const Value *V) {
diff --git a/include/llvm/IR/GlobalObject.h b/include/llvm/IR/GlobalObject.h
index 1057f564aab3b35207739c245433646312c2f71e..f3789bafefe3e8830cc2904d4b2fe9321fe2051c 100644
--- a/include/llvm/IR/GlobalObject.h
+++ b/include/llvm/IR/GlobalObject.h
@@ -63,8 +63,17 @@ public:
   }
   void setAlignment(unsigned Align);
 
-  unsigned getGlobalObjectSubClassData() const;
-  void setGlobalObjectSubClassData(unsigned Val);
+  unsigned getGlobalObjectSubClassData() const {
+    unsigned ValueData = getGlobalValueSubClassData();
+    return ValueData >> GlobalObjectBits;
+  }
+
+  void setGlobalObjectSubClassData(unsigned Val) {
+    unsigned OldData = getGlobalValueSubClassData();
+    setGlobalValueSubClassData((OldData & GlobalObjectMask) |
+                               (Val << GlobalObjectBits));
+    assert(getGlobalObjectSubClassData() == Val && "representation error");
+  }
 
   /// Check if this global has a custom object file section.
   ///
diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h
index c6398aaa4847f8ca2d96873cd4b1dff3c90b6d3c..bb30fa8be8674430690ba6b2a7e873f1e5d8bfe6 100644
--- a/include/llvm/IR/GlobalValue.h
+++ b/include/llvm/IR/GlobalValue.h
@@ -211,9 +211,10 @@ public:
   }
 
   bool hasComdat() const { return getComdat() != nullptr; }
-  Comdat *getComdat();
-  const Comdat *getComdat() const {
-    return const_cast<GlobalValue *>(this)->getComdat();
+  const Comdat *getComdat() const;
+  Comdat *getComdat() {
+    return const_cast<Comdat *>(
+                           static_cast<const GlobalValue *>(this)->getComdat());
   }
 
   VisibilityTypes getVisibility() const { return VisibilityTypes(Visibility); }
@@ -514,10 +515,11 @@ public:
   // increased.
   bool canIncreaseAlignment() const;
 
-  const GlobalObject *getBaseObject() const {
-    return const_cast<GlobalValue *>(this)->getBaseObject();
+  const GlobalObject *getBaseObject() const;
+  GlobalObject *getBaseObject() {
+    return const_cast<GlobalObject *>(
+                       static_cast<const GlobalValue *>(this)->getBaseObject());
   }
-  GlobalObject *getBaseObject();
 
   /// Returns whether this is a reference to an absolute symbol.
   bool isAbsoluteSymbolRef() const;
diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h
index 5a41b5a2ed631b97e32c365817cb6684505f58ff..bc689f3b01d79c7ed9379cdc4dbc297788dc6286 100644
--- a/include/llvm/IR/IRBuilder.h
+++ b/include/llvm/IR/IRBuilder.h
@@ -33,6 +33,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
@@ -560,6 +561,22 @@ public:
                              Type *ResultType,
                              const Twine &Name = "");
 
+  /// Create a call to intrinsic \p ID with 2 operands which is mangled on the
+  /// first type.
+  CallInst *CreateBinaryIntrinsic(Intrinsic::ID ID,
+                                  Value *LHS, Value *RHS,
+                                  const Twine &Name = "");
+
+  /// Create call to the minnum intrinsic.
+  CallInst *CreateMinNum(Value *LHS, Value *RHS, const Twine &Name = "") {
+    return CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS, Name);
+  }
+
+  /// Create a call to the maxnum intrinsic with operands \p LHS and \p RHS.
+  CallInst *CreateMaxNum(Value *LHS, Value *RHS, const Twine &Name = "") {
+    return CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS, Name);
+  }
+
 private:
   /// \brief Create a call to a masked intrinsic with given Id.
   CallInst *CreateMaskedIntrinsic(Intrinsic::ID Id, ArrayRef<Value *> Ops,
@@ -1073,9 +1090,15 @@ public:
   // Instruction creation methods: Memory Instructions
   //===--------------------------------------------------------------------===//
 
+  AllocaInst *CreateAlloca(Type *Ty, unsigned AddrSpace,
+                           Value *ArraySize = nullptr, const Twine &Name = "") {
+    return Insert(new AllocaInst(Ty, AddrSpace, ArraySize), Name);
+  }
+
   AllocaInst *CreateAlloca(Type *Ty, Value *ArraySize = nullptr,
                            const Twine &Name = "") {
-    return Insert(new AllocaInst(Ty, ArraySize), Name);
+    const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
+    return Insert(new AllocaInst(Ty, DL.getAllocaAddrSpace(), ArraySize), Name);
   }
   // \brief Provided to resolve 'CreateLoad(Ptr, "...")' correctly, instead of
   // converting the string to 'bool' for the isVolatile parameter.
@@ -1790,24 +1813,16 @@ public:
     return V;
   }
 
-  /// \brief Create an assume intrinsic call that represents an alignment
-  /// assumption on the provided pointer.
-  ///
-  /// An optional offset can be provided, and if it is provided, the offset
-  /// must be subtracted from the provided pointer to get the pointer with the
-  /// specified alignment.
-  CallInst *CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue,
-                                      unsigned Alignment,
-                                      Value *OffsetValue = nullptr) {
-    assert(isa<PointerType>(PtrValue->getType()) &&
-           "trying to create an alignment assumption on a non-pointer?");
-
-    PointerType *PtrTy = cast<PointerType>(PtrValue->getType());
-    Type *IntPtrTy = getIntPtrTy(DL, PtrTy->getAddressSpace());
+private:
+  /// \brief Helper function that creates an assume intrinsic call that
+  /// represents an alignment assumption on the provided Ptr, Mask, Type
+  /// and Offset.
+  CallInst *CreateAlignmentAssumptionHelper(const DataLayout &DL,
+                                            Value *PtrValue, Value *Mask,
+                                            Type *IntPtrTy,
+                                            Value *OffsetValue) {
     Value *PtrIntValue = CreatePtrToInt(PtrValue, IntPtrTy, "ptrint");
 
-    Value *Mask = ConstantInt::get(IntPtrTy,
-      Alignment > 0 ? Alignment - 1 : 0);
     if (OffsetValue) {
       bool IsOffsetZero = false;
       if (ConstantInt *CI = dyn_cast<ConstantInt>(OffsetValue))
@@ -1824,9 +1839,60 @@ public:
     Value *Zero = ConstantInt::get(IntPtrTy, 0);
     Value *MaskedPtr = CreateAnd(PtrIntValue, Mask, "maskedptr");
     Value *InvCond = CreateICmpEQ(MaskedPtr, Zero, "maskcond");
-
     return CreateAssumption(InvCond);
   }
+
+public:
+  /// \brief Create an assume intrinsic call that represents an alignment
+  /// assumption on the provided pointer.
+  ///
+  /// An optional offset can be provided, and if it is provided, the offset
+  /// must be subtracted from the provided pointer to get the pointer with the
+  /// specified alignment.
+  CallInst *CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue,
+                                      unsigned Alignment,
+                                      Value *OffsetValue = nullptr) {
+    assert(isa<PointerType>(PtrValue->getType()) &&
+           "trying to create an alignment assumption on a non-pointer?");
+    PointerType *PtrTy = cast<PointerType>(PtrValue->getType());
+    Type *IntPtrTy = getIntPtrTy(DL, PtrTy->getAddressSpace());
+
+    Value *Mask = ConstantInt::get(IntPtrTy, Alignment > 0 ? Alignment - 1 : 0);
+    return CreateAlignmentAssumptionHelper(DL, PtrValue, Mask, IntPtrTy,
+                                           OffsetValue);
+  }
+
+  /// \brief Create an assume intrinsic call that represents an alignment
+  /// assumption on the provided pointer.
+  ///
+  /// An optional offset can be provided, and if it is provided, the offset
+  /// must be subtracted from the provided pointer to get the pointer with the
+  /// specified alignment.
+  ///
+  /// This overload handles the condition where the Alignment is dependent
+  /// on an existing value rather than a static value.
+  CallInst *CreateAlignmentAssumption(const DataLayout &DL, Value *PtrValue,
+                                      Value *Alignment,
+                                      Value *OffsetValue = nullptr) {
+    assert(isa<PointerType>(PtrValue->getType()) &&
+           "trying to create an alignment assumption on a non-pointer?");
+    PointerType *PtrTy = cast<PointerType>(PtrValue->getType());
+    Type *IntPtrTy = getIntPtrTy(DL, PtrTy->getAddressSpace());
+
+    if (Alignment->getType() != IntPtrTy)
+      Alignment = CreateIntCast(Alignment, IntPtrTy, /*isSigned*/ true,
+                                "alignmentcast");
+    Value *IsPositive =
+        CreateICmp(CmpInst::ICMP_SGT, Alignment,
+                   ConstantInt::get(Alignment->getType(), 0), "ispositive");
+    Value *PositiveMask =
+        CreateSub(Alignment, ConstantInt::get(IntPtrTy, 1), "positivemask");
+    Value *Mask = CreateSelect(IsPositive, PositiveMask,
+                               ConstantInt::get(IntPtrTy, 0), "mask");
+
+    return CreateAlignmentAssumptionHelper(DL, PtrValue, Mask, IntPtrTy,
+                                           OffsetValue);
+  }
 };
 
 // Create wrappers for C Binding types (see CBindingWrapping.h).
diff --git a/include/llvm/IR/InlineAsm.h b/include/llvm/IR/InlineAsm.h
index f95509b9b09ab25e0e51c06d79fcc20e311d3550..5d2f72d211ff7e82e9d37d5acd4f3b32c7afba89 100644
--- a/include/llvm/IR/InlineAsm.h
+++ b/include/llvm/IR/InlineAsm.h
@@ -1,4 +1,4 @@
-//===-- llvm/InlineAsm.h - Class to represent inline asm strings-*- C++ -*-===//
+//===- llvm/InlineAsm.h - Class to represent inline asm strings -*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -102,12 +102,14 @@ public:
     /// input constraint is required to match it (e.g. "0").  The value is the
     /// constraint number that matches this one (for example, if this is
     /// constraint #0 and constraint #4 has the value "0", this will be 4).
-    signed char MatchingInput;
+    signed char MatchingInput = -1;
+
     /// Code - The constraint code, either the register name (in braces) or the
     /// constraint letter/number.
     ConstraintCodeVector Codes;
+
     /// Default constructor.
-    SubConstraintInfo() : MatchingInput(-1) {}
+    SubConstraintInfo() = default;
   };
 
   typedef std::vector<SubConstraintInfo> SubConstraintInfoVector;
@@ -117,17 +119,17 @@ public:
   struct ConstraintInfo {
     /// Type - The basic type of the constraint: input/output/clobber
     ///
-    ConstraintPrefix Type;
+    ConstraintPrefix Type = isInput;
 
     /// isEarlyClobber - "&": output operand writes result before inputs are all
     /// read.  This is only ever set for an output operand.
-    bool isEarlyClobber;
+    bool isEarlyClobber = false;
 
     /// MatchingInput - If this is not -1, this is an output constraint where an
     /// input constraint is required to match it (e.g. "0").  The value is the
     /// constraint number that matches this one (for example, if this is
     /// constraint #0 and constraint #4 has the value "0", this will be 4).
-    signed char MatchingInput;
+    signed char MatchingInput = -1;
 
     /// hasMatchingInput - Return true if this is an output constraint that has
     /// a matching input constraint.
@@ -135,30 +137,30 @@ public:
 
     /// isCommutative - This is set to true for a constraint that is commutative
     /// with the next operand.
-    bool isCommutative;
+    bool isCommutative = false;
 
     /// isIndirect - True if this operand is an indirect operand.  This means
     /// that the address of the source or destination is present in the call
     /// instruction, instead of it being returned or passed in explicitly.  This
     /// is represented with a '*' in the asm string.
-    bool isIndirect;
+    bool isIndirect = false;
 
     /// Code - The constraint code, either the register name (in braces) or the
     /// constraint letter/number.
     ConstraintCodeVector Codes;
 
     /// isMultipleAlternative - '|': has multiple-alternative constraints.
-    bool isMultipleAlternative;
+    bool isMultipleAlternative = false;
 
     /// multipleAlternatives - If there are multiple alternative constraints,
     /// this array will contain them.  Otherwise it will be empty.
     SubConstraintInfoVector multipleAlternatives;
 
     /// The currently selected alternative constraint index.
-    unsigned currentAlternativeIndex;
+    unsigned currentAlternativeIndex = 0;
 
     /// Default constructor.
-    ConstraintInfo();
+    ConstraintInfo() = default;
 
     /// Parse - Analyze the specified string (e.g. "=*&{eax}") and fill in the
     /// fields in this structure.  If the constraint string is not understood,
diff --git a/include/llvm/IR/InstVisitor.h b/include/llvm/IR/InstVisitor.h
index 088d3e0fbfa56b5209f643678bec52b75e1d6e47..55579819fd34f4b3917c46d0ee2cbc8e2c20eb40 100644
--- a/include/llvm/IR/InstVisitor.h
+++ b/include/llvm/IR/InstVisitor.h
@@ -116,6 +116,9 @@ public:
   // visit - Finally, code to visit an instruction...
   //
   RetTy visit(Instruction &I) {
+    static_assert(std::is_base_of<InstVisitor, SubClass>::value,
+                  "Must pass the derived type to this template!");
+
     switch (I.getOpcode()) {
     default: llvm_unreachable("Unknown instruction type encountered!");
       // Build the switch statement using the Instruction.def file...
diff --git a/include/llvm/IR/Instruction.h b/include/llvm/IR/Instruction.h
index fd7c54d69b63088ce6e31cf49154e3467dfd503f..90c3175122fd8f240d196bc6bec3b58e91efdc66 100644
--- a/include/llvm/IR/Instruction.h
+++ b/include/llvm/IR/Instruction.h
@@ -68,14 +68,20 @@ public:
   /// Note: this is undefined behavior if the instruction does not have a
   /// parent, or the parent basic block does not have a parent function.
   const Module *getModule() const;
-  Module *getModule();
+  Module *getModule() {
+    return const_cast<Module *>(
+                           static_cast<const Instruction *>(this)->getModule());
+  }
 
   /// Return the function this instruction belongs to.
   ///
   /// Note: it is undefined behavior to call this on an instruction not
   /// currently inserted into a function.
   const Function *getFunction() const;
-  Function *getFunction();
+  Function *getFunction() {
+    return const_cast<Function *>(
+                         static_cast<const Instruction *>(this)->getFunction());
+  }
 
   /// This method unlinks 'this' from the containing basic block, but does not
   /// delete it.
@@ -252,6 +258,12 @@ public:
   /// Returns false if no metadata was found.
   bool extractProfTotalWeight(uint64_t &TotalVal) const;
 
+  /// Updates branch_weights metadata by scaling it by \p S / \p T.
+  void updateProfWeight(uint64_t S, uint64_t T);
+
+  /// Sets the branch_weights metadata to \p W for CallInst.
+  void setProfWeight(uint64_t W);
+
   /// Set the debug location information for this instruction.
   void setDebugLoc(DebugLoc Loc) { DbgLoc = std::move(Loc); }
 
@@ -276,6 +288,10 @@ public:
   /// Determine whether the no signed wrap flag is set.
   bool hasNoSignedWrap() const;
 
+  /// Drops flags that may cause this instruction to evaluate to poison despite
+  /// having non-poison inputs.
+  void dropPoisonGeneratingFlags();
+
   /// Determine whether the exact flag is set.
   bool isExact() const;
 
@@ -329,6 +345,9 @@ public:
   /// Determine whether the allow-reciprocal flag is set.
   bool hasAllowReciprocal() const;
 
+  /// Determine whether the allow-contract flag is set.
+  bool hasAllowContract() const;
+
   /// Convenience function for getting all the fast-math flags, which must be an
   /// operator which supports these flags. See LangRef.html for the meaning of
   /// these flags.
@@ -372,18 +391,30 @@ public:
   ///
   /// In LLVM, the Add, Mul, And, Or, and Xor operators are associative.
   ///
-  bool isAssociative() const;
-  static bool isAssociative(unsigned op);
+  bool isAssociative() const LLVM_READONLY;
+  static bool isAssociative(unsigned Opcode) {
+    return Opcode == And || Opcode == Or || Opcode == Xor ||
+           Opcode == Add || Opcode == Mul;
+  }
 
   /// Return true if the instruction is commutative:
   ///
   ///   Commutative operators satisfy: (x op y) === (y op x)
   ///
-  /// In LLVM, these are the associative operators, plus SetEQ and SetNE, when
+  /// In LLVM, these are Add, FAdd, Mul, FMul, And, Or, and Xor, when
   /// applied to any type.
   ///
   bool isCommutative() const { return isCommutative(getOpcode()); }
-  static bool isCommutative(unsigned op);
+  static bool isCommutative(unsigned Opcode) {
+    // Opcodes listed here are treated as commutative; all others are not.
+    switch (Opcode) {
+    case Add: case FAdd: case Mul: case FMul:
+    case And: case Or: case Xor:
+      return true;
+    default:
+      return false;
+    }
+  }
 
   /// Return true if the instruction is idempotent:
   ///
@@ -392,7 +423,9 @@ public:
   /// In LLVM, the And and Or operators are idempotent.
   ///
   bool isIdempotent() const { return isIdempotent(getOpcode()); }
-  static bool isIdempotent(unsigned op);
+  static bool isIdempotent(unsigned Opcode) {
+    return Opcode == And || Opcode == Or;
+  }
 
   /// Return true if the instruction is nilpotent:
   ///
@@ -404,7 +437,9 @@ public:
   /// In LLVM, the Xor operator is nilpotent.
   ///
   bool isNilpotent() const { return isNilpotent(getOpcode()); }
-  static bool isNilpotent(unsigned op);
+  static bool isNilpotent(unsigned Opcode) {
+    return Opcode == Xor;
+  }
 
   /// Return true if this instruction may modify memory.
   bool mayWriteToMemory() const;
diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h
index a5d78a08171af515d6370a38502b1f736dfbf770..9e4b49925a691fe775689a23e58a34eee6167a66 100644
--- a/include/llvm/IR/Instructions.h
+++ b/include/llvm/IR/Instructions.h
@@ -67,18 +67,21 @@ protected:
   AllocaInst *cloneImpl() const;
 
 public:
-  explicit AllocaInst(Type *Ty, Value *ArraySize = nullptr,
+  explicit AllocaInst(Type *Ty, unsigned AddrSpace,
+                      Value *ArraySize = nullptr,
                       const Twine &Name = "",
                       Instruction *InsertBefore = nullptr);
-  AllocaInst(Type *Ty, Value *ArraySize,
+  AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
              const Twine &Name, BasicBlock *InsertAtEnd);
 
-  AllocaInst(Type *Ty, const Twine &Name, Instruction *InsertBefore = nullptr);
-  AllocaInst(Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd);
+  AllocaInst(Type *Ty, unsigned AddrSpace,
+             const Twine &Name, Instruction *InsertBefore = nullptr);
+  AllocaInst(Type *Ty, unsigned AddrSpace,
+             const Twine &Name, BasicBlock *InsertAtEnd);
 
-  AllocaInst(Type *Ty, Value *ArraySize, unsigned Align,
+  AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, unsigned Align,
              const Twine &Name = "", Instruction *InsertBefore = nullptr);
-  AllocaInst(Type *Ty, Value *ArraySize, unsigned Align,
+  AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, unsigned Align,
              const Twine &Name, BasicBlock *InsertAtEnd);
 
   // Out of line virtual method, so the vtable, etc. has a home.
@@ -958,6 +961,14 @@ public:
   inline op_iterator       idx_end()         { return op_end(); }
   inline const_op_iterator idx_end()   const { return op_end(); }
 
+  inline iterator_range<op_iterator> indices() {
+    return make_range(idx_begin(), idx_end());
+  }
+
+  inline iterator_range<const_op_iterator> indices() const {
+    return make_range(idx_begin(), idx_end());
+  }
+
   Value *getPointerOperand() {
     return getOperand(0);
   }
@@ -1354,7 +1365,7 @@ class CallInst : public Instruction,
                  public OperandBundleUser<CallInst, User::op_iterator> {
   friend class OperandBundleUser<CallInst, User::op_iterator>;
 
-  AttributeSet AttributeList; ///< parameter attributes for call
+  AttributeList Attrs; ///< parameter attributes for call
   FunctionType *FTy;
 
   CallInst(const CallInst &CI);
@@ -1633,11 +1644,11 @@ public:
 
   /// Return the parameter attributes for this call.
   ///
-  AttributeSet getAttributes() const { return AttributeList; }
+  AttributeList getAttributes() const { return Attrs; }
 
   /// Set the parameter attributes for this call.
   ///
-  void setAttributes(AttributeSet Attrs) { AttributeList = Attrs; }
+  void setAttributes(AttributeList A) { Attrs = A; }
 
   /// adds the attribute to the list of attributes.
   void addAttribute(unsigned i, Attribute::AttrKind Kind);
@@ -1700,26 +1711,26 @@ public:
 
   /// Extract the alignment for a call or parameter (0=unknown).
   unsigned getParamAlignment(unsigned i) const {
-    return AttributeList.getParamAlignment(i);
+    return Attrs.getParamAlignment(i);
   }
 
   /// Extract the number of dereferenceable bytes for a call or
   /// parameter (0=unknown).
   uint64_t getDereferenceableBytes(unsigned i) const {
-    return AttributeList.getDereferenceableBytes(i);
+    return Attrs.getDereferenceableBytes(i);
   }
 
   /// Extract the number of dereferenceable_or_null bytes for a call or
   /// parameter (0=unknown).
   uint64_t getDereferenceableOrNullBytes(unsigned i) const {
-    return AttributeList.getDereferenceableOrNullBytes(i);
+    return Attrs.getDereferenceableOrNullBytes(i);
   }
 
   /// @brief Determine if the parameter or return value is marked with NoAlias
   /// attribute.
   /// @param n The parameter to check. 1 is the first parameter, 0 is the return
   bool doesNotAlias(unsigned n) const {
-    return AttributeList.hasAttribute(n, Attribute::NoAlias);
+    return Attrs.hasAttribute(n, Attribute::NoAlias);
   }
 
   /// Return true if the call should not be treated as a call to a
@@ -1732,7 +1743,7 @@ public:
   /// Return true if the call should not be inlined.
   bool isNoInline() const { return hasFnAttr(Attribute::NoInline); }
   void setIsNoInline() {
-    addAttribute(AttributeSet::FunctionIndex, Attribute::NoInline);
+    addAttribute(AttributeList::FunctionIndex, Attribute::NoInline);
   }
 
   /// Return true if the call can return twice
@@ -1740,7 +1751,7 @@ public:
     return hasFnAttr(Attribute::ReturnsTwice);
   }
   void setCanReturnTwice() {
-    addAttribute(AttributeSet::FunctionIndex, Attribute::ReturnsTwice);
+    addAttribute(AttributeList::FunctionIndex, Attribute::ReturnsTwice);
   }
 
   /// Determine if the call does not access memory.
@@ -1748,7 +1759,7 @@ public:
     return hasFnAttr(Attribute::ReadNone);
   }
   void setDoesNotAccessMemory() {
-    addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
+    addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
   }
 
   /// Determine if the call does not access or only reads memory.
@@ -1756,7 +1767,7 @@ public:
     return doesNotAccessMemory() || hasFnAttr(Attribute::ReadOnly);
   }
   void setOnlyReadsMemory() {
-    addAttribute(AttributeSet::FunctionIndex, Attribute::ReadOnly);
+    addAttribute(AttributeList::FunctionIndex, Attribute::ReadOnly);
   }
 
   /// Determine if the call does not access or only writes memory.
@@ -1764,7 +1775,7 @@ public:
     return doesNotAccessMemory() || hasFnAttr(Attribute::WriteOnly);
   }
   void setDoesNotReadMemory() {
-    addAttribute(AttributeSet::FunctionIndex, Attribute::WriteOnly);
+    addAttribute(AttributeList::FunctionIndex, Attribute::WriteOnly);
   }
 
   /// @brief Determine if the call can access memmory only using pointers based
@@ -1773,34 +1784,34 @@ public:
     return hasFnAttr(Attribute::ArgMemOnly);
   }
   void setOnlyAccessesArgMemory() {
-    addAttribute(AttributeSet::FunctionIndex, Attribute::ArgMemOnly);
+    addAttribute(AttributeList::FunctionIndex, Attribute::ArgMemOnly);
   }
 
   /// Determine if the call cannot return.
   bool doesNotReturn() const { return hasFnAttr(Attribute::NoReturn); }
   void setDoesNotReturn() {
-    addAttribute(AttributeSet::FunctionIndex, Attribute::NoReturn);
+    addAttribute(AttributeList::FunctionIndex, Attribute::NoReturn);
   }
 
   /// Determine if the call cannot unwind.
   bool doesNotThrow() const { return hasFnAttr(Attribute::NoUnwind); }
   void setDoesNotThrow() {
-    addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
+    addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
   }
 
   /// Determine if the call cannot be duplicated.
   bool cannotDuplicate() const {return hasFnAttr(Attribute::NoDuplicate); }
   void setCannotDuplicate() {
-    addAttribute(AttributeSet::FunctionIndex, Attribute::NoDuplicate);
+    addAttribute(AttributeList::FunctionIndex, Attribute::NoDuplicate);
   }
 
   /// Determine if the call is convergent
   bool isConvergent() const { return hasFnAttr(Attribute::Convergent); }
   void setConvergent() {
-    addAttribute(AttributeSet::FunctionIndex, Attribute::Convergent);
+    addAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
   }
   void setNotConvergent() {
-    removeAttribute(AttributeSet::FunctionIndex, Attribute::Convergent);
+    removeAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
   }
 
   /// Determine if the call returns a structure through first
@@ -1815,7 +1826,7 @@ public:
 
   /// Determine if any call argument is an aggregate passed by value.
   bool hasByValArgument() const {
-    return AttributeList.hasAttrSomewhere(Attribute::ByVal);
+    return Attrs.hasAttrSomewhere(Attribute::ByVal);
   }
 
   /// Return the function called, or null if this is an
@@ -1858,7 +1869,7 @@ public:
 
 private:
   template <typename AttrKind> bool hasFnAttrImpl(AttrKind Kind) const {
-    if (AttributeList.hasAttribute(AttributeSet::FunctionIndex, Kind))
+    if (Attrs.hasAttribute(AttributeList::FunctionIndex, Kind))
       return true;
 
     // Operand bundles override attributes on the called function, but don't
@@ -1867,7 +1878,8 @@ private:
       return false;
 
     if (const Function *F = getCalledFunction())
-      return F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, Kind);
+      return F->getAttributes().hasAttribute(AttributeList::FunctionIndex,
+                                             Kind);
     return false;
   }
 
@@ -3084,42 +3096,41 @@ public:
   // -2
   static const unsigned DefaultPseudoIndex = static_cast<unsigned>(~0L-1);
 
-  template <class SwitchInstTy, class ConstantIntTy, class BasicBlockTy>
-  class CaseIteratorT {
-  protected:
-    SwitchInstTy *SI;
-    unsigned Index;
+  template <typename CaseHandleT> class CaseIteratorImpl;
 
-  public:
-    typedef CaseIteratorT<SwitchInstTy, ConstantIntTy, BasicBlockTy> Self;
+  /// A handle to a particular switch case. It exposes a convenient interface
+  /// to both the case value and the successor block.
+  ///
+  /// We define this as a template and instantiate it to form both a const and
+  /// non-const handle.
+  template <typename SwitchInstT, typename ConstantIntT, typename BasicBlockT>
+  class CaseHandleImpl {
+    // Directly befriend both const and non-const iterators.
+    friend class SwitchInst::CaseIteratorImpl<
+        CaseHandleImpl<SwitchInstT, ConstantIntT, BasicBlockT>>;
 
-    /// Initializes case iterator for given SwitchInst and for given
-    /// case number.
-    CaseIteratorT(SwitchInstTy *SI, unsigned CaseNum) {
-      this->SI = SI;
-      Index = CaseNum;
-    }
+  protected:
+    // Expose the switch type we're parameterized with to the iterator.
+    typedef SwitchInstT SwitchInstType;
 
-    /// Initializes case iterator for given SwitchInst and for given
-    /// TerminatorInst's successor index.
-    static Self fromSuccessorIndex(SwitchInstTy *SI, unsigned SuccessorIndex) {
-      assert(SuccessorIndex < SI->getNumSuccessors() &&
-             "Successor index # out of range!");
-      return SuccessorIndex != 0 ?
-             Self(SI, SuccessorIndex - 1) :
-             Self(SI, DefaultPseudoIndex);
-    }
+    SwitchInstT *SI;
+    ptrdiff_t Index;
+
+    CaseHandleImpl() = default;
+    CaseHandleImpl(SwitchInstT *SI, ptrdiff_t Index) : SI(SI), Index(Index) {}
 
+  public:
     /// Resolves case value for current case.
-    ConstantIntTy *getCaseValue() {
-      assert(Index < SI->getNumCases() && "Index out the number of cases.");
-      return reinterpret_cast<ConstantIntTy*>(SI->getOperand(2 + Index*2));
+    ConstantIntT *getCaseValue() const {
+      assert((unsigned)Index < SI->getNumCases() &&
+             "Index out the number of cases.");
+      return reinterpret_cast<ConstantIntT *>(SI->getOperand(2 + Index * 2));
     }
 
     /// Resolves successor for current case.
-    BasicBlockTy *getCaseSuccessor() {
-      assert((Index < SI->getNumCases() ||
-              Index == DefaultPseudoIndex) &&
+    BasicBlockT *getCaseSuccessor() const {
+      assert(((unsigned)Index < SI->getNumCases() ||
+              (unsigned)Index == DefaultPseudoIndex) &&
              "Index out the number of cases.");
       return SI->getSuccessor(getSuccessorIndex());
     }
@@ -3129,63 +3140,32 @@ public:
 
     /// Returns TerminatorInst's successor index for current case successor.
     unsigned getSuccessorIndex() const {
-      assert((Index == DefaultPseudoIndex || Index < SI->getNumCases()) &&
+      assert(((unsigned)Index == DefaultPseudoIndex ||
+              (unsigned)Index < SI->getNumCases()) &&
              "Index out the number of cases.");
-      return Index != DefaultPseudoIndex ? Index + 1 : 0;
+      return (unsigned)Index != DefaultPseudoIndex ? Index + 1 : 0;
     }
 
-    Self operator++() {
-      // Check index correctness after increment.
-      // Note: Index == getNumCases() means end().
-      assert(Index+1 <= SI->getNumCases() && "Index out the number of cases.");
-      ++Index;
-      return *this;
-    }
-    Self operator++(int) {
-      Self tmp = *this;
-      ++(*this);
-      return tmp;
-    }
-    Self operator--() {
-      // Check index correctness after decrement.
-      // Note: Index == getNumCases() means end().
-      // Also allow "-1" iterator here. That will became valid after ++.
-      assert((Index == 0 || Index-1 <= SI->getNumCases()) &&
-             "Index out the number of cases.");
-      --Index;
-      return *this;
-    }
-    Self operator--(int) {
-      Self tmp = *this;
-      --(*this);
-      return tmp;
-    }
-    bool operator==(const Self& RHS) const {
-      assert(RHS.SI == SI && "Incompatible operators.");
-      return RHS.Index == Index;
-    }
-    bool operator!=(const Self& RHS) const {
-      assert(RHS.SI == SI && "Incompatible operators.");
-      return RHS.Index != Index;
-    }
-    Self &operator*() {
-      return *this;
+    bool operator==(const CaseHandleImpl &RHS) const {
+      assert(SI == RHS.SI && "Incompatible operators.");
+      return Index == RHS.Index;
     }
   };
 
-  typedef CaseIteratorT<const SwitchInst, const ConstantInt, const BasicBlock>
-    ConstCaseIt;
+  typedef CaseHandleImpl<const SwitchInst, const ConstantInt, const BasicBlock>
+      ConstCaseHandle;
 
-  class CaseIt : public CaseIteratorT<SwitchInst, ConstantInt, BasicBlock> {
-    typedef CaseIteratorT<SwitchInst, ConstantInt, BasicBlock> ParentTy;
+  class CaseHandle
+      : public CaseHandleImpl<SwitchInst, ConstantInt, BasicBlock> {
+    friend class SwitchInst::CaseIteratorImpl<CaseHandle>;
 
   public:
-    CaseIt(const ParentTy &Src) : ParentTy(Src) {}
-    CaseIt(SwitchInst *SI, unsigned CaseNum) : ParentTy(SI, CaseNum) {}
+    CaseHandle(SwitchInst *SI, ptrdiff_t Index) : CaseHandleImpl(SI, Index) {}
 
     /// Sets the new value for current case.
     void setValue(ConstantInt *V) {
-      assert(Index < SI->getNumCases() && "Index out the number of cases.");
+      assert((unsigned)Index < SI->getNumCases() &&
+             "Index out the number of cases.");
       SI->setOperand(2 + Index*2, reinterpret_cast<Value*>(V));
     }
 
@@ -3195,6 +3175,76 @@ public:
     }
   };
 
+  template <typename CaseHandleT>
+  class CaseIteratorImpl
+      : public iterator_facade_base<CaseIteratorImpl<CaseHandleT>,
+                                    std::random_access_iterator_tag,
+                                    CaseHandleT> {
+    typedef typename CaseHandleT::SwitchInstType SwitchInstT;
+
+    CaseHandleT Case;
+
+  public:
+    /// Default constructed iterator is in an invalid state until assigned to
+    /// a case for a particular switch.
+    CaseIteratorImpl() = default;
+
+    /// Initializes case iterator for given SwitchInst and for given
+    /// case number.
+    CaseIteratorImpl(SwitchInstT *SI, unsigned CaseNum) : Case(SI, CaseNum) {}
+
+    /// Initializes case iterator for given SwitchInst and for given
+    /// TerminatorInst's successor index.
+    static CaseIteratorImpl fromSuccessorIndex(SwitchInstT *SI,
+                                               unsigned SuccessorIndex) {
+      assert(SuccessorIndex < SI->getNumSuccessors() &&
+             "Successor index # out of range!");
+      return SuccessorIndex != 0 ? CaseIteratorImpl(SI, SuccessorIndex - 1)
+                                 : CaseIteratorImpl(SI, DefaultPseudoIndex);
+    }
+
+    /// Support converting to the const variant. This will be a no-op for const
+    /// variant.
+    operator CaseIteratorImpl<ConstCaseHandle>() const {
+      return CaseIteratorImpl<ConstCaseHandle>(Case.SI, Case.Index);
+    }
+
+    CaseIteratorImpl &operator+=(ptrdiff_t N) {
+      // Check index correctness after addition.
+      // Note: Case.Index == getNumCases() means end().
+      assert(Case.Index + N >= 0 &&
+             (unsigned)(Case.Index + N) <= Case.SI->getNumCases() &&
+             "Case.Index out the number of cases.");
+      Case.Index += N;
+      return *this;
+    }
+    CaseIteratorImpl &operator-=(ptrdiff_t N) {
+      // Check index correctness after subtraction.
+      // Note: Case.Index == getNumCases() means end().
+      assert(Case.Index - N >= 0 &&
+             (unsigned)(Case.Index - N) <= Case.SI->getNumCases() &&
+             "Case.Index out the number of cases.");
+      Case.Index -= N;
+      return *this;
+    }
+    ptrdiff_t operator-(const CaseIteratorImpl &RHS) const {
+      assert(Case.SI == RHS.Case.SI && "Incompatible operators.");
+      return Case.Index - RHS.Case.Index;
+    }
+    bool operator==(const CaseIteratorImpl &RHS) const {
+      return Case == RHS.Case;
+    }
+    bool operator<(const CaseIteratorImpl &RHS) const {
+      assert(Case.SI == RHS.Case.SI && "Incompatible operators.");
+      return Case.Index < RHS.Case.Index;
+    }
+    CaseHandleT &operator*() { return Case; }
+    const CaseHandleT &operator*() const { return Case; }
+  };
+
+  typedef CaseIteratorImpl<CaseHandle> CaseIt;
+  typedef CaseIteratorImpl<ConstCaseHandle> ConstCaseIt;
+
   static SwitchInst *Create(Value *Value, BasicBlock *Default,
                             unsigned NumCases,
                             Instruction *InsertBefore = nullptr) {
@@ -3278,30 +3328,40 @@ public:
   /// default case iterator to indicate that it is handled by the default
   /// handler.
   CaseIt findCaseValue(const ConstantInt *C) {
-    for (CaseIt i = case_begin(), e = case_end(); i != e; ++i)
-      if (i.getCaseValue() == C)
-        return i;
+    CaseIt I = llvm::find_if(
+        cases(), [C](CaseHandle &Case) { return Case.getCaseValue() == C; });
+    if (I != case_end())
+      return I;
+
     return case_default();
   }
   ConstCaseIt findCaseValue(const ConstantInt *C) const {
-    for (ConstCaseIt i = case_begin(), e = case_end(); i != e; ++i)
-      if (i.getCaseValue() == C)
-        return i;
+    ConstCaseIt I = llvm::find_if(cases(), [C](ConstCaseHandle &Case) {
+      return Case.getCaseValue() == C;
+    });
+    if (I != case_end())
+      return I;
+
     return case_default();
   }
 
   /// Finds the unique case value for a given successor. Returns null if the
   /// successor is not found, not unique, or is the default case.
   ConstantInt *findCaseDest(BasicBlock *BB) {
-    if (BB == getDefaultDest()) return nullptr;
+    if (BB == getDefaultDest())
+      return nullptr;
 
     ConstantInt *CI = nullptr;
-    for (CaseIt i = case_begin(), e = case_end(); i != e; ++i) {
-      if (i.getCaseSuccessor() == BB) {
-        if (CI) return nullptr;   // Multiple cases lead to BB.
-        else CI = i.getCaseValue();
-      }
+    for (auto Case : cases()) {
+      if (Case.getCaseSuccessor() != BB)
+        continue;
+
+      if (CI)
+        return nullptr; // Multiple cases lead to BB.
+
+      CI = Case.getCaseValue();
     }
+
     return CI;
   }
 
@@ -3316,8 +3376,9 @@ public:
   /// index idx and above.
   /// Note:
   /// This action invalidates iterators for all cases following the one removed,
-  /// including the case_end() iterator.
-  void removeCase(CaseIt i);
+  /// including the case_end() iterator. It returns an iterator for the next
+  /// case.
+  CaseIt removeCase(CaseIt I);
 
   unsigned getNumSuccessors() const { return getNumOperands()/2; }
   BasicBlock *getSuccessor(unsigned idx) const {
@@ -3465,7 +3526,7 @@ class InvokeInst : public TerminatorInst,
                    public OperandBundleUser<InvokeInst, User::op_iterator> {
   friend class OperandBundleUser<InvokeInst, User::op_iterator>;
 
-  AttributeSet AttributeList;
+  AttributeList Attrs;
   FunctionType *FTy;
 
   InvokeInst(const InvokeInst &BI);
@@ -3669,11 +3730,11 @@ public:
 
   /// Return the parameter attributes for this invoke.
   ///
-  AttributeSet getAttributes() const { return AttributeList; }
+  AttributeList getAttributes() const { return Attrs; }
 
   /// Set the parameter attributes for this invoke.
   ///
-  void setAttributes(AttributeSet Attrs) { AttributeList = Attrs; }
+  void setAttributes(AttributeList A) { Attrs = A; }
 
   /// adds the attribute to the list of attributes.
   void addAttribute(unsigned i, Attribute::AttrKind Kind);
@@ -3737,26 +3798,26 @@ public:
 
   /// Extract the alignment for a call or parameter (0=unknown).
   unsigned getParamAlignment(unsigned i) const {
-    return AttributeList.getParamAlignment(i);
+    return Attrs.getParamAlignment(i);
   }
 
   /// Extract the number of dereferenceable bytes for a call or
   /// parameter (0=unknown).
   uint64_t getDereferenceableBytes(unsigned i) const {
-    return AttributeList.getDereferenceableBytes(i);
+    return Attrs.getDereferenceableBytes(i);
   }
 
   /// Extract the number of dereferenceable_or_null bytes for a call or
   /// parameter (0=unknown).
   uint64_t getDereferenceableOrNullBytes(unsigned i) const {
-    return AttributeList.getDereferenceableOrNullBytes(i);
+    return Attrs.getDereferenceableOrNullBytes(i);
   }
 
   /// @brief Determine if the parameter or return value is marked with NoAlias
   /// attribute.
   /// @param n The parameter to check. 1 is the first parameter, 0 is the return
   bool doesNotAlias(unsigned n) const {
-    return AttributeList.hasAttribute(n, Attribute::NoAlias);
+    return Attrs.hasAttribute(n, Attribute::NoAlias);
   }
 
   /// Return true if the call should not be treated as a call to a
@@ -3771,7 +3832,7 @@ public:
   /// Return true if the call should not be inlined.
   bool isNoInline() const { return hasFnAttr(Attribute::NoInline); }
   void setIsNoInline() {
-    addAttribute(AttributeSet::FunctionIndex, Attribute::NoInline);
+    addAttribute(AttributeList::FunctionIndex, Attribute::NoInline);
   }
 
   /// Determine if the call does not access memory.
@@ -3779,7 +3840,7 @@ public:
     return hasFnAttr(Attribute::ReadNone);
   }
   void setDoesNotAccessMemory() {
-    addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
+    addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
   }
 
   /// Determine if the call does not access or only reads memory.
@@ -3787,7 +3848,7 @@ public:
     return doesNotAccessMemory() || hasFnAttr(Attribute::ReadOnly);
   }
   void setOnlyReadsMemory() {
-    addAttribute(AttributeSet::FunctionIndex, Attribute::ReadOnly);
+    addAttribute(AttributeList::FunctionIndex, Attribute::ReadOnly);
   }
 
   /// Determine if the call does not access or only writes memory.
@@ -3795,7 +3856,7 @@ public:
     return doesNotAccessMemory() || hasFnAttr(Attribute::WriteOnly);
   }
   void setDoesNotReadMemory() {
-    addAttribute(AttributeSet::FunctionIndex, Attribute::WriteOnly);
+    addAttribute(AttributeList::FunctionIndex, Attribute::WriteOnly);
   }
 
   /// @brief Determine if the call access memmory only using it's pointer
@@ -3804,34 +3865,34 @@ public:
     return hasFnAttr(Attribute::ArgMemOnly);
   }
   void setOnlyAccessesArgMemory() {
-    addAttribute(AttributeSet::FunctionIndex, Attribute::ArgMemOnly);
+    addAttribute(AttributeList::FunctionIndex, Attribute::ArgMemOnly);
   }
 
   /// Determine if the call cannot return.
   bool doesNotReturn() const { return hasFnAttr(Attribute::NoReturn); }
   void setDoesNotReturn() {
-    addAttribute(AttributeSet::FunctionIndex, Attribute::NoReturn);
+    addAttribute(AttributeList::FunctionIndex, Attribute::NoReturn);
   }
 
   /// Determine if the call cannot unwind.
   bool doesNotThrow() const { return hasFnAttr(Attribute::NoUnwind); }
   void setDoesNotThrow() {
-    addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
+    addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
   }
 
   /// Determine if the invoke cannot be duplicated.
   bool cannotDuplicate() const {return hasFnAttr(Attribute::NoDuplicate); }
   void setCannotDuplicate() {
-    addAttribute(AttributeSet::FunctionIndex, Attribute::NoDuplicate);
+    addAttribute(AttributeList::FunctionIndex, Attribute::NoDuplicate);
   }
 
   /// Determine if the invoke is convergent
   bool isConvergent() const { return hasFnAttr(Attribute::Convergent); }
   void setConvergent() {
-    addAttribute(AttributeSet::FunctionIndex, Attribute::Convergent);
+    addAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
   }
   void setNotConvergent() {
-    removeAttribute(AttributeSet::FunctionIndex, Attribute::Convergent);
+    removeAttribute(AttributeList::FunctionIndex, Attribute::Convergent);
   }
 
   /// Determine if the call returns a structure through first
@@ -3846,7 +3907,7 @@ public:
 
   /// Determine if any call argument is an aggregate passed by value.
   bool hasByValArgument() const {
-    return AttributeList.hasAttrSomewhere(Attribute::ByVal);
+    return Attrs.hasAttrSomewhere(Attribute::ByVal);
   }
 
   /// Return the function called, or null if this is an
@@ -3918,7 +3979,7 @@ private:
   void setSuccessorV(unsigned idx, BasicBlock *B) override;
 
   template <typename AttrKind> bool hasFnAttrImpl(AttrKind Kind) const {
-    if (AttributeList.hasAttribute(AttributeSet::FunctionIndex, Kind))
+    if (Attrs.hasAttribute(AttributeList::FunctionIndex, Kind))
       return true;
 
     // Operand bundles override attributes on the called function, but don't
@@ -3927,7 +3988,8 @@ private:
       return false;
 
     if (const Function *F = getCalledFunction())
-      return F->getAttributes().hasAttribute(AttributeSet::FunctionIndex, Kind);
+      return F->getAttributes().hasAttribute(AttributeList::FunctionIndex,
+                                             Kind);
     return false;
   }
 
diff --git a/include/llvm/IR/IntrinsicInst.h b/include/llvm/IR/IntrinsicInst.h
index 7f4eb0e9df81b8836b339b1bd0b6b9f870c666a4..f69b5bfc0be2643a251232c882988fecaef78915 100644
--- a/include/llvm/IR/IntrinsicInst.h
+++ b/include/llvm/IR/IntrinsicInst.h
@@ -191,6 +191,40 @@ namespace llvm {
     }
   };
 
+  /// This class represents the element-wise atomic memcpy intrinsic
+  /// (Intrinsic::memcpy_element_atomic).
+  /// TODO: Integrate this class into MemIntrinsic hierarchy.
+  class ElementAtomicMemCpyInst : public IntrinsicInst {
+  public:
+    /// Call argument 0 is the destination pointer, argument 1 the source.
+    Value *getRawDest() const { return getArgOperand(0); }
+    Value *getRawSource() const { return getArgOperand(1); }
+
+    /// Call argument 2 is the number of elements to copy.
+    Value *getNumElements() const { return getArgOperand(2); }
+    void setNumElements(Value *V) { setArgOperand(2, V); }
+
+    /// Alignment queries use attribute indices, where 0 is the return value
+    /// and 1 is the first parameter: the destination (argument 0) is index 1
+    /// and the source (argument 1) is index 2.
+    uint64_t getSrcAlignment() const { return getParamAlignment(2); }
+    uint64_t getDstAlignment() const { return getParamAlignment(1); }
+
+    /// Call argument 3 is read as a ConstantInt holding the element size.
+    uint64_t getElementSizeInBytes() const {
+      Value *Arg = getArgOperand(3);
+      return cast<ConstantInt>(Arg)->getZExtValue();
+    }
+
+    /// Support for isa/cast/dyn_cast: matches memcpy_element_atomic calls.
+    static inline bool classof(const IntrinsicInst *I) {
+      return I->getIntrinsicID() == Intrinsic::memcpy_element_atomic;
+    }
+    static inline bool classof(const Value *V) {
+      return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+    }
+  };
+
   /// This is the common base class for memset/memcpy/memmove.
   class MemIntrinsic : public IntrinsicInst {
   public:
diff --git a/include/llvm/IR/Intrinsics.h b/include/llvm/IR/Intrinsics.h
index d07358445dab23b3119092c04dd87e39cf15cdaa..2f6bdf8ecf19faa8e672641aa9e72c0ffb0acc6f 100644
--- a/include/llvm/IR/Intrinsics.h
+++ b/include/llvm/IR/Intrinsics.h
@@ -28,7 +28,7 @@ class FunctionType;
 class Function;
 class LLVMContext;
 class Module;
-class AttributeSet;
+class AttributeList;
 
 /// This namespace contains an enum with a value for every intrinsic/builtin
 /// function known by LLVM. The enum values are returned by
@@ -69,7 +69,7 @@ namespace Intrinsic {
   bool isLeaf(ID id);
 
   /// Return the attributes for an intrinsic.
-  AttributeSet getAttributes(LLVMContext &C, ID id);
+  AttributeList getAttributes(LLVMContext &C, ID id);
 
   /// Create or insert an LLVM Function declaration for an intrinsic, and return
   /// it.
diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td
index 24b98df0afae330ac325672cc9138a965ef543c6..5b796e7dfcbfa3d15b067010de108d5e710c507d 100644
--- a/include/llvm/IR/Intrinsics.td
+++ b/include/llvm/IR/Intrinsics.td
@@ -331,13 +331,13 @@ def int_get_dynamic_area_offset : Intrinsic<[llvm_anyint_ty]>;
 def int_thread_pointer : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>,
                          GCCBuiltin<"__builtin_thread_pointer">;
 
-// IntrArgMemOnly is more pessimistic than strictly necessary for prefetch,
-// however it does conveniently prevent the prefetch from being reordered
-// with respect to nearby accesses to the same memory.
-def int_prefetch      : Intrinsic<[],
-                                  [llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty,
-                                   llvm_i32_ty],
-                                  [IntrArgMemOnly, NoCapture<0>]>;
+// IntrInaccessibleMemOrArgMemOnly is a little more pessimistic than strictly
+// necessary for prefetch; however, it conveniently prevents the prefetch from
+// being reordered too freely with respect to nearby accesses to the same
+// memory, while not impeding optimization.
+def int_prefetch
+    : Intrinsic<[], [ llvm_ptr_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty ],
+                [ IntrInaccessibleMemOrArgMemOnly, ReadOnly<0>, NoCapture<0> ]>;
 def int_pcmarker      : Intrinsic<[], [llvm_i32_ty]>;
 
 def int_readcyclecounter : Intrinsic<[llvm_i64_ty]>;
@@ -441,7 +441,8 @@ def int_sigsetjmp  : Intrinsic<[llvm_i32_ty] , [llvm_ptr_ty, llvm_i32_ty]>;
 def int_siglongjmp : Intrinsic<[], [llvm_ptr_ty, llvm_i32_ty], [IntrNoReturn]>;
 
 // Internal interface for object size checking
-def int_objectsize : Intrinsic<[llvm_anyint_ty], [llvm_anyptr_ty, llvm_i1_ty],
+def int_objectsize : Intrinsic<[llvm_anyint_ty],
+                               [llvm_anyptr_ty, llvm_i1_ty, llvm_i1_ty],
                                [IntrNoMem]>,
                                GCCBuiltin<"__builtin_object_size">;
 
@@ -601,10 +602,10 @@ def int_umul_with_overflow : Intrinsic<[llvm_anyint_ty, llvm_i1_ty],
 //===------------------------- Memory Use Markers -------------------------===//
 //
 def int_lifetime_start  : Intrinsic<[],
-                                    [llvm_i64_ty, llvm_ptr_ty],
+                                    [llvm_i64_ty, llvm_anyptr_ty],
                                     [IntrArgMemOnly, NoCapture<1>]>;
 def int_lifetime_end    : Intrinsic<[],
-                                    [llvm_i64_ty, llvm_ptr_ty],
+                                    [llvm_i64_ty, llvm_anyptr_ty],
                                     [IntrArgMemOnly, NoCapture<1>]>;
 def int_invariant_start : Intrinsic<[llvm_descriptor_ty],
                                     [llvm_i64_ty, llvm_anyptr_ty],
@@ -655,18 +656,18 @@ def int_experimental_gc_relocate : Intrinsic<[llvm_any_ty],
 
 // Coroutine Structure Intrinsics.
 
-def int_coro_id : Intrinsic<[llvm_token_ty], [llvm_i32_ty, llvm_ptr_ty, 
-                             llvm_ptr_ty, llvm_ptr_ty], 
-                            [IntrArgMemOnly, IntrReadMem, 
+def int_coro_id : Intrinsic<[llvm_token_ty], [llvm_i32_ty, llvm_ptr_ty,
+                             llvm_ptr_ty, llvm_ptr_ty],
+                            [IntrArgMemOnly, IntrReadMem,
                              ReadNone<1>, ReadOnly<2>, NoCapture<2>]>;
 def int_coro_alloc : Intrinsic<[llvm_i1_ty], [llvm_token_ty], []>;
 def int_coro_begin : Intrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_ptr_ty],
                                [WriteOnly<1>]>;
 
-def int_coro_free : Intrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_ptr_ty], 
-                              [IntrReadMem, IntrArgMemOnly, ReadOnly<1>, 
+def int_coro_free : Intrinsic<[llvm_ptr_ty], [llvm_token_ty, llvm_ptr_ty],
+                              [IntrReadMem, IntrArgMemOnly, ReadOnly<1>,
                                NoCapture<1>]>;
-def int_coro_end : Intrinsic<[], [llvm_ptr_ty, llvm_i1_ty], []>;
+def int_coro_end : Intrinsic<[llvm_i1_ty], [llvm_ptr_ty, llvm_i1_ty], []>;
 
 def int_coro_frame : Intrinsic<[llvm_ptr_ty], [], [IntrNoMem]>;
 def int_coro_size : Intrinsic<[llvm_anyint_ty], [], [IntrNoMem]>;
@@ -781,6 +782,10 @@ def int_memcpy_element_atomic  : Intrinsic<[],
                                  [IntrArgMemOnly, NoCapture<0>, NoCapture<1>,
                                   WriteOnly<0>, ReadOnly<1>]>;
 
+//===----- Intrinsics that are used to provide predicate information -----===//
+
+def int_ssa_copy : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>],
+                             [IntrNoMem, Returned<0>]>;
 //===----------------------------------------------------------------------===//
 // Target-specific intrinsics
 //===----------------------------------------------------------------------===//
diff --git a/include/llvm/IR/IntrinsicsAMDGPU.td b/include/llvm/IR/IntrinsicsAMDGPU.td
index 05fbac176fb929fb16d4ecea50b959fc3a1d214c..5415c6b0d1518f0ea9ff8baff9b7a3279a51832c 100644
--- a/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -64,6 +64,10 @@ def int_r600_recipsqrt_clamped : Intrinsic<
   [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]
 >;
 
+def int_r600_cube : Intrinsic<
+  [llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]
+>;
+
 } // End TargetPrefix = "r600"
 
 let TargetPrefix = "amdgcn" in {
@@ -121,7 +125,8 @@ def int_amdgcn_s_barrier : GCCBuiltin<"__builtin_amdgcn_s_barrier">,
 def int_amdgcn_wave_barrier : GCCBuiltin<"__builtin_amdgcn_wave_barrier">,
   Intrinsic<[], [], [IntrConvergent]>;
 
-def int_amdgcn_s_waitcnt : Intrinsic<[], [llvm_i32_ty], []>;
+def int_amdgcn_s_waitcnt : GCCBuiltin<"__builtin_amdgcn_s_waitcnt">,
+  Intrinsic<[], [llvm_i32_ty], []>;
 
 def int_amdgcn_div_scale : Intrinsic<
   // 1st parameter: Numerator
@@ -202,10 +207,19 @@ def int_amdgcn_fract : Intrinsic<
   [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]
 >;
 
+def int_amdgcn_cvt_pkrtz : Intrinsic<
+  [llvm_v2f16_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]
+>;
+
 def int_amdgcn_class : Intrinsic<
   [llvm_i1_ty], [llvm_anyfloat_ty, llvm_i32_ty], [IntrNoMem]
 >;
 
+def int_amdgcn_fmed3 : GCCBuiltin<"__builtin_amdgcn_fmed3">,
+  Intrinsic<[llvm_anyfloat_ty],
+    [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]
+>;
+
 def int_amdgcn_cubeid : GCCBuiltin<"__builtin_amdgcn_cubeid">,
   Intrinsic<[llvm_float_ty],
     [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]
@@ -231,17 +245,20 @@ def int_amdgcn_cubetc : GCCBuiltin<"__builtin_amdgcn_cubetc">,
 def int_amdgcn_sffbh :
   Intrinsic<[llvm_anyint_ty], [LLVMMatchType<0>], [IntrNoMem]>;
 
-// TODO: Do we want an ordering for these?
-def int_amdgcn_atomic_inc : Intrinsic<[llvm_anyint_ty],
-  [llvm_anyptr_ty, LLVMMatchType<0>],
-  [IntrArgMemOnly, NoCapture<0>]
->;
 
-def int_amdgcn_atomic_dec : Intrinsic<[llvm_anyint_ty],
-  [llvm_anyptr_ty, LLVMMatchType<0>],
+// Fields should mirror atomicrmw
+class AMDGPUAtomicIncIntrin : Intrinsic<[llvm_anyint_ty],
+  [llvm_anyptr_ty,
+  LLVMMatchType<0>,
+  llvm_i32_ty, // ordering
+  llvm_i32_ty, // scope
+  llvm_i1_ty], // isVolatile
   [IntrArgMemOnly, NoCapture<0>]
 >;
 
+def int_amdgcn_atomic_inc : AMDGPUAtomicIncIntrin;
+def int_amdgcn_atomic_dec : AMDGPUAtomicIncIntrin;
+
 class AMDGPUImageLoad : Intrinsic <
   [llvm_anyfloat_ty], // vdata(VGPR)
   [llvm_anyint_ty,    // vaddr(VGPR)
@@ -556,7 +573,14 @@ def int_amdgcn_ds_swizzle :
   GCCBuiltin<"__builtin_amdgcn_ds_swizzle">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
 
-// llvm.amdgcn.lerp
+def int_amdgcn_ubfe : Intrinsic<[llvm_anyint_ty],
+  [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]
+>;
+
+def int_amdgcn_sbfe : Intrinsic<[llvm_anyint_ty],
+  [LLVMMatchType<0>, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]
+>;
+
 def int_amdgcn_lerp :
   GCCBuiltin<"__builtin_amdgcn_lerp">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
@@ -645,10 +669,51 @@ def int_amdgcn_s_memrealtime :
 
 // llvm.amdgcn.ds.permute <index> <src>
 def int_amdgcn_ds_permute :
+  GCCBuiltin<"__builtin_amdgcn_ds_permute">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
 
 // llvm.amdgcn.ds.bpermute <index> <src>
 def int_amdgcn_ds_bpermute :
+  GCCBuiltin<"__builtin_amdgcn_ds_bpermute">,
   Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty], [IntrNoMem, IntrConvergent]>;
 
+
+//===----------------------------------------------------------------------===//
+// Special Intrinsics for backend internal use only. No frontend
+// should emit calls to these.
+//===----------------------------------------------------------------------===//
+def int_amdgcn_if : Intrinsic<[llvm_i1_ty, llvm_i64_ty],
+  [llvm_i1_ty], [IntrConvergent]
+>;
+
+def int_amdgcn_else : Intrinsic<[llvm_i1_ty, llvm_i64_ty],
+  [llvm_i64_ty], [IntrConvergent]
+>;
+
+def int_amdgcn_break : Intrinsic<[llvm_i64_ty],
+  [llvm_i64_ty], [IntrNoMem, IntrConvergent]
+>;
+
+def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty],
+  [llvm_i1_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]
+>;
+
+def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty],
+  [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]
+>;
+
+def int_amdgcn_loop : Intrinsic<[llvm_i1_ty],
+  [llvm_i64_ty], [IntrConvergent]
+>;
+
+def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>;
+
+// Represent unreachable in a divergent region.
+def int_amdgcn_unreachable : Intrinsic<[], [], [IntrConvergent]>;
+
+// Emit a 2.5 ulp, no-denormal division. Should only be inserted by a
+// pass based on !fpmath metadata.
+def int_amdgcn_fdiv_fast : Intrinsic<
+  [llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]
+>;
 }
diff --git a/include/llvm/IR/IntrinsicsARM.td b/include/llvm/IR/IntrinsicsARM.td
index 24239689a62ee71d9809850f647fd4d0063da598..18ed24be56d4e59f126deec4734432e726dc46e4 100644
--- a/include/llvm/IR/IntrinsicsARM.td
+++ b/include/llvm/IR/IntrinsicsARM.td
@@ -67,7 +67,7 @@ def int_arm_isb : GCCBuiltin<"__builtin_arm_isb">, MSBuiltin<"__isb">,
 // VFP
 
 def int_arm_get_fpscr : GCCBuiltin<"__builtin_arm_get_fpscr">,
-                       Intrinsic<[llvm_i32_ty], [], [IntrNoMem]>;
+                       Intrinsic<[llvm_i32_ty], [], []>;
 def int_arm_set_fpscr : GCCBuiltin<"__builtin_arm_set_fpscr">,
                        Intrinsic<[], [llvm_i32_ty], []>;
 def int_arm_vcvtr     : Intrinsic<[llvm_float_ty], [llvm_anyfloat_ty],
diff --git a/include/llvm/IR/IntrinsicsHexagon.td b/include/llvm/IR/IntrinsicsHexagon.td
index 6519f051deeb7c8cbc11cf7d2638ba30762fa11b..8ac56e03be6a60265e2d486e66a4986704235c7b 100644
--- a/include/llvm/IR/IntrinsicsHexagon.td
+++ b/include/llvm/IR/IntrinsicsHexagon.td
@@ -5659,20 +5659,20 @@ class Hexagon_v2048v2048v1024v1024i_Intrinsic<string GCCIntSuffix>
                           [IntrNoMem]>;
 
 //
-// Hexagon_LLiLLiLLi_Intrinsic<string GCCIntSuffix>
-// tag : M6_vabsdiffb
-class Hexagon_LLiLLiLLi_Intrinsic<string GCCIntSuffix>
+// Hexagon_vv64ivmemv512_Intrinsic<string GCCIntSuffix>
+// tag: V6_vS32b_qpred_ai
+class Hexagon_vv64ivmemv512_Intrinsic<string GCCIntSuffix>
  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i64_ty,llvm_i64_ty],
-                          [IntrNoMem]>;
+                          [], [llvm_v512i1_ty,llvm_ptr_ty,llvm_v16i32_ty],
+                          [IntrArgMemOnly]>;
 
 //
-// Hexagon_LLii_Intrinsic<string GCCIntSuffix>
-// tag : S6_vsplatrbp
-class Hexagon_LLii_Intrinsic<string GCCIntSuffix>
+// Hexagon_vv128ivmemv1024_Intrinsic<string GCCIntSuffix>
+// tag: V6_vS32b_qpred_ai_128B
+class Hexagon_vv128ivmemv1024_Intrinsic<string GCCIntSuffix>
  : Hexagon_Intrinsic<GCCIntSuffix,
-                          [llvm_i64_ty], [llvm_i32_ty],
-                          [IntrNoMem]>;
+                          [], [llvm_v1024i1_ty,llvm_ptr_ty,llvm_v32i32_ty],
+                          [IntrArgMemOnly]>;
 
 //
 // BUILTIN_INFO(HEXAGON.S6_rol_i_r,SI_ftype_SISI,2)
@@ -9342,6 +9342,303 @@ Hexagon_v1024v1024v512v512i_Intrinsic<"HEXAGON_V6_vlutvwh_oracc">;
 def int_hexagon_V6_vlutvwh_oracc_128B :
 Hexagon_v2048v2048v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvwh_oracc_128B">;
 
+//
+// Masked vector stores
+//
+def int_hexagon_V6_vmaskedstoreq :
+Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstoreq">;
+
+def int_hexagon_V6_vmaskedstorenq :
+Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstorenq">;
+
+def int_hexagon_V6_vmaskedstorentq :
+Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstorentq">;
+
+def int_hexagon_V6_vmaskedstorentnq :
+Hexagon_vv64ivmemv512_Intrinsic<"HEXAGON_V6_vmaskedstorentnq">;
+
+def int_hexagon_V6_vmaskedstoreq_128B :
+Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstoreq_128B">;
+
+def int_hexagon_V6_vmaskedstorenq_128B :
+Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstorenq_128B">;
+
+def int_hexagon_V6_vmaskedstorentq_128B :
+Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstorentq_128B">;
+
+def int_hexagon_V6_vmaskedstorentnq_128B :
+Hexagon_vv128ivmemv1024_Intrinsic<"HEXAGON_V6_vmaskedstorentnq_128B">;
+
+
+///
+/// HexagonV62 intrinsics
+///
+
+//
+// Hexagon_LLiLLiLLi_Intrinsic<string GCCIntSuffix>
+// tag : M6_vabsdiffb
+class Hexagon_LLiLLiLLi_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_i64_ty], [llvm_i64_ty,llvm_i64_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_LLii_Intrinsic<string GCCIntSuffix>
+// tag : S6_vsplatrbp
+class Hexagon_LLii_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_i64_ty], [llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v512v512i_Intrinsic<string GCCIntSuffix>
+// tag : V6_vlsrb
+class Hexagon_V62_v512v512i_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v1024v1024i_Intrinsic<string GCCIntSuffix>
+// tag : V6_vlsrb_128B
+class Hexagon_V62_v1024v1024i_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v512v512v512i_Intrinsic<string GCCIntSuffix>
+// tag : V6_vasrwuhrndsat
+class Hexagon_V62_v512v512v512i_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v1024v1024v1024i_Intrinsic<string GCCIntSuffix>
+// tag : V6_vasrwuhrndsat_128B
+class Hexagon_V62_v1024v1024v1024i_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v512v512v512_Intrinsic<string GCCIntSuffix>
+// tag : V6_vrounduwuh
+class Hexagon_V62_v512v512v512_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v1024v1024v1024_Intrinsic<string GCCIntSuffix>
+// tag : V6_vrounduwuh_128B
+class Hexagon_V62_v1024v1024v1024_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v2048v2048v2048_Intrinsic<string GCCIntSuffix>
+// tag : V6_vadduwsat_dv_128B
+class Hexagon_V62_v2048v2048v2048_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v64i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v1024v1024v512v512_Intrinsic<string GCCIntSuffix>
+// tag : V6_vaddhw_acc
+class Hexagon_V62_v1024v1024v512v512_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v16i32_ty,llvm_v16i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v2048v2048v1024v1024_Intrinsic<string GCCIntSuffix>
+// tag : V6_vaddhw_acc_128B
+class Hexagon_V62_v2048v2048v1024v1024_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v32i32_ty,llvm_v32i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v1024v512v512_Intrinsic<string GCCIntSuffix>
+// tag : V6_vmpyewuh_64
+class Hexagon_V62_v1024v512v512_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v32i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v2048v1024v1024_Intrinsic<string GCCIntSuffix>
+// tag : V6_vmpyewuh_64_128B
+class Hexagon_V62_v2048v1024v1024_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v64i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v2048v2048i_Intrinsic<string GCCIntSuffix>
+// tag : V6_vmpauhb_128B
+class Hexagon_V62_v2048v2048i_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v2048v2048v2048i_Intrinsic<string GCCIntSuffix>
+// tag : V6_vmpauhb_acc_128B
+class Hexagon_V62_v2048v2048v2048i_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v64i32_ty,llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v512v64ii_Intrinsic<string GCCIntSuffix>
+// tag : V6_vandnqrt
+class Hexagon_V62_v512v64ii_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v16i32_ty], [llvm_v512i1_ty,llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v1024v128ii_Intrinsic<string GCCIntSuffix>
+// tag : V6_vandnqrt_128B
+class Hexagon_V62_v1024v128ii_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v32i32_ty], [llvm_v1024i1_ty,llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v512v512v64ii_Intrinsic<string GCCIntSuffix>
+// tag : V6_vandnqrt_acc
+class Hexagon_V62_v512v512v64ii_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v512i1_ty,llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v1024v1024v128ii_Intrinsic<string GCCIntSuffix>
+// tag : V6_vandnqrt_acc_128B
+class Hexagon_V62_v1024v1024v128ii_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v1024i1_ty,llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v512v64iv512_Intrinsic<string GCCIntSuffix>
+// tag : V6_vandvqv
+class Hexagon_V62_v512v64iv512_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v16i32_ty], [llvm_v512i1_ty,llvm_v16i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v1024v128iv1024_Intrinsic<string GCCIntSuffix>
+// tag : V6_vandvqv_128B
+class Hexagon_V62_v1024v128iv1024_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v32i32_ty], [llvm_v1024i1_ty,llvm_v32i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v64ii_Intrinsic<string GCCIntSuffix>
+// tag : V6_pred_scalar2v2
+class Hexagon_V62_v64ii_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v512i1_ty], [llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v128ii_Intrinsic<string GCCIntSuffix>
+// tag : V6_pred_scalar2v2_128B
+class Hexagon_V62_v128ii_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v1024i1_ty], [llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v64iv64iv64i_Intrinsic<string GCCIntSuffix>
+// tag : V6_shuffeqw
+class Hexagon_V62_v64iv64iv64i_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v512i1_ty], [llvm_v512i1_ty,llvm_v512i1_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v128iv128iv128i_Intrinsic<string GCCIntSuffix>
+// tag : V6_shuffeqw_128B
+class Hexagon_V62_v128iv128iv128i_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v1024i1_ty], [llvm_v1024i1_ty,llvm_v1024i1_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v512i_Intrinsic<string GCCIntSuffix>
+// tag : V6_lvsplath
+class Hexagon_V62_v512i_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v16i32_ty], [llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v1024i_Intrinsic<string GCCIntSuffix>
+// tag : V6_lvsplath_128B
+class Hexagon_V62_v1024i_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v32i32_ty], [llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v512v512v512v512i_Intrinsic<string GCCIntSuffix>
+// tag : V6_vlutvvb_oracci
+class Hexagon_V62_v512v512v512v512i_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v16i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_v16i32_ty,llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v1024v1024v1024v1024i_Intrinsic<string GCCIntSuffix>
+// tag : V6_vlutvvb_oracci_128B
+class Hexagon_V62_v1024v1024v1024v1024i_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v1024v512v512i_Intrinsic<string GCCIntSuffix>
+// tag : V6_vlutvwhi
+class Hexagon_V62_v1024v512v512i_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v32i32_ty], [llvm_v16i32_ty,llvm_v16i32_ty,llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v2048v1024v1024i_Intrinsic<string GCCIntSuffix>
+// tag : V6_vlutvwhi_128B
+class Hexagon_V62_v2048v1024v1024i_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v64i32_ty], [llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v1024v1024v512v512i_Intrinsic<string GCCIntSuffix>
+// tag : V6_vlutvwh_oracci
+class Hexagon_V62_v1024v1024v512v512i_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v32i32_ty], [llvm_v32i32_ty,llvm_v16i32_ty,llvm_v16i32_ty,llvm_i32_ty],
+                          [IntrNoMem]>;
+
+//
+// Hexagon_V62_v2048v2048v1024v1024i_Intrinsic<string GCCIntSuffix>
+// tag : V6_vlutvwh_oracci_128B
+class Hexagon_V62_v2048v2048v1024v1024i_Intrinsic<string GCCIntSuffix>
+ : Hexagon_Intrinsic<GCCIntSuffix,
+                          [llvm_v64i32_ty], [llvm_v64i32_ty,llvm_v32i32_ty,llvm_v32i32_ty,llvm_i32_ty],
+                          [IntrNoMem]>;
+
+
 //
 // BUILTIN_INFO(HEXAGON.M6_vabsdiffb,DI_ftype_DIDI,2)
 // tag : M6_vabsdiffb
@@ -9354,12 +9651,6 @@ Hexagon_LLiLLiLLi_Intrinsic<"HEXAGON_M6_vabsdiffb">;
 def int_hexagon_M6_vabsdiffub :
 Hexagon_LLiLLiLLi_Intrinsic<"HEXAGON_M6_vabsdiffub">;
 
-//
-// BUILTIN_INFO(HEXAGON.S6_vsplatrbp,DI_ftype_SI,1)
-// tag : S6_vsplatrbp
-def int_hexagon_S6_vsplatrbp :
-Hexagon_LLii_Intrinsic<"HEXAGON_S6_vsplatrbp">;
-
 //
 // BUILTIN_INFO(HEXAGON.S6_vtrunehb_ppp,DI_ftype_DIDI,2)
 // tag : S6_vtrunehb_ppp
@@ -9371,3 +9662,550 @@ Hexagon_LLiLLiLLi_Intrinsic<"HEXAGON_S6_vtrunehb_ppp">;
 // tag : S6_vtrunohb_ppp
 def int_hexagon_S6_vtrunohb_ppp :
 Hexagon_LLiLLiLLi_Intrinsic<"HEXAGON_S6_vtrunohb_ppp">;
+
+//
+// BUILTIN_INFO(HEXAGON.S6_vsplatrbp,DI_ftype_SI,1)
+// tag : S6_vsplatrbp
+def int_hexagon_S6_vsplatrbp :
+Hexagon_LLii_Intrinsic<"HEXAGON_S6_vsplatrbp">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vlsrb,VI_ftype_VISI,2)
+// tag : V6_vlsrb
+def int_hexagon_V6_vlsrb :
+Hexagon_V62_v512v512i_Intrinsic<"HEXAGON_V6_vlsrb">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vlsrb_128B,VI_ftype_VISI,2)
+// tag : V6_vlsrb_128B
+def int_hexagon_V6_vlsrb_128B :
+Hexagon_V62_v1024v1024i_Intrinsic<"HEXAGON_V6_vlsrb_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vasrwuhrndsat,VI_ftype_VIVISI,3)
+// tag : V6_vasrwuhrndsat
+def int_hexagon_V6_vasrwuhrndsat :
+Hexagon_V62_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrwuhrndsat">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vasrwuhrndsat_128B,VI_ftype_VIVISI,3)
+// tag : V6_vasrwuhrndsat_128B
+def int_hexagon_V6_vasrwuhrndsat_128B :
+Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrwuhrndsat_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vasruwuhrndsat,VI_ftype_VIVISI,3)
+// tag : V6_vasruwuhrndsat
+def int_hexagon_V6_vasruwuhrndsat :
+Hexagon_V62_v512v512v512i_Intrinsic<"HEXAGON_V6_vasruwuhrndsat">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vasruwuhrndsat_128B,VI_ftype_VIVISI,3)
+// tag : V6_vasruwuhrndsat_128B
+def int_hexagon_V6_vasruwuhrndsat_128B :
+Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasruwuhrndsat_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vasrhbsat,VI_ftype_VIVISI,3)
+// tag : V6_vasrhbsat
+def int_hexagon_V6_vasrhbsat :
+Hexagon_V62_v512v512v512i_Intrinsic<"HEXAGON_V6_vasrhbsat">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vasrhbsat_128B,VI_ftype_VIVISI,3)
+// tag : V6_vasrhbsat_128B
+def int_hexagon_V6_vasrhbsat_128B :
+Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vasrhbsat_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vrounduwuh,VI_ftype_VIVI,2)
+// tag : V6_vrounduwuh
+def int_hexagon_V6_vrounduwuh :
+Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vrounduwuh">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vrounduwuh_128B,VI_ftype_VIVI,2)
+// tag : V6_vrounduwuh_128B
+def int_hexagon_V6_vrounduwuh_128B :
+Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrounduwuh_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vrounduhub,VI_ftype_VIVI,2)
+// tag : V6_vrounduhub
+def int_hexagon_V6_vrounduhub :
+Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vrounduhub">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vrounduhub_128B,VI_ftype_VIVI,2)
+// tag : V6_vrounduhub_128B
+def int_hexagon_V6_vrounduhub_128B :
+Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vrounduhub_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vadduwsat,VI_ftype_VIVI,2)
+// tag : V6_vadduwsat
+def int_hexagon_V6_vadduwsat :
+Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vadduwsat">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vadduwsat_128B,VI_ftype_VIVI,2)
+// tag : V6_vadduwsat_128B
+def int_hexagon_V6_vadduwsat_128B :
+Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vadduwsat_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vadduwsat_dv,VD_ftype_VDVD,2)
+// tag : V6_vadduwsat_dv
+def int_hexagon_V6_vadduwsat_dv :
+Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vadduwsat_dv">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vadduwsat_dv_128B,VD_ftype_VDVD,2)
+// tag : V6_vadduwsat_dv_128B
+def int_hexagon_V6_vadduwsat_dv_128B :
+Hexagon_V62_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vadduwsat_dv_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vsubuwsat,VI_ftype_VIVI,2)
+// tag : V6_vsubuwsat
+def int_hexagon_V6_vsubuwsat :
+Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vsubuwsat">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vsubuwsat_128B,VI_ftype_VIVI,2)
+// tag : V6_vsubuwsat_128B
+def int_hexagon_V6_vsubuwsat_128B :
+Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubuwsat_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vsubuwsat_dv,VD_ftype_VDVD,2)
+// tag : V6_vsubuwsat_dv
+def int_hexagon_V6_vsubuwsat_dv :
+Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubuwsat_dv">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vsubuwsat_dv_128B,VD_ftype_VDVD,2)
+// tag : V6_vsubuwsat_dv_128B
+def int_hexagon_V6_vsubuwsat_dv_128B :
+Hexagon_V62_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsubuwsat_dv_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vaddbsat,VI_ftype_VIVI,2)
+// tag : V6_vaddbsat
+def int_hexagon_V6_vaddbsat :
+Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vaddbsat">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vaddbsat_128B,VI_ftype_VIVI,2)
+// tag : V6_vaddbsat_128B
+def int_hexagon_V6_vaddbsat_128B :
+Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddbsat_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vaddbsat_dv,VD_ftype_VDVD,2)
+// tag : V6_vaddbsat_dv
+def int_hexagon_V6_vaddbsat_dv :
+Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddbsat_dv">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vaddbsat_dv_128B,VD_ftype_VDVD,2)
+// tag : V6_vaddbsat_dv_128B
+def int_hexagon_V6_vaddbsat_dv_128B :
+Hexagon_V62_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vaddbsat_dv_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vsubbsat,VI_ftype_VIVI,2)
+// tag : V6_vsubbsat
+def int_hexagon_V6_vsubbsat :
+Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vsubbsat">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vsubbsat_128B,VI_ftype_VIVI,2)
+// tag : V6_vsubbsat_128B
+def int_hexagon_V6_vsubbsat_128B :
+Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubbsat_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vsubbsat_dv,VD_ftype_VDVD,2)
+// tag : V6_vsubbsat_dv
+def int_hexagon_V6_vsubbsat_dv :
+Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubbsat_dv">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vsubbsat_dv_128B,VD_ftype_VDVD,2)
+// tag : V6_vsubbsat_dv_128B
+def int_hexagon_V6_vsubbsat_dv_128B :
+Hexagon_V62_v2048v2048v2048_Intrinsic<"HEXAGON_V6_vsubbsat_dv_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vaddububb_sat,VI_ftype_VIVI,2)
+// tag : V6_vaddububb_sat
+def int_hexagon_V6_vaddububb_sat :
+Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vaddububb_sat">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vaddububb_sat_128B,VI_ftype_VIVI,2)
+// tag : V6_vaddububb_sat_128B
+def int_hexagon_V6_vaddububb_sat_128B :
+Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddububb_sat_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vsubububb_sat,VI_ftype_VIVI,2)
+// tag : V6_vsubububb_sat
+def int_hexagon_V6_vsubububb_sat :
+Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vsubububb_sat">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vsubububb_sat_128B,VI_ftype_VIVI,2)
+// tag : V6_vsubububb_sat_128B
+def int_hexagon_V6_vsubububb_sat_128B :
+Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsubububb_sat_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vaddhw_acc,VD_ftype_VDVIVI,3)
+// tag : V6_vaddhw_acc
+def int_hexagon_V6_vaddhw_acc :
+Hexagon_V62_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vaddhw_acc">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vaddhw_acc_128B,VD_ftype_VDVIVI,3)
+// tag : V6_vaddhw_acc_128B
+def int_hexagon_V6_vaddhw_acc_128B :
+Hexagon_V62_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vaddhw_acc_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vadduhw_acc,VD_ftype_VDVIVI,3)
+// tag : V6_vadduhw_acc
+def int_hexagon_V6_vadduhw_acc :
+Hexagon_V62_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vadduhw_acc">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vadduhw_acc_128B,VD_ftype_VDVIVI,3)
+// tag : V6_vadduhw_acc_128B
+def int_hexagon_V6_vadduhw_acc_128B :
+Hexagon_V62_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vadduhw_acc_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vaddubh_acc,VD_ftype_VDVIVI,3)
+// tag : V6_vaddubh_acc
+def int_hexagon_V6_vaddubh_acc :
+Hexagon_V62_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vaddubh_acc">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vaddubh_acc_128B,VD_ftype_VDVIVI,3)
+// tag : V6_vaddubh_acc_128B
+def int_hexagon_V6_vaddubh_acc_128B :
+Hexagon_V62_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vaddubh_acc_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vmpyewuh_64,VD_ftype_VIVI,2)
+// tag : V6_vmpyewuh_64
+def int_hexagon_V6_vmpyewuh_64 :
+Hexagon_V62_v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyewuh_64">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vmpyewuh_64_128B,VD_ftype_VIVI,2)
+// tag : V6_vmpyewuh_64_128B
+def int_hexagon_V6_vmpyewuh_64_128B :
+Hexagon_V62_v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyewuh_64_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vmpyowh_64_acc,VD_ftype_VDVIVI,3)
+// tag : V6_vmpyowh_64_acc
+def int_hexagon_V6_vmpyowh_64_acc :
+Hexagon_V62_v1024v1024v512v512_Intrinsic<"HEXAGON_V6_vmpyowh_64_acc">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vmpyowh_64_acc_128B,VD_ftype_VDVIVI,3)
+// tag : V6_vmpyowh_64_acc_128B
+def int_hexagon_V6_vmpyowh_64_acc_128B :
+Hexagon_V62_v2048v2048v1024v1024_Intrinsic<"HEXAGON_V6_vmpyowh_64_acc_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vmpauhb,VD_ftype_VDSI,2)
+// tag : V6_vmpauhb
+def int_hexagon_V6_vmpauhb :
+Hexagon_V62_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpauhb">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vmpauhb_128B,VD_ftype_VDSI,2)
+// tag : V6_vmpauhb_128B
+def int_hexagon_V6_vmpauhb_128B :
+Hexagon_V62_v2048v2048i_Intrinsic<"HEXAGON_V6_vmpauhb_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vmpauhb_acc,VD_ftype_VDVDSI,3)
+// tag : V6_vmpauhb_acc
+def int_hexagon_V6_vmpauhb_acc :
+Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpauhb_acc">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vmpauhb_acc_128B,VD_ftype_VDVDSI,3)
+// tag : V6_vmpauhb_acc_128B
+def int_hexagon_V6_vmpauhb_acc_128B :
+Hexagon_V62_v2048v2048v2048i_Intrinsic<"HEXAGON_V6_vmpauhb_acc_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vmpyiwub,VI_ftype_VISI,2)
+// tag : V6_vmpyiwub
+def int_hexagon_V6_vmpyiwub :
+Hexagon_V62_v512v512i_Intrinsic<"HEXAGON_V6_vmpyiwub">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vmpyiwub_128B,VI_ftype_VISI,2)
+// tag : V6_vmpyiwub_128B
+def int_hexagon_V6_vmpyiwub_128B :
+Hexagon_V62_v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyiwub_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vmpyiwub_acc,VI_ftype_VIVISI,3)
+// tag : V6_vmpyiwub_acc
+def int_hexagon_V6_vmpyiwub_acc :
+Hexagon_V62_v512v512v512i_Intrinsic<"HEXAGON_V6_vmpyiwub_acc">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vmpyiwub_acc_128B,VI_ftype_VIVISI,3)
+// tag : V6_vmpyiwub_acc_128B
+def int_hexagon_V6_vmpyiwub_acc_128B :
+Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vmpyiwub_acc_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vandnqrt,VI_ftype_QVSI,2)
+// tag : V6_vandnqrt
+def int_hexagon_V6_vandnqrt :
+Hexagon_V62_v512v64ii_Intrinsic<"HEXAGON_V6_vandnqrt">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vandnqrt_128B,VI_ftype_QVSI,2)
+// tag : V6_vandnqrt_128B
+def int_hexagon_V6_vandnqrt_128B :
+Hexagon_V62_v1024v128ii_Intrinsic<"HEXAGON_V6_vandnqrt_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vandnqrt_acc,VI_ftype_VIQVSI,3)
+// tag : V6_vandnqrt_acc
+def int_hexagon_V6_vandnqrt_acc :
+Hexagon_V62_v512v512v64ii_Intrinsic<"HEXAGON_V6_vandnqrt_acc">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vandnqrt_acc_128B,VI_ftype_VIQVSI,3)
+// tag : V6_vandnqrt_acc_128B
+def int_hexagon_V6_vandnqrt_acc_128B :
+Hexagon_V62_v1024v1024v128ii_Intrinsic<"HEXAGON_V6_vandnqrt_acc_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vandvqv,VI_ftype_QVVI,2)
+// tag : V6_vandvqv
+def int_hexagon_V6_vandvqv :
+Hexagon_V62_v512v64iv512_Intrinsic<"HEXAGON_V6_vandvqv">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vandvqv_128B,VI_ftype_QVVI,2)
+// tag : V6_vandvqv_128B
+def int_hexagon_V6_vandvqv_128B :
+Hexagon_V62_v1024v128iv1024_Intrinsic<"HEXAGON_V6_vandvqv_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vandvnqv,VI_ftype_QVVI,2)
+// tag : V6_vandvnqv
+def int_hexagon_V6_vandvnqv :
+Hexagon_V62_v512v64iv512_Intrinsic<"HEXAGON_V6_vandvnqv">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vandvnqv_128B,VI_ftype_QVVI,2)
+// tag : V6_vandvnqv_128B
+def int_hexagon_V6_vandvnqv_128B :
+Hexagon_V62_v1024v128iv1024_Intrinsic<"HEXAGON_V6_vandvnqv_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_pred_scalar2v2,QV_ftype_SI,1)
+// tag : V6_pred_scalar2v2
+def int_hexagon_V6_pred_scalar2v2 :
+Hexagon_V62_v64ii_Intrinsic<"HEXAGON_V6_pred_scalar2v2">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_pred_scalar2v2_128B,QV_ftype_SI,1)
+// tag : V6_pred_scalar2v2_128B
+def int_hexagon_V6_pred_scalar2v2_128B :
+Hexagon_V62_v128ii_Intrinsic<"HEXAGON_V6_pred_scalar2v2_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_shuffeqw,QV_ftype_QVQV,2)
+// tag : V6_shuffeqw
+def int_hexagon_V6_shuffeqw :
+Hexagon_V62_v64iv64iv64i_Intrinsic<"HEXAGON_V6_shuffeqw">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_shuffeqw_128B,QV_ftype_QVQV,2)
+// tag : V6_shuffeqw_128B
+def int_hexagon_V6_shuffeqw_128B :
+Hexagon_V62_v128iv128iv128i_Intrinsic<"HEXAGON_V6_shuffeqw_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_shuffeqh,QV_ftype_QVQV,2)
+// tag : V6_shuffeqh
+def int_hexagon_V6_shuffeqh :
+Hexagon_V62_v64iv64iv64i_Intrinsic<"HEXAGON_V6_shuffeqh">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_shuffeqh_128B,QV_ftype_QVQV,2)
+// tag : V6_shuffeqh_128B
+def int_hexagon_V6_shuffeqh_128B :
+Hexagon_V62_v128iv128iv128i_Intrinsic<"HEXAGON_V6_shuffeqh_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vmaxb,VI_ftype_VIVI,2)
+// tag : V6_vmaxb
+def int_hexagon_V6_vmaxb :
+Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vmaxb">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vmaxb_128B,VI_ftype_VIVI,2)
+// tag : V6_vmaxb_128B
+def int_hexagon_V6_vmaxb_128B :
+Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vmaxb_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vminb,VI_ftype_VIVI,2)
+// tag : V6_vminb
+def int_hexagon_V6_vminb :
+Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vminb">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vminb_128B,VI_ftype_VIVI,2)
+// tag : V6_vminb_128B
+def int_hexagon_V6_vminb_128B :
+Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vminb_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vsatuwuh,VI_ftype_VIVI,2)
+// tag : V6_vsatuwuh
+def int_hexagon_V6_vsatuwuh :
+Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vsatuwuh">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vsatuwuh_128B,VI_ftype_VIVI,2)
+// tag : V6_vsatuwuh_128B
+def int_hexagon_V6_vsatuwuh_128B :
+Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vsatuwuh_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_lvsplath,VI_ftype_SI,1)
+// tag : V6_lvsplath
+def int_hexagon_V6_lvsplath :
+Hexagon_V62_v512i_Intrinsic<"HEXAGON_V6_lvsplath">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_lvsplath_128B,VI_ftype_SI,1)
+// tag : V6_lvsplath_128B
+def int_hexagon_V6_lvsplath_128B :
+Hexagon_V62_v1024i_Intrinsic<"HEXAGON_V6_lvsplath_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_lvsplatb,VI_ftype_SI,1)
+// tag : V6_lvsplatb
+def int_hexagon_V6_lvsplatb :
+Hexagon_V62_v512i_Intrinsic<"HEXAGON_V6_lvsplatb">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_lvsplatb_128B,VI_ftype_SI,1)
+// tag : V6_lvsplatb_128B
+def int_hexagon_V6_lvsplatb_128B :
+Hexagon_V62_v1024i_Intrinsic<"HEXAGON_V6_lvsplatb_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vaddclbw,VI_ftype_VIVI,2)
+// tag : V6_vaddclbw
+def int_hexagon_V6_vaddclbw :
+Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vaddclbw">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vaddclbw_128B,VI_ftype_VIVI,2)
+// tag : V6_vaddclbw_128B
+def int_hexagon_V6_vaddclbw_128B :
+Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddclbw_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vaddclbh,VI_ftype_VIVI,2)
+// tag : V6_vaddclbh
+def int_hexagon_V6_vaddclbh :
+Hexagon_V62_v512v512v512_Intrinsic<"HEXAGON_V6_vaddclbh">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vaddclbh_128B,VI_ftype_VIVI,2)
+// tag : V6_vaddclbh_128B
+def int_hexagon_V6_vaddclbh_128B :
+Hexagon_V62_v1024v1024v1024_Intrinsic<"HEXAGON_V6_vaddclbh_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vlutvvbi,VI_ftype_VIVISI,3)
+// tag : V6_vlutvvbi
+def int_hexagon_V6_vlutvvbi :
+Hexagon_V62_v512v512v512i_Intrinsic<"HEXAGON_V6_vlutvvbi">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vlutvvbi_128B,VI_ftype_VIVISI,3)
+// tag : V6_vlutvvbi_128B
+def int_hexagon_V6_vlutvvbi_128B :
+Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvvbi_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vlutvvb_oracci,VI_ftype_VIVIVISI,4)
+// tag : V6_vlutvvb_oracci
+def int_hexagon_V6_vlutvvb_oracci :
+Hexagon_V62_v512v512v512v512i_Intrinsic<"HEXAGON_V6_vlutvvb_oracci">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vlutvvb_oracci_128B,VI_ftype_VIVIVISI,4)
+// tag : V6_vlutvvb_oracci_128B
+def int_hexagon_V6_vlutvvb_oracci_128B :
+Hexagon_V62_v1024v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvvb_oracci_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vlutvwhi,VD_ftype_VIVISI,3)
+// tag : V6_vlutvwhi
+def int_hexagon_V6_vlutvwhi :
+Hexagon_V62_v1024v512v512i_Intrinsic<"HEXAGON_V6_vlutvwhi">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vlutvwhi_128B,VD_ftype_VIVISI,3)
+// tag : V6_vlutvwhi_128B
+def int_hexagon_V6_vlutvwhi_128B :
+Hexagon_V62_v2048v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvwhi_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vlutvwh_oracci,VD_ftype_VDVIVISI,4)
+// tag : V6_vlutvwh_oracci
+def int_hexagon_V6_vlutvwh_oracci :
+Hexagon_V62_v1024v1024v512v512i_Intrinsic<"HEXAGON_V6_vlutvwh_oracci">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vlutvwh_oracci_128B,VD_ftype_VDVIVISI,4)
+// tag : V6_vlutvwh_oracci_128B
+def int_hexagon_V6_vlutvwh_oracci_128B :
+Hexagon_V62_v2048v2048v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvwh_oracci_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vlutvvb_nm,VI_ftype_VIVISI,3)
+// tag : V6_vlutvvb_nm
+def int_hexagon_V6_vlutvvb_nm :
+Hexagon_V62_v512v512v512i_Intrinsic<"HEXAGON_V6_vlutvvb_nm">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vlutvvb_nm_128B,VI_ftype_VIVISI,3)
+// tag : V6_vlutvvb_nm_128B
+def int_hexagon_V6_vlutvvb_nm_128B :
+Hexagon_V62_v1024v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvvb_nm_128B">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vlutvwh_nm,VD_ftype_VIVISI,3)
+// tag : V6_vlutvwh_nm
+def int_hexagon_V6_vlutvwh_nm :
+Hexagon_V62_v1024v512v512i_Intrinsic<"HEXAGON_V6_vlutvwh_nm">;
+
+//
+// BUILTIN_INFO(HEXAGON.V6_vlutvwh_nm_128B,VD_ftype_VIVISI,3)
+// tag : V6_vlutvwh_nm_128B
+def int_hexagon_V6_vlutvwh_nm_128B :
+Hexagon_V62_v2048v1024v1024i_Intrinsic<"HEXAGON_V6_vlutvwh_nm_128B">;
+
diff --git a/include/llvm/IR/IntrinsicsX86.td b/include/llvm/IR/IntrinsicsX86.td
index 85966af9c820d27fd8f46b9b00298e72a784d6df..d3cce634479893136a941e8609142e8ca2ab8770 100644
--- a/include/llvm/IR/IntrinsicsX86.td
+++ b/include/llvm/IR/IntrinsicsX86.td
@@ -3033,17 +3033,6 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_xop_vfrcz_ps_256 : GCCBuiltin<"__builtin_ia32_vfrczps256">,
               Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
 
-  def int_x86_xop_vpcmov :
-              GCCBuiltin<"__builtin_ia32_vpcmov">,
-              Intrinsic<[llvm_v2i64_ty],
-                        [llvm_v2i64_ty, llvm_v2i64_ty, llvm_v2i64_ty],
-                        [IntrNoMem]>;
-  def int_x86_xop_vpcmov_256 :
-              GCCBuiltin<"__builtin_ia32_vpcmov_256">,
-              Intrinsic<[llvm_v4i64_ty],
-                        [llvm_v4i64_ty, llvm_v4i64_ty, llvm_v4i64_ty],
-                        [IntrNoMem]>;
-
   def int_x86_xop_vpcomb : GCCBuiltin<"__builtin_ia32_vpcomb">,
               Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty, llvm_v16i8_ty,
                          llvm_i8_ty], [IntrNoMem]>;
@@ -3881,74 +3870,22 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx512_cvtq2mask_512 : GCCBuiltin<"__builtin_ia32_cvtq2mask512">,
               Intrinsic<[llvm_i8_ty], [llvm_v8i64_ty], [IntrNoMem]>;
 
-  def int_x86_avx512_cvtmask2b_128 : GCCBuiltin<"__builtin_ia32_cvtmask2b128">,
-              Intrinsic<[llvm_v16i8_ty], [llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_cvtmask2b_256 : GCCBuiltin<"__builtin_ia32_cvtmask2b256">,
-              Intrinsic<[llvm_v32i8_ty], [llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_cvtmask2b_512 : GCCBuiltin<"__builtin_ia32_cvtmask2b512">,
-              Intrinsic<[llvm_v64i8_ty], [llvm_i64_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_cvtmask2w_128 : GCCBuiltin<"__builtin_ia32_cvtmask2w128">,
-              Intrinsic<[llvm_v8i16_ty], [llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_cvtmask2w_256 : GCCBuiltin<"__builtin_ia32_cvtmask2w256">,
-              Intrinsic<[llvm_v16i16_ty], [llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_cvtmask2w_512 : GCCBuiltin<"__builtin_ia32_cvtmask2w512">,
-              Intrinsic<[llvm_v32i16_ty], [llvm_i32_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_cvtmask2d_128 : GCCBuiltin<"__builtin_ia32_cvtmask2d128">,
-              Intrinsic<[llvm_v4i32_ty], [llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_cvtmask2d_256 : GCCBuiltin<"__builtin_ia32_cvtmask2d256">,
-              Intrinsic<[llvm_v8i32_ty], [llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_cvtmask2d_512 : GCCBuiltin<"__builtin_ia32_cvtmask2d512">,
-              Intrinsic<[llvm_v16i32_ty], [llvm_i16_ty], [IntrNoMem]>;
-
-  def int_x86_avx512_cvtmask2q_128 : GCCBuiltin<"__builtin_ia32_cvtmask2q128">,
-              Intrinsic<[llvm_v2i64_ty], [llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_cvtmask2q_256 : GCCBuiltin<"__builtin_ia32_cvtmask2q256">,
-              Intrinsic<[llvm_v4i64_ty], [llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_cvtmask2q_512 : GCCBuiltin<"__builtin_ia32_cvtmask2q512">,
-              Intrinsic<[llvm_v8i64_ty], [llvm_i8_ty], [IntrNoMem]>;
-
 }
 
 // Pack ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx512_mask_packsswb_128 : GCCBuiltin<"__builtin_ia32_packsswb128_mask">,
-              Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
-                         llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_packsswb_256 : GCCBuiltin<"__builtin_ia32_packsswb256_mask">,
-              Intrinsic<[llvm_v32i8_ty], [llvm_v16i16_ty,llvm_v16i16_ty,
-                         llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_packsswb_512 : GCCBuiltin<"__builtin_ia32_packsswb512_mask">,
-              Intrinsic<[llvm_v64i8_ty], [llvm_v32i16_ty,llvm_v32i16_ty,
-                         llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_packssdw_128 : GCCBuiltin<"__builtin_ia32_packssdw128_mask">,
-              Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
-                         llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_packssdw_256 : GCCBuiltin<"__builtin_ia32_packssdw256_mask">,
-              Intrinsic<[llvm_v16i16_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
-                         llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_packssdw_512 : GCCBuiltin<"__builtin_ia32_packssdw512_mask">,
-              Intrinsic<[llvm_v32i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
-                         llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_packuswb_128 : GCCBuiltin<"__builtin_ia32_packuswb128_mask">,
-              Intrinsic<[llvm_v16i8_ty], [llvm_v8i16_ty, llvm_v8i16_ty,
-                         llvm_v16i8_ty, llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_packuswb_256 : GCCBuiltin<"__builtin_ia32_packuswb256_mask">,
-              Intrinsic<[llvm_v32i8_ty], [llvm_v16i16_ty,llvm_v16i16_ty,
-                         llvm_v32i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_packuswb_512 : GCCBuiltin<"__builtin_ia32_packuswb512_mask">,
-              Intrinsic<[llvm_v64i8_ty], [llvm_v32i16_ty,llvm_v32i16_ty,
-                         llvm_v64i8_ty, llvm_i64_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_packusdw_128 : GCCBuiltin<"__builtin_ia32_packusdw128_mask">,
-              Intrinsic<[llvm_v8i16_ty], [llvm_v4i32_ty, llvm_v4i32_ty,
-                         llvm_v8i16_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_packusdw_256 : GCCBuiltin<"__builtin_ia32_packusdw256_mask">,
-              Intrinsic<[llvm_v16i16_ty], [llvm_v8i32_ty, llvm_v8i32_ty,
-                         llvm_v16i16_ty, llvm_i16_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_packusdw_512 : GCCBuiltin<"__builtin_ia32_packusdw512_mask">,
-              Intrinsic<[llvm_v32i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty,
-                         llvm_v32i16_ty, llvm_i32_ty], [IntrNoMem]>;
+  def int_x86_avx512_packsswb_512 : GCCBuiltin<"__builtin_ia32_packsswb512">,
+              Intrinsic<[llvm_v64i8_ty], [llvm_v32i16_ty, llvm_v32i16_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx512_packssdw_512 : GCCBuiltin<"__builtin_ia32_packssdw512">,
+              Intrinsic<[llvm_v32i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx512_packuswb_512 : GCCBuiltin<"__builtin_ia32_packuswb512">,
+              Intrinsic<[llvm_v64i8_ty], [llvm_v32i16_ty, llvm_v32i16_ty],
+                        [IntrNoMem]>;
+  def int_x86_avx512_packusdw_512 : GCCBuiltin<"__builtin_ia32_packusdw512">,
+              Intrinsic<[llvm_v32i16_ty], [llvm_v16i32_ty, llvm_v16i32_ty],
+                        [IntrNoMem]>;
 }
 
 // Vector convert
@@ -4595,39 +4532,15 @@ let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
   def int_x86_avx512_mask_div_pd_512 : GCCBuiltin<"__builtin_ia32_divpd512_mask">,
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
                      llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_max_ps_128 : GCCBuiltin<"__builtin_ia32_maxps_mask">,
-          Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
-                     llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_max_ps_256 : GCCBuiltin<"__builtin_ia32_maxps256_mask">,
-          Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
-                     llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_max_ps_512 : GCCBuiltin<"__builtin_ia32_maxps512_mask">,
           Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
                      llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_max_pd_128 : GCCBuiltin<"__builtin_ia32_maxpd_mask">,
-          Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
-                     llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_max_pd_256 : GCCBuiltin<"__builtin_ia32_maxpd256_mask">,
-          Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
-                     llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_max_pd_512 : GCCBuiltin<"__builtin_ia32_maxpd512_mask">,
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
                      llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_min_ps_128 : GCCBuiltin<"__builtin_ia32_minps_mask">,
-          Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
-                     llvm_v4f32_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_min_ps_256 : GCCBuiltin<"__builtin_ia32_minps256_mask">,
-          Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
-                     llvm_v8f32_ty, llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_min_ps_512 : GCCBuiltin<"__builtin_ia32_minps512_mask">,
           Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
                      llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_min_pd_128 : GCCBuiltin<"__builtin_ia32_minpd_mask">,
-          Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
-                     llvm_v2f64_ty, llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_min_pd_256 : GCCBuiltin<"__builtin_ia32_minpd256_mask">,
-          Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
-                     llvm_v4f64_ty, llvm_i8_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_min_pd_512 : GCCBuiltin<"__builtin_ia32_minpd512_mask">,
           Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
                      llvm_v8f64_ty, llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
@@ -5481,32 +5394,6 @@ let TargetPrefix = "x86" in {
           Intrinsic<[llvm_v8i64_ty],
                     [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
                     [IntrNoMem]>;
-
-  def int_x86_avx512_mask_lzcnt_d_128 :
-          Intrinsic<[llvm_v4i32_ty],
-                    [llvm_v4i32_ty, llvm_v4i32_ty, llvm_i8_ty],
-                    [IntrNoMem]>;
-  def int_x86_avx512_mask_lzcnt_d_256 :
-          Intrinsic<[llvm_v8i32_ty],
-                    [llvm_v8i32_ty, llvm_v8i32_ty, llvm_i8_ty],
-                    [IntrNoMem]>;
-  def int_x86_avx512_mask_lzcnt_d_512 :
-          Intrinsic<[llvm_v16i32_ty],
-                    [llvm_v16i32_ty, llvm_v16i32_ty, llvm_i16_ty],
-                    [IntrNoMem]>;
-
-  def int_x86_avx512_mask_lzcnt_q_128 :
-          Intrinsic<[llvm_v2i64_ty],
-                    [llvm_v2i64_ty, llvm_v2i64_ty, llvm_i8_ty],
-                    [IntrNoMem]>;
- def int_x86_avx512_mask_lzcnt_q_256 :
-          Intrinsic<[llvm_v4i64_ty],
-                    [llvm_v4i64_ty, llvm_v4i64_ty, llvm_i8_ty],
-                    [IntrNoMem]>;
-  def int_x86_avx512_mask_lzcnt_q_512 :
-          Intrinsic<[llvm_v8i64_ty],
-                    [llvm_v8i64_ty, llvm_v8i64_ty, llvm_i8_ty],
-                    [IntrNoMem]>;
 }
 
 // Compares
@@ -6495,3 +6382,10 @@ let TargetPrefix = "x86" in {
       : GCCBuiltin<"__builtin_ia32_mwaitx">,
         Intrinsic<[], [ llvm_i32_ty, llvm_i32_ty, llvm_i32_ty ], []>;
 }
+
+//===----------------------------------------------------------------------===//
+// Cache-line zero (clzero): takes one pointer operand and returns no value.
+let TargetPrefix = "x86" in {
+  def int_x86_clzero : GCCBuiltin<"__builtin_ia32_clzero">,
+      Intrinsic<[], [llvm_ptr_ty], []>;
+}
diff --git a/include/llvm/IR/LLVMContext.h b/include/llvm/IR/LLVMContext.h
index 7f43d5df3c3f8f7e88a439d79033e1dc09530ca9..d13d5ddaeb3c6665f7f5873896ee299a0a8f2b47 100644
--- a/include/llvm/IR/LLVMContext.h
+++ b/include/llvm/IR/LLVMContext.h
@@ -78,6 +78,7 @@ public:
     MD_type = 19,                     // "type"
     MD_section_prefix = 20,           // "section_prefix"
     MD_absolute_symbol = 21,          // "absolute_symbol"
+    MD_associated = 22,               // "associated"
   };
 
   /// Known operand bundle tag IDs, which always have the same value.  All
diff --git a/include/llvm/IR/MDBuilder.h b/include/llvm/IR/MDBuilder.h
index bab8728ed49f979e41ffbed3073e5b6384493ee9..899976a87bc7e7762659da22242668631f591c0b 100644
--- a/include/llvm/IR/MDBuilder.h
+++ b/include/llvm/IR/MDBuilder.h
@@ -15,7 +15,9 @@
 #ifndef LLVM_IR_MDBUILDER_H
 #define LLVM_IR_MDBUILDER_H
 
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/Support/DataTypes.h"
 #include <utility>
 
@@ -63,8 +65,11 @@ public:
   /// Return metadata specifying that a branch or switch is unpredictable.
   MDNode *createUnpredictable();
 
-  /// Return metadata containing the entry count for a function.
-  MDNode *createFunctionEntryCount(uint64_t Count);
+  /// Return metadata containing the entry \p Count for a function, and the
+  /// GUIDs stored in \p Imports that need to be imported for sample PGO, to
+  /// enable the same inlines as the profiled optimized binary.
+  MDNode *createFunctionEntryCount(uint64_t Count,
+                                   const DenseSet<GlobalValue::GUID> *Imports);
 
   /// Return metadata containing the section prefix for a function.
   MDNode *createFunctionSectionPrefix(StringRef Prefix);
diff --git a/include/llvm/IR/Mangler.h b/include/llvm/IR/Mangler.h
index 0eb91a3b0600e9aeb89834a6303995c8da27080a..56ee21392ccd6c2cf1124fce8fd9e3d4456cf6bb 100644
--- a/include/llvm/IR/Mangler.h
+++ b/include/llvm/IR/Mangler.h
@@ -21,6 +21,7 @@ namespace llvm {
 
 class DataLayout;
 template <typename T> class SmallVectorImpl;
+class Triple;
 class Twine;
 class raw_ostream;
 
@@ -46,6 +47,9 @@ public:
                                 const Twine &GVName, const DataLayout &DL);
 };
 
+void emitLinkerFlagsForGlobalCOFF(raw_ostream &OS, const GlobalValue *GV,
+                                  const Triple &TT, Mangler &Mangler);
+
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/IR/Metadata.h b/include/llvm/IR/Metadata.h
index 46c785a1c05d2633328d74880da485c00ac9c8f9..0647e4253d3cf63c086ab00d40797f5db4956b3e 100644
--- a/include/llvm/IR/Metadata.h
+++ b/include/llvm/IR/Metadata.h
@@ -269,12 +269,11 @@ public:
 
 private:
   LLVMContext &Context;
-  uint64_t NextIndex;
+  uint64_t NextIndex = 0;
   SmallDenseMap<void *, std::pair<OwnerTy, uint64_t>, 4> UseMap;
 
 public:
-  ReplaceableMetadataImpl(LLVMContext &Context)
-      : Context(Context), NextIndex(0) {}
+  ReplaceableMetadataImpl(LLVMContext &Context) : Context(Context) {}
 
   ~ReplaceableMetadataImpl() {
     assert(UseMap.empty() && "Cannot destroy in-use replaceable metadata");
@@ -586,8 +585,9 @@ dyn_extract_or_null(Y &&MD) {
 class MDString : public Metadata {
   friend class StringMapEntry<MDString>;
 
-  StringMapEntry<MDString> *Entry;
-  MDString() : Metadata(MDStringKind, Uniqued), Entry(nullptr) {}
+  StringMapEntry<MDString> *Entry = nullptr;
+
+  MDString() : Metadata(MDStringKind, Uniqued) {}
 
 public:
   MDString(const MDString &) = delete;
@@ -1062,7 +1062,6 @@ public:
   static MDNode *getMostGenericRange(MDNode *A, MDNode *B);
   static MDNode *getMostGenericAliasScope(MDNode *A, MDNode *B);
   static MDNode *getMostGenericAlignmentOrDereferenceable(MDNode *A, MDNode *B);
-
 };
 
 /// \brief Tuple of metadata.
@@ -1284,7 +1283,7 @@ class NamedMDNode : public ilist_node<NamedMDNode> {
   friend class Module;
 
   std::string Name;
-  Module *Parent;
+  Module *Parent = nullptr;
   void *Operands; // SmallVector<TrackingMDRef, 4>
 
   void setParent(Module *M) { Parent = M; }
diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h
index 79870b9455a6e94a2e5676b795711682ac0fbfcc..70c57cf90addcb022df17c63e43b1a2a90bb8f93 100644
--- a/include/llvm/IR/Module.h
+++ b/include/llvm/IR/Module.h
@@ -311,7 +311,7 @@ public:
   ///   4. Finally, the function exists but has the wrong prototype: return the
   ///      function with a constantexpr cast to the right prototype.
   Constant *getOrInsertFunction(StringRef Name, FunctionType *T,
-                                AttributeSet AttributeList);
+                                AttributeList AttributeList);
 
   Constant *getOrInsertFunction(StringRef Name, FunctionType *T);
 
@@ -321,13 +321,22 @@ public:
   /// or a ConstantExpr BitCast of that type if the named function has a
   /// different type. This version of the method takes a null terminated list of
   /// function arguments, which makes it easier for clients to use.
+  template<typename... ArgsTy>
   Constant *getOrInsertFunction(StringRef Name,
-                                AttributeSet AttributeList,
-                                Type *RetTy, ...) LLVM_END_WITH_NULL;
+                                AttributeList AttributeList,
+                                Type *RetTy, ArgsTy... Args)
+  {
+    SmallVector<Type*, sizeof...(ArgsTy)> ArgTys{Args...};
+    return getOrInsertFunction(Name,
+                               FunctionType::get(RetTy, ArgTys, false),
+                               AttributeList);
+  }
 
   /// Same as above, but without the attributes.
-  Constant *getOrInsertFunction(StringRef Name, Type *RetTy, ...)
-    LLVM_END_WITH_NULL;
+  template<typename... ArgsTy>
+  Constant *getOrInsertFunction(StringRef Name, Type *RetTy, ArgsTy... Args) {
+    return getOrInsertFunction(Name, AttributeList{}, RetTy, Args...);
+  }
 
   /// Look up the specified function in the module symbol table. If it does not
   /// exist, return null.
@@ -345,20 +354,23 @@ public:
     return getGlobalVariable(Name, false);
   }
 
-  GlobalVariable *getGlobalVariable(StringRef Name, bool AllowInternal) const {
-    return const_cast<Module *>(this)->getGlobalVariable(Name, AllowInternal);
-  }
+  GlobalVariable *getGlobalVariable(StringRef Name, bool AllowInternal) const;
 
-  GlobalVariable *getGlobalVariable(StringRef Name, bool AllowInternal = false);
+  GlobalVariable *getGlobalVariable(StringRef Name,
+                                    bool AllowInternal = false) {
+    return static_cast<const Module *>(this)->getGlobalVariable(Name,
+                                                                AllowInternal);
+  }
 
   /// Return the global variable in the module with the specified name, of
   /// arbitrary type. This method returns null if a global with the specified
   /// name is not found.
-  GlobalVariable *getNamedGlobal(StringRef Name) {
+  const GlobalVariable *getNamedGlobal(StringRef Name) const {
     return getGlobalVariable(Name, true);
   }
-  const GlobalVariable *getNamedGlobal(StringRef Name) const {
-    return const_cast<Module *>(this)->getNamedGlobal(Name);
+  GlobalVariable *getNamedGlobal(StringRef Name) {
+    return const_cast<GlobalVariable *>(
+                       static_cast<const Module *>(this)->getNamedGlobal(Name));
   }
 
   /// Look up the specified global in the module symbol table.
@@ -615,6 +627,32 @@ public:
     return global_objects().end();
   }
 
+  typedef concat_iterator<GlobalValue, iterator, global_iterator,
+                          alias_iterator, ifunc_iterator>
+      global_value_iterator;
+  typedef concat_iterator<const GlobalValue, const_iterator,
+                          const_global_iterator, const_alias_iterator,
+                          const_ifunc_iterator>
+      const_global_value_iterator;
+
+  iterator_range<global_value_iterator> global_values() {
+    return concat<GlobalValue>(functions(), globals(), aliases(), ifuncs());
+  }
+  iterator_range<const_global_value_iterator> global_values() const {
+    return concat<const GlobalValue>(functions(), globals(), aliases(),
+                                     ifuncs());
+  }
+
+  global_value_iterator global_value_begin() { return global_values().begin(); }
+  global_value_iterator global_value_end() { return global_values().end(); }
+
+  const_global_value_iterator global_value_begin() const {
+    return global_values().begin();
+  }
+  const_global_value_iterator global_value_end() const {
+    return global_values().end();
+  }
+
   /// @}
   /// @name Named Metadata Iteration
   /// @{
@@ -726,6 +764,10 @@ public:
 /// @name Utility functions for querying Debug information.
 /// @{
 
+  /// \brief Returns the number of register parameters by checking
+  /// module flags.
+  unsigned getNumberRegisterParameters() const;
+
   /// \brief Returns the Dwarf Version by checking module flags.
   unsigned getDwarfVersion() const;
 
diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h
index c710c41cccd08cbd716cbe0d035bb2bacadddcd1..09f6c1897009529392c6bd6e114d09825e3c9a8a 100644
--- a/include/llvm/IR/ModuleSummaryIndex.h
+++ b/include/llvm/IR/ModuleSummaryIndex.h
@@ -162,7 +162,7 @@ private:
 protected:
   /// GlobalValueSummary constructor.
   GlobalValueSummary(SummaryKind K, GVFlags Flags, std::vector<ValueInfo> Refs)
-      : Kind(K), Flags(Flags), RefEdgeList(std::move(Refs)) {}
+      : Kind(K), Flags(Flags), OriginalName(0), RefEdgeList(std::move(Refs)) {}
 
 public:
   virtual ~GlobalValueSummary() = default;
@@ -233,12 +233,13 @@ public:
   void setAliasee(GlobalValueSummary *Aliasee) { AliaseeSummary = Aliasee; }
 
   const GlobalValueSummary &getAliasee() const {
-    return const_cast<AliasSummary *>(this)->getAliasee();
+    assert(AliaseeSummary && "Unexpected missing aliasee summary");
+    return *AliaseeSummary;
   }
 
   GlobalValueSummary &getAliasee() {
-    assert(AliaseeSummary && "Unexpected missing aliasee summary");
-    return *AliaseeSummary;
+    return const_cast<GlobalValueSummary &>(
+                         static_cast<const AliasSummary *>(this)->getAliasee());
   }
 };
 
@@ -249,6 +250,23 @@ public:
   /// <CalleeValueInfo, CalleeInfo> call edge pair.
   typedef std::pair<ValueInfo, CalleeInfo> EdgeTy;
 
+  /// An "identifier" for a virtual function. This contains the type identifier
+  /// represented as a GUID and the offset from the address point to the virtual
+  /// function pointer, where "address point" is as defined in the Itanium ABI:
+  /// https://mentorembedded.github.io/cxx-abi/abi.html#vtable-general
+  struct VFuncId {
+    GlobalValue::GUID GUID;
+    uint64_t Offset;
+  };
+
+  /// A specification for a virtual function call with all constant integer
+  /// arguments. This is used to perform virtual constant propagation on the
+  /// summary.
+  struct ConstVCall {
+    VFuncId VFunc;
+    std::vector<uint64_t> Args;
+  };
+
 private:
   /// Number of instructions (ignoring debug instructions, e.g.) computed
   /// during the initial compile step when the summary index is first built.
@@ -257,17 +275,47 @@ private:
   /// List of <CalleeValueInfo, CalleeInfo> call edge pairs from this function.
   std::vector<EdgeTy> CallGraphEdgeList;
 
-  /// List of type identifiers used by this function, represented as GUIDs.
-  std::vector<GlobalValue::GUID> TypeIdList;
+  /// All type identifier related information. Because these fields are
+  /// relatively uncommon we only allocate space for them if necessary.
+  struct TypeIdInfo {
+    /// List of type identifiers used by this function in llvm.type.test
+    /// intrinsics other than by an llvm.assume intrinsic, represented as GUIDs.
+    std::vector<GlobalValue::GUID> TypeTests;
+
+    /// List of virtual calls made by this function using (respectively)
+    /// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics that do
+    /// not have all constant integer arguments.
+    std::vector<VFuncId> TypeTestAssumeVCalls, TypeCheckedLoadVCalls;
+
+    /// List of virtual calls made by this function using (respectively)
+    /// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics with
+    /// all constant integer arguments.
+    std::vector<ConstVCall> TypeTestAssumeConstVCalls,
+        TypeCheckedLoadConstVCalls;
+  };
+
+  std::unique_ptr<TypeIdInfo> TIdInfo;
 
 public:
   /// Summary constructors.
   FunctionSummary(GVFlags Flags, unsigned NumInsts, std::vector<ValueInfo> Refs,
                   std::vector<EdgeTy> CGEdges,
-                  std::vector<GlobalValue::GUID> TypeIds)
+                  std::vector<GlobalValue::GUID> TypeTests,
+                  std::vector<VFuncId> TypeTestAssumeVCalls,
+                  std::vector<VFuncId> TypeCheckedLoadVCalls,
+                  std::vector<ConstVCall> TypeTestAssumeConstVCalls,
+                  std::vector<ConstVCall> TypeCheckedLoadConstVCalls)
       : GlobalValueSummary(FunctionKind, Flags, std::move(Refs)),
-        InstCount(NumInsts), CallGraphEdgeList(std::move(CGEdges)),
-        TypeIdList(std::move(TypeIds)) {}
+        InstCount(NumInsts), CallGraphEdgeList(std::move(CGEdges)) {
+    if (!TypeTests.empty() || !TypeTestAssumeVCalls.empty() ||
+        !TypeCheckedLoadVCalls.empty() || !TypeTestAssumeConstVCalls.empty() ||
+        !TypeCheckedLoadConstVCalls.empty())
+      TIdInfo = llvm::make_unique<TypeIdInfo>(TypeIdInfo{
+          std::move(TypeTests), std::move(TypeTestAssumeVCalls),
+          std::move(TypeCheckedLoadVCalls),
+          std::move(TypeTestAssumeConstVCalls),
+          std::move(TypeCheckedLoadConstVCalls)});
+  }
 
   /// Check if this is a function summary.
   static bool classof(const GlobalValueSummary *GVS) {
@@ -280,8 +328,85 @@ public:
   /// Return the list of <CalleeValueInfo, CalleeInfo> pairs.
   ArrayRef<EdgeTy> calls() const { return CallGraphEdgeList; }
 
-  /// Returns the list of type identifiers used by this function.
-  ArrayRef<GlobalValue::GUID> type_tests() const { return TypeIdList; }
+  /// Returns the list of type identifiers used by this function in
+  /// llvm.type.test intrinsics other than by an llvm.assume intrinsic,
+  /// represented as GUIDs.
+  ArrayRef<GlobalValue::GUID> type_tests() const {
+    if (TIdInfo)
+      return TIdInfo->TypeTests;
+    return {};
+  }
+
+  /// Returns the list of virtual calls made by this function using
+  /// llvm.assume(llvm.type.test) intrinsics that do not have all constant
+  /// integer arguments.
+  ArrayRef<VFuncId> type_test_assume_vcalls() const {
+    if (TIdInfo)
+      return TIdInfo->TypeTestAssumeVCalls;
+    return {};
+  }
+
+  /// Returns the list of virtual calls made by this function using
+  /// llvm.type.checked.load intrinsics that do not have all constant integer
+  /// arguments.
+  ArrayRef<VFuncId> type_checked_load_vcalls() const {
+    if (TIdInfo)
+      return TIdInfo->TypeCheckedLoadVCalls;
+    return {};
+  }
+
+  /// Returns the list of virtual calls made by this function using
+  /// llvm.assume(llvm.type.test) intrinsics with all constant integer
+  /// arguments.
+  ArrayRef<ConstVCall> type_test_assume_const_vcalls() const {
+    if (TIdInfo)
+      return TIdInfo->TypeTestAssumeConstVCalls;
+    return {};
+  }
+
+  /// Returns the list of virtual calls made by this function using
+  /// llvm.type.checked.load intrinsics with all constant integer arguments.
+  ArrayRef<ConstVCall> type_checked_load_const_vcalls() const {
+    if (TIdInfo)
+      return TIdInfo->TypeCheckedLoadConstVCalls;
+    return {};
+  }
+
+  /// Add a type test to the summary. This is used by WholeProgramDevirt if we
+  /// were unable to devirtualize a checked call.
+  void addTypeTest(GlobalValue::GUID Guid) {
+    if (!TIdInfo)
+      TIdInfo = llvm::make_unique<TypeIdInfo>();
+    TIdInfo->TypeTests.push_back(Guid);
+  }
+};
+
+template <> struct DenseMapInfo<FunctionSummary::VFuncId> {
+  static FunctionSummary::VFuncId getEmptyKey() { return {0, uint64_t(-1)}; }
+  static FunctionSummary::VFuncId getTombstoneKey() {
+    return {0, uint64_t(-2)};
+  }
+  static bool isEqual(FunctionSummary::VFuncId L, FunctionSummary::VFuncId R) {
+    return L.GUID == R.GUID && L.Offset == R.Offset;
+  }
+  static unsigned getHashValue(FunctionSummary::VFuncId I) { return I.GUID; }
+};
+
+template <> struct DenseMapInfo<FunctionSummary::ConstVCall> {
+  static FunctionSummary::ConstVCall getEmptyKey() {
+    return {{0, uint64_t(-1)}, {}};
+  }
+  static FunctionSummary::ConstVCall getTombstoneKey() {
+    return {{0, uint64_t(-2)}, {}};
+  }
+  static bool isEqual(FunctionSummary::ConstVCall L,
+                      FunctionSummary::ConstVCall R) {
+    return DenseMapInfo<FunctionSummary::VFuncId>::isEqual(L.VFunc, R.VFunc) &&
+           L.Args == R.Args;
+  }
+  static unsigned getHashValue(FunctionSummary::ConstVCall I) {
+    return I.VFunc.GUID;
+  }
 };
 
 /// \brief Global variable summary information to aid decisions and
@@ -323,8 +448,40 @@ struct TypeTestResolution {
   unsigned SizeM1BitWidth = 0;
 };
 
+struct WholeProgramDevirtResolution {
+  enum Kind {
+    Indir,      ///< Just do a regular virtual call
+    SingleImpl, ///< Single implementation devirtualization
+  } TheKind = Indir;
+
+  std::string SingleImplName;
+
+  struct ByArg {
+    enum Kind {
+      Indir,            ///< Just do a regular virtual call
+      UniformRetVal,    ///< Uniform return value optimization
+      UniqueRetVal,     ///< Unique return value optimization
+      VirtualConstProp, ///< Virtual constant propagation
+    } TheKind = Indir;
+
+    /// Additional information for the resolution:
+    /// - UniformRetVal: the uniform return value.
+    /// - UniqueRetVal: the return value associated with the unique vtable (0 or
+    ///   1).
+    uint64_t Info = 0;
+  };
+
+  /// Resolutions for calls with all constant integer arguments (excluding the
+  /// first argument, "this"), where the key is the argument vector.
+  std::map<std::vector<uint64_t>, ByArg> ResByArg;
+};
+
 struct TypeIdSummary {
   TypeTestResolution TTRes;
+
+  /// Mapping from byte offset to whole-program devirt resolution for that
+  /// (typeid, byte offset) pair.
+  std::map<uint64_t, WholeProgramDevirtResolution> WPDRes;
 };
 
 /// 160 bits SHA1
@@ -372,6 +529,10 @@ private:
   // FIXME: Add bitcode read/write support for this field.
   std::map<std::string, TypeIdSummary> TypeIdMap;
 
+  /// Mapping from original ID to GUID. If original ID can map to multiple
+  /// GUIDs, it will be mapped to 0.
+  std::map<GlobalValue::GUID, GlobalValue::GUID> OidGuidMap;
+
   // YAML I/O support.
   friend yaml::MappingTraits<ModuleSummaryIndex>;
 
@@ -399,9 +560,17 @@ public:
     return GlobalValueMap.find(ValueGUID);
   }
 
+  /// Return the GUID for \p OriginalID in the OidGuidMap.
+  GlobalValue::GUID getGUIDFromOriginalID(GlobalValue::GUID OriginalID) const {
+    const auto I = OidGuidMap.find(OriginalID);
+    return I == OidGuidMap.end() ? 0 : I->second;
+  }
+
   /// Add a global value summary for a value of the given name.
   void addGlobalValueSummary(StringRef ValueName,
                              std::unique_ptr<GlobalValueSummary> Summary) {
+    addOriginalName(GlobalValue::getGUID(ValueName),
+                    Summary->getOriginalName());
     GlobalValueMap[GlobalValue::getGUID(ValueName)].push_back(
         std::move(Summary));
   }
@@ -409,9 +578,21 @@ public:
   /// Add a global value summary for a value of the given GUID.
   void addGlobalValueSummary(GlobalValue::GUID ValueGUID,
                              std::unique_ptr<GlobalValueSummary> Summary) {
+    addOriginalName(ValueGUID, Summary->getOriginalName());
     GlobalValueMap[ValueGUID].push_back(std::move(Summary));
   }
 
+  /// Add an original name for the value of the given GUID.
+  void addOriginalName(GlobalValue::GUID ValueGUID,
+                       GlobalValue::GUID OrigGUID) {
+    if (OrigGUID == 0 || ValueGUID == OrigGUID)
+      return;
+    if (OidGuidMap.count(OrigGUID) && OidGuidMap[OrigGUID] != ValueGUID)
+      OidGuidMap[OrigGUID] = 0;
+    else
+      OidGuidMap[OrigGUID] = ValueGUID;
+  }
+
   /// Find the summary for global \p GUID in module \p ModuleId, or nullptr if
   /// not found.
   GlobalValueSummary *findSummaryInModule(GlobalValue::GUID ValueGUID,
@@ -507,10 +688,25 @@ public:
     return ModulePathStringTable.count(M.getModuleIdentifier());
   }
 
-  TypeIdSummary &getTypeIdSummary(StringRef TypeId) {
+  const std::map<std::string, TypeIdSummary> &typeIds() const {
+    return TypeIdMap;
+  }
+
+  /// This accessor should only be used when exporting because it can mutate the
+  /// map.
+  TypeIdSummary &getOrInsertTypeIdSummary(StringRef TypeId) {
     return TypeIdMap[TypeId];
   }
 
+  /// This returns either a pointer to the type id summary (if present in the
+  /// summary map) or null (if not present). This may be used when importing.
+  const TypeIdSummary *getTypeIdSummary(StringRef TypeId) const {
+    auto I = TypeIdMap.find(TypeId);
+    if (I == TypeIdMap.end())
+      return nullptr;
+    return &I->second;
+  }
+
   /// Remove entries in the GlobalValueMap that have empty summaries due to the
   /// eager nature of map entry creation during VST parsing. These would
   /// also be suppressed during combined index generation in mergeFrom(),
diff --git a/include/llvm/IR/ModuleSummaryIndexYAML.h b/include/llvm/IR/ModuleSummaryIndexYAML.h
index e2880ec6fec8d6e0f20e5c60afe4c284de9e6198..80719c696935ec4e941a29ac62bff10f8850027e 100644
--- a/include/llvm/IR/ModuleSummaryIndexYAML.h
+++ b/include/llvm/IR/ModuleSummaryIndexYAML.h
@@ -33,20 +33,135 @@ template <> struct MappingTraits<TypeTestResolution> {
   }
 };
 
+template <>
+struct ScalarEnumerationTraits<WholeProgramDevirtResolution::ByArg::Kind> {
+  static void enumeration(IO &io,
+                          WholeProgramDevirtResolution::ByArg::Kind &value) {
+    io.enumCase(value, "Indir", WholeProgramDevirtResolution::ByArg::Indir);
+    io.enumCase(value, "UniformRetVal",
+                WholeProgramDevirtResolution::ByArg::UniformRetVal);
+    io.enumCase(value, "UniqueRetVal",
+                WholeProgramDevirtResolution::ByArg::UniqueRetVal);
+    io.enumCase(value, "VirtualConstProp",
+                WholeProgramDevirtResolution::ByArg::VirtualConstProp);
+  }
+};
+
+template <> struct MappingTraits<WholeProgramDevirtResolution::ByArg> {
+  static void mapping(IO &io, WholeProgramDevirtResolution::ByArg &res) {
+    io.mapOptional("Kind", res.TheKind);
+    io.mapOptional("Info", res.Info);
+  }
+};
+
+template <>
+struct CustomMappingTraits<
+    std::map<std::vector<uint64_t>, WholeProgramDevirtResolution::ByArg>> {
+  static void inputOne(
+      IO &io, StringRef Key,
+      std::map<std::vector<uint64_t>, WholeProgramDevirtResolution::ByArg> &V) {
+    std::vector<uint64_t> Args;
+    std::pair<StringRef, StringRef> P = {"", Key};
+    while (!P.second.empty()) {
+      P = P.second.split(',');
+      uint64_t Arg;
+      if (P.first.getAsInteger(0, Arg)) {
+        io.setError("key not an integer");
+        return;
+      }
+      Args.push_back(Arg);
+    }
+    io.mapRequired(Key.str().c_str(), V[Args]);
+  }
+  static void output(
+      IO &io,
+      std::map<std::vector<uint64_t>, WholeProgramDevirtResolution::ByArg> &V) {
+    for (auto &P : V) {
+      std::string Key;
+      for (uint64_t Arg : P.first) {
+        if (!Key.empty())
+          Key += ',';
+        Key += llvm::utostr(Arg);
+      }
+      io.mapRequired(Key.c_str(), P.second);
+    }
+  }
+};
+
+template <> struct ScalarEnumerationTraits<WholeProgramDevirtResolution::Kind> {
+  static void enumeration(IO &io, WholeProgramDevirtResolution::Kind &value) {
+    io.enumCase(value, "Indir", WholeProgramDevirtResolution::Indir);
+    io.enumCase(value, "SingleImpl", WholeProgramDevirtResolution::SingleImpl);
+  }
+};
+
+template <> struct MappingTraits<WholeProgramDevirtResolution> {
+  static void mapping(IO &io, WholeProgramDevirtResolution &res) {
+    io.mapOptional("Kind", res.TheKind);
+    io.mapOptional("SingleImplName", res.SingleImplName);
+    io.mapOptional("ResByArg", res.ResByArg);
+  }
+};
+
+template <>
+struct CustomMappingTraits<std::map<uint64_t, WholeProgramDevirtResolution>> {
+  static void inputOne(IO &io, StringRef Key,
+                       std::map<uint64_t, WholeProgramDevirtResolution> &V) {
+    uint64_t KeyInt;
+    if (Key.getAsInteger(0, KeyInt)) {
+      io.setError("key not an integer");
+      return;
+    }
+    io.mapRequired(Key.str().c_str(), V[KeyInt]);
+  }
+  static void output(IO &io, std::map<uint64_t, WholeProgramDevirtResolution> &V) {
+    for (auto &P : V)
+      io.mapRequired(llvm::utostr(P.first).c_str(), P.second);
+  }
+};
+
 template <> struct MappingTraits<TypeIdSummary> {
   static void mapping(IO &io, TypeIdSummary& summary) {
     io.mapOptional("TTRes", summary.TTRes);
+    io.mapOptional("WPDRes", summary.WPDRes);
   }
 };
 
 struct FunctionSummaryYaml {
   std::vector<uint64_t> TypeTests;
+  std::vector<FunctionSummary::VFuncId> TypeTestAssumeVCalls,
+      TypeCheckedLoadVCalls;
+  std::vector<FunctionSummary::ConstVCall> TypeTestAssumeConstVCalls,
+      TypeCheckedLoadConstVCalls;
+};
+
+} // End yaml namespace
+} // End llvm namespace
+
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint64_t)
+
+namespace llvm {
+namespace yaml {
+
+template <> struct MappingTraits<FunctionSummary::VFuncId> {
+  static void mapping(IO &io, FunctionSummary::VFuncId& id) {
+    io.mapOptional("GUID", id.GUID);
+    io.mapOptional("Offset", id.Offset);
+  }
+};
+
+template <> struct MappingTraits<FunctionSummary::ConstVCall> {
+  static void mapping(IO &io, FunctionSummary::ConstVCall& id) {
+    io.mapOptional("VFunc", id.VFunc);
+    io.mapOptional("Args", id.Args);
+  }
 };
 
 } // End yaml namespace
 } // End llvm namespace
 
-LLVM_YAML_IS_SEQUENCE_VECTOR(uint64_t)
+LLVM_YAML_IS_SEQUENCE_VECTOR(FunctionSummary::VFuncId)
+LLVM_YAML_IS_SEQUENCE_VECTOR(FunctionSummary::ConstVCall)
 
 namespace llvm {
 namespace yaml {
@@ -54,6 +169,12 @@ namespace yaml {
 template <> struct MappingTraits<FunctionSummaryYaml> {
   static void mapping(IO &io, FunctionSummaryYaml& summary) {
     io.mapOptional("TypeTests", summary.TypeTests);
+    io.mapOptional("TypeTestAssumeVCalls", summary.TypeTestAssumeVCalls);
+    io.mapOptional("TypeCheckedLoadVCalls", summary.TypeCheckedLoadVCalls);
+    io.mapOptional("TypeTestAssumeConstVCalls",
+                   summary.TypeTestAssumeConstVCalls);
+    io.mapOptional("TypeCheckedLoadConstVCalls",
+                   summary.TypeCheckedLoadConstVCalls);
   }
 };
 
@@ -82,7 +203,11 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
                                           false);
       Elem.push_back(llvm::make_unique<FunctionSummary>(
           GVFlags, 0, ArrayRef<ValueInfo>{},
-          ArrayRef<FunctionSummary::EdgeTy>{}, std::move(FSum.TypeTests)));
+          ArrayRef<FunctionSummary::EdgeTy>{}, std::move(FSum.TypeTests),
+          std::move(FSum.TypeTestAssumeVCalls),
+          std::move(FSum.TypeCheckedLoadVCalls),
+          std::move(FSum.TypeTestAssumeConstVCalls),
+          std::move(FSum.TypeCheckedLoadConstVCalls)));
     }
   }
   static void output(IO &io, GlobalValueSummaryMapTy &V) {
@@ -90,7 +215,11 @@ template <> struct CustomMappingTraits<GlobalValueSummaryMapTy> {
       std::vector<FunctionSummaryYaml> FSums;
       for (auto &Sum : P.second) {
         if (auto *FSum = dyn_cast<FunctionSummary>(Sum.get()))
-          FSums.push_back(FunctionSummaryYaml{FSum->type_tests()});
+          FSums.push_back(FunctionSummaryYaml{
+              FSum->type_tests(), FSum->type_test_assume_vcalls(),
+              FSum->type_checked_load_vcalls(),
+              FSum->type_test_assume_const_vcalls(),
+              FSum->type_checked_load_const_vcalls()});
       }
       if (!FSums.empty())
         io.mapRequired(llvm::utostr(P.first).c_str(), FSums);
diff --git a/include/llvm/IR/Operator.h b/include/llvm/IR/Operator.h
index 444ce93921f6dd196a1c5e20b526a272ddba38bf..997a85340c2591e43df0dfe58f0a947c809bb77b 100644
--- a/include/llvm/IR/Operator.h
+++ b/include/llvm/IR/Operator.h
@@ -18,8 +18,6 @@
 #include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
@@ -174,12 +172,15 @@ private:
   FastMathFlags(unsigned F) : Flags(F) { }
 
 public:
+  /// This is how the bits are used in Value::SubclassOptionalData so they
+  /// should fit there too.
   enum {
     UnsafeAlgebra   = (1 << 0),
     NoNaNs          = (1 << 1),
     NoInfs          = (1 << 2),
     NoSignedZeros   = (1 << 3),
-    AllowReciprocal = (1 << 4)
+    AllowReciprocal = (1 << 4),
+    AllowContract   = (1 << 5)
   };
 
   FastMathFlags() = default;
@@ -195,6 +196,7 @@ public:
   bool noInfs() const          { return 0 != (Flags & NoInfs); }
   bool noSignedZeros() const   { return 0 != (Flags & NoSignedZeros); }
   bool allowReciprocal() const { return 0 != (Flags & AllowReciprocal); }
+  bool allowContract() const { return 0 != (Flags & AllowContract); }
   bool unsafeAlgebra() const   { return 0 != (Flags & UnsafeAlgebra); }
 
   /// Flag setters
@@ -202,12 +204,16 @@ public:
   void setNoInfs()          { Flags |= NoInfs; }
   void setNoSignedZeros()   { Flags |= NoSignedZeros; }
   void setAllowReciprocal() { Flags |= AllowReciprocal; }
+  void setAllowContract(bool B) {
+    Flags = (Flags & ~AllowContract) | B * AllowContract;
+  }
   void setUnsafeAlgebra() {
     Flags |= UnsafeAlgebra;
     setNoNaNs();
     setNoInfs();
     setNoSignedZeros();
     setAllowReciprocal();
+    setAllowContract(true);
   }
 
   void operator&=(const FastMathFlags &OtherFlags) {
@@ -259,6 +265,12 @@ private:
       (B * FastMathFlags::AllowReciprocal);
   }
 
+  void setHasAllowContract(bool B) {
+    SubclassOptionalData =
+        (SubclassOptionalData & ~FastMathFlags::AllowContract) |
+        (B * FastMathFlags::AllowContract);
+  }
+
   /// Convenience function for setting multiple fast-math flags.
   /// FMF is a mask of the bits to set.
   void setFastMathFlags(FastMathFlags FMF) {
@@ -302,6 +314,12 @@ public:
     return (SubclassOptionalData & FastMathFlags::AllowReciprocal) != 0;
   }
 
+  /// Test whether this operation is permitted to
+  /// be floating-point contracted.
+  bool hasAllowContract() const {
+    return (SubclassOptionalData & FastMathFlags::AllowContract) != 0;
+  }
+
   /// Convenience function for getting all the fast-math flags
   FastMathFlags getFastMathFlags() const {
     return FastMathFlags(SubclassOptionalData);
diff --git a/include/llvm/IR/OptBisect.h b/include/llvm/IR/OptBisect.h
index 9eee65e93e52f5c659ec14f841a132885ac004e3..185a5ac956f54f5386c31b9d640c8e574ba03bf6 100644
--- a/include/llvm/IR/OptBisect.h
+++ b/include/llvm/IR/OptBisect.h
@@ -51,24 +51,6 @@ public:
   template <class UnitT>
   bool shouldRunPass(const Pass *P, const UnitT &U);
 
-  /// Checks the bisect limit to determine if the optimization described by the
-  /// /p Desc argument should run.
-  ///
-  /// This function will immediate return true if bisection is disabled. If the
-  /// bisect limit is set to -1, the function will print a message with the
-  /// bisect number assigned to the optimization along with the /p Desc
-  /// description and return true.  Otherwise, the function will print a message
-  /// with the bisect number assigned to the optimization and indicating whether
-  /// or not the pass will be run and return true if the bisect limit has not
-  /// yet been exceded or false if it has.
-  ///
-  /// Passes may call this function to provide more fine grained control over
-  /// individual optimizations performed by the pass.  Passes which cannot be
-  /// skipped entirely (such as non-optional code generation passes) may still
-  /// call this function to control whether or not individual optional
-  /// transformations are performed.
-  bool shouldRunCase(const Twine &Desc);
-
 private:
   bool checkPass(const StringRef PassName, const StringRef TargetDesc);
 
diff --git a/include/llvm/IR/PassManager.h b/include/llvm/IR/PassManager.h
index 61b6278ea0b96db824fb2a4912fdffeef12be0d6..c845112baa4531c588495327374d89ea1882a081 100644
--- a/include/llvm/IR/PassManager.h
+++ b/include/llvm/IR/PassManager.h
@@ -351,6 +351,8 @@ template <typename IRUnitT, typename... ExtraArgTs> class AnalysisManager;
 template <typename DerivedT> struct PassInfoMixin {
   /// Gets the name of the pass we are mixed into.
   static StringRef name() {
+    static_assert(std::is_base_of<PassInfoMixin, DerivedT>::value,
+                  "Must pass the derived type as the template argument!");
     StringRef Name = getTypeName<DerivedT>();
     if (Name.startswith("llvm::"))
       Name = Name.drop_front(strlen("llvm::"));
@@ -379,7 +381,11 @@ struct AnalysisInfoMixin : PassInfoMixin<DerivedT> {
   /// known platform with this limitation is Windows DLL builds, specifically
   /// building each part of LLVM as a DLL. If we ever remove that build
   /// configuration, this mixin can provide the static key as well.
-  static AnalysisKey *ID() { return &DerivedT::Key; }
+  static AnalysisKey *ID() {
+    static_assert(std::is_base_of<AnalysisInfoMixin, DerivedT>::value,
+                  "Must pass the derived type as the template argument!");
+    return &DerivedT::Key;
+  }
 };
 
 /// \brief Manages a sequence of passes over a particular unit of IR.
@@ -1028,7 +1034,7 @@ extern template class InnerAnalysisManagerProxy<FunctionAnalysisManager,
 template <typename AnalysisManagerT, typename IRUnitT, typename... ExtraArgTs>
 class OuterAnalysisManagerProxy
     : public AnalysisInfoMixin<
-          OuterAnalysisManagerProxy<AnalysisManagerT, IRUnitT>> {
+          OuterAnalysisManagerProxy<AnalysisManagerT, IRUnitT, ExtraArgTs...>> {
 public:
   /// \brief Result proxy object for \c OuterAnalysisManagerProxy.
   class Result {
@@ -1090,7 +1096,7 @@ public:
 
 private:
   friend AnalysisInfoMixin<
-      OuterAnalysisManagerProxy<AnalysisManagerT, IRUnitT>>;
+      OuterAnalysisManagerProxy<AnalysisManagerT, IRUnitT, ExtraArgTs...>>;
   static AnalysisKey Key;
 
   const AnalysisManagerT *AM;
diff --git a/include/llvm/IR/PassManagerInternal.h b/include/llvm/IR/PassManagerInternal.h
index 02f21675fa9d0d84a04a675c0c2fa3cb97cfecae..387dc4c65c4331760dc676ee11348fbb1fd5d15c 100644
--- a/include/llvm/IR/PassManagerInternal.h
+++ b/include/llvm/IR/PassManagerInternal.h
@@ -291,7 +291,7 @@ struct AnalysisPassModel : AnalysisPassConcept<IRUnitT, PreservedAnalysesT,
       AnalysisResultConcept<IRUnitT, PreservedAnalysesT, InvalidatorT>>
   run(IRUnitT &IR, AnalysisManager<IRUnitT, ExtraArgTs...> &AM,
       ExtraArgTs... ExtraArgs) override {
-    return make_unique<ResultModelT>(Pass.run(IR, AM, ExtraArgs...));
+    return llvm::make_unique<ResultModelT>(Pass.run(IR, AM, ExtraArgs...));
   }
 
   /// \brief The model delegates to a static \c PassT::name method.
diff --git a/include/llvm/IR/PatternMatch.h b/include/llvm/IR/PatternMatch.h
index 6e45dcfd719a7050966653b3a5ea455446fdab8f..40f9c21f646bcbaad041bed2a45ec3971c7dc294 100644
--- a/include/llvm/IR/PatternMatch.h
+++ b/include/llvm/IR/PatternMatch.h
@@ -157,6 +157,19 @@ inline match_combine_or<match_zero, match_neg_zero> m_AnyZero() {
   return m_CombineOr(m_Zero(), m_NegZero());
 }
 
+struct match_nan {
+  /// Returns true only when \p V is a ConstantFP whose APFloat value is NaN.
+  template <typename ITy> bool match(ITy *V) {
+    // dyn_cast yields null for anything that is not a ConstantFP, so the
+    // conjunction below rejects every such value without inspecting it.
+    const auto *FPConst = dyn_cast<ConstantFP>(V);
+    return FPConst && FPConst->getValueAPF().isNaN();
+  }
+};
+
+/// Match any NaN floating-point constant, whether quiet or signalling.
+inline match_nan m_NaN() { return match_nan{}; }
+
 struct apint_match {
   const APInt *&Res;
   apint_match(const APInt *&R) : Res(R) {}
@@ -814,6 +827,13 @@ inline CastClass_match<OpTy, Instruction::ZExt> m_ZExt(const OpTy &Op) {
   return CastClass_match<OpTy, Instruction::ZExt>(Op);
 }
 
+/// \brief Matches a ZExt or an SExt of \p Op by combining m_ZExt(Op) and
+/// m_SExt(Op) through m_CombineOr.
+template <typename OpTy>
+inline match_combine_or<CastClass_match<OpTy, Instruction::ZExt>,
+                        CastClass_match<OpTy, Instruction::SExt>>
+m_ZExtOrSExt(const OpTy &Op) { return m_CombineOr(m_ZExt(Op), m_SExt(Op)); }
+
 /// \brief Matches UIToFP.
 template <typename OpTy>
 inline CastClass_match<OpTy, Instruction::UIToFP> m_UIToFP(const OpTy &Op) {
@@ -1328,6 +1348,14 @@ template <typename Val_t> inline Signum_match<Val_t> m_Signum(const Val_t &V) {
 // Matchers for two-operands operators with the operators in either order
 //
 
+/// \brief Matches a BinaryOperator, accepting (L, R) or the swapped (R, L).
+template <typename LHS, typename RHS>
+inline match_combine_or<AnyBinaryOp_match<LHS, RHS>,
+                        AnyBinaryOp_match<RHS, LHS>>
+m_c_BinOp(const LHS &L, const RHS &R) {
+  return m_CombineOr(m_BinOp(L, R), m_BinOp(R, L));
+}
+
 /// \brief Matches an ICmp with a predicate over LHS and RHS in either order.
 /// Does not swap the predicate.
 template<typename LHS, typename RHS>
@@ -1337,6 +1365,22 @@ m_c_ICmp(ICmpInst::Predicate &Pred, const LHS &L, const RHS &R) {
   return m_CombineOr(m_ICmp(Pred, L, R), m_ICmp(Pred, R, L));
 }
 
+/// \brief Matches an Add whose operands match \p L and \p R in either order.
+template <typename LHS, typename RHS>
+inline match_combine_or<BinaryOp_match<LHS, RHS, Instruction::Add>,
+                        BinaryOp_match<RHS, LHS, Instruction::Add>>
+m_c_Add(const LHS &L, const RHS &R) {
+  return m_CombineOr(m_Add(L, R), m_Add(R, L));
+}
+
+/// \brief Matches a Mul whose operands match \p L and \p R in either order.
+template <typename LHS, typename RHS>
+inline match_combine_or<BinaryOp_match<LHS, RHS, Instruction::Mul>,
+                        BinaryOp_match<RHS, LHS, Instruction::Mul>>
+m_c_Mul(const LHS &L, const RHS &R) {
+  return m_CombineOr(m_Mul(L, R), m_Mul(R, L));
+}
+
 /// \brief Matches an And with LHS and RHS in either order.
 template<typename LHS, typename RHS>
 inline match_combine_or<BinaryOp_match<LHS, RHS, Instruction::And>,
diff --git a/include/llvm/IR/PredIteratorCache.h b/include/llvm/IR/PredIteratorCache.h
index 118310aed1d0665893bdbafd297b6a85b1c5e235..81f535311431acec72e18406634d3edafc7cd848 100644
--- a/include/llvm/IR/PredIteratorCache.h
+++ b/include/llvm/IR/PredIteratorCache.h
@@ -27,8 +27,8 @@ namespace llvm {
 /// wants the predecessor list for the same blocks.
 class PredIteratorCache {
   /// BlockToPredsMap - Pointer to null-terminated list.
-  DenseMap<BasicBlock *, BasicBlock **> BlockToPredsMap;
-  DenseMap<BasicBlock *, unsigned> BlockToPredCountMap;
+  mutable DenseMap<BasicBlock *, BasicBlock **> BlockToPredsMap;
+  mutable DenseMap<BasicBlock *, unsigned> BlockToPredCountMap;
 
   /// Memory - This is the space that holds cached preds.
   BumpPtrAllocator Memory;
@@ -55,13 +55,15 @@ private:
     return Entry;
   }
 
-  unsigned GetNumPreds(BasicBlock *BB) {
-    GetPreds(BB);
-    return BlockToPredCountMap[BB];
+  unsigned GetNumPreds(BasicBlock *BB) const {
+    // Return the memoized count if present; otherwise count once and store it.
+    auto Cached = BlockToPredCountMap.find(BB);
+    return Cached != BlockToPredCountMap.end() ? Cached->second
+      : (BlockToPredCountMap[BB] = std::distance(pred_begin(BB), pred_end(BB)));
   }
 
 public:
-  size_t size(BasicBlock *BB) { return GetNumPreds(BB); }
+  size_t size(BasicBlock *BB) const { return GetNumPreds(BB); }
   ArrayRef<BasicBlock *> get(BasicBlock *BB) {
     return makeArrayRef(GetPreds(BB), GetNumPreds(BB));
   }
diff --git a/include/llvm/IR/Statepoint.h b/include/llvm/IR/Statepoint.h
index 916faa4b327e3c2d0b2202d28da6960f6298d786..03151cd7c8f70890d66d19ee99c2f332f794674a 100644
--- a/include/llvm/IR/Statepoint.h
+++ b/include/llvm/IR/Statepoint.h
@@ -454,7 +454,7 @@ struct StatepointDirectives {
 
 /// Parse out statepoint directives from the function attributes present in \p
 /// AS.
-StatepointDirectives parseStatepointDirectivesFromAttrs(AttributeSet AS);
+StatepointDirectives parseStatepointDirectivesFromAttrs(AttributeList AS);
 
 /// Return \c true if the the \p Attr is an attribute that is a statepoint
 /// directive.
diff --git a/include/llvm/IR/SymbolTableListTraits.h b/include/llvm/IR/SymbolTableListTraits.h
index 5c6d58affd7a0a184216e392181e9013e5175240..49a5fb21297d6577e8fece4c1fbbe84b89f98ab7 100644
--- a/include/llvm/IR/SymbolTableListTraits.h
+++ b/include/llvm/IR/SymbolTableListTraits.h
@@ -1,4 +1,4 @@
-//===-- llvm/SymbolTableListTraits.h - Traits for iplist --------*- C++ -*-===//
+//===- llvm/SymbolTableListTraits.h - Traits for iplist ---------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -26,23 +26,27 @@
 #define LLVM_IR_SYMBOLTABLELISTTRAITS_H
 
 #include "llvm/ADT/ilist.h"
+#include "llvm/ADT/simple_ilist.h"
+#include <cstddef>
 
 namespace llvm {
-class ValueSymbolTable;
 
-/// Template metafunction to get the parent type for a symbol table list.
-///
-/// Implementations create a typedef called \c type so that we only need a
-/// single template parameter for the list and traits.
-template <typename NodeTy> struct SymbolTableListParentType {};
 class Argument;
 class BasicBlock;
 class Function;
-class Instruction;
-class GlobalVariable;
 class GlobalAlias;
 class GlobalIFunc;
+class GlobalVariable;
+class Instruction;
 class Module;
+class ValueSymbolTable;
+
+/// Template metafunction to get the parent type for a symbol table list.
+///
+/// Implementations create a typedef called \c type so that we only need a
+/// single template parameter for the list and traits.
+template <typename NodeTy> struct SymbolTableListParentType {};
+
 #define DEFINE_SYMBOL_TABLE_PARENT_TYPE(NODE, PARENT)                          \
   template <> struct SymbolTableListParentType<NODE> { typedef PARENT type; };
 DEFINE_SYMBOL_TABLE_PARENT_TYPE(Instruction, BasicBlock)
@@ -67,7 +71,7 @@ class SymbolTableListTraits : public ilist_alloc_traits<ValueSubClass> {
       typename SymbolTableListParentType<ValueSubClass>::type ItemParentClass;
 
 public:
-  SymbolTableListTraits() {}
+  SymbolTableListTraits() = default;
 
 private:
   /// getListOwner - Return the object that owns this list.  If this is a list
@@ -109,6 +113,6 @@ template <class T>
 class SymbolTableList
     : public iplist_impl<simple_ilist<T>, SymbolTableListTraits<T>> {};
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_SYMBOLTABLELISTTRAITS_H
diff --git a/include/llvm/IR/TrackingMDRef.h b/include/llvm/IR/TrackingMDRef.h
index fe513a8f9795155ce36673fc91a9bdb6959b5ea1..12b196432006d737cce4cbc7239478c62eee14a2 100644
--- a/include/llvm/IR/TrackingMDRef.h
+++ b/include/llvm/IR/TrackingMDRef.h
@@ -15,6 +15,8 @@
 #define LLVM_IR_TRACKINGMDREF_H
 
 #include "llvm/IR/Metadata.h"
+#include <algorithm>
+#include <cassert>
 
 namespace llvm {
 
@@ -22,14 +24,15 @@ namespace llvm {
 ///
 /// This class behaves like \a TrackingVH, but for metadata.
 class TrackingMDRef {
-  Metadata *MD;
+  Metadata *MD = nullptr;
 
 public:
-  TrackingMDRef() : MD(nullptr) {}
+  TrackingMDRef() = default;
   explicit TrackingMDRef(Metadata *MD) : MD(MD) { track(); }
 
   TrackingMDRef(TrackingMDRef &&X) : MD(X.MD) { retrack(X); }
   TrackingMDRef(const TrackingMDRef &X) : MD(X.MD) { track(); }
+
   TrackingMDRef &operator=(TrackingMDRef &&X) {
     if (&X == this)
       return *this;
@@ -39,6 +42,7 @@ public:
     retrack(X);
     return *this;
   }
+
   TrackingMDRef &operator=(const TrackingMDRef &X) {
     if (&X == this)
       return *this;
@@ -48,6 +52,7 @@ public:
     track();
     return *this;
   }
+
   ~TrackingMDRef() { untrack(); }
 
   Metadata *get() const { return MD; }
@@ -80,10 +85,12 @@ private:
     if (MD)
       MetadataTracking::track(MD);
   }
+
   void untrack() {
     if (MD)
       MetadataTracking::untrack(MD);
   }
+
   void retrack(TrackingMDRef &X) {
     assert(MD == X.MD && "Expected values to match");
     if (X.MD) {
@@ -101,15 +108,17 @@ template <class T> class TypedTrackingMDRef {
   TrackingMDRef Ref;
 
 public:
-  TypedTrackingMDRef() {}
+  TypedTrackingMDRef() = default;
   explicit TypedTrackingMDRef(T *MD) : Ref(static_cast<Metadata *>(MD)) {}
 
   TypedTrackingMDRef(TypedTrackingMDRef &&X) : Ref(std::move(X.Ref)) {}
   TypedTrackingMDRef(const TypedTrackingMDRef &X) : Ref(X.Ref) {}
+
   TypedTrackingMDRef &operator=(TypedTrackingMDRef &&X) {
     Ref = std::move(X.Ref);
     return *this;
   }
+
   TypedTrackingMDRef &operator=(const TypedTrackingMDRef &X) {
     Ref = X.Ref;
     return *this;
@@ -162,4 +171,4 @@ template <class T> struct simplify_type<const TypedTrackingMDRef<T>> {
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_IR_TRACKINGMDREF_H
diff --git a/include/llvm/IR/Type.h b/include/llvm/IR/Type.h
index 778ee06169f1717ba0f52d7000d9aeed7be9aed2..e6a0df937e9bb67f49c6dc783c405bbf88899896 100644
--- a/include/llvm/IR/Type.h
+++ b/include/llvm/IR/Type.h
@@ -290,7 +290,11 @@ public:
 
   /// If this is a vector type, return the element type, otherwise return
   /// 'this'.
-  Type *getScalarType() const LLVM_READONLY;
+  Type *getScalarType() const {
+    // A vector yields its element type; any other type yields itself.
+    return isVectorTy() ? getVectorElementType()
+                        : const_cast<Type *>(this);
+  }
 
   //===--------------------------------------------------------------------===//
   // Type Iteration support.
@@ -423,7 +427,7 @@ private:
 };
 
 // Printing of types.
-static inline raw_ostream &operator<<(raw_ostream &OS, Type &T) {
+static inline raw_ostream &operator<<(raw_ostream &OS, const Type &T) {
   T.print(OS);
   return OS;
 }
diff --git a/include/llvm/IR/TypeFinder.h b/include/llvm/IR/TypeFinder.h
index 046f85caec9d4daae2974b0d4333d6a1c1d9f840..48c4f1161aa1d39a0ecf72722afeb3cb33d3ec4a 100644
--- a/include/llvm/IR/TypeFinder.h
+++ b/include/llvm/IR/TypeFinder.h
@@ -1,4 +1,4 @@
-//===-- llvm/IR/TypeFinder.h - Class to find used struct types --*- C++ -*-===//
+//===- llvm/IR/TypeFinder.h - Class to find used struct types ---*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,8 +15,7 @@
 #define LLVM_IR_TYPEFINDER_H
 
 #include "llvm/ADT/DenseSet.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Type.h"
+#include <cstddef>
 #include <vector>
 
 namespace llvm {
@@ -24,6 +23,7 @@ namespace llvm {
 class MDNode;
 class Module;
 class StructType;
+class Type;
 class Value;
 
 /// TypeFinder - Walk over a module, identifying all of the types that are
@@ -36,10 +36,10 @@ class TypeFinder {
   DenseSet<Type*> VisitedTypes;
 
   std::vector<StructType*> StructTypes;
-  bool OnlyNamed;
+  bool OnlyNamed = false;
 
 public:
-  TypeFinder() : OnlyNamed(false) {}
+  TypeFinder() = default;
 
   void run(const Module &M, bool onlyNamed);
   void clear();
@@ -77,6 +77,6 @@ private:
   void incorporateMDNode(const MDNode *V);
 };
 
-} // end llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_TYPEFINDER_H
diff --git a/include/llvm/IR/Use.h b/include/llvm/IR/Use.h
index ff6b2e1f1e2287e6b599e6b3e26b74eb6b402414..05b68ccbb38e801087e6ead57c6015de2bcdc8eb 100644
--- a/include/llvm/IR/Use.h
+++ b/include/llvm/IR/Use.h
@@ -85,7 +85,7 @@ public:
   ///
   /// For an instruction operand, for example, this will return the
   /// instruction.
-  User *getUser() const;
+  User *getUser() const LLVM_READONLY;
 
   inline void set(Value *Val);
 
@@ -111,7 +111,7 @@ public:
   static void zap(Use *Start, const Use *Stop, bool del = false);
 
 private:
-  const Use *getImpliedUser() const;
+  const Use *getImpliedUser() const LLVM_READONLY;
 
   Value *Val;
   Use *Next;
diff --git a/include/llvm/IR/UseListOrder.h b/include/llvm/IR/UseListOrder.h
index efff208295b6aefe4bdd7d4e62c17beb3f24837e..ebe99223facd0b28aa0fe2e345aba63ad843e034 100644
--- a/include/llvm/IR/UseListOrder.h
+++ b/include/llvm/IR/UseListOrder.h
@@ -20,20 +20,19 @@
 
 namespace llvm {
 
-class Module;
 class Function;
 class Value;
 
 /// \brief Structure to hold a use-list order.
 struct UseListOrder {
-  const Value *V;
-  const Function *F;
+  const Value *V = nullptr;
+  const Function *F = nullptr;
   std::vector<unsigned> Shuffle;
 
   UseListOrder(const Value *V, const Function *F, size_t ShuffleSize)
       : V(V), F(F), Shuffle(ShuffleSize) {}
 
-  UseListOrder() : V(nullptr), F(nullptr) {}
+  UseListOrder() = default;
   UseListOrder(UseListOrder &&) = default;
   UseListOrder &operator=(UseListOrder &&) = default;
 };
diff --git a/include/llvm/IR/User.h b/include/llvm/IR/User.h
index c907d6b670b55bffa9a443f0c3722b757fb8a5a0..54758a9b6d6a8bed2626bd8cccec7ec60ec3d9fa 100644
--- a/include/llvm/IR/User.h
+++ b/include/llvm/IR/User.h
@@ -122,8 +122,16 @@ protected:
   }
 
 private:
+  const Use *getHungOffOperands() const {
+    return *(reinterpret_cast<const Use *const *>(this) - 1);
+  }
+
   Use *&getHungOffOperands() { return *(reinterpret_cast<Use **>(this) - 1); }
 
+  const Use *getIntrusiveOperands() const {
+    return reinterpret_cast<const Use *>(this) - NumUserOperands;
+  }
+
   Use *getIntrusiveOperands() {
     return reinterpret_cast<Use *>(this) - NumUserOperands;
   }
@@ -135,11 +143,11 @@ private:
   }
 
 public:
-  Use *getOperandList() {
+  const Use *getOperandList() const {
     return HasHungOffUses ? getHungOffOperands() : getIntrusiveOperands();
   }
-  const Use *getOperandList() const {
-    return const_cast<User *>(this)->getOperandList();
+  Use *getOperandList() {
+    return const_cast<Use *>(static_cast<const User *>(this)->getOperandList());
   }
 
   Value *getOperand(unsigned i) const {
diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h
index 9c91f4e07e330fa2597e738c32e3b130f48f7578..a4b48d7f3539b1c186afad9ad6010fbe4f465860 100644
--- a/include/llvm/IR/Value.h
+++ b/include/llvm/IR/Value.h
@@ -476,27 +476,30 @@ public:
   ///
   /// Returns the original uncasted value.  If this is called on a non-pointer
   /// value, it returns 'this'.
-  Value *stripPointerCasts();
-  const Value *stripPointerCasts() const {
-    return const_cast<Value*>(this)->stripPointerCasts();
+  const Value *stripPointerCasts() const;
+  Value *stripPointerCasts() {
+    return const_cast<Value *>(
+                         static_cast<const Value *>(this)->stripPointerCasts());
   }
 
   /// \brief Strip off pointer casts and all-zero GEPs.
   ///
   /// Returns the original uncasted value.  If this is called on a non-pointer
   /// value, it returns 'this'.
-  Value *stripPointerCastsNoFollowAliases();
-  const Value *stripPointerCastsNoFollowAliases() const {
-    return const_cast<Value*>(this)->stripPointerCastsNoFollowAliases();
+  const Value *stripPointerCastsNoFollowAliases() const;
+  Value *stripPointerCastsNoFollowAliases() {
+    return const_cast<Value *>(
+          static_cast<const Value *>(this)->stripPointerCastsNoFollowAliases());
   }
 
   /// \brief Strip off pointer casts and all-constant inbounds GEPs.
   ///
   /// Returns the original pointer value.  If this is called on a non-pointer
   /// value, it returns 'this'.
-  Value *stripInBoundsConstantOffsets();
-  const Value *stripInBoundsConstantOffsets() const {
-    return const_cast<Value*>(this)->stripInBoundsConstantOffsets();
+  const Value *stripInBoundsConstantOffsets() const;
+  Value *stripInBoundsConstantOffsets() {
+    return const_cast<Value *>(
+              static_cast<const Value *>(this)->stripInBoundsConstantOffsets());
   }
 
   /// \brief Accumulate offsets from \a stripInBoundsConstantOffsets().
@@ -506,21 +509,22 @@ public:
   /// correct bitwidth for an offset of this pointer type.
   ///
   /// If this is called on a non-pointer value, it returns 'this'.
-  Value *stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
-                                                   APInt &Offset);
   const Value *stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
-                                                         APInt &Offset) const {
-    return const_cast<Value *>(this)
-        ->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
+                                                         APInt &Offset) const;
+  Value *stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
+                                                   APInt &Offset) {
+    return const_cast<Value *>(static_cast<const Value *>(this)
+        ->stripAndAccumulateInBoundsConstantOffsets(DL, Offset));
   }
 
   /// \brief Strip off pointer casts and inbounds GEPs.
   ///
   /// Returns the original pointer value.  If this is called on a non-pointer
   /// value, it returns 'this'.
-  Value *stripInBoundsOffsets();
-  const Value *stripInBoundsOffsets() const {
-    return const_cast<Value*>(this)->stripInBoundsOffsets();
+  const Value *stripInBoundsOffsets() const;
+  Value *stripInBoundsOffsets() {
+    return const_cast<Value *>(
+                      static_cast<const Value *>(this)->stripInBoundsOffsets());
   }
 
   /// \brief Returns the number of bytes known to be dereferenceable for the
@@ -543,11 +547,12 @@ public:
   /// the PHI node corresponding to PredBB.  If not, return ourself.  This is
   /// useful if you want to know the value something has in a predecessor
   /// block.
-  Value *DoPHITranslation(const BasicBlock *CurBB, const BasicBlock *PredBB);
-
   const Value *DoPHITranslation(const BasicBlock *CurBB,
-                                const BasicBlock *PredBB) const{
-    return const_cast<Value*>(this)->DoPHITranslation(CurBB, PredBB);
+                                const BasicBlock *PredBB) const;
+
+  Value *DoPHITranslation(const BasicBlock *CurBB, const BasicBlock *PredBB) {
+    return const_cast<Value *>(
+             static_cast<const Value *>(this)->DoPHITranslation(CurBB, PredBB));
   }
 
   /// \brief The maximum alignment for instructions.
diff --git a/include/llvm/IR/ValueSymbolTable.h b/include/llvm/IR/ValueSymbolTable.h
index 61a12db403ea7c08ccf4b9361ae889f6bb232ee8..9e86751dae6f7652593811bb0d5f2b07aaeddacc 100644
--- a/include/llvm/IR/ValueSymbolTable.h
+++ b/include/llvm/IR/ValueSymbolTable.h
@@ -1,4 +1,4 @@
-//===-- llvm/ValueSymbolTable.h - Implement a Value Symtab ------*- C++ -*-===//
+//===- llvm/ValueSymbolTable.h - Implement a Value Symtab -------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,31 +15,36 @@
 #define LLVM_IR_VALUESYMBOLTABLE_H
 
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/IR/Value.h"
-#include "llvm/Support/DataTypes.h"
+#include <cstdint>
 
 namespace llvm {
-  template <typename ValueSubClass> class SymbolTableListTraits;
-  template <unsigned InternalLen> class SmallString;
-  class BasicBlock;
-  class Function;
-  class NamedMDNode;
-  class Module;
-  class StringRef;
+
+class Argument;
+class BasicBlock;
+class Function;
+class GlobalAlias;
+class GlobalIFunc;
+class GlobalVariable;
+class Instruction;
+template <unsigned InternalLen> class SmallString;
+template <typename ValueSubClass> class SymbolTableListTraits;
 
 /// This class provides a symbol table of name/value pairs. It is essentially
 /// a std::map<std::string,Value*> but has a controlled interface provided by
 /// LLVM as well as ensuring uniqueness of names.
 ///
 class ValueSymbolTable {
-  friend class Value;
   friend class SymbolTableListTraits<Argument>;
   friend class SymbolTableListTraits<BasicBlock>;
-  friend class SymbolTableListTraits<Instruction>;
   friend class SymbolTableListTraits<Function>;
-  friend class SymbolTableListTraits<GlobalVariable>;
   friend class SymbolTableListTraits<GlobalAlias>;
   friend class SymbolTableListTraits<GlobalIFunc>;
+  friend class SymbolTableListTraits<GlobalVariable>;
+  friend class SymbolTableListTraits<Instruction>;
+  friend class Value;
+
 /// @name Types
 /// @{
 public:
@@ -55,14 +60,14 @@ public:
 /// @}
 /// @name Constructors
 /// @{
-public:
-  ValueSymbolTable() : vmap(0), LastUnique(0) {}
+
+  ValueSymbolTable() : vmap(0) {}
   ~ValueSymbolTable();
 
 /// @}
 /// @name Accessors
 /// @{
-public:
+
   /// This method finds the value with the given \p Name in the
   /// the symbol table.
   /// @returns the value associated with the \p Name
@@ -84,7 +89,7 @@ public:
 /// @}
 /// @name Iteration
 /// @{
-public:
+
   /// @brief Get an iterator that from the beginning of the symbol table.
   inline iterator begin() { return vmap.begin(); }
 
@@ -122,13 +127,13 @@ private:
   /// @}
   /// @name Internal Data
   /// @{
-private:
+
   ValueMap vmap;                    ///< The map that holds the symbol table.
-  mutable uint32_t LastUnique; ///< Counter for tracking unique names
+  mutable uint32_t LastUnique = 0;  ///< Counter for tracking unique names
 
 /// @}
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_IR_VALUESYMBOLTABLE_H
diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h
index dd355012cb927d1ff42e2c4f4931456df8d273f4..15c8ff6d04def344cee7b792ec89455d05da5a8c 100644
--- a/include/llvm/InitializePasses.h
+++ b/include/llvm/InitializePasses.h
@@ -53,13 +53,13 @@ void initializeCoroutines(PassRegistry&);
 void initializeCodeGen(PassRegistry&);
 
 /// Initialize all passes linked into the GlobalISel library.
-void initializeGlobalISel(PassRegistry &Registry);
+void initializeGlobalISel(PassRegistry&);
 
 /// Initialize all passes linked into the CodeGen library.
 void initializeTarget(PassRegistry&);
 
 void initializeAAEvalLegacyPassPass(PassRegistry&);
-void initializeAAResultsWrapperPassPass(PassRegistry &);
+void initializeAAResultsWrapperPassPass(PassRegistry&);
 void initializeADCELegacyPassPass(PassRegistry&);
 void initializeAddDiscriminatorsLegacyPassPass(PassRegistry&);
 void initializeAddressSanitizerModulePass(PassRegistry&);
@@ -68,37 +68,39 @@ void initializeAliasSetPrinterPass(PassRegistry&);
 void initializeAlignmentFromAssumptionsPass(PassRegistry&);
 void initializeAlwaysInlinerLegacyPassPass(PassRegistry&);
 void initializeArgPromotionPass(PassRegistry&);
-void initializeAssumptionCacheTrackerPass(PassRegistry &);
+void initializeAssumptionCacheTrackerPass(PassRegistry&);
 void initializeAtomicExpandPass(PassRegistry&);
 void initializeBBVectorizePass(PassRegistry&);
-void initializeBDCELegacyPassPass(PassRegistry &);
+void initializeBDCELegacyPassPass(PassRegistry&);
 void initializeBarrierNoopPass(PassRegistry&);
 void initializeBasicAAWrapperPassPass(PassRegistry&);
 void initializeBlockExtractorPassPass(PassRegistry&);
 void initializeBlockFrequencyInfoWrapperPassPass(PassRegistry&);
 void initializeBoundsCheckingPass(PassRegistry&);
+void initializeBranchCoalescingPass(PassRegistry&);
 void initializeBranchFolderPassPass(PassRegistry&);
 void initializeBranchProbabilityInfoWrapperPassPass(PassRegistry&);
 void initializeBranchRelaxationPass(PassRegistry&);
 void initializeBreakCriticalEdgesPass(PassRegistry&);
+void initializeCFGOnlyPrinterLegacyPassPass(PassRegistry&);
 void initializeCFGOnlyViewerLegacyPassPass(PassRegistry&);
 void initializeCFGPrinterLegacyPassPass(PassRegistry&);
-void initializeCFGOnlyPrinterLegacyPassPass(PassRegistry&);
 void initializeCFGSimplifyPassPass(PassRegistry&);
+void initializeLateCFGSimplifyPassPass(PassRegistry&);
 void initializeCFGViewerLegacyPassPass(PassRegistry&);
 void initializeCFLAndersAAWrapperPassPass(PassRegistry&);
 void initializeCFLSteensAAWrapperPassPass(PassRegistry&);
 void initializeCallGraphDOTPrinterPass(PassRegistry&);
 void initializeCallGraphPrinterLegacyPassPass(PassRegistry&);
 void initializeCallGraphViewerPass(PassRegistry&);
-void initializeCallGraphWrapperPassPass(PassRegistry &);
+void initializeCallGraphWrapperPassPass(PassRegistry&);
 void initializeCodeGenPreparePass(PassRegistry&);
-void initializeCountingFunctionInserterPass(PassRegistry&);
 void initializeConstantHoistingLegacyPassPass(PassRegistry&);
-void initializeConstantMergeLegacyPassPass(PassRegistry &);
+void initializeConstantMergeLegacyPassPass(PassRegistry&);
 void initializeConstantPropagationPass(PassRegistry&);
 void initializeCorrelatedValuePropagationPass(PassRegistry&);
 void initializeCostModelAnalysisPass(PassRegistry&);
+void initializeCountingFunctionInserterPass(PassRegistry&);
 void initializeCrossDSOCFIPass(PassRegistry&);
 void initializeDAEPass(PassRegistry&);
 void initializeDAHPass(PassRegistry&);
@@ -107,7 +109,7 @@ void initializeDSELegacyPassPass(PassRegistry&);
 void initializeDataFlowSanitizerPass(PassRegistry&);
 void initializeDeadInstEliminationPass(PassRegistry&);
 void initializeDeadMachineInstructionElimPass(PassRegistry&);
-void initializeDelinearizationPass(PassRegistry &);
+void initializeDelinearizationPass(PassRegistry&);
 void initializeDemandedBitsWrapperPassPass(PassRegistry&);
 void initializeDependenceAnalysisPass(PassRegistry&);
 void initializeDependenceAnalysisWrapperPassPass(PassRegistry&);
@@ -120,27 +122,27 @@ void initializeDomViewerPass(PassRegistry&);
 void initializeDominanceFrontierWrapperPassPass(PassRegistry&);
 void initializeDominatorTreeWrapperPassPass(PassRegistry&);
 void initializeDwarfEHPreparePass(PassRegistry&);
-void initializeEarlyCSELegacyPassPass(PassRegistry &);
-void initializeEarlyCSEMemSSALegacyPassPass(PassRegistry &);
+void initializeEarlyCSELegacyPassPass(PassRegistry&);
+void initializeEarlyCSEMemSSALegacyPassPass(PassRegistry&);
 void initializeEarlyIfConverterPass(PassRegistry&);
 void initializeEdgeBundlesPass(PassRegistry&);
 void initializeEfficiencySanitizerPass(PassRegistry&);
-void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry &);
-void initializeRAGreedyPass(PassRegistry&);
-void initializeGVNHoistLegacyPassPass(PassRegistry &);
+void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&);
 void initializeExpandISelPseudosPass(PassRegistry&);
 void initializeExpandPostRAPass(PassRegistry&);
 void initializeExternalAAWrapperPassPass(PassRegistry&);
+void initializeFEntryInserterPass(PassRegistry&);
 void initializeFinalizeMachineBundlesPass(PassRegistry&);
 void initializeFlattenCFGPassPass(PassRegistry&);
 void initializeFloat2IntLegacyPassPass(PassRegistry&);
 void initializeForceFunctionAttrsLegacyPassPass(PassRegistry&);
 void initializeForwardControlFlowIntegrityPass(PassRegistry&);
-void initializeFuncletLayoutPass(PassRegistry &);
-void initializeFunctionImportLegacyPassPass(PassRegistry &);
+void initializeFuncletLayoutPass(PassRegistry&);
+void initializeFunctionImportLegacyPassPass(PassRegistry&);
 void initializeGCMachineCodeAnalysisPass(PassRegistry&);
 void initializeGCModuleInfoPass(PassRegistry&);
 void initializeGCOVProfilerLegacyPassPass(PassRegistry&);
+void initializeGVNHoistLegacyPassPass(PassRegistry&);
 void initializeGVNLegacyPassPass(PassRegistry&);
 void initializeGlobalDCELegacyPassPass(PassRegistry&);
 void initializeGlobalMergePass(PassRegistry&);
@@ -149,8 +151,8 @@ void initializeGlobalSplitPass(PassRegistry&);
 void initializeGlobalsAAWrapperPassPass(PassRegistry&);
 void initializeGuardWideningLegacyPassPass(PassRegistry&);
 void initializeIPCPPass(PassRegistry&);
-void initializeIPSCCPLegacyPassPass(PassRegistry &);
-void initializeIRTranslatorPass(PassRegistry &);
+void initializeIPSCCPLegacyPassPass(PassRegistry&);
+void initializeIRTranslatorPass(PassRegistry&);
 void initializeIVUsersWrapperPassPass(PassRegistry&);
 void initializeIfConverterPass(PassRegistry&);
 void initializeImplicitNullChecksPass(PassRegistry&);
@@ -162,20 +164,22 @@ void initializeInlineCostAnalysisPass(PassRegistry&);
 void initializeInstCountPass(PassRegistry&);
 void initializeInstNamerPass(PassRegistry&);
 void initializeInstSimplifierPass(PassRegistry&);
-void initializeInstrProfilingLegacyPassPass(PassRegistry &);
+void initializeInstrProfilingLegacyPassPass(PassRegistry&);
 void initializeInstructionCombiningPassPass(PassRegistry&);
-void initializeInstructionSelectPass(PassRegistry &);
-void initializeInterleavedAccessPass(PassRegistry &);
+void initializeInstructionSelectPass(PassRegistry&);
+void initializeInterleavedAccessPass(PassRegistry&);
 void initializeInternalizeLegacyPassPass(PassRegistry&);
 void initializeIntervalPartitionPass(PassRegistry&);
 void initializeJumpThreadingPass(PassRegistry&);
-void initializeLCSSAWrapperPassPass(PassRegistry&);
 void initializeLCSSAVerificationPassPass(PassRegistry&);
-void initializeLegacyLICMPassPass(PassRegistry&);
-void initializeLegacyLoopSinkPassPass(PassRegistry&);
-void initializeLazyBranchProbabilityInfoPassPass(PassRegistry&);
+void initializeLCSSAWrapperPassPass(PassRegistry&);
 void initializeLazyBlockFrequencyInfoPassPass(PassRegistry&);
+void initializeLazyBranchProbabilityInfoPassPass(PassRegistry&);
+void initializeLazyMachineBlockFrequencyInfoPassPass(PassRegistry&);
 void initializeLazyValueInfoWrapperPassPass(PassRegistry&);
+void initializeLegacyLICMPassPass(PassRegistry&);
+void initializeLegacyLoopSinkPassPass(PassRegistry&);
+void initializeLazyValueInfoPrinterPass(PassRegistry&);
 void initializeLegalizerPass(PassRegistry&);
 void initializeLibCallsShrinkWrapLegacyPassPass(PassRegistry&);
 void initializeLintPass(PassRegistry&);
@@ -186,18 +190,18 @@ void initializeLiveRegMatrixPass(PassRegistry&);
 void initializeLiveStacksPass(PassRegistry&);
 void initializeLiveVariablesPass(PassRegistry&);
 void initializeLoadCombinePass(PassRegistry&);
-void initializeLoaderPassPass(PassRegistry&);
 void initializeLoadStoreVectorizerPass(PassRegistry&);
+void initializeLoaderPassPass(PassRegistry&);
 void initializeLocalStackSlotPassPass(PassRegistry&);
 void initializeLoopAccessLegacyAnalysisPass(PassRegistry&);
-void initializeLoopDataPrefetchLegacyPassPass(PassRegistry &);
+void initializeLoopDataPrefetchLegacyPassPass(PassRegistry&);
 void initializeLoopDeletionLegacyPassPass(PassRegistry&);
 void initializeLoopDistributeLegacyPass(PassRegistry&);
 void initializeLoopExtractorPass(PassRegistry&);
 void initializeLoopIdiomRecognizeLegacyPassPass(PassRegistry&);
 void initializeLoopInfoWrapperPassPass(PassRegistry&);
 void initializeLoopInstSimplifyLegacyPassPass(PassRegistry&);
-void initializeLoopInterchangePass(PassRegistry &);
+void initializeLoopInterchangePass(PassRegistry&);
 void initializeLoopLoadEliminationPass(PassRegistry&);
 void initializeLoopPassPass(PassRegistry&);
 void initializeLoopPredicationLegacyPassPass(PassRegistry&);
@@ -210,8 +214,8 @@ void initializeLoopUnrollPass(PassRegistry&);
 void initializeLoopUnswitchPass(PassRegistry&);
 void initializeLoopVectorizePass(PassRegistry&);
 void initializeLoopVersioningLICMPass(PassRegistry&);
-void initializeLoopVersioningPassPass(PassRegistry &);
-void initializeLowerAtomicLegacyPassPass(PassRegistry &);
+void initializeLoopVersioningPassPass(PassRegistry&);
+void initializeLowerAtomicLegacyPassPass(PassRegistry&);
 void initializeLowerEmuTLSPass(PassRegistry&);
 void initializeLowerExpectIntrinsicPass(PassRegistry&);
 void initializeLowerGuardIntrinsicLegacyPassPass(PassRegistry&);
@@ -225,7 +229,7 @@ void initializeMachineBlockPlacementPass(PassRegistry&);
 void initializeMachineBlockPlacementStatsPass(PassRegistry&);
 void initializeMachineBranchProbabilityInfoPass(PassRegistry&);
 void initializeMachineCSEPass(PassRegistry&);
-void initializeMachineCombinerPass(PassRegistry &);
+void initializeMachineCombinerPass(PassRegistry&);
 void initializeMachineCopyPropagationPass(PassRegistry&);
 void initializeMachineDominanceFrontierPass(PassRegistry&);
 void initializeMachineDominatorTreePass(PassRegistry&);
@@ -234,6 +238,7 @@ void initializeMachineLICMPass(PassRegistry&);
 void initializeMachineLoopInfoPass(PassRegistry&);
 void initializeMachineModuleInfoPass(PassRegistry&);
 void initializeMachineOptimizationRemarkEmitterPassPass(PassRegistry&);
+void initializeMachineOutlinerPass(PassRegistry&);
 void initializeMachinePipelinerPass(PassRegistry&);
 void initializeMachinePostDominatorTreePass(PassRegistry&);
 void initializeMachineRegionInfoPassPass(PassRegistry&);
@@ -245,17 +250,17 @@ void initializeMemCpyOptLegacyPassPass(PassRegistry&);
 void initializeMemDepPrinterPass(PassRegistry&);
 void initializeMemDerefPrinterPass(PassRegistry&);
 void initializeMemoryDependenceWrapperPassPass(PassRegistry&);
+void initializeMemorySSAPrinterLegacyPassPass(PassRegistry&);
 void initializeMemorySSAWrapperPassPass(PassRegistry&);
-void initializeMemorySSAPrinterLegacyPassPass(PassRegistry &);
 void initializeMemorySanitizerPass(PassRegistry&);
 void initializeMergeFunctionsPass(PassRegistry&);
-void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry &);
+void initializeMergedLoadStoreMotionLegacyPassPass(PassRegistry&);
 void initializeMetaRenamerPass(PassRegistry&);
 void initializeModuleDebugInfoPrinterPass(PassRegistry&);
-void initializeModuleSummaryIndexWrapperPassPass(PassRegistry &);
-void initializeNameAnonGlobalLegacyPassPass(PassRegistry &);
-void initializeNaryReassociateLegacyPassPass(PassRegistry &);
-void initializeNewGVNPass(PassRegistry&);
+void initializeModuleSummaryIndexWrapperPassPass(PassRegistry&);
+void initializeNameAnonGlobalLegacyPassPass(PassRegistry&);
+void initializeNaryReassociateLegacyPassPass(PassRegistry&);
+void initializeNewGVNLegacyPassPass(PassRegistry&);
 void initializeObjCARCAAWrapperPassPass(PassRegistry&);
 void initializeObjCARCAPElimPass(PassRegistry&);
 void initializeObjCARCContractPass(PassRegistry&);
@@ -263,17 +268,18 @@ void initializeObjCARCExpandPass(PassRegistry&);
 void initializeObjCARCOptPass(PassRegistry&);
 void initializeOptimizationRemarkEmitterWrapperPassPass(PassRegistry&);
 void initializeOptimizePHIsPass(PassRegistry&);
-void initializePAEvalPass(PassRegistry &);
+void initializePAEvalPass(PassRegistry&);
 void initializePEIPass(PassRegistry&);
 void initializePGOIndirectCallPromotionLegacyPassPass(PassRegistry&);
 void initializePGOInstrumentationGenLegacyPassPass(PassRegistry&);
 void initializePGOInstrumentationUseLegacyPassPass(PassRegistry&);
+void initializePGOMemOPSizeOptLegacyPassPass(PassRegistry&);
 void initializePHIEliminationPass(PassRegistry&);
-void initializePhysicalRegisterUsageInfoPass(PassRegistry &);
-void initializePartialInlinerLegacyPassPass(PassRegistry &);
-void initializePartiallyInlineLibCallsLegacyPassPass(PassRegistry &);
-void initializePatchableFunctionPass(PassRegistry &);
+void initializePartialInlinerLegacyPassPass(PassRegistry&);
+void initializePartiallyInlineLibCallsLegacyPassPass(PassRegistry&);
+void initializePatchableFunctionPass(PassRegistry&);
 void initializePeepholeOptimizerPass(PassRegistry&);
+void initializePhysicalRegisterUsageInfoPass(PassRegistry&);
 void initializePlaceBackedgeSafepointsImplPass(PassRegistry&);
 void initializePlaceSafepointsPass(PassRegistry&);
 void initializePostDomOnlyPrinterPass(PassRegistry&);
@@ -286,15 +292,17 @@ void initializePostOrderFunctionAttrsLegacyPassPass(PassRegistry&);
 void initializePostRAHazardRecognizerPass(PassRegistry&);
 void initializePostRASchedulerPass(PassRegistry&);
 void initializePreISelIntrinsicLoweringLegacyPassPass(PassRegistry&);
+void initializePredicateInfoPrinterLegacyPassPass(PassRegistry&);
 void initializePrintBasicBlockPassPass(PassRegistry&);
 void initializePrintFunctionPassWrapperPass(PassRegistry&);
 void initializePrintModulePassWrapperPass(PassRegistry&);
 void initializeProcessImplicitDefsPass(PassRegistry&);
-void initializeProfileSummaryInfoWrapperPassPass(PassRegistry &);
-void initializePromoteLegacyPassPass(PassRegistry &);
+void initializeProfileSummaryInfoWrapperPassPass(PassRegistry&);
+void initializePromoteLegacyPassPass(PassRegistry&);
 void initializePruneEHPass(PassRegistry&);
+void initializeRAGreedyPass(PassRegistry&);
 void initializeReassociateLegacyPassPass(PassRegistry&);
-void initializeRegBankSelectPass(PassRegistry &);
+void initializeRegBankSelectPass(PassRegistry&);
 void initializeRegToMemPass(PassRegistry&);
 void initializeRegionInfoPassPass(PassRegistry&);
 void initializeRegionOnlyPrinterPass(PassRegistry&);
@@ -302,13 +310,12 @@ void initializeRegionOnlyViewerPass(PassRegistry&);
 void initializeRegionPrinterPass(PassRegistry&);
 void initializeRegionViewerPass(PassRegistry&);
 void initializeRegisterCoalescerPass(PassRegistry&);
-void initializeStripGCRelocatesPass(PassRegistry&);
 void initializeRenameIndependentSubregsPass(PassRegistry&);
-void initializeResetMachineFunctionPass(PassRegistry &);
+void initializeResetMachineFunctionPass(PassRegistry&);
 void initializeReversePostOrderFunctionAttrsLegacyPassPass(PassRegistry&);
 void initializeRewriteStatepointsForGCPass(PassRegistry&);
 void initializeRewriteSymbolsLegacyPassPass(PassRegistry&);
-void initializeSCCPLegacyPassPass(PassRegistry &);
+void initializeSCCPLegacyPassPass(PassRegistry&);
 void initializeSCEVAAWrapperPassPass(PassRegistry&);
 void initializeSLPVectorizerPass(PassRegistry&);
 void initializeSROALegacyPassPass(PassRegistry&);
@@ -318,9 +325,9 @@ void initializeSanitizerCoverageModulePass(PassRegistry&);
 void initializeScalarEvolutionWrapperPassPass(PassRegistry&);
 void initializeScalarizerPass(PassRegistry&);
 void initializeScopedNoAliasAAWrapperPassPass(PassRegistry&);
-void initializeSeparateConstOffsetFromGEPPass(PassRegistry &);
+void initializeSeparateConstOffsetFromGEPPass(PassRegistry&);
 void initializeShadowStackGCLoweringPass(PassRegistry&);
-void initializeShrinkWrapPass(PassRegistry &);
+void initializeShrinkWrapPass(PassRegistry&);
 void initializeSimpleInlinerPass(PassRegistry&);
 void initializeSingleLoopExtractorPass(PassRegistry&);
 void initializeSinkingLegacyPassPass(PassRegistry&);
@@ -332,19 +339,20 @@ void initializeStackColoringPass(PassRegistry&);
 void initializeStackMapLivenessPass(PassRegistry&);
 void initializeStackProtectorPass(PassRegistry&);
 void initializeStackSlotColoringPass(PassRegistry&);
-void initializeStraightLineStrengthReducePass(PassRegistry &);
+void initializeStraightLineStrengthReducePass(PassRegistry&);
 void initializeStripDeadDebugInfoPass(PassRegistry&);
 void initializeStripDeadPrototypesLegacyPassPass(PassRegistry&);
 void initializeStripDebugDeclarePass(PassRegistry&);
+void initializeStripGCRelocatesPass(PassRegistry&);
 void initializeStripNonDebugSymbolsPass(PassRegistry&);
 void initializeStripNonLineTableDebugInfoPass(PassRegistry&);
 void initializeStripSymbolsPass(PassRegistry&);
 void initializeStructurizeCFGPass(PassRegistry&);
 void initializeTailCallElimPass(PassRegistry&);
 void initializeTailDuplicatePassPass(PassRegistry&);
-void initializeTargetLibraryInfoWrapperPassPass(PassRegistry &);
+void initializeTargetLibraryInfoWrapperPassPass(PassRegistry&);
 void initializeTargetPassConfigPass(PassRegistry&);
-void initializeTargetTransformInfoWrapperPassPass(PassRegistry &);
+void initializeTargetTransformInfoWrapperPassPass(PassRegistry&);
 void initializeThreadSanitizerPass(PassRegistry&);
 void initializeTwoAddressInstructionPassPass(PassRegistry&);
 void initializeTypeBasedAAWrapperPassPass(PassRegistry&);
@@ -355,11 +363,11 @@ void initializeUnreachableMachineBlockElimPass(PassRegistry&);
 void initializeVerifierLegacyPassPass(PassRegistry&);
 void initializeVirtRegMapPass(PassRegistry&);
 void initializeVirtRegRewriterPass(PassRegistry&);
-void initializeWholeProgramDevirtPass(PassRegistry &);
+void initializeWholeProgramDevirtPass(PassRegistry&);
 void initializeWinEHPreparePass(PassRegistry&);
-void initializeWriteBitcodePassPass(PassRegistry &);
-void initializeWriteThinLTOBitcodePass(PassRegistry &);
-void initializeXRayInstrumentationPass(PassRegistry &);
+void initializeWriteBitcodePassPass(PassRegistry&);
+void initializeWriteThinLTOBitcodePass(PassRegistry&);
+void initializeXRayInstrumentationPass(PassRegistry&);
 }
 
 #endif
diff --git a/include/llvm/LTO/Caching.h b/include/llvm/LTO/Caching.h
index 769f4cd9cc769ed976861bb0af6102c97e16ac21..f5ec70e081c12ce4d824173acf02619eca7bd101 100644
--- a/include/llvm/LTO/Caching.h
+++ b/include/llvm/LTO/Caching.h
@@ -24,12 +24,19 @@ namespace lto {
 /// This type defines the callback to add a pre-existing native object file
 /// (e.g. in a cache).
 ///
-/// File callbacks must be thread safe.
-typedef std::function<void(unsigned Task, StringRef Path)> AddFileFn;
+/// MB->getBufferIdentifier() is a valid path for the file at the time that it
+/// was opened, but clients should prefer to access MB directly in order to
+/// avoid a potential race condition.
+///
+/// Buffer callbacks must be thread safe.
+typedef std::function<void(unsigned Task, std::unique_ptr<MemoryBuffer> MB)>
+    AddBufferFn;
 
 /// Create a local file system cache which uses the given cache directory and
-/// file callback.
-NativeObjectCache localCache(StringRef CacheDirectoryPath, AddFileFn AddFile);
+/// buffer callback. This function also creates the cache directory if it does
+/// not already exist.
+Expected<NativeObjectCache> localCache(StringRef CacheDirectoryPath,
+                                       AddBufferFn AddBuffer);
 
 } // namespace lto
 } // namespace llvm
diff --git a/include/llvm/LTO/Config.h b/include/llvm/LTO/Config.h
index 3aa48c9f7c2868f0a8f5caf8ea3f0c9e03d68ae0..ede6637dfa4dd61f29ca653c5a9c738148123283 100644
--- a/include/llvm/LTO/Config.h
+++ b/include/llvm/LTO/Config.h
@@ -17,6 +17,7 @@
 
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
 #include <functional>
@@ -41,6 +42,7 @@ struct Config {
   Reloc::Model RelocModel = Reloc::PIC_;
   CodeModel::Model CodeModel = CodeModel::Default;
   CodeGenOpt::Level CGOptLevel = CodeGenOpt::Default;
+  TargetMachine::CodeGenFileType CGFileType = TargetMachine::CGFT_ObjectFile;
   unsigned OptLevel = 2;
   bool DisableVerify = false;
 
@@ -68,6 +70,12 @@ struct Config {
   /// Sample PGO profile path.
   std::string SampleProfile;
 
+  /// Optimization remarks file path.
+  std::string RemarksFilename = "";
+
+  /// Whether to emit optimization remarks with hotness information.
+  bool RemarksWithHotness = false;
+
   bool ShouldDiscardValueNames = true;
   DiagnosticHandlerFunction DiagHandler;
 
diff --git a/include/llvm/LTO/LTO.h b/include/llvm/LTO/LTO.h
index 78ac73a7418ccf80db053f84b57bd22bd0f6bcd5..693568f5b9a968b393ab02b8289019c5a4afc4fe 100644
--- a/include/llvm/LTO/LTO.h
+++ b/include/llvm/LTO/LTO.h
@@ -19,12 +19,14 @@
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringSet.h"
-#include "llvm/CodeGen/Analysis.h"
+#include "llvm/Analysis/ObjectUtils.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/ModuleSummaryIndex.h"
 #include "llvm/LTO/Config.h"
 #include "llvm/Linker/IRMover.h"
-#include "llvm/Object/IRObjectFile.h"
+#include "llvm/Object/IRSymtab.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Support/thread.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Transforms/IPO/FunctionImport.h"
@@ -68,25 +70,35 @@ std::string getThinLTOOutputFile(const std::string &Path,
                                  const std::string &OldPrefix,
                                  const std::string &NewPrefix);
 
+/// Set up optimization remarks.
+Expected<std::unique_ptr<tool_output_file>>
+setupOptimizationRemarks(LLVMContext &Context, StringRef LTORemarksFilename,
+                         bool LTOPassRemarksWithHotness, int Count = -1);
+
 class LTO;
 struct SymbolResolution;
 class ThinBackendProc;
 
-/// An input file. This is a wrapper for ModuleSymbolTable that exposes only the
+/// An input file. This is a symbol table wrapper that only exposes the
 /// information that an LTO client should need in order to do symbol resolution.
 class InputFile {
+public:
+  class Symbol;
+
+private:
   // FIXME: Remove LTO class friendship once we have bitcode symbol tables.
   friend LTO;
   InputFile() = default;
 
-  // FIXME: Remove the LLVMContext once we have bitcode symbol tables.
-  LLVMContext Ctx;
-  struct InputModule;
-  std::vector<InputModule> Mods;
-  ModuleSymbolTable SymTab;
+  std::vector<BitcodeModule> Mods;
+  SmallVector<char, 0> Strtab;
+  std::vector<Symbol> Symbols;
 
-  std::vector<StringRef> Comdats;
-  DenseMap<const Comdat *, unsigned> ComdatMap;
+  // [begin, end) index range into Symbols for each module.
+  std::vector<std::pair<size_t, size_t>> ModuleSymIndices;
+
+  StringRef SourceFileName, COFFLinkerOpts;
+  std::vector<StringRef> ComdatTable;
 
 public:
   ~InputFile();
@@ -94,143 +106,48 @@ public:
   /// Create an InputFile.
   static Expected<std::unique_ptr<InputFile>> create(MemoryBufferRef Object);
 
-  class symbol_iterator;
-
-  /// This is a wrapper for ArrayRef<ModuleSymbolTable::Symbol>::iterator that
-  /// exposes only the information that an LTO client should need in order to do
-  /// symbol resolution.
-  ///
-  /// This object is ephemeral; it is only valid as long as an iterator obtained
-  /// from symbols() refers to it.
-  class Symbol {
-    friend symbol_iterator;
+  /// The purpose of this class is to only expose the symbol information that an
+  /// LTO client should need in order to do symbol resolution.
+  class Symbol : irsymtab::Symbol {
     friend LTO;
 
-    ArrayRef<ModuleSymbolTable::Symbol>::iterator I;
-    const ModuleSymbolTable &SymTab;
-    const InputFile *File;
-    uint32_t Flags;
-    SmallString<64> Name;
-
-    bool shouldSkip() {
-      return !(Flags & object::BasicSymbolRef::SF_Global) ||
-             (Flags & object::BasicSymbolRef::SF_FormatSpecific);
-    }
-
-    void skip() {
-      ArrayRef<ModuleSymbolTable::Symbol>::iterator E = SymTab.symbols().end();
-      while (I != E) {
-        Flags = SymTab.getSymbolFlags(*I);
-        if (!shouldSkip())
-          break;
-        ++I;
-      }
-      if (I == E)
-        return;
-
-      Name.clear();
-      {
-        raw_svector_ostream OS(Name);
-        SymTab.printSymbolName(OS, *I);
-      }
-    }
-
-    bool isGV() const { return I->is<GlobalValue *>(); }
-    GlobalValue *getGV() const { return I->get<GlobalValue *>(); }
-
   public:
-    Symbol(ArrayRef<ModuleSymbolTable::Symbol>::iterator I,
-           const ModuleSymbolTable &SymTab, const InputFile *File)
-        : I(I), SymTab(SymTab), File(File) {
-      skip();
-    }
-
-    /// Returns the mangled name of the global.
-    StringRef getName() const { return Name; }
-
-    uint32_t getFlags() const { return Flags; }
-    GlobalValue::VisibilityTypes getVisibility() const {
-      if (isGV())
-        return getGV()->getVisibility();
-      return GlobalValue::DefaultVisibility;
-    }
-    bool canBeOmittedFromSymbolTable() const {
-      return isGV() && llvm::canBeOmittedFromSymbolTable(getGV());
-    }
-    bool isTLS() const {
-      // FIXME: Expose a thread-local flag for module asm symbols.
-      return isGV() && getGV()->isThreadLocal();
-    }
-
-    // Returns the index of the comdat this symbol is in or -1 if the symbol
-    // is not in a comdat.
-    // FIXME: We have to return Expected<int> because aliases point to an
-    // arbitrary ConstantExpr and that might not actually be a constant. That
-    // means we might not be able to find what an alias is aliased to and
-    // so find its comdat.
-    Expected<int> getComdatIndex() const;
-
-    uint64_t getCommonSize() const {
-      assert(Flags & object::BasicSymbolRef::SF_Common);
-      if (!isGV())
-        return 0;
-      return getGV()->getParent()->getDataLayout().getTypeAllocSize(
-          getGV()->getType()->getElementType());
-    }
-    unsigned getCommonAlignment() const {
-      assert(Flags & object::BasicSymbolRef::SF_Common);
-      if (!isGV())
-        return 0;
-      return getGV()->getAlignment();
-    }
-  };
-
-  class symbol_iterator {
-    Symbol Sym;
-
-  public:
-    symbol_iterator(ArrayRef<ModuleSymbolTable::Symbol>::iterator I,
-                    const ModuleSymbolTable &SymTab, const InputFile *File)
-        : Sym(I, SymTab, File) {}
-
-    symbol_iterator &operator++() {
-      ++Sym.I;
-      Sym.skip();
-      return *this;
-    }
-
-    symbol_iterator operator++(int) {
-      symbol_iterator I = *this;
-      ++*this;
-      return I;
-    }
-
-    const Symbol &operator*() const { return Sym; }
-    const Symbol *operator->() const { return &Sym; }
-
-    bool operator!=(const symbol_iterator &Other) const {
-      return Sym.I != Other.Sym.I;
-    }
+    Symbol(const irsymtab::Symbol &S) : irsymtab::Symbol(S) {}
+
+    using irsymtab::Symbol::isUndefined;
+    using irsymtab::Symbol::isCommon;
+    using irsymtab::Symbol::isWeak;
+    using irsymtab::Symbol::isIndirect;
+    using irsymtab::Symbol::getName;
+    using irsymtab::Symbol::getVisibility;
+    using irsymtab::Symbol::canBeOmittedFromSymbolTable;
+    using irsymtab::Symbol::isTLS;
+    using irsymtab::Symbol::getComdatIndex;
+    using irsymtab::Symbol::getCommonSize;
+    using irsymtab::Symbol::getCommonAlignment;
+    using irsymtab::Symbol::getCOFFWeakExternalFallback;
   };
 
   /// A range over the symbols in this InputFile.
-  iterator_range<symbol_iterator> symbols() {
-    return llvm::make_range(
-        symbol_iterator(SymTab.symbols().begin(), SymTab, this),
-        symbol_iterator(SymTab.symbols().end(), SymTab, this));
-  }
+  ArrayRef<Symbol> symbols() const { return Symbols; }
+
+  /// Returns linker options specified in the input file.
+  StringRef getCOFFLinkerOpts() const { return COFFLinkerOpts; }
 
   /// Returns the path to the InputFile.
   StringRef getName() const;
 
   /// Returns the source file path specified at compile time.
-  StringRef getSourceFileName() const;
+  StringRef getSourceFileName() const { return SourceFileName; }
 
   // Returns a table with all the comdats used by this file.
-  ArrayRef<StringRef> getComdatTable() const { return Comdats; }
+  ArrayRef<StringRef> getComdatTable() const { return ComdatTable; }
 
 private:
-  iterator_range<symbol_iterator> module_symbols(InputModule &IM);
+  ArrayRef<Symbol> module_symbols(unsigned I) const {
+    const auto &Indices = ModuleSymIndices[I];
+    return {Symbols.data() + Indices.first, Symbols.data() + Indices.second};
+  }
 };
 
 /// This class wraps an output stream for a native object. Most clients should
@@ -418,20 +335,20 @@ private:
   // Global mapping from mangled symbol names to resolutions.
   StringMap<GlobalResolution> GlobalResolutions;
 
-  void addSymbolToGlobalRes(SmallPtrSet<GlobalValue *, 8> &Used,
-                            const InputFile::Symbol &Sym, SymbolResolution Res,
+  void addSymbolToGlobalRes(const InputFile::Symbol &Sym, SymbolResolution Res,
                             unsigned Partition);
 
   // These functions take a range of symbol resolutions [ResI, ResE) and consume
   // the resolutions used by a single input module by incrementing ResI. After
   // these functions return, [ResI, ResE) will refer to the resolution range for
   // the remaining modules in the InputFile.
-  Error addModule(InputFile &Input, InputFile::InputModule &IM,
+  Error addModule(InputFile &Input, unsigned ModI,
                   const SymbolResolution *&ResI, const SymbolResolution *ResE);
-  Error addRegularLTO(BitcodeModule BM, const SymbolResolution *&ResI,
+  Error addRegularLTO(BitcodeModule BM,
+                      ArrayRef<InputFile::Symbol> Syms,
+                      const SymbolResolution *&ResI,
                       const SymbolResolution *ResE);
-  Error addThinLTO(BitcodeModule BM, Module &M,
-                   iterator_range<InputFile::symbol_iterator> Syms,
+  Error addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
                    const SymbolResolution *&ResI, const SymbolResolution *ResE);
 
   Error runRegularLTO(AddStreamFn AddStream);
diff --git a/include/llvm/LTO/LTOBackend.h b/include/llvm/LTO/LTOBackend.h
index f5e35daf34f1a05417c5bd6372311d1d05ac6e21..d4743f6940ff931c471f2b7c01c7f159247a7536 100644
--- a/include/llvm/LTO/LTOBackend.h
+++ b/include/llvm/LTO/LTOBackend.h
@@ -42,7 +42,7 @@ Error backend(Config &C, AddStreamFn AddStream,
 
 /// Runs a ThinLTO backend.
 Error thinBackend(Config &C, unsigned Task, AddStreamFn AddStream, Module &M,
-                  ModuleSummaryIndex &CombinedIndex,
+                  const ModuleSummaryIndex &CombinedIndex,
                   const FunctionImporter::ImportMapTy &ImportList,
                   const GVSummaryMapTy &DefinedGlobals,
                   MapVector<StringRef, BitcodeModule> &ModuleMap);
diff --git a/include/llvm/LTO/legacy/LTOCodeGenerator.h b/include/llvm/LTO/legacy/LTOCodeGenerator.h
index f1468211128030778d2f45ebb01ad6bf8eb1350f..952875fc854e81efbe837f4377a4d6bd11671411 100644
--- a/include/llvm/LTO/legacy/LTOCodeGenerator.h
+++ b/include/llvm/LTO/legacy/LTOCodeGenerator.h
@@ -41,6 +41,7 @@
 #include "llvm/ADT/StringSet.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Module.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
@@ -174,6 +175,10 @@ struct LTOCodeGenerator {
   /// Calls \a verifyMergedModuleOnce().
   bool compileOptimized(ArrayRef<raw_pwrite_stream *> Out);
 
+  /// Enable the Freestanding mode: indicate that the optimizer should not
+  /// assume builtins are present on the target.
+  void setFreestanding(bool Enabled) { Freestanding = Enabled; }
+
   void setDiagnosticHandler(lto_diagnostic_handler_t, void *);
 
   LLVMContext &getContext() { return Context; }
@@ -206,7 +211,6 @@ private:
   void emitError(const std::string &ErrMsg);
   void emitWarning(const std::string &ErrMsg);
 
-  bool setupOptimizationRemarks();
   void finishOptimizationRemarks();
 
   LLVMContext &Context;
@@ -237,6 +241,7 @@ private:
   bool ShouldRestoreGlobalsLinkage = false;
   TargetMachine::CodeGenFileType FileType = TargetMachine::CGFT_ObjectFile;
   std::unique_ptr<tool_output_file> DiagnosticOutputFile;
+  bool Freestanding = false;
 };
 }
 #endif
diff --git a/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h b/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
index cb4a16cb5b7b68be5b6dc400e2b5bc9083755430..f9545333aabdf187d68426fb2117307cc49cf93e 100644
--- a/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
+++ b/include/llvm/LTO/legacy/ThinLTOCodeGenerator.h
@@ -20,6 +20,7 @@
 #include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/IR/ModuleSummaryIndex.h"
+#include "llvm/Support/CachePruning.h"
 #include "llvm/Support/CodeGen.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Target/TargetOptions.h"
@@ -31,6 +32,23 @@ class StringRef;
 class LLVMContext;
 class TargetMachine;
 
+/// Wrapper around MemoryBufferRef that owns a copy of the buffer identifier.
+class ThinLTOBuffer {
+  std::string OwnedIdentifier;
+  StringRef Buffer;
+
+public:
+  ThinLTOBuffer(StringRef Buffer, StringRef Identifier)
+      : OwnedIdentifier(Identifier), Buffer(Buffer) {}
+
+  MemoryBufferRef getMemBuffer() const {
+    return MemoryBufferRef(Buffer,
+                           {OwnedIdentifier.c_str(), OwnedIdentifier.size()});
+  }
+  StringRef getBuffer() const { return Buffer; }
+  StringRef getBufferIdentifier() const { return OwnedIdentifier; }
+};
+
 /// Helper to gather options relevant to the target machine creation
 struct TargetMachineBuilder {
   Triple TheTriple;
@@ -123,9 +141,7 @@ public:
 
   struct CachingOptions {
     std::string Path;                    // Path to the cache, empty to disable.
-    int PruningInterval = 1200;          // seconds, -1 to disable pruning.
-    unsigned int Expiration = 7 * 24 * 3600;     // seconds (1w default).
-    unsigned MaxPercentageOfAvailableSpace = 75; // percentage.
+    CachePruningPolicy Policy;
   };
 
   /// Provide a path to a directory where to store the cached files for
@@ -136,14 +152,14 @@ public:
   /// negative value (default) to disable pruning. A value of 0 will be ignored.
   void setCachePruningInterval(int Interval) {
     if (Interval)
-      CacheOptions.PruningInterval = Interval;
+      CacheOptions.Policy.Interval = std::chrono::seconds(Interval);
   }
 
   /// Cache policy: expiration (in seconds) for an entry.
   /// A value of 0 will be ignored.
   void setCacheEntryExpiration(unsigned Expiration) {
     if (Expiration)
-      CacheOptions.Expiration = Expiration;
+      CacheOptions.Policy.Expiration = std::chrono::seconds(Expiration);
   }
 
   /**
@@ -161,7 +177,7 @@ public:
    */
   void setMaxCacheSizeRelativeToAvailableSpace(unsigned Percentage) {
     if (Percentage)
-      CacheOptions.MaxPercentageOfAvailableSpace = Percentage;
+      CacheOptions.Policy.PercentageOfAvailableSpace = Percentage;
   }
 
   /**@}*/
@@ -189,6 +205,10 @@ public:
     TMBuilder.Options = std::move(Options);
   }
 
+  /// Enable the Freestanding mode: indicate that the optimizer should not
+  /// assume builtins are present on the target.
+  void setFreestanding(bool Enabled) { Freestanding = Enabled; }
+
   /// CodeModel
   void setCodePICModel(Optional<Reloc::Model> Model) {
     TMBuilder.RelocModel = Model;
@@ -280,7 +300,7 @@ private:
 
   /// Vector holding the input buffers containing the bitcode modules to
   /// process.
-  std::vector<MemoryBufferRef> Modules;
+  std::vector<ThinLTOBuffer> Modules;
 
   /// Set of symbols that need to be preserved outside of the set of bitcode
   /// files.
@@ -306,6 +326,10 @@ private:
   /// importing or optimization.
   bool CodeGenOnly = false;
 
+  /// Flag to indicate that the optimizer should not assume builtins are present
+  /// on the target.
+  bool Freestanding = false;
+
   /// IR Optimization Level [0-3].
   unsigned OptLevel = 3;
 };
diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h
index 98dac1256e01ed6aa19f63c2a57fe92747524ba5..39a86e838bde3bb2e7c3857bce9203f6543d141a 100644
--- a/include/llvm/LinkAllPasses.h
+++ b/include/llvm/LinkAllPasses.h
@@ -75,6 +75,7 @@ namespace {
       (void) llvm::createCallGraphDOTPrinterPass();
       (void) llvm::createCallGraphViewerPass();
       (void) llvm::createCFGSimplificationPass();
+      (void) llvm::createLateCFGSimplificationPass();
       (void) llvm::createCFLAndersAAWrapperPass();
       (void) llvm::createCFLSteensAAWrapperPass();
       (void) llvm::createStructurizeCFGPass();
@@ -96,6 +97,7 @@ namespace {
       (void) llvm::createPGOInstrumentationGenLegacyPass();
       (void) llvm::createPGOInstrumentationUseLegacyPass();
       (void) llvm::createPGOIndirectCallPromotionLegacyPass();
+      (void) llvm::createPGOMemOPSizeOptLegacyPass();
       (void) llvm::createInstrProfilingLegacyPass();
       (void) llvm::createFunctionImportPass();
       (void) llvm::createFunctionInliningPass();
diff --git a/include/llvm/Linker/IRMover.h b/include/llvm/Linker/IRMover.h
index 2a187cbc42f5a8bf373c2677604574cffcf9bf27..235ada47cef47900d5f2203f5cf4223fcd923c43 100644
--- a/include/llvm/Linker/IRMover.h
+++ b/include/llvm/Linker/IRMover.h
@@ -71,15 +71,11 @@ public:
   ///   not present in ValuesToLink. The GlobalValue and a ValueAdder callback
   ///   are passed as an argument, and the callback is expected to be called
   ///   if the GlobalValue needs to be added to the \p ValuesToLink and linked.
-  /// - \p LinkModuleInlineAsm is true if the ModuleInlineAsm string in Src
-  ///   should be linked with (concatenated into) the ModuleInlineAsm string
-  ///   for the destination module. It should be true for full LTO, but not
-  ///   when importing for ThinLTO, otherwise we can have duplicate symbols.
   /// - \p IsPerformingImport is true when this IR link is to perform ThinLTO
   ///   function importing from Src.
   Error move(std::unique_ptr<Module> Src, ArrayRef<GlobalValue *> ValuesToLink,
              std::function<void(GlobalValue &GV, ValueAdder Add)> AddLazyFor,
-             bool LinkModuleInlineAsm, bool IsPerformingImport);
+             bool IsPerformingImport);
   Module &getModule() { return Composite; }
 
 private:
diff --git a/include/llvm/Linker/Linker.h b/include/llvm/Linker/Linker.h
index b077c373326fc3392cef3ada4b65c653ea402513..628e0112bd9d9f5ce5808d725528e154387d5353 100644
--- a/include/llvm/Linker/Linker.h
+++ b/include/llvm/Linker/Linker.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_LINKER_LINKER_H
 #define LLVM_LINKER_LINKER_H
 
+#include "llvm/ADT/StringSet.h"
 #include "llvm/Linker/IRMover.h"
 
 namespace llvm {
@@ -29,10 +30,6 @@ public:
     None = 0,
     OverrideFromSrc = (1 << 0),
     LinkOnlyNeeded = (1 << 1),
-    InternalizeLinkedSymbols = (1 << 2),
-    /// Don't force link referenced linkonce definitions, import declaration.
-    DontForceLinkLinkonceODR = (1 << 3)
-
   };
 
   Linker(Module &M);
@@ -41,16 +38,20 @@ public:
   ///
   /// Passing OverrideSymbols as true will have symbols from Src
   /// shadow those in the Dest.
-  /// For ThinLTO function importing/exporting the \p ModuleSummaryIndex
-  /// is passed. If \p GlobalsToImport is provided, only the globals that
-  /// are part of the set will be imported from the source module.
+  ///
+  /// Passing InternalizeCallback will have the linker call the function with
+  /// the new module and a list of global value names to be internalized by the
+  /// callback.
   ///
   /// Returns true on error.
   bool linkInModule(std::unique_ptr<Module> Src, unsigned Flags = Flags::None,
-                    DenseSet<const GlobalValue *> *GlobalsToImport = nullptr);
+                    std::function<void(Module &, const StringSet<> &)>
+                        InternalizeCallback = {});
 
   static bool linkModules(Module &Dest, std::unique_ptr<Module> Src,
-                          unsigned Flags = Flags::None);
+                          unsigned Flags = Flags::None,
+                          std::function<void(Module &, const StringSet<> &)>
+                              InternalizeCallback = {});
 };
 
 } // End llvm namespace
diff --git a/include/llvm/MC/ConstantPools.h b/include/llvm/MC/ConstantPools.h
index f0c445dbe59fd97af9d3c931329f3e9ed430e0fc..643902377dd31350e50a5e3aa4ad1c003631059b 100644
--- a/include/llvm/MC/ConstantPools.h
+++ b/include/llvm/MC/ConstantPools.h
@@ -11,15 +11,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-
 #ifndef LLVM_MC_CONSTANTPOOLS_H
 #define LLVM_MC_CONSTANTPOOLS_H
 
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/SMLoc.h"
+#include <cstdint>
 
 namespace llvm {
+
 class MCContext;
 class MCExpr;
 class MCSection;
@@ -30,6 +32,7 @@ class MCSymbolRefExpr;
 struct ConstantPoolEntry {
   ConstantPoolEntry(MCSymbol *L, const MCExpr *Val, unsigned Sz, SMLoc Loc_)
     : Label(L), Value(Val), Size(Sz), Loc(Loc_) {}
+
   MCSymbol *Label;
   const MCExpr *Value;
   unsigned Size;
@@ -45,7 +48,7 @@ class ConstantPool {
 
 public:
   // Initialize a new empty constant pool
-  ConstantPool() {}
+  ConstantPool() = default;
 
   // Add a new entry to the constant pool in the next slot.
   // \param Value is the new entry to put in the constant pool.
@@ -90,6 +93,7 @@ private:
   ConstantPool *getConstantPool(MCSection *Section);
   ConstantPool &getOrCreateConstantPool(MCSection *Section);
 };
+
 } // end namespace llvm
 
-#endif
+#endif // LLVM_MC_CONSTANTPOOLS_H
diff --git a/include/llvm/MC/MCAsmBackend.h b/include/llvm/MC/MCAsmBackend.h
index d4bdbcd2baa37413d3c2962ddff0d860812be7a7..fb21e195b1dfe3e22813044bc18206b54c9b63af 100644
--- a/include/llvm/MC/MCAsmBackend.h
+++ b/include/llvm/MC/MCAsmBackend.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCAsmBackend.h - MC Asm Backend -----------------*- C++ -*-===//
+//===- llvm/MC/MCAsmBackend.h - MC Asm Backend ------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,35 +12,33 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCFixup.h"
-#include "llvm/Support/DataTypes.h"
-#include "llvm/Support/ErrorHandling.h"
+#include <cstdint>
 
 namespace llvm {
+
 class MCAsmLayout;
 class MCAssembler;
 class MCCFIInstruction;
-class MCELFObjectTargetWriter;
 struct MCFixupKindInfo;
 class MCFragment;
 class MCInst;
-class MCRelaxableFragment;
 class MCObjectWriter;
-class MCSection;
+class MCRelaxableFragment;
 class MCSubtargetInfo;
 class MCValue;
 class raw_pwrite_stream;
 
 /// Generic interface to target specific assembler backends.
 class MCAsmBackend {
-  MCAsmBackend(const MCAsmBackend &) = delete;
-  void operator=(const MCAsmBackend &) = delete;
-
 protected: // Can only create subclasses.
   MCAsmBackend();
 
 public:
+  MCAsmBackend(const MCAsmBackend &) = delete;
+  MCAsmBackend &operator=(const MCAsmBackend &) = delete;
   virtual ~MCAsmBackend();
 
   /// lifetime management
@@ -73,9 +71,11 @@ public:
 
   /// Apply the \p Value for given \p Fixup into the provided data fragment, at
   /// the offset specified by the fixup and following the fixup kind as
-  /// appropriate.
+  /// appropriate. Errors (such as an out of range fixup value) should be
+  /// reported via \p Ctx.
   virtual void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                          uint64_t Value, bool IsPCRel) const = 0;
+                          uint64_t Value, bool IsPCRel,
+                          MCContext &Ctx) const = 0;
 
   /// @}
 
@@ -136,6 +136,6 @@ public:
   }
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCASMBACKEND_H
diff --git a/include/llvm/MC/MCAsmInfo.h b/include/llvm/MC/MCAsmInfo.h
index f898bf5288d60351bd39967297d424ba61da905a..bd2717de9960bdf0517f8840711eb54860f0d42e 100644
--- a/include/llvm/MC/MCAsmInfo.h
+++ b/include/llvm/MC/MCAsmInfo.h
@@ -16,20 +16,22 @@
 #ifndef LLVM_MC_MCASMINFO_H
 #define LLVM_MC_MCASMINFO_H
 
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCTargetOptions.h"
-#include <cassert>
 #include <vector>
 
 namespace llvm {
+
+class MCContext;
 class MCExpr;
 class MCSection;
 class MCStreamer;
 class MCSymbol;
-class MCContext;
 
 namespace WinEH {
+
 enum class EncodingType {
   Invalid, /// Invalid
   Alpha,   /// Windows Alpha
@@ -40,11 +42,14 @@ enum class EncodingType {
   X86,     /// Windows x86, uses no CFI, just EH tables
   MIPS = Alpha,
 };
-}
+
+} // end namespace WinEH
 
 namespace LCOMM {
+
 enum LCOMMType { NoAlignment, ByteAlignment, Log2Alignment };
-}
+
+} // end namespace LCOMM
 
 enum class DebugCompressionType {
   DCT_None,    // no compression
@@ -61,41 +66,41 @@ protected:
   //
 
   /// Pointer size in bytes.  Default is 4.
-  unsigned PointerSize;
+  unsigned PointerSize = 4;
 
   /// Size of the stack slot reserved for callee-saved registers, in bytes.
   /// Default is same as pointer size.
-  unsigned CalleeSaveStackSlotSize;
+  unsigned CalleeSaveStackSlotSize = 4;
 
   /// True if target is little endian.  Default is true.
-  bool IsLittleEndian;
+  bool IsLittleEndian = true;
 
   /// True if target stack grow up.  Default is false.
-  bool StackGrowsUp;
+  bool StackGrowsUp = false;
 
   /// True if this target has the MachO .subsections_via_symbols directive.
   /// Default is false.
-  bool HasSubsectionsViaSymbols;
+  bool HasSubsectionsViaSymbols = false;
 
   /// True if this is a MachO target that supports the macho-specific .zerofill
   /// directive for emitting BSS Symbols.  Default is false.
-  bool HasMachoZeroFillDirective;
+  bool HasMachoZeroFillDirective = false;
 
   /// True if this is a MachO target that supports the macho-specific .tbss
   /// directive for emitting thread local BSS Symbols.  Default is false.
-  bool HasMachoTBSSDirective;
+  bool HasMachoTBSSDirective = false;
 
   /// This is the maximum possible length of an instruction, which is needed to
   /// compute the size of an inline asm.  Defaults to 4.
-  unsigned MaxInstLength;
+  unsigned MaxInstLength = 4;
 
   /// Every possible instruction length is a multiple of this value.  Factored
   /// out in .debug_frame and .debug_line.  Defaults to 1.
-  unsigned MinInstAlignment;
+  unsigned MinInstAlignment = 1;
 
   /// The '$' token, when not referencing an identifier or constant, refers to
   /// the current PC.  Defaults to false.
-  bool DollarIsPC;
+  bool DollarIsPC = false;
 
   /// This string, if specified, is used to separate instructions from each
   /// other when on the same line.  Defaults to ';'
@@ -109,10 +114,10 @@ protected:
   const char *LabelSuffix;
 
   // Print the EH begin symbol with an assignment. Defaults to false.
-  bool UseAssignmentForEHBegin;
+  bool UseAssignmentForEHBegin = false;
 
   // Do we need to create a local symbol for .size?
-  bool NeedsLocalForSize;
+  bool NeedsLocalForSize = false;
 
   /// This prefix is used for globals like constant pool entries that are
   /// completely private to the .s file and should not have names in the .o
@@ -142,20 +147,20 @@ protected:
   const char *Code64Directive;
 
   /// Which dialect of an assembler variant to use.  Defaults to 0
-  unsigned AssemblerDialect;
+  unsigned AssemblerDialect = 0;
 
   /// This is true if the assembler allows @ characters in symbol names.
   /// Defaults to false.
-  bool AllowAtInName;
+  bool AllowAtInName = false;
 
   /// If this is true, symbol names with invalid characters will be printed in
   /// quotes.
-  bool SupportsQuotedNames;
+  bool SupportsQuotedNames = true;
 
   /// This is true if data region markers should be printed as
   /// ".data_region/.end_data_region" directives. If false, use "$d/$a" labels
   /// instead.
-  bool UseDataRegionDirectives;
+  bool UseDataRegionDirectives = false;
 
   //===--- Data Emission Directives -------------------------------------===//
 
@@ -185,13 +190,13 @@ protected:
 
   /// If non-null, a directive that is used to emit a word which should be
   /// relocated as a 64-bit GP-relative offset, e.g. .gpdword on Mips.  Defaults
-  /// to NULL.
-  const char *GPRel64Directive;
+  /// to nullptr.
+  const char *GPRel64Directive = nullptr;
 
   /// If non-null, a directive that is used to emit a word which should be
   /// relocated as a 32-bit GP-relative offset, e.g. .gpword on Mips or .gprel32
-  /// on Alpha.  Defaults to NULL.
-  const char *GPRel32Directive;
+  /// on Alpha.  Defaults to nullptr.
+  const char *GPRel32Directive = nullptr;
 
   /// If non-null, directives that are used to emit a word/dword which should
   /// be relocated as a 32/64-bit DTP/TP-relative offset, e.g. .dtprelword/
@@ -204,14 +209,14 @@ protected:
   /// This is true if this target uses "Sun Style" syntax for section switching
   /// ("#alloc,#write" etc) instead of the normal ELF syntax (,"a,w") in
   /// .section directives.  Defaults to false.
-  bool SunStyleELFSectionSwitchSyntax;
+  bool SunStyleELFSectionSwitchSyntax = false;
 
   /// This is true if this target uses ELF '.section' directive before the
   /// '.bss' one. It's used for PPC/Linux which doesn't support the '.bss'
   /// directive only.  Defaults to false.
-  bool UsesELFSectionDirectiveForBSS;
+  bool UsesELFSectionDirectiveForBSS = false;
 
-  bool NeedsDwarfSectionOffsetDirective;
+  bool NeedsDwarfSectionOffsetDirective = false;
 
   //===--- Alignment Information ----------------------------------------===//
 
@@ -219,11 +224,11 @@ protected:
   /// directives, where N is the number of bytes to align to.  Otherwise, it
   /// emits ".align log2(N)", e.g. 3 to align to an 8 byte boundary.  Defaults
   /// to true.
-  bool AlignmentIsInBytes;
+  bool AlignmentIsInBytes = true;
 
   /// If non-zero, this is used to fill the executable space created as the
   /// result of a alignment directive.  Defaults to 0
-  unsigned TextAlignFillValue;
+  unsigned TextAlignFillValue = 0;
 
   //===--- Global Variable Emission Directives --------------------------===//
 
@@ -236,7 +241,7 @@ protected:
   /// uses a relocation but it can be suppressed by writing
   ///   a = f - g
   ///   .long a
-  bool SetDirectiveSuppressesReloc;
+  bool SetDirectiveSuppressesReloc = false;
 
   /// False if the assembler requires that we use
   /// \code
@@ -251,98 +256,98 @@ protected:
   /// \endcode
   ///
   ///  Defaults to true.
-  bool HasAggressiveSymbolFolding;
+  bool HasAggressiveSymbolFolding = true;
 
   /// True is .comm's and .lcomms optional alignment is to be specified in bytes
   /// instead of log2(n).  Defaults to true.
-  bool COMMDirectiveAlignmentIsInBytes;
+  bool COMMDirectiveAlignmentIsInBytes = true;
 
   /// Describes if the .lcomm directive for the target supports an alignment
   /// argument and how it is interpreted.  Defaults to NoAlignment.
-  LCOMM::LCOMMType LCOMMDirectiveAlignmentType;
+  LCOMM::LCOMMType LCOMMDirectiveAlignmentType = LCOMM::NoAlignment;
 
   // True if the target allows .align directives on functions. This is true for
   // most targets, so defaults to true.
-  bool HasFunctionAlignment;
+  bool HasFunctionAlignment = true;
 
   /// True if the target has .type and .size directives, this is true for most
   /// ELF targets.  Defaults to true.
-  bool HasDotTypeDotSizeDirective;
+  bool HasDotTypeDotSizeDirective = true;
 
   /// True if the target has a single parameter .file directive, this is true
   /// for ELF targets.  Defaults to true.
-  bool HasSingleParameterDotFile;
+  bool HasSingleParameterDotFile = true;
 
   /// True if the target has a .ident directive, this is true for ELF targets.
   /// Defaults to false.
-  bool HasIdentDirective;
+  bool HasIdentDirective = false;
 
   /// True if this target supports the MachO .no_dead_strip directive.  Defaults
   /// to false.
-  bool HasNoDeadStrip;
+  bool HasNoDeadStrip = false;
 
   /// True if this target supports the MachO .alt_entry directive.  Defaults to
   /// false.
-  bool HasAltEntry;
+  bool HasAltEntry = false;
 
   /// Used to declare a global as being a weak symbol. Defaults to ".weak".
   const char *WeakDirective;
 
   /// This directive, if non-null, is used to declare a global as being a weak
-  /// undefined symbol.  Defaults to NULL.
-  const char *WeakRefDirective;
+  /// undefined symbol.  Defaults to nullptr.
+  const char *WeakRefDirective = nullptr;
 
   /// True if we have a directive to declare a global as being a weak defined
   /// symbol.  Defaults to false.
-  bool HasWeakDefDirective;
+  bool HasWeakDefDirective = false;
 
   /// True if we have a directive to declare a global as being a weak defined
   /// symbol that can be hidden (unexported).  Defaults to false.
-  bool HasWeakDefCanBeHiddenDirective;
+  bool HasWeakDefCanBeHiddenDirective = false;
 
   /// True if we have a .linkonce directive.  This is used on cygwin/mingw.
   /// Defaults to false.
-  bool HasLinkOnceDirective;
+  bool HasLinkOnceDirective = false;
 
   /// This attribute, if not MCSA_Invalid, is used to declare a symbol as having
   /// hidden visibility.  Defaults to MCSA_Hidden.
-  MCSymbolAttr HiddenVisibilityAttr;
+  MCSymbolAttr HiddenVisibilityAttr = MCSA_Hidden;
 
   /// This attribute, if not MCSA_Invalid, is used to declare an undefined
   /// symbol as having hidden visibility. Defaults to MCSA_Hidden.
-  MCSymbolAttr HiddenDeclarationVisibilityAttr;
+  MCSymbolAttr HiddenDeclarationVisibilityAttr = MCSA_Hidden;
 
   /// This attribute, if not MCSA_Invalid, is used to declare a symbol as having
   /// protected visibility.  Defaults to MCSA_Protected
-  MCSymbolAttr ProtectedVisibilityAttr;
+  MCSymbolAttr ProtectedVisibilityAttr = MCSA_Protected;
 
   //===--- Dwarf Emission Directives -----------------------------------===//
 
   /// True if target supports emission of debugging information.  Defaults to
   /// false.
-  bool SupportsDebugInformation;
+  bool SupportsDebugInformation = false;
 
   /// Exception handling format for the target.  Defaults to None.
-  ExceptionHandling ExceptionsType;
+  ExceptionHandling ExceptionsType = ExceptionHandling::None;
 
   /// Windows exception handling data (.pdata) encoding.  Defaults to Invalid.
-  WinEH::EncodingType WinEHEncodingType;
+  WinEH::EncodingType WinEHEncodingType = WinEH::EncodingType::Invalid;
 
   /// True if Dwarf2 output generally uses relocations for references to other
   /// .debug_* sections.
-  bool DwarfUsesRelocationsAcrossSections;
+  bool DwarfUsesRelocationsAcrossSections = true;
 
   /// True if DWARF FDE symbol reference relocations should be replaced by an
   /// absolute difference.
-  bool DwarfFDESymbolsUseAbsDiff;
+  bool DwarfFDESymbolsUseAbsDiff = false;
 
   /// True if dwarf register numbers are printed instead of symbolic register
   /// names in .cfi_* directives.  Defaults to false.
-  bool DwarfRegNumForCFI;
+  bool DwarfRegNumForCFI = false;
 
   /// True if target uses parens to indicate the symbol variant instead of @.
   /// For example, foo(plt) instead of foo@plt.  Defaults to false.
-  bool UseParensForSymbolVariant;
+  bool UseParensForSymbolVariant = false;
 
   //===--- Prologue State ----------------------------------------------===//
 
@@ -361,11 +366,11 @@ protected:
   bool PreserveAsmComments;
 
   /// Compress DWARF debug sections. Defaults to no compression.
-  DebugCompressionType CompressDebugSections;
+  DebugCompressionType CompressDebugSections = DebugCompressionType::DCT_None;
 
   /// True if the integrated assembler should interpret 'a >> b' constant
   /// expressions as logical rather than arithmetic.
-  bool UseLogicalShr;
+  bool UseLogicalShr = true;
 
   // If true, emit GOTPCRELX/REX_GOTPCRELX instead of GOTPCREL, on
   // X86_64 ELF.
@@ -475,14 +480,17 @@ public:
   bool needsLocalForSize() const { return NeedsLocalForSize; }
   StringRef getPrivateGlobalPrefix() const { return PrivateGlobalPrefix; }
   StringRef getPrivateLabelPrefix() const { return PrivateLabelPrefix; }
+
   bool hasLinkerPrivateGlobalPrefix() const {
     return LinkerPrivateGlobalPrefix[0] != '\0';
   }
+
   StringRef getLinkerPrivateGlobalPrefix() const {
     if (hasLinkerPrivateGlobalPrefix())
       return LinkerPrivateGlobalPrefix;
     return getPrivateGlobalPrefix();
   }
+
   const char *getInlineAsmStart() const { return InlineAsmStart; }
   const char *getInlineAsmEnd() const { return InlineAsmEnd; }
   const char *getCode16Directive() const { return Code16Directive; }
@@ -491,25 +499,32 @@ public:
   unsigned getAssemblerDialect() const { return AssemblerDialect; }
   bool doesAllowAtInName() const { return AllowAtInName; }
   bool supportsNameQuoting() const { return SupportsQuotedNames; }
+
   bool doesSupportDataRegionDirectives() const {
     return UseDataRegionDirectives;
   }
+
   const char *getZeroDirective() const { return ZeroDirective; }
   const char *getAsciiDirective() const { return AsciiDirective; }
   const char *getAscizDirective() const { return AscizDirective; }
   bool getAlignmentIsInBytes() const { return AlignmentIsInBytes; }
   unsigned getTextAlignFillValue() const { return TextAlignFillValue; }
   const char *getGlobalDirective() const { return GlobalDirective; }
+
   bool doesSetDirectiveSuppressReloc() const {
     return SetDirectiveSuppressesReloc;
   }
+
   bool hasAggressiveSymbolFolding() const { return HasAggressiveSymbolFolding; }
+
   bool getCOMMDirectiveAlignmentIsInBytes() const {
     return COMMDirectiveAlignmentIsInBytes;
   }
+
   LCOMM::LCOMMType getLCOMMDirectiveAlignmentType() const {
     return LCOMMDirectiveAlignmentType;
   }
+
   bool hasFunctionAlignment() const { return HasFunctionAlignment; }
   bool hasDotTypeDotSizeDirective() const { return HasDotTypeDotSizeDirective; }
   bool hasSingleParameterDotFile() const { return HasSingleParameterDotFile; }
@@ -519,22 +534,29 @@ public:
   const char *getWeakDirective() const { return WeakDirective; }
   const char *getWeakRefDirective() const { return WeakRefDirective; }
   bool hasWeakDefDirective() const { return HasWeakDefDirective; }
+
   bool hasWeakDefCanBeHiddenDirective() const {
     return HasWeakDefCanBeHiddenDirective;
   }
+
   bool hasLinkOnceDirective() const { return HasLinkOnceDirective; }
 
   MCSymbolAttr getHiddenVisibilityAttr() const { return HiddenVisibilityAttr; }
+
   MCSymbolAttr getHiddenDeclarationVisibilityAttr() const {
     return HiddenDeclarationVisibilityAttr;
   }
+
   MCSymbolAttr getProtectedVisibilityAttr() const {
     return ProtectedVisibilityAttr;
   }
+
   bool doesSupportDebugInformation() const { return SupportsDebugInformation; }
+
   bool doesSupportExceptionHandling() const {
     return ExceptionsType != ExceptionHandling::None;
   }
+
   ExceptionHandling getExceptionHandlingType() const { return ExceptionsType; }
   WinEH::EncodingType getWinEHEncodingType() const { return WinEHEncodingType; }
 
@@ -558,6 +580,7 @@ public:
   bool doesDwarfUseRelocationsAcrossSections() const {
     return DwarfUsesRelocationsAcrossSections;
   }
+
   bool doDwarfFDESymbolsUseAbsDiff() const { return DwarfFDESymbolsUseAbsDiff; }
   bool useDwarfRegNumForCFI() const { return DwarfRegNumForCFI; }
   bool useParensForSymbolVariant() const { return UseParensForSymbolVariant; }
@@ -600,6 +623,7 @@ public:
   void setRelaxELFRelocations(bool V) { RelaxELFRelocations = V; }
   bool hasMipsExpressions() const { return HasMipsExpressions; }
 };
-}
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_MC_MCASMINFO_H
diff --git a/include/llvm/MC/MCAsmInfoCOFF.h b/include/llvm/MC/MCAsmInfoCOFF.h
index 56444f3c7cf576bf91f6b7636f78c064d55c83d3..01c8ae49a6fcd4163eb986f4a29e0d51e213db34 100644
--- a/include/llvm/MC/MCAsmInfoCOFF.h
+++ b/include/llvm/MC/MCAsmInfoCOFF.h
@@ -1,4 +1,4 @@
-//===-- MCAsmInfoCOFF.h - COFF asm properties -------------------*- C++ -*-===//
+//===- MCAsmInfoCOFF.h - COFF asm properties --------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -13,24 +13,28 @@
 #include "llvm/MC/MCAsmInfo.h"
 
 namespace llvm {
-  class MCAsmInfoCOFF : public MCAsmInfo {
-    virtual void anchor();
-  protected:
-    explicit MCAsmInfoCOFF();
-  };
-
-  class MCAsmInfoMicrosoft : public MCAsmInfoCOFF {
-    void anchor() override;
-  protected:
-    explicit MCAsmInfoMicrosoft();
-  };
-
-  class MCAsmInfoGNUCOFF : public MCAsmInfoCOFF {
-    void anchor() override;
-  protected:
-    explicit MCAsmInfoGNUCOFF();
-  };
-}
 
+class MCAsmInfoCOFF : public MCAsmInfo {
+  virtual void anchor();
+
+protected:
+  explicit MCAsmInfoCOFF();
+};
+
+class MCAsmInfoMicrosoft : public MCAsmInfoCOFF {
+  void anchor() override;
+
+protected:
+  explicit MCAsmInfoMicrosoft();
+};
+
+class MCAsmInfoGNUCOFF : public MCAsmInfoCOFF {
+  void anchor() override;
+
+protected:
+  explicit MCAsmInfoGNUCOFF();
+};
+
+} // end namespace llvm
 
 #endif // LLVM_MC_MCASMINFOCOFF_H
diff --git a/include/llvm/MC/MCAsmInfoDarwin.h b/include/llvm/MC/MCAsmInfoDarwin.h
index d587c3ce9d547af2f9da720e0ad44243d77a8b96..a533d604a89e1ab833f2dc2c9906259cb0d6b329 100644
--- a/include/llvm/MC/MCAsmInfoDarwin.h
+++ b/include/llvm/MC/MCAsmInfoDarwin.h
@@ -1,4 +1,4 @@
-//===---- MCAsmInfoDarwin.h - Darwin asm properties -------------*- C++ -*-===//
+//===- MCAsmInfoDarwin.h - Darwin asm properties ----------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -18,12 +18,14 @@
 #include "llvm/MC/MCAsmInfo.h"
 
 namespace llvm {
-  class MCAsmInfoDarwin : public MCAsmInfo {
-  public:
-    explicit MCAsmInfoDarwin();
-    bool isSectionAtomizableBySymbols(const MCSection &Section) const override;
-  };
-}
 
+class MCAsmInfoDarwin : public MCAsmInfo {
+public:
+  explicit MCAsmInfoDarwin();
+
+  bool isSectionAtomizableBySymbols(const MCSection &Section) const override;
+};
+
+} // end namespace llvm
 
 #endif // LLVM_MC_MCASMINFODARWIN_H
diff --git a/include/llvm/MC/MCAsmInfoELF.h b/include/llvm/MC/MCAsmInfoELF.h
index f8bb943aac4e95dff3f117e81e365535b98ce123..f113afc9885e7dc8fc61ab826fa7e294ccd20034 100644
--- a/include/llvm/MC/MCAsmInfoELF.h
+++ b/include/llvm/MC/MCAsmInfoELF.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCAsmInfoELF.h - ELF Asm info -------------------*- C++ -*-===//
+//===- llvm/MC/MCAsmInfoELF.h - ELF Asm info --------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -13,6 +13,7 @@
 #include "llvm/MC/MCAsmInfo.h"
 
 namespace llvm {
+
 class MCAsmInfoELF : public MCAsmInfo {
   virtual void anchor();
   MCSection *getNonexecutableStackSection(MCContext &Ctx) const final;
@@ -20,10 +21,11 @@ class MCAsmInfoELF : public MCAsmInfo {
 protected:
   /// Targets which have non-executable stacks by default can set this to false
   /// to disable the special section which requests a non-executable stack.
-  bool UsesNonexecutableStackSection;
+  bool UsesNonexecutableStackSection = true;
 
   MCAsmInfoELF();
 };
-}
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_MC_MCASMINFOELF_H
diff --git a/include/llvm/MC/MCAsmInfoWasm.h b/include/llvm/MC/MCAsmInfoWasm.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc46cfdf4c4cc9ba7ef520ab85ffcb323fd6db77
--- /dev/null
+++ b/include/llvm/MC/MCAsmInfoWasm.h
@@ -0,0 +1,24 @@
+//===-- llvm/MC/MCAsmInfoWasm.h - Wasm Asm info -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MC_MCASMINFOWASM_H
+#define LLVM_MC_MCASMINFOWASM_H
+
+#include "llvm/MC/MCAsmInfo.h"
+
+namespace llvm {
+class MCAsmInfoWasm : public MCAsmInfo {
+  virtual void anchor();
+
+protected:
+  MCAsmInfoWasm();
+};
+}
+
+#endif
diff --git a/include/llvm/MC/MCAssembler.h b/include/llvm/MC/MCAssembler.h
index 641e78994768ba3f746fb196dd70c85b0e17a86a..c29abaa03a6de7ff9b6fbe2275293864ae216250 100644
--- a/include/llvm/MC/MCAssembler.h
+++ b/include/llvm/MC/MCAssembler.h
@@ -10,33 +10,35 @@
 #ifndef LLVM_MC_MCASSEMBLER_H
 #define LLVM_MC_MCASSEMBLER_H
 
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/ilist.h"
-#include "llvm/ADT/ilist_node.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/iterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCFixup.h"
 #include "llvm/MC/MCFragment.h"
-#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCLinkerOptimizationHint.h"
-#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
 
 namespace llvm {
-class raw_ostream;
+
+class MCAsmBackend;
 class MCAsmLayout;
-class MCAssembler;
 class MCContext;
 class MCCodeEmitter;
-class MCExpr;
 class MCFragment;
 class MCObjectWriter;
 class MCSection;
-class MCSubtargetInfo;
 class MCValue;
-class MCAsmBackend;
 
 // FIXME: This really doesn't belong here. See comments below.
 struct IndirectSymbolData {
@@ -90,9 +92,6 @@ public:
   } VersionMinInfoType;
 
 private:
-  MCAssembler(const MCAssembler &) = delete;
-  void operator=(const MCAssembler &) = delete;
-
   MCContext &Context;
 
   MCAsmBackend &Backend;
@@ -131,9 +130,9 @@ private:
   /// By default it's 0, which means bundling is disabled.
   unsigned BundleAlignSize;
 
-  unsigned RelaxAll : 1;
-  unsigned SubsectionsViaSymbols : 1;
-  unsigned IncrementalLinkerCompatible : 1;
+  bool RelaxAll : 1;
+  bool SubsectionsViaSymbols : 1;
+  bool IncrementalLinkerCompatible : 1;
 
   /// ELF specific e_header flags
   // It would be good if there were an MCELFAssembler class to hold this.
@@ -148,7 +147,6 @@ private:
 
   VersionMinInfoType VersionMinInfo;
 
-private:
   /// Evaluate a fixup to a relocatable expression and the value which should be
   /// placed into the fixup.
   ///
@@ -201,6 +199,18 @@ private:
                                         MCFragment &F, const MCFixup &Fixup);
 
 public:
+  /// Construct a new assembler instance.
+  //
+  // FIXME: How are we going to parameterize this? Two obvious options are stay
+  // concrete and require clients to pass in a target like object. The other
+  // option is to make this abstract, and have targets provide concrete
+  // implementations as we do with AsmParser.
+  MCAssembler(MCContext &Context, MCAsmBackend &Backend,
+              MCCodeEmitter &Emitter, MCObjectWriter &Writer);
+  MCAssembler(const MCAssembler &) = delete;
+  MCAssembler &operator=(const MCAssembler &) = delete;
+  ~MCAssembler();
+
   /// Compute the effective fragment size assuming it is laid out at the given
   /// \p SectionAddress and \p FragmentOffset.
   uint64_t computeFragmentSize(const MCAsmLayout &Layout,
@@ -240,17 +250,6 @@ public:
     VersionMinInfo.Update = Update;
   }
 
-public:
-  /// Construct a new assembler instance.
-  //
-  // FIXME: How are we going to parameterize this? Two obvious options are stay
-  // concrete and require clients to pass in a target like object. The other
-  // option is to make this abstract, and have targets provide concrete
-  // implementations as we do with AsmParser.
-  MCAssembler(MCContext &Context, MCAsmBackend &Backend,
-              MCCodeEmitter &Emitter, MCObjectWriter &Writer);
-  ~MCAssembler();
-
   /// Reuse an assembler instance
   ///
   void reset();
@@ -425,4 +424,4 @@ uint64_t computeBundlePadding(const MCAssembler &Assembler, const MCFragment *F,
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCASSEMBLER_H
diff --git a/include/llvm/MC/MCCodeEmitter.h b/include/llvm/MC/MCCodeEmitter.h
index b6c19150c12a574a943aa508a563b318686120b4..f1b0b784a2df6ee5a6d5ef2839f2b86a523b0e5b 100644
--- a/include/llvm/MC/MCCodeEmitter.h
+++ b/include/llvm/MC/MCCodeEmitter.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCCodeEmitter.h - Instruction Encoding ----------*- C++ -*-===//
+//===- llvm/MC/MCCodeEmitter.h - Instruction Encoding -----------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,9 +10,8 @@
 #ifndef LLVM_MC_MCCODEEMITTER_H
 #define LLVM_MC_MCCODEEMITTER_H
 
-#include "llvm/Support/Compiler.h"
-
 namespace llvm {
+
 class MCFixup;
 class MCInst;
 class MCSubtargetInfo;
@@ -21,14 +20,12 @@ template<typename T> class SmallVectorImpl;
 
 /// MCCodeEmitter - Generic instruction encoding interface.
 class MCCodeEmitter {
-private:
-  MCCodeEmitter(const MCCodeEmitter &) = delete;
-  void operator=(const MCCodeEmitter &) = delete;
-
 protected: // Can only create subclasses.
   MCCodeEmitter();
 
 public:
+  MCCodeEmitter(const MCCodeEmitter &) = delete;
+  MCCodeEmitter &operator=(const MCCodeEmitter &) = delete;
   virtual ~MCCodeEmitter();
 
   /// Lifetime management
@@ -41,6 +38,6 @@ public:
                                  const MCSubtargetInfo &STI) const = 0;
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCCODEEMITTER_H
diff --git a/include/llvm/MC/MCContext.h b/include/llvm/MC/MCContext.h
index f846b632f112d1da95cfacded043cd22694159f8..b3106936e27f3656afecff4d68b377434344735f 100644
--- a/include/llvm/MC/MCContext.h
+++ b/include/llvm/MC/MCContext.h
@@ -15,6 +15,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCSubtargetInfo.h"
@@ -23,35 +24,37 @@
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
 #include <map>
-#include <tuple>
-#include <vector> // FIXME: Shouldn't be needed.
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
 
 namespace llvm {
+
+  class CodeViewContext;
   class MCAsmInfo;
-  class MCExpr;
-  class MCSection;
-  class MCSymbol;
-  class MCSymbolELF;
   class MCLabel;
-  struct MCDwarfFile;
-  class MCDwarfLoc;
   class MCObjectFileInfo;
   class MCRegisterInfo;
-  class MCLineSection;
-  class SMLoc;
-  class MCSectionMachO;
-  class MCSectionELF;
+  class MCSection;
   class MCSectionCOFF;
-  class CodeViewContext;
+  class MCSectionELF;
+  class MCSectionMachO;
+  class MCSectionWasm;
+  class MCSymbol;
+  class MCSymbolELF;
+  class MCSymbolWasm;
+  class SMLoc;
 
   /// Context object for machine code objects.  This class owns all of the
   /// sections that it creates.
   ///
   class MCContext {
-    MCContext(const MCContext &) = delete;
-    MCContext &operator=(const MCContext &) = delete;
-
   public:
     typedef StringMap<MCSymbol *, BumpPtrAllocator &> SymbolTable;
 
@@ -59,6 +62,9 @@ namespace llvm {
     /// The SourceMgr for this object, if any.
     const SourceMgr *SrcMgr;
 
+    /// The SourceMgr for inline assembly, if any.
+    SourceMgr *InlineSrcMgr;
+
     /// The MCAsmInfo for this target.
     const MCAsmInfo *MAI;
 
@@ -79,14 +85,11 @@ namespace llvm {
     SpecificBumpPtrAllocator<MCSectionCOFF> COFFAllocator;
     SpecificBumpPtrAllocator<MCSectionELF> ELFAllocator;
     SpecificBumpPtrAllocator<MCSectionMachO> MachOAllocator;
+    SpecificBumpPtrAllocator<MCSectionWasm> WasmAllocator;
 
     /// Bindings of names to symbols.
     SymbolTable Symbols;
 
-    /// Sections can have a corresponding symbol. This maps one to the
-    /// other.
-    DenseMap<const MCSection *, MCSymbol *> SectionSymbols;
-
     /// A mapping from a local label number and an instance count to a symbol.
     /// For example, in the assembly
     ///     1:
@@ -123,7 +126,7 @@ namespace llvm {
     /// Boolean toggled when .secure_log_unique / .secure_log_reset is seen to
     /// catch errors if .secure_log_unique appears twice without
     /// .secure_log_reset appearing between them.
-    bool SecureLogUsed;
+    bool SecureLogUsed = false;
 
     /// The compilation directory to use for DW_AT_comp_dir.
     SmallString<128> CompilationDir;
@@ -139,14 +142,14 @@ namespace llvm {
 
     /// The current dwarf line information from the last dwarf .loc directive.
     MCDwarfLoc CurrentDwarfLoc;
-    bool DwarfLocSeen;
+    bool DwarfLocSeen = false;
 
     /// Generate dwarf debugging info for assembly source files.
-    bool GenDwarfForAssembly;
+    bool GenDwarfForAssembly = false;
 
     /// The current dwarf file number when generate dwarf debugging info for
     /// assembly source files.
-    unsigned GenDwarfFileNumber;
+    unsigned GenDwarfFileNumber = 0;
 
     /// Sections for generating the .debug_ranges and .debug_aranges sections.
     SetVector<MCSection *> SectionsForRanges;
@@ -164,25 +167,27 @@ namespace llvm {
     StringRef DwarfDebugProducer;
 
     /// The maximum version of dwarf that we should emit.
-    uint16_t DwarfVersion;
+    uint16_t DwarfVersion = 4;
 
     /// Honor temporary labels, this is useful for debugging semantic
     /// differences between temporary and non-temporary labels (primarily on
     /// Darwin).
-    bool AllowTemporaryLabels;
+    bool AllowTemporaryLabels = true;
     bool UseNamesOnTempLabels = true;
 
     /// The Compile Unit ID that we are currently processing.
-    unsigned DwarfCompileUnitID;
+    unsigned DwarfCompileUnitID = 0;
 
     struct ELFSectionKey {
       std::string SectionName;
       StringRef GroupName;
       unsigned UniqueID;
+
       ELFSectionKey(StringRef SectionName, StringRef GroupName,
                     unsigned UniqueID)
           : SectionName(SectionName), GroupName(GroupName), UniqueID(UniqueID) {
       }
+
       bool operator<(const ELFSectionKey &Other) const {
         if (SectionName != Other.SectionName)
           return SectionName < Other.SectionName;
@@ -197,10 +202,12 @@ namespace llvm {
       StringRef GroupName;
       int SelectionKey;
       unsigned UniqueID;
+
       COFFSectionKey(StringRef SectionName, StringRef GroupName,
                      int SelectionKey, unsigned UniqueID)
           : SectionName(SectionName), GroupName(GroupName),
             SelectionKey(SelectionKey), UniqueID(UniqueID) {}
+
       bool operator<(const COFFSectionKey &Other) const {
         if (SectionName != Other.SectionName)
           return SectionName < Other.SectionName;
@@ -212,17 +219,35 @@ namespace llvm {
       }
     };
 
+    struct WasmSectionKey {
+      std::string SectionName;
+      StringRef GroupName;
+      unsigned UniqueID;
+      WasmSectionKey(StringRef SectionName, StringRef GroupName,
+                     unsigned UniqueID)
+          : SectionName(SectionName), GroupName(GroupName), UniqueID(UniqueID) {
+      }
+      bool operator<(const WasmSectionKey &Other) const {
+        if (SectionName != Other.SectionName)
+          return SectionName < Other.SectionName;
+        if (GroupName != Other.GroupName)
+          return GroupName < Other.GroupName;
+        return UniqueID < Other.UniqueID;
+      }
+    };
+
     StringMap<MCSectionMachO *> MachOUniquingMap;
     std::map<ELFSectionKey, MCSectionELF *> ELFUniquingMap;
     std::map<COFFSectionKey, MCSectionCOFF *> COFFUniquingMap;
-    StringMap<bool> ELFRelSecNames;
+    std::map<WasmSectionKey, MCSectionWasm *> WasmUniquingMap;
+    StringMap<bool> RelSecNames;
 
     SpecificBumpPtrAllocator<MCSubtargetInfo> MCSubtargetAllocator;
 
     /// Do automatic reset in destructor
     bool AutoReset;
 
-    bool HadError;
+    bool HadError = false;
 
     MCSymbol *createSymbolImpl(const StringMapEntry<bool> *Name,
                                bool CanBeUnnamed);
@@ -232,14 +257,25 @@ namespace llvm {
     MCSymbol *getOrCreateDirectionalLocalSymbol(unsigned LocalLabelVal,
                                                 unsigned Instance);
 
+    MCSectionELF *createELFSectionImpl(StringRef Section, unsigned Type,
+                                       unsigned Flags, SectionKind K,
+                                       unsigned EntrySize,
+                                       const MCSymbolELF *Group,
+                                       unsigned UniqueID,
+                                       const MCSymbolELF *Associated);
+
   public:
     explicit MCContext(const MCAsmInfo *MAI, const MCRegisterInfo *MRI,
                        const MCObjectFileInfo *MOFI,
                        const SourceMgr *Mgr = nullptr, bool DoAutoReset = true);
+    MCContext(const MCContext &) = delete;
+    MCContext &operator=(const MCContext &) = delete;
     ~MCContext();
 
     const SourceMgr *getSourceManager() const { return SrcMgr; }
 
+    void setInlineSourceManager(SourceMgr *SM) { InlineSrcMgr = SM; }
+
     const MCAsmInfo *getAsmInfo() const { return MAI; }
 
     const MCRegisterInfo *getRegisterInfo() const { return MRI; }
@@ -288,8 +324,6 @@ namespace llvm {
     /// \param Name - The symbol name, which must be unique across all symbols.
     MCSymbol *getOrCreateSymbol(const Twine &Name);
 
-    MCSymbolELF *getOrCreateSectionSymbol(const MCSectionELF &Section);
-
     /// Gets a symbol that will be defined to the final stack offset of a local
     /// variable after codegen.
     ///
@@ -340,25 +374,13 @@ namespace llvm {
 
     MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
                                 unsigned Flags) {
-      return getELFSection(Section, Type, Flags, nullptr);
-    }
-
-    MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
-                                unsigned Flags, const char *BeginSymName) {
-      return getELFSection(Section, Type, Flags, 0, "", BeginSymName);
+      return getELFSection(Section, Type, Flags, 0, "");
     }
 
     MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
                                 unsigned Flags, unsigned EntrySize,
                                 const Twine &Group) {
-      return getELFSection(Section, Type, Flags, EntrySize, Group, nullptr);
-    }
-
-    MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
-                                unsigned Flags, unsigned EntrySize,
-                                const Twine &Group, const char *BeginSymName) {
-      return getELFSection(Section, Type, Flags, EntrySize, Group, ~0,
-                           BeginSymName);
+      return getELFSection(Section, Type, Flags, EntrySize, Group, ~0);
     }
 
     MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
@@ -371,13 +393,12 @@ namespace llvm {
     MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
                                 unsigned Flags, unsigned EntrySize,
                                 const Twine &Group, unsigned UniqueID,
-                                const char *BeginSymName);
+                                const MCSymbolELF *Associated);
 
     MCSectionELF *getELFSection(const Twine &Section, unsigned Type,
                                 unsigned Flags, unsigned EntrySize,
                                 const MCSymbolELF *Group, unsigned UniqueID,
-                                const char *BeginSymName,
-                                const MCSectionELF *Associated);
+                                const MCSymbolELF *Associated);
 
     /// Get a section with the provided group identifier. This section is
     /// named by concatenating \p Prefix with '.' then \p Suffix. The \p Type
@@ -390,7 +411,7 @@ namespace llvm {
     MCSectionELF *createELFRelSection(const Twine &Name, unsigned Type,
                                       unsigned Flags, unsigned EntrySize,
                                       const MCSymbolELF *Group,
-                                      const MCSectionELF *Associated);
+                                      const MCSectionELF *RelInfoSection);
 
     void renameELFSection(MCSectionELF *Section, StringRef Name);
 
@@ -416,6 +437,54 @@ namespace llvm {
     getAssociativeCOFFSection(MCSectionCOFF *Sec, const MCSymbol *KeySym,
                               unsigned UniqueID = GenericSectionID);
 
+    MCSectionWasm *getWasmSection(const Twine &Section, unsigned Type,
+                                  unsigned Flags) {
+      return getWasmSection(Section, Type, Flags, nullptr);
+    }
+
+    MCSectionWasm *getWasmSection(const Twine &Section, unsigned Type,
+                                  unsigned Flags, const char *BeginSymName) {
+      return getWasmSection(Section, Type, Flags, "", BeginSymName);
+    }
+
+    MCSectionWasm *getWasmSection(const Twine &Section, unsigned Type,
+                                  unsigned Flags, const Twine &Group) {
+      return getWasmSection(Section, Type, Flags, Group, nullptr);
+    }
+
+    MCSectionWasm *getWasmSection(const Twine &Section, unsigned Type,
+                                  unsigned Flags, const Twine &Group,
+                                  const char *BeginSymName) {
+      return getWasmSection(Section, Type, Flags, Group, ~0, BeginSymName);
+    }
+
+    MCSectionWasm *getWasmSection(const Twine &Section, unsigned Type,
+                                  unsigned Flags, const Twine &Group,
+                                  unsigned UniqueID) {
+      return getWasmSection(Section, Type, Flags, Group, UniqueID, nullptr);
+    }
+
+    MCSectionWasm *getWasmSection(const Twine &Section, unsigned Type,
+                                  unsigned Flags, const Twine &Group,
+                                  unsigned UniqueID, const char *BeginSymName);
+
+    MCSectionWasm *getWasmSection(const Twine &Section, unsigned Type,
+                                  unsigned Flags, const MCSymbolWasm *Group,
+                                  unsigned UniqueID, const char *BeginSymName);
+
+    /// Get a section with the provided group identifier. This section is
+    /// named by concatenating \p Prefix with '.' then \p Suffix. The \p Type
+    /// describes the type of the section and \p Flags are used to further
+    /// configure this named section.
+    MCSectionWasm *getWasmNamedSection(const Twine &Prefix, const Twine &Suffix,
+                                       unsigned Type, unsigned Flags);
+
+    MCSectionWasm *createWasmRelSection(const Twine &Name, unsigned Type,
+                                        unsigned Flags,
+                                        const MCSymbolWasm *Group);
+
+    void renameWasmSection(MCSectionWasm *Section, StringRef Name);
+
     // Create and save a copy of STI and return a reference to the copy.
     MCSubtargetInfo &getSubtargetCopy(const MCSubtargetInfo &STI);
 
@@ -463,6 +532,7 @@ namespace llvm {
     const SmallVectorImpl<MCDwarfFile> &getMCDwarfFiles(unsigned CUID = 0) {
       return getMCDwarfLineTable(CUID).getMCDwarfFiles();
     }
+
     const SmallVectorImpl<std::string> &getMCDwarfDirs(unsigned CUID = 0) {
       return getMCDwarfLineTable(CUID).getMCDwarfDirs();
     }
@@ -473,10 +543,13 @@ namespace llvm {
           return true;
       return false;
     }
+
     unsigned getDwarfCompileUnitID() { return DwarfCompileUnitID; }
+
     void setDwarfCompileUnitID(unsigned CUIndex) {
       DwarfCompileUnitID = CUIndex;
     }
+
     void setMCLineTableCompilationDir(unsigned CUID, StringRef CompilationDir) {
       getMCDwarfLineTable(CUID).setCompilationDir(CompilationDir);
     }
@@ -496,6 +569,7 @@ namespace llvm {
       CurrentDwarfLoc.setDiscriminator(Discriminator);
       DwarfLocSeen = true;
     }
+
     void clearDwarfLocSeen() { DwarfLocSeen = false; }
 
     bool getDwarfLocSeen() { return DwarfLocSeen; }
@@ -504,20 +578,25 @@ namespace llvm {
     bool getGenDwarfForAssembly() { return GenDwarfForAssembly; }
     void setGenDwarfForAssembly(bool Value) { GenDwarfForAssembly = Value; }
     unsigned getGenDwarfFileNumber() { return GenDwarfFileNumber; }
+
     void setGenDwarfFileNumber(unsigned FileNumber) {
       GenDwarfFileNumber = FileNumber;
     }
+
     const SetVector<MCSection *> &getGenDwarfSectionSyms() {
       return SectionsForRanges;
     }
+
     bool addGenDwarfSection(MCSection *Sec) {
       return SectionsForRanges.insert(Sec);
     }
 
     void finalizeDwarfSections(MCStreamer &MCOS);
+
     const std::vector<MCGenDwarfLabelEntry> &getMCGenDwarfLabelEntries() const {
       return MCGenDwarfLabelEntries;
     }
+
     void addMCGenDwarfLabelEntry(const MCGenDwarfLabelEntry &E) {
       MCGenDwarfLabelEntries.push_back(E);
     }
@@ -527,10 +606,12 @@ namespace llvm {
 
     void setDwarfDebugProducer(StringRef S) { DwarfDebugProducer = S; }
     StringRef getDwarfDebugProducer() { return DwarfDebugProducer; }
+
     dwarf::DwarfFormat getDwarfFormat() const {
       // TODO: Support DWARF64
       return dwarf::DWARF32;
     }
+
     void setDwarfVersion(uint16_t v) { DwarfVersion = v; }
     uint16_t getDwarfVersion() const { return DwarfVersion; }
 
@@ -538,15 +619,18 @@ namespace llvm {
 
     char *getSecureLogFile() { return SecureLogFile; }
     raw_fd_ostream *getSecureLog() { return SecureLog.get(); }
-    bool getSecureLogUsed() { return SecureLogUsed; }
+
     void setSecureLog(std::unique_ptr<raw_fd_ostream> Value) {
       SecureLog = std::move(Value);
     }
+
+    bool getSecureLogUsed() { return SecureLogUsed; }
     void setSecureLogUsed(bool Value) { SecureLogUsed = Value; }
 
     void *allocate(unsigned Size, unsigned Align = 8) {
       return Allocator.Allocate(Size, Align);
     }
+
     void deallocate(void *Ptr) {}
 
     bool hadError() { return HadError; }
@@ -632,4 +716,4 @@ inline void operator delete[](void *Ptr, llvm::MCContext &C) noexcept {
   C.deallocate(Ptr);
 }
 
-#endif
+#endif // LLVM_MC_MCCONTEXT_H
diff --git a/include/llvm/MC/MCDisassembler/MCDisassembler.h b/include/llvm/MC/MCDisassembler/MCDisassembler.h
index 9006d87abb43ddeefd404f94e3b19af7623621e6..5e626f1869861b41575fa748766347f7c0c777a2 100644
--- a/include/llvm/MC/MCDisassembler/MCDisassembler.h
+++ b/include/llvm/MC/MCDisassembler/MCDisassembler.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCDisassembler.h - Disassembler interface -------*- C++ -*-===//
+//===- llvm/MC/MCDisassembler.h - Disassembler interface --------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,20 +6,21 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
+
 #ifndef LLVM_MC_MCDISASSEMBLER_MCDISASSEMBLER_H
 #define LLVM_MC_MCDISASSEMBLER_MCDISASSEMBLER_H
 
-#include "llvm-c/Disassembler.h"
 #include "llvm/MC/MCDisassembler/MCSymbolizer.h"
-#include "llvm/Support/DataTypes.h"
+#include <cstdint>
+#include <memory>
 
 namespace llvm {
 
 template <typename T> class ArrayRef;
+class MCContext;
 class MCInst;
 class MCSubtargetInfo;
 class raw_ostream;
-class MCContext;
 
 /// Superclass for all disassemblers. Consumes a memory region and provides an
 /// array of assembly instructions.
@@ -54,7 +55,7 @@ public:
   };
 
   MCDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
-    : Ctx(Ctx), STI(STI), Symbolizer(), CommentStream(nullptr) {}
+    : Ctx(Ctx), STI(STI) {}
 
   virtual ~MCDisassembler();
 
@@ -105,9 +106,9 @@ public:
 
   // Marked mutable because we cache it inside the disassembler, rather than
   // having to pass it around as an argument through all the autogenerated code.
-  mutable raw_ostream *CommentStream;
+  mutable raw_ostream *CommentStream = nullptr;
 };
 
-} // namespace llvm
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCDISASSEMBLER_MCDISASSEMBLER_H
diff --git a/include/llvm/MC/MCDisassembler/MCRelocationInfo.h b/include/llvm/MC/MCDisassembler/MCRelocationInfo.h
index 25334f755ee6b31649673c02bc34d4a11ba12b70..7836e886c303c883503bc4fdf6c264a53316f8ef 100644
--- a/include/llvm/MC/MCDisassembler/MCRelocationInfo.h
+++ b/include/llvm/MC/MCDisassembler/MCRelocationInfo.h
@@ -1,4 +1,4 @@
-//==-- llvm/MC/MCRelocationInfo.h --------------------------------*- C++ -*-==//
+//===- llvm/MC/MCRelocationInfo.h -------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -16,26 +16,20 @@
 #ifndef LLVM_MC_MCDISASSEMBLER_MCRELOCATIONINFO_H
 #define LLVM_MC_MCDISASSEMBLER_MCRELOCATIONINFO_H
 
-#include "llvm/Support/Compiler.h"
-
 namespace llvm {
 
-namespace object {
-class RelocationRef;
-}
-class MCExpr;
 class MCContext;
+class MCExpr;
 
 /// \brief Create MCExprs from relocations found in an object file.
 class MCRelocationInfo {
-  MCRelocationInfo(const MCRelocationInfo &) = delete;
-  void operator=(const MCRelocationInfo &) = delete;
-
 protected:
   MCContext &Ctx;
 
 public:
   MCRelocationInfo(MCContext &Ctx);
+  MCRelocationInfo(const MCRelocationInfo &) = delete;
+  MCRelocationInfo &operator=(const MCRelocationInfo &) = delete;
   virtual ~MCRelocationInfo();
 
   /// \brief Create an MCExpr for the target-specific \p VariantKind.
@@ -46,6 +40,6 @@ public:
                                                      unsigned VariantKind);
 };
 
-}
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCDISASSEMBLER_MCRELOCATIONINFO_H
diff --git a/include/llvm/MC/MCDisassembler/MCSymbolizer.h b/include/llvm/MC/MCDisassembler/MCSymbolizer.h
index 713467c0a3e706c7922d6d2343ed5c2ec5c3a826..d85cf5e066f5ad09276003423e73c679fc3d6d14 100644
--- a/include/llvm/MC/MCDisassembler/MCSymbolizer.h
+++ b/include/llvm/MC/MCDisassembler/MCSymbolizer.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCSymbolizer.h - MCSymbolizer class -------------*- C++ -*-===//
+//===- llvm/MC/MCSymbolizer.h - MCSymbolizer class --------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -17,9 +17,8 @@
 #define LLVM_MC_MCDISASSEMBLER_MCSYMBOLIZER_H
 
 #include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/DataTypes.h"
-#include <cassert>
+#include <algorithm>
+#include <cstdint>
 #include <memory>
 
 namespace llvm {
@@ -38,9 +37,6 @@ class raw_ostream;
 /// operands are actually symbolizable, and in what way. I don't think this
 /// information exists right now.
 class MCSymbolizer {
-  MCSymbolizer(const MCSymbolizer &) = delete;
-  void operator=(const MCSymbolizer &) = delete;
-
 protected:
   MCContext &Ctx;
   std::unique_ptr<MCRelocationInfo> RelInfo;
@@ -51,6 +47,8 @@ public:
     : Ctx(Ctx), RelInfo(std::move(RelInfo)) {
   }
 
+  MCSymbolizer(const MCSymbolizer &) = delete;
+  MCSymbolizer &operator=(const MCSymbolizer &) = delete;
   virtual ~MCSymbolizer();
 
   /// \brief Try to add a symbolic operand instead of \p Value to the MCInst.
@@ -80,6 +78,6 @@ public:
                                                uint64_t Address) = 0;
 };
 
-}
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCDISASSEMBLER_MCSYMBOLIZER_H
diff --git a/include/llvm/MC/MCDwarf.h b/include/llvm/MC/MCDwarf.h
index 0c555d377d8b091bd8d5da9919f11fba5c3856b6..0d69c2005cb43218655b10f64c9f6585037764dc 100644
--- a/include/llvm/MC/MCDwarf.h
+++ b/include/llvm/MC/MCDwarf.h
@@ -16,24 +16,27 @@
 #define LLVM_MC_MCDWARF_H
 
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCSection.h"
-#include "llvm/Support/Dwarf.h"
+#include <cassert>
+#include <cstdint>
 #include <string>
 #include <utility>
 #include <vector>
 
 namespace llvm {
+
 template <typename T> class ArrayRef;
-class raw_ostream;
 class MCAsmBackend;
 class MCContext;
 class MCObjectStreamer;
 class MCStreamer;
 class MCSymbol;
-class SourceMgr;
+class raw_ostream;
 class SMLoc;
+class SourceMgr;
 
 /// \brief Instances of this class represent the name of the dwarf
 /// .file directive and its associated dwarf file number in the MC file,
@@ -71,6 +74,7 @@ class MCDwarfLoc {
 private: // MCContext manages these
   friend class MCContext;
   friend class MCDwarfLineEntry;
+
   MCDwarfLoc(unsigned fileNum, unsigned line, unsigned column, unsigned flags,
              unsigned isa, unsigned discriminator)
       : FileNum(fileNum), Line(line), Column(column), Flags(flags), Isa(isa),
@@ -194,13 +198,14 @@ struct MCDwarfLineTableParams {
 };
 
 struct MCDwarfLineTableHeader {
-  MCSymbol *Label;
+  MCSymbol *Label = nullptr;
   SmallVector<std::string, 3> MCDwarfDirs;
   SmallVector<MCDwarfFile, 3> MCDwarfFiles;
   StringMap<unsigned> SourceIdMap;
   StringRef CompilationDir;
 
-  MCDwarfLineTableHeader() : Label(nullptr) {}
+  MCDwarfLineTableHeader() = default;
+
   unsigned getFile(StringRef &Directory, StringRef &FileName,
                    unsigned FileNumber = 0);
   std::pair<MCSymbol *, MCSymbol *> Emit(MCStreamer *MCOS,
@@ -212,13 +217,16 @@ struct MCDwarfLineTableHeader {
 
 class MCDwarfDwoLineTable {
   MCDwarfLineTableHeader Header;
+
 public:
   void setCompilationDir(StringRef CompilationDir) {
     Header.CompilationDir = CompilationDir;
   }
+
   unsigned getFile(StringRef Directory, StringRef FileName) {
     return Header.getFile(Directory, FileName);
   }
+
   void Emit(MCStreamer &MCOS, MCDwarfLineTableParams Params) const;
 };
 
@@ -488,22 +496,19 @@ public:
 };
 
 struct MCDwarfFrameInfo {
-  MCDwarfFrameInfo()
-      : Begin(nullptr), End(nullptr), Personality(nullptr), Lsda(nullptr),
-        Instructions(), CurrentCfaRegister(0), PersonalityEncoding(),
-        LsdaEncoding(0), CompactUnwindEncoding(0), IsSignalFrame(false),
-        IsSimple(false) {}
-  MCSymbol *Begin;
-  MCSymbol *End;
-  const MCSymbol *Personality;
-  const MCSymbol *Lsda;
+  MCDwarfFrameInfo() = default;
+
+  MCSymbol *Begin = nullptr;
+  MCSymbol *End = nullptr;
+  const MCSymbol *Personality = nullptr;
+  const MCSymbol *Lsda = nullptr;
   std::vector<MCCFIInstruction> Instructions;
-  unsigned CurrentCfaRegister;
-  unsigned PersonalityEncoding;
-  unsigned LsdaEncoding;
-  uint32_t CompactUnwindEncoding;
-  bool IsSignalFrame;
-  bool IsSimple;
+  unsigned CurrentCfaRegister = 0;
+  unsigned PersonalityEncoding = 0;
+  unsigned LsdaEncoding = 0;
+  uint32_t CompactUnwindEncoding = 0;
+  bool IsSignalFrame = false;
+  bool IsSimple = false;
 };
 
 class MCDwarfFrameEmitter {
@@ -516,6 +521,7 @@ public:
   static void EncodeAdvanceLoc(MCContext &Context, uint64_t AddrDelta,
                                raw_ostream &OS);
 };
+
 } // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCDWARF_H
diff --git a/include/llvm/MC/MCELFObjectWriter.h b/include/llvm/MC/MCELFObjectWriter.h
index 376e21821316d286de87eef2a77d6d4a1c0f8027..f22fc11f9b0734ad39f58c2c76d82ea7beae0c67 100644
--- a/include/llvm/MC/MCELFObjectWriter.h
+++ b/include/llvm/MC/MCELFObjectWriter.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCELFObjectWriter.h - ELF Object Writer ---------*- C++ -*-===//
+//===- llvm/MC/MCELFObjectWriter.h - ELF Object Writer ----------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,22 +11,21 @@
 #define LLVM_MC_MCELFOBJECTWRITER_H
 
 #include "llvm/ADT/Triple.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cstdint>
 #include <vector>
 
 namespace llvm {
+
 class MCAssembler;
 class MCContext;
 class MCFixup;
-class MCFragment;
 class MCObjectWriter;
 class MCSymbol;
 class MCSymbolELF;
 class MCValue;
-class raw_pwrite_stream;
 
 struct ELFRelocationEntry {
   uint64_t Offset; // Where is the relocation.
@@ -47,6 +46,7 @@ struct ELFRelocationEntry {
         << ", Addend=" << Addend << ", OriginalSymbol=" << OriginalSymbol
         << ", OriginalAddend=" << OriginalAddend;
   }
+
   void dump() const { print(errs()); }
 };
 
@@ -58,12 +58,12 @@ class MCELFObjectTargetWriter {
   const unsigned IsN64 : 1;
 
 protected:
-
-  MCELFObjectTargetWriter(bool Is64Bit_, uint8_t OSABI_,
-                          uint16_t EMachine_,  bool HasRelocationAddend,
-                          bool IsN64=false);
+  MCELFObjectTargetWriter(bool Is64Bit_, uint8_t OSABI_, uint16_t EMachine_,
+                          bool HasRelocationAddend, bool IsN64 = false);
 
 public:
+  virtual ~MCELFObjectTargetWriter() = default;
+
   static uint8_t getOSABI(Triple::OSType OSType) {
     switch (OSType) {
       case Triple::CloudABI:
@@ -76,8 +76,6 @@ public:
     }
   }
 
-  virtual ~MCELFObjectTargetWriter() {}
-
   virtual unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
                                 const MCFixup &Fixup, bool IsPCRel) const = 0;
 
@@ -144,6 +142,7 @@ public:
 MCObjectWriter *createELFObjectWriter(MCELFObjectTargetWriter *MOTW,
                                       raw_pwrite_stream &OS,
                                       bool IsLittleEndian);
-} // End llvm namespace
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_MC_MCELFOBJECTWRITER_H
diff --git a/include/llvm/MC/MCELFStreamer.h b/include/llvm/MC/MCELFStreamer.h
index a5c2638443529613184a689dc51db01859b97172..90434f34da5f19633f59d591143ac02d04c2b6ee 100644
--- a/include/llvm/MC/MCELFStreamer.h
+++ b/include/llvm/MC/MCELFStreamer.h
@@ -10,27 +10,24 @@
 #ifndef LLVM_MC_MCELFSTREAMER_H
 #define LLVM_MC_MCELFSTREAMER_H
 
-#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCObjectStreamer.h"
-#include "llvm/MC/SectionKind.h"
-#include "llvm/Support/DataTypes.h"
 
 namespace llvm {
+
 class MCAsmBackend;
-class MCAssembler;
 class MCCodeEmitter;
 class MCExpr;
 class MCInst;
-class raw_ostream;
 
 class MCELFStreamer : public MCObjectStreamer {
 public:
   MCELFStreamer(MCContext &Context, MCAsmBackend &TAB, raw_pwrite_stream &OS,
                 MCCodeEmitter *Emitter)
-      : MCObjectStreamer(Context, TAB, OS, Emitter), SeenIdent(false) {}
+      : MCObjectStreamer(Context, TAB, OS, Emitter) {}
 
-  ~MCELFStreamer() override;
+  ~MCELFStreamer() override = default;
 
   /// state management
   void reset() override {
@@ -44,7 +41,8 @@ public:
 
   void InitSections(bool NoExecStack) override;
   void ChangeSection(MCSection *Section, const MCExpr *Subsection) override;
-  void EmitLabel(MCSymbol *Symbol) override;
+  void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
+  void EmitLabel(MCSymbol *Symbol, SMLoc Loc, MCFragment *F) override;
   void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
   void EmitThumbFunc(MCSymbol *Func) override;
   void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) override;
@@ -52,10 +50,6 @@ public:
   void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override;
   void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                         unsigned ByteAlignment) override;
-  void BeginCOFFSymbolDef(const MCSymbol *Symbol) override;
-  void EmitCOFFSymbolStorageClass(int StorageClass) override;
-  void EmitCOFFSymbolType(int Type) override;
-  void EndCOFFSymbolDef() override;
 
   void emitELFSize(MCSymbol *Symbol, const MCExpr *Value) override;
 
@@ -69,8 +63,6 @@ public:
   void EmitValueImpl(const MCExpr *Value, unsigned Size,
                      SMLoc Loc = SMLoc()) override;
 
-  void EmitFileDirective(StringRef Filename) override;
-
   void EmitIdent(StringRef IdentString) override;
 
   void EmitValueToAlignment(unsigned, int64_t, unsigned, unsigned) override;
@@ -91,11 +83,11 @@ private:
   /// \brief Merge the content of the fragment \p EF into the fragment \p DF.
   void mergeFragment(MCDataFragment *, MCDataFragment *);
 
-  bool SeenIdent;
+  bool SeenIdent = false;
 
   /// BundleGroups - The stack of fragments holding the bundle-locked
   /// instructions.
-  llvm::SmallVector<MCDataFragment *, 4> BundleGroups;
+  SmallVector<MCDataFragment *, 4> BundleGroups;
 };
 
 MCELFStreamer *createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
@@ -105,4 +97,4 @@ MCELFStreamer *createARMELFStreamer(MCContext &Context, MCAsmBackend &TAB,
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCELFSTREAMER_H
diff --git a/include/llvm/MC/MCExpr.h b/include/llvm/MC/MCExpr.h
index 50f4d7597dea3cfdad31f04ae9e8625c36f6dae3..c850abf42e2c6fad9d34c72dc2af7fcced08a7cb 100644
--- a/include/llvm/MC/MCExpr.h
+++ b/include/llvm/MC/MCExpr.h
@@ -11,11 +11,11 @@
 #define LLVM_MC_MCEXPR_H
 
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/DataTypes.h"
 #include "llvm/Support/SMLoc.h"
+#include <cstdint>
 
 namespace llvm {
+
 class MCAsmInfo;
 class MCAsmLayout;
 class MCAssembler;
@@ -46,9 +46,6 @@ private:
   ExprKind Kind;
   SMLoc Loc;
 
-  MCExpr(const MCExpr&) = delete;
-  void operator=(const MCExpr&) = delete;
-
   bool evaluateAsAbsolute(int64_t &Res, const MCAssembler *Asm,
                           const MCAsmLayout *Layout,
                           const SectionAddrMap *Addrs) const;
@@ -66,6 +63,9 @@ protected:
                                  const SectionAddrMap *Addrs, bool InSet) const;
 
 public:
+  MCExpr(const MCExpr &) = delete;
+  MCExpr &operator=(const MCExpr &) = delete;
+
   /// \name Accessors
   /// @{
 
@@ -194,6 +194,8 @@ public:
     VK_SIZE,      // symbol@SIZE
     VK_WEAKREF,   // The link between the symbols in .weakref foo, bar
 
+    VK_X86_ABS8,
+
     VK_ARM_NONE,
     VK_ARM_GOT_PREL,
     VK_ARM_TARGET1,
@@ -268,6 +270,7 @@ public:
     VK_Hexagon_IE_GOT,
 
     VK_WebAssembly_FUNCTION, // Function table index, rather than virtual addr
+    VK_WebAssembly_TYPEINDEX, // Type table index
 
     VK_AMDGPU_GOTPCREL32_LO, // symbol@gotpcrel32@lo
     VK_AMDGPU_GOTPCREL32_HI, // symbol@gotpcrel32@hi
@@ -348,26 +351,30 @@ private:
   Opcode Op;
   const MCExpr *Expr;
 
-  MCUnaryExpr(Opcode Op, const MCExpr *Expr)
-      : MCExpr(MCExpr::Unary, SMLoc()), Op(Op), Expr(Expr) {}
+  MCUnaryExpr(Opcode Op, const MCExpr *Expr, SMLoc Loc)
+      : MCExpr(MCExpr::Unary, Loc), Op(Op), Expr(Expr) {}
 
 public:
   /// \name Construction
   /// @{
 
   static const MCUnaryExpr *create(Opcode Op, const MCExpr *Expr,
-                                   MCContext &Ctx);
-  static const MCUnaryExpr *createLNot(const MCExpr *Expr, MCContext &Ctx) {
-    return create(LNot, Expr, Ctx);
+                                   MCContext &Ctx, SMLoc Loc = SMLoc());
+
+  static const MCUnaryExpr *createLNot(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc = SMLoc()) {
+    return create(LNot, Expr, Ctx, Loc);
   }
-  static const MCUnaryExpr *createMinus(const MCExpr *Expr, MCContext &Ctx) {
-    return create(Minus, Expr, Ctx);
+
+  static const MCUnaryExpr *createMinus(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc = SMLoc()) {
+    return create(Minus, Expr, Ctx, Loc);
   }
-  static const MCUnaryExpr *createNot(const MCExpr *Expr, MCContext &Ctx) {
-    return create(Not, Expr, Ctx);
+
+  static const MCUnaryExpr *createNot(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc = SMLoc()) {
+    return create(Not, Expr, Ctx, Loc);
   }
-  static const MCUnaryExpr *createPlus(const MCExpr *Expr, MCContext &Ctx) {
-    return create(Plus, Expr, Ctx);
+
+  static const MCUnaryExpr *createPlus(const MCExpr *Expr, MCContext &Ctx, SMLoc Loc = SMLoc()) {
+    return create(Plus, Expr, Ctx, Loc);
   }
 
   /// @}
@@ -431,78 +438,97 @@ public:
   static const MCBinaryExpr *create(Opcode Op, const MCExpr *LHS,
                                     const MCExpr *RHS, MCContext &Ctx,
                                     SMLoc Loc = SMLoc());
+
   static const MCBinaryExpr *createAdd(const MCExpr *LHS, const MCExpr *RHS,
                                        MCContext &Ctx) {
     return create(Add, LHS, RHS, Ctx);
   }
+
   static const MCBinaryExpr *createAnd(const MCExpr *LHS, const MCExpr *RHS,
                                        MCContext &Ctx) {
     return create(And, LHS, RHS, Ctx);
   }
+
   static const MCBinaryExpr *createDiv(const MCExpr *LHS, const MCExpr *RHS,
                                        MCContext &Ctx) {
     return create(Div, LHS, RHS, Ctx);
   }
+
   static const MCBinaryExpr *createEQ(const MCExpr *LHS, const MCExpr *RHS,
                                       MCContext &Ctx) {
     return create(EQ, LHS, RHS, Ctx);
   }
+
   static const MCBinaryExpr *createGT(const MCExpr *LHS, const MCExpr *RHS,
                                       MCContext &Ctx) {
     return create(GT, LHS, RHS, Ctx);
   }
+
   static const MCBinaryExpr *createGTE(const MCExpr *LHS, const MCExpr *RHS,
                                        MCContext &Ctx) {
     return create(GTE, LHS, RHS, Ctx);
   }
+
   static const MCBinaryExpr *createLAnd(const MCExpr *LHS, const MCExpr *RHS,
                                         MCContext &Ctx) {
     return create(LAnd, LHS, RHS, Ctx);
   }
+
   static const MCBinaryExpr *createLOr(const MCExpr *LHS, const MCExpr *RHS,
                                        MCContext &Ctx) {
     return create(LOr, LHS, RHS, Ctx);
   }
+
   static const MCBinaryExpr *createLT(const MCExpr *LHS, const MCExpr *RHS,
                                       MCContext &Ctx) {
     return create(LT, LHS, RHS, Ctx);
   }
+
   static const MCBinaryExpr *createLTE(const MCExpr *LHS, const MCExpr *RHS,
                                        MCContext &Ctx) {
     return create(LTE, LHS, RHS, Ctx);
   }
+
   static const MCBinaryExpr *createMod(const MCExpr *LHS, const MCExpr *RHS,
                                        MCContext &Ctx) {
     return create(Mod, LHS, RHS, Ctx);
   }
+
   static const MCBinaryExpr *createMul(const MCExpr *LHS, const MCExpr *RHS,
                                        MCContext &Ctx) {
     return create(Mul, LHS, RHS, Ctx);
   }
+
   static const MCBinaryExpr *createNE(const MCExpr *LHS, const MCExpr *RHS,
                                       MCContext &Ctx) {
     return create(NE, LHS, RHS, Ctx);
   }
+
   static const MCBinaryExpr *createOr(const MCExpr *LHS, const MCExpr *RHS,
                                       MCContext &Ctx) {
     return create(Or, LHS, RHS, Ctx);
   }
+
   static const MCBinaryExpr *createShl(const MCExpr *LHS, const MCExpr *RHS,
                                        MCContext &Ctx) {
     return create(Shl, LHS, RHS, Ctx);
   }
+
   static const MCBinaryExpr *createAShr(const MCExpr *LHS, const MCExpr *RHS,
                                        MCContext &Ctx) {
     return create(AShr, LHS, RHS, Ctx);
   }
+
   static const MCBinaryExpr *createLShr(const MCExpr *LHS, const MCExpr *RHS,
                                        MCContext &Ctx) {
     return create(LShr, LHS, RHS, Ctx);
   }
+
   static const MCBinaryExpr *createSub(const MCExpr *LHS, const MCExpr *RHS,
                                        MCContext &Ctx) {
     return create(Sub, LHS, RHS, Ctx);
   }
+
   static const MCBinaryExpr *createXor(const MCExpr *LHS, const MCExpr *RHS,
                                        MCContext &Ctx) {
     return create(Xor, LHS, RHS, Ctx);
@@ -535,9 +561,11 @@ public:
 /// MCExprs are bump pointer allocated and not destructed.
 class MCTargetExpr : public MCExpr {
   virtual void anchor();
+
 protected:
   MCTargetExpr() : MCExpr(Target, SMLoc()) {}
-  virtual ~MCTargetExpr() {}
+  virtual ~MCTargetExpr() = default;
+
 public:
   virtual void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const = 0;
   virtual bool evaluateAsRelocatableImpl(MCValue &Res,
@@ -555,4 +583,4 @@ public:
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCEXPR_H
diff --git a/include/llvm/MC/MCFragment.h b/include/llvm/MC/MCFragment.h
index edb740f36d91fa491f1c5043c76ef91bead44a38..fc8257f90a9f7abfbac134de1eb55a31584761e5 100644
--- a/include/llvm/MC/MCFragment.h
+++ b/include/llvm/MC/MCFragment.h
@@ -10,25 +10,26 @@
 #ifndef LLVM_MC_MCFRAGMENT_H
 #define LLVM_MC_MCFRAGMENT_H
 
-#include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/ilist.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/ilist_node.h"
-#include "llvm/ADT/iterator.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCFixup.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/Support/SMLoc.h"
+#include <cstdint>
+#include <utility>
 
 namespace llvm {
+
 class MCSection;
-class MCSymbol;
 class MCSubtargetInfo;
+class MCSymbol;
 
 class MCFragment : public ilist_node_with_parent<MCFragment, MCSection> {
   friend class MCAsmLayout;
 
-  MCFragment() = delete;
-  MCFragment(const MCFragment &) = delete;
-  void operator=(const MCFragment &) = delete;
-
 public:
   enum FragmentType : uint8_t {
     FT_Align,
@@ -86,6 +87,10 @@ protected:
   ~MCFragment();
 
 public:
+  MCFragment() = delete;
+  MCFragment(const MCFragment &) = delete;
+  MCFragment &operator=(const MCFragment &) = delete;
+
   /// Destroys the current fragment.
   ///
   /// This must be used instead of delete as MCFragment is non-virtual.
@@ -131,7 +136,8 @@ public:
 class MCDummyFragment : public MCFragment {
 public:
   explicit MCDummyFragment(MCSection *Sec)
-      : MCFragment(FT_Dummy, false, 0, Sec){};
+      : MCFragment(FT_Dummy, false, 0, Sec) {}
+
   static bool classof(const MCFragment *F) { return F->getKind() == FT_Dummy; }
 };
 
@@ -271,7 +277,6 @@ public:
 };
 
 class MCAlignFragment : public MCFragment {
-
   /// Alignment - The alignment to ensure, in bytes.
   unsigned Alignment;
 
@@ -319,7 +324,6 @@ public:
 };
 
 class MCFillFragment : public MCFragment {
-
   /// Value to use for filling bytes.
   uint8_t Value;
 
@@ -339,7 +343,6 @@ public:
 };
 
 class MCOrgFragment : public MCFragment {
-
   /// Offset - The offset this fragment should start at.
   const MCExpr *Offset;
 
@@ -371,7 +374,6 @@ public:
 };
 
 class MCLEBFragment : public MCFragment {
-
   /// Value - The value this fragment should contain.
   const MCExpr *Value;
 
@@ -404,7 +406,6 @@ public:
 };
 
 class MCDwarfLineAddrFragment : public MCFragment {
-
   /// LineDelta - the value of the difference between the two line numbers
   /// between two .loc dwarf directives.
   int64_t LineDelta;
@@ -441,7 +442,6 @@ public:
 };
 
 class MCDwarfCallFrameFragment : public MCFragment {
-
   /// AddrDelta - The expression for the difference of the two symbols that
   /// make up the address delta between two .cfi_* dwarf directives.
   const MCExpr *AddrDelta;
@@ -561,4 +561,4 @@ public:
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCFRAGMENT_H
diff --git a/include/llvm/MC/MCInst.h b/include/llvm/MC/MCInst.h
index 4688b5f2b6e92e68e74ff4c74c79ed99000bbfba..702279659371714314fc2fd15a4a74780195a332 100644
--- a/include/llvm/MC/MCInst.h
+++ b/include/llvm/MC/MCInst.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCInst.h - MCInst class -------------------------*- C++ -*-===//
+//===- llvm/MC/MCInst.h - MCInst class --------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -18,15 +18,17 @@
 
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/DataTypes.h"
 #include "llvm/Support/SMLoc.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
 
 namespace llvm {
-class raw_ostream;
-class MCAsmInfo;
-class MCInstPrinter;
+
 class MCExpr;
 class MCInst;
+class MCInstPrinter;
+class raw_ostream;
 
 /// \brief Instances of this class represent operands of the MCInst class.
 /// This is a simple discriminated union.
@@ -39,7 +41,7 @@ class MCOperand {
     kExpr,        ///< Relocatable immediate operand.
     kInst         ///< Sub-instruction operand.
   };
-  MachineOperandType Kind;
+  MachineOperandType Kind = kInvalid;
 
   union {
     unsigned RegVal;
@@ -50,7 +52,7 @@ class MCOperand {
   };
 
 public:
-  MCOperand() : Kind(kInvalid), FPImmVal(0.0) {}
+  MCOperand() : FPImmVal(0.0) {}
 
   bool isValid() const { return Kind != kInvalid; }
   bool isReg() const { return Kind == kRegister; }
@@ -75,6 +77,7 @@ public:
     assert(isImm() && "This is not an immediate");
     return ImmVal;
   }
+
   void setImm(int64_t Val) {
     assert(isImm() && "This is not an immediate");
     ImmVal = Val;
@@ -94,6 +97,7 @@ public:
     assert(isExpr() && "This is not an expression");
     return ExprVal;
   }
+
   void setExpr(const MCExpr *Val) {
     assert(isExpr() && "This is not an expression");
     ExprVal = Val;
@@ -103,6 +107,7 @@ public:
     assert(isInst() && "This is not a sub-instruction");
     return InstVal;
   }
+
   void setInst(const MCInst *Val) {
     assert(isInst() && "This is not a sub-instruction");
     InstVal = Val;
@@ -114,24 +119,28 @@ public:
     Op.RegVal = Reg;
     return Op;
   }
+
   static MCOperand createImm(int64_t Val) {
     MCOperand Op;
     Op.Kind = kImmediate;
     Op.ImmVal = Val;
     return Op;
   }
+
   static MCOperand createFPImm(double Val) {
     MCOperand Op;
     Op.Kind = kFPImmediate;
     Op.FPImmVal = Val;
     return Op;
   }
+
   static MCOperand createExpr(const MCExpr *Val) {
     MCOperand Op;
     Op.Kind = kExpr;
     Op.ExprVal = Val;
     return Op;
   }
+
   static MCOperand createInst(const MCInst *Val) {
     MCOperand Op;
     Op.Kind = kInst;
@@ -148,12 +157,12 @@ template <> struct isPodLike<MCOperand> { static const bool value = true; };
 /// \brief Instances of this class represent a single low-level machine
 /// instruction.
 class MCInst {
-  unsigned Opcode;
+  unsigned Opcode = 0;
   SMLoc Loc;
   SmallVector<MCOperand, 8> Operands;
 
 public:
-  MCInst() : Opcode(0) {}
+  MCInst() = default;
 
   void setOpcode(unsigned Op) { Opcode = Op; }
   unsigned getOpcode() const { return Opcode; }
@@ -176,6 +185,7 @@ public:
   const_iterator begin() const { return Operands.begin(); }
   iterator end() { return Operands.end(); }
   const_iterator end() const { return Operands.end(); }
+
   iterator insert(iterator I, const MCOperand &Op) {
     return Operands.insert(I, Op);
   }
@@ -202,4 +212,4 @@ inline raw_ostream& operator<<(raw_ostream &OS, const MCInst &MI) {
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCINST_H
diff --git a/include/llvm/MC/MCInstPrinter.h b/include/llvm/MC/MCInstPrinter.h
index 320b280cc756160eac98312d6711746eba40d809..069403074b31c7c3c76a7061add6d126687d736c 100644
--- a/include/llvm/MC/MCInstPrinter.h
+++ b/include/llvm/MC/MCInstPrinter.h
@@ -16,12 +16,12 @@
 namespace llvm {
 
 template <typename T> class ArrayRef;
-class MCInst;
-class raw_ostream;
 class MCAsmInfo;
+class MCInst;
 class MCInstrInfo;
 class MCRegisterInfo;
 class MCSubtargetInfo;
+class raw_ostream;
 class StringRef;
 
 /// Convert `Bytes' to a hex string and output to `OS'
@@ -43,28 +43,26 @@ protected:
   /// \brief A stream that comments can be emitted to if desired.  Each comment
   /// must end with a newline.  This will be null if verbose assembly emission
   /// is disable.
-  raw_ostream *CommentStream;
+  raw_ostream *CommentStream = nullptr;
   const MCAsmInfo &MAI;
   const MCInstrInfo &MII;
   const MCRegisterInfo &MRI;
 
   /// True if we are printing marked up assembly.
-  bool UseMarkup;
+  bool UseMarkup = false;
 
   /// True if we are printing immediates as hex.
-  bool PrintImmHex;
+  bool PrintImmHex = false;
 
   /// Which style to use for printing hexadecimal values.
-  HexStyle::Style PrintHexStyle;
+  HexStyle::Style PrintHexStyle = HexStyle::C;
 
   /// Utility function for printing annotations.
   void printAnnotation(raw_ostream &OS, StringRef Annot);
 
 public:
   MCInstPrinter(const MCAsmInfo &mai, const MCInstrInfo &mii,
-                const MCRegisterInfo &mri)
-      : CommentStream(nullptr), MAI(mai), MII(mii), MRI(mri), UseMarkup(false),
-        PrintImmHex(false), PrintHexStyle(HexStyle::C) {}
+                const MCRegisterInfo &mri) : MAI(mai), MII(mii), MRI(mri) {}
 
   virtual ~MCInstPrinter();
 
diff --git a/include/llvm/MC/MCInstrAnalysis.h b/include/llvm/MC/MCInstrAnalysis.h
index 8f5159e9e1c85319f73b1b0a8f516e7d19597559..dd3e1df477b45bb728ab67bd94b62f2b9a64b720 100644
--- a/include/llvm/MC/MCInstrAnalysis.h
+++ b/include/llvm/MC/MCInstrAnalysis.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCInstrAnalysis.h - InstrDesc target hooks ------*- C++ -*-===//
+//===- llvm/MC/MCInstrAnalysis.h - InstrDesc target hooks -------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -18,18 +18,19 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCInstrInfo.h"
+#include <cstdint>
 
 namespace llvm {
 
 class MCInstrAnalysis {
 protected:
   friend class Target;
+
   const MCInstrInfo *Info;
 
 public:
   MCInstrAnalysis(const MCInstrInfo *Info) : Info(Info) {}
-
-  virtual ~MCInstrAnalysis() {}
+  virtual ~MCInstrAnalysis() = default;
 
   virtual bool isBranch(const MCInst &Inst) const {
     return Info->get(Inst.getOpcode()).isBranch();
@@ -66,6 +67,6 @@ public:
                  uint64_t &Target) const;
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCINSTRANALYSIS_H
diff --git a/include/llvm/MC/MCInstrItineraries.h b/include/llvm/MC/MCInstrItineraries.h
index 1fb276a302b91223dc355f51020bc45ebc3477a0..4443dd113715e9e2998f3deb449e90c312c1285c 100644
--- a/include/llvm/MC/MCInstrItineraries.h
+++ b/include/llvm/MC/MCInstrItineraries.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCInstrItineraries.h - Scheduling ---------------*- C++ -*-===//
+//===- llvm/MC/MCInstrItineraries.h - Scheduling ----------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -88,7 +88,6 @@ struct InstrStage {
   }
 };
 
-
 //===----------------------------------------------------------------------===//
 /// An itinerary represents the scheduling information for an instruction.
 /// This includes a set of stages occupied by the instruction and the pipeline
@@ -102,23 +101,20 @@ struct InstrItinerary {
   unsigned LastOperandCycle;   ///< Index of last + 1 operand rd/wr
 };
 
-
 //===----------------------------------------------------------------------===//
 /// Itinerary data supplied by a subtarget to be used by a target.
 ///
 class InstrItineraryData {
 public:
-  MCSchedModel          SchedModel;     ///< Basic machine properties.
-  const InstrStage     *Stages;         ///< Array of stages selected
-  const unsigned       *OperandCycles;  ///< Array of operand cycles selected
-  const unsigned       *Forwardings;    ///< Array of pipeline forwarding paths
-  const InstrItinerary *Itineraries;    ///< Array of itineraries selected
-
-  /// Ctors.
-  InstrItineraryData() : SchedModel(MCSchedModel::GetDefaultSchedModel()),
-                         Stages(nullptr), OperandCycles(nullptr),
-                         Forwardings(nullptr), Itineraries(nullptr) {}
-
+  MCSchedModel SchedModel =
+      MCSchedModel::GetDefaultSchedModel(); ///< Basic machine properties.
+  const InstrStage *Stages = nullptr;       ///< Array of stages selected
+  const unsigned *OperandCycles = nullptr; ///< Array of operand cycles selected
+  const unsigned *Forwardings = nullptr; ///< Array of pipeline forwarding paths
+  const InstrItinerary *Itineraries =
+      nullptr; ///< Array of itineraries selected
+
+  InstrItineraryData() = default;
   InstrItineraryData(const MCSchedModel &SM, const InstrStage *S,
                      const unsigned *OS, const unsigned *F)
     : SchedModel(SM), Stages(S), OperandCycles(OS), Forwardings(F),
@@ -234,6 +230,6 @@ public:
   }
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCINSTRITINERARIES_H
diff --git a/include/llvm/MC/MCLabel.h b/include/llvm/MC/MCLabel.h
index a12473fdad02f52451a284eef58172e10ec19d6f..b6579fd654ab405a896357fa536945e4215cf125 100644
--- a/include/llvm/MC/MCLabel.h
+++ b/include/llvm/MC/MCLabel.h
@@ -14,10 +14,8 @@
 #ifndef LLVM_MC_MCLABEL_H
 #define LLVM_MC_MCLABEL_H
 
-#include "llvm/Support/Compiler.h"
-
 namespace llvm {
-class MCContext;
+
 class raw_ostream;
 
 /// \brief Instances of this class represent a label name in the MC file,
@@ -29,12 +27,13 @@ class MCLabel {
 
 private: // MCContext creates and uniques these.
   friend class MCContext;
+
   MCLabel(unsigned instance) : Instance(instance) {}
 
+public:
   MCLabel(const MCLabel &) = delete;
-  void operator=(const MCLabel &) = delete;
+  MCLabel &operator=(const MCLabel &) = delete;
 
-public:
   /// \brief Get the current instance of this Directional Local Label.
   unsigned getInstance() const { return Instance; }
 
@@ -52,6 +51,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const MCLabel &Label) {
   Label.print(OS);
   return OS;
 }
+
 } // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCLABEL_H
diff --git a/include/llvm/MC/MCLinkerOptimizationHint.h b/include/llvm/MC/MCLinkerOptimizationHint.h
index 200bb93f64c8de7a0421f50079c9dca73621a94b..0c3525bbeda65ca7120e2978f926585a00826854 100644
--- a/include/llvm/MC/MCLinkerOptimizationHint.h
+++ b/include/llvm/MC/MCLinkerOptimizationHint.h
@@ -21,13 +21,14 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
 
 namespace llvm {
 
-// Forward declarations.
+class MachObjectWriter;
 class MCAsmLayout;
 class MCSymbol;
-class MachObjectWriter;
 
 /// Linker Optimization Hint Type.
 enum MCLOHType {
@@ -133,7 +134,7 @@ public:
 
 class MCLOHContainer {
   /// Keep track of the emit size of all the LOHs.
-  mutable uint64_t EmitSize;
+  mutable uint64_t EmitSize = 0;
 
   /// Keep track of all LOH directives.
   SmallVector<MCLOHDirective, 32> Directives;
@@ -141,7 +142,7 @@ class MCLOHContainer {
 public:
   typedef SmallVectorImpl<MCLOHDirective> LOHDirectives;
 
-  MCLOHContainer() : EmitSize(0) {}
+  MCLOHContainer() = default;
 
   /// Const accessor to the directives.
   const LOHDirectives &getDirectives() const {
@@ -183,4 +184,4 @@ typedef MCLOHContainer::LOHDirectives MCLOHDirectives;
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCLINKEROPTIMIZATIONHINT_H
diff --git a/include/llvm/MC/MCMachObjectWriter.h b/include/llvm/MC/MCMachObjectWriter.h
index 1a685dbd608eebf7a040c723865300d25d2f70aa..b93638f86408476d459aa31181a133c40181d13b 100644
--- a/include/llvm/MC/MCMachObjectWriter.h
+++ b/include/llvm/MC/MCMachObjectWriter.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCMachObjectWriter.h - Mach Object Writer -------*- C++ -*-===//
+//===- llvm/MC/MCMachObjectWriter.h - Mach Object Writer --------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,12 +11,15 @@
 #define LLVM_MC_MCMACHOBJECTWRITER_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/StringTableBuilder.h"
-#include "llvm/Support/DataTypes.h"
 #include "llvm/Support/MachO.h"
+#include <cstdint>
+#include <memory>
+#include <string>
 #include <vector>
 
 namespace llvm {
@@ -95,8 +98,8 @@ class MachObjectWriter : public MCObjectWriter {
         : Sym(Sym), MRE(MRE) {}
   };
 
-  llvm::DenseMap<const MCSection *, std::vector<RelAndSymbol>> Relocations;
-  llvm::DenseMap<const MCSection *, unsigned> IndirectSymBase;
+  DenseMap<const MCSection *, std::vector<RelAndSymbol>> Relocations;
+  DenseMap<const MCSection *, unsigned> IndirectSymBase;
 
   SectionAddrMap SectionAddress;
 
@@ -271,6 +274,6 @@ MCObjectWriter *createMachObjectWriter(MCMachObjectTargetWriter *MOTW,
                                        raw_pwrite_stream &OS,
                                        bool IsLittleEndian);
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCMACHOBJECTWRITER_H
diff --git a/include/llvm/MC/MCObjectFileInfo.h b/include/llvm/MC/MCObjectFileInfo.h
index 9aa8812c7bb315e2894e4c82ffd397c1bdc28fb1..8b2a1261b220442a7f42edd4bfcd52a08b09f933 100644
--- a/include/llvm/MC/MCObjectFileInfo.h
+++ b/include/llvm/MC/MCObjectFileInfo.h
@@ -129,7 +129,7 @@ protected:
   /// it'll go here.
   MCSection *TLSExtraDataSection;
 
-  /// Section directive for Thread Local data. ELF, MachO and COFF.
+  /// Section directive for Thread Local data. ELF, MachO, COFF, and Wasm.
   MCSection *TLSDataSection; // Defaults to ".tdata".
 
   /// Section directive for Thread Local uninitialized data.
@@ -338,7 +338,7 @@ public:
     return EHFrameSection;
   }
 
-  enum Environment { IsMachO, IsELF, IsCOFF };
+  enum Environment { IsMachO, IsELF, IsCOFF, IsWasm };
   Environment getObjectFileType() const { return Env; }
 
   bool isPositionIndependent() const { return PositionIndependent; }
@@ -353,6 +353,7 @@ private:
   void initMachOMCObjectFileInfo(const Triple &T);
   void initELFMCObjectFileInfo(const Triple &T);
   void initCOFFMCObjectFileInfo(const Triple &T);
+  void initWasmMCObjectFileInfo(const Triple &T);
 
 public:
   const Triple &getTargetTriple() const { return TT; }
diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h
index f9111b7f47eaac54991901454ff0f772cafb8647..11f8dfa24484d5cfbd357bbdc3c5de22ffc7d1bd 100644
--- a/include/llvm/MC/MCObjectStreamer.h
+++ b/include/llvm/MC/MCObjectStreamer.h
@@ -89,7 +89,8 @@ public:
   /// \name MCStreamer Interface
   /// @{
 
-  void EmitLabel(MCSymbol *Symbol) override;
+  void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
+  virtual void EmitLabel(MCSymbol *Symbol, SMLoc Loc, MCFragment *F);
   void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
   void EmitValueImpl(const MCExpr *Value, unsigned Size,
                      SMLoc Loc = SMLoc()) override;
@@ -152,6 +153,7 @@ public:
                 SMLoc Loc = SMLoc()) override;
   void emitFill(const MCExpr &NumValues, int64_t Size, int64_t Expr,
                 SMLoc Loc = SMLoc()) override;
+  void EmitFileDirective(StringRef Filename) override;
 
   void FinishImpl() override;
 
diff --git a/include/llvm/MC/MCObjectWriter.h b/include/llvm/MC/MCObjectWriter.h
index 0ecebe42a0b9546a5a211a4f71148f54f4153521..86bcbb6861d7f22c1089d4eb3c673854030cddfa 100644
--- a/include/llvm/MC/MCObjectWriter.h
+++ b/include/llvm/MC/MCObjectWriter.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCObjectWriter.h - Object File Writer Interface -*- C++ -*-===//
+//===- llvm/MC/MCObjectWriter.h - Object File Writer Interface --*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,13 +11,15 @@
 #define LLVM_MC_MCOBJECTWRITER_H
 
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/DataTypes.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/EndianStream.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cassert>
+#include <cstdint>
 
 namespace llvm {
+
 class MCAsmLayout;
 class MCAssembler;
 class MCFixup;
@@ -38,15 +40,12 @@ class MCValue;
 /// The object writer also contains a number of helper methods for writing
 /// binary data to the output stream.
 class MCObjectWriter {
-  MCObjectWriter(const MCObjectWriter &) = delete;
-  void operator=(const MCObjectWriter &) = delete;
-
   raw_pwrite_stream *OS;
 
 protected:
   unsigned IsLittleEndian : 1;
 
-protected: // Can only create subclasses.
+  // Can only create subclasses.
   MCObjectWriter(raw_pwrite_stream &OS, bool IsLittleEndian)
       : OS(&OS), IsLittleEndian(IsLittleEndian) {}
 
@@ -55,6 +54,8 @@ protected: // Can only create subclasses.
   }
 
 public:
+  MCObjectWriter(const MCObjectWriter &) = delete;
+  MCObjectWriter &operator=(const MCObjectWriter &) = delete;
   virtual ~MCObjectWriter();
 
   /// lifetime management
@@ -108,11 +109,6 @@ public:
                                                       bool InSet,
                                                       bool IsPCRel) const;
 
-  /// True if this symbol (which is a variable) is weak. This is not
-  /// just STB_WEAK, but more generally whether or not we can evaluate
-  /// past it.
-  virtual bool isWeak(const MCSymbol &Sym) const;
-
   /// Write the object file.
   ///
   /// This routine is called by the assembler after layout and relaxation is
@@ -199,6 +195,6 @@ public:
   /// @}
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCOBJECTWRITER_H
diff --git a/include/llvm/MC/MCParser/AsmCond.h b/include/llvm/MC/MCParser/AsmCond.h
index a918b5600ed5df4a5392430e0b585193338b0867..8e7bfc521556602a12efc9501a7b2cfb2c092a4d 100644
--- a/include/llvm/MC/MCParser/AsmCond.h
+++ b/include/llvm/MC/MCParser/AsmCond.h
@@ -28,13 +28,13 @@ public:
     ElseCond    // inside else conditional
   };
 
-  ConditionalAssemblyType TheCond;
-  bool CondMet;
-  bool Ignore;
+  ConditionalAssemblyType TheCond = NoCond;
+  bool CondMet = false;
+  bool Ignore = false;
 
-  AsmCond() : TheCond(NoCond), CondMet(false), Ignore(false) {}
+  AsmCond() = default;
 };
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCPARSER_ASMCOND_H
diff --git a/include/llvm/MC/MCParser/AsmLexer.h b/include/llvm/MC/MCParser/AsmLexer.h
index 029598c013d3570c0ed226017006d67dc5eabbbb..207183a69b0ed2fca84af0309175736b139ed28b 100644
--- a/include/llvm/MC/MCParser/AsmLexer.h
+++ b/include/llvm/MC/MCParser/AsmLexer.h
@@ -16,25 +16,22 @@
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
-#include "llvm/Support/DataTypes.h"
 #include <string>
 
 namespace llvm {
-class MemoryBuffer;
+
 class MCAsmInfo;
 
 /// AsmLexer - Lexer class for assembly files.
 class AsmLexer : public MCAsmLexer {
   const MCAsmInfo &MAI;
 
-  const char *CurPtr;
+  const char *CurPtr = nullptr;
   StringRef CurBuf;
-  bool IsAtStartOfLine;
-  bool IsAtStartOfStatement;
-  bool IsParsingMSInlineAsm;
-  bool IsPeeking;
-  void operator=(const AsmLexer&) = delete;
-  AsmLexer(const AsmLexer&) = delete;
+  bool IsAtStartOfLine = true;
+  bool IsAtStartOfStatement = true;
+  bool IsParsingMSInlineAsm = false;
+  bool IsPeeking = false;
 
 protected:
   /// LexToken - Read the next token and return its code.
@@ -42,6 +39,8 @@ protected:
 
 public:
   AsmLexer(const MCAsmInfo &MAI);
+  AsmLexer(const AsmLexer &) = delete;
+  AsmLexer &operator=(const AsmLexer &) = delete;
   ~AsmLexer() override;
 
   void setBuffer(StringRef Buf, const char *ptr = nullptr);
@@ -74,4 +73,4 @@ private:
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCPARSER_ASMLEXER_H
diff --git a/include/llvm/MC/MCParser/MCAsmLexer.h b/include/llvm/MC/MCParser/MCAsmLexer.h
index 56da6f85c1993cf8bc36e0b5fbfaa0dcc3e751b9..7ddc7722e512beb8e6dd56690e992a41951a1ee7 100644
--- a/include/llvm/MC/MCParser/MCAsmLexer.h
+++ b/include/llvm/MC/MCParser/MCAsmLexer.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCAsmLexer.h - Abstract Asm Lexer Interface -----*- C++ -*-===//
+//===- llvm/MC/MCAsmLexer.h - Abstract Asm Lexer Interface ------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -14,10 +14,12 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/DataTypes.h"
 #include "llvm/Support/SMLoc.h"
-#include <utility>
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <string>
 
 namespace llvm {
 
@@ -76,7 +78,7 @@ private:
   APInt IntVal;
 
 public:
-  AsmToken() {}
+  AsmToken() = default;
   AsmToken(TokenKind Kind, StringRef Str, APInt IntVal)
       : Kind(Kind), Str(Str), IntVal(std::move(IntVal)) {}
   AsmToken(TokenKind Kind, StringRef Str, int64_t IntVal = 0)
@@ -132,7 +134,7 @@ public:
 /// it is lexed.
 class AsmCommentConsumer {
 public:
-  virtual ~AsmCommentConsumer() {};
+  virtual ~AsmCommentConsumer() = default;
 
   /// Callback function for when a comment is lexed. Loc is the start of the
   /// comment text (excluding the comment-start marker). CommentText is the text
@@ -152,14 +154,12 @@ class MCAsmLexer {
   SMLoc ErrLoc;
   std::string Err;
 
-  MCAsmLexer(const MCAsmLexer &) = delete;
-  void operator=(const MCAsmLexer &) = delete;
 protected: // Can only create subclasses.
-  const char *TokStart;
-  bool SkipSpace;
+  const char *TokStart = nullptr;
+  bool SkipSpace = true;
   bool AllowAtInIdentifier;
-  bool IsAtStartOfStatement;
-  AsmCommentConsumer *CommentConsumer;
+  bool IsAtStartOfStatement = true;
+  AsmCommentConsumer *CommentConsumer = nullptr;
 
   MCAsmLexer();
 
@@ -171,6 +171,8 @@ protected: // Can only create subclasses.
   }
 
 public:
+  MCAsmLexer(const MCAsmLexer &) = delete;
+  MCAsmLexer &operator=(const MCAsmLexer &) = delete;
   virtual ~MCAsmLexer();
 
   /// Consume the next token from the input stream and return it.
@@ -255,6 +257,6 @@ public:
   }
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCPARSER_MCASMLEXER_H
diff --git a/include/llvm/MC/MCParser/MCAsmParser.h b/include/llvm/MC/MCParser/MCAsmParser.h
index c289f51f43f75007823d36f8cdb06c24e5b08000..6763374185ec169ea73c0829b7df9151cffdccf3 100644
--- a/include/llvm/MC/MCParser/MCAsmParser.h
+++ b/include/llvm/MC/MCParser/MCAsmParser.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCAsmParser.h - Abstract Asm Parser Interface ---*- C++ -*-===//
+//===- llvm/MC/MCAsmParser.h - Abstract Asm Parser Interface ----*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,16 +10,21 @@
 #ifndef LLVM_MC_MCPARSER_MCASMPARSER_H
 #define LLVM_MC_MCPARSER_MCASMPARSER_H
 
-#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCParser/AsmLexer.h"
-#include "llvm/Support/DataTypes.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/Support/SMLoc.h"
+#include <cstdint>
+#include <string>
+#include <utility>
 
 namespace llvm {
+
 class MCAsmInfo;
-class MCAsmLexer;
 class MCAsmParserExtension;
 class MCContext;
 class MCExpr;
@@ -27,10 +32,7 @@ class MCInstPrinter;
 class MCInstrInfo;
 class MCStreamer;
 class MCTargetAsmParser;
-class SMLoc;
-class SMRange;
 class SourceMgr;
-class Twine;
 
 class InlineAsmIdentifierInfo {
 public:
@@ -51,12 +53,12 @@ public:
 class MCAsmParserSemaCallback {
 public:
   virtual ~MCAsmParserSemaCallback();
+
   virtual void *LookupInlineAsmIdentifier(StringRef &LineBuf,
                                           InlineAsmIdentifierInfo &Info,
                                           bool IsUnevaluatedContext) = 0;
   virtual StringRef LookupInlineAsmLabel(StringRef Identifier, SourceMgr &SM,
                                          SMLoc Location, bool Create) = 0;
-
   virtual bool LookupInlineAsmField(StringRef Base, StringRef Member,
                                     unsigned &Offset) = 0;
 };
@@ -76,22 +78,21 @@ public:
   };
 
 private:
-  MCAsmParser(const MCAsmParser &) = delete;
-  void operator=(const MCAsmParser &) = delete;
-
-  MCTargetAsmParser *TargetParser;
+  MCTargetAsmParser *TargetParser = nullptr;
 
   unsigned ShowParsedOperands : 1;
 
 protected: // Can only create subclasses.
   MCAsmParser();
 
-  bool HadError;
+  bool HadError = false;
 
   SmallVector<MCPendingError, 1> PendingErrors;
   /// Flag tracking whether any errors have been encountered.
 
 public:
+  MCAsmParser(const MCAsmParser &) = delete;
+  MCAsmParser &operator=(const MCAsmParser &) = delete;
   virtual ~MCAsmParser();
 
   virtual void addDirectiveHandler(StringRef Directive,
@@ -190,8 +191,8 @@ public:
 
   bool parseIntToken(int64_t &V, const Twine &ErrMsg);
 
-  bool check(bool P, const llvm::Twine &Msg);
-  bool check(bool P, SMLoc Loc, const llvm::Twine &Msg);
+  bool check(bool P, const Twine &Msg);
+  bool check(bool P, SMLoc Loc, const Twine &Msg);
 
   /// \brief Parse an identifier or string (as a quoted identifier) and set \p
   /// Res to the identifier contents.
@@ -258,8 +259,8 @@ public:
 
 /// \brief Create an MCAsmParser instance.
 MCAsmParser *createMCAsmParser(SourceMgr &, MCContext &, MCStreamer &,
-                               const MCAsmInfo &);
+                               const MCAsmInfo &, unsigned CB = 0);
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCPARSER_MCASMPARSER_H
diff --git a/include/llvm/MC/MCParser/MCAsmParserExtension.h b/include/llvm/MC/MCParser/MCAsmParserExtension.h
index 7817d41c0170270b6b8e129d1103b14cf0d30f23..ffb8d7a4a26a2022bcab6a632f61e04071644688 100644
--- a/include/llvm/MC/MCParser/MCAsmParserExtension.h
+++ b/include/llvm/MC/MCParser/MCAsmParserExtension.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCAsmParserExtension.h - Asm Parser Hooks -------*- C++ -*-===//
+//===- llvm/MC/MCAsmParserExtension.h - Asm Parser Hooks --------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,20 +10,20 @@
 #ifndef LLVM_MC_MCPARSER_MCASMPARSEREXTENSION_H
 #define LLVM_MC_MCPARSER_MCASMPARSEREXTENSION_H
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/Support/SMLoc.h"
 
 namespace llvm {
+
 class Twine;
 
 /// \brief Generic interface for extending the MCAsmParser,
 /// which is implemented by target and object file assembly parser
 /// implementations.
 class MCAsmParserExtension {
-  MCAsmParserExtension(const MCAsmParserExtension &) = delete;
-  void operator=(const MCAsmParserExtension &) = delete;
-
   MCAsmParser *Parser;
 
 protected:
@@ -38,9 +38,11 @@ protected:
     return (Obj->*Handler)(Directive, DirectiveLoc);
   }
 
-  bool BracketExpressionsSupported;
+  bool BracketExpressionsSupported = false;
 
 public:
+  MCAsmParserExtension(const MCAsmParserExtension &) = delete;
+  MCAsmParserExtension &operator=(const MCAsmParserExtension &) = delete;
   virtual ~MCAsmParserExtension();
 
   /// \brief Initialize the extension for parsing using the given \p Parser.
@@ -65,15 +67,19 @@ public:
 
   SourceMgr &getSourceManager() { return getParser().getSourceManager(); }
   MCStreamer &getStreamer() { return getParser().getStreamer(); }
+
   bool Warning(SMLoc L, const Twine &Msg) {
     return getParser().Warning(L, Msg);
   }
+
   bool Error(SMLoc L, const Twine &Msg, SMRange Range = SMRange()) {
     return getParser().Error(L, Msg, Range);
   }
+
   void Note(SMLoc L, const Twine &Msg) {
     getParser().Note(L, Msg);
   }
+
   bool TokError(const Twine &Msg) {
     return getParser().TokError(Msg);
   }
@@ -93,11 +99,11 @@ public:
     return getParser().parseOptionalToken(T);
   }
 
-  bool check(bool P, const llvm::Twine &Msg) {
+  bool check(bool P, const Twine &Msg) {
     return getParser().check(P, Msg);
   }
 
-  bool check(bool P, SMLoc Loc, const llvm::Twine &Msg) {
+  bool check(bool P, SMLoc Loc, const Twine &Msg) {
     return getParser().check(P, Loc, Msg);
   }
 
@@ -110,6 +116,6 @@ public:
   /// @}
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCPARSER_MCASMPARSEREXTENSION_H
diff --git a/include/llvm/MC/MCParser/MCAsmParserUtils.h b/include/llvm/MC/MCParser/MCAsmParserUtils.h
index 9834fe96307b2f9649a0fe1b3e02943fd77344f5..84173bb9cb8ebe97a1af130b374c67ad3d8dc1ed 100644
--- a/include/llvm/MC/MCParser/MCAsmParserUtils.h
+++ b/include/llvm/MC/MCParser/MCAsmParserUtils.h
@@ -1,4 +1,4 @@
-//===------ llvm/MC/MCAsmParserUtils.h - Asm Parser Utilities ---*- C++ -*-===//
+//===- llvm/MC/MCAsmParserUtils.h - Asm Parser Utilities --------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -28,6 +28,7 @@ bool parseAssignmentExpression(StringRef Name, bool allow_redef,
                                const MCExpr *&Value);
 
 } // namespace MCParserUtils
+
 } // namespace llvm
 
-#endif
+#endif // LLVM_MC_MCPARSER_MCASMPARSERUTILS_H
diff --git a/include/llvm/MC/MCParser/MCParsedAsmOperand.h b/include/llvm/MC/MCParser/MCParsedAsmOperand.h
index a90d280c240ce2e17e36be1bef01fb4bdba90736..4af76ac2a858a71cd6727f4b64617f8c8f46a0e9 100644
--- a/include/llvm/MC/MCParser/MCParsedAsmOperand.h
+++ b/include/llvm/MC/MCParser/MCParsedAsmOperand.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCParsedAsmOperand.h - Asm Parser Operand -------*- C++ -*-===//
+//===- llvm/MC/MCParsedAsmOperand.h - Asm Parser Operand --------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,11 +10,12 @@
 #ifndef LLVM_MC_MCPARSER_MCPARSEDASMOPERAND_H
 #define LLVM_MC_MCPARSER_MCPARSEDASMOPERAND_H
 
-#include <string>
 #include "llvm/ADT/StringRef.h"
 #include "llvm/Support/SMLoc.h"
+#include <string>
 
 namespace llvm {
+
 class raw_ostream;
 
 /// MCParsedAsmOperand - This abstract class represents a source-level assembly
@@ -35,12 +36,12 @@ protected:
   // lots of members and MSVC doesn't support defaulted move ops, so to avoid
   // that verbosity, just rely on defaulted copy ops. It's only the Constraint
   // string member that would benefit from movement anyway.
+  MCParsedAsmOperand() = default;
   MCParsedAsmOperand(const MCParsedAsmOperand &RHS) = default;
   MCParsedAsmOperand &operator=(const MCParsedAsmOperand &) = default;
-  MCParsedAsmOperand() = default;
 
 public:
-  virtual ~MCParsedAsmOperand() {}
+  virtual ~MCParsedAsmOperand() = default;
 
   void setConstraint(StringRef C) { Constraint = C.str(); }
   StringRef getConstraint() { return Constraint; }
@@ -81,6 +82,7 @@ public:
 
   /// print - Print a debug representation of the operand to the given stream.
   virtual void print(raw_ostream &OS) const = 0;
+
   /// dump - Print to the debug stream.
   virtual void dump() const;
 };
@@ -93,6 +95,6 @@ inline raw_ostream& operator<<(raw_ostream &OS, const MCParsedAsmOperand &MO) {
   return OS;
 }
 
-} // end namespace llvm.
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCPARSER_MCPARSEDASMOPERAND_H
diff --git a/include/llvm/MC/MCParser/MCTargetAsmParser.h b/include/llvm/MC/MCParser/MCTargetAsmParser.h
index 70cd60c9a112d4385bc9584fde5b3252d6c22c7c..c81a7624011fa096c9a8e300ba6f131aa0a5fe87 100644
--- a/include/llvm/MC/MCParser/MCTargetAsmParser.h
+++ b/include/llvm/MC/MCParser/MCTargetAsmParser.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCTargetAsmParser.h - Target Assembly Parser ----*- C++ -*-===//
+//===- llvm/MC/MCTargetAsmParser.h - Target Assembly Parser -----*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,19 +10,21 @@
 #ifndef LLVM_MC_MCPARSER_MCTARGETASMPARSER_H
 #define LLVM_MC_MCPARSER_MCTARGETASMPARSER_H
 
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParserExtension.h"
 #include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/SMLoc.h"
+#include <cstdint>
 #include <memory>
 
 namespace llvm {
-class AsmToken;
+
 class MCInst;
 class MCParsedAsmOperand;
 class MCStreamer;
 class MCSubtargetInfo;
-class SMLoc;
-class StringRef;
 template <typename T> class SmallVectorImpl;
 
 typedef SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> OperandVector;
@@ -66,6 +68,7 @@ struct AsmRewrite {
   unsigned Len;
   unsigned Val;
   StringRef Label;
+
 public:
   AsmRewrite(AsmRewriteKind kind, SMLoc loc, unsigned len = 0, unsigned val = 0)
     : Kind(kind), Loc(loc), Len(len), Val(val) {}
@@ -74,10 +77,9 @@ public:
 };
 
 struct ParseInstructionInfo {
+  SmallVectorImpl<AsmRewrite> *AsmRewrites = nullptr;
 
-  SmallVectorImpl<AsmRewrite> *AsmRewrites;
-
-  ParseInstructionInfo() : AsmRewrites(nullptr) {}
+  ParseInstructionInfo() = default;
   ParseInstructionInfo(SmallVectorImpl<AsmRewrite> *rewrites)
     : AsmRewrites(rewrites) {}
 };
@@ -99,9 +101,6 @@ public:
     FIRST_TARGET_MATCH_RESULT_TY
   };
 
-private:
-  MCTargetAsmParser(const MCTargetAsmParser &) = delete;
-  void operator=(const MCTargetAsmParser &) = delete;
 protected: // Can only create subclasses.
   MCTargetAsmParser(MCTargetOptions const &, const MCSubtargetInfo &STI);
 
@@ -109,10 +108,10 @@ protected: // Can only create subclasses.
   MCSubtargetInfo &copySTI();
 
   /// AvailableFeatures - The current set of available features.
-  uint64_t AvailableFeatures;
+  uint64_t AvailableFeatures = 0;
 
   /// ParsingInlineAsm - Are we parsing ms-style inline assembly?
-  bool ParsingInlineAsm;
+  bool ParsingInlineAsm = false;
 
   /// SemaCallback - The Sema callback implementation.  Must be set when parsing
   /// ms-style inline assembly.
@@ -125,6 +124,9 @@ protected: // Can only create subclasses.
   const MCSubtargetInfo *STI;
 
 public:
+  MCTargetAsmParser(const MCTargetAsmParser &) = delete;
+  MCTargetAsmParser &operator=(const MCTargetAsmParser &) = delete;
+
   ~MCTargetAsmParser() override;
 
   const MCSubtargetInfo &getSTI() const;
@@ -229,11 +231,11 @@ public:
     return nullptr;
   }
 
-  virtual void onLabelParsed(MCSymbol *Symbol) { }
+  virtual void onLabelParsed(MCSymbol *Symbol) {}
 
   /// Ensure that all previously parsed instructions have been emitted to the
   /// output streamer, if the target does not emit them immediately.
-  virtual void flushPendingInstructions(MCStreamer &Out) { }
+  virtual void flushPendingInstructions(MCStreamer &Out) {}
 
   virtual const MCExpr *createTargetUnaryExpr(const MCExpr *E,
                                               AsmToken::TokenKind OperatorToken,
@@ -242,6 +244,6 @@ public:
   }
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCPARSER_MCTARGETASMPARSER_H
diff --git a/include/llvm/MC/MCRegisterInfo.h b/include/llvm/MC/MCRegisterInfo.h
index 80ed8e21420b3a5ae8b54145b77dadce483647f9..015d0b96d9f27b4cecaff12cc46ff4a06d9c6163 100644
--- a/include/llvm/MC/MCRegisterInfo.h
+++ b/include/llvm/MC/MCRegisterInfo.h
@@ -1,4 +1,4 @@
-//=== MC/MCRegisterInfo.h - Target Register Description ---------*- C++ -*-===//
+//===- MC/MCRegisterInfo.h - Target Register Description --------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -17,9 +17,11 @@
 #define LLVM_MC_MCREGISTERINFO_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/MC/LaneBitmask.h"
-#include "llvm/Support/ErrorHandling.h"
 #include <cassert>
+#include <cstdint>
+#include <utility>
 
 namespace llvm {
 
@@ -39,7 +41,7 @@ public:
   const uint16_t RegsSize;
   const uint16_t RegSetSize;
   const uint16_t ID;
-  const uint16_t RegSize, Alignment; // Size & Alignment of register in bytes
+  const uint16_t PhysRegSize;
   const int8_t CopyCost;
   const bool Allocatable;
 
@@ -78,13 +80,10 @@ public:
     return contains(Reg1) && contains(Reg2);
   }
 
-  /// getSize - Return the size of the register in bytes, which is also the size
-  /// of a stack slot allocated to hold a spilled copy of this register.
-  unsigned getSize() const { return RegSize; }
-
-  /// getAlignment - Return the minimum required alignment for a register of
-  /// this class.
-  unsigned getAlignment() const { return Alignment; }
+  /// Return the size of the physical register in bytes.
+  unsigned getPhysRegSize() const { return PhysRegSize; }
+  /// Temporary function to allow out-of-tree targets to switch.
+  unsigned getSize() const { return getPhysRegSize(); }
 
   /// getCopyCost - Return the cost of copying a value between two registers in
   /// this class. A negative number means the register class is very expensive
@@ -152,6 +151,7 @@ public:
     uint16_t Offset;
     uint16_t Size;
   };
+
 private:
   const MCRegisterDesc *Desc;                 // Pointer to the descriptor array
   unsigned NumRegs;                           // Number of entries in the array
@@ -191,12 +191,12 @@ public:
   /// Don't use this class directly, use one of the specialized sub-classes
   /// defined below.
   class DiffListIterator {
-    uint16_t Val;
-    const MCPhysReg *List;
+    uint16_t Val = 0;
+    const MCPhysReg *List = nullptr;
 
   protected:
     /// Create an invalid iterator. Call init() to point to something useful.
-    DiffListIterator() : Val(0), List(nullptr) {}
+    DiffListIterator() = default;
 
     /// init - Point the iterator to InitVal, decoding subsequent values from
     /// DiffList. The iterator will initially point to InitVal, sub-classes are
@@ -217,7 +217,6 @@ public:
     }
 
   public:
-
     /// isValid - returns true if this iterator is not yet at the end.
     bool isValid() const { return List; }
 
@@ -495,6 +494,7 @@ public:
 class MCSubRegIndexIterator {
   MCSubRegIterator SRIter;
   const uint16_t *SRIndex;
+
 public:
   /// Constructs an iterator that traverses subregisters and their
   /// associated subregister indices.
@@ -507,6 +507,7 @@ public:
   unsigned getSubReg() const {
     return *SRIter;
   }
+
   /// Returns sub-register index of the current sub-register.
   unsigned getSubRegIndex() const {
     return *SRIndex;
@@ -526,7 +527,8 @@ public:
 /// If IncludeSelf is set, Reg itself is included in the list.
 class MCSuperRegIterator : public MCRegisterInfo::DiffListIterator {
 public:
-  MCSuperRegIterator() {}
+  MCSuperRegIterator() = default;
+
   MCSuperRegIterator(unsigned Reg, const MCRegisterInfo *MCRI,
                      bool IncludeSelf = false) {
     init(Reg, MCRI->DiffLists + MCRI->get(Reg).SuperRegs);
@@ -563,7 +565,8 @@ class MCRegUnitIterator : public MCRegisterInfo::DiffListIterator {
 public:
   /// MCRegUnitIterator - Create an iterator that traverses the register units
   /// in Reg.
-  MCRegUnitIterator() {}
+  MCRegUnitIterator() = default;
+
   MCRegUnitIterator(unsigned Reg, const MCRegisterInfo *MCRI) {
     assert(Reg && "Null register has no regunits");
     // Decode the RegUnits MCRegisterDesc field.
@@ -589,8 +592,10 @@ public:
 class MCRegUnitMaskIterator {
   MCRegUnitIterator RUIter;
   const LaneBitmask *MaskListIter;
+
 public:
-  MCRegUnitMaskIterator() {}
+  MCRegUnitMaskIterator() = default;
+
   /// Constructs an iterator that traverses the register units and their
   /// associated LaneMasks in Reg.
   MCRegUnitMaskIterator(unsigned Reg, const MCRegisterInfo *MCRI)
@@ -625,10 +630,12 @@ public:
 
 /// MCRegUnitRootIterator enumerates the root registers of a register unit.
 class MCRegUnitRootIterator {
-  uint16_t Reg0;
-  uint16_t Reg1;
+  uint16_t Reg0 = 0;
+  uint16_t Reg1 = 0;
+
 public:
-  MCRegUnitRootIterator() : Reg0(0), Reg1(0) {}
+  MCRegUnitRootIterator() = default;
+
   MCRegUnitRootIterator(unsigned RegUnit, const MCRegisterInfo *MCRI) {
     assert(RegUnit < MCRI->getNumRegUnits() && "Invalid register unit");
     Reg0 = MCRI->RegUnitRoots[RegUnit][0];
@@ -665,11 +672,11 @@ private:
   MCRegUnitIterator RI;
   MCRegUnitRootIterator RRI;
   MCSuperRegIterator SI;
+
 public:
   MCRegAliasIterator(unsigned Reg, const MCRegisterInfo *MCRI,
                      bool IncludeSelf)
     : Reg(Reg), MCRI(MCRI), IncludeSelf(IncludeSelf) {
-
     // Initialize the iterators.
     for (RI = MCRegUnitIterator(Reg, MCRI); RI.isValid(); ++RI) {
       for (RRI = MCRegUnitRootIterator(*RI, MCRI); RRI.isValid(); ++RRI) {
@@ -684,7 +691,7 @@ public:
   bool isValid() const { return RI.isValid(); }
 
   unsigned operator*() const {
-    assert (SI.isValid() && "Cannot dereference an invalid iterator.");
+    assert(SI.isValid() && "Cannot dereference an invalid iterator.");
     return *SI;
   }
 
@@ -713,6 +720,6 @@ public:
   }
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCREGISTERINFO_H
diff --git a/include/llvm/MC/MCSection.h b/include/llvm/MC/MCSection.h
index 68d4a7f5509135377c47a5633f2f84bd3612d41f..2974d8f1b80b029d705f050588acef79ed2dc418 100644
--- a/include/llvm/MC/MCSection.h
+++ b/include/llvm/MC/MCSection.h
@@ -14,23 +14,21 @@
 #ifndef LLVM_MC_MCSECTION_H
 #define LLVM_MC_MCSECTION_H
 
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/ilist.h"
-#include "llvm/ADT/ilist_node.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCFragment.h"
 #include "llvm/MC/SectionKind.h"
-#include "llvm/Support/Compiler.h"
+#include <cassert>
+#include <utility>
 
 namespace llvm {
+
 class MCAsmInfo;
-class MCAssembler;
 class MCContext;
 class MCExpr;
-class MCFragment;
-class MCSection;
 class MCSymbol;
-class Triple;
 class raw_ostream;
+class Triple;
 
 template <> struct ilist_alloc_traits<MCFragment> {
   static void deleteNode(MCFragment *V);
@@ -40,7 +38,7 @@ template <> struct ilist_alloc_traits<MCFragment> {
 /// current translation unit.  The MCContext class uniques and creates these.
 class MCSection {
 public:
-  enum SectionVariant { SV_COFF = 0, SV_ELF, SV_MachO };
+  enum SectionVariant { SV_COFF = 0, SV_ELF, SV_MachO, SV_Wasm };
 
   /// \brief Express the state of bundle locked groups while emitting code.
   enum BundleLockStateType {
@@ -58,9 +56,6 @@ public:
   typedef FragmentListType::reverse_iterator reverse_iterator;
 
 private:
-  MCSection(const MCSection &) = delete;
-  void operator=(const MCSection &) = delete;
-
   MCSymbol *Begin;
   MCSymbol *End = nullptr;
   /// The alignment requirement of this section.
@@ -78,12 +73,12 @@ private:
 
   /// \brief We've seen a bundle_lock directive but not its first instruction
   /// yet.
-  unsigned BundleGroupBeforeFirstInst : 1;
+  bool BundleGroupBeforeFirstInst : 1;
 
   /// Whether this section has had instructions emitted into it.
-  unsigned HasInstructions : 1;
+  bool HasInstructions : 1;
 
-  unsigned IsRegistered : 1;
+  bool IsRegistered : 1;
 
   MCDummyFragment DummyFragment;
 
@@ -94,12 +89,16 @@ private:
   SmallVector<std::pair<unsigned, MCFragment *>, 1> SubsectionFragmentMap;
 
 protected:
-  MCSection(SectionVariant V, SectionKind K, MCSymbol *Begin);
   SectionVariant Variant;
   SectionKind Kind;
+
+  MCSection(SectionVariant V, SectionKind K, MCSymbol *Begin);
   ~MCSection();
 
 public:
+  MCSection(const MCSection &) = delete;
+  MCSection &operator=(const MCSection &) = delete;
+
   SectionKind getKind() const { return Kind; }
 
   SectionVariant getVariant() const { return Variant; }
@@ -185,4 +184,4 @@ public:
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCSECTION_H
diff --git a/include/llvm/MC/MCSectionCOFF.h b/include/llvm/MC/MCSectionCOFF.h
index 914008d4c3d2e8e7729a3fc5d27e87bf323fdc80..24b9f8898ebb48c02420a628ff515e0fcfb1a4f9 100644
--- a/include/llvm/MC/MCSectionCOFF.h
+++ b/include/llvm/MC/MCSectionCOFF.h
@@ -16,8 +16,11 @@
 
 #include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCSection.h"
+#include "llvm/MC/SectionKind.h"
+#include <cassert>
 
 namespace llvm {
+
 class MCSymbol;
 
 /// This represents a section on Windows
@@ -94,4 +97,4 @@ public:
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCSECTIONCOFF_H
diff --git a/include/llvm/MC/MCSectionELF.h b/include/llvm/MC/MCSectionELF.h
index e296de60c77749db8f5216966afb99834c144d5c..00c289c6bd6ebfbf83d01f2670361e7d102b5b1d 100644
--- a/include/llvm/MC/MCSectionELF.h
+++ b/include/llvm/MC/MCSectionELF.h
@@ -14,12 +14,10 @@
 #ifndef LLVM_MC_MCSECTIONELF_H
 #define LLVM_MC_MCSECTIONELF_H
 
-#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSymbolELF.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ELF.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/MC/SectionKind.h"
 
 namespace llvm {
 
@@ -47,17 +45,18 @@ class MCSectionELF final : public MCSection {
 
   const MCSymbolELF *Group;
 
-  /// Depending on the type of the section this is sh_link or sh_info.
-  const MCSectionELF *Associated;
+  /// Symbol whose section is sh_link for SHF_LINK_ORDER (can be null).
+  const MCSymbol *AssociatedSymbol;
 
 private:
   friend class MCContext;
+
   MCSectionELF(StringRef Section, unsigned type, unsigned flags, SectionKind K,
                unsigned entrySize, const MCSymbolELF *group, unsigned UniqueID,
-               MCSymbol *Begin, const MCSectionELF *Associated)
+               MCSymbol *Begin, const MCSymbolELF *AssociatedSymbol)
       : MCSection(SV_ELF, K, Begin), SectionName(Section), Type(type),
         Flags(flags), UniqueID(UniqueID), EntrySize(entrySize), Group(group),
-        Associated(Associated) {
+        AssociatedSymbol(AssociatedSymbol) {
     if (Group)
       Group->setIsSignature();
   }
@@ -87,7 +86,8 @@ public:
   bool isUnique() const { return UniqueID != ~0U; }
   unsigned getUniqueID() const { return UniqueID; }
 
-  const MCSectionELF *getAssociatedSection() const { return Associated; }
+  const MCSection *getAssociatedSection() const { return AssociatedSymbol ? &AssociatedSymbol->getSection() : nullptr; }
+  const MCSymbol *getAssociatedSymbol() const { return AssociatedSymbol; }
 
   static bool classof(const MCSection *S) {
     return S->getVariant() == SV_ELF;
@@ -96,4 +96,4 @@ public:
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCSECTIONELF_H
diff --git a/include/llvm/MC/MCSectionWasm.h b/include/llvm/MC/MCSectionWasm.h
new file mode 100644
index 0000000000000000000000000000000000000000..4e19196175c074703df2656bf6123c763f17435e
--- /dev/null
+++ b/include/llvm/MC/MCSectionWasm.h
@@ -0,0 +1,86 @@
+//===- MCSectionWasm.h - Wasm Machine Code Sections -------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares the MCSectionWasm class.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MC_MCSECTIONWASM_H
+#define LLVM_MC_MCSECTIONWASM_H
+
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+namespace llvm {
+
+class MCSymbol;
+
+/// MCSection subclass (variant SV_Wasm) representing a section on wasm.
+class MCSectionWasm final : public MCSection {
+  /// This is the name of the section.  The referenced memory is owned by
+  /// TargetLoweringObjectFileWasm's WasmUniqueMap.
+  StringRef SectionName;
+
+  /// Section type, as passed to the constructor and returned by getType().
+  unsigned Type;
+
+  /// Section flags; returned by getFlags() and replaceable via setFlags().
+  unsigned Flags;
+
+  unsigned UniqueID; // ~0U means "not unique" (see isUnique()).
+
+  const MCSymbolWasm *Group;
+
+  // The offset of the MC function section in the wasm code section.
+  uint64_t SectionOffset;
+
+private:
+  friend class MCContext; // The constructor below is private.
+  MCSectionWasm(StringRef Section, unsigned type, unsigned flags, SectionKind K,
+                const MCSymbolWasm *group, unsigned UniqueID, MCSymbol *Begin)
+      : MCSection(SV_Wasm, K, Begin), SectionName(Section), Type(type),
+        Flags(flags), UniqueID(UniqueID), Group(group), SectionOffset(0) {
+  }
+
+  void setSectionName(StringRef Name) { SectionName = Name; }
+
+public:
+  ~MCSectionWasm();
+
+  /// Decides whether a '.section' directive should be printed before the
+  /// section name.
+  bool ShouldOmitSectionDirective(StringRef Name, const MCAsmInfo &MAI) const;
+
+  StringRef getSectionName() const { return SectionName; }
+  unsigned getType() const { return Type; }
+  unsigned getFlags() const { return Flags; }
+  void setFlags(unsigned F) { Flags = F; }
+  const MCSymbolWasm *getGroup() const { return Group; }
+
+  void PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
+                            raw_ostream &OS,
+                            const MCExpr *Subsection) const override;
+  bool UseCodeAlign() const override;
+  bool isVirtualSection() const override;
+
+  bool isUnique() const { return UniqueID != ~0U; }
+  unsigned getUniqueID() const { return UniqueID; }
+
+  uint64_t getSectionOffset() const { return SectionOffset; }
+  void setSectionOffset(uint64_t Offset) { SectionOffset = Offset; }
+
+  static bool classof(const MCSection *S) { return S->getVariant() == SV_Wasm; }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_MC_MCSECTIONWASM_H
diff --git a/include/llvm/MC/MCStreamer.h b/include/llvm/MC/MCStreamer.h
index 41f00a24dfbfb03ff161e31bbd27c5e4356a6d98..c0d322e3ed3acf92ed96c7b8fba1a3028d80ad4a 100644
--- a/include/llvm/MC/MCStreamer.h
+++ b/include/llvm/MC/MCStreamer.h
@@ -15,17 +15,26 @@
 #define LLVM_MC_MCSTREAMER_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCLinkerOptimizationHint.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCWinEH.h"
-#include "llvm/Support/DataTypes.h"
 #include "llvm/Support/SMLoc.h"
+#include <cassert>
+#include <cstdint>
+#include <memory>
 #include <string>
+#include <utility>
+#include <vector>
 
 namespace llvm {
+
+class AssemblerConstantPools;
+class formatted_raw_ostream;
 class MCAsmBackend;
 class MCCodeEmitter;
 class MCContext;
@@ -34,14 +43,11 @@ class MCInst;
 class MCInstPrinter;
 class MCSection;
 class MCStreamer;
-class MCSymbolELF;
 class MCSymbolRefExpr;
+class MCSymbolWasm;
 class MCSubtargetInfo;
-class StringRef;
-class Twine;
 class raw_ostream;
-class formatted_raw_ostream;
-class AssemblerConstantPools;
+class Twine;
 
 typedef std::pair<MCSection *, const MCExpr *> MCSectionSubPair;
 
@@ -162,9 +168,6 @@ class MCStreamer {
   MCContext &Context;
   std::unique_ptr<MCTargetStreamer> TargetStreamer;
 
-  MCStreamer(const MCStreamer &) = delete;
-  MCStreamer &operator=(const MCStreamer &) = delete;
-
   std::vector<MCDwarfFrameInfo> DwarfFrameInfos;
   MCDwarfFrameInfo *getCurrentDwarfFrameInfo();
   void EnsureValidDwarfFrame();
@@ -205,6 +208,8 @@ protected:
   virtual void EmitRawTextImpl(StringRef String);
 
 public:
+  MCStreamer(const MCStreamer &) = delete;
+  MCStreamer &operator=(const MCStreamer &) = delete;
   virtual ~MCStreamer();
 
   void visitUsedExpr(const MCExpr &Expr);
@@ -282,6 +287,7 @@ public:
   /// \brief Add explicit comment T. T is required to be a valid
   /// comment in the output and does not need to be escaped.
   virtual void addExplicitComment(const Twine &T);
+
   /// \brief Emit added explicit comments.
   virtual void emitExplicitComments();
 
@@ -393,7 +399,7 @@ public:
   /// used in an assignment.
   // FIXME: These emission are non-const because we mutate the symbol to
   // add the section we're emitting it to later.
-  virtual void EmitLabel(MCSymbol *Symbol);
+  virtual void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc());
 
   virtual void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol);
 
@@ -483,6 +489,14 @@ public:
   ///  .size symbol, expression
   virtual void emitELFSize(MCSymbol *Symbol, const MCExpr *Value);
 
+  /// \brief Emit an ELF .symver directive.
+  ///
+  /// This corresponds to an assembler statement such as:
+  ///  .symver _start, foo@@SOME_VERSION
+  /// \param Alias - The versioned alias (i.e. "foo@@SOME_VERSION")
+  /// \param Aliasee - The aliased symbol (i.e. "_start")
+  virtual void emitELFSymverDirective(MCSymbol *Alias, const MCSymbol *Aliasee);
+
   /// \brief Emit a Linker Optimization Hint (LOH) directive.
   /// \param Args - Arguments of the LOH.
   virtual void EmitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) {}
@@ -876,6 +890,7 @@ MCStreamer *createAsmStreamer(MCContext &Ctx,
                               bool isVerboseAsm, bool useDwarfDirectory,
                               MCInstPrinter *InstPrint, MCCodeEmitter *CE,
                               MCAsmBackend *TAB, bool ShowInst);
+
 } // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCSTREAMER_H
diff --git a/include/llvm/MC/MCSubtargetInfo.h b/include/llvm/MC/MCSubtargetInfo.h
index 5ede043fa2eea77f382db2045aac008674d138bb..bbdac8fad5f56044c1b11d59f95c23f0e53e6c71 100644
--- a/include/llvm/MC/MCSubtargetInfo.h
+++ b/include/llvm/MC/MCSubtargetInfo.h
@@ -1,4 +1,4 @@
-//==-- llvm/MC/MCSubtargetInfo.h - Subtarget Information ---------*- C++ -*-==//
+//===- llvm/MC/MCSubtargetInfo.h - Subtarget Information --------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,14 +15,18 @@
 #define LLVM_MC_MCSUBTARGETINFO_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/MC/MCSchedule.h"
 #include "llvm/MC/SubtargetFeature.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
 #include <string>
 
 namespace llvm {
 
-class StringRef;
-
 //===----------------------------------------------------------------------===//
 ///
 /// MCSubtargetInfo - Generic base class for all target subtargets.
@@ -45,10 +49,6 @@ class MCSubtargetInfo {
   const unsigned *ForwardingPaths;     // Forwarding paths
   FeatureBitset FeatureBits;           // Feature bits for current CPU + FS
 
-  MCSubtargetInfo() = delete;
-  MCSubtargetInfo &operator=(MCSubtargetInfo &&) = delete;
-  MCSubtargetInfo &operator=(const MCSubtargetInfo &) = delete;
-
 public:
   MCSubtargetInfo(const MCSubtargetInfo &) = default;
   MCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS,
@@ -58,6 +58,9 @@ public:
                   const MCWriteProcResEntry *WPR, const MCWriteLatencyEntry *WL,
                   const MCReadAdvanceEntry *RA, const InstrStage *IS,
                   const unsigned *OC, const unsigned *FP);
+  MCSubtargetInfo() = delete;
+  MCSubtargetInfo &operator=(const MCSubtargetInfo &) = delete;
+  MCSubtargetInfo &operator=(MCSubtargetInfo &&) = delete;
 
   /// getTargetTriple - Return the target triple string.
   const Triple &getTargetTriple() const { return TargetTriple; }
@@ -166,6 +169,6 @@ public:
   }
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCSUBTARGETINFO_H
diff --git a/include/llvm/MC/MCSymbol.h b/include/llvm/MC/MCSymbol.h
index 23e34b7869a56e0ad08b1f98cf2614ee1eef09e0..e8432afd8627fb1165bb62b63b0c92b4ce9b8d67 100644
--- a/include/llvm/MC/MCSymbol.h
+++ b/include/llvm/MC/MCSymbol.h
@@ -15,18 +15,21 @@
 #define LLVM_MC_MCSYMBOL_H
 
 #include "llvm/ADT/PointerIntPair.h"
-#include "llvm/ADT/PointerUnion.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCFragment.h"
-#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
 
 namespace llvm {
+
 class MCAsmInfo;
+class MCContext;
 class MCExpr;
-class MCSymbol;
-class MCFragment;
 class MCSection;
-class MCContext;
 class raw_ostream;
 
 /// MCSymbol - Instances of this class represent a symbol name in the MC file,
@@ -45,6 +48,7 @@ protected:
     SymbolKindCOFF,
     SymbolKindELF,
     SymbolKindMachO,
+    SymbolKindWasm,
   };
 
   /// A symbol can contain an Offset, or Value, or be Common, but never more
@@ -97,7 +101,7 @@ protected:
 
   /// LLVM RTTI discriminator. This is actually a SymbolKind enumerator, but is
   /// unsigned to avoid sign extension and achieve better bitpacking with MSVC.
-  unsigned Kind : 2;
+  unsigned Kind : 3;
 
   /// True if we have created a relocation that uses this symbol.
   mutable unsigned IsUsedInReloc : 1;
@@ -133,7 +137,7 @@ protected:
     const MCExpr *Value;
   };
 
-protected: // MCContext creates and uniques these.
+  // MCContext creates and uniques these.
   friend class MCExpr;
   friend class MCContext;
 
@@ -163,7 +167,6 @@ protected: // MCContext creates and uniques these.
                      MCContext &Ctx);
 
 private:
-
   void operator delete(void *);
   /// \brief Placement delete - required by std, but never called.
   void operator delete(void*, unsigned) {
@@ -174,8 +177,6 @@ private:
     llvm_unreachable("Constructor throws?");
   }
 
-  MCSymbol(const MCSymbol &) = delete;
-  void operator=(const MCSymbol &) = delete;
   MCSection *getSectionPtr(bool SetUsed = true) const {
     if (MCFragment *F = getFragment(SetUsed)) {
       assert(F != AbsolutePseudoFragment);
@@ -195,6 +196,9 @@ private:
   }
 
 public:
+  MCSymbol(const MCSymbol &) = delete;
+  MCSymbol &operator=(const MCSymbol &) = delete;
+
   /// getName - Get the symbol name.
   StringRef getName() const {
     if (!FragmentAndHasName.getInt())
@@ -281,6 +285,8 @@ public:
 
   bool isMachO() const { return Kind == SymbolKindMachO; }
 
+  bool isWasm() const { return Kind == SymbolKindWasm; }
+
   /// @}
   /// \name Variable Symbols
   /// @{
@@ -416,6 +422,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const MCSymbol &Sym) {
   Sym.print(OS, nullptr);
   return OS;
 }
+
 } // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCSYMBOL_H
diff --git a/include/llvm/MC/MCSymbolCOFF.h b/include/llvm/MC/MCSymbolCOFF.h
index 2172c67981c0234e9ffebbba3afdcab5fbbec5d9..7918c353dc1546f310174e2aa2959b2816050a4f 100644
--- a/include/llvm/MC/MCSymbolCOFF.h
+++ b/include/llvm/MC/MCSymbolCOFF.h
@@ -6,16 +6,18 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
+
 #ifndef LLVM_MC_MCSYMBOLCOFF_H
 #define LLVM_MC_MCSYMBOLCOFF_H
 
 #include "llvm/MC/MCSymbol.h"
+#include <cstdint>
 
 namespace llvm {
-class MCSymbolCOFF : public MCSymbol {
 
+class MCSymbolCOFF : public MCSymbol {
   /// This corresponds to the e_type field of the COFF symbol.
-  mutable uint16_t Type;
+  mutable uint16_t Type = 0;
 
   enum SymbolFlags : uint16_t {
     SF_ClassMask = 0x00FF,
@@ -27,7 +29,7 @@ class MCSymbolCOFF : public MCSymbol {
 
 public:
   MCSymbolCOFF(const StringMapEntry<bool> *Name, bool isTemporary)
-      : MCSymbol(SymbolKindCOFF, Name, isTemporary), Type(0) {}
+      : MCSymbol(SymbolKindCOFF, Name, isTemporary) {}
 
   uint16_t getType() const {
     return Type;
@@ -59,6 +61,7 @@ public:
 
   static bool classof(const MCSymbol *S) { return S->isCOFF(); }
 };
-}
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_MC_MCSYMBOLCOFF_H
diff --git a/include/llvm/MC/MCSymbolWasm.h b/include/llvm/MC/MCSymbolWasm.h
new file mode 100644
index 0000000000000000000000000000000000000000..4445be006eb0d268ee630d55d4d21aaf7fe02727
--- /dev/null
+++ b/include/llvm/MC/MCSymbolWasm.h
@@ -0,0 +1,57 @@
+//===- MCSymbolWasm.h - Wasm symbols ----------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_MC_MCSYMBOLWASM_H
+#define LLVM_MC_MCSYMBOLWASM_H
+
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Wasm.h"
+
+namespace llvm {
+class MCSymbolWasm : public MCSymbol {
+private:
+  bool IsFunction = false;
+  std::string ModuleName;
+  SmallVector<wasm::ValType, 1> Returns;
+  SmallVector<wasm::ValType, 4> Params;
+
+  /// An expression describing how to calculate the size of a symbol. If a
+  /// symbol has no size this field will be NULL.
+  const MCExpr *SymbolSize = nullptr;
+
+public:
+  // Use a module name of "env" for now, for compatibility with existing tools.
+  // This is temporary, and may change, as the ABI is not yet stable.
+  MCSymbolWasm(const StringMapEntry<bool> *Name, bool isTemporary)
+      : MCSymbol(SymbolKindWasm, Name, isTemporary),
+        ModuleName("env") {}
+  static bool classof(const MCSymbol *S) { return S->isWasm(); }
+
+  const MCExpr *getSize() const { return SymbolSize; }
+  void setSize(const MCExpr *SS) { SymbolSize = SS; }
+
+  bool isFunction() const { return IsFunction; }
+  void setIsFunction(bool isFunc) { IsFunction = isFunc; }
+
+  const StringRef getModuleName() const { return ModuleName; }
+
+  const SmallVector<wasm::ValType, 1> &getReturns() const { return Returns; }
+
+  void setReturns(SmallVectorImpl<wasm::ValType> &&Rets) {
+    Returns = std::move(Rets);
+  }
+
+  const SmallVector<wasm::ValType, 4> &getParams() const { return Params; }
+
+  void setParams(SmallVectorImpl<wasm::ValType> &&Pars) {
+    Params = std::move(Pars);
+  }
+};
+}
+
+#endif
diff --git a/include/llvm/MC/MCTargetOptions.h b/include/llvm/MC/MCTargetOptions.h
index 25642379ac9f1a3da4b35e323b7d1e4ceb022c40..06f58d49803081414653a40f4a4b01407b5e83b0 100644
--- a/include/llvm/MC/MCTargetOptions.h
+++ b/include/llvm/MC/MCTargetOptions.h
@@ -1,4 +1,4 @@
-//===- MCTargetOptions.h - MC Target Options -------------------*- C++ -*-===//
+//===- MCTargetOptions.h - MC Target Options --------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -51,12 +51,8 @@ public:
   /// Preserve Comments in Assembly.
   bool PreserveAsmComments : 1;
 
-  int DwarfVersion;
+  int DwarfVersion = 0;
 
-  /// getABIName - If this returns a non-empty string this represents the
-  /// textual name of the ABI that we want the backend to use, e.g. o32, or
-  /// aapcs-linux.
-  StringRef getABIName() const;
   std::string ABIName;
 
   /// Additional paths to search for `.include` directives when using the
@@ -64,33 +60,13 @@ public:
   std::vector<std::string> IASSearchPaths;
 
   MCTargetOptions();
-};
 
-inline bool operator==(const MCTargetOptions &LHS, const MCTargetOptions &RHS) {
-#define ARE_EQUAL(X) LHS.X == RHS.X
-  return (ARE_EQUAL(SanitizeAddress) &&
-          ARE_EQUAL(MCRelaxAll) &&
-          ARE_EQUAL(MCNoExecStack) &&
-          ARE_EQUAL(MCFatalWarnings) &&
-          ARE_EQUAL(MCNoWarn) &&
-          ARE_EQUAL(MCNoDeprecatedWarn) &&
-          ARE_EQUAL(MCSaveTempLabels) &&
-          ARE_EQUAL(MCUseDwarfDirectory) &&
-          ARE_EQUAL(MCIncrementalLinkerCompatible) &&
-          ARE_EQUAL(MCPIECopyRelocations) &&
-          ARE_EQUAL(ShowMCEncoding) &&
-          ARE_EQUAL(ShowMCInst) &&
-          ARE_EQUAL(AsmVerbose) &&
-          ARE_EQUAL(DwarfVersion) &&
-          ARE_EQUAL(ABIName) &&
-          ARE_EQUAL(IASSearchPaths));
-#undef ARE_EQUAL
-}
-
-inline bool operator!=(const MCTargetOptions &LHS, const MCTargetOptions &RHS) {
-  return !(LHS == RHS);
-}
+  /// getABIName - If this returns a non-empty string this represents the
+  /// textual name of the ABI that we want the backend to use, e.g. o32, or
+  /// aapcs-linux.
+  StringRef getABIName() const;
+};
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCTARGETOPTIONS_H
diff --git a/include/llvm/MC/MCWasmObjectWriter.h b/include/llvm/MC/MCWasmObjectWriter.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e458eaac9c83d5e2ac03196106b44662a41f90e
--- /dev/null
+++ b/include/llvm/MC/MCWasmObjectWriter.h
@@ -0,0 +1,85 @@
+//===-- llvm/MC/MCWasmObjectWriter.h - Wasm Object Writer -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MC_MCWASMOBJECTWRITER_H
+#define LLVM_MC_MCWASMOBJECTWRITER_H
+
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/raw_ostream.h"
+#include <vector>
+
+namespace llvm {
+class MCAssembler;
+class MCContext;
+class MCFixup;
+class MCFragment;
+class MCObjectWriter;
+class MCSectionWasm;
+class MCSymbol;
+class MCSymbolWasm;
+class MCValue;
+class raw_pwrite_stream;
+
+// Information about a single relocation.
+struct WasmRelocationEntry {
+  uint64_t Offset;            // Where is the relocation.
+  const MCSymbolWasm *Symbol; // The symbol to relocate with.
+  uint64_t Addend;            // A value to add to the symbol.
+  unsigned Type;              // The type of the relocation.
+  MCSectionWasm *FixupSection;// The section the relocation is targeting.
+
+  WasmRelocationEntry(uint64_t Offset, const MCSymbolWasm *Symbol,
+                      uint64_t Addend, unsigned Type,
+                      MCSectionWasm *FixupSection)
+      : Offset(Offset), Symbol(Symbol), Addend(Addend), Type(Type),
+        FixupSection(FixupSection) {}
+
+  void print(raw_ostream &Out) const {
+    Out << "Off=" << Offset << ", Sym=" << Symbol << ", Addend=" << Addend
+        << ", Type=" << Type << ", FixupSection=" << FixupSection;
+  }
+  void dump() const { print(errs()); }
+};
+
+class MCWasmObjectTargetWriter {
+  const unsigned Is64Bit : 1;
+
+protected:
+  explicit MCWasmObjectTargetWriter(bool Is64Bit_);
+
+public:
+  virtual ~MCWasmObjectTargetWriter() {}
+
+  virtual unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                                const MCFixup &Fixup, bool IsPCRel) const = 0;
+
+  virtual bool needsRelocateWithSymbol(const MCSymbol &Sym,
+                                       unsigned Type) const;
+
+  virtual void sortRelocs(const MCAssembler &Asm,
+                          std::vector<WasmRelocationEntry> &Relocs);
+
+  /// \name Accessors
+  /// @{
+  bool is64Bit() const { return Is64Bit; }
+  /// @}
+};
+
+/// \brief Construct a new Wasm writer instance.
+///
+/// \param MOTW - The target specific Wasm writer subclass.
+/// \param OS - The stream to write to.
+/// \returns The constructed object writer.
+MCObjectWriter *createWasmObjectWriter(MCWasmObjectTargetWriter *MOTW,
+                                       raw_pwrite_stream &OS);
+} // end namespace llvm
+
+#endif
diff --git a/include/llvm/MC/MCWasmStreamer.h b/include/llvm/MC/MCWasmStreamer.h
new file mode 100644
index 0000000000000000000000000000000000000000..bdd6f103cd445beee9677f94b45a0cbf9ad3b797
--- /dev/null
+++ b/include/llvm/MC/MCWasmStreamer.h
@@ -0,0 +1,83 @@
+//===- MCWasmStreamer.h - MCStreamer Wasm Object File Interface -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_MC_MCWASMSTREAMER_H
+#define LLVM_MC_MCWASMSTREAMER_H
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Support/DataTypes.h"
+
+namespace llvm {
+class MCAsmBackend;
+class MCAssembler;
+class MCCodeEmitter;
+class MCExpr;
+class MCInst;
+class raw_ostream;
+
+class MCWasmStreamer : public MCObjectStreamer {
+public:
+  MCWasmStreamer(MCContext &Context, MCAsmBackend &TAB, raw_pwrite_stream &OS,
+                 MCCodeEmitter *Emitter)
+      : MCObjectStreamer(Context, TAB, OS, Emitter), SeenIdent(false) {}
+
+  ~MCWasmStreamer() override;
+
+  /// state management
+  void reset() override {
+    SeenIdent = false;
+    MCObjectStreamer::reset();
+  }
+
+  /// \name MCStreamer Interface
+  /// @{
+
+  void ChangeSection(MCSection *Section, const MCExpr *Subsection) override;
+  void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
+  void EmitThumbFunc(MCSymbol *Func) override;
+  void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) override;
+  bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override;
+  void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override;
+  void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                        unsigned ByteAlignment) override;
+
+  void emitELFSize(MCSymbol *Symbol, const MCExpr *Value) override;
+
+  void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+                             unsigned ByteAlignment) override;
+
+  void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
+                    uint64_t Size = 0, unsigned ByteAlignment = 0) override;
+  void EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
+                      unsigned ByteAlignment = 0) override;
+  void EmitValueImpl(const MCExpr *Value, unsigned Size,
+                     SMLoc Loc = SMLoc()) override;
+
+  void EmitIdent(StringRef IdentString) override;
+
+  void EmitValueToAlignment(unsigned, int64_t, unsigned, unsigned) override;
+
+  void FinishImpl() override;
+
+private:
+  void EmitInstToFragment(const MCInst &Inst, const MCSubtargetInfo &) override;
+  void EmitInstToData(const MCInst &Inst, const MCSubtargetInfo &) override;
+
+  /// \brief Merge the content of the fragment \p EF into the fragment \p DF.
+  void mergeFragment(MCDataFragment *, MCDataFragment *);
+
+  bool SeenIdent;
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/include/llvm/MC/MCWinCOFFObjectWriter.h b/include/llvm/MC/MCWinCOFFObjectWriter.h
index e2e95c7df7101c697ccf75b4cd826810441dc9d9..57bed213aad47102c66cd9fbffcb53d0e0fe6526 100644
--- a/include/llvm/MC/MCWinCOFFObjectWriter.h
+++ b/include/llvm/MC/MCWinCOFFObjectWriter.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCWinCOFFObjectWriter.h - Win COFF Object Writer *- C++ -*-===//
+//===- llvm/MC/MCWinCOFFObjectWriter.h - Win COFF Object Writer -*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,22 +11,23 @@
 #define LLVM_MC_MCWINCOFFOBJECTWRITER_H
 
 namespace llvm {
+
 class MCAsmBackend;
 class MCFixup;
 class MCObjectWriter;
 class MCValue;
-class raw_ostream;
 class raw_pwrite_stream;
 
   class MCWinCOFFObjectTargetWriter {
     virtual void anchor();
+
     const unsigned Machine;
 
   protected:
     MCWinCOFFObjectTargetWriter(unsigned Machine_);
 
   public:
-    virtual ~MCWinCOFFObjectTargetWriter() {}
+    virtual ~MCWinCOFFObjectTargetWriter() = default;
 
     unsigned getMachine() const { return Machine; }
     virtual unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
@@ -42,6 +43,6 @@ class raw_pwrite_stream;
   /// \returns The constructed object writer.
   MCObjectWriter *createWinCOFFObjectWriter(MCWinCOFFObjectTargetWriter *MOTW,
                                             raw_pwrite_stream &OS);
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MCWINCOFFOBJECTWRITER_H
diff --git a/include/llvm/MC/MCWinCOFFStreamer.h b/include/llvm/MC/MCWinCOFFStreamer.h
index 63e44f2e67d6791cb8e7a0d7b13a5a3c2cfcd1e4..84e60b85be6a46b268d7868c58d437a99d7699b8 100644
--- a/include/llvm/MC/MCWinCOFFStreamer.h
+++ b/include/llvm/MC/MCWinCOFFStreamer.h
@@ -14,16 +14,15 @@
 #include "llvm/MC/MCObjectStreamer.h"
 
 namespace llvm {
+
 class MCAsmBackend;
 class MCContext;
 class MCCodeEmitter;
-class MCExpr;
 class MCInst;
 class MCSection;
 class MCSubtargetInfo;
 class MCSymbol;
 class StringRef;
-class raw_ostream;
 class raw_pwrite_stream;
 
 class MCWinCOFFStreamer : public MCObjectStreamer {
@@ -41,7 +40,7 @@ public:
   /// \{
 
   void InitSections(bool NoExecStack) override;
-  void EmitLabel(MCSymbol *Symbol) override;
+  void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
   void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
   void EmitThumbFunc(MCSymbol *Func) override;
   bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override;
@@ -61,7 +60,6 @@ public:
                     unsigned ByteAlignment) override;
   void EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
                       unsigned ByteAlignment) override;
-  void EmitFileDirective(StringRef Filename) override;
   void EmitIdent(StringRef IdentString) override;
   void EmitWinEHHandlerData() override;
   void FinishImpl() override;
@@ -70,12 +68,13 @@ public:
 
 protected:
   const MCSymbol *CurSymbol;
+
   void EmitInstToData(const MCInst &Inst, const MCSubtargetInfo &STI) override;
 
 private:
   void Error(const Twine &Msg) const;
 };
-}
 
-#endif
+} // end namespace llvm
 
+#endif // LLVM_MC_MCWINCOFFSTREAMER_H
diff --git a/include/llvm/MC/MachineLocation.h b/include/llvm/MC/MachineLocation.h
index 4b5cf435779336c009dc586ec496e5ea0f47ba59..f4fc6ee2fd19cc85537dfdbcb2bc43b5df23128e 100644
--- a/include/llvm/MC/MachineLocation.h
+++ b/include/llvm/MC/MachineLocation.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MachineLocation.h -------------------------------*- C++ -*-===//
+//===- llvm/MC/MachineLocation.h --------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,35 +12,31 @@
 // explicitly passing an offset to the constructor.
 //===----------------------------------------------------------------------===//
 
-
 #ifndef LLVM_MC_MACHINELOCATION_H
 #define LLVM_MC_MACHINELOCATION_H
 
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/DataTypes.h"
+#include <cstdint>
 
 namespace llvm {
-  class MCSymbol;
 
 class MachineLocation {
 private:
-  bool IsRegister;                      // True if location is a register.
-  unsigned Register;                    // gcc/gdb register number.
-  int Offset;                           // Displacement if not register.
+  bool IsRegister = false;              // True if location is a register.
+  unsigned Register = 0;                // gcc/gdb register number.
+  int Offset = 0;                       // Displacement if not register.
+
 public:
   enum : uint32_t {
     // The target register number for an abstract frame pointer. The value is
     // an arbitrary value that doesn't collide with any real target register.
     VirtualFP = ~0U
   };
-  MachineLocation()
-    : IsRegister(false), Register(0), Offset(0) {}
+
+  MachineLocation() = default;
   /// Create a direct register location.
-  explicit MachineLocation(unsigned R)
-    : IsRegister(true), Register(R), Offset(0) {}
+  explicit MachineLocation(unsigned R) : IsRegister(true), Register(R) {}
   /// Create a register-indirect location with an offset.
-  MachineLocation(unsigned R, int O)
-    : IsRegister(false), Register(R), Offset(O) {}
+  MachineLocation(unsigned R, int O) : Register(R), Offset(O) {}
 
   bool operator==(const MachineLocation &Other) const {
       return IsRegister == Other.IsRegister && Register == Other.Register &&
@@ -56,12 +52,14 @@ public:
   void setIsRegister(bool Is)  { IsRegister = Is; }
   void setRegister(unsigned R) { Register = R; }
   void setOffset(int O)        { Offset = O; }
+
   /// Make this location a direct register location.
   void set(unsigned R) {
     IsRegister = true;
     Register = R;
     Offset = 0;
   }
+
   /// Make this location a register-indirect+offset location.
   void set(unsigned R, int O) {
     IsRegister = false;
@@ -74,6 +72,6 @@ inline bool operator!=(const MachineLocation &LHS, const MachineLocation &RHS) {
   return !(LHS == RHS);
 }
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_MACHINELOCATION_H
diff --git a/include/llvm/MC/StringTableBuilder.h b/include/llvm/MC/StringTableBuilder.h
index 7da444f7bfb164381ee4e6ed7442d29c02e30908..0df3fcd63723f1867dfb6214d7324c217388e8af 100644
--- a/include/llvm/MC/StringTableBuilder.h
+++ b/include/llvm/MC/StringTableBuilder.h
@@ -1,4 +1,4 @@
-//===-- StringTableBuilder.h - String table building utility ------*- C++ -*-=//
+//===- StringTableBuilder.h - String table building utility -----*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,9 +12,12 @@
 
 #include "llvm/ADT/CachedHashString.h"
 #include "llvm/ADT/DenseMap.h"
-#include <cassert>
+#include "llvm/ADT/StringRef.h"
+#include <cstddef>
+#include <cstdint>
 
 namespace llvm {
+
 class raw_ostream;
 
 /// \brief Utility for building string tables with deduplicated suffixes.
@@ -67,6 +70,6 @@ private:
   bool isFinalized() const { return Finalized; }
 };
 
-} // end llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_STRINGTABLEBUILDER_H
diff --git a/include/llvm/MC/SubtargetFeature.h b/include/llvm/MC/SubtargetFeature.h
index ed4abd772821f58944929f89ac10b1f93987a4f8..cb036671b752454dca2ceef3f0e505feb099bfcf 100644
--- a/include/llvm/MC/SubtargetFeature.h
+++ b/include/llvm/MC/SubtargetFeature.h
@@ -1,4 +1,4 @@
-//===-- llvm/MC/SubtargetFeature.h - CPU characteristics --------*- C++ -*-===//
+//===- llvm/MC/SubtargetFeature.h - CPU characteristics ---------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,126 +7,124 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file defines and manages user or tool specified CPU characteristics.
-// The intent is to be able to package specific features that should or should
-// not be used on a specific target processor.  A tool, such as llc, could, as
-// as example, gather chip info from the command line, a long with features
-// that should be used on that chip.
+/// \file Defines and manages user or tool specified CPU characteristics.
+/// The intent is to be able to package specific features that should or should
+/// not be used on a specific target processor.  A tool, such as llc, could, as
+/// an example, gather chip info from the command line, along with features
+/// that should be used on that chip.
 //
 //===----------------------------------------------------------------------===//
 
 #ifndef LLVM_MC_SUBTARGETFEATURE_H
 #define LLVM_MC_SUBTARGETFEATURE_H
 
-#include "llvm/ADT/Triple.h"
-#include "llvm/Support/DataTypes.h"
+#include "llvm/ADT/StringRef.h"
 #include <bitset>
+#include <initializer_list>
+#include <string>
 #include <vector>
 
 namespace llvm {
+
 template <typename T> class ArrayRef;
-  class raw_ostream;
-  class StringRef;
+class raw_ostream;
+class Triple;
 
-// A container class for subtarget features.
-// This is convenient because std::bitset does not have a constructor
-// with an initializer list of set bits.
-const unsigned MAX_SUBTARGET_FEATURES = 128;
+const unsigned MAX_SUBTARGET_FEATURES = 192;
+/// Container class for subtarget features.
+/// This is convenient because std::bitset does not have a constructor
+/// with an initializer list of set bits.
 class FeatureBitset : public std::bitset<MAX_SUBTARGET_FEATURES> {
 public:
   // Cannot inherit constructors because it's not supported by VC++..
-  FeatureBitset() : bitset() {}
+  FeatureBitset() = default;
 
   FeatureBitset(const bitset<MAX_SUBTARGET_FEATURES>& B) : bitset(B) {}
 
-  FeatureBitset(std::initializer_list<unsigned> Init) : bitset() {
+  FeatureBitset(std::initializer_list<unsigned> Init) {
     for (auto I : Init)
       set(I);
   }
 };
 
 //===----------------------------------------------------------------------===//
-///
-/// SubtargetFeatureKV - Used to provide key value pairs for feature and
-/// CPU bit flags.
-//
+
+/// Used to provide key value pairs for feature and CPU bit flags.
 struct SubtargetFeatureKV {
-  const char *Key;                      // K-V key string
-  const char *Desc;                     // Help descriptor
-  FeatureBitset Value;                  // K-V integer value
-  FeatureBitset Implies;                // K-V bit mask
+  const char *Key;                      ///< K-V key string
+  const char *Desc;                     ///< Help descriptor
+  FeatureBitset Value;                  ///< K-V integer value
+  FeatureBitset Implies;                ///< K-V bit mask
 
-  // Compare routine for std::lower_bound
+  /// Compare routine for std::lower_bound
   bool operator<(StringRef S) const {
     return StringRef(Key) < S;
   }
 
-  // Compare routine for std::is_sorted.
+  /// Compare routine for std::is_sorted.
   bool operator<(const SubtargetFeatureKV &Other) const {
     return StringRef(Key) < StringRef(Other.Key);
   }
 };
 
 //===----------------------------------------------------------------------===//
-///
-/// SubtargetInfoKV - Used to provide key value pairs for CPU and arbitrary
-/// pointers.
-//
+
+/// Used to provide key value pairs for CPU and arbitrary pointers.
 struct SubtargetInfoKV {
-  const char *Key;                      // K-V key string
-  const void *Value;                    // K-V pointer value
+  const char *Key;                      ///< K-V key string
+  const void *Value;                    ///< K-V pointer value
 
-  // Compare routine for std::lower_bound
+  /// Compare routine for std::lower_bound
   bool operator<(StringRef S) const {
     return StringRef(Key) < S;
   }
 };
 
 //===----------------------------------------------------------------------===//
+
+/// Manages the enabling and disabling of subtarget specific features.
 ///
-/// SubtargetFeatures - Manages the enabling and disabling of subtarget
-/// specific features.  Features are encoded as a string of the form
+/// Features are encoded as a string of the form
 ///   "+attr1,+attr2,-attr3,...,+attrN"
 /// A comma separates each feature from the next (all lowercase.)
 /// Each of the remaining features is prefixed with + or - indicating whether
 /// that feature should be enabled or disabled contrary to the cpu
 /// specification.
-///
-
 class SubtargetFeatures {
-  std::vector<std::string> Features;    // Subtarget features as a vector
+  std::vector<std::string> Features;    ///< Subtarget features as a vector
+
 public:
   explicit SubtargetFeatures(StringRef Initial = "");
 
-  /// Features string accessors.
+  /// Returns features as a string.
   std::string getString() const;
 
-  /// Adding Features.
+  /// Adds Features.
   void AddFeature(StringRef String, bool Enable = true);
 
-  /// ToggleFeature - Toggle a feature and update the feature bits.
+  /// Toggles a feature and updates the feature bits.
   static void ToggleFeature(FeatureBitset &Bits, StringRef String,
                             ArrayRef<SubtargetFeatureKV> FeatureTable);
 
-  /// Apply the feature flag and update the feature bits.
+  /// Applies the feature flag and updates the feature bits.
   static void ApplyFeatureFlag(FeatureBitset &Bits, StringRef Feature,
                                ArrayRef<SubtargetFeatureKV> FeatureTable);
 
-  /// Get feature bits of a CPU.
+  /// Returns feature bits of a CPU.
   FeatureBitset getFeatureBits(StringRef CPU,
-                          ArrayRef<SubtargetFeatureKV> CPUTable,
-                          ArrayRef<SubtargetFeatureKV> FeatureTable);
+                               ArrayRef<SubtargetFeatureKV> CPUTable,
+                               ArrayRef<SubtargetFeatureKV> FeatureTable);
 
-  /// Print feature string.
+  /// Prints feature string.
   void print(raw_ostream &OS) const;
 
-  // Dump feature info.
+  /// Dumps feature info.
   void dump() const;
 
   /// Adds the default features for the specified target triple.
   void getDefaultSubtargetFeatures(const Triple& Triple);
 };
 
-} // End namespace llvm
+} // end namespace llvm
 
-#endif
+#endif // LLVM_MC_SUBTARGETFEATURE_H
diff --git a/include/llvm/Object/Archive.h b/include/llvm/Object/Archive.h
index 08128b0c2515fe89c04d7b43b34819dbd61a99f3..d423957d9b79d344726e04922387888aa1c5afb3 100644
--- a/include/llvm/Object/Archive.h
+++ b/include/llvm/Object/Archive.h
@@ -212,6 +212,7 @@ public:
     K_GNU,
     K_MIPS64,
     K_BSD,
+    K_DARWIN,
     K_DARWIN64,
     K_COFF
   };
diff --git a/include/llvm/Object/ELF.h b/include/llvm/Object/ELF.h
index aaa79ae70f01d62a4d47f4f5fe324efb9f99082e..7a3155b3953ecd8fe55c5b85baf3f158eb2530ef 100644
--- a/include/llvm/Object/ELF.h
+++ b/include/llvm/Object/ELF.h
@@ -340,7 +340,7 @@ ELFFile<ELFT>::ELFFile(StringRef Object) : Buf(Object) {
 }
 
 template <class ELFT>
-static bool compareAddr(uint64_t VAddr, const Elf_Phdr_Impl<ELFT> *Phdr) {
+bool compareAddr(uint64_t VAddr, const Elf_Phdr_Impl<ELFT> *Phdr) {
   return VAddr < Phdr->p_vaddr;
 }
 
diff --git a/include/llvm/Object/ELFObjectFile.h b/include/llvm/Object/ELFObjectFile.h
index f9fdcac8c5524f6ca815a1028a4e6d0465d60114..9e95f2958aa4fe14ecc673fc7c6a05163e820eae 100644
--- a/include/llvm/Object/ELFObjectFile.h
+++ b/include/llvm/Object/ELFObjectFile.h
@@ -74,9 +74,9 @@ public:
 
   SubtargetFeatures getFeatures() const override;
 
-  SubtargetFeatures getMIPSFeatures() const override;
+  SubtargetFeatures getMIPSFeatures() const;
 
-  SubtargetFeatures getARMFeatures() const override;
+  SubtargetFeatures getARMFeatures() const;
 
   void setARMSubArch(Triple &TheTriple) const override;
 };
@@ -895,6 +895,8 @@ elf_symbol_iterator ELFObjectFile<ELFT>::dynamic_symbol_begin() const {
 template <class ELFT>
 elf_symbol_iterator ELFObjectFile<ELFT>::dynamic_symbol_end() const {
   const Elf_Shdr *SymTab = DotDynSymSec;
+  if (!SymTab)
+    return dynamic_symbol_begin();
   DataRefImpl Sym = toDRI(SymTab, SymTab->sh_size / sizeof(Elf_Sym));
   return basic_symbol_iterator(SymbolRef(Sym, this));
 }
diff --git a/include/llvm/Object/IRSymtab.h b/include/llvm/Object/IRSymtab.h
new file mode 100644
index 0000000000000000000000000000000000000000..7ce6fa50e6da32ab521f7fac039510b7d1989106
--- /dev/null
+++ b/include/llvm/Object/IRSymtab.h
@@ -0,0 +1,298 @@
+//===- IRSymtab.h - data definitions for IR symbol tables -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains data definitions and a reader and builder for a symbol
+// table for LLVM IR. Its purpose is to allow linkers and other consumers of
+// bitcode files to efficiently read the symbol table for symbol resolution
+// purposes without needing to construct a module in memory.
+//
+// As with most object files the symbol table has two parts: the symbol table
+// itself and a string table which is referenced by the symbol table.
+//
+// A symbol table corresponds to a single bitcode file, which may consist of
+// multiple modules, so symbol tables may likewise contain symbols for multiple
+// modules.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJECT_IRSYMTAB_H
+#define LLVM_OBJECT_IRSYMTAB_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/Object/SymbolicFile.h"
+#include "llvm/Support/Endian.h"
+
+namespace llvm {
+namespace irsymtab {
+namespace storage {
+
+// The data structures in this namespace define the low-level serialization
+// format. Clients that just want to read a symbol table should use the
+// irsymtab::Reader class.
+
+typedef support::ulittle32_t Word;
+
+/// A reference to a string in the string table, stored as a byte offset.
+struct Str {
+  Word Offset; // Offset of the string's first byte from Strtab.data().
+  StringRef get(StringRef Strtab) const {
+    return Strtab.data() + Offset; // const char * -> StringRef; unchecked.
+  }
+};
+
+/// A reference to a contiguous range of T objects stored in the symbol table.
+template <typename T> struct Range {
+  Word Offset, Size; // Byte offset from Symtab.data(), and number of elements.
+  ArrayRef<T> get(StringRef Symtab) const {
+    return {reinterpret_cast<const T *>(Symtab.data() + Offset), Size};
+  }
+};
+
+/// Describes the range of a particular module's symbols within the symbol
+/// table.
+struct Module {
+  Word Begin, End; // Half-open range [Begin, End) of indices into the symbols.
+};
+
+/// This is equivalent to an IR comdat.
+struct Comdat {
+  Str Name; // The comdat's name, as a string table reference.
+};
+
+/// Contains the information needed by linkers for symbol resolution, as well as
+/// by the LTO implementation itself.
+struct Symbol {
+  /// The mangled symbol name.
+  Str Name;
+
+  /// The unmangled symbol name, or the empty string if this is not an IR
+  /// symbol.
+  Str IRName;
+
+  /// The index into Header::Comdats, or -1 if not a comdat member.
+  Word ComdatIndex;
+
+  Word Flags; // Bit field; bit positions are given by FlagBits.
+  enum FlagBits { // Each enumerator is a shift amount into Flags.
+    FB_visibility, // 2 bits
+    FB_undefined = FB_visibility + 2, // First bit after the visibility field.
+    FB_weak,
+    FB_common,
+    FB_indirect,
+    FB_used,
+    FB_tls,
+    FB_may_omit,
+    FB_global,
+    FB_format_specific,
+    FB_unnamed_addr,
+  };
+
+  /// The index into the Uncommon table, or -1 if this symbol does not have an
+  /// Uncommon.
+  Word UncommonIndex;
+};
+
+/// This data structure contains rarely used symbol fields and is optionally
+/// referenced by a Symbol.
+struct Uncommon {
+  Word CommonSize, CommonAlign; // Size and alignment read for common symbols.
+
+  /// COFF-specific: the name of the symbol that a weak external resolves to
+  /// if not defined.
+  Str COFFWeakExternFallbackName;
+};
+
+struct Header { // Read by Reader from the very start of the symbol table.
+  Range<Module> Modules;
+  Range<Comdat> Comdats;
+  Range<Symbol> Symbols;
+  Range<Uncommon> Uncommons;
+
+  Str SourceFileName; // Source file path specified at compile time.
+
+  /// COFF-specific: linker directives.
+  Str COFFLinkerOpts;
+};
+
+}
+
+/// Fills in Symtab and Strtab with a valid symbol and string table for Mods.
+Error build(ArrayRef<Module *> Mods, SmallVector<char, 0> &Symtab,
+            SmallVector<char, 0> &Strtab);
+
+/// This represents a symbol that has been read from a storage::Symbol and
+/// possibly a storage::Uncommon.
+struct Symbol {
+  // Copied from storage::Symbol.
+  StringRef Name, IRName;
+  int ComdatIndex;
+  uint32_t Flags; // Decoded with the storage::Symbol::FlagBits positions.
+
+  // Copied from storage::Uncommon.
+  uint32_t CommonSize, CommonAlign;
+  StringRef COFFWeakExternFallbackName;
+
+  /// Returns the mangled symbol name.
+  StringRef getName() const { return Name; }
+
+  /// Returns the unmangled symbol name, or the empty string if this is not an
+  /// IR symbol.
+  StringRef getIRName() const { return IRName; }
+
+  /// Returns the index into the comdat table (see Reader::getComdatTable()), or
+  /// -1 if not a comdat member.
+  int getComdatIndex() const { return ComdatIndex; }
+
+  using S = storage::Symbol; // Shorthand for the flag bit positions below.
+  GlobalValue::VisibilityTypes getVisibility() const {
+    return GlobalValue::VisibilityTypes((Flags >> S::FB_visibility) & 3);
+  }
+  bool isUndefined() const { return (Flags >> S::FB_undefined) & 1; }
+  bool isWeak() const { return (Flags >> S::FB_weak) & 1; }
+  bool isCommon() const { return (Flags >> S::FB_common) & 1; }
+  bool isIndirect() const { return (Flags >> S::FB_indirect) & 1; }
+  bool isUsed() const { return (Flags >> S::FB_used) & 1; }
+  bool isTLS() const { return (Flags >> S::FB_tls) & 1; }
+  bool canBeOmittedFromSymbolTable() const {
+    return (Flags >> S::FB_may_omit) & 1;
+  }
+  bool isGlobal() const { return (Flags >> S::FB_global) & 1; }
+  bool isFormatSpecific() const { return (Flags >> S::FB_format_specific) & 1; }
+  bool isUnnamedAddr() const { return (Flags >> S::FB_unnamed_addr) & 1; }
+
+  uint64_t getCommonSize() const {
+    assert(isCommon()); // Only meaningful when the common flag is set.
+    return CommonSize;
+  }
+  uint32_t getCommonAlignment() const {
+    assert(isCommon()); // Only meaningful when the common flag is set.
+    return CommonAlign;
+  }
+
+  /// COFF-specific: for weak externals, returns the name of the symbol that is
+  /// used as a fallback if the weak external remains undefined.
+  StringRef getCOFFWeakExternalFallback() const {
+    assert(isWeak() && isIndirect()); // Both flags must be set to call this.
+    return COFFWeakExternFallbackName;
+  }
+};
+
+/// This class can be used to read a Symtab and Strtab produced by
+/// irsymtab::build.
+class Reader {
+  StringRef Symtab, Strtab; // Non-owning views of the two tables.
+
+  ArrayRef<storage::Module> Modules;
+  ArrayRef<storage::Comdat> Comdats;
+  ArrayRef<storage::Symbol> Symbols;
+  ArrayRef<storage::Uncommon> Uncommons;
+
+  StringRef str(storage::Str S) const { return S.get(Strtab); }
+  template <typename T> ArrayRef<T> range(storage::Range<T> R) const {
+    return R.get(Symtab);
+  }
+  const storage::Header &header() const {
+    // The header is read in place from offset 0 of the symbol table.
+    return *reinterpret_cast<const storage::Header *>(Symtab.data());
+  }
+
+public:
+  class SymbolRef;
+
+  Reader() = default;
+  Reader(StringRef Symtab, StringRef Strtab) : Symtab(Symtab), Strtab(Strtab) {
+    Modules = range(header().Modules); // Resolve each header range up front.
+    Comdats = range(header().Comdats);
+    Symbols = range(header().Symbols);
+    Uncommons = range(header().Uncommons);
+  }
+
+  typedef iterator_range<object::content_iterator<SymbolRef>> symbol_range;
+
+  /// Returns the symbol table for the entire bitcode file.
+  /// The symbols enumerated by this method are ephemeral, but they can be
+  /// copied into an irsymtab::Symbol object.
+  symbol_range symbols() const;
+
+  /// Returns a slice of the symbol table for the I'th module in the file.
+  /// The symbols enumerated by this method are ephemeral, but they can be
+  /// copied into an irsymtab::Symbol object.
+  symbol_range module_symbols(unsigned I) const;
+
+  /// Returns the source file path specified at compile time.
+  StringRef getSourceFileName() const { return str(header().SourceFileName); }
+
+  /// Returns a table with all the comdats used by this file.
+  std::vector<StringRef> getComdatTable() const {
+    std::vector<StringRef> ComdatTable;
+    ComdatTable.reserve(Comdats.size());
+    for (auto C : Comdats) // Entry order matches the stored comdat order.
+      ComdatTable.push_back(str(C.Name));
+    return ComdatTable;
+  }
+
+  /// COFF-specific: returns linker options specified in the input file.
+  StringRef getCOFFLinkerOpts() const { return str(header().COFFLinkerOpts); }
+};
+
+/// Ephemeral symbols produced by Reader::symbols() and
+/// Reader::module_symbols().
+class Reader::SymbolRef : public Symbol {
+  const storage::Symbol *SymI, *SymE; // Current position and end of range.
+  const Reader *R;
+
+public:
+  SymbolRef(const storage::Symbol *SymI, const storage::Symbol *SymE,
+            const Reader *R)
+      : SymI(SymI), SymE(SymE), R(R) {
+    read(); // Decode the first symbol, if the range is non-empty.
+  }
+
+  void read() {
+    if (SymI == SymE) // At the end: nothing to decode.
+      return;
+
+    Name = R->str(SymI->Name);
+    IRName = R->str(SymI->IRName);
+    ComdatIndex = SymI->ComdatIndex;
+    Flags = SymI->Flags;
+
+    uint32_t UncI = SymI->UncommonIndex;
+    if (UncI != -1u) { // -1 means no Uncommon; those fields are not reset.
+      const storage::Uncommon &Unc = R->Uncommons[UncI];
+      CommonSize = Unc.CommonSize;
+      CommonAlign = Unc.CommonAlign;
+      COFFWeakExternFallbackName = R->str(Unc.COFFWeakExternFallbackName);
+    }
+  }
+  void moveNext() {
+    ++SymI;
+    read(); // Refresh the inherited Symbol fields for the new position.
+  }
+
+  // Equality compares the current position only, not SymE or the Reader.
+  bool operator==(const SymbolRef &Other) const { return SymI == Other.SymI; }
+};
+
+inline Reader::symbol_range Reader::symbols() const {
+  return {SymbolRef(Symbols.begin(), Symbols.end(), this), // first symbol
+          SymbolRef(Symbols.end(), Symbols.end(), this)}; // end sentinel
+}
+
+inline Reader::symbol_range Reader::module_symbols(unsigned I) const {
+  const storage::Module &M = Modules[I]; // Caller supplies a valid index.
+  const storage::Symbol *MBegin = Symbols.begin() + M.Begin,
+                        *MEnd = Symbols.begin() + M.End;
+  return {SymbolRef(MBegin, MEnd, this), SymbolRef(MEnd, MEnd, this)};
+}
+
+}
+
+}
+
+#endif
diff --git a/include/llvm/Object/MachO.h b/include/llvm/Object/MachO.h
index b91e8c3658177912712a91ef6a8a564db8c4dc99..1ee571cce738eca551c6fa511e8b806a4b3c6bde 100644
--- a/include/llvm/Object/MachO.h
+++ b/include/llvm/Object/MachO.h
@@ -100,18 +100,58 @@ private:
 };
 typedef content_iterator<ExportEntry> export_iterator;
 
+// Segment info so SegIndex/SegOffset pairs in a Mach-O Bind or Rebase entry
+// can be checked and translated.  Only the SegIndex/SegOffset pairs from
+// checked entries are to be used with the segmentName(), sectionName() and
+// address() methods below.
+class BindRebaseSegInfo {
+public:
+  BindRebaseSegInfo(const object::MachOObjectFile *Obj);
+
+  // Used to check a Mach-O Bind or Rebase entry for errors when iterating.
+  // NOTE(review): the const char * result is presumably an error message, or
+  // null when the entry is valid -- confirm against the implementation.
+  const char *checkSegAndOffset(int32_t SegIndex, uint64_t SegOffset,
+                                bool endInvalid);
+  const char *checkCountAndSkip(uint32_t Count, uint32_t Skip,
+                                uint8_t PointerSize, int32_t SegIndex,
+                                uint64_t SegOffset);
+  // Used with valid SegIndex/SegOffset values from checked entries.
+  StringRef segmentName(int32_t SegIndex);
+  StringRef sectionName(int32_t SegIndex, uint64_t SegOffset);
+  // NOTE(review): SegIndex is uint32_t here but int32_t above -- confirm.
+  uint64_t address(uint32_t SegIndex, uint64_t SegOffset);
+
+private:
+  struct SectionInfo { // One record per entry in Sections.
+    uint64_t Address;
+    uint64_t Size;
+    StringRef SectionName;
+    StringRef SegmentName;
+    uint64_t OffsetInSegment;
+    uint64_t SegmentStartAddress;
+    int32_t SegmentIndex;
+  };
+  const SectionInfo &findSection(int32_t SegIndex, uint64_t SegOffset);
+  SmallVector<SectionInfo, 32> Sections;
+  int32_t MaxSegIndex;
+};
+
 /// MachORebaseEntry encapsulates the current state in the decompression of
 /// rebasing opcodes. This allows you to iterate through the compressed table of
 /// rebasing using:
-///    for (const llvm::object::MachORebaseEntry &Entry : Obj->rebaseTable()) {
+///    Error Err = Error::success();
+///    for (const llvm::object::MachORebaseEntry &Entry : Obj->rebaseTable(Err)) {
 ///    }
+///    if (Err) { report error ...
 class MachORebaseEntry {
 public:
-  MachORebaseEntry(ArrayRef<uint8_t> opcodes, bool is64Bit);
+  MachORebaseEntry(Error *Err, const MachOObjectFile *O,
+                   ArrayRef<uint8_t> opcodes, bool is64Bit);
 
-  uint32_t segmentIndex() const;
+  int32_t segmentIndex() const;
   uint64_t segmentOffset() const;
   StringRef typeName() const;
+  StringRef segmentName() const;
+  StringRef sectionName() const;
+  uint64_t address() const;
 
   bool operator==(const MachORebaseEntry &) const;
 
@@ -121,17 +161,18 @@ private:
   friend class MachOObjectFile;
   void moveToFirst();
   void moveToEnd();
-  uint64_t readULEB128();
+  uint64_t readULEB128(const char **error);
 
+  Error *E;
+  const MachOObjectFile *O;
   ArrayRef<uint8_t> Opcodes;
   const uint8_t *Ptr;
   uint64_t SegmentOffset;
-  uint32_t SegmentIndex;
+  int32_t SegmentIndex;
   uint64_t RemainingLoopCount;
   uint64_t AdvanceAmount;
   uint8_t  RebaseType;
   uint8_t  PointerSize;
-  bool     Malformed;
   bool     Done;
 };
 typedef content_iterator<MachORebaseEntry> rebase_iterator;
@@ -139,15 +180,18 @@ typedef content_iterator<MachORebaseEntry> rebase_iterator;
 /// MachOBindEntry encapsulates the current state in the decompression of
 /// binding opcodes. This allows you to iterate through the compressed table of
 /// bindings using:
-///    for (const llvm::object::MachOBindEntry &Entry : Obj->bindTable()) {
+///    Error Err = Error::success();
+///    for (const llvm::object::MachOBindEntry &Entry : Obj->bindTable(Err)) {
 ///    }
+///    if (Err) { report error ...
 class MachOBindEntry {
 public:
   enum class Kind { Regular, Lazy, Weak };
 
-  MachOBindEntry(ArrayRef<uint8_t> Opcodes, bool is64Bit, MachOBindEntry::Kind);
+  MachOBindEntry(Error *Err, const MachOObjectFile *O,
+                 ArrayRef<uint8_t> Opcodes, bool is64Bit, MachOBindEntry::Kind);
 
-  uint32_t segmentIndex() const;
+  int32_t segmentIndex() const;
   uint64_t segmentOffset() const;
   StringRef typeName() const;
   StringRef symbolName() const;
@@ -155,6 +199,10 @@ public:
   int64_t addend() const;
   int ordinal() const;
 
+  StringRef segmentName() const;
+  StringRef sectionName() const;
+  uint64_t address() const;
+
   bool operator==(const MachOBindEntry &) const;
 
   void moveNext();
@@ -163,14 +211,17 @@ private:
   friend class MachOObjectFile;
   void moveToFirst();
   void moveToEnd();
-  uint64_t readULEB128();
-  int64_t readSLEB128();
+  uint64_t readULEB128(const char **error);
+  int64_t readSLEB128(const char **error);
 
+  Error *E;
+  const MachOObjectFile *O;
   ArrayRef<uint8_t> Opcodes;
   const uint8_t *Ptr;
   uint64_t SegmentOffset;
-  uint32_t SegmentIndex;
+  int32_t  SegmentIndex;
   StringRef SymbolName;
+  bool     LibraryOrdinalSet;
   int      Ordinal;
   uint32_t Flags;
   int64_t  Addend;
@@ -179,7 +230,6 @@ private:
   uint8_t  BindType;
   uint8_t  PointerSize;
   Kind     TableKind;
-  bool     Malformed;
   bool     Done;
 };
 typedef content_iterator<MachOBindEntry> bind_iterator;
@@ -245,6 +295,7 @@ public:
 
   // MachO specific.
   std::error_code getLibraryShortNameByIndex(unsigned Index, StringRef &) const;
+  uint32_t getLibraryCount() const;
 
   section_iterator getRelocationRelocatedSection(relocation_iterator Rel) const;
 
@@ -285,26 +336,79 @@ public:
   static iterator_range<export_iterator> exports(ArrayRef<uint8_t> Trie);
 
   /// For use iterating over all rebase table entries.
-  iterator_range<rebase_iterator> rebaseTable() const;
+  iterator_range<rebase_iterator> rebaseTable(Error &Err);
 
-  /// For use examining rebase opcodes not in a MachOObjectFile.
-  static iterator_range<rebase_iterator> rebaseTable(ArrayRef<uint8_t> Opcodes,
+  /// For use examining rebase opcodes in a MachOObjectFile.
+  static iterator_range<rebase_iterator> rebaseTable(Error &Err,
+                                                     MachOObjectFile *O,
+                                                     ArrayRef<uint8_t> Opcodes,
                                                      bool is64);
 
   /// For use iterating over all bind table entries.
-  iterator_range<bind_iterator> bindTable() const;
+  iterator_range<bind_iterator> bindTable(Error &Err);
 
   /// For use iterating over all lazy bind table entries.
-  iterator_range<bind_iterator> lazyBindTable() const;
+  iterator_range<bind_iterator> lazyBindTable(Error &Err);
 
-  /// For use iterating over all lazy bind table entries.
-  iterator_range<bind_iterator> weakBindTable() const;
+  /// For use iterating over all weak bind table entries.
+  iterator_range<bind_iterator> weakBindTable(Error &Err);
 
-  /// For use examining bind opcodes not in a MachOObjectFile.
-  static iterator_range<bind_iterator> bindTable(ArrayRef<uint8_t> Opcodes,
+  /// For use examining bind opcodes in a MachOObjectFile.
+  static iterator_range<bind_iterator> bindTable(Error &Err,
+                                                 MachOObjectFile *O,
+                                                 ArrayRef<uint8_t> Opcodes,
                                                  bool is64,
                                                  MachOBindEntry::Kind);
 
+  /// For use with a SegIndex,SegOffset pair in MachOBindEntry::moveNext() to
+  /// validate a MachOBindEntry (delegates to BindRebaseSegInfo).
+  const char *BindEntryCheckSegAndOffset(int32_t SegIndex, uint64_t SegOffset,
+                                         bool endInvalid) const {
+    return BindRebaseSectionTable->checkSegAndOffset(SegIndex, SegOffset,
+                                                     endInvalid);
+  }
+  /// Validates a MachOBindEntry in MachOBindEntry::moveNext() for the
+  /// BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB opcode via BindRebaseSegInfo.
+  const char *BindEntryCheckCountAndSkip(uint32_t Count, uint32_t Skip,
+                                         uint8_t PointerSize, int32_t SegIndex,
+                                         uint64_t SegOffset) const {
+    return BindRebaseSectionTable->checkCountAndSkip(Count, Skip, PointerSize,
+                                                     SegIndex, SegOffset);
+  }
+
+  /// For use with a SegIndex,SegOffset pair in MachORebaseEntry::moveNext() to
+  /// validate a MachORebaseEntry (same check as the bind variant).
+  const char *RebaseEntryCheckSegAndOffset(int32_t SegIndex, uint64_t SegOffset,
+                                           bool endInvalid) const {
+    return BindRebaseSectionTable->checkSegAndOffset(SegIndex, SegOffset,
+                                                     endInvalid);
+  }
+  /// For use in MachORebaseEntry::moveNext() to validate a MachORebaseEntry for
+  /// the REBASE_OPCODE_DO_*_TIMES* opcodes (same check as the bind variant).
+  const char *RebaseEntryCheckCountAndSkip(uint32_t Count, uint32_t Skip,
+                                         uint8_t PointerSize, int32_t SegIndex,
+                                         uint64_t SegOffset) const {
+    return BindRebaseSectionTable->checkCountAndSkip(Count, Skip, PointerSize,
+                                                     SegIndex, SegOffset);
+  }
+
+  /// For use with the SegIndex of a checked Mach-O Bind or Rebase entry to
+  /// get the segment name (delegates to BindRebaseSegInfo::segmentName).
+  StringRef BindRebaseSegmentName(int32_t SegIndex) const {
+    return BindRebaseSectionTable->segmentName(SegIndex);
+  }
+
+  /// For use with a SegIndex,SegOffset pair from a checked Mach-O Bind or
+  /// Rebase entry to get the section name. NOTE(review): takes uint32_t but
+  /// forwards to sectionName(int32_t, ...) -- confirm the intended signedness.
+  StringRef BindRebaseSectionName(uint32_t SegIndex, uint64_t SegOffset) const {
+    return BindRebaseSectionTable->sectionName(SegIndex, SegOffset);
+  }
+
+  /// For use with a SegIndex,SegOffset pair from a checked Mach-O Bind or
+  /// Rebase entry to get the address (delegates to BindRebaseSegInfo::address).
+  uint64_t BindRebaseAddress(uint32_t SegIndex, uint64_t SegOffset) const {
+    return BindRebaseSectionTable->address(SegIndex, SegOffset);
+  }
 
   // In a MachO file, sections have a segment name. This is used in the .o
   // files. They have a single segment, but this field specifies which segment
@@ -511,6 +615,7 @@ private:
   using BuildToolList = SmallVector<const char*, 1>;
   BuildToolList BuildTools;
   mutable LibraryShortName LibrariesShortNames;
+  std::unique_ptr<BindRebaseSegInfo> BindRebaseSectionTable;
   const char *SymtabLoadCmd;
   const char *DysymtabLoadCmd;
   const char *DataInCodeLoadCmd;
diff --git a/include/llvm/Object/ModuleSummaryIndexObjectFile.h b/include/llvm/Object/ModuleSummaryIndexObjectFile.h
index 6205927039dcd7684a2e3db1c4c5cf023d94915b..713022264ea7ad08ef5aa2d1fb1126ad176dba86 100644
--- a/include/llvm/Object/ModuleSummaryIndexObjectFile.h
+++ b/include/llvm/Object/ModuleSummaryIndexObjectFile.h
@@ -88,9 +88,12 @@ public:
 }
 
 /// Parse the module summary index out of an IR file and return the module
-/// summary index object if found, or nullptr if not.
+/// summary index object if found, or nullptr if not. If Identifier is
+/// non-empty, it is used as the module ID (module path) in the resulting
+/// index. This can be used when the index is being read from a file
+/// containing minimized bitcode just for the thin link.
 Expected<std::unique_ptr<ModuleSummaryIndex>>
-getModuleSummaryIndexForFile(StringRef Path);
+getModuleSummaryIndexForFile(StringRef Path, StringRef Identifier = "");
 }
 
 #endif
diff --git a/include/llvm/Object/ModuleSymbolTable.h b/include/llvm/Object/ModuleSymbolTable.h
index 70775352d977485170bc7f506cdc2e8fcacbe733..333301d5b456c02366044a7d0797d06e8f6172eb 100644
--- a/include/llvm/Object/ModuleSymbolTable.h
+++ b/include/llvm/Object/ModuleSymbolTable.h
@@ -26,6 +26,7 @@
 namespace llvm {
 
 class GlobalValue;
+class RecordStreamer;
 
 class ModuleSymbolTable {
 public:
@@ -52,7 +53,7 @@ public:
   /// For each found symbol, call \p AsmSymbol with the name of the symbol found
   /// and the associated flags.
   static void CollectAsmSymbols(
-      const Triple &TheTriple, StringRef InlineAsm,
+      const Module &M,
       function_ref<void(StringRef, object::BasicSymbolRef::Flags)> AsmSymbol);
 };
 
diff --git a/include/llvm/Object/ObjectFile.h b/include/llvm/Object/ObjectFile.h
index b4251193cd1a4b25a51586ed86cfaa969f6ea265..b689dc2ac03ac3a2830e97039342fabdcbaad85a 100644
--- a/include/llvm/Object/ObjectFile.h
+++ b/include/llvm/Object/ObjectFile.h
@@ -267,8 +267,6 @@ public:
   virtual StringRef getFileFormatName() const = 0;
   virtual /* Triple::ArchType */ unsigned getArch() const = 0;
   virtual SubtargetFeatures getFeatures() const = 0;
-  virtual SubtargetFeatures getMIPSFeatures() const { return SubtargetFeatures(); }
-  virtual SubtargetFeatures getARMFeatures() const { return SubtargetFeatures(); }
   virtual void setARMSubArch(Triple &TheTriple) const { }
 
   /// Returns platform-specific object flags, if any.
diff --git a/include/llvm/Object/RelocVisitor.h b/include/llvm/Object/RelocVisitor.h
index 3510d293d73d7c4a7ffa9e686ec332c6014b5007..3a0a62d9283b3b3fb049b1de6c29d4aac05914cd 100644
--- a/include/llvm/Object/RelocVisitor.h
+++ b/include/llvm/Object/RelocVisitor.h
@@ -155,6 +155,8 @@ private:
         switch (RelocType) {
         case llvm::ELF::R_AMDGPU_ABS32:
           return visitELF_AMDGPU_ABS32(R, Value);
+        case llvm::ELF::R_AMDGPU_ABS64:
+          return visitELF_AMDGPU_ABS64(R, Value);
         default:
           HasError = true;
           return RelocToApply();
@@ -450,6 +452,11 @@ private:
     return RelocToApply(Value + Addend, 4);
   }
 
+  RelocToApply visitELF_AMDGPU_ABS64(RelocationRef R, uint64_t Value) {
+    int64_t Addend = getELFAddend(R);
+    return RelocToApply(Value + Addend, 8); // 8-byte wide, unlike ABS32's 4.
+  }
+
   /// I386 COFF
   RelocToApply visitCOFF_I386_SECREL(RelocationRef R, uint64_t Value) {
     return RelocToApply(static_cast<uint32_t>(Value), /*Width=*/4);
diff --git a/include/llvm/Object/SymbolicFile.h b/include/llvm/Object/SymbolicFile.h
index af62e62c51d8f780655413a5234d5e3ee84118f0..ef0f96f7834abd11085196ef0e3a19ccdee9120c 100644
--- a/include/llvm/Object/SymbolicFile.h
+++ b/include/llvm/Object/SymbolicFile.h
@@ -16,6 +16,7 @@
 
 #include "llvm/Object/Binary.h"
 #include "llvm/Support/Format.h"
+#include <cinttypes>
 #include <utility>
 
 namespace llvm {
@@ -33,7 +34,8 @@ union DataRefImpl {
 
 template <typename OStream>
 OStream& operator<<(OStream &OS, const DataRefImpl &D) {
-  OS << "(" << format("0x%x8", D.p) << " (" << format("0x%x8", D.d.a) << ", " << format("0x%x8", D.d.b) << "))";
+  OS << "(" << format("0x%08" PRIxPTR, D.p) << " (" << format("0x%08x", D.d.a)
+     << ", " << format("0x%08x", D.d.b) << "))";
   return OS;
 }
 
diff --git a/include/llvm/Object/Wasm.h b/include/llvm/Object/Wasm.h
index 999d575943b6c85ac41a1b027707586705f1ad2b..4833db0f2e50471a2674d81f1e2cf3e4f690a514 100644
--- a/include/llvm/Object/Wasm.h
+++ b/include/llvm/Object/Wasm.h
@@ -17,25 +17,76 @@
 #ifndef LLVM_OBJECT_WASM_H
 #define LLVM_OBJECT_WASM_H
 
+#include "llvm/Object/Binary.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Wasm.h"
+#include <cstddef>
+#include <cstdint>
+#include <vector>
 
 namespace llvm {
 namespace object {
 
+class WasmSymbol { // A named symbol together with the kind of entry it is.
+public:
+  enum class SymbolType {
+    FUNCTION_IMPORT,
+    FUNCTION_EXPORT,
+    GLOBAL_IMPORT,
+    GLOBAL_EXPORT,
+    DEBUG_FUNCTION_NAME,
+  };
+
+  WasmSymbol(StringRef Name, SymbolType Type) : Name(Name), Type(Type) {}
+
+  StringRef Name; // Non-owning view; the underlying storage must outlive this.
+  SymbolType Type;
+};
+
+class WasmSection {
+public:
+  WasmSection() : Type(0), Offset(0) {} // Name, Content, Relocations are empty.
+
+  uint32_t Type; // Section type (See below)
+  uint32_t Offset; // Offset within the file
+  StringRef Name; // Section name (User-defined sections only)
+  ArrayRef<uint8_t> Content; // Section content
+  std::vector<wasm::WasmRelocation> Relocations; // Relocations for this section
+};
+
 class WasmObjectFile : public ObjectFile {
 public:
   WasmObjectFile(MemoryBufferRef Object, Error &Err);
+
   const wasm::WasmObjectHeader &getHeader() const;
-  const wasm::WasmSection *getWasmSection(const SectionRef &Section) const;
+  const WasmSymbol &getWasmSymbol(DataRefImpl Symb) const;
+  const WasmSection &getWasmSection(const SectionRef &Section) const;
+  const wasm::WasmRelocation &getWasmRelocation(const RelocationRef& Ref) const;
+
   static bool classof(const Binary *v) { return v->isWasm(); }
 
+  const std::vector<wasm::WasmSignature>& types() const { return Signatures; }
+  const std::vector<uint32_t>& functionTypes() const { return FunctionTypes; }
+  const std::vector<wasm::WasmImport>& imports() const { return Imports; }
+  const std::vector<wasm::WasmTable>& tables() const { return Tables; }
+  const std::vector<wasm::WasmLimits>& memories() const { return Memories; }
+  const std::vector<wasm::WasmGlobal>& globals() const { return Globals; }
+  const std::vector<wasm::WasmExport>& exports() const { return Exports; }
+  const std::vector<wasm::WasmElemSegment>& elements() const {
+    return ElemSegments;
+  }
+  const std::vector<wasm::WasmDataSegment>& dataSegments() const {
+    return DataSegments;
+  }
+  const std::vector<wasm::WasmFunction>& functions() const { return Functions; }
+  const ArrayRef<uint8_t>& code() const { return CodeSection; }
+  uint32_t startFunction() const { return StartFunction; }
+
 protected:
   void moveSymbolNext(DataRefImpl &Symb) const override;
 
-  std::error_code printSymbolName(raw_ostream &OS,
-                                  DataRefImpl Symb) const override;
-
   uint32_t getSymbolFlags(DataRefImpl Symb) const override;
 
   basic_symbol_iterator symbol_begin() const override;
@@ -67,7 +118,6 @@ protected:
   bool isSectionBitcode(DataRefImpl Sec) const override;
   relocation_iterator section_rel_begin(DataRefImpl Sec) const override;
   relocation_iterator section_rel_end(DataRefImpl Sec) const override;
-  section_iterator getRelocatedSection(DataRefImpl Sec) const override;
 
   // Overrides from RelocationRef.
   void moveRelocationNext(DataRefImpl &Rel) const override;
@@ -86,14 +136,53 @@ protected:
   bool isRelocatableObject() const override;
 
 private:
+  const WasmSection &getWasmSection(DataRefImpl Ref) const;
+  const wasm::WasmRelocation &getWasmRelocation(DataRefImpl Ref) const;
+
+  WasmSection* findCustomSectionByName(StringRef Name);
+  WasmSection* findSectionByType(uint32_t Type);
+
   const uint8_t *getPtr(size_t Offset) const;
-  Error parseCustomSection(wasm::WasmSection &Sec, const uint8_t *Ptr,
-                           size_t Length);
+  Error parseSection(WasmSection &Sec);
+  Error parseCustomSection(WasmSection &Sec, const uint8_t *Ptr,
+                           const uint8_t *End);
+
+  // Standard section types
+  Error parseTypeSection(const uint8_t *Ptr, const uint8_t *End);
+  Error parseImportSection(const uint8_t *Ptr, const uint8_t *End);
+  Error parseFunctionSection(const uint8_t *Ptr, const uint8_t *End);
+  Error parseTableSection(const uint8_t *Ptr, const uint8_t *End);
+  Error parseMemorySection(const uint8_t *Ptr, const uint8_t *End);
+  Error parseGlobalSection(const uint8_t *Ptr, const uint8_t *End);
+  Error parseExportSection(const uint8_t *Ptr, const uint8_t *End);
+  Error parseStartSection(const uint8_t *Ptr, const uint8_t *End);
+  Error parseElemSection(const uint8_t *Ptr, const uint8_t *End);
+  Error parseCodeSection(const uint8_t *Ptr, const uint8_t *End);
+  Error parseDataSection(const uint8_t *Ptr, const uint8_t *End);
+
+  // Custom section types
+  Error parseNameSection(const uint8_t *Ptr, const uint8_t *End);
+  Error parseRelocSection(StringRef Name, const uint8_t *Ptr,
+                          const uint8_t *End);
 
   wasm::WasmObjectHeader Header;
-  std::vector<wasm::WasmSection> Sections;
+  std::vector<WasmSection> Sections;
+  std::vector<wasm::WasmSignature> Signatures;
+  std::vector<uint32_t> FunctionTypes;
+  std::vector<wasm::WasmTable> Tables;
+  std::vector<wasm::WasmLimits> Memories;
+  std::vector<wasm::WasmGlobal> Globals;
+  std::vector<wasm::WasmImport> Imports;
+  std::vector<wasm::WasmExport> Exports;
+  std::vector<wasm::WasmElemSegment> ElemSegments;
+  std::vector<wasm::WasmDataSegment> DataSegments;
+  std::vector<WasmSymbol> Symbols;
+  std::vector<wasm::WasmFunction> Functions;
+  ArrayRef<uint8_t> CodeSection;
+  uint32_t StartFunction;
 };
-}
-}
 
-#endif
+} // end namespace object
+} // end namespace llvm
+
+#endif // LLVM_OBJECT_WASM_H
diff --git a/include/llvm/ObjectYAML/DWARFYAML.h b/include/llvm/ObjectYAML/DWARFYAML.h
index d031b5ac404c5164e8ac465894e82389644b6891..ec34de1f08814335e826959f38f2259a2c69d975 100644
--- a/include/llvm/ObjectYAML/DWARFYAML.h
+++ b/include/llvm/ObjectYAML/DWARFYAML.h
@@ -13,7 +13,6 @@
 ///
 //===----------------------------------------------------------------------===//
 
-
 #ifndef LLVM_OBJECTYAML_DWARFYAML_H
 #define LLVM_OBJECTYAML_DWARFYAML_H
 
@@ -23,9 +22,30 @@
 namespace llvm {
 namespace DWARFYAML {
 
+struct InitialLength {
+  uint32_t TotalLength; // 32-bit length, or UINT32_MAX as the DWARF64 marker.
+  uint64_t TotalLength64; // 64-bit length; only read when isDWARF64().
+
+  bool isDWARF64() const { return TotalLength == UINT32_MAX; }
+
+  uint64_t getLength() const {
+    return isDWARF64() ? TotalLength64 : TotalLength;
+  }
+
+  void setLength(uint64_t Len) {
+    if (Len >= (uint64_t)UINT32_MAX) { // Would collide with or exceed marker.
+      TotalLength64 = Len;
+      TotalLength = UINT32_MAX;
+    } else {
+      TotalLength = Len; // TotalLength64 is left unchanged on this path.
+    }
+  }
+};
+
 struct AttributeAbbrev {
   llvm::dwarf::Attribute Attribute;
   llvm::dwarf::Form Form;
+  llvm::yaml::Hex64 Value; // Some DWARF5 attributes have values
 };
 
 struct Abbrev {
@@ -41,7 +61,7 @@ struct ARangeDescriptor {
 };
 
 struct ARange {
-  uint32_t Length;
+  InitialLength Length;
   uint16_t Version;
   uint32_t CuOffset;
   uint8_t AddrSize;
@@ -58,7 +78,7 @@ struct PubEntry {
 struct PubSection {
   PubSection() : IsGNUStyle(false) {}
 
-  uint32_t Length;
+  InitialLength Length;
   uint16_t Version;
   uint32_t UnitOffset;
   uint32_t UnitSize;
@@ -78,8 +98,9 @@ struct Entry {
 };
 
 struct Unit {
-  uint32_t Length;
+  InitialLength Length;
   uint16_t Version;
+  llvm::dwarf::UnitType Type; // Added in DWARF 5
   uint32_t AbbrOffset;
   uint8_t AddrSize;
   std::vector<Entry> Entries;
@@ -104,8 +125,7 @@ struct LineTableOpcode {
 };
 
 struct LineTable {
-  uint32_t TotalLength;
-  uint64_t TotalLength64;
+  InitialLength Length;
   uint16_t Version;
   uint64_t PrologueLength;
   uint8_t MinInstLength;
@@ -130,7 +150,7 @@ struct Data {
 
   PubSection GNUPubNames;
   PubSection GNUPubTypes;
-  
+
   std::vector<Unit> CompileUnits;
 
   std::vector<LineTable> DebugLines;
@@ -141,7 +161,7 @@ struct Data {
 } // namespace llvm::DWARFYAML
 } // namespace llvm
 
-LLVM_YAML_IS_SEQUENCE_VECTOR(uint8_t)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint8_t)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::Hex64)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::StringRef)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::yaml::Hex8)
@@ -203,7 +223,7 @@ template <> struct MappingTraits<DWARFYAML::FormValue> {
 template <> struct MappingTraits<DWARFYAML::File> {
   static void mapping(IO &IO, DWARFYAML::File &File);
 };
-  
+
 template <> struct MappingTraits<DWARFYAML::LineTableOpcode> {
   static void mapping(IO &IO, DWARFYAML::LineTableOpcode &LineTableOpcode);
 };
@@ -212,6 +232,10 @@ template <> struct MappingTraits<DWARFYAML::LineTable> {
   static void mapping(IO &IO, DWARFYAML::LineTable &LineTable);
 };
 
+template <> struct MappingTraits<DWARFYAML::InitialLength> {
+  static void mapping(IO &IO, DWARFYAML::InitialLength &DWARF);
+};
+
 #define HANDLE_DW_TAG(unused, name)                                            \
   io.enumCase(value, "DW_TAG_" #name, dwarf::DW_TAG_##name);
 
@@ -262,6 +286,16 @@ template <> struct ScalarEnumerationTraits<dwarf::Form> {
   }
 };
 
+#define HANDLE_DW_UT(unused, name)                                             \
+  io.enumCase(value, "DW_UT_" #name, dwarf::DW_UT_##name);
+
+template <> struct ScalarEnumerationTraits<dwarf::UnitType> {
+  static void enumeration(IO &io, dwarf::UnitType &value) {
+#include "llvm/Support/Dwarf.def"
+    io.enumFallback<Hex8>(value);
+  }
+};
+
 template <> struct ScalarEnumerationTraits<dwarf::Constants> {
   static void enumeration(IO &io, dwarf::Constants &value) {
     io.enumCase(value, "DW_CHILDREN_no", dwarf::DW_CHILDREN_no);
diff --git a/include/llvm/ObjectYAML/MachOYAML.h b/include/llvm/ObjectYAML/MachOYAML.h
index f69aa1515d5ffc0c01192ffb6086afbc1c3aa970..ae858c8f4aafd0c7841cb0221c69c9f9893d7851 100644
--- a/include/llvm/ObjectYAML/MachOYAML.h
+++ b/include/llvm/ObjectYAML/MachOYAML.h
@@ -140,7 +140,7 @@ struct UniversalBinary {
 
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MachOYAML::LoadCommand)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MachOYAML::Section)
-LLVM_YAML_IS_SEQUENCE_VECTOR(int64_t)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(int64_t)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MachOYAML::RebaseOpcode)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MachOYAML::BindOpcode)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::MachOYAML::ExportEntry)
diff --git a/include/llvm/ObjectYAML/ObjectYAML.h b/include/llvm/ObjectYAML/ObjectYAML.h
index 1d6462347770c56ea0b103a41f1805c678f92727..36d6ed5417cf5370f581ac8ad6afcbca374e2246 100644
--- a/include/llvm/ObjectYAML/ObjectYAML.h
+++ b/include/llvm/ObjectYAML/ObjectYAML.h
@@ -10,10 +10,11 @@
 #ifndef LLVM_OBJECTYAML_OBJECTYAML_H
 #define LLVM_OBJECTYAML_OBJECTYAML_H
 
-#include "llvm/Support/YAMLTraits.h"
-#include "llvm/ObjectYAML/ELFYAML.h"
 #include "llvm/ObjectYAML/COFFYAML.h"
+#include "llvm/ObjectYAML/ELFYAML.h"
 #include "llvm/ObjectYAML/MachOYAML.h"
+#include "llvm/ObjectYAML/WasmYAML.h"
+#include "llvm/Support/YAMLTraits.h"
 
 namespace llvm {
 namespace yaml {
@@ -23,6 +24,7 @@ struct YamlObjectFile {
   std::unique_ptr<COFFYAML::Object> Coff;
   std::unique_ptr<MachOYAML::Object> MachO;
   std::unique_ptr<MachOYAML::UniversalBinary> FatMachO;
+  std::unique_ptr<WasmYAML::Object> Wasm;
 };
 
 template <> struct MappingTraits<YamlObjectFile> {
diff --git a/include/llvm/ObjectYAML/WasmYAML.h b/include/llvm/ObjectYAML/WasmYAML.h
new file mode 100644
index 0000000000000000000000000000000000000000..b1af8bbdfa6e97c9b79b52b066973aed9c688c8e
--- /dev/null
+++ b/include/llvm/ObjectYAML/WasmYAML.h
@@ -0,0 +1,339 @@
+//===- WasmYAML.h - Wasm YAMLIO implementation ------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file declares classes for handling the YAML representation
+/// of wasm binaries.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJECTYAML_WASMYAML_H
+#define LLVM_OBJECTYAML_WASMYAML_H
+
+#include "llvm/ObjectYAML/YAML.h"
+#include "llvm/Support/Wasm.h"
+
+namespace llvm {
+namespace WasmYAML {
+
+LLVM_YAML_STRONG_TYPEDEF(uint32_t, SectionType)
+LLVM_YAML_STRONG_TYPEDEF(int32_t, ValueType)
+LLVM_YAML_STRONG_TYPEDEF(int32_t, TableType)
+LLVM_YAML_STRONG_TYPEDEF(int32_t, SignatureForm)
+LLVM_YAML_STRONG_TYPEDEF(uint32_t, ExportKind)
+LLVM_YAML_STRONG_TYPEDEF(uint32_t, Opcode)
+LLVM_YAML_STRONG_TYPEDEF(uint32_t, RelocType)
+
+struct FileHeader {
+  yaml::Hex32 Version;
+};
+
+struct Import {
+  StringRef Module;
+  StringRef Field;
+  ExportKind Kind;
+  union {
+    uint32_t SigIndex;
+    ValueType GlobalType;
+  };
+  bool GlobalMutable;
+};
+
+struct Limits {
+  yaml::Hex32 Flags;
+  yaml::Hex32 Initial;
+  yaml::Hex32 Maximum;
+};
+
+struct Table {
+  TableType ElemType;
+  Limits TableLimits;
+};
+
+struct Export {
+  StringRef Name;
+  ExportKind Kind;
+  uint32_t Index;
+};
+
+struct ElemSegment {
+  uint32_t TableIndex;
+  wasm::WasmInitExpr Offset;
+  std::vector<uint32_t> Functions;
+};
+
+struct Global {
+  ValueType Type;
+  bool Mutable;
+  wasm::WasmInitExpr InitExpr;
+};
+
+struct LocalDecl {
+  ValueType Type;
+  uint32_t Count;
+};
+
+struct Function {
+  std::vector<LocalDecl> Locals;
+  yaml::BinaryRef Body;
+};
+
+struct Relocation {
+  RelocType Type;
+  uint32_t Index;
+  yaml::Hex32 Offset;
+  yaml::Hex32 Addend;
+};
+
+struct DataSegment {
+  uint32_t Index;
+  wasm::WasmInitExpr Offset;
+  yaml::BinaryRef Content;
+};
+
+struct Signature {
+  Signature() : Form(wasm::WASM_TYPE_FUNC) {} // only Form is initialized
+
+  uint32_t Index;       // not set by the constructor
+  SignatureForm Form;
+  std::vector<ValueType> ParamTypes;
+  ValueType ReturnType; // not set by the constructor
+};
+
+struct Section { // polymorphic base of the per-kind section structs
+  explicit Section(SectionType SecType) : Type(SecType) {}
+  virtual ~Section();
+  // Section kind tag; each subclass's classof() compares against it.
+  SectionType Type;
+  std::vector<Relocation> Relocations;
+};
+
+struct CustomSection : Section {
+  CustomSection() : Section(wasm::WASM_SEC_CUSTOM) {}
+  static bool classof(const Section *S) {
+    return S->Type == wasm::WASM_SEC_CUSTOM;
+  }
+
+  StringRef Name;
+  yaml::BinaryRef Payload;
+};
+
+struct TypeSection : Section {
+  TypeSection() : Section(wasm::WASM_SEC_TYPE) {}
+  static bool classof(const Section *S) {
+    return S->Type == wasm::WASM_SEC_TYPE;
+  }
+
+  std::vector<Signature> Signatures;
+};
+
+struct ImportSection : Section {
+  ImportSection() : Section(wasm::WASM_SEC_IMPORT) {}
+  static bool classof(const Section *S) {
+    return S->Type == wasm::WASM_SEC_IMPORT;
+  }
+
+  std::vector<Import> Imports;
+};
+
+struct FunctionSection : Section {
+  FunctionSection() : Section(wasm::WASM_SEC_FUNCTION) {}
+  static bool classof(const Section *S) {
+    return S->Type == wasm::WASM_SEC_FUNCTION;
+  }
+
+  std::vector<uint32_t> FunctionTypes;
+};
+
+struct TableSection : Section {
+  TableSection() : Section(wasm::WASM_SEC_TABLE) {}
+  static bool classof(const Section *S) {
+    return S->Type == wasm::WASM_SEC_TABLE;
+  }
+
+  std::vector<Table> Tables;
+};
+
+struct MemorySection : Section {
+  MemorySection() : Section(wasm::WASM_SEC_MEMORY) {}
+  static bool classof(const Section *S) {
+    return S->Type == wasm::WASM_SEC_MEMORY;
+  }
+
+  std::vector<Limits> Memories;
+};
+
+struct GlobalSection : Section {
+  GlobalSection() : Section(wasm::WASM_SEC_GLOBAL) {}
+  static bool classof(const Section *S) {
+    return S->Type == wasm::WASM_SEC_GLOBAL;
+  }
+
+  std::vector<Global> Globals;
+};
+
+struct ExportSection : Section {
+  ExportSection() : Section(wasm::WASM_SEC_EXPORT) {}
+  static bool classof(const Section *S) {
+    return S->Type == wasm::WASM_SEC_EXPORT;
+  }
+
+  std::vector<Export> Exports;
+};
+
+struct StartSection : Section {
+  StartSection() : Section(wasm::WASM_SEC_START) {}
+  static bool classof(const Section *S) {
+    return S->Type == wasm::WASM_SEC_START;
+  }
+
+  uint32_t StartFunction;
+};
+
+struct ElemSection : Section {
+  ElemSection() : Section(wasm::WASM_SEC_ELEM) {}
+  static bool classof(const Section *S) {
+    return S->Type == wasm::WASM_SEC_ELEM;
+  }
+
+  std::vector<ElemSegment> Segments;
+};
+
+struct CodeSection : Section {
+  CodeSection() : Section(wasm::WASM_SEC_CODE) {}
+  static bool classof(const Section *S) {
+    return S->Type == wasm::WASM_SEC_CODE;
+  }
+
+  std::vector<Function> Functions;
+};
+
+struct DataSection : Section {
+  DataSection() : Section(wasm::WASM_SEC_DATA) {}
+  static bool classof(const Section *S) {
+    return S->Type == wasm::WASM_SEC_DATA;
+  }
+
+  std::vector<DataSegment> Segments;
+};
+
+struct Object {
+  FileHeader Header;
+  std::vector<std::unique_ptr<Section>> Sections;
+};
+
+} // end namespace WasmYAML
+} // end namespace llvm
+
+LLVM_YAML_IS_SEQUENCE_VECTOR(std::unique_ptr<llvm::WasmYAML::Section>)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::Signature)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::ValueType)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::Table)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::Import)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::Export)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::ElemSegment)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::Limits)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::DataSegment)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::Global)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::Function)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::LocalDecl)
+LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::WasmYAML::Relocation)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t)
+
+namespace llvm {
+namespace yaml {
+
+template <> struct MappingTraits<WasmYAML::FileHeader> {
+  static void mapping(IO &IO, WasmYAML::FileHeader &FileHdr);
+};
+
+template <> struct MappingTraits<std::unique_ptr<WasmYAML::Section>> {
+  static void mapping(IO &IO, std::unique_ptr<WasmYAML::Section> &Section);
+};
+
+template <> struct MappingTraits<WasmYAML::Object> {
+  static void mapping(IO &IO, WasmYAML::Object &Object);
+};
+
+template <> struct MappingTraits<WasmYAML::Import> {
+  static void mapping(IO &IO, WasmYAML::Import &Import);
+};
+
+template <> struct MappingTraits<WasmYAML::Export> {
+  static void mapping(IO &IO, WasmYAML::Export &Export);
+};
+
+template <> struct MappingTraits<WasmYAML::Global> {
+  static void mapping(IO &IO, WasmYAML::Global &Global);
+};
+
+template <> struct ScalarEnumerationTraits<WasmYAML::SectionType> {
+  static void enumeration(IO &IO, WasmYAML::SectionType &Type);
+};
+
+template <> struct MappingTraits<WasmYAML::Signature> {
+  static void mapping(IO &IO, WasmYAML::Signature &Signature);
+};
+
+template <> struct MappingTraits<WasmYAML::Table> {
+  static void mapping(IO &IO, WasmYAML::Table &Table);
+};
+
+template <> struct MappingTraits<WasmYAML::Limits> {
+  static void mapping(IO &IO, WasmYAML::Limits &Limits);
+};
+
+template <> struct MappingTraits<WasmYAML::Function> {
+  static void mapping(IO &IO, WasmYAML::Function &Function);
+};
+
+template <> struct MappingTraits<WasmYAML::Relocation> {
+  static void mapping(IO &IO, WasmYAML::Relocation &Relocation);
+};
+
+template <> struct MappingTraits<WasmYAML::LocalDecl> {
+  static void mapping(IO &IO, WasmYAML::LocalDecl &LocalDecl);
+};
+
+template <> struct MappingTraits<wasm::WasmInitExpr> {
+  static void mapping(IO &IO, wasm::WasmInitExpr &Expr);
+};
+
+template <> struct MappingTraits<WasmYAML::DataSegment> {
+  static void mapping(IO &IO, WasmYAML::DataSegment &Segment);
+};
+
+template <> struct MappingTraits<WasmYAML::ElemSegment> {
+  static void mapping(IO &IO, WasmYAML::ElemSegment &Segment);
+};
+
+template <> struct ScalarEnumerationTraits<WasmYAML::ValueType> {
+  static void enumeration(IO &IO, WasmYAML::ValueType &Type);
+};
+
+template <> struct ScalarEnumerationTraits<WasmYAML::ExportKind> {
+  static void enumeration(IO &IO, WasmYAML::ExportKind &Kind);
+};
+
+template <> struct ScalarEnumerationTraits<WasmYAML::TableType> {
+  static void enumeration(IO &IO, WasmYAML::TableType &Type);
+};
+
+template <> struct ScalarEnumerationTraits<WasmYAML::Opcode> {
+  static void enumeration(IO &IO, WasmYAML::Opcode &Opcode);
+};
+
+template <> struct ScalarEnumerationTraits<WasmYAML::RelocType> {
+  static void enumeration(IO &IO, WasmYAML::RelocType &Kind);
+};
+
+} // end namespace yaml
+} // end namespace llvm
+
+#endif
diff --git a/include/llvm/PassSupport.h b/include/llvm/PassSupport.h
index e77a0b9882b2e8266a72f901f95ba6c047f6d4f3..852d79fbd443593c907f8c9ae2f822229c526bf3 100644
--- a/include/llvm/PassSupport.h
+++ b/include/llvm/PassSupport.h
@@ -41,7 +41,7 @@ class TargetMachine;
     Registry.registerPass(*PI, true);                                          \
     return PI;                                                                 \
   }                                                                            \
-  LLVM_DEFINE_ONCE_FLAG(Initialize##passName##PassFlag);                       \
+  static llvm::once_flag Initialize##passName##PassFlag;                       \
   void llvm::initialize##passName##Pass(PassRegistry &Registry) {              \
     llvm::call_once(Initialize##passName##PassFlag,                            \
                     initialize##passName##PassOnce, std::ref(Registry));       \
@@ -61,7 +61,7 @@ class TargetMachine;
   Registry.registerPass(*PI, true);                                            \
   return PI;                                                                   \
   }                                                                            \
-  LLVM_DEFINE_ONCE_FLAG(Initialize##passName##PassFlag);                       \
+  static llvm::once_flag Initialize##passName##PassFlag;                       \
   void llvm::initialize##passName##Pass(PassRegistry &Registry) {              \
     llvm::call_once(Initialize##passName##PassFlag,                            \
                     initialize##passName##PassOnce, std::ref(Registry));       \
@@ -152,7 +152,7 @@ struct RegisterAnalysisGroup : public RegisterAGBase {
     Registry.registerAnalysisGroup(&agName::ID, 0, *AI, false, true);          \
     return AI;                                                                 \
   }                                                                            \
-  LLVM_DEFINE_ONCE_FLAG(Initialize##agName##AnalysisGroupFlag);                \
+  static llvm::once_flag Initialize##agName##AnalysisGroupFlag;                \
   void llvm::initialize##agName##AnalysisGroup(PassRegistry &Registry) {       \
     llvm::call_once(Initialize##agName##AnalysisGroupFlag,                     \
                     initialize##agName##AnalysisGroupOnce,                     \
@@ -173,7 +173,7 @@ struct RegisterAnalysisGroup : public RegisterAGBase {
                                    true);                                      \
     return AI;                                                                 \
   }                                                                            \
-  LLVM_DEFINE_ONCE_FLAG(Initialize##passName##PassFlag);                       \
+  static llvm::once_flag Initialize##passName##PassFlag;                       \
   void llvm::initialize##passName##Pass(PassRegistry &Registry) {              \
     llvm::call_once(Initialize##passName##PassFlag,                            \
                     initialize##passName##PassOnce, std::ref(Registry));       \
@@ -194,7 +194,7 @@ struct RegisterAnalysisGroup : public RegisterAGBase {
   Registry.registerAnalysisGroup(&agName::ID, &passName::ID, *AI, def, true);  \
   return AI;                                                                   \
   }                                                                            \
-  LLVM_DEFINE_ONCE_FLAG(Initialize##passName##PassFlag);                       \
+  static llvm::once_flag Initialize##passName##PassFlag;                       \
   void llvm::initialize##passName##Pass(PassRegistry &Registry) {              \
     llvm::call_once(Initialize##passName##PassFlag,                            \
                     initialize##passName##PassOnce, std::ref(Registry));       \
diff --git a/include/llvm/Passes/PassBuilder.h b/include/llvm/Passes/PassBuilder.h
index d76c13984d11c597ea049b13806eb6e66825034e..efa36d957fbd69be288ab806a46e69d8b75dd996 100644
--- a/include/llvm/Passes/PassBuilder.h
+++ b/include/llvm/Passes/PassBuilder.h
@@ -27,6 +27,14 @@ class StringRef;
 class AAManager;
 class TargetMachine;
 
+/// Bundle of profile-guided optimization (PGO) settings.
+struct PGOOptions {
+  std::string ProfileGenFile; // empty by default
+  std::string ProfileUseFile; // empty by default
+  bool RunProfileGen{false};
+  bool SamplePGO{false};
+};
+
 /// \brief This class provides access to building LLVM's passes.
 ///
 /// It's members provide the baseline state available to passes during their
@@ -35,6 +43,7 @@ class TargetMachine;
 /// construction.
 class PassBuilder {
   TargetMachine *TM;
+  Optional<PGOOptions> PGOOpt;
 
 public:
   /// \brief LLVM-provided high-level optimization levels.
@@ -123,7 +132,9 @@ public:
     Oz
   };
 
-  explicit PassBuilder(TargetMachine *TM = nullptr) : TM(TM) {}
+  explicit PassBuilder(TargetMachine *TM = nullptr,
+                       Optional<PGOOptions> PGOOpt = None)
+      : TM(TM), PGOOpt(PGOOpt) {}
 
   /// \brief Cross register the analysis managers through their proxies.
   ///
diff --git a/include/llvm/ProfileData/Coverage/CoverageMapping.h b/include/llvm/ProfileData/Coverage/CoverageMapping.h
index d6051ffb3f8d0e8edee59a487bf4f07c97972b6b..b9a9f53776984366dd7dd03784c5e16d29b975da 100644
--- a/include/llvm/ProfileData/Coverage/CoverageMapping.h
+++ b/include/llvm/ProfileData/Coverage/CoverageMapping.h
@@ -1,4 +1,4 @@
-//=-- CoverageMapping.h - Code coverage mapping support ---------*- C++ -*-=//
+//===- CoverageMapping.h - Code coverage mapping support --------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,25 +12,42 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_PROFILEDATA_COVERAGEMAPPING_H_
-#define LLVM_PROFILEDATA_COVERAGEMAPPING_H_
+#ifndef LLVM_PROFILEDATA_COVERAGE_COVERAGEMAPPING_H
+#define LLVM_PROFILEDATA_COVERAGE_COVERAGEMAPPING_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/StringSet.h"
-#include "llvm/ADT/Triple.h"
 #include "llvm/ADT/iterator.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/StringSet.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <string>
 #include <system_error>
 #include <tuple>
+#include <utility>
+#include <vector>
 
 namespace llvm {
+
+class IndexedInstrProfReader;
+
 namespace coverage {
 
+class CoverageMappingReader;
+struct CoverageMappingRecord;
+
 enum class coveragemap_error {
   success = 0,
   eof,
@@ -68,19 +85,6 @@ private:
   coveragemap_error Err;
 };
 
-} // end of coverage namespace.
-} // end of llvm namespace
-
-namespace llvm {
-class IndexedInstrProfReader;
-namespace coverage {
-
-class CoverageMappingReader;
-struct CoverageMappingRecord;
-
-class CoverageMapping;
-struct CounterExpressions;
-
 /// \brief A Counter is an abstract value that describes how to compute the
 /// execution count for a region of code using the collected profile count data.
 struct Counter {
@@ -91,13 +95,13 @@ struct Counter {
       EncodingTagBits + 1;
 
 private:
-  CounterKind Kind;
-  unsigned ID;
+  CounterKind Kind = Zero;
+  unsigned ID = 0;
 
   Counter(CounterKind Kind, unsigned ID) : Kind(Kind), ID(ID) {}
 
 public:
-  Counter() : Kind(Zero), ID(0) {}
+  Counter() = default;
 
   CounterKind getKind() const { return Kind; }
 
@@ -153,8 +157,9 @@ struct CounterExpression {
 class CounterExpressionBuilder {
   /// \brief A list of all the counter expressions
   std::vector<CounterExpression> Expressions;
+
   /// \brief A lookup table for the index of a given expression.
-  llvm::DenseMap<CounterExpression, unsigned> ExpressionIndices;
+  DenseMap<CounterExpression, unsigned> ExpressionIndices;
 
   /// \brief Return the counter which corresponds to the given expression.
   ///
@@ -238,7 +243,6 @@ struct CounterMappingRegion {
                                 LineEnd, ColumnEnd, SkippedRegion);
   }
 
-
   inline std::pair<unsigned, unsigned> startLoc() const {
     return std::pair<unsigned, unsigned>(LineStart, ColumnStart);
   }
@@ -269,7 +273,7 @@ public:
 
   void setCounts(ArrayRef<uint64_t> Counts) { CounterValues = Counts; }
 
-  void dump(const Counter &C, llvm::raw_ostream &OS) const;
+  void dump(const Counter &C, raw_ostream &OS) const;
   void dump(const Counter &C) const { dump(C, dbgs()); }
 
   /// \brief Return the number of times that a region of code associated with
@@ -390,13 +394,14 @@ struct CoverageSegment {
 /// provides a sequence of CoverageSegments to iterate through, as well as the
 /// list of expansions that can be further processed.
 class CoverageData {
+  friend class CoverageMapping;
+
   std::string Filename;
   std::vector<CoverageSegment> Segments;
   std::vector<ExpansionRecord> Expansions;
-  friend class CoverageMapping;
 
 public:
-  CoverageData() {}
+  CoverageData() = default;
 
   CoverageData(StringRef Filename) : Filename(Filename) {}
 
@@ -422,18 +427,17 @@ public:
 class CoverageMapping {
   StringSet<> FunctionNames;
   std::vector<FunctionRecord> Functions;
-  unsigned MismatchedFunctionCount;
-
-  CoverageMapping() : MismatchedFunctionCount(0) {}
-
-  CoverageMapping(const CoverageMapping &) = delete;
-  const CoverageMapping &operator=(const CoverageMapping &) = delete;
+  unsigned MismatchedFunctionCount = 0;
 
+  CoverageMapping() = default;
   /// \brief Add a function record corresponding to \p Record.
   Error loadFunctionRecord(const CoverageMappingRecord &Record,
                            IndexedInstrProfReader &ProfileReader);
 
 public:
+  CoverageMapping(const CoverageMapping &) = delete;
+  CoverageMapping &operator=(const CoverageMapping &) = delete;
+
   /// \brief Load the coverage mapping using the given readers.
   static Expected<std::unique_ptr<CoverageMapping>>
   load(CoverageMappingReader &CoverageReader,
@@ -517,14 +521,17 @@ template <class IntPtrT> struct CovMapFunctionRecordV1 {
   template <support::endianness Endian> uint64_t getFuncHash() const {
     return support::endian::byte_swap<uint64_t, Endian>(FuncHash);
   }
+
   // Return the coverage map data size for the funciton.
   template <support::endianness Endian> uint32_t getDataSize() const {
     return support::endian::byte_swap<uint32_t, Endian>(DataSize);
   }
+
   // Return function lookup key. The value is consider opaque.
   template <support::endianness Endian> IntPtrT getFuncNameRef() const {
     return support::endian::byte_swap<IntPtrT, Endian>(NamePtr);
   }
+
   // Return the PGO name of the function */
   template <support::endianness Endian>
   Error getFuncName(InstrProfSymtab &ProfileNames, StringRef &FuncName) const {
@@ -545,14 +552,17 @@ struct CovMapFunctionRecord {
   template <support::endianness Endian> uint64_t getFuncHash() const {
     return support::endian::byte_swap<uint64_t, Endian>(FuncHash);
   }
+
   // Return the coverage map data size for the funciton.
   template <support::endianness Endian> uint32_t getDataSize() const {
     return support::endian::byte_swap<uint32_t, Endian>(DataSize);
   }
+
   // Return function lookup key. The value is consider opaque.
   template <support::endianness Endian> uint64_t getFuncNameRef() const {
     return support::endian::byte_swap<uint64_t, Endian>(NameRef);
   }
+
   // Return the PGO name of the function */
   template <support::endianness Endian>
   Error getFuncName(InstrProfSymtab &ProfileNames, StringRef &FuncName) const {
@@ -570,12 +580,15 @@ struct CovMapHeader {
   template <support::endianness Endian> uint32_t getNRecords() const {
     return support::endian::byte_swap<uint32_t, Endian>(NRecords);
   }
+
   template <support::endianness Endian> uint32_t getFilenamesSize() const {
     return support::endian::byte_swap<uint32_t, Endian>(FilenamesSize);
   }
+
   template <support::endianness Endian> uint32_t getCoverageSize() const {
     return support::endian::byte_swap<uint32_t, Endian>(CoverageSize);
   }
+
   template <support::endianness Endian> uint32_t getVersion() const {
     return support::endian::byte_swap<uint32_t, Endian>(Version);
   }
@@ -635,4 +648,4 @@ template<> struct DenseMapInfo<coverage::CounterExpression> {
 
 } // end namespace llvm
 
-#endif // LLVM_PROFILEDATA_COVERAGEMAPPING_H_
+#endif // LLVM_PROFILEDATA_COVERAGE_COVERAGEMAPPING_H
diff --git a/include/llvm/ProfileData/Coverage/CoverageMappingReader.h b/include/llvm/ProfileData/Coverage/CoverageMappingReader.h
index db907f128d931e5edb7211cbed5ee2b43d1b856e..5b372252a9ac9698e157b987962b4c9b18716caf 100644
--- a/include/llvm/ProfileData/Coverage/CoverageMappingReader.h
+++ b/include/llvm/ProfileData/Coverage/CoverageMappingReader.h
@@ -1,4 +1,4 @@
-//=-- CoverageMappingReader.h - Code coverage mapping reader ------*- C++ -*-=//
+//===- CoverageMappingReader.h - Code coverage mapping reader ---*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,18 +12,20 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_PROFILEDATA_COVERAGEMAPPINGREADER_H
-#define LLVM_PROFILEDATA_COVERAGEMAPPINGREADER_H
+#ifndef LLVM_PROFILEDATA_COVERAGE_COVERAGEMAPPINGREADER_H
+#define LLVM_PROFILEDATA_COVERAGE_COVERAGEMAPPINGREADER_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
-#include "llvm/Object/ObjectFile.h"
 #include "llvm/ProfileData/Coverage/CoverageMapping.h"
 #include "llvm/ProfileData/InstrProf.h"
-#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include <cstddef>
+#include <cstdint>
 #include <iterator>
+#include <memory>
+#include <vector>
 
 namespace llvm {
 namespace coverage {
@@ -42,13 +44,14 @@ struct CoverageMappingRecord {
 /// \brief A file format agnostic iterator over coverage mapping data.
 class CoverageMappingIterator
     : public std::iterator<std::input_iterator_tag, CoverageMappingRecord> {
-  CoverageMappingReader *Reader;
+  CoverageMappingReader *Reader = nullptr;
   CoverageMappingRecord Record;
 
   void increment();
 
 public:
-  CoverageMappingIterator() : Reader(nullptr) {}
+  CoverageMappingIterator() = default;
+
   CoverageMappingIterator(CoverageMappingReader *Reader) : Reader(Reader) {
     increment();
   }
@@ -69,10 +72,11 @@ public:
 
 class CoverageMappingReader {
 public:
+  virtual ~CoverageMappingReader() = default;
+
   virtual Error readNextRecord(CoverageMappingRecord &Record) = 0;
   CoverageMappingIterator begin() { return CoverageMappingIterator(this); }
   CoverageMappingIterator end() { return CoverageMappingIterator(); }
-  virtual ~CoverageMappingReader() {}
 };
 
 /// \brief Base class for the raw coverage mapping and filenames data readers.
@@ -92,13 +96,12 @@ protected:
 class RawCoverageFilenamesReader : public RawCoverageReader {
   std::vector<StringRef> &Filenames;
 
-  RawCoverageFilenamesReader(const RawCoverageFilenamesReader &) = delete;
-  RawCoverageFilenamesReader &
-  operator=(const RawCoverageFilenamesReader &) = delete;
-
 public:
   RawCoverageFilenamesReader(StringRef Data, std::vector<StringRef> &Filenames)
       : RawCoverageReader(Data), Filenames(Filenames) {}
+  RawCoverageFilenamesReader(const RawCoverageFilenamesReader &) = delete;
+  RawCoverageFilenamesReader &
+  operator=(const RawCoverageFilenamesReader &) = delete;
 
   Error read();
 };
@@ -120,10 +123,6 @@ class RawCoverageMappingReader : public RawCoverageReader {
   std::vector<CounterExpression> &Expressions;
   std::vector<CounterMappingRegion> &MappingRegions;
 
-  RawCoverageMappingReader(const RawCoverageMappingReader &) = delete;
-  RawCoverageMappingReader &
-  operator=(const RawCoverageMappingReader &) = delete;
-
 public:
   RawCoverageMappingReader(StringRef MappingData,
                            ArrayRef<StringRef> TranslationUnitFilenames,
@@ -134,6 +133,9 @@ public:
         TranslationUnitFilenames(TranslationUnitFilenames),
         Filenames(Filenames), Expressions(Expressions),
         MappingRegions(MappingRegions) {}
+  RawCoverageMappingReader(const RawCoverageMappingReader &) = delete;
+  RawCoverageMappingReader &
+  operator=(const RawCoverageMappingReader &) = delete;
 
   Error read();
 
@@ -169,17 +171,17 @@ private:
   std::vector<StringRef> Filenames;
   std::vector<ProfileMappingRecord> MappingRecords;
   InstrProfSymtab ProfileNames;
-  size_t CurrentRecord;
+  size_t CurrentRecord = 0;
   std::vector<StringRef> FunctionsFilenames;
   std::vector<CounterExpression> Expressions;
   std::vector<CounterMappingRegion> MappingRegions;
 
+  BinaryCoverageReader() = default;
+
+public:
   BinaryCoverageReader(const BinaryCoverageReader &) = delete;
   BinaryCoverageReader &operator=(const BinaryCoverageReader &) = delete;
 
-  BinaryCoverageReader() : CurrentRecord(0) {}
-
-public:
   static Expected<std::unique_ptr<BinaryCoverageReader>>
   create(std::unique_ptr<MemoryBuffer> &ObjectBuffer,
          StringRef Arch);
@@ -190,4 +192,4 @@ public:
 } // end namespace coverage
 } // end namespace llvm
 
-#endif
+#endif // LLVM_PROFILEDATA_COVERAGE_COVERAGEMAPPINGREADER_H
diff --git a/include/llvm/ProfileData/Coverage/CoverageMappingWriter.h b/include/llvm/ProfileData/Coverage/CoverageMappingWriter.h
index 24fb94647247c26da60a6fa49603da350c99e46c..b6f864ab3de38f88840ff09d826eb8459ec470d1 100644
--- a/include/llvm/ProfileData/Coverage/CoverageMappingWriter.h
+++ b/include/llvm/ProfileData/Coverage/CoverageMappingWriter.h
@@ -1,4 +1,4 @@
-//=-- CoverageMappingWriter.h - Code coverage mapping writer ------*- C++ -*-=//
+//===- CoverageMappingWriter.h - Code coverage mapping writer ---*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,15 +12,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_PROFILEDATA_COVERAGEMAPPINGWRITER_H
-#define LLVM_PROFILEDATA_COVERAGEMAPPINGWRITER_H
+#ifndef LLVM_PROFILEDATA_COVERAGE_COVERAGEMAPPINGWRITER_H
+#define LLVM_PROFILEDATA_COVERAGE_COVERAGEMAPPINGWRITER_H
 
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ProfileData/Coverage/CoverageMapping.h"
-#include "llvm/Support/raw_ostream.h"
 
 namespace llvm {
+
+class raw_ostream;
+
 namespace coverage {
 
 /// \brief Writer of the filenames section for the instrumentation
@@ -54,6 +56,7 @@ public:
 };
 
 } // end namespace coverage
+
 } // end namespace llvm
 
-#endif
+#endif // LLVM_PROFILEDATA_COVERAGE_COVERAGEMAPPINGWRITER_H
diff --git a/include/llvm/ProfileData/InstrProf.h b/include/llvm/ProfileData/InstrProf.h
index c7e558efa3dcc114591c4408518d0ef644227aa3..f97bbfd9e0d8ee7b95d21b05ca5a14fdb81eca5e 100644
--- a/include/llvm/ProfileData/InstrProf.h
+++ b/include/llvm/ProfileData/InstrProf.h
@@ -1,4 +1,4 @@
-//===-- InstrProf.h - Instrumented profiling format support -----*- C++ -*-===//
+//===- InstrProf.h - Instrumented profiling format support ------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -16,26 +16,41 @@
 #ifndef LLVM_PROFILEDATA_INSTRPROF_H
 #define LLVM_PROFILEDATA_INSTRPROF_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSet.h"
 #include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/Metadata.h"
+#include "llvm/IR/ProfileSummary.h"
 #include "llvm/ProfileData/InstrProfData.inc"
-#include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
 #include "llvm/Support/MD5.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
 #include <cstdint>
+#include <cstring>
 #include <list>
+#include <memory>
+#include <string>
 #include <system_error>
+#include <utility>
 #include <vector>
 
 namespace llvm {
 
 class Function;
 class GlobalVariable;
+struct InstrProfRecord;
+class InstrProfSymtab;
+class Instruction;
+class MDNode;
 class Module;
 
 /// Return the name of data section containing profile counter variables.
@@ -79,6 +94,11 @@ inline StringRef getInstrProfValueProfFuncName() {
   return INSTR_PROF_VALUE_PROF_FUNC_STR;
 }
 
+/// Return the name of the profile runtime's value range profiling entry point.
+inline StringRef getInstrProfValueRangeProfFuncName() {
+  return INSTR_PROF_VALUE_RANGE_PROF_FUNC_STR;
+}
+
 /// Return the name of the section containing function coverage mapping
 /// data.
 inline StringRef getInstrProfCoverageSectionName(bool AddSegment) {
@@ -201,6 +221,7 @@ GlobalVariable *createPGOFuncNameVar(Function &F, StringRef PGOFuncName);
 GlobalVariable *createPGOFuncNameVar(Module &M,
                                      GlobalValue::LinkageTypes Linkage,
                                      StringRef PGOFuncName);
+
 /// Return the initializer in string of the PGO name var \c NameVar.
 StringRef getPGOFuncNameVarInitializer(GlobalVariable *NameVar);
 
@@ -220,11 +241,12 @@ StringRef getFuncNameWithoutPrefix(StringRef PGOFuncName,
 /// second field will have value zero.
 Error collectPGOFuncNameStrings(const std::vector<std::string> &NameStrs,
                                 bool doCompression, std::string &Result);
+
 /// Produce \c Result string with the same format described above. The input
 /// is vector of PGO function name variables that are referenced.
 Error collectPGOFuncNameStrings(const std::vector<GlobalVariable *> &NameVars,
                                 std::string &Result, bool doCompression = true);
-class InstrProfSymtab;
+
 /// \c NameStrings is a string composed of one of more sub-strings encoded in
 /// the format described above. The substrings are separated by 0 or more zero
 /// bytes. This method decodes the string and populates the \c Symtab.
@@ -244,8 +266,6 @@ enum InstrProfValueKind : uint32_t {
 #include "llvm/ProfileData/InstrProfData.inc"
 };
 
-struct InstrProfRecord;
-
 /// Get the value profile data for value site \p SiteIdx from \p InstrProfR
 /// and annotate the instruction \p Inst with the value profile meta data.
 /// Annotate up to \p MaxMDCount (default 3) number of records per value site.
@@ -253,6 +273,7 @@ void annotateValueSite(Module &M, Instruction &Inst,
                        const InstrProfRecord &InstrProfR,
                        InstrProfValueKind ValueKind, uint32_t SiteIndx,
                        uint32_t MaxMDCount = 3);
+
 /// Same as the above interface but using an ArrayRef, as well as \p Sum.
 void annotateValueSite(Module &M, Instruction &Inst,
                        ArrayRef<InstrProfValueData> VDs,
@@ -347,25 +368,22 @@ class SoftInstrProfErrors {
   /// the first such error for reporting purposes.
 
   /// The first soft error encountered.
-  instrprof_error FirstError;
+  instrprof_error FirstError = instrprof_error::success;
 
   /// The number of hash mismatches.
-  unsigned NumHashMismatches;
+  unsigned NumHashMismatches = 0;
 
   /// The number of count mismatches.
-  unsigned NumCountMismatches;
+  unsigned NumCountMismatches = 0;
 
   /// The number of counter overflows.
-  unsigned NumCounterOverflows;
+  unsigned NumCounterOverflows = 0;
 
   /// The number of value site count mismatches.
-  unsigned NumValueSiteCountMismatches;
+  unsigned NumValueSiteCountMismatches = 0;
 
 public:
-  SoftInstrProfErrors()
-      : FirstError(instrprof_error::success), NumHashMismatches(0),
-        NumCountMismatches(0), NumCounterOverflows(0),
-        NumValueSiteCountMismatches(0) {}
+  SoftInstrProfErrors() = default;
 
   ~SoftInstrProfErrors() {
     assert(FirstError == instrprof_error::success &&
@@ -401,12 +419,16 @@ public:
 };
 
 namespace object {
+
 class SectionRef;
-}
+
+} // end namespace object
 
 namespace IndexedInstrProf {
+
 uint64_t ComputeHash(StringRef K);
-}
+
+} // end namespace IndexedInstrProf
 
 /// A symbol table used for function PGO name look-up with keys
 /// (such as pointers, md5hash values) to the function. A function's
@@ -419,7 +441,7 @@ public:
 
 private:
   StringRef Data;
-  uint64_t Address;
+  uint64_t Address = 0;
   // Unique name strings.
   StringSet<> NameTab;
   // A map from MD5 keys to function name strings.
@@ -432,9 +454,7 @@ private:
   AddrHashMap AddrToMD5Map;
 
 public:
-  InstrProfSymtab()
-      : Data(), Address(0), NameTab(), MD5NameMap(), MD5FuncMap(),
-      AddrToMD5Map() {}
+  InstrProfSymtab() = default; 
 
   /// Create InstrProfSymtab from an object file section which
   /// contains function PGO names. When section may contain raw
@@ -443,26 +463,32 @@ public:
   /// the section base address. The decompression will be delayed
   /// until before it is used. See also \c create(StringRef) method.
   Error create(object::SectionRef &Section);
+
   /// This interface is used by reader of CoverageMapping test
   /// format.
   inline Error create(StringRef D, uint64_t BaseAddr);
+
   /// \c NameStrings is a string composed of one of more sub-strings
   ///  encoded in the format described in \c collectPGOFuncNameStrings.
   /// This method is a wrapper to \c readPGOFuncNameStrings method.
   inline Error create(StringRef NameStrings);
+
   /// A wrapper interface to populate the PGO symtab with functions
   /// decls from module \c M. This interface is used by transformation
   /// passes such as indirect function call promotion. Variable \c InLTO
   /// indicates if this is called from LTO optimization passes.
   void create(Module &M, bool InLTO = false);
+
   /// Create InstrProfSymtab from a set of names iteratable from
   /// \p IterRange. This interface is used by IndexedProfReader.
   template <typename NameIterRange> void create(const NameIterRange &IterRange);
+
   // If the symtab is created by a series of calls to \c addFuncName, \c
   // finalizeSymtab needs to be called before looking up function names.
   // This is required because the underlying map is a vector (for space
   // efficiency) which needs to be sorted.
   inline void finalizeSymtab();
+
   /// Update the symtab by adding \p FuncName to the table. This interface
   /// is used by the raw and text profile readers.
   void addFuncName(StringRef FuncName) {
@@ -471,25 +497,32 @@ public:
       MD5NameMap.push_back(std::make_pair(
           IndexedInstrProf::ComputeHash(FuncName), Ins.first->getKey()));
   }
+
   /// Map a function address to its name's MD5 hash. This interface
   /// is only used by the raw profiler reader.
   void mapAddress(uint64_t Addr, uint64_t MD5Val) {
     AddrToMD5Map.push_back(std::make_pair(Addr, MD5Val));
   }
+
   AddrHashMap &getAddrHashMap() { return AddrToMD5Map; }
+
   /// Return function's PGO name from the function name's symbol
   /// address in the object file. If an error occurs, return
   /// an empty string.
   StringRef getFuncName(uint64_t FuncNameAddress, size_t NameSize);
+
   /// Return function's PGO name from the name's md5 hash value.
   /// If not found, return an empty string.
   inline StringRef getFuncName(uint64_t FuncMD5Hash);
+
   /// Return function from the name's md5 hash. Return nullptr if not found.
   inline Function *getFunction(uint64_t FuncMD5Hash);
+
   /// Return the function's original assembly name by stripping off
   /// the prefix attached (to symbols with priviate linkage). For
   /// global functions, it returns the same string as getFuncName.
   inline StringRef getOrigFuncName(uint64_t FuncMD5Hash);
+
   /// Return the name section data.
   inline StringRef getNameData() const { return Data; }
 };
@@ -579,40 +612,48 @@ struct InstrProfValueSiteRecord {
 
 /// Profiling information for a single function.
 struct InstrProfRecord {
-  InstrProfRecord() : SIPE() {}
-  InstrProfRecord(StringRef Name, uint64_t Hash, std::vector<uint64_t> Counts)
-      : Name(Name), Hash(Hash), Counts(std::move(Counts)), SIPE() {}
   StringRef Name;
   uint64_t Hash;
   std::vector<uint64_t> Counts;
   SoftInstrProfErrors SIPE;
 
+  InstrProfRecord() = default;
+  InstrProfRecord(StringRef Name, uint64_t Hash, std::vector<uint64_t> Counts)
+      : Name(Name), Hash(Hash), Counts(std::move(Counts)) {}
+
   typedef std::vector<std::pair<uint64_t, uint64_t>> ValueMapType;
 
   /// Return the number of value profile kinds with non-zero number
   /// of profile sites.
   inline uint32_t getNumValueKinds() const;
+
   /// Return the number of instrumented sites for ValueKind.
   inline uint32_t getNumValueSites(uint32_t ValueKind) const;
+
   /// Return the total number of ValueData for ValueKind.
   inline uint32_t getNumValueData(uint32_t ValueKind) const;
+
   /// Return the number of value data collected for ValueKind at profiling
   /// site: Site.
   inline uint32_t getNumValueDataForSite(uint32_t ValueKind,
                                          uint32_t Site) const;
+
   /// Return the array of profiled values at \p Site. If \p TotalC
   /// is not null, the total count of all target values at this site
   /// will be stored in \c *TotalC.
   inline std::unique_ptr<InstrProfValueData[]>
   getValueForSite(uint32_t ValueKind, uint32_t Site,
-                  uint64_t *TotalC = 0) const;
+                  uint64_t *TotalC = nullptr) const;
+
   /// Get the target value/counts of kind \p ValueKind collected at site
   /// \p Site and store the result in array \p Dest. Return the total
   /// counts of all target values at this site.
   inline uint64_t getValueForSite(InstrProfValueData Dest[], uint32_t ValueKind,
                                   uint32_t Site) const;
+
   /// Reserve space for NumValueSites sites.
   inline void reserveSites(uint32_t ValueKind, uint32_t NumValueSites);
+
   /// Add ValueData for ValueKind at value Site.
   void addValueData(uint32_t ValueKind, uint32_t Site,
                     InstrProfValueData *VData, uint32_t N,
@@ -635,6 +676,13 @@ struct InstrProfRecord {
         SR.sortByCount();
     }
   }
+
+  /// Clear value data entries and edge counters.
+  void Clear() {
+    Counts.clear();
+    clearValueData();
+  }
+
   /// Clear value data entries
   void clearValueData() {
     for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
@@ -646,11 +694,15 @@ struct InstrProfRecord {
 
 private:
   std::vector<InstrProfValueSiteRecord> IndirectCallSites;
+  std::vector<InstrProfValueSiteRecord> MemOPSizes;
   const std::vector<InstrProfValueSiteRecord> &
+
   getValueSitesForKind(uint32_t ValueKind) const {
     switch (ValueKind) {
     case IPVK_IndirectCallTarget:
       return IndirectCallSites;
+    case IPVK_MemOPSize:
+      return MemOPSizes;
     default:
       llvm_unreachable("Unknown value kind!");
     }
@@ -672,6 +724,7 @@ private:
   // Scale merged value counts by \p Weight.
   void mergeValueProfData(uint32_t ValueKind, InstrProfRecord &Src,
                           uint64_t Weight);
+
   // Scale up value profile data count.
   void scaleValueProfData(uint32_t ValueKind, uint64_t Weight);
 };
@@ -706,7 +759,7 @@ std::unique_ptr<InstrProfValueData[]>
 InstrProfRecord::getValueForSite(uint32_t ValueKind, uint32_t Site,
                                  uint64_t *TotalC) const {
   uint64_t Dummy;
-  uint64_t &TotalCount = (TotalC == 0 ? Dummy : *TotalC);
+  uint64_t &TotalCount = (TotalC == nullptr ? Dummy : *TotalC);
   uint32_t N = getNumValueDataForSite(ValueKind, Site);
   if (N == 0) {
     TotalCount = 0;
@@ -762,7 +815,6 @@ namespace IndexedInstrProf {
 
 enum class HashT : uint32_t {
   MD5,
-
   Last = MD5
 };
 
@@ -816,7 +868,6 @@ struct Header {
 // format. It is introduced in version 4. The summary data follows
 // right after the profile file header.
 struct Summary {
-
   struct Entry {
     uint64_t Cutoff; ///< The required percentile of total execution count.
     uint64_t
@@ -857,13 +908,16 @@ struct Summary {
   const uint64_t *getSummaryDataBase() const {
     return reinterpret_cast<const uint64_t *>(this + 1);
   }
+
   uint64_t *getSummaryDataBase() {
     return reinterpret_cast<uint64_t *>(this + 1);
   }
+
   const Entry *getCutoffEntryBase() const {
     return reinterpret_cast<const Entry *>(
         &getSummaryDataBase()[NumSummaryFields]);
   }
+
   Entry *getCutoffEntryBase() {
     return reinterpret_cast<Entry *>(&getSummaryDataBase()[NumSummaryFields]);
   }
@@ -877,6 +931,7 @@ struct Summary {
   }
 
   const Entry &getEntry(uint32_t I) const { return getCutoffEntryBase()[I]; }
+
   void setEntry(uint32_t I, const ProfileSummaryEntry &E) {
     Entry &ER = getCutoffEntryBase()[I];
     ER.Cutoff = E.Cutoff;
@@ -894,6 +949,7 @@ inline std::unique_ptr<Summary> allocSummary(uint32_t TotalSize) {
   return std::unique_ptr<Summary>(new (::operator new(TotalSize))
                                       Summary(TotalSize));
 }
+
 } // end namespace IndexedInstrProf
 
 namespace RawInstrProf {
@@ -937,6 +993,10 @@ struct Header {
 
 } // end namespace RawInstrProf
 
+// Parse MemOP Size range option.
+void getMemOPSizeRangeFromOption(std::string Str, int64_t &RangeStart,
+                                 int64_t &RangeLast);
+
 } // end namespace llvm
 
 #endif // LLVM_PROFILEDATA_INSTRPROF_H
diff --git a/include/llvm/ProfileData/InstrProfData.inc b/include/llvm/ProfileData/InstrProfData.inc
index f7c22d10763c5e266ef5be78de6085e5f7ba7f1e..6ef1625d81c42d094e1456adb81a872498a56bad 100644
--- a/include/llvm/ProfileData/InstrProfData.inc
+++ b/include/llvm/ProfileData/InstrProfData.inc
@@ -153,7 +153,17 @@ INSTR_PROF_RAW_HEADER(uint64_t, ValueKindLast, IPVK_Last)
 VALUE_PROF_FUNC_PARAM(uint64_t, TargetValue, Type::getInt64Ty(Ctx)) \
                       INSTR_PROF_COMMA
 VALUE_PROF_FUNC_PARAM(void *, Data, Type::getInt8PtrTy(Ctx)) INSTR_PROF_COMMA
+#ifndef VALUE_RANGE_PROF
 VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx))
+#else /* VALUE_RANGE_PROF */
+VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx)) \
+                      INSTR_PROF_COMMA
+VALUE_PROF_FUNC_PARAM(uint64_t, PreciseRangeStart, Type::getInt64Ty(Ctx)) \
+                      INSTR_PROF_COMMA
+VALUE_PROF_FUNC_PARAM(uint64_t, PreciseRangeLast, Type::getInt64Ty(Ctx)) \
+                      INSTR_PROF_COMMA
+VALUE_PROF_FUNC_PARAM(uint64_t, LargeValue, Type::getInt64Ty(Ctx))
+#endif /*VALUE_RANGE_PROF */
 #undef VALUE_PROF_FUNC_PARAM
 #undef INSTR_PROF_COMMA
 /* VALUE_PROF_FUNC_PARAM end */
@@ -174,13 +184,15 @@ VALUE_PROF_FUNC_PARAM(uint32_t, CounterIndex, Type::getInt32Ty(Ctx))
  * name hash and the function address.
  */
 VALUE_PROF_KIND(IPVK_IndirectCallTarget, 0)
+/* For memory intrinsic functions size profiling. */
+VALUE_PROF_KIND(IPVK_MemOPSize, 1)
 /* These two kinds must be the last to be
  * declared. This is to make sure the string
  * array created with the template can be
  * indexed with the kind value.
  */
 VALUE_PROF_KIND(IPVK_First, IPVK_IndirectCallTarget)
-VALUE_PROF_KIND(IPVK_Last, IPVK_IndirectCallTarget)
+VALUE_PROF_KIND(IPVK_Last, IPVK_MemOPSize)
 
 #undef VALUE_PROF_KIND
 /* VALUE_PROF_KIND end */
@@ -649,6 +661,9 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure,
 #define INSTR_PROF_VALUE_PROF_FUNC __llvm_profile_instrument_target
 #define INSTR_PROF_VALUE_PROF_FUNC_STR \
         INSTR_PROF_QUOTE(INSTR_PROF_VALUE_PROF_FUNC)
+#define INSTR_PROF_VALUE_RANGE_PROF_FUNC __llvm_profile_instrument_range
+#define INSTR_PROF_VALUE_RANGE_PROF_FUNC_STR \
+        INSTR_PROF_QUOTE(INSTR_PROF_VALUE_RANGE_PROF_FUNC)
 
 /* InstrProfile per-function control data alignment.  */
 #define INSTR_PROF_DATA_ALIGNMENT 8
diff --git a/include/llvm/ProfileData/InstrProfReader.h b/include/llvm/ProfileData/InstrProfReader.h
index 65b11f61d10bc36bb87d07533f89102d76597283..1d85a7149afc80497e39f2e7becb804414babf52 100644
--- a/include/llvm/ProfileData/InstrProfReader.h
+++ b/include/llvm/ProfileData/InstrProfReader.h
@@ -1,4 +1,4 @@
-//=-- InstrProfReader.h - Instrumented profiling readers ----------*- C++ -*-=//
+//===- InstrProfReader.h - Instrumented profiling readers -------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -16,14 +16,23 @@
 #define LLVM_PROFILEDATA_INSTRPROFREADER_H
 
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/ProfileSummary.h"
 #include "llvm/ProfileData/InstrProf.h"
-#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/LineIterator.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/OnDiskHashTable.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SwapByteOrder.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
 #include <iterator>
+#include <memory>
+#include <utility>
+#include <vector>
 
 namespace llvm {
 
@@ -32,12 +41,13 @@ class InstrProfReader;
 /// A file format agnostic iterator over profiling data.
 class InstrProfIterator : public std::iterator<std::input_iterator_tag,
                                                InstrProfRecord> {
-  InstrProfReader *Reader;
+  InstrProfReader *Reader = nullptr;
   InstrProfRecord Record;
 
   void Increment();
+
 public:
-  InstrProfIterator() : Reader(nullptr) {}
+  InstrProfIterator() = default;
   InstrProfIterator(InstrProfReader *Reader) : Reader(Reader) { Increment(); }
 
   InstrProfIterator &operator++() { Increment(); return *this; }
@@ -50,19 +60,22 @@ public:
 /// Base class and interface for reading profiling data of any known instrprof
 /// format. Provides an iterator over InstrProfRecords.
 class InstrProfReader {
-  instrprof_error LastError;
+  instrprof_error LastError = instrprof_error::success;
 
 public:
-  InstrProfReader() : LastError(instrprof_error::success), Symtab() {}
-  virtual ~InstrProfReader() {}
+  InstrProfReader() = default;
+  virtual ~InstrProfReader() = default;
 
   /// Read the header.  Required before reading first record.
   virtual Error readHeader() = 0;
+
   /// Read a single record.
   virtual Error readNextRecord(InstrProfRecord &Record) = 0;
+
   /// Iterator over profile data.
   InstrProfIterator begin() { return InstrProfIterator(this); }
   InstrProfIterator end() { return InstrProfIterator(); }
+
   virtual bool isIRLevelProfile() const = 0;
 
   /// Return the PGO symtab. There are three different readers:
@@ -86,6 +99,7 @@ protected:
       return Error::success();
     return make_error<InstrProfError>(Err);
   }
+
   Error error(Error E) { return error(InstrProfError::take(std::move(E))); }
 
   /// Clear the current error and return a successful one.
@@ -94,8 +108,10 @@ protected:
 public:
   /// Return true if the reader has finished reading the profile data.
   bool isEOF() { return LastError == instrprof_error::eof; }
+
   /// Return true if the reader encountered an error reading profiling data.
   bool hasError() { return LastError != instrprof_error::success && !isEOF(); }
+
   /// Get the current error.
   Error getError() {
     if (hasError())
@@ -125,16 +141,15 @@ private:
   std::unique_ptr<MemoryBuffer> DataBuffer;
   /// Iterator over the profile data.
   line_iterator Line;
-  bool IsIRLevelProfile;
+  bool IsIRLevelProfile = false;
 
-  TextInstrProfReader(const TextInstrProfReader &) = delete;
-  TextInstrProfReader &operator=(const TextInstrProfReader &) = delete;
   Error readValueProfileData(InstrProfRecord &Record);
 
 public:
   TextInstrProfReader(std::unique_ptr<MemoryBuffer> DataBuffer_)
-      : DataBuffer(std::move(DataBuffer_)), Line(*DataBuffer, true, '#'),
-        IsIRLevelProfile(false) {}
+      : DataBuffer(std::move(DataBuffer_)), Line(*DataBuffer, true, '#') {}
+  TextInstrProfReader(const TextInstrProfReader &) = delete;
+  TextInstrProfReader &operator=(const TextInstrProfReader &) = delete;
 
   /// Return true if the given buffer is in text instrprof format.
   static bool hasFormat(const MemoryBuffer &Buffer);
@@ -143,6 +158,7 @@ public:
 
   /// Read the header.
   Error readHeader() override;
+
   /// Read a single record.
   Error readNextRecord(InstrProfRecord &Record) override;
 
@@ -184,15 +200,16 @@ private:
 
   InstrProfRecord::ValueMapType FunctionPtrToNameMap;
 
-  RawInstrProfReader(const RawInstrProfReader &) = delete;
-  RawInstrProfReader &operator=(const RawInstrProfReader &) = delete;
 public:
   RawInstrProfReader(std::unique_ptr<MemoryBuffer> DataBuffer)
       : DataBuffer(std::move(DataBuffer)) { }
+  RawInstrProfReader(const RawInstrProfReader &) = delete;
+  RawInstrProfReader &operator=(const RawInstrProfReader &) = delete;
 
   static bool hasFormat(const MemoryBuffer &DataBuffer);
   Error readHeader() override;
   Error readNextRecord(InstrProfRecord &Record) override;
+
   bool isIRLevelProfile() const override {
     return (Version & VARIANT_MASK_IR_PROF) != 0;
   }
@@ -206,9 +223,11 @@ private:
   Error createSymtab(InstrProfSymtab &Symtab);
   Error readNextHeader(const char *CurrentPos);
   Error readHeader(const RawInstrProf::Header &Header);
+
   template <class IntT> IntT swap(IntT Int) const {
     return ShouldSwapBytes ? sys::getSwappedBytes(Int) : Int;
   }
+
   support::endianness getDataEndianness() const {
     support::endianness HostEndian = getHostEndianness();
     if (!ShouldSwapBytes)
@@ -222,15 +241,18 @@ private:
   inline uint8_t getNumPaddingBytes(uint64_t SizeInBytes) {
     return 7 & (sizeof(uint64_t) - SizeInBytes % sizeof(uint64_t));
   }
+
   Error readName(InstrProfRecord &Record);
   Error readFuncHash(InstrProfRecord &Record);
   Error readRawCounts(InstrProfRecord &Record);
   Error readValueProfilingData(InstrProfRecord &Record);
   bool atEnd() const { return Data == DataEnd; }
+
   void advanceData() {
     Data++;
     ValueDataStart += CurValueDataSize;
   }
+
   const char *getNextHeaderPos() const {
       assert(atEnd());
       return (const char *)ValueDataStart;
@@ -240,6 +262,7 @@ private:
     ptrdiff_t Offset = (swap(CounterPtr) - CountersDelta) / sizeof(uint64_t);
     return CountersStart + Offset;
   }
+
   StringRef getName(uint64_t NameRef) const {
     return Symtab->getFuncName(swap(NameRef));
   }
@@ -249,8 +272,10 @@ typedef RawInstrProfReader<uint32_t> RawInstrProfReader32;
 typedef RawInstrProfReader<uint64_t> RawInstrProfReader64;
 
 namespace IndexedInstrProf {
+
 enum class HashT : uint32_t;
-}
+
+} // end namespace IndexedInstrProf
 
 /// Trait for lookups into the on-disk hash table for the binary instrprof
 /// format.
@@ -261,12 +286,11 @@ class InstrProfLookupTrait {
   // Endianness of the input value profile data.
   // It should be LE by default, but can be changed
   // for testing purpose.
-  support::endianness ValueProfDataEndianness;
+  support::endianness ValueProfDataEndianness = support::little;
 
 public:
   InstrProfLookupTrait(IndexedInstrProf::HashT HashType, unsigned FormatVersion)
-      : HashType(HashType), FormatVersion(FormatVersion),
-        ValueProfDataEndianness(support::little) {}
+      : HashType(HashType), FormatVersion(FormatVersion) {}
 
   typedef ArrayRef<InstrProfRecord> data_type;
 
@@ -284,6 +308,7 @@ public:
   static std::pair<offset_type, offset_type>
   ReadKeyDataLength(const unsigned char *&D) {
     using namespace support;
+
     offset_type KeyLen = endian::readNext<offset_type, little, unaligned>(D);
     offset_type DataLen = endian::readNext<offset_type, little, unaligned>(D);
     return std::make_pair(KeyLen, DataLen);
@@ -304,16 +329,18 @@ public:
 };
 
 struct InstrProfReaderIndexBase {
+  virtual ~InstrProfReaderIndexBase() = default;
+
   // Read all the profile records with the same key pointed to the current
   // iterator.
   virtual Error getRecords(ArrayRef<InstrProfRecord> &Data) = 0;
+
   // Read all the profile records with the key equal to FuncName
   virtual Error getRecords(StringRef FuncName,
                                      ArrayRef<InstrProfRecord> &Data) = 0;
   virtual void advanceToNextKey() = 0;
   virtual bool atEnd() const = 0;
   virtual void setValueProfDataEndianness(support::endianness Endianness) = 0;
-  virtual ~InstrProfReaderIndexBase() {}
   virtual uint64_t getVersion() const = 0;
   virtual bool isIRLevelProfile() const = 0;
   virtual void populateSymtab(InstrProfSymtab &) = 0;
@@ -335,22 +362,27 @@ public:
                        const unsigned char *const Payload,
                        const unsigned char *const Base,
                        IndexedInstrProf::HashT HashType, uint64_t Version);
+  ~InstrProfReaderIndex() override = default;
 
   Error getRecords(ArrayRef<InstrProfRecord> &Data) override;
   Error getRecords(StringRef FuncName,
                    ArrayRef<InstrProfRecord> &Data) override;
   void advanceToNextKey() override { RecordIterator++; }
+
   bool atEnd() const override {
     return RecordIterator == HashTable->data_end();
   }
+
   void setValueProfDataEndianness(support::endianness Endianness) override {
     HashTable->getInfoObj().setValueProfDataEndianness(Endianness);
   }
-  ~InstrProfReaderIndex() override {}
+
   uint64_t getVersion() const override { return GET_VERSION(FormatVersion); }
+
   bool isIRLevelProfile() const override {
     return (FormatVersion & VARIANT_MASK_IR_PROF) != 0;
   }
+
   void populateSymtab(InstrProfSymtab &Symtab) override {
     Symtab.create(HashTable->keys());
   }
@@ -366,20 +398,20 @@ private:
   /// Profile summary data.
   std::unique_ptr<ProfileSummary> Summary;
 
-  IndexedInstrProfReader(const IndexedInstrProfReader &) = delete;
-  IndexedInstrProfReader &operator=(const IndexedInstrProfReader &) = delete;
-
   // Read the profile summary. Return a pointer pointing to one byte past the
   // end of the summary data if it exists or the input \c Cur.
   const unsigned char *readSummary(IndexedInstrProf::ProfVersion Version,
                                    const unsigned char *Cur);
 
 public:
+  IndexedInstrProfReader(std::unique_ptr<MemoryBuffer> DataBuffer)
+      : DataBuffer(std::move(DataBuffer)) {}
+  IndexedInstrProfReader(const IndexedInstrProfReader &) = delete;
+  IndexedInstrProfReader &operator=(const IndexedInstrProfReader &) = delete;
+
   /// Return the profile version.
   uint64_t getVersion() const { return Index->getVersion(); }
   bool isIRLevelProfile() const override { return Index->isIRLevelProfile(); }
-  IndexedInstrProfReader(std::unique_ptr<MemoryBuffer> DataBuffer)
-      : DataBuffer(std::move(DataBuffer)), Index(nullptr) {}
 
   /// Return true if the given buffer is in an indexed instrprof format.
   static bool hasFormat(const MemoryBuffer &DataBuffer);
@@ -422,4 +454,4 @@ public:
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_PROFILEDATA_INSTRPROFREADER_H
diff --git a/include/llvm/ProfileData/InstrProfWriter.h b/include/llvm/ProfileData/InstrProfWriter.h
index f7780fb45004849e52d9444aded834f5cb295421..10742c0228ebe82983793fd7bce03fcf52378a15 100644
--- a/include/llvm/ProfileData/InstrProfWriter.h
+++ b/include/llvm/ProfileData/InstrProfWriter.h
@@ -1,4 +1,4 @@
-//=-- InstrProfWriter.h - Instrumented profiling writer -----------*- C++ -*-=//
+//===- InstrProfWriter.h - Instrumented profiling writer --------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -16,16 +16,19 @@
 #define LLVM_PROFILEDATA_INSTRPROFWRITER_H
 
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ProfileData/InstrProf.h"
-#include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
+#include <memory>
 
 namespace llvm {
 
 /// Writer for instrumentation based profile data.
-class ProfOStream;
 class InstrProfRecordWriterTrait;
+class ProfOStream;
 
 class InstrProfWriter {
 public:
@@ -35,7 +38,7 @@ public:
 private:
   bool Sparse;
   StringMap<ProfilingData> FunctionData;
-  ProfKind ProfileKind;
+  ProfKind ProfileKind = PF_Unknown;
   // Use raw pointer here for the incomplete type object.
   InstrProfRecordWriterTrait *InfoObj;
 
@@ -47,15 +50,20 @@ public:
   /// for this function and the hash and number of counts match, each counter is
   /// summed. Optionally scale counts by \p Weight.
   Error addRecord(InstrProfRecord &&I, uint64_t Weight = 1);
+
   /// Merge existing function counts from the given writer.
   Error mergeRecordsFromWriter(InstrProfWriter &&IPW);
+
   /// Write the profile to \c OS
   void write(raw_fd_ostream &OS);
+
   /// Write the profile in text format to \c OS
   void writeText(raw_fd_ostream &OS);
+
   /// Write \c Record in text format to \c OS
   static void writeRecordInText(const InstrProfRecord &Record,
                                 InstrProfSymtab &Symtab, raw_fd_ostream &OS);
+
   /// Write the profile, returning the raw data. For testing.
   std::unique_ptr<MemoryBuffer> writeBuffer();
 
@@ -82,4 +90,4 @@ private:
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_PROFILEDATA_INSTRPROFWRITER_H
diff --git a/include/llvm/ProfileData/ProfileCommon.h b/include/llvm/ProfileData/ProfileCommon.h
index e955755e5c9a54de170851600e7ea7d7d18da05c..987e3160ccae26fa158c01f8a2286187828530c3 100644
--- a/include/llvm/ProfileData/ProfileCommon.h
+++ b/include/llvm/ProfileData/ProfileCommon.h
@@ -1,4 +1,4 @@
-//===-- ProfileCommon.h - Common profiling APIs. ----------------*- C++ -*-===//
+//===- ProfileCommon.h - Common profiling APIs. -----------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,38 +12,33 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_PROFILEDATA_PROFILE_COMMON_H
-#define LLVM_PROFILEDATA_PROFILE_COMMON_H
+#ifndef LLVM_PROFILEDATA_PROFILECOMMON_H
+#define LLVM_PROFILEDATA_PROFILECOMMON_H
 
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/IR/ProfileSummary.h"
+#include "llvm/Support/Error.h"
+#include <algorithm>
 #include <cstdint>
 #include <functional>
 #include <map>
-#include <utility>
+#include <memory>
 #include <vector>
 
-#include "llvm/IR/ProfileSummary.h"
-#include "llvm/Support/Error.h"
-#include "llvm/ADT/ArrayRef.h"
-
 namespace llvm {
-class Function;
-namespace IndexedInstrProf {
-struct Summary;
-}
+
+struct InstrProfRecord;
+
 namespace sampleprof {
+
 class FunctionSamples;
-}
-struct InstrProfRecord;
-class LLVMContext;
-class Metadata;
-class MDTuple;
-class MDNode;
+
+} // end namespace sampleprof
 
 inline const char *getHotSectionPrefix() { return ".hot"; }
 inline const char *getUnlikelySectionPrefix() { return ".unlikely"; }
 
 class ProfileSummaryBuilder {
-
 private:
   /// We keep track of the number of times a count (block count or samples)
   /// appears in the profile. The map is kept sorted in the descending order of
@@ -53,13 +48,18 @@ private:
 
 protected:
   SummaryEntryVector DetailedSummary;
+  uint64_t TotalCount = 0;
+  uint64_t MaxCount = 0;
+  uint64_t MaxFunctionCount = 0;
+  uint32_t NumCounts = 0;
+  uint32_t NumFunctions = 0;
+
   ProfileSummaryBuilder(std::vector<uint32_t> Cutoffs)
       : DetailedSummaryCutoffs(std::move(Cutoffs)) {}
-  inline void addCount(uint64_t Count);
   ~ProfileSummaryBuilder() = default;
+
+  inline void addCount(uint64_t Count);
   void computeDetailedSummary();
-  uint64_t TotalCount = 0, MaxCount = 0, MaxFunctionCount = 0;
-  uint32_t NumCounts = 0, NumFunctions = 0;
 
 public:
   /// \brief A vector of useful cutoff values for detailed summary.
@@ -68,22 +68,24 @@ public:
 
 class InstrProfSummaryBuilder final : public ProfileSummaryBuilder {
   uint64_t MaxInternalBlockCount = 0;
+
   inline void addEntryCount(uint64_t Count);
   inline void addInternalCount(uint64_t Count);
 
 public:
   InstrProfSummaryBuilder(std::vector<uint32_t> Cutoffs)
       : ProfileSummaryBuilder(std::move(Cutoffs)) {}
+
   void addRecord(const InstrProfRecord &);
   std::unique_ptr<ProfileSummary> getSummary();
 };
 
 class SampleProfileSummaryBuilder final : public ProfileSummaryBuilder {
-
 public:
-  void addRecord(const sampleprof::FunctionSamples &FS);
   SampleProfileSummaryBuilder(std::vector<uint32_t> Cutoffs)
       : ProfileSummaryBuilder(std::move(Cutoffs)) {}
+
+  void addRecord(const sampleprof::FunctionSamples &FS);
   std::unique_ptr<ProfileSummary> getSummary();
 };
 
@@ -96,6 +98,6 @@ void ProfileSummaryBuilder::addCount(uint64_t Count) {
   CountFrequencies[Count]++;
 }
 
-
 } // end namespace llvm
-#endif
+
+#endif // LLVM_PROFILEDATA_PROFILECOMMON_H
diff --git a/include/llvm/ProfileData/SampleProf.h b/include/llvm/ProfileData/SampleProf.h
index b286df34957585e2871c0531f278e1d0316bf6ad..a35ae4f92bd140996e0308fce5e924e4ef40c8ec 100644
--- a/include/llvm/ProfileData/SampleProf.h
+++ b/include/llvm/ProfileData/SampleProf.h
@@ -1,4 +1,4 @@
-//=-- SampleProf.h - Sampling profiling format support --------------------===//
+//===- SampleProf.h - Sampling profiling format support ---------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,20 +12,30 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_PROFILEDATA_SAMPLEPROF_H_
-#define LLVM_PROFILEDATA_SAMPLEPROF_H_
+#ifndef LLVM_PROFILEDATA_SAMPLEPROF_H
+#define LLVM_PROFILEDATA_SAMPLEPROF_H
 
+#include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Module.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/raw_ostream.h"
-
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cstdint>
 #include <map>
+#include <string>
 #include <system_error>
+#include <utility>
 
 namespace llvm {
 
+class raw_ostream;
+
 const std::error_category &sampleprof_category();
 
 enum class sampleprof_error {
@@ -59,12 +69,13 @@ inline sampleprof_error MergeResult(sampleprof_error &Accumulator,
 } // end namespace llvm
 
 namespace std {
+
 template <>
 struct is_error_code_enum<llvm::sampleprof_error> : std::true_type {};
-}
 
-namespace llvm {
+} // end namespace std
 
+namespace llvm {
 namespace sampleprof {
 
 static inline uint64_t SPMagic() {
@@ -87,8 +98,10 @@ static inline uint64_t SPVersion() { return 103; }
 /// (e.g., the two post-increment instructions in "if (p) x++; else y++;").
 struct LineLocation {
   LineLocation(uint32_t L, uint32_t D) : LineOffset(L), Discriminator(D) {}
+
   void print(raw_ostream &OS) const;
   void dump() const;
+
   bool operator<(const LineLocation &O) const {
     return LineOffset < O.LineOffset ||
            (LineOffset == O.LineOffset && Discriminator < O.Discriminator);
@@ -114,7 +127,7 @@ class SampleRecord {
 public:
   typedef StringMap<uint64_t> CallTargetMap;
 
-  SampleRecord() : NumSamples(0), CallTargets() {}
+  SampleRecord() = default;
 
   /// Increment the number of samples for this record by \p S.
   /// Optionally scale sample count \p S by \p Weight.
@@ -144,7 +157,7 @@ public:
   }
 
   /// Return true if this sample record contains function calls.
-  bool hasCalls() const { return CallTargets.size() > 0; }
+  bool hasCalls() const { return !CallTargets.empty(); }
 
   uint64_t getSamples() const { return NumSamples; }
   const CallTargetMap &getCallTargets() const { return CallTargets; }
@@ -163,7 +176,7 @@ public:
   void dump() const;
 
 private:
-  uint64_t NumSamples;
+  uint64_t NumSamples = 0;
   CallTargetMap CallTargets;
 };
 
@@ -180,9 +193,11 @@ typedef std::map<LineLocation, FunctionSamples> CallsiteSampleMap;
 /// within the body of the function.
 class FunctionSamples {
 public:
-  FunctionSamples() : Name(), TotalSamples(0), TotalHeadSamples(0) {}
+  FunctionSamples() = default;
+
   void print(raw_ostream &OS = dbgs(), unsigned Indent = 0) const;
   void dump() const;
+
   sampleprof_error addTotalSamples(uint64_t Num, uint64_t Weight = 1) {
     bool Overflowed;
     TotalSamples =
@@ -190,6 +205,7 @@ public:
     return Overflowed ? sampleprof_error::counter_overflow
                       : sampleprof_error::success;
   }
+
   sampleprof_error addHeadSamples(uint64_t Num, uint64_t Weight = 1) {
     bool Overflowed;
     TotalHeadSamples =
@@ -197,11 +213,13 @@ public:
     return Overflowed ? sampleprof_error::counter_overflow
                       : sampleprof_error::success;
   }
+
   sampleprof_error addBodySamples(uint32_t LineOffset, uint32_t Discriminator,
                                   uint64_t Num, uint64_t Weight = 1) {
     return BodySamples[LineLocation(LineOffset, Discriminator)].addSamples(
         Num, Weight);
   }
+
   sampleprof_error addCalledTargetSamples(uint32_t LineOffset,
                                           uint32_t Discriminator,
                                           const std::string &FName,
@@ -222,21 +240,6 @@ public:
       return ret->second.getSamples();
   }
 
-  /// Return the total number of call target samples collected at a given
-  /// location. Each location is specified by \p LineOffset and
-  /// \p Discriminator. If the location is not found in profile, return error.
-  ErrorOr<uint64_t> findCallSamplesAt(uint32_t LineOffset,
-                                      uint32_t Discriminator) const {
-    const auto &ret = BodySamples.find(LineLocation(LineOffset, Discriminator));
-    if (ret == BodySamples.end())
-      return std::error_code();
-    uint64_t T = 0;
-    for (const auto &t_c : ret->second.getCallTargets()) {
-      T += t_c.second;
-    }
-    return T;
-  }
-
   /// Returns the call target map collected at a given location.
   /// Each location is specified by \p LineOffset and \p Discriminator.
   /// If the location is not found in profile, return error.
@@ -300,6 +303,20 @@ public:
     return Result;
   }
 
+  /// Recursively walk this profile and its callsite profiles, stopping at
+  /// any whose total samples do not exceed \p Threshold. Add the name GUID to
+  /// \p S when \p M lacks the function or the function has no subprogram.
+  void findImportedFunctions(DenseSet<GlobalValue::GUID> &S, const Module *M,
+                             uint64_t Threshold) const {
+    if (TotalSamples <= Threshold)
+      return;
+    Function *F = M->getFunction(Name);
+    if (!F || !F->getSubprogram())
+      S.insert(Function::getGUID(Name));
+    for (const auto &CS : CallsiteSamples)
+      CS.second.findImportedFunctions(S, M, Threshold);
+  }
+
   /// Set the name of the function.
   void setName(StringRef FunctionName) { Name = FunctionName; }
 
@@ -314,12 +331,12 @@ private:
   ///
   /// Samples are cumulative, they include all the samples collected
   /// inside this function and all its inlined callees.
-  uint64_t TotalSamples;
+  uint64_t TotalSamples = 0;
 
   /// Total number of samples collected at the head of the function.
   /// This is an approximation of the number of calls made to this function
   /// at runtime.
-  uint64_t TotalHeadSamples;
+  uint64_t TotalHeadSamples = 0;
 
   /// Map instruction locations to collected samples.
   ///
@@ -366,6 +383,7 @@ public:
                        return A->first < B->first;
                      });
   }
+
   const SamplesWithLocList &get() const { return V; }
 
 private:
@@ -373,7 +391,6 @@ private:
 };
 
 } // end namespace sampleprof
-
 } // end namespace llvm
 
-#endif // LLVM_PROFILEDATA_SAMPLEPROF_H_
+#endif // LLVM_PROFILEDATA_SAMPLEPROF_H
diff --git a/include/llvm/ProfileData/SampleProfReader.h b/include/llvm/ProfileData/SampleProfReader.h
index bf86721709c7b1a785aafa84bf04b7a4ab28d92b..29e3aba3e0e76ad2610b77ba526d8eafca8c2243 100644
--- a/include/llvm/ProfileData/SampleProfReader.h
+++ b/include/llvm/ProfileData/SampleProfReader.h
@@ -1,4 +1,4 @@
-//===- SampleProfReader.h - Read LLVM sample profile data -----------------===//
+//===- SampleProfReader.h - Read LLVM sample profile data -------*- C++ -*-===//
 //
 //                      The LLVM Compiler Infrastructure
 //
@@ -205,26 +205,34 @@
 //        FUNCTION BODY
 //          A FUNCTION BODY entry describing the inlined function.
 //===----------------------------------------------------------------------===//
+
 #ifndef LLVM_PROFILEDATA_SAMPLEPROFREADER_H
 #define LLVM_PROFILEDATA_SAMPLEPROFREADER_H
 
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/IR/ProfileSummary.h"
 #include "llvm/ProfileData/SampleProf.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/GCOV.h"
 #include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <system_error>
+#include <vector>
 
 namespace llvm {
 
+class raw_ostream;
+
 namespace sampleprof {
 
 /// \brief Sample-based profile reader.
@@ -259,7 +267,7 @@ public:
   SampleProfileReader(std::unique_ptr<MemoryBuffer> B, LLVMContext &C)
       : Profiles(0), Ctx(C), Buffer(std::move(B)) {}
 
-  virtual ~SampleProfileReader() {}
+  virtual ~SampleProfileReader() = default;
 
   /// \brief Read and validate the file header.
   virtual std::error_code readHeader() = 0;
@@ -275,7 +283,12 @@ public:
 
   /// \brief Return the samples collected for function \p F.
   FunctionSamples *getSamplesFor(const Function &F) {
-    return &Profiles[F.getName()];
+    // The function name may have been updated by adding a suffix. Names in
+    // the sample profile are stripped, so match on the part of the name
+    // before the first '.'. Returns nullptr when there is no such profile.
+    StringRef BaseName = F.getName().split('.').first;
+    auto It = Profiles.find(BaseName);
+    return It == Profiles.end() ? nullptr : &It->second;
   }
 
   /// \brief Return all the profiles.
@@ -442,8 +455,8 @@ protected:
   static const uint32_t GCOVTagAFDOFunction = 0xac000000;
 };
 
-} // End namespace sampleprof
+} // end namespace sampleprof
 
-} // End namespace llvm
+} // end namespace llvm
 
 #endif // LLVM_PROFILEDATA_SAMPLEPROFREADER_H
diff --git a/include/llvm/ProfileData/SampleProfWriter.h b/include/llvm/ProfileData/SampleProfWriter.h
index f6f2e2702e31674f57a62777fc7a29a45c1d10b3..9d69af32dd46a52bd397d231021f6dafb72ff22f 100644
--- a/include/llvm/ProfileData/SampleProfWriter.h
+++ b/include/llvm/ProfileData/SampleProfWriter.h
@@ -1,4 +1,4 @@
-//===- SampleProfWriter.h - Write LLVM sample profile data ----------------===//
+//===- SampleProfWriter.h - Write LLVM sample profile data ------*- C++ -*-===//
 //
 //                      The LLVM Compiler Infrastructure
 //
@@ -14,15 +14,18 @@
 #define LLVM_PROFILEDATA_SAMPLEPROFWRITER_H
 
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/IR/ProfileSummary.h"
 #include "llvm/ProfileData/SampleProf.h"
 #include "llvm/Support/ErrorOr.h"
-#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <system_error>
 
 namespace llvm {
-
 namespace sampleprof {
 
 enum SampleProfileFormat { SPF_None = 0, SPF_Text, SPF_Binary, SPF_GCC };
@@ -30,7 +33,7 @@ enum SampleProfileFormat { SPF_None = 0, SPF_Text, SPF_Binary, SPF_GCC };
 /// \brief Sample-based profile writer. Base class.
 class SampleProfileWriter {
 public:
-  virtual ~SampleProfileWriter() {}
+  virtual ~SampleProfileWriter() = default;
 
   /// Write sample profiles in \p S.
   ///
@@ -114,7 +117,7 @@ public:
 
 protected:
   SampleProfileWriterBinary(std::unique_ptr<raw_ostream> &OS)
-      : SampleProfileWriter(OS), NameTable() {}
+      : SampleProfileWriter(OS) {}
 
   std::error_code
   writeHeader(const StringMap<FunctionSamples> &ProfileMap) override;
@@ -133,8 +136,7 @@ private:
                               SampleProfileFormat Format);
 };
 
-} // End namespace sampleprof
-
-} // End namespace llvm
+} // end namespace sampleprof
+} // end namespace llvm
 
 #endif // LLVM_PROFILEDATA_SAMPLEPROFWRITER_H
diff --git a/include/llvm/Support/AArch64TargetParser.def b/include/llvm/Support/AArch64TargetParser.def
index c4416f099de1ac0daeb4a752db488bd6dabbafbc..46d253bf0ec772a563e3d96d7a047424d238f194 100644
--- a/include/llvm/Support/AArch64TargetParser.def
+++ b/include/llvm/Support/AArch64TargetParser.def
@@ -73,8 +73,17 @@ AARCH64_CPU_NAME("falkor", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
                 (AArch64::AEK_SIMD | AArch64::AEK_CRC | AArch64::AEK_CRYPTO))
 AARCH64_CPU_NAME("kryo", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
                 (AArch64::AEK_SIMD | AArch64::AEK_CRC | AArch64::AEK_CRYPTO))
-AARCH64_CPU_NAME("vulcan", AK_ARMV8_1A, FK_CRYPTO_NEON_FP_ARMV8, false,
-                (AArch64::AEK_SIMD | AArch64::AEK_CRC | AArch64::AEK_CRYPTO))
+AARCH64_CPU_NAME("thunderx2t99", AK_ARMV8_1A, FK_CRYPTO_NEON_FP_ARMV8, false,
+                (AArch64::AEK_SIMD | AArch64::AEK_LSE | AArch64::AEK_CRC |
+                 AArch64::AEK_CRYPTO))
+AARCH64_CPU_NAME("thunderx", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
+                (AArch64::AEK_SIMD | AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP | AArch64::AEK_PROFILE))
+AARCH64_CPU_NAME("thunderxt88", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
+                (AArch64::AEK_SIMD | AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP | AArch64::AEK_PROFILE))
+AARCH64_CPU_NAME("thunderxt81", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
+                (AArch64::AEK_SIMD | AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP | AArch64::AEK_PROFILE))
+AARCH64_CPU_NAME("thunderxt83", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false,
+                (AArch64::AEK_SIMD | AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_FP | AArch64::AEK_PROFILE))
 // Invalid CPU
 AARCH64_CPU_NAME("invalid", AK_INVALID, FK_INVALID, true, AArch64::AEK_INVALID)
 #undef AARCH64_CPU_NAME
diff --git a/include/llvm/Support/ARMTargetParser.def b/include/llvm/Support/ARMTargetParser.def
index 58cb6381a9abaa49131305c951ad593641365023..18bf9af43226268b9c73e53958abe839d38d8394 100644
--- a/include/llvm/Support/ARMTargetParser.def
+++ b/include/llvm/Support/ARMTargetParser.def
@@ -76,6 +76,9 @@ ARM_ARCH("armv6-m", AK_ARMV6M, "6-M", "v6m", ARMBuildAttrs::CPUArch::v6_M,
           FK_NONE, ARM::AEK_NONE)
 ARM_ARCH("armv7-a", AK_ARMV7A, "7-A", "v7", ARMBuildAttrs::CPUArch::v7,
           FK_NEON, ARM::AEK_DSP)
+ARM_ARCH("armv7ve", AK_ARMV7VE, "7VE", "v7ve", ARMBuildAttrs::CPUArch::v7,
+          FK_NEON, (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT |
+          ARM::AEK_HWDIVARM | ARM::AEK_HWDIV | ARM::AEK_DSP))
 ARM_ARCH("armv7-r", AK_ARMV7R, "7-R", "v7r", ARMBuildAttrs::CPUArch::v7,
           FK_NONE, (ARM::AEK_HWDIV | ARM::AEK_DSP))
 ARM_ARCH("armv7-m", AK_ARMV7M, "7-M", "v7m", ARMBuildAttrs::CPUArch::v7,
@@ -229,6 +232,8 @@ ARM_CPU_NAME("sc300", AK_ARMV7M, FK_NONE, false, ARM::AEK_NONE)
 ARM_CPU_NAME("cortex-m3", AK_ARMV7M, FK_NONE, true, ARM::AEK_NONE)
 ARM_CPU_NAME("cortex-m4", AK_ARMV7EM, FK_FPV4_SP_D16, true, ARM::AEK_NONE)
 ARM_CPU_NAME("cortex-m7", AK_ARMV7EM, FK_FPV5_D16, false, ARM::AEK_NONE)
+ARM_CPU_NAME("cortex-m23", AK_ARMV8MBaseline, FK_NONE, false, ARM::AEK_NONE)
+ARM_CPU_NAME("cortex-m33", AK_ARMV8MMainline, FK_FPV5_SP_D16, false, ARM::AEK_DSP)
 ARM_CPU_NAME("cortex-a32", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC)
 ARM_CPU_NAME("cortex-a35", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC)
 ARM_CPU_NAME("cortex-a53", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, true, ARM::AEK_CRC)
@@ -239,6 +244,7 @@ ARM_CPU_NAME("cyclone", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC)
 ARM_CPU_NAME("exynos-m1", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC)
 ARM_CPU_NAME("exynos-m2", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC)
 ARM_CPU_NAME("exynos-m3", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC)
+ARM_CPU_NAME("kryo", AK_ARMV8A, FK_CRYPTO_NEON_FP_ARMV8, false, ARM::AEK_CRC)
 // Non-standard Arch names.
 ARM_CPU_NAME("iwmmxt", AK_IWMMXT, FK_NONE, true, ARM::AEK_NONE)
 ARM_CPU_NAME("xscale", AK_XSCALE, FK_NONE, true, ARM::AEK_NONE)
diff --git a/include/llvm/Support/Allocator.h b/include/llvm/Support/Allocator.h
index c71759abd7d211219c39a61f33d2c57ca96a71cf..a5e662f4c588a9b290749deef98494479fa05379 100644
--- a/include/llvm/Support/Allocator.h
+++ b/include/llvm/Support/Allocator.h
@@ -1,4 +1,4 @@
-//===--- Allocator.h - Simple memory allocation abstraction -----*- C++ -*-===//
+//===- Allocator.h - Simple memory allocation abstraction -------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -144,19 +144,18 @@ public:
                 "that objects larger than a slab go into their own memory "
                 "allocation.");
 
-  BumpPtrAllocatorImpl()
-      : CurPtr(nullptr), End(nullptr), BytesAllocated(0), Allocator() {}
+  BumpPtrAllocatorImpl() = default;
+
   template <typename T>
   BumpPtrAllocatorImpl(T &&Allocator)
-      : CurPtr(nullptr), End(nullptr), BytesAllocated(0),
-        Allocator(std::forward<T &&>(Allocator)) {}
+      : Allocator(std::forward<T &&>(Allocator)) {}
 
   // Manually implement a move constructor as we must clear the old allocator's
   // slabs as a matter of correctness.
   BumpPtrAllocatorImpl(BumpPtrAllocatorImpl &&Old)
       : CurPtr(Old.CurPtr), End(Old.End), Slabs(std::move(Old.Slabs)),
         CustomSizedSlabs(std::move(Old.CustomSizedSlabs)),
-        BytesAllocated(Old.BytesAllocated),
+        BytesAllocated(Old.BytesAllocated), RedZoneSize(Old.RedZoneSize),
         Allocator(std::move(Old.Allocator)) {
     Old.CurPtr = Old.End = nullptr;
     Old.BytesAllocated = 0;
@@ -176,6 +175,7 @@ public:
     CurPtr = RHS.CurPtr;
     End = RHS.End;
     BytesAllocated = RHS.BytesAllocated;
+    RedZoneSize = RHS.RedZoneSize;
     Slabs = std::move(RHS.Slabs);
     CustomSizedSlabs = std::move(RHS.CustomSizedSlabs);
     Allocator = std::move(RHS.Allocator);
@@ -218,10 +218,16 @@ public:
     size_t Adjustment = alignmentAdjustment(CurPtr, Alignment);
     assert(Adjustment + Size >= Size && "Adjustment + Size must not overflow");
 
+    size_t SizeToAllocate = Size;
+#if LLVM_ADDRESS_SANITIZER_BUILD
+    // Add trailing bytes as a "red zone" under ASan.
+    SizeToAllocate += RedZoneSize;
+#endif
+
     // Check if we have enough space.
-    if (Adjustment + Size <= size_t(End - CurPtr)) {
+    if (Adjustment + SizeToAllocate <= size_t(End - CurPtr)) {
       char *AlignedPtr = CurPtr + Adjustment;
-      CurPtr = AlignedPtr + Size;
+      CurPtr = AlignedPtr + SizeToAllocate;
       // Update the allocation point of this memory block in MemorySanitizer.
       // Without this, MemorySanitizer messages for values originated from here
       // will point to the allocation of the entire slab.
@@ -232,7 +238,7 @@ public:
     }
 
     // If Size is really big, allocate a separate slab for it.
-    size_t PaddedSize = Size + Alignment - 1;
+    size_t PaddedSize = SizeToAllocate + Alignment - 1;
     if (PaddedSize > SizeThreshold) {
       void *NewSlab = Allocator.Allocate(PaddedSize, 0);
       // We own the new slab and don't want anyone reading anyting other than
@@ -251,10 +257,10 @@ public:
     // Otherwise, start a new slab and try again.
     StartNewSlab();
     uintptr_t AlignedAddr = alignAddr(CurPtr, Alignment);
-    assert(AlignedAddr + Size <= (uintptr_t)End &&
+    assert(AlignedAddr + SizeToAllocate <= (uintptr_t)End &&
            "Unable to allocate memory!");
     char *AlignedPtr = (char*)AlignedAddr;
-    CurPtr = AlignedPtr + Size;
+    CurPtr = AlignedPtr + SizeToAllocate;
     __msan_allocated_memory(AlignedPtr, Size);
     __asan_unpoison_memory_region(AlignedPtr, Size);
     return AlignedPtr;
@@ -283,6 +289,10 @@ public:
 
   size_t getBytesAllocated() const { return BytesAllocated; }
 
+  void setRedZoneSize(size_t NewSize) {
+    RedZoneSize = NewSize;
+  }
+
   void PrintStats() const {
     detail::printBumpPtrAllocatorStats(Slabs.size(), BytesAllocated,
                                        getTotalMemory());
@@ -292,10 +302,10 @@ private:
   /// \brief The current pointer into the current slab.
   ///
   /// This points to the next free byte in the slab.
-  char *CurPtr;
+  char *CurPtr = nullptr;
 
   /// \brief The end of the current slab.
-  char *End;
+  char *End = nullptr;
 
   /// \brief The slabs allocated so far.
   SmallVector<void *, 4> Slabs;
@@ -306,7 +316,11 @@ private:
   /// \brief How many bytes we've allocated.
   ///
   /// Used so that we can compute how much space was wasted.
-  size_t BytesAllocated;
+  size_t BytesAllocated = 0;
+
+  /// \brief The number of bytes to put between allocations when running under
+  /// a sanitizer.
+  size_t RedZoneSize = 1;
 
   /// \brief The allocator instance we use to get slabs of memory.
   AllocatorT Allocator;
@@ -357,7 +371,7 @@ private:
 };
 
 /// \brief The standard BumpPtrAllocator which just uses the default template
-/// paramaters.
+/// parameters.
 typedef BumpPtrAllocatorImpl<> BumpPtrAllocator;
 
 /// \brief A BumpPtrAllocator that allows only elements of a specific type to be
@@ -369,7 +383,11 @@ template <typename T> class SpecificBumpPtrAllocator {
   BumpPtrAllocator Allocator;
 
 public:
-  SpecificBumpPtrAllocator() = default;
+  SpecificBumpPtrAllocator() {
+    // Because SpecificBumpPtrAllocator walks the memory to call destructors,
+    // it can't have red zones between allocations.
+    Allocator.setRedZoneSize(0);
+  }
   SpecificBumpPtrAllocator(SpecificBumpPtrAllocator &&Old)
       : Allocator(std::move(Old.Allocator)) {}
   ~SpecificBumpPtrAllocator() { DestroyAll(); }
diff --git a/include/llvm/Support/Atomic.h b/include/llvm/Support/Atomic.h
index d03714b009c513f8b093c2cdc152f74d5d1409bc..552313f0c2412bfebc895832ac06d8159f607497 100644
--- a/include/llvm/Support/Atomic.h
+++ b/include/llvm/Support/Atomic.h
@@ -20,6 +20,11 @@
 
 #include "llvm/Support/DataTypes.h"
 
+// Windows will at times define MemoryFence.
+#ifdef MemoryFence
+#undef MemoryFence
+#endif
+
 namespace llvm {
   namespace sys {
     void MemoryFence();
diff --git a/include/llvm/Support/BinaryByteStream.h b/include/llvm/Support/BinaryByteStream.h
new file mode 100644
index 0000000000000000000000000000000000000000..694be28e07e16c33d021c4db16f411dbd5ff4706
--- /dev/null
+++ b/include/llvm/Support/BinaryByteStream.h
@@ -0,0 +1,192 @@
+//===- BinaryByteStream.h ---------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//===----------------------------------------------------------------------===//
+// A BinaryStream which stores data in a single contiguous memory buffer.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_BINARYBYTESTREAM_H
+#define LLVM_SUPPORT_BINARYBYTESTREAM_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/BinaryStream.h"
+#include "llvm/Support/BinaryStreamError.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileOutputBuffer.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <algorithm>
+#include <cstdint>
+#include <cstring>
+#include <memory>
+
+namespace llvm {
+
+/// \brief An implementation of BinaryStream which holds its entire data set
+/// in a single contiguous buffer.  BinaryByteStream guarantees that no read
+/// operation will ever incur a copy.  Note that BinaryByteStream does not
+/// own the underlying buffer.
+class BinaryByteStream : public BinaryStream {
+public:
+  BinaryByteStream() = default;
+  BinaryByteStream(ArrayRef<uint8_t> Data, llvm::support::endianness Endian)
+      : Endian(Endian), Data(Data) {}
+  BinaryByteStream(StringRef Data, llvm::support::endianness Endian)
+      : Endian(Endian), Data(Data.bytes_begin(), Data.bytes_end()) {}
+
+  llvm::support::endianness getEndian() const override { return Endian; }
+
+  Error readBytes(uint32_t Offset, uint32_t Size,
+                  ArrayRef<uint8_t> &Buffer) override {
+    if (auto EC = checkOffset(Offset, Size))
+      return EC;
+    Buffer = Data.slice(Offset, Size);
+    return Error::success();
+  }
+
+  Error readLongestContiguousChunk(uint32_t Offset,
+                                   ArrayRef<uint8_t> &Buffer) override {
+    if (auto EC = checkOffset(Offset, 1))
+      return EC;
+    Buffer = Data.slice(Offset);
+    return Error::success();
+  }
+
+  uint32_t getLength() override { return Data.size(); }
+
+  ArrayRef<uint8_t> data() const { return Data; }
+
+  StringRef str() const {
+    const char *CharData = reinterpret_cast<const char *>(Data.data());
+    return StringRef(CharData, Data.size());
+  }
+
+protected:
+  llvm::support::endianness Endian;
+  ArrayRef<uint8_t> Data;
+};
+
+/// \brief An implementation of BinaryStream whose data is backed by an llvm
+/// MemoryBuffer object.  MemoryBufferByteStream owns the MemoryBuffer in
+/// question.  As with BinaryByteStream, reading from a MemoryBufferByteStream
+/// will never cause a copy.
+class MemoryBufferByteStream : public BinaryByteStream {
+public:
+  MemoryBufferByteStream(std::unique_ptr<MemoryBuffer> Buffer,
+                         llvm::support::endianness Endian)
+      : BinaryByteStream(Buffer->getBuffer(), Endian),
+        MemBuffer(std::move(Buffer)) {}
+
+  std::unique_ptr<MemoryBuffer> MemBuffer;
+};
+
+/// \brief An implementation of BinaryStream which holds its entire data set
+/// in a single contiguous buffer.  As with BinaryByteStream, the mutable
+/// version also guarantees that no read operation will ever incur a copy,
+/// and similarly it does not own the underlying buffer.
+class MutableBinaryByteStream : public WritableBinaryStream {
+public:
+  MutableBinaryByteStream() = default;
+  MutableBinaryByteStream(MutableArrayRef<uint8_t> Data,
+                          llvm::support::endianness Endian)
+      : Data(Data), ImmutableStream(Data, Endian) {}
+
+  llvm::support::endianness getEndian() const override {
+    return ImmutableStream.getEndian();
+  }
+
+  Error readBytes(uint32_t Offset, uint32_t Size,
+                  ArrayRef<uint8_t> &Buffer) override {
+    return ImmutableStream.readBytes(Offset, Size, Buffer);
+  }
+
+  Error readLongestContiguousChunk(uint32_t Offset,
+                                   ArrayRef<uint8_t> &Buffer) override {
+    return ImmutableStream.readLongestContiguousChunk(Offset, Buffer);
+  }
+
+  uint32_t getLength() override { return ImmutableStream.getLength(); }
+
+  Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Buffer) override {
+    if (Buffer.empty())
+      return Error::success();
+
+    if (auto EC = checkOffset(Offset, Buffer.size()))
+      return EC;
+
+    uint8_t *DataPtr = Data.data();
+    ::memcpy(DataPtr + Offset, Buffer.data(), Buffer.size());
+    return Error::success();
+  }
+
+  Error commit() override { return Error::success(); }
+
+  MutableArrayRef<uint8_t> data() const { return Data; }
+
+private:
+  MutableArrayRef<uint8_t> Data;
+  BinaryByteStream ImmutableStream;
+};
+
+/// \brief An implementation of WritableBinaryStream backed by an llvm
+/// FileOutputBuffer.
+class FileBufferByteStream : public WritableBinaryStream {
+private:
+  class StreamImpl : public MutableBinaryByteStream {
+  public:
+    StreamImpl(std::unique_ptr<FileOutputBuffer> Buffer,
+               llvm::support::endianness Endian)
+        : MutableBinaryByteStream(
+              MutableArrayRef<uint8_t>(Buffer->getBufferStart(),
+                                       Buffer->getBufferEnd()),
+              Endian),
+          FileBuffer(std::move(Buffer)) {}
+
+    Error commit() override {
+      if (FileBuffer->commit())
+        return make_error<BinaryStreamError>(
+            stream_error_code::filesystem_error);
+      return Error::success();
+    }
+
+  private:
+    std::unique_ptr<FileOutputBuffer> FileBuffer;
+  };
+
+public:
+  FileBufferByteStream(std::unique_ptr<FileOutputBuffer> Buffer,
+                       llvm::support::endianness Endian)
+      : Impl(std::move(Buffer), Endian) {}
+
+  llvm::support::endianness getEndian() const override {
+    return Impl.getEndian();
+  }
+
+  Error readBytes(uint32_t Offset, uint32_t Size,
+                  ArrayRef<uint8_t> &Buffer) override {
+    return Impl.readBytes(Offset, Size, Buffer);
+  }
+
+  Error readLongestContiguousChunk(uint32_t Offset,
+                                   ArrayRef<uint8_t> &Buffer) override {
+    return Impl.readLongestContiguousChunk(Offset, Buffer);
+  }
+
+  uint32_t getLength() override { return Impl.getLength(); }
+
+  Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Data) override {
+    return Impl.writeBytes(Offset, Data);
+  }
+
+  Error commit() override { return Impl.commit(); }
+
+private:
+  StreamImpl Impl;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_BINARYBYTESTREAM_H
diff --git a/include/llvm/DebugInfo/MSF/SequencedItemStream.h b/include/llvm/Support/BinaryItemStream.h
similarity index 55%
rename from include/llvm/DebugInfo/MSF/SequencedItemStream.h
rename to include/llvm/Support/BinaryItemStream.h
index 1949beef9fff69e93eef064c720ea8b424368bc6..f4b319217819ea8e46cfa0195fc433830e96ce69 100644
--- a/include/llvm/DebugInfo/MSF/SequencedItemStream.h
+++ b/include/llvm/Support/BinaryItemStream.h
@@ -1,4 +1,4 @@
-//===- SequencedItemStream.h ------------------------------------*- C++ -*-===//
+//===- BinaryItemStream.h ---------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,52 +7,54 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_MSF_SEQUENCEDITEMSTREAM_H
-#define LLVM_DEBUGINFO_MSF_SEQUENCEDITEMSTREAM_H
+#ifndef LLVM_SUPPORT_BINARYITEMSTREAM_H
+#define LLVM_SUPPORT_BINARYITEMSTREAM_H
 
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/DebugInfo/MSF/MSFError.h"
-#include "llvm/DebugInfo/MSF/StreamInterface.h"
+#include "llvm/Support/BinaryStream.h"
+#include "llvm/Support/BinaryStreamError.h"
 #include "llvm/Support/Error.h"
 #include <cstddef>
 #include <cstdint>
 
 namespace llvm {
-namespace msf {
 
-template <typename T> struct SequencedItemTraits {
+template <typename T> struct BinaryItemTraits {
   static size_t length(const T &Item) = delete;
   static ArrayRef<uint8_t> bytes(const T &Item) = delete;
 };
 
-/// SequencedItemStream represents a sequence of objects stored in a
-/// standard container but for which it is useful to view as a stream of
-/// contiguous bytes.  An example of this might be if you have a std::vector
-/// of TPI records, where each record contains a byte sequence that
-/// represents that one record serialized, but where each consecutive item
-/// might not be allocated immediately after the previous item.  Using a
-/// SequencedItemStream, we can adapt the VarStreamArray class to trivially
-/// extract one item at a time, allowing the data to be used anywhere a
-/// VarStreamArray could be used.
-template <typename T, typename Traits = SequencedItemTraits<T>>
-class SequencedItemStream : public ReadableStream {
+/// BinaryItemStream represents a sequence of objects stored in some kind of
+/// external container but for which it is useful to view as a stream of
+/// contiguous bytes.  An example of this might be if you have a collection of
+/// records and you serialize each one into a buffer, and store these serialized
+/// records in a container.  The pointers themselves are not laid out
+/// contiguously in memory, but we may wish to read from or write to these
+/// records as if they were.
+template <typename T, typename Traits = BinaryItemTraits<T>>
+class BinaryItemStream : public BinaryStream {
 public:
-  SequencedItemStream() = default;
+  explicit BinaryItemStream(llvm::support::endianness Endian)
+      : Endian(Endian) {}
+
+  llvm::support::endianness getEndian() const override { return Endian; }
 
   Error readBytes(uint32_t Offset, uint32_t Size,
-                  ArrayRef<uint8_t> &Buffer) const override {
+                  ArrayRef<uint8_t> &Buffer) override {
     auto ExpectedIndex = translateOffsetIndex(Offset);
     if (!ExpectedIndex)
       return ExpectedIndex.takeError();
     const auto &Item = Items[*ExpectedIndex];
+    if (auto EC = checkOffset(Offset, Size))
+      return EC;
     if (Size > Traits::length(Item))
-      return make_error<MSFError>(msf_error_code::insufficient_buffer);
+      return make_error<BinaryStreamError>(stream_error_code::stream_too_short);
     Buffer = Traits::bytes(Item).take_front(Size);
     return Error::success();
   }
 
   Error readLongestContiguousChunk(uint32_t Offset,
-                                   ArrayRef<uint8_t> &Buffer) const override {
+                                   ArrayRef<uint8_t> &Buffer) override {
     auto ExpectedIndex = translateOffsetIndex(Offset);
     if (!ExpectedIndex)
       return ExpectedIndex.takeError();
@@ -62,7 +64,7 @@ public:
 
   void setItems(ArrayRef<T> ItemArray) { Items = ItemArray; }
 
-  uint32_t getLength() const override {
+  uint32_t getLength() override {
     uint32_t Size = 0;
     for (const auto &Item : Items)
       Size += Traits::length(Item);
@@ -80,14 +82,14 @@ private:
       ++CurrentIndex;
     }
     if (CurrentOffset != Offset)
-      return make_error<MSFError>(msf_error_code::insufficient_buffer);
+      return make_error<BinaryStreamError>(stream_error_code::stream_too_short);
     return CurrentIndex;
   }
 
+  llvm::support::endianness Endian;
   ArrayRef<T> Items;
 };
 
-} // end namespace msf
 } // end namespace llvm
 
-#endif // LLVM_DEBUGINFO_MSF_SEQUENCEDITEMSTREAM_H
+#endif // LLVM_SUPPORT_BINARYITEMSTREAM_H
diff --git a/include/llvm/Support/BinaryStream.h b/include/llvm/Support/BinaryStream.h
new file mode 100644
index 0000000000000000000000000000000000000000..a227117e063e99157cf1f784b600b1baec329f56
--- /dev/null
+++ b/include/llvm/Support/BinaryStream.h
@@ -0,0 +1,78 @@
+//===- BinaryStream.h - Base interface for a stream of data -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_BINARYSTREAM_H
+#define LLVM_SUPPORT_BINARYSTREAM_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/BinaryStreamError.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include <cstdint>
+
+namespace llvm {
+
+/// \brief An interface for accessing data in a stream-like format, but which
+/// discourages copying.  Instead of specifying a buffer in which to copy
+/// data on a read, the API returns an ArrayRef to data owned by the stream's
+/// implementation.  Since implementations may not necessarily store data in a
+/// single contiguous buffer (or even in memory at all), in such cases it may
+/// be necessary for an implementation to cache such a buffer so that it can
+/// return it.
+class BinaryStream {
+public:
+  virtual ~BinaryStream() = default;
+
+  virtual llvm::support::endianness getEndian() const = 0;
+
+  /// \brief Given an offset into the stream and a number of bytes, attempt to
+  /// read the bytes and set the output ArrayRef to point to data owned by the
+  /// stream.
+  virtual Error readBytes(uint32_t Offset, uint32_t Size,
+                          ArrayRef<uint8_t> &Buffer) = 0;
+
+  /// \brief Given an offset into the stream, read as much as possible without
+  /// copying any data.
+  virtual Error readLongestContiguousChunk(uint32_t Offset,
+                                           ArrayRef<uint8_t> &Buffer) = 0;
+
+  /// \brief Return the number of bytes of data in this stream.
+  virtual uint32_t getLength() = 0;
+
+protected:
+  Error checkOffset(uint32_t Offset, uint32_t DataSize) {
+    if (Offset > getLength())
+      return make_error<BinaryStreamError>(stream_error_code::invalid_offset);
+    if (getLength() - Offset < DataSize) // no overflow: Offset <= getLength()
+      return make_error<BinaryStreamError>(stream_error_code::stream_too_short);
+    return Error::success();
+  }
+};
+
+/// \brief A BinaryStream which can be read from as well as written to.  Note
+/// that writing to a BinaryStream always necessitates copying from the input
+/// buffer to the stream's backing store.  Streams are assumed to be buffered
+/// so that to be portable it is necessary to call commit() on the stream when
+/// all data has been written.
+class WritableBinaryStream : public BinaryStream {
+public:
+  ~WritableBinaryStream() override = default;
+
+  /// \brief Attempt to write the given bytes into the stream at the desired
+  /// offset. This will always necessitate a copy.  Cannot shrink or grow the
+  /// stream, only writes into existing allocated space.
+  virtual Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Data) = 0;
+
+  /// \brief For buffered streams, commits changes to the backing store.
+  virtual Error commit() = 0;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_BINARYSTREAM_H
diff --git a/include/llvm/DebugInfo/MSF/StreamArray.h b/include/llvm/Support/BinaryStreamArray.h
similarity index 75%
rename from include/llvm/DebugInfo/MSF/StreamArray.h
rename to include/llvm/Support/BinaryStreamArray.h
index 5dfeb8c524af3c7d59c26ec815b9a8f341e404d6..3b1301d3cc0bdd604e44b364f25eb10473e4a0c5 100644
--- a/include/llvm/DebugInfo/MSF/StreamArray.h
+++ b/include/llvm/Support/BinaryStreamArray.h
@@ -1,4 +1,4 @@
-//===- StreamArray.h - Array backed by an arbitrary stream ------*- C++ -*-===//
+//===- BinaryStreamArray.h - Array backed by an arbitrary stream *- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,21 +7,30 @@
 //
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_DEBUGINFO_MSF_STREAMARRAY_H
-#define LLVM_DEBUGINFO_MSF_STREAMARRAY_H
+#ifndef LLVM_SUPPORT_BINARYSTREAMARRAY_H
+#define LLVM_SUPPORT_BINARYSTREAMARRAY_H
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/iterator.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
+#include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Error.h"
 #include <cassert>
 #include <cstdint>
 
+/// Lightweight arrays that are backed by an arbitrary BinaryStream.  This file
+/// provides two different array implementations.
+///
+///     VarStreamArray - Arrays of variable length records.  The user specifies
+///       an Extractor type that can extract a record from a given offset and
+///       return the number of bytes consumed by the record.
+///
+///     FixedStreamArray - Arrays of fixed length records.  This is similar in
+///       spirit to ArrayRef<T>, but since it is backed by a BinaryStream, the
+///       elements of the array need not be laid out in contiguous memory.
 namespace llvm {
-namespace msf {
 
 /// VarStreamArrayExtractor is intended to be specialized to provide customized
-/// extraction logic.  On input it receives a StreamRef pointing to the
+/// extraction logic.  On input it receives a BinaryStreamRef pointing to the
 /// beginning of the next record, but where the length of the record is not yet
 /// known.  Upon completion, it should return an appropriate Error instance if
 /// a record could not be extracted, or if one could be extracted it should
@@ -35,7 +44,7 @@ namespace msf {
 template <typename T> struct VarStreamArrayExtractor {
   // Method intentionally deleted.  You must provide an explicit specialization
   // with the following method implemented.
-  Error operator()(ReadableStreamRef Stream, uint32_t &Len,
+  Error operator()(BinaryStreamRef Stream, uint32_t &Len,
                    T &Item) const = delete;
 };
 
@@ -49,10 +58,10 @@ template <typename T> struct VarStreamArrayExtractor {
 /// abstracting this out, we need not duplicate this memory, and we can
 /// iterate over arrays in arbitrarily formatted streams.  Elements are parsed
 /// lazily on iteration, so there is no upfront cost associated with building
-/// a VarStreamArray, no matter how large it may be.
+/// or copying a VarStreamArray, no matter how large it may be.
 ///
 /// You create a VarStreamArray by specifying a ValueType and an Extractor type.
-/// If you do not specify an Extractor type, it expects you to specialize
+/// If you do not specify an Extractor type, you are expected to specialize
 /// VarStreamArrayExtractor<T> for your ValueType.
 ///
 /// By default an Extractor is default constructed in the class, but in some
@@ -86,8 +95,8 @@ public:
   VarStreamArray() = default;
   explicit VarStreamArray(const Extractor &E) : E(E) {}
 
-  explicit VarStreamArray(ReadableStreamRef Stream) : Stream(Stream) {}
-  VarStreamArray(ReadableStreamRef Stream, const Extractor &E)
+  explicit VarStreamArray(BinaryStreamRef Stream) : Stream(Stream) {}
+  VarStreamArray(BinaryStreamRef Stream, const Extractor &E)
       : Stream(Stream), E(E) {}
 
   VarStreamArray(const VarStreamArray<ValueType, Extractor> &Other)
@@ -101,10 +110,10 @@ public:
 
   const Extractor &getExtractor() const { return E; }
 
-  ReadableStreamRef getUnderlyingStream() const { return Stream; }
+  BinaryStreamRef getUnderlyingStream() const { return Stream; }
 
 private:
-  ReadableStreamRef Stream;
+  BinaryStreamRef Stream;
   Extractor E;
 };
 
@@ -153,23 +162,25 @@ public:
     return ThisValue;
   }
 
-  IterType &operator++() {
-    // We are done with the current record, discard it so that we are
-    // positioned at the next record.
-    IterRef = IterRef.drop_front(ThisLen);
-    if (IterRef.getLength() == 0) {
-      // There is nothing after the current record, we must make this an end
-      // iterator.
-      moveToEnd();
-    } else {
-      // There is some data after the current record.
-      auto EC = Extract(IterRef, ThisLen, ThisValue);
-      if (EC) {
-        consumeError(std::move(EC));
-        markError();
-      } else if (ThisLen == 0) {
-        // An empty record? Make this an end iterator.
+  IterType &operator+=(unsigned N) {
+    for (unsigned I = 0; I < N; ++I) {
+      // We are done with the current record, discard it so that we are
+      // positioned at the next record.
+      IterRef = IterRef.drop_front(ThisLen);
+      if (IterRef.getLength() == 0) {
+        // There is nothing after the current record, we must make this an end
+        // iterator.
         moveToEnd();
+      } else {
+        // There is some data after the current record.
+        auto EC = Extract(IterRef, ThisLen, ThisValue);
+        if (EC) {
+          consumeError(std::move(EC));
+          markError();
+        } else if (ThisLen == 0) {
+          // An empty record? Make this an end iterator.
+          moveToEnd();
+        }
       }
     }
     return *this;
@@ -188,7 +199,7 @@ private:
   }
 
   ValueType ThisValue;
-  ReadableStreamRef IterRef;
+  BinaryStreamRef IterRef;
   const ArrayType *Array{nullptr};
   uint32_t ThisLen{0};
   bool HasError{false};
@@ -198,12 +209,17 @@ private:
 
 template <typename T> class FixedStreamArrayIterator;
 
+/// FixedStreamArray is similar to VarStreamArray, except with each record
+/// having a fixed length.  As with VarStreamArray, there is no upfront
+/// cost associated with building or copying a FixedStreamArray, as the
+/// memory for each element is not read from the backing stream until that
+/// element is iterated.
 template <typename T> class FixedStreamArray {
   friend class FixedStreamArrayIterator<T>;
 
 public:
   FixedStreamArray() = default;
-  FixedStreamArray(ReadableStreamRef Stream) : Stream(Stream) {
+  explicit FixedStreamArray(BinaryStreamRef Stream) : Stream(Stream) {
     assert(Stream.getLength() % sizeof(T) == 0);
   }
 
@@ -227,6 +243,7 @@ public:
       // an exact multiple of the element size.
       consumeError(std::move(EC));
     }
+    assert(llvm::alignmentAdjustment(Data.data(), alignof(T)) == 0);
     return *reinterpret_cast<const T *>(Data.data());
   }
 
@@ -242,10 +259,10 @@ public:
     return FixedStreamArrayIterator<T>(*this, size());
   }
 
-  ReadableStreamRef getUnderlyingStream() const { return Stream; }
+  BinaryStreamRef getUnderlyingStream() const { return Stream; }
 
 private:
-  ReadableStreamRef Stream;
+  BinaryStreamRef Stream;
 };
 
 template <typename T>
@@ -298,7 +315,6 @@ private:
   uint32_t Index;
 };
 
-} // namespace msf
 } // namespace llvm
 
-#endif // LLVM_DEBUGINFO_MSF_STREAMARRAY_H
+#endif // LLVM_SUPPORT_BINARYSTREAMARRAY_H
diff --git a/include/llvm/Support/BinaryStreamError.h b/include/llvm/Support/BinaryStreamError.h
new file mode 100644
index 0000000000000000000000000000000000000000..7d9699d5363910a05e2bd200411a6ac4ead916bb
--- /dev/null
+++ b/include/llvm/Support/BinaryStreamError.h
@@ -0,0 +1,48 @@
+//===- BinaryStreamError.h - Error extensions for Binary Streams *- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_BINARYSTREAMERROR_H
+#define LLVM_SUPPORT_BINARYSTREAMERROR_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+
+#include <string>
+
+namespace llvm {
+enum class stream_error_code {
+  unspecified,
+  stream_too_short,
+  invalid_array_size,
+  invalid_offset,
+  filesystem_error
+};
+
+/// Base class for errors originating from BinaryStream operations
+class BinaryStreamError : public ErrorInfo<BinaryStreamError> {
+public:
+  static char ID;
+  explicit BinaryStreamError(stream_error_code C);
+  explicit BinaryStreamError(StringRef Context);
+  BinaryStreamError(stream_error_code C, StringRef Context);
+
+  void log(raw_ostream &OS) const override;
+  std::error_code convertToErrorCode() const override;
+
+  StringRef getErrorMessage() const;
+
+  stream_error_code getErrorCode() const { return Code; }
+
+private:
+  std::string ErrMsg;
+  stream_error_code Code;
+};
+} // namespace llvm
+
+#endif // LLVM_SUPPORT_BINARYSTREAMERROR_H
diff --git a/include/llvm/Support/BinaryStreamReader.h b/include/llvm/Support/BinaryStreamReader.h
new file mode 100644
index 0000000000000000000000000000000000000000..d994fa0f49d0bad4c9c869fe60386283b14e84e0
--- /dev/null
+++ b/include/llvm/Support/BinaryStreamReader.h
@@ -0,0 +1,234 @@
+//===- BinaryStreamReader.h - Reads objects from a binary stream *- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_BINARYSTREAMREADER_H
+#define LLVM_SUPPORT_BINARYSTREAMREADER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamRef.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/type_traits.h"
+
+#include <string>
+#include <type_traits>
+
+namespace llvm {
+
+/// \brief Provides read only access to a subclass of `BinaryStream`.  Provides
+/// bounds checking and helpers for reading certain common data types such as
+/// null-terminated strings, integers in various flavors of endianness, etc.
+/// Can be subclassed to provide reading of custom datatypes, although no
+/// methods are overridable.
+class BinaryStreamReader {
+public:
+  explicit BinaryStreamReader(BinaryStreamRef Stream);
+  virtual ~BinaryStreamReader() {}
+
+  /// Read as much as possible from the underlying stream at the current offset
+  /// without invoking a copy, and set \p Buffer to the resulting data slice.
+  /// Updates the stream's offset to point after the newly read data.
+  ///
+  /// \returns a success error code if the data was successfully read, otherwise
+  /// returns an appropriate error code.
+  Error readLongestContiguousChunk(ArrayRef<uint8_t> &Buffer);
+
+  /// Read \p Size bytes from the underlying stream at the current offset and
+  /// set \p Buffer to the resulting data slice.  Whether a copy occurs
+  /// depends on the implementation of the underlying stream.  Updates the
+  /// stream's offset to point after the newly read data.
+  ///
+  /// \returns a success error code if the data was successfully read, otherwise
+  /// returns an appropriate error code.
+  Error readBytes(ArrayRef<uint8_t> &Buffer, uint32_t Size);
+
+  /// Read an integer of the specified endianness into \p Dest and update the
+  /// stream's offset.  The data is always copied from the stream's underlying
+  /// buffer into \p Dest. Updates the stream's offset to point after the newly
+  /// read data.
+  ///
+  /// \returns a success error code if the data was successfully read, otherwise
+  /// returns an appropriate error code.
+  template <typename T> Error readInteger(T &Dest) {
+    static_assert(std::is_integral<T>::value,
+                  "Cannot call readInteger with non-integral value!");
+
+    ArrayRef<uint8_t> Bytes;
+    if (auto EC = readBytes(Bytes, sizeof(T)))
+      return EC;
+
+    Dest = llvm::support::endian::read<T, llvm::support::unaligned>(
+        Bytes.data(), Stream.getEndian());
+    return Error::success();
+  }
+
+  /// Similar to readInteger.
+  template <typename T> Error readEnum(T &Dest) {
+    static_assert(std::is_enum<T>::value,
+                  "Cannot call readEnum with non-enum value!");
+    typename std::underlying_type<T>::type N;
+    if (auto EC = readInteger(N))
+      return EC;
+    Dest = static_cast<T>(N);
+    return Error::success();
+  }
+
+  /// Read a null terminated string into \p Dest.  Whether a copy occurs depends
+  /// on the implementation of the underlying stream.  Updates the stream's
+  /// offset to point after the newly read data.
+  ///
+  /// \returns a success error code if the data was successfully read, otherwise
+  /// returns an appropriate error code.
+  Error readCString(StringRef &Dest);
+
+  /// Read a \p Length byte string into \p Dest.  Whether a copy occurs depends
+  /// on the implementation of the underlying stream.  Updates the stream's
+  /// offset to point after the newly read data.
+  ///
+  /// \returns a success error code if the data was successfully read, otherwise
+  /// returns an appropriate error code.
+  Error readFixedString(StringRef &Dest, uint32_t Length);
+
+  /// Read the entire remainder of the underlying stream into \p Ref.  This is
+  /// equivalent to calling getUnderlyingStream().slice(Offset).  Updates the
+  /// stream's offset to point to the end of the stream.  Never causes a copy.
+  ///
+  /// \returns a success error code if the data was successfully read, otherwise
+  /// returns an appropriate error code.
+  Error readStreamRef(BinaryStreamRef &Ref);
+
+  /// Read \p Length bytes from the underlying stream into \p Ref.  This is
+  /// equivalent to calling getUnderlyingStream().slice(Offset, Length).
+  /// Updates the stream's offset to point after the newly read object.  Never
+  /// causes a copy.
+  ///
+  /// \returns a success error code if the data was successfully read, otherwise
+  /// returns an appropriate error code.
+  Error readStreamRef(BinaryStreamRef &Ref, uint32_t Length);
+
+  /// Get a pointer to an object of type T from the underlying stream, as if by
+  /// memcpy, and store the result into \p Dest.  It is up to the caller to
+  /// ensure that objects of type T can be safely treated in this manner.
+  /// Updates the stream's offset to point after the newly read object.  Whether
+  /// a copy occurs depends upon the implementation of the underlying
+  /// stream.
+  ///
+  /// \returns a success error code if the data was successfully read, otherwise
+  /// returns an appropriate error code.
+  template <typename T> Error readObject(const T *&Dest) {
+    ArrayRef<uint8_t> Buffer;
+    if (auto EC = readBytes(Buffer, sizeof(T)))
+      return EC;
+    Dest = reinterpret_cast<const T *>(Buffer.data());
+    return Error::success();
+  }
+
+  /// Get a reference to a \p NumElements element array of objects of type T
+  /// from the underlying stream as if by memcpy, and store the resulting array
+  /// slice into \p array.  It is up to the caller to ensure that objects of
+  /// type T can be safely treated in this manner.  Updates the stream's offset
+  /// to point after the newly read object.  Whether a copy occurs depends upon
+  /// the implementation of the underlying stream.
+  ///
+  /// \returns a success error code if the data was successfully read, otherwise
+  /// returns an appropriate error code.
+  template <typename T>
+  Error readArray(ArrayRef<T> &Array, uint32_t NumElements) {
+    ArrayRef<uint8_t> Bytes;
+    if (NumElements == 0) {
+      Array = ArrayRef<T>();
+      return Error::success();
+    }
+
+    if (NumElements > UINT32_MAX / sizeof(T))
+      return make_error<BinaryStreamError>(
+          stream_error_code::invalid_array_size);
+
+    if (auto EC = readBytes(Bytes, NumElements * sizeof(T)))
+      return EC;
+
+    assert(alignmentAdjustment(Bytes.data(), alignof(T)) == 0 &&
+           "Reading at invalid alignment!");
+
+    Array = ArrayRef<T>(reinterpret_cast<const T *>(Bytes.data()), NumElements);
+    return Error::success();
+  }
+
+  /// Read a VarStreamArray of size \p Size bytes and store the result into
+  /// \p Array.  Updates the stream's offset to point after the newly read
+  /// array.  Never causes a copy (although iterating the elements of the
+  /// VarStreamArray may, depending upon the implementation of the underlying
+  /// stream).
+  ///
+  /// \returns a success error code if the data was successfully read, otherwise
+  /// returns an appropriate error code.
+  template <typename T, typename U>
+  Error readArray(VarStreamArray<T, U> &Array, uint32_t Size) {
+    BinaryStreamRef S;
+    if (auto EC = readStreamRef(S, Size))
+      return EC;
+    Array = VarStreamArray<T, U>(S, Array.getExtractor());
+    return Error::success();
+  }
+
+  /// Read a FixedStreamArray of \p NumItems elements and store the result into
+  /// \p Array.  Updates the stream's offset to point after the newly read
+  /// array.  Never causes a copy (although iterating the elements of the
+  /// FixedStreamArray may, depending upon the implementation of the underlying
+  /// stream).
+  ///
+  /// \returns a success error code if the data was successfully read, otherwise
+  /// returns an appropriate error code.
+  template <typename T>
+  Error readArray(FixedStreamArray<T> &Array, uint32_t NumItems) {
+    if (NumItems == 0) {
+      Array = FixedStreamArray<T>();
+      return Error::success();
+    }
+
+    if (NumItems > UINT32_MAX / sizeof(T))
+      return make_error<BinaryStreamError>(
+          stream_error_code::invalid_array_size);
+
+    BinaryStreamRef View;
+    if (auto EC = readStreamRef(View, NumItems * sizeof(T)))
+      return EC;
+
+    Array = FixedStreamArray<T>(View);
+    return Error::success();
+  }
+
+  bool empty() const { return bytesRemaining() == 0; }
+  void setOffset(uint32_t Off) { Offset = Off; }
+  uint32_t getOffset() const { return Offset; }
+  uint32_t getLength() const { return Stream.getLength(); }
+  uint32_t bytesRemaining() const { return getLength() - getOffset(); }
+
+  /// Advance the stream's offset by \p Amount bytes.
+  ///
+  /// \returns a success error code if at least \p Amount bytes remain in the
+  /// stream, otherwise returns an appropriate error code.
+  Error skip(uint32_t Amount);
+
+  /// Examine the next byte of the underlying stream without advancing the
+  /// stream's offset.  If the stream is empty the behavior is undefined.
+  ///
+  /// \returns the next byte in the stream.
+  uint8_t peek() const;
+
+private:
+  BinaryStreamRef Stream;
+  uint32_t Offset;
+};
+} // namespace llvm
+
+#endif // LLVM_SUPPORT_BINARYSTREAMREADER_H
diff --git a/include/llvm/Support/BinaryStreamRef.h b/include/llvm/Support/BinaryStreamRef.h
new file mode 100644
index 0000000000000000000000000000000000000000..23ce02fd7ca41f5f1f83051e79dfecd161724285
--- /dev/null
+++ b/include/llvm/Support/BinaryStreamRef.h
@@ -0,0 +1,174 @@
+//===- BinaryStreamRef.h - A copyable reference to a stream -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_BINARYSTREAMREF_H
+#define LLVM_SUPPORT_BINARYSTREAMREF_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/Support/BinaryStream.h"
+#include "llvm/Support/BinaryStreamError.h"
+#include "llvm/Support/Error.h"
+#include <algorithm>
+#include <cstdint>
+
+namespace llvm {
+
+/// Common stuff for mutable and immutable StreamRefs.
+template <class StreamType, class RefType> class BinaryStreamRefBase {
+public:
+  BinaryStreamRefBase() : Stream(nullptr), ViewOffset(0), Length(0) {}
+  BinaryStreamRefBase(StreamType &Stream, uint32_t Offset, uint32_t Length)
+      : Stream(&Stream), ViewOffset(Offset), Length(Length) {}
+
+  llvm::support::endianness getEndian() const { return Stream->getEndian(); }
+
+  uint32_t getLength() const { return Length; }
+  const StreamType *getStream() const { return Stream; }
+
+  /// Return a new BinaryStreamRef with the first \p N elements removed.
+  RefType drop_front(uint32_t N) const {
+    if (!Stream)
+      return RefType();
+
+    N = std::min(N, Length);
+    return RefType(*Stream, ViewOffset + N, Length - N);
+  }
+
+  /// Return a new BinaryStreamRef with only the first \p N elements remaining.
+  RefType keep_front(uint32_t N) const {
+    if (!Stream)
+      return RefType();
+    N = std::min(N, Length);
+    return RefType(*Stream, ViewOffset, N);
+  }
+
+  /// Return a new BinaryStreamRef with the first \p Offset elements removed,
+  /// and retaining exactly \p Len elements.
+  RefType slice(uint32_t Offset, uint32_t Len) const {
+    return drop_front(Offset).keep_front(Len);
+  }
+
+  bool operator==(const RefType &Other) const {
+    if (Stream != Other.Stream)
+      return false;
+    if (ViewOffset != Other.ViewOffset)
+      return false;
+    if (Length != Other.Length)
+      return false;
+    return true;
+  }
+
+protected:
+  Error checkOffset(uint32_t Offset, uint32_t DataSize) const {
+    if (Offset > getLength())
+      return make_error<BinaryStreamError>(stream_error_code::invalid_offset);
+    if (getLength() - Offset < DataSize) // no overflow: Offset <= getLength()
+      return make_error<BinaryStreamError>(stream_error_code::stream_too_short);
+    return Error::success();
+  }
+
+  StreamType *Stream;
+  uint32_t ViewOffset;
+  uint32_t Length;
+};
+
+/// \brief BinaryStreamRef is to BinaryStream what ArrayRef is to an Array.  It
+/// provides copy-semantics and read only access to a "window" of the underlying
+/// BinaryStream. Note that BinaryStreamRef is *not* a BinaryStream.  That is to
+/// say, it does not inherit and override the methods of BinaryStream.  In
+/// general, you should not pass around pointers or references to BinaryStreams
+/// and use inheritance to achieve polymorphism.  Instead, you should pass
+/// around BinaryStreamRefs by value and achieve polymorphism that way.
+class BinaryStreamRef
+    : public BinaryStreamRefBase<BinaryStream, BinaryStreamRef> {
+public:
+  BinaryStreamRef() = default;
+  BinaryStreamRef(BinaryStream &Stream)
+      : BinaryStreamRefBase(Stream, 0, Stream.getLength()) {}
+  BinaryStreamRef(BinaryStream &Stream, uint32_t Offset, uint32_t Length)
+      : BinaryStreamRefBase(Stream, Offset, Length) {}
+
+  // Use BinaryStreamRef.slice() instead.
+  BinaryStreamRef(BinaryStreamRef &S, uint32_t Offset,
+                  uint32_t Length) = delete;
+
+  /// Given an Offset into this StreamRef and a Size, return a reference to a
+  /// buffer owned by the stream.
+  ///
+  /// \returns a success error code if the entire range of data is within the
+  /// bounds of this BinaryStreamRef's view and the implementation could read
+  /// the data, and an appropriate error code otherwise.
+  Error readBytes(uint32_t Offset, uint32_t Size,
+                  ArrayRef<uint8_t> &Buffer) const {
+    if (auto EC = checkOffset(Offset, Size))
+      return EC;
+
+    return Stream->readBytes(ViewOffset + Offset, Size, Buffer);
+  }
+
+  /// Given an Offset into this BinaryStreamRef, return a reference to the
+  /// largest buffer the stream could support without necessitating a copy.
+  ///
+  /// \returns a success error code if implementation could read the data,
+  /// and an appropriate error code otherwise.
+  Error readLongestContiguousChunk(uint32_t Offset,
+                                   ArrayRef<uint8_t> &Buffer) const {
+    if (auto EC = checkOffset(Offset, 1))
+      return EC;
+
+    if (auto EC =
+            Stream->readLongestContiguousChunk(ViewOffset + Offset, Buffer))
+      return EC;
+    // This StreamRef might refer to a smaller window over a larger stream.  In
+    // that case we will have read out more bytes than we should return, because
+    // we should not read past the end of the current view.
+    uint32_t MaxLength = Length - Offset;
+    if (Buffer.size() > MaxLength)
+      Buffer = Buffer.slice(0, MaxLength);
+    return Error::success();
+  }
+};
+
+class WritableBinaryStreamRef
+    : public BinaryStreamRefBase<WritableBinaryStream,
+                                 WritableBinaryStreamRef> {
+public:
+  WritableBinaryStreamRef() = default;
+  WritableBinaryStreamRef(WritableBinaryStream &Stream)
+      : BinaryStreamRefBase(Stream, 0, Stream.getLength()) {}
+  WritableBinaryStreamRef(WritableBinaryStream &Stream, uint32_t Offset,
+                          uint32_t Length)
+      : BinaryStreamRefBase(Stream, Offset, Length) {}
+
+  // Use WritableBinaryStreamRef.slice() instead.
+  WritableBinaryStreamRef(WritableBinaryStreamRef &S, uint32_t Offset,
+                          uint32_t Length) = delete;
+
+  /// Given an Offset into this WritableBinaryStreamRef and some input data,
+  /// writes the data to the underlying stream.
+  ///
+  /// \returns a success error code if the data could fit within the underlying
+  /// stream at the specified location and the implementation could write the
+  /// data, and an appropriate error code otherwise.
+  Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> Data) const {
+    if (auto EC = checkOffset(Offset, Data.size()))
+      return EC;
+
+    return Stream->writeBytes(ViewOffset + Offset, Data);
+  }
+
+  operator BinaryStreamRef() { return {*Stream, ViewOffset, Length}; }
+
+  /// \brief For buffered streams, commits changes to the backing store.
+  Error commit() { return Stream->commit(); }
+};
+
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_BINARYSTREAMREF_H
diff --git a/include/llvm/Support/BinaryStreamWriter.h b/include/llvm/Support/BinaryStreamWriter.h
new file mode 100644
index 0000000000000000000000000000000000000000..64f26b24543df3a545991993e1b8a9b8af54e176
--- /dev/null
+++ b/include/llvm/Support/BinaryStreamWriter.h
@@ -0,0 +1,166 @@
+//===- BinaryStreamWriter.h - Writes objects to a BinaryStream ---*- C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_BINARYSTREAMWRITER_H
+#define LLVM_SUPPORT_BINARYSTREAMWRITER_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamError.h"
+#include "llvm/Support/BinaryStreamRef.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include <cstdint>
+#include <type_traits>
+
+namespace llvm {
+
+/// \brief Provides write only access to a subclass of `WritableBinaryStream`.
+/// Provides bounds checking and helpers for writing certain common data types
+/// such as null-terminated strings, integers in various flavors of endianness,
+/// etc.  Can be subclassed to provide writing of custom datatypes,
+/// although no methods are overridable.
+class BinaryStreamWriter {
+public:
+  BinaryStreamWriter() = default;
+  explicit BinaryStreamWriter(WritableBinaryStreamRef Stream);
+  virtual ~BinaryStreamWriter() {}
+
+  /// Write the bytes specified in \p Buffer to the underlying stream.
+  /// On success, updates the offset so that subsequent writes will occur
+  /// at the next unwritten position.
+  ///
+  /// \returns a success error code if the data was successfully written,
+  /// otherwise returns an appropriate error code.
+  Error writeBytes(ArrayRef<uint8_t> Buffer);
+
+  /// Write the integer \p Value to the underlying stream in the
+  /// specified endianness.  On success, updates the offset so that
+  /// subsequent writes occur at the next unwritten position.
+  ///
+  /// \returns a success error code if the data was successfully written,
+  /// otherwise returns an appropriate error code.
+  template <typename T> Error writeInteger(T Value) {
+    static_assert(std::is_integral<T>::value,
+                  "Cannot call writeInteger with non-integral value!");
+    uint8_t Buffer[sizeof(T)];
+    llvm::support::endian::write<T, llvm::support::unaligned>(
+        Buffer, Value, Stream.getEndian());
+    return writeBytes(Buffer);
+  }
+
+  /// Similar to writeInteger
+  template <typename T> Error writeEnum(T Num) {
+    static_assert(std::is_enum<T>::value,
+                  "Cannot call writeEnum with non-Enum type");
+
+    using U = typename std::underlying_type<T>::type;
+    return writeInteger<U>(static_cast<U>(Num));
+  }
+
+  /// Write the string \p Str to the underlying stream followed by a null
+  /// terminator.  On success, updates the offset so that subsequent writes
+  /// occur at the next unwritten position.  \p Str need not be null terminated
+  /// on input.
+  ///
+  /// \returns a success error code if the data was successfully written,
+  /// otherwise returns an appropriate error code.
+  Error writeCString(StringRef Str);
+
+  /// Write the string \p Str to the underlying stream without a null
+  /// terminator.  On success, updates the offset so that subsequent writes
+  /// occur at the next unwritten position.
+  ///
+  /// \returns a success error code if the data was successfully written,
+  /// otherwise returns an appropriate error code.
+  Error writeFixedString(StringRef Str);
+
+  /// Efficiently reads all data from \p Ref, and writes it to this stream.
+  /// This operation will not invoke any copies of the source data, regardless
+  /// of the source stream's implementation.
+  ///
+  /// \returns a success error code if the data was successfully written,
+  /// otherwise returns an appropriate error code.
+  Error writeStreamRef(BinaryStreamRef Ref);
+
+  /// Efficiently reads \p Size bytes from \p Ref, and writes it to this stream.
+  /// This operation will not invoke any copies of the source data, regardless
+  /// of the source stream's implementation.
+  ///
+  /// \returns a success error code if the data was successfully written,
+  /// otherwise returns an appropriate error code.
+  Error writeStreamRef(BinaryStreamRef Ref, uint32_t Size);
+
+  /// Writes the object \p Obj to the underlying stream, as if by using memcpy.
+  /// It is up to the caller to ensure that the type of \p Obj can be safely
+  /// copied in this fashion, as no checks are made to ensure that this is safe.
+  ///
+  /// \returns a success error code if the data was successfully written,
+  /// otherwise returns an appropriate error code.
+  template <typename T> Error writeObject(const T &Obj) {
+    static_assert(!std::is_pointer<T>::value,
+                  "writeObject should not be used with pointers, to write "
+                  "the pointed-to value dereference the pointer before calling "
+                  "writeObject");
+    return writeBytes(
+        ArrayRef<uint8_t>(reinterpret_cast<const uint8_t *>(&Obj), sizeof(T)));
+  }
+
+  /// Writes an array of objects of type T to the underlying stream, as if by
+  /// using memcpy.  It is up to the caller to ensure that type T can
+  /// be safely copied in this fashion, as no checks are made to ensure that
+  /// this is safe.
+  ///
+  /// \returns a success error code if the data was successfully written,
+  /// otherwise returns an appropriate error code.
+  template <typename T> Error writeArray(ArrayRef<T> Array) {
+    if (Array.empty())
+      return Error::success();
+    if (Array.size() > UINT32_MAX / sizeof(T))
+      return make_error<BinaryStreamError>(
+          stream_error_code::invalid_array_size);
+
+    return writeBytes(
+        ArrayRef<uint8_t>(reinterpret_cast<const uint8_t *>(Array.data()),
+                          Array.size() * sizeof(T)));
+  }
+
+  /// Writes all data from the array \p Array to the underlying stream.
+  ///
+  /// \returns a success error code if the data was successfully written,
+  /// otherwise returns an appropriate error code.
+  template <typename T, typename U>
+  Error writeArray(VarStreamArray<T, U> Array) {
+    return writeStreamRef(Array.getUnderlyingStream());
+  }
+
+  /// Writes all elements from the array \p Array to the underlying stream.
+  ///
+  /// \returns a success error code if the data was successfully written,
+  /// otherwise returns an appropriate error code.
+  template <typename T> Error writeArray(FixedStreamArray<T> Array) {
+    return writeStreamRef(Array.getUnderlyingStream());
+  }
+
+  void setOffset(uint32_t Off) { Offset = Off; }
+  uint32_t getOffset() const { return Offset; }
+  uint32_t getLength() const { return Stream.getLength(); }
+  uint32_t bytesRemaining() const { return getLength() - getOffset(); }
+  Error padToAlignment(uint32_t Align);
+
+protected:
+  WritableBinaryStreamRef Stream;
+  uint32_t Offset = 0;
+};
+
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_BINARYSTREAMWRITER_H
diff --git a/include/llvm/Support/CachePruning.h b/include/llvm/Support/CachePruning.h
index 954fd8ae7ffbbe839330a960b7493874ad3cf1c8..e826938878e50e3e9a59185aa9ff4035028e6fe0 100644
--- a/include/llvm/Support/CachePruning.h
+++ b/include/llvm/Support/CachePruning.h
@@ -20,51 +20,44 @@
 
 namespace llvm {
 
-/// Handle pruning a directory provided a path and some options to control what
-/// to prune.
-class CachePruning {
-public:
-  /// Prepare to prune \p Path.
-  CachePruning(StringRef Path) : Path(Path) {}
-
-  /// Define the pruning interval. This is intended to be used to avoid scanning
-  /// the directory too often. It does not impact the decision of which file to
-  /// prune. A value of 0 forces the scan to occurs.
-  CachePruning &setPruningInterval(std::chrono::seconds PruningInterval) {
-    Interval = PruningInterval;
-    return *this;
-  }
-
-  /// Define the expiration for a file. When a file hasn't been accessed for
-  /// \p ExpireAfter seconds, it is removed from the cache. A value of 0 disable
-  /// the expiration-based pruning.
-  CachePruning &setEntryExpiration(std::chrono::seconds ExpireAfter) {
-    Expiration = ExpireAfter;
-    return *this;
-  }
-
-  /// Define the maximum size for the cache directory, in terms of percentage of
-  /// the available space on the the disk. Set to 100 to indicate no limit, 50
-  /// to indicate that the cache size will not be left over half the
-  /// available disk space. A value over 100 will be reduced to 100. A value of
-  /// 0 disable the size-based pruning.
-  CachePruning &setMaxSize(unsigned Percentage) {
-    PercentageOfAvailableSpace = std::min(100u, Percentage);
-    return *this;
-  }
-
-  /// Peform pruning using the supplied options, returns true if pruning
-  /// occured, i.e. if PruningInterval was expired.
-  bool prune();
-
-private:
-  // Options that matches the setters above.
-  std::string Path;
-  std::chrono::seconds Expiration = std::chrono::seconds::zero();
-  std::chrono::seconds Interval = std::chrono::seconds::zero();
-  unsigned PercentageOfAvailableSpace = 0;
+template <typename T> class Expected;
+
+/// Policy for the pruneCache() function. A default constructed
+/// CachePruningPolicy provides a reasonable default policy.
+struct CachePruningPolicy {
+  /// The pruning interval. This is intended to be used to avoid scanning the
+  /// directory too often. It does not impact the decision of which file to
+  /// prune. A value of 0 forces the scan to occur.
+  std::chrono::seconds Interval = std::chrono::seconds(1200);
+
+  /// The expiration for a file. When a file hasn't been accessed for Expiration
+  /// seconds, it is removed from the cache. A value of 0 disables the
+  /// expiration-based pruning.
+  std::chrono::seconds Expiration = std::chrono::hours(7 * 24); // 1w
+
+  /// The maximum size for the cache directory, in terms of percentage of the
+  /// available space on the disk. Set to 100 to indicate no limit, 50 to
+  /// indicate that the cache size will not be left over half the available disk
+  /// space. A value over 100 will be reduced to 100. A value of 0 disables the
+  /// size-based pruning.
+  unsigned PercentageOfAvailableSpace = 75;
 };
 
+/// Parse the given string as a cache pruning policy. Defaults are taken from a
+/// default constructed CachePruningPolicy object.
+/// For example: "prune_interval=30s:prune_after=24h:cache_size=50%"
+/// which means a pruning interval of 30 seconds, expiration time of 24 hours
+/// and maximum cache size of 50% of available disk space.
+Expected<CachePruningPolicy> parseCachePruningPolicy(StringRef PolicyStr);
+
+/// Perform pruning using the supplied policy, returns true if pruning
+/// occurred, i.e. if Policy.Interval was expired.
+///
+/// As a safeguard against data loss if the user specifies the wrong directory
+/// as their cache directory, this function will ignore files not matching the
+/// pattern "llvmcache-*".
+bool pruneCache(StringRef Path, CachePruningPolicy Policy);
+
 } // namespace llvm
 
 #endif
diff --git a/include/llvm/Support/Chrono.h b/include/llvm/Support/Chrono.h
index 203439cab91920b3761ea7435b816e50495691c6..6118ed0476edf0f2cd1471b35ece6510cf9f1d19 100644
--- a/include/llvm/Support/Chrono.h
+++ b/include/llvm/Support/Chrono.h
@@ -11,6 +11,7 @@
 #define LLVM_SUPPORT_CHRONO_H
 
 #include "llvm/Support/Compiler.h"
+#include "llvm/Support/FormatProviders.h"
 
 #include <chrono>
 #include <ctime>
@@ -50,6 +51,100 @@ toTimePoint(std::time_t T) {
 
 raw_ostream &operator<<(raw_ostream &OS, sys::TimePoint<> TP);
 
+/// Implementation of format_provider<T> for duration types.
+///
+/// The options string of a duration type has the grammar:
+///
+///   duration_options  ::= [unit][show_unit [number_options]]
+///   unit              ::= `h`|`m`|`s`|`ms`|`us`|`ns`
+///   show_unit         ::= `+` | `-`
+///   number_options    ::= options string for a integral or floating point type
+///
+///   Examples
+///   =================================
+///   |  options  | Input | Output    |
+///   =================================
+///   | ""        | 1s    | 1 s       |
+///   | "ms"      | 1s    | 1000 ms   |
+///   | "ms-"     | 1s    | 1000      |
+///   | "ms-n"    | 1s    | 1,000     |
+///   | ""        | 1.0s  | 1.00 s    |
+///   =================================
+///
+///  If the unit of the duration type is not one of the units specified above,
+///  it is still possible to format it, provided you explicitly request a
+///  display unit or you request that the unit is not displayed.
+
+namespace detail { // unit<P>::value is the display suffix for period P
+template <typename Period> struct unit { static const char value[]; };
+template <typename Period> const char unit<Period>::value[] = ""; // no suffix
+
+template <> struct unit<std::ratio<3600>> { static const char value[]; };
+template <> struct unit<std::ratio<60>> { static const char value[]; };
+template <> struct unit<std::ratio<1>> { static const char value[]; };
+template <> struct unit<std::milli> { static const char value[]; };
+template <> struct unit<std::micro> { static const char value[]; };
+template <> struct unit<std::nano> { static const char value[]; };
+} // namespace detail
+
+template <typename Rep, typename Period>
+struct format_provider<std::chrono::duration<Rep, Period>> {
+private:
+  typedef std::chrono::duration<Rep, Period> Dur;
+  typedef typename std::conditional<
+      std::chrono::treat_as_floating_point<Rep>::value, double, intmax_t>::type
+      InternalRep; // double for floating-point Rep, else intmax_t
+
+  template <typename AsPeriod> static InternalRep getAs(const Dur &D) {
+    using namespace std::chrono; // tick count of D expressed in AsPeriod
+    return duration_cast<duration<InternalRep, AsPeriod>>(D).count();
+  }
+
+  static std::pair<InternalRep, StringRef> consumeUnit(StringRef &Style,
+                                                        const Dur &D) {
+    using namespace std::chrono; // an explicit unit prefix in Style wins
+    if (Style.consume_front("ns"))
+      return {getAs<std::nano>(D), "ns"};
+    if (Style.consume_front("us"))
+      return {getAs<std::micro>(D), "us"};
+    if (Style.consume_front("ms"))
+      return {getAs<std::milli>(D), "ms"};
+    if (Style.consume_front("s"))
+      return {getAs<std::ratio<1>>(D), "s"};
+    if (Style.consume_front("m"))
+      return {getAs<std::ratio<60>>(D), "m"};
+    if (Style.consume_front("h"))
+      return {getAs<std::ratio<3600>>(D), "h"};
+    return {D.count(), detail::unit<Period>::value}; // native unit
+  }
+
+  static bool consumeShowUnit(StringRef &Style) {
+    if (Style.empty())
+      return true; // nothing left in Style: show the unit
+    if (Style.consume_front("-"))
+      return false; // "-" hides the unit
+    if (Style.consume_front("+"))
+      return true; // "+" shows the unit
+    assert(0 && "Unrecognised duration format");
+    return true;
+  }
+
+public:
+  static void format(const Dur &D, llvm::raw_ostream &Stream, StringRef Style) {
+    InternalRep count;
+    StringRef unit;
+    std::tie(count, unit) = consumeUnit(Style, D);
+    bool show_unit = consumeShowUnit(Style); // rest is number_options
+
+    format_provider<InternalRep>::format(count, Stream, Style);
+
+    if (show_unit) {
+      assert(!unit.empty()); // unknown period needs explicit unit or "-"
+      Stream << " " << unit;
+    }
+  }
+};
+
 } // namespace llvm
 
 #endif // LLVM_SUPPORT_CHRONO_H
diff --git a/include/llvm/Support/CommandLine.h b/include/llvm/Support/CommandLine.h
index 8d4ac81d29429be464145c2474f246a973af2554..ae32e20d6daba932545aaa3d2742ff24bef7696f 100644
--- a/include/llvm/Support/CommandLine.h
+++ b/include/llvm/Support/CommandLine.h
@@ -50,9 +50,12 @@ namespace cl {
 //===----------------------------------------------------------------------===//
 // ParseCommandLineOptions - Command line option processing entry point.
 //
+// Returns true on success. Otherwise, this will print the error message to
+// stderr and exit if \p Errs is not set (nullptr by default), or print the
+// error message to \p Errs and return false if \p Errs is provided.
 bool ParseCommandLineOptions(int argc, const char *const *argv,
                              StringRef Overview = "",
-                             bool IgnoreErrors = false);
+                             raw_ostream *Errs = nullptr);
 
 //===----------------------------------------------------------------------===//
 // ParseEnvironmentOptions - Environment variable option processing alternate
@@ -343,6 +346,9 @@ public:
 
   virtual void printOptionValue(size_t GlobalWidth, bool Force) const = 0;
 
+  static void printHelpStr(StringRef HelpStr, size_t Indent,
+                           size_t FirstLineIndentedBy);
+
   virtual void getExtraOptionNames(SmallVectorImpl<StringRef> &) {}
 
   // addOccurrence - Wrapper around handleOccurrence that enforces Flags.
diff --git a/include/llvm/Support/Compiler.h b/include/llvm/Support/Compiler.h
index 4b4a92451781515be810270e868eedf41f6e462c..a56bc93e111b6e24c801f35f67f1fd3b085aef7c 100644
--- a/include/llvm/Support/Compiler.h
+++ b/include/llvm/Support/Compiler.h
@@ -343,7 +343,7 @@
 ///   int k;
 ///   long long l;
 /// };
-/// LLVM_PACKED_END 
+/// LLVM_PACKED_END
 #ifdef _MSC_VER
 # define LLVM_PACKED(d) __pragma(pack(push, 1)) d __pragma(pack(pop))
 # define LLVM_PACKED_START __pragma(pack(push, 1))
@@ -464,7 +464,7 @@ void AnnotateIgnoreWritesEnd(const char *file, int line);
 #define LLVM_PRETTY_FUNCTION __FUNCSIG__
 #elif defined(__GNUC__) || defined(__clang__)
 #define LLVM_PRETTY_FUNCTION __PRETTY_FUNCTION__
-#else 
+#else
 #define LLVM_PRETTY_FUNCTION __func__
 #endif
 
diff --git a/include/llvm/Support/DebugCounter.h b/include/llvm/Support/DebugCounter.h
new file mode 100644
index 0000000000000000000000000000000000000000..9687cb7b9d95fa835b55e7a0c9991f04e94e4695
--- /dev/null
+++ b/include/llvm/Support/DebugCounter.h
@@ -0,0 +1,165 @@
+//===- llvm/Support/DebugCounter.h - Debug counter support ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// \brief This file provides an implementation of debug counters.  Debug
+/// counters are a tool that let you narrow down a miscompilation to a specific
+/// thing happening.
+///
+/// To give a use case: Imagine you have a file, very large, and you
+/// are trying to understand the minimal transformation that breaks it. Bugpoint
+/// and bisection is often helpful here in narrowing it down to a specific pass,
+/// but it's still a very large file, and a very complicated pass to try to
+/// debug.  That is where debug counting steps in.  You can instrument the pass
+/// with a debug counter before it does a certain thing, and depending on the
+/// counts, it will either execute that thing or not.  The debug counter itself
+/// consists of a skip and a count.  Skip is the number of times shouldExecute
+/// needs to be called before it returns true.  Count is the number of times to
+/// return true once Skip is 0.  So a skip=47, count=2, would skip the first 47
+/// executions by returning false from shouldExecute, then execute twice, and
+/// then return false again.
+/// Note that a counter set to a negative number will always execute.
+/// For a concrete example, during predicateinfo creation, the renaming pass
+/// replaces each use with a renamed use.
+///
+/// If I use DEBUG_COUNTER to create a counter called "predicateinfo", and
+/// variable name RenameCounter, and then instrument this renaming with a debug
+/// counter, like so:
+///
+/// if (!DebugCounter::shouldExecute(RenameCounter))
+/// <continue or return or whatever not executing looks like>
+///
+/// Now I can, from the command line, make it rename or not rename certain uses
+/// by setting the skip and count.
+/// So for example
+/// bin/opt -debug-counter=predicateinfo-skip=47,predicateinfo-count=1
+/// will skip renaming the first 47 uses, then rename one, then skip the rest.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_DEBUGCOUNTER_H
+#define LLVM_SUPPORT_DEBUGCOUNTER_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/UniqueVector.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+
+namespace llvm {
+
+class DebugCounter {
+public:
+  /// \brief Returns a reference to the singleton instance.
+  static DebugCounter &instance();
+
+  // Used by the command line option parser to push a new value it parsed.
+  void push_back(const std::string &);
+
+  // Register a counter with the specified name.
+  //
+  // FIXME: Currently, counter registration is required to happen before command
+  // line option parsing. The main reason to register counters is to produce a
+  // nice list of them on the command line, but I'm not sure this is worth it.
+  static unsigned registerCounter(StringRef Name, StringRef Desc) {
+    return instance().addCounter(Name, Desc);
+  }
+  inline static bool shouldExecute(unsigned CounterName) {
+// Compile to nothing when debugging is off
+#ifdef NDEBUG
+    return true;
+#else
+    auto &Us = instance();
+    auto Result = Us.Counters.find(CounterName);
+    if (Result != Us.Counters.end()) {
+      auto &CounterPair = Result->second;
+      // We only execute while the skip (first) is zero and the count (second)
+      // is non-zero.
+      // Negative counters always execute.
+      if (CounterPair.first < 0)
+        return true;
+      if (CounterPair.first != 0) {
+        --CounterPair.first;
+        return false;
+      }
+      if (CounterPair.second < 0)
+        return true;
+      if (CounterPair.second != 0) {
+        --CounterPair.second;
+        return true;
+      }
+      return false;
+    }
+    // Didn't find the counter, should we warn?
+    return true;
+#endif // NDEBUG
+  }
+
+  // Return true if a given counter had values set (either programmatically or
+  // on the command line).  This will return true even if those values are
+  // currently in a state where the counter will always execute.
+  static bool isCounterSet(unsigned ID) {
+    return instance().Counters.count(ID);
+  }
+
+  // Return the skip and count for a counter. This only works for set counters.
+  static std::pair<int, int> getCounterValue(unsigned ID) {
+    auto &Us = instance();
+    auto Result = Us.Counters.find(ID);
+    assert(Result != Us.Counters.end() && "Asking about a non-set counter");
+    return Result->second;
+  }
+
+  // Set a registered counter to a given value.
+  static void setCounterValue(unsigned ID, const std::pair<int, int> &Val) {
+    auto &Us = instance();
+    Us.Counters[ID] = Val;
+  }
+
+  // Dump or print the current counter set.
+  LLVM_DUMP_METHOD void dump() { print(dbgs()); }
+
+  void print(raw_ostream &OS);
+
+  // Get the counter ID for a given named counter, or return 0 if none is found.
+  unsigned getCounterId(const std::string &Name) const {
+    return RegisteredCounters.idFor(Name);
+  }
+
+  // Return the number of registered counters.
+  unsigned int getNumCounters() const { return RegisteredCounters.size(); }
+
+  // Return the name and description of the counter with the given ID.
+  std::pair<std::string, std::string> getCounterInfo(unsigned ID) const {
+    return std::make_pair(RegisteredCounters[ID], CounterDesc.lookup(ID));
+  }
+
+  // Iterate through the registered counters
+  typedef UniqueVector<std::string> CounterVector;
+  CounterVector::const_iterator begin() const {
+    return RegisteredCounters.begin();
+  }
+  CounterVector::const_iterator end() const { return RegisteredCounters.end(); }
+
+private:
+  unsigned addCounter(const std::string &Name, const std::string &Desc) {
+    unsigned Result = RegisteredCounters.insert(Name);
+    CounterDesc[Result] = Desc;
+    return Result;
+  }
+  DenseMap<unsigned, std::pair<long, long>> Counters; // ID -> (skip, count)
+  DenseMap<unsigned, std::string> CounterDesc; // ID -> description
+  CounterVector RegisteredCounters;
+};
+
+#define DEBUG_COUNTER(VARNAME, COUNTERNAME, DESC)                              \
+  static const unsigned VARNAME =                                              \
+      DebugCounter::registerCounter(COUNTERNAME, DESC);
+
+} // namespace llvm
+#endif
diff --git a/include/llvm/Support/Dwarf.def b/include/llvm/Support/Dwarf.def
index 841fc7d4ae22ee8e4c320df35e5fd9c090bf7c41..fdbd8ea7011669ac1cfd19123abd4e0d66d9a515 100644
--- a/include/llvm/Support/Dwarf.def
+++ b/include/llvm/Support/Dwarf.def
@@ -19,7 +19,8 @@
       defined HANDLE_DW_CC || defined HANDLE_DW_LNS ||                         \
       defined HANDLE_DW_LNE || defined HANDLE_DW_LNCT ||                       \
       defined HANDLE_DW_MACRO || defined HANDLE_DW_RLE ||                      \
-      defined HANDLE_DW_CFA || defined HANDLE_DW_APPLE_PROPERTY)
+      defined HANDLE_DW_CFA || defined HANDLE_DW_APPLE_PROPERTY ||             \
+      defined HANDLE_DW_UT)
 #error "Missing macro definition of HANDLE_DW*"
 #endif
 
@@ -87,6 +88,10 @@
 #define HANDLE_DW_APPLE_PROPERTY(ID, NAME)
 #endif
 
+#ifndef HANDLE_DW_UT
+#define HANDLE_DW_UT(ID, NAME)
+#endif
+
 HANDLE_DW_TAG(0x0000, null)
 HANDLE_DW_TAG(0x0001, array_type)
 HANDLE_DW_TAG(0x0002, class_type)
@@ -135,6 +140,7 @@ HANDLE_DW_TAG(0x0032, try_block)
 HANDLE_DW_TAG(0x0033, variant_part)
 HANDLE_DW_TAG(0x0034, variable)
 HANDLE_DW_TAG(0x0035, volatile_type)
+// New in DWARF v3:
 HANDLE_DW_TAG(0x0036, dwarf_procedure)
 HANDLE_DW_TAG(0x0037, restrict_type)
 HANDLE_DW_TAG(0x0038, interface_type)
@@ -145,11 +151,11 @@ HANDLE_DW_TAG(0x003c, partial_unit)
 HANDLE_DW_TAG(0x003d, imported_unit)
 HANDLE_DW_TAG(0x003f, condition)
 HANDLE_DW_TAG(0x0040, shared_type)
+// New in DWARF v4:
 HANDLE_DW_TAG(0x0041, type_unit)
 HANDLE_DW_TAG(0x0042, rvalue_reference_type)
 HANDLE_DW_TAG(0x0043, template_alias)
-
-// New in DWARF v5.
+// New in DWARF v5:
 HANDLE_DW_TAG(0x0044, coarray_type)
 HANDLE_DW_TAG(0x0045, generic_subrange)
 HANDLE_DW_TAG(0x0046, dynamic_type)
@@ -158,8 +164,7 @@ HANDLE_DW_TAG(0x0048, call_site)
 HANDLE_DW_TAG(0x0049, call_site_parameter)
 HANDLE_DW_TAG(0x004a, skeleton_unit)
 HANDLE_DW_TAG(0x004b, immutable_type)
-
-// User-defined tags.
+// Vendor extensions:
 HANDLE_DW_TAG(0x4081, MIPS_loop)
 HANDLE_DW_TAG(0x4101, format_label)
 HANDLE_DW_TAG(0x4102, function_template)
@@ -234,6 +239,7 @@ HANDLE_DW_AT(0x4a, use_location)
 HANDLE_DW_AT(0x4b, variable_parameter)
 HANDLE_DW_AT(0x4c, virtuality)
 HANDLE_DW_AT(0x4d, vtable_elem_location)
+// New in DWARF v3:
 HANDLE_DW_AT(0x4e, allocated)
 HANDLE_DW_AT(0x4f, associated)
 HANDLE_DW_AT(0x50, data_location)
@@ -261,14 +267,14 @@ HANDLE_DW_AT(0x65, endianity)
 HANDLE_DW_AT(0x66, elemental)
 HANDLE_DW_AT(0x67, pure)
 HANDLE_DW_AT(0x68, recursive)
+// New in DWARF v4:
 HANDLE_DW_AT(0x69, signature)
 HANDLE_DW_AT(0x6a, main_subprogram)
 HANDLE_DW_AT(0x6b, data_bit_offset)
 HANDLE_DW_AT(0x6c, const_expr)
 HANDLE_DW_AT(0x6d, enum_class)
 HANDLE_DW_AT(0x6e, linkage_name)
-
-// New in DWARF 5:
+// New in DWARF v5:
 HANDLE_DW_AT(0x6f, string_length_bit_size)
 HANDLE_DW_AT(0x70, string_length_byte_size)
 HANDLE_DW_AT(0x71, rank)
@@ -299,7 +305,7 @@ HANDLE_DW_AT(0x89, export_symbols)
 HANDLE_DW_AT(0x8a, deleted)
 HANDLE_DW_AT(0x8b, defaulted)
 HANDLE_DW_AT(0x8c, loclists_base)
-
+// Vendor extensions:
 HANDLE_DW_AT(0x2002, MIPS_loop_begin)
 HANDLE_DW_AT(0x2003, MIPS_tail_loop_begin)
 HANDLE_DW_AT(0x2004, MIPS_epilog_begin)
@@ -315,11 +321,9 @@ HANDLE_DW_AT(0x200d, MIPS_stride_elem)
 HANDLE_DW_AT(0x200e, MIPS_ptr_dopetype)
 HANDLE_DW_AT(0x200f, MIPS_allocatable_dopetype)
 HANDLE_DW_AT(0x2010, MIPS_assumed_shape_dopetype)
-
 // This one appears to have only been implemented by Open64 for
 // fortran and may conflict with other extensions.
 HANDLE_DW_AT(0x2011, MIPS_assumed_size)
-
 // GNU extensions
 HANDLE_DW_AT(0x2101, sf_names)
 HANDLE_DW_AT(0x2102, src_info)
@@ -329,10 +333,8 @@ HANDLE_DW_AT(0x2105, body_begin)
 HANDLE_DW_AT(0x2106, body_end)
 HANDLE_DW_AT(0x2107, GNU_vector)
 HANDLE_DW_AT(0x2110, GNU_template_name)
-
 HANDLE_DW_AT(0x210f, GNU_odr_signature)
 HANDLE_DW_AT(0x2119, GNU_macros)
-
 // Extensions for Fission proposal.
 HANDLE_DW_AT(0x2130, GNU_dwo_name)
 HANDLE_DW_AT(0x2131, GNU_dwo_id)
@@ -341,7 +343,6 @@ HANDLE_DW_AT(0x2133, GNU_addr_base)
 HANDLE_DW_AT(0x2134, GNU_pubnames)
 HANDLE_DW_AT(0x2135, GNU_pubtypes)
 HANDLE_DW_AT(0x2136, GNU_discriminator)
-
 // Borland extensions.
 HANDLE_DW_AT(0x3b11, BORLAND_property_read)
 HANDLE_DW_AT(0x3b12, BORLAND_property_write)
@@ -360,12 +361,10 @@ HANDLE_DW_AT(0x3b28, BORLAND_Delphi_ABI)
 HANDLE_DW_AT(0x3b29, BORLAND_Delphi_return)
 HANDLE_DW_AT(0x3b30, BORLAND_Delphi_frameptr)
 HANDLE_DW_AT(0x3b31, BORLAND_closure)
-
 // LLVM project extensions.
 HANDLE_DW_AT(0x3e00, LLVM_include_path)
 HANDLE_DW_AT(0x3e01, LLVM_config_macros)
 HANDLE_DW_AT(0x3e02, LLVM_isysroot)
-
 // Apple extensions.
 HANDLE_DW_AT(0x3fe1, APPLE_optimized)
 HANDLE_DW_AT(0x3fe2, APPLE_flags)
@@ -403,26 +402,34 @@ HANDLE_DW_FORM(0x13, ref4)
 HANDLE_DW_FORM(0x14, ref8)
 HANDLE_DW_FORM(0x15, ref_udata)
 HANDLE_DW_FORM(0x16, indirect)
+// New in DWARF v4:
 HANDLE_DW_FORM(0x17, sec_offset)
 HANDLE_DW_FORM(0x18, exprloc)
 HANDLE_DW_FORM(0x19, flag_present)
-
-// New in DWARF v5.
+// This was defined out of sequence.
+HANDLE_DW_FORM(0x20, ref_sig8)
+// New in DWARF v5:
 HANDLE_DW_FORM(0x1a, strx)
 HANDLE_DW_FORM(0x1b, addrx)
-HANDLE_DW_FORM(0x1c, ref_sup)
+HANDLE_DW_FORM(0x1c, ref_sup4)
 HANDLE_DW_FORM(0x1d, strp_sup)
 HANDLE_DW_FORM(0x1e, data16)
 HANDLE_DW_FORM(0x1f, line_strp)
-HANDLE_DW_FORM(0x20, ref_sig8)
 HANDLE_DW_FORM(0x21, implicit_const)
 HANDLE_DW_FORM(0x22, loclistx)
 HANDLE_DW_FORM(0x23, rnglistx)
-
+HANDLE_DW_FORM(0x24, ref_sup8)
+HANDLE_DW_FORM(0x25, strx1)
+HANDLE_DW_FORM(0x26, strx2)
+HANDLE_DW_FORM(0x27, strx3)
+HANDLE_DW_FORM(0x28, strx4)
+HANDLE_DW_FORM(0x29, addrx1)
+HANDLE_DW_FORM(0x2a, addrx2)
+HANDLE_DW_FORM(0x2b, addrx3)
+HANDLE_DW_FORM(0x2c, addrx4)
 // Extensions for Fission proposal
 HANDLE_DW_FORM(0x1f01, GNU_addr_index)
 HANDLE_DW_FORM(0x1f02, GNU_str_index)
-
 // Alternate debug sections proposal (output of "dwz" tool).
 HANDLE_DW_FORM(0x1f20, GNU_ref_alt)
 HANDLE_DW_FORM(0x1f21, GNU_strp_alt)
@@ -462,7 +469,6 @@ HANDLE_DW_OP(0x24, shl)
 HANDLE_DW_OP(0x25, shr)
 HANDLE_DW_OP(0x26, shra)
 HANDLE_DW_OP(0x27, xor)
-HANDLE_DW_OP(0x2f, skip)
 HANDLE_DW_OP(0x28, bra)
 HANDLE_DW_OP(0x29, eq)
 HANDLE_DW_OP(0x2a, ge)
@@ -470,6 +476,7 @@ HANDLE_DW_OP(0x2b, gt)
 HANDLE_DW_OP(0x2c, le)
 HANDLE_DW_OP(0x2d, lt)
 HANDLE_DW_OP(0x2e, ne)
+HANDLE_DW_OP(0x2f, skip)
 HANDLE_DW_OP(0x30, lit0)
 HANDLE_DW_OP(0x31, lit1)
 HANDLE_DW_OP(0x32, lit2)
@@ -573,6 +580,7 @@ HANDLE_DW_OP(0x93, piece)
 HANDLE_DW_OP(0x94, deref_size)
 HANDLE_DW_OP(0x95, xderef_size)
 HANDLE_DW_OP(0x96, nop)
+// New in DWARF v3:
 HANDLE_DW_OP(0x97, push_object_address)
 HANDLE_DW_OP(0x98, call2)
 HANDLE_DW_OP(0x99, call4)
@@ -580,8 +588,10 @@ HANDLE_DW_OP(0x9a, call_ref)
 HANDLE_DW_OP(0x9b, form_tls_address)
 HANDLE_DW_OP(0x9c, call_frame_cfa)
 HANDLE_DW_OP(0x9d, bit_piece)
+// New in DWARF v4:
 HANDLE_DW_OP(0x9e, implicit_value)
 HANDLE_DW_OP(0x9f, stack_value)
+// New in DWARF v5:
 HANDLE_DW_OP(0xa0, implicit_pointer)
 HANDLE_DW_OP(0xa1, addrx)
 HANDLE_DW_OP(0xa2, constx)
@@ -592,11 +602,9 @@ HANDLE_DW_OP(0xa6, deref_type)
 HANDLE_DW_OP(0xa7, xderef_type)
 HANDLE_DW_OP(0xa8, convert)
 HANDLE_DW_OP(0xa9, reinterpret)
-
-// Vendor extensions.
+// Vendor extensions:
 // Extensions for GNU-style thread-local storage.
 HANDLE_DW_OP(0xe0, GNU_push_tls_address)
-
 // Extensions for Fission proposal.
 HANDLE_DW_OP(0xfb, GNU_addr_index)
 HANDLE_DW_OP(0xfc, GNU_const_index)
@@ -612,6 +620,7 @@ HANDLE_DW_LANG(0x0007, Fortran77)
 HANDLE_DW_LANG(0x0008, Fortran90)
 HANDLE_DW_LANG(0x0009, Pascal83)
 HANDLE_DW_LANG(0x000a, Modula2)
+// New in DWARF v3:
 HANDLE_DW_LANG(0x000b, Java)
 HANDLE_DW_LANG(0x000c, C99)
 HANDLE_DW_LANG(0x000d, Ada95)
@@ -621,9 +630,9 @@ HANDLE_DW_LANG(0x0010, ObjC)
 HANDLE_DW_LANG(0x0011, ObjC_plus_plus)
 HANDLE_DW_LANG(0x0012, UPC)
 HANDLE_DW_LANG(0x0013, D)
-
-// New in DWARF 5:
+// New in DWARF v4:
 HANDLE_DW_LANG(0x0014, Python)
+// New in DWARF v5:
 HANDLE_DW_LANG(0x0015, OpenCL)
 HANDLE_DW_LANG(0x0016, Go)
 HANDLE_DW_LANG(0x0017, Modula3)
@@ -640,8 +649,8 @@ HANDLE_DW_LANG(0x0021, C_plus_plus_14)
 HANDLE_DW_LANG(0x0022, Fortran03)
 HANDLE_DW_LANG(0x0023, Fortran08)
 HANDLE_DW_LANG(0x0024, RenderScript)
-
-// Vendor extensions.
+HANDLE_DW_LANG(0x0025, BLISS)
+// Vendor extensions:
 HANDLE_DW_LANG(0x8001, Mips_Assembler)
 HANDLE_DW_LANG(0x8e57, GOOGLE_RenderScript)
 HANDLE_DW_LANG(0xb000, BORLAND_Delphi)
@@ -655,6 +664,7 @@ HANDLE_DW_ATE(0x05, signed)
 HANDLE_DW_ATE(0x06, signed_char)
 HANDLE_DW_ATE(0x07, unsigned)
 HANDLE_DW_ATE(0x08, unsigned_char)
+// New in DWARF v3:
 HANDLE_DW_ATE(0x09, imaginary_float)
 HANDLE_DW_ATE(0x0a, packed_decimal)
 HANDLE_DW_ATE(0x0b, numeric_string)
@@ -662,7 +672,9 @@ HANDLE_DW_ATE(0x0c, edited)
 HANDLE_DW_ATE(0x0d, signed_fixed)
 HANDLE_DW_ATE(0x0e, unsigned_fixed)
 HANDLE_DW_ATE(0x0f, decimal_float)
+// New in DWARF v4:
 HANDLE_DW_ATE(0x10, UTF)
+// New in DWARF v5:
 HANDLE_DW_ATE(0x11, UCS)
 HANDLE_DW_ATE(0x12, ASCII)
 
@@ -680,8 +692,10 @@ HANDLE_DW_DEFAULTED(0x02, out_of_class)
 HANDLE_DW_CC(0x01, normal)
 HANDLE_DW_CC(0x02, program)
 HANDLE_DW_CC(0x03, nocall)
+// New in DWARF v5:
 HANDLE_DW_CC(0x04, pass_by_reference)
 HANDLE_DW_CC(0x05, pass_by_value)
+// Vendor extensions:
 HANDLE_DW_CC(0x41, GNU_borland_fastcall_i386)
 HANDLE_DW_CC(0xb0, BORLAND_safecall)
 HANDLE_DW_CC(0xb1, BORLAND_stdcall)
@@ -696,6 +710,7 @@ HANDLE_DW_CC(0xc0, LLVM_vectorcall)
 HANDLE_DW_LNE(0x01, end_sequence)
 HANDLE_DW_LNE(0x02, set_address)
 HANDLE_DW_LNE(0x03, define_file)
+// New in DWARF v4:
 HANDLE_DW_LNE(0x04, set_discriminator)
 
 // Line Number Standard Opcode Encodings.
@@ -709,6 +724,7 @@ HANDLE_DW_LNS(0x06, negate_stmt)
 HANDLE_DW_LNS(0x07, set_basic_block)
 HANDLE_DW_LNS(0x08, const_add_pc)
 HANDLE_DW_LNS(0x09, fixed_advance_pc)
+// New in DWARF v3:
 HANDLE_DW_LNS(0x0a, set_prologue_end)
 HANDLE_DW_LNS(0x0b, set_epilogue_begin)
 HANDLE_DW_LNS(0x0c, set_isa)
@@ -720,6 +736,7 @@ HANDLE_DW_LNCT(0x03, timestamp)
 HANDLE_DW_LNCT(0x04, size)
 HANDLE_DW_LNCT(0x05, MD5)
 
+// DWARF v5 Macro information.
 HANDLE_DW_MACRO(0x01, define)
 HANDLE_DW_MACRO(0x02, undef)
 HANDLE_DW_MACRO(0x03, start_file)
@@ -733,7 +750,7 @@ HANDLE_DW_MACRO(0x0a, import_sup)
 HANDLE_DW_MACRO(0x0b, define_strx)
 HANDLE_DW_MACRO(0x0c, undef_strx)
 
-// Range list entry encoding values.
+// DWARF v5 Range List Entry encoding values.
 HANDLE_DW_RLE(0x00, end_of_list)
 HANDLE_DW_RLE(0x01, base_addressx)
 HANDLE_DW_RLE(0x02, startx_endx)
@@ -762,6 +779,7 @@ HANDLE_DW_CFA(0x0b, restore_state)
 HANDLE_DW_CFA(0x0c, def_cfa)
 HANDLE_DW_CFA(0x0d, def_cfa_register)
 HANDLE_DW_CFA(0x0e, def_cfa_offset)
+// New in DWARF v3:
 HANDLE_DW_CFA(0x0f, def_cfa_expression)
 HANDLE_DW_CFA(0x10, expression)
 HANDLE_DW_CFA(0x11, offset_extended_sf)
@@ -770,6 +788,7 @@ HANDLE_DW_CFA(0x13, def_cfa_offset_sf)
 HANDLE_DW_CFA(0x14, val_offset)
 HANDLE_DW_CFA(0x15, val_offset_sf)
 HANDLE_DW_CFA(0x16, val_expression)
+// Vendor extensions:
 HANDLE_DW_CFA(0x1d, MIPS_advance_loc8)
 HANDLE_DW_CFA(0x2d, GNU_window_save)
 HANDLE_DW_CFA(0x2e, GNU_args_size)
@@ -792,6 +811,13 @@ HANDLE_DW_APPLE_PROPERTY(0x1000, nullability)
 HANDLE_DW_APPLE_PROPERTY(0x2000, null_resettable)
 HANDLE_DW_APPLE_PROPERTY(0x4000, class)
 
+// DWARF v5 Unit Types.
+HANDLE_DW_UT(0x01, compile)
+HANDLE_DW_UT(0x02, type)
+HANDLE_DW_UT(0x03, partial)
+HANDLE_DW_UT(0x04, skeleton)
+HANDLE_DW_UT(0x05, split_compile)
+HANDLE_DW_UT(0x06, split_type)
 
 #undef HANDLE_DW_TAG
 #undef HANDLE_DW_AT
@@ -809,3 +835,4 @@ HANDLE_DW_APPLE_PROPERTY(0x4000, class)
 #undef HANDLE_DW_RLE
 #undef HANDLE_DW_CFA
 #undef HANDLE_DW_APPLE_PROPERTY
+#undef HANDLE_DW_UT
diff --git a/include/llvm/Support/Dwarf.h b/include/llvm/Support/Dwarf.h
index 8336b9df9df0d890bf302712de51b93992658a82..84056682924ebb596cd6f72ffe7fb27a46c0743d 100644
--- a/include/llvm/Support/Dwarf.h
+++ b/include/llvm/Support/Dwarf.h
@@ -29,7 +29,7 @@ class StringRef;
 namespace dwarf {
 
 //===----------------------------------------------------------------------===//
-// Dwarf constants as gleaned from the DWARF Debugging Information Format V.4
+// DWARF constants as gleaned from the DWARF Debugging Information Format V.5
 // reference manual http://www.dwarfstd.org/.
 //
 
@@ -305,7 +305,15 @@ enum ApplePropertyAttributes {
 #include "llvm/Support/Dwarf.def"
 };
 
-// Constants for the DWARF5 Accelerator Table Proposal
+/// Constants for unit types in DWARF v5.
+enum UnitType : unsigned char {
+#define HANDLE_DW_UT(ID, NAME) DW_UT_##NAME = ID,
+#include "llvm/Support/Dwarf.def"
+  DW_UT_lo_user = 0x80,
+  DW_UT_hi_user = 0xff
+};
+
+// Constants for the DWARF v5 Accelerator Table Proposal
 enum AcceleratorTable {
   // Data layout descriptors.
   DW_ATOM_null = 0u,       // Marker as the end of a list of atoms.
@@ -373,6 +381,7 @@ StringRef LNExtendedString(unsigned Encoding);
 StringRef MacinfoString(unsigned Encoding);
 StringRef CallFrameString(unsigned Encoding);
 StringRef ApplePropertyString(unsigned);
+StringRef UnitTypeString(unsigned);
 StringRef AtomTypeString(unsigned Atom);
 StringRef GDBIndexEntryKindString(GDBIndexEntryKind Kind);
 StringRef GDBIndexEntryLinkageString(GDBIndexEntryLinkage Linkage);
diff --git a/include/llvm/Support/DynamicLibrary.h b/include/llvm/Support/DynamicLibrary.h
index a7d22212dbdb5e39e295048136fb530853ccba12..aa9bb8938ad3bf2f9359a1e5b123b4bd6dfe556e 100644
--- a/include/llvm/Support/DynamicLibrary.h
+++ b/include/llvm/Support/DynamicLibrary.h
@@ -68,6 +68,15 @@ namespace sys {
     static DynamicLibrary getPermanentLibrary(const char *filename,
                                               std::string *errMsg = nullptr);
 
+    /// Registers an externally loaded library. The library will be unloaded
+    /// when the program terminates.
+    ///
+    /// It is safe to call this function multiple times for the same library.
+    ///
+    /// \returns An empty \p DynamicLibrary if the library was already loaded.
+    static DynamicLibrary addPermanentLibrary(void *handle,
+                                              std::string *errMsg = nullptr);
+
     /// This function permanently loads the dynamic library at the given path.
     /// Use this instead of getPermanentLibrary() when you won't need to get
     /// symbols from the library itself.
diff --git a/include/llvm/Support/ELF.h b/include/llvm/Support/ELF.h
index 9bbec86a45c6423defee21672cc542a136d59d19..33f20a809d6ca11dd7c219971dbdd539627c315d 100644
--- a/include/llvm/Support/ELF.h
+++ b/include/llvm/Support/ELF.h
@@ -556,6 +556,7 @@ enum {
   EF_HEXAGON_MACH_V5 = 0x00000004,  // Hexagon V5
   EF_HEXAGON_MACH_V55 = 0x00000005, // Hexagon V55
   EF_HEXAGON_MACH_V60 = 0x00000060, // Hexagon V60
+  EF_HEXAGON_MACH_V62 = 0x00000062, // Hexagon V62
 
   // Highest ISA version flags
   EF_HEXAGON_ISA_MACH = 0x00000000, // Same as specified in bits[11:0]
@@ -566,6 +567,7 @@ enum {
   EF_HEXAGON_ISA_V5 = 0x00000040,   // Hexagon V5 ISA
   EF_HEXAGON_ISA_V55 = 0x00000050,  // Hexagon V55 ISA
   EF_HEXAGON_ISA_V60 = 0x00000060,  // Hexagon V60 ISA
+  EF_HEXAGON_ISA_V62 = 0x00000062,  // Hexagon V62 ISA
 };
 
 // Hexagon-specific section indexes for common small data
@@ -703,6 +705,7 @@ enum : unsigned {
 
   SHT_MIPS_REGINFO = 0x70000006,  // Register usage information
   SHT_MIPS_OPTIONS = 0x7000000d,  // General options
+  SHT_MIPS_DWARF = 0x7000001e,    // DWARF debugging section.
   SHT_MIPS_ABIFLAGS = 0x7000002a, // ABI information.
 
   SHT_HIPROC = 0x7fffffff, // Highest processor arch-specific type.
@@ -1311,6 +1314,19 @@ enum {
 enum { VER_NEED_NONE = 0, VER_NEED_CURRENT = 1 };
 
 // SHT_NOTE section types
+enum {
+  NT_FREEBSD_THRMISC = 7,
+  NT_FREEBSD_PROCSTAT_PROC = 8,
+  NT_FREEBSD_PROCSTAT_FILES = 9,
+  NT_FREEBSD_PROCSTAT_VMMAP = 10,
+  NT_FREEBSD_PROCSTAT_GROUPS = 11,
+  NT_FREEBSD_PROCSTAT_UMASK = 12,
+  NT_FREEBSD_PROCSTAT_RLIMIT = 13,
+  NT_FREEBSD_PROCSTAT_OSREL = 14,
+  NT_FREEBSD_PROCSTAT_PSSTRINGS = 15,
+  NT_FREEBSD_PROCSTAT_AUXV = 16,
+};
+
 enum {
   NT_GNU_ABI_TAG = 1,
   NT_GNU_HWCAP = 2,
diff --git a/include/llvm/Support/Endian.h b/include/llvm/Support/Endian.h
index cbe3d67b1f9e6fad1a9cbd5aad3d4cb9f79f5db1..06e089ffa166f6efd03ee0ae473cbead75464aa4 100644
--- a/include/llvm/Support/Endian.h
+++ b/include/llvm/Support/Endian.h
@@ -17,6 +17,8 @@
 #include "llvm/Support/Host.h"
 #include "llvm/Support/SwapByteOrder.h"
 
+#include <stdint.h>
+
 namespace llvm {
 namespace support {
 enum endianness {big, little, native};
@@ -33,48 +35,71 @@ namespace detail {
 } // end namespace detail
 
 namespace endian {
+constexpr endianness system_endianness() {
+  return sys::IsBigEndianHost ? big : little;
+}
+
+template <typename value_type>
+inline value_type byte_swap(value_type value, endianness endian) {
+  if ((endian != native) && (endian != system_endianness()))
+    sys::swapByteOrder(value);
+  return value;
+}
+
 /// Swap the bytes of value to match the given endianness.
 template<typename value_type, endianness endian>
 inline value_type byte_swap(value_type value) {
-  if (endian != native && sys::IsBigEndianHost != (endian == big))
-    sys::swapByteOrder(value);
-  return value;
+  return byte_swap(value, endian);
 }
 
 /// Read a value of a particular endianness from memory.
-template<typename value_type,
-         endianness endian,
-         std::size_t alignment>
-inline value_type read(const void *memory) {
+template <typename value_type, std::size_t alignment>
+inline value_type read(const void *memory, endianness endian) {
   value_type ret;
 
   memcpy(&ret,
-         LLVM_ASSUME_ALIGNED(memory,
-           (detail::PickAlignment<value_type, alignment>::value)),
+         LLVM_ASSUME_ALIGNED(
+             memory, (detail::PickAlignment<value_type, alignment>::value)),
          sizeof(value_type));
-  return byte_swap<value_type, endian>(ret);
+  return byte_swap<value_type>(ret, endian);
+}
+
+template<typename value_type,
+         endianness endian,
+         std::size_t alignment>
+inline value_type read(const void *memory) {
+  return read<value_type, alignment>(memory, endian);
 }
 
 /// Read a value of a particular endianness from a buffer, and increment the
 /// buffer past that value.
+template <typename value_type, std::size_t alignment, typename CharT>
+inline value_type readNext(const CharT *&memory, endianness endian) {
+  value_type ret = read<value_type, alignment>(memory, endian);
+  memory += sizeof(value_type);
+  return ret;
+}
+
 template<typename value_type, endianness endian, std::size_t alignment,
          typename CharT>
 inline value_type readNext(const CharT *&memory) {
-  value_type ret = read<value_type, endian, alignment>(memory);
-  memory += sizeof(value_type);
-  return ret;
+  return readNext<value_type, alignment, CharT>(memory, endian);
 }
 
 /// Write a value to memory with a particular endianness.
+template <typename value_type, std::size_t alignment>
+inline void write(void *memory, value_type value, endianness endian) {
+  value = byte_swap<value_type>(value, endian);
+  memcpy(LLVM_ASSUME_ALIGNED(
+             memory, (detail::PickAlignment<value_type, alignment>::value)),
+         &value, sizeof(value_type));
+}
+
 template<typename value_type,
          endianness endian,
          std::size_t alignment>
 inline void write(void *memory, value_type value) {
-  value = byte_swap<value_type, endian>(value);
-  memcpy(LLVM_ASSUME_ALIGNED(memory,
-           (detail::PickAlignment<value_type, alignment>::value)),
-         &value,
-         sizeof(value_type));
+  write<value_type, alignment>(memory, value, endian);
 }
 
 template <typename value_type>
@@ -300,10 +325,24 @@ typedef detail::packed_endian_specific_integral
                    <int64_t, native, unaligned> unaligned_int64_t;
 
 namespace endian {
+template <typename T> inline T read(const void *P, endianness E) {
+  return read<T, unaligned>(P, E);
+}
+
 template <typename T, endianness E> inline T read(const void *P) {
   return *(const detail::packed_endian_specific_integral<T, E, unaligned> *)P;
 }
 
+inline uint16_t read16(const void *P, endianness E) {
+  return read<uint16_t>(P, E);
+}
+inline uint32_t read32(const void *P, endianness E) {
+  return read<uint32_t>(P, E);
+}
+inline uint64_t read64(const void *P, endianness E) {
+  return read<uint64_t>(P, E);
+}
+
 template <endianness E> inline uint16_t read16(const void *P) {
   return read<uint16_t, E>(P);
 }
@@ -321,10 +360,24 @@ inline uint16_t read16be(const void *P) { return read16<big>(P); }
 inline uint32_t read32be(const void *P) { return read32<big>(P); }
 inline uint64_t read64be(const void *P) { return read64<big>(P); }
 
+template <typename T> inline void write(void *P, T V, endianness E) {
+  write<T, unaligned>(P, V, E);
+}
+
 template <typename T, endianness E> inline void write(void *P, T V) {
   *(detail::packed_endian_specific_integral<T, E, unaligned> *)P = V;
 }
 
+inline void write16(void *P, uint16_t V, endianness E) {
+  write<uint16_t>(P, V, E);
+}
+inline void write32(void *P, uint32_t V, endianness E) {
+  write<uint32_t>(P, V, E);
+}
+inline void write64(void *P, uint64_t V, endianness E) {
+  write<uint64_t>(P, V, E);
+}
+
 template <endianness E> inline void write16(void *P, uint16_t V) {
   write<uint16_t, E>(P, V);
 }
diff --git a/include/llvm/Support/Error.h b/include/llvm/Support/Error.h
index f13c9484b5fd5d475d6ec5d72f08ae87d9ad2dbd..21664d4b71557cf6c5319565c0d7da5db77d90d5 100644
--- a/include/llvm/Support/Error.h
+++ b/include/llvm/Support/Error.h
@@ -985,6 +985,45 @@ private:
 LLVM_ATTRIBUTE_NORETURN void report_fatal_error(Error Err,
                                                 bool gen_crash_diag = true);
 
+/// Report a fatal error if Err is a failure value.
+///
+/// This function can be used to wrap calls to fallible functions ONLY when it
+/// is known that the Error will always be a success value. E.g.
+///
+///   @code{.cpp}
+///   // foo only attempts the fallible operation if DoFallibleOperation is
+///   // true. If DoFallibleOperation is false then foo always returns
+///   // Error::success().
+///   Error foo(bool DoFallibleOperation);
+///
+///   cantFail(foo(false));
+///   @endcode
+inline void cantFail(Error Err) {
+  if (Err)
+    llvm_unreachable("Failure value returned from cantFail wrapped call");
+}
+
+/// Report a fatal error if ValOrErr is a failure value, otherwise unwraps and
+/// returns the contained value.
+///
+/// This function can be used to wrap calls to fallible functions ONLY when it
+/// is known that the Error will always be a success value. E.g.
+///
+///   @code{.cpp}
+///   // foo only attempts the fallible operation if DoFallibleOperation is
+///   // true. If DoFallibleOperation is false then foo always returns an int.
+///   Expected<int> foo(bool DoFallibleOperation);
+///
+///   int X = cantFail(foo(false));
+///   @endcode
+template <typename T>
+T cantFail(Expected<T> ValOrErr) {
+  if (ValOrErr)
+    return std::move(*ValOrErr);
+  else
+    llvm_unreachable("Failure value returned from cantFail wrapped call");
+}
+
 } // end namespace llvm
 
 #endif // LLVM_SUPPORT_ERROR_H
diff --git a/include/llvm/Support/FileSystem.h b/include/llvm/Support/FileSystem.h
index a0ddcf3dddba5afe2b8188181d14727d24349178..29515c231bc46f989de37a7a1bdece459c12acb8 100644
--- a/include/llvm/Support/FileSystem.h
+++ b/include/llvm/Support/FileSystem.h
@@ -33,6 +33,7 @@
 #include "llvm/Support/Chrono.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/MD5.h"
 #include <cassert>
 #include <cstdint>
 #include <ctime>
@@ -93,6 +94,7 @@ enum perms {
   set_uid_on_exe = 04000,
   set_gid_on_exe = 02000,
   sticky_bit = 01000,
+  all_perms = all_all | set_uid_on_exe | set_gid_on_exe | sticky_bit,
   perms_not_known = 0xFFFF
 };
 
@@ -141,70 +143,61 @@ public:
 ///               a platform-specific member to store the result.
 class file_status
 {
+  friend bool equivalent(file_status A, file_status B);
+
   #if defined(LLVM_ON_UNIX)
-  dev_t fs_st_dev;
-  ino_t fs_st_ino;
-  time_t fs_st_atime;
-  time_t fs_st_mtime;
-  uid_t fs_st_uid;
-  gid_t fs_st_gid;
-  off_t fs_st_size;
+  dev_t fs_st_dev = 0;
+  nlink_t fs_st_nlinks = 0;
+  ino_t fs_st_ino = 0;
+  time_t fs_st_atime = 0;
+  time_t fs_st_mtime = 0;
+  uid_t fs_st_uid = 0;
+  gid_t fs_st_gid = 0;
+  off_t fs_st_size = 0;
   #elif defined (LLVM_ON_WIN32)
-  uint32_t LastAccessedTimeHigh;
-  uint32_t LastAccessedTimeLow;
-  uint32_t LastWriteTimeHigh;
-  uint32_t LastWriteTimeLow;
-  uint32_t VolumeSerialNumber;
-  uint32_t FileSizeHigh;
-  uint32_t FileSizeLow;
-  uint32_t FileIndexHigh;
-  uint32_t FileIndexLow;
+  uint32_t NumLinks = 0;
+  uint32_t LastAccessedTimeHigh = 0;
+  uint32_t LastAccessedTimeLow = 0;
+  uint32_t LastWriteTimeHigh = 0;
+  uint32_t LastWriteTimeLow = 0;
+  uint32_t VolumeSerialNumber = 0;
+  uint32_t FileSizeHigh = 0;
+  uint32_t FileSizeLow = 0;
+  uint32_t FileIndexHigh = 0;
+  uint32_t FileIndexLow = 0;
   #endif
-  friend bool equivalent(file_status A, file_status B);
-  file_type Type;
-  perms Perms;
+  file_type Type = file_type::status_error;
+  perms Perms = perms_not_known;
 
 public:
   #if defined(LLVM_ON_UNIX)
-  file_status()
-      : fs_st_dev(0), fs_st_ino(0), fs_st_atime(0), fs_st_mtime(0),
-        fs_st_uid(0), fs_st_gid(0), fs_st_size(0),
-        Type(file_type::status_error), Perms(perms_not_known) {}
-
-  file_status(file_type Type)
-      : fs_st_dev(0), fs_st_ino(0), fs_st_atime(0), fs_st_mtime(0),
-        fs_st_uid(0), fs_st_gid(0), fs_st_size(0), Type(Type),
-        Perms(perms_not_known) {}
-
-  file_status(file_type Type, perms Perms, dev_t Dev, ino_t Ino, time_t ATime,
-              time_t MTime, uid_t UID, gid_t GID, off_t Size)
-      : fs_st_dev(Dev), fs_st_ino(Ino), fs_st_atime(ATime), fs_st_mtime(MTime),
-        fs_st_uid(UID), fs_st_gid(GID), fs_st_size(Size), Type(Type),
-        Perms(Perms) {}
+  file_status() = default;
+
+  file_status(file_type Type) : Type(Type) {}
+
+  file_status(file_type Type, perms Perms, dev_t Dev, nlink_t Links, ino_t Ino,
+              time_t ATime, time_t MTime, uid_t UID, gid_t GID, off_t Size)
+      : fs_st_dev(Dev), fs_st_nlinks(Links), fs_st_ino(Ino), fs_st_atime(ATime),
+        fs_st_mtime(MTime), fs_st_uid(UID), fs_st_gid(GID), fs_st_size(Size),
+        Type(Type), Perms(Perms) {}
   #elif defined(LLVM_ON_WIN32)
-  file_status()
-      : LastAccessedTimeHigh(0), LastAccessedTimeLow(0), LastWriteTimeHigh(0),
-        LastWriteTimeLow(0), VolumeSerialNumber(0), FileSizeHigh(0),
-        FileSizeLow(0), FileIndexHigh(0), FileIndexLow(0),
-        Type(file_type::status_error), Perms(perms_not_known) {}
-
-  file_status(file_type Type)
-      : LastAccessedTimeHigh(0), LastAccessedTimeLow(0), LastWriteTimeHigh(0),
-        LastWriteTimeLow(0), VolumeSerialNumber(0), FileSizeHigh(0),
-        FileSizeLow(0), FileIndexHigh(0), FileIndexLow(0), Type(Type),
-        Perms(perms_not_known) {}
-
-  file_status(file_type Type, uint32_t LastAccessTimeHigh,
-              uint32_t LastAccessTimeLow, uint32_t LastWriteTimeHigh,
-              uint32_t LastWriteTimeLow, uint32_t VolumeSerialNumber,
-              uint32_t FileSizeHigh, uint32_t FileSizeLow,
-              uint32_t FileIndexHigh, uint32_t FileIndexLow)
-      : LastAccessedTimeHigh(LastAccessTimeHigh), LastAccessedTimeLow(LastAccessTimeLow),
+  file_status() = default;
+
+  file_status(file_type Type) : Type(Type) {}
+
+  file_status(file_type Type, perms Perms, uint32_t LinkCount,
+              uint32_t LastAccessTimeHigh, uint32_t LastAccessTimeLow,
+              uint32_t LastWriteTimeHigh, uint32_t LastWriteTimeLow,
+              uint32_t VolumeSerialNumber, uint32_t FileSizeHigh,
+              uint32_t FileSizeLow, uint32_t FileIndexHigh,
+              uint32_t FileIndexLow)
+      : NumLinks(LinkCount), LastAccessedTimeHigh(LastAccessTimeHigh),
+        LastAccessedTimeLow(LastAccessTimeLow),
         LastWriteTimeHigh(LastWriteTimeHigh),
         LastWriteTimeLow(LastWriteTimeLow),
         VolumeSerialNumber(VolumeSerialNumber), FileSizeHigh(FileSizeHigh),
         FileSizeLow(FileSizeLow), FileIndexHigh(FileIndexHigh),
-        FileIndexLow(FileIndexLow), Type(Type), Perms(perms_not_known) {}
+        FileIndexLow(FileIndexLow), Type(Type), Perms(Perms) {}
   #endif
 
   // getters
@@ -213,6 +206,7 @@ public:
   TimePoint<> getLastAccessedTime() const;
   TimePoint<> getLastModificationTime() const;
   UniqueID getUniqueID() const;
+  uint32_t getLinkCount() const;
 
   #if defined(LLVM_ON_UNIX)
   uint32_t getUser() const { return fs_st_uid; }
@@ -222,9 +216,11 @@ public:
   uint32_t getUser() const {
     return 9999; // Not applicable to Windows, so...
   }
+
   uint32_t getGroup() const {
     return 9999; // Not applicable to Windows, so...
   }
+
   uint64_t getSize() const {
     return (uint64_t(FileSizeHigh) << 32) + FileSizeLow;
   }
@@ -271,12 +267,12 @@ struct file_magic {
     return V != unknown;
   }
 
-  file_magic() : V(unknown) {}
+  file_magic() = default;
   file_magic(Impl V) : V(V) {}
   operator Impl() const { return V; }
 
 private:
-  Impl V;
+  Impl V = unknown;
 };
 
 /// @}
@@ -350,6 +346,16 @@ std::error_code create_link(const Twine &to, const Twine &from);
 /// specific error_code.
 std::error_code create_hard_link(const Twine &to, const Twine &from);
 
+/// @brief Collapse all . and .. patterns, resolve all symlinks, and optionally
+///        expand ~ expressions to the user's home directory.
+///
+/// @param path The path to resolve.
+/// @param output The location to store the resolved path.
+/// @param expand_tilde If true, resolves ~ expressions to the user's home
+///                     directory.
+std::error_code real_path(const Twine &path, SmallVectorImpl<char> &output,
+                          bool expand_tilde = false);
+
 /// @brief Get the current path.
 ///
 /// @param result Holds the current path on return.
@@ -372,6 +378,13 @@ std::error_code set_current_path(const Twine &path);
 ///          returns error if the file didn't exist.
 std::error_code remove(const Twine &path, bool IgnoreNonExisting = true);
 
+/// @brief Recursively delete a directory.
+///
+/// @param path Input path.
+/// @returns errc::success if path has been removed or didn't exist, otherwise a
+///          platform-specific error code.
+std::error_code remove_directories(const Twine &path, bool IgnoreErrors = true);
+
 /// @brief Rename \a from to \a to. Files are renamed as if by POSIX rename().
 ///
 /// @param from The path to rename from.
@@ -392,6 +405,16 @@ std::error_code copy_file(const Twine &From, const Twine &To);
 ///          platform-specific error_code.
 std::error_code resize_file(int FD, uint64_t Size);
 
+/// @brief Compute an MD5 hash of a file's contents.
+///
+/// @param FD Input file descriptor.
+/// @returns An MD5Result with the hash computed, if successful, otherwise a
+///          std::error_code.
+ErrorOr<MD5::MD5Result> md5_contents(int FD);
+
+/// @brief Version of md5_contents that doesn't require an open file descriptor.
+ErrorOr<MD5::MD5Result> md5_contents(const Twine &Path);
+
 /// @}
 /// @name Physical Observers
 /// @{
@@ -464,6 +487,40 @@ inline bool equivalent(const Twine &A, const Twine &B) {
   return !equivalent(A, B, result) && result;
 }
 
+/// @brief Is the file mounted on a local filesystem?
+///
+/// @param path Input path.
+/// @param result Set to true if \a path is on fixed media such as a hard disk,
+///               false if it is not.
+/// @returns errc::success if result has been successfully set, otherwise a
+///          platform specific error_code.
+std::error_code is_local(const Twine &path, bool &result);
+
+/// @brief Version of is_local accepting an open file descriptor.
+std::error_code is_local(int FD, bool &result);
+
+/// @brief Simpler version of is_local for clients that don't need to
+///        differentiate between an error and false.
+inline bool is_local(const Twine &Path) {
+  bool Result;
+  return !is_local(Path, Result) && Result;
+}
+
+/// @brief Simpler version of is_local accepting an open file descriptor for
+///        clients that don't need to differentiate between an error and false.
+inline bool is_local(int FD) {
+  bool Result;
+  return !is_local(FD, Result) && Result;
+}
+
+/// @brief Get file type.
+///
+/// @param Path The path to get the type of.
+/// @param Follow For symbolic links, indicates whether to return the file type
+///               of the link itself, or of the target.
+/// @returns A value from the file_type enumeration indicating the type of file.
+file_type get_file_type(const Twine &Path, bool Follow = true);
+
 /// @brief Does status represent a directory?
 ///
 /// @param status A file_status previously returned from status.
@@ -473,8 +530,8 @@ bool is_directory(file_status status);
 /// @brief Is path a directory?
 ///
 /// @param path Input path.
-/// @param result Set to true if \a path is a directory, false if it is not.
-///               Undefined otherwise.
+/// @param result Set to true if \a path is a directory (after following
+///               symlinks), false if it is not. Undefined otherwise.
 /// @returns errc::success if result has been successfully set, otherwise a
 ///          platform-specific error_code.
 std::error_code is_directory(const Twine &path, bool &result);
@@ -495,8 +552,8 @@ bool is_regular_file(file_status status);
 /// @brief Is path a regular file?
 ///
 /// @param path Input path.
-/// @param result Set to true if \a path is a regular file, false if it is not.
-///               Undefined otherwise.
+/// @param result Set to true if \a path is a regular file (after following
+///               symlinks), false if it is not. Undefined otherwise.
 /// @returns errc::success if result has been successfully set, otherwise a
 ///          platform-specific error_code.
 std::error_code is_regular_file(const Twine &path, bool &result);
@@ -510,8 +567,32 @@ inline bool is_regular_file(const Twine &Path) {
   return Result;
 }
 
+/// @brief Does status represent a symlink file?
+///
+/// @param status A file_status previously returned from status.
+/// @returns status_known(status) && status.type() == file_type::symlink_file.
+bool is_symlink_file(file_status status);
+
+/// @brief Is path a symlink file?
+///
+/// @param path Input path.
+/// @param result Set to true if \a path is a symlink file, false if it is not.
+///               Undefined otherwise.
+/// @returns errc::success if result has been successfully set, otherwise a
+///          platform-specific error_code.
+std::error_code is_symlink_file(const Twine &path, bool &result);
+
+/// @brief Simpler version of is_symlink_file for clients that don't need to
+///        differentiate between an error and false.
+inline bool is_symlink_file(const Twine &Path) {
+  bool Result;
+  if (is_symlink_file(Path, Result))
+    return false;
+  return Result;
+}
+
 /// @brief Does this status represent something that exists but is not a
-///        directory, regular file, or symlink?
+///        directory or regular file?
 ///
 /// @param status A file_status previously returned from status.
 /// @returns exists(s) && !is_regular_file(s) && !is_directory(s)
@@ -531,13 +612,37 @@ std::error_code is_other(const Twine &path, bool &result);
 ///
 /// @param path Input path.
 /// @param result Set to the file status.
+/// @param follow When true, follows symlinks.  Otherwise, the symlink itself is
+///               statted.
 /// @returns errc::success if result has been successfully set, otherwise a
 ///          platform-specific error_code.
-std::error_code status(const Twine &path, file_status &result);
+std::error_code status(const Twine &path, file_status &result,
+                       bool follow = true);
 
 /// @brief A version for when a file descriptor is already available.
 std::error_code status(int FD, file_status &Result);
 
+/// @brief Set file permissions.
+///
+/// @param Path File to set permissions on.
+/// @param Permissions New file permissions.
+/// @returns errc::success if the permissions were successfully set, otherwise
+///          a platform-specific error_code.
+/// @note On Windows, all permissions except *_write are ignored. Using any of
+///       owner_write, group_write, or all_write will make the file writable.
+///       Otherwise, the file will be marked as read-only.
+std::error_code setPermissions(const Twine &Path, perms Permissions);
+
+/// @brief Get file permissions.
+///
+/// @param Path File to get permissions from.
+/// @returns the permissions if they were successfully retrieved, otherwise a
+///          platform-specific error_code.
+/// @note On Windows, if the file does not have the FILE_ATTRIBUTE_READONLY
+///       attribute, all_all will be returned. Otherwise, all_read | all_exe
+///       will be returned.
+ErrorOr<perms> getPermissions(const Twine &Path);
+
 /// @brief Get file size.
 ///
 /// @param Path Input path.
@@ -742,12 +847,13 @@ std::string getMainExecutable(const char *argv0, void *MainExecAddr);
 /// called.
 class directory_entry {
   std::string Path;
+  bool FollowSymlinks = true; // Same default as the explicit constructor.
   mutable file_status Status;
 
 public:
-  explicit directory_entry(const Twine &path, file_status st = file_status())
-    : Path(path.str())
-    , Status(st) {}
+  explicit directory_entry(const Twine &path, bool follow_symlinks = true,
+                           file_status st = file_status())
+      : Path(path.str()), FollowSymlinks(follow_symlinks), Status(st) {}
 
   directory_entry() = default;
 
@@ -770,9 +876,10 @@ public:
 };
 
 namespace detail {
+
   struct DirIterState;
 
-  std::error_code directory_iterator_construct(DirIterState &, StringRef);
+  std::error_code directory_iterator_construct(DirIterState &, StringRef, bool);
   std::error_code directory_iterator_increment(DirIterState &);
   std::error_code directory_iterator_destruct(DirIterState &);
 
@@ -785,6 +892,7 @@ namespace detail {
     intptr_t IterationHandle = 0;
     directory_entry CurrentEntry;
   };
+
 } // end namespace detail
 
 /// directory_iterator - Iterates through the entries in path. There is no
@@ -792,18 +900,24 @@ namespace detail {
 /// it call report_fatal_error on error.
 class directory_iterator {
   std::shared_ptr<detail::DirIterState> State;
+  bool FollowSymlinks = true;
 
 public:
-  explicit directory_iterator(const Twine &path, std::error_code &ec) {
+  explicit directory_iterator(const Twine &path, std::error_code &ec,
+                              bool follow_symlinks = true)
+      : FollowSymlinks(follow_symlinks) {
     State = std::make_shared<detail::DirIterState>();
     SmallString<128> path_storage;
-    ec = detail::directory_iterator_construct(*State,
-            path.toStringRef(path_storage));
+    ec = detail::directory_iterator_construct(
+        *State, path.toStringRef(path_storage), FollowSymlinks);
   }
 
-  explicit directory_iterator(const directory_entry &de, std::error_code &ec) {
+  explicit directory_iterator(const directory_entry &de, std::error_code &ec,
+                              bool follow_symlinks = true)
+      : FollowSymlinks(follow_symlinks) {
     State = std::make_shared<detail::DirIterState>();
-    ec = detail::directory_iterator_construct(*State, de.path());
+    ec =
+        detail::directory_iterator_construct(*State, de.path(), FollowSymlinks);
   }
 
   /// Construct end iterator.
@@ -836,24 +950,29 @@ public:
 };
 
 namespace detail {
+
   /// Keeps state for the recursive_directory_iterator.
   struct RecDirIterState {
     std::stack<directory_iterator, std::vector<directory_iterator>> Stack;
     uint16_t Level = 0;
     bool HasNoPushRequest = false;
   };
+
 } // end namespace detail
 
 /// recursive_directory_iterator - Same as directory_iterator except for it
 /// recurses down into child directories.
 class recursive_directory_iterator {
   std::shared_ptr<detail::RecDirIterState> State;
+  bool Follow;
 
 public:
   recursive_directory_iterator() = default;
-  explicit recursive_directory_iterator(const Twine &path, std::error_code &ec)
-      : State(std::make_shared<detail::RecDirIterState>()) {
-    State->Stack.push(directory_iterator(path, ec));
+  explicit recursive_directory_iterator(const Twine &path, std::error_code &ec,
+                                        bool follow_symlinks = true)
+      : State(std::make_shared<detail::RecDirIterState>()),
+        Follow(follow_symlinks) {
+    State->Stack.push(directory_iterator(path, ec, Follow));
     if (State->Stack.top() == directory_iterator())
       State.reset();
   }
@@ -868,7 +987,7 @@ public:
       file_status st;
       if ((ec = State->Stack.top()->status(st))) return *this;
       if (is_directory(st)) {
-        State->Stack.push(directory_iterator(*State->Stack.top(), ec));
+        State->Stack.push(directory_iterator(*State->Stack.top(), ec, Follow));
         if (ec) return *this;
         if (State->Stack.top() != end_itr) {
           ++State->Level;
diff --git a/include/llvm/Support/FormatAdapters.h b/include/llvm/Support/FormatAdapters.h
index 7bacd2e17135c1f119e21c0d89f7a97ac55bfa7c..698e134b328deaba09e1fc791467f786b602bd27 100644
--- a/include/llvm/Support/FormatAdapters.h
+++ b/include/llvm/Support/FormatAdapters.h
@@ -22,9 +22,6 @@ protected:
   explicit FormatAdapter(T &&Item) : Item(Item) {}
 
   T Item;
-
-  static_assert(!detail::uses_missing_provider<T>::value,
-                "Item does not have a format provider!");
 };
 
 namespace detail {
diff --git a/include/llvm/Support/FormatProviders.h b/include/llvm/Support/FormatProviders.h
index 1f0768c3ab08b38ad4569d6031c9dced82adaa04..4e57034ff98e9336478548cfba0fb8ed2a46fe44 100644
--- a/include/llvm/Support/FormatProviders.h
+++ b/include/llvm/Support/FormatProviders.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/Support/FormatVariadicDetails.h"
 #include "llvm/Support/NativeFormatting.h"
 
@@ -45,9 +46,8 @@ struct is_cstring
 
 template <typename T>
 struct use_string_formatter
-    : public std::integral_constant<
-          bool, is_one_of<T, llvm::StringRef, std::string>::value ||
-                    is_cstring<T>::value> {};
+    : public std::integral_constant<bool,
+                                    std::is_convertible<T, llvm::StringRef>::value> {};
 
 template <typename T>
 struct use_pointer_formatter
@@ -205,11 +205,22 @@ struct format_provider<
     if (!Style.empty() && Style.getAsInteger(10, N)) {
       assert(false && "Style is not a valid integer");
     }
-    llvm::StringRef S(V);
+    llvm::StringRef S = V;
     Stream << S.substr(0, N);
   }
 };
 
+/// Implementation of format_provider<T> for llvm::Twine.
+///
+/// This follows the same rules as the string formatter.
+
+template <> struct format_provider<Twine> {
+  static void format(const Twine &V, llvm::raw_ostream &Stream,
+                     StringRef Style) {
+    format_provider<std::string>::format(V.str(), Stream, Style);
+  }
+};
+
 /// Implementation of format_provider<T> for characters.
 ///
 /// The options string of a character type has the grammar:
@@ -359,8 +370,7 @@ template <typename IterT> class format_provider<llvm::iterator_range<IterT>> {
       return Default;
     }
 
-    std::vector<const char *> Delims = {"[]", "<>", "()"};
-    for (const char *D : Delims) {
+    for (const char *D : {"[]", "<>", "()"}) {
       if (Style.front() != D[0])
         continue;
       size_t End = Style.find_first_of(D[1]);
diff --git a/include/llvm/Support/FormatVariadic.h b/include/llvm/Support/FormatVariadic.h
index e5f5c9615cb6b2d96dc6612b63a8e1e716987380..3a4668687cc94b47cc9e5d2019faf22cdc027792 100644
--- a/include/llvm/Support/FormatVariadic.h
+++ b/include/llvm/Support/FormatVariadic.h
@@ -196,7 +196,7 @@ public:
 // "}}" to print a literal '}'.
 //
 // ===Parameter Indexing===
-// `index` specifies the index of the paramter in the parameter pack to format
+// `index` specifies the index of the parameter in the parameter pack to format
 // into the output.  Note that it is possible to refer to the same parameter
 // index multiple times in a given format string.  This makes it possible to
 // output the same value multiple times without passing it multiple times to the
diff --git a/include/llvm/Support/GCOV.h b/include/llvm/Support/GCOV.h
index 9a0fb38bd82838506bc6349f150e6096d87bdd10..73fddca8e35bbbccf5f3718bf0bfbf7e3e47af86 100644
--- a/include/llvm/Support/GCOV.h
+++ b/include/llvm/Support/GCOV.h
@@ -63,7 +63,7 @@ struct Options {
 /// read operations.
 class GCOVBuffer {
 public:
-  GCOVBuffer(MemoryBuffer *B) : Buffer(B), Cursor(0) {}
+  GCOVBuffer(MemoryBuffer *B) : Buffer(B) {}
 
   /// readGCNOFormat - Check GCNO signature is valid at the beginning of buffer.
   bool readGCNOFormat() {
@@ -234,16 +234,14 @@ public:
 
 private:
   MemoryBuffer *Buffer;
-  uint64_t Cursor;
+  uint64_t Cursor = 0;
 };
 
 /// GCOVFile - Collects coverage information for one pair of coverage file
 /// (.gcno and .gcda).
 class GCOVFile {
 public:
-  GCOVFile()
-      : GCNOInitialized(false), Checksum(0), RunCount(0),
-        ProgramCount(0) {}
+  GCOVFile() = default;
 
   bool readGCNO(GCOVBuffer &Buffer);
   bool readGCDA(GCOVBuffer &Buffer);
@@ -253,21 +251,21 @@ public:
   void collectLineCounts(FileInfo &FI);
 
 private:
-  bool GCNOInitialized;
+  bool GCNOInitialized = false;
   GCOV::GCOVVersion Version;
-  uint32_t Checksum;
+  uint32_t Checksum = 0;
   SmallVector<std::unique_ptr<GCOVFunction>, 16> Functions;
-  uint32_t RunCount;
-  uint32_t ProgramCount;
+  uint32_t RunCount = 0;
+  uint32_t ProgramCount = 0;
 };
 
 /// GCOVEdge - Collects edge information.
 struct GCOVEdge {
-  GCOVEdge(GCOVBlock &S, GCOVBlock &D) : Src(S), Dst(D), Count(0) {}
+  GCOVEdge(GCOVBlock &S, GCOVBlock &D) : Src(S), Dst(D) {}
 
   GCOVBlock &Src;
   GCOVBlock &Dst;
-  uint64_t Count;
+  uint64_t Count = 0;
 };
 
 /// GCOVFunction - Collects function information.
@@ -276,7 +274,8 @@ public:
   typedef pointee_iterator<SmallVectorImpl<
       std::unique_ptr<GCOVBlock>>::const_iterator> BlockIterator;
 
-  GCOVFunction(GCOVFile &P) : Parent(P), Ident(0), LineNumber(0) {}
+  GCOVFunction(GCOVFile &P) : Parent(P) {}
+
   bool readGCNO(GCOVBuffer &Buffer, GCOV::GCOVVersion Version);
   bool readGCDA(GCOVBuffer &Buffer, GCOV::GCOVVersion Version);
   StringRef getName() const { return Name; }
@@ -297,9 +296,9 @@ public:
 
 private:
   GCOVFile &Parent;
-  uint32_t Ident;
+  uint32_t Ident = 0;
   uint32_t Checksum;
-  uint32_t LineNumber;
+  uint32_t LineNumber = 0;
   StringRef Name;
   StringRef Filename;
   SmallVector<std::unique_ptr<GCOVBlock>, 16> Blocks;
@@ -309,10 +308,10 @@ private:
 /// GCOVBlock - Collects block information.
 class GCOVBlock {
   struct EdgeWeight {
-    EdgeWeight(GCOVBlock *D) : Dst(D), Count(0) {}
+    EdgeWeight(GCOVBlock *D) : Dst(D) {}
 
     GCOVBlock *Dst;
-    uint64_t Count;
+    uint64_t Count = 0;
   };
 
   struct SortDstEdgesFunctor {
@@ -324,8 +323,7 @@ class GCOVBlock {
 public:
   typedef SmallVectorImpl<GCOVEdge *>::const_iterator EdgeIterator;
 
-  GCOVBlock(GCOVFunction &P, uint32_t N)
-      : Parent(P), Number(N), Counter(0), DstEdgesAreSorted(true) {}
+  GCOVBlock(GCOVFunction &P, uint32_t N) : Parent(P), Number(N) {}
   ~GCOVBlock();
 
   const GCOVFunction &getParent() const { return Parent; }
@@ -370,8 +368,8 @@ public:
 private:
   GCOVFunction &Parent;
   uint32_t Number;
-  uint64_t Counter;
-  bool DstEdgesAreSorted;
+  uint64_t Counter = 0;
+  bool DstEdgesAreSorted = true;
   SmallVector<GCOVEdge *, 16> SrcEdges;
   SmallVector<GCOVEdge *, 16> DstEdges;
   SmallVector<uint32_t, 16> Lines;
@@ -389,30 +387,28 @@ class FileInfo {
   typedef DenseMap<uint32_t, BlockVector> BlockLines;
 
   struct LineData {
-    LineData() : LastLine(0) {}
+    LineData() = default;
+
     BlockLines Blocks;
     FunctionLines Functions;
-    uint32_t LastLine;
+    uint32_t LastLine = 0;
   };
 
   struct GCOVCoverage {
-    GCOVCoverage(StringRef Name)
-        : Name(Name), LogicalLines(0), LinesExec(0), Branches(0),
-          BranchesExec(0), BranchesTaken(0) {}
+    GCOVCoverage(StringRef Name) : Name(Name) {}
 
     StringRef Name;
 
-    uint32_t LogicalLines;
-    uint32_t LinesExec;
+    uint32_t LogicalLines = 0;
+    uint32_t LinesExec = 0;
 
-    uint32_t Branches;
-    uint32_t BranchesExec;
-    uint32_t BranchesTaken;
+    uint32_t Branches = 0;
+    uint32_t BranchesExec = 0;
+    uint32_t BranchesTaken = 0;
   };
 
 public:
-  FileInfo(const GCOV::Options &Options)
-      : Options(Options), RunCount(0), ProgramCount(0) {}
+  FileInfo(const GCOV::Options &Options) : Options(Options) {}
 
   void addBlockLine(StringRef Filename, uint32_t Line, const GCOVBlock *Block) {
     if (Line > LineInfo[Filename].LastLine)
@@ -449,8 +445,8 @@ private:
 
   const GCOV::Options &Options;
   StringMap<LineData> LineInfo;
-  uint32_t RunCount;
-  uint32_t ProgramCount;
+  uint32_t RunCount = 0;
+  uint32_t ProgramCount = 0;
 
   typedef SmallVector<std::pair<std::string, GCOVCoverage>, 4> FileCoverageList;
   typedef MapVector<const GCOVFunction *, GCOVCoverage> FuncCoverageMap;
diff --git a/include/llvm/Support/GenericDomTree.h b/include/llvm/Support/GenericDomTree.h
index 6e6ee400164407220356cf081448f67b5ee88c0d..20f3ffdf3aab79a85b6f5ebdd93be1a25d5e844d 100644
--- a/include/llvm/Support/GenericDomTree.h
+++ b/include/llvm/Support/GenericDomTree.h
@@ -13,7 +13,7 @@
 /// dominance queries on the CFG, but is fully generic w.r.t. the underlying
 /// graph types.
 ///
-/// Unlike ADT/* graph algorithms, generic dominator tree has more reuiqrement
+/// Unlike ADT/* graph algorithms, generic dominator tree has more requirements
 /// on the graph's NodeRef. The NodeRef should be a pointer and, depending on
 /// the implementation, e.g. NodeRef->getParent() return the parent node.
 ///
@@ -25,14 +25,19 @@
 #define LLVM_SUPPORT_GENERICDOMTREE_H
 
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/GraphTraits.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <iterator>
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include <vector>
 
 namespace llvm {
 
@@ -47,7 +52,7 @@ template <typename GT> struct DominatorTreeBaseTraits {
       typename std::remove_pointer<typename GT::NodeRef>::type>;
 };
 
-} // End namespace detail
+} // end namespace detail
 
 template <typename GT>
 using DominatorTreeBaseByGraphTraits =
@@ -59,13 +64,16 @@ template <class NodeT> class DominatorBase {
 protected:
   std::vector<NodeT *> Roots;
   bool IsPostDominators;
+
   explicit DominatorBase(bool isPostDom)
       : Roots(), IsPostDominators(isPostDom) {}
+
   DominatorBase(DominatorBase &&Arg)
       : Roots(std::move(Arg.Roots)),
         IsPostDominators(std::move(Arg.IsPostDominators)) {
     Arg.Roots.clear();
   }
+
   DominatorBase &operator=(DominatorBase &&RHS) {
     Roots = std::move(RHS.Roots);
     IsPostDominators = std::move(RHS.IsPostDominators);
@@ -85,19 +93,21 @@ public:
   bool isPostDominator() const { return IsPostDominators; }
 };
 
-struct PostDominatorTree;
-
 /// \brief Base class for the actual dominator tree node.
 template <class NodeT> class DomTreeNodeBase {
+  friend struct PostDominatorTree;
+  template <class N> friend class DominatorTreeBase;
+
   NodeT *TheBB;
   DomTreeNodeBase<NodeT> *IDom;
   std::vector<DomTreeNodeBase<NodeT> *> Children;
-  mutable int DFSNumIn, DFSNumOut;
-
-  template <class N> friend class DominatorTreeBase;
-  friend struct PostDominatorTree;
+  mutable int DFSNumIn = -1;
+  mutable int DFSNumOut = -1;
 
 public:
+  DomTreeNodeBase(NodeT *BB, DomTreeNodeBase<NodeT> *iDom)
+      : TheBB(BB), IDom(iDom) {}
+
   typedef typename std::vector<DomTreeNodeBase<NodeT> *>::iterator iterator;
   typedef typename std::vector<DomTreeNodeBase<NodeT> *>::const_iterator
       const_iterator;
@@ -109,13 +119,11 @@ public:
 
   NodeT *getBlock() const { return TheBB; }
   DomTreeNodeBase<NodeT> *getIDom() const { return IDom; }
+
   const std::vector<DomTreeNodeBase<NodeT> *> &getChildren() const {
     return Children;
   }
 
-  DomTreeNodeBase(NodeT *BB, DomTreeNodeBase<NodeT> *iDom)
-      : TheBB(BB), IDom(iDom), DFSNumIn(-1), DFSNumOut(-1) {}
-
   std::unique_ptr<DomTreeNodeBase<NodeT>>
   addChild(std::unique_ptr<DomTreeNodeBase<NodeT>> C) {
     Children.push_back(C.get());
@@ -206,9 +214,6 @@ void Calculate(DominatorTreeBaseByGraphTraits<GraphTraits<N>> &DT, FuncT &F);
 /// This class is a generic template over graph nodes. It is instantiated for
 /// various graphs in the LLVM IR or in the code generator.
 template <class NodeT> class DominatorTreeBase : public DominatorBase<NodeT> {
-  DominatorTreeBase(const DominatorTreeBase &) = delete;
-  DominatorTreeBase &operator=(const DominatorTreeBase &) = delete;
-
   bool dominatedBySlowTreeWalk(const DomTreeNodeBase<NodeT> *A,
                                const DomTreeNodeBase<NodeT> *B) const {
     assert(A != B);
@@ -239,16 +244,16 @@ protected:
   DomTreeNodeMapType DomTreeNodes;
   DomTreeNodeBase<NodeT> *RootNode;
 
-  mutable bool DFSInfoValid;
-  mutable unsigned int SlowQueries;
+  mutable bool DFSInfoValid = false;
+  mutable unsigned int SlowQueries = 0;
   // Information record used during immediate dominators computation.
   struct InfoRec {
-    unsigned DFSNum;
-    unsigned Parent;
-    unsigned Semi;
-    NodeT *Label;
+    unsigned DFSNum = 0;
+    unsigned Parent = 0;
+    unsigned Semi = 0;
+    NodeT *Label = nullptr;
 
-    InfoRec() : DFSNum(0), Parent(0), Semi(0), Label(nullptr) {}
+    InfoRec() = default;
   };
 
   DenseMap<NodeT *, NodeT *> IDoms;
@@ -336,7 +341,7 @@ protected:
 
 public:
   explicit DominatorTreeBase(bool isPostDom)
-      : DominatorBase<NodeT>(isPostDom), DFSInfoValid(false), SlowQueries(0) {}
+      : DominatorBase<NodeT>(isPostDom) {}
 
   DominatorTreeBase(DominatorTreeBase &&Arg)
       : DominatorBase<NodeT>(
@@ -348,6 +353,7 @@ public:
         Vertex(std::move(Arg.Vertex)), Info(std::move(Arg.Info)) {
     Arg.wipe();
   }
+
   DominatorTreeBase &operator=(DominatorTreeBase &&RHS) {
     DominatorBase<NodeT>::operator=(
         std::move(static_cast<DominatorBase<NodeT> &>(RHS)));
@@ -362,6 +368,9 @@ public:
     return *this;
   }
 
+  DominatorTreeBase(const DominatorTreeBase &) = delete;
+  DominatorTreeBase &operator=(const DominatorTreeBase &) = delete;
+
   /// compare - Return false if the other dominator tree base matches this
   /// dominator tree base. Otherwise return true.
   bool compare(const DominatorTreeBase &Other) const {
@@ -683,6 +692,10 @@ protected:
   Eval(DominatorTreeBaseByGraphTraits<GraphT> &DT, typename GraphT::NodeRef V,
        unsigned LastLinked);
 
+  template <class GraphT>
+  friend unsigned ReverseDFSPass(DominatorTreeBaseByGraphTraits<GraphT> &DT,
+                                 typename GraphT::NodeRef V, unsigned N);
+
   template <class GraphT>
   friend unsigned DFSPass(DominatorTreeBaseByGraphTraits<GraphT> &DT,
                           typename GraphT::NodeRef V, unsigned N);
@@ -716,7 +729,6 @@ public:
   /// updateDFSNumbers - Assign In and Out numbers to the nodes while walking
   /// dominator tree in dfs order.
   void updateDFSNumbers() const {
-
     if (DFSInfoValid) {
       SlowQueries = 0;
       return;
@@ -778,11 +790,9 @@ public:
       Calculate<FT, NodeT *>(*this, F);
     } else {
       // Initialize the roots list
-      for (typename TraitsTy::nodes_iterator I = TraitsTy::nodes_begin(&F),
-                                             E = TraitsTy::nodes_end(&F);
-           I != E; ++I)
-        if (TraitsTy::child_begin(*I) == TraitsTy::child_end(*I))
-          addRoot(*I);
+      for (auto *Node : nodes(&F))
+        if (TraitsTy::child_begin(Node) == TraitsTy::child_end(Node))
+          addRoot(Node);
 
       Calculate<FT, Inverse<NodeT *>>(*this, F);
     }
@@ -815,6 +825,6 @@ bool DominatorTreeBase<NodeT>::properlyDominates(const NodeT *A,
                    getNode(const_cast<NodeT *>(B)));
 }
 
-}
+} // end namespace llvm
 
-#endif
+#endif // LLVM_SUPPORT_GENERICDOMTREE_H
diff --git a/include/llvm/Support/GenericDomTreeConstruction.h b/include/llvm/Support/GenericDomTreeConstruction.h
index 54e55cc1a32e19dcae61d7b8a5a34bec0d45f077..c1d757f3ab6a37ad16296e85252f9bc0d7b8d906 100644
--- a/include/llvm/Support/GenericDomTreeConstruction.h
+++ b/include/llvm/Support/GenericDomTreeConstruction.h
@@ -24,82 +24,77 @@
 #ifndef LLVM_SUPPORT_GENERICDOMTREECONSTRUCTION_H
 #define LLVM_SUPPORT_GENERICDOMTREECONSTRUCTION_H
 
+#include "llvm/ADT/DepthFirstIterator.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/Support/GenericDomTree.h"
 
 namespace llvm {
 
-template <class GraphT>
-unsigned DFSPass(DominatorTreeBaseByGraphTraits<GraphT> &DT,
-                 typename GraphT::NodeRef V, unsigned N) {
-  // This is more understandable as a recursive algorithm, but we can't use the
-  // recursive algorithm due to stack depth issues.  Keep it here for
-  // documentation purposes.
-#if 0
-  InfoRec &VInfo = DT.Info[DT.Roots[i]];
-  VInfo.DFSNum = VInfo.Semi = ++N;
-  VInfo.Label = V;
-
-  Vertex.push_back(V);        // Vertex[n] = V;
-
-  for (succ_iterator SI = succ_begin(V), E = succ_end(V); SI != E; ++SI) {
-    InfoRec &SuccVInfo = DT.Info[*SI];
-    if (SuccVInfo.Semi == 0) {
-      SuccVInfo.Parent = V;
-      N = DTDFSPass(DT, *SI, N);
-    }
+// External storage for depth first iterator that reuses the info lookup map
+// domtree already has.  We don't have a set, but a map instead, so we are
+// converting the one argument insert calls.
+template <class NodeRef, class InfoType> struct df_iterator_dom_storage {
+public:
+  typedef DenseMap<NodeRef, InfoType> BaseSet;
+  df_iterator_dom_storage(BaseSet &Storage) : Storage(Storage) {}
+
+  typedef typename BaseSet::iterator iterator;
+  std::pair<iterator, bool> insert(NodeRef N) {
+    return Storage.insert({N, InfoType()});
   }
-#else
-  bool IsChildOfArtificialExit = (N != 0);
+  void completed(NodeRef) {}
 
-  SmallVector<
-      std::pair<typename GraphT::NodeRef, typename GraphT::ChildIteratorType>,
-      32>
-      Worklist;
-  Worklist.push_back(std::make_pair(V, GraphT::child_begin(V)));
-  while (!Worklist.empty()) {
-    typename GraphT::NodeRef BB = Worklist.back().first;
-    typename GraphT::ChildIteratorType NextSucc = Worklist.back().second;
+private:
+  BaseSet &Storage;
+};
 
+template <class GraphT>
+unsigned ReverseDFSPass(DominatorTreeBaseByGraphTraits<GraphT> &DT,
+                        typename GraphT::NodeRef V, unsigned N) {
+  df_iterator_dom_storage<
+      typename GraphT::NodeRef,
+      typename DominatorTreeBaseByGraphTraits<GraphT>::InfoRec>
+      DFStorage(DT.Info);
+  bool IsChildOfArtificialExit = (N != 0);
+  for (auto I = idf_ext_begin(V, DFStorage), E = idf_ext_end(V, DFStorage);
+       I != E; ++I) {
+    typename GraphT::NodeRef BB = *I;
     auto &BBInfo = DT.Info[BB];
+    BBInfo.DFSNum = BBInfo.Semi = ++N;
+    BBInfo.Label = BB;
+    // Set the parent to the top of the visited stack.  The stack includes us,
+    // and is 1 based, so we subtract to account for both of these.
+    if (I.getPathLength() > 1)
+      BBInfo.Parent = DT.Info[I.getPath(I.getPathLength() - 2)].DFSNum;
+    DT.Vertex.push_back(BB); // Vertex[n] = V;
 
-    // First time we visited this BB?
-    if (NextSucc == GraphT::child_begin(BB)) {
-      BBInfo.DFSNum = BBInfo.Semi = ++N;
-      BBInfo.Label = BB;
-
-      DT.Vertex.push_back(BB);       // Vertex[n] = V;
-
-      if (IsChildOfArtificialExit)
-        BBInfo.Parent = 1;
-
-      IsChildOfArtificialExit = false;
-    }
-
-    // store the DFS number of the current BB - the reference to BBInfo might
-    // get invalidated when processing the successors.
-    unsigned BBDFSNum = BBInfo.DFSNum;
-
-    // If we are done with this block, remove it from the worklist.
-    if (NextSucc == GraphT::child_end(BB)) {
-      Worklist.pop_back();
-      continue;
-    }
-
-    // Increment the successor number for the next time we get to it.
-    ++Worklist.back().second;
-
-    // Visit the successor next, if it isn't already visited.
-    typename GraphT::NodeRef Succ = *NextSucc;
+    if (IsChildOfArtificialExit)
+      BBInfo.Parent = 1;
 
-    auto &SuccVInfo = DT.Info[Succ];
-    if (SuccVInfo.Semi == 0) {
-      SuccVInfo.Parent = BBDFSNum;
-      Worklist.push_back(std::make_pair(Succ, GraphT::child_begin(Succ)));
-    }
+    IsChildOfArtificialExit = false;
   }
-#endif
-    return N;
+  return N;
+}
+template <class GraphT>
+unsigned DFSPass(DominatorTreeBaseByGraphTraits<GraphT> &DT,
+                 typename GraphT::NodeRef V, unsigned N) {
+  df_iterator_dom_storage<
+      typename GraphT::NodeRef,
+      typename DominatorTreeBaseByGraphTraits<GraphT>::InfoRec>
+      DFStorage(DT.Info);
+  for (auto I = df_ext_begin(V, DFStorage), E = df_ext_end(V, DFStorage);
+       I != E; ++I) {
+    typename GraphT::NodeRef BB = *I;
+    auto &BBInfo = DT.Info[BB];
+    BBInfo.DFSNum = BBInfo.Semi = ++N;
+    BBInfo.Label = BB;
+    // Set the parent to the top of the visited stack.  The stack includes us,
+    // and is 1 based, so we subtract to account for both of these.
+    if (I.getPathLength() > 1)
+      BBInfo.Parent = DT.Info[I.getPath(I.getPathLength() - 2)].DFSNum;
+    DT.Vertex.push_back(BB); // Vertex[n] = V;
+  }
+  return N;
 }
 
 template <class GraphT>
@@ -163,9 +158,13 @@ void Calculate(DominatorTreeBaseByGraphTraits<GraphTraits<NodeT>> &DT,
 
   // Step #1: Number blocks in depth-first order and initialize variables used
   // in later stages of the algorithm.
-  for (unsigned i = 0, e = static_cast<unsigned>(DT.Roots.size());
-       i != e; ++i)
-    N = DFSPass<GraphT>(DT, DT.Roots[i], N);
+  if (DT.isPostDominator()){
+    for (unsigned i = 0, e = static_cast<unsigned>(DT.Roots.size());
+         i != e; ++i)
+      N = ReverseDFSPass<GraphT>(DT, DT.Roots[i], N);
+  } else {
+    N = DFSPass<GraphT>(DT, DT.Roots[0], N);
+  }
 
   // it might be that some blocks did not get a DFS number (e.g., blocks of
   // infinite loops). In these cases an artificial exit node is required.
@@ -201,17 +200,12 @@ void Calculate(DominatorTreeBaseByGraphTraits<GraphTraits<NodeT>> &DT,
 
     // initialize the semi dominator to point to the parent node
     WInfo.Semi = WInfo.Parent;
-    typedef GraphTraits<Inverse<NodeT> > InvTraits;
-    for (typename InvTraits::ChildIteratorType CI =
-         InvTraits::child_begin(W),
-         E = InvTraits::child_end(W); CI != E; ++CI) {
-      typename InvTraits::NodeRef N = *CI;
-      if (DT.Info.count(N)) {  // Only if this predecessor is reachable!
+    for (const auto &N : inverse_children<NodeT>(W))
+      if (DT.Info.count(N)) { // Only if this predecessor is reachable!
         unsigned SemiU = DT.Info[Eval<GraphT>(DT, N, i + 1)].Semi;
         if (SemiU < WInfo.Semi)
           WInfo.Semi = SemiU;
       }
-    }
 
     // If V is a non-root vertex and sdom(V) = parent(V), then idom(V) is
     // necessarily parent(V). In this case, set idom(V) here and avoid placing
diff --git a/include/llvm/Support/Host.h b/include/llvm/Support/Host.h
index 9df584c68c0d9b79261e9a8e94fba8278b0f5d80..89986fdae9713d23546e50c9d29e87ebebb14f7f 100644
--- a/include/llvm/Support/Host.h
+++ b/include/llvm/Support/Host.h
@@ -15,6 +15,7 @@
 #define LLVM_SUPPORT_HOST_H
 
 #include "llvm/ADT/StringMap.h"
+#include "llvm/Support/MemoryBuffer.h"
 
 #if defined(__linux__) || defined(__GNU__) || defined(__HAIKU__)
 #include <endian.h>
@@ -32,9 +33,9 @@ namespace llvm {
 namespace sys {
 
 #if defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN
-  static const bool IsBigEndianHost = true;
+constexpr bool IsBigEndianHost = true;
 #else
-  static const bool IsBigEndianHost = false;
+constexpr bool IsBigEndianHost = false;
 #endif
 
   static const bool IsLittleEndianHost = !IsBigEndianHost;
@@ -75,6 +76,13 @@ namespace sys {
   /// from thread::hardware_concurrency(), which includes hyperthreads).
   /// Returns -1 if unknown for the current host system.
   int getHostNumPhysicalCores();
+
+  namespace detail {
+  /// Helper functions to extract HostCPUName from /proc/cpuinfo on Linux.
+  StringRef getHostCPUNameForPowerPC(const StringRef &ProcCpuinfoContent);
+  StringRef getHostCPUNameForARM(const StringRef &ProcCpuinfoContent);
+  StringRef getHostCPUNameForS390x(const StringRef &ProcCpuinfoContent);
+  }
 }
 }
 
diff --git a/include/llvm/Support/LEB128.h b/include/llvm/Support/LEB128.h
index 6a95432ca2d93c69dd5c89e5186844d75d8f3b81..ff775f3b7b364d8415c3dc94a0fa87efe4d3e994 100644
--- a/include/llvm/Support/LEB128.h
+++ b/include/llvm/Support/LEB128.h
@@ -20,7 +20,8 @@
 namespace llvm {
 
 /// Utility function to encode a SLEB128 value to an output stream.
-inline void encodeSLEB128(int64_t Value, raw_ostream &OS) {
+inline void encodeSLEB128(int64_t Value, raw_ostream &OS,
+                          unsigned Padding = 0) {
   bool More;
   do {
     uint8_t Byte = Value & 0x7f;
@@ -28,10 +29,45 @@ inline void encodeSLEB128(int64_t Value, raw_ostream &OS) {
     Value >>= 7;
     More = !((((Value == 0 ) && ((Byte & 0x40) == 0)) ||
               ((Value == -1) && ((Byte & 0x40) != 0))));
-    if (More)
+    if (More || Padding != 0)
       Byte |= 0x80; // Mark this byte to show that more bytes will follow.
     OS << char(Byte);
   } while (More);
+
+  // Pad with 0x80 (0xff if negative), then emit a 0x00 (0x7f) terminating byte.
+  if (Padding != 0) {
+    uint8_t PadValue = Value < 0 ? 0x7f : 0x00;
+    for (; Padding != 1; --Padding)
+      OS << char(PadValue | 0x80);
+    OS << char(PadValue);
+  }
+}
+
+/// Utility function to encode a SLEB128 value to a buffer. Returns
+/// the length in bytes of the encoded value.
+inline unsigned encodeSLEB128(int64_t Value, uint8_t *p,
+                              unsigned Padding = 0) {
+  uint8_t *orig_p = p;
+  bool More;
+  do {
+    uint8_t Byte = Value & 0x7f;
+    // NOTE: this assumes that this signed shift is an arithmetic right shift.
+    Value >>= 7;
+    More = !((((Value == 0 ) && ((Byte & 0x40) == 0)) ||
+              ((Value == -1) && ((Byte & 0x40) != 0))));
+    if (More || Padding != 0)
+      Byte |= 0x80; // Mark this byte to show that more bytes will follow.
+    *p++ = Byte;
+  } while (More);
+
+  // Pad with 0x80 (0xff if negative), then emit a 0x00 (0x7f) terminating byte.
+  if (Padding != 0) {
+    uint8_t PadValue = Value < 0 ? 0x7f : 0x00;
+    for (; Padding != 1; --Padding)
+      *p++ = (PadValue | 0x80);
+    *p++ = PadValue;
+  }
+  return (unsigned)(p - orig_p);
 }
 
 /// Utility function to encode a ULEB128 value to an output stream.
@@ -77,11 +113,30 @@ inline unsigned encodeULEB128(uint64_t Value, uint8_t *p,
 
 
 /// Utility function to decode a ULEB128 value.
-inline uint64_t decodeULEB128(const uint8_t *p, unsigned *n = nullptr) {
+inline uint64_t decodeULEB128(const uint8_t *p, unsigned *n = nullptr,
+                              const uint8_t *end = nullptr,
+                              const char **error = nullptr) {
   const uint8_t *orig_p = p;
   uint64_t Value = 0;
   unsigned Shift = 0;
+  if(error)
+    *error = nullptr;
   do {
+    if(end && p == end){
+      if(error)
+        *error = "malformed uleb128, extends past end";
+      if (n)
+        *n = (unsigned)(p - orig_p);
+      return 0;
+    }
+    uint64_t Slice = *p & 0x7f;
+    if(Shift >= 64 || Slice << Shift >> Shift != Slice){
+      if(error)
+        *error = "uleb128 too big for uint64";
+      if (n)
+        *n = (unsigned)(p - orig_p);
+      return 0;
+    }
     Value += uint64_t(*p & 0x7f) << Shift;
     Shift += 7;
   } while (*p++ >= 128);
@@ -91,12 +146,21 @@ inline uint64_t decodeULEB128(const uint8_t *p, unsigned *n = nullptr) {
 }
 
 /// Utility function to decode a SLEB128 value.
-inline int64_t decodeSLEB128(const uint8_t *p, unsigned *n = nullptr) {
+inline int64_t decodeSLEB128(const uint8_t *p, unsigned *n = nullptr,
+                             const uint8_t *end = nullptr,
+                             const char **error = nullptr) {
   const uint8_t *orig_p = p;
   int64_t Value = 0;
   unsigned Shift = 0;
   uint8_t Byte;
   do {
+    if(end && p == end){
+      if(error)
+        *error = "malformed sleb128, extends past end";
+      if (n)
+        *n = (unsigned)(p - orig_p);
+      return 0;
+    }
     Byte = *p++;
     Value |= ((Byte & 0x7f) << Shift);
     Shift += 7;
diff --git a/include/llvm/Support/LowLevelTypeImpl.h b/include/llvm/Support/LowLevelTypeImpl.h
new file mode 100644
index 0000000000000000000000000000000000000000..02df4d806f13bfd163cf85f16b3d88925db56359
--- /dev/null
+++ b/include/llvm/Support/LowLevelTypeImpl.h
@@ -0,0 +1,202 @@
+//== llvm/Support/LowLevelTypeImpl.h --------------------------- -*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// Implement a low-level type suitable for MachineInstr level instruction
+/// selection.
+///
+/// For a type attached to a MachineInstr, we only care about 2 details: total
+/// size and the number of vector lanes (if any). Accordingly, there are 3
+/// possible valid type-kinds:
+///
+///    * `sN` for scalars and aggregates
+///    * `<N x sM>` for vectors, which must have at least 2 elements.
+///    * `pN` for pointers
+///
+/// Other information required for correct selection is expected to be carried
+/// by the opcode, or non-type flags. For example the distinction between G_ADD
+/// and G_FADD for int/float or fast-math flags.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_SUPPORT_LOWLEVELTYPEIMPL_H
+#define LLVM_SUPPORT_LOWLEVELTYPEIMPL_H
+
+#include <cassert>
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+
+namespace llvm {
+
+class DataLayout;
+class Type;
+class raw_ostream;
+
+class LLT {
+public:
+  enum TypeKind : uint16_t {
+    Invalid,
+    Scalar,
+    Pointer,
+    Vector,
+  };
+
+  /// Get a low-level scalar or aggregate "bag of bits".
+  static LLT scalar(unsigned SizeInBits) {
+    assert(SizeInBits > 0 && "invalid scalar size");
+    return LLT{Scalar, 1, SizeInBits};
+  }
+
+  /// Get a low-level pointer of the given size in the given address space.
+  static LLT pointer(uint16_t AddressSpace, unsigned SizeInBits) {
+    return LLT{Pointer, AddressSpace, SizeInBits};
+  }
+
+  /// Get a low-level vector of some number of elements and element width.
+  /// \p NumElements must be at least 2.
+  static LLT vector(uint16_t NumElements, unsigned ScalarSizeInBits) {
+    assert(NumElements > 1 && "invalid number of vector elements");
+    return LLT{Vector, NumElements, ScalarSizeInBits};
+  }
+
+  /// Get a low-level vector of some number of elements and element type.
+  static LLT vector(uint16_t NumElements, LLT ScalarTy) {
+    assert(NumElements > 1 && "invalid number of vector elements");
+    assert(ScalarTy.isScalar() && "invalid vector element type");
+    return LLT{Vector, NumElements, ScalarTy.getSizeInBits()};
+  }
+
+  explicit LLT(TypeKind Kind, uint16_t NumElements, unsigned SizeInBits)
+    : SizeInBits(SizeInBits), ElementsOrAddrSpace(NumElements), Kind(Kind) {
+    assert((Kind != Vector || ElementsOrAddrSpace > 1) &&
+           "invalid number of vector elements");
+  }
+
+  explicit LLT() : SizeInBits(0), ElementsOrAddrSpace(0), Kind(Invalid) {}
+
+  explicit LLT(MVT VT);
+
+  bool isValid() const { return Kind != Invalid; }
+
+  bool isScalar() const { return Kind == Scalar; }
+
+  bool isPointer() const { return Kind == Pointer; }
+
+  bool isVector() const { return Kind == Vector; }
+
+  /// Returns the number of elements in a vector LLT. Must only be called on
+  /// vector types.
+  uint16_t getNumElements() const {
+    assert(isVector() && "cannot get number of elements on scalar/aggregate");
+    return ElementsOrAddrSpace;
+  }
+
+  /// Returns the total size of the type. Must only be called on sized types.
+  unsigned getSizeInBits() const {
+    if (isPointer() || isScalar())
+      return SizeInBits;
+    return SizeInBits * ElementsOrAddrSpace;
+  }
+
+  unsigned getScalarSizeInBits() const {
+    return SizeInBits;
+  }
+
+  unsigned getAddressSpace() const {
+    assert(isPointer() && "cannot get address space of non-pointer type");
+    return ElementsOrAddrSpace;
+  }
+
+  /// Returns the vector's element type. Only valid for vector types.
+  LLT getElementType() const {
+    assert(isVector() && "cannot get element type of scalar/aggregate");
+    return scalar(SizeInBits);
+  }
+
+  /// Get a low-level type with half the size of the original, by halving the
+  /// size of the scalar type involved. For example `s32` will become `s16`,
+  /// `<2 x s32>` will become `<2 x s16>`.
+  LLT halfScalarSize() const {
+    assert(!isPointer() && getScalarSizeInBits() > 1 &&
+           getScalarSizeInBits() % 2 == 0 && "cannot half size of this type");
+    return LLT{Kind, ElementsOrAddrSpace, SizeInBits / 2};
+  }
+
+  /// Get a low-level type with twice the size of the original, by doubling the
+  /// size of the scalar type involved. For example `s32` will become `s64`,
+  /// `<2 x s32>` will become `<2 x s64>`.
+  LLT doubleScalarSize() const {
+    assert(!isPointer() && "cannot change size of this type");
+    return LLT{Kind, ElementsOrAddrSpace, SizeInBits * 2};
+  }
+
+  /// Get a low-level type with half the size of the original, by halving the
+  /// number of vector elements of the scalar type involved. The source must be
+  /// a vector type with an even number of elements. For example `<4 x s32>`
+  /// will become `<2 x s32>`, `<2 x s32>` will become `s32`.
+  LLT halfElements() const {
+    assert(isVector() && ElementsOrAddrSpace % 2 == 0 &&
+           "cannot half odd vector");
+    if (ElementsOrAddrSpace == 2)
+      return scalar(SizeInBits);
+
+    return LLT{Vector, static_cast<uint16_t>(ElementsOrAddrSpace / 2),
+               SizeInBits};
+  }
+
+  /// Get a low-level type with twice the size of the original, by doubling the
+  /// number of vector elements of the scalar type involved. The source must not
+  /// be a pointer type. For example `<2 x s32>` will become `<4 x s32>`, and
+  /// doubling the number of elements in `sN` produces `<2 x sN>`.
+  LLT doubleElements() const {
+    assert(!isPointer() && "cannot double elements in pointer");
+    return LLT{Vector, static_cast<uint16_t>(ElementsOrAddrSpace * 2),
+               SizeInBits};
+  }
+
+  void print(raw_ostream &OS) const;
+
+  bool operator==(const LLT &RHS) const {
+    return Kind == RHS.Kind && SizeInBits == RHS.SizeInBits &&
+           ElementsOrAddrSpace == RHS.ElementsOrAddrSpace;
+  }
+
+  bool operator!=(const LLT &RHS) const { return !(*this == RHS); }
+
+  friend struct DenseMapInfo<LLT>;
+private:
+  unsigned SizeInBits;
+  uint16_t ElementsOrAddrSpace;
+  TypeKind Kind;
+};
+
+inline raw_ostream& operator<<(raw_ostream &OS, const LLT &Ty) {
+  Ty.print(OS);
+  return OS;
+}
+
+template<> struct DenseMapInfo<LLT> {
+  static inline LLT getEmptyKey() {
+    return LLT{LLT::Invalid, 0, -1u};
+  }
+  static inline LLT getTombstoneKey() {
+    return LLT{LLT::Invalid, 0, -2u};
+  }
+  static inline unsigned getHashValue(const LLT &Ty) {
+    uint64_t Val = ((uint64_t)Ty.SizeInBits << 32) |
+                   ((uint64_t)Ty.ElementsOrAddrSpace << 16) | (uint64_t)Ty.Kind;
+    return DenseMapInfo<uint64_t>::getHashValue(Val);
+  }
+  static bool isEqual(const LLT &LHS, const LLT &RHS) {
+    return LHS == RHS;
+  }
+};
+
+}
+
+#endif // LLVM_SUPPORT_LOWLEVELTYPEIMPL_H
diff --git a/include/llvm/Support/MD5.h b/include/llvm/Support/MD5.h
index eb181bfe8a5c8066f6088a305435d36a3ac09926..2c0dc76485f85f2b7b8f6640dda61bfa5a4cd932 100644
--- a/include/llvm/Support/MD5.h
+++ b/include/llvm/Support/MD5.h
@@ -1,4 +1,4 @@
-/*
+/* -*- C++ -*-
  * This code is derived from (original license follows):
  *
  * This is an OpenSSL-compatible implementation of the RSA Data Security, Inc.
@@ -29,24 +29,55 @@
 #define LLVM_SUPPORT_MD5_H
 
 #include "llvm/ADT/SmallString.h"
-#include "llvm/Support/DataTypes.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/Endian.h"
 #include <array>
+#include <cstdint>
 
 namespace llvm {
+
 template <typename T> class ArrayRef;
 
 class MD5 {
   // Any 32-bit or wider unsigned integer data type will do.
   typedef uint32_t MD5_u32plus;
 
-  MD5_u32plus a, b, c, d;
-  MD5_u32plus hi, lo;
+  MD5_u32plus a = 0x67452301;
+  MD5_u32plus b = 0xefcdab89;
+  MD5_u32plus c = 0x98badcfe;
+  MD5_u32plus d = 0x10325476;
+  MD5_u32plus hi = 0;
+  MD5_u32plus lo = 0;
   uint8_t buffer[64];
   MD5_u32plus block[16];
 
 public:
-  typedef uint8_t MD5Result[16];
+  struct MD5Result {
+    std::array<uint8_t, 16> Bytes;
+
+    operator std::array<uint8_t, 16>() const { return Bytes; }
+
+    const uint8_t &operator[](size_t I) const { return Bytes[I]; }
+    uint8_t &operator[](size_t I) { return Bytes[I]; }
+
+    SmallString<32> digest() const;
+
+    uint64_t low() const {
+      // Our MD5 implementation returns the result in little endian, so the low
+      // word is first.
+      using namespace support;
+      return endian::read<uint64_t, little, unaligned>(Bytes.data());
+    }
+
+    uint64_t high() const {
+      using namespace support;
+      return endian::read<uint64_t, little, unaligned>(Bytes.data() + 8);
+    }
+    std::pair<uint64_t, uint64_t> words() const {
+      // Both 64-bit halves of the digest; note the order is {high(), low()}.
+      return std::make_pair(high(), low());
+    }
+  };
 
   MD5();
 
@@ -70,18 +101,22 @@ private:
   const uint8_t *body(ArrayRef<uint8_t> Data);
 };
 
+inline bool operator==(const MD5::MD5Result &LHS, const MD5::MD5Result &RHS) {
+  return LHS.Bytes == RHS.Bytes;
+}
+
 /// Helper to compute and return lower 64 bits of the given string's MD5 hash.
 inline uint64_t MD5Hash(StringRef Str) {
+  using namespace support;
+
   MD5 Hash;
   Hash.update(Str);
-  llvm::MD5::MD5Result Result;
+  MD5::MD5Result Result;
   Hash.final(Result);
-  // Return the least significant 8 bytes. Our MD5 implementation returns the
-  // result in little endian, so we may need to swap bytes.
-  using namespace llvm::support;
-  return endian::read<uint64_t, little, unaligned>(Result);
+  // Return the least significant word.
+  return Result.low();
 }
 
-}
+} // end namespace llvm
 
-#endif
+#endif // LLVM_SUPPORT_MD5_H
diff --git a/include/llvm/Support/MathExtras.h b/include/llvm/Support/MathExtras.h
index 77970f487112a031eb6759e2ad02be95bcbe7a09..19380b23d9d24768b8312a8fa56fb525c28ef41e 100644
--- a/include/llvm/Support/MathExtras.h
+++ b/include/llvm/Support/MathExtras.h
@@ -112,7 +112,7 @@ std::size_t countTrailingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
   static_assert(std::numeric_limits<T>::is_integer &&
                     !std::numeric_limits<T>::is_signed,
                 "Only unsigned integral types are allowed.");
-  return detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
+  return llvm::detail::TrailingZerosCounter<T, sizeof(T)>::count(Val, ZB);
 }
 
 namespace detail {
@@ -181,7 +181,7 @@ std::size_t countLeadingZeros(T Val, ZeroBehavior ZB = ZB_Width) {
   static_assert(std::numeric_limits<T>::is_integer &&
                     !std::numeric_limits<T>::is_signed,
                 "Only unsigned integral types are allowed.");
-  return detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
+  return llvm::detail::LeadingZerosCounter<T, sizeof(T)>::count(Val, ZB);
 }
 
 /// \brief Get the index of the first set bit starting from the least
diff --git a/include/llvm/Support/MemoryBuffer.h b/include/llvm/Support/MemoryBuffer.h
index f739d19907b04cd7d22aec0edcf457560aeb6075..e8bdc3e89fa7cd9d0a7882b1e99e1726614ac527 100644
--- a/include/llvm/Support/MemoryBuffer.h
+++ b/include/llvm/Support/MemoryBuffer.h
@@ -69,12 +69,12 @@ public:
   /// means that the client knows that the file exists and that it has the
   /// specified size.
   ///
-  /// \param IsVolatileSize Set to true to indicate that the file size may be
-  /// changing, e.g. when libclang tries to parse while the user is
-  /// editing/updating the file.
+  /// \param IsVolatile Set to true to indicate that the contents of the file
+  /// can change outside the user's control, e.g. when libclang tries to parse
+  /// while the user is editing/updating the file or if the file is on an NFS.
   static ErrorOr<std::unique_ptr<MemoryBuffer>>
   getFile(const Twine &Filename, int64_t FileSize = -1,
-          bool RequiresNullTerminator = true, bool IsVolatileSize = false);
+          bool RequiresNullTerminator = true, bool IsVolatile = false);
 
   /// Read all of the specified file into a MemoryBuffer as a stream
   /// (i.e. until EOF reached). This is useful for special files that
@@ -87,17 +87,17 @@ public:
   /// Since this is in the middle of a file, the buffer is not null terminated.
   static ErrorOr<std::unique_ptr<MemoryBuffer>>
   getOpenFileSlice(int FD, const Twine &Filename, uint64_t MapSize,
-                   int64_t Offset);
+                   int64_t Offset, bool IsVolatile = false);
 
   /// Given an already-open file descriptor, read the file and return a
   /// MemoryBuffer.
   ///
-  /// \param IsVolatileSize Set to true to indicate that the file size may be
-  /// changing, e.g. when libclang tries to parse while the user is
-  /// editing/updating the file.
+  /// \param IsVolatile Set to true to indicate that the contents of the file
+  /// can change outside the user's control, e.g. when libclang tries to parse
+  /// while the user is editing/updating the file or if the file is on an NFS.
   static ErrorOr<std::unique_ptr<MemoryBuffer>>
   getOpenFile(int FD, const Twine &Filename, uint64_t FileSize,
-              bool RequiresNullTerminator = true, bool IsVolatileSize = false);
+              bool RequiresNullTerminator = true, bool IsVolatile = false);
 
   /// Open the specified memory range as a MemoryBuffer. Note that InputData
   /// must be null terminated if RequiresNullTerminator is true.
@@ -136,7 +136,7 @@ public:
 
   /// Map a subrange of the specified file as a MemoryBuffer.
   static ErrorOr<std::unique_ptr<MemoryBuffer>>
-  getFileSlice(const Twine &Filename, uint64_t MapSize, uint64_t Offset);
+  getFileSlice(const Twine &Filename, uint64_t MapSize, uint64_t Offset, bool IsVolatile = false);
 
   //===--------------------------------------------------------------------===//
   // Provided for performance analysis.
diff --git a/include/llvm/Support/Path.h b/include/llvm/Support/Path.h
index 2bbcef0c293f6c99034ca09a370110d54e67cd81..6ac51195519eb3907b959038aa7d93edf954264c 100644
--- a/include/llvm/Support/Path.h
+++ b/include/llvm/Support/Path.h
@@ -24,6 +24,8 @@ namespace llvm {
 namespace sys {
 namespace path {
 
+enum class Style { windows, posix, native };
+
 /// @name Lexical Component Iterator
 /// @{
 
@@ -51,9 +53,10 @@ class const_iterator
   StringRef Path;      ///< The entire path.
   StringRef Component; ///< The current component. Not necessarily in Path.
   size_t    Position;  ///< The iterators current position within Path.
+  Style S;             ///< The path style to use.
 
   // An end iterator has Position = Path.size() + 1.
-  friend const_iterator begin(StringRef path);
+  friend const_iterator begin(StringRef path, Style style);
   friend const_iterator end(StringRef path);
 
 public:
@@ -77,8 +80,9 @@ class reverse_iterator
   StringRef Path;      ///< The entire path.
   StringRef Component; ///< The current component. Not necessarily in Path.
   size_t    Position;  ///< The iterators current position within Path.
+  Style S;             ///< The path style to use.
 
-  friend reverse_iterator rbegin(StringRef path);
+  friend reverse_iterator rbegin(StringRef path, Style style);
   friend reverse_iterator rend(StringRef path);
 
 public:
@@ -95,7 +99,7 @@ public:
 /// @brief Get begin iterator over \a path.
 /// @param path Input path.
 /// @returns Iterator initialized with the first component of \a path.
-const_iterator begin(StringRef path);
+const_iterator begin(StringRef path, Style style = Style::native);
 
 /// @brief Get end iterator over \a path.
 /// @param path Input path.
@@ -105,7 +109,7 @@ const_iterator end(StringRef path);
 /// @brief Get reverse begin iterator over \a path.
 /// @param path Input path.
 /// @returns Iterator initialized with the first reverse component of \a path.
-reverse_iterator rbegin(StringRef path);
+reverse_iterator rbegin(StringRef path, Style style = Style::native);
 
 /// @brief Get reverse end iterator over \a path.
 /// @param path Input path.
@@ -126,7 +130,7 @@ reverse_iterator rend(StringRef path);
 /// @endcode
 ///
 /// @param path A path that is modified to not have a file component.
-void remove_filename(SmallVectorImpl<char> &path);
+void remove_filename(SmallVectorImpl<char> &path, Style style = Style::native);
 
 /// @brief Replace the file extension of \a path with \a extension.
 ///
@@ -140,7 +144,8 @@ void remove_filename(SmallVectorImpl<char> &path);
 /// @param extension The extension to be added. It may be empty. It may also
 ///                  optionally start with a '.', if it does not, one will be
 ///                  prepended.
-void replace_extension(SmallVectorImpl<char> &path, const Twine &extension);
+void replace_extension(SmallVectorImpl<char> &path, const Twine &extension,
+                       Style style = Style::native);
 
 /// @brief Replace matching path prefix with another path.
 ///
@@ -156,8 +161,8 @@ void replace_extension(SmallVectorImpl<char> &path, const Twine &extension);
 /// @param OldPrefix The path prefix to strip from \a Path.
 /// @param NewPrefix The path prefix to replace \a NewPrefix with.
 void replace_path_prefix(SmallVectorImpl<char> &Path,
-                         const StringRef &OldPrefix,
-                         const StringRef &NewPrefix);
+                         const StringRef &OldPrefix, const StringRef &NewPrefix,
+                         Style style = Style::native);
 
 /// @brief Append to path.
 ///
@@ -174,6 +179,9 @@ void append(SmallVectorImpl<char> &path, const Twine &a,
                                          const Twine &c = "",
                                          const Twine &d = "");
 
+void append(SmallVectorImpl<char> &path, Style style, const Twine &a,
+            const Twine &b = "", const Twine &c = "", const Twine &d = "");
+
 /// @brief Append to path.
 ///
 /// @code
@@ -185,8 +193,8 @@ void append(SmallVectorImpl<char> &path, const Twine &a,
 /// @param path Set to \a path + [\a begin, \a end).
 /// @param begin Start of components to append.
 /// @param end One past the end of components to append.
-void append(SmallVectorImpl<char> &path,
-            const_iterator begin, const_iterator end);
+void append(SmallVectorImpl<char> &path, const_iterator begin,
+            const_iterator end, Style style = Style::native);
 
 /// @}
 /// @name Transforms (or some other better name)
@@ -198,14 +206,15 @@ void append(SmallVectorImpl<char> &path,
 ///
 /// @param path A path that is transformed to native format.
 /// @param result Holds the result of the transformation.
-void native(const Twine &path, SmallVectorImpl<char> &result);
+void native(const Twine &path, SmallVectorImpl<char> &result,
+            Style style = Style::native);
 
 /// Convert path to the native form in place. This is used to give paths to
 /// users and operating system calls in the platform's normal way. For example,
 /// on Windows all '/' are converted to '\'.
 ///
 /// @param path A path that is transformed to native format.
-void native(SmallVectorImpl<char> &path);
+void native(SmallVectorImpl<char> &path, Style style = Style::native);
 
 /// @brief Replaces backslashes with slashes if Windows.
 ///
@@ -213,7 +222,7 @@ void native(SmallVectorImpl<char> &path);
 /// @result The result of replacing backslashes with forward slashes if Windows.
 /// On Unix, this function is a no-op because backslashes are valid path
 /// chracters.
-std::string convert_to_slash(StringRef path);
+std::string convert_to_slash(StringRef path, Style style = Style::native);
 
 /// @}
 /// @name Lexical Observers
@@ -229,7 +238,7 @@ std::string convert_to_slash(StringRef path);
 ///
 /// @param path Input path.
 /// @result The root name of \a path if it has one, otherwise "".
-StringRef root_name(StringRef path);
+StringRef root_name(StringRef path, Style style = Style::native);
 
 /// @brief Get root directory.
 ///
@@ -242,7 +251,7 @@ StringRef root_name(StringRef path);
 /// @param path Input path.
 /// @result The root directory of \a path if it has one, otherwise
 ///               "".
-StringRef root_directory(StringRef path);
+StringRef root_directory(StringRef path, Style style = Style::native);
 
 /// @brief Get root path.
 ///
@@ -250,7 +259,7 @@ StringRef root_directory(StringRef path);
 ///
 /// @param path Input path.
 /// @result The root path of \a path if it has one, otherwise "".
-StringRef root_path(StringRef path);
+StringRef root_path(StringRef path, Style style = Style::native);
 
 /// @brief Get relative path.
 ///
@@ -262,7 +271,7 @@ StringRef root_path(StringRef path);
 ///
 /// @param path Input path.
 /// @result The path starting after root_path if one exists, otherwise "".
-StringRef relative_path(StringRef path);
+StringRef relative_path(StringRef path, Style style = Style::native);
 
 /// @brief Get parent path.
 ///
@@ -274,7 +283,7 @@ StringRef relative_path(StringRef path);
 ///
 /// @param path Input path.
 /// @result The parent path of \a path if one exists, otherwise "".
-StringRef parent_path(StringRef path);
+StringRef parent_path(StringRef path, Style style = Style::native);
 
 /// @brief Get filename.
 ///
@@ -288,7 +297,7 @@ StringRef parent_path(StringRef path);
 /// @param path Input path.
 /// @result The filename part of \a path. This is defined as the last component
 ///         of \a path.
-StringRef filename(StringRef path);
+StringRef filename(StringRef path, Style style = Style::native);
 
 /// @brief Get stem.
 ///
@@ -306,7 +315,7 @@ StringRef filename(StringRef path);
 ///
 /// @param path Input path.
 /// @result The stem of \a path.
-StringRef stem(StringRef path);
+StringRef stem(StringRef path, Style style = Style::native);
 
 /// @brief Get extension.
 ///
@@ -322,18 +331,18 @@ StringRef stem(StringRef path);
 ///
 /// @param path Input path.
 /// @result The extension of \a path.
-StringRef extension(StringRef path);
+StringRef extension(StringRef path, Style style = Style::native);
 
 /// @brief Check whether the given char is a path separator on the host OS.
 ///
 /// @param value a character
 /// @result true if \a value is a path separator character on the host OS
-bool is_separator(char value);
+bool is_separator(char value, Style style = Style::native);
 
 /// @brief Return the preferred separator for this platform.
 ///
 /// @result StringRef of the preferred separator, null-terminated.
-StringRef get_separator();
+StringRef get_separator(Style style = Style::native);
 
 /// @brief Get the typical temporary directory for the system, e.g.,
 /// "/var/tmp" or "C:/TEMP"
@@ -374,7 +383,7 @@ bool user_cache_directory(SmallVectorImpl<char> &Result, const Twine &Path1,
 ///
 /// @param path Input path.
 /// @result True if the path has a root name, false otherwise.
-bool has_root_name(const Twine &path);
+bool has_root_name(const Twine &path, Style style = Style::native);
 
 /// @brief Has root directory?
 ///
@@ -382,7 +391,7 @@ bool has_root_name(const Twine &path);
 ///
 /// @param path Input path.
 /// @result True if the path has a root directory, false otherwise.
-bool has_root_directory(const Twine &path);
+bool has_root_directory(const Twine &path, Style style = Style::native);
 
 /// @brief Has root path?
 ///
@@ -390,7 +399,7 @@ bool has_root_directory(const Twine &path);
 ///
 /// @param path Input path.
 /// @result True if the path has a root path, false otherwise.
-bool has_root_path(const Twine &path);
+bool has_root_path(const Twine &path, Style style = Style::native);
 
 /// @brief Has relative path?
 ///
@@ -398,7 +407,7 @@ bool has_root_path(const Twine &path);
 ///
 /// @param path Input path.
 /// @result True if the path has a relative path, false otherwise.
-bool has_relative_path(const Twine &path);
+bool has_relative_path(const Twine &path, Style style = Style::native);
 
 /// @brief Has parent path?
 ///
@@ -406,7 +415,7 @@ bool has_relative_path(const Twine &path);
 ///
 /// @param path Input path.
 /// @result True if the path has a parent path, false otherwise.
-bool has_parent_path(const Twine &path);
+bool has_parent_path(const Twine &path, Style style = Style::native);
 
 /// @brief Has filename?
 ///
@@ -414,7 +423,7 @@ bool has_parent_path(const Twine &path);
 ///
 /// @param path Input path.
 /// @result True if the path has a filename, false otherwise.
-bool has_filename(const Twine &path);
+bool has_filename(const Twine &path, Style style = Style::native);
 
 /// @brief Has stem?
 ///
@@ -422,7 +431,7 @@ bool has_filename(const Twine &path);
 ///
 /// @param path Input path.
 /// @result True if the path has a stem, false otherwise.
-bool has_stem(const Twine &path);
+bool has_stem(const Twine &path, Style style = Style::native);
 
 /// @brief Has extension?
 ///
@@ -430,25 +439,25 @@ bool has_stem(const Twine &path);
 ///
 /// @param path Input path.
 /// @result True if the path has a extension, false otherwise.
-bool has_extension(const Twine &path);
+bool has_extension(const Twine &path, Style style = Style::native);
 
 /// @brief Is path absolute?
 ///
 /// @param path Input path.
 /// @result True if the path is absolute, false if it is not.
-bool is_absolute(const Twine &path);
+bool is_absolute(const Twine &path, Style style = Style::native);
 
 /// @brief Is path relative?
 ///
 /// @param path Input path.
 /// @result True if the path is relative, false if it is not.
-bool is_relative(const Twine &path);
+bool is_relative(const Twine &path, Style style = Style::native);
 
 /// @brief Remove redundant leading "./" pieces and consecutive separators.
 ///
 /// @param path Input path.
 /// @result The cleaned-up \a path.
-StringRef remove_leading_dotslash(StringRef path);
+StringRef remove_leading_dotslash(StringRef path, Style style = Style::native);
 
 /// @brief In-place remove any './' and optionally '../' components from a path.
 ///
@@ -456,7 +465,8 @@ StringRef remove_leading_dotslash(StringRef path);
 /// @param remove_dot_dot specify if '../' (except for leading "../") should be
 /// removed
 /// @result True if path was changed
-bool remove_dots(SmallVectorImpl<char> &path, bool remove_dot_dot = false);
+bool remove_dots(SmallVectorImpl<char> &path, bool remove_dot_dot = false,
+                 Style style = Style::native);
 
 } // end namespace path
 } // end namespace sys
diff --git a/include/llvm/Support/PointerLikeTypeTraits.h b/include/llvm/Support/PointerLikeTypeTraits.h
index 9ff894edbeb06a5256f42e7037589db1b2149782..521a49684e45a77e37f29f8f88bee972eb3e0c89 100644
--- a/include/llvm/Support/PointerLikeTypeTraits.h
+++ b/include/llvm/Support/PointerLikeTypeTraits.h
@@ -60,6 +60,20 @@ public:
   enum { NumLowBitsAvailable = 2 };
 };
 
+// Provide PointerLikeTypeTraits for const-qualified pointer-like types.
+template <typename T> class PointerLikeTypeTraits<const T> {
+  typedef PointerLikeTypeTraits<T> NonConst;
+
+public:
+  static inline const void *getAsVoidPointer(const T P) {
+    return NonConst::getAsVoidPointer(P);
+  }
+  static inline const T getFromVoidPointer(const void *P) {
+    return NonConst::getFromVoidPointer(const_cast<void *>(P));
+  }
+  enum { NumLowBitsAvailable = NonConst::NumLowBitsAvailable };
+};
+
 // Provide PointerLikeTypeTraits for const pointers.
 template <typename T> class PointerLikeTypeTraits<const T *> {
   typedef PointerLikeTypeTraits<T *> NonConst;
diff --git a/include/llvm/Support/RWMutex.h b/include/llvm/Support/RWMutex.h
index e4736b8e24eb1dcc611f6f7615a7624be544f881..85f4fc09fb87f96fe7ba36b26f3dcac9ad9a3cdd 100644
--- a/include/llvm/Support/RWMutex.h
+++ b/include/llvm/Support/RWMutex.h
@@ -14,7 +14,7 @@
 #ifndef LLVM_SUPPORT_RWMUTEX_H
 #define LLVM_SUPPORT_RWMUTEX_H
 
-#include "llvm/Support/Compiler.h"
+#include "llvm/Config/llvm-config.h"
 #include "llvm/Support/Threading.h"
 #include <cassert>
 
@@ -32,6 +32,13 @@ namespace sys {
       /// @brief Default Constructor.
       explicit RWMutexImpl();
 
+    /// @}
+    /// @name Do Not Implement
+    /// @{
+      RWMutexImpl(const RWMutexImpl & original) = delete;
+      RWMutexImpl &operator=(const RWMutexImpl &) = delete;
+    /// @}
+
       /// Releases and removes the lock
       /// @brief Destructor
       ~RWMutexImpl();
@@ -70,16 +77,8 @@ namespace sys {
     /// @{
     private:
 #if defined(LLVM_ENABLE_THREADS) && LLVM_ENABLE_THREADS != 0
-      void* data_; ///< We don't know what the data will be
+      void* data_ = nullptr; ///< We don't know what the data will be
 #endif
-
-    /// @}
-    /// @name Do Not Implement
-    /// @{
-    private:
-      RWMutexImpl(const RWMutexImpl & original) = delete;
-      void operator=(const RWMutexImpl &) = delete;
-    /// @}
     };
 
     /// SmartMutex - An R/W mutex with a compile time constant parameter that
@@ -93,6 +92,8 @@ namespace sys {
 
     public:
       explicit SmartRWMutex() = default;
+      SmartRWMutex(const SmartRWMutex<mt_only> & original) = delete;
+      SmartRWMutex<mt_only> &operator=(const SmartRWMutex<mt_only> &) = delete;
 
       bool lock_shared() {
         if (!mt_only || llvm_is_multithreaded())
@@ -136,10 +137,6 @@ namespace sys {
         --writers;
         return true;
       }
-
-    private:
-      SmartRWMutex(const SmartRWMutex<mt_only> & original);
-      void operator=(const SmartRWMutex<mt_only> &);
     };
 
     typedef SmartRWMutex<false> RWMutex;
diff --git a/include/llvm/Support/SMLoc.h b/include/llvm/Support/SMLoc.h
index eb3a1ba7db511f670e2992a203d13bfe69e96164..5b8be55055405c9d2043c1fea3169570403e504b 100644
--- a/include/llvm/Support/SMLoc.h
+++ b/include/llvm/Support/SMLoc.h
@@ -22,10 +22,10 @@ namespace llvm {
 
 /// Represents a location in source code.
 class SMLoc {
-  const char *Ptr;
+  const char *Ptr = nullptr;
 
 public:
-  SMLoc() : Ptr(nullptr) {}
+  SMLoc() = default;
 
   bool isValid() const { return Ptr != nullptr; }
 
diff --git a/include/llvm/Support/SourceMgr.h b/include/llvm/Support/SourceMgr.h
index bc7478e0d7031c3a290c6e4875cae9504262bd41..cb90d968c44c51e878fa7a49d1d2ffc0a705266e 100644
--- a/include/llvm/Support/SourceMgr.h
+++ b/include/llvm/Support/SourceMgr.h
@@ -17,18 +17,24 @@
 #define LLVM_SUPPORT_SOURCEMGR_H
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/SMLoc.h"
+#include <algorithm>
+#include <cassert>
+#include <memory>
 #include <string>
+#include <utility>
+#include <vector>
 
 namespace llvm {
-  class SourceMgr;
-  class SMDiagnostic;
-  class SMFixIt;
-  class Twine;
-  class raw_ostream;
+
+class raw_ostream;
+class SMDiagnostic;
+class SMFixIt;
 
 /// This owns the files read by a parser, handles include stacks,
 /// and handles diagnostic wrangling.
@@ -44,6 +50,7 @@ public:
   /// register a function pointer+context as a diagnostic handler.
   /// It gets called each time PrintMessage is invoked.
   typedef void (*DiagHandlerTy)(const SMDiagnostic &, void *Context);
+
 private:
   struct SrcBuffer {
     /// The memory buffer for the file.
@@ -61,18 +68,17 @@ private:
 
   /// This is a cache for line number queries, its implementation is really
   /// private to SourceMgr.cpp.
-  mutable void *LineNoCache;
+  mutable void *LineNoCache = nullptr;
 
-  DiagHandlerTy DiagHandler;
-  void *DiagContext;
+  DiagHandlerTy DiagHandler = nullptr;
+  void *DiagContext = nullptr;
 
   bool isValidBufferID(unsigned i) const { return i && i <= Buffers.size(); }
 
-  SourceMgr(const SourceMgr&) = delete;
-  void operator=(const SourceMgr&) = delete;
 public:
-  SourceMgr()
-    : LineNoCache(nullptr), DiagHandler(nullptr), DiagContext(nullptr) {}
+  SourceMgr() = default;
+  SourceMgr(const SourceMgr &) = delete;
+  SourceMgr &operator=(const SourceMgr &) = delete;
   ~SourceMgr();
 
   void setIncludeDirs(const std::vector<std::string> &Dirs) {
@@ -190,7 +196,6 @@ public:
   void PrintIncludeStack(SMLoc IncludeLoc, raw_ostream &OS) const;
 };
 
-
 /// Represents a single fixit, a replacement of one range of text with another.
 class SMFixIt {
   SMRange Range;
@@ -222,33 +227,31 @@ public:
   }
 };
 
-
 /// Instances of this class encapsulate one diagnostic report, allowing
 /// printing to a raw_ostream as a caret diagnostic.
 class SMDiagnostic {
-  const SourceMgr *SM;
+  const SourceMgr *SM = nullptr;
   SMLoc Loc;
   std::string Filename;
-  int LineNo, ColumnNo;
-  SourceMgr::DiagKind Kind;
+  int LineNo = 0;
+  int ColumnNo = 0;
+  SourceMgr::DiagKind Kind = SourceMgr::DK_Error;
   std::string Message, LineContents;
-  std::vector<std::pair<unsigned, unsigned> > Ranges;
+  std::vector<std::pair<unsigned, unsigned>> Ranges;
   SmallVector<SMFixIt, 4> FixIts;
 
 public:
   // Null diagnostic.
-  SMDiagnostic()
-    : SM(nullptr), LineNo(0), ColumnNo(0), Kind(SourceMgr::DK_Error) {}
+  SMDiagnostic() = default;
   // Diagnostic with no location (e.g. file not found, command line arg error).
   SMDiagnostic(StringRef filename, SourceMgr::DiagKind Knd, StringRef Msg)
-    : SM(nullptr), Filename(filename), LineNo(-1), ColumnNo(-1), Kind(Knd),
-      Message(Msg) {}
+    : Filename(filename), LineNo(-1), ColumnNo(-1), Kind(Knd), Message(Msg) {}
 
   // Diagnostic with a location.
   SMDiagnostic(const SourceMgr &sm, SMLoc L, StringRef FN,
                int Line, int Col, SourceMgr::DiagKind Kind,
                StringRef Msg, StringRef LineStr,
-               ArrayRef<std::pair<unsigned,unsigned> > Ranges,
+               ArrayRef<std::pair<unsigned,unsigned>> Ranges,
                ArrayRef<SMFixIt> FixIts = None);
 
   const SourceMgr *getSourceMgr() const { return SM; }
@@ -259,9 +262,7 @@ public:
   SourceMgr::DiagKind getKind() const { return Kind; }
   StringRef getMessage() const { return Message; }
   StringRef getLineContents() const { return LineContents; }
-  ArrayRef<std::pair<unsigned, unsigned> > getRanges() const {
-    return Ranges;
-  }
+  ArrayRef<std::pair<unsigned, unsigned>> getRanges() const { return Ranges; }
 
   void addFixIt(const SMFixIt &Hint) {
     FixIts.push_back(Hint);
@@ -275,6 +276,6 @@ public:
              bool ShowKindLabel = true) const;
 };
 
-}  // end llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_SUPPORT_SOURCEMGR_H
diff --git a/include/llvm/Support/TargetParser.h b/include/llvm/Support/TargetParser.h
index 63aeca7f4e1eaa98589af5bf3f160408d45889d1..68e6b276581029fb18f6306cbb35342abd162faa 100644
--- a/include/llvm/Support/TargetParser.h
+++ b/include/llvm/Support/TargetParser.h
@@ -142,7 +142,7 @@ unsigned parseArchVersion(StringRef Arch);
 
 } // namespace ARM
 
-// FIXME:This should be made into class design,to avoid dupplication. 
+// FIXME: This should be made into a class design, to avoid duplication.
 namespace AArch64 {
 
 // Arch names.
diff --git a/include/llvm/Support/TargetRegistry.h b/include/llvm/Support/TargetRegistry.h
index 954cdb13abafa435183854db2ce19f4a1ec06725..bd68d241448754a305461e5c3931fcde9be6e040 100644
--- a/include/llvm/Support/TargetRegistry.h
+++ b/include/llvm/Support/TargetRegistry.h
@@ -1,4 +1,4 @@
-//===-- Support/TargetRegistry.h - Target Registration ----------*- C++ -*-===//
+//===- Support/TargetRegistry.h - Target Registration -----------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -20,15 +20,22 @@
 #define LLVM_SUPPORT_TARGETREGISTRY_H
 
 #include "llvm-c/Disassembler.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Support/CodeGen.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
+#include <algorithm>
 #include <cassert>
+#include <cstddef>
+#include <iterator>
 #include <memory>
 #include <string>
 
 namespace llvm {
+
 class AsmPrinter;
 class MCAsmBackend;
 class MCAsmInfo;
@@ -36,22 +43,20 @@ class MCAsmParser;
 class MCCodeEmitter;
 class MCContext;
 class MCDisassembler;
-class MCInstrAnalysis;
 class MCInstPrinter;
+class MCInstrAnalysis;
 class MCInstrInfo;
 class MCRegisterInfo;
+class MCRelocationInfo;
 class MCStreamer;
 class MCSubtargetInfo;
 class MCSymbolizer;
-class MCRelocationInfo;
 class MCTargetAsmParser;
 class MCTargetOptions;
 class MCTargetStreamer;
+class raw_pwrite_stream;
 class TargetMachine;
 class TargetOptions;
-class raw_ostream;
-class raw_pwrite_stream;
-class formatted_raw_ostream;
 
 MCStreamer *createNullStreamer(MCContext &Ctx);
 MCStreamer *createAsmStreamer(MCContext &Ctx,
@@ -68,6 +73,9 @@ MCStreamer *createMachOStreamer(MCContext &Ctx, MCAsmBackend &TAB,
                                 raw_pwrite_stream &OS, MCCodeEmitter *CE,
                                 bool RelaxAll, bool DWARFMustBeAtTheEnd,
                                 bool LabelSections = false);
+MCStreamer *createWasmStreamer(MCContext &Ctx, MCAsmBackend &TAB,
+                               raw_pwrite_stream &OS, MCCodeEmitter *CE,
+                               bool RelaxAll);
 
 MCRelocationInfo *createMCRelocationInfo(const Triple &TT, MCContext &Ctx);
 
@@ -143,6 +151,11 @@ public:
                                             MCCodeEmitter *Emitter,
                                             bool RelaxAll,
                                             bool IncrementalLinkerCompatible);
+  typedef MCStreamer *(*WasmStreamerCtorTy)(const Triple &T, MCContext &Ctx,
+                                            MCAsmBackend &TAB,
+                                            raw_pwrite_stream &OS,
+                                            MCCodeEmitter *Emitter,
+                                            bool RelaxAll);
   typedef MCTargetStreamer *(*NullTargetStreamerCtorTy)(MCStreamer &S);
   typedef MCTargetStreamer *(*AsmTargetStreamerCtorTy)(
       MCStreamer &S, formatted_raw_ostream &OS, MCInstPrinter *InstPrint,
@@ -224,36 +237,33 @@ private:
   MCCodeEmitterCtorTy MCCodeEmitterCtorFn;
 
   // Construction functions for the various object formats, if registered.
-  COFFStreamerCtorTy COFFStreamerCtorFn;
-  MachOStreamerCtorTy MachOStreamerCtorFn;
-  ELFStreamerCtorTy ELFStreamerCtorFn;
+  COFFStreamerCtorTy COFFStreamerCtorFn = nullptr;
+  MachOStreamerCtorTy MachOStreamerCtorFn = nullptr;
+  ELFStreamerCtorTy ELFStreamerCtorFn = nullptr;
+  WasmStreamerCtorTy WasmStreamerCtorFn = nullptr;
 
   /// Construction function for this target's null TargetStreamer, if
   /// registered (default = nullptr).
-  NullTargetStreamerCtorTy NullTargetStreamerCtorFn;
+  NullTargetStreamerCtorTy NullTargetStreamerCtorFn = nullptr;
 
   /// Construction function for this target's asm TargetStreamer, if
   /// registered (default = nullptr).
-  AsmTargetStreamerCtorTy AsmTargetStreamerCtorFn;
+  AsmTargetStreamerCtorTy AsmTargetStreamerCtorFn = nullptr;
 
   /// Construction function for this target's obj TargetStreamer, if
   /// registered (default = nullptr).
-  ObjectTargetStreamerCtorTy ObjectTargetStreamerCtorFn;
+  ObjectTargetStreamerCtorTy ObjectTargetStreamerCtorFn = nullptr;
 
   /// MCRelocationInfoCtorFn - Construction function for this target's
   /// MCRelocationInfo, if registered (default = llvm::createMCRelocationInfo)
-  MCRelocationInfoCtorTy MCRelocationInfoCtorFn;
+  MCRelocationInfoCtorTy MCRelocationInfoCtorFn = nullptr;
 
   /// MCSymbolizerCtorFn - Construction function for this target's
   /// MCSymbolizer, if registered (default = llvm::createMCSymbolizer)
-  MCSymbolizerCtorTy MCSymbolizerCtorFn;
+  MCSymbolizerCtorTy MCSymbolizerCtorFn = nullptr;
 
 public:
-  Target()
-      : COFFStreamerCtorFn(nullptr), MachOStreamerCtorFn(nullptr),
-        ELFStreamerCtorFn(nullptr), NullTargetStreamerCtorFn(nullptr),
-        AsmTargetStreamerCtorFn(nullptr), ObjectTargetStreamerCtorFn(nullptr),
-        MCRelocationInfoCtorFn(nullptr), MCSymbolizerCtorFn(nullptr) {}
+  Target() = default;
 
   /// @name Target Information
   /// @{
@@ -461,6 +471,12 @@ public:
       else
         S = createELFStreamer(Ctx, TAB, OS, Emitter, RelaxAll);
       break;
+    case Triple::Wasm:
+      if (WasmStreamerCtorFn)
+        S = WasmStreamerCtorFn(T, Ctx, TAB, OS, Emitter, RelaxAll);
+      else
+        S = createWasmStreamer(Ctx, TAB, OS, Emitter, RelaxAll);
+      break;
     }
     if (ObjectTargetStreamerCtorFn)
       ObjectTargetStreamerCtorFn(*S, STI);
@@ -548,12 +564,14 @@ struct TargetRegistry {
 
   class iterator
       : public std::iterator<std::forward_iterator_tag, Target, ptrdiff_t> {
-    const Target *Current;
-    explicit iterator(Target *T) : Current(T) {}
     friend struct TargetRegistry;
 
+    const Target *Current = nullptr;
+
+    explicit iterator(Target *T) : Current(T) {}
+
   public:
-    iterator() : Current(nullptr) {}
+    iterator() = default;
 
     bool operator==(const iterator &x) const { return Current == x.Current; }
     bool operator!=(const iterator &x) const { return !operator==(x); }
@@ -800,6 +818,10 @@ struct TargetRegistry {
     T.ELFStreamerCtorFn = Fn;
   }
 
+  static void RegisterWasmStreamer(Target &T, Target::WasmStreamerCtorTy Fn) {
+    T.WasmStreamerCtorFn = Fn;
+  }
+
   static void RegisterNullTargetStreamer(Target &T,
                                          Target::NullTargetStreamerCtorTy Fn) {
     T.NullTargetStreamerCtorFn = Fn;
@@ -1147,6 +1169,7 @@ private:
     return new MCCodeEmitterImpl();
   }
 };
-}
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_SUPPORT_TARGETREGISTRY_H
diff --git a/include/llvm/Support/ThreadPool.h b/include/llvm/Support/ThreadPool.h
index 665cec2465bfc083bce9d2e6c0b35ba9d08d2d76..f0e3ffa0999c262b7a0d8fa38c2c04c71d9301f8 100644
--- a/include/llvm/Support/ThreadPool.h
+++ b/include/llvm/Support/ThreadPool.h
@@ -16,23 +16,8 @@
 
 #include "llvm/Support/thread.h"
 
-#ifdef _MSC_VER
-// concrt.h depends on eh.h for __uncaught_exception declaration
-// even if we disable exceptions.
-#include <eh.h>
-
-// Disable warnings from ppltasks.h transitively included by <future>.
-#pragma warning(push)
-#pragma warning(disable:4530)
-#pragma warning(disable:4062)
-#endif
-
 #include <future>
 
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
 #include <atomic>
 #include <condition_variable>
 #include <functional>
diff --git a/include/llvm/Support/Threading.h b/include/llvm/Support/Threading.h
index 4bef7ec8dd3f240667d339be3e280781ea302d25..03963a24c107eeed24dd39611c6b3d57ea63cb78 100644
--- a/include/llvm/Support/Threading.h
+++ b/include/llvm/Support/Threading.h
@@ -15,16 +15,22 @@
 #ifndef LLVM_SUPPORT_THREADING_H
 #define LLVM_SUPPORT_THREADING_H
 
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/Config/llvm-config.h" // for LLVM_ON_UNIX
 #include "llvm/Support/Compiler.h"
 #include <ciso646> // So we can check the C++ standard lib macros.
 #include <functional>
 
+#if defined(_MSC_VER)
+// MSVC's call_once implementation has worked since VS 2015, which is the
+// minimum supported version as of this writing.
+#define LLVM_THREADING_USE_STD_CALL_ONCE 1
+#elif defined(LLVM_ON_UNIX) &&                                                 \
+    (defined(_LIBCPP_VERSION) ||                                               \
+     !(defined(__NetBSD__) || defined(__OpenBSD__) || defined(__ppc__)))
 // std::call_once from libc++ is used on all Unix platforms. Other
 // implementations like libstdc++ are known to have problems on NetBSD,
 // OpenBSD and PowerPC.
-#if defined(LLVM_ON_UNIX) && (defined(_LIBCPP_VERSION) ||                      \
-    !(defined(__NetBSD__) || defined(__OpenBSD__) || defined(__ppc__)))
 #define LLVM_THREADING_USE_STD_CALL_ONCE 1
 #else
 #define LLVM_THREADING_USE_STD_CALL_ONCE 0
@@ -37,41 +43,43 @@
 #endif
 
 namespace llvm {
-  /// Returns true if LLVM is compiled with support for multi-threading, and
-  /// false otherwise.
-  bool llvm_is_multithreaded();
-
-  /// llvm_execute_on_thread - Execute the given \p UserFn on a separate
-  /// thread, passing it the provided \p UserData and waits for thread
-  /// completion.
-  ///
-  /// This function does not guarantee that the code will actually be executed
-  /// on a separate thread or honoring the requested stack size, but tries to do
-  /// so where system support is available.
-  ///
-  /// \param UserFn - The callback to execute.
-  /// \param UserData - An argument to pass to the callback function.
-  /// \param RequestedStackSize - If non-zero, a requested size (in bytes) for
-  /// the thread stack.
-  void llvm_execute_on_thread(void (*UserFn)(void*), void *UserData,
-                              unsigned RequestedStackSize = 0);
+class Twine;
+
+/// Returns true if LLVM is compiled with support for multi-threading, and
+/// false otherwise.
+bool llvm_is_multithreaded();
+
+/// llvm_execute_on_thread - Execute the given \p UserFn on a separate
+/// thread, passing it the provided \p UserData and waits for thread
+/// completion.
+///
+/// This function does not guarantee that the code will actually be executed
+/// on a separate thread or honoring the requested stack size, but tries to do
+/// so where system support is available.
+///
+/// \param UserFn - The callback to execute.
+/// \param UserData - An argument to pass to the callback function.
+/// \param RequestedStackSize - If non-zero, a requested size (in bytes) for
+/// the thread stack.
+void llvm_execute_on_thread(void (*UserFn)(void *), void *UserData,
+                            unsigned RequestedStackSize = 0);
 
 #if LLVM_THREADING_USE_STD_CALL_ONCE
 
   typedef std::once_flag once_flag;
 
-  /// This macro is the only way you should define your once flag for LLVM's
-  /// call_once.
-#define LLVM_DEFINE_ONCE_FLAG(flag) static once_flag flag
-
 #else
 
   enum InitStatus { Uninitialized = 0, Wait = 1, Done = 2 };
-  typedef volatile sys::cas_flag once_flag;
 
-  /// This macro is the only way you should define your once flag for LLVM's
-  /// call_once.
-#define LLVM_DEFINE_ONCE_FLAG(flag) static once_flag flag = Uninitialized
+  /// \brief The llvm::once_flag structure
+  ///
+  /// This type is modeled after std::once_flag to use with llvm::call_once.
+  /// This structure must be used as an opaque object. It is a struct to force
+  /// autoinitialization and behave like std::once_flag.
+  struct once_flag {
+    volatile sys::cas_flag status = Uninitialized;
+  };
 
 #endif
 
@@ -81,7 +89,7 @@ namespace llvm {
   /// \code
   ///   void foo() {...};
   ///   ...
-  ///   LLVM_DEFINE_ONCE_FLAG(flag);
+  ///   static once_flag flag;
   ///   call_once(flag, foo);
   /// \endcode
   ///
@@ -95,24 +103,24 @@ namespace llvm {
 #else
     // For other platforms we use a generic (if brittle) version based on our
     // atomics.
-    sys::cas_flag old_val = sys::CompareAndSwap(&flag, Wait, Uninitialized);
+    sys::cas_flag old_val = sys::CompareAndSwap(&flag.status, Wait, Uninitialized);
     if (old_val == Uninitialized) {
       std::forward<Function>(F)(std::forward<Args>(ArgList)...);
       sys::MemoryFence();
       TsanIgnoreWritesBegin();
-      TsanHappensBefore(&flag);
-      flag = Done;
+      TsanHappensBefore(&flag.status);
+      flag.status = Done;
       TsanIgnoreWritesEnd();
     } else {
       // Wait until any thread doing the call has finished.
-      sys::cas_flag tmp = flag;
+      sys::cas_flag tmp = flag.status;
       sys::MemoryFence();
       while (tmp != Done) {
-        tmp = flag;
+        tmp = flag.status;
         sys::MemoryFence();
       }
     }
-    TsanHappensAfter(&flag);
+    TsanHappensAfter(&flag.status);
 #endif
   }
 
@@ -122,6 +130,32 @@ namespace llvm {
   /// thread::hardware_concurrency().
   /// Returns 1 when LLVM is configured with LLVM_ENABLE_THREADS=OFF
   unsigned heavyweight_hardware_concurrency();
+
+  /// \brief Return the current thread id, as used in various OS system calls.
+  /// Note that not all platforms guarantee that the value returned will be
+  /// unique across the entire system, so portable code should not assume
+  /// this.
+  uint64_t get_threadid();
+
+  /// \brief Get the maximum length of a thread name on this platform.
+  /// A value of 0 means there is no limit.
+  uint32_t get_max_thread_name_length();
+
+  /// \brief Set the name of the current thread.  Setting a thread's name can
+  /// be helpful for enabling useful diagnostics under a debugger or when
+  /// logging.  The level of support for setting a thread's name varies
+  /// wildly across operating systems, and we only make a best effort to
+  /// perform the operation on supported platforms.  No indication of success
+  /// or failure is returned.
+  void set_thread_name(const Twine &Name);
+
+  /// \brief Get the name of the current thread.  The level of support for
+  /// getting a thread's name varies wildly across operating systems, and it
+  /// is not even guaranteed that if you can successfully set a thread's name
+  /// that you can later get it back.  This function is intended for diagnostic
+  /// purposes, and as with setting a thread's name no indication of whether
+  /// the operation succeeded or failed is returned.
+  void get_thread_name(SmallVectorImpl<char> &Name);
 }
 
 #endif
diff --git a/include/llvm/Support/Timer.h b/include/llvm/Support/Timer.h
index 80e8f13dccfe9d70e19a7f3bc9b7d4d923b75b4e..198855ae03775dcb40d403457f11a9d63e608f03 100644
--- a/include/llvm/Support/Timer.h
+++ b/include/llvm/Support/Timer.h
@@ -207,6 +207,9 @@ public:
   /// This static method prints all timers and clears them all out.
   static void printAll(raw_ostream &OS);
 
+  /// Prints all timers as JSON key/value pairs, and clears them all out.
+  static const char *printAllJSONValues(raw_ostream &OS, const char *delim);
+
   /// Ensure global timer group lists are initialized. This function is mostly
   /// used by the Statistic code to influence the construction and destruction
   /// order of the global timer lists.
@@ -221,7 +224,6 @@ private:
   void printJSONValue(raw_ostream &OS, const PrintRecord &R,
                       const char *suffix, double Value);
   const char *printJSONValues(raw_ostream &OS, const char *delim);
-  static const char *printAllJSONValues(raw_ostream &OS, const char *delim);
 };
 
 } // end namespace llvm
diff --git a/include/llvm/Support/TrailingObjects.h b/include/llvm/Support/TrailingObjects.h
index 4d355724149c271861165d7f18aa35dce0074dd3..cb5a52b0d861b121e7312e462fef8f9cf88c8365 100644
--- a/include/llvm/Support/TrailingObjects.h
+++ b/include/llvm/Support/TrailingObjects.h
@@ -294,7 +294,14 @@ class TrailingObjects : private trailing_objects_internal::TrailingObjectsImpl<
 
 public:
   // Make this (privately inherited) member public.
+#ifndef _MSC_VER
   using ParentType::OverloadToken;
+#else
+  // MSVC bug prevents the above from working, at least up through CL
+  // 19.10.24629.
+  template <typename T>
+  using OverloadToken = typename ParentType::template OverloadToken<T>;
+#endif
 
   /// Returns a pointer to the trailing object array of the given type
   /// (which must be one of those specified in the class template). The
diff --git a/include/llvm/Support/UniqueLock.h b/include/llvm/Support/UniqueLock.h
index 529284d3868bcd7d5fac784967f1d176ed169f00..b4675f4b43aeb5cf52d64ee1647a67262a575d9d 100644
--- a/include/llvm/Support/UniqueLock.h
+++ b/include/llvm/Support/UniqueLock.h
@@ -1,4 +1,4 @@
-//===-- Support/UniqueLock.h - Acquire/Release Mutex In Scope ---*- C++ -*-===//
+//===- Support/UniqueLock.h - Acquire/Release Mutex In Scope ----*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,9 +15,10 @@
 #ifndef LLVM_SUPPORT_UNIQUE_LOCK_H
 #define LLVM_SUPPORT_UNIQUE_LOCK_H
 
-#include "llvm/Support/Mutex.h"
+#include <cassert>
 
 namespace llvm {
+
   /// A pared-down imitation of std::unique_lock from C++11. Contrary to the
   /// name, it's really more of a wrapper for a lock. It may or may not have
   /// an associated mutex, which is guaranteed to be locked upon creation
@@ -26,14 +27,14 @@ namespace llvm {
   /// @brief Guard a section of code with a mutex.
   template<typename MutexT>
   class unique_lock {
-    MutexT *M;
-    bool locked;
+    MutexT *M = nullptr;
+    bool locked = false;
 
-    unique_lock(const unique_lock &) = delete;
-    void operator=(const unique_lock &) = delete;
   public:
-    unique_lock() : M(nullptr), locked(false) {}
+    unique_lock() = default;
     explicit unique_lock(MutexT &m) : M(&m), locked(true) { M->lock(); }
+    unique_lock(const unique_lock &) = delete;
+     unique_lock &operator=(const unique_lock &) = delete;
 
     void operator=(unique_lock &&o) {
       if (owns_lock())
@@ -62,6 +63,7 @@ namespace llvm {
 
     bool owns_lock() { return locked; }
   };
-}
+
+} // end namespace llvm
 
 #endif // LLVM_SUPPORT_UNIQUE_LOCK_H
diff --git a/include/llvm/Support/Wasm.h b/include/llvm/Support/Wasm.h
index b45149577a0ae4a24101eabb9f7b2eace4fa2f45..8e6c418c8189a3ca7b8a91787a2f0033149fc33e 100644
--- a/include/llvm/Support/Wasm.h
+++ b/include/llvm/Support/Wasm.h
@@ -23,18 +23,90 @@ namespace wasm {
 // Object file magic string.
 const char WasmMagic[] = {'\0', 'a', 's', 'm'};
 // Wasm binary format version
-const uint32_t WasmVersion = 0xd;
+const uint32_t WasmVersion = 0x1;
 
 struct WasmObjectHeader {
   StringRef Magic;
   uint32_t Version;
 };
 
-struct WasmSection {
-  uint32_t Type;             // Section type (See below)
-  uint32_t Offset;           // Offset with in the file
-  StringRef Name;            // Section name (User-defined sections only)
-  ArrayRef<uint8_t> Content; // Section content
+struct WasmSignature {
+  std::vector<int32_t> ParamTypes;
+  int32_t ReturnType;
+};
+
+struct WasmImport {
+  StringRef Module;
+  StringRef Field;
+  uint32_t Kind;
+  union {
+    uint32_t SigIndex;
+    int32_t GlobalType;
+  };
+  bool GlobalMutable;
+};
+
+struct WasmExport {
+  StringRef Name;
+  uint32_t Kind;
+  uint32_t Index;
+};
+
+struct WasmLimits {
+  uint32_t Flags;
+  uint32_t Initial;
+  uint32_t Maximum;
+};
+
+struct WasmTable {
+  int32_t ElemType;
+  WasmLimits Limits;
+};
+
+struct WasmInitExpr {
+  uint8_t Opcode;
+  union {
+    int32_t Int32;
+    int64_t Int64;
+    int32_t Float32;
+    int64_t Float64;
+    uint32_t Global;
+  } Value;
+};
+
+struct WasmGlobal {
+  int32_t Type;
+  bool Mutable;
+  WasmInitExpr InitExpr;
+};
+
+struct WasmLocalDecl {
+  int32_t Type;
+  uint32_t Count;
+};
+
+struct WasmFunction {
+  std::vector<WasmLocalDecl> Locals;
+  ArrayRef<uint8_t> Body;
+};
+
+struct WasmDataSegment {
+  uint32_t Index;
+  WasmInitExpr Offset;
+  ArrayRef<uint8_t> Content;
+};
+
+struct WasmElemSegment {
+  uint32_t TableIndex;
+  WasmInitExpr Offset;
+  std::vector<uint32_t> Functions;
+};
+
+struct WasmRelocation {
+  uint32_t Type;         // The type of the relocation.
+  int32_t Index;         // Index into function to global index space.
+  uint64_t Offset;       // Offset from the start of the section.
+  uint64_t Addend;       // A value to add to the symbol.
 };
 
 enum : unsigned {
@@ -53,14 +125,14 @@ enum : unsigned {
 };
 
 // Type immediate encodings used in various contexts.
-enum : unsigned {
-  WASM_TYPE_I32          = 0x7f,
-  WASM_TYPE_I64          = 0x7e,
-  WASM_TYPE_F32          = 0x7d,
-  WASM_TYPE_F64          = 0x7c,
-  WASM_TYPE_ANYFUNC      = 0x70,
-  WASM_TYPE_FUNC         = 0x60,
-  WASM_TYPE_NORESULT     = 0x40, // for blocks with no result values
+enum {
+  WASM_TYPE_I32          = -0x01,
+  WASM_TYPE_I64          = -0x02,
+  WASM_TYPE_F32          = -0x03,
+  WASM_TYPE_F64          = -0x04,
+  WASM_TYPE_ANYFUNC      = -0x10,
+  WASM_TYPE_FUNC         = -0x20,
+  WASM_TYPE_NORESULT     = -0x40, // for blocks with no result values
 };
 
 // Kinds of externals (for imports and exports).
@@ -81,6 +153,49 @@ enum : unsigned {
   WASM_OPCODE_F64_CONST  = 0x44,
 };
 
+enum : unsigned {
+  WASM_NAMES_FUNCTION    = 0x1,
+  WASM_NAMES_LOCAL       = 0x2,
+};
+
+enum : unsigned {
+  WASM_LIMITS_FLAG_HAS_MAX = 0x1,
+};
+
+// Subset of types that a value can have
+enum class ValType {
+  I32 = WASM_TYPE_I32,
+  I64 = WASM_TYPE_I64,
+  F32 = WASM_TYPE_F32,
+  F64 = WASM_TYPE_F64,
+};
+
+// Linking metadata kinds.
+enum : unsigned {
+  WASM_STACK_POINTER = 0x1,
+};
+
+#define WASM_RELOC(name, value) name = value,
+
+enum : unsigned {
+#include "WasmRelocs/WebAssembly.def"
+};
+
+#undef WASM_RELOC
+
+struct Global {
+  ValType Type;
+  bool Mutable;
+
+  // The initial value for this global is either the value of an imported
+  // global, in which case InitialModule and InitialName specify the global
+  // import, or a value, in which case InitialModule is empty and InitialValue
+  // holds the value.
+  StringRef InitialModule;
+  StringRef InitialName;
+  uint64_t InitialValue;
+};
+
 } // end namespace wasm
 } // end namespace llvm
 
diff --git a/include/llvm/Support/WasmRelocs/WebAssembly.def b/include/llvm/Support/WasmRelocs/WebAssembly.def
new file mode 100644
index 0000000000000000000000000000000000000000..da64e025478dee6077fffa0bedaea4b39852ade1
--- /dev/null
+++ b/include/llvm/Support/WasmRelocs/WebAssembly.def
@@ -0,0 +1,13 @@
+
+#ifndef WASM_RELOC
+#error "WASM_RELOC must be defined"
+#endif
+
+WASM_RELOC(R_WEBASSEMBLY_FUNCTION_INDEX_LEB,   0)
+WASM_RELOC(R_WEBASSEMBLY_TABLE_INDEX_SLEB,     1)
+WASM_RELOC(R_WEBASSEMBLY_TABLE_INDEX_I32,      2)
+WASM_RELOC(R_WEBASSEMBLY_GLOBAL_ADDR_LEB,      3)
+WASM_RELOC(R_WEBASSEMBLY_GLOBAL_ADDR_SLEB,     4)
+WASM_RELOC(R_WEBASSEMBLY_GLOBAL_ADDR_I32,      5)
+WASM_RELOC(R_WEBASSEMBLY_TYPE_INDEX_LEB,       6)
+WASM_RELOC(R_WEBASSEMBLY_GLOBAL_INDEX_LEB,     7)
diff --git a/include/llvm/Support/YAMLTraits.h b/include/llvm/Support/YAMLTraits.h
index cbba9c08275a05337ee1b83c5e11c320eb48a5c1..6d02e4aba48a7b94a3d8a2744f6a0420b4a4cbb2 100644
--- a/include/llvm/Support/YAMLTraits.h
+++ b/include/llvm/Support/YAMLTraits.h
@@ -689,11 +689,12 @@ private:
     assert(DefaultValue.hasValue() == false &&
            "Optional<T> shouldn't have a value!");
     void *SaveInfo;
-    bool UseDefault;
+    bool UseDefault = true;
     const bool sameAsDefault = outputting() && !Val.hasValue();
     if (!outputting() && !Val.hasValue())
       Val = T();
-    if (this->preflightKey(Key, Required, sameAsDefault, UseDefault,
+    if (Val.hasValue() &&
+        this->preflightKey(Key, Required, sameAsDefault, UseDefault,
                            SaveInfo)) {
       yamlize(*this, Val.getValue(), Required, Ctx);
       this->postflightKey(SaveInfo);
@@ -731,7 +732,7 @@ private:
   }
 
 private:
-  void  *Ctxt;
+  void *Ctxt;
 };
 
 namespace detail {
@@ -1251,6 +1252,13 @@ public:
   Output(llvm::raw_ostream &, void *Ctxt = nullptr, int WrapColumn = 70);
   ~Output() override;
 
+  /// \brief Set whether or not to output optional values which are equal
+  /// to the default value.  By default, when outputting, if you attempt
+  /// to write a value that is equal to the default, the value gets ignored.
+  /// Sometimes, it is useful to be able to see these in the resulting YAML
+  /// anyway.
+  void setWriteDefaultValues(bool Write) { WriteDefaultValues = Write; }
+
   bool outputting() override;
   bool mapTag(StringRef, bool) override;
   void beginMapping() override;
@@ -1314,6 +1322,7 @@ private:
   bool                     NeedFlowSequenceComma;
   bool                     EnumerationMatchFound;
   bool                     NeedsNewLine;
+  bool WriteDefaultValues;
 };
 
 /// YAML I/O does conversion based on types. But often native data types
diff --git a/include/llvm/Support/thread.h b/include/llvm/Support/thread.h
index 9c45418df55c2da3aa4ebff5551fa88496fa7be2..787a513d6017616283ae249f00ae28e5fc973379 100644
--- a/include/llvm/Support/thread.h
+++ b/include/llvm/Support/thread.h
@@ -21,22 +21,8 @@
 
 #if LLVM_ENABLE_THREADS
 
-#ifdef _MSC_VER
-// concrt.h depends on eh.h for __uncaught_exception declaration
-// even if we disable exceptions.
-#include <eh.h>
-
-// Suppress 'C++ exception handler used, but unwind semantics are not enabled.'
-#pragma warning(push)
-#pragma warning(disable:4530)
-#endif
-
 #include <thread>
 
-#ifdef _MSC_VER
-#pragma warning(pop)
-#endif
-
 namespace llvm {
 typedef std::thread thread;
 }
diff --git a/include/llvm/Support/type_traits.h b/include/llvm/Support/type_traits.h
index 7706ff527197e3fb96b6c1ac03514262013117b3..ce4bbf8cb2cc5c6603fabb16506443d6af878da7 100644
--- a/include/llvm/Support/type_traits.h
+++ b/include/llvm/Support/type_traits.h
@@ -95,6 +95,15 @@ struct add_const_past_pointer<
   typedef const typename std::remove_pointer<T>::type *type;
 };
 
+template <typename T, typename Enable = void>
+struct const_pointer_or_const_ref {
+  using type = const T &;
+};
+template <typename T>
+struct const_pointer_or_const_ref<
+    T, typename std::enable_if<std::is_pointer<T>::value>::type> {
+  using type = typename add_const_past_pointer<T>::type;
+};
 }
 
 // If the compiler supports detecting whether a class is final, define
diff --git a/include/llvm/TableGen/Record.h b/include/llvm/TableGen/Record.h
index 271804eb733be727a4abb872d1620eda0b4b24aa..fef5bf304566677fa4c126f33646b128177da8bd 100644
--- a/include/llvm/TableGen/Record.h
+++ b/include/llvm/TableGen/Record.h
@@ -1196,6 +1196,9 @@ public:
 
   inline const_arg_iterator  arg_begin() const { return Args.begin(); }
   inline const_arg_iterator  arg_end  () const { return Args.end();   }
+  inline iterator_range<const_arg_iterator> args() const {
+    return llvm::make_range(arg_begin(), arg_end());
+  }
 
   inline size_t              arg_size () const { return Args.size();  }
   inline bool                arg_empty() const { return Args.empty(); }
diff --git a/include/llvm/TableGen/StringMatcher.h b/include/llvm/TableGen/StringMatcher.h
index b4387791083460e77cc9c6d6e6066717bac8096f..11a8ad8183aad823b0d1185fd8392b1a07ce2259 100644
--- a/include/llvm/TableGen/StringMatcher.h
+++ b/include/llvm/TableGen/StringMatcher.h
@@ -38,7 +38,7 @@ private:
   raw_ostream &OS;
 
 public:
-  StringMatcher(StringRef strVariableName, 
+  StringMatcher(StringRef strVariableName,
                 const std::vector<StringPair> &matches, raw_ostream &os)
     : StrVariableName(strVariableName), Matches(matches), OS(os) {}
 
diff --git a/include/llvm/TableGen/StringToOffsetTable.h b/include/llvm/TableGen/StringToOffsetTable.h
index e5b61ed1195e705d9695691c1c05e2759fb55769..aaf2a356ffab6f6cd4a83e3c2a63b56ea374cf2c 100644
--- a/include/llvm/TableGen/StringToOffsetTable.h
+++ b/include/llvm/TableGen/StringToOffsetTable.h
@@ -60,10 +60,10 @@ public:
       if (AggregateString[i] != '\\')
         continue;
 
-      assert(i+1 < AggregateString.size() && "Incomplete escape sequence!");
-      if (isdigit(AggregateString[i+1])) {
-        assert(isdigit(AggregateString[i+2]) && 
-               isdigit(AggregateString[i+3]) &&
+      assert(i + 1 < AggregateString.size() && "Incomplete escape sequence!");
+      if (isdigit(AggregateString[i + 1])) {
+        assert(isdigit(AggregateString[i + 2]) &&
+               isdigit(AggregateString[i + 3]) &&
                "Expected 3 digit octal escape!");
         O << AggregateString[++i];
         O << AggregateString[++i];
diff --git a/include/llvm/Target/GenericOpcodes.td b/include/llvm/Target/GenericOpcodes.td
index d3b2835d1bd403a4fad743400d5cee6c8d95f5d6..de3796cd4ee5639771006646502fb4ff5d7a60b8 100644
--- a/include/llvm/Target/GenericOpcodes.td
+++ b/include/llvm/Target/GenericOpcodes.td
@@ -91,6 +91,21 @@ def G_FCONSTANT : Instruction {
   let hasSideEffects = 0;
 }
 
+def G_VASTART : Instruction {
+  let OutOperandList = (outs);
+  let InOperandList = (ins type0:$list);
+  let hasSideEffects = 0;
+  let mayStore = 1;
+}
+
+def G_VAARG : Instruction {
+  let OutOperandList = (outs type0:$val);
+  let InOperandList = (ins type1:$list, unknown:$align);
+  let hasSideEffects = 0;
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+
 //------------------------------------------------------------------------------
 // Binary ops.
 //------------------------------------------------------------------------------
@@ -103,13 +118,6 @@ def G_ADD : Instruction {
   let isCommutable = 1;
 }
 
-// Generic pointer offset.
-def G_GEP : Instruction {
-  let OutOperandList = (outs type0:$dst);
-  let InOperandList = (ins type0:$src1, type1:$src2);
-  let hasSideEffects = 0;
-}
-
 // Generic subtraction.
 def G_SUB : Instruction {
   let OutOperandList = (outs type0:$dst);
@@ -224,6 +232,19 @@ def G_SELECT : Instruction {
   let hasSideEffects = 0;
 }
 
+// Generic pointer offset.
+def G_GEP : Instruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type1:$src2);
+  let hasSideEffects = 0;
+}
+
+def G_PTR_MASK : Instruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src, unknown:$bits);
+  let hasSideEffects = 0;
+}
+
 //------------------------------------------------------------------------------
 // Overflow ops
 //------------------------------------------------------------------------------
@@ -273,10 +294,34 @@ def G_SMULO : Instruction {
   let isCommutable = 1;
 }
 
+// Multiply two numbers at twice the incoming bit width (unsigned) and return
+// the high half of the result.
+def G_UMULH : Instruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type0:$src2);
+  let hasSideEffects = 0;
+  let isCommutable = 1;
+}
+
+// Multiply two numbers at twice the incoming bit width (signed) and return
+// the high half of the result.
+def G_SMULH : Instruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type0:$src2);
+  let hasSideEffects = 0;
+  let isCommutable = 1;
+}
+
 //------------------------------------------------------------------------------
 // Floating Point Unary Ops.
 //------------------------------------------------------------------------------
 
+def G_FNEG : Instruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src);
+  let hasSideEffects = 0;
+}
+
 def G_FPEXT : Instruction {
   let OutOperandList = (outs type0:$dst);
   let InOperandList = (ins type1:$src);
@@ -355,6 +400,13 @@ def G_FREM : Instruction {
   let hasSideEffects = 0;
 }
 
+// Floating point exponentiation.
+def G_FPOW : Instruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1, type0:$src2);
+  let hasSideEffects = 0;
+}
+
 //------------------------------------------------------------------------------
 // Memory ops
 //------------------------------------------------------------------------------
@@ -383,17 +435,24 @@ def G_STORE : Instruction {
 // indexes. This will almost certainly be mapped to sub-register COPYs after
 // register banks have been selected.
 def G_EXTRACT : Instruction {
+  let OutOperandList = (outs type0:$res);
+  let InOperandList = (ins type1:$src, unknown:$offset);
+  let hasSideEffects = 0;
+}
+
+// Extract multiple registers of the specified size, starting from blocks
+// given by indexes. This will almost certainly be mapped to sub-register
+// COPYs after register banks have been selected.
+def G_UNMERGE_VALUES : Instruction {
   let OutOperandList = (outs);
   let InOperandList = (ins variable_ops);
   let hasSideEffects = 0;
 }
 
-// Insert a sequence of smaller registers into a larger one at the specified
-// indices (interleaved with the values in the operand list "op0, bit0, op1,
-// bit1, ...")).
+// Insert a smaller register into a larger one at the specified bit-index.
 def G_INSERT : Instruction {
   let OutOperandList = (outs type0:$dst);
-  let InOperandList = (ins type0:$src, variable_ops);
+  let InOperandList = (ins type0:$src, type1:$op, unknown:$offset);
   let hasSideEffects = 0;
 }
 
@@ -406,6 +465,12 @@ def G_SEQUENCE : Instruction {
   let hasSideEffects = 0;
 }
 
+def G_MERGE_VALUES : Instruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins variable_ops);
+  let hasSideEffects = 0;
+}
+
 // Intrinsic without side effects.
 def G_INTRINSIC : Instruction {
   let OutOperandList = (outs);
@@ -454,4 +519,29 @@ def G_BRINDIRECT : Instruction {
   let isTerminator = 1;
 }
 
+//------------------------------------------------------------------------------
+// Vector ops
+//------------------------------------------------------------------------------
+
+// Generic insertelement.
+def G_INSERT_VECTOR_ELT : Instruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src, type1:$elt, type2:$idx);
+  let hasSideEffects = 0;
+}
+
+// Generic extractelement.
+def G_EXTRACT_VECTOR_ELT : Instruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type1:$src, type2:$idx);
+  let hasSideEffects = 0;
+}
+
+// Generic shufflevector.
+def G_SHUFFLE_VECTOR: Instruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type1:$v1, type1:$v2, type2:$mask);
+  let hasSideEffects = 0;
+}
+
 // TODO: Add the other generic opcodes.
diff --git a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
index 60fdf1c3dd3e9216ad801e52a15bbe64509440f7..9f034220815f21c887b8035335b80fdc15fabc09 100644
--- a/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
+++ b/include/llvm/Target/GlobalISel/SelectionDAGCompat.td
@@ -25,6 +25,8 @@ class GINodeEquiv<Instruction i, SDNode node> {
   SDNode Node = node;
 }
 
+def : GINodeEquiv<G_ZEXT, zext>;
+def : GINodeEquiv<G_SEXT, sext>;
 def : GINodeEquiv<G_ADD, add>;
 def : GINodeEquiv<G_SUB, sub>;
 def : GINodeEquiv<G_MUL, mul>;
@@ -43,3 +45,9 @@ def : GINodeEquiv<G_SREM, srem>;
 def : GINodeEquiv<G_UREM, urem>;
 
 def : GINodeEquiv<G_BR, br>;
+
+// Specifies the GlobalISel equivalents for SelectionDAG's ComplexPattern.
+// Should be used on defs that subclass GIComplexOperandMatcher<>.
+class GIComplexPatternEquiv<ComplexPattern seldag> {
+  ComplexPattern SelDAGEquivalent = seldag;
+}
diff --git a/include/llvm/Target/GlobalISel/Target.td b/include/llvm/Target/GlobalISel/Target.td
new file mode 100644
index 0000000000000000000000000000000000000000..fa1a424b589542e8a090a0c6eca0bae51fca7004
--- /dev/null
+++ b/include/llvm/Target/GlobalISel/Target.td
@@ -0,0 +1,56 @@
+//===- Target.td - Define GlobalISel rules -----------------*- tablegen -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the target-independent interfaces used to support
+// SelectionDAG instruction selection patterns (specified in
+// TargetSelectionDAG.td) when generating GlobalISel instruction selectors.
+//
+// This is intended as a compatibility layer, to enable reuse of target
+// descriptions written for SelectionDAG without requiring explicit GlobalISel
+// support.  It will eventually supersede SelectionDAG patterns.
+//
+//===----------------------------------------------------------------------===//
+
+// Definitions that inherit from LLT define types that will be used in the
+// GlobalISel matcher.
+class LLT;
+
+def s32 : LLT;
+def s64 : LLT;
+
+// Defines a matcher for complex operands. This is analogous to ComplexPattern
+// from SelectionDAG.
+//
+// Definitions that inherit from this may also inherit from
+// GIComplexPatternEquiv to enable the import of SelectionDAG patterns involving
+// those ComplexPatterns.
+class GIComplexOperandMatcher<LLT type, dag operands, string matcherfn> {
+  // The expected type of the root of the match.
+  //
+  // TODO: We should probably support any-type, any-scalar, and multiple types
+  //       in the future.
+  LLT Type = type;
+
+  // The operands that result from a successful match
+  // Should be of the form '(ops ty1, ty2, ...)' where ty1/ty2 are definitions
+  // that inherit from Operand.
+  //
+  // FIXME: Which definition is used for ty1/ty2 doesn't actually matter at the
+  //        moment. Only the number of operands is used.
+  dag Operands = operands;
+
+  // The function that determines whether the operand matches. It should be of
+  // the form:
+  //   bool select(const MatchOperand &Root, MatchOperand &Result1)
+  // and should have the same number of ResultX arguments as the number of
+  // result operands. It must return true on successful match and false
+  // otherwise. If it returns true, then all the ResultX arguments must be
+  // overwritten.
+  string MatcherFn = matcherfn;
+}
diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td
index e50969d7001d7165a7dab127ece8b81332edc695..b21689e0e1346e9bcac5b02fc626f5b73bd02e1a 100644
--- a/include/llvm/Target/Target.td
+++ b/include/llvm/Target/Target.td
@@ -402,11 +402,8 @@ class Instruction {
                             // If so, make sure to override
                             // TargetInstrInfo::getInsertSubregLikeInputs.
 
-  // Side effect flags - When set, the flags have these meanings:
-  //
-  //  hasSideEffects - The instruction has side effects that are not
-  //    captured by any operands of the instruction or other flags.
-  //
+  // Does the instruction have side effects that are not captured by any
+  // operands of the instruction or other flags?
   bit hasSideEffects = ?;
 
   // Is this instruction a "real" instruction (with a distinct machine
@@ -951,11 +948,12 @@ def LOCAL_ESCAPE : Instruction {
   let hasSideEffects = 0;
   let hasCtrlDep = 1;
 }
-def FAULTING_LOAD_OP : Instruction {
+def FAULTING_OP : Instruction {
   let OutOperandList = (outs unknown:$dst);
   let InOperandList = (ins variable_ops);
   let usesCustomInserter = 1;
   let mayLoad = 1;
+  let mayStore = 1;
   let isTerminator = 1;
   let isBranch = 1;
 }
@@ -998,6 +996,15 @@ def PATCHABLE_TAIL_CALL : Instruction {
   let hasSideEffects = 1;
   let isReturn = 1;
 }
+def FENTRY_CALL : Instruction {
+  let OutOperandList = (outs unknown:$dst);
+  let InOperandList = (ins variable_ops);
+  let AsmString = "# FEntry call";
+  let usesCustomInserter = 1;
+  let mayLoad = 1;
+  let mayStore = 1;
+  let hasSideEffects = 1;
+}
 
 // Generic opcodes used in GlobalISel.
 include "llvm/Target/GenericOpcodes.td"
@@ -1346,6 +1353,11 @@ include "llvm/Target/TargetSelectionDAG.td"
 //
 include "llvm/Target/GlobalISel/RegisterBank.td"
 
+//===----------------------------------------------------------------------===//
+// Pull in the common support for Global ISel generation.
+//
+include "llvm/Target/GlobalISel/Target.td"
+
 //===----------------------------------------------------------------------===//
 // Pull in the common support for the Global ISel DAG-based selector generation.
 //
diff --git a/include/llvm/Target/TargetCallingConv.h b/include/llvm/Target/TargetCallingConv.h
index 779b5add80f287d963359f27f1fa455706e77dd6..4f750b8a289ff743e6987e942adf01c13c859e62 100644
--- a/include/llvm/Target/TargetCallingConv.h
+++ b/include/llvm/Target/TargetCallingConv.h
@@ -26,136 +26,108 @@ namespace ISD {
 
   struct ArgFlagsTy {
   private:
-    static const uint64_t NoFlagSet      = 0ULL;
-    static const uint64_t ZExt           = 1ULL<<0;  ///< Zero extended
-    static const uint64_t ZExtOffs       = 0;
-    static const uint64_t SExt           = 1ULL<<1;  ///< Sign extended
-    static const uint64_t SExtOffs       = 1;
-    static const uint64_t InReg          = 1ULL<<2;  ///< Passed in register
-    static const uint64_t InRegOffs      = 2;
-    static const uint64_t SRet           = 1ULL<<3;  ///< Hidden struct-ret ptr
-    static const uint64_t SRetOffs       = 3;
-    static const uint64_t ByVal          = 1ULL<<4;  ///< Struct passed by value
-    static const uint64_t ByValOffs      = 4;
-    static const uint64_t Nest           = 1ULL<<5;  ///< Nested fn static chain
-    static const uint64_t NestOffs       = 5;
-    static const uint64_t Returned       = 1ULL<<6;  ///< Always returned
-    static const uint64_t ReturnedOffs   = 6;
-    static const uint64_t ByValAlign     = 0xFULL<<7; ///< Struct alignment
-    static const uint64_t ByValAlignOffs = 7;
-    static const uint64_t Split          = 1ULL<<11;
-    static const uint64_t SplitOffs      = 11;
-    static const uint64_t InAlloca       = 1ULL<<12; ///< Passed with inalloca
-    static const uint64_t InAllocaOffs   = 12;
-    static const uint64_t SplitEnd       = 1ULL<<13; ///< Last part of a split
-    static const uint64_t SplitEndOffs   = 13;
-    static const uint64_t SwiftSelf      = 1ULL<<14; ///< Swift self parameter
-    static const uint64_t SwiftSelfOffs  = 14;
-    static const uint64_t SwiftError     = 1ULL<<15; ///< Swift error parameter
-    static const uint64_t SwiftErrorOffs = 15;
-    static const uint64_t Hva            = 1ULL << 16; ///< HVA field for
-                                                       ///< vectorcall
-    static const uint64_t HvaOffs        = 16;
-    static const uint64_t HvaStart       = 1ULL << 17; ///< HVA structure start
-                                                       ///< for vectorcall
-    static const uint64_t HvaStartOffs   = 17;
-    static const uint64_t SecArgPass     = 1ULL << 18; ///< Second argument
-                                                       ///< pass for vectorcall
-    static const uint64_t SecArgPassOffs = 18;
-    static const uint64_t OrigAlign      = 0x1FULL<<27;
-    static const uint64_t OrigAlignOffs  = 27;
-    static const uint64_t ByValSize      = 0x3fffffffULL<<32; ///< Struct size
-    static const uint64_t ByValSizeOffs  = 32;
-    static const uint64_t InConsecutiveRegsLast      = 0x1ULL<<62; ///< Struct size
-    static const uint64_t InConsecutiveRegsLastOffs  = 62;
-    static const uint64_t InConsecutiveRegs      = 0x1ULL<<63; ///< Struct size
-    static const uint64_t InConsecutiveRegsOffs  = 63;
-
-    static const uint64_t One            = 1ULL; ///< 1 of this type, for shifts
-
-    uint64_t Flags = 0;
+    unsigned IsZExt : 1;     ///< Zero extended
+    unsigned IsSExt : 1;     ///< Sign extended
+    unsigned IsInReg : 1;    ///< Passed in register
+    unsigned IsSRet : 1;     ///< Hidden struct-ret ptr
+    unsigned IsByVal : 1;    ///< Struct passed by value
+    unsigned IsNest : 1;     ///< Nested fn static chain
+    unsigned IsReturned : 1; ///< Always returned
+    unsigned IsSplit : 1;
+    unsigned IsInAlloca : 1;   ///< Passed with inalloca
+    unsigned IsSplitEnd : 1;   ///< Last part of a split
+    unsigned IsSwiftSelf : 1;  ///< Swift self parameter
+    unsigned IsSwiftError : 1; ///< Swift error parameter
+    unsigned IsHva : 1;        ///< HVA field for vectorcall
+    unsigned IsHvaStart : 1;   ///< HVA structure start for vectorcall
+    unsigned IsSecArgPass : 1; ///< Second argument pass for vectorcall
+    unsigned ByValAlign : 4;   ///< Log 2 of byval alignment
+    unsigned OrigAlign : 5;    ///< Log 2 of original alignment
+    unsigned IsInConsecutiveRegsLast : 1;
+    unsigned IsInConsecutiveRegs : 1;
+    unsigned IsCopyElisionCandidate : 1; ///< Argument copy elision candidate
+
+    unsigned ByValSize; ///< Byval struct size
 
   public:
-    ArgFlagsTy() = default;
+    ArgFlagsTy()
+        : IsZExt(0), IsSExt(0), IsInReg(0), IsSRet(0), IsByVal(0), IsNest(0),
+          IsReturned(0), IsSplit(0), IsInAlloca(0), IsSplitEnd(0),
+          IsSwiftSelf(0), IsSwiftError(0), IsHva(0), IsHvaStart(0),
+          IsSecArgPass(0), ByValAlign(0), OrigAlign(0),
+          IsInConsecutiveRegsLast(0), IsInConsecutiveRegs(0),
+          IsCopyElisionCandidate(0), ByValSize(0) {
+      static_assert(sizeof(*this) == 2 * sizeof(unsigned), "flags are too big");
+    }
 
-    bool isZExt()      const { return Flags & ZExt; }
-    void setZExt()     { Flags |= One << ZExtOffs; }
+    bool isZExt() const { return IsZExt; }
+    void setZExt() { IsZExt = 1; }
 
-    bool isSExt()      const { return Flags & SExt; }
-    void setSExt()     { Flags |= One << SExtOffs; }
+    bool isSExt() const { return IsSExt; }
+    void setSExt() { IsSExt = 1; }
 
-    bool isInReg()     const { return Flags & InReg; }
-    void setInReg()    { Flags |= One << InRegOffs; }
+    bool isInReg() const { return IsInReg; }
+    void setInReg() { IsInReg = 1; }
 
-    bool isSRet()      const { return Flags & SRet; }
-    void setSRet()     { Flags |= One << SRetOffs; }
+    bool isSRet() const { return IsSRet; }
+    void setSRet() { IsSRet = 1; }
 
-    bool isByVal()     const { return Flags & ByVal; }
-    void setByVal()    { Flags |= One << ByValOffs; }
+    bool isByVal() const { return IsByVal; }
+    void setByVal() { IsByVal = 1; }
 
-    bool isInAlloca()  const { return Flags & InAlloca; }
-    void setInAlloca() { Flags |= One << InAllocaOffs; }
+    bool isInAlloca() const { return IsInAlloca; }
+    void setInAlloca() { IsInAlloca = 1; }
 
-    bool isSwiftSelf() const { return Flags & SwiftSelf; }
-    void setSwiftSelf() { Flags |= One << SwiftSelfOffs; }
+    bool isSwiftSelf() const { return IsSwiftSelf; }
+    void setSwiftSelf() { IsSwiftSelf = 1; }
 
-    bool isSwiftError() const { return Flags & SwiftError; }
-    void setSwiftError() { Flags |= One << SwiftErrorOffs; }
+    bool isSwiftError() const { return IsSwiftError; }
+    void setSwiftError() { IsSwiftError = 1; }
 
-    bool isHva() const { return Flags & Hva; }
-    void setHva() { Flags |= One << HvaOffs; }
+    bool isHva() const { return IsHva; }
+    void setHva() { IsHva = 1; }
 
-    bool isHvaStart() const { return Flags & HvaStart; }
-    void setHvaStart() { Flags |= One << HvaStartOffs; }
+    bool isHvaStart() const { return IsHvaStart; }
+    void setHvaStart() { IsHvaStart = 1; }
 
-    bool isSecArgPass() const { return Flags & SecArgPass; }
-    void setSecArgPass() { Flags |= One << SecArgPassOffs; }
+    bool isSecArgPass() const { return IsSecArgPass; }
+    void setSecArgPass() { IsSecArgPass = 1; }
 
-    bool isNest()      const { return Flags & Nest; }
-    void setNest()     { Flags |= One << NestOffs; }
+    bool isNest() const { return IsNest; }
+    void setNest() { IsNest = 1; }
 
-    bool isReturned()  const { return Flags & Returned; }
-    void setReturned() { Flags |= One << ReturnedOffs; }
+    bool isReturned() const { return IsReturned; }
+    void setReturned() { IsReturned = 1; }
 
-    bool isInConsecutiveRegs()  const { return Flags & InConsecutiveRegs; }
-    void setInConsecutiveRegs() { Flags |= One << InConsecutiveRegsOffs; }
+    bool isInConsecutiveRegs()  const { return IsInConsecutiveRegs; }
+    void setInConsecutiveRegs() { IsInConsecutiveRegs = 1; }
 
-    bool isInConsecutiveRegsLast()  const { return Flags & InConsecutiveRegsLast; }
-    void setInConsecutiveRegsLast() { Flags |= One << InConsecutiveRegsLastOffs; }
+    bool isInConsecutiveRegsLast() const { return IsInConsecutiveRegsLast; }
+    void setInConsecutiveRegsLast() { IsInConsecutiveRegsLast = 1; }
 
-    unsigned getByValAlign() const {
-      return (unsigned)
-        ((One << ((Flags & ByValAlign) >> ByValAlignOffs)) / 2);
-    }
-    void setByValAlign(unsigned A) {
-      Flags = (Flags & ~ByValAlign) |
-        (uint64_t(Log2_32(A) + 1) << ByValAlignOffs);
-    }
+    bool isSplit()   const { return IsSplit; }
+    void setSplit()  { IsSplit = 1; }
 
-    bool isSplit()   const { return Flags & Split; }
-    void setSplit()  { Flags |= One << SplitOffs; }
+    bool isSplitEnd()   const { return IsSplitEnd; }
+    void setSplitEnd()  { IsSplitEnd = 1; }
 
-    bool isSplitEnd()   const { return Flags & SplitEnd; }
-    void setSplitEnd()  { Flags |= One << SplitEndOffs; }
+    bool isCopyElisionCandidate()  const { return IsCopyElisionCandidate; }
+    void setCopyElisionCandidate() { IsCopyElisionCandidate = 1; }
 
-    unsigned getOrigAlign() const {
-      return (unsigned)
-        ((One << ((Flags & OrigAlign) >> OrigAlignOffs)) / 2);
-    }
-    void setOrigAlign(unsigned A) {
-      Flags = (Flags & ~OrigAlign) |
-        (uint64_t(Log2_32(A) + 1) << OrigAlignOffs);
+    unsigned getByValAlign() const { return (1U << ByValAlign) / 2; }
+    void setByValAlign(unsigned A) {
+      ByValAlign = Log2_32(A) + 1;
+      assert(getByValAlign() == A && "bitfield overflow");
     }
 
-    unsigned getByValSize() const {
-      return (unsigned)((Flags & ByValSize) >> ByValSizeOffs);
-    }
-    void setByValSize(unsigned S) {
-      Flags = (Flags & ~ByValSize) | (uint64_t(S) << ByValSizeOffs);
+    unsigned getOrigAlign() const { return (1U << OrigAlign) / 2; }
+    void setOrigAlign(unsigned A) {
+      OrigAlign = Log2_32(A) + 1;
+      assert(getOrigAlign() == A && "bitfield overflow");
     }
 
-    /// getRawBits - Represent the flags as a bunch of bits.
-    uint64_t getRawBits() const { return Flags; }
+    unsigned getByValSize() const { return ByValSize; }
+    void setByValSize(unsigned S) { ByValSize = S; }
   };
 
   /// InputArg - This struct carries flags and type information about a
diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h
index 7d50201c3b767aa6a159fad0ff298ff329b43bad..0beb6cddf5bc3ff04b903226997424243ef5ed81 100644
--- a/include/llvm/Target/TargetInstrInfo.h
+++ b/include/llvm/Target/TargetInstrInfo.h
@@ -1070,15 +1070,6 @@ public:
     llvm_unreachable("target did not implement shouldClusterMemOps()");
   }
 
-  /// Can this target fuse the given instructions if they are scheduled
-  /// adjacent. Note that you have to add:
-  ///   DAG.addMutation(createMacroFusionDAGMutation());
-  /// to TargetPassConfig::createMachineScheduler() to have an effect.
-  virtual bool shouldScheduleAdjacent(const MachineInstr &First,
-                                      const MachineInstr &Second) const {
-    llvm_unreachable("target did not implement shouldScheduleAdjacent()");
-  }
-
   /// Reverses the branch condition of the specified condition list,
   /// returning false on success and true if it cannot be reversed.
   virtual
@@ -1151,7 +1142,7 @@ public:
   /// Return true if the specified instruction can be predicated.
   /// By default, this returns true for every instruction with a
   /// PredicateOperand.
-  virtual bool isPredicable(MachineInstr &MI) const {
+  virtual bool isPredicable(const MachineInstr &MI) const {
     return MI.getDesc().isPredicable();
   }
 
@@ -1446,10 +1437,17 @@ public:
     return nullptr;
   }
 
-  // Sometimes, it is possible for the target
-  // to tell, even without aliasing information, that two MIs access different
-  // memory addresses. This function returns true if two MIs access different
-  // memory addresses and false otherwise.
+  /// Sometimes, it is possible for the target
+  /// to tell, even without aliasing information, that two MIs access different
+  /// memory addresses. This function returns true if two MIs access different
+  /// memory addresses and false otherwise.
+  ///
+  /// Assumes any physical registers used to compute addresses have the same
+  /// value for both instructions. (This is the most useful assumption for
+  /// post-RA scheduling.)
+  ///
+  /// See also MachineInstr::mayAlias, which is implemented on top of this
+  /// function.
   virtual bool
   areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
                                   AliasAnalysis *AA = nullptr) const {
@@ -1505,9 +1503,11 @@ public:
     return None;
   }
 
-  /// Determines whether |Inst| is a tail call instruction.
+  /// Determines whether \p Inst is a tail call instruction. Override this
+  /// method on targets that do not properly set MCID::Return and MCID::Call on
+  /// tail call instructions.
   virtual bool isTailCall(const MachineInstr &Inst) const {
-    return false;
+    return Inst.isReturn() && Inst.isCall();
   }
 
   /// True if the instruction is bound to the top of its basic block and no
@@ -1517,6 +1517,65 @@ public:
     return false;
   }
 
+  /// \brief Return how many instructions would be saved by outlining a
+  /// sequence containing \p SequenceSize instructions that appears
+  /// \p Occurrences times in a module.
+  virtual unsigned getOutliningBenefit(size_t SequenceSize, size_t Occurrences,
+                                       bool CanBeTailCall) const {
+    llvm_unreachable(
+        "Target didn't implement TargetInstrInfo::getOutliningBenefit!");
+  }
+
+  /// Represents how an instruction should be mapped by the outliner.
+  /// \p Legal instructions are those which are safe to outline.
+  /// \p Illegal instructions are those which cannot be outlined.
+  /// \p Invisible instructions are instructions which can be outlined, but
+  /// shouldn't actually impact the outlining result.
+  enum MachineOutlinerInstrType {Legal, Illegal, Invisible};
+
+  /// Returns how or if \p MI should be outlined.
+  virtual MachineOutlinerInstrType getOutliningType(MachineInstr &MI) const {
+    llvm_unreachable(
+        "Target didn't implement TargetInstrInfo::getOutliningType!");
+  }
+
+  /// Insert a custom epilogue for outlined functions.
+  /// This may be empty, in which case no epilogue or return statement will be
+  /// emitted.
+  virtual void insertOutlinerEpilogue(MachineBasicBlock &MBB,
+                                      MachineFunction &MF,
+                                      bool IsTailCall) const {
+    llvm_unreachable(
+        "Target didn't implement TargetInstrInfo::insertOutlinerEpilogue!");
+  }
+
+  /// Insert a call to an outlined function into the program.
+  /// Returns an iterator to the spot where we inserted the call. This must be
+  /// implemented by the target.
+  virtual MachineBasicBlock::iterator
+  insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
+                     MachineBasicBlock::iterator &It, MachineFunction &MF,
+                     bool IsTailCall) const {
+    llvm_unreachable(
+        "Target didn't implement TargetInstrInfo::insertOutlinedCall!");
+  }
+
+  /// Insert a custom prologue for outlined functions.
+  /// This may be empty, in which case no prologue will be emitted.
+  virtual void insertOutlinerPrologue(MachineBasicBlock &MBB,
+                                      MachineFunction &MF,
+                                      bool IsTailCall) const {
+    llvm_unreachable(
+        "Target didn't implement TargetInstrInfo::insertOutlinerPrologue!");
+  }
+
+  /// Return true if the function can safely be outlined from.
+  /// By default, this means that the function has no red zone.
+  virtual bool isFunctionSafeToOutlineFrom(MachineFunction &MF) const {
+    llvm_unreachable("Target didn't implement "
+                     "TargetInstrInfo::isFunctionSafeToOutlineFrom!");
+  }
+
 private:
   unsigned CallFrameSetupOpcode, CallFrameDestroyOpcode;
   unsigned CatchRetOpcode;
diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h
index 01c5c51f6b18ce8bbe3ab0aa3a5c27424ad06af6..240896a538f1974d45947bf399ba63ce8040cf2b 100644
--- a/include/llvm/Target/TargetLowering.h
+++ b/include/llvm/Target/TargetLowering.h
@@ -25,13 +25,14 @@
 
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/DAGCombine.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/RuntimeLibcalls.h"
+#include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/Attributes.h"
@@ -163,6 +164,35 @@ public:
                        // or custom.
   };
 
+  class ArgListEntry {
+  public:
+    Value *Val = nullptr;
+    SDValue Node = SDValue();
+    Type *Ty = nullptr;
+    bool IsSExt : 1;
+    bool IsZExt : 1;
+    bool IsInReg : 1;
+    bool IsSRet : 1;
+    bool IsNest : 1;
+    bool IsByVal : 1;
+    bool IsInAlloca : 1;
+    bool IsReturned : 1;
+    bool IsSwiftSelf : 1;
+    bool IsSwiftError : 1;
+    uint16_t Alignment = 0;
+
+    ArgListEntry()
+        : IsSExt(false), IsZExt(false), IsInReg(false), IsSRet(false),
+          IsNest(false), IsByVal(false), IsInAlloca(false), IsReturned(false),
+          IsSwiftSelf(false), IsSwiftError(false) {}
+
+    void setAttributes(ImmutableCallSite *CS, unsigned AttrIdx);
+  };
+  typedef std::vector<ArgListEntry> ArgListTy;
+
+  virtual void markLibCallAttributes(MachineFunction *MF, unsigned CC,
+                                     ArgListTy &Args) const {};
+
   static ISD::NodeType getExtendForContent(BooleanContent Content) {
     switch (Content) {
     case UndefinedBooleanContent:
@@ -254,9 +284,7 @@ public:
   /// several shifts, adds, and multiplies for this target.
   /// The definition of "cheaper" may depend on whether we're optimizing
   /// for speed or for size.
-  virtual bool isIntDivCheap(EVT VT, AttributeSet Attr) const {
-    return false;
-  }
+  virtual bool isIntDivCheap(EVT VT, AttributeList Attr) const { return false; }
 
   /// Return true if the target can handle a standalone remainder operation.
   virtual bool hasStandaloneRem(EVT VT) const {
@@ -363,6 +391,9 @@ public:
     return false;
   }
 
+  /// Returns true if it's reasonable to merge stores to MemVT size.
+  virtual bool canMergeStoresTo(EVT MemVT) const { return true; }
+
   /// \brief Return true if it is cheap to speculate a call to intrinsic cttz.
   virtual bool isCheapToSpeculateCttz() const {
     return false;
@@ -395,16 +426,33 @@ public:
   /// \brief Return if the target supports combining a
   /// chain like:
   /// \code
-  ///   %andResult = and %val1, #imm-with-one-bit-set;
+  ///   %andResult = and %val1, #mask
   ///   %icmpResult = icmp %andResult, 0
-  ///   br i1 %icmpResult, label %dest1, label %dest2
   /// \endcode
   /// into a single machine instruction of a form like:
   /// \code
-  ///   brOnBitSet %register, #bitNumber, dest
+  ///   cc = test %register, #mask
   /// \endcode
-  bool isMaskAndBranchFoldingLegal() const {
-    return MaskAndBranchFoldingIsLegal;
+  virtual bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const {
+    return false;
+  }
+
+  /// Use bitwise logic to make pairs of compares more efficient. For example:
+  /// and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
+  /// This should be true when it takes more than one instruction to lower
+  /// setcc (cmp+set on x86 scalar), when bitwise ops are faster than logic on
+  /// condition bits (crand on PowerPC), and/or when reducing cmp+br is a win.
+  virtual bool convertSetCCLogicToBitwiseLogic(EVT VT) const {
+    return false;
+  }
+
+  /// Return the preferred operand type if the target has a quick way to compare
+  /// integer values of the given size. Assume that any legal integer type can
+  /// be compared efficiently. Targets may override this to allow illegal wide
+  /// types to return a vector type if there is support to compare that type.
+  virtual MVT hasFastEqualityCompare(unsigned NumBits) const {
+    MVT VT = MVT::getIntegerVT(NumBits);
+    return isTypeLegal(VT) ? VT : MVT::INVALID_SIMPLE_VALUE_TYPE;
   }
 
   /// Return true if the target should transform:
@@ -987,6 +1035,11 @@ public:
     return GatherAllAliasesMaxDepth;
   }
 
+  /// Returns the size of the platform's va_list object.
+  virtual unsigned getVaListSizeInBits(const DataLayout &DL) const {
+    return getPointerTy(DL).getSizeInBits();
+  }
+
   /// \brief Get maximum # of store operations permitted for llvm.memset
   ///
   /// This function returns the maximum number of store operations permitted
@@ -1384,6 +1437,13 @@ public:
       Action != TypeSplitVector;
   }
 
+  /// Return true if a select of constants (select Cond, C1, C2) should be
+  /// transformed into simple math ops with the condition value. For example:
+  /// select Cond, C1, C1-1 --> add (zext Cond), C1-1
+  virtual bool convertSelectOfConstantsToMath() const {
+    return false;
+  }
+
   //===--------------------------------------------------------------------===//
   // TargetLowering Configuration Methods - These methods should be invoked by
   // the derived class constructor to configure this object for the target.
@@ -1643,10 +1703,9 @@ public:
   /// possible to be done in the address mode for that operand. This hook lets
   /// targets also pass back when this should be done on intrinsics which
   /// load/store.
-  virtual bool GetAddrModeArguments(IntrinsicInst * /*I*/,
+  virtual bool getAddrModeArguments(IntrinsicInst * /*I*/,
                                     SmallVectorImpl<Value*> &/*Ops*/,
-                                    Type *&/*AccessTy*/,
-                                    unsigned AddrSpace = 0) const {
+                                    Type *&/*AccessTy*/) const {
     return false;
   }
 
@@ -2198,10 +2257,6 @@ protected:
   /// the branch is usually predicted right.
   bool PredictableSelectIsExpensive;
 
-  /// MaskAndBranchFoldingIsLegal - Indicates if the target supports folding
-  /// a mask of a single bit, a compare, and a branch into a single instruction.
-  bool MaskAndBranchFoldingIsLegal;
-
   /// \see enableExtLdPromotion.
   bool EnableExtLdPromotion;
 
@@ -2358,11 +2413,11 @@ public:
   /// expression and return a mask of KnownOne and KnownZero bits for the
   /// expression (used to simplify the caller).  The KnownZero/One bits may only
   /// be accurate for those bits in the DemandedMask.
-  /// \p AssumeSingleUse When this paramater is true, this function will
+  /// \p AssumeSingleUse When this parameter is true, this function will
   ///    attempt to simplify \p Op even if there are multiple uses.
   ///    Callers are responsible for correctly updating the DAG based on the
   ///    results of this function, because simply replacing replacing TLO.Old
-  ///    with TLO.New will be incorrect when this paramater is true and TLO.Old
+  ///    with TLO.New will be incorrect when this parameter is true and TLO.Old
   ///    has multiple uses.
   bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedMask,
                             APInt &KnownZero, APInt &KnownOne,
@@ -2370,17 +2425,27 @@ public:
                             unsigned Depth = 0,
                             bool AssumeSingleUse = false) const;
 
+  /// Helper wrapper around SimplifyDemandedBits
+  bool SimplifyDemandedBits(SDValue Op, APInt &DemandedMask,
+                            DAGCombinerInfo &DCI) const;
+
   /// Determine which of the bits specified in Mask are known to be either zero
-  /// or one and return them in the KnownZero/KnownOne bitsets.
+  /// or one and return them in the KnownZero/KnownOne bitsets. The DemandedElts
+  /// argument allows us to only collect the known bits that are shared by the
+  /// requested vector elements.
   virtual void computeKnownBitsForTargetNode(const SDValue Op,
                                              APInt &KnownZero,
                                              APInt &KnownOne,
+                                             const APInt &DemandedElts,
                                              const SelectionDAG &DAG,
                                              unsigned Depth = 0) const;
 
   /// This method can be implemented by targets that want to expose additional
-  /// information about sign bits to the DAG Combiner.
+  /// information about sign bits to the DAG Combiner. The DemandedElts
+  /// argument allows us to only collect the minimum sign bits that are shared
+  /// by the requested vector elements.
   virtual unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+                                                   const APInt &DemandedElts,
                                                    const SelectionDAG &DAG,
                                                    unsigned Depth = 0) const;
 
@@ -2537,30 +2602,6 @@ public:
     llvm_unreachable("Not Implemented");
   }
 
-  struct ArgListEntry {
-    SDValue Node;
-    Type* Ty;
-    bool isSExt     : 1;
-    bool isZExt     : 1;
-    bool isInReg    : 1;
-    bool isSRet     : 1;
-    bool isNest     : 1;
-    bool isByVal    : 1;
-    bool isInAlloca : 1;
-    bool isReturned : 1;
-    bool isSwiftSelf : 1;
-    bool isSwiftError : 1;
-    uint16_t Alignment;
-
-    ArgListEntry() : isSExt(false), isZExt(false), isInReg(false),
-      isSRet(false), isNest(false), isByVal(false), isInAlloca(false),
-      isReturned(false), isSwiftSelf(false), isSwiftError(false),
-      Alignment(0) {}
-
-    void setAttributes(ImmutableCallSite *CS, unsigned AttrIdx);
-  };
-  typedef std::vector<ArgListEntry> ArgListTy;
-
   /// This structure contains all information that is necessary for lowering
   /// calls. It is passed to TLI::LowerCallTo when the SelectionDAG builder
   /// needs to lower a call, and targets will see this struct in their LowerCall
@@ -2610,6 +2651,20 @@ public:
       return *this;
     }
 
+    // setCallee with target/module-specific attributes
+    CallLoweringInfo &setLibCallee(CallingConv::ID CC, Type *ResultType,
+                                   SDValue Target, ArgListTy &&ArgsList) {
+      RetTy = ResultType;
+      Callee = Target;
+      CallConv = CC;
+      // Size the incoming list: the Args member is only assigned just below.
+      NumFixedArgs = ArgsList.size();
+      Args = std::move(ArgsList);
+      DAG.getTargetLoweringInfo().markLibCallAttributes(
+          &(DAG.getMachineFunction()), CC, Args);
+      return *this;
+    }
+
     CallLoweringInfo &setCallee(CallingConv::ID CC, Type *ResultType,
                                 SDValue Target, ArgListTy &&ArgsList) {
       RetTy = ResultType;
@@ -3184,7 +3239,7 @@ private:
 /// Given an LLVM IR type and return type attributes, compute the return value
 /// EVTs and flags, and optionally also the offsets, if the return value is
 /// being lowered to memory.
-void GetReturnInfo(Type *ReturnType, AttributeSet attr,
+void GetReturnInfo(Type *ReturnType, AttributeList attr,
                    SmallVectorImpl<ISD::OutputArg> &Outs,
                    const TargetLowering &TLI, const DataLayout &DL);
 
diff --git a/include/llvm/Target/TargetOpcodes.def b/include/llvm/Target/TargetOpcodes.def
index cd62a19fdf16734e753b5a81fe2aa95f71f2b6aa..96db6e0a97698dd142464abd784f24ef014873ce 100644
--- a/include/llvm/Target/TargetOpcodes.def
+++ b/include/llvm/Target/TargetOpcodes.def
@@ -107,6 +107,9 @@ HANDLE_TARGET_OPCODE(LIFETIME_END)
 /// that must lie within the function and not contain another stackmap.
 HANDLE_TARGET_OPCODE(STACKMAP)
 
+/// FEntry call - This is a marker instruction which gets translated into a raw fentry call.
+HANDLE_TARGET_OPCODE(FENTRY_CALL)
+
 /// Patchable call instruction - this instruction represents a call to a
 /// constant address, followed by a series of NOPs. It is intended to
 /// support optimizations for dynamic languages (such as javascript) that
@@ -131,11 +134,13 @@ HANDLE_TARGET_OPCODE(STATEPOINT)
 /// frame index of the local stack allocation.
 HANDLE_TARGET_OPCODE(LOCAL_ESCAPE)
 
-/// Loading instruction that may page fault, bundled with associated
+/// Wraps a machine instruction which can fault, bundled with associated
+/// information on how to handle such a fault.
+/// For example, a load instruction that may page fault, bundled with associated
 /// information on how to handle such a page fault.  It is intended to support
 /// "zero cost" null checks in managed languages by allowing LLVM to fold
 /// comparisons into existing memory operations.
-HANDLE_TARGET_OPCODE(FAULTING_LOAD_OP)
+HANDLE_TARGET_OPCODE(FAULTING_OP)
 
 /// Wraps a machine instruction to add patchability constraints.  An
 /// instruction wrapped in PATCHABLE_OP has to either have a minimum
@@ -224,6 +229,8 @@ HANDLE_TARGET_OPCODE(G_GLOBAL_VALUE)
 /// (typically a sub-register COPY after instruction selection).
 HANDLE_TARGET_OPCODE(G_EXTRACT)
 
+HANDLE_TARGET_OPCODE(G_UNMERGE_VALUES)
+
 /// Generic instruction to insert blocks of bits from the registers given into
 /// the source.
 HANDLE_TARGET_OPCODE(G_INSERT)
@@ -232,6 +239,8 @@ HANDLE_TARGET_OPCODE(G_INSERT)
 /// larger register.
 HANDLE_TARGET_OPCODE(G_SEQUENCE)
 
+HANDLE_TARGET_OPCODE(G_MERGE_VALUES)
+
 /// Generic pointer to int conversion.
 HANDLE_TARGET_OPCODE(G_PTRTOINT)
 
@@ -275,6 +284,12 @@ HANDLE_TARGET_OPCODE(G_CONSTANT)
 /// Generic floating constant.
 HANDLE_TARGET_OPCODE(G_FCONSTANT)
 
+/// Generic va_start instruction. Stores to its one pointer operand.
+HANDLE_TARGET_OPCODE(G_VASTART)
+
+/// Generic va_arg instruction. Stores to its one pointer operand.
+HANDLE_TARGET_OPCODE(G_VAARG)
+
 // Generic sign extend
 HANDLE_TARGET_OPCODE(G_SEXT)
 
@@ -323,6 +338,14 @@ HANDLE_TARGET_OPCODE(G_UMULO)
 /// overflow flag.
 HANDLE_TARGET_OPCODE(G_SMULO)
 
+// Multiply two numbers at twice the incoming bit width (unsigned) and return
+// the high half of the result.
+HANDLE_TARGET_OPCODE(G_UMULH)
+
+// Multiply two numbers at twice the incoming bit width (signed) and return
+// the high half of the result.
+HANDLE_TARGET_OPCODE(G_SMULH)
+
 /// Generic FP addition.
 HANDLE_TARGET_OPCODE(G_FADD)
 
@@ -338,7 +361,13 @@ HANDLE_TARGET_OPCODE(G_FDIV)
 /// Generic FP remainder.
 HANDLE_TARGET_OPCODE(G_FREM)
 
-/// Generic float to signed-int conversion
+/// Generic FP exponentiation.
+HANDLE_TARGET_OPCODE(G_FPOW)
+
+/// Generic FP negation.
+HANDLE_TARGET_OPCODE(G_FNEG)
+
+/// Generic FP extension.
 HANDLE_TARGET_OPCODE(G_FPEXT)
 
 /// Generic float to signed-int conversion
@@ -359,15 +388,28 @@ HANDLE_TARGET_OPCODE(G_UITOFP)
 /// Generic pointer offset
 HANDLE_TARGET_OPCODE(G_GEP)
 
+/// Clear the specified number of low bits in a pointer. This rounds the value
+/// *down* to the given alignment.
+HANDLE_TARGET_OPCODE(G_PTR_MASK)
+
 /// Generic BRANCH instruction. This is an unconditional branch.
 HANDLE_TARGET_OPCODE(G_BR)
 
+/// Generic insertelement.
+HANDLE_TARGET_OPCODE(G_INSERT_VECTOR_ELT)
+
+/// Generic extractelement.
+HANDLE_TARGET_OPCODE(G_EXTRACT_VECTOR_ELT)
+
+/// Generic shufflevector.
+HANDLE_TARGET_OPCODE(G_SHUFFLE_VECTOR)
+
 // TODO: Add more generic opcodes as we move along.
 
 /// Marker for the end of the generic opcode.
 /// This is used to check if an opcode is in the range of the
 /// generic opcodes.
-HANDLE_TARGET_OPCODE_MARKER(PRE_ISEL_GENERIC_OPCODE_END, G_BR)
+HANDLE_TARGET_OPCODE_MARKER(PRE_ISEL_GENERIC_OPCODE_END, G_SHUFFLE_VECTOR)
 
 /// BUILTIN_OP_END - This must be the last enum value in this list.
 /// The target-specific post-isel opcode values start here.
diff --git a/include/llvm/Target/TargetOptions.h b/include/llvm/Target/TargetOptions.h
index ebb01dcb18036a5d8cc252362dd1242aadecf086..7cc33f2fdccbbd655e5639afb99bd3b19a73690e 100644
--- a/include/llvm/Target/TargetOptions.h
+++ b/include/llvm/Target/TargetOptions.h
@@ -99,16 +99,16 @@ namespace llvm {
   class TargetOptions {
   public:
     TargetOptions()
-        : PrintMachineCode(false), LessPreciseFPMADOption(false),
-          UnsafeFPMath(false), NoInfsFPMath(false), NoNaNsFPMath(false),
-          NoTrappingFPMath(false), NoSignedZerosFPMath(false),
+        : PrintMachineCode(false), UnsafeFPMath(false), NoInfsFPMath(false),
+          NoNaNsFPMath(false), NoTrappingFPMath(false),
+          NoSignedZerosFPMath(false),
           HonorSignDependentRoundingFPMathOption(false), NoZerosInBSS(false),
           GuaranteedTailCallOpt(false), StackSymbolOrdering(true),
           EnableFastISel(false), UseInitArray(false),
           DisableIntegratedAS(false), CompressDebugSections(false),
           RelaxELFRelocations(false), FunctionSections(false),
           DataSections(false), UniqueSectionNames(true), TrapUnreachable(false),
-          EmulatedTLS(false), EnableIPRA(false), DebugInfoForProfiling(false) {}
+          EmulatedTLS(false), EnableIPRA(false) {}
 
     /// PrintMachineCode - This flag is enabled when the -print-machineinstrs
     /// option is specified on the command line, and should enable debugging
@@ -119,20 +119,11 @@ namespace llvm {
     /// optimization should be disabled for the given machine function.
     bool DisableFramePointerElim(const MachineFunction &MF) const;
 
-    /// LessPreciseFPMAD - This flag is enabled when the
-    /// -enable-fp-mad is specified on the command line.  When this flag is off
-    /// (the default), the code generator is not allowed to generate mad
-    /// (multiply add) if the result is "less precise" than doing those
-    /// operations individually.
-    unsigned LessPreciseFPMADOption : 1;
-    bool LessPreciseFPMAD() const;
-
     /// UnsafeFPMath - This flag is enabled when the
     /// -enable-unsafe-fp-math flag is specified on the command line.  When
     /// this flag is off (the default), the code generator is not allowed to
     /// produce results that are "less precise" than IEEE allows.  This includes
     /// use of X86 instructions like FSIN and FCOS instead of libcalls.
-    /// UnsafeFPMath implies LessPreciseFPMAD.
     unsigned UnsafeFPMath : 1;
 
     /// NoInfsFPMath - This flag is enabled when the
@@ -225,9 +216,6 @@ namespace llvm {
     /// This flag enables InterProcedural Register Allocation (IPRA).
     unsigned EnableIPRA : 1;
 
-    /// This flag enables emitting extra debug info for sample profiling.
-    unsigned DebugInfoForProfiling : 1;
-
     /// FloatABIType - This setting is set by -float-abi=xxx option is specfied
     /// on the command line. This setting may either be Default, Soft, or Hard.
     /// Default selects the target's default behavior. Soft selects the ABI for
@@ -275,43 +263,6 @@ namespace llvm {
     MCTargetOptions MCOptions;
   };
 
-// Comparison operators:
-
-
-inline bool operator==(const TargetOptions &LHS,
-                       const TargetOptions &RHS) {
-#define ARE_EQUAL(X) LHS.X == RHS.X
-  return
-    ARE_EQUAL(UnsafeFPMath) &&
-    ARE_EQUAL(NoInfsFPMath) &&
-    ARE_EQUAL(NoNaNsFPMath) &&
-    ARE_EQUAL(NoTrappingFPMath) &&
-    ARE_EQUAL(HonorSignDependentRoundingFPMathOption) &&
-    ARE_EQUAL(NoZerosInBSS) &&
-    ARE_EQUAL(GuaranteedTailCallOpt) &&
-    ARE_EQUAL(StackAlignmentOverride) &&
-    ARE_EQUAL(EnableFastISel) &&
-    ARE_EQUAL(UseInitArray) &&
-    ARE_EQUAL(TrapUnreachable) &&
-    ARE_EQUAL(EmulatedTLS) &&
-    ARE_EQUAL(FloatABIType) &&
-    ARE_EQUAL(AllowFPOpFusion) &&
-    ARE_EQUAL(ThreadModel) &&
-    ARE_EQUAL(EABIVersion) &&
-    ARE_EQUAL(DebuggerTuning) &&
-    ARE_EQUAL(FPDenormalMode) &&
-    ARE_EQUAL(ExceptionModel) &&
-    ARE_EQUAL(MCOptions) &&
-    ARE_EQUAL(EnableIPRA) &&
-    ARE_EQUAL(DebugInfoForProfiling);
-#undef ARE_EQUAL
-}
-
-inline bool operator!=(const TargetOptions &LHS,
-                       const TargetOptions &RHS) {
-  return !(LHS == RHS);
-}
-
 } // End llvm namespace
 
 #endif
diff --git a/include/llvm/Target/TargetRegisterInfo.h b/include/llvm/Target/TargetRegisterInfo.h
index 673cca7f44a905250fe01963224145a53e142389..3f5daea63ab591270d08c1a550d12e6afc2b2616 100644
--- a/include/llvm/Target/TargetRegisterInfo.h
+++ b/include/llvm/Target/TargetRegisterInfo.h
@@ -45,6 +45,7 @@ public:
 
   // Instance variables filled by tablegen, do not use!
   const MCRegisterClass *MC;
+  const uint16_t SpillSize, SpillAlignment;
   const vt_iterator VTs;
   const uint32_t *SubClassMask;
   const uint16_t *SuperRegIndices;
@@ -94,10 +95,10 @@ public:
 
   /// Return the size of the register in bytes, which is also the size
   /// of a stack slot allocated to hold a spilled copy of this register.
-  unsigned getSize() const { return MC->getSize(); }
+  unsigned getSize() const { return SpillSize; }
 
   /// Return the minimum required alignment for a register of this class.
-  unsigned getAlignment() const { return MC->getAlignment(); }
+  unsigned getAlignment() const { return SpillAlignment; }
 
   /// Return the cost of copying a value between two registers in this class.
   /// A negative number means the register class is very expensive
@@ -426,7 +427,9 @@ public:
   /// this target. The register should be in the order of desired callee-save
   /// stack frame offset. The first register is closest to the incoming stack
   /// pointer if stack grows down, and vice versa.
-  ///
+  /// Notice: This function does not take into account disabled CSRs.
+  ///         In most cases you will instead want to use the function
+  ///         getCalleeSavedRegs that is implemented in MachineRegisterInfo.
   virtual const MCPhysReg*
   getCalleeSavedRegs(const MachineFunction *MF) const = 0;
 
diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td
index 74b98ac5f6c5d30f2b10889d628c610668b826ed..d342e4fe2613556d5aff434c440f4400bc66a63f 100644
--- a/include/llvm/Target/TargetSchedule.td
+++ b/include/llvm/Target/TargetSchedule.td
@@ -139,7 +139,7 @@ class ProcResourceKind;
 // changes this to an in-order issue/dispatch resource. In this case,
 // the scheduler counts down from the cycle that the instruction
 // issues in-order, forcing a stall whenever a subsequent instruction
-// requires the same resource until the number of ResourceCyles
+// requires the same resource until the number of ResourceCycles
 // specified in WriteRes expire. Setting BufferSize=1 changes this to
 // an in-order latency resource. In this case, the scheduler models
 // producer/consumer stalls between instructions that use the
@@ -255,6 +255,9 @@ class ProcWriteResources<list<ProcResourceKind> resources> {
   // Allow a processor to mark some scheduling classes as unsupported
   // for stronger verification.
   bit Unsupported = 0;
+  // Allow a processor to mark some scheduling classes as single-issue.
+  // SingleIssue is an alias for Begin/End Group.
+  bit SingleIssue = 0;
   SchedMachineModel SchedModel = ?;
 }
 
diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td
index 55e2c2bce3db91683061c924c800c36b50ac6810..45a842f77a21a513cb9f5e3ac79b71b8642930c0 100644
--- a/include/llvm/Target/TargetSelectionDAG.td
+++ b/include/llvm/Target/TargetSelectionDAG.td
@@ -136,30 +136,34 @@ def SDTIntUnaryOp : SDTypeProfile<1, 1, [   // ctlz
   SDTCisSameAs<0, 1>, SDTCisInt<0>
 ]>;
 def SDTIntExtendOp : SDTypeProfile<1, 1, [  // sext, zext, anyext
-  SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>
+  SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<1, 0>, SDTCisSameNumEltsAs<0, 1>
 ]>;
 def SDTIntTruncOp  : SDTypeProfile<1, 1, [  // trunc
-  SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<0, 1>
+  SDTCisInt<0>, SDTCisInt<1>, SDTCisOpSmallerThanOp<0, 1>, SDTCisSameNumEltsAs<0, 1>
 ]>;
 def SDTFPUnaryOp  : SDTypeProfile<1, 1, [   // fneg, fsqrt, etc
   SDTCisSameAs<0, 1>, SDTCisFP<0>
 ]>;
 def SDTFPRoundOp  : SDTypeProfile<1, 1, [   // fround
-  SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<0, 1>
+  SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<0, 1>, SDTCisSameNumEltsAs<0, 1>
 ]>;
 def SDTFPExtendOp  : SDTypeProfile<1, 1, [  // fextend
-  SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<1, 0>
+  SDTCisFP<0>, SDTCisFP<1>, SDTCisOpSmallerThanOp<1, 0>, SDTCisSameNumEltsAs<0, 1>
 ]>;
 def SDTIntToFPOp : SDTypeProfile<1, 1, [    // [su]int_to_fp
-  SDTCisFP<0>, SDTCisInt<1>
+  SDTCisFP<0>, SDTCisInt<1>, SDTCisSameNumEltsAs<0, 1>
 ]>;
 def SDTFPToIntOp : SDTypeProfile<1, 1, [    // fp_to_[su]int
-  SDTCisInt<0>, SDTCisFP<1>
+  SDTCisInt<0>, SDTCisFP<1>, SDTCisSameNumEltsAs<0, 1>
 ]>;
 def SDTExtInreg : SDTypeProfile<1, 2, [     // sext_inreg
   SDTCisSameAs<0, 1>, SDTCisInt<0>, SDTCisVT<2, OtherVT>,
   SDTCisVTSmallerThanOp<2, 1>
 ]>;
+def SDTExtInvec : SDTypeProfile<1, 1, [     // sext_invec
+  SDTCisInt<0>, SDTCisVec<0>, SDTCisInt<1>, SDTCisVec<1>,
+  SDTCisOpSmallerThanOp<1, 0>, SDTCisSameSizeAs<0,1>
+]>;
 
 def SDTSetCC : SDTypeProfile<1, 3, [        // setcc
   SDTCisInt<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, OtherVT>
@@ -170,7 +174,7 @@ def SDTSelect : SDTypeProfile<1, 3, [       // select
 ]>;
 
 def SDTVSelect : SDTypeProfile<1, 3, [       // vselect
-  SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>, SDTCisSameNumEltsAs<0, 1>
+  SDTCisVec<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<2, 3>, SDTCisSameNumEltsAs<0, 1>
 ]>;
 
 def SDTSelectCC : SDTypeProfile<1, 5, [     // select_cc
@@ -406,6 +410,10 @@ def umax       : SDNode<"ISD::UMAX"      , SDTIntBinOp,
                                   [SDNPCommutative, SDNPAssociative]>;
 
 def sext_inreg : SDNode<"ISD::SIGN_EXTEND_INREG", SDTExtInreg>;
+def sext_invec : SDNode<"ISD::SIGN_EXTEND_VECTOR_INREG", SDTExtInvec>;
+def zext_invec : SDNode<"ISD::ZERO_EXTEND_VECTOR_INREG", SDTExtInvec>;
+
+def abs        : SDNode<"ISD::ABS"        , SDTIntUnaryOp>;
 def bitreverse : SDNode<"ISD::BITREVERSE" , SDTIntUnaryOp>;
 def bswap      : SDNode<"ISD::BSWAP"      , SDTIntUnaryOp>;
 def ctlz       : SDNode<"ISD::CTLZ"       , SDTIntUnaryOp>;
diff --git a/include/llvm/Transforms/IPO.h b/include/llvm/Transforms/IPO.h
index b26238ef3fc7dcce7f0ca8099a4fa36f2bd95c16..39ceb19525b3cba05f141ac406b6b38321e81100 100644
--- a/include/llvm/Transforms/IPO.h
+++ b/include/llvm/Transforms/IPO.h
@@ -108,7 +108,8 @@ Pass *createFunctionImportPass();
 /// threshold given here.
 Pass *createFunctionInliningPass();
 Pass *createFunctionInliningPass(int Threshold);
-Pass *createFunctionInliningPass(unsigned OptLevel, unsigned SizeOptLevel);
+Pass *createFunctionInliningPass(unsigned OptLevel, unsigned SizeOptLevel,
+                                 bool DisableInlineHotCallSite);
 Pass *createFunctionInliningPass(InlineParams &Params);
 
 //===----------------------------------------------------------------------===//
@@ -215,27 +216,42 @@ ModulePass *createMetaRenamerPass();
 /// manager.
 ModulePass *createBarrierNoopPass();
 
-/// What to do with the summary when running the LowerTypeTests pass.
-enum class LowerTypeTestsSummaryAction {
+/// What to do with the summary when running passes that operate on it.
+enum class PassSummaryAction {
   None,   ///< Do nothing.
-  Import, ///< Import typeid resolutions from summary and globals.
-  Export, ///< Export typeid resolutions to summary and globals.
+  Import, ///< Import information from summary.
+  Export, ///< Export information to summary.
 };
 
 /// \brief This pass lowers type metadata and the llvm.type.test intrinsic to
 /// bitsets.
-/// \param Action What to do with the summary passed as Index.
-/// \param Index The summary to use for importing or exporting, this can be null
-///              when Action is None.
-ModulePass *createLowerTypeTestsPass(LowerTypeTestsSummaryAction Action,
-                                     ModuleSummaryIndex *Index);
+///
+/// The behavior depends on the summary arguments:
+/// - If ExportSummary is non-null, this pass will export type identifiers to
+///   the given summary.
+/// - Otherwise, if ImportSummary is non-null, this pass will import type
+///   identifiers from the given summary.
+/// - Otherwise it does neither.
+/// It is invalid for both ExportSummary and ImportSummary to be non-null.
+ModulePass *createLowerTypeTestsPass(ModuleSummaryIndex *ExportSummary,
+                                     const ModuleSummaryIndex *ImportSummary);
 
 /// \brief This pass export CFI checks for use by external modules.
 ModulePass *createCrossDSOCFIPass();
 
 /// \brief This pass implements whole-program devirtualization using type
 /// metadata.
-ModulePass *createWholeProgramDevirtPass();
+///
+/// The behavior depends on the summary arguments:
+/// - If ExportSummary is non-null, this pass will export type identifiers to
+///   the given summary.
+/// - Otherwise, if ImportSummary is non-null, this pass will import type
+///   identifiers from the given summary.
+/// - Otherwise it does neither.
+/// It is invalid for both ExportSummary and ImportSummary to be non-null.
+ModulePass *
+createWholeProgramDevirtPass(ModuleSummaryIndex *ExportSummary,
+                             const ModuleSummaryIndex *ImportSummary);
 
 /// This pass splits globals into pieces for the benefit of whole-program
 /// devirtualization and control-flow integrity.
@@ -248,7 +264,8 @@ ModulePass *createSampleProfileLoaderPass();
 ModulePass *createSampleProfileLoaderPass(StringRef Name);
 
 /// Write ThinLTO-ready bitcode to Str.
-ModulePass *createWriteThinLTOBitcodePass(raw_ostream &Str);
+ModulePass *createWriteThinLTOBitcodePass(raw_ostream &Str,
+                                          raw_ostream *ThinLinkOS = nullptr);
 
 } // End llvm namespace
 
diff --git a/include/llvm/Transforms/IPO/ArgumentPromotion.h b/include/llvm/Transforms/IPO/ArgumentPromotion.h
new file mode 100644
index 0000000000000000000000000000000000000000..724ff72f3b5a17da766ad5da0ee93304c0c83387
--- /dev/null
+++ b/include/llvm/Transforms/IPO/ArgumentPromotion.h
@@ -0,0 +1,31 @@
+//===- ArgumentPromotion.h - Promote by-reference arguments -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_IPO_ARGUMENTPROMOTION_H
+#define LLVM_TRANSFORMS_IPO_ARGUMENTPROMOTION_H
+
+#include "llvm/Analysis/CGSCCPassManager.h"
+#include "llvm/Analysis/LazyCallGraph.h"
+
+namespace llvm {
+
+/// Argument promotion pass.
+///
+/// This pass walks the functions in each SCC and for each one tries to
+/// transform it and all of its callers to replace indirect arguments with
+/// direct (by-value) arguments.
+class ArgumentPromotionPass : public PassInfoMixin<ArgumentPromotionPass> {
+public:
+  PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &AM,
+                        LazyCallGraph &CG, CGSCCUpdateResult &UR);
+};
+
+}
+
+#endif
diff --git a/include/llvm/Transforms/IPO/FunctionAttrs.h b/include/llvm/Transforms/IPO/FunctionAttrs.h
index ee45f35bf11b2d410ea2562dcccc0965e1f1abab..85d6364c8bbc98d27e321d46fbd086d440dd0225 100644
--- a/include/llvm/Transforms/IPO/FunctionAttrs.h
+++ b/include/llvm/Transforms/IPO/FunctionAttrs.h
@@ -20,6 +20,19 @@
 
 namespace llvm {
 
+class AAResults;
+
+/// The three kinds of memory access relevant to 'readonly' and
+/// 'readnone' attributes.
+enum MemoryAccessKind {
+  MAK_ReadNone = 0,
+  MAK_ReadOnly = 1,
+  MAK_MayWrite = 2
+};
+
+/// Returns the memory access properties of this copy of the function.
+MemoryAccessKind computeFunctionBodyMemoryAccess(Function &F, AAResults &AAR);
+
 /// Computes function attributes in post-order over the call graph.
 ///
 /// By operating in post-order, this pass computes precise attributes for
@@ -43,7 +56,7 @@ Pass *createPostOrderFunctionAttrsLegacyPass();
 /// This pass provides a general RPO or "top down" propagation of
 /// function attributes. For a few (rare) cases, we can deduce significantly
 /// more about function attributes by working in RPO, so this pass
-/// provides the compliment to the post-order pass above where the majority of
+/// provides the complement to the post-order pass above where the majority of
 /// deduction is performed.
 // FIXME: Currently there is no RPO CGSCC pass structure to slide into and so
 // this is a boring module pass, but eventually it should be an RPO CGSCC pass
diff --git a/include/llvm/Transforms/IPO/FunctionImport.h b/include/llvm/Transforms/IPO/FunctionImport.h
index eaea092c91795324d11dd9c58abb39afc0e9c13e..ed5742ab8b564f527445457b04bf6c19d7c6949c 100644
--- a/include/llvm/Transforms/IPO/FunctionImport.h
+++ b/include/llvm/Transforms/IPO/FunctionImport.h
@@ -53,12 +53,8 @@ public:
       : Index(Index), ModuleLoader(std::move(ModuleLoader)) {}
 
   /// Import functions in Module \p M based on the supplied import list.
-  /// \p ForceImportReferencedDiscardableSymbols will set the ModuleLinker in
-  /// a mode where referenced discarable symbols in the source modules will be
-  /// imported as well even if they are not present in the ImportList.
   Expected<bool>
-  importFunctions(Module &M, const ImportMapTy &ImportList,
-                  bool ForceImportReferencedDiscardableSymbols = false);
+  importFunctions(Module &M, const ImportMapTy &ImportList);
 
 private:
   /// The summaries index used to trigger importing.
diff --git a/include/llvm/Transforms/IPO/PassManagerBuilder.h b/include/llvm/Transforms/IPO/PassManagerBuilder.h
index 1782e7e37b46233abee2f4d909813e11b5789fb0..247382c35eebfd82fc5e930a48ef139c0118d245 100644
--- a/include/llvm/Transforms/IPO/PassManagerBuilder.h
+++ b/include/llvm/Transforms/IPO/PassManagerBuilder.h
@@ -132,10 +132,15 @@ public:
   /// added to the per-module passes.
   Pass *Inliner;
 
-  /// The module summary index to use for passing information between the
-  /// regular LTO phase and the thin LTO backends, for example the CFI and
-  /// devirtualization type tests.
-  ModuleSummaryIndex *Summary = nullptr;
+  /// The module summary index to use for exporting information from the
+  /// regular LTO phase, for example for the CFI and devirtualization type
+  /// tests.
+  ModuleSummaryIndex *ExportSummary = nullptr;
+
+  /// The module summary index to use for importing information to the
+  /// thin LTO backends, for example for the CFI and devirtualization type
+  /// tests.
+  const ModuleSummaryIndex *ImportSummary = nullptr;
 
   bool DisableTailCalls;
   bool DisableUnitAtATime;
@@ -153,6 +158,7 @@ public:
   bool PrepareForLTO;
   bool PrepareForThinLTO;
   bool PerformThinLTO;
+  bool DivergentTarget;
 
   /// Enable profile instrumentation pass.
   bool EnablePGOInstrGen;
diff --git a/include/llvm/Transforms/InstrProfiling.h b/include/llvm/Transforms/InstrProfiling.h
index e3897df073b6ab0054f62133f6c7028a8176c424..e303dcf7acfef1794d4fd4bd82690f875d432ac2 100644
--- a/include/llvm/Transforms/InstrProfiling.h
+++ b/include/llvm/Transforms/InstrProfiling.h
@@ -59,6 +59,11 @@ private:
   GlobalVariable *NamesVar;
   size_t NamesSize;
 
+  // The start value of precise value profile range for memory intrinsic sizes.
+  int64_t MemOPSizeRangeStart;
+  // The end value of precise value profile range for memory intrinsic sizes.
+  int64_t MemOPSizeRangeLast;
+
   bool isMachO() const;
 
   /// Get the section name for the counter variables.
diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h
index de398a6945100f93ee4e8a8c7088c0152cfca906..01a3975a4f2cc20107e1c5c06001987b3ca6f5ae 100644
--- a/include/llvm/Transforms/Instrumentation.h
+++ b/include/llvm/Transforms/Instrumentation.h
@@ -86,7 +86,9 @@ ModulePass *createGCOVProfilerPass(const GCOVOptions &Options =
 ModulePass *createPGOInstrumentationGenLegacyPass();
 ModulePass *
 createPGOInstrumentationUseLegacyPass(StringRef Filename = StringRef(""));
-ModulePass *createPGOIndirectCallPromotionLegacyPass(bool InLTO = false);
+ModulePass *createPGOIndirectCallPromotionLegacyPass(bool InLTO = false,
+                                                     bool SamplePGO = false);
+FunctionPass *createPGOMemOPSizeOptLegacyPass();
 
 // Helper function to check if it is legal to promote indirect call \p Inst
 // to a direct call of function \p F. Stores the reason in \p Reason.
@@ -102,9 +104,12 @@ bool isLegalToPromote(Instruction *Inst, Function *F, const char **Reason);
 // TotalCount is the profile count value that the instruction executes.
 // Count is the profile count value that F is the target function.
 // These two values are used to update the branch weight.
+// If \p AttachProfToDirectCall is true, a prof metadata is attached to the
+// new direct call to contain \p Count.
 // Returns the promoted direct call instruction.
 Instruction *promoteIndirectCall(Instruction *Inst, Function *F, uint64_t Count,
-                                 uint64_t TotalCount);
+                                 uint64_t TotalCount,
+                                 bool AttachProfToDirectCall);
 
 /// Options for the frontend instrumentation based profiling pass.
 struct InstrProfOptions {
diff --git a/include/llvm/Transforms/PGOInstrumentation.h b/include/llvm/Transforms/PGOInstrumentation.h
index 1b449c9abdc275841fefff908757121e0da2328a..19263f0f8071d66ff355162b572e15a4046d36cd 100644
--- a/include/llvm/Transforms/PGOInstrumentation.h
+++ b/include/llvm/Transforms/PGOInstrumentation.h
@@ -38,11 +38,24 @@ private:
 /// The indirect function call promotion pass.
 class PGOIndirectCallPromotion : public PassInfoMixin<PGOIndirectCallPromotion> {
 public:
-  PGOIndirectCallPromotion(bool IsInLTO = false) : InLTO(IsInLTO) {}
+  PGOIndirectCallPromotion(bool IsInLTO = false, bool SamplePGO = false)
+      : InLTO(IsInLTO), SamplePGO(SamplePGO) {}
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM);
+
 private:
   bool InLTO;
+  bool SamplePGO;
 };
 
+/// The profile size based optimization pass for memory intrinsics.
+class PGOMemOPSizeOpt : public PassInfoMixin<PGOMemOPSizeOpt> {
+public:
+  PGOMemOPSizeOpt() {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+void setProfMetadata(Module *M, Instruction *TI, ArrayRef<uint64_t> EdgeCounts,
+                     uint64_t MaxCount);
+
 } // End llvm namespace
 #endif
diff --git a/include/llvm/Transforms/Scalar.h b/include/llvm/Transforms/Scalar.h
index b62b05574b5be404d98bd707fc405b49412b2f55..ba0a3ee1287a4b6c22f5900bfabcd2e4f7e6f8b1 100644
--- a/include/llvm/Transforms/Scalar.h
+++ b/include/llvm/Transforms/Scalar.h
@@ -169,7 +169,8 @@ Pass *createLoopStrengthReducePass();
 //
 // LoopUnswitch - This pass is a simple loop unswitching pass.
 //
-Pass *createLoopUnswitchPass(bool OptimizeForSize = false);
+Pass *createLoopUnswitchPass(bool OptimizeForSize = false,
+                             bool hasBranchDivergence = false);
 
 //===----------------------------------------------------------------------===//
 //
@@ -181,11 +182,11 @@ Pass *createLoopInstSimplifyPass();
 //
 // LoopUnroll - This pass is a simple loop unrolling pass.
 //
-Pass *createLoopUnrollPass(int Threshold = -1, int Count = -1,
+Pass *createLoopUnrollPass(int OptLevel = 2, int Threshold = -1, int Count = -1,
                            int AllowPartial = -1, int Runtime = -1,
                            int UpperBound = -1);
 // Create an unrolling pass for full unrolling that uses exact trip count only.
-Pass *createSimpleLoopUnrollPass();
+Pass *createSimpleLoopUnrollPass(int OptLevel = 2);
 
 //===----------------------------------------------------------------------===//
 //
@@ -259,6 +260,14 @@ FunctionPass *createJumpThreadingPass(int Threshold = -1);
 FunctionPass *createCFGSimplificationPass(
     int Threshold = -1, std::function<bool(const Function &)> Ftor = nullptr);
 
+//===----------------------------------------------------------------------===//
+//
+// LateCFGSimplification - Like CFGSimplification, but may also
+// convert switches to lookup tables.
+//
+FunctionPass *createLateCFGSimplificationPass(
+    int Threshold = -1, std::function<bool(const Function &)> Ftor = nullptr);
+
 //===----------------------------------------------------------------------===//
 //
 // FlattenCFG - flatten CFG, reduce number of conditional branches by using
diff --git a/include/llvm/Transforms/Scalar/GVNExpression.h b/include/llvm/Transforms/Scalar/GVNExpression.h
index c967bb3adc10c14833ff4410e1b7db2663aecf49..2670a0c1a5339b7cf4b157ce2f65519b9f0b7850 100644
--- a/include/llvm/Transforms/Scalar/GVNExpression.h
+++ b/include/llvm/Transforms/Scalar/GVNExpression.h
@@ -18,6 +18,7 @@
 
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/MemorySSA.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Value.h"
@@ -26,7 +27,6 @@
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
 #include <algorithm>
 #include <cassert>
 #include <iterator>
@@ -43,11 +43,13 @@ enum ExpressionType {
   ET_Unknown,
   ET_BasicStart,
   ET_Basic,
-  ET_Call,
   ET_AggregateValue,
   ET_Phi,
+  ET_MemoryStart,
+  ET_Call,
   ET_Load,
   ET_Store,
+  ET_MemoryEnd,
   ET_BasicEnd
 };
 
@@ -65,15 +67,14 @@ public:
 
   static unsigned getEmptyKey() { return ~0U; }
   static unsigned getTombstoneKey() { return ~1U; }
-
+  bool operator!=(const Expression &Other) const { return !(*this == Other); }
   bool operator==(const Expression &Other) const {
     if (getOpcode() != Other.getOpcode())
       return false;
     if (getOpcode() == getEmptyKey() || getOpcode() == getTombstoneKey())
       return true;
     // Compare the expression type for anything but load and store.
-    // For load and store we set the opcode to zero.
-    // This is needed for load coercion.
+    // For load and store we set the opcode to zero to make them equal.
     if (getExpressionType() != ET_Load && getExpressionType() != ET_Store &&
         getExpressionType() != Other.getExpressionType())
       return false;
@@ -87,9 +88,8 @@ public:
   void setOpcode(unsigned opcode) { Opcode = opcode; }
   ExpressionType getExpressionType() const { return EType; }
 
-  virtual hash_code getHashValue() const {
-    return hash_combine(getExpressionType(), getOpcode());
-  }
+  // We deliberately leave the expression type out of the hash value.
+  virtual hash_code getHashValue() const { return getOpcode(); }
 
   //
   // Debugging support
@@ -106,7 +106,10 @@ public:
     OS << "}";
   }
 
-  void dump() const { print(dbgs()); }
+  LLVM_DUMP_METHOD void dump() const {
+    print(dbgs());
+    dbgs() << "\n";
+  }
 };
 
 inline raw_ostream &operator<<(raw_ostream &OS, const Expression &E) {
@@ -200,7 +203,7 @@ public:
   }
 
   hash_code getHashValue() const override {
-    return hash_combine(getExpressionType(), getOpcode(), ValueType,
+    return hash_combine(this->Expression::getHashValue(), ValueType,
                         hash_combine_range(op_begin(), op_end()));
   }
 
@@ -241,32 +244,53 @@ public:
   op_inserter &operator++(int) { return *this; }
 };
 
-class CallExpression final : public BasicExpression {
+class MemoryExpression : public BasicExpression {
 private:
-  CallInst *Call;
-  MemoryAccess *DefiningAccess;
+  const MemoryAccess *MemoryLeader;
 
 public:
-  CallExpression(unsigned NumOperands, CallInst *C, MemoryAccess *DA)
-      : BasicExpression(NumOperands, ET_Call), Call(C), DefiningAccess(DA) {}
-  CallExpression() = delete;
-  CallExpression(const CallExpression &) = delete;
-  CallExpression &operator=(const CallExpression &) = delete;
-  ~CallExpression() override;
+  MemoryExpression(unsigned NumOperands, enum ExpressionType EType,
+                   const MemoryAccess *MemoryLeader)
+      : BasicExpression(NumOperands, EType), MemoryLeader(MemoryLeader){};
 
+  MemoryExpression() = delete;
+  MemoryExpression(const MemoryExpression &) = delete;
+  MemoryExpression &operator=(const MemoryExpression &) = delete;
   static bool classof(const Expression *EB) {
-    return EB->getExpressionType() == ET_Call;
+    return EB->getExpressionType() > ET_MemoryStart &&
+           EB->getExpressionType() < ET_MemoryEnd;
+  }
+  hash_code getHashValue() const override {
+    return hash_combine(this->BasicExpression::getHashValue(), MemoryLeader);
   }
 
   bool equals(const Expression &Other) const override {
     if (!this->BasicExpression::equals(Other))
       return false;
-    const auto &OE = cast<CallExpression>(Other);
-    return DefiningAccess == OE.DefiningAccess;
+    const MemoryExpression &OtherMCE = cast<MemoryExpression>(Other);
+
+    return MemoryLeader == OtherMCE.MemoryLeader;
   }
 
-  hash_code getHashValue() const override {
-    return hash_combine(this->BasicExpression::getHashValue(), DefiningAccess);
+  const MemoryAccess *getMemoryLeader() const { return MemoryLeader; }
+  void setMemoryLeader(const MemoryAccess *ML) { MemoryLeader = ML; }
+};
+
+class CallExpression final : public MemoryExpression {
+private:
+  CallInst *Call;
+
+public:
+  CallExpression(unsigned NumOperands, CallInst *C,
+                 const MemoryAccess *MemoryLeader)
+      : MemoryExpression(NumOperands, ET_Call, MemoryLeader), Call(C) {}
+  CallExpression() = delete;
+  CallExpression(const CallExpression &) = delete;
+  CallExpression &operator=(const CallExpression &) = delete;
+  ~CallExpression() override;
+
+  static bool classof(const Expression *EB) {
+    return EB->getExpressionType() == ET_Call;
   }
 
   //
@@ -276,22 +300,23 @@ public:
     if (PrintEType)
       OS << "ExpressionTypeCall, ";
     this->BasicExpression::printInternal(OS, false);
-    OS << " represents call at " << Call;
+    OS << " represents call at ";
+    Call->printAsOperand(OS);
   }
 };
 
-class LoadExpression final : public BasicExpression {
+class LoadExpression final : public MemoryExpression {
 private:
   LoadInst *Load;
-  MemoryAccess *DefiningAccess;
   unsigned Alignment;
 
 public:
-  LoadExpression(unsigned NumOperands, LoadInst *L, MemoryAccess *DA)
-      : LoadExpression(ET_Load, NumOperands, L, DA) {}
+  LoadExpression(unsigned NumOperands, LoadInst *L,
+                 const MemoryAccess *MemoryLeader)
+      : LoadExpression(ET_Load, NumOperands, L, MemoryLeader) {}
   LoadExpression(enum ExpressionType EType, unsigned NumOperands, LoadInst *L,
-                 MemoryAccess *DA)
-      : BasicExpression(NumOperands, EType), Load(L), DefiningAccess(DA) {
+                 const MemoryAccess *MemoryLeader)
+      : MemoryExpression(NumOperands, EType, MemoryLeader), Load(L) {
     Alignment = L ? L->getAlignment() : 0;
   }
   LoadExpression() = delete;
@@ -306,18 +331,11 @@ public:
   LoadInst *getLoadInst() const { return Load; }
   void setLoadInst(LoadInst *L) { Load = L; }
 
-  MemoryAccess *getDefiningAccess() const { return DefiningAccess; }
-  void setDefiningAccess(MemoryAccess *MA) { DefiningAccess = MA; }
   unsigned getAlignment() const { return Alignment; }
   void setAlignment(unsigned Align) { Alignment = Align; }
 
   bool equals(const Expression &Other) const override;
 
-  hash_code getHashValue() const override {
-    return hash_combine(getOpcode(), getType(), DefiningAccess,
-                        hash_combine_range(op_begin(), op_end()));
-  }
-
   //
   // Debugging support
   //
@@ -325,22 +343,22 @@ public:
     if (PrintEType)
       OS << "ExpressionTypeLoad, ";
     this->BasicExpression::printInternal(OS, false);
-    OS << " represents Load at " << Load;
-    OS << " with DefiningAccess " << *DefiningAccess;
+    OS << " represents Load at ";
+    Load->printAsOperand(OS);
+    OS << " with MemoryLeader " << *getMemoryLeader();
   }
 };
 
-class StoreExpression final : public BasicExpression {
+class StoreExpression final : public MemoryExpression {
 private:
   StoreInst *Store;
   Value *StoredValue;
-  MemoryAccess *DefiningAccess;
 
 public:
   StoreExpression(unsigned NumOperands, StoreInst *S, Value *StoredValue,
-                  MemoryAccess *DA)
-      : BasicExpression(NumOperands, ET_Store), Store(S),
-        StoredValue(StoredValue), DefiningAccess(DA) {}
+                  const MemoryAccess *MemoryLeader)
+      : MemoryExpression(NumOperands, ET_Store, MemoryLeader), Store(S),
+        StoredValue(StoredValue) {}
   StoreExpression() = delete;
   StoreExpression(const StoreExpression &) = delete;
   StoreExpression &operator=(const StoreExpression &) = delete;
@@ -351,27 +369,18 @@ public:
   }
 
   StoreInst *getStoreInst() const { return Store; }
-  MemoryAccess *getDefiningAccess() const { return DefiningAccess; }
   Value *getStoredValue() const { return StoredValue; }
 
   bool equals(const Expression &Other) const override;
 
-  hash_code getHashValue() const override {
-    // This deliberately does not include the stored value we compare it as part
-    // of equals, and only against other stores.
-    return hash_combine(getOpcode(), getType(), DefiningAccess,
-                        hash_combine_range(op_begin(), op_end()));
-  }
-
-  //
   // Debugging support
   //
   void printInternal(raw_ostream &OS, bool PrintEType) const override {
     if (PrintEType)
       OS << "ExpressionTypeStore, ";
     this->BasicExpression::printInternal(OS, false);
-    OS << " represents Store at " << Store;
-    OS << " with DefiningAccess " << *DefiningAccess;
+    OS << " represents Store  " << *Store;
+    OS << " with MemoryLeader " << *getMemoryLeader();
   }
 };
 
@@ -527,8 +536,8 @@ public:
   }
 
   hash_code getHashValue() const override {
-    return hash_combine(getExpressionType(), VariableValue->getType(),
-                        VariableValue);
+    return hash_combine(this->Expression::getHashValue(),
+                        VariableValue->getType(), VariableValue);
   }
 
   //
@@ -566,8 +575,8 @@ public:
   }
 
   hash_code getHashValue() const override {
-    return hash_combine(getExpressionType(), ConstantValue->getType(),
-                        ConstantValue);
+    return hash_combine(this->Expression::getHashValue(),
+                        ConstantValue->getType(), ConstantValue);
   }
 
   //
@@ -604,7 +613,7 @@ public:
   }
 
   hash_code getHashValue() const override {
-    return hash_combine(getExpressionType(), Inst);
+    return hash_combine(this->Expression::getHashValue(), Inst);
   }
 
   //
diff --git a/include/llvm/Transforms/Scalar/JumpThreading.h b/include/llvm/Transforms/Scalar/JumpThreading.h
index f96741c0127d8ebf65b9bc5a4523dbd8da1a9a38..1da86132591b79353d2dce37664832184b919152 100644
--- a/include/llvm/Transforms/Scalar/JumpThreading.h
+++ b/include/llvm/Transforms/Scalar/JumpThreading.h
@@ -17,12 +17,14 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/LazyValueInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/ValueHandle.h"
 
 namespace llvm {
@@ -59,9 +61,11 @@ enum ConstantPreference { WantInteger, WantBlockAddress };
 class JumpThreadingPass : public PassInfoMixin<JumpThreadingPass> {
   TargetLibraryInfo *TLI;
   LazyValueInfo *LVI;
+  AliasAnalysis *AA;
   std::unique_ptr<BlockFrequencyInfo> BFI;
   std::unique_ptr<BranchProbabilityInfo> BPI;
   bool HasProfileData = false;
+  bool HasGuards = false;
 #ifdef NDEBUG
   SmallPtrSet<const BasicBlock *, 16> LoopHeaders;
 #else
@@ -88,7 +92,8 @@ public:
 
   // Glue for old PM.
   bool runImpl(Function &F, TargetLibraryInfo *TLI_, LazyValueInfo *LVI_,
-               bool HasProfileData_, std::unique_ptr<BlockFrequencyInfo> BFI_,
+               AliasAnalysis *AA_, bool HasProfileData_,
+               std::unique_ptr<BlockFrequencyInfo> BFI_,
                std::unique_ptr<BranchProbabilityInfo> BPI_);
 
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
@@ -122,6 +127,9 @@ public:
   bool TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB);
   bool TryToUnfoldSelectInCurrBB(BasicBlock *BB);
 
+  bool ProcessGuards(BasicBlock *BB);
+  bool ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard, BranchInst *BI);
+
 private:
   BasicBlock *SplitBlockPreds(BasicBlock *BB, ArrayRef<BasicBlock *> Preds,
                               const char *Suffix);
diff --git a/include/llvm/Transforms/Scalar/LoopPassManager.h b/include/llvm/Transforms/Scalar/LoopPassManager.h
index e90aebde23e8e5c44b58fa3f10fd829464565e5e..715b11d3d974999deba9d6ccd96342fa298dc35f 100644
--- a/include/llvm/Transforms/Scalar/LoopPassManager.h
+++ b/include/llvm/Transforms/Scalar/LoopPassManager.h
@@ -263,9 +263,6 @@ public:
     // manager handles all the invalidation at that layer.
     PreservedAnalyses PA = LoopCanonicalizationFPM.run(F, AM);
 
-    // Setup the loop analysis manager from its proxy.
-    LoopAnalysisManager &LAM =
-        AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
     // Get the loop structure for this function
     LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
 
@@ -282,6 +279,14 @@ public:
                                        AM.getResult<TargetLibraryAnalysis>(F),
                                        AM.getResult<TargetIRAnalysis>(F)};
 
+    // Setup the loop analysis manager from its proxy. It is important that
+    // this is only done when there are loops to process and we have built the
+    // LoopStandardAnalysisResults object. The loop analyses cached in this
+    // manager have access to those analysis results and so it must invalidate
+    // itself when they go away.
+    LoopAnalysisManager &LAM =
+        AM.getResult<LoopAnalysisManagerFunctionProxy>(F).getManager();
+
     // A postorder worklist of loops to process.
     SmallPriorityWorklist<Loop *, 4> Worklist;
 
diff --git a/include/llvm/Transforms/Scalar/LoopUnrollPass.h b/include/llvm/Transforms/Scalar/LoopUnrollPass.h
index 0b22080284e5e344e538c8b0ee9f4297edbb7ae0..7253bd09766efdec5665dd7a628d34520f94dd77 100644
--- a/include/llvm/Transforms/Scalar/LoopUnrollPass.h
+++ b/include/llvm/Transforms/Scalar/LoopUnrollPass.h
@@ -18,9 +18,10 @@ namespace llvm {
 
 class LoopUnrollPass : public PassInfoMixin<LoopUnrollPass> {
   const bool AllowPartialUnrolling;
+  const int OptLevel;
 
-  explicit LoopUnrollPass(bool AllowPartialUnrolling)
-      : AllowPartialUnrolling(AllowPartialUnrolling) {}
+  explicit LoopUnrollPass(bool AllowPartialUnrolling, int OptLevel)
+      : AllowPartialUnrolling(AllowPartialUnrolling), OptLevel(OptLevel) {}
 
 public:
   /// Create an instance of the loop unroll pass that will support both full
@@ -28,16 +29,16 @@ public:
   ///
   /// This uses the target information (or flags) to control the thresholds for
   /// different unrolling stategies but supports all of them.
-  static LoopUnrollPass create() {
-    return LoopUnrollPass(/*AllowPartialUnrolling*/ true);
+  static LoopUnrollPass create(int OptLevel = 2) {
+    return LoopUnrollPass(/*AllowPartialUnrolling*/ true, OptLevel);
   }
 
   /// Create an instance of the loop unroll pass that only does full loop
   /// unrolling.
   ///
   /// This will disable any runtime or partial unrolling.
-  static LoopUnrollPass createFull() {
-    return LoopUnrollPass(/*AllowPartialUnrolling*/ false);
+  static LoopUnrollPass createFull(int OptLevel = 2) {
+    return LoopUnrollPass(/*AllowPartialUnrolling*/ false, OptLevel);
   }
 
   PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM,
diff --git a/include/llvm/Transforms/Scalar/SimplifyCFG.h b/include/llvm/Transforms/Scalar/SimplifyCFG.h
index 96e1658c00b0eaa9e384177d1e6a25b4ff45eda6..54b51c405ad411c7c0d6c76a67571eb87d51bbd8 100644
--- a/include/llvm/Transforms/Scalar/SimplifyCFG.h
+++ b/include/llvm/Transforms/Scalar/SimplifyCFG.h
@@ -27,13 +27,16 @@ namespace llvm {
 /// by the rest of the mid-level optimizer.
 class SimplifyCFGPass : public PassInfoMixin<SimplifyCFGPass> {
   int BonusInstThreshold;
+  bool LateSimplifyCFG;
 
 public:
-  /// \brief Construct a pass with the default thresholds.
+  /// \brief Construct a pass with the default thresholds
+  /// and switch optimizations.
   SimplifyCFGPass();
 
-  /// \brief Construct a pass with a specific bonus threshold.
-  SimplifyCFGPass(int BonusInstThreshold);
+  /// \brief Construct a pass with a specific bonus threshold
+  /// and optional switch optimizations.
+  SimplifyCFGPass(int BonusInstThreshold, bool LateSimplifyCFG);
 
   /// \brief Run the pass over the function.
   PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
diff --git a/include/llvm/Transforms/Utils/BuildLibCalls.h b/include/llvm/Transforms/Utils/BuildLibCalls.h
index 2d2a85905d0ea9ae87cdf0cd863ac1e78e8f0644..a067a685b8372c7600f6dc6c8bd431c268083ea7 100644
--- a/include/llvm/Transforms/Utils/BuildLibCalls.h
+++ b/include/llvm/Transforms/Utils/BuildLibCalls.h
@@ -84,14 +84,14 @@ namespace llvm {
   /// value with the same type. If 'Op' is a long double, 'l' is added as the
   /// suffix of name, if 'Op' is a float, we add a 'f' suffix.
   Value *emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
-                              const AttributeSet &Attrs);
+                              const AttributeList &Attrs);
 
   /// Emit a call to the binary function named 'Name' (e.g. 'fmin'). This
   /// function is known to take type matching 'Op1' and 'Op2' and return one
   /// value with the same type. If 'Op1/Op2' are long double, 'l' is added as
   /// the suffix of name, if 'Op1/Op2' are float, we add a 'f' suffix.
   Value *emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
-                                  IRBuilder<> &B, const AttributeSet &Attrs);
+                               IRBuilder<> &B, const AttributeList &Attrs);
 
   /// Emit a call to the putchar function. This assumes that Char is an integer.
   Value *emitPutChar(Value *Char, IRBuilder<> &B, const TargetLibraryInfo *TLI);
diff --git a/include/llvm/Transforms/Utils/Cloning.h b/include/llvm/Transforms/Utils/Cloning.h
index efb799c21beeb77ab6cf8461b038d798b8094fe8..337305a0a82ce065f00e4b9e6eb12b2193b6c031 100644
--- a/include/llvm/Transforms/Utils/Cloning.h
+++ b/include/llvm/Transforms/Utils/Cloning.h
@@ -245,6 +245,16 @@ Loop *cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
 void remapInstructionsInBlocks(const SmallVectorImpl<BasicBlock *> &Blocks,
                                ValueToValueMapTy &VMap);
 
+/// Split edge between BB and PredBB and duplicate all non-Phi instructions
+/// from BB between its beginning and the StopAt instruction into the split
+/// block. Phi nodes are not duplicated, but their uses are handled correctly:
+/// we replace them with the uses of corresponding Phi inputs. ValueMapping
+/// is used to map the original instructions from BB to their newly-created
+/// copies. Returns the split block.
+BasicBlock *
+DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,
+                                    Instruction *StopAt,
+                                    ValueToValueMapTy &ValueMapping);
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_UTILS_CLONING_H
diff --git a/include/llvm/Transforms/Utils/FunctionComparator.h b/include/llvm/Transforms/Utils/FunctionComparator.h
index a613fc31a5e30b13c3e1b0da8b7e2ea6faa243a4..ee58d1d138f74c77a86d80b8dd6b4997990815c6 100644
--- a/include/llvm/Transforms/Utils/FunctionComparator.h
+++ b/include/llvm/Transforms/Utils/FunctionComparator.h
@@ -314,7 +314,7 @@ protected:
 private:
   int cmpOrderings(AtomicOrdering L, AtomicOrdering R) const;
   int cmpInlineAsm(const InlineAsm *L, const InlineAsm *R) const;
-  int cmpAttrs(const AttributeSet L, const AttributeSet R) const;
+  int cmpAttrs(const AttributeList L, const AttributeList R) const;
   int cmpRangeMetadata(const MDNode *L, const MDNode *R) const;
   int cmpOperandBundlesSchema(const Instruction *L, const Instruction *R) const;
 
diff --git a/include/llvm/Transforms/Utils/FunctionImportUtils.h b/include/llvm/Transforms/Utils/FunctionImportUtils.h
index f18cd92310b4aa521f7edf025c83745b729f56c2..b9fbef04cdc3d6a7c0741a12b0cd41cef8c28d28 100644
--- a/include/llvm/Transforms/Utils/FunctionImportUtils.h
+++ b/include/llvm/Transforms/Utils/FunctionImportUtils.h
@@ -32,7 +32,7 @@ class FunctionImportGlobalProcessing {
 
   /// Globals to import from this module, all other functions will be
   /// imported as declarations instead of definitions.
-  DenseSet<const GlobalValue *> *GlobalsToImport;
+  SetVector<GlobalValue *> *GlobalsToImport;
 
   /// Set to true if the given ModuleSummaryIndex contains any functions
   /// from this source module, in which case we must conservatively assume
@@ -85,7 +85,7 @@ class FunctionImportGlobalProcessing {
 public:
   FunctionImportGlobalProcessing(
       Module &M, const ModuleSummaryIndex &Index,
-      DenseSet<const GlobalValue *> *GlobalsToImport = nullptr)
+      SetVector<GlobalValue *> *GlobalsToImport = nullptr)
       : M(M), ImportIndex(Index), GlobalsToImport(GlobalsToImport) {
     // If we have a ModuleSummaryIndex but no function to import,
     // then this is the primary module being compiled in a ThinLTO
@@ -104,16 +104,15 @@ public:
 
   bool run();
 
-  static bool
-  doImportAsDefinition(const GlobalValue *SGV,
-                       DenseSet<const GlobalValue *> *GlobalsToImport);
+  static bool doImportAsDefinition(const GlobalValue *SGV,
+                                   SetVector<GlobalValue *> *GlobalsToImport);
 };
 
 /// Perform in-place global value handling on the given Module for
 /// exported local functions renamed and promoted for ThinLTO.
 bool renameModuleForThinLTO(
     Module &M, const ModuleSummaryIndex &Index,
-    DenseSet<const GlobalValue *> *GlobalsToImport = nullptr);
+    SetVector<GlobalValue *> *GlobalsToImport = nullptr);
 
 } // End llvm namespace
 
diff --git a/include/llvm/Transforms/Utils/Local.h b/include/llvm/Transforms/Utils/Local.h
index 490a765c3fabcb93dc5436eb214c80bfcf0194cb..4933712fb8adc370d27669957728304a687136e7 100644
--- a/include/llvm/Transforms/Utils/Local.h
+++ b/include/llvm/Transforms/Utils/Local.h
@@ -49,8 +49,6 @@ class LazyValueInfo;
 
 template<typename T> class SmallVectorImpl;
 
-typedef SmallVector<DbgValueInst *, 1> DbgValueList;
-
 //===----------------------------------------------------------------------===//
 //  Local constant propagation.
 //
@@ -74,6 +72,12 @@ bool ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions = false,
 bool isInstructionTriviallyDead(Instruction *I,
                                 const TargetLibraryInfo *TLI = nullptr);
 
+/// Return true if the result produced by the instruction would have no side
+/// effects if it was not used. This is equivalent to checking whether
+/// isInstructionTriviallyDead would be true if the use count was 0.
+bool wouldInstructionBeTriviallyDead(Instruction *I,
+                                     const TargetLibraryInfo *TLI = nullptr);
+
 /// If the specified value is a trivially dead instruction, delete it.
 /// If that makes any of its operands trivially dead, delete them too,
 /// recursively. Return true if any instructions were deleted.
@@ -138,7 +142,8 @@ bool EliminateDuplicatePHINodes(BasicBlock *BB);
 /// eliminate.
 bool SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI,
                  unsigned BonusInstThreshold, AssumptionCache *AC = nullptr,
-                 SmallPtrSetImpl<BasicBlock *> *LoopHeaders = nullptr);
+                 SmallPtrSetImpl<BasicBlock *> *LoopHeaders = nullptr,
+                 bool LateSimplifyCFG = false);
 
 /// This function is used to flatten a CFG. For example, it uses parallel-and
 /// and parallel-or mode to collapse if-conditions and merge if-regions with
@@ -278,8 +283,11 @@ bool LowerDbgDeclare(Function &F);
 /// Finds the llvm.dbg.declare intrinsic corresponding to an alloca, if any.
 DbgDeclareInst *FindAllocaDbgDeclare(Value *V);
 
-/// Finds the llvm.dbg.value intrinsics corresponding to an alloca, if any.
-void FindAllocaDbgValues(DbgValueList &DbgValues, Value *V);
+/// Finds the llvm.dbg.value intrinsics describing a value.
+void findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V);
+
+/// Constants for \p replaceDbgDeclare and friends.
+enum { NoDeref = false, WithDeref = true };
 
 /// Replaces llvm.dbg.declare instruction when the address it describes
 /// is replaced with a new value. If Deref is true, an additional DW_OP_deref is
@@ -306,6 +314,11 @@ bool replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
 void replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
                               DIBuilder &Builder, int Offset = 0);
 
+/// Assuming the instruction \p I is going to be deleted, attempt to salvage any
+/// dbg.value intrinsics referring to \p I by rewriting its effect into a
+/// DIExpression.
+void salvageDebugInfo(Instruction &I);
+
 /// Remove all instructions from a basic block other than it's terminator
 /// and any present EH pad instructions.
 unsigned removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB);
diff --git a/include/llvm/Transforms/Utils/LowerMemIntrinsics.h b/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
new file mode 100644
index 0000000000000000000000000000000000000000..e4906b709e4bf16d60ea33edd0a98d5a8596c7e5
--- /dev/null
+++ b/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
@@ -0,0 +1,44 @@
+//===- llvm/Transforms/Utils/LowerMemIntrinsics.h ---------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Lower memset, memcpy, memmove intrinsics to loops (e.g. for targets without
+// library support).
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_LOWERMEMINTRINSICS_H
+#define LLVM_TRANSFORMS_UTILS_LOWERMEMINTRINSICS_H
+
+namespace llvm {
+
+class Instruction;
+class MemCpyInst;
+class MemMoveInst;
+class MemSetInst;
+class Value;
+
+/// Emit a loop implementing the semantics of llvm.memcpy with the equivalent
+/// arguments at \p InsertBefore.
+void createMemCpyLoop(Instruction *InsertBefore,
+                      Value *SrcAddr, Value *DstAddr, Value *CopyLen,
+                      unsigned SrcAlign, unsigned DestAlign,
+                      bool SrcIsVolatile, bool DstIsVolatile);
+
+/// Expand \p MemCpy as a loop. \p MemCpy is not deleted.
+void expandMemCpyAsLoop(MemCpyInst *MemCpy);
+
+/// Expand \p MemMove as a loop. \p MemMove is not deleted.
+void expandMemMoveAsLoop(MemMoveInst *MemMove);
+
+/// Expand \p MemSet as a loop. \p MemSet is not deleted.
+void expandMemSetAsLoop(MemSetInst *MemSet);
+
+} // End llvm namespace
+
+#endif
diff --git a/include/llvm/Transforms/Utils/MemorySSAUpdater.h b/include/llvm/Transforms/Utils/MemorySSAUpdater.h
deleted file mode 100644
index 3fb759a891a28033f154ef8bf7dc3b9d0736a6a0..0000000000000000000000000000000000000000
--- a/include/llvm/Transforms/Utils/MemorySSAUpdater.h
+++ /dev/null
@@ -1,88 +0,0 @@
-//===- MemorySSAUpdater.h - Memory SSA Updater-------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// \file
-// \brief An automatic updater for MemorySSA that handles arbitrary insertion,
-// deletion, and moves.  It performs phi insertion where necessary, and
-// automatically updates the MemorySSA IR to be correct.
-// While updating loads or removing instructions is often easy enough to not
-// need this, updating stores should generally not be attemped outside this
-// API.
-//
-// Basic API usage:
-// Create the memory access you want for the instruction (this is mainly so
-// we know where it is, without having to duplicate the entire set of create
-// functions MemorySSA supports).
-// Call insertDef or insertUse depending on whether it's a MemoryUse or a
-// MemoryDef.
-// That's it.
-//
-// For moving, first, move the instruction itself using the normal SSA
-// instruction moving API, then just call moveBefore, moveAfter,or moveTo with
-// the right arguments.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TRANSFORMS_UTILS_MEMORYSSAUPDATER_H
-#define LLVM_TRANSFORMS_UTILS_MEMORYSSAUPDATER_H
-
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/Dominators.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/OperandTraits.h"
-#include "llvm/IR/Type.h"
-#include "llvm/IR/Use.h"
-#include "llvm/IR/User.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Pass.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
-
-namespace llvm {
-
-class Function;
-class Instruction;
-class MemoryAccess;
-class LLVMContext;
-class raw_ostream;
-
-class MemorySSAUpdater {
-private:
-  MemorySSA *MSSA;
-  SmallVector<MemoryPhi *, 8> InsertedPHIs;
-  SmallPtrSet<BasicBlock *, 8> VisitedBlocks;
-
-public:
-  MemorySSAUpdater(MemorySSA *MSSA) : MSSA(MSSA) {}
-  void insertDef(MemoryDef *Def);
-  void insertUse(MemoryUse *Use);
-  void moveBefore(MemoryUseOrDef *What, MemoryUseOrDef *Where);
-  void moveAfter(MemoryUseOrDef *What, MemoryUseOrDef *Where);
-  void moveToPlace(MemoryUseOrDef *What, BasicBlock *BB,
-                   MemorySSA::InsertionPlace Where);
-private:
-  // Move What before Where in the MemorySSA IR.
-  template <class WhereType>
-  void moveTo(MemoryUseOrDef *What, BasicBlock *BB,
-              WhereType Where);
-  MemoryAccess *getPreviousDef(MemoryAccess *);
-  MemoryAccess *getPreviousDefInBlock(MemoryAccess *);
-  MemoryAccess *getPreviousDefFromEnd(BasicBlock *);
-  MemoryAccess *getPreviousDefRecursive(BasicBlock *);
-  MemoryAccess *recursePhi(MemoryAccess *Phi);
-  template <class RangeType>
-  MemoryAccess *tryRemoveTrivialPhi(MemoryPhi *Phi, RangeType &Operands);
-  void fixupDefs(const SmallVectorImpl<MemoryAccess *> &);
-};
-} // end namespace llvm
-
-#endif // LLVM_TRANSFORMS_UTILS_MEMORYSSAUPDATER_H
diff --git a/include/llvm/Transforms/Utils/ModuleUtils.h b/include/llvm/Transforms/Utils/ModuleUtils.h
index 27508799f8e0f5d1027042ef7ca9d4f02e9c82d2..f5e843e2e8b55688e939b1f0667ca68a3c73ca5d 100644
--- a/include/llvm/Transforms/Utils/ModuleUtils.h
+++ b/include/llvm/Transforms/Utils/ModuleUtils.h
@@ -46,6 +46,9 @@ void appendToGlobalDtors(Module &M, Function *F, int Priority,
 // getOrInsertFunction returns a bitcast.
 Function *checkSanitizerInterfaceFunction(Constant *FuncOrBitcast);
 
+Function *declareSanitizerInitFunction(Module &M, StringRef InitName,
+                                       ArrayRef<Type *> InitArgTypes);
+
 /// \brief Creates sanitizer constructor function, and calls sanitizer's init
 /// function from it.
 /// \return Returns pair of pointers to constructor, and init functions
diff --git a/include/llvm/Transforms/Utils/PredicateInfo.h b/include/llvm/Transforms/Utils/PredicateInfo.h
new file mode 100644
index 0000000000000000000000000000000000000000..1322c686eb900eec24c4298c48165724e4830363
--- /dev/null
+++ b/include/llvm/Transforms/Utils/PredicateInfo.h
@@ -0,0 +1,295 @@
+//===- PredicateInfo.h - Build PredicateInfo ----------------------*-C++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief  This file implements the PredicateInfo analysis, which creates an Extended
+/// SSA form for operations used in branch comparisons and llvm.assume
+/// comparisons.
+///
+/// Copies of these operations are inserted into the true/false edge (and after
+/// assumes), and information attached to the copies.  All uses of the original
+/// operation in blocks dominated by the true/false edge (and assume), are
+/// replaced with uses of the copies.  This enables passes to easily and sparsely
+/// propagate condition based info into the operations that may be affected.
+///
+/// Example:
+/// %cmp = icmp eq i32 %x, 50
+/// br i1 %cmp, label %true, label %false
+/// true:
+/// ret i32 %x
+/// false:
+/// ret i32 1
+///
+/// will become
+///
+/// %cmp = icmp eq i32 %x, 50
+/// br i1 %cmp, label %true, label %false
+/// true:
+/// %x.0 = call @llvm.ssa_copy.i32(i32 %x)
+/// ret i32 %x.0
+/// false:
+/// ret i32 1
+///
+/// Using getPredicateInfoFor on x.0 will give you the comparison it is
+/// dominated by (the icmp), and that you are located in the true edge of that
+/// comparison, which tells you x.0 is 50.
+///
+/// In order to reduce the number of copies inserted, predicateinfo is only
+/// inserted where it would actually be live.  This means if there are no uses of
+/// an operation dominated by the branch edges, or by an assume, the associated
+/// predicate info is never inserted.
+///
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TRANSFORMS_UTILS_PREDICATEINFO_H
+#define LLVM_TRANSFORMS_UTILS_PREDICATEINFO_H
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/ilist.h"
+#include "llvm/ADT/ilist_node.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/OperandTraits.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/PassAnalysisSupport.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <iterator>
+#include <memory>
+#include <utility>
+
+namespace llvm {
+
+class DominatorTree;
+class Function;
+class Instruction;
+class MemoryAccess;
+class LLVMContext;
+class raw_ostream;
+class OrderedBasicBlock;
+
+enum PredicateType { PT_Branch, PT_Assume, PT_Switch };
+
+// Base class for all predicate information we provide.
+// All of our predicate information has at least a comparison.
+class PredicateBase : public ilist_node<PredicateBase> {
+public:
+  PredicateType Type;
+  // The original operand before we renamed it.
+  // This can be used by passes, when destroying predicateinfo, to know
+  // whether they can just drop the intrinsic, or have to merge metadata.
+  Value *OriginalOp;
+  PredicateBase(const PredicateBase &) = delete;
+  PredicateBase &operator=(const PredicateBase &) = delete;
+  PredicateBase() = delete;
+  virtual ~PredicateBase() = default;
+
+protected:
+  PredicateBase(PredicateType PT, Value *Op) : Type(PT), OriginalOp(Op) {}
+};
+
+class PredicateWithCondition : public PredicateBase {
+public:
+  Value *Condition;
+  static inline bool classof(const PredicateBase *PB) {
+    return PB->Type == PT_Assume || PB->Type == PT_Branch || PB->Type == PT_Switch;
+  }
+
+protected:
+  PredicateWithCondition(PredicateType PT, Value *Op, Value *Condition)
+      : PredicateBase(PT, Op), Condition(Condition) {}
+};
+
+// Provides predicate information for assumes.  Since assumes are always true,
+// we simply provide the assume instruction, so you can tell your relative
+// position to it.
+class PredicateAssume : public PredicateWithCondition {
+public:
+  IntrinsicInst *AssumeInst;
+  PredicateAssume(Value *Op, IntrinsicInst *AssumeInst, Value *Condition)
+      : PredicateWithCondition(PT_Assume, Op, Condition),
+        AssumeInst(AssumeInst) {}
+  PredicateAssume() = delete;
+  static inline bool classof(const PredicateBase *PB) {
+    return PB->Type == PT_Assume;
+  }
+};
+
+// Mixin class for edge predicates.  The FROM block is the block where the
+// predicate originates, and the TO block is the block where the predicate is
+// valid.
+class PredicateWithEdge : public PredicateWithCondition {
+public:
+  BasicBlock *From;
+  BasicBlock *To;
+  PredicateWithEdge() = delete;
+  static inline bool classof(const PredicateBase *PB) {
+    return PB->Type == PT_Branch || PB->Type == PT_Switch;
+  }
+
+protected:
+  PredicateWithEdge(PredicateType PType, Value *Op, BasicBlock *From,
+                    BasicBlock *To, Value *Cond)
+      : PredicateWithCondition(PType, Op, Cond), From(From), To(To) {}
+};
+
+// Provides predicate information for branches.
+class PredicateBranch : public PredicateWithEdge {
+public:
+  // If true, SplitBB is the true successor, otherwise it's the false successor.
+  bool TrueEdge;
+  PredicateBranch(Value *Op, BasicBlock *BranchBB, BasicBlock *SplitBB,
+                  Value *Condition, bool TakenEdge)
+      : PredicateWithEdge(PT_Branch, Op, BranchBB, SplitBB, Condition),
+        TrueEdge(TakenEdge) {}
+  PredicateBranch() = delete;
+  static inline bool classof(const PredicateBase *PB) {
+    return PB->Type == PT_Branch;
+  }
+};
+
+class PredicateSwitch : public PredicateWithEdge {
+public:
+  Value *CaseValue;
+  // This is the switch instruction.
+  SwitchInst *Switch;
+  PredicateSwitch(Value *Op, BasicBlock *SwitchBB, BasicBlock *TargetBB,
+                  Value *CaseValue, SwitchInst *SI)
+      : PredicateWithEdge(PT_Switch, Op, SwitchBB, TargetBB,
+                          SI->getCondition()),
+        CaseValue(CaseValue), Switch(SI) {}
+  PredicateSwitch() = delete;
+  static inline bool classof(const PredicateBase *PB) {
+    return PB->Type == PT_Switch;
+  }
+};
+
+// This name is used in a few places, so kick it into their own namespace
+namespace PredicateInfoClasses {
+struct ValueDFS;
+}
+
+/// \brief Encapsulates PredicateInfo, including all data associated with the
+/// predicates it creates.
+class PredicateInfo {
+private:
+  // Used to store information about each value we might rename.
+  struct ValueInfo {
+    // Information about each possible copy. During processing, this is each
+    // inserted info. After processing, we move the uninserted ones to the
+    // uninserted vector.
+    SmallVector<PredicateBase *, 4> Infos;
+    SmallVector<PredicateBase *, 4> UninsertedInfos;
+  };
+  // This owns all the predicate infos in the function, placed or not.
+  iplist<PredicateBase> AllInfos;
+
+public:
+  PredicateInfo(Function &, DominatorTree &, AssumptionCache &);
+  ~PredicateInfo();
+
+  void verifyPredicateInfo() const;
+
+  void dump() const;
+  void print(raw_ostream &) const;
+
+  const PredicateBase *getPredicateInfoFor(const Value *V) const {
+    return PredicateMap.lookup(V);
+  }
+
+protected:
+  // Used by PredicateInfo annotater, dumpers, and wrapper pass.
+  friend class PredicateInfoAnnotatedWriter;
+  friend class PredicateInfoPrinterLegacyPass;
+
+private:
+  void buildPredicateInfo();
+  void processAssume(IntrinsicInst *, BasicBlock *, SmallPtrSetImpl<Value *> &);
+  void processBranch(BranchInst *, BasicBlock *, SmallPtrSetImpl<Value *> &);
+  void processSwitch(SwitchInst *, BasicBlock *, SmallPtrSetImpl<Value *> &);
+  void renameUses(SmallPtrSetImpl<Value *> &);
+  using ValueDFS = PredicateInfoClasses::ValueDFS;
+  typedef SmallVectorImpl<ValueDFS> ValueDFSStack;
+  void convertUsesToDFSOrdered(Value *, SmallVectorImpl<ValueDFS> &);
+  Value *materializeStack(unsigned int &, ValueDFSStack &, Value *);
+  bool stackIsInScope(const ValueDFSStack &, const ValueDFS &) const;
+  void popStackUntilDFSScope(ValueDFSStack &, const ValueDFS &);
+  ValueInfo &getOrCreateValueInfo(Value *);
+  void addInfoFor(SmallPtrSetImpl<Value *> &OpsToRename, Value *Op,
+                  PredicateBase *PB);
+  const ValueInfo &getValueInfo(Value *) const;
+  Function &F;
+  DominatorTree &DT;
+  AssumptionCache &AC;
+  // This maps from copy operands to Predicate Info. Note that it does not own
+  // the Predicate Info, they belong to the ValueInfo structs in the ValueInfos
+  // vector.
+  DenseMap<const Value *, const PredicateBase *> PredicateMap;
+  // This stores info about each operand or comparison result we make copies
+  // of.  The real ValueInfos start at index 1, index 0 is unused so that we can
+  // more easily detect invalid indexing.
+  SmallVector<ValueInfo, 32> ValueInfos;
+  // This gives the index into the ValueInfos array for a given Value.  Because
+  // 0 is not a valid Value Info index, you can use DenseMap::lookup and tell
+  // whether it returned a valid result.
+  DenseMap<Value *, unsigned int> ValueInfoNums;
+  // OrderedBasicBlocks used during sorting uses
+  DenseMap<const BasicBlock *, std::unique_ptr<OrderedBasicBlock>> OBBMap;
+  // The set of edges along which we can only handle phi uses, due to critical
+  // edges.
+  DenseSet<std::pair<BasicBlock *, BasicBlock *>> EdgeUsesOnly;
+};
+
+// This pass does eager building and then printing of PredicateInfo.
+// It is used by the tests to be able to build, dump, and verify
+// PredicateInfo.
+class PredicateInfoPrinterLegacyPass : public FunctionPass {
+public:
+  PredicateInfoPrinterLegacyPass();
+
+  static char ID;
+  bool runOnFunction(Function &) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+/// \brief Printer pass for \c PredicateInfo.
+class PredicateInfoPrinterPass
+    : public PassInfoMixin<PredicateInfoPrinterPass> {
+  raw_ostream &OS;
+
+public:
+  explicit PredicateInfoPrinterPass(raw_ostream &OS) : OS(OS) {}
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+/// \brief Verifier pass for \c PredicateInfo.
+struct PredicateInfoVerifierPass : PassInfoMixin<PredicateInfoVerifierPass> {
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+
+} // end namespace llvm
+
+#endif // LLVM_TRANSFORMS_UTILS_PREDICATEINFO_H
diff --git a/include/llvm/Transforms/Utils/PromoteMemToReg.h b/include/llvm/Transforms/Utils/PromoteMemToReg.h
index b548072c413ea7b4874fcc050b0d6dfad940f94a..bb8a61a474f20505d1d849de00f881ff8ce271af 100644
--- a/include/llvm/Transforms/Utils/PromoteMemToReg.h
+++ b/include/llvm/Transforms/Utils/PromoteMemToReg.h
@@ -38,10 +38,7 @@ bool isAllocaPromotable(const AllocaInst *AI);
 /// does not modify the CFG of the function at all.  All allocas must be from
 /// the same function.
 ///
-/// If AST is specified, the specified tracker is updated to reflect changes
-/// made to the IR.
 void PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
-                     AliasSetTracker *AST = nullptr,
                      AssumptionCache *AC = nullptr);
 
 } // End llvm namespace
diff --git a/include/llvm/Transforms/Utils/UnrollLoop.h b/include/llvm/Transforms/Utils/UnrollLoop.h
index 885edc135c4ef2617df76cf8d9359aa58460918f..a3115ad16914da94ecc73e6f8731b2676fcc2855 100644
--- a/include/llvm/Transforms/Utils/UnrollLoop.h
+++ b/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -53,7 +53,8 @@ bool UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
                                 bool PreserveLCSSA);
 
 void computePeelCount(Loop *L, unsigned LoopSize,
-                      TargetTransformInfo::UnrollingPreferences &UP);
+                      TargetTransformInfo::UnrollingPreferences &UP,
+                      unsigned &TripCount);
 
 bool peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI, ScalarEvolution *SE,
               DominatorTree *DT, AssumptionCache *AC, bool PreserveLCSSA);
diff --git a/include/llvm/Transforms/Utils/VNCoercion.h b/include/llvm/Transforms/Utils/VNCoercion.h
new file mode 100644
index 0000000000000000000000000000000000000000..1baa9b66e491906e07421bff18e8a85bbe851a38
--- /dev/null
+++ b/include/llvm/Transforms/Utils/VNCoercion.h
@@ -0,0 +1,108 @@
+//===- VNCoercion.h - Value Numbering Coercion Utilities --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file / This file provides routines used by LLVM's value numbering passes to
+/// perform various forms of value extraction from memory when the types are not
+/// identical.  For example, given
+///
+/// store i32 8, i32 *%foo
+/// %a = bitcast i32 *%foo to i16 *
+/// %val = load i16, i16 *%a
+///
+/// It is possible to extract the value of the load of %a from the store to %foo.
+/// These routines know how to tell whether they can do that (the analyze*
+/// routines), and can also insert the necessary IR to do it (the get*
+/// routines).
+
+#ifndef LLVM_TRANSFORMS_UTILS_VNCOERCION_H
+#define LLVM_TRANSFORMS_UTILS_VNCOERCION_H
+#include "llvm/IR/IRBuilder.h"
+
+namespace llvm {
+class Function;
+class StoreInst;
+class LoadInst;
+class MemIntrinsic;
+class Instruction;
+class Value;
+class Type;
+class DataLayout;
+namespace VNCoercion {
+/// Return true if coerceAvailableValueToLoadType would succeed if it was
+/// called.
+bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
+                                     const DataLayout &DL);
+
+/// If we saw a store of a value to memory, and then a load from a must-aliased
+/// pointer of a different type, try to coerce the stored value to the loaded
+/// type.  LoadedTy is the type of the load we want to replace.  IRB is
+/// IRBuilder used to insert new instructions.
+///
+/// If we can't do it, return null.
+Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
+                                      IRBuilder<> &IRB, const DataLayout &DL);
+
+/// This function determines whether a value for the pointer LoadPtr can be
+/// extracted from the store at DepSI.
+///
+/// On success, it returns the offset into DepSI that extraction would start.
+/// On failure, it returns -1.
+int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
+                                   StoreInst *DepSI, const DataLayout &DL);
+
+/// This function determines whether a value for the pointer LoadPtr can be
+/// extracted from the load at DepLI.
+///
+/// On success, it returns the offset into DepLI that extraction would start.
+/// On failure, it returns -1.
+int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
+                                  const DataLayout &DL);
+
+/// This function determines whether a value for the pointer LoadPtr can be
+/// extracted from the memory intrinsic at DepMI.
+///
+/// On success, it returns the offset into DepMI that extraction would start.
+/// On failure, it returns -1.
+int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
+                                     MemIntrinsic *DepMI, const DataLayout &DL);
+
+/// If analyzeLoadFromClobberingStore returned an offset, this function can be
+/// used to actually perform the extraction of the bits from the store. It
+/// inserts instructions to do so at InsertPt, and returns the extracted value.
+Value *getStoreValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
+                            Instruction *InsertPt, const DataLayout &DL);
+// This is the same as getStoreValueForLoad, except it performs no insertion.
+// It only allows constant inputs.
+Constant *getConstantStoreValueForLoad(Constant *SrcVal, unsigned Offset,
+                                       Type *LoadTy, const DataLayout &DL);
+
+/// If analyzeLoadFromClobberingLoad returned an offset, this function can be
+/// used to actually perform the extraction of the bits from the load, including
+/// any necessary load widening.  It inserts instructions to do so at InsertPt,
+/// and returns the extracted value.
+Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy,
+                           Instruction *InsertPt, const DataLayout &DL);
+// This is the same as getLoadValueForLoad, except it is given the load value as
+// a constant. It returns nullptr if it would require widening the load.
+Constant *getConstantLoadValueForLoad(Constant *SrcVal, unsigned Offset,
+                                      Type *LoadTy, const DataLayout &DL);
+
+/// If analyzeLoadFromClobberingMemInst returned an offset, this function can be
+/// used to actually perform the extraction of the bits from the memory
+/// intrinsic.  It inserts instructions to do so at InsertPt, and returns the
+/// extracted value.
+Value *getMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
+                              Type *LoadTy, Instruction *InsertPt,
+                              const DataLayout &DL);
+// This is the same as getMemInstValueForLoad, except it performs no insertion.
+// It returns nullptr if it cannot produce a constant.
+Constant *getConstantMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
+                                         Type *LoadTy, const DataLayout &DL);
+}
+}
+#endif
diff --git a/include/llvm/XRay/Graph.h b/include/llvm/XRay/Graph.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4d34a8a4be34d228f4fe37ab3ea68b5cf10bed7
--- /dev/null
+++ b/include/llvm/XRay/Graph.h
@@ -0,0 +1,494 @@
+//===-- Graph.h - XRay Graph Class ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A Graph Datatype for XRay.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_XRAY_GRAPH_T_H
+#define LLVM_XRAY_GRAPH_T_H
+
+#include <initializer_list>
+#include <stdint.h>
+#include <type_traits>
+#include <utility>
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/iterator.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+namespace xray {
+
+/// A Graph object represents a Directed Graph and is used in XRay to compute
+/// and store function call graphs and associated statistical information.
+///
+/// The graph takes in three template parameters, these are:
+///  - VertexAttribute, this is a structure which is stored for each vertex.
+///    Must be DefaultConstructible, CopyConstructible, CopyAssignable and
+///    Destructible.
+///  - EdgeAttribute, this is a structure which is stored for each edge.
+///    Must be DefaultConstructible, CopyConstructible, CopyAssignable and
+///    Destructible.
+///  - VI, this is a type over which DenseMapInfo is defined and is the type
+///    used to look up vertices, available as VertexIdentifier. Defaults to
+///    int32_t.
+///    If the built in DenseMapInfo is not defined for VI, provide a
+///    specialization of DenseMapInfo for it.
+///
+/// Graph is CopyConstructible, CopyAssignable, MoveConstructible and
+/// MoveAssignable but is not EqualityComparable or LessThanComparable.
+///
+/// Usage Example Graph with weighted edges and vertices:
+///   Graph<int, int, int> G;
+///
+///   G[1] = 0;
+///   G[2] = 2;
+///   G[{1,2}] = 1;
+///   G[{2,1}] = -1;
+///   for(const auto &v : G.vertices()){
+///     // Do something with the vertices in the graph;
+///   }
+///   for(const auto &e : G.edges()){
+///     // Do something with the edges in the graph;
+///   }
+///
+/// Usage Example with StrRef keys.
+///   Graph<int, double, StrRef> StrG;
+///    char va[] = "Vertex A";
+///    char vaa[] = "Vertex A";
+///    char vb[] = "Vertex B"; // Vertices are referenced by String Refs.
+///    StrG[va] = 0;
+///    StrG[vb] = 1;
+///    StrG[{va, vb}] = 1.0;
+///    cout() << StrG[vaa] << " " << StrG[{vaa, vb}]; //prints "0 1.0".
+///
+template <typename VertexAttribute, typename EdgeAttribute,
+          typename VI = int32_t>
+class Graph {
+public:
+  /// These objects are used to name edges and vertices in the graph.
+  typedef VI VertexIdentifier;
+  typedef std::pair<VI, VI> EdgeIdentifier;
+
+  /// This type is the value_type of all iterators which range over vertices,
+  /// Determined by the Vertices DenseMap
+  using VertexValueType =
+      detail::DenseMapPair<VertexIdentifier, VertexAttribute>;
+
+  /// This type is the value_type of all iterators which range over edges,
+  /// Determined by the Edges DenseMap.
+  using EdgeValueType = detail::DenseMapPair<EdgeIdentifier, EdgeAttribute>;
+
+  using size_type = std::size_t;
+
+private:
+  /// The type used for storing the EdgeAttribute for each edge in the graph
+  using EdgeMapT = DenseMap<EdgeIdentifier, EdgeAttribute>;
+
+  /// The type used for storing the VertexAttribute for each vertex in
+  /// the graph.
+  using VertexMapT = DenseMap<VertexIdentifier, VertexAttribute>;
+
+  /// The type used for storing the edges entering a vertex. Indexed by
+  /// the VertexIdentifier of the start of the edge. Only used to determine
+  /// where the incoming edges are, the EdgeIdentifiers are stored in an
+  /// EdgeMapT.
+  using NeighborSetT = DenseSet<VertexIdentifier>;
+
+  /// The type storing the NeighborSetT corresponding to each vertex in
+  /// the graph (When a vertex has an incoming edge incident to it)
+  using NeighborLookupT = DenseMap<VertexIdentifier, NeighborSetT>;
+
+private:
+  /// Stores the map from the start and end vertex of an edge to its
+  /// EdgeAttribute
+  EdgeMapT Edges;
+
+  /// Stores the map from VertexIdentifier to VertexAttribute
+  VertexMapT Vertices;
+
+  /// Allows fast lookup for the incoming edge set of any given vertex.
+  NeighborLookupT InNeighbors;
+
+  /// Allows fast lookup for the outgoing edge set of any given vertex.
+  NeighborLookupT OutNeighbors;
+
+  /// An Iterator adapter using a NeighborSetT iterator as a base iterator,
+  /// and storing the VertexIdentifier the iterator range comes from. The
+  /// dereference operator is then performed using a pointer to the graph's edge
+  /// set.
+  template <bool IsConst, bool IsOut,
+            typename BaseIt = typename NeighborSetT::const_iterator,
+            typename T = typename std::conditional<IsConst, const EdgeValueType,
+                                                   EdgeValueType>::type>
+  class NeighborEdgeIteratorT
+      : public iterator_adaptor_base<
+            NeighborEdgeIteratorT<IsConst, IsOut>, BaseIt,
+            typename std::iterator_traits<BaseIt>::iterator_category, T> {
+    using InternalEdgeMapT =
+        typename std::conditional<IsConst, const EdgeMapT, EdgeMapT>::type;
+
+    friend class NeighborEdgeIteratorT<false, IsOut, BaseIt, EdgeValueType>;
+    friend class NeighborEdgeIteratorT<true, IsOut, BaseIt,
+                                       const EdgeValueType>;
+
+    InternalEdgeMapT *MP;
+    VertexIdentifier SI;
+
+  public:
+    template <bool IsConstDest,
+              typename = typename std::enable_if<IsConstDest && !IsConst>::type>
+    operator NeighborEdgeIteratorT<IsConstDest, IsOut, BaseIt,
+                                   const EdgeValueType>() const {
+      return NeighborEdgeIteratorT<IsConstDest, IsOut, BaseIt,
+                                   const EdgeValueType>(this->I, MP, SI);
+    }
+
+    NeighborEdgeIteratorT() = default;
+    NeighborEdgeIteratorT(BaseIt _I, InternalEdgeMapT *_MP,
+                          VertexIdentifier _SI)
+        : iterator_adaptor_base<
+              NeighborEdgeIteratorT<IsConst, IsOut>, BaseIt,
+              typename std::iterator_traits<BaseIt>::iterator_category, T>(_I),
+          MP(_MP), SI(_SI) {}
+
+    T &operator*() const {
+      if (!IsOut)
+        return *(MP->find({*(this->I), SI}));
+      else
+        return *(MP->find({SI, *(this->I)}));
+    }
+  };
+
+public:
+  /// A const iterator type for iterating through the set of edges entering a
+  /// vertex.
+  ///
+  /// Has a const EdgeValueType as its value_type
+  using ConstInEdgeIterator = NeighborEdgeIteratorT<true, false>;
+
+  /// An iterator type for iterating through the set of edges entering a vertex.
+  ///
+  /// Has an EdgeValueType as its value_type
+  using InEdgeIterator = NeighborEdgeIteratorT<false, false>;
+
+  /// A const iterator type for iterating through the set of edges leaving a
+  /// vertex.
+  ///
+  /// Has a const EdgeValueType as its value_type
+  using ConstOutEdgeIterator = NeighborEdgeIteratorT<true, true>;
+
+  /// An iterator type for iterating through the set of edges leaving a vertex.
+  ///
+  /// Has an EdgeValueType as its value_type
+  using OutEdgeIterator = NeighborEdgeIteratorT<false, true>;
+
+  /// A class for ranging over the in- or out-edges incident to a vertex.
+  ///
+  /// Like all views in this class it provides methods to get the beginning and
+  /// past the range iterators for the range, as well as methods to determine
+  /// the number of elements in the range and whether the range is empty.
+  template <bool isConst, bool isOut> class InOutEdgeView {
+  public:
+    using iterator = NeighborEdgeIteratorT<isConst, isOut>;
+    using const_iterator = NeighborEdgeIteratorT<true, isOut>;
+    using GraphT = typename std::conditional<isConst, const Graph, Graph>::type;
+    using InternalEdgeMapT =
+        typename std::conditional<isConst, const EdgeMapT, EdgeMapT>::type;
+
+  private:
+    InternalEdgeMapT &M;
+    const VertexIdentifier A;
+    const NeighborLookupT &NL;
+
+  public:
+    iterator begin() {
+      auto It = NL.find(A);
+      if (It == NL.end())
+        return iterator();
+      return iterator(It->second.begin(), &M, A);
+    }
+
+    const_iterator cbegin() const {
+      auto It = NL.find(A);
+      if (It == NL.end())
+        return const_iterator();
+      return const_iterator(It->second.begin(), &M, A);
+    }
+
+    const_iterator begin() const { return cbegin(); }
+
+    iterator end() {
+      auto It = NL.find(A);
+      if (It == NL.end())
+        return iterator();
+      return iterator(It->second.end(), &M, A);
+    }
+    const_iterator cend() const {
+      auto It = NL.find(A);
+      if (It == NL.end())
+        return const_iterator();
+      return const_iterator(It->second.end(), &M, A);
+    }
+
+    const_iterator end() const { return cend(); }
+
+    size_type size() const {
+      auto I = NL.find(A);
+      if (I == NL.end())
+        return 0;
+      else
+        return I->second.size();
+    }
+
+    bool empty() const { return NL.count(A) == 0; };
+
+    InOutEdgeView(GraphT &G, VertexIdentifier A)
+        : M(G.Edges), A(A), NL(isOut ? G.OutNeighbors : G.InNeighbors) {}
+  };
+
+  /// A const iterator type for iterating through the whole vertex set of the
+  /// graph.
+  ///
+  /// Has a const VertexValueType as its value_type
+  using ConstVertexIterator = typename VertexMapT::const_iterator;
+
+  /// An iterator type for iterating through the whole vertex set of the graph.
+  ///
+  /// Has a VertexValueType as its value_type
+  using VertexIterator = typename VertexMapT::iterator;
+
+  /// A class for ranging over the vertices in the graph.
+  ///
+  /// Like all views in this class it provides methods to get the beginning and
+  /// past the range iterators for the range, as well as methods to determine
+  /// the number of elements in the range and whether the range is empty.
+  template <bool isConst> class VertexView {
+  public:
+    using iterator = typename std::conditional<isConst, ConstVertexIterator,
+                                               VertexIterator>::type;
+    using const_iterator = ConstVertexIterator;
+    using GraphT = typename std::conditional<isConst, const Graph, Graph>::type;
+
+  private:
+    GraphT &G;
+
+  public:
+    iterator begin() { return G.Vertices.begin(); }
+    iterator end() { return G.Vertices.end(); }
+    const_iterator cbegin() const { return G.Vertices.cbegin(); }
+    const_iterator cend() const { return G.Vertices.cend(); }
+    const_iterator begin() const { return G.Vertices.begin(); }
+    const_iterator end() const { return G.Vertices.end(); }
+    size_type size() const { return G.Vertices.size(); }
+    bool empty() const { return G.Vertices.empty(); }
+    VertexView(GraphT &_G) : G(_G) {}
+  };
+
+  /// A const iterator for iterating through the entire edge set of the graph.
+  ///
+  /// Has a const EdgeValueType as its value_type
+  using ConstEdgeIterator = typename EdgeMapT::const_iterator;
+
+  /// An iterator for iterating through the entire edge set of the graph.
+  ///
+  /// Has an EdgeValueType as its value_type
+  using EdgeIterator = typename EdgeMapT::iterator;
+
+  /// A class for ranging over all the edges in the graph.
+  ///
+  /// Like all views in this class it provides methods to get the beginning and
+  /// past the range iterators for the range, as well as methods to determine
+  /// the number of elements in the range and whether the range is empty.
+  template <bool isConst> class EdgeView {
+  public:
+    using iterator = typename std::conditional<isConst, ConstEdgeIterator,
+                                               EdgeIterator>::type;
+    using const_iterator = ConstEdgeIterator;
+    using GraphT = typename std::conditional<isConst, const Graph, Graph>::type;
+
+  private:
+    GraphT &G;
+
+  public:
+    iterator begin() { return G.Edges.begin(); }
+    iterator end() { return G.Edges.end(); }
+    const_iterator cbegin() const { return G.Edges.cbegin(); }
+    const_iterator cend() const { return G.Edges.cend(); }
+    const_iterator begin() const { return G.Edges.begin(); }
+    const_iterator end() const { return G.Edges.end(); }
+    size_type size() const { return G.Edges.size(); }
+    bool empty() const { return G.Edges.empty(); }
+    EdgeView(GraphT &_G) : G(_G) {}
+  };
+
+public:
+  // TODO: implement constructor to enable Graph Initialisation.
+  // Something like:
+  //   Graph<int, int, int> G(
+  //   {1, 2, 3, 4, 5},
+  //   {{1, 2}, {2, 3}, {3, 4}});
+
+  /// Empty the Graph
+  void clear() {
+    Edges.clear();
+    Vertices.clear();
+    InNeighbors.clear();
+    OutNeighbors.clear();
+  }
+
+  /// Returns a view object allowing iteration over the vertices of the graph.
+  /// also allows access to the size of the vertex set.
+  VertexView<false> vertices() { return VertexView<false>(*this); }
+
+  VertexView<true> vertices() const { return VertexView<true>(*this); }
+
+  /// Returns a view object allowing iteration over the edges of the graph.
+  /// The view also provides access to the size of the edge set.
+  EdgeView<false> edges() { return EdgeView<false>(*this); }
+
+  EdgeView<true> edges() const { return EdgeView<true>(*this); }
+
+  /// Returns a view object allowing iteration over the edges which start at
+  /// a vertex I.
+  InOutEdgeView<false, true> outEdges(const VertexIdentifier I) {
+    return InOutEdgeView<false, true>(*this, I);
+  }
+
+  InOutEdgeView<true, true> outEdges(const VertexIdentifier I) const {
+    return InOutEdgeView<true, true>(*this, I);
+  }
+
+  /// Returns a view object allowing iteration over the edges which point to
+  /// a vertex I.
+  InOutEdgeView<false, false> inEdges(const VertexIdentifier I) {
+    return InOutEdgeView<false, false>(*this, I);
+  }
+
+  InOutEdgeView<true, false> inEdges(const VertexIdentifier I) const {
+    return InOutEdgeView<true, false>(*this, I);
+  }
+
+  /// Looks up the vertex with identifier I, if it does not exist it default
+  /// constructs it.
+  VertexAttribute &operator[](const VertexIdentifier &I) {
+    return Vertices.FindAndConstruct(I).second;
+  }
+
+  /// Looks up the edge with identifier I; if it does not exist it is default
+  /// constructed, and if its endpoints do not exist they are default
+  /// constructed as well. The neighbor sets of both endpoints are updated.
+  EdgeAttribute &operator[](const EdgeIdentifier &I) {
+    EdgeAttribute &Attr = Edges.FindAndConstruct(I).second;
+    Vertices.FindAndConstruct(I.first);
+    Vertices.FindAndConstruct(I.second);
+    OutNeighbors[I.first].insert(I.second);
+    InNeighbors[I.second].insert(I.first);
+    return Attr;
+  }
+
+  /// Looks up the vertex with identifier I; yields an error if it is absent.
+  Expected<VertexAttribute &> at(const VertexIdentifier &I) {
+    auto Found = Vertices.find(I);
+    if (Found != Vertices.end())
+      return Found->second;
+    return make_error<StringError>(
+        "Vertex Identifier Does Not Exist",
+        std::make_error_code(std::errc::invalid_argument));
+  }
+
+  Expected<const VertexAttribute &> at(const VertexIdentifier &I) const {
+    auto Found = Vertices.find(I);
+    if (Found != Vertices.end())
+      return Found->second;
+    return make_error<StringError>(
+        "Vertex Identifier Does Not Exist",
+        std::make_error_code(std::errc::invalid_argument));
+  }
+
+  /// Looks up the edge with identifier I; yields an error if it is absent.
+  Expected<EdgeAttribute &> at(const EdgeIdentifier &I) {
+    auto Found = Edges.find(I);
+    if (Found != Edges.end())
+      return Found->second;
+    return make_error<StringError>(
+        "Edge Identifier Does Not Exist",
+        std::make_error_code(std::errc::invalid_argument));
+  }
+
+  Expected<const EdgeAttribute &> at(const EdgeIdentifier &I) const {
+    auto Found = Edges.find(I);
+    if (Found != Edges.end())
+      return Found->second;
+    return make_error<StringError>(
+        "Edge Identifier Does Not Exist",
+        std::make_error_code(std::errc::invalid_argument));
+  }
+
+  /// Looks for a vertex with identifier I by forwarding to Vertices.count;
+  /// returns 1 if one exists, and 0 otherwise.
+  size_type count(const VertexIdentifier &I) const {
+    return Vertices.count(I);
+  }
+
+  /// Looks for an edge with identifier I by forwarding to Edges.count;
+  /// returns 1 if one exists, and 0 otherwise.
+  size_type count(const EdgeIdentifier &I) const { return Edges.count(I); }
+
+  /// Inserts a vertex with identifier Val.first and attribute Val.second,
+  /// forwarding to Vertices.insert and returning its (iterator, bool) result.
+  std::pair<VertexIterator, bool>
+  insert(const std::pair<VertexIdentifier, VertexAttribute> &Val) {
+    return Vertices.insert(Val);
+  }
+
+  std::pair<VertexIterator, bool>
+  insert(std::pair<VertexIdentifier, VertexAttribute> &&Val) {
+    return Vertices.insert(std::move(Val));
+  }
+
+  /// Inserts an edge with identifier Val.first and attribute Val.second. If
+  /// the edge already exists, the graph is left unchanged and the bool of the
+  /// returned pair is false.
+  std::pair<EdgeIterator, bool>
+  insert(const std::pair<EdgeIdentifier, EdgeAttribute> &Val) {
+    auto Result = Edges.insert(Val);
+    if (Result.second) {
+      // New edge: create missing endpoint vertices and record the adjacency.
+      const auto &EI = Val.first;
+      Vertices.FindAndConstruct(EI.first);
+      Vertices.FindAndConstruct(EI.second);
+      InNeighbors[EI.second].insert(EI.first);
+      OutNeighbors[EI.first].insert(EI.second);
+    }
+    return Result;
+  }
+
+  /// Rvalue overload of the edge insert: moves Val into the edge map. If the
+  /// edge already exists, the graph is left unchanged and the bool of the
+  /// returned pair is false.
+  std::pair<EdgeIterator, bool>
+  insert(std::pair<EdgeIdentifier, EdgeAttribute> &&Val) {
+    // Copy the identifier first; Val must not be read after it is moved from.
+    auto EI = Val.first;
+    auto Result = Edges.insert(std::move(Val));
+    if (Result.second) {
+      Vertices.FindAndConstruct(EI.first);
+      Vertices.FindAndConstruct(EI.second);
+      InNeighbors[EI.second].insert(EI.first);
+      OutNeighbors[EI.first].insert(EI.second);
+    }
+    return Result;
+  }
+};
+}
+}
+#endif
diff --git a/include/llvm/XRay/InstrumentationMap.h b/include/llvm/XRay/InstrumentationMap.h
new file mode 100644
index 0000000000000000000000000000000000000000..f7286c52ff42e219d1adf7a2db1c65a39ececb89
--- /dev/null
+++ b/include/llvm/XRay/InstrumentationMap.h
@@ -0,0 +1,129 @@
+//===- InstrumentationMap.h - XRay Instrumentation Map ----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Defines the interface for extracting the instrumentation map from an
+// XRay-instrumented binary.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_XRAY_INSTRUMENTATION_MAP_H
+#define LLVM_XRAY_INSTRUMENTATION_MAP_H
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/YAMLTraits.h"
+#include <cstdint>
+#include <unordered_map>
+#include <vector>
+
+namespace llvm {
+
+namespace xray {
+
+// Forward declare to make a friend.
+class InstrumentationMap;
+
+/// Loads the instrumentation map from |Filename|. This auto-deduces the type of
+/// the instrumentation map.
+Expected<InstrumentationMap> loadInstrumentationMap(StringRef Filename);
+
+/// Represents an XRay instrumentation sled entry from an object file.
+struct SledEntry {
+  /// Each entry here represents the kinds of supported instrumentation map
+  /// entries.
+  enum class FunctionKinds { ENTRY, EXIT, TAIL };
+
+  /// The address of the sled.
+  uint64_t Address;
+
+  /// The address of the function.
+  uint64_t Function;
+
+  /// The kind of sled.
+  FunctionKinds Kind;
+
+  /// Whether the sled was annotated to always be instrumented.
+  bool AlwaysInstrument;
+};
+
+struct YAMLXRaySledEntry {
+  int32_t FuncId;
+  yaml::Hex64 Address;
+  yaml::Hex64 Function;
+  SledEntry::FunctionKinds Kind;
+  bool AlwaysInstrument;
+};
+
+/// The InstrumentationMap represents the computed function ids and indicated
+/// function addresses from an object file (or a YAML file). This provides an
+/// interface to just the mapping between the function id, and the function
+/// address.
+///
+/// We also provide raw access to the actual instrumentation map entries we find
+/// associated with a particular object file.
+class InstrumentationMap {
+public:
+  using FunctionAddressMap = std::unordered_map<int32_t, uint64_t>;
+  using FunctionAddressReverseMap = std::unordered_map<uint64_t, int32_t>;
+  using SledContainer = std::vector<SledEntry>;
+
+private:
+  SledContainer Sleds;
+  FunctionAddressMap FunctionAddresses;
+  FunctionAddressReverseMap FunctionIds;
+  friend Expected<InstrumentationMap> loadInstrumentationMap(StringRef);
+
+public:
+  /// Provides a raw accessor to the unordered map of function addresses.
+  const FunctionAddressMap &getFunctionAddresses() const {
+    return FunctionAddresses;
+  }
+
+  /// Returns an XRay computed function id, provided a function address.
+  Optional<int32_t> getFunctionId(uint64_t Addr) const;
+
+  /// Returns the function address for a function id.
+  Optional<uint64_t> getFunctionAddr(int32_t FuncId) const;
+
+  /// Provide read-only access to the entries of the instrumentation map.
+  const SledContainer &sleds() const { return Sleds; }
+};
+
+} // end namespace xray
+
+namespace yaml {
+
+template <> struct ScalarEnumerationTraits<xray::SledEntry::FunctionKinds> {
+  static void enumeration(IO &IO, xray::SledEntry::FunctionKinds &Kind) {
+    IO.enumCase(Kind, "function-enter", xray::SledEntry::FunctionKinds::ENTRY);
+    IO.enumCase(Kind, "function-exit", xray::SledEntry::FunctionKinds::EXIT);
+    IO.enumCase(Kind, "tail-exit", xray::SledEntry::FunctionKinds::TAIL);
+  }
+};
+
+template <> struct MappingTraits<xray::YAMLXRaySledEntry> {
+  static void mapping(IO &IO, xray::YAMLXRaySledEntry &Entry) {
+    IO.mapRequired("id", Entry.FuncId);
+    IO.mapRequired("address", Entry.Address);
+    IO.mapRequired("function", Entry.Function);
+    IO.mapRequired("kind", Entry.Kind);
+    IO.mapRequired("always-instrument", Entry.AlwaysInstrument);
+  }
+
+  static constexpr bool flow = true;
+};
+
+} // end namespace yaml
+
+} // end namespace llvm
+
+LLVM_YAML_IS_SEQUENCE_VECTOR(xray::YAMLXRaySledEntry)
+
+#endif // LLVM_XRAY_INSTRUMENTATION_MAP_H
diff --git a/include/llvm/XRay/XRayRecord.h b/include/llvm/XRay/XRayRecord.h
index a96846136ec33d8b6147c281125e514621ad3a3d..68c91a40fed1ee119d33620b79425416f7d8f22c 100644
--- a/include/llvm/XRay/XRayRecord.h
+++ b/include/llvm/XRay/XRayRecord.h
@@ -42,6 +42,11 @@ struct XRayFileHeader {
   /// counter (TSC) values. Useful for estimating the amount of time that
   /// elapsed between two TSCs on some platforms.
   uint64_t CycleFrequency = 0;
+
+  // This is different depending on the type of xray record. The naive format
+  // stores a Wallclock timespec. FDR logging stores the size of a thread
+  // buffer.
+  char FreeFormData[16];
 };
 
 /// Determines the supported types of records that could be seen in XRay traces.
@@ -54,8 +59,8 @@ struct XRayRecord {
   /// The type of record.
   uint16_t RecordType;
 
-  /// The CPU where the thread is running. We assume number of CPUs <= 256.
-  uint8_t CPU;
+  /// The CPU where the thread is running. We assume number of CPUs <= 65536.
+  uint16_t CPU;
 
   /// Identifies the type of record.
   RecordTypes Type;
diff --git a/include/llvm/XRay/YAMLXRayRecord.h b/include/llvm/XRay/YAMLXRayRecord.h
index f5836b3922421b478e2b0b8c5f22808ac681dbd4..7e1a4112818edbef48543d9c52dea2767e52df19 100644
--- a/include/llvm/XRay/YAMLXRayRecord.h
+++ b/include/llvm/XRay/YAMLXRayRecord.h
@@ -31,7 +31,7 @@ struct YAMLXRayFileHeader {
 
 struct YAMLXRayRecord {
   uint16_t RecordType;
-  uint8_t CPU;
+  uint16_t CPU;
   RecordTypes Type;
   int32_t FuncId;
   std::string Function;
diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap
index 29e6d66b27ff7fa416934e9e65089c61d6f7b4a2..59b1f1621039f91ecdf2b82afc3ec2ee7ba0025c 100644
--- a/include/llvm/module.modulemap
+++ b/include/llvm/module.modulemap
@@ -276,6 +276,7 @@ module LLVM_Utils {
     textual header "Support/ELFRelocs/SystemZ.def"
     textual header "Support/ELFRelocs/x86_64.def"
     textual header "Support/ELFRelocs/WebAssembly.def"
+    textual header "Support/WasmRelocs/WebAssembly.def"
   }
 
   // This part of the module is usable from both C and C++ code.
@@ -283,12 +284,12 @@ module LLVM_Utils {
     header "Support/ConvertUTF.h"
     export *
   }
-}
 
-module LLVM_CodeGen_MachineValueType {
-  requires cplusplus
-  header "CodeGen/MachineValueType.h"
-  export *
+  module LLVM_CodeGen_MachineValueType {
+    requires cplusplus
+    header "CodeGen/MachineValueType.h"
+    export *
+  }
 }
 
 // This is used for a $src == $build compilation. Otherwise we use
diff --git a/lib/Analysis/AliasAnalysis.cpp b/lib/Analysis/AliasAnalysis.cpp
index fbc030b26aaff041764bd712e9790ec9632fa4ef..4c6423d5c17dd770ec53284328e3159cec381580 100644
--- a/lib/Analysis/AliasAnalysis.cpp
+++ b/lib/Analysis/AliasAnalysis.cpp
@@ -332,8 +332,8 @@ FunctionModRefBehavior AAResults::getModRefBehavior(const Function *F) {
 
 ModRefInfo AAResults::getModRefInfo(const LoadInst *L,
                                     const MemoryLocation &Loc) {
-  // Be conservative in the face of volatile/atomic.
-  if (!L->isUnordered())
+  // Be conservative in the face of atomic.
+  if (isStrongerThan(L->getOrdering(), AtomicOrdering::Unordered))
     return MRI_ModRef;
 
   // If the load address doesn't alias the given address, it doesn't read
@@ -347,8 +347,8 @@ ModRefInfo AAResults::getModRefInfo(const LoadInst *L,
 
 ModRefInfo AAResults::getModRefInfo(const StoreInst *S,
                                     const MemoryLocation &Loc) {
-  // Be conservative in the face of volatile/atomic.
-  if (!S->isUnordered())
+  // Be conservative in the face of atomic.
+  if (isStrongerThan(S->getOrdering(), AtomicOrdering::Unordered))
     return MRI_ModRef;
 
   if (Loc.Ptr) {
diff --git a/lib/Analysis/AliasSetTracker.cpp b/lib/Analysis/AliasSetTracker.cpp
index 701b0e1a592514d0c01bc901f9270619d1b3cc3b..16b711a69ec390f539a06ab90fef19c6450eae5e 100644
--- a/lib/Analysis/AliasSetTracker.cpp
+++ b/lib/Analysis/AliasSetTracker.cpp
@@ -199,9 +199,10 @@ bool AliasSet::aliasesPointer(const Value *Ptr, uint64_t Size,
   // Check the unknown instructions...
   if (!UnknownInsts.empty()) {
     for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i)
-      if (AA.getModRefInfo(UnknownInsts[i],
-                           MemoryLocation(Ptr, Size, AAInfo)) != MRI_NoModRef)
-        return true;
+      if (auto *Inst = getUnknownInst(i))
+        if (AA.getModRefInfo(Inst, MemoryLocation(Ptr, Size, AAInfo)) !=
+            MRI_NoModRef)
+          return true;
   }
 
   return false;
@@ -217,10 +218,12 @@ bool AliasSet::aliasesUnknownInst(const Instruction *Inst,
     return false;
 
   for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) {
-    ImmutableCallSite C1(getUnknownInst(i)), C2(Inst);
-    if (!C1 || !C2 || AA.getModRefInfo(C1, C2) != MRI_NoModRef ||
-        AA.getModRefInfo(C2, C1) != MRI_NoModRef)
-      return true;
+    if (auto *UnknownInst = getUnknownInst(i)) {
+      ImmutableCallSite C1(UnknownInst), C2(Inst);
+      if (!C1 || !C2 || AA.getModRefInfo(C1, C2) != MRI_NoModRef ||
+          AA.getModRefInfo(C2, C1) != MRI_NoModRef)
+        return true;
+    }
   }
 
   for (iterator I = begin(), E = end(); I != E; ++I)
@@ -471,7 +474,8 @@ void AliasSetTracker::add(const AliasSetTracker &AST) {
 
     // If there are any call sites in the alias set, add them to this AST.
     for (unsigned i = 0, e = AS.UnknownInsts.size(); i != e; ++i)
-      add(AS.UnknownInsts[i]);
+      if (auto *Inst = AS.getUnknownInst(i))
+        add(Inst);
 
     // Loop over all of the pointers in this alias set.
     for (AliasSet::iterator ASI = AS.begin(), E = AS.end(); ASI != E; ++ASI) {
@@ -489,19 +493,6 @@ void AliasSetTracker::add(const AliasSetTracker &AST) {
 // dangling pointers to deleted instructions.
 //
 void AliasSetTracker::deleteValue(Value *PtrVal) {
-  // If this is a call instruction, remove the callsite from the appropriate
-  // AliasSet (if present).
-  if (Instruction *Inst = dyn_cast<Instruction>(PtrVal)) {
-    if (Inst->mayReadOrWriteMemory()) {
-      // Scan all the alias sets to see if this call site is contained.
-      for (iterator I = begin(), E = end(); I != E;) {
-        iterator Cur = I++;
-        if (!Cur->Forward)
-          Cur->removeUnknownInst(*this, Inst);
-      }
-    }
-  }
-
   // First, look up the PointerRec for this pointer.
   PointerMapType::iterator I = PointerMap.find_as(PtrVal);
   if (I == PointerMap.end()) return;  // Noop
@@ -633,7 +624,8 @@ void AliasSet::print(raw_ostream &OS) const {
     OS << "\n    " << UnknownInsts.size() << " Unknown instructions: ";
     for (unsigned i = 0, e = UnknownInsts.size(); i != e; ++i) {
       if (i) OS << ", ";
-      UnknownInsts[i]->printAsOperand(OS);
+      if (auto *I = getUnknownInst(i))
+        I->printAsOperand(OS);
     }
   }
   OS << "\n";
diff --git a/lib/Analysis/Analysis.cpp b/lib/Analysis/Analysis.cpp
index 0e7cf402cdb57580ae1ccaf23dcb0ce445b25561..0e0b5c92a918a49a510166df3583ebf32fffc8d7 100644
--- a/lib/Analysis/Analysis.cpp
+++ b/lib/Analysis/Analysis.cpp
@@ -57,6 +57,7 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
   initializeLazyBranchProbabilityInfoPassPass(Registry);
   initializeLazyBlockFrequencyInfoPassPass(Registry);
   initializeLazyValueInfoWrapperPassPass(Registry);
+  initializeLazyValueInfoPrinterPass(Registry);
   initializeLintPass(Registry);
   initializeLoopInfoWrapperPassPass(Registry);
   initializeMemDepPrinterPass(Registry);
@@ -78,6 +79,8 @@ void llvm::initializeAnalysis(PassRegistry &Registry) {
   initializeTypeBasedAAWrapperPassPass(Registry);
   initializeScopedNoAliasAAWrapperPassPass(Registry);
   initializeLCSSAVerificationPassPass(Registry);
+  initializeMemorySSAWrapperPassPass(Registry);
+  initializeMemorySSAPrinterLegacyPassPass(Registry);
 }
 
 void LLVMInitializeAnalysis(LLVMPassRegistryRef R) {
diff --git a/lib/Analysis/AssumptionCache.cpp b/lib/Analysis/AssumptionCache.cpp
index 4e287e5f6bc11a987d5a72d52b56629ce2703307..1fae947244878daed5da51cf799f2144c2d601df 100644
--- a/lib/Analysis/AssumptionCache.cpp
+++ b/lib/Analysis/AssumptionCache.cpp
@@ -24,6 +24,11 @@
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
+static cl::opt<bool>
+    VerifyAssumptionCache("verify-assumption-cache", cl::Hidden,
+                          cl::desc("Enable verification of assumption cache"),
+                          cl::init(false));
+
 SmallVector<WeakVH, 1> &AssumptionCache::getOrInsertAffectedValues(Value *V) {
   // Try using find_as first to avoid creating extra value handles just for the
   // purpose of doing the lookup.
@@ -231,7 +236,13 @@ AssumptionCache &AssumptionCacheTracker::getAssumptionCache(Function &F) {
 }
 
 void AssumptionCacheTracker::verifyAnalysis() const {
-#ifndef NDEBUG
+  // FIXME: In the long term the verifier should not be controllable with a
+  // flag. We should either fix all passes to correctly update the assumption
+  // cache and enable the verifier unconditionally or somehow arrange for the
+  // assumption list to be updated automatically by passes.
+  if (!VerifyAssumptionCache)
+    return;
+
   SmallPtrSet<const CallInst *, 4> AssumptionSet;
   for (const auto &I : AssumptionCaches) {
     for (auto &VH : I.second->assumptions())
@@ -240,11 +251,10 @@ void AssumptionCacheTracker::verifyAnalysis() const {
 
     for (const BasicBlock &B : cast<Function>(*I.first))
       for (const Instruction &II : B)
-        if (match(&II, m_Intrinsic<Intrinsic::assume>()))
-          assert(AssumptionSet.count(cast<CallInst>(&II)) &&
-                 "Assumption in scanned function not in cache");
+        if (match(&II, m_Intrinsic<Intrinsic::assume>()) &&
+            !AssumptionSet.count(cast<CallInst>(&II)))
+          report_fatal_error("Assumption in scanned function not in cache");
   }
-#endif
 }
 
 AssumptionCacheTracker::AssumptionCacheTracker() : ImmutablePass(ID) {
diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp
index 55f40a348399a96716de2548fafb017ee9d5c520..0fa884ae30ab5b61bb3883449dc6f16e4f069dd7 100644
--- a/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/lib/Analysis/BasicAliasAnalysis.cpp
@@ -127,7 +127,9 @@ static uint64_t getObjectSize(const Value *V, const DataLayout &DL,
                               const TargetLibraryInfo &TLI,
                               bool RoundToAlign = false) {
   uint64_t Size;
-  if (getObjectSize(V, Size, DL, &TLI, RoundToAlign))
+  ObjectSizeOpts Opts;
+  Opts.RoundToAlign = RoundToAlign;
+  if (getObjectSize(V, Size, DL, &TLI, Opts))
     return Size;
   return MemoryLocation::UnknownSize;
 }
@@ -749,7 +751,11 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS,
   // as an argument, and itself doesn't capture it.
   if (!isa<Constant>(Object) && CS.getInstruction() != Object &&
       isNonEscapingLocalObject(Object)) {
-    bool PassedAsArg = false;
+
+    // Optimistically assume that call doesn't touch Object and check this
+    // assumption in the following loop.
+    ModRefInfo Result = MRI_NoModRef;
+
     unsigned OperandNo = 0;
     for (auto CI = CS.data_operands_begin(), CE = CS.data_operands_end();
          CI != CE; ++CI, ++OperandNo) {
@@ -761,20 +767,38 @@ ModRefInfo BasicAAResult::getModRefInfo(ImmutableCallSite CS,
            OperandNo < CS.getNumArgOperands() && !CS.isByValArgument(OperandNo)))
         continue;
 
+      // Call doesn't access memory through this operand, so we don't care
+      // if it aliases with Object.
+      if (CS.doesNotAccessMemory(OperandNo))
+        continue;
+
       // If this is a no-capture pointer argument, see if we can tell that it
-      // is impossible to alias the pointer we're checking.  If not, we have to
-      // assume that the call could touch the pointer, even though it doesn't
-      // escape.
+      // is impossible to alias the pointer we're checking.
       AliasResult AR =
           getBestAAResults().alias(MemoryLocation(*CI), MemoryLocation(Object));
-      if (AR) {
-        PassedAsArg = true;
-        break;
+
+      // Operand doesn't alias 'Object', continue looking for other aliases.
+      if (AR == NoAlias)
+        continue;
+      // Operand aliases 'Object', but call doesn't modify it. Strengthen
+      // initial assumption and keep looking in case there are more aliases.
+      if (CS.onlyReadsMemory(OperandNo)) {
+        Result = static_cast<ModRefInfo>(Result | MRI_Ref);
+        continue;
+      }
+      // Operand aliases 'Object' but call only writes into it.
+      if (CS.doesNotReadMemory(OperandNo)) {
+        Result = static_cast<ModRefInfo>(Result | MRI_Mod);
+        continue;
       }
+      // This operand aliases 'Object' and call reads and writes into it.
+      Result = MRI_ModRef;
+      break;
     }
 
-    if (!PassedAsArg)
-      return MRI_NoModRef;
+    // Early return if we improved mod ref information
+    if (Result != MRI_ModRef)
+      return Result;
   }
 
   // If the CallSite is to malloc or calloc, we can assume that it doesn't
diff --git a/lib/Analysis/BlockFrequencyInfo.cpp b/lib/Analysis/BlockFrequencyInfo.cpp
index 3c57c171e07eb3fa9440efe87bd671cba8275655..07a2a9229fd543bf5928e074fe5f475548cbb737 100644
--- a/lib/Analysis/BlockFrequencyInfo.cpp
+++ b/lib/Analysis/BlockFrequencyInfo.cpp
@@ -26,7 +26,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "block-freq"
 
-#ifndef NDEBUG
 static cl::opt<GVDAGType> ViewBlockFreqPropagationDAG(
     "view-block-freq-propagation-dags", cl::Hidden,
     cl::desc("Pop up a window to show a dag displaying how block "
@@ -56,7 +55,18 @@ cl::opt<unsigned>
                                 "function multiplied by this percent."));
 
 // Command line option to turn on CFG dot dump after profile annotation.
-cl::opt<bool> PGOViewCounts("pgo-view-counts", cl::init(false), cl::Hidden);
+cl::opt<bool>
+    PGOViewCounts("pgo-view-counts", cl::init(false), cl::Hidden,
+                  cl::desc("A boolean option to show CFG dag with "
+                           "block profile counts and branch probabilities "
+                           "right after PGO profile annotation step. The "
+                           "profile counts are computed using branch "
+                           "probabilities from the runtime profile data and "
+                           "block frequency propagation algorithm. To view "
+                           "the raw counts from the profile, use option "
+                           "-pgo-view-raw-counts instead. To limit graph "
+                           "display to only one function, use filtering option "
+                           "-view-bfi-func-name."));
 
 namespace llvm {
 
@@ -116,7 +126,6 @@ struct DOTGraphTraits<BlockFrequencyInfo *> : public BFIDOTGTraitsBase {
 };
 
 } // end namespace llvm
-#endif
 
 BlockFrequencyInfo::BlockFrequencyInfo() {}
 
@@ -156,13 +165,11 @@ void BlockFrequencyInfo::calculate(const Function &F,
   if (!BFI)
     BFI.reset(new ImplType);
   BFI->calculate(F, BPI, LI);
-#ifndef NDEBUG
   if (ViewBlockFreqPropagationDAG != GVDT_None &&
       (ViewBlockFreqFuncName.empty() ||
        F.getName().equals(ViewBlockFreqFuncName))) {
     view();
   }
-#endif
 }
 
 BlockFrequency BlockFrequencyInfo::getBlockFreq(const BasicBlock *BB) const {
@@ -214,13 +221,7 @@ void BlockFrequencyInfo::setBlockFreqAndScale(
 /// Pop up a ghostview window with the current block frequency propagation
 /// rendered using dot.
 void BlockFrequencyInfo::view() const {
-// This code is only for debugging.
-#ifndef NDEBUG
   ViewGraph(const_cast<BlockFrequencyInfo *>(this), "BlockFrequencyDAGs");
-#else
-  errs() << "BlockFrequencyInfo::view is only available in debug builds on "
-            "systems with Graphviz or gv!\n";
-#endif // NDEBUG
 }
 
 const Function *BlockFrequencyInfo::getFunction() const {
diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp
index 3eabb780398c0e3d95d6dc9eae5e913357b32cf5..5935dec15c7019aa07fb3aeca1610bc99c111236 100644
--- a/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/lib/Analysis/BranchProbabilityInfo.cpp
@@ -108,11 +108,9 @@ static const uint32_t IH_TAKEN_WEIGHT = 1024 * 1024 - 1;
 /// instruction. This is essentially never taken.
 static const uint32_t IH_NONTAKEN_WEIGHT = 1;
 
-/// \brief Calculate edge weights for successors lead to unreachable.
-///
-/// Predict that a successor which leads necessarily to an
-/// unreachable-terminated block as extremely unlikely.
-bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
+/// \brief Add \p BB to PostDominatedByUnreachable set if applicable.
+void
+BranchProbabilityInfo::updatePostDominatedByUnreachable(const BasicBlock *BB) {
   const TerminatorInst *TI = BB->getTerminator();
   if (TI->getNumSuccessors() == 0) {
     if (isa<UnreachableInst>(TI) ||
@@ -122,38 +120,86 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
         // never execute.
         BB->getTerminatingDeoptimizeCall())
       PostDominatedByUnreachable.insert(BB);
-    return false;
+    return;
+  }
+
+  // If the terminator is an InvokeInst, check only the normal destination block
+  // as the unwind edge of InvokeInst is also very unlikely taken.
+  if (auto *II = dyn_cast<InvokeInst>(TI)) {
+    if (PostDominatedByUnreachable.count(II->getNormalDest()))
+      PostDominatedByUnreachable.insert(BB);
+    return;
   }
 
+  for (auto *I : successors(BB))
+    // If any successor is not post dominated then BB is not either.
+    if (!PostDominatedByUnreachable.count(I))
+      return;
+
+  PostDominatedByUnreachable.insert(BB);
+}
+
+/// \brief Add \p BB to PostDominatedByColdCall set if applicable.
+void
+BranchProbabilityInfo::updatePostDominatedByColdCall(const BasicBlock *BB) {
+  assert(!PostDominatedByColdCall.count(BB));
+  const TerminatorInst *TI = BB->getTerminator();
+  if (TI->getNumSuccessors() == 0)
+    return;
+
+  // If all successors are already in the set then BB is added to it as well.
+  if (llvm::all_of(successors(BB), [&](const BasicBlock *SuccBB) {
+        return PostDominatedByColdCall.count(SuccBB);
+      })) {
+    PostDominatedByColdCall.insert(BB);
+    return;
+  }
+
+  // If the terminator is an InvokeInst, check only the normal destination
+  // block, as the unwind edge of an InvokeInst is very unlikely to be taken.
+  if (auto *II = dyn_cast<InvokeInst>(TI))
+    if (PostDominatedByColdCall.count(II->getNormalDest())) {
+      PostDominatedByColdCall.insert(BB);
+      return;
+    }
+
+  // Otherwise, if the block itself contains a call carrying the cold
+  // attribute, add it to the set of blocks post-dominated by a cold call.
+  for (auto &I : *BB)
+    if (const CallInst *CI = dyn_cast<CallInst>(&I))
+      if (CI->hasFnAttr(Attribute::Cold)) {
+        PostDominatedByColdCall.insert(BB);
+        return;
+      }
+}
+
+/// \brief Calculate edge weights for successors that lead to unreachable.
+///
+/// Predict that a successor which leads necessarily to an
+/// unreachable-terminated block as extremely unlikely.
+bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
+  const TerminatorInst *TI = BB->getTerminator();
+  if (TI->getNumSuccessors() == 0)
+    return false;
+
   SmallVector<unsigned, 4> UnreachableEdges;
   SmallVector<unsigned, 4> ReachableEdges;
 
-  for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+  for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I)
     if (PostDominatedByUnreachable.count(*I))
       UnreachableEdges.push_back(I.getSuccessorIndex());
     else
       ReachableEdges.push_back(I.getSuccessorIndex());
-  }
-
-  // If all successors are in the set of blocks post-dominated by unreachable,
-  // this block is too.
-  if (UnreachableEdges.size() == TI->getNumSuccessors())
-    PostDominatedByUnreachable.insert(BB);
 
   // Skip probabilities if this block has a single successor or if all were
   // reachable.
   if (TI->getNumSuccessors() == 1 || UnreachableEdges.empty())
     return false;
 
-  // If the terminator is an InvokeInst, check only the normal destination block
-  // as the unwind edge of InvokeInst is also very unlikely taken.
-  if (auto *II = dyn_cast<InvokeInst>(TI))
-    if (PostDominatedByUnreachable.count(II->getNormalDest())) {
-      PostDominatedByUnreachable.insert(BB);
-      // Return false here so that edge weights for InvokeInst could be decided
-      // in calcInvokeHeuristics().
-      return false;
-    }
+  // Return false here so that edge weights for InvokeInst could be decided
+  // in calcInvokeHeuristics().
+  if (isa<InvokeInst>(TI))
+    return false;
 
   if (ReachableEdges.empty()) {
     BranchProbability Prob(1, UnreachableEdges.size());
@@ -263,31 +309,10 @@ bool BranchProbabilityInfo::calcColdCallHeuristics(const BasicBlock *BB) {
     else
       NormalEdges.push_back(I.getSuccessorIndex());
 
-  // If all successors are in the set of blocks post-dominated by cold calls,
-  // this block is in the set post-dominated by cold calls.
-  if (ColdEdges.size() == TI->getNumSuccessors())
-    PostDominatedByColdCall.insert(BB);
-  else {
-    // Otherwise, if the block itself contains a cold function, add it to the
-    // set of blocks postdominated by a cold call.
-    assert(!PostDominatedByColdCall.count(BB));
-    for (BasicBlock::const_iterator I = BB->begin(), E = BB->end(); I != E; ++I)
-      if (const CallInst *CI = dyn_cast<CallInst>(I))
-        if (CI->hasFnAttr(Attribute::Cold)) {
-          PostDominatedByColdCall.insert(BB);
-          break;
-        }
-  }
-
-  if (auto *II = dyn_cast<InvokeInst>(TI)) {
-    // If the terminator is an InvokeInst, consider only the normal destination
-    // block.
-    if (PostDominatedByColdCall.count(II->getNormalDest()))
-      PostDominatedByColdCall.insert(BB);
-    // Return false here so that edge weights for InvokeInst could be decided
-    // in calcInvokeHeuristics().
+  // Return false here so that edge weights for InvokeInst could be decided
+  // in calcInvokeHeuristics().
+  if (isa<InvokeInst>(TI))
     return false;
-  }
 
   // Skip probabilities if this block has a single successor.
   if (TI->getNumSuccessors() == 1 || ColdEdges.empty())
@@ -671,6 +696,8 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI) {
   // the successors of a block iteratively.
   for (auto BB : post_order(&F.getEntryBlock())) {
     DEBUG(dbgs() << "Computing probabilities for " << BB->getName() << "\n");
+    updatePostDominatedByUnreachable(BB);
+    updatePostDominatedByColdCall(BB);
     if (calcUnreachableHeuristics(BB))
       continue;
     if (calcMetadataWeights(BB))
diff --git a/lib/Analysis/CGSCCPassManager.cpp b/lib/Analysis/CGSCCPassManager.cpp
index 55ef7b99f63ad7227612a943c2a825d0c353f4ba..9d4521221f477491444ed3f781e63c2b409ac7de 100644
--- a/lib/Analysis/CGSCCPassManager.cpp
+++ b/lib/Analysis/CGSCCPassManager.cpp
@@ -24,7 +24,7 @@ template class PassManager<LazyCallGraph::SCC, CGSCCAnalysisManager,
                            LazyCallGraph &, CGSCCUpdateResult &>;
 template class InnerAnalysisManagerProxy<CGSCCAnalysisManager, Module>;
 template class OuterAnalysisManagerProxy<ModuleAnalysisManager,
-                                         LazyCallGraph::SCC>;
+                                         LazyCallGraph::SCC, LazyCallGraph &>;
 template class OuterAnalysisManagerProxy<CGSCCAnalysisManager, Function>;
 
 /// Explicitly specialize the pass manager run method to handle call graph
@@ -117,6 +117,7 @@ bool CGSCCAnalysisManagerModuleProxy::Result::invalidate(
       PA.allAnalysesInSetPreserved<AllAnalysesOn<LazyCallGraph::SCC>>();
 
   // Ok, we have a graph, so we can propagate the invalidation down into it.
+  G->buildRefSCCs();
   for (auto &RC : G->postorder_ref_sccs())
     for (auto &C : RC) {
       Optional<PreservedAnalyses> InnerPA;
@@ -273,9 +274,9 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
   // demoted edges.
   SmallVector<Constant *, 16> Worklist;
   SmallPtrSet<Constant *, 16> Visited;
-  SmallPtrSet<Function *, 16> RetainedEdges;
-  SmallSetVector<Function *, 4> PromotedRefTargets;
-  SmallSetVector<Function *, 4> DemotedCallTargets;
+  SmallPtrSet<Node *, 16> RetainedEdges;
+  SmallSetVector<Node *, 4> PromotedRefTargets;
+  SmallSetVector<Node *, 4> DemotedCallTargets;
 
   // First walk the function and handle all called functions. We do this first
   // because if there is a single call edge, whether there are ref edges is
@@ -284,7 +285,8 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
     if (auto CS = CallSite(&I))
       if (Function *Callee = CS.getCalledFunction())
         if (Visited.insert(Callee).second && !Callee->isDeclaration()) {
-          const Edge *E = N.lookup(*Callee);
+          Node &CalleeN = *G.lookup(*Callee);
+          Edge *E = N->lookup(CalleeN);
           // FIXME: We should really handle adding new calls. While it will
           // make downstream usage more complex, there is no fundamental
           // limitation and it will allow passes within the CGSCC to be a bit
@@ -293,9 +295,9 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
           assert(E && "No function transformations should introduce *new* "
                       "call edges! Any new calls should be modeled as "
                       "promoted existing ref edges!");
-          RetainedEdges.insert(Callee);
+          RetainedEdges.insert(&CalleeN);
           if (!E->isCall())
-            PromotedRefTargets.insert(Callee);
+            PromotedRefTargets.insert(&CalleeN);
         }
 
   // Now walk all references.
@@ -306,24 +308,25 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
           Worklist.push_back(C);
 
   LazyCallGraph::visitReferences(Worklist, Visited, [&](Function &Referee) {
-    const Edge *E = N.lookup(Referee);
+    Node &RefereeN = *G.lookup(Referee);
+    Edge *E = N->lookup(RefereeN);
     // FIXME: Similarly to new calls, we also currently preclude
     // introducing new references. See above for details.
     assert(E && "No function transformations should introduce *new* ref "
                 "edges! Any new ref edges would require IPO which "
                 "function passes aren't allowed to do!");
-    RetainedEdges.insert(&Referee);
+    RetainedEdges.insert(&RefereeN);
     if (E->isCall())
-      DemotedCallTargets.insert(&Referee);
+      DemotedCallTargets.insert(&RefereeN);
   });
 
   // First remove all of the edges that are no longer present in this function.
   // We have to build a list of dead targets first and then remove them as the
   // data structures will all be invalidated by removing them.
   SmallVector<PointerIntPair<Node *, 1, Edge::Kind>, 4> DeadTargets;
-  for (Edge &E : N)
-    if (!RetainedEdges.count(&E.getFunction()))
-      DeadTargets.push_back({E.getNode(), E.getKind()});
+  for (Edge &E : *N)
+    if (!RetainedEdges.count(&E.getNode()))
+      DeadTargets.push_back({&E.getNode(), E.getKind()});
   for (auto DeadTarget : DeadTargets) {
     Node &TargetN = *DeadTarget.getPointer();
     bool IsCall = DeadTarget.getInt() == Edge::Call;
@@ -397,9 +400,8 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
   // Next demote all the call edges that are now ref edges. This helps make
   // the SCCs small which should minimize the work below as we don't want to
   // form cycles that this would break.
-  for (Function *RefTarget : DemotedCallTargets) {
-    Node &TargetN = *G.lookup(*RefTarget);
-    SCC &TargetC = *G.lookupSCC(TargetN);
+  for (Node *RefTarget : DemotedCallTargets) {
+    SCC &TargetC = *G.lookupSCC(*RefTarget);
     RefSCC &TargetRC = TargetC.getOuterRefSCC();
 
     // The easy case is when the target RefSCC is not this RefSCC. This is
@@ -407,10 +409,10 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
     if (&TargetRC != RC) {
       assert(RC->isAncestorOf(TargetRC) &&
              "Cannot potentially form RefSCC cycles here!");
-      RC->switchOutgoingEdgeToRef(N, TargetN);
+      RC->switchOutgoingEdgeToRef(N, *RefTarget);
       if (DebugLogging)
         dbgs() << "Switch outgoing call edge to a ref edge from '" << N
-               << "' to '" << TargetN << "'\n";
+               << "' to '" << *RefTarget << "'\n";
       continue;
     }
 
@@ -418,7 +420,7 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
     // some SCCs.
     if (C != &TargetC) {
       // For separate SCCs this is trivial.
-      RC->switchTrivialInternalEdgeToRef(N, TargetN);
+      RC->switchTrivialInternalEdgeToRef(N, *RefTarget);
       continue;
     }
 
@@ -430,14 +432,13 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
     // structure is changed.
     AM.invalidate(*C, PreservedAnalyses::none());
     // Now update the call graph.
-    C = incorporateNewSCCRange(RC->switchInternalEdgeToRef(N, TargetN), G,
-                               N, C, AM, UR, DebugLogging);
+    C = incorporateNewSCCRange(RC->switchInternalEdgeToRef(N, *RefTarget), G, N,
+                               C, AM, UR, DebugLogging);
   }
 
   // Now promote ref edges into call edges.
-  for (Function *CallTarget : PromotedRefTargets) {
-    Node &TargetN = *G.lookup(*CallTarget);
-    SCC &TargetC = *G.lookupSCC(TargetN);
+  for (Node *CallTarget : PromotedRefTargets) {
+    SCC &TargetC = *G.lookupSCC(*CallTarget);
     RefSCC &TargetRC = TargetC.getOuterRefSCC();
 
     // The easy case is when the target RefSCC is not this RefSCC. This is
@@ -445,22 +446,22 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
     if (&TargetRC != RC) {
       assert(RC->isAncestorOf(TargetRC) &&
              "Cannot potentially form RefSCC cycles here!");
-      RC->switchOutgoingEdgeToCall(N, TargetN);
+      RC->switchOutgoingEdgeToCall(N, *CallTarget);
       if (DebugLogging)
         dbgs() << "Switch outgoing ref edge to a call edge from '" << N
-               << "' to '" << TargetN << "'\n";
+               << "' to '" << *CallTarget << "'\n";
       continue;
     }
     if (DebugLogging)
       dbgs() << "Switch an internal ref edge to a call edge from '" << N
-             << "' to '" << TargetN << "'\n";
+             << "' to '" << *CallTarget << "'\n";
 
     // Otherwise we are switching an internal ref edge to a call edge. This
     // may merge away some SCCs, and we add those to the UpdateResult. We also
     // need to make sure to update the worklist in the event SCCs have moved
     // before the current one in the post-order sequence.
     auto InitialSCCIndex = RC->find(*C) - RC->begin();
-    auto InvalidatedSCCs = RC->switchInternalEdgeToCall(N, TargetN);
+    auto InvalidatedSCCs = RC->switchInternalEdgeToCall(N, *CallTarget);
     if (!InvalidatedSCCs.empty()) {
       C = &TargetC;
       assert(G.lookupSCC(N) == C && "Failed to update current SCC!");
diff --git a/lib/Analysis/CMakeLists.txt b/lib/Analysis/CMakeLists.txt
index d53364373d7bccdc46e397528f9925702f19c734..161709a48466f8868ca34bc2e945343d4b58310e 100644
--- a/lib/Analysis/CMakeLists.txt
+++ b/lib/Analysis/CMakeLists.txt
@@ -53,6 +53,8 @@ add_llvm_library(LLVMAnalysis
   MemoryBuiltins.cpp
   MemoryDependenceAnalysis.cpp
   MemoryLocation.cpp
+  MemorySSA.cpp
+  MemorySSAUpdater.cpp
   ModuleDebugInfoPrinter.cpp
   ModuleSummaryAnalysis.cpp
   ObjCARCAliasAnalysis.cpp
diff --git a/lib/Analysis/CallGraphSCCPass.cpp b/lib/Analysis/CallGraphSCCPass.cpp
index 1c84cf5c6b076e446afcea9dd7b547396dd66ea6..ea70f5752c613f3fc959296365e06d028020cabf 100644
--- a/lib/Analysis/CallGraphSCCPass.cpp
+++ b/lib/Analysis/CallGraphSCCPass.cpp
@@ -629,6 +629,8 @@ namespace {
       }
       return false;
     }
+    
+    StringRef getPassName() const override { return "Print CallGraph IR"; }
   };
   
 } // end anonymous namespace.
diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp
index 6d8d4a1c3e90ce3617eed9c7490202305b0c72c2..14176dac2104c5a1d242ea1d2997b432d6a527d2 100644
--- a/lib/Analysis/ConstantFolding.cpp
+++ b/lib/Analysis/ConstantFolding.cpp
@@ -1058,8 +1058,8 @@ ConstantFoldConstantImpl(const Constant *C, const DataLayout &DL,
       if (It == FoldedOps.end()) {
         if (auto *FoldedC =
                 ConstantFoldConstantImpl(NewC, DL, TLI, FoldedOps)) {
-          NewC = FoldedC;
           FoldedOps.insert({NewC, FoldedC});
+          NewC = FoldedC;
         } else {
           FoldedOps.insert({NewC, NewC});
         }
@@ -1401,7 +1401,7 @@ bool llvm::canConstantFoldCallTo(const Function *F) {
     return true;
   default:
     return false;
-  case 0: break;
+  case Intrinsic::not_intrinsic: break;
   }
 
   if (!F->hasName())
@@ -1518,9 +1518,9 @@ Constant *ConstantFoldSSEConvertToInt(const APFloat &Val, bool roundTowardZero,
   bool isExact = false;
   APFloat::roundingMode mode = roundTowardZero? APFloat::rmTowardZero
                                               : APFloat::rmNearestTiesToEven;
-  APFloat::opStatus status = Val.convertToInteger(&UIntVal, ResultWidth,
-                                                  /*isSigned=*/true, mode,
-                                                  &isExact);
+  APFloat::opStatus status =
+      Val.convertToInteger(makeMutableArrayRef(UIntVal), ResultWidth,
+                           /*isSigned=*/true, mode, &isExact);
   if (status != APFloat::opOK &&
       (!roundTowardZero || status != APFloat::opInexact))
     return nullptr;
@@ -1768,7 +1768,8 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty,
     }
 
     if (isa<UndefValue>(Operands[0])) {
-      if (IntrinsicID == Intrinsic::bswap)
+      if (IntrinsicID == Intrinsic::bswap ||
+          IntrinsicID == Intrinsic::bitreverse)
         return Operands[0];
       return nullptr;
     }
diff --git a/lib/Analysis/CostModel.cpp b/lib/Analysis/CostModel.cpp
index 6b77397956cdac3281bec8f609bbd9e54d35889b..32bfea58bf9d420eff90575b9352f7822bafa870 100644
--- a/lib/Analysis/CostModel.cpp
+++ b/lib/Analysis/CostModel.cpp
@@ -447,25 +447,25 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
   case Instruction::Select: {
     const SelectInst *SI = cast<SelectInst>(I);
     Type *CondTy = SI->getCondition()->getType();
-    return TTI->getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy);
+    return TTI->getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy, I);
   }
   case Instruction::ICmp:
   case Instruction::FCmp: {
     Type *ValTy = I->getOperand(0)->getType();
-    return TTI->getCmpSelInstrCost(I->getOpcode(), ValTy);
+    return TTI->getCmpSelInstrCost(I->getOpcode(), ValTy, I->getType(), I);
   }
   case Instruction::Store: {
     const StoreInst *SI = cast<StoreInst>(I);
     Type *ValTy = SI->getValueOperand()->getType();
     return TTI->getMemoryOpCost(I->getOpcode(), ValTy,
-                                 SI->getAlignment(),
-                                 SI->getPointerAddressSpace());
+                                SI->getAlignment(),
+                                SI->getPointerAddressSpace(), I);
   }
   case Instruction::Load: {
     const LoadInst *LI = cast<LoadInst>(I);
     return TTI->getMemoryOpCost(I->getOpcode(), I->getType(),
-                                 LI->getAlignment(),
-                                 LI->getPointerAddressSpace());
+                                LI->getAlignment(),
+                                LI->getPointerAddressSpace(), I);
   }
   case Instruction::ZExt:
   case Instruction::SExt:
@@ -481,7 +481,7 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
   case Instruction::BitCast:
   case Instruction::AddrSpaceCast: {
     Type *SrcTy = I->getOperand(0)->getType();
-    return TTI->getCastInstrCost(I->getOpcode(), I->getType(), SrcTy);
+    return TTI->getCastInstrCost(I->getOpcode(), I->getType(), SrcTy, I);
   }
   case Instruction::ExtractElement: {
     const ExtractElementInst * EEI = cast<ExtractElementInst>(I);
@@ -542,9 +542,7 @@ unsigned CostModelAnalysis::getInstructionCost(const Instruction *I) const {
   }
   case Instruction::Call:
     if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
-      SmallVector<Value *, 4> Args;
-      for (unsigned J = 0, JE = II->getNumArgOperands(); J != JE; ++J)
-        Args.push_back(II->getArgOperand(J));
+      SmallVector<Value *, 4> Args(II->arg_operands());
 
       FastMathFlags FMF;
       if (auto *FPMO = dyn_cast<FPMathOperator>(II))
diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp
index 120071045cb7dedfbbd54c5b2c79c83804936446..cd85ef3703f53bbd3ed1123e37b0e1767a70614d 100644
--- a/lib/Analysis/InlineCost.cpp
+++ b/lib/Analysis/InlineCost.cpp
@@ -142,10 +142,11 @@ class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
   void disableSROA(Value *V);
   void accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
                           int InstructionCost);
-  bool isGEPOffsetConstant(GetElementPtrInst &GEP);
   bool isGEPFree(GetElementPtrInst &GEP);
   bool accumulateGEPOffset(GEPOperator &GEP, APInt &Offset);
   bool simplifyCallSite(Function *F, CallSite CS);
+  template <typename Callable>
+  bool simplifyInstruction(Instruction &I, Callable Evaluate);
   ConstantInt *stripAndComputeInBoundsConstantOffsets(Value *&V);
 
   /// Return true if the given argument to the function being considered for
@@ -298,17 +299,6 @@ void CallAnalyzer::accumulateSROACost(DenseMap<Value *, int>::iterator CostIt,
   SROACostSavings += InstructionCost;
 }
 
-/// \brief Check whether a GEP's indices are all constant.
-///
-/// Respects any simplified values known during the analysis of this callsite.
-bool CallAnalyzer::isGEPOffsetConstant(GetElementPtrInst &GEP) {
-  for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
-    if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
-      return false;
-
-  return true;
-}
-
 /// \brief Accumulate a constant GEP offset into an APInt if possible.
 ///
 /// Returns false if unable to compute the offset for any reason. Respects any
@@ -438,7 +428,15 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
     }
   }
 
-  if (isGEPOffsetConstant(I)) {
+  // Lambda to check whether a GEP's indices are all constant.
+  auto IsGEPOffsetConstant = [&](GetElementPtrInst &GEP) {
+    for (User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end(); I != E; ++I)
+      if (!isa<Constant>(*I) && !SimplifiedValues.lookup(*I))
+        return false;
+    return true;
+  };
+
+  if (IsGEPOffsetConstant(I)) {
     if (SROACandidate)
       SROAArgValues[&I] = SROAArg;
 
@@ -452,16 +450,33 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
   return isGEPFree(I);
 }
 
+/// Simplify \p I if its operands are constants and update SimplifiedValues.
+/// \p Evaluate is a callable specific to instruction type that evaluates the
+/// instruction when all the operands are constants.
+template <typename Callable>
+bool CallAnalyzer::simplifyInstruction(Instruction &I, Callable Evaluate) {
+  SmallVector<Constant *, 2> COps;
+  for (Value *Op : I.operands()) {
+    Constant *COp = dyn_cast<Constant>(Op);
+    if (!COp)
+      COp = SimplifiedValues.lookup(Op);
+    if (!COp)
+      return false;
+    COps.push_back(COp);
+  }
+  auto *C = Evaluate(COps);
+  if (!C)
+    return false;
+  SimplifiedValues[&I] = C;
+  return true;
+}
+
 bool CallAnalyzer::visitBitCast(BitCastInst &I) {
   // Propagate constants through bitcasts.
-  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
-  if (!COp)
-    COp = SimplifiedValues.lookup(I.getOperand(0));
-  if (COp)
-    if (Constant *C = ConstantExpr::getBitCast(COp, I.getType())) {
-      SimplifiedValues[&I] = C;
-      return true;
-    }
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getBitCast(COps[0], I.getType());
+      }))
+    return true;
 
   // Track base/offsets through casts
   std::pair<Value *, APInt> BaseAndOffset =
@@ -482,14 +497,10 @@ bool CallAnalyzer::visitBitCast(BitCastInst &I) {
 
 bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
   // Propagate constants through ptrtoint.
-  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
-  if (!COp)
-    COp = SimplifiedValues.lookup(I.getOperand(0));
-  if (COp)
-    if (Constant *C = ConstantExpr::getPtrToInt(COp, I.getType())) {
-      SimplifiedValues[&I] = C;
-      return true;
-    }
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getPtrToInt(COps[0], I.getType());
+      }))
+    return true;
 
   // Track base/offset pairs when converted to a plain integer provided the
   // integer is large enough to represent the pointer.
@@ -519,14 +530,10 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
 
 bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
   // Propagate constants through ptrtoint.
-  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
-  if (!COp)
-    COp = SimplifiedValues.lookup(I.getOperand(0));
-  if (COp)
-    if (Constant *C = ConstantExpr::getIntToPtr(COp, I.getType())) {
-      SimplifiedValues[&I] = C;
-      return true;
-    }
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getIntToPtr(COps[0], I.getType());
+      }))
+    return true;
 
   // Track base/offset pairs when round-tripped through a pointer without
   // modifications provided the integer is not too large.
@@ -550,14 +557,10 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
 
 bool CallAnalyzer::visitCastInst(CastInst &I) {
   // Propagate constants through ptrtoint.
-  Constant *COp = dyn_cast<Constant>(I.getOperand(0));
-  if (!COp)
-    COp = SimplifiedValues.lookup(I.getOperand(0));
-  if (COp)
-    if (Constant *C = ConstantExpr::getCast(I.getOpcode(), COp, I.getType())) {
-      SimplifiedValues[&I] = C;
-      return true;
-    }
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getCast(I.getOpcode(), COps[0], I.getType());
+      }))
+    return true;
 
   // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
   disableSROA(I.getOperand(0));
@@ -567,16 +570,11 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
 
 bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
   Value *Operand = I.getOperand(0);
-  Constant *COp = dyn_cast<Constant>(Operand);
-  if (!COp)
-    COp = SimplifiedValues.lookup(Operand);
-  if (COp) {
-    const DataLayout &DL = F.getParent()->getDataLayout();
-    if (Constant *C = ConstantFoldInstOperands(&I, COp, DL)) {
-      SimplifiedValues[&I] = C;
-      return true;
-    }
-  }
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        const DataLayout &DL = F.getParent()->getDataLayout();
+        return ConstantFoldInstOperands(&I, COps[0], DL);
+      }))
+    return true;
 
   // Disable any SROA on the argument to arbitrary unary operators.
   disableSROA(Operand);
@@ -672,7 +670,7 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
       BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
       if (PSI->isHotCallSite(CS, CallerBFI)) {
         DEBUG(dbgs() << "Hot callsite.\n");
-        Threshold = MaxIfValid(Threshold, Params.HotCallSiteThreshold);
+        Threshold = Params.HotCallSiteThreshold.getValue();
       } else if (PSI->isFunctionEntryHot(&Callee)) {
         DEBUG(dbgs() << "Hot callee.\n");
         // If callsite hotness can not be determined, we may still know
@@ -697,20 +695,10 @@ void CallAnalyzer::updateThreshold(CallSite CS, Function &Callee) {
 bool CallAnalyzer::visitCmpInst(CmpInst &I) {
   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
   // First try to handle simplified comparisons.
-  if (!isa<Constant>(LHS))
-    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
-      LHS = SimpleLHS;
-  if (!isa<Constant>(RHS))
-    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
-      RHS = SimpleRHS;
-  if (Constant *CLHS = dyn_cast<Constant>(LHS)) {
-    if (Constant *CRHS = dyn_cast<Constant>(RHS))
-      if (Constant *C =
-              ConstantExpr::getCompare(I.getPredicate(), CLHS, CRHS)) {
-        SimplifiedValues[&I] = C;
-        return true;
-      }
-  }
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getCompare(I.getPredicate(), COps[0], COps[1]);
+      }))
+    return true;
 
   if (I.getOpcode() == Instruction::FCmp)
     return false;
@@ -788,24 +776,19 @@ bool CallAnalyzer::visitSub(BinaryOperator &I) {
 
 bool CallAnalyzer::visitBinaryOperator(BinaryOperator &I) {
   Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
-  const DataLayout &DL = F.getParent()->getDataLayout();
-  if (!isa<Constant>(LHS))
-    if (Constant *SimpleLHS = SimplifiedValues.lookup(LHS))
-      LHS = SimpleLHS;
-  if (!isa<Constant>(RHS))
-    if (Constant *SimpleRHS = SimplifiedValues.lookup(RHS))
-      RHS = SimpleRHS;
-  Value *SimpleV = nullptr;
-  if (auto FI = dyn_cast<FPMathOperator>(&I))
-    SimpleV =
-        SimplifyFPBinOp(I.getOpcode(), LHS, RHS, FI->getFastMathFlags(), DL);
-  else
-    SimpleV = SimplifyBinOp(I.getOpcode(), LHS, RHS, DL);
+  auto Evaluate = [&](SmallVectorImpl<Constant *> &COps) {
+    Value *SimpleV = nullptr;
+    const DataLayout &DL = F.getParent()->getDataLayout();
+    if (auto FI = dyn_cast<FPMathOperator>(&I))
+      SimpleV = SimplifyFPBinOp(I.getOpcode(), COps[0], COps[1],
+                                FI->getFastMathFlags(), DL);
+    else
+      SimpleV = SimplifyBinOp(I.getOpcode(), COps[0], COps[1], DL);
+    return dyn_cast_or_null<Constant>(SimpleV);
+  };
 
-  if (Constant *C = dyn_cast_or_null<Constant>(SimpleV)) {
-    SimplifiedValues[&I] = C;
+  if (simplifyInstruction(I, Evaluate))
     return true;
-  }
 
   // Disable any SROA on arguments to arbitrary, unsimplified binary operators.
   disableSROA(LHS);
@@ -846,13 +829,10 @@ bool CallAnalyzer::visitStore(StoreInst &I) {
 
 bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
   // Constant folding for extract value is trivial.
-  Constant *C = dyn_cast<Constant>(I.getAggregateOperand());
-  if (!C)
-    C = SimplifiedValues.lookup(I.getAggregateOperand());
-  if (C) {
-    SimplifiedValues[&I] = ConstantExpr::getExtractValue(C, I.getIndices());
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getExtractValue(COps[0], I.getIndices());
+      }))
     return true;
-  }
 
   // SROA can look through these but give them a cost.
   return false;
@@ -860,17 +840,12 @@ bool CallAnalyzer::visitExtractValue(ExtractValueInst &I) {
 
 bool CallAnalyzer::visitInsertValue(InsertValueInst &I) {
   // Constant folding for insert value is trivial.
-  Constant *AggC = dyn_cast<Constant>(I.getAggregateOperand());
-  if (!AggC)
-    AggC = SimplifiedValues.lookup(I.getAggregateOperand());
-  Constant *InsertedC = dyn_cast<Constant>(I.getInsertedValueOperand());
-  if (!InsertedC)
-    InsertedC = SimplifiedValues.lookup(I.getInsertedValueOperand());
-  if (AggC && InsertedC) {
-    SimplifiedValues[&I] =
-        ConstantExpr::getInsertValue(AggC, InsertedC, I.getIndices());
+  if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+        return ConstantExpr::getInsertValue(/*AggregateOperand*/ COps[0],
+                                            /*InsertedValueOperand*/ COps[1],
+                                            I.getIndices());
+      }))
     return true;
-  }
 
   // SROA can look through these but give them a cost.
   return false;
@@ -1039,8 +1014,8 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
   // does not (yet) fire.
   SmallPtrSet<BasicBlock *, 8> SuccessorBlocks;
   SuccessorBlocks.insert(SI.getDefaultDest());
-  for (auto I = SI.case_begin(), E = SI.case_end(); I != E; ++I)
-    SuccessorBlocks.insert(I.getCaseSuccessor());
+  for (auto Case : SI.cases())
+    SuccessorBlocks.insert(Case.getCaseSuccessor());
   // Add cost corresponding to the number of distinct destinations. The first
   // we model as free because of fallthrough.
   Cost += (SuccessorBlocks.size() - 1) * InlineConstants::InstrCost;
@@ -1404,7 +1379,7 @@ bool CallAnalyzer::analyzeCall(CallSite CS) {
       Value *Cond = SI->getCondition();
       if (ConstantInt *SimpleCond =
               dyn_cast_or_null<ConstantInt>(SimplifiedValues.lookup(Cond))) {
-        BBWorklist.insert(SI->findCaseValue(SimpleCond).getCaseSuccessor());
+        BBWorklist.insert(SI->findCaseValue(SimpleCond)->getCaseSuccessor());
         continue;
       }
     }
@@ -1463,13 +1438,6 @@ LLVM_DUMP_METHOD void CallAnalyzer::dump() {
 }
 #endif
 
-/// \brief Test that two functions either have or have not the given attribute
-///        at the same time.
-template <typename AttrKind>
-static bool attributeMatches(Function *F1, Function *F2, AttrKind Attr) {
-  return F1->getFnAttribute(Attr) == F2->getFnAttribute(Attr);
-}
-
 /// \brief Test that there are no attribute conflicts between Caller and Callee
 ///        that prevent inlining.
 static bool functionsHaveCompatibleAttributes(Function *Caller,
diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp
index fc0b86aaa395973a55ebc80deea7329b76a24f29..0e522cb4e495c6e3f1ed2cb3d0b29f61d4b2bbea 100644
--- a/lib/Analysis/InstructionSimplify.cpp
+++ b/lib/Analysis/InstructionSimplify.cpp
@@ -24,6 +24,7 @@
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/ConstantRange.h"
@@ -140,10 +141,9 @@ static bool ValueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) {
 /// given by OpcodeToExpand, while "A" corresponds to LHS and "B op' C" to RHS.
 /// Also performs the transform "(A op' B) op C" -> "(A op C) op' (B op C)".
 /// Returns the simplified value, or null if no simplification was performed.
-static Value *ExpandBinOp(unsigned Opcode, Value *LHS, Value *RHS,
-                          unsigned OpcToExpand, const Query &Q,
+static Value *ExpandBinOp(Instruction::BinaryOps Opcode, Value *LHS, Value *RHS,
+                          Instruction::BinaryOps OpcodeToExpand, const Query &Q,
                           unsigned MaxRecurse) {
-  Instruction::BinaryOps OpcodeToExpand = (Instruction::BinaryOps)OpcToExpand;
   // Recursion is always used, so bail out at once if we already hit the limit.
   if (!MaxRecurse--)
     return nullptr;
@@ -199,9 +199,9 @@ static Value *ExpandBinOp(unsigned Opcode, Value *LHS, Value *RHS,
 
 /// Generic simplifications for associative binary operations.
 /// Returns the simpler value, or null if none was found.
-static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS,
-                                       const Query &Q, unsigned MaxRecurse) {
-  Instruction::BinaryOps Opcode = (Instruction::BinaryOps)Opc;
+static Value *SimplifyAssociativeBinOp(Instruction::BinaryOps Opcode,
+                                       Value *LHS, Value *RHS, const Query &Q,
+                                       unsigned MaxRecurse) {
   assert(Instruction::isAssociative(Opcode) && "Not an associative operation!");
 
   // Recursion is always used, so bail out at once if we already hit the limit.
@@ -298,8 +298,9 @@ static Value *SimplifyAssociativeBinOp(unsigned Opc, Value *LHS, Value *RHS,
 /// try to simplify the binop by seeing whether evaluating it on both branches
 /// of the select results in the same value. Returns the common value if so,
 /// otherwise returns null.
-static Value *ThreadBinOpOverSelect(unsigned Opcode, Value *LHS, Value *RHS,
-                                    const Query &Q, unsigned MaxRecurse) {
+static Value *ThreadBinOpOverSelect(Instruction::BinaryOps Opcode, Value *LHS,
+                                    Value *RHS, const Query &Q,
+                                    unsigned MaxRecurse) {
   // Recursion is always used, so bail out at once if we already hit the limit.
   if (!MaxRecurse--)
     return nullptr;
@@ -451,8 +452,9 @@ static Value *ThreadCmpOverSelect(CmpInst::Predicate Pred, Value *LHS,
 /// try to simplify the binop by seeing whether evaluating it on the incoming
 /// phi values yields the same result for every value. If so returns the common
 /// value, otherwise returns null.
-static Value *ThreadBinOpOverPHI(unsigned Opcode, Value *LHS, Value *RHS,
-                                 const Query &Q, unsigned MaxRecurse) {
+static Value *ThreadBinOpOverPHI(Instruction::BinaryOps Opcode, Value *LHS,
+                                 Value *RHS, const Query &Q,
+                                 unsigned MaxRecurse) {
   // Recursion is always used, so bail out at once if we already hit the limit.
   if (!MaxRecurse--)
     return nullptr;
@@ -527,17 +529,26 @@ static Value *ThreadCmpOverPHI(CmpInst::Predicate Pred, Value *LHS, Value *RHS,
   return CommonValue;
 }
 
+static Constant *foldOrCommuteConstant(Instruction::BinaryOps Opcode,
+                                       Value *&Op0, Value *&Op1,
+                                       const Query &Q) {
+  if (auto *CLHS = dyn_cast<Constant>(Op0)) {
+    if (auto *CRHS = dyn_cast<Constant>(Op1))
+      return ConstantFoldBinaryOpOperands(Opcode, CLHS, CRHS, Q.DL);
+
+    // Canonicalize the constant to the RHS if this is a commutative operation.
+    if (Instruction::isCommutative(Opcode))
+      std::swap(Op0, Op1);
+  }
+  return nullptr;
+}
+
 /// Given operands for an Add, see if we can fold the result.
 /// If not, this returns null.
 static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
                               const Query &Q, unsigned MaxRecurse) {
-  if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
-    if (Constant *CRHS = dyn_cast<Constant>(Op1))
-      return ConstantFoldBinaryOpOperands(Instruction::Add, CLHS, CRHS, Q.DL);
-
-    // Canonicalize the constant to the RHS.
-    std::swap(Op0, Op1);
-  }
+  if (Constant *C = foldOrCommuteConstant(Instruction::Add, Op0, Op1, Q))
+    return C;
 
   // X + undef -> undef
   if (match(Op1, m_Undef()))
@@ -556,12 +567,20 @@ static Value *SimplifyAddInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
     return Y;
 
   // X + ~X -> -1   since   ~X = -X-1
+  Type *Ty = Op0->getType();
   if (match(Op0, m_Not(m_Specific(Op1))) ||
       match(Op1, m_Not(m_Specific(Op0))))
-    return Constant::getAllOnesValue(Op0->getType());
+    return Constant::getAllOnesValue(Ty);
+
+  // add nsw/nuw (xor Y, signbit), signbit --> Y
+  // The no-wrapping add guarantees that the top bit will be set by the add.
+  // Therefore, the xor must be clearing the already set sign bit of Y.
+  if ((isNSW || isNUW) && match(Op1, m_SignBit()) &&
+      match(Op0, m_Xor(m_Value(Y), m_SignBit())))
+    return Y;
 
   /// i1 add -> xor.
-  if (MaxRecurse && Op0->getType()->isIntegerTy(1))
+  if (MaxRecurse && Op0->getType()->getScalarType()->isIntegerTy(1))
     if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1))
       return V;
 
@@ -665,9 +684,8 @@ static Constant *computePointerDifference(const DataLayout &DL, Value *LHS,
 /// If not, this returns null.
 static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
                               const Query &Q, unsigned MaxRecurse) {
-  if (Constant *CLHS = dyn_cast<Constant>(Op0))
-    if (Constant *CRHS = dyn_cast<Constant>(Op1))
-      return ConstantFoldBinaryOpOperands(Instruction::Sub, CLHS, CRHS, Q.DL);
+  if (Constant *C = foldOrCommuteConstant(Instruction::Sub, Op0, Op1, Q))
+    return C;
 
   // X - undef -> undef
   // undef - X -> undef
@@ -692,7 +710,7 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
     APInt KnownZero(BitWidth, 0);
     APInt KnownOne(BitWidth, 0);
     computeKnownBits(Op1, KnownZero, KnownOne, Q.DL, 0, Q.AC, Q.CxtI, Q.DT);
-    if (KnownZero == ~APInt::getSignBit(BitWidth)) {
+    if (KnownZero.isMaxSignedValue()) {
       // Op1 is either 0 or the minimum signed value. If the sub is NSW, then
       // Op1 must be 0 because negating the minimum signed value is undefined.
       if (isNSW)
@@ -779,7 +797,7 @@ static Value *SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
       return ConstantExpr::getIntegerCast(Result, Op0->getType(), true);
 
   // i1 sub -> xor.
-  if (MaxRecurse && Op0->getType()->isIntegerTy(1))
+  if (MaxRecurse && Op0->getType()->getScalarType()->isIntegerTy(1))
     if (Value *V = SimplifyXorInst(Op0, Op1, Q, MaxRecurse-1))
       return V;
 
@@ -807,13 +825,8 @@ Value *llvm::SimplifySubInst(Value *Op0, Value *Op1, bool isNSW, bool isNUW,
 /// returns null.
 static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
                               const Query &Q, unsigned MaxRecurse) {
-  if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
-    if (Constant *CRHS = dyn_cast<Constant>(Op1))
-      return ConstantFoldBinaryOpOperands(Instruction::FAdd, CLHS, CRHS, Q.DL);
-
-    // Canonicalize the constant to the RHS.
-    std::swap(Op0, Op1);
-  }
+  if (Constant *C = foldOrCommuteConstant(Instruction::FAdd, Op0, Op1, Q))
+    return C;
 
   // fadd X, -0 ==> X
   if (match(Op1, m_NegZero()))
@@ -846,10 +859,8 @@ static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
 /// returns null.
 static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF,
                               const Query &Q, unsigned MaxRecurse) {
-  if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
-    if (Constant *CRHS = dyn_cast<Constant>(Op1))
-      return ConstantFoldBinaryOpOperands(Instruction::FSub, CLHS, CRHS, Q.DL);
-  }
+  if (Constant *C = foldOrCommuteConstant(Instruction::FSub, Op0, Op1, Q))
+    return C;
 
   // fsub X, 0 ==> X
   if (match(Op1, m_Zero()))
@@ -878,40 +889,28 @@ static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF,
 }
 
 /// Given the operands for an FMul, see if we can fold the result
-static Value *SimplifyFMulInst(Value *Op0, Value *Op1,
-                               FastMathFlags FMF,
-                               const Query &Q,
-                               unsigned MaxRecurse) {
- if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
-    if (Constant *CRHS = dyn_cast<Constant>(Op1))
-      return ConstantFoldBinaryOpOperands(Instruction::FMul, CLHS, CRHS, Q.DL);
-
-    // Canonicalize the constant to the RHS.
-    std::swap(Op0, Op1);
- }
+static Value *SimplifyFMulInst(Value *Op0, Value *Op1, FastMathFlags FMF,
+                               const Query &Q, unsigned MaxRecurse) {
+  if (Constant *C = foldOrCommuteConstant(Instruction::FMul, Op0, Op1, Q))
+    return C;
 
- // fmul X, 1.0 ==> X
- if (match(Op1, m_FPOne()))
-   return Op0;
+  // fmul X, 1.0 ==> X
+  if (match(Op1, m_FPOne()))
+    return Op0;
 
- // fmul nnan nsz X, 0 ==> 0
- if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op1, m_AnyZero()))
-   return Op1;
+  // fmul nnan nsz X, 0 ==> 0
+  if (FMF.noNaNs() && FMF.noSignedZeros() && match(Op1, m_AnyZero()))
+    return Op1;
 
- return nullptr;
+  return nullptr;
 }
 
 /// Given operands for a Mul, see if we can fold the result.
 /// If not, this returns null.
 static Value *SimplifyMulInst(Value *Op0, Value *Op1, const Query &Q,
                               unsigned MaxRecurse) {
-  if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
-    if (Constant *CRHS = dyn_cast<Constant>(Op1))
-      return ConstantFoldBinaryOpOperands(Instruction::Mul, CLHS, CRHS, Q.DL);
-
-    // Canonicalize the constant to the RHS.
-    std::swap(Op0, Op1);
-  }
+  if (Constant *C = foldOrCommuteConstant(Instruction::Mul, Op0, Op1, Q))
+    return C;
 
   // X * undef -> 0
   if (match(Op1, m_Undef()))
@@ -932,7 +931,7 @@ static Value *SimplifyMulInst(Value *Op0, Value *Op1, const Query &Q,
     return X;
 
   // i1 mul -> and.
-  if (MaxRecurse && Op0->getType()->isIntegerTy(1))
+  if (MaxRecurse && Op0->getType()->getScalarType()->isIntegerTy(1))
     if (Value *V = SimplifyAndInst(Op0, Op1, Q, MaxRecurse-1))
       return V;
 
@@ -998,43 +997,68 @@ Value *llvm::SimplifyMulInst(Value *Op0, Value *Op1, const DataLayout &DL,
                            RecursionLimit);
 }
 
-/// Given operands for an SDiv or UDiv, see if we can fold the result.
-/// If not, this returns null.
-static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
-                          const Query &Q, unsigned MaxRecurse) {
-  if (Constant *C0 = dyn_cast<Constant>(Op0))
-    if (Constant *C1 = dyn_cast<Constant>(Op1))
-      return ConstantFoldBinaryOpOperands(Opcode, C0, C1, Q.DL);
-
-  bool isSigned = Opcode == Instruction::SDiv;
+/// Check for common or similar folds of integer division or integer remainder.
+static Value *simplifyDivRem(Value *Op0, Value *Op1, bool IsDiv) {
+  Type *Ty = Op0->getType();
 
   // X / undef -> undef
+  // X % undef -> undef
   if (match(Op1, m_Undef()))
     return Op1;
 
-  // X / 0 -> undef, we don't need to preserve faults!
+  // X / 0 -> undef
+  // X % 0 -> undef
+  // We don't need to preserve faults!
   if (match(Op1, m_Zero()))
-    return UndefValue::get(Op1->getType());
+    return UndefValue::get(Ty);
+
+  // If any element of a constant divisor vector is zero, the whole op is undef.
+  auto *Op1C = dyn_cast<Constant>(Op1);
+  if (Op1C && Ty->isVectorTy()) {
+    unsigned NumElts = Ty->getVectorNumElements();
+    for (unsigned i = 0; i != NumElts; ++i) {
+      Constant *Elt = Op1C->getAggregateElement(i);
+      if (Elt && Elt->isNullValue())
+        return UndefValue::get(Ty);
+    }
+  }
 
   // undef / X -> 0
+  // undef % X -> 0
   if (match(Op0, m_Undef()))
-    return Constant::getNullValue(Op0->getType());
+    return Constant::getNullValue(Ty);
 
-  // 0 / X -> 0, we don't need to preserve faults!
+  // 0 / X -> 0
+  // 0 % X -> 0
   if (match(Op0, m_Zero()))
     return Op0;
 
+  // X / X -> 1
+  // X % X -> 0
+  if (Op0 == Op1)
+    return IsDiv ? ConstantInt::get(Ty, 1) : Constant::getNullValue(Ty);
+
   // X / 1 -> X
-  if (match(Op1, m_One()))
-    return Op0;
+  // X % 1 -> 0
+  // If this is a boolean op (single-bit element type), we can't have
+  // division-by-zero or remainder-by-zero, so assume the divisor is 1.
+  if (match(Op1, m_One()) || Ty->getScalarType()->isIntegerTy(1))
+    return IsDiv ? Op0 : Constant::getNullValue(Ty);
 
-  if (Op0->getType()->isIntegerTy(1))
-    // It can't be division by zero, hence it must be division by one.
-    return Op0;
+  return nullptr;
+}
 
-  // X / X -> 1
-  if (Op0 == Op1)
-    return ConstantInt::get(Op0->getType(), 1);
+/// Given operands for an SDiv or UDiv, see if we can fold the result.
+/// If not, this returns null.
+static Value *SimplifyDiv(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
+                          const Query &Q, unsigned MaxRecurse) {
+  if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q))
+    return C;
+
+  if (Value *V = simplifyDivRem(Op0, Op1, true))
+    return V;
+
+  bool isSigned = Opcode == Instruction::SDiv;
 
   // (X * Y) / Y -> X if the multiplication does not overflow.
   Value *X = nullptr, *Y = nullptr;
@@ -1129,6 +1153,9 @@ Value *llvm::SimplifyUDivInst(Value *Op0, Value *Op1, const DataLayout &DL,
 
 static Value *SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF,
                                const Query &Q, unsigned) {
+  if (Constant *C = foldOrCommuteConstant(Instruction::FDiv, Op0, Op1, Q))
+    return C;
+
   // undef / X -> undef    (the undef could be a snan).
   if (match(Op0, m_Undef()))
     return Op0;
@@ -1178,37 +1205,11 @@ Value *llvm::SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF,
 /// If not, this returns null.
 static Value *SimplifyRem(Instruction::BinaryOps Opcode, Value *Op0, Value *Op1,
                           const Query &Q, unsigned MaxRecurse) {
-  if (Constant *C0 = dyn_cast<Constant>(Op0))
-    if (Constant *C1 = dyn_cast<Constant>(Op1))
-      return ConstantFoldBinaryOpOperands(Opcode, C0, C1, Q.DL);
+  if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q))
+    return C;
 
-  // X % undef -> undef
-  if (match(Op1, m_Undef()))
-    return Op1;
-
-  // undef % X -> 0
-  if (match(Op0, m_Undef()))
-    return Constant::getNullValue(Op0->getType());
-
-  // 0 % X -> 0, we don't need to preserve faults!
-  if (match(Op0, m_Zero()))
-    return Op0;
-
-  // X % 0 -> undef, we don't need to preserve faults!
-  if (match(Op1, m_Zero()))
-    return UndefValue::get(Op0->getType());
-
-  // X % 1 -> 0
-  if (match(Op1, m_One()))
-    return Constant::getNullValue(Op0->getType());
-
-  if (Op0->getType()->isIntegerTy(1))
-    // It can't be remainder by zero, hence it must be remainder by one.
-    return Constant::getNullValue(Op0->getType());
-
-  // X % X -> 0
-  if (Op0 == Op1)
-    return Constant::getNullValue(Op0->getType());
+  if (Value *V = simplifyDivRem(Op0, Op1, false))
+    return V;
 
   // (X % Y) % Y -> X % Y
   if ((Opcode == Instruction::SRem &&
@@ -1279,7 +1280,10 @@ Value *llvm::SimplifyURemInst(Value *Op0, Value *Op1, const DataLayout &DL,
 }
 
 static Value *SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF,
-                               const Query &, unsigned) {
+                               const Query &Q, unsigned) {
+  if (Constant *C = foldOrCommuteConstant(Instruction::FRem, Op0, Op1, Q))
+    return C;
+
   // undef % X -> undef    (the undef could be a snan).
   if (match(Op0, m_Undef()))
     return Op0;
@@ -1335,11 +1339,10 @@ static bool isUndefShift(Value *Amount) {
 
 /// Given operands for an Shl, LShr or AShr, see if we can fold the result.
 /// If not, this returns null.
-static Value *SimplifyShift(unsigned Opcode, Value *Op0, Value *Op1,
-                            const Query &Q, unsigned MaxRecurse) {
-  if (Constant *C0 = dyn_cast<Constant>(Op0))
-    if (Constant *C1 = dyn_cast<Constant>(Op1))
-      return ConstantFoldBinaryOpOperands(Opcode, C0, C1, Q.DL);
+static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0,
+                            Value *Op1, const Query &Q, unsigned MaxRecurse) {
+  if (Constant *C = foldOrCommuteConstant(Opcode, Op0, Op1, Q))
+    return C;
 
   // 0 shift by X -> 0
   if (match(Op0, m_Zero()))
@@ -1386,8 +1389,8 @@ static Value *SimplifyShift(unsigned Opcode, Value *Op0, Value *Op1,
 
 /// \brief Given operands for an Shl, LShr or AShr, see if we can
 /// fold the result.  If not, this returns null.
-static Value *SimplifyRightShift(unsigned Opcode, Value *Op0, Value *Op1,
-                                 bool isExact, const Query &Q,
+static Value *SimplifyRightShift(Instruction::BinaryOps Opcode, Value *Op0,
+                                 Value *Op1, bool isExact, const Query &Q,
                                  unsigned MaxRecurse) {
   if (Value *V = SimplifyShift(Opcode, Op0, Op1, Q, MaxRecurse))
     return V;
@@ -1636,13 +1639,8 @@ static Value *SimplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
 /// If not, this returns null.
 static Value *SimplifyAndInst(Value *Op0, Value *Op1, const Query &Q,
                               unsigned MaxRecurse) {
-  if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
-    if (Constant *CRHS = dyn_cast<Constant>(Op1))
-      return ConstantFoldBinaryOpOperands(Instruction::And, CLHS, CRHS, Q.DL);
-
-    // Canonicalize the constant to the RHS.
-    std::swap(Op0, Op1);
-  }
+  if (Constant *C = foldOrCommuteConstant(Instruction::And, Op0, Op1, Q))
+    return C;
 
   // X & undef -> 0
   if (match(Op1, m_Undef()))
@@ -1838,13 +1836,8 @@ static Value *SimplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1) {
 /// If not, this returns null.
 static Value *SimplifyOrInst(Value *Op0, Value *Op1, const Query &Q,
                              unsigned MaxRecurse) {
-  if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
-    if (Constant *CRHS = dyn_cast<Constant>(Op1))
-      return ConstantFoldBinaryOpOperands(Instruction::Or, CLHS, CRHS, Q.DL);
-
-    // Canonicalize the constant to the RHS.
-    std::swap(Op0, Op1);
-  }
+  if (Constant *C = foldOrCommuteConstant(Instruction::Or, Op0, Op1, Q))
+    return C;
 
   // X | undef -> -1
   if (match(Op1, m_Undef()))
@@ -1971,13 +1964,8 @@ Value *llvm::SimplifyOrInst(Value *Op0, Value *Op1, const DataLayout &DL,
 /// If not, this returns null.
 static Value *SimplifyXorInst(Value *Op0, Value *Op1, const Query &Q,
                               unsigned MaxRecurse) {
-  if (Constant *CLHS = dyn_cast<Constant>(Op0)) {
-    if (Constant *CRHS = dyn_cast<Constant>(Op1))
-      return ConstantFoldBinaryOpOperands(Instruction::Xor, CLHS, CRHS, Q.DL);
-
-    // Canonicalize the constant to the RHS.
-    std::swap(Op0, Op1);
-  }
+  if (Constant *C = foldOrCommuteConstant(Instruction::Xor, Op0, Op1, Q))
+    return C;
 
   // A ^ undef -> undef
   if (match(Op1, m_Undef()))
@@ -3119,8 +3107,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
   // If both operands have range metadata, use the metadata
   // to simplify the comparison.
   if (isa<Instruction>(RHS) && isa<Instruction>(LHS)) {
-    auto RHS_Instr = dyn_cast<Instruction>(RHS);
-    auto LHS_Instr = dyn_cast<Instruction>(LHS);
+    auto RHS_Instr = cast<Instruction>(RHS);
+    auto LHS_Instr = cast<Instruction>(LHS);
 
     if (RHS_Instr->getMetadata(LLVMContext::MD_range) &&
         LHS_Instr->getMetadata(LLVMContext::MD_range)) {
@@ -4094,6 +4082,62 @@ Value *llvm::SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty,
                             RecursionLimit);
 }
 
+static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask,
+                                        Type *RetTy, const Query &Q,
+                                        unsigned MaxRecurse) {
+  Type *InVecTy = Op0->getType(); // also used for Op1 (same type assumed)
+  unsigned MaskNumElts = Mask->getType()->getVectorNumElements();
+  unsigned InVecNumElts = InVecTy->getVectorNumElements();
+
+  auto *Op0Const = dyn_cast<Constant>(Op0);
+  auto *Op1Const = dyn_cast<Constant>(Op1);
+
+  // If all operands are constant, constant fold the shuffle.
+  if (Op0Const && Op1Const)
+    return ConstantFoldShuffleVectorInstruction(Op0Const, Op1Const, Mask);
+
+  // If only one of the operands is constant, constant fold the shuffle if the
+  // mask does not select elements from the variable operand.
+  bool MaskSelects0 = false, MaskSelects1 = false;
+  for (unsigned i = 0; i != MaskNumElts; ++i) {
+    int Idx = ShuffleVectorInst::getMaskValue(Mask, i);
+    if (Idx == -1) // presumably an undef mask element; selects nothing
+      continue;
+    if ((unsigned)Idx < InVecNumElts) // index falls in Op0's range
+      MaskSelects0 = true;
+    else // any larger index addresses Op1
+      MaskSelects1 = true;
+  }
+  if (!MaskSelects0 && Op1Const) // Op0 is never read: swap in undef
+    return ConstantFoldShuffleVectorInstruction(UndefValue::get(InVecTy),
+                                                Op1Const, Mask);
+  if (!MaskSelects1 && Op0Const) // Op1 is never read: swap in undef
+    return ConstantFoldShuffleVectorInstruction(Op0Const,
+                                                UndefValue::get(InVecTy), Mask);
+
+  // A shuffle of a splat is always the splat itself. Legal if the shuffle's
+  // value type is the same as the input vectors' type.
+  if (auto *OpShuf = dyn_cast<ShuffleVectorInst>(Op0))
+    if (!MaskSelects1 && RetTy == InVecTy &&
+        OpShuf->getMask()->getSplatValue())
+      return Op0;
+  if (auto *OpShuf = dyn_cast<ShuffleVectorInst>(Op1))
+    if (!MaskSelects0 && RetTy == InVecTy &&
+        OpShuf->getMask()->getSplatValue())
+      return Op1;
+
+  return nullptr;
+}
+
+/// Given operands for a ShuffleVectorInst, fold the result or return null.
+Value *llvm::SimplifyShuffleVectorInst(
+    Value *Op0, Value *Op1, Constant *Mask, Type *RetTy,
+    const DataLayout &DL, const TargetLibraryInfo *TLI, const DominatorTree *DT,
+    AssumptionCache *AC, const Instruction *CxtI) {
+  return ::SimplifyShuffleVectorInst(
+      Op0, Op1, Mask, RetTy, Query(DL, TLI, DT, AC, CxtI), RecursionLimit);
+}
+
 //=== Helper functions for higher up the class hierarchy.
 
 /// Given operands for a BinaryOperator, see if we can fold the result.
@@ -4102,61 +4146,43 @@ static Value *SimplifyBinOp(unsigned Opcode, Value *LHS, Value *RHS,
                             const Query &Q, unsigned MaxRecurse) {
   switch (Opcode) {
   case Instruction::Add:
-    return SimplifyAddInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false,
-                           Q, MaxRecurse);
+    return SimplifyAddInst(LHS, RHS, false, false, Q, MaxRecurse);
   case Instruction::FAdd:
     return SimplifyFAddInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
-
   case Instruction::Sub:
-    return SimplifySubInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false,
-                           Q, MaxRecurse);
+    return SimplifySubInst(LHS, RHS, false, false, Q, MaxRecurse);
   case Instruction::FSub:
     return SimplifyFSubInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
-
-  case Instruction::Mul:  return SimplifyMulInst (LHS, RHS, Q, MaxRecurse);
+  case Instruction::Mul:
+    return SimplifyMulInst(LHS, RHS, Q, MaxRecurse);
   case Instruction::FMul:
-    return SimplifyFMulInst (LHS, RHS, FastMathFlags(), Q, MaxRecurse);
-  case Instruction::SDiv: return SimplifySDivInst(LHS, RHS, Q, MaxRecurse);
-  case Instruction::UDiv: return SimplifyUDivInst(LHS, RHS, Q, MaxRecurse);
+    return SimplifyFMulInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
+  case Instruction::SDiv:
+    return SimplifySDivInst(LHS, RHS, Q, MaxRecurse);
+  case Instruction::UDiv:
+    return SimplifyUDivInst(LHS, RHS, Q, MaxRecurse);
   case Instruction::FDiv:
-      return SimplifyFDivInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
-  case Instruction::SRem: return SimplifySRemInst(LHS, RHS, Q, MaxRecurse);
-  case Instruction::URem: return SimplifyURemInst(LHS, RHS, Q, MaxRecurse);
+    return SimplifyFDivInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
+  case Instruction::SRem:
+    return SimplifySRemInst(LHS, RHS, Q, MaxRecurse);
+  case Instruction::URem:
+    return SimplifyURemInst(LHS, RHS, Q, MaxRecurse);
   case Instruction::FRem:
-      return SimplifyFRemInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
+    return SimplifyFRemInst(LHS, RHS, FastMathFlags(), Q, MaxRecurse);
   case Instruction::Shl:
-    return SimplifyShlInst(LHS, RHS, /*isNSW*/false, /*isNUW*/false,
-                           Q, MaxRecurse);
+    return SimplifyShlInst(LHS, RHS, false, false, Q, MaxRecurse);
   case Instruction::LShr:
-    return SimplifyLShrInst(LHS, RHS, /*isExact*/false, Q, MaxRecurse);
+    return SimplifyLShrInst(LHS, RHS, false, Q, MaxRecurse);
   case Instruction::AShr:
-    return SimplifyAShrInst(LHS, RHS, /*isExact*/false, Q, MaxRecurse);
-  case Instruction::And: return SimplifyAndInst(LHS, RHS, Q, MaxRecurse);
-  case Instruction::Or:  return SimplifyOrInst (LHS, RHS, Q, MaxRecurse);
-  case Instruction::Xor: return SimplifyXorInst(LHS, RHS, Q, MaxRecurse);
+    return SimplifyAShrInst(LHS, RHS, false, Q, MaxRecurse);
+  case Instruction::And:
+    return SimplifyAndInst(LHS, RHS, Q, MaxRecurse);
+  case Instruction::Or:
+    return SimplifyOrInst(LHS, RHS, Q, MaxRecurse);
+  case Instruction::Xor:
+    return SimplifyXorInst(LHS, RHS, Q, MaxRecurse);
   default:
-    if (Constant *CLHS = dyn_cast<Constant>(LHS))
-      if (Constant *CRHS = dyn_cast<Constant>(RHS))
-        return ConstantFoldBinaryOpOperands(Opcode, CLHS, CRHS, Q.DL);
-
-    // If the operation is associative, try some generic simplifications.
-    if (Instruction::isAssociative(Opcode))
-      if (Value *V = SimplifyAssociativeBinOp(Opcode, LHS, RHS, Q, MaxRecurse))
-        return V;
-
-    // If the operation is with the result of a select instruction check whether
-    // operating on either branch of the select always yields the same value.
-    if (isa<SelectInst>(LHS) || isa<SelectInst>(RHS))
-      if (Value *V = ThreadBinOpOverSelect(Opcode, LHS, RHS, Q, MaxRecurse))
-        return V;
-
-    // If the operation is with the result of a phi instruction, check whether
-    // operating on all incoming values of the phi always yields the same value.
-    if (isa<PHINode>(LHS) || isa<PHINode>(RHS))
-      if (Value *V = ThreadBinOpOverPHI(Opcode, LHS, RHS, Q, MaxRecurse))
-        return V;
-
-    return nullptr;
+    llvm_unreachable("Unexpected opcode");
   }
 }
 
@@ -4452,7 +4478,8 @@ Value *llvm::SimplifyCall(Value *V, ArrayRef<Value *> Args,
 /// If not, this returns null.
 Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout &DL,
                                  const TargetLibraryInfo *TLI,
-                                 const DominatorTree *DT, AssumptionCache *AC) {
+                                 const DominatorTree *DT, AssumptionCache *AC,
+                                 OptimizationRemarkEmitter *ORE) {
   Value *Result;
 
   switch (I->getOpcode()) {
@@ -4578,6 +4605,13 @@ Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout &DL,
         EEI->getVectorOperand(), EEI->getIndexOperand(), DL, TLI, DT, AC, I);
     break;
   }
+  case Instruction::ShuffleVector: {
+    auto *SVI = cast<ShuffleVectorInst>(I);
+    Result = SimplifyShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1),
+                                       SVI->getMask(), SVI->getType(), DL, TLI,
+                                       DT, AC, I);
+    break;
+  }
   case Instruction::PHI:
     Result = SimplifyPHINode(cast<PHINode>(I), Query(DL, TLI, DT, AC, I));
     break;
@@ -4601,7 +4635,7 @@ Value *llvm::SimplifyInstruction(Instruction *I, const DataLayout &DL,
     unsigned BitWidth = I->getType()->getScalarSizeInBits();
     APInt KnownZero(BitWidth, 0);
     APInt KnownOne(BitWidth, 0);
-    computeKnownBits(I, KnownZero, KnownOne, DL, /*Depth*/0, AC, I, DT);
+    computeKnownBits(I, KnownZero, KnownOne, DL, /*Depth*/0, AC, I, DT, ORE);
     if ((KnownZero | KnownOne).isAllOnesValue())
       Result = ConstantInt::get(I->getType(), KnownOne);
   }
diff --git a/lib/Analysis/IteratedDominanceFrontier.cpp b/lib/Analysis/IteratedDominanceFrontier.cpp
index d1374acd963eccf991f76706b08dab04ae876a4c..2a736ec0379ca29e69b688acb9f9696fe8461548 100644
--- a/lib/Analysis/IteratedDominanceFrontier.cpp
+++ b/lib/Analysis/IteratedDominanceFrontier.cpp
@@ -64,10 +64,7 @@ void IDFCalculator<NodeTy>::calculate(
       BasicBlock *BB = Node->getBlock();
       // Succ is the successor in the direction we are calculating IDF, so it is
       // successor for IDF, and predecessor for Reverse IDF.
-      for (auto SuccIter = GraphTraits<NodeTy>::child_begin(BB),
-                End = GraphTraits<NodeTy>::child_end(BB);
-           SuccIter != End; ++SuccIter) {
-        BasicBlock *Succ = *SuccIter;
+      for (auto *Succ : children<NodeTy>(BB)) {
         DomTreeNode *SuccNode = DT.getNode(Succ);
 
         // Quickly skip all CFG edges that are also dominator tree edges instead
diff --git a/lib/Analysis/LazyBlockFrequencyInfo.cpp b/lib/Analysis/LazyBlockFrequencyInfo.cpp
index 596b6fc1afb5f5f811234c03a505bc97e82608dd..a8178ecc0a24914f16051d2b9244bb5eb1b0826d 100644
--- a/lib/Analysis/LazyBlockFrequencyInfo.cpp
+++ b/lib/Analysis/LazyBlockFrequencyInfo.cpp
@@ -9,7 +9,7 @@
 //
 // This is an alternative analysis pass to BlockFrequencyInfoWrapperPass.  The
 // difference is that with this pass the block frequencies are not computed when
-// the analysis pass is executed but rather when the BFI results is explicitly
+// the analysis pass is executed but rather when the BFI result is explicitly
 // requested by the analysis client.
 //
 //===----------------------------------------------------------------------===//
diff --git a/lib/Analysis/LazyCallGraph.cpp b/lib/Analysis/LazyCallGraph.cpp
index 43f47ccc73737a1b7fb110a4ea30106946df41c9..eef56815f2e07f515b201a76da8bd9683effdade 100644
--- a/lib/Analysis/LazyCallGraph.cpp
+++ b/lib/Analysis/LazyCallGraph.cpp
@@ -18,26 +18,50 @@
 #include "llvm/IR/PassManager.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/GraphWriter.h"
+#include <utility>
 
 using namespace llvm;
 
 #define DEBUG_TYPE "lcg"
 
+void LazyCallGraph::EdgeSequence::insertEdgeInternal(Node &TargetN,
+                                                     Edge::Kind EK) {
+  EdgeIndexMap.insert({&TargetN, Edges.size()}); // index of the edge below
+  Edges.emplace_back(TargetN, EK); // NOTE: assumes TargetN had no edge yet
+}
+
+void LazyCallGraph::EdgeSequence::setEdgeKind(Node &TargetN, Edge::Kind EK) {
+  Edges[EdgeIndexMap.find(&TargetN)->second].setKind(EK); // edge must exist
+}
+
+bool LazyCallGraph::EdgeSequence::removeEdgeInternal(Node &TargetN) {
+  auto IndexMapI = EdgeIndexMap.find(&TargetN);
+  if (IndexMapI == EdgeIndexMap.end())
+    return false; // no edge to TargetN; nothing removed
+
+  Edges[IndexMapI->second] = Edge(); // blank the slot; other indices stay put
+  EdgeIndexMap.erase(IndexMapI);
+  return true;
+}
+
 static void addEdge(SmallVectorImpl<LazyCallGraph::Edge> &Edges,
-                    DenseMap<Function *, int> &EdgeIndexMap, Function &F,
-                    LazyCallGraph::Edge::Kind EK) {
-  if (!EdgeIndexMap.insert({&F, Edges.size()}).second)
+                    DenseMap<LazyCallGraph::Node *, int> &EdgeIndexMap,
+                    LazyCallGraph::Node &N, LazyCallGraph::Edge::Kind EK) {
+  if (!EdgeIndexMap.insert({&N, Edges.size()}).second)
     return;
 
-  DEBUG(dbgs() << "    Added callable function: " << F.getName() << "\n");
-  Edges.emplace_back(LazyCallGraph::Edge(F, EK));
+  DEBUG(dbgs() << "    Added callable function: " << N.getName() << "\n");
+  Edges.emplace_back(LazyCallGraph::Edge(N, EK));
 }
 
-LazyCallGraph::Node::Node(LazyCallGraph &G, Function &F)
-    : G(&G), F(F), DFSNumber(0), LowLink(0) {
-  DEBUG(dbgs() << "  Adding functions called by '" << F.getName()
+LazyCallGraph::EdgeSequence &LazyCallGraph::Node::populateSlow() {
+  assert(!Edges && "Must not have already populated the edges for this node!");
+
+  DEBUG(dbgs() << "  Adding functions called by '" << getName()
                << "' to the graph.\n");
 
+  Edges = EdgeSequence();
+
   SmallVector<Constant *, 16> Worklist;
   SmallPtrSet<Function *, 4> Callees;
   SmallPtrSet<Constant *, 16> Visited;
@@ -58,14 +82,15 @@ LazyCallGraph::Node::Node(LazyCallGraph &G, Function &F)
   // alias. Then a test of the address of the weak function against the new
   // strong definition's address would be an effective way to determine the
   // safety of optimizing a direct call edge.
-  for (BasicBlock &BB : F)
+  for (BasicBlock &BB : *F)
     for (Instruction &I : BB) {
       if (auto CS = CallSite(&I))
         if (Function *Callee = CS.getCalledFunction())
           if (!Callee->isDeclaration())
             if (Callees.insert(Callee).second) {
               Visited.insert(Callee);
-              addEdge(Edges, EdgeIndexMap, *Callee, LazyCallGraph::Edge::Call);
+              addEdge(Edges->Edges, Edges->EdgeIndexMap, G->get(*Callee),
+                      LazyCallGraph::Edge::Call);
             }
 
       for (Value *Op : I.operand_values())
@@ -78,34 +103,16 @@ LazyCallGraph::Node::Node(LazyCallGraph &G, Function &F)
   // function containing) operands to all of the instructions in the function.
   // Process them (recursively) collecting every function found.
   visitReferences(Worklist, Visited, [&](Function &F) {
-    addEdge(Edges, EdgeIndexMap, F, LazyCallGraph::Edge::Ref);
+    addEdge(Edges->Edges, Edges->EdgeIndexMap, G->get(F),
+            LazyCallGraph::Edge::Ref);
   });
-}
-
-void LazyCallGraph::Node::insertEdgeInternal(Function &Target, Edge::Kind EK) {
-  if (Node *N = G->lookup(Target))
-    return insertEdgeInternal(*N, EK);
-
-  EdgeIndexMap.insert({&Target, Edges.size()});
-  Edges.emplace_back(Target, EK);
-}
 
-void LazyCallGraph::Node::insertEdgeInternal(Node &TargetN, Edge::Kind EK) {
-  EdgeIndexMap.insert({&TargetN.getFunction(), Edges.size()});
-  Edges.emplace_back(TargetN, EK);
+  return *Edges;
 }
 
-void LazyCallGraph::Node::setEdgeKind(Function &TargetF, Edge::Kind EK) {
-  Edges[EdgeIndexMap.find(&TargetF)->second].setKind(EK);
-}
-
-void LazyCallGraph::Node::removeEdgeInternal(Function &Target) {
-  auto IndexMapI = EdgeIndexMap.find(&Target);
-  assert(IndexMapI != EdgeIndexMap.end() &&
-         "Target not in the edge set for this caller?");
-
-  Edges[IndexMapI->second] = Edge();
-  EdgeIndexMap.erase(IndexMapI);
+void LazyCallGraph::Node::replaceFunction(Function &NewF) {
+  assert(F != &NewF && "Must not replace a function with itself!");
+  F = &NewF; // only the function pointer is updated here
+}
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -114,16 +121,15 @@ LLVM_DUMP_METHOD void LazyCallGraph::Node::dump() const {
 }
 #endif
 
-LazyCallGraph::LazyCallGraph(Module &M) : NextDFSNumber(0) {
+LazyCallGraph::LazyCallGraph(Module &M) {
   DEBUG(dbgs() << "Building CG for module: " << M.getModuleIdentifier()
                << "\n");
   for (Function &F : M)
-    if (!F.isDeclaration() && !F.hasLocalLinkage())
-      if (EntryIndexMap.insert({&F, EntryEdges.size()}).second) {
-        DEBUG(dbgs() << "  Adding '" << F.getName()
-                     << "' to entry set of the graph.\n");
-        EntryEdges.emplace_back(F, Edge::Ref);
-      }
+    if (!F.isDeclaration() && !F.hasLocalLinkage()) {
+      DEBUG(dbgs() << "  Adding '" << F.getName()
+                   << "' to entry set of the graph.\n");
+      addEdge(EntryEdges.Edges, EntryEdges.EdgeIndexMap, get(F), Edge::Ref);
+    }
 
   // Now add entry nodes for functions reachable via initializers to globals.
   SmallVector<Constant *, 16> Worklist;
@@ -136,21 +142,15 @@ LazyCallGraph::LazyCallGraph(Module &M) : NextDFSNumber(0) {
   DEBUG(dbgs() << "  Adding functions referenced by global initializers to the "
                   "entry set.\n");
   visitReferences(Worklist, Visited, [&](Function &F) {
-    addEdge(EntryEdges, EntryIndexMap, F, LazyCallGraph::Edge::Ref);
+    addEdge(EntryEdges.Edges, EntryEdges.EdgeIndexMap, get(F),
+            LazyCallGraph::Edge::Ref);
   });
-
-  for (const Edge &E : EntryEdges)
-    RefSCCEntryNodes.push_back(&E.getFunction());
 }
 
 LazyCallGraph::LazyCallGraph(LazyCallGraph &&G)
     : BPA(std::move(G.BPA)), NodeMap(std::move(G.NodeMap)),
-      EntryEdges(std::move(G.EntryEdges)),
-      EntryIndexMap(std::move(G.EntryIndexMap)), SCCBPA(std::move(G.SCCBPA)),
-      SCCMap(std::move(G.SCCMap)), LeafRefSCCs(std::move(G.LeafRefSCCs)),
-      DFSStack(std::move(G.DFSStack)),
-      RefSCCEntryNodes(std::move(G.RefSCCEntryNodes)),
-      NextDFSNumber(G.NextDFSNumber) {
+      EntryEdges(std::move(G.EntryEdges)), SCCBPA(std::move(G.SCCBPA)),
+      SCCMap(std::move(G.SCCMap)), LeafRefSCCs(std::move(G.LeafRefSCCs)) {
   updateGraphPtrs();
 }
 
@@ -158,13 +158,9 @@ LazyCallGraph &LazyCallGraph::operator=(LazyCallGraph &&G) {
   BPA = std::move(G.BPA);
   NodeMap = std::move(G.NodeMap);
   EntryEdges = std::move(G.EntryEdges);
-  EntryIndexMap = std::move(G.EntryIndexMap);
   SCCBPA = std::move(G.SCCBPA);
   SCCMap = std::move(G.SCCMap);
   LeafRefSCCs = std::move(G.LeafRefSCCs);
-  DFSStack = std::move(G.DFSStack);
-  RefSCCEntryNodes = std::move(G.RefSCCEntryNodes);
-  NextDFSNumber = G.NextDFSNumber;
   updateGraphPtrs();
   return *this;
 }
@@ -188,8 +184,8 @@ void LazyCallGraph::SCC::verify() {
            "Must set DFS numbers to -1 when adding a node to an SCC!");
     assert(N->LowLink == -1 &&
            "Must set low link to -1 when adding a node to an SCC!");
-    for (Edge &E : *N)
-      assert(E.getNode() && "Can't have an edge to a raw function!");
+    for (Edge &E : **N)
+      assert(E.getNode() && "Can't have an unpopulated node!");
   }
 }
 #endif
@@ -199,10 +195,9 @@ bool LazyCallGraph::SCC::isParentOf(const SCC &C) const {
     return false;
 
   for (Node &N : *this)
-    for (Edge &E : N.calls())
-      if (Node *CalleeN = E.getNode())
-        if (OuterRefSCC->G->lookupSCC(*CalleeN) == &C)
-          return true;
+    for (Edge &E : N->calls())
+      if (OuterRefSCC->G->lookupSCC(E.getNode()) == &C)
+        return true;
 
   // No edges found.
   return false;
@@ -222,11 +217,8 @@ bool LazyCallGraph::SCC::isAncestorOf(const SCC &TargetC) const {
   do {
     const SCC &C = *Worklist.pop_back_val();
     for (Node &N : C)
-      for (Edge &E : N.calls()) {
-        Node *CalleeN = E.getNode();
-        if (!CalleeN)
-          continue;
-        SCC *CalleeC = G.lookupSCC(*CalleeN);
+      for (Edge &E : N->calls()) {
+        SCC *CalleeC = G.lookupSCC(E.getNode());
         if (!CalleeC)
           continue;
 
@@ -285,10 +277,10 @@ void LazyCallGraph::RefSCC::verify() {
   for (int i = 0, Size = SCCs.size(); i < Size; ++i) {
     SCC &SourceSCC = *SCCs[i];
     for (Node &N : SourceSCC)
-      for (Edge &E : N) {
+      for (Edge &E : *N) {
         if (!E.isCall())
           continue;
-        SCC &TargetSCC = *G->lookupSCC(*E.getNode());
+        SCC &TargetSCC = *G->lookupSCC(E.getNode());
         if (&TargetSCC.getOuterRefSCC() == this) {
           assert(SCCIndices.find(&TargetSCC)->second <= i &&
                  "Edge between SCCs violates post-order relationship.");
@@ -305,8 +297,8 @@ void LazyCallGraph::RefSCC::verify() {
     auto HasConnectingEdge = [&] {
       for (SCC &C : *ParentRC)
         for (Node &N : C)
-          for (Edge &E : N)
-            if (G->lookupRefSCC(*E.getNode()) == this)
+          for (Edge &E : *N)
+            if (G->lookupRefSCC(E.getNode()) == this)
               return true;
       return false;
     };
@@ -467,7 +459,7 @@ updatePostorderSequenceForEdgeInsertion(
 
 SmallVector<LazyCallGraph::SCC *, 1>
 LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
-  assert(!SourceN[TargetN].isCall() && "Must start with a ref edge!");
+  assert(!(*SourceN)[TargetN].isCall() && "Must start with a ref edge!");
   SmallVector<SCC *, 1> DeletedSCCs;
 
 #ifndef NDEBUG
@@ -483,7 +475,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
   // If the two nodes are already part of the same SCC, we're also done as
   // we've just added more connectivity.
   if (&SourceSCC == &TargetSCC) {
-    SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
+    SourceN->setEdgeKind(TargetN, Edge::Call);
     return DeletedSCCs;
   }
 
@@ -496,7 +488,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
   int SourceIdx = SCCIndices[&SourceSCC];
   int TargetIdx = SCCIndices[&TargetSCC];
   if (TargetIdx < SourceIdx) {
-    SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
+    SourceN->setEdgeKind(TargetN, Edge::Call);
     return DeletedSCCs;
   }
 
@@ -510,11 +502,9 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
     ConnectedSet.insert(&SourceSCC);
     auto IsConnected = [&](SCC &C) {
       for (Node &N : C)
-        for (Edge &E : N.calls()) {
-          assert(E.getNode() && "Must have formed a node within an SCC!");
-          if (ConnectedSet.count(G->lookupSCC(*E.getNode())))
+        for (Edge &E : N->calls())
+          if (ConnectedSet.count(G->lookupSCC(E.getNode())))
             return true;
-        }
 
       return false;
     };
@@ -541,11 +531,10 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
     do {
       SCC &C = *Worklist.pop_back_val();
       for (Node &N : C)
-        for (Edge &E : N) {
-          assert(E.getNode() && "Must have formed a node within an SCC!");
+        for (Edge &E : *N) {
           if (!E.isCall())
             continue;
-          SCC &EdgeC = *G->lookupSCC(*E.getNode());
+          SCC &EdgeC = *G->lookupSCC(E.getNode());
           if (&EdgeC.getOuterRefSCC() != this)
             // Not in this RefSCC...
             continue;
@@ -571,7 +560,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
   // new cycles. We're done.
   if (MergeRange.begin() == MergeRange.end()) {
     // Now that the SCC structure is finalized, flip the kind to call.
-    SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
+    SourceN->setEdgeKind(TargetN, Edge::Call);
     return DeletedSCCs;
   }
 
@@ -606,7 +595,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
     SCCIndices[C] -= IndexOffset;
 
   // Now that the SCC structure is finalized, flip the kind to call.
-  SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
+  SourceN->setEdgeKind(TargetN, Edge::Call);
 
   // And we're done!
   return DeletedSCCs;
@@ -614,7 +603,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToCall(Node &SourceN, Node &TargetN) {
 
 void LazyCallGraph::RefSCC::switchTrivialInternalEdgeToRef(Node &SourceN,
                                                            Node &TargetN) {
-  assert(SourceN[TargetN].isCall() && "Must start with a call edge!");
+  assert((*SourceN)[TargetN].isCall() && "Must start with a call edge!");
 
 #ifndef NDEBUG
   // In a debug build, verify the RefSCC is valid to start with and when this
@@ -631,12 +620,12 @@ void LazyCallGraph::RefSCC::switchTrivialInternalEdgeToRef(Node &SourceN,
          "Source and Target must be in separate SCCs for this to be trivial!");
 
   // Set the edge kind.
-  SourceN.setEdgeKind(TargetN.getFunction(), Edge::Ref);
+  SourceN->setEdgeKind(TargetN, Edge::Ref);
 }
 
 iterator_range<LazyCallGraph::RefSCC::iterator>
 LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
-  assert(SourceN[TargetN].isCall() && "Must start with a call edge!");
+  assert((*SourceN)[TargetN].isCall() && "Must start with a call edge!");
 
 #ifndef NDEBUG
   // In a debug build, verify the RefSCC is valid to start with and when this
@@ -656,7 +645,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
                                                 "full CG update.");
 
   // Set the edge kind.
-  SourceN.setEdgeKind(TargetN.getFunction(), Edge::Ref);
+  SourceN->setEdgeKind(TargetN, Edge::Ref);
 
   // Otherwise we are removing a call edge from a single SCC. This may break
   // the cycle. In order to compute the new set of SCCs, we need to do a small
@@ -671,7 +660,7 @@ LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
   // etc.
 
   SCC &OldSCC = TargetSCC;
-  SmallVector<std::pair<Node *, call_edge_iterator>, 16> DFSStack;
+  SmallVector<std::pair<Node *, EdgeSequence::call_iterator>, 16> DFSStack;
   SmallVector<Node *, 16> PendingSCCStack;
   SmallVector<SCC *, 4> NewSCCs;
 
@@ -712,14 +701,14 @@ LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
     RootN->DFSNumber = RootN->LowLink = 1;
     int NextDFSNumber = 2;
 
-    DFSStack.push_back({RootN, RootN->call_begin()});
+    DFSStack.push_back({RootN, (*RootN)->call_begin()});
     do {
       Node *N;
-      call_edge_iterator I;
+      EdgeSequence::call_iterator I;
       std::tie(N, I) = DFSStack.pop_back_val();
-      auto E = N->call_end();
+      auto E = (*N)->call_end();
       while (I != E) {
-        Node &ChildN = *I->getNode();
+        Node &ChildN = I->getNode();
         if (ChildN.DFSNumber == 0) {
           // We haven't yet visited this child, so descend, pushing the current
           // node onto the stack.
@@ -729,8 +718,8 @@ LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
                  "Found a node with 0 DFS number but already in an SCC!");
           ChildN.DFSNumber = ChildN.LowLink = NextDFSNumber++;
           N = &ChildN;
-          I = N->call_begin();
-          E = N->call_end();
+          I = (*N)->call_begin();
+          E = (*N)->call_end();
           continue;
         }
 
@@ -823,17 +812,19 @@ LazyCallGraph::RefSCC::switchInternalEdgeToRef(Node &SourceN, Node &TargetN) {
 
 void LazyCallGraph::RefSCC::switchOutgoingEdgeToCall(Node &SourceN,
                                                      Node &TargetN) {
-  assert(!SourceN[TargetN].isCall() && "Must start with a ref edge!");
+  assert(!(*SourceN)[TargetN].isCall() && "Must start with a ref edge!");
 
   assert(G->lookupRefSCC(SourceN) == this && "Source must be in this RefSCC.");
   assert(G->lookupRefSCC(TargetN) != this &&
          "Target must not be in this RefSCC.");
+#ifdef EXPENSIVE_CHECKS
   assert(G->lookupRefSCC(TargetN)->isDescendantOf(*this) &&
          "Target must be a descendant of the Source.");
+#endif
 
   // Edges between RefSCCs are the same regardless of call or ref, so we can
   // just flip the edge here.
-  SourceN.setEdgeKind(TargetN.getFunction(), Edge::Call);
+  SourceN->setEdgeKind(TargetN, Edge::Call);
 
 #ifndef NDEBUG
   // Check that the RefSCC is still valid.
@@ -843,17 +834,19 @@ void LazyCallGraph::RefSCC::switchOutgoingEdgeToCall(Node &SourceN,
 
 void LazyCallGraph::RefSCC::switchOutgoingEdgeToRef(Node &SourceN,
                                                     Node &TargetN) {
-  assert(SourceN[TargetN].isCall() && "Must start with a call edge!");
+  assert((*SourceN)[TargetN].isCall() && "Must start with a call edge!");
 
   assert(G->lookupRefSCC(SourceN) == this && "Source must be in this RefSCC.");
   assert(G->lookupRefSCC(TargetN) != this &&
          "Target must not be in this RefSCC.");
+#ifdef EXPENSIVE_CHECKS
   assert(G->lookupRefSCC(TargetN)->isDescendantOf(*this) &&
          "Target must be a descendant of the Source.");
+#endif
 
   // Edges between RefSCCs are the same regardless of call or ref, so we can
   // just flip the edge here.
-  SourceN.setEdgeKind(TargetN.getFunction(), Edge::Ref);
+  SourceN->setEdgeKind(TargetN, Edge::Ref);
 
 #ifndef NDEBUG
   // Check that the RefSCC is still valid.
@@ -866,7 +859,7 @@ void LazyCallGraph::RefSCC::insertInternalRefEdge(Node &SourceN,
   assert(G->lookupRefSCC(SourceN) == this && "Source must be in this RefSCC.");
   assert(G->lookupRefSCC(TargetN) == this && "Target must be in this RefSCC.");
 
-  SourceN.insertEdgeInternal(TargetN, Edge::Ref);
+  SourceN->insertEdgeInternal(TargetN, Edge::Ref);
 
 #ifndef NDEBUG
   // Check that the RefSCC is still valid.
@@ -877,14 +870,16 @@ void LazyCallGraph::RefSCC::insertInternalRefEdge(Node &SourceN,
 void LazyCallGraph::RefSCC::insertOutgoingEdge(Node &SourceN, Node &TargetN,
                                                Edge::Kind EK) {
   // First insert it into the caller.
-  SourceN.insertEdgeInternal(TargetN, EK);
+  SourceN->insertEdgeInternal(TargetN, EK);
 
   assert(G->lookupRefSCC(SourceN) == this && "Source must be in this RefSCC.");
 
   RefSCC &TargetC = *G->lookupRefSCC(TargetN);
   assert(&TargetC != this && "Target must not be in this RefSCC.");
+#ifdef EXPENSIVE_CHECKS
   assert(TargetC.isDescendantOf(*this) &&
          "Target must be a descendant of the Source.");
+#endif
 
   // The only change required is to add this SCC to the parent set of the
   // callee.
@@ -901,8 +896,10 @@ LazyCallGraph::RefSCC::insertIncomingRefEdge(Node &SourceN, Node &TargetN) {
   assert(G->lookupRefSCC(TargetN) == this && "Target must be in this RefSCC.");
   RefSCC &SourceC = *G->lookupRefSCC(SourceN);
   assert(&SourceC != this && "Source must not be in this RefSCC.");
+#ifdef EXPENSIVE_CHECKS
   assert(SourceC.isDescendantOf(*this) &&
          "Source must be a descendant of the Target.");
+#endif
 
   SmallVector<RefSCC *, 1> DeletedRefSCCs;
 
@@ -957,9 +954,8 @@ LazyCallGraph::RefSCC::insertIncomingRefEdge(Node &SourceN, Node &TargetN) {
       RefSCC &RC = *Worklist.pop_back_val();
       for (SCC &C : RC)
         for (Node &N : C)
-          for (Edge &E : N) {
-            assert(E.getNode() && "Must have formed a node!");
-            RefSCC &EdgeRC = *G->lookupRefSCC(*E.getNode());
+          for (Edge &E : *N) {
+            RefSCC &EdgeRC = *G->lookupRefSCC(E.getNode());
             if (G->getRefSCCIndex(EdgeRC) <= SourceIdx)
               // Not in the postorder sequence between source and target.
               continue;
@@ -1009,10 +1005,8 @@ LazyCallGraph::RefSCC::insertIncomingRefEdge(Node &SourceN, Node &TargetN) {
       SCCIndices[&InnerC] = SCCIndex++;
       for (Node &N : InnerC) {
         G->SCCMap[&N] = &InnerC;
-        for (Edge &E : N) {
-          assert(E.getNode() &&
-                 "Cannot have a null node within a visited SCC!");
-          RefSCC &ChildRC = *G->lookupRefSCC(*E.getNode());
+        for (Edge &E : *N) {
+          RefSCC &ChildRC = *G->lookupRefSCC(E.getNode());
           if (MergeSet.count(&ChildRC))
             continue;
           ChildRC.Parents.erase(RC);
@@ -1048,7 +1042,7 @@ LazyCallGraph::RefSCC::insertIncomingRefEdge(Node &SourceN, Node &TargetN) {
 
   // At this point we have a merged RefSCC with a post-order SCCs list, just
   // connect the nodes to form the new edge.
-  SourceN.insertEdgeInternal(TargetN, Edge::Ref);
+  SourceN->insertEdgeInternal(TargetN, Edge::Ref);
 
   // We return the list of SCCs which were merged so that callers can
   // invalidate any data they have associated with those SCCs. Note that these
@@ -1075,15 +1069,16 @@ void LazyCallGraph::RefSCC::removeOutgoingEdge(Node &SourceN, Node &TargetN) {
 #endif
 
   // First remove it from the node.
-  SourceN.removeEdgeInternal(TargetN.getFunction());
+  bool Removed = SourceN->removeEdgeInternal(TargetN);
+  (void)Removed;
+  assert(Removed && "Target not in the edge set for this caller?");
 
   bool HasOtherEdgeToChildRC = false;
   bool HasOtherChildRC = false;
   for (SCC *InnerC : SCCs) {
     for (Node &N : *InnerC) {
-      for (Edge &E : N) {
-        assert(E.getNode() && "Cannot have a missing node in a visited SCC!");
-        RefSCC &OtherChildRC = *G->lookupRefSCC(*E.getNode());
+      for (Edge &E : *N) {
+        RefSCC &OtherChildRC = *G->lookupRefSCC(E.getNode());
         if (&OtherChildRC == &TargetRC) {
           HasOtherEdgeToChildRC = true;
           break;
@@ -1122,7 +1117,7 @@ void LazyCallGraph::RefSCC::removeOutgoingEdge(Node &SourceN, Node &TargetN) {
 
 SmallVector<LazyCallGraph::RefSCC *, 1>
 LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
-  assert(!SourceN[TargetN].isCall() &&
+  assert(!(*SourceN)[TargetN].isCall() &&
          "Cannot remove a call edge, it must first be made a ref edge");
 
 #ifndef NDEBUG
@@ -1133,7 +1128,9 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
 #endif
 
   // First remove the actual edge.
-  SourceN.removeEdgeInternal(TargetN.getFunction());
+  bool Removed = SourceN->removeEdgeInternal(TargetN);
+  (void)Removed;
+  assert(Removed && "Target not in the edge set for this caller?");
 
   // We return a list of the resulting *new* RefSCCs in post-order.
   SmallVector<RefSCC *, 1> Result;
@@ -1192,7 +1189,7 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
     PostOrderMapping[&N] = Number;
   };
 
-  SmallVector<std::pair<Node *, edge_iterator>, 4> DFSStack;
+  SmallVector<std::pair<Node *, EdgeSequence::iterator>, 4> DFSStack;
   SmallVector<Node *, 4> PendingRefSCCStack;
   do {
     assert(DFSStack.empty() &&
@@ -1211,18 +1208,18 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
     RootN->DFSNumber = RootN->LowLink = 1;
     int NextDFSNumber = 2;
 
-    DFSStack.push_back({RootN, RootN->begin()});
+    DFSStack.push_back({RootN, (*RootN)->begin()});
     do {
       Node *N;
-      edge_iterator I;
+      EdgeSequence::iterator I;
       std::tie(N, I) = DFSStack.pop_back_val();
-      auto E = N->end();
+      auto E = (*N)->end();
 
       assert(N->DFSNumber != 0 && "We should always assign a DFS number "
                                   "before processing a node.");
 
       while (I != E) {
-        Node &ChildN = I->getNode(*G);
+        Node &ChildN = I->getNode();
         if (ChildN.DFSNumber == 0) {
           // Mark that we should start at this child when next this node is the
           // top of the stack. We don't start at the next child to ensure this
@@ -1232,8 +1229,8 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
           // Continue, resetting to the child node.
           ChildN.LowLink = ChildN.DFSNumber = NextDFSNumber++;
           N = &ChildN;
-          I = ChildN.begin();
-          E = ChildN.end();
+          I = ChildN->begin();
+          E = ChildN->end();
           continue;
         }
         if (ChildN.DFSNumber == -1) {
@@ -1388,9 +1385,8 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
 #endif
   for (SCC *C : SCCs)
     for (Node &N : *C) {
-      for (Edge &E : N) {
-        assert(E.getNode() && "Cannot have a missing node in a visited SCC!");
-        RefSCC &ChildRC = *G->lookupRefSCC(*E.getNode());
+      for (Edge &E : *N) {
+        RefSCC &ChildRC = *G->lookupRefSCC(E.getNode());
         if (&ChildRC == this)
           continue;
         ChildRC.Parents.insert(this);
@@ -1414,9 +1410,8 @@ LazyCallGraph::RefSCC::removeInternalRefEdge(Node &SourceN, Node &TargetN) {
   for (RefSCC *ParentRC : OldParents)
     for (SCC &ParentC : *ParentRC)
       for (Node &ParentN : ParentC)
-        for (Edge &E : ParentN) {
-          assert(E.getNode() && "Cannot have a missing node in a visited SCC!");
-          RefSCC &RC = *G->lookupRefSCC(*E.getNode());
+        for (Edge &E : *ParentN) {
+          RefSCC &RC = *G->lookupRefSCC(E.getNode());
           if (&RC != ParentRC)
             RC.Parents.insert(ParentRC);
         }
@@ -1454,8 +1449,10 @@ void LazyCallGraph::RefSCC::handleTrivialEdgeInsertion(Node &SourceN,
     return;
   }
 
+#ifdef EXPENSIVE_CHECKS
   assert(TargetRC.isDescendantOf(*this) &&
          "Target must be a descendant of the Source.");
+#endif
   // The only change required is to add this RefSCC to the parent set of the
   // target. This is a set and so idempotent if the edge already existed.
   TargetRC.Parents.insert(this);
@@ -1467,25 +1464,29 @@ void LazyCallGraph::RefSCC::insertTrivialCallEdge(Node &SourceN,
   // Check that the RefSCC is still valid when we finish.
   auto ExitVerifier = make_scope_exit([this] { verify(); });
 
-  // Check that we aren't breaking some invariants of the SCC graph.
+#ifdef EXPENSIVE_CHECKS
+  // Check that we aren't breaking some invariants of the SCC graph. Note that
+  // this is quadratic in the number of edges in the call graph!
   SCC &SourceC = *G->lookupSCC(SourceN);
   SCC &TargetC = *G->lookupSCC(TargetN);
   if (&SourceC != &TargetC)
     assert(SourceC.isAncestorOf(TargetC) &&
            "Call edge is not trivial in the SCC graph!");
-#endif
+#endif // EXPENSIVE_CHECKS
+#endif // NDEBUG
+
   // First insert it into the source or find the existing edge.
-  auto InsertResult = SourceN.EdgeIndexMap.insert(
-      {&TargetN.getFunction(), SourceN.Edges.size()});
+  auto InsertResult =
+      SourceN->EdgeIndexMap.insert({&TargetN, SourceN->Edges.size()});
   if (!InsertResult.second) {
     // Already an edge, just update it.
-    Edge &E = SourceN.Edges[InsertResult.first->second];
+    Edge &E = SourceN->Edges[InsertResult.first->second];
     if (E.isCall())
       return; // Nothing to do!
     E.setKind(Edge::Call);
   } else {
     // Create the new edge.
-    SourceN.Edges.emplace_back(TargetN, Edge::Call);
+    SourceN->Edges.emplace_back(TargetN, Edge::Call);
   }
 
   // Now that we have the edge, handle the graph fallout.
@@ -1497,39 +1498,75 @@ void LazyCallGraph::RefSCC::insertTrivialRefEdge(Node &SourceN, Node &TargetN) {
   // Check that the RefSCC is still valid when we finish.
   auto ExitVerifier = make_scope_exit([this] { verify(); });
 
+#ifdef EXPENSIVE_CHECKS
   // Check that we aren't breaking some invariants of the RefSCC graph.
   RefSCC &SourceRC = *G->lookupRefSCC(SourceN);
   RefSCC &TargetRC = *G->lookupRefSCC(TargetN);
   if (&SourceRC != &TargetRC)
     assert(SourceRC.isAncestorOf(TargetRC) &&
            "Ref edge is not trivial in the RefSCC graph!");
-#endif
+#endif // EXPENSIVE_CHECKS
+#endif // NDEBUG
+
   // First insert it into the source or find the existing edge.
-  auto InsertResult = SourceN.EdgeIndexMap.insert(
-      {&TargetN.getFunction(), SourceN.Edges.size()});
+  auto InsertResult =
+      SourceN->EdgeIndexMap.insert({&TargetN, SourceN->Edges.size()});
   if (!InsertResult.second)
     // Already an edge, we're done.
     return;
 
   // Create the new edge.
-  SourceN.Edges.emplace_back(TargetN, Edge::Ref);
+  SourceN->Edges.emplace_back(TargetN, Edge::Ref);
 
   // Now that we have the edge, handle the graph fallout.
   handleTrivialEdgeInsertion(SourceN, TargetN);
 }
 
-void LazyCallGraph::insertEdge(Node &SourceN, Function &Target, Edge::Kind EK) {
-  assert(SCCMap.empty() && DFSStack.empty() &&
+void LazyCallGraph::RefSCC::replaceNodeFunction(Node &N, Function &NewF) {
+  // Remember the old function: it keys the NodeMap entry erased at the end.
+  Function &OldF = N.getFunction();
+
+#ifndef NDEBUG
+  // Check that the RefSCC is still valid when we finish.
+  auto ExitVerifier = make_scope_exit([this] { verify(); });
+
+  assert(G->lookupRefSCC(N) == this &&
+         "Cannot replace the function of a node outside this RefSCC.");
+  assert(G->NodeMap.find(&NewF) == G->NodeMap.end() &&
+         "Must not have already walked the new function!");
+
+  // It is important that this replacement not introduce graph changes so we
+  // insist that the caller has already removed every use of the original
+  // function and that all uses of the new function correspond to existing
+  // edges in the graph. The common and expected way to use this is when
+  // replacing the function itself in the IR without changing the call graph
+  // shape and just updating the analysis based on that.
+  assert(&OldF != &NewF && "Cannot replace a function with itself!");
+  assert(OldF.use_empty() &&
+         "Must have moved all uses from the old function to the new!");
+#endif
+
+  N.replaceFunction(NewF);
+
+  // Re-key the graph's function-to-node map: drop OldF, map NewF to N.
+  G->NodeMap.erase(&OldF);
+  G->NodeMap[&NewF] = &N;
+}
+
+void LazyCallGraph::insertEdge(Node &SourceN, Node &TargetN, Edge::Kind EK) {
+  assert(SCCMap.empty() &&
          "This method cannot be called after SCCs have been formed!");
 
-  return SourceN.insertEdgeInternal(Target, EK);
+  return SourceN->insertEdgeInternal(TargetN, EK);
 }
 
-void LazyCallGraph::removeEdge(Node &SourceN, Function &Target) {
-  assert(SCCMap.empty() && DFSStack.empty() &&
+void LazyCallGraph::removeEdge(Node &SourceN, Node &TargetN) {
+  assert(SCCMap.empty() &&
          "This method cannot be called after SCCs have been formed!");
 
-  return SourceN.removeEdgeInternal(Target);
+  bool Removed = SourceN->removeEdgeInternal(TargetN);
+  (void)Removed; // Only the assert below reads it; silences unused warnings.
+  assert(Removed && "Target not in the edge set for this caller?");
 }
 
 void LazyCallGraph::removeDeadFunction(Function &F) {
@@ -1538,19 +1575,6 @@ void LazyCallGraph::removeDeadFunction(Function &F) {
   assert(F.use_empty() &&
          "This routine should only be called on trivially dead functions!");
 
-  auto EII = EntryIndexMap.find(&F);
-  if (EII != EntryIndexMap.end()) {
-    EntryEdges[EII->second] = Edge();
-    EntryIndexMap.erase(EII);
-  }
-
-  // It's safe to just remove un-visited functions from the RefSCC entry list.
-  // FIXME: This is a linear operation which could become hot and benefit from
-  // an index map.
-  auto RENI = find(RefSCCEntryNodes, &F);
-  if (RENI != RefSCCEntryNodes.end())
-    RefSCCEntryNodes.erase(RENI);
-
   auto NI = NodeMap.find(&F);
   if (NI == NodeMap.end())
     // Not in the graph at all!
@@ -1559,22 +1583,16 @@ void LazyCallGraph::removeDeadFunction(Function &F) {
   Node &N = *NI->second;
   NodeMap.erase(NI);
 
-  if (SCCMap.empty() && DFSStack.empty()) {
-    // No SCC walk has begun, so removing this is fine and there is nothing
+  // Remove this from the entry edges if present.
+  EntryEdges.removeEdgeInternal(N);
+
+  if (SCCMap.empty()) {
+    // No SCCs have been formed, so removing this is fine and there is nothing
     // else necessary at this point but clearing out the node.
     N.clear();
     return;
   }
 
-  // Check that we aren't going to break the DFS walk.
-  assert(all_of(DFSStack,
-                [&N](const std::pair<Node *, edge_iterator> &Element) {
-                  return Element.first != &N;
-                }) &&
-         "Tried to remove a function currently in the DFS stack!");
-  assert(find(PendingRefSCCStack, &N) == PendingRefSCCStack.end() &&
-         "Tried to remove a function currently pending to add to a RefSCC!");
-
   // Cannot remove a function which has yet to be visited in the DFS walk, so
   // if we have a node at all then we must have an SCC and RefSCC.
   auto CI = SCCMap.find(&N);
@@ -1589,13 +1607,19 @@ void LazyCallGraph::removeDeadFunction(Function &F) {
   // Validate these properties first.
   assert(C.size() == 1 && "Dead functions must be in a singular SCC");
   assert(RC.size() == 1 && "Dead functions must be in a singular RefSCC");
-  assert(RC.Parents.empty() && "Cannot have parents of a dead RefSCC!");
+
+  // Clean up any remaining reference edges. Note that we walk an unordered set
+  // here but are just removing and so the order doesn't matter.
+  for (RefSCC &ParentRC : RC.parents())
+    for (SCC &ParentC : ParentRC)
+      for (Node &ParentN : ParentC)
+        if (ParentN)
+          ParentN->removeEdgeInternal(N);
 
   // Now remove this RefSCC from any parents sets and the leaf list.
-  for (Edge &E : N)
-    if (Node *TargetN = E.getNode())
-      if (RefSCC *TargetRC = lookupRefSCC(*TargetN))
-        TargetRC->Parents.erase(&RC);
+  for (Edge &E : *N)
+    if (RefSCC *TargetRC = lookupRefSCC(E.getNode()))
+      TargetRC->Parents.erase(&RC);
   // FIXME: This is a linear operation which could become hot and benefit from
   // an index map.
   auto LRI = find(LeafRefSCCs, &RC);
@@ -1628,15 +1652,14 @@ void LazyCallGraph::updateGraphPtrs() {
   {
     SmallVector<Node *, 16> Worklist;
     for (Edge &E : EntryEdges)
-      if (Node *EntryN = E.getNode())
-        Worklist.push_back(EntryN);
+      Worklist.push_back(&E.getNode());
 
     while (!Worklist.empty()) {
-      Node *N = Worklist.pop_back_val();
-      N->G = this;
-      for (Edge &E : N->Edges)
-        if (Node *TargetN = E.getNode())
-          Worklist.push_back(TargetN);
+      Node &N = *Worklist.pop_back_val();
+      N.G = this;
+      if (N)
+        for (Edge &E : *N)
+          Worklist.push_back(&E.getNode());
     }
   }
 
@@ -1653,34 +1676,18 @@ void LazyCallGraph::updateGraphPtrs() {
   }
 }
 
-/// Build the internal SCCs for a RefSCC from a sequence of nodes.
-///
-/// Appends the SCCs to the provided vector and updates the map with their
-/// indices. Both the vector and map must be empty when passed into this
-/// routine.
-void LazyCallGraph::buildSCCs(RefSCC &RC, node_stack_range Nodes) {
-  assert(RC.SCCs.empty() && "Already built SCCs!");
-  assert(RC.SCCIndices.empty() && "Already mapped SCC indices!");
+template <typename RootsT, typename GetBeginT, typename GetEndT,
+          typename GetNodeT, typename FormSCCCallbackT>
+void LazyCallGraph::buildGenericSCCs(RootsT &&Roots, GetBeginT &&GetBegin,
+                                     GetEndT &&GetEnd, GetNodeT &&GetNode,
+                                     FormSCCCallbackT &&FormSCC) {
+  typedef decltype(GetBegin(std::declval<Node &>())) EdgeItT;
 
-  for (Node *N : Nodes) {
-    assert(N->LowLink >= (*Nodes.begin())->LowLink &&
-           "We cannot have a low link in an SCC lower than its root on the "
-           "stack!");
-
-    // This node will go into the next RefSCC, clear out its DFS and low link
-    // as we scan.
-    N->DFSNumber = N->LowLink = 0;
-  }
-
-  // Each RefSCC contains a DAG of the call SCCs. To build these, we do
-  // a direct walk of the call edges using Tarjan's algorithm. We reuse the
-  // internal storage as we won't need it for the outer graph's DFS any longer.
-
-  SmallVector<std::pair<Node *, call_edge_iterator>, 16> DFSStack;
+  SmallVector<std::pair<Node *, EdgeItT>, 16> DFSStack;
   SmallVector<Node *, 16> PendingSCCStack;
 
   // Scan down the stack and DFS across the call edges.
-  for (Node *RootN : Nodes) {
+  for (Node *RootN : Roots) {
     assert(DFSStack.empty() &&
            "Cannot begin a new root with a non-empty DFS stack!");
     assert(PendingSCCStack.empty() &&
@@ -1696,25 +1703,23 @@ void LazyCallGraph::buildSCCs(RefSCC &RC, node_stack_range Nodes) {
     RootN->DFSNumber = RootN->LowLink = 1;
     int NextDFSNumber = 2;
 
-    DFSStack.push_back({RootN, RootN->call_begin()});
+    DFSStack.push_back({RootN, GetBegin(*RootN)});
     do {
       Node *N;
-      call_edge_iterator I;
+      EdgeItT I;
       std::tie(N, I) = DFSStack.pop_back_val();
-      auto E = N->call_end();
+      auto E = GetEnd(*N);
       while (I != E) {
-        Node &ChildN = *I->getNode();
+        Node &ChildN = GetNode(I);
         if (ChildN.DFSNumber == 0) {
           // We haven't yet visited this child, so descend, pushing the current
           // node onto the stack.
           DFSStack.push_back({N, I});
 
-          assert(!lookupSCC(ChildN) &&
-                 "Found a node with 0 DFS number but already in an SCC!");
           ChildN.DFSNumber = ChildN.LowLink = NextDFSNumber++;
           N = &ChildN;
-          I = N->call_begin();
-          E = N->call_end();
+          I = GetBegin(*N);
+          E = GetEnd(*N);
           continue;
         }
 
@@ -1756,20 +1761,93 @@ void LazyCallGraph::buildSCCs(RefSCC &RC, node_stack_range Nodes) {
           }));
       // Form a new SCC out of these nodes and then clear them off our pending
       // stack.
-      RC.SCCs.push_back(createSCC(RC, SCCNodes));
-      for (Node &N : *RC.SCCs.back()) {
-        N.DFSNumber = N.LowLink = -1;
-        SCCMap[&N] = RC.SCCs.back();
-      }
+      FormSCC(SCCNodes);
       PendingSCCStack.erase(SCCNodes.end().base(), PendingSCCStack.end());
     } while (!DFSStack.empty());
   }
+}
+
+/// Build the internal SCCs for a RefSCC from a sequence of nodes.
+///
+/// Appends the SCCs to the provided vector and updates the map with their
+/// indices. Both the vector and map must be empty when passed into this
+/// routine.
+void LazyCallGraph::buildSCCs(RefSCC &RC, node_stack_range Nodes) {
+  assert(RC.SCCs.empty() && "Already built SCCs!");
+  assert(RC.SCCIndices.empty() && "Already mapped SCC indices!");
+
+  for (Node *N : Nodes) {
+    assert(N->LowLink >= (*Nodes.begin())->LowLink &&
+           "We cannot have a low link in an SCC lower than its root on the "
+           "stack!");
+
+    // This node will go into the next RefSCC, clear out its DFS and low link
+    // as we scan.
+    N->DFSNumber = N->LowLink = 0;
+  }
+
+  // Each RefSCC contains a DAG of the call SCCs. To build these, we do
+  // a direct walk of the call edges using Tarjan's algorithm. We reuse the
+  // internal storage as we won't need it for the outer graph's DFS any longer.
+  buildGenericSCCs(
+      Nodes, [](Node &N) { return N->call_begin(); },
+      [](Node &N) { return N->call_end(); },
+      [](EdgeSequence::call_iterator I) -> Node & { return I->getNode(); },
+      [this, &RC](node_stack_range Nodes) {
+        RC.SCCs.push_back(createSCC(RC, Nodes));
+        for (Node &N : *RC.SCCs.back()) {
+          N.DFSNumber = N.LowLink = -1;
+          SCCMap[&N] = RC.SCCs.back();
+        }
+      });
 
   // Wire up the SCC indices.
   for (int i = 0, Size = RC.SCCs.size(); i < Size; ++i)
     RC.SCCIndices[RC.SCCs[i]] = i;
 }
 
+void LazyCallGraph::buildRefSCCs() {
+  if (EntryEdges.empty() || !PostOrderRefSCCs.empty())
+    // RefSCCs are either non-existent or already built!
+    return;
+
+  assert(RefSCCIndices.empty() && "Already mapped RefSCC indices!");
+
+  SmallVector<Node *, 16> Roots;
+  for (Edge &E : *this)
+    Roots.push_back(&E.getNode());
+
+  // The roots will be popped off a stack, so use reverse to get a less
+  // surprising order. This doesn't change any of the semantics anywhere.
+  std::reverse(Roots.begin(), Roots.end());
+
+  buildGenericSCCs(
+      Roots,
+      [](Node &N) {
+        // We need to populate each node as we begin to walk its edges.
+        N.populate();
+        return N->begin();
+      },
+      [](Node &N) { return N->end(); },
+      [](EdgeSequence::iterator I) -> Node & { return I->getNode(); },
+      [this](node_stack_range Nodes) {
+        RefSCC *NewRC = createRefSCC(*this);
+        buildSCCs(*NewRC, Nodes);
+        connectRefSCC(*NewRC);
+
+        // Push the new node into the postorder list and remember its position
+        // in the index map.
+        bool Inserted =
+            RefSCCIndices.insert({NewRC, PostOrderRefSCCs.size()}).second;
+        (void)Inserted;
+        assert(Inserted && "Cannot already have this RefSCC in the index map!");
+        PostOrderRefSCCs.push_back(NewRC);
+#ifndef NDEBUG
+        NewRC->verify();
+#endif
+      });
+}
+
 // FIXME: We should move callers of this to embed the parent linking and leaf
 // tracking into their DFS in order to remove a full walk of all edges.
 void LazyCallGraph::connectRefSCC(RefSCC &RC) {
@@ -1779,10 +1857,8 @@ void LazyCallGraph::connectRefSCC(RefSCC &RC) {
   bool IsLeaf = true;
   for (SCC &C : RC)
     for (Node &N : C)
-      for (Edge &E : N) {
-        assert(E.getNode() &&
-               "Cannot have a missing node in a visited part of the graph!");
-        RefSCC &ChildRC = *lookupRefSCC(*E.getNode());
+      for (Edge &E : *N) {
+        RefSCC &ChildRC = *lookupRefSCC(E.getNode());
         if (&ChildRC == &RC)
           continue;
         ChildRC.Parents.insert(&RC);
@@ -1794,113 +1870,13 @@ void LazyCallGraph::connectRefSCC(RefSCC &RC) {
     LeafRefSCCs.push_back(&RC);
 }
 
-bool LazyCallGraph::buildNextRefSCCInPostOrder() {
-  if (DFSStack.empty()) {
-    Node *N;
-    do {
-      // If we've handled all candidate entry nodes to the SCC forest, we're
-      // done.
-      if (RefSCCEntryNodes.empty())
-        return false;
-
-      N = &get(*RefSCCEntryNodes.pop_back_val());
-    } while (N->DFSNumber != 0);
-
-    // Found a new root, begin the DFS here.
-    N->LowLink = N->DFSNumber = 1;
-    NextDFSNumber = 2;
-    DFSStack.push_back({N, N->begin()});
-  }
-
-  for (;;) {
-    Node *N;
-    edge_iterator I;
-    std::tie(N, I) = DFSStack.pop_back_val();
-
-    assert(N->DFSNumber > 0 && "We should always assign a DFS number "
-                               "before placing a node onto the stack.");
-
-    auto E = N->end();
-    while (I != E) {
-      Node &ChildN = I->getNode(*this);
-      if (ChildN.DFSNumber == 0) {
-        // We haven't yet visited this child, so descend, pushing the current
-        // node onto the stack.
-        DFSStack.push_back({N, N->begin()});
-
-        assert(!SCCMap.count(&ChildN) &&
-               "Found a node with 0 DFS number but already in an SCC!");
-        ChildN.LowLink = ChildN.DFSNumber = NextDFSNumber++;
-        N = &ChildN;
-        I = N->begin();
-        E = N->end();
-        continue;
-      }
-
-      // If the child has already been added to some child component, it
-      // couldn't impact the low-link of this parent because it isn't
-      // connected, and thus its low-link isn't relevant so skip it.
-      if (ChildN.DFSNumber == -1) {
-        ++I;
-        continue;
-      }
-
-      // Track the lowest linked child as the lowest link for this node.
-      assert(ChildN.LowLink > 0 && "Must have a positive low-link number!");
-      if (ChildN.LowLink < N->LowLink)
-        N->LowLink = ChildN.LowLink;
-
-      // Move to the next edge.
-      ++I;
-    }
-
-    // We've finished processing N and its descendents, put it on our pending
-    // SCC stack to eventually get merged into an SCC of nodes.
-    PendingRefSCCStack.push_back(N);
-
-    // If this node is linked to some lower entry, continue walking up the
-    // stack.
-    if (N->LowLink != N->DFSNumber) {
-      assert(!DFSStack.empty() &&
-             "We never found a viable root for an SCC to pop off!");
-      continue;
-    }
-
-    // Otherwise, form a new RefSCC from the top of the pending node stack.
-    int RootDFSNumber = N->DFSNumber;
-    // Find the range of the node stack by walking down until we pass the
-    // root DFS number.
-    auto RefSCCNodes = node_stack_range(
-        PendingRefSCCStack.rbegin(),
-        find_if(reverse(PendingRefSCCStack), [RootDFSNumber](const Node *N) {
-          return N->DFSNumber < RootDFSNumber;
-        }));
-    // Form a new RefSCC out of these nodes and then clear them off our pending
-    // stack.
-    RefSCC *NewRC = createRefSCC(*this);
-    buildSCCs(*NewRC, RefSCCNodes);
-    connectRefSCC(*NewRC);
-    PendingRefSCCStack.erase(RefSCCNodes.end().base(),
-                             PendingRefSCCStack.end());
-
-    // Push the new node into the postorder list and return true indicating we
-    // successfully grew the postorder sequence by one.
-    bool Inserted =
-        RefSCCIndices.insert({NewRC, PostOrderRefSCCs.size()}).second;
-    (void)Inserted;
-    assert(Inserted && "Cannot already have this RefSCC in the index map!");
-    PostOrderRefSCCs.push_back(NewRC);
-    return true;
-  }
-}
-
 AnalysisKey LazyCallGraphAnalysis::Key;
 
 LazyCallGraphPrinterPass::LazyCallGraphPrinterPass(raw_ostream &OS) : OS(OS) {}
 
 static void printNode(raw_ostream &OS, LazyCallGraph::Node &N) {
   OS << "  Edges in function: " << N.getFunction().getName() << "\n";
-  for (const LazyCallGraph::Edge &E : N)
+  for (LazyCallGraph::Edge &E : N.populate())
     OS << "    " << (E.isCall() ? "call" : "ref ") << " -> "
        << E.getFunction().getName() << "\n";
 
@@ -1935,6 +1911,7 @@ PreservedAnalyses LazyCallGraphPrinterPass::run(Module &M,
   for (Function &F : M)
     printNode(OS, G.get(F));
 
+  G.buildRefSCCs();
   for (LazyCallGraph::RefSCC &C : G.postorder_ref_sccs())
     printRefSCC(OS, C);
 
@@ -1947,7 +1924,7 @@ LazyCallGraphDOTPrinterPass::LazyCallGraphDOTPrinterPass(raw_ostream &OS)
 static void printNodeDOT(raw_ostream &OS, LazyCallGraph::Node &N) {
   std::string Name = "\"" + DOT::EscapeString(N.getFunction().getName()) + "\"";
 
-  for (const LazyCallGraph::Edge &E : N) {
+  for (LazyCallGraph::Edge &E : N.populate()) {
     OS << "  " << Name << " -> \""
        << DOT::EscapeString(E.getFunction().getName()) << "\"";
     if (!E.isCall()) // It is a ref edge.
diff --git a/lib/Analysis/LazyValueInfo.cpp b/lib/Analysis/LazyValueInfo.cpp
index dcf0ca20c58c6e92f6e67c762a547c1302ecc836..ad01f7f2f2158e07b9dd12cf314f1de3b38e94f0 100644
--- a/lib/Analysis/LazyValueInfo.cpp
+++ b/lib/Analysis/LazyValueInfo.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/AssemblyAnnotationWriter.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/Constants.h"
@@ -31,6 +32,7 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/ValueHandle.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/raw_ostream.h"
 #include <map>
 #include <stack>
@@ -39,6 +41,10 @@ using namespace PatternMatch;
 
 #define DEBUG_TYPE "lazy-value-info"
 
+// This is the number of worklist items we will process to try to discover an
+// answer for a given value.
+static const unsigned MaxProcessedPerValue = 500;
+
 char LazyValueInfoWrapperPass::ID = 0;
 INITIALIZE_PASS_BEGIN(LazyValueInfoWrapperPass, "lazy-value-info",
                 "Lazy Value Information Analysis", false, true)
@@ -358,6 +364,7 @@ namespace {
   /// This is the cache kept by LazyValueInfo which
   /// maintains information about queries across the clients' queries.
   class LazyValueInfoCache {
+    friend class LazyValueInfoAnnotatedWriter;
     /// This is all of the cached block information for exactly one Value*.
     /// The entries are sorted by the BasicBlock* of the
     /// entries, allowing us to do a lookup with a binary search.
@@ -369,20 +376,21 @@ namespace {
       SmallDenseMap<PoisoningVH<BasicBlock>, LVILatticeVal, 4> BlockVals;
     };
 
-    /// This is all of the cached information for all values,
-    /// mapped from Value* to key information.
-    DenseMap<Value *, std::unique_ptr<ValueCacheEntryTy>> ValueCache;
-
     /// This tracks, on a per-block basis, the set of values that are
     /// over-defined at the end of that block.
     typedef DenseMap<PoisoningVH<BasicBlock>, SmallPtrSet<Value *, 4>>
         OverDefinedCacheTy;
-    OverDefinedCacheTy OverDefinedCache;
-
     /// Keep track of all blocks that we have ever seen, so we
     /// don't spend time removing unused blocks from our caches.
     DenseSet<PoisoningVH<BasicBlock> > SeenBlocks;
 
+  protected:
+    /// This is all of the cached information for all values,
+    /// mapped from Value* to key information.
+    DenseMap<Value *, std::unique_ptr<ValueCacheEntryTy>> ValueCache;
+    OverDefinedCacheTy OverDefinedCache;
+
+
   public:
     void insertResult(Value *Val, BasicBlock *BB, const LVILatticeVal &Result) {
       SeenBlocks.insert(BB);
@@ -435,6 +443,7 @@ namespace {
       return BBI->second;
     }
 
+    void printCache(Function &F, raw_ostream &OS);
     /// clear - Empty the cache.
     void clear() {
       SeenBlocks.clear();
@@ -458,6 +467,61 @@ namespace {
   };
 }
 
+
+namespace {
+
+  /// An assembly annotator class to print LazyValueInfoCache information in
+  /// comments.
+  class LazyValueInfoAnnotatedWriter : public AssemblyAnnotationWriter {
+    const LazyValueInfoCache* LVICache;
+
+  public:
+    LazyValueInfoAnnotatedWriter(const LazyValueInfoCache *L) : LVICache(L) {}
+
+    virtual void emitBasicBlockStartAnnot(const BasicBlock *BB,
+                                          formatted_raw_ostream &OS) {
+      auto ODI = LVICache->OverDefinedCache.find(const_cast<BasicBlock*>(BB));
+      if (ODI == LVICache->OverDefinedCache.end())
+        return;
+      OS << "; OverDefined values for block are: \n";
+      for (auto *V : ODI->second)
+        OS << ";" << *V << "\n";
+
+      // Check whether lattice values are cached for the function's arguments.
+      auto *F = const_cast<Function *>(BB->getParent());
+      for (auto &Arg : F->args()) {
+        auto VI = LVICache->ValueCache.find_as(&Arg);
+        if (VI == LVICache->ValueCache.end())
+          continue;
+        auto BBI = VI->second->BlockVals.find(const_cast<BasicBlock *>(BB));
+        if (BBI != VI->second->BlockVals.end())
+          OS << "; CachedLatticeValue for: '" << *VI->first << "' is: '"
+             << BBI->second << "'\n";
+      }
+    }
+
+    virtual void emitInstructionAnnot(const Instruction *I,
+                                      formatted_raw_ostream &OS) {
+
+      auto VI = LVICache->ValueCache.find_as(const_cast<Instruction *>(I));
+      if (VI == LVICache->ValueCache.end())
+        return;
+      OS << "; CachedLatticeValues for: '" << *VI->first << "'\n";
+      for (auto &BV : VI->second->BlockVals) {
+        OS << "; at beginning of BasicBlock: '";
+        BV.first->printAsOperand(OS, false);
+        OS << "' LatticeVal: '" << BV.second << "' \n";
+      }
+    }
+};
+}
+
+void LazyValueInfoCache::printCache(Function &F, raw_ostream &OS) {
+  LazyValueInfoAnnotatedWriter Writer(this);
+  F.print(OS, &Writer);
+
+}
+
 void LazyValueInfoCache::eraseValue(Value *V) {
   for (auto I = OverDefinedCache.begin(), E = OverDefinedCache.end(); I != E;) {
     // Copy and increment the iterator immediately so we can erase behind
@@ -563,7 +627,7 @@ namespace {
     /// This stack holds the state of the value solver during a query.
     /// It basically emulates the callstack of the naive
     /// recursive value lookup process.
-    std::stack<std::pair<BasicBlock*, Value*> > BlockValueStack;
+    SmallVector<std::pair<BasicBlock*, Value*>, 8> BlockValueStack;
 
     /// Keeps track of which block-value pairs are in BlockValueStack.
     DenseSet<std::pair<BasicBlock*, Value*> > BlockValueSet;
@@ -576,7 +640,7 @@ namespace {
 
       DEBUG(dbgs() << "PUSH: " << *BV.second << " in " << BV.first->getName()
                    << "\n");
-      BlockValueStack.push(BV);
+      BlockValueStack.push_back(BV);
       return true;
     }
 
@@ -629,6 +693,11 @@ namespace {
       TheCache.clear();
     }
 
+    /// Print the LazyValueInfoCache for function \p F to \p OS.
+    void printCache(Function &F, raw_ostream &OS) {
+       TheCache.printCache(F, OS);
+    }
+
     /// This is part of the update interface to inform the cache
     /// that a block has been deleted.
     void eraseBlock(BasicBlock *BB) {
@@ -646,24 +715,50 @@ namespace {
 } // end anonymous namespace
 
 void LazyValueInfoImpl::solve() {
+  SmallVector<std::pair<BasicBlock *, Value *>, 8> StartingStack(
+      BlockValueStack.begin(), BlockValueStack.end());
+
+  unsigned processedCount = 0;
   while (!BlockValueStack.empty()) {
-    std::pair<BasicBlock*, Value*> &e = BlockValueStack.top();
+    processedCount++;
+    // Abort if we have to process too many values to get a result for this one.
+    // Because of the design of the overdefined cache currently being per-block
+    // to avoid naming-related issues (i.e., it wants to try to give different
+    // results for the same name in different blocks), overdefined results don't
+    // get cached globally, which in turn means we will often try to rediscover
+    // the same overdefined result again and again.  Once something like
+    // PredicateInfo is used in LVI or CVP, we should be able to make the
+    // overdefined cache global, and remove this throttle.
+    if (processedCount > MaxProcessedPerValue) {
+      DEBUG(dbgs() << "Giving up on stack because we are getting too deep\n");
+      // Fill in the original values
+      while (!StartingStack.empty()) {
+        std::pair<BasicBlock *, Value *> &e = StartingStack.back();
+        TheCache.insertResult(e.second, e.first,
+                              LVILatticeVal::getOverdefined());
+        StartingStack.pop_back();
+      }
+      BlockValueSet.clear();
+      BlockValueStack.clear();
+      return;
+    }
+    std::pair<BasicBlock *, Value *> e = BlockValueStack.back();
     assert(BlockValueSet.count(e) && "Stack value should be in BlockValueSet!");
 
     if (solveBlockValue(e.second, e.first)) {
       // The work item was completely processed.
-      assert(BlockValueStack.top() == e && "Nothing should have been pushed!");
+      assert(BlockValueStack.back() == e && "Nothing should have been pushed!");
       assert(TheCache.hasCachedValueInfo(e.second, e.first) &&
              "Result should be in cache!");
 
       DEBUG(dbgs() << "POP " << *e.second << " in " << e.first->getName()
                    << " = " << TheCache.getCachedValueInfo(e.second, e.first) << "\n");
 
-      BlockValueStack.pop();
+      BlockValueStack.pop_back();
       BlockValueSet.erase(e);
     } else {
       // More work needs to be done before revisiting.
-      assert(BlockValueStack.top() != e && "Stack should have been pushed!");
+      assert(BlockValueStack.back() != e && "Stack should have been pushed!");
     }
   }
 }
@@ -839,13 +934,19 @@ bool LazyValueInfoImpl::solveBlockValueNonLocal(LVILatticeVal &BBLV,
   }
 
   // Loop over all of our predecessors, merging what we know from them into
-  // result.
-  bool EdgesMissing = false;
+  // result.  If we encounter an unexplored predecessor, we eagerly explore it
+  // in a depth first manner.  In practice, this has the effect of discovering
+  // paths we can't analyze eagerly without spending compile times analyzing
+  // other paths.  This heuristic benefits from the fact that predecessors are
+  // frequently arranged such that dominating ones come first and we quickly
+  // find a path to function entry.  TODO: We should consider explicitly
+  // canonicalizing to make this true rather than relying on this happy
+  // accident.  
   for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
     LVILatticeVal EdgeResult;
-    EdgesMissing |= !getEdgeValue(Val, *PI, BB, EdgeResult);
-    if (EdgesMissing)
-      continue;
+    if (!getEdgeValue(Val, *PI, BB, EdgeResult))
+      // Explore that input, then return here
+      return false;
 
     Result.mergeIn(EdgeResult, DL);
 
@@ -866,8 +967,6 @@ bool LazyValueInfoImpl::solveBlockValueNonLocal(LVILatticeVal &BBLV,
       return true;
     }
   }
-  if (EdgesMissing)
-    return false;
 
   // Return the merged value, which is more precise than 'overdefined'.
   assert(!Result.isOverdefined());
@@ -880,8 +979,8 @@ bool LazyValueInfoImpl::solveBlockValuePHINode(LVILatticeVal &BBLV,
   LVILatticeVal Result;  // Start Undefined.
 
   // Loop over all of our predecessors, merging what we know from them into
-  // result.
-  bool EdgesMissing = false;
+  // result.  See the comment about the chosen traversal order in
+  // solveBlockValueNonLocal; the same reasoning applies here.
   for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
     BasicBlock *PhiBB = PN->getIncomingBlock(i);
     Value *PhiVal = PN->getIncomingValue(i);
@@ -889,9 +988,9 @@ bool LazyValueInfoImpl::solveBlockValuePHINode(LVILatticeVal &BBLV,
     // Note that we can provide PN as the context value to getEdgeValue, even
     // though the results will be cached, because PN is the value being used as
     // the cache key in the caller.
-    EdgesMissing |= !getEdgeValue(PhiVal, PhiBB, BB, EdgeResult, PN);
-    if (EdgesMissing)
-      continue;
+    if (!getEdgeValue(PhiVal, PhiBB, BB, EdgeResult, PN))
+      // Explore that input, then return here
+      return false;
 
     Result.mergeIn(EdgeResult, DL);
 
@@ -905,8 +1004,6 @@ bool LazyValueInfoImpl::solveBlockValuePHINode(LVILatticeVal &BBLV,
       return true;
     }
   }
-  if (EdgesMissing)
-    return false;
 
   // Return the merged value, which is more precise than 'overdefined'.
   assert(!Result.isOverdefined() && "Possible PHI in entry block?");
@@ -1333,14 +1430,14 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
     unsigned BitWidth = Val->getType()->getIntegerBitWidth();
     ConstantRange EdgesVals(BitWidth, DefaultCase/*isFullSet*/);
 
-    for (SwitchInst::CaseIt i : SI->cases()) {
-      ConstantRange EdgeVal(i.getCaseValue()->getValue());
+    for (auto Case : SI->cases()) {
+      ConstantRange EdgeVal(Case.getCaseValue()->getValue());
       if (DefaultCase) {
         // It is possible that the default destination is the destination of
         // some cases. There is no need to perform difference for those cases.
-        if (i.getCaseSuccessor() != BBTo)
+        if (Case.getCaseSuccessor() != BBTo)
           EdgesVals = EdgesVals.difference(EdgeVal);
-      } else if (i.getCaseSuccessor() == BBTo)
+      } else if (Case.getCaseSuccessor() == BBTo)
         EdgesVals = EdgesVals.unionWith(EdgeVal);
     }
     Result = LVILatticeVal::getRange(std::move(EdgesVals));
@@ -1352,8 +1449,8 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
 /// \brief Compute the value of Val on the edge BBFrom -> BBTo or the value at
 /// the basic block if the edge does not constrain Val.
 bool LazyValueInfoImpl::getEdgeValue(Value *Val, BasicBlock *BBFrom,
-                                      BasicBlock *BBTo, LVILatticeVal &Result,
-                                      Instruction *CxtI) {
+                                     BasicBlock *BBTo, LVILatticeVal &Result,
+                                     Instruction *CxtI) {
   // If already a constant, there is nothing to compute.
   if (Constant *VC = dyn_cast<Constant>(Val)) {
     Result = LVILatticeVal::get(VC);
@@ -1522,7 +1619,7 @@ LazyValueInfo LazyValueAnalysis::run(Function &F, FunctionAnalysisManager &FAM)
   auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
   auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
 
-  return LazyValueInfo(&AC, &TLI, DT);
+  return LazyValueInfo(&AC, &F.getParent()->getDataLayout(), &TLI, DT);
 }
 
 /// Returns true if we can statically tell that this value will never be a
@@ -1792,3 +1889,40 @@ void LazyValueInfo::eraseBlock(BasicBlock *BB) {
     getImpl(PImpl, AC, &DL, DT).eraseBlock(BB);
   }
 }
+
+
+void LazyValueInfo::printCache(Function &F, raw_ostream &OS) {
+  if (PImpl) {
+    getImpl(PImpl, AC, DL, DT).printCache(F, OS);
+  }
+}
+
+namespace {
+// Printer class for LazyValueInfo results.
+class LazyValueInfoPrinter : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  LazyValueInfoPrinter() : FunctionPass(ID) {
+    initializeLazyValueInfoPrinterPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesAll();
+    AU.addRequired<LazyValueInfoWrapperPass>();
+  }
+
+  bool runOnFunction(Function &F) override {
+    dbgs() << "LVI for function '" << F.getName() << "':\n";
+    auto &LVI = getAnalysis<LazyValueInfoWrapperPass>().getLVI();
+    LVI.printCache(F, dbgs());
+    return false;
+  }
+};
+}
+
+char LazyValueInfoPrinter::ID = 0;
+INITIALIZE_PASS_BEGIN(LazyValueInfoPrinter, "print-lazy-value-info",
+                "Lazy Value Info Printer Pass", false, false)
+INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
+INITIALIZE_PASS_END(LazyValueInfoPrinter, "print-lazy-value-info",
+                "Lazy Value Info Printer Pass", false, false)
diff --git a/lib/Analysis/Loads.cpp b/lib/Analysis/Loads.cpp
index e46541e6538d5dfa630ef74da2d1a0ad5e25a143..96799a459bfc48a4726c650bd38d6fd922714880 100644
--- a/lib/Analysis/Loads.cpp
+++ b/lib/Analysis/Loads.cpp
@@ -312,21 +312,26 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load,
                                       BasicBlock *ScanBB,
                                       BasicBlock::iterator &ScanFrom,
                                       unsigned MaxInstsToScan,
-                                      AliasAnalysis *AA, bool *IsLoadCSE) {
-  if (MaxInstsToScan == 0)
-    MaxInstsToScan = ~0U;
-
-  Value *Ptr = Load->getPointerOperand();
-  Type *AccessTy = Load->getType();
-
-  // We can never remove a volatile load
-  if (Load->isVolatile())
-    return nullptr;
-
-  // Anything stronger than unordered is currently unimplemented.
+                                      AliasAnalysis *AA, bool *IsLoad,
+                                      unsigned *NumScanedInst) {
+  // Don't CSE a load that is volatile or anything stronger than unordered.
   if (!Load->isUnordered())
     return nullptr;
 
+  return FindAvailablePtrLoadStore(
+      Load->getPointerOperand(), Load->getType(), Load->isAtomic(), ScanBB,
+      ScanFrom, MaxInstsToScan, AA, IsLoad, NumScanedInst);
+}
+
+Value *llvm::FindAvailablePtrLoadStore(Value *Ptr, Type *AccessTy,
+                                       bool AtLeastAtomic, BasicBlock *ScanBB,
+                                       BasicBlock::iterator &ScanFrom,
+                                       unsigned MaxInstsToScan,
+                                       AliasAnalysis *AA, bool *IsLoadCSE,
+                                       unsigned *NumScanedInst) {
+  if (MaxInstsToScan == 0)
+    MaxInstsToScan = ~0U;
+
   const DataLayout &DL = ScanBB->getModule()->getDataLayout();
 
   // Try to get the store size for the type.
@@ -344,6 +349,9 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load,
     // Restore ScanFrom to expected value in case next test succeeds
     ScanFrom++;
 
+    if (NumScanedInst)
+      ++(*NumScanedInst);
+
     // Don't scan huge blocks.
     if (MaxInstsToScan-- == 0)
       return nullptr;
@@ -359,7 +367,7 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load,
 
         // We can value forward from an atomic to a non-atomic, but not the
         // other way around.
-        if (LI->isAtomic() < Load->isAtomic())
+        if (LI->isAtomic() < AtLeastAtomic)
           return nullptr;
 
         if (IsLoadCSE)
@@ -378,7 +386,7 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load,
 
         // We can value forward from an atomic to a non-atomic, but not the
         // other way around.
-        if (SI->isAtomic() < Load->isAtomic())
+        if (SI->isAtomic() < AtLeastAtomic)
           return nullptr;
 
         if (IsLoadCSE)
diff --git a/lib/Analysis/LoopAccessAnalysis.cpp b/lib/Analysis/LoopAccessAnalysis.cpp
index e8f52eda3c466df5d9331087cd59d263cf3fe74a..4ba12583ff8394506214c80d490bf233a098bb56 100644
--- a/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/lib/Analysis/LoopAccessAnalysis.cpp
@@ -135,21 +135,6 @@ bool VectorizerParams::isInterleaveForced() {
   return ::VectorizationInterleave.getNumOccurrences() > 0;
 }
 
-void LoopAccessReport::emitAnalysis(const LoopAccessReport &Message,
-                                    const Loop *TheLoop, const char *PassName,
-                                    OptimizationRemarkEmitter &ORE) {
-  DebugLoc DL = TheLoop->getStartLoc();
-  const Value *V = TheLoop->getHeader();
-  if (const Instruction *I = Message.getInstr()) {
-    // If there is no debug location attached to the instruction, revert back to
-    // using the loop's.
-    if (I->getDebugLoc())
-      DL = I->getDebugLoc();
-    V = I->getParent();
-  }
-  ORE.emitOptimizationRemarkAnalysis(PassName, DL, V, Message.str());
-}
-
 Value *llvm::stripIntegerCast(Value *V) {
   if (auto *CI = dyn_cast<CastInst>(V))
     if (CI->getOperand(0)->getType()->isIntegerTy())
@@ -172,11 +157,6 @@ const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
     // Strip casts.
     StrideVal = stripIntegerCast(StrideVal);
 
-    // Replace symbolic stride by one.
-    Value *One = ConstantInt::get(StrideVal->getType(), 1);
-    ValueToValueMap RewriteMap;
-    RewriteMap[StrideVal] = One;
-
     ScalarEvolution *SE = PSE.getSE();
     const auto *U = cast<SCEVUnknown>(SE->getSCEV(StrideVal));
     const auto *CT =
@@ -518,7 +498,7 @@ class AccessAnalysis {
 public:
   /// \brief Read or write access location.
   typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
-  typedef SmallPtrSet<MemAccessInfo, 8> MemAccessInfoSet;
+  typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
 
   AccessAnalysis(const DataLayout &Dl, AliasAnalysis *AA, LoopInfo *LI,
                  MemoryDepChecker::DepCandidates &DA,
@@ -570,7 +550,7 @@ public:
     DepChecker.clearDependences();
   }
 
-  MemAccessInfoSet &getDependenciesToCheck() { return CheckDeps; }
+  MemAccessInfoList &getDependenciesToCheck() { return CheckDeps; }
 
 private:
   typedef SetVector<MemAccessInfo> PtrAccessSet;
@@ -584,8 +564,8 @@ private:
 
   const DataLayout &DL;
 
-  /// Set of accesses that need a further dependence check.
-  MemAccessInfoSet CheckDeps;
+  /// List of accesses that need a further dependence check.
+  MemAccessInfoList CheckDeps;
 
   /// Set of pointers that are read only.
   SmallPtrSet<Value*, 16> ReadOnlyPtr;
@@ -842,7 +822,7 @@ void AccessAnalysis::processMemAccesses() {
           // there is no other write to the ptr - this is an optimization to
           // catch "a[i] = a[i] + " without having to do a dependence check).
           if ((IsWrite || IsReadOnlyPtr) && SetHasWrite) {
-            CheckDeps.insert(Access);
+            CheckDeps.push_back(Access);
             IsRTCheckAnalysisNeeded = true;
           }
 
@@ -1058,37 +1038,6 @@ static unsigned getAddressSpaceOperand(Value *I) {
   return -1;
 }
 
-/// Saves the memory accesses after sorting it into vector argument 'Sorted'.
-void llvm::sortMemAccesses(ArrayRef<Value *> VL, const DataLayout &DL,
-                         ScalarEvolution &SE,
-                         SmallVectorImpl<Value *> &Sorted) {
-  SmallVector<std::pair<int, Value *>, 4> OffValPairs;
-  for (auto *Val : VL) {
-    // Compute the constant offset from the base pointer of each memory accesses
-    // and insert into the vector of key,value pair which needs to be sorted.
-    Value *Ptr = getPointerOperand(Val);
-    unsigned AS = getAddressSpaceOperand(Val);
-    unsigned PtrBitWidth = DL.getPointerSizeInBits(AS);
-    Type *Ty = cast<PointerType>(Ptr->getType())->getElementType();
-    APInt Size(PtrBitWidth, DL.getTypeStoreSize(Ty));
-
-    // FIXME: Currently the offsets are assumed to be constant.However this not
-    // always true as offsets can be variables also and we would need to
-    // consider the difference of the variable offsets.
-    APInt Offset(PtrBitWidth, 0);
-    Ptr->stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
-    OffValPairs.push_back(std::make_pair(Offset.getSExtValue(), Val));
-  }
-  std::sort(OffValPairs.begin(), OffValPairs.end(),
-            [](const std::pair<int, Value *> &Left,
-               const std::pair<int, Value *> &Right) {
-              return Left.first < Right.first;
-            });
-
-  for (auto& it : OffValPairs)
-    Sorted.push_back(it.second);
-}
-
 /// Returns true if the memory operations \p A and \p B are consecutive.
 bool llvm::isConsecutiveAccess(Value *A, Value *B, const DataLayout &DL,
                                ScalarEvolution &SE, bool CheckType) {
@@ -1236,6 +1185,73 @@ bool MemoryDepChecker::couldPreventStoreLoadForward(uint64_t Distance,
   return false;
 }
 
+/// Given a non-constant (unknown) dependence distance \p Dist between two
+/// memory accesses that have the same stride, whose absolute value is given
+/// in \p Stride, and the same type size \p TypeByteSize, in a loop whose
+/// backedge-taken count is \p BackedgeTakenCount, check whether it is
+/// possible to prove statically that the dependence distance is larger
+/// than the range that the accesses will travel through the execution of
+/// the loop. If so, return true; false otherwise. This is useful, for
+/// example, in loops such as the following (PR31098):
+///     for (i = 0; i < D; ++i) {
+///                = out[i];
+///       out[i+D] =
+///     }
+static bool isSafeDependenceDistance(const DataLayout &DL, ScalarEvolution &SE,
+                                     const SCEV &BackedgeTakenCount,
+                                     const SCEV &Dist, uint64_t Stride,
+                                     uint64_t TypeByteSize) {
+
+  // If we can prove that
+  //      (**) |Dist| > BackedgeTakenCount * Step
+  // where Step is the absolute stride of the memory accesses in bytes, 
+  // then there is no dependence.
+  //
+  // Rationale:
+  // We basically want to check if the absolute distance (|Dist/Step|) 
+  // is >= the loop iteration count (or > BackedgeTakenCount). 
+  // This is equivalent to the Strong SIV Test (Practical Dependence Testing, 
+  // Section 4.2.1); Note, that for vectorization it is sufficient to prove 
+  // that the dependence distance is >= VF; This is checked elsewhere.
+  // But in some cases we can prune unknown dependence distances early, and 
+  // even before selecting the VF, and without a runtime test, by comparing 
+  // the distance against the loop iteration count. Since the vectorized code 
+  // will be executed only if LoopCount >= VF, proving distance >= LoopCount 
+  // also guarantees that distance >= VF.
+  //
+  const uint64_t ByteStride = Stride * TypeByteSize;
+  const SCEV *Step = SE.getConstant(BackedgeTakenCount.getType(), ByteStride);
+  const SCEV *Product = SE.getMulExpr(&BackedgeTakenCount, Step);
+
+  const SCEV *CastedDist = &Dist;
+  const SCEV *CastedProduct = Product;
+  uint64_t DistTypeSize = DL.getTypeAllocSize(Dist.getType());
+  uint64_t ProductTypeSize = DL.getTypeAllocSize(Product->getType());
+
+  // The dependence distance can be positive/negative, so we sign extend Dist; 
+  // The multiplication of the absolute stride in bytes and the 
+  // BackedgeTakenCount is non-negative, so we zero extend Product.
+  if (DistTypeSize > ProductTypeSize)
+    CastedProduct = SE.getZeroExtendExpr(Product, Dist.getType());
+  else
+    CastedDist = SE.getNoopOrSignExtend(&Dist, Product->getType());
+
+  // Is  Dist - (BackedgeTakenCount * Step) > 0 ?
+  // (If so, then we have proven (**) because |Dist| >= Dist)
+  const SCEV *Minus = SE.getMinusSCEV(CastedDist, CastedProduct);
+  if (SE.isKnownPositive(Minus))
+    return true;
+
+  // Second try: Is  -Dist - (BackedgeTakenCount * Step) > 0 ?
+  // (If so, then we have proven (**) because |Dist| >= -1*Dist)
+  const SCEV *NegDist = SE.getNegativeSCEV(CastedDist);
+  Minus = SE.getMinusSCEV(NegDist, CastedProduct);
+  if (SE.isKnownPositive(Minus))
+    return true;
+
+  return false;
+}
+
 /// \brief Check the dependence for two accesses with the same stride \p Stride.
 /// \p Distance is the positive distance and \p TypeByteSize is type size in
 /// bytes.
@@ -1323,21 +1339,26 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
     return Dependence::Unknown;
   }
 
+  Type *ATy = APtr->getType()->getPointerElementType();
+  Type *BTy = BPtr->getType()->getPointerElementType();
+  auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
+  uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
+  uint64_t Stride = std::abs(StrideAPtr);
   const SCEVConstant *C = dyn_cast<SCEVConstant>(Dist);
   if (!C) {
+    if (TypeByteSize == DL.getTypeAllocSize(BTy) &&
+        isSafeDependenceDistance(DL, *(PSE.getSE()),
+                                 *(PSE.getBackedgeTakenCount()), *Dist, Stride,
+                                 TypeByteSize))
+      return Dependence::NoDep;
+
     DEBUG(dbgs() << "LAA: Dependence because of non-constant distance\n");
     ShouldRetryWithRuntimeCheck = true;
     return Dependence::Unknown;
   }
 
-  Type *ATy = APtr->getType()->getPointerElementType();
-  Type *BTy = BPtr->getType()->getPointerElementType();
-  auto &DL = InnermostLoop->getHeader()->getModule()->getDataLayout();
-  uint64_t TypeByteSize = DL.getTypeAllocSize(ATy);
-
   const APInt &Val = C->getAPInt();
   int64_t Distance = Val.getSExtValue();
-  uint64_t Stride = std::abs(StrideAPtr);
 
   // Attempt to prove strided accesses independent.
   if (std::abs(Distance) > 0 && Stride > 1 && ATy == BTy &&
@@ -1458,12 +1479,14 @@ MemoryDepChecker::isDependent(const MemAccessInfo &A, unsigned AIdx,
 }
 
 bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
-                                   MemAccessInfoSet &CheckDeps,
+                                   MemAccessInfoList &CheckDeps,
                                    const ValueToValueMap &Strides) {
 
   MaxSafeDepDistBytes = -1;
-  while (!CheckDeps.empty()) {
-    MemAccessInfo CurAccess = *CheckDeps.begin();
+  SmallPtrSet<MemAccessInfo, 8> Visited;
+  for (MemAccessInfo CurAccess : CheckDeps) {
+    if (Visited.count(CurAccess))
+      continue;
 
     // Get the relevant memory access set.
     EquivalenceClasses<MemAccessInfo>::iterator I =
@@ -1477,7 +1500,7 @@ bool MemoryDepChecker::areDepsSafe(DepCandidates &AccessSets,
 
     // Check every access pair.
     while (AI != AE) {
-      CheckDeps.erase(*AI);
+      Visited.insert(*AI);
       EquivalenceClasses<MemAccessInfo>::member_iterator OI = std::next(AI);
       while (OI != AE) {
         // Check every accessing instruction pair in program order.
@@ -1916,7 +1939,10 @@ expandBounds(const RuntimePointerChecking::CheckingPtrGroup *CG, Loop *TheLoop,
     Value *NewPtr = (Inst && TheLoop->contains(Inst))
                         ? Exp.expandCodeFor(Sc, PtrArithTy, Loc)
                         : Ptr;
-    return {NewPtr, NewPtr};
+    // We must return a half-open range, which means incrementing Sc.
+    const SCEV *ScPlusOne = SE->getAddExpr(Sc, SE->getOne(PtrArithTy));
+    Value *NewPtrPlusOne = Exp.expandCodeFor(ScPlusOne, PtrArithTy, Loc);
+    return {NewPtr, NewPtrPlusOne};
   } else {
     Value *Start = nullptr, *End = nullptr;
     DEBUG(dbgs() << "LAA: Adding RT check for range:\n");
diff --git a/lib/Analysis/LoopAnalysisManager.cpp b/lib/Analysis/LoopAnalysisManager.cpp
index 6ca9c273771caac43754775a9a399aa21af85c1f..e4a0f90b2f7102543abc587554edce0c50d56fc6 100644
--- a/lib/Analysis/LoopAnalysisManager.cpp
+++ b/lib/Analysis/LoopAnalysisManager.cpp
@@ -23,7 +23,8 @@ namespace llvm {
 template class AllAnalysesOn<Loop>;
 template class AnalysisManager<Loop, LoopStandardAnalysisResults &>;
 template class InnerAnalysisManagerProxy<LoopAnalysisManager, Function>;
-template class OuterAnalysisManagerProxy<FunctionAnalysisManager, Loop>;
+template class OuterAnalysisManagerProxy<FunctionAnalysisManager, Loop,
+                                         LoopStandardAnalysisResults &>;
 
 bool LoopAnalysisManagerFunctionProxy::Result::invalidate(
     Function &F, const PreservedAnalyses &PA,
diff --git a/lib/Analysis/LoopPass.cpp b/lib/Analysis/LoopPass.cpp
index 3f4a07942154c853c5a03b0fb8a8998292914f79..0b5f6266e3737a5aa9d1497b1caaae7e92b2151d 100644
--- a/lib/Analysis/LoopPass.cpp
+++ b/lib/Analysis/LoopPass.cpp
@@ -54,6 +54,8 @@ public:
     }
     return false;
   }
+
+  StringRef getPassName() const override { return "Print Loop IR"; }
 };
 
 char PrintLoopPassWrapper::ID = 0;
diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp
index e9d27b5e367b3652a3095ced7150d0f9ca9253bc..f99d3b3fbda324da88029db521dacc41c9c2ffc7 100644
--- a/lib/Analysis/MemoryBuiltins.cpp
+++ b/lib/Analysis/MemoryBuiltins.cpp
@@ -183,7 +183,7 @@ static Optional<AllocFnsTy> getAllocationSize(const Value *V,
 
 static bool hasNoAliasAttr(const Value *V, bool LookThroughBitCast) {
   ImmutableCallSite CS(LookThroughBitCast ? V->stripPointerCasts() : V);
-  return CS && CS.paramHasAttr(AttributeSet::ReturnIndex, Attribute::NoAlias);
+  return CS && CS.paramHasAttr(AttributeList::ReturnIndex, Attribute::NoAlias);
 }
 
 
@@ -394,10 +394,8 @@ static APInt getSizeWithOverflow(const SizeOffsetType &Data) {
 /// If RoundToAlign is true, then Size is rounded up to the aligment of allocas,
 /// byval arguments, and global variables.
 bool llvm::getObjectSize(const Value *Ptr, uint64_t &Size, const DataLayout &DL,
-                         const TargetLibraryInfo *TLI, bool RoundToAlign,
-                         llvm::ObjSizeMode Mode) {
-  ObjectSizeOffsetVisitor Visitor(DL, TLI, Ptr->getContext(),
-                                  RoundToAlign, Mode);
+                         const TargetLibraryInfo *TLI, ObjectSizeOpts Opts) {
+  ObjectSizeOffsetVisitor Visitor(DL, TLI, Ptr->getContext(), Opts);
   SizeOffsetType Data = Visitor.compute(const_cast<Value*>(Ptr));
   if (!Visitor.bothKnown(Data))
     return false;
@@ -414,19 +412,23 @@ ConstantInt *llvm::lowerObjectSizeCall(IntrinsicInst *ObjectSize,
          "ObjectSize must be a call to llvm.objectsize!");
 
   bool MaxVal = cast<ConstantInt>(ObjectSize->getArgOperand(1))->isZero();
-  ObjSizeMode Mode;
+  ObjectSizeOpts EvalOptions;
   // Unless we have to fold this to something, try to be as accurate as
   // possible.
   if (MustSucceed)
-    Mode = MaxVal ? ObjSizeMode::Max : ObjSizeMode::Min;
+    EvalOptions.EvalMode =
+        MaxVal ? ObjectSizeOpts::Mode::Max : ObjectSizeOpts::Mode::Min;
   else
-    Mode = ObjSizeMode::Exact;
+    EvalOptions.EvalMode = ObjectSizeOpts::Mode::Exact;
+
+  EvalOptions.NullIsUnknownSize =
+      cast<ConstantInt>(ObjectSize->getArgOperand(2))->isOne();
 
   // FIXME: Does it make sense to just return a failure value if the size won't
   // fit in the output and `!MustSucceed`?
   uint64_t Size;
   auto *ResultType = cast<IntegerType>(ObjectSize->getType());
-  if (getObjectSize(ObjectSize->getArgOperand(0), Size, DL, TLI, false, Mode) &&
+  if (getObjectSize(ObjectSize->getArgOperand(0), Size, DL, TLI, EvalOptions) &&
       isUIntN(ResultType->getBitWidth(), Size))
     return ConstantInt::get(ResultType, Size);
 
@@ -443,7 +445,7 @@ STATISTIC(ObjectVisitorLoad,
 
 
 APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Align) {
-  if (RoundToAlign && Align)
+  if (Options.RoundToAlign && Align)
     return APInt(IntTyBits, alignTo(Size.getZExtValue(), Align));
   return Size;
 }
@@ -451,9 +453,8 @@ APInt ObjectSizeOffsetVisitor::align(APInt Size, uint64_t Align) {
 ObjectSizeOffsetVisitor::ObjectSizeOffsetVisitor(const DataLayout &DL,
                                                  const TargetLibraryInfo *TLI,
                                                  LLVMContext &Context,
-                                                 bool RoundToAlign,
-                                                 ObjSizeMode Mode)
-    : DL(DL), TLI(TLI), RoundToAlign(RoundToAlign), Mode(Mode) {
+                                                 ObjectSizeOpts Options)
+    : DL(DL), TLI(TLI), Options(Options) {
   // Pointer size must be rechecked for each object visited since it could have
   // a different address space.
 }
@@ -596,7 +597,9 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitCallSite(CallSite CS) {
 }
 
 SizeOffsetType
-ObjectSizeOffsetVisitor::visitConstantPointerNull(ConstantPointerNull&) {
+ObjectSizeOffsetVisitor::visitConstantPointerNull(ConstantPointerNull& CPN) {
+  if (Options.NullIsUnknownSize && CPN.getType()->getAddressSpace() == 0)
+    return unknown();
   return std::make_pair(Zero, Zero);
 }
 
@@ -663,12 +666,12 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitSelectInst(SelectInst &I) {
     if (TrueResult == FalseResult) {
       return TrueSide;
     }
-    if (Mode == ObjSizeMode::Min) {
+    if (Options.EvalMode == ObjectSizeOpts::Mode::Min) {
       if (TrueResult.slt(FalseResult))
         return TrueSide;
       return FalseSide;
     }
-    if (Mode == ObjSizeMode::Max) {
+    if (Options.EvalMode == ObjectSizeOpts::Mode::Max) {
       if (TrueResult.sgt(FalseResult))
         return TrueSide;
       return FalseSide;
@@ -719,7 +722,10 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute(Value *V) {
 }
 
 SizeOffsetEvalType ObjectSizeOffsetEvaluator::compute_(Value *V) {
-  ObjectSizeOffsetVisitor Visitor(DL, TLI, Context, RoundToAlign);
+  ObjectSizeOpts ObjSizeOptions;
+  ObjSizeOptions.RoundToAlign = RoundToAlign;
+
+  ObjectSizeOffsetVisitor Visitor(DL, TLI, Context, ObjSizeOptions);
   SizeOffsetType Const = Visitor.compute(V);
   if (Visitor.bothKnown(Const))
     return std::make_pair(ConstantInt::get(Context, Const.first),
diff --git a/lib/Transforms/Utils/MemorySSA.cpp b/lib/Analysis/MemorySSA.cpp
similarity index 80%
rename from lib/Transforms/Utils/MemorySSA.cpp
rename to lib/Analysis/MemorySSA.cpp
index af4dc028b851a562c2021d1bc0bf9495e2b0d2fc..910170561abf6cc5d72eea27a124bc16b6f4d7f3 100644
--- a/lib/Transforms/Utils/MemorySSA.cpp
+++ b/lib/Analysis/MemorySSA.cpp
@@ -10,7 +10,7 @@
 // This file implements the MemorySSA class.
 //
 //===----------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/MemorySSA.h"
+#include "llvm/Analysis/MemorySSA.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/DepthFirstIterator.h"
@@ -44,10 +44,6 @@
 
 #define DEBUG_TYPE "memoryssa"
 using namespace llvm;
-STATISTIC(NumClobberCacheLookups, "Number of Memory SSA version cache lookups");
-STATISTIC(NumClobberCacheHits, "Number of Memory SSA version cache hits");
-STATISTIC(NumClobberCacheInserts, "Number of MemorySSA version cache inserts");
-
 INITIALIZE_PASS_BEGIN(MemorySSAWrapperPass, "memoryssa", "Memory SSA", false,
                       true)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
@@ -218,12 +214,16 @@ static bool instructionClobbersQuery(MemoryDef *MD,
                                      AliasAnalysis &AA) {
   Instruction *DefInst = MD->getMemoryInst();
   assert(DefInst && "Defining instruction not actually an instruction");
+  ImmutableCallSite UseCS(UseInst);
 
   if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(DefInst)) {
     // These intrinsics will show up as affecting memory, but they are just
     // markers.
     switch (II->getIntrinsicID()) {
     case Intrinsic::lifetime_start:
+      if (UseCS)
+        return false;
+      return AA.isMustAlias(MemoryLocation(II->getArgOperand(1)), UseLoc);
     case Intrinsic::lifetime_end:
     case Intrinsic::invariant_start:
     case Intrinsic::invariant_end:
@@ -234,7 +234,6 @@ static bool instructionClobbersQuery(MemoryDef *MD,
     }
   }
 
-  ImmutableCallSite UseCS(UseInst);
   if (UseCS) {
     ModRefInfo I = AA.getModRefInfo(DefInst, UseCS);
     return I != MRI_NoModRef;
@@ -269,8 +268,8 @@ static bool instructionClobbersQuery(MemoryDef *MD, const MemoryUseOrDef *MU,
 }
 
 // Return true when MD may alias MU, return false otherwise.
-bool defClobbersUseOrDef(MemoryDef *MD, const MemoryUseOrDef *MU,
-                         AliasAnalysis &AA) {
+bool MemorySSAUtil::defClobbersUseOrDef(MemoryDef *MD, const MemoryUseOrDef *MU,
+                                        AliasAnalysis &AA) {
   return instructionClobbersQuery(MD, MU, MemoryLocOrCall(MU), AA);
 }
 }
@@ -302,7 +301,6 @@ static bool lifetimeEndsAt(MemoryDef *MD, const MemoryLocation &Loc,
   Instruction *Inst = MD->getMemoryInst();
   if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
     switch (II->getIntrinsicID()) {
-    case Intrinsic::lifetime_start:
     case Intrinsic::lifetime_end:
       return AA.isMustAlias(MemoryLocation(II->getArgOperand(1)), Loc);
     default:
@@ -320,95 +318,8 @@ static bool isUseTriviallyOptimizableToLiveOnEntry(AliasAnalysis &AA,
   // FIXME: We should handle invariant groups, as well. It's a bit harder,
   // because we need to pay close attention to invariant group barriers.
   return isa<LoadInst>(I) && (I->getMetadata(LLVMContext::MD_invariant_load) ||
-                              AA.pointsToConstantMemory(I));
-}
-
-/// Cache for our caching MemorySSA walker.
-class WalkerCache {
-  DenseMap<ConstMemoryAccessPair, MemoryAccess *> Accesses;
-  DenseMap<const MemoryAccess *, MemoryAccess *> Calls;
-
-public:
-  MemoryAccess *lookup(const MemoryAccess *MA, const MemoryLocation &Loc,
-                       bool IsCall) const {
-    ++NumClobberCacheLookups;
-    MemoryAccess *R = IsCall ? Calls.lookup(MA) : Accesses.lookup({MA, Loc});
-    if (R)
-      ++NumClobberCacheHits;
-    return R;
-  }
-
-  bool insert(const MemoryAccess *MA, MemoryAccess *To,
-              const MemoryLocation &Loc, bool IsCall) {
-    // This is fine for Phis, since there are times where we can't optimize
-    // them.  Making a def its own clobber is never correct, though.
-    assert((MA != To || isa<MemoryPhi>(MA)) &&
-           "Something can't clobber itself!");
-
-    ++NumClobberCacheInserts;
-    bool Inserted;
-    if (IsCall)
-      Inserted = Calls.insert({MA, To}).second;
-    else
-      Inserted = Accesses.insert({{MA, Loc}, To}).second;
-
-    return Inserted;
-  }
-
-  bool remove(const MemoryAccess *MA, const MemoryLocation &Loc, bool IsCall) {
-    return IsCall ? Calls.erase(MA) : Accesses.erase({MA, Loc});
-  }
-
-  void clear() {
-    Accesses.clear();
-    Calls.clear();
-  }
-
-  bool contains(const MemoryAccess *MA) const {
-    for (auto &P : Accesses)
-      if (P.first.first == MA || P.second == MA)
-        return true;
-    for (auto &P : Calls)
-      if (P.first == MA || P.second == MA)
-        return true;
-    return false;
-  }
-};
-
-/// Walks the defining uses of MemoryDefs. Stops after we hit something that has
-/// no defining use (e.g. a MemoryPhi or liveOnEntry). Note that, when comparing
-/// against a null def_chain_iterator, this will compare equal only after
-/// walking said Phi/liveOnEntry.
-struct def_chain_iterator
-    : public iterator_facade_base<def_chain_iterator, std::forward_iterator_tag,
-                                  MemoryAccess *> {
-  def_chain_iterator() : MA(nullptr) {}
-  def_chain_iterator(MemoryAccess *MA) : MA(MA) {}
-
-  MemoryAccess *operator*() const { return MA; }
-
-  def_chain_iterator &operator++() {
-    // N.B. liveOnEntry has a null defining access.
-    if (auto *MUD = dyn_cast<MemoryUseOrDef>(MA))
-      MA = MUD->getDefiningAccess();
-    else
-      MA = nullptr;
-    return *this;
-  }
-
-  bool operator==(const def_chain_iterator &O) const { return MA == O.MA; }
-
-private:
-  MemoryAccess *MA;
-};
-
-static iterator_range<def_chain_iterator>
-def_chain(MemoryAccess *MA, MemoryAccess *UpTo = nullptr) {
-#ifdef EXPENSIVE_CHECKS
-  assert((!UpTo || find(def_chain(MA), UpTo) != def_chain_iterator()) &&
-         "UpTo isn't in the def chain!");
-#endif
-  return make_range(def_chain_iterator(MA), def_chain_iterator(UpTo));
+                              AA.pointsToConstantMemory(cast<LoadInst>(I)->
+                                                          getPointerOperand()));
 }
 
 /// Verifies that `Start` is clobbered by `ClobberAt`, and that nothing
@@ -512,91 +423,24 @@ class ClobberWalker {
   const MemorySSA &MSSA;
   AliasAnalysis &AA;
   DominatorTree &DT;
-  WalkerCache &WC;
   UpwardsMemoryQuery *Query;
-  bool UseCache;
 
   // Phi optimization bookkeeping
   SmallVector<DefPath, 32> Paths;
   DenseSet<ConstMemoryAccessPair> VisitedPhis;
-  DenseMap<const BasicBlock *, MemoryAccess *> WalkTargetCache;
-
-  void setUseCache(bool Use) { UseCache = Use; }
-  bool shouldIgnoreCache() const {
-    // UseCache will only be false when we're debugging, or when expensive
-    // checks are enabled. In either case, we don't care deeply about speed.
-    return LLVM_UNLIKELY(!UseCache);
-  }
-
-  void addCacheEntry(const MemoryAccess *What, MemoryAccess *To,
-                     const MemoryLocation &Loc) const {
-// EXPENSIVE_CHECKS because most of these queries are redundant.
-#ifdef EXPENSIVE_CHECKS
-    assert(MSSA.dominates(To, What));
-#endif
-    if (shouldIgnoreCache())
-      return;
-    WC.insert(What, To, Loc, Query->IsCall);
-  }
-
-  MemoryAccess *lookupCache(const MemoryAccess *MA, const MemoryLocation &Loc) {
-    return shouldIgnoreCache() ? nullptr : WC.lookup(MA, Loc, Query->IsCall);
-  }
-
-  void cacheDefPath(const DefPath &DN, MemoryAccess *Target) const {
-    if (shouldIgnoreCache())
-      return;
-
-    for (MemoryAccess *MA : def_chain(DN.First, DN.Last))
-      addCacheEntry(MA, Target, DN.Loc);
-
-    // DefPaths only express the path we walked. So, DN.Last could either be a
-    // thing we want to cache, or not.
-    if (DN.Last != Target)
-      addCacheEntry(DN.Last, Target, DN.Loc);
-  }
 
   /// Find the nearest def or phi that `From` can legally be optimized to.
-  ///
-  /// FIXME: Deduplicate this with MSSA::findDominatingDef. Ideally, MSSA should
-  /// keep track of this information for us, and allow us O(1) lookups of this
-  /// info.
-  MemoryAccess *getWalkTarget(const MemoryPhi *From) {
+  const MemoryAccess *getWalkTarget(const MemoryPhi *From) const {
     assert(From->getNumOperands() && "Phi with no operands?");
 
     BasicBlock *BB = From->getBlock();
-    auto At = WalkTargetCache.find(BB);
-    if (At != WalkTargetCache.end())
-      return At->second;
-
-    SmallVector<const BasicBlock *, 8> ToCache;
-    ToCache.push_back(BB);
-
     MemoryAccess *Result = MSSA.getLiveOnEntryDef();
     DomTreeNode *Node = DT.getNode(BB);
     while ((Node = Node->getIDom())) {
-      auto At = WalkTargetCache.find(BB);
-      if (At != WalkTargetCache.end()) {
-        Result = At->second;
-        break;
-      }
-
-      auto *Accesses = MSSA.getBlockAccesses(Node->getBlock());
-      if (Accesses) {
-        auto Iter = find_if(reverse(*Accesses), [](const MemoryAccess &MA) {
-          return !isa<MemoryUse>(MA);
-        });
-        if (Iter != Accesses->rend()) {
-          Result = const_cast<MemoryAccess *>(&*Iter);
-          break;
-        }
-      }
-
-      ToCache.push_back(Node->getBlock());
+      auto *Defs = MSSA.getBlockDefs(Node->getBlock());
+      if (Defs)
+        return &*Defs->rbegin();
     }
-
-    for (const BasicBlock *BB : ToCache)
-      WalkTargetCache.insert({BB, Result});
     return Result;
   }
 
@@ -606,7 +450,6 @@ class ClobberWalker {
     /// both.
     MemoryAccess *Result;
     bool IsKnownClobber;
-    bool FromCache;
   };
 
   /// Walk to the next Phi or Clobber in the def chain starting at Desc.Last.
@@ -614,29 +457,25 @@ class ClobberWalker {
   /// StopAt.
   ///
   /// This does not test for whether StopAt is a clobber
-  UpwardsWalkResult walkToPhiOrClobber(DefPath &Desc,
-                                       MemoryAccess *StopAt = nullptr) {
+  UpwardsWalkResult
+  walkToPhiOrClobber(DefPath &Desc,
+                     const MemoryAccess *StopAt = nullptr) const {
     assert(!isa<MemoryUse>(Desc.Last) && "Uses don't exist in my world");
 
     for (MemoryAccess *Current : def_chain(Desc.Last)) {
       Desc.Last = Current;
       if (Current == StopAt)
-        return {Current, false, false};
+        return {Current, false};
 
       if (auto *MD = dyn_cast<MemoryDef>(Current))
         if (MSSA.isLiveOnEntryDef(MD) ||
             instructionClobbersQuery(MD, Desc.Loc, Query->Inst, AA))
-          return {MD, true, false};
-
-      // Cache checks must be done last, because if Current is a clobber, the
-      // cache will contain the clobber for Current.
-      if (MemoryAccess *MA = lookupCache(Current, Desc.Loc))
-        return {MA, true, true};
+          return {MD, true};
     }
 
     assert(isa<MemoryPhi>(Desc.Last) &&
            "Ended at a non-clobber that's not a phi?");
-    return {Desc.Last, false, false};
+    return {Desc.Last, false};
   }
 
   void addSearches(MemoryPhi *Phi, SmallVectorImpl<ListIndex> &PausedSearches,
@@ -666,7 +505,7 @@ class ClobberWalker {
   /// If this returns None, NewPaused is a vector of searches that terminated
   /// at StopWhere. Otherwise, NewPaused is left in an unspecified state.
   Optional<TerminatedPath>
-  getBlockingAccess(MemoryAccess *StopWhere,
+  getBlockingAccess(const MemoryAccess *StopWhere,
                     SmallVectorImpl<ListIndex> &PausedSearches,
                     SmallVectorImpl<ListIndex> &NewPaused,
                     SmallVectorImpl<TerminatedPath> &Terminated) {
@@ -701,11 +540,11 @@ class ClobberWalker {
 
       UpwardsWalkResult Res = walkToPhiOrClobber(Node, /*StopAt=*/StopWhere);
       if (Res.IsKnownClobber) {
-        assert(Res.Result != StopWhere || Res.FromCache);
+        assert(Res.Result != StopWhere);
         // If this wasn't a cache hit, we hit a clobber when walking. That's a
         // failure.
         TerminatedPath Term{Res.Result, PathIndex};
-        if (!Res.FromCache || !MSSA.dominates(Res.Result, StopWhere))
+        if (!MSSA.dominates(Res.Result, StopWhere))
           return Term;
 
         // Otherwise, it's a valid thing to potentially optimize to.
@@ -830,7 +669,7 @@ class ClobberWalker {
       assert(!MSSA.isLiveOnEntryDef(Current) &&
              "liveOnEntry wasn't treated as a clobber?");
 
-      MemoryAccess *Target = getWalkTarget(Current);
+      const auto *Target = getWalkTarget(Current);
       // If a TerminatedPath doesn't dominate Target, then it wasn't a legal
       // optimization for the prior phi.
       assert(all_of(TerminatedPaths, [&](const TerminatedPath &P) {
@@ -842,8 +681,6 @@ class ClobberWalker {
       // For the moment, this is fine, since we do nothing with blocker info.
       if (Optional<TerminatedPath> Blocker = getBlockingAccess(
               Target, PausedSearches, NewPaused, TerminatedPaths)) {
-        // Cache our work on the blocking node, since we know that's correct.
-        cacheDefPath(Paths[Blocker->LastNode], Blocker->Clobber);
 
         // Find the node we started at. We can't search based on N->Last, since
         // we may have gone around a loop with a different MemoryLocation.
@@ -908,7 +745,7 @@ class ClobberWalker {
         // If we couldn't find the dominating phi/liveOnEntry in the above loop,
         // do it now.
         if (!DefChainEnd)
-          for (MemoryAccess *MA : def_chain(Target))
+          for (auto *MA : def_chain(const_cast<MemoryAccess *>(Target)))
             DefChainEnd = MA;
 
         // If any of the terminated paths don't dominate the phi we'll try to
@@ -946,35 +783,6 @@ class ClobberWalker {
     }
   }
 
-  /// Caches everything in an OptznResult.
-  void cacheOptResult(const OptznResult &R) {
-    if (R.OtherClobbers.empty()) {
-      // If we're not going to be caching OtherClobbers, don't bother with
-      // marking visited/etc.
-      for (const DefPath &N : const_def_path(R.PrimaryClobber.LastNode))
-        cacheDefPath(N, R.PrimaryClobber.Clobber);
-      return;
-    }
-
-    // PrimaryClobber is our answer. If we can cache anything back, we need to
-    // stop caching when we visit PrimaryClobber.
-    SmallBitVector Visited(Paths.size());
-    for (const DefPath &N : const_def_path(R.PrimaryClobber.LastNode)) {
-      Visited[defPathIndex(N)] = true;
-      cacheDefPath(N, R.PrimaryClobber.Clobber);
-    }
-
-    for (const TerminatedPath &P : R.OtherClobbers) {
-      for (const DefPath &N : const_def_path(P.LastNode)) {
-        ListIndex NIndex = defPathIndex(N);
-        if (Visited[NIndex])
-          break;
-        Visited[NIndex] = true;
-        cacheDefPath(N, P.Clobber);
-      }
-    }
-  }
-
   void verifyOptResult(const OptznResult &R) const {
     assert(all_of(R.OtherClobbers, [&](const TerminatedPath &P) {
       return MSSA.dominates(P.Clobber, R.PrimaryClobber.Clobber);
@@ -987,17 +795,14 @@ class ClobberWalker {
   }
 
 public:
-  ClobberWalker(const MemorySSA &MSSA, AliasAnalysis &AA, DominatorTree &DT,
-                WalkerCache &WC)
-      : MSSA(MSSA), AA(AA), DT(DT), WC(WC), UseCache(true) {}
+  ClobberWalker(const MemorySSA &MSSA, AliasAnalysis &AA, DominatorTree &DT)
+      : MSSA(MSSA), AA(AA), DT(DT) {}
 
-  void reset() { WalkTargetCache.clear(); }
+  void reset() {}
 
   /// Finds the nearest clobber for the given query, optimizing phis if
   /// possible.
-  MemoryAccess *findClobber(MemoryAccess *Start, UpwardsMemoryQuery &Q,
-                            bool UseWalkerCache = true) {
-    setUseCache(UseWalkerCache);
+  MemoryAccess *findClobber(MemoryAccess *Start, UpwardsMemoryQuery &Q) {
     Query = &Q;
 
     MemoryAccess *Current = Start;
@@ -1012,13 +817,11 @@ public:
     UpwardsWalkResult WalkResult = walkToPhiOrClobber(FirstDesc);
     MemoryAccess *Result;
     if (WalkResult.IsKnownClobber) {
-      cacheDefPath(FirstDesc, WalkResult.Result);
       Result = WalkResult.Result;
     } else {
       OptznResult OptRes = tryOptimizePhi(cast<MemoryPhi>(FirstDesc.Last),
                                           Current, Q.StartingLoc);
       verifyOptResult(OptRes);
-      cacheOptResult(OptRes);
       resetPhiOptznState();
       Result = OptRes.PrimaryClobber.Clobber;
     }
@@ -1049,41 +852,10 @@ struct RenamePassData {
 } // anonymous namespace
 
 namespace llvm {
-/// \brief A MemorySSAWalker that does AA walks and caching of lookups to
-/// disambiguate accesses.
-///
-/// FIXME: The current implementation of this can take quadratic space in rare
-/// cases. This can be fixed, but it is something to note until it is fixed.
-///
-/// In order to trigger this behavior, you need to store to N distinct locations
-/// (that AA can prove don't alias), perform M stores to other memory
-/// locations that AA can prove don't alias any of the initial N locations, and
-/// then load from all of the N locations. In this case, we insert M cache
-/// entries for each of the N loads.
-///
-/// For example:
-/// define i32 @foo() {
-///   %a = alloca i32, align 4
-///   %b = alloca i32, align 4
-///   store i32 0, i32* %a, align 4
-///   store i32 0, i32* %b, align 4
-///
-///   ; Insert M stores to other memory that doesn't alias %a or %b here
-///
-///   %c = load i32, i32* %a, align 4 ; Caches M entries in
-///                                   ; CachedUpwardsClobberingAccess for the
-///                                   ; MemoryLocation %a
-///   %d = load i32, i32* %b, align 4 ; Caches M entries in
-///                                   ; CachedUpwardsClobberingAccess for the
-///                                   ; MemoryLocation %b
-///
-///   ; For completeness' sake, loading %a or %b again would not cache *another*
-///   ; M entries.
-///   %r = add i32 %c, %d
-///   ret i32 %r
-/// }
+/// \brief A MemorySSAWalker that does AA walks to disambiguate accesses. It no
+/// longer does caching on its own, but the name has been retained for the
+/// moment.
 class MemorySSA::CachingWalker final : public MemorySSAWalker {
-  WalkerCache Cache;
   ClobberWalker Walker;
   bool AutoResetWalker;
 
@@ -1104,10 +876,7 @@ public:
   /// answer a clobber query.
   void setAutoResetWalker(bool AutoReset) { AutoResetWalker = AutoReset; }
 
-  /// Drop the walker's persistent data structures. At the moment, this means
-  /// "drop the walker's cache of BasicBlocks ->
-  /// earliest-MemoryAccess-we-can-optimize-to". This is necessary if we're
-  /// going to have DT updates, if we remove MemoryAccesses, etc.
+  /// Drop the walker's persistent data structures.
   void resetClobberWalker() { Walker.reset(); }
 
   void verify(const MemorySSA *MSSA) override {
@@ -1116,18 +885,37 @@ public:
   }
 };
 
+void MemorySSA::renameSuccessorPhis(BasicBlock *BB, MemoryAccess *IncomingVal,
+                                    bool RenameAllUses) {
+  // Hand IncomingVal to the MemoryPhi (if any) that leads each successor.
+  for (const BasicBlock *Succ : successors(BB)) {
+    auto AccessIt = PerBlockAccesses.find(Succ);
+    if (AccessIt == PerBlockAccesses.end())
+      continue;
+    auto *Phi = dyn_cast<MemoryPhi>(&AccessIt->second->front());
+    if (!Phi)
+      continue;
+    if (RenameAllUses) {
+      int Idx = Phi->getBasicBlockIndex(BB);
+      assert(Idx != -1 && "Incomplete phi during partial rename");
+      Phi->setIncomingValue(Idx, IncomingVal);
+    } else
+      Phi->addIncoming(IncomingVal, BB);
+  }
+}
+
 /// \brief Rename a single basic block into MemorySSA form.
 /// Uses the standard SSA renaming algorithm.
 /// \returns The new incoming value.
-MemoryAccess *MemorySSA::renameBlock(BasicBlock *BB,
-                                     MemoryAccess *IncomingVal) {
+MemoryAccess *MemorySSA::renameBlock(BasicBlock *BB, MemoryAccess *IncomingVal,
+                                     bool RenameAllUses) {
   auto It = PerBlockAccesses.find(BB);
   // Skip most processing if the list is empty.
   if (It != PerBlockAccesses.end()) {
     AccessList *Accesses = It->second.get();
     for (MemoryAccess &L : *Accesses) {
       if (MemoryUseOrDef *MUD = dyn_cast<MemoryUseOrDef>(&L)) {
-        if (MUD->getDefiningAccess() == nullptr)
+        if (MUD->getDefiningAccess() == nullptr || RenameAllUses)
           MUD->setDefiningAccess(IncomingVal);
         if (isa<MemoryDef>(&L))
           IncomingVal = &L;
@@ -1136,18 +924,6 @@ MemoryAccess *MemorySSA::renameBlock(BasicBlock *BB,
       }
     }
   }
-
-  // Pass through values to our successors
-  for (const BasicBlock *S : successors(BB)) {
-    auto It = PerBlockAccesses.find(S);
-    // Rename the phi nodes in our successor block
-    if (It == PerBlockAccesses.end() || !isa<MemoryPhi>(It->second->front()))
-      continue;
-    AccessList *Accesses = It->second.get();
-    auto *Phi = cast<MemoryPhi>(&Accesses->front());
-    Phi->addIncoming(IncomingVal, BB);
-  }
-
   return IncomingVal;
 }
 
@@ -1156,11 +932,19 @@ MemoryAccess *MemorySSA::renameBlock(BasicBlock *BB,
 /// We walk the dominator tree in preorder, renaming accesses, and then filling
 /// in phi nodes in our successors.
 void MemorySSA::renamePass(DomTreeNode *Root, MemoryAccess *IncomingVal,
-                           SmallPtrSet<BasicBlock *, 16> &Visited) {
+                           SmallPtrSetImpl<BasicBlock *> &Visited,
+                           bool SkipVisited, bool RenameAllUses) {
   SmallVector<RenamePassData, 32> WorkStack;
-  IncomingVal = renameBlock(Root->getBlock(), IncomingVal);
+  // Skip everything if we already renamed this block and we are skipping.
+  // Note: You can't sink this into the if, because we need it to occur
+  // regardless of whether we skip blocks or not.
+  bool AlreadyVisited = !Visited.insert(Root->getBlock()).second;
+  if (SkipVisited && AlreadyVisited)
+    return;
+
+  IncomingVal = renameBlock(Root->getBlock(), IncomingVal, RenameAllUses);
+  renameSuccessorPhis(Root->getBlock(), IncomingVal, RenameAllUses);
   WorkStack.push_back({Root, Root->begin(), IncomingVal});
-  Visited.insert(Root->getBlock());
 
   while (!WorkStack.empty()) {
     DomTreeNode *Node = WorkStack.back().DTN;
@@ -1173,20 +957,25 @@ void MemorySSA::renamePass(DomTreeNode *Root, MemoryAccess *IncomingVal,
       DomTreeNode *Child = *ChildIt;
       ++WorkStack.back().ChildIt;
       BasicBlock *BB = Child->getBlock();
-      Visited.insert(BB);
-      IncomingVal = renameBlock(BB, IncomingVal);
+      // Note: You can't sink this into the if, because we need it to occur
+      // regardless of whether we skip blocks or not.
+      AlreadyVisited = !Visited.insert(BB).second;
+      if (SkipVisited && AlreadyVisited) {
+        // We already visited this during our renaming, which can happen when
+        // being asked to rename multiple blocks. Figure out the incoming val,
+        // which is the last def.
+        // Incoming value can only change if there is a block def, and in that
+        // case, it's the last block def in the list.
+        if (auto *BlockDefs = getWritableBlockDefs(BB))
+          IncomingVal = &*BlockDefs->rbegin();
+      } else
+        IncomingVal = renameBlock(BB, IncomingVal, RenameAllUses);
+      renameSuccessorPhis(BB, IncomingVal, RenameAllUses);
       WorkStack.push_back({Child, Child->begin(), IncomingVal});
     }
   }
 }
 
-/// \brief Compute dominator levels, used by the phi insertion algorithm above.
-void MemorySSA::computeDomLevels(DenseMap<DomTreeNode *, unsigned> &DomLevels) {
-  for (auto DFI = df_begin(DT->getRootNode()), DFE = df_end(DT->getRootNode());
-       DFI != DFE; ++DFI)
-    DomLevels[*DFI] = DFI.getPathLength() - 1;
-}
-
 /// \brief This handles unreachable block accesses by deleting phi nodes in
 /// unreachable blocks, and marking all other unreachable MemoryAccess's as
 /// being uses of the live on entry definition.
@@ -1322,7 +1111,10 @@ void MemorySSA::OptimizeUses::optimizeUsesInBlock(
 
   // Pop everything that doesn't dominate the current block off the stack,
   // increment the PopEpoch to account for this.
-  while (!VersionStack.empty()) {
+  while (true) {
+    assert(
+        !VersionStack.empty() &&
+        "Version stack should have liveOnEntry sentinel dominating everything");
     BasicBlock *BackBlock = VersionStack.back()->getBlock();
     if (DT->dominates(BackBlock, BB))
       break;
@@ -1330,6 +1122,7 @@ void MemorySSA::OptimizeUses::optimizeUsesInBlock(
       VersionStack.pop_back();
     ++PopEpoch;
   }
+
   for (MemoryAccess &MA : *Accesses) {
     auto *MU = dyn_cast<MemoryUse>(&MA);
     if (!MU) {
@@ -1450,20 +1243,13 @@ void MemorySSA::OptimizeUses::optimizeUsesInBlock(
 
 /// Optimize uses to point to their actual clobbering definitions.
 void MemorySSA::OptimizeUses::optimizeUses() {
-
-  // We perform a non-recursive top-down dominator tree walk
-  struct StackInfo {
-    const DomTreeNode *Node;
-    DomTreeNode::const_iterator Iter;
-  };
-
   SmallVector<MemoryAccess *, 16> VersionStack;
-  SmallVector<StackInfo, 16> DomTreeWorklist;
   DenseMap<MemoryLocOrCall, MemlocStackInfo> LocStackInfo;
   VersionStack.push_back(MSSA->getLiveOnEntryDef());
 
   unsigned long StackEpoch = 1;
   unsigned long PopEpoch = 1;
+  // We perform a non-recursive top-down dominator tree walk.
   for (const auto *DomNode : depth_first(DT->getRootNode()))
     optimizeUsesInBlock(DomNode->getBlock(), StackEpoch, PopEpoch, VersionStack,
                         LocStackInfo);
@@ -1667,37 +1453,19 @@ MemoryUseOrDef *MemorySSA::createDefinedAccess(Instruction *I,
   return NewAccess;
 }
 
-MemoryAccess *MemorySSA::createMemoryAccessInBB(Instruction *I,
-                                                MemoryAccess *Definition,
-                                                const BasicBlock *BB,
-                                                InsertionPlace Point) {
-  MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
-  insertIntoListsForBlock(NewAccess, BB, Point);
-  return NewAccess;
-}
-
-MemoryUseOrDef *MemorySSA::createMemoryAccessBefore(Instruction *I,
-                                                    MemoryAccess *Definition,
-                                                    MemoryUseOrDef *InsertPt) {
-  assert(I->getParent() == InsertPt->getBlock() &&
-         "New and old access must be in the same block");
-  MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
-  insertIntoListsBefore(NewAccess, InsertPt->getBlock(),
-                        InsertPt->getIterator());
-  return NewAccess;
-}
-
-MemoryUseOrDef *MemorySSA::createMemoryAccessAfter(Instruction *I,
-                                                   MemoryAccess *Definition,
-                                                   MemoryAccess *InsertPt) {
-  assert(I->getParent() == InsertPt->getBlock() &&
-         "New and old access must be in the same block");
-  MemoryUseOrDef *NewAccess = createDefinedAccess(I, Definition);
-  insertIntoListsBefore(NewAccess, InsertPt->getBlock(),
-                        ++(InsertPt->getIterator()));
-  return NewAccess;
+// Return true if I is a store or load that has ordering constraints, i.e.
+// one whose isUnordered() query is false.
+// Only stores and loads are considered here because other instructions are
+// still considered ModRef by getModRefInfo.
+static inline bool isOrdered(const Instruction *I) {
+  // A store and a load are distinct instruction kinds, so test each in turn.
+  if (const auto *Store = dyn_cast<StoreInst>(I))
+    return !Store->isUnordered();
+  // Not a store: the only remaining candidate is a load.
+  if (const auto *Load = dyn_cast<LoadInst>(I))
+    return !Load->isUnordered();
+  return false;
 }
-
 /// \brief Helper function to create new memory accesses
 MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I) {
   // The assume intrinsic has a control dependency which we model by claiming
@@ -1710,7 +1478,15 @@ MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I) {
 
   // Find out what affect this instruction has on memory.
   ModRefInfo ModRef = AA->getModRefInfo(I);
-  bool Def = bool(ModRef & MRI_Mod);
+  // The isOrdered check is used to ensure that volatiles end up as defs
+  // (atomics end up as ModRef right now anyway).  Until we separate the
+  // ordering chain from the memory chain, this enables people to see at least
+  // some relative ordering to volatiles.  Note that getClobberingMemoryAccess
+  // will still give an answer that bypasses other volatile loads.  TODO:
+  // Separate memory aliasing and ordering into two different chains so that we
+  // can precisely represent both "what memory will this read/write/is clobbered
+  // by" and "what instructions can I move this past".
+  bool Def = bool(ModRef & MRI_Mod) || isOrdered(I);
   bool Use = bool(ModRef & MRI_Ref);
 
   // It's possible for an instruction to not modify memory at all. During
@@ -1730,33 +1506,6 @@ MemoryUseOrDef *MemorySSA::createNewAccess(Instruction *I) {
   return MUD;
 }
 
-MemoryAccess *MemorySSA::findDominatingDef(BasicBlock *UseBlock,
-                                           enum InsertionPlace Where) {
-  // Handle the initial case
-  if (Where == Beginning)
-    // The only thing that could define us at the beginning is a phi node
-    if (MemoryPhi *Phi = getMemoryAccess(UseBlock))
-      return Phi;
-
-  DomTreeNode *CurrNode = DT->getNode(UseBlock);
-  // Need to be defined by our dominator
-  if (Where == Beginning)
-    CurrNode = CurrNode->getIDom();
-  Where = End;
-  while (CurrNode) {
-    auto It = PerBlockAccesses.find(CurrNode->getBlock());
-    if (It != PerBlockAccesses.end()) {
-      auto &Accesses = It->second;
-      for (MemoryAccess &RA : reverse(*Accesses)) {
-        if (isa<MemoryDef>(RA) || isa<MemoryPhi>(RA))
-          return &RA;
-      }
-    }
-    CurrNode = CurrNode->getIDom();
-  }
-  return LiveOnEntryDef.get();
-}
-
 /// \brief Returns true if \p Replacer dominates \p Replacee .
 bool MemorySSA::dominatesUse(const MemoryAccess *Replacer,
                              const MemoryAccess *Replacee) const {
@@ -1774,20 +1523,6 @@ bool MemorySSA::dominatesUse(const MemoryAccess *Replacer,
   return true;
 }
 
-/// \brief If all arguments of a MemoryPHI are defined by the same incoming
-/// argument, return that argument.
-static MemoryAccess *onlySingleValue(MemoryPhi *MP) {
-  MemoryAccess *MA = nullptr;
-
-  for (auto &Arg : MP->operands()) {
-    if (!MA)
-      MA = cast<MemoryAccess>(Arg);
-    else if (MA != Arg)
-      return nullptr;
-  }
-  return MA;
-}
-
 /// \brief Properly remove \p MA from all of MemorySSA's lookup tables.
 void MemorySSA::removeFromLookups(MemoryAccess *MA) {
   assert(MA->use_empty() &&
@@ -1841,62 +1576,13 @@ void MemorySSA::removeFromLists(MemoryAccess *MA, bool ShouldDelete) {
     PerBlockAccesses.erase(AccessIt);
 }
 
-void MemorySSA::removeMemoryAccess(MemoryAccess *MA) {
-  assert(!isLiveOnEntryDef(MA) && "Trying to remove the live on entry def");
-  // We can only delete phi nodes if they have no uses, or we can replace all
-  // uses with a single definition.
-  MemoryAccess *NewDefTarget = nullptr;
-  if (MemoryPhi *MP = dyn_cast<MemoryPhi>(MA)) {
-    // Note that it is sufficient to know that all edges of the phi node have
-    // the same argument.  If they do, by the definition of dominance frontiers
-    // (which we used to place this phi), that argument must dominate this phi,
-    // and thus, must dominate the phi's uses, and so we will not hit the assert
-    // below.
-    NewDefTarget = onlySingleValue(MP);
-    assert((NewDefTarget || MP->use_empty()) &&
-           "We can't delete this memory phi");
-  } else {
-    NewDefTarget = cast<MemoryUseOrDef>(MA)->getDefiningAccess();
-  }
-
-  // Re-point the uses at our defining access
-  if (!isa<MemoryUse>(MA) && !MA->use_empty()) {
-    // Reset optimized on users of this store, and reset the uses.
-    // A few notes:
-    // 1. This is a slightly modified version of RAUW to avoid walking the
-    // uses twice here.
-    // 2. If we wanted to be complete, we would have to reset the optimized
-    // flags on users of phi nodes if doing the below makes a phi node have all
-    // the same arguments. Instead, we prefer users to removeMemoryAccess those
-    // phi nodes, because doing it here would be N^3.
-    if (MA->hasValueHandle())
-      ValueHandleBase::ValueIsRAUWd(MA, NewDefTarget);
-    // Note: We assume MemorySSA is not used in metadata since it's not really
-    // part of the IR.
-
-    while (!MA->use_empty()) {
-      Use &U = *MA->use_begin();
-      if (MemoryUse *MU = dyn_cast<MemoryUse>(U.getUser()))
-        MU->resetOptimized();
-      U.set(NewDefTarget);
-    }
-  }
-
-  // The call below to erase will destroy MA, so we can't change the order we
-  // are doing things here
-  removeFromLookups(MA);
-  removeFromLists(MA);
-}
-
 void MemorySSA::print(raw_ostream &OS) const {
   MemorySSAAnnotatedWriter Writer(this);
   F.print(OS, &Writer);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD void MemorySSA::dump() const {
-  print(dbgs());
-}
+LLVM_DUMP_METHOD void MemorySSA::dump() const { print(dbgs()); }
 #endif
 
 void MemorySSA::verifyMemorySSA() const {
@@ -2166,7 +1852,7 @@ void MemoryUse::print(raw_ostream &OS) const {
 }
 
 void MemoryAccess::dump() const {
-  // Cannot completely remove virtual function even in release mode.
+// Cannot completely remove virtual function even in release mode.
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
   print(dbgs());
   dbgs() << "\n";
@@ -2248,35 +1934,13 @@ MemorySSAWalker::MemorySSAWalker(MemorySSA *M) : MSSA(M) {}
 
 MemorySSA::CachingWalker::CachingWalker(MemorySSA *M, AliasAnalysis *A,
                                         DominatorTree *D)
-    : MemorySSAWalker(M), Walker(*M, *A, *D, Cache), AutoResetWalker(true) {}
+    : MemorySSAWalker(M), Walker(*M, *A, *D), AutoResetWalker(true) {}
 
 MemorySSA::CachingWalker::~CachingWalker() {}
 
 void MemorySSA::CachingWalker::invalidateInfo(MemoryAccess *MA) {
-  // TODO: We can do much better cache invalidation with differently stored
-  // caches.  For now, for MemoryUses, we simply remove them
-  // from the cache, and kill the entire call/non-call cache for everything
-  // else.  The problem is for phis or defs, currently we'd need to follow use
-  // chains down and invalidate anything below us in the chain that currently
-  // terminates at this access.
-
-  // See if this is a MemoryUse, if so, just remove the cached info. MemoryUse
-  // is by definition never a barrier, so nothing in the cache could point to
-  // this use. In that case, we only need invalidate the info for the use
-  // itself.
-
-  if (MemoryUse *MU = dyn_cast<MemoryUse>(MA)) {
-    UpwardsMemoryQuery Q(MU->getMemoryInst(), MU);
-    Cache.remove(MU, Q.StartingLoc, Q.IsCall);
-    MU->resetOptimized();
-  } else {
-    // If it is not a use, the best we can do right now is destroy the cache.
-    Cache.clear();
-  }
-
-#ifdef EXPENSIVE_CHECKS
-  verifyRemoved(MA);
-#endif
+  if (auto *MUD = dyn_cast<MemoryUseOrDef>(MA))
+    MUD->resetOptimized();
 }
 
 /// \brief Walk the use-def chains starting at \p MA and find
@@ -2287,8 +1951,7 @@ MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess(
     MemoryAccess *StartingAccess, UpwardsMemoryQuery &Q) {
   MemoryAccess *New = Walker.findClobber(StartingAccess, Q);
 #ifdef EXPENSIVE_CHECKS
-  MemoryAccess *NewNoCache =
-      Walker.findClobber(StartingAccess, Q, /*UseWalkerCache=*/false);
+  MemoryAccess *NewNoCache = Walker.findClobber(StartingAccess, Q);
   assert(NewNoCache == New && "Cache made us hand back a different result?");
 #endif
   if (AutoResetWalker)
@@ -2318,9 +1981,6 @@ MemoryAccess *MemorySSA::CachingWalker::getClobberingMemoryAccess(
   Q.Inst = I;
   Q.IsCall = false;
 
-  if (auto *CacheResult = Cache.lookup(StartingUseOrDef, Loc, Q.IsCall))
-    return CacheResult;
-
   // Unlike the other function, do not walk to the def of a def, because we are
   // handed something we already believe is the clobbering access.
   MemoryAccess *DefiningAccess = isa<MemoryUse>(StartingUseOrDef)
@@ -2345,9 +2005,9 @@ MemorySSA::CachingWalker::getClobberingMemoryAccess(MemoryAccess *MA) {
   // If this is an already optimized use or def, return the optimized result.
   // Note: Currently, we do not store the optimized def result because we'd need
   // a separate field, since we can't use it as the defining access.
-  if (MemoryUse *MU = dyn_cast<MemoryUse>(StartingAccess))
-    if (MU->isOptimized())
-      return MU->getDefiningAccess();
+  if (auto *MUD = dyn_cast<MemoryUseOrDef>(StartingAccess))
+    if (MUD->isOptimized())
+      return MUD->getOptimized();
 
   const Instruction *I = StartingAccess->getMemoryInst();
   UpwardsMemoryQuery Q(I, StartingAccess);
@@ -2357,14 +2017,10 @@ MemorySSA::CachingWalker::getClobberingMemoryAccess(MemoryAccess *MA) {
   if (!Q.IsCall && I->isFenceLike())
     return StartingAccess;
 
-  if (auto *CacheResult = Cache.lookup(StartingAccess, Q.StartingLoc, Q.IsCall))
-    return CacheResult;
-
   if (isUseTriviallyOptimizableToLiveOnEntry(*MSSA->AA, I)) {
     MemoryAccess *LiveOnEntry = MSSA->getLiveOnEntryDef();
-    Cache.insert(StartingAccess, LiveOnEntry, Q.StartingLoc, Q.IsCall);
-    if (MemoryUse *MU = dyn_cast<MemoryUse>(StartingAccess))
-      MU->setDefiningAccess(LiveOnEntry, true);
+    if (auto *MUD = dyn_cast<MemoryUseOrDef>(StartingAccess))
+      MUD->setOptimized(LiveOnEntry);
     return LiveOnEntry;
   }
 
@@ -2381,17 +2037,12 @@ MemorySSA::CachingWalker::getClobberingMemoryAccess(MemoryAccess *MA) {
   DEBUG(dbgs() << *DefiningAccess << "\n");
   DEBUG(dbgs() << "Final Memory SSA clobber for " << *I << " is ");
   DEBUG(dbgs() << *Result << "\n");
-  if (MemoryUse *MU = dyn_cast<MemoryUse>(StartingAccess))
-    MU->setDefiningAccess(Result, true);
+  if (auto *MUD = dyn_cast<MemoryUseOrDef>(StartingAccess))
+    MUD->setOptimized(Result);
 
   return Result;
 }
 
-// Verify that MA doesn't exist in any of the caches.
-void MemorySSA::CachingWalker::verifyRemoved(MemoryAccess *MA) {
-  assert(!Cache.contains(MA) && "Found removed MemoryAccess in cache.");
-}
-
 MemoryAccess *
 DoNothingMemorySSAWalker::getClobberingMemoryAccess(MemoryAccess *MA) {
   if (auto *Use = dyn_cast<MemoryUseOrDef>(MA))
diff --git a/lib/Transforms/Utils/MemorySSAUpdater.cpp b/lib/Analysis/MemorySSAUpdater.cpp
similarity index 76%
rename from lib/Transforms/Utils/MemorySSAUpdater.cpp
rename to lib/Analysis/MemorySSAUpdater.cpp
index 21f286b8cdea1baaaa2e03efb553f8285767ee8f..c63677fe5502a893d0730d37bd4565a3f4224a18 100644
--- a/lib/Transforms/Utils/MemorySSAUpdater.cpp
+++ b/lib/Analysis/MemorySSAUpdater.cpp
@@ -10,7 +10,7 @@
 // This file implements the MemorySSAUpdater class.
 //
 //===----------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/MemorySSAUpdater.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
@@ -24,7 +24,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FormattedStream.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
+#include "llvm/Analysis/MemorySSA.h"
 #include <algorithm>
 
 #define DEBUG_TYPE "memoryssa"
@@ -189,7 +189,7 @@ MemoryAccess *MemorySSAUpdater::tryRemoveTrivialPhi(MemoryPhi *Phi,
     return MSSA->getLiveOnEntryDef();
   if (Phi) {
     Phi->replaceAllUsesWith(Same);
-    MSSA->removeMemoryAccess(Phi);
+    removeMemoryAccess(Phi);
   }
 
   // We should only end up recursing in case we replaced something, in which
@@ -234,7 +234,7 @@ void setMemoryPhiValueForBlock(MemoryPhi *MP, const BasicBlock *BB,
 // Then, we update the defs below us (and any new phi nodes) in the graph to
 // point to the correct new defs, to ensure we only have one variable, and no
 // disconnected stores.
-void MemorySSAUpdater::insertDef(MemoryDef *MD) {
+void MemorySSAUpdater::insertDef(MemoryDef *MD, bool RenameUses) {
   InsertedPHIs.clear();
 
   // See if we had a local def, and if not, go hunting.
@@ -287,6 +287,24 @@ void MemorySSAUpdater::insertDef(MemoryDef *MD) {
     // Put any new phis on the fixup list, and process them
     FixupList.append(InsertedPHIs.end() - StartingPHISize, InsertedPHIs.end());
   }
+  // Now that all fixups are done, rename all uses if we are asked.
+  if (RenameUses) {
+    SmallPtrSet<BasicBlock *, 16> Visited;
+    BasicBlock *StartBlock = MD->getBlock();
+    // We are guaranteed there is a def in the block, because we just got it
+    // handed to us in this function.
+    MemoryAccess *FirstDef = &*MSSA->getWritableBlockDefs(StartBlock)->begin();
+    // Convert to incoming value if it's a memorydef. A phi *is* already an
+    // incoming value. (Named FirstMD so it does not shadow the MD parameter.)
+    if (auto *FirstMD = dyn_cast<MemoryDef>(FirstDef))
+      FirstDef = FirstMD->getDefiningAccess();
+
+    MSSA->renamePass(StartBlock, FirstDef, Visited);
+    // Each inserted phi sits in its own block, so the incoming value becomes
+    // that phi anyway, and it does not matter what we pass.
+    for (auto *MP : InsertedPHIs)
+      MSSA->renamePass(MP->getBlock(), nullptr, Visited);
+  }
 }
 
 void MemorySSAUpdater::fixupDefs(const SmallVectorImpl<MemoryAccess *> &Vars) {
@@ -383,4 +401,94 @@ void MemorySSAUpdater::moveToPlace(MemoryUseOrDef *What, BasicBlock *BB,
                                    MemorySSA::InsertionPlace Where) {
   return moveTo(What, BB, Where);
 }
+
+/// \brief Returns the one MemoryAccess that every operand of \p MP refers to,
+/// or nullptr if two operands differ (or \p MP has no operands).
+static MemoryAccess *onlySingleValue(MemoryPhi *MP) {
+  MemoryAccess *Common = nullptr;
+
+  for (auto &Op : MP->operands()) {
+    if (Common && Common != Op)
+      return nullptr;
+    if (!Common)
+      Common = cast<MemoryAccess>(Op);
+  }
+  return Common;
+}
+/// \brief Remove \p MA from MemorySSA. If \p MA is a def or phi that still has
+/// uses, they are re-pointed at its replacement definition first.
+void MemorySSAUpdater::removeMemoryAccess(MemoryAccess *MA) {
+  assert(!MSSA->isLiveOnEntryDef(MA) &&
+         "Trying to remove the live on entry def");
+  // We can only delete phi nodes if they have no uses, or we can replace all
+  // uses with a single definition.
+  MemoryAccess *NewDefTarget = nullptr;
+  if (MemoryPhi *MP = dyn_cast<MemoryPhi>(MA)) {
+    // It is sufficient to know that all edges of the phi node have the same
+    // argument.  If they do, by the definition of dominance frontiers (which
+    // we used to place this phi), that argument must dominate this phi and
+    // thus the phi's uses, so we will not hit the assert below.
+    NewDefTarget = onlySingleValue(MP);
+    assert((NewDefTarget || MP->use_empty()) &&
+           "We can't delete this memory phi");
+  } else {
+    NewDefTarget = cast<MemoryUseOrDef>(MA)->getDefiningAccess();
+  }
+
+  // Re-point the uses at our defining access.
+  if (!isa<MemoryUse>(MA) && !MA->use_empty()) {
+    // Reset optimized on users of this access, and reset the uses.
+    // A few notes:
+    // 1. This is a slightly modified RAUW that avoids walking the uses twice.
+    // 2. If we wanted to be complete, we would have to reset the optimized
+    // flags on users of phi nodes if doing the below makes a phi node have all
+    // the same arguments. Instead, we prefer users to removeMemoryAccess those
+    // phi nodes, because doing it here would be N^3.
+    if (MA->hasValueHandle())
+      ValueHandleBase::ValueIsRAUWd(MA, NewDefTarget);
+    // Note: We assume MemorySSA is not used in metadata since it's not really
+    // part of the IR.
+
+    while (!MA->use_empty()) {
+      Use &U = *MA->use_begin();
+      if (auto *MUD = dyn_cast<MemoryUseOrDef>(U.getUser()))
+        MUD->resetOptimized();
+      U.set(NewDefTarget);
+    }
+  }
+
+  // The call below to erase will destroy MA, so these calls must stay last
+  // and we can't change the order we are doing things here.
+  MSSA->removeFromLookups(MA);
+  MSSA->removeFromLists(MA);
+}
+
+MemoryAccess *MemorySSAUpdater::createMemoryAccessInBB(
+    Instruction *I, MemoryAccess *Definition, const BasicBlock *BB,
+    MemorySSA::InsertionPlace Point) {
+  auto *Created = MSSA->createDefinedAccess(I, Definition);
+  MSSA->insertIntoListsForBlock(Created, BB, Point);
+  return Created;
+}
+
+MemoryUseOrDef *MemorySSAUpdater::createMemoryAccessBefore(
+    Instruction *I, MemoryAccess *Definition, MemoryUseOrDef *InsertPt) {
+  BasicBlock *InsertBB = InsertPt->getBlock();
+  assert(I->getParent() == InsertBB &&
+         "New and old access must be in the same block");
+  MemoryUseOrDef *Created = MSSA->createDefinedAccess(I, Definition);
+  MSSA->insertIntoListsBefore(Created, InsertBB, InsertPt->getIterator());
+  return Created;
+}
+
+MemoryUseOrDef *MemorySSAUpdater::createMemoryAccessAfter(
+    Instruction *I, MemoryAccess *Definition, MemoryAccess *InsertPt) {
+  BasicBlock *InsertBB = InsertPt->getBlock();
+  assert(I->getParent() == InsertBB &&
+         "New and old access must be in the same block");
+  MemoryUseOrDef *Created = MSSA->createDefinedAccess(I, Definition);
+  MSSA->insertIntoListsBefore(Created, InsertBB, ++InsertPt->getIterator());
+  return Created;
+}
+
 } // namespace llvm
diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp
index f5ba637e58e20332d465c7562d895a14d56f8014..f6d9a73e4e9a5bf00b468133e570fc7bcc18efc7 100644
--- a/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -28,7 +28,7 @@
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/ValueSymbolTable.h"
-#include "llvm/Object/IRObjectFile.h"
+#include "llvm/Object/ModuleSymbolTable.h"
 #include "llvm/Pass.h"
 using namespace llvm;
 
@@ -84,6 +84,92 @@ static bool isNonRenamableLocal(const GlobalValue &GV) {
   return GV.hasSection() && GV.hasLocalLinkage();
 }
 
+/// Summarize the virtual call \p Call: into \p ConstVCalls if every argument
+/// after "this" is a ConstantInt of at most 64 bits, else into \p VCalls.
+static void addVCallToSet(DevirtCallSite Call, GlobalValue::GUID Guid,
+                          SetVector<FunctionSummary::VFuncId> &VCalls,
+                          SetVector<FunctionSummary::ConstVCall> &ConstVCalls) {
+  std::vector<uint64_t> ConstArgs;
+  // The first argument is the "this" pointer, so start at the second one.
+  for (auto &Arg : make_range(Call.CS.arg_begin() + 1, Call.CS.arg_end())) {
+    auto *IntArg = dyn_cast<ConstantInt>(Arg);
+    if (!(IntArg && IntArg->getBitWidth() <= 64)) {
+      VCalls.insert({Guid, Call.Offset});
+      return;
+    }
+    ConstArgs.push_back(IntArg->getZExtValue());
+  }
+  ConstVCalls.insert({{Guid, Call.Offset}, std::move(ConstArgs)});
+}
+
+/// Summarize \p CI into the output sets if it calls llvm.type.test or
+/// llvm.type.checked.load with an MDString type id; otherwise do nothing.
+static void addIntrinsicToSummary(
+    const CallInst *CI, SetVector<GlobalValue::GUID> &TypeTests,
+    SetVector<FunctionSummary::VFuncId> &TypeTestAssumeVCalls,
+    SetVector<FunctionSummary::VFuncId> &TypeCheckedLoadVCalls,
+    SetVector<FunctionSummary::ConstVCall> &TypeTestAssumeConstVCalls,
+    SetVector<FunctionSummary::ConstVCall> &TypeCheckedLoadConstVCalls) {
+  switch (CI->getCalledFunction()->getIntrinsicID()) {
+  case Intrinsic::type_test: {
+    auto *TypeMDVal = cast<MetadataAsValue>(CI->getArgOperand(1));
+    auto *TypeId = dyn_cast<MDString>(TypeMDVal->getMetadata());
+    if (!TypeId)
+      break;
+    GlobalValue::GUID Guid = GlobalValue::getGUID(TypeId->getString());
+
+    // Produce a summary from type.test intrinsics. We only add the GUID to
+    // TypeTests when the intrinsic is used other than by an llvm.assume
+    // intrinsic. Intrinsics that are assumed are relevant only to the
+    // devirtualization pass, not the type test lowering pass.
+    bool HasNonAssumeUses = llvm::any_of(CI->uses(), [](const Use &CIU) {
+      auto *AssumeCI = dyn_cast<CallInst>(CIU.getUser());
+      if (!AssumeCI)
+        return true;
+      Function *F = AssumeCI->getCalledFunction();
+      return !F || F->getIntrinsicID() != Intrinsic::assume;
+    });
+    if (HasNonAssumeUses)
+      TypeTests.insert(Guid);
+
+    SmallVector<DevirtCallSite, 4> DevirtCalls;
+    SmallVector<CallInst *, 4> Assumes;
+    findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI);
+    for (auto &Call : DevirtCalls)
+      addVCallToSet(Call, Guid, TypeTestAssumeVCalls,
+                    TypeTestAssumeConstVCalls);
+
+    break;
+  }
+
+  case Intrinsic::type_checked_load: {
+    auto *TypeMDVal = cast<MetadataAsValue>(CI->getArgOperand(2));
+    auto *TypeId = dyn_cast<MDString>(TypeMDVal->getMetadata());
+    if (!TypeId)
+      break;
+    GlobalValue::GUID Guid = GlobalValue::getGUID(TypeId->getString());
+
+    SmallVector<DevirtCallSite, 4> DevirtCalls;
+    SmallVector<Instruction *, 4> LoadedPtrs;
+    SmallVector<Instruction *, 4> Preds;
+    bool HasNonCallUses = false;
+    findDevirtualizableCallsForTypeCheckedLoad(DevirtCalls, LoadedPtrs, Preds,
+                                               HasNonCallUses, CI);
+    // Any non-call uses of the result of llvm.type.checked.load will
+    // prevent us from optimizing away the llvm.type.test.
+    if (HasNonCallUses)
+      TypeTests.insert(Guid);
+    for (auto &Call : DevirtCalls)
+      addVCallToSet(Call, Guid, TypeCheckedLoadVCalls,
+                    TypeCheckedLoadConstVCalls);
+
+    break;
+  }
+  default:
+    break;
+  }
+}
+
 static void
 computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
                        const Function &F, BlockFrequencyInfo *BFI,
@@ -99,6 +185,10 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
   MapVector<ValueInfo, CalleeInfo> CallGraphEdges;
   SetVector<ValueInfo> RefEdges;
   SetVector<GlobalValue::GUID> TypeTests;
+  SetVector<FunctionSummary::VFuncId> TypeTestAssumeVCalls,
+      TypeCheckedLoadVCalls;
+  SetVector<FunctionSummary::ConstVCall> TypeTestAssumeConstVCalls,
+      TypeCheckedLoadConstVCalls;
   ICallPromotionAnalysis ICallAnalysis;
 
   bool HasInlineAsmMaybeReferencingInternal = false;
@@ -133,29 +223,15 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
       // Check if this is a direct call to a known function or a known
       // intrinsic, or an indirect call with profile data.
       if (CalledFunction) {
-        if (CalledFunction->isIntrinsic()) {
-          if (CalledFunction->getIntrinsicID() != Intrinsic::type_test)
-            continue;
-          // Produce a summary from type.test intrinsics. We only summarize
-          // type.test intrinsics that are used other than by an llvm.assume
-          // intrinsic. Intrinsics that are assumed are relevant only to the
-          // devirtualization pass, not the type test lowering pass.
-          bool HasNonAssumeUses = llvm::any_of(CI->uses(), [](const Use &CIU) {
-            auto *AssumeCI = dyn_cast<CallInst>(CIU.getUser());
-            if (!AssumeCI)
-              return true;
-            Function *F = AssumeCI->getCalledFunction();
-            return !F || F->getIntrinsicID() != Intrinsic::assume;
-          });
-          if (HasNonAssumeUses) {
-            auto *TypeMDVal = cast<MetadataAsValue>(CI->getArgOperand(1));
-            if (auto *TypeId = dyn_cast<MDString>(TypeMDVal->getMetadata()))
-              TypeTests.insert(GlobalValue::getGUID(TypeId->getString()));
-          }
+        if (CI && CalledFunction->isIntrinsic()) {
+          addIntrinsicToSummary(
+              CI, TypeTests, TypeTestAssumeVCalls, TypeCheckedLoadVCalls,
+              TypeTestAssumeConstVCalls, TypeCheckedLoadConstVCalls);
+          continue;
         }
         // We should have named any anonymous globals
         assert(CalledFunction->hasName());
-        auto ScaledCount = BFI ? BFI->getBlockProfileCount(&BB) : None;
+        auto ScaledCount = ProfileSummaryInfo::getProfileCount(&I, BFI);
         auto Hotness = ScaledCount ? getHotness(ScaledCount.getValue(), PSI)
                                    : CalleeInfo::HotnessType::Unknown;
 
@@ -183,6 +259,11 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
       }
     }
 
+  // Explicitly add hot edges to enforce importing for designated GUIDs for
+  // sample PGO, to enable the same inlines as the profiled optimized binary.
+  for (auto &I : F.getImportGUIDs())
+    CallGraphEdges[I].updateHotness(CalleeInfo::HotnessType::Hot);
+
   bool NonRenamableLocal = isNonRenamableLocal(F);
   bool NotEligibleForImport =
       NonRenamableLocal || HasInlineAsmMaybeReferencingInternal ||
@@ -193,7 +274,10 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
                                     /* LiveRoot = */ false);
   auto FuncSummary = llvm::make_unique<FunctionSummary>(
       Flags, NumInsts, RefEdges.takeVector(), CallGraphEdges.takeVector(),
-      TypeTests.takeVector());
+      TypeTests.takeVector(), TypeTestAssumeVCalls.takeVector(),
+      TypeCheckedLoadVCalls.takeVector(),
+      TypeTestAssumeConstVCalls.takeVector(),
+      TypeCheckedLoadConstVCalls.takeVector());
   if (NonRenamableLocal)
     CantBePromoted.insert(F.getGUID());
   Index.addGlobalValueSummary(F.getName(), std::move(FuncSummary));
@@ -326,9 +410,8 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
     // be listed on the llvm.used or llvm.compiler.used global and marked as
     // referenced from there.
     ModuleSymbolTable::CollectAsmSymbols(
-        Triple(M.getTargetTriple()), M.getModuleInlineAsm(),
-        [&M, &Index, &CantBePromoted](StringRef Name,
-                                      object::BasicSymbolRef::Flags Flags) {
+        M, [&M, &Index, &CantBePromoted](StringRef Name,
+                                         object::BasicSymbolRef::Flags Flags) {
           // Symbols not marked as Weak or Global are local definitions.
           if (Flags & (object::BasicSymbolRef::SF_Weak |
                        object::BasicSymbolRef::SF_Global))
@@ -347,7 +430,11 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
                 llvm::make_unique<FunctionSummary>(
                     GVFlags, 0, ArrayRef<ValueInfo>{},
                     ArrayRef<FunctionSummary::EdgeTy>{},
-                    ArrayRef<GlobalValue::GUID>{});
+                    ArrayRef<GlobalValue::GUID>{},
+                    ArrayRef<FunctionSummary::VFuncId>{},
+                    ArrayRef<FunctionSummary::VFuncId>{},
+                    ArrayRef<FunctionSummary::ConstVCall>{},
+                    ArrayRef<FunctionSummary::ConstVCall>{});
             Index.addGlobalValueSummary(Name, std::move(Summary));
           } else {
             std::unique_ptr<GlobalVarSummary> Summary =
@@ -364,6 +451,12 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
     auto &Summary = GlobalList.second[0];
     bool AllRefsCanBeExternallyReferenced =
         llvm::all_of(Summary->refs(), [&](const ValueInfo &VI) {
+          // If a global value definition references an unnamed global,
+          // be conservative. They're valid IR so we don't want to crash
+          // when we encounter any of them but they're infrequent enough
+          // that we don't bother optimizing them.
+          if (!VI.getValue()->hasName())
+            return false;
           return !CantBePromoted.count(VI.getValue()->getGUID());
         });
     if (!AllRefsCanBeExternallyReferenced) {
diff --git a/lib/Analysis/OptimizationDiagnosticInfo.cpp b/lib/Analysis/OptimizationDiagnosticInfo.cpp
index a104a786afdc38f1d31d26911590fb3bc4710156..73245981b0228a10828cc4bf4bf36f9260b7cd8d 100644
--- a/lib/Analysis/OptimizationDiagnosticInfo.cpp
+++ b/lib/Analysis/OptimizationDiagnosticInfo.cpp
@@ -23,14 +23,14 @@
 
 using namespace llvm;
 
-OptimizationRemarkEmitter::OptimizationRemarkEmitter(Function *F)
+OptimizationRemarkEmitter::OptimizationRemarkEmitter(const Function *F)
     : F(F), BFI(nullptr) {
   if (!F->getContext().getDiagnosticHotnessRequested())
     return;
 
   // First create a dominator tree.
   DominatorTree DT;
-  DT.recalculate(*F);
+  DT.recalculate(*const_cast<Function *>(F));
 
   // Generate LoopInfo from it.
   LoopInfo LI;
@@ -93,32 +93,33 @@ void MappingTraits<DiagnosticInfoOptimizationBase *>::mapping(
                      OptDiag->getKind() ==
                          DK_OptimizationRemarkAnalysisAliasing))
     ;
+  else if (io.mapTag("!Failure", OptDiag->getKind() == DK_OptimizationFailure))
+    ;
   else
-    llvm_unreachable("todo");
+    llvm_unreachable("Unknown remark type");
 
   // These are read-only for now.
-  DebugLoc DL = OptDiag->getDebugLoc();
+  DiagnosticLocation DL = OptDiag->getLocation();
   StringRef FN =
       GlobalValue::getRealLinkageName(OptDiag->getFunction().getName());
 
   StringRef PassName(OptDiag->PassName);
   io.mapRequired("Pass", PassName);
   io.mapRequired("Name", OptDiag->RemarkName);
-  if (!io.outputting() || DL)
+  if (!io.outputting() || DL.isValid())
     io.mapOptional("DebugLoc", DL);
   io.mapRequired("Function", FN);
   io.mapOptional("Hotness", OptDiag->Hotness);
   io.mapOptional("Args", OptDiag->Args);
 }
 
-template <> struct MappingTraits<DebugLoc> {
-  static void mapping(IO &io, DebugLoc &DL) {
+template <> struct MappingTraits<DiagnosticLocation> {
+  static void mapping(IO &io, DiagnosticLocation &DL) {
     assert(io.outputting() && "input not yet implemented");
 
-    auto *Scope = cast<DIScope>(DL.getScope());
-    StringRef File = Scope->getFilename();
+    StringRef File = DL.getFilename();
     unsigned Line = DL.getLine();
-    unsigned Col = DL.getCol();
+    unsigned Col = DL.getColumn();
 
     io.mapRequired("File", File);
     io.mapRequired("Line", Line);
@@ -133,8 +134,8 @@ template <> struct MappingTraits<DiagnosticInfoOptimizationBase::Argument> {
   static void mapping(IO &io, DiagnosticInfoOptimizationBase::Argument &A) {
     assert(io.outputting() && "input not yet implemented");
     io.mapRequired(A.Key.data(), A.Val);
-    if (A.DLoc)
-      io.mapOptional("DebugLoc", A.DLoc);
+    if (A.Loc.isValid())
+      io.mapOptional("DebugLoc", A.Loc);
   }
 };
 
@@ -145,7 +146,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(DiagnosticInfoOptimizationBase::Argument)
 
 void OptimizationRemarkEmitter::computeHotness(
     DiagnosticInfoIROptimization &OptDiag) {
-  Value *V = OptDiag.getCodeRegion();
+  const Value *V = OptDiag.getCodeRegion();
   if (V)
     OptDiag.setHotness(computeHotness(V));
 }
@@ -166,72 +167,6 @@ void OptimizationRemarkEmitter::emit(
     F->getContext().diagnose(OptDiag);
 }
 
-void OptimizationRemarkEmitter::emitOptimizationRemark(const char *PassName,
-                                                       const DebugLoc &DLoc,
-                                                       const Value *V,
-                                                       const Twine &Msg) {
-  LLVMContext &Ctx = F->getContext();
-  Ctx.diagnose(OptimizationRemark(PassName, *F, DLoc, Msg, computeHotness(V)));
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemark(const char *PassName,
-                                                       Loop *L,
-                                                       const Twine &Msg) {
-  emitOptimizationRemark(PassName, L->getStartLoc(), L->getHeader(), Msg);
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkMissed(
-    const char *PassName, const DebugLoc &DLoc, const Value *V,
-    const Twine &Msg, bool IsVerbose) {
-  LLVMContext &Ctx = F->getContext();
-  if (!IsVerbose || shouldEmitVerbose())
-    Ctx.diagnose(
-        OptimizationRemarkMissed(PassName, *F, DLoc, Msg, computeHotness(V)));
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkMissed(
-    const char *PassName, Loop *L, const Twine &Msg, bool IsVerbose) {
-  emitOptimizationRemarkMissed(PassName, L->getStartLoc(), L->getHeader(), Msg,
-                               IsVerbose);
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkAnalysis(
-    const char *PassName, const DebugLoc &DLoc, const Value *V,
-    const Twine &Msg, bool IsVerbose) {
-  LLVMContext &Ctx = F->getContext();
-  if (!IsVerbose || shouldEmitVerbose())
-    Ctx.diagnose(
-        OptimizationRemarkAnalysis(PassName, *F, DLoc, Msg, computeHotness(V)));
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkAnalysis(
-    const char *PassName, Loop *L, const Twine &Msg, bool IsVerbose) {
-  emitOptimizationRemarkAnalysis(PassName, L->getStartLoc(), L->getHeader(),
-                                 Msg, IsVerbose);
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkAnalysisFPCommute(
-    const char *PassName, const DebugLoc &DLoc, const Value *V,
-    const Twine &Msg) {
-  LLVMContext &Ctx = F->getContext();
-  Ctx.diagnose(OptimizationRemarkAnalysisFPCommute(PassName, *F, DLoc, Msg,
-                                                   computeHotness(V)));
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkAnalysisAliasing(
-    const char *PassName, const DebugLoc &DLoc, const Value *V,
-    const Twine &Msg) {
-  LLVMContext &Ctx = F->getContext();
-  Ctx.diagnose(OptimizationRemarkAnalysisAliasing(PassName, *F, DLoc, Msg,
-                                                  computeHotness(V)));
-}
-
-void OptimizationRemarkEmitter::emitOptimizationRemarkAnalysisAliasing(
-    const char *PassName, Loop *L, const Twine &Msg) {
-  emitOptimizationRemarkAnalysisAliasing(PassName, L->getStartLoc(),
-                                         L->getHeader(), Msg);
-}
-
 OptimizationRemarkEmitterWrapperPass::OptimizationRemarkEmitterWrapperPass()
     : FunctionPass(ID) {
   initializeOptimizationRemarkEmitterWrapperPassPass(
diff --git a/lib/Analysis/ProfileSummaryInfo.cpp b/lib/Analysis/ProfileSummaryInfo.cpp
index 5ca1830333889d4e0b00d8d5eaa9944904d04435..1a53a8ed428377f3067459ee03ee06cd1063da11 100644
--- a/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/lib/Analysis/ProfileSummaryInfo.cpp
@@ -68,6 +68,23 @@ bool ProfileSummaryInfo::computeSummary() {
   return true;
 }
 
+Optional<uint64_t>
+ProfileSummaryInfo::getProfileCount(const Instruction *Inst,
+                                    BlockFrequencyInfo *BFI) {
+  if (!Inst)
+    return None;
+  assert((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) &&
+         "We can only get profile count for call/invoke instruction.");
+  // Check if there is a profile metadata on the instruction. If it is present,
+  // determine hotness solely based on that.
+  uint64_t TotalCount;
+  if (Inst->extractProfTotalWeight(TotalCount))
+    return TotalCount;
+  if (BFI)
+    return BFI->getBlockProfileCount(Inst->getParent());
+  return None;
+}
+
 /// Returns true if the function's entry is hot. If it returns false, it
 /// either means it is not hot or it is unknown whether it is hot or not (for
 /// example, no profile data is available).
@@ -81,15 +98,52 @@ bool ProfileSummaryInfo::isFunctionEntryHot(const Function *F) {
   return FunctionCount && isHotCount(FunctionCount.getValue());
 }
 
+/// Returns true if the function's entry or total call edge count is hot.
+/// If it returns false, it either means it is not hot or it is unknown
+/// whether it is hot or not (for example, no profile data is available).
+bool ProfileSummaryInfo::isFunctionHotInCallGraph(const Function *F) {
+  if (!F || !computeSummary())
+    return false;
+  if (auto FunctionCount = F->getEntryCount())
+    if (isHotCount(FunctionCount.getValue()))
+      return true;
+
+  uint64_t TotalCallCount = 0;
+  for (const auto &BB : *F)
+    for (const auto &I : BB)
+      if (isa<CallInst>(I) || isa<InvokeInst>(I))
+        if (auto CallCount = getProfileCount(&I, nullptr))
+          TotalCallCount += CallCount.getValue();
+  return isHotCount(TotalCallCount);
+}
+
+/// Returns true if the function's entry and total call edge count is cold.
+/// If it returns false, it either means it is not cold or it is unknown
+/// whether it is cold or not (for example, no profile data is available).
+bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F) {
+  if (!F || !computeSummary())
+    return false;
+  if (auto FunctionCount = F->getEntryCount())
+    if (!isColdCount(FunctionCount.getValue()))
+      return false;
+  
+  uint64_t TotalCallCount = 0;
+  for (const auto &BB : *F)
+    for (const auto &I : BB) 
+      if (isa<CallInst>(I) || isa<InvokeInst>(I))
+        if (auto CallCount = getProfileCount(&I, nullptr))
+          TotalCallCount += CallCount.getValue();
+  return isColdCount(TotalCallCount);
+}
+
 /// Returns true if the function's entry is a cold. If it returns false, it
 /// either means it is not cold or it is unknown whether it is cold or not (for
 /// example, no profile data is available).
 bool ProfileSummaryInfo::isFunctionEntryCold(const Function *F) {
   if (!F)
     return false;
-  if (F->hasFnAttribute(Attribute::Cold)) {
+  if (F->hasFnAttribute(Attribute::Cold))
     return true;
-  }
   if (!computeSummary())
     return false;
   auto FunctionCount = F->getEntryCount();
@@ -124,18 +178,7 @@ bool ProfileSummaryInfo::isColdCount(uint64_t C) {
 
 bool ProfileSummaryInfo::isHotBB(const BasicBlock *B, BlockFrequencyInfo *BFI) {
   auto Count = BFI->getBlockProfileCount(B);
-  if (Count && isHotCount(*Count))
-    return true;
-  // Use extractProfTotalWeight to get BB count.
-  // For Sample PGO, BFI may not provide accurate BB count due to errors
-  // magnified during sample count propagation. This serves as a backup plan
-  // to ensure all hot BB will not be missed.
-  // The query currently has false positives as branch instruction cloning does
-  // not update/scale branch weights. Unlike false negatives, this will not cause
-  // performance problem.
-  uint64_t TotalCount;
-  auto *TI = B->getTerminator();
-  return extractProfTotalWeight(TI, TotalCount) && isHotCount(TotalCount);
+  return Count && isHotCount(*Count);
 }
 
 bool ProfileSummaryInfo::isColdBB(const BasicBlock *B,
@@ -144,44 +187,16 @@ bool ProfileSummaryInfo::isColdBB(const BasicBlock *B,
   return Count && isColdCount(*Count);
 }
 
-bool ProfileSummaryInfo::extractProfTotalWeight(const Instruction *I,
-                                                uint64_t &TotalCount) {
-  if (!computeSummary())
-    return false;
-  // Use profile weight on metadata only for sample profiling where block counts
-  // could differ from the count of an instruction within the block.
-  if (Summary.get()->getKind() != ProfileSummary::PSK_Sample)
-    return false;
-
-  return (isa<CallInst>(I) ||
-          (isa<TerminatorInst>(I) && !isa<ReturnInst>(I))) &&
-         I->extractProfTotalWeight(TotalCount);
-}
-
 bool ProfileSummaryInfo::isHotCallSite(const CallSite &CS,
                                        BlockFrequencyInfo *BFI) {
-  auto *CallInst = CS.getInstruction();
-  if (!CS)
-    return false;
-  // Check if there is a profile metadata on the instruction. If it is present,
-  // determine hotness solely based on that.
-  uint64_t TotalCount;
-  if (extractProfTotalWeight(CallInst, TotalCount))
-    return isHotCount(TotalCount);
-  return BFI && isHotBB(CallInst->getParent(), BFI);
+  auto C = getProfileCount(CS.getInstruction(), BFI);
+  return C && isHotCount(*C);
 }
 
 bool ProfileSummaryInfo::isColdCallSite(const CallSite &CS,
                                         BlockFrequencyInfo *BFI) {
-  auto *CallInst = CS.getInstruction();
-  if (!CS)
-    return false;
-  // Check if there is a profile metadata on the instruction. If it is present,
-  // and tells that the callsite is not cold, then return false;
-  uint64_t TotalCount;
-  if (extractProfTotalWeight(CallInst, TotalCount) && !isColdCount(TotalCount))
-    return false;
-  return BFI && isColdBB(CallInst->getParent(), BFI);
+  auto C = getProfileCount(CS.getInstruction(), BFI);
+  return C && isColdCount(*C);
 }
 
 INITIALIZE_PASS(ProfileSummaryInfoWrapperPass, "profile-summary-info",
diff --git a/lib/Analysis/RegionPass.cpp b/lib/Analysis/RegionPass.cpp
index 7358aa6810a1f0ed339be12bea70cf6a299d90a2..82107cb1802518383c7b2edfab87f37322899194 100644
--- a/lib/Analysis/RegionPass.cpp
+++ b/lib/Analysis/RegionPass.cpp
@@ -206,6 +206,8 @@ public:
 
     return false;
   }
+
+  StringRef getPassName() const override { return "Print Region IR"; }
 };
 
 char PrintRegionPass::ID = 0;
diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp
index 7116d3a9e54018d4adfdb2362f56c4397b7dab9f..ca32cf3c7c34292d9d4b7429e832b64f862ff83b 100644
--- a/lib/Analysis/ScalarEvolution.cpp
+++ b/lib/Analysis/ScalarEvolution.cpp
@@ -132,9 +132,24 @@ static cl::opt<unsigned> AddOpsInlineThreshold(
     cl::desc("Threshold for inlining multiplication operands into a SCEV"),
     cl::init(500));
 
+static cl::opt<unsigned> MaxSCEVCompareDepth(
+    "scalar-evolution-max-scev-compare-depth", cl::Hidden,
+    cl::desc("Maximum depth of recursive SCEV complexity comparisons"),
+    cl::init(32));
+
+static cl::opt<unsigned> MaxSCEVOperationsImplicationDepth(
+    "scalar-evolution-max-scev-operations-implication-depth", cl::Hidden,
+    cl::desc("Maximum depth of recursive SCEV operations implication analysis"),
+    cl::init(2));
+
+static cl::opt<unsigned> MaxValueCompareDepth(
+    "scalar-evolution-max-value-compare-depth", cl::Hidden,
+    cl::desc("Maximum depth of recursive value complexity comparisons"),
+    cl::init(2));
+
 static cl::opt<unsigned>
-    MaxCompareDepth("scalar-evolution-max-compare-depth", cl::Hidden,
-                    cl::desc("Maximum depth of recursive compare complexity"),
+    MaxAddExprDepth("scalar-evolution-max-addexpr-depth", cl::Hidden,
+                    cl::desc("Maximum depth of recursive AddExpr"),
                     cl::init(32));
 
 static cl::opt<unsigned> MaxConstantEvolvingDepth(
@@ -491,7 +506,7 @@ static int
 CompareValueComplexity(SmallSet<std::pair<Value *, Value *>, 8> &EqCache,
                        const LoopInfo *const LI, Value *LV, Value *RV,
                        unsigned Depth) {
-  if (Depth > MaxCompareDepth || EqCache.count({LV, RV}))
+  if (Depth > MaxValueCompareDepth || EqCache.count({LV, RV}))
     return 0;
 
   // Order pointer values after integer values. This helps SCEVExpander form
@@ -578,7 +593,7 @@ static int CompareSCEVComplexity(
   if (LType != RType)
     return (int)LType - (int)RType;
 
-  if (Depth > MaxCompareDepth || EqCacheSCEV.count({LHS, RHS}))
+  if (Depth > MaxSCEVCompareDepth || EqCacheSCEV.count({LHS, RHS}))
     return 0;
   // Aside from the getSCEVType() ordering, the particular ordering
   // isn't very important except that it's beneficial to be consistent,
@@ -2100,7 +2115,8 @@ StrengthenNoWrapFlags(ScalarEvolution *SE, SCEVTypes Type,
 
 /// Get a canonical add expression, or something simpler if possible.
 const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
-                                        SCEV::NoWrapFlags Flags) {
+                                        SCEV::NoWrapFlags Flags,
+                                        unsigned Depth) {
   assert(!(Flags & ~(SCEV::FlagNUW | SCEV::FlagNSW)) &&
          "only nuw or nsw allowed");
   assert(!Ops.empty() && "Cannot get empty add!");
@@ -2139,6 +2155,10 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
     if (Ops.size() == 1) return Ops[0];
   }
 
+  // Limit the depth of recursive calls.
+  if (Depth > MaxAddExprDepth)
+    return getOrCreateAddExpr(Ops, Flags);
+
   // Okay, check to see if the same value occurs in the operand list more than
   // once.  If so, merge them together into an multiply expression.  Since we
   // sorted the list, these values are required to be adjacent.
@@ -2210,7 +2230,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
     }
     if (Ok) {
       // Evaluate the expression in the larger type.
-      const SCEV *Fold = getAddExpr(LargeOps, Flags);
+      const SCEV *Fold = getAddExpr(LargeOps, Flags, Depth + 1);
       // If it folds to something simple, use it. Otherwise, don't.
       if (isa<SCEVConstant>(Fold) || isa<SCEVUnknown>(Fold))
         return getTruncateExpr(Fold, DstType);
@@ -2239,7 +2259,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
     // and they are not necessarily sorted.  Recurse to resort and resimplify
     // any operands we just acquired.
     if (DeletedAdd)
-      return getAddExpr(Ops);
+      return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
   }
 
   // Skip over the add expression until we get to a multiply.
@@ -2274,13 +2294,14 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
         Ops.push_back(getConstant(AccumulatedConstant));
       for (auto &MulOp : MulOpLists)
         if (MulOp.first != 0)
-          Ops.push_back(getMulExpr(getConstant(MulOp.first),
-                                   getAddExpr(MulOp.second)));
+          Ops.push_back(getMulExpr(
+              getConstant(MulOp.first),
+              getAddExpr(MulOp.second, SCEV::FlagAnyWrap, Depth + 1)));
       if (Ops.empty())
         return getZero(Ty);
       if (Ops.size() == 1)
         return Ops[0];
-      return getAddExpr(Ops);
+      return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
     }
   }
 
@@ -2305,8 +2326,8 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
             MulOps.append(Mul->op_begin()+MulOp+1, Mul->op_end());
             InnerMul = getMulExpr(MulOps);
           }
-          const SCEV *One = getOne(Ty);
-          const SCEV *AddOne = getAddExpr(One, InnerMul);
+          SmallVector<const SCEV *, 2> TwoOps = {getOne(Ty), InnerMul};
+          const SCEV *AddOne = getAddExpr(TwoOps, SCEV::FlagAnyWrap, Depth + 1);
           const SCEV *OuterMul = getMulExpr(AddOne, MulOpSCEV);
           if (Ops.size() == 2) return OuterMul;
           if (AddOp < Idx) {
@@ -2317,7 +2338,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
             Ops.erase(Ops.begin()+AddOp-1);
           }
           Ops.push_back(OuterMul);
-          return getAddExpr(Ops);
+          return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
         }
 
       // Check this multiply against other multiplies being added together.
@@ -2345,13 +2366,15 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
               MulOps.append(OtherMul->op_begin()+OMulOp+1, OtherMul->op_end());
               InnerMul2 = getMulExpr(MulOps);
             }
-            const SCEV *InnerMulSum = getAddExpr(InnerMul1,InnerMul2);
+            SmallVector<const SCEV *, 2> TwoOps = {InnerMul1, InnerMul2};
+            const SCEV *InnerMulSum =
+                getAddExpr(TwoOps, SCEV::FlagAnyWrap, Depth + 1);
             const SCEV *OuterMul = getMulExpr(MulOpSCEV, InnerMulSum);
             if (Ops.size() == 2) return OuterMul;
             Ops.erase(Ops.begin()+Idx);
             Ops.erase(Ops.begin()+OtherMulIdx-1);
             Ops.push_back(OuterMul);
-            return getAddExpr(Ops);
+            return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
           }
       }
     }
@@ -2387,7 +2410,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
       // This follows from the fact that the no-wrap flags on the outer add
       // expression are applicable on the 0th iteration, when the add recurrence
       // will be equal to its start value.
-      AddRecOps[0] = getAddExpr(LIOps, Flags);
+      AddRecOps[0] = getAddExpr(LIOps, Flags, Depth + 1);
 
       // Build the new addrec. Propagate the NUW and NSW flags if both the
       // outer add and the inner addrec are guaranteed to have no overflow.
@@ -2404,7 +2427,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
           Ops[i] = NewRec;
           break;
         }
-      return getAddExpr(Ops);
+      return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
     }
 
     // Okay, if there weren't any loop invariants to be folded, check to see if
@@ -2428,14 +2451,15 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
                                    OtherAddRec->op_end());
                   break;
                 }
-                AddRecOps[i] = getAddExpr(AddRecOps[i],
-                                          OtherAddRec->getOperand(i));
+                SmallVector<const SCEV *, 2> TwoOps = {
+                    AddRecOps[i], OtherAddRec->getOperand(i)};
+                AddRecOps[i] = getAddExpr(TwoOps, SCEV::FlagAnyWrap, Depth + 1);
               }
               Ops.erase(Ops.begin() + OtherIdx); --OtherIdx;
             }
         // Step size has changed, so we cannot guarantee no self-wraparound.
         Ops[Idx] = getAddRecExpr(AddRecOps, AddRecLoop, SCEV::FlagAnyWrap);
-        return getAddExpr(Ops);
+        return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1);
       }
 
     // Otherwise couldn't fold anything into this recurrence.  Move onto the
@@ -2444,18 +2468,24 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
 
   // Okay, it looks like we really DO need an add expr.  Check to see if we
   // already have one, otherwise create a new one.
+  return getOrCreateAddExpr(Ops, Flags);
+}
+
+const SCEV *
+ScalarEvolution::getOrCreateAddExpr(SmallVectorImpl<const SCEV *> &Ops,
+                                    SCEV::NoWrapFlags Flags) {
   FoldingSetNodeID ID;
   ID.AddInteger(scAddExpr);
   for (unsigned i = 0, e = Ops.size(); i != e; ++i)
     ID.AddPointer(Ops[i]);
   void *IP = nullptr;
   SCEVAddExpr *S =
-    static_cast<SCEVAddExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
+      static_cast<SCEVAddExpr *>(UniqueSCEVs.FindNodeOrInsertPos(ID, IP));
   if (!S) {
     const SCEV **O = SCEVAllocator.Allocate<const SCEV *>(Ops.size());
     std::uninitialized_copy(Ops.begin(), Ops.end(), O);
-    S = new (SCEVAllocator) SCEVAddExpr(ID.Intern(SCEVAllocator),
-                                        O, Ops.size());
+    S = new (SCEVAllocator)
+        SCEVAddExpr(ID.Intern(SCEVAllocator), O, Ops.size());
     UniqueSCEVs.InsertNode(S, IP);
   }
   S->setNoWrapFlags(Flags);
@@ -3393,6 +3423,10 @@ Type *ScalarEvolution::getEffectiveSCEVType(Type *Ty) const {
   return getDataLayout().getIntPtrType(Ty);
 }
 
+Type *ScalarEvolution::getWiderType(Type *T1, Type *T2) const {
+  return  getTypeSizeInBits(T1) >= getTypeSizeInBits(T2) ? T1 : T2;
+}
+
 const SCEV *ScalarEvolution::getCouldNotCompute() {
   return CouldNotCompute.get();
 }
@@ -4417,8 +4451,7 @@ const SCEV *ScalarEvolution::createNodeForGEP(GEPOperator *GEP) {
   return getGEPExpr(GEP, IndexExprs);
 }
 
-uint32_t
-ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
+uint32_t ScalarEvolution::GetMinTrailingZerosImpl(const SCEV *S) {
   if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S))
     return C->getAPInt().countTrailingZeros();
 
@@ -4428,14 +4461,16 @@ ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
 
   if (const SCEVZeroExtendExpr *E = dyn_cast<SCEVZeroExtendExpr>(S)) {
     uint32_t OpRes = GetMinTrailingZeros(E->getOperand());
-    return OpRes == getTypeSizeInBits(E->getOperand()->getType()) ?
-             getTypeSizeInBits(E->getType()) : OpRes;
+    return OpRes == getTypeSizeInBits(E->getOperand()->getType())
+               ? getTypeSizeInBits(E->getType())
+               : OpRes;
   }
 
   if (const SCEVSignExtendExpr *E = dyn_cast<SCEVSignExtendExpr>(S)) {
     uint32_t OpRes = GetMinTrailingZeros(E->getOperand());
-    return OpRes == getTypeSizeInBits(E->getOperand()->getType()) ?
-             getTypeSizeInBits(E->getType()) : OpRes;
+    return OpRes == getTypeSizeInBits(E->getOperand()->getType())
+               ? getTypeSizeInBits(E->getType())
+               : OpRes;
   }
 
   if (const SCEVAddExpr *A = dyn_cast<SCEVAddExpr>(S)) {
@@ -4452,8 +4487,8 @@ ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
     uint32_t BitWidth = getTypeSizeInBits(M->getType());
     for (unsigned i = 1, e = M->getNumOperands();
          SumOpRes != BitWidth && i != e; ++i)
-      SumOpRes = std::min(SumOpRes + GetMinTrailingZeros(M->getOperand(i)),
-                          BitWidth);
+      SumOpRes =
+          std::min(SumOpRes + GetMinTrailingZeros(M->getOperand(i)), BitWidth);
     return SumOpRes;
   }
 
@@ -4494,6 +4529,17 @@ ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
   return 0;
 }
 
+uint32_t ScalarEvolution::GetMinTrailingZeros(const SCEV *S) {
+  auto I = MinTrailingZerosCache.find(S);
+  if (I != MinTrailingZerosCache.end())
+    return I->second;
+
+  uint32_t Result = GetMinTrailingZerosImpl(S);
+  auto InsertPair = MinTrailingZerosCache.insert({S, Result});
+  assert(InsertPair.second && "Should insert a new key");
+  return InsertPair.first->second;
+}
+
 /// Helper method to assign a range to V from metadata present in the IR.
 static Optional<ConstantRange> GetRangeFromMetadata(Value *V) {
   if (Instruction *I = dyn_cast<Instruction>(V))
@@ -4676,6 +4722,77 @@ ScalarEvolution::getRange(const SCEV *S,
   return setRange(S, SignHint, ConservativeResult);
 }
 
+// Given a StartRange, Step and MaxBECount for an expression compute a range of
+// values that the expression can take. Initially, the expression has a value
+// from StartRange and then is changed by Step up to MaxBECount times. Signed
+// argument defines if we treat Step as signed or unsigned.
+static ConstantRange getRangeForAffineARHelper(APInt Step,
+                                               ConstantRange StartRange,
+                                               APInt MaxBECount,
+                                               unsigned BitWidth, bool Signed) {
+  // If either Step or MaxBECount is 0, then the expression won't change, and we
+  // just need to return the initial range.
+  if (Step == 0 || MaxBECount == 0)
+    return StartRange;
+
+  // If we don't know anything about the initial value (i.e. StartRange is
+  // FullRange), then we don't know anything about the final range either.
+  // Return FullRange.
+  if (StartRange.isFullSet())
+    return ConstantRange(BitWidth, /* isFullSet = */ true);
+
+  // If Step is signed and negative, then we use its absolute value, but we also
+  // note that we're moving in the opposite direction.
+  bool Descending = Signed && Step.isNegative();
+
+  if (Signed)
+    // This is correct even for INT_SMIN. Let's look at i8 to illustrate this:
+    // abs(INT_SMIN) = abs(-128) = abs(0x80) = -0x80 = 0x80 = 128.
+    // These equations hold true due to the well-defined wrap-around behavior
+    // of APInt.
+    Step = Step.abs();
+
+  // Check if Offset is more than full span of BitWidth. If it is, the
+  // expression is guaranteed to overflow.
+  if (APInt::getMaxValue(StartRange.getBitWidth()).udiv(Step).ult(MaxBECount))
+    return ConstantRange(BitWidth, /* isFullSet = */ true);
+
+  // Offset is by how much the expression can change. Checks above guarantee no
+  // overflow here.
+  APInt Offset = Step * MaxBECount;
+
+  // Minimum value of the final range will match the minimal value of StartRange
+  // if the expression is increasing and will be decreased by Offset otherwise.
+  // Maximum value of the final range will match the maximal value of StartRange
+  // if the expression is decreasing and will be increased by Offset otherwise.
+  APInt StartLower = StartRange.getLower();
+  APInt StartUpper = StartRange.getUpper() - 1;
+  APInt MovedBoundary =
+      Descending ? (StartLower - Offset) : (StartUpper + Offset);
+
+  // It's possible that the new minimum/maximum value will fall into the initial
+  // range (due to wrap around). This means that the expression can take any
+  // value in this bitwidth, and we have to return full range.
+  if (StartRange.contains(MovedBoundary))
+    return ConstantRange(BitWidth, /* isFullSet = */ true);
+
+  APInt NewLower, NewUpper;
+  if (Descending) {
+    NewLower = MovedBoundary;
+    NewUpper = StartUpper;
+  } else {
+    NewLower = StartLower;
+    NewUpper = MovedBoundary;
+  }
+
+  // If we end up with full range, return a proper full range.
+  if (NewLower == NewUpper + 1)
+    return ConstantRange(BitWidth, /* isFullSet = */ true);
+
+  // No overflow detected, return the [NewLower, NewUpper + 1) range.
+  return ConstantRange(NewLower, NewUpper + 1);
+}
+
 ConstantRange ScalarEvolution::getRangeForAffineAR(const SCEV *Start,
                                                    const SCEV *Step,
                                                    const SCEV *MaxBECount,
@@ -4684,60 +4801,30 @@ ConstantRange ScalarEvolution::getRangeForAffineAR(const SCEV *Start,
          getTypeSizeInBits(MaxBECount->getType()) <= BitWidth &&
          "Precondition!");
 
-  ConstantRange Result(BitWidth, /* isFullSet = */ true);
-
-  // Check for overflow.  This must be done with ConstantRange arithmetic
-  // because we could be called from within the ScalarEvolution overflow
-  // checking code.
-
   MaxBECount = getNoopOrZeroExtend(MaxBECount, Start->getType());
   ConstantRange MaxBECountRange = getUnsignedRange(MaxBECount);
-  ConstantRange ZExtMaxBECountRange = MaxBECountRange.zextOrTrunc(BitWidth * 2);
+  APInt MaxBECountValue = MaxBECountRange.getUnsignedMax();
 
+  // First, consider step signed.
+  ConstantRange StartSRange = getSignedRange(Start);
   ConstantRange StepSRange = getSignedRange(Step);
-  ConstantRange SExtStepSRange = StepSRange.sextOrTrunc(BitWidth * 2);
-
-  ConstantRange StartURange = getUnsignedRange(Start);
-  ConstantRange EndURange =
-      StartURange.add(MaxBECountRange.multiply(StepSRange));
-
-  // Check for unsigned overflow.
-  ConstantRange ZExtStartURange = StartURange.zextOrTrunc(BitWidth * 2);
-  ConstantRange ZExtEndURange = EndURange.zextOrTrunc(BitWidth * 2);
-  if (ZExtStartURange.add(ZExtMaxBECountRange.multiply(SExtStepSRange)) ==
-      ZExtEndURange) {
-    APInt Min = APIntOps::umin(StartURange.getUnsignedMin(),
-                               EndURange.getUnsignedMin());
-    APInt Max = APIntOps::umax(StartURange.getUnsignedMax(),
-                               EndURange.getUnsignedMax());
-    bool IsFullRange = Min.isMinValue() && Max.isMaxValue();
-    if (!IsFullRange)
-      Result =
-          Result.intersectWith(ConstantRange(Min, Max + 1));
-  }
 
-  ConstantRange StartSRange = getSignedRange(Start);
-  ConstantRange EndSRange =
-      StartSRange.add(MaxBECountRange.multiply(StepSRange));
-
-  // Check for signed overflow. This must be done with ConstantRange
-  // arithmetic because we could be called from within the ScalarEvolution
-  // overflow checking code.
-  ConstantRange SExtStartSRange = StartSRange.sextOrTrunc(BitWidth * 2);
-  ConstantRange SExtEndSRange = EndSRange.sextOrTrunc(BitWidth * 2);
-  if (SExtStartSRange.add(ZExtMaxBECountRange.multiply(SExtStepSRange)) ==
-      SExtEndSRange) {
-    APInt Min =
-        APIntOps::smin(StartSRange.getSignedMin(), EndSRange.getSignedMin());
-    APInt Max =
-        APIntOps::smax(StartSRange.getSignedMax(), EndSRange.getSignedMax());
-    bool IsFullRange = Min.isMinSignedValue() && Max.isMaxSignedValue();
-    if (!IsFullRange)
-      Result =
-          Result.intersectWith(ConstantRange(Min, Max + 1));
-  }
+  // If Step can be both positive and negative, we need to find ranges for the
+  // maximum absolute step values in both directions and union them.
+  ConstantRange SR =
+      getRangeForAffineARHelper(StepSRange.getSignedMin(), StartSRange,
+                                MaxBECountValue, BitWidth, /* Signed = */ true);
+  SR = SR.unionWith(getRangeForAffineARHelper(StepSRange.getSignedMax(),
+                                              StartSRange, MaxBECountValue,
+                                              BitWidth, /* Signed = */ true));
 
-  return Result;
+  // Next, consider step unsigned.
+  ConstantRange UR = getRangeForAffineARHelper(
+      getUnsignedRange(Step).getUnsignedMax(), getUnsignedRange(Start),
+      MaxBECountValue, BitWidth, /* Signed = */ false);
+
+  // Finally, intersect signed and unsigned ranges.
+  return SR.intersectWith(UR);
 }
 
 ConstantRange ScalarEvolution::getRangeViaFactoring(const SCEV *Start,
@@ -5234,7 +5321,7 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
                 // If C is a low-bits mask, the zero extend is serving to
                 // mask off the high bits. Complement the operand and
                 // re-apply the zext.
-                if (APIntOps::isMask(Z0TySize, CI->getValue()))
+                if (CI->getValue().isMask(Z0TySize))
                   return getZeroExtendExpr(getNotSCEV(Z0), UTy);
 
                 // If C is a single bit, it may be in the sign-bit position
@@ -5278,28 +5365,55 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
     break;
 
     case Instruction::AShr:
-      // For a two-shift sext-inreg, use sext(trunc(x)) as the SCEV expression.
-      if (ConstantInt *CI = dyn_cast<ConstantInt>(BO->RHS))
-        if (Operator *L = dyn_cast<Operator>(BO->LHS))
-          if (L->getOpcode() == Instruction::Shl &&
-              L->getOperand(1) == BO->RHS) {
-            uint64_t BitWidth = getTypeSizeInBits(BO->LHS->getType());
-
-            // If the shift count is not less than the bitwidth, the result of
-            // the shift is undefined. Don't try to analyze it, because the
-            // resolution chosen here may differ from the resolution chosen in
-            // other parts of the compiler.
-            if (CI->getValue().uge(BitWidth))
-              break;
+      // AShr X, C, where C is a constant.
+      ConstantInt *CI = dyn_cast<ConstantInt>(BO->RHS);
+      if (!CI)
+        break;
+
+      Type *OuterTy = BO->LHS->getType();
+      uint64_t BitWidth = getTypeSizeInBits(OuterTy);
+      // If the shift count is not less than the bitwidth, the result of
+      // the shift is undefined. Don't try to analyze it, because the
+      // resolution chosen here may differ from the resolution chosen in
+      // other parts of the compiler.
+      if (CI->getValue().uge(BitWidth))
+        break;
 
-            uint64_t Amt = BitWidth - CI->getZExtValue();
-            if (Amt == BitWidth)
-              return getSCEV(L->getOperand(0)); // shift by zero --> noop
+      if (CI->isNullValue())
+        return getSCEV(BO->LHS); // shift by zero --> noop
+
+      uint64_t AShrAmt = CI->getZExtValue();
+      Type *TruncTy = IntegerType::get(getContext(), BitWidth - AShrAmt);
+
+      Operator *L = dyn_cast<Operator>(BO->LHS);
+      if (L && L->getOpcode() == Instruction::Shl) {
+        // X = Shl A, n
+        // Y = AShr X, m
+        // Both n and m are constant.
+
+        const SCEV *ShlOp0SCEV = getSCEV(L->getOperand(0));
+        if (L->getOperand(1) == BO->RHS)
+          // For a two-shift sext-inreg, i.e. n = m,
+          // use sext(trunc(x)) as the SCEV expression.
+          return getSignExtendExpr(
+              getTruncateExpr(ShlOp0SCEV, TruncTy), OuterTy);
+
+        ConstantInt *ShlAmtCI = dyn_cast<ConstantInt>(L->getOperand(1));
+        if (ShlAmtCI && ShlAmtCI->getValue().ult(BitWidth)) {
+          uint64_t ShlAmt = ShlAmtCI->getZExtValue();
+          if (ShlAmt > AShrAmt) {
+            // When n > m, use sext(mul(trunc(x), 2^(n-m))) as the SCEV
+            // expression. We already checked that ShlAmt < BitWidth, so
+            // the multiplier, 1 << (ShlAmt - AShrAmt), fits into TruncTy as
+            // ShlAmt - AShrAmt < BitWidth - AShrAmt.
+            APInt Mul = APInt::getOneBitSet(BitWidth - AShrAmt,
+                                            ShlAmt - AShrAmt);
             return getSignExtendExpr(
-                getTruncateExpr(getSCEV(L->getOperand(0)),
-                                IntegerType::get(getContext(), Amt)),
-                BO->LHS->getType());
+                getMulExpr(getTruncateExpr(ShlOp0SCEV, TruncTy),
+                getConstant(Mul)), OuterTy);
           }
+        }
+      }
       break;
     }
   }
@@ -5371,7 +5485,7 @@ static unsigned getConstantTripCount(const SCEVConstant *ExitCount) {
   return ((unsigned)ExitConst->getZExtValue()) + 1;
 }
 
-unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L) {
+unsigned ScalarEvolution::getSmallConstantTripCount(const Loop *L) {
   if (BasicBlock *ExitingBB = L->getExitingBlock())
     return getSmallConstantTripCount(L, ExitingBB);
 
@@ -5379,7 +5493,7 @@ unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L) {
   return 0;
 }
 
-unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L,
+unsigned ScalarEvolution::getSmallConstantTripCount(const Loop *L,
                                                     BasicBlock *ExitingBlock) {
   assert(ExitingBlock && "Must pass a non-null exiting block!");
   assert(L->isLoopExiting(ExitingBlock) &&
@@ -5389,13 +5503,13 @@ unsigned ScalarEvolution::getSmallConstantTripCount(Loop *L,
   return getConstantTripCount(ExitCount);
 }
 
-unsigned ScalarEvolution::getSmallConstantMaxTripCount(Loop *L) {
+unsigned ScalarEvolution::getSmallConstantMaxTripCount(const Loop *L) {
   const auto *MaxExitCount =
       dyn_cast<SCEVConstant>(getMaxBackedgeTakenCount(L));
   return getConstantTripCount(MaxExitCount);
 }
 
-unsigned ScalarEvolution::getSmallConstantTripMultiple(Loop *L) {
+unsigned ScalarEvolution::getSmallConstantTripMultiple(const Loop *L) {
   if (BasicBlock *ExitingBB = L->getExitingBlock())
     return getSmallConstantTripMultiple(L, ExitingBB);
 
@@ -5416,7 +5530,7 @@ unsigned ScalarEvolution::getSmallConstantTripMultiple(Loop *L) {
 /// As explained in the comments for getSmallConstantTripCount, this assumes
 /// that control exits the loop via ExitingBlock.
 unsigned
-ScalarEvolution::getSmallConstantTripMultiple(Loop *L,
+ScalarEvolution::getSmallConstantTripMultiple(const Loop *L,
                                               BasicBlock *ExitingBlock) {
   assert(ExitingBlock && "Must pass a non-null exiting block!");
   assert(L->isLoopExiting(ExitingBlock) &&
@@ -5426,17 +5540,16 @@ ScalarEvolution::getSmallConstantTripMultiple(Loop *L,
     return 1;
 
   // Get the trip count from the BE count by adding 1.
-  const SCEV *TCMul = getAddExpr(ExitCount, getOne(ExitCount->getType()));
-  // FIXME: SCEV distributes multiplication as V1*C1 + V2*C1. We could attempt
-  // to factor simple cases.
-  if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(TCMul))
-    TCMul = Mul->getOperand(0);
-
-  const SCEVConstant *MulC = dyn_cast<SCEVConstant>(TCMul);
-  if (!MulC)
-    return 1;
+  const SCEV *TCExpr = getAddExpr(ExitCount, getOne(ExitCount->getType()));
+
+  const SCEVConstant *TC = dyn_cast<SCEVConstant>(TCExpr);
+  if (!TC)
+    // Attempt to factor more general cases. Returns the greatest power of
+    // two divisor. If overflow happens, the trip count expression is still
+    // divisible by the greatest power of 2 divisor returned.
+    return 1U << std::min((uint32_t)31, GetMinTrailingZeros(TCExpr));
 
-  ConstantInt *Result = MulC->getValue();
+  ConstantInt *Result = TC->getValue();
 
   // Guard against huge trip counts (this requires checking
   // for zero to handle the case where the trip count == -1 and the
@@ -5451,7 +5564,8 @@ ScalarEvolution::getSmallConstantTripMultiple(Loop *L,
 /// Get the expression for the number of loop iterations for which this loop is
 /// guaranteed not to exit via ExitingBlock. Otherwise return
 /// SCEVCouldNotCompute.
-const SCEV *ScalarEvolution::getExitCount(Loop *L, BasicBlock *ExitingBlock) {
+const SCEV *ScalarEvolution::getExitCount(const Loop *L,
+                                          BasicBlock *ExitingBlock) {
   return getBackedgeTakenInfo(L).getExact(ExitingBlock, this);
 }
 
@@ -7107,7 +7221,7 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) {
     // Convert from chrec coefficients to polynomial coefficients AX^2+BX+C
     // The B coefficient is M-N/2
     APInt B(M);
-    B -= sdiv(N,Two);
+    B -= N.sdiv(Two);
 
     // The A coefficient is N/2
     APInt A(N.sdiv(Two));
@@ -8454,19 +8568,161 @@ static bool IsKnownPredicateViaMinOrMax(ScalarEvolution &SE,
   llvm_unreachable("covered switch fell through?!");
 }
 
+bool ScalarEvolution::isImpliedViaOperations(ICmpInst::Predicate Pred,
+                                             const SCEV *LHS, const SCEV *RHS,
+                                             const SCEV *FoundLHS,
+                                             const SCEV *FoundRHS,
+                                             unsigned Depth) {
+  assert(getTypeSizeInBits(LHS->getType()) ==
+             getTypeSizeInBits(RHS->getType()) &&
+         "LHS and RHS have different sizes?");
+  assert(getTypeSizeInBits(FoundLHS->getType()) ==
+             getTypeSizeInBits(FoundRHS->getType()) &&
+         "FoundLHS and FoundRHS have different sizes?");
+  // We want to avoid hurting the compile time with analysis of too big trees.
+  if (Depth > MaxSCEVOperationsImplicationDepth)
+    return false;
+  // We only want to work with ICMP_SGT comparison so far.
+  // TODO: Extend to ICMP_UGT?
+  if (Pred == ICmpInst::ICMP_SLT) {
+    Pred = ICmpInst::ICMP_SGT;
+    std::swap(LHS, RHS);
+    std::swap(FoundLHS, FoundRHS);
+  }
+  if (Pred != ICmpInst::ICMP_SGT)
+    return false;
+
+  auto GetOpFromSExt = [&](const SCEV *S) {
+    if (auto *Ext = dyn_cast<SCEVSignExtendExpr>(S))
+      return Ext->getOperand();
+    // TODO: If S is a SCEVConstant then you can cheaply "strip" the sext off
+    // the constant in some cases.
+    return S;
+  };
+
+  // Acquire values from extensions.
+  auto *OrigFoundLHS = FoundLHS;
+  LHS = GetOpFromSExt(LHS);
+  FoundLHS = GetOpFromSExt(FoundLHS);
+
+  // Checks whether SGT can be proved trivially or using the found context.
+  auto IsSGTViaContext = [&](const SCEV *S1, const SCEV *S2) {
+    return isKnownViaSimpleReasoning(ICmpInst::ICMP_SGT, S1, S2) ||
+           isImpliedViaOperations(ICmpInst::ICMP_SGT, S1, S2, OrigFoundLHS,
+                                  FoundRHS, Depth + 1);
+  };
+
+  if (auto *LHSAddExpr = dyn_cast<SCEVAddExpr>(LHS)) {
+    // We want to avoid creation of any new non-constant SCEV. Since we are
+    // going to compare the operands to RHS, we should be certain that we don't
+    // need any size extensions for this. So let's decline all cases when the
+    // sizes of types of LHS and RHS do not match.
+    // TODO: Maybe try to get RHS from sext to catch more cases?
+    if (getTypeSizeInBits(LHS->getType()) != getTypeSizeInBits(RHS->getType()))
+      return false;
+
+    // Should not overflow.
+    if (!LHSAddExpr->hasNoSignedWrap())
+      return false;
+
+    auto *LL = LHSAddExpr->getOperand(0);
+    auto *LR = LHSAddExpr->getOperand(1);
+    auto *MinusOne = getNegativeSCEV(getOne(RHS->getType()));
+
+    // Checks that S1 >= 0 && S2 > RHS, trivially or using the found context.
+    auto IsSumGreaterThanRHS = [&](const SCEV *S1, const SCEV *S2) {
+      return IsSGTViaContext(S1, MinusOne) && IsSGTViaContext(S2, RHS);
+    };
+    // Try to prove the following rule:
+    // (LHS = LL + LR) && (LL >= 0) && (LR > RHS) => (LHS > RHS).
+    // (LHS = LL + LR) && (LR >= 0) && (LL > RHS) => (LHS > RHS).
+    if (IsSumGreaterThanRHS(LL, LR) || IsSumGreaterThanRHS(LR, LL))
+      return true;
+  } else if (auto *LHSUnknownExpr = dyn_cast<SCEVUnknown>(LHS)) {
+    Value *LL, *LR;
+    // FIXME: Once we have SDiv implemented, we can get rid of this matching.
+    using namespace llvm::PatternMatch;
+    if (match(LHSUnknownExpr->getValue(), m_SDiv(m_Value(LL), m_Value(LR)))) {
+      // Rules for division.
+      // We are going to perform some comparisons with Denominator and its
+      // derivative expressions. In general case, creating a SCEV for it may
+      // lead to a complex analysis of the entire graph, and in particular it
+      // can request trip count recalculation for the same loop. That would be
+      // cached as SCEVCouldNotCompute to avoid infinite recursion. To avoid
+      // this, we only want to create SCEVs that are constants in this section.
+      // So we bail if Denominator is not a constant.
+      if (!isa<ConstantInt>(LR))
+        return false;
+
+      auto *Denominator = cast<SCEVConstant>(getSCEV(LR));
+
+      // We want to make sure that LHS = FoundLHS / Denominator. If it is so,
+      // then a SCEV for the numerator already exists and matches with FoundLHS.
+      auto *Numerator = getExistingSCEV(LL);
+      if (!Numerator || Numerator->getType() != FoundLHS->getType())
+        return false;
+
+      // Make sure that the numerator matches with FoundLHS and the denominator
+      // is positive.
+      if (!HasSameValue(Numerator, FoundLHS) || !isKnownPositive(Denominator))
+        return false;
+
+      auto *DTy = Denominator->getType();
+      auto *FRHSTy = FoundRHS->getType();
+      if (DTy->isPointerTy() != FRHSTy->isPointerTy())
+        // One of types is a pointer and another one is not. We cannot extend
+        // them properly to a wider type, so let us just reject this case.
+        // TODO: Usage of getEffectiveSCEVType for DTy, FRHSTy etc should help
+        // to avoid this check.
+        return false;
+
+      // Given that:
+      // FoundLHS > FoundRHS, LHS = FoundLHS / Denominator, Denominator > 0.
+      auto *WTy = getWiderType(DTy, FRHSTy);
+      auto *DenominatorExt = getNoopOrSignExtend(Denominator, WTy);
+      auto *FoundRHSExt = getNoopOrSignExtend(FoundRHS, WTy);
+
+      // Try to prove the following rule:
+      // (FoundRHS > Denominator - 2) && (RHS <= 0) => (LHS > RHS).
+      // For example, given that FoundLHS > 2. It means that FoundLHS is at
+      // least 3. If we divide it by Denominator < 4, we will have at least 1.
+      auto *DenomMinusTwo = getMinusSCEV(DenominatorExt, getConstant(WTy, 2));
+      if (isKnownNonPositive(RHS) &&
+          IsSGTViaContext(FoundRHSExt, DenomMinusTwo))
+        return true;
+
+      // Try to prove the following rule:
+      // (FoundRHS > -1 - Denominator) && (RHS < 0) => (LHS > RHS).
+      // For example, given that FoundLHS > -3. Then FoundLHS is at least -2.
+      // If we divide it by Denominator > 2, then:
+      // 1. If FoundLHS is negative, then the result is 0.
+      // 2. If FoundLHS is non-negative, then the result is non-negative.
+      // Anyways, the result is non-negative.
+      auto *MinusOne = getNegativeSCEV(getOne(WTy));
+      auto *NegDenomMinusOne = getMinusSCEV(MinusOne, DenominatorExt);
+      if (isKnownNegative(RHS) &&
+          IsSGTViaContext(FoundRHSExt, NegDenomMinusOne))
+        return true;
+    }
+  }
+
+  return false;
+}
+
+bool
+ScalarEvolution::isKnownViaSimpleReasoning(ICmpInst::Predicate Pred,
+                                           const SCEV *LHS, const SCEV *RHS) {
+  return isKnownPredicateViaConstantRanges(Pred, LHS, RHS) ||
+         IsKnownPredicateViaMinOrMax(*this, Pred, LHS, RHS) ||
+         IsKnownPredicateViaAddRecStart(*this, Pred, LHS, RHS) ||
+         isKnownPredicateViaNoOverflow(Pred, LHS, RHS);
+}
+
 bool
 ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred,
                                              const SCEV *LHS, const SCEV *RHS,
                                              const SCEV *FoundLHS,
                                              const SCEV *FoundRHS) {
-  auto IsKnownPredicateFull =
-      [this](ICmpInst::Predicate Pred, const SCEV *LHS, const SCEV *RHS) {
-    return isKnownPredicateViaConstantRanges(Pred, LHS, RHS) ||
-           IsKnownPredicateViaMinOrMax(*this, Pred, LHS, RHS) ||
-           IsKnownPredicateViaAddRecStart(*this, Pred, LHS, RHS) ||
-           isKnownPredicateViaNoOverflow(Pred, LHS, RHS);
-  };
-
   switch (Pred) {
   default: llvm_unreachable("Unexpected ICmpInst::Predicate value!");
   case ICmpInst::ICMP_EQ:
@@ -8476,30 +8732,34 @@ ScalarEvolution::isImpliedCondOperandsHelper(ICmpInst::Predicate Pred,
     break;
   case ICmpInst::ICMP_SLT:
   case ICmpInst::ICMP_SLE:
-    if (IsKnownPredicateFull(ICmpInst::ICMP_SLE, LHS, FoundLHS) &&
-        IsKnownPredicateFull(ICmpInst::ICMP_SGE, RHS, FoundRHS))
+    if (isKnownViaSimpleReasoning(ICmpInst::ICMP_SLE, LHS, FoundLHS) &&
+        isKnownViaSimpleReasoning(ICmpInst::ICMP_SGE, RHS, FoundRHS))
       return true;
     break;
   case ICmpInst::ICMP_SGT:
   case ICmpInst::ICMP_SGE:
-    if (IsKnownPredicateFull(ICmpInst::ICMP_SGE, LHS, FoundLHS) &&
-        IsKnownPredicateFull(ICmpInst::ICMP_SLE, RHS, FoundRHS))
+    if (isKnownViaSimpleReasoning(ICmpInst::ICMP_SGE, LHS, FoundLHS) &&
+        isKnownViaSimpleReasoning(ICmpInst::ICMP_SLE, RHS, FoundRHS))
       return true;
     break;
   case ICmpInst::ICMP_ULT:
   case ICmpInst::ICMP_ULE:
-    if (IsKnownPredicateFull(ICmpInst::ICMP_ULE, LHS, FoundLHS) &&
-        IsKnownPredicateFull(ICmpInst::ICMP_UGE, RHS, FoundRHS))
+    if (isKnownViaSimpleReasoning(ICmpInst::ICMP_ULE, LHS, FoundLHS) &&
+        isKnownViaSimpleReasoning(ICmpInst::ICMP_UGE, RHS, FoundRHS))
       return true;
     break;
   case ICmpInst::ICMP_UGT:
   case ICmpInst::ICMP_UGE:
-    if (IsKnownPredicateFull(ICmpInst::ICMP_UGE, LHS, FoundLHS) &&
-        IsKnownPredicateFull(ICmpInst::ICMP_ULE, RHS, FoundRHS))
+    if (isKnownViaSimpleReasoning(ICmpInst::ICMP_UGE, LHS, FoundLHS) &&
+        isKnownViaSimpleReasoning(ICmpInst::ICMP_ULE, RHS, FoundRHS))
       return true;
     break;
   }
 
+  // Maybe it can be proved via operations?
+  if (isImpliedViaOperations(Pred, LHS, RHS, FoundLHS, FoundRHS))
+    return true;
+
   return false;
 }
 
@@ -9490,6 +9750,7 @@ ScalarEvolution::ScalarEvolution(ScalarEvolution &&Arg)
       ValueExprMap(std::move(Arg.ValueExprMap)),
       PendingLoopPredicates(std::move(Arg.PendingLoopPredicates)),
       WalkingBEDominatingConds(false), ProvingSplitPredicate(false),
+      MinTrailingZerosCache(std::move(Arg.MinTrailingZerosCache)),
       BackedgeTakenCounts(std::move(Arg.BackedgeTakenCounts)),
       PredicatedBackedgeTakenCounts(
           std::move(Arg.PredicatedBackedgeTakenCounts)),
@@ -9587,6 +9848,13 @@ static void PrintLoopInfo(raw_ostream &OS, ScalarEvolution *SE,
     OS << "Unpredictable predicated backedge-taken count. ";
   }
   OS << "\n";
+
+  if (SE->hasLoopInvariantBackedgeTakenCount(L)) {
+    OS << "Loop ";
+    L->getHeader()->printAsOperand(OS, /*PrintType=*/false);
+    OS << ": ";
+    OS << "Trip multiple is " << SE->getSmallConstantTripMultiple(L) << "\n";
+  }
 }
 
 static StringRef loopDispositionToStr(ScalarEvolution::LoopDisposition LD) {
@@ -9895,6 +10163,7 @@ void ScalarEvolution::forgetMemoizedResults(const SCEV *S) {
   SignedRanges.erase(S);
   ExprValueMap.erase(S);
   HasRecMap.erase(S);
+  MinTrailingZerosCache.erase(S);
 
   auto RemoveSCEVFromBackedgeMap =
       [S, this](DenseMap<const Loop *, BackedgeTakenInfo> &Map) {
diff --git a/lib/Analysis/SparsePropagation.cpp b/lib/Analysis/SparsePropagation.cpp
index 79dc84e25533f80b42796b5242b7ea2e5d9be164..470f4bee1e0ab78faa733c2bcf11a68358597c7c 100644
--- a/lib/Analysis/SparsePropagation.cpp
+++ b/lib/Analysis/SparsePropagation.cpp
@@ -195,7 +195,7 @@ void SparseSolver::getFeasibleSuccessors(TerminatorInst &TI,
     Succs.assign(TI.getNumSuccessors(), true);
     return;
   }
-  SwitchInst::CaseIt Case = SI.findCaseValue(cast<ConstantInt>(C));
+  SwitchInst::CaseHandle Case = *SI.findCaseValue(cast<ConstantInt>(C));
   Succs[Case.getSuccessorIndex()] = true;
 }
 
diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp
index 3e1c1457b6df0046124c1d1ff8737df5161dff6e..d73b1a12803187114c55d34904957cdf580f086c 100644
--- a/lib/Analysis/TargetTransformInfo.cpp
+++ b/lib/Analysis/TargetTransformInfo.cpp
@@ -197,6 +197,10 @@ getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
   return TTIImpl->getOperandsScalarizationOverhead(Args, VF);
 }
 
+bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
+  return TTIImpl->supportsEfficientVectorElementLoadStore();
+}
+
 bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) const {
   return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
 }
@@ -269,6 +273,12 @@ unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const {
   return TTIImpl->getRegisterBitWidth(Vector);
 }
 
+bool TargetTransformInfo::shouldConsiderAddressTypePromotion(
+    const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const {
+  return TTIImpl->shouldConsiderAddressTypePromotion(
+      I, AllowPromotionWithoutCommonHeader);
+}
+
 unsigned TargetTransformInfo::getCacheLineSize() const {
   return TTIImpl->getCacheLineSize();
 }
@@ -308,8 +318,10 @@ int TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Ty, int Index,
 }
 
 int TargetTransformInfo::getCastInstrCost(unsigned Opcode, Type *Dst,
-                                          Type *Src) const {
-  int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src);
+                                 Type *Src, const Instruction *I) const {
+  assert ((I == nullptr || I->getOpcode() == Opcode) &&
+          "Opcode should reflect passed instruction.");
+  int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src, I);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
@@ -329,8 +341,10 @@ int TargetTransformInfo::getCFInstrCost(unsigned Opcode) const {
 }
 
 int TargetTransformInfo::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                                            Type *CondTy) const {
-  int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy);
+                                 Type *CondTy, const Instruction *I) const {
+  assert ((I == nullptr || I->getOpcode() == Opcode) &&
+          "Opcode should reflect passed instruction.");
+  int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
@@ -344,8 +358,11 @@ int TargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
 
 int TargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src,
                                          unsigned Alignment,
-                                         unsigned AddressSpace) const {
-  int Cost = TTIImpl->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+                                         unsigned AddressSpace,
+                                         const Instruction *I) const {
+  assert ((I == nullptr || I->getOpcode() == Opcode) &&
+          "Opcode should reflect passed instruction.");
+  int Cost = TTIImpl->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
@@ -378,17 +395,17 @@ int TargetTransformInfo::getInterleavedMemoryOpCost(
 }
 
 int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                               ArrayRef<Type *> Tys,
-                                               FastMathFlags FMF) const {
-  int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
+                                    ArrayRef<Type *> Tys, FastMathFlags FMF,
+                                    unsigned ScalarizationCostPassed) const {
+  int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
+                                            ScalarizationCostPassed);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
 
 int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
-                                               ArrayRef<Value *> Args,
-                                               FastMathFlags FMF) const {
-  int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF);
+           ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) const {
+  int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
   assert(Cost >= 0 && "TTI should not produce negative costs!");
   return Cost;
 }
diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp
index 8ba8d6a55192e64fd1756f13c6a34715b062ecbb..d4c0e7092eaa146cad0504514baf52d60c71e0c7 100644
--- a/lib/Analysis/ValueTracking.cpp
+++ b/lib/Analysis/ValueTracking.cpp
@@ -20,6 +20,7 @@
 #include "llvm/Analysis/MemoryBuiltins.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
 #include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/ConstantRange.h"
@@ -76,6 +77,9 @@ struct Query {
   AssumptionCache *AC;
   const Instruction *CxtI;
   const DominatorTree *DT;
+  // Unlike the other analyses, this may be a nullptr because not all clients
+  // provide it currently.
+  OptimizationRemarkEmitter *ORE;
 
   /// Set of assumptions that should be excluded from further queries.
   /// This is because of the potential for mutual recursion to cause
@@ -90,11 +94,12 @@ struct Query {
   unsigned NumExcluded;
 
   Query(const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI,
-        const DominatorTree *DT)
-      : DL(DL), AC(AC), CxtI(CxtI), DT(DT), NumExcluded(0) {}
+        const DominatorTree *DT, OptimizationRemarkEmitter *ORE = nullptr)
+      : DL(DL), AC(AC), CxtI(CxtI), DT(DT), ORE(ORE), NumExcluded(0) {}
 
   Query(const Query &Q, const Value *NewExcl)
-      : DL(Q.DL), AC(Q.AC), CxtI(Q.CxtI), DT(Q.DT), NumExcluded(Q.NumExcluded) {
+      : DL(Q.DL), AC(Q.AC), CxtI(Q.CxtI), DT(Q.DT), ORE(Q.ORE),
+        NumExcluded(Q.NumExcluded) {
     Excluded = Q.Excluded;
     Excluded[NumExcluded++] = NewExcl;
     assert(NumExcluded <= Excluded.size());
@@ -131,9 +136,10 @@ static void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne,
 void llvm::computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne,
                             const DataLayout &DL, unsigned Depth,
                             AssumptionCache *AC, const Instruction *CxtI,
-                            const DominatorTree *DT) {
+                            const DominatorTree *DT,
+                            OptimizationRemarkEmitter *ORE) {
   ::computeKnownBits(V, KnownZero, KnownOne, Depth,
-                     Query(DL, AC, safeCxtI(V, CxtI), DT));
+                     Query(DL, AC, safeCxtI(V, CxtI), DT, ORE));
 }
 
 bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS,
@@ -249,30 +255,6 @@ static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1,
                                    APInt &KnownZero, APInt &KnownOne,
                                    APInt &KnownZero2, APInt &KnownOne2,
                                    unsigned Depth, const Query &Q) {
-  if (!Add) {
-    if (const ConstantInt *CLHS = dyn_cast<ConstantInt>(Op0)) {
-      // We know that the top bits of C-X are clear if X contains less bits
-      // than C (i.e. no wrap-around can happen).  For example, 20-X is
-      // positive if we can prove that X is >= 0 and < 16.
-      if (!CLHS->getValue().isNegative()) {
-        unsigned BitWidth = KnownZero.getBitWidth();
-        unsigned NLZ = (CLHS->getValue()+1).countLeadingZeros();
-        // NLZ can't be BitWidth with no sign bit
-        APInt MaskV = APInt::getHighBitsSet(BitWidth, NLZ+1);
-        computeKnownBits(Op1, KnownZero2, KnownOne2, Depth + 1, Q);
-
-        // If all of the MaskV bits are known to be zero, then we know the
-        // output top bits are zero, because we now know that the output is
-        // from [0-C].
-        if ((KnownZero2 & MaskV) == MaskV) {
-          unsigned NLZ2 = CLHS->getValue().countLeadingZeros();
-          // Top bits known zero.
-          KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2);
-        }
-      }
-    }
-  }
-
   unsigned BitWidth = KnownZero.getBitWidth();
 
   // If an initial sequence of bits in the result is not needed, the
@@ -282,11 +264,11 @@ static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1,
   computeKnownBits(Op1, KnownZero2, KnownOne2, Depth + 1, Q);
 
   // Carry in a 1 for a subtract, rather than a 0.
-  APInt CarryIn(BitWidth, 0);
+  uint64_t CarryIn = 0;
   if (!Add) {
     // Sum = LHS + ~RHS + 1
     std::swap(KnownZero2, KnownOne2);
-    CarryIn.setBit(0);
+    CarryIn = 1;
   }
 
   APInt PossibleSumZero = ~LHSKnownZero + ~KnownZero2 + CarryIn;
@@ -315,11 +297,11 @@ static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1,
       // Adding two non-negative numbers, or subtracting a negative number from
       // a non-negative one, can't wrap into negative.
       if (LHSKnownZero.isNegative() && KnownZero2.isNegative())
-        KnownZero |= APInt::getSignBit(BitWidth);
+        KnownZero.setSignBit();
       // Adding two negative numbers, or subtracting a non-negative number from
       // a negative one, can't wrap into non-negative.
       else if (LHSKnownOne.isNegative() && KnownOne2.isNegative())
-        KnownOne |= APInt::getSignBit(BitWidth);
+        KnownOne.setSignBit();
     }
   }
 }
@@ -370,8 +352,9 @@ static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW,
 
   TrailZ = std::min(TrailZ, BitWidth);
   LeadZ = std::min(LeadZ, BitWidth);
-  KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) |
-              APInt::getHighBitsSet(BitWidth, LeadZ);
+  KnownZero.clearAllBits();
+  KnownZero.setLowBits(TrailZ);
+  KnownZero.setHighBits(LeadZ);
 
   // Only make use of no-wrap flags if we failed to compute the sign bit
   // directly.  This matters if the multiplication always overflows, in
@@ -379,9 +362,9 @@ static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW,
   // though as the program is invoking undefined behaviour we can choose
   // whatever we like here.
   if (isKnownNonNegative && !KnownOne.isNegative())
-    KnownZero.setBit(BitWidth - 1);
+    KnownZero.setSignBit();
   else if (isKnownNegative && !KnownZero.isNegative())
-    KnownOne.setBit(BitWidth - 1);
+    KnownOne.setSignBit();
 }
 
 void llvm::computeKnownBitsFromRangeMetadata(const MDNode &Ranges,
@@ -726,7 +709,7 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero,
 
       if (RHSKnownZero.isNegative()) {
         // We know that the sign bit is zero.
-        KnownZero |= APInt::getSignBit(BitWidth);
+        KnownZero.setSignBit();
       }
     // assume(v >_s c) where c is at least -1.
     } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
@@ -737,7 +720,7 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero,
 
       if (RHSKnownOne.isAllOnesValue() || RHSKnownZero.isNegative()) {
         // We know that the sign bit is zero.
-        KnownZero |= APInt::getSignBit(BitWidth);
+        KnownZero.setSignBit();
       }
     // assume(v <=_s c) where c is negative
     } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
@@ -748,7 +731,7 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero,
 
       if (RHSKnownOne.isNegative()) {
         // We know that the sign bit is one.
-        KnownOne |= APInt::getSignBit(BitWidth);
+        KnownOne.setSignBit();
       }
     // assume(v <_s c) where c is non-positive
     } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
@@ -759,7 +742,7 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero,
 
       if (RHSKnownZero.isAllOnesValue() || RHSKnownOne.isNegative()) {
         // We know that the sign bit is one.
-        KnownOne |= APInt::getSignBit(BitWidth);
+        KnownOne.setSignBit();
       }
     // assume(v <=_u c)
     } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
@@ -769,8 +752,7 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero,
       computeKnownBits(A, RHSKnownZero, RHSKnownOne, Depth+1, Query(Q, I));
 
       // Whatever high bits in c are zero are known to be zero.
-      KnownZero |=
-        APInt::getHighBitsSet(BitWidth, RHSKnownZero.countLeadingOnes());
+      KnownZero.setHighBits(RHSKnownZero.countLeadingOnes());
     // assume(v <_u c)
     } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) &&
                Pred == ICmpInst::ICMP_ULT &&
@@ -781,11 +763,27 @@ static void computeKnownBitsFromAssume(const Value *V, APInt &KnownZero,
       // Whatever high bits in c are zero are known to be zero (if c is a power
       // of 2, then one more).
       if (isKnownToBeAPowerOfTwo(A, false, Depth + 1, Query(Q, I)))
-        KnownZero |=
-          APInt::getHighBitsSet(BitWidth, RHSKnownZero.countLeadingOnes()+1);
+        KnownZero.setHighBits(RHSKnownZero.countLeadingOnes()+1);
       else
-        KnownZero |=
-          APInt::getHighBitsSet(BitWidth, RHSKnownZero.countLeadingOnes());
+        KnownZero.setHighBits(RHSKnownZero.countLeadingOnes());
+    }
+  }
+
+  // If assumptions conflict with each other or previous known bits, then we
+  // have a logical fallacy. It's possible that the assumption is not reachable,
+  // so this isn't a real bug. On the other hand, the program may have undefined
+  // behavior, or we might have a bug in the compiler. We can't assert/crash, so
+  // clear out the known bits, try to warn the user, and hope for the best.
+  if ((KnownZero & KnownOne) != 0) {
+    KnownZero.clearAllBits();
+    KnownOne.clearAllBits();
+
+    if (Q.ORE) {
+      auto *CxtI = const_cast<Instruction *>(Q.CxtI);
+      OptimizationRemarkAnalysis ORA("value-tracking", "BadAssumption", CxtI);
+      Q.ORE->emit(ORA << "Detected conflicting code assumptions. Program may "
+                         "have undefined behavior, or compiler may have "
+                         "internal error.");
     }
   }
 }
@@ -824,6 +822,14 @@ static void computeKnownBitsFromShiftOperator(
 
   computeKnownBits(I->getOperand(1), KnownZero, KnownOne, Depth + 1, Q);
 
+  // If the shift amount could be greater than or equal to the bit-width of the
+  // LHS, the value could be undef, so we don't know anything about it.
+  if ((~KnownZero).uge(BitWidth)) {
+    KnownZero.clearAllBits();
+    KnownOne.clearAllBits();
+    return;
+  }
+
   // Note: We cannot use KnownZero.getLimitedValue() here, because if
   // BitWidth > 64 and any upper bits are known, we'll end up returning the
   // limit value (which implies all bits are known).
@@ -919,7 +925,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
       APInt KnownZero3(BitWidth, 0), KnownOne3(BitWidth, 0);
       computeKnownBits(Y, KnownZero3, KnownOne3, Depth + 1, Q);
       if (KnownOne3.countTrailingOnes() > 0)
-        KnownZero |= APInt::getLowBitsSet(BitWidth, 1);
+        KnownZero.setBit(0);
     }
     break;
   }
@@ -965,7 +971,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
       LeadZ = std::min(BitWidth,
                        LeadZ + BitWidth - RHSUnknownLeadingOnes - 1);
 
-    KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ);
+    KnownZero.setHighBits(LeadZ);
     break;
   }
   case Instruction::Select: {
@@ -987,17 +993,17 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
     unsigned MaxHighZeros = 0;
     if (SPF == SPF_SMAX) {
       // If both sides are negative, the result is negative.
-      if (KnownOne[BitWidth - 1] && KnownOne2[BitWidth - 1])
+      if (KnownOne.isNegative() && KnownOne2.isNegative())
         // We can derive a lower bound on the result by taking the max of the
         // leading one bits.
         MaxHighOnes =
             std::max(KnownOne.countLeadingOnes(), KnownOne2.countLeadingOnes());
       // If either side is non-negative, the result is non-negative.
-      else if (KnownZero[BitWidth - 1] || KnownZero2[BitWidth - 1])
+      else if (KnownZero.isNegative() || KnownZero2.isNegative())
         MaxHighZeros = 1;
     } else if (SPF == SPF_SMIN) {
       // If both sides are non-negative, the result is non-negative.
-      if (KnownZero[BitWidth - 1] && KnownZero2[BitWidth - 1])
+      if (KnownZero.isNegative() && KnownZero2.isNegative())
         // We can derive an upper bound on the result by taking the max of the
         // leading zero bits.
         MaxHighZeros = std::max(KnownZero.countLeadingOnes(),
@@ -1021,9 +1027,9 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
     KnownOne &= KnownOne2;
     KnownZero &= KnownZero2;
     if (MaxHighOnes > 0)
-      KnownOne |= APInt::getHighBitsSet(BitWidth, MaxHighOnes);
+      KnownOne.setHighBits(MaxHighOnes);
     if (MaxHighZeros > 0)
-      KnownZero |= APInt::getHighBitsSet(BitWidth, MaxHighZeros);
+      KnownZero.setHighBits(MaxHighZeros);
     break;
   }
   case Instruction::FPTrunc:
@@ -1054,7 +1060,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
     KnownOne = KnownOne.zextOrTrunc(BitWidth);
     // Any top bits are known to be zero.
     if (BitWidth > SrcBitWidth)
-      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+      KnownZero.setBitsFrom(SrcBitWidth);
     break;
   }
   case Instruction::BitCast: {
@@ -1081,29 +1087,28 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
     // If the sign bit of the input is known set or clear, then we know the
     // top bits of the result.
     if (KnownZero[SrcBitWidth-1])             // Input sign bit known zero
-      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+      KnownZero.setBitsFrom(SrcBitWidth);
     else if (KnownOne[SrcBitWidth-1])           // Input sign bit known set
-      KnownOne |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+      KnownOne.setBitsFrom(SrcBitWidth);
     break;
   }
   case Instruction::Shl: {
     // (shl X, C1) & C2 == 0   iff   (X & C2 >>u C1) == 0
     bool NSW = cast<OverflowingBinaryOperator>(I)->hasNoSignedWrap();
-    auto KZF = [BitWidth, NSW](const APInt &KnownZero, unsigned ShiftAmt) {
-      APInt KZResult =
-          (KnownZero << ShiftAmt) |
-          APInt::getLowBitsSet(BitWidth, ShiftAmt); // Low bits known 0.
+    auto KZF = [NSW](const APInt &KnownZero, unsigned ShiftAmt) {
+      APInt KZResult = KnownZero << ShiftAmt;
+      KZResult.setLowBits(ShiftAmt); // Low bits known 0.
       // If this shift has "nsw" keyword, then the result is either a poison
       // value or has the same sign bit as the first operand.
       if (NSW && KnownZero.isNegative())
-        KZResult.setBit(BitWidth - 1);
+        KZResult.setSignBit();
       return KZResult;
     };
 
-    auto KOF = [BitWidth, NSW](const APInt &KnownOne, unsigned ShiftAmt) {
+    auto KOF = [NSW](const APInt &KnownOne, unsigned ShiftAmt) {
       APInt KOResult = KnownOne << ShiftAmt;
       if (NSW && KnownOne.isNegative())
-        KOResult.setBit(BitWidth - 1);
+        KOResult.setSignBit();
       return KOResult;
     };
 
@@ -1115,13 +1120,13 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
   case Instruction::LShr: {
     // (ushr X, C1) & C2 == 0   iff  (-1 >> C1) & C2 == 0
     auto KZF = [BitWidth](const APInt &KnownZero, unsigned ShiftAmt) {
-      return APIntOps::lshr(KnownZero, ShiftAmt) |
+      return KnownZero.lshr(ShiftAmt) |
              // High bits known zero.
              APInt::getHighBitsSet(BitWidth, ShiftAmt);
     };
 
     auto KOF = [](const APInt &KnownOne, unsigned ShiftAmt) {
-      return APIntOps::lshr(KnownOne, ShiftAmt);
+      return KnownOne.lshr(ShiftAmt);
     };
 
     computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne,
@@ -1132,11 +1137,11 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
   case Instruction::AShr: {
     // (ashr X, C1) & C2 == 0   iff  (-1 >> C1) & C2 == 0
     auto KZF = [](const APInt &KnownZero, unsigned ShiftAmt) {
-      return APIntOps::ashr(KnownZero, ShiftAmt);
+      return KnownZero.ashr(ShiftAmt);
     };
 
     auto KOF = [](const APInt &KnownOne, unsigned ShiftAmt) {
-      return APIntOps::ashr(KnownOne, ShiftAmt);
+      return KnownOne.ashr(ShiftAmt);
     };
 
     computeKnownBitsFromShiftOperator(I, KnownZero, KnownOne,
@@ -1172,12 +1177,12 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
 
         // If the first operand is non-negative or has all low bits zero, then
         // the upper bits are all zero.
-        if (KnownZero2[BitWidth-1] || ((KnownZero2 & LowBits) == LowBits))
+        if (KnownZero2.isNegative() || ((KnownZero2 & LowBits) == LowBits))
           KnownZero |= ~LowBits;
 
         // If the first operand is negative and not all low bits are zero, then
         // the upper bits are all one.
-        if (KnownOne2[BitWidth-1] && ((KnownOne2 & LowBits) != 0))
+        if (KnownOne2.isNegative() && ((KnownOne2 & LowBits) != 0))
           KnownOne |= ~LowBits;
 
         assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
@@ -1192,7 +1197,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
                        Q);
       // If it's known zero, our sign bit is also zero.
       if (LHSKnownZero.isNegative())
-        KnownZero.setBit(BitWidth - 1);
+        KnownZero.setSignBit();
     }
 
     break;
@@ -1216,7 +1221,8 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
     unsigned Leaders = std::max(KnownZero.countLeadingOnes(),
                                 KnownZero2.countLeadingOnes());
     KnownOne.clearAllBits();
-    KnownZero = APInt::getHighBitsSet(BitWidth, Leaders);
+    KnownZero.clearAllBits();
+    KnownZero.setHighBits(Leaders);
     break;
   }
 
@@ -1227,7 +1233,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
       Align = Q.DL.getABITypeAlignment(AI->getAllocatedType());
 
     if (Align > 0)
-      KnownZero = APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align));
+      KnownZero.setLowBits(countTrailingZeros(Align));
     break;
   }
   case Instruction::GetElementPtr: {
@@ -1274,7 +1280,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
       }
     }
 
-    KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ);
+    KnownZero.setLowBits(TrailZ);
     break;
   }
   case Instruction::PHI: {
@@ -1315,9 +1321,8 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
           APInt KnownZero3(KnownZero), KnownOne3(KnownOne);
           computeKnownBits(L, KnownZero3, KnownOne3, Depth + 1, Q);
 
-          KnownZero = APInt::getLowBitsSet(
-              BitWidth, std::min(KnownZero2.countTrailingOnes(),
-                                 KnownZero3.countTrailingOnes()));
+          KnownZero.setLowBits(std::min(KnownZero2.countTrailingOnes(),
+                                        KnownZero3.countTrailingOnes()));
 
           if (DontImproveNonNegativePhiBits)
             break;
@@ -1335,24 +1340,24 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
             // (add negative, negative) --> negative
             if (Opcode == Instruction::Add) {
               if (KnownZero2.isNegative() && KnownZero3.isNegative())
-                KnownZero.setBit(BitWidth - 1);
+                KnownZero.setSignBit();
               else if (KnownOne2.isNegative() && KnownOne3.isNegative())
-                KnownOne.setBit(BitWidth - 1);
+                KnownOne.setSignBit();
             }
 
             // (sub nsw non-negative, negative) --> non-negative
             // (sub nsw negative, non-negative) --> negative
             else if (Opcode == Instruction::Sub && LL == I) {
               if (KnownZero2.isNegative() && KnownOne3.isNegative())
-                KnownZero.setBit(BitWidth - 1);
+                KnownZero.setSignBit();
               else if (KnownOne2.isNegative() && KnownZero3.isNegative())
-                KnownOne.setBit(BitWidth - 1);
+                KnownOne.setSignBit();
             }
 
             // (mul nsw non-negative, non-negative) --> non-negative
             else if (Opcode == Instruction::Mul && KnownZero2.isNegative() &&
                      KnownZero3.isNegative())
-              KnownZero.setBit(BitWidth - 1);
+              KnownZero.setSignBit();
           }
 
           break;
@@ -1371,8 +1376,8 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
       if (dyn_cast_or_null<UndefValue>(P->hasConstantValue()))
         break;
 
-      KnownZero = APInt::getAllOnesValue(BitWidth);
-      KnownOne = APInt::getAllOnesValue(BitWidth);
+      KnownZero.setAllBits();
+      KnownOne.setAllBits();
       for (Value *IncValue : P->incoming_values()) {
         // Skip direct self references.
         if (IncValue == P) continue;
@@ -1409,8 +1414,8 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
       default: break;
       case Intrinsic::bitreverse:
         computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q);
-        KnownZero = KnownZero2.reverseBits();
-        KnownOne = KnownOne2.reverseBits();
+        KnownZero |= KnownZero2.reverseBits();
+        KnownOne |= KnownOne2.reverseBits();
         break;
       case Intrinsic::bswap:
         computeKnownBits(I->getOperand(0), KnownZero2, KnownOne2, Depth + 1, Q);
@@ -1423,7 +1428,7 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
         // If this call is undefined for 0, the result will be less than 2^n.
         if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext()))
           LowBits -= 1;
-        KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
+        KnownZero.setBitsFrom(LowBits);
         break;
       }
       case Intrinsic::ctpop: {
@@ -1434,14 +1439,14 @@ static void computeKnownBitsFromOperator(const Operator *I, APInt &KnownZero,
         unsigned LeadingZeros =
           APInt(BitWidth, BitsPossiblySet).countLeadingZeros();
         assert(LeadingZeros <= BitWidth);
-        KnownZero |= APInt::getHighBitsSet(BitWidth, LeadingZeros);
+        KnownZero.setHighBits(LeadingZeros);
         KnownOne &= ~KnownZero;
         // TODO: we could bound KnownOne using the lower bound on the number
         // of bits which might be set provided by popcnt KnownOne2.
         break;
       }
       case Intrinsic::x86_sse42_crc32_64_64:
-        KnownZero |= APInt::getHighBitsSet(64, 32);
+        KnownZero.setBitsFrom(32);
         break;
       }
     }
@@ -1514,6 +1519,7 @@ void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne,
          KnownZero.getBitWidth() == BitWidth &&
          KnownOne.getBitWidth() == BitWidth &&
          "V, KnownOne and KnownZero should have same BitWidth");
+  (void)BitWidth;
 
   const APInt *C;
   if (match(V, m_APInt(C))) {
@@ -1525,7 +1531,7 @@ void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne,
   // Null and aggregate-zero are all-zeros.
   if (isa<ConstantPointerNull>(V) || isa<ConstantAggregateZero>(V)) {
     KnownOne.clearAllBits();
-    KnownZero = APInt::getAllOnesValue(BitWidth);
+    KnownZero.setAllBits();
     return;
   }
   // Handle a constant vector by taking the intersection of the known bits of
@@ -1594,7 +1600,7 @@ void computeKnownBits(const Value *V, APInt &KnownZero, APInt &KnownOne,
   if (V->getType()->isPointerTy()) {
     unsigned Align = V->getPointerAlignment(Q.DL);
     if (Align)
-      KnownZero |= APInt::getLowBitsSet(BitWidth, countTrailingZeros(Align));
+      KnownZero.setLowBits(countTrailingZeros(Align));
   }
 
   // computeKnownBitsFromAssume strictly refines KnownZero and
@@ -1619,8 +1625,8 @@ void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne,
   APInt ZeroBits(BitWidth, 0);
   APInt OneBits(BitWidth, 0);
   computeKnownBits(V, ZeroBits, OneBits, Depth, Q);
-  KnownOne = OneBits[BitWidth - 1];
-  KnownZero = ZeroBits[BitWidth - 1];
+  KnownOne = OneBits.isNegative();
+  KnownZero = ZeroBits.isNegative();
 }
 
 /// Return true if the given value is known to have exactly one
@@ -1800,10 +1806,12 @@ static bool rangeMetadataExcludesValue(const MDNode* Ranges, const APInt& Value)
   return true;
 }
 
-/// Return true if the given value is known to be non-zero when defined.
-/// For vectors return true if every element is known to be non-zero when
-/// defined. Supports values with integer or pointer type and vectors of
-/// integers.
+/// Return true if the given value is known to be non-zero when defined. For
+/// vectors, return true if every element is known to be non-zero when
+/// defined. For pointers, if the context instruction and dominator tree are
+/// specified, perform context-sensitive analysis and return true if the
+/// pointer couldn't possibly be null at the specified instruction.
+/// Supports values with integer or pointer type and vectors of integers.
 bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
   if (auto *C = dyn_cast<Constant>(V)) {
     if (C->isNullValue())
@@ -1846,7 +1854,7 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
 
   // Check for pointer simplifications.
   if (V->getType()->isPointerTy()) {
-    if (isKnownNonNull(V))
+    if (isKnownNonNullAt(V, Q.CxtI, Q.DT))
       return true;
     if (const GEPOperator *GEP = dyn_cast<GEPOperator>(V))
       if (isGEPKnownNonNull(GEP, Depth, Q))
@@ -2087,13 +2095,29 @@ static unsigned computeNumSignBitsVectorConstant(const Value *V,
   return MinSignBits;
 }
 
+static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
+                                       const Query &Q);
+
+static unsigned ComputeNumSignBits(const Value *V, unsigned Depth,
+                                   const Query &Q) {
+  unsigned Result = ComputeNumSignBitsImpl(V, Depth, Q);
+  assert(Result > 0 && "At least one sign bit needs to be present!");
+  return Result;
+}
+
 /// Return the number of times the sign bit of the register is replicated into
 /// the other bits. We know that at least 1 bit is always equal to the sign bit
 /// (itself), but other cases can give us information. For example, immediately
 /// after an "ashr X, 2", we know that the top 3 bits are all equal to each
 /// other, so we return 3. For vectors, return the number of sign bits for the
 /// vector element with the mininum number of known sign bits.
-unsigned ComputeNumSignBits(const Value *V, unsigned Depth, const Query &Q) {
+static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
+                                       const Query &Q) {
+
+  // We return the minimum number of sign bits that are guaranteed to be present
+  // in V, so for undef we have to conservatively return 1.  We don't have the
+  // same behavior for poison though -- that's a FIXME today.
+
   unsigned TyBits = Q.DL.getTypeSizeInBits(V->getType()->getScalarType());
   unsigned Tmp, Tmp2;
   unsigned FirstAnswer = 1;
@@ -2169,7 +2193,10 @@ unsigned ComputeNumSignBits(const Value *V, unsigned Depth, const Query &Q) {
     // ashr X, C   -> adds C sign bits.  Vectors too.
     const APInt *ShAmt;
     if (match(U->getOperand(1), m_APInt(ShAmt))) {
-      Tmp += ShAmt->getZExtValue();
+      if (ShAmt->uge(TyBits))
+        break;  // Bad shift; checked as APInt so getZExtValue() cannot assert.
+      unsigned ShAmtLimited = ShAmt->getZExtValue();
+      Tmp += ShAmtLimited;
       if (Tmp > TyBits) Tmp = TyBits;
     }
     return Tmp;
@@ -3449,6 +3476,16 @@ static bool isKnownNonNullFromDominatingCondition(const Value *V,
     if (NumUsesExplored >= DomConditionsMaxUses)
       break;
     NumUsesExplored++;
+
+    // If the value is used as an argument to a call or invoke, then argument
+    // attributes may provide an answer about null-ness.
+    if (auto CS = ImmutableCallSite(U))
+      if (auto *CalledFunc = CS.getCalledFunction())
+        for (const Argument &Arg : CalledFunc->args())
+          if (CS.getArgOperand(Arg.getArgNo()) == V &&
+              Arg.hasNonNullAttr() && DT->dominates(CS.getInstruction(), CtxI))
+            return true;
+
     // Consider only compare instructions uniquely controlling a branch
     CmpInst::Predicate Pred;
     if (!match(const_cast<User *>(U),
@@ -3726,6 +3763,8 @@ bool llvm::isGuaranteedToTransferExecutionToSuccessor(const Instruction *I) {
     return false;
   if (isa<ReturnInst>(I))
     return false;
+  if (isa<UnreachableInst>(I))
+    return false;
 
   // Calls can throw, or contain an infinite loop, or kill the process.
   if (auto CS = ImmutableCallSite(I)) {
@@ -3774,79 +3813,33 @@ bool llvm::isGuaranteedToExecuteForEveryIteration(const Instruction *I,
 
 bool llvm::propagatesFullPoison(const Instruction *I) {
   switch (I->getOpcode()) {
-    case Instruction::Add:
-    case Instruction::Sub:
-    case Instruction::Xor:
-    case Instruction::Trunc:
-    case Instruction::BitCast:
-    case Instruction::AddrSpaceCast:
-      // These operations all propagate poison unconditionally. Note that poison
-      // is not any particular value, so xor or subtraction of poison with
-      // itself still yields poison, not zero.
-      return true;
-
-    case Instruction::AShr:
-    case Instruction::SExt:
-      // For these operations, one bit of the input is replicated across
-      // multiple output bits. A replicated poison bit is still poison.
-      return true;
-
-    case Instruction::Shl: {
-      // Left shift *by* a poison value is poison. The number of
-      // positions to shift is unsigned, so no negative values are
-      // possible there. Left shift by zero places preserves poison. So
-      // it only remains to consider left shift of poison by a positive
-      // number of places.
-      //
-      // A left shift by a positive number of places leaves the lowest order bit
-      // non-poisoned. However, if such a shift has a no-wrap flag, then we can
-      // make the poison operand violate that flag, yielding a fresh full-poison
-      // value.
-      auto *OBO = cast<OverflowingBinaryOperator>(I);
-      return OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap();
-    }
-
-    case Instruction::Mul: {
-      // A multiplication by zero yields a non-poison zero result, so we need to
-      // rule out zero as an operand. Conservatively, multiplication by a
-      // non-zero constant is not multiplication by zero.
-      //
-      // Multiplication by a non-zero constant can leave some bits
-      // non-poisoned. For example, a multiplication by 2 leaves the lowest
-      // order bit unpoisoned. So we need to consider that.
-      //
-      // Multiplication by 1 preserves poison. If the multiplication has a
-      // no-wrap flag, then we can make the poison operand violate that flag
-      // when multiplied by any integer other than 0 and 1.
-      auto *OBO = cast<OverflowingBinaryOperator>(I);
-      if (OBO->hasNoUnsignedWrap() || OBO->hasNoSignedWrap()) {
-        for (Value *V : OBO->operands()) {
-          if (auto *CI = dyn_cast<ConstantInt>(V)) {
-            // A ConstantInt cannot yield poison, so we can assume that it is
-            // the other operand that is poison.
-            return !CI->isZero();
-          }
-        }
-      }
-      return false;
-    }
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Xor:
+  case Instruction::Trunc:
+  case Instruction::BitCast:
+  case Instruction::AddrSpaceCast:
+  case Instruction::Mul:
+  case Instruction::Shl:
+  case Instruction::GetElementPtr:
+    // These operations all propagate poison unconditionally. Note that poison
+    // is not any particular value, so xor or subtraction of poison with
+    // itself still yields poison, not zero.
+    return true;
 
-    case Instruction::ICmp:
-      // Comparing poison with any value yields poison.  This is why, for
-      // instance, x s< (x +nsw 1) can be folded to true.
-      return true;
+  case Instruction::AShr:
+  case Instruction::SExt:
+    // For these operations, one bit of the input is replicated across
+    // multiple output bits. A replicated poison bit is still poison.
+    return true;
 
-    case Instruction::GetElementPtr:
-      // A GEP implicitly represents a sequence of additions, subtractions,
-      // truncations, sign extensions and multiplications. The multiplications
-      // are by the non-zero sizes of some set of types, so we do not have to be
-      // concerned with multiplication by zero. If the GEP is in-bounds, then
-      // these operations are implicitly no-signed-wrap so poison is propagated
-      // by the arguments above for Add, Sub, Trunc, SExt and Mul.
-      return cast<GEPOperator>(I)->isInBounds();
+  case Instruction::ICmp:
+    // Comparing poison with any value yields poison.  This is why, for
+    // instance, x s< (x +nsw 1) can be folded to true.
+    return true;
 
-    default:
-      return false;
+  default:
+    return false;
   }
 }
 
diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp
index 7e598f435ff5a05097c44e72e017d1a2384a0fc8..722f17a8067eeef9c4bd84cc393b13520860e736 100644
--- a/lib/Analysis/VectorUtils.cpp
+++ b/lib/Analysis/VectorUtils.cpp
@@ -488,3 +488,88 @@ Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
 
   return Inst;
 }
+
+Constant *llvm::createInterleaveMask(IRBuilder<> &Builder, unsigned VF,
+                                     unsigned NumVecs) {
+  // For every lane, emit one i32 index per vector: Vec * VF + Lane.
+  SmallVector<Constant *, 16> Indices;
+  for (unsigned Lane = 0; Lane != VF; ++Lane)
+    for (unsigned Vec = 0; Vec != NumVecs; ++Vec)
+      Indices.push_back(Builder.getInt32(Vec * VF + Lane));
+  return ConstantVector::get(Indices);
+}
+
+Constant *llvm::createStrideMask(IRBuilder<> &Builder, unsigned Start,
+                                 unsigned Stride, unsigned VF) {
+  // Emits VF i32 indices: Start, Start + Stride, Start + 2 * Stride, ...
+  SmallVector<Constant *, 16> Indices;
+  for (unsigned Lane = 0; Lane != VF; ++Lane)
+    Indices.push_back(Builder.getInt32(Start + Lane * Stride));
+  return ConstantVector::get(Indices);
+}
+
+Constant *llvm::createSequentialMask(IRBuilder<> &Builder, unsigned Start,
+                                     unsigned NumInts, unsigned NumUndefs) {
+  // First NumInts consecutive i32 indices beginning at Start ...
+  SmallVector<Constant *, 16> Indices;
+  for (unsigned Offset = 0; Offset != NumInts; ++Offset)
+    Indices.push_back(Builder.getInt32(Start + Offset));
+
+  // ... then NumUndefs undef i32 entries.
+  Constant *UndefLane = UndefValue::get(Builder.getInt32Ty());
+  Indices.append(NumUndefs, UndefLane);
+  return ConstantVector::get(Indices);
+}
+
+/// Concatenate V1 and V2, which must both be vectors with the same element
+/// type. V2 may have fewer elements than V1 (never more); in that case it is
+/// first widened to V1's element count, with the extra lanes undef.
+static Value *concatenateTwoVectors(IRBuilder<> &Builder, Value *V1,
+                                    Value *V2) {
+  auto *FirstTy = dyn_cast<VectorType>(V1->getType());
+  auto *SecondTy = dyn_cast<VectorType>(V2->getType());
+  assert(FirstTy && SecondTy &&
+         FirstTy->getScalarType() == SecondTy->getScalarType() &&
+         "Expect two vectors with the same element type");
+
+  const unsigned Lanes1 = FirstTy->getNumElements();
+  const unsigned Lanes2 = SecondTy->getNumElements();
+  assert(Lanes1 >= Lanes2 && "Unexpect the first vector has less elements");
+
+  if (Lanes1 > Lanes2) {
+    // Widen V2 to V1's lane count; the appended lanes are undef.
+    Constant *PadMask = createSequentialMask(Builder, /*Start=*/0, Lanes2,
+                                             Lanes1 - Lanes2);
+    V2 = Builder.CreateShuffleVector(V2, UndefValue::get(SecondTy), PadMask);
+  }
+
+  Constant *ConcatMask = createSequentialMask(Builder, 0, Lanes1 + Lanes2, 0);
+  return Builder.CreateShuffleVector(V1, V2, ConcatMask);
+}
+
+Value *llvm::concatenateVectors(IRBuilder<> &Builder, ArrayRef<Value *> Vecs) {
+  unsigned NumVecs = Vecs.size();
+  assert(NumVecs > 1 && "Should be at least two vectors");
+
+  // Pairwise reduction: every round concatenates adjacent vectors and the
+  // rounds repeat until a single vector is left.
+  SmallVector<Value *, 8> Worklist(Vecs.begin(), Vecs.end());
+  do {
+    SmallVector<Value *, 8> NextRound;
+    for (unsigned Idx = 0; Idx < NumVecs - 1; Idx += 2) {
+      Value *Lhs = Worklist[Idx], *Rhs = Worklist[Idx + 1];
+      assert((Lhs->getType() == Rhs->getType() || Idx == NumVecs - 2) &&
+             "Only the last vector may have a different type");
+      NextRound.push_back(concatenateTwoVectors(Builder, Lhs, Rhs));
+    }
+
+    // An odd count leaves one unpaired trailing vector; carry it forward.
+    if (NumVecs & 1)
+      NextRound.push_back(Worklist[NumVecs - 1]);
+
+    Worklist = NextRound;
+    NumVecs = Worklist.size();
+  } while (NumVecs > 1);
+
+  return Worklist.front();
+}
diff --git a/lib/AsmParser/LLLexer.cpp b/lib/AsmParser/LLLexer.cpp
index 752942fc9fcce20390f9f6ace09e1ce68ceed3d0..49a8ce4bed0b5f0fcf75ce09cc60f80447b871a1 100644
--- a/lib/AsmParser/LLLexer.cpp
+++ b/lib/AsmParser/LLLexer.cpp
@@ -548,6 +548,7 @@ lltok::Kind LLLexer::LexIdentifier() {
   KEYWORD(ninf);
   KEYWORD(nsz);
   KEYWORD(arcp);
+  KEYWORD(contract);
   KEYWORD(fast);
   KEYWORD(nuw);
   KEYWORD(nsw);
diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp
index 4cd986e143b6c1617ab3c2b554140c7ece280888..68d448ed7e066de4e76fce230dc496c9235965f6 100644
--- a/lib/AsmParser/LLParser.cpp
+++ b/lib/AsmParser/LLParser.cpp
@@ -130,10 +130,9 @@ bool LLParser::ValidateEndOfModule() {
       B.merge(NumberedAttrBuilders[Attr]);
 
     if (Function *Fn = dyn_cast<Function>(V)) {
-      AttributeSet AS = Fn->getAttributes();
-      AttrBuilder FnAttrs(AS.getFnAttributes(), AttributeSet::FunctionIndex);
-      AS = AS.removeAttributes(Context, AttributeSet::FunctionIndex,
-                               AS.getFnAttributes());
+      AttributeList AS = Fn->getAttributes();
+      AttrBuilder FnAttrs(AS.getFnAttributes());
+      AS = AS.removeAttributes(Context, AttributeList::FunctionIndex);
 
       FnAttrs.merge(B);
 
@@ -144,32 +143,27 @@ bool LLParser::ValidateEndOfModule() {
         FnAttrs.removeAttribute(Attribute::Alignment);
       }
 
-      AS = AS.addAttributes(Context, AttributeSet::FunctionIndex,
-                            AttributeSet::get(Context,
-                                              AttributeSet::FunctionIndex,
-                                              FnAttrs));
+      AS = AS.addAttributes(
+          Context, AttributeList::FunctionIndex,
+          AttributeList::get(Context, AttributeList::FunctionIndex, FnAttrs));
       Fn->setAttributes(AS);
     } else if (CallInst *CI = dyn_cast<CallInst>(V)) {
-      AttributeSet AS = CI->getAttributes();
-      AttrBuilder FnAttrs(AS.getFnAttributes(), AttributeSet::FunctionIndex);
-      AS = AS.removeAttributes(Context, AttributeSet::FunctionIndex,
-                               AS.getFnAttributes());
+      AttributeList AS = CI->getAttributes();
+      AttrBuilder FnAttrs(AS.getFnAttributes());
+      AS = AS.removeAttributes(Context, AttributeList::FunctionIndex);
       FnAttrs.merge(B);
-      AS = AS.addAttributes(Context, AttributeSet::FunctionIndex,
-                            AttributeSet::get(Context,
-                                              AttributeSet::FunctionIndex,
-                                              FnAttrs));
+      AS = AS.addAttributes(
+          Context, AttributeList::FunctionIndex,
+          AttributeList::get(Context, AttributeList::FunctionIndex, FnAttrs));
       CI->setAttributes(AS);
     } else if (InvokeInst *II = dyn_cast<InvokeInst>(V)) {
-      AttributeSet AS = II->getAttributes();
-      AttrBuilder FnAttrs(AS.getFnAttributes(), AttributeSet::FunctionIndex);
-      AS = AS.removeAttributes(Context, AttributeSet::FunctionIndex,
-                               AS.getFnAttributes());
+      AttributeList AS = II->getAttributes();
+      AttrBuilder FnAttrs(AS.getFnAttributes());
+      AS = AS.removeAttributes(Context, AttributeList::FunctionIndex);
       FnAttrs.merge(B);
-      AS = AS.addAttributes(Context, AttributeSet::FunctionIndex,
-                            AttributeSet::get(Context,
-                                              AttributeSet::FunctionIndex,
-                                              FnAttrs));
+      AS = AS.addAttributes(
+          Context, AttributeList::FunctionIndex,
+          AttributeList::get(Context, AttributeList::FunctionIndex, FnAttrs));
       II->setAttributes(AS);
     } else {
       llvm_unreachable("invalid object with forward attribute group reference");
@@ -1855,6 +1849,34 @@ bool LLParser::ParseOptionalCommaAlign(unsigned &Alignment,
   return false;
 }
 
+/// ParseOptionalCommaAddrSpace
+///   ::=
+///   ::= ',' addrspace(1)
+///
+/// Returns true on a parse error. AteExtraComma is set to true when a comma
+/// was consumed that turned out to precede a metadata variable.
+bool LLParser::ParseOptionalCommaAddrSpace(unsigned &AddrSpace, LocTy &Loc,
+                                           bool &AteExtraComma) {
+  AteExtraComma = false;
+  while (EatIfPresent(lltok::comma)) {
+    switch (Lex.getKind()) {
+    case lltok::MetadataVar:
+      // The comma introduced trailing metadata: flag it and stop early.
+      AteExtraComma = true;
+      return false;
+    case lltok::kw_addrspace:
+      Loc = Lex.getLoc();
+      if (ParseOptionalAddrSpace(AddrSpace))
+        return true;
+      break;
+    default:
+      Loc = Lex.getLoc();
+      return Error(Loc, "expected metadata or 'addrspace'");
+    }
+  }
+  return false;
+}
+
 bool LLParser::parseAllocSizeArguments(unsigned &BaseSizeArg,
                                        Optional<unsigned> &HowManyArg) {
   Lex.Lex();
@@ -2098,7 +2120,6 @@ bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
   if (ParseToken(lltok::lparen, "expected '(' in call"))
     return true;
 
-  unsigned AttrIndex = 1;
   while (Lex.getKind() != lltok::rparen) {
     // If this isn't the first argument, we need a comma.
     if (!ArgList.empty() &&
@@ -2132,9 +2153,8 @@ bool LLParser::ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
       if (ParseOptionalParamAttrs(ArgAttrs) || ParseValue(ArgTy, V, PFS))
         return true;
     }
-    ArgList.push_back(ParamInfo(ArgLoc, V, AttributeSet::get(V->getContext(),
-                                                             AttrIndex++,
-                                                             ArgAttrs)));
+    ArgList.push_back(ParamInfo(
+        ArgLoc, V, AttributeSet::get(V->getContext(), ArgAttrs)));
   }
 
   if (IsMustTailCall && InVarArgsFunc)
@@ -2239,9 +2259,8 @@ bool LLParser::ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList,
     if (!FunctionType::isValidArgumentType(ArgTy))
       return Error(TypeLoc, "invalid type for function argument");
 
-    unsigned AttrIndex = 1;
-    ArgList.emplace_back(TypeLoc, ArgTy, AttributeSet::get(ArgTy->getContext(),
-                                                           AttrIndex++, Attrs),
+    ArgList.emplace_back(TypeLoc, ArgTy,
+                         AttributeSet::get(ArgTy->getContext(), Attrs),
                          std::move(Name));
 
     while (EatIfPresent(lltok::comma)) {
@@ -2268,10 +2287,9 @@ bool LLParser::ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList,
       if (!ArgTy->isFirstClassType())
         return Error(TypeLoc, "invalid type for function argument");
 
-      ArgList.emplace_back(
-          TypeLoc, ArgTy,
-          AttributeSet::get(ArgTy->getContext(), AttrIndex++, Attrs),
-          std::move(Name));
+      ArgList.emplace_back(TypeLoc, ArgTy,
+                           AttributeSet::get(ArgTy->getContext(), Attrs),
+                           std::move(Name));
     }
   }
 
@@ -2295,7 +2313,7 @@ bool LLParser::ParseFunctionType(Type *&Result) {
   for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
     if (!ArgList[i].Name.empty())
       return Error(ArgList[i].Loc, "argument name invalid in function type");
-    if (ArgList[i].Attrs.hasAttributes(i + 1))
+    if (ArgList[i].Attrs.hasAttributes())
       return Error(ArgList[i].Loc,
                    "argument attributes invalid in function type");
   }
@@ -3908,7 +3926,8 @@ bool LLParser::ParseDIBasicType(MDNode *&Result, bool IsDistinct) {
 /// ParseDIDerivedType:
 ///   ::= !DIDerivedType(tag: DW_TAG_pointer_type, name: "int", file: !0,
 ///                      line: 7, scope: !1, baseType: !2, size: 32,
-///                      align: 32, offset: 0, flags: 0, extraData: !3)
+///                      align: 32, offset: 0, flags: 0, extraData: !3,
+///                      dwarfAddressSpace: 3)
 bool LLParser::ParseDIDerivedType(MDNode *&Result, bool IsDistinct) {
 #define VISIT_MD_FIELDS(OPTIONAL, REQUIRED)                                    \
   REQUIRED(tag, DwarfTagField, );                                              \
@@ -3921,14 +3940,20 @@ bool LLParser::ParseDIDerivedType(MDNode *&Result, bool IsDistinct) {
   OPTIONAL(align, MDUnsignedField, (0, UINT32_MAX));                           \
   OPTIONAL(offset, MDUnsignedField, (0, UINT64_MAX));                          \
   OPTIONAL(flags, DIFlagField, );                                              \
-  OPTIONAL(extraData, MDField, );
+  OPTIONAL(extraData, MDField, );                                              \
+  OPTIONAL(dwarfAddressSpace, MDUnsignedField, (UINT32_MAX, UINT32_MAX));
   PARSE_MD_FIELDS();
 #undef VISIT_MD_FIELDS
 
+  Optional<unsigned> DWARFAddressSpace;
+  if (dwarfAddressSpace.Val != UINT32_MAX)
+    DWARFAddressSpace = dwarfAddressSpace.Val;
+
   Result = GET_OR_DISTINCT(DIDerivedType,
                            (Context, tag.Val, name.Val, file.Val, line.Val,
                             scope.Val, baseType.Val, size.Val, align.Val,
-                            offset.Val, flags.Val, extraData.Val));
+                            offset.Val, DWARFAddressSpace, flags.Val,
+                            extraData.Val));
   return false;
 }
 
@@ -4029,7 +4054,8 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) {
   OPTIONAL(imports, MDField, );                                                \
   OPTIONAL(macros, MDField, );                                                 \
   OPTIONAL(dwoId, MDUnsignedField, );                                          \
-  OPTIONAL(splitDebugInlining, MDBoolField, = true);
+  OPTIONAL(splitDebugInlining, MDBoolField, = true);                           \
+  OPTIONAL(debugInfoForProfiling, MDBoolField, = false);
   PARSE_MD_FIELDS();
 #undef VISIT_MD_FIELDS
 
@@ -4037,7 +4063,7 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) {
       Context, language.Val, file.Val, producer.Val, isOptimized.Val, flags.Val,
       runtimeVersion.Val, splitDebugFilename.Val, emissionKind.Val, enums.Val,
       retainedTypes.Val, globals.Val, imports.Val, macros.Val, dwoId.Val,
-      splitDebugInlining.Val);
+      splitDebugInlining.Val, debugInfoForProfiling.Val);
   return false;
 }
 
@@ -4589,6 +4615,9 @@ bool LLParser::parseConstantValue(Type *Ty, Constant *&C) {
     C = cast<Constant>(V);
     return false;
   }
+  case ValID::t_Null:
+    C = Constant::getNullValue(Ty);
+    return false;
   default:
     return Error(Loc, "expected a constant value");
   }
@@ -4735,25 +4764,16 @@ bool LLParser::ParseFunctionHeader(Function *&Fn, bool isDefine) {
   std::vector<Type*> ParamTypeList;
   SmallVector<AttributeSet, 8> Attrs;
 
-  if (RetAttrs.hasAttributes())
-    Attrs.push_back(AttributeSet::get(RetType->getContext(),
-                                      AttributeSet::ReturnIndex,
-                                      RetAttrs));
+  Attrs.push_back(AttributeSet::get(Context, RetAttrs));
 
   for (unsigned i = 0, e = ArgList.size(); i != e; ++i) {
     ParamTypeList.push_back(ArgList[i].Ty);
-    if (ArgList[i].Attrs.hasAttributes(i + 1)) {
-      AttrBuilder B(ArgList[i].Attrs, i + 1);
-      Attrs.push_back(AttributeSet::get(RetType->getContext(), i + 1, B));
-    }
+    Attrs.push_back(ArgList[i].Attrs);
   }
 
-  if (FuncAttrs.hasAttributes())
-    Attrs.push_back(AttributeSet::get(RetType->getContext(),
-                                      AttributeSet::FunctionIndex,
-                                      FuncAttrs));
+  Attrs.push_back(AttributeSet::get(Context, FuncAttrs));
 
-  AttributeSet PAL = AttributeSet::get(Context, Attrs);
+  AttributeList PAL = AttributeList::get(Context, Attrs);
 
   if (PAL.hasAttribute(1, Attribute::StructRet) && !RetType->isVoidTy())
     return Error(RetTypeLoc, "functions with 'sret' argument must return void");
@@ -5364,10 +5384,7 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
 
   // Set up the Attribute for the function.
   SmallVector<AttributeSet, 8> Attrs;
-  if (RetAttrs.hasAttributes())
-    Attrs.push_back(AttributeSet::get(RetType->getContext(),
-                                      AttributeSet::ReturnIndex,
-                                      RetAttrs));
+  Attrs.push_back(AttributeSet::get(Context, RetAttrs));
 
   SmallVector<Value*, 8> Args;
 
@@ -5387,26 +5404,19 @@ bool LLParser::ParseInvoke(Instruction *&Inst, PerFunctionState &PFS) {
       return Error(ArgList[i].Loc, "argument is not of expected type '" +
                    getTypeString(ExpectedTy) + "'");
     Args.push_back(ArgList[i].V);
-    if (ArgList[i].Attrs.hasAttributes(i + 1)) {
-      AttrBuilder B(ArgList[i].Attrs, i + 1);
-      Attrs.push_back(AttributeSet::get(RetType->getContext(), i + 1, B));
-    }
+    Attrs.push_back(ArgList[i].Attrs);
   }
 
   if (I != E)
     return Error(CallLoc, "not enough parameters specified for call");
 
-  if (FnAttrs.hasAttributes()) {
-    if (FnAttrs.hasAlignmentAttr())
-      return Error(CallLoc, "invoke instructions may not have an alignment");
+  if (FnAttrs.hasAlignmentAttr())
+    return Error(CallLoc, "invoke instructions may not have an alignment");
 
-    Attrs.push_back(AttributeSet::get(RetType->getContext(),
-                                      AttributeSet::FunctionIndex,
-                                      FnAttrs));
-  }
+  Attrs.push_back(AttributeSet::get(Context, FnAttrs));
 
   // Finish off the Attribute and check them
-  AttributeSet PAL = AttributeSet::get(Context, Attrs);
+  AttributeList PAL = AttributeList::get(Context, Attrs);
 
   InvokeInst *II =
       InvokeInst::Create(Ty, Callee, NormalBB, UnwindBB, Args, BundleList);
@@ -5968,10 +5978,7 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
 
   // Set up the Attribute for the function.
   SmallVector<AttributeSet, 8> Attrs;
-  if (RetAttrs.hasAttributes())
-    Attrs.push_back(AttributeSet::get(RetType->getContext(),
-                                      AttributeSet::ReturnIndex,
-                                      RetAttrs));
+  Attrs.push_back(AttributeSet::get(Context, RetAttrs));
 
   SmallVector<Value*, 8> Args;
 
@@ -5991,26 +5998,19 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
       return Error(ArgList[i].Loc, "argument is not of expected type '" +
                    getTypeString(ExpectedTy) + "'");
     Args.push_back(ArgList[i].V);
-    if (ArgList[i].Attrs.hasAttributes(i + 1)) {
-      AttrBuilder B(ArgList[i].Attrs, i + 1);
-      Attrs.push_back(AttributeSet::get(RetType->getContext(), i + 1, B));
-    }
+    Attrs.push_back(ArgList[i].Attrs);
   }
 
   if (I != E)
     return Error(CallLoc, "not enough parameters specified for call");
 
-  if (FnAttrs.hasAttributes()) {
-    if (FnAttrs.hasAlignmentAttr())
-      return Error(CallLoc, "call instructions may not have an alignment");
+  if (FnAttrs.hasAlignmentAttr())
+    return Error(CallLoc, "call instructions may not have an alignment");
 
-    Attrs.push_back(AttributeSet::get(RetType->getContext(),
-                                      AttributeSet::FunctionIndex,
-                                      FnAttrs));
-  }
+  Attrs.push_back(AttributeSet::get(Context, FnAttrs));
 
   // Finish off the Attribute and check them
-  AttributeSet PAL = AttributeSet::get(Context, Attrs);
+  AttributeList PAL = AttributeList::get(Context, Attrs);
 
   CallInst *CI = CallInst::Create(Ty, Callee, Args, BundleList);
   CI->setTailCallKind(TCK);
@@ -6032,8 +6032,9 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
 ///       (',' 'align' i32)?
 int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
   Value *Size = nullptr;
-  LocTy SizeLoc, TyLoc;
+  LocTy SizeLoc, TyLoc, ASLoc;
   unsigned Alignment = 0;
+  unsigned AddrSpace = 0;
   Type *Ty = nullptr;
 
   bool IsInAlloca = EatIfPresent(lltok::kw_inalloca);
@@ -6047,12 +6048,21 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
   bool AteExtraComma = false;
   if (EatIfPresent(lltok::comma)) {
     if (Lex.getKind() == lltok::kw_align) {
-      if (ParseOptionalAlignment(Alignment)) return true;
+      if (ParseOptionalAlignment(Alignment))
+        return true;
+      if (ParseOptionalCommaAddrSpace(AddrSpace, ASLoc, AteExtraComma))
+        return true;
+    } else if (Lex.getKind() == lltok::kw_addrspace) {
+      ASLoc = Lex.getLoc();
+      if (ParseOptionalAddrSpace(AddrSpace))
+        return true;
     } else if (Lex.getKind() == lltok::MetadataVar) {
       AteExtraComma = true;
     } else {
       if (ParseTypeAndValue(Size, SizeLoc, PFS) ||
-          ParseOptionalCommaAlign(Alignment, AteExtraComma))
+          ParseOptionalCommaAlign(Alignment, AteExtraComma) ||
+          (!AteExtraComma &&
+           ParseOptionalCommaAddrSpace(AddrSpace, ASLoc, AteExtraComma)))
         return true;
     }
   }
@@ -6060,7 +6070,14 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
   if (Size && !Size->getType()->isIntegerTy())
     return Error(SizeLoc, "element count must have integer type");
 
-  AllocaInst *AI = new AllocaInst(Ty, Size, Alignment);
+  const DataLayout &DL = M->getDataLayout();
+  unsigned AS = DL.getAllocaAddrSpace();
+  if (AS != AddrSpace) {
+    // TODO: In the future it should be possible to specify addrspace per-alloca.
+    return Error(ASLoc, "address space must match datalayout");
+  }
+
+  AllocaInst *AI = new AllocaInst(Ty, AS, Size, Alignment);
   AI->setUsedWithInAlloca(IsInAlloca);
   AI->setSwiftError(IsSwiftError);
   Inst = AI;
diff --git a/lib/AsmParser/LLParser.h b/lib/AsmParser/LLParser.h
index 16d4e8b5baa04aa33776738bbd850eaf801a86ca..4616c2e86947c7b64b24288f4dbf2fc0b6cf8296 100644
--- a/lib/AsmParser/LLParser.h
+++ b/lib/AsmParser/LLParser.h
@@ -193,6 +193,10 @@ namespace llvm {
         case lltok::kw_ninf: FMF.setNoInfs();          Lex.Lex(); continue;
         case lltok::kw_nsz:  FMF.setNoSignedZeros();   Lex.Lex(); continue;
         case lltok::kw_arcp: FMF.setAllowReciprocal(); Lex.Lex(); continue;
+        case lltok::kw_contract:
+          FMF.setAllowContract(true);
+          Lex.Lex();
+          continue;
         default: return FMF;
         }
       return FMF;
@@ -242,6 +246,8 @@ namespace llvm {
     bool ParseOrdering(AtomicOrdering &Ordering);
     bool ParseOptionalStackAlignment(unsigned &Alignment);
     bool ParseOptionalCommaAlign(unsigned &Alignment, bool &AteExtraComma);
+    bool ParseOptionalCommaAddrSpace(unsigned &AddrSpace, LocTy &Loc,
+                                     bool &AteExtraComma);
     bool ParseOptionalCommaInAlloca(bool &IsInAlloca);
     bool parseAllocSizeArguments(unsigned &ElemSizeArg,
                                  Optional<unsigned> &HowManyArg);
@@ -393,7 +399,7 @@ namespace llvm {
       Value *V;
       AttributeSet Attrs;
       ParamInfo(LocTy loc, Value *v, AttributeSet attrs)
-        : Loc(loc), V(v), Attrs(attrs) {}
+          : Loc(loc), V(v), Attrs(attrs) {}
     };
     bool ParseParameterList(SmallVectorImpl<ParamInfo> &ArgList,
                             PerFunctionState &PFS,
@@ -447,7 +453,7 @@ namespace llvm {
       AttributeSet Attrs;
       std::string Name;
       ArgInfo(LocTy L, Type *ty, AttributeSet Attr, const std::string &N)
-        : Loc(L), Ty(ty), Attrs(Attr), Name(N) {}
+          : Loc(L), Ty(ty), Attrs(Attr), Name(N) {}
     };
     bool ParseArgumentList(SmallVectorImpl<ArgInfo> &ArgList, bool &isVarArg);
     bool ParseFunctionHeader(Function *&Fn, bool isDefine);
diff --git a/lib/AsmParser/LLToken.h b/lib/AsmParser/LLToken.h
index 048aeee90b359b141737b776e31a26088b30529b..33f8e63daa059749f8d99ede31dbe1ea14068a7a 100644
--- a/lib/AsmParser/LLToken.h
+++ b/lib/AsmParser/LLToken.h
@@ -98,6 +98,7 @@ enum Kind {
   kw_ninf,
   kw_nsz,
   kw_arcp,
+  kw_contract,
   kw_fast,
   kw_nuw,
   kw_nsw,
diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp
index a46e49ccde83e4622a2558f7ff7c94ee1f801a09..fdd8024a9b05425b9893d358186d946f10bf6550 100644
--- a/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -419,10 +419,10 @@ class BitcodeReader : public BitcodeReaderBase, public GVMaterializer {
 
   /// The set of attributes by index.  Index zero in the file is for null, and
   /// is thus not represented here.  As such all indices are off by one.
-  std::vector<AttributeSet> MAttributes;
+  std::vector<AttributeList> MAttributes;
 
   /// The set of attribute groups.
-  std::map<unsigned, AttributeSet> MAttributeGroups;
+  std::map<unsigned, AttributeList> MAttributeGroups;
 
   /// While parsing a function body, this is a list of the basic blocks for the
   /// function.
@@ -520,10 +520,10 @@ private:
     return FunctionBBs[ID];
   }
 
-  AttributeSet getAttributes(unsigned i) const {
+  AttributeList getAttributes(unsigned i) const {
     if (i-1 < MAttributes.size())
       return MAttributes[i-1];
-    return AttributeSet();
+    return AttributeList();
   }
 
   /// Read a value/type pair out of the specified record from slot 'Slot'.
@@ -971,6 +971,8 @@ static FastMathFlags getDecodedFastMathFlags(unsigned Val) {
     FMF.setNoSignedZeros();
   if (0 != (Val & FastMathFlags::AllowReciprocal))
     FMF.setAllowReciprocal();
+  if (0 != (Val & FastMathFlags::AllowContract))
+    FMF.setAllowContract(true);
   return FMF;
 }
 
@@ -1132,7 +1134,7 @@ Error BitcodeReader::parseAttributeBlock() {
 
   SmallVector<uint64_t, 64> Record;
 
-  SmallVector<AttributeSet, 8> Attrs;
+  SmallVector<AttributeList, 8> Attrs;
 
   // Read all the records.
   while (true) {
@@ -1162,10 +1164,10 @@ Error BitcodeReader::parseAttributeBlock() {
       for (unsigned i = 0, e = Record.size(); i != e; i += 2) {
         AttrBuilder B;
         decodeLLVMAttributesForBitcode(B, Record[i+1]);
-        Attrs.push_back(AttributeSet::get(Context, Record[i], B));
+        Attrs.push_back(AttributeList::get(Context, Record[i], B));
       }
 
-      MAttributes.push_back(AttributeSet::get(Context, Attrs));
+      MAttributes.push_back(AttributeList::get(Context, Attrs));
       Attrs.clear();
       break;
     }
@@ -1173,7 +1175,7 @@ Error BitcodeReader::parseAttributeBlock() {
       for (unsigned i = 0, e = Record.size(); i != e; ++i)
         Attrs.push_back(MAttributeGroups[Record[i]]);
 
-      MAttributes.push_back(AttributeSet::get(Context, Attrs));
+      MAttributes.push_back(AttributeList::get(Context, Attrs));
       Attrs.clear();
       break;
     }
@@ -1391,7 +1393,7 @@ Error BitcodeReader::parseAttributeGroupBlock() {
         }
       }
 
-      MAttributeGroups[GrpID] = AttributeSet::get(Context, Idx, B);
+      MAttributeGroups[GrpID] = AttributeList::get(Context, Idx, B);
       break;
     }
     }
@@ -1794,22 +1796,16 @@ Error BitcodeReader::parseValueSymbolTable(uint64_t Offset) {
         return Err;
       Value *V = ValOrErr.get();
 
-      auto *GO = dyn_cast<GlobalObject>(V);
-      if (!GO) {
-        // If this is an alias, need to get the actual Function object
-        // it aliases, in order to set up the DeferredFunctionInfo entry below.
-        auto *GA = dyn_cast<GlobalAlias>(V);
-        if (GA)
-          GO = GA->getBaseObject();
-        assert(GO);
-      }
+      auto *F = dyn_cast<Function>(V);
+      // Ignore function offsets emitted for aliases of functions in older
+      // versions of LLVM.
+      if (!F)
+        break;
 
       // Note that we subtract 1 here because the offset is relative to one word
       // before the start of the identification or module block, which was
       // historically always the start of the regular bitcode header.
       uint64_t FuncWordOffset = Record[1] - 1;
-      Function *F = dyn_cast<Function>(GO);
-      assert(F);
       uint64_t FuncBitOffset = FuncWordOffset * 32;
       DeferredFunctionInfo[F] = FuncBitOffset + FuncBitcodeOffsetDelta;
       // Set the LastFunctionBlockBit to point to the last function block.
@@ -3058,13 +3054,6 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
       IndirectSymbolInits.push_back(std::make_pair(NewGA, Val));
       break;
     }
-    /// MODULE_CODE_PURGEVALS: [numvals]
-    case bitc::MODULE_CODE_PURGEVALS:
-      // Trim down the value list to the specified size.
-      if (Record.size() < 1 || Record[0] > ValueList.size())
-        return error("Invalid record");
-      ValueList.shrinkTo(Record[0]);
-      break;
     /// MODULE_CODE_VSTOFFSET: [offset]
     case bitc::MODULE_CODE_VSTOFFSET:
       if (Record.size() < 1)
@@ -3840,7 +3829,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       if (Record.size() < 4)
         return error("Invalid record");
       unsigned OpNum = 0;
-      AttributeSet PAL = getAttributes(Record[OpNum++]);
+      AttributeList PAL = getAttributes(Record[OpNum++]);
       unsigned CCInfo = Record[OpNum++];
       BasicBlock *NormalBB = getBasicBlock(Record[OpNum++]);
       BasicBlock *UnwindBB = getBasicBlock(Record[OpNum++]);
@@ -4017,7 +4006,12 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
       }
       if (!Ty || !Size)
         return error("Invalid record");
-      AllocaInst *AI = new AllocaInst(Ty, Size, Align);
+
+      // FIXME: Make this an optional field.
+      const DataLayout &DL = TheModule->getDataLayout();
+      unsigned AS = DL.getAllocaAddrSpace();
+
+      AllocaInst *AI = new AllocaInst(Ty, AS, Size, Align);
       AI->setUsedWithInAlloca(InAlloca);
       AI->setSwiftError(SwiftError);
       I = AI;
@@ -4225,7 +4219,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
         return error("Invalid record");
 
       unsigned OpNum = 0;
-      AttributeSet PAL = getAttributes(Record[OpNum++]);
+      AttributeList PAL = getAttributes(Record[OpNum++]);
       unsigned CCInfo = Record[OpNum++];
 
       FastMathFlags FMF;
@@ -4753,33 +4747,13 @@ Error ModuleSummaryIndexBitcodeReader::parseModule(StringRef ModulePath) {
           // was historically always the start of the regular bitcode header.
           VSTOffset = Record[0] - 1;
           break;
-        // GLOBALVAR: [pointer type, isconst, initid,
-        //             linkage, alignment, section, visibility, threadlocal,
-        //             unnamed_addr, externally_initialized, dllstorageclass,
-        //             comdat]
-        case bitc::MODULE_CODE_GLOBALVAR: {
-          if (Record.size() < 6)
-            return error("Invalid record");
-          uint64_t RawLinkage = Record[3];
-          GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage);
-          ValueIdToLinkageMap[ValueId++] = Linkage;
-          break;
-        }
-        // FUNCTION:  [type, callingconv, isproto, linkage, paramattr,
-        //             alignment, section, visibility, gc, unnamed_addr,
-        //             prologuedata, dllstorageclass, comdat, prefixdata]
-        case bitc::MODULE_CODE_FUNCTION: {
-          if (Record.size() < 8)
-            return error("Invalid record");
-          uint64_t RawLinkage = Record[3];
-          GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage);
-          ValueIdToLinkageMap[ValueId++] = Linkage;
-          break;
-        }
-        // ALIAS: [alias type, addrspace, aliasee val#, linkage, visibility,
-        // dllstorageclass]
+        // GLOBALVAR: [pointer type, isconst,     initid,       linkage, ...]
+        // FUNCTION:  [type,         callingconv, isproto,      linkage, ...]
+        // ALIAS:     [alias type,   addrspace,   aliasee val#, linkage, ...]
+        case bitc::MODULE_CODE_GLOBALVAR:
+        case bitc::MODULE_CODE_FUNCTION:
         case bitc::MODULE_CODE_ALIAS: {
-          if (Record.size() < 6)
+          if (Record.size() <= 3)
             return error("Invalid record");
           uint64_t RawLinkage = Record[3];
           GlobalValue::LinkageTypes Linkage = getDecodedLinkage(RawLinkage);
@@ -4846,8 +4820,17 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
   // Keep around the last seen summary to be used when we see an optional
   // "OriginalName" attachement.
   GlobalValueSummary *LastSeenSummary = nullptr;
+  GlobalValue::GUID LastSeenGUID = 0;
   bool Combined = false;
+
+  // We can expect to see any number of type ID information records before
+  // each function summary record; these variables store the information
+  // collected so far so that it can be used to create the summary object.
   std::vector<GlobalValue::GUID> PendingTypeTests;
+  std::vector<FunctionSummary::VFuncId> PendingTypeTestAssumeVCalls,
+      PendingTypeCheckedLoadVCalls;
+  std::vector<FunctionSummary::ConstVCall> PendingTypeTestAssumeConstVCalls,
+      PendingTypeCheckedLoadConstVCalls;
 
   while (true) {
     BitstreamEntry Entry = Stream.advanceSkippingSubblocks();
@@ -4914,8 +4897,15 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
           IsOldProfileFormat, HasProfile);
       auto FS = llvm::make_unique<FunctionSummary>(
           Flags, InstCount, std::move(Refs), std::move(Calls),
-          std::move(PendingTypeTests));
+          std::move(PendingTypeTests), std::move(PendingTypeTestAssumeVCalls),
+          std::move(PendingTypeCheckedLoadVCalls),
+          std::move(PendingTypeTestAssumeConstVCalls),
+          std::move(PendingTypeCheckedLoadConstVCalls));
       PendingTypeTests.clear();
+      PendingTypeTestAssumeVCalls.clear();
+      PendingTypeCheckedLoadVCalls.clear();
+      PendingTypeTestAssumeConstVCalls.clear();
+      PendingTypeCheckedLoadConstVCalls.clear();
       auto GUID = getGUIDFromValueId(ValueID);
       FS->setModulePath(TheIndex.addModulePath(ModulePath, 0)->first());
       FS->setOriginalName(GUID.second);
@@ -4989,9 +4979,17 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
       GlobalValue::GUID GUID = getGUIDFromValueId(ValueID).first;
       auto FS = llvm::make_unique<FunctionSummary>(
           Flags, InstCount, std::move(Refs), std::move(Edges),
-          std::move(PendingTypeTests));
+          std::move(PendingTypeTests), std::move(PendingTypeTestAssumeVCalls),
+          std::move(PendingTypeCheckedLoadVCalls),
+          std::move(PendingTypeTestAssumeConstVCalls),
+          std::move(PendingTypeCheckedLoadConstVCalls));
       PendingTypeTests.clear();
+      PendingTypeTestAssumeVCalls.clear();
+      PendingTypeCheckedLoadVCalls.clear();
+      PendingTypeTestAssumeConstVCalls.clear();
+      PendingTypeCheckedLoadConstVCalls.clear();
       LastSeenSummary = FS.get();
+      LastSeenGUID = GUID;
       FS->setModulePath(ModuleIdMap[ModuleId]);
       TheIndex.addGlobalValueSummary(GUID, std::move(FS));
       Combined = true;
@@ -5018,6 +5016,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
       AS->setAliasee(AliaseeInModule);
 
       GlobalValue::GUID GUID = getGUIDFromValueId(ValueID).first;
+      LastSeenGUID = GUID;
       TheIndex.addGlobalValueSummary(GUID, std::move(AS));
       Combined = true;
       break;
@@ -5034,6 +5033,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
       LastSeenSummary = FS.get();
       FS->setModulePath(ModuleIdMap[ModuleId]);
       GlobalValue::GUID GUID = getGUIDFromValueId(ValueID).first;
+      LastSeenGUID = GUID;
       TheIndex.addGlobalValueSummary(GUID, std::move(FS));
       Combined = true;
       break;
@@ -5044,8 +5044,10 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
       if (!LastSeenSummary)
         return error("Name attachment that does not follow a combined record");
       LastSeenSummary->setOriginalName(OriginalName);
+      TheIndex.addOriginalName(LastSeenGUID, OriginalName);
       // Reset the LastSeenSummary
       LastSeenSummary = nullptr;
+      LastSeenGUID = 0;
       break;
     }
     case bitc::FS_TYPE_TESTS: {
@@ -5054,6 +5056,41 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(
                               Record.end());
       break;
     }
+    case bitc::FS_TYPE_TEST_ASSUME_VCALLS: {
+      assert(PendingTypeTestAssumeVCalls.empty());
+      // The record is a flat list of (GUID, Offset) pairs; an odd length
+      // would make Record[I+1] read past the end.
+      if (Record.size() % 2 != 0)
+        return error("Invalid record");
+      for (unsigned I = 0; I != Record.size(); I += 2)
+        PendingTypeTestAssumeVCalls.push_back({Record[I], Record[I+1]});
+      break;
+    }
+    case bitc::FS_TYPE_CHECKED_LOAD_VCALLS: {
+      assert(PendingTypeCheckedLoadVCalls.empty());
+      // Flat list of (GUID, Offset) pairs; reject an odd-length record.
+      if (Record.size() % 2 != 0)
+        return error("Invalid record");
+      for (unsigned I = 0; I != Record.size(); I += 2)
+        PendingTypeCheckedLoadVCalls.push_back({Record[I], Record[I+1]});
+      break;
+    }
+    case bitc::FS_TYPE_TEST_ASSUME_CONST_VCALL: {
+      // Needs at least the leading GUID and Offset fields before any args.
+      if (Record.size() < 2)
+        return error("Invalid record");
+      PendingTypeTestAssumeConstVCalls.push_back(
+          {{Record[0], Record[1]}, {Record.begin() + 2, Record.end()}});
+      break;
+    }
+    case bitc::FS_TYPE_CHECKED_LOAD_CONST_VCALL: {
+      // Needs at least the leading GUID and Offset fields before any args.
+      if (Record.size() < 2)
+        return error("Invalid record");
+      PendingTypeCheckedLoadConstVCalls.push_back(
+          {{Record[0], Record[1]}, {Record.begin() + 2, Record.end()}});
+      break;
+    }
     }
   }
   llvm_unreachable("Exit infinite loop");
diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp
index ab9dd06d4f048b07a9a9812df95f5203d19e351e..274dfe89cce544b5ce58d2ac298c2b8cb06b529f 100644
--- a/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -451,6 +451,7 @@ class MetadataLoader::MetadataLoaderImpl {
 
   bool StripTBAA = false;
   bool HasSeenOldLoopTags = false;
+  bool NeedUpgradeToDIGlobalVariableExpression = false;
 
   /// True if metadata is being parsed for a module being ThinLTO imported.
   bool IsImporting = false;
@@ -476,6 +477,45 @@ class MetadataLoader::MetadataLoaderImpl {
     CUSubprograms.clear();
   }
 
+  /// Upgrade old-style bare DIGlobalVariables to DIGlobalVariableExpressions.
+  void upgradeCUVariables() {
+    if (!NeedUpgradeToDIGlobalVariableExpression)
+      return;
+
+    // Upgrade list of variables attached to the CUs.
+    if (NamedMDNode *CUNodes = TheModule.getNamedMetadata("llvm.dbg.cu"))
+      for (unsigned I = 0, E = CUNodes->getNumOperands(); I != E; ++I) {
+        auto *CU = cast<DICompileUnit>(CUNodes->getOperand(I));
+        if (auto *GVs = dyn_cast_or_null<MDTuple>(CU->getRawGlobalVariables()))
+          for (unsigned J = 0; J < GVs->getNumOperands(); J++)
+            if (auto *GV =
+                    dyn_cast_or_null<DIGlobalVariable>(GVs->getOperand(J))) {
+              auto *DGVE =
+                  DIGlobalVariableExpression::getDistinct(Context, GV, nullptr);
+              GVs->replaceOperandWith(J, DGVE);
+            }
+      }
+
+    // Upgrade variables attached to globals: drop each MD_dbg attachment and
+    // re-add it, wrapping bare DIGlobalVariables in an expression node.
+    for (auto &GV : TheModule.globals()) {
+      SmallVector<MDNode *, 1> MDs;
+      GV.getMetadata(LLVMContext::MD_dbg, MDs);
+      GV.eraseMetadata(LLVMContext::MD_dbg);
+      for (auto *MD : MDs) {
+        if (auto *DGV = dyn_cast_or_null<DIGlobalVariable>(MD))
+          MD = DIGlobalVariableExpression::getDistinct(Context, DGV, nullptr);
+        GV.addMetadata(LLVMContext::MD_dbg, *MD);
+      }
+    }
+  }
+
+  /// Run all debug-info upgrades: CU subprograms first, then CU variables.
+  void upgradeDebugInfo() {
+    upgradeCUSubprograms();
+    upgradeCUVariables();
+  }
+
 public:
   MetadataLoaderImpl(BitstreamCursor &Stream, Module &TheModule,
                      BitcodeReaderValueList &ValueList,
@@ -527,7 +567,7 @@ public:
   void shrinkTo(unsigned N) { MetadataList.shrinkTo(N); }
 };
 
-Error error(const Twine &Message) {
+static Error error(const Twine &Message) {
   return make_error<StringError>(
       Message, make_error_code(BitcodeError::CorruptedBitcode));
 }
@@ -729,7 +769,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) {
       // Reading the named metadata created forward references and/or
       // placeholders, that we flush here.
       resolveForwardRefsAndPlaceholders(Placeholders);
-      upgradeCUSubprograms();
+      upgradeDebugInfo();
       // Return at the beginning of the block, since it is easy to skip it
       // entirely from there.
       Stream.ReadBlockEnd(); // Pop the abbrev block context.
@@ -753,7 +793,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseMetadata(bool ModuleLevel) {
       return error("Malformed block");
     case BitstreamEntry::EndBlock:
       resolveForwardRefsAndPlaceholders(Placeholders);
-      upgradeCUSubprograms();
+      upgradeDebugInfo();
       return Error::success();
     case BitstreamEntry::Record:
       // The interesting case.
@@ -1070,9 +1110,15 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     break;
   }
   case bitc::METADATA_DERIVED_TYPE: {
-    if (Record.size() != 12)
+    if (Record.size() < 12 || Record.size() > 13)
       return error("Invalid record");
 
+    // DWARF address space is encoded as N->getDWARFAddressSpace() + 1. 0 means
+    // that there is no DWARF address space associated with DIDerivedType.
+    Optional<unsigned> DWARFAddressSpace;
+    if (Record.size() > 12 && Record[12])
+      DWARFAddressSpace = Record[12] - 1;
+
     IsDistinct = Record[0];
     DINode::DIFlags Flags = static_cast<DINode::DIFlags>(Record[10]);
     MetadataList.assignValue(
@@ -1081,7 +1127,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
                          getMDOrNull(Record[3]), Record[4],
                          getDITypeRefOrNull(Record[5]),
                          getDITypeRefOrNull(Record[6]), Record[7], Record[8],
-                         Record[9], Flags, getDITypeRefOrNull(Record[11]))),
+                         Record[9], DWARFAddressSpace, Flags,
+                         getDITypeRefOrNull(Record[11]))),
         NextMetadataNo);
     NextMetadataNo++;
     break;
@@ -1203,7 +1250,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     break;
   }
   case bitc::METADATA_COMPILE_UNIT: {
-    if (Record.size() < 14 || Record.size() > 17)
+    if (Record.size() < 14 || Record.size() > 18)
       return error("Invalid record");
 
     // Ignore Record[0], which indicates whether this compile unit is
@@ -1216,7 +1263,8 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
         getMDOrNull(Record[12]), getMDOrNull(Record[13]),
         Record.size() <= 15 ? nullptr : getMDOrNull(Record[15]),
         Record.size() <= 14 ? 0 : Record[14],
-        Record.size() <= 16 ? true : Record[16]);
+        Record.size() <= 16 ? true : Record[16],
+        Record.size() <= 17 ? false : Record[17]);
 
     MetadataList.assignValue(CU, NextMetadataNo);
     NextMetadataNo++;
@@ -1396,6 +1444,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     } else if (Version == 0) {
       // Upgrade old metadata, which stored a global variable reference or a
       // ConstantInt here.
+      NeedUpgradeToDIGlobalVariableExpression = true;
       Metadata *Expr = getMDOrNull(Record[9]);
       uint32_t AlignInBits = 0;
       if (Record.size() > 11) {
@@ -1423,11 +1472,15 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
            getDITypeRefOrNull(Record[6]), Record[7], Record[8],
            getMDOrNull(Record[10]), AlignInBits));
 
-      auto *DGVE = DIGlobalVariableExpression::getDistinct(Context, DGV, Expr);
-      MetadataList.assignValue(DGVE, NextMetadataNo);
-      NextMetadataNo++;
+      DIGlobalVariableExpression *DGVE = nullptr;
+      if (Attach || Expr)
+        DGVE = DIGlobalVariableExpression::getDistinct(Context, DGV, Expr);
       if (Attach)
         Attach->addDebugInfo(DGVE);
+
+      auto *MDNode = Expr ? cast<Metadata>(DGVE) : cast<Metadata>(DGV);
+      MetadataList.assignValue(MDNode, NextMetadataNo);
+      NextMetadataNo++;
     } else
       return error("Invalid record");
 
@@ -1442,7 +1495,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
     bool HasAlignment = Record[0] & 2;
     // 2nd field used to be an artificial tag, either DW_TAG_auto_variable or
     // DW_TAG_arg_variable, if we have alignment flag encoded it means, that
-    // this is newer version of record which doesn't have artifical tag.
+    // this is newer version of record which doesn't have artificial tag.
     bool HasTag = !HasAlignment && Record.size() > 8;
     DINode::DIFlags Flags = static_cast<DINode::DIFlags>(Record[7 + HasTag]);
     uint32_t AlignInBits = 0;
diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp
index ebb2022551f784bdc01410764eebbce0ae049c3f..043441bac4dead056ccbb51d272f9d10b545d6dd 100644
--- a/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -108,6 +108,14 @@ class ModuleBitcodeWriter : public BitcodeWriterBase {
   /// True if a module hash record should be written.
   bool GenerateHash;
 
+  /// If non-null, when GenerateHash is true, the resulting hash is written
+  /// into ModHash. When GenerateHash is false, that specified value
+  /// is used as the hash instead of computing from the generated bitcode.
+  /// Can be used to produce the same module hash for a minimized bitcode
+  /// used just for the thin link as in the regular full bitcode that will
+  /// be used in the backend.
+  ModuleHash *ModHash;
+
   /// The start bit of the identification block.
   uint64_t BitcodeStartBit;
 
@@ -124,10 +132,12 @@ public:
   /// writing to the provided \p Buffer.
   ModuleBitcodeWriter(const Module *M, SmallVectorImpl<char> &Buffer,
                       BitstreamWriter &Stream, bool ShouldPreserveUseListOrder,
-                      const ModuleSummaryIndex *Index, bool GenerateHash)
+                      const ModuleSummaryIndex *Index, bool GenerateHash,
+                      ModuleHash *ModHash = nullptr)
       : BitcodeWriterBase(Stream), Buffer(Buffer), M(*M),
         VE(*M, ShouldPreserveUseListOrder), Index(Index),
-        GenerateHash(GenerateHash), BitcodeStartBit(Stream.GetCurrentBitNo()) {
+        GenerateHash(GenerateHash), ModHash(ModHash),
+        BitcodeStartBit(Stream.GetCurrentBitNo()) {
     // Assign ValueIds to any callee values in the index that came from
     // indirect call profiles and were recorded as a GUID not a Value*
     // (which would have been assigned an ID by the ValueEnumerator).
@@ -466,7 +476,6 @@ public:
   void write();
 
 private:
-  void writeIndex();
   void writeModStrings();
   void writeCombinedValueSymbolTable();
   void writeCombinedGlobalValueSummary();
@@ -709,22 +718,22 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
 }
 
 void ModuleBitcodeWriter::writeAttributeGroupTable() {
-  const std::vector<AttributeSet> &AttrGrps = VE.getAttributeGroups();
+  const std::vector<AttributeList> &AttrGrps = VE.getAttributeGroups();
   if (AttrGrps.empty()) return;
 
   Stream.EnterSubblock(bitc::PARAMATTR_GROUP_BLOCK_ID, 3);
 
   SmallVector<uint64_t, 64> Record;
   for (unsigned i = 0, e = AttrGrps.size(); i != e; ++i) {
-    AttributeSet AS = AttrGrps[i];
+    AttributeList AS = AttrGrps[i];
     for (unsigned i = 0, e = AS.getNumSlots(); i != e; ++i) {
-      AttributeSet A = AS.getSlotAttributes(i);
+      AttributeList A = AS.getSlotAttributes(i);
 
       Record.push_back(VE.getAttributeGroupID(A));
       Record.push_back(AS.getSlotIndex(i));
 
-      for (AttributeSet::iterator I = AS.begin(0), E = AS.end(0);
-           I != E; ++I) {
+      for (AttributeList::iterator I = AS.begin(0), E = AS.end(0); I != E;
+           ++I) {
         Attribute Attr = *I;
         if (Attr.isEnumAttribute()) {
           Record.push_back(0);
@@ -756,14 +765,14 @@ void ModuleBitcodeWriter::writeAttributeGroupTable() {
 }
 
 void ModuleBitcodeWriter::writeAttributeTable() {
-  const std::vector<AttributeSet> &Attrs = VE.getAttributes();
+  const std::vector<AttributeList> &Attrs = VE.getAttributes();
   if (Attrs.empty()) return;
 
   Stream.EnterSubblock(bitc::PARAMATTR_BLOCK_ID, 3);
 
   SmallVector<uint64_t, 64> Record;
   for (unsigned i = 0, e = Attrs.size(); i != e; ++i) {
-    const AttributeSet &A = Attrs[i];
+    const AttributeList &A = Attrs[i];
     for (unsigned i = 0, e = A.getNumSlots(); i != e; ++i)
       Record.push_back(VE.getAttributeGroupID(A.getSlotAttributes(i)));
 
@@ -1326,6 +1335,8 @@ static uint64_t getOptimizationFlags(const Value *V) {
       Flags |= FastMathFlags::NoSignedZeros;
     if (FPMO->hasAllowReciprocal())
       Flags |= FastMathFlags::AllowReciprocal;
+    if (FPMO->hasAllowContract())
+      Flags |= FastMathFlags::AllowContract;
   }
 
   return Flags;
@@ -1473,6 +1484,13 @@ void ModuleBitcodeWriter::writeDIDerivedType(const DIDerivedType *N,
   Record.push_back(N->getFlags());
   Record.push_back(VE.getMetadataOrNullID(N->getExtraData()));
 
+  // DWARF address space is encoded as N->getDWARFAddressSpace() + 1. 0 means
+  // that there is no DWARF address space associated with DIDerivedType.
+  if (const auto &DWARFAddressSpace = N->getDWARFAddressSpace())
+    Record.push_back(*DWARFAddressSpace + 1);
+  else
+    Record.push_back(0);
+
   Stream.EmitRecord(bitc::METADATA_DERIVED_TYPE, Record, Abbrev);
   Record.clear();
 }
@@ -1549,6 +1567,7 @@ void ModuleBitcodeWriter::writeDICompileUnit(const DICompileUnit *N,
   Record.push_back(N->getDWOId());
   Record.push_back(VE.getMetadataOrNullID(N->getMacros().get()));
   Record.push_back(N->getSplitDebugInlining());
+  Record.push_back(N->getDebugInfoForProfiling());
 
   Stream.EmitRecord(bitc::METADATA_COMPILE_UNIT, Record, Abbrev);
   Record.clear();
@@ -2559,7 +2578,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
       Vals.push_back(VE.getTypeID(SI.getCondition()->getType()));
       pushValue(SI.getCondition(), InstID, Vals);
       Vals.push_back(VE.getValueID(SI.getDefaultDest()));
-      for (SwitchInst::ConstCaseIt Case : SI.cases()) {
+      for (auto Case : SI.cases()) {
         Vals.push_back(VE.getValueID(Case.getCaseValue()));
         Vals.push_back(VE.getValueID(Case.getCaseSuccessor()));
       }
@@ -2905,13 +2924,6 @@ void ModuleBitcodeWriter::writeValueSymbolTable(
     NameVals.push_back(VE.getValueID(Name.getValue()));
 
     Function *F = dyn_cast<Function>(Name.getValue());
-    if (!F) {
-      // If value is an alias, need to get the aliased base object to
-      // see if it is a function.
-      auto *GA = dyn_cast<GlobalAlias>(Name.getValue());
-      if (GA && GA->getBaseObject())
-        F = dyn_cast<Function>(GA->getBaseObject());
-    }
 
     // VST_CODE_ENTRY:   [valueid, namechar x N]
     // VST_CODE_FNENTRY: [valueid, funcoffset, namechar x N]
@@ -3367,6 +3379,49 @@ void IndexBitcodeWriter::writeModStrings() {
   Stream.ExitBlock();
 }
 
+/// Emit the type metadata records of \p FS. They have to be written before
+/// the function summary entry they belong to (per-module or combined).
+static void writeFunctionTypeMetadataRecords(BitstreamWriter &Stream,
+                                             FunctionSummary *FS) {
+  if (!FS->type_tests().empty())
+    Stream.EmitRecord(bitc::FS_TYPE_TESTS, FS->type_tests());
+
+  // Scratch buffer shared by both emitters below.
+  SmallVector<uint64_t, 64> Vals;
+
+  // A non-empty list becomes one record of flattened (GUID, Offset) pairs.
+  auto EmitIds = [&](uint64_t Code, ArrayRef<FunctionSummary::VFuncId> Ids) {
+    if (Ids.empty())
+      return;
+    Vals.clear();
+    for (const auto &Id : Ids) {
+      Vals.push_back(Id.GUID);
+      Vals.push_back(Id.Offset);
+    }
+    Stream.EmitRecord(Code, Vals);
+  };
+
+  EmitIds(bitc::FS_TYPE_TEST_ASSUME_VCALLS, FS->type_test_assume_vcalls());
+  EmitIds(bitc::FS_TYPE_CHECKED_LOAD_VCALLS, FS->type_checked_load_vcalls());
+
+  // Each const vcall gets its own record: GUID, Offset, then its arguments.
+  auto EmitCalls = [&](uint64_t Code,
+                       ArrayRef<FunctionSummary::ConstVCall> Calls) {
+    for (const auto &Call : Calls) {
+      Vals.clear();
+      Vals.push_back(Call.VFunc.GUID);
+      Vals.push_back(Call.VFunc.Offset);
+      Vals.insert(Vals.end(), Call.Args.begin(), Call.Args.end());
+      Stream.EmitRecord(Code, Vals);
+    }
+  };
+
+  EmitCalls(bitc::FS_TYPE_TEST_ASSUME_CONST_VCALL,
+            FS->type_test_assume_const_vcalls());
+  EmitCalls(bitc::FS_TYPE_CHECKED_LOAD_CONST_VCALL,
+            FS->type_checked_load_const_vcalls());
+}
+
 // Helper to emit a single function summary record.
 void ModuleBitcodeWriter::writePerModuleFunctionSummaryRecord(
     SmallVector<uint64_t, 64> &NameVals, GlobalValueSummary *Summary,
@@ -3375,8 +3430,7 @@ void ModuleBitcodeWriter::writePerModuleFunctionSummaryRecord(
   NameVals.push_back(ValueID);
 
   FunctionSummary *FS = cast<FunctionSummary>(Summary);
-  if (!FS->type_tests().empty())
-    Stream.EmitRecord(bitc::FS_TYPE_TESTS, FS->type_tests());
+  writeFunctionTypeMetadataRecords(Stream, FS);
 
   NameVals.push_back(getEncodedGVSummaryFlags(FS->flags()));
   NameVals.push_back(FS->instCount());
@@ -3636,8 +3690,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
     }
 
     auto *FS = cast<FunctionSummary>(S);
-    if (!FS->type_tests().empty())
-      Stream.EmitRecord(bitc::FS_TYPE_TESTS, FS->type_tests());
+    writeFunctionTypeMetadataRecords(Stream, FS);
 
     NameVals.push_back(ValueId);
     NameVals.push_back(Index.getModuleId(FS->modulePath()));
@@ -3659,9 +3712,16 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
     for (auto &EI : FS->calls()) {
       // If this GUID doesn't have a value id, it doesn't have a function
       // summary and we don't need to record any calls to it.
-      if (!hasValueId(EI.first.getGUID()))
-        continue;
-      NameVals.push_back(getValueId(EI.first.getGUID()));
+      GlobalValue::GUID GUID = EI.first.getGUID();
+      if (!hasValueId(GUID)) {
+        // For SamplePGO, the indirect call targets for local functions will
+        // have their original names annotated in the profile. We try to find
+        // the corresponding PGOFuncName as the GUID.
+        GUID = Index.getGUIDFromOriginalID(GUID);
+        if (GUID == 0 || !hasValueId(GUID))
+          continue;
+      }
+      NameVals.push_back(getValueId(GUID));
       if (HasProfileData)
         NameVals.push_back(static_cast<uint8_t>(EI.second.Hotness));
     }
@@ -3697,7 +3757,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
 
 /// Create the "IDENTIFICATION_BLOCK_ID" containing a single string with the
 /// current llvm version, and a record for the epoch number.
-void writeIdentificationBlock(BitstreamWriter &Stream) {
+static void writeIdentificationBlock(BitstreamWriter &Stream) {
   Stream.EnterSubblock(bitc::IDENTIFICATION_BLOCK_ID, 5);
 
   // Write the "user readable" string identifying the bitcode producer
@@ -3722,17 +3782,24 @@ void writeIdentificationBlock(BitstreamWriter &Stream) {
 void ModuleBitcodeWriter::writeModuleHash(size_t BlockStartPos) {
   // Emit the module's hash.
   // MODULE_CODE_HASH: [5*i32]
-  SHA1 Hasher;
-  Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&(Buffer)[BlockStartPos],
-                                  Buffer.size() - BlockStartPos));
-  StringRef Hash = Hasher.result();
-  uint32_t Vals[5];
-  for (int Pos = 0; Pos < 20; Pos += 4) {
-    Vals[Pos / 4] = support::endian::read32be(Hash.data() + Pos);
-  }
+  if (GenerateHash) {
+    SHA1 Hasher;
+    uint32_t Vals[5];
+    Hasher.update(ArrayRef<uint8_t>((const uint8_t *)&(Buffer)[BlockStartPos],
+                                    Buffer.size() - BlockStartPos));
+    StringRef Hash = Hasher.result();
+    for (int Pos = 0; Pos < 20; Pos += 4) {
+      Vals[Pos / 4] = support::endian::read32be(Hash.data() + Pos);
+    }
 
-  // Emit the finished record.
-  Stream.EmitRecord(bitc::MODULE_CODE_HASH, Vals);
+    // Emit the finished record.
+    Stream.EmitRecord(bitc::MODULE_CODE_HASH, Vals);
+
+    if (ModHash)
+      // Save the written hash value.
+      std::copy(std::begin(Vals), std::end(Vals), std::begin(*ModHash));
+  } else if (ModHash)
+    Stream.EmitRecord(bitc::MODULE_CODE_HASH, ArrayRef<uint32_t>(*ModHash));
 }
 
 void ModuleBitcodeWriter::write() {
@@ -3793,9 +3860,7 @@ void ModuleBitcodeWriter::write() {
   writeValueSymbolTable(M.getValueSymbolTable(),
                         /* IsModuleLevel */ true, &FunctionToBitcodeIndex);
 
-  if (GenerateHash) {
-    writeModuleHash(BlockStartPos);
-  }
+  writeModuleHash(BlockStartPos);
 
   Stream.ExitBlock();
 }
@@ -3886,9 +3951,10 @@ BitcodeWriter::~BitcodeWriter() = default;
 void BitcodeWriter::writeModule(const Module *M,
                                 bool ShouldPreserveUseListOrder,
                                 const ModuleSummaryIndex *Index,
-                                bool GenerateHash) {
-  ModuleBitcodeWriter ModuleWriter(
-      M, Buffer, *Stream, ShouldPreserveUseListOrder, Index, GenerateHash);
+                                bool GenerateHash, ModuleHash *ModHash) {
+  ModuleBitcodeWriter ModuleWriter(M, Buffer, *Stream,
+                                   ShouldPreserveUseListOrder, Index,
+                                   GenerateHash, ModHash);
   ModuleWriter.write();
 }
 
@@ -3897,7 +3963,7 @@ void BitcodeWriter::writeModule(const Module *M,
 void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out,
                               bool ShouldPreserveUseListOrder,
                               const ModuleSummaryIndex *Index,
-                              bool GenerateHash) {
+                              bool GenerateHash, ModuleHash *ModHash) {
   SmallVector<char, 0> Buffer;
   Buffer.reserve(256*1024);
 
@@ -3908,7 +3974,8 @@ void llvm::WriteBitcodeToFile(const Module *M, raw_ostream &Out,
     Buffer.insert(Buffer.begin(), BWH_HeaderSize, 0);
 
   BitcodeWriter Writer(Buffer);
-  Writer.writeModule(M, ShouldPreserveUseListOrder, Index, GenerateHash);
+  Writer.writeModule(M, ShouldPreserveUseListOrder, Index, GenerateHash,
+                     ModHash);
 
   if (TT.isOSDarwin() || TT.isOSBinFormatMachO())
     emitDarwinBCHeaderAndTrailer(Buffer, TT);
diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp
index dc10094969d6b76d0f4b7ebaf8df6bfddf1f878d..3800d9abd429ab0eb4d53a983f85c458d00c2370 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -887,7 +887,7 @@ void ValueEnumerator::EnumerateOperandType(const Value *V) {
   }
 }
 
-void ValueEnumerator::EnumerateAttributes(AttributeSet PAL) {
+void ValueEnumerator::EnumerateAttributes(AttributeList PAL) {
   if (PAL.isEmpty()) return;  // null is always 0.
 
   // Do a lookup.
@@ -900,7 +900,7 @@ void ValueEnumerator::EnumerateAttributes(AttributeSet PAL) {
 
   // Do lookups for all attribute groups.
   for (unsigned i = 0, e = PAL.getNumSlots(); i != e; ++i) {
-    AttributeSet AS = PAL.getSlotAttributes(i);
+    AttributeList AS = PAL.getSlotAttributes(i);
     unsigned &Entry = AttributeGroupMap[AS];
     if (Entry == 0) {
       AttributeGroups.push_back(AS);
diff --git a/lib/Bitcode/Writer/ValueEnumerator.h b/lib/Bitcode/Writer/ValueEnumerator.h
index a8d6cf965a4b47a5e683114aac33c8e6e740f4d2..8a82aab2983637623cc5920be82a66042f391207 100644
--- a/lib/Bitcode/Writer/ValueEnumerator.h
+++ b/lib/Bitcode/Writer/ValueEnumerator.h
@@ -36,7 +36,7 @@ class LocalAsMetadata;
 class MDNode;
 class MDOperand;
 class NamedMDNode;
-class AttributeSet;
+class AttributeList;
 class ValueSymbolTable;
 class MDSymbolTable;
 class raw_ostream;
@@ -102,13 +102,13 @@ private:
 
   bool ShouldPreserveUseListOrder;
 
-  typedef DenseMap<AttributeSet, unsigned> AttributeGroupMapType;
+  typedef DenseMap<AttributeList, unsigned> AttributeGroupMapType;
   AttributeGroupMapType AttributeGroupMap;
-  std::vector<AttributeSet> AttributeGroups;
+  std::vector<AttributeList> AttributeGroups;
 
-  typedef DenseMap<AttributeSet, unsigned> AttributeMapType;
+  typedef DenseMap<AttributeList, unsigned> AttributeMapType;
   AttributeMapType AttributeMap;
-  std::vector<AttributeSet> Attribute;
+  std::vector<AttributeList> Attribute;
 
   /// GlobalBasicBlockIDs - This map memoizes the basic block ID's referenced by
   /// the "getGlobalBasicBlockID" method.
@@ -166,14 +166,14 @@ public:
   unsigned getInstructionID(const Instruction *I) const;
   void setInstructionID(const Instruction *I);
 
-  unsigned getAttributeID(AttributeSet PAL) const {
+  unsigned getAttributeID(AttributeList PAL) const {
     if (PAL.isEmpty()) return 0;  // Null maps to zero.
     AttributeMapType::const_iterator I = AttributeMap.find(PAL);
     assert(I != AttributeMap.end() && "Attribute not in ValueEnumerator!");
     return I->second;
   }
 
-  unsigned getAttributeGroupID(AttributeSet PAL) const {
+  unsigned getAttributeGroupID(AttributeList PAL) const {
     if (PAL.isEmpty()) return 0;  // Null maps to zero.
     AttributeGroupMapType::const_iterator I = AttributeGroupMap.find(PAL);
     assert(I != AttributeGroupMap.end() && "Attribute not in ValueEnumerator!");
@@ -206,10 +206,8 @@ public:
   const std::vector<const BasicBlock*> &getBasicBlocks() const {
     return BasicBlocks;
   }
-  const std::vector<AttributeSet> &getAttributes() const {
-    return Attribute;
-  }
-  const std::vector<AttributeSet> &getAttributeGroups() const {
+  const std::vector<AttributeList> &getAttributes() const { return Attribute; }
+  const std::vector<AttributeList> &getAttributeGroups() const {
     return AttributeGroups;
   }
 
@@ -283,7 +281,7 @@ private:
   void EnumerateValue(const Value *V);
   void EnumerateType(Type *T);
   void EnumerateOperandType(const Value *V);
-  void EnumerateAttributes(AttributeSet PAL);
+  void EnumerateAttributes(AttributeList PAL);
 
   void EnumerateValueSymbolTable(const ValueSymbolTable &ST);
   void EnumerateNamedMetadata(const Module &M);
diff --git a/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index bb908618b6794e7442ab7e2e7517852d115ffd35..955524c2a676e784eec8699b827cfe1aa89c17f4 100644
--- a/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -163,9 +163,11 @@ void AggressiveAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
   // callee-saved register that is not saved in the prolog.
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   BitVector Pristine = MFI.getPristineRegs(MF);
-  for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) {
+  for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I;
+       ++I) {
     unsigned Reg = *I;
-    if (!IsReturnBlock && !Pristine.test(Reg)) continue;
+    if (!IsReturnBlock && !(Pristine.test(Reg) || BB->isLiveIn(Reg)))
+      continue;
     for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
       unsigned AliasReg = *AI;
       State->UnionGroups(AliasReg, 0);
diff --git a/lib/CodeGen/Analysis.cpp b/lib/CodeGen/Analysis.cpp
index 79ecc4308fe7f605de744e5464a171a4bd201a62..09a37a77e9fbc91d404161e9e65468932d41e4fa 100644
--- a/lib/CodeGen/Analysis.cpp
+++ b/lib/CodeGen/Analysis.cpp
@@ -516,10 +516,9 @@ bool llvm::attributesPermitTailCall(const Function *F, const Instruction *I,
   bool &ADS = AllowDifferingSizes ? *AllowDifferingSizes : DummyADS;
   ADS = true;
 
-  AttrBuilder CallerAttrs(F->getAttributes(),
-                          AttributeSet::ReturnIndex);
+  AttrBuilder CallerAttrs(F->getAttributes(), AttributeList::ReturnIndex);
   AttrBuilder CalleeAttrs(cast<CallInst>(I)->getAttributes(),
-                          AttributeSet::ReturnIndex);
+                          AttributeList::ReturnIndex);
 
   // Noalias is completely benign as far as calling convention goes, it
   // shouldn't affect whether the call is a tail call.
@@ -613,25 +612,6 @@ bool llvm::returnTypeIsEligibleForTailCall(const Function *F,
   return true;
 }
 
-bool llvm::canBeOmittedFromSymbolTable(const GlobalValue *GV) {
-  if (!GV->hasLinkOnceODRLinkage())
-    return false;
-
-  // We assume that anyone who sets global unnamed_addr on a non-constant knows
-  // what they're doing.
-  if (GV->hasGlobalUnnamedAddr())
-    return true;
-
-  // If it is a non constant variable, it needs to be uniqued across shared
-  // objects.
-  if (const GlobalVariable *Var = dyn_cast<GlobalVariable>(GV)) {
-    if (!Var->isConstant())
-      return false;
-  }
-
-  return GV->hasAtLeastLocalUnnamedAddr();
-}
-
 static void collectFuncletMembers(
     DenseMap<const MachineBasicBlock *, int> &FuncletMembership, int Funclet,
     const MachineBasicBlock *MBB) {
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 0f6ca02b2a89d7650e4a1af925c5b457a41b12c4..834a59a12a6a9a0b26ec7b924dad2a7b3475c495 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -11,48 +11,102 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/AsmPrinter.h"
+#include "AsmPrinterHandler.h"
 #include "CodeViewDebug.h"
 #include "DwarfDebug.h"
 #include "DwarfException.h"
 #include "WinException.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/ObjectUtils.h"
 #include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/GCMetadata.h"
 #include "llvm/CodeGen/GCMetadataPrinter.h"
+#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBundle.h"
 #include "llvm/CodeGen/MachineJumpTableInfo.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalIFunc.h"
+#include "llvm/IR/GlobalIndirectSymbol.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Mangler.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/Value.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetOptions.h"
 #include "llvm/MC/MCValue.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ELF.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/Timer.h"
 #include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "asm-printer"
@@ -78,7 +132,6 @@ static gcp_map_type &getGCMap(void *&P) {
   return *(gcp_map_type*)P;
 }
 
-
 /// getGVAlignmentLog2 - Return the alignment to use for the specified global
 /// value in log2 form.  This rounds up to the preferred alignment if possible
 /// and legal.
@@ -107,16 +160,7 @@ static unsigned getGVAlignmentLog2(const GlobalValue *GV, const DataLayout &DL,
 
 AsmPrinter::AsmPrinter(TargetMachine &tm, std::unique_ptr<MCStreamer> Streamer)
     : MachineFunctionPass(ID), TM(tm), MAI(tm.getMCAsmInfo()),
-      OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)),
-      isCFIMoveForDebugging(false), LastMI(nullptr), LastFn(0), Counter(~0U) {
-  DD = nullptr;
-  MMI = nullptr;
-  LI = nullptr;
-  MF = nullptr;
-  CurExceptionSym = CurrentFnSym = CurrentFnSymForSize = nullptr;
-  CurrentFnBegin = nullptr;
-  CurrentFnEnd = nullptr;
-  GCMetadataPrinters = nullptr;
+      OutContext(Streamer->getContext()), OutStreamer(std::move(Streamer)) {
   VerboseAsm = OutStreamer->isVerboseAsm();
 }
 
@@ -171,6 +215,7 @@ void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
   MachineFunctionPass::getAnalysisUsage(AU);
   AU.addRequired<MachineModuleInfo>();
+  AU.addRequired<MachineOptimizationRemarkEmitterPass>();
   AU.addRequired<GCModuleInfo>();
   if (isVerbose())
     AU.addRequired<MachineLoopInfo>();
@@ -223,7 +268,7 @@ bool AsmPrinter::doInitialization(Module &M) {
   // don't, this at least helps the user find where a global came from.
   if (MAI->hasSingleParameterDotFile()) {
     // .file "foo.c"
-    OutStreamer->EmitFileDirective(M.getModuleIdentifier());
+    OutStreamer->EmitFileDirective(M.getSourceFileName());
   }
 
   GCModuleInfo *MI = getAnalysisIfAvailable<GCModuleInfo>();
@@ -571,7 +616,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
 ///
 /// \p Value - The value to emit.
 /// \p Size - The size of the integer (in bytes) to emit.
-void AsmPrinter::EmitDebugValue(const MCExpr *Value,
+void AsmPrinter::EmitDebugThreadLocal(const MCExpr *Value,
                                       unsigned Size) const {
   OutStreamer->EmitValue(Value, Size);
 }
@@ -602,8 +647,23 @@ void AsmPrinter::EmitFunctionHeader() {
   }
 
   // Emit the prefix data.
-  if (F->hasPrefixData())
-    EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData());
+  if (F->hasPrefixData()) {
+    if (MAI->hasSubsectionsViaSymbols()) {
+      // Preserving prefix data on platforms which use subsections-via-symbols
+      // is a bit tricky. Here we introduce a symbol for the prefix data
+      // and use the .alt_entry attribute to mark the function's real entry point
+      // as an alternative entry point to the prefix-data symbol.
+      MCSymbol *PrefixSym = OutContext.createLinkerPrivateTempSymbol();
+      OutStreamer->EmitLabel(PrefixSym);
+
+      EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData());
+
+      // Emit an .alt_entry directive for the actual function symbol.
+      OutStreamer->EmitSymbolAttribute(CurrentFnSym, MCSA_AltEntry);
+    } else {
+      EmitGlobalConstant(F->getParent()->getDataLayout(), F->getPrefixData());
+    }
+  }
 
   // Emit the CurrentFnSym.  This is a virtual function to allow targets to
   // do their wild and crazy things as required.
@@ -883,6 +943,7 @@ void AsmPrinter::EmitFunctionBody() {
 
   // Print out code for the function.
   bool HasAnyRealCode = false;
+  int NumInstsInFunction = 0;
   for (auto &MBB : *MF) {
     // Print a label for the basic block.
     EmitBasicBlockStart(MBB);
@@ -892,7 +953,7 @@ void AsmPrinter::EmitFunctionBody() {
       if (!MI.isPosition() && !MI.isImplicitDef() && !MI.isKill() &&
           !MI.isDebugValue()) {
         HasAnyRealCode = true;
-        ++EmittedInsts;
+        ++NumInstsInFunction;
       }
 
       if (ShouldPrintDebugScopes) {
@@ -953,6 +1014,14 @@ void AsmPrinter::EmitFunctionBody() {
     EmitBasicBlockEnd(MBB);
   }
 
+  EmittedInsts += NumInstsInFunction;
+  MachineOptimizationRemarkAnalysis R(DEBUG_TYPE, "InstructionCount",
+                                      MF->getFunction()->getSubprogram(),
+                                      &MF->front());
+  R << ore::NV("NumInstructions", NumInstsInFunction)
+    << " instructions in function";
+  ORE->emit(R);
+
   // If the function is empty and the object file uses .subsections_via_symbols,
   // then we need to emit *something* to the function body to prevent the
   // labels from collapsing together.  Just emit a noop.
@@ -1238,7 +1307,7 @@ bool AsmPrinter::doFinalization(Module &M) {
         break;
       AliasStack.push_back(Cur);
     }
-    for (const GlobalAlias *AncestorAlias : reverse(AliasStack))
+    for (const GlobalAlias *AncestorAlias : llvm::reverse(AliasStack))
       emitGlobalIndirectSymbol(M, *AncestorAlias);
     AliasStack.clear();
   }
@@ -1311,19 +1380,23 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
       CurrentFnSymForSize = CurrentFnBegin;
   }
 
+  ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
   if (isVerbose())
     LI = &getAnalysis<MachineLoopInfo>();
 }
 
 namespace {
+
 // Keep track the alignment, constpool entries per Section.
   struct SectionCPs {
     MCSection *S;
     unsigned Alignment;
     SmallVector<unsigned, 4> CPEs;
+
     SectionCPs(MCSection *s, unsigned a) : S(s), Alignment(a) {}
   };
-}
+
+} // end anonymous namespace
 
 /// EmitConstantPool - Print to the current output stream assembly
 /// representations of the constants in the constant pool MCP. This is
@@ -1547,7 +1620,6 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
   OutStreamer->EmitValue(Value, EntrySize);
 }
 
-
 /// EmitSpecialLLVMGlobal - Check to see if the specified global is a
 /// special global used by LLVM.  If so, emit it and return true, otherwise
 /// do nothing and return false.
@@ -1598,13 +1670,16 @@ void AsmPrinter::EmitLLVMUsedList(const ConstantArray *InitList) {
 }
 
 namespace {
+
 struct Structor {
-  Structor() : Priority(0), Func(nullptr), ComdatKey(nullptr) {}
-  int Priority;
-  llvm::Constant *Func;
-  llvm::GlobalValue *ComdatKey;
+  int Priority = 0;
+  Constant *Func = nullptr;
+  GlobalValue *ComdatKey = nullptr;
+
+  Structor() = default;
 };
-} // end namespace
+
+} // end anonymous namespace
 
 /// EmitXXStructorList - Emit the ctor or dtor list taking into account the init
 /// priority.
@@ -1934,7 +2009,6 @@ static int isRepeatedByteSequence(const ConstantDataSequential *V) {
   return static_cast<uint8_t>(C); // Ensure 255 is not returned as -1.
 }
 
-
 /// isRepeatedByteSequence - Determine whether the given value is
 /// composed of a repeated sequence of identical bytes and return the
 /// byte value.  If it is not a repeated sequence, return -1.
@@ -1975,7 +2049,6 @@ static int isRepeatedByteSequence(const Value *V, const DataLayout &DL) {
 static void emitGlobalConstantDataSequential(const DataLayout &DL,
                                              const ConstantDataSequential *CDS,
                                              AsmPrinter &AP) {
-
   // See if we can aggregate this into a .fill, if so, emit it as such.
   int Value = isRepeatedByteSequence(CDS, DL);
   if (Value != -1) {
@@ -2009,7 +2082,6 @@ static void emitGlobalConstantDataSequential(const DataLayout &DL,
                         CDS->getNumElements();
   if (unsigned Padding = Size - EmittedSize)
     AP.OutStreamer->EmitZeros(Padding);
-
 }
 
 static void emitGlobalConstantArray(const DataLayout &DL,
@@ -2423,8 +2495,6 @@ MCSymbol *AsmPrinter::GetExternalSymbolSymbol(StringRef Sym) const {
   return OutContext.getOrCreateSymbol(NameStr);
 }
 
-
-
 /// PrintParentLoopComment - Print comments about parent loops of this one.
 static void PrintParentLoopComment(raw_ostream &OS, const MachineLoop *Loop,
                                    unsigned FunctionNumber) {
@@ -2489,7 +2559,6 @@ static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB,
   PrintChildLoopComment(OS, Loop, AP.getFunctionNumber());
 }
 
-
 /// EmitBasicBlockStart - This method prints the label for the specified
 /// MachineBasicBlock, an alignment (if present) and a comment describing
 /// it if appropriate.
@@ -2610,8 +2679,6 @@ isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
   return true;
 }
 
-
-
 GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) {
   if (!S.usesMetadata())
     return nullptr;
@@ -2642,7 +2709,7 @@ GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) {
 }
 
 /// Pin vtable to this file.
-AsmPrinterHandler::~AsmPrinterHandler() {}
+AsmPrinterHandler::~AsmPrinterHandler() = default;
 
 void AsmPrinterHandler::markFunctionEnd() {}
 
@@ -2705,8 +2772,11 @@ void AsmPrinter::recordSled(MCSymbol *Sled, const MachineInstr &MI,
   SledKind Kind) {
   auto Fn = MI.getParent()->getParent()->getFunction();
   auto Attr = Fn->getFnAttribute("function-instrument");
+  bool LogArgs = Fn->hasFnAttribute("xray-log-args");
   bool AlwaysInstrument =
     Attr.isStringAttribute() && Attr.getValueAsString() == "xray-always";
+  if (Kind == SledKind::FUNCTION_ENTER && LogArgs)
+    Kind = SledKind::LOG_ARGS_ENTER;
   Sleds.emplace_back(
     XRayFunctionEntry{ Sled, CurrentFnSym, Kind, AlwaysInstrument, Fn });
 }
diff --git a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index 57864e4e4d4f206c7747d1de677c4c567cef8f63..683e622e3d5379fb884324194f6133c24e221901 100644
--- a/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -40,25 +40,24 @@ using namespace llvm;
 
 #define DEBUG_TYPE "asm-printer"
 
-namespace {
-  struct SrcMgrDiagInfo {
-    const MDNode *LocInfo;
-    LLVMContext::InlineAsmDiagHandlerTy DiagHandler;
-    void *DiagContext;
-  };
-}
-
 /// srcMgrDiagHandler - This callback is invoked when the SourceMgr for an
 /// inline asm has an error in it.  diagInfo is a pointer to the SrcMgrDiagInfo
 /// struct above.
 static void srcMgrDiagHandler(const SMDiagnostic &Diag, void *diagInfo) {
-  SrcMgrDiagInfo *DiagInfo = static_cast<SrcMgrDiagInfo *>(diagInfo);
+  AsmPrinter::SrcMgrDiagInfo *DiagInfo =
+      static_cast<AsmPrinter::SrcMgrDiagInfo *>(diagInfo);
   assert(DiagInfo && "Diagnostic context not passed down?");
 
+  // Look up a LocInfo for the buffer this diagnostic is coming from.
+  unsigned BufNum = DiagInfo->SrcMgr.FindBufferContainingLoc(Diag.getLoc());
+  const MDNode *LocInfo = nullptr;
+  if (BufNum > 0 && BufNum <= DiagInfo->LocInfos.size())
+    LocInfo = DiagInfo->LocInfos[BufNum-1];
+
   // If the inline asm had metadata associated with it, pull out a location
   // cookie corresponding to which line the error occurred on.
   unsigned LocCookie = 0;
-  if (const MDNode *LocInfo = DiagInfo->LocInfo) {
+  if (LocInfo) {
     unsigned ErrorLine = Diag.getLineNo()-1;
     if (ErrorLine >= LocInfo->getNumOperands())
       ErrorLine = 0;
@@ -99,35 +98,39 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
     return;
   }
 
-  SourceMgr SrcMgr;
-  SrcMgr.setIncludeDirs(MCOptions.IASSearchPaths);
+  if (!DiagInfo) {
+    DiagInfo = make_unique<SrcMgrDiagInfo>();
 
-  SrcMgrDiagInfo DiagInfo;
-
-  // If the current LLVMContext has an inline asm handler, set it in SourceMgr.
-  LLVMContext &LLVMCtx = MMI->getModule()->getContext();
-  bool HasDiagHandler = false;
-  if (LLVMCtx.getInlineAsmDiagnosticHandler() != nullptr) {
-    // If the source manager has an issue, we arrange for srcMgrDiagHandler
-    // to be invoked, getting DiagInfo passed into it.
-    DiagInfo.LocInfo = LocMDNode;
-    DiagInfo.DiagHandler = LLVMCtx.getInlineAsmDiagnosticHandler();
-    DiagInfo.DiagContext = LLVMCtx.getInlineAsmDiagnosticContext();
-    SrcMgr.setDiagHandler(srcMgrDiagHandler, &DiagInfo);
-    HasDiagHandler = true;
+    MCContext &Context = MMI->getContext();
+    Context.setInlineSourceManager(&DiagInfo->SrcMgr);
+
+    LLVMContext &LLVMCtx = MMI->getModule()->getContext();
+    if (LLVMCtx.getInlineAsmDiagnosticHandler()) {
+      DiagInfo->DiagHandler = LLVMCtx.getInlineAsmDiagnosticHandler();
+      DiagInfo->DiagContext = LLVMCtx.getInlineAsmDiagnosticContext();
+      DiagInfo->SrcMgr.setDiagHandler(srcMgrDiagHandler, DiagInfo.get());
+    }
   }
 
+  SourceMgr &SrcMgr = DiagInfo->SrcMgr;
+  SrcMgr.setIncludeDirs(MCOptions.IASSearchPaths);
+
   std::unique_ptr<MemoryBuffer> Buffer;
-  if (isNullTerminated)
-    Buffer = MemoryBuffer::getMemBuffer(Str, "<inline asm>");
-  else
-    Buffer = MemoryBuffer::getMemBufferCopy(Str, "<inline asm>");
+  // The inline asm source manager will outlive Str, so make a copy of the
+  // string for SourceMgr to own.
+  Buffer = MemoryBuffer::getMemBufferCopy(Str, "<inline asm>");
 
   // Tell SrcMgr about this buffer, it takes ownership of the buffer.
-  SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc());
+  unsigned BufNum = SrcMgr.AddNewSourceBuffer(std::move(Buffer), SMLoc());
+
+  // Store LocMDNode in DiagInfo, using BufNum as an identifier.
+  if (LocMDNode) {
+    DiagInfo->LocInfos.resize(BufNum);
+    DiagInfo->LocInfos[BufNum-1] = LocMDNode;
+  }
 
   std::unique_ptr<MCAsmParser> Parser(
-      createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI));
+      createMCAsmParser(SrcMgr, OutContext, *OutStreamer, *MAI, BufNum));
 
   // We create a new MCInstrInfo here since we might be at the module level
   // and not have a MachineFunction to initialize the TargetInstrInfo from and
@@ -151,7 +154,8 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
   int Res = Parser->Run(/*NoInitialTextSection*/ true,
                         /*NoFinalize*/ true);
   emitInlineAsmEnd(STI, &TAP->getSTI());
-  if (Res && !HasDiagHandler)
+
+  if (Res && !DiagInfo->DiagHandler)
     report_fatal_error("Error parsing inline asm\n");
 }
 
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 83440513225c127df366b8fde289701337d53812..383b8cddb1a06384fafe131ebf077f9b7302f7cd 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -23,13 +23,13 @@
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/COFF.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Target/TargetFrameLowering.h"
@@ -38,7 +38,6 @@
 
 using namespace llvm;
 using namespace llvm::codeview;
-using namespace llvm::msf;
 
 CodeViewDebug::CodeViewDebug(AsmPrinter *AP)
     : DebugHandlerBase(AP), OS(*Asm->OutStreamer), Allocator(),
@@ -495,9 +494,9 @@ void CodeViewDebug::emitTypeInformation() {
       // comments. The MSVC linker doesn't do much type record validation,
       // so the first link of an invalid type record can succeed while
       // subsequent links will fail with LNK1285.
-      ByteStream Stream(Record);
+      BinaryByteStream Stream(Record, llvm::support::little);
       CVTypeArray Types;
-      StreamReader Reader(Stream);
+      BinaryStreamReader Reader(Stream);
       Error E = Reader.readArray(Types, Reader.getLength());
       if (!E) {
         TypeVisitorCallbacks C;
@@ -948,10 +947,10 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) {
 
       // Handle fragments.
       auto Fragment = DIExpr->getFragmentInfo();
-      if (DIExpr && Fragment) {
+      if (Fragment) {
         IsSubfield = true;
         StructOffset = Fragment->OffsetInBits / 8;
-      } else if (DIExpr && DIExpr->getNumElements() > 0) {
+      } else if (DIExpr->getNumElements() > 0) {
         continue; // Ignore unrecognized exprs.
       }
 
@@ -1014,14 +1013,7 @@ void CodeViewDebug::collectVariableInfo(const DISubprogram *SP) {
   }
 }
 
-void CodeViewDebug::beginFunction(const MachineFunction *MF) {
-  assert(!CurFn && "Can't process two functions at once!");
-
-  if (!Asm || !MMI->hasDebugInfo() || !MF->getFunction()->getSubprogram())
-    return;
-
-  DebugHandlerBase::beginFunction(MF);
-
+void CodeViewDebug::beginFunctionImpl(const MachineFunction *MF) {
   const Function *GV = MF->getFunction();
   assert(FnDebugInfo.count(GV) == false);
   CurFn = &FnDebugInfo[GV];
@@ -1150,27 +1142,6 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
 
   uint64_t ElementSize = getBaseTypeSize(ElementTypeRef) / 8;
 
-
-  // We want to assert that the element type multiplied by the array lengths
-  // match the size of the overall array. However, if we don't have complete
-  // type information for the base type, we can't make this assertion. This
-  // happens if limited debug info is enabled in this case:
-  //   struct VTableOptzn { VTableOptzn(); virtual ~VTableOptzn(); };
-  //   VTableOptzn array[3];
-  // The DICompositeType of VTableOptzn will have size zero, and the array will
-  // have size 3 * sizeof(void*), and we should avoid asserting.
-  //
-  // There is a related bug in the front-end where an array of a structure,
-  // which was declared as incomplete structure first, ends up not getting a
-  // size assigned to it. (PR28303)
-  // Example:
-  //   struct A(*p)[3];
-  //   struct A { int f; } a[3];
-  bool PartiallyIncomplete = false;
-  if (Ty->getSizeInBits() == 0 || ElementSize == 0) {
-    PartiallyIncomplete = true;
-  }
-
   // Add subranges to array type.
   DINodeArray Elements = Ty->getElements();
   for (int i = Elements.size() - 1; i >= 0; --i) {
@@ -1185,16 +1156,14 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
     // Variable Length Array (VLA) has Count equal to '-1'.
     // Replace with Count '1', assume it is the minimum VLA length.
     // FIXME: Make front-end support VLA subrange and emit LF_DIMVARLU.
-    if (Count == -1) {
+    if (Count == -1)
       Count = 1;
-      PartiallyIncomplete = true;
-    }
 
     // Update the element size and element type index for subsequent subranges.
     ElementSize *= Count;
 
     // If this is the outermost array, use the size from the array. It will be
-    // more accurate if PartiallyIncomplete is true.
+    // more accurate if we had a VLA or an incomplete element type size.
     uint64_t ArraySize =
         (i == 0 && ElementSize == 0) ? Ty->getSizeInBits() / 8 : ElementSize;
 
@@ -1203,9 +1172,6 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
     ElementTypeIndex = TypeTable.writeKnownType(AR);
   }
 
-  (void)PartiallyIncomplete;
-  assert(PartiallyIncomplete || ElementSize == (Ty->getSizeInBits() / 8));
-
   return ElementTypeIndex;
 }
 
@@ -2115,18 +2081,13 @@ void CodeViewDebug::emitLocalVariable(const LocalVariable &Var) {
   }
 }
 
-void CodeViewDebug::endFunction(const MachineFunction *MF) {
-  if (!Asm || !CurFn)  // We haven't created any debug info for this function.
-    return;
-
+void CodeViewDebug::endFunctionImpl(const MachineFunction *MF) {
   const Function *GV = MF->getFunction();
   assert(FnDebugInfo.count(GV));
   assert(CurFn == &FnDebugInfo[GV]);
 
   collectVariableInfo(GV->getSubprogram());
 
-  DebugHandlerBase::endFunction(MF);
-
   // Don't emit anything if we don't have any line tables.
   if (!CurFn->HaveLineInfo) {
     FnDebugInfo.erase(GV);
diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/lib/CodeGen/AsmPrinter/CodeViewDebug.h
index 3dd4315e4c2f1d110af58679f83e76b0f594f413..343384c5177285d790a66a7ade988641c49d8b3d 100644
--- a/lib/CodeGen/AsmPrinter/CodeViewDebug.h
+++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.h
@@ -299,6 +299,13 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
 
   unsigned getPointerSizeInBytes();
 
+protected:
+  /// \brief Gather pre-function debug information.
+  void beginFunctionImpl(const MachineFunction *MF) override;
+
+  /// \brief Gather post-function debug information.
+  void endFunctionImpl(const MachineFunction *) override;
+
 public:
   CodeViewDebug(AsmPrinter *Asm);
 
@@ -307,12 +314,6 @@ public:
   /// \brief Emit the COFF section that holds the line table information.
   void endModule() override;
 
-  /// \brief Gather pre-function debug information.
-  void beginFunction(const MachineFunction *MF) override;
-
-  /// \brief Gather post-function debug information.
-  void endFunction(const MachineFunction *) override;
-
   /// \brief Process beginning of an instruction.
   void beginInstruction(const MachineInstr *MI) override;
 };
diff --git a/lib/CodeGen/AsmPrinter/DIE.cpp b/lib/CodeGen/AsmPrinter/DIE.cpp
index ae1a08374ad3494aa3a72dae5aa3d0c14b759df1..b510e0ef36ac63e383e7491fbf4657410fb4ecb6 100644
--- a/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -42,6 +42,8 @@ void DIEAbbrevData::Profile(FoldingSetNodeID &ID) const {
   // overloads.  Otherwise MSVC 2010 thinks this call is ambiguous.
   ID.AddInteger(unsigned(Attribute));
   ID.AddInteger(unsigned(Form));
+  if (Form == dwarf::DW_FORM_implicit_const)
+    ID.AddInteger(Value);
 }
 
 //===----------------------------------------------------------------------===//
@@ -107,8 +109,12 @@ void DIEAbbrev::print(raw_ostream &O) {
     O << "  "
       << dwarf::AttributeString(Data[i].getAttribute())
       << "  "
-      << dwarf::FormEncodingString(Data[i].getForm())
-      << '\n';
+      << dwarf::FormEncodingString(Data[i].getForm());
+
+    if (Data[i].getForm() == dwarf::DW_FORM_implicit_const)
+      O << " " << Data[i].getValue();
+
+    O << '\n';
   }
 }
 
@@ -359,57 +365,42 @@ LLVM_DUMP_METHOD void DIEValue::dump() const {
 void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
   switch (Form) {
   case dwarf::DW_FORM_implicit_const:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_flag_present:
     // Emit something to keep the lines and comments in sync.
     // FIXME: Is there a better way to do this?
     Asm->OutStreamer->AddBlankLine();
     return;
   case dwarf::DW_FORM_flag:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_ref1:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_data1:
-    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_strx1:
+  case dwarf::DW_FORM_addrx1:
   case dwarf::DW_FORM_ref2:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_data2:
-    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_strx2:
+  case dwarf::DW_FORM_addrx2:
   case dwarf::DW_FORM_strp:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_ref4:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_data4:
-    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_ref_sup4:
+  case dwarf::DW_FORM_strx4:
+  case dwarf::DW_FORM_addrx4:
   case dwarf::DW_FORM_ref8:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_ref_sig8:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_data8:
-    LLVM_FALLTHROUGH;
+  case dwarf::DW_FORM_ref_sup8:
   case dwarf::DW_FORM_GNU_ref_alt:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_GNU_strp_alt:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_line_strp:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_sec_offset:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_strp_sup:
-    LLVM_FALLTHROUGH;
-  case dwarf::DW_FORM_ref_sup:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_addr:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_ref_addr:
     Asm->OutStreamer->EmitIntValue(Integer, SizeOf(Asm, Form));
     return;
   case dwarf::DW_FORM_GNU_str_index:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_GNU_addr_index:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_ref_udata:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_udata:
     Asm->EmitULEB128(Integer);
     return;
@@ -424,35 +415,41 @@ void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
 ///
 unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
   switch (Form) {
-  case dwarf::DW_FORM_implicit_const: LLVM_FALLTHROUGH;
-  case dwarf::DW_FORM_flag_present: return 0;
-  case dwarf::DW_FORM_flag:  LLVM_FALLTHROUGH;
-  case dwarf::DW_FORM_ref1:  LLVM_FALLTHROUGH;
-  case dwarf::DW_FORM_data1: return sizeof(int8_t);
-  case dwarf::DW_FORM_ref2:  LLVM_FALLTHROUGH;
-  case dwarf::DW_FORM_data2: return sizeof(int16_t);
-  case dwarf::DW_FORM_ref4:  LLVM_FALLTHROUGH;
-  case dwarf::DW_FORM_data4: return sizeof(int32_t);
-  case dwarf::DW_FORM_ref8:  LLVM_FALLTHROUGH;
-  case dwarf::DW_FORM_ref_sig8:  LLVM_FALLTHROUGH;
-  case dwarf::DW_FORM_data8: return sizeof(int64_t);
+  case dwarf::DW_FORM_implicit_const:
+  case dwarf::DW_FORM_flag_present:
+    return 0;
+  case dwarf::DW_FORM_flag:
+  case dwarf::DW_FORM_ref1:
+  case dwarf::DW_FORM_data1:
+  case dwarf::DW_FORM_strx1:
+  case dwarf::DW_FORM_addrx1:
+    return sizeof(int8_t);
+  case dwarf::DW_FORM_ref2:
+  case dwarf::DW_FORM_data2:
+  case dwarf::DW_FORM_strx2:
+  case dwarf::DW_FORM_addrx2:
+    return sizeof(int16_t);
+  case dwarf::DW_FORM_ref4:
+  case dwarf::DW_FORM_data4:
+  case dwarf::DW_FORM_ref_sup4:
+  case dwarf::DW_FORM_strx4:
+  case dwarf::DW_FORM_addrx4:
+    return sizeof(int32_t);
+  case dwarf::DW_FORM_ref8:
+  case dwarf::DW_FORM_ref_sig8:
+  case dwarf::DW_FORM_data8:
+  case dwarf::DW_FORM_ref_sup8:
+    return sizeof(int64_t);
   case dwarf::DW_FORM_ref_addr:
     if (AP->getDwarfVersion() == 2)
       return AP->getPointerSize();
     LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_strp:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_GNU_ref_alt:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_GNU_strp_alt:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_line_strp:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_sec_offset:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_strp_sup:
-    LLVM_FALLTHROUGH;
-  case dwarf::DW_FORM_ref_sup:
     switch (AP->OutStreamer->getContext().getDwarfFormat()) {
     case dwarf::DWARF32:
       return 4;
@@ -461,11 +458,8 @@ unsigned DIEInteger::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
     }
     llvm_unreachable("Invalid DWARF format");
   case dwarf::DW_FORM_GNU_str_index:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_GNU_addr_index:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_ref_udata:
-    LLVM_FALLTHROUGH;
   case dwarf::DW_FORM_udata:
     return getULEB128Size(Integer);
   case dwarf::DW_FORM_sdata:
@@ -489,7 +483,7 @@ void DIEInteger::print(raw_ostream &O) const {
 /// EmitValue - Emit expression value.
 ///
 void DIEExpr::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
-  AP->EmitDebugValue(Expr, SizeOf(AP, Form));
+  AP->EmitDebugThreadLocal(Expr, SizeOf(AP, Form));
 }
 
 /// SizeOf - Determine size of expression value in bytes.
diff --git a/lib/CodeGen/AsmPrinter/DIEHash.cpp b/lib/CodeGen/AsmPrinter/DIEHash.cpp
index d8ecc7ccfb9bf6709ef80908d7bd24b936e895b1..8e3b88d0af0e5b9a07c61e2e970ea45cc107099a 100644
--- a/lib/CodeGen/AsmPrinter/DIEHash.cpp
+++ b/lib/CodeGen/AsmPrinter/DIEHash.cpp
@@ -490,9 +490,9 @@ uint64_t DIEHash::computeCUSignature(const DIE &Die) {
   Hash.final(Result);
 
   // ... take the least significant 8 bytes and return those. Our MD5
-  // implementation always returns its results in little endian, swap bytes
-  // appropriately.
-  return support::endian::read64le(Result + 8);
+  // implementation always returns its results in little endian, so we actually
+  // need the "high" word.
+  return Result.high();
 }
 
 /// This is based on the type signature computation given in section 7.27 of the
@@ -514,7 +514,7 @@ uint64_t DIEHash::computeTypeSignature(const DIE &Die) {
   Hash.final(Result);
 
   // ... take the least significant 8 bytes and return those. Our MD5
-  // implementation always returns its results in little endian, swap bytes
-  // appropriately.
-  return support::endian::read64le(Result + 8);
+  // implementation always returns its results in little endian, so we actually
+  // need the "high" word.
+  return Result.high();
 }
diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 94190981e88ec3f25fe4279c546b8a2e7dd9a140..1d63e33a4d33af585e98617aa1df3676916a1e40 100644
--- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -115,12 +115,35 @@ uint64_t DebugHandlerBase::getBaseTypeSize(const DITypeRef TyRef) {
   return getBaseTypeSize(BaseType);
 }
 
+/// Return true when \p MMI reports debug info and \p MF has a subprogram
+/// whose compile unit's emission kind is something other than NoDebug.
+bool hasDebugInfo(const MachineModuleInfo *MMI, const MachineFunction *MF) {
+  if (!MMI->hasDebugInfo())
+    return false;
+  const auto *Subprogram = MF->getFunction()->getSubprogram();
+  if (!Subprogram)
+    return false;
+  const auto *Unit = Subprogram->getUnit();
+  assert(Unit);
+  return Unit->getEmissionKind() != DICompileUnit::NoDebug;
+}
+
 void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
+  assert(Asm);
+  PrevInstBB = nullptr;
+
+  if (!hasDebugInfo(MMI, MF)) {
+    skippedNonDebugFunction();
+    return;
+  }
+
   // Grab the lexical scopes for the function, if we don't have any of those
   // then we're not going to be able to do anything.
   LScopes.initialize(*MF);
-  if (LScopes.empty())
+  if (LScopes.empty()) {
+    beginFunctionImpl(MF);
     return;
+  }
 
   // Make sure that each lexical scope will have a begin/end label.
   identifyScopeMarkers();
@@ -167,6 +190,7 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
 
   PrevInstLoc = DebugLoc();
   PrevLabel = Asm->getFunctionBegin();
+  beginFunctionImpl(MF);
 }
 
 void DebugHandlerBase::beginInstruction(const MachineInstr *MI) {
@@ -228,6 +252,8 @@ void DebugHandlerBase::endInstruction() {
 }
 
 void DebugHandlerBase::endFunction(const MachineFunction *MF) {
+  if (hasDebugInfo(MMI, MF))
+    endFunctionImpl(MF);
   DbgValues.clear();
   LabelsBeforeInsn.clear();
   LabelsAfterInsn.clear();
diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.h b/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
index c00fa189d94af589a0a4735d79191e3df08c1ccb..659a921e1fc56bbc90ac9c2527bcd0a82222df66 100644
--- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
+++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.h
@@ -80,6 +80,10 @@ protected:
     LabelsAfterInsn.insert(std::make_pair(MI, nullptr));
   }
 
+  virtual void beginFunctionImpl(const MachineFunction *MF) = 0;
+  virtual void endFunctionImpl(const MachineFunction *MF) = 0;
+  virtual void skippedNonDebugFunction() {}
+
   // AsmPrinterHandler overrides.
 public:
   void beginInstruction(const MachineInstr *MI) override;
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 0db623bbc29a25c7079baeff13475cf65882f86d..a550ff2fb90f32a3a50a1db841746168d4341b71 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -1,3 +1,16 @@
+//===-- llvm/CodeGen/DwarfCompileUnit.cpp - Dwarf Compile Units -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains support for constructing a dwarf compile unit.
+//
+//===----------------------------------------------------------------------===//
+
 #include "DwarfCompileUnit.h"
 #include "DwarfExpression.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -129,67 +142,72 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE(
   bool addToAccelTable = false;
   DIELoc *Loc = nullptr;
   std::unique_ptr<DIEDwarfExpression> DwarfExpr;
-  bool AllConstant = std::all_of(
-      GlobalExprs.begin(), GlobalExprs.end(),
-      [&](const GlobalExpr GE) {
-        return GE.Expr && GE.Expr->isConstant();
-      });
-
   for (const auto &GE : GlobalExprs) {
     const GlobalVariable *Global = GE.Var;
     const DIExpression *Expr = GE.Expr;
+
     // For compatibility with DWARF 3 and earlier,
     // DW_AT_location(DW_OP_constu, X, DW_OP_stack_value) becomes
     // DW_AT_const_value(X).
     if (GlobalExprs.size() == 1 && Expr && Expr->isConstant()) {
+      addToAccelTable = true;
       addConstantValue(*VariableDIE, /*Unsigned=*/true, Expr->getElement(1));
-      // We cannot describe the location of dllimport'd variables: the
-      // computation of their address requires loads from the IAT.
-    } else if ((Global && !Global->hasDLLImportStorageClass()) || AllConstant) {
-      if (!Loc) {
-        Loc = new (DIEValueAllocator) DIELoc;
-        DwarfExpr = llvm::make_unique<DIEDwarfExpression>(*Asm, *this, *Loc);
-      }
+      break;
+    }
+
+    // We cannot describe the location of dllimport'd variables: the
+    // computation of their address requires loads from the IAT.
+    if (Global && Global->hasDLLImportStorageClass())
+      continue;
+
+    // Nothing to describe without address or constant.
+    if (!Global && (!Expr || !Expr->isConstant()))
+      continue;
+
+    if (!Loc) {
       addToAccelTable = true;
-      if (Global) {
-        const MCSymbol *Sym = Asm->getSymbol(Global);
-        if (Global->isThreadLocal()) {
-          if (Asm->TM.Options.EmulatedTLS) {
-            // TODO: add debug info for emulated thread local mode.
-          } else {
-            // FIXME: Make this work with -gsplit-dwarf.
-            unsigned PointerSize = Asm->getDataLayout().getPointerSize();
-            assert((PointerSize == 4 || PointerSize == 8) &&
-                   "Add support for other sizes if necessary");
-            // Based on GCC's support for TLS:
-            if (!DD->useSplitDwarf()) {
-              // 1) Start with a constNu of the appropriate pointer size
-              addUInt(*Loc, dwarf::DW_FORM_data1,
-                      PointerSize == 4 ? dwarf::DW_OP_const4u
-                                       : dwarf::DW_OP_const8u);
-              // 2) containing the (relocated) offset of the TLS variable
-              //    within the module's TLS block.
-              addExpr(*Loc, dwarf::DW_FORM_udata,
-                      Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym));
-            } else {
-              addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
-              addUInt(*Loc, dwarf::DW_FORM_udata,
-                      DD->getAddressPool().getIndex(Sym, /* TLS */ true));
-            }
-            // 3) followed by an OP to make the debugger do a TLS lookup.
+      Loc = new (DIEValueAllocator) DIELoc;
+      DwarfExpr = llvm::make_unique<DIEDwarfExpression>(*Asm, *this, *Loc);
+    }
+
+    if (Global) {
+      const MCSymbol *Sym = Asm->getSymbol(Global);
+      if (Global->isThreadLocal()) {
+        if (Asm->TM.Options.EmulatedTLS) {
+          // TODO: add debug info for emulated thread local mode.
+        } else {
+          // FIXME: Make this work with -gsplit-dwarf.
+          unsigned PointerSize = Asm->getDataLayout().getPointerSize();
+          assert((PointerSize == 4 || PointerSize == 8) &&
+                 "Add support for other sizes if necessary");
+          // Based on GCC's support for TLS:
+          if (!DD->useSplitDwarf()) {
+            // 1) Start with a constNu of the appropriate pointer size
             addUInt(*Loc, dwarf::DW_FORM_data1,
-                    DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address
-                                          : dwarf::DW_OP_form_tls_address);
+                    PointerSize == 4 ? dwarf::DW_OP_const4u
+                                     : dwarf::DW_OP_const8u);
+            // 2) containing the (relocated) offset of the TLS variable
+            //    within the module's TLS block.
+            addExpr(*Loc, dwarf::DW_FORM_udata,
+                    Asm->getObjFileLowering().getDebugThreadLocalSymbol(Sym));
+          } else {
+            addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_GNU_const_index);
+            addUInt(*Loc, dwarf::DW_FORM_udata,
+                    DD->getAddressPool().getIndex(Sym, /* TLS */ true));
           }
-        } else {
-          DD->addArangeLabel(SymbolCU(this, Sym));
-          addOpAddress(*Loc, Sym);
+          // 3) followed by an OP to make the debugger do a TLS lookup.
+          addUInt(*Loc, dwarf::DW_FORM_data1,
+                  DD->useGNUTLSOpcode() ? dwarf::DW_OP_GNU_push_tls_address
+                                        : dwarf::DW_OP_form_tls_address);
         }
+      } else {
+        DD->addArangeLabel(SymbolCU(this, Sym));
+        addOpAddress(*Loc, Sym);
       }
-      if (Expr) {
-        DwarfExpr->addFragmentOffset(Expr);
-        DwarfExpr->AddExpression(Expr);
-      }
+    }
+    if (Expr) {
+      DwarfExpr->addFragmentOffset(Expr);
+      DwarfExpr->addExpression(Expr);
     }
   }
   if (Loc)
@@ -507,8 +525,8 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV,
         DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
         // If there is an expression, emit raw unsigned bytes.
         DwarfExpr.addFragmentOffset(Expr);
-        DwarfExpr.AddUnsignedConstant(DVInsn->getOperand(0).getImm());
-        DwarfExpr.AddExpression(Expr);
+        DwarfExpr.addUnsignedConstant(DVInsn->getOperand(0).getImm());
+        DwarfExpr.addExpression(Expr);
         addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize());
       } else
         addConstantValue(*VariableDie, DVInsn->getOperand(0), DV.getType());
@@ -522,22 +540,25 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV,
   }
 
   // .. else use frame index.
-  if (DV.getFrameIndex().empty())
+  if (!DV.hasFrameIndexExprs())
     return VariableDie;
 
-  auto Expr = DV.getExpression().begin();
   DIELoc *Loc = new (DIEValueAllocator) DIELoc;
   DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
-  for (auto FI : DV.getFrameIndex()) {
+  for (auto &Fragment : DV.getFrameIndexExprs()) {
     unsigned FrameReg = 0;
     const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering();
-    int Offset = TFI->getFrameIndexReference(*Asm->MF, FI, FrameReg);
-    assert(Expr != DV.getExpression().end() && "Wrong number of expressions");
-    DwarfExpr.addFragmentOffset(*Expr);
-    DwarfExpr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(),
-                                    FrameReg, Offset);
-    DwarfExpr.AddExpression(*Expr);
-    ++Expr;
+    int Offset = TFI->getFrameIndexReference(*Asm->MF, Fragment.FI, FrameReg);
+    DwarfExpr.addFragmentOffset(Fragment.Expr);
+    SmallVector<uint64_t, 8> Ops;
+    Ops.push_back(dwarf::DW_OP_plus);
+    Ops.push_back(Offset);
+    Ops.push_back(dwarf::DW_OP_deref);
+    Ops.append(Fragment.Expr->elements_begin(), Fragment.Expr->elements_end());
+    DIExpressionCursor Expr(Ops);
+    DwarfExpr.addMachineRegExpression(
+        *Asm->MF->getSubtarget().getRegisterInfo(), Expr, FrameReg);
+    DwarfExpr.addExpression(std::move(Expr));
   }
   addBlock(*VariableDie, dwarf::DW_AT_location, DwarfExpr.finalize());
 
@@ -693,11 +714,14 @@ void DwarfCompileUnit::emitHeader(bool UseOffsets) {
     Asm->OutStreamer->EmitLabel(LabelBegin);
   }
 
-  DwarfUnit::emitHeader(UseOffsets);
+  dwarf::UnitType UT = Skeleton ? dwarf::DW_UT_split_compile
+                                : DD->useSplitDwarf() ? dwarf::DW_UT_skeleton
+                                                      : dwarf::DW_UT_compile;
+  DwarfUnit::emitCommonHeader(UseOffsets, UT);
 }
 
 /// addGlobalName - Add a new global name to the compile unit.
-void DwarfCompileUnit::addGlobalName(StringRef Name, DIE &Die,
+void DwarfCompileUnit::addGlobalName(StringRef Name, const DIE &Die,
                                      const DIScope *Context) {
   if (includeMinimalInlineScopes())
     return;
@@ -705,6 +729,18 @@ void DwarfCompileUnit::addGlobalName(StringRef Name, DIE &Die,
   GlobalNames[FullName] = &Die;
 }
 
+void DwarfCompileUnit::addGlobalNameForTypeUnit(StringRef Name,
+                                                const DIScope *Context) {
+  if (includeMinimalInlineScopes())
+    return;
+  std::string ScopedName = getParentContextString(Context) + Name.str();
+  // insert() does not overwrite: a name already registered with a CU-level
+  // DIE keeps that entry. A name that exists only in a type unit has no
+  // offset within this CU to point at, so it is recorded against the unit
+  // DIE instead.
+  GlobalNames.insert(std::make_pair(std::move(ScopedName), &getUnitDie()));
+}
+
 /// Add a new global type to the unit.
 void DwarfCompileUnit::addGlobalType(const DIType *Ty, const DIE &Die,
                                      const DIScope *Context) {
@@ -714,6 +750,18 @@ void DwarfCompileUnit::addGlobalType(const DIType *Ty, const DIE &Die,
   GlobalTypes[FullName] = &Die;
 }
 
+void DwarfCompileUnit::addGlobalTypeUnitType(const DIType *Ty,
+                                             const DIScope *Context) {
+  if (includeMinimalInlineScopes())
+    return;
+  std::string TypeName = getParentContextString(Context) + Ty->getName().str();
+  // insert() does not overwrite: a type already registered with a CU-level
+  // DIE keeps that entry. A type that exists only in a type unit has no
+  // offset within this CU to point at, so it is recorded against the unit
+  // DIE instead.
+  GlobalTypes.insert(std::make_pair(std::move(TypeName), &getUnitDie()));
+}
+
 /// addVariableAddress - Add DW_AT_location attribute for a
 /// DbgVariable based on provided MachineLocation.
 void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die,
@@ -730,22 +778,22 @@ void DwarfCompileUnit::addVariableAddress(const DbgVariable &DV, DIE &Die,
 void DwarfCompileUnit::addAddress(DIE &Die, dwarf::Attribute Attribute,
                                   const MachineLocation &Location) {
   DIELoc *Loc = new (DIEValueAllocator) DIELoc;
-  DIEDwarfExpression Expr(*Asm, *this, *Loc);
-
-  bool validReg;
-  if (Location.isReg())
-    validReg = Expr.AddMachineReg(*Asm->MF->getSubtarget().getRegisterInfo(),
-                                  Location.getReg());
-  else
-    validReg =
-        Expr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(),
-                                   Location.getReg(), Location.getOffset());
+  DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
 
-  if (!validReg)
+  SmallVector<uint64_t, 8> Ops;
+  if (Location.isIndirect()) {
+    Ops.push_back(dwarf::DW_OP_plus);
+    Ops.push_back(Location.getOffset());
+    Ops.push_back(dwarf::DW_OP_deref);
+  }
+  DIExpressionCursor Cursor(Ops);
+  const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo();
+  if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
     return;
+  DwarfExpr.addExpression(std::move(Cursor));
 
   // Now attach the location information to the DIE.
-  addBlock(Die, Attribute, Expr.finalize());
+  addBlock(Die, Attribute, DwarfExpr.finalize());
 }
 
 /// Start with the address based on the location provided, and generate the
@@ -757,23 +805,24 @@ void DwarfCompileUnit::addComplexAddress(const DbgVariable &DV, DIE &Die,
                                          const MachineLocation &Location) {
   DIELoc *Loc = new (DIEValueAllocator) DIELoc;
   DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
-  const DIExpression *Expr = DV.getSingleExpression();
-  DIExpressionCursor ExprCursor(Expr);
+  const DIExpression *DIExpr = DV.getSingleExpression();
+  DwarfExpr.addFragmentOffset(DIExpr);
+
+  SmallVector<uint64_t, 8> Ops;
+  if (Location.isIndirect()) {
+    Ops.push_back(dwarf::DW_OP_plus);
+    Ops.push_back(Location.getOffset());
+    Ops.push_back(dwarf::DW_OP_deref);
+  }
+  Ops.append(DIExpr->elements_begin(), DIExpr->elements_end());
+  DIExpressionCursor Cursor(Ops);
   const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo();
-  auto Reg = Location.getReg();
-  DwarfExpr.addFragmentOffset(Expr);
-  bool ValidReg =
-      Location.getOffset()
-          ? DwarfExpr.AddMachineRegIndirect(TRI, Reg, Location.getOffset())
-          : DwarfExpr.AddMachineRegExpression(TRI, ExprCursor, Reg);
-
-  if (!ValidReg)
+  if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
     return;
-
-  DwarfExpr.AddExpression(std::move(ExprCursor));
+  DwarfExpr.addExpression(std::move(Cursor));
 
   // Now attach the location information to the DIE.
-  addBlock(Die, Attribute, Loc);
+  addBlock(Die, Attribute, DwarfExpr.finalize());
 }
 
 /// Add a Dwarf loclistptr attribute data and value.
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index a8025f1d152196ef3a85977c629a1d7db36fccf2..9a64b4b76b06ec4449e7e7e59b5c71affcd20827 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -210,12 +210,19 @@ public:
   }
 
   /// Add a new global name to the compile unit.
-  void addGlobalName(StringRef Name, DIE &Die, const DIScope *Context) override;
+  void addGlobalName(StringRef Name, const DIE &Die,
+                     const DIScope *Context) override;
+
+  /// Add a new global name present in a type unit to this compile unit.
+  void addGlobalNameForTypeUnit(StringRef Name, const DIScope *Context);
 
   /// Add a new global type to the compile unit.
   void addGlobalType(const DIType *Ty, const DIE &Die,
                      const DIScope *Context) override;
 
+  /// Add a new global type present in a type unit to this compile unit.
+  void addGlobalTypeUnitType(const DIType *Ty, const DIScope *Context);
+
   const StringMap<const DIE *> &getGlobalNames() const { return GlobalNames; }
   const StringMap<const DIE *> &getGlobalTypes() const { return GlobalTypes; }
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index 8e2f4f2551acd617bbb516f542d9ae9d1801e815..5ce11130920885ed49f1e17e33229f336cfbc7f2 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -39,7 +39,6 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Dwarf.h"
-#include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/LEB128.h"
@@ -127,17 +126,17 @@ static const char *const DWARFGroupDescription = "DWARF Emission";
 static const char *const DbgTimerName = "writer";
 static const char *const DbgTimerDescription = "DWARF Debug Writer";
 
-void DebugLocDwarfExpression::EmitOp(uint8_t Op, const char *Comment) {
+void DebugLocDwarfExpression::emitOp(uint8_t Op, const char *Comment) {
   BS.EmitInt8(
       Op, Comment ? Twine(Comment) + " " + dwarf::OperationEncodingString(Op)
                   : dwarf::OperationEncodingString(Op));
 }
 
-void DebugLocDwarfExpression::EmitSigned(int64_t Value) {
+void DebugLocDwarfExpression::emitSigned(int64_t Value) {
   BS.EmitSLEB128(Value, Twine(Value));
 }
 
-void DebugLocDwarfExpression::EmitUnsigned(uint64_t Value) {
+void DebugLocDwarfExpression::emitUnsigned(uint64_t Value) {
   BS.EmitULEB128(Value, Twine(Value));
 }
 
@@ -199,6 +198,21 @@ const DIType *DbgVariable::getType() const {
   return Ty;
 }
 
+ArrayRef<DbgVariable::FrameIndexExpr> DbgVariable::getFrameIndexExprs() const {
+  if (FrameIndexExprs.size() == 1)
+    return FrameIndexExprs;
+  // With several entries, each must be a fragment; order them by bit offset.
+  assert(all_of(FrameIndexExprs,
+                [](const FrameIndexExpr &E) { return E.Expr->isFragment(); }) &&
+         "multiple FI expressions without DW_OP_LLVM_fragment");
+  std::sort(FrameIndexExprs.begin(), FrameIndexExprs.end(),
+            [](const FrameIndexExpr &LHS, const FrameIndexExpr &RHS) {
+              return LHS.Expr->getFragmentInfo()->OffsetInBits <
+                     RHS.Expr->getFragmentInfo()->OffsetInBits;
+            });
+  return FrameIndexExprs;
+}
+
 static const DwarfAccelTable::Atom TypeAtoms[] = {
     DwarfAccelTable::Atom(dwarf::DW_ATOM_die_offset, dwarf::DW_FORM_data4),
     DwarfAccelTable::Atom(dwarf::DW_ATOM_die_tag, dwarf::DW_FORM_data2),
@@ -409,7 +423,14 @@ DwarfDebug::constructDwarfCompileUnit(const DICompileUnit *DIUnit) {
     Asm->OutStreamer->getContext().setMCLineTableCompilationDir(
         NewCU.getUniqueID(), CompilationDir);
 
-  NewCU.addString(Die, dwarf::DW_AT_producer, DIUnit->getProducer());
+  StringRef Producer = DIUnit->getProducer();
+  StringRef Flags = DIUnit->getFlags();
+  if (!Flags.empty()) {
+    std::string ProducerWithFlags = Producer.str() + " " + Flags.str();
+    NewCU.addString(Die, dwarf::DW_AT_producer, ProducerWithFlags);
+  } else
+    NewCU.addString(Die, dwarf::DW_AT_producer, Producer);
+
   NewCU.addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2,
                 DIUnit->getSourceLanguage());
   NewCU.addString(Die, dwarf::DW_AT_name, FN);
@@ -535,7 +556,6 @@ void DwarfDebug::beginModule() {
       // The retained types array by design contains pointers to
       // MDNodes rather than DIRefs. Unique them here.
       if (DIType *RT = dyn_cast<DIType>(Ty))
-        if (!RT->isExternalTypeRef())
           // There is no point in force-emitting a forward declaration.
           CU.getOrCreateTypeDIE(RT);
     }
@@ -731,6 +751,7 @@ DbgVariable *DwarfDebug::getExistingAbstractVariable(InlinedVariable IV) {
 
 void DwarfDebug::createAbstractVariable(const DILocalVariable *Var,
                                         LexicalScope *Scope) {
+  assert(Scope && Scope->isAbstractScope());
   auto AbsDbgVariable = make_unique<DbgVariable>(Var, /* IA */ nullptr);
   InfoHolder.addScopeVariable(Scope, AbsDbgVariable.get());
   AbstractVariables[Var] = std::move(AbsDbgVariable);
@@ -1128,20 +1149,9 @@ static DebugLoc findPrologueEndLoc(const MachineFunction *MF) {
 
 // Gather pre-function debug information.  Assumes being called immediately
 // after the function entry point has been emitted.
-void DwarfDebug::beginFunction(const MachineFunction *MF) {
+void DwarfDebug::beginFunctionImpl(const MachineFunction *MF) {
   CurFn = MF;
 
-  // If there's no debug info for the function we're not going to do anything.
-  if (!MMI->hasDebugInfo())
-    return;
-
-  auto DI = MF->getFunction()->getSubprogram();
-  if (!DI)
-    return;
-
-  // Grab the lexical scopes for the function, if we don't have any of those
-  // then we're not going to be able to do anything.
-  DebugHandlerBase::beginFunction(MF);
   if (LScopes.empty())
     return;
 
@@ -1180,23 +1190,21 @@ void DwarfDebug::beginFunction(const MachineFunction *MF) {
   }
 }
 
+void DwarfDebug::skippedNonDebugFunction() {
+  // A function skipped for lack of debug info leaves a hole in the range
+  // information. Record that by forgetting the previously used compile unit,
+  // and drop the current-function pointer as well.
+  CurFn = nullptr;
+  PrevCU = nullptr;
+}
+
 // Gather and emit post-function debug information.
-void DwarfDebug::endFunction(const MachineFunction *MF) {
+void DwarfDebug::endFunctionImpl(const MachineFunction *MF) {
+  const DISubprogram *SP = MF->getFunction()->getSubprogram();
+
   assert(CurFn == MF &&
       "endFunction should be called with the same function as beginFunction");
 
-  const DISubprogram *SP = MF->getFunction()->getSubprogram();
-  if (!MMI->hasDebugInfo() || !SP ||
-      SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug) {
-    // If we don't have a subprogram for this function then there will be a hole
-    // in the range information. Keep note of this by setting the previously
-    // used section to nullptr.
-    PrevCU = nullptr;
-    CurFn = nullptr;
-    DebugHandlerBase::endFunction(MF);
-    return;
-  }
-
   // Set DwarfDwarfCompileUnitID in MCContext to default value.
   Asm->OutStreamer->getContext().setDwarfCompileUnitID(0);
 
@@ -1213,17 +1221,12 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
   // Under -gmlt, skip building the subprogram if there are no inlined
   // subroutines inside it. But with -fdebug-info-for-profiling, the subprogram
   // is still needed as we need its source location.
-  if (!Asm->TM.Options.DebugInfoForProfiling &&
+  if (!TheCU.getCUNode()->getDebugInfoForProfiling() &&
       TheCU.getCUNode()->getEmissionKind() == DICompileUnit::LineTablesOnly &&
       LScopes.getAbstractScopesList().empty() && !IsDarwin) {
     assert(InfoHolder.getScopeVariables().empty());
-    assert(DbgValues.empty());
-    // FIXME: This wouldn't be true in LTO with a -g (with inlining) CU followed
-    // by a -gmlt CU. Add a test and remove this assertion.
-    assert(AbstractVariables.empty());
     PrevLabel = nullptr;
     CurFn = nullptr;
-    DebugHandlerBase::endFunction(MF);
     return;
   }
 
@@ -1259,7 +1262,6 @@ void DwarfDebug::endFunction(const MachineFunction *MF) {
   InfoHolder.getScopeVariables().clear();
   PrevLabel = nullptr;
   CurFn = nullptr;
-  DebugHandlerBase::endFunction(MF);
 }
 
 // Register a source line with debug info. Returns the  unique label that was
@@ -1354,6 +1356,18 @@ void DwarfDebug::emitAccelTypes() {
 /// computeIndexValue - Compute the gdb index value for the DIE and CU.
 static dwarf::PubIndexEntryDescriptor computeIndexValue(DwarfUnit *CU,
                                                         const DIE *Die) {
+  // Entities that ended up only in a Type Unit reference the CU instead (since
+  // the pub entry has offsets within the CU there's no real offset that can be
+  // provided anyway). As it happens all such entities (namespaces and types,
+  // types only in C++ at that) are rendered as TYPE+EXTERNAL. If this turns out
+  // not to be true it would be necessary to persist this information from the
+  // point at which the entry is added to the index data structure - since by
+  // the time the index is built from that, the original type/namespace DIE in a
+  // type unit has already been destroyed so it can't be queried for properties
+  // like tag, etc.
+  if (Die->getTag() == dwarf::DW_TAG_compile_unit)
+    return dwarf::PubIndexEntryDescriptor(dwarf::GIEK_TYPE,
+                                          dwarf::GIEL_EXTERNAL);
   dwarf::GDBIndexEntryLinkage Linkage = dwarf::GIEL_STATIC;
 
   // We could have a specification DIE that has our most of our knowledge,
@@ -1491,27 +1505,37 @@ static void emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT,
                               ByteStreamer &Streamer,
                               const DebugLocEntry::Value &Value,
                               DwarfExpression &DwarfExpr) {
-  DIExpressionCursor ExprCursor(Value.getExpression());
-  DwarfExpr.addFragmentOffset(Value.getExpression());
+  auto *DIExpr = Value.getExpression();
+  DIExpressionCursor ExprCursor(DIExpr);
+  DwarfExpr.addFragmentOffset(DIExpr);
   // Regular entry.
   if (Value.isInt()) {
     if (BT && (BT->getEncoding() == dwarf::DW_ATE_signed ||
                BT->getEncoding() == dwarf::DW_ATE_signed_char))
-      DwarfExpr.AddSignedConstant(Value.getInt());
+      DwarfExpr.addSignedConstant(Value.getInt());
     else
-      DwarfExpr.AddUnsignedConstant(Value.getInt());
+      DwarfExpr.addUnsignedConstant(Value.getInt());
   } else if (Value.isLocation()) {
-    MachineLocation Loc = Value.getLoc();
+    MachineLocation Location = Value.getLoc();
+
+    SmallVector<uint64_t, 8> Ops;
+    // FIXME: Should this condition be Location.isIndirect() instead?
+    if (Location.getOffset()) {
+      Ops.push_back(dwarf::DW_OP_plus);
+      Ops.push_back(Location.getOffset());
+      Ops.push_back(dwarf::DW_OP_deref);
+    }
+    Ops.append(DIExpr->elements_begin(), DIExpr->elements_end());
+    DIExpressionCursor Cursor(Ops);
     const TargetRegisterInfo &TRI = *AP.MF->getSubtarget().getRegisterInfo();
-    if (Loc.getOffset())
-      DwarfExpr.AddMachineRegIndirect(TRI, Loc.getReg(), Loc.getOffset());
-    else
-      DwarfExpr.AddMachineRegExpression(TRI, ExprCursor, Loc.getReg());
+    if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
+      return;
+    return DwarfExpr.addExpression(std::move(Cursor));
   } else if (Value.isConstantFP()) {
     APInt RawBytes = Value.getConstantFP()->getValueAPF().bitcastToAPInt();
-    DwarfExpr.AddUnsignedConstant(RawBytes);
+    DwarfExpr.addUnsignedConstant(RawBytes);
   }
-  DwarfExpr.AddExpression(std::move(ExprCursor));
+  DwarfExpr.addExpression(std::move(ExprCursor));
 }
 
 void DebugLocEntry::finalize(const AsmPrinter &AP,
@@ -1933,11 +1957,11 @@ uint64_t DwarfDebug::makeTypeSignature(StringRef Identifier) {
   MD5 Hash;
   Hash.update(Identifier);
   // ... take the least significant 8 bytes and return those. Our MD5
-  // implementation always returns its results in little endian, swap bytes
-  // appropriately.
+  // implementation always returns its results in little endian, so we actually
+  // need the "high" word.
   MD5::MD5Result Result;
   Hash.final(Result);
-  return support::endian::read64le(Result + 8);
+  return Result.high();
 }
 
 void DwarfDebug::addDwarfTypeUnitType(DwarfCompileUnit &CU,
diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h
index e5bf33db81fb532dcdd698344f2da08fd793ee68..8a96e7867b6e3da02a28ad9c30c56482a865c422 100644
--- a/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -54,7 +54,7 @@ class MachineModuleInfo;
 ///
 /// Variables can be created from allocas, in which case they're generated from
 /// the MMI table.  Such variables can have multiple expressions and frame
-/// indices.  The \a Expr and \a FrameIndices array must match.
+/// indices.
 ///
 /// Variables can be created from \c DBG_VALUE instructions.  Those whose
 /// location changes over time use \a DebugLocListIndex, while those with a
@@ -64,11 +64,16 @@ class MachineModuleInfo;
 class DbgVariable {
   const DILocalVariable *Var;                /// Variable Descriptor.
   const DILocation *IA;                      /// Inlined at location.
-  SmallVector<const DIExpression *, 1> Expr; /// Complex address.
   DIE *TheDIE = nullptr;                     /// Variable DIE.
   unsigned DebugLocListIndex = ~0u;          /// Offset in DebugLocs.
   const MachineInstr *MInsn = nullptr;       /// DBG_VALUE instruction.
-  SmallVector<int, 1> FrameIndex;            /// Frame index.
+
+  struct FrameIndexExpr {
+    int FI;                   ///< Frame index (0 when built from a DBG_VALUE).
+    const DIExpression *Expr; ///< Expression paired with this entry.
+  };
+  mutable SmallVector<FrameIndexExpr, 1>
+      FrameIndexExprs; /// Frame index + expression.
 
 public:
   /// Construct a DbgVariable.
@@ -80,21 +85,18 @@ public:
 
   /// Initialize from the MMI table.
   void initializeMMI(const DIExpression *E, int FI) {
-    assert(Expr.empty() && "Already initialized?");
-    assert(FrameIndex.empty() && "Already initialized?");
+    assert(FrameIndexExprs.empty() && "Already initialized?");
     assert(!MInsn && "Already initialized?");
 
     assert((!E || E->isValid()) && "Expected valid expression");
-    assert(~FI && "Expected valid index");
+    assert(FI != INT_MAX && "Expected valid index");
 
-    Expr.push_back(E);
-    FrameIndex.push_back(FI);
+    FrameIndexExprs.push_back({FI, E});
   }
 
   /// Initialize from a DBG_VALUE instruction.
   void initializeDbgValue(const MachineInstr *DbgValue) {
-    assert(Expr.empty() && "Already initialized?");
-    assert(FrameIndex.empty() && "Already initialized?");
+    assert(FrameIndexExprs.empty() && "Already initialized?");
     assert(!MInsn && "Already initialized?");
 
     assert(Var == DbgValue->getDebugVariable() && "Wrong variable");
@@ -103,16 +105,15 @@ public:
     MInsn = DbgValue;
     if (auto *E = DbgValue->getDebugExpression())
       if (E->getNumElements())
-        Expr.push_back(E);
+        FrameIndexExprs.push_back({0, E});
   }
 
   // Accessors.
   const DILocalVariable *getVariable() const { return Var; }
   const DILocation *getInlinedAt() const { return IA; }
-  ArrayRef<const DIExpression *> getExpression() const { return Expr; }
   const DIExpression *getSingleExpression() const {
-    assert(MInsn && Expr.size() <= 1);
-    return Expr.size() ? Expr[0] : nullptr;
+    assert(MInsn && FrameIndexExprs.size() <= 1);
+    return FrameIndexExprs.size() ? FrameIndexExprs[0].Expr : nullptr;
   }
   void setDIE(DIE &D) { TheDIE = &D; }
   DIE *getDIE() const { return TheDIE; }
@@ -120,7 +121,9 @@ public:
   unsigned getDebugLocListIndex() const { return DebugLocListIndex; }
   StringRef getName() const { return Var->getName(); }
   const MachineInstr *getMInsn() const { return MInsn; }
-  ArrayRef<int> getFrameIndex() const { return FrameIndex; }
+  /// Get the FI entries, sorted by fragment offset.
+  ArrayRef<FrameIndexExpr> getFrameIndexExprs() const;
+  bool hasFrameIndexExprs() const { return !FrameIndexExprs.empty(); }
 
   void addMMIEntry(const DbgVariable &V) {
     assert(DebugLocListIndex == ~0U && !MInsn && "not an MMI entry");
@@ -128,16 +131,15 @@ public:
     assert(V.Var == Var && "conflicting variable");
     assert(V.IA == IA && "conflicting inlined-at location");
 
-    assert(!FrameIndex.empty() && "Expected an MMI entry");
-    assert(!V.FrameIndex.empty() && "Expected an MMI entry");
-    assert(Expr.size() == FrameIndex.size() && "Mismatched expressions");
-    assert(V.Expr.size() == V.FrameIndex.size() && "Mismatched expressions");
+    assert(!FrameIndexExprs.empty() && "Expected an MMI entry");
+    assert(!V.FrameIndexExprs.empty() && "Expected an MMI entry");
 
-    Expr.append(V.Expr.begin(), V.Expr.end());
-    FrameIndex.append(V.FrameIndex.begin(), V.FrameIndex.end());
-    assert(all_of(Expr, [](const DIExpression *E) {
-             return E && E->isFragment();
-           }) && "conflicting locations for variable");
+    FrameIndexExprs.append(V.FrameIndexExprs.begin(), V.FrameIndexExprs.end());
+    assert(all_of(FrameIndexExprs,
+                  [](FrameIndexExpr &FIE) {
+                    return FIE.Expr && FIE.Expr->isFragment();
+                  }) &&
+           "conflicting locations for variable");
   }
 
   // Translate tag to proper Dwarf tag.
@@ -167,11 +169,11 @@ public:
 
   bool hasComplexAddress() const {
     assert(MInsn && "Expected DBG_VALUE, not MMI variable");
-    assert(FrameIndex.empty() && "Expected DBG_VALUE, not MMI variable");
-    assert(
-        (Expr.empty() || (Expr.size() == 1 && Expr.back()->getNumElements())) &&
-        "Invalid Expr for DBG_VALUE");
-    return !Expr.empty();
+    assert((FrameIndexExprs.empty() ||
+            (FrameIndexExprs.size() == 1 &&
+             FrameIndexExprs[0].Expr->getNumElements())) &&
+           "Invalid Expr for DBG_VALUE");
+    return !FrameIndexExprs.empty();
   }
   bool isBlockByrefVariable() const;
   const DIType *getType() const;
@@ -446,6 +448,15 @@ class DwarfDebug : public DebugHandlerBase {
   /// Collect variable information from the side table maintained by MF.
   void collectVariableInfoFromMFTable(DenseSet<InlinedVariable> &P);
 
+protected:
+  /// Gather pre-function debug information.
+  void beginFunctionImpl(const MachineFunction *MF) override;
+
+  /// Gather and emit post-function debug information.
+  void endFunctionImpl(const MachineFunction *MF) override;
+
+  void skippedNonDebugFunction() override;
+
 public:
   //===--------------------------------------------------------------------===//
   // Main entry points.
@@ -461,12 +472,6 @@ public:
   /// Emit all Dwarf sections that should come after the content.
   void endModule() override;
 
-  /// Gather pre-function debug information.
-  void beginFunction(const MachineFunction *MF) override;
-
-  /// Gather and emit post-function debug information.
-  void endFunction(const MachineFunction *MF) override;
-
   /// Process beginning of an instruction.
   void beginInstruction(const MachineInstr *MI) override;
 
diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
index c584e5a92da61256f7bc61f45a80dd96259ce339..debe88f3b1ee168f3316138515f4b96303ea9f81 100644
--- a/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -22,77 +22,76 @@
 
 using namespace llvm;
 
-void DwarfExpression::AddReg(int DwarfReg, const char *Comment) {
+void DwarfExpression::addReg(int DwarfReg, const char *Comment) {
   assert(DwarfReg >= 0 && "invalid negative dwarf register number");
   if (DwarfReg < 32) {
-    EmitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment);
+    emitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment);
   } else {
-    EmitOp(dwarf::DW_OP_regx, Comment);
-    EmitUnsigned(DwarfReg);
+    emitOp(dwarf::DW_OP_regx, Comment);
+    emitUnsigned(DwarfReg);
   }
 }
 
-void DwarfExpression::AddRegIndirect(int DwarfReg, int Offset) {
+void DwarfExpression::addBReg(int DwarfReg, int Offset) {
   assert(DwarfReg >= 0 && "invalid negative dwarf register number");
   if (DwarfReg < 32) {
-    EmitOp(dwarf::DW_OP_breg0 + DwarfReg);
+    emitOp(dwarf::DW_OP_breg0 + DwarfReg);
   } else {
-    EmitOp(dwarf::DW_OP_bregx);
-    EmitUnsigned(DwarfReg);
+    emitOp(dwarf::DW_OP_bregx);
+    emitUnsigned(DwarfReg);
   }
-  EmitSigned(Offset);
+  emitSigned(Offset);
 }
 
-void DwarfExpression::AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits) {
+void DwarfExpression::addFBReg(int Offset) {
+  emitOp(dwarf::DW_OP_fbreg);
+  emitSigned(Offset);
+}
+
+void DwarfExpression::addOpPiece(unsigned SizeInBits, unsigned OffsetInBits) {
   if (!SizeInBits)
     return;
 
   const unsigned SizeOfByte = 8;
   if (OffsetInBits > 0 || SizeInBits % SizeOfByte) {
-    EmitOp(dwarf::DW_OP_bit_piece);
-    EmitUnsigned(SizeInBits);
-    EmitUnsigned(OffsetInBits);
+    emitOp(dwarf::DW_OP_bit_piece);
+    emitUnsigned(SizeInBits);
+    emitUnsigned(OffsetInBits);
   } else {
-    EmitOp(dwarf::DW_OP_piece);
+    emitOp(dwarf::DW_OP_piece);
     unsigned ByteSize = SizeInBits / SizeOfByte;
-    EmitUnsigned(ByteSize);
+    emitUnsigned(ByteSize);
   }
   this->OffsetInBits += SizeInBits;
 }
 
-void DwarfExpression::AddShr(unsigned ShiftBy) {
-  EmitOp(dwarf::DW_OP_constu);
-  EmitUnsigned(ShiftBy);
-  EmitOp(dwarf::DW_OP_shr);
+void DwarfExpression::addShr(unsigned ShiftBy) {
+  emitOp(dwarf::DW_OP_constu);
+  emitUnsigned(ShiftBy);
+  emitOp(dwarf::DW_OP_shr);
 }
 
-bool DwarfExpression::AddMachineRegIndirect(const TargetRegisterInfo &TRI,
-                                            unsigned MachineReg, int Offset) {
-  if (isFrameRegister(TRI, MachineReg)) {
-    // If variable offset is based in frame register then use fbreg.
-    EmitOp(dwarf::DW_OP_fbreg);
-    EmitSigned(Offset);
-    return true;
-  }
-
-  int DwarfReg = TRI.getDwarfRegNum(MachineReg, false);
-  if (DwarfReg < 0)
-    return false;
-
-  AddRegIndirect(DwarfReg, Offset);
-  return true;
+void DwarfExpression::addAnd(unsigned Mask) {
+  emitOp(dwarf::DW_OP_constu);
+  emitUnsigned(Mask);
+  emitOp(dwarf::DW_OP_and);
 }
 
-bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI,
+bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI,
                                     unsigned MachineReg, unsigned MaxSize) {
-  if (!TRI.isPhysicalRegister(MachineReg))
+  if (!TRI.isPhysicalRegister(MachineReg)) {
+    if (isFrameRegister(TRI, MachineReg)) {
+      DwarfRegs.push_back({-1, 0, nullptr});
+      return true;
+    }
     return false;
+  }
 
   int Reg = TRI.getDwarfRegNum(MachineReg, false);
 
   // If this is a valid register number, emit it.
   if (Reg >= 0) {
-    AddReg(Reg);
+    DwarfRegs.push_back({Reg, 0, nullptr});
     return true;
   }
 
@@ -104,7 +103,7 @@ bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI,
       unsigned Idx = TRI.getSubRegIndex(*SR, MachineReg);
       unsigned Size = TRI.getSubRegIdxSize(Idx);
       unsigned RegOffset = TRI.getSubRegIdxOffset(Idx);
-      AddReg(Reg, "super-register");
+      DwarfRegs.push_back({Reg, 0, "super-register"});
       // Use a DW_OP_bit_piece to describe the sub-register.
       setSubRegisterPiece(Size, RegOffset);
       return true;
@@ -134,72 +133,101 @@ bool DwarfExpression::AddMachineReg(const TargetRegisterInfo &TRI,
     // If this sub-register has a DWARF number and we haven't covered
     // its range, emit a DWARF piece for it.
     if (Reg >= 0 && Intersection.any()) {
-      AddReg(Reg, "sub-register");
+      // Emit a piece for any gap in the coverage.
+      if (Offset > CurPos)
+        DwarfRegs.push_back({-1, Offset - CurPos, nullptr});
+      DwarfRegs.push_back(
+          {Reg, std::min<unsigned>(Size, MaxSize - Offset), "sub-register"});
       if (Offset >= MaxSize)
 	break;
-      // Emit a piece for the any gap in the coverage.
-      if (Offset > CurPos)
-        AddOpPiece(Offset - CurPos);
-      AddOpPiece(std::min<unsigned>(Size, MaxSize - Offset));
-      CurPos = Offset + Size;
 
       // Mark it as emitted.
       Coverage.set(Offset, Offset + Size);
+      CurPos = Offset + Size;
     }
   }
 
   return CurPos;
 }
 
-void DwarfExpression::AddStackValue() {
+void DwarfExpression::addStackValue() {
   if (DwarfVersion >= 4)
-    EmitOp(dwarf::DW_OP_stack_value);
+    emitOp(dwarf::DW_OP_stack_value);
 }
 
-void DwarfExpression::AddSignedConstant(int64_t Value) {
-  EmitOp(dwarf::DW_OP_consts);
-  EmitSigned(Value);
-  AddStackValue();
+void DwarfExpression::addSignedConstant(int64_t Value) {
+  emitOp(dwarf::DW_OP_consts);
+  emitSigned(Value);
+  addStackValue();
 }
 
-void DwarfExpression::AddUnsignedConstant(uint64_t Value) {
-  EmitOp(dwarf::DW_OP_constu);
-  EmitUnsigned(Value);
-  AddStackValue();
+void DwarfExpression::addUnsignedConstant(uint64_t Value) {
+  emitOp(dwarf::DW_OP_constu);
+  emitUnsigned(Value);
+  addStackValue();
 }
 
-void DwarfExpression::AddUnsignedConstant(const APInt &Value) {
+void DwarfExpression::addUnsignedConstant(const APInt &Value) {
   unsigned Size = Value.getBitWidth();
   const uint64_t *Data = Value.getRawData();
 
   // Chop it up into 64-bit pieces, because that's the maximum that
-  // AddUnsignedConstant takes.
+  // addUnsignedConstant takes.
   unsigned Offset = 0;
   while (Offset < Size) {
-    AddUnsignedConstant(*Data++);
+    addUnsignedConstant(*Data++);
     if (Offset == 0 && Size <= 64)
       break;
-    AddOpPiece(std::min(Size-Offset, 64u), Offset);
+    addOpPiece(std::min(Size-Offset, 64u), Offset);
     Offset += 64;
   }
 }
 
-bool DwarfExpression::AddMachineRegExpression(const TargetRegisterInfo &TRI,
+bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI,
                                               DIExpressionCursor &ExprCursor,
                                               unsigned MachineReg,
                                               unsigned FragmentOffsetInBits) {
-  if (!ExprCursor)
-    return AddMachineReg(TRI, MachineReg);
+  auto Fragment = ExprCursor.getFragmentInfo();
+  if (!addMachineReg(TRI, MachineReg, Fragment ? Fragment->SizeInBits : ~1U))
+    return false;
 
-  // Pattern-match combinations for which more efficient representations exist
-  // first.
-  bool ValidReg = false;
+  bool HasComplexExpression = false;
   auto Op = ExprCursor.peek();
+  if (Op && Op->getOp() != dwarf::DW_OP_LLVM_fragment)
+    HasComplexExpression = true;
+
+  // If the register can only be described by a complex expression (i.e.,
+  // multiple subregisters) it doesn't safely compose with another complex
+  // expression. For example, it is not possible to apply a DW_OP_deref
+  // operation to multiple DW_OP_pieces.
+  if (HasComplexExpression && DwarfRegs.size() > 1) {
+    DwarfRegs.clear();
+    return false;
+  }
+
+  // Handle simple register locations.
+  if (!HasComplexExpression) {
+    for (auto &Reg : DwarfRegs) {
+      if (Reg.DwarfRegNo >= 0)
+        addReg(Reg.DwarfRegNo, Reg.Comment);
+      addOpPiece(Reg.Size);
+    }
+    DwarfRegs.clear();
+    return true;
+  }
+
+  assert(DwarfRegs.size() == 1);
+  auto Reg = DwarfRegs[0];
+  bool FBReg = isFrameRegister(TRI, MachineReg); 
+  assert(Reg.Size == 0 && "subregister has same size as superregister");
+
+  // Pattern-match combinations for which more efficient representations exist.
   switch (Op->getOp()) {
   default: {
-    auto Fragment = ExprCursor.getFragmentInfo();
-    ValidReg = AddMachineReg(TRI, MachineReg,
-			     Fragment ? Fragment->SizeInBits : ~1U);
+    if (FBReg)
+      addFBReg(0);
+    else
+      addReg(Reg.DwarfRegNo, 0);
     break;
   }
   case dwarf::DW_OP_plus:
@@ -208,28 +236,42 @@ bool DwarfExpression::AddMachineRegExpression(const TargetRegisterInfo &TRI,
     // [DW_OP_reg,Offset,DW_OP_minus,DW_OP_deref] --> [DW_OP_breg,-Offset].
     auto N = ExprCursor.peekNext();
     if (N && N->getOp() == dwarf::DW_OP_deref) {
-      unsigned Offset = Op->getArg(0);
-      ValidReg = AddMachineRegIndirect(
-          TRI, MachineReg, Op->getOp() == dwarf::DW_OP_plus ? Offset : -Offset);
+      int Offset = Op->getArg(0);
+      int SignedOffset = (Op->getOp() == dwarf::DW_OP_plus) ? Offset : -Offset;
+      if (FBReg)
+        addFBReg(SignedOffset);
+      else
+        addBReg(Reg.DwarfRegNo, SignedOffset);
+
       ExprCursor.consume(2);
-    } else
-      ValidReg = AddMachineReg(TRI, MachineReg);
+      break;
+    }
+    addReg(Reg.DwarfRegNo, 0);
     break;
   }
   case dwarf::DW_OP_deref:
     // [DW_OP_reg,DW_OP_deref] --> [DW_OP_breg].
-    ValidReg = AddMachineRegIndirect(TRI, MachineReg);
+    if (FBReg)
+      addFBReg(0);
+    else
+      addBReg(Reg.DwarfRegNo, 0);
     ExprCursor.take();
     break;
   }
-
-  return ValidReg;
+  DwarfRegs.clear();
+  return true;
 }
 
-void DwarfExpression::AddExpression(DIExpressionCursor &&ExprCursor,
+void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor,
                                     unsigned FragmentOffsetInBits) {
   while (ExprCursor) {
     auto Op = ExprCursor.take();
+
+    // If we need to mask out a subregister, do it now, unless the next
+    // operation would emit an OpPiece anyway.
+    if (SubRegisterSizeInBits && Op->getOp() != dwarf::DW_OP_LLVM_fragment)
+      maskSubRegister();
+
     switch (Op->getOp()) {
     case dwarf::DW_OP_LLVM_fragment: {
       unsigned SizeInBits = Op->getArg(1);
@@ -239,39 +281,45 @@ void DwarfExpression::AddExpression(DIExpressionCursor &&ExprCursor,
       // location.
       assert(OffsetInBits >= FragmentOffset && "fragment offset not added?");
 
-      // If \a AddMachineReg already emitted DW_OP_piece operations to represent
+      // If \a addMachineReg already emitted DW_OP_piece operations to represent
       // a super-register by splicing together sub-registers, subtract the size
       // of the pieces that was already emitted.
       SizeInBits -= OffsetInBits - FragmentOffset;
 
-      // If \a AddMachineReg requested a DW_OP_bit_piece to stencil out a
+      // If \a addMachineReg requested a DW_OP_bit_piece to stencil out a
       // sub-register that is smaller than the current fragment's size, use it.
       if (SubRegisterSizeInBits)
         SizeInBits = std::min<unsigned>(SizeInBits, SubRegisterSizeInBits);
       
-      AddOpPiece(SizeInBits, SubRegisterOffsetInBits);
+      addOpPiece(SizeInBits, SubRegisterOffsetInBits);
       setSubRegisterPiece(0, 0);
       break;
     }
     case dwarf::DW_OP_plus:
-      EmitOp(dwarf::DW_OP_plus_uconst);
-      EmitUnsigned(Op->getArg(0));
+      emitOp(dwarf::DW_OP_plus_uconst);
+      emitUnsigned(Op->getArg(0));
       break;
     case dwarf::DW_OP_minus:
       // There is no OP_minus_uconst.
-      EmitOp(dwarf::DW_OP_constu);
-      EmitUnsigned(Op->getArg(0));
-      EmitOp(dwarf::DW_OP_minus);
+      emitOp(dwarf::DW_OP_constu);
+      emitUnsigned(Op->getArg(0));
+      emitOp(dwarf::DW_OP_minus);
       break;
     case dwarf::DW_OP_deref:
-      EmitOp(dwarf::DW_OP_deref);
+      emitOp(dwarf::DW_OP_deref);
       break;
     case dwarf::DW_OP_constu:
-      EmitOp(dwarf::DW_OP_constu);
-      EmitUnsigned(Op->getArg(0));
+      emitOp(dwarf::DW_OP_constu);
+      emitUnsigned(Op->getArg(0));
       break;
     case dwarf::DW_OP_stack_value:
-      AddStackValue();
+      addStackValue();
+      break;
+    case dwarf::DW_OP_swap:
+      emitOp(dwarf::DW_OP_swap);
+      break;
+    case dwarf::DW_OP_xderef:
+      emitOp(dwarf::DW_OP_xderef);
       break;
     default:
       llvm_unreachable("unhandled opcode found in expression");
@@ -279,9 +327,25 @@ void DwarfExpression::AddExpression(DIExpressionCursor &&ExprCursor,
   }
 }
 
+/// Add masking operations to stencil out a subregister.
+void DwarfExpression::maskSubRegister() {
+  assert(SubRegisterSizeInBits && "no subregister was registered");
+  if (SubRegisterOffsetInBits > 0)
+    addShr(SubRegisterOffsetInBits);
+  uint64_t Mask = (1ULL << (uint64_t)SubRegisterSizeInBits) - 1ULL;
+  addAnd(Mask);
+}
+
+
 void DwarfExpression::finalize() {
-  if (SubRegisterSizeInBits)
-    AddOpPiece(SubRegisterSizeInBits, SubRegisterOffsetInBits);
+  assert(DwarfRegs.size() == 0 && "dwarf registers not emitted");
+  // Emit any outstanding DW_OP_piece operations to mask out subregisters.
+  if (SubRegisterSizeInBits == 0)
+    return;
+  // Don't emit a DW_OP_piece for a subregister at offset 0.
+  if (SubRegisterOffsetInBits == 0)
+    return;
+  addOpPiece(SubRegisterSizeInBits, SubRegisterOffsetInBits);
 }
 
 void DwarfExpression::addFragmentOffset(const DIExpression *Expr) {
@@ -292,6 +356,6 @@ void DwarfExpression::addFragmentOffset(const DIExpression *Expr) {
   assert(FragmentOffset >= OffsetInBits &&
          "overlapping or duplicate fragments");
   if (FragmentOffset > OffsetInBits)
-    AddOpPiece(FragmentOffset - OffsetInBits);
+    addOpPiece(FragmentOffset - OffsetInBits);
   OffsetInBits = FragmentOffset;
 }
diff --git a/lib/CodeGen/AsmPrinter/DwarfExpression.h b/lib/CodeGen/AsmPrinter/DwarfExpression.h
index 01f66fa7eac4e609f38b9e570eb8ee950f333f8d..e8dc211eb3c22778739ab0f7157804eb708d457b 100644
--- a/lib/CodeGen/AsmPrinter/DwarfExpression.h
+++ b/lib/CodeGen/AsmPrinter/DwarfExpression.h
@@ -84,9 +84,19 @@ public:
 /// entry.
 class DwarfExpression {
 protected:
-  unsigned DwarfVersion;
+  /// Holds information about all subregisters comprising a register location.
+  struct Register {
+    int DwarfRegNo;
+    unsigned Size;
+    const char *Comment;
+  };
+
+  /// The register location, if any.
+  SmallVector<Register, 2> DwarfRegs;
+
   /// Current Fragment Offset in Bits.
   uint64_t OffsetInBits = 0;
+  unsigned DwarfVersion;
 
   /// Sometimes we need to add a DW_OP_bit_piece to describe a subregister. 
   unsigned SubRegisterSizeInBits = 0;
@@ -99,35 +109,54 @@ protected:
     SubRegisterOffsetInBits = OffsetInBits;
   }
 
-public:
-  DwarfExpression(unsigned DwarfVersion) : DwarfVersion(DwarfVersion) {}
-  virtual ~DwarfExpression() {};
-
-  /// This needs to be called last to commit any pending changes.
-  void finalize();
+  /// Add masking operations to stencil out a subregister.
+  void maskSubRegister();
 
   /// Output a dwarf operand and an optional assembler comment.
-  virtual void EmitOp(uint8_t Op, const char *Comment = nullptr) = 0;
+  virtual void emitOp(uint8_t Op, const char *Comment = nullptr) = 0;
   /// Emit a raw signed value.
-  virtual void EmitSigned(int64_t Value) = 0;
+  virtual void emitSigned(int64_t Value) = 0;
   /// Emit a raw unsigned value.
-  virtual void EmitUnsigned(uint64_t Value) = 0;
+  virtual void emitUnsigned(uint64_t Value) = 0;
   /// Return whether the given machine register is the frame register in the
   /// current function.
   virtual bool isFrameRegister(const TargetRegisterInfo &TRI, unsigned MachineReg) = 0;
 
-  /// Emit a dwarf register operation.
-  void AddReg(int DwarfReg, const char *Comment = nullptr);
-  /// Emit an (double-)indirect dwarf register operation.
-  void AddRegIndirect(int DwarfReg, int Offset);
+  /// Emit a DW_OP_reg operation.
+  void addReg(int DwarfReg, const char *Comment = nullptr);
+  /// Emit a DW_OP_breg operation.
+  void addBReg(int DwarfReg, int Offset);
+  /// Emit DW_OP_fbreg <Offset>.
+  void addFBReg(int Offset);
+
+  /// Emit a partial DWARF register operation.
+  ///
+  /// \param MachineReg           The register number.
+  /// \param MaxSize              If the register must be composed from
+  ///                             sub-registers this is an upper bound
+  ///                             for how many bits the emitted DW_OP_piece
+  ///                             may cover.
+  ///
+  /// If size and offset are zero an operation for the entire register is
+  /// emitted: Some targets do not provide a DWARF register number for every
+  /// register.  If this is the case, this function will attempt to emit a DWARF
+  /// register by emitting a fragment of a super-register or by piecing together
+  /// multiple subregisters that alias the register.
+  ///
+  /// \return false if no DWARF register exists for MachineReg.
+  bool addMachineReg(const TargetRegisterInfo &TRI, unsigned MachineReg,
+                     unsigned MaxSize = ~1U);
+
 
   /// Emit a DW_OP_piece or DW_OP_bit_piece operation for a variable fragment.
   /// \param OffsetInBits    This is an optional offset into the location that
   /// is at the top of the DWARF stack.
-  void AddOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0);
+  void addOpPiece(unsigned SizeInBits, unsigned OffsetInBits = 0);
 
-  /// Emit a shift-right dwarf expression.
-  void AddShr(unsigned ShiftBy);
+  /// Emit a shift-right dwarf operation.
+  void addShr(unsigned ShiftBy);
+  /// Emit a bitwise AND dwarf operation.
+  void addAnd(unsigned Mask);
 
   /// Emit a DW_OP_stack_value, if supported.
   ///
@@ -140,37 +169,21 @@ public:
   /// constant value, so the producers and consumers started to rely on
   /// heuristics to disambiguate the value vs. location status of the
   /// expression.  See PR21176 for more details.
-  void AddStackValue();
+  void addStackValue();
 
-  /// Emit an indirect dwarf register operation for the given machine register.
-  /// \return false if no DWARF register exists for MachineReg.
-  bool AddMachineRegIndirect(const TargetRegisterInfo &TRI, unsigned MachineReg,
-                             int Offset = 0);
+  ~DwarfExpression() = default;
+public:
+  DwarfExpression(unsigned DwarfVersion) : DwarfVersion(DwarfVersion) {}
 
-  /// Emit a partial DWARF register operation.
-  ///
-  /// \param MachineReg           The register number.
-  /// \param MaxSize              If the register must be composed from
-  ///                             sub-registers this is an upper bound
-  ///                             for how many bits the emitted DW_OP_piece
-  ///                             may cover.
-  ///
-  /// If size and offset is zero an operation for the entire register is
-  /// emitted: Some targets do not provide a DWARF register number for every
-  /// register.  If this is the case, this function will attempt to emit a DWARF
-  /// register by emitting a fragment of a super-register or by piecing together
-  /// multiple subregisters that alias the register.
-  ///
-  /// \return false if no DWARF register exists for MachineReg.
-  bool AddMachineReg(const TargetRegisterInfo &TRI, unsigned MachineReg,
-                     unsigned MaxSize = ~1U);
+  /// This needs to be called last to commit any pending changes.
+  void finalize();
 
   /// Emit a signed constant.
-  void AddSignedConstant(int64_t Value);
+  void addSignedConstant(int64_t Value);
   /// Emit an unsigned constant.
-  void AddUnsignedConstant(uint64_t Value);
+  void addUnsignedConstant(uint64_t Value);
   /// Emit an unsigned constant.
-  void AddUnsignedConstant(const APInt &Value);
+  void addUnsignedConstant(const APInt &Value);
 
   /// Emit a machine register location. As an optimization this may also consume
   /// the prefix of a DwarfExpression if a more efficient representation for
@@ -181,7 +194,7 @@ public:
   ///                                 fragment inside the entire variable.
   /// \return                         false if no DWARF register exists
   ///                                 for MachineReg.
-  bool AddMachineRegExpression(const TargetRegisterInfo &TRI,
+  bool addMachineRegExpression(const TargetRegisterInfo &TRI,
                                DIExpressionCursor &Expr, unsigned MachineReg,
                                unsigned FragmentOffsetInBits = 0);
   /// Emit all remaining operations in the DIExpressionCursor.
@@ -189,7 +202,7 @@ public:
   /// \param FragmentOffsetInBits     If this is one fragment out of multiple
   ///                                 locations, this is the offset of the
   ///                                 fragment inside the entire variable.
-  void AddExpression(DIExpressionCursor &&Expr,
+  void addExpression(DIExpressionCursor &&Expr,
                      unsigned FragmentOffsetInBits = 0);
 
   /// If applicable, emit an empty DW_OP_piece / DW_OP_bit_piece to advance to
@@ -198,33 +211,32 @@ public:
 };
 
 /// DwarfExpression implementation for .debug_loc entries.
-class DebugLocDwarfExpression : public DwarfExpression {
+class DebugLocDwarfExpression final : public DwarfExpression {
   ByteStreamer &BS;
 
+  void emitOp(uint8_t Op, const char *Comment = nullptr) override;
+  void emitSigned(int64_t Value) override;
+  void emitUnsigned(uint64_t Value) override;
+  bool isFrameRegister(const TargetRegisterInfo &TRI,
+                       unsigned MachineReg) override;
 public:
   DebugLocDwarfExpression(unsigned DwarfVersion, ByteStreamer &BS)
       : DwarfExpression(DwarfVersion), BS(BS) {}
-
-  void EmitOp(uint8_t Op, const char *Comment = nullptr) override;
-  void EmitSigned(int64_t Value) override;
-  void EmitUnsigned(uint64_t Value) override;
-  bool isFrameRegister(const TargetRegisterInfo &TRI,
-                       unsigned MachineReg) override;
 };
 
 /// DwarfExpression implementation for singular DW_AT_location.
-class DIEDwarfExpression : public DwarfExpression {
+class DIEDwarfExpression final : public DwarfExpression {
 const AsmPrinter &AP;
   DwarfUnit &DU;
   DIELoc &DIE;
 
-public:
-  DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, DIELoc &DIE);
-  void EmitOp(uint8_t Op, const char *Comment = nullptr) override;
-  void EmitSigned(int64_t Value) override;
-  void EmitUnsigned(uint64_t Value) override;
+  void emitOp(uint8_t Op, const char *Comment = nullptr) override;
+  void emitSigned(int64_t Value) override;
+  void emitUnsigned(uint64_t Value) override;
   bool isFrameRegister(const TargetRegisterInfo &TRI,
                        unsigned MachineReg) override;
+public:
+  DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU, DIELoc &DIE);
   DIELoc *finalize() {
     DwarfExpression::finalize();
     return &DIE;
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 4269bb672a9f60971d2c4f71c238b01dce0d58f2..bad5b09553cdc123fc48b9952482bbf5ad3b09a7 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -54,15 +54,15 @@ DIEDwarfExpression::DIEDwarfExpression(const AsmPrinter &AP, DwarfUnit &DU,
     : DwarfExpression(AP.getDwarfVersion()), AP(AP), DU(DU),
       DIE(DIE) {}
 
-void DIEDwarfExpression::EmitOp(uint8_t Op, const char* Comment) {
+void DIEDwarfExpression::emitOp(uint8_t Op, const char* Comment) {
   DU.addUInt(DIE, dwarf::DW_FORM_data1, Op);
 }
 
-void DIEDwarfExpression::EmitSigned(int64_t Value) {
+void DIEDwarfExpression::emitSigned(int64_t Value) {
   DU.addSInt(DIE, dwarf::DW_FORM_sdata, Value);
 }
 
-void DIEDwarfExpression::EmitUnsigned(uint64_t Value) {
+void DIEDwarfExpression::emitUnsigned(uint64_t Value) {
   DU.addUInt(DIE, dwarf::DW_FORM_udata, Value);
 }
 
@@ -98,25 +98,35 @@ int64_t DwarfUnit::getDefaultLowerBound() const {
   default:
     break;
 
-  case dwarf::DW_LANG_C89:
-  case dwarf::DW_LANG_C99:
+  // The languages below have valid values in all DWARF versions.
   case dwarf::DW_LANG_C:
+  case dwarf::DW_LANG_C89:
   case dwarf::DW_LANG_C_plus_plus:
-  case dwarf::DW_LANG_ObjC:
-  case dwarf::DW_LANG_ObjC_plus_plus:
     return 0;
 
   case dwarf::DW_LANG_Fortran77:
   case dwarf::DW_LANG_Fortran90:
-  case dwarf::DW_LANG_Fortran95:
     return 1;
 
-  // The languages below have valid values only if the DWARF version >= 4.
+  // The languages below have valid values only if the DWARF version >= 3.
+  case dwarf::DW_LANG_C99:
+  case dwarf::DW_LANG_ObjC:
+  case dwarf::DW_LANG_ObjC_plus_plus:
+    if (DD->getDwarfVersion() >= 3)
+      return 0;
+    break;
+
+  case dwarf::DW_LANG_Fortran95:
+    if (DD->getDwarfVersion() >= 3)
+      return 1;
+    break;
+
+  // Starting with DWARF v4, all defined languages have valid values.
+  case dwarf::DW_LANG_D:
   case dwarf::DW_LANG_Java:
   case dwarf::DW_LANG_Python:
   case dwarf::DW_LANG_UPC:
-  case dwarf::DW_LANG_D:
-    if (dwarf::DWARF_VERSION >= 4)
+    if (DD->getDwarfVersion() >= 4)
       return 0;
     break;
 
@@ -127,31 +137,33 @@ int64_t DwarfUnit::getDefaultLowerBound() const {
   case dwarf::DW_LANG_Modula2:
   case dwarf::DW_LANG_Pascal83:
   case dwarf::DW_LANG_PLI:
-    if (dwarf::DWARF_VERSION >= 4)
+    if (DD->getDwarfVersion() >= 4)
       return 1;
     break;
 
-  // The languages below have valid values only if the DWARF version >= 5.
-  case dwarf::DW_LANG_OpenCL:
-  case dwarf::DW_LANG_Go:
-  case dwarf::DW_LANG_Haskell:
+  // The languages below are new in DWARF v5.
+  case dwarf::DW_LANG_BLISS:
+  case dwarf::DW_LANG_C11:
   case dwarf::DW_LANG_C_plus_plus_03:
   case dwarf::DW_LANG_C_plus_plus_11:
+  case dwarf::DW_LANG_C_plus_plus_14:
+  case dwarf::DW_LANG_Dylan:
+  case dwarf::DW_LANG_Go:
+  case dwarf::DW_LANG_Haskell:
   case dwarf::DW_LANG_OCaml:
+  case dwarf::DW_LANG_OpenCL:
+  case dwarf::DW_LANG_RenderScript:
   case dwarf::DW_LANG_Rust:
-  case dwarf::DW_LANG_C11:
   case dwarf::DW_LANG_Swift:
-  case dwarf::DW_LANG_Dylan:
-  case dwarf::DW_LANG_C_plus_plus_14:
-    if (dwarf::DWARF_VERSION >= 5)
+    if (DD->getDwarfVersion() >= 5)
       return 0;
     break;
 
-  case dwarf::DW_LANG_Modula3:
-  case dwarf::DW_LANG_Julia:
   case dwarf::DW_LANG_Fortran03:
   case dwarf::DW_LANG_Fortran08:
-    if (dwarf::DWARF_VERSION >= 5)
+  case dwarf::DW_LANG_Julia:
+  case dwarf::DW_LANG_Modula3:
+    if (DD->getDwarfVersion() >= 5)
       return 1;
     break;
   }
@@ -285,13 +297,6 @@ void DwarfUnit::addDIETypeSignature(DIE &Die, uint64_t Signature) {
                dwarf::DW_FORM_ref_sig8, DIEInteger(Signature));
 }
 
-void DwarfUnit::addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute,
-                                    StringRef Identifier) {
-  uint64_t Signature = DD->makeTypeSignature(Identifier);
-  Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_ref_sig8,
-               DIEInteger(Signature));
-}
-
 void DwarfUnit::addDIEEntry(DIE &Die, dwarf::Attribute Attribute,
                             DIEEntry Entry) {
   const DIEUnit *CU = Die.getUnit();
@@ -465,50 +470,47 @@ void DwarfUnit::addBlockByrefAddress(const DbgVariable &DV, DIE &Die,
   // Decode the original location, and use that as the start of the byref
   // variable's location.
   DIELoc *Loc = new (DIEValueAllocator) DIELoc;
-  SmallVector<uint64_t, 6> DIExpr;
-  DIEDwarfExpression Expr(*Asm, *this, *Loc);
-
-  bool validReg;
-  if (Location.isReg())
-    validReg = Expr.AddMachineReg(*Asm->MF->getSubtarget().getRegisterInfo(),
-                                  Location.getReg());
-  else
-    validReg =
-        Expr.AddMachineRegIndirect(*Asm->MF->getSubtarget().getRegisterInfo(),
-                                   Location.getReg(), Location.getOffset());
-
-  if (!validReg)
-    return;
+  DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
 
+  SmallVector<uint64_t, 9> Ops;
+  if (Location.isIndirect()) {
+    Ops.push_back(dwarf::DW_OP_plus);
+    Ops.push_back(Location.getOffset());
+    Ops.push_back(dwarf::DW_OP_deref);
+  }
   // If we started with a pointer to the __Block_byref... struct, then
   // the first thing we need to do is dereference the pointer (DW_OP_deref).
   if (isPointer)
-    DIExpr.push_back(dwarf::DW_OP_deref);
+    Ops.push_back(dwarf::DW_OP_deref);
 
   // Next add the offset for the '__forwarding' field:
   // DW_OP_plus_uconst ForwardingFieldOffset.  Note there's no point in
   // adding the offset if it's 0.
   if (forwardingFieldOffset > 0) {
-    DIExpr.push_back(dwarf::DW_OP_plus);
-    DIExpr.push_back(forwardingFieldOffset);
+    Ops.push_back(dwarf::DW_OP_plus);
+    Ops.push_back(forwardingFieldOffset);
   }
 
   // Now dereference the __forwarding field to get to the real __Block_byref
   // struct:  DW_OP_deref.
-  DIExpr.push_back(dwarf::DW_OP_deref);
+  Ops.push_back(dwarf::DW_OP_deref);
 
   // Now that we've got the real __Block_byref... struct, add the offset
   // for the variable's field to get to the location of the actual variable:
   // DW_OP_plus_uconst varFieldOffset.  Again, don't add if it's 0.
   if (varFieldOffset > 0) {
-    DIExpr.push_back(dwarf::DW_OP_plus);
-    DIExpr.push_back(varFieldOffset);
+    Ops.push_back(dwarf::DW_OP_plus);
+    Ops.push_back(varFieldOffset);
   }
-  Expr.AddExpression(makeArrayRef(DIExpr));
-  Expr.finalize();
+
+  DIExpressionCursor Cursor(Ops);
+  const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo();
+  if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
+    return;
+  DwarfExpr.addExpression(std::move(Cursor));
 
   // Now attach the location information to the DIE.
-  addBlock(Die, Attribute, Loc);
+  addBlock(Die, Attribute, DwarfExpr.finalize());
 }
 
 /// Return true if type encoding is unsigned.
@@ -672,7 +674,7 @@ DIE *DwarfUnit::getOrCreateContextDIE(const DIScope *Context) {
   return getDIE(Context);
 }
 
-DIE *DwarfUnit::createTypeDIE(const DICompositeType *Ty) {
+DIE *DwarfTypeUnit::createTypeDIE(const DICompositeType *Ty) {
   auto *Context = resolve(Ty->getScope());
   DIE *ContextDIE = getOrCreateContextDIE(Context);
 
@@ -684,8 +686,7 @@ DIE *DwarfUnit::createTypeDIE(const DICompositeType *Ty) {
 
   constructTypeDIE(TyDIE, cast<DICompositeType>(Ty));
 
-  if (!Ty->isExternalTypeRef())
-    updateAcceleratorTables(Context, Ty, TyDIE);
+  updateAcceleratorTables(Context, Ty, TyDIE);
   return &TyDIE;
 }
 
@@ -841,6 +842,13 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy) {
   // Add source line info if available and TyDesc is not a forward declaration.
   if (!DTy->isForwardDecl())
     addSourceLine(Buffer, DTy);
+
+  // If DWARF address space value is other than None, add it for pointer and
+  // reference types as DW_AT_address_class.
+  if (DTy->getDWARFAddressSpace() && (Tag == dwarf::DW_TAG_pointer_type ||
+                                      Tag == dwarf::DW_TAG_reference_type))
+    addUInt(Buffer, dwarf::DW_AT_address_class, dwarf::DW_FORM_data4,
+            DTy->getDWARFAddressSpace().getValue());
 }
 
 void DwarfUnit::constructSubprogramArguments(DIE &Buffer, DITypeRefArray Args) {
@@ -892,13 +900,6 @@ void DwarfUnit::constructTypeDIE(DIE &Buffer, const DISubroutineType *CTy) {
 }
 
 void DwarfUnit::constructTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
-  if (CTy->isExternalTypeRef()) {
-    StringRef Identifier = CTy->getIdentifier();
-    assert(!Identifier.empty() && "external type ref without identifier");
-    addFlag(Buffer, dwarf::DW_AT_declaration);
-    return addDIETypeSignature(Buffer, dwarf::DW_AT_signature, Identifier);
-  }
-
   // Add name if not anonymous or intermediate type.
   StringRef Name = CTy->getName();
 
@@ -1184,7 +1185,7 @@ void DwarfUnit::applySubprogramAttributes(const DISubprogram *SP, DIE &SPDie,
   // If -fdebug-info-for-profiling is enabled, need to emit the subprogram
   // and its source location.
   bool SkipSPSourceLocation = SkipSPAttributes &&
-                              !Asm->TM.Options.DebugInfoForProfiling;
+                              !CUNode->getDebugInfoForProfiling();
   if (!SkipSPSourceLocation)
     if (applySubprogramDefinitionAttributes(SP, SPDie))
       return;
@@ -1531,18 +1532,27 @@ DIE *DwarfUnit::getOrCreateStaticMemberDIE(const DIDerivedType *DT) {
   return &StaticMemberDIE;
 }
 
-void DwarfUnit::emitHeader(bool UseOffsets) {
+void DwarfUnit::emitCommonHeader(bool UseOffsets, dwarf::UnitType UT) {
   // Emit size of content not including length itself
   Asm->OutStreamer->AddComment("Length of Unit");
   Asm->EmitInt32(getHeaderSize() + getUnitDie().getSize());
 
   Asm->OutStreamer->AddComment("DWARF version number");
-  Asm->EmitInt16(DD->getDwarfVersion());
-  Asm->OutStreamer->AddComment("Offset Into Abbrev. Section");
+  unsigned Version = DD->getDwarfVersion();
+  Asm->EmitInt16(Version);
+
+  // DWARF v5 reorders the address size and adds a unit type.
+  if (Version >= 5) {
+    Asm->OutStreamer->AddComment("DWARF Unit Type");
+    Asm->EmitInt8(UT);
+    Asm->OutStreamer->AddComment("Address Size (in bytes)");
+    Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
+  }
 
   // We share one abbreviations table across all units so it's always at the
   // start of the section. Use a relocatable offset where needed to ensure
   // linking doesn't invalidate that offset.
+  Asm->OutStreamer->AddComment("Offset Into Abbrev. Section");
   const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
   if (UseOffsets)
     Asm->EmitInt32(0);
@@ -1550,12 +1560,16 @@ void DwarfUnit::emitHeader(bool UseOffsets) {
     Asm->emitDwarfSymbolReference(
         TLOF.getDwarfAbbrevSection()->getBeginSymbol(), false);
 
-  Asm->OutStreamer->AddComment("Address Size (in bytes)");
-  Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
+  if (Version <= 4) {
+    Asm->OutStreamer->AddComment("Address Size (in bytes)");
+    Asm->EmitInt8(Asm->getDataLayout().getPointerSize());
+  }
 }
 
 void DwarfTypeUnit::emitHeader(bool UseOffsets) {
-  DwarfUnit::emitHeader(UseOffsets);
+  DwarfUnit::emitCommonHeader(UseOffsets, 
+                              DD->useSplitDwarf() ? dwarf::DW_UT_split_type
+                                                  : dwarf::DW_UT_type);
   Asm->OutStreamer->AddComment("Type Signature");
   Asm->OutStreamer->EmitIntValue(TypeSignature, sizeof(TypeSignature));
   Asm->OutStreamer->AddComment("Type DIE Offset");
@@ -1569,3 +1583,13 @@ bool DwarfTypeUnit::isDwoUnit() const {
   // when split DWARF is being used.
   return DD->useSplitDwarf();
 }
+
+void DwarfTypeUnit::addGlobalName(StringRef Name, const DIE &Die,
+                                  const DIScope *Context) {
+  getCU().addGlobalNameForTypeUnit(Name, Context); // Die is not forwarded.
+}
+
+void DwarfTypeUnit::addGlobalType(const DIType *Ty, const DIE &Die,
+                                  const DIScope *Context) {
+  getCU().addGlobalTypeUnitType(Ty, Context); // Die is not forwarded.
+}
diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h
index a4646eca7a6272aad210836646f50f22dc895941..d626ef920f956a9440a12bba0a2fc31487b83f51 100644
--- a/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -124,12 +124,12 @@ public:
   std::string getParentContextString(const DIScope *Context) const;
 
   /// Add a new global name to the compile unit.
-  virtual void addGlobalName(StringRef Name, DIE &Die, const DIScope *Context) {
-  }
+  virtual void addGlobalName(StringRef Name, const DIE &Die,
+                             const DIScope *Context) = 0;
 
   /// Add a new global type to the compile unit.
   virtual void addGlobalType(const DIType *Ty, const DIE &Die,
-                             const DIScope *Context) {}
+                             const DIScope *Context) = 0;
 
   /// Returns the DIE map slot for the specified debug variable.
   ///
@@ -198,9 +198,6 @@ public:
 
   /// Add a type's DW_AT_signature and set the  declaration flag.
   void addDIETypeSignature(DIE &Die, uint64_t Signature);
-  /// Add an attribute containing the type signature for a unique identifier.
-  void addDIETypeSignature(DIE &Die, dwarf::Attribute Attribute,
-                           StringRef Identifier);
 
   /// Add block data.
   void addBlock(DIE &Die, dwarf::Attribute Attribute, DIELoc *Block);
@@ -261,9 +258,6 @@ public:
   /// Find existing DIE or create new DIE for the given type.
   DIE *getOrCreateTypeDIE(const MDNode *N);
 
-  /// Get context owner's DIE.
-  DIE *createTypeDIE(const DICompositeType *Ty);
-
   /// Get context owner's DIE.
   DIE *getOrCreateContextDIE(const DIScope *Context);
 
@@ -282,11 +276,13 @@ public:
   virtual unsigned getHeaderSize() const {
     return sizeof(int16_t) + // DWARF version number
            sizeof(int32_t) + // Offset Into Abbrev. Section
-           sizeof(int8_t);   // Pointer Size (in bytes)
+           sizeof(int8_t) +  // Pointer Size (in bytes)
+           (DD->getDwarfVersion() >= 5 ? sizeof(int8_t)
+                                       : 0); // DWARF v5 unit type
   }
 
   /// Emit the header for this unit, not including the initial length field.
-  virtual void emitHeader(bool UseOffsets);
+  virtual void emitHeader(bool UseOffsets) = 0;
 
   virtual DwarfCompileUnit &getCU() = 0;
 
@@ -306,6 +302,14 @@ protected:
     return Ref.resolve();
   }
 
+  /// If this is a named finished type then include it in the list of types for
+  /// the accelerator tables.
+  void updateAcceleratorTables(const DIScope *Context, const DIType *Ty,
+                               const DIE &TyDIE);
+
+  /// Emit the common part of the header for this unit.
+  void emitCommonHeader(bool UseOffsets, dwarf::UnitType UT);
+
 private:
   void constructTypeDIE(DIE &Buffer, const DIBasicType *BTy);
   void constructTypeDIE(DIE &Buffer, const DIDerivedType *DTy);
@@ -330,11 +334,6 @@ private:
   /// Set D as anonymous type for index which can be reused later.
   void setIndexTyDie(DIE *D) { IndexTyDie = D; }
 
-  /// If this is a named finished type then include it in the list of types for
-  /// the accelerator tables.
-  void updateAcceleratorTables(const DIScope *Context, const DIType *Ty,
-                               const DIE &TyDIE);
-
   virtual bool isDwoUnit() const = 0;
 };
 
@@ -354,12 +353,19 @@ public:
   void setTypeSignature(uint64_t Signature) { TypeSignature = Signature; }
   void setType(const DIE *Ty) { this->Ty = Ty; }
 
+  /// Construct and return the DIE for composite type \p Ty.
+  DIE *createTypeDIE(const DICompositeType *Ty);
+
   /// Emit the header for this unit, not including the initial length field.
   void emitHeader(bool UseOffsets) override;
   unsigned getHeaderSize() const override {
     return DwarfUnit::getHeaderSize() + sizeof(uint64_t) + // Type Signature
            sizeof(uint32_t);                               // Type DIE Offset
   }
+  void addGlobalName(StringRef Name, const DIE &Die,
+                     const DIScope *Context) override;
+  void addGlobalType(const DIType *Ty, const DIE &Die,
+                     const DIScope *Context) override;
   DwarfCompileUnit &getCU() override { return CU; }
 };
 } // end llvm namespace
diff --git a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
index 6a023b998b326f1a6df00bbc3a925796a2db55f8..342efc3611c784ff0af28a40b5c5a0f26e604438 100644
--- a/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
+++ b/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- ErlangGCPrinter.cpp - Erlang/OTP frametable emitter -----*- C++ -*-===//
+//===- ErlangGCPrinter.cpp - Erlang/OTP frametable emitter ----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -14,21 +14,19 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/GCMetadata.h"
 #include "llvm/CodeGen/GCMetadataPrinter.h"
+#include "llvm/CodeGen/GCStrategy.h"
 #include "llvm/CodeGen/GCs.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/IR/Module.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
+#include "llvm/Target/TargetLoweringObjectFile.h" 
+#include "llvm/Support/ELF.h"
 
 using namespace llvm;
 
@@ -38,13 +36,12 @@ class ErlangGCPrinter : public GCMetadataPrinter {
 public:
   void finishAssembly(Module &M, GCModuleInfo &Info, AsmPrinter &AP) override;
 };
-}
+
+} // end anonymous namespace
 
 static GCMetadataPrinterRegistry::Add<ErlangGCPrinter>
     X("erlang", "erlang-compatible garbage collector");
 
-void llvm::linkErlangGCPrinter() {}
-
 void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
                                      AsmPrinter &AP) {
   MCStreamer &OS = *AP.OutStreamer;
@@ -121,3 +118,5 @@ void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
     }
   }
 }
+
+void llvm::linkErlangGCPrinter() {}
diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp
index 9d7c96a1b8efdc64203e86ffc7e67ab4cf18d88b..704f0ac2f191951a99b94b4fb5de6119de7acebe 100644
--- a/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -68,7 +68,7 @@ void WinException::beginFunction(const MachineFunction *MF) {
 
   const Function *F = MF->getFunction();
 
-  shouldEmitMoves = Asm->needsSEHMoves();
+  shouldEmitMoves = Asm->needsSEHMoves() && MF->hasWinCFI();
 
   const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
   unsigned PerEncoding = TLOF.getPersonalityEncoding();
@@ -94,7 +94,7 @@ void WinException::beginFunction(const MachineFunction *MF) {
 
   // If we're not using CFI, we don't want the CFI or the personality, but we
   // might want EH tables if we had EH pads.
-  if (!Asm->MAI->usesWindowsCFI() || (!MF->hasWinCFI() && !PerFn)) {
+  if (!Asm->MAI->usesWindowsCFI()) {
     if (Per == EHPersonality::MSVC_X86SEH && !hasEHFunclets) {
       // If this is 32-bit SEH and we don't have any funclets (really invokes),
       // make sure we emit the parent offset label. Some unreferenced filter
diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp
index bf5cf105a8f86f621ce6b9b67ea1c7247cbf2164..9c19a4fd3c3e0044c56415d35d973a4550af52ae 100644
--- a/lib/CodeGen/AtomicExpandPass.cpp
+++ b/lib/CodeGen/AtomicExpandPass.cpp
@@ -1532,7 +1532,7 @@ bool AtomicExpand::expandAtomicOpToLibcall(
 
   Type *ResultTy;
   SmallVector<Value *, 6> Args;
-  AttributeSet Attr;
+  AttributeList Attr;
 
   // 'size' argument.
   if (!UseSizedLibcall) {
@@ -1593,7 +1593,7 @@ bool AtomicExpand::expandAtomicOpToLibcall(
   // Now, the return type.
   if (CASExpected) {
     ResultTy = Type::getInt1Ty(Ctx);
-    Attr = Attr.addAttribute(Ctx, AttributeSet::ReturnIndex, Attribute::ZExt);
+    Attr = Attr.addAttribute(Ctx, AttributeList::ReturnIndex, Attribute::ZExt);
   } else if (HasResult && UseSizedLibcall)
     ResultTy = SizedIntTy;
   else
diff --git a/lib/CodeGen/BranchCoalescing.cpp b/lib/CodeGen/BranchCoalescing.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..efdf300df85063cef2de9d31c9a466b64d896f6e
--- /dev/null
+++ b/lib/CodeGen/BranchCoalescing.cpp
@@ -0,0 +1,758 @@
+//===-- BranchCoalescing.cpp - Coalesce blocks with the same condition ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Coalesce basic blocks guarded by the same branch condition into a single
+/// basic block.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "coal-branch"
+
+static cl::opt<cl::boolOrDefault>
+    EnableBranchCoalescing("enable-branch-coalesce", cl::Hidden,
+                           cl::desc("enable coalescing of duplicate branches"));
+
+STATISTIC(NumBlocksCoalesced, "Number of blocks coalesced");
+STATISTIC(NumPHINotMoved, "Number of PHI Nodes that cannot be merged");
+STATISTIC(NumBlocksNotCoalesced, "Number of blocks not coalesced");
+
+//===----------------------------------------------------------------------===//
+//                               BranchCoalescing
+//===----------------------------------------------------------------------===//
+///
+/// Improve scheduling by coalescing branches that depend on the same condition.
+/// This pass looks for blocks that are guarded by the same branch condition
+/// and attempts to merge the blocks together. Such opportunities arise from
+/// the expansion of select statements in the IR.
+///
+/// For example, consider the following LLVM IR:
+///
+/// %test = icmp eq i32 %x 0
+/// %tmp1 = select i1 %test, double %a, double 2.000000e-03
+/// %tmp2 = select i1 %test, double %b, double 5.000000e-03
+///
+/// This IR expands to the following machine code on PowerPC:
+///
+/// BB#0: derived from LLVM BB %entry
+///    Live Ins: %F1 %F3 %X6
+///        <SNIP1>
+///        %vreg0<def> = COPY %F1; F8RC:%vreg0
+///        %vreg5<def> = CMPLWI %vreg4<kill>, 0; CRRC:%vreg5 GPRC:%vreg4
+///        %vreg8<def> = LXSDX %ZERO8, %vreg7<kill>, %RM<imp-use>;
+///                    mem:LD8[ConstantPool] F8RC:%vreg8 G8RC:%vreg7
+///        BCC 76, %vreg5, <BB#2>; CRRC:%vreg5
+///    Successors according to CFG: BB#1(?%) BB#2(?%)
+///
+/// BB#1: derived from LLVM BB %entry
+///    Predecessors according to CFG: BB#0
+///    Successors according to CFG: BB#2(?%)
+///
+/// BB#2: derived from LLVM BB %entry
+///    Predecessors according to CFG: BB#0 BB#1
+///        %vreg9<def> = PHI %vreg8, <BB#1>, %vreg0, <BB#0>;
+///                    F8RC:%vreg9,%vreg8,%vreg0
+///        <SNIP2>
+///        BCC 76, %vreg5, <BB#4>; CRRC:%vreg5
+///    Successors according to CFG: BB#3(?%) BB#4(?%)
+///
+/// BB#3: derived from LLVM BB %entry
+///    Predecessors according to CFG: BB#2
+///    Successors according to CFG: BB#4(?%)
+///
+/// BB#4: derived from LLVM BB %entry
+///    Predecessors according to CFG: BB#2 BB#3
+///        %vreg13<def> = PHI %vreg12, <BB#3>, %vreg2, <BB#2>;
+///                     F8RC:%vreg13,%vreg12,%vreg2
+///        <SNIP3>
+///        BLR8 %LR8<imp-use>, %RM<imp-use>, %F1<imp-use>
+///
+/// When this pattern is detected, branch coalescing will try to collapse
+/// it by moving code in BB#2 to BB#0 and/or BB#4 and removing BB#3.
+///
+/// If all conditions are met, IR should collapse to:
+///
+/// BB#0: derived from LLVM BB %entry
+///    Live Ins: %F1 %F3 %X6
+///        <SNIP1>
+///        %vreg0<def> = COPY %F1; F8RC:%vreg0
+///        %vreg5<def> = CMPLWI %vreg4<kill>, 0; CRRC:%vreg5 GPRC:%vreg4
+///        %vreg8<def> = LXSDX %ZERO8, %vreg7<kill>, %RM<imp-use>;
+///                     mem:LD8[ConstantPool] F8RC:%vreg8 G8RC:%vreg7
+///        <SNIP2>
+///        BCC 76, %vreg5, <BB#4>; CRRC:%vreg5
+///    Successors according to CFG: BB#1(0x2aaaaaaa / 0x80000000 = 33.33%)
+///      BB#4(0x55555554 / 0x80000000 = 66.67%)
+///
+/// BB#1: derived from LLVM BB %entry
+///    Predecessors according to CFG: BB#0
+///    Successors according to CFG: BB#4(0x40000000 / 0x80000000 = 50.00%)
+///
+/// BB#4: derived from LLVM BB %entry
+///    Predecessors according to CFG: BB#0 BB#1
+///        %vreg9<def> = PHI %vreg8, <BB#1>, %vreg0, <BB#0>;
+///                    F8RC:%vreg9,%vreg8,%vreg0
+///        %vreg13<def> = PHI %vreg12, <BB#1>, %vreg2, <BB#0>;
+///                     F8RC:%vreg13,%vreg12,%vreg2
+///        <SNIP3>
+///        BLR8 %LR8<imp-use>, %RM<imp-use>, %F1<imp-use>
+///
+/// Branch Coalescing does not split blocks, it moves everything in the same
+/// direction ensuring it does not break use/definition semantics.
+///
+/// PHI nodes and their corresponding use instructions are moved to the
+/// successor block if there are no uses within the successor block's PHI
+/// nodes.  PHI node ordering cannot be assumed.
+///
+/// Non-PHI instructions can be moved up to the predecessor basic block or
+/// down to the successor basic block following any PHI instructions. Whether
+/// an instruction moves up or down depends on whether the register(s) it
+/// defines are used in the current block or in any PHI instructions at the
+/// beginning of the successor block.
+
+namespace {
+
+class BranchCoalescing : public MachineFunctionPass {
+  struct CoalescingCandidateInfo {
+    MachineBasicBlock *BranchBlock;       // Block containing the branch
+    MachineBasicBlock *BranchTargetBlock; // Block branched to
+    MachineBasicBlock *FallThroughBlock;  // Fall-through if branch not taken
+    SmallVector<MachineOperand, 4> Cond;  // Condition filled by analyzeBranch
+    bool MustMoveDown;
+    bool MustMoveUp;
+
+    CoalescingCandidateInfo(); // Null blocks, empty Cond, both flags false
+    void clear();              // Restore the default-constructed state
+  };
+
+  MachineDominatorTree *MDT;      // Cached by initialize()
+  MachinePostDominatorTree *MPDT; // Cached by initialize()
+  const TargetInstrInfo *TII;     // Cached by initialize()
+  MachineRegisterInfo *MRI;       // Cached by initialize()
+
+  void initialize(MachineFunction &F);
+  bool canCoalesceBranch(CoalescingCandidateInfo &Cand);
+  bool identicalOperands(ArrayRef<MachineOperand> OperandList1,
+                         ArrayRef<MachineOperand> OperandList2) const;
+  bool validateCandidates(CoalescingCandidateInfo &SourceRegion,
+                          CoalescingCandidateInfo &TargetRegion) const;
+
+  static bool isBranchCoalescingEnabled() {
+    return EnableBranchCoalescing == cl::BOU_TRUE; // Unset counts as disabled
+  }
+
+public:
+  static char ID;
+
+  BranchCoalescing() : MachineFunctionPass(ID) {
+    initializeBranchCoalescingPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineDominatorTree>();
+    AU.addRequired<MachinePostDominatorTree>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override { return "Branch Coalescing"; }
+
+  bool mergeCandidates(CoalescingCandidateInfo &SourceRegion,
+                       CoalescingCandidateInfo &TargetRegion);
+  bool canMoveToBeginning(const MachineInstr &MI,
+                          const MachineBasicBlock &MBB) const;
+  bool canMoveToEnd(const MachineInstr &MI,
+                    const MachineBasicBlock &MBB) const;
+  bool canMerge(CoalescingCandidateInfo &SourceRegion,
+                CoalescingCandidateInfo &TargetRegion) const;
+  void moveAndUpdatePHIs(MachineBasicBlock *SourceRegionMBB,
+                         MachineBasicBlock *TargetRegionMBB);
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // End anonymous namespace.
+
+char BranchCoalescing::ID = 0;
+char &llvm::BranchCoalescingID = BranchCoalescing::ID;
+
+INITIALIZE_PASS_BEGIN(BranchCoalescing, "branch-coalescing",
+                      "Branch Coalescing", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
+INITIALIZE_PASS_END(BranchCoalescing, "branch-coalescing", "Branch Coalescing",
+                    false, false)
+
+BranchCoalescing::CoalescingCandidateInfo::CoalescingCandidateInfo()
+    : BranchBlock(nullptr), BranchTargetBlock(nullptr),
+      FallThroughBlock(nullptr), MustMoveDown(false), MustMoveUp(false) {}
+
+void BranchCoalescing::CoalescingCandidateInfo::clear() {
+  // Return the candidate to its default-constructed state: no blocks
+  // recorded, an empty branch condition, and neither movement flag set.
+  // The line count is kept stable so the enclosing patch hunk stays valid.
+  BranchBlock = BranchTargetBlock = FallThroughBlock = nullptr;
+  MustMoveDown = MustMoveUp = false;
+  Cond.clear();
+}
+
+void BranchCoalescing::initialize(MachineFunction &MF) {
+  MRI = &MF.getRegInfo();
+  TII = MF.getSubtarget().getInstrInfo();
+  MPDT = &getAnalysis<MachinePostDominatorTree>();
+  MDT = &getAnalysis<MachineDominatorTree>();
+}
+
+///
+/// Analyze the branch statement to determine if it can be coalesced. This
+/// method analyses the branch statement for the given candidate to determine
+/// if it can be coalesced. If the branch can be coalesced, then the
+/// BranchTargetBlock and the FallThroughBlock are recorded in the specified
+/// Candidate.
+///
+///\param[in,out] Cand The coalescing candidate to analyze
+///\return true if and only if the branch can be coalesced, false otherwise
+///
+bool BranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) {
+  DEBUG(dbgs() << "Determine if branch block " << Cand.BranchBlock->getNumber()
+               << " can be coalesced:");
+  MachineBasicBlock *FalseMBB = nullptr;
+  // A true result means TII could not analyze the block's terminators.
+  if (TII->analyzeBranch(*Cand.BranchBlock, Cand.BranchTargetBlock, FalseMBB,
+                         Cand.Cond)) {
+    DEBUG(dbgs() << "TII unable to Analyze Branch - skip\n");
+    return false;
+  }
+
+  for (auto &I : Cand.BranchBlock->terminators()) {
+    DEBUG(dbgs() << "Looking at terminator : " << I << "\n");
+    if (!I.isBranch())
+      continue;
+    // A count mismatch means the branch carries implicit operands.
+    if (I.getNumOperands() != I.getNumExplicitOperands()) {
+      DEBUG(dbgs() << "Terminator contains implicit operands - skip : " << I
+                   << "\n");
+      return false;
+    }
+  }
+
+  if (Cand.BranchBlock->isEHPad() || Cand.BranchBlock->hasEHPadSuccessor()) {
+    DEBUG(dbgs() << "EH Pad - skip\n");
+    return false;
+  }
+
+  // For now only consider triangles (i.e., BranchTargetBlock is set,
+  // FalseMBB is null, and BranchTargetBlock is a successor to BranchBlock).
+  if (!Cand.BranchTargetBlock || FalseMBB ||
+      !Cand.BranchBlock->isSuccessor(Cand.BranchTargetBlock)) {
+    DEBUG(dbgs() << "Does not form a triangle - skip\n");
+    return false;
+  }
+
+  // Ensure there are exactly two successors.
+  if (Cand.BranchBlock->succ_size() != 2) {
+    DEBUG(dbgs() << "Does not have 2 successors - skip\n");
+    return false;
+  }
+
+  // Sanity check - the block must be able to fall through.
+  assert(Cand.BranchBlock->canFallThrough() &&
+         "Expecting the block to fall through!");
+
+  // We have already ensured there are exactly two successors to
+  // BranchBlock and that BranchTargetBlock is a successor to BranchBlock.
+  // Pick the other successor and ensure that fall-through block is empty.
+  MachineBasicBlock *Succ =
+      (*Cand.BranchBlock->succ_begin() == Cand.BranchTargetBlock)
+          ? *Cand.BranchBlock->succ_rbegin()
+          : *Cand.BranchBlock->succ_begin();
+
+  assert(Succ && "Expecting a valid fall-through block");
+
+  if (!Succ->empty()) {
+    DEBUG(dbgs() << "Fall-through block contains code -- skip\n");
+    return false;
+  }
+
+  if (!Succ->isSuccessor(Cand.BranchTargetBlock)) {
+    DEBUG(dbgs()
+          << "Successor of fall through block is not branch taken block\n");
+    return false;
+  }
+
+  Cand.FallThroughBlock = Succ;
+  DEBUG(dbgs() << "Valid Candidate\n");
+  return true;
+}
+
+///
+/// Determine if the two operand lists are identical
+///
+/// \param[in] OpList1 operand list
+/// \param[in] OpList2 operand list
+/// \return true if and only if the operands lists are identical
+///
+bool BranchCoalescing::identicalOperands(
+    ArrayRef<MachineOperand> OpList1, ArrayRef<MachineOperand> OpList2) const {
+
+  const size_t NumOps = OpList1.size();
+  if (NumOps != OpList2.size()) {
+    DEBUG(dbgs() << "Operand list is different size\n");
+    return false;
+  }
+
+  for (size_t Idx = 0; Idx != NumOps; ++Idx) {
+    const MachineOperand &LHS = OpList1[Idx];
+    const MachineOperand &RHS = OpList2[Idx];
+    DEBUG(dbgs() << "Op1: " << LHS << "\n"
+                 << "Op2: " << RHS << "\n");
+
+    if (LHS.isIdenticalTo(RHS)) {
+      DEBUG(dbgs() << "Op1 and Op2 are identical!\n");
+      continue;
+    }
+
+    // Operands that differ may still match if both are virtual registers
+    // whose defining instructions produce the same value.
+    const bool BothVirtRegs =
+        LHS.isReg() && RHS.isReg() &&
+        TargetRegisterInfo::isVirtualRegister(LHS.getReg()) &&
+        TargetRegisterInfo::isVirtualRegister(RHS.getReg());
+    if (!BothVirtRegs) {
+      DEBUG(dbgs() << "The operands are not provably identical.\n");
+      return false;
+    }
+
+    MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
+    MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
+    if (!TII->produceSameValue(*LHSDef, *RHSDef, MRI)) {
+      DEBUG(dbgs() << "Operands produce different values\n");
+      return false;
+    }
+    DEBUG(dbgs() << "Op1Def: " << *LHSDef << " and " << *RHSDef
+                 << " produce the same value!\n");
+  }
+  return true;
+}
+
+///
+/// Moves ALL PHI instructions in SourceMBB to beginning of TargetMBB
+/// and update them to refer to the new block.  PHI node ordering
+/// cannot be assumed so it does not matter where the PHI instructions
+/// are moved to in TargetMBB.
+///
+/// \param[in] SourceMBB block to move PHI instructions from
+/// \param[in] TargetMBB block to move PHI instructions to
+///
+void BranchCoalescing::moveAndUpdatePHIs(MachineBasicBlock *SourceMBB,
+                                         MachineBasicBlock *TargetMBB) {
+
+  const MachineBasicBlock::iterator FirstPHI = SourceMBB->begin();
+  const MachineBasicBlock::iterator PHIEnd = SourceMBB->getFirstNonPHI();
+  if (FirstPHI == PHIEnd) {
+    DEBUG(dbgs() << "SourceMBB contains no PHI instructions.\n");
+    return;
+  }
+
+  // Retarget every block operand that names SourceMBB to TargetMBB, then
+  // splice the whole leading PHI range to the top of TargetMBB.
+  for (MachineBasicBlock::iterator It = FirstPHI; It != PHIEnd; ++It) {
+    for (unsigned Op = 2, N = It->getNumOperands() + 1; Op != N; Op += 2) {
+      MachineOperand &BlockOp = It->getOperand(Op);
+      if (BlockOp.getMBB() == SourceMBB)
+        BlockOp.setMBB(TargetMBB);
+    }
+  }
+
+  TargetMBB->splice(TargetMBB->begin(), SourceMBB, FirstPHI, PHIEnd);
+}
+
+///
+/// This function checks if MI can be moved to the beginning of the TargetMBB
+/// following PHI instructions. A MI instruction can be moved to beginning of
+/// the TargetMBB if there are no uses of it within the TargetMBB PHI nodes.
+///
+/// \param[in] MI the machine instruction to move.
+/// \param[in] TargetMBB the machine basic block to move to
+/// \return true if it is safe to move MI to beginning of TargetMBB,
+///         false otherwise.
+///
+bool BranchCoalescing::canMoveToBeginning(const MachineInstr &MI,
+                                          const MachineBasicBlock &TargetMBB
+                                          ) const {
+  DEBUG(dbgs() << "Checking if " << MI << " can move to beginning of "
+        << TargetMBB.getNumber() << "\n");
+
+  // A register MI defines that feeds a PHI in TargetMBB blocks the move.
+  for (auto &DefOp : MI.defs()) {
+    for (auto &User : MRI->use_instructions(DefOp.getReg())) {
+      if (!User.isPHI() || User.getParent() != &TargetMBB)
+        continue;
+      DEBUG(dbgs() << "    *** used in a PHI -- cannot move ***\n");
+      return false;
+    }
+  }
+
+  DEBUG(dbgs() << "  Safe to move to the beginning.\n");
+  return true;
+}
+
+///
+/// This function checks if MI can be moved to the end of the TargetMBB,
+/// immediately before the first terminator.  A MI instruction can be moved
+/// to the end of the TargetMBB if no PHI node defines what MI uses within
+/// its own MBB.
+///
+/// \param[in] MI the machine instruction to move.
+/// \param[in] TargetMBB the machine basic block to move to
+/// \return true if it is safe to move MI to end of TargetMBB,
+///         false otherwise.
+///
+bool BranchCoalescing::canMoveToEnd(const MachineInstr &MI,
+                                    const MachineBasicBlock &TargetMBB
+                                    ) const {
+
+  DEBUG(dbgs() << "Checking if " << MI << " can move to end of "
+        << TargetMBB.getNumber() << "\n");
+
+  for (auto &Use : MI.uses()) {
+    if (!Use.isReg() || !TargetRegisterInfo::isVirtualRegister(Use.getReg()))
+      continue;
+    MachineInstr *DefInst = MRI->getVRegDef(Use.getReg());
+    if (DefInst->isPHI() && DefInst->getParent() == MI.getParent()) {
+      DEBUG(dbgs() << "    *** Cannot move this instruction ***\n");
+      return false;
+    }
+    // The def is either not a PHI or lives in another block: no conflict.
+    DEBUG(dbgs() << "    *** no PHI def in this block -- safe to move!\n");
+  }
+
+  DEBUG(dbgs() << "  Safe to move to the end.\n");
+  return true;
+}
+
+///
+/// Sanity-checks that the two coalescing candidates have the shape the merge
+/// logic relies on.  Each violated expectation is reported through
+/// llvm_unreachable rather than through the return value.
+///
+/// \param[in] SourceRegion The candidate to move statements from
+/// \param[in] TargetRegion The candidate to move statements to
+/// \return true once every expectation has been checked.
+///
+bool BranchCoalescing::validateCandidates(
+    CoalescingCandidateInfo &SourceRegion,
+    CoalescingCandidateInfo &TargetRegion) const {
+  MachineBasicBlock *SourceBB = SourceRegion.BranchBlock;
+  MachineBasicBlock *TargetBB = TargetRegion.BranchBlock;
+  if (TargetRegion.BranchTargetBlock != SourceBB)
+    llvm_unreachable("Expecting SourceRegion to immediately follow TargetRegion");
+  if (!MDT->dominates(TargetBB, SourceBB))
+    llvm_unreachable("Expecting TargetRegion to dominate SourceRegion");
+  if (!MPDT->dominates(SourceBB, TargetBB))
+    llvm_unreachable("Expecting SourceRegion to post-dominate TargetRegion");
+  if (!(TargetRegion.FallThroughBlock->empty() &&
+        SourceRegion.FallThroughBlock->empty()))
+    llvm_unreachable("Expecting fall-through blocks to be empty");
+  return true;
+}
+
+///
+/// Determines whether the two coalescing candidates can be merged.
+/// Merging relocates the instructions of TargetRegion.BranchTargetBlock
+/// (which is also SourceRegion.BranchBlock), so as a group they must be
+/// able to either
+///   1. Move to the beginning of the SourceRegion.BranchTargetBlock; or
+///   2. Move to the end of the TargetRegion.BranchBlock.
+///
+/// PHI nodes are examined first: a PHI whose result is used by a PHI in the
+/// SourceRegion.BranchTargetBlock rules the merge out, and a PHI whose result
+/// is used inside its own block forces a move down. Every other instruction
+/// is then checked for a move down, to the beginning of the
+/// SourceRegion.BranchTargetBlock, and for a move up, to the end of the
+/// TargetRegion.BranchBlock (immediately before the branch statement). An
+/// instruction that fails one check pins the direction of the whole group;
+/// if both directions get pinned, these blocks cannot be merged.
+///
+/// Note that there is no analysis for moving instructions past the fall-through
+/// blocks because they are confirmed to be empty. validateCandidates flags
+/// the situation as unreachable if they are not.
+///
+/// \param[in] SourceRegion The candidate to move statements from
+/// \param[in] TargetRegion The candidate to move statements to
+/// \return true if all instructions in SourceRegion.BranchBlock can be merged
+///         into a block in TargetRegion, false otherwise.
+///
+bool BranchCoalescing::canMerge(CoalescingCandidateInfo &SourceRegion,
+                                CoalescingCandidateInfo &TargetRegion) const {
+  if (!validateCandidates(SourceRegion, TargetRegion))
+    return false;
+
+  // PHI nodes come first: they either veto the merge or force everything
+  // down into the SourceRegion.BranchTargetBlock.
+  for (MachineBasicBlock::iterator
+           PhiIt = SourceRegion.BranchBlock->instr_begin(),
+           PhiEnd = SourceRegion.BranchBlock->getFirstNonPHI();
+       PhiIt != PhiEnd; ++PhiIt) {
+    for (auto &Def : PhiIt->defs())
+      for (auto &Use : MRI->use_instructions(Def.getReg())) {
+        if (Use.isPHI() && Use.getParent() == SourceRegion.BranchTargetBlock) {
+          DEBUG(dbgs() << "PHI " << *PhiIt
+                       << " defines register used in another "
+                          "PHI within branch target block -- can't merge\n");
+          NumPHINotMoved++;
+          return false;
+        }
+        if (Use.getParent() != SourceRegion.BranchBlock)
+          continue;
+        DEBUG(dbgs() << "PHI " << *PhiIt << " defines register used in this "
+                        "block -- all must move down\n");
+        SourceRegion.MustMoveDown = true;
+      }
+  }
+
+  // Record for every other instruction which direction it rules out.
+  for (MachineBasicBlock::iterator
+           It = SourceRegion.BranchBlock->getFirstNonPHI(),
+           End = SourceRegion.BranchBlock->end();
+       It != End; ++It) {
+    MachineInstr &Inst = *It;
+    if (!canMoveToBeginning(Inst, *SourceRegion.BranchTargetBlock)) {
+      DEBUG(dbgs() << "Instruction " << Inst
+                   << " cannot move down - must move up!\n");
+      SourceRegion.MustMoveUp = true;
+    }
+    if (!canMoveToEnd(Inst, *TargetRegion.BranchBlock)) {
+      DEBUG(dbgs() << "Instruction " << Inst
+                   << " cannot move up - must move down!\n");
+      SourceRegion.MustMoveDown = true;
+    }
+  }
+
+  return !(SourceRegion.MustMoveUp && SourceRegion.MustMoveDown);
+}
+
+/// Merge SourceRegion into TargetRegion. The instructions of
+/// SourceRegion.BranchBlock are redistributed to neighbouring blocks, and
+/// TargetRegion.BranchBlock is rewired to branch to
+/// SourceRegion.BranchTargetBlock.
+///
+/// The blocks in TargetRegion take over the successors of the corresponding
+/// blocks in SourceRegion. Afterwards SourceRegion.BranchBlock and
+/// SourceRegion.FallThroughBlock are erased from the function.
+///
+/// A region consists of a BranchBlock, a FallThroughBlock, and a
+/// BranchTargetBlock. Branch coalescing works on patterns where TargetRegion's
+/// BranchTargetBlock is also SourceRegion's BranchBlock.
+///
+///  Before mergeCandidates:
+///
+///  +---------------------------+
+///  |  TargetRegion.BranchBlock |
+///  +---------------------------+
+///     /        |
+///    /   +--------------------------------+
+///   |    |  TargetRegion.FallThroughBlock |
+///    \   +--------------------------------+
+///     \        |
+///  +----------------------------------+
+///  |  TargetRegion.BranchTargetBlock  |
+///  |  SourceRegion.BranchBlock        |
+///  +----------------------------------+
+///     /        |
+///    /   +--------------------------------+
+///   |    |  SourceRegion.FallThroughBlock |
+///    \   +--------------------------------+
+///     \        |
+///  +----------------------------------+
+///  |  SourceRegion.BranchTargetBlock  |
+///  +----------------------------------+
+///
+///  After mergeCandidates:
+///
+///  +-----------------------------+
+///  |  TargetRegion.BranchBlock   |
+///  |  SourceRegion.BranchBlock   |
+///  +-----------------------------+
+///     /        |
+///    /   +---------------------------------+
+///   |    |  TargetRegion.FallThroughBlock  |
+///   |    |  SourceRegion.FallThroughBlock  |
+///    \   +---------------------------------+
+///     \        |
+///  +----------------------------------+
+///  |  SourceRegion.BranchTargetBlock  |
+///  +----------------------------------+
+///
+/// \param[in] SourceRegion The candidate to move blocks from
+/// \param[in] TargetRegion The candidate to move blocks to
+/// \return true if the regions were merged, false otherwise.
+///
+bool BranchCoalescing::mergeCandidates(CoalescingCandidateInfo &SourceRegion,
+                                       CoalescingCandidateInfo &TargetRegion) {
+
+  if (SourceRegion.MustMoveUp && SourceRegion.MustMoveDown) {
+    llvm_unreachable("Cannot have both MustMoveDown and MustMoveUp set!");
+    return false;
+  }
+
+  if (!validateCandidates(SourceRegion, TargetRegion))
+    return false;
+
+  // The BranchBlock is handled first.  Its PHIs always go down into the
+  // branch-taken block.
+  moveAndUpdatePHIs(SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock);
+
+  // Everything between the PHIs and the first terminator of
+  // SourceRegion.BranchBlock is moved as a single range.
+  MachineBasicBlock::iterator MoveBegin =
+      SourceRegion.BranchBlock->getFirstNonPHI();
+  MachineBasicBlock::iterator MoveEnd =
+      SourceRegion.BranchBlock->getFirstTerminator();
+
+  MachineBasicBlock *DestMBB;
+  MachineBasicBlock::iterator InsertPt;
+  if (SourceRegion.MustMoveDown) {
+    DestMBB = SourceRegion.BranchTargetBlock;
+    InsertPt = DestMBB->getFirstNonPHI();
+  } else {
+    DestMBB = TargetRegion.BranchBlock;
+    InsertPt = DestMBB->getFirstTerminator();
+  }
+  DestMBB->splice(InsertPt, SourceRegion.BranchBlock, MoveBegin, MoveEnd);
+
+  // With the PHIs and the other instructions relocated, the control flow
+  // has to be cleaned up.
+
+  // Detach SourceRegion.FallThroughBlock first so that only the remaining
+  // successors of SourceRegion.BranchBlock go to TargetRegion.BranchBlock.
+  SourceRegion.BranchBlock->removeSuccessor(SourceRegion.FallThroughBlock);
+  TargetRegion.BranchBlock->transferSuccessorsAndUpdatePHIs(
+      SourceRegion.BranchBlock);
+  // TargetRegion.BranchBlock still refers to SourceRegion.BranchBlock (its
+  // old BranchTargetBlock); point those references at
+  // SourceRegion.BranchTargetBlock instead.
+  TargetRegion.BranchBlock->ReplaceUsesOfBlockWith(
+      SourceRegion.BranchBlock, SourceRegion.BranchTargetBlock);
+  // Erase the branch instruction(s) left among the terminators of
+  // SourceRegion.BranchBlock.
+  for (MachineBasicBlock::iterator TermIt =
+           SourceRegion.BranchBlock->terminators().begin();
+       TermIt != SourceRegion.BranchBlock->terminators().end();) {
+    MachineInstr &Term = *TermIt++;
+    if (Term.isBranch())
+      Term.eraseFromParent();
+  }
+
+  // An empty fall-through block is one of the preconditions for coalescing
+  // the branches, so it must still be empty here.
+  assert(TargetRegion.FallThroughBlock->empty() &&
+         "FallThroughBlocks should be empty!");
+
+  // Let TargetRegion.FallThroughBlock inherit the successors of
+  // SourceRegion.FallThroughBlock, then drop its edge to the old branch block.
+  TargetRegion.FallThroughBlock->transferSuccessorsAndUpdatePHIs(
+      SourceRegion.FallThroughBlock);
+  TargetRegion.FallThroughBlock->removeSuccessor(SourceRegion.BranchBlock);
+
+  // Both emptied SourceRegion blocks can now leave the function.
+  assert(SourceRegion.BranchBlock->empty() &&
+         "Expecting branch block to be empty!");
+  SourceRegion.BranchBlock->eraseFromParent();
+
+  assert(SourceRegion.FallThroughBlock->empty() &&
+         "Expecting fall-through block to be empty!\n");
+  SourceRegion.FallThroughBlock->eraseFromParent();
+
+  ++NumBlocksCoalesced;
+  return true;
+}
+
+bool BranchCoalescing::runOnMachineFunction(MachineFunction &MF) {
+
+  if (skipFunction(*MF.getFunction()) || MF.empty() ||
+      !isBranchCoalescingEnabled())
+    return false;
+
+  bool Changed = false;
+
+  DEBUG(dbgs() << "******** Branch Coalescing ********\n");
+  initialize(MF);
+
+  DEBUG(dbgs() << "Function: "; MF.dump(); dbgs() << "\n");
+
+  CoalescingCandidateInfo Cand1, Cand2;
+  // Try every block as the first candidate of a pair.  After a successful
+  // merge the same block is tried again; any failure moves on to the next
+  // block.
+  for (MachineBasicBlock &MBB : MF) {
+    bool Merged = false;
+    do {
+      Merged = false;
+      Cand1.clear();
+      Cand2.clear();
+
+      // Give up on this block unless its branch can be coalesced.
+      Cand1.BranchBlock = &MBB;
+      if (!canCoalesceBranch(Cand1))
+        break;
+
+      // The same must hold for the block that branch leads to.
+      Cand2.BranchBlock = Cand1.BranchTargetBlock;
+      if (!canCoalesceBranch(Cand2))
+        break;
+
+      // Sanity check
+      // The branch-taken block of the second candidate should post-dominate the
+      // first candidate
+      assert(MPDT->dominates(Cand2.BranchTargetBlock, Cand1.BranchBlock) &&
+             "Branch-taken block should post-dominate first candidate");
+
+      if (!identicalOperands(Cand1.Cond, Cand2.Cond)) {
+        DEBUG(dbgs() << "Blocks " << Cand1.BranchBlock->getNumber() << " and "
+                     << Cand2.BranchBlock->getNumber()
+                     << " have different branches\n");
+        break;
+      }
+      if (!canMerge(Cand2, Cand1)) {
+        DEBUG(dbgs() << "Cannot merge blocks " << Cand1.BranchBlock->getNumber()
+                     << " and " << Cand2.BranchBlock->getNumber() << "\n");
+        NumBlocksNotCoalesced++;
+        // No merge happened, so stop retrying this block.
+        break;
+      }
+      DEBUG(dbgs() << "Merging blocks " << Cand1.BranchBlock->getNumber()
+                   << " and " << Cand1.BranchTargetBlock->getNumber() << "\n");
+      Merged = mergeCandidates(Cand2, Cand1);
+      Changed |= Merged;
+
+      DEBUG(dbgs() << "Function after merging: "; MF.dump(); dbgs() << "\n");
+    } while (Merged);
+  }
+
+#ifndef NDEBUG
+  // Verify MF is still valid after branch coalescing
+  if (Changed)
+    MF.verify(nullptr, "Error in code produced by branch coalescing");
+#endif // NDEBUG
+
+  DEBUG(dbgs() << "Finished Branch Coalescing\n");
+  return Changed;
+}
diff --git a/lib/CodeGen/BranchFolding.cpp b/lib/CodeGen/BranchFolding.cpp
index 1bdfa2dbbb99bd5b37600ba8ca056ab9298ddadf..2d01301402f04370a81d87be965d7ec2793711b0 100644
--- a/lib/CodeGen/BranchFolding.cpp
+++ b/lib/CodeGen/BranchFolding.cpp
@@ -32,6 +32,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -124,8 +125,6 @@ BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist,
   }
 }
 
-/// RemoveDeadBlock - Remove the specified dead machine basic block from the
-/// function, updating the CFG.
 void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
   assert(MBB->pred_empty() && "MBB must be dead!");
   DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
@@ -145,9 +144,6 @@ void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
     MLI->removeBlock(MBB);
 }
 
-/// OptimizeFunction - Perhaps branch folding, tail merging and other
-/// CFG optimizations on the given function.  Block placement changes the layout
-/// and may create new tail merging opportunities.
 bool BranchFolder::OptimizeFunction(MachineFunction &MF,
                                     const TargetInstrInfo *tii,
                                     const TargetRegisterInfo *tri,
@@ -349,8 +345,6 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
   return TailLen;
 }
 
-/// ReplaceTailWithBranchTo - Delete the instruction OldInst and everything
-/// after it, replacing it with an unconditional branch to NewDest.
 void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
                                            MachineBasicBlock *NewDest) {
   TII->ReplaceTailWithBranchTo(OldInst, NewDest);
@@ -363,9 +357,6 @@ void BranchFolder::ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
   ++NumTailMerge;
 }
 
-/// SplitMBBAt - Given a machine basic block and an iterator into it, split the
-/// MBB so that the part before the iterator falls into the part starting at the
-/// iterator.  This returns the new MBB.
 MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
                                             MachineBasicBlock::iterator BBI1,
                                             const BasicBlock *BB) {
@@ -389,7 +380,7 @@ MachineBasicBlock *BranchFolder::SplitMBBAt(MachineBasicBlock &CurMBB,
   NewMBB->splice(NewMBB->end(), &CurMBB, BBI1, CurMBB.end());
 
   // NewMBB belongs to the same loop as CurMBB.
-  if (MLI) 
+  if (MLI)
     if (MachineLoop *ML = MLI->getLoopFor(&CurMBB))
       ML->addBasicBlockToLoop(NewMBB, MLI->getBase());
 
@@ -437,7 +428,7 @@ static void FixTail(MachineBasicBlock *CurMBB, MachineBasicBlock *SuccBB,
   MachineFunction::iterator I = std::next(MachineFunction::iterator(CurMBB));
   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
   SmallVector<MachineOperand, 4> Cond;
-  DebugLoc dl;  // FIXME: this is nowhere
+  DebugLoc dl = CurMBB->findBranchDebugLoc();
   if (I != MF->end() && !TII->analyzeBranch(*CurMBB, TBB, FBB, Cond, true)) {
     MachineBasicBlock *NextBB = &*I;
     if (TBB == NextBB && !Cond.empty() && !FBB) {
@@ -498,7 +489,14 @@ BranchFolder::MBFIWrapper::printBlockFreq(raw_ostream &OS,
   return MBFI.printBlockFreq(OS, Freq);
 }
 
-void BranchFolder::MBFIWrapper::view(bool isSimple) { MBFI.view(isSimple); }
+void BranchFolder::MBFIWrapper::view(const Twine &Name, bool isSimple) {
+  MBFI.view(Name, isSimple); // Plain forward to the wrapped MBFI.
+}
+
+uint64_t
+BranchFolder::MBFIWrapper::getEntryFreq() const {
+  return MBFI.getEntryFreq(); // Plain forward to the wrapped MBFI.
+}
 
 /// CountTerminators - Count the number of terminators in the given
 /// block and set I to the position of the first non-terminator, if there
@@ -519,6 +517,17 @@ static unsigned CountTerminators(MachineBasicBlock *MBB,
   return NumTerms;
 }
 
+/// A block without successors that does not end in a return most likely ends
+/// in unreachable and is cold.  A trailing indirect branch is treated like a
+/// return, since many targets use plain indirect branches to return.
+static bool blockEndsInUnreachable(const MachineBasicBlock *MBB) {
+  // A block that still has successors is not a dead end.
+  if (!MBB->succ_empty())
+    return false;
+  return MBB->empty() ||
+         !(MBB->back().isReturn() || MBB->back().isIndirectBranch());
+}
+
 /// ProfitableToMerge - Check if two machine basic blocks have a common tail
 /// and decide if it would be profitable to merge those tails.  Return the
 /// length of the common tail and iterators to the first common instruction
@@ -573,6 +582,15 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2,
       return true;
   }
 
+  // If these are identical non-return blocks with no successors, merge them.
+  // Such blocks are typically cold calls to noreturn functions like abort, and
+  // are unlikely to become a fallthrough target after machine block placement.
+  // Tail merging these blocks is unlikely to create additional unconditional
+  // branches, and will reduce the size of this cold code.
+  if (I1 == MBB1->begin() && I2 == MBB2->begin() &&
+      blockEndsInUnreachable(MBB1) && blockEndsInUnreachable(MBB2))
+    return true;
+
   // If one of the blocks can be completely merged and happens to be in
   // a position where the other could fall through into it, merge any number
   // of instructions, because it can be done without a branch.
@@ -582,6 +600,22 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2,
   if (MBB2->isLayoutSuccessor(MBB1) && I1 == MBB1->begin())
     return true;
 
+  // If both blocks are identical and end in a branch, merge them unless they
+  // both have a fallthrough predecessor and successor.
+  // We can only do this after block placement because it depends on whether
+  // there are fallthroughs, and we don't know until after layout.
+  if (AfterPlacement && I1 == MBB1->begin() && I2 == MBB2->begin()) {
+    auto BothFallThrough = [](MachineBasicBlock *MBB) {
+      if (MBB->succ_size() != 0 && !MBB->canFallThrough())
+        return false;
+      MachineFunction::iterator I(MBB);
+      MachineFunction *MF = MBB->getParent();
+      return (MBB != &*MF->begin()) && std::prev(I)->canFallThrough();
+    };
+    if (!BothFallThrough(MBB1) || !BothFallThrough(MBB2))
+      return true;
+  }
+
   // If both blocks have an unconditional branch temporarily stripped out,
   // count that as an additional common instruction for the following
   // heuristics. This heuristic is only accurate for single-succ blocks, so to
@@ -607,16 +641,6 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2,
          (I1 == MBB1->begin() || I2 == MBB2->begin());
 }
 
-/// ComputeSameTails - Look through all the blocks in MergePotentials that have
-/// hash CurHash (guaranteed to match the last element).  Build the vector
-/// SameTails of all those that have the (same) largest number of instructions
-/// in common of any pair of these blocks.  SameTails entries contain an
-/// iterator into MergePotentials (from which the MachineBasicBlock can be
-/// found) and a MachineBasicBlock::iterator into that MBB indicating the
-/// instruction where the matching code sequence begins.
-/// Order of elements in SameTails is the reverse of the order in which
-/// those blocks appear in MergePotentials (where they are not necessarily
-/// consecutive).
 unsigned BranchFolder::ComputeSameTails(unsigned CurHash,
                                         unsigned MinCommonTailLength,
                                         MachineBasicBlock *SuccBB,
@@ -653,8 +677,6 @@ unsigned BranchFolder::ComputeSameTails(unsigned CurHash,
   return maxCommonTailLength;
 }
 
-/// RemoveBlocksWithHash - Remove all blocks with hash CurHash from
-/// MergePotentials, restoring branches at ends of blocks as appropriate.
 void BranchFolder::RemoveBlocksWithHash(unsigned CurHash,
                                         MachineBasicBlock *SuccBB,
                                         MachineBasicBlock *PredBB) {
@@ -674,8 +696,6 @@ void BranchFolder::RemoveBlocksWithHash(unsigned CurHash,
   MergePotentials.erase(CurMPIter, MergePotentials.end());
 }
 
-/// CreateCommonTailOnlyBlock - None of the blocks to be tail-merged consist
-/// only of the common tail.  Create a block that does by splitting one.
 bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
                                              MachineBasicBlock *SuccBB,
                                              unsigned maxCommonTailLength,
@@ -726,6 +746,43 @@ bool BranchFolder::CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
   return true;
 }
 
+void BranchFolder::MergeCommonTailDebugLocs(unsigned commonTailIndex) {
+  MachineBasicBlock *MBB = SameTails[commonTailIndex].getBlock();
+
+  // NextCommonInsts[i] is the read position inside the tail of SameTails[i].
+  std::vector<MachineBasicBlock::iterator> NextCommonInsts(SameTails.size());
+  for (unsigned i = 0, e = SameTails.size(); i != e; ++i) {
+    if (i == commonTailIndex) {
+      assert(SameTails[i].getTailStartPos() == MBB->begin() &&
+          "MBB is not a common tail only block");
+      continue;
+    }
+    NextCommonInsts[i] = SameTails[i].getTailStartPos();
+  }
+
+  for (auto &MI : *MBB) {
+    if (MI.isDebugValue())
+      continue;
+    DebugLoc MergedLoc = MI.getDebugLoc();
+    for (unsigned i = 0, e = NextCommonInsts.size(); i < e; ++i) {
+      if (i == commonTailIndex)
+        continue;
+      auto &Pos = NextCommonInsts[i];
+      assert(Pos != SameTails[i].getBlock()->end() &&
+          "Reached BB end within common tail");
+      while (Pos->isDebugValue()) {
+        ++Pos;
+        assert(Pos != SameTails[i].getBlock()->end() &&
+            "Reached BB end within common tail");
+      }
+      assert(MI.isIdenticalTo(*Pos) && "Expected matching MIIs!");
+      MergedLoc = DILocation::getMergedLocation(MergedLoc, Pos->getDebugLoc());
+      ++Pos;
+    }
+    MI.setDebugLoc(MergedLoc);
+  }
+}
+
 static void
 mergeOperations(MachineBasicBlock::iterator MBBIStartPos,
                 MachineBasicBlock &MBBCommon) {
@@ -878,10 +935,8 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
     // Recompute common tail MBB's edge weights and block frequency.
     setCommonTailEdgeWeights(*MBB);
 
-    // Remove the original debug location from the common tail.
-    for (auto &MI : *MBB)
-      if (!MI.isDebugValue())
-        MI.setDebugLoc(DebugLoc());
+    // Merge debug locations across identical instructions for common tail.
+    MergeCommonTailDebugLocs(commonTailIndex);
 
     // MBB is common tail.  Adjust all other BB's to jump to this one.
     // Traversal must be forwards so erases work.
@@ -1046,7 +1101,7 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
 
         // Remove the unconditional branch at the end, if any.
         if (TBB && (Cond.empty() || FBB)) {
-          DebugLoc dl;  // FIXME: this is nowhere
+          DebugLoc dl = PBB->findBranchDebugLoc();
           TII->removeBranch(*PBB);
           if (!Cond.empty())
             // reinsert conditional branch only, for now
@@ -1196,8 +1251,6 @@ static DebugLoc getBranchDebugLoc(MachineBasicBlock &MBB) {
   return DebugLoc();
 }
 
-/// OptimizeBlock - Analyze and optimize control flow related to the specified
-/// block.  This is never called on the entry block.
 bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) {
   bool MadeChange = false;
   MachineFunction &MF = *MBB->getParent();
@@ -1638,8 +1691,6 @@ ReoptimizeBlock:
 //  Hoist Common Code
 //===----------------------------------------------------------------------===//
 
-/// HoistCommonCode - Hoist common instruction sequences at the start of basic
-/// blocks to their common predecessor.
 bool BranchFolder::HoistCommonCode(MachineFunction &MF) {
   bool MadeChange = false;
   for (MachineFunction::iterator I = MF.begin(), E = MF.end(); I != E; ) {
@@ -1773,9 +1824,6 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
   return PI;
 }
 
-/// HoistCommonCodeInSuccs - If the successors of MBB has common instruction
-/// sequence at the start of the function, move the instructions before MBB
-/// terminator if it's legal.
 bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
   SmallVector<MachineOperand, 4> Cond;
diff --git a/lib/CodeGen/BranchFolding.h b/lib/CodeGen/BranchFolding.h
index a6f2ddbcd8f33555bb4d14ac24527da67887bfe3..4852721eea10247f434c6d2aa6cfea826ed1b09e 100644
--- a/lib/CodeGen/BranchFolding.h
+++ b/lib/CodeGen/BranchFolding.h
@@ -37,6 +37,9 @@ namespace llvm {
                           // flag. Ignored for optsize.
                           unsigned MinCommonTailLength = 0);
 
+    /// Perform branch folding, tail merging and other CFG optimizations on the
+    /// given function.  Block placement changes the layout and may create new
+    /// tail merging opportunities.
     bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii,
                           const TargetRegisterInfo *tri, MachineModuleInfo *mmi,
                           MachineLoopInfo *mli = nullptr,
@@ -122,7 +125,8 @@ namespace llvm {
                                   const MachineBasicBlock *MBB) const;
       raw_ostream &printBlockFreq(raw_ostream &OS,
                                   const BlockFrequency Freq) const;
-      void view(bool isSimple = true);
+      void view(const Twine &Name, bool isSimple = true);
+      uint64_t getEntryFreq() const;
 
     private:
       const MachineBlockFrequencyInfo &MBFI;
@@ -138,26 +142,64 @@ namespace llvm {
                        MachineBasicBlock* PredBB,
                        unsigned MinCommonTailLength);
     void setCommonTailEdgeWeights(MachineBasicBlock &TailMBB);
+
+    /// Delete the instruction OldInst and everything after it, replacing it
+    /// with an unconditional branch to NewDest.
     void ReplaceTailWithBranchTo(MachineBasicBlock::iterator OldInst,
                                  MachineBasicBlock *NewDest);
+
+    /// Given a machine basic block and an iterator into it, split the MBB so
+    /// that the part before the iterator falls into the part starting at the
+    /// iterator.  This returns the new MBB.
     MachineBasicBlock *SplitMBBAt(MachineBasicBlock &CurMBB,
                                   MachineBasicBlock::iterator BBI1,
                                   const BasicBlock *BB);
+
+    /// Look through all the blocks in MergePotentials that have hash CurHash
+    /// (guaranteed to match the last element).  Build the vector SameTails of
+    /// all those that have the (same) largest number of instructions in common
+    /// of any pair of these blocks.  SameTails entries contain an iterator into
+    /// MergePotentials (from which the MachineBasicBlock can be found) and a
+    /// MachineBasicBlock::iterator into that MBB indicating the instruction
+    /// where the matching code sequence begins.  Order of elements in SameTails
+    /// is the reverse of the order in which those blocks appear in
+    /// MergePotentials (where they are not necessarily consecutive).
     unsigned ComputeSameTails(unsigned CurHash, unsigned minCommonTailLength,
                               MachineBasicBlock *SuccBB,
                               MachineBasicBlock *PredBB);
+
+    /// Remove all blocks with hash CurHash from MergePotentials, restoring
+    /// branches at ends of blocks as appropriate.
     void RemoveBlocksWithHash(unsigned CurHash, MachineBasicBlock* SuccBB,
                                                 MachineBasicBlock* PredBB);
+
+    /// None of the blocks to be tail-merged consist only of the common tail.
+    /// Create a block that does by splitting one.
     bool CreateCommonTailOnlyBlock(MachineBasicBlock *&PredBB,
                                    MachineBasicBlock *SuccBB,
                                    unsigned maxCommonTailLength,
                                    unsigned &commonTailIndex);
 
+    /// Create merged DebugLocs of identical instructions across SameTails and
+    /// assign them to the instructions in the common tail.
+    void MergeCommonTailDebugLocs(unsigned commonTailIndex);
+
     bool OptimizeBranches(MachineFunction &MF);
+
+    /// Analyze and optimize control flow related to the specified block. This
+    /// is never called on the entry block.
     bool OptimizeBlock(MachineBasicBlock *MBB);
+
+    /// Remove the specified dead machine basic block from the function,
+    /// updating the CFG.
     void RemoveDeadBlock(MachineBasicBlock *MBB);
 
+    /// Hoist common instruction sequences at the start of basic blocks to their
+    /// common predecessor.
     bool HoistCommonCode(MachineFunction &MF);
+
+    /// If the successors of MBB has common instruction sequence at the start of
+    /// the function, move the instructions before MBB terminator if it's legal.
     bool HoistCommonCodeInSuccs(MachineBasicBlock *MBB);
   };
 }
diff --git a/lib/CodeGen/BuiltinGCs.cpp b/lib/CodeGen/BuiltinGCs.cpp
index ff7c99de0420e4ad61d9e6c43218daa2ba24814d..e4eab8c513d991fd5e1fc0f0f31e5a37a2d7e4d9 100644
--- a/lib/CodeGen/BuiltinGCs.cpp
+++ b/lib/CodeGen/BuiltinGCs.cpp
@@ -1,4 +1,4 @@
-//===-- BuiltinGCs.cpp - Boilerplate for our built in GC types --*- C++ -*-===//
+//===- BuiltinGCs.cpp - Boilerplate for our built in GC types -------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -14,6 +14,8 @@
 
 #include "llvm/CodeGen/GCs.h"
 #include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/Support/Casting.h"
 
 using namespace llvm;
 
@@ -77,6 +79,7 @@ public:
     UsesMetadata = false;
     CustomRoots = false;
   }
+
   Optional<bool> isGCManagedPointer(const Type *Ty) const override {
     // Method is only valid on pointer typed values.
     const PointerType *PT = cast<PointerType>(Ty);
@@ -110,6 +113,7 @@ public:
     UsesMetadata = false;
     CustomRoots = false;
   }
+
   Optional<bool> isGCManagedPointer(const Type *Ty) const override {
     // Method is only valid on pointer typed values.
     const PointerType *PT = cast<PointerType>(Ty);
@@ -117,7 +121,8 @@ public:
     return (1 == PT->getAddressSpace());
   }
 };
-}
+
+} // end anonymous namespace
 
 // Register all the above so that they can be found at runtime.  Note that
 // these static initializers are important since the registration list is
diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt
index 43bca0ef5f015af1949e6a094c9045c1164220c8..0912d9f68aff20b8c923fa83dc54fb7c8fc148c2 100644
--- a/lib/CodeGen/CMakeLists.txt
+++ b/lib/CodeGen/CMakeLists.txt
@@ -4,6 +4,7 @@ add_llvm_library(LLVMCodeGen
   Analysis.cpp
   AtomicExpandPass.cpp
   BasicTargetTransformInfo.cpp
+  BranchCoalescing.cpp
   BranchFolding.cpp
   BranchRelaxation.cpp
   BuiltinGCs.cpp
@@ -23,6 +24,7 @@ add_llvm_library(LLVMCodeGen
   ExpandISelPseudos.cpp
   ExpandPostRAPseudos.cpp
   FaultMaps.cpp
+  FEntryInserter.cpp
   FuncletLayout.cpp
   GCMetadata.cpp
   GCMetadataPrinter.cpp
@@ -36,6 +38,7 @@ add_llvm_library(LLVMCodeGen
   InterleavedAccessPass.cpp
   IntrinsicLowering.cpp
   LatencyPriorityQueue.cpp
+  LazyMachineBlockFrequencyInfo.cpp
   LexicalScopes.cpp
   LiveDebugValues.cpp
   LiveDebugVariables.cpp
@@ -72,6 +75,7 @@ add_llvm_library(LLVMCodeGen
   MachineModuleInfo.cpp
   MachineModuleInfoImpls.cpp
   MachineOptimizationRemarkEmitter.cpp
+  MachineOutliner.cpp
   MachinePassRegistry.cpp
   MachinePipeliner.cpp
   MachinePostDominators.cpp
@@ -149,7 +153,7 @@ add_llvm_library(LLVMCodeGen
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/CodeGen
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/CodeGen/PBQP
 
-  LINK_LIBS ${PTHREAD_LIB}
+  LINK_LIBS ${LLVM_PTHREAD_LIB}
 
   DEPENDS
   intrinsics_gen
diff --git a/lib/CodeGen/CallingConvLower.cpp b/lib/CodeGen/CallingConvLower.cpp
index 2e33f14c7ee3e1632a33b89b4ad2c1aeff42f8a2..7cad4d0311694be55de2f0bd1c44efe6cf48acb2 100644
--- a/lib/CodeGen/CallingConvLower.cpp
+++ b/lib/CodeGen/CallingConvLower.cpp
@@ -30,8 +30,7 @@ using namespace llvm;
 CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf,
                  SmallVectorImpl<CCValAssign> &locs, LLVMContext &C)
     : CallingConv(CC), IsVarArg(isVarArg), MF(mf),
-      TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C),
-      CallOrPrologue(Unknown) {
+      TRI(*MF.getSubtarget().getRegisterInfo()), Locs(locs), Context(C) {
   // No stack is used.
   StackOffset = 0;
   MaxStackArgAlign = 1;
diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp
index 4cf9b138f10d3bf8fba07659978598d478a39d5d..3fc12ccc3b60c8bc944a4beac2e93e9c0e72d022 100644
--- a/lib/CodeGen/CodeGen.cpp
+++ b/lib/CodeGen/CodeGen.cpp
@@ -21,6 +21,7 @@ using namespace llvm;
 /// initializeCodeGen - Initialize all passes linked into the CodeGen library.
 void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeAtomicExpandPass(Registry);
+  initializeBranchCoalescingPass(Registry);
   initializeBranchFolderPassPass(Registry);
   initializeBranchRelaxationPass(Registry);
   initializeCodeGenPreparePass(Registry);
@@ -31,12 +32,15 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeEarlyIfConverterPass(Registry);
   initializeExpandISelPseudosPass(Registry);
   initializeExpandPostRAPass(Registry);
+  initializeFEntryInserterPass(Registry);
   initializeFinalizeMachineBundlesPass(Registry);
   initializeFuncletLayoutPass(Registry);
   initializeGCMachineCodeAnalysisPass(Registry);
   initializeGCModuleInfoPass(Registry);
   initializeIfConverterPass(Registry);
+  initializeImplicitNullChecksPass(Registry);
   initializeInterleavedAccessPass(Registry);
+  initializeLiveDebugValuesPass(Registry);
   initializeLiveDebugVariablesPass(Registry);
   initializeLiveIntervalsPass(Registry);
   initializeLiveStacksPass(Registry);
@@ -47,7 +51,6 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeMachineBlockPlacementPass(Registry);
   initializeMachineBlockPlacementStatsPass(Registry);
   initializeMachineCSEPass(Registry);
-  initializeImplicitNullChecksPass(Registry);
   initializeMachineCombinerPass(Registry);
   initializeMachineCopyPropagationPass(Registry);
   initializeMachineDominatorTreePass(Registry);
@@ -55,16 +58,18 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeMachineLICMPass(Registry);
   initializeMachineLoopInfoPass(Registry);
   initializeMachineModuleInfoPass(Registry);
+  initializeMachineOptimizationRemarkEmitterPassPass(Registry);
+  initializeMachineOutlinerPass(Registry);
   initializeMachinePipelinerPass(Registry);
   initializeMachinePostDominatorTreePass(Registry);
+  initializeMachineRegionInfoPassPass(Registry);
   initializeMachineSchedulerPass(Registry);
   initializeMachineSinkingPass(Registry);
   initializeMachineVerifierPassPass(Registry);
-  initializeXRayInstrumentationPass(Registry);
-  initializePatchableFunctionPass(Registry);
   initializeOptimizePHIsPass(Registry);
   initializePEIPass(Registry);
   initializePHIEliminationPass(Registry);
+  initializePatchableFunctionPass(Registry);
   initializePeepholeOptimizerPass(Registry);
   initializePostMachineSchedulerPass(Registry);
   initializePostRAHazardRecognizerPass(Registry);
@@ -74,12 +79,11 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeRAGreedyPass(Registry);
   initializeRegisterCoalescerPass(Registry);
   initializeRenameIndependentSubregsPass(Registry);
+  initializeSafeStackPass(Registry);
   initializeShrinkWrapPass(Registry);
   initializeSlotIndexesPass(Registry);
   initializeStackColoringPass(Registry);
   initializeStackMapLivenessPass(Registry);
-  initializeLiveDebugValuesPass(Registry);
-  initializeSafeStackPass(Registry);
   initializeStackProtectorPass(Registry);
   initializeStackSlotColoringPass(Registry);
   initializeTailDuplicatePassPass(Registry);
@@ -91,6 +95,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
   initializeVirtRegMapPass(Registry);
   initializeVirtRegRewriterPass(Registry);
   initializeWinEHPreparePass(Registry);
+  initializeXRayInstrumentationPass(Registry);
 }
 
 void LLVMInitializeCodeGen(LLVMPassRegistryRef R) {
diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp
index 3029cb10a78edceb20e6022877a60347cd5fc3a1..2bdd189557b40dc6cc5f8af4e3c717a3aa3ffec7 100644
--- a/lib/CodeGen/CodeGenPrepare.cpp
+++ b/lib/CodeGen/CodeGenPrepare.cpp
@@ -15,10 +15,12 @@
 
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/ProfileSummaryInfo.h"
@@ -53,8 +55,10 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/BuildLibCalls.h"
 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SimplifyLibCalls.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
 using namespace llvm;
 using namespace llvm::PatternMatch;
 
@@ -77,7 +81,6 @@ STATISTIC(NumAndUses, "Number of uses of and mask instructions optimized");
 STATISTIC(NumRetsDup,    "Number of return instructions duplicated");
 STATISTIC(NumDbgValueMoved, "Number of debug value instructions moved");
 STATISTIC(NumSelectsExpanded, "Number of selects turned into branches");
-STATISTIC(NumAndCmpsMoved, "Number of and/cmp's pushed into branches");
 STATISTIC(NumStoreExtractExposed, "Number of store(extractelement) exposed");
 
 static cl::opt<bool> DisableBranchOpts(
@@ -93,7 +96,7 @@ static cl::opt<bool> DisableSelectToBranch(
   cl::desc("Disable select to branch conversion."));
 
 static cl::opt<bool> AddrSinkUsingGEPs(
-  "addr-sink-using-gep", cl::Hidden, cl::init(false),
+  "addr-sink-using-gep", cl::Hidden, cl::init(true),
   cl::desc("Address sinking in CGP using GEPs."));
 
 static cl::opt<bool> EnableAndCmpSinking(
@@ -135,15 +138,24 @@ static cl::opt<bool> ForceSplitStore(
     "force-split-store", cl::Hidden, cl::init(false),
     cl::desc("Force store splitting no matter what the target query says."));
 
+static cl::opt<bool>
+EnableTypePromotionMerge("cgp-type-promotion-merge", cl::Hidden,
+    cl::desc("Enable merging of redundant sexts when one is dominating"
+    " the other."), cl::init(true));
+
 namespace {
 typedef SmallPtrSet<Instruction *, 16> SetOfInstrs;
 typedef PointerIntPair<Type *, 1, bool> TypeIsSExt;
 typedef DenseMap<Instruction *, TypeIsSExt> InstrToOrigTy;
+typedef SmallVector<Instruction *, 16> SExts;
+typedef DenseMap<Value *, SExts> ValueToSExts;
 class TypePromotionTransaction;
 
   class CodeGenPrepare : public FunctionPass {
     const TargetMachine *TM;
+    const TargetSubtargetInfo *SubtargetInfo;
     const TargetLowering *TLI;
+    const TargetRegisterInfo *TRI;
     const TargetTransformInfo *TTI;
     const TargetLibraryInfo *TLInfo;
     const LoopInfo *LI;
@@ -165,6 +177,15 @@ class TypePromotionTransaction;
     /// promotion for the current function.
     InstrToOrigTy PromotedInsts;
 
+    /// Keep track of instructions removed during promotion.
+    SetOfInstrs RemovedInsts;
+
+    /// Keep track of sext chains based on their initial value.
+    DenseMap<Value *, Instruction *> SeenChainsForSExt;
+
+    /// Keep track of SExt promoted.
+    ValueToSExts ValToSExtendedUses;
+
     /// True if CFG is modified in any way.
     bool ModifiedDT;
 
@@ -206,7 +227,7 @@ class TypePromotionTransaction;
                             Type *AccessTy, unsigned AS);
     bool optimizeInlineAsmInst(CallInst *CS);
     bool optimizeCallInst(CallInst *CI, bool& ModifiedDT);
-    bool moveExtToFormExtLoad(Instruction *&I);
+    bool optimizeExt(Instruction *&I);
     bool optimizeExtUses(Instruction *I);
     bool optimizeLoadExt(LoadInst *I);
     bool optimizeSelectInst(SelectInst *SI);
@@ -215,13 +236,21 @@ class TypePromotionTransaction;
     bool optimizeExtractElementInst(Instruction *Inst);
     bool dupRetToEnableTailCallOpts(BasicBlock *BB);
     bool placeDbgValues(Function &F);
-    bool sinkAndCmp(Function &F);
-    bool extLdPromotion(TypePromotionTransaction &TPT, LoadInst *&LI,
-                        Instruction *&Inst,
-                        const SmallVectorImpl<Instruction *> &Exts,
-                        unsigned CreatedInstCost);
+    bool canFormExtLd(const SmallVectorImpl<Instruction *> &MovedExts,
+                      LoadInst *&LI, Instruction *&Inst, bool HasPromoted);
+    bool tryToPromoteExts(TypePromotionTransaction &TPT,
+                          const SmallVectorImpl<Instruction *> &Exts,
+                          SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
+                          unsigned CreatedInstsCost = 0);
+    bool mergeSExts(Function &F);
+    bool performAddressTypePromotion(
+        Instruction *&Inst,
+        bool AllowPromotionWithoutCommonHeader,
+        bool HasPromoted, TypePromotionTransaction &TPT,
+        SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
     bool splitBranchCondition(Function &F);
     bool simplifyOffsetableRelocate(Instruction &I);
+    bool splitIndirectCriticalEdges(Function &F);
   };
 }
 
@@ -250,8 +279,11 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
   BPI.reset();
 
   ModifiedDT = false;
-  if (TM)
-    TLI = TM->getSubtargetImpl(F)->getTargetLowering();
+  if (TM) {
+    SubtargetInfo = TM->getSubtargetImpl(F);
+    TLI = SubtargetInfo->getTargetLowering();
+    TRI = SubtargetInfo->getRegisterInfo();
+  }
   TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
@@ -260,9 +292,9 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
   if (ProfileGuidedSectionPrefix) {
     ProfileSummaryInfo *PSI =
         getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
-    if (PSI->isFunctionEntryHot(&F))
+    if (PSI->isFunctionHotInCallGraph(&F))
       F.setSectionPrefix(".hot");
-    else if (PSI->isFunctionEntryCold(&F))
+    else if (PSI->isFunctionColdInCallGraph(&F))
       F.setSectionPrefix(".cold");
   }
 
@@ -290,18 +322,19 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
   // find a node corresponding to the value.
   EverMadeChange |= placeDbgValues(F);
 
-  // If there is a mask, compare against zero, and branch that can be combined
-  // into a single target instruction, push the mask and compare into branch
-  // users. Do this before OptimizeBlock -> OptimizeInst ->
-  // OptimizeCmpExpression, which perturbs the pattern being searched for.
-  if (!DisableBranchOpts) {
-    EverMadeChange |= sinkAndCmp(F);
+  if (!DisableBranchOpts)
     EverMadeChange |= splitBranchCondition(F);
-  }
+
+  // Split some critical edges where one of the sources is an indirect branch,
+  // to help generate sane code for PHIs involving such edges.
+  EverMadeChange |= splitIndirectCriticalEdges(F);
 
   bool MadeChange = true;
   while (MadeChange) {
     MadeChange = false;
+    SeenChainsForSExt.clear();
+    ValToSExtendedUses.clear();
+    RemovedInsts.clear();
     for (Function::iterator I = F.begin(); I != F.end(); ) {
       BasicBlock *BB = &*I++;
       bool ModifiedDTOnIteration = false;
@@ -311,6 +344,13 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
       if (ModifiedDTOnIteration)
         break;
     }
+    if (EnableTypePromotionMerge && !ValToSExtendedUses.empty())
+      MadeChange |= mergeSExts(F);
+
+    // Really free removed instructions during promotion.
+    for (Instruction *I : RemovedInsts)
+      delete I;
+
     EverMadeChange |= MadeChange;
   }
 
@@ -432,6 +472,154 @@ BasicBlock *CodeGenPrepare::findDestBlockOfMergeableEmptyBlock(BasicBlock *BB) {
   return DestBB;
 }
 
+// Return the unique indirectbr predecessor of BB, found by walking the incoming
+// blocks of BB's first PHI. Returns null if BB has no PHI, if zero or multiple
+// incoming edges come from an indirectbr, or if a terminator other than
+// br/switch/indirectbr is seen. br/switch predecessors go into OtherPreds.
+static BasicBlock *
+findIBRPredecessor(BasicBlock *BB, SmallVectorImpl<BasicBlock *> &OtherPreds) {
+  // If the block doesn't have any PHIs, we don't care about it, since there's
+  // no point in splitting it.
+  PHINode *PN = dyn_cast<PHINode>(BB->begin());
+  if (!PN)
+    return nullptr;
+
+  // Verify we have exactly one IBR predecessor.
+  // Conservatively bail out if one of the other predecessors is not a "regular"
+  // terminator (that is, not a switch or a br).
+  BasicBlock *IBB = nullptr;
+  for (unsigned Pred = 0, E = PN->getNumIncomingValues(); Pred != E; ++Pred) {
+    BasicBlock *PredBB = PN->getIncomingBlock(Pred);
+    TerminatorInst *PredTerm = PredBB->getTerminator();
+    switch (PredTerm->getOpcode()) {
+    case Instruction::IndirectBr:
+      if (IBB) // A second indirectbr incoming edge: not unique, give up.
+        return nullptr;
+      IBB = PredBB;
+      break;
+    case Instruction::Br:
+    case Instruction::Switch:
+      OtherPreds.push_back(PredBB); // One entry per incoming edge of the PHI.
+      continue; // Proceeds to the next incoming edge of the enclosing loop.
+    default:
+      return nullptr;
+    }
+  }
+
+  return IBB;
+}
+
+// Split critical edges where the source of the edge is an indirectbr
+// instruction. This isn't always possible, but we can handle some easy cases.
+// This is useful because MI is unable to split such critical edges,
+// which means it will not be able to sink instructions along those edges.
+// This is especially painful for indirect branches with many successors, where
+// we end up having to prepare all outgoing values in the origin block.
+//
+// Our normal algorithm for splitting critical edges requires us to update
+// the outgoing edges of the edge origin block, but for an indirectbr this
+// is hard, since it would require finding and updating the block addresses
+// the indirect branch uses. But if a block only has a single indirectbr
+// predecessor, with the others being regular branches, we can do it in a
+// different way.
+// Say we have A -> D, B -> D, I -> D where only I -> D is an indirectbr.
+// We can split D into D0 and D1, where D0 contains only the PHIs from D,
+// and D1 is the D block body. We can then duplicate D0 as D0A and D0B, and
+// create the following structure:
+// A -> D0A, B -> D0A, I -> D0B, D0A -> D1, D0B -> D1
+bool CodeGenPrepare::splitIndirectCriticalEdges(Function &F) {
+  // Check whether the function has any indirectbrs, and collect which blocks
+  // they may jump to. Since most functions don't have indirect branches,
+  // this lowers the common case's overhead to O(Blocks) instead of O(Edges).
+  SmallSetVector<BasicBlock *, 16> Targets;
+  for (auto &BB : F) {
+    auto *IBI = dyn_cast<IndirectBrInst>(BB.getTerminator());
+    if (!IBI)
+      continue;
+
+    for (unsigned Succ = 0, E = IBI->getNumSuccessors(); Succ != E; ++Succ)
+      Targets.insert(IBI->getSuccessor(Succ));
+  }
+
+  if (Targets.empty())
+    return false;
+
+  bool Changed = false;
+  for (BasicBlock *Target : Targets) {
+    SmallVector<BasicBlock *, 16> OtherPreds;
+    BasicBlock *IBRPred = findIBRPredecessor(Target, OtherPreds);
+    // If we did not find an indirectbr, or the indirectbr is the only
+    // incoming edge, this isn't the kind of edge we're looking for.
+    if (!IBRPred || OtherPreds.empty())
+      continue;
+
+    // Don't even think about ehpads/landingpads.
+    Instruction *FirstNonPHI = Target->getFirstNonPHI();
+    if (FirstNonPHI->isEHPad() || Target->isLandingPad())
+      continue;
+
+    BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHI, ".split");
+    // It's possible Target was its own successor through an indirectbr.
+    // In this case, the indirectbr now comes from BodyBlock.
+    if (IBRPred == Target)
+      IBRPred = BodyBlock;
+
+    // At this point Target only has PHIs, and BodyBlock has the rest of the
+    // block's body. Create a copy of Target that will be used by the "direct"
+    // preds.
+    ValueToValueMapTy VMap;
+    BasicBlock *DirectSucc = CloneBasicBlock(Target, VMap, ".clone", &F);
+
+    // Retarget every br/switch predecessor at the clone.
+    for (BasicBlock *Pred : OtherPreds)
+      Pred->getTerminator()->replaceUsesOfWith(Target, DirectSucc);
+
+    // Ok, now fix up the PHIs. We know the two blocks only have PHIs, and that
+    // they are clones, so the number of PHIs are the same.
+    // (a) Remove the edge coming from IBRPred from the "Direct" PHI
+    // (b) Leave that as the only edge in the "Indirect" PHI.
+    // (c) Merge the two in the body block.
+    BasicBlock::iterator Indirect = Target->begin(),
+                         End = Target->getFirstNonPHI()->getIterator();
+    BasicBlock::iterator Direct = DirectSucc->begin();
+    BasicBlock::iterator MergeInsert = BodyBlock->getFirstInsertionPt();
+
+    assert(&*End == Target->getTerminator() &&
+           "Block was expected to only contain PHIs");
+
+    // Walk the PHIs of Target and of its clone in lockstep.
+    while (Indirect != End) {
+      PHINode *DirPHI = cast<PHINode>(Direct);
+      PHINode *IndPHI = cast<PHINode>(Indirect);
+
+      // Now, clean up - the direct block shouldn't get the indirect value,
+      // and vice versa.
+      DirPHI->removeIncomingValue(IBRPred);
+      Direct++;
+
+      // Advance the pointer here, to avoid invalidation issues when the old
+      // PHI is erased.
+      Indirect++;
+
+      PHINode *NewIndPHI = PHINode::Create(IndPHI->getType(), 1, "ind", IndPHI);
+      NewIndPHI->addIncoming(IndPHI->getIncomingValueForBlock(IBRPred),
+                             IBRPred);
+
+      // Create a PHI in the body block, to merge the direct and indirect
+      // predecessors.
+      PHINode *MergePHI =
+          PHINode::Create(IndPHI->getType(), 2, "merge", &*MergeInsert);
+      MergePHI->addIncoming(NewIndPHI, Target);
+      MergePHI->addIncoming(DirPHI, DirectSucc);
+
+      // All former users of the original PHI now read the merged value.
+      IndPHI->replaceAllUsesWith(MergePHI);
+      IndPHI->eraseFromParent();
+    }
+
+    Changed = true;
+  }
+
+  return Changed; // True if at least one target block was split.
+}
+
 /// Eliminate blocks that contain only PHI nodes, debug info directives, and an
 /// unconditional branch. Passes before isel (e.g. LSR/loopsimplify) often split
 /// edges in ways that are non-optimal for isel. Start by eliminating these
@@ -1090,6 +1278,83 @@ static bool OptimizeCmpExpression(CmpInst *CI, const TargetLowering *TLI) {
   return false;
 }
 
+/// Duplicate the 'and' instruction \p AndI in front of each of its users so
+/// that isel can combine the mask with the compare. This is only done when
+/// every user is an icmp against constant zero and \p TLI reports the folding
+/// as beneficial; the original 'and' is then erased. \p InsertedInsts is only
+/// consulted by an assertion. Return true if any changes are made.
+static bool sinkAndCmp0Expression(Instruction *AndI,
+                                  const TargetLowering &TLI,
+                                  SetOfInstrs &InsertedInsts) {
+  // Double-check that we're not trying to optimize an instruction that was
+  // already optimized by some other part of this pass.
+  assert(!InsertedInsts.count(AndI) &&
+         "Attempting to optimize already optimized and instruction");
+  (void) InsertedInsts;
+
+  // Nothing to do for single use in same basic block.
+  if (AndI->hasOneUse() &&
+      AndI->getParent() == cast<Instruction>(*AndI->user_begin())->getParent())
+    return false;
+
+  // Try to avoid cases where sinking/duplicating is likely to increase register
+  // pressure.
+  if (!isa<ConstantInt>(AndI->getOperand(0)) &&
+      !isa<ConstantInt>(AndI->getOperand(1)) &&
+      AndI->getOperand(0)->hasOneUse() && AndI->getOperand(1)->hasOneUse())
+    return false;
+
+  // Bail out unless every user qualifies; nothing has been modified yet.
+  for (auto *U : AndI->users()) {
+    Instruction *User = cast<Instruction>(U);
+
+    // Only sink for and mask feeding icmp with 0.
+    if (!isa<ICmpInst>(User))
+      return false;
+
+    auto *CmpC = dyn_cast<ConstantInt>(User->getOperand(1));
+    if (!CmpC || !CmpC->isZero())
+      return false;
+  }
+
+  if (!TLI.isMaskAndCmp0FoldingBeneficial(*AndI))
+    return false;
+
+  DEBUG(dbgs() << "found 'and' feeding only icmp 0;\n");
+  DEBUG(AndI->getParent()->dump());
+
+  // Push the 'and' into the same block as the icmp 0.  There should only be
+  // one (icmp (and, 0)) in each block, since CSE/GVN should have removed any
+  // others, so we don't need to keep track of which BBs we insert into.
+  for (Value::user_iterator UI = AndI->user_begin(), E = AndI->user_end();
+       UI != E; ) {
+    Use &TheUse = UI.getUse();
+    Instruction *User = cast<Instruction>(*UI);
+
+    // Preincrement use iterator so we don't invalidate it.
+    ++UI;
+
+    DEBUG(dbgs() << "sinking 'and' use: " << *User << "\n");
+
+    // Keep the 'and' in the same place if the use is already in the same block.
+    Instruction *InsertPt =
+        User->getParent() == AndI->getParent() ? AndI : User;
+    Instruction *InsertedAnd =
+        BinaryOperator::Create(Instruction::And, AndI->getOperand(0),
+                               AndI->getOperand(1), "", InsertPt);
+    // Propagate the debug info.
+    InsertedAnd->setDebugLoc(AndI->getDebugLoc());
+
+    // Replace a use of the 'and' with a use of the new 'and'.
+    TheUse = InsertedAnd;
+    ++NumAndUses;
+    DEBUG(User->getParent()->dump());
+  }
+
+  // We removed all uses, nuke the and.
+  AndI->eraseFromParent();
+  return true;
+}
+
 /// Check if the candidates could be combined with a shift instruction, which
 /// includes:
 /// 1. Truncate instruction
@@ -2028,16 +2293,15 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) {
     }
 
     if (TLI) {
-      // Unknown address space.
-      // TODO: Target hook to pick which address space the intrinsic cares
-      // about?
-      unsigned AddrSpace = ~0u;
       SmallVector<Value*, 2> PtrOps;
       Type *AccessTy;
-      if (TLI->GetAddrModeArguments(II, PtrOps, AccessTy, AddrSpace))
-        while (!PtrOps.empty())
-          if (optimizeMemoryInst(II, PtrOps.pop_back_val(), AccessTy, AddrSpace))
+      if (TLI->getAddrModeArguments(II, PtrOps, AccessTy))
+        while (!PtrOps.empty()) {
+          Value *PtrVal = PtrOps.pop_back_val();
+          unsigned AS = PtrVal->getType()->getPointerAddressSpace();
+          if (optimizeMemoryInst(II, PtrVal, AccessTy, AS))
             return true;
+        }
     }
   }
 
@@ -2168,11 +2432,11 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB) {
 
     // Conservatively require the attributes of the call to match those of the
     // return. Ignore noalias because it doesn't affect the call sequence.
-    AttributeSet CalleeAttrs = CS.getAttributes();
-    if (AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex).
-          removeAttribute(Attribute::NoAlias) !=
-        AttrBuilder(CalleeAttrs, AttributeSet::ReturnIndex).
-          removeAttribute(Attribute::NoAlias))
+    AttributeList CalleeAttrs = CS.getAttributes();
+    if (AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex)
+            .removeAttribute(Attribute::NoAlias) !=
+        AttrBuilder(CalleeAttrs, AttributeList::ReturnIndex)
+            .removeAttribute(Attribute::NoAlias))
       continue;
 
     // Make sure the call instruction is followed by an unconditional branch to
@@ -2561,25 +2825,30 @@ class TypePromotionTransaction {
     OperandsHider Hider;
     /// Keep track of the uses replaced, if any.
     UsesReplacer *Replacer;
+    /// Keep track of instructions removed.
+    SetOfInstrs &RemovedInsts;
 
   public:
     /// \brief Remove all reference of \p Inst and optinally replace all its
     /// uses with New.
+    /// \p RemovedInsts Keep track of the instructions removed by this Action.
     /// \pre If !Inst->use_empty(), then New != nullptr
-    InstructionRemover(Instruction *Inst, Value *New = nullptr)
+    InstructionRemover(Instruction *Inst, SetOfInstrs &RemovedInsts,
+                       Value *New = nullptr)
         : TypePromotionAction(Inst), Inserter(Inst), Hider(Inst),
-          Replacer(nullptr) {
+          Replacer(nullptr), RemovedInsts(RemovedInsts) {
       if (New)
         Replacer = new UsesReplacer(Inst, New);
       DEBUG(dbgs() << "Do: InstructionRemover: " << *Inst << "\n");
+      RemovedInsts.insert(Inst);
+      /// The instructions removed here will be freed after completing
+      /// optimizeBlock() for all blocks as we need to keep track of the
+      /// removed instructions during promotion.
       Inst->removeFromParent();
     }
 
     ~InstructionRemover() override { delete Replacer; }
 
-    /// \brief Really remove the instruction.
-    void commit() override { delete Inst; }
-
     /// \brief Resurrect the instruction and reassign it to the proper uses if
     /// new value was provided when build this action.
     void undo() override {
@@ -2588,6 +2857,7 @@ class TypePromotionTransaction {
       if (Replacer)
         Replacer->undo();
       Hider.undo();
+      RemovedInsts.erase(Inst);
     }
   };
 
@@ -2596,6 +2866,10 @@ public:
   /// The restoration point is a pointer to an action instead of an iterator
   /// because the iterator may be invalidated but not the pointer.
   typedef const TypePromotionAction *ConstRestorationPt;
+
+  TypePromotionTransaction(SetOfInstrs &RemovedInsts)
+      : RemovedInsts(RemovedInsts) {}
+
   /// Advocate every changes made in that transaction.
   void commit();
   /// Undo all the changes made after the given point.
@@ -2627,6 +2901,7 @@ private:
   /// The ordered list of actions made so far.
   SmallVector<std::unique_ptr<TypePromotionAction>, 16> Actions;
   typedef SmallVectorImpl<std::unique_ptr<TypePromotionAction>>::iterator CommitPt;
+  SetOfInstrs &RemovedInsts;
 };
 
 void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,
@@ -2638,7 +2913,8 @@ void TypePromotionTransaction::setOperand(Instruction *Inst, unsigned Idx,
 void TypePromotionTransaction::eraseInstruction(Instruction *Inst,
                                                 Value *NewVal) {
   Actions.push_back(
-      make_unique<TypePromotionTransaction::InstructionRemover>(Inst, NewVal));
+      make_unique<TypePromotionTransaction::InstructionRemover>(Inst,
+                                                         RemovedInsts, NewVal));
 }
 
 void TypePromotionTransaction::replaceAllUsesWith(Instruction *Inst,
@@ -2705,8 +2981,8 @@ void TypePromotionTransaction::rollback(
 /// This encapsulates the logic for matching the target-legal addressing modes.
 class AddressingModeMatcher {
   SmallVectorImpl<Instruction*> &AddrModeInsts;
-  const TargetMachine &TM;
   const TargetLowering &TLI;
+  const TargetRegisterInfo &TRI;
   const DataLayout &DL;
 
   /// AccessTy/MemoryInst - This is the type for the access (e.g. double) and
@@ -2731,14 +3007,14 @@ class AddressingModeMatcher {
   bool IgnoreProfitability;
 
   AddressingModeMatcher(SmallVectorImpl<Instruction *> &AMI,
-                        const TargetMachine &TM, Type *AT, unsigned AS,
+                        const TargetLowering &TLI,
+                        const TargetRegisterInfo &TRI,
+                        Type *AT, unsigned AS,
                         Instruction *MI, ExtAddrMode &AM,
                         const SetOfInstrs &InsertedInsts,
                         InstrToOrigTy &PromotedInsts,
                         TypePromotionTransaction &TPT)
-      : AddrModeInsts(AMI), TM(TM),
-        TLI(*TM.getSubtargetImpl(*MI->getParent()->getParent())
-                 ->getTargetLowering()),
+      : AddrModeInsts(AMI), TLI(TLI), TRI(TRI),
         DL(MI->getModule()->getDataLayout()), AccessTy(AT), AddrSpace(AS),
         MemoryInst(MI), AddrMode(AM), InsertedInsts(InsertedInsts),
         PromotedInsts(PromotedInsts), TPT(TPT) {
@@ -2756,13 +3032,15 @@ public:
   static ExtAddrMode Match(Value *V, Type *AccessTy, unsigned AS,
                            Instruction *MemoryInst,
                            SmallVectorImpl<Instruction*> &AddrModeInsts,
-                           const TargetMachine &TM,
+                           const TargetLowering &TLI,
+                           const TargetRegisterInfo &TRI,
                            const SetOfInstrs &InsertedInsts,
                            InstrToOrigTy &PromotedInsts,
                            TypePromotionTransaction &TPT) {
     ExtAddrMode Result;
 
-    bool Success = AddressingModeMatcher(AddrModeInsts, TM, AccessTy, AS,
+    bool Success = AddressingModeMatcher(AddrModeInsts, TLI, TRI,
+                                         AccessTy, AS,
                                          MemoryInst, Result, InsertedInsts,
                                          PromotedInsts, TPT).matchAddr(V, 0);
     (void)Success; assert(Success && "Couldn't select *anything*?");
@@ -3583,18 +3861,18 @@ bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {
 /// Check to see if all uses of OpVal by the specified inline asm call are due
 /// to memory operands. If so, return true, otherwise return false.
 static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
-                                    const TargetMachine &TM) {
+                                    const TargetLowering &TLI,
+                                    const TargetRegisterInfo &TRI) {
   const Function *F = CI->getParent()->getParent();
-  const TargetLowering *TLI = TM.getSubtargetImpl(*F)->getTargetLowering();
-  const TargetRegisterInfo *TRI = TM.getSubtargetImpl(*F)->getRegisterInfo();
   TargetLowering::AsmOperandInfoVector TargetConstraints =
-      TLI->ParseConstraints(F->getParent()->getDataLayout(), TRI,
+      TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI,
                             ImmutableCallSite(CI));
+
   for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
     TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
 
     // Compute the constraint code and ConstraintType to use.
-    TLI->ComputeConstraintToUse(OpInfo, SDValue());
+    TLI.ComputeConstraintToUse(OpInfo, SDValue());
 
     // If this asm operand is our Value*, and if it isn't an indirect memory
     // operand, we can't fold it!
@@ -3613,7 +3891,8 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
 static bool FindAllMemoryUses(
     Instruction *I,
     SmallVectorImpl<std::pair<Instruction *, unsigned>> &MemoryUses,
-    SmallPtrSetImpl<Instruction *> &ConsideredInsts, const TargetMachine &TM) {
+    SmallPtrSetImpl<Instruction *> &ConsideredInsts,
+    const TargetLowering &TLI, const TargetRegisterInfo &TRI) {
   // If we already considered this instruction, we're done.
   if (!ConsideredInsts.insert(I).second)
     return false;
@@ -3635,11 +3914,28 @@ static bool FindAllMemoryUses(
 
     if (StoreInst *SI = dyn_cast<StoreInst>(UserI)) {
       unsigned opNo = U.getOperandNo();
-      if (opNo == 0) return true; // Storing addr, not into addr.
+      if (opNo != StoreInst::getPointerOperandIndex())
+        return true; // Storing addr, not into addr.
       MemoryUses.push_back(std::make_pair(SI, opNo));
       continue;
     }
 
+    if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(UserI)) {
+      unsigned opNo = U.getOperandNo();
+      if (opNo != AtomicRMWInst::getPointerOperandIndex())
+        return true; // Storing addr, not into addr.
+      MemoryUses.push_back(std::make_pair(RMW, opNo));
+      continue;
+    }
+
+    if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(UserI)) {
+      unsigned opNo = U.getOperandNo();
+      if (opNo != AtomicCmpXchgInst::getPointerOperandIndex())
+        return true; // Storing addr, not into addr.
+      MemoryUses.push_back(std::make_pair(CmpX, opNo));
+      continue;
+    }
+
     if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
       // If this is a cold call, we can sink the addressing calculation into
       // the cold path.  See optimizeCallInst
@@ -3650,12 +3946,12 @@ static bool FindAllMemoryUses(
       if (!IA) return true;
 
       // If this is a memory operand, we're cool, otherwise bail out.
-      if (!IsOperandAMemoryOperand(CI, IA, I, TM))
+      if (!IsOperandAMemoryOperand(CI, IA, I, TLI, TRI))
         return true;
       continue;
     }
 
-    if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TM))
+    if (FindAllMemoryUses(UserI, MemoryUses, ConsideredInsts, TLI, TRI))
       return true;
   }
 
@@ -3743,7 +4039,7 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
   // the use is just a particularly nice way of sinking it.
   SmallVector<std::pair<Instruction*,unsigned>, 16> MemoryUses;
   SmallPtrSet<Instruction*, 16> ConsideredInsts;
-  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TM))
+  if (FindAllMemoryUses(I, MemoryUses, ConsideredInsts, TLI, TRI))
     return false;  // Has a non-memory, non-foldable use!
 
   // Now that we know that all uses of this instruction are part of a chain of
@@ -3775,7 +4071,8 @@ isProfitableToFoldIntoAddressingMode(Instruction *I, ExtAddrMode &AMBefore,
     ExtAddrMode Result;
     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
         TPT.getRestorationPoint();
-    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TM, AddressAccessTy, AS,
+    AddressingModeMatcher Matcher(MatchedAddrModeInsts, TLI, TRI,
+                                  AddressAccessTy, AS,
                                   MemoryInst, Result, InsertedInsts,
                                   PromotedInsts, TPT);
     Matcher.IgnoreProfitability = true;
@@ -3844,7 +4141,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
   bool IsNumUsesConsensusValid = false;
   SmallVector<Instruction*, 16> AddrModeInsts;
   ExtAddrMode AddrMode;
-  TypePromotionTransaction TPT;
+  TypePromotionTransaction TPT(RemovedInsts);
   TypePromotionTransaction::ConstRestorationPt LastKnownGood =
       TPT.getRestorationPoint();
   while (!worklist.empty()) {
@@ -3869,7 +4166,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
     // addressing instructions might have.
     SmallVector<Instruction*, 16> NewAddrModeInsts;
     ExtAddrMode NewAddrMode = AddressingModeMatcher::Match(
-      V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TM,
+      V, AccessTy, AddrSpace, MemoryInst, NewAddrModeInsts, *TLI, *TRI,
       InsertedInsts, PromotedInsts, TPT);
 
     // This check is broken into two cases with very similar code to avoid using
@@ -3935,11 +4232,10 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
     DEBUG(dbgs() << "CGP: Reusing nonlocal addrmode: " << AddrMode << " for "
                  << *MemoryInst << "\n");
     if (SunkAddr->getType() != Addr->getType())
-      SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType());
+      SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
   } else if (AddrSinkUsingGEPs ||
              (!AddrSinkUsingGEPs.getNumOccurrences() && TM &&
-              TM->getSubtargetImpl(*MemoryInst->getParent()->getParent())
-                  ->useAA())) {
+              SubtargetInfo->useAA())) {
     // By default, we use the GEP-based method when AA is used later. This
     // prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
     DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
@@ -4042,7 +4338,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
           // We need to add this separately from the scale above to help with
           // SDAG consecutive load/store merging.
           if (ResultPtr->getType() != I8PtrTy)
-            ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
+            ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
           ResultPtr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
         }
 
@@ -4053,12 +4349,12 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
         SunkAddr = ResultPtr;
       } else {
         if (ResultPtr->getType() != I8PtrTy)
-          ResultPtr = Builder.CreateBitCast(ResultPtr, I8PtrTy);
+          ResultPtr = Builder.CreatePointerCast(ResultPtr, I8PtrTy);
         SunkAddr = Builder.CreateGEP(I8Ty, ResultPtr, ResultIndex, "sunkaddr");
       }
 
       if (SunkAddr->getType() != Addr->getType())
-        SunkAddr = Builder.CreateBitCast(SunkAddr, Addr->getType());
+        SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
     }
   } else {
     DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode << " for "
@@ -4185,14 +4481,14 @@ bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
   return MadeChange;
 }
 
-/// \brief Check if all the uses of \p Inst are equivalent (or free) zero or
+/// \brief Check if all the uses of \p Val are equivalent (or free) zero or
 /// sign extensions.
-static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {
-  assert(!Inst->use_empty() && "Input must have at least one use");
-  const Instruction *FirstUser = cast<Instruction>(*Inst->user_begin());
+static bool hasSameExtUse(Value *Val, const TargetLowering &TLI) {
+  assert(!Val->use_empty() && "Input must have at least one use");
+  const Instruction *FirstUser = cast<Instruction>(*Val->user_begin());
   bool IsSExt = isa<SExtInst>(FirstUser);
   Type *ExtTy = FirstUser->getType();
-  for (const User *U : Inst->users()) {
+  for (const User *U : Val->users()) {
     const Instruction *UI = cast<Instruction>(U);
     if ((IsSExt && !isa<SExtInst>(UI)) || (!IsSExt && !isa<ZExtInst>(UI)))
       return false;
@@ -4202,11 +4498,11 @@ static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {
       continue;
 
     // If IsSExt is true, we are in this situation:
-    // a = Inst
+    // a = Val
     // b = sext ty1 a to ty2
     // c = sext ty1 a to ty3
     // Assuming ty2 is shorter than ty3, this could be turned into:
-    // a = Inst
+    // a = Val
     // b = sext ty1 a to ty2
     // c = sext ty2 b to ty3
     // However, the last sext is not free.
@@ -4233,51 +4529,44 @@ static bool hasSameExtUse(Instruction *Inst, const TargetLowering &TLI) {
   return true;
 }
 
-/// \brief Try to form ExtLd by promoting \p Exts until they reach a
-/// load instruction.
-/// If an ext(load) can be formed, it is returned via \p LI for the load
-/// and \p Inst for the extension.
-/// Otherwise LI == nullptr and Inst == nullptr.
-/// When some promotion happened, \p TPT contains the proper state to
-/// revert them.
-///
-/// \return true when promoting was necessary to expose the ext(load)
-/// opportunity, false otherwise.
+/// \brief Try to speculatively promote extensions in \p Exts and continue
+/// promoting through newly promoted operands recursively as far as doing so is
+/// profitable. Save extensions profitably moved up, in \p ProfitablyMovedExts.
+/// When some promotion happened, \p TPT contains the proper state to revert
+/// them.
 ///
-/// Example:
-/// \code
-/// %ld = load i32* %addr
-/// %add = add nuw i32 %ld, 4
-/// %zext = zext i32 %add to i64
-/// \endcode
-/// =>
-/// \code
-/// %ld = load i32* %addr
-/// %zext = zext i32 %ld to i64
-/// %add = add nuw i64 %zext, 4
-/// \endcode
-/// Thanks to the promotion, we can match zext(load i32*) to i64.
-bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT,
-                                    LoadInst *&LI, Instruction *&Inst,
-                                    const SmallVectorImpl<Instruction *> &Exts,
-                                    unsigned CreatedInstsCost = 0) {
-  // Iterate over all the extensions to see if one form an ext(load).
+/// \return true if some promotion happened, false otherwise.
+bool CodeGenPrepare::tryToPromoteExts(
+    TypePromotionTransaction &TPT, const SmallVectorImpl<Instruction *> &Exts,
+    SmallVectorImpl<Instruction *> &ProfitablyMovedExts,
+    unsigned CreatedInstsCost) {
+  bool Promoted = false;
+
+  // Iterate over all the extensions to try to promote them.
   for (auto I : Exts) {
-    // Check if we directly have ext(load).
-    if ((LI = dyn_cast<LoadInst>(I->getOperand(0)))) {
-      Inst = I;
-      // No promotion happened here.
-      return false;
+    // Early check if we directly have ext(load).
+    if (isa<LoadInst>(I->getOperand(0))) {
+      ProfitablyMovedExts.push_back(I);
+      continue;
     }
-    // Check whether or not we want to do any promotion.
+
+    // Check whether or not we want to do any promotion.  The reason we have
+    // this check inside the for loop is to catch the case where an extension
+    // is directly fed by a load because in such case the extension can be moved
+    // up without any promotion on its operands.
     if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion)
-      continue;
+      return false;
+
     // Get the action to perform the promotion.
-    TypePromotionHelper::Action TPH = TypePromotionHelper::getAction(
-        I, InsertedInsts, *TLI, PromotedInsts);
+    TypePromotionHelper::Action TPH =
+        TypePromotionHelper::getAction(I, InsertedInsts, *TLI, PromotedInsts);
     // Check if we can promote.
-    if (!TPH)
+    if (!TPH) {
+      // Save the current extension as we cannot move up through its operand.
+      ProfitablyMovedExts.push_back(I);
       continue;
+    }
+
     // Save the current state.
     TypePromotionTransaction::ConstRestorationPt LastKnownGood =
         TPT.getRestorationPoint();
@@ -4298,112 +4587,292 @@ bool CodeGenPrepare::extLdPromotion(TypePromotionTransaction &TPT,
     // because the new extension may be removed too.
     long long TotalCreatedInstsCost = CreatedInstsCost + NewCreatedInstsCost;
     // FIXME: It would be possible to propagate a negative value instead of
-    // conservatively ceiling it to 0. 
+    // conservatively ceiling it to 0.
     TotalCreatedInstsCost =
         std::max((long long)0, (TotalCreatedInstsCost - ExtCost));
     if (!StressExtLdPromotion &&
         (TotalCreatedInstsCost > 1 ||
          !isPromotedInstructionLegal(*TLI, *DL, PromotedVal))) {
-      // The promotion is not profitable, rollback to the previous state.
+      // This promotion is not profitable, rollback to the previous state, and
+      // save the current extension in ProfitablyMovedExts as the latest
+      // speculative promotion turned out to be unprofitable.
       TPT.rollback(LastKnownGood);
+      ProfitablyMovedExts.push_back(I);
+      continue;
+    }
+    // Continue promoting NewExts as far as doing so is profitable.
+    SmallVector<Instruction *, 2> NewlyMovedExts;
+    (void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost);
+    bool NewPromoted = false;
+    for (auto ExtInst : NewlyMovedExts) {
+      Instruction *MovedExt = cast<Instruction>(ExtInst);
+      Value *ExtOperand = MovedExt->getOperand(0);
+      // If we have reached a load, we need this extra profitability check
+      // as it could potentially be merged into an ext(load).
+      if (isa<LoadInst>(ExtOperand) &&
+          !(StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
+            (ExtOperand->hasOneUse() || hasSameExtUse(ExtOperand, *TLI))))
+        continue;
+
+      ProfitablyMovedExts.push_back(MovedExt);
+      NewPromoted = true;
+    }
+
+    // If none of speculative promotions for NewExts is profitable, rollback
+    // and save the current extension (I) as the last profitable extension.
+    if (!NewPromoted) {
+      TPT.rollback(LastKnownGood);
+      ProfitablyMovedExts.push_back(I);
       continue;
     }
     // The promotion is profitable.
-    // Check if it exposes an ext(load).
-    (void)extLdPromotion(TPT, LI, Inst, NewExts, TotalCreatedInstsCost);
-    if (LI && (StressExtLdPromotion || NewCreatedInstsCost <= ExtCost ||
-               // If we have created a new extension, i.e., now we have two
-               // extensions. We must make sure one of them is merged with
-               // the load, otherwise we may degrade the code quality.
-               (LI->hasOneUse() || hasSameExtUse(LI, *TLI))))
-      // Promotion happened.
-      return true;
-    // If this does not help to expose an ext(load) then, rollback.
-    TPT.rollback(LastKnownGood);
+    Promoted = true;
   }
-  // None of the extension can form an ext(load).
-  LI = nullptr;
-  Inst = nullptr;
-  return false;
+  return Promoted;
 }
 
-/// Move a zext or sext fed by a load into the same basic block as the load,
-/// unless conditions are unfavorable. This allows SelectionDAG to fold the
-/// extend into the load.
-/// \p I[in/out] the extension may be modified during the process if some
-/// promotions apply.
-///
-bool CodeGenPrepare::moveExtToFormExtLoad(Instruction *&I) {
-  // ExtLoad formation infrastructure requires TLI to be effective.
-  if (!TLI)
-    return false;
+/// Merging redundant sexts when one is dominating the other.
+bool CodeGenPrepare::mergeSExts(Function &F) {
+  DominatorTree DT(F);
+  bool Changed = false;
+  for (auto &Entry : ValToSExtendedUses) {
+    SExts &Insts = Entry.second;
+    SExts CurPts;
+    for (Instruction *Inst : Insts) {
+      if (RemovedInsts.count(Inst) || !isa<SExtInst>(Inst) ||
+          Inst->getOperand(0) != Entry.first)
+        continue;
+      bool inserted = false;
+      for (auto &Pt : CurPts) {
+        if (DT.dominates(Inst, Pt)) {
+          Pt->replaceAllUsesWith(Inst);
+          RemovedInsts.insert(Pt);
+          Pt->removeFromParent();
+          Pt = Inst;
+          inserted = true;
+          Changed = true;
+          break;
+        }
+        if (!DT.dominates(Pt, Inst))
+          // Give up if we need to merge in a common dominator as the
+          // experiments show it is not profitable.
+          continue;
+        Inst->replaceAllUsesWith(Pt);
+        RemovedInsts.insert(Inst);
+        Inst->removeFromParent();
+        inserted = true;
+        Changed = true;
+        break;
+      }
+      if (!inserted)
+        CurPts.push_back(Inst);
+    }
+  }
+  return Changed;
+}
 
-  // Try to promote a chain of computation if it allows to form
-  // an extended load.
-  TypePromotionTransaction TPT;
-  TypePromotionTransaction::ConstRestorationPt LastKnownGood =
-    TPT.getRestorationPoint();
-  SmallVector<Instruction *, 1> Exts;
-  Exts.push_back(I);
-  // Look for a load being extended.
-  LoadInst *LI = nullptr;
-  Instruction *OldExt = I;
-  bool HasPromoted = extLdPromotion(TPT, LI, I, Exts);
-  if (!LI || !I) {
-    assert(!HasPromoted && !LI && "If we did not match any load instruction "
-                                  "the code must remain the same");
-    I = OldExt;
-    return false;
+/// Return true, if an ext(load) can be formed from an extension in
+/// \p MovedExts.
+bool CodeGenPrepare::canFormExtLd(
+    const SmallVectorImpl<Instruction *> &MovedExts, LoadInst *&LI,
+    Instruction *&Inst, bool HasPromoted) {
+  for (auto *MovedExtInst : MovedExts) {
+    if (isa<LoadInst>(MovedExtInst->getOperand(0))) {
+      LI = cast<LoadInst>(MovedExtInst->getOperand(0));
+      Inst = MovedExtInst;
+      break;
+    }
   }
+  if (!LI)
+    return false;
 
   // If they're already in the same block, there's nothing to do.
   // Make the cheap checks first if we did not promote.
   // If we promoted, we need to check if it is indeed profitable.
-  if (!HasPromoted && LI->getParent() == I->getParent())
+  if (!HasPromoted && LI->getParent() == Inst->getParent())
     return false;
 
-  EVT VT = TLI->getValueType(*DL, I->getType());
+  EVT VT = TLI->getValueType(*DL, Inst->getType());
   EVT LoadVT = TLI->getValueType(*DL, LI->getType());
 
   // If the load has other users and the truncate is not free, this probably
   // isn't worthwhile.
-  if (!LI->hasOneUse() &&
-      (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) &&
-      !TLI->isTruncateFree(I->getType(), LI->getType())) {
-    I = OldExt;
-    TPT.rollback(LastKnownGood);
+  if (!LI->hasOneUse() && (TLI->isTypeLegal(LoadVT) || !TLI->isTypeLegal(VT)) &&
+      !TLI->isTruncateFree(Inst->getType(), LI->getType()))
     return false;
-  }
 
   // Check whether the target supports casts folded into loads.
   unsigned LType;
-  if (isa<ZExtInst>(I))
+  if (isa<ZExtInst>(Inst))
     LType = ISD::ZEXTLOAD;
   else {
-    assert(isa<SExtInst>(I) && "Unexpected ext type!");
+    assert(isa<SExtInst>(Inst) && "Unexpected ext type!");
     LType = ISD::SEXTLOAD;
   }
-  if (!TLI->isLoadExtLegal(LType, VT, LoadVT)) {
-    I = OldExt;
-    TPT.rollback(LastKnownGood);
+
+  return TLI->isLoadExtLegal(LType, VT, LoadVT);
+}
+
+/// Move a zext or sext fed by a load into the same basic block as the load,
+/// unless conditions are unfavorable. This allows SelectionDAG to fold the
+/// extend into the load.
+///
+/// E.g.,
+/// \code
+/// %ld = load i32* %addr
+/// %add = add nuw i32 %ld, 4
+/// %zext = zext i32 %add to i64
+/// \endcode
+/// =>
+/// \code
+/// %ld = load i32* %addr
+/// %zext = zext i32 %ld to i64
+/// %add = add nuw i64 %zext, 4
+/// \endcode
+/// Note that the promotion in %add to i64 is done in tryToPromoteExts(), which
+/// allows us to match zext(load i32*) to i64.
+///
+/// Also, try to promote the computations used to obtain a sign extended
+/// value used in memory accesses.
+/// E.g.,
+/// \code
+/// a = add nsw i32 b, 3
+/// d = sext i32 a to i64
+/// e = getelementptr ..., i64 d
+/// \endcode
+/// =>
+/// \code
+/// f = sext i32 b to i64
+/// a = add nsw i64 f, 3
+/// e = getelementptr ..., i64 a
+/// \endcode
+///
+/// \p Inst[in/out] the extension may be modified during the process if some
+/// promotions apply.
+bool CodeGenPrepare::optimizeExt(Instruction *&Inst) {
+  // ExtLoad formation and address type promotion infrastructure requires TLI to
+  // be effective.
+  if (!TLI)
     return false;
+
+  bool AllowPromotionWithoutCommonHeader = false;
+  /// See if it is an interesting sext operation for the address type
+  /// promotion before trying to promote it, e.g., the ones with the right
+  /// type and used in memory accesses.
+  bool ATPConsiderable = TTI->shouldConsiderAddressTypePromotion(
+      *Inst, AllowPromotionWithoutCommonHeader);
+  TypePromotionTransaction TPT(RemovedInsts);
+  TypePromotionTransaction::ConstRestorationPt LastKnownGood =
+      TPT.getRestorationPoint();
+  SmallVector<Instruction *, 1> Exts;
+  SmallVector<Instruction *, 2> SpeculativelyMovedExts;
+  Exts.push_back(Inst);
+
+  bool HasPromoted = tryToPromoteExts(TPT, Exts, SpeculativelyMovedExts);
+
+  // Look for a load being extended.
+  LoadInst *LI = nullptr;
+  Instruction *ExtFedByLoad;
+
+  // Try to promote a chain of computation if it allows to form an extended
+  // load.
+  if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) {
+    assert(LI && ExtFedByLoad && "Expect a valid load and extension");
+    TPT.commit();
+    // Move the extend into the same block as the load
+    ExtFedByLoad->removeFromParent();
+    ExtFedByLoad->insertAfter(LI);
+    // CGP does not check if the zext would be speculatively executed when moved
+    // to the same basic block as the load. Preserving its original location
+    // would pessimize the debugging experience, as well as negatively impact
+    // the quality of sample pgo. We don't want to use "line 0" as that has a
+    // size cost in the line-table section and logically the zext can be seen as
+    // part of the load. Therefore we conservatively reuse the same debug
+    // location for the load and the zext.
+    ExtFedByLoad->setDebugLoc(LI->getDebugLoc());
+    ++NumExtsMoved;
+    Inst = ExtFedByLoad;
+    return true;
   }
 
-  // Move the extend into the same block as the load, so that SelectionDAG
-  // can fold it.
-  TPT.commit();
-  I->removeFromParent();
-  I->insertAfter(LI);
-  // CGP does not check if the zext would be speculatively executed when moved
-  // to the same basic block as the load. Preserving its original location would
-  // pessimize the debugging experience, as well as negatively impact the 
-  // quality of sample pgo. We don't want to use "line 0" as that has a
-  // size cost in the line-table section and logically the zext can be seen as
-  // part of the load. Therefore we conservatively reuse the same debug location
-  // for the load and the zext.
-  I->setDebugLoc(LI->getDebugLoc());
-  ++NumExtsMoved;
-  return true;
+  // Continue promoting SExts if known as considerable depending on targets.
+  if (ATPConsiderable &&
+      performAddressTypePromotion(Inst, AllowPromotionWithoutCommonHeader,
+                                  HasPromoted, TPT, SpeculativelyMovedExts))
+    return true;
+
+  TPT.rollback(LastKnownGood);
+  return false;
+}
+
+// Perform address type promotion if doing so is profitable.
+// If AllowPromotionWithoutCommonHeader == false, we should find other sext
+// instructions that sign extended the same initial value. However, if
+// AllowPromotionWithoutCommonHeader == true, we expect promoting the
+// extension is just profitable.
+bool CodeGenPrepare::performAddressTypePromotion(
+    Instruction *&Inst, bool AllowPromotionWithoutCommonHeader,
+    bool HasPromoted, TypePromotionTransaction &TPT,
+    SmallVectorImpl<Instruction *> &SpeculativelyMovedExts) {
+  bool Promoted = false;
+  SmallPtrSet<Instruction *, 1> UnhandledExts;
+  bool AllSeenFirst = true;
+  for (auto I : SpeculativelyMovedExts) {
+    Value *HeadOfChain = I->getOperand(0);
+    DenseMap<Value *, Instruction *>::iterator AlreadySeen =
+        SeenChainsForSExt.find(HeadOfChain);
+    // If there is an unhandled SExt which has the same header, try to promote
+    // it as well.
+    if (AlreadySeen != SeenChainsForSExt.end()) {
+      if (AlreadySeen->second != nullptr)
+        UnhandledExts.insert(AlreadySeen->second);
+      AllSeenFirst = false;
+    }
+  }
+
+  if (!AllSeenFirst || (AllowPromotionWithoutCommonHeader &&
+                        SpeculativelyMovedExts.size() == 1)) {
+    TPT.commit();
+    if (HasPromoted)
+      Promoted = true;
+    for (auto I : SpeculativelyMovedExts) {
+      Value *HeadOfChain = I->getOperand(0);
+      SeenChainsForSExt[HeadOfChain] = nullptr;
+      ValToSExtendedUses[HeadOfChain].push_back(I);
+    }
+    // Update Inst as promotion happened.
+    Inst = SpeculativelyMovedExts.pop_back_val();
+  } else {
+    // This is the first chain visited from the header, keep the current chain
+    // as unhandled. Defer to promote this until we encounter another SExt
+    // chain derived from the same header.
+    for (auto I : SpeculativelyMovedExts) {
+      Value *HeadOfChain = I->getOperand(0);
+      SeenChainsForSExt[HeadOfChain] = Inst;
+    }
+    return false;
+  }
+
+  if (!AllSeenFirst && !UnhandledExts.empty())
+    for (auto VisitedSExt : UnhandledExts) {
+      if (RemovedInsts.count(VisitedSExt))
+        continue;
+      TypePromotionTransaction TPT(RemovedInsts);
+      SmallVector<Instruction *, 1> Exts;
+      SmallVector<Instruction *, 2> Chains;
+      Exts.push_back(VisitedSExt);
+      bool HasPromoted = tryToPromoteExts(TPT, Exts, Chains);
+      TPT.commit();
+      if (HasPromoted)
+        Promoted = true;
+      for (auto I : Chains) {
+        Value *HeadOfChain = I->getOperand(0);
+        // Mark this as handled.
+        SeenChainsForSExt[HeadOfChain] = nullptr;
+        ValToSExtendedUses[HeadOfChain].push_back(I);
+      }
+    }
+  return Promoted;
 }
 
 bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
@@ -4537,13 +5006,10 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
       !(Load->getType()->isIntegerTy() || Load->getType()->isPointerTy()))
     return false;
 
-  // Skip loads we've already transformed or have no reason to transform.
-  if (Load->hasOneUse()) {
-    User *LoadUser = *Load->user_begin();
-    if (cast<Instruction>(LoadUser)->getParent() == Load->getParent() &&
-        !dyn_cast<PHINode>(LoadUser))
-      return false;
-  }
+  // Skip loads we've already transformed.
+  if (Load->hasOneUse() &&
+      InsertedInsts.count(cast<Instruction>(*Load->user_begin())))
+    return false;
 
   // Look at all uses of Load, looking through phis, to determine how many bits
   // of the loaded value are needed.
@@ -4623,7 +5089,7 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
   //
   // Also avoid hoisting if we didn't see any ands with the exact DemandBits
   // mask, since these are the only ands that will be removed by isel.
-  if (ActiveBits <= 1 || !APIntOps::isMask(ActiveBits, DemandBits) ||
+  if (ActiveBits <= 1 || !DemandBits.isMask(ActiveBits) ||
       WidestAndBits != DemandBits)
     return false;
 
@@ -4639,6 +5105,9 @@ bool CodeGenPrepare::optimizeLoadExt(LoadInst *Load) {
   IRBuilder<> Builder(Load->getNextNode());
   auto *NewAnd = dyn_cast<Instruction>(
       Builder.CreateAnd(Load, ConstantInt::get(Ctx, DemandBits)));
+  // Mark this instruction as "inserted by CGP", so that other
+  // optimizations don't touch it.
+  InsertedInsts.insert(NewAnd);
 
   // Replace all uses of load with new and (except for the use of load in the
   // new and itself).
@@ -4988,7 +5457,7 @@ bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
   auto *ExtInst = CastInst::Create(ExtType, Cond, NewType);
   ExtInst->insertBefore(SI);
   SI->setCondition(ExtInst);
-  for (SwitchInst::CaseIt Case : SI->cases()) {
+  for (auto Case : SI->cases()) {
     APInt NarrowConst = Case.getCaseValue()->getValue();
     APInt WideConst = (ExtType == Instruction::ZExt) ?
                       NarrowConst.zext(RegWidth) : NarrowConst.sext(RegWidth);
@@ -5517,7 +5986,7 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
               TargetLowering::TypeExpandInteger) {
         return SinkCast(CI);
       } else {
-        bool MadeChange = moveExtToFormExtLoad(I);
+        bool MadeChange = optimizeExt(I);
         return MadeChange | optimizeExtUses(I);
       }
     }
@@ -5551,8 +6020,24 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool& ModifiedDT) {
     return false;
   }
 
+  if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
+      unsigned AS = RMW->getPointerAddressSpace();
+      return optimizeMemoryInst(I, RMW->getPointerOperand(),
+                                RMW->getType(), AS);
+  }
+
+  if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(I)) {
+      unsigned AS = CmpX->getPointerAddressSpace();
+      return optimizeMemoryInst(I, CmpX->getPointerOperand(),
+                                CmpX->getCompareOperand()->getType(), AS);
+  }
+
   BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
 
+  if (BinOp && (BinOp->getOpcode() == Instruction::And) &&
+      EnableAndCmpSinking && TLI)
+    return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts);
+
   if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
                 BinOp->getOpcode() == Instruction::LShr)) {
     ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
@@ -5682,68 +6167,6 @@ bool CodeGenPrepare::placeDbgValues(Function &F) {
   return MadeChange;
 }
 
-// If there is a sequence that branches based on comparing a single bit
-// against zero that can be combined into a single instruction, and the
-// target supports folding these into a single instruction, sink the
-// mask and compare into the branch uses. Do this before OptimizeBlock ->
-// OptimizeInst -> OptimizeCmpExpression, which perturbs the pattern being
-// searched for.
-bool CodeGenPrepare::sinkAndCmp(Function &F) {
-  if (!EnableAndCmpSinking)
-    return false;
-  if (!TLI || !TLI->isMaskAndBranchFoldingLegal())
-    return false;
-  bool MadeChange = false;
-  for (BasicBlock &BB : F) {
-    // Does this BB end with the following?
-    //   %andVal = and %val, #single-bit-set
-    //   %icmpVal = icmp %andResult, 0
-    //   br i1 %cmpVal label %dest1, label %dest2"
-    BranchInst *Brcc = dyn_cast<BranchInst>(BB.getTerminator());
-    if (!Brcc || !Brcc->isConditional())
-      continue;
-    ICmpInst *Cmp = dyn_cast<ICmpInst>(Brcc->getOperand(0));
-    if (!Cmp || Cmp->getParent() != &BB)
-      continue;
-    ConstantInt *Zero = dyn_cast<ConstantInt>(Cmp->getOperand(1));
-    if (!Zero || !Zero->isZero())
-      continue;
-    Instruction *And = dyn_cast<Instruction>(Cmp->getOperand(0));
-    if (!And || And->getOpcode() != Instruction::And || And->getParent() != &BB)
-      continue;
-    ConstantInt* Mask = dyn_cast<ConstantInt>(And->getOperand(1));
-    if (!Mask || !Mask->getUniqueInteger().isPowerOf2())
-      continue;
-    DEBUG(dbgs() << "found and; icmp ?,0; brcc\n"); DEBUG(BB.dump());
-
-    // Push the "and; icmp" for any users that are conditional branches.
-    // Since there can only be one branch use per BB, we don't need to keep
-    // track of which BBs we insert into.
-    for (Use &TheUse : Cmp->uses()) {
-      // Find brcc use.
-      BranchInst *BrccUser = dyn_cast<BranchInst>(TheUse);
-      if (!BrccUser || !BrccUser->isConditional())
-        continue;
-      BasicBlock *UserBB = BrccUser->getParent();
-      if (UserBB == &BB) continue;
-      DEBUG(dbgs() << "found Brcc use\n");
-
-      // Sink the "and; icmp" to use.
-      MadeChange = true;
-      BinaryOperator *NewAnd =
-        BinaryOperator::CreateAnd(And->getOperand(0), And->getOperand(1), "",
-                                  BrccUser);
-      CmpInst *NewCmp =
-        CmpInst::Create(Cmp->getOpcode(), Cmp->getPredicate(), NewAnd, Zero,
-                        "", BrccUser);
-      TheUse = NewCmp;
-      ++NumAndCmpsMoved;
-      DEBUG(BrccUser->getParent()->dump());
-    }
-  }
-  return MadeChange;
-}
-
 /// \brief Scale down both weights to fit into uint32_t.
 static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
   uint64_t NewMax = (NewTrue > NewFalse) ? NewTrue : NewFalse;
diff --git a/lib/CodeGen/CountingFunctionInserter.cpp b/lib/CodeGen/CountingFunctionInserter.cpp
index 1e46a7a99e7e3a76b902bb18901a702e1a75b604..7f7350f5fb5cd97454e05a0cc1beb7827f9e686d 100644
--- a/lib/CodeGen/CountingFunctionInserter.cpp
+++ b/lib/CodeGen/CountingFunctionInserter.cpp
@@ -41,7 +41,7 @@ namespace {
       Type *VoidTy = Type::getVoidTy(F.getContext());
       Constant *CountingFn =
         F.getParent()->getOrInsertFunction(CountingFunctionName,
-                                           VoidTy, nullptr);
+                                           VoidTy);
       CallInst::Create(CountingFn, "", &*F.begin()->getFirstInsertionPt());
       return true;
     }
diff --git a/lib/CodeGen/CriticalAntiDepBreaker.cpp b/lib/CodeGen/CriticalAntiDepBreaker.cpp
index 5d60c3055456d995f62eeb6e45a80e404cbbe55c..e1eeddf0816c164fd70421e08a462c69f848b036 100644
--- a/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -71,8 +71,11 @@ void CriticalAntiDepBreaker::StartBlock(MachineBasicBlock *BB) {
   // callee-saved register that is not saved in the prolog.
   const MachineFrameInfo &MFI = MF.getFrameInfo();
   BitVector Pristine = MFI.getPristineRegs(MF);
-  for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I) {
-    if (!IsReturnBlock && !Pristine.test(*I)) continue;
+  for (const MCPhysReg *I = MF.getRegInfo().getCalleeSavedRegs(); *I;
+       ++I) {
+    unsigned Reg = *I;
+    if (!IsReturnBlock && !(Pristine.test(Reg) || BB->isLiveIn(Reg)))
+      continue;
     for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI) {
       unsigned Reg = *AI;
       Classes[Reg] = reinterpret_cast<TargetRegisterClass *>(-1);
diff --git a/lib/CodeGen/DeadMachineInstructionElim.cpp b/lib/CodeGen/DeadMachineInstructionElim.cpp
index 17c229a216ae1b7e3481dec60ccaf3f5685c5a19..7ac2e5445435dac423097c400cb99993cb7ddd2a 100644
--- a/lib/CodeGen/DeadMachineInstructionElim.cpp
+++ b/lib/CodeGen/DeadMachineInstructionElim.cpp
@@ -110,7 +110,7 @@ bool DeadMachineInstructionElim::runOnMachineFunction(MachineFunction &MF) {
     // Start out assuming that reserved registers are live out of this block.
     LivePhysRegs = MRI->getReservedRegs();
 
-    // Add live-ins from sucessors to LivePhysRegs. Normally, physregs are not
+    // Add live-ins from successors to LivePhysRegs. Normally, physregs are not
     // live across blocks, but some targets (x86) can have flags live out of a
     // block.
     for (MachineBasicBlock::succ_iterator S = MBB.succ_begin(),
diff --git a/lib/CodeGen/DetectDeadLanes.cpp b/lib/CodeGen/DetectDeadLanes.cpp
index a7ba694c144d142f319d73d65d4fc21f271b6645..6f4ea1912cf4e27795405cfc9e10712b055fddd1 100644
--- a/lib/CodeGen/DetectDeadLanes.cpp
+++ b/lib/CodeGen/DetectDeadLanes.cpp
@@ -441,7 +441,7 @@ LaneBitmask DetectDeadLanes::determineInitialUsedLanes(unsigned Reg) {
           const TargetRegisterClass *DstRC = MRI->getRegClass(DefReg);
           CrossCopy = isCrossCopy(*MRI, UseMI, DstRC, MO);
           if (CrossCopy)
-            DEBUG(dbgs() << "Copy accross incompatible classes: " << UseMI);
+            DEBUG(dbgs() << "Copy across incompatible classes: " << UseMI);
         }
 
         if (!CrossCopy)
diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp
index 2da791ca7833c4172a5122bf018c39ebee5797e1..e272d25047e63929d7a91dcc6f35a59c947d19f0 100644
--- a/lib/CodeGen/ExecutionDepsFix.cpp
+++ b/lib/CodeGen/ExecutionDepsFix.cpp
@@ -6,21 +6,9 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-//
-// This file contains the execution dependency fix pass.
-//
-// Some X86 SSE instructions like mov, and, or, xor are available in different
-// variants for different operand types. These variant instructions are
-// equivalent, but on Nehalem and newer cpus there is extra latency
-// transferring data between integer and floating point domains.  ARM cores
-// have similar issues when they are configured with both VFP and NEON
-// pipelines.
-//
-// This pass changes the variant instructions to minimize domain crossings.
-//
-//===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/ExecutionDepsFix.h"
+
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
@@ -35,212 +23,18 @@
 
 using namespace llvm;
 
-#define DEBUG_TYPE "execution-fix"
-
-/// A DomainValue is a bit like LiveIntervals' ValNo, but it also keeps track
-/// of execution domains.
-///
-/// An open DomainValue represents a set of instructions that can still switch
-/// execution domain. Multiple registers may refer to the same open
-/// DomainValue - they will eventually be collapsed to the same execution
-/// domain.
-///
-/// A collapsed DomainValue represents a single register that has been forced
-/// into one of more execution domains. There is a separate collapsed
-/// DomainValue for each register, but it may contain multiple execution
-/// domains. A register value is initially created in a single execution
-/// domain, but if we were forced to pay the penalty of a domain crossing, we
-/// keep track of the fact that the register is now available in multiple
-/// domains.
-namespace {
-struct DomainValue {
-  // Basic reference counting.
-  unsigned Refs;
-
-  // Bitmask of available domains. For an open DomainValue, it is the still
-  // possible domains for collapsing. For a collapsed DomainValue it is the
-  // domains where the register is available for free.
-  unsigned AvailableDomains;
-
-  // Pointer to the next DomainValue in a chain.  When two DomainValues are
-  // merged, Victim.Next is set to point to Victor, so old DomainValue
-  // references can be updated by following the chain.
-  DomainValue *Next;
-
-  // Twiddleable instructions using or defining these registers.
-  SmallVector<MachineInstr*, 8> Instrs;
-
-  // A collapsed DomainValue has no instructions to twiddle - it simply keeps
-  // track of the domains where the registers are already available.
-  bool isCollapsed() const { return Instrs.empty(); }
-
-  // Is domain available?
-  bool hasDomain(unsigned domain) const {
-    assert(domain <
-               static_cast<unsigned>(std::numeric_limits<unsigned>::digits) &&
-           "undefined behavior");
-    return AvailableDomains & (1u << domain);
-  }
-
-  // Mark domain as available.
-  void addDomain(unsigned domain) {
-    AvailableDomains |= 1u << domain;
-  }
-
-  // Restrict to a single domain available.
-  void setSingleDomain(unsigned domain) {
-    AvailableDomains = 1u << domain;
-  }
-
-  // Return bitmask of domains that are available and in mask.
-  unsigned getCommonDomains(unsigned mask) const {
-    return AvailableDomains & mask;
-  }
-
-  // First domain available.
-  unsigned getFirstDomain() const {
-    return countTrailingZeros(AvailableDomains);
-  }
-
-  DomainValue() : Refs(0) { clear(); }
-
-  // Clear this DomainValue and point to next which has all its data.
-  void clear() {
-    AvailableDomains = 0;
-    Next = nullptr;
-    Instrs.clear();
-  }
-};
-}
-
-namespace {
-/// Information about a live register.
-struct LiveReg {
-  /// Value currently in this register, or NULL when no value is being tracked.
-  /// This counts as a DomainValue reference.
-  DomainValue *Value;
-
-  /// Instruction that defined this register, relative to the beginning of the
-  /// current basic block.  When a LiveReg is used to represent a live-out
-  /// register, this value is relative to the end of the basic block, so it
-  /// will be a negative number.
-  int Def;
-};
-} // anonymous namespace
-
-namespace {
-class ExeDepsFix : public MachineFunctionPass {
-  static char ID;
-  SpecificBumpPtrAllocator<DomainValue> Allocator;
-  SmallVector<DomainValue*,16> Avail;
-
-  const TargetRegisterClass *const RC;
-  MachineFunction *MF;
-  const TargetInstrInfo *TII;
-  const TargetRegisterInfo *TRI;
-  RegisterClassInfo RegClassInfo;
-  std::vector<SmallVector<int, 1>> AliasMap;
-  const unsigned NumRegs;
-  LiveReg *LiveRegs;
-  struct MBBInfo {
-    // Keeps clearance and domain information for all registers. Note that this
-    // is different from the usual definition notion of liveness. The CPU
-    // doesn't care whether or not we consider a register killed.
-    LiveReg *OutRegs;
-
-    // Whether we have gotten to this block in primary processing yet.
-    bool PrimaryCompleted;
-
-    // The number of predecessors for which primary processing has completed
-    unsigned IncomingProcessed;
-
-    // The value of `IncomingProcessed` at the start of primary processing
-    unsigned PrimaryIncoming;
-
-    // The number of predecessors for which all processing steps are done.
-    unsigned IncomingCompleted;
-
-    MBBInfo()
-        : OutRegs(nullptr), PrimaryCompleted(false), IncomingProcessed(0),
-          PrimaryIncoming(0), IncomingCompleted(0) {}
-  };
-  typedef DenseMap<MachineBasicBlock *, MBBInfo> MBBInfoMap;
-  MBBInfoMap MBBInfos;
-
-  /// List of undefined register reads in this block in forward order.
-  std::vector<std::pair<MachineInstr*, unsigned> > UndefReads;
-
-  /// Storage for register unit liveness.
-  LivePhysRegs LiveRegSet;
-
-  /// Current instruction number.
-  /// The first instruction in each basic block is 0.
-  int CurInstr;
-public:
-  ExeDepsFix(const TargetRegisterClass *rc)
-    : MachineFunctionPass(ID), RC(rc), NumRegs(RC->getNumRegs()) {}
-
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.setPreservesAll();
-    MachineFunctionPass::getAnalysisUsage(AU);
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
-  MachineFunctionProperties getRequiredProperties() const override {
-    return MachineFunctionProperties().set(
-        MachineFunctionProperties::Property::NoVRegs);
-  }
-
-  StringRef getPassName() const override { return "Execution dependency fix"; }
-
-private:
-  iterator_range<SmallVectorImpl<int>::const_iterator>
-  regIndices(unsigned Reg) const;
-  // DomainValue allocation.
-  DomainValue *alloc(int domain = -1);
-  DomainValue *retain(DomainValue *DV) {
-    if (DV) ++DV->Refs;
-    return DV;
-  }
-  void release(DomainValue*);
-  DomainValue *resolve(DomainValue*&);
-
-  // LiveRegs manipulations.
-  void setLiveReg(int rx, DomainValue *DV);
-  void kill(int rx);
-  void force(int rx, unsigned domain);
-  void collapse(DomainValue *dv, unsigned domain);
-  bool merge(DomainValue *A, DomainValue *B);
-
-  void enterBasicBlock(MachineBasicBlock*);
-  void leaveBasicBlock(MachineBasicBlock*);
-  bool isBlockDone(MachineBasicBlock *);
-  void processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass);
-  void updateSuccessors(MachineBasicBlock *MBB, bool PrimaryPass);
-  bool visitInstr(MachineInstr *);
-  void processDefs(MachineInstr *, bool breakDependency, bool Kill);
-  void visitSoftInstr(MachineInstr*, unsigned mask);
-  void visitHardInstr(MachineInstr*, unsigned domain);
-  void pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
-                                unsigned Pref);
-  bool shouldBreakDependence(MachineInstr*, unsigned OpIdx, unsigned Pref);
-  void processUndefReads(MachineBasicBlock*);
-};
-}
-
-char ExeDepsFix::ID = 0;
+#define DEBUG_TYPE "execution-deps-fix"
 
 /// Translate TRI register number to a list of indices into our smaller tables
 /// of interesting registers.
 iterator_range<SmallVectorImpl<int>::const_iterator>
-ExeDepsFix::regIndices(unsigned Reg) const {
+ExecutionDepsFix::regIndices(unsigned Reg) const {
   assert(Reg < AliasMap.size() && "Invalid register");
   const auto &Entry = AliasMap[Reg];
   return make_range(Entry.begin(), Entry.end());
 }
 
-DomainValue *ExeDepsFix::alloc(int domain) {
+DomainValue *ExecutionDepsFix::alloc(int domain) {
   DomainValue *dv = Avail.empty() ?
                       new(Allocator.Allocate()) DomainValue :
                       Avail.pop_back_val();
@@ -253,7 +47,7 @@ DomainValue *ExeDepsFix::alloc(int domain) {
 
 /// Release a reference to DV.  When the last reference is released,
 /// collapse if needed.
-void ExeDepsFix::release(DomainValue *DV) {
+void ExecutionDepsFix::release(DomainValue *DV) {
   while (DV) {
     assert(DV->Refs && "Bad DomainValue");
     if (--DV->Refs)
@@ -273,7 +67,7 @@ void ExeDepsFix::release(DomainValue *DV) {
 
 /// Follow the chain of dead DomainValues until a live DomainValue is reached.
 /// Update the referenced pointer when necessary.
-DomainValue *ExeDepsFix::resolve(DomainValue *&DVRef) {
+DomainValue *ExecutionDepsFix::resolve(DomainValue *&DVRef) {
   DomainValue *DV = DVRef;
   if (!DV || !DV->Next)
     return DV;
@@ -290,7 +84,7 @@ DomainValue *ExeDepsFix::resolve(DomainValue *&DVRef) {
 }
 
 /// Set LiveRegs[rx] = dv, updating reference counts.
-void ExeDepsFix::setLiveReg(int rx, DomainValue *dv) {
+void ExecutionDepsFix::setLiveReg(int rx, DomainValue *dv) {
   assert(unsigned(rx) < NumRegs && "Invalid index");
   assert(LiveRegs && "Must enter basic block first.");
 
@@ -302,7 +96,7 @@ void ExeDepsFix::setLiveReg(int rx, DomainValue *dv) {
 }
 
 // Kill register rx, recycle or collapse any DomainValue.
-void ExeDepsFix::kill(int rx) {
+void ExecutionDepsFix::kill(int rx) {
   assert(unsigned(rx) < NumRegs && "Invalid index");
   assert(LiveRegs && "Must enter basic block first.");
   if (!LiveRegs[rx].Value)
@@ -313,7 +107,7 @@ void ExeDepsFix::kill(int rx) {
 }
 
 /// Force register rx into domain.
-void ExeDepsFix::force(int rx, unsigned domain) {
+void ExecutionDepsFix::force(int rx, unsigned domain) {
   assert(unsigned(rx) < NumRegs && "Invalid index");
   assert(LiveRegs && "Must enter basic block first.");
   if (DomainValue *dv = LiveRegs[rx].Value) {
@@ -336,7 +130,7 @@ void ExeDepsFix::force(int rx, unsigned domain) {
 
 /// Collapse open DomainValue into given domain. If there are multiple
 /// registers using dv, they each get a unique collapsed DomainValue.
-void ExeDepsFix::collapse(DomainValue *dv, unsigned domain) {
+void ExecutionDepsFix::collapse(DomainValue *dv, unsigned domain) {
   assert(dv->hasDomain(domain) && "Cannot collapse");
 
   // Collapse all the instructions.
@@ -352,7 +146,7 @@ void ExeDepsFix::collapse(DomainValue *dv, unsigned domain) {
 }
 
 /// All instructions and registers in B are moved to A, and B is released.
-bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) {
+bool ExecutionDepsFix::merge(DomainValue *A, DomainValue *B) {
   assert(!A->isCollapsed() && "Cannot merge into collapsed");
   assert(!B->isCollapsed() && "Cannot merge from collapsed");
   if (A == B)
@@ -378,7 +172,7 @@ bool ExeDepsFix::merge(DomainValue *A, DomainValue *B) {
 }
 
 /// Set up LiveRegs by merging predecessor live-out values.
-void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
+void ExecutionDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
   // Reset instruction counter in each basic block.
   CurInstr = 0;
 
@@ -456,7 +250,7 @@ void ExeDepsFix::enterBasicBlock(MachineBasicBlock *MBB) {
              << (!isBlockDone(MBB) ? ": incomplete\n" : ": all preds known\n"));
 }
 
-void ExeDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) {
+void ExecutionDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) {
   assert(LiveRegs && "Must enter basic block first.");
   LiveReg *OldOutRegs = MBBInfos[MBB].OutRegs;
   // Save register clearances at end of MBB - used by enterBasicBlock().
@@ -478,7 +272,7 @@ void ExeDepsFix::leaveBasicBlock(MachineBasicBlock *MBB) {
   LiveRegs = nullptr;
 }
 
-bool ExeDepsFix::visitInstr(MachineInstr *MI) {
+bool ExecutionDepsFix::visitInstr(MachineInstr *MI) {
   // Update instructions with explicit execution domains.
   std::pair<uint16_t, uint16_t> DomP = TII->getExecutionDomain(*MI);
   if (DomP.first) {
@@ -494,8 +288,10 @@ bool ExeDepsFix::visitInstr(MachineInstr *MI) {
 /// \brief Helps avoid false dependencies on undef registers by updating the
 /// machine instructions' undef operand to use a register that the instruction
 /// is truly dependent on, or use a register with clearance higher than Pref.
-void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
-                                          unsigned Pref) {
+/// Returns true if it was able to find a true dependency, thus not requiring
+/// a dependency breaking instruction regardless of clearance.
+bool ExecutionDepsFix::pickBestRegisterForUndef(MachineInstr *MI,
+                                                unsigned OpIdx, unsigned Pref) {
   MachineOperand &MO = MI->getOperand(OpIdx);
   assert(MO.isUndef() && "Expected undef machine operand");
 
@@ -503,7 +299,7 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
 
   // Update only undef operands that are mapped to one register.
   if (AliasMap[OriginalReg].size() != 1)
-    return;
+    return false;
 
   // Get the undef operand's register class
   const TargetRegisterClass *OpRC =
@@ -518,7 +314,7 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
     // We found a true dependency - replace the undef register with the true
     // dependency.
     MO.setReg(CurrMO.getReg());
-    return;
+    return true;
   }
 
   // Go over all registers in the register class and find the register with
@@ -543,12 +339,14 @@ void ExeDepsFix::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
   // Update the operand if we found a register with better clearance.
   if (MaxClearanceReg != OriginalReg)
     MO.setReg(MaxClearanceReg);
+
+  return false;
 }
 
 /// \brief Return true to if it makes sense to break dependence on a partial def
 /// or undef use.
-bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
-                                       unsigned Pref) {
+bool ExecutionDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
+                                             unsigned Pref) {
   unsigned reg = MI->getOperand(OpIdx).getReg();
   for (int rx : regIndices(reg)) {
     unsigned Clearance = CurInstr - LiveRegs[rx].Def;
@@ -568,8 +366,8 @@ bool ExeDepsFix::shouldBreakDependence(MachineInstr *MI, unsigned OpIdx,
 // If Kill is set, also kill off DomainValues clobbered by the defs.
 //
 // Also break dependencies on partial defs and undef uses.
-void ExeDepsFix::processDefs(MachineInstr *MI, bool breakDependency,
-                             bool Kill) {
+void ExecutionDepsFix::processDefs(MachineInstr *MI, bool breakDependency,
+                                   bool Kill) {
   assert(!MI->isDebugValue() && "Won't process debug values");
 
   // Break dependence on undef uses. Do this before updating LiveRegs below.
@@ -577,8 +375,11 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool breakDependency,
   if (breakDependency) {
     unsigned Pref = TII->getUndefRegClearance(*MI, OpNum, TRI);
     if (Pref) {
-      pickBestRegisterForUndef(MI, OpNum, Pref);
-      if (shouldBreakDependence(MI, OpNum, Pref))
+      bool HadTrueDependency = pickBestRegisterForUndef(MI, OpNum, Pref);
+      // We don't need to bother trying to break a dependency if this
+      // instruction has a true dependency on that register through another
+      // operand - we'll have to wait for it to be available regardless.
+      if (!HadTrueDependency && shouldBreakDependence(MI, OpNum, Pref))
         UndefReads.push_back(std::make_pair(MI, OpNum));
     }
   }
@@ -621,7 +422,7 @@ void ExeDepsFix::processDefs(MachineInstr *MI, bool breakDependency,
 /// only do it on demand. Note that the occurrence of undefined register reads
 /// that should be broken is very rare, but when they occur we may have many in
 /// a single block.
-void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) {
+void ExecutionDepsFix::processUndefReads(MachineBasicBlock *MBB) {
   if (UndefReads.empty())
     return;
 
@@ -654,7 +455,7 @@ void ExeDepsFix::processUndefReads(MachineBasicBlock *MBB) {
 
 // A hard instruction only works in one domain. All input registers will be
 // forced into that domain.
-void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) {
+void ExecutionDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) {
   // Collapse all uses.
   for (unsigned i = mi->getDesc().getNumDefs(),
                 e = mi->getDesc().getNumOperands(); i != e; ++i) {
@@ -677,7 +478,7 @@ void ExeDepsFix::visitHardInstr(MachineInstr *mi, unsigned domain) {
 }
 
 // A soft instruction can be changed to work in other domains given by mask.
-void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
+void ExecutionDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
   // Bitmask of available domains for this instruction after taking collapsed
   // operands into account.
   unsigned available = mask;
@@ -721,9 +522,8 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
 
   // Kill off any remaining uses that don't match available, and build a list of
   // incoming DomainValues that we want to merge.
-  SmallVector<LiveReg, 4> Regs;
-  for (SmallVectorImpl<int>::iterator i=used.begin(), e=used.end(); i!=e; ++i) {
-    int rx = *i;
+  SmallVector<const LiveReg *, 4> Regs;
+  for (int rx : used) {
     assert(LiveRegs && "no space allocated for live registers");
     const LiveReg &LR = LiveRegs[rx];
     // This useless DomainValue could have been missed above.
@@ -732,16 +532,11 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
       continue;
     }
     // Sorted insertion.
-    bool Inserted = false;
-    for (SmallVectorImpl<LiveReg>::iterator i = Regs.begin(), e = Regs.end();
-           i != e && !Inserted; ++i) {
-      if (LR.Def < i->Def) {
-        Inserted = true;
-        Regs.insert(i, LR);
-      }
-    }
-    if (!Inserted)
-      Regs.push_back(LR);
+    auto I = std::upper_bound(Regs.begin(), Regs.end(), &LR,
+                              [](const LiveReg *LHS, const LiveReg *RHS) {
+                                return LHS->Def < RHS->Def;
+                              });
+    Regs.insert(I, &LR);
   }
 
   // doms are now sorted in order of appearance. Try to merge them all, giving
@@ -749,14 +544,14 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
   DomainValue *dv = nullptr;
   while (!Regs.empty()) {
     if (!dv) {
-      dv = Regs.pop_back_val().Value;
+      dv = Regs.pop_back_val()->Value;
       // Force the first dv to match the current instruction.
       dv->AvailableDomains = dv->getCommonDomains(available);
       assert(dv->AvailableDomains && "Domain should have been filtered");
       continue;
     }
 
-    DomainValue *Latest = Regs.pop_back_val().Value;
+    DomainValue *Latest = Regs.pop_back_val()->Value;
     // Skip already merged values.
     if (Latest == dv || Latest->Next)
       continue;
@@ -794,7 +589,8 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) {
   }
 }
 
-void ExeDepsFix::processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass) {
+void ExecutionDepsFix::processBasicBlock(MachineBasicBlock *MBB,
+                                         bool PrimaryPass) {
   enterBasicBlock(MBB);
   // If this block is not done, it makes little sense to make any decisions
   // based on clearance information. We need to make a second pass anyway,
@@ -814,33 +610,13 @@ void ExeDepsFix::processBasicBlock(MachineBasicBlock *MBB, bool PrimaryPass) {
   leaveBasicBlock(MBB);
 }
 
-bool ExeDepsFix::isBlockDone(MachineBasicBlock *MBB) {
+bool ExecutionDepsFix::isBlockDone(MachineBasicBlock *MBB) {
   return MBBInfos[MBB].PrimaryCompleted &&
          MBBInfos[MBB].IncomingCompleted == MBBInfos[MBB].PrimaryIncoming &&
          MBBInfos[MBB].IncomingProcessed == MBB->pred_size();
 }
 
-void ExeDepsFix::updateSuccessors(MachineBasicBlock *MBB, bool Primary) {
-  bool Done = isBlockDone(MBB);
-  for (auto *Succ : MBB->successors()) {
-    if (!isBlockDone(Succ)) {
-      if (Primary) {
-        MBBInfos[Succ].IncomingProcessed++;
-      }
-      if (Done) {
-        MBBInfos[Succ].IncomingCompleted++;
-      }
-      if (isBlockDone(Succ)) {
-        // Perform secondary processing for this successor. See the big comment
-        // in runOnMachineFunction, for an explanation of the iteration order.
-        processBasicBlock(Succ, false);
-        updateSuccessors(Succ, false);
-      }
-    }
-  }
-}
-
-bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
+bool ExecutionDepsFix::runOnMachineFunction(MachineFunction &mf) {
   if (skipFunction(*mf.getFunction()))
     return false;
   MF = &mf;
@@ -912,6 +688,7 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
 
   MachineBasicBlock *Entry = &*MF->begin();
   ReversePostOrderTraversal<MachineBasicBlock*> RPOT(Entry);
+  SmallVector<MachineBasicBlock *, 4> Workqueue;
   for (ReversePostOrderTraversal<MachineBasicBlock*>::rpo_iterator
          MBBI = RPOT.begin(), MBBE = RPOT.end(); MBBI != MBBE; ++MBBI) {
     MachineBasicBlock *MBB = *MBBI;
@@ -919,8 +696,28 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
     // processing this block's predecessors.
     MBBInfos[MBB].PrimaryCompleted = true;
     MBBInfos[MBB].PrimaryIncoming = MBBInfos[MBB].IncomingProcessed;
-    processBasicBlock(MBB, true);
-    updateSuccessors(MBB, true);
+    bool Primary = true;
+    Workqueue.push_back(MBB);
+    while (!Workqueue.empty()) {
+      MachineBasicBlock *ActiveMBB = &*Workqueue.back();
+      Workqueue.pop_back();
+      processBasicBlock(ActiveMBB, Primary);
+      bool Done = isBlockDone(ActiveMBB);
+      for (auto *Succ : ActiveMBB->successors()) {
+        if (!isBlockDone(Succ)) {
+          if (Primary) {
+            MBBInfos[Succ].IncomingProcessed++;
+          }
+          if (Done) {
+            MBBInfos[Succ].IncomingCompleted++;
+          }
+          if (isBlockDone(Succ)) {
+            Workqueue.push_back(Succ);
+          }
+        }
+      }
+      Primary = false;
+    }
   }
 
   // We need to go through again and finalize any blocks that are not done yet.
@@ -956,8 +753,3 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) {
 
   return false;
 }
-
-FunctionPass *
-llvm::createExecutionDependencyFixPass(const TargetRegisterClass *RC) {
-  return new ExeDepsFix(RC);
-}
diff --git a/lib/CodeGen/FEntryInserter.cpp b/lib/CodeGen/FEntryInserter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0759bf6713e079387f585f1af129ec147ec46cec
--- /dev/null
+++ b/lib/CodeGen/FEntryInserter.cpp
@@ -0,0 +1,55 @@
+//===-- FEntryInserter.cpp - Insert fentry calls --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file edits function bodies to insert fentry calls.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+namespace {
+struct FEntryInserter : public MachineFunctionPass {
+  static char ID; // Pass identification, replacement for typeid
+  FEntryInserter() : MachineFunctionPass(ID) {
+    initializeFEntryInserterPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+};
+}
+
+// Insert FENTRY_CALL at the start of functions with "fentry-call"="true".
+bool FEntryInserter::runOnMachineFunction(MachineFunction &MF) {
+  const std::string FEntryName =
+      MF.getFunction()->getFnAttribute("fentry-call").getValueAsString();
+  if (FEntryName != "true")
+    return false;
+  // Do not dereference begin(): the entry block may have no instructions.
+  auto &FirstMBB = *MF.begin();
+  auto InsertPt = FirstMBB.begin();
+  DebugLoc DL = FirstMBB.empty() ? DebugLoc() : InsertPt->getDebugLoc();
+  BuildMI(FirstMBB, InsertPt, DL,
+          MF.getSubtarget().getInstrInfo()->get(TargetOpcode::FENTRY_CALL));
+  return true;
+}
+
+char FEntryInserter::ID = 0;
+char &llvm::FEntryInserterID = FEntryInserter::ID;
+INITIALIZE_PASS(FEntryInserter, "fentry-insert", "Insert fentry calls", false,
+                false)
diff --git a/lib/CodeGen/FaultMaps.cpp b/lib/CodeGen/FaultMaps.cpp
index 2acafafdb9fcc141a36ff408cf048259dd7c9cf3..43f3641289787531d1d4578ec135f50496883f6e 100644
--- a/lib/CodeGen/FaultMaps.cpp
+++ b/lib/CodeGen/FaultMaps.cpp
@@ -1,4 +1,4 @@
-//===---------------------------- FaultMaps.cpp ---------------------------===//
+//===- FaultMaps.cpp ------------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,14 +7,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/FaultMaps.h"
-
+#include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/CodeGen/FaultMaps.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
 
@@ -102,14 +105,16 @@ void FaultMaps::emitFunctionInfo(const MCSymbol *FnLabel,
   }
 }
 
-
 const char *FaultMaps::faultTypeToString(FaultMaps::FaultKind FT) {
   switch (FT) {
   default:
     llvm_unreachable("unhandled fault type!");
-
   case FaultMaps::FaultingLoad:
     return "FaultingLoad";
+  case FaultMaps::FaultingLoadStore:
+    return "FaultingLoadStore";
+  case FaultMaps::FaultingStore:
+    return "FaultingStore";
   }
 }
 
diff --git a/lib/CodeGen/GCStrategy.cpp b/lib/CodeGen/GCStrategy.cpp
index 31ab86fdf276120203e35ec1dd10330809ff1a23..6be4c16c6301e750f957e02b4fd1cc31138aa9a4 100644
--- a/lib/CodeGen/GCStrategy.cpp
+++ b/lib/CodeGen/GCStrategy.cpp
@@ -1,4 +1,4 @@
-//===-- GCStrategy.cpp - Garbage Collector Description --------------------===//
+//===- GCStrategy.cpp - Garbage Collector Description ---------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -18,7 +18,4 @@ using namespace llvm;
 
 LLVM_INSTANTIATE_REGISTRY(GCRegistry)
 
-GCStrategy::GCStrategy()
-    : UseStatepoints(false), NeededSafePoints(0), CustomReadBarriers(false),
-      CustomWriteBarriers(false), CustomRoots(false), InitRoots(true),
-      UsesMetadata(false) {}
+GCStrategy::GCStrategy() = default;
diff --git a/lib/CodeGen/GlobalISel/CallLowering.cpp b/lib/CodeGen/GlobalISel/CallLowering.cpp
index 33e70856c95792dca5becab4c1b8a7de68c9fd6e..035a2ac78ed997ff6faf482fd94719088f1658e9 100644
--- a/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -24,42 +24,42 @@
 using namespace llvm;
 
 bool CallLowering::lowerCall(
-    MachineIRBuilder &MIRBuilder, const CallInst &CI, unsigned ResReg,
+    MachineIRBuilder &MIRBuilder, ImmutableCallSite CS, unsigned ResReg,
     ArrayRef<unsigned> ArgRegs, std::function<unsigned()> GetCalleeReg) const {
-  auto &DL = CI.getParent()->getParent()->getParent()->getDataLayout();
+  auto &DL = CS.getParent()->getParent()->getParent()->getDataLayout();
 
   // First step is to marshall all the function's parameters into the correct
   // physregs and memory locations. Gather the sequence of argument types that
   // we'll pass to the assigner function.
   SmallVector<ArgInfo, 8> OrigArgs;
   unsigned i = 0;
-  unsigned NumFixedArgs = CI.getFunctionType()->getNumParams();
-  for (auto &Arg : CI.arg_operands()) {
+  unsigned NumFixedArgs = CS.getFunctionType()->getNumParams();
+  for (auto &Arg : CS.args()) {
     ArgInfo OrigArg{ArgRegs[i], Arg->getType(), ISD::ArgFlagsTy{},
                     i < NumFixedArgs};
-    setArgFlags(OrigArg, i + 1, DL, CI);
+    setArgFlags(OrigArg, i + 1, DL, CS);
     OrigArgs.push_back(OrigArg);
     ++i;
   }
 
   MachineOperand Callee = MachineOperand::CreateImm(0);
-  if (Function *F = CI.getCalledFunction())
+  if (const Function *F = CS.getCalledFunction())
     Callee = MachineOperand::CreateGA(F, 0);
   else
     Callee = MachineOperand::CreateReg(GetCalleeReg(), false);
 
-  ArgInfo OrigRet{ResReg, CI.getType(), ISD::ArgFlagsTy{}};
+  ArgInfo OrigRet{ResReg, CS.getType(), ISD::ArgFlagsTy{}};
   if (!OrigRet.Ty->isVoidTy())
-    setArgFlags(OrigRet, AttributeSet::ReturnIndex, DL, CI);
+    setArgFlags(OrigRet, AttributeList::ReturnIndex, DL, CS);
 
-  return lowerCall(MIRBuilder, Callee, OrigRet, OrigArgs);
+  return lowerCall(MIRBuilder, CS.getCallingConv(), Callee, OrigRet, OrigArgs);
 }
 
 template <typename FuncInfoTy>
 void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx,
                                const DataLayout &DL,
                                const FuncInfoTy &FuncInfo) const {
-  const AttributeSet &Attrs = FuncInfo.getAttributes();
+  const AttributeList &Attrs = FuncInfo.getAttributes();
   if (Attrs.hasAttribute(OpIdx, Attribute::ZExt))
     Arg.Flags.setZExt();
   if (Attrs.hasAttribute(OpIdx, Attribute::SExt))
@@ -121,8 +121,16 @@ bool CallLowering::handleAssignments(MachineIRBuilder &MIRBuilder,
       return false;
   }
 
-  for (unsigned i = 0, e = Args.size(); i != e; ++i) {
-    CCValAssign &VA = ArgLocs[i];
+  for (unsigned i = 0, e = Args.size(), j = 0; i != e; ++i, ++j) {
+    assert(j < ArgLocs.size() && "Skipped too many arg locs");
+
+    CCValAssign &VA = ArgLocs[j];
+    assert(VA.getValNo() == i && "Location doesn't correspond to current arg");
+
+    if (VA.needsCustom()) {
+      j += Handler.assignCustomValue(Args[i], makeArrayRef(ArgLocs).slice(j));
+      continue;
+    }
 
     if (VA.isRegLoc())
       Handler.assignValueToReg(Args[i].Reg, VA.getLocReg(), VA);
diff --git a/lib/CodeGen/GlobalISel/IRTranslator.cpp b/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 0f66bd9d1716cebcbc32940a718139fe0892df39..766187378446903fa26e2c3d58885b51d2fa4003 100644
--- a/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -12,8 +12,10 @@
 
 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
 
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/MachineFunction.h"
@@ -28,6 +30,7 @@
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
+#include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetIntrinsicInfo.h"
 #include "llvm/Target/TargetLowering.h"
 
@@ -42,11 +45,21 @@ INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
 INITIALIZE_PASS_END(IRTranslator, DEBUG_TYPE, "IRTranslator LLVM IR -> MI",
                 false, false)
 
-static void reportTranslationError(const Value &V, const Twine &Message) {
-  std::string ErrStorage;
-  raw_string_ostream Err(ErrStorage);
-  Err << Message << ": " << V << '\n';
-  report_fatal_error(Err.str());
+static void reportTranslationError(MachineFunction &MF,
+                                   const TargetPassConfig &TPC,
+                                   OptimizationRemarkEmitter &ORE,
+                                   OptimizationRemarkMissed &R) {
+  MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
+
+  // Print the function name explicitly if we don't have a debug location (which
+  // makes the diagnostic less useful) or if we're going to emit a raw error.
+  if (!R.getLocation().isValid() || TPC.isGlobalISelAbortEnabled())
+    R << (" (in function: " + MF.getName() + ")").str();
+
+  if (TPC.isGlobalISelAbortEnabled())
+    report_fatal_error(R.getMsg()); // Abort mode: fail hard with the text.
+  else
+    ORE.emit(R); // Otherwise surface it as a missed-optimization remark.
 }
 
 IRTranslator::IRTranslator() : MachineFunctionPass(ID), MRI(nullptr) {
@@ -69,18 +82,19 @@ unsigned IRTranslator::getOrCreateVReg(const Value &Val) {
   // we need to concat together to produce the value.
   assert(Val.getType()->isSized() &&
          "Don't know how to create an empty vreg");
-  unsigned VReg = MRI->createGenericVirtualRegister(LLT{*Val.getType(), *DL});
+  unsigned VReg =
+      MRI->createGenericVirtualRegister(getLLTForType(*Val.getType(), *DL));
   ValReg = VReg;
 
   if (auto CV = dyn_cast<Constant>(&Val)) {
     bool Success = translate(*CV, VReg);
     if (!Success) {
-      if (!TPC->isGlobalISelAbortEnabled()) {
-        MF->getProperties().set(
-            MachineFunctionProperties::Property::FailedISel);
-        return VReg;
-      }
-      reportTranslationError(Val, "unable to translate constant");
+      OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+                                 MF->getFunction()->getSubprogram(),
+                                 &MF->getFunction()->getEntryBlock());
+      R << "unable to translate constant: " << ore::NV("Type", Val.getType());
+      reportTranslationError(*MF, *TPC, *ORE, R);
+      return VReg;
     }
   }
 
@@ -116,25 +130,19 @@ unsigned IRTranslator::getMemOpAlignment(const Instruction &I) {
   } else if (const LoadInst *LI = dyn_cast<LoadInst>(&I)) {
     Alignment = LI->getAlignment();
     ValTy = LI->getType();
-  } else if (!TPC->isGlobalISelAbortEnabled()) {
-    MF->getProperties().set(
-        MachineFunctionProperties::Property::FailedISel);
+  } else {
+    OptimizationRemarkMissed R("gisel-irtranslator", "", &I);
+    R << "unable to translate memop: " << ore::NV("Opcode", &I);
+    reportTranslationError(*MF, *TPC, *ORE, R);
     return 1;
-  } else
-    llvm_unreachable("unhandled memory instruction");
+  }
 
   return Alignment ? Alignment : DL->getABITypeAlignment(ValTy);
 }
 
-MachineBasicBlock &IRTranslator::getOrCreateBB(const BasicBlock &BB) {
+MachineBasicBlock &IRTranslator::getMBB(const BasicBlock &BB) {
   MachineBasicBlock *&MBB = BBToMBB[&BB];
-  if (!MBB) {
-    MBB = MF->CreateMachineBasicBlock(&BB);
-    MF->push_back(MBB);
-
-    if (BB.hasAddressTaken())
-      MBB->setHasAddressTaken();
-  }
+  assert(MBB && "BasicBlock was not encountered before");
   return *MBB;
 }
 
@@ -158,6 +166,18 @@ bool IRTranslator::translateBinaryOp(unsigned Opcode, const User &U,
   return true;
 }
 
+bool IRTranslator::translateFSub(const User &U, MachineIRBuilder &MIRBuilder) {
+  // Negation idiom: (-0.0 - X) is emitted as a single G_FNEG of X.
+  if (isa<Constant>(U.getOperand(0)) &&
+      U.getOperand(0) == ConstantFP::getZeroValueForNegation(U.getType())) {
+    MIRBuilder.buildInstr(TargetOpcode::G_FNEG)
+        .addDef(getOrCreateVReg(U))
+        .addUse(getOrCreateVReg(*U.getOperand(1)));
+    return true;
+  }
+  return translateBinaryOp(TargetOpcode::G_FSUB, U, MIRBuilder); // General.
+}
+
 bool IRTranslator::translateCompare(const User &U,
                                     MachineIRBuilder &MIRBuilder) {
   const CmpInst *CI = dyn_cast<CmpInst>(&U);
@@ -167,9 +187,14 @@ bool IRTranslator::translateCompare(const User &U,
   CmpInst::Predicate Pred =
       CI ? CI->getPredicate() : static_cast<CmpInst::Predicate>(
                                     cast<ConstantExpr>(U).getPredicate());
-
   if (CmpInst::isIntPredicate(Pred))
     MIRBuilder.buildICmp(Pred, Res, Op0, Op1);
+  else if (Pred == CmpInst::FCMP_FALSE) // Use U's type: CI may be null here.
+    MIRBuilder.buildCopy(
+        Res, getOrCreateVReg(*Constant::getNullValue(U.getType())));
+  else if (Pred == CmpInst::FCMP_TRUE) // Use U's type: CI may be null here.
+    MIRBuilder.buildCopy(
+        Res, getOrCreateVReg(*Constant::getAllOnesValue(U.getType())));
   else
     MIRBuilder.buildFCmp(Pred, Res, Op0, Op1);
 
@@ -192,18 +217,21 @@ bool IRTranslator::translateBr(const User &U, MachineIRBuilder &MIRBuilder) {
     // We want a G_BRCOND to the true BB followed by an unconditional branch.
     unsigned Tst = getOrCreateVReg(*BrInst.getCondition());
     const BasicBlock &TrueTgt = *cast<BasicBlock>(BrInst.getSuccessor(Succ++));
-    MachineBasicBlock &TrueBB = getOrCreateBB(TrueTgt);
+    MachineBasicBlock &TrueBB = getMBB(TrueTgt);
     MIRBuilder.buildBrCond(Tst, TrueBB);
   }
 
   const BasicBlock &BrTgt = *cast<BasicBlock>(BrInst.getSuccessor(Succ));
-  MachineBasicBlock &TgtBB = getOrCreateBB(BrTgt);
-  MIRBuilder.buildBr(TgtBB);
+  MachineBasicBlock &TgtBB = getMBB(BrTgt);
+  MachineBasicBlock &CurBB = MIRBuilder.getMBB();
+
+  // If the unconditional target is the layout successor, fallthrough.
+  if (!CurBB.isLayoutSuccessor(&TgtBB))
+    MIRBuilder.buildBr(TgtBB);
 
   // Link successors.
-  MachineBasicBlock &CurBB = MIRBuilder.getMBB();
   for (const BasicBlock *Succ : BrInst.successors())
-    CurBB.addSuccessor(&getOrCreateBB(*Succ));
+    CurBB.addSuccessor(&getMBB(*Succ));
   return true;
 }
 
@@ -220,14 +248,14 @@ bool IRTranslator::translateSwitch(const User &U,
   const unsigned SwCondValue = getOrCreateVReg(*SwInst.getCondition());
   const BasicBlock *OrigBB = SwInst.getParent();
 
-  LLT LLTi1 = LLT(*Type::getInt1Ty(U.getContext()), *DL);
+  LLT LLTi1 = getLLTForType(*Type::getInt1Ty(U.getContext()), *DL);
   for (auto &CaseIt : SwInst.cases()) {
     const unsigned CaseValueReg = getOrCreateVReg(*CaseIt.getCaseValue());
     const unsigned Tst = MRI->createGenericVirtualRegister(LLTi1);
     MIRBuilder.buildICmp(CmpInst::ICMP_EQ, Tst, CaseValueReg, SwCondValue);
     MachineBasicBlock &CurMBB = MIRBuilder.getMBB();
     const BasicBlock *TrueBB = CaseIt.getCaseSuccessor();
-    MachineBasicBlock &TrueMBB = getOrCreateBB(*TrueBB);
+    MachineBasicBlock &TrueMBB = getMBB(*TrueBB);
 
     MIRBuilder.buildBrCond(Tst, TrueMBB);
     CurMBB.addSuccessor(&TrueMBB);
@@ -235,7 +263,8 @@ bool IRTranslator::translateSwitch(const User &U,
 
     MachineBasicBlock *FalseMBB =
         MF->CreateMachineBasicBlock(SwInst.getParent());
-    MF->push_back(FalseMBB);
+    // Insert the comparison blocks one after the other.
+    MF->insert(std::next(CurMBB.getIterator()), FalseMBB);
     MIRBuilder.buildBr(*FalseMBB);
     CurMBB.addSuccessor(FalseMBB);
 
@@ -243,7 +272,7 @@ bool IRTranslator::translateSwitch(const User &U,
   }
   // handle default case
   const BasicBlock *DefaultBB = SwInst.getDefaultDest();
-  MachineBasicBlock &DefaultMBB = getOrCreateBB(*DefaultBB);
+  MachineBasicBlock &DefaultMBB = getMBB(*DefaultBB);
   MIRBuilder.buildBr(DefaultMBB);
   MachineBasicBlock &CurMBB = MIRBuilder.getMBB();
   CurMBB.addSuccessor(&DefaultMBB);
@@ -262,7 +291,7 @@ bool IRTranslator::translateIndirectBr(const User &U,
   // Link successors.
   MachineBasicBlock &CurBB = MIRBuilder.getMBB();
   for (const BasicBlock *Succ : BrInst.successors())
-    CurBB.addSuccessor(&getOrCreateBB(*Succ));
+    CurBB.addSuccessor(&getMBB(*Succ));
 
   return true;
 }
@@ -270,47 +299,38 @@ bool IRTranslator::translateIndirectBr(const User &U,
 bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
   const LoadInst &LI = cast<LoadInst>(U);
 
-  if (!TPC->isGlobalISelAbortEnabled() && LI.isAtomic())
-    return false;
-
-  assert(!LI.isAtomic() && "only non-atomic loads are supported at the moment");
   auto Flags = LI.isVolatile() ? MachineMemOperand::MOVolatile
                                : MachineMemOperand::MONone;
   Flags |= MachineMemOperand::MOLoad;
 
   unsigned Res = getOrCreateVReg(LI);
   unsigned Addr = getOrCreateVReg(*LI.getPointerOperand());
-  LLT VTy{*LI.getType(), *DL}, PTy{*LI.getPointerOperand()->getType(), *DL};
+
   MIRBuilder.buildLoad(
       Res, Addr,
       *MF->getMachineMemOperand(MachinePointerInfo(LI.getPointerOperand()),
                                 Flags, DL->getTypeStoreSize(LI.getType()),
-                                getMemOpAlignment(LI)));
+                                getMemOpAlignment(LI), AAMDNodes(), nullptr,
+                                LI.getSynchScope(), LI.getOrdering()));
   return true;
 }
 
 bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) {
   const StoreInst &SI = cast<StoreInst>(U);
-
-  if (!TPC->isGlobalISelAbortEnabled() && SI.isAtomic())
-    return false;
-
-  assert(!SI.isAtomic() && "only non-atomic stores supported at the moment");
   auto Flags = SI.isVolatile() ? MachineMemOperand::MOVolatile
                                : MachineMemOperand::MONone;
   Flags |= MachineMemOperand::MOStore;
 
   unsigned Val = getOrCreateVReg(*SI.getValueOperand());
   unsigned Addr = getOrCreateVReg(*SI.getPointerOperand());
-  LLT VTy{*SI.getValueOperand()->getType(), *DL},
-      PTy{*SI.getPointerOperand()->getType(), *DL};
 
   MIRBuilder.buildStore(
       Val, Addr,
       *MF->getMachineMemOperand(
           MachinePointerInfo(SI.getPointerOperand()), Flags,
           DL->getTypeStoreSize(SI.getValueOperand()->getType()),
-          getMemOpAlignment(SI)));
+          getMemOpAlignment(SI), AAMDNodes(), nullptr, SI.getSynchScope(),
+          SI.getOrdering()));
   return true;
 }
 
@@ -335,7 +355,7 @@ bool IRTranslator::translateExtractValue(const User &U,
   uint64_t Offset = 8 * DL->getIndexedOffsetInType(Src->getType(), Indices);
 
   unsigned Res = getOrCreateVReg(U);
-  MIRBuilder.buildExtract(Res, Offset, getOrCreateVReg(*Src));
+  MIRBuilder.buildExtract(Res, getOrCreateVReg(*Src), Offset);
 
   return true;
 }
@@ -378,12 +398,18 @@ bool IRTranslator::translateSelect(const User &U,
 
 bool IRTranslator::translateBitCast(const User &U,
                                     MachineIRBuilder &MIRBuilder) {
-  if (LLT{*U.getOperand(0)->getType(), *DL} == LLT{*U.getType(), *DL}) {
+  // If we're bitcasting to the source type, we can reuse the source vreg.
+  if (getLLTForType(*U.getOperand(0)->getType(), *DL) ==
+      getLLTForType(*U.getType(), *DL)) {
+    // Get the source vreg now, to avoid invalidating ValToVReg.
+    unsigned SrcReg = getOrCreateVReg(*U.getOperand(0));
     unsigned &Reg = ValToVReg[&U];
+    // If we already assigned a vreg for this bitcast, we can't change that.
+    // Emit a copy to satisfy the users we already emitted.
     if (Reg)
-      MIRBuilder.buildCopy(Reg, getOrCreateVReg(*U.getOperand(0)));
+      MIRBuilder.buildCopy(Reg, SrcReg);
     else
-      Reg = getOrCreateVReg(*U.getOperand(0));
+      Reg = SrcReg;
     return true;
   }
   return translateCast(TargetOpcode::G_BITCAST, U, MIRBuilder);
@@ -405,9 +431,10 @@ bool IRTranslator::translateGetElementPtr(const User &U,
 
   Value &Op0 = *U.getOperand(0);
   unsigned BaseReg = getOrCreateVReg(Op0);
-  LLT PtrTy{*Op0.getType(), *DL};
-  unsigned PtrSize = DL->getPointerSizeInBits(PtrTy.getAddressSpace());
-  LLT OffsetTy = LLT::scalar(PtrSize);
+  Type *PtrIRTy = Op0.getType();
+  LLT PtrTy = getLLTForType(*PtrIRTy, *DL);
+  Type *OffsetIRTy = DL->getIntPtrType(PtrIRTy);
+  LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL);
 
   int64_t Offset = 0;
   for (gep_type_iterator GTI = gep_type_begin(&U), E = gep_type_end(&U);
@@ -429,8 +456,8 @@ bool IRTranslator::translateGetElementPtr(const User &U,
 
       if (Offset != 0) {
         unsigned NewBaseReg = MRI->createGenericVirtualRegister(PtrTy);
-        unsigned OffsetReg = MRI->createGenericVirtualRegister(OffsetTy);
-        MIRBuilder.buildConstant(OffsetReg, Offset);
+        unsigned OffsetReg =
+            getOrCreateVReg(*ConstantInt::get(OffsetIRTy, Offset));
         MIRBuilder.buildGEP(NewBaseReg, BaseReg, OffsetReg);
 
         BaseReg = NewBaseReg;
@@ -438,8 +465,8 @@ bool IRTranslator::translateGetElementPtr(const User &U,
       }
 
       // N = N + Idx * ElementSize;
-      unsigned ElementSizeReg = MRI->createGenericVirtualRegister(OffsetTy);
-      MIRBuilder.buildConstant(ElementSizeReg, ElementSize);
+      unsigned ElementSizeReg =
+          getOrCreateVReg(*ConstantInt::get(OffsetIRTy, ElementSize));
 
       unsigned IdxReg = getOrCreateVReg(*Idx);
       if (MRI->getType(IdxReg) != OffsetTy) {
@@ -458,8 +485,7 @@ bool IRTranslator::translateGetElementPtr(const User &U,
   }
 
   if (Offset != 0) {
-    unsigned OffsetReg = MRI->createGenericVirtualRegister(OffsetTy);
-    MIRBuilder.buildConstant(OffsetReg, Offset);
+    unsigned OffsetReg = getOrCreateVReg(*ConstantInt::get(OffsetIRTy, Offset));
     MIRBuilder.buildGEP(getOrCreateVReg(U), BaseReg, OffsetReg);
     return true;
   }
@@ -471,7 +497,7 @@ bool IRTranslator::translateGetElementPtr(const User &U,
 bool IRTranslator::translateMemfunc(const CallInst &CI,
                                     MachineIRBuilder &MIRBuilder,
                                     unsigned ID) {
-  LLT SizeTy{*CI.getArgOperand(2)->getType(), *DL};
+  LLT SizeTy = getLLTForType(*CI.getArgOperand(2)->getType(), *DL);
   Type *DstTy = CI.getArgOperand(0)->getType();
   if (cast<PointerType>(DstTy)->getAddressSpace() != 0 ||
       SizeTy.getSizeInBits() != DL->getPointerSizeInBits(0))
@@ -500,7 +526,8 @@ bool IRTranslator::translateMemfunc(const CallInst &CI,
     return false;
   }
 
-  return CLI->lowerCall(MIRBuilder, MachineOperand::CreateES(Callee),
+  return CLI->lowerCall(MIRBuilder, CI.getCallingConv(),
+                        MachineOperand::CreateES(Callee),
                         CallLowering::ArgInfo(0, CI.getType()), Args);
 }
 
@@ -528,7 +555,7 @@ void IRTranslator::getStackGuard(unsigned DstReg,
 
 bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op,
                                               MachineIRBuilder &MIRBuilder) {
-  LLT Ty{*CI.getOperand(0)->getType(), *DL};
+  LLT Ty = getLLTForType(*CI.getOperand(0)->getType(), *DL);
   LLT s1 = LLT::scalar(1);
   unsigned Width = Ty.getSizeInBits();
   unsigned Res = MRI->createGenericVirtualRegister(Ty);
@@ -540,8 +567,8 @@ bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op,
                  .addUse(getOrCreateVReg(*CI.getOperand(1)));
 
   if (Op == TargetOpcode::G_UADDE || Op == TargetOpcode::G_USUBE) {
-    unsigned Zero = MRI->createGenericVirtualRegister(s1);
-    EntryBuilder.buildConstant(Zero, 0);
+    unsigned Zero = getOrCreateVReg(
+        *Constant::getNullValue(Type::getInt1Ty(CI.getContext())));
     MIB.addUse(Zero);
   }
 
@@ -554,6 +581,14 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
   switch (ID) {
   default:
     break;
+  case Intrinsic::lifetime_start:
+  case Intrinsic::lifetime_end:
+    // Stack coloring is not enabled in O0 (which we care about now) so we can
+    // drop these. Make sure someone notices when we start compiling at higher
+    // opts though.
+    if (MF->getTarget().getOptLevel() != CodeGenOpt::None)
+      return false;
+    return true;
   case Intrinsic::dbg_declare: {
     const DbgDeclareInst &DI = cast<DbgDeclareInst>(CI);
     assert(DI.getVariable() && "Missing variable");
@@ -564,18 +599,33 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
       return true;
     }
 
-    unsigned Reg = getOrCreateVReg(*Address);
-    auto RegDef = MRI->def_instr_begin(Reg);
     assert(DI.getVariable()->isValidLocationForIntrinsic(
                MIRBuilder.getDebugLoc()) &&
            "Expected inlined-at fields to agree");
-
-    if (RegDef != MRI->def_instr_end() &&
-        RegDef->getOpcode() == TargetOpcode::G_FRAME_INDEX) {
-      MIRBuilder.buildFIDbgValue(RegDef->getOperand(1).getIndex(),
-                                 DI.getVariable(), DI.getExpression());
+    auto AI = dyn_cast<AllocaInst>(Address);
+    if (AI && AI->isStaticAlloca()) {
+      // Static allocas are tracked at the MF level, no need for DBG_VALUE
+      // instructions (in fact, they get ignored if they *do* exist).
+      MF->setVariableDbgInfo(DI.getVariable(), DI.getExpression(),
+                             getOrCreateFrameIndex(*AI), DI.getDebugLoc());
     } else
-      MIRBuilder.buildDirectDbgValue(Reg, DI.getVariable(), DI.getExpression());
+      MIRBuilder.buildDirectDbgValue(getOrCreateVReg(*Address),
+                                     DI.getVariable(), DI.getExpression());
+    return true;
+  }
+  case Intrinsic::vaend:
+    // No target I know of cares about va_end. Certainly no in-tree target
+    // does. Simplest intrinsic ever!
+    return true;
+  case Intrinsic::vastart: {
+    auto &TLI = *MF->getSubtarget().getTargetLowering();
+    Value *Ptr = CI.getArgOperand(0);
+    unsigned ListSize = TLI.getVaListSizeInBits(*DL) / 8;
+
+    MIRBuilder.buildInstr(TargetOpcode::G_VASTART)
+        .addUse(getOrCreateVReg(*Ptr))
+        .addMemOperand(MF->getMachineMemOperand(
+            MachinePointerInfo(Ptr), MachineMemOperand::MOStore, ListSize, 0));
     return true;
   }
   case Intrinsic::dbg_value: {
@@ -620,6 +670,12 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
     return translateOverflowIntrinsic(CI, TargetOpcode::G_UMULO, MIRBuilder);
   case Intrinsic::smul_with_overflow:
     return translateOverflowIntrinsic(CI, TargetOpcode::G_SMULO, MIRBuilder);
+  case Intrinsic::pow:
+    MIRBuilder.buildInstr(TargetOpcode::G_FPOW)
+        .addDef(getOrCreateVReg(CI))
+        .addUse(getOrCreateVReg(*CI.getArgOperand(0)))
+        .addUse(getOrCreateVReg(*CI.getArgOperand(1)));
+    return true;
   case Intrinsic::memcpy:
   case Intrinsic::memmove:
   case Intrinsic::memset:
@@ -642,7 +698,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
     getStackGuard(getOrCreateVReg(CI), MIRBuilder);
     return true;
   case Intrinsic::stackprotector: {
-    LLT PtrTy{*CI.getArgOperand(0)->getType(), *DL};
+    LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL);
     unsigned GuardVal = MRI->createGenericVirtualRegister(PtrTy);
     getStackGuard(GuardVal, MIRBuilder);
 
@@ -660,13 +716,32 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
   return false;
 }
 
+bool IRTranslator::translateInlineAsm(const CallInst &CI,
+                                      MachineIRBuilder &MIRBuilder) {
+  const InlineAsm &IA = cast<InlineAsm>(*CI.getCalledValue());
+  if (!IA.getConstraintString().empty())
+    return false; // Only constraint-free inline asm is translated here.
+
+  unsigned ExtraInfo = 0; // Flag bits passed as the INLINEASM immediate.
+  if (IA.hasSideEffects())
+    ExtraInfo |= InlineAsm::Extra_HasSideEffects;
+  if (IA.getDialect() == InlineAsm::AD_Intel)
+    ExtraInfo |= InlineAsm::Extra_AsmDialect;
+
+  MIRBuilder.buildInstr(TargetOpcode::INLINEASM)
+    .addExternalSymbol(IA.getAsmString().c_str())
+    .addImm(ExtraInfo);
+
+  return true;
+}
+
 bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
   const CallInst &CI = cast<CallInst>(U);
   auto TII = MF->getTarget().getIntrinsicInfo();
   const Function *F = CI.getCalledFunction();
 
   if (CI.isInlineAsm())
-    return false;
+    return translateInlineAsm(CI, MIRBuilder);
 
   if (!F || !F->isIntrinsic()) {
     unsigned Res = CI.getType()->isVoidTy() ? 0 : getOrCreateVReg(CI);
@@ -674,7 +749,8 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
     for (auto &Arg: CI.arg_operands())
       Args.push_back(getOrCreateVReg(*Arg));
 
-    return CLI->lowerCall(MIRBuilder, CI, Res, Args, [&]() {
+    MF->getFrameInfo().setHasCalls(true);
+    return CLI->lowerCall(MIRBuilder, &CI, Res, Args, [&]() {
       return getOrCreateVReg(*CI.getCalledValue());
     });
   }
@@ -693,10 +769,10 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
       MIRBuilder.buildIntrinsic(ID, Res, !CI.doesNotAccessMemory());
 
   for (auto &Arg : CI.arg_operands()) {
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(Arg))
-      MIB.addImm(CI->getSExtValue());
-    else
-      MIB.addUse(getOrCreateVReg(*Arg));
+    // Some intrinsics take metadata parameters. Reject them.
+    if (isa<MetadataAsValue>(Arg))
+      return false;
+    MIB.addUse(getOrCreateVReg(*Arg));
   }
   return true;
 }
@@ -709,7 +785,7 @@ bool IRTranslator::translateInvoke(const User &U,
   const BasicBlock *ReturnBB = I.getSuccessor(0);
   const BasicBlock *EHPadBB = I.getSuccessor(1);
 
-  const Value *Callee(I.getCalledValue());
+  const Value *Callee = I.getCalledValue();
   const Function *Fn = dyn_cast<Function>(Callee);
   if (isa<InlineAsm>(Callee))
     return false;
@@ -733,27 +809,24 @@ bool IRTranslator::translateInvoke(const User &U,
   MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(BeginSymbol);
 
   unsigned Res = I.getType()->isVoidTy() ? 0 : getOrCreateVReg(I);
-  SmallVector<CallLowering::ArgInfo, 8> Args;
+  SmallVector<unsigned, 8> Args;
   for (auto &Arg: I.arg_operands())
-    Args.emplace_back(getOrCreateVReg(*Arg), Arg->getType());
+    Args.push_back(getOrCreateVReg(*Arg));
 
-  auto CalleeMO =
-      Fn ? MachineOperand::CreateGA(Fn, 0)
-         : MachineOperand::CreateReg(getOrCreateVReg(*Callee), false);
-
-  if (!CLI->lowerCall(MIRBuilder, CalleeMO,
-                      CallLowering::ArgInfo(Res, I.getType()), Args))
+  if (!CLI->lowerCall(MIRBuilder, &I, Res, Args,
+                      [&]() { return getOrCreateVReg(*I.getCalledValue()); }))
     return false;
 
   MCSymbol *EndSymbol = Context.createTempSymbol();
   MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(EndSymbol);
 
   // FIXME: track probabilities.
-  MachineBasicBlock &EHPadMBB = getOrCreateBB(*EHPadBB),
-                    &ReturnMBB = getOrCreateBB(*ReturnBB);
+  MachineBasicBlock &EHPadMBB = getMBB(*EHPadBB),
+                    &ReturnMBB = getMBB(*ReturnBB);
   MF->addInvoke(&EHPadMBB, BeginSymbol, EndSymbol);
   MIRBuilder.getMBB().addSuccessor(&ReturnMBB);
   MIRBuilder.getMBB().addSuccessor(&EHPadMBB);
+  MIRBuilder.buildBr(ReturnMBB);
 
   return true;
 }
@@ -787,52 +860,158 @@ bool IRTranslator::translateLandingPad(const User &U,
   MIRBuilder.buildInstr(TargetOpcode::EH_LABEL)
     .addSym(MF->addLandingPad(&MBB));
 
+  LLT Ty = getLLTForType(*LP.getType(), *DL);
+  unsigned Undef = MRI->createGenericVirtualRegister(Ty);
+  MIRBuilder.buildUndef(Undef);
+
   SmallVector<LLT, 2> Tys;
   for (Type *Ty : cast<StructType>(LP.getType())->elements())
-    Tys.push_back(LLT{*Ty, *DL});
+    Tys.push_back(getLLTForType(*Ty, *DL));
   assert(Tys.size() == 2 && "Only two-valued landingpads are supported");
 
   // Mark exception register as live in.
-  SmallVector<unsigned, 2> Regs;
-  SmallVector<uint64_t, 2> Offsets;
-  if (unsigned Reg = TLI.getExceptionPointerRegister(PersonalityFn)) {
-    MBB.addLiveIn(Reg);
-    unsigned VReg = MRI->createGenericVirtualRegister(Tys[0]);
-    MIRBuilder.buildCopy(VReg, Reg);
-    Regs.push_back(VReg);
-    Offsets.push_back(0);
-  }
-
-  if (unsigned Reg = TLI.getExceptionSelectorRegister(PersonalityFn)) {
-    MBB.addLiveIn(Reg);
-
-    // N.b. the exception selector register always has pointer type and may not
-    // match the actual IR-level type in the landingpad so an extra cast is
-    // needed.
-    unsigned PtrVReg = MRI->createGenericVirtualRegister(Tys[0]);
-    MIRBuilder.buildCopy(PtrVReg, Reg);
-
-    unsigned VReg = MRI->createGenericVirtualRegister(Tys[1]);
-    MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT)
-        .addDef(VReg)
-        .addUse(PtrVReg);
-    Regs.push_back(VReg);
-    Offsets.push_back(Tys[0].getSizeInBits());
-  }
-
-  MIRBuilder.buildSequence(getOrCreateVReg(LP), Regs, Offsets);
+  unsigned ExceptionReg = TLI.getExceptionPointerRegister(PersonalityFn);
+  if (!ExceptionReg)
+    return false;
+
+  MBB.addLiveIn(ExceptionReg);
+  unsigned VReg = MRI->createGenericVirtualRegister(Tys[0]),
+           Tmp = MRI->createGenericVirtualRegister(Ty);
+  MIRBuilder.buildCopy(VReg, ExceptionReg);
+  MIRBuilder.buildInsert(Tmp, Undef, VReg, 0);
+
+  unsigned SelectorReg = TLI.getExceptionSelectorRegister(PersonalityFn);
+  if (!SelectorReg)
+    return false;
+
+  MBB.addLiveIn(SelectorReg);
+
+  // N.b. the exception selector register always has pointer type and may not
+  // match the actual IR-level type in the landingpad so an extra cast is
+  // needed.
+  unsigned PtrVReg = MRI->createGenericVirtualRegister(Tys[0]);
+  MIRBuilder.buildCopy(PtrVReg, SelectorReg);
+
+  VReg = MRI->createGenericVirtualRegister(Tys[1]);
+  MIRBuilder.buildInstr(TargetOpcode::G_PTRTOINT).addDef(VReg).addUse(PtrVReg);
+  MIRBuilder.buildInsert(getOrCreateVReg(LP), Tmp, VReg,
+                         Tys[0].getSizeInBits());
   return true;
 }
 
-bool IRTranslator::translateStaticAlloca(const AllocaInst &AI,
-                                         MachineIRBuilder &MIRBuilder) {
-  if (!TPC->isGlobalISelAbortEnabled() && !AI.isStaticAlloca())
-    return false;
+bool IRTranslator::translateAlloca(const User &U,
+                                   MachineIRBuilder &MIRBuilder) {
+  auto &AI = cast<AllocaInst>(U);
+
+  if (AI.isStaticAlloca()) {
+    unsigned Res = getOrCreateVReg(AI);
+    int FI = getOrCreateFrameIndex(AI);
+    MIRBuilder.buildFrameIndex(Res, FI); // Static: address of a frame slot.
+    return true;
+  }
 
-  assert(AI.isStaticAlloca() && "only handle static allocas now");
-  unsigned Res = getOrCreateVReg(AI);
-  int FI = getOrCreateFrameIndex(AI);
-  MIRBuilder.buildFrameIndex(Res, FI);
+  // Now we're in the harder dynamic case: adjust the stack pointer itself.
+  Type *Ty = AI.getAllocatedType();
+  unsigned Align =
+      std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI.getAlignment());
+
+  unsigned NumElts = getOrCreateVReg(*AI.getArraySize());
+
+  Type *IntPtrIRTy = DL->getIntPtrType(AI.getType());
+  LLT IntPtrTy = getLLTForType(*IntPtrIRTy, *DL);
+  if (MRI->getType(NumElts) != IntPtrTy) { // Resize count to pointer width.
+    unsigned ExtElts = MRI->createGenericVirtualRegister(IntPtrTy);
+    MIRBuilder.buildZExtOrTrunc(ExtElts, NumElts);
+    NumElts = ExtElts;
+  }
+
+  unsigned AllocSize = MRI->createGenericVirtualRegister(IntPtrTy);
+  unsigned TySize =
+      getOrCreateVReg(*ConstantInt::get(IntPtrIRTy, -DL->getTypeAllocSize(Ty)));
+  MIRBuilder.buildMul(AllocSize, NumElts, TySize); // Negated byte count.
+
+  LLT PtrTy = getLLTForType(*AI.getType(), *DL);
+  auto &TLI = *MF->getSubtarget().getTargetLowering();
+  unsigned SPReg = TLI.getStackPointerRegisterToSaveRestore();
+
+  unsigned SPTmp = MRI->createGenericVirtualRegister(PtrTy);
+  MIRBuilder.buildCopy(SPTmp, SPReg);
+
+  unsigned AllocTmp = MRI->createGenericVirtualRegister(PtrTy);
+  MIRBuilder.buildGEP(AllocTmp, SPTmp, AllocSize); // SP + (negative size).
+
+  // Handle alignment. We have to realign if the allocation granule was smaller
+  // than stack alignment, or the specific alloca requires more than stack
+  // alignment.
+  unsigned StackAlign =
+      MF->getSubtarget().getFrameLowering()->getStackAlignment();
+  Align = std::max(Align, StackAlign);
+  if (Align > StackAlign || DL->getTypeAllocSize(Ty) % StackAlign != 0) {
+    // Realign the freshly computed stack address to Align bytes.
+    // NOTE(review): buildPtrMask presumably clears the low Log2(Align) bits,
+    // i.e. rounds the address down -- confirm against MachineIRBuilder.
+    unsigned AlignedAlloc = MRI->createGenericVirtualRegister(PtrTy);
+    MIRBuilder.buildPtrMask(AlignedAlloc, AllocTmp, Log2_32(Align));
+    AllocTmp = AlignedAlloc;
+  }
+
+  MIRBuilder.buildCopy(SPReg, AllocTmp); // Commit the new stack pointer.
+  MIRBuilder.buildCopy(getOrCreateVReg(AI), AllocTmp); // Alloca's result.
+
+  MF->getFrameInfo().CreateVariableSizedObject(Align ? Align : 1, &AI);
+  assert(MF->getFrameInfo().hasVarSizedObjects());
+  return true;
+}
+
+bool IRTranslator::translateVAArg(const User &U, MachineIRBuilder &MIRBuilder) {
+  // FIXME: We may need more info about the type. Because of how LLT works,
+  // we're completely discarding the i64/double distinction here (amongst
+  // others). Fortunately the ABIs I know of where that matters don't use va_arg
+  // anyway but that's not guaranteed.
+  MIRBuilder.buildInstr(TargetOpcode::G_VAARG)
+    .addDef(getOrCreateVReg(U)) // Value produced by the va_arg.
+    .addUse(getOrCreateVReg(*U.getOperand(0))) // va_list pointer (presumably).
+    .addImm(DL->getABITypeAlignment(U.getType())); // ABI align of U's type.
+  return true;
+}
+
+bool IRTranslator::translateInsertElement(const User &U,
+                                          MachineIRBuilder &MIRBuilder) {
+  // <1 x Ty> is not a legal LLT vector type, so alias U to the scalar vreg.
+  // NOTE(review): overwrites any vreg already handed out for U -- confirm.
+  if (U.getType()->getVectorNumElements() == 1) {
+    unsigned Elt = getOrCreateVReg(*U.getOperand(1));
+    ValToVReg[&U] = Elt;
+    return true;
+  }
+  MIRBuilder.buildInsertVectorElement(
+      getOrCreateVReg(U), getOrCreateVReg(*U.getOperand(0)),
+      getOrCreateVReg(*U.getOperand(1)), getOrCreateVReg(*U.getOperand(2)));
+  return true;
+}
+
+bool IRTranslator::translateExtractElement(const User &U,
+                                           MachineIRBuilder &MIRBuilder) {
+  // <1 x Ty> is not a legal LLT vector type, so alias U to the source vreg.
+  // NOTE(review): overwrites any vreg already handed out for U -- confirm.
+  if (U.getOperand(0)->getType()->getVectorNumElements() == 1) {
+    unsigned Elt = getOrCreateVReg(*U.getOperand(0));
+    ValToVReg[&U] = Elt;
+    return true;
+  }
+  MIRBuilder.buildExtractVectorElement(getOrCreateVReg(U),
+                                       getOrCreateVReg(*U.getOperand(0)),
+                                       getOrCreateVReg(*U.getOperand(1)));
+  return true;
+}
+
+bool IRTranslator::translateShuffleVector(const User &U,
+                                          MachineIRBuilder &MIRBuilder) {
+  MIRBuilder.buildInstr(TargetOpcode::G_SHUFFLE_VECTOR)
+      .addDef(getOrCreateVReg(U)) // Result vreg.
+      .addUse(getOrCreateVReg(*U.getOperand(0))) // IR operands 0..2 are
+      .addUse(getOrCreateVReg(*U.getOperand(1))) // forwarded unchanged, in
+      .addUse(getOrCreateVReg(*U.getOperand(2))); // their original order.
   return true;
 }
 
@@ -880,9 +1059,7 @@ bool IRTranslator::translate(const Instruction &Inst) {
     case Instruction::OPCODE: return translate##OPCODE(Inst, CurBuilder);
 #include "llvm/IR/Instruction.def"
   default:
-    if (!TPC->isGlobalISelAbortEnabled())
-      return false;
-    llvm_unreachable("unknown opcode");
+    return false;
   }
 }
 
@@ -892,25 +1069,43 @@ bool IRTranslator::translate(const Constant &C, unsigned Reg) {
   else if (auto CF = dyn_cast<ConstantFP>(&C))
     EntryBuilder.buildFConstant(Reg, *CF);
   else if (isa<UndefValue>(C))
-    EntryBuilder.buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(Reg);
+    EntryBuilder.buildUndef(Reg);
   else if (isa<ConstantPointerNull>(C))
     EntryBuilder.buildConstant(Reg, 0);
   else if (auto GV = dyn_cast<GlobalValue>(&C))
     EntryBuilder.buildGlobalValue(Reg, GV);
-  else if (auto CE = dyn_cast<ConstantExpr>(&C)) {
+  else if (auto CAZ = dyn_cast<ConstantAggregateZero>(&C)) {
+    if (!CAZ->getType()->isVectorTy())
+      return false;
+    // Return the scalar if it is a <1 x Ty> vector.
+    if (CAZ->getNumElements() == 1)
+      return translate(*CAZ->getElementValue(0u), Reg);
+    std::vector<unsigned> Ops;
+    for (unsigned i = 0; i < CAZ->getNumElements(); ++i) {
+      Constant &Elt = *CAZ->getElementValue(i);
+      Ops.push_back(getOrCreateVReg(Elt));
+    }
+    EntryBuilder.buildMerge(Reg, Ops);
+  } else if (auto CV = dyn_cast<ConstantDataVector>(&C)) {
+    // Return the scalar if it is a <1 x Ty> vector.
+    if (CV->getNumElements() == 1)
+      return translate(*CV->getElementAsConstant(0), Reg);
+    std::vector<unsigned> Ops;
+    for (unsigned i = 0; i < CV->getNumElements(); ++i) {
+      Constant &Elt = *CV->getElementAsConstant(i);
+      Ops.push_back(getOrCreateVReg(Elt));
+    }
+    EntryBuilder.buildMerge(Reg, Ops);
+  } else if (auto CE = dyn_cast<ConstantExpr>(&C)) {
     switch(CE->getOpcode()) {
 #define HANDLE_INST(NUM, OPCODE, CLASS)                         \
       case Instruction::OPCODE: return translate##OPCODE(*CE, EntryBuilder);
 #include "llvm/IR/Instruction.def"
     default:
-      if (!TPC->isGlobalISelAbortEnabled())
-        return false;
-      llvm_unreachable("unknown opcode");
+      return false;
     }
-  } else if (!TPC->isGlobalISelAbortEnabled())
+  } else
     return false;
-  else
-    llvm_unreachable("unhandled constant kind");
 
   return true;
 }
@@ -921,7 +1116,6 @@ void IRTranslator::finalizeFunction() {
   PendingPHIs.clear();
   ValToVReg.clear();
   FrameIndices.clear();
-  Constants.clear();
   MachinePreds.clear();
 }
 
@@ -936,86 +1130,101 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
   MRI = &MF->getRegInfo();
   DL = &F.getParent()->getDataLayout();
   TPC = &getAnalysis<TargetPassConfig>();
+  ORE = make_unique<OptimizationRemarkEmitter>(&F);
 
   assert(PendingPHIs.empty() && "stale PHIs");
 
-  // Setup a separate basic-block for the arguments and constants, falling
-  // through to the IR-level Function's entry block.
+  // Release the per-function state when we return, whether we succeeded or not.
+  auto FinalizeOnReturn = make_scope_exit([this]() { finalizeFunction(); });
+
+  // Setup a separate basic-block for the arguments and constants
   MachineBasicBlock *EntryBB = MF->CreateMachineBasicBlock();
   MF->push_back(EntryBB);
-  EntryBB->addSuccessor(&getOrCreateBB(F.front()));
   EntryBuilder.setMBB(*EntryBB);
 
+  // Create all blocks, in IR order, to preserve the layout.
+  for (const BasicBlock &BB: F) {
+    auto *&MBB = BBToMBB[&BB];
+
+    MBB = MF->CreateMachineBasicBlock(&BB);
+    MF->push_back(MBB);
+
+    if (BB.hasAddressTaken())
+      MBB->setHasAddressTaken();
+  }
+
+  // Make our arguments/constants entry block fallthrough to the IR entry block.
+  EntryBB->addSuccessor(&getMBB(F.front()));
+
   // Lower the actual args into this basic block.
   SmallVector<unsigned, 8> VRegArgs;
   for (const Argument &Arg: F.args())
     VRegArgs.push_back(getOrCreateVReg(Arg));
-  bool Succeeded = CLI->lowerFormalArguments(EntryBuilder, F, VRegArgs);
-  if (!Succeeded) {
-    if (!TPC->isGlobalISelAbortEnabled()) {
-      MF->getProperties().set(
-          MachineFunctionProperties::Property::FailedISel);
-      finalizeFunction();
-      return false;
-    }
-    report_fatal_error("Unable to lower arguments");
+  if (!CLI->lowerFormalArguments(EntryBuilder, F, VRegArgs)) {
+    OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+                               MF->getFunction()->getSubprogram(),
+                               &MF->getFunction()->getEntryBlock());
+    R << "unable to lower arguments: " << ore::NV("Prototype", F.getType());
+    reportTranslationError(*MF, *TPC, *ORE, R);
+    return false;
   }
 
   // And translate the function!
   for (const BasicBlock &BB: F) {
-    MachineBasicBlock &MBB = getOrCreateBB(BB);
+    MachineBasicBlock &MBB = getMBB(BB);
     // Set the insertion point of all the following translations to
     // the end of this basic block.
     CurBuilder.setMBB(MBB);
 
     for (const Instruction &Inst: BB) {
-      Succeeded &= translate(Inst);
-      if (!Succeeded) {
-        if (TPC->isGlobalISelAbortEnabled())
-          reportTranslationError(Inst, "unable to translate instruction");
-        MF->getProperties().set(
-            MachineFunctionProperties::Property::FailedISel);
-        break;
-      }
-    }
-  }
-
-  if (Succeeded) {
-    finishPendingPhis();
-
-    // Now that the MachineFrameInfo has been configured, no further changes to
-    // the reserved registers are possible.
-    MRI->freezeReservedRegs(*MF);
-
-    // Merge the argument lowering and constants block with its single
-    // successor, the LLVM-IR entry block.  We want the basic block to
-    // be maximal.
-    assert(EntryBB->succ_size() == 1 &&
-           "Custom BB used for lowering should have only one successor");
-    // Get the successor of the current entry block.
-    MachineBasicBlock &NewEntryBB = **EntryBB->succ_begin();
-    assert(NewEntryBB.pred_size() == 1 &&
-           "LLVM-IR entry block has a predecessor!?");
-    // Move all the instruction from the current entry block to the
-    // new entry block.
-    NewEntryBB.splice(NewEntryBB.begin(), EntryBB, EntryBB->begin(),
-                      EntryBB->end());
-
-    // Update the live-in information for the new entry block.
-    for (const MachineBasicBlock::RegisterMaskPair &LiveIn : EntryBB->liveins())
-      NewEntryBB.addLiveIn(LiveIn);
-    NewEntryBB.sortUniqueLiveIns();
+      if (translate(Inst))
+        continue;
 
-    // Get rid of the now empty basic block.
-    EntryBB->removeSuccessor(&NewEntryBB);
-    MF->remove(EntryBB);
-    MF->DeleteMachineBasicBlock(EntryBB);
+      std::string InstStrStorage;
+      raw_string_ostream InstStr(InstStrStorage);
+      InstStr << Inst;
 
-    assert(&MF->front() == &NewEntryBB &&
-           "New entry wasn't next in the list of basic block!");
+      OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+                                 Inst.getDebugLoc(), &BB);
+      R << "unable to translate instruction: " << ore::NV("Opcode", &Inst)
+        << ": '" << InstStr.str() << "'";
+      reportTranslationError(*MF, *TPC, *ORE, R);
+      return false;
+    }
   }
 
-  finalizeFunction();
+  finishPendingPhis();
+
+  // Now that the MachineFrameInfo has been configured, no further changes to
+  // the reserved registers are possible.
+  MRI->freezeReservedRegs(*MF);
+
+  // Merge the argument lowering and constants block with its single
+  // successor, the LLVM-IR entry block.  We want the basic block to
+  // be maximal.
+  assert(EntryBB->succ_size() == 1 &&
+         "Custom BB used for lowering should have only one successor");
+  // Get the successor of the current entry block.
+  MachineBasicBlock &NewEntryBB = **EntryBB->succ_begin();
+  assert(NewEntryBB.pred_size() == 1 &&
+         "LLVM-IR entry block has a predecessor!?");
+  // Move all the instruction from the current entry block to the
+  // new entry block.
+  NewEntryBB.splice(NewEntryBB.begin(), EntryBB, EntryBB->begin(),
+                    EntryBB->end());
+
+  // Update the live-in information for the new entry block.
+  for (const MachineBasicBlock::RegisterMaskPair &LiveIn : EntryBB->liveins())
+    NewEntryBB.addLiveIn(LiveIn);
+  NewEntryBB.sortUniqueLiveIns();
+
+  // Get rid of the now empty basic block.
+  EntryBB->removeSuccessor(&NewEntryBB);
+  MF->remove(EntryBB);
+  MF->DeleteMachineBasicBlock(EntryBB);
+
+  assert(&MF->front() == &NewEntryBB &&
+         "New entry wasn't next in the list of basic block!");
 
   return false;
 }
diff --git a/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/lib/CodeGen/GlobalISel/InstructionSelect.cpp
index 9dba15dd8c54061c607b45e624974e52b9d25472..26454c1ef00f942dc84b43769a281a98f7274074 100644
--- a/lib/CodeGen/GlobalISel/InstructionSelect.cpp
+++ b/lib/CodeGen/GlobalISel/InstructionSelect.cpp
@@ -12,11 +12,15 @@
 
 #include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/ScopeExit.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -44,17 +48,14 @@ void InstructionSelect::getAnalysisUsage(AnalysisUsage &AU) const {
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
-static void reportSelectionError(const MachineFunction &MF,
-                                 const MachineInstr *MI, const Twine &Message) {
-  std::string ErrStorage;
-  raw_string_ostream Err(ErrStorage);
-  Err << Message << ":\nIn function: " << MF.getName() << '\n';
-  if (MI)
-    Err << *MI << '\n';
-  report_fatal_error(Err.str());
-}
-
 bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // No matter what happens, whether we successfully select the function or not,
+  // nothing is going to use the vreg types after us.  Make sure they disappear.
+  auto ClearVRegTypesOnReturn =
+      make_scope_exit([&]() { MRI.getVRegToType().clear(); });
+
   // If the ISel pipeline failed, do not bother running that pass.
   if (MF.getProperties().hasProperty(
           MachineFunctionProperties::Property::FailedISel))
@@ -66,11 +67,12 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
   const InstructionSelector *ISel = MF.getSubtarget().getInstructionSelector();
   assert(ISel && "Cannot work without InstructionSelector");
 
+  // An optimization remark emitter. Used to report failures.
+  MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr);
+
   // FIXME: freezeReservedRegs is now done in IRTranslator, but there are many
   // other MF/MFI fields we need to initialize.
 
-  const MachineRegisterInfo &MRI = MF.getRegInfo();
-
 #ifndef NDEBUG
   // Check that our input is fully legal: we require the function to have the
   // Legalized property, so it should be.
@@ -80,17 +82,19 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
   // that it has the same layering problem, but we only use inline methods so
   // end up not needing to link against the GlobalISel library.
   if (const LegalizerInfo *MLI = MF.getSubtarget().getLegalizerInfo())
-    for (const MachineBasicBlock &MBB : MF)
-      for (const MachineInstr &MI : MBB)
-        if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI))
-          reportSelectionError(MF, &MI, "Instruction is not legal");
+    for (MachineBasicBlock &MBB : MF)
+      for (MachineInstr &MI : MBB)
+        if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) {
+          reportGISelFailure(MF, TPC, MORE, "gisel-select",
+                             "instruction is not legal", MI);
+          return false;
+        }
 
 #endif
   // FIXME: We could introduce new blocks and will need to fix the outer loop.
   // Until then, keep track of the number of blocks to assert that we don't.
   const size_t NumBlocks = MF.size();
 
-  bool Failed = false;
   for (MachineBasicBlock *MBB : post_order(&MF)) {
     if (MBB->empty())
       continue;
@@ -115,13 +119,19 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
 
       DEBUG(dbgs() << "Selecting: \n  " << MI);
 
+      // We could have folded this instruction away already, making it dead.
+      // If so, erase it.
+      if (isTriviallyDead(MI, MRI)) {
+        DEBUG(dbgs() << "Is dead; erasing.\n");
+        MI.eraseFromParentAndMarkDBGValuesForRemoval();
+        continue;
+      }
+
       if (!ISel->select(MI)) {
-        if (TPC.isGlobalISelAbortEnabled())
-          // FIXME: It would be nice to dump all inserted instructions.  It's
-          // not obvious how, esp. considering select() can insert after MI.
-          reportSelectionError(MF, &MI, "Cannot select");
-        Failed = true;
-        break;
+        // FIXME: It would be nice to dump all inserted instructions.  It's
+        // not obvious how, esp. considering select() can insert after MI.
+        reportGISelFailure(MF, TPC, MORE, "gisel-select", "cannot select", MI);
+        return false;
       }
 
       // Dump the range of instructions that MI expanded into.
@@ -148,30 +158,29 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
       MI = &*MRI.use_instr_begin(VReg);
 
     if (MI && !RC) {
-      if (TPC.isGlobalISelAbortEnabled())
-        reportSelectionError(MF, MI, "VReg has no regclass after selection");
-      Failed = true;
-      break;
+      reportGISelFailure(MF, TPC, MORE, "gisel-select",
+                         "VReg has no regclass after selection", *MI);
+      return false;
     } else if (!RC)
       continue;
 
     if (VRegToType.second.isValid() &&
         VRegToType.second.getSizeInBits() > (RC->getSize() * 8)) {
-      if (TPC.isGlobalISelAbortEnabled())
-        reportSelectionError(
-            MF, MI, "VReg has explicit size different from class size");
-      Failed = true;
-      break;
+      reportGISelFailure(MF, TPC, MORE, "gisel-select",
+                         "VReg has explicit size different from class size",
+                         *MI);
+      return false;
     }
   }
 
-  MRI.getVRegToType().clear();
-
-  if (!TPC.isGlobalISelAbortEnabled() && (Failed || MF.size() != NumBlocks)) {
-    MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
+  if (MF.size() != NumBlocks) {
+    MachineOptimizationRemarkMissed R("gisel-select", "GISelFailure",
+                                      MF.getFunction()->getSubprogram(),
+                                      /*MBB=*/nullptr);
+    R << "inserting blocks is not supported yet";
+    reportGISelFailure(MF, TPC, MORE, R);
     return false;
   }
-  assert(MF.size() == NumBlocks && "Inserting blocks is not supported yet");
 
   // FIXME: Should we accurately track changes?
   return true;
diff --git a/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/lib/CodeGen/GlobalISel/InstructionSelector.cpp
index 5c34da0dc557996d82ea275dd5edc441bb3604b7..fb9d01ef8542a3efd4646914111ec0f88bec4b0f 100644
--- a/lib/CodeGen/GlobalISel/InstructionSelector.cpp
+++ b/lib/CodeGen/GlobalISel/InstructionSelector.cpp
@@ -14,6 +14,8 @@
 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 
@@ -55,6 +57,45 @@ bool InstructionSelector::constrainSelectedInstRegOperands(
     // constrainOperandRegClass does that for us.
     MO.setReg(constrainOperandRegClass(MF, TRI, MRI, TII, RBI, I, I.getDesc(),
                                        Reg, OpI));
+
+    // Tie uses to defs as indicated in MCInstrDesc.
+    if (MO.isUse()) {
+      int DefIdx = I.getDesc().getOperandConstraint(OpI, MCOI::TIED_TO);
+      if (DefIdx != -1)
+        I.tieOperands(DefIdx, OpI);
+    }
   }
   return true;
 }
+
+/// Value of the G_CONSTANT defining \p VReg, or None if unavailable.
+Optional<int64_t> InstructionSelector::getConstantVRegVal(
+    unsigned VReg, const MachineRegisterInfo &MRI) const {
+  MachineInstr *MI = MRI.getVRegDef(VReg); // Null when VReg has no def.
+  if (!MI || MI->getOpcode() != TargetOpcode::G_CONSTANT)
+    return None;
+  // The value is operand 1: either a plain immediate...
+  if (MI->getOperand(1).isImm())
+    return MI->getOperand(1).getImm();
+  // ...or a ConstantInt, usable only when it fits the 64-bit result.
+  if (MI->getOperand(1).isCImm() &&
+      MI->getOperand(1).getCImm()->getBitWidth() <= 64)
+    return MI->getOperand(1).getCImm()->getSExtValue();
+  // Wider constants and other operand kinds have no int64_t value.
+  return None;
+}
+
+/// Return true iff \p MO is a register whose G_CONSTANT def equals \p Value.
+bool InstructionSelector::isOperandImmEqual(
+    const MachineOperand &MO, int64_t Value,
+    const MachineRegisterInfo &MRI) const {
+  if (MO.isReg() && MO.getReg()) // getReg() is only valid on registers.
+    if (auto VRegVal = getConstantVRegVal(MO.getReg(), MRI))
+      return *VRegVal == Value;
+  return false;
+}
+
+bool InstructionSelector::isObviouslySafeToFold(MachineInstr &MI) const {
+  return !MI.mayLoadOrStore() && !MI.hasUnmodeledSideEffects() &&
+         MI.implicit_operands().begin() == MI.implicit_operands().end();
+}
diff --git a/lib/CodeGen/GlobalISel/Legalizer.cpp b/lib/CodeGen/GlobalISel/Legalizer.cpp
index e86356880e99f098f4be470824663d47bd081b3b..657ddb30791952af164493ff81a6a93bebe83993 100644
--- a/lib/CodeGen/GlobalISel/Legalizer.cpp
+++ b/lib/CodeGen/GlobalISel/Legalizer.cpp
@@ -16,6 +16,8 @@
 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
 #include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/Support/Debug.h"
@@ -92,10 +94,7 @@ bool Legalizer::combineExtracts(MachineInstr &MI, MachineRegisterInfo &MRI,
            "unexpected physical register in G_SEQUENCE");
 
     // Finally we can replace the uses.
-    for (auto &Use : MRI.use_operands(ExtractReg)) {
-      Changed = true;
-      Use.setReg(OrigReg);
-    }
+    MRI.replaceRegWith(ExtractReg, OrigReg);
   }
 
   if (AllDefsReplaced) {
@@ -114,6 +113,36 @@ bool Legalizer::combineExtracts(MachineInstr &MI, MachineRegisterInfo &MRI,
   return Changed;
 }
 
+/// Fold G_UNMERGE_VALUES(G_MERGE_VALUES(...)) by forwarding each merge source
+/// to the matching unmerge def. Returns true iff \p MI was erased.
+bool Legalizer::combineMerges(MachineInstr &MI, MachineRegisterInfo &MRI,
+                              const TargetInstrInfo &TII) {
+  if (MI.getOpcode() != TargetOpcode::G_UNMERGE_VALUES)
+    return false;
+  // An unmerge lists its defs first; the source register is the last operand.
+  unsigned NumDefs = MI.getNumOperands() - 1;
+  unsigned SrcReg = MI.getOperand(NumDefs).getReg();
+  MachineInstr *MergeI = MRI.getVRegDef(SrcReg);
+  if (!MergeI || MergeI->getOpcode() != TargetOpcode::G_MERGE_VALUES)
+    return false;
+  if (MergeI->getNumOperands() - 1 != NumDefs)
+    return false;
+
+  // FIXME: is a COPY appropriate if the types mismatch? We know both registers
+  // are allocatable by now.
+  if (MRI.getType(MI.getOperand(0).getReg()) !=
+      MRI.getType(MergeI->getOperand(1).getReg()))
+    return false;
+
+  for (unsigned Idx = 0; Idx < NumDefs; ++Idx)
+    MRI.replaceRegWith(MI.getOperand(Idx).getReg(),
+                       MergeI->getOperand(Idx + 1).getReg());
+  MI.eraseFromParent();
+  if (MRI.use_empty(MergeI->getOperand(0).getReg()))
+    MergeI->eraseFromParent();
+  return true;
+}
+
 bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
   // If the ISel pipeline failed, do not bother running that pass.
   if (MF.getProperties().hasProperty(
@@ -122,7 +151,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
   DEBUG(dbgs() << "Legalize Machine IR for: " << MF.getName() << '\n');
   init(MF);
   const TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
-  const LegalizerInfo &LegalizerInfo = *MF.getSubtarget().getLegalizerInfo();
+  MachineOptimizationRemarkEmitter MORE(MF, /*MBFI=*/nullptr);
   LegalizerHelper Helper(MF);
 
   // FIXME: an instruction may need more than one pass before it is legal. For
@@ -142,27 +171,33 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
       // and are assumed to be legal.
       if (!isPreISelGenericOpcode(MI->getOpcode()))
         continue;
-
-      auto Res = Helper.legalizeInstr(*MI, LegalizerInfo);
-
-      // Error out if we couldn't legalize this instruction. We may want to fall
-      // back to DAG ISel instead in the future.
-      if (Res == LegalizerHelper::UnableToLegalize) {
-        if (!TPC.isGlobalISelAbortEnabled()) {
-          MF.getProperties().set(
-              MachineFunctionProperties::Property::FailedISel);
-          return false;
+      SmallVector<MachineInstr *, 4> WorkList;
+      Helper.MIRBuilder.recordInsertions(
+          [&](MachineInstr *MI) { WorkList.push_back(MI); });
+      WorkList.push_back(&*MI);
+
+      LegalizerHelper::LegalizeResult Res;
+      unsigned Idx = 0;
+      do {
+        Res = Helper.legalizeInstrStep(*WorkList[Idx]);
+        // Error out if we couldn't legalize this instruction, reporting the
+        // exact work-list entry that failed. We may want to fall back to DAG
+        // ISel instead in the future.
+        if (Res == LegalizerHelper::UnableToLegalize) {
+          Helper.MIRBuilder.stopRecordingInsertions();
+          if (Res == LegalizerHelper::UnableToLegalize) {
+            reportGISelFailure(MF, TPC, MORE, "gisel-legalize",
+                               "unable to legalize instruction",
+                               *WorkList[Idx]);
+            return false;
+          }
         }
-        std::string Msg;
-        raw_string_ostream OS(Msg);
-        OS << "unable to legalize instruction: ";
-        MI->print(OS);
-        report_fatal_error(OS.str());
-      }
-
-      Changed |= Res == LegalizerHelper::Legalized;
-    }
+        Changed |= Res == LegalizerHelper::Legalized;
+        ++Idx;
+      } while (Idx < WorkList.size());
 
+      Helper.MIRBuilder.stopRecordingInsertions();
+    }
 
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
@@ -173,6 +208,7 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
       NextMI = std::next(MI);
 
       Changed |= combineExtracts(*MI, MRI, TII);
+      Changed |= combineMerges(*MI, MRI, TII);
     }
   }
 
diff --git a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 6da02646250beeca455200c9b1ad187fceff0282..20358f7ee6c2ed99f5cc0890d1a94854b91b573a 100644
--- a/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -29,14 +29,13 @@
 using namespace llvm;
 
 LegalizerHelper::LegalizerHelper(MachineFunction &MF)
-  : MRI(MF.getRegInfo()) {
+    : MRI(MF.getRegInfo()), LI(*MF.getSubtarget().getLegalizerInfo()) {
   MIRBuilder.setMF(MF);
 }
 
 LegalizerHelper::LegalizeResult
-LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
-                                   const LegalizerInfo &LegalizerInfo) {
-  auto Action = LegalizerInfo.getAction(MI, MRI);
+LegalizerHelper::legalizeInstrStep(MachineInstr &MI) {
+  auto Action = LI.getAction(MI, MRI);
   switch (std::get<0>(Action)) {
   case LegalizerInfo::Legal:
     return AlreadyLegal;
@@ -50,46 +49,32 @@ LegalizerHelper::legalizeInstrStep(MachineInstr &MI,
     return lower(MI, std::get<1>(Action), std::get<2>(Action));
   case LegalizerInfo::FewerElements:
     return fewerElementsVector(MI, std::get<1>(Action), std::get<2>(Action));
+  case LegalizerInfo::Custom:
+    return LI.legalizeCustom(MI, MRI, MIRBuilder) ? Legalized
+                                                  : UnableToLegalize;
   default:
     return UnableToLegalize;
   }
 }
 
-LegalizerHelper::LegalizeResult
-LegalizerHelper::legalizeInstr(MachineInstr &MI,
-                               const LegalizerInfo &LegalizerInfo) {
-  SmallVector<MachineInstr *, 4> WorkList;
-  MIRBuilder.recordInsertions(
-      [&](MachineInstr *MI) { WorkList.push_back(MI); });
-  WorkList.push_back(&MI);
-
-  bool Changed = false;
-  LegalizeResult Res;
-  unsigned Idx = 0;
-  do {
-    Res = legalizeInstrStep(*WorkList[Idx], LegalizerInfo);
-    if (Res == UnableToLegalize) {
-      MIRBuilder.stopRecordingInsertions();
-      return UnableToLegalize;
-    }
-    Changed |= Res == Legalized;
-    ++Idx;
-  } while (Idx < WorkList.size());
-
-  MIRBuilder.stopRecordingInsertions();
-
-  return Changed ? Legalized : AlreadyLegal;
-}
-
 void LegalizerHelper::extractParts(unsigned Reg, LLT Ty, int NumParts,
                                    SmallVectorImpl<unsigned> &VRegs) {
-  unsigned Size = Ty.getSizeInBits();
-  SmallVector<uint64_t, 4> Indexes;
-  for (int i = 0; i < NumParts; ++i) {
+  for (int i = 0; i < NumParts; ++i)
     VRegs.push_back(MRI.createGenericVirtualRegister(Ty));
-    Indexes.push_back(i * Size);
+  MIRBuilder.buildUnmerge(VRegs, Reg);
+}
+
+static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
+  switch (Opcode) {
+  case TargetOpcode::G_FADD:
+    assert((Size == 32 || Size == 64) && "Unsupported size");
+    return Size == 64 ? RTLIB::ADD_F64 : RTLIB::ADD_F32;
+  case TargetOpcode::G_FREM:
+    return Size == 64 ? RTLIB::REM_F64 : RTLIB::REM_F32;
+  case TargetOpcode::G_FPOW:
+    return Size == 64 ? RTLIB::POW_F64 : RTLIB::POW_F32;
   }
-  MIRBuilder.buildExtract(VRegs, Indexes, Reg);
+  llvm_unreachable("Unknown libcall function");
 }
 
 LegalizerHelper::LegalizeResult
@@ -101,17 +86,19 @@ LegalizerHelper::libcall(MachineInstr &MI) {
   switch (MI.getOpcode()) {
   default:
     return UnableToLegalize;
+  case TargetOpcode::G_FADD:
+  case TargetOpcode::G_FPOW:
   case TargetOpcode::G_FREM: {
     auto &Ctx = MIRBuilder.getMF().getFunction()->getContext();
     Type *Ty = Size == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx);
     auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
     auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
-    const char *Name =
-        TLI.getLibcallName(Size == 64 ? RTLIB::REM_F64 : RTLIB::REM_F32);
-
+    auto Libcall = getRTLibDesc(MI.getOpcode(), Size);
+    const char *Name = TLI.getLibcallName(Libcall);
+    MIRBuilder.getMF().getFrameInfo().setHasCalls(true);
     CLI.lowerCall(
-        MIRBuilder, MachineOperand::CreateES(Name),
-        {MI.getOperand(0).getReg(), Ty},
+        MIRBuilder, TLI.getLibcallCallingConv(Libcall),
+        MachineOperand::CreateES(Name), {MI.getOperand(0).getReg(), Ty},
         {{MI.getOperand(1).getReg(), Ty}, {MI.getOperand(2).getReg(), Ty}});
     MI.eraseFromParent();
     return Legalized;
@@ -133,12 +120,10 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     return UnableToLegalize;
   case TargetOpcode::G_ADD: {
     // Expand in terms of carry-setting/consuming G_ADDE instructions.
-    unsigned NarrowSize = NarrowTy.getSizeInBits();
     int NumParts = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() /
                    NarrowTy.getSizeInBits();
 
     SmallVector<unsigned, 2> Src1Regs, Src2Regs, DstRegs;
-    SmallVector<uint64_t, 2> Indexes;
     extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs);
     extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs);
 
@@ -153,11 +138,70 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
                             Src2Regs[i], CarryIn);
 
       DstRegs.push_back(DstReg);
-      Indexes.push_back(i * NarrowSize);
       CarryIn = CarryOut;
     }
     unsigned DstReg = MI.getOperand(0).getReg();
-    MIRBuilder.buildSequence(DstReg, DstRegs, Indexes);
+    MIRBuilder.buildMerge(DstReg, DstRegs);
+    MI.eraseFromParent();
+    return Legalized;
+  }
+  case TargetOpcode::G_INSERT: {
+    if (TypeIdx != 0)
+      return UnableToLegalize;
+
+    int64_t NarrowSize = NarrowTy.getSizeInBits();
+    int NumParts =
+        MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+
+    SmallVector<unsigned, 2> SrcRegs, DstRegs;
+    SmallVector<uint64_t, 2> Indexes;
+    extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, SrcRegs);
+
+    unsigned OpReg = MI.getOperand(2).getReg();
+    int64_t OpStart = MI.getOperand(3).getImm();
+    int64_t OpSize = MRI.getType(OpReg).getSizeInBits();
+    for (int i = 0; i < NumParts; ++i) {
+      unsigned DstStart = i * NarrowSize;
+
+      if (DstStart + NarrowSize <= OpStart || DstStart >= OpStart + OpSize) {
+        // No part of the insert affects this subregister, forward the original.
+        DstRegs.push_back(SrcRegs[i]);
+        continue;
+      } else if (DstStart == OpStart && NarrowTy == MRI.getType(OpReg)) {
+        // The entire subregister is defined by this insert, forward the new
+        // value.
+        DstRegs.push_back(OpReg);
+        continue;
+      }
+
+      // OpSegStart is where this destination segment would start in OpReg if it
+      // extended infinitely in both directions.
+      int64_t ExtractOffset, InsertOffset, SegSize;
+      if (OpStart < DstStart) {
+        InsertOffset = 0;
+        ExtractOffset = DstStart - OpStart;
+        SegSize = std::min(NarrowSize, OpStart + OpSize - DstStart);
+      } else {
+        InsertOffset = OpStart - DstStart;
+        ExtractOffset = 0;
+        SegSize =
+            std::min(NarrowSize - InsertOffset, OpStart + OpSize - DstStart);
+      }
+
+      unsigned SegReg = OpReg;
+      if (ExtractOffset != 0 || SegSize != OpSize) {
+        // A genuine extract is needed.
+        SegReg = MRI.createGenericVirtualRegister(LLT::scalar(SegSize));
+        MIRBuilder.buildExtract(SegReg, OpReg, ExtractOffset);
+      }
+
+      unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy);
+      MIRBuilder.buildInsert(DstReg, SrcRegs[i], SegReg, InsertOffset);
+      DstRegs.push_back(DstReg);
+    }
+
+    assert(DstRegs.size() == (unsigned)NumParts && "not all parts covered");
+    MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs);
     MI.eraseFromParent();
     return Legalized;
   }
@@ -169,7 +213,6 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
         MRI.getType(MI.getOperand(1).getReg()).getAddressSpace(), NarrowSize);
 
     SmallVector<unsigned, 2> DstRegs;
-    SmallVector<uint64_t, 2> Indexes;
     for (int i = 0; i < NumParts; ++i) {
       unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy);
       unsigned SrcReg = MRI.createGenericVirtualRegister(NarrowPtrTy);
@@ -182,10 +225,9 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
       MIRBuilder.buildLoad(DstReg, SrcReg, **MI.memoperands_begin());
 
       DstRegs.push_back(DstReg);
-      Indexes.push_back(i * NarrowSize);
     }
     unsigned DstReg = MI.getOperand(0).getReg();
-    MIRBuilder.buildSequence(DstReg, DstRegs, Indexes);
+    MIRBuilder.buildMerge(DstReg, DstRegs);
     MI.eraseFromParent();
     return Legalized;
   }
@@ -211,6 +253,26 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
     MI.eraseFromParent();
     return Legalized;
   }
+  case TargetOpcode::G_CONSTANT: {
+    unsigned NarrowSize = NarrowTy.getSizeInBits();
+    int NumParts =
+        MRI.getType(MI.getOperand(0).getReg()).getSizeInBits() / NarrowSize;
+    const APInt &Cst = MI.getOperand(1).getCImm()->getValue();
+    LLVMContext &Ctx = MIRBuilder.getMF().getFunction()->getContext();
+
+    SmallVector<unsigned, 2> DstRegs;
+    for (int i = 0; i < NumParts; ++i) {
+      unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy);
+      ConstantInt *CI =
+          ConstantInt::get(Ctx, Cst.lshr(NarrowSize * i).trunc(NarrowSize));
+      MIRBuilder.buildConstant(DstReg, *CI);
+      DstRegs.push_back(DstReg);
+    }
+    unsigned DstReg = MI.getOperand(0).getReg();
+    MIRBuilder.buildMerge(DstReg, DstRegs);
+    MI.eraseFromParent();
+    return Legalized;
+  }
   }
 }
 
@@ -273,6 +335,29 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
     MI.eraseFromParent();
     return Legalized;
   }
+  case TargetOpcode::G_SELECT: {
+    if (TypeIdx != 0)
+      return UnableToLegalize;
+
+    // Perform operation at larger width (any extension is fine here, high bits
+    // don't affect the result) and then truncate the result back to the
+    // original type.
+    unsigned Src1Ext = MRI.createGenericVirtualRegister(WideTy);
+    unsigned Src2Ext = MRI.createGenericVirtualRegister(WideTy);
+    MIRBuilder.buildAnyExt(Src1Ext, MI.getOperand(2).getReg());
+    MIRBuilder.buildAnyExt(Src2Ext, MI.getOperand(3).getReg());
+
+    unsigned DstExt = MRI.createGenericVirtualRegister(WideTy);
+    MIRBuilder.buildInstr(TargetOpcode::G_SELECT)
+        .addDef(DstExt)
+        .addReg(MI.getOperand(1).getReg())
+        .addUse(Src1Ext)
+        .addUse(Src2Ext);
+
+    MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt);
+    MI.eraseFromParent();
+    return Legalized;
+  }
   case TargetOpcode::G_FPTOSI:
   case TargetOpcode::G_FPTOUI: {
     if (TypeIdx != 0)
@@ -309,6 +394,26 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
     MI.eraseFromParent();
     return Legalized;
   }
+  case TargetOpcode::G_INSERT: {
+    if (TypeIdx != 0)
+      return UnableToLegalize;
+
+    unsigned Src = MI.getOperand(1).getReg();
+    unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy);
+    MIRBuilder.buildAnyExt(SrcExt, Src);
+
+    unsigned DstExt = MRI.createGenericVirtualRegister(WideTy);
+    auto MIB = MIRBuilder.buildInsert(DstExt, SrcExt, MI.getOperand(2).getReg(),
+                                      MI.getOperand(3).getImm());
+    for (unsigned OpNum = 4; OpNum < MI.getNumOperands(); OpNum += 2) {
+      MIB.addReg(MI.getOperand(OpNum).getReg());
+      MIB.addImm(MI.getOperand(OpNum + 1).getImm());
+    }
+
+    MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), DstExt);
+    MI.eraseFromParent();
+    return Legalized;
+  }
   case TargetOpcode::G_LOAD: {
     assert(alignTo(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(), 8) ==
                WideTy.getSizeInBits() &&
@@ -322,12 +427,24 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
     return Legalized;
   }
   case TargetOpcode::G_STORE: {
-    assert(alignTo(MRI.getType(MI.getOperand(0).getReg()).getSizeInBits(), 8) ==
-               WideTy.getSizeInBits() &&
-           "illegal to increase number of bytes modified by a store");
+    if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(1) ||
+        WideTy != LLT::scalar(8))
+      return UnableToLegalize;
+
+    auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
+    auto Content = TLI.getBooleanContents(false, false);
+
+    unsigned ExtOp = TargetOpcode::G_ANYEXT;
+    if (Content == TargetLoweringBase::ZeroOrOneBooleanContent)
+      ExtOp = TargetOpcode::G_ZEXT;
+    else if (Content == TargetLoweringBase::ZeroOrNegativeOneBooleanContent)
+      ExtOp = TargetOpcode::G_SEXT;
+    else
+      ExtOp = TargetOpcode::G_ANYEXT;
 
     unsigned SrcExt = MRI.createGenericVirtualRegister(WideTy);
-    MIRBuilder.buildAnyExt(SrcExt, MI.getOperand(0).getReg());
+    MIRBuilder.buildInstr(ExtOp).addDef(SrcExt).addUse(
+        MI.getOperand(0).getReg());
     MIRBuilder.buildStore(SrcExt, MI.getOperand(1).getReg(),
                           **MI.memoperands_begin());
     MI.eraseFromParent();
@@ -406,6 +523,83 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
     MI.eraseFromParent();
     return Legalized;
   }
+  case TargetOpcode::G_SMULO:
+  case TargetOpcode::G_UMULO: {
+    // Generate G_UMULH/G_SMULH to check for overflow and a normal G_MUL for the
+    // result.
+    unsigned Res = MI.getOperand(0).getReg();
+    unsigned Overflow = MI.getOperand(1).getReg();
+    unsigned LHS = MI.getOperand(2).getReg();
+    unsigned RHS = MI.getOperand(3).getReg();
+
+    MIRBuilder.buildMul(Res, LHS, RHS);
+
+    unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
+                          ? TargetOpcode::G_SMULH
+                          : TargetOpcode::G_UMULH;
+
+    unsigned HiPart = MRI.createGenericVirtualRegister(Ty);
+    MIRBuilder.buildInstr(Opcode)
+      .addDef(HiPart)
+      .addUse(LHS)
+      .addUse(RHS);
+
+    unsigned Zero = MRI.createGenericVirtualRegister(Ty);
+    MIRBuilder.buildConstant(Zero, 0);
+    MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
+    MI.eraseFromParent();
+    return Legalized;
+  }
+  case TargetOpcode::G_FNEG: {
+    // TODO: Handle vector types once we are able to
+    // represent them.
+    if (Ty.isVector())
+      return UnableToLegalize;
+    unsigned Res = MI.getOperand(0).getReg();
+    Type *ZeroTy;
+    LLVMContext &Ctx = MIRBuilder.getMF().getFunction()->getContext();
+    switch (Ty.getSizeInBits()) {
+    case 16:
+      ZeroTy = Type::getHalfTy(Ctx);
+      break;
+    case 32:
+      ZeroTy = Type::getFloatTy(Ctx);
+      break;
+    case 64:
+      ZeroTy = Type::getDoubleTy(Ctx);
+      break;
+    default:
+      llvm_unreachable("unexpected floating-point type");
+    }
+    ConstantFP &ZeroForNegation =
+        *cast<ConstantFP>(ConstantFP::getZeroValueForNegation(ZeroTy));
+    unsigned Zero = MRI.createGenericVirtualRegister(Ty);
+    MIRBuilder.buildFConstant(Zero, ZeroForNegation);
+    MIRBuilder.buildInstr(TargetOpcode::G_FSUB)
+        .addDef(Res)
+        .addUse(Zero)
+        .addUse(MI.getOperand(1).getReg());
+    MI.eraseFromParent();
+    return Legalized;
+  }
+  case TargetOpcode::G_FSUB: {
+    // Lower (G_FSUB LHS, RHS) to (G_FADD LHS, (G_FNEG RHS)).
+    // First, check if G_FNEG is marked as Lower. If so, we may
+    // end up with an infinite loop as G_FSUB is used to legalize G_FNEG.
+    if (LI.getAction({G_FNEG, Ty}).first == LegalizerInfo::Lower)
+      return UnableToLegalize;
+    unsigned Res = MI.getOperand(0).getReg();
+    unsigned LHS = MI.getOperand(1).getReg();
+    unsigned RHS = MI.getOperand(2).getReg();
+    unsigned Neg = MRI.createGenericVirtualRegister(Ty);
+    MIRBuilder.buildInstr(TargetOpcode::G_FNEG).addDef(Neg).addUse(RHS);
+    MIRBuilder.buildInstr(TargetOpcode::G_FADD)
+        .addDef(Res)
+        .addUse(LHS)
+        .addUse(Neg);
+    MI.eraseFromParent();
+    return Legalized;
+  }
   }
 }
 
@@ -426,7 +620,6 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
     MIRBuilder.setInstr(MI);
 
     SmallVector<unsigned, 2> Src1Regs, Src2Regs, DstRegs;
-    SmallVector<uint64_t, 2> Indexes;
     extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src1Regs);
     extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src2Regs);
 
@@ -434,10 +627,9 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
       unsigned DstReg = MRI.createGenericVirtualRegister(NarrowTy);
       MIRBuilder.buildAdd(DstReg, Src1Regs[i], Src2Regs[i]);
       DstRegs.push_back(DstReg);
-      Indexes.push_back(i * NarrowSize);
     }
 
-    MIRBuilder.buildSequence(DstReg, DstRegs, Indexes);
+    MIRBuilder.buildMerge(DstReg, DstRegs);
     MI.eraseFromParent();
     return Legalized;
   }
diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index e49662075ed5e10cd70b251fc4ade320ec9a7465..eaf4056e47eafd24771abd5d107cfaa3c72b95fb 100644
--- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -41,6 +41,8 @@ LegalizerInfo::LegalizerInfo() : TablesInitialized(false) {
   DefaultActions[TargetOpcode::G_STORE] = NarrowScalar;
 
   DefaultActions[TargetOpcode::G_BRCOND] = WidenScalar;
+  DefaultActions[TargetOpcode::G_INSERT] = NarrowScalar;
+  DefaultActions[TargetOpcode::G_FNEG] = Lower;
 }
 
 void LegalizerInfo::computeTables() {
@@ -71,28 +73,36 @@ LegalizerInfo::getAction(const InstrAspect &Aspect) const {
   // These *have* to be implemented for now, they're the fundamental basis of
   // how everything else is transformed.
 
-  // Nothing is going to go well with types that aren't a power of 2 yet, so
-  // don't even try because we might make things worse.
-  if (!isPowerOf2_64(Aspect.Type.getSizeInBits()))
-      return std::make_pair(Unsupported, LLT());
-
   // FIXME: the long-term plan calls for expansion in terms of load/store (if
   // they're not legal).
   if (Aspect.Opcode == TargetOpcode::G_SEQUENCE ||
-      Aspect.Opcode == TargetOpcode::G_EXTRACT)
+      Aspect.Opcode == TargetOpcode::G_EXTRACT ||
+      Aspect.Opcode == TargetOpcode::G_MERGE_VALUES ||
+      Aspect.Opcode == TargetOpcode::G_UNMERGE_VALUES)
     return std::make_pair(Legal, Aspect.Type);
 
+  LLT Ty = Aspect.Type;
   LegalizeAction Action = findInActions(Aspect);
+  // LegalizerHelper is not able to handle non-power-of-2 types right now, so do
+  // not try to legalize them unless they are marked as Legal or Custom.
+  // FIXME: This is a temporary hack until the general non-power-of-2
+  // legalization works.
+  if (!isPowerOf2_64(Ty.getSizeInBits()) &&
+      !(Action == Legal || Action == Custom))
+    return std::make_pair(Unsupported, LLT());
+
   if (Action != NotFound)
     return findLegalAction(Aspect, Action);
 
   unsigned Opcode = Aspect.Opcode;
-  LLT Ty = Aspect.Type;
   if (!Ty.isVector()) {
     auto DefaultAction = DefaultActions.find(Aspect.Opcode);
     if (DefaultAction != DefaultActions.end() && DefaultAction->second == Legal)
       return std::make_pair(Legal, Ty);
 
+    if (DefaultAction != DefaultActions.end() && DefaultAction->second == Lower)
+      return std::make_pair(Lower, Ty);
+
     if (DefaultAction == DefaultActions.end() ||
         DefaultAction->second != NarrowScalar)
       return std::make_pair(Unsupported, LLT());
@@ -160,6 +170,7 @@ LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect,
   case Legal:
   case Lower:
   case Libcall:
+  case Custom:
     return Aspect.Type;
   case NarrowScalar: {
     return findLegalType(Aspect,
@@ -180,3 +191,9 @@ LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect,
   }
   }
 }
+
+bool LegalizerInfo::legalizeCustom(MachineInstr &MI,
+                                   MachineRegisterInfo &MRI,
+                                   MachineIRBuilder &MIRBuilder) const {
+  return false;
+}
diff --git a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 01457260ca64755aeb20800644d41bff93d9b291..8d1a263395a0e913d52825e00883e16b6f6ba4e1 100644
--- a/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -137,8 +137,12 @@ MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C,
       MIB.addCImm(CI);
     else
       MIB.addImm(CI->getZExtValue());
-  } else
-    MIB.addFPImm(&cast<ConstantFP>(C));
+  } else if (auto *CFP = dyn_cast<ConstantFP>(&C)) {
+    MIB.addFPImm(CFP);
+  } else {
+    // Insert %noreg if we didn't find a usable constant and had to drop it.
+    MIB.addReg(0U);
+  }
 
   return MIB.addImm(Offset).addMetadata(Variable).addMetadata(Expr);
 }
@@ -187,6 +191,17 @@ MachineInstrBuilder MachineIRBuilder::buildGEP(unsigned Res, unsigned Op0,
       .addUse(Op1);
 }
 
+MachineInstrBuilder MachineIRBuilder::buildPtrMask(unsigned Res, unsigned Op0,
+                                                   uint32_t NumBits) {
+  assert(MRI->getType(Res).isPointer() &&
+         MRI->getType(Res) == MRI->getType(Op0) && "type mismatch");
+
+  return buildInstr(TargetOpcode::G_PTR_MASK)
+      .addDef(Res)
+      .addUse(Op0)
+      .addImm(NumBits);
+}
+
 MachineInstrBuilder MachineIRBuilder::buildSub(unsigned Res, unsigned Op0,
                                                unsigned Op1) {
   assert((MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()) &&
@@ -213,6 +228,19 @@ MachineInstrBuilder MachineIRBuilder::buildMul(unsigned Res, unsigned Op0,
       .addUse(Op1);
 }
 
+MachineInstrBuilder MachineIRBuilder::buildAnd(unsigned Res, unsigned Op0,
+                                               unsigned Op1) {
+  assert((MRI->getType(Res).isScalar() || MRI->getType(Res).isVector()) &&
+         "invalid operand type");
+  assert(MRI->getType(Res) == MRI->getType(Op0) &&
+         MRI->getType(Res) == MRI->getType(Op1) && "type mismatch");
+
+  return buildInstr(TargetOpcode::G_AND)
+      .addDef(Res)
+      .addUse(Op0)
+      .addUse(Op1);
+}
+
 MachineInstrBuilder MachineIRBuilder::buildBr(MachineBasicBlock &Dest) {
   return buildInstr(TargetOpcode::G_BR).addMBB(&Dest);
 }
@@ -327,34 +355,56 @@ MachineInstrBuilder MachineIRBuilder::buildSExtOrTrunc(unsigned Res,
   return buildInstr(Opcode).addDef(Res).addUse(Op);
 }
 
-MachineInstrBuilder MachineIRBuilder::buildExtract(ArrayRef<unsigned> Results,
-                                                   ArrayRef<uint64_t> Indices,
-                                                   unsigned Src) {
-#ifndef NDEBUG
-  assert(Results.size() == Indices.size() && "inconsistent number of regs");
-  assert(!Results.empty() && "invalid trivial extract");
-  assert(std::is_sorted(Indices.begin(), Indices.end()) &&
-         "extract offsets must be in ascending order");
+MachineInstrBuilder MachineIRBuilder::buildZExtOrTrunc(unsigned Res,
+                                                       unsigned Op) {
+  unsigned Opcode = TargetOpcode::COPY;
+  if (MRI->getType(Res).getSizeInBits() > MRI->getType(Op).getSizeInBits())
+    Opcode = TargetOpcode::G_ZEXT;
+  else if (MRI->getType(Res).getSizeInBits() < MRI->getType(Op).getSizeInBits())
+    Opcode = TargetOpcode::G_TRUNC;
 
-  assert(MRI->getType(Src).isValid() && "invalid operand type");
-  for (auto Res : Results)
-    assert(MRI->getType(Res).isValid() && "invalid operand type");
-#endif
+  return buildInstr(Opcode).addDef(Res).addUse(Op);
+}
 
-  auto MIB = BuildMI(getMF(), DL, getTII().get(TargetOpcode::G_EXTRACT));
-  for (auto Res : Results)
-    MIB.addDef(Res);
 
-  MIB.addUse(Src);
+MachineInstrBuilder MachineIRBuilder::buildCast(unsigned Dst, unsigned Src) {
+  LLT SrcTy = MRI->getType(Src);
+  LLT DstTy = MRI->getType(Dst);
+  if (SrcTy == DstTy)
+    return buildCopy(Dst, Src);
+
+  unsigned Opcode;
+  if (SrcTy.isPointer() && DstTy.isScalar())
+    Opcode = TargetOpcode::G_PTRTOINT;
+  else if (DstTy.isPointer() && SrcTy.isScalar())
+    Opcode = TargetOpcode::G_INTTOPTR;
+  else {
+    assert(!SrcTy.isPointer() && !DstTy.isPointer() && "no G_ADDRCAST yet");
+    Opcode = TargetOpcode::G_BITCAST;
+  }
 
-  for (auto Idx : Indices)
-    MIB.addImm(Idx);
+  return buildInstr(Opcode).addDef(Dst).addUse(Src);
+}
 
-  getMBB().insert(getInsertPt(), MIB);
-  if (InsertedInstr)
-    InsertedInstr(MIB);
+MachineInstrBuilder MachineIRBuilder::buildExtract(unsigned Res, unsigned Src,
+                                                   uint64_t Index) {
+#ifndef NDEBUG
+  assert(MRI->getType(Src).isValid() && "invalid operand type");
+  assert(MRI->getType(Res).isValid() && "invalid operand type");
+  assert(Index + MRI->getType(Res).getSizeInBits() <=
+             MRI->getType(Src).getSizeInBits() &&
+         "extracting off end of register");
+#endif
 
-  return MIB;
+  if (MRI->getType(Res).getSizeInBits() == MRI->getType(Src).getSizeInBits()) {
+    assert(Index == 0 && "extraction past the end of a register");
+    return buildCast(Res, Src);
+  }
+
+  return buildInstr(TargetOpcode::G_EXTRACT)
+      .addDef(Res)
+      .addUse(Src)
+      .addImm(Index);
 }
 
 MachineInstrBuilder
@@ -381,6 +431,64 @@ MachineIRBuilder::buildSequence(unsigned Res,
   return MIB;
 }
 
+MachineInstrBuilder MachineIRBuilder::buildUndef(unsigned Res) {
+  return buildInstr(TargetOpcode::IMPLICIT_DEF).addDef(Res);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildMerge(unsigned Res,
+                                                 ArrayRef<unsigned> Ops) {
+
+#ifndef NDEBUG
+  assert(!Ops.empty() && "invalid trivial sequence");
+  LLT Ty = MRI->getType(Ops[0]);
+  for (auto Reg : Ops)
+    assert(MRI->getType(Reg) == Ty && "type mismatch in input list");
+  assert(Ops.size() * MRI->getType(Ops[0]).getSizeInBits() ==
+             MRI->getType(Res).getSizeInBits() &&
+         "input operands do not cover output register");
+#endif
+
+  MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_MERGE_VALUES);
+  MIB.addDef(Res);
+  for (unsigned i = 0; i < Ops.size(); ++i)
+    MIB.addUse(Ops[i]);
+  return MIB;
+}
+
+MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<unsigned> Res,
+                                                   unsigned Op) {
+
+#ifndef NDEBUG
+  assert(!Res.empty() && "invalid trivial sequence");
+  LLT Ty = MRI->getType(Res[0]);
+  for (auto Reg : Res)
+    assert(MRI->getType(Reg) == Ty && "type mismatch in input list");
+  assert(Res.size() * MRI->getType(Res[0]).getSizeInBits() ==
+             MRI->getType(Op).getSizeInBits() &&
+         "input operands do not cover output register");
+#endif
+
+  MachineInstrBuilder MIB = buildInstr(TargetOpcode::G_UNMERGE_VALUES);
+  for (unsigned i = 0; i < Res.size(); ++i)
+    MIB.addDef(Res[i]);
+  MIB.addUse(Op);
+  return MIB;
+}
+
+MachineInstrBuilder MachineIRBuilder::buildInsert(unsigned Res, unsigned Src,
+                                                  unsigned Op, unsigned Index) {
+  if (MRI->getType(Res).getSizeInBits() == MRI->getType(Op).getSizeInBits()) {
+    assert(Index == 0 && "insertion past the end of a register");
+    return buildCast(Res, Op);
+  }
+
+  return buildInstr(TargetOpcode::G_INSERT)
+      .addDef(Res)
+      .addUse(Src)
+      .addUse(Op)
+      .addImm(Index);
+}
+
 MachineInstrBuilder MachineIRBuilder::buildIntrinsic(Intrinsic::ID ID,
                                                      unsigned Res,
                                                      bool HasSideEffects) {
@@ -460,9 +568,10 @@ MachineInstrBuilder MachineIRBuilder::buildSelect(unsigned Res, unsigned Tst,
   if (ResTy.isScalar() || ResTy.isPointer())
     assert(MRI->getType(Tst).isScalar() && "type mismatch");
   else
-    assert(MRI->getType(Tst).isVector() &&
-           MRI->getType(Tst).getNumElements() ==
-               MRI->getType(Op0).getNumElements() &&
+    assert((MRI->getType(Tst).isScalar() ||
+            (MRI->getType(Tst).isVector() &&
+             MRI->getType(Tst).getNumElements() ==
+                 MRI->getType(Op0).getNumElements())) &&
            "type mismatch");
 #endif
 
@@ -473,6 +582,46 @@ MachineInstrBuilder MachineIRBuilder::buildSelect(unsigned Res, unsigned Tst,
       .addUse(Op1);
 }
 
+MachineInstrBuilder MachineIRBuilder::buildInsertVectorElement(unsigned Res,
+                                                               unsigned Val,
+                                                               unsigned Elt,
+                                                               unsigned Idx) {
+#ifndef NDEBUG
+  LLT ResTy = MRI->getType(Res);
+  LLT ValTy = MRI->getType(Val);
+  LLT EltTy = MRI->getType(Elt);
+  LLT IdxTy = MRI->getType(Idx);
+  assert(ResTy.isVector() && ValTy.isVector() && "invalid operand type");
+  assert(EltTy.isScalar() && IdxTy.isScalar() && "invalid operand type");
+  assert(ResTy.getNumElements() == ValTy.getNumElements() && "type mismatch");
+  assert(ResTy.getElementType() == EltTy && "type mismatch");
+#endif
+
+  return buildInstr(TargetOpcode::G_INSERT_VECTOR_ELT)
+      .addDef(Res)
+      .addUse(Val)
+      .addUse(Elt)
+      .addUse(Idx);
+}
+
+MachineInstrBuilder MachineIRBuilder::buildExtractVectorElement(unsigned Res,
+                                                                unsigned Val,
+                                                                unsigned Idx) {
+#ifndef NDEBUG
+  LLT ResTy = MRI->getType(Res);
+  LLT ValTy = MRI->getType(Val);
+  LLT IdxTy = MRI->getType(Idx);
+  assert(ValTy.isVector() && "invalid operand type");
+  assert(ResTy.isScalar() && IdxTy.isScalar() && "invalid operand type");
+  assert(ValTy.getElementType() == ResTy && "type mismatch");
+#endif
+
+  return buildInstr(TargetOpcode::G_EXTRACT_VECTOR_ELT)
+      .addDef(Res)
+      .addUse(Val)
+      .addUse(Idx);
+}
+
 void MachineIRBuilder::validateTruncExt(unsigned Dst, unsigned Src,
                                         bool IsExtend) {
 #ifndef NDEBUG
diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
index 111cfa9c6d17b9c5a753f5af1db114b57d41fb82..f935390a8d1bd0c8438c4c035d34866ac7b0d195 100644
--- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -14,6 +14,7 @@
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -71,6 +72,7 @@ void RegBankSelect::init(MachineFunction &MF) {
     MBPI = nullptr;
   }
   MIRBuilder.setMF(MF);
+  MORE = make_unique<MachineOptimizationRemarkEmitter>(MF, MBFI);
 }
 
 void RegBankSelect::getAnalysisUsage(AnalysisUsage &AU) const {
@@ -585,18 +587,12 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) {
   // LegalizerInfo as it's currently in the separate GlobalISel library.
   const MachineRegisterInfo &MRI = MF.getRegInfo();
   if (const LegalizerInfo *MLI = MF.getSubtarget().getLegalizerInfo()) {
-    for (const MachineBasicBlock &MBB : MF) {
-      for (const MachineInstr &MI : MBB) {
+    for (MachineBasicBlock &MBB : MF) {
+      for (MachineInstr &MI : MBB) {
         if (isPreISelGenericOpcode(MI.getOpcode()) && !MLI->isLegal(MI, MRI)) {
-          if (!TPC->isGlobalISelAbortEnabled()) {
-            MF.getProperties().set(
-                MachineFunctionProperties::Property::FailedISel);
-            return false;
-          }
-          std::string ErrStorage;
-          raw_string_ostream Err(ErrStorage);
-          Err << "Instruction is not legal: " << MI << '\n';
-          report_fatal_error(Err.str());
+          reportGISelFailure(MF, *TPC, *MORE, "gisel-regbankselect",
+                             "instruction is not legal", MI);
+          return false;
         }
       }
     }
@@ -622,9 +618,8 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) {
         continue;
 
       if (!assignInstr(MI)) {
-        if (TPC->isGlobalISelAbortEnabled())
-          report_fatal_error("Unable to map instruction");
-        MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
+        reportGISelFailure(MF, *TPC, *MORE, "gisel-regbankselect",
+                           "unable to map instruction", MI);
         return false;
       }
     }
diff --git a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
index 74161e3c6bdf487bc5fb29776767ca96ad4e345e..b2df2f1596769d1bf85fe144f01d55e03346604d 100644
--- a/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
+++ b/lib/CodeGen/GlobalISel/RegisterBankInfo.cpp
@@ -126,15 +126,26 @@ const TargetRegisterClass *RegisterBankInfo::constrainGenericRegister(
   return &RC;
 }
 
+/// Check whether or not \p MI should be treated like a copy
+/// for the mappings.
+/// Copy-like instructions are special for mapping because
+/// they don't have actual register constraints. Moreover,
+/// they sometimes have register classes assigned and we can
+/// just use that instead of failing to provide a generic mapping.
+static bool isCopyLike(const MachineInstr &MI) {
+  return MI.isCopy() || MI.isPHI() ||
+         MI.getOpcode() == TargetOpcode::REG_SEQUENCE;
+}
+
 RegisterBankInfo::InstructionMapping
 RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
   // For copies we want to walk over the operands and try to find one
   // that has a register bank since the instruction itself will not get
   // us any constraint.
-  bool isCopyLike = MI.isCopy() || MI.isPHI();
+  bool IsCopyLike = isCopyLike(MI);
   // For copy like instruction, only the mapping of the definition
   // is important. The rest is not constrained.
-  unsigned NumOperandsForMapping = isCopyLike ? 1 : MI.getNumOperands();
+  unsigned NumOperandsForMapping = IsCopyLike ? 1 : MI.getNumOperands();
 
   RegisterBankInfo::InstructionMapping Mapping(DefaultMappingID, /*Cost*/ 1,
                                                /*OperandsMapping*/ nullptr,
@@ -168,7 +179,7 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
     // For copy-like instruction, we want to reuse the register bank
     // that is already set on Reg, if any, since those instructions do
     // not have any constraints.
-    const RegisterBank *CurRegBank = isCopyLike ? AltRegBank : nullptr;
+    const RegisterBank *CurRegBank = IsCopyLike ? AltRegBank : nullptr;
     if (!CurRegBank) {
       // If this is a target specific instruction, we can deduce
       // the register bank from the encoding constraints.
@@ -177,7 +188,7 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
         // All our attempts failed, give up.
         CompleteMapping = false;
 
-        if (!isCopyLike)
+        if (!IsCopyLike)
           // MI does not carry enough information to guess the mapping.
           return InstructionMapping();
         continue;
@@ -185,7 +196,7 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
     }
     const ValueMapping *ValMapping =
         &getValueMapping(0, getSizeInBits(Reg, MRI, TRI), *CurRegBank);
-    if (isCopyLike) {
+    if (IsCopyLike) {
       OperandsMapping[0] = ValMapping;
       CompleteMapping = true;
       break;
@@ -193,7 +204,7 @@ RegisterBankInfo::getInstrMappingImpl(const MachineInstr &MI) const {
     OperandsMapping[OpIdx] = ValMapping;
   }
 
-  if (isCopyLike && !CompleteMapping)
+  if (IsCopyLike && !CompleteMapping)
     // No way to deduce the type from what we have.
     return InstructionMapping();
 
@@ -352,6 +363,13 @@ void RegisterBankInfo::applyDefaultMapping(const OperandsMapper &OpdMapper) {
       DEBUG(dbgs() << " is not a register, nothing to be done\n");
       continue;
     }
+    if (!MO.getReg()) {
+      DEBUG(dbgs() << " is %%noreg, nothing to be done\n");
+      continue;
+    }
+    assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns !=
+               0 &&
+           "Invalid mapping");
     assert(OpdMapper.getInstrMapping().getOperandMapping(OpIdx).NumBreakDowns ==
                1 &&
            "This mapping is too complex for this function");
@@ -485,8 +503,7 @@ bool RegisterBankInfo::InstructionMapping::verify(
   // Check that all the register operands are properly mapped.
   // Check the constructor invariant.
   // For PHI, we only care about mapping the definition.
-  assert(NumOperands ==
-             ((MI.isCopy() || MI.isPHI()) ? 1 : MI.getNumOperands()) &&
+  assert(NumOperands == (isCopyLike(MI) ? 1 : MI.getNumOperands()) &&
          "NumOperands must match, see constructor");
   assert(MI.getParent() && MI.getParent()->getParent() &&
          "MI must be connected to a MachineFunction");
diff --git a/lib/CodeGen/GlobalISel/Utils.cpp b/lib/CodeGen/GlobalISel/Utils.cpp
index e50091833c26d93e93ccfdc2e7cb5e0d8322dec0..606a59680a3d4a618caedf55077da371be44df55 100644
--- a/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/lib/CodeGen/GlobalISel/Utils.cpp
@@ -11,10 +11,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 
@@ -43,3 +46,50 @@ unsigned llvm::constrainOperandRegClass(
 
   return Reg;
 }
+
+bool llvm::isTriviallyDead(const MachineInstr &MI,
+                           const MachineRegisterInfo &MRI) {
+  // If we can move an instruction, we can remove it.  Otherwise, it has
+  // a side-effect of some sort.
+  bool SawStore = false;
+  if (!MI.isSafeToMove(/*AA=*/nullptr, SawStore))
+    return false;
+
+  // Instructions without side-effects are dead iff they only define dead vregs.
+  for (auto &MO : MI.operands()) {
+    if (!MO.isReg() || !MO.isDef())
+      continue;
+
+    unsigned Reg = MO.getReg();
+    if (TargetRegisterInfo::isPhysicalRegister(Reg) ||
+        !MRI.use_nodbg_empty(Reg))
+      return false;
+  }
+  return true;
+}
+
+void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC,
+                              MachineOptimizationRemarkEmitter &MORE,
+                              MachineOptimizationRemarkMissed &R) {
+  MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
+
+  // Print the function name explicitly if we don't have a debug location (which
+  // makes the diagnostic less useful) or if we're going to emit a raw error.
+  if (!R.getLocation().isValid() || TPC.isGlobalISelAbortEnabled())
+    R << (" (in function: " + MF.getName() + ")").str();
+
+  if (TPC.isGlobalISelAbortEnabled())
+    report_fatal_error(R.getMsg());
+  else
+    MORE.emit(R);
+}
+
+void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC,
+                              MachineOptimizationRemarkEmitter &MORE,
+                              const char *PassName, StringRef Msg,
+                              const MachineInstr &MI) {
+  MachineOptimizationRemarkMissed R(PassName, "GISelFailure: ",
+                                    MI.getDebugLoc(), MI.getParent());
+  R << Msg << ": " << ore::MNV("Inst", MI);
+  reportGISelFailure(MF, TPC, MORE, R);
+}
diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp
index 8b87aa16b8193c12dee3522e0e686652470a2577..37fe41582333dfd0bd036f0de00062a38f5e4845 100644
--- a/lib/CodeGen/IfConversion.cpp
+++ b/lib/CodeGen/IfConversion.cpp
@@ -2148,7 +2148,8 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
   // unknown probabilities into known ones.
   // FIXME: This usage is too tricky and in the future we would like to
   // eliminate all unknown probabilities in MBB.
-  ToBBI.BB->normalizeSuccProbs();
+  if (ToBBI.IsBrAnalyzable)
+    ToBBI.BB->normalizeSuccProbs();
 
   SmallVector<MachineBasicBlock *, 4> FromSuccs(FromMBB.succ_begin(),
                                                 FromMBB.succ_end());
@@ -2228,7 +2229,8 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
 
   // Normalize the probabilities of ToBBI.BB's successors with all adjustment
   // we've done above.
-  ToBBI.BB->normalizeSuccProbs();
+  if (ToBBI.IsBrAnalyzable && FromBBI.IsBrAnalyzable)
+    ToBBI.BB->normalizeSuccProbs();
 
   ToBBI.Predicate.append(FromBBI.Predicate.begin(), FromBBI.Predicate.end());
   FromBBI.Predicate.clear();
diff --git a/lib/CodeGen/ImplicitNullChecks.cpp b/lib/CodeGen/ImplicitNullChecks.cpp
index 0440555342872e0e6db3214fe18cc91b3961e5b4..920c2a372a9b8d3416ef1f48f5bc3a177ec27594 100644
--- a/lib/CodeGen/ImplicitNullChecks.cpp
+++ b/lib/CodeGen/ImplicitNullChecks.cpp
@@ -22,6 +22,7 @@
 // With the help of a runtime that understands the .fault_maps section,
 // faulting_load_op branches to throw_npe if executing movl (%r10), %esi incurs
 // a page fault.
+// Store and LoadStore are also supported.
 //
 //===----------------------------------------------------------------------===//
 
@@ -29,6 +30,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/FaultMaps.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
@@ -151,25 +153,44 @@ class ImplicitNullChecks : public MachineFunctionPass {
   const TargetRegisterInfo *TRI = nullptr;
   AliasAnalysis *AA = nullptr;
   MachineModuleInfo *MMI = nullptr;
+  MachineFrameInfo *MFI = nullptr;
 
   bool analyzeBlockForNullChecks(MachineBasicBlock &MBB,
                                  SmallVectorImpl<NullCheck> &NullCheckList);
-  MachineInstr *insertFaultingLoad(MachineInstr *LoadMI, MachineBasicBlock *MBB,
-                                   MachineBasicBlock *HandlerMBB);
+  MachineInstr *insertFaultingInstr(MachineInstr *MI, MachineBasicBlock *MBB,
+                                    MachineBasicBlock *HandlerMBB);
   void rewriteNullChecks(ArrayRef<NullCheck> NullCheckList);
 
-  /// Is \p MI a memory operation that can be used to implicitly null check the
-  /// value in \p PointerReg?  \p PrevInsts is the set of instruction seen since
+  enum AliasResult {
+    AR_NoAlias,
+    AR_MayAlias,
+    AR_WillAliasEverything
+  };
+  /// Returns AR_NoAlias if \p MI memory operation does not alias with
+  /// \p PrevMI, AR_MayAlias if they may alias and AR_WillAliasEverything if
+  /// they may alias and any further memory operation may alias with \p PrevMI.
+  AliasResult areMemoryOpsAliased(MachineInstr &MI, MachineInstr *PrevMI);
+
+  enum SuitabilityResult {
+    SR_Suitable,
+    SR_Unsuitable,
+    SR_Impossible
+  };
+  /// Return SR_Suitable if \p MI is a memory operation that can be used to
+  /// implicitly null check the value in \p PointerReg, SR_Unsuitable if
+  /// \p MI cannot be used for the null check, and SR_Impossible if there is
+  /// no point in continuing the lookup because no later instruction can be
+  /// used either. \p PrevInsts is the set of instructions seen since
   /// the explicit null check on \p PointerReg.
-  bool isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg,
-                          ArrayRef<MachineInstr *> PrevInsts);
+  SuitabilityResult isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg,
+                                       ArrayRef<MachineInstr *> PrevInsts);
 
   /// Return true if \p FaultingMI can be hoisted from after the the
   /// instructions in \p InstsSeenSoFar to before them.  Set \p Dependence to a
   /// non-null value if we also need to (and legally can) hoist a depedency.
-  bool canHoistLoadInst(MachineInstr *FaultingMI, unsigned PointerReg,
-                        ArrayRef<MachineInstr *> InstsSeenSoFar,
-                        MachineBasicBlock *NullSucc, MachineInstr *&Dependence);
+  bool canHoistInst(MachineInstr *FaultingMI, unsigned PointerReg,
+                    ArrayRef<MachineInstr *> InstsSeenSoFar,
+                    MachineBasicBlock *NullSucc, MachineInstr *&Dependence);
 
 public:
   static char ID;
@@ -193,7 +214,7 @@ public:
 }
 
 bool ImplicitNullChecks::canHandle(const MachineInstr *MI) {
-  if (MI->isCall() || MI->mayStore() || MI->hasUnmodeledSideEffects())
+  if (MI->isCall() || MI->hasUnmodeledSideEffects())
     return false;
   auto IsRegMask = [](const MachineOperand &MO) { return MO.isRegMask(); };
   (void)IsRegMask;
@@ -248,7 +269,7 @@ bool ImplicitNullChecks::canReorder(const MachineInstr *A,
 
       unsigned RegB = MOB.getReg();
 
-      if (TRI->regsOverlap(RegA, RegB))
+      if (TRI->regsOverlap(RegA, RegB) && (MOA.isDef() || MOB.isDef()))
         return false;
     }
   }
@@ -260,6 +281,7 @@ bool ImplicitNullChecks::runOnMachineFunction(MachineFunction &MF) {
   TII = MF.getSubtarget().getInstrInfo();
   TRI = MF.getRegInfo().getTargetRegisterInfo();
   MMI = &MF.getMMI();
+  MFI = &MF.getFrameInfo();
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 
   SmallVector<NullCheck, 16> NullCheckList;
@@ -283,36 +305,91 @@ static bool AnyAliasLiveIn(const TargetRegisterInfo *TRI,
   return false;
 }
 
-bool ImplicitNullChecks::isSuitableMemoryOp(
-    MachineInstr &MI, unsigned PointerReg, ArrayRef<MachineInstr *> PrevInsts) {
+ImplicitNullChecks::AliasResult
+ImplicitNullChecks::areMemoryOpsAliased(MachineInstr &MI,
+                                        MachineInstr *PrevMI) {
+  // PrevMI does not access memory, so there is nothing to alias with.
+  if (!(PrevMI->mayStore() || PrevMI->mayLoad()))
+    return AR_NoAlias;
+  // Neither instruction may store: a pair of loads is reported as no-alias.
+  if (!(MI.mayStore() || PrevMI->mayStore()))
+    return AR_NoAlias;
+  // Memory operand info is missing, so conservatively report aliasing. If the
+  // operand-less instruction may store, report AR_WillAliasEverything instead.
+  if (MI.memoperands_empty())
+    return MI.mayStore() ? AR_WillAliasEverything : AR_MayAlias;
+  if (PrevMI->memoperands_empty())
+    return PrevMI->mayStore() ? AR_WillAliasEverything : AR_MayAlias;
+
+  for (MachineMemOperand *MMO1 : MI.memoperands()) {
+    // MMO1 is expected to carry an IR Value because it belongs to the
+    // operation we would like to use as the implicit null check.
+    assert(MMO1->getValue() && "MMO1 should have a Value!");
+    for (MachineMemOperand *MMO2 : PrevMI->memoperands()) {
+      if (const PseudoSourceValue *PSV = MMO2->getPseudoValue()) {
+        if (PSV->mayAlias(MFI))
+          return AR_MayAlias;
+        continue;
+      }
+      llvm::AliasResult AAResult = AA->alias(
+          MemoryLocation(MMO1->getValue(), MemoryLocation::UnknownSize,
+                         MMO1->getAAInfo()),
+          MemoryLocation(MMO2->getValue(), MemoryLocation::UnknownSize,
+                         MMO2->getAAInfo()));
+      if (AAResult != NoAlias)
+        return AR_MayAlias;
+    }
+  }
+  return AR_NoAlias;
+}
+
+ImplicitNullChecks::SuitabilityResult
+ImplicitNullChecks::isSuitableMemoryOp(MachineInstr &MI, unsigned PointerReg,
+                                       ArrayRef<MachineInstr *> PrevInsts) {
   int64_t Offset;
   unsigned BaseReg;
 
   if (!TII->getMemOpBaseRegImmOfs(MI, BaseReg, Offset, TRI) ||
       BaseReg != PointerReg)
-    return false;
-
-  // We want the load to be issued at a sane offset from PointerReg, so that
-  // if PointerReg is null then the load reliably page faults.
-  if (!(MI.mayLoad() && !MI.isPredicable() && Offset < PageSize))
-    return false;
-
-  // Finally, we need to make sure that the load instruction actually is
-  // loading from PointerReg, and there isn't some re-definition of PointerReg
-  // between the compare and the load.
+    return SR_Unsuitable;
+
+  // We want the mem access to be issued at a sane offset from PointerReg,
+  // so that if PointerReg is null then the access reliably page faults.
+  if (!((MI.mayLoad() || MI.mayStore()) && !MI.isPredicable() &&
+        Offset < PageSize))
+    return SR_Unsuitable;
+
+  // Finally, we need to make sure that the access instruction actually is
+  // accessing from PointerReg, and there isn't some re-definition of PointerReg
+  // between the compare and the memory access.
+  // If PointerReg has already been redefined, there is no point in continuing
+  // the lookup because this condition will fail for every later instruction.
+  SuitabilityResult Suitable = SR_Suitable;
   for (auto *PrevMI : PrevInsts)
-    for (auto &PrevMO : PrevMI->operands())
-      if (PrevMO.isReg() && PrevMO.getReg() &&
+    for (auto &PrevMO : PrevMI->operands()) {
+      if (PrevMO.isReg() && PrevMO.getReg() && PrevMO.isDef() &&
           TRI->regsOverlap(PrevMO.getReg(), PointerReg))
-        return false;
-
-  return true;
+        return SR_Impossible;
+
+      // Check whether the current memory access aliases with previous one.
+      // If we already found that it aliases then no need to continue.
+      // But we continue base pointer check as it can result in SR_Impossible.
+      if (Suitable == SR_Suitable) {
+        AliasResult AR = areMemoryOpsAliased(MI, PrevMI);
+        if (AR == AR_WillAliasEverything)
+          return SR_Impossible;
+        if (AR == AR_MayAlias)
+          Suitable = SR_Unsuitable;
+      }
+    }
+  return Suitable;
 }
 
-bool ImplicitNullChecks::canHoistLoadInst(
-    MachineInstr *FaultingMI, unsigned PointerReg,
-    ArrayRef<MachineInstr *> InstsSeenSoFar, MachineBasicBlock *NullSucc,
-    MachineInstr *&Dependence) {
+bool ImplicitNullChecks::canHoistInst(MachineInstr *FaultingMI,
+                                      unsigned PointerReg,
+                                      ArrayRef<MachineInstr *> InstsSeenSoFar,
+                                      MachineBasicBlock *NullSucc,
+                                      MachineInstr *&Dependence) {
   auto DepResult = computeDependence(FaultingMI, InstsSeenSoFar);
   if (!DepResult.CanReorder)
     return false;
@@ -359,7 +436,8 @@ bool ImplicitNullChecks::canHoistLoadInst(
     // The Dependency can't be re-defining the base register -- then we won't
     // get the memory operation on the address we want.  This is already
     // checked in \c IsSuitableMemoryOp.
-    assert(!TRI->regsOverlap(DependenceMO.getReg(), PointerReg) &&
+    assert(!(DependenceMO.isDef() &&
+             TRI->regsOverlap(DependenceMO.getReg(), PointerReg)) &&
            "Should have been checked before!");
   }
 
@@ -481,9 +559,11 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
       return false;
 
     MachineInstr *Dependence;
-    if (isSuitableMemoryOp(MI, PointerReg, InstsSeenSoFar) &&
-        canHoistLoadInst(&MI, PointerReg, InstsSeenSoFar, NullSucc,
-                         Dependence)) {
+    SuitabilityResult SR = isSuitableMemoryOp(MI, PointerReg, InstsSeenSoFar);
+    if (SR == SR_Impossible)
+      return false;
+    if (SR == SR_Suitable &&
+        canHoistInst(&MI, PointerReg, InstsSeenSoFar, NullSucc, Dependence)) {
       NullCheckList.emplace_back(&MI, MBP.ConditionDef, &MBB, NotNullSucc,
                                  NullSucc, Dependence);
       return true;
@@ -495,36 +575,42 @@ bool ImplicitNullChecks::analyzeBlockForNullChecks(
   return false;
 }
 
-/// Wrap a machine load instruction, LoadMI, into a FAULTING_LOAD_OP machine
-/// instruction.  The FAULTING_LOAD_OP instruction does the same load as LoadMI
-/// (defining the same register), and branches to HandlerMBB if the load
-/// faults.  The FAULTING_LOAD_OP instruction is inserted at the end of MBB.
-MachineInstr *
-ImplicitNullChecks::insertFaultingLoad(MachineInstr *LoadMI,
-                                       MachineBasicBlock *MBB,
-                                       MachineBasicBlock *HandlerMBB) {
+/// Wrap a machine instruction, MI, into a FAULTING machine instruction.
+/// The FAULTING instruction does the same load/store as MI
+/// (defining the same register), and branches to HandlerMBB if the mem access
+/// faults.  The FAULTING instruction is inserted at the end of MBB.
+MachineInstr *ImplicitNullChecks::insertFaultingInstr(
+    MachineInstr *MI, MachineBasicBlock *MBB, MachineBasicBlock *HandlerMBB) {
   const unsigned NoRegister = 0; // Guaranteed to be the NoRegister value for
                                  // all targets.
 
   DebugLoc DL;
-  unsigned NumDefs = LoadMI->getDesc().getNumDefs();
+  unsigned NumDefs = MI->getDesc().getNumDefs();
   assert(NumDefs <= 1 && "other cases unhandled!");
 
   unsigned DefReg = NoRegister;
   if (NumDefs != 0) {
-    DefReg = LoadMI->defs().begin()->getReg();
-    assert(std::distance(LoadMI->defs().begin(), LoadMI->defs().end()) == 1 &&
+    DefReg = MI->defs().begin()->getReg();
+    assert(std::distance(MI->defs().begin(), MI->defs().end()) == 1 &&
            "expected exactly one def!");
   }
 
-  auto MIB = BuildMI(MBB, DL, TII->get(TargetOpcode::FAULTING_LOAD_OP), DefReg)
+  FaultMaps::FaultKind FK;
+  if (MI->mayLoad())
+    FK =
+        MI->mayStore() ? FaultMaps::FaultingLoadStore : FaultMaps::FaultingLoad;
+  else
+    FK = FaultMaps::FaultingStore;
+
+  auto MIB = BuildMI(MBB, DL, TII->get(TargetOpcode::FAULTING_OP), DefReg)
+                 .addImm(FK)
                  .addMBB(HandlerMBB)
-                 .addImm(LoadMI->getOpcode());
+                 .addImm(MI->getOpcode());
 
-  for (auto &MO : LoadMI->uses())
+  for (auto &MO : MI->uses())
     MIB.add(MO);
 
-  MIB.setMemRefs(LoadMI->memoperands_begin(), LoadMI->memoperands_end());
+  MIB.setMemRefs(MI->memoperands_begin(), MI->memoperands_end());
 
   return MIB;
 }
@@ -545,18 +631,18 @@ void ImplicitNullChecks::rewriteNullChecks(
       NC.getCheckBlock()->insert(NC.getCheckBlock()->end(), DepMI);
     }
 
-    // Insert a faulting load where the conditional branch was originally.  We
-    // check earlier ensures that this bit of code motion is legal.  We do not
-    // touch the successors list for any basic block since we haven't changed
-    // control flow, we've just made it implicit.
-    MachineInstr *FaultingLoad = insertFaultingLoad(
+    // Insert a faulting instruction where the conditional branch was
+    // originally. A check earlier ensures that this bit of code motion
+    // is legal.  We do not touch the successors list for any basic block
+    // since we haven't changed control flow, we've just made it implicit.
+    MachineInstr *FaultingInstr = insertFaultingInstr(
         NC.getMemOperation(), NC.getCheckBlock(), NC.getNullSucc());
     // Now the values defined by MemOperation, if any, are live-in of
     // the block of MemOperation.
-    // The original load operation may define implicit-defs alongside
-    // the loaded value.
+    // The original operation may define implicit-defs alongside
+    // the value.
     MachineBasicBlock *MBB = NC.getMemOperation()->getParent();
-    for (const MachineOperand &MO : FaultingLoad->operands()) {
+    for (const MachineOperand &MO : FaultingInstr->operands()) {
       if (!MO.isReg() || !MO.isDef())
         continue;
       unsigned Reg = MO.getReg();
diff --git a/lib/CodeGen/InlineSpiller.cpp b/lib/CodeGen/InlineSpiller.cpp
index 3d81184f774a3f58e2057fdaa9d532902edd14f8..a1cb0a0695bfa85a75934546a3a4873ded3d35ad 100644
--- a/lib/CodeGen/InlineSpiller.cpp
+++ b/lib/CodeGen/InlineSpiller.cpp
@@ -558,7 +558,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
       Edit->rematerializeAt(*MI.getParent(), MI, NewVReg, RM, TRI);
 
   // We take the DebugLoc from MI, since OrigMI may be attributed to a
-  // different source location. 
+  // different source location.
   auto *NewMI = LIS.getInstructionFromIndex(DefIdx);
   NewMI->setDebugLoc(MI.getDebugLoc());
 
@@ -686,7 +686,8 @@ bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, unsigned Reg) {
   return true;
 }
 
-#if !defined(NDEBUG)
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
 // Dump the range of instructions from B to E with their slot indexes.
 static void dumpMachineInstrRangeWithSlotIndex(MachineBasicBlock::iterator B,
                                                MachineBasicBlock::iterator E,
diff --git a/lib/CodeGen/InterleavedAccessPass.cpp b/lib/CodeGen/InterleavedAccessPass.cpp
index c8f79d7fb71cdc693e98d80d9f6f4c08e7ed4b8e..ec35b3f6449e168f319922b99b4d19ac428a17b1 100644
--- a/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/lib/CodeGen/InterleavedAccessPass.cpp
@@ -174,7 +174,7 @@ static bool isDeInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
 /// I.e. <0, LaneLen, ... , LaneLen*(Factor - 1), 1, LaneLen + 1, ...>
 /// E.g. For a Factor of 2 (LaneLen=4): <0, 4, 1, 5, 2, 6, 3, 7>
 static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
-                               unsigned MaxFactor) {
+                               unsigned MaxFactor, unsigned OpNumElts) {
   unsigned NumElts = Mask.size();
   if (NumElts < 4)
     return false;
@@ -246,6 +246,9 @@ static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
 
       if (StartMask < 0)
         break;
+      // We must stay within the vectors; This case can happen with undefs.
+      if (StartMask + LaneLen > OpNumElts*2)
+        break;
     }
 
     // Found an interleaved mask of current factor.
@@ -406,7 +409,8 @@ bool InterleavedAccess::lowerInterleavedStore(
 
   // Check if the shufflevector is RE-interleave shuffle.
   unsigned Factor;
-  if (!isReInterleaveMask(SVI->getShuffleMask(), Factor, MaxFactor))
+  unsigned OpNumElts = SVI->getOperand(0)->getType()->getVectorNumElements();
+  if (!isReInterleaveMask(SVI->getShuffleMask(), Factor, MaxFactor, OpNumElts))
     return false;
 
   DEBUG(dbgs() << "IA: Found an interleaved store: " << *SI << "\n");
diff --git a/lib/CodeGen/IntrinsicLowering.cpp b/lib/CodeGen/IntrinsicLowering.cpp
index afd24067ace76342f4e7aa8a98dc61f97bab78a8..c6cc909e25d38a147f7a6fdd93781731584e8d05 100644
--- a/lib/CodeGen/IntrinsicLowering.cpp
+++ b/lib/CodeGen/IntrinsicLowering.cpp
@@ -115,21 +115,21 @@ void IntrinsicLowering::AddPrototypes(Module &M) {
           Type::getInt8PtrTy(Context),
                               Type::getInt8PtrTy(Context), 
                               Type::getInt8PtrTy(Context), 
-                              DL.getIntPtrType(Context), nullptr);
+                              DL.getIntPtrType(Context));
         break;
       case Intrinsic::memmove:
         M.getOrInsertFunction("memmove",
           Type::getInt8PtrTy(Context),
                               Type::getInt8PtrTy(Context), 
                               Type::getInt8PtrTy(Context), 
-                              DL.getIntPtrType(Context), nullptr);
+                              DL.getIntPtrType(Context));
         break;
       case Intrinsic::memset:
         M.getOrInsertFunction("memset",
           Type::getInt8PtrTy(Context),
                               Type::getInt8PtrTy(Context), 
                               Type::getInt32Ty(M.getContext()), 
-                              DL.getIntPtrType(Context), nullptr);
+                              DL.getIntPtrType(Context));
         break;
       case Intrinsic::sqrt:
         EnsureFPIntrinsicsExist(M, F, "sqrtf", "sqrt", "sqrtl");
diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp
index 367fd66304ac5afe9c0512fe473285a88985ea39..7b1706f0f4ba9336e181639c45657ce2a9385409 100644
--- a/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/lib/CodeGen/LLVMTargetMachine.cpp
@@ -42,8 +42,8 @@ static cl::opt<cl::boolOrDefault>
 EnableFastISelOption("fast-isel", cl::Hidden,
   cl::desc("Enable the \"fast\" instruction selector"));
 
-static cl::opt<bool>
-    EnableGlobalISel("global-isel", cl::Hidden, cl::init(false),
+static cl::opt<cl::boolOrDefault>
+    EnableGlobalISel("global-isel", cl::Hidden,
                      cl::desc("Enable the \"global\" instruction selector"));
 
 void LLVMTargetMachine::initAsmInfo() {
@@ -149,7 +149,9 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
     TM->setFastISel(true);
 
   // Ask the target for an isel.
-  if (LLVM_UNLIKELY(EnableGlobalISel)) {
+  // Enable GlobalISel if the target wants to, but allow that to be overridden.
+  if (EnableGlobalISel == cl::BOU_TRUE || (EnableGlobalISel == cl::BOU_UNSET &&
+                                           PassConfig->isGlobalISelEnabled())) {
     if (PassConfig->addIRTranslator())
       return nullptr;
 
@@ -177,7 +179,7 @@ addPassesToGenerateCode(LLVMTargetMachine *TM, PassManagerBase &PM,
 
     // Provide a fallback path when we do not want to abort on
     // not-yet-supported input.
-    if (LLVM_UNLIKELY(!PassConfig->isGlobalISelAbortEnabled()) &&
+    if (!PassConfig->isGlobalISelAbortEnabled() &&
         PassConfig->addInstSelector())
       return nullptr;
 
diff --git a/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp b/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..996d40ca6e1eefe51cee58c9a40f928d5eb2d47d
--- /dev/null
+++ b/lib/CodeGen/LazyMachineBlockFrequencyInfo.cpp
@@ -0,0 +1,97 @@
+///===- LazyMachineBlockFrequencyInfo.cpp - Lazy Machine Block Frequency --===//
+///
+///                     The LLVM Compiler Infrastructure
+///
+/// This file is distributed under the University of Illinois Open Source
+/// License. See LICENSE.TXT for details.
+///
+///===---------------------------------------------------------------------===//
+/// \file
+/// This is an alternative analysis pass to MachineBlockFrequencyInfo.  The
+/// difference is that with this pass the block frequencies are not computed
+/// when the analysis pass is executed but rather when the BFI result is
+/// explicitly requested by the analysis client.
+///
+///===---------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "lazy-machine-block-freq"
+
+INITIALIZE_PASS_BEGIN(LazyMachineBlockFrequencyInfoPass, DEBUG_TYPE,
+                      "Lazy Machine Block Frequency Analysis", true, true)
+INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_END(LazyMachineBlockFrequencyInfoPass, DEBUG_TYPE,
+                    "Lazy Machine Block Frequency Analysis", true, true)
+
+char LazyMachineBlockFrequencyInfoPass::ID = 0;
+
+LazyMachineBlockFrequencyInfoPass::LazyMachineBlockFrequencyInfoPass()
+    : MachineFunctionPass(ID) {
+  initializeLazyMachineBlockFrequencyInfoPassPass(
+      *PassRegistry::getPassRegistry());
+}
+
+void LazyMachineBlockFrequencyInfoPass::print(raw_ostream &OS,
+                                              const Module *M) const {
+  getBFI().print(OS, M);
+}
+
+void LazyMachineBlockFrequencyInfoPass::getAnalysisUsage(
+    AnalysisUsage &AU) const {
+  AU.addRequired<MachineBranchProbabilityInfo>();
+  AU.setPreservesAll();
+  MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+void LazyMachineBlockFrequencyInfoPass::releaseMemory() {
+  OwnedMBFI.reset();
+  OwnedMLI.reset();
+  OwnedMDT.reset();
+}
+
+MachineBlockFrequencyInfo &
+LazyMachineBlockFrequencyInfoPass::calculateIfNotAvailable() const {
+  // Reuse the regular analysis result whenever one is already available.
+  if (auto *Cached = getAnalysisIfAvailable<MachineBlockFrequencyInfo>()) {
+    DEBUG(dbgs() << "MachineBlockFrequencyInfo is available\n");
+    return *Cached;
+  }
+
+  auto &BranchProbs = getAnalysis<MachineBranchProbabilityInfo>();
+  auto *Loops = getAnalysisIfAvailable<MachineLoopInfo>();
+  auto *DomTree = getAnalysisIfAvailable<MachineDominatorTree>();
+  DEBUG(dbgs() << "Building MachineBlockFrequencyInfo on the fly\n");
+  DEBUG(if (Loops) dbgs() << "LoopInfo is available\n");
+
+  if (Loops == nullptr) {
+    DEBUG(dbgs() << "Building LoopInfo on the fly\n");
+    // Loop analysis needs a dominator tree; build one only if it is missing.
+    DEBUG(if (DomTree) dbgs() << "DominatorTree is available\n");
+
+    if (DomTree == nullptr) {
+      DEBUG(dbgs() << "Building DominatorTree on the fly\n");
+      OwnedMDT = make_unique<MachineDominatorTree>();
+      OwnedMDT->getBase().recalculate(*MF);
+      DomTree = OwnedMDT.get();
+    }
+
+    // Derive the loop info from the dominator tree and keep it owned here.
+    OwnedMLI = make_unique<MachineLoopInfo>();
+    OwnedMLI->getBase().analyze(DomTree->getBase());
+    Loops = OwnedMLI.get();
+  }
+  // Compute the frequencies into the pass-owned object and hand it back.
+  OwnedMBFI = make_unique<MachineBlockFrequencyInfo>();
+  OwnedMBFI->calculate(*MF, BranchProbs, *Loops);
+  return *OwnedMBFI;
+}
+
+bool LazyMachineBlockFrequencyInfoPass::runOnMachineFunction(
+    MachineFunction &F) {
+  MF = &F;
+  return false;
+}
diff --git a/lib/CodeGen/LexicalScopes.cpp b/lib/CodeGen/LexicalScopes.cpp
index fc84ddb1f6b6b3f4a2b2d0d626c0261620751968..275d84e2c185ff17e6d421f5b2619e2feecf8095 100644
--- a/lib/CodeGen/LexicalScopes.cpp
+++ b/lib/CodeGen/LexicalScopes.cpp
@@ -14,14 +14,23 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/Function.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <string>
+#include <tuple>
+#include <utility>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "lexicalscopes"
@@ -38,6 +47,10 @@ void LexicalScopes::reset() {
 
 /// initialize - Scan machine function and constuct lexical scope nest.
 void LexicalScopes::initialize(const MachineFunction &Fn) {
   reset();
+  // State is reset above; skip scope creation for a NoDebug compile unit.
+  if (Fn.getFunction()->getSubprogram()->getUnit()->getEmissionKind() ==
+      DICompileUnit::NoDebug)
+    return;
   MF = &Fn;
   SmallVector<InsnRange, 4> MIRanges;
@@ -54,7 +67,6 @@ void LexicalScopes::initialize(const MachineFunction &Fn) {
 void LexicalScopes::extractLexicalScopes(
     SmallVectorImpl<InsnRange> &MIRanges,
     DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) {
-
   // Scan each instruction and create scopes. First build working set of scopes.
   for (const auto &MBB : *MF) {
     const MachineInstr *RangeBeginMI = nullptr;
@@ -127,6 +139,10 @@ LexicalScope *LexicalScopes::findLexicalScope(const DILocation *DL) {
 LexicalScope *LexicalScopes::getOrCreateLexicalScope(const DILocalScope *Scope,
                                                      const DILocation *IA) {
   if (IA) {
+    // Skip scopes inlined from a NoDebug compile unit.
+    if (Scope->getSubprogram()->getUnit()->getEmissionKind() ==
+        DICompileUnit::NoDebug)
+      return getOrCreateLexicalScope(IA);
     // Create an abstract scope for inlined function.
     getOrCreateAbstractScope(Scope);
     // Create an inlined scope for inlined function.
@@ -181,10 +197,9 @@ LexicalScopes::getOrCreateInlinedScope(const DILocalScope *Scope,
   else
     Parent = getOrCreateLexicalScope(InlinedAt);
 
-  I = InlinedLexicalScopeMap.emplace(std::piecewise_construct,
-                                     std::forward_as_tuple(P),
-                                     std::forward_as_tuple(Parent, Scope,
-                                                           InlinedAt, false))
+  I = InlinedLexicalScopeMap
+          .emplace(std::piecewise_construct, std::forward_as_tuple(P),
+                   std::forward_as_tuple(Parent, Scope, InlinedAt, false))
           .first;
   return &I->second;
 }
@@ -241,7 +256,6 @@ void LexicalScopes::constructScopeNest(LexicalScope *Scope) {
 void LexicalScopes::assignInstructionRanges(
     SmallVectorImpl<InsnRange> &MIRanges,
     DenseMap<const MachineInstr *, LexicalScope *> &MI2ScopeMap) {
-
   LexicalScope *PrevLexicalScope = nullptr;
   for (const auto &R : MIRanges) {
     LexicalScope *S = MI2ScopeMap.lookup(R.first);
diff --git a/lib/CodeGen/LiveDebugValues.cpp b/lib/CodeGen/LiveDebugValues.cpp
index 131167630d655daaab0560de2e4d6bcbfaa63563..f956974b1aafee9cc011aed41b21535d2dac05a5 100644
--- a/lib/CodeGen/LiveDebugValues.cpp
+++ b/lib/CodeGen/LiveDebugValues.cpp
@@ -24,13 +24,16 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/UniqueVector.h"
 #include "llvm/CodeGen/LexicalScopes.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetRegisterInfo.h"
@@ -61,6 +64,7 @@ class LiveDebugValues : public MachineFunctionPass {
 private:
   const TargetRegisterInfo *TRI;
   const TargetInstrInfo *TII;
+  const TargetFrameLowering *TFI;
   LexicalScopes LS;
 
   /// Keeps track of lexical scopes associated with a user value's source
@@ -127,11 +131,13 @@ private:
       if (int RegNo = isDbgValueDescribedByReg(MI)) {
         Kind = RegisterKind;
         Loc.RegisterLoc.RegNo = RegNo;
-        uint64_t Offset =
+        int64_t Offset =
             MI.isIndirectDebugValue() ? MI.getOperand(1).getImm() : 0;
         // We don't support offsets larger than 4GiB here. They are
         // slated to be replaced with DIExpressions anyway.
-        if (Offset >= (1ULL << 32))
+        // With indirect debug values used for spill locations, Offset
+        // can be negative.
+        if (Offset == INT64_MIN || std::abs(Offset) >= (1LL << 32))
           Kind = InvalidKind;
         else
           Loc.RegisterLoc.Offset = Offset;
@@ -169,6 +175,11 @@ private:
   typedef UniqueVector<VarLoc> VarLocMap;
   typedef SparseBitVector<> VarLocSet;
   typedef SmallDenseMap<const MachineBasicBlock *, VarLocSet> VarLocInMBB;
+  struct SpillDebugPair {
+    MachineInstr *SpillInst;
+    MachineInstr *DebugInst;
+  };
+  typedef SmallVector<SpillDebugPair, 4> SpillMap;
 
   /// This holds the working set of currently open ranges. For fast
   /// access, this is done both as a set of VarLocIDs, and a map of
@@ -218,14 +229,21 @@ private:
     }
   };
 
+  bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF,
+                          unsigned &Reg);
+  int extractSpillBaseRegAndOffset(const MachineInstr &MI, unsigned &Reg);
+
   void transferDebugValue(const MachineInstr &MI, OpenRangesSet &OpenRanges,
                           VarLocMap &VarLocIDs);
+  void transferSpillInst(MachineInstr &MI, OpenRangesSet &OpenRanges,
+                         VarLocMap &VarLocIDs, SpillMap &Spills);
   void transferRegisterDef(MachineInstr &MI, OpenRangesSet &OpenRanges,
                            const VarLocMap &VarLocIDs);
   bool transferTerminatorInst(MachineInstr &MI, OpenRangesSet &OpenRanges,
                               VarLocInMBB &OutLocs, const VarLocMap &VarLocIDs);
   bool transfer(MachineInstr &MI, OpenRangesSet &OpenRanges,
-                VarLocInMBB &OutLocs, VarLocMap &VarLocIDs);
+                VarLocInMBB &OutLocs, VarLocMap &VarLocIDs, SpillMap &Spills,
+                bool transferSpills);
 
   bool join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs,
             const VarLocMap &VarLocIDs,
@@ -305,6 +323,21 @@ void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF,
 }
 #endif
 
+/// Given a spill instruction \p MI, set \p Reg to the base register and return
+/// the offset that TargetFrameLowering reports for the spilled frame index.
+int LiveDebugValues::extractSpillBaseRegAndOffset(const MachineInstr &MI,
+                                                  unsigned &Reg) {
+  assert(MI.hasOneMemOperand() && 
+         "Spill instruction does not have exactly one memory operand?");
+  auto MMOI = MI.memoperands_begin();
+  const PseudoSourceValue *PVal = (*MMOI)->getPseudoValue();
+  assert(PVal->kind() == PseudoSourceValue::FixedStack &&
+         "Inconsistent memory operand in spill instruction");
+  int FI = cast<FixedStackPseudoSourceValue>(PVal)->getFrameIndex();
+  const MachineBasicBlock *MBB = MI.getParent();
+  return TFI->getFrameIndexReference(*MBB->getParent(), FI, Reg);
+}
+
 /// End all previous ranges related to @MI and start a new range from @MI
 /// if it is a DBG_VALUE instr.
 void LiveDebugValues::transferDebugValue(const MachineInstr &MI,
@@ -340,8 +373,12 @@ void LiveDebugValues::transferRegisterDef(MachineInstr &MI,
   unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
   SparseBitVector<> KillSet;
   for (const MachineOperand &MO : MI.operands()) {
+    // Determine whether the operand is a register def.  Assume that call
+    // instructions never clobber SP, because some backends (e.g., AArch64)
+    // never list SP in the regmask.
     if (MO.isReg() && MO.isDef() && MO.getReg() &&
-        TRI->isPhysicalRegister(MO.getReg())) {
+        TRI->isPhysicalRegister(MO.getReg()) &&
+        !(MI.isCall() && MO.getReg() == SP)) {
       // Remove ranges of all aliased registers.
       for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI)
         for (unsigned ID : OpenRanges.getVarLocs())
@@ -362,6 +399,91 @@ void LiveDebugValues::transferRegisterDef(MachineInstr &MI,
   OpenRanges.erase(KillSet, VarLocIDs);
 }
 
+/// Decide if @MI is a spill instruction and return true if it is; on success
+/// @Reg holds the spilled register. We use 2 criteria to make this decision:
+/// - Is this instruction a store to a spill slot?
+/// - Is there a register operand that is both used and killed?
+/// TODO: Store optimization can fold spills into other stores (including
+/// other spills). We do not handle this yet (more than one memory operand).
+bool LiveDebugValues::isSpillInstruction(const MachineInstr &MI,
+                                         MachineFunction *MF, unsigned &Reg) {
+  const MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+  int FI;
+  const MachineMemOperand *MMO;
+
+  // TODO: Handle multiple stores folded into one.
+  if (!MI.hasOneMemOperand())
+    return false;
+
+  // To identify a spill instruction, use the same criteria as in AsmPrinter.
+  if (!((TII->isStoreToStackSlotPostFE(MI, FI) ||
+         TII->hasStoreToStackSlot(MI, MMO, FI)) &&
+        FrameInfo.isSpillSlotObjectIndex(FI)))
+    return false;
+
+  // In a spill instruction generated by the InlineSpiller the spilled register
+  // has its kill flag set. Return false if we don't find such a register.
+  Reg = 0;
+  for (const MachineOperand &MO : MI.operands()) {
+    if (MO.isReg() && MO.isUse() && MO.isKill()) {
+      Reg = MO.getReg();
+      break;
+    }
+  }
+  return Reg != 0;
+}
+
+/// A spilled register may indicate that we have to end the current range of
+/// a variable and create a new one for the spill location.
+/// We don't want to insert any instructions in transfer(), so we just create
+/// the DBG_VALUE without inserting it and keep track of it in @Spills.
+/// It will be inserted into the BB when we're done iterating over the
+/// instructions.
+void LiveDebugValues::transferSpillInst(MachineInstr &MI,
+                                        OpenRangesSet &OpenRanges,
+                                        VarLocMap &VarLocIDs,
+                                        SpillMap &Spills) {
+  unsigned Reg;
+  MachineFunction *MF = MI.getParent()->getParent();
+  if (!isSpillInstruction(MI, MF, Reg))
+    return;
+
+  // Check if the register is the location of a debug value (first match only).
+  for (unsigned ID : OpenRanges.getVarLocs()) {
+    if (VarLocIDs[ID].isDescribedByReg() == Reg) {
+      DEBUG(dbgs() << "Spilling Register " << PrintReg(Reg, TRI) << '('
+                   << VarLocIDs[ID].Var.getVar()->getName() << ")\n");
+
+      // Create a DBG_VALUE instruction to describe the Var in its spilled
+      // location, but don't insert it yet to avoid invalidating the
+      // iterator in our caller.
+      unsigned SpillBase;
+      int SpillOffset = extractSpillBaseRegAndOffset(MI, SpillBase);
+      const MachineInstr *DMI = &VarLocIDs[ID].MI;
+      MachineInstr *SpDMI =
+          BuildMI(*MF, DMI->getDebugLoc(), DMI->getDesc(), true, SpillBase, 0,
+                  DMI->getDebugVariable(), DMI->getDebugExpression());
+      SpDMI->getOperand(1).setImm(SpillOffset);
+      DEBUG(dbgs() << "Creating DBG_VALUE inst for spill: ";
+            SpDMI->print(dbgs(), false, TII));
+
+      // The newly created DBG_VALUE instruction SpDMI must be inserted after
+      // MI. Keep track of the pairing.
+      SpillDebugPair MIP = {&MI, SpDMI};
+      Spills.push_back(MIP);
+
+      // End all previous ranges of Var.
+      OpenRanges.erase(VarLocIDs[ID].Var);
+
+      // Add the VarLoc to OpenRanges.
+      VarLoc VL(*SpDMI, LS);
+      unsigned SpillLocID = VarLocIDs.insert(VL);
+      OpenRanges.insert(SpillLocID, VL.Var);
+      return;
+    }
+  }
+}
+
 /// Terminate all open ranges at the end of the current basic block.
 bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI,
                                              OpenRangesSet &OpenRanges,
@@ -387,10 +509,13 @@ bool LiveDebugValues::transferTerminatorInst(MachineInstr &MI,
 
 /// This routine creates OpenRanges and OutLocs.
 bool LiveDebugValues::transfer(MachineInstr &MI, OpenRangesSet &OpenRanges,
-                               VarLocInMBB &OutLocs, VarLocMap &VarLocIDs) {
+                               VarLocInMBB &OutLocs, VarLocMap &VarLocIDs,
+                               SpillMap &Spills, bool transferSpills) {
   bool Changed = false;
   transferDebugValue(MI, OpenRanges, VarLocIDs);
   transferRegisterDef(MI, OpenRanges, VarLocIDs);
+  if (transferSpills)
+    transferSpillInst(MI, OpenRanges, VarLocIDs, Spills);
   Changed = transferTerminatorInst(MI, OpenRanges, OutLocs, VarLocIDs);
   return Changed;
 }
@@ -479,10 +604,11 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
   bool OLChanged = false;
   bool MBBJoined = false;
 
-  VarLocMap VarLocIDs;   // Map VarLoc<>unique ID for use in bitvectors.
+  VarLocMap VarLocIDs;      // Map VarLoc<>unique ID for use in bitvectors.
   OpenRangesSet OpenRanges; // Ranges that are open until end of bb.
-  VarLocInMBB OutLocs;   // Ranges that exist beyond bb.
-  VarLocInMBB InLocs;    // Ranges that are incoming after joining.
+  VarLocInMBB OutLocs;      // Ranges that exist beyond bb.
+  VarLocInMBB InLocs;       // Ranges that are incoming after joining.
+  SpillMap Spills;          // DBG_VALUEs associated with spills.
 
   DenseMap<unsigned int, MachineBasicBlock *> OrderToBB;
   DenseMap<MachineBasicBlock *, unsigned int> BBToOrder;
@@ -494,9 +620,14 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
       Pending;
 
   // Initialize every mbb with OutLocs.
+  // We are not looking at any spill instructions during the initial pass
+  // over the BBs. The LiveDebugVariables pass has already created DBG_VALUE
+  // instructions for spills of registers that are known to be user variables
+  // within the BB in which the spill occurs.
   for (auto &MBB : MF)
     for (auto &MI : MBB)
-      transfer(MI, OpenRanges, OutLocs, VarLocIDs);
+      transfer(MI, OpenRanges, OutLocs, VarLocIDs, Spills,
+               /*transferSpills=*/false);
 
   DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "OutLocs after initialization",
                          dbgs()));
@@ -528,8 +659,18 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
       if (MBBJoined) {
         MBBJoined = false;
         Changed = true;
+        // Now that we have started to extend ranges across BBs we need to
+        // examine spill instructions to see whether they spill registers that
+        // correspond to user variables.
         for (auto &MI : *MBB)
-          OLChanged |= transfer(MI, OpenRanges, OutLocs, VarLocIDs);
+          OLChanged |= transfer(MI, OpenRanges, OutLocs, VarLocIDs, Spills,
+                                /*transferSpills=*/true);
+
+        // Add any DBG_VALUE instructions necessitated by spills.
+        for (auto &SP : Spills)
+          MBB->insertAfter(MachineBasicBlock::iterator(*SP.SpillInst),
+                           SP.DebugInst);
+        Spills.clear();
 
         DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs,
                                "OutLocs after propagating", dbgs()));
@@ -563,6 +704,7 @@ bool LiveDebugValues::runOnMachineFunction(MachineFunction &MF) {
 
   TRI = MF.getSubtarget().getRegisterInfo();
   TII = MF.getSubtarget().getInstrInfo();
+  TFI = MF.getSubtarget().getFrameLowering();
   LS.initialize(MF);
 
   bool Changed = ExtendRanges(MF);
diff --git a/lib/CodeGen/LiveInterval.cpp b/lib/CodeGen/LiveInterval.cpp
index 124e801359305f7bde5679978bfe41d51766a701..9ef9f238fdcea475ddc152dcb93a019226d939ee 100644
--- a/lib/CodeGen/LiveInterval.cpp
+++ b/lib/CodeGen/LiveInterval.cpp
@@ -863,6 +863,37 @@ void LiveInterval::clearSubRanges() {
   SubRanges = nullptr;
 }
 
+void LiveInterval::refineSubRanges(BumpPtrAllocator &Allocator,
+    LaneBitmask LaneMask, std::function<void(LiveInterval::SubRange&)> Apply) {
+  // Lanes of LaneMask not yet covered by a subrange passed to Apply.
+  LaneBitmask ToApply = LaneMask;
+  for (SubRange &SR : subranges()) {
+    LaneBitmask SRMask = SR.LaneMask;
+    LaneBitmask Matching = SRMask & LaneMask;
+    if (Matching.none())
+      continue;
+
+    SubRange *MatchingRange;
+    if (SRMask == Matching) {
+      // The subrange fits (it does not cover bits outside \p LaneMask).
+      MatchingRange = &SR;
+    } else {
+      // We have to split the subrange into a matching and non-matching part.
+      // Reduce the lanemask of the existing subrange to the non-matching part.
+      SR.LaneMask = SRMask & ~Matching;
+      // Create a new subrange for the matching part.
+      MatchingRange = createSubRangeFrom(Allocator, Matching, SR);
+    }
+    Apply(*MatchingRange);
+    ToApply &= ~Matching;
+  }
+  // Create a new subrange if there are uncovered bits left.
+  if (ToApply.any()) {
+    SubRange *NewRange = createSubRange(Allocator, ToApply);
+    Apply(*NewRange);
+  }
+}
+
 unsigned LiveInterval::getSize() const {
   unsigned Sum = 0;
   for (const Segment &S : segments)
diff --git a/lib/CodeGen/LiveIntervalAnalysis.cpp b/lib/CodeGen/LiveIntervalAnalysis.cpp
index 2720609ab758a9a2d6186cee88d5fee0f7afe276..3f5b8e19d1f0cb7f710562e1a80b13f4186bdb37 100644
--- a/lib/CodeGen/LiveIntervalAnalysis.cpp
+++ b/lib/CodeGen/LiveIntervalAnalysis.cpp
@@ -1232,10 +1232,12 @@ private:
           LiveRange::iterator NewIdxIn = NewIdxOut;
           assert(NewIdxIn == LR.find(NewIdx.getBaseIndex()));
           const SlotIndex SplitPos = NewIdxDef;
+          OldIdxVNI = OldIdxIn->valno;
 
           // Merge the OldIdxIn and OldIdxOut segments into OldIdxOut.
+          OldIdxOut->valno->def = OldIdxIn->start;
           *OldIdxOut = LiveRange::Segment(OldIdxIn->start, OldIdxOut->end,
-                                          OldIdxIn->valno);
+                                          OldIdxOut->valno);
           // OldIdxIn and OldIdxVNI are now undef and can be overridden.
           // We Slide [NewIdxIn, OldIdxIn) down one position.
           //    |- X0/NewIdxIn -| ... |- Xn-1 -||- Xn/OldIdxIn -||- OldIdxOut -|
diff --git a/lib/CodeGen/LiveIntervalUnion.cpp b/lib/CodeGen/LiveIntervalUnion.cpp
index fc2f233f6d687de87e08be5652375ff7ecba193e..b4aa0dc326a58452343089924d7381a9ce68ac73 100644
--- a/lib/CodeGen/LiveIntervalUnion.cpp
+++ b/lib/CodeGen/LiveIntervalUnion.cpp
@@ -1,4 +1,4 @@
-//===-- LiveIntervalUnion.cpp - Live interval union data structure --------===//
+//===- LiveIntervalUnion.cpp - Live interval union data structure ---------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -13,19 +13,19 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/LiveIntervalUnion.h"
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SparseBitVector.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalUnion.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetRegisterInfo.h"
-#include <algorithm>
+#include <cassert>
+#include <cstdlib>
 
 using namespace llvm;
 
 #define DEBUG_TYPE "regalloc"
 
-
 // Merge a LiveInterval's segments. Guarantee no overlaps.
 void LiveIntervalUnion::unify(LiveInterval &VirtReg, const LiveRange &Range) {
   if (Range.empty())
@@ -64,7 +64,7 @@ void LiveIntervalUnion::extract(LiveInterval &VirtReg, const LiveRange &Range) {
   LiveRange::const_iterator RegEnd = Range.end();
   SegmentIter SegPos = Segments.find(RegPos->start);
 
-  for (;;) {
+  while (true) {
     assert(SegPos.value() == &VirtReg && "Inconsistent LiveInterval");
     SegPos.erase();
     if (!SegPos.valid())
@@ -126,25 +126,24 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) {
     CheckedFirstInterference = true;
 
     // Quickly skip interference check for empty sets.
-    if (VirtReg->empty() || LiveUnion->empty()) {
+    if (LR->empty() || LiveUnion->empty()) {
       SeenAllInterferences = true;
       return 0;
     }
 
-    // In most cases, the union will start before VirtReg.
-    VirtRegI = VirtReg->begin();
+    // In most cases, the union will start before LR.
+    LRI = LR->begin();
     LiveUnionI.setMap(LiveUnion->getMap());
-    LiveUnionI.find(VirtRegI->start);
+    LiveUnionI.find(LRI->start);
   }
 
-  LiveInterval::iterator VirtRegEnd = VirtReg->end();
+  LiveRange::const_iterator LREnd = LR->end();
   LiveInterval *RecentReg = nullptr;
   while (LiveUnionI.valid()) {
-    assert(VirtRegI != VirtRegEnd && "Reached end of VirtReg");
+    assert(LRI != LREnd && "Reached end of LR");
 
     // Check for overlapping interference.
-    while (VirtRegI->start < LiveUnionI.stop() &&
-           VirtRegI->end > LiveUnionI.start()) {
+    while (LRI->start < LiveUnionI.stop() && LRI->end > LiveUnionI.start()) {
       // This is an overlap, record the interfering register.
       LiveInterval *VReg = LiveUnionI.value();
       if (VReg != RecentReg && !isSeenInterference(VReg)) {
@@ -161,20 +160,20 @@ collectInterferingVRegs(unsigned MaxInterferingRegs) {
     }
 
     // The iterators are now not overlapping, LiveUnionI has been advanced
-    // beyond VirtRegI.
-    assert(VirtRegI->end <= LiveUnionI.start() && "Expected non-overlap");
+    // beyond LRI.
+    assert(LRI->end <= LiveUnionI.start() && "Expected non-overlap");
 
     // Advance the iterator that ends first.
-    VirtRegI = VirtReg->advanceTo(VirtRegI, LiveUnionI.start());
-    if (VirtRegI == VirtRegEnd)
+    LRI = LR->advanceTo(LRI, LiveUnionI.start());
+    if (LRI == LREnd)
       break;
 
     // Detect overlap, handle above.
-    if (VirtRegI->start < LiveUnionI.stop())
+    if (LRI->start < LiveUnionI.stop())
       continue;
 
     // Still not overlapping. Catch up LiveUnionI.
-    LiveUnionI.advanceTo(VirtRegI->start);
+    LiveUnionI.advanceTo(LRI->start);
   }
   SeenAllInterferences = true;
   return InterferingVRegs.size();
diff --git a/lib/CodeGen/LivePhysRegs.cpp b/lib/CodeGen/LivePhysRegs.cpp
index a3bed3b76c02e3e1e4a9338f824c7f0b8a45f012..9f7d7cf54848077557d368cc3a3c99025e0dd0c9 100644
--- a/lib/CodeGen/LivePhysRegs.cpp
+++ b/lib/CodeGen/LivePhysRegs.cpp
@@ -160,7 +160,9 @@ void LivePhysRegs::addBlockLiveIns(const MachineBasicBlock &MBB) {
 static void addPristines(LivePhysRegs &LiveRegs, const MachineFunction &MF,
                          const MachineFrameInfo &MFI,
                          const TargetRegisterInfo &TRI) {
-  for (const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR)
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR;
+       ++CSR)
     LiveRegs.addReg(*CSR);
   for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo())
     LiveRegs.removeReg(Info.getReg());
@@ -179,7 +181,8 @@ void LivePhysRegs::addLiveOuts(const MachineBasicBlock &MBB) {
     if (MBB.isReturnBlock()) {
       // The return block has no successors whose live-ins we could merge
       // below. So instead we add the callee saved registers manually.
-      for (const MCPhysReg *I = TRI->getCalleeSavedRegs(&MF); *I; ++I)
+      const MachineRegisterInfo &MRI = MF.getRegInfo();
+      for (const MCPhysReg *I = MRI.getCalleeSavedRegs(); *I; ++I)
         addReg(*I);
     } else {
       addPristines(*this, MF, MFI, *TRI);
diff --git a/lib/CodeGen/LiveRangeCalc.cpp b/lib/CodeGen/LiveRangeCalc.cpp
index e2ae7c0a0c946abef2cc8c389a2bc9264f2899f9..398066bf8903e7afeb0a0a8f0f03bf97ddcd2eb3 100644
--- a/lib/CodeGen/LiveRangeCalc.cpp
+++ b/lib/CodeGen/LiveRangeCalc.cpp
@@ -75,34 +75,11 @@ void LiveRangeCalc::calculate(LiveInterval &LI, bool TrackSubRegs) {
         LI.createSubRangeFrom(*Alloc, ClassMask, LI);
       }
 
-      LaneBitmask Mask = SubMask;
-      for (LiveInterval::SubRange &S : LI.subranges()) {
-        // A Mask for subregs common to the existing subrange and current def.
-        LaneBitmask Common = S.LaneMask & Mask;
-        if (Common.none())
-          continue;
-        LiveInterval::SubRange *CommonRange;
-        // A Mask for subregs covered by the subrange but not the current def.
-        LaneBitmask RM = S.LaneMask & ~Mask;
-        if (RM.any()) {
-          // Split the subrange S into two parts: one covered by the current
-          // def (CommonRange), and the one not affected by it (updated S).
-          S.LaneMask = RM;
-          CommonRange = LI.createSubRangeFrom(*Alloc, Common, S);
-        } else {
-          assert(Common == S.LaneMask);
-          CommonRange = &S;
-        }
+      LI.refineSubRanges(*Alloc, SubMask,
+          [&MO, this](LiveInterval::SubRange &SR) {
         if (MO.isDef())
-          createDeadDef(*Indexes, *Alloc, *CommonRange, MO);
-        Mask &= ~Common;
-      }
-      // Create a new SubRange for subregs we did not cover yet.
-      if (Mask.any()) {
-        LiveInterval::SubRange *NewRange = LI.createSubRange(*Alloc, Mask);
-        if (MO.isDef())
-          createDeadDef(*Indexes, *Alloc, *NewRange, MO);
-      }
+          createDeadDef(*Indexes, *Alloc, SR, MO);
+      });
     }
 
     // Create the def in the main liverange. We do not have to do this if
diff --git a/lib/CodeGen/LiveRangeEdit.cpp b/lib/CodeGen/LiveRangeEdit.cpp
index 7f1c69c0b4a2a19a7e9dffe6c5e2c107fb6c5126..92cca1a54951e4e6f31c59ce2147580ac38aefcb 100644
--- a/lib/CodeGen/LiveRangeEdit.cpp
+++ b/lib/CodeGen/LiveRangeEdit.cpp
@@ -37,6 +37,8 @@ LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(unsigned OldReg) {
     VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
   }
   LiveInterval &LI = LIS.createEmptyInterval(VReg);
+  if (Parent && !Parent->isSpillable())
+    LI.markNotSpillable();
   // Create empty subranges if the OldReg's interval has them. Do not create
   // the main range here---it will be constructed later after the subranges
   // have been finalized.
@@ -52,6 +54,14 @@ unsigned LiveRangeEdit::createFrom(unsigned OldReg) {
   if (VRM) {
     VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
   }
+  // FIXME: Getting the interval here actually computes it.
+  // In theory, this may not be what we want, but in practice
+  // the createEmptyIntervalFrom API is used when this is not
+  // the case. Generally speaking we just want to annotate the
+  // LiveInterval when it gets created but we cannot do that at
+  // the moment.
+  if (Parent && !Parent->isSpillable())
+    LIS.getInterval(VReg).markNotSpillable();
   return VReg;
 }
 
@@ -442,9 +452,6 @@ LiveRangeEdit::MRI_NoteNewVirtualRegister(unsigned VReg)
   if (VRM)
     VRM->grow();
 
-  if (Parent && !Parent->isSpillable())
-    LIS.getInterval(VReg).markNotSpillable();
-
   NewRegs.push_back(VReg);
 }
 
diff --git a/lib/CodeGen/LiveRegMatrix.cpp b/lib/CodeGen/LiveRegMatrix.cpp
index 7a51386aa9caaaa7cc22d3fb3aadc8cced7d92b4..882de1a3fad966d50e2ab49d68df480a73a9cedb 100644
--- a/lib/CodeGen/LiveRegMatrix.cpp
+++ b/lib/CodeGen/LiveRegMatrix.cpp
@@ -1,4 +1,4 @@
-//===-- LiveRegMatrix.cpp - Track register interference -------------------===//
+//===- LiveRegMatrix.cpp - Track register interference --------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,15 +11,22 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/LiveRegMatrix.h"
 #include "RegisterCoalescer.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/LiveRegMatrix.h"
 #include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/CodeGen/LiveIntervalUnion.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/Pass.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
 
 using namespace llvm;
 
@@ -36,8 +43,7 @@ INITIALIZE_PASS_DEPENDENCY(VirtRegMap)
 INITIALIZE_PASS_END(LiveRegMatrix, "liveregmatrix",
                     "Live Register Matrix", false, false)
 
-LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID),
-  UserTag(0), RegMaskTag(0), RegMaskVirtReg(0) {}
+LiveRegMatrix::LiveRegMatrix() : MachineFunctionPass(ID) {}
 
 void LiveRegMatrix::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
@@ -169,10 +175,10 @@ bool LiveRegMatrix::checkRegUnitInterference(LiveInterval &VirtReg,
   return Result;
 }
 
-LiveIntervalUnion::Query &LiveRegMatrix::query(LiveInterval &VirtReg,
+LiveIntervalUnion::Query &LiveRegMatrix::query(const LiveRange &LR,
                                                unsigned RegUnit) {
   LiveIntervalUnion::Query &Q = Queries[RegUnit];
-  Q.init(UserTag, &VirtReg, &Matrix[RegUnit]);
+  Q.init(UserTag, LR, Matrix[RegUnit]);
   return Q;
 }
 
@@ -190,9 +196,12 @@ LiveRegMatrix::checkInterference(LiveInterval &VirtReg, unsigned PhysReg) {
     return IK_RegUnit;
 
   // Check the matrix for virtual register interference.
-  for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units)
-    if (query(VirtReg, *Units).checkInterference())
-      return IK_VirtReg;
+  bool Interference = foreachUnit(TRI, VirtReg, PhysReg,
+                                  [&](unsigned Unit, const LiveRange &LR) {
+    return query(LR, Unit).checkInterference();
+  });
+  if (Interference)
+    return IK_VirtReg;
 
   return IK_Free;
 }
diff --git a/lib/CodeGen/LiveRegUnits.cpp b/lib/CodeGen/LiveRegUnits.cpp
index 0a10b4e6265c209bbbe58c701906d9e8a0635ad2..dff555f49565e9071e9e2a1d50b1d59b2d1dd3f7 100644
--- a/lib/CodeGen/LiveRegUnits.cpp
+++ b/lib/CodeGen/LiveRegUnits.cpp
@@ -1,4 +1,4 @@
-//===--- LiveRegUnits.cpp - Register Unit Set -----------------------------===//
+//===- LiveRegUnits.cpp - Register Unit Set -------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,9 +12,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/LiveRegUnits.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
 using namespace llvm;
 
 void LiveRegUnits::removeRegsNotPreserved(const uint32_t *RegMask) {
diff --git a/lib/CodeGen/LowLevelType.cpp b/lib/CodeGen/LowLevelType.cpp
index d74b7306e0f43d4216f3385e172cb24abc466e68..c4b9068fa905ab29bc2d1a02381367d2f1ad1a90 100644
--- a/lib/CodeGen/LowLevelType.cpp
+++ b/lib/CodeGen/LowLevelType.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/CodeGen/GlobalISel/LowLevelType.cpp --------------------------===//
+//===-- llvm/CodeGen/LowLevelType.cpp -------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -18,54 +18,21 @@
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
-LLT::LLT(Type &Ty, const DataLayout &DL) {
+LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) {
   if (auto VTy = dyn_cast<VectorType>(&Ty)) {
-    SizeInBits = VTy->getElementType()->getPrimitiveSizeInBits();
-    ElementsOrAddrSpace = VTy->getNumElements();
-    Kind = ElementsOrAddrSpace == 1 ? Scalar : Vector;
+    auto NumElements = VTy->getNumElements();
+    auto ScalarSizeInBits = VTy->getElementType()->getPrimitiveSizeInBits();
+    if (NumElements == 1)
+      return LLT::scalar(ScalarSizeInBits);
+    return LLT::vector(NumElements, ScalarSizeInBits);
   } else if (auto PTy = dyn_cast<PointerType>(&Ty)) {
-    Kind = Pointer;
-    SizeInBits = DL.getTypeSizeInBits(&Ty);
-    ElementsOrAddrSpace = PTy->getAddressSpace();
+    return LLT::pointer(PTy->getAddressSpace(), DL.getTypeSizeInBits(&Ty));
   } else if (Ty.isSized()) {
     // Aggregates are no different from real scalars as far as GlobalISel is
     // concerned.
-    Kind = Scalar;
-    SizeInBits = DL.getTypeSizeInBits(&Ty);
-    ElementsOrAddrSpace = 1;
+    auto SizeInBits = DL.getTypeSizeInBits(&Ty);
     assert(SizeInBits != 0 && "invalid zero-sized type");
-  } else {
-    Kind = Invalid;
-    SizeInBits = ElementsOrAddrSpace = 0;
+    return LLT::scalar(SizeInBits);
   }
-}
-
-LLT::LLT(MVT VT) {
-  if (VT.isVector()) {
-    SizeInBits = VT.getVectorElementType().getSizeInBits();
-    ElementsOrAddrSpace = VT.getVectorNumElements();
-    Kind = ElementsOrAddrSpace == 1 ? Scalar : Vector;
-  } else if (VT.isValid()) {
-    // Aggregates are no different from real scalars as far as GlobalISel is
-    // concerned.
-    Kind = Scalar;
-    SizeInBits = VT.getSizeInBits();
-    ElementsOrAddrSpace = 1;
-    assert(SizeInBits != 0 && "invalid zero-sized type");
-  } else {
-    Kind = Invalid;
-    SizeInBits = ElementsOrAddrSpace = 0;
-  }
-}
-
-void LLT::print(raw_ostream &OS) const {
-  if (isVector())
-    OS << "<" << ElementsOrAddrSpace << " x s" << SizeInBits << ">";
-  else if (isPointer())
-    OS << "p" << getAddressSpace();
-  else if (isValid()) {
-    assert(isScalar() && "unexpected type");
-    OS << "s" << getScalarSizeInBits();
-  } else
-    llvm_unreachable("trying to print an invalid type");
+  return LLT();
 }
diff --git a/lib/CodeGen/MIRParser/MIParser.cpp b/lib/CodeGen/MIRParser/MIParser.cpp
index e302de26d1fb7897c29e09df76cbdab4a81d594c..cac22af32956ebe44b9d0a23bcc06ba6a015879f 100644
--- a/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/lib/CodeGen/MIRParser/MIParser.cpp
@@ -176,6 +176,7 @@ public:
   bool parseIntrinsicOperand(MachineOperand &Dest);
   bool parsePredicateOperand(MachineOperand &Dest);
   bool parseTargetIndexOperand(MachineOperand &Dest);
+  bool parseCustomRegisterMaskOperand(MachineOperand &Dest);
   bool parseLiveoutRegisterMaskOperand(MachineOperand &Dest);
   bool parseMachineOperand(MachineOperand &Dest,
                            Optional<unsigned> &TiedDefIdx);
@@ -188,6 +189,7 @@ public:
   bool parseMemoryOperandFlag(MachineMemOperand::Flags &Flags);
   bool parseMemoryPseudoSourceValue(const PseudoSourceValue *&PSV);
   bool parseMachinePointerInfo(MachinePointerInfo &Dest);
+  bool parseOptionalAtomicOrdering(AtomicOrdering &Order);
   bool parseMachineMemoryOperand(MachineMemOperand *&Dest);
 
 private:
@@ -1669,6 +1671,35 @@ bool MIParser::parseTargetIndexOperand(MachineOperand &Dest) {
   return false;
 }
 
+bool MIParser::parseCustomRegisterMaskOperand(MachineOperand &Dest) {
+  assert(Token.stringValue() == "CustomRegMask" && "Expected a custom RegMask");
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  assert(TRI && "Expected target register info");
+  lex();
+  if (expectAndConsume(MIToken::lparen))
+    return true;
+  // Parse a comma-separated list of named registers, setting one bit for each.
+  uint32_t *Mask = MF.allocateRegisterMask(TRI->getNumRegs());
+  while (true) {
+    if (Token.isNot(MIToken::NamedRegister))
+      return error("expected a named register");
+    unsigned Reg;
+    if (parseNamedRegister(Reg))
+      return true;
+    lex();
+    Mask[Reg / 32] |= 1U << (Reg % 32);
+    // TODO: Report an error if the same register is used more than once.
+    if (Token.isNot(MIToken::comma))
+      break;
+    lex();
+  }
+
+  if (expectAndConsume(MIToken::rparen))
+    return true;
+  Dest = MachineOperand::CreateRegMask(Mask);
+  return false;
+}
+
 bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) {
   assert(Token.is(MIToken::kw_liveout));
   const auto *TRI = MF.getSubtarget().getRegisterInfo();
@@ -1766,8 +1797,8 @@ bool MIParser::parseMachineOperand(MachineOperand &Dest,
       Dest = MachineOperand::CreateRegMask(RegMask);
       lex();
       break;
-    }
-    LLVM_FALLTHROUGH;
+    } else
+      return parseCustomRegisterMaskOperand(Dest);
   default:
     // FIXME: Parse the MCSymbol machine operand.
     return error("expected a machine operand");
@@ -2040,6 +2071,28 @@ bool MIParser::parseMachinePointerInfo(MachinePointerInfo &Dest) {
   return false;
 }
 
+bool MIParser::parseOptionalAtomicOrdering(AtomicOrdering &Order) {
+  Order = AtomicOrdering::NotAtomic;
+  if (Token.isNot(MIToken::Identifier))
+    return false;
+  // An identifier here must spell one of the ordering keywords below.
+  Order = StringSwitch<AtomicOrdering>(Token.stringValue())
+              .Case("unordered", AtomicOrdering::Unordered)
+              .Case("monotonic", AtomicOrdering::Monotonic)
+              .Case("acquire", AtomicOrdering::Acquire)
+              .Case("release", AtomicOrdering::Release)
+              .Case("acq_rel", AtomicOrdering::AcquireRelease)
+              .Case("seq_cst", AtomicOrdering::SequentiallyConsistent)
+              .Default(AtomicOrdering::NotAtomic);
+
+  if (Order != AtomicOrdering::NotAtomic) {
+    lex();
+    return false;
+  }
+
+  return error("expected an atomic scope, ordering or a size integer literal");
+}
+
 bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
   if (expectAndConsume(MIToken::lparen))
     return true;
@@ -2057,6 +2110,21 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
     Flags |= MachineMemOperand::MOStore;
   lex();
 
+  // Optional "singlethread" scope.
+  SynchronizationScope Scope = SynchronizationScope::CrossThread;
+  if (Token.is(MIToken::Identifier) && Token.stringValue() == "singlethread") {
+    Scope = SynchronizationScope::SingleThread;
+    lex();
+  }
+
+  // Up to two atomic orderings (cmpxchg provides guarantees on failure).
+  AtomicOrdering Order, FailureOrder;
+  if (parseOptionalAtomicOrdering(Order))
+    return true;
+
+  if (parseOptionalAtomicOrdering(FailureOrder))
+    return true;
+
   if (Token.isNot(MIToken::IntegerLiteral))
     return error("expected the size integer literal after memory operation");
   uint64_t Size;
@@ -2111,8 +2179,8 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
   }
   if (expectAndConsume(MIToken::rparen))
     return true;
-  Dest =
-      MF.getMachineMemOperand(Ptr, Flags, Size, BaseAlignment, AAInfo, Range);
+  Dest = MF.getMachineMemOperand(Ptr, Flags, Size, BaseAlignment, AAInfo, Range,
+                                 Scope, Order, FailureOrder);
   return false;
 }
 
diff --git a/lib/CodeGen/MIRParser/MIRParser.cpp b/lib/CodeGen/MIRParser/MIRParser.cpp
index 0c694c465b6ce66aac057f084b5781805e0188da..a2773cccc5dbd9859f879e75660ea0023ca9b44d 100644
--- a/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -332,6 +332,8 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) {
     MF.setAlignment(YamlMF.Alignment);
   MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice);
 
+  if (YamlMF.NoVRegs)
+    MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
   if (YamlMF.Legalized)
     MF.getProperties().set(MachineFunctionProperties::Property::Legalized);
   if (YamlMF.RegBankSelected)
@@ -365,9 +367,6 @@ bool MIRParserImpl::initializeMachineFunction(MachineFunction &MF) {
   }
   PFS.SM = &SM;
 
-  if (MF.empty())
-    return error(Twine("machine function '") + Twine(MF.getName()) +
-                 "' requires at least one machine basic block in its body");
   // Initialize the frame information after creating all the MBBs so that the
   // MBB references in the frame information can be resolved.
   if (initializeFrameInfo(PFS, YamlMF))
@@ -465,17 +464,19 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS,
     RegInfo.addLiveIn(Reg, VReg);
   }
 
-  // Parse the callee saved register mask.
-  BitVector CalleeSavedRegisterMask(RegInfo.getUsedPhysRegsMask().size());
-  if (!YamlMF.CalleeSavedRegisters)
-    return false;
-  for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) {
-    unsigned Reg = 0;
-    if (parseNamedRegisterReference(PFS, Reg, RegSource.Value, Error))
-      return error(Error, RegSource.SourceRange);
-    CalleeSavedRegisterMask[Reg] = true;
+  // Parse the callee saved registers (Registers that will
+  // be saved for the caller).
+  if (YamlMF.CalleeSavedRegisters) {
+    SmallVector<MCPhysReg, 16> CalleeSavedRegisters;
+    for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) {
+      unsigned Reg = 0;
+      if (parseNamedRegisterReference(PFS, Reg, RegSource.Value, Error))
+        return error(Error, RegSource.SourceRange);
+      CalleeSavedRegisters.push_back(Reg);
+    }
+    RegInfo.setCalleeSavedRegs(CalleeSavedRegisters);
   }
-  RegInfo.setUsedPhysRegMask(CalleeSavedRegisterMask.flip());
+
   return false;
 }
 
@@ -508,14 +509,12 @@ bool MIRParserImpl::setupRegisterInfo(const PerFunctionMIParsingState &PFS,
   }
 
   // Compute MachineRegisterInfo::UsedPhysRegMask
-  if (!YamlMF.CalleeSavedRegisters) {
-    for (const MachineBasicBlock &MBB : MF) {
-      for (const MachineInstr &MI : MBB) {
-        for (const MachineOperand &MO : MI.operands()) {
-          if (!MO.isRegMask())
-            continue;
-          MRI.addPhysRegsUsedFromRegMask(MO.getRegMask());
-        }
+  for (const MachineBasicBlock &MBB : MF) {
+    for (const MachineInstr &MI : MBB) {
+      for (const MachineOperand &MO : MI.operands()) {
+        if (!MO.isRegMask())
+          continue;
+        MRI.addPhysRegsUsedFromRegMask(MO.getRegMask());
       }
     }
   }
diff --git a/lib/CodeGen/MIRPrinter.cpp b/lib/CodeGen/MIRPrinter.cpp
index db87092177cad9a1bd0164a63ca3596ded81f9a7..6da174a536666ca3da43270d32743f1f0cf61838 100644
--- a/lib/CodeGen/MIRPrinter.cpp
+++ b/lib/CodeGen/MIRPrinter.cpp
@@ -175,6 +175,8 @@ void MIRPrinter::print(const MachineFunction &MF) {
   YamlMF.Alignment = MF.getAlignment();
   YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice();
 
+  YamlMF.NoVRegs = MF.getProperties().hasProperty(
+      MachineFunctionProperties::Property::NoVRegs);
   YamlMF.Legalized = MF.getProperties().hasProperty(
       MachineFunctionProperties::Property::Legalized);
   YamlMF.RegBankSelected = MF.getProperties().hasProperty(
@@ -205,6 +207,25 @@ void MIRPrinter::print(const MachineFunction &MF) {
   Out << YamlMF;
 }
 
+static void printCustomRegMask(const uint32_t *RegMask, raw_ostream &OS,
+                               const TargetRegisterInfo *TRI) {
+  assert(RegMask && "Can't print an empty register mask");
+  OS << StringRef("CustomRegMask(");
+
+  bool IsRegInRegMaskFound = false; // Becomes true after the first print.
+  for (int I = 0, E = TRI->getNumRegs(); I < E; I++) {
+    // Register I is bit (I % 32) of 32-bit word (I / 32) of the mask.
+    if (RegMask[I / 32] & (1u << (I % 32))) {
+      if (IsRegInRegMaskFound)
+        OS << ','; // Separator goes before every register but the first.
+      printReg(I, OS, TRI);
+      IsRegInRegMaskFound = true;
+    }
+  }
+
+  OS << ')';
+}
+
 void MIRPrinter::convert(yaml::MachineFunction &MF,
                          const MachineRegisterInfo &RegInfo,
                          const TargetRegisterInfo *TRI) {
@@ -239,20 +260,18 @@ void MIRPrinter::convert(yaml::MachineFunction &MF,
       printReg(I->second, LiveIn.VirtualRegister, TRI);
     MF.LiveIns.push_back(LiveIn);
   }
-  // The used physical register mask is printed as an inverted callee saved
-  // register mask.
-  const BitVector &UsedPhysRegMask = RegInfo.getUsedPhysRegsMask();
-  if (UsedPhysRegMask.none())
-    return;
-  std::vector<yaml::FlowStringValue> CalleeSavedRegisters;
-  for (unsigned I = 0, E = UsedPhysRegMask.size(); I != E; ++I) {
-    if (!UsedPhysRegMask[I]) {
+
+  // Prints the callee saved registers.
+  if (RegInfo.isUpdatedCSRsInitialized()) {
+    const MCPhysReg *CalleeSavedRegs = RegInfo.getCalleeSavedRegs();
+    std::vector<yaml::FlowStringValue> CalleeSavedRegisters;
+    for (const MCPhysReg *I = CalleeSavedRegs; *I; ++I) {
       yaml::FlowStringValue Reg;
-      printReg(I, Reg, TRI);
+      printReg(*I, Reg, TRI);
       CalleeSavedRegisters.push_back(Reg);
     }
+    MF.CalleeSavedRegisters = CalleeSavedRegisters;
   }
-  MF.CalleeSavedRegisters = CalleeSavedRegisters;
 }
 
 void MIRPrinter::convert(ModuleSlotTracker &MST,
@@ -860,7 +879,7 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
     if (RegMaskInfo != RegisterMaskIds.end())
       OS << StringRef(TRI->getRegMaskNames()[RegMaskInfo->second]).lower();
     else
-      llvm_unreachable("Can't print this machine register mask yet.");
+      printCustomRegMask(Op.getRegMask(), OS, TRI);
     break;
   }
   case MachineOperand::MO_RegisterLiveOut: {
@@ -906,6 +925,9 @@ void MIPrinter::print(const MachineOperand &Op, const TargetRegisterInfo *TRI,
        << CmpInst::getPredicateName(Pred) << ')';
     break;
   }
+  case MachineOperand::MO_Placeholder:
+    OS << "<placeholder>";
+    break;
   }
 }
 
@@ -926,6 +948,15 @@ void MIPrinter::print(const MachineMemOperand &Op) {
     assert(Op.isStore() && "Non load machine operand must be a store");
     OS << "store ";
   }
+
+  if (Op.getSynchScope() == SynchronizationScope::SingleThread)
+    OS << "singlethread ";
+
+  if (Op.getOrdering() != AtomicOrdering::NotAtomic)
+    OS << toIRString(Op.getOrdering()) << ' ';
+  if (Op.getFailureOrdering() != AtomicOrdering::NotAtomic)
+    OS << toIRString(Op.getFailureOrdering()) << ' ';
+
   OS << Op.getSize();
   if (const Value *Val = Op.getValue()) {
     OS << (Op.isLoad() ? " from " : " into ");
diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp
index a7d87799f6796f7796d1f799296b548315a2628d..06112723497b098e50fb58aea89b3e13ef3422cc 100644
--- a/lib/CodeGen/MachineBasicBlock.cpp
+++ b/lib/CodeGen/MachineBasicBlock.cpp
@@ -23,6 +23,7 @@
 #include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/ModuleSlotTracker.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
@@ -231,7 +232,7 @@ StringRef MachineBasicBlock::getName() const {
   if (const BasicBlock *LBB = getBasicBlock())
     return LBB->getName();
   else
-    return "(null)";
+    return StringRef("", 0);
 }
 
 /// Return a hopefully unique identifier for this block.
@@ -423,7 +424,7 @@ void MachineBasicBlock::updateTerminator() {
 
   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
   SmallVector<MachineOperand, 4> Cond;
-  DebugLoc DL;  // FIXME: this is nowhere
+  DebugLoc DL = findBranchDebugLoc();
   bool B = TII->analyzeBranch(*this, TBB, FBB, Cond);
   (void) B;
   assert(!B && "UpdateTerminators requires analyzable predecessors!");
@@ -491,7 +492,7 @@ void MachineBasicBlock::updateTerminator() {
       // FIXME: This does not seem like a reasonable pattern to support, but it
       // has been seen in the wild coming out of degenerate ARM test cases.
       TII->removeBranch(*this);
-  
+
       // Finally update the unconditional successor to be reached via a branch if
       // it would not be reached by fallthrough.
       if (!isLayoutSuccessor(TBB))
@@ -687,16 +688,16 @@ bool MachineBasicBlock::isLayoutSuccessor(const MachineBasicBlock *MBB) const {
   return std::next(I) == MachineFunction::const_iterator(MBB);
 }
 
-bool MachineBasicBlock::canFallThrough() {
+MachineBasicBlock *MachineBasicBlock::getFallThrough() {
   MachineFunction::iterator Fallthrough = getIterator();
   ++Fallthrough;
   // If FallthroughBlock is off the end of the function, it can't fall through.
   if (Fallthrough == getParent()->end())
-    return false;
+    return nullptr;
 
   // If FallthroughBlock isn't a successor, no fallthrough is possible.
   if (!isSuccessor(&*Fallthrough))
-    return false;
+    return nullptr;
 
   // Analyze the branches, if any, at the end of the block.
   MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
@@ -708,25 +709,31 @@ bool MachineBasicBlock::canFallThrough() {
     // is possible. The isPredicated check is needed because this code can be
     // called during IfConversion, where an instruction which is normally a
     // Barrier is predicated and thus no longer an actual control barrier.
-    return empty() || !back().isBarrier() || TII->isPredicated(back());
+    return (empty() || !back().isBarrier() || TII->isPredicated(back()))
+               ? &*Fallthrough
+               : nullptr;
   }
 
   // If there is no branch, control always falls through.
-  if (!TBB) return true;
+  if (!TBB) return &*Fallthrough;
 
   // If there is some explicit branch to the fallthrough block, it can obviously
   // reach, even though the branch should get folded to fall through implicitly.
   if (MachineFunction::iterator(TBB) == Fallthrough ||
       MachineFunction::iterator(FBB) == Fallthrough)
-    return true;
+    return &*Fallthrough;
 
   // If it's an unconditional branch to some block not the fall through, it
   // doesn't fall through.
-  if (Cond.empty()) return false;
+  if (Cond.empty()) return nullptr;
 
   // Otherwise, if it is conditional and has no explicit false block, it falls
   // through.
-  return FBB == nullptr;
+  return (FBB == nullptr) ? &*Fallthrough : nullptr;
+}
+
+bool MachineBasicBlock::canFallThrough() {
+  return getFallThrough() != nullptr;
 }
 
 MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
@@ -1150,6 +1157,24 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) {
   return {};
 }
 
+/// Find and return the merged DebugLoc of the branch instructions of the block.
+/// Return a default-constructed DebugLoc if the block has no branch.
+DebugLoc
+MachineBasicBlock::findBranchDebugLoc() {
+  DebugLoc DL; // Stays default-constructed when no branch is found.
+  auto TI = getFirstTerminator();
+  while (TI != end() && !TI->isBranch()) // Skip to the first branch.
+    ++TI;
+
+  if (TI != end()) {
+    DL = TI->getDebugLoc(); // Seed with the first branch's location.
+    for (++TI ; TI != end() ; ++TI) // Fold in any later branches.
+      if (TI->isBranch())
+        DL = DILocation::getMergedLocation(DL, TI->getDebugLoc());
+  }
+  return DL;
+}
+
 /// Return probability of the edge from this block to MBB.
 BranchProbability
 MachineBasicBlock::getSuccProbability(const_succ_iterator Succ) const {
diff --git a/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index d8ee2438ae2d4eaf39acb920b0f5b3a6ee89ac53..9c7367b4c78020ecb2b6260658c8a67f46d00750 100644
--- a/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -28,7 +28,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "block-freq"
 
-#ifndef NDEBUG
 
 static cl::opt<GVDAGType> ViewMachineBlockFreqPropagationDAG(
     "view-machine-block-freq-propagation-dags", cl::Hidden,
@@ -60,7 +59,11 @@ cl::opt<GVDAGType> ViewBlockLayoutWithBFI(
                           "display a graph using the real "
                           "profile count if available.")));
 
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp:  -view-bfi-func-name=
 extern cl::opt<std::string> ViewBlockFreqFuncName;
+// Command line option to specify hot frequency threshold.
+// Defined in Analysis/BlockFrequencyInfo.cpp:  -view-hot-freq-perc=
 extern cl::opt<unsigned> ViewHotFreqPercent;
 
 static GVDAGType getGVDT() {
@@ -145,7 +148,6 @@ struct DOTGraphTraits<MachineBlockFrequencyInfo *>
 };
 
 } // end namespace llvm
-#endif
 
 INITIALIZE_PASS_BEGIN(MachineBlockFrequencyInfo, "machine-block-freq",
                       "Machine Block Frequency Analysis", true, true)
@@ -170,20 +172,24 @@ void MachineBlockFrequencyInfo::getAnalysisUsage(AnalysisUsage &AU) const {
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
-bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) {
-  MachineBranchProbabilityInfo &MBPI =
-      getAnalysis<MachineBranchProbabilityInfo>();
-  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+void MachineBlockFrequencyInfo::calculate(
+    const MachineFunction &F, const MachineBranchProbabilityInfo &MBPI,
+    const MachineLoopInfo &MLI) {
   if (!MBFI)
     MBFI.reset(new ImplType);
   MBFI->calculate(F, MBPI, MLI);
-#ifndef NDEBUG
   if (ViewMachineBlockFreqPropagationDAG != GVDT_None &&
       (ViewBlockFreqFuncName.empty() ||
        F.getName().equals(ViewBlockFreqFuncName))) {
-    view();
+    view("MachineBlockFrequencyDAGS." + F.getName());
   }
-#endif
+}
+
+bool MachineBlockFrequencyInfo::runOnMachineFunction(MachineFunction &F) {
+  MachineBranchProbabilityInfo &MBPI =
+      getAnalysis<MachineBranchProbabilityInfo>();
+  MachineLoopInfo &MLI = getAnalysis<MachineLoopInfo>();
+  calculate(F, MBPI, MLI);
   return false;
 }
 
@@ -191,15 +197,9 @@ void MachineBlockFrequencyInfo::releaseMemory() { MBFI.reset(); }
 
 /// Pop up a ghostview window with the current block frequency propagation
 /// rendered using dot.
-void MachineBlockFrequencyInfo::view(bool isSimple) const {
-// This code is only for debugging.
-#ifndef NDEBUG
-  ViewGraph(const_cast<MachineBlockFrequencyInfo *>(this),
-            "MachineBlockFrequencyDAGs", isSimple);
-#else
-  errs() << "MachineBlockFrequencyInfo::view is only available in debug builds "
-            "on systems with Graphviz or gv!\n";
-#endif // NDEBUG
+void MachineBlockFrequencyInfo::view(const Twine &Name, bool isSimple) const {
+  // This code is only for debugging.
+  ViewGraph(const_cast<MachineBlockFrequencyInfo *>(this), Name, isSimple);
 }
 
 BlockFrequency
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index 7d57cc0956dba73c1afafb965366e1402a90f192..e23f90be40992d8b7e1fba0dd0f5635a887d58a4 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -36,11 +36,11 @@
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
-#include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachinePostDominators.h"
 #include "llvm/CodeGen/TailDuplicator.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CommandLine.h"
@@ -50,6 +50,8 @@
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include <algorithm>
+#include <functional>
+#include <utility>
 using namespace llvm;
 
 #define DEBUG_TYPE "block-placement"
@@ -83,19 +85,6 @@ static cl::opt<unsigned> ExitBlockBias(
 // Definition:
 // - Outlining: placement of a basic block outside the chain or hot path.
 
-static cl::opt<bool> OutlineOptionalBranches(
-    "outline-optional-branches",
-    cl::desc("Outlining optional branches will place blocks that are optional "
-              "branches, i.e. branches with a common post dominator, outside "
-              "the hot path or chain"),
-    cl::init(false), cl::Hidden);
-
-static cl::opt<unsigned> OutlineOptionalThreshold(
-    "outline-optional-threshold",
-    cl::desc("Don't outline optional branches that are a single block with an "
-             "instruction count below this threshold"),
-    cl::init(4), cl::Hidden);
-
 static cl::opt<unsigned> LoopToColdBlockRatio(
     "loop-to-cold-block-ratio",
     cl::desc("Outline loop blocks from loop chain if (frequency of loop) / "
@@ -137,25 +126,47 @@ BranchFoldPlacement("branch-fold-placement",
               cl::init(true), cl::Hidden);
 
 // Heuristic for tail duplication.
-static cl::opt<unsigned> TailDuplicatePlacementThreshold(
+static cl::opt<unsigned> TailDupPlacementThreshold(
     "tail-dup-placement-threshold",
     cl::desc("Instruction cutoff for tail duplication during layout. "
              "Tail merging during layout is forced to have a threshold "
              "that won't conflict."), cl::init(2),
     cl::Hidden);
 
+// Heuristic for tail duplication.
+static cl::opt<unsigned> TailDupPlacementPenalty(
+    "tail-dup-placement-penalty",
+    cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. "
+             "Copying can increase fallthrough, but it also increases icache "
+             "pressure. This parameter controls the penalty to account for that. "
+             "Percent as integer."),
+    cl::init(2),
+    cl::Hidden);
+
+// Heuristic for triangle chains.
+static cl::opt<unsigned> TriangleChainCount(
+    "triangle-chain-count",
+    cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the "
+             "triangle tail duplication heuristic to kick in. 0 to disable."),
+    cl::init(2),
+    cl::Hidden);
+
 extern cl::opt<unsigned> StaticLikelyProb;
 extern cl::opt<unsigned> ProfileLikelyProb;
 
-#ifndef NDEBUG
+// Internal option used to control BFI display only after MBP pass.
+// Defined in CodeGen/MachineBlockFrequencyInfo.cpp:
+// -view-block-layout-with-bfi=
 extern cl::opt<GVDAGType> ViewBlockLayoutWithBFI;
+
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp:  -view-bfi-func-name=
 extern cl::opt<std::string> ViewBlockFreqFuncName;
-#endif
 
 namespace {
 class BlockChain;
 /// \brief Type for our function-wide basic block -> block chain mapping.
-typedef DenseMap<MachineBasicBlock *, BlockChain *> BlockToChainMapType;
+typedef DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChainMapType;
 }
 
 namespace {
@@ -199,12 +210,15 @@ public:
 
   /// \brief Iterator over blocks within the chain.
   typedef SmallVectorImpl<MachineBasicBlock *>::iterator iterator;
+  typedef SmallVectorImpl<MachineBasicBlock *>::const_iterator const_iterator;
 
   /// \brief Beginning of blocks within the chain.
   iterator begin() { return Blocks.begin(); }
+  const_iterator begin() const { return Blocks.begin(); }
 
   /// \brief End of blocks within the chain.
   iterator end() { return Blocks.end(); }
+  const_iterator end() const { return Blocks.end(); }
 
   bool remove(MachineBasicBlock* BB) {
     for(iterator i = begin(); i != end(); ++i) {
@@ -270,12 +284,28 @@ public:
 namespace {
 class MachineBlockPlacement : public MachineFunctionPass {
   /// \brief A typedef for a block filter set.
-  typedef SmallSetVector<MachineBasicBlock *, 16> BlockFilterSet;
+  typedef SmallSetVector<const MachineBasicBlock *, 16> BlockFilterSet;
+
+  /// Pair struct containing basic block and taildup profitability
+  struct BlockAndTailDupResult {
+    MachineBasicBlock *BB;
+    bool ShouldTailDup;
+  };
+
+  /// Triple struct containing edge weight and the edge.
+  struct WeightedEdge {
+    BlockFrequency Weight;
+    MachineBasicBlock *Src;
+    MachineBasicBlock *Dest;
+  };
 
   /// \brief work lists of blocks that are ready to be laid out
   SmallVector<MachineBasicBlock *, 16> BlockWorkList;
   SmallVector<MachineBasicBlock *, 16> EHPadWorkList;
 
+  /// Edges that have already been computed as optimal.
+  DenseMap<const MachineBasicBlock *, BlockAndTailDupResult> ComputedEdges;
+
   /// \brief Machine Function
   MachineFunction *F;
 
@@ -300,7 +330,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
   const TargetLoweringBase *TLI;
 
   /// \brief A handle to the post dominator tree.
-  MachineDominatorTree *MDT;
+  MachinePostDominatorTree *MPDT;
 
   /// \brief Duplicator used to duplicate tails during placement.
   ///
@@ -309,10 +339,6 @@ class MachineBlockPlacement : public MachineFunctionPass {
   /// must be done inline.
   TailDuplicator TailDup;
 
-  /// \brief A set of blocks that are unavoidably execute, i.e. they dominate
-  /// all terminators of the MachineFunction.
-  SmallPtrSet<MachineBasicBlock *, 4> UnavoidableBlocks;
-
   /// \brief Allocator and owner of BlockChain structures.
   ///
   /// We build BlockChains lazily while processing the loop structure of
@@ -328,7 +354,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
   /// BlockChain it participates in, if any. We use it to, among other things,
   /// allow implicitly defining edges between chains as the existing edges
   /// between basic blocks.
-  DenseMap<MachineBasicBlock *, BlockChain *> BlockToChain;
+  DenseMap<const MachineBasicBlock *, BlockChain *> BlockToChain;
 
 #ifndef NDEBUG
   /// The set of basic blocks that have terminators that cannot be fully
@@ -340,75 +366,107 @@ class MachineBlockPlacement : public MachineFunctionPass {
 
   /// Decrease the UnscheduledPredecessors count for all blocks in chain, and
   /// if the count goes to 0, add them to the appropriate work list.
-  void markChainSuccessors(BlockChain &Chain, MachineBasicBlock *LoopHeaderBB,
-                           const BlockFilterSet *BlockFilter = nullptr);
+  void markChainSuccessors(
+      const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
+      const BlockFilterSet *BlockFilter = nullptr);
 
   /// Decrease the UnscheduledPredecessors count for a single block, and
   /// if the count goes to 0, add them to the appropriate work list.
   void markBlockSuccessors(
-      BlockChain &Chain, MachineBasicBlock *BB, MachineBasicBlock *LoopHeaderBB,
+      const BlockChain &Chain, const MachineBasicBlock *BB,
+      const MachineBasicBlock *LoopHeaderBB,
       const BlockFilterSet *BlockFilter = nullptr);
 
-
   BranchProbability
-  collectViableSuccessors(MachineBasicBlock *BB, BlockChain &Chain,
-                          const BlockFilterSet *BlockFilter,
-                          SmallVector<MachineBasicBlock *, 4> &Successors);
-  bool shouldPredBlockBeOutlined(MachineBasicBlock *BB, MachineBasicBlock *Succ,
-                                 BlockChain &Chain,
-                                 const BlockFilterSet *BlockFilter,
-                                 BranchProbability SuccProb,
-                                 BranchProbability HotProb);
+  collectViableSuccessors(
+      const MachineBasicBlock *BB, const BlockChain &Chain,
+      const BlockFilterSet *BlockFilter,
+      SmallVector<MachineBasicBlock *, 4> &Successors);
+  bool shouldPredBlockBeOutlined(
+      const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+      const BlockChain &Chain, const BlockFilterSet *BlockFilter,
+      BranchProbability SuccProb, BranchProbability HotProb);
   bool repeatedlyTailDuplicateBlock(
       MachineBasicBlock *BB, MachineBasicBlock *&LPred,
-      MachineBasicBlock *LoopHeaderBB,
+      const MachineBasicBlock *LoopHeaderBB,
       BlockChain &Chain, BlockFilterSet *BlockFilter,
       MachineFunction::iterator &PrevUnplacedBlockIt);
-  bool maybeTailDuplicateBlock(MachineBasicBlock *BB, MachineBasicBlock *LPred,
-                               const BlockChain &Chain,
-                               BlockFilterSet *BlockFilter,
-                               MachineFunction::iterator &PrevUnplacedBlockIt,
-                               bool &DuplicatedToPred);
-  bool
-  hasBetterLayoutPredecessor(MachineBasicBlock *BB, MachineBasicBlock *Succ,
-                             BlockChain &SuccChain, BranchProbability SuccProb,
-                             BranchProbability RealSuccProb, BlockChain &Chain,
-                             const BlockFilterSet *BlockFilter);
-  MachineBasicBlock *selectBestSuccessor(MachineBasicBlock *BB,
-                                         BlockChain &Chain,
-                                         const BlockFilterSet *BlockFilter);
-  MachineBasicBlock *
-  selectBestCandidateBlock(BlockChain &Chain,
-                           SmallVectorImpl<MachineBasicBlock *> &WorkList);
-  MachineBasicBlock *
-  getFirstUnplacedBlock(const BlockChain &PlacedChain,
-                        MachineFunction::iterator &PrevUnplacedBlockIt,
-                        const BlockFilterSet *BlockFilter);
+  bool maybeTailDuplicateBlock(
+      MachineBasicBlock *BB, MachineBasicBlock *LPred,
+      BlockChain &Chain, BlockFilterSet *BlockFilter,
+      MachineFunction::iterator &PrevUnplacedBlockIt,
+      bool &DuplicatedToPred);
+  bool hasBetterLayoutPredecessor(
+      const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+      const BlockChain &SuccChain, BranchProbability SuccProb,
+      BranchProbability RealSuccProb, const BlockChain &Chain,
+      const BlockFilterSet *BlockFilter);
+  BlockAndTailDupResult selectBestSuccessor(
+      const MachineBasicBlock *BB, const BlockChain &Chain,
+      const BlockFilterSet *BlockFilter);
+  MachineBasicBlock *selectBestCandidateBlock(
+      const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList);
+  MachineBasicBlock *getFirstUnplacedBlock(
+      const BlockChain &PlacedChain,
+      MachineFunction::iterator &PrevUnplacedBlockIt,
+      const BlockFilterSet *BlockFilter);
 
   /// \brief Add a basic block to the work list if it is appropriate.
   ///
   /// If the optional parameter BlockFilter is provided, only MBB
   /// present in the set will be added to the worklist. If nullptr
   /// is provided, no filtering occurs.
-  void fillWorkLists(MachineBasicBlock *MBB,
+  void fillWorkLists(const MachineBasicBlock *MBB,
                      SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
                      const BlockFilterSet *BlockFilter);
-  void buildChain(MachineBasicBlock *BB, BlockChain &Chain,
+  void buildChain(const MachineBasicBlock *BB, BlockChain &Chain,
                   BlockFilterSet *BlockFilter = nullptr);
-  MachineBasicBlock *findBestLoopTop(MachineLoop &L,
-                                     const BlockFilterSet &LoopBlockSet);
-  MachineBasicBlock *findBestLoopExit(MachineLoop &L,
-                                      const BlockFilterSet &LoopBlockSet);
-  BlockFilterSet collectLoopBlockSet(MachineLoop &L);
-  void buildLoopChains(MachineLoop &L);
-  void rotateLoop(BlockChain &LoopChain, MachineBasicBlock *ExitingBB,
-                  const BlockFilterSet &LoopBlockSet);
-  void rotateLoopWithProfile(BlockChain &LoopChain, MachineLoop &L,
-                             const BlockFilterSet &LoopBlockSet);
-  void collectMustExecuteBBs();
+  MachineBasicBlock *findBestLoopTop(
+      const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
+  MachineBasicBlock *findBestLoopExit(
+      const MachineLoop &L, const BlockFilterSet &LoopBlockSet);
+  BlockFilterSet collectLoopBlockSet(const MachineLoop &L);
+  void buildLoopChains(const MachineLoop &L);
+  void rotateLoop(
+      BlockChain &LoopChain, const MachineBasicBlock *ExitingBB,
+      const BlockFilterSet &LoopBlockSet);
+  void rotateLoopWithProfile(
+      BlockChain &LoopChain, const MachineLoop &L,
+      const BlockFilterSet &LoopBlockSet);
   void buildCFGChains();
   void optimizeBranches();
   void alignBlocks();
+  /// Returns true if a block should be tail-duplicated to increase fallthrough
+  /// opportunities.
+  bool shouldTailDuplicate(MachineBasicBlock *BB);
+  /// Check the edge frequencies to see if tail duplication will increase
+  /// fallthroughs.
+  bool isProfitableToTailDup(
+    const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+    BranchProbability AdjustedSumProb,
+    const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+  /// Check for a trellis layout.
+  bool isTrellis(const MachineBasicBlock *BB,
+                 const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+                 const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+  /// Get the best successor given a trellis layout.
+  BlockAndTailDupResult getBestTrellisSuccessor(
+      const MachineBasicBlock *BB,
+      const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+      BranchProbability AdjustedSumProb, const BlockChain &Chain,
+      const BlockFilterSet *BlockFilter);
+  /// Get the best pair of non-conflicting edges.
+  static std::pair<WeightedEdge, WeightedEdge> getBestNonConflictingEdges(
+      const MachineBasicBlock *BB,
+      MutableArrayRef<SmallVector<WeightedEdge, 8>> Edges);
+  /// Returns true if a block can tail duplicate into all unplaced
+  /// predecessors. Filters based on loop.
+  bool canTailDuplicateUnplacedPreds(
+      const MachineBasicBlock *BB, MachineBasicBlock *Succ,
+      const BlockChain &Chain, const BlockFilterSet *BlockFilter);
+  /// Find chains of triangles to tail-duplicate where a global analysis works,
+  /// but a local analysis would not find them.
+  void precomputeTriangleChains();
 
 public:
   static char ID; // Pass identification, replacement for typeid
@@ -421,7 +479,8 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<MachineBranchProbabilityInfo>();
     AU.addRequired<MachineBlockFrequencyInfo>();
-    AU.addRequired<MachineDominatorTree>();
+    if (TailDupPlacement)
+      AU.addRequired<MachinePostDominatorTree>();
     AU.addRequired<MachineLoopInfo>();
     AU.addRequired<TargetPassConfig>();
     MachineFunctionPass::getAnalysisUsage(AU);
@@ -435,7 +494,7 @@ INITIALIZE_PASS_BEGIN(MachineBlockPlacement, "block-placement",
                       "Branch Probability Basic Block Placement", false, false)
 INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfo)
 INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
-INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement",
                     "Branch Probability Basic Block Placement", false, false)
@@ -444,7 +503,7 @@ INITIALIZE_PASS_END(MachineBlockPlacement, "block-placement",
 /// \brief Helper to print the name of a MBB.
 ///
 /// Only used by debug logging.
-static std::string getBlockName(MachineBasicBlock *BB) {
+static std::string getBlockName(const MachineBasicBlock *BB) {
   std::string Result;
   raw_string_ostream OS(Result);
   OS << "BB#" << BB->getNumber();
@@ -461,7 +520,7 @@ static std::string getBlockName(MachineBasicBlock *BB) {
 /// having one fewer active predecessor. It also adds any successors of this
 /// chain which reach the zero-predecessor state to the appropriate worklist.
 void MachineBlockPlacement::markChainSuccessors(
-    BlockChain &Chain, MachineBasicBlock *LoopHeaderBB,
+    const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB,
     const BlockFilterSet *BlockFilter) {
   // Walk all the blocks in this chain, marking their successors as having
   // a predecessor placed.
@@ -477,8 +536,8 @@ void MachineBlockPlacement::markChainSuccessors(
 /// and was duplicated into the chain end, we need to redo markBlockSuccessors
 /// for just that block.
 void MachineBlockPlacement::markBlockSuccessors(
-    BlockChain &Chain, MachineBasicBlock *MBB, MachineBasicBlock *LoopHeaderBB,
-    const BlockFilterSet *BlockFilter) {
+    const BlockChain &Chain, const MachineBasicBlock *MBB,
+    const MachineBasicBlock *LoopHeaderBB, const BlockFilterSet *BlockFilter) {
   // Add any successors for which this is the only un-placed in-loop
   // predecessor to the worklist as a viable candidate for CFG-neutral
   // placement. No subsequent placement of this block will violate the CFG
@@ -510,7 +569,8 @@ void MachineBlockPlacement::markBlockSuccessors(
 /// the total branch probability of edges from \p BB to those
 /// blocks.
 BranchProbability MachineBlockPlacement::collectViableSuccessors(
-    MachineBasicBlock *BB, BlockChain &Chain, const BlockFilterSet *BlockFilter,
+    const MachineBasicBlock *BB, const BlockChain &Chain,
+    const BlockFilterSet *BlockFilter,
     SmallVector<MachineBasicBlock *, 4> &Successors) {
   // Adjust edge probabilities by excluding edges pointing to blocks that is
   // either not in BlockFilter or is already in the current chain. Consider the
@@ -567,46 +627,570 @@ getAdjustedProbability(BranchProbability OrigProb,
   return SuccProb;
 }
 
-/// When the option OutlineOptionalBranches is on, this method
-/// checks if the fallthrough candidate block \p Succ (of block
-/// \p BB) also has other unscheduled predecessor blocks which
-/// are also successors of \p BB (forming triangular shape CFG).
-/// If none of such predecessors are small, it returns true.
-/// The caller can choose to select \p Succ as the layout successors
-/// so that \p Succ's predecessors (optional branches) can be
-/// outlined.
-/// FIXME: fold this with more general layout cost analysis.
-bool MachineBlockPlacement::shouldPredBlockBeOutlined(
-    MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &Chain,
-    const BlockFilterSet *BlockFilter, BranchProbability SuccProb,
-    BranchProbability HotProb) {
-  if (!OutlineOptionalBranches)
+/// Check if \p BB has exactly the successors in \p Successors.
+static bool
+hasSameSuccessors(MachineBasicBlock &BB,
+                  SmallPtrSetImpl<const MachineBasicBlock *> &Successors) {
+  if (BB.succ_size() != Successors.size())
+    return false;
+  // We don't want to count self-loops: reject when BB itself is in the set.
+  if (Successors.count(&BB))
+    return false;
+  for (MachineBasicBlock *Succ : BB.successors())
+    if (!Successors.count(Succ))
+      return false;
+  return true;
+}
+
+/// Check if a block should be tail duplicated to increase fallthrough
+/// opportunities.
+/// \p BB Block to check. Blocks with exactly one successor are rejected.
+bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) {
+  // Blocks with single successors don't create additional fallthrough
+  // opportunities. Don't duplicate them. TODO: When conditional exits are
+  // analyzable, allow them to be duplicated.
+  bool IsSimple = TailDup.isSimpleBB(BB);
+
+  if (BB->succ_size() == 1)
+    return false;
+  return TailDup.shouldTailDuplicate(IsSimple, *BB);
+}
+
+/// Compare 2 BlockFrequency's with a small penalty for \p A.
+/// In order to be conservative, we apply a X% penalty to account for
+/// increased icache pressure and static heuristics. For small frequencies
+/// we use only the numerators to improve accuracy. For simplicity, we assume
+/// the penalty is less than 100%. X is TailDupPlacementPenalty.
+/// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere.
+static bool greaterWithBias(BlockFrequency A, BlockFrequency B,
+                            uint64_t EntryFreq) {
+  BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
+  BlockFrequency Gain = A - B;
+  return (Gain / ThresholdProb).getFrequency() >= EntryFreq;
+}
+
+/// Check the edge frequencies to see if tail duplication will increase
+/// fallthroughs. It only makes sense to call this function when
+/// \p Succ would not be chosen otherwise. Tail duplication of \p Succ is
+/// always locally profitable if we would have picked \p Succ without
+/// considering duplication.
+bool MachineBlockPlacement::isProfitableToTailDup(
+    const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+    BranchProbability QProb,
+    const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+  // We need to do a probability calculation to make sure this is profitable.
+  // First: does succ have a successor that post-dominates? This affects the
+  // calculation. The 2 relevant cases are:
+  //    BB         BB
+  //    | \Qout    | \Qout
+  //   P|  C       |P C
+  //    =   C'     =   C'
+  //    |  /Qin    |  /Qin
+  //    | /        | /
+  //    Succ       Succ
+  //    / \        | \  V
+  //  U/   =V      |U \
+  //  /     \      =   D
+  //  D      E     |  /
+  //               | /
+  //               |/
+  //               PDom
+  //  '=' : Branch taken for that CFG edge
+  // In the second case, placing Succ while duplicating it into C prevents the
+  // fallthrough of Succ into either D or PDom, because they now have C as an
+  // unplaced predecessor.
+
+  // Start by figuring out which case we fall into.
+  MachineBasicBlock *PDom = nullptr;
+  SmallVector<MachineBasicBlock *, 4> SuccSuccs;
+  // Only scan the relevant successors.
+  auto AdjustedSuccSumProb =
+      collectViableSuccessors(Succ, Chain, BlockFilter, SuccSuccs);
+  BranchProbability PProb = MBPI->getEdgeProbability(BB, Succ);
+  auto BBFreq = MBFI->getBlockFreq(BB);
+  auto SuccFreq = MBFI->getBlockFreq(Succ);
+  BlockFrequency P = BBFreq * PProb;
+  BlockFrequency Qout = BBFreq * QProb;
+  uint64_t EntryFreq = MBFI->getEntryFreq();
+  // If there are no more successors, it is profitable to copy, as it strictly
+  // increases fallthrough.
+  if (SuccSuccs.size() == 0)
+    return greaterWithBias(P, Qout, EntryFreq);
+
+  auto BestSuccSucc = BranchProbability::getZero();
+  // Find the PDom or the best Succ if no PDom exists.
+  for (MachineBasicBlock *SuccSucc : SuccSuccs) {
+    auto Prob = MBPI->getEdgeProbability(Succ, SuccSucc);
+    if (Prob > BestSuccSucc)
+      BestSuccSucc = Prob;
+    if (PDom == nullptr)
+      if (MPDT->dominates(SuccSucc, Succ)) {
+        PDom = SuccSucc;
+        break;
+      }
+  }
+  // For the comparisons, we need to know Succ's best incoming edge that isn't
+  // from BB.
+  auto SuccBestPred = BlockFrequency(0);
+  for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
+    if (SuccPred == Succ || SuccPred == BB
+        || BlockToChain[SuccPred] == &Chain
+        || (BlockFilter && !BlockFilter->count(SuccPred)))
+      continue;
+    auto Freq = MBFI->getBlockFreq(SuccPred)
+        * MBPI->getEdgeProbability(SuccPred, Succ);
+    if (Freq > SuccBestPred)
+      SuccBestPred = Freq;
+  }
+  // Qin is Succ's best unplaced incoming edge that isn't BB.
+  BlockFrequency Qin = SuccBestPred;
+  // If it doesn't have a post-dominating successor, here is the calculation:
+  //    BB        BB
+  //    | \Qout   |  \
+  //   P|  C      |   =
+  //    =   C'    |    C
+  //    |  /Qin   |     |
+  //    | /       |     C' (+Succ)
+  //    Succ      Succ /|
+  //    / \       |  \/ |
+  //  U/   =V     |  == |
+  //  /     \     | /  \|
+  //  D      E    D     E
+  //  '=' : Branch taken for that CFG edge
+  //  Cost in the first case is: P + V
+  //  For this calculation, we always assume P > Qout. If Qout > P,
+  //  the result of this function will be ignored at the caller.
+  //  Let F = SuccFreq - Qin
+  //  Cost in the second case is: Qout + min(Qin, F) * U + max(Qin, F) * V
+
+  if (PDom == nullptr || !Succ->isSuccessor(PDom)) {
+    BranchProbability UProb = BestSuccSucc;
+    BranchProbability VProb = AdjustedSuccSumProb - UProb;
+    BlockFrequency F = SuccFreq - Qin;
+    BlockFrequency V = SuccFreq * VProb;
+    BlockFrequency QinU = std::min(Qin, F) * UProb;
+    BlockFrequency BaseCost = P + V;
+    BlockFrequency DupCost = Qout + QinU + std::max(Qin, F) * VProb;
+    return greaterWithBias(BaseCost, DupCost, EntryFreq);
+  }
+  BranchProbability UProb = MBPI->getEdgeProbability(Succ, PDom);
+  BranchProbability VProb = AdjustedSuccSumProb - UProb;
+  BlockFrequency U = SuccFreq * UProb;
+  BlockFrequency V = SuccFreq * VProb;
+  BlockFrequency F = SuccFreq - Qin;
+  // If there is a post-dominating successor, here is the calculation:
+  // BB         BB                 BB          BB
+  // | \Qout    |   \               | \Qout     |  \
+  // |P C       |    =              |P C        |   =
+  // =   C'     |P    C             =   C'      |P   C
+  // |  /Qin    |      |            |  /Qin     |     |
+  // | /        |      C' (+Succ)   | /         |     C' (+Succ)
+  // Succ       Succ  /|            Succ        Succ /|
+  // | \  V     |   \/ |            | \  V      |  \/ |
+  // |U \       |U  /\ =?           |U =        |U /\ |
+  // =   D      = =  =?|            |   D       | =  =|
+  // |  /       |/     D            |  /        |/    D
+  // | /        |     /             | =         |    /
+  // |/         |    /              |/          |   =
+  // Dom         Dom                Dom         Dom
+  //  '=' : Branch taken for that CFG edge
+  // The cost for taken branches in the first case is P + U
+  // Let F = SuccFreq - Qin
+  // The cost in the second case (assuming independence), given the layout:
+  // BB, Succ, (C+Succ), D, Dom or the layout:
+  // BB, Succ, D, Dom, (C+Succ)
+  // is Qout + max(F, Qin) * U + min(F, Qin)
+  // compare P + U vs Qout + P * U + Qin.
+  //
+  // The 3rd and 4th cases cover when Dom would be chosen to follow Succ.
+  //
+  // For the 3rd case, the cost is P + 2 * V
+  // For the 4th case, the cost is Qout + min(Qin, F) * U + max(Qin, F) * V + V
+  // We choose 4 over 3 when (P + V) > Qout + min(Qin, F) * U + max(Qin, F) * V
+  if (UProb > AdjustedSuccSumProb / 2 &&
+      !hasBetterLayoutPredecessor(Succ, PDom, *BlockToChain[PDom], UProb, UProb,
+                                  Chain, BlockFilter))
+    // Cases 3 & 4
+    return greaterWithBias(
+        (P + V), (Qout + std::max(Qin, F) * VProb + std::min(Qin, F) * UProb),
+        EntryFreq);
+  // Cases 1 & 2
+  return greaterWithBias((P + U),
+                         (Qout + std::min(Qin, F) * AdjustedSuccSumProb +
+                          std::max(Qin, F) * UProb),
+                         EntryFreq);
+}
+
+/// Check for a trellis layout. \p BB is the upper part of a trellis if its
+/// successors form the lower part of a trellis. A successor set S forms the
+/// lower part of a trellis if all of the predecessors of S are either in S or
+/// have all of S as successors. We ignore trellises where BB doesn't have 2
+/// successors because for fewer than 2, it's trivial, and for 3 or greater they
+/// are very uncommon and complex to compute optimally. Allowing edges within S
+/// is not strictly a trellis, but the same algorithm works, so we allow it.
+bool MachineBlockPlacement::isTrellis(
+    const MachineBasicBlock *BB,
+    const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+    const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+  // Technically BB could form a trellis with branching factor higher than 2.
+  // But that's extremely uncommon.
+  if (BB->succ_size() != 2 || ViableSuccs.size() != 2)
     return false;
-  // If we outline optional branches, look whether Succ is unavoidable, i.e.
-  // dominates all terminators of the MachineFunction. If it does, other
-  // successors must be optional. Don't do this for cold branches.
-  if (SuccProb > HotProb.getCompl() && UnavoidableBlocks.count(Succ) > 0) {
-    for (MachineBasicBlock *Pred : Succ->predecessors()) {
-      // Check whether there is an unplaced optional branch.
-      if (Pred == Succ || (BlockFilter && !BlockFilter->count(Pred)) ||
-          BlockToChain[Pred] == &Chain)
+
+  SmallPtrSet<const MachineBasicBlock *, 2> Successors(BB->succ_begin(),
+                                                       BB->succ_end());
+  // Predecessors already checked, to avoid reviewing the same one twice.
+  SmallPtrSet<const MachineBasicBlock *, 8> SeenPreds;
+
+  for (MachineBasicBlock *Succ : ViableSuccs) {
+    int PredCount = 0;
+    for (auto SuccPred : Succ->predecessors()) {
+      // Allow triangle successors, but don't count them.
+      if (Successors.count(SuccPred)) {
+        // Make sure that it is actually a triangle.
+        for (MachineBasicBlock *CheckSucc : SuccPred->successors())
+          if (!Successors.count(CheckSucc))
+            return false;
+        continue;
+      }
+      const BlockChain *PredChain = BlockToChain[SuccPred];
+      if (SuccPred == BB || (BlockFilter && !BlockFilter->count(SuccPred)) ||
+          PredChain == &Chain || PredChain == BlockToChain[Succ])
         continue;
-      // Check whether the optional branch has exactly one BB.
-      if (Pred->pred_size() > 1 || *Pred->pred_begin() != BB)
+      ++PredCount;
+      // Perform the successor check only once.
+      if (!SeenPreds.insert(SuccPred).second)
         continue;
-      // Check whether the optional branch is small.
-      if (Pred->size() < OutlineOptionalThreshold)
+      if (!hasSameSuccessors(*SuccPred, Successors))
         return false;
     }
-    return true;
-  } else
+    // If one of the successors has only BB as a predecessor, it is not a
+    // trellis.
+    if (PredCount < 1)
+      return false;
+  }
+  return true;
+}
+
+/// Pick the highest total weight pair of edges that can both be laid out.
+/// The edges in \p Edges[0] are assumed to have a different destination than
+/// the edges in \p Edges[1]. Simple counting shows that the best pair is either
+/// the individual highest weight edges to the 2 different destinations, or in
+/// case of a conflict, one of them should be replaced with a 2nd best edge.
+std::pair<MachineBlockPlacement::WeightedEdge,
+          MachineBlockPlacement::WeightedEdge>
+MachineBlockPlacement::getBestNonConflictingEdges(
+    const MachineBasicBlock *BB,
+    MutableArrayRef<SmallVector<MachineBlockPlacement::WeightedEdge, 8>>
+        Edges) {
+  // Sort the edges, and then for each successor, find the best incoming
+  // predecessor. If the best incoming predecessors aren't the same,
+  // then that is clearly the best layout. If there is a conflict, one of the
+  // successors will have to fallthrough from the second best predecessor. We
+  // compare which combination is better overall.
+
+  // Sort for highest frequency.
+  auto Cmp = [](WeightedEdge A, WeightedEdge B) { return A.Weight > B.Weight; };
+
+  std::stable_sort(Edges[0].begin(), Edges[0].end(), Cmp);
+  std::stable_sort(Edges[1].begin(), Edges[1].end(), Cmp);
+  auto BestA = Edges[0].begin();
+  auto BestB = Edges[1].begin();
+  // Arrange for the correct answer to be in BestA and BestB.
+  // If the 2 best edges don't conflict, the answer is already there.
+  if (BestA->Src == BestB->Src) {
+    // Compare (Best + Second Best) both ways; assumes each list has 2+ edges.
+    auto SecondBestA = std::next(BestA);
+    auto SecondBestB = std::next(BestB);
+    BlockFrequency BestAScore = BestA->Weight + SecondBestB->Weight;
+    BlockFrequency BestBScore = BestB->Weight + SecondBestA->Weight;
+    if (BestAScore < BestBScore)
+      BestA = SecondBestA;
+    else
+      BestB = SecondBestB;
+  }
+  // Arrange for the BB edge to be in BestA if it exists.
+  if (BestB->Src == BB)
+    std::swap(BestA, BestB);
+  return std::make_pair(*BestA, *BestB);
+}
+
+/// Get the best successor from \p BB based on \p BB being part of a trellis.
+/// We only handle trellises with 2 successors, so the algorithm is
+/// straightforward: Find the best pair of edges that don't conflict. We find
+/// the best incoming edge for each successor in the trellis. If those conflict,
+/// we consider which of them should be replaced with the second best.
+/// \returns the chosen successor and whether it should be tail-duplicated;
+/// the block is null if \p BB owns neither of the two best edges.
+MachineBlockPlacement::BlockAndTailDupResult
+MachineBlockPlacement::getBestTrellisSuccessor(
+    const MachineBasicBlock *BB,
+    const SmallVectorImpl<MachineBasicBlock *> &ViableSuccs,
+    BranchProbability AdjustedSumProb, const BlockChain &Chain,
+    const BlockFilterSet *BlockFilter) {
+
+  BlockAndTailDupResult Result = {nullptr, false};
+  SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
+                                                       BB->succ_end());
+
+  // We assume size 2 because it's common. For general n, we would have to do
+  // the Hungarian algorithm, but it's not worth the complexity because more
+  // than 2 successors is fairly uncommon, and a trellis even more so.
+  if (Successors.size() != 2 || ViableSuccs.size() != 2)
+    return Result;
+
+  // Collect the edge frequencies of all edges that form the trellis.
+  SmallVector<WeightedEdge, 8> Edges[2];
+  int SuccIndex = 0;
+  for (auto Succ : ViableSuccs) {
+    for (MachineBasicBlock *SuccPred : Succ->predecessors()) {
+      // Skip any placed predecessors that are not BB.
+      if (SuccPred != BB)
+        if ((BlockFilter && !BlockFilter->count(SuccPred)) ||
+            BlockToChain[SuccPred] == &Chain ||
+            BlockToChain[SuccPred] == BlockToChain[Succ])
+          continue;
+      BlockFrequency EdgeFreq = MBFI->getBlockFreq(SuccPred) *
+                                MBPI->getEdgeProbability(SuccPred, Succ);
+      Edges[SuccIndex].push_back({EdgeFreq, SuccPred, Succ});
+    }
+    ++SuccIndex;
+  }
+
+  // Pick the best combination of 2 edges from all the edges in the trellis.
+  WeightedEdge BestA, BestB;
+  std::tie(BestA, BestB) = getBestNonConflictingEdges(BB, Edges);
+
+  if (BestA.Src != BB) {
+    // If we have a trellis, and BB doesn't have the best fallthrough edges,
+    // we shouldn't choose any successor. We've already looked and there's a
+    // better fallthrough edge for all the successors.
+    DEBUG(dbgs() << "Trellis, but not one of the chosen edges.\n");
+    return Result;
+  }
+
+  // Did we pick the triangle edge? If tail-duplication is profitable, do
+  // that instead. Otherwise merge the triangle edge now while we know it is
+  // optimal.
+  if (BestA.Dest == BestB.Src) {
+    // The edges are BB->Succ1->Succ2, and we're looking to see if BB->Succ2
+    // would be better.
+    MachineBasicBlock *Succ1 = BestA.Dest;
+    MachineBasicBlock *Succ2 = BestB.Dest;
+    // Check to see if tail-duplication would be profitable.
+    if (TailDupPlacement && shouldTailDuplicate(Succ2) &&
+        canTailDuplicateUnplacedPreds(BB, Succ2, Chain, BlockFilter) &&
+        isProfitableToTailDup(BB, Succ2, MBPI->getEdgeProbability(BB, Succ1),
+                              Chain, BlockFilter)) {
+      DEBUG(BranchProbability Succ2Prob = getAdjustedProbability(
+                MBPI->getEdgeProbability(BB, Succ2), AdjustedSumProb);
+            dbgs() << "    Selected: " << getBlockName(Succ2)
+                   << ", probability: " << Succ2Prob << " (Tail Duplicate)\n");
+      Result.BB = Succ2;
+      Result.ShouldTailDup = true;
+      return Result;
+    }
+  }
+  // We have already computed the optimal edge for the other side of the
+  // trellis.
+  ComputedEdges[BestB.Src] = { BestB.Dest, false };
+
+  auto TrellisSucc = BestA.Dest;
+  DEBUG(BranchProbability SuccProb = getAdjustedProbability(
+            MBPI->getEdgeProbability(BB, TrellisSucc), AdjustedSumProb);
+        dbgs() << "    Selected: " << getBlockName(TrellisSucc)
+               << ", probability: " << SuccProb << " (Trellis)\n");
+  Result.BB = TrellisSucc;
+  return Result;
+}
+
+/// When the option TailDupPlacement is on, this method checks if the
+/// fallthrough candidate block \p Succ (of block \p BB) can be tail-duplicated
+/// into all of its unplaced, unfiltered predecessors, that are not BB.
+bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
+    const MachineBasicBlock *BB, MachineBasicBlock *Succ,
+    const BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+  if (!shouldTailDuplicate(Succ))
     return false;
+
+  // For CFG checking.
+  SmallPtrSet<const MachineBasicBlock *, 4> Successors(BB->succ_begin(),
+                                                       BB->succ_end());
+  for (MachineBasicBlock *Pred : Succ->predecessors()) {
+    // Make sure all unplaced and unfiltered predecessors can be
+    // tail-duplicated into.
+    // Skip any blocks that are already placed or not in this loop.
+    if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred))
+        || BlockToChain[Pred] == &Chain)
+      continue;
+    if (!TailDup.canTailDuplicate(Succ, Pred)) {
+      if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors))
+        // This will result in a trellis after tail duplication, so we don't
+        // need to copy Succ into this predecessor. In the presence
+        // of a trellis, tail duplication can continue to be profitable.
+        // For example:
+        // A            A
+        // |\           |\
+        // | \          | \
+        // |  C         |  C+BB
+        // | /          |  |
+        // |/           |  |
+        // BB    =>     BB |
+        // |\           |\/|
+        // | \          |/\|
+        // |  D         |  D
+        // | /          | /
+        // |/           |/
+        // Succ         Succ
+        //
+        // After BB was duplicated into C, the layout looks like the one on the
+        // right. BB and C now have the same successors. When considering
+        // whether Succ can be duplicated into all its unplaced predecessors, we
+        // ignore C.
+        // We can do this because C already has a profitable fallthrough, namely
+        // D. TODO(iteratee): ignore sufficiently cold predecessors for
+        // duplication and for this test.
+        //
+        // This allows trellises to be laid out in 2 separate chains
+        // (A,BB,Succ,...) and later (C,D,...). This is a reasonable heuristic
+        // because it allows the creation of 2 fallthrough paths with links
+        // between them, and we correctly identify the best layout for these
+        // CFGs. We want to extend trellises that the user created in addition
+        // to trellises created by tail-duplication, so we just look for the
+        // CFG.
+        continue;
+      return false;
+    }
+  }
+  return true;
+}
+
+/// Find chains of triangles where we believe it would be profitable to
+/// tail-duplicate them all, but a local analysis would not find them.
+/// There are 3 ways this can be profitable:
+/// 1) The post-dominators marked 50% are actually taken 55% (This shrinks with
+///    longer chains)
+/// 2) The chains are statically correlated. Branch probabilities have a very
+///    U-shaped distribution.
+///    [http://nrs.harvard.edu/urn-3:HUL.InstRepos:24015805]
+///    If the branches in a chain are likely to be from the same side of the
+///    distribution as their predecessor, but are independent at runtime, this
+///    transformation is profitable. (Because the cost of being wrong is a small
+///    fixed cost, unlike the standard triangle layout where the cost of being
+///    wrong scales with the # of triangles.)
+/// 3) The chains are dynamically correlated. If the probability that a previous
+///    branch was taken positively influences whether the next branch will be
+///    taken.
+/// We believe that 2 and 3 are common enough to justify the small margin in 1.
+void MachineBlockPlacement::precomputeTriangleChains() {
+  struct TriangleChain {
+    std::vector<MachineBasicBlock *> Edges;
+    TriangleChain(MachineBasicBlock *src, MachineBasicBlock *dst)
+        : Edges({src, dst}) {}
+
+    void append(MachineBasicBlock *dst) {
+      assert(getKey()->isSuccessor(dst) &&
+             "Attempting to append a block that is not a successor.");
+      Edges.push_back(dst);
+    }
+
+    unsigned count() const { return Edges.size() - 1; }
+
+    MachineBasicBlock *getKey() const {
+      return Edges.back();
+    }
+  };
+
+  if (TriangleChainCount == 0)
+    return;
+
+  DEBUG(dbgs() << "Pre-computing triangle chains.\n");
+  // Map from last block to the chain that contains it. This allows us to extend
+  // chains as we find new triangles.
+  DenseMap<const MachineBasicBlock *, TriangleChain> TriangleChainMap;
+  for (MachineBasicBlock &BB : *F) {
+    // If BB doesn't have 2 successors, it doesn't start a triangle.
+    if (BB.succ_size() != 2)
+      continue;
+    MachineBasicBlock *PDom = nullptr;
+    for (MachineBasicBlock *Succ : BB.successors()) {
+      if (!MPDT->dominates(Succ, &BB))
+        continue;
+      PDom = Succ;
+      break;
+    }
+    // If BB doesn't have a post-dominating successor, it doesn't form a
+    // triangle.
+    if (PDom == nullptr)
+      continue;
+    // If PDom has a hint that it is low probability, skip this triangle.
+    if (MBPI->getEdgeProbability(&BB, PDom) < BranchProbability(50, 100))
+      continue;
+    // If PDom isn't eligible for duplication, this isn't the kind of triangle
+    // we're looking for.
+    if (!shouldTailDuplicate(PDom))
+      continue;
+    bool CanTailDuplicate = true;
+    // If PDom can't tail-duplicate into its non-BB predecessors, then this
+    // isn't the kind of triangle we're looking for.
+    for (MachineBasicBlock* Pred : PDom->predecessors()) {
+      if (Pred == &BB)
+        continue;
+      if (!TailDup.canTailDuplicate(PDom, Pred)) {
+        CanTailDuplicate = false;
+        break;
+      }
+    }
+    // If we can't tail-duplicate PDom to its predecessors, then skip this
+    // triangle.
+    if (!CanTailDuplicate)
+      continue;
+
+    // Now we have an interesting triangle. Insert it if it's not part of an
+    // existing chain.
+    // Note: This cannot be replaced with a call to insert() or emplace()
+    // because the find key is BB, but the insert/emplace key is PDom.
+    auto Found = TriangleChainMap.find(&BB);
+    // If it is, remove the chain from the map, grow it, and put it back in the
+    // map with the end as the new key.
+    if (Found != TriangleChainMap.end()) {
+      TriangleChain Chain = std::move(Found->second);
+      TriangleChainMap.erase(Found);
+      Chain.append(PDom);
+      TriangleChainMap.insert(std::make_pair(Chain.getKey(), std::move(Chain)));
+    } else {
+      auto InsertResult = TriangleChainMap.try_emplace(PDom, &BB, PDom);
+      assert(InsertResult.second && "Block seen twice.");
+      (void)InsertResult;
+    }
+  }
+
+  for (auto &ChainPair : TriangleChainMap) {
+    TriangleChain &Chain = ChainPair.second;
+    // Benchmarking has shown that due to branch correlation duplicating 2 or
+    // more triangles is profitable, despite the calculations assuming
+    // independence.
+    if (Chain.count() < TriangleChainCount)
+      continue;
+    MachineBasicBlock *dst = Chain.Edges.back();
+    Chain.Edges.pop_back();
+    for (MachineBasicBlock *src : reverse(Chain.Edges)) {
+      DEBUG(dbgs() << "Marking edge: " << getBlockName(src) << "->" <<
+            getBlockName(dst) << " as pre-computed based on triangles.\n");
+
+      auto InsertResult = ComputedEdges.insert({src, {dst, true}});
+      assert(InsertResult.second && "Block seen twice.");
+      (void)InsertResult;
+
+      dst = src;
+    }
+  }
 }
 
 // When profile is not present, return the StaticLikelyProb.
 // When profile is available, we need to handle the triangle-shape CFG.
 static BranchProbability getLayoutSuccessorProbThreshold(
-      MachineBasicBlock *BB) {
+      const MachineBasicBlock *BB) {
   if (!BB->getParent()->getFunction()->getEntryCount())
     return BranchProbability(StaticLikelyProb, 100);
   if (BB->succ_size() == 2) {
@@ -615,11 +1199,11 @@ static BranchProbability getLayoutSuccessorProbThreshold(
     if (Succ1->isSuccessor(Succ2) || Succ2->isSuccessor(Succ1)) {
       /* See case 1 below for the cost analysis. For BB->Succ to
        * be taken with smaller cost, the following needs to hold:
-       *   Prob(BB->Succ) > 2* Prob(BB->Pred)
-       *   So the threshold T
-       *   T = 2 * (1-Prob(BB->Pred). Since T + Prob(BB->Pred) == 1,
-       * We have  T + T/2 = 1, i.e. T = 2/3. Also adding user specified
-       * branch bias, we have
+       *   Prob(BB->Succ) > 2 * Prob(BB->Pred)
+       *   So the threshold T in the calculation below
+       *   (1-T) * Prob(BB->Succ) > T * Prob(BB->Pred)
+       *   So T / (1 - T) = 2, Yielding T = 2/3
+       * Also adding user specified branch bias, we have
        *   T = (2/3)*(ProfileLikelyProb/50)
        *     = (2*ProfileLikelyProb)/150)
        */
@@ -631,10 +1215,17 @@ static BranchProbability getLayoutSuccessorProbThreshold(
 
 /// Checks to see if the layout candidate block \p Succ has a better layout
 /// predecessor than \c BB. If yes, returns true.
+/// \p SuccProb: The probability adjusted for only remaining blocks.
+///   Only used for logging.
+/// \p RealSuccProb: The un-adjusted probability.
+/// \p Chain: The chain that BB belongs to and Succ is being considered for.
+/// \p BlockFilter: if non-null, the set of blocks that make up the loop being
+///    considered.
 bool MachineBlockPlacement::hasBetterLayoutPredecessor(
-    MachineBasicBlock *BB, MachineBasicBlock *Succ, BlockChain &SuccChain,
-    BranchProbability SuccProb, BranchProbability RealSuccProb,
-    BlockChain &Chain, const BlockFilterSet *BlockFilter) {
+    const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
+    const BlockChain &SuccChain, BranchProbability SuccProb,
+    BranchProbability RealSuccProb, const BlockChain &Chain,
+    const BlockFilterSet *BlockFilter) {
 
   // There isn't a better layout when there are no unscheduled predecessors.
   if (SuccChain.UnscheduledPredecessors == 0)
@@ -740,11 +1331,12 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
   //  |  Pred----|                     |  S1----
   //  |  |                             |       |
   //  --(S1 or S2)                     ---Pred--
+  //                                        |
+  //                                       S2
   //
   // topo-cost = freq(S->Pred) + freq(BB->S1) + freq(BB->S2)
   //    + min(freq(Pred->S1), freq(Pred->S2))
   // Non-topo-order cost:
-  // In the worst case, S2 will not get laid out after Pred.
   // non-topo-cost = 2 * freq(S->Pred) + freq(BB->S2).
   // To be conservative, we can assume that min(freq(Pred->S1), freq(Pred->S2))
   // is 0. Then the non topo layout is better when
@@ -762,13 +1354,15 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
   for (MachineBasicBlock *Pred : Succ->predecessors()) {
     if (Pred == Succ || BlockToChain[Pred] == &SuccChain ||
         (BlockFilter && !BlockFilter->count(Pred)) ||
-        BlockToChain[Pred] == &Chain)
+        BlockToChain[Pred] == &Chain ||
+        // This check is redundant except for lookahead. This function is
+        // called for lookahead by isProfitableToTailDup when BB hasn't been
+        // placed yet.
+        (Pred == BB))
       continue;
     // Do backward checking.
     // For all cases above, we need a backward checking to filter out edges that
-    // are not 'strongly' biased. With profile data available, the check is
-    // mostly redundant for case 2 (when threshold prob is set at 50%) unless S
-    // has more than two successors.
+    // are not 'strongly' biased.
     // BB  Pred
     //  \ /
     //  Succ
@@ -804,14 +1398,15 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor(
 /// breaking CFG structure, but cave and break such structures in the case of
 /// very hot successor edges.
 ///
-/// \returns The best successor block found, or null if none are viable.
-MachineBasicBlock *
-MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
-                                           BlockChain &Chain,
-                                           const BlockFilterSet *BlockFilter) {
+/// \returns The best successor block found, or null if none are viable, along
+/// with a boolean indicating if tail duplication is necessary.
+MachineBlockPlacement::BlockAndTailDupResult
+MachineBlockPlacement::selectBestSuccessor(
+    const MachineBasicBlock *BB, const BlockChain &Chain,
+    const BlockFilterSet *BlockFilter) {
   const BranchProbability HotProb(StaticLikelyProb, 100);
 
-  MachineBasicBlock *BestSucc = nullptr;
+  BlockAndTailDupResult BestSucc = { nullptr, false };
   auto BestProb = BranchProbability::getZero();
 
   SmallVector<MachineBasicBlock *, 4> Successors;
@@ -819,22 +1414,45 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
       collectViableSuccessors(BB, Chain, BlockFilter, Successors);
 
   DEBUG(dbgs() << "Selecting best successor for: " << getBlockName(BB) << "\n");
+
+  // Use BB's precomputed successor if still valid; copy it out before erasing.
+  auto FoundEdge = ComputedEdges.find(BB);
+  if (FoundEdge != ComputedEdges.end()) {
+    BlockAndTailDupResult Precomputed = FoundEdge->second;
+    ComputedEdges.erase(FoundEdge);
+    MachineBasicBlock *Succ = Precomputed.BB;
+    BlockChain *SuccChain = BlockToChain[Succ];
+    if (BB->isSuccessor(Succ) && (!BlockFilter || BlockFilter->count(Succ)) &&
+        SuccChain != &Chain && Succ == *SuccChain->begin())
+      return Precomputed;
+  }
+
+  // If BB is part of a trellis, use the trellis to determine the optimal
+  // fallthrough edges.
+  if (isTrellis(BB, Successors, Chain, BlockFilter))
+    return getBestTrellisSuccessor(BB, Successors, AdjustedSumProb, Chain,
+                                   BlockFilter);
+
+  // For blocks with CFG violations, we may be able to lay them out anyway with
+  // tail-duplication. We keep this vector so we can perform the probability
+  // calculations the minimum number of times.
+  SmallVector<std::tuple<BranchProbability, MachineBasicBlock *>, 4>
+      DupCandidates;
   for (MachineBasicBlock *Succ : Successors) {
     auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
     BranchProbability SuccProb =
         getAdjustedProbability(RealSuccProb, AdjustedSumProb);
 
-    // This heuristic is off by default.
-    if (shouldPredBlockBeOutlined(BB, Succ, Chain, BlockFilter, SuccProb,
-                                  HotProb))
-      return Succ;
-
     BlockChain &SuccChain = *BlockToChain[Succ];
     // Skip the edge \c BB->Succ if block \c Succ has a better layout
     // predecessor that yields lower global cost.
     if (hasBetterLayoutPredecessor(BB, Succ, SuccChain, SuccProb, RealSuccProb,
-                                   Chain, BlockFilter))
+                                   Chain, BlockFilter)) {
+      // If tail duplication would make Succ profitable, place it.
+      if (TailDupPlacement && shouldTailDuplicate(Succ))
+        DupCandidates.push_back(std::make_tuple(SuccProb, Succ));
       continue;
+    }
 
     DEBUG(
         dbgs() << "    Candidate: " << getBlockName(Succ) << ", probability: "
@@ -842,17 +1460,48 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
                << (SuccChain.UnscheduledPredecessors != 0 ? " (CFG break)" : "")
                << "\n");
 
-    if (BestSucc && BestProb >= SuccProb) {
+    if (BestSucc.BB && BestProb >= SuccProb) {
       DEBUG(dbgs() << "    Not the best candidate, continuing\n");
       continue;
     }
 
     DEBUG(dbgs() << "    Setting it as best candidate\n");
-    BestSucc = Succ;
+    BestSucc.BB = Succ;
     BestProb = SuccProb;
   }
-  if (BestSucc)
-    DEBUG(dbgs() << "    Selected: " << getBlockName(BestSucc) << "\n");
+  // Handle the tail duplication candidates in order of decreasing probability.
+  // Stop at the first one that is profitable. Also stop if they are less
+  // profitable than BestSucc. Position is important because we preserve it and
+  // prefer first best match. Here we aren't comparing in order, so we capture
+  // the position instead.
+  if (DupCandidates.size() != 0) {
+    auto cmp =
+        [](const std::tuple<BranchProbability, MachineBasicBlock *> &a,
+           const std::tuple<BranchProbability, MachineBasicBlock *> &b) {
+          return std::get<0>(a) > std::get<0>(b);
+        };
+    std::stable_sort(DupCandidates.begin(), DupCandidates.end(), cmp);
+  }
+  for(auto &Tup : DupCandidates) {
+    BranchProbability DupProb;
+    MachineBasicBlock *Succ;
+    std::tie(DupProb, Succ) = Tup;
+    if (DupProb < BestProb)
+      break;
+    if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter)
+        && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) {
+      DEBUG(
+          dbgs() << "    Candidate: " << getBlockName(Succ) << ", probability: "
+                 << DupProb
+                 << " (Tail Duplicate)\n");
+      BestSucc.BB = Succ;
+      BestSucc.ShouldTailDup = true;
+      break;
+    }
+  }
+
+  if (BestSucc.BB)
+    DEBUG(dbgs() << "    Selected: " << getBlockName(BestSucc.BB) << "\n");
 
   return BestSucc;
 }
@@ -868,7 +1517,7 @@ MachineBlockPlacement::selectBestSuccessor(MachineBasicBlock *BB,
 ///
 /// \returns The best block found, or null if none are viable.
 MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock(
-    BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
+    const BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList) {
   // Once we need to walk the worklist looking for a candidate, cleanup the
   // worklist of already placed entries.
   // FIXME: If this shows up on profiles, it could be folded (at the cost of
@@ -954,7 +1603,7 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock(
 }
 
 void MachineBlockPlacement::fillWorkLists(
-    MachineBasicBlock *MBB,
+    const MachineBasicBlock *MBB,
     SmallPtrSetImpl<BlockChain *> &UpdatedPreds,
     const BlockFilterSet *BlockFilter = nullptr) {
   BlockChain &Chain = *BlockToChain[MBB];
@@ -976,23 +1625,23 @@ void MachineBlockPlacement::fillWorkLists(
   if (Chain.UnscheduledPredecessors != 0)
     return;
 
-  MBB = *Chain.begin();
-  if (MBB->isEHPad())
-    EHPadWorkList.push_back(MBB);
+  MachineBasicBlock *BB = *Chain.begin();
+  if (BB->isEHPad())
+    EHPadWorkList.push_back(BB);
   else
-    BlockWorkList.push_back(MBB);
+    BlockWorkList.push_back(BB);
 }
 
 void MachineBlockPlacement::buildChain(
-    MachineBasicBlock *BB, BlockChain &Chain,
+    const MachineBasicBlock *HeadBB, BlockChain &Chain,
     BlockFilterSet *BlockFilter) {
-  assert(BB && "BB must not be null.\n");
-  assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match.\n");
+  assert(HeadBB && "BB must not be null.\n");
+  assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n");
   MachineFunction::iterator PrevUnplacedBlockIt = F->begin();
 
-  MachineBasicBlock *LoopHeaderBB = BB;
+  const MachineBasicBlock *LoopHeaderBB = HeadBB;
   markChainSuccessors(Chain, LoopHeaderBB, BlockFilter);
-  BB = *std::prev(Chain.end());
+  MachineBasicBlock *BB = *std::prev(Chain.end());
   for (;;) {
     assert(BB && "null block found at end of chain in loop.");
     assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop.");
@@ -1001,7 +1650,11 @@ void MachineBlockPlacement::buildChain(
 
     // Look for the best viable successor if there is one to place immediately
     // after this block.
-    MachineBasicBlock *BestSucc = selectBestSuccessor(BB, Chain, BlockFilter);
+    auto Result = selectBestSuccessor(BB, Chain, BlockFilter);
+    MachineBasicBlock* BestSucc = Result.BB;
+    bool ShouldTailDup = Result.ShouldTailDup;
+    if (TailDupPlacement)
+      ShouldTailDup |= (BestSucc && shouldTailDuplicate(BestSucc));
 
     // If an immediate successor isn't available, look for the best viable
     // block among those we've identified as not violating the loop's CFG at
@@ -1022,7 +1675,7 @@ void MachineBlockPlacement::buildChain(
 
     // Placement may have changed tail duplication opportunities.
     // Check for that now.
-    if (TailDupPlacement && BestSucc) {
+    if (TailDupPlacement && BestSucc && ShouldTailDup) {
       // If the chosen successor was duplicated into all its predecessors,
       // don't bother laying it out, just go round the loop again with BB as
       // the chain end.
@@ -1058,7 +1711,7 @@ void MachineBlockPlacement::buildChain(
 /// unconditional jump (for the backedge) rotating it in front of the loop
 /// header is always profitable.
 MachineBasicBlock *
-MachineBlockPlacement::findBestLoopTop(MachineLoop &L,
+MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
                                        const BlockFilterSet &LoopBlockSet) {
   // Placing the latch block before the header may introduce an extra branch
   // that skips this block the first time the loop is executed, which we want
@@ -1122,7 +1775,7 @@ MachineBlockPlacement::findBestLoopTop(MachineLoop &L,
 /// block to layout at the top of the loop. Typically this is done to maximize
 /// fallthrough opportunities.
 MachineBasicBlock *
-MachineBlockPlacement::findBestLoopExit(MachineLoop &L,
+MachineBlockPlacement::findBestLoopExit(const MachineLoop &L,
                                         const BlockFilterSet &LoopBlockSet) {
   // We don't want to layout the loop linearly in all cases. If the loop header
   // is just a normal basic block in the loop, we want to look for what block
@@ -1241,7 +1894,7 @@ MachineBlockPlacement::findBestLoopExit(MachineLoop &L,
 /// branches. For example, if the loop has fallthrough into its header and out
 /// of its bottom already, don't rotate it.
 void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
-                                       MachineBasicBlock *ExitingBB,
+                                       const MachineBasicBlock *ExitingBB,
                                        const BlockFilterSet &LoopBlockSet) {
   if (!ExitingBB)
     return;
@@ -1291,7 +1944,8 @@ void MachineBlockPlacement::rotateLoop(BlockChain &LoopChain,
 ///  Therefore, the cost for a given rotation is the sum of costs listed above.
 ///  We select the best rotation with the smallest cost.
 void MachineBlockPlacement::rotateLoopWithProfile(
-    BlockChain &LoopChain, MachineLoop &L, const BlockFilterSet &LoopBlockSet) {
+    BlockChain &LoopChain, const MachineLoop &L,
+    const BlockFilterSet &LoopBlockSet) {
   auto HeaderBB = L.getHeader();
   auto HeaderIter = find(LoopChain, HeaderBB);
   auto RotationPos = LoopChain.end();
@@ -1428,7 +2082,7 @@ void MachineBlockPlacement::rotateLoopWithProfile(
 /// When profile data is available, exclude cold blocks from the returned set;
 /// otherwise, collect all blocks in the loop.
 MachineBlockPlacement::BlockFilterSet
-MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) {
+MachineBlockPlacement::collectLoopBlockSet(const MachineLoop &L) {
   BlockFilterSet LoopBlockSet;
 
   // Filter cold blocks off from LoopBlockSet when profile data is available.
@@ -1465,10 +2119,10 @@ MachineBlockPlacement::collectLoopBlockSet(MachineLoop &L) {
 /// as much as possible. We can then stitch the chains together in a way which
 /// both preserves the topological structure and minimizes taken conditional
 /// branches.
-void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {
+void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
   // First recurse through any nested loops, building chains for those inner
   // loops.
-  for (MachineLoop *InnerLoop : L)
+  for (const MachineLoop *InnerLoop : L)
     buildLoopChains(*InnerLoop);
 
   assert(BlockWorkList.empty());
@@ -1505,7 +2159,7 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {
   assert(LoopChain.UnscheduledPredecessors == 0);
   UpdatedPreds.insert(&LoopChain);
 
-  for (MachineBasicBlock *LoopBB : LoopBlockSet)
+  for (const MachineBasicBlock *LoopBB : LoopBlockSet)
     fillWorkLists(LoopBB, UpdatedPreds, &LoopBlockSet);
 
   buildChain(LoopTop, LoopChain, &LoopBlockSet);
@@ -1539,7 +2193,7 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {
 
     if (!LoopBlockSet.empty()) {
       BadLoop = true;
-      for (MachineBasicBlock *LoopBB : LoopBlockSet)
+      for (const MachineBasicBlock *LoopBB : LoopBlockSet)
         dbgs() << "Loop contains blocks never placed into a chain!\n"
                << "  Loop header:  " << getBlockName(*L.block_begin()) << "\n"
                << "  Chain header: " << getBlockName(*LoopChain.begin()) << "\n"
@@ -1552,31 +2206,6 @@ void MachineBlockPlacement::buildLoopChains(MachineLoop &L) {
   EHPadWorkList.clear();
 }
 
-/// When OutlineOpitonalBranches is on, this method collects BBs that
-/// dominates all terminator blocks of the function \p F.
-void MachineBlockPlacement::collectMustExecuteBBs() {
-  if (OutlineOptionalBranches) {
-    // Find the nearest common dominator of all of F's terminators.
-    MachineBasicBlock *Terminator = nullptr;
-    for (MachineBasicBlock &MBB : *F) {
-      if (MBB.succ_size() == 0) {
-        if (Terminator == nullptr)
-          Terminator = &MBB;
-        else
-          Terminator = MDT->findNearestCommonDominator(Terminator, &MBB);
-      }
-    }
-
-    // MBBs dominating this common dominator are unavoidable.
-    UnavoidableBlocks.clear();
-    for (MachineBasicBlock &MBB : *F) {
-      if (MDT->dominates(&MBB, Terminator)) {
-        UnavoidableBlocks.insert(&MBB);
-      }
-    }
-  }
-}
-
 void MachineBlockPlacement::buildCFGChains() {
   // Ensure that every BB in the function has an associated chain to simplify
   // the assumptions of the remaining algorithm.
@@ -1611,9 +2240,6 @@ void MachineBlockPlacement::buildCFGChains() {
     }
   }
 
-  // Turned on with OutlineOptionalBranches option
-  collectMustExecuteBBs();
-
   // Build any loop-based chains.
   PreferredLoopExit = nullptr;
   for (MachineLoop *L : *MLI)
@@ -1845,7 +2471,7 @@ void MachineBlockPlacement::alignBlocks() {
 /// @return true if \p BB was removed.
 bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
     MachineBasicBlock *BB, MachineBasicBlock *&LPred,
-    MachineBasicBlock *LoopHeaderBB,
+    const MachineBasicBlock *LoopHeaderBB,
     BlockChain &Chain, BlockFilterSet *BlockFilter,
     MachineFunction::iterator &PrevUnplacedBlockIt) {
   bool Removed, DuplicatedToLPred;
@@ -1907,21 +2533,16 @@ bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
 /// \return  - True if the block was duplicated into all preds and removed.
 bool MachineBlockPlacement::maybeTailDuplicateBlock(
     MachineBasicBlock *BB, MachineBasicBlock *LPred,
-    const BlockChain &Chain, BlockFilterSet *BlockFilter,
+    BlockChain &Chain, BlockFilterSet *BlockFilter,
     MachineFunction::iterator &PrevUnplacedBlockIt,
     bool &DuplicatedToLPred) {
-
   DuplicatedToLPred = false;
+  if (!shouldTailDuplicate(BB))
+    return false;
+
   DEBUG(dbgs() << "Redoing tail duplication for Succ#"
         << BB->getNumber() << "\n");
-  bool IsSimple = TailDup.isSimpleBB(BB);
-  // Blocks with single successors don't create additional fallthrough
-  // opportunities. Don't duplicate them. TODO: When conditional exits are
-  // analyzable, allow them to be duplicated.
-  if (!IsSimple && BB->succ_size() == 1)
-    return false;
-  if (!TailDup.shouldTailDuplicate(IsSimple, *BB))
-    return false;
+
   // This has to be a callback because none of it can be done after
   // BB is deleted.
   bool Removed = false;
@@ -1973,6 +2594,7 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(
       llvm::function_ref<void(MachineBasicBlock*)>(RemovalCallback);
 
   SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
+  bool IsSimple = TailDup.isSimpleBB(BB);
   TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,
                                  &DuplicatedPreds, &RemovalCallbackRef);
 
@@ -2012,21 +2634,24 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   MLI = &getAnalysis<MachineLoopInfo>();
   TII = MF.getSubtarget().getInstrInfo();
   TLI = MF.getSubtarget().getTargetLowering();
-  MDT = &getAnalysis<MachineDominatorTree>();
+  MPDT = nullptr;
 
   // Initialize PreferredLoopExit to nullptr here since it may never be set if
   // there are no MachineLoops.
   PreferredLoopExit = nullptr;
 
+  assert(BlockToChain.empty());
+  assert(ComputedEdges.empty());
+
   if (TailDupPlacement) {
-    unsigned TailDupSize = TailDuplicatePlacementThreshold;
+    MPDT = &getAnalysis<MachinePostDominatorTree>();
+    unsigned TailDupSize = TailDupPlacementThreshold;
     if (MF.getFunction()->optForSize())
       TailDupSize = 1;
     TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize);
+    precomputeTriangleChains();
   }
 
-  assert(BlockToChain.empty());
-
   buildCFGChains();
 
   // Changing the layout can create new tail merging opportunities.
@@ -2038,7 +2663,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
                          BranchFoldPlacement;
   // No tail merging opportunities if the block number is less than four.
   if (MF.size() > 3 && EnableTailMerge) {
-    unsigned TailMergeSize = TailDuplicatePlacementThreshold + 1;
+    unsigned TailMergeSize = TailDupPlacementThreshold + 1;
     BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
                     *MBPI, TailMergeSize);
 
@@ -2047,8 +2672,10 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
                             /*AfterBlockPlacement=*/true)) {
       // Redo the layout if tail merging creates/removes/moves blocks.
       BlockToChain.clear();
-      // Must redo the dominator tree if blocks were changed.
-      MDT->runOnMachineFunction(MF);
+      ComputedEdges.clear();
+      // Must redo the post-dominator tree if blocks were changed.
+      if (MPDT)
+        MPDT->runOnMachineFunction(MF);
       ChainAllocator.DestroyAll();
       buildCFGChains();
     }
@@ -2058,6 +2685,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   alignBlocks();
 
   BlockToChain.clear();
+  ComputedEdges.clear();
   ChainAllocator.DestroyAll();
 
   if (AlignAllBlock)
@@ -2073,13 +2701,11 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
         MBI->setAlignment(AlignAllNonFallThruBlocks);
     }
   }
-#ifndef NDEBUG
   if (ViewBlockLayoutWithBFI != GVDT_None &&
       (ViewBlockFreqFuncName.empty() ||
        F->getFunction()->getName().equals(ViewBlockFreqFuncName))) {
-    MBFI->view(false);
+    MBFI->view("MBP." + MF.getName(), false);
   }
-#endif
 
 
   // We always return true as we have no way to track whether the final order
diff --git a/lib/CodeGen/MachineCombiner.cpp b/lib/CodeGen/MachineCombiner.cpp
index 1ec03c0e0e10b3379ed7235e855d0600f13b2896..50e453e4067ccf9b631768c692b650cc9a3234f5 100644
--- a/lib/CodeGen/MachineCombiner.cpp
+++ b/lib/CodeGen/MachineCombiner.cpp
@@ -8,7 +8,7 @@
 //===----------------------------------------------------------------------===//
 //
 // The machine combiner pass uses machine trace metrics to ensure the combined
-// instructions does not lengthen the critical path or the resource depth.
+// instructions do not lengthen the critical path or the resource depth.
 //===----------------------------------------------------------------------===//
 
 #define DEBUG_TYPE "machine-combiner"
@@ -354,6 +354,19 @@ bool MachineCombiner::doSubstitute(unsigned NewSize, unsigned OldSize) {
   return false;
 }
 
+static void insertDeleteInstructions(MachineBasicBlock *MBB, MachineInstr &MI,
+                                     SmallVectorImpl<MachineInstr *> &InsInstrs,
+                                     SmallVectorImpl<MachineInstr *> &DelInstrs,
+                                     MachineTraceMetrics *Traces) {
+  for (auto *InstrPtr : InsInstrs) // New sequence is inserted before MI.
+    MBB->insert((MachineBasicBlock::iterator)&MI, InstrPtr);
+  for (auto *InstrPtr : DelInstrs) // Replaced instructions are erased.
+    InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();
+  ++NumInstCombined;
+  Traces->invalidate(MBB); // MBB was modified above.
+  Traces->verifyAnalysis();
+}
+
 /// Substitute a slow code sequence with a faster one by
 /// evaluating instruction combining pattern.
 /// The prototype of such a pattern is MUl + ADD -> MADD. Performs instruction
@@ -408,7 +421,6 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
       DenseMap<unsigned, unsigned> InstrIdxForVirtReg;
       if (!MinInstr)
         MinInstr = Traces->getEnsemble(MachineTraceMetrics::TS_MinInstrCount);
-      MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
       Traces->verifyAnalysis();
       TII->genAlternativeCodeSequence(MI, P, InsInstrs, DelInstrs,
                                       InstrIdxForVirtReg);
@@ -428,23 +440,23 @@ bool MachineCombiner::combineInstructions(MachineBasicBlock *MBB) {
       // fewer instructions OR
       // the new sequence neither lengthens the critical path nor increases
       // resource pressure.
-      if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount) ||
-          (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs,
-                                   DelInstrs, InstrIdxForVirtReg, P) &&
-           preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs))) {
-        for (auto *InstrPtr : InsInstrs)
-          MBB->insert((MachineBasicBlock::iterator) &MI, InstrPtr);
-        for (auto *InstrPtr : DelInstrs)
-          InstrPtr->eraseFromParentAndMarkDBGValuesForRemoval();
-
-        Changed = true;
-        ++NumInstCombined;
-
-        Traces->invalidate(MBB);
-        Traces->verifyAnalysis();
+      if (SubstituteAlways || doSubstitute(NewInstCount, OldInstCount)) {
+        insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces);
         // Eagerly stop after the first pattern fires.
+        Changed = true;
         break;
       } else {
+        // Calculating the trace metrics may be expensive,
+        // so only do this when necessary.
+        MachineTraceMetrics::Trace BlockTrace = MinInstr->getTrace(MBB);
+        if (improvesCriticalPathLen(MBB, &MI, BlockTrace, InsInstrs, DelInstrs,
+                                    InstrIdxForVirtReg, P) &&
+            preservesResourceLen(MBB, BlockTrace, InsInstrs, DelInstrs)) {
+          insertDeleteInstructions(MBB, MI, InsInstrs, DelInstrs, Traces);
+          // Eagerly stop after the first pattern fires.
+          Changed = true;
+          break;
+        }
         // Cleanup instructions of the alternative code sequence. There is no
         // use for them.
         MachineFunction *MF = MBB->getParent();
diff --git a/lib/CodeGen/MachineCopyPropagation.cpp b/lib/CodeGen/MachineCopyPropagation.cpp
index 92d043df26b85601dace8e432aea12c59f414ff1..7312dc5e94bddc6a6cf3511bbbb12121cc66bcf5 100644
--- a/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/lib/CodeGen/MachineCopyPropagation.cpp
@@ -61,6 +61,7 @@ namespace {
 
   private:
     void ClobberRegister(unsigned Reg);
+    void ReadRegister(unsigned Reg);
     void CopyPropagateBlock(MachineBasicBlock &MBB);
     bool eraseIfRedundant(MachineInstr &Copy, unsigned Src, unsigned Def);
 
@@ -120,6 +121,18 @@ void MachineCopyPropagation::ClobberRegister(unsigned Reg) {
   }
 }
 
+void MachineCopyPropagation::ReadRegister(unsigned Reg) {
+  // If 'Reg' is defined by a copy, the copy is no longer a candidate
+  // for elimination.
+  for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
+    Reg2MIMap::iterator CI = CopyMap.find(*AI);
+    if (CI != CopyMap.end()) {
+      DEBUG(dbgs() << "MCP: Copy is used - not dead: "; CI->second->dump());
+      MaybeDeadCopies.remove(CI->second);
+    }
+  }
+}
+
 /// Return true if \p PreviousCopy did copy register \p Src to register \p Def.
 /// This fact may have been obscured by sub register usage or may not be true at
 /// all even though Src and Def are subregisters of the registers used in
@@ -212,12 +225,14 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
 
       // If Src is defined by a previous copy, the previous copy cannot be
       // eliminated.
-      for (MCRegAliasIterator AI(Src, TRI, true); AI.isValid(); ++AI) {
-        Reg2MIMap::iterator CI = CopyMap.find(*AI);
-        if (CI != CopyMap.end()) {
-          DEBUG(dbgs() << "MCP: Copy is no longer dead: "; CI->second->dump());
-          MaybeDeadCopies.remove(CI->second);
-        }
+      ReadRegister(Src);
+      for (const MachineOperand &MO : MI->implicit_operands()) {
+        if (!MO.isReg() || !MO.readsReg())
+          continue;
+        unsigned Reg = MO.getReg();
+        if (!Reg)
+          continue;
+        ReadRegister(Reg);
       }
 
       DEBUG(dbgs() << "MCP: Copy is a deletion candidate: "; MI->dump());
@@ -234,6 +249,14 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
       // ...
       // %xmm2<def> = copy %xmm9
       ClobberRegister(Def);
+      for (const MachineOperand &MO : MI->implicit_operands()) {
+        if (!MO.isReg() || !MO.isDef())
+          continue;
+        unsigned Reg = MO.getReg();
+        if (!Reg)
+          continue;
+        ClobberRegister(Reg);
+      }
 
       // Remember Def is defined by the copy.
       for (MCSubRegIterator SR(Def, TRI, /*IncludeSelf=*/true); SR.isValid();
@@ -269,25 +292,8 @@ void MachineCopyPropagation::CopyPropagateBlock(MachineBasicBlock &MBB) {
       if (MO.isDef()) {
         Defs.push_back(Reg);
         continue;
-      }
-
-      // If 'Reg' is defined by a copy, the copy is no longer a candidate
-      // for elimination.
-      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
-        Reg2MIMap::iterator CI = CopyMap.find(*AI);
-        if (CI != CopyMap.end()) {
-          DEBUG(dbgs() << "MCP: Copy is used - not dead: "; CI->second->dump());
-          MaybeDeadCopies.remove(CI->second);
-        }
-      }
-      // Treat undef use like defs for copy propagation but not for
-      // dead copy. We would need to do a liveness check to be sure the copy
-      // is dead for undef uses.
-      // The backends are allowed to do whatever they want with undef value
-      // and we cannot be sure this register will not be rewritten to break
-      // some false dependencies for the hardware for instance.
-      if (MO.isUndef())
-        Defs.push_back(Reg);
+      } else if (MO.readsReg())
+        ReadRegister(Reg);
     }
 
     // The instruction has a register mask operand which means that it clobbers
diff --git a/lib/CodeGen/MachineDominators.cpp b/lib/CodeGen/MachineDominators.cpp
index 303a6a9263be7a51c81cc7ccc1c23386f40e48b0..e3a6c51c47ad55e71e3753b97781f820754bfc9c 100644
--- a/lib/CodeGen/MachineDominators.cpp
+++ b/lib/CodeGen/MachineDominators.cpp
@@ -49,32 +49,29 @@ void MachineDominatorTree::getAnalysisUsage(AnalysisUsage &AU) const {
 bool MachineDominatorTree::runOnMachineFunction(MachineFunction &F) {
   CriticalEdgesToSplit.clear();
   NewBBs.clear();
+  DT.reset(new DominatorTreeBase<MachineBasicBlock>(false));
   DT->recalculate(F);
-
   return false;
 }
 
 MachineDominatorTree::MachineDominatorTree()
     : MachineFunctionPass(ID) {
   initializeMachineDominatorTreePass(*PassRegistry::getPassRegistry());
-  DT = new DominatorTreeBase<MachineBasicBlock>(false);
-}
-
-MachineDominatorTree::~MachineDominatorTree() {
-  delete DT;
 }
 
 void MachineDominatorTree::releaseMemory() {
-  DT->releaseMemory();
+  CriticalEdgesToSplit.clear();
+  DT.reset(nullptr);
 }
 
 void MachineDominatorTree::verifyAnalysis() const {
-  if (VerifyMachineDomInfo)
+  if (DT && VerifyMachineDomInfo)
     verifyDomTree();
 }
 
 void MachineDominatorTree::print(raw_ostream &OS, const Module*) const {
-  DT->print(OS);
+  if (DT)
+    DT->print(OS);
 }
 
 void MachineDominatorTree::applySplitCriticalEdges() const {
@@ -143,15 +140,18 @@ void MachineDominatorTree::applySplitCriticalEdges() const {
 }
 
 void MachineDominatorTree::verifyDomTree() const {
+  if (!DT)
+    return;
   MachineFunction &F = *getRoot()->getParent();
 
-  MachineDominatorTree OtherDT;
-  OtherDT.DT->recalculate(F);
-  if (compare(OtherDT)) {
+  DominatorTreeBase<MachineBasicBlock> OtherDT(false);
+  OtherDT.recalculate(F);
+  if (getRootNode()->getBlock() != OtherDT.getRootNode()->getBlock() ||
+      DT->compare(OtherDT)) {
     errs() << "MachineDominatorTree is not up to date!\nComputed:\n";
-    print(errs(), nullptr);
+    DT->print(errs());
     errs() << "\nActual:\n";
-    OtherDT.print(errs(), nullptr);
+    OtherDT.print(errs());
     abort();
   }
 }
diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp
index 831e7d859bb2c8571d5f7ec539e5c8ffccddc33d..c9767a25e908dcf4cc5a84a115a2dc5ee1e6a4de 100644
--- a/lib/CodeGen/MachineFunction.cpp
+++ b/lib/CodeGen/MachineFunction.cpp
@@ -169,6 +169,7 @@ void MachineFunction::clear() {
   InstructionRecycler.clear(Allocator);
   OperandRecycler.clear(Allocator);
   BasicBlockRecycler.clear(Allocator);
+  VariableDbgInfos.clear();
   if (RegInfo) {
     RegInfo->~MachineRegisterInfo();
     Allocator.Deallocate(RegInfo);
@@ -859,7 +860,9 @@ BitVector MachineFrameInfo::getPristineRegs(const MachineFunction &MF) const {
   if (!isCalleeSavedInfoValid())
     return BV;
 
-  for (const MCPhysReg *CSR = TRI->getCalleeSavedRegs(&MF); CSR && *CSR; ++CSR)
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  for (const MCPhysReg *CSR = MRI.getCalleeSavedRegs(); CSR && *CSR;
+       ++CSR)
     BV.set(*CSR);
 
   // Saved CSRs are not pristine.
diff --git a/lib/CodeGen/MachineInstr.cpp b/lib/CodeGen/MachineInstr.cpp
index 011f6be77706055a1d457de94396a8e933a589b8..c0a8b95ed8a06dfccdb908b9d52fa1e9064caca2 100644
--- a/lib/CodeGen/MachineInstr.cpp
+++ b/lib/CodeGen/MachineInstr.cpp
@@ -262,8 +262,21 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
     return getBlockAddress() == Other.getBlockAddress() &&
            getOffset() == Other.getOffset();
   case MachineOperand::MO_RegisterMask:
-  case MachineOperand::MO_RegisterLiveOut:
-    return getRegMask() == Other.getRegMask();
+  case MachineOperand::MO_RegisterLiveOut: {
+    // Shallow compare of the two RegMasks
+    const uint32_t *RegMask = getRegMask();
+    const uint32_t *OtherRegMask = Other.getRegMask();
+    if (RegMask == OtherRegMask)
+      return true;
+
+    // Calculate the size of the RegMask
+    const MachineFunction *MF = getParent()->getParent()->getParent();
+    const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+    unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
+
+    // Deep compare of the two RegMasks
+    return std::equal(RegMask, RegMask + RegMaskSize, OtherRegMask);
+  }
   case MachineOperand::MO_MCSymbol:
     return getMCSymbol() == Other.getMCSymbol();
   case MachineOperand::MO_CFIIndex:
@@ -274,6 +287,8 @@ bool MachineOperand::isIdenticalTo(const MachineOperand &Other) const {
     return getIntrinsicID() == Other.getIntrinsicID();
   case MachineOperand::MO_Predicate:
     return getPredicate() == Other.getPredicate();
+  case MachineOperand::MO_Placeholder:
+    return true;
   }
   llvm_unreachable("Invalid machine operand type");
 }
@@ -322,6 +337,8 @@ hash_code llvm::hash_value(const MachineOperand &MO) {
     return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getIntrinsicID());
   case MachineOperand::MO_Predicate:
     return hash_combine(MO.getType(), MO.getTargetFlags(), MO.getPredicate());
+  case MachineOperand::MO_Placeholder:
+    return hash_combine();
   }
   llvm_unreachable("Invalid machine operand type");
 }
@@ -403,6 +420,11 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
       bool Unused;
       APF.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven, &Unused);
       OS << "half " << APF.convertToFloat();
+    } else if (getFPImm()->getType()->isFP128Ty()) {
+      const APFloat &APF = getFPImm()->getValueAPF(); // Bind, don't copy.
+      SmallString<16> Str;
+      APF.toString(Str);
+      OS << "quad " << Str;
     } else {
       OS << getFPImm()->getValueAPF().convertToDouble();
     }
@@ -491,7 +513,11 @@ void MachineOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
     auto Pred = static_cast<CmpInst::Predicate>(getPredicate());
     OS << '<' << (CmpInst::isIntPredicate(Pred) ? "intpred" : "floatpred")
        << CmpInst::getPredicateName(Pred) << '>';
+    break;
   }
+  case MachineOperand::MO_Placeholder:
+    OS << "<placeholder>";
+    break;
   }
   if (unsigned TF = getTargetFlags())
     OS << "[TF=" << TF << ']';
@@ -1571,6 +1597,65 @@ bool MachineInstr::isSafeToMove(AliasAnalysis *AA, bool &SawStore) const {
   return true;
 }
 
+bool MachineInstr::mayAlias(AliasAnalysis *AA, MachineInstr &Other,
+                            bool UseTBAA) {
+  const TargetInstrInfo *TII =
+      getParent()->getParent()->getSubtarget().getInstrInfo();
+
+  // Two accesses can only conflict if at least one of them may store; when
+  // neither does, they cannot alias in any meaningful way.
+  if (!(mayStore() || Other.mayStore()))
+    return false;
+
+  // Give the target the first chance to prove the accesses are disjoint.
+  if (TII->areMemAccessesTriviallyDisjoint(*this, Other, AA))
+    return false;
+
+  // Without alias analysis, or unless each instruction has exactly one memory
+  // operand, conservatively report a possible alias.
+  // FIXME: Need to handle multiple memory operands to support all targets.
+  if (!AA || !hasOneMemOperand() || !Other.hasOneMemOperand())
+    return true;
+
+  MachineMemOperand *ThisMMO = *memoperands_begin();
+  MachineMemOperand *OtherMMO = *Other.memoperands_begin();
+
+  // AA needs an IR value on both sides; without one, assume they may alias.
+  if (!(ThisMMO->getValue() && OtherMMO->getValue()))
+    return true;
+
+  // The AA query below is modelled on DAGCombiner::isAlias. It works with
+  // MachineMemOperand offsets and relies on these assumptions:
+  //   - LLVM fundamentally assumes flat address spaces.
+  //   - MachineOperand offset can *only* result from legalization and
+  //     cannot affect queries other than the trivial overlap check.
+  //   - These offsets never wrap and never step outside allocated objects.
+  //   - There should never be any negative offsets here.
+  //
+  // FIXME: Modify API to hide this math from "user"
+  // FIXME: Even before we go to AA we can reason locally about some
+  // memory objects. It can save compile time, and possibly catch some
+  // corner cases not currently covered.
+
+  const int64_t ThisOffset = ThisMMO->getOffset();
+  const int64_t OtherOffset = OtherMMO->getOffset();
+  assert(ThisOffset >= 0 && "Negative MachineMemOperand offset");
+  assert(OtherOffset >= 0 && "Negative MachineMemOperand offset");
+
+  // Measure both accesses from the smaller of the two offsets, so each
+  // location size reaches from that common base to the end of its access.
+  const int64_t Base = std::min(ThisOffset, OtherOffset);
+  const int64_t ThisExtent = ThisMMO->getSize() + ThisOffset - Base;
+  const int64_t OtherExtent = OtherMMO->getSize() + OtherOffset - Base;
+
+  // The AA metadata is only forwarded when the caller asked for TBAA.
+  const AAMDNodes ThisInfo = UseTBAA ? ThisMMO->getAAInfo() : AAMDNodes();
+  const AAMDNodes OtherInfo = UseTBAA ? OtherMMO->getAAInfo() : AAMDNodes();
+  return AA->alias(MemoryLocation(ThisMMO->getValue(), ThisExtent, ThisInfo),
+                   MemoryLocation(OtherMMO->getValue(), OtherExtent,
+                                  OtherInfo)) != NoAlias;
+}
+
 /// hasOrderedMemoryRef - Return true if this instruction may have an ordered
 /// or volatile memory reference, or if the information describing the memory
 /// reference is not available. Return false if it is known to have no ordered
@@ -1699,7 +1784,7 @@ LLVM_DUMP_METHOD void MachineInstr::dump() const {
 }
 #endif
 
-void MachineInstr::print(raw_ostream &OS, bool SkipOpers,
+void MachineInstr::print(raw_ostream &OS, bool SkipOpers, bool SkipDebugLoc,
                          const TargetInstrInfo *TII) const {
   const Module *M = nullptr;
   if (const MachineBasicBlock *MBB = getParent())
@@ -1707,11 +1792,12 @@ void MachineInstr::print(raw_ostream &OS, bool SkipOpers,
       M = MF->getFunction()->getParent();
 
   ModuleSlotTracker MST(M);
-  print(OS, MST, SkipOpers, TII);
+  print(OS, MST, SkipOpers, SkipDebugLoc, TII);
 }
 
 void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
-                         bool SkipOpers, const TargetInstrInfo *TII) const {
+                         bool SkipOpers, bool SkipDebugLoc,
+                         const TargetInstrInfo *TII) const {
   // We can be a bit tidier if we know the MachineFunction.
   const MachineFunction *MF = nullptr;
   const TargetRegisterInfo *TRI = nullptr;
@@ -1987,6 +2073,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
     }
     if (isIndirectDebugValue())
       OS << " indirect";
+  } else if (SkipDebugLoc) {
+    return;
   } else if (debugLoc && MF) {
     if (!HaveSemi)
       OS << ";";
diff --git a/lib/CodeGen/MachineModuleInfo.cpp b/lib/CodeGen/MachineModuleInfo.cpp
index 6618857477ed4763de3c701a6e5389bba20b1fcb..2f0f4297ef5c5c5b2ba3498e37be8705fa2e14d8 100644
--- a/lib/CodeGen/MachineModuleInfo.cpp
+++ b/lib/CodeGen/MachineModuleInfo.cpp
@@ -306,6 +306,10 @@ public:
     MMI.deleteMachineFunctionFor(F);
     return true;
   }
+  
+  StringRef getPassName() const override {
+    return StringRef("Free MachineFunction");
+  }
 };
 char FreeMachineFunction::ID;
 } // end anonymous namespace
diff --git a/lib/CodeGen/MachineModuleInfoImpls.cpp b/lib/CodeGen/MachineModuleInfoImpls.cpp
index 22d519e5d88fa38a95e99ac1780a0daa31971e79..4c81fd91cb829e9cf6bc5b16fafa5af5bd9b0b1e 100644
--- a/lib/CodeGen/MachineModuleInfoImpls.cpp
+++ b/lib/CodeGen/MachineModuleInfoImpls.cpp
@@ -23,6 +23,7 @@ using namespace llvm;
 // Out of line virtual method.
 void MachineModuleInfoMachO::anchor() {}
 void MachineModuleInfoELF::anchor() {}
+void MachineModuleInfoWasm::anchor() {}
 
 static int SortSymbolPair(const void *LHS, const void *RHS) {
   typedef std::pair<MCSymbol*, MachineModuleInfoImpl::StubValueTy> PairTy;
diff --git a/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp b/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
index 0fc286969ebc81bcc010765aac16eed46f2804ef..6b6b5f2814a9089848850ee5c2ae48aa5d7d2a2d 100644
--- a/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
+++ b/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
@@ -14,13 +14,23 @@
 ///===---------------------------------------------------------------------===//
 
 #include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
-#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/LLVMContext.h"
 
 using namespace llvm;
 
+DiagnosticInfoMIROptimization::MachineArgument::MachineArgument(
+    StringRef MKey, const MachineInstr &MI)
+    : Argument() {
+  // The value is MI printed with its operands but without its debug location.
+  raw_string_ostream Stream(Val);
+  MI.print(Stream, /*SkipOpers=*/false, /*SkipDebugLoc=*/true);
+  Key = MKey;
+}
+
 Optional<uint64_t>
 MachineOptimizationRemarkEmitter::computeHotness(const MachineBasicBlock &MBB) {
   if (!MBFI)
@@ -64,7 +74,7 @@ bool MachineOptimizationRemarkEmitterPass::runOnMachineFunction(
   MachineBlockFrequencyInfo *MBFI;
 
   if (MF.getFunction()->getContext().getDiagnosticHotnessRequested())
-    MBFI = &getAnalysis<MachineBlockFrequencyInfo>();
+    MBFI = &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI();
   else
     MBFI = nullptr;
 
@@ -74,7 +84,7 @@ bool MachineOptimizationRemarkEmitterPass::runOnMachineFunction(
 
 void MachineOptimizationRemarkEmitterPass::getAnalysisUsage(
     AnalysisUsage &AU) const {
-  AU.addRequired<MachineBlockFrequencyInfo>();
+  AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
   AU.setPreservesAll();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
@@ -85,6 +95,6 @@ static const char ore_name[] = "Machine Optimization Remark Emitter";
 
 INITIALIZE_PASS_BEGIN(MachineOptimizationRemarkEmitterPass, ORE_NAME, ore_name,
                       false, true)
-INITIALIZE_PASS_DEPENDENCY(MachineBlockFrequencyInfo)
+INITIALIZE_PASS_DEPENDENCY(LazyMachineBlockFrequencyInfoPass)
 INITIALIZE_PASS_END(MachineOptimizationRemarkEmitterPass, ORE_NAME, ore_name,
                     false, true)
diff --git a/lib/CodeGen/MachineOutliner.cpp b/lib/CodeGen/MachineOutliner.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..581a8ad811497847eb6b7a777518b8cda82cbbe6
--- /dev/null
+++ b/lib/CodeGen/MachineOutliner.cpp
@@ -0,0 +1,1251 @@
+//===---- MachineOutliner.cpp - Outline instructions -----------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Replaces repeated sequences of instructions with function calls.
+///
+/// This works by placing every instruction from every basic block in a
+/// suffix tree, and repeatedly querying that tree for repeated sequences of
+/// instructions. If a sequence of instructions appears often, then it ought
+/// to be beneficial to pull out into a function.
+///
+/// This was originally presented at the 2016 LLVM Developers' Meeting in the
+/// talk "Reducing Code Size Using Outlining". For a high-level overview of
+/// how this pass works, the talk is available on YouTube at
+///
+/// https://www.youtube.com/watch?v=yorld-WSOeU
+///
+/// The slides for the talk are available at
+///
+/// http://www.llvm.org/devmtg/2016-11/Slides/Paquette-Outliner.pdf
+///
+/// The talk provides an overview of how the outliner finds candidates and
+/// ultimately outlines them. It describes how the main data structure for this
+/// pass, the suffix tree, is queried and purged for candidates. It also gives
+/// a simplified suffix tree construction algorithm for suffix trees based off
+/// of the algorithm actually used here, Ukkonen's algorithm.
+///
+/// For the original RFC for this pass, please see
+///
+/// http://lists.llvm.org/pipermail/llvm-dev/2016-August/104170.html
+///
+/// For more information on the suffix tree data structure, please see
+/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
+///
+//===----------------------------------------------------------------------===//
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <functional>
+#include <map>
+#include <sstream>
+#include <tuple>
+#include <vector>
+
+#define DEBUG_TYPE "machine-outliner"
+
+using namespace llvm;
+
+STATISTIC(NumOutlined, "Number of candidates outlined");
+STATISTIC(FunctionsCreated, "Number of functions created");
+
+namespace {
+
+/// \brief An individual sequence of instructions to be replaced with a call to
+/// an outlined function.
+struct Candidate {
+  /// Set to false if the candidate overlapped with another candidate.
+  bool InCandidateList = true;
+
+  /// The start index of this \p Candidate.
+  size_t StartIdx = 0;
+
+  /// The number of instructions in this \p Candidate.
+  size_t Len = 0;
+
+  /// The index of this \p Candidate's \p OutlinedFunction in the list of
+  /// \p OutlinedFunctions.
+  size_t FunctionIdx = 0;
+
+  /// \brief The number of instructions that would be saved by outlining every
+  /// candidate of this type.
+  ///
+  /// This is a fixed value which is not updated during the candidate pruning
+  /// process. It is only used for deciding which candidate to keep if two
+  /// candidates overlap. The true benefit is stored in the OutlinedFunction
+  /// for some given candidate.
+  unsigned Benefit = 0;
+
+  Candidate(size_t StartIdx, size_t Len, size_t FunctionIdx)
+      : StartIdx(StartIdx), Len(Len), FunctionIdx(FunctionIdx) {}
+
+  /// Default-constructs a candidate with every index and length zeroed.
+  Candidate() = default;
+
+  /// \brief Orders \p Candidates by descending start index so that they are
+  /// outlined in an order that preserves the indices of other \p Candidates.
+  bool operator<(const Candidate &RHS) const { return StartIdx > RHS.StartIdx; }
+};
+
+/// \brief The information necessary to create an outlined function for some
+/// class of candidate.
+struct OutlinedFunction {
+  /// The MachineFunction created for this outlined sequence. Stays null
+  /// until the actual function has been built.
+  MachineFunction *MF = nullptr;
+
+  /// Numeric suffix that appears at the end of this function's name.
+  size_t Name;
+
+  /// How many candidates map to this OutlinedFunction.
+  size_t OccurrenceCount = 0;
+
+  /// \brief The integers standing for the instructions that make up this
+  /// function.
+  std::vector<unsigned> Sequence;
+
+  /// Number of instructions this function would save.
+  unsigned Benefit = 0;
+
+  /// \brief True when candidates for this function should be replaced with
+  /// tail calls to it.
+  bool IsTailCall = false;
+
+  /// Builds the description of one outlined function from its name suffix,
+  /// occurrence count, instruction sequence, benefit and tail-call flag.
+  OutlinedFunction(size_t Name, size_t OccurrenceCount,
+                   const std::vector<unsigned> &Sequence, unsigned Benefit,
+                   bool IsTailCall)
+      : Name(Name), OccurrenceCount(OccurrenceCount), Sequence(Sequence),
+        Benefit(Benefit), IsTailCall(IsTailCall) {}
+};
+
+/// Represents an undefined index in the suffix tree.
+const size_t EmptyIdx = static_cast<size_t>(-1);
+
+/// A node in a suffix tree which represents a substring or suffix.
+///
+/// Each node has either no children or at least two children, with the root
+/// being an exception in the empty tree.
+///
+/// Children are represented as a map between unsigned integers and nodes. If
+/// a node N has a child M on unsigned integer k, then the mapping represented
+/// by N is a proper prefix of the mapping represented by M. Note that this,
+/// although similar to a trie is somewhat different: each node stores a full
+/// substring of the full mapping rather than a single character state.
+///
+/// Each internal node contains a pointer to the internal node representing
+/// the same string, but with the first character chopped off. This is stored
+/// in \p Link. Each leaf node stores the start index of its respective
+/// suffix in \p SuffixIdx.
+struct SuffixTreeNode {
+  /// The children of this node.
+  ///
+  /// A child existing on an unsigned integer implies that from the mapping
+  /// represented by the current node, there is a way to reach another
+  /// mapping by tacking that character on the end of the current string.
+  DenseMap<unsigned, SuffixTreeNode *> Children;
+
+  /// A flag set to false if the node has been pruned from the tree.
+  bool IsInTree = true;
+
+  /// The start index of this node's substring in the main string.
+  size_t StartIdx = EmptyIdx;
+
+  /// The end index of this node's substring in the main string.
+  ///
+  /// Every leaf node must have its \p EndIdx incremented at the end of every
+  /// step in the construction algorithm. To avoid having to update O(N)
+  /// nodes individually at the end of every step, the end index is stored
+  /// as a pointer.
+  size_t *EndIdx = nullptr;
+
+  /// For leaves, the start index of the suffix represented by this node.
+  ///
+  /// For all other nodes, this is ignored.
+  size_t SuffixIdx = EmptyIdx;
+
+  /// \brief For internal nodes, a pointer to the internal node representing
+  /// the same sequence with the first character chopped off.
+  ///
+  /// This has two major purposes in the suffix tree. The first is as a
+  /// shortcut in Ukkonen's construction algorithm. One of the things that
+  /// Ukkonen's algorithm does to achieve linear-time construction is
+  /// keep track of which node the next insert should be at. This makes each
+  /// insert O(1), and there are a total of O(N) inserts. The suffix link
+  /// helps with inserting children of internal nodes.
+  ///
+  /// Say we add a child to an internal node with associated mapping S. The
+  /// next insertion must be at the node representing S - its first character.
+  /// This is given by the way that we iteratively build the tree in Ukkonen's
+  /// algorithm. The main idea is to look at the suffixes of each prefix in the
+  /// string, starting with the longest suffix of the prefix, and ending with
+  /// the shortest. Therefore, if we keep pointers between such nodes, we can
+  /// move to the next insertion point in O(1) time. If we don't, then we'd
+  /// have to query from the root, which takes O(N) time. This would make the
+  /// construction algorithm O(N^2) rather than O(N).
+  ///
+  /// The suffix link is also used during the tree pruning process to let us
+  /// quickly throw out a bunch of potential overlaps. Say we have a sequence
+  /// S we want to outline. Then each of its suffixes contribute to at least
+  /// one overlapping case. Therefore, we can follow the suffix links
+  /// starting at the node associated with S to the root and "delete" those
+  /// nodes, save for the root. For each candidate, this removes
+  /// O(|candidate|) overlaps from the search space. We don't actually
+  /// completely invalidate these nodes though; doing that is far too
+  /// aggressive. Consider the following pathological string:
+  ///
+  /// 1 2 3 1 2 3 2 3 2 3 2 3 2 3 2 3 2 3
+  ///
+  /// If we, for the sake of example, outlined 1 2 3, then we would throw
+  /// out all instances of 2 3. This isn't desirable. To get around this,
+  /// when we visit a link node, we decrement its occurrence count by the
+  /// number of sequences we outlined in the current step. In the pathological
+  /// example, the 2 3 node would have an occurrence count of 8, while the
+  /// 1 2 3 node would have an occurrence count of 2. Thus, the 2 3 node
+  /// would survive to the next round allowing us to outline the extra
+  /// instances of 2 3.
+  SuffixTreeNode *Link = nullptr;
+
+  /// The parent of this node. Every node except for the root has a parent.
+  SuffixTreeNode *Parent = nullptr;
+
+  /// The number of times this node's string appears in the tree.
+  ///
+  /// This is equal to the number of leaf children of this node. It represents
+  /// the number of suffixes that the node's string is a prefix of.
+  size_t OccurrenceCount = 0;
+
+  /// The length of the string formed by concatenating the edge labels from the
+  /// root to this node.
+  size_t ConcatLen = 0;
+
+  /// Returns true if this node is a leaf, i.e. it has a suffix index.
+  bool isLeaf() const { return SuffixIdx != EmptyIdx; }
+
+  /// Returns true if this node is the root of its owning \p SuffixTree.
+  bool isRoot() const { return StartIdx == EmptyIdx; }
+
+  /// Return the number of elements in the substring associated with this node.
+  size_t size() const {
+    // Is it the root? If so, it's the empty string so return 0.
+    if (isRoot())
+      return 0;
+
+    assert(*EndIdx != EmptyIdx && "EndIdx is undefined!");
+
+    // Size = the number of elements in the string.
+    // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1.
+    return *EndIdx - StartIdx + 1;
+  }
+
+  /// Creates a node spanning [StartIdx, *EndIdx] with the given link and parent.
+  SuffixTreeNode(size_t StartIdx, size_t *EndIdx, SuffixTreeNode *Link,
+                 SuffixTreeNode *Parent)
+      : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link), Parent(Parent) {}
+
+  /// Creates a node with every field at its default, so isRoot() is true.
+  SuffixTreeNode() {}
+};
+
+/// A data structure for fast substring queries.
+///
+/// Suffix trees represent the suffixes of their input strings in their leaves.
+/// A suffix tree is a type of compressed trie structure where each node
+/// represents an entire substring rather than a single character. Each leaf
+/// of the tree is a suffix.
+///
+/// A suffix tree can be seen as a type of state machine where each state is a
+/// substring of the full string. The tree is structured so that, for a string
+/// of length N, there are exactly N leaves in the tree. This structure allows
+/// us to quickly find repeated substrings of the input string.
+///
+/// In this implementation, a "string" is a vector of unsigned integers.
+/// These integers may result from hashing some data type. A suffix tree can
+/// contain 1 or many strings, which can then be queried as one large string.
+///
+/// The suffix tree is implemented using Ukkonen's algorithm for linear-time
+/// suffix tree construction. Ukkonen's algorithm is explained in more detail
+/// in the paper by Esko Ukkonen "On-line construction of suffix trees". The
+/// paper is available at
+///
+/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
+class SuffixTree {
+private:
+  /// Each element is an integer representing an instruction in the module.
+  ArrayRef<unsigned> Str;
+
+  /// Maintains each node in the tree.
+  SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator;
+
+  /// The root of the suffix tree.
+  ///
+  /// The root represents the empty string. It is maintained by the
+  /// \p NodeAllocator like every other node in the tree.
+  SuffixTreeNode *Root = nullptr;
+
+  /// Stores each leaf node in the tree.
+  ///
+  /// This is used for finding outlining candidates.
+  std::vector<SuffixTreeNode *> LeafVector;
+
+  /// Maintains the end indices of the internal nodes in the tree.
+  ///
+  /// Each internal node is guaranteed to never have its end index change
+  /// during the construction algorithm; however, leaves must be updated at
+  /// every step. Therefore, we need to store leaf end indices by reference
+  /// to avoid updating O(N) leaves at every step of construction. Thus,
+  /// every internal node must be allocated its own end index.
+  BumpPtrAllocator InternalEndIdxAllocator;
+
+  /// The end index of each leaf in the tree.
+  size_t LeafEndIdx = -1;
+
+  /// \brief Helper struct which keeps track of the next insertion point in
+  /// Ukkonen's algorithm.
+  struct ActiveState {
+    /// The next node to insert at.
+    SuffixTreeNode *Node;
+
+    /// The index of the first character in the substring currently being added.
+    size_t Idx = EmptyIdx;
+
+    /// The length of the substring we have to add at the current step.
+    size_t Len = 0;
+  };
+
+  /// \brief The point the next insertion will take place at in the
+  /// construction algorithm.
+  ActiveState Active;
+
+  /// Allocate a leaf node and add it to the tree.
+  ///
+  /// \param Parent The parent of this node.
+  /// \param StartIdx The start index of this node's associated string.
+  /// \param Edge The label on the edge leaving \p Parent to this node.
+  ///
+  /// \returns A pointer to the allocated leaf node.
+  SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, size_t StartIdx,
+                             unsigned Edge) {
+    assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
+
+    // A leaf does not own an end index: it points at the shared LeafEndIdx.
+    // It is created without a suffix link.
+    void *Storage = NodeAllocator.Allocate();
+    SuffixTreeNode *Leaf =
+        new (Storage) SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr, &Parent);
+
+    Parent.Children[Edge] = Leaf;
+    return Leaf;
+  }
+
+  /// Allocate an internal node and add it to the tree.
+  ///
+  /// \param Parent The parent of this node. Only null when allocating the root.
+  /// \param StartIdx The start index of this node's associated string.
+  /// \param EndIdx The end index of this node's associated string.
+  /// \param Edge The label on the edge leaving \p Parent to this node.
+  ///
+  /// \returns A pointer to the allocated internal node.
+  SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, size_t StartIdx,
+                                     size_t EndIdx, unsigned Edge) {
+    assert(StartIdx <= EndIdx && "String can't start after it ends!");
+    assert((Parent || StartIdx == EmptyIdx) &&
+           "Non-root internal nodes must have parents!");
+
+    // Each internal node gets its own end index from InternalEndIdxAllocator.
+    size_t *OwnedEnd = new (InternalEndIdxAllocator) size_t(EndIdx);
+    // The suffix link starts out at Root (still null while Root is built).
+    void *Storage = NodeAllocator.Allocate();
+    SuffixTreeNode *Internal =
+        new (Storage) SuffixTreeNode(StartIdx, OwnedEnd, Root, Parent);
+
+    if (Parent)
+      Parent->Children[Edge] = Internal;
+    return Internal;
+  }
+
+  /// \brief Set the suffix indices of the leaves to the start indices of their
+  /// respective suffixes. Also stores each leaf in \p LeafVector at its
+  /// respective suffix index.
+  ///
+  /// \param[in] CurrNode The node currently being visited.
+  /// \param CurrIdx The current index of the string being visited.
+  void setSuffixIndices(SuffixTreeNode &CurrNode, size_t CurrIdx) {
+    const bool AtRoot = CurrNode.isRoot();
+    const bool IsLeaf = !AtRoot && CurrNode.Children.size() == 0;
+
+    // Accumulate the label length along the path from the root: a node's own
+    // label length (unless already set) plus the total stored in its parent.
+    if (!AtRoot) {
+      size_t &PathLen = CurrNode.ConcatLen;
+      if (PathLen == 0)
+        PathLen = CurrNode.size();
+      if (SuffixTreeNode *Up = CurrNode.Parent)
+        PathLen += Up->ConcatLen;
+    }
+
+    // Depth-first walk; each child sees the index advanced by its own label.
+    for (auto &Entry : CurrNode.Children) {
+      SuffixTreeNode *Child = Entry.second;
+      assert(Child && "Node had a null child!");
+      setSuffixIndices(*Child, CurrIdx + Child->size());
+    }
+
+    if (!IsLeaf)
+      return;
+
+    // A leaf: give it a suffix index and bump its parent's occurrence count.
+    CurrNode.SuffixIdx = Str.size() - CurrIdx;
+    assert(CurrNode.Parent && "CurrNode had no parent!");
+    CurrNode.Parent->OccurrenceCount++;
+
+    // Remember the leaf at its suffix index so it can be pruned later.
+    LeafVector[CurrNode.SuffixIdx] = &CurrNode;
+  }
+
+  /// \brief Construct the suffix tree for the prefix of the input ending at
+  /// \p EndIdx.
+  ///
+  /// Used to construct the full suffix tree iteratively. At the end of each
+  /// step, the constructed suffix tree is either a valid suffix tree, or a
+  /// suffix tree with implicit suffixes. At the end of the final step, the
+  /// suffix tree is a valid tree.
+  ///
+  /// \param EndIdx The end index of the current prefix in the main string.
+  /// \param SuffixesToAdd The number of suffixes that must be added
+  /// to complete the suffix tree at the current phase.
+  ///
+  /// \returns The number of suffixes that have not been added at the end of
+  /// this step.
+  unsigned extend(size_t EndIdx, size_t SuffixesToAdd) {
+    SuffixTreeNode *NeedsLink = nullptr; // Last split node awaiting a link.
+
+    while (SuffixesToAdd > 0) {
+      // Each pass inserts one suffix, walks down one edge, or stops early.
+      // Are we waiting to add anything other than just the last character?
+      if (Active.Len == 0) {
+        // If not, then say the active index is the end index.
+        Active.Idx = EndIdx;
+      }
+
+      assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
+
+      // The first character in the current substring we're looking at.
+      unsigned FirstChar = Str[Active.Idx];
+
+      // Have we inserted anything starting with FirstChar at the current node?
+      if (Active.Node->Children.count(FirstChar) == 0) {
+        // If not, then we can just insert a leaf and move to the next step.
+        insertLeaf(*Active.Node, EndIdx, FirstChar);
+
+        // The active node is an internal node, and we visited it, so it must
+        // need a link if it doesn't have one.
+        if (NeedsLink) {
+          NeedsLink->Link = Active.Node;
+          NeedsLink = nullptr;
+        }
+      } else {
+        // There's a match with FirstChar, so look for the point in the tree to
+        // insert a new node.
+        SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
+
+        size_t SubstringLen = NextNode->size();
+
+        // Is the current suffix we're trying to insert longer than the size of
+        // the child we want to move to?
+        if (Active.Len >= SubstringLen) {
+          // If yes, then consume the characters we've seen and move to the next
+          // node.
+          Active.Idx += SubstringLen;
+          Active.Len -= SubstringLen;
+          Active.Node = NextNode;
+          continue;
+        }
+
+        // Otherwise, the suffix we're trying to insert must be contained in the
+        // next node we want to move to.
+        unsigned LastChar = Str[EndIdx];
+
+        // Is the string we're trying to insert a substring of the next node?
+        if (Str[NextNode->StartIdx + Active.Len] == LastChar) {
+          // If yes, then we're done for this step. Remember our insertion point
+          // and move to the next end index. At this point, we have an implicit
+          // suffix tree.
+          if (NeedsLink && !Active.Node->isRoot()) {
+            NeedsLink->Link = Active.Node;
+            NeedsLink = nullptr;
+          }
+
+          Active.Len++;
+          break;
+        }
+
+        // The string we're trying to insert isn't a substring of the next node,
+        // but matches up to a point. Split the node.
+        //
+        // For example, say we ended our search at a node n and we're trying to
+        // insert ABD. Then we'll create a new node s for AB, reduce n to just
+        // representing C, and insert a new leaf node l to represent D. This
+        // allows us to ensure that if n was a leaf, it remains a leaf.
+        //
+        //   | ABC  ---split--->  | AB
+        //   n                    s
+        //                     C / \ D
+        //                      n   l
+
+        // The node s from the diagram
+        SuffixTreeNode *SplitNode =
+            insertInternalNode(Active.Node,
+                               NextNode->StartIdx,
+                               NextNode->StartIdx + Active.Len - 1,
+                               FirstChar);
+
+        // Insert the new node representing the new substring into the tree as
+        // a child of the split node. This is the node l from the diagram.
+        insertLeaf(*SplitNode, EndIdx, LastChar);
+
+        // Make the old node a child of the split node and update its start
+        // index. This is the node n from the diagram.
+        NextNode->StartIdx += Active.Len;
+        NextNode->Parent = SplitNode;
+        SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
+
+        // SplitNode is an internal node, update the suffix link.
+        if (NeedsLink)
+          NeedsLink->Link = SplitNode;
+
+        NeedsLink = SplitNode;
+      }
+
+      // We've added something new to the tree, so there's one less suffix to
+      // add.
+      SuffixesToAdd--;
+
+      if (Active.Node->isRoot()) { // The root has no link to follow.
+        if (Active.Len > 0) {
+          Active.Len--;
+          Active.Idx = EndIdx - SuffixesToAdd + 1;
+        }
+      } else {
+        // Start the next phase at the next smallest suffix.
+        Active.Node = Active.Node->Link;
+      }
+    }
+    // NOTE(review): the size_t count is narrowed to the unsigned return type.
+    return SuffixesToAdd;
+  }
+
+public:
+
+  /// Find all repeated substrings that satisfy \p BenefitFn.
+  ///
+  /// If a substring appears at least twice, then it must be represented by
+  /// an internal node which appears in at least two suffixes. Each suffix is
+  /// represented by a leaf node. To do this, we visit each internal node in
+  /// the tree, using the leaf children of each internal node. If an internal
+  /// node represents a beneficial substring, then we use each of its leaf
+  /// children to find the locations of its substring.
+  ///
+  /// \param[out] CandidateList Filled with candidates representing each
+  /// beneficial substring.
+  /// \param[out] FunctionList Filled with one \p OutlinedFunction for each
+  /// type of candidate.
+  /// \param BenefitFn The function to satisfy.
+  ///
+  /// \returns The length of the longest candidate found.
+  size_t findCandidates(
+      std::vector<Candidate> &CandidateList,
+      std::vector<OutlinedFunction> &FunctionList,
+      const std::function<unsigned(SuffixTreeNode &, size_t, unsigned)>
+          &BenefitFn) {
+    CandidateList.clear();
+    FunctionList.clear();
+    size_t FnIdx = 0;
+    size_t MaxLen = 0;
+
+    for (SuffixTreeNode *Leaf : LeafVector) {
+      assert(Leaf && "Leaves in LeafVector cannot be null!");
+      if (!Leaf->IsInTree)
+        continue;
+
+      assert(Leaf->Parent && "All leaves must have parents!");
+      SuffixTreeNode &Parent = *(Leaf->Parent);
+
+      // If it doesn't appear enough, or we already outlined from it, skip it.
+      if (Parent.OccurrenceCount < 2 || Parent.isRoot() || !Parent.IsInTree)
+        continue;
+
+      // The repeated string is the leaf's path minus the leaf's own label.
+      size_t StringLen = Leaf->ConcatLen - Leaf->size();
+
+      // How many instructions would outlining this string save?
+      unsigned Benefit =
+          BenefitFn(Parent, StringLen, Str[Leaf->SuffixIdx + StringLen - 1]);
+
+      // If it's not beneficial, skip it.
+      if (Benefit < 1)
+        continue;
+
+      if (StringLen > MaxLen)
+        MaxLen = StringLen;
+
+      // Each in-tree leaf child of Parent marks one occurrence of the string.
+      // Taking it out of the tree keeps it from being reported again.
+      size_t OccurrenceCount = 0;
+      for (auto &ChildPair : Parent.Children) {
+        SuffixTreeNode *M = ChildPair.second;
+        if (M && M->IsInTree && M->isLeaf()) {
+          OccurrenceCount++;
+          CandidateList.emplace_back(M->SuffixIdx, StringLen, FnIdx);
+          CandidateList.back().Benefit = Benefit;
+          M->IsInTree = false;
+        }
+      }
+
+      // Save the function for the new candidate sequence. The range uses
+      // size_t offsets so a large suffix index is never narrowed to unsigned.
+      const unsigned *SeqBegin = Str.begin() + Leaf->SuffixIdx;
+      std::vector<unsigned> CandidateSequence(SeqBegin, SeqBegin + StringLen);
+      FunctionList.emplace_back(FnIdx, OccurrenceCount, CandidateSequence,
+                                Benefit, false);
+
+      // Move to the next function and retire Parent so it is not reused.
+      FnIdx++;
+      Parent.IsInTree = false;
+    }
+
+    return MaxLen;
+  }
+ 
+  /// Construct a suffix tree from a sequence of unsigned integers.
+  ///
+  /// The tree is built incrementally by calling \p extend on each prefix.
+  /// \param Str The string to construct the suffix tree for.
+  SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
+    Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
+    assert(Root && "Root node can't be nullptr!");
+    Root->IsInTree = true;
+    Active.Node = Root;
+    LeafVector = std::vector<SuffixTreeNode*>(Str.size());
+
+    // Keep track of the number of suffixes we have to add of the current
+    // prefix.
+    size_t SuffixesToAdd = 0;
+
+    // Construct the suffix tree iteratively on each prefix of the string.
+    // PfxEndIdx is the end index of the current prefix.
+    // End is one past the last element in the string.
+    for (size_t PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
+      SuffixesToAdd++;
+      LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
+      SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
+    }
+
+    // Set the suffix indices of each leaf.
+    setSuffixIndices(*Root, 0);
+  }
+};
+
+/// \brief Maps \p MachineInstrs to unsigned integers and stores the mappings.
+struct InstructionMapper {
+
+  /// \brief The next available integer to assign to a \p MachineInstr that
+  /// cannot be outlined.
+  ///
+  /// Set to -3 for compatibility with \p DenseMapInfo<unsigned>.
+  unsigned IllegalInstrNumber = -3;
+
+  /// \brief The next available integer to assign to a \p MachineInstr that can
+  /// be outlined.
+  unsigned LegalInstrNumber = 0;
+
+  /// Correspondence from \p MachineInstrs to unsigned integers.
+  DenseMap<MachineInstr *, unsigned, MachineInstrExpressionTrait>
+      InstructionIntegerMap;
+
+  /// Correspondence from unsigned integers to \p MachineInstrs.
+  /// Inverse of \p InstructionIntegerMap.
+  DenseMap<unsigned, MachineInstr *> IntegerInstructionMap;
+
+  /// The vector of unsigned integers that the module is mapped to.
+  std::vector<unsigned> UnsignedVec;
+
+  /// \brief Stores the location of the instruction associated with the integer
+  /// at index i in \p UnsignedVec for each index i.
+  std::vector<MachineBasicBlock::iterator> InstrList;
+
+  /// \brief Maps \p *It to a legal integer.
+  ///
+  /// Updates \p InstrList, \p UnsignedVec, \p InstructionIntegerMap,
+  /// \p IntegerInstructionMap, and \p LegalInstrNumber.
+  ///
+  /// \returns The integer that \p *It was mapped to.
+  unsigned mapToLegalUnsigned(MachineBasicBlock::iterator &It) {
+
+    // Get the integer for this instruction or give it the current
+    // LegalInstrNumber.
+    InstrList.push_back(It);
+    MachineInstr &MI = *It;
+    bool WasInserted;
+    DenseMap<MachineInstr *, unsigned, MachineInstrExpressionTrait>::iterator
+        ResultIt;
+    std::tie(ResultIt, WasInserted) =
+        InstructionIntegerMap.insert(std::make_pair(&MI, LegalInstrNumber));
+    unsigned MINumber = ResultIt->second;
+
+    // There was an insertion.
+    if (WasInserted) {
+      LegalInstrNumber++;
+      IntegerInstructionMap.insert(std::make_pair(MINumber, &MI));
+    }
+
+    UnsignedVec.push_back(MINumber);
+
+    // Make sure we don't overflow or use any integers reserved by the DenseMap.
+    if (LegalInstrNumber >= IllegalInstrNumber)
+      report_fatal_error("Instruction mapping overflow!");
+
+    assert(LegalInstrNumber != DenseMapInfo<unsigned>::getEmptyKey()
+          && "Tried to assign DenseMap tombstone or empty key to instruction.");
+    assert(LegalInstrNumber != DenseMapInfo<unsigned>::getTombstoneKey()
+          && "Tried to assign DenseMap tombstone or empty key to instruction.");
+
+    return MINumber;
+  }
+
+  /// Maps \p *It to an illegal integer.
+  ///
+  /// Updates \p InstrList, \p UnsignedVec, and \p IllegalInstrNumber.
+  ///
+  /// \returns The integer that \p *It was mapped to.
+  unsigned mapToIllegalUnsigned(MachineBasicBlock::iterator &It) {
+    unsigned MINumber = IllegalInstrNumber;
+
+    InstrList.push_back(It);
+    UnsignedVec.push_back(IllegalInstrNumber);
+    IllegalInstrNumber--;
+
+    assert(LegalInstrNumber < IllegalInstrNumber &&
+           "Instruction mapping overflow!");
+
+    assert(IllegalInstrNumber !=
+      DenseMapInfo<unsigned>::getEmptyKey() &&
+      "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
+
+    assert(IllegalInstrNumber !=
+      DenseMapInfo<unsigned>::getTombstoneKey() &&
+      "IllegalInstrNumber cannot be DenseMap tombstone or empty key!");
+
+    return MINumber;
+  }
+
+  /// \brief Transforms a \p MachineBasicBlock into a \p vector of \p unsigneds
+  /// and appends it to \p UnsignedVec and \p InstrList.
+  ///
+  /// Two instructions are assigned the same integer if they are identical.
+  /// If an instruction is deemed unsafe to outline, then it will be assigned an
+  /// unique integer. The resulting mapping is placed into a suffix tree and
+  /// queried for candidates.
+  ///
+  /// \param MBB The \p MachineBasicBlock to be translated into integers.
+  /// \param TRI \p TargetRegisterInfo for the module (currently unused here).
+  /// \param TII \p TargetInstrInfo for the module.
+  void convertToUnsignedVec(MachineBasicBlock &MBB,
+                            const TargetRegisterInfo &TRI,
+                            const TargetInstrInfo &TII) {
+    for (MachineBasicBlock::iterator It = MBB.begin(), Et = MBB.end(); It != Et;
+         It++) {
+
+      // Keep track of where this instruction is in the module.
+      switch(TII.getOutliningType(*It)) {
+        case TargetInstrInfo::MachineOutlinerInstrType::Illegal:
+          mapToIllegalUnsigned(It);
+          break;
+
+        case TargetInstrInfo::MachineOutlinerInstrType::Legal:
+          mapToLegalUnsigned(It);
+          break;
+
+        case TargetInstrInfo::MachineOutlinerInstrType::Invisible:
+          break;
+      }
+    }
+
+    // After we're done every insertion, uniquely terminate this part of the
+    // "string". This makes sure we won't match across basic block or function
+    // boundaries since the "end" is encoded uniquely and thus appears in no
+    // repeated substring. Reuse mapToIllegalUnsigned so the terminator gets the
+    // same overflow and reserved-key checks as any other illegal instruction.
+    MachineBasicBlock::iterator EndIt = MBB.end();
+    mapToIllegalUnsigned(EndIt);
+  }
+
+  InstructionMapper() {
+    // Make sure that the implementation of DenseMapInfo<unsigned> hasn't
+    // changed.
+    assert(DenseMapInfo<unsigned>::getEmptyKey() == (unsigned)-1 &&
+                "DenseMapInfo<unsigned>'s empty key isn't -1!");
+    assert(DenseMapInfo<unsigned>::getTombstoneKey() == (unsigned)-2 &&
+                "DenseMapInfo<unsigned>'s tombstone key isn't -2!");
+  }
+};
+
+/// \brief An interprocedural pass which finds repeated sequences of
+/// instructions and replaces them with calls to functions.
+///
+/// Each instruction is mapped to an unsigned integer and placed in a string.
+/// The resulting mapping is then placed in a \p SuffixTree. The \p SuffixTree
+/// is then repeatedly queried for repeated sequences of instructions. Each
+/// non-overlapping repeated sequence is then placed in its own
+/// \p MachineFunction and each instance is then replaced with a call to that
+/// function.
+struct MachineOutliner : public ModulePass {
+
+  static char ID;
+
+  StringRef getPassName() const override { return "Machine Outliner"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<MachineModuleInfo>();
+    AU.addPreserved<MachineModuleInfo>();
+    AU.setPreservesAll();
+    ModulePass::getAnalysisUsage(AU);
+  }
+
+  MachineOutliner() : ModulePass(ID) {
+    initializeMachineOutlinerPass(*PassRegistry::getPassRegistry());
+  }
+
+  /// \brief Replace the sequences of instructions represented by the
+  /// \p Candidates in \p CandidateList with calls to \p MachineFunctions
+  /// described in \p FunctionList.
+  ///
+  /// \param M The module we are outlining from.
+  /// \param CandidateList A list of candidates to be outlined.
+  /// \param FunctionList A list of functions to be inserted into the module.
+  /// \param Mapper Contains the instruction mappings for the module.
+  bool outline(Module &M, const ArrayRef<Candidate> &CandidateList,
+               std::vector<OutlinedFunction> &FunctionList,
+               InstructionMapper &Mapper);
+
+  /// Creates a function for \p OF and inserts it into the module.
+  MachineFunction *createOutlinedFunction(Module &M, const OutlinedFunction &OF,
+                                          InstructionMapper &Mapper);
+
+  /// Find potential outlining candidates and store them in \p CandidateList.
+  ///
+  /// For each type of potential candidate, also build an \p OutlinedFunction
+  /// struct containing the information to build the function for that
+  /// candidate.
+  ///
+  /// \param[out] CandidateList Filled with outlining candidates for the module.
+  /// \param[out] FunctionList Filled with functions corresponding to each type
+  /// of \p Candidate.
+  /// \param ST The suffix tree for the module.
+  /// \param Mapper Contains the instruction mappings for the module.
+  /// \param TII TargetInstrInfo for the module.
+  /// \returns The length of the longest candidate found. 0 if there are none.
+  unsigned buildCandidateList(std::vector<Candidate> &CandidateList,
+                              std::vector<OutlinedFunction> &FunctionList,
+                              SuffixTree &ST,
+                              InstructionMapper &Mapper,
+                              const TargetInstrInfo &TII);
+
+  /// \brief Remove any overlapping candidates that weren't handled by the
+  /// suffix tree's pruning method.
+  ///
+  /// Pruning from the suffix tree doesn't necessarily remove all overlaps.
+  /// If a short candidate is chosen for outlining, then a longer candidate
+  /// which has that short candidate as a suffix is chosen, the tree's pruning
+  /// method will not find it. Thus, we need to prune before outlining as well.
+  ///
+  /// \param[in,out] CandidateList A list of outlining candidates.
+  /// \param[in,out] FunctionList A list of functions to be outlined.
+  /// \param MaxCandidateLen The length of the longest candidate.
+  /// \param TII TargetInstrInfo for the module.
+  void pruneOverlaps(std::vector<Candidate> &CandidateList,
+                     std::vector<OutlinedFunction> &FunctionList,
+                     unsigned MaxCandidateLen,
+                     const TargetInstrInfo &TII);
+
+  /// Construct a suffix tree on the instructions in \p M and outline repeated
+  /// strings from that tree.
+  bool runOnModule(Module &M) override;
+};
+
+} // Anonymous namespace.
+
+char MachineOutliner::ID = 0; // Handed to the ModulePass constructor.
+
+namespace llvm {
+ModulePass *createMachineOutlinerPass() { return new MachineOutliner(); }
+} // End namespace llvm.
+
+INITIALIZE_PASS(MachineOutliner, "machine-outliner",
+                "Machine Function Outliner", false, false)
+
+void MachineOutliner::pruneOverlaps(std::vector<Candidate> &CandidateList,
+                                    std::vector<OutlinedFunction> &FunctionList,
+                                    unsigned MaxCandidateLen,
+                                    const TargetInstrInfo &TII) {
+  // TODO: Experiment with interval trees or other interval-checking structures
+  // to lower the time complexity of this function.
+  // TODO: Can we do better than the simple greedy choice?
+  // Check for overlaps in the range.
+  // This is O(MaxCandidateLen * CandidateList.size()).
+  for (auto It = CandidateList.begin(), Et = CandidateList.end(); It != Et;
+       It++) {
+    Candidate &C1 = *It;
+    OutlinedFunction &F1 = FunctionList[C1.FunctionIdx];
+
+    // If we removed this candidate, skip it.
+    if (!C1.InCandidateList)
+      continue;
+
+    // If C1's function is no longer worth outlining, drop C1 and move on.
+    if (F1.Benefit < 1 || F1.OccurrenceCount < 2) {
+      assert(F1.OccurrenceCount > 0 &&
+               "Can't remove OutlinedFunction with no occurrences!");
+      F1.OccurrenceCount--;
+      C1.InCandidateList = false;
+      continue;
+    }
+
+    // The minimum start index of any candidate that could overlap with this
+    // one.
+    unsigned FarthestPossibleIdx = 0;
+
+    // Either the index is 0, or it's at most MaxCandidateLen indices away.
+    if (C1.StartIdx > MaxCandidateLen)
+      FarthestPossibleIdx = C1.StartIdx - MaxCandidateLen;
+
+    // Compare against the candidates later in the list that start no lower
+    // than FarthestPossibleIdx, i.e. close enough to C1 to overlap it. There
+    // are at most MaxCandidateLen of these.
+    for (auto Sit = It + 1; Sit != Et; Sit++) {
+      Candidate &C2 = *Sit;
+      OutlinedFunction &F2 = FunctionList[C2.FunctionIdx];
+
+      // Is this candidate too far away to overlap?
+      if (C2.StartIdx < FarthestPossibleIdx)
+        break;
+
+      // Did we already remove this candidate in a previous step?
+      if (!C2.InCandidateList)
+        continue;
+
+      // Is the function beneficial to outline?
+      if (F2.OccurrenceCount < 2 || F2.Benefit < 1) {
+        // If not, remove this candidate and move to the next one.
+        assert(F2.OccurrenceCount > 0 &&
+               "Can't remove OutlinedFunction with no occurrences!");
+        F2.OccurrenceCount--;
+        C2.InCandidateList = false;
+        continue;
+      }
+
+      size_t C2End = C2.StartIdx + C2.Len - 1;
+
+      // Do C1 and C2 overlap?
+      //
+      // Not overlapping:
+      // High indices... [C1End ... C1Start][C2End ... C2Start] ...Low indices
+      //
+      // We sorted our candidate list so C2Start <= C1Start. We know that
+      // C2End > C2Start since each candidate has length >= 2. Therefore, all we
+      // have to check is C2End < C1Start to see if they are disjoint.
+      if (C2End < C1.StartIdx)
+        continue;
+
+      // C1 and C2 overlap.
+      // We need to choose the better of the two.
+      //
+      // Approximate this by picking the one which would have saved us the
+      // most instructions before any pruning.
+      if (C1.Benefit >= C2.Benefit) {
+
+        // C1 is better, so remove C2 and update C2's OutlinedFunction to
+        // reflect the removal.
+        assert(F2.OccurrenceCount > 0 &&
+               "Can't remove OutlinedFunction with no occurrences!");
+        F2.OccurrenceCount--;
+        F2.Benefit = TII.getOutliningBenefit(F2.Sequence.size(),
+                                             F2.OccurrenceCount,
+                                             F2.IsTailCall
+                                             );
+
+        C2.InCandidateList = false;
+
+        DEBUG (
+          dbgs() << "- Removed C2. \n";
+          dbgs() << "--- Num fns left for C2: " << F2.OccurrenceCount << "\n";
+          dbgs() << "--- C2's benefit: " << F2.Benefit << "\n";
+        );
+
+      } else {
+        // C2 is better, so remove C1 and update C1's OutlinedFunction to
+        // reflect the removal.
+        assert(F1.OccurrenceCount > 0 &&
+               "Can't remove OutlinedFunction with no occurrences!");
+        F1.OccurrenceCount--;
+        F1.Benefit = TII.getOutliningBenefit(F1.Sequence.size(),
+                                             F1.OccurrenceCount,
+                                             F1.IsTailCall
+                                             );
+        C1.InCandidateList = false;
+
+        DEBUG (
+          dbgs() << "- Removed C1. \n";
+          dbgs() << "--- Num fns left for C1: " << F1.OccurrenceCount << "\n";
+          dbgs() << "--- C1's benefit: " << F1.Benefit << "\n";
+        );
+
+        // C1 is out, so we don't have to compare it against anyone else.
+        break;
+      }
+    }
+  }
+}
+
+unsigned
+MachineOutliner::buildCandidateList(std::vector<Candidate> &CandidateList,
+                                    std::vector<OutlinedFunction> &FunctionList,
+                                    SuffixTree &ST,
+                                    InstructionMapper &Mapper,
+                                    const TargetInstrInfo &TII) {
+
+  size_t MaxCandidateLen = 0; // Length of the longest candidate.
+
+  // Function for maximizing query in the suffix tree.
+  // This allows us to define more fine-grained types of things to outline in
+  // the target without putting target-specific info in the suffix tree.
+  auto BenefitFn = [&TII, &Mapper](const SuffixTreeNode &Curr,
+                                   size_t StringLen, unsigned EndVal) {
+
+    // The root represents the empty string.
+    if (Curr.isRoot())
+      return 0u;
+
+    // Is this long enough to outline?
+    // TODO: Let the target decide how "long" a string is in terms of the sizes
+    // of the instructions in the string. For example, if a call instruction
+    // is smaller than a one instruction string, we should outline that string.
+    if (StringLen < 2)
+      return 0u;
+
+    size_t Occurrences = Curr.OccurrenceCount;
+
+    // Anything we want to outline has to appear at least twice.
+    if (Occurrences < 2)
+      return 0u;
+
+    // Check if the last instruction in the sequence is a return. Use lookup()
+    MachineInstr *LastInstr = // so that a missing key is not inserted as null.
+        Mapper.IntegerInstructionMap.lookup(EndVal);
+    assert(LastInstr && "Last instruction in sequence was unmapped!");
+
+    // The only way a terminator could be mapped as legal is if it was safe to
+    // tail call.
+    bool IsTailCall = LastInstr->isTerminator();
+    return TII.getOutliningBenefit(StringLen, Occurrences, IsTailCall);
+  };
+
+  MaxCandidateLen = ST.findCandidates(CandidateList, FunctionList, BenefitFn);
+
+  // A function is a tail call if the last instruction of its sequence is a
+  // terminator.
+  for (auto &OF : FunctionList)
+    OF.IsTailCall =
+        Mapper.IntegerInstructionMap.lookup(OF.Sequence.back())->isTerminator();
+
+  // Sort the candidates in descending order. This will simplify the outlining
+  // process when we have to remove the candidates from the mapping by
+  // allowing us to cut them out without keeping track of an offset.
+  std::stable_sort(CandidateList.begin(), CandidateList.end());
+  return MaxCandidateLen;
+}
+
+MachineFunction *
+MachineOutliner::createOutlinedFunction(Module &M, const OutlinedFunction &OF,
+  InstructionMapper &Mapper) {
+
+  // Create the function name: "OUTLINED_FUNCTION_" followed by OF.Name. This
+  // relies on OF.Name being distinct for every outlined function.
+  std::ostringstream NameStream;
+  NameStream << "OUTLINED_FUNCTION" << "_" << OF.Name;
+
+  // Create the function using an IR-level function.
+  LLVMContext &C = M.getContext();
+  Function *F = dyn_cast<Function>(
+      M.getOrInsertFunction(NameStream.str(), Type::getVoidTy(C)));
+  if (!F) // Fail in release builds too instead of dereferencing null below.
+    report_fatal_error("Could not create outlined function!");
+
+  // NOTE: If this is linkonce_odr, then we can take advantage of linker
+  // deduping, which gives better results when outlining from linkonce_odr.
+  F->setLinkage(GlobalValue::PrivateLinkage);
+  F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+
+  BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
+  IRBuilder<> Builder(EntryBB);
+  Builder.CreateRetVoid();
+
+  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>();
+  MachineFunction &MF = MMI.getMachineFunction(*F);
+  MachineBasicBlock &MBB = *MF.CreateMachineBasicBlock();
+  const TargetSubtargetInfo &STI = MF.getSubtarget();
+  const TargetInstrInfo &TII = *STI.getInstrInfo();
+
+  // Insert the new basic block at the start of the machine function.
+  MF.insert(MF.begin(), &MBB);
+
+  TII.insertOutlinerPrologue(MBB, MF, OF.IsTailCall);
+
+  // Copy over the instructions for the function using the integer mappings in
+  // its sequence.
+  for (unsigned Str : OF.Sequence) {
+    MachineInstr *NewMI =
+        MF.CloneMachineInstr(Mapper.IntegerInstructionMap.find(Str)->second);
+    NewMI->dropMemRefs();
+
+    // Don't keep debug information for outlined instructions.
+    // FIXME: This means outlined functions are currently undebuggable.
+    NewMI->setDebugLoc(DebugLoc());
+    MBB.insert(MBB.end(), NewMI);
+  }
+
+  TII.insertOutlinerEpilogue(MBB, MF, OF.IsTailCall);
+
+  return &MF;
+}
+
+bool MachineOutliner::outline(Module &M,
+                              const ArrayRef<Candidate> &CandidateList,
+                              std::vector<OutlinedFunction> &FunctionList,
+                              InstructionMapper &Mapper) {
+
+  bool OutlinedSomething = false;
+
+  // Replace the candidates with calls to their respective outlined functions.
+  for (const Candidate &C : CandidateList) {
+
+    // Was the candidate removed during pruneOverlaps?
+    if (!C.InCandidateList)
+      continue;
+
+    // If not, then look at its OutlinedFunction.
+    OutlinedFunction &OF = FunctionList[C.FunctionIdx];
+
+    // Was its OutlinedFunction made unbeneficial during pruneOverlaps?
+    if (OF.OccurrenceCount < 2 || OF.Benefit < 1)
+      continue;
+
+    // If not, then outline it.
+    assert(C.StartIdx < Mapper.InstrList.size() && "Candidate out of bounds!");
+    MachineBasicBlock *MBB = (*Mapper.InstrList[C.StartIdx]).getParent();
+    MachineBasicBlock::iterator StartIt = Mapper.InstrList[C.StartIdx];
+    unsigned EndIdx = C.StartIdx + C.Len - 1;
+
+    assert(EndIdx < Mapper.InstrList.size() && "Candidate out of bounds!");
+    MachineBasicBlock::iterator EndIt = Mapper.InstrList[EndIdx];
+    assert(EndIt != MBB->end() && "EndIt out of bounds!");
+
+    EndIt++; // Erase needs one past the end index.
+
+    // Create the outlined function lazily, the first time a candidate needs it.
+    if (!OF.MF) {
+      OF.MF = createOutlinedFunction(M, OF, Mapper);
+      FunctionsCreated++;
+    }
+
+    MachineFunction *MF = OF.MF;
+    const TargetSubtargetInfo &STI = MF->getSubtarget();
+    const TargetInstrInfo &TII = *STI.getInstrInfo();
+
+    // Insert a call to the new function and erase the old sequence.
+    TII.insertOutlinedCall(M, *MBB, StartIt, *MF, OF.IsTailCall);
+    StartIt = Mapper.InstrList[C.StartIdx]; // Presumably reset in case the call insertion changed it.
+    MBB->erase(StartIt, EndIt);
+
+    OutlinedSomething = true;
+
+    // Statistics.
+    NumOutlined++;
+  }
+
+  DEBUG (
+    dbgs() << "OutlinedSomething = " << OutlinedSomething << "\n";
+  );
+
+  return OutlinedSomething;
+}
+
+bool MachineOutliner::runOnModule(Module &M) {
+
+  // Is there anything in the module at all?
+  if (M.empty())
+    return false;
+  // Take the subtarget, and from it TRI and TII, from the module's first function.
+  MachineModuleInfo &MMI = getAnalysis<MachineModuleInfo>();
+  const TargetSubtargetInfo &STI = MMI.getMachineFunction(*M.begin())
+                                      .getSubtarget();
+  const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+  const TargetInstrInfo *TII = STI.getInstrInfo();
+
+  InstructionMapper Mapper;
+
+  // Build instruction mappings for each function in the module.
+  for (Function &F : M) {
+    MachineFunction &MF = MMI.getMachineFunction(F);
+
+    // Skip functions that are empty or that the target says are unsafe to
+    // outline from.
+    if (F.empty() || !TII->isFunctionSafeToOutlineFrom(MF))
+      continue;
+    // Otherwise, look at each MachineBasicBlock in the function.
+    for (MachineBasicBlock &MBB : MF) {
+
+      // Is there anything in MBB?
+      if (MBB.empty())
+        continue;
+
+      // If yes, map it.
+      Mapper.convertToUnsignedVec(MBB, *TRI, *TII);
+    }
+  }
+
+  // Construct a suffix tree, use it to find candidates, and then outline them.
+  SuffixTree ST(Mapper.UnsignedVec);
+  std::vector<Candidate> CandidateList;
+  std::vector<OutlinedFunction> FunctionList;
+
+  // Find all of the outlining candidates.
+  unsigned MaxCandidateLen =
+      buildCandidateList(CandidateList, FunctionList, ST, Mapper, *TII);
+
+  // Remove candidates that overlap with other candidates.
+  pruneOverlaps(CandidateList, FunctionList, MaxCandidateLen, *TII);
+
+  // Outline each of the candidates and return true if something was outlined.
+  return outline(M, CandidateList, FunctionList, Mapper);
+}
diff --git a/lib/CodeGen/MachinePipeliner.cpp b/lib/CodeGen/MachinePipeliner.cpp
index f816e27e50e3edbf3831428858a50ed18d2bcd84..d06c38cf4ed81a92635f4ef0c26c713cbb604d7f 100644
--- a/lib/CodeGen/MachinePipeliner.cpp
+++ b/lib/CodeGen/MachinePipeliner.cpp
@@ -595,7 +595,7 @@ private:
   /// Virtual register information.
   MachineRegisterInfo &MRI;
 
-  DFAPacketizer *Resources;
+  std::unique_ptr<DFAPacketizer> Resources;
 
 public:
   SMSchedule(MachineFunction *mf)
@@ -606,13 +606,6 @@ public:
     InitiationInterval = 0;
   }
 
-  ~SMSchedule() {
-    ScheduledInstrs.clear();
-    InstrToCycle.clear();
-    RegToStageDiff.clear();
-    delete Resources;
-  }
-
   void reset() {
     ScheduledInstrs.clear();
     InstrToCycle.clear();
@@ -740,7 +733,7 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
     return false;
 
   if (mf.getFunction()->getAttributes().hasAttribute(
-          AttributeSet::FunctionIndex, Attribute::OptimizeForSize) &&
+          AttributeList::FunctionIndex, Attribute::OptimizeForSize) &&
       !EnableSWPOptSize.getPosition())
     return false;
 
@@ -962,7 +955,7 @@ static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop,
   for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2)
     if (Phi.getOperand(i + 1).getMBB() != Loop)
       InitVal = Phi.getOperand(i).getReg();
-    else if (Phi.getOperand(i + 1).getMBB() == Loop)
+    else
       LoopVal = Phi.getOperand(i).getReg();
 
   assert(InitVal != 0 && LoopVal != 0 && "Unexpected Phi structure.");
@@ -2516,7 +2509,7 @@ void SwingSchedulerDAG::generateExistingPhis(
     MachineBasicBlock *KernelBB, SMSchedule &Schedule, ValueMapTy *VRMap,
     InstrMapTy &InstrMap, unsigned LastStageNum, unsigned CurStageNum,
     bool IsLast) {
-  // Compute the stage number for the inital value of the Phi, which
+  // Compute the stage number for the initial value of the Phi, which
   // comes from the prolog. The prolog to use depends on to which kernel/
   // epilog that we're adding the Phi.
   unsigned PrologStage = 0;
@@ -3482,7 +3475,7 @@ bool SwingSchedulerDAG::isLoopCarriedOrder(SUnit *Source, const SDep &Dep,
   // increment value to determine if the accesses may be loop carried.
   if (OffsetS >= OffsetD)
     return OffsetS + AccessSizeS > DeltaS;
-  else if (OffsetS < OffsetD)
+  else
     return OffsetD + AccessSizeD > DeltaD;
 
   return true;
diff --git a/lib/CodeGen/MachineRegionInfo.cpp b/lib/CodeGen/MachineRegionInfo.cpp
index fc32183c7f63943117386fbe37e6ec1c70173800..71ad4e6aa7f520ab4a4eb50b080a69fe20dd27a2 100644
--- a/lib/CodeGen/MachineRegionInfo.cpp
+++ b/lib/CodeGen/MachineRegionInfo.cpp
@@ -1,10 +1,9 @@
-
 #include "llvm/CodeGen/MachineRegionInfo.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/RegionInfoImpl.h"
 #include "llvm/CodeGen/MachinePostDominators.h"
 
-#define DEBUG_TYPE "region"
+#define DEBUG_TYPE "machine-region-info"
 
 using namespace llvm;
 
@@ -86,6 +85,9 @@ bool MachineRegionInfoPass::runOnMachineFunction(MachineFunction &F) {
   auto DF = &getAnalysis<MachineDominanceFrontier>();
 
   RI.recalculate(F, DT, PDT, DF);
+
+  DEBUG(RI.dump());
+
   return false;
 }
 
@@ -103,9 +105,10 @@ void MachineRegionInfoPass::verifyAnalysis() const {
 
 void MachineRegionInfoPass::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
-  AU.addRequiredTransitive<DominatorTreeWrapperPass>();
-  AU.addRequired<PostDominatorTreeWrapperPass>();
-  AU.addRequired<DominanceFrontierWrapperPass>();
+  AU.addRequired<MachineDominatorTree>();
+  AU.addRequired<MachinePostDominatorTree>();
+  AU.addRequired<MachineDominanceFrontier>();
+  MachineFunctionPass::getAnalysisUsage(AU);
 }
 
 void MachineRegionInfoPass::print(raw_ostream &OS, const Module *) const {
@@ -119,14 +122,15 @@ LLVM_DUMP_METHOD void MachineRegionInfoPass::dump() const {
 #endif
 
 char MachineRegionInfoPass::ID = 0;
+char &MachineRegionInfoPassID = MachineRegionInfoPass::ID;
 
-INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, "regions",
-                "Detect single entry single exit regions", true, true)
+INITIALIZE_PASS_BEGIN(MachineRegionInfoPass, DEBUG_TYPE,
+                      "Detect single entry single exit regions", true, true)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
 INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree)
 INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
-INITIALIZE_PASS_END(MachineRegionInfoPass, "regions",
-                "Detect single entry single exit regions", true, true)
+INITIALIZE_PASS_END(MachineRegionInfoPass, DEBUG_TYPE,
+                    "Detect single entry single exit regions", true, true)
 
 // Create methods available outside of this file, to use them
 // "include/llvm/LinkAllPasses.h". Otherwise the pass would be deleted by
diff --git a/lib/CodeGen/MachineRegisterInfo.cpp b/lib/CodeGen/MachineRegisterInfo.cpp
index be4f3cc9dc57b3d40d3747f40084b148af081baf..128910f8eb2aa35f72915369480b6d83749d03b6 100644
--- a/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/lib/CodeGen/MachineRegisterInfo.cpp
@@ -1,4 +1,4 @@
-//===-- lib/Codegen/MachineRegisterInfo.cpp -------------------------------===//
+//===- lib/Codegen/MachineRegisterInfo.cpp --------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,13 +11,27 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Function.h"
-#include "llvm/Support/raw_os_ostream.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
 
 using namespace llvm;
 
@@ -28,9 +42,9 @@ static cl::opt<bool> EnableSubRegLiveness("enable-subreg-liveness", cl::Hidden,
 void MachineRegisterInfo::Delegate::anchor() {}
 
 MachineRegisterInfo::MachineRegisterInfo(MachineFunction *MF)
-    : MF(MF), TheDelegate(nullptr),
-      TracksSubRegLiveness(MF->getSubtarget().enableSubRegLiveness() &&
-                           EnableSubRegLiveness) {
+    : MF(MF), TracksSubRegLiveness(MF->getSubtarget().enableSubRegLiveness() &&
+                                   EnableSubRegLiveness),
+      IsUpdatedCSRsInitialized(false) {
   unsigned NumRegs = getTargetRegisterInfo()->getNumRegs();
   VRegInfo.reserve(256);
   RegAllocHints.reserve(256);
@@ -543,3 +557,47 @@ bool MachineRegisterInfo::isPhysRegUsed(unsigned PhysReg) const {
   }
   return false;
 }
+
+void MachineRegisterInfo::disableCalleeSavedRegister(unsigned Reg) {
+  // Drop Reg and every register aliasing it from this function's CSR list.
+  const TargetRegisterInfo *TRI = getTargetRegisterInfo();
+  assert(Reg && (Reg < TRI->getNumRegs()) &&
+         "Trying to disable an invalid register");
+  // On first use, seed the per-function list from the target's CSR list.
+  if (!IsUpdatedCSRsInitialized) {
+    const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF);
+    for (const MCPhysReg *I = CSR; *I; ++I)
+      UpdatedCSRs.push_back(*I);
+
+    // Zero value represents the end of the register list
+    // (no more registers should be pushed).
+    UpdatedCSRs.push_back(0);
+
+    IsUpdatedCSRsInitialized = true;
+  }
+
+  // Remove the register (and its aliases) from the list.
+  for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+    UpdatedCSRs.erase(std::remove(UpdatedCSRs.begin(), UpdatedCSRs.end(), *AI),
+                      UpdatedCSRs.end());
+}
+
+const MCPhysReg *MachineRegisterInfo::getCalleeSavedRegs() const {
+  if (IsUpdatedCSRsInitialized)
+    return UpdatedCSRs.data(); // Zero-terminated per-function list.
+  // No per-function list has been recorded; defer to the target's CSR list.
+  return getTargetRegisterInfo()->getCalleeSavedRegs(MF);
+}
+
+void MachineRegisterInfo::setCalleeSavedRegs(ArrayRef<MCPhysReg> CSRs) {
+  if (IsUpdatedCSRsInitialized)
+    UpdatedCSRs.clear(); // Discard any previously recorded list.
+  // Copy the caller-supplied registers; the terminator is appended below.
+  for (MCPhysReg Reg : CSRs)
+    UpdatedCSRs.push_back(Reg);
+
+  // Zero value represents the end of the register list
+  // (no more registers should be pushed).
+  UpdatedCSRs.push_back(0);
+  IsUpdatedCSRsInitialized = true;
+}
diff --git a/lib/CodeGen/MachineScheduler.cpp b/lib/CodeGen/MachineScheduler.cpp
index ad3603e9c9e40d0ec36585aea83368c6d705f040..fe7b2c8399b15d5492096cf9bcd1a7c7dd1ddfee 100644
--- a/lib/CodeGen/MachineScheduler.cpp
+++ b/lib/CodeGen/MachineScheduler.cpp
@@ -12,30 +12,67 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePassRegistry.h"
+#include "llvm/CodeGen/RegisterPressure.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+#include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGInstrs.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
 #include "llvm/CodeGen/ScheduleDFS.h"
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <limits>
+#include <memory>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
 
 using namespace llvm;
 
 #define DEBUG_TYPE "misched"
 
 namespace llvm {
+
 cl::opt<bool> ForceTopDown("misched-topdown", cl::Hidden,
                            cl::desc("Force top-down list scheduling"));
 cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
@@ -43,7 +80,8 @@ cl::opt<bool> ForceBottomUp("misched-bottomup", cl::Hidden,
 cl::opt<bool>
 DumpCriticalPathLength("misched-dcpl", cl::Hidden,
                        cl::desc("Print critical path length to stdout"));
-}
+
+} // end namespace llvm
 
 #ifndef NDEBUG
 static cl::opt<bool> ViewMISchedDAGs("view-misched-dags", cl::Hidden,
@@ -80,10 +118,6 @@ static cl::opt<bool> EnableMemOpCluster("misched-cluster", cl::Hidden,
                                         cl::desc("Enable memop clustering."),
                                         cl::init(true));
 
-// Experimental heuristics
-static cl::opt<bool> EnableMacroFusion("misched-fusion", cl::Hidden,
-  cl::desc("Enable scheduling for macro fusion."), cl::init(true));
-
 static cl::opt<bool> VerifyScheduling("verify-misched", cl::Hidden,
   cl::desc("Verify machine instrs before and after machine scheduling"));
 
@@ -92,14 +126,14 @@ static const unsigned MinSubtreeSize = 8;
 
 // Pin the vtables to this file.
 void MachineSchedStrategy::anchor() {}
+
 void ScheduleDAGMutation::anchor() {}
 
 //===----------------------------------------------------------------------===//
 // Machine Instruction Scheduling Pass and Registry
 //===----------------------------------------------------------------------===//
 
-MachineSchedContext::MachineSchedContext():
-    MF(nullptr), MLI(nullptr), MDT(nullptr), PassConfig(nullptr), AA(nullptr), LIS(nullptr) {
+MachineSchedContext::MachineSchedContext() {
   RegClassInfo = new RegisterClassInfo();
 }
 
@@ -108,6 +142,7 @@ MachineSchedContext::~MachineSchedContext() {
 }
 
 namespace {
+
 /// Base class for a machine scheduler class that can run at any point.
 class MachineSchedulerBase : public MachineSchedContext,
                              public MachineFunctionPass {
@@ -149,7 +184,8 @@ public:
 protected:
   ScheduleDAGInstrs *createPostMachineScheduler();
 };
-} // namespace
+
+} // end anonymous namespace
 
 char MachineScheduler::ID = 0;
 
@@ -158,6 +194,7 @@ char &llvm::MachineSchedulerID = MachineScheduler::ID;
 INITIALIZE_PASS_BEGIN(MachineScheduler, "machine-scheduler",
                       "Machine Instruction Scheduler", false, false)
 INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_DEPENDENCY(SlotIndexes)
 INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
 INITIALIZE_PASS_END(MachineScheduler, "machine-scheduler",
@@ -211,7 +248,7 @@ static ScheduleDAGInstrs *useDefaultMachineSched(MachineSchedContext *C) {
 
 /// MachineSchedOpt allows command line selection of the scheduler.
 static cl::opt<MachineSchedRegistry::ScheduleDAGCtor, false,
-               RegisterPassParser<MachineSchedRegistry> >
+               RegisterPassParser<MachineSchedRegistry>>
 MachineSchedOpt("misched",
                 cl::init(&useDefaultMachineSched), cl::Hidden,
                 cl::desc("Machine instruction scheduler to use"));
@@ -448,7 +485,7 @@ void MachineSchedulerBase::scheduleRegions(ScheduleDAGInstrs &Scheduler,
       // instruction stream until we find the nearest boundary.
       unsigned NumRegionInstrs = 0;
       MachineBasicBlock::iterator I = RegionEnd;
-      for (;I != MBB->begin(); --I) {
+      for (; I != MBB->begin(); --I) {
         MachineInstr &MI = *std::prev(I);
         if (isSchedBoundary(&MI, &*MBB, MF, TII))
           break;
@@ -520,8 +557,7 @@ LLVM_DUMP_METHOD void ReadyQueue::dump() {
 // ===----------------------------------------------------------------------===/
 
 // Provide a vtable anchor.
-ScheduleDAGMI::~ScheduleDAGMI() {
-}
+ScheduleDAGMI::~ScheduleDAGMI() = default;
 
 bool ScheduleDAGMI::canAddEdge(SUnit *SuccSU, SUnit *PredSU) {
   return SuccSU == &ExitSU || !Topo.IsReachable(PredSU, SuccSU);
@@ -826,7 +862,7 @@ void ScheduleDAGMI::placeDebugValues() {
     RegionBegin = FirstDbgValue;
   }
 
-  for (std::vector<std::pair<MachineInstr *, MachineInstr *> >::iterator
+  for (std::vector<std::pair<MachineInstr *, MachineInstr *>>::iterator
          DI = DbgValues.end(), DE = DbgValues.begin(); DI != DE; --DI) {
     std::pair<MachineInstr *, MachineInstr *> P = *std::prev(DI);
     MachineInstr *DbgValue = P.first;
@@ -1013,7 +1049,7 @@ updateScheduledPressure(const SUnit *SU,
       ++CritIdx;
     if (CritIdx != CritEnd && RegionCriticalPSets[CritIdx].getPSet() == ID) {
       if ((int)NewMaxPressure[ID] > RegionCriticalPSets[CritIdx].getUnitInc()
-          && NewMaxPressure[ID] <= INT16_MAX)
+          && NewMaxPressure[ID] <= (unsigned)std::numeric_limits<int16_t>::max())
         RegionCriticalPSets[CritIdx].setUnitInc(NewMaxPressure[ID]);
     }
     unsigned Limit = RegClassInfo->getRegPressureSetLimit(ID);
@@ -1137,6 +1173,12 @@ void ScheduleDAGMILive::schedule() {
         dbgs() << "  Pressure Diff      : ";
         getPressureDiff(&SU).dump(*TRI);
       }
+      dbgs() << "  Single Issue       : ";
+      if (SchedModel.mustBeginGroup(SU.getInstr()) &&
+         SchedModel.mustEndGroup(SU.getInstr()))
+        dbgs() << "true;";
+      else
+        dbgs() << "false;";
       dbgs() << '\n';
     }
     if (ExitSU.getInstr() != nullptr)
@@ -1397,6 +1439,7 @@ void ScheduleDAGMILive::scheduleMI(SUnit *SU, bool IsTopNode) {
 //===----------------------------------------------------------------------===//
 
 namespace {
+
 /// \brief Post-process the DAG to create cluster edges between neighboring
 /// loads or between neighboring stores.
 class BaseMemOpClusterMutation : public ScheduleDAGMutation {
@@ -1404,6 +1447,7 @@ class BaseMemOpClusterMutation : public ScheduleDAGMutation {
     SUnit *SU;
     unsigned BaseReg;
     int64_t Offset;
+
     MemOpInfo(SUnit *su, unsigned reg, int64_t ofs)
         : SU(su), BaseReg(reg), Offset(ofs) {}
 
@@ -1440,25 +1484,26 @@ public:
   LoadClusterMutation(const TargetInstrInfo *tii, const TargetRegisterInfo *tri)
       : BaseMemOpClusterMutation(tii, tri, true) {}
 };
-} // anonymous
+
+} // end anonymous namespace
 
 namespace llvm {
 
 std::unique_ptr<ScheduleDAGMutation>
 createLoadClusterDAGMutation(const TargetInstrInfo *TII,
                              const TargetRegisterInfo *TRI) {
-  return EnableMemOpCluster ? make_unique<LoadClusterMutation>(TII, TRI)
+  return EnableMemOpCluster ? llvm::make_unique<LoadClusterMutation>(TII, TRI)
                             : nullptr;
 }
 
 std::unique_ptr<ScheduleDAGMutation>
 createStoreClusterDAGMutation(const TargetInstrInfo *TII,
                               const TargetRegisterInfo *TRI) {
-  return EnableMemOpCluster ? make_unique<StoreClusterMutation>(TII, TRI)
+  return EnableMemOpCluster ? llvm::make_unique<StoreClusterMutation>(TII, TRI)
                             : nullptr;
 }
 
-} // namespace llvm
+} // end namespace llvm
 
 void BaseMemOpClusterMutation::clusterNeighboringMemOps(
     ArrayRef<SUnit *> MemOps, ScheduleDAGMI *DAG) {
@@ -1543,81 +1588,12 @@ void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAGInstrs) {
     clusterNeighboringMemOps(StoreChainDependents[Idx], DAG);
 }
 
-//===----------------------------------------------------------------------===//
-// MacroFusion - DAG post-processing to encourage fusion of macro ops.
-//===----------------------------------------------------------------------===//
-
-namespace {
-/// \brief Post-process the DAG to create cluster edges between instructions
-/// that may be fused by the processor into a single operation.
-class MacroFusion : public ScheduleDAGMutation {
-  const TargetInstrInfo &TII;
-public:
-  MacroFusion(const TargetInstrInfo &TII)
-    : TII(TII) {}
-
-  void apply(ScheduleDAGInstrs *DAGInstrs) override;
-};
-} // anonymous
-
-namespace llvm {
-
-std::unique_ptr<ScheduleDAGMutation>
-createMacroFusionDAGMutation(const TargetInstrInfo *TII) {
-  return EnableMacroFusion ? make_unique<MacroFusion>(*TII) : nullptr;
-}
-
-} // namespace llvm
-
-/// \brief Callback from DAG postProcessing to create cluster edges to encourage
-/// fused operations.
-void MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
-  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
-
-  // For now, assume targets can only fuse with the branch.
-  SUnit &ExitSU = DAG->ExitSU;
-  MachineInstr *Branch = ExitSU.getInstr();
-  if (!Branch)
-    return;
-
-  for (SDep &PredDep : ExitSU.Preds) {
-    if (PredDep.isWeak())
-      continue;
-    SUnit &SU = *PredDep.getSUnit();
-    MachineInstr &Pred = *SU.getInstr();
-    if (!TII.shouldScheduleAdjacent(Pred, *Branch))
-      continue;
-
-    // Create a single weak edge from SU to ExitSU. The only effect is to cause
-    // bottom-up scheduling to heavily prioritize the clustered SU.  There is no
-    // need to copy predecessor edges from ExitSU to SU, since top-down
-    // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
-    // of SU, we could create an artificial edge from the deepest root, but it
-    // hasn't been needed yet.
-    bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
-    (void)Success;
-    assert(Success && "No DAG nodes should be reachable from ExitSU");
-
-    // Adjust latency of data deps between the nodes.
-    for (SDep &PredDep : ExitSU.Preds) {
-      if (PredDep.getSUnit() == &SU)
-        PredDep.setLatency(0);
-    }
-    for (SDep &SuccDep : SU.Succs) {
-      if (SuccDep.getSUnit() == &ExitSU)
-        SuccDep.setLatency(0);
-    }
-
-    DEBUG(dbgs() << "Macro Fuse SU(" << SU.NodeNum << ")\n");
-    break;
-  }
-}
-
 //===----------------------------------------------------------------------===//
 // CopyConstrain - DAG post-processing to encourage copy elimination.
 //===----------------------------------------------------------------------===//
 
 namespace {
+
 /// \brief Post-process the DAG to create weak edges from all uses of a copy to
 /// the one use that defines the copy's source vreg, most likely an induction
 /// variable increment.
@@ -1627,6 +1603,7 @@ class CopyConstrain : public ScheduleDAGMutation {
   // RegionEndIdx is the slot index of the last non-debug instruction in the
   // scheduling region. So we may have RegionBeginIdx == RegionEndIdx.
   SlotIndex RegionEndIdx;
+
 public:
   CopyConstrain(const TargetInstrInfo *, const TargetRegisterInfo *) {}
 
@@ -1635,17 +1612,18 @@ public:
 protected:
   void constrainLocalCopy(SUnit *CopySU, ScheduleDAGMILive *DAG);
 };
-} // anonymous
+
+} // end anonymous namespace
 
 namespace llvm {
 
 std::unique_ptr<ScheduleDAGMutation>
 createCopyConstrainDAGMutation(const TargetInstrInfo *TII,
-                             const TargetRegisterInfo *TRI) {
-  return make_unique<CopyConstrain>(TII, TRI);
+                               const TargetRegisterInfo *TRI) {
+  return llvm::make_unique<CopyConstrain>(TII, TRI);
 }
 
-} // namespace llvm
+} // end namespace llvm
 
 /// constrainLocalCopy handles two possibilities:
 /// 1) Local src:
@@ -1837,7 +1815,7 @@ void SchedBoundary::reset() {
   CheckPending = false;
   CurrCycle = 0;
   CurrMOps = 0;
-  MinReadyCycle = UINT_MAX;
+  MinReadyCycle = std::numeric_limits<unsigned>::max();
   ExpectedLatency = 0;
   DependentLatency = 0;
   RetiredMOps = 0;
@@ -1938,12 +1916,22 @@ bool SchedBoundary::checkHazard(SUnit *SU) {
       && HazardRec->getHazardType(SU) != ScheduleHazardRecognizer::NoHazard) {
     return true;
   }
+
   unsigned uops = SchedModel->getNumMicroOps(SU->getInstr());
   if ((CurrMOps > 0) && (CurrMOps + uops > SchedModel->getIssueWidth())) {
     DEBUG(dbgs() << "  SU(" << SU->NodeNum << ") uops="
           << SchedModel->getNumMicroOps(SU->getInstr()) << '\n');
     return true;
   }
+
+  if (CurrMOps > 0 &&
+      ((isTop() && SchedModel->mustBeginGroup(SU->getInstr())) ||
+       (!isTop() && SchedModel->mustEndGroup(SU->getInstr())))) {
+    DEBUG(dbgs() << "  hazard: SU(" << SU->NodeNum << ") must "
+                 << (isTop()? "begin" : "end") << " group\n");
+    return true;
+  }
+
   if (SchedModel->hasInstrSchedModel() && SU->hasReservedResource) {
     const MCSchedClassDesc *SC = DAG->getSchedClass(SU);
     for (TargetSchedModel::ProcResIter
@@ -2040,7 +2028,8 @@ void SchedBoundary::releaseNode(SUnit *SU, unsigned ReadyCycle) {
 /// Move the boundary of scheduled code by one cycle.
 void SchedBoundary::bumpCycle(unsigned NextCycle) {
   if (SchedModel->getMicroOpBufferSize() == 0) {
-    assert(MinReadyCycle < UINT_MAX && "MinReadyCycle uninitialized");
+    assert(MinReadyCycle < std::numeric_limits<unsigned>::max() &&
+           "MinReadyCycle uninitialized");
     if (MinReadyCycle > NextCycle)
       NextCycle = MinReadyCycle;
   }
@@ -2238,6 +2227,18 @@ void SchedBoundary::bumpNode(SUnit *SU) {
   // one cycle.  Since we commonly reach the max MOps here, opportunistically
   // bump the cycle to avoid uselessly checking everything in the readyQ.
   CurrMOps += IncMOps;
+
+  // Bump the cycle count for issue group constraints.
+  // This must be done after NextCycle has been adjusted for all other stalls.
+  // Calling bumpCycle(X) will reduce CurrMOps by one issue group and set
+  // CurrCycle to X.
+  if ((isTop() &&  SchedModel->mustEndGroup(SU->getInstr())) ||
+      (!isTop() && SchedModel->mustBeginGroup(SU->getInstr()))) {
+    DEBUG(dbgs() << "  Bump cycle to "
+                 << (isTop() ? "end" : "begin") << " group\n");
+    bumpCycle(++NextCycle);
+  }
+
   while (CurrMOps >= SchedModel->getIssueWidth()) {
     DEBUG(dbgs() << "  *** Max MOps " << CurrMOps
           << " at cycle " << CurrCycle << '\n');
@@ -2251,7 +2252,7 @@ void SchedBoundary::bumpNode(SUnit *SU) {
 void SchedBoundary::releasePending() {
   // If the available queue is empty, it is safe to reset MinReadyCycle.
   if (Available.empty())
-    MinReadyCycle = UINT_MAX;
+    MinReadyCycle = std::numeric_limits<unsigned>::max();
 
   // Check to see if any of the pending instructions are ready to issue.  If
   // so, add them to the available queue.
@@ -3110,7 +3111,6 @@ SUnit *GenericScheduler::pickNode(bool &IsTopNode) {
 }
 
 void GenericScheduler::reschedulePhysRegCopies(SUnit *SU, bool isTop) {
-
   MachineBasicBlock::iterator InsertPos = SU->getInstr();
   if (!isTop)
     ++InsertPos;
@@ -3158,7 +3158,8 @@ void GenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
 /// Create the standard converging machine scheduler. This will be used as the
 /// default scheduler if the target does not set a default.
 ScheduleDAGMILive *llvm::createGenericSchedLive(MachineSchedContext *C) {
-  ScheduleDAGMILive *DAG = new ScheduleDAGMILive(C, make_unique<GenericScheduler>(C));
+  ScheduleDAGMILive *DAG =
+      new ScheduleDAGMILive(C, llvm::make_unique<GenericScheduler>(C));
   // Register DAG post-processors.
   //
   // FIXME: extend the mutation API to allow earlier mutations to instantiate
@@ -3199,7 +3200,6 @@ void PostGenericScheduler::initialize(ScheduleDAGMI *Dag) {
   }
 }
 
-
 void PostGenericScheduler::registerRoots() {
   Rem.CriticalPath = DAG->ExitSU.getDepth();
 
@@ -3306,7 +3306,7 @@ void PostGenericScheduler::schedNode(SUnit *SU, bool IsTopNode) {
 }
 
 ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) {
-  return new ScheduleDAGMI(C, make_unique<PostGenericScheduler>(C),
+  return new ScheduleDAGMI(C, llvm::make_unique<PostGenericScheduler>(C),
                            /*RemoveKillFlags=*/true);
 }
 
@@ -3315,14 +3315,14 @@ ScheduleDAGMI *llvm::createGenericSchedPostRA(MachineSchedContext *C) {
 //===----------------------------------------------------------------------===//
 
 namespace {
+
 /// \brief Order nodes by the ILP metric.
 struct ILPOrder {
-  const SchedDFSResult *DFSResult;
-  const BitVector *ScheduledTrees;
+  const SchedDFSResult *DFSResult = nullptr;
+  const BitVector *ScheduledTrees = nullptr;
   bool MaximizeILP;
 
-  ILPOrder(bool MaxILP)
-    : DFSResult(nullptr), ScheduledTrees(nullptr), MaximizeILP(MaxILP) {}
+  ILPOrder(bool MaxILP) : MaximizeILP(MaxILP) {}
 
   /// \brief Apply a less-than relation on node priority.
   ///
@@ -3351,12 +3351,13 @@ struct ILPOrder {
 
 /// \brief Schedule based on the ILP metric.
 class ILPScheduler : public MachineSchedStrategy {
-  ScheduleDAGMILive *DAG;
+  ScheduleDAGMILive *DAG = nullptr;
   ILPOrder Cmp;
 
   std::vector<SUnit*> ReadyQ;
+
 public:
-  ILPScheduler(bool MaximizeILP): DAG(nullptr), Cmp(MaximizeILP) {}
+  ILPScheduler(bool MaximizeILP) : Cmp(MaximizeILP) {}
 
   void initialize(ScheduleDAGMI *dag) override {
     assert(dag->hasVRegLiveness() && "ILPScheduler needs vreg liveness");
@@ -3409,14 +3410,16 @@ public:
     std::push_heap(ReadyQ.begin(), ReadyQ.end(), Cmp);
   }
 };
-} // namespace
+
+} // end anonymous namespace
 
 static ScheduleDAGInstrs *createILPMaxScheduler(MachineSchedContext *C) {
-  return new ScheduleDAGMILive(C, make_unique<ILPScheduler>(true));
+  return new ScheduleDAGMILive(C, llvm::make_unique<ILPScheduler>(true));
 }
 static ScheduleDAGInstrs *createILPMinScheduler(MachineSchedContext *C) {
-  return new ScheduleDAGMILive(C, make_unique<ILPScheduler>(false));
+  return new ScheduleDAGMILive(C, llvm::make_unique<ILPScheduler>(false));
 }
+
 static MachineSchedRegistry ILPMaxRegistry(
   "ilpmax", "Schedule bottom-up for max ILP", createILPMaxScheduler);
 static MachineSchedRegistry ILPMinRegistry(
@@ -3428,6 +3431,7 @@ static MachineSchedRegistry ILPMinRegistry(
 
 #ifndef NDEBUG
 namespace {
+
 /// Apply a less-than relation on the node order, which corresponds to the
 /// instruction order prior to scheduling. IsReverse implements greater-than.
 template<bool IsReverse>
@@ -3448,11 +3452,12 @@ class InstructionShuffler : public MachineSchedStrategy {
   // Using a less-than relation (SUnitOrder<false>) for the TopQ priority
   // gives nodes with a higher number higher priority causing the latest
   // instructions to be scheduled first.
-  PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false> >
+  PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<false>>
     TopQ;
   // When scheduling bottom-up, use greater-than as the queue priority.
-  PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true> >
+  PriorityQueue<SUnit*, std::vector<SUnit*>, SUnitOrder<true>>
     BottomQ;
+
 public:
   InstructionShuffler(bool alternate, bool topdown)
     : IsAlternating(alternate), IsTopDown(topdown) {}
@@ -3496,15 +3501,18 @@ public:
     BottomQ.push(SU);
   }
 };
-} // namespace
+
+} // end anonymous namespace
 
 static ScheduleDAGInstrs *createInstructionShuffler(MachineSchedContext *C) {
   bool Alternate = !ForceTopDown && !ForceBottomUp;
   bool TopDown = !ForceBottomUp;
   assert((TopDown || !ForceTopDown) &&
          "-misched-topdown incompatible with -misched-bottomup");
-  return new ScheduleDAGMILive(C, make_unique<InstructionShuffler>(Alternate, TopDown));
+  return new ScheduleDAGMILive(
+      C, llvm::make_unique<InstructionShuffler>(Alternate, TopDown));
 }
+
 static MachineSchedRegistry ShufflerRegistry(
   "shuffle", "Shuffle machine instructions alternating directions",
   createInstructionShuffler);
@@ -3522,8 +3530,7 @@ template<> struct GraphTraits<
 
 template<>
 struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
-
-  DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
+  DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
 
   static std::string getGraphName(const ScheduleDAG *G) {
     return G->MF.getName();
@@ -3580,7 +3587,8 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
     return Str;
   }
 };
-} // namespace llvm
+
+} // end namespace llvm
 #endif // NDEBUG
 
 /// viewGraph - Pop up a ghostview window with the reachable parts of the DAG
diff --git a/lib/CodeGen/MachineTraceMetrics.cpp b/lib/CodeGen/MachineTraceMetrics.cpp
index ef7e525e8165dde75fcceffaad7d5cc4569bbbed..998a9645e68bfa2761ee180bbad983a3881bbf39 100644
--- a/lib/CodeGen/MachineTraceMetrics.cpp
+++ b/lib/CodeGen/MachineTraceMetrics.cpp
@@ -1,4 +1,4 @@
-//===- lib/CodeGen/MachineTraceMetrics.cpp ----------------------*- C++ -*-===//
+//===- lib/CodeGen/MachineTraceMetrics.cpp --------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,21 +7,35 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/SparseSet.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <tuple>
+#include <utility>
 
 using namespace llvm;
 
@@ -37,9 +51,7 @@ INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
 INITIALIZE_PASS_END(MachineTraceMetrics,
                   "machine-trace-metrics", "Machine Trace Metrics", false, true)
 
-MachineTraceMetrics::MachineTraceMetrics()
-  : MachineFunctionPass(ID), MF(nullptr), TII(nullptr), TRI(nullptr),
-    MRI(nullptr), Loops(nullptr) {
+MachineTraceMetrics::MachineTraceMetrics() : MachineFunctionPass(ID) {
   std::fill(std::begin(Ensembles), std::end(Ensembles), nullptr);
 }
 
@@ -137,7 +149,6 @@ MachineTraceMetrics::getProcResourceCycles(unsigned MBBNum) const {
   return makeArrayRef(ProcResourceCycles.data() + MBBNum * PRKinds, PRKinds);
 }
 
-
 //===----------------------------------------------------------------------===//
 //                         Ensemble utility functions
 //===----------------------------------------------------------------------===//
@@ -151,7 +162,7 @@ MachineTraceMetrics::Ensemble::Ensemble(MachineTraceMetrics *ct)
 }
 
 // Virtual destructor serves as an anchor.
-MachineTraceMetrics::Ensemble::~Ensemble() {}
+MachineTraceMetrics::Ensemble::~Ensemble() = default;
 
 const MachineLoop*
 MachineTraceMetrics::Ensemble::getLoopFor(const MachineBasicBlock *MBB) const {
@@ -297,6 +308,7 @@ static bool isExitingLoop(const MachineLoop *From, const MachineLoop *To) {
 // MinInstrCountEnsemble - Pick the trace that executes the least number of
 // instructions.
 namespace {
+
 class MinInstrCountEnsemble : public MachineTraceMetrics::Ensemble {
   const char *getName() const override { return "MinInstr"; }
   const MachineBasicBlock *pickTracePred(const MachineBasicBlock*) override;
@@ -306,7 +318,8 @@ public:
   MinInstrCountEnsemble(MachineTraceMetrics *mtm)
     : MachineTraceMetrics::Ensemble(mtm) {}
 };
-}
+
+} // end anonymous namespace
 
 // Select the preferred predecessor for MBB.
 const MachineBasicBlock*
@@ -409,25 +422,30 @@ void MachineTraceMetrics::verifyAnalysis() const {
 // revisit blocks.
 
 namespace {
+
 struct LoopBounds {
   MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> Blocks;
   SmallPtrSet<const MachineBasicBlock*, 8> Visited;
   const MachineLoopInfo *Loops;
-  bool Downward;
+  bool Downward = false;
+
   LoopBounds(MutableArrayRef<MachineTraceMetrics::TraceBlockInfo> blocks,
-             const MachineLoopInfo *loops)
-    : Blocks(blocks), Loops(loops), Downward(false) {}
+             const MachineLoopInfo *loops) : Blocks(blocks), Loops(loops) {}
 };
-}
+
+} // end anonymous namespace
 
 // Specialize po_iterator_storage in order to prune the post-order traversal so
 // it is limited to the current loop and doesn't traverse the loop back edges.
 namespace llvm {
+
 template<>
 class po_iterator_storage<LoopBounds, true> {
   LoopBounds &LB;
+
 public:
   po_iterator_storage(LoopBounds &lb) : LB(lb) {}
+
   void finishPostorder(const MachineBasicBlock*) {}
 
   bool insertEdge(Optional<const MachineBasicBlock *> From,
@@ -452,7 +470,8 @@ public:
     return LB.Visited.insert(To).second;
   }
 };
-}
+
+} // end namespace llvm
 
 /// Compute the trace through MBB.
 void MachineTraceMetrics::Ensemble::computeTrace(const MachineBasicBlock *MBB) {
@@ -603,6 +622,7 @@ void MachineTraceMetrics::Ensemble::verify() const {
 // A data dependency is represented as a defining MI and operand numbers on the
 // defining and using MI.
 namespace {
+
 struct DataDep {
   const MachineInstr *DefMI;
   unsigned DefOp;
@@ -622,7 +642,8 @@ struct DataDep {
     assert((++DefI).atEnd() && "Register has multiple defs");
   }
 };
-}
+
+} // end anonymous namespace
 
 // Get the input data dependencies that must be ready before UseMI can issue.
 // Return true if UseMI has any physreg operands.
@@ -678,17 +699,19 @@ static void getPHIDeps(const MachineInstr &UseMI,
 // direction instructions are scanned, it could be the operand that defined the
 // regunit, or the highest operand to read the regunit.
 namespace {
+
 struct LiveRegUnit {
   unsigned RegUnit;
-  unsigned Cycle;
-  const MachineInstr *MI;
-  unsigned Op;
+  unsigned Cycle = 0;
+  const MachineInstr *MI = nullptr;
+  unsigned Op = 0;
 
   unsigned getSparseSetIndex() const { return RegUnit; }
 
-  LiveRegUnit(unsigned RU) : RegUnit(RU), Cycle(0), MI(nullptr), Op(0) {}
+  LiveRegUnit(unsigned RU) : RegUnit(RU) {}
 };
-}
+
+} // end anonymous namespace
 
 // Identify physreg dependencies for UseMI, and update the live regunit
 // tracking set when scanning instructions downwards.
@@ -922,7 +945,6 @@ static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height,
   return Height;
 }
 
-
 typedef DenseMap<const MachineInstr *, unsigned> MIHeightMap;
 
 // Push the height of DefMI upwards if required to match UseMI.
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index a98139f9e5af3ef6756a32003402454d8deb5e6f..f49232b0f8ad140b6600311326a7ffc8bfd44e2c 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -260,8 +260,8 @@ namespace {
     static char ID; // Pass ID, replacement for typeid
     const std::string Banner;
 
-    MachineVerifierPass(const std::string &banner = nullptr)
-      : MachineFunctionPass(ID), Banner(banner) {
+    MachineVerifierPass(std::string banner = std::string())
+      : MachineFunctionPass(ID), Banner(std::move(banner)) {
         initializeMachineVerifierPassPass(*PassRegistry::getPassRegistry());
       }
 
@@ -528,7 +528,8 @@ void MachineVerifier::visitMachineFunctionBefore() {
   lastIndex = SlotIndex();
   regsReserved = MRI->getReservedRegs();
 
-  markReachable(&MF->front());
+  if (!MF->empty())
+    markReachable(&MF->front());
 
   // Build a set of the basic blocks in the function.
   FunctionBlocks.clear();
@@ -548,7 +549,8 @@ void MachineVerifier::visitMachineFunctionBefore() {
   // Check that the register use lists are sane.
   MRI->verifyUseLists();
 
-  verifyStackFrame();
+  if (!MF->empty())
+    verifyStackFrame();
 }
 
 // Does iterator point to a and b as the first two elements?
@@ -572,7 +574,7 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
     for (const auto &LI : MBB->liveins()) {
       if (isAllocatable(LI.PhysReg) && !MBB->isEHPad() &&
           MBB->getIterator() != MBB->getParent()->begin()) {
-        report("MBB has allocable live-in, but isn't entry or landing-pad.", MBB);
+        report("MBB has allocatable live-in, but isn't entry or landing-pad.", MBB);
       }
     }
   }
@@ -908,6 +910,14 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
     }
   }
 
+  // Generic loads and stores must have a single MachineMemOperand
+  // describing that access.
+  if ((MI->getOpcode() == TargetOpcode::G_LOAD ||
+       MI->getOpcode() == TargetOpcode::G_STORE) &&
+      !MI->hasOneMemOperand())
+    report("Generic instruction accessing memory must have one mem operand",
+           MI);
+
   StringRef ErrorInfo;
   if (!TII->verifyInstruction(*MI, ErrorInfo))
     report(ErrorInfo.data(), MI);
diff --git a/lib/CodeGen/PrologEpilogInserter.cpp b/lib/CodeGen/PrologEpilogInserter.cpp
index 708f95c55b265c851b3ce3f4db10513534462a64..9f608957ca2ab9bde90542c45b2be503b9b1f352 100644
--- a/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/lib/CodeGen/PrologEpilogInserter.cpp
@@ -336,7 +336,7 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F,
     return;
 
   const TargetRegisterInfo *RegInfo = F.getSubtarget().getRegisterInfo();
-  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&F);
+  const MCPhysReg *CSRegs = F.getRegInfo().getCalleeSavedRegs();
 
   std::vector<CalleeSavedInfo> CSI;
   for (unsigned i = 0; CSRegs[i]; ++i) {
diff --git a/lib/CodeGen/PseudoSourceValue.cpp b/lib/CodeGen/PseudoSourceValue.cpp
index 804a4c3dad6698afcb39dc5433b90f002e4e55c0..b29e62bf1aa3cd68eca4ee334022b100316a2c8d 100644
--- a/lib/CodeGen/PseudoSourceValue.cpp
+++ b/lib/CodeGen/PseudoSourceValue.cpp
@@ -29,7 +29,10 @@ PseudoSourceValue::PseudoSourceValue(PSVKind Kind) : Kind(Kind) {}
 PseudoSourceValue::~PseudoSourceValue() {}
 
 void PseudoSourceValue::printCustom(raw_ostream &O) const {
-  O << PSVNames[Kind];
+  if (Kind < TargetCustom)
+    O << PSVNames[Kind];
+  else
+    O << "TargetCustom" << Kind;
 }
 
 bool PseudoSourceValue::isConstant(const MachineFrameInfo *) const {
diff --git a/lib/CodeGen/RegAllocBasic.cpp b/lib/CodeGen/RegAllocBasic.cpp
index a558e371ad4c687fe8b7806180c3f4ba38f44d04..a87fed3a687e1a0d9a09fcc7679ece3b4192ec74 100644
--- a/lib/CodeGen/RegAllocBasic.cpp
+++ b/lib/CodeGen/RegAllocBasic.cpp
@@ -176,8 +176,6 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
   for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
     LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
     Q.collectInterferingVRegs();
-    if (Q.seenUnspillableVReg())
-      return false;
     for (unsigned i = Q.interferingVRegs().size(); i; --i) {
       LiveInterval *Intf = Q.interferingVRegs()[i - 1];
       if (!Intf->isSpillable() || Intf->weight > VirtReg.weight)
diff --git a/lib/CodeGen/RegAllocGreedy.cpp b/lib/CodeGen/RegAllocGreedy.cpp
index 01753ceb01709eb317ace47d8491d8d962ab4918..06500289c971ab85bbb9e1a520c55ef7adfe7f9f 100644
--- a/lib/CodeGen/RegAllocGreedy.cpp
+++ b/lib/CodeGen/RegAllocGreedy.cpp
@@ -698,7 +698,7 @@ unsigned RAGreedy::canReassign(LiveInterval &VirtReg, unsigned PrevReg) {
     MCRegUnitIterator Units(PhysReg, TRI);
     for (; Units.isValid(); ++Units) {
       // Instantiate a "subquery", not to be confused with the Queries array.
-      LiveIntervalUnion::Query subQ(&VirtReg, &Matrix->getLiveUnions()[*Units]);
+      LiveIntervalUnion::Query subQ(VirtReg, Matrix->getLiveUnions()[*Units]);
       if (subQ.checkInterference())
         break;
     }
@@ -849,7 +849,11 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
   SmallVector<LiveInterval*, 8> Intfs;
   for (MCRegUnitIterator Units(PhysReg, TRI); Units.isValid(); ++Units) {
     LiveIntervalUnion::Query &Q = Matrix->query(VirtReg, *Units);
-    assert(Q.seenAllInterferences() && "Didn't check all interfererences.");
+    // We usually have the interfering VRegs cached so collectInterferingVRegs()
+    // should be fast, we may need to recalculate when different physregs
+    // overlap the same register unit so we had different SubRanges queried
+    // against it.
+    Q.collectInterferingVRegs();
     ArrayRef<LiveInterval*> IVR = Q.interferingVRegs();
     Intfs.append(IVR.begin(), IVR.end());
   }
diff --git a/lib/CodeGen/RegAllocPBQP.cpp b/lib/CodeGen/RegAllocPBQP.cpp
index e4a02170ec41a8527acffdd3b2123aaa1f189e69..3b5964eef55e4960fa563ec0c29279ed5e74412e 100644
--- a/lib/CodeGen/RegAllocPBQP.cpp
+++ b/lib/CodeGen/RegAllocPBQP.cpp
@@ -1,4 +1,4 @@
-//===------ RegAllocPBQP.cpp ---- PBQP Register Allocator -------*- C++ -*-===//
+//===- RegAllocPBQP.cpp ---- PBQP Register Allocator ----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -29,34 +29,61 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/RegAllocPBQP.h"
 #include "RegisterCoalescer.h"
 #include "Spiller.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/CodeGen/CalcSpillWeights.h"
+#include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/LiveRangeEdit.h"
 #include "llvm/CodeGen/LiveStackAnalysis.h"
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineLoopInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/PBQP/Graph.h"
+#include "llvm/CodeGen/PBQP/Solution.h"
+#include "llvm/CodeGen/PBQPRAConstraint.h"
+#include "llvm/CodeGen/RegAllocPBQP.h"
 #include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/SlotIndexes.h"
 #include "llvm/CodeGen/VirtRegMap.h"
+#include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Printable.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
 #include <limits>
+#include <map>
 #include <memory>
 #include <queue>
 #include <set>
 #include <sstream>
+#include <string>
+#include <system_error>
+#include <tuple>
 #include <vector>
+#include <utility>
 
 using namespace llvm;
 
@@ -86,7 +113,6 @@ namespace {
 /// Programming problems.
 class RegAllocPBQP : public MachineFunctionPass {
 public:
-
   static char ID;
 
   /// Construct a PBQP register allocator.
@@ -113,7 +139,6 @@ public:
   }
 
 private:
-
   typedef std::map<const LiveInterval*, unsigned> LI2NodeMap;
   typedef std::vector<const LiveInterval*> Node2LIMap;
   typedef std::vector<unsigned> AllowedSet;
@@ -187,7 +212,6 @@ public:
 /// @brief Add interference edges between overlapping vregs.
 class Interference : public PBQPRAConstraint {
 private:
-
   typedef const PBQP::RegAlloc::AllowedRegVector* AllowedRegVecPtr;
   typedef std::pair<AllowedRegVecPtr, AllowedRegVecPtr> IKey;
   typedef DenseMap<IKey, PBQPRAGraph::MatrixPtr> IMatrixCache;
@@ -276,7 +300,6 @@ private:
   }
 
 public:
-
   void apply(PBQPRAGraph &G) override {
     // The following is loosely based on the linear scan algorithm introduced in
     // "Linear Scan Register Allocation" by Poletto and Sarkar. This version
@@ -363,7 +386,6 @@ public:
   }
 
 private:
-
   // Create an Interference edge and add it to the graph, unless it is
   // a null matrix, meaning the nodes' allowed registers do not have any
   // interference. This case occurs frequently between integer and floating
@@ -372,7 +394,6 @@ private:
   bool createInterferenceEdge(PBQPRAGraph &G,
                               PBQPRAGraph::NodeId NId, PBQPRAGraph::NodeId MId,
                               IMatrixCache &C) {
-
     const TargetRegisterInfo &TRI =
         *G.getMetadata().MF.getSubtarget().getRegisterInfo();
     const auto &NRegs = G.getNodeMetadata(NId).getAllowedRegs();
@@ -409,7 +430,6 @@ private:
   }
 };
 
-
 class Coalescing : public PBQPRAConstraint {
 public:
   void apply(PBQPRAGraph &G) override {
@@ -421,7 +441,6 @@ public:
     // gives the Ok.
     for (const auto &MBB : MF) {
       for (const auto &MI : MBB) {
-
         // Skip not-coalescable or already coalesced copies.
         if (!CP.setRegisters(&MI) || CP.getSrcReg() == CP.getDstReg())
           continue;
@@ -479,7 +498,6 @@ public:
   }
 
 private:
-
   void addVirtRegCoalesce(
                     PBQPRAGraph::RawMatrix &CostMat,
                     const PBQPRAGraph::NodeMetadata::AllowedRegVector &Allowed1,
@@ -496,14 +514,15 @@ private:
       }
     }
   }
-
 };
 
-} // End anonymous namespace.
+} // end anonymous namespace
 
 // Out-of-line destructor/anchor for PBQPRAConstraint.
-PBQPRAConstraint::~PBQPRAConstraint() {}
+PBQPRAConstraint::~PBQPRAConstraint() = default;
+
 void PBQPRAConstraint::anchor() {}
+
 void PBQPRAConstraintList::anchor() {}
 
 void RegAllocPBQP::getAnalysisUsage(AnalysisUsage &au) const {
@@ -554,7 +573,7 @@ void RegAllocPBQP::findVRegIntervalsToAlloc(const MachineFunction &MF,
 
 static bool isACalleeSavedRegister(unsigned reg, const TargetRegisterInfo &TRI,
                                    const MachineFunction &MF) {
-  const MCPhysReg *CSR = TRI.getCalleeSavedRegs(&MF);
+  const MCPhysReg *CSR = MF.getRegInfo().getCalleeSavedRegs();
   for (unsigned i = 0; CSR[i] != 0; ++i)
     if (TRI.regsOverlap(reg, CSR[i]))
       return true;
@@ -777,7 +796,6 @@ bool RegAllocPBQP::runOnMachineFunction(MachineFunction &MF) {
 
   // If there are non-empty intervals allocate them using pbqp.
   if (!VRegsToAlloc.empty()) {
-
     const TargetSubtargetInfo &Subtarget = MF.getSubtarget();
     std::unique_ptr<PBQPRAConstraintList> ConstraintsRoot =
       llvm::make_unique<PBQPRAConstraintList>();
diff --git a/lib/CodeGen/RegUsageInfoCollector.cpp b/lib/CodeGen/RegUsageInfoCollector.cpp
index ece44c28e9ede2c4cdf35b3894bda5f8685818d4..855aa37ff3c36a4564052b0ef70a4afc59f5c22c 100644
--- a/lib/CodeGen/RegUsageInfoCollector.cpp
+++ b/lib/CodeGen/RegUsageInfoCollector.cpp
@@ -103,9 +103,27 @@ bool RegUsageInfoCollector::runOnMachineFunction(MachineFunction &MF) {
 
   DEBUG(dbgs() << "Clobbered Registers: ");
 
-  for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg)
-    if (MRI->isPhysRegModified(PReg, true))
-      RegMask[PReg / 32] &= ~(1u << PReg % 32);
+  const BitVector &UsedPhysRegsMask = MRI->getUsedPhysRegsMask();
+  auto SetRegAsDefined = [&RegMask] (unsigned Reg) {
+    RegMask[Reg / 32] &= ~(1u << Reg % 32);
+  };
+  // Scan all the physical registers. When a register is defined in the current
+  // function set it and all the aliasing registers as defined in the regmask.
+  for (unsigned PReg = 1, PRegE = TRI->getNumRegs(); PReg < PRegE; ++PReg) {
+    // If a register is in the UsedPhysRegsMask set then mark it as defined.
+    // All its aliases will also be in the set, so we can skip setting
+    // as defined all the aliases here.
+    if (UsedPhysRegsMask.test(PReg)) {
+      SetRegAsDefined(PReg);
+      continue;
+    }
+    // If a register is defined by an instruction mark it as defined together
+    // with all its aliases.
+    if (!MRI->def_empty(PReg)) {
+      for (MCRegAliasIterator AI(PReg, TRI, true); AI.isValid(); ++AI)
+        SetRegAsDefined(*AI);
+    }
+  }
 
   if (!TargetFrameLowering::isSafeForNoCSROpt(F)) {
     const uint32_t *CallPreservedMask =
diff --git a/lib/CodeGen/RegisterClassInfo.cpp b/lib/CodeGen/RegisterClassInfo.cpp
index e93766ec01ba18baf32c3b4c534e0c7bea3c2676..82a3bd9a0bd174d48d28ce137c620fde1e53dead 100644
--- a/lib/CodeGen/RegisterClassInfo.cpp
+++ b/lib/CodeGen/RegisterClassInfo.cpp
@@ -1,4 +1,4 @@
-//===-- RegisterClassInfo.cpp - Dynamic Register Class Info ---------------===//
+//===- RegisterClassInfo.cpp - Dynamic Register Class Info ----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -14,12 +14,22 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
 
 using namespace llvm;
 
@@ -29,8 +39,7 @@ static cl::opt<unsigned>
 StressRA("stress-regalloc", cl::Hidden, cl::init(0), cl::value_desc("N"),
          cl::desc("Limit all regclasses to N registers"));
 
-RegisterClassInfo::RegisterClassInfo()
-  : Tag(0), MF(nullptr), TRI(nullptr), CalleeSaved(nullptr) {}
+RegisterClassInfo::RegisterClassInfo() = default;
 
 void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
   bool Update = false;
@@ -48,18 +57,20 @@ void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) {
 
   // Does this MF have different CSRs?
   assert(TRI && "no register info set");
-  const MCPhysReg *CSR = TRI->getCalleeSavedRegs(MF);
-  if (Update || CSR != CalleeSaved) {
-    // Build a CSRNum map. Every CSR alias gets an entry pointing to the last
+
+  // Get the callee saved registers.
+  const MCPhysReg *CSR = MF->getRegInfo().getCalleeSavedRegs();
+  if (Update || CSR != CalleeSavedRegs) {
+    // Build a CSRAlias map. Every CSR alias saves the last
     // overlapping CSR.
-    CSRNum.clear();
-    CSRNum.resize(TRI->getNumRegs(), 0);
-    for (unsigned N = 0; unsigned Reg = CSR[N]; ++N)
-      for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
-        CSRNum[*AI] = N + 1; // 0 means no CSR, 1 means CalleeSaved[0], ...
+    CalleeSavedAliases.resize(TRI->getNumRegs(), 0);
+    for (const MCPhysReg *I = CSR; *I; ++I)
+      for (MCRegAliasIterator AI(*I, TRI, true); AI.isValid(); ++AI)
+        CalleeSavedAliases[*AI] = *I;
+
     Update = true;
   }
-  CalleeSaved = CSR;
+  CalleeSavedRegs = CSR;
 
   // Different reserved registers?
   const BitVector &RR = MF->getRegInfo().getReservedRegs();
@@ -103,7 +114,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
     unsigned Cost = TRI->getCostPerUse(PhysReg);
     MinCost = std::min(MinCost, Cost);
 
-    if (CSRNum[PhysReg])
+    if (CalleeSavedAliases[PhysReg])
       // PhysReg aliases a CSR, save it for later.
       CSRAlias.push_back(PhysReg);
     else {
@@ -114,7 +125,7 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const {
     }
   }
   RCI.NumRegs = N + CSRAlias.size();
-  assert (RCI.NumRegs <= NumRegs && "Allocation order larger than regclass");
+  assert(RCI.NumRegs <= NumRegs && "Allocation order larger than regclass");
 
   // CSR aliases go after the volatile registers, preserve the target's order.
   for (unsigned i = 0, e = CSRAlias.size(); i != e; ++i) {
diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp
index 778ea8eaca1bdf0aaa5f5252d8f393787260ab80..bf44ee8453b613ed129338d439b0e99c920bc6aa 100644
--- a/lib/CodeGen/RegisterCoalescer.cpp
+++ b/lib/CodeGen/RegisterCoalescer.cpp
@@ -815,42 +815,14 @@ bool RegisterCoalescer::removeCopyByCommutingDef(const CoalescerPair &CP,
       VNInfo *ASubValNo = SA.getVNInfoAt(AIdx);
       assert(ASubValNo != nullptr);
 
-      LaneBitmask AMask = SA.LaneMask;
-      for (LiveInterval::SubRange &SB : IntB.subranges()) {
-        LaneBitmask BMask = SB.LaneMask;
-        LaneBitmask Common = BMask & AMask;
-        if (Common.none())
-          continue;
-
-        DEBUG( dbgs() << "\t\tCopy_Merge " << PrintLaneMask(BMask)
-                      << " into " << PrintLaneMask(Common) << '\n');
-        LaneBitmask BRest = BMask & ~AMask;
-        LiveInterval::SubRange *CommonRange;
-        if (BRest.any()) {
-          SB.LaneMask = BRest;
-          DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(BRest)
-                       << '\n');
-          // Duplicate SubRange for newly merged common stuff.
-          CommonRange = IntB.createSubRangeFrom(Allocator, Common, SB);
-        } else {
-          // We van reuse the L SubRange.
-          SB.LaneMask = Common;
-          CommonRange = &SB;
-        }
-        LiveRange RangeCopy(SB, Allocator);
-
-        VNInfo *BSubValNo = CommonRange->getVNInfoAt(CopyIdx);
-        assert(BSubValNo->def == CopyIdx);
-        BSubValNo->def = ASubValNo->def;
-        addSegmentsWithValNo(*CommonRange, BSubValNo, SA, ASubValNo);
-        AMask &= ~BMask;
-      }
-      if (AMask.any()) {
-        DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(AMask) << '\n');
-        LiveRange *NewRange = IntB.createSubRange(Allocator, AMask);
-        VNInfo *BSubValNo = NewRange->getNextValue(CopyIdx, Allocator);
-        addSegmentsWithValNo(*NewRange, BSubValNo, SA, ASubValNo);
-      }
+      IntB.refineSubRanges(Allocator, SA.LaneMask,
+          [&Allocator,&SA,CopyIdx,ASubValNo](LiveInterval::SubRange &SR) {
+        VNInfo *BSubValNo = SR.empty()
+          ? SR.getNextValue(CopyIdx, Allocator)
+          : SR.getVNInfoAt(CopyIdx);
+        assert(BSubValNo != nullptr);
+        addSegmentsWithValNo(SR, BSubValNo, SA, ASubValNo);
+      });
     }
   }
 
@@ -1472,7 +1444,7 @@ void RegisterCoalescer::updateRegDefsUses(unsigned SrcReg,
 
     // If SrcReg wasn't read, it may still be the case that DstReg is live-in
     // because SrcReg is a sub-register.
-    if (DstInt && !Reads && SubIdx)
+    if (DstInt && !Reads && SubIdx && !UseMI->isDebugValue())
       Reads = DstInt->liveAt(LIS->getInstructionIndex(*UseMI));
 
     // Replace SrcReg with DstReg in all UseMI operands.
@@ -1744,9 +1716,10 @@ bool RegisterCoalescer::joinCopy(MachineInstr *CopyMI, bool &Again) {
 
 bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
   unsigned DstReg = CP.getDstReg();
+  unsigned SrcReg = CP.getSrcReg();
   assert(CP.isPhys() && "Must be a physreg copy");
   assert(MRI->isReserved(DstReg) && "Not a reserved register");
-  LiveInterval &RHS = LIS->getInterval(CP.getSrcReg());
+  LiveInterval &RHS = LIS->getInterval(SrcReg);
   DEBUG(dbgs() << "\t\tRHS = " << RHS << '\n');
 
   assert(RHS.containsOneValue() && "Invalid join with reserved register");
@@ -1788,17 +1761,36 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
   // Delete the identity copy.
   MachineInstr *CopyMI;
   if (CP.isFlipped()) {
-    CopyMI = MRI->getVRegDef(RHS.reg);
+    // Physreg is copied into vreg
+    //   %vregY = COPY %X
+    //   ...  //< no other def of %X here
+    //   use %vregY
+    // =>
+    //   ...
+    //   use %X
+    CopyMI = MRI->getVRegDef(SrcReg);
   } else {
-    if (!MRI->hasOneNonDBGUse(RHS.reg)) {
+    // VReg is copied into physreg:
+    //   %vregX = def
+    //   ... //< no other def or use of %Y here
+    //   %Y = COPY %vregX
+    // =>
+    //   %Y = def
+    //   ...
+    if (!MRI->hasOneNonDBGUse(SrcReg)) {
       DEBUG(dbgs() << "\t\tMultiple vreg uses!\n");
       return false;
     }
 
-    MachineInstr *DestMI = MRI->getVRegDef(RHS.reg);
-    CopyMI = &*MRI->use_instr_nodbg_begin(RHS.reg);
-    const SlotIndex CopyRegIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot();
-    const SlotIndex DestRegIdx = LIS->getInstructionIndex(*DestMI).getRegSlot();
+    if (!LIS->intervalIsInOneMBB(RHS)) {
+      DEBUG(dbgs() << "\t\tComplex control flow!\n");
+      return false;
+    }
+
+    MachineInstr &DestMI = *MRI->getVRegDef(SrcReg);
+    CopyMI = &*MRI->use_instr_nodbg_begin(SrcReg);
+    SlotIndex CopyRegIdx = LIS->getInstructionIndex(*CopyMI).getRegSlot();
+    SlotIndex DestRegIdx = LIS->getInstructionIndex(DestMI).getRegSlot();
 
     if (!MRI->isConstantPhysReg(DstReg)) {
       // We checked above that there are no interfering defs of the physical
@@ -1817,8 +1809,8 @@ bool RegisterCoalescer::joinReservedPhysReg(CoalescerPair &CP) {
 
     // We're going to remove the copy which defines a physical reserved
     // register, so remove its valno, etc.
-    DEBUG(dbgs() << "\t\tRemoving phys reg def of " << DstReg << " at "
-          << CopyRegIdx << "\n");
+    DEBUG(dbgs() << "\t\tRemoving phys reg def of " << PrintReg(DstReg, TRI)
+          << " at " << CopyRegIdx << "\n");
 
     LIS->removePhysRegDefAt(DstReg, CopyRegIdx);
     // Create a new dead def at the new def location.
@@ -2906,39 +2898,16 @@ void RegisterCoalescer::mergeSubRangeInto(LiveInterval &LI,
                                           LaneBitmask LaneMask,
                                           CoalescerPair &CP) {
   BumpPtrAllocator &Allocator = LIS->getVNInfoAllocator();
-  for (LiveInterval::SubRange &R : LI.subranges()) {
-    LaneBitmask RMask = R.LaneMask;
-    // LaneMask of subregisters common to subrange R and ToMerge.
-    LaneBitmask Common = RMask & LaneMask;
-    // There is nothing to do without common subregs.
-    if (Common.none())
-      continue;
-
-    DEBUG(dbgs() << "\t\tCopy+Merge " << PrintLaneMask(RMask) << " into "
-                 << PrintLaneMask(Common) << '\n');
-    // LaneMask of subregisters contained in the R range but not in ToMerge,
-    // they have to split into their own subrange.
-    LaneBitmask LRest = RMask & ~LaneMask;
-    LiveInterval::SubRange *CommonRange;
-    if (LRest.any()) {
-      R.LaneMask = LRest;
-      DEBUG(dbgs() << "\t\tReduce Lane to " << PrintLaneMask(LRest) << '\n');
-      // Duplicate SubRange for newly merged common stuff.
-      CommonRange = LI.createSubRangeFrom(Allocator, Common, R);
+  LI.refineSubRanges(Allocator, LaneMask,
+      [this,&Allocator,&ToMerge,&CP](LiveInterval::SubRange &SR) {
+    if (SR.empty()) {
+      SR.assign(ToMerge, Allocator);
     } else {
-      // Reuse the existing range.
-      R.LaneMask = Common;
-      CommonRange = &R;
+      // joinSubRegRange() destroys the merged range, so we need a copy.
+      LiveRange RangeCopy(ToMerge, Allocator);
+      joinSubRegRanges(SR, RangeCopy, SR.LaneMask, CP);
     }
-    LiveRange RangeCopy(ToMerge, Allocator);
-    joinSubRegRanges(*CommonRange, RangeCopy, Common, CP);
-    LaneMask &= ~RMask;
-  }
-
-  if (LaneMask.any()) {
-    DEBUG(dbgs() << "\t\tNew Lane " << PrintLaneMask(LaneMask) << '\n');
-    LI.createSubRangeFrom(Allocator, LaneMask, ToMerge);
-  }
+  });
 }
 
 bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
diff --git a/lib/CodeGen/RegisterPressure.cpp b/lib/CodeGen/RegisterPressure.cpp
index 478274411c39ff69c13c41045bf0b5251b5502f5..c726edc88b41c1de86eae0d4f2f3665c1162d428 100644
--- a/lib/CodeGen/RegisterPressure.cpp
+++ b/lib/CodeGen/RegisterPressure.cpp
@@ -1,4 +1,4 @@
-//===-- RegisterPressure.cpp - Dynamic Register Pressure ------------------===//
+//===- RegisterPressure.cpp - Dynamic Register Pressure -------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,13 +12,37 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/CodeGen/LiveInterval.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/CodeGen/RegisterPressure.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
 
 using namespace llvm;
 
@@ -267,7 +291,6 @@ bool RegPressureTracker::isBottomClosed() const {
           MachineBasicBlock::const_iterator());
 }
 
-
 SlotIndex RegPressureTracker::getCurrSlot() const {
   MachineBasicBlock::const_iterator IdxPos =
     skipDebugInstructionsForward(CurrPos, MBB->end());
@@ -331,7 +354,7 @@ void RegPressureTracker::initLiveThru(const RegPressureTracker &RPTracker) {
 
 static LaneBitmask getRegLanes(ArrayRef<RegisterMaskPair> RegUnits,
                                unsigned RegUnit) {
-  auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+  auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
     return Other.RegUnit == RegUnit;
   });
   if (I == RegUnits.end())
@@ -343,7 +366,7 @@ static void addRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,
                         RegisterMaskPair Pair) {
   unsigned RegUnit = Pair.RegUnit;
   assert(Pair.LaneMask.any());
-  auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+  auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
     return Other.RegUnit == RegUnit;
   });
   if (I == RegUnits.end()) {
@@ -355,7 +378,7 @@ static void addRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,
 
 static void setRegZero(SmallVectorImpl<RegisterMaskPair> &RegUnits,
                        unsigned RegUnit) {
-  auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+  auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
     return Other.RegUnit == RegUnit;
   });
   if (I == RegUnits.end()) {
@@ -369,7 +392,7 @@ static void removeRegLanes(SmallVectorImpl<RegisterMaskPair> &RegUnits,
                            RegisterMaskPair Pair) {
   unsigned RegUnit = Pair.RegUnit;
   assert(Pair.LaneMask.any());
-  auto I = find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
+  auto I = llvm::find_if(RegUnits, [RegUnit](const RegisterMaskPair Other) {
     return Other.RegUnit == RegUnit;
   });
   if (I != RegUnits.end()) {
@@ -426,6 +449,8 @@ namespace {
 ///
 /// FIXME: always ignore tied opers
 class RegisterOperandsCollector {
+  friend class llvm::RegisterOperands;
+
   RegisterOperands &RegOpers;
   const TargetRegisterInfo &TRI;
   const MachineRegisterInfo &MRI;
@@ -520,11 +545,9 @@ class RegisterOperandsCollector {
         addRegLanes(RegUnits, RegisterMaskPair(*Units, LaneBitmask::getAll()));
     }
   }
-
-  friend class llvm::RegisterOperands;
 };
 
-} // namespace
+} // end anonymous namespace
 
 void RegisterOperands::collect(const MachineInstr &MI,
                                const TargetRegisterInfo &TRI,
@@ -677,7 +700,7 @@ void RegPressureTracker::discoverLiveInOrOut(RegisterMaskPair Pair,
   assert(Pair.LaneMask.any());
 
   unsigned RegUnit = Pair.RegUnit;
-  auto I = find_if(LiveInOrOut, [RegUnit](const RegisterMaskPair &Other) {
+  auto I = llvm::find_if(LiveInOrOut, [RegUnit](const RegisterMaskPair &Other) {
     return Other.RegUnit == RegUnit;
   });
   LaneBitmask PrevMask;
@@ -775,9 +798,10 @@ void RegPressureTracker::recede(const RegisterOperands &RegOpers,
         if (!TrackLaneMasks) {
           addRegLanes(*LiveUses, RegisterMaskPair(Reg, NewMask));
         } else {
-          auto I = find_if(*LiveUses, [Reg](const RegisterMaskPair Other) {
-            return Other.RegUnit == Reg;
-          });
+          auto I =
+              llvm::find_if(*LiveUses, [Reg](const RegisterMaskPair Other) {
+                return Other.RegUnit == Reg;
+              });
           bool IsRedef = I != LiveUses->end();
           if (IsRedef) {
             // ignore re-defs here...
@@ -1157,7 +1181,7 @@ getUpwardPressureDelta(const MachineInstr *MI, /*const*/ PressureDiff &PDiff,
 
       if (CritIdx != CritEnd && CriticalPSets[CritIdx].getPSet() == PSetID) {
         int CritInc = (int)MNew - (int)CriticalPSets[CritIdx].getUnitInc();
-        if (CritInc > 0 && CritInc <= INT16_MAX) {
+        if (CritInc > 0 && CritInc <= std::numeric_limits<int16_t>::max()) {
           Delta.CriticalMax = PressureChange(PSetID);
           Delta.CriticalMax.setUnitInc(CritInc);
         }
diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp
index 23e07581fc80041f95431d44afcda749919d8d16..6392136fa290993a1b133d0cb6950735f686d7a9 100644
--- a/lib/CodeGen/RegisterScavenging.cpp
+++ b/lib/CodeGen/RegisterScavenging.cpp
@@ -1,4 +1,4 @@
-//===-- RegisterScavenging.cpp - Machine register scavenging --------------===//
+//===- RegisterScavenging.cpp - Machine register scavenging ---------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,18 +15,26 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <string>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "reg-scavenging"
@@ -390,7 +398,7 @@ unsigned RegScavenger::scavengeRegister(const TargetRegisterClass *RC,
   unsigned NeedSize = RC->getSize();
   unsigned NeedAlign = RC->getAlignment();
 
-  unsigned SI = Scavenged.size(), Diff = UINT_MAX;
+  unsigned SI = Scavenged.size(), Diff = std::numeric_limits<unsigned>::max();
   int FIB = MFI.getObjectIndexBegin(), FIE = MFI.getObjectIndexEnd();
   for (unsigned I = 0; I < Scavenged.size(); ++I) {
     if (Scavenged[I].Reg != 0)
diff --git a/lib/CodeGen/SafeStack.cpp b/lib/CodeGen/SafeStack.cpp
index 2b82df293c1488100e0c62936f679566b09c7bda..fa68411284e77d0a61547692ed7fcc05c7509bd5 100644
--- a/lib/CodeGen/SafeStack.cpp
+++ b/lib/CodeGen/SafeStack.cpp
@@ -451,7 +451,7 @@ void SafeStack::checkStackGuard(IRBuilder<> &IRB, Function &F, ReturnInst &RI,
   IRBuilder<> IRBFail(CheckTerm);
   // FIXME: respect -fsanitize-trap / -ftrap-function here?
   Constant *StackChkFail = F.getParent()->getOrInsertFunction(
-      "__stack_chk_fail", IRB.getVoidTy(), nullptr);
+      "__stack_chk_fail", IRB.getVoidTy());
   IRBFail.CreateCall(StackChkFail, {});
 }
 
diff --git a/lib/CodeGen/ScheduleDAG.cpp b/lib/CodeGen/ScheduleDAG.cpp
index 5c177463a5d91801f02369f44edd642b8eb3da0c..dc72ac07325882f623ea5cb0dc17655d31a4c674 100644
--- a/lib/CodeGen/ScheduleDAG.cpp
+++ b/lib/CodeGen/ScheduleDAG.cpp
@@ -1,4 +1,4 @@
-//===---- ScheduleDAG.cpp - Implement the ScheduleDAG class ---------------===//
+//===- ScheduleDAG.cpp - Implement the ScheduleDAG class ------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,22 +7,32 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This implements the ScheduleDAG class, which is a base class used by
-// scheduling implementation classes.
+/// \file Implements the ScheduleDAG class, which is a base class used by
+/// scheduling implementation classes.
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/ScheduleHazardRecognizer.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
-#include <climits>
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <limits>
+#include <utility>
+#include <vector>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "pre-RA-sched"
@@ -33,58 +43,52 @@ static cl::opt<bool> StressSchedOpt(
   cl::desc("Stress test instruction scheduling"));
 #endif
 
-void SchedulingPriorityQueue::anchor() { }
+void SchedulingPriorityQueue::anchor() {}
 
 ScheduleDAG::ScheduleDAG(MachineFunction &mf)
     : TM(mf.getTarget()), TII(mf.getSubtarget().getInstrInfo()),
       TRI(mf.getSubtarget().getRegisterInfo()), MF(mf),
-      MRI(mf.getRegInfo()), EntrySU(), ExitSU() {
+      MRI(mf.getRegInfo()) {
 #ifndef NDEBUG
   StressSched = StressSchedOpt;
 #endif
 }
 
-ScheduleDAG::~ScheduleDAG() {}
+ScheduleDAG::~ScheduleDAG() = default;
 
-/// Clear the DAG state (e.g. between scheduling regions).
 void ScheduleDAG::clearDAG() {
   SUnits.clear();
   EntrySU = SUnit();
   ExitSU = SUnit();
 }
 
-/// getInstrDesc helper to handle SDNodes.
 const MCInstrDesc *ScheduleDAG::getNodeDesc(const SDNode *Node) const {
   if (!Node || !Node->isMachineOpcode()) return nullptr;
   return &TII->get(Node->getMachineOpcode());
 }
 
-/// addPred - This adds the specified edge as a pred of the current node if
-/// not already.  It also adds the current node as a successor of the
-/// specified node.
 bool SUnit::addPred(const SDep &D, bool Required) {
   // If this node already has this dependence, don't add a redundant one.
-  for (SmallVectorImpl<SDep>::iterator I = Preds.begin(), E = Preds.end();
-         I != E; ++I) {
+  for (SDep &PredDep : Preds) {
     // Zero-latency weak edges may be added purely for heuristic ordering. Don't
     // add them if another kind of edge already exists.
-    if (!Required && I->getSUnit() == D.getSUnit())
+    if (!Required && PredDep.getSUnit() == D.getSUnit())
       return false;
-    if (I->overlaps(D)) {
-      // Extend the latency if needed. Equivalent to removePred(I) + addPred(D).
-      if (I->getLatency() < D.getLatency()) {
-        SUnit *PredSU = I->getSUnit();
+    if (PredDep.overlaps(D)) {
+      // Extend the latency if needed. Equivalent to
+      // removePred(PredDep) + addPred(D).
+      if (PredDep.getLatency() < D.getLatency()) {
+        SUnit *PredSU = PredDep.getSUnit();
         // Find the corresponding successor in N.
-        SDep ForwardD = *I;
+        SDep ForwardD = PredDep;
         ForwardD.setSUnit(this);
-        for (SmallVectorImpl<SDep>::iterator II = PredSU->Succs.begin(),
-               EE = PredSU->Succs.end(); II != EE; ++II) {
-          if (*II == ForwardD) {
-            II->setLatency(D.getLatency());
+        for (SDep &SuccDep : PredSU->Succs) {
+          if (SuccDep == ForwardD) {
+            SuccDep.setLatency(D.getLatency());
             break;
           }
         }
-        I->setLatency(D.getLatency());
+        PredDep.setLatency(D.getLatency());
       }
       return false;
     }
@@ -95,8 +99,10 @@ bool SUnit::addPred(const SDep &D, bool Required) {
   SUnit *N = D.getSUnit();
   // Update the bookkeeping.
   if (D.getKind() == SDep::Data) {
-    assert(NumPreds < UINT_MAX && "NumPreds will overflow!");
-    assert(N->NumSuccs < UINT_MAX && "NumSuccs will overflow!");
+    assert(NumPreds < std::numeric_limits<unsigned>::max() &&
+           "NumPreds will overflow!");
+    assert(N->NumSuccs < std::numeric_limits<unsigned>::max() &&
+           "NumSuccs will overflow!");
     ++NumPreds;
     ++N->NumSuccs;
   }
@@ -105,7 +111,8 @@ bool SUnit::addPred(const SDep &D, bool Required) {
       ++WeakPredsLeft;
     }
     else {
-      assert(NumPredsLeft < UINT_MAX && "NumPredsLeft will overflow!");
+      assert(NumPredsLeft < std::numeric_limits<unsigned>::max() &&
+             "NumPredsLeft will overflow!");
       ++NumPredsLeft;
     }
   }
@@ -114,7 +121,8 @@ bool SUnit::addPred(const SDep &D, bool Required) {
       ++N->WeakSuccsLeft;
     }
     else {
-      assert(N->NumSuccsLeft < UINT_MAX && "NumSuccsLeft will overflow!");
+      assert(N->NumSuccsLeft < std::numeric_limits<unsigned>::max() &&
+             "NumSuccsLeft will overflow!");
       ++N->NumSuccsLeft;
     }
   }
@@ -127,51 +135,46 @@ bool SUnit::addPred(const SDep &D, bool Required) {
   return true;
 }
 
-/// removePred - This removes the specified edge as a pred of the current
-/// node if it exists.  It also removes the current node as a successor of
-/// the specified node.
 void SUnit::removePred(const SDep &D) {
   // Find the matching predecessor.
-  for (SmallVectorImpl<SDep>::iterator I = Preds.begin(), E = Preds.end();
-         I != E; ++I)
-    if (*I == D) {
-      // Find the corresponding successor in N.
-      SDep P = D;
-      P.setSUnit(this);
-      SUnit *N = D.getSUnit();
-      SmallVectorImpl<SDep>::iterator Succ = find(N->Succs, P);
-      assert(Succ != N->Succs.end() && "Mismatching preds / succs lists!");
-      N->Succs.erase(Succ);
-      Preds.erase(I);
-      // Update the bookkeeping.
-      if (P.getKind() == SDep::Data) {
-        assert(NumPreds > 0 && "NumPreds will underflow!");
-        assert(N->NumSuccs > 0 && "NumSuccs will underflow!");
-        --NumPreds;
-        --N->NumSuccs;
-      }
-      if (!N->isScheduled) {
-        if (D.isWeak())
-          --WeakPredsLeft;
-        else {
-          assert(NumPredsLeft > 0 && "NumPredsLeft will underflow!");
-          --NumPredsLeft;
-        }
-      }
-      if (!isScheduled) {
-        if (D.isWeak())
-          --N->WeakSuccsLeft;
-        else {
-          assert(N->NumSuccsLeft > 0 && "NumSuccsLeft will underflow!");
-          --N->NumSuccsLeft;
-        }
-      }
-      if (P.getLatency() != 0) {
-        this->setDepthDirty();
-        N->setHeightDirty();
-      }
-      return;
+  SmallVectorImpl<SDep>::iterator I = llvm::find(Preds, D);
+  if (I == Preds.end())
+    return;
+  // Find the corresponding successor in N.
+  SDep P = D;
+  P.setSUnit(this);
+  SUnit *N = D.getSUnit();
+  SmallVectorImpl<SDep>::iterator Succ = llvm::find(N->Succs, P);
+  assert(Succ != N->Succs.end() && "Mismatching preds / succs lists!");
+  N->Succs.erase(Succ);
+  Preds.erase(I);
+  // Update the bookkeeping.
+  if (P.getKind() == SDep::Data) {
+    assert(NumPreds > 0 && "NumPreds will underflow!");
+    assert(N->NumSuccs > 0 && "NumSuccs will underflow!");
+    --NumPreds;
+    --N->NumSuccs;
+  }
+  if (!N->isScheduled) {
+    if (D.isWeak())
+      --WeakPredsLeft;
+    else {
+      assert(NumPredsLeft > 0 && "NumPredsLeft will underflow!");
+      --NumPredsLeft;
     }
+  }
+  if (!isScheduled) {
+    if (D.isWeak())
+      --N->WeakSuccsLeft;
+    else {
+      assert(N->NumSuccsLeft > 0 && "NumSuccsLeft will underflow!");
+      --N->NumSuccsLeft;
+    }
+  }
+  if (P.getLatency() != 0) {
+    this->setDepthDirty();
+    N->setHeightDirty();
+  }
 }
 
 void SUnit::setDepthDirty() {
@@ -181,9 +184,8 @@ void SUnit::setDepthDirty() {
   do {
     SUnit *SU = WorkList.pop_back_val();
     SU->isDepthCurrent = false;
-    for (SUnit::const_succ_iterator I = SU->Succs.begin(),
-         E = SU->Succs.end(); I != E; ++I) {
-      SUnit *SuccSU = I->getSUnit();
+    for (SDep &SuccDep : SU->Succs) {
+      SUnit *SuccSU = SuccDep.getSUnit();
       if (SuccSU->isDepthCurrent)
         WorkList.push_back(SuccSU);
     }
@@ -197,18 +199,14 @@ void SUnit::setHeightDirty() {
   do {
     SUnit *SU = WorkList.pop_back_val();
     SU->isHeightCurrent = false;
-    for (SUnit::const_pred_iterator I = SU->Preds.begin(),
-         E = SU->Preds.end(); I != E; ++I) {
-      SUnit *PredSU = I->getSUnit();
+    for (SDep &PredDep : SU->Preds) {
+      SUnit *PredSU = PredDep.getSUnit();
       if (PredSU->isHeightCurrent)
         WorkList.push_back(PredSU);
     }
   } while (!WorkList.empty());
 }
 
-/// setDepthToAtLeast - Update this node's successors to reflect the
-/// fact that this node's depth just increased.
-///
 void SUnit::setDepthToAtLeast(unsigned NewDepth) {
   if (NewDepth <= getDepth())
     return;
@@ -217,9 +215,6 @@ void SUnit::setDepthToAtLeast(unsigned NewDepth) {
   isDepthCurrent = true;
 }
 
-/// setHeightToAtLeast - Update this node's predecessors to reflect the
-/// fact that this node's height just increased.
-///
 void SUnit::setHeightToAtLeast(unsigned NewHeight) {
   if (NewHeight <= getHeight())
     return;
@@ -228,8 +223,7 @@ void SUnit::setHeightToAtLeast(unsigned NewHeight) {
   isHeightCurrent = true;
 }
 
-/// ComputeDepth - Calculate the maximal path from the node to the exit.
-///
+/// Calculates the maximal path from the entry to the node (walks Preds).
 void SUnit::ComputeDepth() {
   SmallVector<SUnit*, 8> WorkList;
   WorkList.push_back(this);
@@ -238,12 +232,11 @@ void SUnit::ComputeDepth() {
 
     bool Done = true;
     unsigned MaxPredDepth = 0;
-    for (SUnit::const_pred_iterator I = Cur->Preds.begin(),
-         E = Cur->Preds.end(); I != E; ++I) {
-      SUnit *PredSU = I->getSUnit();
+    for (const SDep &PredDep : Cur->Preds) {
+      SUnit *PredSU = PredDep.getSUnit();
       if (PredSU->isDepthCurrent)
         MaxPredDepth = std::max(MaxPredDepth,
-                                PredSU->Depth + I->getLatency());
+                                PredSU->Depth + PredDep.getLatency());
       else {
         Done = false;
         WorkList.push_back(PredSU);
@@ -261,8 +254,7 @@ void SUnit::ComputeDepth() {
   } while (!WorkList.empty());
 }
 
-/// ComputeHeight - Calculate the maximal path from the node to the entry.
-///
+/// Calculates the maximal path from the node to the exit (walks Succs).
 void SUnit::ComputeHeight() {
   SmallVector<SUnit*, 8> WorkList;
   WorkList.push_back(this);
@@ -271,12 +263,11 @@ void SUnit::ComputeHeight() {
 
     bool Done = true;
     unsigned MaxSuccHeight = 0;
-    for (SUnit::const_succ_iterator I = Cur->Succs.begin(),
-         E = Cur->Succs.end(); I != E; ++I) {
-      SUnit *SuccSU = I->getSUnit();
+    for (const SDep &SuccDep : Cur->Succs) {
+      SUnit *SuccSU = SuccDep.getSUnit();
       if (SuccSU->isHeightCurrent)
         MaxSuccHeight = std::max(MaxSuccHeight,
-                                 SuccSU->Height + I->getLatency());
+                                 SuccSU->Height + SuccDep.getLatency());
       else {
         Done = false;
         WorkList.push_back(SuccSU);
@@ -320,8 +311,6 @@ void SUnit::print(raw_ostream &OS, const ScheduleDAG *DAG) const {
     OS << "SU(" << NodeNum << ")";
 }
 
-/// SUnit - Scheduling unit. It's an wrapper around either a single SDNode or
-/// a group of nodes flagged together.
 LLVM_DUMP_METHOD void SUnit::dump(const ScheduleDAG *G) const {
   print(dbgs(), G);
   dbgs() << ": ";
@@ -344,41 +333,39 @@ LLVM_DUMP_METHOD void SUnit::dumpAll(const ScheduleDAG *G) const {
 
   if (Preds.size() != 0) {
     dbgs() << "  Predecessors:\n";
-    for (SUnit::const_succ_iterator I = Preds.begin(), E = Preds.end();
-         I != E; ++I) {
+    for (const SDep &SuccDep : Preds) {
       dbgs() << "   ";
-      switch (I->getKind()) {
+      switch (SuccDep.getKind()) {
       case SDep::Data:   dbgs() << "data "; break;
       case SDep::Anti:   dbgs() << "anti "; break;
       case SDep::Output: dbgs() << "out  "; break;
       case SDep::Order:  dbgs() << "ord  "; break;
       }
-      I->getSUnit()->print(dbgs(), G);
-      if (I->isArtificial())
+      SuccDep.getSUnit()->print(dbgs(), G);
+      if (SuccDep.isArtificial())
         dbgs() << " *";
-      dbgs() << ": Latency=" << I->getLatency();
-      if (I->isAssignedRegDep())
-        dbgs() << " Reg=" << PrintReg(I->getReg(), G->TRI);
+      dbgs() << ": Latency=" << SuccDep.getLatency();
+      if (SuccDep.isAssignedRegDep())
+        dbgs() << " Reg=" << PrintReg(SuccDep.getReg(), G->TRI);
       dbgs() << "\n";
     }
   }
   if (Succs.size() != 0) {
     dbgs() << "  Successors:\n";
-    for (SUnit::const_succ_iterator I = Succs.begin(), E = Succs.end();
-         I != E; ++I) {
+    for (const SDep &SuccDep : Succs) {
       dbgs() << "   ";
-      switch (I->getKind()) {
+      switch (SuccDep.getKind()) {
       case SDep::Data:   dbgs() << "data "; break;
       case SDep::Anti:   dbgs() << "anti "; break;
       case SDep::Output: dbgs() << "out  "; break;
       case SDep::Order:  dbgs() << "ord  "; break;
       }
-      I->getSUnit()->print(dbgs(), G);
-      if (I->isArtificial())
+      SuccDep.getSUnit()->print(dbgs(), G);
+      if (SuccDep.isArtificial())
         dbgs() << " *";
-      dbgs() << ": Latency=" << I->getLatency();
-      if (I->isAssignedRegDep())
-        dbgs() << " Reg=" << PrintReg(I->getReg(), G->TRI);
+      dbgs() << ": Latency=" << SuccDep.getLatency();
+      if (SuccDep.isAssignedRegDep())
+        dbgs() << " Reg=" << PrintReg(SuccDep.getReg(), G->TRI);
       dbgs() << "\n";
     }
   }
@@ -386,47 +373,44 @@ LLVM_DUMP_METHOD void SUnit::dumpAll(const ScheduleDAG *G) const {
 #endif
 
 #ifndef NDEBUG
-/// VerifyScheduledDAG - Verify that all SUnits were scheduled and that
-/// their state is consistent. Return the number of scheduled nodes.
-///
 unsigned ScheduleDAG::VerifyScheduledDAG(bool isBottomUp) {
   bool AnyNotSched = false;
   unsigned DeadNodes = 0;
-  for (unsigned i = 0, e = SUnits.size(); i != e; ++i) {
-    if (!SUnits[i].isScheduled) {
-      if (SUnits[i].NumPreds == 0 && SUnits[i].NumSuccs == 0) {
+  for (const SUnit &SUnit : SUnits) {
+    if (!SUnit.isScheduled) {
+      if (SUnit.NumPreds == 0 && SUnit.NumSuccs == 0) {
         ++DeadNodes;
         continue;
       }
       if (!AnyNotSched)
         dbgs() << "*** Scheduling failed! ***\n";
-      SUnits[i].dump(this);
+      SUnit.dump(this);
       dbgs() << "has not been scheduled!\n";
       AnyNotSched = true;
     }
-    if (SUnits[i].isScheduled &&
-        (isBottomUp ? SUnits[i].getHeight() : SUnits[i].getDepth()) >
-          unsigned(INT_MAX)) {
+    if (SUnit.isScheduled &&
+        (isBottomUp ? SUnit.getHeight() : SUnit.getDepth()) >
+          unsigned(std::numeric_limits<int>::max())) {
       if (!AnyNotSched)
         dbgs() << "*** Scheduling failed! ***\n";
-      SUnits[i].dump(this);
+      SUnit.dump(this);
       dbgs() << "has an unexpected "
            << (isBottomUp ? "Height" : "Depth") << " value!\n";
       AnyNotSched = true;
     }
     if (isBottomUp) {
-      if (SUnits[i].NumSuccsLeft != 0) {
+      if (SUnit.NumSuccsLeft != 0) {
         if (!AnyNotSched)
           dbgs() << "*** Scheduling failed! ***\n";
-        SUnits[i].dump(this);
+        SUnit.dump(this);
         dbgs() << "has successors left!\n";
         AnyNotSched = true;
       }
     } else {
-      if (SUnits[i].NumPredsLeft != 0) {
+      if (SUnit.NumPredsLeft != 0) {
         if (!AnyNotSched)
           dbgs() << "*** Scheduling failed! ***\n";
-        SUnits[i].dump(this);
+        SUnit.dump(this);
         dbgs() << "has predecessors left!\n";
         AnyNotSched = true;
       }
@@ -437,36 +421,33 @@ unsigned ScheduleDAG::VerifyScheduledDAG(bool isBottomUp) {
 }
 #endif
 
-/// InitDAGTopologicalSorting - create the initial topological
-/// ordering from the DAG to be scheduled.
-///
-/// The idea of the algorithm is taken from
-/// "Online algorithms for managing the topological order of
-/// a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly
-/// This is the MNR algorithm, which was first introduced by
-/// A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in
-/// "Maintaining a topological order under edge insertions".
-///
-/// Short description of the algorithm:
-///
-/// Topological ordering, ord, of a DAG maps each node to a topological
-/// index so that for all edges X->Y it is the case that ord(X) < ord(Y).
-///
-/// This means that if there is a path from the node X to the node Z,
-/// then ord(X) < ord(Z).
-///
-/// This property can be used to check for reachability of nodes:
-/// if Z is reachable from X, then an insertion of the edge Z->X would
-/// create a cycle.
-///
-/// The algorithm first computes a topological ordering for the DAG by
-/// initializing the Index2Node and Node2Index arrays and then tries to keep
-/// the ordering up-to-date after edge insertions by reordering the DAG.
-///
-/// On insertion of the edge X->Y, the algorithm first marks by calling DFS
-/// the nodes reachable from Y, and then shifts them using Shift to lie
-/// immediately after X in Index2Node.
 void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
+  // The idea of the algorithm is taken from
+  // "Online algorithms for managing the topological order of
+  // a directed acyclic graph" by David J. Pearce and Paul H.J. Kelly
+  // This is the MNR algorithm, which was first introduced by
+  // A. Marchetti-Spaccamela, U. Nanni and H. Rohnert in
+  // "Maintaining a topological order under edge insertions".
+  //
+  // Short description of the algorithm:
+  //
+  // Topological ordering, ord, of a DAG maps each node to a topological
+  // index so that for all edges X->Y it is the case that ord(X) < ord(Y).
+  //
+  // This means that if there is a path from the node X to the node Z,
+  // then ord(X) < ord(Z).
+  //
+  // This property can be used to check for reachability of nodes:
+  // if Z is reachable from X, then an insertion of the edge Z->X would
+  // create a cycle.
+  //
+  // The algorithm first computes a topological ordering for the DAG by
+  // initializing the Index2Node and Node2Index arrays and then tries to keep
+  // the ordering up-to-date after edge insertions by reordering the DAG.
+  //
+  // On insertion of the edge X->Y, the algorithm first marks by calling DFS
+  // the nodes reachable from Y, and then shifts them using Shift to lie
+  // immediately after X in Index2Node.
   unsigned DAGSize = SUnits.size();
   std::vector<SUnit*> WorkList;
   WorkList.reserve(DAGSize);
@@ -477,18 +458,17 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
   // Initialize the data structures.
   if (ExitSU)
     WorkList.push_back(ExitSU);
-  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
-    SUnit *SU = &SUnits[i];
-    int NodeNum = SU->NodeNum;
-    unsigned Degree = SU->Succs.size();
+  for (SUnit &SU : SUnits) {
+    int NodeNum = SU.NodeNum;
+    unsigned Degree = SU.Succs.size();
     // Temporarily use the Node2Index array as scratch space for degree counts.
     Node2Index[NodeNum] = Degree;
 
     // Is it a node without dependencies?
     if (Degree == 0) {
-      assert(SU->Succs.empty() && "SUnit should have no successors");
+      assert(SU.Succs.empty() && "SUnit should have no successors");
       // Collect leaf nodes.
-      WorkList.push_back(SU);
+      WorkList.push_back(&SU);
     }
   }
 
@@ -498,9 +478,8 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
     WorkList.pop_back();
     if (SU->NodeNum < DAGSize)
       Allocate(SU->NodeNum, --Id);
-    for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
-         I != E; ++I) {
-      SUnit *SU = I->getSUnit();
+    for (const SDep &PredDep : SU->Preds) {
+      SUnit *SU = PredDep.getSUnit();
       if (SU->NodeNum < DAGSize && !--Node2Index[SU->NodeNum])
         // If all dependencies of the node are processed already,
         // then the node can be computed now.
@@ -512,19 +491,15 @@ void ScheduleDAGTopologicalSort::InitDAGTopologicalSorting() {
 
 #ifndef NDEBUG
   // Check correctness of the ordering
-  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
-    SUnit *SU = &SUnits[i];
-    for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
-         I != E; ++I) {
-      assert(Node2Index[SU->NodeNum] > Node2Index[I->getSUnit()->NodeNum] &&
+  for (SUnit &SU : SUnits)  {
+    for (const SDep &PD : SU.Preds) {
+      assert(Node2Index[SU.NodeNum] > Node2Index[PD.getSUnit()->NodeNum] &&
       "Wrong topological sorting");
     }
   }
 #endif
 }
 
-/// AddPred - Updates the topological ordering to accommodate an edge
-/// to be added from SUnit X to SUnit Y.
 void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) {
   int UpperBound, LowerBound;
   LowerBound = Node2Index[Y->NodeNum];
@@ -541,16 +516,10 @@ void ScheduleDAGTopologicalSort::AddPred(SUnit *Y, SUnit *X) {
   }
 }
 
-/// RemovePred - Updates the topological ordering to accommodate an
-/// an edge to be removed from the specified node N from the predecessors
-/// of the current node M.
 void ScheduleDAGTopologicalSort::RemovePred(SUnit *M, SUnit *N) {
   // InitDAGTopologicalSorting();
 }
 
-/// DFS - Make a DFS traversal to mark all nodes reachable from SU and mark
-/// all nodes affected by the edge insertion. These nodes will later get new
-/// topological indexes by means of the Shift method.
 void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,
                                      bool &HasLoop) {
   std::vector<const SUnit*> WorkList;
@@ -561,8 +530,9 @@ void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,
     SU = WorkList.back();
     WorkList.pop_back();
     Visited.set(SU->NodeNum);
-    for (int I = SU->Succs.size()-1; I >= 0; --I) {
-      unsigned s = SU->Succs[I].getSUnit()->NodeNum;
+    for (const SDep &SuccDep
+         : make_range(SU->Succs.rbegin(), SU->Succs.rend())) {
+      unsigned s = SuccDep.getSUnit()->NodeNum;
       // Edges to non-SUnits are allowed but ignored (e.g. ExitSU).
       if (s >= Node2Index.size())
         continue;
@@ -572,14 +542,93 @@ void ScheduleDAGTopologicalSort::DFS(const SUnit *SU, int UpperBound,
       }
       // Visit successors if not already and in affected region.
       if (!Visited.test(s) && Node2Index[s] < UpperBound) {
-        WorkList.push_back(SU->Succs[I].getSUnit());
+        WorkList.push_back(SuccDep.getSUnit());
       }
     }
   } while (!WorkList.empty());
 }
 
-/// Shift - Renumber the nodes so that the topological ordering is
-/// preserved.
+std::vector<int> ScheduleDAGTopologicalSort::GetSubGraph(const SUnit &StartSU,
+                                                         const SUnit &TargetSU,
+                                                         bool &Success) {
+  std::vector<const SUnit*> WorkList;
+  int LowerBound = Node2Index[StartSU.NodeNum];
+  int UpperBound = Node2Index[TargetSU.NodeNum];
+  bool Found = false;
+  BitVector VisitedBack;
+  std::vector<int> Nodes;
+
+  if (LowerBound > UpperBound) {
+    Success = false;
+    return Nodes;
+  }
+
+  WorkList.reserve(SUnits.size());
+  Visited.reset();
+
+  // Starting from StartSU, visit all successors up
+  // to UpperBound.
+  WorkList.push_back(&StartSU);
+  do {
+    const SUnit *SU = WorkList.back();
+    WorkList.pop_back();
+    for (int I = SU->Succs.size()-1; I >= 0; --I) {
+      const SUnit *Succ = SU->Succs[I].getSUnit();
+      unsigned s = Succ->NodeNum;
+      // Edges to non-SUnits are allowed but ignored (e.g. ExitSU).
+      if (Succ->isBoundaryNode())
+        continue;
+      if (Node2Index[s] == UpperBound) {
+        Found = true;
+        continue;
+      }
+      // Visit successors if not already and in affected region.
+      if (!Visited.test(s) && Node2Index[s] < UpperBound) {
+        Visited.set(s);
+        WorkList.push_back(Succ);
+      }
+    }
+  } while (!WorkList.empty());
+
+  if (!Found) {
+    Success = false;
+    return Nodes;
+  }
+
+  WorkList.clear();
+  VisitedBack.resize(SUnits.size());
+  Found = false;
+
+  // Starting from TargetSU, visit all predecessors up
+  // to LowerBound. SUs that are visited by the two
+  // passes are added to Nodes.
+  WorkList.push_back(&TargetSU);
+  do {
+    const SUnit *SU = WorkList.back();
+    WorkList.pop_back();
+    for (int I = SU->Preds.size()-1; I >= 0; --I) {
+      const SUnit *Pred = SU->Preds[I].getSUnit();
+      unsigned s = Pred->NodeNum;
+      // Edges to non-SUnits are allowed but ignored (e.g. EntrySU).
+      if (Pred->isBoundaryNode())
+        continue;
+      if (Node2Index[s] == LowerBound) {
+        Found = true;
+        continue;
+      }
+      if (!VisitedBack.test(s) && Visited.test(s)) {
+        VisitedBack.set(s);
+        WorkList.push_back(Pred);
+        Nodes.push_back(s);
+      }
+    }
+  } while (!WorkList.empty());
+
+  assert(Found && "Error in SUnit Graph!");
+  Success = true;
+  return Nodes;
+}
+
 void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound,
                                        int UpperBound) {
   std::vector<int> L;
@@ -599,28 +648,23 @@ void ScheduleDAGTopologicalSort::Shift(BitVector& Visited, int LowerBound,
     }
   }
 
-  for (unsigned j = 0; j < L.size(); ++j) {
-    Allocate(L[j], i - shift);
+  for (unsigned LI : L) {
+    Allocate(LI, i - shift);
     i = i + 1;
   }
 }
 
-
-/// WillCreateCycle - Returns true if adding an edge to TargetSU from SU will
-/// create a cycle. If so, it is not safe to call AddPred(TargetSU, SU).
 bool ScheduleDAGTopologicalSort::WillCreateCycle(SUnit *TargetSU, SUnit *SU) {
   // Is SU reachable from TargetSU via successor edges?
   if (IsReachable(SU, TargetSU))
     return true;
-  for (SUnit::pred_iterator
-         I = TargetSU->Preds.begin(), E = TargetSU->Preds.end(); I != E; ++I)
-    if (I->isAssignedRegDep() &&
-        IsReachable(SU, I->getSUnit()))
+  for (const SDep &PredDep : TargetSU->Preds)
+    if (PredDep.isAssignedRegDep() &&
+        IsReachable(SU, PredDep.getSUnit()))
       return true;
   return false;
 }
 
-/// IsReachable - Checks if SU is reachable from TargetSU.
 bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,
                                              const SUnit *TargetSU) {
   // If insertion of the edge SU->TargetSU would create a cycle
@@ -638,7 +682,6 @@ bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,
   return HasLoop;
 }
 
-/// Allocate - assign the topological index to the node n.
 void ScheduleDAGTopologicalSort::Allocate(int n, int index) {
   Node2Index[n] = index;
   Index2Node[index] = n;
@@ -648,4 +691,4 @@ ScheduleDAGTopologicalSort::
 ScheduleDAGTopologicalSort(std::vector<SUnit> &sunits, SUnit *exitsu)
   : SUnits(sunits), ExitSU(exitsu) {}
 
-ScheduleHazardRecognizer::~ScheduleHazardRecognizer() {}
+ScheduleHazardRecognizer::~ScheduleHazardRecognizer() = default;
diff --git a/lib/CodeGen/ScheduleDAGInstrs.cpp b/lib/CodeGen/ScheduleDAGInstrs.cpp
index 20197295457814c0bcda77d3423d9036e1f80b4e..18823b74c47fe13bb54a466df383fecd148c0cd1 100644
--- a/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -537,71 +537,9 @@ static inline bool isGlobalMemoryObject(AliasAnalysis *AA, MachineInstr *MI) {
          (MI->hasOrderedMemoryRef() && !MI->isDereferenceableInvariantLoad(AA));
 }
 
-/// Returns true if the two MIs need a chain edge between them.
-/// This is called on normal stores and loads.
-static bool MIsNeedChainEdge(AliasAnalysis *AA, const MachineFrameInfo *MFI,
-                             const DataLayout &DL, MachineInstr *MIa,
-                             MachineInstr *MIb) {
-  const MachineFunction *MF = MIa->getParent()->getParent();
-  const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
-
-  assert ((MIa->mayStore() || MIb->mayStore()) &&
-          "Dependency checked between two loads");
-
-  // Let the target decide if memory accesses cannot possibly overlap.
-  if (TII->areMemAccessesTriviallyDisjoint(*MIa, *MIb, AA))
-    return false;
-
-  // To this point analysis is generic. From here on we do need AA.
-  if (!AA)
-    return true;
-
-  // FIXME: Need to handle multiple memory operands to support all targets.
-  if (!MIa->hasOneMemOperand() || !MIb->hasOneMemOperand())
-    return true;
-
-  MachineMemOperand *MMOa = *MIa->memoperands_begin();
-  MachineMemOperand *MMOb = *MIb->memoperands_begin();
-
-  if (!MMOa->getValue() || !MMOb->getValue())
-    return true;
-
-  // The following interface to AA is fashioned after DAGCombiner::isAlias
-  // and operates with MachineMemOperand offset with some important
-  // assumptions:
-  //   - LLVM fundamentally assumes flat address spaces.
-  //   - MachineOperand offset can *only* result from legalization and
-  //     cannot affect queries other than the trivial case of overlap
-  //     checking.
-  //   - These offsets never wrap and never step outside
-  //     of allocated objects.
-  //   - There should never be any negative offsets here.
-  //
-  // FIXME: Modify API to hide this math from "user"
-  // FIXME: Even before we go to AA we can reason locally about some
-  // memory objects. It can save compile time, and possibly catch some
-  // corner cases not currently covered.
-
-  assert ((MMOa->getOffset() >= 0) && "Negative MachineMemOperand offset");
-  assert ((MMOb->getOffset() >= 0) && "Negative MachineMemOperand offset");
-
-  int64_t MinOffset = std::min(MMOa->getOffset(), MMOb->getOffset());
-  int64_t Overlapa = MMOa->getSize() + MMOa->getOffset() - MinOffset;
-  int64_t Overlapb = MMOb->getSize() + MMOb->getOffset() - MinOffset;
-
-  AliasResult AAResult =
-      AA->alias(MemoryLocation(MMOa->getValue(), Overlapa,
-                               UseTBAA ? MMOa->getAAInfo() : AAMDNodes()),
-                MemoryLocation(MMOb->getValue(), Overlapb,
-                               UseTBAA ? MMOb->getAAInfo() : AAMDNodes()));
-
-  return (AAResult != NoAlias);
-}
-
 void ScheduleDAGInstrs::addChainDependency (SUnit *SUa, SUnit *SUb,
                                             unsigned Latency) {
-  if (MIsNeedChainEdge(AAForDep, &MFI, MF.getDataLayout(), SUa->getInstr(),
-                       SUb->getInstr())) {
+  if (SUa->getInstr()->mayAlias(AAForDep, *SUb->getInstr(), UseTBAA)) {
     SDep Dep(SUa, SDep::MayAliasMem);
     Dep.setLatency(Latency);
     SUb->addPred(Dep);
diff --git a/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
index 83bc1ba7beb94263167da738a5bf5e7e0d73d072..b3d83d5313aff75bce8fb7c9aa0ae7601fbf9ae4 100644
--- a/lib/CodeGen/ScoreboardHazardRecognizer.cpp
+++ b/lib/CodeGen/ScoreboardHazardRecognizer.cpp
@@ -1,4 +1,4 @@
-//===----- ScoreboardHazardRecognizer.cpp - Scheduler Support -------------===//
+//===- ScoreboardHazardRecognizer.cpp - Scheduler Support -----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,11 +15,13 @@
 
 #include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
+#include <cassert>
 
 using namespace llvm;
 
@@ -29,8 +31,7 @@ ScoreboardHazardRecognizer::ScoreboardHazardRecognizer(
     const InstrItineraryData *II, const ScheduleDAG *SchedDAG,
     const char *ParentDebugType)
     : ScheduleHazardRecognizer(), DebugType(ParentDebugType), ItinData(II),
-      DAG(SchedDAG), IssueWidth(0), IssueCount(0) {
-
+      DAG(SchedDAG) {
   // Determine the maximum depth of any itinerary. This determines the depth of
   // the scoreboard. We always make the scoreboard at least 1 cycle deep to
   // avoid dealing with the boundary condition.
diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 33863ca16f9ee59bbeb2a243fa5ff9ecb886d9d8..306c1974ab5a8878f4541c6adf77a2f3e1501a28 100644
--- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -52,10 +52,6 @@ STATISTIC(LdStFP2Int      , "Number of fp load/store pairs transformed to int");
 STATISTIC(SlicedLoads, "Number of load sliced");
 
 namespace {
-  static cl::opt<bool>
-    CombinerAA("combiner-alias-analysis", cl::Hidden,
-               cl::desc("Enable DAG combiner alias-analysis heuristics"));
-
   static cl::opt<bool>
     CombinerGlobalAA("combiner-global-alias-analysis", cl::Hidden,
                cl::desc("Enable DAG combiner's use of IR alias analysis"));
@@ -133,6 +129,9 @@ namespace {
     /// Add to the worklist making sure its instance is at the back (next to be
     /// processed.)
     void AddToWorklist(SDNode *N) {
+      assert(N->getOpcode() != ISD::DELETED_NODE &&
+             "Deleted Node added to Worklist");
+
       // Skip handle nodes as they can't usefully be combined and confuse the
       // zero-use deletion strategy.
       if (N->getOpcode() == ISD::HANDLENODE)
@@ -177,6 +176,7 @@ namespace {
     void CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO);
 
   private:
+    unsigned MaximumLegalStoreInBits;
 
     /// Check the specified integer node value to see if it can be simplified or
     /// if things it uses can be simplified by bit propagation.
@@ -232,9 +232,12 @@ namespace {
     SDValue visitTokenFactor(SDNode *N);
     SDValue visitMERGE_VALUES(SDNode *N);
     SDValue visitADD(SDNode *N);
+    SDValue visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference);
     SDValue visitSUB(SDNode *N);
     SDValue visitADDC(SDNode *N);
+    SDValue visitUADDO(SDNode *N);
     SDValue visitSUBC(SDNode *N);
+    SDValue visitUSUBO(SDNode *N);
     SDValue visitADDE(SDNode *N);
     SDValue visitSUBE(SDNode *N);
     SDValue visitMUL(SDNode *N);
@@ -259,6 +262,7 @@ namespace {
     SDValue visitSRA(SDNode *N);
     SDValue visitSRL(SDNode *N);
     SDValue visitRotate(SDNode *N);
+    SDValue visitABS(SDNode *N);
     SDValue visitBSWAP(SDNode *N);
     SDValue visitBITREVERSE(SDNode *N);
     SDValue visitCTLZ(SDNode *N);
@@ -274,6 +278,7 @@ namespace {
     SDValue visitSIGN_EXTEND(SDNode *N);
     SDValue visitZERO_EXTEND(SDNode *N);
     SDValue visitANY_EXTEND(SDNode *N);
+    SDValue visitAssertZext(SDNode *N);
     SDValue visitSIGN_EXTEND_INREG(SDNode *N);
     SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);
     SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N);
@@ -336,6 +341,7 @@ namespace {
     SDValue visitShiftByConstant(SDNode *N, ConstantSDNode *Amt);
 
     SDValue foldSelectOfConstants(SDNode *N);
+    SDValue foldBinOpIntoSelect(SDNode *BO);
     bool SimplifySelectOps(SDNode *SELECT, SDValue LHS, SDValue RHS);
     SDValue SimplifyBinOpWithSameOpcodeHands(SDNode *N);
     SDValue SimplifySelect(const SDLoc &DL, SDValue N0, SDValue N1, SDValue N2);
@@ -344,6 +350,8 @@ namespace {
                              bool NotExtCompare = false);
     SDValue foldSelectCCToShiftAnd(const SDLoc &DL, SDValue N0, SDValue N1,
                                    SDValue N2, SDValue N3, ISD::CondCode CC);
+    SDValue foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
+                              const SDLoc &DL);
     SDValue SimplifySetCC(EVT VT, SDValue N0, SDValue N1, ISD::CondCode Cond,
                           const SDLoc &DL, bool foldBooleans = true);
 
@@ -417,15 +425,12 @@ namespace {
     /// Holds a pointer to an LSBaseSDNode as well as information on where it
     /// is located in a sequence of memory operations connected by a chain.
     struct MemOpLink {
-      MemOpLink (LSBaseSDNode *N, int64_t Offset, unsigned Seq):
-      MemNode(N), OffsetFromBase(Offset), SequenceNum(Seq) { }
+      MemOpLink(LSBaseSDNode *N, int64_t Offset)
+          : MemNode(N), OffsetFromBase(Offset) {}
       // Ptr to the mem node.
       LSBaseSDNode *MemNode;
       // Offset from the base ptr.
       int64_t OffsetFromBase;
-      // What is the sequence number of this mem node.
-      // Lowest mem operand in the DAG starts at zero.
-      unsigned SequenceNum;
     };
 
     /// This is a helper function for visitMUL to check the profitability
@@ -436,12 +441,6 @@ namespace {
                                      SDValue &AddNode,
                                      SDValue &ConstNode);
 
-    /// This is a helper function for MergeStoresOfConstantsOrVecElts. Returns a
-    /// constant build_vector of the stored constant values in Stores.
-    SDValue getMergedConstantVectorStore(SelectionDAG &DAG, const SDLoc &SL,
-                                         ArrayRef<MemOpLink> Stores,
-                                         SmallVectorImpl<SDValue> &Chains,
-                                         EVT Ty) const;
 
     /// This is a helper function for visitAND and visitZERO_EXTEND.  Returns
     /// true if the (and (load x) c) pattern matches an extload.  ExtVT returns
@@ -452,34 +451,35 @@ namespace {
                           EVT LoadResultTy, EVT &ExtVT, EVT &LoadedVT,
                           bool &NarrowLoad);
 
+    /// Helper function for MergeConsecutiveStores which merges the
+    /// component store chains.
+    SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
+                                unsigned NumStores);
+
     /// This is a helper function for MergeConsecutiveStores. When the source
     /// elements of the consecutive stores are all constants or all extracted
     /// vector elements, try to merge them into one larger store.
-    /// \return number of stores that were merged into a merged store (always
-    /// a prefix of \p StoreNode).
-    bool MergeStoresOfConstantsOrVecElts(
-        SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores,
-        bool IsConstantSrc, bool UseVector);
+    /// \return True if a merged store was created.
+    bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
+                                         EVT MemVT, unsigned NumStores,
+                                         bool IsConstantSrc, bool UseVector);
 
     /// This is a helper function for MergeConsecutiveStores.
     /// Stores that may be merged are placed in StoreNodes.
-    /// Loads that may alias with those stores are placed in AliasLoadNodes.
-    void getStoreMergeAndAliasCandidates(
-        StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes,
-        SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes);
+    void getStoreMergeCandidates(StoreSDNode *St,
+                                 SmallVectorImpl<MemOpLink> &StoreNodes);
 
     /// Helper function for MergeConsecutiveStores. Checks if
     /// Candidate stores have indirect dependency through their
     /// operands. \return True if safe to merge
     bool checkMergeStoreCandidatesForDependencies(
-        SmallVectorImpl<MemOpLink> &StoreNodes);
+        SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores);
 
     /// Merge consecutive store operations into a wide store.
     /// This optimization uses wide integers or vectors when possible.
     /// \return number of stores that were merged into a merged store (the
     /// affected nodes are stored as a prefix in \p StoreNodes).
-    bool MergeConsecutiveStores(StoreSDNode *N,
-                                SmallVectorImpl<MemOpLink> &StoreNodes);
+    bool MergeConsecutiveStores(StoreSDNode *N);
 
     /// \brief Try to transform a truncation where C is a constant:
     ///     (trunc (and X, C)) -> (and (trunc X), (trunc C))
@@ -494,6 +494,13 @@ namespace {
         : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
           OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) {
       ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize();
+
+      MaximumLegalStoreInBits = 0;
+      for (MVT VT : MVT::all_valuetypes())
+        if (EVT(VT).isSimple() && VT != MVT::Other &&
+            TLI.isTypeLegal(EVT(VT)) &&
+            VT.getSizeInBits() >= MaximumLegalStoreInBits)
+          MaximumLegalStoreInBits = VT.getSizeInBits();
     }
 
     /// Runs the dag combiner on all nodes in the work list
@@ -1087,37 +1094,36 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
   if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
     assert(PVT != VT && "Don't know what type to promote to!");
 
+    DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
+
     bool Replace0 = false;
     SDValue N0 = Op.getOperand(0);
     SDValue NN0 = PromoteOperand(N0, PVT, Replace0);
-    if (!NN0.getNode())
-      return SDValue();
 
     bool Replace1 = false;
     SDValue N1 = Op.getOperand(1);
-    SDValue NN1;
-    if (N0 == N1)
-      NN1 = NN0;
-    else {
-      NN1 = PromoteOperand(N1, PVT, Replace1);
-      if (!NN1.getNode())
-        return SDValue();
-    }
+    SDValue NN1 = PromoteOperand(N1, PVT, Replace1);
+    SDLoc DL(Op);
 
-    AddToWorklist(NN0.getNode());
-    if (NN1.getNode())
-      AddToWorklist(NN1.getNode());
+    SDValue RV =
+        DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1));
 
-    if (Replace0)
+    // Now replace instances of N0 and N1
+    if (Replace0 && N0 && N0.getOpcode() != ISD::DELETED_NODE && NN0 &&
+        NN0.getOpcode() != ISD::DELETED_NODE) {
+      AddToWorklist(NN0.getNode());
       ReplaceLoadWithPromotedLoad(N0.getNode(), NN0.getNode());
-    if (Replace1)
+    }
+
+    if (Replace1 && N1 && N1.getOpcode() != ISD::DELETED_NODE && NN1 &&
+        NN1.getOpcode() != ISD::DELETED_NODE) {
+      AddToWorklist(NN1.getNode());
       ReplaceLoadWithPromotedLoad(N1.getNode(), NN1.getNode());
+    }
 
-    DEBUG(dbgs() << "\nPromoting ";
-          Op.getNode()->dump(&DAG));
-    SDLoc DL(Op);
-    return DAG.getNode(ISD::TRUNCATE, DL, VT,
-                       DAG.getNode(Opc, DL, PVT, NN0, NN1));
+    // Deal with Op being deleted.
+    if (Op && Op.getOpcode() != ISD::DELETED_NODE)
+      return RV;
   }
   return SDValue();
 }
@@ -1145,26 +1151,32 @@ SDValue DAGCombiner::PromoteIntShiftOp(SDValue Op) {
   if (TLI.IsDesirableToPromoteOp(Op, PVT)) {
     assert(PVT != VT && "Don't know what type to promote to!");
 
+    DEBUG(dbgs() << "\nPromoting "; Op.getNode()->dump(&DAG));
+
     bool Replace = false;
     SDValue N0 = Op.getOperand(0);
+    SDValue N1 = Op.getOperand(1);
     if (Opc == ISD::SRA)
-      N0 = SExtPromoteOperand(Op.getOperand(0), PVT);
+      N0 = SExtPromoteOperand(N0, PVT);
     else if (Opc == ISD::SRL)
-      N0 = ZExtPromoteOperand(Op.getOperand(0), PVT);
+      N0 = ZExtPromoteOperand(N0, PVT);
     else
       N0 = PromoteOperand(N0, PVT, Replace);
+
     if (!N0.getNode())
       return SDValue();
 
+    SDLoc DL(Op);
+    SDValue RV =
+        DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, N0, N1));
+
     AddToWorklist(N0.getNode());
     if (Replace)
       ReplaceLoadWithPromotedLoad(Op.getOperand(0).getNode(), N0.getNode());
 
-    DEBUG(dbgs() << "\nPromoting ";
-          Op.getNode()->dump(&DAG));
-    SDLoc DL(Op);
-    return DAG.getNode(ISD::TRUNCATE, DL, VT,
-                       DAG.getNode(Opc, DL, PVT, N0, Op.getOperand(1)));
+    // Deal with Op being deleted.
+    if (Op && Op.getOpcode() != ISD::DELETED_NODE)
+      return RV;
   }
   return SDValue();
 }
@@ -1369,8 +1381,7 @@ void DAGCombiner::Run(CombineLevel AtLevel) {
     else {
       assert(N->getValueType(0) == RV.getValueType() &&
              N->getNumValues() == 1 && "Type mismatch");
-      SDValue OpV = RV;
-      DAG.ReplaceAllUsesWith(N, &OpV);
+      DAG.ReplaceAllUsesWith(N, &RV);
     }
 
     // Push the new node and any users onto the worklist
@@ -1397,7 +1408,9 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::ADD:                return visitADD(N);
   case ISD::SUB:                return visitSUB(N);
   case ISD::ADDC:               return visitADDC(N);
+  case ISD::UADDO:              return visitUADDO(N);
   case ISD::SUBC:               return visitSUBC(N);
+  case ISD::USUBO:              return visitUSUBO(N);
   case ISD::ADDE:               return visitADDE(N);
   case ISD::SUBE:               return visitSUBE(N);
   case ISD::MUL:                return visitMUL(N);
@@ -1423,6 +1436,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::SRL:                return visitSRL(N);
   case ISD::ROTR:
   case ISD::ROTL:               return visitRotate(N);
+  case ISD::ABS:                return visitABS(N);
   case ISD::BSWAP:              return visitBSWAP(N);
   case ISD::BITREVERSE:         return visitBITREVERSE(N);
   case ISD::CTLZ:               return visitCTLZ(N);
@@ -1438,6 +1452,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
   case ISD::SIGN_EXTEND:        return visitSIGN_EXTEND(N);
   case ISD::ZERO_EXTEND:        return visitZERO_EXTEND(N);
   case ISD::ANY_EXTEND:         return visitANY_EXTEND(N);
+  case ISD::AssertZext:         return visitAssertZext(N);
   case ISD::SIGN_EXTEND_INREG:  return visitSIGN_EXTEND_INREG(N);
   case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);
   case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N);
@@ -1582,7 +1597,7 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
   }
 
   SmallVector<SDNode *, 8> TFs;     // List of token factors to visit.
-  SmallVector<SDValue, 8> Ops;    // Ops for replacing token factor.
+  SmallVector<SDValue, 8> Ops;      // Ops for replacing token factor.
   SmallPtrSet<SDNode*, 16> SeenOps;
   bool Changed = false;             // If we should replace this token factor.
 
@@ -1626,6 +1641,86 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
     }
   }
 
+  // Remove Nodes that are chained to another node in the list. Do so
+  // by walking up chains breadth-first stopping when we've seen
+  // another operand. In general we must climb to the EntryNode, but we can exit
+  // early if we find all remaining work is associated with just one operand as
+  // no further pruning is possible.
+
+  // List of nodes to search through and original Ops from which they originate.
+  SmallVector<std::pair<SDNode *, unsigned>, 8> Worklist;
+  SmallVector<unsigned, 8> OpWorkCount; // Count of work for each Op.
+  SmallPtrSet<SDNode *, 16> SeenChains;
+  bool DidPruneOps = false;
+
+  unsigned NumLeftToConsider = 0;
+  for (const SDValue &Op : Ops) {
+    Worklist.push_back(std::make_pair(Op.getNode(), NumLeftToConsider++));
+    OpWorkCount.push_back(1);
+  }
+
+  auto AddToWorklist = [&](unsigned CurIdx, SDNode *Op, unsigned OpNumber) {
+    // If this is an Op, we can remove the op from the list. Re-mark any
+    // search associated with it as from the current OpNumber.
+    if (SeenOps.count(Op) != 0) {
+      Changed = true;
+      DidPruneOps = true;
+      unsigned OrigOpNumber = 0;
+      while (OrigOpNumber < Ops.size() && Ops[OrigOpNumber].getNode() != Op)
+        OrigOpNumber++;
+      assert((OrigOpNumber != Ops.size()) &&
+             "expected to find TokenFactor Operand");
+      // Re-mark worklist from OrigOpNumber to OpNumber
+      for (unsigned i = CurIdx + 1; i < Worklist.size(); ++i) {
+        if (Worklist[i].second == OrigOpNumber) {
+          Worklist[i].second = OpNumber;
+        }
+      }
+      OpWorkCount[OpNumber] += OpWorkCount[OrigOpNumber];
+      OpWorkCount[OrigOpNumber] = 0;
+      NumLeftToConsider--;
+    }
+    // Add if it's a new chain
+    if (SeenChains.insert(Op).second) {
+      OpWorkCount[OpNumber]++;
+      Worklist.push_back(std::make_pair(Op, OpNumber));
+    }
+  };
+
+  for (unsigned i = 0; i < Worklist.size() && i < 1024; ++i) {
+    // We need to consider at least 2 Ops to prune.
+    if (NumLeftToConsider <= 1)
+      break;
+    auto CurNode = Worklist[i].first;
+    auto CurOpNumber = Worklist[i].second;
+    assert((OpWorkCount[CurOpNumber] > 0) &&
+           "Node should not appear in worklist");
+    switch (CurNode->getOpcode()) {
+    case ISD::EntryToken:
+      // Hitting EntryToken is the only way for the search to terminate
+      // without hitting another operand's search. Increment
+      // NumLeftToConsider to prevent us from marking this operand as
+      // considered.
+      NumLeftToConsider++;
+      break;
+    case ISD::TokenFactor:
+      for (const SDValue &Op : CurNode->op_values())
+        AddToWorklist(i, Op.getNode(), CurOpNumber);
+      break;
+    case ISD::CopyFromReg:
+    case ISD::CopyToReg:
+      AddToWorklist(i, CurNode->getOperand(0).getNode(), CurOpNumber);
+      break;
+    default:
+      if (auto *MemNode = dyn_cast<MemSDNode>(CurNode))
+        AddToWorklist(i, MemNode->getChain().getNode(), CurOpNumber);
+      break;
+    }
+    OpWorkCount[CurOpNumber]--;
+    if (OpWorkCount[CurOpNumber] == 0)
+      NumLeftToConsider--;
+  }
+
   SDValue Result;
 
   // If we've changed things around then replace token factor.
@@ -1634,15 +1729,22 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) {
       // The entry token is the only possible outcome.
       Result = DAG.getEntryNode();
     } else {
-      // New and improved token factor.
-      Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
+      if (DidPruneOps) {
+        SmallVector<SDValue, 8> PrunedOps;
+        // Keep only the Ops that were not seen while walking up the chains.
+        for (const SDValue &Op : Ops) {
+          if (SeenChains.count(Op.getNode()) == 0)
+            PrunedOps.push_back(Op);
+        }
+        Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, PrunedOps);
+      } else {
+        Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops);
+      }
     }
 
-    // Add users to worklist if AA is enabled, since it may introduce
-    // a lot of new chained token factors while removing memory deps.
-    bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
-      : DAG.getSubtarget().useAA();
-    return CombineTo(N, Result, UseAA /*add to worklist*/);
+    // Add users to worklist, since we may introduce a lot of new
+    // chained token factors while removing memory deps.
+    return CombineTo(N, Result, true /*add to worklist*/);
   }
 
   return Result;
@@ -1672,6 +1774,60 @@ static ConstantSDNode *getAsNonOpaqueConstant(SDValue N) {
   return Const != nullptr && !Const->isOpaque() ? Const : nullptr;
 }
 
+SDValue DAGCombiner::foldBinOpIntoSelect(SDNode *BO) {
+  auto BinOpcode = BO->getOpcode();
+  assert((BinOpcode == ISD::ADD || BinOpcode == ISD::SUB ||
+          BinOpcode == ISD::MUL || BinOpcode == ISD::SDIV ||
+          BinOpcode == ISD::UDIV || BinOpcode == ISD::SREM ||
+          BinOpcode == ISD::UREM || BinOpcode == ISD::AND ||
+          BinOpcode == ISD::OR || BinOpcode == ISD::XOR ||
+          BinOpcode == ISD::SHL || BinOpcode == ISD::SRL ||
+          BinOpcode == ISD::SRA || BinOpcode == ISD::FADD ||
+          BinOpcode == ISD::FSUB || BinOpcode == ISD::FMUL ||
+          BinOpcode == ISD::FDIV || BinOpcode == ISD::FREM) &&
+         "Unexpected binary operator");
+
+  // Bail out if any constants are opaque because we can't constant fold those.
+  SDValue C1 = BO->getOperand(1);
+  if (!isConstantOrConstantVector(C1, true) &&
+      !isConstantFPBuildVectorOrConstantFP(C1))
+    return SDValue();
+
+  // Don't do this unless the old select is going away. We want to eliminate the
+  // binary operator, not replace a binop with a select.
+  // TODO: Handle ISD::SELECT_CC.
+  SDValue Sel = BO->getOperand(0);
+  if (Sel.getOpcode() != ISD::SELECT || !Sel.hasOneUse())
+    return SDValue();
+
+  SDValue CT = Sel.getOperand(1);
+  if (!isConstantOrConstantVector(CT, true) &&
+      !isConstantFPBuildVectorOrConstantFP(CT))
+    return SDValue();
+
+  SDValue CF = Sel.getOperand(2);
+  if (!isConstantOrConstantVector(CF, true) &&
+      !isConstantFPBuildVectorOrConstantFP(CF))
+    return SDValue();
+
+  // We have a select-of-constants followed by a binary operator with a
+  // constant. Eliminate the binop by pulling the constant math into the select.
+  // Example: add (select Cond, CT, CF), C1 --> select Cond, CT + C1, CF + C1
+  EVT VT = Sel.getValueType();
+  SDLoc DL(Sel);
+  SDValue NewCT = DAG.getNode(BinOpcode, DL, VT, CT, C1);
+  assert((NewCT.isUndef() || isConstantOrConstantVector(NewCT) ||
+          isConstantFPBuildVectorOrConstantFP(NewCT)) &&
+         "Failed to constant fold a binop with constant operands");
+
+  SDValue NewCF = DAG.getNode(BinOpcode, DL, VT, CF, C1);
+  assert((NewCF.isUndef() || isConstantOrConstantVector(NewCF) ||
+          isConstantFPBuildVectorOrConstantFP(NewCF)) &&
+         "Failed to constant fold a binop with constant operands");
+
+  return DAG.getSelect(DL, VT, Sel.getOperand(0), NewCT, NewCF);
+}
+
 SDValue DAGCombiner::visitADD(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -1720,6 +1876,9 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
       }
   }
 
+  if (SDValue NewSel = foldBinOpIntoSelect(N))
+    return NewSel;
+
   // reassociate add
   if (SDValue RADD = ReassociateOps(ISD::ADD, DL, N0, N1))
     return RADD;
@@ -1782,6 +1941,19 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
       VT.isInteger() && DAG.haveNoCommonBitsSet(N0, N1))
     return DAG.getNode(ISD::OR, DL, VT, N0, N1);
 
+  if (SDValue Combined = visitADDLike(N0, N1, N))
+    return Combined;
+
+  if (SDValue Combined = visitADDLike(N1, N0, N))
+    return Combined;
+
+  return SDValue();
+}
+
+SDValue DAGCombiner::visitADDLike(SDValue N0, SDValue N1, SDNode *LocReference) {
+  EVT VT = N0.getValueType();
+  SDLoc DL(LocReference);
+
   // fold (add x, shl(0 - y, n)) -> sub(x, shl(y, n))
   if (N1.getOpcode() == ISD::SHL && N1.getOperand(0).getOpcode() == ISD::SUB &&
       isNullConstantOrNullSplatConstant(N1.getOperand(0).getOperand(0)))
@@ -1789,12 +1961,6 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
                        DAG.getNode(ISD::SHL, DL, VT,
                                    N1.getOperand(0).getOperand(1),
                                    N1.getOperand(1)));
-  if (N0.getOpcode() == ISD::SHL && N0.getOperand(0).getOpcode() == ISD::SUB &&
-      isNullConstantOrNullSplatConstant(N0.getOperand(0).getOperand(0)))
-    return DAG.getNode(ISD::SUB, DL, VT, N1,
-                       DAG.getNode(ISD::SHL, DL, VT,
-                                   N0.getOperand(0).getOperand(1),
-                                   N0.getOperand(1)));
 
   if (N1.getOpcode() == ISD::AND) {
     SDValue AndOp0 = N1.getOperand(0);
@@ -1805,7 +1971,7 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
     // and similar xforms where the inner op is either ~0 or 0.
     if (NumSignBits == DestBits &&
         isOneConstantOrOneSplatConstant(N1->getOperand(1)))
-      return DAG.getNode(ISD::SUB, DL, VT, N->getOperand(0), AndOp0);
+      return DAG.getNode(ISD::SUB, DL, VT, N0, AndOp0);
   }
 
   // add (sext i1), X -> sub X, (zext i1)
@@ -1833,39 +1999,61 @@ SDValue DAGCombiner::visitADDC(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N0.getValueType();
+  SDLoc DL(N);
 
   // If the flag result is dead, turn this into an ADD.
   if (!N->hasAnyUseOfValue(1))
-    return CombineTo(N, DAG.getNode(ISD::ADD, SDLoc(N), VT, N0, N1),
-                     DAG.getNode(ISD::CARRY_FALSE,
-                                 SDLoc(N), MVT::Glue));
+    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
+                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
 
   // canonicalize constant to RHS.
   ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
   ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
   if (N0C && !N1C)
-    return DAG.getNode(ISD::ADDC, SDLoc(N), N->getVTList(), N1, N0);
+    return DAG.getNode(ISD::ADDC, DL, N->getVTList(), N1, N0);
 
   // fold (addc x, 0) -> x + no carry out
   if (isNullConstant(N1))
     return CombineTo(N, N0, DAG.getNode(ISD::CARRY_FALSE,
-                                        SDLoc(N), MVT::Glue));
+                                        DL, MVT::Glue));
 
-  // fold (addc a, b) -> (or a, b), CARRY_FALSE iff a and b share no bits.
-  APInt LHSZero, LHSOne;
-  APInt RHSZero, RHSOne;
-  DAG.computeKnownBits(N0, LHSZero, LHSOne);
+  // If it cannot overflow, transform into an add.
+  if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
+    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
+                     DAG.getNode(ISD::CARRY_FALSE, DL, MVT::Glue));
 
-  if (LHSZero.getBoolValue()) {
-    DAG.computeKnownBits(N1, RHSZero, RHSOne);
+  return SDValue();
+}
 
-    // If all possibly-set bits on the LHS are clear on the RHS, return an OR.
-    // If all possibly-set bits on the RHS are clear on the LHS, return an OR.
-    if ((RHSZero & ~LHSZero) == ~LHSZero || (LHSZero & ~RHSZero) == ~RHSZero)
-      return CombineTo(N, DAG.getNode(ISD::OR, SDLoc(N), VT, N0, N1),
-                       DAG.getNode(ISD::CARRY_FALSE,
-                                   SDLoc(N), MVT::Glue));
-  }
+SDValue DAGCombiner::visitUADDO(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N0.getValueType();
+  if (VT.isVector())
+    return SDValue();
+
+  EVT CarryVT = N->getValueType(1);
+  SDLoc DL(N);
+
+  // If the flag result is dead, turn this into an ADD.
+  if (!N->hasAnyUseOfValue(1))
+    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
+                     DAG.getUNDEF(CarryVT));
+
+  // canonicalize constant to RHS.
+  ConstantSDNode *N0C = dyn_cast<ConstantSDNode>(N0);
+  ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
+  if (N0C && !N1C)
+    return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N1, N0);
+
+  // fold (uaddo x, 0) -> x + no carry out
+  if (isNullConstant(N1))
+    return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));
+
+  // If it cannot overflow, transform into an add.
+  if (DAG.computeOverflowKind(N0, N1) == SelectionDAG::OFK_Never)
+    return CombineTo(N, DAG.getNode(ISD::ADD, DL, VT, N0, N1),
+                     DAG.getConstant(0, DL, CarryVT));
 
   return SDValue();
 }
@@ -1928,6 +2116,9 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
                                       N1.getNode());
   }
 
+  if (SDValue NewSel = foldBinOpIntoSelect(N))
+    return NewSel;
+
   ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
 
   // fold (sub x, c) -> (add x, -c)
@@ -2074,6 +2265,38 @@ SDValue DAGCombiner::visitSUBC(SDNode *N) {
   return SDValue();
 }
 
+SDValue DAGCombiner::visitUSUBO(SDNode *N) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N0.getValueType();
+  if (VT.isVector())
+    return SDValue();
+
+  EVT CarryVT = N->getValueType(1);
+  SDLoc DL(N);
+
+  // If the flag result is dead, turn this into a SUB.
+  if (!N->hasAnyUseOfValue(1))
+    return CombineTo(N, DAG.getNode(ISD::SUB, DL, VT, N0, N1),
+                     DAG.getUNDEF(CarryVT));
+
+  // fold (usubo x, x) -> 0 + no borrow
+  if (N0 == N1)
+    return CombineTo(N, DAG.getConstant(0, DL, VT),
+                     DAG.getConstant(0, DL, CarryVT));
+
+  // fold (usubo x, 0) -> x + no borrow
+  if (isNullConstant(N1))
+    return CombineTo(N, N0, DAG.getConstant(0, DL, CarryVT));
+
+  // Canonicalize (usubo -1, x) -> ~x, i.e. (xor x, -1) + no borrow
+  if (isAllOnesConstant(N0))
+    return CombineTo(N, DAG.getNode(ISD::XOR, DL, VT, N1, N0),
+                     DAG.getConstant(0, DL, CarryVT));
+
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitSUBE(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -2139,6 +2362,10 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
   // fold (mul x, 1) -> x
   if (N1IsConst && ConstValue1 == 1 && IsFullSplat)
     return N0;
+
+  if (SDValue NewSel = foldBinOpIntoSelect(N))
+    return NewSel;
+
   // fold (mul x, -1) -> 0-x
   if (N1IsConst && ConstValue1.isAllOnesValue()) {
     SDLoc DL(N);
@@ -2305,6 +2532,23 @@ SDValue DAGCombiner::useDivRem(SDNode *Node) {
   return combined;
 }
 
+static SDValue simplifyDivRem(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  if (DAG.isUndef(N->getOpcode(), {N0, N1}))
+    return DAG.getUNDEF(VT);
+
+  // undef / X -> 0
+  // undef % X -> 0
+  if (N0.isUndef())
+    return DAG.getConstant(0, DL, VT);
+
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitSDIV(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -2327,8 +2571,13 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
     return N0;
   // fold (sdiv X, -1) -> 0-X
   if (N1C && N1C->isAllOnesValue())
-    return DAG.getNode(ISD::SUB, DL, VT,
-                       DAG.getConstant(0, DL, VT), N0);
+    return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0);
+
+  if (SDValue V = simplifyDivRem(N, DAG))
+    return V;
+
+  if (SDValue NewSel = foldBinOpIntoSelect(N))
+    return NewSel;
 
   // If we know the sign bits of both operands are zero, strength reduce to a
   // udiv instead.  Handles (X&15) /s 4 -> X&15 >> 2
@@ -2380,7 +2629,7 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
   // If integer divide is expensive and we satisfy the requirements, emit an
   // alternate sequence.  Targets may check function attributes for size/speed
   // trade-offs.
-  AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+  AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
   if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
     if (SDValue Op = BuildSDIV(N))
       return Op;
@@ -2392,13 +2641,6 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
     if (SDValue DivRem = useDivRem(N))
         return DivRem;
 
-  // undef / X -> 0
-  if (N0.isUndef())
-    return DAG.getConstant(0, DL, VT);
-  // X / undef -> undef
-  if (N1.isUndef())
-    return N1;
-
   return SDValue();
 }
 
@@ -2422,6 +2664,12 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
                                                     N0C, N1C))
       return Folded;
 
+  if (SDValue V = simplifyDivRem(N, DAG))
+    return V;
+
+  if (SDValue NewSel = foldBinOpIntoSelect(N))
+    return NewSel;
+
   // fold (udiv x, (1 << c)) -> x >>u c
   if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
       DAG.isKnownToBeAPowerOfTwo(N1)) {
@@ -2452,7 +2700,7 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
   }
 
   // fold (udiv x, c) -> alternate
-  AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+  AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
   if (N1C && !TLI.isIntDivCheap(N->getValueType(0), Attr))
     if (SDValue Op = BuildUDIV(N))
       return Op;
@@ -2464,13 +2712,6 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
     if (SDValue DivRem = useDivRem(N))
         return DivRem;
 
-  // undef / X -> 0
-  if (N0.isUndef())
-    return DAG.getConstant(0, DL, VT);
-  // X / undef -> undef
-  if (N1.isUndef())
-    return N1;
-
   return SDValue();
 }
 
@@ -2490,32 +2731,35 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
     if (SDValue Folded = DAG.FoldConstantArithmetic(Opcode, DL, VT, N0C, N1C))
       return Folded;
 
+  if (SDValue V = simplifyDivRem(N, DAG))
+    return V;
+
+  if (SDValue NewSel = foldBinOpIntoSelect(N))
+    return NewSel;
+
   if (isSigned) {
     // If we know the sign bits of both operands are zero, strength reduce to a
     // urem instead.  Handles (X & 0x0FFFFFFF) %s 16 -> X&15
     if (DAG.SignBitIsZero(N1) && DAG.SignBitIsZero(N0))
       return DAG.getNode(ISD::UREM, DL, VT, N0, N1);
   } else {
-    // fold (urem x, pow2) -> (and x, pow2-1)
+    SDValue NegOne = DAG.getAllOnesConstant(DL, VT);
     if (DAG.isKnownToBeAPowerOfTwo(N1)) {
-      APInt NegOne = APInt::getAllOnesValue(VT.getScalarSizeInBits());
-      SDValue Add =
-          DAG.getNode(ISD::ADD, DL, VT, N1, DAG.getConstant(NegOne, DL, VT));
+      // fold (urem x, pow2) -> (and x, pow2-1)
+      SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
       AddToWorklist(Add.getNode());
       return DAG.getNode(ISD::AND, DL, VT, N0, Add);
     }
-    // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
     if (N1.getOpcode() == ISD::SHL &&
         DAG.isKnownToBeAPowerOfTwo(N1.getOperand(0))) {
-      APInt NegOne = APInt::getAllOnesValue(VT.getScalarSizeInBits());
-      SDValue Add =
-          DAG.getNode(ISD::ADD, DL, VT, N1, DAG.getConstant(NegOne, DL, VT));
+      // fold (urem x, (shl pow2, y)) -> (and x, (add (shl pow2, y), -1))
+      SDValue Add = DAG.getNode(ISD::ADD, DL, VT, N1, NegOne);
       AddToWorklist(Add.getNode());
       return DAG.getNode(ISD::AND, DL, VT, N0, Add);
     }
   }
 
-  AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+  AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
 
   // If X/C can be simplified by the division-by-constant logic, lower
   // X%C to the equivalent of X-X/C*C.
@@ -2544,13 +2788,6 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
   if (SDValue DivRem = useDivRem(N))
     return DivRem.getValue(1);
 
-  // undef % X -> 0
-  if (N0.isUndef())
-    return DAG.getConstant(0, DL, VT);
-  // X % undef -> undef
-  if (N1.isUndef())
-    return N1;
-
   return SDValue();
 }
 
@@ -2940,95 +3177,139 @@ SDValue DAGCombiner::SimplifyBinOpWithSameOpcodeHands(SDNode *N) {
   return SDValue();
 }
 
+/// Try to make (and/or setcc (LL, LR), setcc (RL, RR)) more efficient.
+SDValue DAGCombiner::foldLogicOfSetCCs(bool IsAnd, SDValue N0, SDValue N1,
+                                       const SDLoc &DL) {
+  SDValue LL, LR, RL, RR, N0CC, N1CC;
+  if (!isSetCCEquivalent(N0, LL, LR, N0CC) ||
+      !isSetCCEquivalent(N1, RL, RR, N1CC))
+    return SDValue();
+
+  assert(N0.getValueType() == N1.getValueType() &&
+         "Unexpected operand types for bitwise logic op");
+  assert(LL.getValueType() == LR.getValueType() &&
+         RL.getValueType() == RR.getValueType() &&
+         "Unexpected operand types for setcc");
+
+  // If we're here post-legalization or the logic op type is not i1, the logic
+  // op type must match a setcc result type. Also, all folds require new
+  // operations on the left and right operands, so those types must match.
+  EVT VT = N0.getValueType();
+  EVT OpVT = LL.getValueType();
+  if (LegalOperations || VT != MVT::i1)
+    if (VT != getSetCCResultType(OpVT))
+      return SDValue();
+  if (OpVT != RL.getValueType())
+    return SDValue();
+
+  ISD::CondCode CC0 = cast<CondCodeSDNode>(N0CC)->get();
+  ISD::CondCode CC1 = cast<CondCodeSDNode>(N1CC)->get();
+  bool IsInteger = OpVT.isInteger();
+  if (LR == RR && CC0 == CC1 && IsInteger) {
+    bool IsZero = isNullConstantOrNullSplatConstant(LR);
+    bool IsNeg1 = isAllOnesConstantOrAllOnesSplatConstant(LR);
+
+    // All bits clear?
+    bool AndEqZero = IsAnd && CC1 == ISD::SETEQ && IsZero;
+    // All sign bits clear?
+    bool AndGtNeg1 = IsAnd && CC1 == ISD::SETGT && IsNeg1;
+    // Any bits set?
+    bool OrNeZero = !IsAnd && CC1 == ISD::SETNE && IsZero;
+    // Any sign bits set?
+    bool OrLtZero = !IsAnd && CC1 == ISD::SETLT && IsZero;
+
+    // (and (seteq X,  0), (seteq Y,  0)) --> (seteq (or X, Y),  0)
+    // (and (setgt X, -1), (setgt Y, -1)) --> (setgt (or X, Y), -1)
+    // (or  (setne X,  0), (setne Y,  0)) --> (setne (or X, Y),  0)
+    // (or  (setlt X,  0), (setlt Y,  0)) --> (setlt (or X, Y),  0)
+    if (AndEqZero || AndGtNeg1 || OrNeZero || OrLtZero) {
+      SDValue Or = DAG.getNode(ISD::OR, SDLoc(N0), OpVT, LL, RL);
+      AddToWorklist(Or.getNode());
+      return DAG.getSetCC(DL, VT, Or, LR, CC1);
+    }
+
+    // All bits set?
+    bool AndEqNeg1 = IsAnd && CC1 == ISD::SETEQ && IsNeg1;
+    // All sign bits set?
+    bool AndLtZero = IsAnd && CC1 == ISD::SETLT && IsZero;
+    // Any bits clear?
+    bool OrNeNeg1 = !IsAnd && CC1 == ISD::SETNE && IsNeg1;
+    // Any sign bits clear?
+    bool OrGtNeg1 = !IsAnd && CC1 == ISD::SETGT && IsNeg1;
+
+    // (and (seteq X, -1), (seteq Y, -1)) --> (seteq (and X, Y), -1)
+    // (and (setlt X,  0), (setlt Y,  0)) --> (setlt (and X, Y),  0)
+    // (or  (setne X, -1), (setne Y, -1)) --> (setne (and X, Y), -1)
+    // (or  (setgt X, -1), (setgt Y, -1)) --> (setgt (and X, Y), -1)
+    if (AndEqNeg1 || AndLtZero || OrNeNeg1 || OrGtNeg1) {
+      SDValue And = DAG.getNode(ISD::AND, SDLoc(N0), OpVT, LL, RL);
+      AddToWorklist(And.getNode());
+      return DAG.getSetCC(DL, VT, And, LR, CC1);
+    }
+  }
+
+  // TODO: What is the 'or' equivalent of this fold?
+  // (and (setne X, 0), (setne X, -1)) --> (setuge (add X, 1), 2)
+  if (IsAnd && LL == RL && CC0 == CC1 && IsInteger && CC0 == ISD::SETNE &&
+      ((isNullConstant(LR) && isAllOnesConstant(RR)) ||
+       (isAllOnesConstant(LR) && isNullConstant(RR)))) {
+    SDValue One = DAG.getConstant(1, DL, OpVT);
+    SDValue Two = DAG.getConstant(2, DL, OpVT);
+    SDValue Add = DAG.getNode(ISD::ADD, SDLoc(N0), OpVT, LL, One);
+    AddToWorklist(Add.getNode());
+    return DAG.getSetCC(DL, VT, Add, Two, ISD::SETUGE);
+  }
+
+  // Try more general transforms if the predicates match and the only user of
+  // the compares is the 'and' or 'or'.
+  if (IsInteger && TLI.convertSetCCLogicToBitwiseLogic(OpVT) && CC0 == CC1 &&
+      N0.hasOneUse() && N1.hasOneUse()) {
+    // and (seteq A, B), (seteq C, D) --> seteq (or (xor A, B), (xor C, D)), 0
+    // or  (setne A, B), (setne C, D) --> setne (or (xor A, B), (xor C, D)), 0
+    if ((IsAnd && CC1 == ISD::SETEQ) || (!IsAnd && CC1 == ISD::SETNE)) {
+      SDValue XorL = DAG.getNode(ISD::XOR, SDLoc(N0), OpVT, LL, LR);
+      SDValue XorR = DAG.getNode(ISD::XOR, SDLoc(N1), OpVT, RL, RR);
+      SDValue Or = DAG.getNode(ISD::OR, DL, OpVT, XorL, XorR);
+      SDValue Zero = DAG.getConstant(0, DL, OpVT);
+      return DAG.getSetCC(DL, VT, Or, Zero, CC1);
+    }
+  }
+
+  // Canonicalize equivalent operands to LL == RL.
+  if (LL == RR && LR == RL) {
+    CC1 = ISD::getSetCCSwappedOperands(CC1);
+    std::swap(RL, RR);
+  }
+
+  // (and (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
+  // (or  (setcc X, Y, CC0), (setcc X, Y, CC1)) --> (setcc X, Y, NewCC)
+  if (LL == RL && LR == RR) {
+    ISD::CondCode NewCC = IsAnd ? ISD::getSetCCAndOperation(CC0, CC1, IsInteger)
+                                : ISD::getSetCCOrOperation(CC0, CC1, IsInteger);
+    if (NewCC != ISD::SETCC_INVALID &&
+        (!LegalOperations ||
+         (TLI.isCondCodeLegal(NewCC, LL.getSimpleValueType()) &&
+          TLI.isOperationLegal(ISD::SETCC, OpVT))))
+      return DAG.getSetCC(DL, VT, LL, LR, NewCC);
+  }
+
+  return SDValue();
+}
+
 /// This contains all DAGCombine rules which reduce two values combined by
 /// an And operation to a single value. This makes them reusable in the context
 /// of visitSELECT(). Rules involving constants are not included as
 /// visitSELECT() already handles those cases.
-SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1,
-                                  SDNode *LocReference) {
+SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1, SDNode *N) {
   EVT VT = N1.getValueType();
+  SDLoc DL(N);
 
   // fold (and x, undef) -> 0
   if (N0.isUndef() || N1.isUndef())
-    return DAG.getConstant(0, SDLoc(LocReference), VT);
-  // fold (and (setcc x), (setcc y)) -> (setcc (and x, y))
-  SDValue LL, LR, RL, RR, CC0, CC1;
-  if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){
-    ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get();
-    ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get();
-
-    if (LR == RR && isa<ConstantSDNode>(LR) && Op0 == Op1 &&
-        LL.getValueType().isInteger()) {
-      // fold (and (seteq X, 0), (seteq Y, 0)) -> (seteq (or X, Y), 0)
-      if (isNullConstant(LR) && Op1 == ISD::SETEQ) {
-        EVT CCVT = getSetCCResultType(LR.getValueType());
-        if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) {
-          SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0),
-                                       LR.getValueType(), LL, RL);
-          AddToWorklist(ORNode.getNode());
-          return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1);
-        }
-      }
-      if (isAllOnesConstant(LR)) {
-        // fold (and (seteq X, -1), (seteq Y, -1)) -> (seteq (and X, Y), -1)
-        if (Op1 == ISD::SETEQ) {
-          EVT CCVT = getSetCCResultType(LR.getValueType());
-          if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) {
-            SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(N0),
-                                          LR.getValueType(), LL, RL);
-            AddToWorklist(ANDNode.getNode());
-            return DAG.getSetCC(SDLoc(LocReference), VT, ANDNode, LR, Op1);
-          }
-        }
-        // fold (and (setgt X, -1), (setgt Y, -1)) -> (setgt (or X, Y), -1)
-        if (Op1 == ISD::SETGT) {
-          EVT CCVT = getSetCCResultType(LR.getValueType());
-          if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) {
-            SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(N0),
-                                         LR.getValueType(), LL, RL);
-            AddToWorklist(ORNode.getNode());
-            return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1);
-          }
-        }
-      }
-    }
-    // Simplify (and (setne X, 0), (setne X, -1)) -> (setuge (add X, 1), 2)
-    if (LL == RL && isa<ConstantSDNode>(LR) && isa<ConstantSDNode>(RR) &&
-        Op0 == Op1 && LL.getValueType().isInteger() &&
-      Op0 == ISD::SETNE && ((isNullConstant(LR) && isAllOnesConstant(RR)) ||
-                            (isAllOnesConstant(LR) && isNullConstant(RR)))) {
-      EVT CCVT = getSetCCResultType(LL.getValueType());
-      if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) {
-        SDLoc DL(N0);
-        SDValue ADDNode = DAG.getNode(ISD::ADD, DL, LL.getValueType(),
-                                      LL, DAG.getConstant(1, DL,
-                                                          LL.getValueType()));
-        AddToWorklist(ADDNode.getNode());
-        return DAG.getSetCC(SDLoc(LocReference), VT, ADDNode,
-                            DAG.getConstant(2, DL, LL.getValueType()),
-                            ISD::SETUGE);
-      }
-    }
-    // canonicalize equivalent to ll == rl
-    if (LL == RR && LR == RL) {
-      Op1 = ISD::getSetCCSwappedOperands(Op1);
-      std::swap(RL, RR);
-    }
-    if (LL == RL && LR == RR) {
-      bool isInteger = LL.getValueType().isInteger();
-      ISD::CondCode Result = ISD::getSetCCAndOperation(Op0, Op1, isInteger);
-      if (Result != ISD::SETCC_INVALID &&
-          (!LegalOperations ||
-           (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) &&
-            TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) {
-        EVT CCVT = getSetCCResultType(LL.getValueType());
-        if (N0.getValueType() == CCVT ||
-            (!LegalOperations && N0.getValueType() == MVT::i1))
-          return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(),
-                              LL, LR, Result);
-      }
-    }
-  }
+    return DAG.getConstant(0, DL, VT);
+
+  if (SDValue V = foldLogicOfSetCCs(true, N0, N1, DL))
+    return V;
 
   if (N0.getOpcode() == ISD::ADD && N1.getOpcode() == ISD::SRL &&
       VT.getSizeInBits() <= 64) {
@@ -3045,13 +3326,13 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1,
           if (DAG.MaskedValueIsZero(N0.getOperand(1), Mask)) {
             ADDC |= Mask;
             if (TLI.isLegalAddImmediate(ADDC.getSExtValue())) {
-              SDLoc DL(N0);
+              SDLoc DL0(N0);
               SDValue NewAdd =
-                DAG.getNode(ISD::ADD, DL, VT,
+                DAG.getNode(ISD::ADD, DL0, VT,
                             N0.getOperand(0), DAG.getConstant(ADDC, DL, VT));
               CombineTo(N0.getNode(), NewAdd);
               // Return N so it doesn't get rechecked!
-              return SDValue(LocReference, 0);
+              return SDValue(N, 0);
             }
           }
         }
@@ -3076,7 +3357,7 @@ SDValue DAGCombiner::visitANDLike(SDValue N0, SDValue N1,
         unsigned MaskBits = AndMask.countTrailingOnes();
         EVT HalfVT = EVT::getIntegerVT(*DAG.getContext(), Size / 2);
 
-        if (APIntOps::isMask(AndMask) &&
+        if (AndMask.isMask() &&
             // Required bits must not span the two halves of the integer and
             // must fit in the half size type.
             (ShiftBits + MaskBits <= Size / 2) &&
@@ -3116,7 +3397,7 @@ bool DAGCombiner::isAndLoadExtLoad(ConstantSDNode *AndC, LoadSDNode *LoadN,
                                    bool &NarrowLoad) {
   uint32_t ActiveBits = AndC->getAPIntValue().getActiveBits();
 
-  if (ActiveBits == 0 || !APIntOps::isMask(ActiveBits, AndC->getAPIntValue()))
+  if (ActiveBits == 0 || !AndC->getAPIntValue().isMask(ActiveBits))
     return false;
 
   ExtVT = EVT::getIntegerVT(*DAG.getContext(), ActiveBits);
@@ -3199,6 +3480,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
   if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
                                    APInt::getAllOnesValue(BitWidth)))
     return DAG.getConstant(0, SDLoc(N), VT);
+
+  if (SDValue NewSel = foldBinOpIntoSelect(N))
+    return NewSel;
+
   // reassociate and
   if (SDValue RAND = ReassociateOps(ISD::AND, SDLoc(N), N0, N1))
     return RAND;
@@ -3307,6 +3592,10 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
       // If the load type was an EXTLOAD, convert to ZEXTLOAD in order to
       // preserve semantics once we get rid of the AND.
       SDValue NewLoad(Load, 0);
+
+      // Fold the AND away. NewLoad may get replaced immediately.
+      CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0);
+
       if (Load->getExtensionType() == ISD::EXTLOAD) {
         NewLoad = DAG.getLoad(Load->getAddressingMode(), ISD::ZEXTLOAD,
                               Load->getValueType(0), SDLoc(Load),
@@ -3324,10 +3613,6 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
         }
       }
 
-      // Fold the AND away, taking care not to fold to the old load node if we
-      // replaced it.
-      CombineTo(N, (N0.getNode() == Load) ? NewLoad : N0);
-
       return SDValue(N, 0); // Return N so it doesn't get rechecked!
     }
   }
@@ -3731,65 +4016,16 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
 
 /// This contains all DAGCombine rules which reduce two values combined by
 /// an Or operation to a single value \see visitANDLike().
-SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) {
+SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *N) {
   EVT VT = N1.getValueType();
+  SDLoc DL(N);
+
   // fold (or x, undef) -> -1
-  if (!LegalOperations &&
-      (N0.isUndef() || N1.isUndef())) {
-    EVT EltVT = VT.isVector() ? VT.getVectorElementType() : VT;
-    return DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()),
-                           SDLoc(LocReference), VT);
-  }
-  // fold (or (setcc x), (setcc y)) -> (setcc (or x, y))
-  SDValue LL, LR, RL, RR, CC0, CC1;
-  if (isSetCCEquivalent(N0, LL, LR, CC0) && isSetCCEquivalent(N1, RL, RR, CC1)){
-    ISD::CondCode Op0 = cast<CondCodeSDNode>(CC0)->get();
-    ISD::CondCode Op1 = cast<CondCodeSDNode>(CC1)->get();
-
-    if (LR == RR && Op0 == Op1 && LL.getValueType().isInteger()) {
-      // fold (or (setne X, 0), (setne Y, 0)) -> (setne (or X, Y), 0)
-      // fold (or (setlt X, 0), (setlt Y, 0)) -> (setne (or X, Y), 0)
-      if (isNullConstant(LR) && (Op1 == ISD::SETNE || Op1 == ISD::SETLT)) {
-        EVT CCVT = getSetCCResultType(LR.getValueType());
-        if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) {
-          SDValue ORNode = DAG.getNode(ISD::OR, SDLoc(LR),
-                                       LR.getValueType(), LL, RL);
-          AddToWorklist(ORNode.getNode());
-          return DAG.getSetCC(SDLoc(LocReference), VT, ORNode, LR, Op1);
-        }
-      }
-      // fold (or (setne X, -1), (setne Y, -1)) -> (setne (and X, Y), -1)
-      // fold (or (setgt X, -1), (setgt Y  -1)) -> (setgt (and X, Y), -1)
-      if (isAllOnesConstant(LR) && (Op1 == ISD::SETNE || Op1 == ISD::SETGT)) {
-        EVT CCVT = getSetCCResultType(LR.getValueType());
-        if (VT == CCVT || (!LegalOperations && VT == MVT::i1)) {
-          SDValue ANDNode = DAG.getNode(ISD::AND, SDLoc(LR),
-                                        LR.getValueType(), LL, RL);
-          AddToWorklist(ANDNode.getNode());
-          return DAG.getSetCC(SDLoc(LocReference), VT, ANDNode, LR, Op1);
-        }
-      }
-    }
-    // canonicalize equivalent to ll == rl
-    if (LL == RR && LR == RL) {
-      Op1 = ISD::getSetCCSwappedOperands(Op1);
-      std::swap(RL, RR);
-    }
-    if (LL == RL && LR == RR) {
-      bool isInteger = LL.getValueType().isInteger();
-      ISD::CondCode Result = ISD::getSetCCOrOperation(Op0, Op1, isInteger);
-      if (Result != ISD::SETCC_INVALID &&
-          (!LegalOperations ||
-           (TLI.isCondCodeLegal(Result, LL.getSimpleValueType()) &&
-            TLI.isOperationLegal(ISD::SETCC, LL.getValueType())))) {
-        EVT CCVT = getSetCCResultType(LL.getValueType());
-        if (N0.getValueType() == CCVT ||
-            (!LegalOperations && N0.getValueType() == MVT::i1))
-          return DAG.getSetCC(SDLoc(LocReference), N0.getValueType(),
-                              LL, LR, Result);
-      }
-    }
-  }
+  if (!LegalOperations && (N0.isUndef() || N1.isUndef()))
+    return DAG.getAllOnesConstant(DL, VT);
+
+  if (SDValue V = foldLogicOfSetCCs(false, N0, N1, DL))
+    return V;
 
   // (or (and X, C1), (and Y, C2))  -> (and (or X, Y), C3) if possible.
   if (N0.getOpcode() == ISD::AND && N1.getOpcode() == ISD::AND &&
@@ -3810,7 +4046,6 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) {
             DAG.MaskedValueIsZero(N1.getOperand(0), LHSMask&~RHSMask)) {
           SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
                                   N0.getOperand(0), N1.getOperand(0));
-          SDLoc DL(LocReference);
           return DAG.getNode(ISD::AND, DL, VT, X,
                              DAG.getConstant(LHSMask | RHSMask, DL, VT));
         }
@@ -3826,7 +4061,7 @@ SDValue DAGCombiner::visitORLike(SDValue N0, SDValue N1, SDNode *LocReference) {
       (N0.getNode()->hasOneUse() || N1.getNode()->hasOneUse())) {
     SDValue X = DAG.getNode(ISD::OR, SDLoc(N0), VT,
                             N0.getOperand(1), N1.getOperand(1));
-    return DAG.getNode(ISD::AND, SDLoc(LocReference), VT, N0.getOperand(0), X);
+    return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), X);
   }
 
   return SDValue();
@@ -3855,14 +4090,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
     // fold (or x, -1) -> -1, vector edition
     if (ISD::isBuildVectorAllOnes(N0.getNode()))
       // do not return N0, because undef node may exist in N0
-      return DAG.getConstant(
-          APInt::getAllOnesValue(N0.getScalarValueSizeInBits()), SDLoc(N),
-          N0.getValueType());
+      return DAG.getAllOnesConstant(SDLoc(N), N0.getValueType());
     if (ISD::isBuildVectorAllOnes(N1.getNode()))
       // do not return N1, because undef node may exist in N1
-      return DAG.getConstant(
-          APInt::getAllOnesValue(N1.getScalarValueSizeInBits()), SDLoc(N),
-          N1.getValueType());
+      return DAG.getAllOnesConstant(SDLoc(N), N1.getValueType());
 
     // fold (or (shuf A, V_0, MA), (shuf B, V_0, MB)) -> (shuf A, B, Mask)
     // Do this only if the resulting shuffle is legal.
@@ -3875,7 +4106,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
       bool ZeroN10 = ISD::isBuildVectorAllZeros(N1.getOperand(0).getNode());
       bool ZeroN11 = ISD::isBuildVectorAllZeros(N1.getOperand(1).getNode());
       // Ensure both shuffles have a zero input.
-      if ((ZeroN00 || ZeroN01) && (ZeroN10 || ZeroN11)) {
+      if ((ZeroN00 != ZeroN01) && (ZeroN10 != ZeroN11)) {
         assert((!ZeroN00 || !ZeroN01) && "Both inputs zero!");
         assert((!ZeroN10 || !ZeroN11) && "Both inputs zero!");
         const ShuffleVectorSDNode *SV0 = cast<ShuffleVectorSDNode>(N0);
@@ -3947,6 +4178,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
   // fold (or x, -1) -> -1
   if (isAllOnesConstant(N1))
     return N1;
+
+  if (SDValue NewSel = foldBinOpIntoSelect(N))
+    return NewSel;
+
   // fold (or x, c) -> c iff (x & ~c) == 0
   if (N1C && DAG.MaskedValueIsZero(N0, ~N1C->getAPIntValue()))
     return N1;
@@ -3964,7 +4199,7 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
   if (SDValue ROR = ReassociateOps(ISD::OR, SDLoc(N), N0, N1))
     return ROR;
   // Canonicalize (or (and X, c1), c2) -> (and (or X, c2), c1|c2)
-  // iff (c1 & c2) == 0.
+  // iff (c1 & c2) != 0.
   if (N1C && N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
              isa<ConstantSDNode>(N0.getOperand(1))) {
     ConstantSDNode *C1 = cast<ConstantSDNode>(N0.getOperand(1));
@@ -4201,8 +4436,7 @@ SDNode *DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
 
     // If there is an AND of either shifted operand, apply it to the result.
     if (LHSMask.getNode() || RHSMask.getNode()) {
-      APInt AllBits = APInt::getAllOnesValue(EltSizeInBits);
-      SDValue Mask = DAG.getConstant(AllBits, DL, VT);
+      SDValue Mask = DAG.getAllOnesConstant(DL, VT);
 
       if (LHSMask.getNode()) {
         APInt RHSBits = APInt::getLowBitsSet(EltSizeInBits, LShVal);
@@ -4377,8 +4611,8 @@ struct ByteProvider {
   }
   static ByteProvider getConstantZero() { return ByteProvider(nullptr, 0); }
 
-  bool isConstantZero() { return !Load; }
-  bool isMemory() { return Load; }
+  bool isConstantZero() const { return !Load; }
+  bool isMemory() const { return Load; }
 
   bool operator==(const ByteProvider &Other) const {
     return Other.Load == Load && Other.ByteOffset == ByteOffset;
@@ -4429,10 +4663,9 @@ const Optional<ByteProvider> calculateByteProvider(SDValue Op, unsigned Index,
 
     if (LHS->isConstantZero())
       return RHS;
-    else if (RHS->isConstantZero())
+    if (RHS->isConstantZero())
       return LHS;
-    else
-      return None;
+    return None;
   }
   case ISD::SHL: {
     auto ShiftOp = dyn_cast<ConstantSDNode>(Op->getOperand(1));
@@ -4449,6 +4682,8 @@ const Optional<ByteProvider> calculateByteProvider(SDValue Op, unsigned Index,
                : calculateByteProvider(Op->getOperand(0), Index - ByteShift,
                                        Depth + 1);
   }
+  case ISD::ANY_EXTEND:
+  case ISD::SIGN_EXTEND:
   case ISD::ZERO_EXTEND: {
     SDValue NarrowOp = Op->getOperand(0);
     unsigned NarrowBitWidth = NarrowOp.getScalarValueSizeInBits();
@@ -4456,18 +4691,29 @@ const Optional<ByteProvider> calculateByteProvider(SDValue Op, unsigned Index,
       return None;
     uint64_t NarrowByteWidth = NarrowBitWidth / 8;
 
-    return Index >= NarrowByteWidth
-               ? ByteProvider::getConstantZero()
-               : calculateByteProvider(NarrowOp, Index, Depth + 1);
+    if (Index >= NarrowByteWidth)
+      return Op.getOpcode() == ISD::ZERO_EXTEND
+                 ? Optional<ByteProvider>(ByteProvider::getConstantZero())
+                 : None;
+    return calculateByteProvider(NarrowOp, Index, Depth + 1);
   }
+  case ISD::BSWAP:
+    return calculateByteProvider(Op->getOperand(0), ByteWidth - Index - 1,
+                                 Depth + 1);
   case ISD::LOAD: {
     auto L = cast<LoadSDNode>(Op.getNode());
+    if (L->isVolatile() || L->isIndexed())
+      return None;
 
-    // TODO: support ext loads
-    if (L->isVolatile() || L->isIndexed() ||
-        L->getExtensionType() != ISD::NON_EXTLOAD)
+    unsigned NarrowBitWidth = L->getMemoryVT().getSizeInBits();
+    if (NarrowBitWidth % 8 != 0)
       return None;
+    uint64_t NarrowByteWidth = NarrowBitWidth / 8;
 
+    if (Index >= NarrowByteWidth)
+      return L->getExtensionType() == ISD::ZEXTLOAD
+                 ? Optional<ByteProvider>(ByteProvider::getConstantZero())
+                 : None;
     return ByteProvider::getMemory(L, Index);
   }
   }
@@ -4528,14 +4774,24 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
   std::function<unsigned(unsigned, unsigned)> BigEndianByteAt = [](
     unsigned BW, unsigned i) { return BW - i - 1; };
 
+  bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
+  auto MemoryByteOffset = [&] (ByteProvider P) {
+    assert(P.isMemory() && "Must be a memory byte provider");
+    unsigned LoadBitWidth = P.Load->getMemoryVT().getSizeInBits();
+    assert(LoadBitWidth % 8 == 0 &&
+           "can only analyze providers for individual bytes not bit");
+    unsigned LoadByteWidth = LoadBitWidth / 8;
+    return IsBigEndianTarget
+            ? BigEndianByteAt(LoadByteWidth, P.ByteOffset)
+            : LittleEndianByteAt(LoadByteWidth, P.ByteOffset);
+  };
+
   Optional<BaseIndexOffset> Base;
   SDValue Chain;
 
   SmallSet<LoadSDNode *, 8> Loads;
-  LoadSDNode *FirstLoad = nullptr;
-
-  bool IsBigEndianTarget = DAG.getDataLayout().isBigEndian();
-  auto ByteAt = IsBigEndianTarget ? BigEndianByteAt : LittleEndianByteAt;
+  Optional<ByteProvider> FirstByteProvider;
+  int64_t FirstOffset = INT64_MAX;
 
   // Check if all the bytes of the OR we are looking at are loaded from the same
   // base address. Collect bytes offsets from Base address in ByteOffsets.
@@ -4547,7 +4803,6 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
 
     LoadSDNode *L = P->Load;
     assert(L->hasNUsesOfValue(1, 0) && !L->isVolatile() && !L->isIndexed() &&
-           (L->getExtensionType() == ISD::NON_EXTLOAD) &&
            "Must be enforced by calculateByteProvider");
     assert(L->getOffset().isUndef() && "Unindexed load must have undef offset");
 
@@ -4566,35 +4821,40 @@ SDValue DAGCombiner::MatchLoadCombine(SDNode *N) {
       return SDValue();
 
     // Calculate the offset of the current byte from the base address
-    unsigned LoadBitWidth = L->getMemoryVT().getSizeInBits();
-    assert(LoadBitWidth % 8 == 0 &&
-           "can only analyze providers for individual bytes not bit");
-    unsigned LoadByteWidth = LoadBitWidth / 8;
-    int64_t MemoryByteOffset = ByteAt(LoadByteWidth, P->ByteOffset);
-    int64_t ByteOffsetFromBase = Ptr.Offset + MemoryByteOffset;
+    int64_t ByteOffsetFromBase = Ptr.Offset + MemoryByteOffset(*P);
     ByteOffsets[i] = ByteOffsetFromBase;
 
     // Remember the first byte load
-    if (ByteOffsetFromBase == 0)
-      FirstLoad = L;
+    if (ByteOffsetFromBase < FirstOffset) {
+      FirstByteProvider = P;
+      FirstOffset = ByteOffsetFromBase;
+    }
 
     Loads.insert(L);
   }
   assert(Loads.size() > 0 && "All the bytes of the value must be loaded from "
          "memory, so there must be at least one load which produces the value");
   assert(Base && "Base address of the accessed memory location must be set");
+  assert(FirstOffset != INT64_MAX && "First byte offset must be set");
 
   // Check if the bytes of the OR we are looking at match with either big or
   // little endian value load
   bool BigEndian = true, LittleEndian = true;
   for (unsigned i = 0; i < ByteWidth; i++) {
-    LittleEndian &= ByteOffsets[i] == LittleEndianByteAt(ByteWidth, i);
-    BigEndian &= ByteOffsets[i] == BigEndianByteAt(ByteWidth, i);
+    int64_t CurrentByteOffset = ByteOffsets[i] - FirstOffset;
+    LittleEndian &= CurrentByteOffset == LittleEndianByteAt(ByteWidth, i);
+    BigEndian &= CurrentByteOffset == BigEndianByteAt(ByteWidth, i);
     if (!BigEndian && !LittleEndian)
       return SDValue();
   }
   assert((BigEndian != LittleEndian) && "should be either or");
-  assert(FirstLoad && "must be set");
+  assert(FirstByteProvider && "must be set");
+
+  // Ensure that the first byte is loaded from zero offset of the first load.
+  // So the combined value can be loaded from the first load address.
+  if (MemoryByteOffset(*FirstByteProvider) != 0)
+    return SDValue();
+  LoadSDNode *FirstLoad = FirstByteProvider->Load;
 
   // The node we are looking at matches with the pattern, check if we can
   // replace it with a single load and bswap if needed.
@@ -4664,6 +4924,10 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
   // fold (xor x, 0) -> x
   if (isNullConstant(N1))
     return N0;
+
+  if (SDValue NewSel = foldBinOpIntoSelect(N))
+    return NewSel;
+
   // reassociate xor
   if (SDValue RXOR = ReassociateOps(ISD::XOR, SDLoc(N), N0, N1))
     return RXOR;
@@ -4681,9 +4945,9 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
       default:
         llvm_unreachable("Unhandled SetCC Equivalent!");
       case ISD::SETCC:
-        return DAG.getSetCC(SDLoc(N), VT, LHS, RHS, NotCC);
+        return DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC);
       case ISD::SELECT_CC:
-        return DAG.getSelectCC(SDLoc(N), LHS, RHS, N0.getOperand(2),
+        return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2),
                                N0.getOperand(3), NotCC);
       }
     }
@@ -4748,6 +5012,17 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
                                          N01C->getAPIntValue(), DL, VT));
     }
   }
+
+  // fold Y = sra (X, size(X)-1); xor (add (X, Y), Y) -> (abs X)
+  unsigned OpSizeInBits = VT.getScalarSizeInBits();
+  if (N0.getOpcode() == ISD::ADD && N0.getOperand(1) == N1 &&
+      N1.getOpcode() == ISD::SRA && N1.getOperand(0) == N0.getOperand(0) &&
+      TLI.isOperationLegalOrCustom(ISD::ABS, VT)) {
+    if (ConstantSDNode *C = isConstOrConstSplat(N1.getOperand(1)))
+      if (C->getAPIntValue() == (OpSizeInBits - 1))
+        return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0.getOperand(0));
+  }
+
   // fold (xor x, x) -> 0
   if (N0 == N1)
     return tryFoldToZero(SDLoc(N), TLI, VT, DAG, LegalOperations, LegalTypes);
@@ -4951,6 +5226,10 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
   // fold (shl undef, x) -> 0
   if (N0.isUndef())
     return DAG.getConstant(0, SDLoc(N), VT);
+
+  if (SDValue NewSel = foldBinOpIntoSelect(N))
+    return NewSel;
+
   // if (shl x, c) is known to be zero, return 0
   if (DAG.MaskedValueIsZero(SDValue(N, 0),
                             APInt::getAllOnesValue(OpSizeInBits)))
@@ -5086,9 +5365,8 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
   // fold (shl (sra x, c1), c1) -> (and x, (shl -1, c1))
   if (N0.getOpcode() == ISD::SRA && N1 == N0.getOperand(1) &&
       isConstantOrConstantVector(N1, /* No Opaques */ true)) {
-    unsigned BitSize = VT.getScalarSizeInBits();
     SDLoc DL(N);
-    SDValue AllBits = DAG.getConstant(APInt::getAllOnesValue(BitSize), DL, VT);
+    SDValue AllBits = DAG.getAllOnesConstant(DL, VT);
     SDValue HiBitsMask = DAG.getNode(ISD::SHL, DL, VT, AllBits, N1);
     return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), HiBitsMask);
   }
@@ -5155,6 +5433,10 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
   // fold (sra x, 0) -> x
   if (N1C && N1C->isNullValue())
     return N0;
+
+  if (SDValue NewSel = foldBinOpIntoSelect(N))
+    return NewSel;
+
   // fold (sra (shl x, c1), c1) -> sext_inreg for some c1 and target supports
   // sext_inreg.
   if (N1C && N0.getOpcode() == ISD::SHL && N1 == N0.getOperand(1)) {
@@ -5302,6 +5584,10 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   // fold (srl x, 0) -> x
   if (N1C && N1C->isNullValue())
     return N0;
+
+  if (SDValue NewSel = foldBinOpIntoSelect(N))
+    return NewSel;
+
   // if (srl x, c) is known to be zero, return 0
   if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
                                    APInt::getAllOnesValue(OpSizeInBits)))
@@ -5352,9 +5638,8 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   if (N0.getOpcode() == ISD::SHL && N0.getOperand(1) == N1 &&
       isConstantOrConstantVector(N1, /* NoOpaques */ true)) {
     SDLoc DL(N);
-    APInt AllBits = APInt::getAllOnesValue(N0.getScalarValueSizeInBits());
     SDValue Mask =
-        DAG.getNode(ISD::SRL, DL, VT, DAG.getConstant(AllBits, DL, VT), N1);
+        DAG.getNode(ISD::SRL, DL, VT, DAG.getAllOnesConstant(DL, VT), N1);
     AddToWorklist(Mask.getNode());
     return DAG.getNode(ISD::AND, DL, VT, N0.getOperand(0), Mask);
   }
@@ -5480,6 +5765,22 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
   return SDValue();
 }
 
+SDValue DAGCombiner::visitABS(SDNode *N) { // Try to simplify an ISD::ABS node.
+  SDValue N0 = N->getOperand(0);
+  EVT VT = N->getValueType(0);
+
+  // fold (abs c1) -> c2; presumably getNode() constant-folds the operand
+  if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
+    return DAG.getNode(ISD::ABS, SDLoc(N), VT, N0);
+  // fold (abs (abs x)) -> (abs x); the inner abs node is reused as-is
+  if (N0.getOpcode() == ISD::ABS)
+    return N0;
+  // fold (abs x) -> x iff x is non-negative (its sign bit is known zero)
+  if (DAG.SignBitIsZero(N0))
+    return N0;
+  return SDValue(); // null SDValue: none of the folds matched
+}
+
 SDValue DAGCombiner::visitBSWAP(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
@@ -5593,7 +5894,6 @@ static SDValue combineMinNumMaxNum(const SDLoc &DL, EVT VT, SDValue LHS,
   }
 }
 
-// TODO: We should handle other cases of selecting between {-1,0,1} here.
 SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
   SDValue Cond = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -5602,6 +5902,67 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
   EVT CondVT = Cond.getValueType();
   SDLoc DL(N);
 
+  if (!VT.isInteger())
+    return SDValue();
+
+  auto *C1 = dyn_cast<ConstantSDNode>(N1);
+  auto *C2 = dyn_cast<ConstantSDNode>(N2);
+  if (!C1 || !C2)
+    return SDValue();
+
+  // Only do this before legalization to avoid conflicting with target-specific
+  // transforms in the other direction (create a select from a zext/sext). There
+  // is also a target-independent combine here in DAGCombiner in the other
+  // direction for (select Cond, -1, 0) when the condition is not i1.
+  if (CondVT == MVT::i1 && !LegalOperations) {
+    if (C1->isNullValue() && C2->isOne()) {
+      // select Cond, 0, 1 --> zext (!Cond)
+      SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
+      if (VT != MVT::i1)
+        NotCond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, NotCond);
+      return NotCond;
+    }
+    if (C1->isNullValue() && C2->isAllOnesValue()) {
+      // select Cond, 0, -1 --> sext (!Cond)
+      SDValue NotCond = DAG.getNOT(DL, Cond, MVT::i1);
+      if (VT != MVT::i1)
+        NotCond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, NotCond);
+      return NotCond;
+    }
+    if (C1->isOne() && C2->isNullValue()) {
+      // select Cond, 1, 0 --> zext (Cond)
+      if (VT != MVT::i1)
+        Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
+      return Cond;
+    }
+    if (C1->isAllOnesValue() && C2->isNullValue()) {
+      // select Cond, -1, 0 --> sext (Cond)
+      if (VT != MVT::i1)
+        Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
+      return Cond;
+    }
+
+    // For any constants that differ by 1, we can transform the select into an
+    // extend and add. Use a target hook because some targets may prefer to
+    // transform in the other direction.
+    if (TLI.convertSelectOfConstantsToMath()) {
+      if (C1->getAPIntValue() - 1 == C2->getAPIntValue()) {
+        // select Cond, C1, C1-1 --> add (zext Cond), C1-1
+        if (VT != MVT::i1)
+          Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, Cond);
+        return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
+      }
+      if (C1->getAPIntValue() + 1 == C2->getAPIntValue()) {
+        // select Cond, C1, C1+1 --> add (sext Cond), C1+1
+        if (VT != MVT::i1)
+          Cond = DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Cond);
+        return DAG.getNode(ISD::ADD, DL, VT, Cond, N2);
+      }
+    }
+
+    return SDValue();
+  }
+
   // fold (select Cond, 0, 1) -> (xor Cond, 1)
   // We can't do this reliably if integer based booleans have different contents
   // to floating point based booleans. This is because we can't tell whether we
@@ -5611,15 +5972,14 @@ SDValue DAGCombiner::foldSelectOfConstants(SDNode *N) {
   // undiscoverable (or not reasonably discoverable). For example, it could be
   // in another basic block or it could require searching a complicated
   // expression.
-  if (VT.isInteger() &&
-      (CondVT == MVT::i1 || (CondVT.isInteger() &&
-                             TLI.getBooleanContents(false, true) ==
-                                 TargetLowering::ZeroOrOneBooleanContent &&
-                             TLI.getBooleanContents(false, false) ==
-                                 TargetLowering::ZeroOrOneBooleanContent)) &&
-      isNullConstant(N1) && isOneConstant(N2)) {
-    SDValue NotCond = DAG.getNode(ISD::XOR, DL, CondVT, Cond,
-                                  DAG.getConstant(1, DL, CondVT));
+  if (CondVT.isInteger() &&
+      TLI.getBooleanContents(false, true) ==
+          TargetLowering::ZeroOrOneBooleanContent &&
+      TLI.getBooleanContents(false, false) ==
+          TargetLowering::ZeroOrOneBooleanContent &&
+      C1->isNullValue() && C2->isOne()) {
+    SDValue NotCond =
+        DAG.getNode(ISD::XOR, DL, CondVT, Cond, DAG.getConstant(1, DL, CondVT));
     if (VT.bitsEq(CondVT))
       return NotCond;
     return DAG.getZExtOrTrunc(NotCond, DL, VT);
@@ -6129,7 +6489,7 @@ SDValue DAGCombiner::visitMLOAD(SDNode *N) {
                            ISD::NON_EXTLOAD, MLD->isExpandingLoad());
 
     Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
-                                     MLD->isExpandingLoad()); 
+                                     MLD->isExpandingLoad());
 
     MMO = DAG.getMachineFunction().
     getMachineMemOperand(MLD->getPointerInfo(),
@@ -6203,34 +6563,6 @@ SDValue DAGCombiner::visitVSELECT(SDNode *N) {
   if (SimplifySelectOps(N, N1, N2))
     return SDValue(N, 0);  // Don't revisit N.
 
-  // If the VSELECT result requires splitting and the mask is provided by a
-  // SETCC, then split both nodes and its operands before legalization. This
-  // prevents the type legalizer from unrolling SETCC into scalar comparisons
-  // and enables future optimizations (e.g. min/max pattern matching on X86).
-  if (N0.getOpcode() == ISD::SETCC) {
-    EVT VT = N->getValueType(0);
-
-    // Check if any splitting is required.
-    if (TLI.getTypeAction(*DAG.getContext(), VT) !=
-        TargetLowering::TypeSplitVector)
-      return SDValue();
-
-    SDValue Lo, Hi, CCLo, CCHi, LL, LH, RL, RH;
-    std::tie(CCLo, CCHi) = SplitVSETCC(N0.getNode(), DAG);
-    std::tie(LL, LH) = DAG.SplitVectorOperand(N, 1);
-    std::tie(RL, RH) = DAG.SplitVectorOperand(N, 2);
-
-    Lo = DAG.getNode(N->getOpcode(), DL, LL.getValueType(), CCLo, LL, RL);
-    Hi = DAG.getNode(N->getOpcode(), DL, LH.getValueType(), CCHi, LH, RH);
-
-    // Add the new VSELECT nodes to the work list in case they need to be split
-    // again.
-    AddToWorklist(Lo.getNode());
-    AddToWorklist(Hi.getNode());
-
-    return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
-  }
-
   // Fold (vselect (build_vector all_ones), N1, N2) -> N1
   if (ISD::isBuildVectorAllOnes(N0.getNode()))
     return N1;
@@ -6540,6 +6872,9 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
   SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
   SDValue NewValue = DAG.getNode(ISD::CONCAT_VECTORS, DL, DstVT, Loads);
 
+  // Simplify TF.
+  AddToWorklist(NewChain.getNode());
+
   CombineTo(N, NewValue);
 
   // Replace uses of the original load (before extension)
@@ -6555,6 +6890,7 @@ SDValue DAGCombiner::CombineExtLoad(SDNode *N) {
 SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
+  SDLoc DL(N);
 
   if (SDNode *Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes,
                                               LegalOperations))
@@ -6563,8 +6899,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
   // fold (sext (sext x)) -> (sext x)
   // fold (sext (aext x)) -> (sext x)
   if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND)
-    return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT,
-                       N0.getOperand(0));
+    return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N0.getOperand(0));
 
   if (N0.getOpcode() == ISD::TRUNCATE) {
     // fold (sext (truncate (load x))) -> (sext (smaller load x))
@@ -6596,12 +6931,12 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
       // Op is i32, Mid is i8, and Dest is i64.  If Op has more than 24 sign
       // bits, just sext from i32.
       if (NumSignBits > OpBits-MidBits)
-        return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, Op);
+        return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, Op);
     } else {
       // Op is i64, Mid is i8, and Dest is i32.  If Op has more than 56 sign
       // bits, just truncate to i32.
       if (NumSignBits > OpBits-MidBits)
-        return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op);
+        return DAG.getNode(ISD::TRUNCATE, DL, VT, Op);
     }
 
     // fold (sext (truncate x)) -> (sextinreg x).
@@ -6611,7 +6946,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
         Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N0), VT, Op);
       else if (OpBits > DestBits)
         Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N0), VT, Op);
-      return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, Op,
+      return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Op,
                          DAG.getValueType(N0.getValueType()));
     }
   }
@@ -6631,16 +6966,14 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
       DoXform &= TLI.isVectorLoadExtDesirable(SDValue(N, 0));
     if (DoXform) {
       LoadSDNode *LN0 = cast<LoadSDNode>(N0);
-      SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
-                                       LN0->getChain(),
+      SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(),
                                        LN0->getBasePtr(), N0.getValueType(),
                                        LN0->getMemOperand());
       CombineTo(N, ExtLoad);
       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
                                   N0.getValueType(), ExtLoad);
       CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
-      ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N),
-                      ISD::SIGN_EXTEND);
+      ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND);
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
   }
@@ -6658,8 +6991,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
     EVT MemVT = LN0->getMemoryVT();
     if ((!LegalOperations && !LN0->isVolatile()) ||
         TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, MemVT)) {
-      SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
-                                       LN0->getChain(),
+      SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(),
                                        LN0->getBasePtr(), MemVT,
                                        LN0->getMemOperand());
       CombineTo(N, ExtLoad);
@@ -6693,7 +7025,6 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
                                          LN0->getMemOperand());
         APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
         Mask = Mask.sext(VT.getSizeInBits());
-        SDLoc DL(N);
         SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
                                   ExtLoad, DAG.getConstant(Mask, DL, VT));
         SDValue Trunc = DAG.getNode(ISD::TRUNCATE,
@@ -6701,24 +7032,27 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
                                     N0.getOperand(0).getValueType(), ExtLoad);
         CombineTo(N, And);
         CombineTo(N0.getOperand(0).getNode(), Trunc, ExtLoad.getValue(1));
-        ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL,
-                        ISD::SIGN_EXTEND);
+        ExtendSetCCUses(SetCCs, Trunc, ExtLoad, DL, ISD::SIGN_EXTEND);
         return SDValue(N, 0);   // Return N so it doesn't get rechecked!
       }
     }
   }
 
   if (N0.getOpcode() == ISD::SETCC) {
-    EVT N0VT = N0.getOperand(0).getValueType();
+    SDValue N00 = N0.getOperand(0);
+    SDValue N01 = N0.getOperand(1);
+    ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
+    EVT N00VT = N0.getOperand(0).getValueType();
+
     // sext(setcc) -> sext_in_reg(vsetcc) for vectors.
     // Only do this before legalize for now.
     if (VT.isVector() && !LegalOperations &&
-        TLI.getBooleanContents(N0VT) ==
+        TLI.getBooleanContents(N00VT) ==
             TargetLowering::ZeroOrNegativeOneBooleanContent) {
       // On some architectures (such as SSE/NEON/etc) the SETCC result type is
       // of the same size as the compared operands. Only optimize sext(setcc())
       // if this is the case.
-      EVT SVT = getSetCCResultType(N0VT);
+      EVT SVT = getSetCCResultType(N00VT);
 
       // We know that the # elements of the results is the same as the
       // # elements of the compare (and the # elements of the compare result
@@ -6726,19 +7060,15 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
       // we know that the element size of the sext'd result matches the
       // element size of the compare operands.
       if (VT.getSizeInBits() == SVT.getSizeInBits())
-        return DAG.getSetCC(SDLoc(N), VT, N0.getOperand(0),
-                             N0.getOperand(1),
-                             cast<CondCodeSDNode>(N0.getOperand(2))->get());
+        return DAG.getSetCC(DL, VT, N00, N01, CC);
 
       // If the desired elements are smaller or larger than the source
-      // elements we can use a matching integer vector type and then
-      // truncate/sign extend
-      EVT MatchingVectorType = N0VT.changeVectorElementTypeToInteger();
-      if (SVT == MatchingVectorType) {
-        SDValue VsetCC = DAG.getSetCC(SDLoc(N), MatchingVectorType,
-                               N0.getOperand(0), N0.getOperand(1),
-                               cast<CondCodeSDNode>(N0.getOperand(2))->get());
-        return DAG.getSExtOrTrunc(VsetCC, SDLoc(N), VT);
+      // elements, we can use a matching integer vector type and then
+      // truncate/sign extend.
+      EVT MatchingVecType = N00VT.changeVectorElementTypeToInteger();
+      if (SVT == MatchingVecType) {
+        SDValue VsetCC = DAG.getSetCC(DL, MatchingVecType, N00, N01, CC);
+        return DAG.getSExtOrTrunc(VsetCC, DL, VT);
       }
     }
 
@@ -6747,36 +7077,30 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
     // getBooleanContents().
     unsigned SetCCWidth = N0.getScalarValueSizeInBits();
 
-    SDLoc DL(N);
     // To determine the "true" side of the select, we need to know the high bit
     // of the value returned by the setcc if it evaluates to true.
     // If the type of the setcc is i1, then the true case of the select is just
     // sext(i1 1), that is, -1.
     // If the type of the setcc is larger (say, i8) then the value of the high
-    // bit depends on getBooleanContents(). So, ask TLI for a real "true" value
+    // bit depends on getBooleanContents(), so ask TLI for a real "true" value
     // of the appropriate width.
-    SDValue ExtTrueVal =
-        (SetCCWidth == 1)
-            ? DAG.getConstant(APInt::getAllOnesValue(VT.getScalarSizeInBits()),
-                              DL, VT)
-            : TLI.getConstTrueVal(DAG, VT, DL);
-
-    if (SDValue SCC = SimplifySelectCC(
-            DL, N0.getOperand(0), N0.getOperand(1), ExtTrueVal,
-            DAG.getConstant(0, DL, VT),
-            cast<CondCodeSDNode>(N0.getOperand(2))->get(), true))
+    SDValue ExtTrueVal = (SetCCWidth == 1) ? DAG.getAllOnesConstant(DL, VT)
+                                           : TLI.getConstTrueVal(DAG, VT, DL);
+    SDValue Zero = DAG.getConstant(0, DL, VT);
+    if (SDValue SCC =
+            SimplifySelectCC(DL, N00, N01, ExtTrueVal, Zero, CC, true))
       return SCC;
 
     if (!VT.isVector()) {
-      EVT SetCCVT = getSetCCResultType(N0.getOperand(0).getValueType());
-      if (!LegalOperations ||
-          TLI.isOperationLegal(ISD::SETCC, N0.getOperand(0).getValueType())) {
-        SDLoc DL(N);
-        ISD::CondCode CC = cast<CondCodeSDNode>(N0.getOperand(2))->get();
-        SDValue SetCC =
-            DAG.getSetCC(DL, SetCCVT, N0.getOperand(0), N0.getOperand(1), CC);
-        return DAG.getSelect(DL, VT, SetCC, ExtTrueVal,
-                             DAG.getConstant(0, DL, VT));
+      EVT SetCCVT = getSetCCResultType(N00VT);
+      // Don't do this transform for i1 because there's a select transform
+      // that would reverse it.
+      // TODO: We should not do this transform at all without a target hook
+      // because a sext is likely cheaper than a select?
+      if (SetCCVT.getScalarSizeInBits() != 1 &&
+          (!LegalOperations || TLI.isOperationLegal(ISD::SETCC, N00VT))) {
+        SDValue SetCC = DAG.getSetCC(DL, SetCCVT, N00, N01, CC);
+        return DAG.getSelect(DL, VT, SetCC, ExtTrueVal, Zero);
       }
     }
   }
@@ -6784,7 +7108,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
   // fold (sext x) -> (zext x) if the sign bit is known zero.
   if ((!LegalOperations || TLI.isOperationLegal(ISD::ZERO_EXTEND, VT)) &&
       DAG.SignBitIsZero(N0))
-    return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, N0);
+    return DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0);
 
   return SDValue();
 }
@@ -6959,13 +7283,14 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
                                        LN0->getChain(),
                                        LN0->getBasePtr(), N0.getValueType(),
                                        LN0->getMemOperand());
-      CombineTo(N, ExtLoad);
+
       SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
                                   N0.getValueType(), ExtLoad);
       CombineTo(N0.getNode(), Trunc, ExtLoad.getValue(1));
 
       ExtendSetCCUses(SetCCs, Trunc, ExtLoad, SDLoc(N),
                       ISD::ZERO_EXTEND);
+      CombineTo(N, ExtLoad);
       return SDValue(N, 0);   // Return N so it doesn't get rechecked!
     }
   }
@@ -7273,9 +7598,25 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
   return SDValue();
 }
 
+SDValue DAGCombiner::visitAssertZext(SDNode *N) { // Simplify AssertZext.
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1); // the vt operand (a VTSDNode)
+  EVT EVT = cast<VTSDNode>(N1)->getVT(); // NOTE(review): hides type name EVT
+
+  // fold (assertzext (assertzext x, vt), vt) -> (assertzext x, vt)
+  if (N0.getOpcode() == ISD::AssertZext &&
+      EVT == cast<VTSDNode>(N0.getOperand(1))->getVT()) // same vt on both
+    return N0; // reuse the inner assertzext; the outer one adds nothing
+
+  return SDValue(); // null SDValue: the fold did not match
+}
+
 /// See if the specified operand can be simplified with the knowledge that only
 /// the bits specified by Mask are used.  If so, return the simpler operand,
 /// otherwise return a null SDValue.
+///
+/// (This exists alongside SimplifyDemandedBits because GetDemandedBits can
+/// simplify nodes with multiple uses more aggressively.)
 SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {
   switch (V.getOpcode()) {
   default: break;
@@ -7311,6 +7652,14 @@ SDValue DAGCombiner::GetDemandedBits(SDValue V, const APInt &Mask) {
         return DAG.getNode(ISD::SRL, SDLoc(V), V.getValueType(),
                            SimplifyLHS, V.getOperand(1));
     }
+    break;
+  case ISD::AND: {
+    // X & -1 -> X (ignoring bits which aren't demanded).
+    ConstantSDNode *AndVal = isConstOrConstSplat(V.getOperand(1));
+    if (AndVal && (AndVal->getAPIntValue() & Mask) == Mask)
+      return V.getOperand(0);
+    break;
+  }
   }
   return SDValue();
 }
@@ -7526,6 +7875,16 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
       return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1);
   }
 
+  // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_in_reg x)
+  if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
+       N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ||
+       N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
+      N0.getOperand(0).getScalarValueSizeInBits() == EVTBits) {
+    if (!LegalOperations ||
+        TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT))
+      return DAG.getSignExtendVectorInReg(N0.getOperand(0), SDLoc(N), VT);
+  }
+
   // fold (sext_in_reg (zext x)) -> (sext x)
   // iff we are extending the source sign bit.
   if (N0.getOpcode() == ISD::ZERO_EXTEND) {
@@ -7536,7 +7895,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
   }
 
   // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero.
-  if (DAG.MaskedValueIsZero(N0, APInt::getBitsSet(VTBits, EVTBits-1, EVTBits)))
+  if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, EVTBits - 1)))
     return DAG.getZeroExtendInReg(N0, SDLoc(N), EVT.getScalarType());
 
   // fold operands of sext_in_reg based on knowledge that the top bits are not
@@ -7778,6 +8137,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
                                                      VT.getSizeInBits())))
       return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Shorter);
   }
+
   // fold (truncate (load x)) -> (smaller load x)
   // fold (truncate (srl (load x), c)) -> (smaller load (x+c/evtbits))
   if (!LegalTypes || TLI.isTypeDesirableForOp(N0.getOpcode(), VT)) {
@@ -7799,6 +8159,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       }
     }
   }
+
   // fold (trunc (concat ... x ...)) -> (concat ..., (trunc x), ...)),
   // where ... are all 'undef'.
   if (N0.getOpcode() == ISD::CONCAT_VECTORS && !LegalTypes) {
@@ -7864,6 +8225,18 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       SimplifyDemandedBits(SDValue(N, 0)))
     return SDValue(N, 0);
 
+  // (trunc adde(X, Y, Carry)) -> (adde trunc(X), trunc(Y), Carry)
+  // When the adde's carry is not used.
+  if (N0.getOpcode() == ISD::ADDE && N0.hasOneUse() &&
+      !N0.getNode()->hasAnyUseOfValue(1) &&
+      (!LegalOperations || TLI.isOperationLegal(ISD::ADDE, VT))) {
+    SDLoc SL(N);
+    auto X = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(0));
+    auto Y = DAG.getNode(ISD::TRUNCATE, SL, VT, N0.getOperand(1));
+    return DAG.getNode(ISD::ADDE, SL, DAG.getVTList(VT, MVT::Glue),
+                       X, Y, N0.getOperand(2));
+  }
+
   return SDValue();
 }
 
@@ -7954,6 +8327,9 @@ SDValue DAGCombiner::visitBITCAST(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
 
+  if (N0.isUndef())
+    return DAG.getUNDEF(VT);
+
   // If the input is a BUILD_VECTOR with all constant elements, fold this now.
   // Only do this before legalize, since afterward the target may be depending
   // on the bitconvert.
@@ -8322,6 +8698,11 @@ ConstantFoldBITCASTofBUILD_VECTOR(SDNode *BV, EVT DstEltVT) {
   return DAG.getBuildVector(VT, DL, Ops);
 }
 
+static bool isContractable(SDNode *N) { // True if N's flags allow contraction.
+  SDNodeFlags F = cast<BinaryWithFlagsSDNode>(N)->Flags;
+  return F.hasAllowContract() || F.hasUnsafeAlgebra(); // either flag suffices
+}
+
 /// Try to perform FMA combining on a given FADD node.
 SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
   SDValue N0 = N->getOperand(0);
@@ -8330,24 +8711,27 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
   SDLoc SL(N);
 
   const TargetOptions &Options = DAG.getTarget().Options;
-  bool AllowFusion =
-      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
 
   // Floating-point multiply-add with intermediate rounding.
   bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
 
   // Floating-point multiply-add without intermediate rounding.
   bool HasFMA =
-      AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) &&
+      TLI.isFMAFasterThanFMulAndFAdd(VT) &&
       (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
 
   // No valid opcode, do not combine.
   if (!HasFMAD && !HasFMA)
     return SDValue();
 
+  bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
+                              Options.UnsafeFPMath || HasFMAD);
+  // If the addition is not contractable, do not combine.
+  if (!AllowFusionGlobally && !isContractable(N))
+    return SDValue();
+
   const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
-  ;
-  if (AllowFusion && STI && STI->generateFMAsInMachineCombiner(OptLevel))
+  if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
     return SDValue();
 
   // Always prefer FMAD to FMA for precision.
@@ -8355,35 +8739,39 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
   bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
   bool LookThroughFPExt = TLI.isFPExtFree(VT);
 
+  // Returns true if the node is an FMUL that is contractable, either due to
+  // global flags or its own SDNodeFlags.
+  auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
+    if (N.getOpcode() != ISD::FMUL)
+      return false;
+    return AllowFusionGlobally || isContractable(N.getNode());
+  };
   // If we have two choices trying to fold (fadd (fmul u, v), (fmul x, y)),
   // prefer to fold the multiply with fewer uses.
-  if (Aggressive && N0.getOpcode() == ISD::FMUL &&
-      N1.getOpcode() == ISD::FMUL) {
+  if (Aggressive && isContractableFMUL(N0) && isContractableFMUL(N1)) {
     if (N0.getNode()->use_size() > N1.getNode()->use_size())
       std::swap(N0, N1);
   }
 
   // fold (fadd (fmul x, y), z) -> (fma x, y, z)
-  if (N0.getOpcode() == ISD::FMUL &&
-      (Aggressive || N0->hasOneUse())) {
+  if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
     return DAG.getNode(PreferredFusedOpcode, SL, VT,
                        N0.getOperand(0), N0.getOperand(1), N1);
   }
 
   // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
   // Note: Commutes FADD operands.
-  if (N1.getOpcode() == ISD::FMUL &&
-      (Aggressive || N1->hasOneUse())) {
+  if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) {
     return DAG.getNode(PreferredFusedOpcode, SL, VT,
                        N1.getOperand(0), N1.getOperand(1), N0);
   }
 
   // Look through FP_EXTEND nodes to do more combining.
-  if (AllowFusion && LookThroughFPExt) {
+  if (LookThroughFPExt) {
     // fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
     if (N0.getOpcode() == ISD::FP_EXTEND) {
       SDValue N00 = N0.getOperand(0);
-      if (N00.getOpcode() == ISD::FMUL)
+      if (isContractableFMUL(N00))
         return DAG.getNode(PreferredFusedOpcode, SL, VT,
                            DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                        N00.getOperand(0)),
@@ -8395,7 +8783,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
     // Note: Commutes FADD operands.
     if (N1.getOpcode() == ISD::FP_EXTEND) {
       SDValue N10 = N1.getOperand(0);
-      if (N10.getOpcode() == ISD::FMUL)
+      if (isContractableFMUL(N10))
         return DAG.getNode(PreferredFusedOpcode, SL, VT,
                            DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                        N10.getOperand(0)),
@@ -8405,9 +8793,12 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
   }
 
   // More folding opportunities when target permits.
-  if ((AllowFusion || HasFMAD)  && Aggressive) {
+  if (Aggressive) {
     // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z))
-    if (N0.getOpcode() == PreferredFusedOpcode &&
+    // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
+    // are currently only supported on binary nodes.
+    if (Options.UnsafeFPMath &&
+        N0.getOpcode() == PreferredFusedOpcode &&
         N0.getOperand(2).getOpcode() == ISD::FMUL &&
         N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) {
       return DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -8419,7 +8810,10 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
     }
 
     // fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x))
-    if (N1->getOpcode() == PreferredFusedOpcode &&
+    // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
+    // are currently only supported on binary nodes.
+    if (Options.UnsafeFPMath &&
+        N1->getOpcode() == PreferredFusedOpcode &&
         N1.getOperand(2).getOpcode() == ISD::FMUL &&
         N1->hasOneUse() && N1.getOperand(2)->hasOneUse()) {
       return DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -8430,7 +8824,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
                                      N0));
     }
 
-    if (AllowFusion && LookThroughFPExt) {
+    if (LookThroughFPExt) {
       // fold (fadd (fma x, y, (fpext (fmul u, v))), z)
       //   -> (fma x, y, (fma (fpext u), (fpext v), z))
       auto FoldFAddFMAFPExtFMul = [&] (
@@ -8445,7 +8839,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
         SDValue N02 = N0.getOperand(2);
         if (N02.getOpcode() == ISD::FP_EXTEND) {
           SDValue N020 = N02.getOperand(0);
-          if (N020.getOpcode() == ISD::FMUL)
+          if (isContractableFMUL(N020))
             return FoldFAddFMAFPExtFMul(N0.getOperand(0), N0.getOperand(1),
                                         N020.getOperand(0), N020.getOperand(1),
                                         N1);
@@ -8471,7 +8865,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
         SDValue N00 = N0.getOperand(0);
         if (N00.getOpcode() == PreferredFusedOpcode) {
           SDValue N002 = N00.getOperand(2);
-          if (N002.getOpcode() == ISD::FMUL)
+          if (isContractableFMUL(N002))
             return FoldFAddFPExtFMAFMul(N00.getOperand(0), N00.getOperand(1),
                                         N002.getOperand(0), N002.getOperand(1),
                                         N1);
@@ -8484,7 +8878,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
         SDValue N12 = N1.getOperand(2);
         if (N12.getOpcode() == ISD::FP_EXTEND) {
           SDValue N120 = N12.getOperand(0);
-          if (N120.getOpcode() == ISD::FMUL)
+          if (isContractableFMUL(N120))
             return FoldFAddFMAFPExtFMul(N1.getOperand(0), N1.getOperand(1),
                                         N120.getOperand(0), N120.getOperand(1),
                                         N0);
@@ -8500,7 +8894,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
         SDValue N10 = N1.getOperand(0);
         if (N10.getOpcode() == PreferredFusedOpcode) {
           SDValue N102 = N10.getOperand(2);
-          if (N102.getOpcode() == ISD::FMUL)
+          if (isContractableFMUL(N102))
             return FoldFAddFPExtFMAFMul(N10.getOperand(0), N10.getOperand(1),
                                         N102.getOperand(0), N102.getOperand(1),
                                         N0);
@@ -8520,23 +8914,26 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
   SDLoc SL(N);
 
   const TargetOptions &Options = DAG.getTarget().Options;
-  bool AllowFusion =
-      (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath);
-
   // Floating-point multiply-add with intermediate rounding.
   bool HasFMAD = (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
 
   // Floating-point multiply-add without intermediate rounding.
   bool HasFMA =
-      AllowFusion && TLI.isFMAFasterThanFMulAndFAdd(VT) &&
+      TLI.isFMAFasterThanFMulAndFAdd(VT) &&
       (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FMA, VT));
 
   // No valid opcode, do not combine.
   if (!HasFMAD && !HasFMA)
     return SDValue();
 
+  bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
+                              Options.UnsafeFPMath || HasFMAD);
+  // If the subtraction is not contractable, do not combine.
+  if (!AllowFusionGlobally && !isContractable(N))
+    return SDValue();
+
   const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
-  if (AllowFusion && STI && STI->generateFMAsInMachineCombiner(OptLevel))
+  if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
     return SDValue();
 
   // Always prefer FMAD to FMA for precision.
@@ -8544,9 +8941,16 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
   bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
   bool LookThroughFPExt = TLI.isFPExtFree(VT);
 
+  // Is the node an FMUL and contractable either due to global flags or
+  // SDNodeFlags.
+  auto isContractableFMUL = [AllowFusionGlobally](SDValue N) {
+    if (N.getOpcode() != ISD::FMUL)
+      return false;
+    return AllowFusionGlobally || isContractable(N.getNode());
+  };
+
   // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
-  if (N0.getOpcode() == ISD::FMUL &&
-      (Aggressive || N0->hasOneUse())) {
+  if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
     return DAG.getNode(PreferredFusedOpcode, SL, VT,
                        N0.getOperand(0), N0.getOperand(1),
                        DAG.getNode(ISD::FNEG, SL, VT, N1));
@@ -8554,16 +8958,14 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
 
   // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
   // Note: Commutes FSUB operands.
-  if (N1.getOpcode() == ISD::FMUL &&
-      (Aggressive || N1->hasOneUse()))
+  if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse()))
     return DAG.getNode(PreferredFusedOpcode, SL, VT,
                        DAG.getNode(ISD::FNEG, SL, VT,
                                    N1.getOperand(0)),
                        N1.getOperand(1), N0);
 
   // fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
-  if (N0.getOpcode() == ISD::FNEG &&
-      N0.getOperand(0).getOpcode() == ISD::FMUL &&
+  if (N0.getOpcode() == ISD::FNEG && isContractableFMUL(N0.getOperand(0)) &&
       (Aggressive || (N0->hasOneUse() && N0.getOperand(0).hasOneUse()))) {
     SDValue N00 = N0.getOperand(0).getOperand(0);
     SDValue N01 = N0.getOperand(0).getOperand(1);
@@ -8573,12 +8975,12 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
   }
 
   // Look through FP_EXTEND nodes to do more combining.
-  if (AllowFusion && LookThroughFPExt) {
+  if (LookThroughFPExt) {
     // fold (fsub (fpext (fmul x, y)), z)
     //   -> (fma (fpext x), (fpext y), (fneg z))
     if (N0.getOpcode() == ISD::FP_EXTEND) {
       SDValue N00 = N0.getOperand(0);
-      if (N00.getOpcode() == ISD::FMUL)
+      if (isContractableFMUL(N00))
         return DAG.getNode(PreferredFusedOpcode, SL, VT,
                            DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                        N00.getOperand(0)),
@@ -8592,7 +8994,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
     // Note: Commutes FSUB operands.
     if (N1.getOpcode() == ISD::FP_EXTEND) {
       SDValue N10 = N1.getOperand(0);
-      if (N10.getOpcode() == ISD::FMUL)
+      if (isContractableFMUL(N10))
         return DAG.getNode(PreferredFusedOpcode, SL, VT,
                            DAG.getNode(ISD::FNEG, SL, VT,
                                        DAG.getNode(ISD::FP_EXTEND, SL, VT,
@@ -8612,7 +9014,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
       SDValue N00 = N0.getOperand(0);
       if (N00.getOpcode() == ISD::FNEG) {
         SDValue N000 = N00.getOperand(0);
-        if (N000.getOpcode() == ISD::FMUL) {
+        if (isContractableFMUL(N000)) {
           return DAG.getNode(ISD::FNEG, SL, VT,
                              DAG.getNode(PreferredFusedOpcode, SL, VT,
                                          DAG.getNode(ISD::FP_EXTEND, SL, VT,
@@ -8634,7 +9036,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
       SDValue N00 = N0.getOperand(0);
       if (N00.getOpcode() == ISD::FP_EXTEND) {
         SDValue N000 = N00.getOperand(0);
-        if (N000.getOpcode() == ISD::FMUL) {
+        if (isContractableFMUL(N000)) {
           return DAG.getNode(ISD::FNEG, SL, VT,
                              DAG.getNode(PreferredFusedOpcode, SL, VT,
                                          DAG.getNode(ISD::FP_EXTEND, SL, VT,
@@ -8649,12 +9051,14 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
   }
 
   // More folding opportunities when target permits.
-  if ((AllowFusion || HasFMAD) && Aggressive) {
+  if (Aggressive) {
     // fold (fsub (fma x, y, (fmul u, v)), z)
     //   -> (fma x, y (fma u, v, (fneg z)))
-    if (N0.getOpcode() == PreferredFusedOpcode &&
-        N0.getOperand(2).getOpcode() == ISD::FMUL &&
-        N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) {
+    // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
+    // are currently only supported on binary nodes.
+    if (Options.UnsafeFPMath && N0.getOpcode() == PreferredFusedOpcode &&
+        isContractableFMUL(N0.getOperand(2)) && N0->hasOneUse() &&
+        N0.getOperand(2)->hasOneUse()) {
       return DAG.getNode(PreferredFusedOpcode, SL, VT,
                          N0.getOperand(0), N0.getOperand(1),
                          DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -8666,8 +9070,10 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
 
     // fold (fsub x, (fma y, z, (fmul u, v)))
     //   -> (fma (fneg y), z, (fma (fneg u), v, x))
-    if (N1.getOpcode() == PreferredFusedOpcode &&
-        N1.getOperand(2).getOpcode() == ISD::FMUL) {
+    // FIXME: The UnsafeAlgebra flag should be propagated to FMA/FMAD, but FMF
+    // are currently only supported on binary nodes.
+    if (Options.UnsafeFPMath && N1.getOpcode() == PreferredFusedOpcode &&
+        isContractableFMUL(N1.getOperand(2))) {
       SDValue N20 = N1.getOperand(2).getOperand(0);
       SDValue N21 = N1.getOperand(2).getOperand(1);
       return DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -8680,14 +9086,14 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
                                      N21, N0));
     }
 
-    if (AllowFusion && LookThroughFPExt) {
+    if (LookThroughFPExt) {
       // fold (fsub (fma x, y, (fpext (fmul u, v))), z)
       //   -> (fma x, y (fma (fpext u), (fpext v), (fneg z)))
       if (N0.getOpcode() == PreferredFusedOpcode) {
         SDValue N02 = N0.getOperand(2);
         if (N02.getOpcode() == ISD::FP_EXTEND) {
           SDValue N020 = N02.getOperand(0);
-          if (N020.getOpcode() == ISD::FMUL)
+          if (isContractableFMUL(N020))
             return DAG.getNode(PreferredFusedOpcode, SL, VT,
                                N0.getOperand(0), N0.getOperand(1),
                                DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -8710,7 +9116,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
         SDValue N00 = N0.getOperand(0);
         if (N00.getOpcode() == PreferredFusedOpcode) {
           SDValue N002 = N00.getOperand(2);
-          if (N002.getOpcode() == ISD::FMUL)
+          if (isContractableFMUL(N002))
             return DAG.getNode(PreferredFusedOpcode, SL, VT,
                                DAG.getNode(ISD::FP_EXTEND, SL, VT,
                                            N00.getOperand(0)),
@@ -8731,7 +9137,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
       if (N1.getOpcode() == PreferredFusedOpcode &&
         N1.getOperand(2).getOpcode() == ISD::FP_EXTEND) {
         SDValue N120 = N1.getOperand(2).getOperand(0);
-        if (N120.getOpcode() == ISD::FMUL) {
+        if (isContractableFMUL(N120)) {
           SDValue N1200 = N120.getOperand(0);
           SDValue N1201 = N120.getOperand(1);
           return DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -8758,7 +9164,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
         SDValue N100 = N1.getOperand(0).getOperand(0);
         SDValue N101 = N1.getOperand(0).getOperand(1);
         SDValue N102 = N1.getOperand(0).getOperand(2);
-        if (N102.getOpcode() == ISD::FMUL) {
+        if (isContractableFMUL(N102)) {
           SDValue N1020 = N102.getOperand(0);
           SDValue N1021 = N102.getOperand(1);
           return DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -8894,6 +9300,9 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
   if (N0CFP && !N1CFP)
     return DAG.getNode(ISD::FADD, DL, VT, N1, N0, Flags);
 
+  if (SDValue NewSel = foldBinOpIntoSelect(N))
+    return NewSel;
+
   // fold (fadd A, (fneg B)) -> (fsub A, B)
   if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
       isNegatibleForFree(N1, LegalOperations, TLI, &Options) == 2)
@@ -8907,7 +9316,7 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
                        GetNegatedExpression(N0, DAG, LegalOperations), Flags);
 
   // FIXME: Auto-upgrade the target/function-level option.
-  if (Options.UnsafeFPMath || N->getFlags()->hasNoSignedZeros()) {
+  if (Options.NoSignedZerosFPMath || N->getFlags()->hasNoSignedZeros()) {
     // fold (fadd A, 0) -> A
     if (ConstantFPSDNode *N1C = isConstOrConstSplatFP(N1))
       if (N1C->isZero())
@@ -9041,13 +9450,16 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
   if (N0CFP && N1CFP)
     return DAG.getNode(ISD::FSUB, DL, VT, N0, N1, Flags);
 
+  if (SDValue NewSel = foldBinOpIntoSelect(N))
+    return NewSel;
+
   // fold (fsub A, (fneg B)) -> (fadd A, B)
   if (isNegatibleForFree(N1, LegalOperations, TLI, &Options))
     return DAG.getNode(ISD::FADD, DL, VT, N0,
                        GetNegatedExpression(N1, DAG, LegalOperations), Flags);
 
   // FIXME: Auto-upgrade the target/function-level option.
-  if (Options.UnsafeFPMath || N->getFlags()->hasNoSignedZeros()) {
+  if (Options.NoSignedZerosFPMath  || N->getFlags()->hasNoSignedZeros()) {
     // (fsub 0, B) -> -B
     if (N0CFP && N0CFP->isZero()) {
       if (isNegatibleForFree(N1, LegalOperations, TLI, &Options))
@@ -9120,6 +9532,9 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
   if (N1CFP && N1CFP->isExactlyValue(1.0))
     return N0;
 
+  if (SDValue NewSel = foldBinOpIntoSelect(N))
+    return NewSel;
+
   if (Options.UnsafeFPMath) {
     // fold (fmul A, 0) -> 0
     if (N1CFP && N1CFP->isZero())
@@ -9374,6 +9789,9 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
   if (N0CFP && N1CFP)
     return DAG.getNode(ISD::FDIV, SDLoc(N), VT, N0, N1, Flags);
 
+  if (SDValue NewSel = foldBinOpIntoSelect(N))
+    return NewSel;
+
   if (Options.UnsafeFPMath) {
     // fold (fdiv X, c2) -> fmul X, 1/c2 if losing precision is acceptable.
     if (N1CFP) {
@@ -9477,6 +9895,9 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
     return DAG.getNode(ISD::FREM, SDLoc(N), VT, N0, N1,
                        &cast<BinaryWithFlagsSDNode>(N)->Flags);
 
+  if (SDValue NewSel = foldBinOpIntoSelect(N))
+    return NewSel;
+
   return SDValue();
 }
 
@@ -10631,7 +11052,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
               dbgs() << "\n");
         WorklistRemover DeadNodes(*this);
         DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
-
+        AddUsersToWorklist(Chain.getNode());
         if (N->use_empty())
           deleteAndRecombine(N);
 
@@ -10684,7 +11105,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
       StoreSDNode *PrevST = cast<StoreSDNode>(Chain);
       if (PrevST->getBasePtr() == Ptr &&
           PrevST->getValue().getValueType() == N->getValueType(0))
-      return CombineTo(N, Chain.getOperand(1), Chain);
+        return CombineTo(N, PrevST->getOperand(1), Chain);
     }
   }
 
@@ -10702,14 +11123,7 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
     }
   }
 
-  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
-                                                  : DAG.getSubtarget().useAA();
-#ifndef NDEBUG
-  if (CombinerAAOnlyFunc.getNumOccurrences() &&
-      CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
-    UseAA = false;
-#endif
-  if (UseAA && LD->isUnindexed()) {
+  if (LD->isUnindexed()) {
     // Walk up chain skipping non-aliasing memory nodes.
     SDValue BetterChain = FindBetterChain(N, Chain);
 
@@ -11291,6 +11705,7 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) {
   SDValue Chain = DAG.getNode(ISD::TokenFactor, SDLoc(LD), MVT::Other,
                               ArgChains);
   DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Chain);
+  AddToWorklist(Chain.getNode());
   return true;
 }
 
@@ -11684,18 +12099,24 @@ bool DAGCombiner::isMulAddWithConstProfitable(SDNode *MulNode,
   return false;
 }
 
-SDValue DAGCombiner::getMergedConstantVectorStore(
-    SelectionDAG &DAG, const SDLoc &SL, ArrayRef<MemOpLink> Stores,
-    SmallVectorImpl<SDValue> &Chains, EVT Ty) const {
-  SmallVector<SDValue, 8> BuildVector;
+SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
+                                         unsigned NumStores) {
+  SmallVector<SDValue, 8> Chains;
+  SmallPtrSet<const SDNode *, 8> Visited;
+  SDLoc StoreDL(StoreNodes[0].MemNode);
+
+  for (unsigned i = 0; i < NumStores; ++i) {
+    Visited.insert(StoreNodes[i].MemNode);
+  }
 
-  for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) {
-    StoreSDNode *St = cast<StoreSDNode>(Stores[I].MemNode);
-    Chains.push_back(St->getChain());
-    BuildVector.push_back(St->getValue());
+  // Don't include chains that are themselves stores being merged.
+  for (unsigned i = 0; i < NumStores; ++i) {
+    if (Visited.count(StoreNodes[i].MemNode->getChain().getNode()) == 0)
+      Chains.push_back(StoreNodes[i].MemNode->getChain());
   }
 
-  return DAG.getBuildVector(Ty, SL, BuildVector);
+  assert(Chains.size() > 0 && "Chain should have generated a chain");
+  return DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, Chains);
 }
 
 bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
@@ -11706,22 +12127,8 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
     return false;
 
   int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8;
-  LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
-  unsigned LatestNodeUsed = 0;
-
-  for (unsigned i=0; i < NumStores; ++i) {
-    // Find a chain for the new wide-store operand. Notice that some
-    // of the store nodes that we found may not be selected for inclusion
-    // in the wide store. The chain we use needs to be the chain of the
-    // latest store node which is *used* and replaced by the wide store.
-    if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum)
-      LatestNodeUsed = i;
-  }
-
-  SmallVector<SDValue, 8> Chains;
 
   // The latest Node in the DAG.
-  LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode;
   SDLoc DL(StoreNodes[0].MemNode);
 
   SDValue StoredVal;
@@ -11737,7 +12144,18 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
     assert(TLI.isTypeLegal(Ty) && "Illegal vector store");
 
     if (IsConstantSrc) {
-      StoredVal = getMergedConstantVectorStore(DAG, DL, StoreNodes, Chains, Ty);
+      SmallVector<SDValue, 8> BuildVector;
+      for (unsigned I = 0, E = Ty.getVectorNumElements(); I != E; ++I) {
+        StoreSDNode *St = cast<StoreSDNode>(StoreNodes[I].MemNode);
+        SDValue Val = St->getValue();
+        if (MemVT.getScalarType().isInteger())
+          if (auto *CFP = dyn_cast<ConstantFPSDNode>(St->getValue()))
+            Val = DAG.getConstant(
+                (uint32_t)CFP->getValueAPF().bitcastToAPInt().getZExtValue(),
+                SDLoc(CFP), MemVT);
+        BuildVector.push_back(Val);
+      }
+      StoredVal = DAG.getBuildVector(Ty, DL, BuildVector);
     } else {
       SmallVector<SDValue, 8> Ops;
       for (unsigned i = 0; i < NumStores; ++i) {
@@ -11747,7 +12165,6 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
         if (Val.getValueType() != MemVT)
           return false;
         Ops.push_back(Val);
-        Chains.push_back(St->getChain());
       }
 
       // Build the extracted vector elements back into a vector.
@@ -11767,7 +12184,6 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
     for (unsigned i = 0; i < NumStores; ++i) {
       unsigned Idx = IsLE ? (NumStores - 1 - i) : i;
       StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[Idx].MemNode);
-      Chains.push_back(St->getChain());
 
       SDValue Val = St->getValue();
       StoreInt <<= ElementSizeBytes * 8;
@@ -11785,54 +12201,27 @@ bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
     StoredVal = DAG.getConstant(StoreInt, DL, StoreTy);
   }
 
-  assert(!Chains.empty());
-
-  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+  SDValue NewChain = getMergeStoreChains(StoreNodes, NumStores);
   SDValue NewStore = DAG.getStore(NewChain, DL, StoredVal,
                                   FirstInChain->getBasePtr(),
                                   FirstInChain->getPointerInfo(),
                                   FirstInChain->getAlignment());
 
-  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
-                                                  : DAG.getSubtarget().useAA();
-  if (UseAA) {
-    // Replace all merged stores with the new store.
-    for (unsigned i = 0; i < NumStores; ++i)
-      CombineTo(StoreNodes[i].MemNode, NewStore);
-  } else {
-    // Replace the last store with the new store.
-    CombineTo(LatestOp, NewStore);
-    // Erase all other stores.
-    for (unsigned i = 0; i < NumStores; ++i) {
-      if (StoreNodes[i].MemNode == LatestOp)
-        continue;
-      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
-      // ReplaceAllUsesWith will replace all uses that existed when it was
-      // called, but graph optimizations may cause new ones to appear. For
-      // example, the case in pr14333 looks like
-      //
-      //  St's chain -> St -> another store -> X
-      //
-      // And the only difference from St to the other store is the chain.
-      // When we change it's chain to be St's chain they become identical,
-      // get CSEed and the net result is that X is now a use of St.
-      // Since we know that St is redundant, just iterate.
-      while (!St->use_empty())
-        DAG.ReplaceAllUsesWith(SDValue(St, 0), St->getChain());
-      deleteAndRecombine(St);
-    }
-  }
+  // Replace all merged stores with the new store.
+  for (unsigned i = 0; i < NumStores; ++i)
+    CombineTo(StoreNodes[i].MemNode, NewStore);
 
-  StoreNodes.erase(StoreNodes.begin() + NumStores, StoreNodes.end());
+  AddToWorklist(NewChain.getNode());
   return true;
 }
 
-void DAGCombiner::getStoreMergeAndAliasCandidates(
-    StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes,
-    SmallVectorImpl<LSBaseSDNode*> &AliasLoadNodes) {
+void DAGCombiner::getStoreMergeCandidates(
+    StoreSDNode *St, SmallVectorImpl<MemOpLink> &StoreNodes) {
   // This holds the base pointer, index, and the offset in bytes from the base
   // pointer.
   BaseIndexOffset BasePtr = BaseIndexOffset::match(St->getBasePtr(), DAG);
+  EVT MemVT = St->getMemoryVT();
 
   // We must have a base and an offset.
   if (!BasePtr.Base.getNode())
@@ -11842,104 +12231,71 @@ void DAGCombiner::getStoreMergeAndAliasCandidates(
   if (BasePtr.Base.isUndef())
     return;
 
-  // Walk up the chain and look for nodes with offsets from the same
-  // base pointer. Stop when reaching an instruction with a different kind
-  // or instruction which has a different base pointer.
-  EVT MemVT = St->getMemoryVT();
-  unsigned Seq = 0;
-  StoreSDNode *Index = St;
-
-
-  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
-                                                  : DAG.getSubtarget().useAA();
-
-  if (UseAA) {
-    // Look at other users of the same chain. Stores on the same chain do not
-    // alias. If combiner-aa is enabled, non-aliasing stores are canonicalized
-    // to be on the same chain, so don't bother looking at adjacent chains.
-
-    SDValue Chain = St->getChain();
-    for (auto I = Chain->use_begin(), E = Chain->use_end(); I != E; ++I) {
-      if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) {
-        if (I.getOperandNo() != 0)
-          continue;
-
-        if (OtherST->isVolatile() || OtherST->isIndexed())
-          continue;
-
-        if (OtherST->getMemoryVT() != MemVT)
-          continue;
-
-        BaseIndexOffset Ptr = BaseIndexOffset::match(OtherST->getBasePtr(), DAG);
-
-        if (Ptr.equalBaseIndex(BasePtr))
-          StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset, Seq++));
-      }
-    }
-
-    return;
-  }
-
-  while (Index) {
-    // If the chain has more than one use, then we can't reorder the mem ops.
-    if (Index != St && !SDValue(Index, 0)->hasOneUse())
-      break;
-
-    // Find the base pointer and offset for this memory node.
-    BaseIndexOffset Ptr = BaseIndexOffset::match(Index->getBasePtr(), DAG);
-
-    // Check that the base pointer is the same as the original one.
+  bool IsLoadSrc = isa<LoadSDNode>(St->getValue());
+  bool IsConstantSrc = isa<ConstantSDNode>(St->getValue()) ||
+                       isa<ConstantFPSDNode>(St->getValue());
+  bool IsExtractVecSrc =
+      (St->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+       St->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR);
+  auto CandidateMatch = [&](StoreSDNode *Other, BaseIndexOffset &Ptr) -> bool {
+    if (Other->isVolatile() || Other->isIndexed())
+      return false;
+    // We can merge constant floats to equivalent integers
+    if (Other->getMemoryVT() != MemVT)
+      if (!(MemVT.isInteger() && MemVT.bitsEq(Other->getMemoryVT()) &&
+            isa<ConstantFPSDNode>(Other->getValue())))
+        return false;
+    Ptr = BaseIndexOffset::match(Other->getBasePtr(), DAG);
     if (!Ptr.equalBaseIndex(BasePtr))
-      break;
-
-    // The memory operands must not be volatile.
-    if (Index->isVolatile() || Index->isIndexed())
-      break;
-
-    // No truncation.
-    if (Index->isTruncatingStore())
-      break;
-
-    // The stored memory type must be the same.
-    if (Index->getMemoryVT() != MemVT)
-      break;
+      return false;
+    if (IsLoadSrc)
+      return isa<LoadSDNode>(Other->getValue());
+    if (IsConstantSrc)
+      return (isa<ConstantSDNode>(Other->getValue()) ||
+              isa<ConstantFPSDNode>(Other->getValue()));
+    if (IsExtractVecSrc)
+      return (Other->getValue().getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+              Other->getValue().getOpcode() == ISD::EXTRACT_SUBVECTOR);
+    return false;
+  };
+  // We are looking for a root node which is an ancestor to all mergeable
+  // stores. We search up through a load to our root, and then down
+  // through all children. For instance, we will find Store{1,2,3} if
+  // St is Store1, Store2, or Store3, where the root is not a load,
+  // which is always true for nonvolatile ops. TODO: Expand
+  // the search to find all valid candidates through multiple layers of loads.
+  //
+  // Root
+  // |-------|-------|
+  // Load    Load    Store3
+  // |       |
+  // Store1   Store2
+  //
+  // FIXME: We should be able to climb and
+  // descend TokenFactors to find candidates as well.
 
-    // We do not allow under-aligned stores in order to prevent
-    // overriding stores. NOTE: this is a bad hack. Alignment SHOULD
-    // be irrelevant here; what MATTERS is that we not move memory
-    // operations that potentially overlap past each-other.
-    if (Index->getAlignment() < MemVT.getStoreSize())
-      break;
+  SDNode *RootNode = (St->getChain()).getNode();
 
-    // We found a potential memory operand to merge.
-    StoreNodes.push_back(MemOpLink(Index, Ptr.Offset, Seq++));
+  // Set of Parents of Candidates
+  std::set<SDNode *> CandidateParents;
 
-    // Find the next memory operand in the chain. If the next operand in the
-    // chain is a store then move up and continue the scan with the next
-    // memory operand. If the next operand is a load save it and use alias
-    // information to check if it interferes with anything.
-    SDNode *NextInChain = Index->getChain().getNode();
-    while (1) {
-      if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) {
-        // We found a store node. Use it for the next iteration.
-        Index = STn;
-        break;
-      } else if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(NextInChain)) {
-        if (Ldn->isVolatile()) {
-          Index = nullptr;
-          break;
+  if (LoadSDNode *Ldn = dyn_cast<LoadSDNode>(RootNode)) {
+    RootNode = Ldn->getChain().getNode();
+    for (auto I = RootNode->use_begin(), E = RootNode->use_end(); I != E; ++I)
+      if (I.getOperandNo() == 0 && isa<LoadSDNode>(*I)) // walk down chain
+        CandidateParents.insert(*I);
+  } else
+    CandidateParents.insert(RootNode);
+
+  // Check all parents of mergeable children.
+  for (auto P = CandidateParents.begin(); P != CandidateParents.end(); ++P)
+    for (auto I = (*P)->use_begin(), E = (*P)->use_end(); I != E; ++I)
+      if (I.getOperandNo() == 0)
+        if (StoreSDNode *OtherST = dyn_cast<StoreSDNode>(*I)) {
+          BaseIndexOffset Ptr;
+          if (CandidateMatch(OtherST, Ptr))
+            StoreNodes.push_back(MemOpLink(OtherST, Ptr.Offset));
         }
-
-        // Save the load node for later. Continue the scan.
-        AliasLoadNodes.push_back(Ldn);
-        NextInChain = Ldn->getChain().getNode();
-        continue;
-      } else {
-        Index = nullptr;
-        break;
-      }
-    }
-  }
 }
 
 // We need to check that merging these stores does not cause a loop
@@ -11948,31 +12304,34 @@ void DAGCombiner::getStoreMergeAndAliasCandidates(
 // through the chain). Check in parallel by searching up from
 // non-chain operands of candidates.
 bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
-    SmallVectorImpl<MemOpLink> &StoreNodes) {
+    SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores) {
   SmallPtrSet<const SDNode *, 16> Visited;
   SmallVector<const SDNode *, 8> Worklist;
   // search ops of store candidates
-  for (unsigned i = 0; i < StoreNodes.size(); ++i) {
+  for (unsigned i = 0; i < NumStores; ++i) {
     SDNode *n = StoreNodes[i].MemNode;
     // Potential loops may happen only through non-chain operands
     for (unsigned j = 1; j < n->getNumOperands(); ++j)
       Worklist.push_back(n->getOperand(j).getNode());
   }
   // search through DAG. We can stop early if we find a storenode
-  for (unsigned i = 0; i < StoreNodes.size(); ++i) {
+  for (unsigned i = 0; i < NumStores; ++i) {
     if (SDNode::hasPredecessorHelper(StoreNodes[i].MemNode, Visited, Worklist))
       return false;
   }
   return true;
 }
 
-bool DAGCombiner::MergeConsecutiveStores(
-    StoreSDNode* St, SmallVectorImpl<MemOpLink> &StoreNodes) {
+bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
   if (OptLevel == CodeGenOpt::None)
     return false;
 
   EVT MemVT = St->getMemoryVT();
   int64_t ElementSizeBytes = MemVT.getSizeInBits() / 8;
+
+  if (MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits)
+    return false;
+
   bool NoVectors = DAG.getMachineFunction().getFunction()->hasFnAttribute(
       Attribute::NoImplicitFloat);
 
@@ -12001,145 +12360,137 @@ bool DAGCombiner::MergeConsecutiveStores(
   if (MemVT.isVector() && IsLoadSrc)
     return false;
 
-  // Only look at ends of store sequences.
-  SDValue Chain = SDValue(St, 0);
-  if (Chain->hasOneUse() && Chain->use_begin()->getOpcode() == ISD::STORE)
-    return false;
-
-  // Save the LoadSDNodes that we find in the chain.
-  // We need to make sure that these nodes do not interfere with
-  // any of the store nodes.
-  SmallVector<LSBaseSDNode*, 8> AliasLoadNodes;
-
-  getStoreMergeAndAliasCandidates(St, StoreNodes, AliasLoadNodes);
+  SmallVector<MemOpLink, 8> StoreNodes;
+  // Find potential store merge candidates by searching through chain sub-DAG
+  getStoreMergeCandidates(St, StoreNodes);
 
   // Check if there is anything to merge.
   if (StoreNodes.size() < 2)
     return false;
 
-  // only do dependence check in AA case
-  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
-                                                  : DAG.getSubtarget().useAA();
-  if (UseAA && !checkMergeStoreCandidatesForDependencies(StoreNodes))
-    return false;
-
   // Sort the memory operands according to their distance from the
-  // base pointer.  As a secondary criteria: make sure stores coming
-  // later in the code come first in the list. This is important for
-  // the non-UseAA case, because we're merging stores into the FINAL
-  // store along a chain which potentially contains aliasing stores.
-  // Thus, if there are multiple stores to the same address, the last
-  // one can be considered for merging but not the others.
+  // base pointer.
   std::sort(StoreNodes.begin(), StoreNodes.end(),
             [](MemOpLink LHS, MemOpLink RHS) {
-    return LHS.OffsetFromBase < RHS.OffsetFromBase ||
-           (LHS.OffsetFromBase == RHS.OffsetFromBase &&
-            LHS.SequenceNum < RHS.SequenceNum);
-  });
+              return LHS.OffsetFromBase < RHS.OffsetFromBase;
+            });
 
   // Scan the memory operations on the chain and find the first non-consecutive
   // store memory address.
-  unsigned LastConsecutiveStore = 0;
+  unsigned NumConsecutiveStores = 0;
   int64_t StartAddress = StoreNodes[0].OffsetFromBase;
-  for (unsigned i = 0, e = StoreNodes.size(); i < e; ++i) {
-
-    // Check that the addresses are consecutive starting from the second
-    // element in the list of stores.
-    if (i > 0) {
-      int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
-      if (CurrAddress - StartAddress != (ElementSizeBytes * i))
-        break;
-    }
 
-    // Check if this store interferes with any of the loads that we found.
-    // If we find a load that alias with this store. Stop the sequence.
-    if (any_of(AliasLoadNodes, [&](LSBaseSDNode *Ldn) {
-          return isAlias(Ldn, StoreNodes[i].MemNode);
-        }))
+  // Check that the addresses are consecutive starting from the second
+  // element in the list of stores.
+  for (unsigned i = 1, e = StoreNodes.size(); i < e; ++i) {
+    int64_t CurrAddress = StoreNodes[i].OffsetFromBase;
+    if (CurrAddress - StartAddress != (ElementSizeBytes * i))
       break;
-
-    // Mark this node as useful.
-    LastConsecutiveStore = i;
+    NumConsecutiveStores = i + 1;
   }
 
+  if (NumConsecutiveStores < 2)
+    return false;
+
+  // Check that we can merge these candidates without causing a cycle
+  if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumConsecutiveStores))
+    return false;
+
+
   // The node with the lowest store address.
-  LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
-  unsigned FirstStoreAS = FirstInChain->getAddressSpace();
-  unsigned FirstStoreAlign = FirstInChain->getAlignment();
   LLVMContext &Context = *DAG.getContext();
   const DataLayout &DL = DAG.getDataLayout();
 
   // Store the constants into memory as one consecutive store.
   if (IsConstantSrc) {
-    unsigned LastLegalType = 0;
-    unsigned LastLegalVectorType = 0;
-    bool NonZero = false;
-    for (unsigned i=0; i<LastConsecutiveStore+1; ++i) {
-      StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[i].MemNode);
-      SDValue StoredVal = St->getValue();
-
-      if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) {
-        NonZero |= !C->isNullValue();
-      } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal)) {
-        NonZero |= !C->getConstantFPValue()->isNullValue();
-      } else {
-        // Non-constant.
-        break;
-      }
+    bool RV = false;
+    while (NumConsecutiveStores > 1) {
+      LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+      unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+      unsigned FirstStoreAlign = FirstInChain->getAlignment();
+      unsigned LastLegalType = 0;
+      unsigned LastLegalVectorType = 0;
+      bool NonZero = false;
+      for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
+        StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
+        SDValue StoredVal = ST->getValue();
+
+        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal)) {
+          NonZero |= !C->isNullValue();
+        } else if (ConstantFPSDNode *C =
+                       dyn_cast<ConstantFPSDNode>(StoredVal)) {
+          NonZero |= !C->getConstantFPValue()->isNullValue();
+        } else {
+          // Non-constant.
+          break;
+        }
 
-      // Find a legal type for the constant store.
-      unsigned SizeInBits = (i+1) * ElementSizeBytes * 8;
-      EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
-      bool IsFast;
-      if (TLI.isTypeLegal(StoreTy) &&
-          TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
-                                 FirstStoreAlign, &IsFast) && IsFast) {
-        LastLegalType = i+1;
-      // Or check whether a truncstore is legal.
-      } else if (TLI.getTypeAction(Context, StoreTy) ==
-                 TargetLowering::TypePromoteInteger) {
-        EVT LegalizedStoredValueTy =
-          TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
-        if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
-            TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
-                                   FirstStoreAS, FirstStoreAlign, &IsFast) &&
+        // Find a legal type for the constant store.
+        unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
+        EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
+        bool IsFast = false;
+        if (TLI.isTypeLegal(StoreTy) &&
+            TLI.allowsMemoryAccess(Context, DL, StoreTy, FirstStoreAS,
+                                   FirstStoreAlign, &IsFast) &&
             IsFast) {
           LastLegalType = i + 1;
+          // Or check whether a truncstore is legal.
+        } else if (TLI.getTypeAction(Context, StoreTy) ==
+                   TargetLowering::TypePromoteInteger) {
+          EVT LegalizedStoredValueTy =
+              TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
+          if (TLI.isTruncStoreLegal(LegalizedStoredValueTy, StoreTy) &&
+              TLI.allowsMemoryAccess(Context, DL, LegalizedStoredValueTy,
+                                     FirstStoreAS, FirstStoreAlign, &IsFast) &&
+              IsFast) {
+            LastLegalType = i + 1;
+          }
         }
-      }
 
-      // We only use vectors if the constant is known to be zero or the target
-      // allows it and the function is not marked with the noimplicitfloat
-      // attribute.
-      if ((!NonZero || TLI.storeOfVectorConstantIsCheap(MemVT, i+1,
-                                                        FirstStoreAS)) &&
-          !NoVectors) {
-        // Find a legal type for the vector store.
-        EVT Ty = EVT::getVectorVT(Context, MemVT, i+1);
-        if (TLI.isTypeLegal(Ty) &&
-            TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
-                                   FirstStoreAlign, &IsFast) && IsFast)
-          LastLegalVectorType = i + 1;
+        // We only use vectors if the constant is known to be zero or the target
+        // allows it and the function is not marked with the noimplicitfloat
+        // attribute.
+        if ((!NonZero ||
+             TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&
+            !NoVectors) {
+          // Find a legal type for the vector store.
+          EVT Ty = EVT::getVectorVT(Context, MemVT, i + 1);
+          if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(Ty) &&
+              TLI.allowsMemoryAccess(Context, DL, Ty, FirstStoreAS,
+                                     FirstStoreAlign, &IsFast) &&
+              IsFast)
+            LastLegalVectorType = i + 1;
+        }
       }
-    }
 
-    // Check if we found a legal integer type to store.
-    if (LastLegalType == 0 && LastLegalVectorType == 0)
-      return false;
+      // Stop unless a legal integer or vector type merges at least two stores.
+      if (LastLegalType < 2 && LastLegalVectorType < 2)
+        break;
 
-    bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
-    unsigned NumElem = UseVector ? LastLegalVectorType : LastLegalType;
+      bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
+      unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;
 
-    return MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
-                                           true, UseVector);
+      bool Merged = MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem,
+                                                    true, UseVector);
+      if (!Merged)
+        break;
+      // Remove merged stores for next iteration.
+      StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+      RV = true;
+      NumConsecutiveStores -= NumElem;
+    }
+    return RV;
   }
 
   // When extracting multiple vector elements, try to store them
   // in one vector store rather than a sequence of scalar stores.
   if (IsExtractVecSrc) {
+    LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+    unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+    unsigned FirstStoreAlign = FirstInChain->getAlignment();
     unsigned NumStoresToMerge = 0;
     bool IsVec = MemVT.isVector();
-    for (unsigned i = 0; i < LastConsecutiveStore + 1; ++i) {
+    for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
       StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[i].MemNode);
       unsigned StoreValOpcode = St->getValue().getOpcode();
       // This restriction could be loosened.
@@ -12179,7 +12530,7 @@ bool DAGCombiner::MergeConsecutiveStores(
   // Find acceptable loads. Loads need to have the same chain (token factor),
   // must not be zext, volatile, indexed, and they must be consecutive.
   BaseIndexOffset LdBasePtr;
-  for (unsigned i=0; i<LastConsecutiveStore+1; ++i) {
+  for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
     StoreSDNode *St  = cast<StoreSDNode>(StoreNodes[i].MemNode);
     LoadSDNode *Ld = dyn_cast<LoadSDNode>(St->getValue());
     if (!Ld) break;
@@ -12212,7 +12563,7 @@ bool DAGCombiner::MergeConsecutiveStores(
     }
 
     // We found a potential memory operand to merge.
-    LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset, 0));
+    LoadNodes.push_back(MemOpLink(Ld, LdPtr.Offset));
   }
 
   if (LoadNodes.size() < 2)
@@ -12224,7 +12575,9 @@ bool DAGCombiner::MergeConsecutiveStores(
   if (LoadNodes.size() == 2 && TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
       St->getAlignment() >= RequiredAlignment)
     return false;
-
+  LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+  unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+  unsigned FirstStoreAlign = FirstInChain->getAlignment();
   LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
   unsigned FirstLoadAS = FirstLoad->getAddressSpace();
   unsigned FirstLoadAlign = FirstLoad->getAlignment();
@@ -12293,31 +12646,12 @@ bool DAGCombiner::MergeConsecutiveStores(
 
   // We add +1 here because the LastXXX variables refer to location while
   // the NumElem refers to array/index size.
-  unsigned NumElem = std::min(LastConsecutiveStore, LastConsecutiveLoad) + 1;
+  unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);
   NumElem = std::min(LastLegalType, NumElem);
 
   if (NumElem < 2)
     return false;
 
-  // Collect the chains from all merged stores.
-  SmallVector<SDValue, 8> MergeStoreChains;
-  MergeStoreChains.push_back(StoreNodes[0].MemNode->getChain());
-
-  // The latest Node in the DAG.
-  unsigned LatestNodeUsed = 0;
-  for (unsigned i=1; i<NumElem; ++i) {
-    // Find a chain for the new wide-store operand. Notice that some
-    // of the store nodes that we found may not be selected for inclusion
-    // in the wide store. The chain we use needs to be the chain of the
-    // latest store node which is *used* and replaced by the wide store.
-    if (StoreNodes[i].SequenceNum < StoreNodes[LatestNodeUsed].SequenceNum)
-      LatestNodeUsed = i;
-
-    MergeStoreChains.push_back(StoreNodes[i].MemNode->getChain());
-  }
-
-  LSBaseSDNode *LatestOp = StoreNodes[LatestNodeUsed].MemNode;
-
   // Find if it is better to use vectors or integers to load and store
   // to memory.
   EVT JointMemOpVT;
@@ -12337,8 +12671,9 @@ bool DAGCombiner::MergeConsecutiveStores(
                                 FirstLoad->getBasePtr(),
                                 FirstLoad->getPointerInfo(), FirstLoadAlign);
 
-  SDValue NewStoreChain =
-    DAG.getNode(ISD::TokenFactor, StoreDL, MVT::Other, MergeStoreChains);
+  SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
+
+  AddToWorklist(NewStoreChain.getNode());
 
   SDValue NewStore =
       DAG.getStore(NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
@@ -12351,25 +12686,9 @@ bool DAGCombiner::MergeConsecutiveStores(
                                   SDValue(NewLoad.getNode(), 1));
   }
 
-  if (UseAA) {
-    // Replace the all stores with the new store.
-    for (unsigned i = 0; i < NumElem; ++i)
-      CombineTo(StoreNodes[i].MemNode, NewStore);
-  } else {
-    // Replace the last store with the new store.
-    CombineTo(LatestOp, NewStore);
-    // Erase all other stores.
-    for (unsigned i = 0; i < NumElem; ++i) {
-      // Remove all Store nodes.
-      if (StoreNodes[i].MemNode == LatestOp)
-        continue;
-      StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
-      DAG.ReplaceAllUsesOfValueWith(SDValue(St, 0), St->getChain());
-      deleteAndRecombine(St);
-    }
-  }
-
-  StoreNodes.erase(StoreNodes.begin() + NumElem, StoreNodes.end());
+  // Replace all the stores with the new store.
+  for (unsigned i = 0; i < NumElem; ++i)
+    CombineTo(StoreNodes[i].MemNode, NewStore);
   return true;
 }
 
@@ -12526,19 +12845,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
   if (SDValue NewST = TransformFPLoadStorePair(N))
     return NewST;
 
-  bool UseAA = CombinerAA.getNumOccurrences() > 0 ? CombinerAA
-                                                  : DAG.getSubtarget().useAA();
-#ifndef NDEBUG
-  if (CombinerAAOnlyFunc.getNumOccurrences() &&
-      CombinerAAOnlyFunc != DAG.getMachineFunction().getName())
-    UseAA = false;
-#endif
-  if (UseAA && ST->isUnindexed()) {
-    // FIXME: We should do this even without AA enabled. AA will just allow
-    // FindBetterChain to work in more situations. The problem with this is that
-    // any combine that expects memory operations to be on consecutive chains
-    // first needs to be updated to look for users of the same chain.
-
+  if (ST->isUnindexed()) {
     // Walk up chain skipping non-aliasing memory nodes, on this store and any
     // adjacent stores.
     if (findBetterNeighborChains(ST)) {
@@ -12572,8 +12879,15 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
     if (SimplifyDemandedBits(
             Value,
             APInt::getLowBitsSet(Value.getScalarValueSizeInBits(),
-                                 ST->getMemoryVT().getScalarSizeInBits())))
+                                 ST->getMemoryVT().getScalarSizeInBits()))) {
+      // Re-visit the store if anything changed and the store hasn't been merged
+      // with another node (in which case N is deleted). SimplifyDemandedBits
+      // will add Value's node back to the worklist if necessary, but we also
+      // need to re-visit the Store node itself.
+      if (N->getOpcode() != ISD::DELETED_NODE)
+        AddToWorklist(N);
       return SDValue(N, 0);
+    }
   }
 
   // If this is a load followed by a store to the same location, then the store
@@ -12617,15 +12931,12 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
       // There can be multiple store sequences on the same chain.
       // Keep trying to merge store sequences until we are unable to do so
       // or until we merge the last store on the chain.
-      SmallVector<MemOpLink, 8> StoreNodes;
-      bool Changed = MergeConsecutiveStores(ST, StoreNodes);
+      bool Changed = MergeConsecutiveStores(ST);
       if (!Changed) break;
-
-      if (any_of(StoreNodes,
-                 [ST](const MemOpLink &Link) { return Link.MemNode == ST; })) {
-        // ST has been merged and no longer exists.
+      // Return N, as the merge only uses CombineTo and no worklist
+      // cleanup is necessary.
+      if (N->getOpcode() == ISD::DELETED_NODE || !isa<StoreSDNode>(N))
         return SDValue(N, 0);
-      }
     }
   }
 
@@ -12634,7 +12945,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
   // Make sure to do this only after attempting to merge stores in order to
   //  avoid changing the types of some subset of stores due to visit order,
   //  preventing their merging.
-  if (isa<ConstantFPSDNode>(Value)) {
+  if (isa<ConstantFPSDNode>(ST->getValue())) {
     if (SDValue NewSt = replaceStoreOfFPConstant(ST))
       return NewSt;
   }
@@ -12763,10 +13074,6 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
 
   EVT VT = InVec.getValueType();
 
-  // If we can't generate a legal BUILD_VECTOR, exit
-  if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
-    return SDValue();
-
   // Check that we know which element is being inserted
   if (!isa<ConstantSDNode>(EltNo))
     return SDValue();
@@ -12793,6 +13100,10 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
     }
   }
 
+  // If we can't generate a legal BUILD_VECTOR, exit
+  if (LegalOperations && !TLI.isOperationLegal(ISD::BUILD_VECTOR, VT))
+    return SDValue();
+
   // Check that the operand is a BUILD_VECTOR (or UNDEF, which can essentially
   // be converted to a BUILD_VECTOR).  Fill in the Ops vector with the
   // vector elements.
@@ -12814,11 +13125,7 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
     // All the operands of BUILD_VECTOR must have the same type;
     // we enforce that here.
     EVT OpVT = Ops[0].getValueType();
-    if (InVal.getValueType() != OpVT)
-      InVal = OpVT.bitsGT(InVal.getValueType()) ?
-                DAG.getNode(ISD::ANY_EXTEND, DL, OpVT, InVal) :
-                DAG.getNode(ISD::TRUNCATE, DL, OpVT, InVal);
-    Ops[Elt] = InVal;
+    Ops[Elt] = OpVT.isInteger() ? DAG.getAnyExtOrTrunc(InVal, DL, OpVT) : InVal;
   }
 
   // Return the new vector
@@ -12914,6 +13221,9 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
   EVT VT = InVec.getValueType();
   EVT NVT = N->getValueType(0);
 
+  if (InVec.isUndef())
+    return DAG.getUNDEF(NVT);
+
   if (InVec.getOpcode() == ISD::SCALAR_TO_VECTOR) {
     // Check if the result type doesn't match the inserted element type. A
     // SCALAR_TO_VECTOR may truncate the inserted element and the
@@ -13347,9 +13657,15 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
             !TLI.isOperationLegal(ISD::VECTOR_SHUFFLE, InVT1))
           return SDValue();
 
-        if (InVT1 != InVT2)
+        // Legalizing INSERT_SUBVECTOR is tricky - you basically have to
+        // lower it back into a BUILD_VECTOR. So if the inserted type is
+        // illegal, don't even try.
+        if (InVT1 != InVT2) {
+          if (!TLI.isTypeLegal(InVT2))
+            return SDValue();
           VecIn2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, InVT1,
                                DAG.getUNDEF(InVT1), VecIn2, ZeroIdx);
+        }
         ShuffleNumElems = NumElems * 2;
       } else {
         // Both VecIn1 and VecIn2 are wider than the output, and VecIn2 is wider
@@ -13569,6 +13885,35 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
   if (ISD::allOperandsUndef(N))
     return DAG.getUNDEF(VT);
 
+  // Check if we can express BUILD VECTOR via subvector extract.
+  if (!LegalTypes && (N->getNumOperands() > 1)) {
+    SDValue Op0 = N->getOperand(0);
+    auto checkElem = [&](SDValue Op) -> uint64_t {
+      if ((Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) &&
+          (Op0.getOperand(0) == Op.getOperand(0)))
+        if (auto CNode = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+          return CNode->getZExtValue();
+      return -1;
+    };
+
+    int Offset = checkElem(Op0);
+    for (unsigned i = 0; i < N->getNumOperands(); ++i) {
+      if (Offset + i != checkElem(N->getOperand(i))) {
+        Offset = -1;
+        break;
+      }
+    }
+
+    if ((Offset == 0) &&
+        (Op0.getOperand(0).getValueType() == N->getValueType(0)))
+      return Op0.getOperand(0);
+    if ((Offset != -1) &&
+        ((Offset % N->getValueType(0).getVectorNumElements()) ==
+         0)) // IDX must be a multiple of the output size.
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), N->getValueType(0),
+                         Op0.getOperand(0), Op0.getOperand(1));
+  }
+
   if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
     return V;
 
@@ -13760,8 +14105,11 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
       if (!SclTy.isFloatingPoint() && !SclTy.isInteger())
         return SDValue();
 
-      EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy,
-                                 VT.getSizeInBits() / SclTy.getSizeInBits());
+      unsigned VNTNumElms = VT.getSizeInBits() / SclTy.getSizeInBits();
+      if (VNTNumElms < 2)
+        return SDValue();
+
+      EVT NVT = EVT::getVectorVT(*DAG.getContext(), SclTy, VNTNumElms);
       if (!TLI.isTypeLegal(NVT) || !TLI.isTypeLegal(Scalar.getValueType()))
         return SDValue();
 
@@ -13906,13 +14254,12 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode* N) {
 
   if (V->getOpcode() == ISD::INSERT_SUBVECTOR) {
     // Handle only simple case where vector being inserted and vector
-    // being extracted are of same type, and are half size of larger vectors.
-    EVT BigVT = V->getOperand(0).getValueType();
+    // being extracted are of same size.
     EVT SmallVT = V->getOperand(1).getValueType();
-    if (!NVT.bitsEq(SmallVT) || NVT.getSizeInBits()*2 != BigVT.getSizeInBits())
+    if (!NVT.bitsEq(SmallVT))
       return SDValue();
 
-    // Only handle cases where both indexes are constants with the same type.
+    // Only handle cases where both indexes are constants.
     ConstantSDNode *ExtIdx = dyn_cast<ConstantSDNode>(N->getOperand(1));
     ConstantSDNode *InsIdx = dyn_cast<ConstantSDNode>(V->getOperand(2));
 
@@ -14163,6 +14510,113 @@ static SDValue combineShuffleOfScalars(ShuffleVectorSDNode *SVN,
   return DAG.getBuildVector(VT, SDLoc(SVN), Ops);
 }
 
+// Match shuffles that can be converted to any_extend_vector_inreg.
+// This is often generated during legalization.
+// e.g. v4i32 <0,u,1,u> -> (v2i64 any_extend_vector_inreg(v4i32 src))
+// TODO Add support for ZERO_EXTEND_VECTOR_INREG when we have a test case.
+SDValue combineShuffleToVectorExtend(ShuffleVectorSDNode *SVN,
+                                     SelectionDAG &DAG,
+                                     const TargetLowering &TLI,
+                                     bool LegalOperations) {
+  EVT VT = SVN->getValueType(0);
+  bool IsBigEndian = DAG.getDataLayout().isBigEndian();
+
+  // TODO Add support for big-endian when we have a test case.
+  if (!VT.isInteger() || IsBigEndian)
+    return SDValue();
+
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();
+  ArrayRef<int> Mask = SVN->getMask();
+  SDValue N0 = SVN->getOperand(0);
+
+  // shuffle<0,-1,1,-1> == (v2i64 any_extend_vector_inreg(v4i32))
+  auto isAnyExtend = [&Mask, &NumElts](unsigned Scale) {
+    for (unsigned i = 0; i != NumElts; ++i) {
+      if (Mask[i] < 0)
+        continue;
+      if ((i % Scale) == 0 && Mask[i] == (int)(i / Scale))
+        continue;
+      return false;
+    }
+    return true;
+  };
+
+  // Attempt to match a '*_extend_vector_inreg' shuffle; we just search for
+  // power-of-2 extensions as they are the most likely.
+  for (unsigned Scale = 2; Scale < NumElts; Scale *= 2) {
+    if (!isAnyExtend(Scale))
+      continue;
+
+    EVT OutSVT = EVT::getIntegerVT(*DAG.getContext(), EltSizeInBits * Scale);
+    EVT OutVT = EVT::getVectorVT(*DAG.getContext(), OutSVT, NumElts / Scale);
+    if (!LegalOperations ||
+        TLI.isOperationLegalOrCustom(ISD::ANY_EXTEND_VECTOR_INREG, OutVT))
+      return DAG.getBitcast(VT,
+                            DAG.getAnyExtendVectorInReg(N0, SDLoc(SVN), OutVT));
+  }
+
+  return SDValue();
+}
+
+// Detect 'truncate_vector_inreg' style shuffles that pack the lower parts of
+// each source element of a large type into the lowest elements of a smaller
+// destination type. This is often generated during legalization.
+// If the source node itself was a '*_extend_vector_inreg' node then we should
+// be able to remove it.
+SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, SelectionDAG &DAG) {
+  EVT VT = SVN->getValueType(0);
+  bool IsBigEndian = DAG.getDataLayout().isBigEndian();
+
+  // TODO Add support for big-endian when we have a test case.
+  if (!VT.isInteger() || IsBigEndian)
+    return SDValue();
+
+  SDValue N0 = SVN->getOperand(0);
+  while (N0.getOpcode() == ISD::BITCAST)
+    N0 = N0.getOperand(0);
+
+  unsigned Opcode = N0.getOpcode();
+  if (Opcode != ISD::ANY_EXTEND_VECTOR_INREG &&
+      Opcode != ISD::SIGN_EXTEND_VECTOR_INREG &&
+      Opcode != ISD::ZERO_EXTEND_VECTOR_INREG)
+    return SDValue();
+
+  SDValue N00 = N0.getOperand(0);
+  ArrayRef<int> Mask = SVN->getMask();
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned EltSizeInBits = VT.getScalarSizeInBits();
+  unsigned ExtSrcSizeInBits = N00.getScalarValueSizeInBits();
+
+  // (v4i32 truncate_vector_inreg(v2i64)) == shuffle<0,2,-1,-1>
+  // (v8i16 truncate_vector_inreg(v4i32)) == shuffle<0,2,4,6,-1,-1,-1,-1>
+  // (v8i16 truncate_vector_inreg(v2i64)) == shuffle<0,4,-1,-1,-1,-1,-1,-1>
+  auto isTruncate = [&Mask, &NumElts](unsigned Scale) {
+    for (unsigned i = 0; i != NumElts; ++i) {
+      if (Mask[i] < 0)
+        continue;
+      if ((i * Scale) < NumElts && Mask[i] == (int)(i * Scale))
+        continue;
+      return false;
+    }
+    return true;
+  };
+
+  // At the moment we just handle the case where we've truncated back to the
+  // same size as before the extension.
+  // TODO: handle more extension/truncation cases as cases arise.
+  if (EltSizeInBits != ExtSrcSizeInBits)
+    return SDValue();
+
+  // Attempt to match a 'truncate_vector_inreg' shuffle; we just search for
+  // power-of-2 truncations as they are the most likely.
+  for (unsigned Scale = 2; Scale < NumElts; Scale *= 2)
+    if (isTruncate(Scale))
+      return DAG.getBitcast(VT, N00);
+
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
   EVT VT = N->getValueType(0);
   unsigned NumElts = VT.getVectorNumElements();
@@ -14267,6 +14721,14 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
   if (SDValue S = simplifyShuffleOperands(SVN, N0, N1, DAG))
     return S;
 
+  // Match shuffles that can be converted to any_extend_vector_inreg.
+  if (SDValue V = combineShuffleToVectorExtend(SVN, DAG, TLI, LegalOperations))
+    return V;
+
+  // Combine "truncate_vector_inreg" style shuffles.
+  if (SDValue V = combineTruncationShuffle(SVN, DAG))
+    return V;
+
   if (N0.getOpcode() == ISD::CONCAT_VECTORS &&
       Level < AfterLegalizeVectorOps &&
       (N1.isUndef() ||
@@ -14528,6 +14990,12 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
   if (N1.isUndef())
     return N0;
 
+  // If this is an insert of an extracted vector into an undef vector, we can
+  // just use the input to the extract.
+  if (N0.isUndef() && N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      N1.getOperand(1) == N2 && N1.getOperand(0).getValueType() == VT)
+    return N1.getOperand(0);
+
   // Combine INSERT_SUBVECTORs where we are inserting to the same index.
   // INSERT_SUBVECTOR( INSERT_SUBVECTOR( Vec, SubOld, Idx ), SubNew, Idx )
   // --> INSERT_SUBVECTOR( Vec, SubNew, Idx )
@@ -14537,26 +15005,39 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
     return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0.getOperand(0),
                        N1, N2);
 
-  if (N0.getValueType() != N1.getValueType())
+  if (!isa<ConstantSDNode>(N2))
     return SDValue();
 
+  unsigned InsIdx = cast<ConstantSDNode>(N2)->getZExtValue();
+
+  // Canonicalize insert_subvector dag nodes.
+  // Example (when Idx1 < Idx0):
+  // (insert_subvector (insert_subvector A, Idx0), Idx1)
+  // -> (insert_subvector (insert_subvector A, Idx1), Idx0)
+  if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() &&
+      N1.getValueType() == N0.getOperand(1).getValueType() &&
+      isa<ConstantSDNode>(N0.getOperand(2))) {
+    unsigned OtherIdx = cast<ConstantSDNode>(N0.getOperand(2))->getZExtValue();
+    if (InsIdx < OtherIdx) {
+      // Swap nodes.
+      SDValue NewOp = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT,
+                                  N0.getOperand(0), N1, N2);
+      AddToWorklist(NewOp.getNode());
+      return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N0.getNode()),
+                         VT, NewOp, N0.getOperand(1), N0.getOperand(2));
+    }
+  }
+
   // If the input vector is a concatenation, and the insert replaces
-  // one of the halves, we can optimize into a single concat_vectors.
-  if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0->getNumOperands() == 2 &&
-      isa<ConstantSDNode>(N2)) {
-    unsigned InsIdx = cast<ConstantSDNode>(N2)->getZExtValue();
+  // one of the pieces, we can optimize into a single concat_vectors.
+  if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() &&
+      N0.getOperand(0).getValueType() == N1.getValueType()) {
+    unsigned Factor = N1.getValueType().getVectorNumElements();
 
-    // Lower half: fold (insert_subvector (concat_vectors X, Y), Z) ->
-    // (concat_vectors Z, Y)
-    if (InsIdx == 0)
-      return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N1,
-                         N0.getOperand(1));
+    SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end());
+    Ops[cast<ConstantSDNode>(N2)->getZExtValue() / Factor] = N1;
 
-    // Upper half: fold (insert_subvector (concat_vectors X, Y), Z) ->
-    // (concat_vectors X, Z)
-    if (InsIdx == VT.getVectorNumElements() / 2)
-      return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, N0.getOperand(0),
-                         N1);
+    return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
   }
 
   return SDValue();
@@ -15532,7 +16013,7 @@ static bool FindBaseOffset(SDValue Ptr, SDValue &Base, int64_t &Offset,
   if (Base.getOpcode() == ISD::ADD) {
     if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Base.getOperand(1))) {
       Base = Base.getOperand(0);
-      Offset += C->getZExtValue();
+      Offset += C->getSExtValue();
     }
   }
 
@@ -15729,6 +16210,12 @@ void DAGCombiner::GatherAllAliases(SDNode *N, SDValue OriginalChain,
       ++Depth;
       break;
 
+    case ISD::CopyFromReg:
+      // Forward past CopyFromReg.
+      Chains.push_back(Chain.getOperand(0));
+      ++Depth;
+      break;
+
     default:
       // For all other instructions we will just have to take what we can get.
       Aliases.push_back(Chain);
@@ -15757,6 +16244,18 @@ SDValue DAGCombiner::FindBetterChain(SDNode *N, SDValue OldChain) {
   return DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Aliases);
 }
 
+// This function tries to collect a bunch of potentially interesting
+// nodes to improve the chains of, all at once. This might seem
+// redundant, as this function gets called when visiting every store
+// node, so why not let the work be done on each store as it's visited?
+//
+// I believe this is mainly important because MergeConsecutiveStores
+// is unable to deal with merging stores of different sizes, so unless
+// we improve the chains of all the potential candidates up-front
+// before running MergeConsecutiveStores, it might only see some of
+// the nodes that will eventually be candidates, and then not be able
+// to go from a partially-merged state to the desired final
+// fully-merged state.
 bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
   // This holds the base pointer, index, and the offset in bytes from the base
   // pointer.
@@ -15792,10 +16291,8 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
     if (!Ptr.equalBaseIndex(BasePtr))
       break;
 
-    // Find the next memory operand in the chain. If the next operand in the
-    // chain is a store then move up and continue the scan with the next
-    // memory operand. If the next operand is a load save it and use alias
-    // information to check if it interferes with anything.
+    // Walk up the chain to find the next store node, ignoring any
+    // intermediate loads. Any other kind of node will halt the loop.
     SDNode *NextInChain = Index->getChain().getNode();
     while (true) {
       if (StoreSDNode *STn = dyn_cast<StoreSDNode>(NextInChain)) {
@@ -15814,9 +16311,14 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) {
         Index = nullptr;
         break;
       }
-    }
+    } // end while
   }
 
+  // At this point, ChainedStores lists all of the Store nodes
+  // reachable by iterating up through chain nodes matching the above
+  // conditions.  For each such store identified, try to find an
+  // earlier chain to attach the store to which won't violate the
+  // required ordering.
   bool MadeChangeToSt = false;
   SmallVector<std::pair<StoreSDNode *, SDValue>, 8> BetterChains;
 
diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp
index f30f57d0c1e92c17cae257bd30aa812c1c265403..4f6290b751bf0fb40ef2228115eb9e3c7f98c2b3 100644
--- a/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -1,4 +1,4 @@
-//===-- FastISel.cpp - Implementation of the FastISel class ---------------===//
+//===- FastISel.cpp - Implementation of the FastISel class ----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -39,35 +39,76 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APSInt.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
-#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/FastISel.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
-#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Mangler.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <utility>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "isel"
@@ -78,21 +119,6 @@ STATISTIC(NumFastIselSuccessTarget, "Number of insts selected by "
                                     "target-specific selector");
 STATISTIC(NumFastIselDead, "Number of dead insts removed on failure");
 
-void FastISel::ArgListEntry::setAttributes(ImmutableCallSite *CS,
-                                           unsigned AttrIdx) {
-  IsSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt);
-  IsZExt = CS->paramHasAttr(AttrIdx, Attribute::ZExt);
-  IsInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg);
-  IsSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet);
-  IsNest = CS->paramHasAttr(AttrIdx, Attribute::Nest);
-  IsByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal);
-  IsInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca);
-  IsReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned);
-  IsSwiftSelf = CS->paramHasAttr(AttrIdx, Attribute::SwiftSelf);
-  IsSwiftError = CS->paramHasAttr(AttrIdx, Attribute::SwiftError);
-  Alignment = CS->getParamAlignment(AttrIdx);
-}
-
 /// Set the current block to which generated machine instructions will be
 /// appended, and clear the local CSE map.
 void FastISel::startNewBlock() {
@@ -231,17 +257,13 @@ unsigned FastISel::materializeConstant(const Value *V, MVT VT) {
       // Try to emit the constant by using an integer constant with a cast.
       const APFloat &Flt = CF->getValueAPF();
       EVT IntVT = TLI.getPointerTy(DL);
-
-      uint64_t x[2];
       uint32_t IntBitWidth = IntVT.getSizeInBits();
+      APSInt SIntVal(IntBitWidth, /*isUnsigned=*/false);
       bool isExact;
-      (void)Flt.convertToInteger(x, IntBitWidth, /*isSigned=*/true,
-                                 APFloat::rmTowardZero, &isExact);
+      (void)Flt.convertToInteger(SIntVal, APFloat::rmTowardZero, &isExact);
       if (isExact) {
-        APInt IntVal(IntBitWidth, x);
-
         unsigned IntegerReg =
-            getRegForValue(ConstantInt::get(V->getContext(), IntVal));
+            getRegForValue(ConstantInt::get(V->getContext(), SIntVal));
         if (IntegerReg != 0)
           Reg = fastEmit_r(IntVT.getSimpleVT(), VT, ISD::SINT_TO_FP, IntegerReg,
                            /*Kill=*/false);
@@ -841,9 +863,9 @@ bool FastISel::selectPatchpoint(const CallInst *I) {
   return true;
 }
 
-/// Returns an AttributeSet representing the attributes applied to the return
+/// Returns an AttributeList representing the attributes applied to the return
 /// value of the given call.
-static AttributeSet getReturnAttrs(FastISel::CallLoweringInfo &CLI) {
+static AttributeList getReturnAttrs(FastISel::CallLoweringInfo &CLI) {
   SmallVector<Attribute::AttrKind, 2> Attrs;
   if (CLI.RetSExt)
     Attrs.push_back(Attribute::SExt);
@@ -852,8 +874,8 @@ static AttributeSet getReturnAttrs(FastISel::CallLoweringInfo &CLI) {
   if (CLI.IsInReg)
     Attrs.push_back(Attribute::InReg);
 
-  return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex,
-                           Attrs);
+  return AttributeList::get(CLI.RetTy->getContext(), AttributeList::ReturnIndex,
+                            Attrs);
 }
 
 bool FastISel::lowerCallTo(const CallInst *CI, const char *SymName,
@@ -888,6 +910,7 @@ bool FastISel::lowerCallTo(const CallInst *CI, MCSymbol *Symbol,
     Entry.setAttributes(&CS, ArgI + 1);
     Args.push_back(Entry);
   }
+  TLI.markLibCallAttributes(MF, CS.getCallingConv(), Args);
 
   CallLoweringInfo CLI;
   CLI.setCallee(RetTy, FTy, Symbol, std::move(Args), CS, NumArgs);
@@ -1665,7 +1688,7 @@ FastISel::FastISel(FunctionLoweringInfo &FuncInfo,
       TRI(*MF->getSubtarget().getRegisterInfo()), LibInfo(LibInfo),
       SkipTargetIndependentISel(SkipTargetIndependentISel) {}
 
-FastISel::~FastISel() {}
+FastISel::~FastISel() = default;
 
 bool FastISel::fastLowerArguments() { return false; }
 
diff --git a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index 3b91e58879b49e6d09ebc55c4be0ead8174d2560..e85d1951e3aed9956b0245e6b3f7560bc15e38b8 100644
--- a/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -235,7 +235,6 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
 
     if (II.OpInfo[i].isOptionalDef()) {
       // Optional def must be a physical register.
-      unsigned NumResults = CountResults(Node);
       VRBase = cast<RegisterSDNode>(Node->getOperand(i-NumResults))->getReg();
       assert(TargetRegisterInfo::isPhysicalRegister(VRBase));
       MIB.addReg(VRBase, RegState::Define);
@@ -502,8 +501,17 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
     const TargetRegisterClass *TRC =
       TLI->getRegClassFor(Node->getSimpleValueType(0));
 
-    unsigned VReg = getVR(Node->getOperand(0), VRBaseMap);
-    MachineInstr *DefMI = MRI->getVRegDef(VReg);
+    unsigned Reg;
+    MachineInstr *DefMI;
+    RegisterSDNode *R = dyn_cast<RegisterSDNode>(Node->getOperand(0));
+    if (R && TargetRegisterInfo::isPhysicalRegister(R->getReg())) {
+      Reg = R->getReg();
+      DefMI = nullptr;
+    } else {
+      Reg = getVR(Node->getOperand(0), VRBaseMap);
+      DefMI = MRI->getVRegDef(Reg);
+    }
+
     unsigned SrcReg, DstReg, DefSubIdx;
     if (DefMI &&
         TII->isCoalescableExtInstr(*DefMI, SrcReg, DstReg, DefSubIdx) &&
@@ -519,20 +527,26 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
               TII->get(TargetOpcode::COPY), VRBase).addReg(SrcReg);
       MRI->clearKillFlags(SrcReg);
     } else {
-      // VReg may not support a SubIdx sub-register, and we may need to
+      // Reg may not support a SubIdx sub-register, and we may need to
       // constrain its register class or issue a COPY to a compatible register
       // class.
-      VReg = ConstrainForSubReg(VReg, SubIdx,
-                                Node->getOperand(0).getSimpleValueType(),
-                                Node->getDebugLoc());
+      if (TargetRegisterInfo::isVirtualRegister(Reg))
+        Reg = ConstrainForSubReg(Reg, SubIdx,
+                                 Node->getOperand(0).getSimpleValueType(),
+                                 Node->getDebugLoc());
 
       // Create the destreg if it is missing.
       if (VRBase == 0)
         VRBase = MRI->createVirtualRegister(TRC);
 
       // Create the extract_subreg machine instruction.
-      BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
-              TII->get(TargetOpcode::COPY), VRBase).addReg(VReg, 0, SubIdx);
+      MachineInstrBuilder CopyMI =
+          BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
+                  TII->get(TargetOpcode::COPY), VRBase);
+      if (TargetRegisterInfo::isVirtualRegister(Reg))
+        CopyMI.addReg(Reg, 0, SubIdx);
+      else
+        CopyMI.addReg(TRI->getSubReg(Reg, SubIdx));
     }
   } else if (Opc == TargetOpcode::INSERT_SUBREG ||
              Opc == TargetOpcode::SUBREG_TO_REG) {
diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index ffecd04ed7c464207ac3f6fd13ba68c9e18aa8aa..b7378b360211f1c9ed7226f144cc258c0b272c33 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -1909,8 +1909,8 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
     Entry.Node = Op;
     Entry.Ty = ArgTy;
-    Entry.isSExt = isSigned;
-    Entry.isZExt = !isSigned;
+    Entry.IsSExt = isSigned;
+    Entry.IsZExt = !isSigned;
     Args.push_back(Entry);
   }
   SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
@@ -1935,9 +1935,13 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, SDNode *Node,
     InChain = TCChain;
 
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
-    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
-    .setTailCall(isTailCall).setSExtResult(isSigned).setZExtResult(!isSigned);
+  CLI.setDebugLoc(SDLoc(Node))
+      .setChain(InChain)
+      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
+                    std::move(Args))
+      .setTailCall(isTailCall)
+      .setSExtResult(isSigned)
+      .setZExtResult(!isSigned);
 
   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
 
@@ -1960,8 +1964,8 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
   for (unsigned i = 0; i != NumOps; ++i) {
     Entry.Node = Ops[i];
     Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
-    Entry.isSExt = isSigned;
-    Entry.isZExt = !isSigned;
+    Entry.IsSExt = isSigned;
+    Entry.IsZExt = !isSigned;
     Args.push_back(Entry);
   }
   SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
@@ -1970,9 +1974,12 @@ SDValue SelectionDAGLegalize::ExpandLibCall(RTLIB::Libcall LC, EVT RetVT,
   Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
 
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
-    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
-    .setSExtResult(isSigned).setZExtResult(!isSigned);
+  CLI.setDebugLoc(dl)
+      .setChain(DAG.getEntryNode())
+      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
+                    std::move(Args))
+      .setSExtResult(isSigned)
+      .setZExtResult(!isSigned);
 
   std::pair<SDValue,SDValue> CallInfo = TLI.LowerCallTo(CLI);
 
@@ -1994,8 +2001,8 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
     Entry.Node = Node->getOperand(i);
     Entry.Ty = ArgTy;
-    Entry.isSExt = isSigned;
-    Entry.isZExt = !isSigned;
+    Entry.IsSExt = isSigned;
+    Entry.IsZExt = !isSigned;
     Args.push_back(Entry);
   }
   SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
@@ -2004,9 +2011,12 @@ SelectionDAGLegalize::ExpandChainLibCall(RTLIB::Libcall LC,
   Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
 
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
-    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
-    .setSExtResult(isSigned).setZExtResult(!isSigned);
+  CLI.setDebugLoc(SDLoc(Node))
+      .setChain(InChain)
+      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
+                    std::move(Args))
+      .setSExtResult(isSigned)
+      .setZExtResult(!isSigned);
 
   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
 
@@ -2081,8 +2091,8 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
     Entry.Node = Op;
     Entry.Ty = ArgTy;
-    Entry.isSExt = isSigned;
-    Entry.isZExt = !isSigned;
+    Entry.IsSExt = isSigned;
+    Entry.IsZExt = !isSigned;
     Args.push_back(Entry);
   }
 
@@ -2090,8 +2100,8 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
   SDValue FIPtr = DAG.CreateStackTemporary(RetVT);
   Entry.Node = FIPtr;
   Entry.Ty = RetTy->getPointerTo();
-  Entry.isSExt = isSigned;
-  Entry.isZExt = !isSigned;
+  Entry.IsSExt = isSigned;
+  Entry.IsZExt = !isSigned;
   Args.push_back(Entry);
 
   SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
@@ -2099,9 +2109,12 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node,
 
   SDLoc dl(Node);
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(dl).setChain(InChain)
-    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
-    .setSExtResult(isSigned).setZExtResult(!isSigned);
+  CLI.setDebugLoc(dl)
+      .setChain(InChain)
+      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
+                    std::move(Args))
+      .setSExtResult(isSigned)
+      .setZExtResult(!isSigned);
 
   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
 
@@ -2185,24 +2198,24 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
   // Pass the argument.
   Entry.Node = Node->getOperand(0);
   Entry.Ty = RetTy;
-  Entry.isSExt = false;
-  Entry.isZExt = false;
+  Entry.IsSExt = false;
+  Entry.IsZExt = false;
   Args.push_back(Entry);
 
   // Pass the return address of sin.
   SDValue SinPtr = DAG.CreateStackTemporary(RetVT);
   Entry.Node = SinPtr;
   Entry.Ty = RetTy->getPointerTo();
-  Entry.isSExt = false;
-  Entry.isZExt = false;
+  Entry.IsSExt = false;
+  Entry.IsZExt = false;
   Args.push_back(Entry);
 
   // Also pass the return address of the cos.
   SDValue CosPtr = DAG.CreateStackTemporary(RetVT);
   Entry.Node = CosPtr;
   Entry.Ty = RetTy->getPointerTo();
-  Entry.isSExt = false;
-  Entry.isZExt = false;
+  Entry.IsSExt = false;
+  Entry.IsZExt = false;
   Args.push_back(Entry);
 
   SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
@@ -2210,9 +2223,9 @@ SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node,
 
   SDLoc dl(Node);
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(dl).setChain(InChain)
-    .setCallee(TLI.getLibcallCallingConv(LC),
-               Type::getVoidTy(*DAG.getContext()), Callee, std::move(Args));
+  CLI.setDebugLoc(dl).setChain(InChain).setLibCallee(
+      TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee,
+      std::move(Args));
 
   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
 
@@ -2529,12 +2542,12 @@ SDValue SelectionDAGLegalize::ExpandBITREVERSE(SDValue Op, const SDLoc &dl) {
     APInt MaskHi4(Sz, 0), MaskHi2(Sz, 0), MaskHi1(Sz, 0);
     APInt MaskLo4(Sz, 0), MaskLo2(Sz, 0), MaskLo1(Sz, 0);
     for (unsigned J = 0; J != Sz; J += 8) {
-      MaskHi4 = MaskHi4.Or(APInt(Sz, 0xF0ull << J));
-      MaskLo4 = MaskLo4.Or(APInt(Sz, 0x0Full << J));
-      MaskHi2 = MaskHi2.Or(APInt(Sz, 0xCCull << J));
-      MaskLo2 = MaskLo2.Or(APInt(Sz, 0x33ull << J));
-      MaskHi1 = MaskHi1.Or(APInt(Sz, 0xAAull << J));
-      MaskLo1 = MaskLo1.Or(APInt(Sz, 0x55ull << J));
+      MaskHi4 = MaskHi4 | (0xF0ull << J);
+      MaskLo4 = MaskLo4 | (0x0Full << J);
+      MaskHi2 = MaskHi2 | (0xCCull << J);
+      MaskLo2 = MaskLo2 | (0x33ull << J);
+      MaskHi1 = MaskHi1 | (0xAAull << J);
+      MaskLo1 = MaskLo1 | (0x55ull << J);
     }
 
     // BSWAP if the type is wider than a single byte.
@@ -3830,10 +3843,11 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
     TargetLowering::CallLoweringInfo CLI(DAG);
     CLI.setDebugLoc(dl)
         .setChain(Node->getOperand(0))
-        .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
-                   DAG.getExternalSymbol("__sync_synchronize",
-                                         TLI.getPointerTy(DAG.getDataLayout())),
-                   std::move(Args));
+        .setLibCallee(
+            CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+            DAG.getExternalSymbol("__sync_synchronize",
+                                  TLI.getPointerTy(DAG.getDataLayout())),
+            std::move(Args));
 
     std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
 
@@ -3870,10 +3884,10 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
     TargetLowering::CallLoweringInfo CLI(DAG);
     CLI.setDebugLoc(dl)
         .setChain(Node->getOperand(0))
-        .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
-                   DAG.getExternalSymbol("abort",
-                                         TLI.getPointerTy(DAG.getDataLayout())),
-                   std::move(Args));
+        .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+                      DAG.getExternalSymbol(
+                          "abort", TLI.getPointerTy(DAG.getDataLayout())),
+                      std::move(Args));
     std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
 
     Results.push_back(CallResult.second);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index 72b56d84d94523f96364324e15278a7bdce4fdc5..6f2b1b94ce465069e8cbb2477be2c47055792171 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -459,7 +459,7 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
   if (Op.getValueType() == MVT::f16 && N->getValueType(0) != MVT::f32) {
     Op = DAG.getNode(ISD::FP_EXTEND, SDLoc(N), MVT::f32, Op);
     if (getTypeAction(MVT::f32) == TargetLowering::TypeSoftenFloat)
-      SoftenFloatResult(Op.getNode(), 0);
+      AddToWorklist(Op.getNode());
   }
 
   if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat) {
@@ -472,8 +472,6 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FP_EXTEND(SDNode *N) {
   }
 
   RTLIB::Libcall LC = RTLIB::getFPEXT(Op.getValueType(), N->getValueType(0));
-  if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftenFloat)
-    Op = GetSoftenedFloat(Op);
   assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unsupported FP_EXTEND!");
   return TLI.makeLibCall(DAG, LC, NVT, Op, false, SDLoc(N)).first;
 }
@@ -1054,15 +1052,15 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
 void DAGTypeLegalizer::ExpandFloatRes_ConstantFP(SDNode *N, SDValue &Lo,
                                                  SDValue &Hi) {
   EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
-  assert(NVT.getSizeInBits() == integerPartWidth &&
+  assert(NVT.getSizeInBits() == 64 &&
          "Do not know how to expand this float constant!");
   APInt C = cast<ConstantFPSDNode>(N)->getValueAPF().bitcastToAPInt();
   SDLoc dl(N);
   Lo = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
-                                 APInt(integerPartWidth, C.getRawData()[1])),
+                                 APInt(64, C.getRawData()[1])),
                          dl, NVT);
   Hi = DAG.getConstantFP(APFloat(DAG.EVTToAPFloatSemantics(NVT),
-                                 APInt(integerPartWidth, C.getRawData()[0])),
+                                 APInt(64, C.getRawData()[0])),
                          dl, NVT);
 }
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index dc436ce045142d6546489a066a9b2c09e88ea332..85068e890756b0a6ae700835e3ca51cb608a105f 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -690,7 +690,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
   case TargetLowering::TypePromoteInteger:
     Res = GetPromotedInteger(InOp);
     break;
-  case TargetLowering::TypeSplitVector:
+  case TargetLowering::TypeSplitVector: {
     EVT InVT = InOp.getValueType();
     assert(InVT.isVector() && "Cannot split scalar types");
     unsigned NumElts = InVT.getVectorNumElements();
@@ -709,6 +709,26 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
 
     return DAG.getNode(ISD::CONCAT_VECTORS, dl, NVT, EOp1, EOp2);
   }
+  case TargetLowering::TypeWidenVector: {
+    SDValue WideInOp = GetWidenedVector(InOp);
+
+    // Truncate widened InOp.
+    unsigned NumElem = WideInOp.getValueType().getVectorNumElements();
+    EVT TruncVT = EVT::getVectorVT(*DAG.getContext(),
+                                   N->getValueType(0).getScalarType(), NumElem);
+    SDValue WideTrunc = DAG.getNode(ISD::TRUNCATE, dl, TruncVT, WideInOp);
+
+    // Zero extend so that the elements have the same type as those of NVT.
+    EVT ExtVT = EVT::getVectorVT(*DAG.getContext(), NVT.getVectorElementType(),
+                                 NumElem);
+    SDValue WideExt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, WideTrunc);
+
+    // Extract the low NVT subvector.
+    MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+    SDValue ZeroIdx = DAG.getConstant(0, dl, IdxTy);
+    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, WideExt, ZeroIdx);
+  }
+  }
 
   // Truncate to NVT instead of VT
   return DAG.getNode(ISD::TRUNCATE, dl, NVT, Res);
@@ -1089,6 +1109,10 @@ SDValue DAGTypeLegalizer::PromoteIntOp_SELECT(SDNode *N, unsigned OpNo) {
   SDValue Cond = N->getOperand(0);
   EVT OpTy = N->getOperand(1).getValueType();
 
+  if (N->getOpcode() == ISD::VSELECT)
+    if (SDValue Res = WidenVSELECTAndMask(N))
+      return Res;
+
   // Promote all the way up to the canonical SetCC type.
   EVT OpVT = N->getOpcode() == ISD::SELECT ? OpTy.getScalarType() : OpTy;
   Cond = PromoteTargetBoolean(Cond, OpVT);
@@ -2586,24 +2610,25 @@ void DAGTypeLegalizer::ExpandIntRes_XMULO(SDNode *N,
     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
     Entry.Node = Op;
     Entry.Ty = ArgTy;
-    Entry.isSExt = true;
-    Entry.isZExt = false;
+    Entry.IsSExt = true;
+    Entry.IsZExt = false;
     Args.push_back(Entry);
   }
 
   // Also pass the address of the overflow check.
   Entry.Node = Temp;
   Entry.Ty = PtrTy->getPointerTo();
-  Entry.isSExt = true;
-  Entry.isZExt = false;
+  Entry.IsSExt = true;
+  Entry.IsZExt = false;
   Args.push_back(Entry);
 
   SDValue Func = DAG.getExternalSymbol(TLI.getLibcallName(LC), PtrVT);
 
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(dl).setChain(Chain)
-    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args))
-    .setSExtResult();
+  CLI.setDebugLoc(dl)
+      .setChain(Chain)
+      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Func, std::move(Args))
+      .setSExtResult();
 
   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index cf19d75676cdaeadb75479fcf14651dab556aa6a..0a2b680e1c66edc630614a3c39f280db93a358eb 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -199,8 +199,7 @@ bool DAGTypeLegalizer::run() {
   // non-leaves.
   for (SDNode &Node : DAG.allnodes()) {
     if (Node.getNumOperands() == 0) {
-      Node.setNodeId(ReadyToProcess);
-      Worklist.push_back(&Node);
+      AddToWorklist(&Node);
     } else {
       Node.setNodeId(Unanalyzed);
     }
@@ -331,6 +330,12 @@ ScanOperands:
     // to the worklist etc.
     if (NeedsReanalyzing) {
       assert(N->getNodeId() == ReadyToProcess && "Node ID recalculated?");
+
+      // Erase all of N's result values from SoftenedFloats, since N is about
+      // to be revisited.
+      for (unsigned i = 0, NumResults = N->getNumValues(); i < NumResults; ++i)
+        SoftenedFloats.erase(SDValue(N, i));
+
       N->setNodeId(NewNode);
       // Recompute the NodeId and correct processed operands, adding the node to
       // the worklist if ready.
@@ -749,6 +754,8 @@ void DAGTypeLegalizer::ReplaceValueWith(SDValue From, SDValue To) {
     // new uses of From due to CSE. If this happens, replace the new uses of
     // From with To.
   } while (!From.use_empty());
+
+  SoftenedFloats.erase(From);
 }
 
 void DAGTypeLegalizer::SetPromotedInteger(SDValue Op, SDValue Result) {
@@ -1077,8 +1084,8 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node,
     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
     Entry.Node = Node->getOperand(i);
     Entry.Ty = ArgTy;
-    Entry.isSExt = isSigned;
-    Entry.isZExt = !isSigned;
+    Entry.IsSExt = isSigned;
+    Entry.IsZExt = !isSigned;
     Args.push_back(Entry);
   }
   SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC),
@@ -1087,9 +1094,12 @@ DAGTypeLegalizer::ExpandChainLibCall(RTLIB::Libcall LC, SDNode *Node,
   Type *RetTy = Node->getValueType(0).getTypeForEVT(*DAG.getContext());
 
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(SDLoc(Node)).setChain(InChain)
-    .setCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
-    .setSExtResult(isSigned).setZExtResult(!isSigned);
+  CLI.setDebugLoc(SDLoc(Node))
+      .setChain(InChain)
+      .setLibCallee(TLI.getLibcallCallingConv(LC), RetTy, Callee,
+                    std::move(Args))
+      .setSExtResult(isSigned)
+      .setZExtResult(!isSigned);
 
   std::pair<SDValue, SDValue> CallInfo = TLI.LowerCallTo(CLI);
 
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index ec55662d75c0b0aa814d7c9cd2dbd0ceb1473f26..80c939700518f7dbee3262fbad87580afee45627 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -191,6 +191,11 @@ private:
   void SplitInteger(SDValue Op, EVT LoVT, EVT HiVT,
                     SDValue &Lo, SDValue &Hi);
 
+  void AddToWorklist(SDNode *N) { // Flag N ready and queue it for processing.
+    N->setNodeId(ReadyToProcess);
+    Worklist.push_back(N);
+  }
+
   //===--------------------------------------------------------------------===//
   // Integer Promotion Support: LegalizeIntegerTypes.cpp
   //===--------------------------------------------------------------------===//
@@ -597,6 +602,7 @@ private:
   SDValue ScalarizeVecRes_TernaryOp(SDNode *N);
   SDValue ScalarizeVecRes_UnaryOp(SDNode *N);
   SDValue ScalarizeVecRes_InregOp(SDNode *N);
+  SDValue ScalarizeVecRes_VecInregOp(SDNode *N);
 
   SDValue ScalarizeVecRes_BITCAST(SDNode *N);
   SDValue ScalarizeVecRes_BUILD_VECTOR(SDNode *N);
@@ -672,6 +678,7 @@ private:
   SDValue SplitVecOp_BITCAST(SDNode *N);
   SDValue SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N);
   SDValue SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N);
+  SDValue SplitVecOp_ExtVecInRegOp(SDNode *N);
   SDValue SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo);
   SDValue SplitVecOp_MSTORE(MaskedStoreSDNode *N, unsigned OpNo);
   SDValue SplitVecOp_MSCATTER(MaskedScatterSDNode *N, unsigned OpNo);
@@ -713,6 +720,7 @@ private:
   SDValue WidenVecRes_MGATHER(MaskedGatherSDNode* N);
   SDValue WidenVecRes_SCALAR_TO_VECTOR(SDNode* N);
   SDValue WidenVecRes_SELECT(SDNode* N);
+  SDValue WidenVSELECTAndMask(SDNode *N);
   SDValue WidenVecRes_SELECT_CC(SDNode* N);
   SDValue WidenVecRes_SETCC(SDNode* N);
   SDValue WidenVecRes_UNDEF(SDNode *N);
@@ -782,6 +790,13 @@ private:
   /// By default, the vector will be widened with undefined values.
   SDValue ModifyToType(SDValue InOp, EVT NVT, bool FillWithZeroes = false);
 
+  /// Return a mask of vector type MaskVT to replace InMask. Also adjust
+  /// MaskVT to ToMaskVT if needed with vector extension or truncation.
+  SDValue convertMask(SDValue InMask, EVT MaskVT, EVT ToMaskVT);
+
+  /// Get the target mask VT, and widen if needed.
+  EVT getSETCCWidenedResultTy(SDValue SetCC);
+
   //===--------------------------------------------------------------------===//
   // Generic Splitting: LegalizeTypesGeneric.cpp
   //===--------------------------------------------------------------------===//
diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index 3682c32460c66fc9a00de6fc6ac30b426c442ceb..c02b8960b36cbc908281088eafb237a4c9e56579 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -512,8 +512,24 @@ void DAGTypeLegalizer::SplitRes_MERGE_VALUES(SDNode *N, unsigned ResNo,
   GetSplitOp(Op, Lo, Hi);
 }
 
-void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo,
-                                       SDValue &Hi) {
+static std::pair<SDValue, SDValue> SplitVSETCC(const SDNode *N,
+                                               SelectionDAG &DAG) {
+  // Result types for the low and high halves of the split comparison.
+  EVT LoVT, HiVT;
+  std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+
+  // Halve both compared vectors; operand 2 is reused as-is by each half.
+  SDValue LHSLo, LHSHi, RHSLo, RHSHi;
+  std::tie(LHSLo, LHSHi) = DAG.SplitVectorOperand(N, 0);
+  std::tie(RHSLo, RHSHi) = DAG.SplitVectorOperand(N, 1);
+  SDLoc DL(N);
+  const unsigned Opc = N->getOpcode();
+  SDValue Lo = DAG.getNode(Opc, DL, LoVT, LHSLo, RHSLo, N->getOperand(2));
+  SDValue Hi = DAG.getNode(Opc, DL, HiVT, LHSHi, RHSHi, N->getOperand(2));
+  return std::make_pair(Lo, Hi);
+}
+
+void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo, SDValue &Hi) {
   SDValue LL, LH, RL, RH, CL, CH;
   SDLoc dl(N);
   GetSplitOp(N->getOperand(1), LL, LH);
@@ -522,9 +538,16 @@ void DAGTypeLegalizer::SplitRes_SELECT(SDNode *N, SDValue &Lo,
   SDValue Cond = N->getOperand(0);
   CL = CH = Cond;
   if (Cond.getValueType().isVector()) {
+    if (SDValue Res = WidenVSELECTAndMask(N))
+      std::tie(CL, CH) = DAG.SplitVector(Res->getOperand(0), dl);
+    // It seems to improve code to generate two narrow SETCCs as opposed to
+    // splitting a wide result vector.
+    else if (Cond.getOpcode() == ISD::SETCC)
+      std::tie(CL, CH) = SplitVSETCC(Cond.getNode(), DAG);
     // Check if there are already splitted versions of the vector available and
     // use those instead of splitting the mask operand again.
-    if (getTypeAction(Cond.getValueType()) == TargetLowering::TypeSplitVector)
+    else if (getTypeAction(Cond.getValueType()) ==
+             TargetLowering::TypeSplitVector)
       GetSplitVector(Cond, CL, CH);
     else
       std::tie(CL, CH) = DAG.SplitVector(Cond, dl);
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 13fe5e9cf197fa316ad4fc864b22011d6f516a6a..5f167f8de1cfc55da1a6d539741da241b65dc0fe 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -105,6 +105,7 @@ class VectorLegalizer {
   SDValue ExpandLoad(SDValue Op);
   SDValue ExpandStore(SDValue Op);
   SDValue ExpandFNEG(SDValue Op);
+  SDValue ExpandFSUB(SDValue Op);
   SDValue ExpandBITREVERSE(SDValue Op);
   SDValue ExpandCTLZ(SDValue Op);
   SDValue ExpandCTTZ_ZERO_UNDEF(SDValue Op);
@@ -691,6 +692,8 @@ SDValue VectorLegalizer::Expand(SDValue Op) {
     return ExpandUINT_TO_FLOAT(Op);
   case ISD::FNEG:
     return ExpandFNEG(Op);
+  case ISD::FSUB:
+    return ExpandFSUB(Op);
   case ISD::SETCC:
     return UnrollVSETCC(Op);
   case ISD::BITREVERSE:
@@ -1021,6 +1024,18 @@ SDValue VectorLegalizer::ExpandFNEG(SDValue Op) {
   return DAG.UnrollVectorOp(Op.getNode());
 }
 
+SDValue VectorLegalizer::ExpandFSUB(SDValue Op) {
+  // A floating-point a-b can be rewritten as a+(-b). When the target handles
+  // both FNEG and FADD (legal or custom) for this type, leave the node alone
+  // so that LegalizeDAG performs that rewrite; otherwise unroll it here.
+  const EVT VT = Op.getValueType();
+  const bool Defer = TLI.isOperationLegalOrCustom(ISD::FNEG, VT) &&
+                     TLI.isOperationLegalOrCustom(ISD::FADD, VT);
+  if (!Defer)
+    return DAG.UnrollVectorOp(Op.getNode());
+  return Op;
+}
+
 SDValue VectorLegalizer::ExpandCTLZ(SDValue Op) {
   EVT VT = Op.getValueType();
   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 143e293984e21d2b8bbbdcdceacd5441001cf9df..78fddb5ce8f582a96108435091ebc79ecfb45628 100644
--- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -65,6 +65,11 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
   case ISD::SETCC:             R = ScalarizeVecRes_SETCC(N); break;
   case ISD::UNDEF:             R = ScalarizeVecRes_UNDEF(N); break;
   case ISD::VECTOR_SHUFFLE:    R = ScalarizeVecRes_VECTOR_SHUFFLE(N); break;
+  case ISD::ANY_EXTEND_VECTOR_INREG:
+  case ISD::SIGN_EXTEND_VECTOR_INREG:
+  case ISD::ZERO_EXTEND_VECTOR_INREG:
+    R = ScalarizeVecRes_VecInregOp(N);
+    break;
   case ISD::ANY_EXTEND:
   case ISD::BITREVERSE:
   case ISD::BSWAP:
@@ -258,6 +263,34 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_InregOp(SDNode *N) {
                      LHS, DAG.getValueType(ExtVT));
 }
 
+SDValue DAGTypeLegalizer::ScalarizeVecRes_VecInregOp(SDNode *N) {
+  // Obtain the operand as a scalar (its scalarized value, or element 0 of
+  // the vector), then apply the scalar extend matching N's opcode.
+  SDLoc DL(N);
+  SDValue Src = N->getOperand(0);
+  EVT SrcVT = Src.getValueType();
+  EVT ResEltVT = N->getValueType(0).getVectorElementType();
+
+  SDValue Elt;
+  if (getTypeAction(SrcVT) != TargetLowering::TypeScalarizeVector) {
+    SDValue Zero =
+        DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()));
+    Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
+                      SrcVT.getVectorElementType(), Src, Zero);
+  } else {
+    Elt = GetScalarizedVector(Src);
+  }
+
+  unsigned ScalarOpc;
+  switch (N->getOpcode()) {
+  case ISD::ANY_EXTEND_VECTOR_INREG:  ScalarOpc = ISD::ANY_EXTEND; break;
+  case ISD::SIGN_EXTEND_VECTOR_INREG: ScalarOpc = ISD::SIGN_EXTEND; break;
+  case ISD::ZERO_EXTEND_VECTOR_INREG: ScalarOpc = ISD::ZERO_EXTEND; break;
+  default: llvm_unreachable("Illegal extend_vector_inreg opcode");
+  }
+  return DAG.getNode(ScalarOpc, DL, ResEltVT, Elt);
+}
+
 SDValue DAGTypeLegalizer::ScalarizeVecRes_SCALAR_TO_VECTOR(SDNode *N) {
   // If the operand is wider than the vector element type then it is implicitly
   // truncated.  Make that explicit here.
@@ -930,7 +963,12 @@ void DAGTypeLegalizer::SplitVecRes_ExtVecInRegOp(SDNode *N, SDValue &Lo,
 
   SDLoc dl(N);
   SDValue InLo, InHi;
-  GetSplitVector(N0, InLo, InHi);
+
+  if (getTypeAction(N0.getValueType()) == TargetLowering::TypeSplitVector)
+    GetSplitVector(N0, InLo, InHi);
+  else
+    std::tie(InLo, InHi) = DAG.SplitVectorOperand(N, 0);
+
   EVT InLoVT = InLo.getValueType();
   unsigned InNumElements = InLoVT.getVectorNumElements();
 
@@ -1471,6 +1509,12 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) {
     case ISD::FCANONICALIZE:
       Res = SplitVecOp_UnaryOp(N);
       break;
+
+    case ISD::ANY_EXTEND_VECTOR_INREG:
+    case ISD::SIGN_EXTEND_VECTOR_INREG:
+    case ISD::ZERO_EXTEND_VECTOR_INREG:
+      Res = SplitVecOp_ExtVecInRegOp(N);
+      break;
     }
   }
 
@@ -1632,6 +1676,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
                         MachinePointerInfo(), EltVT);
 }
 
+SDValue DAGTypeLegalizer::SplitVecOp_ExtVecInRegOp(SDNode *N) {
+  // A *_EXTEND_VECTOR_INREG node only reads the low half of its input, so
+  // splitting its result is equivalent to splitting its operand.
+  SDValue LoHalf, HiHalf;
+  SplitVecRes_ExtVecInRegOp(N, LoHalf, HiHalf);
+
+  EVT ResVT = N->getValueType(0);
+  return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), ResVT, LoHalf, HiHalf);
+}
+
 SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
                                              unsigned OpNo) {
   EVT LoVT, HiVT;
@@ -2826,6 +2880,212 @@ SDValue DAGTypeLegalizer::WidenVecRes_SCALAR_TO_VECTOR(SDNode *N) {
                      WidenVT, N->getOperand(0));
 }
 
+// Return true for the bitwise logic opcodes (AND, OR, XOR), i.e. the nodes
+// that may combine two SETCC results into a single mask.
+static inline bool isLogicalMaskOp(unsigned Opcode) {
+  const unsigned LogicOpcodes[] = {ISD::AND, ISD::OR, ISD::XOR};
+  for (unsigned LogicOpc : LogicOpcodes)
+    if (Opcode == LogicOpc)
+      return true;
+
+  return false;
+}
+
+// Debug-only helper for the assert in convertMask(): check that N is either
+// a SETCC or a SETCC that was previously converted by convertMask().
+#ifndef NDEBUG
+static inline bool isSETCCorConvertedSETCC(SDValue N) {
+  // Look through EXTRACT_SUBVECTOR, or CONCAT_VECTORS padded only with undef.
+  switch (N.getOpcode()) {
+  case ISD::EXTRACT_SUBVECTOR:
+    N = N.getOperand(0);
+    break;
+  case ISD::CONCAT_VECTORS:
+    for (unsigned Idx = 1; Idx < N->getNumOperands(); ++Idx)
+      if (!N->getOperand(Idx)->isUndef())
+        return false;
+    N = N.getOperand(0);
+    break;
+  }
+  if (N.getOpcode() == ISD::TRUNCATE || N.getOpcode() == ISD::SIGN_EXTEND)
+    N = N.getOperand(0);
+  return N.getOpcode() == ISD::SETCC;
+}
+#endif
+
+// Rebuild InMask with result type MaskVT, then sign-extend/truncate and
+// extract/undef-pad the result as needed so that it has type ToMaskVT.
+SDValue DAGTypeLegalizer::convertMask(SDValue InMask, EVT MaskVT,
+                                      EVT ToMaskVT) {
+  LLVMContext &Ctx = *DAG.getContext();
+
+  // Currently a SETCC or an AND/OR/XOR with two SETCCs are handled.
+  unsigned InMaskOpc = InMask->getOpcode();
+  assert((InMaskOpc == ISD::SETCC ||
+          (isLogicalMaskOp(InMaskOpc) &&
+           isSETCCorConvertedSETCC(InMask->getOperand(0)) &&
+           isSETCCorConvertedSETCC(InMask->getOperand(1)))) &&
+         "Unexpected mask argument.");
+
+  // Recreate the mask node with the same operands but result type MaskVT.
+  SmallVector<SDValue, 4> Ops;
+  for (unsigned i = 0; i < InMask->getNumOperands(); ++i)
+    Ops.push_back(InMask->getOperand(i));
+  SDValue Mask = DAG.getNode(InMaskOpc, SDLoc(InMask), MaskVT, Ops);
+
+  // If MaskVT has smaller or bigger elements than ToMaskVT, a vector sign
+  // extend or truncate is needed.
+  unsigned MaskScalarBits = MaskVT.getScalarSizeInBits();
+  unsigned ToMaskScalBits = ToMaskVT.getScalarSizeInBits();
+  if (MaskScalarBits < ToMaskScalBits) {
+    EVT ExtVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
+                                 MaskVT.getVectorNumElements());
+    Mask = DAG.getNode(ISD::SIGN_EXTEND, SDLoc(Mask), ExtVT, Mask);
+  } else if (MaskScalarBits > ToMaskScalBits) {
+    EVT TruncVT = EVT::getVectorVT(Ctx, ToMaskVT.getVectorElementType(),
+                                   MaskVT.getVectorNumElements());
+    Mask = DAG.getNode(ISD::TRUNCATE, SDLoc(Mask), TruncVT, Mask);
+  }
+
+  assert(Mask->getValueType(0).getScalarSizeInBits() ==
+             ToMaskVT.getScalarSizeInBits() &&
+         "Mask should have the right element size by now.");
+
+  // Adjust the element count: take the low subvector or pad with undef.
+  unsigned CurrMaskNumEls = Mask->getValueType(0).getVectorNumElements();
+  if (CurrMaskNumEls > ToMaskVT.getVectorNumElements()) {
+    MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
+    SDValue ZeroIdx = DAG.getConstant(0, SDLoc(Mask), IdxTy);
+    Mask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Mask), ToMaskVT, Mask,
+                       ZeroIdx);
+  } else if (CurrMaskNumEls < ToMaskVT.getVectorNumElements()) {
+    unsigned NumSubVecs = (ToMaskVT.getVectorNumElements() / CurrMaskNumEls);
+    EVT SubVT = Mask->getValueType(0);
+    SmallVector<SDValue, 16> SubConcatOps(NumSubVecs);
+    SubConcatOps[0] = Mask;
+    for (unsigned i = 1; i < NumSubVecs; ++i)
+      SubConcatOps[i] = DAG.getUNDEF(SubVT);
+    Mask =
+        DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Mask), ToMaskVT, SubConcatOps);
+  }
+
+  assert((Mask->getValueType(0) == ToMaskVT) &&
+         "A mask of ToMaskVT should have been produced by now.");
+
+  return Mask;
+}
+
+// Return the SETCC result type for SetCC's operand type, widened if needed.
+EVT DAGTypeLegalizer::getSETCCWidenedResultTy(SDValue SetCC) {
+  assert(SetCC->getOpcode() == ISD::SETCC);
+  EVT OpVT = SetCC->getOperand(0).getValueType();
+  EVT ResVT = getSetCCResultType(OpVT);
+  if (getTypeAction(ResVT) != TargetLowering::TypeWidenVector)
+    return ResVT;
+  return TLI.getTypeToTransformTo(*DAG.getContext(), ResVT);
+}
+
+// Try to legalize a VSELECT together with its mask: widen the operands if
+// needed and convert the mask (a SETCC, or an AND/OR/XOR of two SETCCs) to
+// the VSELECT's integer vector type, which avoids scalarizing the SETCC.
+// Returns an empty SDValue if N is not handled here.
+SDValue DAGTypeLegalizer::WidenVSELECTAndMask(SDNode *N) {
+  LLVMContext &Ctx = *DAG.getContext();
+  SDValue Cond = N->getOperand(0);
+
+  if (N->getOpcode() != ISD::VSELECT)
+    return SDValue();
+
+  if (Cond->getOpcode() != ISD::SETCC && !isLogicalMaskOp(Cond->getOpcode()))
+    return SDValue();
+
+  // If the mask elements are not i1 (e.g. a split VSELECT whose mask was
+  // already converted by an earlier call), do nothing.
+  if (Cond->getValueType(0).getScalarSizeInBits() != 1)
+    return SDValue();
+
+  EVT VSelVT = N->getValueType(0);
+  // Only handle vector types whose size in bits is a power of 2.
+  if (!isPowerOf2_64(VSelVT.getSizeInBits()))
+    return SDValue();
+
+  // Don't touch if repeated splitting ends at a single-element vector.
+  EVT FinalVT = VSelVT;
+  while (getTypeAction(FinalVT) == TargetLowering::TypeSplitVector)
+    FinalVT = EVT::getVectorVT(Ctx, FinalVT.getVectorElementType(),
+                               FinalVT.getVectorNumElements() / 2);
+  if (FinalVT.getVectorNumElements() == 1)
+    return SDValue();
+
+  // If the legalized SETCC operand type yields an i1 vector mask, don't touch.
+  if (Cond.getOpcode() == ISD::SETCC) {
+    EVT SetCCOpVT = Cond->getOperand(0).getValueType();
+    while (TLI.getTypeAction(Ctx, SetCCOpVT) != TargetLowering::TypeLegal)
+      SetCCOpVT = TLI.getTypeToTransformTo(Ctx, SetCCOpVT);
+    EVT SetCCResVT = getSetCCResultType(SetCCOpVT);
+    if (SetCCResVT.getScalarSizeInBits() == 1)
+      return SDValue();
+  }
+
+  // Get the VT and operands for VSELECT, and widen if needed.
+  SDValue VSelOp1 = N->getOperand(1);
+  SDValue VSelOp2 = N->getOperand(2);
+  if (getTypeAction(VSelVT) == TargetLowering::TypeWidenVector) {
+    VSelVT = TLI.getTypeToTransformTo(Ctx, VSelVT);
+    VSelOp1 = GetWidenedVector(VSelOp1);
+    VSelOp2 = GetWidenedVector(VSelOp2);
+  }
+
+  // The mask of the VSELECT should have integer elements.
+  EVT ToMaskVT = VSelVT;
+  if (!ToMaskVT.getScalarType().isInteger())
+    ToMaskVT = ToMaskVT.changeVectorElementTypeToInteger();
+
+  SDValue Mask;
+  if (Cond->getOpcode() == ISD::SETCC) {
+    EVT MaskVT = getSETCCWidenedResultTy(Cond);
+    Mask = convertMask(Cond, MaskVT, ToMaskVT);
+  } else if (isLogicalMaskOp(Cond->getOpcode()) &&
+             Cond->getOperand(0).getOpcode() == ISD::SETCC &&
+             Cond->getOperand(1).getOpcode() == ISD::SETCC) {
+    // Cond is (AND/OR/XOR (SETCC, SETCC))
+    SDValue SETCC0 = Cond->getOperand(0);
+    SDValue SETCC1 = Cond->getOperand(1);
+    EVT VT0 = getSETCCWidenedResultTy(SETCC0);
+    EVT VT1 = getSETCCWidenedResultTy(SETCC1);
+    unsigned ScalarBits0 = VT0.getScalarSizeInBits();
+    unsigned ScalarBits1 = VT1.getScalarSizeInBits();
+    unsigned ScalarBits_ToMask = ToMaskVT.getScalarSizeInBits();
+    EVT MaskVT;
+    // If the two SETCCs have different VTs, either extend/truncate one of
+    // them to the other "towards" ToMaskVT, or truncate one and extend the
+    // other to ToMaskVT.
+    if (ScalarBits0 != ScalarBits1) {
+      EVT NarrowVT = ((ScalarBits0 < ScalarBits1) ? VT0 : VT1);
+      EVT WideVT = ((NarrowVT == VT0) ? VT1 : VT0);
+      if (ScalarBits_ToMask >= WideVT.getScalarSizeInBits())
+        MaskVT = WideVT;
+      else if (ScalarBits_ToMask <= NarrowVT.getScalarSizeInBits())
+        MaskVT = NarrowVT;
+      else
+        MaskVT = ToMaskVT;
+    } else
+      // If the two SETCCs have the same VT, don't change it.
+      MaskVT = VT0;
+
+    // Make new SETCCs and logical nodes.
+    SETCC0 = convertMask(SETCC0, VT0, MaskVT);
+    SETCC1 = convertMask(SETCC1, VT1, MaskVT);
+    Cond = DAG.getNode(Cond->getOpcode(), SDLoc(Cond), MaskVT, SETCC0, SETCC1);
+
+    // Convert the logical op for VSELECT if needed.
+    Mask = convertMask(Cond, MaskVT, ToMaskVT);
+  } else
+    return SDValue();
+
+  return DAG.getNode(ISD::VSELECT, SDLoc(N), VSelVT, Mask, VSelOp1, VSelOp2);
+}
+
 SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
   EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
   unsigned WidenNumElts = WidenVT.getVectorNumElements();
@@ -2833,6 +3093,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_SELECT(SDNode *N) {
   SDValue Cond1 = N->getOperand(0);
   EVT CondVT = Cond1.getValueType();
   if (CondVT.isVector()) {
+    if (SDValue Res = WidenVSELECTAndMask(N))
+      return Res;
+
     EVT CondEltVT = CondVT.getVectorElementType();
     EVT CondWidenVT =  EVT::getVectorVT(*DAG.getContext(),
                                         CondEltVT, WidenNumElts);
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index a0e2fb62f1dee7395107324c8d44fef85d865012..e923e30e5037732c5087e83c7117bfcbe5d0a793 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -422,11 +422,9 @@ static bool IsChainDependent(SDNode *Outer, SDNode *Inner,
     }
     // Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END.
     if (N->isMachineOpcode()) {
-      if (N->getMachineOpcode() ==
-          (unsigned)TII->getCallFrameDestroyOpcode()) {
+      if (N->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
         ++NestLevel;
-      } else if (N->getMachineOpcode() ==
-                 (unsigned)TII->getCallFrameSetupOpcode()) {
+      } else if (N->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
         if (NestLevel == 0)
           return false;
         --NestLevel;
@@ -480,12 +478,10 @@ FindCallSeqStart(SDNode *N, unsigned &NestLevel, unsigned &MaxNest,
     }
     // Check for a lowered CALLSEQ_BEGIN or CALLSEQ_END.
     if (N->isMachineOpcode()) {
-      if (N->getMachineOpcode() ==
-          (unsigned)TII->getCallFrameDestroyOpcode()) {
+      if (N->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
         ++NestLevel;
         MaxNest = std::max(MaxNest, NestLevel);
-      } else if (N->getMachineOpcode() ==
-                 (unsigned)TII->getCallFrameSetupOpcode()) {
+      } else if (N->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
         assert(NestLevel != 0);
         --NestLevel;
         if (NestLevel == 0)
@@ -550,7 +546,7 @@ void ScheduleDAGRRList::ReleasePredecessors(SUnit *SU) {
   if (!LiveRegDefs[CallResource])
     for (SDNode *Node = SU->getNode(); Node; Node = Node->getGluedNode())
       if (Node->isMachineOpcode() &&
-          Node->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) {
+          Node->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
         unsigned NestLevel = 0;
         unsigned MaxNest = 0;
         SDNode *N = FindCallSeqStart(Node, NestLevel, MaxNest, TII);
@@ -755,7 +751,7 @@ void ScheduleDAGRRList::ScheduleNodeBottomUp(SUnit *SU) {
     for (const SDNode *SUNode = SU->getNode(); SUNode;
          SUNode = SUNode->getGluedNode()) {
       if (SUNode->isMachineOpcode() &&
-          SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameSetupOpcode()) {
+          SUNode->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
         assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
         --NumLiveRegs;
         LiveRegDefs[CallResource] = nullptr;
@@ -826,7 +822,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) {
   for (const SDNode *SUNode = SU->getNode(); SUNode;
        SUNode = SUNode->getGluedNode()) {
     if (SUNode->isMachineOpcode() &&
-        SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameSetupOpcode()) {
+        SUNode->getMachineOpcode() == TII->getCallFrameSetupOpcode()) {
       ++NumLiveRegs;
       LiveRegDefs[CallResource] = SU;
       LiveRegGens[CallResource] = CallSeqEndForStart[SU];
@@ -839,7 +835,7 @@ void ScheduleDAGRRList::UnscheduleNodeBottomUp(SUnit *SU) {
     for (const SDNode *SUNode = SU->getNode(); SUNode;
          SUNode = SUNode->getGluedNode()) {
       if (SUNode->isMachineOpcode() &&
-          SUNode->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) {
+          SUNode->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) {
         assert(NumLiveRegs > 0 && "NumLiveRegs is already zero!");
         --NumLiveRegs;
         LiveRegDefs[CallResource] = nullptr;
@@ -1305,7 +1301,8 @@ DelayForLiveRegsBottomUp(SUnit *SU, SmallVectorImpl<unsigned> &LRegs) {
     // If we're in the middle of scheduling a call, don't begin scheduling
     // another call. Also, don't allow any physical registers to be live across
     // the call.
-    if (Node->getMachineOpcode() == (unsigned)TII->getCallFrameDestroyOpcode()) {
+    if ((Node->getMachineOpcode() == TII->getCallFrameDestroyOpcode()) ||
+        (Node->getMachineOpcode() == TII->getCallFrameSetupOpcode())) {
       // Check the special calling-sequence resource.
       unsigned CallResource = TRI->getNumRegs();
       if (LiveRegDefs[CallResource]) {
diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 25324aa442b4a9824463e4964b11da3a69eae38e..3c8526ebb702985be7fbeb0d98ee727f66e4bf31 100644
--- a/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -836,8 +836,7 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
       GluedNodes.push_back(N);
     while (!GluedNodes.empty()) {
       SDNode *N = GluedNodes.back();
-      Emitter.EmitNode(GluedNodes.back(), SU->OrigNode != SU, SU->isCloned,
-                       VRBaseMap);
+      Emitter.EmitNode(N, SU->OrigNode != SU, SU->isCloned, VRBaseMap);
       // Remember the source order of the inserted instruction.
       if (HasDbg)
         ProcessSourceNode(N, DAG, Emitter, VRBaseMap, Orders, Seen);
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 46dde01970b9bd30171f22268fd9c92150da41ab..003ea5030bfce21241550e0da29349732c61815c 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -289,28 +289,28 @@ static int isSignedOp(ISD::CondCode Opcode) {
 }
 
 ISD::CondCode ISD::getSetCCOrOperation(ISD::CondCode Op1, ISD::CondCode Op2,
-                                       bool isInteger) {
-  if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
+                                       bool IsInteger) {
+  if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
     // Cannot fold a signed integer setcc with an unsigned integer setcc.
     return ISD::SETCC_INVALID;
 
   unsigned Op = Op1 | Op2;  // Combine all of the condition bits.
 
-  // If the N and U bits get set then the resultant comparison DOES suddenly
-  // care about orderedness, and is true when ordered.
+  // If the N and U bits get set, then the resultant comparison DOES suddenly
+  // care about orderedness, and it is true when ordered.
   if (Op > ISD::SETTRUE2)
     Op &= ~16;     // Clear the U bit if the N bit is set.
 
   // Canonicalize illegal integer setcc's.
-  if (isInteger && Op == ISD::SETUNE)  // e.g. SETUGT | SETULT
+  if (IsInteger && Op == ISD::SETUNE)  // e.g. SETUGT | SETULT
     Op = ISD::SETNE;
 
   return ISD::CondCode(Op);
 }
 
 ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,
-                                        bool isInteger) {
-  if (isInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
+                                        bool IsInteger) {
+  if (IsInteger && (isSignedOp(Op1) | isSignedOp(Op2)) == 3)
     // Cannot fold a signed setcc with an unsigned setcc.
     return ISD::SETCC_INVALID;
 
@@ -318,7 +318,7 @@ ISD::CondCode ISD::getSetCCAndOperation(ISD::CondCode Op1, ISD::CondCode Op2,
   ISD::CondCode Result = ISD::CondCode(Op1 & Op2);
 
   // Canonicalize illegal integer setcc's.
-  if (isInteger) {
+  if (IsInteger) {
     switch (Result) {
     default: break;
     case ISD::SETUO : Result = ISD::SETFALSE; break;  // SETUGT & SETULT
@@ -871,11 +871,13 @@ SelectionDAG::SelectionDAG(const TargetMachine &tm, CodeGenOpt::Level OL)
   DbgInfo = new SDDbgInfo();
 }
 
-void SelectionDAG::init(MachineFunction &mf) {
-  MF = &mf;
+void SelectionDAG::init(MachineFunction &NewMF,
+                        OptimizationRemarkEmitter &NewORE) {
+  MF = &NewMF;
+  ORE = &NewORE;
   TLI = getSubtarget().getTargetLowering();
   TSI = getSubtarget().getSelectionDAGInfo();
-  Context = &mf.getFunction()->getContext();
+  Context = &MF->getFunction()->getContext();
 }
 
 SelectionDAG::~SelectionDAG() {
@@ -1994,8 +1996,6 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
 /// them in the KnownZero/KnownOne bitsets. The DemandedElts argument allows
 /// us to only collect the known bits that are shared by the requested vector
 /// elements.
-/// TODO: We only support DemandedElts on a few opcodes so far, the remainder
-/// should be added when they become necessary.
 void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
                                     APInt &KnownOne, const APInt &DemandedElts,
                                     unsigned Depth) const {
@@ -2251,10 +2251,9 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
                                KnownZero2.countLeadingOnes(),
                                BitWidth) - BitWidth;
 
-    TrailZ = std::min(TrailZ, BitWidth);
-    LeadZ = std::min(LeadZ, BitWidth);
-    KnownZero = APInt::getLowBitsSet(BitWidth, TrailZ) |
-                APInt::getHighBitsSet(BitWidth, LeadZ);
+    KnownZero.clearAllBits();
+    KnownZero.setLowBits(std::min(TrailZ, BitWidth));
+    KnownZero.setHighBits(std::min(LeadZ, BitWidth));
     break;
   }
   case ISD::UDIV: {
@@ -2272,7 +2271,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
       LeadZ = std::min(BitWidth,
                        LeadZ + BitWidth - RHSUnknownLeadingOnes - 1);
 
-    KnownZero = APInt::getHighBitsSet(BitWidth, LeadZ);
+    KnownZero.setHighBits(LeadZ);
     break;
   }
   case ISD::SELECT:
@@ -2297,10 +2296,6 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
     KnownOne &= KnownOne2;
     KnownZero &= KnownZero2;
     break;
-  case ISD::SADDO:
-  case ISD::UADDO:
-  case ISD::SSUBO:
-  case ISD::USUBO:
   case ISD::SMULO:
   case ISD::UMULO:
     if (Op.getResNo() != 1)
@@ -2312,14 +2307,14 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
     if (TLI->getBooleanContents(Op.getValueType().isVector(), false) ==
             TargetLowering::ZeroOrOneBooleanContent &&
         BitWidth > 1)
-      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+      KnownZero.setBitsFrom(1);
     break;
   case ISD::SETCC:
     // If we know the result of a setcc has the top bits zero, use this info.
     if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
             TargetLowering::ZeroOrOneBooleanContent &&
         BitWidth > 1)
-      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+      KnownZero.setBitsFrom(1);
     break;
   case ISD::SHL:
     if (const APInt *ShAmt = getValidShiftAmountConstant(Op)) {
@@ -2328,7 +2323,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
       KnownZero = KnownZero << *ShAmt;
       KnownOne = KnownOne << *ShAmt;
       // Low bits are known zero.
-      KnownZero |= APInt::getLowBitsSet(BitWidth, ShAmt->getZExtValue());
+      KnownZero.setLowBits(ShAmt->getZExtValue());
     }
     break;
   case ISD::SRL:
@@ -2338,8 +2333,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
       KnownZero = KnownZero.lshr(*ShAmt);
       KnownOne  = KnownOne.lshr(*ShAmt);
       // High bits are known zero.
-      APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt->getZExtValue());
-      KnownZero |= HighBits;
+      KnownZero.setHighBits(ShAmt->getZExtValue());
     }
     break;
   case ISD::SRA:
@@ -2350,13 +2344,12 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
       KnownOne  = KnownOne.lshr(*ShAmt);
       // If we know the value of the sign bit, then we know it is copied across
       // the high bits by the shift amount.
-      APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt->getZExtValue());
       APInt SignBit = APInt::getSignBit(BitWidth);
       SignBit = SignBit.lshr(*ShAmt);  // Adjust to where it is now in the mask.
       if (KnownZero.intersects(SignBit)) {
-        KnownZero |= HighBits;  // New bits are known zero.
+        KnownZero.setHighBits(ShAmt->getZExtValue());// New bits are known zero.
       } else if (KnownOne.intersects(SignBit)) {
-        KnownOne  |= HighBits;  // New bits are known one.
+        KnownOne.setHighBits(ShAmt->getZExtValue()); // New bits are known one.
       }
     }
     break;
@@ -2401,9 +2394,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
   case ISD::CTLZ:
   case ISD::CTLZ_ZERO_UNDEF:
   case ISD::CTPOP: {
-    unsigned LowBits = Log2_32(BitWidth)+1;
-    KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - LowBits);
-    KnownOne.clearAllBits();
+    KnownZero.setBitsFrom(Log2_32(BitWidth)+1);
     break;
   }
   case ISD::LOAD: {
@@ -2412,26 +2403,39 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
     if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) {
       EVT VT = LD->getMemoryVT();
       unsigned MemBits = VT.getScalarSizeInBits();
-      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - MemBits);
+      KnownZero.setBitsFrom(MemBits);
     } else if (const MDNode *Ranges = LD->getRanges()) {
       if (LD->getExtensionType() == ISD::NON_EXTLOAD)
         computeKnownBitsFromRangeMetadata(*Ranges, KnownZero, KnownOne);
     }
     break;
   }
+  case ISD::ZERO_EXTEND_VECTOR_INREG: {
+    EVT InVT = Op.getOperand(0).getValueType();
+    unsigned InBits = InVT.getScalarSizeInBits();
+    KnownZero = KnownZero.trunc(InBits);
+    KnownOne = KnownOne.trunc(InBits);
+    computeKnownBits(Op.getOperand(0), KnownZero, KnownOne,
+                     DemandedElts.zext(InVT.getVectorNumElements()),
+                     Depth + 1);
+    KnownZero = KnownZero.zext(BitWidth);
+    KnownOne = KnownOne.zext(BitWidth);
+    KnownZero.setBitsFrom(InBits);
+    break;
+  }
   case ISD::ZERO_EXTEND: {
     EVT InVT = Op.getOperand(0).getValueType();
     unsigned InBits = InVT.getScalarSizeInBits();
-    APInt NewBits   = APInt::getHighBitsSet(BitWidth, BitWidth - InBits);
     KnownZero = KnownZero.trunc(InBits);
     KnownOne = KnownOne.trunc(InBits);
     computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,
                      Depth + 1);
     KnownZero = KnownZero.zext(BitWidth);
     KnownOne = KnownOne.zext(BitWidth);
-    KnownZero |= NewBits;
+    KnownZero.setBitsFrom(InBits);
     break;
   }
+  // TODO ISD::SIGN_EXTEND_VECTOR_INREG
   case ISD::SIGN_EXTEND: {
     EVT InVT = Op.getOperand(0).getValueType();
     unsigned InBits = InVT.getScalarSizeInBits();
@@ -2478,10 +2482,21 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
   }
   case ISD::FGETSIGN:
     // All bits are zero except the low bit.
-    KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+    KnownZero.setBitsFrom(1);
     break;
-
-  case ISD::SUB: {
+  case ISD::USUBO:
+  case ISD::SSUBO:
+    if (Op.getResNo() == 1) {
+      // If we know the result of a setcc has the top bits zero, use this info.
+      if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
+              TargetLowering::ZeroOrOneBooleanContent &&
+          BitWidth > 1)
+        KnownZero.setBitsFrom(1);
+      break;
+    }
+    LLVM_FALLTHROUGH;
+  case ISD::SUB:
+  case ISD::SUBC: {
     if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0))) {
       // We know that the top bits of C-X are clear if X contains less bits
       // than C (i.e. no wrap-around can happen).  For example, 20-X is
@@ -2499,13 +2514,40 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
         if ((KnownZero2 & MaskV) == MaskV) {
           unsigned NLZ2 = CLHS->getAPIntValue().countLeadingZeros();
           // Top bits known zero.
-          KnownZero = APInt::getHighBitsSet(BitWidth, NLZ2);
+          KnownZero.setHighBits(NLZ2);
         }
       }
     }
-    LLVM_FALLTHROUGH;
+
+    // If low bits are known to be zero in both operands, then we know they are
+    // going to be 0 in the result. Both addition and complement operations
+    // preserve the low zero bits.
+    computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts,
+                     Depth + 1);
+    unsigned KnownZeroLow = KnownZero2.countTrailingOnes();
+    if (KnownZeroLow == 0)
+      break;
+
+    computeKnownBits(Op.getOperand(1), KnownZero2, KnownOne2, DemandedElts,
+                     Depth + 1);
+    KnownZeroLow = std::min(KnownZeroLow,
+                            KnownZero2.countTrailingOnes());
+    KnownZero.setBits(0, KnownZeroLow);
+    break;
   }
+  case ISD::UADDO:
+  case ISD::SADDO:
+    if (Op.getResNo() == 1) {
+      // If we know the overflow result has the top bits zero, use this info.
+      if (TLI->getBooleanContents(Op.getOperand(0).getValueType()) ==
+              TargetLowering::ZeroOrOneBooleanContent &&
+          BitWidth > 1)
+        KnownZero.setBitsFrom(1);
+      break;
+    }
+    LLVM_FALLTHROUGH;
   case ISD::ADD:
+  case ISD::ADDC:
   case ISD::ADDE: {
     // Output known-0 bits are known if clear or set in both the low clear bits
     // common to both LHS & RHS.  For example, 8+(X<<3) is known to have the
@@ -2526,19 +2568,19 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
     KnownZeroLow = std::min(KnownZeroLow,
                             KnownZero2.countTrailingOnes());
 
-    if (Opcode == ISD::ADD) {
-      KnownZero |= APInt::getLowBitsSet(BitWidth, KnownZeroLow);
-      if (KnownZeroHigh > 1)
-        KnownZero |= APInt::getHighBitsSet(BitWidth, KnownZeroHigh - 1);
+    if (Opcode == ISD::ADDE) {
+      // With ADDE, a carry bit may be added in, so we can only use this
+      // information if we know (at least) that the low two bits are clear.
+      // We then return to the caller that the low bit is unknown but that
+      // other bits are known zero.
+      if (KnownZeroLow >= 2)
+        KnownZero.setBits(1, KnownZeroLow);
       break;
     }
 
-    // With ADDE, a carry bit may be added in, so we can only use this
-    // information if we know (at least) that the low two bits are clear.  We
-    // then return to the caller that the low bit is unknown but that other bits
-    // are known zero.
-    if (KnownZeroLow >= 2) // ADDE
-      KnownZero |= APInt::getBitsSet(BitWidth, 1, KnownZeroLow);
+    KnownZero.setLowBits(KnownZeroLow);
+    if (KnownZeroHigh > 1)
+      KnownZero.setHighBits(KnownZeroHigh - 1);
     break;
   }
   case ISD::SREM:
@@ -2591,7 +2633,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
     uint32_t Leaders = std::max(KnownZero.countLeadingOnes(),
                                 KnownZero2.countLeadingOnes());
     KnownOne.clearAllBits();
-    KnownZero = APInt::getHighBitsSet(BitWidth, Leaders);
+    KnownZero.clearAllBits();
+    KnownZero.setHighBits(Leaders);
     break;
   }
   case ISD::EXTRACT_ELEMENT: {
@@ -2687,6 +2730,26 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
     KnownOne = KnownOne2.byteSwap();
     break;
   }
+  case ISD::ABS: {
+    computeKnownBits(Op.getOperand(0), KnownZero2, KnownOne2, DemandedElts,
+                     Depth + 1);
+
+    // If the source's MSB is zero then we know the rest of the bits already.
+    if (KnownZero2[BitWidth - 1]) {
+      KnownZero = KnownZero2;
+      KnownOne = KnownOne2;
+      break;
+    }
+
+    // We only know that the absolute value's MSB will be zero iff there is
+    // a set bit that isn't the sign bit (otherwise it could be INT_MIN).
+    KnownOne2.clearBit(BitWidth - 1);
+    if (KnownOne2.getBoolValue()) {
+      KnownZero = APInt::getSignBit(BitWidth);
+      break;
+    }
+    break;
+  }
   case ISD::UMIN: {
     computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, DemandedElts,
                      Depth + 1);
@@ -2700,7 +2763,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
 
     KnownZero &= KnownZero2;
     KnownOne &= KnownOne2;
-    KnownZero |= APInt::getHighBitsSet(BitWidth, LeadZero);
+    KnownZero.setHighBits(LeadZero);
     break;
   }
   case ISD::UMAX: {
@@ -2716,7 +2779,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
 
     KnownZero &= KnownZero2;
     KnownOne &= KnownOne2;
-    KnownOne |= APInt::getHighBitsSet(BitWidth, LeadOne);
+    KnownOne.setHighBits(LeadOne);
     break;
   }
   case ISD::SMIN:
@@ -2736,7 +2799,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
   case ISD::TargetFrameIndex:
     if (unsigned Align = InferPtrAlignment(Op)) {
       // The low bits are known zero if the pointer is aligned.
-      KnownZero = APInt::getLowBitsSet(BitWidth, Log2_32(Align));
+      KnownZero.setLowBits(Log2_32(Align));
       break;
     }
     break;
@@ -2749,13 +2812,48 @@ void SelectionDAG::computeKnownBits(SDValue Op, APInt &KnownZero,
   case ISD::INTRINSIC_W_CHAIN:
   case ISD::INTRINSIC_VOID:
     // Allow the target to implement this method for its nodes.
-    TLI->computeKnownBitsForTargetNode(Op, KnownZero, KnownOne, *this, Depth);
+    TLI->computeKnownBitsForTargetNode(Op, KnownZero, KnownOne, DemandedElts,
+                                       *this, Depth);
     break;
   }
 
   assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
 }
 
+SelectionDAG::OverflowKind SelectionDAG::computeOverflowKind(SDValue N0,
+                                                             SDValue N1) const {
+  // X + 0 never overflows
+  if (isNullConstant(N1))
+    return OFK_Never;
+
+  APInt N1Zero, N1One;
+  computeKnownBits(N1, N1Zero, N1One);
+  if (N1Zero.getBoolValue()) {
+    APInt N0Zero, N0One;
+    computeKnownBits(N0, N0Zero, N0One);
+
+    bool overflow;
+    (~N0Zero).uadd_ov(~N1Zero, overflow);
+    if (!overflow)
+      return OFK_Never;
+  }
+
+  // mulhi + 1 never overflows
+  if (N0.getOpcode() == ISD::UMUL_LOHI && N0.getResNo() == 1 &&
+      (~N1Zero & 0x01) == ~N1Zero)
+    return OFK_Never;
+
+  if (N1.getOpcode() == ISD::UMUL_LOHI && N1.getResNo() == 1) {
+    APInt N0Zero, N0One;
+    computeKnownBits(N0, N0Zero, N0One);
+
+    if ((~N0Zero & 0x01) == ~N0Zero)
+      return OFK_Never;
+  }
+
+  return OFK_Sometime;
+}
+
 bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
   EVT OpVT = Val.getValueType();
   unsigned BitWidth = OpVT.getScalarSizeInBits();
@@ -2801,6 +2899,15 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
 
 unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
   EVT VT = Op.getValueType();
+  APInt DemandedElts = VT.isVector()
+                           ? APInt::getAllOnesValue(VT.getVectorNumElements())
+                           : APInt(1, 1);
+  return ComputeNumSignBits(Op, DemandedElts, Depth);
+}
+
+unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
+                                          unsigned Depth) const {
+  EVT VT = Op.getValueType();
   assert(VT.isInteger() && "Invalid VT!");
   unsigned VTBits = VT.getScalarSizeInBits();
   unsigned Tmp, Tmp2;
@@ -2809,6 +2916,9 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
   if (Depth == 6)
     return 1;  // Limit search depth.
 
+  if (!DemandedElts)
+    return 1;  // No demanded elts, better to assume we don't know anything.
+
   switch (Op.getOpcode()) {
   default: break;
   case ISD::AssertSext:
@@ -2823,7 +2933,28 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
     return Val.getNumSignBits();
   }
 
+  case ISD::BUILD_VECTOR:
+    Tmp = VTBits;
+    for (unsigned i = 0, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) {
+      if (!DemandedElts[i])
+        continue;
+
+      SDValue SrcOp = Op.getOperand(i);
+      Tmp2 = ComputeNumSignBits(Op.getOperand(i), Depth + 1);
+
+      // BUILD_VECTOR can implicitly truncate sources, we must handle this.
+      if (SrcOp.getValueSizeInBits() != VTBits) {
+        assert(SrcOp.getValueSizeInBits() > VTBits &&
+               "Expected BUILD_VECTOR implicit truncation");
+        unsigned ExtraBits = SrcOp.getValueSizeInBits() - VTBits;
+        Tmp2 = (Tmp2 > ExtraBits ? Tmp2 - ExtraBits : 1);
+      }
+      Tmp = std::min(Tmp, Tmp2);
+    }
+    return Tmp;
+
   case ISD::SIGN_EXTEND:
+  case ISD::SIGN_EXTEND_VECTOR_INREG:
     Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits();
     return ComputeNumSignBits(Op.getOperand(0), Depth+1) + Tmp;
 
@@ -2836,7 +2967,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
     return std::max(Tmp, Tmp2);
 
   case ISD::SRA:
-    Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
+    Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth+1);
     // SRA X, C   -> adds C sign bits.
     if (ConstantSDNode *C = isConstOrConstSplat(Op.getOperand(1))) {
       APInt ShiftVal = C->getAPIntValue();
@@ -2924,6 +3055,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
     }
     break;
   case ISD::ADD:
+  case ISD::ADDC:
     // Add can have at most one carry bit.  Thus we know that the output
     // is, at worst, one more bit than the inputs.
     Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
@@ -2998,19 +3130,63 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
     // result. Otherwise it gives either negative or > bitwidth result
     return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0);
   }
+  case ISD::INSERT_VECTOR_ELT: {
+    SDValue InVec = Op.getOperand(0);
+    SDValue InVal = Op.getOperand(1);
+    SDValue EltNo = Op.getOperand(2);
+    unsigned NumElts = InVec.getValueType().getVectorNumElements();
+
+    ConstantSDNode *CEltNo = dyn_cast<ConstantSDNode>(EltNo);
+    if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) {
+      // If we know the element index, split the demand between the
+      // source vector and the inserted element.
+      unsigned EltIdx = CEltNo->getZExtValue();
+
+      // If we demand the inserted element then get its sign bits.
+      Tmp = UINT_MAX;
+      if (DemandedElts[EltIdx])
+        Tmp = ComputeNumSignBits(InVal, Depth + 1);
+
+      // If we demand the source vector then get its sign bits, and determine
+      // the minimum.
+      APInt VectorElts = DemandedElts;
+      VectorElts.clearBit(EltIdx);
+      if (!!VectorElts) {
+        Tmp2 = ComputeNumSignBits(InVec, VectorElts, Depth + 1);
+        Tmp = std::min(Tmp, Tmp2);
+      }
+    } else {
+      // Unknown element index, so ignore DemandedElts and demand them all.
+      Tmp = ComputeNumSignBits(InVec, Depth + 1);
+      Tmp2 = ComputeNumSignBits(InVal, Depth + 1);
+      Tmp = std::min(Tmp, Tmp2);
+    }
+    assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
+    return Tmp;
+  }
   case ISD::EXTRACT_VECTOR_ELT: {
-    // At the moment we keep this simple and skip tracking the specific
-    // element. This way we get the lowest common denominator for all elements
-    // of the vector.
-    // TODO: get information for given vector element
+    SDValue InVec = Op.getOperand(0);
+    SDValue EltNo = Op.getOperand(1);
+    EVT VecVT = InVec.getValueType();
     const unsigned BitWidth = Op.getValueSizeInBits();
     const unsigned EltBitWidth = Op.getOperand(0).getScalarValueSizeInBits();
+    const unsigned NumSrcElts = VecVT.getVectorNumElements();
+
     // If BitWidth > EltBitWidth the value is anyext:ed, and we do not know
     // anything about sign bits. But if the sizes match we can derive knowledge
     // about sign bits from the vector operand.
-    if (BitWidth == EltBitWidth)
-      return ComputeNumSignBits(Op.getOperand(0), Depth+1);
-    break;
+    if (BitWidth != EltBitWidth)
+      break;
+
+    // If we know the element index, just demand that vector element, else for
+    // an unknown element index, ignore DemandedElts and demand them all.
+    APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts);
+    ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
+    if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts))
+      DemandedSrcElts =
+          APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue());
+
+    return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1);
   }
   case ISD::EXTRACT_SUBVECTOR:
     return ComputeNumSignBits(Op.getOperand(0), Depth + 1);
@@ -3045,14 +3221,16 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
       Op.getOpcode() == ISD::INTRINSIC_WO_CHAIN ||
       Op.getOpcode() == ISD::INTRINSIC_W_CHAIN ||
       Op.getOpcode() == ISD::INTRINSIC_VOID) {
-    unsigned NumBits = TLI->ComputeNumSignBitsForTargetNode(Op, *this, Depth);
-    if (NumBits > 1) FirstAnswer = std::max(FirstAnswer, NumBits);
+    unsigned NumBits =
+        TLI->ComputeNumSignBitsForTargetNode(Op, DemandedElts, *this, Depth);
+    if (NumBits > 1)
+      FirstAnswer = std::max(FirstAnswer, NumBits);
   }
 
   // Finally, if we can prove that the top bits of the result are 0's or 1's,
   // use this information.
   APInt KnownZero, KnownOne;
-  computeKnownBits(Op, KnownZero, KnownOne, Depth);
+  computeKnownBits(Op, KnownZero, KnownOne, DemandedElts, Depth);
 
   APInt Mask;
   if (KnownZero.isNegative()) {        // sign bit is 0
@@ -3246,6 +3424,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
       if (VT == MVT::f128 && C->getValueType(0) == MVT::i128)
         return getConstantFP(APFloat(APFloat::IEEEquad(), Val), DL, VT);
       break;
+    case ISD::ABS:
+      return getConstant(Val.abs(), DL, VT, C->isTargetOpcode(),
+                         C->isOpaque());
     case ISD::BITREVERSE:
       return getConstant(Val.reverseBits(), DL, VT, C->isTargetOpcode(),
                          C->isOpaque());
@@ -3315,17 +3496,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     }
     case ISD::FP_TO_SINT:
     case ISD::FP_TO_UINT: {
-      integerPart x[2];
       bool ignored;
-      static_assert(integerPartWidth >= 64, "APFloat parts too small!");
+      APSInt IntVal(VT.getSizeInBits(), Opcode == ISD::FP_TO_UINT);
       // FIXME need to be more flexible about rounding mode.
-      APFloat::opStatus s = V.convertToInteger(x, VT.getSizeInBits(),
-                            Opcode==ISD::FP_TO_SINT,
-                            APFloat::rmTowardZero, &ignored);
-      if (s==APFloat::opInvalidOp)     // inexact is OK, in fact usual
+      APFloat::opStatus s =
+          V.convertToInteger(IntVal, APFloat::rmTowardZero, &ignored);
+      if (s == APFloat::opInvalidOp) // inexact is OK, in fact usual
         break;
-      APInt api(VT.getSizeInBits(), x);
-      return getConstant(api, DL, VT);
+      return getConstant(IntVal, DL, VT);
     }
     case ISD::BITCAST:
       if (VT == MVT::i16 && C->getValueType(0) == MVT::f16)
@@ -3365,6 +3543,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
       case ISD::TRUNCATE:
       case ISD::UINT_TO_FP:
       case ISD::SINT_TO_FP:
+      case ISD::ABS:
       case ISD::BITREVERSE:
       case ISD::BSWAP:
       case ISD::CTLZ:
@@ -3483,6 +3662,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     if (OpOpcode == ISD::UNDEF)
       return getUNDEF(VT);
     break;
+  case ISD::ABS:
+    assert(VT.isInteger() && VT == Operand.getValueType() &&
+           "Invalid ABS!");
+    if (OpOpcode == ISD::UNDEF)
+      return getUNDEF(VT);
+    break;
   case ISD::BSWAP:
     assert(VT.isInteger() && VT == Operand.getValueType() &&
            "Invalid BSWAP!");
@@ -3632,6 +3817,30 @@ SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
                           GA->getOffset() + uint64_t(Offset));
 }
 
+bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) {
+  switch (Opcode) {
+  case ISD::SDIV:
+  case ISD::UDIV:
+  case ISD::SREM:
+  case ISD::UREM: {
+    // If a divisor is zero/undef or any element of a divisor vector is
+    // zero/undef, the whole op is undef.
+    assert(Ops.size() == 2 && "Div/rem should have 2 operands");
+    SDValue Divisor = Ops[1];
+    if (Divisor.isUndef() || isNullConstant(Divisor))
+      return true;
+
+    return ISD::isBuildVectorOfConstantSDNodes(Divisor.getNode()) &&
+           any_of(Divisor->op_values(),
+                  [](SDValue V) { return V.isUndef() || isNullConstant(V); });
+    // TODO: Handle signed overflow.
+  }
+  // TODO: Handle oversized shifts.
+  default:
+    return false;
+  }
+}
+
 SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
                                              EVT VT, SDNode *Cst1,
                                              SDNode *Cst2) {
@@ -3641,6 +3850,9 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
   if (Opcode >= ISD::BUILTIN_OP_END)
     return SDValue();
 
+  if (isUndef(Opcode, {SDValue(Cst1, 0), SDValue(Cst2, 0)}))
+    return getUNDEF(VT);
+
   // Handle the case of two scalars.
   if (const ConstantSDNode *Scalar1 = dyn_cast<ConstantSDNode>(Cst1)) {
     if (const ConstantSDNode *Scalar2 = dyn_cast<ConstantSDNode>(Cst2)) {
@@ -3708,6 +3920,9 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
   if (Opcode >= ISD::BUILTIN_OP_END)
     return SDValue();
 
+  if (isUndef(Opcode, Ops))
+    return getUNDEF(VT);
+
   // We can only fold vectors - maybe merge with FoldConstantArithmetic someday?
   if (!VT.isVector())
     return SDValue();
@@ -3739,7 +3954,7 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
   // Find legal integer scalar type for constant promotion and
   // ensure that its scalar size is at least as large as source.
   EVT LegalSVT = VT.getScalarType();
-  if (LegalSVT.isInteger()) {
+  if (NewNodesMustHaveLegalTypes && LegalSVT.isInteger()) {
     LegalSVT = TLI->getTypeToTransformTo(*getContext(), LegalSVT);
     if (LegalSVT.bitsLT(VT.getScalarType()))
       return SDValue();
@@ -3973,35 +4188,31 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
     assert(EVT.bitsLE(VT) && "Not extending!");
     if (EVT == VT) return N1;  // Not actually extending
 
-    auto SignExtendInReg = [&](APInt Val) {
+    auto SignExtendInReg = [&](APInt Val, llvm::EVT ConstantVT) {
       unsigned FromBits = EVT.getScalarSizeInBits();
       Val <<= Val.getBitWidth() - FromBits;
       Val = Val.ashr(Val.getBitWidth() - FromBits);
-      return getConstant(Val, DL, VT.getScalarType());
+      return getConstant(Val, DL, ConstantVT);
     };
 
     if (N1C) {
       const APInt &Val = N1C->getAPIntValue();
-      return SignExtendInReg(Val);
+      return SignExtendInReg(Val, VT);
     }
     if (ISD::isBuildVectorOfConstantSDNodes(N1.getNode())) {
       SmallVector<SDValue, 8> Ops;
+      llvm::EVT OpVT = N1.getOperand(0).getValueType();
       for (int i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
         SDValue Op = N1.getOperand(i);
         if (Op.isUndef()) {
-          Ops.push_back(getUNDEF(VT.getScalarType()));
-          continue;
-        }
-        if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
-          APInt Val = C->getAPIntValue();
-          Val = Val.zextOrTrunc(VT.getScalarSizeInBits());
-          Ops.push_back(SignExtendInReg(Val));
+          Ops.push_back(getUNDEF(OpVT));
           continue;
         }
-        break;
+        ConstantSDNode *C = cast<ConstantSDNode>(Op);
+        APInt Val = C->getAPIntValue();
+        Ops.push_back(SignExtendInReg(Val, OpVT));
       }
-      if (Ops.size() == VT.getVectorNumElements())
-        return getBuildVector(VT, DL, Ops);
+      return getBuildVector(VT, DL, Ops);
     }
     break;
   }
@@ -5019,11 +5230,11 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
   TargetLowering::CallLoweringInfo CLI(*this);
   CLI.setDebugLoc(dl)
       .setChain(Chain)
-      .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
-                 Dst.getValueType().getTypeForEVT(*getContext()),
-                 getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
-                                   TLI->getPointerTy(getDataLayout())),
-                 std::move(Args))
+      .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMCPY),
+                    Dst.getValueType().getTypeForEVT(*getContext()),
+                    getExternalSymbol(TLI->getLibcallName(RTLIB::MEMCPY),
+                                      TLI->getPointerTy(getDataLayout())),
+                    std::move(Args))
       .setDiscardResult()
       .setTailCall(isTailCall);
 
@@ -5080,11 +5291,11 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
   TargetLowering::CallLoweringInfo CLI(*this);
   CLI.setDebugLoc(dl)
       .setChain(Chain)
-      .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
-                 Dst.getValueType().getTypeForEVT(*getContext()),
-                 getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
-                                   TLI->getPointerTy(getDataLayout())),
-                 std::move(Args))
+      .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMMOVE),
+                    Dst.getValueType().getTypeForEVT(*getContext()),
+                    getExternalSymbol(TLI->getLibcallName(RTLIB::MEMMOVE),
+                                      TLI->getPointerTy(getDataLayout())),
+                    std::move(Args))
       .setDiscardResult()
       .setTailCall(isTailCall);
 
@@ -5142,11 +5353,11 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
   TargetLowering::CallLoweringInfo CLI(*this);
   CLI.setDebugLoc(dl)
       .setChain(Chain)
-      .setCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET),
-                 Dst.getValueType().getTypeForEVT(*getContext()),
-                 getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
-                                   TLI->getPointerTy(getDataLayout())),
-                 std::move(Args))
+      .setLibCallee(TLI->getLibcallCallingConv(RTLIB::MEMSET),
+                    Dst.getValueType().getTypeForEVT(*getContext()),
+                    getExternalSymbol(TLI->getLibcallName(RTLIB::MEMSET),
+                                      TLI->getPointerTy(getDataLayout())),
+                    std::move(Args))
       .setDiscardResult()
       .setTailCall(isTailCall);
 
@@ -7125,6 +7336,21 @@ bool SDNode::isOnlyUserOf(const SDNode *N) const {
   return Seen;
 }
 
+/// Return true if the only users of N are contained in Nodes.
+bool SDNode::areOnlyUsersOf(ArrayRef<const SDNode *> Nodes, const SDNode *N) {
+  bool Seen = false;
+  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+    SDNode *User = *I;
+    if (llvm::any_of(Nodes,
+                     [&User](const SDNode *Node) { return User == Node; }))
+      Seen = true;
+    else
+      return false;
+  }
+
+  return Seen;
+}
+
 /// isOperand - Return true if this node is an operand of N.
 ///
 bool SDValue::isOperandOf(const SDNode *N) const {
@@ -7146,21 +7372,39 @@ bool SDNode::isOperandOf(const SDNode *N) const {
 /// side-effecting instructions on any chain path.  In practice, this looks
 /// through token factors and non-volatile loads.  In order to remain efficient,
 /// this only looks a couple of nodes in, it does not do an exhaustive search.
+///
+/// Note that we only need to examine chains when we're searching for
+/// side-effects; SelectionDAG requires that all side-effects are represented
+/// by chains, even if another operand would force a specific ordering. This
+/// constraint is necessary to allow transformations like splitting loads.
 bool SDValue::reachesChainWithoutSideEffects(SDValue Dest,
-                                               unsigned Depth) const {
+                                             unsigned Depth) const {
   if (*this == Dest) return true;
 
   // Don't search too deeply, we just want to be able to see through
   // TokenFactor's etc.
   if (Depth == 0) return false;
 
-  // If this is a token factor, all inputs to the TF happen in parallel.  If any
-  // of the operands of the TF does not reach dest, then we cannot do the xform.
+  // If this is a token factor, all inputs to the TF happen in parallel.
   if (getOpcode() == ISD::TokenFactor) {
-    for (unsigned i = 0, e = getNumOperands(); i != e; ++i)
-      if (!getOperand(i).reachesChainWithoutSideEffects(Dest, Depth-1))
-        return false;
-    return true;
+    // First, try a shallow search.
+    if (is_contained((*this)->ops(), Dest)) {
+      // We found the chain we want as an operand of this TokenFactor.
+      // Essentially, we reach the chain without side-effects if we could
+      // serialize the TokenFactor into a simple chain of operations with
+      // Dest as the last operation. This is automatically true if the
+      // chain has one use: there are no other ordering constraints.
+      // If the chain has more than one use, we give up: some other
+      // use of Dest might force a side-effect between Dest and the current
+      // node.
+      if (Dest.hasOneUse())
+        return true;
+    }
+    // Next, try a deep search: check whether every operand of the TokenFactor
+    // reaches Dest.
+    return all_of((*this)->ops(), [=](SDValue Op) {
+      return Op.reachesChainWithoutSideEffects(Dest, Depth - 1);
+    });
   }
 
   // Loads don't have side effects, look through them.
@@ -7448,13 +7692,13 @@ bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue,
     unsigned BitPos = j * EltBitSize;
 
     if (OpVal.isUndef())
-      SplatUndef |= APInt::getBitsSet(sz, BitPos, BitPos + EltBitSize);
+      SplatUndef.setBits(BitPos, BitPos + EltBitSize);
     else if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(OpVal))
-      SplatValue |= CN->getAPIntValue().zextOrTrunc(EltBitSize).
-                    zextOrTrunc(sz) << BitPos;
+      SplatValue.insertBits(CN->getAPIntValue().zextOrTrunc(EltBitSize),
+                            BitPos);
     else if (ConstantFPSDNode *CN = dyn_cast<ConstantFPSDNode>(OpVal))
-      SplatValue |= CN->getValueAPF().bitcastToAPInt().zextOrTrunc(sz) <<BitPos;
-     else
+      SplatValue.insertBits(CN->getValueAPF().bitcastToAPInt(), BitPos);
+    else
       return false;
   }
 
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index b654086dd1ac3b9097af60b9e9123790f969e76e..315d841cf3cb8057daab5aeede1172abf3a10213 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -84,10 +84,6 @@ LimitFPPrecision("limit-float-precision",
                  cl::location(LimitFloatPrecision),
                  cl::init(0));
 
-static cl::opt<bool>
-EnableFMFInDAG("enable-fmf-dag", cl::init(true), cl::Hidden,
-                cl::desc("Enable fast-math-flags for DAG nodes"));
-
 /// Minimum jump table density for normal functions.
 static cl::opt<unsigned>
 JumpTableDensity("jump-table-density", cl::init(10), cl::Hidden,
@@ -634,10 +630,6 @@ RegsForValue::RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
   }
 }
 
-/// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from
-/// this value and returns the result as a ValueVT value.  This uses
-/// Chain/Flag as the input and updates them for the output Chain/Flag.
-/// If the Flag pointer is NULL, no flag is used.
 SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
                                       FunctionLoweringInfo &FuncInfo,
                                       const SDLoc &dl, SDValue &Chain,
@@ -739,10 +731,6 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG,
   return DAG.getNode(ISD::MERGE_VALUES, dl, DAG.getVTList(ValueVTs), Values);
 }
 
-/// getCopyToRegs - Emit a series of CopyToReg nodes that copies the
-/// specified value into the registers specified by this object.  This uses
-/// Chain/Flag as the input and updates them for the output Chain/Flag.
-/// If the Flag pointer is NULL, no flag is used.
 void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
                                  const SDLoc &dl, SDValue &Chain, SDValue *Flag,
                                  const Value *V,
@@ -796,9 +784,6 @@ void RegsForValue::getCopyToRegs(SDValue Val, SelectionDAG &DAG,
     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
 }
 
-/// AddInlineAsmOperands - Add this value to the specified inlineasm node
-/// operand list.  This adds the code marker and includes the number of
-/// values added into it.
 void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching,
                                         unsigned MatchingIdx, const SDLoc &dl,
                                         SelectionDAG &DAG,
@@ -850,12 +835,6 @@ void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa,
   LPadToCallSiteMap.clear();
 }
 
-/// clear - Clear out the current SelectionDAG and the associated
-/// state and prepare this SelectionDAGBuilder object to be used
-/// for a new block. This doesn't clear out information about
-/// additional blocks that are needed to complete switch lowering
-/// or PHI node updating; that information is cleared out as it is
-/// consumed.
 void SelectionDAGBuilder::clear() {
   NodeMap.clear();
   UnusedArgNodeMap.clear();
@@ -867,21 +846,10 @@ void SelectionDAGBuilder::clear() {
   StatepointLowering.clear();
 }
 
-/// clearDanglingDebugInfo - Clear the dangling debug information
-/// map. This function is separated from the clear so that debug
-/// information that is dangling in a basic block can be properly
-/// resolved in a different basic block. This allows the
-/// SelectionDAG to resolve dangling debug information attached
-/// to PHI nodes.
 void SelectionDAGBuilder::clearDanglingDebugInfo() {
   DanglingDebugInfoMap.clear();
 }
 
-/// getRoot - Return the current virtual root of the Selection DAG,
-/// flushing any PendingLoad items. This must be done before emitting
-/// a store or any other node that may need to be ordered after any
-/// prior load instructions.
-///
 SDValue SelectionDAGBuilder::getRoot() {
   if (PendingLoads.empty())
     return DAG.getRoot();
@@ -901,10 +869,6 @@ SDValue SelectionDAGBuilder::getRoot() {
   return Root;
 }
 
-/// getControlRoot - Similar to getRoot, but instead of flushing all the
-/// PendingLoad items, flush all the PendingExports items. It is necessary
-/// to do this before emitting a terminator instruction.
-///
 SDValue SelectionDAGBuilder::getControlRoot() {
   SDValue Root = DAG.getRoot();
 
@@ -1405,16 +1369,16 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
       const Function *F = I.getParent()->getParent();
 
       ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
-      if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+      if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
                                           Attribute::SExt))
         ExtendKind = ISD::SIGN_EXTEND;
-      else if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+      else if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
                                                Attribute::ZExt))
         ExtendKind = ISD::ZERO_EXTEND;
 
       LLVMContext &Context = F->getContext();
-      bool RetInReg = F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
-                                                      Attribute::InReg);
+      bool RetInReg = F->getAttributes().hasAttribute(
+          AttributeList::ReturnIndex, Attribute::InReg);
 
       for (unsigned j = 0; j != NumValues; ++j) {
         EVT VT = ValueVTs[j];
@@ -1638,10 +1602,12 @@ void SelectionDAGBuilder::FindMergedConditions(const Value *Cond,
   // Skip over not part of the tree and remember to invert op and operands at
   // next level.
   if (BinaryOperator::isNot(Cond) && Cond->hasOneUse()) {
-    Cond = cast<Instruction>(Cond)->getOperand(0);
-    FindMergedConditions(Cond, TBB, FBB, CurBB, SwitchBB, Opc, TProb, FProb,
-                         !InvertCond);
-    return;
+    const Value *CondOp = BinaryOperator::getNotArgument(Cond);
+    if (InBlock(CondOp, CurBB->getBasicBlock())) {
+      FindMergedConditions(CondOp, TBB, FBB, CurBB, SwitchBB, Opc, TProb, FProb,
+                           !InvertCond);
+      return;
+    }
   }
 
   const Instruction *BOp = dyn_cast<Instruction>(Cond);
@@ -2062,7 +2028,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
     Entry.Node = StackSlot;
     Entry.Ty = FnTy->getParamType(0);
     if (Fn->hasAttribute(1, Attribute::AttrKind::InReg))
-      Entry.isInReg = true;
+      Entry.IsInReg = true;
     Args.push_back(Entry);
 
     TargetLowering::CallLoweringInfo CLI(DAG);
@@ -2616,13 +2582,13 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned OpCode) {
   Flags.setNoSignedWrap(nsw);
   Flags.setNoUnsignedWrap(nuw);
   Flags.setVectorReduction(vec_redux);
-  if (EnableFMFInDAG) {
-    Flags.setAllowReciprocal(FMF.allowReciprocal());
-    Flags.setNoInfs(FMF.noInfs());
-    Flags.setNoNaNs(FMF.noNaNs());
-    Flags.setNoSignedZeros(FMF.noSignedZeros());
-    Flags.setUnsafeAlgebra(FMF.unsafeAlgebra());
-  }
+  Flags.setAllowReciprocal(FMF.allowReciprocal());
+  Flags.setAllowContract(FMF.allowContract());
+  Flags.setNoInfs(FMF.noInfs());
+  Flags.setNoNaNs(FMF.noNaNs());
+  Flags.setNoSignedZeros(FMF.noSignedZeros());
+  Flags.setUnsafeAlgebra(FMF.unsafeAlgebra());
+
   SDValue BinNodeValue = DAG.getNode(OpCode, getCurSDLoc(), Op1.getValueType(),
                                      Op1, Op2, &Flags);
   setValue(&I, BinNodeValue);
@@ -2949,7 +2915,7 @@ void SelectionDAGBuilder::visitBitCast(const User &I) {
                              DestVT, N)); // convert types.
   // Check if the original LLVM IR Operand was a ConstantInt, because getValue()
   // might fold any kind of constant expression to an integer constant and that
-  // is not what we are looking for. Only regcognize a bitcast of a genuine
+  // is not what we are looking for. Only recognize a bitcast of a genuine
   // constant integer as an opaque constant.
   else if(ConstantInt *C = dyn_cast<ConstantInt>(I.getOperand(0)))
     setValue(&I, DAG.getConstant(C->getValue(), dl, DestVT, /*isTarget=*/false,
@@ -3102,14 +3068,10 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
 
   if (SrcNumElts > MaskNumElts) {
     // Analyze the access pattern of the vector to see if we can extract
-    // two subvectors and do the shuffle. The analysis is done by calculating
-    // the range of elements the mask access on both vectors.
-    int MinRange[2] = { static_cast<int>(SrcNumElts),
-                        static_cast<int>(SrcNumElts)};
-    int MaxRange[2] = {-1, -1};
-
-    for (unsigned i = 0; i != MaskNumElts; ++i) {
-      int Idx = Mask[i];
+    // two subvectors and do the shuffle.
+    int StartIdx[2] = { -1, -1 };  // StartIdx to extract from
+    bool CanExtract = true;
+    for (int Idx : Mask) {
       unsigned Input = 0;
       if (Idx < 0)
         continue;
@@ -3118,41 +3080,28 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
         Input = 1;
         Idx -= SrcNumElts;
       }
-      if (Idx > MaxRange[Input])
-        MaxRange[Input] = Idx;
-      if (Idx < MinRange[Input])
-        MinRange[Input] = Idx;
-    }
-
-    // Check if the access is smaller than the vector size and can we find
-    // a reasonable extract index.
-    int RangeUse[2] = { -1, -1 };  // 0 = Unused, 1 = Extract, -1 = Can not
-                                   // Extract.
-    int StartIdx[2];  // StartIdx to extract from
-    for (unsigned Input = 0; Input < 2; ++Input) {
-      if (MinRange[Input] >= (int)SrcNumElts && MaxRange[Input] < 0) {
-        RangeUse[Input] = 0; // Unused
-        StartIdx[Input] = 0;
-        continue;
-      }
 
-      // Find a good start index that is a multiple of the mask length. Then
-      // see if the rest of the elements are in range.
-      StartIdx[Input] = (MinRange[Input]/MaskNumElts)*MaskNumElts;
-      if (MaxRange[Input] - StartIdx[Input] < (int)MaskNumElts &&
-          StartIdx[Input] + MaskNumElts <= SrcNumElts)
-        RangeUse[Input] = 1; // Extract from a multiple of the mask length.
+      // If all the indices come from the same MaskNumElts sized portion of
+      // the sources we can use extract. Also make sure the extract wouldn't
+      // extract past the end of the source.
+      int NewStartIdx = alignDown(Idx, MaskNumElts);
+      if (NewStartIdx + MaskNumElts > SrcNumElts ||
+          (StartIdx[Input] >= 0 && StartIdx[Input] != NewStartIdx))
+        CanExtract = false;
+      // Make sure we always update StartIdx as we use it to track if all
+      // elements are undef.
+      StartIdx[Input] = NewStartIdx;
     }
 
-    if (RangeUse[0] == 0 && RangeUse[1] == 0) {
+    if (StartIdx[0] < 0 && StartIdx[1] < 0) {
       setValue(&I, DAG.getUNDEF(VT)); // Vectors are not used.
       return;
     }
-    if (RangeUse[0] >= 0 && RangeUse[1] >= 0) {
+    if (CanExtract) {
       // Extract appropriate subvector and generate a vector shuffle
       for (unsigned Input = 0; Input < 2; ++Input) {
         SDValue &Src = Input == 0 ? Src1 : Src2;
-        if (RangeUse[Input] == 0)
+        if (StartIdx[Input] < 0)
           Src = DAG.getUNDEF(VT);
         else {
           Src = DAG.getNode(
@@ -3163,16 +3112,12 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
       }
 
       // Calculate new mask.
-      SmallVector<int, 8> MappedOps;
-      for (unsigned i = 0; i != MaskNumElts; ++i) {
-        int Idx = Mask[i];
-        if (Idx >= 0) {
-          if (Idx < (int)SrcNumElts)
-            Idx -= StartIdx[0];
-          else
-            Idx -= SrcNumElts + StartIdx[1] - MaskNumElts;
-        }
-        MappedOps.push_back(Idx);
+      SmallVector<int, 8> MappedOps(Mask.begin(), Mask.end());
+      for (int &Idx : MappedOps) {
+        if (Idx >= (int)SrcNumElts)
+          Idx -= SrcNumElts + StartIdx[1] - MaskNumElts;
+        else if (Idx >= 0)
+          Idx -= StartIdx[0];
       }
 
       setValue(&I, DAG.getVectorShuffle(VT, DL, Src1, Src2, MappedOps));
@@ -3186,8 +3131,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
   EVT EltVT = VT.getVectorElementType();
   EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
   SmallVector<SDValue,8> Ops;
-  for (unsigned i = 0; i != MaskNumElts; ++i) {
-    int Idx = Mask[i];
+  for (int Idx : Mask) {
     SDValue Res;
 
     if (Idx < 0) {
@@ -3316,7 +3260,7 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
         // N = N + Offset
         uint64_t Offset = DL->getStructLayout(StTy)->getElementOffset(Field);
 
-        // In an inbouds GEP with an offset that is nonnegative even when
+        // In an inbounds GEP with an offset that is nonnegative even when
         // interpreted as signed, assume there is no unsigned overflow.
         SDNodeFlags Flags;
         if (int64_t(Offset) >= 0 && cast<GEPOperator>(I).isInBounds())
@@ -4829,9 +4773,9 @@ SDDbgValue *SelectionDAGBuilder::getDbgValue(SDValue N,
 #  define setjmp_undefined_for_msvc
 #endif
 
-/// visitIntrinsicCall - Lower the call to the specified intrinsic function.  If
-/// we want to emit this as a call to a named external function, return the name
-/// otherwise lower it and return null.
+/// Lower the call to the specified intrinsic function. If we want to emit this
+/// as a call to a named external function, return the name. Otherwise, lower it
+/// and return null.
 const char *
 SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -4964,14 +4908,12 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
       report_fatal_error("Unsupported element size");
 
     TargetLowering::CallLoweringInfo CLI(DAG);
-    CLI.setDebugLoc(sdl)
-        .setChain(getRoot())
-        .setCallee(TLI.getLibcallCallingConv(LibraryCall),
-                   Type::getVoidTy(*DAG.getContext()),
-                   DAG.getExternalSymbol(
-                       TLI.getLibcallName(LibraryCall),
-                       TLI.getPointerTy(DAG.getDataLayout())),
-                   std::move(Args));
+    CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(
+        TLI.getLibcallCallingConv(LibraryCall),
+        Type::getVoidTy(*DAG.getContext()),
+        DAG.getExternalSymbol(TLI.getLibcallName(LibraryCall),
+                              TLI.getPointerTy(DAG.getDataLayout())),
+        std::move(Args));
 
     std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
     DAG.setRoot(CallResult.second);
@@ -5579,7 +5521,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
   case Intrinsic::trap: {
     StringRef TrapFuncName =
         I.getAttributes()
-            .getAttribute(AttributeSet::FunctionIndex, "trap-func-name")
+            .getAttribute(AttributeList::FunctionIndex, "trap-func-name")
             .getValueAsString();
     if (TrapFuncName.empty()) {
       ISD::NodeType Op = (Intrinsic == Intrinsic::trap) ?
@@ -5590,7 +5532,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) {
     TargetLowering::ArgListTy Args;
 
     TargetLowering::CallLoweringInfo CLI(DAG);
-    CLI.setDebugLoc(sdl).setChain(getRoot()).setCallee(
+    CLI.setDebugLoc(sdl).setChain(getRoot()).setLibCallee(
         CallingConv::C, I.getType(),
         DAG.getExternalSymbol(TrapFuncName.data(),
                               TLI.getPointerTy(DAG.getDataLayout())),
@@ -5909,13 +5851,22 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
   Type *RetTy = CS.getType();
 
   TargetLowering::ArgListTy Args;
-  TargetLowering::ArgListEntry Entry;
   Args.reserve(CS.arg_size());
 
   const Value *SwiftErrorVal = nullptr;
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+
+  // We can't tail call inside a function with a swifterror argument. Lowering
+  // does not support this yet. It would have to move into the swifterror
+  // register before the call.
+  auto *Caller = CS.getInstruction()->getParent()->getParent();
+  if (TLI.supportSwiftError() &&
+      Caller->getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+    isTailCall = false;
+
   for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
        i != e; ++i) {
+    TargetLowering::ArgListEntry Entry;
     const Value *V = *i;
 
     // Skip empty types
@@ -5929,7 +5880,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
     Entry.setAttributes(&CS, i - CS.arg_begin() + 1);
 
     // Use swifterror virtual register as input to the call.
-    if (Entry.isSwiftError && TLI.supportSwiftError()) {
+    if (Entry.IsSwiftError && TLI.supportSwiftError()) {
       SwiftErrorVal = V;
       // We find the virtual register for the actual swifterror argument.
       // Instead of using the Value, we use the virtual register instead.
@@ -5942,7 +5893,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
 
     // If we have an explicit sret argument that is an Instruction, (i.e., it
     // might point to function-local memory), we can't meaningfully tail-call.
-    if (Entry.isSRet && isa<Instruction>(V))
+    if (Entry.IsSRet && isa<Instruction>(V))
       isTailCall = false;
   }
 
@@ -5985,8 +5936,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
   }
 }
 
-/// IsOnlyUsedInZeroEqualityComparison - Return true if it only matters that the
-/// value is equal or not-equal to zero.
+/// Return true if it only matters that the value is equal or not-equal to zero.
 static bool IsOnlyUsedInZeroEqualityComparison(const Value *V) {
   for (const User *U : V->users()) {
     if (const ICmpInst *IC = dyn_cast<ICmpInst>(U))
@@ -6001,13 +5951,17 @@ static bool IsOnlyUsedInZeroEqualityComparison(const Value *V) {
 }
 
 static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
-                             Type *LoadTy,
                              SelectionDAGBuilder &Builder) {
 
   // Check to see if this load can be trivially constant folded, e.g. if the
   // input is from a string literal.
   if (const Constant *LoadInput = dyn_cast<Constant>(PtrVal)) {
     // Cast pointer to the type we really want to load.
+    Type *LoadTy =
+        Type::getIntNTy(PtrVal->getContext(), LoadVT.getScalarSizeInBits());
+    if (LoadVT.isVector())
+      LoadTy = VectorType::get(LoadTy, LoadVT.getVectorNumElements());
+
     LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput),
                                          PointerType::getUnqual(LoadTy));
 
@@ -6040,8 +5994,8 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
   return LoadVal;
 }
 
-/// processIntegerCallValue - Record the value for an instruction that
-/// produces an integer result, converting the type where necessary.
+/// Record the value for an instruction that produces an integer result,
+/// converting the type where necessary.
 void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I,
                                                   SDValue Value,
                                                   bool IsSigned) {
@@ -6054,20 +6008,13 @@ void SelectionDAGBuilder::processIntegerCallValue(const Instruction &I,
   setValue(&I, Value);
 }
 
-/// visitMemCmpCall - See if we can lower a call to memcmp in an optimized form.
-/// If so, return true and lower it, otherwise return false and it will be
-/// lowered like a normal call.
+/// See if we can lower a memcmp call into an optimized form. If so, return
+/// true and lower it. Otherwise return false, and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
 bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
-  // Verify that the prototype makes sense.  int memcmp(void*,void*,size_t)
-  if (I.getNumArgOperands() != 3)
-    return false;
-
   const Value *LHS = I.getArgOperand(0), *RHS = I.getArgOperand(1);
-  if (!LHS->getType()->isPointerTy() || !RHS->getType()->isPointerTy() ||
-      !I.getArgOperand(2)->getType()->isIntegerTy() ||
-      !I.getType()->isIntegerTy())
-    return false;
-
   const Value *Size = I.getArgOperand(2);
   const ConstantInt *CSize = dyn_cast<ConstantInt>(Size);
   if (CSize && CSize->getZExtValue() == 0) {
@@ -6078,11 +6025,9 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
   }
 
   const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
-  std::pair<SDValue, SDValue> Res =
-    TSI.EmitTargetCodeForMemcmp(DAG, getCurSDLoc(), DAG.getRoot(),
-                                getValue(LHS), getValue(RHS), getValue(Size),
-                                MachinePointerInfo(LHS),
-                                MachinePointerInfo(RHS));
+  std::pair<SDValue, SDValue> Res = TSI.EmitTargetCodeForMemcmp(
+      DAG, getCurSDLoc(), DAG.getRoot(), getValue(LHS), getValue(RHS),
+      getValue(Size), MachinePointerInfo(LHS), MachinePointerInfo(RHS));
   if (Res.first.getNode()) {
     processIntegerCallValue(I, Res.first, true);
     PendingLoads.push_back(Res.second);
@@ -6091,88 +6036,79 @@ bool SelectionDAGBuilder::visitMemCmpCall(const CallInst &I) {
 
   // memcmp(S1,S2,2) != 0 -> (*(short*)LHS != *(short*)RHS)  != 0
   // memcmp(S1,S2,4) != 0 -> (*(int*)LHS != *(int*)RHS)  != 0
-  if (CSize && IsOnlyUsedInZeroEqualityComparison(&I)) {
-    bool ActuallyDoIt = true;
-    MVT LoadVT;
-    Type *LoadTy;
-    switch (CSize->getZExtValue()) {
-    default:
-      LoadVT = MVT::Other;
-      LoadTy = nullptr;
-      ActuallyDoIt = false;
-      break;
-    case 2:
-      LoadVT = MVT::i16;
-      LoadTy = Type::getInt16Ty(CSize->getContext());
-      break;
-    case 4:
-      LoadVT = MVT::i32;
-      LoadTy = Type::getInt32Ty(CSize->getContext());
-      break;
-    case 8:
-      LoadVT = MVT::i64;
-      LoadTy = Type::getInt64Ty(CSize->getContext());
-      break;
-        /*
-    case 16:
-      LoadVT = MVT::v4i32;
-      LoadTy = Type::getInt32Ty(CSize->getContext());
-      LoadTy = VectorType::get(LoadTy, 4);
-      break;
-         */
-    }
-
-    // This turns into unaligned loads.  We only do this if the target natively
-    // supports the MVT we'll be loading or if it is small enough (<= 4) that
-    // we'll only produce a small number of byte loads.
+  if (!CSize || !IsOnlyUsedInZeroEqualityComparison(&I))
+    return false;
 
-    // Require that we can find a legal MVT, and only do this if the target
-    // supports unaligned loads of that type.  Expanding into byte loads would
-    // bloat the code.
+  // If the target has a fast compare for the given size, it will return a
+  // preferred load type for that size. Require that the load VT is legal and
+  // that the target supports unaligned loads of that type. Otherwise, return
+  // MVT::INVALID_SIMPLE_VALUE_TYPE.
+  auto hasFastLoadsAndCompare = [&](unsigned NumBits) {
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    if (ActuallyDoIt && CSize->getZExtValue() > 4) {
-      unsigned DstAS = LHS->getType()->getPointerAddressSpace();
-      unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
+    MVT LVT = TLI.hasFastEqualityCompare(NumBits);
+    if (LVT != MVT::INVALID_SIMPLE_VALUE_TYPE) {
       // TODO: Handle 5 byte compare as 4-byte + 1 byte.
       // TODO: Handle 8 byte compare on x86-32 as two 32-bit loads.
       // TODO: Check alignment of src and dest ptrs.
-      if (!TLI.isTypeLegal(LoadVT) ||
-          !TLI.allowsMisalignedMemoryAccesses(LoadVT, SrcAS) ||
-          !TLI.allowsMisalignedMemoryAccesses(LoadVT, DstAS))
-        ActuallyDoIt = false;
+      unsigned DstAS = LHS->getType()->getPointerAddressSpace();
+      unsigned SrcAS = RHS->getType()->getPointerAddressSpace();
+      if (!TLI.isTypeLegal(LVT) ||
+          !TLI.allowsMisalignedMemoryAccesses(LVT, SrcAS) ||
+          !TLI.allowsMisalignedMemoryAccesses(LVT, DstAS))
+        LVT = MVT::INVALID_SIMPLE_VALUE_TYPE;
     }
 
-    if (ActuallyDoIt) {
-      SDValue LHSVal = getMemCmpLoad(LHS, LoadVT, LoadTy, *this);
-      SDValue RHSVal = getMemCmpLoad(RHS, LoadVT, LoadTy, *this);
+    return LVT;
+  };
 
-      SDValue Res = DAG.getSetCC(getCurSDLoc(), MVT::i1, LHSVal, RHSVal,
-                                 ISD::SETNE);
-      processIntegerCallValue(I, Res, false);
-      return true;
-    }
+  // This turns into unaligned loads. We only do this if the target natively
+  // supports the MVT we'll be loading or if it is small enough (<= 4 bytes)
+  // that we'll only produce a small number of byte loads.
+  MVT LoadVT;
+  unsigned NumBitsToCompare = CSize->getZExtValue() * 8;
+  switch (NumBitsToCompare) {
+  default:
+    return false;
+  case 16:
+    LoadVT = MVT::i16;
+    break;
+  case 32:
+    LoadVT = MVT::i32;
+    break;
+  case 64:
+  case 128:
+  case 256:
+    LoadVT = hasFastLoadsAndCompare(NumBitsToCompare);
+    break;
   }
 
+  if (LoadVT == MVT::INVALID_SIMPLE_VALUE_TYPE)
+    return false;
 
-  return false;
+  SDValue LoadL = getMemCmpLoad(LHS, LoadVT, *this);
+  SDValue LoadR = getMemCmpLoad(RHS, LoadVT, *this);
+
+  // Bitcast to a wide integer type if the loads are vectors.
+  if (LoadVT.isVector()) {
+    EVT CmpVT = EVT::getIntegerVT(LHS->getContext(), LoadVT.getSizeInBits());
+    LoadL = DAG.getBitcast(CmpVT, LoadL);
+    LoadR = DAG.getBitcast(CmpVT, LoadR);
+  }
+
+  SDValue Cmp = DAG.getSetCC(getCurSDLoc(), MVT::i1, LoadL, LoadR, ISD::SETNE);
+  processIntegerCallValue(I, Cmp, false);
+  return true;
 }
 
-/// visitMemChrCall -- See if we can lower a memchr call into an optimized
-/// form.  If so, return true and lower it, otherwise return false and it
-/// will be lowered like a normal call.
+/// See if we can lower a memchr call into an optimized form. If so, return
+/// true and lower it. Otherwise return false, and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
 bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) {
-  // Verify that the prototype makes sense.  void *memchr(void *, int, size_t)
-  if (I.getNumArgOperands() != 3)
-    return false;
-
   const Value *Src = I.getArgOperand(0);
   const Value *Char = I.getArgOperand(1);
   const Value *Length = I.getArgOperand(2);
-  if (!Src->getType()->isPointerTy() ||
-      !Char->getType()->isIntegerTy() ||
-      !Length->getType()->isIntegerTy() ||
-      !I.getType()->isPointerTy())
-    return false;
 
   const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
   std::pair<SDValue, SDValue> Res =
@@ -6188,15 +6124,12 @@ bool SelectionDAGBuilder::visitMemChrCall(const CallInst &I) {
   return false;
 }
 
-///
-/// visitMemPCpyCall -- lower a mempcpy call as a memcpy followed by code to
-/// to adjust the dst pointer by the size of the copied memory.
+/// See if we can lower a mempcpy call into an optimized form. If so, return
+/// true and lower it. Otherwise return false, and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
 bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) {
-
-  // Verify argument count: void *mempcpy(void *, const void *, size_t)
-  if (I.getNumArgOperands() != 3)
-    return false;
-
   SDValue Dst = getValue(I.getArgOperand(0));
   SDValue Src = getValue(I.getArgOperand(1));
   SDValue Size = getValue(I.getArgOperand(2));
@@ -6231,19 +6164,13 @@ bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) {
   return true;
 }
 
-/// visitStrCpyCall -- See if we can lower a strcpy or stpcpy call into an
-/// optimized form.  If so, return true and lower it, otherwise return false
-/// and it will be lowered like a normal call.
+/// See if we can lower a strcpy or stpcpy call into an optimized form. If so,
+/// return true and lower it. Otherwise return false, and it will be lowered
+/// like a normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
 bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) {
-  // Verify that the prototype makes sense.  char *strcpy(char *, char *)
-  if (I.getNumArgOperands() != 2)
-    return false;
-
   const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
-  if (!Arg0->getType()->isPointerTy() ||
-      !Arg1->getType()->isPointerTy() ||
-      !I.getType()->isPointerTy())
-    return false;
 
   const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
   std::pair<SDValue, SDValue> Res =
@@ -6260,19 +6187,13 @@ bool SelectionDAGBuilder::visitStrCpyCall(const CallInst &I, bool isStpcpy) {
   return false;
 }
 
-/// visitStrCmpCall - See if we can lower a call to strcmp in an optimized form.
-/// If so, return true and lower it, otherwise return false and it will be
-/// lowered like a normal call.
+/// See if we can lower a strcmp call into an optimized form.  If so, return
+/// true and lower it, otherwise return false and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
 bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) {
-  // Verify that the prototype makes sense.  int strcmp(void*,void*)
-  if (I.getNumArgOperands() != 2)
-    return false;
-
   const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
-  if (!Arg0->getType()->isPointerTy() ||
-      !Arg1->getType()->isPointerTy() ||
-      !I.getType()->isIntegerTy())
-    return false;
 
   const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
   std::pair<SDValue, SDValue> Res =
@@ -6289,17 +6210,13 @@ bool SelectionDAGBuilder::visitStrCmpCall(const CallInst &I) {
   return false;
 }
 
-/// visitStrLenCall -- See if we can lower a strlen call into an optimized
-/// form.  If so, return true and lower it, otherwise return false and it
-/// will be lowered like a normal call.
+/// See if we can lower a strlen call into an optimized form.  If so, return
+/// true and lower it, otherwise return false and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
 bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) {
-  // Verify that the prototype makes sense.  size_t strlen(char *)
-  if (I.getNumArgOperands() != 1)
-    return false;
-
   const Value *Arg0 = I.getArgOperand(0);
-  if (!Arg0->getType()->isPointerTy() || !I.getType()->isIntegerTy())
-    return false;
 
   const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
   std::pair<SDValue, SDValue> Res =
@@ -6314,19 +6231,13 @@ bool SelectionDAGBuilder::visitStrLenCall(const CallInst &I) {
   return false;
 }
 
-/// visitStrNLenCall -- See if we can lower a strnlen call into an optimized
-/// form.  If so, return true and lower it, otherwise return false and it
-/// will be lowered like a normal call.
+/// See if we can lower a strnlen call into an optimized form.  If so, return
+/// true and lower it, otherwise return false and it will be lowered like a
+/// normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
 bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) {
-  // Verify that the prototype makes sense.  size_t strnlen(char *, size_t)
-  if (I.getNumArgOperands() != 2)
-    return false;
-
   const Value *Arg0 = I.getArgOperand(0), *Arg1 = I.getArgOperand(1);
-  if (!Arg0->getType()->isPointerTy() ||
-      !Arg1->getType()->isIntegerTy() ||
-      !I.getType()->isIntegerTy())
-    return false;
 
   const SelectionDAGTargetInfo &TSI = DAG.getSelectionDAGInfo();
   std::pair<SDValue, SDValue> Res =
@@ -6342,16 +6253,15 @@ bool SelectionDAGBuilder::visitStrNLenCall(const CallInst &I) {
   return false;
 }
 
-/// visitUnaryFloatCall - If a call instruction is a unary floating-point
-/// operation (as expected), translate it to an SDNode with the specified opcode
-/// and return true.
+/// See if we can lower a unary floating-point operation into an SDNode with
+/// the specified Opcode.  If so, return true and lower it, otherwise return
+/// false and it will be lowered like a normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
 bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I,
                                               unsigned Opcode) {
-  // Sanity check that it really is a unary floating-point call.
-  if (I.getNumArgOperands() != 1 ||
-      !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
-      I.getType() != I.getArgOperand(0)->getType() ||
-      !I.onlyReadsMemory())
+  // We already checked this call's prototype; verify it doesn't modify errno.
+  if (!I.onlyReadsMemory())
     return false;
 
   SDValue Tmp = getValue(I.getArgOperand(0));
@@ -6359,17 +6269,15 @@ bool SelectionDAGBuilder::visitUnaryFloatCall(const CallInst &I,
   return true;
 }
 
-/// visitBinaryFloatCall - If a call instruction is a binary floating-point
-/// operation (as expected), translate it to an SDNode with the specified opcode
-/// and return true.
+/// See if we can lower a binary floating-point operation into an SDNode with
+/// the specified Opcode. If so, return true and lower it. Otherwise return
+/// false, and it will be lowered like a normal call.
+/// The caller already checked that \p I calls the appropriate LibFunc with a
+/// correct prototype.
 bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I,
                                                unsigned Opcode) {
-  // Sanity check that it really is a binary floating-point call.
-  if (I.getNumArgOperands() != 2 ||
-      !I.getArgOperand(0)->getType()->isFloatingPointTy() ||
-      I.getType() != I.getArgOperand(0)->getType() ||
-      I.getType() != I.getArgOperand(1)->getType() ||
-      !I.onlyReadsMemory())
+  // We already checked this call's prototype; verify it doesn't modify errno.
+  if (!I.onlyReadsMemory())
     return false;
 
   SDValue Tmp0 = getValue(I.getArgOperand(0));
@@ -6411,18 +6319,16 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
     // some reason.
     LibFunc Func;
     if (!I.isNoBuiltin() && !F->hasLocalLinkage() && F->hasName() &&
-        LibInfo->getLibFunc(F->getName(), Func) &&
+        LibInfo->getLibFunc(*F, Func) &&
         LibInfo->hasOptimizedCodeGen(Func)) {
       switch (Func) {
       default: break;
       case LibFunc_copysign:
       case LibFunc_copysignf:
       case LibFunc_copysignl:
-        if (I.getNumArgOperands() == 2 &&   // Basic sanity checks.
-            I.getArgOperand(0)->getType()->isFloatingPointTy() &&
-            I.getType() == I.getArgOperand(0)->getType() &&
-            I.getType() == I.getArgOperand(1)->getType() &&
-            I.onlyReadsMemory()) {
+        // We already checked this call's prototype; verify it doesn't modify
+        // errno.
+        if (I.onlyReadsMemory()) {
           SDValue LHS = getValue(I.getArgOperand(0));
           SDValue RHS = getValue(I.getArgOperand(1));
           setValue(&I, DAG.getNode(ISD::FCOPYSIGN, getCurSDLoc(),
@@ -7704,9 +7610,9 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
   FuncInfo.MF->getFrameInfo().setHasPatchPoint();
 }
 
-/// Returns an AttributeSet representing the attributes applied to the return
+/// Returns an AttributeList representing the attributes applied to the return
 /// value of the given call.
-static AttributeSet getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
+static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
   SmallVector<Attribute::AttrKind, 2> Attrs;
   if (CLI.RetSExt)
     Attrs.push_back(Attribute::SExt);
@@ -7715,8 +7621,8 @@ static AttributeSet getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) {
   if (CLI.IsInReg)
     Attrs.push_back(Attribute::InReg);
 
-  return AttributeSet::get(CLI.RetTy->getContext(), AttributeSet::ReturnIndex,
-                           Attrs);
+  return AttributeList::get(CLI.RetTy->getContext(), AttributeList::ReturnIndex,
+                            Attrs);
 }
 
 /// TargetLowering::LowerCallTo - This is the default LowerCallTo
@@ -7756,15 +7662,15 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
     ArgListEntry Entry;
     Entry.Node = DemoteStackSlot;
     Entry.Ty = StackSlotPtrType;
-    Entry.isSExt = false;
-    Entry.isZExt = false;
-    Entry.isInReg = false;
-    Entry.isSRet = true;
-    Entry.isNest = false;
-    Entry.isByVal = false;
-    Entry.isReturned = false;
-    Entry.isSwiftSelf = false;
-    Entry.isSwiftError = false;
+    Entry.IsSExt = false;
+    Entry.IsZExt = false;
+    Entry.IsInReg = false;
+    Entry.IsSRet = true;
+    Entry.IsNest = false;
+    Entry.IsByVal = false;
+    Entry.IsReturned = false;
+    Entry.IsSwiftSelf = false;
+    Entry.IsSwiftError = false;
     Entry.Alignment = Align;
     CLI.getArgs().insert(CLI.getArgs().begin(), Entry);
     CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext());
@@ -7797,7 +7703,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
   ArgListTy &Args = CLI.getArgs();
   if (supportSwiftError()) {
     for (unsigned i = 0, e = Args.size(); i != e; ++i) {
-      if (Args[i].isSwiftError) {
+      if (Args[i].IsSwiftError) {
         ISD::InputArg MyFlags;
         MyFlags.VT = getPointerTy(DL);
         MyFlags.ArgVT = EVT(getPointerTy(DL));
@@ -7814,7 +7720,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
     SmallVector<EVT, 4> ValueVTs;
     ComputeValueVTs(*this, DL, Args[i].Ty, ValueVTs);
     Type *FinalType = Args[i].Ty;
-    if (Args[i].isByVal)
+    if (Args[i].IsByVal)
       FinalType = cast<PointerType>(Args[i].Ty)->getElementType();
     bool NeedsRegBlock = functionArgumentNeedsConsecutiveRegisters(
         FinalType, CLI.CallConv, CLI.IsVarArg);
@@ -7827,11 +7733,11 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
       ISD::ArgFlagsTy Flags;
       unsigned OriginalAlignment = DL.getABITypeAlignment(ArgTy);
 
-      if (Args[i].isZExt)
+      if (Args[i].IsZExt)
         Flags.setZExt();
-      if (Args[i].isSExt)
+      if (Args[i].IsSExt)
         Flags.setSExt();
-      if (Args[i].isInReg) {
+      if (Args[i].IsInReg) {
         // If we are using vectorcall calling convention, a structure that is
         // passed InReg - is surely an HVA
         if (CLI.CallConv == CallingConv::X86_VectorCall &&
@@ -7844,15 +7750,15 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
         // Set InReg Flag
         Flags.setInReg();
       }
-      if (Args[i].isSRet)
+      if (Args[i].IsSRet)
         Flags.setSRet();
-      if (Args[i].isSwiftSelf)
+      if (Args[i].IsSwiftSelf)
         Flags.setSwiftSelf();
-      if (Args[i].isSwiftError)
+      if (Args[i].IsSwiftError)
         Flags.setSwiftError();
-      if (Args[i].isByVal)
+      if (Args[i].IsByVal)
         Flags.setByVal();
-      if (Args[i].isInAlloca) {
+      if (Args[i].IsInAlloca) {
         Flags.setInAlloca();
         // Set the byval flag for CCAssignFn callbacks that don't know about
         // inalloca.  This way we can know how many bytes we should've allocated
@@ -7861,7 +7767,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
         // in the various CC lowering callbacks.
         Flags.setByVal();
       }
-      if (Args[i].isByVal || Args[i].isInAlloca) {
+      if (Args[i].IsByVal || Args[i].IsInAlloca) {
         PointerType *Ty = cast<PointerType>(Args[i].Ty);
         Type *ElementTy = Ty->getElementType();
         Flags.setByValSize(DL.getTypeAllocSize(ElementTy));
@@ -7874,7 +7780,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
           FrameAlign = getByValTypeAlignment(ElementTy, DL);
         Flags.setByValAlign(FrameAlign);
       }
-      if (Args[i].isNest)
+      if (Args[i].IsNest)
         Flags.setNest();
       if (NeedsRegBlock)
         Flags.setInConsecutiveRegs();
@@ -7885,13 +7791,13 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
       SmallVector<SDValue, 4> Parts(NumParts);
       ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
 
-      if (Args[i].isSExt)
+      if (Args[i].IsSExt)
         ExtendKind = ISD::SIGN_EXTEND;
-      else if (Args[i].isZExt)
+      else if (Args[i].IsZExt)
         ExtendKind = ISD::ZERO_EXTEND;
 
       // Conservatively only handle 'returned' on non-vectors for now
-      if (Args[i].isReturned && !Op.getValueType().isVector()) {
+      if (Args[i].IsReturned && !Op.getValueType().isVector()) {
         assert(CLI.RetTy == Args[i].Ty && RetTys.size() == NumValues &&
                "unexpected use of 'returned'");
         // Before passing 'returned' to the target lowering code, ensure that
@@ -7905,9 +7811,9 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
         // parameter extension method is not compatible with the return
         // extension method
         if ((NumParts * PartVT.getSizeInBits() == VT.getSizeInBits()) ||
-            (ExtendKind != ISD::ANY_EXTEND &&
-             CLI.RetSExt == Args[i].isSExt && CLI.RetZExt == Args[i].isZExt))
-        Flags.setReturned();
+            (ExtendKind != ISD::ANY_EXTEND && CLI.RetSExt == Args[i].IsSExt &&
+             CLI.RetZExt == Args[i].IsZExt))
+          Flags.setReturned();
       }
 
       getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT,
@@ -8083,6 +7989,173 @@ static bool isOnlyUsedInEntryBlock(const Argument *A, bool FastISel) {
   return true;
 }
 
+typedef DenseMap<const Argument *,
+                 std::pair<const AllocaInst *, const StoreInst *>>
+    ArgCopyElisionMapTy;
+
+/// Scan the entry block of the function in FuncInfo for arguments that look
+/// like copies into a local alloca. Record any copied arguments in
+/// ArgCopyElisionCandidates.
+static void
+findArgumentCopyElisionCandidates(const DataLayout &DL,
+                                  FunctionLoweringInfo *FuncInfo,
+                                  ArgCopyElisionMapTy &ArgCopyElisionCandidates) {
+  // Record the state of every static alloca used in the entry block. Argument
+  // allocas are all used in the entry block, so we need approximately as many
+  // entries as we have arguments.
+  enum StaticAllocaInfo { Unknown, Clobbered, Elidable };
+  SmallDenseMap<const AllocaInst *, StaticAllocaInfo, 8> StaticAllocas;
+  unsigned NumArgs = FuncInfo->Fn->arg_size();
+  StaticAllocas.reserve(NumArgs * 2);
+
+  auto GetInfoIfStaticAlloca = [&](const Value *V) -> StaticAllocaInfo * {
+    if (!V)
+      return nullptr;
+    V = V->stripPointerCasts();
+    const auto *AI = dyn_cast<AllocaInst>(V);
+    if (!AI || !AI->isStaticAlloca() || !FuncInfo->StaticAllocaMap.count(AI))
+      return nullptr;
+    auto Iter = StaticAllocas.insert({AI, Unknown});
+    return &Iter.first->second;
+  };
+
+  // Look for stores of arguments to static allocas. Look through bitcasts and
+  // GEPs to handle type coercions, as long as the alloca is fully initialized
+  // by the store. Any non-store use of an alloca escapes it and any subsequent
+  // unanalyzed store might write it.
+  // FIXME: Handle structs initialized with multiple stores.
+  for (const Instruction &I : FuncInfo->Fn->getEntryBlock()) {
+    // Look for stores, and handle non-store uses conservatively.
+    const auto *SI = dyn_cast<StoreInst>(&I);
+    if (!SI) {
+      // We will look through cast uses, so ignore them completely.
+      if (I.isCast())
+        continue;
+      // Ignore debug info intrinsics, they don't escape or store to allocas.
+      if (isa<DbgInfoIntrinsic>(I))
+        continue;
+      // This is an unknown instruction. Assume it escapes or writes to all
+      // static alloca operands.
+      for (const Use &U : I.operands()) {
+        if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(U))
+          *Info = StaticAllocaInfo::Clobbered;
+      }
+      continue;
+    }
+
+    // If the stored value is a static alloca, mark it as escaped.
+    if (StaticAllocaInfo *Info = GetInfoIfStaticAlloca(SI->getValueOperand()))
+      *Info = StaticAllocaInfo::Clobbered;
+
+    // Check if the destination is a static alloca.
+    const Value *Dst = SI->getPointerOperand()->stripPointerCasts();
+    StaticAllocaInfo *Info = GetInfoIfStaticAlloca(Dst);
+    if (!Info)
+      continue;
+    const AllocaInst *AI = cast<AllocaInst>(Dst);
+
+    // Skip allocas that have been initialized or clobbered.
+    if (*Info != StaticAllocaInfo::Unknown)
+      continue;
+
+    // Check if the stored value is an argument, and that this store fully
+    // initializes the alloca. Don't elide copies from the same argument twice.
+    const Value *Val = SI->getValueOperand()->stripPointerCasts();
+    const auto *Arg = dyn_cast<Argument>(Val);
+    if (!Arg || Arg->hasInAllocaAttr() || Arg->hasByValAttr() ||
+        Arg->getType()->isEmptyTy() ||
+        DL.getTypeStoreSize(Arg->getType()) !=
+            DL.getTypeAllocSize(AI->getAllocatedType()) ||
+        ArgCopyElisionCandidates.count(Arg)) {
+      *Info = StaticAllocaInfo::Clobbered;
+      continue;
+    }
+
+    DEBUG(dbgs() << "Found argument copy elision candidate: " << *AI << '\n');
+
+    // Mark this alloca and store for argument copy elision.
+    *Info = StaticAllocaInfo::Elidable;
+    ArgCopyElisionCandidates.insert({Arg, {AI, SI}});
+
+    // Stop scanning if we've seen all arguments. This will happen early in -O0
+    // builds, which is useful, because -O0 builds have large entry blocks and
+    // many allocas.
+    if (ArgCopyElisionCandidates.size() == NumArgs)
+      break;
+  }
+}
+
+/// Try to elide argument copies from memory into a local alloca. Succeeds if
+/// ArgVal is a load from a suitable fixed stack object.
+static void tryToElideArgumentCopy(
+    FunctionLoweringInfo *FuncInfo, SmallVectorImpl<SDValue> &Chains,
+    DenseMap<int, int> &ArgCopyElisionFrameIndexMap,
+    SmallPtrSetImpl<const Instruction *> &ElidedArgCopyInstrs,
+    ArgCopyElisionMapTy &ArgCopyElisionCandidates, const Argument &Arg,
+    SDValue ArgVal, bool &ArgHasUses) {
+  // Check if this is a load from a fixed stack object.
+  auto *LNode = dyn_cast<LoadSDNode>(ArgVal);
+  if (!LNode)
+    return;
+  auto *FINode = dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode());
+  if (!FINode)
+    return;
+
+  // Check that the fixed stack object is the right size and alignment.
+  // Look at the alignment that the user wrote on the alloca instead of looking
+  // at the stack object.
+  auto ArgCopyIter = ArgCopyElisionCandidates.find(&Arg);
+  assert(ArgCopyIter != ArgCopyElisionCandidates.end());
+  const AllocaInst *AI = ArgCopyIter->second.first;
+  int FixedIndex = FINode->getIndex();
+  int &AllocaIndex = FuncInfo->StaticAllocaMap[AI];
+  int OldIndex = AllocaIndex;
+  MachineFrameInfo &MFI = FuncInfo->MF->getFrameInfo();
+  if (MFI.getObjectSize(FixedIndex) != MFI.getObjectSize(OldIndex)) {
+    DEBUG(dbgs() << "  argument copy elision failed due to bad fixed stack "
+                    "object size\n");
+    return;
+  }
+  unsigned RequiredAlignment = AI->getAlignment();
+  if (!RequiredAlignment) {
+    RequiredAlignment = FuncInfo->MF->getDataLayout().getABITypeAlignment(
+        AI->getAllocatedType());
+  }
+  if (MFI.getObjectAlignment(FixedIndex) < RequiredAlignment) {
+    DEBUG(dbgs() << "  argument copy elision failed: alignment of alloca "
+                    "greater than stack argument alignment ("
+                 << RequiredAlignment << " vs "
+                 << MFI.getObjectAlignment(FixedIndex) << ")\n");
+    return;
+  }
+
+  // Perform the elision. Delete the old stack object and replace its only use
+  // in the variable info map. Mark the stack object as mutable.
+  DEBUG({
+    dbgs() << "Eliding argument copy from " << Arg << " to " << *AI << '\n'
+           << "  Replacing frame index " << OldIndex << " with " << FixedIndex
+           << '\n';
+  });
+  MFI.RemoveStackObject(OldIndex);
+  MFI.setIsImmutableObjectIndex(FixedIndex, false);
+  AllocaIndex = FixedIndex;
+  ArgCopyElisionFrameIndexMap.insert({OldIndex, FixedIndex});
+  Chains.push_back(ArgVal.getValue(1));
+
+  // Avoid emitting code for the store implementing the copy.
+  const StoreInst *SI = ArgCopyIter->second.second;
+  ElidedArgCopyInstrs.insert(SI);
+
+  // Check for uses of the argument again so that we can avoid exporting ArgVal
+  // if it isn't used by anything other than the store.
+  for (const Value *U : Arg.users()) {
+    if (U != SI) {
+      ArgHasUses = true;
+      break;
+    }
+  }
+}
+
 void SelectionDAGISel::LowerArguments(const Function &F) {
   SelectionDAG &DAG = SDB->DAG;
   SDLoc dl = SDB->getCurSDLoc();
@@ -8105,15 +8178,21 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
     Ins.push_back(RetArg);
   }
 
+  // Look for stores of arguments to static allocas. Mark such arguments with a
+  // flag to ask the target to give us the memory location of that argument if
+  // available.
+  ArgCopyElisionMapTy ArgCopyElisionCandidates;
+  findArgumentCopyElisionCandidates(DL, FuncInfo, ArgCopyElisionCandidates);
+
   // Set up the incoming argument description vector.
-  unsigned Idx = 1;
-  for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end();
-       I != E; ++I, ++Idx) {
+  unsigned Idx = 0;
+  for (const Argument &Arg : F.args()) {
+    ++Idx;
     SmallVector<EVT, 4> ValueVTs;
-    ComputeValueVTs(*TLI, DAG.getDataLayout(), I->getType(), ValueVTs);
-    bool isArgValueUsed = !I->use_empty();
+    ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);
+    bool isArgValueUsed = !Arg.use_empty();
     unsigned PartBase = 0;
-    Type *FinalType = I->getType();
+    Type *FinalType = Arg.getType();
     if (F.getAttributes().hasAttribute(Idx, Attribute::ByVal))
       FinalType = cast<PointerType>(FinalType)->getElementType();
     bool NeedsRegBlock = TLI->functionArgumentNeedsConsecutiveRegisters(
@@ -8133,7 +8212,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
         // If we are using vectorcall calling convention, a structure that is
         // passed InReg - is surely an HVA
         if (F.getCallingConv() == CallingConv::X86_VectorCall &&
-            isa<StructType>(I->getType())) {
+            isa<StructType>(Arg.getType())) {
           // The first value of a structure is marked
           if (0 == Value)
             Flags.setHvaStart();
@@ -8165,7 +8244,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
           Flags.setByVal();
       }
       if (Flags.isByVal() || Flags.isInAlloca()) {
-        PointerType *Ty = cast<PointerType>(I->getType());
+        PointerType *Ty = cast<PointerType>(Arg.getType());
         Type *ElementTy = Ty->getElementType();
         Flags.setByValSize(DL.getTypeAllocSize(ElementTy));
         // For ByVal, alignment should be passed from FE.  BE will guess if
@@ -8182,6 +8261,8 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
       if (NeedsRegBlock)
         Flags.setInConsecutiveRegs();
       Flags.setOrigAlign(OriginalAlignment);
+      if (ArgCopyElisionCandidates.count(&Arg))
+        Flags.setCopyElisionCandidate();
 
       MVT RegisterVT = TLI->getRegisterType(*CurDAG->getContext(), VT);
       unsigned NumRegs = TLI->getNumRegisters(*CurDAG->getContext(), VT);
@@ -8228,7 +8309,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
 
   // Set up the argument values.
   unsigned i = 0;
-  Idx = 1;
+  Idx = 0;
   if (!FuncInfo->CanLowerReturn) {
     // Create a virtual register for the sret pointer, and put in a copy
     // from the sret argument into it.
@@ -8254,25 +8335,39 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
     ++i;
   }
 
-  for (Function::const_arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E;
-      ++I, ++Idx) {
+  SmallVector<SDValue, 4> Chains;
+  DenseMap<int, int> ArgCopyElisionFrameIndexMap;
+  for (const Argument &Arg : F.args()) {
+    ++Idx;
     SmallVector<SDValue, 4> ArgValues;
     SmallVector<EVT, 4> ValueVTs;
-    ComputeValueVTs(*TLI, DAG.getDataLayout(), I->getType(), ValueVTs);
+    ComputeValueVTs(*TLI, DAG.getDataLayout(), Arg.getType(), ValueVTs);
     unsigned NumValues = ValueVTs.size();
+    if (NumValues == 0)
+      continue;
+
+    bool ArgHasUses = !Arg.use_empty();
+
+    // Elide the copying store if the target loaded this argument from a
+    // suitable fixed stack object.
+    if (Ins[i].Flags.isCopyElisionCandidate()) {
+      tryToElideArgumentCopy(FuncInfo, Chains, ArgCopyElisionFrameIndexMap,
+                             ElidedArgCopyInstrs, ArgCopyElisionCandidates, Arg,
+                             InVals[i], ArgHasUses);
+    }
 
     // If this argument is unused then remember its value. It is used to generate
     // debugging information.
     bool isSwiftErrorArg =
         TLI->supportSwiftError() &&
         F.getAttributes().hasAttribute(Idx, Attribute::SwiftError);
-    if (I->use_empty() && NumValues && !isSwiftErrorArg) {
-      SDB->setUnusedArgValue(&*I, InVals[i]);
+    if (!ArgHasUses && !isSwiftErrorArg) {
+      SDB->setUnusedArgValue(&Arg, InVals[i]);
 
       // Also remember any frame index for use in FastISel.
       if (FrameIndexSDNode *FI =
           dyn_cast<FrameIndexSDNode>(InVals[i].getNode()))
-        FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex());
+        FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
     }
 
     for (unsigned Val = 0; Val != NumValues; ++Val) {
@@ -8283,16 +8378,15 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
       // Even an apparant 'unused' swifterror argument needs to be returned. So
       // we do generate a copy for it that can be used on return from the
       // function.
-      if (!I->use_empty() || isSwiftErrorArg) {
+      if (ArgHasUses || isSwiftErrorArg) {
         Optional<ISD::NodeType> AssertOp;
         if (F.getAttributes().hasAttribute(Idx, Attribute::SExt))
           AssertOp = ISD::AssertSext;
         else if (F.getAttributes().hasAttribute(Idx, Attribute::ZExt))
           AssertOp = ISD::AssertZext;
 
-        ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i],
-                                             NumParts, PartVT, VT,
-                                             nullptr, AssertOp));
+        ArgValues.push_back(getCopyFromParts(DAG, dl, &InVals[i], NumParts,
+                                             PartVT, VT, nullptr, AssertOp));
       }
 
       i += NumParts;
@@ -8305,18 +8399,18 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
     // Note down frame index.
     if (FrameIndexSDNode *FI =
         dyn_cast<FrameIndexSDNode>(ArgValues[0].getNode()))
-      FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex());
+      FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
 
     SDValue Res = DAG.getMergeValues(makeArrayRef(ArgValues.data(), NumValues),
                                      SDB->getCurSDLoc());
 
-    SDB->setValue(&*I, Res);
+    SDB->setValue(&Arg, Res);
     if (!TM.Options.EnableFastISel && Res.getOpcode() == ISD::BUILD_PAIR) {
       if (LoadSDNode *LNode =
           dyn_cast<LoadSDNode>(Res.getOperand(0).getNode()))
         if (FrameIndexSDNode *FI =
             dyn_cast<FrameIndexSDNode>(LNode->getBasePtr().getNode()))
-        FuncInfo->setArgumentFrameIndex(&*I, FI->getIndex());
+        FuncInfo->setArgumentFrameIndex(&Arg, FI->getIndex());
     }
 
     // Update the SwiftErrorVRegDefMap.
@@ -8336,18 +8430,36 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
       // uses with vregs.
       unsigned Reg = cast<RegisterSDNode>(Res.getOperand(1))->getReg();
       if (TargetRegisterInfo::isVirtualRegister(Reg)) {
-        FuncInfo->ValueMap[&*I] = Reg;
+        FuncInfo->ValueMap[&Arg] = Reg;
         continue;
       }
     }
-    if (!isOnlyUsedInEntryBlock(&*I, TM.Options.EnableFastISel)) {
-      FuncInfo->InitializeRegForValue(&*I);
-      SDB->CopyToExportRegsIfNeeded(&*I);
+    if (!isOnlyUsedInEntryBlock(&Arg, TM.Options.EnableFastISel)) {
+      FuncInfo->InitializeRegForValue(&Arg);
+      SDB->CopyToExportRegsIfNeeded(&Arg);
     }
   }
 
+  if (!Chains.empty()) {
+    Chains.push_back(NewRoot);
+    NewRoot = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Chains);
+  }
+
+  DAG.setRoot(NewRoot);
+
   assert(i == InVals.size() && "Argument register count mismatch!");
 
+  // If any argument copy elisions occurred and we have debug info, update the
+  // stale frame indices used in the dbg.declare variable info table.
+  MachineFunction::VariableDbgInfoMapTy &DbgDeclareInfo = MF->getVariableDbgInfo();
+  if (!DbgDeclareInfo.empty() && !ArgCopyElisionFrameIndexMap.empty()) {
+    for (MachineFunction::VariableDbgInfo &VI : DbgDeclareInfo) {
+      auto I = ArgCopyElisionFrameIndexMap.find(VI.Slot);
+      if (I != ArgCopyElisionFrameIndexMap.end())
+        VI.Slot = I->second;
+    }
+  }
+
   // Finally, if the target has anything special to do, allow it to do so.
   EmitFunctionEntryCode();
 }
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 5ecc753d23d7925589e3ce47519ab03305df2f21..c6acc09b660289be7e2b71c476438dff8ef9c80a 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -616,33 +616,27 @@ public:
   void init(GCFunctionInfo *gfi, AliasAnalysis &aa,
             const TargetLibraryInfo *li);
 
-  /// clear - Clear out the current SelectionDAG and the associated
-  /// state and prepare this SelectionDAGBuilder object to be used
-  /// for a new block. This doesn't clear out information about
-  /// additional blocks that are needed to complete switch lowering
-  /// or PHI node updating; that information is cleared out as it is
-  /// consumed.
+  /// Clear out the current SelectionDAG and the associated state and prepare
+  /// this SelectionDAGBuilder object to be used for a new block. This doesn't
+  /// clear out information about additional blocks that are needed to complete
+  /// switch lowering or PHI node updating; that information is cleared out as
+  /// it is consumed.
   void clear();
 
-  /// clearDanglingDebugInfo - Clear the dangling debug information
-  /// map. This function is separated from the clear so that debug
-  /// information that is dangling in a basic block can be properly
-  /// resolved in a different basic block. This allows the
-  /// SelectionDAG to resolve dangling debug information attached
-  /// to PHI nodes.
+  /// Clear the dangling debug information map. This function is separated from
+  /// the clear so that debug information that is dangling in a basic block can
+  /// be properly resolved in a different basic block. This allows the
+  /// SelectionDAG to resolve dangling debug information attached to PHI nodes.
   void clearDanglingDebugInfo();
 
-  /// getRoot - Return the current virtual root of the Selection DAG,
-  /// flushing any PendingLoad items. This must be done before emitting
-  /// a store or any other node that may need to be ordered after any
-  /// prior load instructions.
-  ///
+  /// Return the current virtual root of the Selection DAG, flushing any
+  /// PendingLoad items. This must be done before emitting a store or any other
+  /// node that may need to be ordered after any prior load instructions.
   SDValue getRoot();
 
-  /// getControlRoot - Similar to getRoot, but instead of flushing all the
-  /// PendingLoad items, flush all the PendingExports items. It is necessary
-  /// to do this before emitting a terminator instruction.
-  ///
+  /// Similar to getRoot, but instead of flushing all the PendingLoad items,
+  /// flush all the PendingExports items. It is necessary to do this before
+  /// emitting a terminator instruction.
   SDValue getControlRoot();
 
   SDLoc getCurSDLoc() const {
@@ -960,26 +954,23 @@ private:
 /// type.
 ///
 struct RegsForValue {
-  /// ValueVTs - The value types of the values, which may not be legal, and
+  /// The value types of the values, which may not be legal, and
   /// may need be promoted or synthesized from one or more registers.
-  ///
   SmallVector<EVT, 4> ValueVTs;
 
-  /// RegVTs - The value types of the registers. This is the same size as
-  /// ValueVTs and it records, for each value, what the type of the assigned
-  /// register or registers are. (Individual values are never synthesized
-  /// from more than one type of register.)
+  /// The value types of the registers. This is the same size as ValueVTs and it
+  /// records, for each value, what the type of the assigned register or
+  /// registers are. (Individual values are never synthesized from more than one
+  /// type of register.)
   ///
   /// With virtual registers, the contents of RegVTs is redundant with TLI's
   /// getRegisterType member function, however when with physical registers
   /// it is necessary to have a separate record of the types.
-  ///
   SmallVector<MVT, 4> RegVTs;
 
-  /// Regs - This list holds the registers assigned to the values.
+  /// This list holds the registers assigned to the values.
   /// Each legal or promoted value requires one register, and each
   /// expanded value requires multiple registers.
-  ///
   SmallVector<unsigned, 4> Regs;
 
   RegsForValue();
@@ -989,33 +980,33 @@ struct RegsForValue {
   RegsForValue(LLVMContext &Context, const TargetLowering &TLI,
                const DataLayout &DL, unsigned Reg, Type *Ty);
 
-  /// append - Add the specified values to this one.
+  /// Add the specified values to this one.
   void append(const RegsForValue &RHS) {
     ValueVTs.append(RHS.ValueVTs.begin(), RHS.ValueVTs.end());
     RegVTs.append(RHS.RegVTs.begin(), RHS.RegVTs.end());
     Regs.append(RHS.Regs.begin(), RHS.Regs.end());
   }
 
-  /// getCopyFromRegs - Emit a series of CopyFromReg nodes that copies from
-  /// this value and returns the result as a ValueVTs value.  This uses
-  /// Chain/Flag as the input and updates them for the output Chain/Flag.
-  /// If the Flag pointer is NULL, no flag is used.
+  /// Emit a series of CopyFromReg nodes that copies from this value and returns
+  /// the result as a ValueVTs value. This uses Chain/Flag as the input and
+  /// updates them for the output Chain/Flag. If the Flag pointer is NULL, no
+  /// flag is used.
   SDValue getCopyFromRegs(SelectionDAG &DAG, FunctionLoweringInfo &FuncInfo,
                           const SDLoc &dl, SDValue &Chain, SDValue *Flag,
                           const Value *V = nullptr) const;
 
-  /// getCopyToRegs - Emit a series of CopyToReg nodes that copies the specified
-  /// value into the registers specified by this object.  This uses Chain/Flag
-  /// as the input and updates them for the output Chain/Flag.  If the Flag
-  /// pointer is nullptr, no flag is used.  If V is not nullptr, then it is used
-  /// in printing better diagnostic messages on error.
+  /// Emit a series of CopyToReg nodes that copies the specified value into the
+  /// registers specified by this object. This uses Chain/Flag as the input and
+  /// updates them for the output Chain/Flag. If the Flag pointer is nullptr, no
+  /// flag is used. If V is not nullptr, then it is used in printing better
+  /// diagnostic messages on error.
   void getCopyToRegs(SDValue Val, SelectionDAG &DAG, const SDLoc &dl,
                      SDValue &Chain, SDValue *Flag, const Value *V = nullptr,
                      ISD::NodeType PreferredExtendType = ISD::ANY_EXTEND) const;
 
-  /// AddInlineAsmOperands - Add this value to the specified inlineasm node
-  /// operand list.  This adds the code marker, matching input operand index
-  /// (if applicable), and includes the number of values added into it.
+  /// Add this value to the specified inlineasm node operand list. This adds the
+  /// code marker, matching input operand index (if applicable), and includes
+  /// the number of values added into it.
   void AddInlineAsmOperands(unsigned Kind, bool HasMatching,
                             unsigned MatchingIdx, const SDLoc &dl,
                             SelectionDAG &DAG, std::vector<SDValue> &Ops) const;
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index ba9ec68f1feda26f68ffeddd489b89432f71b1e1..488c60a28ffbcaac12acb1d5db5b39ab2a88df89 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -300,6 +300,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
   case ISD::GET_DYNAMIC_AREA_OFFSET:    return "get.dynamic.area.offset";
 
   // Bit manipulation
+  case ISD::ABS:                        return "abs";
   case ISD::BITREVERSE:                 return "bitreverse";
   case ISD::BSWAP:                      return "bswap";
   case ISD::CTPOP:                      return "ctpop";
diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index d9d8d0c1fd10b8f55d00bd820965a72c3850e446..e21204dbb966194d9836e6b61f337c288efa2f0b 100644
--- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -11,40 +11,65 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/SelectionDAG.h"
 #include "ScheduleDAGSDNodes.h"
 #include "SelectionDAGBuilder.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
 #include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/CodeGen/FastISel.h"
 #include "llvm/CodeGen/FunctionLoweringInfo.h"
 #include "llvm/CodeGen/GCMetadata.h"
-#include "llvm/CodeGen/GCStrategy.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachinePassRegistry.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/ScheduleHazardRecognizer.h"
+#include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/SchedulerRegistry.h"
+#include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
 #include "llvm/CodeGen/StackProtector.h"
-#include "llvm/CodeGen/WinEHFuncInfo.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
-#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstrTypes.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/Module.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -59,6 +84,13 @@
 #include "llvm/Target/TargetSubtargetInfo.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
 
 using namespace llvm;
 
@@ -73,104 +105,6 @@ STATISTIC(NumEntryBlocks, "Number of entry blocks encountered");
 STATISTIC(NumFastIselFailLowerArguments,
           "Number of entry blocks where fast isel failed to lower arguments");
 
-#ifndef NDEBUG
-static cl::opt<bool>
-EnableFastISelVerbose2("fast-isel-verbose2", cl::Hidden,
-          cl::desc("Enable extra verbose messages in the \"fast\" "
-                   "instruction selector"));
-
-  // Terminators
-STATISTIC(NumFastIselFailRet,"Fast isel fails on Ret");
-STATISTIC(NumFastIselFailBr,"Fast isel fails on Br");
-STATISTIC(NumFastIselFailSwitch,"Fast isel fails on Switch");
-STATISTIC(NumFastIselFailIndirectBr,"Fast isel fails on IndirectBr");
-STATISTIC(NumFastIselFailInvoke,"Fast isel fails on Invoke");
-STATISTIC(NumFastIselFailResume,"Fast isel fails on Resume");
-STATISTIC(NumFastIselFailUnreachable,"Fast isel fails on Unreachable");
-
-  // Standard binary operators...
-STATISTIC(NumFastIselFailAdd,"Fast isel fails on Add");
-STATISTIC(NumFastIselFailFAdd,"Fast isel fails on FAdd");
-STATISTIC(NumFastIselFailSub,"Fast isel fails on Sub");
-STATISTIC(NumFastIselFailFSub,"Fast isel fails on FSub");
-STATISTIC(NumFastIselFailMul,"Fast isel fails on Mul");
-STATISTIC(NumFastIselFailFMul,"Fast isel fails on FMul");
-STATISTIC(NumFastIselFailUDiv,"Fast isel fails on UDiv");
-STATISTIC(NumFastIselFailSDiv,"Fast isel fails on SDiv");
-STATISTIC(NumFastIselFailFDiv,"Fast isel fails on FDiv");
-STATISTIC(NumFastIselFailURem,"Fast isel fails on URem");
-STATISTIC(NumFastIselFailSRem,"Fast isel fails on SRem");
-STATISTIC(NumFastIselFailFRem,"Fast isel fails on FRem");
-
-  // Logical operators...
-STATISTIC(NumFastIselFailAnd,"Fast isel fails on And");
-STATISTIC(NumFastIselFailOr,"Fast isel fails on Or");
-STATISTIC(NumFastIselFailXor,"Fast isel fails on Xor");
-
-  // Memory instructions...
-STATISTIC(NumFastIselFailAlloca,"Fast isel fails on Alloca");
-STATISTIC(NumFastIselFailLoad,"Fast isel fails on Load");
-STATISTIC(NumFastIselFailStore,"Fast isel fails on Store");
-STATISTIC(NumFastIselFailAtomicCmpXchg,"Fast isel fails on AtomicCmpXchg");
-STATISTIC(NumFastIselFailAtomicRMW,"Fast isel fails on AtomicRWM");
-STATISTIC(NumFastIselFailFence,"Fast isel fails on Frence");
-STATISTIC(NumFastIselFailGetElementPtr,"Fast isel fails on GetElementPtr");
-
-  // Convert instructions...
-STATISTIC(NumFastIselFailTrunc,"Fast isel fails on Trunc");
-STATISTIC(NumFastIselFailZExt,"Fast isel fails on ZExt");
-STATISTIC(NumFastIselFailSExt,"Fast isel fails on SExt");
-STATISTIC(NumFastIselFailFPTrunc,"Fast isel fails on FPTrunc");
-STATISTIC(NumFastIselFailFPExt,"Fast isel fails on FPExt");
-STATISTIC(NumFastIselFailFPToUI,"Fast isel fails on FPToUI");
-STATISTIC(NumFastIselFailFPToSI,"Fast isel fails on FPToSI");
-STATISTIC(NumFastIselFailUIToFP,"Fast isel fails on UIToFP");
-STATISTIC(NumFastIselFailSIToFP,"Fast isel fails on SIToFP");
-STATISTIC(NumFastIselFailIntToPtr,"Fast isel fails on IntToPtr");
-STATISTIC(NumFastIselFailPtrToInt,"Fast isel fails on PtrToInt");
-STATISTIC(NumFastIselFailBitCast,"Fast isel fails on BitCast");
-
-  // Other instructions...
-STATISTIC(NumFastIselFailICmp,"Fast isel fails on ICmp");
-STATISTIC(NumFastIselFailFCmp,"Fast isel fails on FCmp");
-STATISTIC(NumFastIselFailPHI,"Fast isel fails on PHI");
-STATISTIC(NumFastIselFailSelect,"Fast isel fails on Select");
-STATISTIC(NumFastIselFailCall,"Fast isel fails on Call");
-STATISTIC(NumFastIselFailShl,"Fast isel fails on Shl");
-STATISTIC(NumFastIselFailLShr,"Fast isel fails on LShr");
-STATISTIC(NumFastIselFailAShr,"Fast isel fails on AShr");
-STATISTIC(NumFastIselFailVAArg,"Fast isel fails on VAArg");
-STATISTIC(NumFastIselFailExtractElement,"Fast isel fails on ExtractElement");
-STATISTIC(NumFastIselFailInsertElement,"Fast isel fails on InsertElement");
-STATISTIC(NumFastIselFailShuffleVector,"Fast isel fails on ShuffleVector");
-STATISTIC(NumFastIselFailExtractValue,"Fast isel fails on ExtractValue");
-STATISTIC(NumFastIselFailInsertValue,"Fast isel fails on InsertValue");
-STATISTIC(NumFastIselFailLandingPad,"Fast isel fails on LandingPad");
-
-// Intrinsic instructions...
-STATISTIC(NumFastIselFailIntrinsicCall, "Fast isel fails on Intrinsic call");
-STATISTIC(NumFastIselFailSAddWithOverflow,
-          "Fast isel fails on sadd.with.overflow");
-STATISTIC(NumFastIselFailUAddWithOverflow,
-          "Fast isel fails on uadd.with.overflow");
-STATISTIC(NumFastIselFailSSubWithOverflow,
-          "Fast isel fails on ssub.with.overflow");
-STATISTIC(NumFastIselFailUSubWithOverflow,
-          "Fast isel fails on usub.with.overflow");
-STATISTIC(NumFastIselFailSMulWithOverflow,
-          "Fast isel fails on smul.with.overflow");
-STATISTIC(NumFastIselFailUMulWithOverflow,
-          "Fast isel fails on umul.with.overflow");
-STATISTIC(NumFastIselFailFrameaddress, "Fast isel fails on Frameaddress");
-STATISTIC(NumFastIselFailSqrt, "Fast isel fails on sqrt call");
-STATISTIC(NumFastIselFailStackMap, "Fast isel fails on StackMap call");
-STATISTIC(NumFastIselFailPatchPoint, "Fast isel fails on PatchPoint call");
-#endif
-
-static cl::opt<bool>
-EnableFastISelVerbose("fast-isel-verbose", cl::Hidden,
-          cl::desc("Enable verbose messages in the \"fast\" "
-                   "instruction selector"));
 static cl::opt<int> EnableFastISelAbort(
     "fast-isel-abort", cl::Hidden,
     cl::desc("Enable abort calls when \"fast\" instruction selection "
@@ -179,6 +113,11 @@ static cl::opt<int> EnableFastISelAbort(
              "abort for argument lowering, and 3 will never fallback "
              "to SelectionDAG."));
 
+static cl::opt<bool> EnableFastISelFallbackReport(
+    "fast-isel-report-on-fallback", cl::Hidden,
+    cl::desc("Emit a diagnostic when \"fast\" instruction selection "
+             "falls back to SelectionDAG."));
+
 static cl::opt<bool>
 UseMBPI("use-mbpi",
         cl::desc("use Machine Branch Probability Info"),
@@ -238,7 +177,7 @@ MachinePassRegistry RegisterScheduler::Registry;
 ///
 //===---------------------------------------------------------------------===//
 static cl::opt<RegisterScheduler::FunctionPassCtor, false,
-               RegisterPassParser<RegisterScheduler> >
+               RegisterPassParser<RegisterScheduler>>
 ISHeuristic("pre-RA-sched",
             cl::init(&createDefaultScheduler), cl::Hidden,
             cl::desc("Instruction schedulers available (before register"
@@ -249,6 +188,7 @@ defaultListDAGScheduler("default", "Best scheduler for the target",
                         createDefaultScheduler);
 
 namespace llvm {
+
   //===--------------------------------------------------------------------===//
   /// \brief This class is used by SelectionDAGISel to temporarily override
   /// the optimization level on a per-function basis.
@@ -318,6 +258,7 @@ namespace llvm {
            "Unknown sched type!");
     return createILPListDAGScheduler(IS, OptLevel);
   }
+
 } // end namespace llvm
 
 // EmitInstrWithCustomInserter - This method should be implemented by targets
@@ -431,8 +372,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
           MachineFunctionProperties::Property::Selected))
     return false;
   // Do some sanity-checking on the command-line options.
-  assert((!EnableFastISelVerbose || TM.Options.EnableFastISel) &&
-         "-fast-isel-verbose requires -fast-isel");
   assert((!EnableFastISelAbort || TM.Options.EnableFastISel) &&
          "-fast-isel-abort > 0 requires -fast-isel");
 
@@ -457,12 +396,13 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   LibInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   GFI = Fn.hasGC() ? &getAnalysis<GCModuleInfo>().getFunctionInfo(Fn) : nullptr;
+  ORE = make_unique<OptimizationRemarkEmitter>(&Fn);
 
   DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
 
   SplitCriticalSideEffectEdges(const_cast<Function &>(Fn));
 
-  CurDAG->init(*MF);
+  CurDAG->init(*MF, *ORE);
   FuncInfo->set(Fn, *MF, CurDAG);
 
   if (UseMBPI && OptLevel != CodeGenOpt::None)
@@ -502,6 +442,10 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
     TLI->initializeSplitCSR(EntryMBB);
 
   SelectAllBasicBlocks(Fn);
+  if (FastISelFailed && EnableFastISelFallbackReport) {
+    DiagnosticInfoISelFallback DiagFallback(Fn);
+    Fn.getContext().diagnose(DiagFallback);
+  }
 
   // If the first basic block in the function has live ins that need to be
   // copied into vregs, emit the copies into the top of the block before
@@ -628,7 +572,7 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
     unsigned To = I->second;
     // If To is also scheduled to be replaced, find what its ultimate
     // replacement is.
-    for (;;) {
+    while (true) {
       DenseMap<unsigned, unsigned>::iterator J = FuncInfo->RegFixups.find(To);
       if (J == E) break;
       To = J->second;
@@ -666,13 +610,30 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
   return true;
 }
 
+static void reportFastISelFailure(MachineFunction &MF,
+                                  OptimizationRemarkEmitter &ORE,
+                                  OptimizationRemarkMissed &R,
+                                  bool ShouldAbort) {
+  // Name the function explicitly when the remark has no valid debug location
+  // (it is less useful without one) or when it becomes a raw fatal error.
+  const bool NameFunction = !R.getLocation().isValid() || ShouldAbort;
+  if (NameFunction)
+    R << (" (in function: " + MF.getName() + ")").str();
+
+  if (ShouldAbort)
+    report_fatal_error(R.getMsg());
+  ORE.emit(R);
+}
+
 void SelectionDAGISel::SelectBasicBlock(BasicBlock::const_iterator Begin,
                                         BasicBlock::const_iterator End,
                                         bool &HadTailCall) {
   // Lower the instructions. If a call is emitted as a tail call, cease emitting
   // nodes for this block.
-  for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I)
-    SDB->visit(*I);
+  for (BasicBlock::const_iterator I = Begin; I != End && !SDB->HasTailCall; ++I) {
+    if (!ElidedArgCopyInstrs.count(&*I))
+      SDB->visit(*I);
+  }
 
   // Make sure the root of the DAG is up-to-date.
   CurDAG->setRoot(SDB->getControlRoot());
@@ -731,6 +692,10 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
   int BlockNumber = -1;
   (void)BlockNumber;
   bool MatchFilterBB = false; (void)MatchFilterBB;
+
+  // Before type legalization, allow creation of any node types.
+  CurDAG->NewNodesMustHaveLegalTypes = false;
+
 #ifndef NDEBUG
   MatchFilterBB = (FilterDAGBasicBlockName.empty() ||
                    FilterDAGBasicBlockName ==
@@ -777,6 +742,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
   DEBUG(dbgs() << "Type-legalized selection DAG: BB#" << BlockNumber
         << " '" << BlockName << "'\n"; CurDAG->dump());
 
+  // Only allow creation of legal node types.
   CurDAG->NewNodesMustHaveLegalTypes = true;
 
   if (Changed) {
@@ -802,12 +768,18 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
   }
 
   if (Changed) {
+    DEBUG(dbgs() << "Vector-legalized selection DAG: BB#" << BlockNumber
+          << " '" << BlockName << "'\n"; CurDAG->dump());
+
     {
       NamedRegionTimer T("legalize_types2", "Type Legalization 2", GroupName,
                          GroupDescription, TimePassesIsEnabled);
       CurDAG->LegalizeTypes();
     }
 
+    DEBUG(dbgs() << "Vector/type-legalized selection DAG: BB#" << BlockNumber
+          << " '" << BlockName << "'\n"; CurDAG->dump());
+
     if (ViewDAGCombineLT && MatchFilterBB)
       CurDAG->viewGraph("dag-combine-lv input for " + BlockName);
 
@@ -907,10 +879,12 @@ void SelectionDAGISel::CodeGenAndEmitDAG() {
 }
 
 namespace {
+
 /// ISelUpdater - helper class to handle updates of the instruction selection
 /// graph.
 class ISelUpdater : public SelectionDAG::DAGUpdateListener {
   SelectionDAG::allnodes_iterator &ISelPosition;
+
 public:
   ISelUpdater(SelectionDAG &DAG, SelectionDAG::allnodes_iterator &isp)
     : SelectionDAG::DAGUpdateListener(DAG), ISelPosition(isp) {}
@@ -923,6 +897,7 @@ public:
       ++ISelPosition;
   }
 };
+
 } // end anonymous namespace
 
 static bool isStrictFPOp(SDNode *Node, unsigned &NewOpc) {
@@ -1106,116 +1081,6 @@ static bool isFoldedOrDeadInstruction(const Instruction *I,
          !FuncInfo->isExportedInst(I); // Exported instrs must be computed.
 }
 
-#ifndef NDEBUG
-// Collect per Instruction statistics for fast-isel misses.  Only those
-// instructions that cause the bail are accounted for.  It does not account for
-// instructions higher in the block.  Thus, summing the per instructions stats
-// will not add up to what is reported by NumFastIselFailures.
-static void collectFailStats(const Instruction *I) {
-  switch (I->getOpcode()) {
-  default: assert (0 && "<Invalid operator> ");
-
-  // Terminators
-  case Instruction::Ret:         NumFastIselFailRet++; return;
-  case Instruction::Br:          NumFastIselFailBr++; return;
-  case Instruction::Switch:      NumFastIselFailSwitch++; return;
-  case Instruction::IndirectBr:  NumFastIselFailIndirectBr++; return;
-  case Instruction::Invoke:      NumFastIselFailInvoke++; return;
-  case Instruction::Resume:      NumFastIselFailResume++; return;
-  case Instruction::Unreachable: NumFastIselFailUnreachable++; return;
-
-  // Standard binary operators...
-  case Instruction::Add:  NumFastIselFailAdd++; return;
-  case Instruction::FAdd: NumFastIselFailFAdd++; return;
-  case Instruction::Sub:  NumFastIselFailSub++; return;
-  case Instruction::FSub: NumFastIselFailFSub++; return;
-  case Instruction::Mul:  NumFastIselFailMul++; return;
-  case Instruction::FMul: NumFastIselFailFMul++; return;
-  case Instruction::UDiv: NumFastIselFailUDiv++; return;
-  case Instruction::SDiv: NumFastIselFailSDiv++; return;
-  case Instruction::FDiv: NumFastIselFailFDiv++; return;
-  case Instruction::URem: NumFastIselFailURem++; return;
-  case Instruction::SRem: NumFastIselFailSRem++; return;
-  case Instruction::FRem: NumFastIselFailFRem++; return;
-
-  // Logical operators...
-  case Instruction::And: NumFastIselFailAnd++; return;
-  case Instruction::Or:  NumFastIselFailOr++; return;
-  case Instruction::Xor: NumFastIselFailXor++; return;
-
-  // Memory instructions...
-  case Instruction::Alloca:        NumFastIselFailAlloca++; return;
-  case Instruction::Load:          NumFastIselFailLoad++; return;
-  case Instruction::Store:         NumFastIselFailStore++; return;
-  case Instruction::AtomicCmpXchg: NumFastIselFailAtomicCmpXchg++; return;
-  case Instruction::AtomicRMW:     NumFastIselFailAtomicRMW++; return;
-  case Instruction::Fence:         NumFastIselFailFence++; return;
-  case Instruction::GetElementPtr: NumFastIselFailGetElementPtr++; return;
-
-  // Convert instructions...
-  case Instruction::Trunc:    NumFastIselFailTrunc++; return;
-  case Instruction::ZExt:     NumFastIselFailZExt++; return;
-  case Instruction::SExt:     NumFastIselFailSExt++; return;
-  case Instruction::FPTrunc:  NumFastIselFailFPTrunc++; return;
-  case Instruction::FPExt:    NumFastIselFailFPExt++; return;
-  case Instruction::FPToUI:   NumFastIselFailFPToUI++; return;
-  case Instruction::FPToSI:   NumFastIselFailFPToSI++; return;
-  case Instruction::UIToFP:   NumFastIselFailUIToFP++; return;
-  case Instruction::SIToFP:   NumFastIselFailSIToFP++; return;
-  case Instruction::IntToPtr: NumFastIselFailIntToPtr++; return;
-  case Instruction::PtrToInt: NumFastIselFailPtrToInt++; return;
-  case Instruction::BitCast:  NumFastIselFailBitCast++; return;
-
-  // Other instructions...
-  case Instruction::ICmp:           NumFastIselFailICmp++; return;
-  case Instruction::FCmp:           NumFastIselFailFCmp++; return;
-  case Instruction::PHI:            NumFastIselFailPHI++; return;
-  case Instruction::Select:         NumFastIselFailSelect++; return;
-  case Instruction::Call: {
-    if (auto const *Intrinsic = dyn_cast<IntrinsicInst>(I)) {
-      switch (Intrinsic->getIntrinsicID()) {
-      default:
-        NumFastIselFailIntrinsicCall++; return;
-      case Intrinsic::sadd_with_overflow:
-        NumFastIselFailSAddWithOverflow++; return;
-      case Intrinsic::uadd_with_overflow:
-        NumFastIselFailUAddWithOverflow++; return;
-      case Intrinsic::ssub_with_overflow:
-        NumFastIselFailSSubWithOverflow++; return;
-      case Intrinsic::usub_with_overflow:
-        NumFastIselFailUSubWithOverflow++; return;
-      case Intrinsic::smul_with_overflow:
-        NumFastIselFailSMulWithOverflow++; return;
-      case Intrinsic::umul_with_overflow:
-        NumFastIselFailUMulWithOverflow++; return;
-      case Intrinsic::frameaddress:
-        NumFastIselFailFrameaddress++; return;
-      case Intrinsic::sqrt:
-          NumFastIselFailSqrt++; return;
-      case Intrinsic::experimental_stackmap:
-        NumFastIselFailStackMap++; return;
-      case Intrinsic::experimental_patchpoint_void: // fall-through
-      case Intrinsic::experimental_patchpoint_i64:
-        NumFastIselFailPatchPoint++; return;
-      }
-    }
-    NumFastIselFailCall++;
-    return;
-  }
-  case Instruction::Shl:            NumFastIselFailShl++; return;
-  case Instruction::LShr:           NumFastIselFailLShr++; return;
-  case Instruction::AShr:           NumFastIselFailAShr++; return;
-  case Instruction::VAArg:          NumFastIselFailVAArg++; return;
-  case Instruction::ExtractElement: NumFastIselFailExtractElement++; return;
-  case Instruction::InsertElement:  NumFastIselFailInsertElement++; return;
-  case Instruction::ShuffleVector:  NumFastIselFailShuffleVector++; return;
-  case Instruction::ExtractValue:   NumFastIselFailExtractValue++; return;
-  case Instruction::InsertValue:    NumFastIselFailInsertValue++; return;
-  case Instruction::LandingPad:     NumFastIselFailLandingPad++; return;
-  }
-}
-#endif // NDEBUG
-
 /// Set up SwiftErrorVals by going through the function. If the function has
 /// swifterror argument, it will be the first entry.
 static void setupSwiftErrorVals(const Function &Fn, const TargetLowering *TLI,
@@ -1250,9 +1115,9 @@ static void setupSwiftErrorVals(const Function &Fn, const TargetLowering *TLI,
 }
 
 static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo,
+                                                FastISel *FastIS,
                                                 const TargetLowering *TLI,
                                                 const TargetInstrInfo *TII,
-                                                const BasicBlock *LLVMBB,
                                                 SelectionDAGBuilder *SDB) {
   if (!TLI->supportSwiftError())
     return;
@@ -1262,22 +1127,27 @@ static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo,
   if (FuncInfo->SwiftErrorVals.empty())
     return;
 
-  if (pred_begin(LLVMBB) == pred_end(LLVMBB)) {
-    auto &DL = FuncInfo->MF->getDataLayout();
-    auto const *RC = TLI->getRegClassFor(TLI->getPointerTy(DL));
-    for (const auto *SwiftErrorVal : FuncInfo->SwiftErrorVals) {
-      // We will always generate a copy from the argument. It is always used at
-      // least by the 'return' of the swifterror.
-      if (FuncInfo->SwiftErrorArg && FuncInfo->SwiftErrorArg == SwiftErrorVal)
-        continue;
-      unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC);
-      // Assign Undef to Vreg. We construct MI directly to make sure it works
-      // with FastISel.
-      BuildMI(*FuncInfo->MBB, FuncInfo->MBB->getFirstNonPHI(),
-              SDB->getCurDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
-              VReg);
-      FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorVal, VReg);
-    }
+  assert(FuncInfo->MBB == &*FuncInfo->MF->begin() &&
+         "expected to insert into entry block");
+  auto &DL = FuncInfo->MF->getDataLayout();
+  auto const *RC = TLI->getRegClassFor(TLI->getPointerTy(DL));
+  for (const auto *SwiftErrorVal : FuncInfo->SwiftErrorVals) {
+    // We will always generate a copy from the argument. It is always used at
+    // least by the 'return' of the swifterror.
+    if (FuncInfo->SwiftErrorArg && FuncInfo->SwiftErrorArg == SwiftErrorVal)
+      continue;
+    unsigned VReg = FuncInfo->MF->getRegInfo().createVirtualRegister(RC);
+    // Assign Undef to Vreg. We construct MI directly to make sure it works
+    // with FastISel.
+    BuildMI(*FuncInfo->MBB, FuncInfo->MBB->getFirstNonPHI(),
+            SDB->getCurDebugLoc(), TII->get(TargetOpcode::IMPLICIT_DEF),
+            VReg);
+
+    // Keep FastIS informed about the value we just inserted.
+    if (FastIS)
+      FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt));
+
+    FuncInfo->setCurrentSwiftErrorVReg(FuncInfo->MBB, SwiftErrorVal, VReg);
   }
 }
 
@@ -1400,6 +1270,7 @@ static void propagateSwiftErrorVRegs(FunctionLoweringInfo *FuncInfo) {
 }
 
 void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
+  FastISelFailed = false;
   // Initialize the Fast-ISel state, if needed.
   FastISel *FastIS = nullptr;
   if (TM.Options.EnableFastISel)
@@ -1407,12 +1278,53 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
 
   setupSwiftErrorVals(Fn, TLI, FuncInfo);
 
-  // Iterate over all basic blocks in the function.
   ReversePostOrderTraversal<const Function*> RPOT(&Fn);
-  for (ReversePostOrderTraversal<const Function*>::rpo_iterator
-       I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
-    const BasicBlock *LLVMBB = *I;
 
+  // Lower arguments up front. An RPO iteration always visits the entry block
+  // first.
+  assert(*RPOT.begin() == &Fn.getEntryBlock());
+  ++NumEntryBlocks;
+
+  // Set up FuncInfo for ISel. Entry blocks never have PHIs.
+  FuncInfo->MBB = FuncInfo->MBBMap[&Fn.getEntryBlock()];
+  FuncInfo->InsertPt = FuncInfo->MBB->begin();
+
+  if (!FastIS) {
+    LowerArguments(Fn);
+  } else {
+    // See if fast isel can lower the arguments.
+    FastIS->startNewBlock();
+    if (!FastIS->lowerArguments()) {
+      FastISelFailed = true;
+      // Fast isel failed to lower these arguments
+      ++NumFastIselFailLowerArguments;
+
+      OptimizationRemarkMissed R("sdagisel", "FastISelFailure",
+                                 Fn.getSubprogram(),
+                                 &Fn.getEntryBlock());
+      R << "FastISel didn't lower all arguments: "
+        << ore::NV("Prototype", Fn.getType());
+      reportFastISelFailure(*MF, *ORE, R, EnableFastISelAbort > 1);
+
+      // Use SelectionDAG argument lowering
+      LowerArguments(Fn);
+      CurDAG->setRoot(SDB->getControlRoot());
+      SDB->clear();
+      CodeGenAndEmitDAG();
+    }
+
+    // If we inserted any instructions at the beginning, make a note of
+    // where they are, so we can be sure to emit subsequent instructions
+    // after them.
+    if (FuncInfo->InsertPt != FuncInfo->MBB->begin())
+      FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt));
+    else
+      FastIS->setLastLocalValue(nullptr);
+  }
+  createSwiftErrorEntriesInEntryBlock(FuncInfo, FastIS, TLI, TII, SDB);
+
+  // Iterate over all basic blocks in the function.
+  for (const BasicBlock *LLVMBB : RPOT) {
     if (OptLevel != CodeGenOpt::None) {
       bool AllPredsVisited = true;
       for (const_pred_iterator PI = pred_begin(LLVMBB), PE = pred_end(LLVMBB);
@@ -1444,8 +1356,9 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
     FuncInfo->MBB = FuncInfo->MBBMap[LLVMBB];
     if (!FuncInfo->MBB)
       continue; // Some blocks like catchpads have no code or MBB.
-    FuncInfo->InsertPt = FuncInfo->MBB->getFirstNonPHI();
-    createSwiftErrorEntriesInEntryBlock(FuncInfo, TLI, TII, LLVMBB, SDB);
+
+    // Insert new instructions after any phi or argument setup code.
+    FuncInfo->InsertPt = FuncInfo->MBB->end();
 
     // Setup an EH landing-pad block.
     FuncInfo->ExceptionPointerVirtReg = 0;
@@ -1456,35 +1369,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
 
     // Before doing SelectionDAG ISel, see if FastISel has been requested.
     if (FastIS) {
-      FastIS->startNewBlock();
-
-      // Emit code for any incoming arguments. This must happen before
-      // beginning FastISel on the entry block.
-      if (LLVMBB == &Fn.getEntryBlock()) {
-        ++NumEntryBlocks;
-
-        // Lower any arguments needed in this block if this is the entry block.
-        if (!FastIS->lowerArguments()) {
-          // Fast isel failed to lower these arguments
-          ++NumFastIselFailLowerArguments;
-          if (EnableFastISelAbort > 1)
-            report_fatal_error("FastISel didn't lower all arguments");
-
-          // Use SelectionDAG argument lowering
-          LowerArguments(Fn);
-          CurDAG->setRoot(SDB->getControlRoot());
-          SDB->clear();
-          CodeGenAndEmitDAG();
-        }
-
-        // If we inserted any instructions at the beginning, make a note of
-        // where they are, so we can be sure to emit subsequent instructions
-        // after them.
-        if (FuncInfo->InsertPt != FuncInfo->MBB->begin())
-          FastIS->setLastLocalValue(&*std::prev(FuncInfo->InsertPt));
-        else
-          FastIS->setLastLocalValue(nullptr);
-      }
+      if (LLVMBB != &Fn.getEntryBlock())
+        FastIS->startNewBlock();
 
       unsigned NumFastIselRemaining = std::distance(Begin, End);
       // Do FastISel on as many instructions as possible.
@@ -1492,7 +1378,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
         const Instruction *Inst = &*std::prev(BI);
 
         // If we no longer require this instruction, skip it.
-        if (isFoldedOrDeadInstruction(Inst, FuncInfo)) {
+        if (isFoldedOrDeadInstruction(Inst, FuncInfo) ||
+            ElidedArgCopyInstrs.count(Inst)) {
           --NumFastIselRemaining;
           continue;
         }
@@ -1525,22 +1412,23 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
           continue;
         }
 
-#ifndef NDEBUG
-        if (EnableFastISelVerbose2)
-          collectFailStats(Inst);
-#endif
-
+        FastISelFailed = true; // FastISel could not select Inst.
         // Then handle certain instructions as single-LLVM-Instruction blocks.
         if (isa<CallInst>(Inst)) {
+          OptimizationRemarkMissed R("sdagisel", "FastISelFailure",
+                                     Inst->getDebugLoc(), LLVMBB);
+
+          R << "FastISel missed call";
 
-          if (EnableFastISelVerbose || EnableFastISelAbort) {
-            dbgs() << "FastISel missed call: ";
-            Inst->print(dbgs());
+          if (R.isEnabled() || EnableFastISelAbort) {
+            std::string InstStrStorage;
+            raw_string_ostream InstStr(InstStrStorage);
+            InstStr << *Inst;
+
+            R << ": " << InstStr.str();
           }
-          if (EnableFastISelAbort > 2)
-            // FastISel selector couldn't handle something and bailed.
-            // For the purpose of debugging, just abort.
-            report_fatal_error("FastISel didn't select the entire block");
+
+          reportFastISelFailure(*MF, *ORE, R, EnableFastISelAbort > 2);
 
           if (!Inst->getType()->isVoidTy() && !Inst->getType()->isTokenTy() &&
               !Inst->use_empty()) {
@@ -1569,35 +1457,35 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
           continue;
         }
 
+        OptimizationRemarkMissed R("sdagisel", "FastISelFailure",
+                                   Inst->getDebugLoc(), LLVMBB);
+
         bool ShouldAbort = EnableFastISelAbort;
-        if (EnableFastISelVerbose || EnableFastISelAbort) {
-          if (isa<TerminatorInst>(Inst)) {
-            // Use a different message for terminator misses.
-            dbgs() << "FastISel missed terminator: ";
-            // Don't abort unless for terminator unless the level is really high
-            ShouldAbort = (EnableFastISelAbort > 2);
-          } else {
-            dbgs() << "FastISel miss: ";
-          }
-          Inst->print(dbgs());
+        if (isa<TerminatorInst>(Inst)) {
+          // Use a different message for terminator misses.
+          R << "FastISel missed terminator";
+          // Don't abort for terminator unless the level is really high
+          ShouldAbort = (EnableFastISelAbort > 2);
+        } else {
+          R << "FastISel missed";
         }
-        if (ShouldAbort)
-          // FastISel selector couldn't handle something and bailed.
-          // For the purpose of debugging, just abort.
-          report_fatal_error("FastISel didn't select the entire block");
+
+        if (R.isEnabled() || EnableFastISelAbort) {
+          std::string InstStrStorage;
+          raw_string_ostream InstStr(InstStrStorage);
+          InstStr << *Inst;
+          R << ": " << InstStr.str();
+        }
+
+        reportFastISelFailure(*MF, *ORE, R, ShouldAbort);
 
         NumFastIselFailures += NumFastIselRemaining;
         break;
       }
 
       FastIS->recomputeInsertPt();
-    } else {
-      // Lower any arguments needed in this block if this is the entry block.
-      if (LLVMBB == &Fn.getEntryBlock()) {
-        ++NumEntryBlocks;
-        LowerArguments(Fn);
-      }
     }
+
     if (getAnalysis<StackProtector>().shouldEmitSDCheck(*LLVMBB)) {
       bool FunctionBasedInstrumentation =
           TLI->getSSPStackGuardCheck(*Fn.getParent());
@@ -1616,10 +1504,17 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
       // block.
       bool HadTailCall;
       SelectBasicBlock(Begin, BI, HadTailCall);
+
+      // But if FastISel was run, we already selected some of the block.
+      // If we emitted a tail-call, we need to delete any previously emitted
+      // instruction that follows it.
+      if (HadTailCall && FuncInfo->InsertPt != FuncInfo->MBB->end())
+        FastIS->removeDeadCode(FuncInfo->InsertPt, FuncInfo->MBB->end());
     }
 
     FinishBasicBlock();
     FuncInfo->PHINodesToUpdate.clear();
+    ElidedArgCopyInstrs.clear();
   }
 
   propagateSwiftErrorVRegs(FuncInfo);
@@ -2237,7 +2132,6 @@ bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root,
     IgnoreChains = false;
   }
 
-
   SmallPtrSet<SDNode*, 16> Visited;
   return !findNonImmUse(Root, N.getNode(), U, Root, Visited, IgnoreChains);
 }
@@ -2614,7 +2508,7 @@ MorphNode(SDNode *Node, unsigned TargetOpc, SDVTList VTList,
 LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
 CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
           SDValue N,
-          const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) {
+          const SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes) {
   // Accept if it is exactly the same as a previously recorded node.
   unsigned RecNo = MatcherTable[MatcherIndex++];
   assert(RecNo < RecordedNodes.size() && "Invalid CheckSame");
@@ -2624,9 +2518,9 @@ CheckSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
 /// CheckChildSame - Implements OP_CheckChildXSame.
 LLVM_ATTRIBUTE_ALWAYS_INLINE static inline bool
 CheckChildSame(const unsigned char *MatcherTable, unsigned &MatcherIndex,
-             SDValue N,
-             const SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes,
-             unsigned ChildNo) {
+              SDValue N,
+              const SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes,
+              unsigned ChildNo) {
   if (ChildNo >= N.getNumOperands())
     return false;  // Match fails if out of range child #.
   return ::CheckSame(MatcherTable, MatcherIndex, N.getOperand(ChildNo),
@@ -2748,7 +2642,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
                                        unsigned Index, SDValue N,
                                        bool &Result,
                                        const SelectionDAGISel &SDISel,
-                 SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes) {
+                  SmallVectorImpl<std::pair<SDValue, SDNode*>> &RecordedNodes) {
   switch (Table[Index++]) {
   default:
     Result = false;
@@ -2816,6 +2710,7 @@ static unsigned IsPredicateKnownToFail(const unsigned char *Table,
 }
 
 namespace {
+
 struct MatchScope {
   /// FailIndex - If this match fails, this is the index to continue with.
   unsigned FailIndex;
@@ -2842,14 +2737,16 @@ struct MatchScope {
 /// for this.
 class MatchStateUpdater : public SelectionDAG::DAGUpdateListener
 {
-      SmallVectorImpl<std::pair<SDValue, SDNode*> > &RecordedNodes;
-      SmallVectorImpl<MatchScope> &MatchScopes;
+  SDNode **NodeToMatch;
+  SmallVectorImpl<std::pair<SDValue, SDNode *>> &RecordedNodes;
+  SmallVectorImpl<MatchScope> &MatchScopes;
+
 public:
-  MatchStateUpdater(SelectionDAG &DAG,
-                    SmallVectorImpl<std::pair<SDValue, SDNode*> > &RN,
-                    SmallVectorImpl<MatchScope> &MS) :
-    SelectionDAG::DAGUpdateListener(DAG),
-    RecordedNodes(RN), MatchScopes(MS) { }
+  MatchStateUpdater(SelectionDAG &DAG, SDNode **NodeToMatch,
+                    SmallVectorImpl<std::pair<SDValue, SDNode *>> &RN,
+                    SmallVectorImpl<MatchScope> &MS)
+      : SelectionDAG::DAGUpdateListener(DAG), NodeToMatch(NodeToMatch),
+        RecordedNodes(RN), MatchScopes(MS) {}
 
   void NodeDeleted(SDNode *N, SDNode *E) override {
     // Some early-returns here to avoid the search if we deleted the node or
@@ -2859,6 +2756,9 @@ public:
     // update listener during matching a complex patterns.
     if (!E || E->isMachineOpcode())
       return;
+    // Check if NodeToMatch was updated.
+    if (N == *NodeToMatch)
+      *NodeToMatch = E;
     // Performing linear search here does not matter because we almost never
     // run this code.  You'd have to have a CSE during complex pattern
     // matching.
@@ -2872,6 +2772,7 @@ public:
           J.setNode(E);
   }
 };
+
 } // end anonymous namespace
 
 void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
@@ -2977,7 +2878,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
     // with an OPC_SwitchOpcode instruction.  Populate the table now, since this
     // is the first time we're selecting an instruction.
     unsigned Idx = 1;
-    while (1) {
+    while (true) {
       // Get the size of this case.
       unsigned CaseSize = MatcherTable[Idx++];
       if (CaseSize & 128)
@@ -2998,7 +2899,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
       MatcherIndex = OpcodeOffset[N.getOpcode()];
   }
 
-  while (1) {
+  while (true) {
     assert(MatcherIndex < TableSize && "Invalid index");
 #ifndef NDEBUG
     unsigned CurrentOpcodeIndex = MatcherIndex;
@@ -3013,7 +2914,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
       // immediately fail, don't even bother pushing a scope for them.
       unsigned FailIndex;
 
-      while (1) {
+      while (true) {
         unsigned NumToSkip = MatcherTable[MatcherIndex++];
         if (NumToSkip & 128)
           NumToSkip = GetVBR(NumToSkip, MatcherTable, MatcherIndex);
@@ -3151,7 +3052,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
       // consistent.
       std::unique_ptr<MatchStateUpdater> MSU;
       if (ComplexPatternFuncMutatesDAG())
-        MSU.reset(new MatchStateUpdater(*CurDAG, RecordedNodes,
+        MSU.reset(new MatchStateUpdater(*CurDAG, &NodeToMatch, RecordedNodes,
                                         MatchScopes));
 
       if (!CheckComplexPattern(NodeToMatch, RecordedNodes[RecNo].second,
@@ -3174,7 +3075,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
       unsigned CurNodeOpcode = N.getOpcode();
       unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart;
       unsigned CaseSize;
-      while (1) {
+      while (true) {
         // Get the size of this case.
         CaseSize = MatcherTable[MatcherIndex++];
         if (CaseSize & 128)
@@ -3205,7 +3106,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
       MVT CurNodeVT = N.getSimpleValueType();
       unsigned SwitchStart = MatcherIndex-1; (void)SwitchStart;
       unsigned CaseSize;
-      while (1) {
+      while (true) {
         // Get the size of this case.
         CaseSize = MatcherTable[MatcherIndex++];
         if (CaseSize & 128)
@@ -3271,7 +3172,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
       // a single use.
       bool HasMultipleUses = false;
       for (unsigned i = 1, e = NodeStack.size()-1; i != e; ++i)
-        if (!NodeStack[i].hasOneUse()) {
+        if (!NodeStack[i].getNode()->hasOneUse()) {
           HasMultipleUses = true;
           break;
         }
@@ -3437,6 +3338,15 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
       RecordedNodes.push_back(std::pair<SDValue,SDNode*>(Res, nullptr));
       continue;
     }
+    case OPC_Coverage: {
+      // This is emitted right before MorphNode/EmitNode.
+      // So it should be safe to assume that this node has been selected
+      unsigned index = MatcherTable[MatcherIndex++];
+      index |= (MatcherTable[MatcherIndex++] << 8);
+      dbgs() << "COVERED: " << getPatternForIndex(index) << "\n";
+      dbgs() << "INCLUDED: " << getIncludePathForIndex(index) << "\n";
+      continue;
+    }
 
     case OPC_EmitNode:     case OPC_MorphNodeTo:
     case OPC_EmitNode0:    case OPC_EmitNode1:    case OPC_EmitNode2:
@@ -3529,7 +3439,6 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
           RecordedNodes.push_back(std::pair<SDValue,SDNode*>(SDValue(Res, i),
                                                              nullptr));
         }
-
       } else {
         assert(NodeToMatch->getOpcode() != ISD::DELETED_NODE &&
                "NodeToMatch was removed partway through selection");
@@ -3666,7 +3575,7 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
     // find a case to check.
     DEBUG(dbgs() << "  Match failed at index " << CurrentOpcodeIndex << "\n");
     ++NumDAGIselRetries;
-    while (1) {
+    while (true) {
       if (MatchScopes.empty()) {
         CannotYetSelect(NodeToMatch);
         return;
diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index b37025882d9b1a6d3fa5d45a808e9b5ad49b997c..034591a00fecd790f6980b1694682557b8584ff1 100644
--- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -55,14 +55,15 @@ bool TargetLowering::isInTailCallPosition(SelectionDAG &DAG, SDNode *Node,
 
   // Conservatively require the attributes of the call to match those of
   // the return. Ignore noalias because it doesn't affect the call sequence.
-  AttributeSet CallerAttrs = F->getAttributes();
-  if (AttrBuilder(CallerAttrs, AttributeSet::ReturnIndex)
-      .removeAttribute(Attribute::NoAlias).hasAttributes())
+  AttributeList CallerAttrs = F->getAttributes();
+  if (AttrBuilder(CallerAttrs, AttributeList::ReturnIndex)
+          .removeAttribute(Attribute::NoAlias)
+          .hasAttributes())
     return false;
 
   // It's not safe to eliminate the sign / zero extension of the return value.
-  if (CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt) ||
-      CallerAttrs.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+  if (CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt) ||
+      CallerAttrs.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))
     return false;
 
   // Check if the only use is a function return node.
@@ -96,18 +97,18 @@ bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI,
 
 /// \brief Set CallLoweringInfo attribute flags based on a call instruction
 /// and called function attributes.
-void TargetLowering::ArgListEntry::setAttributes(ImmutableCallSite *CS,
-                                                 unsigned AttrIdx) {
-  isSExt     = CS->paramHasAttr(AttrIdx, Attribute::SExt);
-  isZExt     = CS->paramHasAttr(AttrIdx, Attribute::ZExt);
-  isInReg    = CS->paramHasAttr(AttrIdx, Attribute::InReg);
-  isSRet     = CS->paramHasAttr(AttrIdx, Attribute::StructRet);
-  isNest     = CS->paramHasAttr(AttrIdx, Attribute::Nest);
-  isByVal    = CS->paramHasAttr(AttrIdx, Attribute::ByVal);
-  isInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca);
-  isReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned);
-  isSwiftSelf = CS->paramHasAttr(AttrIdx, Attribute::SwiftSelf);
-  isSwiftError = CS->paramHasAttr(AttrIdx, Attribute::SwiftError);
+void TargetLoweringBase::ArgListEntry::setAttributes(ImmutableCallSite *CS,
+                                                     unsigned AttrIdx) {
+  IsSExt = CS->paramHasAttr(AttrIdx, Attribute::SExt);
+  IsZExt = CS->paramHasAttr(AttrIdx, Attribute::ZExt);
+  IsInReg = CS->paramHasAttr(AttrIdx, Attribute::InReg);
+  IsSRet = CS->paramHasAttr(AttrIdx, Attribute::StructRet);
+  IsNest = CS->paramHasAttr(AttrIdx, Attribute::Nest);
+  IsByVal = CS->paramHasAttr(AttrIdx, Attribute::ByVal);
+  IsInAlloca = CS->paramHasAttr(AttrIdx, Attribute::InAlloca);
+  IsReturned = CS->paramHasAttr(AttrIdx, Attribute::Returned);
+  IsSwiftSelf = CS->paramHasAttr(AttrIdx, Attribute::SwiftSelf);
+  IsSwiftError = CS->paramHasAttr(AttrIdx, Attribute::SwiftError);
   Alignment  = CS->getParamAlignment(AttrIdx);
 }
 
@@ -125,8 +126,8 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,
   for (SDValue Op : Ops) {
     Entry.Node = Op;
     Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
-    Entry.isSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned);
-    Entry.isZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned);
+    Entry.IsSExt = shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned);
+    Entry.IsZExt = !shouldSignExtendTypeInLibCall(Op.getValueType(), isSigned);
     Args.push_back(Entry);
   }
 
@@ -138,10 +139,13 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,
   Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext());
   TargetLowering::CallLoweringInfo CLI(DAG);
   bool signExtend = shouldSignExtendTypeInLibCall(RetVT, isSigned);
-  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
-    .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
-    .setNoReturn(doesNotReturn).setDiscardResult(!isReturnValueUsed)
-    .setSExtResult(signExtend).setZExtResult(!signExtend);
+  CLI.setDebugLoc(dl)
+      .setChain(DAG.getEntryNode())
+      .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+      .setNoReturn(doesNotReturn)
+      .setDiscardResult(!isReturnValueUsed)
+      .setSExtResult(signExtend)
+      .setZExtResult(!signExtend);
   return LowerCallTo(CLI);
 }
 
@@ -334,34 +338,35 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
 //  Optimization Methods
 //===----------------------------------------------------------------------===//
 
-/// Check to see if the specified operand of the specified instruction is a
-/// constant integer. If so, check to see if there are any bits set in the
-/// constant that are not demanded. If so, shrink the constant and return true.
-bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(SDValue Op,
-                                                        const APInt &Demanded) {
-  SDLoc dl(Op);
+/// If the specified instruction has a constant integer operand and there are
+/// bits set in that constant that are not demanded, then clear those bits and
+/// return true.
+bool TargetLowering::TargetLoweringOpt::ShrinkDemandedConstant(
+    SDValue Op, const APInt &Demanded) {
+  SDLoc DL(Op);
+  unsigned Opcode = Op.getOpcode();
 
   // FIXME: ISD::SELECT, ISD::SELECT_CC
-  switch (Op.getOpcode()) {
-  default: break;
+  switch (Opcode) {
+  default:
+    break;
   case ISD::XOR:
   case ISD::AND:
   case ISD::OR: {
-    ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
-    if (!C) return false;
+    auto *Op1C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+    if (!Op1C)
+      return false;
 
-    if (Op.getOpcode() == ISD::XOR &&
-        (C->getAPIntValue() | (~Demanded)).isAllOnesValue())
+    // If this is a 'not' op, don't touch it because that's a canonical form.
+    const APInt &C = Op1C->getAPIntValue();
+    if (Opcode == ISD::XOR && (C | ~Demanded).isAllOnesValue())
       return false;
 
-    // if we can expand it to have all bits set, do it
-    if (C->getAPIntValue().intersects(~Demanded)) {
+    if (C.intersects(~Demanded)) {
       EVT VT = Op.getValueType();
-      SDValue New = DAG.getNode(Op.getOpcode(), dl, VT, Op.getOperand(0),
-                                DAG.getConstant(Demanded &
-                                                C->getAPIntValue(),
-                                                dl, VT));
-      return CombineTo(Op, New);
+      SDValue NewC = DAG.getConstant(Demanded & C, DL, VT);
+      SDValue NewOp = DAG.getNode(Opcode, DL, VT, Op.getOperand(0), NewC);
+      return CombineTo(Op, NewOp);
     }
 
     break;
@@ -470,6 +475,21 @@ TargetLowering::TargetLoweringOpt::SimplifyDemandedBits(SDNode *User,
   return true;
 }
 
+bool TargetLowering::SimplifyDemandedBits(SDValue Op, APInt &DemandedMask,
+                                          DAGCombinerInfo &DCI) const {
+  // Convenience overload: builds the TLO from DCI's legalization state.
+  SelectionDAG &DAG = DCI.DAG;
+  TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
+                        !DCI.isBeforeLegalizeOps());
+  APInt KnownZero, KnownOne; // Filled by the call below; not used afterwards.
+  // Delegate to the overload taking explicit known-bits outputs and a TLO.
+  bool Simplified = SimplifyDemandedBits(Op, DemandedMask, KnownZero, KnownOne,
+                                         TLO);
+  if (Simplified)
+    DCI.CommitTargetLoweringOpt(TLO); // Commit only if the call simplified Op.
+  return Simplified;
+}
+
 /// Look at Op. At this point, we know that only the DemandedMask bits of the
 /// result of Op are ever used downstream. If we can use this information to
 /// simplify Op, create a new simplified DAG node and return true, returning the
@@ -750,6 +770,33 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     KnownOne &= KnownOne2;
     KnownZero &= KnownZero2;
     break;
+  case ISD::SETCC: {
+    SDValue Op0 = Op.getOperand(0);
+    SDValue Op1 = Op.getOperand(1);
+    ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+    // If (1) we only need the sign-bit, (2) the setcc operands are the same
+    // width as the setcc result, and (3) the result of a setcc conforms to 0 or
+    // -1, we may be able to bypass the setcc.
+    if (NewMask.isSignBit() && Op0.getScalarValueSizeInBits() == BitWidth &&
+        getBooleanContents(Op.getValueType()) ==
+            BooleanContent::ZeroOrNegativeOneBooleanContent) {
+      // If we're testing X < 0, then this compare isn't needed - just use X!
+      // FIXME: We're limiting to integer types here, but this should also work
+      // if we don't care about FP signed-zero. The use of SETLT with FP means
+      // that we don't care about NaNs.
+      if (CC == ISD::SETLT && Op1.getValueType().isInteger() &&
+          (isNullConstant(Op1) || ISD::isBuildVectorAllZeros(Op1.getNode())))
+        return TLO.CombineTo(Op, Op0);
+
+      // TODO: Should we check for other forms of sign-bit comparisons?
+      // Examples: X <= -1, X >= 0
+    }
+    if (getBooleanContents(Op0.getValueType()) ==
+            TargetLowering::ZeroOrOneBooleanContent &&
+        BitWidth > 1)
+      KnownZero.setBitsFrom(1);
+    break;
+  }
   case ISD::SHL:
     if (ConstantSDNode *SA = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
       unsigned ShAmt = SA->getZExtValue();
@@ -834,7 +881,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
       KnownZero <<= SA->getZExtValue();
       KnownOne  <<= SA->getZExtValue();
       // low bits known zero.
-      KnownZero |= APInt::getLowBitsSet(BitWidth, SA->getZExtValue());
+      KnownZero.setLowBits(SA->getZExtValue());
     }
     break;
   case ISD::SRL:
@@ -853,7 +900,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
       // If the shift is exact, then it does demand the low bits (and knows that
       // they are zero).
       if (cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact())
-        InDemandedMask |= APInt::getLowBitsSet(BitWidth, ShAmt);
+        InDemandedMask.setLowBits(ShAmt);
 
       // If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
       // single shift.  We can do this if the top bits (which are shifted out)
@@ -884,8 +931,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
       KnownZero = KnownZero.lshr(ShAmt);
       KnownOne  = KnownOne.lshr(ShAmt);
 
-      APInt HighBits = APInt::getHighBitsSet(BitWidth, ShAmt);
-      KnownZero |= HighBits;  // High bits known zero.
+      KnownZero.setHighBits(ShAmt);  // High bits known zero.
     }
     break;
   case ISD::SRA:
@@ -911,7 +957,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
       // If the shift is exact, then it does demand the low bits (and knows that
       // they are zero).
       if (cast<BinaryWithFlagsSDNode>(Op)->Flags.hasExact())
-        InDemandedMask |= APInt::getLowBitsSet(BitWidth, ShAmt);
+        InDemandedMask.setLowBits(ShAmt);
 
       // If any of the demanded bits are produced by the sign extension, we also
       // demand the input sign bit.
@@ -1075,7 +1121,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
     EVT InVT = Op.getOperand(0).getValueType();
     unsigned InBits = InVT.getScalarSizeInBits();
     APInt InMask    = APInt::getLowBitsSet(BitWidth, InBits);
-    APInt InSignBit = APInt::getBitsSet(BitWidth, InBits - 1, InBits);
+    APInt InSignBit = APInt::getOneBitSet(BitWidth, InBits - 1);
     APInt NewBits   = ~InMask & NewMask;
 
     // If none of the top bits are demanded, convert this into an any_extend.
@@ -1191,7 +1237,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
       return true;
     assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?");
 
-    KnownZero |= ~InMask & NewMask;
+    KnownZero |= ~InMask;
     break;
   }
   case ISD::BITCAST:
@@ -1281,6 +1327,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op,
 void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                    APInt &KnownZero,
                                                    APInt &KnownOne,
+                                                   const APInt &DemandedElts,
                                                    const SelectionDAG &DAG,
                                                    unsigned Depth) const {
   assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
@@ -1295,6 +1342,7 @@ void TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
 /// This method can be implemented by targets that want to expose additional
 /// information about sign bits to the DAG Combiner.
 unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
+                                                         const APInt &,
                                                          const SelectionDAG &,
                                                          unsigned Depth) const {
   assert((Op.getOpcode() >= ISD::BUILTIN_OP_END ||
@@ -2940,7 +2988,7 @@ static SDValue BuildExactSDIV(const TargetLowering &TLI, SDValue Op1, APInt d,
 SDValue TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                       SelectionDAG &DAG,
                                       std::vector<SDNode *> *Created) const {
-  AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+  AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
   if (TLI.isIntDivCheap(N->getValueType(0), Attr))
     return SDValue(N,0); // Lower SDIV as SDIV
@@ -3815,7 +3863,7 @@ SDValue TargetLowering::LowerToTLSEmulatedModel(const GlobalAddressSDNode *GA,
 
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl).setChain(DAG.getEntryNode());
-  CLI.setCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args));
+  CLI.setLibCallee(CallingConv::C, VoidPtrType, EmuTlsGetAddr, std::move(Args));
   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
 
   // TLSADDR will be codegen'ed as call. Inform MFI that function has calls.
diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp
index 209bbe54ea2362092c95409795abdb850ea088eb..ab578df4069d5462c8331f9a527fad8748ed0454 100644
--- a/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/lib/CodeGen/SjLjEHPrepare.cpp
@@ -64,6 +64,7 @@ public:
 
 private:
   bool setupEntryBlockAndCallSites(Function &F);
+  bool undoSwiftErrorSelect(Function &F);
   void substituteLPadValues(LandingPadInst *LPI, Value *ExnVal, Value *SelVal);
   Value *setupFunctionContext(Function &F, ArrayRef<LandingPadInst *> LPads);
   void lowerIncomingArguments(Function &F);
@@ -174,8 +175,8 @@ Value *SjLjEHPrepare::setupFunctionContext(Function &F,
   // because the value needs to be added to the global context list.
   auto &DL = F.getParent()->getDataLayout();
   unsigned Align = DL.getPrefTypeAlignment(FunctionContextTy);
-  FuncCtx = new AllocaInst(FunctionContextTy, nullptr, Align, "fn_context",
-                           &EntryBB->front());
+  FuncCtx = new AllocaInst(FunctionContextTy, DL.getAllocaAddrSpace(),
+                           nullptr, Align, "fn_context", &EntryBB->front());
 
   // Fill in the function context structure.
   for (LandingPadInst *LPI : LPads) {
@@ -458,14 +459,33 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
   return true;
 }
 
+bool SjLjEHPrepare::undoSwiftErrorSelect(Function &F) {
+  // We have inserted dummy copies 'select true, arg, undef' in the entry block
+  // for arguments to simplify this pass. swifterror arguments cannot be used
+  // in this way, so fold the select back into the first swifterror argument
+  // found. Returns true iff a select was erased.
+  for (auto &AI : F.args()) {
+    if (AI.isSwiftError()) {
+      assert(AI.hasOneUse() && "Must have converted the argument to a select");
+      auto *Select = dyn_cast<SelectInst>(AI.use_begin()->getUser());
+      assert(Select && "There must be single select user");
+      auto *OrigSwiftError = cast<Argument>(Select->getTrueValue());
+      Select->replaceAllUsesWith(OrigSwiftError);
+      Select->eraseFromParent();
+      return true; // Stop after the first swifterror argument.
+    }
+  }
+  return false; // No swifterror argument in F.
+}
+
 bool SjLjEHPrepare::runOnFunction(Function &F) {
   Module &M = *F.getParent();
   RegisterFn = M.getOrInsertFunction(
       "_Unwind_SjLj_Register", Type::getVoidTy(M.getContext()),
-      PointerType::getUnqual(FunctionContextTy), nullptr);
+      PointerType::getUnqual(FunctionContextTy));
   UnregisterFn = M.getOrInsertFunction(
       "_Unwind_SjLj_Unregister", Type::getVoidTy(M.getContext()),
-      PointerType::getUnqual(FunctionContextTy), nullptr);
+      PointerType::getUnqual(FunctionContextTy));
   FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress);
   StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave);
   StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore);
@@ -476,5 +496,7 @@ bool SjLjEHPrepare::runOnFunction(Function &F) {
   FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext);
 
   bool Res = setupEntryBlockAndCallSites(F);
+  if (Res)
+    Res |= undoSwiftErrorSelect(F);
   return Res;
 }
diff --git a/lib/CodeGen/SlotIndexes.cpp b/lib/CodeGen/SlotIndexes.cpp
index dba103e9bfb1586af90aa3f0d470da702f06bef5..bc2a1d09056bd5d60427ce6604c3ae02ec0be1d9 100644
--- a/lib/CodeGen/SlotIndexes.cpp
+++ b/lib/CodeGen/SlotIndexes.cpp
@@ -103,6 +103,48 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) {
   return false;
 }
 
+void SlotIndexes::removeMachineInstrFromMaps(MachineInstr &MI) {
+  assert(!MI.isBundledWithPred() &&
+         "Use removeSingleMachineInstrFromMaps() instead");
+  Mi2IndexMap::iterator mi2iItr = mi2iMap.find(&MI);
+  if (mi2iItr == mi2iMap.end())
+    return; // MI has no index assigned; nothing to remove.
+  // Drop the MI -> index mapping and clear the instruction in its list entry.
+  SlotIndex MIIndex = mi2iItr->second;
+  IndexListEntry &MIEntry = *MIIndex.listEntry();
+  assert(MIEntry.getInstr() == &MI && "Instruction indexes broken.");
+  mi2iMap.erase(mi2iItr);
+  // FIXME: Eventually we want to actually delete these indexes.
+  MIEntry.setInstr(nullptr);
+}
+
+void SlotIndexes::removeSingleMachineInstrFromMaps(MachineInstr &MI) {
+  Mi2IndexMap::iterator mi2iItr = mi2iMap.find(&MI);
+  if (mi2iItr == mi2iMap.end())
+    return; // MI has no index assigned; nothing to remove.
+
+  SlotIndex MIIndex = mi2iItr->second;
+  IndexListEntry &MIEntry = *MIIndex.listEntry();
+  assert(MIEntry.getInstr() == &MI && "Instruction indexes broken.");
+  mi2iMap.erase(mi2iItr);
+
+  // If MI heads a bundle, hand its index over to the next bundled instruction
+  // instead of leaving the list entry without an instruction.
+  if (MI.isBundledWithSucc()) {
+    // Only the first instruction of a bundle should have an index assigned.
+    assert(!MI.isBundledWithPred() && "Should have first bundle instruction");
+
+    MachineBasicBlock::instr_iterator Next = std::next(MI.getIterator());
+    MachineInstr &NextMI = *Next;
+    MIEntry.setInstr(&NextMI);
+    mi2iMap.insert(std::make_pair(&NextMI, MIIndex));
+    return;
+  } else {
+    // FIXME: Eventually we want to actually delete these indexes.
+    MIEntry.setInstr(nullptr);
+  }
+}
+
 void SlotIndexes::renumberIndexes() {
   // Renumber updates the index of every element of the index list.
   DEBUG(dbgs() << "\n*** Renumbering SlotIndexes ***\n");
diff --git a/lib/CodeGen/SplitKit.cpp b/lib/CodeGen/SplitKit.cpp
index 1c6a84e539440df857cb32a0fa497bd5ea454e47..3a50aaa69985d3509e27ef7d848521d0b32bb647 100644
--- a/lib/CodeGen/SplitKit.cpp
+++ b/lib/CodeGen/SplitKit.cpp
@@ -23,6 +23,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/VirtRegMap.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
@@ -487,12 +488,126 @@ void SplitEditor::forceRecompute(unsigned RegIdx, const VNInfo *ParentVNI) {
   VFP = ValueForcePair(nullptr, true);
 }
 
+SlotIndex SplitEditor::buildSingleSubRegCopy(unsigned FromReg, unsigned ToReg,
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+    unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def) {
+  const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+  bool FirstCopy = !Def.isValid(); // No valid Def passed in => first COPY.
+  MachineInstr *CopyMI = BuildMI(MBB, InsertBefore, DebugLoc(), Desc)
+      .addReg(ToReg, RegState::Define | getUndefRegState(FirstCopy)
+              | getInternalReadRegState(!FirstCopy), SubIdx)
+      .addReg(FromReg, 0, SubIdx);
+  // Only the first COPY gets a slot index; later ones bundle with the prior MI.
+  BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator();
+  if (FirstCopy) {
+    SlotIndexes &Indexes = *LIS.getSlotIndexes();
+    Def = Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot();
+  } else {
+    CopyMI->bundleWithPred(); // Def keeps the value passed in by the caller.
+  }
+  LaneBitmask LaneMask = TRI.getSubRegIndexLaneMask(SubIdx);
+  DestLI.refineSubRanges(Allocator, LaneMask,
+                         [Def, &Allocator](LiveInterval::SubRange& SR) {
+    SR.createDeadDef(Def, Allocator); // Callback run by refineSubRanges.
+  });
+  return Def; // Slot of the first COPY of the sequence.
+}
+
+SlotIndex SplitEditor::buildCopy(unsigned FromReg, unsigned ToReg,
+    LaneBitmask LaneMask, MachineBasicBlock &MBB,
+    MachineBasicBlock::iterator InsertBefore, bool Late, unsigned RegIdx) {
+  const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY);
+  if (LaneMask.all() || LaneMask == MRI.getMaxLaneMaskForVReg(FromReg)) {
+    // The full vreg is copied: a single full-register COPY suffices.
+    MachineInstr *CopyMI =
+        BuildMI(MBB, InsertBefore, DebugLoc(), Desc, ToReg).addReg(FromReg);
+    SlotIndexes &Indexes = *LIS.getSlotIndexes();
+    return Indexes.insertMachineInstrInMaps(*CopyMI, Late).getRegSlot();
+  }
+
+  // Only a subset of lanes needs to be copied. The following is a simple
+  // heuristic to construct a sequence of COPYs. We could add a target
+  // specific callback if this turns out to be suboptimal.
+  LiveInterval &DestLI = LIS.getInterval(Edit->get(RegIdx));
+
+  // First pass: Try to find a perfectly matching subregister index. If none
+  // exists find the one covering the most lanemask bits.
+  SmallVector<unsigned, 8> PossibleIndexes;
+  unsigned BestIdx = 0;
+  unsigned BestCover = 0;
+  const TargetRegisterClass *RC = MRI.getRegClass(FromReg);
+  assert(RC == MRI.getRegClass(ToReg) && "Should have same reg class");
+  for (unsigned Idx = 1, E = TRI.getNumSubRegIndices(); Idx < E; ++Idx) {
+    // Is this index even compatible with the given class?
+    if (TRI.getSubClassWithSubReg(RC, Idx) != RC)
+      continue;
+    LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(Idx);
+    // Early exit if we found a perfect match.
+    if (SubRegMask == LaneMask) {
+      BestIdx = Idx;
+      break;
+    }
+
+    // The index must not cover any lanes outside \p LaneMask.
+    if ((SubRegMask & ~LaneMask).any())
+      continue;
+
+    unsigned PopCount = countPopulation(SubRegMask.getAsInteger());
+    PossibleIndexes.push_back(Idx);
+    if (PopCount > BestCover) {
+      BestCover = PopCount;
+      BestIdx = Idx;
+    }
+  }
+
+  // Abort if we cannot possibly implement the COPY with the given indexes.
+  if (BestIdx == 0)
+    report_fatal_error("Impossible to implement partial COPY");
+
+  SlotIndex Def = buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore,
+                                        BestIdx, DestLI, Late, SlotIndex());
+
+  // Greedy heuristic: repeatedly pick the best covering subreg index. Start
+  // from the lanes not covered by the copy just emitted for BestIdx (an index).
+  LaneBitmask LanesLeft =
+      LaneMask & ~(TRI.getSubRegIndexLaneMask(BestIdx));
+  while (LanesLeft.any()) {
+    unsigned BestIdx = 0;
+    int BestCover = INT_MIN;
+    for (unsigned Idx : PossibleIndexes) {
+      LaneBitmask SubRegMask = TRI.getSubRegIndexLaneMask(Idx);
+      // Early exit if we found a perfect match.
+      if (SubRegMask == LanesLeft) {
+        BestIdx = Idx;
+        break;
+      }
+
+      // Try to cover as much of the remaining lanes as possible but
+      // as few of the already covered lanes as possible.
+      int Cover = countPopulation((SubRegMask & LanesLeft).getAsInteger())
+                - countPopulation((SubRegMask & ~LanesLeft).getAsInteger());
+      if (Cover > BestCover) {
+        BestCover = Cover;
+        BestIdx = Idx;
+      }
+    }
+
+    if (BestIdx == 0)
+      report_fatal_error("Impossible to implement partial COPY");
+
+    buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, BestIdx,
+                          DestLI, Late, Def);
+    LanesLeft &= ~TRI.getSubRegIndexLaneMask(BestIdx);
+  }
+
+  return Def;
+}
+
 VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
                                    VNInfo *ParentVNI,
                                    SlotIndex UseIdx,
                                    MachineBasicBlock &MBB,
                                    MachineBasicBlock::iterator I) {
-  MachineInstr *CopyMI = nullptr;
   SlotIndex Def;
   LiveInterval *LI = &LIS.getInterval(Edit->get(RegIdx));
 
@@ -505,24 +620,29 @@ VNInfo *SplitEditor::defFromParent(unsigned RegIdx,
   LiveInterval &OrigLI = LIS.getInterval(Original);
   VNInfo *OrigVNI = OrigLI.getVNInfoAt(UseIdx);
 
+  unsigned Reg = LI->reg;
   bool DidRemat = false;
   if (OrigVNI) {
     LiveRangeEdit::Remat RM(ParentVNI);
     RM.OrigMI = LIS.getInstructionFromIndex(OrigVNI->def);
     if (Edit->canRematerializeAt(RM, OrigVNI, UseIdx, true)) {
-      Def = Edit->rematerializeAt(MBB, I, LI->reg, RM, TRI, Late);
+      Def = Edit->rematerializeAt(MBB, I, Reg, RM, TRI, Late);
       ++NumRemats;
       DidRemat = true;
     }
   }
   if (!DidRemat) {
-    // Can't remat, just insert a copy from parent.
-    CopyMI = BuildMI(MBB, I, DebugLoc(), TII.get(TargetOpcode::COPY), LI->reg)
-               .addReg(Edit->getReg());
-    Def = LIS.getSlotIndexes()
-              ->insertMachineInstrInMaps(*CopyMI, Late)
-              .getRegSlot();
+    LaneBitmask LaneMask;
+    if (LI->hasSubRanges()) {
+      LaneMask = LaneBitmask::getNone();
+      for (LiveInterval::SubRange &S : LI->subranges())
+        LaneMask |= S.LaneMask;
+    } else {
+      LaneMask = LaneBitmask::getAll();
+    }
+
     ++NumCopies;
+    Def = buildCopy(Edit->getReg(), Reg, LaneMask, MBB, I, Late, RegIdx);
   }
 
   // Define the value in Reg.
diff --git a/lib/CodeGen/SplitKit.h b/lib/CodeGen/SplitKit.h
index a75738aaf44678f38a2300a37ae77b92e8aead3f..9d409e924a3d85392ee26ca25079f21043d330e1 100644
--- a/lib/CodeGen/SplitKit.h
+++ b/lib/CodeGen/SplitKit.h
@@ -405,6 +405,17 @@ private:
   /// deleteRematVictims - Delete defs that are dead after rematerializing.
   void deleteRematVictims();
 
+  /// Add a copy instruction copying \p FromReg to \p ToReg before
+  /// \p InsertBefore. This can be invoked with a \p LaneMask which may make it
+  /// necessary to construct a sequence of copies to cover it exactly.
+  SlotIndex buildCopy(unsigned FromReg, unsigned ToReg, LaneBitmask LaneMask,
+      MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore,
+      bool Late, unsigned RegIdx);
+
+  SlotIndex buildSingleSubRegCopy(unsigned FromReg, unsigned ToReg,
+      MachineBasicBlock &MB, MachineBasicBlock::iterator InsertBefore,
+      unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex PrevCopy);
+
 public:
   /// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
   /// Newly created intervals will be appended to newIntervals.
diff --git a/lib/CodeGen/StackColoring.cpp b/lib/CodeGen/StackColoring.cpp
index cf722538513c240e7b76b9cbf42f4ce20bdc2969..f51d959a089aa382b5a939fc1cbd63ec23da045b 100644
--- a/lib/CodeGen/StackColoring.cpp
+++ b/lib/CodeGen/StackColoring.cpp
@@ -23,7 +23,6 @@
 
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
@@ -568,9 +567,8 @@ unsigned StackColoring::collectMarkers(unsigned NumSlot)
 
   // Step 2: compute begin/end sets for each block
 
-  // NOTE: We use a reverse-post-order iteration to ensure that we obtain a
-  // deterministic numbering, and because we'll need a post-order iteration
-  // later for solving the liveness dataflow problem.
+  // NOTE: We use a depth-first iteration to ensure that we obtain a
+  // deterministic numbering.
   for (MachineBasicBlock *MBB : depth_first(MF)) {
 
     // Assign a serial number to this basic block.
diff --git a/lib/CodeGen/StackMaps.cpp b/lib/CodeGen/StackMaps.cpp
index 9b7dd400fc92e540929453ab8cfc3527d644ecd1..1a8ec5bff3229f58cdb15706390a90ef298fcb13 100644
--- a/lib/CodeGen/StackMaps.cpp
+++ b/lib/CodeGen/StackMaps.cpp
@@ -1,4 +1,4 @@
-//===---------------------------- StackMaps.cpp ---------------------------===//
+//===- StackMaps.cpp ------------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,23 +7,34 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/StackMaps.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetMachine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
 #include <iterator>
+#include <utility>
 
 using namespace llvm;
 
@@ -276,7 +287,8 @@ StackMaps::parseRegisterLiveOutMask(const uint32_t *Mask) const {
   }
 
   LiveOuts.erase(
-      remove_if(LiveOuts, [](const LiveOutReg &LO) { return LO.Reg == 0; }),
+      llvm::remove_if(LiveOuts,
+                      [](const LiveOutReg &LO) { return LO.Reg == 0; }),
       LiveOuts.end());
 
   return LiveOuts;
@@ -286,7 +298,6 @@ void StackMaps::recordStackMapOpers(const MachineInstr &MI, uint64_t ID,
                                     MachineInstr::const_mop_iterator MOI,
                                     MachineInstr::const_mop_iterator MOE,
                                     bool recordResult) {
-
   MCContext &OutContext = AP.OutStreamer->getContext();
   MCSymbol *MILabel = OutContext.createTempSymbol();
   AP.OutStreamer->EmitLabel(MILabel);
@@ -378,6 +389,7 @@ void StackMaps::recordPatchPoint(const MachineInstr &MI) {
   }
 #endif
 }
+
 void StackMaps::recordStatepoint(const MachineInstr &MI) {
   assert(MI.getOpcode() == TargetOpcode::STATEPOINT && "expected statepoint");
 
diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp
index c2c010a29d44b40f88957ddfa5cc57903999792d..a8aafe78748dc7d10a66b748767873e67b3bc361 100644
--- a/lib/CodeGen/StackProtector.cpp
+++ b/lib/CodeGen/StackProtector.cpp
@@ -1,4 +1,4 @@
-//===-- StackProtector.cpp - Stack Protector Insertion --------------------===//
+//===- StackProtector.cpp - Stack Protector Insertion ---------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -14,30 +14,38 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/StackProtector.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
 #include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
 #include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/StackProtector.h"
 #include "llvm/IR/Attributes.h"
+#include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
-#include <cstdlib>
+#include <utility>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "stack-protector"
@@ -51,7 +59,7 @@ static cl::opt<bool> EnableSelectionDAGSP("enable-selectiondag-sp",
 
 char StackProtector::ID = 0;
 INITIALIZE_TM_PASS(StackProtector, "stack-protector", "Insert stack protectors",
-                false, true)
+                   false, true)
 
 FunctionPass *llvm::createStackProtectorPass(const TargetMachine *TM) {
   return new StackProtector(TM);
@@ -222,7 +230,16 @@ bool StackProtector::RequiresStackProtector() {
   if (F->hasFnAttribute(Attribute::SafeStack))
     return false;
 
+  // We are constructing the OptimizationRemarkEmitter on the fly rather than
+  // using the analysis pass to avoid building DominatorTree and LoopInfo which
+  // are not available this late in the IR pipeline.
+  OptimizationRemarkEmitter ORE(F);
+
   if (F->hasFnAttribute(Attribute::StackProtectReq)) {
+    ORE.emit(OptimizationRemark(DEBUG_TYPE, "StackProtectorRequested", F)
+             << "Stack protection applied to function "
+             << ore::NV("Function", F)
+             << " due to a function attribute or command-line switch");
     NeedsProtector = true;
     Strong = true; // Use the same heuristic as strong to determine SSPLayout
   } else if (F->hasFnAttribute(Attribute::StackProtectStrong))
@@ -236,20 +253,29 @@ bool StackProtector::RequiresStackProtector() {
     for (const Instruction &I : BB) {
       if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
         if (AI->isArrayAllocation()) {
+          OptimizationRemark Remark(DEBUG_TYPE, "StackProtectorAllocaOrArray",
+                                    &I);
+          Remark
+              << "Stack protection applied to function "
+              << ore::NV("Function", F)
+              << " due to a call to alloca or use of a variable length array";
           if (const auto *CI = dyn_cast<ConstantInt>(AI->getArraySize())) {
             if (CI->getLimitedValue(SSPBufferSize) >= SSPBufferSize) {
               // A call to alloca with size >= SSPBufferSize requires
               // stack protectors.
               Layout.insert(std::make_pair(AI, SSPLK_LargeArray));
+              ORE.emit(Remark);
               NeedsProtector = true;
             } else if (Strong) {
               // Require protectors for all alloca calls in strong mode.
               Layout.insert(std::make_pair(AI, SSPLK_SmallArray));
+              ORE.emit(Remark);
               NeedsProtector = true;
             }
           } else {
             // A call to alloca with a variable size requires protectors.
             Layout.insert(std::make_pair(AI, SSPLK_LargeArray));
+            ORE.emit(Remark);
             NeedsProtector = true;
           }
           continue;
@@ -259,6 +285,11 @@ bool StackProtector::RequiresStackProtector() {
         if (ContainsProtectableArray(AI->getAllocatedType(), IsLarge, Strong)) {
           Layout.insert(std::make_pair(AI, IsLarge ? SSPLK_LargeArray
                                                    : SSPLK_SmallArray));
+          ORE.emit(OptimizationRemark(DEBUG_TYPE, "StackProtectorBuffer", &I)
+                   << "Stack protection applied to function "
+                   << ore::NV("Function", F)
+                   << " due to a stack allocated buffer or struct containing a "
+                      "buffer");
           NeedsProtector = true;
           continue;
         }
@@ -266,6 +297,11 @@ bool StackProtector::RequiresStackProtector() {
         if (Strong && HasAddressTaken(AI)) {
           ++NumAddrTaken;
           Layout.insert(std::make_pair(AI, SSPLK_AddrOf));
+          ORE.emit(
+              OptimizationRemark(DEBUG_TYPE, "StackProtectorAddressTaken", &I)
+              << "Stack protection applied to function "
+              << ore::NV("Function", F)
+              << " due to the address of a local variable being taken");
           NeedsProtector = true;
         }
       }
@@ -448,13 +484,13 @@ BasicBlock *StackProtector::CreateFailBB() {
     Constant *StackChkFail =
         M->getOrInsertFunction("__stack_smash_handler",
                                Type::getVoidTy(Context),
-                               Type::getInt8PtrTy(Context), nullptr);
+                               Type::getInt8PtrTy(Context));
 
     B.CreateCall(StackChkFail, B.CreateGlobalStringPtr(F->getName(), "SSH"));
   } else {
     Constant *StackChkFail =
-        M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context),
-                               nullptr);
+        M->getOrInsertFunction("__stack_chk_fail", Type::getVoidTy(Context));
+
     B.CreateCall(StackChkFail, {});
   }
   B.CreateUnreachable();
diff --git a/lib/CodeGen/TailDuplicator.cpp b/lib/CodeGen/TailDuplicator.cpp
index 7709236bbaa83fd92650e678d0bfc9154a688dc3..d2414200e9d579680d5560ba63d4c1b39ac66632 100644
--- a/lib/CodeGen/TailDuplicator.cpp
+++ b/lib/CodeGen/TailDuplicator.cpp
@@ -725,6 +725,7 @@ bool TailDuplicator::duplicateSimpleBB(
     if (PredTBB == NextBB && PredFBB == nullptr)
       PredTBB = nullptr;
 
+    auto DL = PredBB->findBranchDebugLoc();
     TII->removeBranch(*PredBB);
 
     if (!PredBB->isSuccessor(NewTarget))
@@ -735,7 +736,7 @@ bool TailDuplicator::duplicateSimpleBB(
     }
 
     if (PredTBB)
-      TII->insertBranch(*PredBB, PredTBB, PredFBB, PredCond, DebugLoc());
+      TII->insertBranch(*PredBB, PredTBB, PredFBB, PredCond, DL);
 
     TDBBs.push_back(PredBB);
   }
diff --git a/lib/CodeGen/TargetFrameLoweringImpl.cpp b/lib/CodeGen/TargetFrameLoweringImpl.cpp
index f082add8c7dd58c0a10a76282e26f5f5535d6eed..e5def6752e0715aa0768fde9859719c14f701a6d 100644
--- a/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -73,7 +73,7 @@ void TargetFrameLowering::determineCalleeSaves(MachineFunction &MF,
     return;
 
   // Get the callee saved register list...
-  const MCPhysReg *CSRegs = TRI.getCalleeSavedRegs(&MF);
+  const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
 
   // Early exit if there are no callee saved registers.
   if (!CSRegs || CSRegs[0] == 0)
diff --git a/lib/CodeGen/TargetLoweringBase.cpp b/lib/CodeGen/TargetLoweringBase.cpp
index f18cdca486115a540fa6e273f31629bcffa8c028..27630a3055cb386235086a005783580fe60f449a 100644
--- a/lib/CodeGen/TargetLoweringBase.cpp
+++ b/lib/CodeGen/TargetLoweringBase.cpp
@@ -838,7 +838,6 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
   HasExtractBitsInsn = false;
   JumpIsExpensive = JumpIsExpensiveOverride;
   PredictableSelectIsExpensive = false;
-  MaskAndBranchFoldingIsLegal = false;
   EnableExtLdPromotion = false;
   HasFloatingPointExceptions = true;
   StackPointerRegisterToSaveRestore = 0;
@@ -851,7 +850,7 @@ TargetLoweringBase::TargetLoweringBase(const TargetMachine &tm) : TM(tm) {
   MinFunctionAlignment = 0;
   PrefFunctionAlignment = 0;
   PrefLoopAlignment = 0;
-  GatherAllAliasesMaxDepth = 6;
+  GatherAllAliasesMaxDepth = 18;
   MinStackArgumentAlignment = 1;
   // TODO: the default will be switched to 0 in the next commit, along
   // with the Target-specific changes necessary.
@@ -901,6 +900,7 @@ void TargetLoweringBase::initActions() {
     setOperationAction(ISD::SMAX, VT, Expand);
     setOperationAction(ISD::UMIN, VT, Expand);
     setOperationAction(ISD::UMAX, VT, Expand);
+    setOperationAction(ISD::ABS, VT, Expand);
 
     // Overflow operations default to expand
     setOperationAction(ISD::SADDO, VT, Expand);
@@ -1589,7 +1589,7 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT
 /// type of the given function.  This does not require a DAG or a return value,
 /// and is suitable for use before any DAGs for the function are constructed.
 /// TODO: Move this out of TargetLowering.cpp.
-void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr,
+void llvm::GetReturnInfo(Type *ReturnType, AttributeList attr,
                          SmallVectorImpl<ISD::OutputArg> &Outs,
                          const TargetLowering &TLI, const DataLayout &DL) {
   SmallVector<EVT, 4> ValueVTs;
@@ -1601,9 +1601,9 @@ void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr,
     EVT VT = ValueVTs[j];
     ISD::NodeType ExtendKind = ISD::ANY_EXTEND;
 
-    if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+    if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))
       ExtendKind = ISD::SIGN_EXTEND;
-    else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt))
+    else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt))
       ExtendKind = ISD::ZERO_EXTEND;
 
     // FIXME: C calling convention requires the return type to be promoted to
@@ -1621,13 +1621,13 @@ void llvm::GetReturnInfo(Type *ReturnType, AttributeSet attr,
 
     // 'inreg' on function refers to return value
     ISD::ArgFlagsTy Flags = ISD::ArgFlagsTy();
-    if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::InReg))
+    if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::InReg))
       Flags.setInReg();
 
     // Propagate extension type if any
-    if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::SExt))
+    if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::SExt))
       Flags.setSExt();
-    else if (attr.hasAttribute(AttributeSet::ReturnIndex, Attribute::ZExt))
+    else if (attr.hasAttribute(AttributeList::ReturnIndex, Attribute::ZExt))
       Flags.setZExt();
 
     for (unsigned i = 0; i < NumParts; ++i)
@@ -1818,7 +1818,7 @@ Value *TargetLoweringBase::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
   Module *M = IRB.GetInsertBlock()->getParent()->getParent();
   Type *StackPtrTy = Type::getInt8PtrTy(M->getContext());
   Value *Fn = M->getOrInsertFunction("__safestack_pointer_address",
-                                     StackPtrTy->getPointerTo(0), nullptr);
+                                     StackPtrTy->getPointerTo(0));
   return IRB.CreateCall(Fn);
 }
 
diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index eb2a28f574a5449b45e9fa7821a5777a271d7088..78afeda67dbf2dcd4e7e39b7b19254c26fa48373 100644
--- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/CodeGen/TargetLoweringObjectFileImpl.cpp - Object File Info --===//
+//===- llvm/CodeGen/TargetLoweringObjectFileImpl.cpp - Object File Info ---===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,36 +12,52 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
 #include "llvm/CodeGen/MachineModuleInfoImpls.h"
+#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
+#include "llvm/IR/Comdat.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalAlias.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Mangler.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSectionWasm.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCSymbolELF.h"
 #include "llvm/MC/MCValue.h"
+#include "llvm/MC/SectionKind.h"
 #include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
 #include "llvm/Support/COFF.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MachO.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <string>
+
 using namespace llvm;
 using namespace dwarf;
 
@@ -53,10 +69,10 @@ MCSymbol *TargetLoweringObjectFileELF::getCFIPersonalitySymbol(
     const GlobalValue *GV, const TargetMachine &TM,
     MachineModuleInfo *MMI) const {
   unsigned Encoding = getPersonalityEncoding();
-  if ((Encoding & 0x80) == dwarf::DW_EH_PE_indirect)
+  if ((Encoding & 0x80) == DW_EH_PE_indirect)
     return getContext().getOrCreateSymbol(StringRef("DW.ref.") +
                                           TM.getSymbol(GV)->getName());
-  if ((Encoding & 0x70) == dwarf::DW_EH_PE_absptr)
+  if ((Encoding & 0x70) == DW_EH_PE_absptr)
     return TM.getSymbol(GV);
   report_fatal_error("We do not support this DWARF encoding yet!");
 }
@@ -86,8 +102,7 @@ void TargetLoweringObjectFileELF::emitPersonalityValue(
 const MCExpr *TargetLoweringObjectFileELF::getTTypeGlobalReference(
     const GlobalValue *GV, unsigned Encoding, const TargetMachine &TM,
     MachineModuleInfo *MMI, MCStreamer &Streamer) const {
-
-  if (Encoding & dwarf::DW_EH_PE_indirect) {
+  if (Encoding & DW_EH_PE_indirect) {
     MachineModuleInfoELF &ELFMMI = MMI->getObjFileInfo<MachineModuleInfoELF>();
 
     MCSymbol *SSym = getSymbolWithGlobalValueBase(GV, ".DW.stub", TM);
@@ -102,7 +117,7 @@ const MCExpr *TargetLoweringObjectFileELF::getTTypeGlobalReference(
 
     return TargetLoweringObjectFile::
       getTTypeReference(MCSymbolRefExpr::create(SSym, getContext()),
-                        Encoding & ~dwarf::DW_EH_PE_indirect, Streamer);
+                        Encoding & ~DW_EH_PE_indirect, Streamer);
   }
 
   return TargetLoweringObjectFile::getTTypeGlobalReference(GV, Encoding, TM,
@@ -149,7 +164,6 @@ getELFKindForNamedSection(StringRef Name, SectionKind K) {
   return K;
 }
 
-
 static unsigned getELFSectionType(StringRef Name, SectionKind K) {
   // Use SHT_NOTE for section whose name starts with ".note" to allow
   // emitting ELF notes from C variable declaration.
@@ -211,6 +225,20 @@ static const Comdat *getELFComdat(const GlobalValue *GV) {
   return C;
 }
 
+// Return the ELF symbol \p GO's !associated metadata points at, if any.
+static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO,
+                                              const TargetMachine &TM) {
+  MDNode *MD = GO->getMetadata(LLVMContext::MD_associated);
+  // The operand may be null (e.g. the associated global was deleted).
+  if (!MD || !MD->getOperand(0).get())
+    return nullptr;
+  auto *VM = dyn_cast<ValueAsMetadata>(MD->getOperand(0));
+  if (!VM)
+    report_fatal_error("MD_associated operand is not ValueAsMetadata");
+  GlobalObject *OtherGO = dyn_cast<GlobalObject>(VM->getValue());
+  return OtherGO ? dyn_cast<MCSymbolELF>(TM.getSymbol(OtherGO)) : nullptr;
+}
+
 MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
     const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
   StringRef SectionName = GO->getSection();
@@ -224,9 +252,23 @@ MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
     Group = C->getName();
     Flags |= ELF::SHF_GROUP;
   }
-  return getContext().getELFSection(SectionName,
-                                    getELFSectionType(SectionName, Kind), Flags,
-                                    /*EntrySize=*/0, Group);
+
+  // A section can have at most one associated section. Put each global with
+  // MD_associated in a unique section.
+  unsigned UniqueID = MCContext::GenericSectionID;
+  const MCSymbolELF *AssociatedSymbol = getAssociatedSymbol(GO, TM);
+  if (AssociatedSymbol) {
+    UniqueID = NextUniqueID++;
+    Flags |= ELF::SHF_LINK_ORDER;
+  }
+
+  MCSectionELF *Section = getContext().getELFSection(
+      SectionName, getELFSectionType(SectionName, Kind), Flags,
+      /*EntrySize=*/0, Group, UniqueID, AssociatedSymbol);
+  // Make sure that we did not get some other section with incompatible sh_link.
+  // This should not be possible due to UniqueID code above.
+  assert(Section->getAssociatedSymbol() == AssociatedSymbol);
+  return Section;
 }
 
 /// Return the section prefix name used by options FunctionsSections and
@@ -248,11 +290,10 @@ static StringRef getSectionPrefixForGlobal(SectionKind Kind) {
   return ".data.rel.ro";
 }
 
-static MCSectionELF *
-selectELFSectionForGlobal(MCContext &Ctx, const GlobalObject *GO,
-                          SectionKind Kind, Mangler &Mang,
-                          const TargetMachine &TM, bool EmitUniqueSection,
-                          unsigned Flags, unsigned *NextUniqueID) {
+static MCSectionELF *selectELFSectionForGlobal(
+    MCContext &Ctx, const GlobalObject *GO, SectionKind Kind, Mangler &Mang,
+    const TargetMachine &TM, bool EmitUniqueSection, unsigned Flags,
+    unsigned *NextUniqueID, const MCSymbolELF *AssociatedSymbol) {
   unsigned EntrySize = 0;
   if (Kind.isMergeableCString()) {
     if (Kind.isMergeable2ByteCString()) {
@@ -319,7 +360,7 @@ selectELFSectionForGlobal(MCContext &Ctx, const GlobalObject *GO,
   if (Kind.isExecuteOnly())
     UniqueID = 0;
   return Ctx.getELFSection(Name, getELFSectionType(Name, Kind), Flags,
-                           EntrySize, Group, UniqueID);
+                           EntrySize, Group, UniqueID, AssociatedSymbol);
 }
 
 MCSection *TargetLoweringObjectFileELF::SelectSectionForGlobal(
@@ -337,8 +378,17 @@ MCSection *TargetLoweringObjectFileELF::SelectSectionForGlobal(
   }
   EmitUniqueSection |= GO->hasComdat();
 
-  return selectELFSectionForGlobal(getContext(), GO, Kind, getMangler(), TM,
-                                   EmitUniqueSection, Flags, &NextUniqueID);
+  const MCSymbolELF *AssociatedSymbol = getAssociatedSymbol(GO, TM);
+  if (AssociatedSymbol) {
+    EmitUniqueSection = true;
+    Flags |= ELF::SHF_LINK_ORDER;
+  }
+
+  MCSectionELF *Section = selectELFSectionForGlobal(
+      getContext(), GO, Kind, getMangler(), TM, EmitUniqueSection, Flags,
+      &NextUniqueID, AssociatedSymbol);
+  assert(Section->getAssociatedSymbol() == AssociatedSymbol);
+  return Section;
 }
 
 MCSection *TargetLoweringObjectFileELF::getSectionForJumpTable(
@@ -351,8 +401,9 @@ MCSection *TargetLoweringObjectFileELF::getSectionForJumpTable(
     return ReadOnlySection;
 
   return selectELFSectionForGlobal(getContext(), &F, SectionKind::getReadOnly(),
-                                   getMangler(), TM, EmitUniqueSection, ELF::SHF_ALLOC,
-                                   &NextUniqueID);
+                                   getMangler(), TM, EmitUniqueSection,
+                                   ELF::SHF_ALLOC, &NextUniqueID,
+                                   /* AssociatedSymbol */ nullptr);
 }
 
 bool TargetLoweringObjectFileELF::shouldPutJumpTableInFunctionSection(
@@ -723,7 +774,7 @@ const MCExpr *TargetLoweringObjectFileMachO::getTTypeGlobalReference(
 
     return TargetLoweringObjectFile::
       getTTypeReference(MCSymbolRefExpr::create(SSym, getContext()),
-                        Encoding & ~dwarf::DW_EH_PE_indirect, Streamer);
+                        Encoding & ~DW_EH_PE_indirect, Streamer);
   }
 
   return TargetLoweringObjectFile::getTTypeGlobalReference(GV, Encoding, TM,
@@ -1122,33 +1173,110 @@ MCSection *TargetLoweringObjectFileCOFF::getStaticDtorSection(
 
 void TargetLoweringObjectFileCOFF::emitLinkerFlagsForGlobal(
     raw_ostream &OS, const GlobalValue *GV) const {
-  if (!GV->hasDLLExportStorageClass() || GV->isDeclaration())
-    return;
+  emitLinkerFlagsForGlobalCOFF(OS, GV, getTargetTriple(), getMangler());
+}
 
-  const Triple &TT = getTargetTriple();
+//===----------------------------------------------------------------------===//
+//                                  Wasm
+//===----------------------------------------------------------------------===//
 
-  if (TT.isKnownWindowsMSVCEnvironment())
-    OS << " /EXPORT:";
-  else
-    OS << " -export:";
-
-  if (TT.isWindowsGNUEnvironment() || TT.isWindowsCygwinEnvironment()) {
-    std::string Flag;
-    raw_string_ostream FlagOS(Flag);
-    getMangler().getNameWithPrefix(FlagOS, GV, false);
-    FlagOS.flush();
-    if (Flag[0] == GV->getParent()->getDataLayout().getGlobalPrefix())
-      OS << Flag.substr(1);
-    else
-      OS << Flag;
-  } else {
-    getMangler().getNameWithPrefix(OS, GV, false);
+/// Return the COMDAT \p GV belongs to, or null when it has none. Only the
+/// 'any' selection kind can be lowered for Wasm; anything else is fatal.
+static const Comdat *getWasmComdat(const GlobalValue *GV) {
+  const Comdat *Group = GV->getComdat();
+
+  // A missing COMDAT is fine; an unsupported selection kind is not.
+  if (Group && Group->getSelectionKind() != Comdat::Any)
+    report_fatal_error("Wasm COMDATs only support SelectionKind::Any, '" +
+                       Group->getName() + "' cannot be lowered.");
+  return Group;
+}
+
+MCSection *TargetLoweringObjectFileWasm::getExplicitSectionGlobal(
+    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+  llvm_unreachable("getExplicitSectionGlobal not yet implemented");
+  return nullptr; // Never executed: llvm_unreachable() does not return.
+}
+
+/// Pick the Wasm section for \p GO; COMDAT members are rejected for now.
+static MCSectionWasm *selectWasmSectionForGlobal(
+    MCContext &Ctx, const GlobalObject *GO, SectionKind Kind, Mangler &Mang,
+    const TargetMachine &TM, bool EmitUniqueSection, unsigned Flags,
+    unsigned *NextUniqueID) {
+  StringRef Group = "";
+  if (getWasmComdat(GO)) // Depends on the input IR, so fail in every build.
+    report_fatal_error("comdat not yet supported for wasm");
+
+  bool UniqueSectionNames = TM.getUniqueSectionNames();
+  SmallString<128> Name = getSectionPrefixForGlobal(Kind);
+
+  if (const auto *F = dyn_cast<Function>(GO)) {
+    const auto &OptionalPrefix = F->getSectionPrefix();
+    if (OptionalPrefix)
+      Name += *OptionalPrefix;
   }
 
-  if (!GV->getValueType()->isFunctionTy()) {
-    if (TT.isKnownWindowsMSVCEnvironment())
-      OS << ",DATA";
-    else
-      OS << ",data";
+  if (EmitUniqueSection && UniqueSectionNames) {
+    Name.push_back('.');
+    TM.getNameWithPrefix(Name, GO, Mang, true);
+  }
+  // Without unique names, tell same-named sections apart by a fresh ID.
+  unsigned UniqueID = MCContext::GenericSectionID;
+  if (EmitUniqueSection && !UniqueSectionNames) {
+    UniqueID = *NextUniqueID;
+    (*NextUniqueID)++;
   }
+  return Ctx.getWasmSection(Name, /*Type=*/0, Flags, Group, UniqueID);
+}
+
+MCSection *TargetLoweringObjectFileWasm::SelectSectionForGlobal(
+    const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
+  // NOTE(review): the guard tests for a common symbol while the diagnostic
+  // talks about mergeable sections -- confirm which of the two is intended.
+  if (Kind.isCommon())
+    report_fatal_error("mergable sections not supported yet on wasm");
+
+  // Give the global a section of its own when the matching option
+  // (function sections for text, data sections otherwise) is enabled, or
+  // when the global is a member of a COMDAT.
+  const bool PerGlobalSections =
+      Kind.isText() ? TM.getFunctionSections() : TM.getDataSections();
+  const bool EmitUniqueSection = PerGlobalSections || GO->hasComdat();
+
+  MCContext &Ctx = getContext();
+  Mangler &Mang = getMangler();
+  return selectWasmSectionForGlobal(Ctx, GO, Kind, Mang, TM, EmitUniqueSection,
+                                    /*Flags=*/0, &NextUniqueID);
+}
+
+bool TargetLoweringObjectFileWasm::shouldPutJumpTableInFunctionSection(
+    bool UsesLabelDifference, const Function &F) const {
+  // Jump tables never share the function's section: relative relocations can
+  // always be created, so they can go in a section marked non-executable.
+  return false;
+}
+
+const MCExpr *TargetLoweringObjectFileWasm::lowerRelativeReference(
+    const GlobalValue *LHS, const GlobalValue *RHS,
+    const TargetMachine &TM) const {
+  // Only an unnamed_addr function may be referenced relative to another
+  // global; for anything else no expression is produced.
+  if (!(LHS->hasGlobalUnnamedAddr() && LHS->getValueType()->isFunctionTy()))
+    return nullptr;
+
+  // Both globals must live in address space 0 and must not be thread-local.
+  for (const GlobalValue *GV : {LHS, RHS})
+    if (GV->getType()->getPointerAddressSpace() != 0 || GV->isThreadLocal())
+      return nullptr;
+
+  MCContext &Ctx = getContext();
+  const MCExpr *Minuend =
+      MCSymbolRefExpr::create(TM.getSymbol(LHS), MCSymbolRefExpr::VK_None, Ctx);
+  const MCExpr *Subtrahend = MCSymbolRefExpr::create(TM.getSymbol(RHS), Ctx);
+  return MCBinaryExpr::createSub(Minuend, Subtrahend, Ctx);
+}
+
+// Wasm-specific set-up; nothing is initialized here yet.
+void TargetLoweringObjectFileWasm::InitializeWasm() {
+  // TODO: Initialize StaticCtorSection and StaticDtorSection.
 }
diff --git a/lib/CodeGen/TargetOptionsImpl.cpp b/lib/CodeGen/TargetOptionsImpl.cpp
index b6da8e0aa60db0b628a6c9ad81e10a8e6716a9ae..c20d5ab814f82f94d64a3279d4c7c37953695461 100644
--- a/lib/CodeGen/TargetOptionsImpl.cpp
+++ b/lib/CodeGen/TargetOptionsImpl.cpp
@@ -34,14 +34,6 @@ bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const {
   return false;
 }
 
-/// LessPreciseFPMAD - This flag return true when -enable-fp-mad option
-/// is specified on the command line.  When this flag is off(default), the
-/// code generator is not allowed to generate mad (multiply add) if the
-/// result is "less precise" than doing those operations individually.
-bool TargetOptions::LessPreciseFPMAD() const {
-  return UnsafeFPMath || LessPreciseFPMADOption;
-}
-
 /// HonorSignDependentRoundingFPMath - Return true if the codegen must assume
 /// that the rounding mode of the FPU can change from its default.
 bool TargetOptions::HonorSignDependentRoundingFPMath() const {
diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp
index e7ea2b4563f96af3685afee9e8b91ce590724650..150195f5f85bcff9b5b99fed84cdb8f9e847c61b 100644
--- a/lib/CodeGen/TargetPassConfig.cpp
+++ b/lib/CodeGen/TargetPassConfig.cpp
@@ -92,6 +92,9 @@ static cl::opt<bool> VerifyMachineCode("verify-machineinstrs", cl::Hidden,
     cl::desc("Verify generated machine code"),
     cl::init(false),
     cl::ZeroOrMore);
+static cl::opt<bool> EnableMachineOutliner("enable-machine-outliner",
+    cl::Hidden,
+    cl::desc("Enable machine outliner"));
 
 static cl::opt<std::string>
 PrintMachineInstrs("print-machineinstrs", cl::ValueOptional,
@@ -261,7 +264,8 @@ TargetPassConfig::~TargetPassConfig() {
 TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
     : ImmutablePass(ID), PM(&pm), Started(true), Stopped(false),
       AddingMachinePasses(false), TM(tm), Impl(nullptr), Initialized(false),
-      DisableVerify(false), EnableTailMerge(true) {
+      DisableVerify(false), EnableTailMerge(true),
+      RequireCodeGenSCCOrder(false) {
 
   Impl = new PassConfigImpl();
 
@@ -279,6 +283,9 @@ TargetPassConfig::TargetPassConfig(TargetMachine *tm, PassManagerBase &pm)
 
   if (StringRef(PrintMachineInstrs.getValue()).equals(""))
     TM->Options.PrintMachineCode = true;
+
+  if (TM->Options.EnableIPRA)
+    setRequiresCodeGenSCCOrder();
 }
 
 CodeGenOpt::Level TargetPassConfig::getOptLevel() const {
@@ -531,7 +538,7 @@ void TargetPassConfig::addISelPrepare() {
   addPreISel();
 
   // Force codegen to run according to the callgraph.
-  if (TM->Options.EnableIPRA)
+  if (requiresCodeGenSCCOrder())
     addPass(new DummyCGSCCPass);
 
   // Add both the safe stack and the stack protection passes: each of them will
@@ -668,9 +675,15 @@ void TargetPassConfig::addMachinePasses() {
   addPass(&StackMapLivenessID, false);
   addPass(&LiveDebugValuesID, false);
 
+  // Insert before XRay Instrumentation.
+  addPass(&FEntryInserterID, false);
+
   addPass(&XRayInstrumentationID, false);
   addPass(&PatchableFunctionID, false);
 
+  if (EnableMachineOutliner)
+    PM->add(createMachineOutlinerPass());
+
   AddingMachinePasses = false;
 }
 
@@ -704,6 +717,10 @@ void TargetPassConfig::addMachineSSAOptimization() {
 
   addPass(&MachineLICMID, false);
   addPass(&MachineCSEID, false);
+
+  // Coalesce basic blocks with the same branch condition
+  addPass(&BranchCoalescingID);
+
   addPass(&MachineSinkingID);
 
   addPass(&PeepholeOptimizerID);
@@ -730,7 +747,7 @@ MachinePassRegistry RegisterRegAlloc::Registry;
 
 /// A dummy default pass factory indicates whether the register allocator is
 /// overridden on the command line.
-LLVM_DEFINE_ONCE_FLAG(InitializeDefaultRegisterAllocatorFlag);
+static llvm::once_flag InitializeDefaultRegisterAllocatorFlag;
 static FunctionPass *useDefaultRegisterAllocator() { return nullptr; }
 static RegisterRegAlloc
 defaultRegAlloc("default",
@@ -903,6 +920,11 @@ void TargetPassConfig::addBlockPlacement() {
 //===---------------------------------------------------------------------===//
 /// GlobalISel Configuration
 //===---------------------------------------------------------------------===//
+
+bool TargetPassConfig::isGlobalISelEnabled() const {
+  return false;
+}
+
 bool TargetPassConfig::isGlobalISelAbortEnabled() const {
   return EnableGlobalISelAbort == 1;
 }
diff --git a/lib/CodeGen/TargetSchedule.cpp b/lib/CodeGen/TargetSchedule.cpp
index 83e52d3353548ca9eaeb7f7225b8bdaf0fc9cf02..04edf0e62857b3fe302962ca138dc5ae5159e6d2 100644
--- a/lib/CodeGen/TargetSchedule.cpp
+++ b/lib/CodeGen/TargetSchedule.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/Target/TargetSchedule.cpp - Sched Machine Model ----*- C++ -*-===//
+//===- llvm/Target/TargetSchedule.cpp - Sched Machine Model ---------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,12 +12,22 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/MC/MCSchedule.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
 
 using namespace llvm;
 
@@ -37,13 +47,14 @@ bool TargetSchedModel::hasInstrItineraries() const {
 
 static unsigned gcd(unsigned Dividend, unsigned Divisor) {
   // Dividend and Divisor will be naturally swapped as needed.
-  while(Divisor) {
+  while (Divisor) {
     unsigned Rem = Dividend % Divisor;
     Dividend = Divisor;
     Divisor = Rem;
   };
   return Dividend;
 }
+
 static unsigned lcm(unsigned A, unsigned B) {
   unsigned LCM = (uint64_t(A) * B) / gcd(A, B);
   assert((LCM >= A && LCM >= B) && "LCM overflow");
@@ -73,6 +84,29 @@ void TargetSchedModel::init(const MCSchedModel &sm,
   }
 }
 
+/// Returns true only if the instruction's scheduling class is valid and
+/// marked BeginGroup. When \p SC is null it is resolved from \p MI.
+bool TargetSchedModel::mustBeginGroup(const MachineInstr *MI,
+                                      const MCSchedClassDesc *SC) const {
+  // No instruction scheduling model: nothing to query.
+  if (!hasInstrSchedModel())
+    return false;
+  if (!SC)
+    SC = resolveSchedClass(MI);
+  return SC->isValid() && SC->BeginGroup;
+}
+
+/// Returns true only if the instruction's scheduling class is valid and
+/// marked EndGroup. When \p SC is null it is resolved from \p MI.
+bool TargetSchedModel::mustEndGroup(const MachineInstr *MI,
+                                    const MCSchedClassDesc *SC) const {
+  if (!hasInstrSchedModel())
+    return false;
+  if (!SC)
+    SC = resolveSchedClass(MI);
+  return SC->isValid() && SC->EndGroup;
+}
+
 unsigned TargetSchedModel::getNumMicroOps(const MachineInstr *MI,
                                           const MCSchedClassDesc *SC) const {
   if (hasInstrItineraries()) {
@@ -100,7 +134,6 @@ static unsigned capLatency(int Cycles) {
 /// evaluation of predicates that depend on instruction operands or flags.
 const MCSchedClassDesc *TargetSchedModel::
 resolveSchedClass(const MachineInstr *MI) const {
-
   // Get the definition's scheduling class descriptor from this machine model.
   unsigned SchedClass = MI->getDesc().getSchedClass();
   const MCSchedClassDesc *SCDesc = SchedModel.getSchedClassDesc(SchedClass);
diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp
index a0c1a4f70b33582573fcd035f31366a79b8e1bcf..75359fe3c0ea695de0424e976a83065af87b858c 100644
--- a/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -905,7 +905,7 @@ rescheduleMIBelowKill(MachineBasicBlock::iterator &mi,
     ++End;
   }
 
-  // Check if the reschedule will not break depedencies.
+  // Check if the reschedule will not break dependencies.
   unsigned NumVisited = 0;
   MachineBasicBlock::iterator KillPos = KillMI;
   ++KillPos;
diff --git a/lib/CodeGen/VirtRegMap.cpp b/lib/CodeGen/VirtRegMap.cpp
index 0d506d6466598747d263591ffbd45aef3f29c8ad..c8946010e9d15fb426f493bb3527608e6cd39bc0 100644
--- a/lib/CodeGen/VirtRegMap.cpp
+++ b/lib/CodeGen/VirtRegMap.cpp
@@ -167,6 +167,7 @@ class VirtRegRewriter : public MachineFunctionPass {
   bool readsUndefSubreg(const MachineOperand &MO) const;
   void addLiveInsForSubRanges(const LiveInterval &LI, unsigned PhysReg) const;
   void handleIdentityCopy(MachineInstr &MI) const;
+  void expandCopyBundle(MachineInstr &MI) const;
 
 public:
   static char ID;
@@ -367,11 +368,41 @@ void VirtRegRewriter::handleIdentityCopy(MachineInstr &MI) const {
   }
 
   if (Indexes)
-    Indexes->removeMachineInstrFromMaps(MI);
-  MI.eraseFromParent();
+    Indexes->removeSingleMachineInstrFromMaps(MI);
+  MI.eraseFromBundle();
   DEBUG(dbgs() << "  deleted.\n");
 }
 
+/// The liverange splitting logic sometimes produces bundles of copies when
+/// subregisters are involved. Once the last COPY of such a bundle has been
+/// processed, split the bundle back into a sequence of standalone COPYs.
+/// Does not update LiveIntervals, which should no longer be needed here.
+void VirtRegRewriter::expandCopyBundle(MachineInstr &MI) const {
+  // Only act on a COPY that is the last instruction of a bundle.
+  if (!MI.isCopy() || !MI.isBundledWithPred() || MI.isBundledWithSucc())
+    return;
+
+  // Only do this when the complete bundle is made out of COPYs.
+  using RevIter = MachineBasicBlock::reverse_instr_iterator;
+  const RevIter BundleTail = MI.getReverseIterator();
+  const RevIter REnd = MI.getParent()->instr_rend();
+  for (RevIter It = std::next(BundleTail);
+       It != REnd && It->isBundledWithSucc(); ++It) {
+    if (!It->isCopy())
+      return;
+  }
+
+  // Detach every bundled COPY from its predecessor, walking from the tail of
+  // the bundle towards its head, and add it to the Indexes maps if present.
+  for (RevIter It = BundleTail; It->isBundledWithPred();) {
+    MachineInstr &Bundled = *It;
+    ++It;
+    Bundled.unbundleFromPred();
+    if (Indexes)
+      Indexes->insertMachineInstrInMaps(Bundled);
+  }
+}
+
 void VirtRegRewriter::rewrite() {
   bool NoSubRegLiveness = !MRI->subRegLivenessEnabled();
   SmallVector<unsigned, 8> SuperDeads;
@@ -431,12 +462,14 @@ void VirtRegRewriter::rewrite() {
             }
           }
 
-          // The <def,undef> flag only makes sense for sub-register defs, and
-          // we are substituting a full physreg.  An <imp-use,kill> operand
-          // from the SuperKills list will represent the partial read of the
-          // super-register.
-          if (MO.isDef())
+          // The <def,undef> and <def,internal> flags only make sense for
+          // sub-register defs, and we are substituting a full physreg.  An
+          // <imp-use,kill> operand from the SuperKills list will represent the
+          // partial read of the super-register.
+          if (MO.isDef()) {
             MO.setIsUndef(false);
+            MO.setIsInternalRead(false);
+          }
 
           // PhysReg operands cannot have subregister indexes.
           PhysReg = TRI->getSubReg(PhysReg, SubReg);
@@ -461,6 +494,8 @@ void VirtRegRewriter::rewrite() {
 
       DEBUG(dbgs() << "> " << *MI);
 
+      expandCopyBundle(*MI);
+
       // We can remove identity copies right now.
       handleIdentityCopy(*MI);
     }
diff --git a/lib/CodeGen/WinEHPrepare.cpp b/lib/CodeGen/WinEHPrepare.cpp
index 568720c66e551669bd653014b3fcda14b072c448..ae07e8b2fa03229a76eb51b327665d25c23a0e4e 100644
--- a/lib/CodeGen/WinEHPrepare.cpp
+++ b/lib/CodeGen/WinEHPrepare.cpp
@@ -86,6 +86,7 @@ private:
   // All fields are reset by runOnFunction.
   EHPersonality Personality = EHPersonality::Unknown;
 
+  const DataLayout *DL = nullptr;
   DenseMap<BasicBlock *, ColorVector> BlockColors;
   MapVector<BasicBlock *, std::vector<BasicBlock *>> FuncletBlocks;
 };
@@ -111,6 +112,7 @@ bool WinEHPrepare::runOnFunction(Function &Fn) {
   if (!isFuncletEHPersonality(Personality))
     return false;
 
+  DL = &Fn.getParent()->getDataLayout();
   return prepareExplicitEH(Fn);
 }
 
@@ -1070,7 +1072,7 @@ AllocaInst *WinEHPrepare::insertPHILoads(PHINode *PN, Function &F) {
   if (!isa<TerminatorInst>(EHPad)) {
     // If the EHPad isn't a terminator, then we can insert a load in this block
     // that will dominate all uses.
-    SpillSlot = new AllocaInst(PN->getType(), nullptr,
+    SpillSlot = new AllocaInst(PN->getType(), DL->getAllocaAddrSpace(), nullptr,
                                Twine(PN->getName(), ".wineh.spillslot"),
                                &F.getEntryBlock().front());
     Value *V = new LoadInst(SpillSlot, Twine(PN->getName(), ".wineh.reload"),
@@ -1157,7 +1159,7 @@ void WinEHPrepare::replaceUseWithLoad(Value *V, Use &U, AllocaInst *&SpillSlot,
                                       Function &F) {
   // Lazilly create the spill slot.
   if (!SpillSlot)
-    SpillSlot = new AllocaInst(V->getType(), nullptr,
+    SpillSlot = new AllocaInst(V->getType(), DL->getAllocaAddrSpace(), nullptr,
                                Twine(V->getName(), ".wineh.spillslot"),
                                &F.getEntryBlock().front());
 
diff --git a/lib/CodeGen/XRayInstrumentation.cpp b/lib/CodeGen/XRayInstrumentation.cpp
index 760683bc3bf70376eb0a28054329481502a4104c..7d2848bdc13b1c3d6243290ca74dee8bc7ebb73e 100644
--- a/lib/CodeGen/XRayInstrumentation.cpp
+++ b/lib/CodeGen/XRayInstrumentation.cpp
@@ -157,6 +157,11 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) {
   case Triple::ArchType::arm:
   case Triple::ArchType::thumb:
   case Triple::ArchType::aarch64:
+  case Triple::ArchType::ppc64le:
+  case Triple::ArchType::mips:
+  case Triple::ArchType::mipsel:
+  case Triple::ArchType::mips64:
+  case Triple::ArchType::mips64el:
     // For the architectures which don't have a single return instruction
     prependRetWithPatchableExit(MF, TII);
     break;
diff --git a/lib/DebugInfo/CodeView/CMakeLists.txt b/lib/DebugInfo/CodeView/CMakeLists.txt
index f9bff86b41c83c388bf40bd284e80ef9c2c3c487..6e9214d72adc94002f20a1f5ae2da0ba0757bb86 100644
--- a/lib/DebugInfo/CodeView/CMakeLists.txt
+++ b/lib/DebugInfo/CodeView/CMakeLists.txt
@@ -5,16 +5,17 @@ add_llvm_library(LLVMDebugInfoCodeView
   CVTypeDumper.cpp
   CVTypeVisitor.cpp
   EnumTables.cpp
+  Formatters.cpp
   Line.cpp
   ModuleSubstream.cpp
   ModuleSubstreamVisitor.cpp
   RecordSerialization.cpp
   SymbolRecordMapping.cpp
   SymbolDumper.cpp
+  SymbolSerializer.cpp
   TypeDatabase.cpp
   TypeDatabaseVisitor.cpp
   TypeDumpVisitor.cpp
-  TypeRecord.cpp
   TypeRecordMapping.cpp
   TypeSerializer.cpp
   TypeStreamMerger.cpp
diff --git a/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp b/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp
index 75cfd0dd184e880dbea1d2a643e648c2d027230e..4c78caf034777ecc95f0df7492314c27f6f23f1f 100644
--- a/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp
+++ b/lib/DebugInfo/CodeView/CVSymbolVisitor.cpp
@@ -11,20 +11,11 @@
 
 #include "llvm/DebugInfo/CodeView/CodeViewError.h"
 #include "llvm/DebugInfo/CodeView/SymbolVisitorCallbacks.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
+#include "llvm/Support/BinaryByteStream.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
 
-template <typename T>
-static Error takeObject(ArrayRef<uint8_t> &Data, const T *&Res) {
-  if (Data.size() < sizeof(*Res))
-    return llvm::make_error<CodeViewError>(cv_error_code::insufficient_buffer);
-  Res = reinterpret_cast<const T *>(Data.data());
-  Data = Data.drop_front(sizeof(*Res));
-  return Error::success();
-}
-
 CVSymbolVisitor::CVSymbolVisitor(SymbolVisitorCallbacks &Callbacks)
     : Callbacks(Callbacks) {}
 
diff --git a/lib/DebugInfo/CodeView/CVTypeDumper.cpp b/lib/DebugInfo/CodeView/CVTypeDumper.cpp
index fcd239cce0dd39cf85b5e46b90aad888b291e3c0..bcc8218d94460e8a4ff98edb653207e815e0c673 100644
--- a/lib/DebugInfo/CodeView/CVTypeDumper.cpp
+++ b/lib/DebugInfo/CodeView/CVTypeDumper.cpp
@@ -14,7 +14,7 @@
 #include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
+#include "llvm/Support/BinaryByteStream.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
@@ -28,6 +28,8 @@ Error CVTypeDumper::dump(const CVType &Record, TypeVisitorCallbacks &Dumper) {
   Pipeline.addCallbackToPipeline(Dumper);
 
   CVTypeVisitor Visitor(Pipeline);
+  if (Handler)
+    Visitor.addTypeServerHandler(*Handler);
 
   CVType RecordCopy = Record;
   if (auto EC = Visitor.visitTypeRecord(RecordCopy))
@@ -45,6 +47,8 @@ Error CVTypeDumper::dump(const CVTypeArray &Types,
   Pipeline.addCallbackToPipeline(Dumper);
 
   CVTypeVisitor Visitor(Pipeline);
+  if (Handler)
+    Visitor.addTypeServerHandler(*Handler);
 
   if (auto EC = Visitor.visitTypeStream(Types))
     return EC;
@@ -52,9 +56,9 @@ Error CVTypeDumper::dump(const CVTypeArray &Types,
 }
 
 Error CVTypeDumper::dump(ArrayRef<uint8_t> Data, TypeVisitorCallbacks &Dumper) {
-  msf::ByteStream Stream(Data);
+  BinaryByteStream Stream(Data, llvm::support::little);
   CVTypeArray Types;
-  msf::StreamReader Reader(Stream);
+  BinaryStreamReader Reader(Stream);
   if (auto EC = Reader.readArray(Types, Reader.getLength()))
     return EC;
 
diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
index 5171e24f3aacce436afcd7a7f581c4ff9f32ede4..0069ee3cc9043058db648c06e3da764d8c479b15 100644
--- a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
+++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp
@@ -10,9 +10,14 @@
 #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
 
 #include "llvm/DebugInfo/CodeView/CodeViewError.h"
+#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
+#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
 #include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
+#include "llvm/DebugInfo/CodeView/TypeRecordMapping.h"
+#include "llvm/DebugInfo/CodeView/TypeServerHandler.h"
 #include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
@@ -21,7 +26,8 @@ CVTypeVisitor::CVTypeVisitor(TypeVisitorCallbacks &Callbacks)
     : Callbacks(Callbacks) {}
 
 template <typename T>
-static Error visitKnownRecord(CVType &Record, TypeVisitorCallbacks &Callbacks) {
+static Error visitKnownRecord(CVTypeVisitor &Visitor, CVType &Record,
+                              TypeVisitorCallbacks &Callbacks) {
   TypeRecordKind RK = static_cast<TypeRecordKind>(Record.Type);
   T KnownRecord(RK);
   if (auto EC = Callbacks.visitKnownRecord(Record, KnownRecord))
@@ -39,7 +45,58 @@ static Error visitKnownMember(CVMemberRecord &Record,
   return Error::success();
 }
 
+/// Runs \p Record through a private deserializing pipeline and returns the
+/// TypeServer2Record captured from it, or the error reported while visiting.
+static Expected<TypeServer2Record> deserializeTypeServerRecord(CVType &Record) {
+  // Callback that copies the visited TypeServer2 record into Out.
+  struct CaptureTypeServer : public TypeVisitorCallbacks {
+    explicit CaptureTypeServer(TypeServer2Record &Out) : Out(Out) {}
+
+    Error visitKnownRecord(CVType &, TypeServer2Record &Parsed) override {
+      Out = Parsed;
+      return Error::success();
+    }
+
+    TypeServer2Record &Out;
+  };
+
+  TypeServer2Record Result(TypeRecordKind::TypeServer2);
+  TypeDeserializer Deserializer;
+  CaptureTypeServer Capture(Result);
+  TypeVisitorCallbackPipeline Pipeline;
+  Pipeline.addCallbackToPipeline(Deserializer);
+  Pipeline.addCallbackToPipeline(Capture);
+  CVTypeVisitor Visitor(Pipeline);
+  if (auto EC = Visitor.visitTypeRecord(Record))
+    return std::move(EC);
+  return Result;
+}
+
+void CVTypeVisitor::addTypeServerHandler(TypeServerHandler &Handler) {
+  Handlers.push_back(&Handler);
+}
+
 Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
+  if (Record.Type == TypeLeafKind::LF_TYPESERVER2 && !Handlers.empty()) {
+    auto TS = deserializeTypeServerRecord(Record);
+    if (!TS)
+      return TS.takeError();
+
+    for (auto Handler : Handlers) {
+      auto ExpectedResult = Handler->handle(*TS, Callbacks);
+      // If there was an error, return the error.
+      if (!ExpectedResult)
+        return ExpectedResult.takeError();
+
+      // If the handler processed the record, return success.
+      if (*ExpectedResult)
+        return Error::success();
+
+      // Otherwise keep searching for a handler, eventually falling out and
+      // using the default record handler.
+    }
+  }
+
   if (auto EC = Callbacks.visitTypeBegin(Record))
     return EC;
 
@@ -50,7 +107,7 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) {
     break;
 #define TYPE_RECORD(EnumName, EnumVal, Name)                                   \
   case EnumName: {                                                             \
-    if (auto EC = visitKnownRecord<Name##Record>(Record, Callbacks))           \
+    if (auto EC = visitKnownRecord<Name##Record>(*this, Record, Callbacks))    \
       return EC;                                                               \
     break;                                                                     \
   }
@@ -109,7 +166,15 @@ Error CVTypeVisitor::visitTypeStream(const CVTypeArray &Types) {
   return Error::success();
 }
 
-Error CVTypeVisitor::visitFieldListMemberStream(msf::StreamReader Reader) {
+Error CVTypeVisitor::visitTypeStream(CVTypeRange Types) {
+  for (auto I : Types) {
+    if (auto EC = visitTypeRecord(I))
+      return EC;
+  }
+  return Error::success();
+}
+
+Error CVTypeVisitor::visitFieldListMemberStream(BinaryStreamReader Reader) {
   FieldListDeserializer Deserializer(Reader);
   TypeVisitorCallbackPipeline Pipeline;
   Pipeline.addCallbackToPipeline(Deserializer);
@@ -130,7 +195,7 @@ Error CVTypeVisitor::visitFieldListMemberStream(msf::StreamReader Reader) {
 }
 
 Error CVTypeVisitor::visitFieldListMemberStream(ArrayRef<uint8_t> Data) {
-  msf::ByteStream S(Data);
-  msf::StreamReader SR(S);
+  BinaryByteStream S(Data, llvm::support::little);
+  BinaryStreamReader SR(S);
   return visitFieldListMemberStream(SR);
 }
diff --git a/lib/DebugInfo/CodeView/CodeViewError.cpp b/lib/DebugInfo/CodeView/CodeViewError.cpp
index 55c10c076eef9fb8c0be450bddec631841b94073..8de266b836b4b851d059a61c55fc199452a17f0c 100644
--- a/lib/DebugInfo/CodeView/CodeViewError.cpp
+++ b/lib/DebugInfo/CodeView/CodeViewError.cpp
@@ -31,6 +31,8 @@ public:
              "bytes.";
     case cv_error_code::corrupt_record:
       return "The CodeView record is corrupted.";
+    case cv_error_code::no_records:
+      return "There are no records";
     case cv_error_code::operation_unsupported:
       return "The requested operation is not supported.";
     case cv_error_code::unknown_member_record:
diff --git a/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp b/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp
index 9bd85cf9dc681f3baa21dcf5ab471382d1ed192f..282e3103adc9381c335f1ea0fb40ba74a4b9389b 100644
--- a/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp
+++ b/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp
@@ -10,8 +10,8 @@
 #include "llvm/DebugInfo/CodeView/CodeViewRecordIO.h"
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/RecordSerialization.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
@@ -145,10 +145,10 @@ Error CodeViewRecordIO::mapStringZ(StringRef &Value) {
   if (isWriting()) {
     // Truncate if we attempt to write too much.
     StringRef S = Value.take_front(maxFieldLength() - 1);
-    if (auto EC = Writer->writeZeroString(S))
+    if (auto EC = Writer->writeCString(S))
       return EC;
   } else {
-    if (auto EC = Reader->readZeroString(Value))
+    if (auto EC = Reader->readCString(Value))
       return EC;
   }
   return Error::success();
@@ -176,7 +176,7 @@ Error CodeViewRecordIO::mapStringZVectorZ(std::vector<StringRef> &Value) {
       if (auto EC = mapStringZ(V))
         return EC;
     }
-    if (auto EC = Writer->writeInteger(uint8_t(0)))
+    if (auto EC = Writer->writeInteger<uint8_t>(0))
       return EC;
   } else {
     StringRef S;
@@ -194,22 +194,22 @@ Error CodeViewRecordIO::mapStringZVectorZ(std::vector<StringRef> &Value) {
 Error CodeViewRecordIO::writeEncodedSignedInteger(const int64_t &Value) {
   assert(Value < 0 && "Encoded integer is not signed!");
   if (Value >= std::numeric_limits<int8_t>::min()) {
-    if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_CHAR)))
+    if (auto EC = Writer->writeInteger<uint16_t>(LF_CHAR))
       return EC;
-    if (auto EC = Writer->writeInteger(static_cast<int8_t>(Value)))
+    if (auto EC = Writer->writeInteger<int8_t>(Value))
       return EC;
   } else if (Value >= std::numeric_limits<int16_t>::min()) {
-    if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_SHORT)))
+    if (auto EC = Writer->writeInteger<uint16_t>(LF_SHORT))
       return EC;
-    if (auto EC = Writer->writeInteger(static_cast<int16_t>(Value)))
+    if (auto EC = Writer->writeInteger<int16_t>(Value))
       return EC;
   } else if (Value >= std::numeric_limits<int32_t>::min()) {
-    if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_LONG)))
+    if (auto EC = Writer->writeInteger<uint16_t>(LF_LONG))
       return EC;
-    if (auto EC = Writer->writeInteger(static_cast<int32_t>(Value)))
+    if (auto EC = Writer->writeInteger<int32_t>(Value))
       return EC;
   } else {
-    if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_QUADWORD)))
+    if (auto EC = Writer->writeInteger<uint16_t>(LF_QUADWORD))
       return EC;
     if (auto EC = Writer->writeInteger(Value))
       return EC;
@@ -219,20 +219,20 @@ Error CodeViewRecordIO::writeEncodedSignedInteger(const int64_t &Value) {
 
 Error CodeViewRecordIO::writeEncodedUnsignedInteger(const uint64_t &Value) {
   if (Value < LF_NUMERIC) {
-    if (auto EC = Writer->writeInteger(static_cast<uint16_t>(Value)))
+    if (auto EC = Writer->writeInteger<uint16_t>(Value))
       return EC;
   } else if (Value <= std::numeric_limits<uint16_t>::max()) {
-    if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_USHORT)))
+    if (auto EC = Writer->writeInteger<uint16_t>(LF_USHORT))
       return EC;
-    if (auto EC = Writer->writeInteger(static_cast<uint16_t>(Value)))
+    if (auto EC = Writer->writeInteger<uint16_t>(Value))
       return EC;
   } else if (Value <= std::numeric_limits<uint32_t>::max()) {
-    if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_ULONG)))
+    if (auto EC = Writer->writeInteger<uint16_t>(LF_ULONG))
       return EC;
-    if (auto EC = Writer->writeInteger(static_cast<uint32_t>(Value)))
+    if (auto EC = Writer->writeInteger<uint32_t>(Value))
       return EC;
   } else {
-    if (auto EC = Writer->writeInteger(static_cast<uint16_t>(LF_UQUADWORD)))
+    if (auto EC = Writer->writeInteger<uint16_t>(LF_UQUADWORD))
       return EC;
     if (auto EC = Writer->writeInteger(Value))
       return EC;
diff --git a/lib/DebugInfo/CodeView/Formatters.cpp b/lib/DebugInfo/CodeView/Formatters.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ef00bd8570fa95d8a28a4a52de9253cd7ec57652
--- /dev/null
+++ b/lib/DebugInfo/CodeView/Formatters.cpp
@@ -0,0 +1,37 @@
+//===- Formatters.cpp -------------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/Formatters.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::codeview::detail;
+
+GuidAdapter::GuidAdapter(StringRef Guid)
+    : FormatAdapter(makeArrayRef(Guid.bytes_begin(), Guid.bytes_end())) {}
+
+GuidAdapter::GuidAdapter(ArrayRef<uint8_t> Guid)
+    : FormatAdapter(std::move(Guid)) {}
+
+/// Writes the 16-byte GUID as upper-case hex digits wrapped in braces, with
+/// a dash after bytes 4, 6, 8 and 10. \p Style is not consulted.
+void GuidAdapter::format(llvm::raw_ostream &Stream, StringRef Style) {
+  static const char *HexDigits = "0123456789ABCDEF";
+
+  assert(Item.size() == 16 && "Expected 16-byte GUID");
+  Stream << "{";
+  for (int Written = 1; Written <= 16; ++Written) {
+    const uint8_t Byte = Item[Written - 1];
+    Stream << HexDigits[Byte >> 4] << HexDigits[Byte & 0xF];
+    // Group separators fall after an even number of bytes in [4, 10].
+    if (Written % 2 == 0 && Written >= 4 && Written <= 10)
+      Stream << "-";
+  }
+  Stream << "}";
+}
diff --git a/lib/DebugInfo/CodeView/ModuleSubstream.cpp b/lib/DebugInfo/CodeView/ModuleSubstream.cpp
index 768ebaa1c980150a463290798c5762a1564304fc..69a7c59116cff54426853bdc1104025cb5b4cc62 100644
--- a/lib/DebugInfo/CodeView/ModuleSubstream.cpp
+++ b/lib/DebugInfo/CodeView/ModuleSubstream.cpp
@@ -9,22 +9,20 @@
 
 #include "llvm/DebugInfo/CodeView/ModuleSubstream.h"
 
-#include "llvm/DebugInfo/MSF/StreamReader.h"
+#include "llvm/Support/BinaryStreamReader.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
-using namespace llvm::msf;
 
 ModuleSubstream::ModuleSubstream() : Kind(ModuleSubstreamKind::None) {}
 
-ModuleSubstream::ModuleSubstream(ModuleSubstreamKind Kind,
-                                 ReadableStreamRef Data)
+ModuleSubstream::ModuleSubstream(ModuleSubstreamKind Kind, BinaryStreamRef Data)
     : Kind(Kind), Data(Data) {}
 
-Error ModuleSubstream::initialize(ReadableStreamRef Stream,
+Error ModuleSubstream::initialize(BinaryStreamRef Stream,
                                   ModuleSubstream &Info) {
   const ModuleSubsectionHeader *Header;
-  StreamReader Reader(Stream);
+  BinaryStreamReader Reader(Stream);
   if (auto EC = Reader.readObject(Header))
     return EC;
 
@@ -42,4 +40,4 @@ uint32_t ModuleSubstream::getRecordLength() const {
 
 ModuleSubstreamKind ModuleSubstream::getSubstreamKind() const { return Kind; }
 
-ReadableStreamRef ModuleSubstream::getRecordData() const { return Data; }
+BinaryStreamRef ModuleSubstream::getRecordData() const { return Data; }
diff --git a/lib/DebugInfo/CodeView/ModuleSubstreamVisitor.cpp b/lib/DebugInfo/CodeView/ModuleSubstreamVisitor.cpp
index 5247932779800eee3939e9b8f5263127e54d561d..e490a78cadbc6a920431a60bc8ef50d6d5666fdf 100644
--- a/lib/DebugInfo/CodeView/ModuleSubstreamVisitor.cpp
+++ b/lib/DebugInfo/CodeView/ModuleSubstreamVisitor.cpp
@@ -8,54 +8,52 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/CodeView/ModuleSubstreamVisitor.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamRef.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
-using namespace llvm::msf;
 
-Error IModuleSubstreamVisitor::visitSymbols(ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitSymbols(BinaryStreamRef Data) {
   return visitUnknown(ModuleSubstreamKind::Symbols, Data);
 }
-Error IModuleSubstreamVisitor::visitLines(ReadableStreamRef Data,
+Error IModuleSubstreamVisitor::visitLines(BinaryStreamRef Data,
                                           const LineSubstreamHeader *Header,
                                           const LineInfoArray &Lines) {
   return visitUnknown(ModuleSubstreamKind::Lines, Data);
 }
-Error IModuleSubstreamVisitor::visitStringTable(ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitStringTable(BinaryStreamRef Data) {
   return visitUnknown(ModuleSubstreamKind::StringTable, Data);
 }
 Error IModuleSubstreamVisitor::visitFileChecksums(
-    ReadableStreamRef Data, const FileChecksumArray &Checksums) {
+    BinaryStreamRef Data, const FileChecksumArray &Checksums) {
   return visitUnknown(ModuleSubstreamKind::FileChecksums, Data);
 }
-Error IModuleSubstreamVisitor::visitFrameData(ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitFrameData(BinaryStreamRef Data) {
   return visitUnknown(ModuleSubstreamKind::FrameData, Data);
 }
-Error IModuleSubstreamVisitor::visitInlineeLines(ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitInlineeLines(BinaryStreamRef Data) {
   return visitUnknown(ModuleSubstreamKind::InlineeLines, Data);
 }
-Error IModuleSubstreamVisitor::visitCrossScopeImports(ReadableStreamRef Data) {
-  return visitUnknown(ModuleSubstreamKind::CrossScopeExports, Data);
+Error IModuleSubstreamVisitor::visitCrossScopeImports(BinaryStreamRef Data) {
+  return visitUnknown(ModuleSubstreamKind::CrossScopeImports, Data);
 }
-Error IModuleSubstreamVisitor::visitCrossScopeExports(ReadableStreamRef Data) {
-  return visitUnknown(ModuleSubstreamKind::CrossScopeImports, Data);
+Error IModuleSubstreamVisitor::visitCrossScopeExports(BinaryStreamRef Data) {
+  return visitUnknown(ModuleSubstreamKind::CrossScopeExports, Data);
 }
-Error IModuleSubstreamVisitor::visitILLines(ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitILLines(BinaryStreamRef Data) {
   return visitUnknown(ModuleSubstreamKind::ILLines, Data);
 }
-Error IModuleSubstreamVisitor::visitFuncMDTokenMap(ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitFuncMDTokenMap(BinaryStreamRef Data) {
   return visitUnknown(ModuleSubstreamKind::FuncMDTokenMap, Data);
 }
-Error IModuleSubstreamVisitor::visitTypeMDTokenMap(ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitTypeMDTokenMap(BinaryStreamRef Data) {
   return visitUnknown(ModuleSubstreamKind::TypeMDTokenMap, Data);
 }
-Error IModuleSubstreamVisitor::visitMergedAssemblyInput(
-    ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitMergedAssemblyInput(BinaryStreamRef Data) {
   return visitUnknown(ModuleSubstreamKind::MergedAssemblyInput, Data);
 }
-Error IModuleSubstreamVisitor::visitCoffSymbolRVA(ReadableStreamRef Data) {
+Error IModuleSubstreamVisitor::visitCoffSymbolRVA(BinaryStreamRef Data) {
   return visitUnknown(ModuleSubstreamKind::CoffSymbolRVA, Data);
 }
 
@@ -65,7 +63,7 @@ Error llvm::codeview::visitModuleSubstream(const ModuleSubstream &R,
   case ModuleSubstreamKind::Symbols:
     return V.visitSymbols(R.getRecordData());
   case ModuleSubstreamKind::Lines: {
-    StreamReader Reader(R.getRecordData());
+    BinaryStreamReader Reader(R.getRecordData());
     const LineSubstreamHeader *Header;
     if (auto EC = Reader.readObject(Header))
       return EC;
@@ -78,7 +76,7 @@ Error llvm::codeview::visitModuleSubstream(const ModuleSubstream &R,
   case ModuleSubstreamKind::StringTable:
     return V.visitStringTable(R.getRecordData());
   case ModuleSubstreamKind::FileChecksums: {
-    StreamReader Reader(R.getRecordData());
+    BinaryStreamReader Reader(R.getRecordData());
     FileChecksumArray Checksums;
     if (auto EC = Reader.readArray(Checksums, Reader.bytesRemaining()))
       return EC;
diff --git a/lib/DebugInfo/CodeView/RecordSerialization.cpp b/lib/DebugInfo/CodeView/RecordSerialization.cpp
index 6f29caa9bbfc154dc74709548ea39bcf0bc88298..6446670f60d84ba6c0b1f6b82698297b09555624 100644
--- a/lib/DebugInfo/CodeView/RecordSerialization.cpp
+++ b/lib/DebugInfo/CodeView/RecordSerialization.cpp
@@ -16,7 +16,7 @@
 #include "llvm/ADT/APSInt.h"
 #include "llvm/DebugInfo/CodeView/CodeViewError.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
+#include "llvm/Support/BinaryByteStream.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
@@ -33,7 +33,7 @@ StringRef llvm::codeview::getBytesAsCString(ArrayRef<uint8_t> LeafData) {
   return getBytesAsCharacters(LeafData).split('\0').first;
 }
 
-Error llvm::codeview::consume(msf::StreamReader &Reader, APSInt &Num) {
+Error llvm::codeview::consume(BinaryStreamReader &Reader, APSInt &Num) {
-  // Used to avoid overload ambiguity on APInt construtor.
+  // Used to avoid overload ambiguity on APInt constructor.
   bool FalseVal = false;
   uint16_t Short;
@@ -103,15 +103,15 @@ Error llvm::codeview::consume(msf::StreamReader &Reader, APSInt &Num) {
 
 Error llvm::codeview::consume(StringRef &Data, APSInt &Num) {
   ArrayRef<uint8_t> Bytes(Data.bytes_begin(), Data.bytes_end());
-  msf::ByteStream S(Bytes);
-  msf::StreamReader SR(S);
+  BinaryByteStream S(Bytes, llvm::support::little);
+  BinaryStreamReader SR(S);
   auto EC = consume(SR, Num);
   Data = Data.take_back(SR.bytesRemaining());
   return EC;
 }
 
 /// Decode a numeric leaf value that is known to be a uint64_t.
-Error llvm::codeview::consume_numeric(msf::StreamReader &Reader,
+Error llvm::codeview::consume_numeric(BinaryStreamReader &Reader,
                                       uint64_t &Num) {
   APSInt N;
   if (auto EC = consume(Reader, N))
@@ -123,27 +123,27 @@ Error llvm::codeview::consume_numeric(msf::StreamReader &Reader,
   return Error::success();
 }
 
-Error llvm::codeview::consume(msf::StreamReader &Reader, uint32_t &Item) {
+Error llvm::codeview::consume(BinaryStreamReader &Reader, uint32_t &Item) {
   return Reader.readInteger(Item);
 }
 
 Error llvm::codeview::consume(StringRef &Data, uint32_t &Item) {
   ArrayRef<uint8_t> Bytes(Data.bytes_begin(), Data.bytes_end());
-  msf::ByteStream S(Bytes);
-  msf::StreamReader SR(S);
+  BinaryByteStream S(Bytes, llvm::support::little);
+  BinaryStreamReader SR(S);
   auto EC = consume(SR, Item);
   Data = Data.take_back(SR.bytesRemaining());
   return EC;
 }
 
-Error llvm::codeview::consume(msf::StreamReader &Reader, int32_t &Item) {
+Error llvm::codeview::consume(BinaryStreamReader &Reader, int32_t &Item) {
   return Reader.readInteger(Item);
 }
 
-Error llvm::codeview::consume(msf::StreamReader &Reader, StringRef &Item) {
+Error llvm::codeview::consume(BinaryStreamReader &Reader, StringRef &Item) {
   if (Reader.empty())
     return make_error<CodeViewError>(cv_error_code::corrupt_record,
                                      "Null terminated string buffer is empty!");
 
-  return Reader.readZeroString(Item);
+  return Reader.readCString(Item);
 }
diff --git a/lib/DebugInfo/CodeView/SymbolDumper.cpp b/lib/DebugInfo/CodeView/SymbolDumper.cpp
index fd54fba13c763627c518de5f4758a706f2ee2065..134471e81cacd0a275a16752e89a9d10558d9032 100644
--- a/lib/DebugInfo/CodeView/SymbolDumper.cpp
+++ b/lib/DebugInfo/CodeView/SymbolDumper.cpp
@@ -468,8 +468,8 @@ Error CVSymbolDumperImpl::visitKnownRecord(CVSymbol &CVR,
   for (auto &Annotation : InlineSite.annotations()) {
     switch (Annotation.OpCode) {
     case BinaryAnnotationsOpCode::Invalid:
-      return llvm::make_error<CodeViewError>(
-          "Invalid binary annotation opcode!");
+      W.printString("(Annotation Padding)");
+      break;
     case BinaryAnnotationsOpCode::CodeOffset:
     case BinaryAnnotationsOpCode::ChangeCodeOffset:
     case BinaryAnnotationsOpCode::ChangeCodeLength:
diff --git a/lib/DebugInfo/CodeView/SymbolSerializer.cpp b/lib/DebugInfo/CodeView/SymbolSerializer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..251cc431f52b373ed2aec7872e2f78847da3cac6
--- /dev/null
+++ b/lib/DebugInfo/CodeView/SymbolSerializer.cpp
@@ -0,0 +1,63 @@
+//===- SymbolSerializer.cpp -------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
+
+#include <cstring>
+
+using namespace llvm;
+using namespace llvm::codeview;
+
+/// Builds RecordBuffer from MaxRecordLength and layers a little-endian stream,
+/// a writer and the record mapping on top of it; \p Allocator backs Storage.
+SymbolSerializer::SymbolSerializer(BumpPtrAllocator &Allocator)
+    : Storage(Allocator), RecordBuffer(MaxRecordLength),
+      Stream(RecordBuffer, llvm::support::little), Writer(Stream),
+      Mapping(Writer) {}
+
+/// Starts serializing \p Record: rewinds the writer to offset 0, calls
+/// writeRecordPrefix for the record's kind and then begins the mapping.
+Error SymbolSerializer::visitSymbolBegin(CVSymbol &Record) {
+  assert(!CurrentSymbol.hasValue() && "Already in a symbol mapping!");
+
+  Writer.setOffset(0);
+
+  if (auto EC = writeRecordPrefix(Record.kind()))
+    return EC;
+
+  CurrentSymbol = Record.kind();
+  if (auto EC = Mapping.visitSymbolBegin(Record))
+    return EC;
+
+  return Error::success();
+}
+
+/// Finishes the current record: patches its length field, copies the bytes
+/// into storage owned by Storage and points \p Record at that copy.
+Error SymbolSerializer::visitSymbolEnd(CVSymbol &Record) {
+  assert(CurrentSymbol.hasValue() && "Not in a symbol mapping!");
+
+  if (auto EC = Mapping.visitSymbolEnd(Record))
+    return EC;
+
+  // The 16-bit length written at offset 0 excludes its own two bytes.
+  uint32_t RecordEnd = Writer.getOffset();
+  uint16_t Length = RecordEnd - 2;
+  Writer.setOffset(0);
+  if (auto EC = Writer.writeInteger(Length))
+    return EC;
+
+  // The writer is rewound for the next symbol, so hand back a stable copy.
+  uint8_t *StableStorage = Storage.Allocate<uint8_t>(RecordEnd);
+  std::memcpy(StableStorage, &RecordBuffer[0], RecordEnd);
+  Record.RecordData = ArrayRef<uint8_t>(StableStorage, RecordEnd);
+  CurrentSymbol.reset();
+
+  return Error::success();
+}
diff --git a/lib/DebugInfo/CodeView/TypeDatabase.cpp b/lib/DebugInfo/CodeView/TypeDatabase.cpp
index aec9e2d904fbe1c7dbdde5815a3d30baf45b75c2..f9ded6ce2a86a8c24684d53192fb0ae6ddbe3211 100644
--- a/lib/DebugInfo/CodeView/TypeDatabase.cpp
+++ b/lib/DebugInfo/CodeView/TypeDatabase.cpp
@@ -106,6 +106,14 @@ StringRef TypeDatabase::getTypeName(TypeIndex Index) const {
   return "<unknown UDT>";
 }
 
+/// Returns the record stored for the non-simple type index \p Index.
+const CVType &TypeDatabase::getTypeRecord(TypeIndex Index) const {
+  // Only non-simple indices map to a slot; guard the subtraction below.
+  assert(!Index.isSimple() && "simple type indices have no stored record");
+  return TypeRecords[Index.getIndex() - TypeIndex::FirstNonSimpleIndex];
+}
+
 bool TypeDatabase::containsTypeIndex(TypeIndex Index) const {
   uint32_t I = Index.getIndex() - TypeIndex::FirstNonSimpleIndex;
   return I < CVUDTNames.size();
diff --git a/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp b/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp
index d9d5639021826f94738ef2522d74705ec93f4fc1..c234afd2288bdfa862033d6bf7585874f940f0ef 100644
--- a/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp
+++ b/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp
@@ -83,6 +83,22 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ArgListRecord &Args) {
   return Error::success();
 }
 
+// Builds the list's name by quoting each entry's name, e.g. "a" "b".
+Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR,
+                                            StringListRecord &Strings) {
+  SmallString<256> Quoted("\"");
+  StringRef Separator;
+  for (TypeIndex Entry : Strings.getIndices()) {
+    // After the first entry, close the previous quote and open a new one.
+    Quoted.append(Separator);
+    Quoted.append(TypeDB.getTypeName(Entry));
+    Separator = "\" \"";
+  }
+  Quoted.push_back('\"');
+  Name = TypeDB.saveTypeName(Quoted);
+  return Error::success();
+}
+
 Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ClassRecord &Class) {
   Name = Class.getName();
   return Error::success();
@@ -283,6 +299,10 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, BuildInfoRecord &BI) {
   return Error::success();
 }
 
+Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, LabelRecord &R) {
+  return Error::success();
+}
+
 Error TypeDatabaseVisitor::visitKnownMember(CVMemberRecord &CVR,
                                             VFPtrRecord &VFP) {
   return Error::success();
diff --git a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
index 033585ba8cc9b317158e3a556583e81e4bda8d05..870d95221e7d0f0549d7dc6f144cc531c3f64f50 100644
--- a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
+++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp
@@ -1,5 +1,4 @@
-//===-- TypeDumpVisitor.cpp - CodeView type info dumper -----------*- C++
-//-*-===//
+//===-- TypeDumpVisitor.cpp - CodeView type info dumper ----------*- C++-*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -13,13 +12,15 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/DebugInfo/CodeView/CVTypeDumper.h"
 #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/Formatters.h"
 #include "llvm/DebugInfo/CodeView/TypeDatabase.h"
 #include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
 #include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/ScopedPrinter.h"
 
 using namespace llvm;
@@ -145,6 +146,10 @@ static const EnumEntry<uint8_t> FunctionOptionEnum[] = {
     ENUM_ENTRY(FunctionOptions, ConstructorWithVirtualBases),
 };
 
+static const EnumEntry<uint16_t> LabelTypeEnum[] = {
+    ENUM_ENTRY(LabelType, Near), ENUM_ENTRY(LabelType, Far),
+};
+
 #undef ENUM_ENTRY
 
 static StringRef getLeafTypeName(TypeLeafKind LT) {
@@ -163,9 +168,14 @@ void TypeDumpVisitor::printTypeIndex(StringRef FieldName, TypeIndex TI) const {
   CVTypeDumper::printTypeIndex(*W, FieldName, TI, TypeDB);
 }
 
+void TypeDumpVisitor::printItemIndex(StringRef FieldName, TypeIndex TI) const {
+  CVTypeDumper::printTypeIndex(*W, FieldName, TI, getSourceDB());
+}
+
 Error TypeDumpVisitor::visitTypeBegin(CVType &Record) {
   W->startLine() << getLeafTypeName(Record.Type);
-  W->getOStream() << " (" << HexNumber(TypeDB.getNextTypeIndex().getIndex())
+  W->getOStream() << " ("
+                  << HexNumber(getSourceDB().getNextTypeIndex().getIndex())
                   << ")";
   W->getOStream() << " {\n";
   W->indent();
@@ -211,7 +221,7 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
 }
 
 Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, StringIdRecord &String) {
-  printTypeIndex("Id", String.getId());
+  printItemIndex("Id", String.getId());
   W->printString("StringData", String.getString());
   return Error::success();
 }
@@ -227,6 +237,17 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, ArgListRecord &Args) {
   return Error::success();
 }
 
+// Dumps a string list: the entry count, then each entry as an item index.
+Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, StringListRecord &Strs) {
+  auto Indices = Strs.getIndices();
+  W->printNumber("NumStrings", uint32_t(Indices.size()));
+  ListScope Arguments(*W, "Strings");
+  // NOTE(review): entries are expected to be string-id (item) records; confirm.
+  for (TypeIndex StrIdx : Indices)
+    printItemIndex("String", StrIdx);
+  return Error::success();
+}
+
 Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, ClassRecord &Class) {
   uint16_t Props = static_cast<uint16_t>(Class.getOptions());
   W->printNumber("MemberCount", Class.getMemberCount());
@@ -329,14 +350,14 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
 }
 
 Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, FuncIdRecord &Func) {
-  printTypeIndex("ParentScope", Func.getParentScope());
+  printItemIndex("ParentScope", Func.getParentScope());
   printTypeIndex("FunctionType", Func.getFunctionType());
   W->printString("Name", Func.getName());
   return Error::success();
 }
 
 Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, TypeServer2Record &TS) {
-  W->printBinary("Signature", TS.getGuid());
+  W->printString("Guid", formatv("{0}", fmt_guid(TS.getGuid())).str());
   W->printNumber("Age", TS.getAge());
   W->printString("Name", TS.getName());
   return Error::success();
@@ -390,7 +411,7 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
 Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
                                         UdtSourceLineRecord &Line) {
   printTypeIndex("UDT", Line.getUDT());
-  printTypeIndex("SourceFile", Line.getSourceFile());
+  printItemIndex("SourceFile", Line.getSourceFile());
   W->printNumber("LineNumber", Line.getLineNumber());
   return Error::success();
 }
@@ -398,7 +419,7 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
 Error TypeDumpVisitor::visitKnownRecord(CVType &CVR,
                                         UdtModSourceLineRecord &Line) {
   printTypeIndex("UDT", Line.getUDT());
-  printTypeIndex("SourceFile", Line.getSourceFile());
+  printItemIndex("SourceFile", Line.getSourceFile());
   W->printNumber("LineNumber", Line.getLineNumber());
   W->printNumber("Module", Line.getModule());
   return Error::success();
@@ -409,7 +430,7 @@ Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, BuildInfoRecord &Args) {
 
   ListScope Arguments(*W, "Arguments");
   for (auto Arg : Args.getArgs()) {
-    printTypeIndex("ArgType", Arg);
+    printItemIndex("ArgType", Arg);
   }
   return Error::success();
 }
@@ -530,3 +551,8 @@ Error TypeDumpVisitor::visitKnownMember(CVMemberRecord &CVR,
   printTypeIndex("ContinuationIndex", Cont.getContinuationIndex());
   return Error::success();
 }
+
+Error TypeDumpVisitor::visitKnownRecord(CVType &CVR, LabelRecord &LR) {
+  W->printEnum("Mode", uint16_t(LR.Mode), makeArrayRef(LabelTypeEnum));
+  return Error::success();
+}
diff --git a/lib/DebugInfo/CodeView/TypeRecord.cpp b/lib/DebugInfo/CodeView/TypeRecord.cpp
deleted file mode 100644
index b951c068ca8644a521ba62f29776af7fed3cfc63..0000000000000000000000000000000000000000
--- a/lib/DebugInfo/CodeView/TypeRecord.cpp
+++ /dev/null
@@ -1,213 +0,0 @@
-//===-- TypeRecord.cpp ------------------------------------------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/CodeView/RecordSerialization.h"
-#include "llvm/DebugInfo/CodeView/TypeIndex.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-
-using namespace llvm;
-using namespace llvm::codeview;
-
-//===----------------------------------------------------------------------===//
-// Type index remapping
-//===----------------------------------------------------------------------===//
-
-static bool remapIndex(ArrayRef<TypeIndex> IndexMap, TypeIndex &Idx) {
-  // Simple types are unchanged.
-  if (Idx.isSimple())
-    return true;
-  unsigned MapPos = Idx.getIndex() - TypeIndex::FirstNonSimpleIndex;
-  if (MapPos < IndexMap.size()) {
-    Idx = IndexMap[MapPos];
-    return true;
-  }
-
-  // This type index is invalid. Remap this to "not translated by cvpack",
-  // and return failure.
-  Idx = TypeIndex(SimpleTypeKind::NotTranslated, SimpleTypeMode::Direct);
-  return false;
-}
-
-bool ModifierRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  return remapIndex(IndexMap, ModifiedType);
-}
-
-bool ProcedureRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  bool Success = true;
-  Success &= remapIndex(IndexMap, ReturnType);
-  Success &= remapIndex(IndexMap, ArgumentList);
-  return Success;
-}
-
-bool MemberFunctionRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  bool Success = true;
-  Success &= remapIndex(IndexMap, ReturnType);
-  Success &= remapIndex(IndexMap, ClassType);
-  Success &= remapIndex(IndexMap, ThisType);
-  Success &= remapIndex(IndexMap, ArgumentList);
-  return Success;
-}
-
-bool MemberFuncIdRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  bool Success = true;
-  Success &= remapIndex(IndexMap, ClassType);
-  Success &= remapIndex(IndexMap, FunctionType);
-  return Success;
-}
-
-bool ArgListRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  bool Success = true;
-  for (TypeIndex &Str : StringIndices)
-    Success &= remapIndex(IndexMap, Str);
-  return Success;
-}
-
-bool MemberPointerInfo::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  return remapIndex(IndexMap, ContainingType);
-}
-
-bool PointerRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  bool Success = true;
-  Success &= remapIndex(IndexMap, ReferentType);
-  if (isPointerToMember())
-    Success &= MemberInfo->remapTypeIndices(IndexMap);
-  return Success;
-}
-
-bool NestedTypeRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  return remapIndex(IndexMap, Type);
-}
-
-bool ArrayRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  bool Success = true;
-  Success &= remapIndex(IndexMap, ElementType);
-  Success &= remapIndex(IndexMap, IndexType);
-  return Success;
-}
-
-bool TagRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  return remapIndex(IndexMap, FieldList);
-}
-
-bool ClassRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  bool Success = true;
-  Success &= TagRecord::remapTypeIndices(IndexMap);
-  Success &= remapIndex(IndexMap, DerivationList);
-  Success &= remapIndex(IndexMap, VTableShape);
-  return Success;
-}
-
-bool EnumRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  bool Success = true;
-  Success &= TagRecord::remapTypeIndices(IndexMap);
-  Success &= remapIndex(IndexMap, UnderlyingType);
-  return Success;
-}
-
-bool BitFieldRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  return remapIndex(IndexMap, Type);
-}
-
-bool VFTableShapeRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  return true;
-}
-
-bool TypeServer2Record::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  return true;
-}
-
-bool StringIdRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  return remapIndex(IndexMap, Id);
-}
-
-bool FuncIdRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  bool Success = true;
-  Success &= remapIndex(IndexMap, ParentScope);
-  Success &= remapIndex(IndexMap, FunctionType);
-  return Success;
-}
-
-bool UdtSourceLineRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  bool Success = true;
-  Success &= remapIndex(IndexMap, UDT);
-  Success &= remapIndex(IndexMap, SourceFile);
-  return Success;
-}
-
-bool UdtModSourceLineRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  bool Success = true;
-  Success &= remapIndex(IndexMap, UDT);
-  Success &= remapIndex(IndexMap, SourceFile);
-  return Success;
-}
-
-bool BuildInfoRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  bool Success = true;
-  for (TypeIndex &Arg : ArgIndices)
-    Success &= remapIndex(IndexMap, Arg);
-  return Success;
-}
-
-bool VFTableRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  bool Success = true;
-  Success &= remapIndex(IndexMap, CompleteClass);
-  Success &= remapIndex(IndexMap, OverriddenVFTable);
-  return Success;
-}
-
-bool OneMethodRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  bool Success = true;
-  Success &= remapIndex(IndexMap, Type);
-  return Success;
-}
-
-bool MethodOverloadListRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  bool Success = true;
-  for (OneMethodRecord &Meth : Methods)
-    if ((Success = Meth.remapTypeIndices(IndexMap)))
-      return Success;
-  return Success;
-}
-
-bool OverloadedMethodRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  return remapIndex(IndexMap, MethodList);
-}
-
-bool DataMemberRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  return remapIndex(IndexMap, Type);
-}
-
-bool StaticDataMemberRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  return remapIndex(IndexMap, Type);
-}
-
-bool EnumeratorRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  return true;
-}
-
-bool VFPtrRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  return remapIndex(IndexMap, Type);
-}
-
-bool BaseClassRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  return remapIndex(IndexMap, Type);
-}
-
-bool VirtualBaseClassRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  bool Success = true;
-  Success &= remapIndex(IndexMap, BaseType);
-  Success &= remapIndex(IndexMap, VBPtrType);
-  return Success;
-}
-
-bool ListContinuationRecord::remapTypeIndices(ArrayRef<TypeIndex> IndexMap) {
-  return remapIndex(IndexMap, ContinuationIndex);
-}
diff --git a/lib/DebugInfo/CodeView/TypeRecordMapping.cpp b/lib/DebugInfo/CodeView/TypeRecordMapping.cpp
index f46e08d5542913bad2b02b3cd8884f4ba35ffa55..114f6fd2897e711d7f36c9b0be0aeaefb16138ff 100644
--- a/lib/DebugInfo/CodeView/TypeRecordMapping.cpp
+++ b/lib/DebugInfo/CodeView/TypeRecordMapping.cpp
@@ -67,12 +67,9 @@ static Error mapNameAndUniqueName(CodeViewRecordIO &IO, StringRef &Name,
       error(IO.mapStringZ(N));
       error(IO.mapStringZ(U));
     } else {
-      size_t BytesNeeded = Name.size() + 1;
-      StringRef N = Name;
-      if (BytesNeeded > BytesLeft) {
-        size_t BytesToDrop = std::min(N.size(), BytesToDrop);
-        N = N.drop_back(BytesToDrop);
-      }
+      // Cap the length of the string at however many bytes we have available,
+      // less one for the required null terminator.
+      auto N = StringRef(Name).take_front(BytesLeft - 1);
       error(IO.mapStringZ(N));
     }
   } else {
@@ -173,6 +170,15 @@ Error TypeRecordMapping::visitKnownRecord(CVType &CVR,
 }
 
 Error TypeRecordMapping::visitKnownRecord(CVType &CVR, ArgListRecord &Record) {
+  error(IO.mapVectorN<uint32_t>(
+      Record.ArgIndices,
+      [](CodeViewRecordIO &IO, TypeIndex &N) { return IO.mapInteger(N); }));
+
+  return Error::success();
+}
+
+Error TypeRecordMapping::visitKnownRecord(CVType &CVR,
+                                          StringListRecord &Record) {
   error(IO.mapVectorN<uint32_t>(
       Record.StringIndices,
       [](CodeViewRecordIO &IO, TypeIndex &N) { return IO.mapInteger(N); }));
@@ -368,6 +374,14 @@ Error TypeRecordMapping::visitKnownRecord(CVType &CVR,
 
 Error TypeRecordMapping::visitKnownRecord(CVType &CVR,
                                           TypeServer2Record &Record) {
+  error(IO.mapGuid(Record.Guid));
+  error(IO.mapInteger(Record.Age));
+  error(IO.mapStringZ(Record.Name));
+  return Error::success();
+}
+
+Error TypeRecordMapping::visitKnownRecord(CVType &CVR, LabelRecord &Record) {
+  error(IO.mapEnum(Record.Mode));
   return Error::success();
 }
 
diff --git a/lib/DebugInfo/CodeView/TypeSerializer.cpp b/lib/DebugInfo/CodeView/TypeSerializer.cpp
index f24fcff8627401dc28e95c439fcf7efa2be468c0..fd4d1853fa544f047c244a48925ffacaa9e86545 100644
--- a/lib/DebugInfo/CodeView/TypeSerializer.cpp
+++ b/lib/DebugInfo/CodeView/TypeSerializer.cpp
@@ -9,7 +9,7 @@
 
 #include "llvm/DebugInfo/CodeView/TypeSerializer.h"
 
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
+#include "llvm/Support/BinaryStreamWriter.h"
 
 #include <string.h>
 
@@ -85,7 +85,8 @@ TypeSerializer::addPadding(MutableArrayRef<uint8_t> Record) {
 
 TypeSerializer::TypeSerializer(BumpPtrAllocator &Storage)
     : RecordStorage(Storage), LastTypeIndex(),
-      RecordBuffer(MaxRecordLength * 2), Stream(RecordBuffer), Writer(Stream),
+      RecordBuffer(MaxRecordLength * 2),
+      Stream(RecordBuffer, llvm::support::little), Writer(Stream),
       Mapping(Writer) {
   // RecordBuffer needs to be able to hold enough data so that if we are 1
   // byte short of MaxRecordLen, and then we try to write MaxRecordLen bytes,
@@ -203,15 +204,15 @@ Error TypeSerializer::visitMemberEnd(CVMemberRecord &Record) {
 
     uint8_t *SegmentBytes = RecordStorage.Allocate<uint8_t>(LengthWithSize);
     auto SavedSegment = MutableArrayRef<uint8_t>(SegmentBytes, LengthWithSize);
-    msf::MutableByteStream CS(SavedSegment);
-    msf::StreamWriter CW(CS);
+    MutableBinaryByteStream CS(SavedSegment, llvm::support::little);
+    BinaryStreamWriter CW(CS);
     if (auto EC = CW.writeBytes(CopyData))
       return EC;
     if (auto EC = CW.writeEnum(TypeLeafKind::LF_INDEX))
       return EC;
-    if (auto EC = CW.writeInteger(uint16_t(0)))
+    if (auto EC = CW.writeInteger<uint16_t>(0))
       return EC;
-    if (auto EC = CW.writeInteger(uint32_t(0xB0C0B0C0)))
+    if (auto EC = CW.writeInteger<uint32_t>(0xB0C0B0C0))
       return EC;
     FieldListSegments.push_back(SavedSegment);
 
diff --git a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
index ed6cf5743a1255d1e4abb854f05b3af533796703..aad20ae6dda16689e0ce0b968c86d73c2c658278 100644
--- a/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
+++ b/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
@@ -52,12 +52,19 @@ namespace {
 /// - If the type record already exists in the destination stream, discard it
 ///   and update the type index map to forward the source type index to the
 ///   existing destination type index.
+///
+/// As an additional complication, type stream merging actually produces two
+/// streams: an item (or IPI) stream and a type stream, as this is what is
+/// actually stored in the final PDB. We choose which records go where by
+/// looking at the record kind.
 class TypeStreamMerger : public TypeVisitorCallbacks {
 public:
-  TypeStreamMerger(TypeTableBuilder &DestStream)
-      : DestStream(DestStream), FieldListBuilder(DestStream) {
-    assert(!hadError());
-  }
+  TypeStreamMerger(TypeTableBuilder &DestIdStream,
+                   TypeTableBuilder &DestTypeStream, TypeServerHandler *Handler)
+      : DestIdStream(DestIdStream), DestTypeStream(DestTypeStream),
+        FieldListBuilder(DestTypeStream), Handler(Handler) {}
+
+  static const TypeIndex Untranslated;
 
 /// TypeVisitorCallbacks overrides.
 #define TYPE_RECORD(EnumName, EnumVal, Name)                                   \
@@ -74,42 +81,65 @@ public:
   Error visitTypeEnd(CVType &Record) override;
   Error visitMemberEnd(CVMemberRecord &Record) override;
 
-  bool mergeStream(const CVTypeArray &Types);
+  Error mergeStream(const CVTypeArray &Types);
 
 private:
+  void addMapping(TypeIndex Idx);
+
+  bool remapIndex(TypeIndex &Idx);
+
+  size_t slotForIndex(TypeIndex Idx) const {
+    assert(!Idx.isSimple() && "simple type indices have no slots");
+    return Idx.getIndex() - TypeIndex::FirstNonSimpleIndex;
+  }
+
+  Error errorCorruptRecord() const {
+    return llvm::make_error<CodeViewError>(cv_error_code::corrupt_record);
+  }
+
   template <typename RecordType>
-  Error visitKnownRecordImpl(RecordType &Record) {
-    FoundBadTypeIndex |= !Record.remapTypeIndices(IndexMap);
-    IndexMap.push_back(DestStream.writeKnownType(Record));
+  Error writeRecord(RecordType &R, bool RemapSuccess) {
+    TypeIndex DestIdx = Untranslated;
+    if (RemapSuccess)
+      DestIdx = DestTypeStream.writeKnownType(R);
+    addMapping(DestIdx);
     return Error::success();
   }
 
-  Error visitKnownRecordImpl(FieldListRecord &Record) {
-    CVTypeVisitor Visitor(*this);
-
-    if (auto EC = Visitor.visitFieldListMemberStream(Record.Data))
-      return EC;
+  template <typename RecordType>
+  Error writeIdRecord(RecordType &R, bool RemapSuccess) {
+    TypeIndex DestIdx = Untranslated;
+    if (RemapSuccess)
+      DestIdx = DestIdStream.writeKnownType(R);
+    addMapping(DestIdx);
     return Error::success();
   }
 
   template <typename RecordType>
-  Error visitKnownMemberRecordImpl(RecordType &Record) {
-    FoundBadTypeIndex |= !Record.remapTypeIndices(IndexMap);
-    FieldListBuilder.writeMemberType(Record);
+  Error writeMember(RecordType &R, bool RemapSuccess) {
+    if (RemapSuccess)
+      FieldListBuilder.writeMemberType(R);
+    else
+      HadUntranslatedMember = true;
     return Error::success();
   }
 
-  bool hadError() { return FoundBadTypeIndex; }
+  Optional<Error> LastError;
+
+  bool IsSecondPass = false;
+
+  bool HadUntranslatedMember = false;
 
-  bool FoundBadTypeIndex = false;
+  unsigned NumBadIndices = 0;
 
   BumpPtrAllocator Allocator;
 
-  TypeTableBuilder &DestStream;
+  TypeTableBuilder &DestIdStream;
+  TypeTableBuilder &DestTypeStream;
   FieldListRecordBuilder FieldListBuilder;
+  TypeServerHandler *Handler;
 
-  bool IsInFieldList{false};
-  size_t BeginIndexMapSize = 0;
+  TypeIndex CurIndex{TypeIndex::FirstNonSimpleIndex};
 
   /// Map from source type index to destination type index. Indexed by source
   /// type index minus 0x1000.
@@ -118,70 +148,346 @@ private:
 
 } // end anonymous namespace
 
+const TypeIndex TypeStreamMerger::Untranslated(SimpleTypeKind::NotTranslated);
+
 Error TypeStreamMerger::visitTypeBegin(CVRecord<TypeLeafKind> &Rec) {
-  if (Rec.Type == TypeLeafKind::LF_FIELDLIST) {
-    assert(!IsInFieldList);
-    IsInFieldList = true;
-    FieldListBuilder.begin();
-  } else
-    BeginIndexMapSize = IndexMap.size();
   return Error::success();
 }
 
 Error TypeStreamMerger::visitTypeEnd(CVRecord<TypeLeafKind> &Rec) {
-  if (Rec.Type == TypeLeafKind::LF_FIELDLIST) {
-    TypeIndex Index = FieldListBuilder.end();
-    IndexMap.push_back(Index);
-    IsInFieldList = false;
-  }
+  CurIndex = TypeIndex(CurIndex.getIndex() + 1);
+  if (!IsSecondPass)
+    assert(IndexMap.size() == slotForIndex(CurIndex) &&
+           "visitKnownRecord should add one index map entry");
   return Error::success();
 }
 
 Error TypeStreamMerger::visitMemberEnd(CVMemberRecord &Rec) {
-  assert(IndexMap.size() == BeginIndexMapSize + 1);
   return Error::success();
 }
 
-#define TYPE_RECORD(EnumName, EnumVal, Name)                                   \
-  Error TypeStreamMerger::visitKnownRecord(CVType &CVR,                        \
-                                           Name##Record &Record) {             \
-    return visitKnownRecordImpl(Record);                                       \
+void TypeStreamMerger::addMapping(TypeIndex Idx) {
+  if (!IsSecondPass) {
+    assert(IndexMap.size() == slotForIndex(CurIndex) &&
+           "visitKnownRecord should add one index map entry");
+    IndexMap.push_back(Idx);
+  } else {
+    assert(slotForIndex(CurIndex) < IndexMap.size());
+    IndexMap[slotForIndex(CurIndex)] = Idx;
   }
-#define TYPE_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
-#define MEMBER_RECORD(EnumName, EnumVal, Name)                                 \
-  Error TypeStreamMerger::visitKnownMember(CVMemberRecord &CVR,                \
-                                           Name##Record &Record) {             \
-    return visitKnownMemberRecordImpl(Record);                                 \
+}
+
+bool TypeStreamMerger::remapIndex(TypeIndex &Idx) {
+  // Simple types are unchanged.
+  if (Idx.isSimple())
+    return true;
+
+  // Check if this type index refers to a record we've already translated
+  // successfully. If it refers to a type later in the stream or a record we
+  // had to defer, defer it until a later pass.
+  unsigned MapPos = slotForIndex(Idx);
+  if (MapPos < IndexMap.size() && IndexMap[MapPos] != Untranslated) {
+    Idx = IndexMap[MapPos];
+    return true;
   }
-#define MEMBER_RECORD_ALIAS(EnumName, EnumVal, Name, AliasName)
-#include "llvm/DebugInfo/CodeView/TypeRecords.def"
+
+  // If this is the second pass and this index isn't in the map, then it points
+  // outside the current type stream, and this is a corrupt record.
+  if (IsSecondPass && MapPos >= IndexMap.size()) {
+    // FIXME: Print a more useful error. We can give the current record and the
+    // index that we think it's pointing to.
+    LastError = joinErrors(std::move(*LastError), errorCorruptRecord());
+  }
+
+  ++NumBadIndices;
+
+  // This type index is invalid. Remap this to "not translated by cvpack",
+  // and return failure.
+  Idx = Untranslated;
+  return false;
+}
+
+//----------------------------------------------------------------------------//
+// Item records
+//----------------------------------------------------------------------------//
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, FuncIdRecord &R) {
+  bool Success = true;
+  Success &= remapIndex(R.ParentScope);
+  Success &= remapIndex(R.FunctionType);
+  return writeIdRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, MemberFuncIdRecord &R) {
+  bool Success = true;
+  Success &= remapIndex(R.ClassType);
+  Success &= remapIndex(R.FunctionType);
+  return writeIdRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, StringIdRecord &R) {
+  return writeIdRecord(R, remapIndex(R.Id));
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, StringListRecord &R) {
+  bool Success = true;
+  for (TypeIndex &Str : R.StringIndices)
+    Success &= remapIndex(Str);
+  return writeIdRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, BuildInfoRecord &R) {
+  bool Success = true;
+  for (TypeIndex &Arg : R.ArgIndices)
+    Success &= remapIndex(Arg);
+  return writeIdRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, UdtSourceLineRecord &R) {
+  bool Success = true;
+  Success &= remapIndex(R.UDT);
+  Success &= remapIndex(R.SourceFile);
+  // FIXME: Translate UdtSourceLineRecord into UdtModSourceLineRecords in the
+  // IPI stream.
+  return writeIdRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, UdtModSourceLineRecord &R) {
+  bool Success = true;
+  Success &= remapIndex(R.UDT);
+  Success &= remapIndex(R.SourceFile);
+  return writeIdRecord(R, Success);
+}
+
+//----------------------------------------------------------------------------//
+// Type records
+//----------------------------------------------------------------------------//
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, ModifierRecord &R) {
+  return writeRecord(R, remapIndex(R.ModifiedType));
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, ProcedureRecord &R) {
+  bool Success = true;
+  Success &= remapIndex(R.ReturnType);
+  Success &= remapIndex(R.ArgumentList);
+  return writeRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, MemberFunctionRecord &R) {
+  bool Success = true;
+  Success &= remapIndex(R.ReturnType);
+  Success &= remapIndex(R.ClassType);
+  Success &= remapIndex(R.ThisType);
+  Success &= remapIndex(R.ArgumentList);
+  return writeRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &Type, ArgListRecord &R) {
+  bool Success = true;
+  for (TypeIndex &Arg : R.ArgIndices)
+    Success &= remapIndex(Arg);
+  if (auto EC = writeRecord(R, Success))
+    return EC;
+  return Error::success();
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, PointerRecord &R) {
+  bool Success = true;
+  Success &= remapIndex(R.ReferentType);
+  if (R.isPointerToMember())
+    Success &= remapIndex(R.MemberInfo->ContainingType);
+  return writeRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, ArrayRecord &R) {
+  bool Success = true;
+  Success &= remapIndex(R.ElementType);
+  Success &= remapIndex(R.IndexType);
+  return writeRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, ClassRecord &R) {
+  bool Success = true;
+  Success &= remapIndex(R.FieldList);
+  Success &= remapIndex(R.DerivationList);
+  Success &= remapIndex(R.VTableShape);
+  return writeRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, UnionRecord &R) {
+  return writeRecord(R, remapIndex(R.FieldList));
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, EnumRecord &R) {
+  bool Success = true;
+  Success &= remapIndex(R.FieldList);
+  Success &= remapIndex(R.UnderlyingType);
+  return writeRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, BitFieldRecord &R) {
+  return writeRecord(R, remapIndex(R.Type));
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, VFTableShapeRecord &R) {
+  return writeRecord(R, true);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, TypeServer2Record &R) {
+  return writeRecord(R, true);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, LabelRecord &R) {
+  return writeRecord(R, true);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, VFTableRecord &R) {
+  bool Success = true;
+  Success &= remapIndex(R.CompleteClass);
+  Success &= remapIndex(R.OverriddenVFTable);
+  return writeRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &,
+                                         MethodOverloadListRecord &R) {
+  bool Success = true;
+  for (OneMethodRecord &Meth : R.Methods)
+    Success &= remapIndex(Meth.Type);
+  return writeRecord(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownRecord(CVType &, FieldListRecord &R) {
+  // Visit the members inside the field list.
+  HadUntranslatedMember = false;
+  FieldListBuilder.begin();
+  CVTypeVisitor Visitor(*this);
+  if (auto EC = Visitor.visitFieldListMemberStream(R.Data))
+    return EC;
+
+  // Write the record if we translated all field list members.
+  TypeIndex DestIdx = Untranslated;
+  if (!HadUntranslatedMember)
+    DestIdx = FieldListBuilder.end();
+  else
+    FieldListBuilder.reset();
+  addMapping(DestIdx);
+
+  return Error::success();
+}
+
+//----------------------------------------------------------------------------//
+// Member records
+//----------------------------------------------------------------------------//
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &,
+                                         NestedTypeRecord &R) {
+  return writeMember(R, remapIndex(R.Type));
+}
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, OneMethodRecord &R) {
+  bool Success = true;
+  Success &= remapIndex(R.Type);
+  return writeMember(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &,
+                                         OverloadedMethodRecord &R) {
+  return writeMember(R, remapIndex(R.MethodList));
+}
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &,
+                                         DataMemberRecord &R) {
+  return writeMember(R, remapIndex(R.Type));
+}
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &,
+                                         StaticDataMemberRecord &R) {
+  return writeMember(R, remapIndex(R.Type));
+}
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &,
+                                         EnumeratorRecord &R) {
+  return writeMember(R, true);
+}
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, VFPtrRecord &R) {
+  return writeMember(R, remapIndex(R.Type));
+}
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &, BaseClassRecord &R) {
+  return writeMember(R, remapIndex(R.Type));
+}
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &,
+                                         VirtualBaseClassRecord &R) {
+  bool Success = true;
+  Success &= remapIndex(R.BaseType);
+  Success &= remapIndex(R.VBPtrType);
+  return writeMember(R, Success);
+}
+
+Error TypeStreamMerger::visitKnownMember(CVMemberRecord &,
+                                         ListContinuationRecord &R) {
+  return writeMember(R, remapIndex(R.ContinuationIndex));
+}
 
 Error TypeStreamMerger::visitUnknownType(CVType &Rec) {
   // We failed to translate a type. Translate this index as "not translated".
-  IndexMap.push_back(
-      TypeIndex(SimpleTypeKind::NotTranslated, SimpleTypeMode::Direct));
-  return llvm::make_error<CodeViewError>(cv_error_code::corrupt_record);
+  addMapping(TypeIndex(SimpleTypeKind::NotTranslated));
+  return errorCorruptRecord();
 }
 
-bool TypeStreamMerger::mergeStream(const CVTypeArray &Types) {
+Error TypeStreamMerger::mergeStream(const CVTypeArray &Types) {
   assert(IndexMap.empty());
   TypeVisitorCallbackPipeline Pipeline;
+  LastError = Error::success();
 
   TypeDeserializer Deserializer;
   Pipeline.addCallbackToPipeline(Deserializer);
   Pipeline.addCallbackToPipeline(*this);
 
   CVTypeVisitor Visitor(Pipeline);
+  if (Handler)
+    Visitor.addTypeServerHandler(*Handler);
+
+  if (auto EC = Visitor.visitTypeStream(Types))
+    return EC;
+
+  // If we found bad indices but no other errors, try doing another pass and see
+  // if we can resolve the indices that weren't in the map on the first pass.
+  // This may require multiple passes, but we should always make progress. MASM
+  // is the only known CodeView producer that makes type streams that aren't
+  // topologically sorted. The standard library contains MASM-produced objects,
+  // so this is important to handle correctly, but we don't have to be too
+  // efficient. MASM type streams are usually very small.
+  while (!*LastError && NumBadIndices > 0) {
+    unsigned BadIndicesRemaining = NumBadIndices;
+    IsSecondPass = true;
+    NumBadIndices = 0;
+    CurIndex = TypeIndex(TypeIndex::FirstNonSimpleIndex);
+    if (auto EC = Visitor.visitTypeStream(Types))
+      return EC;
 
-  if (auto EC = Visitor.visitTypeStream(Types)) {
-    consumeError(std::move(EC));
-    return false;
+    assert(NumBadIndices <= BadIndicesRemaining &&
+           "second pass found more bad indices");
+    if (!*LastError && NumBadIndices == BadIndicesRemaining) {
+      return llvm::make_error<CodeViewError>(
+          cv_error_code::corrupt_record, "input type graph contains cycles");
+    }
   }
+
   IndexMap.clear();
-  return !hadError();
+
+  Error Ret = std::move(*LastError);
+  LastError.reset();
+  return Ret;
 }
 
-bool llvm::codeview::mergeTypeStreams(TypeTableBuilder &DestStream,
-                                      const CVTypeArray &Types) {
-  return TypeStreamMerger(DestStream).mergeStream(Types);
+Error llvm::codeview::mergeTypeStreams(TypeTableBuilder &DestIdStream,
+                                       TypeTableBuilder &DestTypeStream,
+                                       TypeServerHandler *Handler,
+                                       const CVTypeArray &Types) {
+  return TypeStreamMerger(DestIdStream, DestTypeStream, Handler)
+      .mergeStream(Types);
 }
diff --git a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
index 08bc74a81e9afea1f21607f5e264cc5e04650f88..e7b4b777b43fae386e1445dd6fdaee3ccf7b7572 100644
--- a/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
+++ b/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFAbbreviationDeclaration.cpp ----------------------------------===//
+//===- DWARFAbbreviationDeclaration.cpp -----------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,18 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
 #include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
 #include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/Support/DataExtractor.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cstddef>
+#include <cstdint>
+
 using namespace llvm;
 using namespace dwarf;
 
@@ -86,7 +92,6 @@ DWARFAbbreviationDeclaration::extract(DataExtractor Data,
           case DW_FORM_line_strp:
           case DW_FORM_sec_offset:
           case DW_FORM_strp_sup:
-          case DW_FORM_ref_sup:
             ++FixedAttributeSize->NumDwarfOffsets;
             break;
 
diff --git a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
index 2eac2cd50b8dd4ee841878005d639a3ae5fe314b..85e1eaedfc6138c05ec00914106f93aa0d636f38 100644
--- a/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
+++ b/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
@@ -1,4 +1,4 @@
-//===--- DWARFAcceleratorTable.cpp ----------------------------------------===//
+//===- DWARFAcceleratorTable.cpp ------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,19 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
 #include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cstddef>
+#include <cstdint>
+#include <utility>
 
-namespace llvm {
+using namespace llvm;
 
 bool DWARFAcceleratorTable::extract() {
   uint32_t Offset = 0;
@@ -131,4 +138,3 @@ LLVM_DUMP_METHOD void DWARFAcceleratorTable::dump(raw_ostream &OS) const {
     }
   }
 }
-}
diff --git a/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp b/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
index 948972f8f1361e329e4ca8ab68b1e6892e50e96b..6e550f2e9ec954ab4b6571969cd1a9b1f3ed095c 100644
--- a/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
@@ -18,8 +18,10 @@ using namespace llvm;
 void DWARFCompileUnit::dump(raw_ostream &OS) {
   OS << format("0x%08x", getOffset()) << ": Compile Unit:"
      << " length = " << format("0x%08x", getLength())
-     << " version = " << format("0x%04x", getVersion())
-     << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
+     << " version = " << format("0x%04x", getVersion());
+  if (getVersion() >= 5)
+    OS << " unit_type = " << dwarf::UnitTypeString(getUnitType());
+  OS << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
      << " addr_size = " << format("0x%02x", getAddressByteSize())
      << " (next unit at " << format("0x%08x", getNextUnitOffset())
      << ")\n";
diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp
index ab9f76a0bd5fcc0a7436f46264e42a3fc366e7ac..ce5c4ae89e88317a85001e16e1b81a730f6f66a3 100644
--- a/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFContext.cpp --------------------------------------------------===//
+//===- DWARFContext.cpp ---------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,23 +7,45 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h"
+#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugAranges.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugMacro.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugPubTable.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
+#include "llvm/DebugInfo/DWARF/DWARFDie.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/DebugInfo/DWARF/DWARFGdbIndex.h"
+#include "llvm/DebugInfo/DWARF/DWARFSection.h"
 #include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
 #include "llvm/Object/Decompressor.h"
 #include "llvm/Object/MachO.h"
+#include "llvm/Object/ObjectFile.h"
 #include "llvm/Object/RelocVisitor.h"
-#include "llvm/Support/Compression.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/Format.h"
-#include "llvm/Support/Path.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
+
 using namespace llvm;
 using namespace dwarf;
 using namespace object;
@@ -439,23 +461,32 @@ DWARFCompileUnit *DWARFContext::getCompileUnitForAddress(uint64_t Address) {
   return getCompileUnitForOffset(CUOffset);
 }
 
-static bool getFunctionNameForAddress(DWARFCompileUnit *CU, uint64_t Address,
-                                      FunctionNameKind Kind,
-                                      std::string &FunctionName) {
-  if (Kind == FunctionNameKind::None)
-    return false;
+static bool getFunctionNameAndStartLineForAddress(DWARFCompileUnit *CU,
+                                                  uint64_t Address,
+                                                  FunctionNameKind Kind,
+                                                  std::string &FunctionName,
+                                                  uint32_t &StartLine) {
   // The address may correspond to instruction in some inlined function,
   // so we have to build the chain of inlined functions and take the
-  // name of the topmost function in it.SmallVectorImpl<DWARFDie> &InlinedChain
+  // name of the topmost function in it.
   SmallVector<DWARFDie, 4> InlinedChain;
   CU->getInlinedChainForAddress(Address, InlinedChain);
-  if (InlinedChain.size() == 0)
+  if (InlinedChain.empty())
     return false;
-  if (const char *Name = InlinedChain[0].getSubroutineName(Kind)) {
+
+  const DWARFDie &DIE = InlinedChain[0];
+  bool FoundResult = false;
+  const char *Name = nullptr;
+  if (Kind != FunctionNameKind::None && (Name = DIE.getSubroutineName(Kind))) {
     FunctionName = Name;
-    return true;
+    FoundResult = true;
+  }
+  if (auto DeclLineResult = DIE.getDeclLine()) {
+    StartLine = DeclLineResult;
+    FoundResult = true;
   }
-  return false;
+
+  return FoundResult;
 }
 
 DILineInfo DWARFContext::getLineInfoForAddress(uint64_t Address,
@@ -465,7 +496,9 @@ DILineInfo DWARFContext::getLineInfoForAddress(uint64_t Address,
   DWARFCompileUnit *CU = getCompileUnitForAddress(Address);
   if (!CU)
     return Result;
-  getFunctionNameForAddress(CU, Address, Spec.FNKind, Result.FunctionName);
+  getFunctionNameAndStartLineForAddress(CU, Address, Spec.FNKind,
+                                        Result.FunctionName,
+                                        Result.StartLine);
   if (Spec.FLIKind != FileLineInfoKind::None) {
     if (const DWARFLineTable *LineTable = getLineTableForUnit(CU))
       LineTable->getFileLineInfoForAddress(Address, CU->getCompilationDir(),
@@ -483,13 +516,16 @@ DWARFContext::getLineInfoForAddressRange(uint64_t Address, uint64_t Size,
     return Lines;
 
   std::string FunctionName = "<invalid>";
-  getFunctionNameForAddress(CU, Address, Spec.FNKind, FunctionName);
+  uint32_t StartLine = 0;
+  getFunctionNameAndStartLineForAddress(CU, Address, Spec.FNKind, FunctionName,
+                                        StartLine);
 
   // If the Specifier says we don't need FileLineInfo, just
   // return the top-most function at the starting address.
   if (Spec.FLIKind == FileLineInfoKind::None) {
     DILineInfo Result;
     Result.FunctionName = FunctionName;
+    Result.StartLine = StartLine;
     Lines.push_back(std::make_pair(Address, Result));
     return Lines;
   }
@@ -510,6 +546,7 @@ DWARFContext::getLineInfoForAddressRange(uint64_t Address, uint64_t Size,
     Result.FunctionName = FunctionName;
     Result.Line = Row.Line;
     Result.Column = Row.Column;
+    Result.StartLine = StartLine;
     Lines.push_back(std::make_pair(Row.Address, Result));
   }
 
@@ -549,6 +586,8 @@ DWARFContext::getInliningInfoForAddress(uint64_t Address,
     // Get function name if necessary.
     if (const char *Name = FunctionDIE.getSubroutineName(Spec.FNKind))
       Frame.FunctionName = Name;
+    if (auto DeclLineResult = FunctionDIE.getDeclLine())
+      Frame.StartLine = DeclLineResult;
     if (Spec.FLIKind != FileLineInfoKind::None) {
       if (i == 0) {
         // For the topmost frame, initialize the line table of this
@@ -577,6 +616,66 @@ DWARFContext::getInliningInfoForAddress(uint64_t Address,
   return InliningInfo;
 }
 
+static Error createError(const Twine &Reason, llvm::Error E) {
+  return make_error<StringError>(Reason + toString(std::move(E)),
+                                 inconvertibleErrorCode());
+}
+
+/// Returns the address of the symbol the relocation is applied against, for
+/// use in further relocation computations. The symbol's section load address
+/// is taken into account if a LoadedObjectInfo interface is provided.
+static Expected<uint64_t> getSymbolAddress(const object::ObjectFile &Obj,
+                                           const RelocationRef &Reloc,
+                                           const LoadedObjectInfo *L) {
+  uint64_t Ret = 0;
+  object::section_iterator RSec = Obj.section_end();
+  object::symbol_iterator Sym = Reloc.getSymbol();
+
+  // First calculate the address of the symbol or section as it appears
+  // in the object file
+  if (Sym != Obj.symbol_end()) {
+    Expected<uint64_t> SymAddrOrErr = Sym->getAddress();
+    if (!SymAddrOrErr)
+      return createError("error: failed to compute symbol address: ",
+                         SymAddrOrErr.takeError());
+
+    // Also remember what section this symbol is in for later
+    auto SectOrErr = Sym->getSection();
+    if (!SectOrErr)
+      return createError("error: failed to get symbol section: ",
+                         SectOrErr.takeError());
+
+    RSec = *SectOrErr;
+    Ret = *SymAddrOrErr;
+  } else if (auto *MObj = dyn_cast<MachOObjectFile>(&Obj)) {
+    RSec = MObj->getRelocationSection(Reloc.getRawDataRefImpl());
+    Ret = RSec->getAddress();
+  }
+
+  // If we are given load addresses for the sections, we need to adjust:
+  // SymAddr = (Address of Symbol Or Section in File) -
+  //           (Address of Section in File) +
+  //           (Load Address of Section)
+  // RSec is now either the section being targeted or the section
+  // containing the symbol being targeted. In either case,
+  // we need to perform the same computation.
+  if (L && RSec != Obj.section_end())
+    if (uint64_t SectionLoadAddress = L->getSectionLoadAddress(*RSec))
+      Ret += SectionLoadAddress - RSec->getAddress();
+  return Ret;
+}
+
+static bool isRelocScattered(const object::ObjectFile &Obj,
+                             const RelocationRef &Reloc) {
+  if (!isa<MachOObjectFile>(&Obj))
+    return false;
+  // MachO also has relocations that point to sections and
+  // scattered relocations.
+  const MachOObjectFile *MachObj = cast<MachOObjectFile>(&Obj);
+  auto RelocInfo = MachObj->getRelocation(Reloc.getRawDataRefImpl());
+  return MachObj->isRelocationScattered(RelocInfo);
+}
+
 DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
     const LoadedObjectInfo *L)
     : IsLittleEndian(Obj.isLittleEndian()),
@@ -682,73 +781,19 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
     if (Section.relocation_begin() != Section.relocation_end()) {
       uint64_t SectionSize = RelocatedSection->getSize();
       for (const RelocationRef &Reloc : Section.relocations()) {
-        uint64_t Address = Reloc.getOffset();
-        uint64_t Type = Reloc.getType();
-        uint64_t SymAddr = 0;
-        uint64_t SectionLoadAddress = 0;
-        object::symbol_iterator Sym = Reloc.getSymbol();
-        object::section_iterator RSec = Obj.section_end();
-
-        // First calculate the address of the symbol or section as it appears
-        // in the objct file
-        if (Sym != Obj.symbol_end()) {
-          Expected<uint64_t> SymAddrOrErr = Sym->getAddress();
-          if (!SymAddrOrErr) {
-            std::string Buf;
-            raw_string_ostream OS(Buf);
-            logAllUnhandledErrors(SymAddrOrErr.takeError(), OS, "");
-            OS.flush();
-            errs() << "error: failed to compute symbol address: "
-                   << Buf << '\n';
-            continue;
-          }
-          SymAddr = *SymAddrOrErr;
-          // Also remember what section this symbol is in for later
-          auto SectOrErr = Sym->getSection();
-          if (!SectOrErr) {
-            std::string Buf;
-            raw_string_ostream OS(Buf);
-            logAllUnhandledErrors(SectOrErr.takeError(), OS, "");
-            OS.flush();
-            errs() << "error: failed to get symbol section: "
-                   << Buf << '\n';
-            continue;
-          }
-          RSec = *SectOrErr;
-        } else if (auto *MObj = dyn_cast<MachOObjectFile>(&Obj)) {
-          // MachO also has relocations that point to sections and
-          // scattered relocations.
-          auto RelocInfo = MObj->getRelocation(Reloc.getRawDataRefImpl());
-          if (MObj->isRelocationScattered(RelocInfo)) {
-            // FIXME: it's not clear how to correctly handle scattered
-            // relocations.
-            continue;
-          } else {
-            RSec = MObj->getRelocationSection(Reloc.getRawDataRefImpl());
-            SymAddr = RSec->getAddress();
-          }
-        }
+        // FIXME: it's not clear how to correctly handle scattered
+        // relocations.
+        if (isRelocScattered(Obj, Reloc))
+          continue;
 
-        // If we are given load addresses for the sections, we need to adjust:
-        // SymAddr = (Address of Symbol Or Section in File) -
-        //           (Address of Section in File) +
-        //           (Load Address of Section)
-        if (L != nullptr && RSec != Obj.section_end()) {
-          // RSec is now either the section being targeted or the section
-          // containing the symbol being targeted. In either case,
-          // we need to perform the same computation.
-          StringRef SecName;
-          RSec->getName(SecName);
-//           llvm::dbgs() << "Name: '" << SecName
-//                        << "', RSec: " << RSec->getRawDataRefImpl()
-//                        << ", Section: " << Section.getRawDataRefImpl() << "\n";
-          SectionLoadAddress = L->getSectionLoadAddress(*RSec);
-          if (SectionLoadAddress != 0)
-            SymAddr += SectionLoadAddress - RSec->getAddress();
+        Expected<uint64_t> SymAddrOrErr = getSymbolAddress(Obj, Reloc, L);
+        if (!SymAddrOrErr) {
+          errs() << toString(SymAddrOrErr.takeError()) << '\n';
+          continue;
         }
 
         object::RelocVisitor V(Obj);
-        object::RelocToApply R(V.visit(Type, Reloc, SymAddr));
+        object::RelocToApply R(V.visit(Reloc.getType(), Reloc, *SymAddrOrErr));
         if (V.error()) {
           SmallString<32> Name;
           Reloc.getTypeName(Name);
@@ -756,7 +801,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj,
                  << Name << "\n";
           continue;
         }
-
+        uint64_t Address = Reloc.getOffset();
         if (Address + R.Width > SectionSize) {
           errs() << "error: " << R.Width << "-byte relocation starting "
                  << Address << " bytes into section " << name << " which is "
@@ -823,4 +868,4 @@ StringRef *DWARFContextInMemory::MapSectionToMember(StringRef Name) {
       .Default(nullptr);
 }
 
-void DWARFContextInMemory::anchor() { }
+void DWARFContextInMemory::anchor() {}
diff --git a/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp b/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp
index e63e28997ed06bb1f95d2e5126114f894c36a18d..76dd2e4c21bcc0f3058c3c5542e31d499437a3d5 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugAbbrev.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugAbbrev.cpp ----------------------------------------------===//
+//===- DWARFDebugAbbrev.cpp -----------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,6 +10,10 @@
 #include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cinttypes>
+#include <cstdint>
+
 using namespace llvm;
 
 DWARFAbbreviationDeclarationSet::DWARFAbbreviationDeclarationSet() {
diff --git a/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp b/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
index 67589cd01e550e3da26f8e4ce1c8e9e7a08370f2..ed5d726ae4e2bf1b5697e36cd6b68c199276fd65 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugArangeSet.cpp -------------------------------------------===//
+//===- DWARFDebugArangeSet.cpp --------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,8 +10,11 @@
 #include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
-#include <algorithm>
 #include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <cstring>
+
 using namespace llvm;
 
 void DWARFDebugArangeSet::clear() {
diff --git a/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
index 27a02c4c50d0eae7bee4a2343abbc1435f5bad81..0cf71f530446b2cafcd98da9fbdf37524e824b61 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugAranges.cpp -----------------------------------*- C++ -*-===//
+//===- DWARFDebugAranges.cpp ----------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,11 +11,13 @@
 #include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/DataExtractor.h"
 #include <algorithm>
 #include <cassert>
+#include <cstdint>
 #include <set>
+#include <vector>
+
 using namespace llvm;
 
 void DWARFDebugAranges::extract(DataExtractor DebugArangesData) {
@@ -81,7 +83,7 @@ void DWARFDebugAranges::construct() {
   std::sort(Endpoints.begin(), Endpoints.end());
   uint64_t PrevAddress = -1ULL;
   for (const auto &E : Endpoints) {
-    if (PrevAddress < E.Address && ValidCUs.size() > 0) {
+    if (PrevAddress < E.Address && !ValidCUs.empty()) {
       // If the address range between two endpoints is described by some
       // CU, first try to extend the last range in Aranges. If we can't
       // do it, start a new range.
diff --git a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
index 32b8320e26c5a002e4d51c36333dd3ac92a35914..b55ed6a468496280446d39f5a679f65d252996ae 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugFrame.h - Parsing of .debug_frame -------------*- C++ -*-===//
+//===- DWARFDebugFrame.cpp - Parsing of .debug_frame ----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Optional.h"
@@ -15,6 +14,7 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugFrame.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/DataExtractor.h"
@@ -465,8 +465,7 @@ void FrameEntry::dumpInstructions(raw_ostream &OS) const {
   }
 }
 
-DWARFDebugFrame::DWARFDebugFrame(bool IsEH) : IsEH(IsEH) {
-}
+DWARFDebugFrame::DWARFDebugFrame(bool IsEH) : IsEH(IsEH) {}
 
 DWARFDebugFrame::~DWARFDebugFrame() = default;
 
@@ -485,17 +484,17 @@ static unsigned getSizeForEncoding(const DataExtractor &Data,
   unsigned format = symbolEncoding & 0x0f;
   switch (format) {
     default: llvm_unreachable("Unknown Encoding");
-    case dwarf::DW_EH_PE_absptr:
-    case dwarf::DW_EH_PE_signed:
+    case DW_EH_PE_absptr:
+    case DW_EH_PE_signed:
       return Data.getAddressSize();
-    case dwarf::DW_EH_PE_udata2:
-    case dwarf::DW_EH_PE_sdata2:
+    case DW_EH_PE_udata2:
+    case DW_EH_PE_sdata2:
       return 2;
-    case dwarf::DW_EH_PE_udata4:
-    case dwarf::DW_EH_PE_sdata4:
+    case DW_EH_PE_udata4:
+    case DW_EH_PE_sdata4:
       return 4;
-    case dwarf::DW_EH_PE_udata8:
-    case dwarf::DW_EH_PE_sdata8:
+    case DW_EH_PE_udata8:
+    case DW_EH_PE_sdata8:
       return 8;
   }
 }
diff --git a/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
index c487e1dca7c6a9e2bf5451b600f0d6d9613530e1..35f673c7acc690f5a0203ca9f5537deabc45a16c 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugInfoEntry.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugInfoEntry.cpp -------------------------------------------===//
+//===- DWARFDebugInfoEntry.cpp --------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,20 +7,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SyntaxHighlighting.h"
-#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
-#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
 #include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
-#include "llvm/Support/DataTypes.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Dwarf.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/Support/DataExtractor.h"
+#include <cstddef>
+#include <cstdint>
+
 using namespace llvm;
 using namespace dwarf;
-using namespace syntax;
 
 bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U,
                                              uint32_t *OffsetPtr) {
@@ -28,6 +25,7 @@ bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U,
   const uint32_t UEndOffset = U.getNextUnitOffset();
   return extractFast(U, OffsetPtr, DebugInfoData, UEndOffset, 0);
 }
+
 bool DWARFDebugInfoEntry::extractFast(const DWARFUnit &U, uint32_t *OffsetPtr,
                                       const DataExtractor &DebugInfoData,
                                       uint32_t UEndOffset, uint32_t D) {
diff --git a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
index 494059461fd7759ccd49a206388814361edd9a52..e4670519b7979212cfa8fcd98f9d451639d69795 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugLine.cpp ------------------------------------------------===//
+//===- DWARFDebugLine.cpp -------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,14 +7,23 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/SmallString.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
+#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <cstdio>
+#include <utility>
+
 using namespace llvm;
 using namespace dwarf;
+
 typedef DILineInfoSpecifier::FileLineInfoKind FileLineInfoKind;
 
 DWARFDebugLine::Prologue::Prologue() { clear(); }
diff --git a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
index ae5b9d70a2eb3dff07d64dd251bce527cbeafc0b..e2799ab2d243da40dfc70a604ef86a4c7f163679 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugLoc.cpp -------------------------------------------------===//
+//===- DWARFDebugLoc.cpp --------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,10 +7,15 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/StringRef.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugLoc.h"
+#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cinttypes>
+#include <cstdint>
 
 using namespace llvm;
 
@@ -71,7 +76,7 @@ void DWARFDebugLoc::parse(DataExtractor data, unsigned AddressSize) {
     }
   }
   if (data.isValidOffset(Offset))
-    llvm::errs() << "error: failed to consume entire .debug_loc section\n";
+    errs() << "error: failed to consume entire .debug_loc section\n";
 }
 
 void DWARFDebugLocDWO::parse(DataExtractor data) {
@@ -85,8 +90,8 @@ void DWARFDebugLocDWO::parse(DataExtractor data) {
                 data.getU8(&Offset))) != dwarf::DW_LLE_end_of_list) {
 
       if (Kind != dwarf::DW_LLE_startx_length) {
-        llvm::errs() << "error: dumping support for LLE of kind " << (int)Kind
-                     << " not implemented\n";
+        errs() << "error: dumping support for LLE of kind " << (int)Kind
+               << " not implemented\n";
         return;
       }
 
@@ -123,4 +128,3 @@ void DWARFDebugLocDWO::dump(raw_ostream &OS) const {
     }
   }
 }
-
diff --git a/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp b/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp
index 7710a90b5e13e36578f805112875b070b2e66af6..e0a9adde8e58db055b98336a16140062f4b2caa2 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugMacro.cpp -----------------------------------------------===//
+//===- DWARFDebugMacro.cpp ------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,11 +7,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/DebugInfo/DWARF/DWARFDebugMacro.h"
 #include "SyntaxHighlighting.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugMacro.h"
 #include "llvm/Support/Dwarf.h"
-#include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cstdint>
 
 using namespace llvm;
 using namespace dwarf;
diff --git a/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp b/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
index 3c1fe93090c694fe703f4975e6f27952a8af815a..662e53d9d7e660c2d7599fc688797cd14d96e5f0 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugPubTable.cpp ---------------------------------------------===//
+//===- DWARFDebugPubTable.cpp ---------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/StringRef.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugPubTable.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Dwarf.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cstdint>
 
 using namespace llvm;
-using namespace llvm::dwarf;
+using namespace dwarf;
 
 DWARFDebugPubTable::DWARFDebugPubTable(StringRef Data, bool LittleEndian,
                                        bool GnuStyle)
@@ -54,7 +58,7 @@ void DWARFDebugPubTable::dump(StringRef Name, raw_ostream &OS) const {
       OS << format("0x%8.8x ", E.SecOffset);
       if (GnuStyle) {
         StringRef EntryLinkage =
-            dwarf::GDBIndexEntryLinkageString(E.Descriptor.Linkage);
+            GDBIndexEntryLinkageString(E.Descriptor.Linkage);
         StringRef EntryKind = dwarf::GDBIndexEntryKindString(E.Descriptor.Kind);
         OS << format("%-8s", EntryLinkage.data()) << ' '
            << format("%-8s", EntryKind.data()) << ' ';
diff --git a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
index d5df6885f5e9ed5ff2970446014d958edaa6ab02..f1d82fda8c06b5be6674a25dfbc0557f6167dfb0 100644
--- a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDebugRangesList.cpp ------------------------------------------===//
+//===- DWARFDebugRangeList.cpp --------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,6 +10,9 @@
 #include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cinttypes>
+#include <cstdint>
+#include <utility>
 
 using namespace llvm;
 
diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp
index 9a05ae8d38ac58e7b726b5da45b9eb1e44505c2e..4308cc2e26396bc0ff332769cb6e199fcbcef655 100644
--- a/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFDie.cpp ------------------------------------------------------===//
+//===- DWARFDie.cpp -------------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,25 +7,33 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/DebugInfo/DWARF/DWARFDie.h"
 #include "SyntaxHighlighting.h"
-#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
-#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
-#include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
+#include "llvm/DebugInfo/DWARF/DWARFDie.h"
 #include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
-#include "llvm/Support/DataTypes.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/Support/DataExtractor.h"
 #include "llvm/Support/Dwarf.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <string>
+#include <utility>
 
 using namespace llvm;
 using namespace dwarf;
 using namespace syntax;
 
-namespace {
- static void dumpApplePropertyAttribute(raw_ostream &OS, uint64_t Val) {
+static void dumpApplePropertyAttribute(raw_ostream &OS, uint64_t Val) {
   OS << " (";
   do {
     uint64_t Shift = countTrailingZeros(Val);
@@ -122,8 +130,6 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
   OS << ")\n";
 }
 
-} // end anonymous namespace
-
 bool DWARFDie::isSubprogramDIE() const {
   return getTag() == DW_TAG_subprogram;
 }
@@ -143,21 +149,6 @@ DWARFDie::find(dwarf::Attribute Attr) const {
   return None;
 }
 
-Optional<DWARFFormValue>
-DWARFDie::findRecursively(dwarf::Attribute Attr) const {
-  if (!isValid())
-    return None;
-  if (auto Value = find(Attr))
-    return Value;
-  if (auto Die = getAttributeValueAsReferencedDie(DW_AT_abstract_origin))
-    if (auto Value = Die.find(Attr))
-      return Value;
-  if (auto Die = getAttributeValueAsReferencedDie(DW_AT_specification))
-    if (auto Value = Die.find(Attr))
-      return Value;
-  return None;
-}
-
 Optional<DWARFFormValue>
 DWARFDie::find(ArrayRef<dwarf::Attribute> Attrs) const {
   if (!isValid())
@@ -176,14 +167,17 @@ Optional<DWARFFormValue>
 DWARFDie::findRecursively(ArrayRef<dwarf::Attribute> Attrs) const {
   if (!isValid())
     return None;
-  if (auto Value = find(Attrs))
+  auto Die = *this;
+  if (auto Value = Die.find(Attrs))
+    return Value;
+  if (auto D = Die.getAttributeValueAsReferencedDie(DW_AT_abstract_origin))
+    Die = D;
+  if (auto Value = Die.find(Attrs))
+    return Value;
+  if (auto D = Die.getAttributeValueAsReferencedDie(DW_AT_specification))
+    Die = D;
+  if (auto Value = Die.find(Attrs))
     return Value;
-  if (auto Die = getAttributeValueAsReferencedDie(DW_AT_abstract_origin))
-    if (auto Value = Die.find(Attrs))
-      return Value;
-  if (auto Die = getAttributeValueAsReferencedDie(DW_AT_specification))
-    if (auto Value = Die.find(Attrs))
-      return Value;
   return None;
 }
 
@@ -291,6 +285,10 @@ DWARFDie::getName(DINameKind Kind) const {
   return nullptr;
 }
 
+uint64_t DWARFDie::getDeclLine() const {
+  return toUnsigned(findRecursively(DW_AT_decl_line), 0);
+}
+
 void DWARFDie::getCallerFrame(uint32_t &CallFile, uint32_t &CallLine,
                               uint32_t &CallColumn) const {
   CallFile = toUnsigned(find(DW_AT_call_file), 0);
@@ -325,6 +323,12 @@ void DWARFDie::dump(raw_ostream &OS, unsigned RecurseDepth,
         
         // Dump all data in the DIE for the attributes.
         for (const auto &AttrSpec : AbbrevDecl->attributes()) {
+          if (AttrSpec.Form == DW_FORM_implicit_const) {
+            // We are dumping the .debug_info section; implicit_const
+            // attribute values are not stored here but in the .debug_abbrev
+            // section, so we just skip such attributes.
+            continue;
+          }
           dumpAttribute(OS, *this, &offset, AttrSpec.Attr, AttrSpec.Form,
                         Indent);
         }
@@ -346,7 +350,6 @@ void DWARFDie::dump(raw_ostream &OS, unsigned RecurseDepth,
   }
 }
 
-
 void DWARFDie::getInlinedChainForAddress(
     const uint64_t Address, SmallVectorImpl<DWARFDie> &InlinedChain) const {
   if (isNULL())
diff --git a/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/lib/DebugInfo/DWARF/DWARFFormValue.cpp
index 0bed1a1e99c23deea31c401031a44f795e3196f7..6de57b999adcc3fd249e7588efd413306c562722 100644
--- a/lib/DebugInfo/DWARF/DWARFFormValue.cpp
+++ b/lib/DebugInfo/DWARF/DWARFFormValue.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFFormValue.cpp ------------------------------------------------===//
+//===- DWARFFormValue.cpp -------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -9,16 +9,21 @@
 
 #include "SyntaxHighlighting.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
 #include "llvm/Support/Dwarf.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
-#include <cassert>
+#include <cinttypes>
+#include <cstdint>
 #include <limits>
+
 using namespace llvm;
 using namespace dwarf;
 using namespace syntax;
@@ -66,13 +71,16 @@ class FormSizeHelper {
 
 public:
   FormSizeHelper(uint16_t V, uint8_t A, llvm::dwarf::DwarfFormat F)
-  : Version(V), AddrSize(A), Format(F) {}
+      : Version(V), AddrSize(A), Format(F) {}
+
   uint8_t getAddressByteSize() const { return AddrSize; }
+
   uint8_t getRefAddrByteSize() const {
     if (Version == 2)
       return AddrSize;
     return getDwarfOffsetByteSize();
   }
+
   uint8_t getDwarfOffsetByteSize() const {
     switch (Format) {
       case dwarf::DwarfFormat::DWARF32:
@@ -120,14 +128,21 @@ static Optional<uint8_t> getFixedByteSize(dwarf::Form Form, const T *U) {
     case DW_FORM_flag:
     case DW_FORM_data1:
     case DW_FORM_ref1:
+    case DW_FORM_strx1:
+    case DW_FORM_addrx1:
       return 1;
 
     case DW_FORM_data2:
     case DW_FORM_ref2:
+    case DW_FORM_strx2:
+    case DW_FORM_addrx2:
       return 2;
 
     case DW_FORM_data4:
     case DW_FORM_ref4:
+    case DW_FORM_ref_sup4:
+    case DW_FORM_strx4:
+    case DW_FORM_addrx4:
       return 4;
 
     case DW_FORM_strp:
@@ -136,7 +151,6 @@ static Optional<uint8_t> getFixedByteSize(dwarf::Form Form, const T *U) {
     case DW_FORM_line_strp:
     case DW_FORM_sec_offset:
     case DW_FORM_strp_sup:
-    case DW_FORM_ref_sup:
       if (U)
         return U->getDwarfOffsetByteSize();
       return None;
@@ -144,6 +158,7 @@ static Optional<uint8_t> getFixedByteSize(dwarf::Form Form, const T *U) {
     case DW_FORM_data8:
     case DW_FORM_ref8:
     case DW_FORM_ref_sig8:
+    case DW_FORM_ref_sup8:
       return 8;
 
     case DW_FORM_flag_present:
@@ -211,7 +226,14 @@ static bool skipFormValue(dwarf::Form Form, const DataExtractor &DebugInfoData,
       case DW_FORM_ref4:
       case DW_FORM_ref8:
       case DW_FORM_ref_sig8:
-      case DW_FORM_ref_sup:
+      case DW_FORM_ref_sup4:
+      case DW_FORM_ref_sup8:
+      case DW_FORM_strx1:
+      case DW_FORM_strx2:
+      case DW_FORM_strx4:
+      case DW_FORM_addrx1:
+      case DW_FORM_addrx2:
+      case DW_FORM_addrx4:
       case DW_FORM_sec_offset:
       case DW_FORM_strp:
       case DW_FORM_strp_sup:
@@ -339,14 +361,21 @@ bool DWARFFormValue::extractValue(const DataExtractor &data,
     case DW_FORM_data1:
     case DW_FORM_ref1:
     case DW_FORM_flag:
+    case DW_FORM_strx1:
+    case DW_FORM_addrx1:
       Value.uval = data.getU8(offset_ptr);
       break;
     case DW_FORM_data2:
     case DW_FORM_ref2:
+    case DW_FORM_strx2:
+    case DW_FORM_addrx2:
       Value.uval = data.getU16(offset_ptr);
       break;
     case DW_FORM_data4:
-    case DW_FORM_ref4: {
+    case DW_FORM_ref4:
+    case DW_FORM_ref_sup4:
+    case DW_FORM_strx4:
+    case DW_FORM_addrx4: {
       Value.uval = data.getU32(offset_ptr);
       if (!U)
         break;
@@ -357,6 +386,7 @@ bool DWARFFormValue::extractValue(const DataExtractor &data,
     }
     case DW_FORM_data8:
     case DW_FORM_ref8:
+    case DW_FORM_ref_sup8:
       Value.uval = data.getU64(offset_ptr);
       break;
     case DW_FORM_sdata:
@@ -378,8 +408,7 @@ bool DWARFFormValue::extractValue(const DataExtractor &data,
     case DW_FORM_GNU_ref_alt:
     case DW_FORM_GNU_strp_alt:
     case DW_FORM_line_strp:
-    case DW_FORM_strp_sup:
-    case DW_FORM_ref_sup: {
+    case DW_FORM_strp_sup: {
       if (!U)
         return false;
       RelocAddrMap::const_iterator AI = U->getRelocMap()->find(*offset_ptr);
@@ -497,21 +526,18 @@ DWARFFormValue::dump(raw_ostream &OS) const {
 
   case DW_FORM_sdata:     OS << Value.sval; break;
   case DW_FORM_udata:     OS << Value.uval; break;
-  case DW_FORM_strp: {
+  case DW_FORM_strp:
     OS << format(" .debug_str[0x%8.8x] = ", (uint32_t)uvalue);
     dumpString(OS);
     break;
-  }
-  case DW_FORM_GNU_str_index: {
+  case DW_FORM_GNU_str_index:
     OS << format(" indexed (%8.8x) string = ", (uint32_t)uvalue);
     dumpString(OS);
     break;
-  }
-  case DW_FORM_GNU_strp_alt: {
+  case DW_FORM_GNU_strp_alt:
     OS << format("alt indirect string, offset: 0x%" PRIx64 "", uvalue);
     dumpString(OS);
     break;
-  }
   case DW_FORM_ref_addr:
     OS << format("0x%016" PRIx64, uvalue);
     break;
@@ -676,4 +702,3 @@ Optional<uint64_t> DWARFFormValue::getAsReferenceUVal() const {
     return None;
   return Value.uval;
 }
-
diff --git a/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp b/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp
index ebb996162f1bdcd88a4b7954a25a35b2a8edd504..76354a9b1ddb652f85062a3283f862caa97e312a 100644
--- a/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp
+++ b/lib/DebugInfo/DWARF/DWARFGdbIndex.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFGdbIndex.cpp -------------------------------------------------===//
+//===- DWARFGdbIndex.cpp --------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,10 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/DebugInfo/DWARF/DWARFGdbIndex.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+#include <utility>
 
 using namespace llvm;
 
diff --git a/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp b/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
index b468ba28c9431ebb5335c3c0f4b0e97529e92bdb..e0f819383289be33d0513c8ee6645c966d2afe7c 100644
--- a/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFTypeUnit.cpp -------------------------------------------------===//
+//===- DWARFTypeUnit.cpp --------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,11 +7,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
+#include "llvm/DebugInfo/DWARF/DWARFDie.h"
 #include "llvm/DebugInfo/DWARF/DWARFTypeUnit.h"
-#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
-#include "llvm/Support/Dwarf.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cinttypes>
 
 using namespace llvm;
 
@@ -30,18 +33,20 @@ void DWARFTypeUnit::dump(raw_ostream &OS, bool SummarizeTypes) {
 
   if (SummarizeTypes) {
     OS << "name = '" << Name << "'"
-       << " type_signature = " << format("0x%16" PRIx64, TypeHash)
+       << " type_signature = " << format("0x%016" PRIx64, TypeHash)
        << " length = " << format("0x%08x", getLength()) << '\n';
     return;
   }
 
   OS << format("0x%08x", getOffset()) << ": Type Unit:"
      << " length = " << format("0x%08x", getLength())
-     << " version = " << format("0x%04x", getVersion())
-     << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
+     << " version = " << format("0x%04x", getVersion());
+  if (getVersion() >= 5)
+    OS << " unit_type = " << dwarf::UnitTypeString(getUnitType());
+  OS << " abbr_offset = " << format("0x%04x", getAbbreviations()->getOffset())
      << " addr_size = " << format("0x%02x", getAddressByteSize())
      << " name = '" << Name << "'"
-     << " type_signature = " << format("0x%16" PRIx64, TypeHash)
+     << " type_signature = " << format("0x%016" PRIx64, TypeHash)
      << " type_offset = " << format("0x%04x", TypeOffset)
      << " (next unit at " << format("0x%08x", getNextUnitOffset()) << ")\n";
 
diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp
index 84dc904f9b82f4cf44bf266467e7da103e3d4269..4ee8e8f46d2eb540b0bc21f001023106da81d08d 100644
--- a/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -13,6 +13,8 @@
 #include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFDebugAbbrev.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugInfoEntry.h"
+#include "llvm/DebugInfo/DWARF/DWARFDie.h"
 #include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
 #include "llvm/DebugInfo/DWARF/DWARFUnit.h"
 #include "llvm/Object/ObjectFile.h"
@@ -21,12 +23,12 @@
 #include "llvm/Support/Path.h"
 #include <algorithm>
 #include <cassert>
+#include <cstddef>
 #include <cstdint>
 #include <cstdio>
 #include <vector>
 
-namespace llvm {
-
+using namespace llvm;
 using namespace dwarf;
 
 void DWARFUnitSectionBase::parse(DWARFContext &C, const DWARFSection &Section) {
@@ -88,7 +90,15 @@ bool DWARFUnit::getStringOffsetSectionItem(uint32_t Index,
 bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) {
   Length = debug_info.getU32(offset_ptr);
   Version = debug_info.getU16(offset_ptr);
-  uint64_t AbbrOffset = debug_info.getU32(offset_ptr);
+  uint64_t AbbrOffset;
+  if (Version >= 5) {
+    UnitType = debug_info.getU8(offset_ptr);
+    AddrSize = debug_info.getU8(offset_ptr);
+    AbbrOffset = debug_info.getU32(offset_ptr);
+  } else {
+    AbbrOffset = debug_info.getU32(offset_ptr);
+    AddrSize = debug_info.getU8(offset_ptr);
+  }
   if (IndexEntry) {
     if (AbbrOffset)
       return false;
@@ -100,7 +110,6 @@ bool DWARFUnit::extractImpl(DataExtractor debug_info, uint32_t *offset_ptr) {
       return false;
     AbbrOffset = AbbrEntry->Offset;
   }
-  AddrSize = debug_info.getU8(offset_ptr);
 
   bool LengthOK = debug_info.isValidOffset(getNextUnitOffset() - 1);
   bool VersionOK = DWARFContext::isSupportedVersion(Version);
@@ -152,7 +161,7 @@ void DWARFUnit::clear() {
 }
 
 const char *DWARFUnit::getCompilationDir() {
-  return toString(getUnitDIE().find(DW_AT_comp_dir), nullptr);
+  return dwarf::toString(getUnitDIE().find(DW_AT_comp_dir), nullptr);
 }
 
 Optional<uint64_t> DWARFUnit::getDWOId() {
@@ -238,8 +247,7 @@ size_t DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
   return DieArray.size();
 }
 
-DWARFUnit::DWOHolder::DWOHolder(StringRef DWOPath)
-    : DWOU(nullptr) {
+DWARFUnit::DWOHolder::DWOHolder(StringRef DWOPath) {
   auto Obj = object::ObjectFile::createObjectFile(DWOPath);
   if (!Obj) {
     // TODO: Actually report errors helpfully.
@@ -261,10 +269,10 @@ bool DWARFUnit::parseDWO() {
   DWARFDie UnitDie = getUnitDIE();
   if (!UnitDie)
     return false;
-  auto DWOFileName = toString(UnitDie.find(DW_AT_GNU_dwo_name));
+  auto DWOFileName = dwarf::toString(UnitDie.find(DW_AT_GNU_dwo_name));
   if (!DWOFileName)
     return false;
-  auto CompilationDir = toString(UnitDie.find(DW_AT_comp_dir));
+  auto CompilationDir = dwarf::toString(UnitDie.find(DW_AT_comp_dir));
   SmallString<16> AbsolutePath;
   if (sys::path::is_relative(*DWOFileName) && CompilationDir &&
       *CompilationDir) {
@@ -368,8 +376,8 @@ DWARFUnit::getInlinedChainForAddress(uint64_t Address,
     InlinedChain.clear();
 }
 
-const DWARFUnitIndex &getDWARFUnitIndex(DWARFContext &Context,
-                                        DWARFSectionKind Kind) {
+const DWARFUnitIndex &llvm::getDWARFUnitIndex(DWARFContext &Context,
+                                              DWARFSectionKind Kind) {
   if (Kind == DW_SECT_INFO)
     return Context.getCUIndex();
   assert(Kind == DW_SECT_TYPES);
@@ -407,11 +415,10 @@ DWARFDie DWARFUnit::getSibling(const DWARFDebugInfoEntry *Die) {
     return DWARFDie();
   
   // Find the next DIE whose depth is the same as the Die's depth.
-  for (size_t I=getDIEIndex(Die)+1, EndIdx = DieArray.size(); I<EndIdx; ++I) {
+  for (size_t I = getDIEIndex(Die) + 1, EndIdx = DieArray.size(); I < EndIdx;
+       ++I) {
     if (DieArray[I].getDepth() == Depth)
       return DWARFDie(this, &DieArray[I]);
   }
   return DWARFDie();
 }
-
-} // end namespace llvm
diff --git a/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp b/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp
index 96b316957dfd88c3d21c72e4f3d13912f1f27b93..0981a4dfdfa57d4eceddb0f0df26546d122ff977 100644
--- a/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp
+++ b/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp
@@ -1,4 +1,4 @@
-//===-- DWARFUnitIndex.cpp ------------------------------------------------===//
+//===- DWARFUnitIndex.cpp -------------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
-
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnitIndex.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cinttypes>
+#include <cstdint>
 
-namespace llvm {
+using namespace llvm;
 
 bool DWARFUnitIndex::Header::parse(DataExtractor IndexData,
                                    uint32_t *OffsetPtr) {
@@ -152,6 +156,7 @@ DWARFUnitIndex::Entry::getOffset(DWARFSectionKind Sec) const {
       return &Contributions[i];
   return nullptr;
 }
+
 const DWARFUnitIndex::Entry::SectionContribution *
 DWARFUnitIndex::Entry::getOffset() const {
   return &Contributions[Index->InfoColumn];
@@ -165,4 +170,3 @@ DWARFUnitIndex::getFromOffset(uint32_t Offset) const {
         return &Rows[i];
   return nullptr;
 }
-}
diff --git a/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp b/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp
index 4f561d062b12c7bb1ea4e9ac6bd03531c757ff12..d4f44e446954f8f1289b24fec3f8d7007882d243 100644
--- a/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp
+++ b/lib/DebugInfo/DWARF/SyntaxHighlighting.cpp
@@ -1,4 +1,4 @@
-//===-- SyntaxHighlighting.cpp ----------------------------------*- C++ -*-===//
+//===- SyntaxHighlighting.cpp ---------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -9,6 +9,8 @@
 
 #include "SyntaxHighlighting.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+
 using namespace llvm;
 using namespace dwarf;
 using namespace syntax;
@@ -18,16 +20,16 @@ static cl::opt<cl::boolOrDefault>
              cl::desc("use colored syntax highlighting (default=autodetect)"),
              cl::init(cl::BOU_UNSET));
 
-WithColor::WithColor(llvm::raw_ostream &OS, enum HighlightColor Type) : OS(OS) {
+WithColor::WithColor(raw_ostream &OS, enum HighlightColor Type) : OS(OS) {
   // Detect color from terminal type unless the user passed the --color option.
   if (UseColor == cl::BOU_UNSET ? OS.has_colors() : UseColor == cl::BOU_TRUE) {
     switch (Type) {
-    case Address:    OS.changeColor(llvm::raw_ostream::YELLOW);  break;
-    case String:     OS.changeColor(llvm::raw_ostream::GREEN);   break;
-    case Tag:        OS.changeColor(llvm::raw_ostream::BLUE);    break;
-    case Attribute:  OS.changeColor(llvm::raw_ostream::CYAN);    break;
-    case Enumerator: OS.changeColor(llvm::raw_ostream::MAGENTA); break;
-    case Macro:      OS.changeColor(llvm::raw_ostream::RED);     break;
+    case Address:    OS.changeColor(raw_ostream::YELLOW);  break;
+    case String:     OS.changeColor(raw_ostream::GREEN);   break;
+    case Tag:        OS.changeColor(raw_ostream::BLUE);    break;
+    case Attribute:  OS.changeColor(raw_ostream::CYAN);    break;
+    case Enumerator: OS.changeColor(raw_ostream::MAGENTA); break;
+    case Macro:      OS.changeColor(raw_ostream::RED);     break;
     }
   }
 }
diff --git a/lib/DebugInfo/DWARF/SyntaxHighlighting.h b/lib/DebugInfo/DWARF/SyntaxHighlighting.h
index 16e68351d5e181e00599ce8c2ffcc53bbbe8dc0a..277de973dbf0ee0e8528a92b03112d202225b80e 100644
--- a/lib/DebugInfo/DWARF/SyntaxHighlighting.h
+++ b/lib/DebugInfo/DWARF/SyntaxHighlighting.h
@@ -1,4 +1,4 @@
-//===-- SyntaxHighlighting.h ------------------------------------*- C++ -*-===//
+//===- SyntaxHighlighting.h -------------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,9 +10,10 @@
 #ifndef LLVM_LIB_DEBUGINFO_SYNTAXHIGHLIGHTING_H
 #define LLVM_LIB_DEBUGINFO_SYNTAXHIGHLIGHTING_H
 
-#include "llvm/Support/raw_ostream.h"
-
 namespace llvm {
+
+class raw_ostream;
+
 namespace dwarf {
 namespace syntax {
 
@@ -22,18 +23,20 @@ enum HighlightColor { Address, String, Tag, Attribute, Enumerator, Macro };
 /// An RAII object that temporarily switches an output stream to a
 /// specific color.
 class WithColor {
-  llvm::raw_ostream &OS;
+  raw_ostream &OS;
 
 public:
   /// To be used like this: WithColor(OS, syntax::String) << "text";
-  WithColor(llvm::raw_ostream &OS, enum HighlightColor Type);
+  WithColor(raw_ostream &OS, enum HighlightColor Type);
   ~WithColor();
 
-  llvm::raw_ostream& get() { return OS; }
-  operator llvm::raw_ostream& () { return OS; }
+  raw_ostream& get() { return OS; }
+  operator raw_ostream& () { return OS; }
 };
-}
-}
-}
 
-#endif
+} // end namespace syntax
+} // end namespace dwarf
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_DEBUGINFO_SYNTAXHIGHLIGHTING_H
diff --git a/lib/DebugInfo/MSF/CMakeLists.txt b/lib/DebugInfo/MSF/CMakeLists.txt
index dcb2a8e0cc9c9d686dda6bb9780e2b8e3a94a66d..6f38de336ee02021f8fa881bde0a83ebca2e6b8d 100644
--- a/lib/DebugInfo/MSF/CMakeLists.txt
+++ b/lib/DebugInfo/MSF/CMakeLists.txt
@@ -3,8 +3,6 @@ add_llvm_library(LLVMDebugInfoMSF
   MSFBuilder.cpp
   MSFCommon.cpp
   MSFError.cpp
-  StreamReader.cpp
-  StreamWriter.cpp
   ADDITIONAL_HEADER_DIRS
   "${LLVM_MAIN_INCLUDE_DIR}/llvm/DebugInfo/MSF"
   )
diff --git a/lib/DebugInfo/MSF/MappedBlockStream.cpp b/lib/DebugInfo/MSF/MappedBlockStream.cpp
index e52c88a5bfb821f85c166e20f714b0833c43bd1e..57953cfa338ef07419b683f2e2963197b12c8eb2 100644
--- a/lib/DebugInfo/MSF/MappedBlockStream.cpp
+++ b/lib/DebugInfo/MSF/MappedBlockStream.cpp
@@ -11,8 +11,8 @@
 
 #include "llvm/DebugInfo/MSF/IMSFFile.h"
 #include "llvm/DebugInfo/MSF/MSFCommon.h"
-#include "llvm/DebugInfo/MSF/MSFError.h"
 #include "llvm/DebugInfo/MSF/MSFStreamLayout.h"
+#include "llvm/Support/BinaryStreamError.h"
 
 using namespace llvm;
 using namespace llvm::msf;
@@ -47,22 +47,20 @@ static Interval intersect(const Interval &I1, const Interval &I2) {
 
 MappedBlockStream::MappedBlockStream(uint32_t BlockSize, uint32_t NumBlocks,
                                      const MSFStreamLayout &Layout,
-                                     const ReadableStream &MsfData)
+                                     BinaryStreamRef MsfData)
     : BlockSize(BlockSize), NumBlocks(NumBlocks), StreamLayout(Layout),
       MsfData(MsfData) {}
 
 std::unique_ptr<MappedBlockStream>
 MappedBlockStream::createStream(uint32_t BlockSize, uint32_t NumBlocks,
                                 const MSFStreamLayout &Layout,
-                                const ReadableStream &MsfData) {
+                                BinaryStreamRef MsfData) {
   return llvm::make_unique<MappedBlockStreamImpl<MappedBlockStream>>(
       BlockSize, NumBlocks, Layout, MsfData);
 }
 
-std::unique_ptr<MappedBlockStream>
-MappedBlockStream::createIndexedStream(const MSFLayout &Layout,
-                                       const ReadableStream &MsfData,
-                                       uint32_t StreamIndex) {
+std::unique_ptr<MappedBlockStream> MappedBlockStream::createIndexedStream(
+    const MSFLayout &Layout, BinaryStreamRef MsfData, uint32_t StreamIndex) {
   assert(StreamIndex < Layout.StreamMap.size() && "Invalid stream index");
   MSFStreamLayout SL;
   SL.Blocks = Layout.StreamMap[StreamIndex];
@@ -73,7 +71,7 @@ MappedBlockStream::createIndexedStream(const MSFLayout &Layout,
 
 std::unique_ptr<MappedBlockStream>
 MappedBlockStream::createDirectoryStream(const MSFLayout &Layout,
-                                         const ReadableStream &MsfData) {
+                                         BinaryStreamRef MsfData) {
   MSFStreamLayout SL;
   SL.Blocks = Layout.DirectoryBlocks;
   SL.Length = Layout.SB->NumDirectoryBytes;
@@ -82,19 +80,17 @@ MappedBlockStream::createDirectoryStream(const MSFLayout &Layout,
 
 std::unique_ptr<MappedBlockStream>
 MappedBlockStream::createFpmStream(const MSFLayout &Layout,
-                                   const ReadableStream &MsfData) {
+                                   BinaryStreamRef MsfData) {
   MSFStreamLayout SL;
   initializeFpmStreamLayout(Layout, SL);
   return createStream(Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData);
 }
 
 Error MappedBlockStream::readBytes(uint32_t Offset, uint32_t Size,
-                                   ArrayRef<uint8_t> &Buffer) const {
+                                   ArrayRef<uint8_t> &Buffer) {
   // Make sure we aren't trying to read beyond the end of the stream.
-  if (Size > StreamLayout.Length)
-    return make_error<MSFError>(msf_error_code::insufficient_buffer);
-  if (Offset > StreamLayout.Length - Size)
-    return make_error<MSFError>(msf_error_code::insufficient_buffer);
+  if (auto EC = checkOffset(Offset, Size))
+    return EC;
 
   if (tryReadContiguously(Offset, Size, Buffer))
     return Error::success();
@@ -168,11 +164,12 @@ Error MappedBlockStream::readBytes(uint32_t Offset, uint32_t Size,
   return Error::success();
 }
 
-Error MappedBlockStream::readLongestContiguousChunk(
-    uint32_t Offset, ArrayRef<uint8_t> &Buffer) const {
+Error MappedBlockStream::readLongestContiguousChunk(uint32_t Offset,
+                                                    ArrayRef<uint8_t> &Buffer) {
   // Make sure we aren't trying to read beyond the end of the stream.
-  if (Offset >= StreamLayout.Length)
-    return make_error<MSFError>(msf_error_code::insufficient_buffer);
+  if (auto EC = checkOffset(Offset, 1))
+    return EC;
+
   uint32_t First = Offset / BlockSize;
   uint32_t Last = First;
 
@@ -197,10 +194,10 @@ Error MappedBlockStream::readLongestContiguousChunk(
   return Error::success();
 }
 
-uint32_t MappedBlockStream::getLength() const { return StreamLayout.Length; }
+uint32_t MappedBlockStream::getLength() { return StreamLayout.Length; }
 
 bool MappedBlockStream::tryReadContiguously(uint32_t Offset, uint32_t Size,
-                                            ArrayRef<uint8_t> &Buffer) const {
+                                            ArrayRef<uint8_t> &Buffer) {
   if (Size == 0) {
     Buffer = ArrayRef<uint8_t>();
     return true;
@@ -241,15 +238,13 @@ bool MappedBlockStream::tryReadContiguously(uint32_t Offset, uint32_t Size,
 }
 
 Error MappedBlockStream::readBytes(uint32_t Offset,
-                                   MutableArrayRef<uint8_t> Buffer) const {
+                                   MutableArrayRef<uint8_t> Buffer) {
   uint32_t BlockNum = Offset / BlockSize;
   uint32_t OffsetInBlock = Offset % BlockSize;
 
   // Make sure we aren't trying to read beyond the end of the stream.
-  if (Buffer.size() > StreamLayout.Length)
-    return make_error<MSFError>(msf_error_code::insufficient_buffer);
-  if (Offset > StreamLayout.Length - Buffer.size())
-    return make_error<MSFError>(msf_error_code::insufficient_buffer);
+  if (auto EC = checkOffset(Offset, Buffer.size()))
+    return EC;
 
   uint32_t BytesLeft = Buffer.size();
   uint32_t BytesWritten = 0;
@@ -319,21 +314,21 @@ void MappedBlockStream::fixCacheAfterWrite(uint32_t Offset,
 
 WritableMappedBlockStream::WritableMappedBlockStream(
     uint32_t BlockSize, uint32_t NumBlocks, const MSFStreamLayout &Layout,
-    const WritableStream &MsfData)
+    WritableBinaryStreamRef MsfData)
     : ReadInterface(BlockSize, NumBlocks, Layout, MsfData),
       WriteInterface(MsfData) {}
 
 std::unique_ptr<WritableMappedBlockStream>
 WritableMappedBlockStream::createStream(uint32_t BlockSize, uint32_t NumBlocks,
                                         const MSFStreamLayout &Layout,
-                                        const WritableStream &MsfData) {
+                                        WritableBinaryStreamRef MsfData) {
   return llvm::make_unique<MappedBlockStreamImpl<WritableMappedBlockStream>>(
       BlockSize, NumBlocks, Layout, MsfData);
 }
 
 std::unique_ptr<WritableMappedBlockStream>
 WritableMappedBlockStream::createIndexedStream(const MSFLayout &Layout,
-                                               const WritableStream &MsfData,
+                                               WritableBinaryStreamRef MsfData,
                                                uint32_t StreamIndex) {
   assert(StreamIndex < Layout.StreamMap.size() && "Invalid stream index");
   MSFStreamLayout SL;
@@ -344,7 +339,7 @@ WritableMappedBlockStream::createIndexedStream(const MSFLayout &Layout,
 
 std::unique_ptr<WritableMappedBlockStream>
 WritableMappedBlockStream::createDirectoryStream(
-    const MSFLayout &Layout, const WritableStream &MsfData) {
+    const MSFLayout &Layout, WritableBinaryStreamRef MsfData) {
   MSFStreamLayout SL;
   SL.Blocks = Layout.DirectoryBlocks;
   SL.Length = Layout.SB->NumDirectoryBytes;
@@ -353,34 +348,31 @@ WritableMappedBlockStream::createDirectoryStream(
 
 std::unique_ptr<WritableMappedBlockStream>
 WritableMappedBlockStream::createFpmStream(const MSFLayout &Layout,
-                                           const WritableStream &MsfData) {
+                                           WritableBinaryStreamRef MsfData) {
   MSFStreamLayout SL;
   initializeFpmStreamLayout(Layout, SL);
   return createStream(Layout.SB->BlockSize, Layout.SB->NumBlocks, SL, MsfData);
 }
 
 Error WritableMappedBlockStream::readBytes(uint32_t Offset, uint32_t Size,
-                                           ArrayRef<uint8_t> &Buffer) const {
+                                           ArrayRef<uint8_t> &Buffer) {
   return ReadInterface.readBytes(Offset, Size, Buffer);
 }
 
 Error WritableMappedBlockStream::readLongestContiguousChunk(
-    uint32_t Offset, ArrayRef<uint8_t> &Buffer) const {
+    uint32_t Offset, ArrayRef<uint8_t> &Buffer) {
   return ReadInterface.readLongestContiguousChunk(Offset, Buffer);
 }
 
-uint32_t WritableMappedBlockStream::getLength() const {
+uint32_t WritableMappedBlockStream::getLength() {
   return ReadInterface.getLength();
 }
 
 Error WritableMappedBlockStream::writeBytes(uint32_t Offset,
-                                            ArrayRef<uint8_t> Buffer) const {
+                                            ArrayRef<uint8_t> Buffer) {
   // Make sure we aren't trying to write beyond the end of the stream.
-  if (Buffer.size() > getStreamLength())
-    return make_error<MSFError>(msf_error_code::insufficient_buffer);
-
-  if (Offset > getStreamLayout().Length - Buffer.size())
-    return make_error<MSFError>(msf_error_code::insufficient_buffer);
+  if (auto EC = checkOffset(Offset, Buffer.size()))
+    return EC;
 
   uint32_t BlockNum = Offset / getBlockSize();
   uint32_t OffsetInBlock = Offset % getBlockSize();
@@ -410,6 +402,4 @@ Error WritableMappedBlockStream::writeBytes(uint32_t Offset,
   return Error::success();
 }
 
-Error WritableMappedBlockStream::commit() const {
-  return WriteInterface.commit();
-}
+Error WritableMappedBlockStream::commit() { return WriteInterface.commit(); }
diff --git a/lib/DebugInfo/MSF/StreamReader.cpp b/lib/DebugInfo/MSF/StreamReader.cpp
deleted file mode 100644
index b85fd14a3b7f70acd326049ab10317bcf89d2ea3..0000000000000000000000000000000000000000
--- a/lib/DebugInfo/MSF/StreamReader.cpp
+++ /dev/null
@@ -1,156 +0,0 @@
-//===- StreamReader.cpp - Reads bytes and objects from a stream -----------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-
-#include "llvm/DebugInfo/MSF/MSFError.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
-
-using namespace llvm;
-using namespace llvm::msf;
-
-StreamReader::StreamReader(ReadableStreamRef S) : Stream(S), Offset(0) {}
-
-Error StreamReader::readLongestContiguousChunk(ArrayRef<uint8_t> &Buffer) {
-  if (auto EC = Stream.readLongestContiguousChunk(Offset, Buffer))
-    return EC;
-  Offset += Buffer.size();
-  return Error::success();
-}
-
-Error StreamReader::readBytes(ArrayRef<uint8_t> &Buffer, uint32_t Size) {
-  if (auto EC = Stream.readBytes(Offset, Size, Buffer))
-    return EC;
-  Offset += Size;
-  return Error::success();
-}
-
-Error StreamReader::readInteger(uint8_t &Dest) {
-  const uint8_t *P;
-  if (auto EC = readObject(P))
-    return EC;
-  Dest = *P;
-  return Error::success();
-}
-
-Error StreamReader::readInteger(uint16_t &Dest) {
-  const support::ulittle16_t *P;
-  if (auto EC = readObject(P))
-    return EC;
-  Dest = *P;
-  return Error::success();
-}
-
-Error StreamReader::readInteger(uint32_t &Dest) {
-  const support::ulittle32_t *P;
-  if (auto EC = readObject(P))
-    return EC;
-  Dest = *P;
-  return Error::success();
-}
-
-Error StreamReader::readInteger(uint64_t &Dest) {
-  const support::ulittle64_t *P;
-  if (auto EC = readObject(P))
-    return EC;
-  Dest = *P;
-  return Error::success();
-}
-
-Error StreamReader::readInteger(int8_t &Dest) {
-  const int8_t *P;
-  if (auto EC = readObject(P))
-    return EC;
-  Dest = *P;
-  return Error::success();
-}
-
-Error StreamReader::readInteger(int16_t &Dest) {
-  const support::little16_t *P;
-  if (auto EC = readObject(P))
-    return EC;
-  Dest = *P;
-  return Error::success();
-}
-
-Error StreamReader::readInteger(int32_t &Dest) {
-  const support::little32_t *P;
-  if (auto EC = readObject(P))
-    return EC;
-  Dest = *P;
-  return Error::success();
-}
-
-Error StreamReader::readInteger(int64_t &Dest) {
-  const support::little64_t *P;
-  if (auto EC = readObject(P))
-    return EC;
-  Dest = *P;
-  return Error::success();
-}
-
-Error StreamReader::readZeroString(StringRef &Dest) {
-  uint32_t Length = 0;
-  // First compute the length of the string by reading 1 byte at a time.
-  uint32_t OriginalOffset = getOffset();
-  const char *C;
-  do {
-    if (auto EC = readObject(C))
-      return EC;
-    if (*C != '\0')
-      ++Length;
-  } while (*C != '\0');
-  // Now go back and request a reference for that many bytes.
-  uint32_t NewOffset = getOffset();
-  setOffset(OriginalOffset);
-
-  ArrayRef<uint8_t> Data;
-  if (auto EC = readBytes(Data, Length))
-    return EC;
-  Dest = StringRef(reinterpret_cast<const char *>(Data.begin()), Data.size());
-
-  // Now set the offset back to where it was after we calculated the length.
-  setOffset(NewOffset);
-  return Error::success();
-}
-
-Error StreamReader::readFixedString(StringRef &Dest, uint32_t Length) {
-  ArrayRef<uint8_t> Bytes;
-  if (auto EC = readBytes(Bytes, Length))
-    return EC;
-  Dest = StringRef(reinterpret_cast<const char *>(Bytes.begin()), Bytes.size());
-  return Error::success();
-}
-
-Error StreamReader::readStreamRef(ReadableStreamRef &Ref) {
-  return readStreamRef(Ref, bytesRemaining());
-}
-
-Error StreamReader::readStreamRef(ReadableStreamRef &Ref, uint32_t Length) {
-  if (bytesRemaining() < Length)
-    return make_error<MSFError>(msf_error_code::insufficient_buffer);
-  Ref = Stream.slice(Offset, Length);
-  Offset += Length;
-  return Error::success();
-}
-
-Error StreamReader::skip(uint32_t Amount) {
-  if (Amount > bytesRemaining())
-    return make_error<MSFError>(msf_error_code::insufficient_buffer);
-  Offset += Amount;
-  return Error::success();
-}
-
-uint8_t StreamReader::peek() const {
-  ArrayRef<uint8_t> Buffer;
-  auto EC = Stream.readBytes(Offset, 1, Buffer);
-  assert(!EC && "Cannot peek an empty buffer!");
-  llvm::consumeError(std::move(EC));
-  return Buffer[0];
-}
diff --git a/lib/DebugInfo/MSF/StreamWriter.cpp b/lib/DebugInfo/MSF/StreamWriter.cpp
deleted file mode 100644
index cdae7c5acc04323327b1b03831f298cb877568bb..0000000000000000000000000000000000000000
--- a/lib/DebugInfo/MSF/StreamWriter.cpp
+++ /dev/null
@@ -1,98 +0,0 @@
-//===- StreamWrite.cpp - Writes bytes and objects to a stream -------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
-
-#include "llvm/DebugInfo/MSF/MSFError.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
-
-using namespace llvm;
-using namespace llvm::msf;
-
-StreamWriter::StreamWriter(WritableStreamRef S) : Stream(S), Offset(0) {}
-
-Error StreamWriter::writeBytes(ArrayRef<uint8_t> Buffer) {
-  if (auto EC = Stream.writeBytes(Offset, Buffer))
-    return EC;
-  Offset += Buffer.size();
-  return Error::success();
-}
-
-Error StreamWriter::writeInteger(uint8_t Int) { return writeObject(Int); }
-
-Error StreamWriter::writeInteger(uint16_t Int) {
-  return writeObject(support::ulittle16_t(Int));
-}
-
-Error StreamWriter::writeInteger(uint32_t Int) {
-  return writeObject(support::ulittle32_t(Int));
-}
-
-Error StreamWriter::writeInteger(uint64_t Int) {
-  return writeObject(support::ulittle64_t(Int));
-}
-
-Error StreamWriter::writeInteger(int8_t Int) { return writeObject(Int); }
-
-Error StreamWriter::writeInteger(int16_t Int) {
-  return writeObject(support::little16_t(Int));
-}
-
-Error StreamWriter::writeInteger(int32_t Int) {
-  return writeObject(support::little32_t(Int));
-}
-
-Error StreamWriter::writeInteger(int64_t Int) {
-  return writeObject(support::little64_t(Int));
-}
-
-Error StreamWriter::writeZeroString(StringRef Str) {
-  if (auto EC = writeFixedString(Str))
-    return EC;
-  if (auto EC = writeObject('\0'))
-    return EC;
-
-  return Error::success();
-}
-
-Error StreamWriter::writeFixedString(StringRef Str) {
-  ArrayRef<uint8_t> Bytes(Str.bytes_begin(), Str.bytes_end());
-  if (auto EC = Stream.writeBytes(Offset, Bytes))
-    return EC;
-
-  Offset += Str.size();
-  return Error::success();
-}
-
-Error StreamWriter::writeStreamRef(ReadableStreamRef Ref) {
-  if (auto EC = writeStreamRef(Ref, Ref.getLength()))
-    return EC;
-  // Don't increment Offset here, it is done by the overloaded call to
-  // writeStreamRef.
-  return Error::success();
-}
-
-Error StreamWriter::writeStreamRef(ReadableStreamRef Ref, uint32_t Length) {
-  Ref = Ref.slice(0, Length);
-
-  StreamReader SrcReader(Ref);
-  // This is a bit tricky.  If we just call readBytes, we are requiring that it
-  // return us the entire stream as a contiguous buffer.  For large streams this
-  // will allocate a huge amount of space from the pool.  Instead, iterate over
-  // each contiguous chunk until we've consumed the entire stream.
-  while (SrcReader.bytesRemaining() > 0) {
-    ArrayRef<uint8_t> Chunk;
-    if (auto EC = SrcReader.readLongestContiguousChunk(Chunk))
-      return EC;
-    if (auto EC = writeBytes(Chunk))
-      return EC;
-  }
-  return Error::success();
-}
diff --git a/lib/DebugInfo/PDB/CMakeLists.txt b/lib/DebugInfo/PDB/CMakeLists.txt
index 0f68c633732944543c17937f531a525b4f534098..1295d2a19ce26c5f138458cb631061279736fc58 100644
--- a/lib/DebugInfo/PDB/CMakeLists.txt
+++ b/lib/DebugInfo/PDB/CMakeLists.txt
@@ -38,11 +38,17 @@ add_pdb_impl_folder(Native
   Native/InfoStream.cpp
   Native/InfoStreamBuilder.cpp
   Native/ModInfo.cpp
+  Native/ModInfoBuilder.cpp
   Native/ModStream.cpp
+  Native/NativeCompilandSymbol.cpp
+  Native/NativeEnumModules.cpp
+  Native/NativeExeSymbol.cpp
+  Native/NativeRawSymbol.cpp
   Native/NamedStreamMap.cpp
   Native/NativeSession.cpp
   Native/PDBFile.cpp
   Native/PDBFileBuilder.cpp
+  Native/PDBTypeServerHandler.cpp
   Native/PublicsStream.cpp
   Native/RawError.cpp
   Native/StringTable.cpp
diff --git a/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp b/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
index bba5b0f94dcaebb3cb8944a5a9c1092c52ebd313..6182dab213c448cc1725b80e0ea58554ae427a95 100644
--- a/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
+++ b/lib/DebugInfo/PDB/DIA/DIARawSymbol.cpp
@@ -10,6 +10,7 @@
 #include "llvm/DebugInfo/PDB/DIA/DIARawSymbol.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/CodeView/Formatters.h"
 #include "llvm/DebugInfo/PDB/DIA/DIAEnumSymbols.h"
 #include "llvm/DebugInfo/PDB/DIA/DIASession.h"
 #include "llvm/DebugInfo/PDB/PDBExtras.h"
@@ -178,9 +179,10 @@ void DumpDIAValue(llvm::raw_ostream &OS, int Indent, StringRef Name,
 }
 
 namespace llvm {
-raw_ostream &operator<<(raw_ostream &OS, const GUID &Guid) {
-  const PDB_UniqueId *Id = reinterpret_cast<const PDB_UniqueId *>(&Guid);
-  OS << *Id;
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS, const GUID &G) {
+  StringRef GuidBytes(reinterpret_cast<const char *>(&G), sizeof(G));
+  codeview::detail::GuidAdapter A(GuidBytes);
+  A.format(OS, "");
   return OS;
 }
 }
diff --git a/lib/DebugInfo/PDB/DIA/DIASession.cpp b/lib/DebugInfo/PDB/DIA/DIASession.cpp
index 6ecf335812b5380f899760aae5c6a6b89f2300fa..7077bda4a534795c9492bef4c021efaac1700d56 100644
--- a/lib/DebugInfo/PDB/DIA/DIASession.cpp
+++ b/lib/DebugInfo/PDB/DIA/DIASession.cpp
@@ -140,7 +140,7 @@ void DIASession::setLoadAddress(uint64_t Address) {
   Session->put_loadAddress(Address);
 }
 
-std::unique_ptr<PDBSymbolExe> DIASession::getGlobalScope() const {
+std::unique_ptr<PDBSymbolExe> DIASession::getGlobalScope() {
   CComPtr<IDiaSymbol> GlobalScope;
   if (S_OK != Session->get_globalScope(&GlobalScope))
     return nullptr;
diff --git a/lib/DebugInfo/PDB/Native/DbiStream.cpp b/lib/DebugInfo/PDB/Native/DbiStream.cpp
index 5d15e62d3e49a697064b42fc0cd69fccfb80de16..b9f53578d3267ffc173bcdb9f3eeb115d0525154 100644
--- a/lib/DebugInfo/PDB/Native/DbiStream.cpp
+++ b/lib/DebugInfo/PDB/Native/DbiStream.cpp
@@ -10,8 +10,6 @@
 #include "llvm/DebugInfo/PDB/Native/DbiStream.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
 #include "llvm/DebugInfo/PDB/Native/ISectionContribVisitor.h"
 #include "llvm/DebugInfo/PDB/Native/InfoStream.h"
 #include "llvm/DebugInfo/PDB/Native/ModInfo.h"
@@ -21,6 +19,8 @@
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/DebugInfo/PDB/PDBTypes.h"
 #include "llvm/Object/COFF.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Error.h"
 #include <algorithm>
 #include <cstddef>
@@ -34,7 +34,7 @@ using namespace llvm::support;
 
 template <typename ContribType>
 static Error loadSectionContribs(FixedStreamArray<ContribType> &Output,
-                                 StreamReader &Reader) {
+                                 BinaryStreamReader &Reader) {
   if (Reader.bytesRemaining() % sizeof(ContribType) != 0)
     return make_error<RawError>(
         raw_error_code::corrupt_file,
@@ -52,7 +52,7 @@ DbiStream::DbiStream(PDBFile &File, std::unique_ptr<MappedBlockStream> Stream)
 DbiStream::~DbiStream() = default;
 
 Error DbiStream::reload() {
-  StreamReader Reader(*Stream);
+  BinaryStreamReader Reader(*Stream);
 
   if (Stream->getLength() < sizeof(DbiStreamHeader))
     return make_error<RawError>(raw_error_code::corrupt_file,
@@ -145,7 +145,7 @@ Error DbiStream::reload() {
                                 "Found unexpected bytes in DBI Stream.");
 
   if (ECSubstream.getLength() > 0) {
-    StreamReader ECReader(ECSubstream);
+    BinaryStreamReader ECReader(ECSubstream);
     if (auto EC = ECNames.load(ECReader))
       return EC;
   }
@@ -207,16 +207,16 @@ PDB_Machine DbiStream::getMachineType() const {
   return static_cast<PDB_Machine>(Machine);
 }
 
-msf::FixedStreamArray<object::coff_section> DbiStream::getSectionHeaders() {
+FixedStreamArray<object::coff_section> DbiStream::getSectionHeaders() {
   return SectionHeaders;
 }
 
-msf::FixedStreamArray<object::FpoData> DbiStream::getFpoRecords() {
+FixedStreamArray<object::FpoData> DbiStream::getFpoRecords() {
   return FpoRecords;
 }
 
 ArrayRef<ModuleInfoEx> DbiStream::modules() const { return ModuleInfos; }
-msf::FixedStreamArray<SecMapEntry> DbiStream::getSectionMap() const {
+FixedStreamArray<SecMapEntry> DbiStream::getSectionMap() const {
   return SectionMap;
 }
 
@@ -235,7 +235,7 @@ Error DbiStream::initializeSectionContributionData() {
   if (SecContrSubstream.getLength() == 0)
     return Error::success();
 
-  StreamReader SCReader(SecContrSubstream);
+  BinaryStreamReader SCReader(SecContrSubstream);
   if (auto EC = SCReader.readEnum(SectionContribVersion))
     return EC;
 
@@ -254,7 +254,7 @@ Error DbiStream::initializeModInfoArray() {
 
   // Since each ModInfo in the stream is a variable length, we have to iterate
   // them to know how many there actually are.
-  StreamReader Reader(ModInfoSubstream);
+  BinaryStreamReader Reader(ModInfoSubstream);
 
   VarStreamArray<ModInfo> ModInfoArray;
   if (auto EC = Reader.readArray(ModInfoArray, ModInfoSubstream.getLength()))
@@ -284,7 +284,7 @@ Error DbiStream::initializeSectionHeadersData() {
                                 "Corrupted section header stream.");
 
   size_t NumSections = StreamLen / sizeof(object::coff_section);
-  msf::StreamReader Reader(*SHS);
+  BinaryStreamReader Reader(*SHS);
   if (auto EC = Reader.readArray(SectionHeaders, NumSections))
     return make_error<RawError>(raw_error_code::corrupt_file,
                                 "Could not read a bitmap.");
@@ -316,7 +316,7 @@ Error DbiStream::initializeFpoRecords() {
                                 "Corrupted New FPO stream.");
 
   size_t NumRecords = StreamLen / sizeof(object::FpoData);
-  msf::StreamReader Reader(*FS);
+  BinaryStreamReader Reader(*FS);
   if (auto EC = Reader.readArray(FpoRecords, NumRecords))
     return make_error<RawError>(raw_error_code::corrupt_file,
                                 "Corrupted New FPO stream.");
@@ -328,7 +328,7 @@ Error DbiStream::initializeSectionMapData() {
   if (SecMapSubstream.getLength() == 0)
     return Error::success();
 
-  StreamReader SMReader(SecMapSubstream);
+  BinaryStreamReader SMReader(SecMapSubstream);
   const SecMapHeader *Header;
   if (auto EC = SMReader.readObject(Header))
     return EC;
@@ -342,7 +342,7 @@ Error DbiStream::initializeFileInfo() {
     return Error::success();
 
   const FileInfoSubstreamHeader *FH;
-  StreamReader FISR(FileInfoSubstream);
+  BinaryStreamReader FISR(FileInfoSubstream);
   if (auto EC = FISR.readObject(FH))
     return EC;
 
@@ -411,14 +411,14 @@ uint32_t DbiStream::getDebugStreamIndex(DbgHeaderType Type) const {
 }
 
 Expected<StringRef> DbiStream::getFileNameForIndex(uint32_t Index) const {
-  StreamReader Names(NamesBuffer);
+  BinaryStreamReader Names(NamesBuffer);
   if (Index >= FileNameOffsets.size())
     return make_error<RawError>(raw_error_code::index_out_of_bounds);
 
   uint32_t FileOffset = FileNameOffsets[Index];
   Names.setOffset(FileOffset);
   StringRef Name;
-  if (auto EC = Names.readZeroString(Name))
+  if (auto EC = Names.readCString(Name))
     return std::move(EC);
   return Name;
 }
diff --git a/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
index df6e9b060df135913ec01f3de97c84641a0b415c..a203aea60fe7ae0f18d23aca7d721452f01ed423 100644
--- a/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
@@ -12,10 +12,11 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
 #include "llvm/DebugInfo/PDB/Native/DbiStream.h"
+#include "llvm/DebugInfo/PDB/Native/ModInfoBuilder.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
 #include "llvm/Object/COFF.h"
+#include "llvm/Support/BinaryStreamWriter.h"
 #include "llvm/Support/COFF.h"
 
 using namespace llvm;
@@ -23,15 +24,13 @@ using namespace llvm::codeview;
 using namespace llvm::msf;
 using namespace llvm::pdb;
 
-namespace {
-class ModiSubstreamBuilder {};
-}
-
 DbiStreamBuilder::DbiStreamBuilder(msf::MSFBuilder &Msf)
     : Msf(Msf), Allocator(Msf.getAllocator()), Age(1), BuildNumber(0),
       PdbDllVersion(0), PdbDllRbld(0), Flags(0), MachineType(PDB_Machine::x86),
       Header(nullptr), DbgStreams((int)DbgHeaderType::Max) {}
 
+DbiStreamBuilder::~DbiStreamBuilder() {}
+
 void DbiStreamBuilder::setVersionHeader(PdbRaw_DbiVer V) { VerHeader = V; }
 
 void DbiStreamBuilder::setAge(uint32_t A) { Age = A; }
@@ -75,39 +74,37 @@ uint32_t DbiStreamBuilder::calculateSerializedLength() const {
          calculateSectionMapStreamSize() + calculateDbgStreamsSize();
 }
 
-Error DbiStreamBuilder::addModuleInfo(StringRef ObjFile, StringRef Module) {
-  auto Entry = llvm::make_unique<ModuleInfo>();
-  ModuleInfo *M = Entry.get();
-  Entry->Mod = Module;
-  Entry->Obj = ObjFile;
-  auto Result = ModuleInfos.insert(std::make_pair(Module, std::move(Entry)));
+Expected<ModInfoBuilder &>
+DbiStreamBuilder::addModuleInfo(StringRef ModuleName) {
+  uint32_t Index = ModiList.size();
+  auto MIB = llvm::make_unique<ModInfoBuilder>(ModuleName, Index, Msf);
+  auto M = MIB.get();
+  auto Result = ModiMap.insert(std::make_pair(ModuleName, std::move(MIB)));
+
   if (!Result.second)
     return make_error<RawError>(raw_error_code::duplicate_entry,
                                 "The specified module already exists");
-  ModuleInfoList.push_back(M);
-  return Error::success();
+  ModiList.push_back(M);
+  return *M;
 }
 
 Error DbiStreamBuilder::addModuleSourceFile(StringRef Module, StringRef File) {
-  auto ModIter = ModuleInfos.find(Module);
-  if (ModIter == ModuleInfos.end())
+  auto ModIter = ModiMap.find(Module);
+  if (ModIter == ModiMap.end())
     return make_error<RawError>(raw_error_code::no_entry,
                                 "The specified module was not found");
   uint32_t Index = SourceFileNames.size();
   SourceFileNames.insert(std::make_pair(File, Index));
   auto &ModEntry = *ModIter;
-  ModEntry.second->SourceFiles.push_back(File);
+  ModEntry.second->addSourceFile(File);
   return Error::success();
 }
 
 uint32_t DbiStreamBuilder::calculateModiSubstreamSize() const {
   uint32_t Size = 0;
-  for (const auto &M : ModuleInfoList) {
-    Size += sizeof(ModuleInfoHeader);
-    Size += M->Mod.size() + 1;
-    Size += M->Obj.size() + 1;
-  }
-  return alignTo(Size, sizeof(uint32_t));
+  for (const auto &M : ModiList)
+    Size += M->calculateSerializedLength();
+  return Size;
 }
 
 uint32_t DbiStreamBuilder::calculateSectionContribsStreamSize() const {
@@ -127,11 +124,11 @@ uint32_t DbiStreamBuilder::calculateFileInfoSubstreamSize() const {
   uint32_t Size = 0;
   Size += sizeof(ulittle16_t);                         // NumModules
   Size += sizeof(ulittle16_t);                         // NumSourceFiles
-  Size += ModuleInfoList.size() * sizeof(ulittle16_t); // ModIndices
-  Size += ModuleInfoList.size() * sizeof(ulittle16_t); // ModFileCounts
+  Size += ModiList.size() * sizeof(ulittle16_t);       // ModIndices
+  Size += ModiList.size() * sizeof(ulittle16_t);       // ModFileCounts
   uint32_t NumFileInfos = 0;
-  for (const auto &M : ModuleInfoList)
-    NumFileInfos += M->SourceFiles.size();
+  for (const auto &M : ModiList)
+    NumFileInfos += M->source_files().size();
   Size += NumFileInfos * sizeof(ulittle32_t); // FileNameOffsets
   Size += calculateNamesBufferSize();
   return alignTo(Size, sizeof(uint32_t));
@@ -149,43 +146,20 @@ uint32_t DbiStreamBuilder::calculateDbgStreamsSize() const {
   return DbgStreams.size() * sizeof(uint16_t);
 }
 
-Error DbiStreamBuilder::generateModiSubstream() {
-  uint32_t Size = calculateModiSubstreamSize();
-  auto Data = Allocator.Allocate<uint8_t>(Size);
-
-  ModInfoBuffer = MutableByteStream(MutableArrayRef<uint8_t>(Data, Size));
-
-  StreamWriter ModiWriter(ModInfoBuffer);
-  for (const auto &M : ModuleInfoList) {
-    ModuleInfoHeader Layout = {};
-    Layout.ModDiStream = kInvalidStreamIndex;
-    Layout.NumFiles = M->SourceFiles.size();
-    if (auto EC = ModiWriter.writeObject(Layout))
-      return EC;
-    if (auto EC = ModiWriter.writeZeroString(M->Mod))
-      return EC;
-    if (auto EC = ModiWriter.writeZeroString(M->Obj))
-      return EC;
-  }
-  if (ModiWriter.bytesRemaining() > sizeof(uint32_t))
-    return make_error<RawError>(raw_error_code::invalid_format,
-                                "Unexpected bytes in Modi Stream Data");
-  return Error::success();
-}
-
 Error DbiStreamBuilder::generateFileInfoSubstream() {
   uint32_t Size = calculateFileInfoSubstreamSize();
   uint32_t NameSize = calculateNamesBufferSize();
   auto Data = Allocator.Allocate<uint8_t>(Size);
   uint32_t NamesOffset = Size - NameSize;
 
-  FileInfoBuffer = MutableByteStream(MutableArrayRef<uint8_t>(Data, Size));
+  FileInfoBuffer = MutableBinaryByteStream(MutableArrayRef<uint8_t>(Data, Size),
+                                           llvm::support::little);
 
-  WritableStreamRef MetadataBuffer =
-      WritableStreamRef(FileInfoBuffer).keep_front(NamesOffset);
-  StreamWriter MetadataWriter(MetadataBuffer);
+  WritableBinaryStreamRef MetadataBuffer =
+      WritableBinaryStreamRef(FileInfoBuffer).keep_front(NamesOffset);
+  BinaryStreamWriter MetadataWriter(MetadataBuffer);
 
-  uint16_t ModiCount = std::min<uint32_t>(UINT16_MAX, ModuleInfos.size());
+  uint16_t ModiCount = std::min<uint32_t>(UINT16_MAX, ModiList.size());
   uint16_t FileCount = std::min<uint32_t>(UINT16_MAX, SourceFileNames.size());
   if (auto EC = MetadataWriter.writeInteger(ModiCount)) // NumModules
     return EC;
@@ -195,8 +169,8 @@ Error DbiStreamBuilder::generateFileInfoSubstream() {
     if (auto EC = MetadataWriter.writeInteger(I)) // Mod Indices
       return EC;
   }
-  for (const auto MI : ModuleInfoList) {
-    FileCount = static_cast<uint16_t>(MI->SourceFiles.size());
+  for (const auto &MI : ModiList) {
+    FileCount = static_cast<uint16_t>(MI->source_files().size());
     if (auto EC = MetadataWriter.writeInteger(FileCount)) // Mod File Counts
       return EC;
   }
@@ -205,16 +179,16 @@ Error DbiStreamBuilder::generateFileInfoSubstream() {
   // A side effect of this is that this will actually compute the various
   // file name offsets, so we can then go back and write the FileNameOffsets
   // array to the other substream.
-  NamesBuffer = WritableStreamRef(FileInfoBuffer).drop_front(NamesOffset);
-  StreamWriter NameBufferWriter(NamesBuffer);
+  NamesBuffer = WritableBinaryStreamRef(FileInfoBuffer).drop_front(NamesOffset);
+  BinaryStreamWriter NameBufferWriter(NamesBuffer);
   for (auto &Name : SourceFileNames) {
     Name.second = NameBufferWriter.getOffset();
-    if (auto EC = NameBufferWriter.writeZeroString(Name.getKey()))
+    if (auto EC = NameBufferWriter.writeCString(Name.getKey()))
       return EC;
   }
 
-  for (const auto MI : ModuleInfoList) {
-    for (StringRef Name : MI->SourceFiles) {
+  for (const auto &MI : ModiList) {
+    for (StringRef Name : MI->source_files()) {
       auto Result = SourceFileNames.find(Name);
       if (Result == SourceFileNames.end())
         return make_error<RawError>(raw_error_code::no_entry,
@@ -240,13 +214,13 @@ Error DbiStreamBuilder::finalize() {
   if (Header)
     return Error::success();
 
-  DbiStreamHeader *H = Allocator.Allocate<DbiStreamHeader>();
+  for (auto &MI : ModiList)
+    MI->finalize();
 
-  if (auto EC = generateModiSubstream())
-    return EC;
   if (auto EC = generateFileInfoSubstream())
     return EC;
 
+  DbiStreamHeader *H = Allocator.Allocate<DbiStreamHeader>();
   H->VersionHeader = *VerHeader;
   H->VersionSignature = -1;
   H->Age = Age;
@@ -258,7 +232,7 @@ Error DbiStreamBuilder::finalize() {
 
   H->ECSubstreamSize = 0;
   H->FileInfoSize = FileInfoBuffer.getLength();
-  H->ModiSubstreamSize = ModInfoBuffer.getLength();
+  H->ModiSubstreamSize = calculateModiSubstreamSize();
   H->OptionalDbgHdrSize = DbgStreams.size() * sizeof(uint16_t);
   H->SecContrSubstreamSize = calculateSectionContribsStreamSize();
   H->SectionMapSize = calculateSectionMapStreamSize();
@@ -273,6 +247,11 @@ Error DbiStreamBuilder::finalize() {
 }
 
 Error DbiStreamBuilder::finalizeMsfLayout() {
+  for (auto &MI : ModiList) {
+    if (auto EC = MI->finalizeMsfLayout())
+      return EC;
+  }
+
   uint32_t Length = calculateSerializedLength();
   if (auto EC = Msf.setStreamSize(StreamDBI, Length))
     return EC;
@@ -358,19 +337,21 @@ std::vector<SecMapEntry> DbiStreamBuilder::createSectionMap(
 }
 
 Error DbiStreamBuilder::commit(const msf::MSFLayout &Layout,
-                               const msf::WritableStream &Buffer) {
+                               WritableBinaryStreamRef MsfBuffer) {
   if (auto EC = finalize())
     return EC;
 
-  auto InfoS =
-      WritableMappedBlockStream::createIndexedStream(Layout, Buffer, StreamDBI);
+  auto DbiS = WritableMappedBlockStream::createIndexedStream(Layout, MsfBuffer,
+                                                             StreamDBI);
 
-  StreamWriter Writer(*InfoS);
+  BinaryStreamWriter Writer(*DbiS);
   if (auto EC = Writer.writeObject(*Header))
     return EC;
 
-  if (auto EC = Writer.writeStreamRef(ModInfoBuffer))
-    return EC;
+  for (auto &M : ModiList) {
+    if (auto EC = M->commit(Writer, Layout, MsfBuffer))
+      return EC;
+  }
 
   if (!SectionContribs.empty()) {
     if (auto EC = Writer.writeEnum(DbiSecContribVer60))
@@ -399,8 +380,8 @@ Error DbiStreamBuilder::commit(const msf::MSFLayout &Layout,
     if (Stream.StreamNumber == kInvalidStreamIndex)
       continue;
     auto WritableStream = WritableMappedBlockStream::createIndexedStream(
-        Layout, Buffer, Stream.StreamNumber);
-    StreamWriter DbgStreamWriter(*WritableStream);
+        Layout, MsfBuffer, Stream.StreamNumber);
+    BinaryStreamWriter DbgStreamWriter(*WritableStream);
     if (auto EC = DbgStreamWriter.writeArray(Stream.Data))
       return EC;
   }
diff --git a/lib/DebugInfo/PDB/Native/GSI.cpp b/lib/DebugInfo/PDB/Native/GSI.cpp
index c98603f87e1c45bdac7332fc0299c1a69ed1d684..b219fe275f73577c63b0f553e966bc6b421644c0 100644
--- a/lib/DebugInfo/PDB/Native/GSI.cpp
+++ b/lib/DebugInfo/PDB/Native/GSI.cpp
@@ -9,10 +9,10 @@
 
 #include "GSI.h"
 
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamReader.h"
 
 #include "llvm/Support/Error.h"
 
@@ -28,9 +28,9 @@ static Error checkHashHdrVersion(const GSIHashHeader *HashHdr) {
   return Error::success();
 }
 
-Error readGSIHashBuckets(
-    msf::FixedStreamArray<support::ulittle32_t> &HashBuckets,
-    const GSIHashHeader *HashHdr, msf::StreamReader &Reader) {
+Error readGSIHashBuckets(FixedStreamArray<support::ulittle32_t> &HashBuckets,
+                         const GSIHashHeader *HashHdr,
+                         BinaryStreamReader &Reader) {
   if (auto EC = checkHashHdrVersion(HashHdr))
     return EC;
 
@@ -57,7 +57,7 @@ Error readGSIHashBuckets(
 }
 
 Error readGSIHashHeader(const GSIHashHeader *&HashHdr,
-                        msf::StreamReader &Reader) {
+                        BinaryStreamReader &Reader) {
   if (Reader.readObject(HashHdr))
     return make_error<RawError>(raw_error_code::corrupt_file,
                                 "Stream does not contain a GSIHashHeader.");
@@ -70,9 +70,9 @@ Error readGSIHashHeader(const GSIHashHeader *&HashHdr,
   return Error::success();
 }
 
-Error readGSIHashRecords(msf::FixedStreamArray<PSHashRecord> &HashRecords,
+Error readGSIHashRecords(FixedStreamArray<PSHashRecord> &HashRecords,
                          const GSIHashHeader *HashHdr,
-                         msf::StreamReader &Reader) {
+                         BinaryStreamReader &Reader) {
   if (auto EC = checkHashHdrVersion(HashHdr))
     return EC;
 
diff --git a/lib/DebugInfo/PDB/Native/GSI.h b/lib/DebugInfo/PDB/Native/GSI.h
index d5f2fb1f12116764d42d8acd8523f9bfd8a4ad32..9e63bc83548fb547bc18abcde88ffc9794482012 100644
--- a/lib/DebugInfo/PDB/Native/GSI.h
+++ b/lib/DebugInfo/PDB/Native/GSI.h
@@ -25,17 +25,15 @@
 #ifndef LLVM_LIB_DEBUGINFO_PDB_RAW_GSI_H
 #define LLVM_LIB_DEBUGINFO_PDB_RAW_GSI_H
 
-#include "llvm/DebugInfo/MSF/StreamArray.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamArray.h"
 
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 
 namespace llvm {
 
-namespace msf {
-class StreamReader;
-}
+class BinaryStreamReader;
 
 namespace pdb {
 
@@ -56,14 +54,14 @@ struct GSIHashHeader {
   support::ulittle32_t NumBuckets;
 };
 
-Error readGSIHashBuckets(
-    msf::FixedStreamArray<support::ulittle32_t> &HashBuckets,
-    const GSIHashHeader *HashHdr, msf::StreamReader &Reader);
+Error readGSIHashBuckets(FixedStreamArray<support::ulittle32_t> &HashBuckets,
+                         const GSIHashHeader *HashHdr,
+                         BinaryStreamReader &Reader);
 Error readGSIHashHeader(const GSIHashHeader *&HashHdr,
-                        msf::StreamReader &Reader);
-Error readGSIHashRecords(msf::FixedStreamArray<PSHashRecord> &HashRecords,
+                        BinaryStreamReader &Reader);
+Error readGSIHashRecords(FixedStreamArray<PSHashRecord> &HashRecords,
                          const GSIHashHeader *HashHdr,
-                         msf::StreamReader &Reader);
+                         BinaryStreamReader &Reader);
 }
 }
 
diff --git a/lib/DebugInfo/PDB/Native/GlobalsStream.cpp b/lib/DebugInfo/PDB/Native/GlobalsStream.cpp
index 2f204ed01906aaac377c09d9e61c9ba694e0a1ea..a2ee0f047c58ae065391f9ac3997d49f464f9132 100644
--- a/lib/DebugInfo/PDB/Native/GlobalsStream.cpp
+++ b/lib/DebugInfo/PDB/Native/GlobalsStream.cpp
@@ -9,7 +9,7 @@
 
 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
 #include "GSI.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
+#include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Error.h"
 #include <algorithm>
 
@@ -23,7 +23,7 @@ GlobalsStream::GlobalsStream(std::unique_ptr<MappedBlockStream> Stream)
 GlobalsStream::~GlobalsStream() = default;
 
 Error GlobalsStream::reload() {
-  StreamReader Reader(*Stream);
+  BinaryStreamReader Reader(*Stream);
 
   const GSIHashHeader *HashHdr;
   if (auto EC = readGSIHashHeader(HashHdr, Reader))
diff --git a/lib/DebugInfo/PDB/Native/HashTable.cpp b/lib/DebugInfo/PDB/Native/HashTable.cpp
index b3fe6fa45c5b1f13c91f7f08f176d42f37e1f9cb..ebf8c9c04db1621fd48ffaf9d84468d878d7943b 100644
--- a/lib/DebugInfo/PDB/Native/HashTable.cpp
+++ b/lib/DebugInfo/PDB/Native/HashTable.cpp
@@ -22,7 +22,7 @@ HashTable::HashTable() : HashTable(8) {}
 
 HashTable::HashTable(uint32_t Capacity) { Buckets.resize(Capacity); }
 
-Error HashTable::load(msf::StreamReader &Stream) {
+Error HashTable::load(BinaryStreamReader &Stream) {
   const Header *H;
   if (auto EC = Stream.readObject(H))
     return EC;
@@ -77,7 +77,7 @@ uint32_t HashTable::calculateSerializedLength() const {
   return Size;
 }
 
-Error HashTable::commit(msf::StreamWriter &Writer) const {
+Error HashTable::commit(BinaryStreamWriter &Writer) const {
   Header H;
   H.Size = size();
   H.Capacity = capacity();
@@ -209,7 +209,7 @@ void HashTable::grow() {
   assert(size() == S);
 }
 
-Error HashTable::readSparseBitVector(msf::StreamReader &Stream,
+Error HashTable::readSparseBitVector(BinaryStreamReader &Stream,
                                      SparseBitVector<> &V) {
   uint32_t NumWords;
   if (auto EC = Stream.readInteger(NumWords))
@@ -231,7 +231,7 @@ Error HashTable::readSparseBitVector(msf::StreamReader &Stream,
   return Error::success();
 }
 
-Error HashTable::writeSparseBitVector(msf::StreamWriter &Writer,
+Error HashTable::writeSparseBitVector(BinaryStreamWriter &Writer,
                                       SparseBitVector<> &Vec) {
   int ReqBits = Vec.find_last() + 1;
   uint32_t NumWords = alignTo(ReqBits, sizeof(uint32_t)) / sizeof(uint32_t);
diff --git a/lib/DebugInfo/PDB/Native/InfoStream.cpp b/lib/DebugInfo/PDB/Native/InfoStream.cpp
index b003ecc14400cf9de14002764ca144347f116fe6..2a1d12e82390257c52543c0b33f4b818398f2048 100644
--- a/lib/DebugInfo/PDB/Native/InfoStream.cpp
+++ b/lib/DebugInfo/PDB/Native/InfoStream.cpp
@@ -10,12 +10,12 @@
 #include "llvm/DebugInfo/PDB/Native/InfoStream.h"
 #include "llvm/ADT/BitVector.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
@@ -26,7 +26,7 @@ InfoStream::InfoStream(std::unique_ptr<MappedBlockStream> Stream)
     : Stream(std::move(Stream)) {}
 
 Error InfoStream::reload() {
-  StreamReader Reader(*Stream);
+  BinaryStreamReader Reader(*Stream);
 
   const InfoStreamHeader *H;
   if (auto EC = Reader.readObject(H))
@@ -51,9 +51,44 @@ Error InfoStream::reload() {
   Age = H->Age;
   Guid = H->Guid;
 
-  return NamedStreams.load(Reader);
+  uint32_t Offset = Reader.getOffset();
+  if (auto EC = NamedStreams.load(Reader))
+    return EC;
+  NamedStreamMapByteSize = Reader.getOffset() - Offset; // bytes load() consumed
+
+  bool Stop = false;
+  while (!Stop && !Reader.empty()) {
+    PdbRaw_FeatureSig Sig;
+    if (auto EC = Reader.readEnum(Sig))
+      return EC;
+    // Since this value comes from a file, it's possible we have some strange
+    // value which doesn't correspond to any value.  We don't want to warn on
+    // -Wcovered-switch-default in this case, so switch on the integral value
+    // instead of the enumeration value.
+    switch (uint32_t(Sig)) {
+    case uint32_t(PdbRaw_FeatureSig::VC110):
+      // No other flags for VC110 PDB.
+      Stop = true;
+      LLVM_FALLTHROUGH;
+    case uint32_t(PdbRaw_FeatureSig::VC140):
+      Features |= PdbFeatureContainsIdStream;
+      break;
+    case uint32_t(PdbRaw_FeatureSig::NoTypeMerge):
+      Features |= PdbFeatureNoTypeMerging;
+      break;
+    case uint32_t(PdbRaw_FeatureSig::MinimalDebugInfo):
+      Features |= PdbFeatureMinimalDebugInfo;
+      break; // Recognized signature: record it below like the others.
+    default:
+      continue; // Unrecognized signature: skip it without recording.
+    }
+    FeatureSignatures.push_back(Sig);
+  }
+  return Error::success();
 }
 
+uint32_t InfoStream::getStreamSize() const { return Stream->getLength(); }
+
 uint32_t InfoStream::getNamedStreamIndex(llvm::StringRef Name) const {
   uint32_t Result;
   if (!NamedStreams.get(Name, Result))
@@ -76,6 +111,16 @@ uint32_t InfoStream::getAge() const { return Age; }
 
 PDB_UniqueId InfoStream::getGuid() const { return Guid; }
 
+uint32_t InfoStream::getNamedStreamMapByteSize() const {
+  return NamedStreamMapByteSize;
+}
+
+PdbRaw_Features InfoStream::getFeatures() const { return Features; }
+
+ArrayRef<PdbRaw_FeatureSig> InfoStream::getFeatureSignatures() const {
+  return FeatureSignatures;
+}
+
 const NamedStreamMap &InfoStream::getNamedStreams() const {
   return NamedStreams;
 }
diff --git a/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp
index d45bd10729f2e82b2d9d162b8bd39a954a960ce9..f019d410328a84357d30f0876da42efbf8f19012 100644
--- a/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/InfoStreamBuilder.cpp
@@ -11,12 +11,12 @@
 
 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
 #include "llvm/DebugInfo/PDB/Native/InfoStream.h"
 #include "llvm/DebugInfo/PDB/Native/NamedStreamMap.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFileBuilder.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamWriter.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
@@ -36,18 +36,23 @@ void InfoStreamBuilder::setAge(uint32_t A) { Age = A; }
 
 void InfoStreamBuilder::setGuid(PDB_UniqueId G) { Guid = G; }
 
+void InfoStreamBuilder::addFeature(PdbRaw_FeatureSig Sig) {
+  Features.push_back(Sig);
+}
+
 Error InfoStreamBuilder::finalizeMsfLayout() {
-  uint32_t Length = sizeof(InfoStreamHeader) + NamedStreams.finalize();
+  uint32_t Length = sizeof(InfoStreamHeader) + NamedStreams.finalize() +
+                    (Features.size() + 1) * sizeof(uint32_t);
   if (auto EC = Msf.setStreamSize(StreamPDB, Length))
     return EC;
   return Error::success();
 }
 
 Error InfoStreamBuilder::commit(const msf::MSFLayout &Layout,
-                                const msf::WritableStream &Buffer) const {
+                                WritableBinaryStreamRef Buffer) const {
   auto InfoS =
       WritableMappedBlockStream::createIndexedStream(Layout, Buffer, StreamPDB);
-  StreamWriter Writer(*InfoS);
+  BinaryStreamWriter Writer(*InfoS);
 
   InfoStreamHeader H;
   H.Age = Age;
@@ -57,5 +62,13 @@ Error InfoStreamBuilder::commit(const msf::MSFLayout &Layout,
   if (auto EC = Writer.writeObject(H))
     return EC;
 
-  return NamedStreams.commit(Writer);
+  if (auto EC = NamedStreams.commit(Writer))
+    return EC;
+  // finalizeMsfLayout() reserves exactly sizeof(uint32_t) for this zero word.
+  if (auto EC = Writer.writeInteger<uint32_t>(0))
+    return EC;
+  for (auto E : Features)
+    if (auto EC = Writer.writeEnum(E))
+      return EC;
+  return Error::success();
 }
diff --git a/lib/DebugInfo/PDB/Native/ModInfo.cpp b/lib/DebugInfo/PDB/Native/ModInfo.cpp
index 762a92bc18ebf6073971ca34f7eec8db3cb109cf..1405286fd0885f81f9e1ba085797380d7c507ec7 100644
--- a/lib/DebugInfo/PDB/Native/ModInfo.cpp
+++ b/lib/DebugInfo/PDB/Native/ModInfo.cpp
@@ -8,15 +8,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/DebugInfo/PDB/Native/ModInfo.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/MathExtras.h"
 #include <cstdint>
 
 using namespace llvm;
-using namespace llvm::msf;
 using namespace llvm::pdb;
 using namespace llvm::support;
 
@@ -26,15 +25,15 @@ ModInfo::ModInfo(const ModInfo &Info) = default;
 
 ModInfo::~ModInfo() = default;
 
-Error ModInfo::initialize(ReadableStreamRef Stream, ModInfo &Info) {
-  StreamReader Reader(Stream);
+Error ModInfo::initialize(BinaryStreamRef Stream, ModInfo &Info) {
+  BinaryStreamReader Reader(Stream);
   if (auto EC = Reader.readObject(Info.Layout))
     return EC;
 
-  if (auto EC = Reader.readZeroString(Info.ModuleName))
+  if (auto EC = Reader.readCString(Info.ModuleName))
     return EC;
 
-  if (auto EC = Reader.readZeroString(Info.ObjFileName))
+  if (auto EC = Reader.readCString(Info.ObjFileName))
     return EC;
   return Error::success();
 }
diff --git a/lib/DebugInfo/PDB/Native/ModInfoBuilder.cpp b/lib/DebugInfo/PDB/Native/ModInfoBuilder.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..73c45a9535202d786bf0395b8c9e0c2d2e6b10a4
--- /dev/null
+++ b/lib/DebugInfo/PDB/Native/ModInfoBuilder.cpp
@@ -0,0 +1,136 @@
+//===- ModInfoBuilder.cpp - PDB Module Info Stream Creation -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/ModInfoBuilder.h"
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/DebugInfo/MSF/MSFBuilder.h"
+#include "llvm/DebugInfo/MSF/MSFCommon.h"
+#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
+#include "llvm/DebugInfo/PDB/Native/ModInfo.h"
+#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/Support/BinaryItemStream.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/COFF.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::msf;
+using namespace llvm::pdb;
+
+namespace llvm {
+template <> struct BinaryItemTraits<CVSymbol> {
+  static size_t length(const CVSymbol &Item) { return Item.RecordData.size(); }
+  // Each item is presented to the stream as its RecordData bytes.
+  static ArrayRef<uint8_t> bytes(const CVSymbol &Item) {
+    return Item.RecordData;
+  }
+};
+} // namespace llvm
+
+static uint32_t calculateDiSymbolStreamSize(uint32_t SymbolByteSize) {
+  // Stream layout: signature, symbol records, C11 lines, C13 lines, then the
+  // GlobalRefs substream (a size field that is always 0, and no payload).
+  const uint32_t Signature = sizeof(uint32_t);
+  const uint32_t Lines = 0; // TODO: Layout.LineBytes
+  const uint32_t C13 = 0;   // TODO: Layout.C13Bytes
+  const uint32_t GlobalRefs = sizeof(uint32_t) + 0;
+  return Signature + SymbolByteSize + Lines + C13 + GlobalRefs;
+}
+
+ModInfoBuilder::ModInfoBuilder(StringRef ModuleName, uint32_t ModIndex,
+                               msf::MSFBuilder &Msf)
+    : MSF(Msf), ModuleName(ModuleName) {
+  Layout.Mod = ModIndex; // Other header fields are assigned later.
+}
+
+uint16_t ModInfoBuilder::getStreamIndex() const { return Layout.ModDiStream; }
+
+void ModInfoBuilder::setObjFileName(StringRef Name) { ObjFileName = Name; }
+
+void ModInfoBuilder::addSymbol(CVSymbol Symbol) {
+  Symbols.push_back(Symbol);
+  SymbolByteSize += Symbol.data().size(); // Running total of record bytes.
+}
+
+void ModInfoBuilder::addSourceFile(StringRef Path) {
+  SourceFiles.push_back(Path); // Counted into Layout.NumFiles by finalize().
+}
+
+uint32_t ModInfoBuilder::calculateSerializedLength() const {
+  // Header, then both names as null-terminated strings, padded to 4 bytes.
+  uint32_t HeaderBytes = sizeof(Layout);
+  uint32_t NameBytes = (ModuleName.size() + 1) + (ObjFileName.size() + 1);
+  return alignTo(HeaderBytes + NameBytes, sizeof(uint32_t));
+}
+
+void ModInfoBuilder::finalize() {
+  Layout.C13Bytes = 0;     // No line data is written yet (see commit()).
+  Layout.FileNameOffs = 0; // TODO: Fix this
+  Layout.Flags = 0;        // TODO: Fix this
+  Layout.LineBytes = 0;    // No line data is written yet (see commit()).
+  (void)Layout.Mod;         // Set in constructor
+  (void)Layout.ModDiStream; // Set in finalizeMsfLayout
+  Layout.NumFiles = SourceFiles.size();
+  Layout.PdbFilePathNI = 0;
+  Layout.SrcFileNameNI = 0;
+
+  // This value includes both the signature field as well as the record bytes
+  // from the symbol stream.
+  Layout.SymBytes = SymbolByteSize + sizeof(uint32_t);
+}
+
+Error ModInfoBuilder::finalizeMsfLayout() {
+  this->Layout.ModDiStream = kInvalidStreamIndex; // Kept if addStream fails.
+  auto ExpectedSN = MSF.addStream(calculateDiSymbolStreamSize(SymbolByteSize));
+  if (!ExpectedSN)
+    return ExpectedSN.takeError();
+  Layout.ModDiStream = *ExpectedSN;
+  return Error::success();
+}
+
+Error ModInfoBuilder::commit(BinaryStreamWriter &ModiWriter,
+                             const msf::MSFLayout &MsfLayout,
+                             WritableBinaryStreamRef MsfBuffer) {
+  // Write the Modi record (header, two C strings, padding to 4 bytes) through
+  // `ModiWriter`; the symbol stream goes to the stream at Layout.ModDiStream.
+  if (auto EC = ModiWriter.writeObject(Layout))
+    return EC;
+  if (auto EC = ModiWriter.writeCString(ModuleName))
+    return EC;
+  if (auto EC = ModiWriter.writeCString(ObjFileName))
+    return EC;
+  if (auto EC = ModiWriter.padToAlignment(sizeof(uint32_t)))
+    return EC;
+
+  if (Layout.ModDiStream != kInvalidStreamIndex) {
+    auto NS = WritableMappedBlockStream::createIndexedStream(
+        MsfLayout, MsfBuffer, Layout.ModDiStream);
+    WritableBinaryStreamRef Ref(*NS);
+    BinaryStreamWriter SymbolWriter(Ref);
+    // Signature first, then the symbol records, then the GlobalRefs size.
+    if (auto EC =
+            SymbolWriter.writeInteger<uint32_t>(COFF::DEBUG_SECTION_MAGIC))
+      return EC;
+    BinaryItemStream<CVSymbol> Records(llvm::support::endianness::little);
+    Records.setItems(Symbols);
+    BinaryStreamRef RecordsRef(Records);
+    if (auto EC = SymbolWriter.writeStreamRef(RecordsRef))
+      return EC;
+    // TODO: Write C11 Line data
+    // TODO: Write C13 Line data
+    // TODO: Figure out what GlobalRefs substream actually is and populate it.
+    if (auto EC = SymbolWriter.writeInteger<uint32_t>(0))
+      return EC;
+    if (SymbolWriter.bytesRemaining() > 0) // Stream has bytes left unwritten.
+      return make_error<RawError>(raw_error_code::stream_too_long);
+  }
+  return Error::success();
+}
diff --git a/lib/DebugInfo/PDB/Native/ModStream.cpp b/lib/DebugInfo/PDB/Native/ModStream.cpp
index 25370f26ec3ba2c4641273f9e1be4c53a649bfba..08798cf0ed286f63fd3f101393d0173f345492f7 100644
--- a/lib/DebugInfo/PDB/Native/ModStream.cpp
+++ b/lib/DebugInfo/PDB/Native/ModStream.cpp
@@ -10,12 +10,12 @@
 #include "llvm/DebugInfo/PDB/Native/ModStream.h"
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
 #include "llvm/DebugInfo/PDB/Native/ModInfo.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamRef.h"
 #include "llvm/Support/Error.h"
 #include <algorithm>
 #include <cstdint>
@@ -31,7 +31,7 @@ ModStream::ModStream(const ModInfo &Module,
 ModStream::~ModStream() = default;
 
 Error ModStream::reload() {
-  StreamReader Reader(*Stream);
+  BinaryStreamReader Reader(*Stream);
 
   uint32_t SymbolSize = Mod.getSymbolDebugInfoByteSize();
   uint32_t C11Size = Mod.getLineInfoByteSize();
@@ -41,7 +41,7 @@ Error ModStream::reload() {
     return make_error<RawError>(raw_error_code::corrupt_file,
                                 "Module has both C11 and C13 line info");
 
-  ReadableStreamRef S;
+  BinaryStreamRef S;
 
   if (auto EC = Reader.readInteger(Signature))
     return EC;
@@ -53,7 +53,7 @@ Error ModStream::reload() {
   if (auto EC = Reader.readStreamRef(C13LinesSubstream, C13Size))
     return EC;
 
-  StreamReader LineReader(C13LinesSubstream);
+  BinaryStreamReader LineReader(C13LinesSubstream);
   if (auto EC = LineReader.readArray(LineInfo, LineReader.bytesRemaining()))
     return EC;
 
diff --git a/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp b/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp
index 47fd0e1ba24caf0ec7b8ff761aeb2ef1f4f2854f..c7ba32b82bc6be11e9a6e62d17bcb428dfcb2add 100644
--- a/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp
+++ b/lib/DebugInfo/PDB/Native/NamedStreamMap.cpp
@@ -13,20 +13,19 @@
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/iterator_range.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
 #include "llvm/DebugInfo/PDB/Native/HashTable.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Error.h"
 #include <algorithm>
 #include <cstdint>
 
 using namespace llvm;
-using namespace llvm::msf;
 using namespace llvm::pdb;
 
 NamedStreamMap::NamedStreamMap() = default;
 
-Error NamedStreamMap::load(StreamReader &Stream) {
+Error NamedStreamMap::load(BinaryStreamReader &Stream) {
   Mapping.clear();
   FinalizedHashTable.clear();
   FinalizedInfo.reset();
@@ -37,7 +36,7 @@ Error NamedStreamMap::load(StreamReader &Stream) {
                       make_error<RawError>(raw_error_code::corrupt_file,
                                            "Expected string buffer size"));
 
-  msf::ReadableStreamRef StringsBuffer;
+  BinaryStreamRef StringsBuffer;
   if (auto EC = Stream.readStreamRef(StringsBuffer, StringBufferSize))
     return EC;
 
@@ -51,11 +50,11 @@ Error NamedStreamMap::load(StreamReader &Stream) {
     std::tie(NameOffset, NameIndex) = Entry;
 
     // Compute the offset of the start of the string relative to the stream.
-    msf::StreamReader NameReader(StringsBuffer);
+    BinaryStreamReader NameReader(StringsBuffer);
     NameReader.setOffset(NameOffset);
     // Pump out our c-string from the stream.
     StringRef Str;
-    if (auto EC = NameReader.readZeroString(Str))
+    if (auto EC = NameReader.readCString(Str))
       return joinErrors(std::move(EC),
                         make_error<RawError>(raw_error_code::corrupt_file,
                                              "Expected name map name"));
@@ -67,17 +66,16 @@ Error NamedStreamMap::load(StreamReader &Stream) {
   return Error::success();
 }
 
-Error NamedStreamMap::commit(msf::StreamWriter &Writer) const {
+Error NamedStreamMap::commit(BinaryStreamWriter &Writer) const {
   assert(FinalizedInfo.hasValue());
 
   // The first field is the number of bytes of string data.
-  if (auto EC = Writer.writeInteger(
-          FinalizedInfo->StringDataBytes)) // Number of bytes of string data
+  if (auto EC = Writer.writeInteger(FinalizedInfo->StringDataBytes))
     return EC;
 
   // Now all of the string data itself.
   for (const auto &Item : Mapping) {
-    if (auto EC = Writer.writeZeroString(Item.getKey()))
+    if (auto EC = Writer.writeCString(Item.getKey()))
       return EC;
   }
 
@@ -116,6 +114,8 @@ NamedStreamMap::entries() const {
                                                       Mapping.end());
 }
 
+uint32_t NamedStreamMap::size() const { return Mapping.size(); }
+
 bool NamedStreamMap::get(StringRef Stream, uint32_t &StreamNo) const {
   auto Iter = Mapping.find(Stream);
   if (Iter == Mapping.end())
diff --git a/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp b/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..9c0cc0bf82337b6756a001cbff5c0fb72c56e9e9
--- /dev/null
+++ b/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
@@ -0,0 +1,43 @@
+//===- NativeCompilandSymbol.cpp - Native impl for compilands ---*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h"
+
+namespace llvm {
+namespace pdb {
+
+NativeCompilandSymbol::NativeCompilandSymbol(NativeSession &Session,
+                                             const ModuleInfoEx &MI)
+    : NativeRawSymbol(Session), Module(MI) {}
+
+PDB_SymType NativeCompilandSymbol::getSymTag() const {
+  return PDB_SymType::Compiland; // Same tag for every instance.
+}
+
+bool NativeCompilandSymbol::isEditAndContinueEnabled() const {
+  return Module.Info.hasECInfo(); // Read from this module's Info record.
+}
+
+uint32_t NativeCompilandSymbol::getLexicalParentId() const { return 0; }
+
+// The usage of getObjFileName for getLibraryName and getModuleName for getName
+// may seem backwards, but it is consistent with DIA, which is what this API
+// was modeled after.  We may rename these methods later to try to eliminate
+// this potential confusion.
+
+std::string NativeCompilandSymbol::getLibraryName() const {
+  return Module.Info.getObjFileName();
+}
+
+std::string NativeCompilandSymbol::getName() const {
+  return Module.Info.getModuleName();
+}
+
+} // namespace pdb
+} // namespace llvm
diff --git a/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp b/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7532110d005c924d78e878a6c10bb40be77c138d
--- /dev/null
+++ b/lib/DebugInfo/PDB/Native/NativeEnumModules.cpp
@@ -0,0 +1,52 @@
+//==- NativeEnumModules.cpp - Native Symbol Enumerator impl ------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativeEnumModules.h"
+
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBSymbol.h"
+#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
+
+namespace llvm {
+namespace pdb {
+
+NativeEnumModules::NativeEnumModules(NativeSession &PDBSession,
+                                     ArrayRef<ModuleInfoEx> Modules,
+                                     uint32_t Index)
+    : Session(PDBSession), Modules(Modules), Index(Index) {}
+
+uint32_t NativeEnumModules::getChildCount() const {
+  return static_cast<uint32_t>(Modules.size());
+}
+
+std::unique_ptr<PDBSymbol>
+NativeEnumModules::getChildAtIndex(uint32_t Index) const {
+  if (Index >= Modules.size())
+    return nullptr;
+  return std::unique_ptr<PDBSymbol>(new PDBSymbolCompiland(Session,
+      std::unique_ptr<IPDBRawSymbol>(
+          new NativeCompilandSymbol(Session, Modules[Index]))));
+}
+
+std::unique_ptr<PDBSymbol> NativeEnumModules::getNext() {
+  // The cursor only advances while it still refers to an existing module.
+  const bool Exhausted = Index >= Modules.size();
+  return Exhausted ? nullptr : getChildAtIndex(Index++);
+}
+
+void NativeEnumModules::reset() { Index = 0; }
+
+NativeEnumModules *NativeEnumModules::clone() const {
+  return new NativeEnumModules(Session, Modules, Index);
+}
+
+}
+}
diff --git a/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp b/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ec2a4b87457c1a872df356652c7f629089437efb
--- /dev/null
+++ b/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp
@@ -0,0 +1,79 @@
+//===- NativeExeSymbol.cpp - native impl for PDBSymbolExe -------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativeExeSymbol.h"
+
+#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Native/NativeEnumModules.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+
+namespace llvm {
+namespace pdb {
+
+NativeExeSymbol::NativeExeSymbol(NativeSession &Session)
+    : NativeRawSymbol(Session), File(Session.getPDBFile()) {}
+
+std::unique_ptr<IPDBEnumSymbols>
+NativeExeSymbol::findChildren(PDB_SymType Type) const {
+  // Compilands are the only child kind enumerated here; every other symbol
+  // type yields no enumerator.
+  if (Type != PDB_SymType::Compiland)
+    return nullptr;
+
+  auto ExpectedDbi = File.getPDBDbiStream();
+  if (!ExpectedDbi) {
+    // The DBI stream could not be obtained: discard the error and report no
+    // children.
+    consumeError(ExpectedDbi.takeError());
+    return nullptr;
+  }
+
+  return std::unique_ptr<IPDBEnumSymbols>(
+      new NativeEnumModules(Session, ExpectedDbi->modules()));
+}
+
+uint32_t NativeExeSymbol::getAge() const {
+  if (auto InfoOrErr = File.getPDBInfoStream())
+    return InfoOrErr->getAge();
+  else
+    consumeError(InfoOrErr.takeError()); // no info stream: report age 0
+  return 0;
+}
+
+std::string NativeExeSymbol::getSymbolsFileName() const {
+  return File.getFilePath();
+}
+
+PDB_UniqueId NativeExeSymbol::getGuid() const {
+  if (auto InfoOrErr = File.getPDBInfoStream())
+    return InfoOrErr->getGuid();
+  else
+    consumeError(InfoOrErr.takeError()); // no info stream: all-zero GUID
+  return PDB_UniqueId{{0}};
+}
+
+bool NativeExeSymbol::hasCTypes() const {
+  if (auto DbiOrErr = File.getPDBDbiStream())
+    return DbiOrErr->hasCTypes();
+  else
+    consumeError(DbiOrErr.takeError()); // no DBI stream: answer false
+  return false;
+}
+
+bool NativeExeSymbol::hasPrivateSymbols() const {
+  if (auto DbiOrErr = File.getPDBDbiStream())
+    return !DbiOrErr->isStripped();
+  else
+    consumeError(DbiOrErr.takeError()); // no DBI stream: answer false
+  return false;
+}
+
+} // namespace pdb
+} // namespace llvm
diff --git a/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4841ded7410268533ba5e720e8a79aad999d282b
--- /dev/null
+++ b/lib/DebugInfo/PDB/Native/NativeRawSymbol.cpp
@@ -0,0 +1,699 @@
+//===- NativeRawSymbol.cpp - Native implementation of IPDBRawSymbol -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/PDBExtras.h"
+#include "llvm/Support/ConvertUTF.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+NativeRawSymbol::NativeRawSymbol(NativeSession &PDBSession)
+  : Session(PDBSession) {}
+
+void NativeRawSymbol::dump(raw_ostream &OS, int Indent) const {}
+
+std::unique_ptr<IPDBEnumSymbols>
+NativeRawSymbol::findChildren(PDB_SymType Type) const {
+  return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+NativeRawSymbol::findChildren(PDB_SymType Type, StringRef Name,
+    PDB_NameSearchFlags Flags) const {
+  return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+NativeRawSymbol::findChildrenByRVA(PDB_SymType Type, StringRef Name,
+    PDB_NameSearchFlags Flags, uint32_t RVA) const {
+  return nullptr;
+}
+
+std::unique_ptr<IPDBEnumSymbols>
+NativeRawSymbol::findInlineFramesByRVA(uint32_t RVA) const {
+  return nullptr;
+}
+
+void NativeRawSymbol::getDataBytes(llvm::SmallVector<uint8_t, 32> &bytes) const {
+  bytes.clear();
+}
+
+PDB_MemberAccess NativeRawSymbol::getAccess() const {
+  return PDB_MemberAccess::Private;
+}
+
+uint32_t NativeRawSymbol::getAddressOffset() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getAddressSection() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getAge() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getArrayIndexTypeId() const {
+  return 0;
+}
+
+void NativeRawSymbol::getBackEndVersion(VersionInfo &Version) const {
+  Version.Major = 0;
+  Version.Minor = 0;
+  Version.Build = 0;
+  Version.QFE = 0;
+}
+
+uint32_t NativeRawSymbol::getBaseDataOffset() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getBaseDataSlot() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getBaseSymbolId() const {
+  return 0;
+}
+
+PDB_BuiltinType NativeRawSymbol::getBuiltinType() const {
+  return PDB_BuiltinType::None;
+}
+
+uint32_t NativeRawSymbol::getBitPosition() const {
+  return 0;
+}
+
+PDB_CallingConv NativeRawSymbol::getCallingConvention() const {
+  return PDB_CallingConv::FarStdCall;
+}
+
+uint32_t NativeRawSymbol::getClassParentId() const {
+  return 0;
+}
+
+// Default: no compiler name. This must be "" rather than 0, because building
+// a std::string from a null pointer is undefined behavior.
+std::string NativeRawSymbol::getCompilerName() const { return ""; }
+
+uint32_t NativeRawSymbol::getCount() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getCountLiveRanges() const {
+  return 0;
+}
+
+void NativeRawSymbol::getFrontEndVersion(VersionInfo &Version) const {
+  Version.Major = 0;
+  Version.Minor = 0;
+  Version.Build = 0;
+  Version.QFE = 0;
+}
+
+PDB_Lang NativeRawSymbol::getLanguage() const {
+  return PDB_Lang::Cobol;
+}
+
+uint32_t NativeRawSymbol::getLexicalParentId() const {
+  return 0;
+}
+
+std::string NativeRawSymbol::getLibraryName() const {
+  return "";
+}
+
+uint32_t NativeRawSymbol::getLiveRangeStartAddressOffset() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getLiveRangeStartAddressSection() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getLiveRangeStartRelativeVirtualAddress() const {
+  return 0;
+}
+
+codeview::RegisterId NativeRawSymbol::getLocalBasePointerRegisterId() const {
+  return codeview::RegisterId::EAX;
+}
+
+uint32_t NativeRawSymbol::getLowerBoundId() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getMemorySpaceKind() const {
+  return 0;
+}
+
+// Default: empty name. This must be "" rather than 0, because building a
+// std::string from a null pointer is undefined behavior.
+std::string NativeRawSymbol::getName() const { return ""; }
+
+uint32_t NativeRawSymbol::getNumberOfAcceleratorPointerTags() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getNumberOfColumns() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getNumberOfModifiers() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getNumberOfRegisterIndices() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getNumberOfRows() const {
+  return 0;
+}
+
+std::string NativeRawSymbol::getObjectFileName() const {
+  return "";
+}
+
+uint32_t NativeRawSymbol::getOemId() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getOemSymbolId() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getOffsetInUdt() const {
+  return 0;
+}
+
+PDB_Cpu NativeRawSymbol::getPlatform() const {
+  return PDB_Cpu::Intel8080;
+}
+
+uint32_t NativeRawSymbol::getRank() const {
+  return 0;
+}
+
+codeview::RegisterId NativeRawSymbol::getRegisterId() const {
+  return codeview::RegisterId::EAX;
+}
+
+uint32_t NativeRawSymbol::getRegisterType() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getRelativeVirtualAddress() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getSamplerSlot() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getSignature() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getSizeInUdt() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getSlot() const {
+  return 0;
+}
+
+// Default: no source file name. This must be "" rather than 0, because
+// building a std::string from a null pointer is undefined behavior.
+std::string NativeRawSymbol::getSourceFileName() const { return ""; }
+
+uint32_t NativeRawSymbol::getStride() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getSubTypeId() const {
+  return 0;
+}
+
+std::string NativeRawSymbol::getSymbolsFileName() const { return ""; }
+
+uint32_t NativeRawSymbol::getSymIndexId() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getTargetOffset() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getTargetRelativeVirtualAddress() const {
+  return 0;
+}
+
+uint64_t NativeRawSymbol::getTargetVirtualAddress() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getTargetSection() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getTextureSlot() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getTimeStamp() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getToken() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getTypeId() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getUavSlot() const {
+  return 0;
+}
+
+// Default: no undecorated name. This must be "" rather than 0, because
+// building a std::string from a null pointer is undefined behavior.
+std::string NativeRawSymbol::getUndecoratedName() const { return ""; }
+
+uint32_t NativeRawSymbol::getUnmodifiedTypeId() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getUpperBoundId() const {
+  return 0;
+}
+
+Variant NativeRawSymbol::getValue() const {
+  return Variant();
+}
+
+uint32_t NativeRawSymbol::getVirtualBaseDispIndex() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getVirtualBaseOffset() const {
+  return 0;
+}
+
+uint32_t NativeRawSymbol::getVirtualTableShapeId() const {
+  return 0;
+}
+
+PDB_DataKind NativeRawSymbol::getDataKind() const {
+  return PDB_DataKind::Unknown;
+}
+
+PDB_SymType NativeRawSymbol::getSymTag() const {
+  return PDB_SymType::None;
+}
+
+PDB_UniqueId NativeRawSymbol::getGuid() const {
+  return PDB_UniqueId{{0}};
+}
+
+int32_t NativeRawSymbol::getOffset() const {
+  return 0;
+}
+
+int32_t NativeRawSymbol::getThisAdjust() const {
+  return 0;
+}
+
+int32_t NativeRawSymbol::getVirtualBasePointerOffset() const {
+  return 0;
+}
+
+PDB_LocType NativeRawSymbol::getLocationType() const {
+  return PDB_LocType::Null;
+}
+
+PDB_Machine NativeRawSymbol::getMachineType() const {
+  return PDB_Machine::Invalid;
+}
+
+codeview::ThunkOrdinal NativeRawSymbol::getThunkOrdinal() const {
+  return codeview::ThunkOrdinal::Standard;
+}
+
+uint64_t NativeRawSymbol::getLength() const {
+  return 0;
+}
+
+uint64_t NativeRawSymbol::getLiveRangeLength() const {
+  return 0;
+}
+
+uint64_t NativeRawSymbol::getVirtualAddress() const {
+  return 0;
+}
+
+PDB_UdtType NativeRawSymbol::getUdtKind() const {
+  return PDB_UdtType::Struct;
+}
+
+bool NativeRawSymbol::hasConstructor() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasCustomCallingConvention() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasFarReturn() const {
+  return false;
+}
+
+bool NativeRawSymbol::isCode() const {
+  return false;
+}
+
+bool NativeRawSymbol::isCompilerGenerated() const {
+  return false;
+}
+
+bool NativeRawSymbol::isConstType() const {
+  return false;
+}
+
+bool NativeRawSymbol::isEditAndContinueEnabled() const {
+  return false;
+}
+
+bool NativeRawSymbol::isFunction() const {
+  return false;
+}
+
+bool NativeRawSymbol::getAddressTaken() const {
+  return false;
+}
+
+bool NativeRawSymbol::getNoStackOrdering() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasAlloca() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasAssignmentOperator() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasCTypes() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasCastOperator() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasDebugInfo() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasEH() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasEHa() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasInlAsm() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasInlineAttribute() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasInterruptReturn() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasFramePointer() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasLongJump() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasManagedCode() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasNestedTypes() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasNoInlineAttribute() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasNoReturnAttribute() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasOptimizedCodeDebugInfo() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasOverloadedOperator() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasSEH() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasSecurityChecks() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasSetJump() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasStrictGSCheck() const {
+  return false;
+}
+
+bool NativeRawSymbol::isAcceleratorGroupSharedLocal() const {
+  return false;
+}
+
+bool NativeRawSymbol::isAcceleratorPointerTagLiveRange() const {
+  return false;
+}
+
+bool NativeRawSymbol::isAcceleratorStubFunction() const {
+  return false;
+}
+
+bool NativeRawSymbol::isAggregated() const {
+  return false;
+}
+
+bool NativeRawSymbol::isIntroVirtualFunction() const {
+  return false;
+}
+
+bool NativeRawSymbol::isCVTCIL() const {
+  return false;
+}
+
+bool NativeRawSymbol::isConstructorVirtualBase() const {
+  return false;
+}
+
+bool NativeRawSymbol::isCxxReturnUdt() const {
+  return false;
+}
+
+bool NativeRawSymbol::isDataAligned() const {
+  return false;
+}
+
+bool NativeRawSymbol::isHLSLData() const {
+  return false;
+}
+
+bool NativeRawSymbol::isHotpatchable() const {
+  return false;
+}
+
+bool NativeRawSymbol::isIndirectVirtualBaseClass() const {
+  return false;
+}
+
+bool NativeRawSymbol::isInterfaceUdt() const {
+  return false;
+}
+
+bool NativeRawSymbol::isIntrinsic() const {
+  return false;
+}
+
+bool NativeRawSymbol::isLTCG() const {
+  return false;
+}
+
+bool NativeRawSymbol::isLocationControlFlowDependent() const {
+  return false;
+}
+
+bool NativeRawSymbol::isMSILNetmodule() const {
+  return false;
+}
+
+bool NativeRawSymbol::isMatrixRowMajor() const {
+  return false;
+}
+
+bool NativeRawSymbol::isManagedCode() const {
+  return false;
+}
+
+bool NativeRawSymbol::isMSILCode() const {
+  return false;
+}
+
+bool NativeRawSymbol::isMultipleInheritance() const {
+  return false;
+}
+
+bool NativeRawSymbol::isNaked() const {
+  return false;
+}
+
+bool NativeRawSymbol::isNested() const {
+  return false;
+}
+
+bool NativeRawSymbol::isOptimizedAway() const {
+  return false;
+}
+
+bool NativeRawSymbol::isPacked() const {
+  return false;
+}
+
+bool NativeRawSymbol::isPointerBasedOnSymbolValue() const {
+  return false;
+}
+
+bool NativeRawSymbol::isPointerToDataMember() const {
+  return false;
+}
+
+bool NativeRawSymbol::isPointerToMemberFunction() const {
+  return false;
+}
+
+bool NativeRawSymbol::isPureVirtual() const {
+  return false;
+}
+
+bool NativeRawSymbol::isRValueReference() const {
+  return false;
+}
+
+bool NativeRawSymbol::isRefUdt() const {
+  return false;
+}
+
+bool NativeRawSymbol::isReference() const {
+  return false;
+}
+
+bool NativeRawSymbol::isRestrictedType() const {
+  return false;
+}
+
+bool NativeRawSymbol::isReturnValue() const {
+  return false;
+}
+
+bool NativeRawSymbol::isSafeBuffers() const {
+  return false;
+}
+
+bool NativeRawSymbol::isScoped() const {
+  return false;
+}
+
+bool NativeRawSymbol::isSdl() const {
+  return false;
+}
+
+bool NativeRawSymbol::isSingleInheritance() const {
+  return false;
+}
+
+bool NativeRawSymbol::isSplitted() const {
+  return false;
+}
+
+bool NativeRawSymbol::isStatic() const {
+  return false;
+}
+
+bool NativeRawSymbol::hasPrivateSymbols() const {
+  return false;
+}
+
+bool NativeRawSymbol::isUnalignedType() const {
+  return false;
+}
+
+bool NativeRawSymbol::isUnreached() const {
+  return false;
+}
+
+bool NativeRawSymbol::isValueUdt() const {
+  return false;
+}
+
+bool NativeRawSymbol::isVirtual() const {
+  return false;
+}
+
+bool NativeRawSymbol::isVirtualBaseClass() const {
+  return false;
+}
+
+bool NativeRawSymbol::isVirtualInheritance() const {
+  return false;
+}
+
+bool NativeRawSymbol::isVolatileType() const {
+  return false;
+}
+
+bool NativeRawSymbol::wasInlined() const {
+  return false;
+}
+
+std::string NativeRawSymbol::getUnused() const {
+  return "";
+}
diff --git a/lib/DebugInfo/PDB/Native/NativeSession.cpp b/lib/DebugInfo/PDB/Native/NativeSession.cpp
index e2c23317511511ef9eb497c4e21d0a27ab54525a..3a83a326cfe63bee7b39b57fb2e53bfb8e3acd15 100644
--- a/lib/DebugInfo/PDB/Native/NativeSession.cpp
+++ b/lib/DebugInfo/PDB/Native/NativeSession.cpp
@@ -10,15 +10,16 @@
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
 #include "llvm/DebugInfo/PDB/GenericError.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/IPDBSourceFile.h"
+#include "llvm/DebugInfo/PDB/Native/NativeExeSymbol.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/BinaryByteStream.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/MemoryBuffer.h"
@@ -44,10 +45,11 @@ Error NativeSession::createFromPdb(StringRef Path,
     return make_error<GenericError>(generic_error_code::invalid_path);
 
   std::unique_ptr<MemoryBuffer> Buffer = std::move(*ErrorOrBuffer);
-  auto Stream = llvm::make_unique<MemoryBufferByteStream>(std::move(Buffer));
+  auto Stream = llvm::make_unique<MemoryBufferByteStream>(
+      std::move(Buffer), llvm::support::little);
 
   auto Allocator = llvm::make_unique<BumpPtrAllocator>();
-  auto File = llvm::make_unique<PDBFile>(std::move(Stream), *Allocator);
+  auto File = llvm::make_unique<PDBFile>(Path, std::move(Stream), *Allocator);
   if (auto EC = File->parseFileHeaders())
     return EC;
   if (auto EC = File->parseStreamData())
@@ -68,8 +70,12 @@ uint64_t NativeSession::getLoadAddress() const { return 0; }
 
 void NativeSession::setLoadAddress(uint64_t Address) {}
 
-std::unique_ptr<PDBSymbolExe> NativeSession::getGlobalScope() const {
-  return nullptr;
+std::unique_ptr<PDBSymbolExe> NativeSession::getGlobalScope() {
+  // Build the generic symbol around a native exe symbol, then narrow ownership.
+  auto Generic =
+      PDBSymbol::create(*this, llvm::make_unique<NativeExeSymbol>(*this));
+  return std::unique_ptr<PDBSymbolExe>(
+      static_cast<PDBSymbolExe *>(Generic.release()));
+}
 
 std::unique_ptr<PDBSymbol>
diff --git a/lib/DebugInfo/PDB/Native/PDBFile.cpp b/lib/DebugInfo/PDB/Native/PDBFile.cpp
index 02e883b841868e001163d1de7c41a000eeecd49e..943e7fa13ab76f019abcd25d5cb125c6a2a6922a 100644
--- a/lib/DebugInfo/PDB/Native/PDBFile.cpp
+++ b/lib/DebugInfo/PDB/Native/PDBFile.cpp
@@ -12,9 +12,6 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/DebugInfo/MSF/MSFCommon.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamInterface.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
 #include "llvm/DebugInfo/PDB/Native/DbiStream.h"
 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
 #include "llvm/DebugInfo/PDB/Native/InfoStream.h"
@@ -23,8 +20,12 @@
 #include "llvm/DebugInfo/PDB/Native/StringTable.h"
 #include "llvm/DebugInfo/PDB/Native/SymbolStream.h"
 #include "llvm/DebugInfo/PDB/Native/TpiStream.h"
+#include "llvm/Support/BinaryStream.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
+#include "llvm/Support/Path.h"
 #include <algorithm>
 #include <cassert>
 #include <cstdint>
@@ -38,12 +39,18 @@ namespace {
 typedef FixedStreamArray<support::ulittle32_t> ulittle_array;
 } // end anonymous namespace
 
-PDBFile::PDBFile(std::unique_ptr<ReadableStream> PdbFileBuffer,
+PDBFile::PDBFile(StringRef Path, std::unique_ptr<BinaryStream> PdbFileBuffer,
                  BumpPtrAllocator &Allocator)
-    : Allocator(Allocator), Buffer(std::move(PdbFileBuffer)) {}
+    : FilePath(Path), Allocator(Allocator), Buffer(std::move(PdbFileBuffer)) {}
 
 PDBFile::~PDBFile() = default;
 
+StringRef PDBFile::getFilePath() const { return FilePath; }
+
+StringRef PDBFile::getFileDirectory() const {
+  return sys::path::parent_path(FilePath);
+}
+
 uint32_t PDBFile::getBlockSize() const { return ContainerLayout.SB->BlockSize; }
 
 uint32_t PDBFile::getFreeBlockMapBlock() const {
@@ -106,7 +113,7 @@ Error PDBFile::setBlockData(uint32_t BlockIndex, uint32_t Offset,
 }
 
 Error PDBFile::parseFileHeaders() {
-  StreamReader Reader(*Buffer);
+  BinaryStreamReader Reader(*Buffer);
 
   // Initialize SB.
   const msf::SuperBlock *SB = nullptr;
@@ -140,7 +147,7 @@ Error PDBFile::parseFileHeaders() {
   // See the function fpmPn() for more information:
   // https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/msf/msf.cpp#L489
   auto FpmStream = MappedBlockStream::createFpmStream(ContainerLayout, *Buffer);
-  StreamReader FpmReader(*FpmStream);
+  BinaryStreamReader FpmReader(*FpmStream);
   ArrayRef<uint8_t> FpmBytes;
   if (auto EC = FpmReader.readBytes(FpmBytes,
                                     msf::getFullFpmByteSize(ContainerLayout)))
@@ -178,7 +185,7 @@ Error PDBFile::parseStreamData() {
   // subclass of IPDBStreamData which only accesses the fields that have already
   // been parsed, we can avoid this and reuse MappedBlockStream.
   auto DS = MappedBlockStream::createDirectoryStream(ContainerLayout, *Buffer);
-  StreamReader Reader(*DS);
+  BinaryStreamReader Reader(*DS);
   if (auto EC = Reader.readInteger(NumStreams))
     return EC;
 
@@ -343,7 +350,7 @@ Expected<StringTable &> PDBFile::getStringTable() {
     if (!NS)
       return NS.takeError();
 
-    StreamReader Reader(**NS);
+    BinaryStreamReader Reader(**NS);
     auto N = llvm::make_unique<StringTable>();
     if (auto EC = N->load(Reader))
       return std::move(EC);
@@ -389,14 +396,13 @@ bool PDBFile::hasStringTable() {
   return IS->getNamedStreamIndex("/names") < getNumStreams();
 }
 
-/// Wrapper around MappedBlockStream::createIndexedStream()
-/// that checks if a stream with that index actually exists.
-/// If it does not, the return value will have an MSFError with
-/// code msf_error_code::no_stream. Else, the return value will
-/// contain the stream returned by createIndexedStream().
+/// Wrapper around MappedBlockStream::createIndexedStream() that checks if a
+/// stream with that index actually exists.  If it does not, the return value
+/// will have an MSFError with code msf_error_code::no_stream.  Else, the return
+/// value will contain the stream returned by createIndexedStream().
 Expected<std::unique_ptr<MappedBlockStream>>
 PDBFile::safelyCreateIndexedStream(const MSFLayout &Layout,
-                                   const ReadableStream &MsfData,
+                                   BinaryStreamRef MsfData,
                                    uint32_t StreamIndex) const {
   if (StreamIndex >= getNumStreams())
     return make_error<RawError>(raw_error_code::no_stream);
diff --git a/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp b/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
index 57ca2ba488ae2298ada405805ef9b2f141251269..b3c84903bc7e7f9c0402fe15cd48a663ea5e0676 100644
--- a/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
@@ -12,8 +12,6 @@
 #include "llvm/ADT/BitVector.h"
 
 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
-#include "llvm/DebugInfo/MSF/StreamInterface.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
 #include "llvm/DebugInfo/PDB/GenericError.h"
 #include "llvm/DebugInfo/PDB/Native/DbiStream.h"
 #include "llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h"
@@ -23,6 +21,8 @@
 #include "llvm/DebugInfo/PDB/Native/StringTableBuilder.h"
 #include "llvm/DebugInfo/PDB/Native/TpiStream.h"
 #include "llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h"
+#include "llvm/Support/BinaryStream.h"
+#include "llvm/Support/BinaryStreamWriter.h"
 
 using namespace llvm;
 using namespace llvm::codeview;
@@ -118,8 +118,9 @@ Error PDBFileBuilder::commit(StringRef Filename) {
   if (OutFileOrError.getError())
     return llvm::make_error<pdb::GenericError>(generic_error_code::invalid_path,
                                                Filename);
-  FileBufferByteStream Buffer(std::move(*OutFileOrError));
-  StreamWriter Writer(Buffer);
+  FileBufferByteStream Buffer(std::move(*OutFileOrError),
+                              llvm::support::little);
+  BinaryStreamWriter Writer(Buffer);
 
   if (auto EC = Writer.writeObject(*Layout.SB))
     return EC;
@@ -131,9 +132,8 @@ Error PDBFileBuilder::commit(StringRef Filename) {
 
   auto DirStream =
       WritableMappedBlockStream::createDirectoryStream(Layout, Buffer);
-  StreamWriter DW(*DirStream);
-  if (auto EC =
-          DW.writeInteger(static_cast<uint32_t>(Layout.StreamSizes.size())))
+  BinaryStreamWriter DW(*DirStream);
+  if (auto EC = DW.writeInteger<uint32_t>(Layout.StreamSizes.size()))
     return EC;
 
   if (auto EC = DW.writeArray(Layout.StreamSizes))
@@ -150,7 +150,7 @@ Error PDBFileBuilder::commit(StringRef Filename) {
 
   auto NS = WritableMappedBlockStream::createIndexedStream(Layout, Buffer,
                                                            StringTableStreamNo);
-  StreamWriter NSWriter(*NS);
+  BinaryStreamWriter NSWriter(*NS);
   if (auto EC = Strings.commit(NSWriter))
     return EC;
 
diff --git a/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp b/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..629f3e80b0ed57b1c102bef62a8595a92071321a
--- /dev/null
+++ b/lib/DebugInfo/PDB/Native/PDBTypeServerHandler.cpp
@@ -0,0 +1,119 @@
+//===- PDBTypeServerHandler.cpp ---------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Handles CodeView LF_TYPESERVER2 records by attempting to locate a matching
+// PDB file, then loading the PDB file and visiting all types from the
+// referenced PDB using the original supplied visitor.
+//
+// The net effect of this is that when visiting a PDB containing a TypeServer
+// record, the TypeServer record is "replaced" with all of the records in
+// the referenced PDB file.  If a single instance of PDBTypeServerHandler
+// encounters the same TypeServer multiple times (for example reusing one
+// PDBTypeServerHandler across multiple visitations of distinct object files or
+// PDB files), PDBTypeServerHandler will optionally revisit all the records
+// again, or simply consume the record and do nothing.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h"
+
+#include "llvm/DebugInfo/CodeView/CodeViewError.h"
+#include "llvm/DebugInfo/PDB/GenericError.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
+#include "llvm/DebugInfo/PDB/PDB.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Path.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::pdb;
+
+static void ignoreErrors(Error EC) {
+  consumeError(std::move(EC)); // mark EC as handled and discard its payload
+}
+
+PDBTypeServerHandler::PDBTypeServerHandler(bool RevisitAlways)
+    : RevisitAlways(RevisitAlways) {}
+
+// Registers Path as a lookup directory; an empty path or one that is not a
+// directory is silently ignored.
+void PDBTypeServerHandler::addSearchPath(StringRef Path) {
+  if (!Path.empty() && sys::fs::is_directory(Path))
+    SearchPaths.push_back(Path);
+}
+
+// Replays all TPI type records of File through Callbacks; true on success.
+Expected<bool>
+PDBTypeServerHandler::handleInternal(PDBFile &File,
+                                     TypeVisitorCallbacks &Callbacks) {
+  auto Tpi = File.getPDBTpiStream();
+  if (!Tpi)
+    return Tpi.takeError();
+
+  CVTypeVisitor TypeVisitor(Callbacks);
+  if (Error VisitErr = TypeVisitor.visitTypeStream(Tpi->types(nullptr)))
+    return std::move(VisitErr);
+  return true;
+}
+
+// Resolves a TypeServer2 record against the cached session or, failing that, a
+// PDB on the search paths whose name and GUID match. False means no match.
+Expected<bool> PDBTypeServerHandler::handle(TypeServer2Record &TS,
+                                            TypeVisitorCallbacks &Callbacks) {
+  if (Session) {
+    // A session is already cached. When each TypeServer is handled only once,
+    // consume the record without doing anything; otherwise replay its types.
+    if (!RevisitAlways)
+      return true;
+    return handleInternal(Session->getPDBFile(), Callbacks);
+  }
+
+  StringRef File = sys::path::filename(TS.Name);
+  if (File.empty())
+    return make_error<CodeViewError>(
+        cv_error_code::corrupt_record,
+        "TypeServer2Record does not contain filename!");
+
+  for (auto Path : SearchPaths) { // by-value copy: append() mutates it
+    sys::path::append(Path, File);
+    if (!sys::fs::exists(Path))
+      continue;
+
+    std::unique_ptr<IPDBSession> ThisSession;
+    if (auto EC = loadDataForPDB(PDB_ReaderType::Native, Path, ThisSession)) {
+      // A PDB that fails to load is simply not a match; keep searching.
+      ignoreErrors(std::move(EC));
+      continue;
+    }
+
+    std::unique_ptr<NativeSession> NS(
+        static_cast<NativeSession *>(ThisSession.release()));
+    PDBFile &Pdb = NS->getPDBFile();
+    auto ExpectedInfo = Pdb.getPDBInfoStream();
+    if (!ExpectedInfo) // All PDB files should have an Info stream.
+      return ExpectedInfo.takeError();
+
+    // A same-named, loadable PDB only matches if its InfoStream GUID equals
+    // the GUID in the TypeServer2 record.  Copy the GUID into a local first so
+    // the byte view below never refers to an already-destroyed temporary.
+    const auto InfoGuid = ExpectedInfo->getGuid();
+    ArrayRef<uint8_t> GuidBytes(InfoGuid.Guid);
+    StringRef GuidStr(reinterpret_cast<const char *>(GuidBytes.begin()),
+                      GuidBytes.size());
+    if (GuidStr != TS.Guid)
+      continue;
+
+    Session = std::move(NS);
+    return handleInternal(Pdb, Callbacks);
+  }
+
+  // We couldn't find a matching PDB, so let it be handled by someone else.
+  return false;
+}
diff --git a/lib/DebugInfo/PDB/Native/PublicsStream.cpp b/lib/DebugInfo/PDB/Native/PublicsStream.cpp
index ddec60a4371bc37862e22b79711cbba742223410..58202577672a31efeea76af54cc1839761a00ce9 100644
--- a/lib/DebugInfo/PDB/Native/PublicsStream.cpp
+++ b/lib/DebugInfo/PDB/Native/PublicsStream.cpp
@@ -27,10 +27,10 @@
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/DebugInfo/CodeView/SymbolRecord.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
 #include "llvm/DebugInfo/PDB/Native/SymbolStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include <algorithm>
@@ -69,7 +69,7 @@ uint32_t PublicsStream::getAddrMap() const { return Header->AddrMap; }
 // we skip over the hash table which we believe contains information about
 // public symbols.
 Error PublicsStream::reload() {
-  StreamReader Reader(*Stream);
+  BinaryStreamReader Reader(*Stream);
 
   // Check stream size.
   if (Reader.bytesRemaining() < sizeof(HeaderInfo) + sizeof(GSIHashHeader))
diff --git a/lib/DebugInfo/PDB/Native/RawError.cpp b/lib/DebugInfo/PDB/Native/RawError.cpp
index aa126bb8f1ad864bde12665256ae041646c1b497..548289fff3dfeee12c4192e025dce10245ece939 100644
--- a/lib/DebugInfo/PDB/Native/RawError.cpp
+++ b/lib/DebugInfo/PDB/Native/RawError.cpp
@@ -38,6 +38,8 @@ public:
       return "The entry does not exist.";
     case raw_error_code::not_writable:
       return "The PDB does not support writing.";
+    case raw_error_code::stream_too_long:
+      return "The stream was longer than expected.";
     case raw_error_code::invalid_tpi_hash:
       return "The Type record has an invalid hash value.";
     }
diff --git a/lib/DebugInfo/PDB/Native/StringTable.cpp b/lib/DebugInfo/PDB/Native/StringTable.cpp
index 5b8ae9b7e9c0f3b5a5c5e8e98070ef0b1e7b00ed..7e28389b838313f5a3a22dc1c309b0a9d401e8d4 100644
--- a/lib/DebugInfo/PDB/Native/StringTable.cpp
+++ b/lib/DebugInfo/PDB/Native/StringTable.cpp
@@ -10,20 +10,21 @@
 #include "llvm/DebugInfo/PDB/Native/StringTable.h"
 
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
 #include "llvm/DebugInfo/PDB/Native/Hash.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Endian.h"
 
 using namespace llvm;
-using namespace llvm::msf;
 using namespace llvm::support;
 using namespace llvm::pdb;
 
-StringTable::StringTable() : Signature(0), HashVersion(0), NameCount(0) {}
+StringTable::StringTable() {}
+
+Error StringTable::load(BinaryStreamReader &Stream) {
+  ByteSize = Stream.getLength();
 
-Error StringTable::load(StreamReader &Stream) {
   const StringTableHeader *H;
   if (auto EC = Stream.readObject(H))
     return EC;
@@ -57,9 +58,18 @@ Error StringTable::load(StreamReader &Stream) {
 
   if (auto EC = Stream.readInteger(NameCount))
     return EC;
+
+  if (Stream.bytesRemaining() > 0)
+    return make_error<RawError>(raw_error_code::stream_too_long,
+      "Unexpected bytes found in string table");
+
   return Error::success();
 }
 
+uint32_t StringTable::getByteSize() const {
+  return ByteSize;
+}
+
 StringRef StringTable::getStringForID(uint32_t ID) const {
   if (ID == IDs[0])
     return StringRef();
@@ -68,9 +78,9 @@ StringRef StringTable::getStringForID(uint32_t ID) const {
   // the starting offset of the string we're looking for.  So just seek into
   // the desired offset and a read a null terminated stream from that offset.
   StringRef Result;
-  StreamReader NameReader(NamesBuffer);
+  BinaryStreamReader NameReader(NamesBuffer);
   NameReader.setOffset(ID);
-  if (auto EC = NameReader.readZeroString(Result))
+  if (auto EC = NameReader.readCString(Result))
     consumeError(std::move(EC));
   return Result;
 }
diff --git a/lib/DebugInfo/PDB/Native/StringTableBuilder.cpp b/lib/DebugInfo/PDB/Native/StringTableBuilder.cpp
index ef9caee4cd6b090f9920a493999918203d540b33..e0f8370ab6087e39ace75162b2da7bdea5e85da0 100644
--- a/lib/DebugInfo/PDB/Native/StringTableBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/StringTableBuilder.cpp
@@ -9,9 +9,9 @@
 
 #include "llvm/DebugInfo/PDB/Native/StringTableBuilder.h"
 #include "llvm/ADT/ArrayRef.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
 #include "llvm/DebugInfo/PDB/Native/Hash.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/BinaryStreamWriter.h"
 #include "llvm/Support/Endian.h"
 
 using namespace llvm;
@@ -52,7 +52,7 @@ uint32_t StringTableBuilder::finalize() {
   return Size;
 }
 
-Error StringTableBuilder::commit(msf::StreamWriter &Writer) const {
+Error StringTableBuilder::commit(BinaryStreamWriter &Writer) const {
   // Write a header
   StringTableHeader H;
   H.Signature = StringTableSignature;
@@ -67,7 +67,7 @@ Error StringTableBuilder::commit(msf::StreamWriter &Writer) const {
     StringRef S = Pair.first;
     uint32_t Offset = Pair.second;
     Writer.setOffset(StringStart + Offset);
-    if (auto EC = Writer.writeZeroString(S))
+    if (auto EC = Writer.writeCString(S))
       return EC;
   }
   Writer.setOffset(StringStart + StringSize);
diff --git a/lib/DebugInfo/PDB/Native/SymbolStream.cpp b/lib/DebugInfo/PDB/Native/SymbolStream.cpp
index a38d0957092c6334a89db452ffbab20cc030c111..9e9ebd11495b292caf40dfb28e53de3af45f0191 100644
--- a/lib/DebugInfo/PDB/Native/SymbolStream.cpp
+++ b/lib/DebugInfo/PDB/Native/SymbolStream.cpp
@@ -12,11 +12,10 @@
 #include "llvm/DebugInfo/CodeView/CodeView.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
-
+#include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Endian.h"
 
 using namespace llvm;
@@ -30,7 +29,7 @@ SymbolStream::SymbolStream(std::unique_ptr<MappedBlockStream> Stream)
 SymbolStream::~SymbolStream() {}
 
 Error SymbolStream::reload() {
-  StreamReader Reader(*Stream);
+  BinaryStreamReader Reader(*Stream);
 
   if (auto EC = Reader.readArray(SymbolRecords, Stream->getLength()))
     return EC;
diff --git a/lib/DebugInfo/PDB/Native/TpiStream.cpp b/lib/DebugInfo/PDB/Native/TpiStream.cpp
index 603a2c5e833999ccfe374527c5464419a5a5b7bd..5fef3edf8c2db45e5b4056ccc05080f5979b0168 100644
--- a/lib/DebugInfo/PDB/Native/TpiStream.cpp
+++ b/lib/DebugInfo/PDB/Native/TpiStream.cpp
@@ -14,12 +14,13 @@
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/PDBTypeServerHandler.h"
 #include "llvm/DebugInfo/PDB/Native/RawConstants.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/DebugInfo/PDB/Native/TpiHashing.h"
+#include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include <algorithm>
@@ -53,7 +54,7 @@ Error TpiStream::verifyHashValues() {
 }
 
 Error TpiStream::reload() {
-  StreamReader Reader(*Stream);
+  BinaryStreamReader Reader(*Stream);
 
   if (Reader.bytesRemaining() < sizeof(TpiStreamHeader))
     return make_error<RawError>(raw_error_code::corrupt_file,
@@ -92,11 +93,12 @@ Error TpiStream::reload() {
 
     auto HS = MappedBlockStream::createIndexedStream(
         Pdb.getMsfLayout(), Pdb.getMsfBuffer(), Header->HashStreamIndex);
-    StreamReader HSR(*HS);
+    BinaryStreamReader HSR(*HS);
 
+    // There should be a hash value for every type record, or no hashes at all.
     uint32_t NumHashValues =
         Header->HashValueBuffer.Length / sizeof(ulittle32_t);
-    if (NumHashValues != NumTypeRecords())
+    if (NumHashValues != NumTypeRecords() && NumHashValues != 0)
       return make_error<RawError>(
           raw_error_code::corrupt_file,
           "TPI hash count does not match with the number of type records.");
@@ -123,8 +125,9 @@ Error TpiStream::reload() {
 
     // TPI hash table is a parallel array for the type records.
     // Verify that the hash values match with type records.
-    if (auto EC = verifyHashValues())
-      return EC;
+    if (NumHashValues > 0)
+      if (auto EC = verifyHashValues())
+        return EC;
   }
 
   return Error::success();
@@ -164,7 +167,7 @@ FixedStreamArray<TypeIndexOffset> TpiStream::getTypeIndexOffsets() const {
 
 HashTable &TpiStream::getHashAdjusters() { return HashAdjusters; }
 
-iterator_range<CVTypeArray::Iterator> TpiStream::types(bool *HadError) const {
+CVTypeRange TpiStream::types(bool *HadError) const {
   return make_range(TypeRecords.begin(HadError), TypeRecords.end());
 }
 
diff --git a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
index cff1977c8c412be2eddd2e69b6ce2d90ca438068..375c35b111455c7002ed499e13ed189a38cba523 100644
--- a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
+++ b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
@@ -12,17 +12,17 @@
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/DebugInfo/CodeView/TypeIndex.h"
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamArray.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/Native/RawError.h"
 #include "llvm/DebugInfo/PDB/Native/RawTypes.h"
 #include "llvm/DebugInfo/PDB/Native/TpiStream.h"
 #include "llvm/Support/Allocator.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include <algorithm>
@@ -43,9 +43,22 @@ void TpiStreamBuilder::setVersionHeader(PdbRaw_TpiVer Version) {
   VerHeader = Version;
 }
 
-void TpiStreamBuilder::addTypeRecord(const codeview::CVType &Record) {
+void TpiStreamBuilder::addTypeRecord(ArrayRef<uint8_t> Record,
+                                     Optional<uint32_t> Hash) {
+  // If we just crossed an 8KB threshold, add a type index offset.
+  size_t NewSize = TypeRecordBytes + Record.size();
+  constexpr size_t EightKB = 8 * 1024;
+  if (NewSize / EightKB > TypeRecordBytes / EightKB || TypeRecords.empty()) {
+    TypeIndexOffsets.push_back(
+        {codeview::TypeIndex(codeview::TypeIndex::FirstNonSimpleIndex +
+                             TypeRecords.size()),
+         ulittle32_t(TypeRecordBytes)});
+  }
+  TypeRecordBytes = NewSize;
+
   TypeRecords.push_back(Record);
-  TypeRecordStream.setItems(TypeRecords);
+  if (Hash)
+    TypeHashes.push_back(*Hash);
 }
 
 Error TpiStreamBuilder::finalize() {
@@ -55,13 +68,12 @@ Error TpiStreamBuilder::finalize() {
   TpiStreamHeader *H = Allocator.Allocate<TpiStreamHeader>();
 
   uint32_t Count = TypeRecords.size();
-  uint32_t HashBufferSize = calculateHashBufferSize();
 
   H->Version = *VerHeader;
   H->HeaderSize = sizeof(TpiStreamHeader);
   H->TypeIndexBegin = codeview::TypeIndex::FirstNonSimpleIndex;
   H->TypeIndexEnd = H->TypeIndexBegin + Count;
-  H->TypeRecordBytes = TypeRecordStream.getLength();
+  H->TypeRecordBytes = TypeRecordBytes;
 
   H->HashStreamIndex = HashStreamIndex;
   H->HashAuxStreamIndex = kInvalidStreamIndex;
@@ -72,24 +84,32 @@ Error TpiStreamBuilder::finalize() {
   // the `HashStreamIndex` field of the `TpiStreamHeader`.  Therefore, the data
   // begins at offset 0 of this independent stream.
   H->HashValueBuffer.Off = 0;
-  H->HashValueBuffer.Length = HashBufferSize;
+  H->HashValueBuffer.Length = calculateHashBufferSize();
+
+  // We never write any adjustments into our PDBs, so this is usually some
+  // offset with zero length.
   H->HashAdjBuffer.Off = H->HashValueBuffer.Off + H->HashValueBuffer.Length;
   H->HashAdjBuffer.Length = 0;
+
   H->IndexOffsetBuffer.Off = H->HashAdjBuffer.Off + H->HashAdjBuffer.Length;
-  H->IndexOffsetBuffer.Length = 0;
+  H->IndexOffsetBuffer.Length = calculateIndexOffsetSize();
 
   Header = H;
   return Error::success();
 }
 
-uint32_t TpiStreamBuilder::calculateSerializedLength() const {
-  return sizeof(TpiStreamHeader) + TypeRecordStream.getLength();
+uint32_t TpiStreamBuilder::calculateSerializedLength() {
+  return sizeof(TpiStreamHeader) + TypeRecordBytes;
 }
 
 uint32_t TpiStreamBuilder::calculateHashBufferSize() const {
-  if (TypeRecords.empty() || !TypeRecords[0].Hash.hasValue())
-    return 0;
-  return TypeRecords.size() * sizeof(ulittle32_t);
+  assert((TypeRecords.size() == TypeHashes.size() || TypeHashes.empty()) &&
+         "either all or no type records should have hashes");
+  return TypeHashes.size() * sizeof(ulittle32_t);
+}
+
+uint32_t TpiStreamBuilder::calculateIndexOffsetSize() const {
+  return TypeIndexOffsets.size() * sizeof(TypeIndexOffset);
 }
 
 Error TpiStreamBuilder::finalizeMsfLayout() {
@@ -97,48 +117,60 @@ Error TpiStreamBuilder::finalizeMsfLayout() {
   if (auto EC = Msf.setStreamSize(Idx, Length))
     return EC;
 
-  uint32_t HashBufferSize = calculateHashBufferSize();
+  uint32_t HashStreamSize =
+      calculateHashBufferSize() + calculateIndexOffsetSize();
 
-  if (HashBufferSize == 0)
+  if (HashStreamSize == 0)
     return Error::success();
 
-  auto ExpectedIndex = Msf.addStream(HashBufferSize);
+  auto ExpectedIndex = Msf.addStream(HashStreamSize);
   if (!ExpectedIndex)
     return ExpectedIndex.takeError();
   HashStreamIndex = *ExpectedIndex;
-  ulittle32_t *H = Allocator.Allocate<ulittle32_t>(TypeRecords.size());
-  MutableArrayRef<ulittle32_t> HashBuffer(H, TypeRecords.size());
-  for (uint32_t I = 0; I < TypeRecords.size(); ++I) {
-    HashBuffer[I] = *TypeRecords[I].Hash % MinTpiHashBuckets;
+  if (!TypeHashes.empty()) {
+    ulittle32_t *H = Allocator.Allocate<ulittle32_t>(TypeHashes.size());
+    MutableArrayRef<ulittle32_t> HashBuffer(H, TypeHashes.size());
+    for (uint32_t I = 0; I < TypeHashes.size(); ++I) {
+      HashBuffer[I] = TypeHashes[I] % MinTpiHashBuckets;
+    }
+    ArrayRef<uint8_t> Bytes(
+        reinterpret_cast<const uint8_t *>(HashBuffer.data()),
+        calculateHashBufferSize());
+    HashValueStream =
+        llvm::make_unique<BinaryByteStream>(Bytes, llvm::support::little);
   }
-  ArrayRef<uint8_t> Bytes(reinterpret_cast<const uint8_t *>(HashBuffer.data()),
-                          HashBufferSize);
-  HashValueStream = llvm::make_unique<ByteStream>(Bytes);
   return Error::success();
 }
 
 Error TpiStreamBuilder::commit(const msf::MSFLayout &Layout,
-                               const msf::WritableStream &Buffer) {
+                               WritableBinaryStreamRef Buffer) {
   if (auto EC = finalize())
     return EC;
 
   auto InfoS =
       WritableMappedBlockStream::createIndexedStream(Layout, Buffer, Idx);
 
-  StreamWriter Writer(*InfoS);
+  BinaryStreamWriter Writer(*InfoS);
   if (auto EC = Writer.writeObject(*Header))
     return EC;
 
-  auto RecordArray = VarStreamArray<codeview::CVType>(TypeRecordStream);
-  if (auto EC = Writer.writeArray(RecordArray))
-    return EC;
+  for (auto Rec : TypeRecords)
+    if (auto EC = Writer.writeBytes(Rec))
+      return EC;
 
   if (HashStreamIndex != kInvalidStreamIndex) {
     auto HVS = WritableMappedBlockStream::createIndexedStream(Layout, Buffer,
                                                               HashStreamIndex);
-    StreamWriter HW(*HVS);
-    if (auto EC = HW.writeStreamRef(*HashValueStream))
-      return EC;
+    BinaryStreamWriter HW(*HVS);
+    if (HashValueStream) {
+      if (auto EC = HW.writeStreamRef(*HashValueStream))
+        return EC;
+    }
+
+    for (auto &IndexOffset : TypeIndexOffsets) {
+      if (auto EC = HW.writeObject(IndexOffset))
+        return EC;
+    }
   }
 
   return Error::success();
diff --git a/lib/DebugInfo/PDB/PDBExtras.cpp b/lib/DebugInfo/PDB/PDBExtras.cpp
index b7eee6e53941d492ad61ebe6ba98b0c82643ca32..dc22a30facab3ace8be855fccb5e48c8030b9f83 100644
--- a/lib/DebugInfo/PDB/PDBExtras.cpp
+++ b/lib/DebugInfo/PDB/PDBExtras.cpp
@@ -10,6 +10,7 @@
 #include "llvm/DebugInfo/PDB/PDBExtras.h"
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/DebugInfo/CodeView/Formatters.h"
 
 using namespace llvm;
 using namespace llvm::pdb;
@@ -259,6 +260,12 @@ raw_ostream &llvm::pdb::operator<<(raw_ostream &OS,
   return OS;
 }
 
+raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_UniqueId &Guid) {
+  codeview::detail::GuidAdapter A(Guid.Guid);
+  A.format(OS, "");
+  return OS;
+}
+
 raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_UdtType &Type) {
   switch (Type) {
     CASE_OUTPUT_ENUM_CLASS_STR(PDB_UdtType, Class, "class", OS)
@@ -269,25 +276,6 @@ raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_UdtType &Type) {
   return OS;
 }
 
-raw_ostream &llvm::pdb::operator<<(raw_ostream &OS, const PDB_UniqueId &Id) {
-  static const char *Lookup = "0123456789ABCDEF";
-
-  static_assert(sizeof(PDB_UniqueId) == 16, "Expected 16-byte GUID");
-  ArrayRef<uint8_t> GuidBytes(reinterpret_cast<const uint8_t*>(&Id), 16);
-  OS << "{";
-  for (int i=0; i < 16;) {
-    uint8_t Byte = GuidBytes[i];
-    uint8_t HighNibble = (Byte >> 4) & 0xF;
-    uint8_t LowNibble = Byte & 0xF;
-    OS << Lookup[HighNibble] << Lookup[LowNibble];
-    ++i;
-    if (i>=4 && i<=10 && i%2==0)
-      OS << "-";
-  }
-  OS << "}";
-  return OS;
-}
-
 raw_ostream &llvm::pdb::operator<<(raw_ostream &OS,
                                    const PDB_Machine &Machine) {
   switch (Machine) {
diff --git a/lib/DebugInfo/PDB/PDBSymbol.cpp b/lib/DebugInfo/PDB/PDBSymbol.cpp
index 633e11aacf129bbb0b01fdc2f109f16cc978d3d5..2c8438f9c23416ecfdc1039b4c698a04ea1b0cec 100644
--- a/lib/DebugInfo/PDB/PDBSymbol.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbol.cpp
@@ -10,6 +10,7 @@
 #include "llvm/DebugInfo/PDB/PDBSymbol.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
 #include "llvm/DebugInfo/PDB/IPDBRawSymbol.h"
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
 #include "llvm/DebugInfo/PDB/PDBExtras.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolAnnotation.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolBlock.h"
@@ -100,7 +101,7 @@ PDBSymbol::create(const IPDBSession &PDBSession,
 }
 
 #define TRY_DUMP_TYPE(Type)                                                    \
-  if (const Type *DerivedThis = dyn_cast<Type>(this))                          \
+  if (const Type *DerivedThis = this->cast<Type>())                            \
     Dumper.dump(OS, Indent, *DerivedThis);
 
 #define ELSE_TRY_DUMP_TYPE(Type, Dumper) else TRY_DUMP_TYPE(Type, Dumper)
@@ -109,6 +110,26 @@ void PDBSymbol::defaultDump(raw_ostream &OS, int Indent) const {
   RawSymbol->dump(OS, Indent);
 }
 
+void PDBSymbol::dumpProperties() const {
+  outs() << "\n";
+  defaultDump(outs(), 0);
+  outs().flush();
+}
+
+void PDBSymbol::dumpChildStats() const {
+  TagStats Stats;
+  getChildStats(Stats);
+  outs() << "\n";
+  for (auto &Stat : Stats) {
+    outs() << Stat.first << ": " << Stat.second << "\n";
+  }
+  outs().flush();
+}
+
+std::unique_ptr<PDBSymbol> PDBSymbol::clone() const {
+  return Session.getSymbolById(getSymIndexId());
+}
+
 PDB_SymType PDBSymbol::getSymTag() const { return RawSymbol->getSymTag(); }
 uint32_t PDBSymbol::getSymIndexId() const { return RawSymbol->getSymIndexId(); }
 
@@ -148,3 +169,7 @@ PDBSymbol::getChildStats(TagStats &Stats) const {
   Result->reset();
   return Result;
 }
+
+std::unique_ptr<PDBSymbol> PDBSymbol::getSymbolByIdHelper(uint32_t Id) const {
+  return Session.getSymbolById(Id);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp b/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp
index cdb167b6191c47d6bf916d59b97d7ff0daeb7d83..3648272e1d0e27f91271ac4e794108817edf87e9 100644
--- a/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolAnnotation.cpp
@@ -18,7 +18,9 @@ using namespace llvm::pdb;
 
 PDBSymbolAnnotation::PDBSymbolAnnotation(const IPDBSession &PDBSession,
                                          std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::Annotation);
+}
 
 void PDBSymbolAnnotation::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolBlock.cpp b/lib/DebugInfo/PDB/PDBSymbolBlock.cpp
index fd5dc9427abf8754510a037dcfa4f35c1745dc7a..7385d3ba1489843fa19897984b7c29d454a722bd 100644
--- a/lib/DebugInfo/PDB/PDBSymbolBlock.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolBlock.cpp
@@ -19,6 +19,8 @@ using namespace llvm::pdb;
 
 PDBSymbolBlock::PDBSymbolBlock(const IPDBSession &PDBSession,
                                std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::Block);
+}
 
 void PDBSymbolBlock::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp b/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
index ebff08846cacea616fbdcbf8151d1e2c0776a9a8..854cf42d1baeeb2bb1a251eec00907077f433040 100644
--- a/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolCompiland.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
 
 PDBSymbolCompiland::PDBSymbolCompiland(const IPDBSession &PDBSession,
                                        std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::Compiland);
+}
 
 void PDBSymbolCompiland::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp b/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
index 6dbd5228f2cd9de9c2061206c33f93fcc3321731..e08450e0ad0c4d6f9d233e68b3c8f30df9aa95ab 100644
--- a/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolCompilandDetails.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
 
 PDBSymbolCompilandDetails::PDBSymbolCompilandDetails(
     const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::CompilandDetails);
+}
 
 void PDBSymbolCompilandDetails::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp b/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
index 9c7f0b1be56f461490fcee1d74fd3cd26f04425a..2f1c43666ae547dfff62c81b0099b227d22cf7f1 100644
--- a/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolCompilandEnv.cpp
@@ -20,7 +20,9 @@ using namespace llvm::pdb;
 
 PDBSymbolCompilandEnv::PDBSymbolCompilandEnv(
     const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::CompilandEnv);
+}
 
 std::string PDBSymbolCompilandEnv::getValue() const {
   Variant Value = RawSymbol->getValue();
diff --git a/lib/DebugInfo/PDB/PDBSymbolCustom.cpp b/lib/DebugInfo/PDB/PDBSymbolCustom.cpp
index 0ea387a0eabb3639708e03ae8f6e3b80f5a07fd3..9ec20bb62d758a40a7c36985b4617ce4644cc80f 100644
--- a/lib/DebugInfo/PDB/PDBSymbolCustom.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolCustom.cpp
@@ -20,7 +20,9 @@ using namespace llvm::pdb;
 
 PDBSymbolCustom::PDBSymbolCustom(const IPDBSession &PDBSession,
                                  std::unique_ptr<IPDBRawSymbol> CustomSymbol)
-    : PDBSymbol(PDBSession, std::move(CustomSymbol)) {}
+    : PDBSymbol(PDBSession, std::move(CustomSymbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::Custom);
+}
 
 void PDBSymbolCustom::getDataBytes(llvm::SmallVector<uint8_t, 32> &bytes) {
   RawSymbol->getDataBytes(bytes);
diff --git a/lib/DebugInfo/PDB/PDBSymbolData.cpp b/lib/DebugInfo/PDB/PDBSymbolData.cpp
index 62bb6f3f41e224238598c8d4bfdbaeafbcea41a2..60026689c6f1b2f9bd8bf0b98178efa237f1ac52 100644
--- a/lib/DebugInfo/PDB/PDBSymbolData.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolData.cpp
@@ -19,10 +19,8 @@ using namespace llvm::pdb;
 
 PDBSymbolData::PDBSymbolData(const IPDBSession &PDBSession,
                              std::unique_ptr<IPDBRawSymbol> DataSymbol)
-    : PDBSymbol(PDBSession, std::move(DataSymbol)) {}
-
-std::unique_ptr<PDBSymbol> PDBSymbolData::getType() const {
-  return Session.getSymbolById(getTypeId());
+    : PDBSymbol(PDBSession, std::move(DataSymbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::Data);
 }
 
 void PDBSymbolData::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/lib/DebugInfo/PDB/PDBSymbolExe.cpp b/lib/DebugInfo/PDB/PDBSymbolExe.cpp
index 60101c168a793b56ad952ae1482bfebbeccc8caf..b9fcac78c36d721d4725e55bc967cd8817c6904a 100644
--- a/lib/DebugInfo/PDB/PDBSymbolExe.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolExe.cpp
@@ -18,6 +18,8 @@ using namespace llvm::pdb;
 
 PDBSymbolExe::PDBSymbolExe(const IPDBSession &PDBSession,
                            std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::Exe);
+}
 
 void PDBSymbolExe::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/lib/DebugInfo/PDB/PDBSymbolFunc.cpp b/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
index 35251c0cc1c1e60f89ca4d1035ddf6450bbf77d6..3c0bd25ed096c18c2d889eaf7cde7d31ebe67377 100644
--- a/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolFunc.cpp
@@ -85,10 +85,8 @@ private:
 
 PDBSymbolFunc::PDBSymbolFunc(const IPDBSession &PDBSession,
                              std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
-
-std::unique_ptr<PDBSymbolTypeFunctionSig> PDBSymbolFunc::getSignature() const {
-  return Session.getConcreteSymbolById<PDBSymbolTypeFunctionSig>(getTypeId());
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::Function);
 }
 
 std::unique_ptr<IPDBEnumChildren<PDBSymbolData>>
@@ -96,8 +94,4 @@ PDBSymbolFunc::getArguments() const {
   return llvm::make_unique<FunctionArgEnumerator>(Session, *this);
 }
 
-std::unique_ptr<PDBSymbolTypeUDT> PDBSymbolFunc::getClassParent() const {
-  return Session.getConcreteSymbolById<PDBSymbolTypeUDT>(getClassParentId());
-}
-
 void PDBSymbolFunc::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp b/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp
index 77e996f651df8101989d1310d3389656e3e9c0aa..482c95e3a8509a590d6a4496b245a56cb3214dbd 100644
--- a/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolFuncDebugEnd.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
 
 PDBSymbolFuncDebugEnd::PDBSymbolFuncDebugEnd(
     const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::FuncDebugEnd);
+}
 
 void PDBSymbolFuncDebugEnd::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp b/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp
index 9c653879176b0b647806f56d28c5c09ff046354e..ae23c7619e2aa3986b680d267fd3d2bd30664e12 100644
--- a/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolFuncDebugStart.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
 
 PDBSymbolFuncDebugStart::PDBSymbolFuncDebugStart(
     const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::FuncDebugStart);
+}
 
 void PDBSymbolFuncDebugStart::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolLabel.cpp b/lib/DebugInfo/PDB/PDBSymbolLabel.cpp
index d2cfd11c35e4a712aba7d44d1c09df63211ede5a..a67a20d8e35203cc7f7a13805ff13eeff066bf3b 100644
--- a/lib/DebugInfo/PDB/PDBSymbolLabel.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolLabel.cpp
@@ -18,6 +18,8 @@ using namespace llvm::pdb;
 
 PDBSymbolLabel::PDBSymbolLabel(const IPDBSession &PDBSession,
                                std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::Label);
+}
 
 void PDBSymbolLabel::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp b/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp
index 97d668740818493de9cdb7b8909798e471e3a91a..87bb4044216b0c2263de5128932d70de7d4959db 100644
--- a/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolPublicSymbol.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
 
 PDBSymbolPublicSymbol::PDBSymbolPublicSymbol(
     const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::PublicSymbol);
+}
 
 void PDBSymbolPublicSymbol::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolThunk.cpp b/lib/DebugInfo/PDB/PDBSymbolThunk.cpp
index ef8897d12af41f47d1801ab351913867f7062d43..b2648197f9ccc9596e51fd08af63fdaa6a7c54fa 100644
--- a/lib/DebugInfo/PDB/PDBSymbolThunk.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolThunk.cpp
@@ -18,6 +18,8 @@ using namespace llvm::pdb;
 
 PDBSymbolThunk::PDBSymbolThunk(const IPDBSession &PDBSession,
                                std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::Thunk);
+}
 
 void PDBSymbolThunk::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp
index c010cc5d7678fe2159355613856647a0be0fd150..a8054a42d8660eb962c6048ba65dca4f9279a07c 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeArray.cpp
@@ -19,12 +19,14 @@ using namespace llvm::pdb;
 
 PDBSymbolTypeArray::PDBSymbolTypeArray(const IPDBSession &PDBSession,
                                        std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
-
-std::unique_ptr<PDBSymbol> PDBSymbolTypeArray::getElementType() const {
-  return Session.getSymbolById(getTypeId());
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::ArrayType);
 }
 
 void PDBSymbolTypeArray::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
 }
+
+void PDBSymbolTypeArray::dumpRight(PDBSymDumper &Dumper) const {
+  Dumper.dumpRight(*this);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp
index 382c397b24d2fc02f2747ca605324cec88d36d43..0ee18d47162493ab09fa0ae325fb76cad9c026a7 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeBaseClass.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
 
 PDBSymbolTypeBaseClass::PDBSymbolTypeBaseClass(
     const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::BaseClass);
+}
 
 void PDBSymbolTypeBaseClass::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
index e5d65bf5d1fd5e2e1f7a9794256f4902a8f65614..0bf563af7df5ac21f21fb838b80df7415814f475 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeBuiltin.cpp
@@ -18,7 +18,9 @@ using namespace llvm::pdb;
 
 PDBSymbolTypeBuiltin::PDBSymbolTypeBuiltin(
     const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::BuiltinType);
+}
 
 void PDBSymbolTypeBuiltin::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp
index 1d80c97f9ede2a2918461c0e1872d077eeb18ff0..f617d8d0c2df549ed43898f33713a3d2dbb01401 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeCustom.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
 
 PDBSymbolTypeCustom::PDBSymbolTypeCustom(const IPDBSession &PDBSession,
                                          std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::CustomType);
+}
 
 void PDBSymbolTypeCustom::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp
index 535d97dcd21e70dceb166dd1872ab15369928ae5..68ba87c1cdf8dc4ed38c05259659b1e6b29ef888 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeDimension.cpp
@@ -20,7 +20,9 @@ using namespace llvm::pdb;
 
 PDBSymbolTypeDimension::PDBSymbolTypeDimension(
     const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::Dimension);
+}
 
 void PDBSymbolTypeDimension::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp
index 788f2b732aaa134f5b86bad1079fe6155d6ba234..2addea072c8851fa6e6a603b085ff6fbcff5efc3 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeEnum.cpp
@@ -21,15 +21,8 @@ using namespace llvm::pdb;
 
 PDBSymbolTypeEnum::PDBSymbolTypeEnum(const IPDBSession &PDBSession,
                                      std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
-
-std::unique_ptr<PDBSymbolTypeUDT> PDBSymbolTypeEnum::getClassParent() const {
-  return Session.getConcreteSymbolById<PDBSymbolTypeUDT>(getClassParentId());
-}
-
-std::unique_ptr<PDBSymbolTypeBuiltin>
-PDBSymbolTypeEnum::getUnderlyingType() const {
-  return Session.getConcreteSymbolById<PDBSymbolTypeBuiltin>(getTypeId());
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::Enum);
 }
 
 void PDBSymbolTypeEnum::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp
index 5831baebb993c58c96078c1a276a67f65fc56a23..ec27985e91d1701ba1a49fd22c2c1287a726cf60 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeFriend.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
 
 PDBSymbolTypeFriend::PDBSymbolTypeFriend(const IPDBSession &PDBSession,
                                          std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::Friend);
+}
 
 void PDBSymbolTypeFriend::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp
index c6f586db9e57bc4a2481d10fa6fd65ba65686d0f..4d5cd63f6857955ec552dd965fec80fe3ff53dbf 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeFunctionArg.cpp
@@ -18,7 +18,9 @@ using namespace llvm::pdb;
 
 PDBSymbolTypeFunctionArg::PDBSymbolTypeFunctionArg(
     const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::FunctionArg);
+}
 
 void PDBSymbolTypeFunctionArg::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp
index 057ae260885f3ecfb44dcf3d3d4b571d71ec49cf..473529d1b04321fc9917a14bcb0d5c78a5e71f17 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeFunctionSig.cpp
@@ -68,10 +68,8 @@ private:
 
 PDBSymbolTypeFunctionSig::PDBSymbolTypeFunctionSig(
     const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
-
-std::unique_ptr<PDBSymbol> PDBSymbolTypeFunctionSig::getReturnType() const {
-  return Session.getSymbolById(getTypeId());
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::FunctionSig);
 }
 
 std::unique_ptr<IPDBEnumSymbols>
@@ -79,13 +77,10 @@ PDBSymbolTypeFunctionSig::getArguments() const {
   return llvm::make_unique<FunctionArgEnumerator>(Session, *this);
 }
 
-std::unique_ptr<PDBSymbol> PDBSymbolTypeFunctionSig::getClassParent() const {
-  uint32_t ClassId = getClassParentId();
-  if (ClassId == 0)
-    return nullptr;
-  return Session.getSymbolById(ClassId);
-}
-
 void PDBSymbolTypeFunctionSig::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
 }
+
+void PDBSymbolTypeFunctionSig::dumpRight(PDBSymDumper &Dumper) const {
+  Dumper.dumpRight(*this);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp
index 072d2cfd42fbfc08b6f0bd31d978e6113ca7749a..86e0ec4f8565769bb361d7558d70492587300a8d 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeManaged.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
 
 PDBSymbolTypeManaged::PDBSymbolTypeManaged(
     const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::ManagedType);
+}
 
 void PDBSymbolTypeManaged::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp b/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp
index 699771450a5d8cf34a647cf410e3fcc43beae11c..69819811d61fcf3446b7afdcc799c51b54a58871 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypePointer.cpp
@@ -19,12 +19,14 @@ using namespace llvm::pdb;
 
 PDBSymbolTypePointer::PDBSymbolTypePointer(
     const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
-
-std::unique_ptr<PDBSymbol> PDBSymbolTypePointer::getPointeeType() const {
-  return Session.getSymbolById(getTypeId());
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::PointerType);
 }
 
 void PDBSymbolTypePointer::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
 }
+
+void PDBSymbolTypePointer::dumpRight(PDBSymDumper &Dumper) const {
+  Dumper.dumpRight(*this);
+}
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp
index 0f283b9e21a4f196ff37ec1b74a1d8841913d231..102b540e0fef1acf95db39e6e99a113a927532b2 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeTypedef.cpp
@@ -18,7 +18,9 @@ using namespace llvm::pdb;
 
 PDBSymbolTypeTypedef::PDBSymbolTypeTypedef(
     const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::Typedef);
+}
 
 void PDBSymbolTypeTypedef::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp
index c71838cc7a6f96b8694ed0db6d1a66bd26c45bbc..4a9a9ed5fda8e82c5648c8102fb09e18a382a632 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeUDT.cpp
@@ -18,6 +18,8 @@ using namespace llvm::pdb;
 
 PDBSymbolTypeUDT::PDBSymbolTypeUDT(const IPDBSession &PDBSession,
                                    std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::UDT);
+}
 
 void PDBSymbolTypeUDT::dump(PDBSymDumper &Dumper) const { Dumper.dump(*this); }
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp
index 6b76db5912ce8a753924eb5ee7e6b7cf1920be8d..9a21855f57f05e599910ea772f2f0bd0a50ce5a9 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeVTable.cpp
@@ -18,7 +18,9 @@ using namespace llvm::pdb;
 
 PDBSymbolTypeVTable::PDBSymbolTypeVTable(const IPDBSession &PDBSession,
                                          std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::VTable);
+}
 
 void PDBSymbolTypeVTable::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp b/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp
index ef509d64bf6017e68e908f9f8290e999c874557d..a516a4d2c42999f9969fd6668f7b8f8c243e3531 100644
--- a/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolTypeVTableShape.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
 
 PDBSymbolTypeVTableShape::PDBSymbolTypeVTableShape(
     const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::VTableShape);
+}
 
 void PDBSymbolTypeVTableShape::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
diff --git a/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp b/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp
index 6a62d554f42ce24878cf467408abbd7e949125eb..020aec9e98a88161ba541cbdc91e4ca193a02293 100644
--- a/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp
+++ b/lib/DebugInfo/PDB/PDBSymbolUsingNamespace.cpp
@@ -19,7 +19,9 @@ using namespace llvm::pdb;
 
 PDBSymbolUsingNamespace::PDBSymbolUsingNamespace(
     const IPDBSession &PDBSession, std::unique_ptr<IPDBRawSymbol> Symbol)
-    : PDBSymbol(PDBSession, std::move(Symbol)) {}
+    : PDBSymbol(PDBSession, std::move(Symbol)) {
+  assert(RawSymbol->getSymTag() == PDB_SymType::UsingNamespace);
+}
 
 void PDBSymbolUsingNamespace::dump(PDBSymDumper &Dumper) const {
   Dumper.dump(*this);
diff --git a/lib/DebugInfo/Symbolize/DIPrinter.cpp b/lib/DebugInfo/Symbolize/DIPrinter.cpp
index be5c603a38ef33da7cf2747d7387610bfb35b3c4..c1e2536d6e209eb0e54c140635dd2c321505ad25 100644
--- a/lib/DebugInfo/Symbolize/DIPrinter.cpp
+++ b/lib/DebugInfo/Symbolize/DIPrinter.cpp
@@ -78,8 +78,20 @@ void DIPrinter::print(const DILineInfo &Info, bool Inlined) {
   std::string Filename = Info.FileName;
   if (Filename == kDILineInfoBadString)
     Filename = kBadString;
-  OS << Filename << ":" << Info.Line << ":" << Info.Column << "\n";
-  printContext(Filename, Info.Line);
+  // Compact form: one "file:line:column" line, then whatever printContext()
+  // emits for that location.
+  if (!Verbose) {
+    OS << Filename << ":" << Info.Line << ":" << Info.Column << "\n";
+    printContext(Filename, Info.Line);
+    return;
+  }
+  // Verbose form: one labelled field per line, each indented by two spaces.
+  // StartLine and Discriminator are only printed when they are non-zero.
+  OS << "  Filename: " << Filename << "\n";
+  if (Info.StartLine)
+    OS << "  Function start line: " << Info.StartLine << "\n";
+  OS << "  Line: " << Info.Line << "\n";
+  OS << "  Column: " << Info.Column << "\n";
+  if (Info.Discriminator)
+    OS << "  Discriminator: " << Info.Discriminator << "\n";
 }
 
 DIPrinter &DIPrinter::operator<<(const DILineInfo &Info) {
diff --git a/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp b/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
index f6940080089f7f2e783e3109a28af292784339f9..f672680cb9ea7c82cb38086166e4fed878a07a3a 100644
--- a/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
+++ b/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
@@ -1,4 +1,4 @@
-//===-- SymbolizableObjectFile.cpp ----------------------------------------===//
+//===- SymbolizableObjectFile.cpp -----------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,15 +12,29 @@
 //===----------------------------------------------------------------------===//
 
 #include "SymbolizableObjectFile.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
 #include "llvm/Object/COFF.h"
+#include "llvm/Object/ObjectFile.h"
 #include "llvm/Object/SymbolSize.h"
+#include "llvm/Support/COFF.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/DataExtractor.h"
-#include "llvm/DebugInfo/DWARF/DWARFContext.h"
-
-namespace llvm {
-namespace symbolize {
+#include "llvm/Support/Error.h"
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <system_error>
+#include <utility>
+#include <vector>
 
+using namespace llvm;
 using namespace object;
+using namespace symbolize;
 
 static DILineInfoSpecifier
 getDILineInfoSpecifier(FunctionNameKind FNKind) {
@@ -73,14 +87,17 @@ SymbolizableObjectFile::SymbolizableObjectFile(ObjectFile *Obj,
     : Module(Obj), DebugInfoContext(std::move(DICtx)) {}
 
 namespace {
+
 struct OffsetNamePair {
   uint32_t Offset;
   StringRef Name;
+
   bool operator<(const OffsetNamePair &R) const {
     return Offset < R.Offset;
   }
 };
-}
+
+} // end anonymous namespace
 
 std::error_code SymbolizableObjectFile::addCoffExportSymbols(
     const COFFObjectFile *CoffObj) {
@@ -147,7 +164,7 @@ std::error_code SymbolizableObjectFile::addSymbol(const SymbolRef &Symbol,
     return errorToErrorCode(SymbolNameOrErr.takeError());
   StringRef SymbolName = *SymbolNameOrErr;
   // Mach-O symbol table names have leading underscore, skip it.
-  if (Module->isMachO() && SymbolName.size() > 0 && SymbolName[0] == '_')
+  if (Module->isMachO() && !SymbolName.empty() && SymbolName[0] == '_')
     SymbolName = SymbolName.drop_front();
   // FIXME: If a function has alias, there are two entries in symbol table
   // with same address size. Make sure we choose the correct one.
@@ -252,7 +269,3 @@ DIGlobal SymbolizableObjectFile::symbolizeData(uint64_t ModuleOffset) const {
                          Res.Size);
   return Res;
 }
-
-}  // namespace symbolize
-}  // namespace llvm
-
diff --git a/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h b/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h
index 8583b6a36e630227e3e878d5728e66d777e60f4d..216cca8de4f5d4b184c43f416c457f4b1000ec4f 100644
--- a/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h
+++ b/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h
@@ -1,4 +1,4 @@
-//===-- SymbolizableObjectFile.h -------------------------------- C++ -----===//
+//===- SymbolizableObjectFile.h ---------------------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -13,14 +13,20 @@
 #ifndef LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H
 #define LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H
 
+#include "llvm/ADT/StringRef.h"
+#include "llvm/DebugInfo/DIContext.h"
 #include "llvm/DebugInfo/Symbolize/SymbolizableModule.h"
+#include "llvm/Support/ErrorOr.h"
+#include <cstdint>
 #include <map>
+#include <memory>
+#include <string>
+#include <system_error>
 
 namespace llvm {
+
 class DataExtractor;
-}
 
-namespace llvm {
 namespace symbolize {
 
 class SymbolizableObjectFile : public SymbolizableModule {
@@ -65,6 +71,7 @@ private:
     // If size is 0, assume that symbol occupies the whole memory range up to
     // the following symbol.
     uint64_t Size;
+
     friend bool operator<(const SymbolDesc &s1, const SymbolDesc &s2) {
       return s1.Addr < s2.Addr;
     }
@@ -76,7 +83,8 @@ private:
                          std::unique_ptr<DIContext> DICtx);
 };
 
-}  // namespace symbolize
-}  // namespace llvm
+} // end namespace symbolize
+
+} // end namespace llvm
 
-#endif  // LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H
+#endif // LLVM_LIB_DEBUGINFO_SYMBOLIZE_SYMBOLIZABLEOBJECTFILE_H
diff --git a/lib/Demangle/ItaniumDemangle.cpp b/lib/Demangle/ItaniumDemangle.cpp
index 5a0352c539038cf256187b2346986485144a7561..49dbe74d25df8527f238d7d8ea6fef8f4c78c4a2 100644
--- a/lib/Demangle/ItaniumDemangle.cpp
+++ b/lib/Demangle/ItaniumDemangle.cpp
@@ -3836,6 +3836,8 @@ static const char *parse_call_offset(const char *first, const char *last) {
 //                ::= GV <object name> # Guard variable for one-time
 //                initialization
 //                                     # No <type>
+//                ::= TW <object name> # Thread-local wrapper
+//                ::= TH <object name> # Thread-local initialization
 //      extension ::= TC <first type> <number> _ <second type> # construction
 //      vtable for second-in-first
 //      extension ::= GR <object name> # reference temporary for object
@@ -3929,6 +3931,27 @@ static const char *parse_special_name(const char *first, const char *last,
           }
         }
         break;
+      case 'W':
+        // TW <object name> # Thread-local wrapper
+        t = parse_name(first + 2, last, db);
+        if (t != first + 2) {
+          if (db.names.empty())
+            return first;
+          db.names.back().first.insert(0, "thread-local wrapper routine for ");
+          first = t;
+        }
+        break;
+      case 'H':
+        // TH <object name> # Thread-local initialization
+        t = parse_name(first + 2, last, db);
+        if (t != first + 2) {
+          if (db.names.empty())
+            return first;
+          db.names.back().first.insert(
+              0, "thread-local initialization routine for ");
+          first = t;
+        }
+        break;
       default:
         // T <call-offset> <base encoding>
         {
diff --git a/lib/ExecutionEngine/ExecutionEngine.cpp b/lib/ExecutionEngine/ExecutionEngine.cpp
index b4bed325f491523ff883a6ab1ada656718e02e7e..2ee72f9a8c16ac76abd9fcf2e8e95270349b7818 100644
--- a/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -515,7 +515,7 @@ ExecutionEngine *EngineBuilder::create(TargetMachine *TM) {
   // to the function tells DynamicLibrary to load the program, not a library.
   if (sys::DynamicLibrary::LoadLibraryPermanently(nullptr, ErrorStr))
     return nullptr;
-  
+
   // If the user specified a memory manager but didn't specify which engine to
   // create, we assume they only want the JIT, and we fail if they only want
   // the interpreter.
@@ -616,7 +616,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
         for (unsigned int i = 0; i < elemNum; ++i) {
           Type *ElemTy = STy->getElementType(i);
           if (ElemTy->isIntegerTy())
-            Result.AggregateVal[i].IntVal = 
+            Result.AggregateVal[i].IntVal =
               APInt(ElemTy->getPrimitiveSizeInBits(), 0);
           else if (ElemTy->isAggregateType()) {
               const Constant *ElemUndef = UndefValue::get(ElemTy);
@@ -727,7 +727,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
         APFloat apf = APFloat(APFloat::x87DoubleExtended(), GV.IntVal);
         uint64_t v;
         bool ignored;
-        (void)apf.convertToInteger(&v, BitWidth,
+        (void)apf.convertToInteger(makeMutableArrayRef(v), BitWidth,
                                    CE->getOpcode()==Instruction::FPToSI,
                                    APFloat::rmTowardZero, &ignored);
         GV.IntVal = v; // endian?
@@ -979,7 +979,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
     // Check if vector holds integers.
     if (ElemTy->isIntegerTy()) {
       if (CAZ) {
-        GenericValue intZero;     
+        GenericValue intZero;
         intZero.IntVal = APInt(ElemTy->getScalarSizeInBits(), 0ull);
         std::fill(Result.AggregateVal.begin(), Result.AggregateVal.end(),
                   intZero);
@@ -1079,7 +1079,7 @@ void ExecutionEngine::StoreValueToMemory(const GenericValue &Val,
         *(((float*)Ptr)+i) = Val.AggregateVal[i].FloatVal;
       if (cast<VectorType>(Ty)->getElementType()->isIntegerTy()) {
         unsigned numOfBytes =(Val.AggregateVal[i].IntVal.getBitWidth()+7)/8;
-        StoreIntToMemory(Val.AggregateVal[i].IntVal, 
+        StoreIntToMemory(Val.AggregateVal[i].IntVal,
           (uint8_t*)Ptr + numOfBytes*i, numOfBytes);
       }
     }
@@ -1186,7 +1186,7 @@ void ExecutionEngine::InitializeMemory(const Constant *Init, void *Addr) {
   DEBUG(Init->dump());
   if (isa<UndefValue>(Init))
     return;
-  
+
   if (const ConstantVector *CP = dyn_cast<ConstantVector>(Init)) {
     unsigned ElementSize =
         getDataLayout().getTypeAllocSize(CP->getType()->getElementType());
@@ -1194,12 +1194,12 @@ void ExecutionEngine::InitializeMemory(const Constant *Init, void *Addr) {
       InitializeMemory(CP->getOperand(i), (char*)Addr+i*ElementSize);
     return;
   }
-  
+
   if (isa<ConstantAggregateZero>(Init)) {
     memset(Addr, 0, (size_t)getDataLayout().getTypeAllocSize(Init->getType()));
     return;
   }
-  
+
   if (const ConstantArray *CPA = dyn_cast<ConstantArray>(Init)) {
     unsigned ElementSize =
         getDataLayout().getTypeAllocSize(CPA->getType()->getElementType());
@@ -1207,7 +1207,7 @@ void ExecutionEngine::InitializeMemory(const Constant *Init, void *Addr) {
       InitializeMemory(CPA->getOperand(i), (char*)Addr+i*ElementSize);
     return;
   }
-  
+
   if (const ConstantStruct *CPS = dyn_cast<ConstantStruct>(Init)) {
     const StructLayout *SL =
         getDataLayout().getStructLayout(cast<StructType>(CPS->getType()));
diff --git a/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/lib/ExecutionEngine/ExecutionEngineBindings.cpp
index 1d7c6e714ed0c894051db6ebda66e80d3885488e..e956dbebaffe77ac8aca9b048e9f05c3d2f98001 100644
--- a/lib/ExecutionEngine/ExecutionEngineBindings.cpp
+++ b/lib/ExecutionEngine/ExecutionEngineBindings.cpp
@@ -188,7 +188,7 @@ LLVMBool LLVMCreateMCJITCompilerForModule(
     for (auto &F : *Mod) {
       auto Attrs = F.getAttributes();
       StringRef Value(options.NoFramePointerElim ? "true" : "false");
-      Attrs = Attrs.addAttribute(F.getContext(), AttributeSet::FunctionIndex,
+      Attrs = Attrs.addAttribute(F.getContext(), AttributeList::FunctionIndex,
                                  "no-frame-pointer-elim", Value);
       F.setAttributes(Attrs);
     }
diff --git a/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt b/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt
index 3b8c4b973e689dbc4eb8d25216c689af4e6a4869..e6c33b2ecc2a06b4d0ae0645cbae8c1ac42cc1d4 100644
--- a/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt
+++ b/lib/ExecutionEngine/IntelJITEvents/CMakeLists.txt
@@ -4,7 +4,7 @@ if( HAVE_LIBDL )
     set(LLVM_INTEL_JIT_LIBS ${CMAKE_DL_LIBS})
 endif()
 
-set(LLVM_INTEL_JIT_LIBS ${PTHREAD_LIB} ${LLVM_INTEL_JIT_LIBS})
+set(LLVM_INTEL_JIT_LIBS ${LLVM_PTHREAD_LIB} ${LLVM_INTEL_JIT_LIBS})
 
 
 add_llvm_library(LLVMIntelJITEvents
diff --git a/lib/ExecutionEngine/Interpreter/Execution.cpp b/lib/ExecutionEngine/Interpreter/Execution.cpp
index 923f6e7147db1f1ca4ac4487283b6d87f679fe26..e29e9fc2c702e1e6985b711b4f5631cbd0619772 100644
--- a/lib/ExecutionEngine/Interpreter/Execution.cpp
+++ b/lib/ExecutionEngine/Interpreter/Execution.cpp
@@ -899,10 +899,10 @@ void Interpreter::visitSwitchInst(SwitchInst &I) {
 
   // Check to see if any of the cases match...
   BasicBlock *Dest = nullptr;
-  for (SwitchInst::CaseIt i = I.case_begin(), e = I.case_end(); i != e; ++i) {
-    GenericValue CaseVal = getOperandValue(i.getCaseValue(), SF);
+  for (auto Case : I.cases()) {
+    GenericValue CaseVal = getOperandValue(Case.getCaseValue(), SF);
     if (executeICMP_EQ(CondVal, CaseVal, ElTy).IntVal != 0) {
-      Dest = cast<BasicBlock>(i.getCaseSuccessor());
+      Dest = cast<BasicBlock>(Case.getCaseSuccessor());
       break;
     }
   }
diff --git a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
index a74fae775ac4f72d379d501a3d0722021862446e..a79dd844bf4f7f85d57ac683b866a1fc41aff38d 100644
--- a/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
+++ b/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
@@ -16,7 +16,7 @@
 #include "llvm/ExecutionEngine/Orc/CompileUtils.h"
 #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
-#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/Support/Error.h"
 
@@ -30,7 +30,7 @@ DEFINE_SIMPLE_CONVERSION_FUNCTIONS(TargetMachine, LLVMTargetMachineRef)
 class OrcCBindingsStack {
 public:
   typedef orc::JITCompileCallbackManager CompileCallbackMgr;
-  typedef orc::ObjectLinkingLayer<> ObjLayerT;
+  typedef orc::RTDyldObjectLinkingLayer<> ObjLayerT;
   typedef orc::IRCompileLayer<ObjLayerT> CompileLayerT;
   typedef orc::CompileOnDemandLayer<CompileLayerT, CompileCallbackMgr>
       CODLayerT;
diff --git a/lib/ExecutionEngine/Orc/OrcError.cpp b/lib/ExecutionEngine/Orc/OrcError.cpp
index eaa75ad06a2ef504cd6d4868a2b771a668d2e101..dcbbf5f2ae7000f64bc2a39c0cb865036ae654d6 100644
--- a/lib/ExecutionEngine/Orc/OrcError.cpp
+++ b/lib/ExecutionEngine/Orc/OrcError.cpp
@@ -60,19 +60,16 @@ namespace orc {
 
 char RPCFunctionNotSupported::ID = 0;
 
-Error orcError(OrcErrorCode ErrCode) {
+std::error_code orcError(OrcErrorCode ErrCode) {
   typedef std::underlying_type<OrcErrorCode>::type UT;
-  return errorCodeToError(
-      std::error_code(static_cast<UT>(ErrCode), *OrcErrCat));
+  return std::error_code(static_cast<UT>(ErrCode), *OrcErrCat);
 }
 
 RPCFunctionNotSupported::RPCFunctionNotSupported(std::string RPCFunctionSignature)
   : RPCFunctionSignature(std::move(RPCFunctionSignature)) {}
 
 std::error_code RPCFunctionNotSupported::convertToErrorCode() const {
-  typedef std::underlying_type<OrcErrorCode>::type UT;
-  return std::error_code(static_cast<UT>(OrcErrorCode::UnknownRPCFunction),
-                         *OrcErrCat);
+  return orcError(OrcErrorCode::UnknownRPCFunction);
 }
 
 void RPCFunctionNotSupported::log(raw_ostream &OS) const {
diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
index af70960a1f9273a21dc44df599eccc54188308bc..a5100a56bcf1cb0bd3f7990d0b4831bc939eebe1 100644
--- a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
+++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
@@ -24,7 +24,7 @@
 #include "llvm/ExecutionEngine/Orc/CompileUtils.h"
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "llvm/ExecutionEngine/Orc/LazyEmittingLayer.h"
-#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Mangler.h"
 #include "llvm/Object/Archive.h"
@@ -315,7 +315,7 @@ private:
     NotifyObjectLoadedT(OrcMCJITReplacement &M) : M(M) {}
 
     template <typename ObjListT>
-    void operator()(ObjectLinkingLayerBase::ObjSetHandleT H,
+    void operator()(RTDyldObjectLinkingLayerBase::ObjSetHandleT H,
                     const ObjListT &Objects,
                     const LoadedObjInfoListT &Infos) const {
       M.UnfinalizedSections[H] = std::move(M.SectionsAllocatedSinceLastLoad);
@@ -344,7 +344,7 @@ private:
   public:
     NotifyFinalizedT(OrcMCJITReplacement &M) : M(M) {}
 
-    void operator()(ObjectLinkingLayerBase::ObjSetHandleT H) {
+    void operator()(RTDyldObjectLinkingLayerBase::ObjSetHandleT H) {
       M.UnfinalizedSections.erase(H);
     }
 
@@ -361,7 +361,7 @@ private:
     return MangledName;
   }
 
-  typedef ObjectLinkingLayer<NotifyObjectLoadedT> ObjectLayerT;
+  typedef RTDyldObjectLinkingLayer<NotifyObjectLoadedT> ObjectLayerT;
   typedef IRCompileLayer<ObjectLayerT> CompileLayerT;
   typedef LazyEmittingLayer<CompileLayerT> LazyEmitLayerT;
 
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index 63b56f7252093e5fa437c7993f2e500df2eec46c..df9d2ceba3292ced81ad350133a7d84ca79674ea 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -443,7 +443,7 @@ Error RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj,
        SI != SE; ++SI) {
     const SectionRef &Section = *SI;
 
-    bool IsRequired = isRequiredForExecution(Section);
+    bool IsRequired = isRequiredForExecution(Section) || ProcessAllSections;
 
     // Consider only the sections that are required to be loaded for execution
     if (IsRequired) {
@@ -484,6 +484,14 @@ Error RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj,
     }
   }
 
+  // Compute the Global Offset Table size. If it is not zero, also
+  // raise the RW data alignment to at least the size of a single
+  // GOT entry.
+  if (unsigned GotSize = computeGOTSize(Obj)) {
+    RWSectionSizes.push_back(GotSize);
+    RWDataAlign = std::max<uint32_t>(RWDataAlign, getGOTEntrySize());
+  }
+
   // Compute the size of all common symbols
   uint64_t CommonSize = 0;
   uint32_t CommonAlign = 1;
@@ -518,6 +526,24 @@ Error RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj,
   return Error::success();
 }
 
+// Compute the total size of the Global Offset Table for Obj: one GOT entry
+// for each relocation that relocationNeedsGot() reports as requiring one.
+unsigned RuntimeDyldImpl::computeGOTSize(const ObjectFile &Obj) {
+  const size_t EntrySize = getGOTEntrySize();
+  // A zero entry size yields a zero-sized table; skip the relocation scan.
+  if (EntrySize == 0)
+    return 0;
+
+  size_t NumGotRelocs = 0;
+  for (const SectionRef &Section : Obj.sections())
+    for (const RelocationRef &Reloc : Section.relocations())
+      if (relocationNeedsGot(Reloc))
+        ++NumGotRelocs;
+
+  // Total size is the entry count multiplied by the per-entry size.
+  return NumGotRelocs * EntrySize;
+}
+
 // compute stub buffer size for the given section
 unsigned RuntimeDyldImpl::computeSectionStubBufSize(const ObjectFile &Obj,
                                                     const SectionRef &Section) {
@@ -677,7 +703,7 @@ RuntimeDyldImpl::emitSection(const ObjectFile &Obj,
   unsigned Alignment = (unsigned)Alignment64 & 0xffffffffL;
   unsigned PaddingSize = 0;
   unsigned StubBufSize = 0;
-  bool IsRequired = isRequiredForExecution(Section);
+  bool IsRequired = isRequiredForExecution(Section) || ProcessAllSections;
   bool IsVirtual = Section.isVirtual();
   bool IsZeroInit = isZeroInit(Section);
   bool IsReadOnly = isReadOnlyData(Section);
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index 8e7437ac4ffb8bf5642aebdc3c1b1a0f86fae56d..f780137d0874f77663b50ba6717d3d9492732690 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -961,6 +961,61 @@ bool RuntimeDyldELF::resolveAArch64ShortBranch(
   return true;
 }
 
+void RuntimeDyldELF::resolveAArch64Branch(unsigned SectionID,
+                                          const RelocationValueRef &Value,
+                                          relocation_iterator RelI,
+                                          StubMap &Stubs) {
+  // Branch via an existing stub, a direct short branch, or a newly built stub.
+  DEBUG(dbgs() << "\t\tThis is an AArch64 branch relocation.");
+  SectionEntry &Section = Sections[SectionID];
+
+  uint64_t Offset = RelI->getOffset();
+  unsigned RelType = RelI->getType();
+  // Look for an existing stub for this Value; the map stores its offset.
+  StubMap::const_iterator i = Stubs.find(Value);
+  if (i != Stubs.end()) {
+    resolveRelocation(Section, Offset,
+                      (uint64_t)Section.getAddressWithOffset(i->second),
+                      RelType, 0);
+    DEBUG(dbgs() << " Stub function found\n");
+  } else if (!resolveAArch64ShortBranch(SectionID, RelI, Value)) {
+    // No stub and resolveAArch64ShortBranch returned false: create a new stub.
+    DEBUG(dbgs() << " Create a new stub function\n");
+    Stubs[Value] = Section.getStubOffset();
+    uint8_t *StubTargetAddr = createStubFunction(
+        Section.getAddressWithOffset(Section.getStubOffset()));
+
+    RelocationEntry REmovz_g3(SectionID, StubTargetAddr - Section.getAddress(),
+                              ELF::R_AARCH64_MOVW_UABS_G3, Value.Addend);
+    RelocationEntry REmovk_g2(SectionID,
+                              StubTargetAddr - Section.getAddress() + 4,
+                              ELF::R_AARCH64_MOVW_UABS_G2_NC, Value.Addend);
+    RelocationEntry REmovk_g1(SectionID,
+                              StubTargetAddr - Section.getAddress() + 8,
+                              ELF::R_AARCH64_MOVW_UABS_G1_NC, Value.Addend);
+    RelocationEntry REmovk_g0(SectionID,
+                              StubTargetAddr - Section.getAddress() + 12,
+                              ELF::R_AARCH64_MOVW_UABS_G0_NC, Value.Addend);
+    // Bind the four stub relocations to the target's symbol or section.
+    if (Value.SymbolName) {
+      addRelocationForSymbol(REmovz_g3, Value.SymbolName);
+      addRelocationForSymbol(REmovk_g2, Value.SymbolName);
+      addRelocationForSymbol(REmovk_g1, Value.SymbolName);
+      addRelocationForSymbol(REmovk_g0, Value.SymbolName);
+    } else {
+      addRelocationForSection(REmovz_g3, Value.SectionID);
+      addRelocationForSection(REmovk_g2, Value.SectionID);
+      addRelocationForSection(REmovk_g1, Value.SectionID);
+      addRelocationForSection(REmovk_g0, Value.SectionID);
+    }
+    resolveRelocation(Section, Offset,
+                      reinterpret_cast<uint64_t>(Section.getAddressWithOffset(
+                          Section.getStubOffset())),
+                      RelType, 0);
+    Section.advanceStubOffset(getMaxStubSize());
+  }
+}
+
 Expected<relocation_iterator>
 RuntimeDyldELF::processRelocationRef(
     unsigned SectionID, relocation_iterator RelI, const ObjectFile &O,
@@ -1055,55 +1110,22 @@ RuntimeDyldELF::processRelocationRef(
 
   DEBUG(dbgs() << "\t\tSectionID: " << SectionID << " Offset: " << Offset
                << "\n");
-  if ((Arch == Triple::aarch64 || Arch == Triple::aarch64_be) &&
-      (RelType == ELF::R_AARCH64_CALL26 || RelType == ELF::R_AARCH64_JUMP26)) {
-    // This is an AArch64 branch relocation, need to use a stub function.
-    DEBUG(dbgs() << "\t\tThis is an AArch64 branch relocation.");
-    SectionEntry &Section = Sections[SectionID];
-
-    // Look for an existing stub.
-    StubMap::const_iterator i = Stubs.find(Value);
-    if (i != Stubs.end()) {
-      resolveRelocation(Section, Offset,
-                        (uint64_t)Section.getAddressWithOffset(i->second),
-                        RelType, 0);
-      DEBUG(dbgs() << " Stub function found\n");
-    } else if (!resolveAArch64ShortBranch(SectionID, RelI, Value)) {
-      // Create a new stub function.
-      DEBUG(dbgs() << " Create a new stub function\n");
-      Stubs[Value] = Section.getStubOffset();
-      uint8_t *StubTargetAddr = createStubFunction(
-          Section.getAddressWithOffset(Section.getStubOffset()));
-
-      RelocationEntry REmovz_g3(SectionID,
-                                StubTargetAddr - Section.getAddress(),
-                                ELF::R_AARCH64_MOVW_UABS_G3, Value.Addend);
-      RelocationEntry REmovk_g2(SectionID, StubTargetAddr -
-                                               Section.getAddress() + 4,
-                                ELF::R_AARCH64_MOVW_UABS_G2_NC, Value.Addend);
-      RelocationEntry REmovk_g1(SectionID, StubTargetAddr -
-                                               Section.getAddress() + 8,
-                                ELF::R_AARCH64_MOVW_UABS_G1_NC, Value.Addend);
-      RelocationEntry REmovk_g0(SectionID, StubTargetAddr -
-                                               Section.getAddress() + 12,
-                                ELF::R_AARCH64_MOVW_UABS_G0_NC, Value.Addend);
-
-      if (Value.SymbolName) {
-        addRelocationForSymbol(REmovz_g3, Value.SymbolName);
-        addRelocationForSymbol(REmovk_g2, Value.SymbolName);
-        addRelocationForSymbol(REmovk_g1, Value.SymbolName);
-        addRelocationForSymbol(REmovk_g0, Value.SymbolName);
-      } else {
-        addRelocationForSection(REmovz_g3, Value.SectionID);
-        addRelocationForSection(REmovk_g2, Value.SectionID);
-        addRelocationForSection(REmovk_g1, Value.SectionID);
-        addRelocationForSection(REmovk_g0, Value.SectionID);
-      }
-      resolveRelocation(Section, Offset,
-                        reinterpret_cast<uint64_t>(Section.getAddressWithOffset(
-                            Section.getStubOffset())),
-                        RelType, 0);
-      Section.advanceStubOffset(getMaxStubSize());
+  if ((Arch == Triple::aarch64 || Arch == Triple::aarch64_be)) {
+    if (RelType == ELF::R_AARCH64_CALL26 || RelType == ELF::R_AARCH64_JUMP26) {
+      resolveAArch64Branch(SectionID, Value, RelI, Stubs);
+    } else if (RelType == ELF::R_AARCH64_ADR_GOT_PAGE) {
+      // Create a new GOT entry or find an existing one. If a GOT entry is
+      // to be created, then we also emit an ABS64 relocation for it.
+      uint64_t GOTOffset = findOrAllocGOTEntry(Value, ELF::R_AARCH64_ABS64);
+      resolveGOTOffsetRelocation(SectionID, Offset, GOTOffset + Addend,
+                                 ELF::R_AARCH64_ADR_PREL_PG_HI21);
+
+    } else if (RelType == ELF::R_AARCH64_LD64_GOT_LO12_NC) {
+      uint64_t GOTOffset = findOrAllocGOTEntry(Value, ELF::R_AARCH64_ABS64);
+      resolveGOTOffsetRelocation(SectionID, Offset, GOTOffset + Addend,
+                                 ELF::R_AARCH64_LDST64_ABS_LO12_NC);
+    } else {
+      processSimpleRelocation(SectionID, Offset, RelType, Value);
     }
   } else if (Arch == Triple::arm) {
     if (RelType == ELF::R_ARM_PC24 || RelType == ELF::R_ARM_CALL ||
@@ -1252,7 +1274,7 @@ RuntimeDyldELF::processRelocationRef(
       if (i != GOTSymbolOffsets.end())
         RE.SymOffset = i->second;
       else {
-        RE.SymOffset = allocateGOTEntries(SectionID, 1);
+        RE.SymOffset = allocateGOTEntries(1);
         GOTSymbolOffsets[TargetName] = RE.SymOffset;
       }
     }
@@ -1509,14 +1531,15 @@ RuntimeDyldELF::processRelocationRef(
           Section.advanceStubOffset(getMaxStubSize());
 
           // Allocate a GOT Entry
-          uint64_t GOTOffset = allocateGOTEntries(SectionID, 1);
+          uint64_t GOTOffset = allocateGOTEntries(1);
 
           // The load of the GOT address has an addend of -4
-          resolveGOTOffsetRelocation(SectionID, StubOffset + 2, GOTOffset - 4);
+          resolveGOTOffsetRelocation(SectionID, StubOffset + 2, GOTOffset - 4,
+                                     ELF::R_X86_64_PC32);
 
           // Fill in the value of the symbol we're targeting into the GOT
           addRelocationForSymbol(
-              computeGOTOffsetRE(SectionID, GOTOffset, 0, ELF::R_X86_64_64),
+              computeGOTOffsetRE(GOTOffset, 0, ELF::R_X86_64_64),
               Value.SymbolName);
         }
 
@@ -1531,11 +1554,13 @@ RuntimeDyldELF::processRelocationRef(
     } else if (RelType == ELF::R_X86_64_GOTPCREL ||
                RelType == ELF::R_X86_64_GOTPCRELX ||
                RelType == ELF::R_X86_64_REX_GOTPCRELX) {
-      uint64_t GOTOffset = allocateGOTEntries(SectionID, 1);
-      resolveGOTOffsetRelocation(SectionID, Offset, GOTOffset + Addend);
+      uint64_t GOTOffset = allocateGOTEntries(1);
+      resolveGOTOffsetRelocation(SectionID, Offset, GOTOffset + Addend,
+                                 ELF::R_X86_64_PC32);
 
       // Fill in the value of the symbol we're targeting into the GOT
-      RelocationEntry RE = computeGOTOffsetRE(SectionID, GOTOffset, Value.Offset, ELF::R_X86_64_64);
+      RelocationEntry RE =
+          computeGOTOffsetRE(GOTOffset, Value.Offset, ELF::R_X86_64_64);
       if (Value.SymbolName)
         addRelocationForSymbol(RE, Value.SymbolName);
       else
@@ -1593,9 +1618,7 @@ size_t RuntimeDyldELF::getGOTEntrySize() {
   return Result;
 }
 
-uint64_t RuntimeDyldELF::allocateGOTEntries(unsigned SectionID, unsigned no)
-{
-  (void)SectionID; // The GOT Section is the same for all section in the object file
+uint64_t RuntimeDyldELF::allocateGOTEntries(unsigned no) {
   if (GOTSectionID == 0) {
     GOTSectionID = Sections.size();
     // Reserve a section id. We'll allocate the section later
@@ -1607,17 +1630,38 @@ uint64_t RuntimeDyldELF::allocateGOTEntries(unsigned SectionID, unsigned no)
   return StartOffset;
 }
 
-void RuntimeDyldELF::resolveGOTOffsetRelocation(unsigned SectionID, uint64_t Offset, uint64_t GOTOffset)
-{
+uint64_t RuntimeDyldELF::findOrAllocGOTEntry(const RelocationValueRef &Value,
+                                             unsigned GOTRelType) {
+  // Try to claim a map slot for Value; an existing entry is left untouched.
+  auto Inserted = GOTOffsetMap.insert({Value, 0});
+  uint64_t &CachedOffset = Inserted.first->second;
+  if (!Inserted.second)
+    return CachedOffset;
+
+  // First reference to Value: allocate one GOT entry and emit the relocation
+  // of type GOTRelType that will fill it in.
+  const uint64_t NewOffset = allocateGOTEntries(1);
+  RelocationEntry SlotRE =
+      computeGOTOffsetRE(NewOffset, Value.Offset, GOTRelType);
+  if (Value.SymbolName)
+    addRelocationForSymbol(SlotRE, Value.SymbolName);
+  else
+    addRelocationForSection(SlotRE, Value.SectionID);
+  return CachedOffset = NewOffset;
+}
+
+void RuntimeDyldELF::resolveGOTOffsetRelocation(unsigned SectionID,
+                                                uint64_t Offset,
+                                                uint64_t GOTOffset,
+                                                uint32_t Type) {
   // Fill in the relative address of the GOT Entry into the stub
-  RelocationEntry GOTRE(SectionID, Offset, ELF::R_X86_64_PC32, GOTOffset);
+  RelocationEntry GOTRE(SectionID, Offset, Type, GOTOffset);
   addRelocationForSection(GOTRE, GOTSectionID);
 }
 
-RelocationEntry RuntimeDyldELF::computeGOTOffsetRE(unsigned SectionID, uint64_t GOTOffset, uint64_t SymbolOffset,
-                                                   uint32_t Type)
-{
-  (void)SectionID; // The GOT Section is the same for all section in the object file
+RelocationEntry RuntimeDyldELF::computeGOTOffsetRE(uint64_t GOTOffset,
+                                                   uint64_t SymbolOffset,
+                                                   uint32_t Type) {
   return RelocationEntry(GOTSectionID, GOTOffset, Type, SymbolOffset);
 }
 
@@ -1683,6 +1727,19 @@ bool RuntimeDyldELF::isCompatibleFile(const object::ObjectFile &Obj) const {
   return Obj.isELF();
 }
 
+bool RuntimeDyldELF::relocationNeedsGot(const RelocationRef &R) const {
+  // Only AArch64 and x86-64 relocation types are recognised here.
+  const unsigned Type = R.getType();
+  const bool IsAArch64 = Arch == Triple::aarch64 || Arch == Triple::aarch64_be;
+  if (!IsAArch64 && Arch != Triple::x86_64)
+    return false;
+  if (IsAArch64)
+    return Type == ELF::R_AARCH64_ADR_GOT_PAGE ||
+           Type == ELF::R_AARCH64_LD64_GOT_LO12_NC;
+  return Type == ELF::R_X86_64_GOTPCREL || Type == ELF::R_X86_64_GOTPCRELX ||
+         Type == ELF::R_X86_64_REX_GOTPCRELX;
+}
+
 bool RuntimeDyldELF::relocationNeedsStub(const RelocationRef &R) const {
   if (Arch != Triple::x86_64)
     return true;  // Conservative answer
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index d1867d091fe2962196a95b8a41e17f3923abb587..498979705b775c71e89dd232e585581b4d9356a4 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -43,6 +43,9 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
   bool resolveAArch64ShortBranch(unsigned SectionID, relocation_iterator RelI,
                                  const RelocationValueRef &Value);
 
+  void resolveAArch64Branch(unsigned SectionID, const RelocationValueRef &Value,
+                            relocation_iterator RelI, StubMap &Stubs);
+
   void resolveARMRelocation(const SectionEntry &Section, uint64_t Offset,
                             uint32_t Value, uint32_t Type, int32_t Addend);
 
@@ -88,24 +91,26 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
                             ObjSectionToIDMap &LocalSections,
                             RelocationValueRef &Rel);
 protected:
-  size_t getGOTEntrySize();
+  size_t getGOTEntrySize() override;
 
 private:
   SectionEntry &getSection(unsigned SectionID) { return Sections[SectionID]; }
 
   // Allocate no GOT entries for use in the given section.
-  uint64_t allocateGOTEntries(unsigned SectionID, unsigned no);
+  uint64_t allocateGOTEntries(unsigned no);
+
+  // Find GOT entry corresponding to relocation or create new one.
+  uint64_t findOrAllocGOTEntry(const RelocationValueRef &Value,
+                               unsigned GOTRelType);
 
   // Resolve the relvative address of GOTOffset in Section ID and place
   // it at the given Offset
   void resolveGOTOffsetRelocation(unsigned SectionID, uint64_t Offset,
-                                  uint64_t GOTOffset);
+                                  uint64_t GOTOffset, uint32_t Type);
 
   // For a GOT entry referenced from SectionID, compute a relocation entry
   // that will place the final resolved value in the GOT slot
-  RelocationEntry computeGOTOffsetRE(unsigned SectionID,
-                                     uint64_t GOTOffset,
-                                     uint64_t SymbolOffset,
+  RelocationEntry computeGOTOffsetRE(uint64_t GOTOffset, uint64_t SymbolOffset,
                                      unsigned Type);
 
   // Compute the address in memory where we can find the placeholder
@@ -146,6 +151,10 @@ private:
   SmallVector<SID, 2> UnregisteredEHFrameSections;
   SmallVector<SID, 2> RegisteredEHFrameSections;
 
+  // Map between GOT relocation value and corresponding GOT offset
+  std::map<RelocationValueRef, uint64_t> GOTOffsetMap;
+
+  bool relocationNeedsGot(const RelocationRef &R) const override;
   bool relocationNeedsStub(const RelocationRef &R) const override;
 
 public:
diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index 279d0de2da7602cb107e06fbbb8655d1898e5254..f5cc883d98fdf189329f8a2da4662e219701f9cd 100644
--- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -213,7 +213,7 @@ public:
   }
 };
 
-/// @brief Symbol info for RuntimeDyld. 
+/// @brief Symbol info for RuntimeDyld.
 class SymbolTableEntry {
 public:
   SymbolTableEntry()
@@ -426,6 +426,9 @@ protected:
                               uint64_t &RODataSize, uint32_t &RODataAlign,
                               uint64_t &RWDataSize, uint32_t &RWDataAlign);
 
+  // \brief Compute GOT size
+  unsigned computeGOTSize(const ObjectFile &Obj);
+
   // \brief Compute the stub buffer size required for a section
   unsigned computeSectionStubBufSize(const ObjectFile &Obj,
                                      const SectionRef &Section);
@@ -433,6 +436,14 @@ protected:
   // \brief Implementation of the generic part of the loadObject algorithm.
   Expected<ObjSectionToIDMap> loadObjectImpl(const object::ObjectFile &Obj);
 
+  // \brief Return size of Global Offset Table (GOT) entry
+  virtual size_t getGOTEntrySize() { return 0; }
+
+  // \brief Return true if the relocation R may require allocating a GOT entry.
+  virtual bool relocationNeedsGot(const RelocationRef &R) const {
+    return false;
+  }
+
   // \brief Return true if the relocation R may require allocating a stub.
   virtual bool relocationNeedsStub(const RelocationRef &R) const {
     return true;    // Conservative answer
diff --git a/lib/Fuzzer/CMakeLists.txt b/lib/Fuzzer/CMakeLists.txt
index 7e41ef89e67c4da6d7c409dc16a89f8cddbb3f33..59cef04cdece1eb653bb9d446aa9c227fcd478b7 100644
--- a/lib/Fuzzer/CMakeLists.txt
+++ b/lib/Fuzzer/CMakeLists.txt
@@ -12,8 +12,9 @@ if( LLVM_USE_SANITIZE_COVERAGE )
     FuzzerCrossOver.cpp
     FuzzerDriver.cpp
     FuzzerExtFunctionsDlsym.cpp
+    FuzzerExtFunctionsDlsymWin.cpp
     FuzzerExtFunctionsWeak.cpp
-    FuzzerExtFunctionsWeakAlias.cpp
+    FuzzerExtraCounters.cpp
     FuzzerIO.cpp
     FuzzerIOPosix.cpp
     FuzzerIOWindows.cpp
@@ -22,6 +23,7 @@ if( LLVM_USE_SANITIZE_COVERAGE )
     FuzzerMutate.cpp
     FuzzerSHA1.cpp
     FuzzerShmemPosix.cpp
+    FuzzerShmemWindows.cpp
     FuzzerTracePC.cpp
     FuzzerTraceState.cpp
     FuzzerUtil.cpp
@@ -33,12 +35,12 @@ if( LLVM_USE_SANITIZE_COVERAGE )
   add_library(LLVMFuzzerNoMain STATIC
     $<TARGET_OBJECTS:LLVMFuzzerNoMainObjects>
     )
-  target_link_libraries(LLVMFuzzerNoMain ${PTHREAD_LIB})
+  target_link_libraries(LLVMFuzzerNoMain ${LLVM_PTHREAD_LIB})
   add_library(LLVMFuzzer STATIC
     FuzzerMain.cpp
     $<TARGET_OBJECTS:LLVMFuzzerNoMainObjects>
     )
-  target_link_libraries(LLVMFuzzer ${PTHREAD_LIB})
+  target_link_libraries(LLVMFuzzer ${LLVM_PTHREAD_LIB})
 
   if( LLVM_INCLUDE_TESTS )
     add_subdirectory(test)
diff --git a/lib/Fuzzer/FuzzerCorpus.h b/lib/Fuzzer/FuzzerCorpus.h
index 468d5e5ddc7012660afc2b9d367676768edcc62b..0f0573994a0358aab18e45e819ae3ab349335ec2 100644
--- a/lib/Fuzzer/FuzzerCorpus.h
+++ b/lib/Fuzzer/FuzzerCorpus.h
@@ -37,8 +37,8 @@ struct InputInfo {
 };
 
 class InputCorpus {
+  static const size_t kFeatureSetSize = 1 << 21;
  public:
-  static const size_t kFeatureSetSize = 1 << 16;
   InputCorpus(const std::string &OutputCorpus) : OutputCorpus(OutputCorpus) {
     memset(InputSizesPerFeature, 0, sizeof(InputSizesPerFeature));
     memset(SmallestElementPerFeature, 0, sizeof(SmallestElementPerFeature));
@@ -68,7 +68,8 @@ class InputCorpus {
   }
   bool empty() const { return Inputs.empty(); }
   const Unit &operator[] (size_t Idx) const { return Inputs[Idx]->U; }
-  void AddToCorpus(const Unit &U, size_t NumFeatures, bool MayDeleteFile = false) {
+  void AddToCorpus(const Unit &U, size_t NumFeatures,
+                   bool MayDeleteFile = false) {
     assert(!U.empty());
     uint8_t Hash[kSHA1NumBytes];
     if (FeatureDebug)
@@ -82,7 +83,7 @@ class InputCorpus {
     II.MayDeleteFile = MayDeleteFile;
     memcpy(II.Sha1, Hash, kSHA1NumBytes);
     UpdateCorpusDistribution();
-    ValidateFeatureSet();
+    // ValidateFeatureSet();
   }
 
   bool HasUnit(const Unit &U) { return Hashes.count(Hash(U)); }
@@ -97,7 +98,7 @@ class InputCorpus {
   // Hypothesis: units added to the corpus last are more likely to be
   // interesting. This function gives more weight to the more recent units.
   size_t ChooseUnitIdxToMutate(Random &Rand) {
-    size_t Idx = static_cast<size_t>(CorpusDistribution(Rand.Get_mt19937()));
+    size_t Idx = static_cast<size_t>(CorpusDistribution(Rand));
     assert(Idx < Inputs.size());
     return Idx;
   }
@@ -132,7 +133,7 @@ class InputCorpus {
       Printf("EVICTED %zd\n", Idx);
   }
 
-  bool AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) {
+  void AddFeature(size_t Idx, uint32_t NewSize, bool Shrink) {
     assert(NewSize);
     Idx = Idx % kFeatureSetSize;
     uint32_t OldSize = GetFeature(Idx);
@@ -144,23 +145,20 @@ class InputCorpus {
         II.NumFeatures--;
         if (II.NumFeatures == 0)
           DeleteInput(OldIdx);
+      } else {
+        NumAddedFeatures++;
       }
+      NumUpdatedFeatures++;
       if (FeatureDebug)
         Printf("ADD FEATURE %zd sz %d\n", Idx, NewSize);
       SmallestElementPerFeature[Idx] = Inputs.size();
       InputSizesPerFeature[Idx] = NewSize;
       CountingFeatures = true;
-      return true;
     }
-    return false;
   }
 
-  size_t NumFeatures() const {
-    size_t Res = 0;
-    for (size_t i = 0; i < kFeatureSetSize; i++)
-      Res += GetFeature(i) != 0;
-    return Res;
-  }
+  size_t NumFeatures() const { return NumAddedFeatures; }
+  size_t NumFeatureUpdates() const { return NumUpdatedFeatures; }
 
   void ResetFeatureSet() {
     assert(Inputs.empty());
@@ -213,6 +211,8 @@ private:
   std::vector<InputInfo*> Inputs;
 
   bool CountingFeatures = false;
+  size_t NumAddedFeatures = 0;
+  size_t NumUpdatedFeatures = 0;
   uint32_t InputSizesPerFeature[kFeatureSetSize];
   uint32_t SmallestElementPerFeature[kFeatureSetSize];
 
diff --git a/lib/Fuzzer/FuzzerDefs.h b/lib/Fuzzer/FuzzerDefs.h
index 6c1f6a18333ad6a304acbce28ef527ecd5ed9bc7..bd182750800257e33a308d08cb2a44223dc277f5 100644
--- a/lib/Fuzzer/FuzzerDefs.h
+++ b/lib/Fuzzer/FuzzerDefs.h
@@ -55,8 +55,17 @@
 
 #define ATTRIBUTE_NO_SANITIZE_ADDRESS __attribute__((no_sanitize_address))
 
-#define ATTRIBUTE_NO_SANITIZE_ALL ATTRIBUTE_NO_SANITIZE_ADDRESS ATTRIBUTE_NO_SANITIZE_MEMORY
-
+#if defined(__has_feature)
+#  if __has_feature(address_sanitizer)
+#    define ATTRIBUTE_NO_SANITIZE_ALL ATTRIBUTE_NO_SANITIZE_ADDRESS
+#  elif __has_feature(memory_sanitizer)
+#    define ATTRIBUTE_NO_SANITIZE_ALL ATTRIBUTE_NO_SANITIZE_MEMORY
+#  else
+#    define ATTRIBUTE_NO_SANITIZE_ALL
+#  endif
+#else
+#  define ATTRIBUTE_NO_SANITIZE_ALL
+#endif
 
 #if LIBFUZZER_WINDOWS
 #define ATTRIBUTE_INTERFACE __declspec(dllexport)
@@ -87,9 +96,10 @@ typedef int (*UserCallback)(const uint8_t *Data, size_t Size);
 
 int FuzzerDriver(int *argc, char ***argv, UserCallback Callback);
 
-struct ScopedDoingMyOwnMemmem {
-  ScopedDoingMyOwnMemmem();
-  ~ScopedDoingMyOwnMemmem();
+struct ScopedDoingMyOwnMemOrStr {
+  ScopedDoingMyOwnMemOrStr() { DoingMyOwnMemOrStr++; }
+  ~ScopedDoingMyOwnMemOrStr() { DoingMyOwnMemOrStr--; }
+  static int DoingMyOwnMemOrStr;
 };
 
 inline uint8_t  Bswap(uint8_t x)  { return x; }
@@ -97,6 +107,10 @@ inline uint16_t Bswap(uint16_t x) { return __builtin_bswap16(x); }
 inline uint32_t Bswap(uint32_t x) { return __builtin_bswap32(x); }
 inline uint64_t Bswap(uint64_t x) { return __builtin_bswap64(x); }
 
+uint8_t *ExtraCountersBegin();
+uint8_t *ExtraCountersEnd();
+void ClearExtraCounters();
+
 }  // namespace fuzzer
 
 #endif  // LLVM_FUZZER_DEFS_H
diff --git a/lib/Fuzzer/FuzzerDictionary.h b/lib/Fuzzer/FuzzerDictionary.h
index f15ac930f2c53bab55501d5e31275fb57219bbcc..84cee87b8971f4ad7b3c92f0d8fc940c3436b046 100644
--- a/lib/Fuzzer/FuzzerDictionary.h
+++ b/lib/Fuzzer/FuzzerDictionary.h
@@ -33,10 +33,12 @@ public:
   }
 
   bool operator==(const FixedWord<kMaxSize> &w) const {
+    ScopedDoingMyOwnMemOrStr scoped_doing_my_own_mem_os_str;
     return Size == w.Size && 0 == memcmp(Data, w.Data, Size);
   }
 
   bool operator<(const FixedWord<kMaxSize> &w) const {
+    ScopedDoingMyOwnMemOrStr scoped_doing_my_own_mem_os_str;
     if (Size != w.Size)
       return Size < w.Size;
     return memcmp(Data, w.Data, Size) < 0;
@@ -51,7 +53,7 @@ private:
   uint8_t Data[kMaxSize];
 };
 
-typedef FixedWord<27> Word; // 28 bytes.
+typedef FixedWord<64> Word;
 
 class DictionaryEntry {
  public:
diff --git a/lib/Fuzzer/FuzzerDriver.cpp b/lib/Fuzzer/FuzzerDriver.cpp
index 5620cdea3fa2fe2e7947983a0b66cf184a795640..0fb83ca64de618e1a52dd5820ab775b9b08bf61a 100644
--- a/lib/Fuzzer/FuzzerDriver.cpp
+++ b/lib/Fuzzer/FuzzerDriver.cpp
@@ -278,6 +278,17 @@ static bool AllInputsAreFiles() {
   return true;
 }
 
+static std::string GetDedupTokenFromFile(const std::string &Path) {
+  // "DEDUP_TOKEN:..." up to its newline, or "" if marker or newline is absent.
+  const std::string Text = FileToString(Path);
+  const size_t Start = Text.find("DEDUP_TOKEN:");
+  const size_t Stop =
+      Start == std::string::npos ? Start : Text.find('\n', Start);
+  if (Stop == std::string::npos)
+    return std::string();
+  return Text.substr(Start, Stop - Start);
+}
+
 int MinimizeCrashInput(const std::vector<std::string> &Args,
                        const FuzzingOptions &Options) {
   if (Inputs->size() != 1) {
@@ -296,7 +307,10 @@ int MinimizeCrashInput(const std::vector<std::string> &Args,
            "INFO: defaulting to -max_total_time=600\n");
     BaseCmd += " -max_total_time=600";
   }
-  // BaseCmd += " >  /dev/null 2>&1 ";
+
+  auto LogFilePath = DirPlusFile(
+      TmpDir(), "libFuzzerTemp." + std::to_string(GetPid()) + ".txt");
+  auto LogFileRedirect = " > " + LogFilePath + " 2>&1 ";
 
   std::string CurrentFilePath = InputFilePath;
   while (true) {
@@ -304,7 +318,7 @@ int MinimizeCrashInput(const std::vector<std::string> &Args,
     Printf("CRASH_MIN: minimizing crash input: '%s' (%zd bytes)\n",
            CurrentFilePath.c_str(), U.size());
 
-    auto Cmd = BaseCmd + " " + CurrentFilePath;
+    auto Cmd = BaseCmd + " " + CurrentFilePath + LogFileRedirect;
 
     Printf("CRASH_MIN: executing: %s\n", Cmd.c_str());
     int ExitCode = ExecuteCommand(Cmd);
@@ -315,13 +329,19 @@ int MinimizeCrashInput(const std::vector<std::string> &Args,
     Printf("CRASH_MIN: '%s' (%zd bytes) caused a crash. Will try to minimize "
            "it further\n",
            CurrentFilePath.c_str(), U.size());
+    auto DedupToken1 = GetDedupTokenFromFile(LogFilePath);
+    if (!DedupToken1.empty())
+      Printf("CRASH_MIN: DedupToken1: %s\n", DedupToken1.c_str());
 
     std::string ArtifactPath =
-        Options.ArtifactPrefix + "minimized-from-" + Hash(U);
+        Flags.exact_artifact_path
+            ? Flags.exact_artifact_path
+            : Options.ArtifactPrefix + "minimized-from-" + Hash(U);
     Cmd += " -minimize_crash_internal_step=1 -exact_artifact_path=" +
         ArtifactPath;
     Printf("CRASH_MIN: executing: %s\n", Cmd.c_str());
     ExitCode = ExecuteCommand(Cmd);
+    CopyFileToErr(LogFilePath);
     if (ExitCode == 0) {
       if (Flags.exact_artifact_path) {
         CurrentFilePath = Flags.exact_artifact_path;
@@ -329,11 +349,26 @@ int MinimizeCrashInput(const std::vector<std::string> &Args,
       }
       Printf("CRASH_MIN: failed to minimize beyond %s (%d bytes), exiting\n",
              CurrentFilePath.c_str(), U.size());
-      return 0;
+      break;
+    }
+    auto DedupToken2 = GetDedupTokenFromFile(LogFilePath);
+    if (!DedupToken2.empty())
+      Printf("CRASH_MIN: DedupToken2: %s\n", DedupToken2.c_str());
+
+    if (DedupToken1 != DedupToken2) {
+      if (Flags.exact_artifact_path) {
+        CurrentFilePath = Flags.exact_artifact_path;
+        WriteToFile(U, CurrentFilePath);
+      }
+      Printf("CRASH_MIN: mismatch in dedup tokens"
+             " (looks like a different bug). Won't minimize further\n");
+      break;
     }
+
     CurrentFilePath = ArtifactPath;
-    Printf("\n\n\n\n\n\n*********************************\n");
+    Printf("*********************************\n");
   }
+  RemoveFile(LogFilePath);
   return 0;
 }
 
@@ -355,6 +390,74 @@ int MinimizeCrashInputInternalStep(Fuzzer *F, InputCorpus *Corpus) {
   return 0;
 }
 
+// Prints dictionary units that do not appear to affect coverage: each unit's
+// occurrences in a corpus element are masked (XOR 0xFF) and the resulting
+// features are compared with those of the unmasked run. Always returns 0.
+int AnalyzeDictionary(Fuzzer *F, const std::vector<Unit>& Dict,
+                      UnitVector& Corpus) {
+  Printf("Started dictionary minimization (up to %zd tests)\n",
+         Dict.size() * Corpus.size() * 2);
+
+  // Scores and usage count for each dictionary unit.
+  std::vector<int> Scores(Dict.size());
+  std::vector<int> Usages(Dict.size());
+
+  std::vector<size_t> InitialFeatures;
+  std::vector<size_t> ModifiedFeatures;
+  for (auto &C : Corpus) {
+    // Get coverage for the testcase without modifications.
+    F->ExecuteCallback(C.data(), C.size());
+    InitialFeatures.clear();
+    TPC.CollectFeatures([&](size_t Feature) -> bool {
+      InitialFeatures.push_back(Feature);
+      return true;
+    });
+
+    for (size_t i = 0; i < Dict.size(); ++i) {
+      auto Data = C;
+      auto StartPos = std::search(Data.begin(), Data.end(),
+                                  Dict[i].begin(), Dict[i].end());
+      // Skip empty units (they would loop forever below) and absent ones.
+      if (Dict[i].empty() || StartPos == Data.end())
+        continue;
+
+      ++Usages[i];
+      while (StartPos != Data.end()) {
+        // Replace all occurrences of dictionary unit in the testcase.
+        auto EndPos = StartPos + Dict[i].size();
+        for (auto It = StartPos; It != EndPos; ++It)
+          *It ^= 0xFF;
+        StartPos = std::search(EndPos, Data.end(),
+                               Dict[i].begin(), Dict[i].end());
+      }
+
+      // Get coverage for testcase with masked occurrences of dictionary unit.
+      F->ExecuteCallback(Data.data(), Data.size());
+      ModifiedFeatures.clear();
+      TPC.CollectFeatures([&](size_t Feature) -> bool {
+        ModifiedFeatures.push_back(Feature);
+        return true;
+      });
+      if (InitialFeatures == ModifiedFeatures)
+        --Scores[i];
+      else
+        Scores[i] += 2;
+    }
+  }
+
+  Printf("###### Useless dictionary elements. ######\n");
+  for (size_t i = 0; i < Dict.size(); ++i) {
+    // Dictionary units with positive score are treated as useful ones.
+    if (Scores[i] > 0)
+      continue;
+    Printf("\"");
+    PrintASCII(Dict[i].data(), Dict[i].size(), "\"");
+    Printf(" # Score: %d, Used: %d\n", Scores[i], Usages[i]);
+  }
+  Printf("###### End of useless dictionary elements. ######\n");
+  return 0;
+}
+
 int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) {
   using namespace fuzzer;
   assert(argc && argv && "Argument pointers cannot be nullptr");
@@ -482,8 +585,8 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) {
 
   if (auto Name = Flags.run_equivalence_server) {
     SMR.Destroy(Name);
-    if (!SMR.Create(Name, 1 << 12)) {
-      Printf("ERROR: can't create shared memory region\n");
+    if (!SMR.Create(Name)) {
+       Printf("ERROR: can't create shared memory region\n");
       return 1;
     }
     Printf("INFO: EQUIVALENCE SERVER UP\n");
@@ -530,14 +633,12 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) {
   if (Flags.merge) {
     if (Options.MaxLen == 0)
       F->SetMaxInputLen(kMaxSaneLen);
-    if (TPC.UsingTracePcGuard()) {
-      if (Flags.merge_control_file)
-        F->CrashResistantMergeInternalStep(Flags.merge_control_file);
-      else
-        F->CrashResistantMerge(Args, *Inputs);
-    } else {
-      F->Merge(*Inputs);
-    }
+    if (Flags.merge_control_file)
+      F->CrashResistantMergeInternalStep(Flags.merge_control_file);
+    else
+      F->CrashResistantMerge(Args, *Inputs,
+                             Flags.load_coverage_summary,
+                             Flags.save_coverage_summary);
     exit(0);
   }
 
@@ -550,6 +651,19 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) {
                            TemporaryMaxLen, /*ExitOnError=*/false);
   }
 
+  if (Flags.analyze_dict) {
+    if (Dictionary.empty() || Inputs->empty()) {
+      Printf("ERROR: can't analyze dict without dict and corpus provided\n");
+      return 1;
+    }
+    if (AnalyzeDictionary(F, Dictionary, InitialCorpus)) {
+      Printf("Dictionary analysis failed\n");
+      exit(1);
+    }
+    Printf("Dictionary analysis suceeded\n");
+    exit(0);
+  }
+
   if (Options.MaxLen == 0) {
     size_t MaxLen = 0;
     for (auto &U : InitialCorpus)
@@ -567,7 +681,7 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) {
   F->Loop();
 
   if (Flags.verbosity)
-    Printf("Done %d runs in %zd second(s)\n", F->getTotalNumberOfRuns(),
+    Printf("Done %zd runs in %zd second(s)\n", F->getTotalNumberOfRuns(),
            F->secondsSinceProcessStartUp());
   F->PrintFinalStats();
 
diff --git a/lib/Fuzzer/FuzzerExtFunctions.def b/lib/Fuzzer/FuzzerExtFunctions.def
index 91c9b07b6652a157f0d29e278d74892f51378af1..3bc5302c31c63e03ef3199888b0641af22dd2f90 100644
--- a/lib/Fuzzer/FuzzerExtFunctions.def
+++ b/lib/Fuzzer/FuzzerExtFunctions.def
@@ -29,24 +29,18 @@ EXT_FUNC(LLVMFuzzerCustomCrossOver, size_t,
 EXT_FUNC(__lsan_enable, void, (), false);
 EXT_FUNC(__lsan_disable, void, (), false);
 EXT_FUNC(__lsan_do_recoverable_leak_check, int, (), false);
-EXT_FUNC(__sanitizer_get_number_of_counters, size_t, (), false);
 EXT_FUNC(__sanitizer_install_malloc_and_free_hooks, int,
          (void (*malloc_hook)(const volatile void *, size_t),
           void (*free_hook)(const volatile void *)),
          false);
-EXT_FUNC(__sanitizer_get_total_unique_caller_callee_pairs, size_t, (), false);
-EXT_FUNC(__sanitizer_get_total_unique_coverage, size_t, (), true);
-EXT_FUNC(__sanitizer_print_memory_profile, int, (size_t), false);
+EXT_FUNC(__sanitizer_print_memory_profile, int, (size_t, size_t), false);
 EXT_FUNC(__sanitizer_print_stack_trace, void, (), true);
 EXT_FUNC(__sanitizer_symbolize_pc, void,
          (void *, const char *fmt, char *out_buf, size_t out_buf_size), false);
 EXT_FUNC(__sanitizer_get_module_and_offset_for_pc, int,
          (void *pc, char *module_path,
          size_t module_path_len,void **pc_offset), false);
-EXT_FUNC(__sanitizer_reset_coverage, void, (), true);
 EXT_FUNC(__sanitizer_set_death_callback, void, (void (*)(void)), true);
 EXT_FUNC(__sanitizer_set_report_fd, void, (void*), false);
-EXT_FUNC(__sanitizer_update_counter_bitset_and_clear_counters, uintptr_t,
-  (uint8_t*), false);
 EXT_FUNC(__sanitizer_dump_coverage, void, (const uintptr_t *, uintptr_t),
          false);
diff --git a/lib/Fuzzer/FuzzerExtFunctionsDlsymWin.cpp b/lib/Fuzzer/FuzzerExtFunctionsDlsymWin.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..77521698c80a4decfcb2dc8e5c2e841ee2a8dfbe
--- /dev/null
+++ b/lib/Fuzzer/FuzzerExtFunctionsDlsymWin.cpp
@@ -0,0 +1,60 @@
+//===- FuzzerExtFunctionsDlsymWin.cpp - Interface to external functions ---===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Implementation using dynamic loading for Windows.
+//===----------------------------------------------------------------------===//
+#include "FuzzerDefs.h"
+#if LIBFUZZER_WINDOWS
+
+#include "FuzzerExtFunctions.h"
+#include "FuzzerIO.h"
+#include "Windows.h"
+#include "Psapi.h"
+
+namespace fuzzer {
+
+// Looks up every EXT_FUNC entry in the modules loaded into this process.
+ExternalFunctions::ExternalFunctions() {
+  HMODULE Modules[1024];
+  DWORD BytesNeeded;
+  HANDLE CurrentProcess = GetCurrentProcess();
+
+  if (!EnumProcessModules(CurrentProcess, Modules, sizeof(Modules),
+                          &BytesNeeded)) {
+    Printf("EnumProcessModules failed (error: %lu).\n", GetLastError());
+    exit(1);
+  }
+  // A BytesNeeded above sizeof(Modules) means the handle list was truncated.
+  if (sizeof(Modules) < BytesNeeded) {
+    Printf("Error: the array is not big enough to hold all loaded modules.\n");
+    exit(1);
+  }
+  // Try each module in order; a slot that is already filled is left alone.
+  for (size_t i = 0; i < (BytesNeeded / sizeof(HMODULE)); i++) {
+    FARPROC Fn;
+#define EXT_FUNC(NAME, RETURN_TYPE, FUNC_SIG, WARN)                            \
+    if (this->NAME == nullptr) {                                               \
+      Fn = GetProcAddress(Modules[i], #NAME);                                  \
+      if (Fn == nullptr)                                                       \
+         Fn = GetProcAddress(Modules[i], #NAME "__dll");                       \
+      this->NAME = (decltype(ExternalFunctions::NAME)) Fn;                     \
+    }
+#include "FuzzerExtFunctions.def"
+#undef EXT_FUNC
+  }
+  // Warn about entries flagged WARN that could not be resolved in any module.
+#define EXT_FUNC(NAME, RETURN_TYPE, FUNC_SIG, WARN)                            \
+  if (this->NAME == nullptr && WARN)                                           \
+    Printf("WARNING: Failed to find function \"%s\".\n", #NAME);
+#include "FuzzerExtFunctions.def"
+#undef EXT_FUNC
+}
+
+} // namespace fuzzer
+
+#endif // LIBFUZZER_WINDOWS
diff --git a/lib/Fuzzer/FuzzerExtraCounters.cpp b/lib/Fuzzer/FuzzerExtraCounters.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..07dbe0fdee7652c2899396f71a20c82a47a828cd
--- /dev/null
+++ b/lib/Fuzzer/FuzzerExtraCounters.cpp
@@ -0,0 +1,41 @@
+//===- FuzzerExtraCounters.cpp - Extra coverage counters ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// Extra coverage counters defined by user code.
+//===----------------------------------------------------------------------===//
+
+#include "FuzzerDefs.h"
+
+#if LIBFUZZER_LINUX
+__attribute__((weak)) extern uint8_t __start___libfuzzer_extra_counters;
+__attribute__((weak)) extern uint8_t __stop___libfuzzer_extra_counters;
+
+namespace fuzzer {
+uint8_t *ExtraCountersBegin() { return &__start___libfuzzer_extra_counters; }
+uint8_t *ExtraCountersEnd() { return &__stop___libfuzzer_extra_counters; }
+ATTRIBUTE_NO_SANITIZE_ALL
+void ClearExtraCounters() {  // hand-written memset, don't asan-ify.
+  uintptr_t *Beg = reinterpret_cast<uintptr_t*>(ExtraCountersBegin());
+  uintptr_t *End = reinterpret_cast<uintptr_t*>(ExtraCountersEnd());
+  for (; Beg < End; Beg++) {
+    *Beg = 0;
+    __asm__ __volatile__("" : : : "memory");
+  }
+}
+
+}  // namespace fuzzer
+
+#else
+// TODO: implement for other platforms.
+namespace fuzzer {
+uint8_t *ExtraCountersBegin() { return nullptr; }
+uint8_t *ExtraCountersEnd() { return nullptr; }
+void ClearExtraCounters() {}
+}  // namespace fuzzer
+
+#endif
diff --git a/lib/Fuzzer/FuzzerFlags.def b/lib/Fuzzer/FuzzerFlags.def
index 7b8d38bb6da1a17042de99b8759ef09df7b82faa..28bf0ca8ce691fbc39d45e4a6b88480731616355 100644
--- a/lib/Fuzzer/FuzzerFlags.def
+++ b/lib/Fuzzer/FuzzerFlags.def
@@ -39,6 +39,13 @@ FUZZER_FLAG_INT(merge, 0, "If 1, the 2-nd, 3-rd, etc corpora will be "
   "merged into the 1-st corpus. Only interesting units will be taken. "
   "This flag can be used to minimize a corpus.")
 FUZZER_FLAG_STRING(merge_control_file, "internal flag")
+FUZZER_FLAG_STRING(save_coverage_summary, "Experimental:"
+                   " save coverage summary to a given file."
+                   " Used with -merge=1")
+FUZZER_FLAG_STRING(load_coverage_summary, "Experimental:"
+                   " load coverage summary from a given file."
+                   " Treat this coverage as belonging to the first corpus. "
+                   " Used with -merge=1")
 FUZZER_FLAG_INT(minimize_crash, 0, "If 1, minimizes the provided"
   " crash input. Use with -runs=N or -max_total_time=N to limit "
   "the number attempts")
@@ -83,7 +90,7 @@ FUZZER_FLAG_INT(print_coverage, 0, "If 1, print coverage information at exit."
 FUZZER_FLAG_INT(dump_coverage, 0, "If 1, dump coverage information at exit."
                                   " Experimental, only with trace-pc-guard")
 FUZZER_FLAG_INT(handle_segv, 1, "If 1, try to intercept SIGSEGV.")
-FUZZER_FLAG_INT(handle_bus, 1, "If 1, try to intercept SIGSEGV.")
+FUZZER_FLAG_INT(handle_bus, 1, "If 1, try to intercept SIGBUS.")
 FUZZER_FLAG_INT(handle_abrt, 1, "If 1, try to intercept SIGABRT.")
 FUZZER_FLAG_INT(handle_ill, 1, "If 1, try to intercept SIGILL.")
 FUZZER_FLAG_INT(handle_fpe, 1, "If 1, try to intercept SIGFPE.")
@@ -108,6 +115,7 @@ FUZZER_FLAG_STRING(exit_on_item, "Exit if an item with a given sha1 sum"
 
 FUZZER_FLAG_STRING(run_equivalence_server, "Experimental")
 FUZZER_FLAG_STRING(use_equivalence_server, "Experimental")
+FUZZER_FLAG_INT(analyze_dict, 0, "Experimental")
 
 FUZZER_DEPRECATED_FLAG(exit_on_first)
 FUZZER_DEPRECATED_FLAG(save_minimized_corpus)
diff --git a/lib/Fuzzer/FuzzerIO.cpp b/lib/Fuzzer/FuzzerIO.cpp
index 45445fa3ba41af2602ea165d8178abd5b59fea91..e3f609ed8a80063f1aa062c900625f664016ce51 100644
--- a/lib/Fuzzer/FuzzerIO.cpp
+++ b/lib/Fuzzer/FuzzerIO.cpp
@@ -96,7 +96,8 @@ void DupAndCloseStderr() {
     if (NewOutputFile) {
       OutputFile = NewOutputFile;
       if (EF->__sanitizer_set_report_fd)
-        EF->__sanitizer_set_report_fd(reinterpret_cast<void *>(OutputFd));
+        EF->__sanitizer_set_report_fd(
+            reinterpret_cast<void *>(GetHandleFromFd(OutputFd)));
       DiscardOutput(2);
     }
   }
diff --git a/lib/Fuzzer/FuzzerIO.h b/lib/Fuzzer/FuzzerIO.h
index 28c6ba095864f2e155f812343d6472716b161c49..3b66a52d1a6492793ddad07faa057796919711bb 100644
--- a/lib/Fuzzer/FuzzerIO.h
+++ b/lib/Fuzzer/FuzzerIO.h
@@ -69,6 +69,8 @@ void RemoveFile(const std::string &Path);
 
 void DiscardOutput(int Fd);
 
+intptr_t GetHandleFromFd(int fd);
+
 }  // namespace fuzzer
 
 #endif  // LLVM_FUZZER_IO_H
diff --git a/lib/Fuzzer/FuzzerIOPosix.cpp b/lib/Fuzzer/FuzzerIOPosix.cpp
index 40209a034e37dd77e38ba200129e25401330507d..c5ebdbac467bfb771cba34f811a4ca3c8cf09db9 100644
--- a/lib/Fuzzer/FuzzerIOPosix.cpp
+++ b/lib/Fuzzer/FuzzerIOPosix.cpp
@@ -83,6 +83,10 @@ void DiscardOutput(int Fd) {
   fclose(Temp);
 }
 
+intptr_t GetHandleFromFd(int fd) {
+  return static_cast<intptr_t>(fd);
+}
+
 std::string DirName(const std::string &FileName) {
   char *Tmp = new char[FileName.size() + 1];
   memcpy(Tmp, FileName.c_str(), FileName.size() + 1);
diff --git a/lib/Fuzzer/FuzzerIOWindows.cpp b/lib/Fuzzer/FuzzerIOWindows.cpp
index 536e130672557b6ad2b0c77a215bffc2c8156a17..75d4e3a06071ecf32427896a75f6de1207277f7d 100644
--- a/lib/Fuzzer/FuzzerIOWindows.cpp
+++ b/lib/Fuzzer/FuzzerIOWindows.cpp
@@ -149,6 +149,10 @@ void DiscardOutput(int Fd) {
   fclose(Temp);
 }
 
+intptr_t GetHandleFromFd(int fd) {
+  return _get_osfhandle(fd);
+}
+
 static bool IsSeparator(char C) {
   return C == '\\' || C == '/';
 }
diff --git a/lib/Fuzzer/FuzzerInternal.h b/lib/Fuzzer/FuzzerInternal.h
index 31c6585a91d24dea6a34903b772624534b28e1bc..c26615631ecd013856d574f7516e6f66c17d9012 100644
--- a/lib/Fuzzer/FuzzerInternal.h
+++ b/lib/Fuzzer/FuzzerInternal.h
@@ -32,24 +32,6 @@ using namespace std::chrono;
 class Fuzzer {
 public:
 
-  // Aggregates all available coverage measurements.
-  struct Coverage {
-    Coverage() { Reset(); }
-
-    void Reset() {
-      BlockCoverage = 0;
-      CallerCalleeCoverage = 0;
-      CounterBitmapBits = 0;
-      CounterBitmap.clear();
-    }
-
-    size_t BlockCoverage;
-    size_t CallerCalleeCoverage;
-    // Precalculated number of bits in CounterBitmap.
-    size_t CounterBitmapBits;
-    std::vector<uint8_t> CounterBitmap;
-  };
-
   Fuzzer(UserCallback CB, InputCorpus &Corpus, MutationDispatcher &MD,
          FuzzingOptions Options);
   ~Fuzzer();
@@ -88,19 +70,16 @@ public:
   // Merge Corpora[1:] into Corpora[0].
   void Merge(const std::vector<std::string> &Corpora);
   void CrashResistantMerge(const std::vector<std::string> &Args,
-                           const std::vector<std::string> &Corpora);
+                           const std::vector<std::string> &Corpora,
+                           const char *CoverageSummaryInputPathOrNull,
+                           const char *CoverageSummaryOutputPathOrNull);
   void CrashResistantMergeInternalStep(const std::string &ControlFilePath);
-  // Returns a subset of 'Extra' that adds coverage to 'Initial'.
-  UnitVector FindExtraUnits(const UnitVector &Initial, const UnitVector &Extra);
   MutationDispatcher &GetMD() { return MD; }
   void PrintFinalStats();
   void SetMaxInputLen(size_t MaxInputLen);
   void SetMaxMutationLen(size_t MaxMutationLen);
   void RssLimitCallback();
 
-  // Public for tests.
-  void ResetCoverage();
-
   bool InFuzzingThread() const { return IsMyThread; }
   size_t GetCurrentUnitInFuzzingThead(const uint8_t **Data) const;
   void TryDetectingAMemoryLeak(const uint8_t *Data, size_t Size,
@@ -133,16 +112,10 @@ private:
   // Stop tracing.
   void StopTraceRecording();
 
-  void SetDeathCallback();
   static void StaticDeathCallback();
   void DumpCurrentUnit(const char *Prefix);
   void DeathCallback();
 
-  void ResetEdgeCoverage();
-  void ResetCounters();
-  void PrepareCounters(Fuzzer::Coverage *C);
-  bool RecordMaxCoverage(Fuzzer::Coverage *C);
-
   void AllocateCurrentUnitData();
   uint8_t *CurrentUnitData = nullptr;
   std::atomic<size_t> CurrentUnitSize;
@@ -165,16 +138,11 @@ private:
   long TimeOfLongestUnitInSeconds = 0;
   long EpochOfLastReadOfOutputCorpus = 0;
 
-  // Maximum recorded coverage.
-  Coverage MaxCoverage;
-
   size_t MaxInputLen = 0;
   size_t MaxMutationLen = 0;
 
   // Need to know our own thread.
   static thread_local bool IsMyThread;
-
-  bool InMergeMode = false;
 };
 
 }; // namespace fuzzer
diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp
index 8f4161ebe9809700dc99812e48a3dbac28b1b180..704092896eb6691a96c4711ecc5c33a6a2e40d7e 100644
--- a/lib/Fuzzer/FuzzerLoop.cpp
+++ b/lib/Fuzzer/FuzzerLoop.cpp
@@ -45,73 +45,9 @@ thread_local bool Fuzzer::IsMyThread;
 
 SharedMemoryRegion SMR;
 
-static void MissingExternalApiFunction(const char *FnName) {
-  Printf("ERROR: %s is not defined. Exiting.\n"
-         "Did you use -fsanitize-coverage=... to build your code?\n",
-         FnName);
-  exit(1);
-}
-
-#define CHECK_EXTERNAL_FUNCTION(fn)                                            \
-  do {                                                                         \
-    if (!(EF->fn))                                                             \
-      MissingExternalApiFunction(#fn);                                         \
-  } while (false)
-
 // Only one Fuzzer per process.
 static Fuzzer *F;
 
-void Fuzzer::ResetEdgeCoverage() {
-  CHECK_EXTERNAL_FUNCTION(__sanitizer_reset_coverage);
-  EF->__sanitizer_reset_coverage();
-}
-
-void Fuzzer::ResetCounters() {
-  if (Options.UseCounters)
-    EF->__sanitizer_update_counter_bitset_and_clear_counters(0);
-}
-
-void Fuzzer::PrepareCounters(Fuzzer::Coverage *C) {
-  if (Options.UseCounters) {
-    size_t NumCounters = EF->__sanitizer_get_number_of_counters();
-    C->CounterBitmap.resize(NumCounters);
-  }
-}
-
-// Records data to a maximum coverage tracker. Returns true if additional
-// coverage was discovered.
-bool Fuzzer::RecordMaxCoverage(Fuzzer::Coverage *C) {
-  bool Res = false;
-
-  uint64_t NewBlockCoverage = EF->__sanitizer_get_total_unique_coverage();
-  if (NewBlockCoverage > C->BlockCoverage) {
-    Res = true;
-    C->BlockCoverage = NewBlockCoverage;
-  }
-
-  if (Options.UseIndirCalls &&
-      EF->__sanitizer_get_total_unique_caller_callee_pairs) {
-    uint64_t NewCallerCalleeCoverage =
-        EF->__sanitizer_get_total_unique_caller_callee_pairs();
-    if (NewCallerCalleeCoverage > C->CallerCalleeCoverage) {
-      Res = true;
-      C->CallerCalleeCoverage = NewCallerCalleeCoverage;
-    }
-  }
-
-  if (Options.UseCounters) {
-    uint64_t CounterDelta =
-        EF->__sanitizer_update_counter_bitset_and_clear_counters(
-            C->CounterBitmap.data());
-    if (CounterDelta > 0) {
-      Res = true;
-      C->CounterBitmapBits += CounterDelta;
-    }
-  }
-
-  return Res;
-}
-
 // Leak detection is expensive, so we first check if there were more mallocs
 // than frees (using the sanitizer malloc hooks) and only then try to call lsan.
 struct MallocFreeTracer {
@@ -179,12 +115,12 @@ void Fuzzer::HandleMalloc(size_t Size) {
 Fuzzer::Fuzzer(UserCallback CB, InputCorpus &Corpus, MutationDispatcher &MD,
                FuzzingOptions Options)
     : CB(CB), Corpus(Corpus), MD(MD), Options(Options) {
-  SetDeathCallback();
+  if (EF->__sanitizer_set_death_callback)
+    EF->__sanitizer_set_death_callback(StaticDeathCallback);
   InitializeTraceState();
   assert(!F);
   F = this;
   TPC.ResetMaps();
-  ResetCoverage();
   IsMyThread = true;
   if (Options.DetectLeaks && EF->__sanitizer_install_malloc_and_free_hooks)
     EF->__sanitizer_install_malloc_and_free_hooks(MallocHook, FreeHook);
@@ -209,33 +145,12 @@ void Fuzzer::AllocateCurrentUnitData() {
   CurrentUnitData = new uint8_t[MaxInputLen];
 }
 
-void Fuzzer::SetDeathCallback() {
-  CHECK_EXTERNAL_FUNCTION(__sanitizer_set_death_callback);
-  EF->__sanitizer_set_death_callback(StaticDeathCallback);
-}
-
 void Fuzzer::StaticDeathCallback() {
   assert(F);
   F->DeathCallback();
 }
 
-static void WarnOnUnsuccessfullMerge(bool DoWarn) {
-  if (!DoWarn) return;
-  Printf(
-   "***\n"
-   "***\n"
-   "***\n"
-   "*** NOTE: merge did not succeed due to a failure on one of the inputs.\n"
-   "*** You will need to filter out crashes from the corpus, e.g. like this:\n"
-   "***   for f in WITH_CRASHES/*; do ./fuzzer $f && cp $f NO_CRASHES; done\n"
-   "*** Future versions may have crash-resistant merge, stay tuned.\n"
-   "***\n"
-   "***\n"
-   "***\n");
-}
-
 void Fuzzer::DumpCurrentUnit(const char *Prefix) {
-  WarnOnUnsuccessfullMerge(InMergeMode);
   if (!CurrentUnitData) return;  // Happens when running individual inputs.
   MD.PrintMutationSequence();
   Printf("; base unit: %s\n", Sha1ToString(BaseSha1).c_str());
@@ -329,7 +244,7 @@ void Fuzzer::RssLimitCallback() {
       GetPid(), GetPeakRSSMb(), Options.RssLimitMb);
   Printf("   To change the out-of-memory limit use -rss_limit_mb=<N>\n\n");
   if (EF->__sanitizer_print_memory_profile)
-    EF->__sanitizer_print_memory_profile(95);
+    EF->__sanitizer_print_memory_profile(95, 8);
   DumpCurrentUnit("oom-");
   Printf("SUMMARY: libFuzzer: out-of-memory\n");
   PrintFinalStats();
@@ -344,24 +259,18 @@ void Fuzzer::PrintStats(const char *Where, const char *End, size_t Units) {
       csvHeaderPrinted = true;
       Printf("runs,block_cov,bits,cc_cov,corpus,execs_per_sec,tbms,reason\n");
     }
-    Printf("%zd,%zd,%zd,%zd,%zd,%zd,%s\n", TotalNumberOfRuns,
-           MaxCoverage.BlockCoverage, MaxCoverage.CounterBitmapBits,
-           MaxCoverage.CallerCalleeCoverage, Corpus.size(), ExecPerSec, Where);
+    Printf("%zd,%zd,%zd,%zd,%s\n", TotalNumberOfRuns,
+           TPC.GetTotalPCCoverage(),
+           Corpus.size(), ExecPerSec, Where);
   }
 
   if (!Options.Verbosity)
     return;
   Printf("#%zd\t%s", TotalNumberOfRuns, Where);
-  if (MaxCoverage.BlockCoverage)
-    Printf(" cov: %zd", MaxCoverage.BlockCoverage);
   if (size_t N = TPC.GetTotalPCCoverage())
     Printf(" cov: %zd", N);
-  if (auto TB = MaxCoverage.CounterBitmapBits)
-    Printf(" bits: %zd", TB);
   if (size_t N = Corpus.NumFeatures())
     Printf( " ft: %zd", N);
-  if (MaxCoverage.CallerCalleeCoverage)
-    Printf(" indir: %zd", MaxCoverage.CallerCalleeCoverage);
   if (!Corpus.empty()) {
     Printf(" corp: %zd", Corpus.NumActiveUnits());
     if (size_t N = Corpus.SizeInBytes()) {
@@ -460,7 +369,7 @@ void Fuzzer::RereadOutputCorpus(size_t MaxSize) {
 }
 
 void Fuzzer::ShuffleCorpus(UnitVector *V) {
-  std::random_shuffle(V->begin(), V->end(), MD.GetRand());
+  std::shuffle(V->begin(), V->end(), MD.GetRand());
   if (Options.PreferSmall)
     std::stable_sort(V->begin(), V->end(), [](const Unit &A, const Unit &B) {
       return A.size() < B.size();
@@ -480,8 +389,6 @@ void Fuzzer::ShuffleAndMinimize(UnitVector *InitialCorpus) {
     if (size_t NumFeatures = RunOne(U)) {
       CheckExitOnSrcPosOrItem();
       Corpus.AddToCorpus(U, NumFeatures);
-      if (Options.Verbosity >= 2)
-        Printf("NEW0: %zd L %zd\n", MaxCoverage.BlockCoverage, U.size());
     }
     TryDetectingAMemoryLeak(U.data(), U.size(),
                             /*DuringInitialCorpusExecution*/ true);
@@ -500,16 +407,11 @@ size_t Fuzzer::RunOne(const uint8_t *Data, size_t Size) {
 
   ExecuteCallback(Data, Size);
 
-  size_t Res = 0;
-  if (size_t NumFeatures = TPC.CollectFeatures([&](size_t Feature) -> bool {
-        return Corpus.AddFeature(Feature, Size, Options.Shrink);
-      }))
-    Res = NumFeatures;
-
-  if (!TPC.UsingTracePcGuard()) {
-    if (!Res && RecordMaxCoverage(&MaxCoverage))
-      Res = 1;
-  }
+  size_t NumUpdatesBefore = Corpus.NumFeatureUpdates();
+  TPC.CollectFeatures([&](size_t Feature) {
+    Corpus.AddFeature(Feature, Size, Options.Shrink);
+  });
+  size_t NumUpdatesAfter = Corpus.NumFeatureUpdates();
 
   auto TimeOfUnit =
       duration_cast<seconds>(UnitStopTime - UnitStartTime).count();
@@ -522,7 +424,7 @@ size_t Fuzzer::RunOne(const uint8_t *Data, size_t Size) {
     Printf("Slowest unit: %zd s:\n", TimeOfLongestUnitInSeconds);
     WriteUnitToFileWithPrefix({Data, Data + Size}, "slow-unit-");
   }
-  return Res;
+  return NumUpdatesAfter - NumUpdatesBefore;
 }
 
 size_t Fuzzer::GetCurrentUnitInFuzzingThead(const uint8_t **Data) const {
@@ -544,7 +446,6 @@ void Fuzzer::ExecuteCallback(const uint8_t *Data, size_t Size) {
   CurrentUnitSize = Size;
   AllocTracer.Start(Options.TraceMalloc);
   UnitStartTime = system_clock::now();
-  ResetCounters();  // Reset coverage right before the callback.
   TPC.ResetMaps();
   RunningCB = true;
   int Res = CB(DataCopy, Size);
@@ -601,77 +502,6 @@ void Fuzzer::ReportNewCoverage(InputInfo *II, const Unit &U) {
   TPC.PrintNewPCs();
 }
 
-// Finds minimal number of units in 'Extra' that add coverage to 'Initial'.
-// We do it by actually executing the units, sometimes more than once,
-// because we may be using different coverage-like signals and the only
-// common thing between them is that we can say "this unit found new stuff".
-UnitVector Fuzzer::FindExtraUnits(const UnitVector &Initial,
-                                  const UnitVector &Extra) {
-  UnitVector Res = Extra;
-  UnitVector Tmp;
-  size_t OldSize = Res.size();
-  for (int Iter = 0; Iter < 10; Iter++) {
-    ShuffleCorpus(&Res);
-    TPC.ResetMaps();
-    Corpus.ResetFeatureSet();
-    ResetCoverage();
-
-    for (auto &U : Initial) {
-      TPC.ResetMaps();
-      RunOne(U);
-    }
-
-    Tmp.clear();
-    for (auto &U : Res) {
-      TPC.ResetMaps();
-      if (RunOne(U))
-        Tmp.push_back(U);
-    }
-
-    char Stat[7] = "MIN   ";
-    Stat[3] = '0' + Iter;
-    PrintStats(Stat, "\n", Tmp.size());
-
-    size_t NewSize = Tmp.size();
-    assert(NewSize <= OldSize);
-    Res.swap(Tmp);
-
-    if (NewSize + 5 >= OldSize)
-      break;
-    OldSize = NewSize;
-  }
-  return Res;
-}
-
-void Fuzzer::Merge(const std::vector<std::string> &Corpora) {
-  if (Corpora.size() <= 1) {
-    Printf("Merge requires two or more corpus dirs\n");
-    return;
-  }
-  InMergeMode = true;
-  std::vector<std::string> ExtraCorpora(Corpora.begin() + 1, Corpora.end());
-
-  assert(MaxInputLen > 0);
-  UnitVector Initial, Extra;
-  ReadDirToVectorOfUnits(Corpora[0].c_str(), &Initial, nullptr, MaxInputLen,
-                         true);
-  for (auto &C : ExtraCorpora)
-    ReadDirToVectorOfUnits(C.c_str(), &Extra, nullptr, MaxInputLen, true);
-
-  if (!Initial.empty()) {
-    Printf("=== Minimizing the initial corpus of %zd units\n", Initial.size());
-    Initial = FindExtraUnits({}, Initial);
-  }
-
-  Printf("=== Merging extra %zd units\n", Extra.size());
-  auto Res = FindExtraUnits(Initial, Extra);
-
-  for (auto &U: Res)
-    WriteToOutputCorpus(U);
-
-  Printf("=== Merge: written %zd units\n", Res.size());
-}
-
 // Tries detecting a memory leak on the particular input that we have just
 // executed before calling this function.
 void Fuzzer::TryDetectingAMemoryLeak(const uint8_t *Data, size_t Size,
@@ -766,12 +596,6 @@ void Fuzzer::MutateAndTestOne() {
   }
 }
 
-void Fuzzer::ResetCoverage() {
-  ResetEdgeCoverage();
-  MaxCoverage.Reset();
-  PrepareCounters(&MaxCoverage);
-}
-
 void Fuzzer::Loop() {
   TPC.InitializePrintNewPCs();
   system_clock::time_point LastCorpusReload = system_clock::now();
diff --git a/lib/Fuzzer/FuzzerMerge.cpp b/lib/Fuzzer/FuzzerMerge.cpp
index 344d7a83959377b764da366e6cca2622c93a3cca..e66460c29e2f8f69247d774d3c1e934b6a28dd44 100644
--- a/lib/Fuzzer/FuzzerMerge.cpp
+++ b/lib/Fuzzer/FuzzerMerge.cpp
@@ -17,6 +17,7 @@
 
 #include <fstream>
 #include <iterator>
+#include <set>
 #include <sstream>
 
 namespace fuzzer {
@@ -73,6 +74,7 @@ bool Merger::Parse(std::istream &IS, bool ParseCoverage) {
   size_t ExpectedStartMarker = 0;
   const size_t kInvalidStartMarker = -1;
   size_t LastSeenStartMarker = kInvalidStartMarker;
+  std::vector<uint32_t> TmpFeatures;
   while (std::getline(IS, Line, '\n')) {
     std::istringstream ISS1(Line);
     std::string Marker;
@@ -88,17 +90,17 @@ bool Merger::Parse(std::istream &IS, bool ParseCoverage) {
       assert(ExpectedStartMarker < Files.size());
       ExpectedStartMarker++;
     } else if (Marker == "DONE") {
-      // DONE FILE_SIZE COV1 COV2 COV3 ...
+      // DONE FILE_ID COV1 COV2 COV3 ...
       size_t CurrentFileIdx = N;
       if (CurrentFileIdx != LastSeenStartMarker)
         return false;
       LastSeenStartMarker = kInvalidStartMarker;
       if (ParseCoverage) {
-        auto &V = Files[CurrentFileIdx].Features;
-        V.clear();
+        TmpFeatures.clear();  // use a vector from outer scope to avoid resizes.
         while (ISS1 >> std::hex >> N)
-          V.push_back(N);
-        std::sort(V.begin(), V.end());
+          TmpFeatures.push_back(N);
+        std::sort(TmpFeatures.begin(), TmpFeatures.end());
+        Files[CurrentFileIdx].Features = TmpFeatures;
       }
     } else {
       return false;
@@ -111,12 +113,20 @@ bool Merger::Parse(std::istream &IS, bool ParseCoverage) {
   return true;
 }
 
+size_t Merger::ApproximateMemoryConsumption() const  {
+  size_t Res = 0;
+  for (const auto &F: Files)
+    Res += sizeof(F) + F.Features.size() * sizeof(F.Features[0]);
+  return Res;
+}
+
 // Decides which files need to be merged (add thost to NewFiles).
 // Returns the number of new features added.
-size_t Merger::Merge(std::vector<std::string> *NewFiles) {
+size_t Merger::Merge(const std::set<uint32_t> &InitialFeatures,
+                     std::vector<std::string> *NewFiles) {
   NewFiles->clear();
   assert(NumFilesInFirstCorpus <= Files.size());
-  std::set<uint32_t> AllFeatures;
+  std::set<uint32_t> AllFeatures(InitialFeatures);
 
   // What features are in the initial corpus?
   for (size_t i = 0; i < NumFilesInFirstCorpus; i++) {
@@ -158,6 +168,42 @@ size_t Merger::Merge(std::vector<std::string> *NewFiles) {
   return AllFeatures.size() - InitialNumFeatures;
 }
 
+void Merger::PrintSummary(std::ostream &OS) {
+  for (auto &File : Files) {
+    OS << std::hex;
+    OS << File.Name << " size: " << File.Size << " features: ";
+    for (auto Feature : File.Features)
+      OS << " " << Feature;
+    OS << "\n";
+  }
+}
+
+std::set<uint32_t> Merger::AllFeatures() const {
+  std::set<uint32_t> S;
+  for (auto &File : Files)
+    S.insert(File.Features.begin(), File.Features.end());
+  return S;
+}
+
+std::set<uint32_t> Merger::ParseSummary(std::istream &IS) {
+  std::string Line, Tmp;
+  std::set<uint32_t> Res;
+  while (std::getline(IS, Line, '\n')) {
+    size_t N;
+    std::istringstream ISS1(Line);
+    ISS1 >> Tmp;  // Name
+    ISS1 >> Tmp;  // size:
+    assert(Tmp == "size:" && "Corrupt summary file");
+    ISS1 >> std::hex;
+    ISS1 >> N;    // File Size
+    ISS1 >> Tmp;  // features:
+    assert(Tmp == "features:" && "Corrupt summary file");
+    while (ISS1 >> std::hex >> N)
+      Res.insert(N);
+  }
+  return Res;
+}
+
 // Inner process. May crash if the target crashes.
 void Fuzzer::CrashResistantMergeInternalStep(const std::string &CFPath) {
   Printf("MERGE-INNER: using the control file '%s'\n", CFPath.c_str());
@@ -208,7 +254,9 @@ void Fuzzer::CrashResistantMergeInternalStep(const std::string &CFPath) {
 
 // Outer process. Does not call the target code and thus sohuld not fail.
 void Fuzzer::CrashResistantMerge(const std::vector<std::string> &Args,
-                                 const std::vector<std::string> &Corpora) {
+                                 const std::vector<std::string> &Corpora,
+                                 const char *CoverageSummaryInputPathOrNull,
+                                 const char *CoverageSummaryOutputPathOrNull) {
   if (Corpora.size() <= 1) {
     Printf("Merge requires two or more corpus dirs\n");
     return;
@@ -262,8 +310,23 @@ void Fuzzer::CrashResistantMerge(const std::vector<std::string> &Args,
   IF.seekg(0, IF.beg);
   M.ParseOrExit(IF, true);
   IF.close();
+  Printf("MERGE-OUTER: consumed %zdMb (%zdMb rss) to parse the control file\n",
+         M.ApproximateMemoryConsumption() >> 20, GetPeakRSSMb());
+  if (CoverageSummaryOutputPathOrNull) {
+    Printf("MERGE-OUTER: writing coverage summary for %zd files to %s\n",
+           M.Files.size(), CoverageSummaryOutputPathOrNull);
+    std::ofstream SummaryOut(CoverageSummaryOutputPathOrNull);
+    M.PrintSummary(SummaryOut);
+  }
   std::vector<std::string> NewFiles;
-  size_t NumNewFeatures = M.Merge(&NewFiles);
+  std::set<uint32_t> InitialFeatures;
+  if (CoverageSummaryInputPathOrNull) {
+    std::ifstream SummaryIn(CoverageSummaryInputPathOrNull);
+    InitialFeatures = M.ParseSummary(SummaryIn);
+    Printf("MERGE-OUTER: coverage summary loaded from %s, %zd features found\n",
+           CoverageSummaryInputPathOrNull, InitialFeatures.size());
+  }
+  size_t NumNewFeatures = M.Merge(InitialFeatures, &NewFiles);
   Printf("MERGE-OUTER: %zd new files with %zd new features added\n",
          NewFiles.size(), NumNewFeatures);
   for (auto &F: NewFiles)
diff --git a/lib/Fuzzer/FuzzerMerge.h b/lib/Fuzzer/FuzzerMerge.h
index 8a2fe5d74f88daebb8fd52f6ed934f07ade606c0..cf4a0863571d70d29494289bf670edba8a0621d5 100644
--- a/lib/Fuzzer/FuzzerMerge.h
+++ b/lib/Fuzzer/FuzzerMerge.h
@@ -43,7 +43,9 @@
 #include "FuzzerDefs.h"
 
 #include <istream>
+#include <ostream>
 #include <set>
+#include <vector>
 
 namespace fuzzer {
 
@@ -62,7 +64,15 @@ struct Merger {
   bool Parse(std::istream &IS, bool ParseCoverage);
   bool Parse(const std::string &Str, bool ParseCoverage);
   void ParseOrExit(std::istream &IS, bool ParseCoverage);
-  size_t Merge(std::vector<std::string> *NewFiles);
+  void PrintSummary(std::ostream &OS);
+  std::set<uint32_t> ParseSummary(std::istream &IS);
+  size_t Merge(const std::set<uint32_t> &InitialFeatures,
+               std::vector<std::string> *NewFiles);
+  size_t Merge(std::vector<std::string> *NewFiles) {
+    return Merge({}, NewFiles);
+  }
+  size_t ApproximateMemoryConsumption() const;
+  std::set<uint32_t> AllFeatures() const;
 };
 
 }  // namespace fuzzer
diff --git a/lib/Fuzzer/FuzzerMutate.cpp b/lib/Fuzzer/FuzzerMutate.cpp
index f502915c68a03e721239c7d174fc6a88ecd9291b..cd846c7deec5086ef0ee7fadfca46efa699f655a 100644
--- a/lib/Fuzzer/FuzzerMutate.cpp
+++ b/lib/Fuzzer/FuzzerMutate.cpp
@@ -81,8 +81,8 @@ size_t MutationDispatcher::Mutate_CustomCrossOver(uint8_t *Data, size_t Size,
   const Unit &Other = (*Corpus)[Idx];
   if (Other.empty())
     return 0;
-  MutateInPlaceHere.resize(MaxSize);
-  auto &U = MutateInPlaceHere;
+  CustomCrossOverInPlaceHere.resize(MaxSize);
+  auto &U = CustomCrossOverInPlaceHere;
   size_t NewSize = EF->LLVMFuzzerCustomCrossOver(
       Data, Size, Other.data(), Other.size(), U.data(), U.size(), Rand.Rand());
   if (!NewSize)
@@ -99,8 +99,7 @@ size_t MutationDispatcher::Mutate_ShuffleBytes(uint8_t *Data, size_t Size,
       Rand(std::min(Size, (size_t)8)) + 1; // [1,8] and <= Size.
   size_t ShuffleStart = Rand(Size - ShuffleAmount);
   assert(ShuffleStart + ShuffleAmount <= Size);
-  std::random_shuffle(Data + ShuffleStart, Data + ShuffleStart + ShuffleAmount,
-                      Rand);
+  std::shuffle(Data + ShuffleStart, Data + ShuffleStart + ShuffleAmount, Rand);
   return Size;
 }
 
@@ -203,7 +202,7 @@ DictionaryEntry MutationDispatcher::MakeDictionaryEntryFromCMP(
     const void *Arg1Mutation, const void *Arg2Mutation,
     size_t ArgSize, const uint8_t *Data,
     size_t Size) {
-  ScopedDoingMyOwnMemmem scoped_doing_my_own_memmem;
+  ScopedDoingMyOwnMemOrStr scoped_doing_my_own_mem_os_str;
   bool HandleFirst = Rand.RandBool();
   const void *ExistingBytes, *DesiredBytes;
   Word W;
@@ -437,9 +436,9 @@ size_t MutationDispatcher::Mutate_CrossOver(uint8_t *Data, size_t Size,
       break;
     case 1:
       NewSize = InsertPartOf(O.data(), O.size(), U.data(), U.size(), MaxSize);
-      if (NewSize)
-        break;
-      // LLVM_FALLTHROUGH;
+      if (!NewSize)
+        NewSize = CopyPartOf(O.data(), O.size(), U.data(), U.size());
+      break;
     case 2:
       NewSize = CopyPartOf(O.data(), O.size(), U.data(), U.size());
       break;
diff --git a/lib/Fuzzer/FuzzerMutate.h b/lib/Fuzzer/FuzzerMutate.h
index 3d78b111c665ab0d9d57a1bf7560e859d872533f..8c8fb3fd74c7b2dd64ba190df0772f81e201d50e 100644
--- a/lib/Fuzzer/FuzzerMutate.h
+++ b/lib/Fuzzer/FuzzerMutate.h
@@ -143,6 +143,9 @@ private:
 
   const InputCorpus *Corpus = nullptr;
   std::vector<uint8_t> MutateInPlaceHere;
+  // CustomCrossOver needs its own buffer as a custom implementation may call
+  // LLVMFuzzerMutate, which in turn may resize MutateInPlaceHere.
+  std::vector<uint8_t> CustomCrossOverInPlaceHere;
 
   std::vector<Mutator> Mutators;
   std::vector<Mutator> DefaultMutators;
diff --git a/lib/Fuzzer/FuzzerRandom.h b/lib/Fuzzer/FuzzerRandom.h
index b1be0bb935fab94ec53ca83444ac227b3fbeb93a..8a1aa3ef5fdc145fee0bf1b43bcaa578fc18dcf9 100644
--- a/lib/Fuzzer/FuzzerRandom.h
+++ b/lib/Fuzzer/FuzzerRandom.h
@@ -15,10 +15,11 @@
 #include <random>
 
 namespace fuzzer {
-class Random {
+class Random : public std::mt19937 {
  public:
-  Random(unsigned int seed) : R(seed) {}
-  size_t Rand() { return R(); }
+  Random(unsigned int seed) : std::mt19937(seed) {}
+  result_type operator()() { return this->std::mt19937::operator()(); }
+  size_t Rand() { return this->operator()(); }
   size_t RandBool() { return Rand() % 2; }
   size_t operator()(size_t n) { return n ? Rand() % n : 0; }
   intptr_t operator()(intptr_t From, intptr_t To) {
@@ -26,9 +27,6 @@ class Random {
     intptr_t RangeSize = To - From + 1;
     return operator()(RangeSize) + From;
   }
-  std::mt19937 &Get_mt19937() { return R; }
- private:
-  std::mt19937 R;
 };
 
 }  // namespace fuzzer
diff --git a/lib/Fuzzer/FuzzerShmem.h b/lib/Fuzzer/FuzzerShmem.h
index 88719c1775fdda427cc39f14c2f880bc416c412a..53568e0acb69c93c6789b3a7354c2479f3db4fa3 100644
--- a/lib/Fuzzer/FuzzerShmem.h
+++ b/lib/Fuzzer/FuzzerShmem.h
@@ -22,10 +22,9 @@ namespace fuzzer {
 
 class SharedMemoryRegion {
  public:
-  bool Create(const char *Name, size_t Size);
+  bool Create(const char *Name);
   bool Open(const char *Name);
   bool Destroy(const char *Name);
-  size_t GetSize() const { return Size; }
   uint8_t *GetData() { return Data; }
   void PostServer() {Post(0);}
   void WaitServer() {Wait(0);}
@@ -33,7 +32,7 @@ class SharedMemoryRegion {
   void WaitClient() {Wait(1);}
 
   size_t WriteByteArray(const uint8_t *Bytes, size_t N) {
-    N = std::min(N, GetSize() - sizeof(N));
+    assert(N <= kShmemSize - sizeof(N));
     memcpy(GetData(), &N, sizeof(N));
     memcpy(GetData() + sizeof(N), Bytes, N);
     assert(N == ReadByteArraySize());
@@ -50,6 +49,8 @@ class SharedMemoryRegion {
   bool IsClient() const { return Data && !IAmServer; }
 
 private:
+
+  static const size_t kShmemSize = 1 << 22;
   bool IAmServer;
   std::string Path(const char *Name);
   std::string SemName(const char *Name, int Idx);
@@ -57,7 +58,6 @@ private:
   void Wait(int Idx);
 
   bool Map(int fd);
-  size_t Size = 0;
   uint8_t *Data = nullptr;
   void *Semaphore[2];
 };
diff --git a/lib/Fuzzer/FuzzerShmemPosix.cpp b/lib/Fuzzer/FuzzerShmemPosix.cpp
index c87407bb1d61f4e205cde4bb244fefe5cbe7e01b..2723bdd86f487518ba3fa0439f02e7911f44f477 100644
--- a/lib/Fuzzer/FuzzerShmemPosix.cpp
+++ b/lib/Fuzzer/FuzzerShmemPosix.cpp
@@ -21,6 +21,7 @@
 #include <sys/mman.h>
 #include <semaphore.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <unistd.h>
 
 namespace fuzzer {
@@ -35,17 +36,17 @@ std::string SharedMemoryRegion::SemName(const char *Name, int Idx) {
 }
 
 bool SharedMemoryRegion::Map(int fd) {
-  Data = (uint8_t *)mmap(0, Size, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
+  Data =
+      (uint8_t *)mmap(0, kShmemSize, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
   if (Data == (uint8_t*)-1)
     return false;
   return true;
 }
 
-bool SharedMemoryRegion::Create(const char *Name, size_t Size) {
+bool SharedMemoryRegion::Create(const char *Name) {
   int fd = open(Path(Name).c_str(), O_CREAT | O_RDWR, 0777);
   if (fd < 0) return false;
-  if (ftruncate(fd, Size) < 0) return false;
-  this->Size = Size;
+  if (ftruncate(fd, kShmemSize) < 0) return false;
   if (!Map(fd))
     return false;
   for (int i = 0; i < 2; i++) {
@@ -64,7 +65,7 @@ bool SharedMemoryRegion::Open(const char *Name) {
   struct stat stat_res;
   if (0 != fstat(fd, &stat_res))
     return false;
-  Size = stat_res.st_size;
+  assert(stat_res.st_size == kShmemSize);
   if (!Map(fd))
     return false;
   for (int i = 0; i < 2; i++) {
diff --git a/lib/Fuzzer/FuzzerShmemWindows.cpp b/lib/Fuzzer/FuzzerShmemWindows.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6325b4b8e5b4129bd240f5a5f23ba04ab6e565d6
--- /dev/null
+++ b/lib/Fuzzer/FuzzerShmemWindows.cpp
@@ -0,0 +1,64 @@
+//===- FuzzerShmemWindows.cpp - Windows shared memory -----------*- C++ -* ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// SharedMemoryRegion
+//===----------------------------------------------------------------------===//
+#include "FuzzerDefs.h"
+#if LIBFUZZER_WINDOWS
+
+#include "FuzzerIO.h"
+#include "FuzzerShmem.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+
+namespace fuzzer {
+
+std::string SharedMemoryRegion::Path(const char *Name) {
+  return DirPlusFile(TmpDir(), Name);
+}
+
+std::string SharedMemoryRegion::SemName(const char *Name, int Idx) {
+  std::string Res(Name);
+  return Res + (char)('0' + Idx);
+}
+
+bool SharedMemoryRegion::Map(int fd) {
+  assert(0 && "UNIMPLEMENTED");
+  return false;
+}
+
+bool SharedMemoryRegion::Create(const char *Name) {
+  assert(0 && "UNIMPLEMENTED");
+  return false;
+}
+
+bool SharedMemoryRegion::Open(const char *Name) {
+  assert(0 && "UNIMPLEMENTED");
+  return false;
+}
+
+bool SharedMemoryRegion::Destroy(const char *Name) {
+  assert(0 && "UNIMPLEMENTED");
+  return false;
+}
+
+void SharedMemoryRegion::Post(int Idx) {
+  assert(0 && "UNIMPLEMENTED");
+}
+
+void SharedMemoryRegion::Wait(int Idx) {
+  Semaphore[1] = nullptr;
+  assert(0 && "UNIMPLEMENTED");
+}
+
+}  // namespace fuzzer
+
+#endif  // LIBFUZZER_WINDOWS
diff --git a/lib/Fuzzer/FuzzerTracePC.cpp b/lib/Fuzzer/FuzzerTracePC.cpp
index 9981fc3459dc3ec7ffd43165800376198a72a617..ce0f7a47eee64bfdc6a0b6dee2aa741455e16667 100644
--- a/lib/Fuzzer/FuzzerTracePC.cpp
+++ b/lib/Fuzzer/FuzzerTracePC.cpp
@@ -24,21 +24,31 @@
 #include <set>
 #include <sstream>
 
+// The coverage counters and PCs.
+// These are declared as global variables named "__sancov_*" to simplify
+// experiments with inlined instrumentation.
+alignas(64) ATTRIBUTE_INTERFACE
+uint8_t __sancov_trace_pc_guard_8bit_counters[fuzzer::TracePC::kNumPCs];
+
+ATTRIBUTE_INTERFACE
+uintptr_t __sancov_trace_pc_pcs[fuzzer::TracePC::kNumPCs];
+
 namespace fuzzer {
 
 TracePC TPC;
 
-ATTRIBUTE_NO_SANITIZE_ALL
-void TracePC::HandleTrace(uint32_t *Guard, uintptr_t PC) {
-  uint32_t Idx = *Guard;
-  PCs[Idx] = PC;
-  Counters[Idx]++;
+uint8_t *TracePC::Counters() const {
+  return __sancov_trace_pc_guard_8bit_counters;
+}
+
+uintptr_t *TracePC::PCs() const {
+  return __sancov_trace_pc_pcs;
 }
 
 size_t TracePC::GetTotalPCCoverage() {
   size_t Res = 0;
   for (size_t i = 1, N = GetNumPCs(); i < N; i++)
-    if (PCs[i])
+    if (PCs()[i])
       Res++;
   return Res;
 }
@@ -81,16 +91,16 @@ void TracePC::InitializePrintNewPCs() {
   assert(!PrintedPCs);
   PrintedPCs = new std::set<uintptr_t>;
   for (size_t i = 1; i < GetNumPCs(); i++)
-    if (PCs[i])
-      PrintedPCs->insert(PCs[i]);
+    if (PCs()[i])
+      PrintedPCs->insert(PCs()[i]);
 }
 
 void TracePC::PrintNewPCs() {
   if (!DoPrintNewPCs) return;
   assert(PrintedPCs);
   for (size_t i = 1; i < GetNumPCs(); i++)
-    if (PCs[i] && PrintedPCs->insert(PCs[i]).second)
-      PrintPC("\tNEW_PC: %p %F %L\n", "\tNEW_PC: %p\n", PCs[i]);
+    if (PCs()[i] && PrintedPCs->insert(PCs()[i]).second)
+      PrintPC("\tNEW_PC: %p %F %L\n", "\tNEW_PC: %p\n", PCs()[i]);
 }
 
 void TracePC::PrintCoverage() {
@@ -107,20 +117,21 @@ void TracePC::PrintCoverage() {
       CoveredLines;
   Printf("COVERAGE:\n");
   for (size_t i = 1; i < GetNumPCs(); i++) {
-    if (!PCs[i]) continue;
-    std::string FileStr = DescribePC("%s", PCs[i]);
+    uintptr_t PC = PCs()[i];
+    if (!PC) continue;
+    std::string FileStr = DescribePC("%s", PC);
     if (!IsInterestingCoverageFile(FileStr)) continue;
-    std::string FixedPCStr = DescribePC("%p", PCs[i]);
-    std::string FunctionStr = DescribePC("%F", PCs[i]);
-    std::string LineStr = DescribePC("%l", PCs[i]);
+    std::string FixedPCStr = DescribePC("%p", PC);
+    std::string FunctionStr = DescribePC("%F", PC);
+    std::string LineStr = DescribePC("%l", PC);
     char ModulePathRaw[4096] = "";  // What's PATH_MAX in portable C++?
     void *OffsetRaw = nullptr;
     if (!EF->__sanitizer_get_module_and_offset_for_pc(
-            reinterpret_cast<void *>(PCs[i]), ModulePathRaw,
+            reinterpret_cast<void *>(PC), ModulePathRaw,
             sizeof(ModulePathRaw), &OffsetRaw))
       continue;
     std::string Module = ModulePathRaw;
-    uintptr_t FixedPC = std::stol(FixedPCStr, 0, 16);
+    uintptr_t FixedPC = std::stoull(FixedPCStr, 0, 16);
     uintptr_t PcOffset = reinterpret_cast<uintptr_t>(OffsetRaw);
     ModuleOffsets[Module] = FixedPC - PcOffset;
     CoveredPCsPerModule[Module].push_back(PcOffset);
@@ -165,7 +176,7 @@ void TracePC::PrintCoverage() {
       if (PcOffsetEnd == std::string::npos)
         continue;
       S.resize(PcOffsetEnd);
-      uintptr_t PcOffset = std::stol(S, 0, 16);
+      uintptr_t PcOffset = std::stoull(S, 0, 16);
       if (!std::binary_search(CoveredOffsets.begin(), CoveredOffsets.end(),
                               PcOffset)) {
         uintptr_t PC = ModuleOffset + PcOffset;
@@ -207,7 +218,7 @@ void TracePC::DumpCoverage() {
   if (EF->__sanitizer_dump_coverage) {
     std::vector<uintptr_t> PCsCopy(GetNumPCs());
     for (size_t i = 0; i < GetNumPCs(); i++)
-      PCsCopy[i] = PCs[i] ? GetPreviousInstructionPc(PCs[i]) : 0;
+      PCsCopy[i] = PCs()[i] ? GetPreviousInstructionPc(PCs()[i]) : 0;
     EF->__sanitizer_dump_coverage(PCsCopy.data(), PCsCopy.size());
   }
 }
@@ -222,7 +233,7 @@ void TracePC::DumpCoverage() {
 // For cmp instructions the interesting value is a XOR of the parameters.
 // The interesting value is mixed up with the PC and is then added to the map.
 
-ATTRIBUTE_NO_SANITIZE_ADDRESS
+ATTRIBUTE_NO_SANITIZE_ALL
 void TracePC::AddValueForMemcmp(void *caller_pc, const void *s1, const void *s2,
                                 size_t n, bool StopAtZero) {
   if (!n) return;
@@ -255,7 +266,7 @@ ATTRIBUTE_TARGET_POPCNT ALWAYS_INLINE
 ATTRIBUTE_NO_SANITIZE_ALL
 void TracePC::HandleCmp(uintptr_t PC, T Arg1, T Arg2) {
   uint64_t ArgXor = Arg1 ^ Arg2;
-  uint64_t ArgDistance = __builtin_popcountl(ArgXor) + 1; // [1,65]
+  uint64_t ArgDistance = __builtin_popcountll(ArgXor) + 1; // [1,65]
   uintptr_t Idx = ((PC & 4095) + 1) * ArgDistance;
   if (sizeof(T) == 4)
       TORC4.Insert(ArgXor, Arg1, Arg2);
@@ -271,7 +282,20 @@ ATTRIBUTE_INTERFACE
 ATTRIBUTE_NO_SANITIZE_ALL
 void __sanitizer_cov_trace_pc_guard(uint32_t *Guard) {
   uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
-  fuzzer::TPC.HandleTrace(Guard, PC);
+  uint32_t Idx = *Guard;
+  __sancov_trace_pc_pcs[Idx] = PC;
+  __sancov_trace_pc_guard_8bit_counters[Idx]++;
+}
+
+// Best-effort support for -fsanitize-coverage=trace-pc, which is available
+// in both Clang and GCC.
+ATTRIBUTE_INTERFACE
+ATTRIBUTE_NO_SANITIZE_ALL
+void __sanitizer_cov_trace_pc() {
+  uintptr_t PC = reinterpret_cast<uintptr_t>(__builtin_return_address(0));
+  uintptr_t Idx = PC & (((uintptr_t)1 << fuzzer::TracePC::kTracePcBits) - 1);
+  __sancov_trace_pc_pcs[Idx] = PC;
+  __sancov_trace_pc_guard_8bit_counters[Idx]++;
 }
 
 ATTRIBUTE_INTERFACE
diff --git a/lib/Fuzzer/FuzzerTracePC.h b/lib/Fuzzer/FuzzerTracePC.h
index 0b666e1963d990539c9755c3af7a2fbeca97e86f..6523fa06005c48ace8f865a857b10f383c607cf5 100644
--- a/lib/Fuzzer/FuzzerTracePC.h
+++ b/lib/Fuzzer/FuzzerTracePC.h
@@ -34,7 +34,7 @@ struct TableOfRecentCompares {
     T A, B;
   };
   ATTRIBUTE_NO_SANITIZE_ALL
-  void Insert(size_t Idx, T Arg1, T Arg2) {
+  void Insert(size_t Idx, const T &Arg1, const T &Arg2) {
     Idx = Idx % kSize;
     Table[Idx].A = Arg1;
     Table[Idx].B = Arg2;
@@ -47,8 +47,10 @@ struct TableOfRecentCompares {
 
 class TracePC {
  public:
+  static const size_t kNumPCs = 1 << 21;
+  // How many bits of PC are used from __sanitizer_cov_trace_pc.
+  static const size_t kTracePcBits = 18;
 
-  void HandleTrace(uint32_t *guard, uintptr_t PC);
   void HandleInit(uint32_t *start, uint32_t *stop);
   void HandleCallerCallee(uintptr_t Caller, uintptr_t Callee);
   template <class T> void HandleCmp(uintptr_t PC, T Arg1, T Arg2);
@@ -56,11 +58,12 @@ class TracePC {
   void SetUseCounters(bool UC) { UseCounters = UC; }
   void SetUseValueProfile(bool VP) { UseValueProfile = VP; }
   void SetPrintNewPCs(bool P) { DoPrintNewPCs = P; }
-  template <class Callback> size_t CollectFeatures(Callback CB);
+  template <class Callback> void CollectFeatures(Callback CB) const;
 
   void ResetMaps() {
     ValueProfileMap.Reset();
-    memset(Counters, 0, GetNumPCs());
+    memset(Counters(), 0, GetNumPCs());
+    ClearExtraCounters();
   }
 
   void UpdateFeatureSet(size_t CurrentElementIdx, size_t CurrentElementSize);
@@ -74,18 +77,18 @@ class TracePC {
   void AddValueForMemcmp(void *caller_pc, const void *s1, const void *s2,
                          size_t n, bool StopAtZero);
 
-  bool UsingTracePcGuard() const {return NumModules; }
-
   TableOfRecentCompares<uint32_t, 32> TORC4;
   TableOfRecentCompares<uint64_t, 32> TORC8;
   TableOfRecentCompares<Word, 32> TORCW;
 
   void PrintNewPCs();
   void InitializePrintNewPCs();
-  size_t GetNumPCs() const { return Min(kNumPCs, NumGuards + 1); }
+  size_t GetNumPCs() const {
+    return NumGuards == 0 ? (1 << kTracePcBits) : Min(kNumPCs, NumGuards + 1);
+  }
   uintptr_t GetPC(size_t Idx) {
     assert(Idx < GetNumPCs());
-    return PCs[Idx];
+    return PCs()[Idx];
   }
 
 private:
@@ -101,49 +104,55 @@ private:
   size_t NumModules;  // linker-initialized.
   size_t NumGuards;  // linker-initialized.
 
-  static const size_t kNumPCs = 1 << 21;
-  alignas(8) uint8_t Counters[kNumPCs];
-  uintptr_t PCs[kNumPCs];
+  uint8_t *Counters() const;
+  uintptr_t *PCs() const;
 
   std::set<uintptr_t> *PrintedPCs;
 
   ValueBitMap ValueProfileMap;
 };
 
-template <class Callback>
-size_t TracePC::CollectFeatures(Callback CB) {
-  if (!UsingTracePcGuard()) return 0;
-  size_t Res = 0;
-  const size_t Step = 8;
-  assert(reinterpret_cast<uintptr_t>(Counters) % Step == 0);
+template <class Callback> // void Callback(size_t Idx, uint8_t Value);
+ATTRIBUTE_NO_SANITIZE_ALL
+void ForEachNonZeroByte(const uint8_t *Begin, const uint8_t *End,
+                        size_t FirstFeature, Callback Handle8bitCounter) {
+  typedef uintptr_t LargeType;
+  const size_t Step = sizeof(LargeType) / sizeof(uint8_t);
+  assert(!(reinterpret_cast<uintptr_t>(Begin) % 64));
+  for (auto P = Begin; P < End; P += Step)
+    if (LargeType Bundle = *reinterpret_cast<const LargeType *>(P))
+      for (size_t I = 0; I < Step; I++, Bundle >>= 8)
+        if (uint8_t V = Bundle & 0xff)
+          Handle8bitCounter(FirstFeature + P - Begin + I, V);
+}
+
+template <class Callback>  // bool Callback(size_t Feature)
+ATTRIBUTE_NO_SANITIZE_ALL
+__attribute__((noinline))
+void TracePC::CollectFeatures(Callback HandleFeature) const {
+  uint8_t *Counters = this->Counters();
   size_t N = GetNumPCs();
-  N = (N + Step - 1) & ~(Step - 1);  // Round up.
-  for (size_t Idx = 0; Idx < N; Idx += Step) {
-    uint64_t Bundle = *reinterpret_cast<uint64_t*>(&Counters[Idx]);
-    if (!Bundle) continue;
-    for (size_t i = Idx; i < Idx + Step; i++) {
-      uint8_t Counter = (Bundle >> ((i - Idx) * 8)) & 0xff;
-      if (!Counter) continue;
-      Counters[i] = 0;
-      unsigned Bit = 0;
-      /**/ if (Counter >= 128) Bit = 7;
-      else if (Counter >= 32) Bit = 6;
-      else if (Counter >= 16) Bit = 5;
-      else if (Counter >= 8) Bit = 4;
-      else if (Counter >= 4) Bit = 3;
-      else if (Counter >= 3) Bit = 2;
-      else if (Counter >= 2) Bit = 1;
-      size_t Feature = (i * 8 + Bit);
-      if (CB(Feature))
-        Res++;
-    }
-  }
+  auto Handle8bitCounter = [&](size_t Idx, uint8_t Counter) {
+    assert(Counter);
+    unsigned Bit = 0;
+    /**/ if (Counter >= 128) Bit = 7;
+    else if (Counter >= 32) Bit = 6;
+    else if (Counter >= 16) Bit = 5;
+    else if (Counter >= 8) Bit = 4;
+    else if (Counter >= 4) Bit = 3;
+    else if (Counter >= 3) Bit = 2;
+    else if (Counter >= 2) Bit = 1;
+    HandleFeature(Idx * 8 + Bit);
+  };
+
+  ForEachNonZeroByte(Counters, Counters + N, 0, Handle8bitCounter);
+  ForEachNonZeroByte(ExtraCountersBegin(), ExtraCountersEnd(), N * 8,
+                     Handle8bitCounter);
+
   if (UseValueProfile)
     ValueProfileMap.ForEach([&](size_t Idx) {
-      if (CB(N * 8 + Idx))
-        Res++;
+      HandleFeature(N * 8 + Idx);
     });
-  return Res;
 }
 
 extern TracePC TPC;
diff --git a/lib/Fuzzer/FuzzerTraceState.cpp b/lib/Fuzzer/FuzzerTraceState.cpp
index 3c9233e155e1fa2234c1b3c7c988ecb892e53285..a486223d650c93d8605fb9df031f8dcdf9a4b45d 100644
--- a/lib/Fuzzer/FuzzerTraceState.cpp
+++ b/lib/Fuzzer/FuzzerTraceState.cpp
@@ -24,10 +24,8 @@ namespace fuzzer {
 
 // Declared as static globals for faster checks inside the hooks.
 static bool RecordingMemmem = false;
-static bool DoingMyOwnMemmem = false;
 
-ScopedDoingMyOwnMemmem::ScopedDoingMyOwnMemmem() { DoingMyOwnMemmem = true; }
-ScopedDoingMyOwnMemmem::~ScopedDoingMyOwnMemmem() { DoingMyOwnMemmem = false; }
+int ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr;
 
 class TraceState {
 public:
@@ -90,6 +88,14 @@ static size_t InternalStrnlen(const char *S, size_t MaxLen) {
   return Len;
 }
 
+// Finds min of (strlen(S1), strlen(S2)).
+// Needed because one of these strings may actually not be zero-terminated.
+static size_t InternalStrnlen2(const char *S1, const char *S2) {
+  size_t Len = 0;
+  for (; S1[Len] && S2[Len]; Len++)  {}
+  return Len;
+}
+
 }  // namespace fuzzer
 
 using fuzzer::TS;
@@ -103,17 +109,19 @@ extern "C" {
 
 #if LLVM_FUZZER_DEFINES_SANITIZER_WEAK_HOOOKS
 
-ATTRIBUTE_NO_SANITIZE_MEMORY
+ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY
 void __sanitizer_weak_hook_memcmp(void *caller_pc, const void *s1,
                                   const void *s2, size_t n, int result) {
+  if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return;
   if (result == 0) return;  // No reason to mutate.
   if (n <= 1) return;  // Not interesting.
   fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, n, /*StopAtZero*/false);
 }
 
-ATTRIBUTE_NO_SANITIZE_MEMORY
+ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY
 void __sanitizer_weak_hook_strncmp(void *caller_pc, const char *s1,
                                    const char *s2, size_t n, int result) {
+  if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return;
   if (result == 0) return;  // No reason to mutate.
   size_t Len1 = fuzzer::InternalStrnlen(s1, n);
   size_t Len2 = fuzzer::InternalStrnlen(s2, n);
@@ -124,45 +132,48 @@ void __sanitizer_weak_hook_strncmp(void *caller_pc, const char *s1,
 }
 
 
-ATTRIBUTE_NO_SANITIZE_MEMORY
+ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY
 void __sanitizer_weak_hook_strcmp(void *caller_pc, const char *s1,
                                    const char *s2, int result) {
+  if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return;
   if (result == 0) return;  // No reason to mutate.
-  size_t Len1 = strlen(s1);
-  size_t Len2 = strlen(s2);
-  size_t N = std::min(Len1, Len2);
+  size_t N = fuzzer::InternalStrnlen2(s1, s2);
   if (N <= 1) return;  // Not interesting.
   fuzzer::TPC.AddValueForMemcmp(caller_pc, s1, s2, N, /*StopAtZero*/true);
 }
 
-ATTRIBUTE_NO_SANITIZE_MEMORY
+ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY
 void __sanitizer_weak_hook_strncasecmp(void *called_pc, const char *s1,
                                        const char *s2, size_t n, int result) {
+  if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return;
   return __sanitizer_weak_hook_strncmp(called_pc, s1, s2, n, result);
 }
 
-ATTRIBUTE_NO_SANITIZE_MEMORY
+ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY
 void __sanitizer_weak_hook_strcasecmp(void *called_pc, const char *s1,
                                       const char *s2, int result) {
+  if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return;
   return __sanitizer_weak_hook_strcmp(called_pc, s1, s2, result);
 }
 
-ATTRIBUTE_NO_SANITIZE_MEMORY
+ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY
 void __sanitizer_weak_hook_strstr(void *called_pc, const char *s1,
                                   const char *s2, char *result) {
+  if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return;
   TS->AddInterestingWord(reinterpret_cast<const uint8_t *>(s2), strlen(s2));
 }
 
-ATTRIBUTE_NO_SANITIZE_MEMORY
+ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY
 void __sanitizer_weak_hook_strcasestr(void *called_pc, const char *s1,
                                       const char *s2, char *result) {
+  if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return;
   TS->AddInterestingWord(reinterpret_cast<const uint8_t *>(s2), strlen(s2));
 }
 
-ATTRIBUTE_NO_SANITIZE_MEMORY
+ATTRIBUTE_INTERFACE ATTRIBUTE_NO_SANITIZE_MEMORY
 void __sanitizer_weak_hook_memmem(void *called_pc, const void *s1, size_t len1,
                                   const void *s2, size_t len2, void *result) {
-  if (fuzzer::DoingMyOwnMemmem) return;
+  if (fuzzer::ScopedDoingMyOwnMemOrStr::DoingMyOwnMemOrStr) return;
   TS->AddInterestingWord(reinterpret_cast<const uint8_t *>(s2), len2);
 }
 
diff --git a/lib/Fuzzer/FuzzerUtilWindows.cpp b/lib/Fuzzer/FuzzerUtilWindows.cpp
index b9e039f81e53ee9514b4a123a40471813e2922f9..08bb3cf3be157b3a98055f751d4f3ed412f0433f 100644
--- a/lib/Fuzzer/FuzzerUtilWindows.cpp
+++ b/lib/Fuzzer/FuzzerUtilWindows.cpp
@@ -28,7 +28,7 @@ namespace fuzzer {
 
 static const FuzzingOptions* HandlerOpt = nullptr;
 
-LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo) {
+static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo) {
   switch (ExceptionInfo->ExceptionRecord->ExceptionCode) {
     case EXCEPTION_ACCESS_VIOLATION:
     case EXCEPTION_ARRAY_BOUNDS_EXCEEDED:
@@ -126,10 +126,7 @@ void SetSignalHandler(const FuzzingOptions& Options) {
 
   if (Options.HandleSegv || Options.HandleBus || Options.HandleIll ||
       Options.HandleFpe)
-    if (!AddVectoredExceptionHandler(1, ExceptionHandler)) {
-      Printf("libFuzzer: AddVectoredExceptionHandler failed.\n");
-      exit(1);
-    }
+    SetUnhandledExceptionFilter(ExceptionHandler);
 
   if (Options.HandleAbrt)
     if (SIG_ERR == signal(SIGABRT, CrashHandler)) {
@@ -179,12 +176,9 @@ const void *SearchMemory(const void *Data, size_t DataLen, const void *Patt,
 }
 
 std::string DisassembleCmd(const std::string &FileName) {
-  if (ExecuteCommand("dumpbin > nul") == 0)
+  if (ExecuteCommand("dumpbin /summary > nul") == 0)
     return "dumpbin /disasm " + FileName;
-  if (ExecuteCommand("llvm-objdump > nul") == 0)
-    return "llvm-objdump -d " + FileName;
-  Printf("libFuzzer: couldn't find tool to disassemble (dumpbin, "
-      "llvm-objdump)\n");
+  Printf("libFuzzer: couldn't find tool to disassemble (dumpbin)\n");
   exit(1);
 }
 
diff --git a/lib/Fuzzer/FuzzerValueBitMap.h b/lib/Fuzzer/FuzzerValueBitMap.h
index 68dc3a9fc3ac10589a69ce253b2afc0e385750fc..8f7ff74300f45c4c55d0de83196125a9be305844 100644
--- a/lib/Fuzzer/FuzzerValueBitMap.h
+++ b/lib/Fuzzer/FuzzerValueBitMap.h
@@ -68,7 +68,7 @@ struct ValueBitMap {
         Other.Map[i] = 0;
       }
       if (M)
-        Res += __builtin_popcountl(M);
+        Res += __builtin_popcountll(M);
     }
     NumBits = Res;
     return OldNumBits < NumBits;
@@ -76,7 +76,7 @@ struct ValueBitMap {
 
   template <class Callback>
   ATTRIBUTE_NO_SANITIZE_ALL
-  void ForEach(Callback CB) {
+  void ForEach(Callback CB) const {
     for (size_t i = 0; i < kMapSizeInWords; i++)
       if (uintptr_t M = Map[i])
         for (size_t j = 0; j < sizeof(M) * 8; j++)
diff --git a/lib/Fuzzer/afl/afl_driver.cpp b/lib/Fuzzer/afl/afl_driver.cpp
index fc9589552ba35d9fdb11d1a7b5ce30e24de33ab7..b3a54e57fcebdbe2c4980109a9928b1b747904fa 100644
--- a/lib/Fuzzer/afl/afl_driver.cpp
+++ b/lib/Fuzzer/afl/afl_driver.cpp
@@ -238,6 +238,13 @@ static void maybe_duplicate_stderr() {
   }
 }
 
+// Define LLVMFuzzerMutate to avoid link failures for targets that use it
+// with libFuzzer's LLVMFuzzerCustomMutator.
+extern "C" size_t LLVMFuzzerMutate(uint8_t *Data, size_t Size, size_t MaxSize) {
+  assert(false && "LLVMFuzzerMutate should not be called from afl_driver");
+  return 0;
+}
+
 int main(int argc, char **argv) {
   fprintf(stderr, "======================= INFO =========================\n"
                   "This binary is built for AFL-fuzz.\n"
diff --git a/lib/Fuzzer/test/AbsNegAndConstant64Test.cpp b/lib/Fuzzer/test/AbsNegAndConstant64Test.cpp
index 577481431ae29411d7cc0411c466a282468aeb1e..69b0d59fb8eff432ddd5e62a33c0000b8f0f90e3 100644
--- a/lib/Fuzzer/test/AbsNegAndConstant64Test.cpp
+++ b/lib/Fuzzer/test/AbsNegAndConstant64Test.cpp
@@ -14,7 +14,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
   uint64_t y;
   memcpy(&x, Data, sizeof(x));
   memcpy(&y, Data + sizeof(x), sizeof(y));
-  if (labs(x) < 0 && y == 0xbaddcafedeadbeefUL) {
+  if (llabs(x) < 0 && y == 0xbaddcafedeadbeefULL) {
     printf("BINGO; Found the target, exiting; x = 0x%lx y 0x%lx\n", x, y);
     exit(1);
   }
diff --git a/lib/Fuzzer/test/BadStrcmpTest.cpp b/lib/Fuzzer/test/BadStrcmpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..159cd7ea5f7081def7d3a3401b01602fea98069b
--- /dev/null
+++ b/lib/Fuzzer/test/BadStrcmpTest.cpp
@@ -0,0 +1,19 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Test that we don't crash in case of bad strcmp params.
+#include <cstdint>
+#include <cstring>
+#include <cstddef>
+
+static volatile int Sink;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+  if (Size != 10) return 0;
+  // Data is not zero-terminated, so this call is bad.
+  // Still, there are cases when such calls appear, see e.g.
+  // https://bugs.llvm.org/show_bug.cgi?id=32357
+  Sink = strcmp(reinterpret_cast<const char*>(Data), "123456789");
+  return 0;
+}
+
diff --git a/lib/Fuzzer/test/CMakeLists.txt b/lib/Fuzzer/test/CMakeLists.txt
index 1f9999f440123d7bb85fbcf665912a88802d9ff3..f72bc3909a3cf6fd6f5b439bf3a865cf8f6c45de 100644
--- a/lib/Fuzzer/test/CMakeLists.txt
+++ b/lib/Fuzzer/test/CMakeLists.txt
@@ -39,6 +39,8 @@ if(MSVC)
   set(CMAKE_CXX_CREATE_SHARED_LIBRARY "<CMAKE_CXX_COMPILER> ${CMAKE_CXX_FLAGS} ${CRT_FLAG} /LD <CMAKE_SHARED_LIBRARY_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_CREATE_CXX_FLAGS> <CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG> <TARGET_SONAME> -o <TARGET> <OBJECTS> <LINK_LIBRARIES> /link <LINK_FLAGS>")
 endif()
 
+add_custom_target(TestBinaries)
+
 # add_libfuzzer_test(<name>
 #   SOURCES source0.cpp [source1.cpp ...]
 #   )
@@ -63,12 +65,9 @@ function(add_libfuzzer_test name)
     PROPERTIES RUNTIME_OUTPUT_DIRECTORY
     "${CMAKE_BINARY_DIR}/lib/Fuzzer/test"
     )
-  set(TestBinaries ${TestBinaries} LLVMFuzzer-${name} PARENT_SCOPE)
+  add_dependencies(TestBinaries LLVMFuzzer-${name})
 endfunction()
 
-# Variable to keep track of all test targets
-set(TestBinaries)
-
 ###############################################################################
 # Basic tests
 ###############################################################################
@@ -77,10 +76,12 @@ set(Tests
   AbsNegAndConstantTest
   AbsNegAndConstant64Test
   AccumulateAllocationsTest
+  BadStrcmpTest
   BogusInitializeTest
   BufferOverflowOnInput
   CallerCalleeTest
   CounterTest
+  CustomCrossOverAndMutateTest
   CustomCrossOverTest
   CustomMutatorTest
   CxxStringEqTest
@@ -91,6 +92,7 @@ set(Tests
   FourIndependentBranchesTest
   FullCoverageSetTest
   InitializeTest
+  Memcmp64BytesTest
   MemcmpTest
   LeakTest
   LeakTimeoutTest
@@ -122,11 +124,13 @@ set(Tests
   SwapCmpTest
   SwitchTest
   Switch2Test
+  TableLookupTest
   ThreadedLeakTest
   ThreadedTest
   TimeoutTest
   TimeoutEmptyTest
   TraceMallocTest
+  TwoDifferentBugsTest
   )
 
 if(APPLE OR MSVC)
@@ -143,6 +147,18 @@ foreach(Test ${Tests})
   add_libfuzzer_test(${Test} SOURCES ${Test}.cpp)
 endforeach()
 
+function(test_export_symbol target symbol)
+  if(MSVC)
+    set_target_properties(LLVMFuzzer-${target} PROPERTIES LINK_FLAGS
+        "-export:${symbol}")
+  endif()
+endfunction()
+
+test_export_symbol(InitializeTest "LLVMFuzzerInitialize")
+test_export_symbol(BogusInitializeTest "LLVMFuzzerInitialize")
+test_export_symbol(CustomCrossOverTest "LLVMFuzzerCustomCrossOver")
+test_export_symbol(CustomMutatorTest "LLVMFuzzerCustomMutator")
+
 ###############################################################################
 # Unit tests
 ###############################################################################
@@ -166,13 +182,13 @@ target_include_directories(LLVMFuzzer-Unittest PRIVATE
   "${LLVM_MAIN_SRC_DIR}/utils/unittest/googletest/include"
   )
 
-set(TestBinaries ${TestBinaries} LLVMFuzzer-Unittest)
+add_dependencies(TestBinaries LLVMFuzzer-Unittest)
 set_target_properties(LLVMFuzzer-Unittest
   PROPERTIES RUNTIME_OUTPUT_DIRECTORY
   "${CMAKE_CURRENT_BINARY_DIR}"
 )
 
-set(TestBinaries ${TestBinaries} LLVMFuzzer-StandaloneInitializeTest)
+add_dependencies(TestBinaries LLVMFuzzer-StandaloneInitializeTest)
 set_target_properties(LLVMFuzzer-StandaloneInitializeTest
   PROPERTIES RUNTIME_OUTPUT_DIRECTORY
   "${CMAKE_CURRENT_BINARY_DIR}"
@@ -186,6 +202,7 @@ include_directories(..)
 
 # add_subdirectory(uninstrumented)
 add_subdirectory(no-coverage)
+add_subdirectory(trace-pc)
 add_subdirectory(ubsan)
 
 add_library(LLVMFuzzer-DSO1 SHARED DSO1.cpp)
@@ -218,7 +235,7 @@ else(MSVC)
     LIBRARY_DIR "${CMAKE_BINARY_DIR}/lib/Fuzzer/lib")
 endif()
 
-set(TestBinaries ${TestBinaries} LLVMFuzzer-DSOTest)
+add_dependencies(TestBinaries LLVMFuzzer-DSOTest)
 
 ###############################################################################
 # Configure lit to run the tests
@@ -226,6 +243,10 @@ set(TestBinaries ${TestBinaries} LLVMFuzzer-DSOTest)
 # Note this is done after declaring all tests so we can inform lit if any tests
 # need to be disabled.
 ###############################################################################
+set(LIBFUZZER_POSIX 1)
+if (MSVC)
+  set(LIBFUZZER_POSIX 0)
+endif()
 
 configure_lit_site_cfg(
   ${CMAKE_CURRENT_SOURCE_DIR}/lit.site.cfg.in
@@ -239,7 +260,7 @@ configure_lit_site_cfg(
 
 add_lit_testsuite(check-fuzzer "Running Fuzzer tests"
     ${CMAKE_CURRENT_BINARY_DIR}
-    DEPENDS ${TestBinaries}
+    DEPENDS TestBinaries
     )
 
 # Don't add dependencies on Windows. The linker step would fail on Windows,
diff --git a/lib/Fuzzer/test/CustomCrossOverAndMutateTest.cpp b/lib/Fuzzer/test/CustomCrossOverAndMutateTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..74fc939534ca4ce2f1c405b3434ede01b875be45
--- /dev/null
+++ b/lib/Fuzzer/test/CustomCrossOverAndMutateTest.cpp
@@ -0,0 +1,34 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Test that libFuzzer does not crash when LLVMFuzzerMutate is called from
+// LLVMFuzzerCustomCrossOver.
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <cstdlib>
+#include <string.h>
+#include <string>
+#include <vector>
+
+#include "FuzzerInterface.h"
+
+static volatile int sink;
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+  std::string Str(reinterpret_cast<const char *>(Data), Size);
+  if (Size && Data[0] == '0')
+    sink++;
+  return 0;
+}
+
+extern "C" size_t LLVMFuzzerCustomCrossOver(const uint8_t *Data1, size_t Size1,
+                                            const uint8_t *Data2, size_t Size2,
+                                            uint8_t *Out, size_t MaxOutSize,
+                                            unsigned int Seed) {
+  std::vector<uint8_t> Buffer(MaxOutSize * 10);
+  LLVMFuzzerMutate(Buffer.data(), Buffer.size(), Buffer.size());
+  size_t Size = std::min(Size1, MaxOutSize);
+  memcpy(Out, Data1, Size);
+  return Size;
+}
diff --git a/lib/Fuzzer/test/CxxStringEqTest.cpp b/lib/Fuzzer/test/CxxStringEqTest.cpp
index 9005ab8467b30a1142f37ffce5aae89e6e027755..e0e23c972ccbacff501387cf6726a6be37d1a9f0 100644
--- a/lib/Fuzzer/test/CxxStringEqTest.cpp
+++ b/lib/Fuzzer/test/CxxStringEqTest.cpp
@@ -17,6 +17,7 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
   Sink = Str == "123456";   // Try to confuse the fuzzer
   if (Eq) {
     std::cout << "BINGO; Found the target, exiting\n";
+    std::cout.flush();
     abort();
   }
   return 0;
diff --git a/lib/Fuzzer/test/FuzzerUnittest.cpp b/lib/Fuzzer/test/FuzzerUnittest.cpp
index 4992ef57b6ca683e5bee4d23a28255a992087899..78ea874f2ce23adb9c975ea88f3647e528c22e31 100644
--- a/lib/Fuzzer/test/FuzzerUnittest.cpp
+++ b/lib/Fuzzer/test/FuzzerUnittest.cpp
@@ -10,10 +10,12 @@
 #include "FuzzerDictionary.h"
 #include "FuzzerMerge.h"
 #include "FuzzerMutate.h"
+#include "FuzzerTracePC.h"
 #include "FuzzerRandom.h"
 #include "gtest/gtest.h"
 #include <memory>
 #include <set>
+#include <sstream>
 
 using namespace fuzzer;
 
@@ -584,15 +586,15 @@ TEST(FuzzerUtil, Base64) {
 
 TEST(Corpus, Distribution) {
   Random Rand(0);
-  InputCorpus C("");
+  std::unique_ptr<InputCorpus> C(new InputCorpus(""));
   size_t N = 10;
   size_t TriesPerUnit = 1<<16;
   for (size_t i = 0; i < N; i++)
-    C.AddToCorpus(Unit{ static_cast<uint8_t>(i) }, 0);
+    C->AddToCorpus(Unit{ static_cast<uint8_t>(i) }, 0);
 
   std::vector<size_t> Hist(N);
   for (size_t i = 0; i < N * TriesPerUnit; i++) {
-    Hist[C.ChooseUnitIdxToMutate(Rand)]++;
+    Hist[C->ChooseUnitIdxToMutate(Rand)]++;
   }
   for (size_t i = 0; i < N; i++) {
     // A weak sanity check that every unit gets invoked.
@@ -636,7 +638,10 @@ static void Merge(const std::string &Input,
   Merger M;
   std::vector<std::string> NewFiles;
   EXPECT_TRUE(M.Parse(Input, true));
+  std::stringstream SS;
+  M.PrintSummary(SS);
   EXPECT_EQ(NumNewFeatures, M.Merge(&NewFiles));
+  EXPECT_EQ(M.AllFeatures(), M.ParseSummary(SS));
   EQ(NewFiles, Result);
 }
 
@@ -706,6 +711,16 @@ TEST(Merge, Good) {
   EQ(M.Files[2].Features, {1, 3, 6});
   EXPECT_EQ(3U, M.Merge(&NewFiles));
   EQ(NewFiles, {"B"});
+
+  // Same as the above, but with InitialFeatures.
+  EXPECT_TRUE(M.Parse("2\n0\nB\nC\n"
+                        "STARTED 0 1001\nDONE 0 4 5 6 \n"
+                        "STARTED 1 1002\nDONE 1 6 1 3\n"
+                        "", true));
+  EQ(M.Files[0].Features, {4, 5, 6});
+  EQ(M.Files[1].Features, {1, 3, 6});
+  EXPECT_EQ(3U, M.Merge({1, 2, 3}, &NewFiles));
+  EQ(NewFiles, {"B"});
 }
 
 TEST(Merge, Merge) {
@@ -736,3 +751,25 @@ TEST(Merge, Merge) {
         "STARTED 3 1000\nDONE 3 1  \n",
         {"B", "D"}, 3);
 }
+
+TEST(Fuzzer, ForEachNonZeroByte) {
+  const size_t N = 64;
+  alignas(64) uint8_t Ar[N + 8] = {
+    0, 0, 0, 0, 0, 0, 0, 0,
+    1, 2, 0, 0, 0, 0, 0, 0,
+    0, 0, 3, 0, 4, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 5, 0, 6, 0, 0,
+    0, 0, 0, 0, 0, 0, 7, 0,
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 8,
+    9, 9, 9, 9, 9, 9, 9, 9,
+  };
+  typedef std::vector<std::pair<size_t, uint8_t> > Vec;
+  Vec Res, Expected;
+  auto CB = [&](size_t Idx, uint8_t V) { Res.push_back({Idx, V}); };
+  ForEachNonZeroByte(Ar, Ar + N, 100, CB);
+  Expected = {{108, 1}, {109, 2}, {118, 3}, {120, 4},
+              {135, 5}, {137, 6}, {146, 7}, {163, 8}};
+  EXPECT_EQ(Res, Expected);
+}
diff --git a/lib/Fuzzer/test/LargeTest.cpp b/lib/Fuzzer/test/LargeTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..83ed6197180196fa283d77bf9256534292f7a314
--- /dev/null
+++ b/lib/Fuzzer/test/LargeTest.cpp
@@ -0,0 +1,37 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// A fuzz target with lots of edges.
+#include <cstdint>
+#include <cstdlib>
+
+static inline void break_optimization(const void *arg) {
+    __asm__ __volatile__("" : : "r" (arg) : "memory");
+}
+
+#define A                                         \
+  do {                                            \
+    i++;                                          \
+    c++;                                          \
+    if (Data[(i + __LINE__) % Size] == (c % 256)) \
+      break_optimization(Data);                   \
+    else                                          \
+      break_optimization(0);                      \
+  } while (0)
+
+// for (int i = 0, n = Data[(__LINE__ - 1) % Size] % 16; i < n; i++)
+
+#define B do{A; A; A; A; A; A; A; A; A; A; A; A; A; A; A; A; A; A; }while(0)
+#define C do{B; B; B; B; B; B; B; B; B; B; B; B; B; B; B; B; B; B; }while(0)
+#define D do{C; C; C; C; C; C; C; C; C; C; C; C; C; C; C; C; C; C; }while(0)
+#define E do{D; D; D; D; D; D; D; D; D; D; D; D; D; D; D; D; D; D; }while(0)
+
+volatile int sink;
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+  if (!Size) return 0;
+  int c = 0;
+  int i = 0;
+  D;
+  return 0;
+}
+
diff --git a/lib/Fuzzer/test/LoadTest.cpp b/lib/Fuzzer/test/LoadTest.cpp
index c1780d5c7bd9ad8fed2e0f73d019cb8c8ed035fc..eef16c7be51eee4572d2c5f4b0dc1d1e0d75ce62 100644
--- a/lib/Fuzzer/test/LoadTest.cpp
+++ b/lib/Fuzzer/test/LoadTest.cpp
@@ -14,7 +14,7 @@ int array[kArraySize];
 
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
   if (Size < 8) return 0;
-  size_t a = 0;
+  uint64_t a = 0;
   memcpy(&a, Data, 8);
   Sink = array[a % (kArraySize + 1)];
   return 0;
diff --git a/lib/Fuzzer/test/Memcmp64BytesTest.cpp b/lib/Fuzzer/test/Memcmp64BytesTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e81526b578a36ff66c58d78bca5361b023455fbf
--- /dev/null
+++ b/lib/Fuzzer/test/Memcmp64BytesTest.cpp
@@ -0,0 +1,20 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Simple test for a fuzzer. The fuzzer must find a particular string.
+#include <cassert>
+#include <cstring>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+  const char kString64Bytes[] =
+      "123456789 123456789 123456789 123456789 123456789 123456789 1234";
+  assert(sizeof(kString64Bytes) == 65);
+  if (Size >= 64 && memcmp(Data, kString64Bytes, 64) == 0) {
+    fprintf(stderr, "BINGO\n");
+    exit(1);
+  }
+  return 0;
+}
diff --git a/lib/Fuzzer/test/UninstrumentedTest.cpp b/lib/Fuzzer/test/NotinstrumentedTest.cpp
similarity index 100%
rename from lib/Fuzzer/test/UninstrumentedTest.cpp
rename to lib/Fuzzer/test/NotinstrumentedTest.cpp
diff --git a/lib/Fuzzer/test/SimpleCmpTest.cpp b/lib/Fuzzer/test/SimpleCmpTest.cpp
index 0220c30f9a6bed63096887d068c43ed2e05a81a2..12b5cdda066076f1ed6b66522fb36b6e0aa46069 100644
--- a/lib/Fuzzer/test/SimpleCmpTest.cpp
+++ b/lib/Fuzzer/test/SimpleCmpTest.cpp
@@ -26,12 +26,13 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
   memcpy(&y, Data + 8, 8);  // 16
   memcpy(&z, Data + 16, sizeof(z));  // 20
   memcpy(&a, Data + 20, sizeof(a));  // 22
+  const bool k32bit = sizeof(void*) == 4;
 
-  if (x > 1234567890 && PrintOnce(__LINE__) &&
-      x < 1234567895 && PrintOnce(__LINE__) &&
+  if ((k32bit || x > 1234567890) && PrintOnce(__LINE__) &&
+      (k32bit || x < 1234567895) && PrintOnce(__LINE__) &&
       a == 0x4242 && PrintOnce(__LINE__) &&
-      y >= 987654321 && PrintOnce(__LINE__) &&
-      y <= 987654325 && PrintOnce(__LINE__) &&
+      (k32bit || y >= 987654321) && PrintOnce(__LINE__) &&
+      (k32bit || y <= 987654325) && PrintOnce(__LINE__) &&
       z < -10000 && PrintOnce(__LINE__) &&
       z >= -10005 && PrintOnce(__LINE__) &&
       z != -10003 && PrintOnce(__LINE__) &&
diff --git a/lib/Fuzzer/test/SingleStrncmpTest.cpp b/lib/Fuzzer/test/SingleStrncmpTest.cpp
index dbcc464b0a782de4190dbdae3f6df44f45dcbdf2..e5601da86329959754d5ba183a37fb77766e9ad8 100644
--- a/lib/Fuzzer/test/SingleStrncmpTest.cpp
+++ b/lib/Fuzzer/test/SingleStrncmpTest.cpp
@@ -9,7 +9,8 @@
 
 extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
   char *S = (char*)Data;
-  if (Size >= 6 && !strncmp(S, "qwerty", 6)) {
+  volatile auto Strncmp = &(strncmp);   // Make sure strncmp is not inlined.
+  if (Size >= 6 && !Strncmp(S, "qwerty", 6)) {
     fprintf(stderr, "BINGO\n");
     exit(1);
   }
diff --git a/lib/Fuzzer/test/SwapCmpTest.cpp b/lib/Fuzzer/test/SwapCmpTest.cpp
index f79db4ccf714a2f7e23092ead18added13b31ff3..b90ac72c22c4b4c6843130defbb1b06b3e219e8e 100644
--- a/lib/Fuzzer/test/SwapCmpTest.cpp
+++ b/lib/Fuzzer/test/SwapCmpTest.cpp
@@ -19,8 +19,9 @@ extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
   x = __builtin_bswap64(x);
   y = __builtin_bswap32(y);
   z = __builtin_bswap16(z);
+  const bool k32bit = sizeof(void*) == 4;
 
-  if (x == 0x46555A5A5A5A5546ULL &&
+  if ((k32bit || x == 0x46555A5A5A5A5546ULL) &&
       z == 0x4F4B &&
       y == 0x66757A7A &&
       true
diff --git a/lib/Fuzzer/test/TableLookupTest.cpp b/lib/Fuzzer/test/TableLookupTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f9d5610820ff589217f76ef396c7f631ccfa0f43
--- /dev/null
+++ b/lib/Fuzzer/test/TableLookupTest.cpp
@@ -0,0 +1,45 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Make sure the fuzzer eventually finds all possible values of a variable
+// within a range.
+#include <cstring>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cassert>
+#include <set>
+
+const size_t N = 1 << 12;
+
+// Define an array of counters that will be understood by libFuzzer
+// as extra coverage signal. The array must be:
+//  * uint8_t
+//  * aligned by 64
+//  * in the section named __libfuzzer_extra_counters.
+// The target code may declare more than one such array.
+//
+// Use either `Counters[Idx] = 1` or `Counters[Idx]++;`
+// depending on whether multiple occurrences of the event 'Idx'
+// are important to distinguish from one occurrence.
+#ifdef __linux__
+alignas(64) __attribute__((section("__libfuzzer_extra_counters")))
+#endif
+static uint8_t Counters[N];
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+  static std::set<uint16_t> SeenIdx;
+  if (Size != 4) return 0;
+  uint32_t Idx;
+  memcpy(&Idx, Data, 4);
+  Idx %= N;
+  assert(Counters[Idx] == 0);  // libFuzzer should reset these between the runs.
+  // Or Counters[Idx]=1 if we don't care how many times this happened.
+  Counters[Idx]++;
+  SeenIdx.insert(Idx);
+  if (SeenIdx.size() == N) {
+    fprintf(stderr, "BINGO: found all values\n");
+    abort();
+  }
+  return 0;
+}
diff --git a/lib/Fuzzer/test/TwoDifferentBugsTest.cpp b/lib/Fuzzer/test/TwoDifferentBugsTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..42c0d192ba8669b99a38b3e8c1bc165eedc2249c
--- /dev/null
+++ b/lib/Fuzzer/test/TwoDifferentBugsTest.cpp
@@ -0,0 +1,22 @@
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+
+// Simple test for a fuzzer. This test may trigger two different bugs.
+#include <cstdint>
+#include <cstdlib>
+#include <cstddef>
+#include <iostream>
+
+static volatile int *Null = 0;
+
+void Foo() { Null[1] = 0; }
+void Bar() { Null[2] = 0; }
+
+extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+  if (Size < 10 && Data[0] == 'H')
+    Foo();
+  if (Size >= 10 && Data[0] == 'H')
+    Bar();
+  return 0;
+}
+
diff --git a/lib/Fuzzer/test/afl-driver-extra-stats.test b/lib/Fuzzer/test/afl-driver-extra-stats.test
index 81e384e7dad2e557065d3a0a5561b1c7c6fa366d..1b0818e55ea58b4998e0ebdee7812e6c02292d1b 100644
--- a/lib/Fuzzer/test/afl-driver-extra-stats.test
+++ b/lib/Fuzzer/test/afl-driver-extra-stats.test
@@ -1,3 +1,5 @@
+REQUIRES: posix
+
 ; Test that not specifying an extra stats file isn't broken.
 RUN: unset AFL_DRIVER_EXTRA_STATS_FILENAME
 RUN: AFLDriverTest
diff --git a/lib/Fuzzer/test/afl-driver-stderr.test b/lib/Fuzzer/test/afl-driver-stderr.test
index c0f9c8398c2a118485fb003efa9129ce0a846034..e835acd4275b9079792c95a1d4c0853bf03e2089 100644
--- a/lib/Fuzzer/test/afl-driver-stderr.test
+++ b/lib/Fuzzer/test/afl-driver-stderr.test
@@ -1,3 +1,5 @@
+REQUIRES: posix
+
 ; Test that not specifying a stderr file isn't broken.
 RUN: unset AFL_DRIVER_STDERR_DUPLICATE_FILENAME
 RUN: AFLDriverTest
diff --git a/lib/Fuzzer/test/bad-strcmp.test b/lib/Fuzzer/test/bad-strcmp.test
new file mode 100644
index 0000000000000000000000000000000000000000..9a2f3742a5f4bb723d9d4e6431deaefc3f32fd9b
--- /dev/null
+++ b/lib/Fuzzer/test/bad-strcmp.test
@@ -0,0 +1 @@
+RUN: LLVMFuzzer-BadStrcmpTest -runs=100000
diff --git a/lib/Fuzzer/test/coverage.test b/lib/Fuzzer/test/coverage.test
index 07a6f0b87c1e7060aa6acf0fc16027abbc960f0c..ff3fdff57a3d7eeeb7700e3e152d2813bd76da6a 100644
--- a/lib/Fuzzer/test/coverage.test
+++ b/lib/Fuzzer/test/coverage.test
@@ -1,3 +1,5 @@
+XFAIL: darwin
+
 CHECK: COVERAGE:
 CHECK-DAG: COVERED: {{.*}}in LLVMFuzzerTestOneInput {{.*}}NullDerefTest.cpp:13
 CHECK-DAG: COVERED: {{.*}}in LLVMFuzzerTestOneInput {{.*}}NullDerefTest.cpp:14
diff --git a/lib/Fuzzer/test/disable-leaks.test b/lib/Fuzzer/test/disable-leaks.test
new file mode 100644
index 0000000000000000000000000000000000000000..467b64ccc6f4205042fa60f5a4bc85d28957bc32
--- /dev/null
+++ b/lib/Fuzzer/test/disable-leaks.test
@@ -0,0 +1,4 @@
+REQUIRES: lsan
+RUN: LLVMFuzzer-AccumulateAllocationsTest -detect_leaks=1 -runs=100000 2>&1 | FileCheck %s --check-prefix=ACCUMULATE_ALLOCS
+ACCUMULATE_ALLOCS: INFO: libFuzzer disabled leak detection after every mutation
+
diff --git a/lib/Fuzzer/test/dump_coverage.test b/lib/Fuzzer/test/dump_coverage.test
index af1063d0b031f3a33b9cadfcf2ee50bd3f566273..8acc8304fc60dea46f05407d54453ac680aade78 100644
--- a/lib/Fuzzer/test/dump_coverage.test
+++ b/lib/Fuzzer/test/dump_coverage.test
@@ -1,17 +1,13 @@
-RUN: DIR=%t_workdir
-RUN: BUILD_DIR=$(pwd)
-RUN: rm -rf $DIR && mkdir -p $DIR && cd $DIR
-RUN: not $BUILD_DIR/LLVMFuzzer-NullDerefTest -dump_coverage=1 2>&1 | FileCheck %s
-RUN: sancov -covered-functions *.sancov $BUILD_DIR/LLVMFuzzer-NullDerefTest | FileCheck %s --check-prefix=SANCOV
-RUN: $BUILD_DIR/LLVMFuzzer-DSOTest -dump_coverage=1 -runs=0 2>&1 | FileCheck %s --check-prefix=DSO
-RUN: not $BUILD_DIR/LLVMFuzzer-NullDerefTest -dump_coverage=0 2>&1 | FileCheck %s --check-prefix=NOCOV
-RUN: rm -rf $DIR
+RUN: rm -rf %t_workdir && mkdir -p %t_workdir
+RUN: env ASAN_OPTIONS=coverage_dir='"%t_workdir"' not LLVMFuzzer-NullDerefTest -dump_coverage=1 2>&1 | FileCheck %s
+RUN: sancov -covered-functions LLVMFuzzer-NullDerefTest* %t_workdir/*.sancov | FileCheck %s --check-prefix=SANCOV
+RUN: env ASAN_OPTIONS=coverage_dir='"%t_workdir"' LLVMFuzzer-DSOTest -dump_coverage=1 -runs=0 2>&1 | FileCheck %s --check-prefix=DSO
+RUN: env ASAN_OPTIONS=coverage_dir='"%t_workdir"' not LLVMFuzzer-NullDerefTest -dump_coverage=0 2>&1 | FileCheck %s --check-prefix=NOCOV
 
-
-CHECK: SanitizerCoverage: ./LLVMFuzzer-NullDerefTest.{{.*}}.sancov {{.*}} PCs written
+CHECK: SanitizerCoverage: {{.*}}LLVMFuzzer-NullDerefTest.{{.*}}.sancov {{.*}} PCs written
 SANCOV: LLVMFuzzerTestOneInput
 
-DSO: SanitizerCoverage: ./LLVMFuzzer-DSOTest.{{.*}}.sancov {{.*}} PCs written
+DSO: SanitizerCoverage: {{.*}}LLVMFuzzer-DSOTest.{{.*}}.sancov {{.*}} PCs written
 DSO-DAG: SanitizerCoverage: {{.*}}LLVMFuzzer-DSO1.{{.*}}.sancov {{.*}} PCs written
 DSO-DAG: SanitizerCoverage: {{.*}}LLVMFuzzer-DSO2.{{.*}}.sancov {{.*}} PCs written
 
diff --git a/lib/Fuzzer/test/equivalence-signals.test b/lib/Fuzzer/test/equivalence-signals.test
new file mode 100644
index 0000000000000000000000000000000000000000..81a7f37602ccd90a57e6595f75bfd6b202d7ce8e
--- /dev/null
+++ b/lib/Fuzzer/test/equivalence-signals.test
@@ -0,0 +1,9 @@
+REQUIRES: posix
+# Run EquivalenceATest against itself with a small timeout
+# to stress the signal handling and ensure that shmem doesn't mind
+# the signals.
+
+RUN: LLVMFuzzer-EquivalenceATest -timeout=1 -run_equivalence_server=EQUIV_SIG_TEST & export APID=$!
+RUN: sleep 3
+RUN: LLVMFuzzer-EquivalenceATest -timeout=1 -use_equivalence_server=EQUIV_SIG_TEST -runs=500000 2>&1
+RUN: kill -9 $APID
diff --git a/lib/Fuzzer/test/equivalence.test b/lib/Fuzzer/test/equivalence.test
index 6c9d87888e078e6e63ddc4edd4638f63d3652ade..015ba855c600622e187ebf3ffe0dadd4e4ab7445 100644
--- a/lib/Fuzzer/test/equivalence.test
+++ b/lib/Fuzzer/test/equivalence.test
@@ -1,16 +1,8 @@
+REQUIRES: posix
+
 RUN: LLVMFuzzer-EquivalenceATest -run_equivalence_server=EQUIV_TEST & export APID=$!
 RUN: sleep 3
-RUN: not LLVMFuzzer-EquivalenceBTest -use_equivalence_server=EQUIV_TEST 2>&1 | FileCheck %s
+RUN: not LLVMFuzzer-EquivalenceBTest -use_equivalence_server=EQUIV_TEST -max_len=4096 2>&1 | FileCheck %s
 CHECK: ERROR: libFuzzer: equivalence-mismatch. Sizes: {{.*}}; offset 2
 CHECK: SUMMARY: libFuzzer: equivalence-mismatch
 RUN: kill -9 $APID
-
-
-# Run EquivalenceATest against itself with a small timeout
-# to stress the signal handling and ensure that shmem doesn't mind
-# the signals.
-
-RUN: LLVMFuzzer-EquivalenceATest -timeout=1 -run_equivalence_server=EQUIV_TEST & export APID=$!
-RUN: sleep 3
-RUN: LLVMFuzzer-EquivalenceATest -timeout=1 -use_equivalence_server=EQUIV_TEST -runs=500000 2>&1
-RUN: kill -9 $APID
diff --git a/lib/Fuzzer/test/extra-counters.test b/lib/Fuzzer/test/extra-counters.test
new file mode 100644
index 0000000000000000000000000000000000000000..61fce44784b78d8fe0e2d8c912976168e90583c8
--- /dev/null
+++ b/lib/Fuzzer/test/extra-counters.test
@@ -0,0 +1,6 @@
+REQUIRES: linux
+
+RUN: not LLVMFuzzer-TableLookupTest -print_final_stats=1 2>&1 | FileCheck %s
+CHECK: BINGO
+// Expecting >= 4096 new_units_added
+CHECK: stat::new_units_added:{{.*[4][0-9][0-9][0-9]}}
diff --git a/lib/Fuzzer/test/fuzzer-customcrossover.test b/lib/Fuzzer/test/fuzzer-customcrossover.test
index 28d39ce31decc312e7350c67f2e12b193d49d50c..ccf8261af8adf8f6edcce7d6d5add208245b536f 100644
--- a/lib/Fuzzer/test/fuzzer-customcrossover.test
+++ b/lib/Fuzzer/test/fuzzer-customcrossover.test
@@ -2,7 +2,7 @@ RUN: rm -rf %t/CustomCrossover
 RUN: mkdir -p %t/CustomCrossover
 RUN: echo "0123456789" > %t/CustomCrossover/digits
 RUN: echo "abcdefghij" > %t/CustomCrossover/chars
-RUN: not LLVMFuzzer-CustomCrossOverTest -seed=1 -use_memcmp=0 -runs=100000 %t/CustomCrossover 2>&1 | FileCheck %s --check-prefix=LLVMFuzzerCustomCrossover
+RUN: not LLVMFuzzer-CustomCrossOverTest -seed=1 -runs=100000 %t/CustomCrossover 2>&1 | FileCheck %s --check-prefix=LLVMFuzzerCustomCrossover
 RUN: rm -rf %t/CustomCrossover
 
 LLVMFuzzerCustomCrossover: In LLVMFuzzerCustomCrossover
diff --git a/lib/Fuzzer/test/fuzzer-customcrossoverandmutate.test b/lib/Fuzzer/test/fuzzer-customcrossoverandmutate.test
new file mode 100644
index 0000000000000000000000000000000000000000..1e322ec0da631a1e7e4c802ea21e6618052efa16
--- /dev/null
+++ b/lib/Fuzzer/test/fuzzer-customcrossoverandmutate.test
@@ -0,0 +1 @@
+RUN: LLVMFuzzer-CustomCrossOverAndMutateTest -seed=1 -runs=100000
diff --git a/lib/Fuzzer/test/fuzzer-dirs.test b/lib/Fuzzer/test/fuzzer-dirs.test
index 63afe8dfcf9c839d9bad82b45e84f143065a2f02..3de64f278f5dfda6d40b8ec89812dd77c7a9c062 100644
--- a/lib/Fuzzer/test/fuzzer-dirs.test
+++ b/lib/Fuzzer/test/fuzzer-dirs.test
@@ -5,9 +5,9 @@ RUN: echo b > %t/SUB1/SUB2/b
 RUN: echo c > %t/SUB1/SUB2/SUB3/c
 RUN: LLVMFuzzer-SimpleTest %t/SUB1 -runs=0 2>&1 | FileCheck %s --check-prefix=SUBDIRS
 SUBDIRS: READ   units: 3
-RUN: echo zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz > %t/SUB1/long
+RUN: echo -n zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz > %t/SUB1/long
 RUN: LLVMFuzzer-SimpleTest %t/SUB1 -runs=0 2>&1 | FileCheck %s --check-prefix=LONG
-LONG: INFO: -max_len is not provided, using 94
+LONG: INFO: -max_len is not provided, using 93
 RUN: rm -rf %t/SUB1
 
 RUN: not LLVMFuzzer-SimpleTest NONEXISTENT_DIR 2>&1 | FileCheck %s --check-prefix=NONEXISTENT_DIR
diff --git a/lib/Fuzzer/test/fuzzer-jobs.test b/lib/Fuzzer/test/fuzzer-jobs.test
deleted file mode 100644
index 5bf8cfadfb75a6d7c1b154a433a19602778e44f6..0000000000000000000000000000000000000000
--- a/lib/Fuzzer/test/fuzzer-jobs.test
+++ /dev/null
@@ -1,29 +0,0 @@
-RUN: rm -rf %tmp
-RUN: mkdir %tmp && cd %tmp
-# Create a shared corpus directory
-RUN: rm -rf FuzzerJobsTestCORPUS
-RUN: mkdir FuzzerJobsTestCORPUS
-RUN: rm -f fuzz-{0,1}.log
-# Start fuzzer and in parallel check that the output files
-# that should be created exist.
-RUN: LLVMFuzzer-EmptyTest -max_total_time=4 -jobs=2 -workers=2 FuzzerJobsTestCORPUS > %t-fuzzer-jobs-test.log 2>&1 & export FUZZER_PID=$!
-# Wait a short while to give time for the child processes
-# to start fuzzing
-RUN: sleep 2
-# If the instances are running in parallel they should have created their log
-# files by now.
-RUN: ls fuzz-0.log
-RUN: ls fuzz-1.log
-# Wait for libfuzzer to finish.
-# This probably isn't portable but we need a way to block until
-# the fuzzer is done otherwise we might remove the files while
-# they are being used.
-RUN: while kill -0 ${FUZZER_PID}; do : ; done
-RUN: rm -f fuzz-{0,1}.log
-RUN: rm -rf FuzzerJobsTestCORPUS
-RUN: FileCheck -input-file=%t-fuzzer-jobs-test.log %s
-RUN: rm %t-fuzzer-jobs-test.log
-RUN: cd ../
-
-CHECK-DAG: Job 0 exited with exit code 0
-CHECK-DAG: Job 1 exited with exit code 0
diff --git a/lib/Fuzzer/test/fuzzer-leak.test b/lib/Fuzzer/test/fuzzer-leak.test
index 9cf5c743fff584a23e2481544e75fac680d4a575..13e3ad740e6db34d75fa19e324b63d5356f92bfa 100644
--- a/lib/Fuzzer/test/fuzzer-leak.test
+++ b/lib/Fuzzer/test/fuzzer-leak.test
@@ -29,7 +29,5 @@ RUN: not LLVMFuzzer-LeakTimeoutTest -timeout=1 2>&1 | FileCheck %s --check-prefi
 LEAK_TIMEOUT: ERROR: libFuzzer: timeout after
 LEAK_TIMEOUT-NOT: LeakSanitizer
 
-RUN: LLVMFuzzer-AccumulateAllocationsTest -detect_leaks=1 -runs=100000 2>&1 | FileCheck %s --check-prefix=ACCUMULATE_ALLOCS
-ACCUMULATE_ALLOCS: INFO: libFuzzer disabled leak detection after every mutation
 
 RUN: LLVMFuzzer-LeakTest -error_exitcode=0
diff --git a/lib/Fuzzer/test/fuzzer-oom.test b/lib/Fuzzer/test/fuzzer-oom.test
index 5c3bf78158a0c9a4e308acf68ae3a655b076d037..2db91915876e33d838e52e9417b29ff718b68fc2 100644
--- a/lib/Fuzzer/test/fuzzer-oom.test
+++ b/lib/Fuzzer/test/fuzzer-oom.test
@@ -1,4 +1,5 @@
 RUN: not LLVMFuzzer-OutOfMemoryTest -rss_limit_mb=300 2>&1 | FileCheck %s
+
 CHECK: ERROR: libFuzzer: out-of-memory (used: {{.*}}; limit: 300Mb)
 CHECK: Test unit written to ./oom-
 SUMMARY: libFuzzer: out-of-memory
diff --git a/lib/Fuzzer/test/fuzzer-segv.test b/lib/Fuzzer/test/fuzzer-segv.test
index 330f03bcc49457f7184098381c6c23c2ecfe9f15..b9a6a5ce44ca004006a1f09169de92819797f641 100644
--- a/lib/Fuzzer/test/fuzzer-segv.test
+++ b/lib/Fuzzer/test/fuzzer-segv.test
@@ -1,4 +1,4 @@
-RUN: ASAN_OPTIONS=handle_segv=0 not LLVMFuzzer-NullDerefTest 2>&1 | FileCheck %s --check-prefix=LIBFUZZER_OWN_SEGV_HANDLER
+RUN: env ASAN_OPTIONS=handle_segv=0 not LLVMFuzzer-NullDerefTest 2>&1 | FileCheck %s --check-prefix=LIBFUZZER_OWN_SEGV_HANDLER
 LIBFUZZER_OWN_SEGV_HANDLER: == ERROR: libFuzzer: deadly signal
 LIBFUZZER_OWN_SEGV_HANDLER: SUMMARY: libFuzzer: deadly signal
 LIBFUZZER_OWN_SEGV_HANDLER: Test unit written to ./crash-
diff --git a/lib/Fuzzer/test/fuzzer-singleinputs.test b/lib/Fuzzer/test/fuzzer-singleinputs.test
index ca8403bff81fb59264d6fda8c0f7bce13227af5e..500e5da8faa962b2717e61e027ef4910ec236228 100644
--- a/lib/Fuzzer/test/fuzzer-singleinputs.test
+++ b/lib/Fuzzer/test/fuzzer-singleinputs.test
@@ -8,7 +8,7 @@ RUN: echo bbb > %tmp/SINGLE_INPUTS/bbb
 RUN: LLVMFuzzer-SimpleTest            %tmp/SINGLE_INPUTS/aaa %tmp/SINGLE_INPUTS/bbb 2>&1 | FileCheck %s --check-prefix=SINGLE_INPUTS
 RUN: LLVMFuzzer-SimpleTest -max_len=2 %tmp/SINGLE_INPUTS/aaa %tmp/SINGLE_INPUTS/bbb 2>&1 | FileCheck %s --check-prefix=SINGLE_INPUTS
 RUN: rm -rf  %tmp/SINGLE_INPUTS
-SINGLE_INPUTS: LLVMFuzzer-SimpleTest: Running 2 inputs 1 time(s) each.
+SINGLE_INPUTS: LLVMFuzzer-SimpleTest{{.*}}: Running 2 inputs 1 time(s) each.
 SINGLE_INPUTS: aaa in
 SINGLE_INPUTS: bbb in
 SINGLE_INPUTS: NOTE: fuzzing was not performed, you have only
diff --git a/lib/Fuzzer/test/fuzzer-traces-hooks.test b/lib/Fuzzer/test/fuzzer-traces-hooks.test
index 14f4f8bfbb9d0d8b15157f72bda512aa686b9891..f93a8b7199e25039d9ec117f3806d5b398d6ad59 100644
--- a/lib/Fuzzer/test/fuzzer-traces-hooks.test
+++ b/lib/Fuzzer/test/fuzzer-traces-hooks.test
@@ -1,7 +1,6 @@
-// FIXME: Support sanitizer hooks for memcmp and strcmp need
-// to be implemented in the sanitizer runtime for platforms other
-// than linux
-REQUIRES: linux
+// FIXME: Support for sanitizer hooks for memcmp and strcmp needs to
+// be implemented in the sanitizer runtime for this test to run on Windows.
+UNSUPPORTED: windows
 CHECK: BINGO
 
 RUN: not LLVMFuzzer-MemcmpTest               -seed=1 -runs=2000000   2>&1 | FileCheck %s
@@ -9,9 +8,10 @@ RUN: not LLVMFuzzer-StrncmpTest              -seed=1 -runs=2000000   2>&1 | File
 RUN: not LLVMFuzzer-StrcmpTest               -seed=1 -runs=2000000   2>&1 | FileCheck %s
 RUN: not LLVMFuzzer-StrstrTest               -seed=1 -runs=2000000   2>&1 | FileCheck %s
 
+RUN: not LLVMFuzzer-Memcmp64BytesTest        -seed=1 -runs=1000000   2>&1 | FileCheck %s
+
 RUN: LLVMFuzzer-RepeatedMemcmp -seed=11 -runs=100000 2>&1 | FileCheck %s --check-prefix=RECOMMENDED_DICT
 RECOMMENDED_DICT:###### Recommended dictionary. ######
 RECOMMENDED_DICT-DAG: "foo"
 RECOMMENDED_DICT-DAG: "bar"
 RECOMMENDED_DICT:###### End of recommended dictionary. ######
-
diff --git a/lib/Fuzzer/test/fuzzer.test b/lib/Fuzzer/test/fuzzer.test
index a54e75c4f5fcc65cf483000cb8fb9447b90b5462..ff46d32b387d74e855c87399c34c734bccd98ca3 100644
--- a/lib/Fuzzer/test/fuzzer.test
+++ b/lib/Fuzzer/test/fuzzer.test
@@ -11,7 +11,7 @@ MaxTotalTime: Done {{.*}} runs in {{.}} second(s)
 
 RUN: not LLVMFuzzer-NullDerefTest                  2>&1 | FileCheck %s --check-prefix=NullDerefTest
 RUN: not LLVMFuzzer-NullDerefTest -close_fd_mask=3 2>&1 | FileCheck %s --check-prefix=NullDerefTest
-NullDerefTest: ERROR: AddressSanitizer: SEGV on unknown address
+NullDerefTest: ERROR: AddressSanitizer: {{SEGV|access-violation}} on unknown address
 NullDerefTest: Test unit written to ./crash-
 RUN: not LLVMFuzzer-NullDerefTest  -artifact_prefix=ZZZ 2>&1 | FileCheck %s --check-prefix=NullDerefTestPrefix
 NullDerefTestPrefix: Test unit written to ZZZcrash-
@@ -34,7 +34,7 @@ COUNTERS: BINGO
 DISABLED: not LLVMFuzzer-UninstrumentedTest-Uninstrumented 2>&1 | FileCheck %s --check-prefix=UNINSTRUMENTED
 UNINSTRUMENTED: ERROR: __sanitizer_set_death_callback is not defined. Exiting.
 
-RUN: not LLVMFuzzer-UninstrumentedTest-NoCoverage 2>&1 | FileCheck %s --check-prefix=NO_COVERAGE
+RUN: not LLVMFuzzer-NotinstrumentedTest-NoCoverage 2>&1 | FileCheck %s --check-prefix=NO_COVERAGE
 NO_COVERAGE: ERROR: no interesting inputs were found. Is the code instrumented for coverage? Exiting
 
 RUN: not LLVMFuzzer-BufferOverflowOnInput 2>&1 | FileCheck %s --check-prefix=OOB
@@ -51,7 +51,7 @@ RUN: LLVMFuzzer-SimpleTest  -exit_on_src_pos=SimpleTest.cpp:17                 2
 RUN: LLVMFuzzer-ShrinkControlFlowTest  -exit_on_src_pos=ShrinkControlFlowTest.cpp:23 2>&1 | FileCheck %s --check-prefix=EXIT_ON_SRC_POS
 EXIT_ON_SRC_POS: INFO: found line matching '{{.*}}', exiting.
 
-RUN: ASAN_OPTIONS=strict_string_checks=1 not LLVMFuzzer-StrncmpOOBTest -seed=1 -runs=1000000 2>&1 | FileCheck %s --check-prefix=STRNCMP
+RUN: env ASAN_OPTIONS=strict_string_checks=1 not LLVMFuzzer-StrncmpOOBTest -seed=1 -runs=1000000 2>&1 | FileCheck %s --check-prefix=STRNCMP
 STRNCMP: AddressSanitizer: heap-buffer-overflow
 STRNCMP-NOT: __sanitizer_weak_hook_strncmp
 STRNCMP: in LLVMFuzzerTestOneInput
diff --git a/lib/Fuzzer/test/lit.cfg b/lib/Fuzzer/test/lit.cfg
index 745af0c3824507f409d125c020b80f95c32d72d4..85c95b42d1eaf1ba60882af7c6b38f051acb57d1 100644
--- a/lib/Fuzzer/test/lit.cfg
+++ b/lib/Fuzzer/test/lit.cfg
@@ -6,6 +6,23 @@ config.test_format = lit.formats.ShTest(True)
 config.suffixes = ['.test']
 config.test_source_root = os.path.dirname(__file__)
 
+# Choose between lit's internal shell pipeline runner and a real shell.  If
+# LIT_USE_INTERNAL_SHELL is in the environment, we use that as an override.
+use_lit_shell = os.environ.get("LIT_USE_INTERNAL_SHELL")
+if use_lit_shell:
+    # 0 is external, "" is default, and everything else is internal.
+    execute_external = (use_lit_shell == "0")
+else:
+    # Otherwise we default to internal on Windows and external elsewhere, as
+    # bash on Windows is usually very slow.
+    execute_external = (not sys.platform in ['win32'])
+
+# testFormat: The test format to use to interpret tests.
+#
+# For now we require '&&' between commands, until they get globally killed and
+# the test runner updated.
+config.test_format = lit.formats.ShTest(execute_external)
+
 # Tweak PATH to include llvm tools dir and current exec dir.
 llvm_tools_dir = getattr(config, 'llvm_tools_dir', None)
 if (not llvm_tools_dir) or (not os.path.exists(llvm_tools_dir)):
@@ -20,6 +37,15 @@ if config.has_lsan:
 else:
   lit_config.note('lsan feature unavailable')
 
+if sys.platform.startswith('win') or sys.platform.startswith('cygwin'):
+  config.available_features.add('windows')
+
+if sys.platform.startswith('darwin'):
+  config.available_features.add('darwin')
+
+if config.is_posix:
+  config.available_features.add('posix')
+
 if sys.platform.startswith('linux'):
   # Note the value of ``sys.platform`` is not consistent
   # between python 2 and 3, hence the use of ``.startswith()``.
diff --git a/lib/Fuzzer/test/lit.site.cfg.in b/lib/Fuzzer/test/lit.site.cfg.in
index 03e86c487ca90fa26d7df8a0de2ce965737eda42..069f2b72c0d94132139d5e71345e4209f5423843 100644
--- a/lib/Fuzzer/test/lit.site.cfg.in
+++ b/lib/Fuzzer/test/lit.site.cfg.in
@@ -1,4 +1,5 @@
 config.test_exec_root = "@CMAKE_CURRENT_BINARY_DIR@"
 config.llvm_tools_dir = "@LLVM_TOOLS_DIR@"
 config.has_lsan = True if @HAS_LSAN@ == 1 else False
+config.is_posix = @LIBFUZZER_POSIX@
 lit_config.load_config(config, "@CMAKE_CURRENT_SOURCE_DIR@/lit.cfg")
diff --git a/lib/Fuzzer/test/merge-posix.test b/lib/Fuzzer/test/merge-posix.test
new file mode 100644
index 0000000000000000000000000000000000000000..47b90b986791887913692d75f47848f3e73d29e3
--- /dev/null
+++ b/lib/Fuzzer/test/merge-posix.test
@@ -0,0 +1,23 @@
+REQUIRES: posix
+
+RUN: rm -rf  %tmp/T1 %tmp/T2
+RUN: mkdir -p %tmp/T1 %tmp/T2
+
+RUN: echo F..... > %tmp/T1/1
+RUN: echo .U.... > %tmp/T1/2
+RUN: echo ..Z... > %tmp/T1/3
+
+RUN: echo .....F > %tmp/T2/1
+RUN: echo ....U. > %tmp/T2/2
+RUN: echo ...Z.. > %tmp/T2/3
+RUN: echo ...Z.. > %tmp/T2/4
+RUN: echo ....E. > %tmp/T2/5
+RUN: echo .....R > %tmp/T2/6
+
+# Check that we report an error if the file size limit is exceeded
+RUN: (ulimit -f 1; not LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=SIGXFSZ)
+SIGXFSZ: ERROR: libFuzzer: file size exceeded
+
+# Check that we honor TMPDIR
+RUN: TMPDIR=DIR_DOES_NOT_EXIST not LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=TMPDIR
+TMPDIR: MERGE-OUTER: failed to write to the control file: DIR_DOES_NOT_EXIST/libFuzzerTemp
diff --git a/lib/Fuzzer/test/merge-summary.test b/lib/Fuzzer/test/merge-summary.test
new file mode 100644
index 0000000000000000000000000000000000000000..df9d62dec6364e2b0a11a57435c1107ce401c177
--- /dev/null
+++ b/lib/Fuzzer/test/merge-summary.test
@@ -0,0 +1,15 @@
+RUN: rm -rf %t/T1 %t/T2
+RUN: mkdir -p %t/T0 %t/T1 %t/T2
+RUN: echo ...Z.. > %t/T2/1
+RUN: echo ....E. > %t/T2/2
+RUN: echo .....R > %t/T2/3
+RUN: echo F..... > %t/T2/a
+RUN: echo .U.... > %t/T2/b
+RUN: echo ..Z... > %t/T2/c
+
+RUN: LLVMFuzzer-FullCoverageSetTest -merge=1 %t/T1 %t/T2 -save_coverage_summary=%t/SUMMARY 2>&1 | FileCheck %s --check-prefix=SAVE_SUMMARY
+SAVE_SUMMARY: MERGE-OUTER: writing coverage summary for 6 files to {{.*}}SUMMARY
+RUN: rm %t/T1/*
+RUN: LLVMFuzzer-FullCoverageSetTest -merge=1 %t/T1 %t/T2 -load_coverage_summary=%t/SUMMARY 2>&1 | FileCheck %s --check-prefix=LOAD_SUMMARY
+LOAD_SUMMARY: MERGE-OUTER: coverage summary loaded from {{.*}}SUMMAR
+LOAD_SUMMARY: MERGE-OUTER: 0 new files with 0 new features added
diff --git a/lib/Fuzzer/test/merge.test b/lib/Fuzzer/test/merge.test
index e1445c2b4c1700b78591939439736c72de16965f..e59da8c3e091789f7b2e8322d713a3171fb47788 100644
--- a/lib/Fuzzer/test/merge.test
+++ b/lib/Fuzzer/test/merge.test
@@ -1,12 +1,13 @@
 CHECK: BINGO
 
-RUN: rm -rf  %tmp/T1 %tmp/T2
-RUN: mkdir -p %tmp/T1 %tmp/T2
-RUN: echo F..... > %tmp/T1/1
-RUN: echo .U.... > %tmp/T1/2
-RUN: echo ..Z... > %tmp/T1/3
+RUN: rm -rf %tmp/T0 %tmp/T1 %tmp/T2
+RUN: mkdir -p %tmp/T0 %tmp/T1 %tmp/T2
+RUN: echo F..... > %tmp/T0/1
+RUN: echo .U.... > %tmp/T0/2
+RUN: echo ..Z... > %tmp/T0/3
 
 # T1 has 3 elements, T2 is empty.
+RUN: cp %tmp/T0/* %tmp/T1/
 RUN: LLVMFuzzer-FullCoverageSetTest         -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=CHECK1
 CHECK1: MERGE-OUTER: 3 files, 3 in the initial corpus
 CHECK1: MERGE-OUTER: 0 new files with 0 new features added
@@ -29,13 +30,15 @@ CHECK3: MERGE-OUTER: 12 files, 6 in the initial corpus
 CHECK3: MERGE-OUTER: 0 new files with 0 new features added
 
 # Check that we respect max_len during the merge and don't crash.
-RUN: rm %tmp/T1/??*
+RUN: rm %tmp/T1/*
+RUN: cp %tmp/T0/* %tmp/T1/
 RUN: echo looooooooong > %tmp/T2/looooooooong
 RUN: LLVMFuzzer-FullCoverageSetTest         -merge=1 %tmp/T1 %tmp/T2 -max_len=6 2>&1 | FileCheck %s --check-prefix=MAX_LEN
 MAX_LEN: MERGE-OUTER: 3 new files
 
 # Check that merge tolerates failures.
-RUN: rm %tmp/T1/??*
+RUN: rm %tmp/T1/*
+RUN: cp %tmp/T0/* %tmp/T1/
 RUN: echo 'FUZZER' > %tmp/T2/FUZZER
 RUN: LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=MERGE_WITH_CRASH
 MERGE_WITH_CRASH: MERGE-OUTER: succesfull in 2 attempt(s)
@@ -45,14 +48,6 @@ MERGE_WITH_CRASH: MERGE-OUTER: 3 new files
 RUN: LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2  -max_len=5 2>&1 | FileCheck %s --check-prefix=MERGE_LEN5
 MERGE_LEN5: MERGE-OUTER: succesfull in 1 attempt(s)
 
-# Check that we honor TMPDIR
-RUN: TMPDIR=DIR_DOES_NOT_EXIST not LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=TMPDIR
-TMPDIR: MERGE-OUTER: failed to write to the control file: DIR_DOES_NOT_EXIST/libFuzzerTemp
-
-# Check that we can report an error if file size exceeded
-RUN: (ulimit -f 1; not LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=SIGXFSZ)
-SIGXFSZ: ERROR: libFuzzer: file size exceeded
-
 RUN: rm -rf  %tmp/T1/* %tmp/T2/*
 RUN: not LLVMFuzzer-FullCoverageSetTest -merge=1 %tmp/T1 %tmp/T2 2>&1 | FileCheck %s --check-prefix=EMPTY
 EMPTY: MERGE-OUTER: zero succesfull attempts, exiting
diff --git a/lib/Fuzzer/test/minimize_crash.test b/lib/Fuzzer/test/minimize_crash.test
index ec54ec59d6dc6c3ee81ff3ed2b05a95f52814f82..5643c6bacb093a4abcd108f474374be4c7aa492b 100644
--- a/lib/Fuzzer/test/minimize_crash.test
+++ b/lib/Fuzzer/test/minimize_crash.test
@@ -5,8 +5,9 @@ RUN: LLVMFuzzer-NullDerefTest -minimize_crash=1 not_minimal_crash -max_total_tim
 CHECK_EXACT: CRASH_MIN: failed to minimize beyond exact_minimized_path (3 bytes), exiting
 RUN: rm not_minimal_crash minimized-from-* exact_minimized_path
 
-RUN: echo 'abcd*xyz' > not_minimal_crash
-RUN: LLVMFuzzer-SingleByteInputTest -minimize_crash=1 not_minimal_crash -artifact_prefix=./ZZZ- -exact_artifact_path=exact_minimized_path 2>&1 | FileCheck %s --check-prefix=MIN1
-MIN1: Test unit written to ./ZZZ-minimized-from-
+RUN: echo -n 'abcd*xyz' > not_minimal_crash
+RUN: LLVMFuzzer-SingleByteInputTest -minimize_crash=1 not_minimal_crash -exact_artifact_path=exact_minimized_path 2>&1 | FileCheck %s --check-prefix=MIN1
+MIN1: Test unit written to exact_minimized_path
+MIN1: Test unit written to exact_minimized_path
 MIN1: INFO: The input is small enough, exiting
 MIN1: CRASH_MIN: failed to minimize beyond exact_minimized_path (1 bytes), exiting
diff --git a/lib/Fuzzer/test/minimize_two_crashes.test b/lib/Fuzzer/test/minimize_two_crashes.test
new file mode 100644
index 0000000000000000000000000000000000000000..2358d8c2a92e8d3d9a12bd7458413a795211ceb8
--- /dev/null
+++ b/lib/Fuzzer/test/minimize_two_crashes.test
@@ -0,0 +1,16 @@
+# Test that the minimizer stops when it sees a different bug.
+
+RUN: rm -rf %t && mkdir %t
+RUN: echo H12345678901234667888090 > %t/long_crash
+RUN: env ASAN_OPTIONS=dedup_token_length=3 LLVMFuzzer-TwoDifferentBugsTest -seed=1 -minimize_crash=1 %t/long_crash -exact_artifact_path=%t/result 2>&1 | FileCheck %s
+
+CHECK: DedupToken1: DEDUP_TOKEN: Bar
+CHECK: DedupToken2: DEDUP_TOKEN: Bar
+CHECK: DedupToken1: DEDUP_TOKEN: Bar
+CHECK: DedupToken2: DEDUP_TOKEN: Foo
+CHECK: CRASH_MIN: mismatch in dedup tokens
+
+RUN: not  LLVMFuzzer-TwoDifferentBugsTest %t/result 2>&1 | FileCheck %s --check-prefix=VERIFY
+
+VERIFY: ERROR: AddressSanitizer:
+VERIFY: in Bar
diff --git a/lib/Fuzzer/test/no-coverage/CMakeLists.txt b/lib/Fuzzer/test/no-coverage/CMakeLists.txt
index 9cbe65c6719d935fb8cc3b021651817b63a35638..52e7240333ee3689d325b1d554aec3e08c3f4cfd 100644
--- a/lib/Fuzzer/test/no-coverage/CMakeLists.txt
+++ b/lib/Fuzzer/test/no-coverage/CMakeLists.txt
@@ -5,7 +5,7 @@ set(CMAKE_CXX_FLAGS
   "${LIBFUZZER_FLAGS_BASE} -fno-sanitize-coverage=edge,trace-cmp,indirect-calls,8bit-counters,trace-pc-guard")
 
 set(NoCoverageTests
-  UninstrumentedTest
+  NotinstrumentedTest
   )
 
 foreach(Test ${NoCoverageTests})
@@ -25,6 +25,5 @@ if(NOT MSVC)
       "${CMAKE_BINARY_DIR}/lib/Fuzzer/test"
       )
 
-  # Propagate value into parent directory
-  set(TestBinaries ${TestBinaries} AFLDriverTest PARENT_SCOPE)
+  add_dependencies(TestBinaries AFLDriverTest)
 endif()
diff --git a/lib/Fuzzer/test/trace-malloc-2.test b/lib/Fuzzer/test/trace-malloc-2.test
new file mode 100644
index 0000000000000000000000000000000000000000..7719b650c791e9c00ee092391fe24617292f8b7c
--- /dev/null
+++ b/lib/Fuzzer/test/trace-malloc-2.test
@@ -0,0 +1,8 @@
+// FIXME: This test infinite loops on darwin because it crashes
+// printing a stack trace repeatedly
+UNSUPPORTED: darwin
+
+RUN: LLVMFuzzer-TraceMallocTest -seed=1 -trace_malloc=2 -runs=1000 2>&1 | FileCheck %s --check-prefix=TRACE2
+TRACE2-DAG: FREE[0]
+TRACE2-DAG: MALLOC[0]
+TRACE2-DAG: in LLVMFuzzerTestOneInput
diff --git a/lib/Fuzzer/test/trace-malloc.test b/lib/Fuzzer/test/trace-malloc.test
index c95147904d4256176bc27d558f674ba6d753c713..25694cc2de5c4b83eae0a80ac9991fb11d10da49 100644
--- a/lib/Fuzzer/test/trace-malloc.test
+++ b/lib/Fuzzer/test/trace-malloc.test
@@ -3,8 +3,3 @@ CHECK-DAG: MallocFreeTracer: STOP 0 0 (same)
 CHECK-DAG: MallocFreeTracer: STOP 0 1 (DIFFERENT)
 CHECK-DAG: MallocFreeTracer: STOP 1 0 (DIFFERENT)
 CHECK-DAG: MallocFreeTracer: STOP 1 1 (same)
-
-RUN: LLVMFuzzer-TraceMallocTest -seed=1 -trace_malloc=2 -runs=1000 2>&1 | FileCheck %s --check-prefix=TRACE2
-TRACE2-DAG: FREE[0]
-TRACE2-DAG: MALLOC[0]
-TRACE2-DAG: in LLVMFuzzerTestOneInput
diff --git a/lib/Fuzzer/test/trace-pc.test b/lib/Fuzzer/test/trace-pc.test
new file mode 100644
index 0000000000000000000000000000000000000000..3709677b71b6fff1a0a8ab9b61f51d242ec12d52
--- /dev/null
+++ b/lib/Fuzzer/test/trace-pc.test
@@ -0,0 +1,2 @@
+CHECK: BINGO
+RUN: LLVMFuzzer-SimpleTest-TracePC -runs=100000 -seed=1 2>&1 | FileCheck %s
diff --git a/lib/Fuzzer/test/trace-pc/CMakeLists.txt b/lib/Fuzzer/test/trace-pc/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e800f82cc5dcd8f552ebe8fd88bd79bf02a939b5
--- /dev/null
+++ b/lib/Fuzzer/test/trace-pc/CMakeLists.txt
@@ -0,0 +1,13 @@
+# These tests are built with -fsanitize-coverage=trace-pc only; the other
+# coverage instrumentation kinds listed below are explicitly disabled.
+
+set(CMAKE_CXX_FLAGS
+  "${LIBFUZZER_FLAGS_BASE} -fno-sanitize-coverage=edge,trace-cmp,indirect-calls,8bit-counters,trace-pc-guard -fsanitize-coverage=trace-pc")
+
+set(TracePCTests
+  SimpleTest
+  )
+
+foreach(Test ${TracePCTests})
+  add_libfuzzer_test(${Test}-TracePC SOURCES ../${Test}.cpp)
+endforeach()
diff --git a/lib/Fuzzer/test/ubsan/CMakeLists.txt b/lib/Fuzzer/test/ubsan/CMakeLists.txt
index 7a9eacdbe7df36e933f92d14e43a6ce87cc12f7c..55e0a118186ba8c58d8363f3946f3d23fedd7ad2 100644
--- a/lib/Fuzzer/test/ubsan/CMakeLists.txt
+++ b/lib/Fuzzer/test/ubsan/CMakeLists.txt
@@ -10,6 +10,3 @@ set(UbsanTests
 foreach(Test ${UbsanTests})
   add_libfuzzer_test(${Test}-Ubsan SOURCES ../${Test}.cpp)
 endforeach()
-
-# Propagate value into parent directory
-set(TestBinaries ${TestBinaries} PARENT_SCOPE)
diff --git a/lib/Fuzzer/test/ulimit.test b/lib/Fuzzer/test/ulimit.test
index a60636c351bd0a8e0dc2b6795dd4aac6f36595fd..c2faca13f728ae31b6d845388df90b80aa1bcad4 100644
--- a/lib/Fuzzer/test/ulimit.test
+++ b/lib/Fuzzer/test/ulimit.test
@@ -1,2 +1,4 @@
+REQUIRES: posix
+
 RUN: ulimit -s 1000
 RUN: LLVMFuzzer-SimpleTest
diff --git a/lib/Fuzzer/test/uninstrumented/CMakeLists.txt b/lib/Fuzzer/test/uninstrumented/CMakeLists.txt
index 29b66e6e586a6c8a30551bb563a91cdf5b2920fa..f4ab59e5b18df48ea3548898640b3f9e20a75afd 100644
--- a/lib/Fuzzer/test/uninstrumented/CMakeLists.txt
+++ b/lib/Fuzzer/test/uninstrumented/CMakeLists.txt
@@ -11,6 +11,3 @@ set(UninstrumentedTests
 foreach(Test ${UninstrumentedTests})
   add_libfuzzer_test(${Test}-Uninstrumented SOURCES ../${Test}.cpp)
 endforeach()
-
-# Propagate value into parent directory
-set(TestBinaries ${TestBinaries} PARENT_SCOPE)
diff --git a/lib/Fuzzer/test/value-profile-div.test b/lib/Fuzzer/test/value-profile-div.test
index ba45e4129d3003a0eae8d521c7abdcb4f6de3be1..b966a8916512dfa0019688c0949c72a1012c0f09 100644
--- a/lib/Fuzzer/test/value-profile-div.test
+++ b/lib/Fuzzer/test/value-profile-div.test
@@ -1,3 +1,3 @@
-CHECK: AddressSanitizer: FPE
+CHECK: AddressSanitizer: {{FPE|int-divide-by-zero}}
 RUN: not LLVMFuzzer-DivTest -seed=1 -use_value_profile=1 -runs=10000000 2>&1 | FileCheck %s
 
diff --git a/lib/Fuzzer/test/value-profile-mem.test b/lib/Fuzzer/test/value-profile-mem.test
index 09d737dbe736d0bfbb67b4e9d09c7199d6703d40..880b2692910a6782b7dfbc88a7d091beb250f1d4 100644
--- a/lib/Fuzzer/test/value-profile-mem.test
+++ b/lib/Fuzzer/test/value-profile-mem.test
@@ -1,2 +1,2 @@
 CHECK: BINGO
-RUN: not LLVMFuzzer-SingleMemcmpTest -seed=1  -use_cmp=0 -use_memcmp=0 -use_value_profile=1 -runs=10000000 2>&1 | FileCheck %s
+RUN: not LLVMFuzzer-SingleMemcmpTest -seed=1  -use_cmp=0 -use_value_profile=1 -runs=10000000 2>&1 | FileCheck %s
diff --git a/lib/Fuzzer/test/value-profile-strcmp.test b/lib/Fuzzer/test/value-profile-strcmp.test
index 1e7ef9b45e964de564493d886dafe71613452081..7f1047594548b577b036bae6591962abc9a6ec27 100644
--- a/lib/Fuzzer/test/value-profile-strcmp.test
+++ b/lib/Fuzzer/test/value-profile-strcmp.test
@@ -1,2 +1,2 @@
 CHECK: BINGO
-RUN: not LLVMFuzzer-SingleStrcmpTest -seed=1  -use_cmp=0 -use_memcmp=0 -use_value_profile=1 -runs=10000000 2>&1 | FileCheck %s
+RUN: not LLVMFuzzer-SingleStrcmpTest -seed=1  -use_cmp=0 -use_value_profile=1 -runs=10000000 2>&1 | FileCheck %s
diff --git a/lib/Fuzzer/test/value-profile-strncmp.test b/lib/Fuzzer/test/value-profile-strncmp.test
index 650973180c06dec4d5de9d6d784d7ff1fa918233..84a74c4f0ad23d357e57c2ace2ff3a622f39b4d7 100644
--- a/lib/Fuzzer/test/value-profile-strncmp.test
+++ b/lib/Fuzzer/test/value-profile-strncmp.test
@@ -1,2 +1,2 @@
 CHECK: BINGO
-RUN: not LLVMFuzzer-SingleStrncmpTest -seed=1 -use_cmp=0 -use_memcmp=0 -use_value_profile=1 -runs=10000000 2>&1 | FileCheck %s
+RUN: not LLVMFuzzer-SingleStrncmpTest -seed=1 -use_cmp=0 -use_value_profile=1 -runs=100000000 2>&1 | FileCheck %s
diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp
index 9e2c8813a745f475979dbf16c788f6694f68e0b6..d0b77e7218b93ad0e5513cb0099fd2057d223cf5 100644
--- a/lib/IR/AsmWriter.cpp
+++ b/lib/IR/AsmWriter.cpp
@@ -21,6 +21,7 @@
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/IR/AssemblyAnnotationWriter.h"
+#include "llvm/IR/Attributes.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
@@ -832,7 +833,7 @@ void SlotTracker::processModule() {
     // Add all the function attributes to the table.
     // FIXME: Add attributes of other objects?
     AttributeSet FnAttrs = F.getAttributes().getFnAttributes();
-    if (FnAttrs.hasAttributes(AttributeSet::FunctionIndex))
+    if (FnAttrs.hasAttributes())
       CreateAttributeSetSlot(FnAttrs);
   }
 
@@ -867,15 +868,10 @@ void SlotTracker::processFunction() {
 
       // We allow direct calls to any llvm.foo function here, because the
       // target may not be linked into the optimizer.
-      if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
+      if (auto CS = ImmutableCallSite(&I)) {
         // Add all the call attributes to the table.
-        AttributeSet Attrs = CI->getAttributes().getFnAttributes();
-        if (Attrs.hasAttributes(AttributeSet::FunctionIndex))
-          CreateAttributeSetSlot(Attrs);
-      } else if (const InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
-        // Add all the call attributes to the table.
-        AttributeSet Attrs = II->getAttributes().getFnAttributes();
-        if (Attrs.hasAttributes(AttributeSet::FunctionIndex))
+        AttributeSet Attrs = CS.getAttributes().getFnAttributes();
+        if (Attrs.hasAttributes())
           CreateAttributeSetSlot(Attrs);
       }
     }
@@ -1016,8 +1012,7 @@ void SlotTracker::CreateMetadataSlot(const MDNode *N) {
 }
 
 void SlotTracker::CreateAttributeSetSlot(AttributeSet AS) {
-  assert(AS.hasAttributes(AttributeSet::FunctionIndex) &&
-         "Doesn't need a slot!");
+  assert(AS.hasAttributes() && "Doesn't need a slot!");
 
   as_iterator I = asMap.find(AS);
   if (I != asMap.end())
@@ -1073,6 +1068,8 @@ static void WriteOptimizationInfo(raw_ostream &Out, const User *U) {
         Out << " nsz";
       if (FPO->hasAllowReciprocal())
         Out << " arcp";
+      if (FPO->hasAllowContract())
+        Out << " contract";
     }
   }
 
@@ -1614,6 +1611,9 @@ static void writeDIDerivedType(raw_ostream &Out, const DIDerivedType *N,
   Printer.printInt("offset", N->getOffsetInBits());
   Printer.printDIFlags("flags", N->getFlags());
   Printer.printMetadata("extraData", N->getRawExtraData());
+  if (const auto &DWARFAddressSpace = N->getDWARFAddressSpace())
+    Printer.printInt("dwarfAddressSpace", *DWARFAddressSpace,
+                     /* ShouldSkipZero */ false);
   Out << ")";
 }
 
@@ -1688,6 +1688,8 @@ static void writeDICompileUnit(raw_ostream &Out, const DICompileUnit *N,
   Printer.printMetadata("macros", N->getRawMacros());
   Printer.printInt("dwoId", N->getDWOId());
   Printer.printBool("splitDebugInlining", N->getSplitDebugInlining(), true);
+  Printer.printBool("debugInfoForProfiling", N->getDebugInfoForProfiling(),
+                    false);
   Out << ")";
 }
 
@@ -2083,7 +2085,8 @@ public:
   void printModule(const Module *M);
 
   void writeOperand(const Value *Op, bool PrintType);
-  void writeParamOperand(const Value *Operand, AttributeSet Attrs,unsigned Idx);
+  void writeParamOperand(const Value *Operand, AttributeList Attrs,
+                         unsigned Idx);
   void writeOperandBundles(ImmutableCallSite CS);
   void writeAtomic(AtomicOrdering Ordering, SynchronizationScope SynchScope);
   void writeAtomicCmpXchg(AtomicOrdering SuccessOrdering,
@@ -2099,7 +2102,7 @@ public:
   void printIndirectSymbol(const GlobalIndirectSymbol *GIS);
   void printComdat(const Comdat *C);
   void printFunction(const Function *F);
-  void printArgument(const Argument *FA, AttributeSet Attrs, unsigned Idx);
+  void printArgument(const Argument *FA, AttributeList Attrs, unsigned Idx);
   void printBasicBlock(const BasicBlock *BB);
   void printInstructionLine(const Instruction &I);
   void printInstruction(const Instruction &I);
@@ -2178,7 +2181,7 @@ void AssemblyWriter::writeAtomicCmpXchg(AtomicOrdering SuccessOrdering,
 }
 
 void AssemblyWriter::writeParamOperand(const Value *Operand,
-                                       AttributeSet Attrs, unsigned Idx) {
+                                       AttributeList Attrs, unsigned Idx) {
   if (!Operand) {
     Out << "<null operand!>";
     return;
@@ -2596,19 +2599,12 @@ void AssemblyWriter::printFunction(const Function *F) {
   if (F->isMaterializable())
     Out << "; Materializable\n";
 
-  const AttributeSet &Attrs = F->getAttributes();
-  if (Attrs.hasAttributes(AttributeSet::FunctionIndex)) {
+  const AttributeList &Attrs = F->getAttributes();
+  if (Attrs.hasAttributes(AttributeList::FunctionIndex)) {
     AttributeSet AS = Attrs.getFnAttributes();
     std::string AttrStr;
 
-    unsigned Idx = 0;
-    for (unsigned E = AS.getNumSlots(); Idx != E; ++Idx)
-      if (AS.getSlotIndex(Idx) == AttributeSet::FunctionIndex)
-        break;
-
-    for (AttributeSet::iterator I = AS.begin(Idx), E = AS.end(Idx);
-         I != E; ++I) {
-      Attribute Attr = *I;
+    for (const Attribute &Attr : AS) {
       if (!Attr.isStringAttribute()) {
         if (!AttrStr.empty()) AttrStr += ' ';
         AttrStr += Attr.getAsString();
@@ -2641,8 +2637,8 @@ void AssemblyWriter::printFunction(const Function *F) {
   }
 
   FunctionType *FT = F->getFunctionType();
-  if (Attrs.hasAttributes(AttributeSet::ReturnIndex))
-    Out <<  Attrs.getAsString(AttributeSet::ReturnIndex) << ' ';
+  if (Attrs.hasAttributes(AttributeList::ReturnIndex))
+    Out << Attrs.getAsString(AttributeList::ReturnIndex) << ' ';
   TypePrinter.print(F->getReturnType(), Out);
   Out << ' ';
   WriteAsOperandInternal(Out, F, &TypePrinter, &Machine, F->getParent());
@@ -2681,7 +2677,7 @@ void AssemblyWriter::printFunction(const Function *F) {
   StringRef UA = getUnnamedAddrEncoding(F->getUnnamedAddr());
   if (!UA.empty())
     Out << ' ' << UA;
-  if (Attrs.hasAttributes(AttributeSet::FunctionIndex))
+  if (Attrs.hasAttributes(AttributeList::FunctionIndex))
     Out << " #" << Machine.getAttributeGroupSlot(Attrs.getFnAttributes());
   if (F->hasSection()) {
     Out << " section \"";
@@ -2730,8 +2726,8 @@ void AssemblyWriter::printFunction(const Function *F) {
 /// printArgument - This member is called for every argument that is passed into
 /// the function.  Simply print it out
 ///
-void AssemblyWriter::printArgument(const Argument *Arg,
-                                   AttributeSet Attrs, unsigned Idx) {
+void AssemblyWriter::printArgument(const Argument *Arg, AttributeList Attrs,
+                                   unsigned Idx) {
   // Output type...
   TypePrinter.print(Arg->getType(), Out);
 
@@ -2901,12 +2897,11 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
     Out << ", ";
     writeOperand(SI.getDefaultDest(), true);
     Out << " [";
-    for (SwitchInst::ConstCaseIt i = SI.case_begin(), e = SI.case_end();
-         i != e; ++i) {
+    for (auto Case : SI.cases()) {
       Out << "\n    ";
-      writeOperand(i.getCaseValue(), true);
+      writeOperand(Case.getCaseValue(), true);
       Out << ", ";
-      writeOperand(i.getCaseSuccessor(), true);
+      writeOperand(Case.getCaseSuccessor(), true);
     }
     Out << "\n  ]";
   } else if (isa<IndirectBrInst>(I)) {
@@ -3015,10 +3010,10 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
     Operand = CI->getCalledValue();
     FunctionType *FTy = CI->getFunctionType();
     Type *RetTy = FTy->getReturnType();
-    const AttributeSet &PAL = CI->getAttributes();
+    const AttributeList &PAL = CI->getAttributes();
 
-    if (PAL.hasAttributes(AttributeSet::ReturnIndex))
-      Out << ' ' << PAL.getAsString(AttributeSet::ReturnIndex);
+    if (PAL.hasAttributes(AttributeList::ReturnIndex))
+      Out << ' ' << PAL.getAsString(AttributeList::ReturnIndex);
 
     // If possible, print out the short form of the call instruction.  We can
     // only do this if the first argument is a pointer to a nonvararg function,
@@ -3043,7 +3038,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
       Out << ", ...";
 
     Out << ')';
-    if (PAL.hasAttributes(AttributeSet::FunctionIndex))
+    if (PAL.hasAttributes(AttributeList::FunctionIndex))
       Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes());
 
     writeOperandBundles(CI);
@@ -3052,7 +3047,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
     Operand = II->getCalledValue();
     FunctionType *FTy = II->getFunctionType();
     Type *RetTy = FTy->getReturnType();
-    const AttributeSet &PAL = II->getAttributes();
+    const AttributeList &PAL = II->getAttributes();
 
     // Print the calling convention being used.
     if (II->getCallingConv() != CallingConv::C) {
@@ -3060,8 +3055,8 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
       PrintCallingConv(II->getCallingConv(), Out);
     }
 
-    if (PAL.hasAttributes(AttributeSet::ReturnIndex))
-      Out << ' ' << PAL.getAsString(AttributeSet::ReturnIndex);
+    if (PAL.hasAttributes(AttributeList::ReturnIndex))
+      Out << ' ' << PAL.getAsString(AttributeList::ReturnIndex);
 
     // If possible, print out the short form of the invoke instruction. We can
     // only do this if the first argument is a pointer to a nonvararg function,
@@ -3079,7 +3074,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
     }
 
     Out << ')';
-    if (PAL.hasAttributes(AttributeSet::FunctionIndex))
+    if (PAL.hasAttributes(AttributeList::FunctionIndex))
       Out << " #" << Machine.getAttributeGroupSlot(PAL.getFnAttributes());
 
     writeOperandBundles(II);
@@ -3109,6 +3104,12 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
     if (AI->getAlignment()) {
       Out << ", align " << AI->getAlignment();
     }
+
+    unsigned AddrSpace = AI->getType()->getAddressSpace();
+    if (AddrSpace != 0) {
+      Out << ", addrspace(" << AddrSpace << ')';
+    }
+
   } else if (isa<CastInst>(I)) {
     if (Operand) {
       Out << ' ';
@@ -3242,7 +3243,7 @@ void AssemblyWriter::printMDNodeBody(const MDNode *Node) {
 }
 
 void AssemblyWriter::writeAllAttributeGroups() {
-  std::vector<std::pair<AttributeSet, unsigned> > asVec;
+  std::vector<std::pair<AttributeSet, unsigned>> asVec;
   asVec.resize(Machine.as_size());
 
   for (SlotTracker::as_iterator I = Machine.as_begin(), E = Machine.as_end();
@@ -3251,7 +3252,7 @@ void AssemblyWriter::writeAllAttributeGroups() {
 
   for (const auto &I : asVec)
     Out << "attributes #" << I.second << " = { "
-        << I.first.getAsString(AttributeSet::FunctionIndex, true) << " }\n";
+        << I.first.getAsString(true) << " }\n";
 }
 
 void AssemblyWriter::printUseListOrder(const UseListOrder &Order) {
diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h
index d0d27101aa867f353dcf552ee2fba1049e274860..09f037365793d3377e6ba848af28ccbd868b042a 100644
--- a/lib/IR/AttributeImpl.h
+++ b/lib/IR/AttributeImpl.h
@@ -16,7 +16,6 @@
 #ifndef LLVM_LIB_IR_ATTRIBUTEIMPL_H
 #define LLVM_LIB_IR_ATTRIBUTEIMPL_H
 
-#include "AttributeSetNode.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/FoldingSet.h"
 #include "llvm/ADT/StringRef.h"
@@ -144,16 +143,74 @@ public:
   StringRef getStringValue() const { return Val; }
 };
 
-typedef std::pair<unsigned, AttributeSetNode *> IndexAttrPair;
+//===----------------------------------------------------------------------===//
+/// \class
+/// \brief This class represents a group of attributes that apply to one
+/// element: function, return type, or parameter.
+class AttributeSetNode final
+    : public FoldingSetNode,
+      private TrailingObjects<AttributeSetNode, Attribute> {
+  friend TrailingObjects;
+
+  /// Bitset with a bit for each available attribute Attribute::AttrKind.
+  uint64_t AvailableAttrs;
+  unsigned NumAttrs; ///< Number of attributes in this node.
+
+  AttributeSetNode(ArrayRef<Attribute> Attrs);
+
+public:
+  // AttributeSetNode is uniqued, these should not be available.
+  AttributeSetNode(const AttributeSetNode &) = delete;
+  AttributeSetNode &operator=(const AttributeSetNode &) = delete;
+
+  void operator delete(void *p) { ::operator delete(p); }
+
+  static AttributeSetNode *get(LLVMContext &C, const AttrBuilder &B);
+
+  static AttributeSetNode *get(LLVMContext &C, ArrayRef<Attribute> Attrs);
+
+  /// \brief Return the number of attributes this node contains.
+  unsigned getNumAttributes() const { return NumAttrs; }
+
+  bool hasAttribute(Attribute::AttrKind Kind) const {
+    return AvailableAttrs & ((uint64_t)1) << Kind;
+  }
+  bool hasAttribute(StringRef Kind) const;
+  bool hasAttributes() const { return NumAttrs != 0; }
+
+  Attribute getAttribute(Attribute::AttrKind Kind) const;
+  Attribute getAttribute(StringRef Kind) const;
+
+  unsigned getAlignment() const;
+  unsigned getStackAlignment() const;
+  uint64_t getDereferenceableBytes() const;
+  uint64_t getDereferenceableOrNullBytes() const;
+  std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const;
+  std::string getAsString(bool InAttrGrp) const;
+
+  typedef const Attribute *iterator;
+  iterator begin() const { return getTrailingObjects<Attribute>(); }
+  iterator end() const { return begin() + NumAttrs; }
+
+  void Profile(FoldingSetNodeID &ID) const {
+    Profile(ID, makeArrayRef(begin(), end()));
+  }
+  static void Profile(FoldingSetNodeID &ID, ArrayRef<Attribute> AttrList) {
+    for (const auto &Attr : AttrList)
+      Attr.Profile(ID);
+  }
+};
+
+typedef std::pair<unsigned, AttributeSet> IndexAttrPair;
 
 //===----------------------------------------------------------------------===//
 /// \class
 /// \brief This class represents a set of attributes that apply to the function,
 /// return type, and parameters.
-class AttributeSetImpl final
+class AttributeListImpl final
     : public FoldingSetNode,
-      private TrailingObjects<AttributeSetImpl, IndexAttrPair> {
-  friend class AttributeSet;
+      private TrailingObjects<AttributeListImpl, IndexAttrPair> {
+  friend class AttributeList;
   friend TrailingObjects;
 
 private:
@@ -166,52 +223,21 @@ private:
   size_t numTrailingObjects(OverloadToken<IndexAttrPair>) { return NumSlots; }
 
   /// \brief Return a pointer to the IndexAttrPair for the specified slot.
-  const IndexAttrPair *getNode(unsigned Slot) const {
+  const IndexAttrPair *getSlotPair(unsigned Slot) const {
     return getTrailingObjects<IndexAttrPair>() + Slot;
   }
 
 public:
-  AttributeSetImpl(LLVMContext &C,
-                   ArrayRef<std::pair<unsigned, AttributeSetNode *>> Slots)
-      : Context(C), NumSlots(Slots.size()), AvailableFunctionAttrs(0) {
-    static_assert(Attribute::EndAttrKinds <=
-                      sizeof(AvailableFunctionAttrs) * CHAR_BIT,
-                  "Too many attributes");
-
-#ifndef NDEBUG
-    if (Slots.size() >= 2) {
-      for (const std::pair<unsigned, AttributeSetNode *> *i = Slots.begin() + 1,
-                                                         *e = Slots.end();
-           i != e; ++i) {
-        assert((i-1)->first <= i->first && "Attribute set not ordered!");
-      }
-    }
-#endif
-    // There's memory after the node where we can store the entries in.
-    std::copy(Slots.begin(), Slots.end(), getTrailingObjects<IndexAttrPair>());
-
-    // Initialize AvailableFunctionAttrs summary bitset.
-    if (NumSlots > 0) {
-      static_assert(AttributeSet::FunctionIndex == ~0u,
-                    "FunctionIndex should be biggest possible index");
-      const std::pair<unsigned, AttributeSetNode *> &Last = Slots.back();
-      if (Last.first == AttributeSet::FunctionIndex) {
-        const AttributeSetNode *Node = Last.second;
-        for (Attribute I : *Node) {
-          if (!I.isStringAttribute())
-            AvailableFunctionAttrs |= ((uint64_t)1) << I.getKindAsEnum();
-        }
-      }
-    }
-  }
+  AttributeListImpl(LLVMContext &C,
+                    ArrayRef<std::pair<unsigned, AttributeSet>> Slots);
 
   // AttributesSetImpt is uniqued, these should not be available.
-  AttributeSetImpl(const AttributeSetImpl &) = delete;
-  AttributeSetImpl &operator=(const AttributeSetImpl &) = delete;
+  AttributeListImpl(const AttributeListImpl &) = delete;
+  AttributeListImpl &operator=(const AttributeListImpl &) = delete;
 
   void operator delete(void *p) { ::operator delete(p); }
 
-  /// \brief Get the context that created this AttributeSetImpl.
+  /// \brief Get the context that created this AttributeListImpl.
   LLVMContext &getContext() { return Context; }
 
   /// \brief Return the number of slots used in this attribute list. This is
@@ -224,42 +250,35 @@ public:
   /// attributes are applied to, not the index into the AttrNodes list where the
   /// attributes reside.
   unsigned getSlotIndex(unsigned Slot) const {
-    return getNode(Slot)->first;
+    return getSlotPair(Slot)->first;
+  }
+
+  /// \brief Retrieve the attribute set node for the given "slot" in the
+  /// AttrNode list.
+  AttributeSet getSlotNode(unsigned Slot) const {
+    return getSlotPair(Slot)->second;
   }
 
   /// \brief Retrieve the attributes for the given "slot" in the AttrNode list.
   /// \p Slot is an index into the AttrNodes list, not the index of the return /
   /// parameter/ function which the attributes apply to.
-  AttributeSet getSlotAttributes(unsigned Slot) const {
-    return AttributeSet::get(Context, *getNode(Slot));
+  AttributeList getSlotAttributes(unsigned Slot) const {
+    return AttributeList::get(Context, *getSlotPair(Slot));
   }
 
-  /// \brief Retrieve the attribute set node for the given "slot" in the
-  /// AttrNode list.
-  AttributeSetNode *getSlotNode(unsigned Slot) const {
-    return getNode(Slot)->second;
-  }
-
-  /// \brief Return true if the AttributeSetNode for the FunctionIndex has an
+  /// \brief Return true if the AttributeSet for the FunctionIndex has an
   /// enum attribute of the given kind.
   bool hasFnAttribute(Attribute::AttrKind Kind) const {
     return AvailableFunctionAttrs & ((uint64_t)1) << Kind;
   }
 
-  typedef AttributeSetNode::iterator iterator;
-  iterator begin(unsigned Slot) const { return getSlotNode(Slot)->begin(); }
-  iterator end(unsigned Slot) const { return getSlotNode(Slot)->end(); }
+  typedef AttributeSet::iterator iterator;
+  iterator begin(unsigned Slot) const { return getSlotNode(Slot).begin(); }
+  iterator end(unsigned Slot) const { return getSlotNode(Slot).end(); }
 
-  void Profile(FoldingSetNodeID &ID) const {
-    Profile(ID, makeArrayRef(getNode(0), getNumSlots()));
-  }
+  void Profile(FoldingSetNodeID &ID) const;
   static void Profile(FoldingSetNodeID &ID,
-                      ArrayRef<std::pair<unsigned, AttributeSetNode*>> Nodes) {
-    for (const auto &Node : Nodes) {
-      ID.AddInteger(Node.first);
-      ID.AddPointer(Node.second);
-    }
-  }
+                      ArrayRef<std::pair<unsigned, AttributeSet>> Nodes);
 
   void dump() const;
 };
diff --git a/lib/IR/AttributeSetNode.h b/lib/IR/AttributeSetNode.h
deleted file mode 100644
index 23ce3713c20bceeeae3a4d070db8caae9e232b02..0000000000000000000000000000000000000000
--- a/lib/IR/AttributeSetNode.h
+++ /dev/null
@@ -1,106 +0,0 @@
-//===-- AttributeSetNode.h - AttributeSet Internal Node ---------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// \brief This file defines the node class used internally by AttributeSet.
-///
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_IR_ATTRIBUTESETNODE_H
-#define LLVM_IR_ATTRIBUTESETNODE_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/FoldingSet.h"
-#include "llvm/ADT/Optional.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/Support/TrailingObjects.h"
-#include <algorithm>
-#include <climits>
-#include <cstdint>
-#include <string>
-#include <utility>
-
-namespace llvm {
-
-//===----------------------------------------------------------------------===//
-/// \class
-/// \brief This class represents a group of attributes that apply to one
-/// element: function, return type, or parameter.
-class AttributeSetNode final
-    : public FoldingSetNode,
-      private TrailingObjects<AttributeSetNode, Attribute> {
-  friend TrailingObjects;
-
-  unsigned NumAttrs; ///< Number of attributes in this node.
-  /// Bitset with a bit for each available attribute Attribute::AttrKind.
-  uint64_t AvailableAttrs;
-
-  AttributeSetNode(ArrayRef<Attribute> Attrs)
-    : NumAttrs(Attrs.size()), AvailableAttrs(0) {
-    static_assert(Attribute::EndAttrKinds <= sizeof(AvailableAttrs) * CHAR_BIT,
-                  "Too many attributes for AvailableAttrs");
-    // There's memory after the node where we can store the entries in.
-    std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<Attribute>());
-
-    for (Attribute I : *this) {
-      if (!I.isStringAttribute()) {
-        AvailableAttrs |= ((uint64_t)1) << I.getKindAsEnum();
-      }
-    }
-  }
-
-public:
-  // AttributesSetNode is uniqued, these should not be available.
-  AttributeSetNode(const AttributeSetNode &) = delete;
-  AttributeSetNode &operator=(const AttributeSetNode &) = delete;
-
-  void operator delete(void *p) { ::operator delete(p); }
-
-  static AttributeSetNode *get(LLVMContext &C, ArrayRef<Attribute> Attrs);
-
-  static AttributeSetNode *get(AttributeSet AS, unsigned Index) {
-    return AS.getAttributes(Index);
-  }
-
-  /// \brief Return the number of attributes this AttributeSet contains.
-  unsigned getNumAttributes() const { return NumAttrs; }
-
-  bool hasAttribute(Attribute::AttrKind Kind) const {
-    return AvailableAttrs & ((uint64_t)1) << Kind;
-  }
-  bool hasAttribute(StringRef Kind) const;
-  bool hasAttributes() const { return NumAttrs != 0; }
-
-  Attribute getAttribute(Attribute::AttrKind Kind) const;
-  Attribute getAttribute(StringRef Kind) const;
-
-  unsigned getAlignment() const;
-  unsigned getStackAlignment() const;
-  uint64_t getDereferenceableBytes() const;
-  uint64_t getDereferenceableOrNullBytes() const;
-  std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const;
-  std::string getAsString(bool InAttrGrp) const;
-
-  typedef const Attribute *iterator;
-  iterator begin() const { return getTrailingObjects<Attribute>(); }
-  iterator end() const { return begin() + NumAttrs; }
-
-  void Profile(FoldingSetNodeID &ID) const {
-    Profile(ID, makeArrayRef(begin(), end()));
-  }
-  static void Profile(FoldingSetNodeID &ID, ArrayRef<Attribute> AttrList) {
-    for (const auto &Attr : AttrList)
-      Attr.Profile(ID);
-  }
-};
-
-} // end namespace llvm
-
-#endif // LLVM_IR_ATTRIBUTESETNODE_H
diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp
index 2c1bc2b66aa615f3e4aa4c3a3a67e9b38a84b5e9..4b840c36ccb0f2303f53647c7bd6836d60d51194 100644
--- a/lib/IR/Attributes.cpp
+++ b/lib/IR/Attributes.cpp
@@ -1,4 +1,4 @@
-//===-- Attributes.cpp - Implement AttributesList -------------------------===//
+//===- Attributes.cpp - Implement AttributesList --------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -9,23 +9,38 @@
 //
 // \file
 // \brief This file implements the Attribute, AttributeImpl, AttrBuilder,
-// AttributeSetImpl, and AttributeSet classes.
+// AttributeListImpl, and AttributeList classes.
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/Function.h"
 #include "AttributeImpl.h"
 #include "LLVMContextImpl.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/FoldingSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Type.h"
-#include "llvm/Support/Atomic.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/Mutex.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <map>
+#include <string>
+#include <tuple>
+#include <utility>
+
 using namespace llvm;
 
 //===----------------------------------------------------------------------===//
@@ -411,9 +426,12 @@ bool Attribute::operator<(Attribute A) const {
 //===----------------------------------------------------------------------===//
 
 // Pin the vtables to this file.
-AttributeImpl::~AttributeImpl() {}
+AttributeImpl::~AttributeImpl() = default;
+
 void EnumAttributeImpl::anchor() {}
+
 void IntAttributeImpl::anchor() {}
+
 void StringAttributeImpl::anchor() {}
 
 bool AttributeImpl::hasAttribute(Attribute::AttrKind A) const {
@@ -472,10 +490,86 @@ bool AttributeImpl::operator<(const AttributeImpl &AI) const {
   return getKindAsString() < AI.getKindAsString();
 }
 
+//===----------------------------------------------------------------------===//
+// AttributeSet Definition
+//===----------------------------------------------------------------------===//
+
+AttributeSet AttributeSet::get(LLVMContext &C, const AttrBuilder &B) {
+  return AttributeSet(AttributeSetNode::get(C, B));
+}
+
+AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<Attribute> Attrs) {
+  return AttributeSet(AttributeSetNode::get(C, Attrs));
+}
+
+unsigned AttributeSet::getNumAttributes() const {
+  return SetNode ? SetNode->getNumAttributes() : 0;
+}
+
+bool AttributeSet::hasAttribute(Attribute::AttrKind Kind) const {
+  return SetNode ? SetNode->hasAttribute(Kind) : 0;
+}
+
+bool AttributeSet::hasAttribute(StringRef Kind) const {
+  return SetNode ? SetNode->hasAttribute(Kind) : 0;
+}
+
+Attribute AttributeSet::getAttribute(Attribute::AttrKind Kind) const {
+  return SetNode ? SetNode->getAttribute(Kind) : Attribute();
+}
+
+Attribute AttributeSet::getAttribute(StringRef Kind) const {
+  return SetNode ? SetNode->getAttribute(Kind) : Attribute();
+}
+
+unsigned AttributeSet::getAlignment() const {
+  return SetNode ? SetNode->getAlignment() : 0;
+}
+
+unsigned AttributeSet::getStackAlignment() const {
+  return SetNode ? SetNode->getStackAlignment() : 0;
+}
+
+uint64_t AttributeSet::getDereferenceableBytes() const {
+  return SetNode ? SetNode->getDereferenceableBytes() : 0;
+}
+
+uint64_t AttributeSet::getDereferenceableOrNullBytes() const {
+  return SetNode ? SetNode->getDereferenceableOrNullBytes() : 0;
+}
+
+std::pair<unsigned, Optional<unsigned>> AttributeSet::getAllocSizeArgs() const {
+  return SetNode ? SetNode->getAllocSizeArgs() : std::make_pair(0, 0);
+}
+
+std::string AttributeSet::getAsString(bool InAttrGrp) const {
+  return SetNode ? SetNode->getAsString(InAttrGrp) : "";
+}
+
+AttributeSet::iterator AttributeSet::begin() const {
+  return SetNode ? SetNode->begin() : nullptr;
+}
+
+AttributeSet::iterator AttributeSet::end() const {
+  return SetNode ? SetNode->end() : nullptr;
+}
+
 //===----------------------------------------------------------------------===//
 // AttributeSetNode Definition
 //===----------------------------------------------------------------------===//
 
+AttributeSetNode::AttributeSetNode(ArrayRef<Attribute> Attrs)
+    : AvailableAttrs(0), NumAttrs(Attrs.size()) {
+  // There's memory after the node where we can store the entries in.
+  std::copy(Attrs.begin(), Attrs.end(), getTrailingObjects<Attribute>());
+
+  for (Attribute I : *this) {
+    if (!I.isStringAttribute()) {
+      AvailableAttrs |= ((uint64_t)1) << I.getKindAsEnum();
+    }
+  }
+}
+
 AttributeSetNode *AttributeSetNode::get(LLVMContext &C,
                                         ArrayRef<Attribute> Attrs) {
   if (Attrs.empty())
@@ -504,10 +598,52 @@ AttributeSetNode *AttributeSetNode::get(LLVMContext &C,
     pImpl->AttrsSetNodes.InsertNode(PA, InsertPoint);
   }
 
-  // Return the AttributesListNode that we found or created.
+  // Return the AttributeSetNode that we found or created.
   return PA;
 }
 
+AttributeSetNode *AttributeSetNode::get(LLVMContext &C, const AttrBuilder &B) {
+  // Add target-independent attributes.
+  SmallVector<Attribute, 8> Attrs;
+  for (Attribute::AttrKind Kind = Attribute::None;
+       Kind != Attribute::EndAttrKinds; Kind = Attribute::AttrKind(Kind + 1)) {
+    if (!B.contains(Kind))
+      continue;
+
+    Attribute Attr;
+    switch (Kind) {
+    case Attribute::Alignment:
+      Attr = Attribute::getWithAlignment(C, B.getAlignment());
+      break;
+    case Attribute::StackAlignment:
+      Attr = Attribute::getWithStackAlignment(C, B.getStackAlignment());
+      break;
+    case Attribute::Dereferenceable:
+      Attr = Attribute::getWithDereferenceableBytes(
+          C, B.getDereferenceableBytes());
+      break;
+    case Attribute::DereferenceableOrNull:
+      Attr = Attribute::getWithDereferenceableOrNullBytes(
+          C, B.getDereferenceableOrNullBytes());
+      break;
+    case Attribute::AllocSize: {
+      auto A = B.getAllocSizeArgs();
+      Attr = Attribute::getWithAllocSizeArgs(C, A.first, A.second);
+      break;
+    }
+    default:
+      Attr = Attribute::get(C, Kind);
+    }
+    Attrs.push_back(Attr);
+  }
+
+  // Add target-dependent (string) attributes.
+  for (const auto &TDA : B.td_attrs())
+    Attrs.emplace_back(Attribute::get(C, TDA.first, TDA.second));
+
+  return get(C, Attrs);
+}
+
 bool AttributeSetNode::hasAttribute(StringRef Kind) const {
   for (Attribute I : *this)
     if (I.hasAttribute(Kind))
@@ -578,48 +714,107 @@ std::string AttributeSetNode::getAsString(bool InAttrGrp) const {
 }
 
 //===----------------------------------------------------------------------===//
-// AttributeSetImpl Definition
+// AttributeListImpl Definition
 //===----------------------------------------------------------------------===//
 
+AttributeListImpl::AttributeListImpl(
+    LLVMContext &C, ArrayRef<std::pair<unsigned, AttributeSet>> Slots)
+    : Context(C), NumSlots(Slots.size()), AvailableFunctionAttrs(0) {
+  // Slots must be ordered by index. Compare every adjacent pair, not just
+  // each slot against the first one (equal neighbours are still accepted).
+  assert(std::is_sorted(Slots.begin(), Slots.end(),
+                        [](const std::pair<unsigned, AttributeSet> &LHS,
+                           const std::pair<unsigned, AttributeSet> &RHS) {
+                          return LHS.first < RHS.first;
+                        }) &&
+         "Attribute set not ordered!");
+
+  // There's memory after the node where we can store the entries in.
+  std::copy(Slots.begin(), Slots.end(), getTrailingObjects<IndexAttrPair>());
+
+  // Initialize AvailableFunctionAttrs summary bitset.
+  if (NumSlots > 0) {
+    static_assert(Attribute::EndAttrKinds <=
+                      sizeof(AvailableFunctionAttrs) * CHAR_BIT,
+                  "Too many attributes");
+    static_assert(AttributeList::FunctionIndex == ~0u,
+                  "FunctionIndex should be biggest possible index");
+    const auto &Last = Slots.back();
+    if (Last.first == AttributeList::FunctionIndex) {
+      AttributeSet Node = Last.second;
+      for (Attribute I : Node) {
+        if (!I.isStringAttribute())
+          AvailableFunctionAttrs |= ((uint64_t)1) << I.getKindAsEnum();
+      }
+    }
+  }
+}
+
+void AttributeListImpl::Profile(FoldingSetNodeID &ID) const {
+  Profile(ID, makeArrayRef(getSlotPair(0), getNumSlots()));
+}
+
+void AttributeListImpl::Profile(
+    FoldingSetNodeID &ID, ArrayRef<std::pair<unsigned, AttributeSet>> Nodes) {
+  for (const auto &Node : Nodes) {
+    ID.AddInteger(Node.first);
+    ID.AddPointer(Node.second.SetNode);
+  }
+}
+
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD void AttributeSetImpl::dump() const {
-  AttributeSet(const_cast<AttributeSetImpl *>(this)).dump();
+LLVM_DUMP_METHOD void AttributeListImpl::dump() const {
+  AttributeList(const_cast<AttributeListImpl *>(this)).dump();
 }
 #endif
 
 //===----------------------------------------------------------------------===//
-// AttributeSet Construction and Mutation Methods
+// AttributeList Construction and Mutation Methods
 //===----------------------------------------------------------------------===//
 
-AttributeSet
-AttributeSet::getImpl(LLVMContext &C,
-                      ArrayRef<std::pair<unsigned, AttributeSetNode*> > Attrs) {
+AttributeList AttributeList::getImpl(
+    LLVMContext &C, ArrayRef<std::pair<unsigned, AttributeSet>> Attrs) {
+  assert(!Attrs.empty() && "creating pointless AttributeList");
+#ifndef NDEBUG
+  unsigned LastIndex = 0;
+  bool IsFirst = true;
+  for (const auto &AttrPair : Attrs) {
+    assert((IsFirst || LastIndex < AttrPair.first) &&
+           "unsorted or duplicate AttributeList indices");
+    assert(AttrPair.second.hasAttributes() && "pointless AttributeList slot");
+    LastIndex = AttrPair.first;
+    IsFirst = false;
+  }
+#endif
+
   LLVMContextImpl *pImpl = C.pImpl;
   FoldingSetNodeID ID;
-  AttributeSetImpl::Profile(ID, Attrs);
+  AttributeListImpl::Profile(ID, Attrs);
 
   void *InsertPoint;
-  AttributeSetImpl *PA = pImpl->AttrsLists.FindNodeOrInsertPos(ID, InsertPoint);
+  AttributeListImpl *PA =
+      pImpl->AttrsLists.FindNodeOrInsertPos(ID, InsertPoint);
 
   // If we didn't find any existing attributes of the same shape then
   // create a new one and insert it.
   if (!PA) {
-    // Coallocate entries after the AttributeSetImpl itself.
+    // Coallocate entries after the AttributeListImpl itself.
     void *Mem = ::operator new(
-        AttributeSetImpl::totalSizeToAlloc<IndexAttrPair>(Attrs.size()));
-    PA = new (Mem) AttributeSetImpl(C, Attrs);
+        AttributeListImpl::totalSizeToAlloc<IndexAttrPair>(Attrs.size()));
+    PA = new (Mem) AttributeListImpl(C, Attrs);
     pImpl->AttrsLists.InsertNode(PA, InsertPoint);
   }
 
   // Return the AttributesList that we found or created.
-  return AttributeSet(PA);
+  return AttributeList(PA);
 }
 
-AttributeSet AttributeSet::get(LLVMContext &C,
-                               ArrayRef<std::pair<unsigned, Attribute> > Attrs){
+AttributeList
+AttributeList::get(LLVMContext &C,
+                   ArrayRef<std::pair<unsigned, Attribute>> Attrs) {
   // If there are no attributes then return a null AttributesList pointer.
   if (Attrs.empty())
-    return AttributeSet();
+    return AttributeList();
 
   assert(std::is_sorted(Attrs.begin(), Attrs.end(),
                         [](const std::pair<unsigned, Attribute> &LHS,
@@ -634,8 +829,8 @@ AttributeSet AttributeSet::get(LLVMContext &C,
 
   // Create a vector if (unsigned, AttributeSetNode*) pairs from the attributes
   // list.
-  SmallVector<std::pair<unsigned, AttributeSetNode*>, 8> AttrPairVec;
-  for (ArrayRef<std::pair<unsigned, Attribute> >::iterator I = Attrs.begin(),
+  SmallVector<std::pair<unsigned, AttributeSet>, 8> AttrPairVec;
+  for (ArrayRef<std::pair<unsigned, Attribute>>::iterator I = Attrs.begin(),
          E = Attrs.end(); I != E; ) {
     unsigned Index = I->first;
     SmallVector<Attribute, 4> AttrVec;
@@ -644,103 +839,87 @@ AttributeSet AttributeSet::get(LLVMContext &C,
       ++I;
     }
 
-    AttrPairVec.emplace_back(Index, AttributeSetNode::get(C, AttrVec));
+    AttrPairVec.emplace_back(Index, AttributeSet::get(C, AttrVec));
   }
 
   return getImpl(C, AttrPairVec);
 }
 
-AttributeSet AttributeSet::get(LLVMContext &C,
-                               ArrayRef<std::pair<unsigned,
-                                                  AttributeSetNode*> > Attrs) {
+AttributeList
+AttributeList::get(LLVMContext &C,
+                   ArrayRef<std::pair<unsigned, AttributeSet>> Attrs) {
   // If there are no attributes then return a null AttributesList pointer.
   if (Attrs.empty())
-    return AttributeSet();
+    return AttributeList();
 
   return getImpl(C, Attrs);
 }
 
-AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
-                               const AttrBuilder &B) {
-  if (!B.hasAttributes())
-    return AttributeSet();
-
-  // Add target-independent attributes.
-  SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
-  for (Attribute::AttrKind Kind = Attribute::None;
-       Kind != Attribute::EndAttrKinds; Kind = Attribute::AttrKind(Kind + 1)) {
-    if (!B.contains(Kind))
-      continue;
-
-    Attribute Attr;
-    switch (Kind) {
-    case Attribute::Alignment:
-      Attr = Attribute::getWithAlignment(C, B.getAlignment());
-      break;
-    case Attribute::StackAlignment:
-      Attr = Attribute::getWithStackAlignment(C, B.getStackAlignment());
-      break;
-    case Attribute::Dereferenceable:
-      Attr = Attribute::getWithDereferenceableBytes(
-          C, B.getDereferenceableBytes());
-      break;
-    case Attribute::DereferenceableOrNull:
-      Attr = Attribute::getWithDereferenceableOrNullBytes(
-          C, B.getDereferenceableOrNullBytes());
-      break;
-    case Attribute::AllocSize: {
-      auto A = B.getAllocSizeArgs();
-      Attr = Attribute::getWithAllocSizeArgs(C, A.first, A.second);
-      break;
+AttributeList AttributeList::get(LLVMContext &C, ArrayRef<AttributeSet> Attrs) {
+  assert(Attrs.size() >= 2 &&
+         "should always have function and return attr slots");
+  SmallVector<std::pair<unsigned, AttributeSet>, 8> AttrPairs;
+  size_t Index = 0;
+  for (AttributeSet AS : Attrs) {
+    if (AS.hasAttributes()) {
+      // If this is the last AttributeSet, it's for the function.
+      if (Index == Attrs.size() - 1)
+        Index = AttributeList::FunctionIndex;
+      AttrPairs.emplace_back(Index, AS);
     }
-    default:
-      Attr = Attribute::get(C, Kind);
-    }
-    Attrs.emplace_back(Index, Attr);
+    ++Index;
   }
+  if (AttrPairs.empty())
+    return AttributeList();
+  return getImpl(C, AttrPairs);
+}
 
-  // Add target-dependent (string) attributes.
-  for (const auto &TDA : B.td_attrs())
-    Attrs.emplace_back(Index, Attribute::get(C, TDA.first, TDA.second));
-
-  return get(C, Attrs);
+AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
+                                 const AttrBuilder &B) {
+  if (!B.hasAttributes())
+    return AttributeList();
+  AttributeSet AS = AttributeSet::get(C, B);
+  std::pair<unsigned, AttributeSet> Arr[1] = {{Index, AS}};
+  return getImpl(C, Arr);
 }
 
-AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
-                               ArrayRef<Attribute::AttrKind> Kinds) {
+AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
+                                 ArrayRef<Attribute::AttrKind> Kinds) {
   SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
   for (Attribute::AttrKind K : Kinds)
     Attrs.emplace_back(Index, Attribute::get(C, K));
   return get(C, Attrs);
 }
 
-AttributeSet AttributeSet::get(LLVMContext &C, unsigned Index,
-                               ArrayRef<StringRef> Kinds) {
+AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
+                                 ArrayRef<StringRef> Kinds) {
   SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
   for (StringRef K : Kinds)
     Attrs.emplace_back(Index, Attribute::get(C, K));
   return get(C, Attrs);
 }
 
-AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<AttributeSet> Attrs) {
-  if (Attrs.empty()) return AttributeSet();
+AttributeList AttributeList::get(LLVMContext &C,
+                                 ArrayRef<AttributeList> Attrs) {
+  if (Attrs.empty())
+    return AttributeList();
   if (Attrs.size() == 1) return Attrs[0];
 
-  SmallVector<std::pair<unsigned, AttributeSetNode*>, 8> AttrNodeVec;
-  AttributeSetImpl *A0 = Attrs[0].pImpl;
+  SmallVector<std::pair<unsigned, AttributeSet>, 8> AttrNodeVec;
+  AttributeListImpl *A0 = Attrs[0].pImpl;
   if (A0)
-    AttrNodeVec.append(A0->getNode(0), A0->getNode(A0->getNumSlots()));
+    AttrNodeVec.append(A0->getSlotPair(0), A0->getSlotPair(A0->getNumSlots()));
   // Copy all attributes from Attrs into AttrNodeVec while keeping AttrNodeVec
   // ordered by index.  Because we know that each list in Attrs is ordered by
   // index we only need to merge each successive list in rather than doing a
   // full sort.
   for (unsigned I = 1, E = Attrs.size(); I != E; ++I) {
-    AttributeSetImpl *AS = Attrs[I].pImpl;
-    if (!AS) continue;
-    SmallVector<std::pair<unsigned, AttributeSetNode *>, 8>::iterator
+    AttributeListImpl *ALI = Attrs[I].pImpl;
+    if (!ALI) continue;
+    SmallVector<std::pair<unsigned, AttributeSet>, 8>::iterator
       ANVI = AttrNodeVec.begin(), ANVE;
-    for (const IndexAttrPair *AI = AS->getNode(0),
-                             *AE = AS->getNode(AS->getNumSlots());
+    for (const IndexAttrPair *AI = ALI->getSlotPair(0),
+                             *AE = ALI->getSlotPair(ALI->getNumSlots());
          AI != AE; ++AI) {
       ANVE = AttrNodeVec.end();
       while (ANVI != ANVE && ANVI->first <= AI->first)
@@ -752,113 +931,123 @@ AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef<AttributeSet> Attrs) {
   return getImpl(C, AttrNodeVec);
 }
 
-AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Index,
-                                        Attribute::AttrKind Kind) const {
+AttributeList AttributeList::addAttribute(LLVMContext &C, unsigned Index,
+                                          Attribute::AttrKind Kind) const {
   if (hasAttribute(Index, Kind)) return *this;
-  return addAttributes(C, Index, AttributeSet::get(C, Index, Kind));
+  return addAttributes(C, Index, AttributeList::get(C, Index, Kind));
 }
 
-AttributeSet AttributeSet::addAttribute(LLVMContext &C, unsigned Index,
-                                        StringRef Kind, StringRef Value) const {
-  llvm::AttrBuilder B;
+AttributeList AttributeList::addAttribute(LLVMContext &C, unsigned Index,
+                                          StringRef Kind,
+                                          StringRef Value) const {
+  AttrBuilder B;
   B.addAttribute(Kind, Value);
-  return addAttributes(C, Index, AttributeSet::get(C, Index, B));
+  return addAttributes(C, Index, AttributeList::get(C, Index, B));
 }
 
-AttributeSet AttributeSet::addAttribute(LLVMContext &C,
-                                        ArrayRef<unsigned> Indices,
-                                        Attribute A) const {
+AttributeList AttributeList::addAttribute(LLVMContext &C,
+                                          ArrayRef<unsigned> Indices,
+                                          Attribute A) const {
+  assert(std::is_sorted(Indices.begin(), Indices.end()));
+
   unsigned I = 0, E = pImpl ? pImpl->getNumSlots() : 0;
-  auto IdxI = Indices.begin(), IdxE = Indices.end();
-  SmallVector<AttributeSet, 4> AttrSet;
-
-  while (I != E && IdxI != IdxE) {
-    if (getSlotIndex(I) < *IdxI)
-      AttrSet.emplace_back(getSlotAttributes(I++));
-    else if (getSlotIndex(I) > *IdxI)
-      AttrSet.emplace_back(AttributeSet::get(C, std::make_pair(*IdxI++, A)));
-    else {
-      AttrBuilder B(getSlotAttributes(I), *IdxI);
-      B.addAttribute(A);
-      AttrSet.emplace_back(AttributeSet::get(C, *IdxI, B));
+  SmallVector<IndexAttrPair, 4> AttrVec;
+  for (unsigned Index : Indices) {
+    // Add all attribute slots before the current index.
+    for (; I < E && getSlotIndex(I) < Index; ++I)
+      AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotNode(I));
+
+    // Add the attribute at this index. If we already have attributes at this
+    // index, merge them into a new set.
+    AttrBuilder B;
+    if (I < E && getSlotIndex(I) == Index) {
+      B.merge(AttrBuilder(pImpl->getSlotNode(I)));
       ++I;
-      ++IdxI;
     }
+    B.addAttribute(A);
+    AttrVec.emplace_back(Index, AttributeSet::get(C, B));
   }
 
-  while (I != E)
-    AttrSet.emplace_back(getSlotAttributes(I++));
+  // Add remaining attributes.
+  for (; I < E; ++I)
+    AttrVec.emplace_back(getSlotIndex(I), pImpl->getSlotNode(I));
 
-  while (IdxI != IdxE)
-    AttrSet.emplace_back(AttributeSet::get(C, std::make_pair(*IdxI++, A)));
-
-  return get(C, AttrSet);
+  return get(C, AttrVec);
 }
 
-AttributeSet AttributeSet::addAttributes(LLVMContext &C, unsigned Index,
-                                         AttributeSet Attrs) const {
+AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index,
+                                           AttributeList Attrs) const {
   if (!pImpl) return Attrs;
   if (!Attrs.pImpl) return *this;
 
+  return addAttributes(C, Index, Attrs.getAttributes(Index));
+}
+
+AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index,
+                                           AttributeSet AS) const {
+  if (!AS.hasAttributes())
+    return *this;
+
 #ifndef NDEBUG
   // FIXME it is not obvious how this should work for alignment. For now, say
   // we can't change a known alignment.
   unsigned OldAlign = getParamAlignment(Index);
-  unsigned NewAlign = Attrs.getParamAlignment(Index);
+  unsigned NewAlign = AS.getAlignment();
   assert((!OldAlign || !NewAlign || OldAlign == NewAlign) &&
          "Attempt to change alignment!");
 #endif
 
-  // Add the attribute slots before the one we're trying to add.
-  SmallVector<AttributeSet, 4> AttrSet;
+  SmallVector<std::pair<unsigned, AttributeSet>, 4> AttrSet;
-  uint64_t NumAttrs = pImpl->getNumSlots();
+  uint64_t NumAttrs = pImpl ? pImpl->getNumSlots() : 0;
-  AttributeSet AS;
-  uint64_t LastIndex = 0;
-  for (unsigned I = 0, E = NumAttrs; I != E; ++I) {
-    if (getSlotIndex(I) >= Index) {
-      if (getSlotIndex(I) == Index) AS = getSlotAttributes(LastIndex++);
-      break;
-    }
-    LastIndex = I + 1;
-    AttrSet.push_back(getSlotAttributes(I));
-  }
+  unsigned I;
 
-  // Now add the attribute into the correct slot. There may already be an
-  // AttributeSet there.
-  AttrBuilder B(AS, Index);
-
-  for (unsigned I = 0, E = Attrs.pImpl->getNumSlots(); I != E; ++I)
-    if (Attrs.getSlotIndex(I) == Index) {
-      for (AttributeSetImpl::iterator II = Attrs.pImpl->begin(I),
-             IE = Attrs.pImpl->end(I); II != IE; ++II)
-        B.addAttribute(*II);
+  // Add all the attribute slots before the one we need to merge.
+  for (I = 0; I < NumAttrs; ++I) {
+    if (getSlotIndex(I) >= Index)
       break;
-    }
+    AttrSet.emplace_back(getSlotIndex(I), pImpl->getSlotNode(I));
+  }
 
-  AttrSet.push_back(AttributeSet::get(C, Index, B));
+  if (I < NumAttrs && getSlotIndex(I) == Index) {
+    // We need to merge two AttributeSets.
+    AttributeSet Merged = AttributeSet::get(
+        C, AttrBuilder(pImpl->getSlotNode(I)).merge(AttrBuilder(AS)));
+    AttrSet.emplace_back(Index, Merged);
+    ++I;
+  } else {
+    // Otherwise there is no slot at this index (this also covers an empty
+    // list, which has zero slots). Add the set as is.
+    AttrSet.emplace_back(Index, AS);
+  }
 
-  // Add the remaining attribute slots.
-  for (unsigned I = LastIndex, E = NumAttrs; I < E; ++I)
-    AttrSet.push_back(getSlotAttributes(I));
+  // Add the remaining entries.
+  for (; I < NumAttrs; ++I)
+    AttrSet.emplace_back(getSlotIndex(I), pImpl->getSlotNode(I));
 
   return get(C, AttrSet);
 }
 
-AttributeSet AttributeSet::removeAttribute(LLVMContext &C, unsigned Index,
-                                           Attribute::AttrKind Kind) const {
+AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index,
+                                           const AttrBuilder &B) const {
+  return addAttributes(C, Index, AttributeList::get(C, Index, B));
+}
+
+AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index,
+                                             Attribute::AttrKind Kind) const {
   if (!hasAttribute(Index, Kind)) return *this;
-  return removeAttributes(C, Index, AttributeSet::get(C, Index, Kind));
+  return removeAttributes(C, Index, AttributeList::get(C, Index, Kind));
 }
 
-AttributeSet AttributeSet::removeAttribute(LLVMContext &C, unsigned Index,
-                                           StringRef Kind) const {
+AttributeList AttributeList::removeAttribute(LLVMContext &C, unsigned Index,
+                                             StringRef Kind) const {
   if (!hasAttribute(Index, Kind)) return *this;
-  return removeAttributes(C, Index, AttributeSet::get(C, Index, Kind));
+  return removeAttributes(C, Index, AttributeList::get(C, Index, Kind));
 }
 
-AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
-                                            AttributeSet Attrs) const {
-  if (!pImpl) return AttributeSet();
+AttributeList AttributeList::removeAttributes(LLVMContext &C, unsigned Index,
+                                              AttributeList Attrs) const {
+  if (!pImpl)
+    return AttributeList();
   if (!Attrs.pImpl) return *this;
 
   // FIXME it is not obvious how this should work for alignment.
@@ -867,13 +1056,13 @@ AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
          "Attempt to change alignment!");
 
   // Add the attribute slots before the one we're trying to add.
-  SmallVector<AttributeSet, 4> AttrSet;
+  SmallVector<AttributeList, 4> AttrSet;
   uint64_t NumAttrs = pImpl->getNumSlots();
-  AttributeSet AS;
+  AttributeList AL;
   uint64_t LastIndex = 0;
   for (unsigned I = 0, E = NumAttrs; I != E; ++I) {
     if (getSlotIndex(I) >= Index) {
-      if (getSlotIndex(I) == Index) AS = getSlotAttributes(LastIndex++);
+      if (getSlotIndex(I) == Index) AL = getSlotAttributes(LastIndex++);
       break;
     }
     LastIndex = I + 1;
@@ -881,8 +1070,8 @@ AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
   }
 
   // Now remove the attribute from the correct slot. There may already be an
-  // AttributeSet there.
-  AttrBuilder B(AS, Index);
+  // AttributeList there.
+  AttrBuilder B(AL, Index);
 
   for (unsigned I = 0, E = Attrs.pImpl->getNumSlots(); I != E; ++I)
     if (Attrs.getSlotIndex(I) == Index) {
@@ -890,7 +1079,7 @@ AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
       break;
     }
 
-  AttrSet.push_back(AttributeSet::get(C, Index, B));
+  AttrSet.push_back(AttributeList::get(C, Index, B));
 
   // Add the remaining attribute slots.
   for (unsigned I = LastIndex, E = NumAttrs; I < E; ++I)
@@ -899,22 +1088,23 @@ AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
   return get(C, AttrSet);
 }
 
-AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
-                                            const AttrBuilder &Attrs) const {
-  if (!pImpl) return AttributeSet();
+AttributeList AttributeList::removeAttributes(LLVMContext &C, unsigned Index,
+                                              const AttrBuilder &Attrs) const {
+  if (!pImpl)
+    return AttributeList();
 
   // FIXME it is not obvious how this should work for alignment.
   // For now, say we can't pass in alignment, which no current use does.
   assert(!Attrs.hasAlignmentAttr() && "Attempt to change alignment!");
 
   // Add the attribute slots before the one we're trying to add.
-  SmallVector<AttributeSet, 4> AttrSet;
+  SmallVector<AttributeList, 4> AttrSet;
   uint64_t NumAttrs = pImpl->getNumSlots();
-  AttributeSet AS;
+  AttributeList AL;
   uint64_t LastIndex = 0;
   for (unsigned I = 0, E = NumAttrs; I != E; ++I) {
     if (getSlotIndex(I) >= Index) {
-      if (getSlotIndex(I) == Index) AS = getSlotAttributes(LastIndex++);
+      if (getSlotIndex(I) == Index) AL = getSlotAttributes(LastIndex++);
       break;
     }
     LastIndex = I + 1;
@@ -922,11 +1112,11 @@ AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
   }
 
   // Now remove the attribute from the correct slot. There may already be an
-  // AttributeSet there.
-  AttrBuilder B(AS, Index);
+  // AttributeList there.
+  AttrBuilder B(AL, Index);
   B.remove(Attrs);
 
-  AttrSet.push_back(AttributeSet::get(C, Index, B));
+  AttrSet.push_back(AttributeList::get(C, Index, B));
 
   // Add the remaining attribute slots.
   for (unsigned I = LastIndex, E = NumAttrs; I < E; ++I)
@@ -935,94 +1125,91 @@ AttributeSet AttributeSet::removeAttributes(LLVMContext &C, unsigned Index,
   return get(C, AttrSet);
 }
 
-AttributeSet AttributeSet::addDereferenceableAttr(LLVMContext &C, unsigned Index,
-                                                  uint64_t Bytes) const {
-  llvm::AttrBuilder B;
+AttributeList AttributeList::removeAttributes(LLVMContext &C,
+                                              unsigned WithoutIndex) const {
+  if (!pImpl)
+    return AttributeList();
+
+  SmallVector<std::pair<unsigned, AttributeSet>, 4> AttrSet;
+  for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I) {
+    unsigned Index = getSlotIndex(I);
+    if (Index != WithoutIndex)
+      AttrSet.push_back({Index, pImpl->getSlotNode(I)});
+  }
+  return get(C, AttrSet);
+}
+
+AttributeList AttributeList::addDereferenceableAttr(LLVMContext &C,
+                                                    unsigned Index,
+                                                    uint64_t Bytes) const {
+  AttrBuilder B;
   B.addDereferenceableAttr(Bytes);
-  return addAttributes(C, Index, AttributeSet::get(C, Index, B));
+  return addAttributes(C, Index, AttributeList::get(C, Index, B));
 }
 
-AttributeSet AttributeSet::addDereferenceableOrNullAttr(LLVMContext &C,
-                                                        unsigned Index,
-                                                        uint64_t Bytes) const {
-  llvm::AttrBuilder B;
+AttributeList
+AttributeList::addDereferenceableOrNullAttr(LLVMContext &C, unsigned Index,
+                                            uint64_t Bytes) const {
+  AttrBuilder B;
   B.addDereferenceableOrNullAttr(Bytes);
-  return addAttributes(C, Index, AttributeSet::get(C, Index, B));
+  return addAttributes(C, Index, AttributeList::get(C, Index, B));
 }
 
-AttributeSet
-AttributeSet::addAllocSizeAttr(LLVMContext &C, unsigned Index,
-                               unsigned ElemSizeArg,
-                               const Optional<unsigned> &NumElemsArg) {
-  llvm::AttrBuilder B;
+AttributeList
+AttributeList::addAllocSizeAttr(LLVMContext &C, unsigned Index,
+                                unsigned ElemSizeArg,
+                                const Optional<unsigned> &NumElemsArg) {
+  AttrBuilder B;
   B.addAllocSizeAttr(ElemSizeArg, NumElemsArg);
-  return addAttributes(C, Index, AttributeSet::get(C, Index, B));
+  return addAttributes(C, Index, AttributeList::get(C, Index, B));
 }
 
 //===----------------------------------------------------------------------===//
-// AttributeSet Accessor Methods
+// AttributeList Accessor Methods
 //===----------------------------------------------------------------------===//
 
-LLVMContext &AttributeSet::getContext() const {
-  return pImpl->getContext();
-}
+LLVMContext &AttributeList::getContext() const { return pImpl->getContext(); }
 
-AttributeSet AttributeSet::getParamAttributes(unsigned Index) const {
-  return pImpl && hasAttributes(Index) ?
-    AttributeSet::get(pImpl->getContext(),
-                      ArrayRef<std::pair<unsigned, AttributeSetNode*> >(
-                        std::make_pair(Index, getAttributes(Index)))) :
-    AttributeSet();
+AttributeSet AttributeList::getParamAttributes(unsigned Index) const {
+  return getAttributes(Index);
 }
 
-AttributeSet AttributeSet::getRetAttributes() const {
-  return pImpl && hasAttributes(ReturnIndex) ?
-    AttributeSet::get(pImpl->getContext(),
-                      ArrayRef<std::pair<unsigned, AttributeSetNode*> >(
-                        std::make_pair(ReturnIndex,
-                                       getAttributes(ReturnIndex)))) :
-    AttributeSet();
+AttributeSet AttributeList::getRetAttributes() const {
+  return getAttributes(ReturnIndex);
 }
 
-AttributeSet AttributeSet::getFnAttributes() const {
-  return pImpl && hasAttributes(FunctionIndex) ?
-    AttributeSet::get(pImpl->getContext(),
-                      ArrayRef<std::pair<unsigned, AttributeSetNode*> >(
-                        std::make_pair(FunctionIndex,
-                                       getAttributes(FunctionIndex)))) :
-    AttributeSet();
+AttributeSet AttributeList::getFnAttributes() const {
+  return getAttributes(FunctionIndex);
 }
 
-bool AttributeSet::hasAttribute(unsigned Index, Attribute::AttrKind Kind) const{
-  AttributeSetNode *ASN = getAttributes(Index);
-  return ASN && ASN->hasAttribute(Kind);
+bool AttributeList::hasAttribute(unsigned Index,
+                                 Attribute::AttrKind Kind) const {
+  return getAttributes(Index).hasAttribute(Kind);
 }
 
-bool AttributeSet::hasAttribute(unsigned Index, StringRef Kind) const {
-  AttributeSetNode *ASN = getAttributes(Index);
-  return ASN && ASN->hasAttribute(Kind);
+bool AttributeList::hasAttribute(unsigned Index, StringRef Kind) const {
+  return getAttributes(Index).hasAttribute(Kind);
 }
 
-bool AttributeSet::hasAttributes(unsigned Index) const {
-  AttributeSetNode *ASN = getAttributes(Index);
-  return ASN && ASN->hasAttributes();
+bool AttributeList::hasAttributes(unsigned Index) const {
+  return getAttributes(Index).hasAttributes();
 }
 
-bool AttributeSet::hasFnAttribute(Attribute::AttrKind Kind) const {
+bool AttributeList::hasFnAttribute(Attribute::AttrKind Kind) const {
   return pImpl && pImpl->hasFnAttribute(Kind);
 }
 
-bool AttributeSet::hasFnAttribute(StringRef Kind) const {
-  return hasAttribute(AttributeSet::FunctionIndex, Kind);
+bool AttributeList::hasFnAttribute(StringRef Kind) const {
+  return hasAttribute(AttributeList::FunctionIndex, Kind);
 }
 
-bool AttributeSet::hasAttrSomewhere(Attribute::AttrKind Attr,
-                                    unsigned *Index) const {
+bool AttributeList::hasAttrSomewhere(Attribute::AttrKind Attr,
+                                     unsigned *Index) const {
   if (!pImpl) return false;
 
   for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I)
-    for (AttributeSetImpl::iterator II = pImpl->begin(I),
-           IE = pImpl->end(I); II != IE; ++II)
+    for (AttributeListImpl::iterator II = pImpl->begin(I), IE = pImpl->end(I);
+         II != IE; ++II)
       if (II->hasAttribute(Attr)) {
         if (Index) *Index = pImpl->getSlotIndex(I);
         return true;
@@ -1031,94 +1218,85 @@ bool AttributeSet::hasAttrSomewhere(Attribute::AttrKind Attr,
   return false;
 }
 
-Attribute AttributeSet::getAttribute(unsigned Index,
-                                     Attribute::AttrKind Kind) const {
-  AttributeSetNode *ASN = getAttributes(Index);
-  return ASN ? ASN->getAttribute(Kind) : Attribute();
+Attribute AttributeList::getAttribute(unsigned Index,
+                                      Attribute::AttrKind Kind) const {
+  return getAttributes(Index).getAttribute(Kind);
 }
 
-Attribute AttributeSet::getAttribute(unsigned Index,
-                                     StringRef Kind) const {
-  AttributeSetNode *ASN = getAttributes(Index);
-  return ASN ? ASN->getAttribute(Kind) : Attribute();
+Attribute AttributeList::getAttribute(unsigned Index, StringRef Kind) const {
+  return getAttributes(Index).getAttribute(Kind);
 }
 
-unsigned AttributeSet::getParamAlignment(unsigned Index) const {
-  AttributeSetNode *ASN = getAttributes(Index);
-  return ASN ? ASN->getAlignment() : 0;
+unsigned AttributeList::getParamAlignment(unsigned Index) const {
+  return getAttributes(Index).getAlignment();
 }
 
-unsigned AttributeSet::getStackAlignment(unsigned Index) const {
-  AttributeSetNode *ASN = getAttributes(Index);
-  return ASN ? ASN->getStackAlignment() : 0;
+unsigned AttributeList::getStackAlignment(unsigned Index) const {
+  return getAttributes(Index).getStackAlignment();
 }
 
-uint64_t AttributeSet::getDereferenceableBytes(unsigned Index) const {
-  AttributeSetNode *ASN = getAttributes(Index);
-  return ASN ? ASN->getDereferenceableBytes() : 0;
+uint64_t AttributeList::getDereferenceableBytes(unsigned Index) const {
+  return getAttributes(Index).getDereferenceableBytes();
 }
 
-uint64_t AttributeSet::getDereferenceableOrNullBytes(unsigned Index) const {
-  AttributeSetNode *ASN = getAttributes(Index);
-  return ASN ? ASN->getDereferenceableOrNullBytes() : 0;
+uint64_t AttributeList::getDereferenceableOrNullBytes(unsigned Index) const {
+  return getAttributes(Index).getDereferenceableOrNullBytes();
 }
 
 std::pair<unsigned, Optional<unsigned>>
-AttributeSet::getAllocSizeArgs(unsigned Index) const {
-  AttributeSetNode *ASN = getAttributes(Index);
-  return ASN ? ASN->getAllocSizeArgs() : std::make_pair(0u, Optional<unsigned>(0u));
+AttributeList::getAllocSizeArgs(unsigned Index) const {
+  return getAttributes(Index).getAllocSizeArgs();
 }
 
-std::string AttributeSet::getAsString(unsigned Index, bool InAttrGrp) const {
-  AttributeSetNode *ASN = getAttributes(Index);
-  return ASN ? ASN->getAsString(InAttrGrp) : std::string("");
+std::string AttributeList::getAsString(unsigned Index, bool InAttrGrp) const {
+  return getAttributes(Index).getAsString(InAttrGrp);
 }
 
-AttributeSetNode *AttributeSet::getAttributes(unsigned Index) const {
-  if (!pImpl) return nullptr;
+AttributeSet AttributeList::getAttributes(unsigned Index) const {
+  if (!pImpl) return AttributeSet();
 
   // Loop through to find the attribute node we want.
   for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I)
     if (pImpl->getSlotIndex(I) == Index)
       return pImpl->getSlotNode(I);
 
-  return nullptr;
+  return AttributeSet();
 }
 
-AttributeSet::iterator AttributeSet::begin(unsigned Slot) const {
+AttributeList::iterator AttributeList::begin(unsigned Slot) const {
   if (!pImpl)
     return ArrayRef<Attribute>().begin();
   return pImpl->begin(Slot);
 }
 
-AttributeSet::iterator AttributeSet::end(unsigned Slot) const {
+AttributeList::iterator AttributeList::end(unsigned Slot) const {
   if (!pImpl)
     return ArrayRef<Attribute>().end();
   return pImpl->end(Slot);
 }
 
 //===----------------------------------------------------------------------===//
-// AttributeSet Introspection Methods
+// AttributeList Introspection Methods
 //===----------------------------------------------------------------------===//
 
-unsigned AttributeSet::getNumSlots() const {
+unsigned AttributeList::getNumSlots() const {
   return pImpl ? pImpl->getNumSlots() : 0;
 }
 
-unsigned AttributeSet::getSlotIndex(unsigned Slot) const {
+unsigned AttributeList::getSlotIndex(unsigned Slot) const {
   assert(pImpl && Slot < pImpl->getNumSlots() &&
          "Slot # out of range!");
   return pImpl->getSlotIndex(Slot);
 }
 
-AttributeSet AttributeSet::getSlotAttributes(unsigned Slot) const {
+AttributeList AttributeList::getSlotAttributes(unsigned Slot) const {
   assert(pImpl && Slot < pImpl->getNumSlots() &&
          "Slot # out of range!");
   return pImpl->getSlotAttributes(Slot);
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD void AttributeSet::dump() const {
+LLVM_DUMP_METHOD void AttributeList::dump() const {
   dbgs() << "PAL[\n";
 
   for (unsigned i = 0, e = getNumSlots(); i < e; ++i) {
@@ -1139,23 +1317,28 @@ LLVM_DUMP_METHOD void AttributeSet::dump() const {
 // AttrBuilder Method Implementations
 //===----------------------------------------------------------------------===//
 
-AttrBuilder::AttrBuilder(AttributeSet AS, unsigned Index)
-    : Attrs(0), Alignment(0), StackAlignment(0), DerefBytes(0),
-      DerefOrNullBytes(0), AllocSizeArgs(0) {
-  AttributeSetImpl *pImpl = AS.pImpl;
+AttrBuilder::AttrBuilder(AttributeList AL, unsigned Index) {
+  AttributeListImpl *pImpl = AL.pImpl;
   if (!pImpl) return;
 
   for (unsigned I = 0, E = pImpl->getNumSlots(); I != E; ++I) {
     if (pImpl->getSlotIndex(I) != Index) continue;
 
-    for (AttributeSetImpl::iterator II = pImpl->begin(I),
-           IE = pImpl->end(I); II != IE; ++II)
+    for (AttributeListImpl::iterator II = pImpl->begin(I), IE = pImpl->end(I);
+         II != IE; ++II)
       addAttribute(*II);
 
     break;
   }
 }
 
+AttrBuilder::AttrBuilder(AttributeSet AS) {
+  if (AS.hasAttributes()) {
+    for (const Attribute &A : AS)
+      addAttribute(A);
+  }
+}
+
 void AttrBuilder::clear() {
   Attrs.reset();
   TargetDepAttrs.clear();
@@ -1217,7 +1400,7 @@ AttrBuilder &AttrBuilder::removeAttribute(Attribute::AttrKind Val) {
   return *this;
 }
 
-AttrBuilder &AttrBuilder::removeAttributes(AttributeSet A, uint64_t Index) {
+AttrBuilder &AttrBuilder::removeAttributes(AttributeList A, uint64_t Index) {
   unsigned Slot = ~0U;
   for (unsigned I = 0, E = A.getNumSlots(); I != E; ++I)
     if (A.getSlotIndex(I) == Index) {
@@ -1225,9 +1408,10 @@ AttrBuilder &AttrBuilder::removeAttributes(AttributeSet A, uint64_t Index) {
       break;
     }
 
-  assert(Slot != ~0U && "Couldn't find index in AttributeSet!");
+  assert(Slot != ~0U && "Couldn't find index in AttributeList!");
 
-  for (AttributeSet::iterator I = A.begin(Slot), E = A.end(Slot); I != E; ++I) {
+  for (AttributeList::iterator I = A.begin(Slot), E = A.end(Slot); I != E;
+       ++I) {
     Attribute Attr = *I;
     if (Attr.isEnumAttribute() || Attr.isIntAttribute()) {
       removeAttribute(Attr.getKindAsEnum());
@@ -1363,7 +1547,7 @@ bool AttrBuilder::overlaps(const AttrBuilder &B) const {
     return true;
 
   // Then check if any target dependent ones do.
-  for (auto I : td_attrs())
+  for (const auto &I : td_attrs())
     if (B.contains(I.first))
       return true;
 
@@ -1378,7 +1562,7 @@ bool AttrBuilder::hasAttributes() const {
   return !Attrs.none() || !TargetDepAttrs.empty();
 }
 
-bool AttrBuilder::hasAttributes(AttributeSet A, uint64_t Index) const {
+bool AttrBuilder::hasAttributes(AttributeList A, uint64_t Index) const {
   unsigned Slot = ~0U;
   for (unsigned I = 0, E = A.getNumSlots(); I != E; ++I)
     if (A.getSlotIndex(I) == Index) {
@@ -1388,7 +1572,8 @@ bool AttrBuilder::hasAttributes(AttributeSet A, uint64_t Index) const {
 
   assert(Slot != ~0U && "Couldn't find the index!");
 
-  for (AttributeSet::iterator I = A.begin(Slot), E = A.end(Slot); I != E; ++I) {
+  for (AttributeList::iterator I = A.begin(Slot), E = A.end(Slot); I != E;
+       ++I) {
     Attribute Attr = *I;
     if (Attr.isEnumAttribute() || Attr.isIntAttribute()) {
       if (Attrs[I->getKindAsEnum()])
@@ -1489,16 +1674,15 @@ static void adjustCallerSSPLevel(Function &Caller, const Function &Callee) {
   B.addAttribute(Attribute::StackProtect)
     .addAttribute(Attribute::StackProtectStrong)
     .addAttribute(Attribute::StackProtectReq);
-  AttributeSet OldSSPAttr = AttributeSet::get(Caller.getContext(),
-                                              AttributeSet::FunctionIndex,
-                                              B);
+  AttributeList OldSSPAttr =
+      AttributeList::get(Caller.getContext(), AttributeList::FunctionIndex, B);
 
   if (Callee.hasFnAttribute(Attribute::StackProtectReq)) {
-    Caller.removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
+    Caller.removeAttributes(AttributeList::FunctionIndex, OldSSPAttr);
     Caller.addFnAttr(Attribute::StackProtectReq);
   } else if (Callee.hasFnAttribute(Attribute::StackProtectStrong) &&
              !Caller.hasFnAttribute(Attribute::StackProtectReq)) {
-    Caller.removeAttributes(AttributeSet::FunctionIndex, OldSSPAttr);
+    Caller.removeAttributes(AttributeList::FunctionIndex, OldSSPAttr);
     Caller.addFnAttr(Attribute::StackProtectStrong);
   } else if (Callee.hasFnAttribute(Attribute::StackProtect) &&
              !Caller.hasFnAttribute(Attribute::StackProtectReq) &&
@@ -1514,7 +1698,6 @@ bool AttributeFuncs::areInlineCompatible(const Function &Caller,
   return hasCompatibleFnAttrs(Caller, Callee);
 }
 
-
 void AttributeFuncs::mergeAttributesForInlining(Function &Caller,
                                                 const Function &Callee) {
   mergeFnAttrs(Caller, Callee);
diff --git a/lib/IR/AutoUpgrade.cpp b/lib/IR/AutoUpgrade.cpp
index b68bc3f056172e77062bfe658e119d8f899df668..0262e2cc05e85a971ea29699727e899de068b186 100644
--- a/lib/IR/AutoUpgrade.cpp
+++ b/lib/IR/AutoUpgrade.cpp
@@ -34,10 +34,10 @@ using namespace llvm;
 
 static void rename(GlobalValue *GV) { GV->setName(GV->getName() + ".old"); }
 
-// Upgrade the declarations of the SSE4.1 functions whose arguments have
+// Upgrade the declarations of the SSE4.1 ptest intrinsics whose arguments have
 // changed their type from v4f32 to v2i64.
-static bool UpgradeSSE41Function(Function* F, Intrinsic::ID IID,
-                                 Function *&NewFn) {
+static bool UpgradePTESTIntrinsic(Function* F, Intrinsic::ID IID,
+                                  Function *&NewFn) {
   // Check whether this is an old version of the function, which received
   // v4f32 arguments.
   Type *Arg0Type = F->getFunctionType()->getParamType(0);
@@ -66,6 +66,262 @@ static bool UpgradeX86IntrinsicsWith8BitMask(Function *F, Intrinsic::ID IID,
   return true;
 }
 
+static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
+  // Every intrinsic matched below should be annotated with the LLVM version
+  // that started auto-upgrading it. At some point in the future we would
+  // like to use this information to remove upgrade code for some older
+  // intrinsics. It is currently undecided how we will determine that future
+  // point.
+  if (Name.startswith("sse2.pcmpeq.") || // Added in 3.1
+      Name.startswith("sse2.pcmpgt.") || // Added in 3.1
+      Name.startswith("avx2.pcmpeq.") || // Added in 3.1
+      Name.startswith("avx2.pcmpgt.") || // Added in 3.1
+      Name.startswith("avx512.mask.pcmpeq.") || // Added in 3.9
+      Name.startswith("avx512.mask.pcmpgt.") || // Added in 3.9
+      Name == "sse.add.ss" || // Added in 4.0
+      Name == "sse2.add.sd" || // Added in 4.0
+      Name == "sse.sub.ss" || // Added in 4.0
+      Name == "sse2.sub.sd" || // Added in 4.0
+      Name == "sse.mul.ss" || // Added in 4.0
+      Name == "sse2.mul.sd" || // Added in 4.0
+      Name == "sse.div.ss" || // Added in 4.0
+      Name == "sse2.div.sd" || // Added in 4.0
+      Name == "sse41.pmaxsb" || // Added in 3.9
+      Name == "sse2.pmaxs.w" || // Added in 3.9
+      Name == "sse41.pmaxsd" || // Added in 3.9
+      Name == "sse2.pmaxu.b" || // Added in 3.9
+      Name == "sse41.pmaxuw" || // Added in 3.9
+      Name == "sse41.pmaxud" || // Added in 3.9
+      Name == "sse41.pminsb" || // Added in 3.9
+      Name == "sse2.pmins.w" || // Added in 3.9
+      Name == "sse41.pminsd" || // Added in 3.9
+      Name == "sse2.pminu.b" || // Added in 3.9
+      Name == "sse41.pminuw" || // Added in 3.9
+      Name == "sse41.pminud" || // Added in 3.9
+      Name.startswith("avx512.mask.pshuf.b.") || // Added in 4.0
+      Name.startswith("avx2.pmax") || // Added in 3.9
+      Name.startswith("avx2.pmin") || // Added in 3.9
+      Name.startswith("avx512.mask.pmax") || // Added in 4.0
+      Name.startswith("avx512.mask.pmin") || // Added in 4.0
+      Name.startswith("avx2.vbroadcast") || // Added in 3.8
+      Name.startswith("avx2.pbroadcast") || // Added in 3.8
+      Name.startswith("avx.vpermil.") || // Added in 3.1
+      Name.startswith("sse2.pshuf") || // Added in 3.9
+      Name.startswith("avx512.pbroadcast") || // Added in 3.9
+      Name.startswith("avx512.mask.broadcast.s") || // Added in 3.9
+      Name.startswith("avx512.mask.movddup") || // Added in 3.9
+      Name.startswith("avx512.mask.movshdup") || // Added in 3.9
+      Name.startswith("avx512.mask.movsldup") || // Added in 3.9
+      Name.startswith("avx512.mask.pshuf.d.") || // Added in 3.9
+      Name.startswith("avx512.mask.pshufl.w.") || // Added in 3.9
+      Name.startswith("avx512.mask.pshufh.w.") || // Added in 3.9
+      Name.startswith("avx512.mask.shuf.p") || // Added in 4.0
+      Name.startswith("avx512.mask.vpermil.p") || // Added in 3.9
+      Name.startswith("avx512.mask.perm.df.") || // Added in 3.9
+      Name.startswith("avx512.mask.perm.di.") || // Added in 3.9
+      Name.startswith("avx512.mask.punpckl") || // Added in 3.9
+      Name.startswith("avx512.mask.punpckh") || // Added in 3.9
+      Name.startswith("avx512.mask.unpckl.") || // Added in 3.9
+      Name.startswith("avx512.mask.unpckh.") || // Added in 3.9
+      Name.startswith("avx512.mask.pand.") || // Added in 3.9
+      Name.startswith("avx512.mask.pandn.") || // Added in 3.9
+      Name.startswith("avx512.mask.por.") || // Added in 3.9
+      Name.startswith("avx512.mask.pxor.") || // Added in 3.9
+      Name.startswith("avx512.mask.and.") || // Added in 3.9
+      Name.startswith("avx512.mask.andn.") || // Added in 3.9
+      Name.startswith("avx512.mask.or.") || // Added in 3.9
+      Name.startswith("avx512.mask.xor.") || // Added in 3.9
+      Name.startswith("avx512.mask.padd.") || // Added in 4.0
+      Name.startswith("avx512.mask.psub.") || // Added in 4.0
+      Name.startswith("avx512.mask.pmull.") || // Added in 4.0
+      Name.startswith("avx512.mask.cvtdq2pd.") || // Added in 4.0
+      Name.startswith("avx512.mask.cvtudq2pd.") || // Added in 4.0
+      Name.startswith("avx512.mask.pmul.dq.") || // Added in 4.0
+      Name.startswith("avx512.mask.pmulu.dq.") || // Added in 4.0
+      Name.startswith("avx512.mask.packsswb.") || // Added in 5.0
+      Name.startswith("avx512.mask.packssdw.") || // Added in 5.0
+      Name.startswith("avx512.mask.packuswb.") || // Added in 5.0
+      Name.startswith("avx512.mask.packusdw.") || // Added in 5.0
+      Name == "avx512.mask.add.pd.128" || // Added in 4.0
+      Name == "avx512.mask.add.pd.256" || // Added in 4.0
+      Name == "avx512.mask.add.ps.128" || // Added in 4.0
+      Name == "avx512.mask.add.ps.256" || // Added in 4.0
+      Name == "avx512.mask.div.pd.128" || // Added in 4.0
+      Name == "avx512.mask.div.pd.256" || // Added in 4.0
+      Name == "avx512.mask.div.ps.128" || // Added in 4.0
+      Name == "avx512.mask.div.ps.256" || // Added in 4.0
+      Name == "avx512.mask.mul.pd.128" || // Added in 4.0
+      Name == "avx512.mask.mul.pd.256" || // Added in 4.0
+      Name == "avx512.mask.mul.ps.128" || // Added in 4.0
+      Name == "avx512.mask.mul.ps.256" || // Added in 4.0
+      Name == "avx512.mask.sub.pd.128" || // Added in 4.0
+      Name == "avx512.mask.sub.pd.256" || // Added in 4.0
+      Name == "avx512.mask.sub.ps.128" || // Added in 4.0
+      Name == "avx512.mask.sub.ps.256" || // Added in 4.0
+      Name == "avx512.mask.max.pd.128" || // Added in 5.0
+      Name == "avx512.mask.max.pd.256" || // Added in 5.0
+      Name == "avx512.mask.max.ps.128" || // Added in 5.0
+      Name == "avx512.mask.max.ps.256" || // Added in 5.0
+      Name == "avx512.mask.min.pd.128" || // Added in 5.0
+      Name == "avx512.mask.min.pd.256" || // Added in 5.0
+      Name == "avx512.mask.min.ps.128" || // Added in 5.0
+      Name == "avx512.mask.min.ps.256" || // Added in 5.0
+      Name.startswith("avx512.mask.vpermilvar.") || // Added in 4.0
+      Name.startswith("avx512.mask.psll.d") || // Added in 4.0
+      Name.startswith("avx512.mask.psll.q") || // Added in 4.0
+      Name.startswith("avx512.mask.psll.w") || // Added in 4.0
+      Name.startswith("avx512.mask.psra.d") || // Added in 4.0
+      Name.startswith("avx512.mask.psra.q") || // Added in 4.0
+      Name.startswith("avx512.mask.psra.w") || // Added in 4.0
+      Name.startswith("avx512.mask.psrl.d") || // Added in 4.0
+      Name.startswith("avx512.mask.psrl.q") || // Added in 4.0
+      Name.startswith("avx512.mask.psrl.w") || // Added in 4.0
+      Name.startswith("avx512.mask.pslli") || // Added in 4.0
+      Name.startswith("avx512.mask.psrai") || // Added in 4.0
+      Name.startswith("avx512.mask.psrli") || // Added in 4.0
+      Name.startswith("avx512.mask.psllv") || // Added in 4.0
+      Name.startswith("avx512.mask.psrav") || // Added in 4.0
+      Name.startswith("avx512.mask.psrlv") || // Added in 4.0
+      Name.startswith("sse41.pmovsx") || // Added in 3.8
+      Name.startswith("sse41.pmovzx") || // Added in 3.9
+      Name.startswith("avx2.pmovsx") || // Added in 3.9
+      Name.startswith("avx2.pmovzx") || // Added in 3.9
+      Name.startswith("avx512.mask.pmovsx") || // Added in 4.0
+      Name.startswith("avx512.mask.pmovzx") || // Added in 4.0
+      Name.startswith("avx512.mask.lzcnt.") || // Added in 5.0
+      Name == "sse2.cvtdq2pd" || // Added in 3.9
+      Name == "sse2.cvtps2pd" || // Added in 3.9
+      Name == "avx.cvtdq2.pd.256" || // Added in 3.9
+      Name == "avx.cvt.ps2.pd.256" || // Added in 3.9
+      Name.startswith("avx.vinsertf128.") || // Added in 3.7
+      Name == "avx2.vinserti128" || // Added in 3.7
+      Name.startswith("avx512.mask.insert") || // Added in 4.0
+      Name.startswith("avx.vextractf128.") || // Added in 3.7
+      Name == "avx2.vextracti128" || // Added in 3.7
+      Name.startswith("avx512.mask.vextract") || // Added in 4.0
+      Name.startswith("sse4a.movnt.") || // Added in 3.9
+      Name.startswith("avx.movnt.") || // Added in 3.2
+      Name.startswith("avx512.storent.") || // Added in 3.9
+      Name == "sse2.storel.dq" || // Added in 3.9
+      Name.startswith("sse.storeu.") || // Added in 3.9
+      Name.startswith("sse2.storeu.") || // Added in 3.9
+      Name.startswith("avx.storeu.") || // Added in 3.9
+      Name.startswith("avx512.mask.storeu.") || // Added in 3.9
+      Name.startswith("avx512.mask.store.p") || // Added in 3.9
+      Name.startswith("avx512.mask.store.b.") || // Added in 3.9
+      Name.startswith("avx512.mask.store.w.") || // Added in 3.9
+      Name.startswith("avx512.mask.store.d.") || // Added in 3.9
+      Name.startswith("avx512.mask.store.q.") || // Added in 3.9
+      Name.startswith("avx512.mask.loadu.") || // Added in 3.9
+      Name.startswith("avx512.mask.load.") || // Added in 3.9
+      Name == "sse42.crc32.64.8" || // Added in 3.4
+      Name.startswith("avx.vbroadcast.s") || // Added in 3.5
+      Name.startswith("avx512.mask.palignr.") || // Added in 3.9
+      Name.startswith("avx512.mask.valign.") || // Added in 4.0
+      Name.startswith("sse2.psll.dq") || // Added in 3.7
+      Name.startswith("sse2.psrl.dq") || // Added in 3.7
+      Name.startswith("avx2.psll.dq") || // Added in 3.7
+      Name.startswith("avx2.psrl.dq") || // Added in 3.7
+      Name.startswith("avx512.psll.dq") || // Added in 3.9
+      Name.startswith("avx512.psrl.dq") || // Added in 3.9
+      Name == "sse41.pblendw" || // Added in 3.7
+      Name.startswith("sse41.blendp") || // Added in 3.7
+      Name.startswith("avx.blend.p") || // Added in 3.7
+      Name == "avx2.pblendw" || // Added in 3.7
+      Name.startswith("avx2.pblendd.") || // Added in 3.7
+      Name.startswith("avx.vbroadcastf128") || // Added in 4.0
+      Name == "avx2.vbroadcasti128" || // Added in 3.7
+      Name == "xop.vpcmov" || // Added in 3.8
+      Name == "xop.vpcmov.256" || // Added in 5.0
+      Name.startswith("avx512.mask.move.s") || // Added in 4.0
+      Name.startswith("avx512.cvtmask2") || // Added in 5.0
+      (Name.startswith("xop.vpcom") && // Added in 3.2
+       F->arg_size() == 2))
+    return true;
+
+  return false;
+}
+
+static bool UpgradeX86IntrinsicFunction(Function *F, StringRef Name,
+                                        Function *&NewFn) {
+  // Only handle intrinsics that start with "x86.".
+  if (!Name.startswith("x86."))
+    return false;
+  // Remove "x86." prefix.
+  Name = Name.substr(4);
+
+  if (ShouldUpgradeX86Intrinsic(F, Name)) {
+    NewFn = nullptr;
+    return true;
+  }
+
+  // SSE4.1 ptest functions may have an old signature.
+  if (Name.startswith("sse41.ptest")) { // Added in 3.2
+    if (Name.substr(11) == "c")
+      return UpgradePTESTIntrinsic(F, Intrinsic::x86_sse41_ptestc, NewFn);
+    if (Name.substr(11) == "z")
+      return UpgradePTESTIntrinsic(F, Intrinsic::x86_sse41_ptestz, NewFn);
+    if (Name.substr(11) == "nzc")
+      return UpgradePTESTIntrinsic(F, Intrinsic::x86_sse41_ptestnzc, NewFn);
+  }
+  // Several blend and other instructions with masks used the wrong number of
+  // bits.
+  if (Name == "sse41.insertps") // Added in 3.6
+    return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_insertps,
+                                            NewFn);
+  if (Name == "sse41.dppd") // Added in 3.6
+    return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_dppd,
+                                            NewFn);
+  if (Name == "sse41.dpps") // Added in 3.6
+    return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_dpps,
+                                            NewFn);
+  if (Name == "sse41.mpsadbw") // Added in 3.6
+    return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_mpsadbw,
+                                            NewFn);
+  if (Name == "avx.dp.ps.256") // Added in 3.6
+    return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx_dp_ps_256,
+                                            NewFn);
+  if (Name == "avx2.mpsadbw") // Added in 3.6
+    return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx2_mpsadbw,
+                                            NewFn);
+
+  // xop.vfrcz.ss/sd may need to have an argument dropped. Added in 3.2.
+  if (Name.startswith("xop.vfrcz.ss") && F->arg_size() == 2) {
+    rename(F);
+    NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                      Intrinsic::x86_xop_vfrcz_ss);
+    return true;
+  }
+  if (Name.startswith("xop.vfrcz.sd") && F->arg_size() == 2) {
+    rename(F);
+    NewFn = Intrinsic::getDeclaration(F->getParent(),
+                                      Intrinsic::x86_xop_vfrcz_sd);
+    return true;
+  }
+  // Upgrade any XOP PERMIL2 index operand still using a float/double vector.
+  if (Name.startswith("xop.vpermil2")) { // Added in 3.9
+    auto Idx = F->getFunctionType()->getParamType(2);
+    if (Idx->isFPOrFPVectorTy()) {
+      rename(F);
+      unsigned IdxSize = Idx->getPrimitiveSizeInBits();
+      unsigned EltSize = Idx->getScalarSizeInBits();
+      Intrinsic::ID Permil2ID;
+      if (EltSize == 64 && IdxSize == 128)
+        Permil2ID = Intrinsic::x86_xop_vpermil2pd;
+      else if (EltSize == 32 && IdxSize == 128)
+        Permil2ID = Intrinsic::x86_xop_vpermil2ps;
+      else if (EltSize == 64 && IdxSize == 256)
+        Permil2ID = Intrinsic::x86_xop_vpermil2pd_256;
+      else
+        Permil2ID = Intrinsic::x86_xop_vpermil2ps_256;
+      NewFn = Intrinsic::getDeclaration(F->getParent(), Permil2ID);
+      return true;
+    }
+  }
+
+  return false;
+}
+
 static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
   assert(F && "Illegal to upgrade a non-existent Function.");
 
@@ -156,26 +412,31 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
     }
     break;
   }
-  case 'i': {
-    if (Name.startswith("invariant.start")) {
+  case 'i':
+  case 'l': {
+    bool IsLifetimeStart = Name.startswith("lifetime.start");
+    if (IsLifetimeStart || Name.startswith("invariant.start")) {
+      Intrinsic::ID ID = IsLifetimeStart ?
+        Intrinsic::lifetime_start : Intrinsic::invariant_start;
       auto Args = F->getFunctionType()->params();
       Type* ObjectPtr[1] = {Args[1]};
-      if (F->getName() !=
-          Intrinsic::getName(Intrinsic::invariant_start, ObjectPtr)) {
+      if (F->getName() != Intrinsic::getName(ID, ObjectPtr)) {
         rename(F);
-        NewFn = Intrinsic::getDeclaration(
-            F->getParent(), Intrinsic::invariant_start, ObjectPtr);
+        NewFn = Intrinsic::getDeclaration(F->getParent(), ID, ObjectPtr);
         return true;
       }
     }
-    if (Name.startswith("invariant.end")) {
+
+    bool IsLifetimeEnd = Name.startswith("lifetime.end");
+    if (IsLifetimeEnd || Name.startswith("invariant.end")) {
+      Intrinsic::ID ID = IsLifetimeEnd ?
+        Intrinsic::lifetime_end : Intrinsic::invariant_end;
+
       auto Args = F->getFunctionType()->params();
-      Type* ObjectPtr[1] = {Args[2]};
-      if (F->getName() !=
-          Intrinsic::getName(Intrinsic::invariant_end, ObjectPtr)) {
+      Type* ObjectPtr[1] = {Args[IsLifetimeEnd ? 1 : 2]};
+      if (F->getName() != Intrinsic::getName(ID, ObjectPtr)) {
         rename(F);
-        NewFn = Intrinsic::getDeclaration(F->getParent(),
-                                          Intrinsic::invariant_end, ObjectPtr);
+        NewFn = Intrinsic::getDeclaration(F->getParent(), ID, ObjectPtr);
         return true;
       }
     }
@@ -240,12 +501,13 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
   case 'o':
     // We only need to change the name to match the mangling including the
     // address space.
-    if (F->arg_size() == 2 && Name.startswith("objectsize.")) {
+    if (Name.startswith("objectsize.")) {
       Type *Tys[2] = { F->getReturnType(), F->arg_begin()->getType() };
-      if (F->getName() != Intrinsic::getName(Intrinsic::objectsize, Tys)) {
+      if (F->arg_size() == 2 ||
+          F->getName() != Intrinsic::getName(Intrinsic::objectsize, Tys)) {
         rename(F);
-        NewFn = Intrinsic::getDeclaration(F->getParent(),
-                                          Intrinsic::objectsize, Tys);
+        NewFn = Intrinsic::getDeclaration(F->getParent(), Intrinsic::objectsize,
+                                          Tys);
         return true;
       }
     }
@@ -258,236 +520,15 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) {
     }
     break;
 
-  case 'x': {
-    bool IsX86 = Name.startswith("x86.");
-    if (IsX86)
-      Name = Name.substr(4);
-
-    // All of the intrinsics matches below should be marked with which llvm
-    // version started autoupgrading them. At some point in the future we would
-    // like to use this information to remove upgrade code for some older
-    // intrinsics. It is currently undecided how we will determine that future
-    // point.
-    if (IsX86 &&
-        (Name.startswith("sse2.pcmpeq.") || // Added in 3.1
-         Name.startswith("sse2.pcmpgt.") || // Added in 3.1
-         Name.startswith("avx2.pcmpeq.") || // Added in 3.1
-         Name.startswith("avx2.pcmpgt.") || // Added in 3.1
-         Name.startswith("avx512.mask.pcmpeq.") || // Added in 3.9
-         Name.startswith("avx512.mask.pcmpgt.") || // Added in 3.9
-         Name == "sse.add.ss" || // Added in 4.0
-         Name == "sse2.add.sd" || // Added in 4.0
-         Name == "sse.sub.ss" || // Added in 4.0
-         Name == "sse2.sub.sd" || // Added in 4.0
-         Name == "sse.mul.ss" || // Added in 4.0
-         Name == "sse2.mul.sd" || // Added in 4.0
-         Name == "sse.div.ss" || // Added in 4.0
-         Name == "sse2.div.sd" || // Added in 4.0
-         Name == "sse41.pmaxsb" || // Added in 3.9
-         Name == "sse2.pmaxs.w" || // Added in 3.9
-         Name == "sse41.pmaxsd" || // Added in 3.9
-         Name == "sse2.pmaxu.b" || // Added in 3.9
-         Name == "sse41.pmaxuw" || // Added in 3.9
-         Name == "sse41.pmaxud" || // Added in 3.9
-         Name == "sse41.pminsb" || // Added in 3.9
-         Name == "sse2.pmins.w" || // Added in 3.9
-         Name == "sse41.pminsd" || // Added in 3.9
-         Name == "sse2.pminu.b" || // Added in 3.9
-         Name == "sse41.pminuw" || // Added in 3.9
-         Name == "sse41.pminud" || // Added in 3.9
-         Name.startswith("avx512.mask.pshuf.b.") || // Added in 4.0
-         Name.startswith("avx2.pmax") || // Added in 3.9
-         Name.startswith("avx2.pmin") || // Added in 3.9
-         Name.startswith("avx512.mask.pmax") || // Added in 4.0
-         Name.startswith("avx512.mask.pmin") || // Added in 4.0
-         Name.startswith("avx2.vbroadcast") || // Added in 3.8
-         Name.startswith("avx2.pbroadcast") || // Added in 3.8
-         Name.startswith("avx.vpermil.") || // Added in 3.1
-         Name.startswith("sse2.pshuf") || // Added in 3.9
-         Name.startswith("avx512.pbroadcast") || // Added in 3.9
-         Name.startswith("avx512.mask.broadcast.s") || // Added in 3.9
-         Name.startswith("avx512.mask.movddup") || // Added in 3.9
-         Name.startswith("avx512.mask.movshdup") || // Added in 3.9
-         Name.startswith("avx512.mask.movsldup") || // Added in 3.9
-         Name.startswith("avx512.mask.pshuf.d.") || // Added in 3.9
-         Name.startswith("avx512.mask.pshufl.w.") || // Added in 3.9
-         Name.startswith("avx512.mask.pshufh.w.") || // Added in 3.9
-         Name.startswith("avx512.mask.shuf.p") || // Added in 4.0
-         Name.startswith("avx512.mask.vpermil.p") || // Added in 3.9
-         Name.startswith("avx512.mask.perm.df.") || // Added in 3.9
-         Name.startswith("avx512.mask.perm.di.") || // Added in 3.9
-         Name.startswith("avx512.mask.punpckl") || // Added in 3.9
-         Name.startswith("avx512.mask.punpckh") || // Added in 3.9
-         Name.startswith("avx512.mask.unpckl.") || // Added in 3.9
-         Name.startswith("avx512.mask.unpckh.") || // Added in 3.9
-         Name.startswith("avx512.mask.pand.") || // Added in 3.9
-         Name.startswith("avx512.mask.pandn.") || // Added in 3.9
-         Name.startswith("avx512.mask.por.") || // Added in 3.9
-         Name.startswith("avx512.mask.pxor.") || // Added in 3.9
-         Name.startswith("avx512.mask.and.") || // Added in 3.9
-         Name.startswith("avx512.mask.andn.") || // Added in 3.9
-         Name.startswith("avx512.mask.or.") || // Added in 3.9
-         Name.startswith("avx512.mask.xor.") || // Added in 3.9
-         Name.startswith("avx512.mask.padd.") || // Added in 4.0
-         Name.startswith("avx512.mask.psub.") || // Added in 4.0
-         Name.startswith("avx512.mask.pmull.") || // Added in 4.0
-         Name.startswith("avx512.mask.cvtdq2pd.") || // Added in 4.0
-         Name.startswith("avx512.mask.cvtudq2pd.") || // Added in 4.0
-         Name.startswith("avx512.mask.pmul.dq.") || // Added in 4.0
-         Name.startswith("avx512.mask.pmulu.dq.") || // Added in 4.0
-         Name == "avx512.mask.add.pd.128" || // Added in 4.0
-         Name == "avx512.mask.add.pd.256" || // Added in 4.0
-         Name == "avx512.mask.add.ps.128" || // Added in 4.0
-         Name == "avx512.mask.add.ps.256" || // Added in 4.0
-         Name == "avx512.mask.div.pd.128" || // Added in 4.0
-         Name == "avx512.mask.div.pd.256" || // Added in 4.0
-         Name == "avx512.mask.div.ps.128" || // Added in 4.0
-         Name == "avx512.mask.div.ps.256" || // Added in 4.0
-         Name == "avx512.mask.mul.pd.128" || // Added in 4.0
-         Name == "avx512.mask.mul.pd.256" || // Added in 4.0
-         Name == "avx512.mask.mul.ps.128" || // Added in 4.0
-         Name == "avx512.mask.mul.ps.256" || // Added in 4.0
-         Name == "avx512.mask.sub.pd.128" || // Added in 4.0
-         Name == "avx512.mask.sub.pd.256" || // Added in 4.0
-         Name == "avx512.mask.sub.ps.128" || // Added in 4.0
-         Name == "avx512.mask.sub.ps.256" || // Added in 4.0
-         Name.startswith("avx512.mask.vpermilvar.") || // Added in 4.0
-         Name.startswith("avx512.mask.psll.d") || // Added in 4.0
-         Name.startswith("avx512.mask.psll.q") || // Added in 4.0
-         Name.startswith("avx512.mask.psll.w") || // Added in 4.0
-         Name.startswith("avx512.mask.psra.d") || // Added in 4.0
-         Name.startswith("avx512.mask.psra.q") || // Added in 4.0
-         Name.startswith("avx512.mask.psra.w") || // Added in 4.0
-         Name.startswith("avx512.mask.psrl.d") || // Added in 4.0
-         Name.startswith("avx512.mask.psrl.q") || // Added in 4.0
-         Name.startswith("avx512.mask.psrl.w") || // Added in 4.0
-         Name.startswith("avx512.mask.pslli") || // Added in 4.0
-         Name.startswith("avx512.mask.psrai") || // Added in 4.0
-         Name.startswith("avx512.mask.psrli") || // Added in 4.0
-         Name.startswith("avx512.mask.psllv") || // Added in 4.0
-         Name.startswith("avx512.mask.psrav") || // Added in 4.0
-         Name.startswith("avx512.mask.psrlv") || // Added in 4.0
-         Name.startswith("sse41.pmovsx") || // Added in 3.8
-         Name.startswith("sse41.pmovzx") || // Added in 3.9
-         Name.startswith("avx2.pmovsx") || // Added in 3.9
-         Name.startswith("avx2.pmovzx") || // Added in 3.9
-         Name.startswith("avx512.mask.pmovsx") || // Added in 4.0
-         Name.startswith("avx512.mask.pmovzx") || // Added in 4.0
-         Name == "sse2.cvtdq2pd" || // Added in 3.9
-         Name == "sse2.cvtps2pd" || // Added in 3.9
-         Name == "avx.cvtdq2.pd.256" || // Added in 3.9
-         Name == "avx.cvt.ps2.pd.256" || // Added in 3.9
-         Name.startswith("avx.vinsertf128.") || // Added in 3.7
-         Name == "avx2.vinserti128" || // Added in 3.7
-         Name.startswith("avx512.mask.insert") || // Added in 4.0
-         Name.startswith("avx.vextractf128.") || // Added in 3.7
-         Name == "avx2.vextracti128" || // Added in 3.7
-         Name.startswith("avx512.mask.vextract") || // Added in 4.0
-         Name.startswith("sse4a.movnt.") || // Added in 3.9
-         Name.startswith("avx.movnt.") || // Added in 3.2
-         Name.startswith("avx512.storent.") || // Added in 3.9
-         Name == "sse2.storel.dq" || // Added in 3.9
-         Name.startswith("sse.storeu.") || // Added in 3.9
-         Name.startswith("sse2.storeu.") || // Added in 3.9
-         Name.startswith("avx.storeu.") || // Added in 3.9
-         Name.startswith("avx512.mask.storeu.") || // Added in 3.9
-         Name.startswith("avx512.mask.store.p") || // Added in 3.9
-         Name.startswith("avx512.mask.store.b.") || // Added in 3.9
-         Name.startswith("avx512.mask.store.w.") || // Added in 3.9
-         Name.startswith("avx512.mask.store.d.") || // Added in 3.9
-         Name.startswith("avx512.mask.store.q.") || // Added in 3.9
-         Name.startswith("avx512.mask.loadu.") || // Added in 3.9
-         Name.startswith("avx512.mask.load.") || // Added in 3.9
-         Name == "sse42.crc32.64.8" || // Added in 3.4
-         Name.startswith("avx.vbroadcast.s") || // Added in 3.5
-         Name.startswith("avx512.mask.palignr.") || // Added in 3.9
-         Name.startswith("avx512.mask.valign.") || // Added in 4.0
-         Name.startswith("sse2.psll.dq") || // Added in 3.7
-         Name.startswith("sse2.psrl.dq") || // Added in 3.7
-         Name.startswith("avx2.psll.dq") || // Added in 3.7
-         Name.startswith("avx2.psrl.dq") || // Added in 3.7
-         Name.startswith("avx512.psll.dq") || // Added in 3.9
-         Name.startswith("avx512.psrl.dq") || // Added in 3.9
-         Name == "sse41.pblendw" || // Added in 3.7
-         Name.startswith("sse41.blendp") || // Added in 3.7
-         Name.startswith("avx.blend.p") || // Added in 3.7
-         Name == "avx2.pblendw" || // Added in 3.7
-         Name.startswith("avx2.pblendd.") || // Added in 3.7
-         Name.startswith("avx.vbroadcastf128") || // Added in 4.0
-         Name == "avx2.vbroadcasti128" || // Added in 3.7
-         Name == "xop.vpcmov" || // Added in 3.8
-         Name.startswith("avx512.mask.move.s") || // Added in 4.0
-         (Name.startswith("xop.vpcom") && // Added in 3.2
-          F->arg_size() == 2))) {
-      NewFn = nullptr;
-      return true;
-    }
-    // SSE4.1 ptest functions may have an old signature.
-    if (IsX86 && Name.startswith("sse41.ptest")) { // Added in 3.2
-      if (Name.substr(11) == "c")
-        return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestc, NewFn);
-      if (Name.substr(11) == "z")
-        return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestz, NewFn);
-      if (Name.substr(11) == "nzc")
-        return UpgradeSSE41Function(F, Intrinsic::x86_sse41_ptestnzc, NewFn);
-    }
-    // Several blend and other instructions with masks used the wrong number of
-    // bits.
-    if (IsX86 && Name == "sse41.insertps") // Added in 3.6
-      return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_insertps,
-                                              NewFn);
-    if (IsX86 && Name == "sse41.dppd") // Added in 3.6
-      return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_dppd,
-                                              NewFn);
-    if (IsX86 && Name == "sse41.dpps") // Added in 3.6
-      return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_dpps,
-                                              NewFn);
-    if (IsX86 && Name == "sse41.mpsadbw") // Added in 3.6
-      return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_sse41_mpsadbw,
-                                              NewFn);
-    if (IsX86 && Name == "avx.dp.ps.256") // Added in 3.6
-      return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx_dp_ps_256,
-                                              NewFn);
-    if (IsX86 && Name == "avx2.mpsadbw") // Added in 3.6
-      return UpgradeX86IntrinsicsWith8BitMask(F, Intrinsic::x86_avx2_mpsadbw,
-                                              NewFn);
-
-    // frcz.ss/sd may need to have an argument dropped. Added in 3.2
-    if (IsX86 && Name.startswith("xop.vfrcz.ss") && F->arg_size() == 2) {
-      rename(F);
-      NewFn = Intrinsic::getDeclaration(F->getParent(),
-                                        Intrinsic::x86_xop_vfrcz_ss);
-      return true;
-    }
-    if (IsX86 && Name.startswith("xop.vfrcz.sd") && F->arg_size() == 2) {
-      rename(F);
-      NewFn = Intrinsic::getDeclaration(F->getParent(),
-                                        Intrinsic::x86_xop_vfrcz_sd);
+  case 'x':
+    if (UpgradeX86IntrinsicFunction(F, Name, NewFn))
       return true;
-    }
-    // Upgrade any XOP PERMIL2 index operand still using a float/double vector.
-    if (IsX86 && Name.startswith("xop.vpermil2")) { // Added in 3.9
-      auto Params = F->getFunctionType()->params();
-      auto Idx = Params[2];
-      if (Idx->getScalarType()->isFloatingPointTy()) {
-        rename(F);
-        unsigned IdxSize = Idx->getPrimitiveSizeInBits();
-        unsigned EltSize = Idx->getScalarSizeInBits();
-        Intrinsic::ID Permil2ID;
-        if (EltSize == 64 && IdxSize == 128)
-          Permil2ID = Intrinsic::x86_xop_vpermil2pd;
-        else if (EltSize == 32 && IdxSize == 128)
-          Permil2ID = Intrinsic::x86_xop_vpermil2ps;
-        else if (EltSize == 64 && IdxSize == 256)
-          Permil2ID = Intrinsic::x86_xop_vpermil2pd_256;
-        else
-          Permil2ID = Intrinsic::x86_xop_vpermil2ps_256;
-        NewFn = Intrinsic::getDeclaration(F->getParent(), Permil2ID);
-        return true;
-      }
-    }
-    break;
   }
+  // Remangle our intrinsic since we upgrade the mangling
+  auto Result = llvm::Intrinsic::remangleIntrinsicFunction(F);
+  if (Result != None) {
+    NewFn = Result.getValue();
+    return true;
   }
 
   //  This may not belong here. This function is effectively being overloaded
@@ -765,6 +806,15 @@ static Value* upgradeMaskedMove(IRBuilder<> &Builder, CallInst &CI) {
   return Builder.CreateInsertElement(A, Select, (uint64_t)0);
 }
 
+
+static Value* UpgradeMaskToInt(IRBuilder<> &Builder, CallInst &CI) {
+  // Turn operand 0 into a mask vector, then sign-extend it to the call's type.
+  Type *ResultTy = CI.getType();
+  unsigned NumElts = ResultTy->getVectorNumElements();
+  Value *MaskVec = getX86MaskVec(Builder, CI.getArgOperand(0), NumElts);
+  return Builder.CreateSExt(MaskVec, ResultTy, "vpmovm2");
+}
+
 /// Upgrade a call to an old intrinsic. All argument and return casting must be
 /// provided to seamlessly integrate with existing context.
 void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
@@ -873,18 +923,11 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       return;
     }
 
-    if (IsX86 && (Name.startswith("avx512.mask.storeu."))) {
-      UpgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1),
-                         CI->getArgOperand(2), /*Aligned*/false);
-
-      // Remove intrinsic.
-      CI->eraseFromParent();
-      return;
-    }
-
-    if (IsX86 && (Name.startswith("avx512.mask.store."))) {
+    if (IsX86 && (Name.startswith("avx512.mask.store"))) {
+      // "avx512.mask.storeu." or "avx512.mask.store."
+      bool Aligned = Name[17] != 'u'; // "avx512.mask.storeu".
       UpgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1),
-                         CI->getArgOperand(2), /*Aligned*/true);
+                         CI->getArgOperand(2), Aligned);
 
       // Remove intrinsic.
       CI->eraseFromParent();
@@ -893,15 +936,12 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
 
     Value *Rep;
     // Upgrade packed integer vector compare intrinsics to compare instructions.
-    if (IsX86 && (Name.startswith("sse2.pcmpeq.") ||
-                  Name.startswith("avx2.pcmpeq."))) {
-      Rep = Builder.CreateICmpEQ(CI->getArgOperand(0), CI->getArgOperand(1),
-                                 "pcmpeq");
-      Rep = Builder.CreateSExt(Rep, CI->getType(), "");
-    } else if (IsX86 && (Name.startswith("sse2.pcmpgt.") ||
-                         Name.startswith("avx2.pcmpgt."))) {
-      Rep = Builder.CreateICmpSGT(CI->getArgOperand(0), CI->getArgOperand(1),
-                                  "pcmpgt");
+    if (IsX86 && (Name.startswith("sse2.pcmp") ||
+                  Name.startswith("avx2.pcmp"))) {
+      // "sse2.pcmpeq.", "sse2.pcmpgt.", "avx2.pcmpeq." or "avx2.pcmpgt."
+      bool CmpEq = Name[9] == 'e';
+      Rep = Builder.CreateICmp(CmpEq ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_SGT,
+                               CI->getArgOperand(0), CI->getArgOperand(1));
       Rep = Builder.CreateSExt(Rep, CI->getType(), "");
     } else if (IsX86 && (Name == "sse.add.ss" || Name == "sse2.add.sd")) {
       Type *I32Ty = Type::getInt32Ty(C);
@@ -939,10 +979,12 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       Rep = Builder.CreateInsertElement(CI->getArgOperand(0),
                                         Builder.CreateFDiv(Elt0, Elt1),
                                         ConstantInt::get(I32Ty, 0));
-    } else if (IsX86 && Name.startswith("avx512.mask.pcmpeq.")) {
-      Rep = upgradeMaskedCompare(Builder, *CI, ICmpInst::ICMP_EQ);
-    } else if (IsX86 && Name.startswith("avx512.mask.pcmpgt.")) {
-      Rep = upgradeMaskedCompare(Builder, *CI, ICmpInst::ICMP_SGT);
+    } else if (IsX86 && Name.startswith("avx512.mask.pcmp")) {
+      // "avx512.mask.pcmpeq." or "avx512.mask.pcmpgt."
+      bool CmpEq = Name[16] == 'e';
+      Rep = upgradeMaskedCompare(Builder, *CI,
+                                 CmpEq ? ICmpInst::ICMP_EQ
+                                       : ICmpInst::ICMP_SGT);
     } else if (IsX86 && (Name == "sse41.pmaxsb" ||
                          Name == "sse2.pmaxs.w" ||
                          Name == "sse41.pmaxsd" ||
@@ -1054,15 +1096,11 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       Rep =
           Builder.CreateCall(VPCOM, {CI->getArgOperand(0), CI->getArgOperand(1),
                                      Builder.getInt8(Imm)});
-    } else if (IsX86 && Name == "xop.vpcmov") {
-      Value *Arg0 = CI->getArgOperand(0);
-      Value *Arg1 = CI->getArgOperand(1);
+    } else if (IsX86 && Name.startswith("xop.vpcmov")) {
       Value *Sel = CI->getArgOperand(2);
-      unsigned NumElts = CI->getType()->getVectorNumElements();
-      Constant *MinusOne = ConstantVector::getSplat(NumElts, Builder.getInt64(-1));
-      Value *NotSel = Builder.CreateXor(Sel, MinusOne);
-      Value *Sel0 = Builder.CreateAnd(Arg0, Sel);
-      Value *Sel1 = Builder.CreateAnd(Arg1, NotSel);
+      Value *NotSel = Builder.CreateNot(Sel);
+      Value *Sel0 = Builder.CreateAnd(CI->getArgOperand(0), Sel);
+      Value *Sel1 = Builder.CreateAnd(CI->getArgOperand(1), NotSel);
       Rep = Builder.CreateOr(Sel0, Sel1);
     } else if (IsX86 && Name == "sse42.crc32.64.8") {
       Function *CRC32 = Intrinsic::getDeclaration(F->getParent(),
@@ -1496,6 +1534,43 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1));
       Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
                           CI->getArgOperand(2));
+    } else if (IsX86 && Name.startswith("avx512.mask.lzcnt.")) {
+      Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(),
+                                                         Intrinsic::ctlz,
+                                                         CI->getType()),
+                               { CI->getArgOperand(0), Builder.getInt1(false) });
+      Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
+                          CI->getArgOperand(1));
+    } else if (IsX86 && (Name.startswith("avx512.mask.max.p") ||
+                         Name.startswith("avx512.mask.min.p"))) {
+      bool IsMin = Name[13] == 'i';
+      VectorType *VecTy = cast<VectorType>(CI->getType());
+      unsigned VecWidth = VecTy->getPrimitiveSizeInBits();
+      unsigned EltWidth = VecTy->getScalarSizeInBits();
+      Intrinsic::ID IID;
+      if (!IsMin && VecWidth == 128 && EltWidth == 32)
+        IID = Intrinsic::x86_sse_max_ps;
+      else if (!IsMin && VecWidth == 128 && EltWidth == 64)
+        IID = Intrinsic::x86_sse2_max_pd;
+      else if (!IsMin && VecWidth == 256 && EltWidth == 32)
+        IID = Intrinsic::x86_avx_max_ps_256;
+      else if (!IsMin && VecWidth == 256 && EltWidth == 64)
+        IID = Intrinsic::x86_avx_max_pd_256;
+      else if (IsMin && VecWidth == 128 && EltWidth == 32)
+        IID = Intrinsic::x86_sse_min_ps;
+      else if (IsMin && VecWidth == 128 && EltWidth == 64)
+        IID = Intrinsic::x86_sse2_min_pd;
+      else if (IsMin && VecWidth == 256 && EltWidth == 32)
+        IID = Intrinsic::x86_avx_min_ps_256;
+      else if (IsMin && VecWidth == 256 && EltWidth == 64)
+        IID = Intrinsic::x86_avx_min_pd_256;
+      else
+        llvm_unreachable("Unexpected intrinsic");
+
+      Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+                               { CI->getArgOperand(0), CI->getArgOperand(1) });
+      Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
+                          CI->getArgOperand(2));
     } else if (IsX86 && Name.startswith("avx512.mask.pshuf.b.")) {
       VectorType *VecTy = cast<VectorType>(CI->getType());
       Intrinsic::ID IID;
@@ -1532,6 +1607,42 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       else
         llvm_unreachable("Unexpected intrinsic");
 
+      Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+                               { CI->getArgOperand(0), CI->getArgOperand(1) });
+      Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
+                          CI->getArgOperand(2));
+    } else if (IsX86 && Name.startswith("avx512.mask.pack")) {
+      bool IsUnsigned = Name[16] == 'u';
+      bool IsDW = Name[18] == 'd';
+      VectorType *VecTy = cast<VectorType>(CI->getType());
+      Intrinsic::ID IID;
+      if (!IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 128)
+        IID = Intrinsic::x86_sse2_packsswb_128;
+      else if (!IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 256)
+        IID = Intrinsic::x86_avx2_packsswb;
+      else if (!IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 512)
+        IID = Intrinsic::x86_avx512_packsswb_512;
+      else if (!IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 128)
+        IID = Intrinsic::x86_sse2_packssdw_128;
+      else if (!IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 256)
+        IID = Intrinsic::x86_avx2_packssdw;
+      else if (!IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 512)
+        IID = Intrinsic::x86_avx512_packssdw_512;
+      else if (IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 128)
+        IID = Intrinsic::x86_sse2_packuswb_128;
+      else if (IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 256)
+        IID = Intrinsic::x86_avx2_packuswb;
+      else if (IsUnsigned && !IsDW && VecTy->getPrimitiveSizeInBits() == 512)
+        IID = Intrinsic::x86_avx512_packuswb_512;
+      else if (IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 128)
+        IID = Intrinsic::x86_sse41_packusdw;
+      else if (IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 256)
+        IID = Intrinsic::x86_avx2_packusdw;
+      else if (IsUnsigned && IsDW && VecTy->getPrimitiveSizeInBits() == 512)
+        IID = Intrinsic::x86_avx512_packusdw_512;
+      else
+        llvm_unreachable("Unexpected intrinsic");
+
       Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
                                { CI->getArgOperand(0), CI->getArgOperand(1) });
       Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep,
@@ -1740,6 +1851,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
       Rep = UpgradeX86MaskedShift(Builder, *CI, IID);
     } else if (IsX86 && Name.startswith("avx512.mask.move.s")) {
       Rep = upgradeMaskedMove(Builder, *CI);
+    } else if (IsX86 && Name.startswith("avx512.cvtmask2")) {
+      Rep = UpgradeMaskToInt(Builder, *CI);
     } else if (IsX86 && Name.startswith("avx512.mask.vpermilvar.")) {
       Intrinsic::ID IID;
       if (Name.endswith("ps.128"))
@@ -1816,13 +1929,16 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     return;
   }
 
-  std::string Name = CI->getName();
-  if (!Name.empty())
-    CI->setName(Name + ".old");
-
+  CallInst *NewCall = nullptr;
   switch (NewFn->getIntrinsicID()) {
-  default:
-    llvm_unreachable("Unknown function for CallInst upgrade.");
+  default: {
+    // Handle generic mangling change, but nothing else
+    assert(
+        (CI->getCalledFunction()->getName() != NewFn->getName()) &&
+        "Unknown function for CallInst upgrade and isn't just a name change");
+    CI->setCalledFunction(NewFn);
+    return;
+  }
 
   case Intrinsic::arm_neon_vld1:
   case Intrinsic::arm_neon_vld2:
@@ -1840,47 +1956,43 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
   case Intrinsic::arm_neon_vst4lane: {
     SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
                                  CI->arg_operands().end());
-    CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args));
-    CI->eraseFromParent();
-    return;
+    NewCall = Builder.CreateCall(NewFn, Args);
+    break;
   }
 
   case Intrinsic::bitreverse:
-    CI->replaceAllUsesWith(Builder.CreateCall(NewFn, {CI->getArgOperand(0)}));
-    CI->eraseFromParent();
-    return;
+    NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)});
+    break;
 
   case Intrinsic::ctlz:
   case Intrinsic::cttz:
     assert(CI->getNumArgOperands() == 1 &&
            "Mismatch between function args and call args");
-    CI->replaceAllUsesWith(Builder.CreateCall(
-        NewFn, {CI->getArgOperand(0), Builder.getFalse()}, Name));
-    CI->eraseFromParent();
-    return;
+    NewCall =
+        Builder.CreateCall(NewFn, {CI->getArgOperand(0), Builder.getFalse()});
+    break;
 
-  case Intrinsic::objectsize:
-    CI->replaceAllUsesWith(Builder.CreateCall(
-        NewFn, {CI->getArgOperand(0), CI->getArgOperand(1)}, Name));
-    CI->eraseFromParent();
-    return;
+  case Intrinsic::objectsize: {
+    Value *NullIsUnknownSize = CI->getNumArgOperands() == 2
+                                   ? Builder.getFalse()
+                                   : CI->getArgOperand(2);
+    NewCall = Builder.CreateCall(
+        NewFn, {CI->getArgOperand(0), CI->getArgOperand(1), NullIsUnknownSize});
+    break;
+  }
 
   case Intrinsic::ctpop:
-    CI->replaceAllUsesWith(Builder.CreateCall(NewFn, {CI->getArgOperand(0)}));
-    CI->eraseFromParent();
-    return;
+    NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)});
+    break;
 
   case Intrinsic::convert_from_fp16:
-    CI->replaceAllUsesWith(Builder.CreateCall(NewFn, {CI->getArgOperand(0)}));
-    CI->eraseFromParent();
-    return;
+    NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(0)});
+    break;
 
   case Intrinsic::x86_xop_vfrcz_ss:
   case Intrinsic::x86_xop_vfrcz_sd:
-    CI->replaceAllUsesWith(
-        Builder.CreateCall(NewFn, {CI->getArgOperand(1)}, Name));
-    CI->eraseFromParent();
-    return;
+    NewCall = Builder.CreateCall(NewFn, {CI->getArgOperand(1)});
+    break;
 
   case Intrinsic::x86_xop_vpermil2pd:
   case Intrinsic::x86_xop_vpermil2ps:
@@ -1891,9 +2003,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     VectorType *FltIdxTy = cast<VectorType>(Args[2]->getType());
     VectorType *IntIdxTy = VectorType::getInteger(FltIdxTy);
     Args[2] = Builder.CreateBitCast(Args[2], IntIdxTy);
-    CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args, Name));
-    CI->eraseFromParent();
-    return;
+    NewCall = Builder.CreateCall(NewFn, Args);
+    break;
   }
 
   case Intrinsic::x86_sse41_ptestc:
@@ -1915,10 +2026,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
     Value *BC0 = Builder.CreateBitCast(Arg0, NewVecTy, "cast");
     Value *BC1 = Builder.CreateBitCast(Arg1, NewVecTy, "cast");
 
-    CallInst *NewCall = Builder.CreateCall(NewFn, {BC0, BC1}, Name);
-    CI->replaceAllUsesWith(NewCall);
-    CI->eraseFromParent();
-    return;
+    NewCall = Builder.CreateCall(NewFn, {BC0, BC1});
+    break;
   }
 
   case Intrinsic::x86_sse41_insertps:
@@ -1934,17 +2043,13 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
 
     // Replace the last argument with a trunc.
     Args.back() = Builder.CreateTrunc(Args.back(), Type::getInt8Ty(C), "trunc");
-
-    CallInst *NewCall = Builder.CreateCall(NewFn, Args);
-    CI->replaceAllUsesWith(NewCall);
-    CI->eraseFromParent();
-    return;
+    NewCall = Builder.CreateCall(NewFn, Args);
+    break;
   }
 
   case Intrinsic::thread_pointer: {
-    CI->replaceAllUsesWith(Builder.CreateCall(NewFn, {}));
-    CI->eraseFromParent();
-    return;
+    NewCall = Builder.CreateCall(NewFn, {});
+    break;
   }
 
   case Intrinsic::invariant_start:
@@ -1953,11 +2058,19 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
   case Intrinsic::masked_store: {
     SmallVector<Value *, 4> Args(CI->arg_operands().begin(),
                                  CI->arg_operands().end());
-    CI->replaceAllUsesWith(Builder.CreateCall(NewFn, Args));
-    CI->eraseFromParent();
-    return;
+    NewCall = Builder.CreateCall(NewFn, Args);
+    break;
+  }
   }
+  assert(NewCall && "Should have either set this variable or returned through "
+                    "the default case");
+  std::string Name = CI->getName();
+  if (!Name.empty()) {
+    CI->setName(Name + ".old");
+    NewCall->setName(Name);
   }
+  CI->replaceAllUsesWith(NewCall);
+  CI->eraseFromParent();
 }
 
 void llvm::UpgradeCallsToIntrinsic(Function *F) {
diff --git a/lib/IR/BasicBlock.cpp b/lib/IR/BasicBlock.cpp
index 8187ee93f17fa994acc1a2d7c64b84d51102536f..90ca21ab91f8fcfcaf3ae1e8f0a8ea509dcb91da 100644
--- a/lib/IR/BasicBlock.cpp
+++ b/lib/IR/BasicBlock.cpp
@@ -113,23 +113,23 @@ void BasicBlock::moveAfter(BasicBlock *MovePos) {
       getIterator());
 }
 
-Module *BasicBlock::getModule() {
+const Module *BasicBlock::getModule() const {
   return getParent()->getParent();
 }
 
-TerminatorInst *BasicBlock::getTerminator() {
+const TerminatorInst *BasicBlock::getTerminator() const {
   if (InstList.empty()) return nullptr;
   return dyn_cast<TerminatorInst>(&InstList.back());
 }
 
-CallInst *BasicBlock::getTerminatingMustTailCall() {
+const CallInst *BasicBlock::getTerminatingMustTailCall() const {
   if (InstList.empty())
     return nullptr;
-  ReturnInst *RI = dyn_cast<ReturnInst>(&InstList.back());
+  const ReturnInst *RI = dyn_cast<ReturnInst>(&InstList.back());
   if (!RI || RI == &InstList.front())
     return nullptr;
 
-  Instruction *Prev = RI->getPrevNode();
+  const Instruction *Prev = RI->getPrevNode();
   if (!Prev)
     return nullptr;
 
@@ -153,7 +153,7 @@ CallInst *BasicBlock::getTerminatingMustTailCall() {
   return nullptr;
 }
 
-CallInst *BasicBlock::getTerminatingDeoptimizeCall() {
+const CallInst *BasicBlock::getTerminatingDeoptimizeCall() const {
   if (InstList.empty())
     return nullptr;
   auto *RI = dyn_cast<ReturnInst>(&InstList.back());
@@ -168,22 +168,22 @@ CallInst *BasicBlock::getTerminatingDeoptimizeCall() {
   return nullptr;
 }
 
-Instruction* BasicBlock::getFirstNonPHI() {
-  for (Instruction &I : *this)
+const Instruction* BasicBlock::getFirstNonPHI() const {
+  for (const Instruction &I : *this)
     if (!isa<PHINode>(I))
       return &I;
   return nullptr;
 }
 
-Instruction* BasicBlock::getFirstNonPHIOrDbg() {
-  for (Instruction &I : *this)
+const Instruction* BasicBlock::getFirstNonPHIOrDbg() const {
+  for (const Instruction &I : *this)
     if (!isa<PHINode>(I) && !isa<DbgInfoIntrinsic>(I))
       return &I;
   return nullptr;
 }
 
-Instruction* BasicBlock::getFirstNonPHIOrDbgOrLifetime() {
-  for (Instruction &I : *this) {
+const Instruction* BasicBlock::getFirstNonPHIOrDbgOrLifetime() const {
+  for (const Instruction &I : *this) {
     if (isa<PHINode>(I) || isa<DbgInfoIntrinsic>(I))
       continue;
 
@@ -197,12 +197,12 @@ Instruction* BasicBlock::getFirstNonPHIOrDbgOrLifetime() {
   return nullptr;
 }
 
-BasicBlock::iterator BasicBlock::getFirstInsertionPt() {
-  Instruction *FirstNonPHI = getFirstNonPHI();
+BasicBlock::const_iterator BasicBlock::getFirstInsertionPt() const {
+  const Instruction *FirstNonPHI = getFirstNonPHI();
   if (!FirstNonPHI)
     return end();
 
-  iterator InsertPt = FirstNonPHI->getIterator();
+  const_iterator InsertPt = FirstNonPHI->getIterator();
   if (InsertPt->isEHPad()) ++InsertPt;
   return InsertPt;
 }
@@ -214,10 +214,10 @@ void BasicBlock::dropAllReferences() {
 
 /// If this basic block has a single predecessor block,
 /// return the block, otherwise return a null pointer.
-BasicBlock *BasicBlock::getSinglePredecessor() {
-  pred_iterator PI = pred_begin(this), E = pred_end(this);
+const BasicBlock *BasicBlock::getSinglePredecessor() const {
+  const_pred_iterator PI = pred_begin(this), E = pred_end(this);
   if (PI == E) return nullptr;         // No preds.
-  BasicBlock *ThePred = *PI;
+  const BasicBlock *ThePred = *PI;
   ++PI;
   return (PI == E) ? ThePred : nullptr /*multiple preds*/;
 }
@@ -227,10 +227,10 @@ BasicBlock *BasicBlock::getSinglePredecessor() {
 /// Note that unique predecessor doesn't mean single edge, there can be
 /// multiple edges from the unique predecessor to this block (for example
 /// a switch statement with multiple cases having the same destination).
-BasicBlock *BasicBlock::getUniquePredecessor() {
-  pred_iterator PI = pred_begin(this), E = pred_end(this);
+const BasicBlock *BasicBlock::getUniquePredecessor() const {
+  const_pred_iterator PI = pred_begin(this), E = pred_end(this);
   if (PI == E) return nullptr; // No preds.
-  BasicBlock *PredBB = *PI;
+  const BasicBlock *PredBB = *PI;
   ++PI;
   for (;PI != E; ++PI) {
     if (*PI != PredBB)
@@ -241,18 +241,18 @@ BasicBlock *BasicBlock::getUniquePredecessor() {
   return PredBB;
 }
 
-BasicBlock *BasicBlock::getSingleSuccessor() {
-  succ_iterator SI = succ_begin(this), E = succ_end(this);
+const BasicBlock *BasicBlock::getSingleSuccessor() const {
+  succ_const_iterator SI = succ_begin(this), E = succ_end(this);
   if (SI == E) return nullptr; // no successors
-  BasicBlock *TheSucc = *SI;
+  const BasicBlock *TheSucc = *SI;
   ++SI;
   return (SI == E) ? TheSucc : nullptr /* multiple successors */;
 }
 
-BasicBlock *BasicBlock::getUniqueSuccessor() {
-  succ_iterator SI = succ_begin(this), E = succ_end(this);
+const BasicBlock *BasicBlock::getUniqueSuccessor() const {
+  succ_const_iterator SI = succ_begin(this), E = succ_end(this);
   if (SI == E) return nullptr; // No successors
-  BasicBlock *SuccBB = *SI;
+  const BasicBlock *SuccBB = *SI;
   ++SI;
   for (;SI != E; ++SI) {
     if (*SI != SuccBB)
@@ -429,9 +429,6 @@ bool BasicBlock::isLandingPad() const {
 }
 
 /// Return the landingpad instruction associated with the landing pad.
-LandingPadInst *BasicBlock::getLandingPadInst() {
-  return dyn_cast<LandingPadInst>(getFirstNonPHI());
-}
 const LandingPadInst *BasicBlock::getLandingPadInst() const {
   return dyn_cast<LandingPadInst>(getFirstNonPHI());
 }
diff --git a/lib/IR/Comdat.cpp b/lib/IR/Comdat.cpp
index fc1b48d1c190e8e98457f66e1c70d1dd954c555e..e27ecad0a8841698dd56d86cefc79b9f126e6741 100644
--- a/lib/IR/Comdat.cpp
+++ b/lib/IR/Comdat.cpp
@@ -1,4 +1,4 @@
-//===-- Comdat.cpp - Implement Metadata classes --------------------------===//
+//===- Comdat.cpp - Implement Comdat class --------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,12 +11,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/IR/Comdat.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Comdat.h"
+
 using namespace llvm;
 
 Comdat::Comdat(Comdat &&C) : Name(C.Name), SK(C.SK) {}
 
-Comdat::Comdat() : Name(nullptr), SK(Comdat::Any) {}
+Comdat::Comdat() = default;
 
 StringRef Comdat::getName() const { return Name->first(); }
diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp
index 098ff90a0a95cc791eaf201589b9e24b71feb604..bba230677ebf711fba4538ce26fc7ffe07dd6561 100644
--- a/lib/IR/ConstantFold.cpp
+++ b/lib/IR/ConstantFold.cpp
@@ -18,6 +18,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ConstantFold.h"
+#include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -606,17 +607,15 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
     if (ConstantFP *FPC = dyn_cast<ConstantFP>(V)) {
       const APFloat &V = FPC->getValueAPF();
       bool ignored;
-      uint64_t x[2];
       uint32_t DestBitWidth = cast<IntegerType>(DestTy)->getBitWidth();
+      APSInt IntVal(DestBitWidth, opc == Instruction::FPToUI);
       if (APFloat::opInvalidOp ==
-          V.convertToInteger(x, DestBitWidth, opc==Instruction::FPToSI,
-                             APFloat::rmTowardZero, &ignored)) {
+          V.convertToInteger(IntVal, APFloat::rmTowardZero, &ignored)) {
         // Undefined behavior invoked - the destination type can't represent
         // the input constant.
         return UndefValue::get(DestTy);
       }
-      APInt Val(DestBitWidth, x);
-      return ConstantInt::get(FPC->getContext(), Val);
+      return ConstantInt::get(FPC->getContext(), IntVal);
     }
     return nullptr; // Can't fold.
   case Instruction::IntToPtr:   //always treated as unsigned
@@ -1209,10 +1208,15 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode,
     SmallVector<Constant*, 16> Result;
     Type *Ty = IntegerType::get(VTy->getContext(), 32);
     for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
-      Constant *LHS =
-        ConstantExpr::getExtractElement(C1, ConstantInt::get(Ty, i));
-      Constant *RHS =
-        ConstantExpr::getExtractElement(C2, ConstantInt::get(Ty, i));
+      Constant *ExtractIdx = ConstantInt::get(Ty, i);
+      Constant *LHS = ConstantExpr::getExtractElement(C1, ExtractIdx);
+      Constant *RHS = ConstantExpr::getExtractElement(C2, ExtractIdx);
+
+      // If any element of a divisor vector is zero, the whole op is undef.
+      if ((Opcode == Instruction::SDiv || Opcode == Instruction::UDiv ||
+           Opcode == Instruction::SRem || Opcode == Instruction::URem) &&
+          RHS->isNullValue())
+        return UndefValue::get(VTy);
 
       Result.push_back(ConstantExpr::get(Opcode, LHS, RHS));
     }
@@ -2231,7 +2235,8 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
     ConstantInt *Factor = ConstantInt::get(CI->getType(), NumElements);
     NewIdxs[i] = ConstantExpr::getSRem(CI, Factor);
 
-    Constant *PrevIdx = cast<Constant>(Idxs[i - 1]);
+    Constant *PrevIdx = NewIdxs[i-1] ? NewIdxs[i-1] :
+                           cast<Constant>(Idxs[i - 1]);
     Constant *Div = ConstantExpr::getSDiv(CI, Factor);
 
     unsigned CommonExtendedWidth =
diff --git a/lib/IR/ConstantRange.cpp b/lib/IR/ConstantRange.cpp
index f940955a745c652da3c0b588be9bd30b317a31f8..f1826c029795bd3ee79080a8d7a875b73a1a57f7 100644
--- a/lib/IR/ConstantRange.cpp
+++ b/lib/IR/ConstantRange.cpp
@@ -272,6 +272,22 @@ APInt ConstantRange::getSetSize() const {
   return (Upper - Lower).zext(getBitWidth()+1);
 }
 
+/// isSizeStrictlySmallerThanOf - Compare set size of this range with the range
+/// Other.
+/// This function is faster than comparing results of getSetSize for the two
+/// ranges, because we don't need to extend bitwidth of APInts we're operating
+/// with.
+///
+bool
+ConstantRange::isSizeStrictlySmallerThanOf(const ConstantRange &Other) const {
+  assert(getBitWidth() == Other.getBitWidth());
+  if (isFullSet())
+    return false;
+  if (Other.isFullSet())
+    return true;
+  return (Upper - Lower).ult(Other.Upper - Other.Lower);
+}
+
 /// getUnsignedMax - Return the largest unsigned value contained in the
 /// ConstantRange.
 ///
@@ -414,7 +430,7 @@ ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const {
       if (CR.Upper.ule(Lower))
         return ConstantRange(CR.Lower, Upper);
 
-      if (getSetSize().ult(CR.getSetSize()))
+      if (isSizeStrictlySmallerThanOf(CR))
         return *this;
       return CR;
     }
@@ -429,7 +445,7 @@ ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const {
 
   if (CR.Upper.ult(Upper)) {
     if (CR.Lower.ult(Upper)) {
-      if (getSetSize().ult(CR.getSetSize()))
+      if (isSizeStrictlySmallerThanOf(CR))
         return *this;
       return CR;
     }
@@ -445,7 +461,7 @@ ConstantRange ConstantRange::intersectWith(const ConstantRange &CR) const {
 
     return ConstantRange(CR.Lower, Upper);
   }
-  if (getSetSize().ult(CR.getSetSize()))
+  if (isSizeStrictlySmallerThanOf(CR))
     return *this;
   return CR;
 }
@@ -739,17 +755,16 @@ ConstantRange::add(const ConstantRange &Other) const {
   if (isFullSet() || Other.isFullSet())
     return ConstantRange(getBitWidth(), /*isFullSet=*/true);
 
-  APInt Spread_X = getSetSize(), Spread_Y = Other.getSetSize();
   APInt NewLower = getLower() + Other.getLower();
   APInt NewUpper = getUpper() + Other.getUpper() - 1;
   if (NewLower == NewUpper)
     return ConstantRange(getBitWidth(), /*isFullSet=*/true);
 
   ConstantRange X = ConstantRange(NewLower, NewUpper);
-  if (X.getSetSize().ult(Spread_X) || X.getSetSize().ult(Spread_Y))
+  if (X.isSizeStrictlySmallerThanOf(*this) ||
+      X.isSizeStrictlySmallerThanOf(Other))
     // We've wrapped, therefore, full set.
     return ConstantRange(getBitWidth(), /*isFullSet=*/true);
-
   return X;
 }
 
@@ -773,17 +788,16 @@ ConstantRange::sub(const ConstantRange &Other) const {
   if (isFullSet() || Other.isFullSet())
     return ConstantRange(getBitWidth(), /*isFullSet=*/true);
 
-  APInt Spread_X = getSetSize(), Spread_Y = Other.getSetSize();
   APInt NewLower = getLower() - Other.getUpper() + 1;
   APInt NewUpper = getUpper() - Other.getLower();
   if (NewLower == NewUpper)
     return ConstantRange(getBitWidth(), /*isFullSet=*/true);
 
   ConstantRange X = ConstantRange(NewLower, NewUpper);
-  if (X.getSetSize().ult(Spread_X) || X.getSetSize().ult(Spread_Y))
+  if (X.isSizeStrictlySmallerThanOf(*this) ||
+      X.isSizeStrictlySmallerThanOf(Other))
     // We've wrapped, therefore, full set.
     return ConstantRange(getBitWidth(), /*isFullSet=*/true);
-
   return X;
 }
 
@@ -837,7 +851,7 @@ ConstantRange::multiply(const ConstantRange &Other) const {
   ConstantRange Result_sext(std::min(L, Compare), std::max(L, Compare) + 1);
   ConstantRange SR = Result_sext.truncate(getBitWidth());
 
-  return UR.getSetSize().ult(SR.getSetSize()) ? UR : SR;
+  return UR.isSizeStrictlySmallerThanOf(SR) ? UR : SR;
 }
 
 ConstantRange
diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp
index 533b9245277f2eb73c1ede3e9adf79669dca34b8..c5f93c9f4db018aeba731798482a32a746cfb775 100644
--- a/lib/IR/Constants.cpp
+++ b/lib/IR/Constants.cpp
@@ -1027,7 +1027,7 @@ Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) {
     return getSequenceIfElementsMatch<ConstantDataVector>(C, V);
 
   // Otherwise, the element type isn't compatible with ConstantDataVector, or
-  // the operand list constants a ConstantExpr or something else strange.
+  // the operand list contains a ConstantExpr or something else strange.
   return nullptr;
 }
 
diff --git a/lib/IR/Core.cpp b/lib/IR/Core.cpp
index 7236e4c454ebf2aa89811d59cd78f6978024707d..b5ed30b85c8a13a1e06730db83a0a247ce5438f5 100644
--- a/lib/IR/Core.cpp
+++ b/lib/IR/Core.cpp
@@ -16,7 +16,6 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/IR/Attributes.h"
-#include "AttributeSetNode.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -1847,18 +1846,14 @@ void LLVMAddAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
 }
 
 unsigned LLVMGetAttributeCountAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx) {
-  auto *ASN = AttributeSetNode::get(unwrap<Function>(F)->getAttributes(), Idx);
-  if (!ASN)
-    return 0;
-  return ASN->getNumAttributes();
+  auto AS = unwrap<Function>(F)->getAttributes().getAttributes(Idx);
+  return AS.getNumAttributes();
 }
 
 void LLVMGetAttributesAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
                               LLVMAttributeRef *Attrs) {
-  auto *ASN = AttributeSetNode::get(unwrap<Function>(F)->getAttributes(), Idx);
-  if (!ASN)
-    return;
-  for (auto A: make_range(ASN->begin(), ASN->end()))
+  auto AS = unwrap<Function>(F)->getAttributes().getAttributes(Idx);
+  for (auto A : AS)
     *Attrs++ = wrap(A);
 }
 
@@ -1888,12 +1883,12 @@ void LLVMRemoveStringAttributeAtIndex(LLVMValueRef F, LLVMAttributeIndex Idx,
 void LLVMAddTargetDependentFunctionAttr(LLVMValueRef Fn, const char *A,
                                         const char *V) {
   Function *Func = unwrap<Function>(Fn);
-  AttributeSet::AttrIndex Idx =
-    AttributeSet::AttrIndex(AttributeSet::FunctionIndex);
+  AttributeList::AttrIndex Idx =
+      AttributeList::AttrIndex(AttributeList::FunctionIndex);
   AttrBuilder B;
 
   B.addAttribute(A, V);
-  AttributeSet Set = AttributeSet::get(Func->getContext(), Idx, B);
+  AttributeList Set = AttributeList::get(Func->getContext(), Idx, B);
   Func->addAttributes(Idx, Set);
 }
 
@@ -1913,10 +1908,8 @@ void LLVMGetParams(LLVMValueRef FnRef, LLVMValueRef *ParamRefs) {
 }
 
 LLVMValueRef LLVMGetParam(LLVMValueRef FnRef, unsigned index) {
-  Function::arg_iterator AI = unwrap<Function>(FnRef)->arg_begin();
-  while (index --> 0)
-    AI++;
-  return wrap(&*AI);
+  Function *Fn = unwrap<Function>(FnRef);
+  return wrap(&Fn->arg_begin()[index]);
 }
 
 LLVMValueRef LLVMGetParamParent(LLVMValueRef V) {
@@ -1941,25 +1934,24 @@ LLVMValueRef LLVMGetLastParam(LLVMValueRef Fn) {
 
 LLVMValueRef LLVMGetNextParam(LLVMValueRef Arg) {
   Argument *A = unwrap<Argument>(Arg);
-  Function::arg_iterator I(A);
-  if (++I == A->getParent()->arg_end())
+  Function *Fn = A->getParent();
+  if (A->getArgNo() + 1 >= Fn->arg_size())
     return nullptr;
-  return wrap(&*I);
+  return wrap(&Fn->arg_begin()[A->getArgNo() + 1]);
 }
 
 LLVMValueRef LLVMGetPreviousParam(LLVMValueRef Arg) {
   Argument *A = unwrap<Argument>(Arg);
-  Function::arg_iterator I(A);
-  if (I == A->getParent()->arg_begin())
+  if (A->getArgNo() == 0)
     return nullptr;
-  return wrap(&*--I);
+  return wrap(&A->getParent()->arg_begin()[A->getArgNo() - 1]);
 }
 
 void LLVMSetParamAlignment(LLVMValueRef Arg, unsigned align) {
   Argument *A = unwrap<Argument>(Arg);
   AttrBuilder B;
   B.addAlignmentAttr(align);
-  A->addAttr(AttributeSet::get(A->getContext(),A->getArgNo() + 1, B));
+  A->addAttr(AttributeList::get(A->getContext(), A->getArgNo() + 1, B));
 }
 
 /*--.. Operations on basic blocks ..........................................--*/
@@ -2168,10 +2160,9 @@ void LLVMSetInstrParamAlignment(LLVMValueRef Instr, unsigned index,
   CallSite Call = CallSite(unwrap<Instruction>(Instr));
   AttrBuilder B;
   B.addAlignmentAttr(align);
-  Call.setAttributes(Call.getAttributes()
-                       .addAttributes(Call->getContext(), index,
-                                      AttributeSet::get(Call->getContext(),
-                                                        index, B)));
+  Call.setAttributes(Call.getAttributes().addAttributes(
+      Call->getContext(), index,
+      AttributeList::get(Call->getContext(), index, B)));
 }
 
 void LLVMAddCallSiteAttribute(LLVMValueRef C, LLVMAttributeIndex Idx,
@@ -2182,19 +2173,15 @@ void LLVMAddCallSiteAttribute(LLVMValueRef C, LLVMAttributeIndex Idx,
 unsigned LLVMGetCallSiteAttributeCount(LLVMValueRef C,
                                        LLVMAttributeIndex Idx) {
   auto CS = CallSite(unwrap<Instruction>(C));
-  auto *ASN = AttributeSetNode::get(CS.getAttributes(), Idx);
-  if (!ASN)
-    return 0;
-  return ASN->getNumAttributes();
+  auto AS = CS.getAttributes().getAttributes(Idx);
+  return AS.getNumAttributes();
 }
 
 void LLVMGetCallSiteAttributes(LLVMValueRef C, LLVMAttributeIndex Idx,
                                LLVMAttributeRef *Attrs) {
   auto CS = CallSite(unwrap<Instruction>(C));
-  auto *ASN = AttributeSetNode::get(CS.getAttributes(), Idx);
-  if (!ASN)
-    return;
-  for (auto A: make_range(ASN->begin(), ASN->end()))
+  auto AS = CS.getAttributes().getAttributes(Idx);
+  for (auto A : AS)
     *Attrs++ = wrap(A);
 }
 
diff --git a/lib/IR/DIBuilder.cpp b/lib/IR/DIBuilder.cpp
index d06161067f5f3f96acafc1bee8eb4ed11319ac4e..9407c805b92a5b85ba52e1bc7cf1cc1f906da171 100644
--- a/lib/IR/DIBuilder.cpp
+++ b/lib/IR/DIBuilder.cpp
@@ -126,7 +126,7 @@ DICompileUnit *DIBuilder::createCompileUnit(
     unsigned Lang, DIFile *File, StringRef Producer, bool isOptimized,
     StringRef Flags, unsigned RunTimeVer, StringRef SplitName,
     DICompileUnit::DebugEmissionKind Kind, uint64_t DWOId,
-    bool SplitDebugInlining) {
+    bool SplitDebugInlining, bool DebugInfoForProfiling) {
 
   assert(((Lang <= dwarf::DW_LANG_Fortran08 && Lang >= dwarf::DW_LANG_C89) ||
           (Lang <= dwarf::DW_LANG_hi_user && Lang >= dwarf::DW_LANG_lo_user)) &&
@@ -136,7 +136,7 @@ DICompileUnit *DIBuilder::createCompileUnit(
   CUNode = DICompileUnit::getDistinct(
       VMContext, Lang, File, Producer, isOptimized, Flags, RunTimeVer,
       SplitName, Kind, nullptr, nullptr, nullptr, nullptr, nullptr, DWOId,
-      SplitDebugInlining);
+      SplitDebugInlining, DebugInfoForProfiling);
 
   // Create a named metadata so that it is easier to find cu in a module.
   NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
@@ -241,17 +241,20 @@ DIBasicType *DIBuilder::createBasicType(StringRef Name, uint64_t SizeInBits,
 
 DIDerivedType *DIBuilder::createQualifiedType(unsigned Tag, DIType *FromTy) {
   return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr, FromTy, 0,
-                            0, 0, DINode::FlagZero);
+                            0, 0, None, DINode::FlagZero);
 }
 
-DIDerivedType *DIBuilder::createPointerType(DIType *PointeeTy,
-                                            uint64_t SizeInBits,
-                                            uint32_t AlignInBits,
-                                            StringRef Name) {
+DIDerivedType *DIBuilder::createPointerType(
+    DIType *PointeeTy,
+    uint64_t SizeInBits,
+    uint32_t AlignInBits,
+    Optional<unsigned> DWARFAddressSpace,
+    StringRef Name) {
   // FIXME: Why is there a name here?
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_pointer_type, Name,
                             nullptr, 0, nullptr, PointeeTy, SizeInBits,
-                            AlignInBits, 0, DINode::FlagZero);
+                            AlignInBits, 0, DWARFAddressSpace,
+                            DINode::FlagZero);
 }
 
 DIDerivedType *DIBuilder::createMemberPointerType(DIType *PointeeTy,
@@ -261,15 +264,18 @@ DIDerivedType *DIBuilder::createMemberPointerType(DIType *PointeeTy,
                                                   DINode::DIFlags Flags) {
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_ptr_to_member_type, "",
                             nullptr, 0, nullptr, PointeeTy, SizeInBits,
-                            AlignInBits, 0, Flags, Base);
+                            AlignInBits, 0, None, Flags, Base);
 }
 
-DIDerivedType *DIBuilder::createReferenceType(unsigned Tag, DIType *RTy,
-                                              uint64_t SizeInBits,
-                                              uint32_t AlignInBits) {
+DIDerivedType *DIBuilder::createReferenceType(
+    unsigned Tag, DIType *RTy,
+    uint64_t SizeInBits,
+    uint32_t AlignInBits,
+    Optional<unsigned> DWARFAddressSpace) {
   assert(RTy && "Unable to create reference type");
   return DIDerivedType::get(VMContext, Tag, "", nullptr, 0, nullptr, RTy,
-                            SizeInBits, AlignInBits, 0, DINode::FlagZero);
+                            SizeInBits, AlignInBits, 0, DWARFAddressSpace,
+                            DINode::FlagZero);
 }
 
 DIDerivedType *DIBuilder::createTypedef(DIType *Ty, StringRef Name,
@@ -277,14 +283,14 @@ DIDerivedType *DIBuilder::createTypedef(DIType *Ty, StringRef Name,
                                         DIScope *Context) {
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_typedef, Name, File,
                             LineNo, getNonCompileUnitScope(Context), Ty, 0, 0,
-                            0, DINode::FlagZero);
+                            0, None, DINode::FlagZero);
 }
 
 DIDerivedType *DIBuilder::createFriend(DIType *Ty, DIType *FriendTy) {
   assert(Ty && "Invalid type!");
   assert(FriendTy && "Invalid friend type!");
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_friend, "", nullptr, 0, Ty,
-                            FriendTy, 0, 0, 0, DINode::FlagZero);
+                            FriendTy, 0, 0, 0, None, DINode::FlagZero);
 }
 
 DIDerivedType *DIBuilder::createInheritance(DIType *Ty, DIType *BaseTy,
@@ -292,7 +298,7 @@ DIDerivedType *DIBuilder::createInheritance(DIType *Ty, DIType *BaseTy,
                                             DINode::DIFlags Flags) {
   assert(Ty && "Unable to create inheritance");
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_inheritance, "", nullptr,
-                            0, Ty, BaseTy, 0, 0, BaseOffset, Flags);
+                            0, Ty, BaseTy, 0, 0, BaseOffset, None, Flags);
 }
 
 DIDerivedType *DIBuilder::createMemberType(DIScope *Scope, StringRef Name,
@@ -303,7 +309,7 @@ DIDerivedType *DIBuilder::createMemberType(DIScope *Scope, StringRef Name,
                                            DINode::DIFlags Flags, DIType *Ty) {
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
                             LineNumber, getNonCompileUnitScope(Scope), Ty,
-                            SizeInBits, AlignInBits, OffsetInBits, Flags);
+                            SizeInBits, AlignInBits, OffsetInBits, None, Flags);
 }
 
 static ConstantAsMetadata *getConstantOrNull(Constant *C) {
@@ -320,7 +326,7 @@ DIDerivedType *DIBuilder::createBitFieldMemberType(
   return DIDerivedType::get(
       VMContext, dwarf::DW_TAG_member, Name, File, LineNumber,
       getNonCompileUnitScope(Scope), Ty, SizeInBits, /* AlignInBits */ 0,
-      OffsetInBits, Flags,
+      OffsetInBits, None, Flags,
       ConstantAsMetadata::get(ConstantInt::get(IntegerType::get(VMContext, 64),
                                                StorageOffsetInBits)));
 }
@@ -333,7 +339,8 @@ DIBuilder::createStaticMemberType(DIScope *Scope, StringRef Name, DIFile *File,
   Flags |= DINode::FlagStaticMember;
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
                             LineNumber, getNonCompileUnitScope(Scope), Ty, 0,
-                            AlignInBits, 0, Flags, getConstantOrNull(Val));
+                            AlignInBits, 0, None, Flags,
+                            getConstantOrNull(Val));
 }
 
 DIDerivedType *
@@ -343,7 +350,7 @@ DIBuilder::createObjCIVar(StringRef Name, DIFile *File, unsigned LineNumber,
                           DIType *Ty, MDNode *PropertyNode) {
   return DIDerivedType::get(VMContext, dwarf::DW_TAG_member, Name, File,
                             LineNumber, getNonCompileUnitScope(File), Ty,
-                            SizeInBits, AlignInBits, OffsetInBits, Flags,
+                            SizeInBits, AlignInBits, OffsetInBits, None, Flags,
                             PropertyNode);
 }
 
@@ -442,14 +449,6 @@ DISubroutineType *DIBuilder::createSubroutineType(DITypeRefArray ParameterTypes,
   return DISubroutineType::get(VMContext, Flags, CC, ParameterTypes);
 }
 
-DICompositeType *DIBuilder::createExternalTypeRef(unsigned Tag, DIFile *File,
-                                                  StringRef UniqueIdentifier) {
-  assert(!UniqueIdentifier.empty() && "external type ref without uid");
-  return DICompositeType::get(VMContext, Tag, "", nullptr, 0, nullptr, nullptr,
-                              0, 0, 0, DINode::FlagExternalTypeRef, nullptr, 0,
-                              nullptr, nullptr, UniqueIdentifier);
-}
-
 DICompositeType *DIBuilder::createEnumerationType(
     DIScope *Scope, StringRef Name, DIFile *File, unsigned LineNumber,
     uint64_t SizeInBits, uint32_t AlignInBits, DINodeArray Elements,
diff --git a/lib/IR/DataLayout.cpp b/lib/IR/DataLayout.cpp
index d15a34c0b936cc2d9e778528ca245b13ab3c286f..6f90ce598568628478e4a669da60d103f9a644b5 100644
--- a/lib/IR/DataLayout.cpp
+++ b/lib/IR/DataLayout.cpp
@@ -118,9 +118,6 @@ LayoutAlignElem::operator==(const LayoutAlignElem &rhs) const {
           && TypeBitWidth == rhs.TypeBitWidth);
 }
 
-const LayoutAlignElem
-DataLayout::InvalidAlignmentElem = { INVALID_ALIGN, 0, 0, 0 };
-
 //===----------------------------------------------------------------------===//
 // PointerAlignElem, PointerAlign support
 //===----------------------------------------------------------------------===//
@@ -145,9 +142,6 @@ PointerAlignElem::operator==(const PointerAlignElem &rhs) const {
           && TypeByteWidth == rhs.TypeByteWidth);
 }
 
-const PointerAlignElem
-DataLayout::InvalidPointerElem = { 0U, 0U, 0U, ~0U };
-
 //===----------------------------------------------------------------------===//
 //                       DataLayout Class Implementation
 //===----------------------------------------------------------------------===//
@@ -180,6 +174,7 @@ void DataLayout::reset(StringRef Desc) {
 
   LayoutMap = nullptr;
   BigEndian = false;
+  AllocaAddrSpace = 0;
   StackNaturalAlign = 0;
   ManglingMode = MM_None;
   NonIntegralAddressSpaces.clear();
@@ -358,6 +353,12 @@ void DataLayout::parseSpecifier(StringRef Desc) {
       StackNaturalAlign = inBytes(getInt(Tok));
       break;
     }
+    case 'A': { // Default stack/alloca address space.
+      AllocaAddrSpace = getInt(Tok);
+      if (!isUInt<24>(AllocaAddrSpace))
+        report_fatal_error("Invalid address space, must be a 24bit integer");
+      break;
+    }
     case 'm':
       if (!Tok.empty())
         report_fatal_error("Unexpected trailing characters after mangling specifier in datalayout string");
@@ -400,6 +401,7 @@ void DataLayout::init(const Module *M) { *this = M->getDataLayout(); }
 
 bool DataLayout::operator==(const DataLayout &Other) const {
   bool Ret = BigEndian == Other.BigEndian &&
+             AllocaAddrSpace == Other.AllocaAddrSpace &&
              StackNaturalAlign == Other.StackNaturalAlign &&
              ManglingMode == Other.ManglingMode &&
              LegalIntWidths == Other.LegalIntWidths &&
@@ -408,6 +410,18 @@ bool DataLayout::operator==(const DataLayout &Other) const {
   return Ret;
 }
 
+DataLayout::AlignmentsTy::iterator
+DataLayout::findAlignmentLowerBound(AlignTypeEnum AlignType,
+                                    uint32_t BitWidth) {
+  auto Pair = std::make_pair((unsigned)AlignType, BitWidth);
+  return std::lower_bound(Alignments.begin(), Alignments.end(), Pair,
+                          [](const LayoutAlignElem &LHS,
+                             const std::pair<unsigned, uint32_t> &RHS) {
+                            return std::tie(LHS.AlignType, LHS.TypeBitWidth) <
+                                   std::tie(RHS.first, RHS.second);
+                          });
+}
+
 void
 DataLayout::setAlignment(AlignTypeEnum align_type, unsigned abi_align,
                          unsigned pref_align, uint32_t bit_width) {
@@ -426,18 +440,17 @@ DataLayout::setAlignment(AlignTypeEnum align_type, unsigned abi_align,
     report_fatal_error(
         "Preferred alignment cannot be less than the ABI alignment");
 
-  for (LayoutAlignElem &Elem : Alignments) {
-    if (Elem.AlignType == (unsigned)align_type &&
-        Elem.TypeBitWidth == bit_width) {
-      // Update the abi, preferred alignments.
-      Elem.ABIAlign = abi_align;
-      Elem.PrefAlign = pref_align;
-      return;
-    }
+  AlignmentsTy::iterator I = findAlignmentLowerBound(align_type, bit_width);
+  if (I != Alignments.end() &&
+      I->AlignType == (unsigned)align_type && I->TypeBitWidth == bit_width) {
+    // Update the abi, preferred alignments.
+    I->ABIAlign = abi_align;
+    I->PrefAlign = pref_align;
+  } else {
+    // Insert before I to keep the vector sorted.
+    Alignments.insert(I, LayoutAlignElem::get(align_type, abi_align,
+                                              pref_align, bit_width));
   }
-
-  Alignments.push_back(LayoutAlignElem::get(align_type, abi_align,
-                                            pref_align, bit_width));
 }
 
 DataLayout::PointersTy::iterator
@@ -471,45 +484,29 @@ void DataLayout::setPointerAlignment(uint32_t AddrSpace, unsigned ABIAlign,
 unsigned DataLayout::getAlignmentInfo(AlignTypeEnum AlignType,
                                       uint32_t BitWidth, bool ABIInfo,
                                       Type *Ty) const {
-  // Check to see if we have an exact match and remember the best match we see.
-  int BestMatchIdx = -1;
-  int LargestInt = -1;
-  for (unsigned i = 0, e = Alignments.size(); i != e; ++i) {
-    if (Alignments[i].AlignType == (unsigned)AlignType &&
-        Alignments[i].TypeBitWidth == BitWidth)
-      return ABIInfo ? Alignments[i].ABIAlign : Alignments[i].PrefAlign;
-
-    // The best match so far depends on what we're looking for.
-    if (AlignType == INTEGER_ALIGN &&
-        Alignments[i].AlignType == INTEGER_ALIGN) {
-      // The "best match" for integers is the smallest size that is larger than
-      // the BitWidth requested.
-      if (Alignments[i].TypeBitWidth > BitWidth && (BestMatchIdx == -1 ||
-          Alignments[i].TypeBitWidth < Alignments[BestMatchIdx].TypeBitWidth))
-        BestMatchIdx = i;
-      // However, if there isn't one that's larger, then we must use the
-      // largest one we have (see below)
-      if (LargestInt == -1 ||
-          Alignments[i].TypeBitWidth > Alignments[LargestInt].TypeBitWidth)
-        LargestInt = i;
+  AlignmentsTy::const_iterator I = findAlignmentLowerBound(AlignType, BitWidth);
+  // See if we found an exact match. Or if we are looking for an integer type,
+  // but don't have an exact match take the next largest integer. This is where
+  // the lower_bound will point to when it fails an exact match.
+  if (I != Alignments.end() && I->AlignType == (unsigned)AlignType &&
+      (I->TypeBitWidth == BitWidth || AlignType == INTEGER_ALIGN))
+    return ABIInfo ? I->ABIAlign : I->PrefAlign;
+
+  if (AlignType == INTEGER_ALIGN) {
+    // If we didn't have a larger value try the largest value we have.
+    if (I != Alignments.begin()) {
+      --I; // Go to the previous entry and see if it's an integer.
+      if (I->AlignType == INTEGER_ALIGN)
+        return ABIInfo ? I->ABIAlign : I->PrefAlign;
     }
-  }
-
-  // Okay, we didn't find an exact solution.  Fall back here depending on what
-  // is being looked for.
-  if (BestMatchIdx == -1) {
-    // If we didn't find an integer alignment, fall back on most conservative.
-    if (AlignType == INTEGER_ALIGN) {
-      BestMatchIdx = LargestInt;
-    } else if (AlignType == VECTOR_ALIGN) {
-      // By default, use natural alignment for vector types. This is consistent
-      // with what clang and llvm-gcc do.
-      unsigned Align = getTypeAllocSize(cast<VectorType>(Ty)->getElementType());
-      Align *= cast<VectorType>(Ty)->getNumElements();
-      Align = PowerOf2Ceil(Align);
-      return Align;
-    }
-  }
+  } else if (AlignType == VECTOR_ALIGN) {
+    // By default, use natural alignment for vector types. This is consistent
+    // with what clang and llvm-gcc do.
+    unsigned Align = getTypeAllocSize(cast<VectorType>(Ty)->getElementType());
+    Align *= cast<VectorType>(Ty)->getNumElements();
+    Align = PowerOf2Ceil(Align);
+    return Align;
+   }
 
   // If we still couldn't find a reasonable default alignment, fall back
   // to a simple heuristic that the alignment is the first power of two
@@ -517,15 +514,9 @@ unsigned DataLayout::getAlignmentInfo(AlignTypeEnum AlignType,
   // approximation of reality, and if the user wanted something less
   // less conservative, they should have specified it explicitly in the data
   // layout.
-  if (BestMatchIdx == -1) {
-    unsigned Align = getTypeStoreSize(Ty);
-    Align = PowerOf2Ceil(Align);
-    return Align;
-  }
-
-  // Since we got a "best match" index, just return it.
-  return ABIInfo ? Alignments[BestMatchIdx].ABIAlign
-                 : Alignments[BestMatchIdx].PrefAlign;
+  unsigned Align = getTypeStoreSize(Ty);
+  Align = PowerOf2Ceil(Align);
+  return Align;
 }
 
 namespace {
diff --git a/lib/IR/DebugInfo.cpp b/lib/IR/DebugInfo.cpp
index 3bdd5a6bd95c8f6aa70c92f60037f1a0c793f09d..c5d39c5443049d0edf70a27349bf438db64bc635 100644
--- a/lib/IR/DebugInfo.cpp
+++ b/lib/IR/DebugInfo.cpp
@@ -79,9 +79,19 @@ void DebugInfoFinder::processModule(const Module &M) {
         processScope(M->getScope());
     }
   }
-  for (auto &F : M.functions())
+  for (auto &F : M.functions()) {
     if (auto *SP = cast_or_null<DISubprogram>(F.getSubprogram()))
       processSubprogram(SP);
+    // There could be subprograms from inlined functions referenced from
+    // instructions only. Walk the function to find them.
+    for (const BasicBlock &BB : F) {
+      for (const Instruction &I : BB) {
+        if (!I.getDebugLoc())
+          continue;
+        processLocation(M, I.getDebugLoc().get());
+      }
+    }
+  }
 }
 
 void DebugInfoFinder::processLocation(const Module &M, const DILocation *Loc) {
@@ -241,26 +251,29 @@ bool DebugInfoFinder::addScope(DIScope *Scope) {
 
 static llvm::MDNode *stripDebugLocFromLoopID(llvm::MDNode *N) {
   assert(N->op_begin() != N->op_end() && "Missing self reference?");
-  auto DebugLocOp =
-      std::find_if(N->op_begin() + 1, N->op_end(), [](const MDOperand &Op) {
-        return isa<DILocation>(Op.get());
-      });
 
-  // No debug location, we do not have to rewrite this MDNode.
-  if (DebugLocOp == N->op_end())
+  // If there is no debug location, we do not have to rewrite this MDNode.
+  if (std::none_of(N->op_begin() + 1, N->op_end(), [](const MDOperand &Op) {
+        return isa<DILocation>(Op.get());
+      }))
     return N;
 
-  // There is only the debug location without any actual loop metadata, hence we
+  // If there is only the debug location without any actual loop metadata, we
   // can remove the metadata.
-  if (N->getNumOperands() == 2)
+  if (std::none_of(N->op_begin() + 1, N->op_end(), [](const MDOperand &Op) {
+        return !isa<DILocation>(Op.get());
+      }))
     return nullptr;
 
   SmallVector<Metadata *, 4> Args;
   // Reserve operand 0 for loop id self reference.
   auto TempNode = MDNode::getTemporary(N->getContext(), None);
   Args.push_back(TempNode.get());
-  Args.append(N->op_begin() + 1, DebugLocOp);
-  Args.append(DebugLocOp + 1, N->op_end());
+  // Add all non-debug location operands back.
+  for (auto Op = N->op_begin() + 1; Op != N->op_end(); Op++) {
+    if (!isa<DILocation>(*Op))
+      Args.push_back(*Op);
+  }
 
   // Set the first operand to itself.
   MDNode *LoopID = MDNode::get(N->getContext(), Args);
@@ -449,7 +462,8 @@ private:
         CU->isOptimized(), CU->getFlags(), CU->getRuntimeVersion(),
         CU->getSplitDebugFilename(), DICompileUnit::LineTablesOnly, EnumTypes,
         RetainedTypes, GlobalVariables, ImportedEntities, CU->getMacros(),
-        CU->getDWOId(), CU->getSplitDebugInlining());
+        CU->getDWOId(), CU->getSplitDebugInlining(),
+        CU->getDebugInfoForProfiling());
   }
 
   DILocation *getReplacementMDLocation(DILocation *MLD) {
@@ -597,17 +611,26 @@ bool llvm::stripNonLineTableDebugInfo(Module &M) {
     }
     for (auto &BB : F) {
       for (auto &I : BB) {
-        if (I.getDebugLoc() == DebugLoc())
-          continue;
-
-        // Make a replacement.
-        auto &DL = I.getDebugLoc();
-        auto *Scope = DL.getScope();
-        MDNode *InlinedAt = DL.getInlinedAt();
-        Scope = remap(Scope);
-        InlinedAt = remap(InlinedAt);
-        I.setDebugLoc(
-            DebugLoc::get(DL.getLine(), DL.getCol(), Scope, InlinedAt));
+        auto remapDebugLoc = [&](DebugLoc DL) -> DebugLoc {
+          auto *Scope = DL.getScope();
+          MDNode *InlinedAt = DL.getInlinedAt();
+          Scope = remap(Scope);
+          InlinedAt = remap(InlinedAt);
+          return DebugLoc::get(DL.getLine(), DL.getCol(), Scope, InlinedAt);
+        };
+
+        if (I.getDebugLoc() != DebugLoc())
+          I.setDebugLoc(remapDebugLoc(I.getDebugLoc()));
+
+        // Remap DILocations in untyped MDNodes (e.g., llvm.loop).
+        SmallVector<std::pair<unsigned, MDNode *>, 2> MDs;
+        I.getAllMetadata(MDs);
+        for (auto Attachment : MDs)
+          if (auto *T = dyn_cast_or_null<MDTuple>(Attachment.second))
+            for (unsigned N = 0; N < T->getNumOperands(); ++N)
+              if (auto *Loc = dyn_cast_or_null<DILocation>(T->getOperand(N)))
+                if (Loc != DebugLoc())
+                  T->replaceOperandWith(N, remapDebugLoc(Loc));
       }
     }
   }
diff --git a/lib/IR/DebugInfoMetadata.cpp b/lib/IR/DebugInfoMetadata.cpp
index 8e21a907e15ed2538e347aa9892f22695cc58f8e..d14c6018d40990de7fb21ad3286beb28f21f82b3 100644
--- a/lib/IR/DebugInfoMetadata.cpp
+++ b/lib/IR/DebugInfoMetadata.cpp
@@ -245,16 +245,18 @@ DIBasicType *DIBasicType::getImpl(LLVMContext &Context, unsigned Tag,
 DIDerivedType *DIDerivedType::getImpl(
     LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *File,
     unsigned Line, Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
-    uint32_t AlignInBits, uint64_t OffsetInBits, DIFlags Flags,
-    Metadata *ExtraData, StorageType Storage, bool ShouldCreate) {
+    uint32_t AlignInBits, uint64_t OffsetInBits,
+    Optional<unsigned> DWARFAddressSpace, DIFlags Flags, Metadata *ExtraData,
+    StorageType Storage, bool ShouldCreate) {
   assert(isCanonical(Name) && "Expected canonical MDString");
   DEFINE_GETIMPL_LOOKUP(DIDerivedType,
                         (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
-                         AlignInBits, OffsetInBits, Flags, ExtraData));
+                         AlignInBits, OffsetInBits, DWARFAddressSpace, Flags,
+                         ExtraData));
   Metadata *Ops[] = {File, Scope, Name, BaseType, ExtraData};
   DEFINE_GETIMPL_STORE(
-      DIDerivedType, (Tag, Line, SizeInBits, AlignInBits, OffsetInBits, Flags),
-      Ops);
+      DIDerivedType, (Tag, Line, SizeInBits, AlignInBits, OffsetInBits,
+                      DWARFAddressSpace, Flags), Ops);
 }
 
 DICompositeType *DICompositeType::getImpl(
@@ -383,8 +385,8 @@ DICompileUnit *DICompileUnit::getImpl(
     unsigned RuntimeVersion, MDString *SplitDebugFilename,
     unsigned EmissionKind, Metadata *EnumTypes, Metadata *RetainedTypes,
     Metadata *GlobalVariables, Metadata *ImportedEntities, Metadata *Macros,
-    uint64_t DWOId, bool SplitDebugInlining, StorageType Storage,
-    bool ShouldCreate) {
+    uint64_t DWOId, bool SplitDebugInlining, bool DebugInfoForProfiling,
+    StorageType Storage, bool ShouldCreate) {
   assert(Storage != Uniqued && "Cannot unique DICompileUnit");
   assert(isCanonical(Producer) && "Expected canonical MDString");
   assert(isCanonical(Flags) && "Expected canonical MDString");
@@ -397,7 +399,8 @@ DICompileUnit *DICompileUnit::getImpl(
   return storeImpl(new (array_lengthof(Ops))
                        DICompileUnit(Context, Storage, SourceLanguage,
                                      IsOptimized, RuntimeVersion, EmissionKind,
-                                     DWOId, SplitDebugInlining, Ops),
+                                     DWOId, SplitDebugInlining,
+                                     DebugInfoForProfiling, Ops),
                    Storage);
 }
 
@@ -611,10 +614,23 @@ bool DIExpression::isValid() const {
         return false;
       break;
     }
+    case dwarf::DW_OP_swap: {
+      // Must be more than one implicit element on the stack.
+
+      // FIXME: A better way to implement this would be to add a local variable
+      // that keeps track of the stack depth and introduce something like a
+      // DW_LLVM_OP_implicit_location as a placeholder for the location this
+      // DIExpression is attached to, or else pass the number of implicit stack
+      // elements into isValid.
+      if (getNumElements() == 1)
+        return false;
+      break;
+    }
     case dwarf::DW_OP_constu:
     case dwarf::DW_OP_plus:
     case dwarf::DW_OP_minus:
     case dwarf::DW_OP_deref:
+    case dwarf::DW_OP_xderef:
       break;
     }
   }
diff --git a/lib/IR/DiagnosticInfo.cpp b/lib/IR/DiagnosticInfo.cpp
index f9815eb45d46dc2e6d9acdf4eccedc6c3745a9dd..395b6158e0c8666f62ffd0a11f4c539d5f0f3767 100644
--- a/lib/IR/DiagnosticInfo.cpp
+++ b/lib/IR/DiagnosticInfo.cpp
@@ -148,21 +148,31 @@ void DiagnosticInfoPGOProfile::print(DiagnosticPrinter &DP) const {
   DP << getMsg();
 }
 
-bool DiagnosticInfoWithDebugLocBase::isLocationAvailable() const {
-  return getDebugLoc();
+DiagnosticLocation::DiagnosticLocation(const DebugLoc &DL) {
+  if (!DL)
+    return;
+  Filename = DL->getFilename();
+  Line = DL->getLine();
+  Column = DL->getColumn();
 }
 
-void DiagnosticInfoWithDebugLocBase::getLocation(StringRef *Filename,
+DiagnosticLocation::DiagnosticLocation(const DISubprogram *SP) {
+  if (!SP)
+    return;
+  Filename = SP->getFilename();
+  Line = SP->getScopeLine();
+  Column = 0;
+}
+
+void DiagnosticInfoWithLocationBase::getLocation(StringRef *Filename,
                                                  unsigned *Line,
                                                  unsigned *Column) const {
-  DILocation *L = getDebugLoc();
-  assert(L != nullptr && "debug location is invalid");
-  *Filename = L->getFilename();
-  *Line = L->getLine();
-  *Column = L->getColumn();
+  *Filename = Loc.getFilename();
+  *Line = Loc.getLine();
+  *Column = Loc.getColumn();
 }
 
-const std::string DiagnosticInfoWithDebugLocBase::getLocationStr() const {
+const std::string DiagnosticInfoWithLocationBase::getLocationStr() const {
   StringRef Filename("<unknown>");
   unsigned Line = 0;
   unsigned Column = 0;
@@ -171,14 +181,14 @@ const std::string DiagnosticInfoWithDebugLocBase::getLocationStr() const {
   return (Filename + ":" + Twine(Line) + ":" + Twine(Column)).str();
 }
 
-DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, Value *V)
+DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, const Value *V)
     : Key(Key) {
   if (auto *F = dyn_cast<Function>(V)) {
     if (DISubprogram *SP = F->getSubprogram())
-      DLoc = DebugLoc::get(SP->getScopeLine(), 0, SP);
+      Loc = SP;
   }
   else if (auto *I = dyn_cast<Instruction>(V))
-    DLoc = I->getDebugLoc();
+    Loc = I->getDebugLoc();
 
   // Only include names that correspond to user variables.  FIXME: we should use
   // debug info if available to get the name of the user variable.
@@ -191,7 +201,7 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, Value *V)
     Val = I->getOpcodeName();
 }
 
-DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, Type *T)
+DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, const Type *T)
     : Key(Key) {
   raw_string_ostream OS(Val);
   OS << *T;
@@ -211,33 +221,48 @@ void DiagnosticInfoOptimizationBase::print(DiagnosticPrinter &DP) const {
 
 OptimizationRemark::OptimizationRemark(const char *PassName,
                                        StringRef RemarkName,
-                                       const DebugLoc &DLoc, Value *CodeRegion)
+                                       const DiagnosticLocation &Loc,
+                                       const Value *CodeRegion)
     : DiagnosticInfoIROptimization(
           DK_OptimizationRemark, DS_Remark, PassName, RemarkName,
-          *cast<BasicBlock>(CodeRegion)->getParent(), DLoc, CodeRegion) {}
+          *cast<BasicBlock>(CodeRegion)->getParent(), Loc, CodeRegion) {}
 
 OptimizationRemark::OptimizationRemark(const char *PassName,
-                                       StringRef RemarkName, Instruction *Inst)
+                                       StringRef RemarkName,
+                                       const Instruction *Inst)
     : DiagnosticInfoIROptimization(DK_OptimizationRemark, DS_Remark, PassName,
                                    RemarkName, *Inst->getParent()->getParent(),
                                    Inst->getDebugLoc(), Inst->getParent()) {}
 
+// Helper to allow for an assert before attempting to return an invalid
+// reference.
+static const BasicBlock &getFirstFunctionBlock(const Function *Func) {
+  assert(!Func->empty() && "Function does not have a body");
+  return Func->front();
+}
+
+OptimizationRemark::OptimizationRemark(const char *PassName,
+                                       StringRef RemarkName,
+                                       const Function *Func)
+    : DiagnosticInfoIROptimization(DK_OptimizationRemark, DS_Remark, PassName,
+                                   RemarkName, *Func, Func->getSubprogram(),
+                                   &getFirstFunctionBlock(Func)) {}
+
 bool OptimizationRemark::isEnabled(StringRef PassName) {
   return PassRemarksOptLoc.Pattern &&
          PassRemarksOptLoc.Pattern->match(PassName);
 }
 
-OptimizationRemarkMissed::OptimizationRemarkMissed(const char *PassName,
-                                                   StringRef RemarkName,
-                                                   const DebugLoc &DLoc,
-                                                   Value *CodeRegion)
+OptimizationRemarkMissed::OptimizationRemarkMissed(
+    const char *PassName, StringRef RemarkName, const DiagnosticLocation &Loc,
+    const Value *CodeRegion)
     : DiagnosticInfoIROptimization(
           DK_OptimizationRemarkMissed, DS_Remark, PassName, RemarkName,
-          *cast<BasicBlock>(CodeRegion)->getParent(), DLoc, CodeRegion) {}
+          *cast<BasicBlock>(CodeRegion)->getParent(), Loc, CodeRegion) {}
 
 OptimizationRemarkMissed::OptimizationRemarkMissed(const char *PassName,
                                                    StringRef RemarkName,
-                                                   Instruction *Inst)
+                                                   const Instruction *Inst)
     : DiagnosticInfoIROptimization(DK_OptimizationRemarkMissed, DS_Remark,
                                    PassName, RemarkName,
                                    *Inst->getParent()->getParent(),
@@ -248,30 +273,27 @@ bool OptimizationRemarkMissed::isEnabled(StringRef PassName) {
          PassRemarksMissedOptLoc.Pattern->match(PassName);
 }
 
-OptimizationRemarkAnalysis::OptimizationRemarkAnalysis(const char *PassName,
-                                                       StringRef RemarkName,
-                                                       const DebugLoc &DLoc,
-                                                       Value *CodeRegion)
+OptimizationRemarkAnalysis::OptimizationRemarkAnalysis(
+    const char *PassName, StringRef RemarkName, const DiagnosticLocation &Loc,
+    const Value *CodeRegion)
     : DiagnosticInfoIROptimization(
           DK_OptimizationRemarkAnalysis, DS_Remark, PassName, RemarkName,
-          *cast<BasicBlock>(CodeRegion)->getParent(), DLoc, CodeRegion) {}
+          *cast<BasicBlock>(CodeRegion)->getParent(), Loc, CodeRegion) {}
 
 OptimizationRemarkAnalysis::OptimizationRemarkAnalysis(const char *PassName,
                                                        StringRef RemarkName,
-                                                       Instruction *Inst)
+                                                       const Instruction *Inst)
     : DiagnosticInfoIROptimization(DK_OptimizationRemarkAnalysis, DS_Remark,
                                    PassName, RemarkName,
                                    *Inst->getParent()->getParent(),
                                    Inst->getDebugLoc(), Inst->getParent()) {}
 
-OptimizationRemarkAnalysis::OptimizationRemarkAnalysis(enum DiagnosticKind Kind,
-                                                       const char *PassName,
-                                                       StringRef RemarkName,
-                                                       const DebugLoc &DLoc,
-                                                       Value *CodeRegion)
+OptimizationRemarkAnalysis::OptimizationRemarkAnalysis(
+    enum DiagnosticKind Kind, const char *PassName, StringRef RemarkName,
+    const DiagnosticLocation &Loc, const Value *CodeRegion)
     : DiagnosticInfoIROptimization(Kind, DS_Remark, PassName, RemarkName,
                                    *cast<BasicBlock>(CodeRegion)->getParent(),
-                                   DLoc, CodeRegion) {}
+                                   Loc, CodeRegion) {}
 
 bool OptimizationRemarkAnalysis::isEnabled(StringRef PassName) {
   return PassRemarksAnalysisOptLoc.Pattern &&
@@ -283,42 +305,48 @@ void DiagnosticInfoMIRParser::print(DiagnosticPrinter &DP) const {
 }
 
 void llvm::emitOptimizationRemark(LLVMContext &Ctx, const char *PassName,
-                                  const Function &Fn, const DebugLoc &DLoc,
+                                  const Function &Fn,
+                                  const DiagnosticLocation &Loc,
                                   const Twine &Msg) {
-  Ctx.diagnose(OptimizationRemark(PassName, Fn, DLoc, Msg));
+  Ctx.diagnose(OptimizationRemark(PassName, Fn, Loc, Msg));
 }
 
 void llvm::emitOptimizationRemarkMissed(LLVMContext &Ctx, const char *PassName,
                                         const Function &Fn,
-                                        const DebugLoc &DLoc,
+                                        const DiagnosticLocation &Loc,
                                         const Twine &Msg) {
-  Ctx.diagnose(OptimizationRemarkMissed(PassName, Fn, DLoc, Msg));
+  Ctx.diagnose(OptimizationRemarkMissed(PassName, Fn, Loc, Msg));
 }
 
 void llvm::emitOptimizationRemarkAnalysis(LLVMContext &Ctx,
                                           const char *PassName,
                                           const Function &Fn,
-                                          const DebugLoc &DLoc,
+                                          const DiagnosticLocation &Loc,
                                           const Twine &Msg) {
-  Ctx.diagnose(OptimizationRemarkAnalysis(PassName, Fn, DLoc, Msg));
+  Ctx.diagnose(OptimizationRemarkAnalysis(PassName, Fn, Loc, Msg));
 }
 
-void llvm::emitOptimizationRemarkAnalysisFPCommute(LLVMContext &Ctx,
-                                                   const char *PassName,
-                                                   const Function &Fn,
-                                                   const DebugLoc &DLoc,
-                                                   const Twine &Msg) {
-  Ctx.diagnose(OptimizationRemarkAnalysisFPCommute(PassName, Fn, DLoc, Msg));
+void llvm::emitOptimizationRemarkAnalysisFPCommute(
+    LLVMContext &Ctx, const char *PassName, const Function &Fn,
+    const DiagnosticLocation &Loc, const Twine &Msg) {
+  Ctx.diagnose(OptimizationRemarkAnalysisFPCommute(PassName, Fn, Loc, Msg));
 }
 
 void llvm::emitOptimizationRemarkAnalysisAliasing(LLVMContext &Ctx,
                                                   const char *PassName,
                                                   const Function &Fn,
-                                                  const DebugLoc &DLoc,
+                                                  const DiagnosticLocation &Loc,
                                                   const Twine &Msg) {
-  Ctx.diagnose(OptimizationRemarkAnalysisAliasing(PassName, Fn, DLoc, Msg));
+  Ctx.diagnose(OptimizationRemarkAnalysisAliasing(PassName, Fn, Loc, Msg));
 }
 
+DiagnosticInfoOptimizationFailure::DiagnosticInfoOptimizationFailure(
+    const char *PassName, StringRef RemarkName, const DiagnosticLocation &Loc,
+    const Value *CodeRegion)
+    : DiagnosticInfoIROptimization(
+          DK_OptimizationFailure, DS_Warning, PassName, RemarkName,
+          *cast<BasicBlock>(CodeRegion)->getParent(), Loc, CodeRegion) {}
+
 bool DiagnosticInfoOptimizationFailure::isEnabled() const {
   // Only print warnings.
   return getSeverity() == DS_Warning;
@@ -334,18 +362,6 @@ void DiagnosticInfoUnsupported::print(DiagnosticPrinter &DP) const {
   DP << Str;
 }
 
-void llvm::emitLoopVectorizeWarning(LLVMContext &Ctx, const Function &Fn,
-                                    const DebugLoc &DLoc, const Twine &Msg) {
-  Ctx.diagnose(DiagnosticInfoOptimizationFailure(
-      Fn, DLoc, Twine("loop not vectorized: " + Msg)));
-}
-
-void llvm::emitLoopInterleaveWarning(LLVMContext &Ctx, const Function &Fn,
-                                     const DebugLoc &DLoc, const Twine &Msg) {
-  Ctx.diagnose(DiagnosticInfoOptimizationFailure(
-      Fn, DLoc, Twine("loop not interleaved: " + Msg)));
-}
-
 void DiagnosticInfoISelFallback::print(DiagnosticPrinter &DP) const {
   DP << "Instruction selection used fallback path for " << getFunction();
 }
diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp
index 05419aa3d2bbcee784933c83ccb8b254dcbe447d..3953a6e1352743d34bcf4e9617dfda5f43219ad8 100644
--- a/lib/IR/Function.cpp
+++ b/lib/IR/Function.cpp
@@ -30,7 +30,6 @@ using namespace llvm;
 
 // Explicit instantiations of SymbolTableListTraits since some of the methods
 // are not in the public header file...
-template class llvm::SymbolTableListTraits<Argument>;
 template class llvm::SymbolTableListTraits<BasicBlock>;
 
 //===----------------------------------------------------------------------===//
@@ -39,12 +38,8 @@ template class llvm::SymbolTableListTraits<BasicBlock>;
 
 void Argument::anchor() { }
 
-Argument::Argument(Type *Ty, const Twine &Name, Function *Par)
-  : Value(Ty, Value::ArgumentVal) {
-  Parent = nullptr;
-
-  if (Par)
-    Par->getArgumentList().push_back(this);
+Argument::Argument(Type *Ty, const Twine &Name, Function *Par, unsigned ArgNo)
+    : Value(Ty, Value::ArgumentVal), Parent(Par), ArgNo(ArgNo) {
   setName(Name);
 }
 
@@ -52,23 +47,6 @@ void Argument::setParent(Function *parent) {
   Parent = parent;
 }
 
-/// getArgNo - Return the index of this formal argument in its containing
-/// function.  For example in "void foo(int a, float b)" a is 0 and b is 1.
-unsigned Argument::getArgNo() const {
-  const Function *F = getParent();
-  assert(F && "Argument is not in a function");
-
-  Function::const_arg_iterator AI = F->arg_begin();
-  unsigned ArgIdx = 0;
-  for (; &*AI != this; ++AI)
-    ++ArgIdx;
-
-  return ArgIdx;
-}
-
-/// hasNonNullAttr - Return true if this argument has the nonnull attribute on
-/// it in its containing function. Also returns true if at least one byte is
-/// known to be dereferenceable and the pointer is in addrspace(0).
 bool Argument::hasNonNullAttr() const {
   if (!getType()->isPointerTy()) return false;
   if (getParent()->getAttributes().
@@ -80,8 +58,6 @@ bool Argument::hasNonNullAttr() const {
   return false;
 }
 
-/// hasByValAttr - Return true if this argument has the byval attribute on it
-/// in its containing function.
 bool Argument::hasByValAttr() const {
   if (!getType()->isPointerTy()) return false;
   return hasAttribute(Attribute::ByVal);
@@ -97,8 +73,6 @@ bool Argument::hasSwiftErrorAttr() const {
     hasAttribute(getArgNo()+1, Attribute::SwiftError);
 }
 
-/// \brief Return true if this argument has the inalloca attribute on it in
-/// its containing function.
 bool Argument::hasInAllocaAttr() const {
   if (!getType()->isPointerTy()) return false;
   return hasAttribute(Attribute::InAlloca);
@@ -106,7 +80,7 @@ bool Argument::hasInAllocaAttr() const {
 
 bool Argument::hasByValOrInAllocaAttr() const {
   if (!getType()->isPointerTy()) return false;
-  AttributeSet Attrs = getParent()->getAttributes();
+  AttributeList Attrs = getParent()->getAttributes();
   return Attrs.hasAttribute(getArgNo() + 1, Attribute::ByVal) ||
          Attrs.hasAttribute(getArgNo() + 1, Attribute::InAlloca);
 }
@@ -129,54 +103,38 @@ uint64_t Argument::getDereferenceableOrNullBytes() const {
   return getParent()->getDereferenceableOrNullBytes(getArgNo()+1);
 }
 
-/// hasNestAttr - Return true if this argument has the nest attribute on
-/// it in its containing function.
 bool Argument::hasNestAttr() const {
   if (!getType()->isPointerTy()) return false;
   return hasAttribute(Attribute::Nest);
 }
 
-/// hasNoAliasAttr - Return true if this argument has the noalias attribute on
-/// it in its containing function.
 bool Argument::hasNoAliasAttr() const {
   if (!getType()->isPointerTy()) return false;
   return hasAttribute(Attribute::NoAlias);
 }
 
-/// hasNoCaptureAttr - Return true if this argument has the nocapture attribute
-/// on it in its containing function.
 bool Argument::hasNoCaptureAttr() const {
   if (!getType()->isPointerTy()) return false;
   return hasAttribute(Attribute::NoCapture);
 }
 
-/// hasSRetAttr - Return true if this argument has the sret attribute on
-/// it in its containing function.
 bool Argument::hasStructRetAttr() const {
   if (!getType()->isPointerTy()) return false;
   return hasAttribute(Attribute::StructRet);
 }
 
-/// hasReturnedAttr - Return true if this argument has the returned attribute on
-/// it in its containing function.
 bool Argument::hasReturnedAttr() const {
   return hasAttribute(Attribute::Returned);
 }
 
-/// hasZExtAttr - Return true if this argument has the zext attribute on it in
-/// its containing function.
 bool Argument::hasZExtAttr() const {
   return hasAttribute(Attribute::ZExt);
 }
 
-/// hasSExtAttr Return true if this argument has the sext attribute on it in its
-/// containing function.
 bool Argument::hasSExtAttr() const {
   return hasAttribute(Attribute::SExt);
 }
 
-/// Return true if this argument has the readonly or readnone attribute on it
-/// in its containing function.
 bool Argument::onlyReadsMemory() const {
   return getParent()->getAttributes().
       hasAttribute(getArgNo()+1, Attribute::ReadOnly) ||
@@ -184,27 +142,24 @@ bool Argument::onlyReadsMemory() const {
       hasAttribute(getArgNo()+1, Attribute::ReadNone);
 }
 
-/// addAttr - Add attributes to an argument.
-void Argument::addAttr(AttributeSet AS) {
+void Argument::addAttr(AttributeList AS) {
   assert(AS.getNumSlots() <= 1 &&
          "Trying to add more than one attribute set to an argument!");
   AttrBuilder B(AS, AS.getSlotIndex(0));
-  getParent()->addAttributes(getArgNo() + 1,
-                             AttributeSet::get(Parent->getContext(),
-                                               getArgNo() + 1, B));
+  getParent()->addAttributes(
+      getArgNo() + 1,
+      AttributeList::get(Parent->getContext(), getArgNo() + 1, B));
 }
 
-/// removeAttr - Remove attributes from an argument.
-void Argument::removeAttr(AttributeSet AS) {
+void Argument::removeAttr(AttributeList AS) {
   assert(AS.getNumSlots() <= 1 &&
          "Trying to remove more than one attribute set from an argument!");
   AttrBuilder B(AS, AS.getSlotIndex(0));
-  getParent()->removeAttributes(getArgNo() + 1,
-                                AttributeSet::get(Parent->getContext(),
-                                                  getArgNo() + 1, B));
+  getParent()->removeAttributes(
+      getArgNo() + 1,
+      AttributeList::get(Parent->getContext(), getArgNo() + 1, B));
 }
 
-/// hasAttribute - Checks if an argument has a given attribute.
 bool Argument::hasAttribute(Attribute::AttrKind Kind) const {
   return getParent()->hasAttribute(getArgNo() + 1, Kind);
 }
@@ -213,32 +168,10 @@ bool Argument::hasAttribute(Attribute::AttrKind Kind) const {
 // Helper Methods in Function
 //===----------------------------------------------------------------------===//
 
-bool Function::isMaterializable() const {
-  return getGlobalObjectSubClassData() & (1 << IsMaterializableBit);
-}
-
-void Function::setIsMaterializable(bool V) {
-  unsigned Mask = 1 << IsMaterializableBit;
-  setGlobalObjectSubClassData((~Mask & getGlobalObjectSubClassData()) |
-                              (V ? Mask : 0u));
-}
-
 LLVMContext &Function::getContext() const {
   return getType()->getContext();
 }
 
-FunctionType *Function::getFunctionType() const {
-  return cast<FunctionType>(getValueType());
-}
-
-bool Function::isVarArg() const {
-  return getFunctionType()->isVarArg();
-}
-
-Type *Function::getReturnType() const {
-  return getFunctionType()->getReturnType();
-}
-
 void Function::removeFromParent() {
   getParent()->getFunctionList().remove(getIterator());
 }
@@ -254,7 +187,8 @@ void Function::eraseFromParent() {
 Function::Function(FunctionType *Ty, LinkageTypes Linkage, const Twine &name,
                    Module *ParentModule)
     : GlobalObject(Ty, Value::FunctionVal,
-                   OperandTraits<Function>::op_begin(this), 0, Linkage, name) {
+                   OperandTraits<Function>::op_begin(this), 0, Linkage, name),
+      Arguments(nullptr), NumArgs(Ty->getNumParams()) {
   assert(FunctionType::isValidReturnType(getReturnType()) &&
          "invalid return type");
   setGlobalObjectSubClassData(0);
@@ -282,7 +216,8 @@ Function::~Function() {
   dropAllReferences();    // After this it is safe to delete instructions.
 
   // Delete all of the method arguments and unlink from symbol table...
-  ArgumentList.clear();
+  if (Arguments)
+    clearArguments();
 
   // Remove the function from the on-the-side GC table.
   clearGC();
@@ -290,16 +225,33 @@ Function::~Function() {
 
 void Function::BuildLazyArguments() const {
   // Create the arguments vector, all arguments start out unnamed.
-  FunctionType *FT = getFunctionType();
-  for (unsigned i = 0, e = FT->getNumParams(); i != e; ++i) {
-    assert(!FT->getParamType(i)->isVoidTy() &&
-           "Cannot have void typed arguments!");
-    ArgumentList.push_back(new Argument(FT->getParamType(i)));
+  auto *FT = getFunctionType();
+  if (NumArgs > 0) {
+    Arguments = std::allocator<Argument>().allocate(NumArgs);
+    for (unsigned i = 0, e = NumArgs; i != e; ++i) {
+      Type *ArgTy = FT->getParamType(i);
+      assert(!ArgTy->isVoidTy() && "Cannot have void typed arguments!");
+      new (Arguments + i) Argument(ArgTy, "", const_cast<Function *>(this), i);
+    }
   }
 
   // Clear the lazy arguments bit.
   unsigned SDC = getSubclassDataFromValue();
   const_cast<Function*>(this)->setValueSubclassData(SDC &= ~(1<<0));
+  assert(!hasLazyArguments());
+}
+
+static MutableArrayRef<Argument> makeArgArray(Argument *Args, size_t Count) {
+  return MutableArrayRef<Argument>(Args, Count);
+}
+
+void Function::clearArguments() {
+  for (Argument &A : makeArgArray(Arguments, NumArgs)) {
+    A.setName("");
+    A.~Argument();
+  }
+  std::allocator<Argument>().deallocate(Arguments, NumArgs);
+  Arguments = nullptr;
 }
 
 void Function::stealArgumentListFrom(Function &Src) {
@@ -307,10 +259,10 @@ void Function::stealArgumentListFrom(Function &Src) {
 
   // Drop the current arguments, if any, and set the lazy argument bit.
   if (!hasLazyArguments()) {
-    assert(llvm::all_of(ArgumentList,
+    assert(llvm::all_of(makeArgArray(Arguments, NumArgs),
                         [](const Argument &A) { return A.use_empty(); }) &&
            "Expected arguments to be unused in declaration");
-    ArgumentList.clear();
+    clearArguments();
     setValueSubclassData(getSubclassDataFromValue() | (1 << 0));
   }
 
@@ -319,18 +271,26 @@ void Function::stealArgumentListFrom(Function &Src) {
     return;
 
   // Steal arguments from Src, and fix the lazy argument bits.
-  ArgumentList.splice(ArgumentList.end(), Src.ArgumentList);
+  assert(arg_size() == Src.arg_size());
+  Arguments = Src.Arguments;
+  Src.Arguments = nullptr;
+  for (Argument &A : makeArgArray(Arguments, NumArgs)) {
+    // FIXME: This does the work of transferNodesFromList inefficiently.
+    SmallString<128> Name;
+    if (A.hasName())
+      Name = A.getName();
+    if (!Name.empty())
+      A.setName("");
+    A.setParent(this);
+    if (!Name.empty())
+      A.setName(Name);
+  }
+
   setValueSubclassData(getSubclassDataFromValue() & ~(1 << 0));
+  assert(!hasLazyArguments());
   Src.setValueSubclassData(Src.getSubclassDataFromValue() | (1 << 0));
 }
 
-size_t Function::arg_size() const {
-  return getFunctionType()->getNumParams();
-}
-bool Function::arg_empty() const {
-  return getFunctionType()->getNumParams() == 0;
-}
-
 // dropAllReferences() - This function causes all the subinstructions to "let
 // go" of all references that they are maintaining.  This allows one to
 // 'delete' a whole class at a time, even though there may be circular
@@ -362,49 +322,49 @@ void Function::dropAllReferences() {
 }
 
 void Function::addAttribute(unsigned i, Attribute::AttrKind Kind) {
-  AttributeSet PAL = getAttributes();
+  AttributeList PAL = getAttributes();
   PAL = PAL.addAttribute(getContext(), i, Kind);
   setAttributes(PAL);
 }
 
 void Function::addAttribute(unsigned i, Attribute Attr) {
-  AttributeSet PAL = getAttributes();
+  AttributeList PAL = getAttributes();
   PAL = PAL.addAttribute(getContext(), i, Attr);
   setAttributes(PAL);
 }
 
-void Function::addAttributes(unsigned i, AttributeSet Attrs) {
-  AttributeSet PAL = getAttributes();
+void Function::addAttributes(unsigned i, AttributeList Attrs) {
+  AttributeList PAL = getAttributes();
   PAL = PAL.addAttributes(getContext(), i, Attrs);
   setAttributes(PAL);
 }
 
 void Function::removeAttribute(unsigned i, Attribute::AttrKind Kind) {
-  AttributeSet PAL = getAttributes();
+  AttributeList PAL = getAttributes();
   PAL = PAL.removeAttribute(getContext(), i, Kind);
   setAttributes(PAL);
 }
 
 void Function::removeAttribute(unsigned i, StringRef Kind) {
-  AttributeSet PAL = getAttributes();
+  AttributeList PAL = getAttributes();
   PAL = PAL.removeAttribute(getContext(), i, Kind);
   setAttributes(PAL);
 }
 
-void Function::removeAttributes(unsigned i, AttributeSet Attrs) {
-  AttributeSet PAL = getAttributes();
+void Function::removeAttributes(unsigned i, AttributeList Attrs) {
+  AttributeList PAL = getAttributes();
   PAL = PAL.removeAttributes(getContext(), i, Attrs);
   setAttributes(PAL);
 }
 
 void Function::addDereferenceableAttr(unsigned i, uint64_t Bytes) {
-  AttributeSet PAL = getAttributes();
+  AttributeList PAL = getAttributes();
   PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
   setAttributes(PAL);
 }
 
 void Function::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) {
-  AttributeSet PAL = getAttributes();
+  AttributeList PAL = getAttributes();
   PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes);
   setAttributes(PAL);
 }
@@ -533,10 +493,18 @@ static std::string getMangledTypeStr(Type* Ty) {
   } else if (ArrayType* ATyp = dyn_cast<ArrayType>(Ty)) {
     Result += "a" + llvm::utostr(ATyp->getNumElements()) +
       getMangledTypeStr(ATyp->getElementType());
-  } else if (StructType* STyp = dyn_cast<StructType>(Ty)) {
-    assert(!STyp->isLiteral() && "TODO: implement literal types");
-    Result += STyp->getName();
-  } else if (FunctionType* FT = dyn_cast<FunctionType>(Ty)) {
+  } else if (StructType *STyp = dyn_cast<StructType>(Ty)) {
+    if (!STyp->isLiteral()) {
+      Result += "s_";
+      Result += STyp->getName();
+    } else {
+      Result += "sl_";
+      for (auto Elem : STyp->elements())
+        Result += getMangledTypeStr(Elem);
+    }
+    // Ensure nested structs are distinguishable.
+    Result += "s";
+  } else if (FunctionType *FT = dyn_cast<FunctionType>(Ty)) {
     Result += "f_" + getMangledTypeStr(FT->getReturnType());
     for (size_t i = 0; i < FT->getNumParams(); i++)
       Result += getMangledTypeStr(FT->getParamType(i));
@@ -1279,9 +1247,10 @@ void Function::setValueSubclassDataBit(unsigned Bit, bool On) {
     setValueSubclassData(getSubclassDataFromValue() & ~(1 << Bit));
 }
 
-void Function::setEntryCount(uint64_t Count) {
+void Function::setEntryCount(uint64_t Count,
+                             const DenseSet<GlobalValue::GUID> *S) {
   MDBuilder MDB(getContext());
-  setMetadata(LLVMContext::MD_prof, MDB.createFunctionEntryCount(Count));
+  setMetadata(LLVMContext::MD_prof, MDB.createFunctionEntryCount(Count, S));
 }
 
 Optional<uint64_t> Function::getEntryCount() const {
@@ -1298,6 +1267,18 @@ Optional<uint64_t> Function::getEntryCount() const {
   return None;
 }
 
+DenseSet<GlobalValue::GUID> Function::getImportGUIDs() const {
+  DenseSet<GlobalValue::GUID> R;
+  if (MDNode *MD = getMetadata(LLVMContext::MD_prof))
+    if (MDString *MDS = dyn_cast<MDString>(MD->getOperand(0)))
+      if (MDS->getString().equals("function_entry_count"))
+        for (unsigned i = 2; i < MD->getNumOperands(); i++)
+          R.insert(mdconst::extract<ConstantInt>(MD->getOperand(i))
+                       ->getValue()
+                       .getZExtValue());
+  return R;
+}
+
 void Function::setSectionPrefix(StringRef Prefix) {
   MDBuilder MDB(getContext());
   setMetadata(LLVMContext::MD_section_prefix,
diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp
index 6f7356524d38e9193c9baa3b1a6a36d35a6226eb..5f338f58d9403677532cb22196e39a517937ab6f 100644
--- a/lib/IR/Globals.cpp
+++ b/lib/IR/Globals.cpp
@@ -93,18 +93,6 @@ void GlobalObject::setAlignment(unsigned Align) {
   assert(getAlignment() == Align && "Alignment representation error!");
 }
 
-unsigned GlobalObject::getGlobalObjectSubClassData() const {
-  unsigned ValueData = getGlobalValueSubClassData();
-  return ValueData >> GlobalObjectBits;
-}
-
-void GlobalObject::setGlobalObjectSubClassData(unsigned Val) {
-  unsigned OldData = getGlobalValueSubClassData();
-  setGlobalValueSubClassData((OldData & GlobalObjectMask) |
-                             (Val << GlobalObjectBits));
-  assert(getGlobalObjectSubClassData() == Val && "representation error");
-}
-
 void GlobalObject::copyAttributesFrom(const GlobalValue *Src) {
   GlobalValue::copyAttributesFrom(Src);
   if (const auto *GV = dyn_cast<GlobalObject>(Src)) {
@@ -152,7 +140,7 @@ StringRef GlobalValue::getSection() const {
   return cast<GlobalObject>(this)->getSection();
 }
 
-Comdat *GlobalValue::getComdat() {
+const Comdat *GlobalValue::getComdat() const {
   if (auto *GA = dyn_cast<GlobalAlias>(this)) {
     // In general we cannot compute this at the IR level, but we try.
     if (const GlobalObject *GO = GA->getBaseObject())
@@ -177,7 +165,9 @@ void GlobalObject::setSection(StringRef S) {
 
   // Get or create a stable section name string and put it in the table in the
   // context.
-  S = getContext().pImpl->SectionStrings.insert(S).first->first();
+  if (!S.empty()) {
+    S = getContext().pImpl->SectionStrings.insert(S).first->first();
+  }
   getContext().pImpl->GlobalObjectSections[this] = S;
 
   // Update the HasSectionHashEntryBit. Setting the section to the empty string
@@ -240,7 +230,7 @@ bool GlobalValue::canIncreaseAlignment() const {
   return true;
 }
 
-GlobalObject *GlobalValue::getBaseObject() {
+const GlobalObject *GlobalValue::getBaseObject() const {
   if (auto *GO = dyn_cast<GlobalObject>(this))
     return GO;
   if (auto *GA = dyn_cast<GlobalAlias>(this))
diff --git a/lib/IR/IRBuilder.cpp b/lib/IR/IRBuilder.cpp
index d3e410d6d033bbab7f2f5502ce3a975ec121d2c7..fd5ae71a2f3ccce10661be82532cdec31a17d6b3 100644
--- a/lib/IR/IRBuilder.cpp
+++ b/lib/IR/IRBuilder.cpp
@@ -172,7 +172,8 @@ CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) {
            "lifetime.start requires the size to be an i64");
   Value *Ops[] = { Size, Ptr };
   Module *M = BB->getParent()->getParent();
-  Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_start);
+  Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_start,
+                                           { Ptr->getType() });
   return createCallHelper(TheFn, Ops, this);
 }
 
@@ -187,7 +188,8 @@ CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) {
            "lifetime.end requires the size to be an i64");
   Value *Ops[] = { Size, Ptr };
   Module *M = BB->getParent()->getParent();
-  Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_end);
+  Value *TheFn = Intrinsic::getDeclaration(M, Intrinsic::lifetime_end,
+                                           { Ptr->getType() });
   return createCallHelper(TheFn, Ops, this);
 }
 
@@ -482,3 +484,11 @@ CallInst *IRBuilderBase::CreateGCRelocate(Instruction *Statepoint,
                   getInt32(DerivedOffset)};
  return createCallHelper(FnGCRelocate, Args, this, Name);
 }
+
+CallInst *IRBuilderBase::CreateBinaryIntrinsic(Intrinsic::ID ID,
+                                               Value *LHS, Value *RHS,
+                                               const Twine &Name) {
+  Module *M = BB->getParent()->getParent();
+  Function *Fn =  Intrinsic::getDeclaration(M, ID, { LHS->getType() });
+  return createCallHelper(Fn, { LHS, RHS }, this, Name);
+}
diff --git a/lib/IR/IRPrintingPasses.cpp b/lib/IR/IRPrintingPasses.cpp
index 05e206cfd6cb81bf70bdb8a7756d71be1c810401..955fdc749b2bce3b82b1827319bf92441db68154 100644
--- a/lib/IR/IRPrintingPasses.cpp
+++ b/lib/IR/IRPrintingPasses.cpp
@@ -70,6 +70,8 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
   }
+
+  StringRef getPassName() const override { return "Print Module IR"; }
 };
 
 class PrintFunctionPassWrapper : public FunctionPass {
@@ -91,6 +93,8 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
   }
+
+  StringRef getPassName() const override { return "Print Function IR"; }
 };
 
 class PrintBasicBlockPass : public BasicBlockPass {
@@ -111,6 +115,8 @@ public:
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
   }
+
+  StringRef getPassName() const override { return "Print BasicBlock IR"; }
 };
 
 }
diff --git a/lib/IR/InlineAsm.cpp b/lib/IR/InlineAsm.cpp
index 5a91185710409579b10d845fc999cb357f644f83..8feeeb65d445ef4f664140ffa7fcc94876f483b4 100644
--- a/lib/IR/InlineAsm.cpp
+++ b/lib/IR/InlineAsm.cpp
@@ -1,4 +1,4 @@
-//===-- InlineAsm.cpp - Implement the InlineAsm class ---------------------===//
+//===- InlineAsm.cpp - Implement the InlineAsm class ----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,27 +11,22 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/IR/InlineAsm.h"
 #include "ConstantsContext.h"
 #include "LLVMContextImpl.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include <algorithm>
+#include <cassert>
 #include <cctype>
-using namespace llvm;
-
-// Implement the first virtual method in this class in this file so the
-// InlineAsm vtable is emitted here.
-InlineAsm::~InlineAsm() {
-}
+#include <cstddef>
+#include <cstdlib>
 
-InlineAsm *InlineAsm::get(FunctionType *FTy, StringRef AsmString,
-                          StringRef Constraints, bool hasSideEffects,
-                          bool isAlignStack, AsmDialect asmDialect) {
-  InlineAsmKeyType Key(AsmString, Constraints, FTy, hasSideEffects,
-                       isAlignStack, asmDialect);
-  LLVMContextImpl *pImpl = FTy->getContext().pImpl;
-  return pImpl->InlineAsms.getOrCreate(PointerType::getUnqual(FTy), Key);
-}
+using namespace llvm;
 
 InlineAsm::InlineAsm(FunctionType *FTy, const std::string &asmString,
                      const std::string &constraints, bool hasSideEffects,
@@ -40,12 +35,24 @@ InlineAsm::InlineAsm(FunctionType *FTy, const std::string &asmString,
       AsmString(asmString), Constraints(constraints), FTy(FTy),
       HasSideEffects(hasSideEffects), IsAlignStack(isAlignStack),
       Dialect(asmDialect) {
-
   // Do various checks on the constraint string and type.
   assert(Verify(getFunctionType(), constraints) &&
          "Function type not legal for constraints!");
 }
 
+// Implement the first virtual method in this class in this file so the
+// InlineAsm vtable is emitted here.
+InlineAsm::~InlineAsm() = default;
+
+InlineAsm *InlineAsm::get(FunctionType *FTy, StringRef AsmString,
+                          StringRef Constraints, bool hasSideEffects,
+                          bool isAlignStack, AsmDialect asmDialect) {
+  InlineAsmKeyType Key(AsmString, Constraints, FTy, hasSideEffects,
+                       isAlignStack, asmDialect);
+  LLVMContextImpl *pImpl = FTy->getContext().pImpl;
+  return pImpl->InlineAsms.getOrCreate(PointerType::getUnqual(FTy), Key);
+}
+
 void InlineAsm::destroyConstant() {
   getType()->getContext().pImpl->InlineAsms.remove(this);
   delete this;
@@ -55,14 +62,6 @@ FunctionType *InlineAsm::getFunctionType() const {
   return FTy;
 }
     
-///Default constructor.
-InlineAsm::ConstraintInfo::ConstraintInfo() :
-  Type(isInput), isEarlyClobber(false),
-  MatchingInput(-1), isCommutative(false),
-  isIndirect(false), isMultipleAlternative(false),
-  currentAlternativeIndex(0) {
-}
-
 /// Parse - Analyze the specified string (e.g. "==&{eax}") and fill in the
 /// fields in this structure.  If the constraint string is not understood,
 /// return true, otherwise return false.
diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp
index 2fa03489081d0d8d0f39cc003840744aa4ffeda4..c26699eab4e2aa74768e237c5e8e86fb322f8f57 100644
--- a/lib/IR/Instruction.cpp
+++ b/lib/IR/Instruction.cpp
@@ -17,6 +17,7 @@
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Operator.h"
 #include "llvm/IR/Type.h"
 using namespace llvm;
@@ -59,12 +60,6 @@ const Module *Instruction::getModule() const {
   return getParent()->getModule();
 }
 
-Module *Instruction::getModule() {
-  return getParent()->getModule();
-}
-
-Function *Instruction::getFunction() { return getParent()->getParent(); }
-
 const Function *Instruction::getFunction() const {
   return getParent()->getParent();
 }
@@ -122,6 +117,29 @@ bool Instruction::hasNoSignedWrap() const {
   return cast<OverflowingBinaryOperator>(this)->hasNoSignedWrap();
 }
 
+void Instruction::dropPoisonGeneratingFlags() {
+  switch (getOpcode()) {
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+  case Instruction::Shl:
+    cast<OverflowingBinaryOperator>(this)->setHasNoUnsignedWrap(false);
+    cast<OverflowingBinaryOperator>(this)->setHasNoSignedWrap(false);
+    break;
+
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::AShr:
+  case Instruction::LShr:
+    cast<PossiblyExactOperator>(this)->setIsExact(false);
+    break;
+
+  case Instruction::GetElementPtr:
+    cast<GetElementPtrInst>(this)->setIsInBounds(false);
+    break;
+  }
+}
+
 bool Instruction::isExact() const {
   return cast<PossiblyExactOperator>(this)->isExact();
 }
@@ -186,6 +204,11 @@ bool Instruction::hasAllowReciprocal() const {
   return cast<FPMathOperator>(this)->hasAllowReciprocal();
 }
 
+bool Instruction::hasAllowContract() const {
+  assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
+  return cast<FPMathOperator>(this)->hasAllowContract();
+}
+
 FastMathFlags Instruction::getFastMathFlags() const {
   assert(isa<FPMathOperator>(this) && "getting fast-math flag on invalid op");
   return cast<FPMathOperator>(this)->getFastMathFlags();
@@ -521,17 +544,6 @@ bool Instruction::mayThrow() const {
   return isa<ResumeInst>(this);
 }
 
-/// Return true if the instruction is associative:
-///
-///   Associative operators satisfy:  x op (y op z) === (x op y) op z
-///
-/// In LLVM, the Add, Mul, And, Or, and Xor operators are associative.
-///
-bool Instruction::isAssociative(unsigned Opcode) {
-  return Opcode == And || Opcode == Or || Opcode == Xor ||
-         Opcode == Add || Opcode == Mul;
-}
-
 bool Instruction::isAssociative() const {
   unsigned Opcode = getOpcode();
   if (isAssociative(Opcode))
@@ -546,51 +558,6 @@ bool Instruction::isAssociative() const {
   }
 }
 
-/// Return true if the instruction is commutative:
-///
-///   Commutative operators satisfy: (x op y) === (y op x)
-///
-/// In LLVM, these are the associative operators, plus SetEQ and SetNE, when
-/// applied to any type.
-///
-bool Instruction::isCommutative(unsigned op) {
-  switch (op) {
-  case Add:
-  case FAdd:
-  case Mul:
-  case FMul:
-  case And:
-  case Or:
-  case Xor:
-    return true;
-  default:
-    return false;
-  }
-}
-
-/// Return true if the instruction is idempotent:
-///
-///   Idempotent operators satisfy:  x op x === x
-///
-/// In LLVM, the And and Or operators are idempotent.
-///
-bool Instruction::isIdempotent(unsigned Opcode) {
-  return Opcode == And || Opcode == Or;
-}
-
-/// Return true if the instruction is nilpotent:
-///
-///   Nilpotent operators satisfy:  x op x === Id,
-///
-///   where Id is the identity for the operator, i.e. a constant such that
-///     x op Id === x and Id op x === x for all x.
-///
-/// In LLVM, the Xor operator is nilpotent.
-///
-bool Instruction::isNilpotent(unsigned Opcode) {
-  return Opcode == Xor;
-}
-
 Instruction *Instruction::cloneImpl() const {
   llvm_unreachable("Subclass of Instruction failed to implement cloneImpl");
 }
@@ -651,3 +618,34 @@ Instruction *Instruction::clone() const {
   New->copyMetadata(*this);
   return New;
 }
+
+void Instruction::updateProfWeight(uint64_t S, uint64_t T) {
+  auto *ProfileData = getMetadata(LLVMContext::MD_prof);
+  if (ProfileData == nullptr)
+    return;
+
+  auto *ProfDataName = dyn_cast<MDString>(ProfileData->getOperand(0));
+  if (!ProfDataName || !ProfDataName->getString().equals("branch_weights"))
+    return;
+
+  SmallVector<uint32_t, 4> Weights;
+  for (unsigned i = 1; i < ProfileData->getNumOperands(); i++) {
+    // Using APInt::udiv may be expensive, but most cases should fit in 64 bits.
+    APInt Val(128, mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(i))
+                       ->getValue()
+                       .getZExtValue());
+    Val *= APInt(128, S);
+    Weights.push_back(Val.udiv(APInt(128, T)).getLimitedValue());
+  }
+  MDBuilder MDB(getContext());
+  setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+}
+
+void Instruction::setProfWeight(uint64_t W) {
+  assert((isa<CallInst>(this) || isa<InvokeInst>(this)) &&
+         "Can only set weights for call and invoke instructions");
+  SmallVector<uint32_t, 1> Weights;
+  Weights.push_back(W);
+  MDBuilder MDB(getContext());
+  setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+}
diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp
index b6792694342959424860495dad213319b35647f2..faa5ed078cf78c5727d19062982f1e5c34679532 100644
--- a/lib/IR/Instructions.cpp
+++ b/lib/IR/Instructions.cpp
@@ -307,7 +307,7 @@ CallInst::CallInst(const CallInst &CI)
     : Instruction(CI.getType(), Instruction::Call,
                   OperandTraits<CallInst>::op_end(this) - CI.getNumOperands(),
                   CI.getNumOperands()),
-      AttributeList(CI.AttributeList), FTy(CI.FTy) {
+      Attrs(CI.Attrs), FTy(CI.FTy) {
   setTailCallKind(CI.getTailCallKind());
   setCallingConv(CI.getCallingConv());
 
@@ -334,7 +334,7 @@ CallInst *CallInst::Create(CallInst *CI, ArrayRef<OperandBundleDef> OpB,
 Value *CallInst::getReturnedArgOperand() const {
   unsigned Index;
 
-  if (AttributeList.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
+  if (Attrs.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
     return getArgOperand(Index-1);
   if (const Function *F = getCalledFunction())
     if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) &&
@@ -345,37 +345,37 @@ Value *CallInst::getReturnedArgOperand() const {
 }
 
 void CallInst::addAttribute(unsigned i, Attribute::AttrKind Kind) {
-  AttributeSet PAL = getAttributes();
+  AttributeList PAL = getAttributes();
   PAL = PAL.addAttribute(getContext(), i, Kind);
   setAttributes(PAL);
 }
 
 void CallInst::addAttribute(unsigned i, Attribute Attr) {
-  AttributeSet PAL = getAttributes();
+  AttributeList PAL = getAttributes();
   PAL = PAL.addAttribute(getContext(), i, Attr);
   setAttributes(PAL);
 }
 
 void CallInst::removeAttribute(unsigned i, Attribute::AttrKind Kind) {
-  AttributeSet PAL = getAttributes();
+  AttributeList PAL = getAttributes();
   PAL = PAL.removeAttribute(getContext(), i, Kind);
   setAttributes(PAL);
 }
 
 void CallInst::removeAttribute(unsigned i, StringRef Kind) {
-  AttributeSet PAL = getAttributes();
+  AttributeList PAL = getAttributes();
   PAL = PAL.removeAttribute(getContext(), i, Kind);
   setAttributes(PAL);
 }
 
 void CallInst::addDereferenceableAttr(unsigned i, uint64_t Bytes) {
-  AttributeSet PAL = getAttributes();
+  AttributeList PAL = getAttributes();
   PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
   setAttributes(PAL);
 }
 
 void CallInst::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) {
-  AttributeSet PAL = getAttributes();
+  AttributeList PAL = getAttributes();
   PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes);
   setAttributes(PAL);
 }
@@ -383,7 +383,7 @@ void CallInst::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) {
 bool CallInst::paramHasAttr(unsigned i, Attribute::AttrKind Kind) const {
   assert(i < (getNumArgOperands() + 1) && "Param index out of bounds!");
 
-  if (AttributeList.hasAttribute(i, Kind))
+  if (Attrs.hasAttribute(i, Kind))
     return true;
   if (const Function *F = getCalledFunction())
     return F->getAttributes().hasAttribute(i, Kind);
@@ -466,7 +466,7 @@ static Instruction *createMalloc(Instruction *InsertBefore,
   Value *MallocFunc = MallocF;
   if (!MallocFunc)
     // prototype malloc as "void *malloc(size_t)"
-    MallocFunc = M->getOrInsertFunction("malloc", BPTy, IntPtrTy, nullptr);
+    MallocFunc = M->getOrInsertFunction("malloc", BPTy, IntPtrTy);
   PointerType *AllocPtrType = PointerType::getUnqual(AllocTy);
   CallInst *MCall = nullptr;
   Instruction *Result = nullptr;
@@ -560,7 +560,7 @@ static Instruction *createFree(Value *Source,
   Type *VoidTy = Type::getVoidTy(M->getContext());
   Type *IntPtrTy = Type::getInt8PtrTy(M->getContext());
   // prototype free as "void free(void*)"
-  Value *FreeFunc = M->getOrInsertFunction("free", VoidTy, IntPtrTy, nullptr);
+  Value *FreeFunc = M->getOrInsertFunction("free", VoidTy, IntPtrTy);
   CallInst *Result = nullptr;
   Value *PtrCast = Source;
   if (InsertBefore) {
@@ -646,7 +646,7 @@ InvokeInst::InvokeInst(const InvokeInst &II)
                      OperandTraits<InvokeInst>::op_end(this) -
                          II.getNumOperands(),
                      II.getNumOperands()),
-      AttributeList(II.AttributeList), FTy(II.FTy) {
+      Attrs(II.Attrs), FTy(II.FTy) {
   setCallingConv(II.getCallingConv());
   std::copy(II.op_begin(), II.op_end(), op_begin());
   std::copy(II.bundle_op_info_begin(), II.bundle_op_info_end(),
@@ -681,7 +681,7 @@ void InvokeInst::setSuccessorV(unsigned idx, BasicBlock *B) {
 Value *InvokeInst::getReturnedArgOperand() const {
   unsigned Index;
 
-  if (AttributeList.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
+  if (Attrs.hasAttrSomewhere(Attribute::Returned, &Index) && Index)
     return getArgOperand(Index-1);
   if (const Function *F = getCalledFunction())
     if (F->getAttributes().hasAttrSomewhere(Attribute::Returned, &Index) &&
@@ -694,7 +694,7 @@ Value *InvokeInst::getReturnedArgOperand() const {
 bool InvokeInst::paramHasAttr(unsigned i, Attribute::AttrKind Kind) const {
   assert(i < (getNumArgOperands() + 1) && "Param index out of bounds!");
 
-  if (AttributeList.hasAttribute(i, Kind))
+  if (Attrs.hasAttribute(i, Kind))
     return true;
   if (const Function *F = getCalledFunction())
     return F->getAttributes().hasAttribute(i, Kind);
@@ -720,37 +720,37 @@ bool InvokeInst::dataOperandHasImpliedAttr(unsigned i,
 }
 
 void InvokeInst::addAttribute(unsigned i, Attribute::AttrKind Kind) {
-  AttributeSet PAL = getAttributes();
+  AttributeList PAL = getAttributes();
   PAL = PAL.addAttribute(getContext(), i, Kind);
   setAttributes(PAL);
 }
 
 void InvokeInst::addAttribute(unsigned i, Attribute Attr) {
-  AttributeSet PAL = getAttributes();
+  AttributeList PAL = getAttributes();
   PAL = PAL.addAttribute(getContext(), i, Attr);
   setAttributes(PAL);
 }
 
 void InvokeInst::removeAttribute(unsigned i, Attribute::AttrKind Kind) {
-  AttributeSet PAL = getAttributes();
+  AttributeList PAL = getAttributes();
   PAL = PAL.removeAttribute(getContext(), i, Kind);
   setAttributes(PAL);
 }
 
 void InvokeInst::removeAttribute(unsigned i, StringRef Kind) {
-  AttributeSet PAL = getAttributes();
+  AttributeList PAL = getAttributes();
   PAL = PAL.removeAttribute(getContext(), i, Kind);
   setAttributes(PAL);
 }
 
 void InvokeInst::addDereferenceableAttr(unsigned i, uint64_t Bytes) {
-  AttributeSet PAL = getAttributes();
+  AttributeList PAL = getAttributes();
   PAL = PAL.addDereferenceableAttr(getContext(), i, Bytes);
   setAttributes(PAL);
 }
 
 void InvokeInst::addDereferenceableOrNullAttr(unsigned i, uint64_t Bytes) {
-  AttributeSet PAL = getAttributes();
+  AttributeList PAL = getAttributes();
   PAL = PAL.addDereferenceableOrNullAttr(getContext(), i, Bytes);
   setAttributes(PAL);
 }
@@ -1199,34 +1199,38 @@ static Value *getAISize(LLVMContext &Context, Value *Amt) {
   return Amt;
 }
 
-AllocaInst::AllocaInst(Type *Ty, const Twine &Name, Instruction *InsertBefore)
-    : AllocaInst(Ty, /*ArraySize=*/nullptr, Name, InsertBefore) {}
-
-AllocaInst::AllocaInst(Type *Ty, const Twine &Name, BasicBlock *InsertAtEnd)
-    : AllocaInst(Ty, /*ArraySize=*/nullptr, Name, InsertAtEnd) {}
-
-AllocaInst::AllocaInst(Type *Ty, Value *ArraySize, const Twine &Name,
+AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, const Twine &Name,
                        Instruction *InsertBefore)
-    : AllocaInst(Ty, ArraySize, /*Align=*/0, Name, InsertBefore) {}
+  : AllocaInst(Ty, AddrSpace, /*ArraySize=*/nullptr, Name, InsertBefore) {}
 
-AllocaInst::AllocaInst(Type *Ty, Value *ArraySize, const Twine &Name,
+AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, const Twine &Name,
                        BasicBlock *InsertAtEnd)
-    : AllocaInst(Ty, ArraySize, /*Align=*/0, Name, InsertAtEnd) {}
+  : AllocaInst(Ty, AddrSpace, /*ArraySize=*/nullptr, Name, InsertAtEnd) {}
 
-AllocaInst::AllocaInst(Type *Ty, Value *ArraySize, unsigned Align,
+AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
                        const Twine &Name, Instruction *InsertBefore)
-    : UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
-                       getAISize(Ty->getContext(), ArraySize), InsertBefore),
-      AllocatedType(Ty) {
+  : AllocaInst(Ty, AddrSpace, ArraySize, /*Align=*/0, Name, InsertBefore) {}
+
+AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
+                       const Twine &Name, BasicBlock *InsertAtEnd)
+  : AllocaInst(Ty, AddrSpace, ArraySize, /*Align=*/0, Name, InsertAtEnd) {}
+
+AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
+                       unsigned Align, const Twine &Name,
+                       Instruction *InsertBefore)
+  : UnaryInstruction(PointerType::get(Ty, AddrSpace), Alloca,
+                     getAISize(Ty->getContext(), ArraySize), InsertBefore),
+    AllocatedType(Ty) {
   setAlignment(Align);
   assert(!Ty->isVoidTy() && "Cannot allocate void!");
   setName(Name);
 }
 
-AllocaInst::AllocaInst(Type *Ty, Value *ArraySize, unsigned Align,
-                       const Twine &Name, BasicBlock *InsertAtEnd)
-    : UnaryInstruction(PointerType::getUnqual(Ty), Alloca,
-                       getAISize(Ty->getContext(), ArraySize), InsertAtEnd),
+AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
+                       unsigned Align, const Twine &Name,
+                       BasicBlock *InsertAtEnd)
+  : UnaryInstruction(PointerType::get(Ty, AddrSpace), Alloca,
+                     getAISize(Ty->getContext(), ArraySize), InsertAtEnd),
       AllocatedType(Ty) {
   setAlignment(Align);
   assert(!Ty->isVoidTy() && "Cannot allocate void!");
@@ -3655,16 +3659,16 @@ void SwitchInst::addCase(ConstantInt *OnVal, BasicBlock *Dest) {
   // Initialize some new operands.
   assert(OpNo+1 < ReservedSpace && "Growing didn't work!");
   setNumHungOffUseOperands(OpNo+2);
-  CaseIt Case(this, NewCaseIdx);
+  CaseHandle Case(this, NewCaseIdx);
   Case.setValue(OnVal);
   Case.setSuccessor(Dest);
 }
 
 /// removeCase - This method removes the specified case and its successor
 /// from the switch instruction.
-void SwitchInst::removeCase(CaseIt i) {
-  unsigned idx = i.getCaseIndex();
-  
+SwitchInst::CaseIt SwitchInst::removeCase(CaseIt I) {
+  unsigned idx = I->getCaseIndex();
+
   assert(2 + idx*2 < getNumOperands() && "Case index out of range!!!");
 
   unsigned NumOps = getNumOperands();
@@ -3680,6 +3684,8 @@ void SwitchInst::removeCase(CaseIt i) {
   OL[NumOps-2].set(nullptr);
   OL[NumOps-2+1].set(nullptr);
   setNumHungOffUseOperands(NumOps-2);
+
+  return CaseIt(this, idx);
 }
 
 /// growOperands - grow operands - This grows the operand list in response
@@ -3826,6 +3832,7 @@ InsertValueInst *InsertValueInst::cloneImpl() const {
 
 AllocaInst *AllocaInst::cloneImpl() const {
   AllocaInst *Result = new AllocaInst(getAllocatedType(),
+                                      getType()->getAddressSpace(),
                                       (Value *)getOperand(0), getAlignment());
   Result->setUsedWithInAlloca(isUsedWithInAlloca());
   Result->setSwiftError(isSwiftError());
diff --git a/lib/IR/LLVMContext.cpp b/lib/IR/LLVMContext.cpp
index dd66f144f04fd6a8bf628346109b7dbd2ac694c1..6c6383c22255d288b800e5bdb0433009473f14c4 100644
--- a/lib/IR/LLVMContext.cpp
+++ b/lib/IR/LLVMContext.cpp
@@ -58,6 +58,7 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) {
     {MD_type, "type"},
     {MD_section_prefix, "section_prefix"},
     {MD_absolute_symbol, "absolute_symbol"},
+    {MD_associated, "associated"},
   };
 
   for (auto &MDKind : MDKinds) {
diff --git a/lib/IR/LLVMContextImpl.cpp b/lib/IR/LLVMContextImpl.cpp
index c43356c5382647345421751adf3ef12d11b30eda..343722463e5faed2bda618bb584b156a5f88c495 100644
--- a/lib/IR/LLVMContextImpl.cpp
+++ b/lib/IR/LLVMContextImpl.cpp
@@ -114,9 +114,10 @@ LLVMContextImpl::~LLVMContextImpl() {
   }
 
   // Destroy attribute lists.
-  for (FoldingSetIterator<AttributeSetImpl> I = AttrsLists.begin(),
-         E = AttrsLists.end(); I != E; ) {
-    FoldingSetIterator<AttributeSetImpl> Elem = I++;
+  for (FoldingSetIterator<AttributeListImpl> I = AttrsLists.begin(),
+                                             E = AttrsLists.end();
+       I != E;) {
+    FoldingSetIterator<AttributeListImpl> Elem = I++;
     delete &*Elem;
   }
 
diff --git a/lib/IR/LLVMContextImpl.h b/lib/IR/LLVMContextImpl.h
index 850c81cfabb2fe8e406993205359179b250aeb98..0ee0b9c0da2542b29e809776f2604706c6ca18a5 100644
--- a/lib/IR/LLVMContextImpl.h
+++ b/lib/IR/LLVMContextImpl.h
@@ -352,22 +352,26 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
   uint64_t SizeInBits;
   uint64_t OffsetInBits;
   uint32_t AlignInBits;
+  Optional<unsigned> DWARFAddressSpace;
   unsigned Flags;
   Metadata *ExtraData;
 
   MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *File, unsigned Line,
                 Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
-                uint32_t AlignInBits, uint64_t OffsetInBits, unsigned Flags,
+                uint32_t AlignInBits, uint64_t OffsetInBits,
+                Optional<unsigned> DWARFAddressSpace, unsigned Flags,
                 Metadata *ExtraData)
       : Tag(Tag), Name(Name), File(File), Line(Line), Scope(Scope),
         BaseType(BaseType), SizeInBits(SizeInBits), OffsetInBits(OffsetInBits),
-        AlignInBits(AlignInBits), Flags(Flags), ExtraData(ExtraData) {}
+        AlignInBits(AlignInBits), DWARFAddressSpace(DWARFAddressSpace),
+        Flags(Flags), ExtraData(ExtraData) {}
   MDNodeKeyImpl(const DIDerivedType *N)
       : Tag(N->getTag()), Name(N->getRawName()), File(N->getRawFile()),
         Line(N->getLine()), Scope(N->getRawScope()),
         BaseType(N->getRawBaseType()), SizeInBits(N->getSizeInBits()),
         OffsetInBits(N->getOffsetInBits()), AlignInBits(N->getAlignInBits()),
-        Flags(N->getFlags()), ExtraData(N->getRawExtraData()) {}
+        DWARFAddressSpace(N->getDWARFAddressSpace()), Flags(N->getFlags()),
+        ExtraData(N->getRawExtraData()) {}
 
   bool isKeyOf(const DIDerivedType *RHS) const {
     return Tag == RHS->getTag() && Name == RHS->getRawName() &&
@@ -375,7 +379,9 @@ template <> struct MDNodeKeyImpl<DIDerivedType> {
            Scope == RHS->getRawScope() && BaseType == RHS->getRawBaseType() &&
            SizeInBits == RHS->getSizeInBits() &&
            AlignInBits == RHS->getAlignInBits() &&
-           OffsetInBits == RHS->getOffsetInBits() && Flags == RHS->getFlags() &&
+           OffsetInBits == RHS->getOffsetInBits() &&
+           DWARFAddressSpace == RHS->getDWARFAddressSpace() &&
+           Flags == RHS->getFlags() &&
            ExtraData == RHS->getRawExtraData();
   }
   unsigned getHashValue() const {
@@ -612,17 +618,19 @@ template <> struct MDNodeSubsetEqualImpl<DISubprogram> {
   typedef MDNodeKeyImpl<DISubprogram> KeyTy;
   static bool isSubsetEqual(const KeyTy &LHS, const DISubprogram *RHS) {
     return isDeclarationOfODRMember(LHS.IsDefinition, LHS.Scope,
-                                    LHS.LinkageName, RHS);
+                                    LHS.LinkageName, LHS.TemplateParams, RHS);
   }
   static bool isSubsetEqual(const DISubprogram *LHS, const DISubprogram *RHS) {
     return isDeclarationOfODRMember(LHS->isDefinition(), LHS->getRawScope(),
-                                    LHS->getRawLinkageName(), RHS);
+                                    LHS->getRawLinkageName(),
+                                    LHS->getRawTemplateParams(), RHS);
   }
 
   /// Subprograms compare equal if they declare the same function in an ODR
   /// type.
   static bool isDeclarationOfODRMember(bool IsDefinition, const Metadata *Scope,
                                        const MDString *LinkageName,
+                                       const Metadata *TemplateParams,
                                        const DISubprogram *RHS) {
     // Check whether the LHS is eligible.
     if (IsDefinition || !Scope || !LinkageName)
@@ -633,8 +641,14 @@ template <> struct MDNodeSubsetEqualImpl<DISubprogram> {
       return false;
 
     // Compare to the RHS.
+    // FIXME: We need to compare template parameters here to avoid incorrect
+    // collisions in mapMetadata when RF_MoveDistinctMDs and an ODR-DISubprogram
+    // has a non-ODR template parameter (i.e., a DICompositeType that does not
+    // have an identifier). Eventually we should decouple ODR logic from
+    // uniquing logic.
     return IsDefinition == RHS->isDefinition() && Scope == RHS->getRawScope() &&
-           LinkageName == RHS->getRawLinkageName();
+           LinkageName == RHS->getRawLinkageName() &&
+           TemplateParams == RHS->getRawTemplateParams();
   }
 };
 
@@ -1105,7 +1119,7 @@ public:
   FPMapTy FPConstants;
 
   FoldingSet<AttributeImpl> AttrsSet;
-  FoldingSet<AttributeSetImpl> AttrsLists;
+  FoldingSet<AttributeListImpl> AttrsLists;
   FoldingSet<AttributeSetNode> AttrsSetNodes;
 
   StringMap<MDString, BumpPtrAllocator> MDStringCache;
diff --git a/lib/IR/MDBuilder.cpp b/lib/IR/MDBuilder.cpp
index f4bfd59921516ae199217048b725483c58c6f734..b9c4f482adf577b87c3446043ef895780045d0c5 100644
--- a/lib/IR/MDBuilder.cpp
+++ b/lib/IR/MDBuilder.cpp
@@ -56,11 +56,22 @@ MDNode *MDBuilder::createUnpredictable() {
   return MDNode::get(Context, None);
 }
 
-MDNode *MDBuilder::createFunctionEntryCount(uint64_t Count) {
+MDNode *MDBuilder::createFunctionEntryCount(
+    uint64_t Count, const DenseSet<GlobalValue::GUID> *Imports) {
   Type *Int64Ty = Type::getInt64Ty(Context);
-  return MDNode::get(Context,
-                     {createString("function_entry_count"),
-                      createConstant(ConstantInt::get(Int64Ty, Count))});
+  SmallVector<Metadata *, 8> Ops;
+  Ops.push_back(createString("function_entry_count"));
+  Ops.push_back(createConstant(ConstantInt::get(Int64Ty, Count)));
+  if (Imports) {
+    // DenseSet iteration order is unspecified, so sort the GUIDs first to
+    // keep the operand order of the emitted node deterministic across runs.
+    SmallVector<GlobalValue::GUID, 8> OrderedIDs(Imports->begin(),
+                                                 Imports->end());
+    std::sort(OrderedIDs.begin(), OrderedIDs.end());
+    for (auto ID : OrderedIDs)
+      Ops.push_back(createConstant(ConstantInt::get(Int64Ty, ID)));
+  }
+  return MDNode::get(Context, Ops);
 }
 
 MDNode *MDBuilder::createFunctionSectionPrefix(StringRef Prefix) {
diff --git a/lib/IR/Mangler.cpp b/lib/IR/Mangler.cpp
index 41e11b3945e40b5c573657dcf294251513e622ac..03723bfd2ddb7b8d27e418fd315951fc7f7fd926 100644
--- a/lib/IR/Mangler.cpp
+++ b/lib/IR/Mangler.cpp
@@ -13,6 +13,7 @@
 
 #include "llvm/IR/Mangler.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
@@ -172,3 +173,34 @@ void Mangler::getNameWithPrefix(SmallVectorImpl<char> &OutName,
   raw_svector_ostream OS(OutName);
   getNameWithPrefix(OS, GV, CannotUsePrivateLabel);
 }
+
+void llvm::emitLinkerFlagsForGlobalCOFF(raw_ostream &OS, const GlobalValue *GV,
+                                        const Triple &TT, Mangler &Mangler) {
+  if (!GV->hasDLLExportStorageClass() || GV->isDeclaration())
+    return;
+
+  if (TT.isKnownWindowsMSVCEnvironment())
+    OS << " /EXPORT:";
+  else
+    OS << " -export:";
+
+  if (TT.isWindowsGNUEnvironment() || TT.isWindowsCygwinEnvironment()) {
+    std::string Flag;
+    raw_string_ostream FlagOS(Flag);
+    Mangler.getNameWithPrefix(FlagOS, GV, false);
+    FlagOS.flush();
+    if (Flag[0] == GV->getParent()->getDataLayout().getGlobalPrefix())
+      OS << Flag.substr(1);
+    else
+      OS << Flag;
+  } else {
+    Mangler.getNameWithPrefix(OS, GV, false);
+  }
+
+  if (!GV->getValueType()->isFunctionTy()) {
+    if (TT.isKnownWindowsMSVCEnvironment())
+      OS << ",DATA";
+    else
+      OS << ",data";
+  }
+}
diff --git a/lib/IR/Metadata.cpp b/lib/IR/Metadata.cpp
index 1d1930459239a45e3c30df5f2423648bcc49354a..7228de3d23702d140e2254f86a24f25937fd1caf 100644
--- a/lib/IR/Metadata.cpp
+++ b/lib/IR/Metadata.cpp
@@ -11,20 +11,50 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/IR/Metadata.h"
 #include "LLVMContextImpl.h"
 #include "MetadataImpl.h"
 #include "SymbolTableListTraitsImpl.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/None.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
 #include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/Constants.h"
 #include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalObject.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/TrackingMDRef.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
 #include "llvm/IR/ValueHandle.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <tuple>
+#include <utility>
+#include <vector>
 
 using namespace llvm;
 
@@ -1027,8 +1057,7 @@ static SmallVector<TrackingMDRef, 4> &getNMDOps(void *Operands) {
 }
 
 NamedMDNode::NamedMDNode(const Twine &N)
-    : Name(N.str()), Parent(nullptr),
-      Operands(new SmallVector<TrackingMDRef, 4>()) {}
+    : Name(N.str()), Operands(new SmallVector<TrackingMDRef, 4>()) {}
 
 NamedMDNode::~NamedMDNode() {
   dropAllReferences();
@@ -1308,17 +1337,26 @@ bool Instruction::extractProfTotalWeight(uint64_t &TotalVal) const {
     return false;
 
   auto *ProfDataName = dyn_cast<MDString>(ProfileData->getOperand(0));
-  if (!ProfDataName || !ProfDataName->getString().equals("branch_weights"))
+  if (!ProfDataName)
     return false;
 
-  TotalVal = 0;
-  for (unsigned i = 1; i < ProfileData->getNumOperands(); i++) {
-    auto *V = mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(i));
-    if (!V)
-      return false;
-    TotalVal += V->getValue().getZExtValue();
+  if (ProfDataName->getString().equals("branch_weights")) {
+    TotalVal = 0;
+    for (unsigned i = 1; i < ProfileData->getNumOperands(); i++) {
+      auto *V = mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(i));
+      if (!V)
+        return false;
+      TotalVal += V->getValue().getZExtValue();
+    }
+    return true;
+  } else if (ProfDataName->getString().equals("VP") &&
+             ProfileData->getNumOperands() > 3) {
+    auto *V = mdconst::dyn_extract<ConstantInt>(ProfileData->getOperand(2));
+    if (V) // Operand 2 is the total; a non-ConstantInt operand is malformed.
+      TotalVal = V->getValue().getZExtValue();
+    return V != nullptr;
   }
-  return true;
+  return false;
 }
 
 void Instruction::clearMetadataHashEntries() {
@@ -1446,7 +1484,7 @@ void GlobalObject::addTypeMetadata(unsigned Offset, Metadata *TypeID) {
   addMetadata(
       LLVMContext::MD_type,
       *MDTuple::get(getContext(),
-                    {llvm::ConstantAsMetadata::get(llvm::ConstantInt::get(
+                    {ConstantAsMetadata::get(llvm::ConstantInt::get(
                          Type::getInt64Ty(getContext()), Offset)),
                      TypeID}));
 }
@@ -1459,6 +1497,15 @@ DISubprogram *Function::getSubprogram() const {
   return cast_or_null<DISubprogram>(getMetadata(LLVMContext::MD_dbg));
 }
 
+bool Function::isDebugInfoForProfiling() const {
+  if (DISubprogram *SP = getSubprogram()) {
+    if (DICompileUnit *CU = SP->getUnit()) {
+      return CU->getDebugInfoForProfiling();
+    }
+  }
+  return false;
+}
+
 void GlobalVariable::addDebugInfo(DIGlobalVariableExpression *GV) {
   addMetadata(LLVMContext::MD_dbg, *GV);
 }
diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp
index 1911f84340c6dffe256953252cf1b87184af7ff2..fec9df193685de4819b291dc79effe5bc0aeb103 100644
--- a/lib/IR/Module.cpp
+++ b/lib/IR/Module.cpp
@@ -120,9 +120,8 @@ void Module::getOperandBundleTags(SmallVectorImpl<StringRef> &Result) const {
 // it.  This is nice because it allows most passes to get away with not handling
 // the symbol table directly for this common task.
 //
-Constant *Module::getOrInsertFunction(StringRef Name,
-                                      FunctionType *Ty,
-                                      AttributeSet AttributeList) {
+Constant *Module::getOrInsertFunction(StringRef Name, FunctionType *Ty,
+                                      AttributeList AttributeList) {
   // See if we have a definition for the specified function already.
   GlobalValue *F = getNamedValue(Name);
   if (!F) {
@@ -145,49 +144,7 @@ Constant *Module::getOrInsertFunction(StringRef Name,
 
 Constant *Module::getOrInsertFunction(StringRef Name,
                                       FunctionType *Ty) {
-  return getOrInsertFunction(Name, Ty, AttributeSet());
-}
-
-// getOrInsertFunction - Look up the specified function in the module symbol
-// table.  If it does not exist, add a prototype for the function and return it.
-// This version of the method takes a null terminated list of function
-// arguments, which makes it easier for clients to use.
-//
-Constant *Module::getOrInsertFunction(StringRef Name,
-                                      AttributeSet AttributeList,
-                                      Type *RetTy, ...) {
-  va_list Args;
-  va_start(Args, RetTy);
-
-  // Build the list of argument types...
-  std::vector<Type*> ArgTys;
-  while (Type *ArgTy = va_arg(Args, Type*))
-    ArgTys.push_back(ArgTy);
-
-  va_end(Args);
-
-  // Build the function type and chain to the other getOrInsertFunction...
-  return getOrInsertFunction(Name,
-                             FunctionType::get(RetTy, ArgTys, false),
-                             AttributeList);
-}
-
-Constant *Module::getOrInsertFunction(StringRef Name,
-                                      Type *RetTy, ...) {
-  va_list Args;
-  va_start(Args, RetTy);
-
-  // Build the list of argument types...
-  std::vector<Type*> ArgTys;
-  while (Type *ArgTy = va_arg(Args, Type*))
-    ArgTys.push_back(ArgTy);
-
-  va_end(Args);
-
-  // Build the function type and chain to the other getOrInsertFunction...
-  return getOrInsertFunction(Name,
-                             FunctionType::get(RetTy, ArgTys, false),
-                             AttributeSet());
+  return getOrInsertFunction(Name, Ty, AttributeList());
 }
 
 // getFunction - Look up the specified function in the module symbol table.
@@ -208,7 +165,8 @@ Function *Module::getFunction(StringRef Name) const {
 /// If AllowLocal is set to true, this function will return types that
 /// have an local. By default, these types are not returned.
 ///
-GlobalVariable *Module::getGlobalVariable(StringRef Name, bool AllowLocal) {
+GlobalVariable *Module::getGlobalVariable(StringRef Name,
+                                          bool AllowLocal) const {
   if (GlobalVariable *Result =
       dyn_cast_or_null<GlobalVariable>(getNamedValue(Name)))
     if (AllowLocal || !Result->hasLocalLinkage())
@@ -465,6 +423,14 @@ void Module::dropAllReferences() {
     GIF.dropAllReferences();
 }
 
+unsigned Module::getNumberRegisterParameters() const {
+  auto *Val =
+      cast_or_null<ConstantAsMetadata>(getModuleFlag("NumRegisterParameters"));
+  if (!Val)
+    return 0;
+  return cast<ConstantInt>(Val->getValue())->getZExtValue();
+}
+
 unsigned Module::getDwarfVersion() const {
   auto *Val = cast_or_null<ConstantAsMetadata>(getModuleFlag("Dwarf Version"));
   if (!Val)
diff --git a/lib/IR/Operator.cpp b/lib/IR/Operator.cpp
index 2fba24d99b30ab4afa2cfea71f333ca784fc130c..7d819f3aae8dfd7e791a70a49c4e592857fae343 100644
--- a/lib/IR/Operator.cpp
+++ b/lib/IR/Operator.cpp
@@ -1,4 +1,18 @@
+//===-- Operator.cpp - Implement the LLVM operators -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the non-inline methods for the LLVM Operator classes.
+//
+//===----------------------------------------------------------------------===//
+
 #include "llvm/IR/Operator.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/GetElementPtrTypeIterator.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/Type.h"
diff --git a/lib/IR/OptBisect.cpp b/lib/IR/OptBisect.cpp
index e9574ca81261c720ccbd14c0e20edf83c124959a..b670c817569a9d39c86f0dde7f720fa22792f243 100644
--- a/lib/IR/OptBisect.cpp
+++ b/lib/IR/OptBisect.cpp
@@ -39,14 +39,6 @@ static void printPassMessage(const StringRef &Name, int PassNum,
          << "(" << PassNum << ") " << Name << " on " << TargetDesc << "\n";
 }
 
-static void printCaseMessage(int CaseNum, StringRef Msg, bool Running) {
-  if (Running)
-    errs() << "BISECT: running case (";
-  else
-    errs() << "BISECT: NOT running case (";
-  errs() << CaseNum << "): " << Msg << "\n";
-}
-
 static std::string getDescription(const Module &M) {
   return "module (" + M.getName().str() + ")";
 }
@@ -108,13 +100,3 @@ bool OptBisect::checkPass(const StringRef PassName,
   printPassMessage(PassName, CurBisectNum, TargetDesc, ShouldRun);
   return ShouldRun;
 }
-
-bool OptBisect::shouldRunCase(const Twine &Msg) {
-  if (!BisectEnabled)
-    return true;
-  int CurFuelNum = ++LastBisectNum;
-  bool ShouldRun = (OptBisectLimit == -1 || CurFuelNum <= OptBisectLimit);
-  printCaseMessage(CurFuelNum, Msg.str(), ShouldRun);
-  return ShouldRun;
-}
-
diff --git a/lib/IR/Statepoint.cpp b/lib/IR/Statepoint.cpp
index 63be1e780d8144a82c74361559cabcd0cd0cb8d0..8c3f0f208cc67ae591cb4e8511b41995e95c9ed6 100644
--- a/lib/IR/Statepoint.cpp
+++ b/lib/IR/Statepoint.cpp
@@ -53,18 +53,19 @@ bool llvm::isStatepointDirectiveAttr(Attribute Attr) {
          Attr.hasAttribute("statepoint-num-patch-bytes");
 }
 
-StatepointDirectives llvm::parseStatepointDirectivesFromAttrs(AttributeSet AS) {
+StatepointDirectives
+llvm::parseStatepointDirectivesFromAttrs(AttributeList AS) {
   StatepointDirectives Result;
 
   Attribute AttrID =
-      AS.getAttribute(AttributeSet::FunctionIndex, "statepoint-id");
+      AS.getAttribute(AttributeList::FunctionIndex, "statepoint-id");
   uint64_t StatepointID;
   if (AttrID.isStringAttribute())
     if (!AttrID.getValueAsString().getAsInteger(10, StatepointID))
       Result.StatepointID = StatepointID;
 
   uint32_t NumPatchBytes;
-  Attribute AttrNumPatchBytes = AS.getAttribute(AttributeSet::FunctionIndex,
+  Attribute AttrNumPatchBytes = AS.getAttribute(AttributeList::FunctionIndex,
                                                 "statepoint-num-patch-bytes");
   if (AttrNumPatchBytes.isStringAttribute())
     if (!AttrNumPatchBytes.getValueAsString().getAsInteger(10, NumPatchBytes))
diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp
index ca866738f8828f08fbd0ef860471f6c54a080774..b67b0a307861b50feee3d5fcbbd376fc12b998ef 100644
--- a/lib/IR/Type.cpp
+++ b/lib/IR/Type.cpp
@@ -41,12 +41,6 @@ Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) {
   }
 }
 
-Type *Type::getScalarType() const {
-  if (auto *VTy = dyn_cast<VectorType>(this))
-    return VTy->getElementType();
-  return const_cast<Type*>(this);
-}
-
 bool Type::isIntegerTy(unsigned Bitwidth) const {
   return isIntegerTy() && cast<IntegerType>(this)->getBitWidth() == Bitwidth;
 }
diff --git a/lib/IR/TypeFinder.cpp b/lib/IR/TypeFinder.cpp
index dc4c1cffb20c5973fb3b01052c74c60914ab3f38..a178b9ec0f09b95bd92cdeee156f27b38b66cf62 100644
--- a/lib/IR/TypeFinder.cpp
+++ b/lib/IR/TypeFinder.cpp
@@ -1,4 +1,4 @@
-//===-- TypeFinder.cpp - Implement the TypeFinder class -------------------===//
+//===- TypeFinder.cpp - Implement the TypeFinder class --------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,13 +11,22 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/IR/TypeFinder.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Constant.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/TypeFinder.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include <utility>
+
 using namespace llvm;
 
 void TypeFinder::run(const Module &M, bool onlyNamed) {
diff --git a/lib/IR/Value.cpp b/lib/IR/Value.cpp
index d86d7fc11643706e4f7af5e174d83b2d0330e5d8..b07c57685a26682abac72e52de0f7ed657171526 100644
--- a/lib/IR/Value.cpp
+++ b/lib/IR/Value.cpp
@@ -437,17 +437,17 @@ enum PointerStripKind {
 };
 
 template <PointerStripKind StripKind>
-static Value *stripPointerCastsAndOffsets(Value *V) {
+static const Value *stripPointerCastsAndOffsets(const Value *V) {
   if (!V->getType()->isPointerTy())
     return V;
 
   // Even though we don't look through PHI nodes, we could be called on an
   // instruction in an unreachable block, which may be on a cycle.
-  SmallPtrSet<Value *, 4> Visited;
+  SmallPtrSet<const Value *, 4> Visited;
 
   Visited.insert(V);
   do {
-    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+    if (auto *GEP = dyn_cast<GEPOperator>(V)) {
       switch (StripKind) {
       case PSK_ZeroIndicesAndAliases:
       case PSK_ZeroIndices:
@@ -467,13 +467,13 @@ static Value *stripPointerCastsAndOffsets(Value *V) {
     } else if (Operator::getOpcode(V) == Instruction::BitCast ||
                Operator::getOpcode(V) == Instruction::AddrSpaceCast) {
       V = cast<Operator>(V)->getOperand(0);
-    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
+    } else if (auto *GA = dyn_cast<GlobalAlias>(V)) {
       if (StripKind == PSK_ZeroIndices || GA->isInterposable())
         return V;
       V = GA->getAliasee();
     } else {
-      if (auto CS = CallSite(V))
-        if (Value *RV = CS.getReturnedArgOperand()) {
+      if (auto CS = ImmutableCallSite(V))
+        if (const Value *RV = CS.getReturnedArgOperand()) {
           V = RV;
           continue;
         }
@@ -487,20 +487,21 @@ static Value *stripPointerCastsAndOffsets(Value *V) {
 }
 } // end anonymous namespace
 
-Value *Value::stripPointerCasts() {
+const Value *Value::stripPointerCasts() const {
   return stripPointerCastsAndOffsets<PSK_ZeroIndicesAndAliases>(this);
 }
 
-Value *Value::stripPointerCastsNoFollowAliases() {
+const Value *Value::stripPointerCastsNoFollowAliases() const {
   return stripPointerCastsAndOffsets<PSK_ZeroIndices>(this);
 }
 
-Value *Value::stripInBoundsConstantOffsets() {
+const Value *Value::stripInBoundsConstantOffsets() const {
   return stripPointerCastsAndOffsets<PSK_InBoundsConstantIndices>(this);
 }
 
-Value *Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
-                                                        APInt &Offset) {
+const Value *
+Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
+                                                 APInt &Offset) const {
   if (!getType()->isPointerTy())
     return this;
 
@@ -510,11 +511,11 @@ Value *Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
 
   // Even though we don't look through PHI nodes, we could be called on an
   // instruction in an unreachable block, which may be on a cycle.
-  SmallPtrSet<Value *, 4> Visited;
+  SmallPtrSet<const Value *, 4> Visited;
   Visited.insert(this);
-  Value *V = this;
+  const Value *V = this;
   do {
-    if (GEPOperator *GEP = dyn_cast<GEPOperator>(V)) {
+    if (auto *GEP = dyn_cast<GEPOperator>(V)) {
       if (!GEP->isInBounds())
         return V;
       APInt GEPOffset(Offset);
@@ -524,11 +525,11 @@ Value *Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
       V = GEP->getPointerOperand();
     } else if (Operator::getOpcode(V) == Instruction::BitCast) {
       V = cast<Operator>(V)->getOperand(0);
-    } else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
+    } else if (auto *GA = dyn_cast<GlobalAlias>(V)) {
       V = GA->getAliasee();
     } else {
-      if (auto CS = CallSite(V))
-        if (Value *RV = CS.getReturnedArgOperand()) {
+      if (auto CS = ImmutableCallSite(V))
+        if (const Value *RV = CS.getReturnedArgOperand()) {
           V = RV;
           continue;
         }
@@ -541,7 +542,7 @@ Value *Value::stripAndAccumulateInBoundsConstantOffsets(const DataLayout &DL,
   return V;
 }
 
-Value *Value::stripInBoundsOffsets() {
+const Value *Value::stripInBoundsOffsets() const {
   return stripPointerCastsAndOffsets<PSK_InBounds>(this);
 }
 
@@ -633,7 +634,7 @@ unsigned Value::getPointerAlignment(const DataLayout &DL) const {
         Align = DL.getPrefTypeAlignment(AllocatedType);
     }
   } else if (auto CS = ImmutableCallSite(this))
-    Align = CS.getAttributes().getParamAlignment(AttributeSet::ReturnIndex);
+    Align = CS.getAttributes().getParamAlignment(AttributeList::ReturnIndex);
   else if (const LoadInst *LI = dyn_cast<LoadInst>(this))
     if (MDNode *MD = LI->getMetadata(LLVMContext::MD_align)) {
       ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(0));
@@ -643,9 +644,9 @@ unsigned Value::getPointerAlignment(const DataLayout &DL) const {
   return Align;
 }
 
-Value *Value::DoPHITranslation(const BasicBlock *CurBB,
-                               const BasicBlock *PredBB) {
-  PHINode *PN = dyn_cast<PHINode>(this);
+const Value *Value::DoPHITranslation(const BasicBlock *CurBB,
+                                     const BasicBlock *PredBB) const {
+  auto *PN = dyn_cast<PHINode>(this);
   if (PN && PN->getParent() == CurBB)
     return PN->getIncomingValueForBlock(PredBB);
   return this;
diff --git a/lib/IR/ValueSymbolTable.cpp b/lib/IR/ValueSymbolTable.cpp
index 1eb2498d8acdaa6d68afbc552a6bd0b55aee53dd..0c3946c8661ebe488af64a611ee5e157ed9f93e8 100644
--- a/lib/IR/ValueSymbolTable.cpp
+++ b/lib/IR/ValueSymbolTable.cpp
@@ -1,4 +1,4 @@
-//===-- ValueSymbolTable.cpp - Implement the ValueSymbolTable class -------===//
+//===- ValueSymbolTable.cpp - Implement the ValueSymbolTable class --------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,10 +11,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueSymbolTable.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp
index 41593af000ca85340f4dc2cf9eaef9a78fb6f0e6..893890446b7a96148ea357007b78b8136a8c6160 100644
--- a/lib/IR/Verifier.cpp
+++ b/lib/IR/Verifier.cpp
@@ -277,6 +277,9 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
   /// already.
   bool SawFrameEscape;
 
+  /// Whether the current function has a DISubprogram attached to it.
+  bool HasDebugInfo = false;
+
   /// Stores the count of how many objects were passed to llvm.localescape for a
   /// given function and the largest index passed to llvm.localrecover.
   DenseMap<Function *, std::pair<unsigned, unsigned>> FrameEscapeInfo;
@@ -297,6 +300,9 @@ class Verifier : public InstVisitor<Verifier>, VerifierSupport {
   // constant expressions, we can arrive at a particular user many times.
   SmallPtrSet<const Value *, 32> GlobalValueVisited;
 
+  // Keeps track of duplicate function argument debug info.
+  SmallVector<const DILocalVariable *, 16> DebugFnArgs;
+
   TBAAVerifier TBAAVerifyHelper;
 
   void checkAtomicMemAccessSize(Type *Ty, const Instruction *I);
@@ -342,6 +348,7 @@ public:
     visit(const_cast<Function &>(F));
     verifySiblingFuncletUnwinds();
     InstsInThisBlock.clear();
+    DebugFnArgs.clear();
     LandingPadResultTy = nullptr;
     SawFrameEscape = false;
     SiblingFuncletInfo.clear();
@@ -482,12 +489,12 @@ private:
   void verifyMustTailCall(CallInst &CI);
   bool performTypeCheck(Intrinsic::ID ID, Function *F, Type *Ty, int VT,
                         unsigned ArgNo, std::string &Suffix);
-  bool verifyAttributeCount(AttributeSet Attrs, unsigned Params);
-  void verifyAttributeTypes(AttributeSet Attrs, unsigned Idx, bool isFunction,
+  bool verifyAttributeCount(AttributeList Attrs, unsigned Params);
+  void verifyAttributeTypes(AttributeList Attrs, unsigned Idx, bool isFunction,
                             const Value *V);
-  void verifyParameterAttrs(AttributeSet Attrs, unsigned Idx, Type *Ty,
+  void verifyParameterAttrs(AttributeList Attrs, unsigned Idx, Type *Ty,
                             bool isReturnValue, const Value *V);
-  void verifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
+  void verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
                            const Value *V);
   void verifyFunctionMetadata(ArrayRef<std::pair<unsigned, MDNode *>> MDs);
 
@@ -498,6 +505,7 @@ private:
   void verifySiblingFuncletUnwinds();
 
   void verifyFragmentExpression(const DbgInfoIntrinsic &I);
+  void verifyFnArgs(const DbgInfoIntrinsic &I);
 
   /// Module-level debug info verification...
   void verifyCompileUnits();
@@ -653,7 +661,8 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
     if (auto *GVE = dyn_cast<DIGlobalVariableExpression>(MD))
       visitDIGlobalVariableExpression(*GVE);
     else
-      AssertDI(false, "!dbg attachment of global variable must be a DIGlobalVariableExpression");
+      AssertDI(false, "!dbg attachment of global variable must be a "
+                      "DIGlobalVariableExpression");
   }
 
   if (!GV.hasInitializer()) {
@@ -823,28 +832,6 @@ static bool isType(const Metadata *MD) { return !MD || isa<DIType>(MD); }
 static bool isScope(const Metadata *MD) { return !MD || isa<DIScope>(MD); }
 static bool isDINode(const Metadata *MD) { return !MD || isa<DINode>(MD); }
 
-template <class Ty>
-static bool isValidMetadataArrayImpl(const MDTuple &N, bool AllowNull) {
-  for (Metadata *MD : N.operands()) {
-    if (MD) {
-      if (!isa<Ty>(MD))
-        return false;
-    } else {
-      if (!AllowNull)
-        return false;
-    }
-  }
-  return true;
-}
-
-template <class Ty> static bool isValidMetadataArray(const MDTuple &N) {
-  return isValidMetadataArrayImpl<Ty>(N, /* AllowNull */ false);
-}
-
-template <class Ty> static bool isValidMetadataNullArray(const MDTuple &N) {
-  return isValidMetadataArrayImpl<Ty>(N, /* AllowNull */ true);
-}
-
 void Verifier::visitDILocation(const DILocation &N) {
   AssertDI(N.getRawScope() && isa<DILocalScope>(N.getRawScope()),
            "location requires a valid scope", &N, N.getRawScope());
@@ -901,6 +888,13 @@ void Verifier::visitDIDerivedType(const DIDerivedType &N) {
   AssertDI(isScope(N.getRawScope()), "invalid scope", &N, N.getRawScope());
   AssertDI(isType(N.getRawBaseType()), "invalid base type", &N,
            N.getRawBaseType());
+
+  if (N.getDWARFAddressSpace()) {
+    AssertDI(N.getTag() == dwarf::DW_TAG_pointer_type ||
+                 N.getTag() == dwarf::DW_TAG_reference_type,
+             "DWARF address space only applies to pointer or reference types",
+             &N);
+  }
 }
 
 static bool hasConflictingReferenceFlags(unsigned Flags) {
@@ -1025,6 +1019,8 @@ void Verifier::visitDISubprogram(const DISubprogram &N) {
   AssertDI(isScope(N.getRawScope()), "invalid scope", &N, N.getRawScope());
   if (auto *F = N.getRawFile())
     AssertDI(isa<DIFile>(F), "invalid file", &N, F);
+  else
+    AssertDI(N.getLine() == 0, "line specified with no file", &N, N.getLine());
   if (auto *T = N.getRawType())
     AssertDI(isa<DISubroutineType>(T), "invalid subroutine type", &N, T);
   AssertDI(isType(N.getRawContainingType()), "invalid containing type", &N,
@@ -1313,7 +1309,7 @@ Verifier::visitModuleFlag(const MDNode *Op,
   }
 }
 
-void Verifier::verifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
+void Verifier::verifyAttributeTypes(AttributeList Attrs, unsigned Idx,
                                     bool isFunction, const Value *V) {
   unsigned Slot = ~0U;
   for (unsigned I = 0, E = Attrs.getNumSlots(); I != E; ++I)
@@ -1324,8 +1320,8 @@ void Verifier::verifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
 
   assert(Slot != ~0U && "Attribute set inconsistency!");
 
-  for (AttributeSet::iterator I = Attrs.begin(Slot), E = Attrs.end(Slot);
-         I != E; ++I) {
+  for (AttributeList::iterator I = Attrs.begin(Slot), E = Attrs.end(Slot);
+       I != E; ++I) {
     if (I->isStringAttribute())
       continue;
 
@@ -1385,7 +1381,7 @@ void Verifier::verifyAttributeTypes(AttributeSet Attrs, unsigned Idx,
 
 // VerifyParameterAttrs - Check the given attributes for an argument or return
 // value of the specified type.  The value V is printed in error messages.
-void Verifier::verifyParameterAttrs(AttributeSet Attrs, unsigned Idx, Type *Ty,
+void Verifier::verifyParameterAttrs(AttributeList Attrs, unsigned Idx, Type *Ty,
                                     bool isReturnValue, const Value *V) {
   if (!Attrs.hasAttributes(Idx))
     return;
@@ -1463,7 +1459,7 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, unsigned Idx, Type *Ty,
   Assert(
       !AttrBuilder(Attrs, Idx).overlaps(AttributeFuncs::typeIncompatible(Ty)),
       "Wrong types for attribute: " +
-          AttributeSet::get(Context, Idx, AttributeFuncs::typeIncompatible(Ty))
+          AttributeList::get(Context, Idx, AttributeFuncs::typeIncompatible(Ty))
               .getAsString(Idx),
       V);
 
@@ -1493,7 +1489,7 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, unsigned Idx, Type *Ty,
 
 // Check parameter attributes against a function type.
 // The value V is printed in error messages.
-void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
+void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
                                    const Value *V) {
   if (Attrs.isEmpty())
     return;
@@ -1559,67 +1555,70 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeSet Attrs,
     }
   }
 
-  if (!Attrs.hasAttributes(AttributeSet::FunctionIndex))
+  if (!Attrs.hasAttributes(AttributeList::FunctionIndex))
     return;
 
-  verifyAttributeTypes(Attrs, AttributeSet::FunctionIndex, true, V);
+  verifyAttributeTypes(Attrs, AttributeList::FunctionIndex, true, V);
 
   Assert(
-      !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone) &&
-        Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadOnly)),
+      !(Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::ReadNone) &&
+        Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::ReadOnly)),
       "Attributes 'readnone and readonly' are incompatible!", V);
 
   Assert(
-      !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone) &&
-        Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::WriteOnly)),
+      !(Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::ReadNone) &&
+        Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::WriteOnly)),
       "Attributes 'readnone and writeonly' are incompatible!", V);
 
   Assert(
-      !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadOnly) &&
-        Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::WriteOnly)),
+      !(Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::ReadOnly) &&
+        Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::WriteOnly)),
       "Attributes 'readonly and writeonly' are incompatible!", V);
 
   Assert(
-      !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone) &&
-        Attrs.hasAttribute(AttributeSet::FunctionIndex, 
+      !(Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::ReadNone) &&
+        Attrs.hasAttribute(AttributeList::FunctionIndex,
                            Attribute::InaccessibleMemOrArgMemOnly)),
-      "Attributes 'readnone and inaccessiblemem_or_argmemonly' are incompatible!", V);
+      "Attributes 'readnone and inaccessiblemem_or_argmemonly' are "
+      "incompatible!",
+      V);
 
   Assert(
-      !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone) &&
-        Attrs.hasAttribute(AttributeSet::FunctionIndex, 
+      !(Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::ReadNone) &&
+        Attrs.hasAttribute(AttributeList::FunctionIndex,
                            Attribute::InaccessibleMemOnly)),
       "Attributes 'readnone and inaccessiblememonly' are incompatible!", V);
 
   Assert(
-      !(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::NoInline) &&
-        Attrs.hasAttribute(AttributeSet::FunctionIndex,
+      !(Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::NoInline) &&
+        Attrs.hasAttribute(AttributeList::FunctionIndex,
                            Attribute::AlwaysInline)),
       "Attributes 'noinline and alwaysinline' are incompatible!", V);
 
-  if (Attrs.hasAttribute(AttributeSet::FunctionIndex, 
+  if (Attrs.hasAttribute(AttributeList::FunctionIndex,
                          Attribute::OptimizeNone)) {
-    Assert(Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::NoInline),
-           "Attribute 'optnone' requires 'noinline'!", V);
+    Assert(
+        Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::NoInline),
+        "Attribute 'optnone' requires 'noinline'!", V);
 
-    Assert(!Attrs.hasAttribute(AttributeSet::FunctionIndex,
+    Assert(!Attrs.hasAttribute(AttributeList::FunctionIndex,
                                Attribute::OptimizeForSize),
            "Attributes 'optsize and optnone' are incompatible!", V);
 
-    Assert(!Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize),
-           "Attributes 'minsize and optnone' are incompatible!", V);
+    Assert(
+        !Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize),
+        "Attributes 'minsize and optnone' are incompatible!", V);
   }
 
-  if (Attrs.hasAttribute(AttributeSet::FunctionIndex,
-                         Attribute::JumpTable)) {
+  if (Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::JumpTable)) {
     const GlobalValue *GV = cast<GlobalValue>(V);
     Assert(GV->hasGlobalUnnamedAddr(),
            "Attribute 'jumptable' requires 'unnamed_addr'", V);
   }
 
-  if (Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::AllocSize)) {
+  if (Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::AllocSize)) {
     std::pair<unsigned, Optional<unsigned>> Args =
-        Attrs.getAllocSizeArgs(AttributeSet::FunctionIndex);
+        Attrs.getAllocSizeArgs(AttributeList::FunctionIndex);
 
     auto CheckParam = [&](StringRef Name, unsigned ParamNo) {
       if (ParamNo >= FT->getNumParams()) {
@@ -1650,8 +1649,8 @@ void Verifier::verifyFunctionMetadata(
   for (const auto &Pair : MDs) {
     if (Pair.first == LLVMContext::MD_prof) {
       MDNode *MD = Pair.second;
-      Assert(MD->getNumOperands() == 2,
-             "!prof annotations should have exactly 2 operands", MD);
+      Assert(MD->getNumOperands() >= 2,
+             "!prof annotations should have no less than 2 operands", MD);
 
       // Check first operand.
       Assert(MD->getOperand(0) != nullptr, "first operand should not be null",
@@ -1726,15 +1725,15 @@ void Verifier::visitConstantExpr(const ConstantExpr *CE) {
   }
 }
 
-bool Verifier::verifyAttributeCount(AttributeSet Attrs, unsigned Params) {
+bool Verifier::verifyAttributeCount(AttributeList Attrs, unsigned Params) {
   if (Attrs.getNumSlots() == 0)
     return true;
 
   unsigned LastSlot = Attrs.getNumSlots() - 1;
   unsigned LastIndex = Attrs.getSlotIndex(LastSlot);
-  if (LastIndex <= Params
-      || (LastIndex == AttributeSet::FunctionIndex
-          && (LastSlot == 0 || Attrs.getSlotIndex(LastSlot - 1) <= Params)))
+  if (LastIndex <= Params ||
+      (LastIndex == AttributeList::FunctionIndex &&
+       (LastSlot == 0 || Attrs.getSlotIndex(LastSlot - 1) <= Params)))
     return true;
 
   return false;
@@ -1964,7 +1963,7 @@ void Verifier::visitFunction(const Function &F) {
   Assert(!F.hasStructRetAttr() || F.getReturnType()->isVoidTy(),
          "Invalid struct return type!", &F);
 
-  AttributeSet Attrs = F.getAttributes();
+  AttributeList Attrs = F.getAttributes();
 
   Assert(verifyAttributeCount(Attrs, FT->getNumParams()),
          "Attribute after last parameter!", &F);
@@ -1975,7 +1974,7 @@ void Verifier::visitFunction(const Function &F) {
   // On function declarations/definitions, we do not support the builtin
   // attribute. We do not check this in VerifyFunctionAttrs since that is
   // checking for Attributes that can/can not ever be on functions.
-  Assert(!Attrs.hasAttribute(AttributeSet::FunctionIndex, Attribute::Builtin),
+  Assert(!Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::Builtin),
          "Attribute 'builtin' can only be applied to a callsite.", &F);
 
   // Check that this function meets the restrictions on this calling convention.
@@ -1985,6 +1984,18 @@ void Verifier::visitFunction(const Function &F) {
   default:
   case CallingConv::C:
     break;
+  case CallingConv::AMDGPU_KERNEL:
+  case CallingConv::SPIR_KERNEL:
+    Assert(F.getReturnType()->isVoidTy(),
+           "Calling convention requires void return type", &F);
+    LLVM_FALLTHROUGH;
+  case CallingConv::AMDGPU_VS:
+  case CallingConv::AMDGPU_GS:
+  case CallingConv::AMDGPU_PS:
+  case CallingConv::AMDGPU_CS:
+    Assert(!F.hasStructRetAttr(),
+           "Calling convention does not allow sret", &F);
+    LLVM_FALLTHROUGH;
   case CallingConv::Fast:
   case CallingConv::Cold:
   case CallingConv::Intel_OCL_BI:
@@ -2114,11 +2125,10 @@ void Verifier::visitFunction(const Function &F) {
          "Function is marked as dllimport, but not external.", &F);
 
   auto *N = F.getSubprogram();
-  if (!N)
+  HasDebugInfo = (N != nullptr);
+  if (!HasDebugInfo)
     return;
 
-  visitDISubprogram(*N);
-
   // Check that all !dbg attachments lead to back to N (or, at least, another
   // subprogram that describes the same function).
   //
@@ -2602,7 +2612,7 @@ void Verifier::verifyCallSite(CallSite CS) {
            "Call parameter type does not match function signature!",
            CS.getArgument(i), FTy->getParamType(i), I);
 
-  AttributeSet Attrs = CS.getAttributes();
+  AttributeList Attrs = CS.getAttributes();
 
   Assert(verifyAttributeCount(Attrs, CS.arg_size()),
          "Attribute after last parameter!", I);
@@ -2727,9 +2737,9 @@ void Verifier::verifyCallSite(CallSite CS) {
   // do so causes assertion failures when the inliner sets up inline scope info.
   if (I->getFunction()->getSubprogram() && CS.getCalledFunction() &&
       CS.getCalledFunction()->getSubprogram())
-    Assert(I->getDebugLoc(), "inlinable function call in a function with debug "
-                             "info must have a !dbg location",
-           I);
+    AssertDI(I->getDebugLoc(), "inlinable function call in a function with "
+                               "debug info must have a !dbg location",
+             I);
 
   visitInstruction(*I);
 }
@@ -2746,7 +2756,7 @@ static bool isTypeCongruent(Type *L, Type *R) {
   return PL->getAddressSpace() == PR->getAddressSpace();
 }
 
-static AttrBuilder getParameterABIAttributes(int I, AttributeSet Attrs) {
+static AttrBuilder getParameterABIAttributes(int I, AttributeList Attrs) {
   static const Attribute::AttrKind ABIAttrs[] = {
       Attribute::StructRet, Attribute::ByVal, Attribute::InAlloca,
       Attribute::InReg, Attribute::Returned, Attribute::SwiftSelf,
@@ -2788,8 +2798,8 @@ void Verifier::verifyMustTailCall(CallInst &CI) {
 
   // - All ABI-impacting function attributes, such as sret, byval, inreg,
   //   returned, and inalloca, must match.
-  AttributeSet CallerAttrs = F->getAttributes();
-  AttributeSet CalleeAttrs = CI.getAttributes();
+  AttributeList CallerAttrs = F->getAttributes();
+  AttributeList CalleeAttrs = CI.getAttributes();
   for (int I = 0, E = CallerTy->getNumParams(); I != E; ++I) {
     AttrBuilder CallerABIAttrs = getParameterABIAttributes(I, CallerAttrs);
     AttrBuilder CalleeABIAttrs = getParameterABIAttributes(I, CalleeAttrs);
@@ -3149,8 +3159,9 @@ void Verifier::verifySwiftErrorValue(const Value *SwiftErrorVal) {
 void Verifier::visitAllocaInst(AllocaInst &AI) {
   SmallPtrSet<Type*, 4> Visited;
   PointerType *PTy = AI.getType();
-  Assert(PTy->getAddressSpace() == 0,
-         "Allocation instruction pointer not in the generic address space!",
+  // TODO: Relax this restriction?
+  Assert(PTy->getAddressSpace() == DL.getAllocaAddrSpace(),
+         "Allocation instruction pointer not in the stack address space!",
          &AI);
   Assert(AI.getAllocatedType()->isSized(&Visited),
          "Cannot allocate unsized type", &AI);
@@ -4348,6 +4359,8 @@ void Verifier::visitDbgIntrinsic(StringRef Kind, DbgIntrinsicTy &DII) {
                                " variable and !dbg attachment",
            &DII, BB, F, Var, Var->getScope()->getSubprogram(), Loc,
            Loc->getScope()->getSubprogram());
+
+  verifyFnArgs(DII);
 }
 
 static uint64_t getVariableSize(const DILocalVariable &V) {
@@ -4416,15 +4429,49 @@ void Verifier::verifyFragmentExpression(const DbgInfoIntrinsic &I) {
   AssertDI(FragSize != VarSize, "fragment covers entire variable", &I, V, E);
 }
 
+void Verifier::verifyFnArgs(const DbgInfoIntrinsic &I) {
+  // This function does not take the scope of noninlined function arguments into
+  // account. Don't run it if current function is nodebug, because it may
+  // contain inlined debug intrinsics.
+  if (!HasDebugInfo)
+    return;
+
+  DILocalVariable *Var;
+  if (auto *DV = dyn_cast<DbgValueInst>(&I)) {
+    // For performance reasons only check non-inlined ones.
+    if (DV->getDebugLoc()->getInlinedAt())
+      return;
+    Var = DV->getVariable();
+  } else {
+    auto *DD = cast<DbgDeclareInst>(&I);
+    if (DD->getDebugLoc()->getInlinedAt())
+      return;
+    Var = DD->getVariable();
+  }
+  AssertDI(Var, "dbg intrinsic without variable");
+
+  unsigned ArgNo = Var->getArg();
+  if (!ArgNo)
+    return;
+
+  // Verify there are no duplicate function argument debug info entries.
+  // These will cause hard-to-debug assertions in the DWARF backend.
+  if (DebugFnArgs.size() < ArgNo)
+    DebugFnArgs.resize(ArgNo, nullptr);
+
+  auto *Prev = DebugFnArgs[ArgNo - 1];
+  DebugFnArgs[ArgNo - 1] = Var;
+  AssertDI(!Prev || (Prev == Var), "conflicting debug info for argument", &I,
+           Prev, Var);
+}
+
 void Verifier::verifyCompileUnits() {
   auto *CUs = M.getNamedMetadata("llvm.dbg.cu");
   SmallPtrSet<const Metadata *, 2> Listed;
   if (CUs)
     Listed.insert(CUs->op_begin(), CUs->op_end());
-  AssertDI(
-      all_of(CUVisited,
-             [&Listed](const Metadata *CU) { return Listed.count(CU); }),
-      "All DICompileUnits must be listed in llvm.dbg.cu");
+  for (auto *CU : CUVisited)
+    AssertDI(Listed.count(CU), "DICompileUnit not listed in llvm.dbg.cu", CU);
   CUVisited.clear();
 }
 
diff --git a/lib/LTO/Caching.cpp b/lib/LTO/Caching.cpp
index f635369df8d751039224ae2960b053e399ddff2a..e32e46c4c3c8d9070a407d95edef6640e973e090 100644
--- a/lib/LTO/Caching.cpp
+++ b/lib/LTO/Caching.cpp
@@ -13,6 +13,7 @@
 
 #include "llvm/LTO/Caching.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Errc.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
@@ -21,71 +22,71 @@
 using namespace llvm;
 using namespace llvm::lto;
 
-static void commitEntry(StringRef TempFilename, StringRef EntryPath) {
-  // Rename to final destination (hopefully race condition won't matter here)
-  auto EC = sys::fs::rename(TempFilename, EntryPath);
-  if (EC) {
-    // Renaming failed, probably not the same filesystem, copy and delete.
-    // FIXME: Avoid needing to do this by creating the temporary file in the
-    // cache directory.
-    {
-      auto ReloadedBufferOrErr = MemoryBuffer::getFile(TempFilename);
-      if (auto EC = ReloadedBufferOrErr.getError())
-        report_fatal_error(Twine("Failed to open temp file '") + TempFilename +
-                           "': " + EC.message() + "\n");
+Expected<NativeObjectCache> lto::localCache(StringRef CacheDirectoryPath,
+                                            AddBufferFn AddBuffer) {
+  if (std::error_code EC = sys::fs::create_directories(CacheDirectoryPath))
+    return errorCodeToError(EC);
 
-      raw_fd_ostream OS(EntryPath, EC, sys::fs::F_None);
-      if (EC)
-        report_fatal_error(Twine("Failed to open ") + EntryPath +
-                           " to save cached entry\n");
-      // I'm not sure what are the guarantee if two processes are doing this
-      // at the same time.
-      OS << (*ReloadedBufferOrErr)->getBuffer();
-    }
-    sys::fs::remove(TempFilename);
-  }
-}
-
-NativeObjectCache lto::localCache(StringRef CacheDirectoryPath,
-                                  AddFileFn AddFile) {
   return [=](unsigned Task, StringRef Key) -> AddStreamFn {
-    // First, see if we have a cache hit.
+    // This choice of file name allows the cache to be pruned (see pruneCache()
+    // in include/llvm/Support/CachePruning.h).
     SmallString<64> EntryPath;
-    sys::path::append(EntryPath, CacheDirectoryPath, Key);
-    if (sys::fs::exists(EntryPath)) {
-      AddFile(Task, EntryPath);
+    sys::path::append(EntryPath, CacheDirectoryPath, "llvmcache-" + Key);
+    // First, see if we have a cache hit.
+    ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr =
+        MemoryBuffer::getFile(EntryPath);
+    if (MBOrErr) {
+      AddBuffer(Task, std::move(*MBOrErr));
       return AddStreamFn();
     }
 
+    if (MBOrErr.getError() != errc::no_such_file_or_directory)
+      report_fatal_error(Twine("Failed to open cache file ") + EntryPath +
+                         ": " + MBOrErr.getError().message() + "\n");
+
     // This native object stream is responsible for commiting the resulting
-    // file to the cache and calling AddFile to add it to the link.
+    // file to the cache and calling AddBuffer to add it to the link.
     struct CacheStream : NativeObjectStream {
-      AddFileFn AddFile;
+      AddBufferFn AddBuffer;
       std::string TempFilename;
       std::string EntryPath;
       unsigned Task;
 
-      CacheStream(std::unique_ptr<raw_pwrite_stream> OS, AddFileFn AddFile,
+      CacheStream(std::unique_ptr<raw_pwrite_stream> OS, AddBufferFn AddBuffer,
                   std::string TempFilename, std::string EntryPath,
                   unsigned Task)
-          : NativeObjectStream(std::move(OS)), AddFile(std::move(AddFile)),
+          : NativeObjectStream(std::move(OS)), AddBuffer(std::move(AddBuffer)),
             TempFilename(std::move(TempFilename)),
             EntryPath(std::move(EntryPath)), Task(Task) {}
 
       ~CacheStream() {
+        // FIXME: This code could race with the cache pruner, but it is unlikely
+        // that the cache pruner will choose to remove a newly created file.
+
         // Make sure the file is closed before committing it.
         OS.reset();
-        commitEntry(TempFilename, EntryPath);
-        AddFile(Task, EntryPath);
+        // This is atomic on POSIX systems.
+        if (auto EC = sys::fs::rename(TempFilename, EntryPath))
+          report_fatal_error(Twine("Failed to rename temporary file ") +
+                             TempFilename + ": " + EC.message() + "\n");
+
+        ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr =
+            MemoryBuffer::getFile(EntryPath);
+        if (!MBOrErr)
+          report_fatal_error(Twine("Failed to open cache file ") + EntryPath +
+                             ": " + MBOrErr.getError().message() + "\n");
+        AddBuffer(Task, std::move(*MBOrErr));
       }
     };
 
     return [=](size_t Task) -> std::unique_ptr<NativeObjectStream> {
       // Write to a temporary to avoid race condition
       int TempFD;
-      SmallString<64> TempFilename;
+      SmallString<64> TempFilenameModel, TempFilename;
+      sys::path::append(TempFilenameModel, CacheDirectoryPath, "Thin-%%%%%%.tmp.o");
       std::error_code EC =
-          sys::fs::createTemporaryFile("Thin", "tmp.o", TempFD, TempFilename);
+          sys::fs::createUniqueFile(TempFilenameModel, TempFD, TempFilename,
+                                    sys::fs::owner_read | sys::fs::owner_write);
       if (EC) {
         errs() << "Error: " << EC.message() << "\n";
         report_fatal_error("ThinLTO: Can't get a temporary file");
@@ -94,7 +95,7 @@ NativeObjectCache lto::localCache(StringRef CacheDirectoryPath,
       // This CacheStream will move the temporary file into the cache when done.
       return llvm::make_unique<CacheStream>(
           llvm::make_unique<raw_fd_ostream>(TempFD, /* ShouldClose */ true),
-          AddFile, TempFilename.str(), EntryPath.str(), Task);
+          AddBuffer, TempFilename.str(), EntryPath.str(), Task);
     };
   };
 }
diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp
index e602c64337c33bc618e9f3b6fdb1e64642eee611..6ca7e34527c7d91f2352d40b96a6544d482fbe1e 100644
--- a/lib/LTO/LTO.cpp
+++ b/lib/LTO/LTO.cpp
@@ -20,9 +20,13 @@
 #include "llvm/IR/AutoUpgrade.h"
 #include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/LTO/LTOBackend.h"
 #include "llvm/Linker/IRMover.h"
+#include "llvm/Object/IRObjectFile.h"
 #include "llvm/Object/ModuleSummaryIndexObjectFile.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
@@ -46,6 +50,12 @@ using namespace object;
 
 #define DEBUG_TYPE "lto"
 
+// The values are (type identifier, summary) pairs.
+typedef DenseMap<
+    GlobalValue::GUID,
+    TinyPtrVector<const std::pair<const std::string, TypeIdSummary> *>>
+    TypeIdSummariesByGuidTy;
+
 // Returns a unique hash for the Module considering the current list of
 // export/import and other global analysis results.
 // The hash is produced in \p Key.
@@ -54,7 +64,8 @@ static void computeCacheKey(
     StringRef ModuleID, const FunctionImporter::ImportMapTy &ImportList,
     const FunctionImporter::ExportSetTy &ExportList,
     const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
-    const GVSummaryMapTy &DefinedGlobals) {
+    const GVSummaryMapTy &DefinedGlobals,
+    const TypeIdSummariesByGuidTy &TypeIdSummariesByGuid) {
   // Compute the unique hash for this entry.
   // This is based on the current compiler version, the module itself, the
   // export list, the hash for every single module in the import list, the
@@ -80,6 +91,18 @@ static void computeCacheKey(
     Data[3] = I >> 24;
     Hasher.update(ArrayRef<uint8_t>{Data, 4});
   };
+  auto AddUint64 = [&](uint64_t I) {
+    uint8_t Data[8];
+    Data[0] = I;
+    Data[1] = I >> 8;
+    Data[2] = I >> 16;
+    Data[3] = I >> 24;
+    Data[4] = I >> 32;
+    Data[5] = I >> 40;
+    Data[6] = I >> 48;
+    Data[7] = I >> 56;
+    Hasher.update(ArrayRef<uint8_t>{Data, 8});
+  };
   AddString(Conf.CPU);
   // FIXME: Hash more of Options. For now all clients initialize Options from
   // command-line flags (which is unsupported in production), but may set
@@ -94,6 +117,7 @@ static void computeCacheKey(
   AddUnsigned(Conf.RelocModel);
   AddUnsigned(Conf.CodeModel);
   AddUnsigned(Conf.CGOptLevel);
+  AddUnsigned(Conf.CGFileType);
   AddUnsigned(Conf.OptLevel);
   AddString(Conf.OptPipeline);
   AddString(Conf.AAPipeline);
@@ -107,10 +131,16 @@ static void computeCacheKey(
     // The export list can impact the internalization, be conservative here
     Hasher.update(ArrayRef<uint8_t>((uint8_t *)&F, sizeof(F)));
 
-  // Include the hash for every module we import functions from
+  // Include the hash for every module we import functions from. The set of
+  // imported symbols for each module may affect code generation and is
+  // sensitive to link order, so include that as well.
   for (auto &Entry : ImportList) {
     auto ModHash = Index.getModuleHash(Entry.first());
     Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
+
+    AddUint64(Entry.second.size());
+    for (auto &Fn : Entry.second)
+      AddUint64(Fn.first);
   }
 
   // Include the hash for the resolved ODR.
@@ -121,12 +151,68 @@ static void computeCacheKey(
                                     sizeof(GlobalValue::LinkageTypes)));
   }
 
+  std::set<GlobalValue::GUID> UsedTypeIds;
+
+  auto AddUsedTypeIds = [&](GlobalValueSummary *GS) {
+    auto *FS = dyn_cast_or_null<FunctionSummary>(GS);
+    if (!FS)
+      return;
+    for (auto &TT : FS->type_tests())
+      UsedTypeIds.insert(TT);
+    for (auto &TT : FS->type_test_assume_vcalls())
+      UsedTypeIds.insert(TT.GUID);
+    for (auto &TT : FS->type_checked_load_vcalls())
+      UsedTypeIds.insert(TT.GUID);
+    for (auto &TT : FS->type_test_assume_const_vcalls())
+      UsedTypeIds.insert(TT.VFunc.GUID);
+    for (auto &TT : FS->type_checked_load_const_vcalls())
+      UsedTypeIds.insert(TT.VFunc.GUID);
+  };
+
   // Include the hash for the linkage type to reflect internalization and weak
-  // resolution.
+  // resolution, and collect any used type identifier resolutions.
   for (auto &GS : DefinedGlobals) {
     GlobalValue::LinkageTypes Linkage = GS.second->linkage();
     Hasher.update(
         ArrayRef<uint8_t>((const uint8_t *)&Linkage, sizeof(Linkage)));
+    AddUsedTypeIds(GS.second);
+  }
+
+  // Imported functions may introduce new uses of type identifier resolutions,
+  // so we need to collect their used resolutions as well.
+  for (auto &ImpM : ImportList)
+    for (auto &ImpF : ImpM.second)
+      AddUsedTypeIds(Index.findSummaryInModule(ImpF.first, ImpM.first()));
+
+  auto AddTypeIdSummary = [&](StringRef TId, const TypeIdSummary &S) {
+    AddString(TId);
+
+    AddUnsigned(S.TTRes.TheKind);
+    AddUnsigned(S.TTRes.SizeM1BitWidth);
+
+    AddUint64(S.WPDRes.size());
+    for (auto &WPD : S.WPDRes) {
+      AddUnsigned(WPD.first);
+      AddUnsigned(WPD.second.TheKind);
+      AddString(WPD.second.SingleImplName);
+
+      AddUint64(WPD.second.ResByArg.size());
+      for (auto &ByArg : WPD.second.ResByArg) {
+        AddUint64(ByArg.first.size());
+        for (uint64_t Arg : ByArg.first)
+          AddUint64(Arg);
+        AddUnsigned(ByArg.second.TheKind);
+        AddUint64(ByArg.second.Info);
+      }
+    }
+  };
+
+  // Include the hash for all type identifiers used by this module.
+  for (GlobalValue::GUID TId : UsedTypeIds) {
+    auto SummariesI = TypeIdSummariesByGuid.find(TId);
+    if (SummariesI != TypeIdSummariesByGuid.end())
+      for (auto *Summary : SummariesI->second)
+        AddTypeIdSummary(Summary->first, Summary->second);
   }
 
   if (!Conf.SampleProfile.empty()) {
@@ -218,14 +304,6 @@ void llvm::thinLTOInternalizeAndPromoteInIndex(
     thinLTOInternalizeAndPromoteGUID(I.second, I.first, isExported);
 }
 
-struct InputFile::InputModule {
-  BitcodeModule BM;
-  std::unique_ptr<Module> Mod;
-
-  // The range of ModuleSymbolTable entries for this input module.
-  size_t SymBegin, SymEnd;
-};
-
 // Requires a destructor for std::vector<InputModule>.
 InputFile::~InputFile() = default;
 
@@ -246,61 +324,51 @@ Expected<std::unique_ptr<InputFile>> InputFile::create(MemoryBufferRef Object) {
     return make_error<StringError>("Bitcode file does not contain any modules",
                                    inconvertibleErrorCode());
 
-  // Create an InputModule for each module in the InputFile, and add it to the
-  // ModuleSymbolTable.
+  File->Mods = *BMsOrErr;
+
+  LLVMContext Ctx;
+  std::vector<Module *> Mods;
+  std::vector<std::unique_ptr<Module>> OwnedMods;
   for (auto BM : *BMsOrErr) {
     Expected<std::unique_ptr<Module>> MOrErr =
-        BM.getLazyModule(File->Ctx, /*ShouldLazyLoadMetadata*/ true,
+        BM.getLazyModule(Ctx, /*ShouldLazyLoadMetadata*/ true,
                          /*IsImporting*/ false);
     if (!MOrErr)
       return MOrErr.takeError();
 
-    size_t SymBegin = File->SymTab.symbols().size();
-    File->SymTab.addModule(MOrErr->get());
-    size_t SymEnd = File->SymTab.symbols().size();
+    if ((*MOrErr)->getDataLayoutStr().empty())
+      return make_error<StringError>("input module has no datalayout",
+                                     inconvertibleErrorCode());
 
-    for (const auto &C : (*MOrErr)->getComdatSymbolTable()) {
-      auto P = File->ComdatMap.insert(
-          std::make_pair(&C.second, File->Comdats.size()));
-      assert(P.second);
-      (void)P;
-      File->Comdats.push_back(C.first());
-    }
+    Mods.push_back(MOrErr->get());
+    OwnedMods.push_back(std::move(*MOrErr));
+  }
 
-    File->Mods.push_back({BM, std::move(*MOrErr), SymBegin, SymEnd});
+  SmallVector<char, 0> Symtab;
+  if (Error E = irsymtab::build(Mods, Symtab, File->Strtab))
+    return std::move(E);
+
+  irsymtab::Reader R({Symtab.data(), Symtab.size()},
+                     {File->Strtab.data(), File->Strtab.size()});
+  File->SourceFileName = R.getSourceFileName();
+  File->COFFLinkerOpts = R.getCOFFLinkerOpts();
+  File->ComdatTable = R.getComdatTable();
+
+  for (unsigned I = 0; I != Mods.size(); ++I) {
+    size_t Begin = File->Symbols.size();
+    for (const irsymtab::Reader::SymbolRef &Sym : R.module_symbols(I))
+      // Skip symbols that are irrelevant to LTO. Note that this condition needs
+      // to match the one in Skip() in LTO::addRegularLTO().
+      if (Sym.isGlobal() && !Sym.isFormatSpecific())
+        File->Symbols.push_back(Sym);
+    File->ModuleSymIndices.push_back({Begin, File->Symbols.size()});
   }
 
   return std::move(File);
 }
 
-Expected<int> InputFile::Symbol::getComdatIndex() const {
-  if (!isGV())
-    return -1;
-  const GlobalObject *GO = getGV()->getBaseObject();
-  if (!GO)
-    return make_error<StringError>("Unable to determine comdat of alias!",
-                                   inconvertibleErrorCode());
-  if (const Comdat *C = GO->getComdat()) {
-    auto I = File->ComdatMap.find(C);
-    assert(I != File->ComdatMap.end());
-    return I->second;
-  }
-  return -1;
-}
-
 StringRef InputFile::getName() const {
-  return Mods[0].BM.getModuleIdentifier();
-}
-
-StringRef InputFile::getSourceFileName() const {
-  return Mods[0].Mod->getSourceFileName();
-}
-
-iterator_range<InputFile::symbol_iterator>
-InputFile::module_symbols(InputModule &IM) {
-  return llvm::make_range(
-      symbol_iterator(SymTab.symbols().data() + IM.SymBegin, SymTab, this),
-      symbol_iterator(SymTab.symbols().data() + IM.SymEnd, SymTab, this));
+  return Mods[0].getModuleIdentifier();
 }
 
 LTO::RegularLTOState::RegularLTOState(unsigned ParallelCodeGenParallelismLevel,
@@ -324,21 +392,17 @@ LTO::LTO(Config Conf, ThinBackend Backend,
 LTO::~LTO() = default;
 
 // Add the given symbol to the GlobalResolutions map, and resolve its partition.
-void LTO::addSymbolToGlobalRes(SmallPtrSet<GlobalValue *, 8> &Used,
-                               const InputFile::Symbol &Sym,
+void LTO::addSymbolToGlobalRes(const InputFile::Symbol &Sym,
                                SymbolResolution Res, unsigned Partition) {
-  GlobalValue *GV = Sym.isGV() ? Sym.getGV() : nullptr;
-
   auto &GlobalRes = GlobalResolutions[Sym.getName()];
-  if (GV) {
-    GlobalRes.UnnamedAddr &= GV->hasGlobalUnnamedAddr();
-    if (Res.Prevailing)
-      GlobalRes.IRName = GV->getName();
-  }
+  GlobalRes.UnnamedAddr &= Sym.isUnnamedAddr();
+  if (Res.Prevailing)
+    GlobalRes.IRName = Sym.getIRName();
+
   // Set the partition to external if we know it is used elsewhere, e.g.
   // it is visible to a regular object, is referenced from llvm.compiler_used,
   // or was already recorded as being referenced from a different partition.
-  if (Res.VisibleToRegularObj || (GV && Used.count(GV)) ||
+  if (Res.VisibleToRegularObj || Sym.isUsed() ||
       (GlobalRes.Partition != GlobalResolution::Unknown &&
        GlobalRes.Partition != Partition)) {
     GlobalRes.Partition = GlobalResolution::External;
@@ -382,41 +446,32 @@ Error LTO::add(std::unique_ptr<InputFile> Input,
     writeToResolutionFile(*Conf.ResolutionFile, Input.get(), Res);
 
   const SymbolResolution *ResI = Res.begin();
-  for (InputFile::InputModule &IM : Input->Mods)
-    if (Error Err = addModule(*Input, IM, ResI, Res.end()))
+  for (unsigned I = 0; I != Input->Mods.size(); ++I)
+    if (Error Err = addModule(*Input, I, ResI, Res.end()))
       return Err;
 
   assert(ResI == Res.end());
   return Error::success();
 }
 
-Error LTO::addModule(InputFile &Input, InputFile::InputModule &IM,
+Error LTO::addModule(InputFile &Input, unsigned ModI,
                      const SymbolResolution *&ResI,
                      const SymbolResolution *ResE) {
-  // FIXME: move to backend
-  Module &M = *IM.Mod;
-
-  if (M.getDataLayoutStr().empty())
-    return make_error<StringError>("input module has no datalayout",
-                                    inconvertibleErrorCode());
-
-  if (!Conf.OverrideTriple.empty())
-    M.setTargetTriple(Conf.OverrideTriple);
-  else if (M.getTargetTriple().empty())
-    M.setTargetTriple(Conf.DefaultTriple);
-
-  Expected<bool> HasThinLTOSummary = IM.BM.hasSummary();
+  Expected<bool> HasThinLTOSummary = Input.Mods[ModI].hasSummary();
   if (!HasThinLTOSummary)
     return HasThinLTOSummary.takeError();
 
+  auto ModSyms = Input.module_symbols(ModI);
   if (*HasThinLTOSummary)
-    return addThinLTO(IM.BM, M, Input.module_symbols(IM), ResI, ResE);
+    return addThinLTO(Input.Mods[ModI], ModSyms, ResI, ResE);
   else
-    return addRegularLTO(IM.BM, ResI, ResE);
+    return addRegularLTO(Input.Mods[ModI], ModSyms, ResI, ResE);
 }
 
 // Add a regular LTO object to the link.
-Error LTO::addRegularLTO(BitcodeModule BM, const SymbolResolution *&ResI,
+Error LTO::addRegularLTO(BitcodeModule BM,
+                         ArrayRef<InputFile::Symbol> Syms,
+                         const SymbolResolution *&ResI,
                          const SymbolResolution *ResE) {
   if (!RegularLTO.CombinedModule) {
     RegularLTO.CombinedModule =
@@ -437,47 +492,84 @@ Error LTO::addRegularLTO(BitcodeModule BM, const SymbolResolution *&ResI,
   ModuleSymbolTable SymTab;
   SymTab.addModule(&M);
 
-  SmallPtrSet<GlobalValue *, 8> Used;
-  collectUsedGlobalVariables(M, Used, /*CompilerUsed*/ false);
-
   std::vector<GlobalValue *> Keep;
 
   for (GlobalVariable &GV : M.globals())
     if (GV.hasAppendingLinkage())
       Keep.push_back(&GV);
 
-  for (const InputFile::Symbol &Sym :
-       make_range(InputFile::symbol_iterator(SymTab.symbols().begin(), SymTab,
-                                             nullptr),
-                  InputFile::symbol_iterator(SymTab.symbols().end(), SymTab,
-                                             nullptr))) {
+  DenseSet<GlobalObject *> AliasedGlobals;
+  for (auto &GA : M.aliases())
+    if (GlobalObject *GO = GA.getBaseObject())
+      AliasedGlobals.insert(GO);
+
+  // In this function we need IR GlobalValues matching the symbols in Syms
+  // (which is not backed by a module), so we need to enumerate them in the same
+  // order. The symbol enumeration order of a ModuleSymbolTable intentionally
+  // matches the order of an irsymtab, but when we read the irsymtab in
+  // InputFile::create we omit some symbols that are irrelevant to LTO. The
+  // Skip() function skips the same symbols from the module as InputFile does
+  // from the symbol table.
+  auto MsymI = SymTab.symbols().begin(), MsymE = SymTab.symbols().end();
+  auto Skip = [&]() {
+    while (MsymI != MsymE) {
+      auto Flags = SymTab.getSymbolFlags(*MsymI);
+      if ((Flags & object::BasicSymbolRef::SF_Global) &&
+          !(Flags & object::BasicSymbolRef::SF_FormatSpecific))
+        return;
+      ++MsymI;
+    }
+  };
+  Skip();
+
+  for (const InputFile::Symbol &Sym : Syms) {
     assert(ResI != ResE);
     SymbolResolution Res = *ResI++;
-    addSymbolToGlobalRes(Used, Sym, Res, 0);
-
-    if (Sym.getFlags() & object::BasicSymbolRef::SF_Undefined)
-      continue;
-    if (Res.Prevailing && Sym.isGV()) {
-      GlobalValue *GV = Sym.getGV();
-      Keep.push_back(GV);
-      switch (GV->getLinkage()) {
-      default:
-        break;
-      case GlobalValue::LinkOnceAnyLinkage:
-        GV->setLinkage(GlobalValue::WeakAnyLinkage);
-        break;
-      case GlobalValue::LinkOnceODRLinkage:
-        GV->setLinkage(GlobalValue::WeakODRLinkage);
-        break;
+    addSymbolToGlobalRes(Sym, Res, 0);
+
+    assert(MsymI != MsymE);
+    ModuleSymbolTable::Symbol Msym = *MsymI++;
+    Skip();
+
+    if (GlobalValue *GV = Msym.dyn_cast<GlobalValue *>()) {
+      if (Res.Prevailing) {
+        if (Sym.isUndefined())
+          continue;
+        Keep.push_back(GV);
+        switch (GV->getLinkage()) {
+        default:
+          break;
+        case GlobalValue::LinkOnceAnyLinkage:
+          GV->setLinkage(GlobalValue::WeakAnyLinkage);
+          break;
+        case GlobalValue::LinkOnceODRLinkage:
+          GV->setLinkage(GlobalValue::WeakODRLinkage);
+          break;
+        }
+      } else if (isa<GlobalObject>(GV) &&
+                 (GV->hasLinkOnceODRLinkage() || GV->hasWeakODRLinkage() ||
+                  GV->hasAvailableExternallyLinkage()) &&
+                 !AliasedGlobals.count(cast<GlobalObject>(GV))) {
+        // Either of the above three types of linkage indicates that the
+        // chosen prevailing symbol will have the same semantics as this copy of
+        // the symbol, so we can link it with available_externally linkage. We
+        // only need to do this if the symbol is undefined.
+        GlobalValue *CombinedGV =
+            RegularLTO.CombinedModule->getNamedValue(GV->getName());
+        if (!CombinedGV || CombinedGV->isDeclaration()) {
+          Keep.push_back(GV);
+          GV->setLinkage(GlobalValue::AvailableExternallyLinkage);
+          cast<GlobalObject>(GV)->setComdat(nullptr);
+        }
       }
     }
     // Common resolution: collect the maximum size/alignment over all commons.
     // We also record if we see an instance of a common as prevailing, so that
     // if none is prevailing we can ignore it later.
-    if (Sym.getFlags() & object::BasicSymbolRef::SF_Common) {
+    if (Sym.isCommon()) {
       // FIXME: We should figure out what to do about commons defined by asm.
       // For now they aren't reported correctly by ModuleSymbolTable.
-      auto &CommonRes = RegularLTO.Commons[Sym.getGV()->getName()];
+      auto &CommonRes = RegularLTO.Commons[Sym.getIRName()];
       CommonRes.Size = std::max(CommonRes.Size, Sym.getCommonSize());
       CommonRes.Align = std::max(CommonRes.Align, Sym.getCommonAlignment());
       CommonRes.Prevailing |= Res.Prevailing;
@@ -485,23 +577,18 @@ Error LTO::addRegularLTO(BitcodeModule BM, const SymbolResolution *&ResI,
 
     // FIXME: use proposed local attribute for FinalDefinitionInLinkageUnit.
   }
+  assert(MsymI == MsymE);
 
   return RegularLTO.Mover->move(std::move(*MOrErr), Keep,
                                 [](GlobalValue &, IRMover::ValueAdder) {},
-                                /* LinkModuleInlineAsm */ true,
                                 /* IsPerformingImport */ false);
 }
 
 // Add a ThinLTO object to the link.
-// FIXME: This function should not need to take as many parameters once we have
-// a bitcode symbol table.
-Error LTO::addThinLTO(BitcodeModule BM, Module &M,
-                      iterator_range<InputFile::symbol_iterator> Syms,
+Error LTO::addThinLTO(BitcodeModule BM,
+                      ArrayRef<InputFile::Symbol> Syms,
                       const SymbolResolution *&ResI,
                       const SymbolResolution *ResE) {
-  SmallPtrSet<GlobalValue *, 8> Used;
-  collectUsedGlobalVariables(M, Used, /*CompilerUsed*/ false);
-
   Expected<std::unique_ptr<ModuleSummaryIndex>> SummaryOrErr = BM.getSummary();
   if (!SummaryOrErr)
     return SummaryOrErr.takeError();
@@ -511,11 +598,15 @@ Error LTO::addThinLTO(BitcodeModule BM, Module &M,
   for (const InputFile::Symbol &Sym : Syms) {
     assert(ResI != ResE);
     SymbolResolution Res = *ResI++;
-    addSymbolToGlobalRes(Used, Sym, Res, ThinLTO.ModuleMap.size() + 1);
+    addSymbolToGlobalRes(Sym, Res, ThinLTO.ModuleMap.size() + 1);
 
-    if (Res.Prevailing && Sym.isGV())
-      ThinLTO.PrevailingModuleForGUID[Sym.getGV()->getGUID()] =
-          BM.getModuleIdentifier();
+    if (Res.Prevailing) {
+      if (!Sym.getIRName().empty()) {
+        auto GUID = GlobalValue::getGUID(GlobalValue::getGlobalIdentifier(
+            Sym.getIRName(), GlobalValue::ExternalLinkage, ""));
+        ThinLTO.PrevailingModuleForGUID[GUID] = BM.getModuleIdentifier();
+      }
+    }
   }
 
   if (!ThinLTO.ModuleMap.insert({BM.getModuleIdentifier(), BM}).second)
@@ -632,6 +723,7 @@ class InProcessThinBackend : public ThinBackendProc {
   ThreadPool BackendThreadPool;
   AddStreamFn AddStream;
   NativeObjectCache Cache;
+  TypeIdSummariesByGuidTy TypeIdSummariesByGuid;
 
   Optional<Error> Err;
   std::mutex ErrMu;
@@ -644,7 +736,14 @@ public:
       AddStreamFn AddStream, NativeObjectCache Cache)
       : ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
         BackendThreadPool(ThinLTOParallelismLevel),
-        AddStream(std::move(AddStream)), Cache(std::move(Cache)) {}
+        AddStream(std::move(AddStream)), Cache(std::move(Cache)) {
+    // Create a mapping from type identifier GUIDs to type identifier summaries.
+    // This allows backends to use the type identifier GUIDs stored in the
+    // function summaries to determine which type identifier summaries affect
+    // each function without needing to compute GUIDs in each backend.
+    for (auto &TId : CombinedIndex.typeIds())
+      TypeIdSummariesByGuid[GlobalValue::getGUID(TId.first)].push_back(&TId);
+  }
 
   Error runThinLTOBackendThread(
       AddStreamFn AddStream, NativeObjectCache Cache, unsigned Task,
@@ -653,7 +752,8 @@ public:
       const FunctionImporter::ExportSetTy &ExportList,
       const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
       const GVSummaryMapTy &DefinedGlobals,
-      MapVector<StringRef, BitcodeModule> &ModuleMap) {
+      MapVector<StringRef, BitcodeModule> &ModuleMap,
+      const TypeIdSummariesByGuidTy &TypeIdSummariesByGuid) {
     auto RunThinBackend = [&](AddStreamFn AddStream) {
       LTOLLVMContext BackendContext(Conf);
       Expected<std::unique_ptr<Module>> MOrErr = BM.parseModule(BackendContext);
@@ -676,7 +776,7 @@ public:
     SmallString<40> Key;
     // The module may be cached, this helps handling it.
     computeCacheKey(Key, Conf, CombinedIndex, ModuleID, ImportList, ExportList,
-                    ResolvedODR, DefinedGlobals);
+                    ResolvedODR, DefinedGlobals, TypeIdSummariesByGuid);
     if (AddStreamFn CacheAddStream = Cache(Task, Key))
       return RunThinBackend(CacheAddStream);
 
@@ -700,10 +800,11 @@ public:
             const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes>
                 &ResolvedODR,
             const GVSummaryMapTy &DefinedGlobals,
-            MapVector<StringRef, BitcodeModule> &ModuleMap) {
+            MapVector<StringRef, BitcodeModule> &ModuleMap,
+            const TypeIdSummariesByGuidTy &TypeIdSummariesByGuid) {
           Error E = runThinLTOBackendThread(
-              AddStream, Cache, Task, BM, CombinedIndex, ImportList,
-              ExportList, ResolvedODR, DefinedGlobals, ModuleMap);
+              AddStream, Cache, Task, BM, CombinedIndex, ImportList, ExportList,
+              ResolvedODR, DefinedGlobals, ModuleMap, TypeIdSummariesByGuid);
           if (E) {
             std::unique_lock<std::mutex> L(ErrMu);
             if (Err)
@@ -712,9 +813,9 @@ public:
               Err = std::move(E);
           }
         },
-        BM, std::ref(CombinedIndex), std::ref(ImportList),
-        std::ref(ExportList), std::ref(ResolvedODR), std::ref(DefinedGlobals),
-        std::ref(ModuleMap));
+        BM, std::ref(CombinedIndex), std::ref(ImportList), std::ref(ExportList),
+        std::ref(ResolvedODR), std::ref(DefinedGlobals), std::ref(ModuleMap),
+        std::ref(TypeIdSummariesByGuid));
     return Error::success();
   }
 
@@ -870,7 +971,8 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
           // IRName will be defined if we have seen the prevailing copy of
           // this value. If not, no need to preserve any ThinLTO copies.
           !Res.second.IRName.empty())
-        GUIDPreservedSymbols.insert(GlobalValue::getGUID(Res.second.IRName));
+        GUIDPreservedSymbols.insert(GlobalValue::getGUID(
+            GlobalValue::getRealLinkageName(Res.second.IRName)));
     }
 
     auto DeadSymbols =
@@ -889,10 +991,11 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
       // partition (and we can't get the GUID).
       if (Res.second.IRName.empty())
         continue;
-      auto GUID = GlobalValue::getGUID(Res.second.IRName);
+      auto GUID = GlobalValue::getGUID(
+          GlobalValue::getRealLinkageName(Res.second.IRName));
       // Mark exported unless index-based analysis determined it to be dead.
       if (!DeadSymbols.count(GUID))
-        ExportedGUIDs.insert(GlobalValue::getGUID(Res.second.IRName));
+        ExportedGUIDs.insert(GUID);
     }
 
     auto isPrevailing = [&](GlobalValue::GUID GUID,
@@ -936,3 +1039,27 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
 
   return BackendProc->wait();
 }
+
+Expected<std::unique_ptr<tool_output_file>>
+lto::setupOptimizationRemarks(LLVMContext &Context,
+                              StringRef LTORemarksFilename,
+                              bool LTOPassRemarksWithHotness, int Count) {
+  if (LTORemarksFilename.empty())
+    return nullptr;
+
+  std::string Filename = LTORemarksFilename;
+  if (Count != -1)
+    Filename += ".thin." + llvm::utostr(Count) + ".yaml";
+
+  std::error_code EC;
+  auto DiagnosticFile =
+      llvm::make_unique<tool_output_file>(Filename, EC, sys::fs::F_None);
+  if (EC)
+    return errorCodeToError(EC);
+  Context.setDiagnosticsOutputFile(
+      llvm::make_unique<yaml::Output>(DiagnosticFile->os()));
+  if (LTOPassRemarksWithHotness)
+    Context.setDiagnosticHotnessRequested(true);
+  DiagnosticFile->keep();
+  return std::move(DiagnosticFile);
+}
diff --git a/lib/LTO/LTOBackend.cpp b/lib/LTO/LTOBackend.cpp
index 5c3e442faa9a1ed480913c45b5e3a831182fb152..4bd251f727a4353da699102e9c1ee4e0b5bdac18 100644
--- a/lib/LTO/LTOBackend.cpp
+++ b/lib/LTO/LTOBackend.cpp
@@ -27,6 +27,7 @@
 #include "llvm/LTO/LTO.h"
 #include "llvm/LTO/legacy/UpdateCompilerUsed.h"
 #include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Object/ModuleSymbolTable.h"
 #include "llvm/Passes/PassBuilder.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
@@ -223,14 +224,16 @@ static void runNewPMCustomPasses(Module &Mod, TargetMachine *TM,
 }
 
 static void runOldPMPasses(Config &Conf, Module &Mod, TargetMachine *TM,
-                           bool IsThinLTO, ModuleSummaryIndex &CombinedIndex) {
+                           bool IsThinLTO, ModuleSummaryIndex *ExportSummary,
+                           const ModuleSummaryIndex *ImportSummary) {
   legacy::PassManager passes;
   passes.add(createTargetTransformInfoWrapperPass(TM->getTargetIRAnalysis()));
 
   PassManagerBuilder PMB;
   PMB.LibraryInfo = new TargetLibraryInfoImpl(Triple(TM->getTargetTriple()));
   PMB.Inliner = createFunctionInliningPass();
-  PMB.Summary = &CombinedIndex;
+  PMB.ExportSummary = ExportSummary;
+  PMB.ImportSummary = ImportSummary;
   // Unconditionally verify input since it is not verified before this
   // point and has unknown origin.
   PMB.VerifyInput = true;
@@ -247,7 +250,8 @@ static void runOldPMPasses(Config &Conf, Module &Mod, TargetMachine *TM,
 }
 
 bool opt(Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod,
-         bool IsThinLTO, ModuleSummaryIndex &CombinedIndex) {
+         bool IsThinLTO, ModuleSummaryIndex *ExportSummary,
+         const ModuleSummaryIndex *ImportSummary) {
   // There's still no ThinLTO pipeline hooked up in the new pass manager,
   // once there is one, we can just remove this.
   if (LTOUseNewPM && IsThinLTO)
@@ -260,7 +264,7 @@ bool opt(Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod,
   else if (LTOUseNewPM)
     runNewPMPasses(Mod, TM, Conf.OptLevel);
   else
-    runOldPMPasses(Conf, Mod, TM, IsThinLTO, CombinedIndex);
+    runOldPMPasses(Conf, Mod, TM, IsThinLTO, ExportSummary, ImportSummary);
   return !Conf.PostOptModuleHook || Conf.PostOptModuleHook(Task, Mod);
 }
 
@@ -271,8 +275,7 @@ void codegen(Config &Conf, TargetMachine *TM, AddStreamFn AddStream,
 
   auto Stream = AddStream(Task);
   legacy::PassManager CodeGenPasses;
-  if (TM->addPassesToEmitFile(CodeGenPasses, *Stream->OS,
-                              TargetMachine::CGFT_ObjectFile))
+  if (TM->addPassesToEmitFile(CodeGenPasses, *Stream->OS, Conf.CGFileType))
     report_fatal_error("Failed to setup codegen");
   CodeGenPasses.run(Mod);
 }
@@ -340,12 +343,22 @@ Expected<const Target *> initAndLookupTarget(Config &C, Module &Mod) {
 
 }
 
+static void
+finalizeOptimizationRemarks(std::unique_ptr<tool_output_file> DiagOutputFile) {
+  // Make sure we flush the diagnostic remarks file in case the linker doesn't
+  // call the global destructors before exiting.
+  if (!DiagOutputFile)
+    return;
+  DiagOutputFile->keep();
+  DiagOutputFile->os().flush();
+}
+
 static void handleAsmUndefinedRefs(Module &Mod, TargetMachine &TM) {
   // Collect the list of undefined symbols used in asm and update
   // llvm.compiler.used to prevent optimization to drop these from the output.
   StringSet<> AsmUndefinedRefs;
   ModuleSymbolTable::CollectAsmSymbols(
-      Triple(Mod.getTargetTriple()), Mod.getModuleInlineAsm(),
+      Mod,
       [&AsmUndefinedRefs](StringRef Name, object::BasicSymbolRef::Flags Flags) {
         if (Flags & object::BasicSymbolRef::SF_Undefined)
           AsmUndefinedRefs.insert(Name);
@@ -366,9 +379,20 @@ Error lto::backend(Config &C, AddStreamFn AddStream,
 
   handleAsmUndefinedRefs(*Mod, *TM);
 
-  if (!C.CodeGenOnly)
-    if (!opt(C, TM.get(), 0, *Mod, /*IsThinLTO=*/false, CombinedIndex))
+  // Setup optimization remarks.
+  auto DiagFileOrErr = lto::setupOptimizationRemarks(
+      Mod->getContext(), C.RemarksFilename, C.RemarksWithHotness);
+  if (!DiagFileOrErr)
+    return DiagFileOrErr.takeError();
+  auto DiagnosticOutputFile = std::move(*DiagFileOrErr);
+
+  if (!C.CodeGenOnly) {
+    if (!opt(C, TM.get(), 0, *Mod, /*IsThinLTO=*/false,
+             /*ExportSummary=*/&CombinedIndex, /*ImportSummary=*/nullptr)) {
+      finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
       return Error::success();
+    }
+  }
 
   if (ParallelCodeGenParallelismLevel == 1) {
     codegen(C, TM.get(), AddStream, 0, *Mod);
@@ -376,11 +400,12 @@ Error lto::backend(Config &C, AddStreamFn AddStream,
     splitCodeGen(C, TM.get(), AddStream, ParallelCodeGenParallelismLevel,
                  std::move(Mod));
   }
+  finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
   return Error::success();
 }
 
 Error lto::thinBackend(Config &Conf, unsigned Task, AddStreamFn AddStream,
-                       Module &Mod, ModuleSummaryIndex &CombinedIndex,
+                       Module &Mod, const ModuleSummaryIndex &CombinedIndex,
                        const FunctionImporter::ImportMapTy &ImportList,
                        const GVSummaryMapTy &DefinedGlobals,
                        MapVector<StringRef, BitcodeModule> &ModuleMap) {
@@ -432,7 +457,8 @@ Error lto::thinBackend(Config &Conf, unsigned Task, AddStreamFn AddStream,
   if (Conf.PostImportModuleHook && !Conf.PostImportModuleHook(Task, Mod))
     return Error::success();
 
-  if (!opt(Conf, TM.get(), Task, Mod, /*IsThinLTO=*/true, CombinedIndex))
+  if (!opt(Conf, TM.get(), Task, Mod, /*IsThinLTO=*/true,
+           /*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex))
     return Error::success();
 
   codegen(Conf, TM.get(), AddStream, Task, Mod);
diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
index 6af31e61f94637342b2949ab9d0abd87095e7676..86fba843e980bf1fdc27f486e9a5416e87a44048 100644
--- a/lib/LTO/LTOCodeGenerator.cpp
+++ b/lib/LTO/LTOCodeGenerator.cpp
@@ -35,6 +35,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/IR/Verifier.h"
 #include "llvm/InitializePasses.h"
+#include "llvm/LTO/LTO.h"
 #include "llvm/LTO/legacy/LTOModule.h"
 #include "llvm/LTO/legacy/UpdateCompilerUsed.h"
 #include "llvm/Linker/Linker.h"
@@ -140,6 +141,7 @@ void LTOCodeGenerator::initializeLTOPasses() {
   initializeMemCpyOptLegacyPassPass(R);
   initializeDCELegacyPassPass(R);
   initializeCFGSimplifyPassPass(R);
+  initializeLateCFGSimplifyPassPass(R);
 }
 
 void LTOCodeGenerator::setAsmUndefinedRefs(LTOModule *Mod) {
@@ -506,25 +508,6 @@ void LTOCodeGenerator::verifyMergedModuleOnce() {
     report_fatal_error("Broken module found, compilation aborted!");
 }
 
-bool LTOCodeGenerator::setupOptimizationRemarks() {
-  if (LTORemarksFilename != "") {
-    std::error_code EC;
-    DiagnosticOutputFile = llvm::make_unique<tool_output_file>(
-        LTORemarksFilename, EC, sys::fs::F_None);
-    if (EC) {
-      emitError(EC.message());
-      return false;
-    }
-    Context.setDiagnosticsOutputFile(
-        llvm::make_unique<yaml::Output>(DiagnosticOutputFile->os()));
-  }
-
-  if (LTOPassRemarksWithHotness)
-    Context.setDiagnosticHotnessRequested(true);
-
-  return true;
-}
-
 void LTOCodeGenerator::finishOptimizationRemarks() {
   if (DiagnosticOutputFile) {
     DiagnosticOutputFile->keep();
@@ -540,8 +523,13 @@ bool LTOCodeGenerator::optimize(bool DisableVerify, bool DisableInline,
   if (!this->determineTarget())
     return false;
 
-  if (!setupOptimizationRemarks())
-    return false;
+  auto DiagFileOrErr = lto::setupOptimizationRemarks(
+      Context, LTORemarksFilename, LTOPassRemarksWithHotness);
+  if (!DiagFileOrErr) {
+    errs() << "Error: " << toString(DiagFileOrErr.takeError()) << "\n";
+    report_fatal_error("Can't get an output file for the remarks");
+  }
+  DiagnosticOutputFile = std::move(*DiagFileOrErr);
 
   // We always run the verifier once on the merged module, the `DisableVerify`
   // parameter only applies to subsequent verify.
@@ -567,6 +555,8 @@ bool LTOCodeGenerator::optimize(bool DisableVerify, bool DisableInline,
   if (!DisableInline)
     PMB.Inliner = createFunctionInliningPass();
   PMB.LibraryInfo = new TargetLibraryInfoImpl(TargetTriple);
+  if (Freestanding)
+    PMB.LibraryInfo->disableAllFunctions();
   PMB.OptLevel = OptLevel;
   PMB.VerifyInput = !DisableVerify;
   PMB.VerifyOutput = !DisableVerify;
diff --git a/lib/LTO/LTOModule.cpp b/lib/LTO/LTOModule.cpp
index 89aeb80000388a8cbe6b218794500ae2a1d2988e..11f0982c6a6029fa45a5208f6df87b9d2cc7cd95 100644
--- a/lib/LTO/LTOModule.cpp
+++ b/lib/LTO/LTOModule.cpp
@@ -14,11 +14,12 @@
 
 #include "llvm/LTO/legacy/LTOModule.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/ObjectUtils.h"
 #include "llvm/Bitcode/BitcodeReader.h"
-#include "llvm/CodeGen/Analysis.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Mangler.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/MC/MCExpr.h"
@@ -647,11 +648,15 @@ void LTOModule::parseMetadata() {
     }
   }
 
-  // Globals
+  // Globals - we only need to do this for COFF.
+  const Triple TT(_target->getTargetTriple());
+  if (!TT.isOSBinFormatCOFF())
+    return;
+  Mangler M;
   for (const NameAndAttributes &Sym : _symbols) {
     if (!Sym.symbol)
       continue;
-    _target->getObjFileLowering()->emitLinkerFlagsForGlobal(OS, Sym.symbol);
+    emitLinkerFlagsForGlobalCOFF(OS, Sym.symbol, TT, M);
   }
 
   // Add other interesting metadata here.
diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp
index 104fb199da0802e0297f36927dae0961a8199399..970c75e4aceac3ef9e6072cf3ee02b363f09cd4f 100644
--- a/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -73,27 +73,6 @@ namespace {
 static cl::opt<int>
     ThreadCount("threads", cl::init(llvm::heavyweight_hardware_concurrency()));
 
-Expected<std::unique_ptr<tool_output_file>>
-setupOptimizationRemarks(LLVMContext &Ctx, int Count) {
-  if (LTOPassRemarksWithHotness)
-    Ctx.setDiagnosticHotnessRequested(true);
-
-  if (LTORemarksFilename.empty())
-    return nullptr;
-
-  std::string FileName =
-      LTORemarksFilename + ".thin." + llvm::utostr(Count) + ".yaml";
-  std::error_code EC;
-  auto DiagnosticOutputFile =
-      llvm::make_unique<tool_output_file>(FileName, EC, sys::fs::F_None);
-  if (EC)
-    return errorCodeToError(EC);
-  Ctx.setDiagnosticsOutputFile(
-      llvm::make_unique<yaml::Output>(DiagnosticOutputFile->os()));
-  DiagnosticOutputFile->keep();
-  return std::move(DiagnosticOutputFile);
-}
-
 // Simple helper to save temporary files for debug.
 static void saveTempBitcode(const Module &TheModule, StringRef TempDir,
                             unsigned count, StringRef Suffix) {
@@ -150,13 +129,13 @@ static void computePrevailingCopies(
 }
 
 static StringMap<MemoryBufferRef>
-generateModuleMap(const std::vector<MemoryBufferRef> &Modules) {
+generateModuleMap(const std::vector<ThinLTOBuffer> &Modules) {
   StringMap<MemoryBufferRef> ModuleMap;
   for (auto &ModuleBuffer : Modules) {
     assert(ModuleMap.find(ModuleBuffer.getBufferIdentifier()) ==
                ModuleMap.end() &&
            "Expect unique Buffer Identifier");
-    ModuleMap[ModuleBuffer.getBufferIdentifier()] = ModuleBuffer;
+    ModuleMap[ModuleBuffer.getBufferIdentifier()] = ModuleBuffer.getMemBuffer();
   }
   return ModuleMap;
 }
@@ -208,10 +187,12 @@ crossImportIntoModule(Module &TheModule, const ModuleSummaryIndex &Index,
 }
 
 static void optimizeModule(Module &TheModule, TargetMachine &TM,
-                           unsigned OptLevel) {
+                           unsigned OptLevel, bool Freestanding) {
   // Populate the PassManager
   PassManagerBuilder PMB;
   PMB.LibraryInfo = new TargetLibraryInfoImpl(TM.getTargetTriple());
+  if (Freestanding)
+    PMB.LibraryInfo->disableAllFunctions();
   PMB.Inliner = createFunctionInliningPass();
   // FIXME: should get it from the bitcode?
   PMB.OptLevel = OptLevel;
@@ -285,7 +266,7 @@ public:
       const std::map<GlobalValue::GUID, GlobalValue::LinkageTypes> &ResolvedODR,
       const GVSummaryMapTy &DefinedFunctions,
       const DenseSet<GlobalValue::GUID> &PreservedSymbols, unsigned OptLevel,
-      const TargetMachineBuilder &TMBuilder) {
+      bool Freestanding, const TargetMachineBuilder &TMBuilder) {
     if (CachePath.empty())
       return;
 
@@ -342,6 +323,7 @@ public:
       AddUnsigned(*TMBuilder.RelocModel);
     AddUnsigned(TMBuilder.CGOptLevel);
     AddUnsigned(OptLevel);
+    AddUnsigned(Freestanding);
 
     Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
     for (auto F : ExportList)
@@ -369,7 +351,10 @@ public:
             ArrayRef<uint8_t>((const uint8_t *)&Entry, sizeof(GlobalValue::GUID)));
     }
 
-    sys::path::append(EntryPath, CachePath, toHex(Hasher.result()));
+    // This choice of file name allows the cache to be pruned (see pruneCache()
+    // in include/llvm/Support/CachePruning.h).
+    sys::path::append(EntryPath, CachePath,
+                      "llvmcache-" + toHex(Hasher.result()));
   }
 
   // Access the path to this entry in the cache.
@@ -422,7 +407,7 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index,
                      const GVSummaryMapTy &DefinedGlobals,
                      const ThinLTOCodeGenerator::CachingOptions &CacheOptions,
                      bool DisableCodeGen, StringRef SaveTempsDir,
-                     unsigned OptLevel, unsigned count) {
+                     bool Freestanding, unsigned OptLevel, unsigned count) {
 
   // "Benchmark"-like optimization: single-source case
   bool SingleModule = (ModuleMap.size() == 1);
@@ -454,7 +439,7 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index,
     saveTempBitcode(TheModule, SaveTempsDir, count, ".3.imported.bc");
   }
 
-  optimizeModule(TheModule, TM, OptLevel);
+  optimizeModule(TheModule, TM, OptLevel, Freestanding);
 
   saveTempBitcode(TheModule, SaveTempsDir, count, ".4.opt.bc");
 
@@ -522,13 +507,13 @@ static void initTMBuilder(TargetMachineBuilder &TMBuilder,
 } // end anonymous namespace
 
 void ThinLTOCodeGenerator::addModule(StringRef Identifier, StringRef Data) {
-  MemoryBufferRef Buffer(Data, Identifier);
+  ThinLTOBuffer Buffer(Data, Identifier);
   if (Modules.empty()) {
     // First module added, so initialize the triple and some options
     LLVMContext Context;
     StringRef TripleStr;
-    ErrorOr<std::string> TripleOrErr =
-        expectedToErrorOrAndEmitErrors(Context, getBitcodeTargetTriple(Buffer));
+    ErrorOr<std::string> TripleOrErr = expectedToErrorOrAndEmitErrors(
+        Context, getBitcodeTargetTriple(Buffer.getMemBuffer()));
     if (TripleOrErr)
       TripleStr = *TripleOrErr;
     Triple TheTriple(TripleStr);
@@ -538,8 +523,8 @@ void ThinLTOCodeGenerator::addModule(StringRef Identifier, StringRef Data) {
   else {
     LLVMContext Context;
     StringRef TripleStr;
-    ErrorOr<std::string> TripleOrErr =
-        expectedToErrorOrAndEmitErrors(Context, getBitcodeTargetTriple(Buffer));
+    ErrorOr<std::string> TripleOrErr = expectedToErrorOrAndEmitErrors(
+        Context, getBitcodeTargetTriple(Buffer.getMemBuffer()));
     if (TripleOrErr)
       TripleStr = *TripleOrErr;
     assert(TMBuilder.TheTriple.str() == TripleStr &&
@@ -588,7 +573,8 @@ std::unique_ptr<ModuleSummaryIndex> ThinLTOCodeGenerator::linkCombinedIndex() {
   uint64_t NextModuleId = 0;
   for (auto &ModuleBuffer : Modules) {
     Expected<std::unique_ptr<object::ModuleSummaryIndexObjectFile>> ObjOrErr =
-        object::ModuleSummaryIndexObjectFile::create(ModuleBuffer);
+        object::ModuleSummaryIndexObjectFile::create(
+            ModuleBuffer.getMemBuffer());
     if (!ObjOrErr) {
       // FIXME diagnose
       logAllUnhandledErrors(
@@ -779,7 +765,7 @@ void ThinLTOCodeGenerator::optimize(Module &TheModule) {
   initTMBuilder(TMBuilder, Triple(TheModule.getTargetTriple()));
 
   // Optimize now
-  optimizeModule(TheModule, *TMBuilder.create(), OptLevel);
+  optimizeModule(TheModule, *TMBuilder.create(), OptLevel, Freestanding);
 }
 
 /**
@@ -852,8 +838,9 @@ void ThinLTOCodeGenerator::run() {
         Context.setDiscardValueNames(LTODiscardValueNames);
 
         // Parse module now
-        auto TheModule = loadModuleFromBuffer(ModuleBuffer, Context, false,
-                                              /*IsImporting*/ false);
+        auto TheModule =
+            loadModuleFromBuffer(ModuleBuffer.getMemBuffer(), Context, false,
+                                 /*IsImporting*/ false);
 
         // CodeGen
         auto OutputBuffer = codegen(*TheModule);
@@ -943,8 +930,8 @@ void ThinLTOCodeGenerator::run() {
   std::iota(ModulesOrdering.begin(), ModulesOrdering.end(), 0);
   std::sort(ModulesOrdering.begin(), ModulesOrdering.end(),
             [&](int LeftIndex, int RightIndex) {
-              auto LSize = Modules[LeftIndex].getBufferSize();
-              auto RSize = Modules[RightIndex].getBufferSize();
+              auto LSize = Modules[LeftIndex].getBuffer().size();
+              auto RSize = Modules[RightIndex].getBuffer().size();
               return LSize > RSize;
             });
 
@@ -964,7 +951,7 @@ void ThinLTOCodeGenerator::run() {
                                     ImportLists[ModuleIdentifier], ExportList,
                                     ResolvedODR[ModuleIdentifier],
                                     DefinedFunctions, GUIDPreservedSymbols,
-                                    OptLevel, TMBuilder);
+                                    OptLevel, Freestanding, TMBuilder);
         auto CacheEntryPath = CacheEntry.getEntryPath();
 
         {
@@ -988,7 +975,8 @@ void ThinLTOCodeGenerator::run() {
         LLVMContext Context;
         Context.setDiscardValueNames(LTODiscardValueNames);
         Context.enableDebugTypeODRUniquing();
-        auto DiagFileOrErr = setupOptimizationRemarks(Context, count);
+        auto DiagFileOrErr = lto::setupOptimizationRemarks(
+            Context, LTORemarksFilename, LTOPassRemarksWithHotness, count);
         if (!DiagFileOrErr) {
           errs() << "Error: " << toString(DiagFileOrErr.takeError()) << "\n";
           report_fatal_error("ThinLTO: Can't get an output file for the "
@@ -996,8 +984,9 @@ void ThinLTOCodeGenerator::run() {
         }
 
         // Parse module now
-        auto TheModule = loadModuleFromBuffer(ModuleBuffer, Context, false,
-                                              /*IsImporting*/ false);
+        auto TheModule =
+            loadModuleFromBuffer(ModuleBuffer.getMemBuffer(), Context, false,
+                                 /*IsImporting*/ false);
 
         // Save temps: original file.
         saveTempBitcode(*TheModule, SaveTempsDir, count, ".0.original.bc");
@@ -1008,7 +997,7 @@ void ThinLTOCodeGenerator::run() {
             *TheModule, *Index, ModuleMap, *TMBuilder.create(), ImportList,
             ExportList, GUIDPreservedSymbols,
             ModuleToDefinedGVSummaries[ModuleIdentifier], CacheOptions,
-            DisableCodeGen, SaveTempsDir, OptLevel, count);
+            DisableCodeGen, SaveTempsDir, Freestanding, OptLevel, count);
 
         // Commit to the cache (if enabled)
         CacheEntry.write(*OutputBuffer);
@@ -1040,11 +1029,7 @@ void ThinLTOCodeGenerator::run() {
     }
   }
 
-  CachePruning(CacheOptions.Path)
-      .setPruningInterval(std::chrono::seconds(CacheOptions.PruningInterval))
-      .setEntryExpiration(std::chrono::seconds(CacheOptions.Expiration))
-      .setMaxSize(CacheOptions.MaxPercentageOfAvailableSpace)
-      .prune();
+  pruneCache(CacheOptions.Path, CacheOptions.Policy);
 
   // If statistics were requested, print them out now.
   if (llvm::AreStatisticsEnabled())
diff --git a/lib/Linker/IRMover.cpp b/lib/Linker/IRMover.cpp
index 9f3cfc0eace45c7795a132ccf889a7c062c42328..15a46a2d0420f1d368a1b31a8f9ba4a2e6e0dc81 100644
--- a/lib/Linker/IRMover.cpp
+++ b/lib/Linker/IRMover.cpp
@@ -395,11 +395,12 @@ class IRLinker {
       Worklist.push_back(GV);
   }
 
-  /// Flag whether the ModuleInlineAsm string in Src should be linked with
-  /// (concatenated into) the ModuleInlineAsm string for the destination
-  /// module. It should be true for full LTO, but not when importing for
-  /// ThinLTO, otherwise we can have duplicate symbols.
-  bool LinkModuleInlineAsm;
+  /// Whether we are importing globals for ThinLTO, as opposed to linking the
+  /// source module. If this flag is set, it means that we can rely on some
+  /// other object file to define any non-GlobalValue entities defined by the
+  /// source module. This currently causes us to not link retained types in
+  /// debug info metadata and module inline asm.
+  bool IsPerformingImport;
 
   /// Set to true when all global value body linking is complete (including
   /// lazy linking). Used to prevent metadata linking from creating new
@@ -491,10 +492,10 @@ public:
            IRMover::IdentifiedStructTypeSet &Set, std::unique_ptr<Module> SrcM,
            ArrayRef<GlobalValue *> ValuesToLink,
            std::function<void(GlobalValue &, IRMover::ValueAdder)> AddLazyFor,
-           bool LinkModuleInlineAsm, bool IsPerformingImport)
+           bool IsPerformingImport)
       : DstM(DstM), SrcM(std::move(SrcM)), AddLazyFor(std::move(AddLazyFor)),
         TypeMap(Set), GValMaterializer(*this), LValMaterializer(*this),
-        SharedMDs(SharedMDs), LinkModuleInlineAsm(LinkModuleInlineAsm),
+        SharedMDs(SharedMDs), IsPerformingImport(IsPerformingImport),
         Mapper(ValueMap, RF_MoveDistinctMDs | RF_IgnoreMissingLocals, &TypeMap,
                &GValMaterializer),
         AliasMCID(Mapper.registerAlternateMappingContext(AliasValueMap,
@@ -870,9 +871,6 @@ bool IRLinker::shouldLink(GlobalValue *DGV, GlobalValue &SGV) {
   if (DGV && !DGV->isDeclarationForLinker())
     return false;
 
-  if (SGV.hasAvailableExternallyLinkage())
-    return true;
-
   if (SGV.isDeclaration() || DoneLinkingBodies)
     return false;
 
@@ -1297,7 +1295,7 @@ Error IRLinker::run() {
   DstM.setTargetTriple(mergeTriples(SrcTriple, DstTriple));
 
   // Append the module inline asm string.
-  if (LinkModuleInlineAsm && !SrcM->getModuleInlineAsm().empty()) {
+  if (!IsPerformingImport && !SrcM->getModuleInlineAsm().empty()) {
     if (DstM.getModuleInlineAsm().empty())
       DstM.setModuleInlineAsm(SrcM->getModuleInlineAsm());
     else
@@ -1436,10 +1434,10 @@ IRMover::IRMover(Module &M) : Composite(M) {
 Error IRMover::move(
     std::unique_ptr<Module> Src, ArrayRef<GlobalValue *> ValuesToLink,
     std::function<void(GlobalValue &, ValueAdder Add)> AddLazyFor,
-    bool LinkModuleInlineAsm, bool IsPerformingImport) {
+    bool IsPerformingImport) {
   IRLinker TheIRLinker(Composite, SharedMDs, IdentifiedStructTypes,
                        std::move(Src), ValuesToLink, std::move(AddLazyFor),
-                       LinkModuleInlineAsm, IsPerformingImport);
+                       IsPerformingImport);
   Error E = TheIRLinker.run();
   Composite.dropTriviallyDeadConstantArrays();
   return E;
diff --git a/lib/Linker/LinkModules.cpp b/lib/Linker/LinkModules.cpp
index cf2c4ccf523e6b9e695d9680ea21cc0539882628..c0ce4bf76b9fd9a23103921dedb12a66d2e3c53b 100644
--- a/lib/Linker/LinkModules.cpp
+++ b/lib/Linker/LinkModules.cpp
@@ -14,12 +14,13 @@
 #include "LinkDiagnosticInfo.h"
 #include "llvm-c/Linker.h"
 #include "llvm/ADT/SetVector.h"
-#include "llvm/ADT/StringSet.h"
+#include "llvm/IR/Comdat.h"
 #include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
 #include "llvm/Linker/Linker.h"
 #include "llvm/Support/Error.h"
-#include "llvm/Transforms/Utils/FunctionImportUtils.h"
 using namespace llvm;
 
 namespace {
@@ -31,14 +32,17 @@ class ModuleLinker {
   std::unique_ptr<Module> SrcM;
 
   SetVector<GlobalValue *> ValuesToLink;
-  StringSet<> Internalize;
 
   /// For symbol clashes, prefer those from Src.
   unsigned Flags;
 
-  /// Functions to import from source module, all other functions are
-  /// imported as declarations instead of definitions.
-  DenseSet<const GlobalValue *> *GlobalsToImport;
+  /// List of global value names that should be internalized.
+  StringSet<> Internalize;
+
+  /// Function that will perform the actual internalization. The reason for a
+  /// callback is that the linker cannot call internalizeModule without
+  /// creating a circular dependency between IPO and the linker.
+  std::function<void(Module &, const StringSet<> &)> InternalizeCallback;
 
   /// Used as the callback for lazy linking.
   /// The mover has just hit GV and we have to decide if it, and other members
@@ -46,14 +50,8 @@ class ModuleLinker {
   /// to Add.
   void addLazyFor(GlobalValue &GV, const IRMover::ValueAdder &Add);
 
-  bool shouldLinkReferencedLinkOnce() {
-    return !(Flags & Linker::DontForceLinkLinkonceODR);
-  }
   bool shouldOverrideFromSrc() { return Flags & Linker::OverrideFromSrc; }
   bool shouldLinkOnlyNeeded() { return Flags & Linker::LinkOnlyNeeded; }
-  bool shouldInternalizeLinkedSymbols() {
-    return Flags & Linker::InternalizeLinkedSymbols;
-  }
 
   bool shouldLinkFromSource(bool &LinkFromSrc, const GlobalValue &Dest,
                             const GlobalValue &Src);
@@ -108,31 +106,17 @@ class ModuleLinker {
 
   bool linkIfNeeded(GlobalValue &GV);
 
-  /// Helper method to check if we are importing from the current source
-  /// module.
-  bool isPerformingImport() const { return GlobalsToImport != nullptr; }
-
-  /// If we are importing from the source module, checks if we should
-  /// import SGV as a definition, otherwise import as a declaration.
-  bool doImportAsDefinition(const GlobalValue *SGV);
-
 public:
   ModuleLinker(IRMover &Mover, std::unique_ptr<Module> SrcM, unsigned Flags,
-               DenseSet<const GlobalValue *> *GlobalsToImport = nullptr)
+               std::function<void(Module &, const StringSet<> &)>
+                   InternalizeCallback = {})
       : Mover(Mover), SrcM(std::move(SrcM)), Flags(Flags),
-        GlobalsToImport(GlobalsToImport) {}
+        InternalizeCallback(std::move(InternalizeCallback)) {}
 
   bool run();
 };
 }
 
-bool ModuleLinker::doImportAsDefinition(const GlobalValue *SGV) {
-  if (!isPerformingImport())
-    return false;
-  return FunctionImportGlobalProcessing::doImportAsDefinition(SGV,
-                                                              GlobalsToImport);
-}
-
 static GlobalValue::VisibilityTypes
 getMinVisibility(GlobalValue::VisibilityTypes A,
                  GlobalValue::VisibilityTypes B) {
@@ -266,18 +250,10 @@ bool ModuleLinker::shouldLinkFromSource(bool &LinkFromSrc,
 
   // We always have to add Src if it has appending linkage.
   if (Src.hasAppendingLinkage()) {
-    // Should have prevented importing for appending linkage in linkIfNeeded.
-    assert(!isPerformingImport());
     LinkFromSrc = true;
     return false;
   }
 
-  if (isPerformingImport()) {
-    // LinkFromSrc iff this is a global requested for importing.
-    LinkFromSrc = GlobalsToImport->count(&Src);
-    return false;
-  }
-
   bool SrcIsDeclaration = Src.isDeclarationForLinker();
   bool DestIsDeclaration = Dest.isDeclarationForLinker();
 
@@ -383,19 +359,9 @@ bool ModuleLinker::linkIfNeeded(GlobalValue &GV) {
     GV.setUnnamedAddr(UnnamedAddr);
   }
 
-  // Don't want to append to global_ctors list, for example, when we
-  // are importing for ThinLTO, otherwise the global ctors and dtors
-  // get executed multiple times for local variables (the latter causing
-  // double frees).
-  if (GV.hasAppendingLinkage() && isPerformingImport())
-    return false;
-
-  if (isPerformingImport()) {
-    if (!doImportAsDefinition(&GV))
-      return false;
-  } else if (!DGV && !shouldOverrideFromSrc() &&
-             (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() ||
-              GV.hasAvailableExternallyLinkage()))
+  if (!DGV && !shouldOverrideFromSrc() &&
+      (GV.hasLocalLinkage() || GV.hasLinkOnceLinkage() ||
+       GV.hasAvailableExternallyLinkage()))
     return false;
 
   if (GV.isDeclaration())
@@ -418,17 +384,12 @@ bool ModuleLinker::linkIfNeeded(GlobalValue &GV) {
 }
 
 void ModuleLinker::addLazyFor(GlobalValue &GV, const IRMover::ValueAdder &Add) {
-  if (!shouldLinkReferencedLinkOnce())
-    // For ThinLTO we don't import more than what was required.
-    // The client has to guarantee that the linkonce will be availabe at link
-    // time (by promoting it to weak for instance).
-    return;
-
   // Add these to the internalize list
-  if (!GV.hasLinkOnceLinkage() && !shouldLinkOnlyNeeded())
+  if (!GV.hasLinkOnceLinkage() && !GV.hasAvailableExternallyLinkage() &&
+      !shouldLinkOnlyNeeded())
     return;
 
-  if (shouldInternalizeLinkedSymbols())
+  if (InternalizeCallback)
     Internalize.insert(GV.getName());
   Add(GV);
 
@@ -442,7 +403,7 @@ void ModuleLinker::addLazyFor(GlobalValue &GV, const IRMover::ValueAdder &Add) {
       return;
     if (!LinkFromSrc)
       continue;
-    if (shouldInternalizeLinkedSymbols())
+    if (InternalizeCallback)
       Internalize.insert(GV2->getName());
     Add(*GV2);
   }
@@ -571,7 +532,7 @@ bool ModuleLinker::run() {
     }
   }
 
-  if (shouldInternalizeLinkedSymbols()) {
+  if (InternalizeCallback) {
     for (GlobalValue *GV : ValuesToLink)
       Internalize.insert(GV->getName());
   }
@@ -583,8 +544,7 @@ bool ModuleLinker::run() {
                            [this](GlobalValue &GV, IRMover::ValueAdder Add) {
                              addLazyFor(GV, Add);
                            },
-                           /* LinkModuleInlineAsm */ !isPerformingImport(),
-                           /* IsPerformingImport */ isPerformingImport())) {
+                           /* IsPerformingImport */ false)) {
     handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) {
       DstM.getContext().diagnose(LinkDiagnosticInfo(DS_Error, EIB.message()));
       HasErrors = true;
@@ -593,19 +553,19 @@ bool ModuleLinker::run() {
   if (HasErrors)
     return true;
 
-  for (auto &P : Internalize) {
-    GlobalValue *GV = DstM.getNamedValue(P.first());
-    GV->setLinkage(GlobalValue::InternalLinkage);
-  }
+  if (InternalizeCallback)
+    InternalizeCallback(DstM, Internalize);
 
   return false;
 }
 
 Linker::Linker(Module &M) : Mover(M) {}
 
-bool Linker::linkInModule(std::unique_ptr<Module> Src, unsigned Flags,
-                          DenseSet<const GlobalValue *> *GlobalsToImport) {
-  ModuleLinker ModLinker(Mover, std::move(Src), Flags, GlobalsToImport);
+bool Linker::linkInModule(
+    std::unique_ptr<Module> Src, unsigned Flags,
+    std::function<void(Module &, const StringSet<> &)> InternalizeCallback) {
+  ModuleLinker ModLinker(Mover, std::move(Src), Flags,
+                         std::move(InternalizeCallback));
   return ModLinker.run();
 }
 
@@ -618,10 +578,11 @@ bool Linker::linkInModule(std::unique_ptr<Module> Src, unsigned Flags,
 /// true is returned and ErrorMsg (if not null) is set to indicate the problem.
 /// Upon failure, the Dest module could be in a modified state, and shouldn't be
 /// relied on to be consistent.
-bool Linker::linkModules(Module &Dest, std::unique_ptr<Module> Src,
-                         unsigned Flags) {
+bool Linker::linkModules(
+    Module &Dest, std::unique_ptr<Module> Src, unsigned Flags,
+    std::function<void(Module &, const StringSet<> &)> InternalizeCallback) {
   Linker L(Dest);
-  return L.linkInModule(std::move(Src), Flags);
+  return L.linkInModule(std::move(Src), Flags, std::move(InternalizeCallback));
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/MC/CMakeLists.txt b/lib/MC/CMakeLists.txt
index 2f1b39e58e33fa4a07ed4de97ce097989e631ce9..a86fd383003dacb6aecb26c21d047a89fbd3576c 100644
--- a/lib/MC/CMakeLists.txt
+++ b/lib/MC/CMakeLists.txt
@@ -6,6 +6,7 @@ add_llvm_library(LLVMMC
   MCAsmInfoCOFF.cpp
   MCAsmInfoDarwin.cpp
   MCAsmInfoELF.cpp
+  MCAsmInfoWasm.cpp
   MCAsmStreamer.cpp
   MCAssembler.cpp
   MCCodeEmitter.cpp
@@ -34,17 +35,21 @@ add_llvm_library(LLVMMC
   MCSectionCOFF.cpp
   MCSectionELF.cpp
   MCSectionMachO.cpp
+  MCSectionWasm.cpp
   MCStreamer.cpp
   MCSubtargetInfo.cpp
   MCSymbol.cpp
   MCSymbolELF.cpp
   MCTargetOptions.cpp
   MCValue.cpp
+  MCWasmObjectTargetWriter.cpp
+  MCWasmStreamer.cpp
   MCWin64EH.cpp
   MCWinEH.cpp
   MachObjectWriter.cpp
   StringTableBuilder.cpp
   SubtargetFeature.cpp
+  WasmObjectWriter.cpp
   WinCOFFObjectWriter.cpp
   WinCOFFStreamer.cpp
 
diff --git a/lib/MC/ConstantPools.cpp b/lib/MC/ConstantPools.cpp
index 9608c2c656b7c0a8162fa0f1235c7044792473da..8c94e2780998f1851048b1f9bf620669e6a802de 100644
--- a/lib/MC/ConstantPools.cpp
+++ b/lib/MC/ConstantPools.cpp
@@ -1,4 +1,4 @@
-//===- ConstantPools.cpp - ConstantPool class --*- C++ -*---------===//
+//===- ConstantPools.cpp - ConstantPool class -----------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,13 +10,16 @@
 // This file implements the ConstantPool and  AssemblerConstantPools classes.
 //
 //===----------------------------------------------------------------------===//
-#include "llvm/ADT/MapVector.h"
+
 #include "llvm/MC/ConstantPools.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Casting.h"
 
 using namespace llvm;
+
 //
 // ConstantPool implementation
 //
diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp
index 0e02cdb4ca034fdd87e65bbef0601b2e2f18c7d3..ee9c25cda94fd89815e83328fb1419786eb203b3 100644
--- a/lib/MC/ELFObjectWriter.cpp
+++ b/lib/MC/ELFObjectWriter.cpp
@@ -11,30 +11,49 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallString.h"
-#include "llvm/ADT/StringMap.h"
-#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
 #include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCSymbolELF.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/MC/StringTableBuilder.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/Compression.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/SMLoc.h"
 #include "llvm/Support/StringSaver.h"
+#include "llvm/Support/SwapByteOrder.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
 #include <vector>
 
 using namespace llvm;
@@ -43,6 +62,7 @@ using namespace llvm;
 #define DEBUG_TYPE "reloc-info"
 
 namespace {
+
 typedef DenseMap<const MCSectionELF *, uint32_t> SectionIndexMapTy;
 
 class ELFObjectWriter;
@@ -100,8 +120,7 @@ class ELFObjectWriter : public MCObjectWriter {
 
   DenseMap<const MCSymbolELF *, const MCSymbolELF *> Renames;
 
-  llvm::DenseMap<const MCSectionELF *, std::vector<ELFRelocationEntry>>
-      Relocations;
+  DenseMap<const MCSectionELF *, std::vector<ELFRelocationEntry>> Relocations;
 
   /// @}
   /// @name Symbol Table Data
@@ -145,6 +164,8 @@ public:
                   bool IsLittleEndian)
       : MCObjectWriter(OS, IsLittleEndian), TargetObjectWriter(MOTW) {}
 
+  ~ELFObjectWriter() override = default;
+
   void reset() override {
     Renames.clear();
     Relocations.clear();
@@ -153,8 +174,6 @@ public:
     MCObjectWriter::reset();
   }
 
-  ~ELFObjectWriter() override;
-
   void WriteWord(uint64_t W) {
     if (is64Bit())
       write64(W);
@@ -223,18 +242,18 @@ public:
 
   void writeRelocations(const MCAssembler &Asm, const MCSectionELF &Sec);
 
+  using MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl;
   bool isSymbolRefDifferenceFullyResolvedImpl(const MCAssembler &Asm,
                                               const MCSymbol &SymA,
                                               const MCFragment &FB, bool InSet,
                                               bool IsPCRel) const override;
 
-  bool isWeak(const MCSymbol &Sym) const override;
-
   void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
   void writeSection(const SectionIndexMapTy &SectionIndexMap,
                     uint32_t GroupSymbolIndex, uint64_t Offset, uint64_t Size,
                     const MCSectionELF &Section);
 };
+
 } // end anonymous namespace
 
 void ELFObjectWriter::align(unsigned Alignment) {
@@ -298,9 +317,6 @@ void SymbolTableWriter::writeSymbol(uint32_t name, uint8_t info, uint64_t value,
   ++NumWritten;
 }
 
-ELFObjectWriter::~ELFObjectWriter()
-{}
-
 // Emit the ELF header.
 void ELFObjectWriter::writeHeader(const MCAssembler &Asm) {
   // ELF Header
@@ -371,22 +387,6 @@ uint64_t ELFObjectWriter::SymbolValue(const MCSymbol &Sym,
 
 void ELFObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
                                                const MCAsmLayout &Layout) {
-  // Section symbols are used as definitions for undefined symbols with matching
-  // names. If there are multiple sections with the same name, the first one is
-  // used.
-  for (const MCSection &Sec : Asm) {
-    const MCSymbol *Begin = Sec.getBeginSymbol();
-    if (!Begin)
-      continue;
-
-    const MCSymbol *Alias = Asm.getContext().lookupSymbol(Begin->getName());
-    if (!Alias || !Alias->isUndefined())
-      continue;
-
-    Renames.insert(
-        std::make_pair(cast<MCSymbolELF>(Alias), cast<MCSymbolELF>(Begin)));
-  }
-
   // The presence of symbol versions causes undefined symbols and
   // versions declared with @@@ to be renamed.
   for (const MCSymbol &A : Asm.symbols()) {
@@ -901,6 +901,8 @@ void ELFObjectWriter::computeSymbolTable(
 
   StrTabBuilder.finalize();
 
+  // File symbols are emitted first and handled separately from normal symbols,
+  // i.e. a non-STT_FILE symbol with the same name may appear.
   for (const std::string &Name : FileNames)
     Writer.writeSymbol(StrTabBuilder.getOffset(Name),
                        ELF::STT_FILE | ELF::STB_LOCAL, 0, 0, ELF::STV_DEFAULT,
@@ -1152,8 +1154,8 @@ void ELFObjectWriter::writeSection(const SectionIndexMapTy &SectionIndexMap,
   case ELF::SHT_RELA: {
     sh_link = SymbolTableIndex;
     assert(sh_link && ".symtab not found");
-    const MCSectionELF *InfoSection = Section.getAssociatedSection();
-    sh_info = SectionIndexMap.lookup(InfoSection);
+    const MCSection *InfoSection = Section.getAssociatedSection();
+    sh_info = SectionIndexMap.lookup(cast<MCSectionELF>(InfoSection));
     break;
   }
 
@@ -1173,9 +1175,11 @@ void ELFObjectWriter::writeSection(const SectionIndexMapTy &SectionIndexMap,
     break;
   }
 
-  if (TargetObjectWriter->getEMachine() == ELF::EM_ARM &&
-      Section.getType() == ELF::SHT_ARM_EXIDX)
-    sh_link = SectionIndexMap.lookup(Section.getAssociatedSection());
+  if (Section.getFlags() & ELF::SHF_LINK_ORDER) {
+    const MCSymbol *Sym = Section.getAssociatedSymbol();
+    const MCSectionELF *Sec = cast<MCSectionELF>(&Sym->getSection());
+    sh_link = SectionIndexMap.lookup(Sec);
+  }
 
   WriteSecHdrEntry(StrTabBuilder.getOffset(Section.getSectionName()),
                    Section.getType(), Section.getFlags(), 0, Offset, Size,
@@ -1299,7 +1303,8 @@ void ELFObjectWriter::writeObject(MCAssembler &Asm,
     // Remember the offset into the file for this section.
     uint64_t SecStart = getStream().tell();
 
-    writeRelocations(Asm, *RelSection->getAssociatedSection());
+    writeRelocations(Asm,
+                     cast<MCSectionELF>(*RelSection->getAssociatedSection()));
 
     uint64_t SecEnd = getStream().tell();
     SectionOffsets[RelSection] = std::make_pair(SecStart, SecEnd);
@@ -1352,34 +1357,13 @@ bool ELFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
   const auto &SymA = cast<MCSymbolELF>(SA);
   if (IsPCRel) {
     assert(!InSet);
-    if (::isWeak(SymA))
+    if (isWeak(SymA))
       return false;
   }
   return MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(Asm, SymA, FB,
                                                                 InSet, IsPCRel);
 }
 
-bool ELFObjectWriter::isWeak(const MCSymbol &S) const {
-  const auto &Sym = cast<MCSymbolELF>(S);
-  if (::isWeak(Sym))
-    return true;
-
-  // It is invalid to replace a reference to a global in a comdat
-  // with a reference to a local since out of comdat references
-  // to a local are forbidden.
-  // We could try to return false for more cases, like the reference
-  // being in the same comdat or Sym being an alias to another global,
-  // but it is not clear if it is worth the effort.
-  if (Sym.getBinding() != ELF::STB_GLOBAL)
-    return false;
-
-  if (!Sym.isInSection())
-    return false;
-
-  const auto &Sec = cast<MCSectionELF>(Sym.getSection());
-  return Sec.getGroup();
-}
-
 MCObjectWriter *llvm::createELFObjectWriter(MCELFObjectTargetWriter *MOTW,
                                             raw_pwrite_stream &OS,
                                             bool IsLittleEndian) {
diff --git a/lib/MC/MCAsmBackend.cpp b/lib/MC/MCAsmBackend.cpp
index 570f764f664269e3504f776935763aeab054277f..fc0aa788f6d3a88e02c4bf650bd0bb6116dd833c 100644
--- a/lib/MC/MCAsmBackend.cpp
+++ b/lib/MC/MCAsmBackend.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmBackend.cpp - Target MC Assembly Backend ----------------------==//
+//===- MCAsmBackend.cpp - Target MC Assembly Backend ----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,14 +7,19 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/ADT/None.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCFixupKindInfo.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+
 using namespace llvm;
 
-MCAsmBackend::MCAsmBackend() {}
+MCAsmBackend::MCAsmBackend() = default;
 
-MCAsmBackend::~MCAsmBackend() {}
+MCAsmBackend::~MCAsmBackend() = default;
 
 Optional<MCFixupKind> MCAsmBackend::getFixupKind(StringRef Name) const {
   return None;
diff --git a/lib/MC/MCAsmInfo.cpp b/lib/MC/MCAsmInfo.cpp
index 3eb8f50de5a8a54af8f21c3f972746b697e0e681..b9be685cedc4139dc2d214bd69b98b6baf7a29f6 100644
--- a/lib/MC/MCAsmInfo.cpp
+++ b/lib/MC/MCAsmInfo.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmInfo.cpp - Asm Info -------------------------------------------==//
+//===- MCAsmInfo.cpp - Asm Info -------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -16,29 +16,14 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/DataTypes.h"
 #include "llvm/Support/Dwarf.h"
-#include <cctype>
-#include <cstring>
+
 using namespace llvm;
 
 MCAsmInfo::MCAsmInfo() {
-  PointerSize = 4;
-  CalleeSaveStackSlotSize = 4;
-
-  IsLittleEndian = true;
-  StackGrowsUp = false;
-  HasSubsectionsViaSymbols = false;
-  HasMachoZeroFillDirective = false;
-  HasMachoTBSSDirective = false;
-  MaxInstLength = 4;
-  MinInstAlignment = 1;
-  DollarIsPC = false;
   SeparatorString = ";";
   CommentString = "#";
   LabelSuffix = ":";
-  UseAssignmentForEHBegin = false;
-  NeedsLocalForSize = false;
   PrivateGlobalPrefix = "L";
   PrivateLabelPrefix = PrivateGlobalPrefix;
   LinkerPrivateGlobalPrefix = "";
@@ -47,10 +32,6 @@ MCAsmInfo::MCAsmInfo() {
   Code16Directive = ".code16";
   Code32Directive = ".code32";
   Code64Directive = ".code64";
-  AssemblerDialect = 0;
-  AllowAtInName = false;
-  SupportsQuotedNames = true;
-  UseDataRegionDirectives = false;
   ZeroDirective = "\t.zero\t";
   AsciiDirective = "\t.ascii\t";
   AscizDirective = "\t.asciz\t";
@@ -58,40 +39,8 @@ MCAsmInfo::MCAsmInfo() {
   Data16bitsDirective = "\t.short\t";
   Data32bitsDirective = "\t.long\t";
   Data64bitsDirective = "\t.quad\t";
-  SunStyleELFSectionSwitchSyntax = false;
-  UsesELFSectionDirectiveForBSS = false;
-  AlignmentIsInBytes = true;
-  TextAlignFillValue = 0;
-  GPRel64Directive = nullptr;
-  GPRel32Directive = nullptr;
   GlobalDirective = "\t.globl\t";
-  SetDirectiveSuppressesReloc = false;
-  HasAggressiveSymbolFolding = true;
-  COMMDirectiveAlignmentIsInBytes = true;
-  LCOMMDirectiveAlignmentType = LCOMM::NoAlignment;
-  HasFunctionAlignment = true;
-  HasDotTypeDotSizeDirective = true;
-  HasSingleParameterDotFile = true;
-  HasIdentDirective = false;
-  HasNoDeadStrip = false;
-  HasAltEntry = false;
   WeakDirective = "\t.weak\t";
-  WeakRefDirective = nullptr;
-  HasWeakDefDirective = false;
-  HasWeakDefCanBeHiddenDirective = false;
-  HasLinkOnceDirective = false;
-  HiddenVisibilityAttr = MCSA_Hidden;
-  HiddenDeclarationVisibilityAttr = MCSA_Hidden;
-  ProtectedVisibilityAttr = MCSA_Protected;
-  SupportsDebugInformation = false;
-  ExceptionsType = ExceptionHandling::None;
-  WinEHEncodingType = WinEH::EncodingType::Invalid;
-  DwarfUsesRelocationsAcrossSections = true;
-  DwarfFDESymbolsUseAbsDiff = false;
-  DwarfRegNumForCFI = false;
-  NeedsDwarfSectionOffsetDirective = false;
-  UseParensForSymbolVariant = false;
-  UseLogicalShr = true;
 
   // FIXME: Clang's logic should be synced with the logic used to initialize
   //        this member and the two implementations should be merged.
@@ -107,12 +56,9 @@ MCAsmInfo::MCAsmInfo() {
   //   - The target subclasses for AArch64, ARM, and X86 handle these cases
   UseIntegratedAssembler = false;
   PreserveAsmComments = true;
-
-  CompressDebugSections = DebugCompressionType::DCT_None;
 }
 
-MCAsmInfo::~MCAsmInfo() {
-}
+MCAsmInfo::~MCAsmInfo() = default;
 
 bool MCAsmInfo::isSectionAtomizableBySymbols(const MCSection &Section) const {
   return false;
diff --git a/lib/MC/MCAsmInfoCOFF.cpp b/lib/MC/MCAsmInfoCOFF.cpp
index 5b9dd2009f8b2eed2ac0863f1794b447372f9d17..85104484fd40186e7f293fe3ae995d84ade45654 100644
--- a/lib/MC/MCAsmInfoCOFF.cpp
+++ b/lib/MC/MCAsmInfoCOFF.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmInfoCOFF.cpp - COFF asm properties -----------------*- C++ -*-===//
+//===- MCAsmInfoCOFF.cpp - COFF asm properties ----------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -13,9 +13,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/MCAsmInfoCOFF.h"
+#include "llvm/MC/MCDirectives.h"
+
 using namespace llvm;
 
-void MCAsmInfoCOFF::anchor() { }
+void MCAsmInfoCOFF::anchor() {}
 
 MCAsmInfoCOFF::MCAsmInfoCOFF() {
   // MingW 4.5 and later support .comm with log2 alignment, but .lcomm uses byte
@@ -41,13 +43,10 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() {
   UseLogicalShr = false;
 }
 
-void MCAsmInfoMicrosoft::anchor() { }
-
-MCAsmInfoMicrosoft::MCAsmInfoMicrosoft() {
-}
+void MCAsmInfoMicrosoft::anchor() {}
 
-void MCAsmInfoGNUCOFF::anchor() { }
+MCAsmInfoMicrosoft::MCAsmInfoMicrosoft() = default;
 
-MCAsmInfoGNUCOFF::MCAsmInfoGNUCOFF() {
+void MCAsmInfoGNUCOFF::anchor() {}
 
-}
+MCAsmInfoGNUCOFF::MCAsmInfoGNUCOFF() = default;
diff --git a/lib/MC/MCAsmInfoDarwin.cpp b/lib/MC/MCAsmInfoDarwin.cpp
index e95cf488cd3089936e880c593be27ea16989a760..4b2001764e972b0515971c61716d7c136bb1516d 100644
--- a/lib/MC/MCAsmInfoDarwin.cpp
+++ b/lib/MC/MCAsmInfoDarwin.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmInfoDarwin.cpp - Darwin asm properties -------------*- C++ -*-===//
+//===- MCAsmInfoDarwin.cpp - Darwin asm properties ------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -13,9 +13,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/MCAsmInfoDarwin.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCSectionMachO.h"
+#include "llvm/Support/MachO.h"
+
 using namespace llvm;
 
 bool MCAsmInfoDarwin::isSectionAtomizableBySymbols(
diff --git a/lib/MC/MCAsmInfoELF.cpp b/lib/MC/MCAsmInfoELF.cpp
index 26e5608d87333e2e891d91aeb6bf0a8222dc472b..e44c08b50d766b4c9b9ca1638c41da60887daa20 100644
--- a/lib/MC/MCAsmInfoELF.cpp
+++ b/lib/MC/MCAsmInfoELF.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmInfoELF.cpp - ELF asm properties -------------------*- C++ -*-===//
+//===- MCAsmInfoELF.cpp - ELF asm properties ------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -16,9 +16,10 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/Support/ELF.h"
+
 using namespace llvm;
 
-void MCAsmInfoELF::anchor() { }
+void MCAsmInfoELF::anchor() {}
 
 MCSection *MCAsmInfoELF::getNonexecutableStackSection(MCContext &Ctx) const {
   if (!UsesNonexecutableStackSection)
@@ -31,5 +32,4 @@ MCAsmInfoELF::MCAsmInfoELF() {
   WeakRefDirective = "\t.weak\t";
   PrivateGlobalPrefix = ".L";
   PrivateLabelPrefix = ".L";
-  UsesNonexecutableStackSection = true;
 }
diff --git a/lib/MC/MCAsmInfoWasm.cpp b/lib/MC/MCAsmInfoWasm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..aa26616dda363e6cc79aaac6550fe194222428a5
--- /dev/null
+++ b/lib/MC/MCAsmInfoWasm.cpp
@@ -0,0 +1,27 @@
+//===- MCAsmInfoWasm.cpp - Wasm asm properties ----------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines target asm properties related to what form asm statements
+// should take in general on Wasm-based targets.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCAsmInfoWasm.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionWasm.h"
+using namespace llvm;
+
+void MCAsmInfoWasm::anchor() { }
+
+MCAsmInfoWasm::MCAsmInfoWasm() {
+  HasIdentDirective = true;
+  WeakRefDirective = "\t.weak\t";
+  PrivateGlobalPrefix = ".L";
+  PrivateLabelPrefix = ".L";
+}
diff --git a/lib/MC/MCAsmStreamer.cpp b/lib/MC/MCAsmStreamer.cpp
index 2eae26bdc0c09c1080026e793984d6ab52cce447..92dcf535ec996a32a02250f5d2a86da1eb853c6c 100644
--- a/lib/MC/MCAsmStreamer.cpp
+++ b/lib/MC/MCAsmStreamer.cpp
@@ -130,7 +130,7 @@ public:
   void ChangeSection(MCSection *Section, const MCExpr *Subsection) override;
 
   void EmitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) override;
-  void EmitLabel(MCSymbol *Symbol) override;
+  void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
 
   void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
   void EmitLinkerOptions(ArrayRef<std::string> Options) override;
@@ -397,9 +397,8 @@ void MCAsmStreamer::ChangeSection(MCSection *Section,
       Subsection);
 }
 
-void MCAsmStreamer::EmitLabel(MCSymbol *Symbol) {
-  assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
-  MCStreamer::EmitLabel(Symbol);
+void MCAsmStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
+  MCStreamer::EmitLabel(Symbol, Loc);
 
   Symbol->print(OS, MAI);
   OS << MAI->getLabelSuffix();
diff --git a/lib/MC/MCAssembler.cpp b/lib/MC/MCAssembler.cpp
index 17a0edcc95450a4f5551aed5c5e00bb7db844995..c2bb7b2771814295bde62d3eed227bdb69dd34a4 100644
--- a/lib/MC/MCAssembler.cpp
+++ b/lib/MC/MCAssembler.cpp
@@ -7,36 +7,49 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCAssembler.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCCodeView.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
 #include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCFragment.h"
+#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/LEB128.h"
-#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cstring>
+#include <cassert>
+#include <cstdint>
 #include <tuple>
+#include <utility>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "assembler"
 
 namespace {
 namespace stats {
+
 STATISTIC(EmittedFragments, "Number of emitted assembler fragments - total");
 STATISTIC(EmittedRelaxableFragments,
           "Number of emitted assembler fragments - relaxable");
@@ -55,8 +68,9 @@ STATISTIC(FragmentLayouts, "Number of fragment layouts");
 STATISTIC(ObjectBytes, "Number of emitted object file bytes");
 STATISTIC(RelaxationSteps, "Number of assembler layout and relaxation steps");
 STATISTIC(RelaxedInstructions, "Number of relaxed instructions");
-}
-}
+
+} // end namespace stats
+} // end anonymous namespace
 
 // FIXME FIXME FIXME: There are number of places in this file where we convert
 // what is a 64-bit assembler value used for computation into a value in the
@@ -73,8 +87,7 @@ MCAssembler::MCAssembler(MCContext &Context, MCAsmBackend &Backend,
   VersionMinInfo.Major = 0; // Major version == 0 for "none specified"
 }
 
-MCAssembler::~MCAssembler() {
-}
+MCAssembler::~MCAssembler() = default;
 
 void MCAssembler::reset() {
   Sections.clear();
@@ -225,7 +238,6 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout,
       Value -= Layout.getSymbolOffset(Sym);
   }
 
-
   bool ShouldAlignPC = Backend.getFixupKindInfo(Fixup.getKind()).Flags &
                          MCFixupKindInfo::FKF_IsAlignedDownTo32Bits;
   assert((ShouldAlignPC ? IsPCRel : true) &&
@@ -647,7 +659,7 @@ std::pair<uint64_t, bool> MCAssembler::handleFixup(const MCAsmLayout &Layout,
 
 void MCAssembler::layout(MCAsmLayout &Layout) {
   DEBUG_WITH_TYPE("mc-dump", {
-      llvm::errs() << "assembler backend - pre-layout\n--\n";
+      errs() << "assembler backend - pre-layout\n--\n";
       dump(); });
 
   // Create dummy fragments and assign section ordinals.
@@ -677,14 +689,14 @@ void MCAssembler::layout(MCAsmLayout &Layout) {
       return;
 
   DEBUG_WITH_TYPE("mc-dump", {
-      llvm::errs() << "assembler backend - post-relaxation\n--\n";
+      errs() << "assembler backend - post-relaxation\n--\n";
       dump(); });
 
   // Finalize the layout, including fragment lowering.
   finishLayout(Layout);
 
   DEBUG_WITH_TYPE("mc-dump", {
-      llvm::errs() << "assembler backend - final-layout\n--\n";
+      errs() << "assembler backend - final-layout\n--\n";
       dump(); });
 
   // Allow the object writer a chance to perform post-layout binding (for
@@ -720,8 +732,8 @@ void MCAssembler::layout(MCAsmLayout &Layout) {
         uint64_t FixedValue;
         bool IsPCRel;
         std::tie(FixedValue, IsPCRel) = handleFixup(Layout, Frag, Fixup);
-        getBackend().applyFixup(Fixup, Contents.data(),
-                                Contents.size(), FixedValue, IsPCRel);
+        getBackend().applyFixup(Fixup, Contents.data(), Contents.size(),
+                                FixedValue, IsPCRel, getContext());
       }
     }
   }
@@ -747,6 +759,10 @@ bool MCAssembler::fixupNeedsRelaxation(const MCFixup &Fixup,
   MCValue Target;
   uint64_t Value;
   bool Resolved = evaluateFixup(Layout, Fixup, DF, Target, Value);
+  if (Target.getSymA() &&
+      Target.getSymA()->getKind() == MCSymbolRefExpr::VK_X86_ABS8 &&
+      Fixup.getKind() == FK_Data_1)
+    return false;
   return getBackend().fixupNeedsRelaxationAdvanced(Fixup, Resolved, Value, DF,
                                                    Layout);
 }
diff --git a/lib/MC/MCCodeEmitter.cpp b/lib/MC/MCCodeEmitter.cpp
index c122763b2fe596a550d3982863450333b0995917..ca69478ed10d3f3287d9de47ba3fe2d2e1b4974d 100644
--- a/lib/MC/MCCodeEmitter.cpp
+++ b/lib/MC/MCCodeEmitter.cpp
@@ -1,4 +1,4 @@
-//===-- MCCodeEmitter.cpp - Instruction Encoding --------------------------===//
+//===- MCCodeEmitter.cpp - Instruction Encoding ---------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,8 +11,6 @@
 
 using namespace llvm;
 
-MCCodeEmitter::MCCodeEmitter() {
-}
+MCCodeEmitter::MCCodeEmitter() = default;
 
-MCCodeEmitter::~MCCodeEmitter() {
-}
+MCCodeEmitter::~MCCodeEmitter() = default;
diff --git a/lib/MC/MCContext.cpp b/lib/MC/MCContext.cpp
index 4798991ceed6f4ba3e61c4122d4f47c0822b4cfa..4628d0ab88f30d3d0b8c3f8f5acf6327eca04b57 100644
--- a/lib/MC/MCContext.cpp
+++ b/lib/MC/MCContext.cpp
@@ -7,30 +7,43 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCContext.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCCodeView.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFragment.h"
 #include "llvm/MC/MCLabel.h"
 #include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSectionWasm.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCSymbolCOFF.h"
 #include "llvm/MC/MCSymbolELF.h"
 #include "llvm/MC/MCSymbolMachO.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/COFF.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/Signals.h"
 #include "llvm/Support/SourceMgr.h"
+#include <cassert>
+#include <cstdlib>
+#include <tuple>
+#include <utility>
 
 using namespace llvm;
 
@@ -40,19 +53,14 @@ AsSecureLogFileName("as-secure-log-file-name",
                  "AS_SECURE_LOG_FILE env variable)"),
         cl::init(getenv("AS_SECURE_LOG_FILE")), cl::Hidden);
 
-
 MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri,
                      const MCObjectFileInfo *mofi, const SourceMgr *mgr,
                      bool DoAutoReset)
-    : SrcMgr(mgr), MAI(mai), MRI(mri), MOFI(mofi), Allocator(),
+    : SrcMgr(mgr), InlineSrcMgr(nullptr), MAI(mai), MRI(mri), MOFI(mofi),
       Symbols(Allocator), UsedNames(Allocator),
-      CurrentDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0), DwarfLocSeen(false),
-      GenDwarfForAssembly(false), GenDwarfFileNumber(0), DwarfVersion(4),
-      AllowTemporaryLabels(true), DwarfCompileUnitID(0),
-      AutoReset(DoAutoReset), HadError(false) {
+      CurrentDwarfLoc(0, 0, 0, DWARF2_FLAG_IS_STMT, 0, 0),
+      AutoReset(DoAutoReset) {
   SecureLogFile = AsSecureLogFileName;
-  SecureLog = nullptr;
-  SecureLogUsed = false;
 
   if (SrcMgr && SrcMgr->getNumBuffers())
     MainFileName =
@@ -80,7 +88,6 @@ void MCContext::reset() {
   MCSubtargetAllocator.DestroyAll();
   UsedNames.clear();
   Symbols.clear();
-  SectionSymbols.clear();
   Allocator.Reset();
   Instances.clear();
   CompilationDir.clear();
@@ -124,18 +131,6 @@ MCSymbol *MCContext::getOrCreateSymbol(const Twine &Name) {
   return Sym;
 }
 
-MCSymbolELF *MCContext::getOrCreateSectionSymbol(const MCSectionELF &Section) {
-  MCSymbol *&Sym = SectionSymbols[&Section];
-  if (Sym)
-    return cast<MCSymbolELF>(Sym);
-
-  StringRef Name = Section.getSectionName();
-  auto NameIter = UsedNames.insert(std::make_pair(Name, false)).first;
-  Sym = new (&*NameIter, *this) MCSymbolELF(&*NameIter, /*isTemporary*/ false);
-
-  return cast<MCSymbolELF>(Sym);
-}
-
 MCSymbol *MCContext::getOrCreateFrameAllocSymbol(StringRef FuncName,
                                                  unsigned Idx) {
   return getOrCreateSymbol(Twine(MAI->getPrivateGlobalPrefix()) + FuncName +
@@ -162,6 +157,8 @@ MCSymbol *MCContext::createSymbolImpl(const StringMapEntry<bool> *Name,
       return new (Name, *this) MCSymbolELF(Name, IsTemporary);
     case MCObjectFileInfo::IsMachO:
       return new (Name, *this) MCSymbolMachO(Name, IsTemporary);
+    case MCObjectFileInfo::IsWasm:
+      return new (Name, *this) MCSymbolWasm(Name, IsTemporary);
     }
   }
   return new (Name, *this) MCSymbol(MCSymbol::SymbolKindUnset, Name,
@@ -182,7 +179,7 @@ MCSymbol *MCContext::createSymbol(StringRef Name, bool AlwaysAddSuffix,
   SmallString<128> NewName = Name;
   bool AddSuffix = AlwaysAddSuffix;
   unsigned &NextUniqueID = NextID[Name];
-  for (;;) {
+  while (true) {
     if (AddSuffix) {
       NewName.resize(Name.size());
       raw_svector_ostream(NewName) << NextUniqueID++;
@@ -275,7 +272,6 @@ MCSectionMachO *MCContext::getMachOSection(StringRef Segment, StringRef Section,
                                            unsigned TypeAndAttributes,
                                            unsigned Reserved2, SectionKind Kind,
                                            const char *BeginSymName) {
-
   // We unique sections by their segment/section pair.  The returned section
   // may not have the same flags as the requested section, if so this should be
   // diagnosed by the client as an error.
@@ -316,18 +312,53 @@ void MCContext::renameELFSection(MCSectionELF *Section, StringRef Name) {
   const_cast<MCSectionELF *>(Section)->setSectionName(CachedName);
 }
 
+MCSectionELF *MCContext::createELFSectionImpl(StringRef Section, unsigned Type,
+                                              unsigned Flags, SectionKind K,
+                                              unsigned EntrySize,
+                                              const MCSymbolELF *Group,
+                                              unsigned UniqueID,
+                                              const MCSymbolELF *Associated) {
+  MCSymbolELF *R;
+  MCSymbol *&Sym = Symbols[Section];
+  // A section symbol cannot redefine regular symbols. There may be multiple
+  // sections with the same name, in which case the first such section wins.
+  if (Sym && Sym->isDefined() &&
+      (!Sym->isInSection() || Sym->getSection().getBeginSymbol() != Sym))
+    reportError(SMLoc(), "invalid symbol redefinition");
+  if (Sym && Sym->isUndefined()) {
+    R = cast<MCSymbolELF>(Sym);
+  } else {
+    auto NameIter = UsedNames.insert(std::make_pair(Section, false)).first;
+    R = new (&*NameIter, *this) MCSymbolELF(&*NameIter, /*isTemporary*/ false);
+    if (!Sym)
+      Sym = R;
+  }
+  R->setBinding(ELF::STB_LOCAL);
+  R->setType(ELF::STT_SECTION);
+
+  auto *Ret = new (ELFAllocator.Allocate()) MCSectionELF(
+      Section, Type, Flags, K, EntrySize, Group, UniqueID, R, Associated);
+
+  auto *F = new MCDataFragment();
+  Ret->getFragmentList().insert(Ret->begin(), F);
+  F->setParent(Ret);
+  R->setFragment(F);
+
+  return Ret;
+}
+
 MCSectionELF *MCContext::createELFRelSection(const Twine &Name, unsigned Type,
                                              unsigned Flags, unsigned EntrySize,
                                              const MCSymbolELF *Group,
-                                             const MCSectionELF *Associated) {
+                                             const MCSectionELF *RelInfoSection) {
   StringMap<bool>::iterator I;
   bool Inserted;
   std::tie(I, Inserted) =
-      ELFRelSecNames.insert(std::make_pair(Name.str(), true));
+      RelSecNames.insert(std::make_pair(Name.str(), true));
 
-  return new (ELFAllocator.Allocate())
-      MCSectionELF(I->getKey(), Type, Flags, SectionKind::getReadOnly(),
-                   EntrySize, Group, true, nullptr, Associated);
+  return createELFSectionImpl(
+      I->getKey(), Type, Flags, SectionKind::getReadOnly(), EntrySize, Group,
+      true, cast<MCSymbolELF>(RelInfoSection->getBeginSymbol()));
 }
 
 MCSectionELF *MCContext::getELFNamedSection(const Twine &Prefix,
@@ -340,21 +371,20 @@ MCSectionELF *MCContext::getELFNamedSection(const Twine &Prefix,
 MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
                                        unsigned Flags, unsigned EntrySize,
                                        const Twine &Group, unsigned UniqueID,
-                                       const char *BeginSymName) {
+                                       const MCSymbolELF *Associated) {
   MCSymbolELF *GroupSym = nullptr;
   if (!Group.isTriviallyEmpty() && !Group.str().empty())
     GroupSym = cast<MCSymbolELF>(getOrCreateSymbol(Group));
 
   return getELFSection(Section, Type, Flags, EntrySize, GroupSym, UniqueID,
-                       BeginSymName, nullptr);
+                       Associated);
 }
 
 MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
                                        unsigned Flags, unsigned EntrySize,
                                        const MCSymbolELF *GroupSym,
                                        unsigned UniqueID,
-                                       const char *BeginSymName,
-                                       const MCSectionELF *Associated) {
+                                       const MCSymbolELF *Associated) {
   StringRef Group = "";
   if (GroupSym)
     Group = GroupSym->getName();
@@ -375,22 +405,16 @@ MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
   else
     Kind = SectionKind::getReadOnly();
 
-  MCSymbol *Begin = nullptr;
-  if (BeginSymName)
-    Begin = createTempSymbol(BeginSymName, false);
-
-  MCSectionELF *Result = new (ELFAllocator.Allocate())
-      MCSectionELF(CachedName, Type, Flags, Kind, EntrySize, GroupSym, UniqueID,
-                   Begin, Associated);
+  MCSectionELF *Result = createELFSectionImpl(
+      CachedName, Type, Flags, Kind, EntrySize, GroupSym, UniqueID, Associated);
   Entry.second = Result;
   return Result;
 }
 
 MCSectionELF *MCContext::createELFGroupSection(const MCSymbolELF *Group) {
-  MCSectionELF *Result = new (ELFAllocator.Allocate())
-      MCSectionELF(".group", ELF::SHT_GROUP, 0, SectionKind::getReadOnly(), 4,
-                   Group, ~0, nullptr, nullptr);
-  return Result;
+  return createELFSectionImpl(".group", ELF::SHT_GROUP, 0,
+                              SectionKind::getReadOnly(), 4, Group, ~0,
+                              nullptr);
 }
 
 MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
@@ -462,6 +486,80 @@ MCSectionCOFF *MCContext::getAssociativeCOFFSection(MCSectionCOFF *Sec,
                         "", 0, UniqueID);
 }
 
+void MCContext::renameWasmSection(MCSectionWasm *Section, StringRef Name) {
+  StringRef GroupName;
+  assert(!Section->getGroup() && "not yet implemented");
+
+  unsigned UniqueID = Section->getUniqueID();
+  WasmUniquingMap.erase(
+      WasmSectionKey{Section->getSectionName(), GroupName, UniqueID});
+  auto I = WasmUniquingMap.insert(std::make_pair(
+                                     WasmSectionKey{Name, GroupName, UniqueID},
+                                     Section))
+               .first;
+  StringRef CachedName = I->first.SectionName;
+  const_cast<MCSectionWasm *>(Section)->setSectionName(CachedName);
+}
+
+MCSectionWasm *MCContext::createWasmRelSection(const Twine &Name, unsigned Type,
+                                               unsigned Flags,
+                                               const MCSymbolWasm *Group) {
+  StringMap<bool>::iterator I;
+  bool Inserted;
+  std::tie(I, Inserted) =
+      RelSecNames.insert(std::make_pair(Name.str(), true));
+
+  return new (WasmAllocator.Allocate())
+      MCSectionWasm(I->getKey(), Type, Flags, SectionKind::getReadOnly(),
+                    Group, ~0, nullptr);
+}
+
+MCSectionWasm *MCContext::getWasmNamedSection(const Twine &Prefix,
+                                              const Twine &Suffix, unsigned Type,
+                                              unsigned Flags) {
+  return getWasmSection(Prefix + "." + Suffix, Type, Flags, Suffix);
+}
+
+MCSectionWasm *MCContext::getWasmSection(const Twine &Section, unsigned Type,
+                                         unsigned Flags,
+                                         const Twine &Group, unsigned UniqueID,
+                                         const char *BeginSymName) {
+  MCSymbolWasm *GroupSym = nullptr;
+  if (!Group.isTriviallyEmpty() && !Group.str().empty())
+    GroupSym = cast<MCSymbolWasm>(getOrCreateSymbol(Group));
+
+  return getWasmSection(Section, Type, Flags, GroupSym, UniqueID, BeginSymName);
+}
+
+MCSectionWasm *MCContext::getWasmSection(const Twine &Section, unsigned Type,
+                                         unsigned Flags,
+                                         const MCSymbolWasm *GroupSym,
+                                         unsigned UniqueID,
+                                         const char *BeginSymName) {
+  StringRef Group = "";
+  if (GroupSym)
+    Group = GroupSym->getName();
+  // Do the lookup, if we have a hit, return it.
+  auto IterBool = WasmUniquingMap.insert(
+      std::make_pair(WasmSectionKey{Section.str(), Group, UniqueID}, nullptr));
+  auto &Entry = *IterBool.first;
+  if (!IterBool.second)
+    return Entry.second;
+
+  StringRef CachedName = Entry.first.SectionName;
+
+  SectionKind Kind = SectionKind::getText();
+
+  MCSymbol *Begin = nullptr;
+  if (BeginSymName)
+    Begin = createTempSymbol(BeginSymName, false);
+
+  MCSectionWasm *Result = new (WasmAllocator.Allocate())
+      MCSectionWasm(CachedName, Type, Flags, Kind, GroupSym, UniqueID, Begin);
+  Entry.second = Result;
+  return Result;
+}
+
 MCSubtargetInfo &MCContext::getSubtargetCopy(const MCSubtargetInfo &STI) {
   return *new (MCSubtargetAllocator.Allocate()) MCSubtargetInfo(STI);
 }
@@ -510,13 +608,15 @@ CodeViewContext &MCContext::getCVContext() {
 void MCContext::reportError(SMLoc Loc, const Twine &Msg) {
   HadError = true;
 
-  // If we have a source manager use it. Otherwise just use the generic
-  // report_fatal_error().
-  if (!SrcMgr)
+  // If we have a source manager, use it. Otherwise, try the inline source
+  // manager. If neither is available, fall back to the generic
+  // report_fatal_error().
+  if (SrcMgr)
+    SrcMgr->PrintMessage(Loc, SourceMgr::DK_Error, Msg);
+  else if (InlineSrcMgr)
+    InlineSrcMgr->PrintMessage(Loc, SourceMgr::DK_Error, Msg);
+  else
     report_fatal_error(Msg, false);
-
-  // Use the source manager to print the message.
-  SrcMgr->PrintMessage(Loc, SourceMgr::DK_Error, Msg);
 }
 
 void MCContext::reportFatalError(SMLoc Loc, const Twine &Msg) {
diff --git a/lib/MC/MCDisassembler/MCDisassembler.cpp b/lib/MC/MCDisassembler/MCDisassembler.cpp
index 3a4f7382bd3c7c9c73f18f2143fd79fcb7a539ca..2f1275d00b861f3db7a4c87273a915d486453b4f 100644
--- a/lib/MC/MCDisassembler/MCDisassembler.cpp
+++ b/lib/MC/MCDisassembler/MCDisassembler.cpp
@@ -1,4 +1,4 @@
-//===-- MCDisassembler.cpp - Disassembler interface -----------------------===//
+//===- MCDisassembler.cpp - Disassembler interface ------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -8,13 +8,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
-#include "llvm/MC/MCDisassembler/MCExternalSymbolizer.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
 
 using namespace llvm;
 
-MCDisassembler::~MCDisassembler() {
-}
+MCDisassembler::~MCDisassembler() = default;
 
 bool MCDisassembler::tryAddingSymbolicOperand(MCInst &Inst, int64_t Value,
                                               uint64_t Address, bool IsBranch,
diff --git a/lib/MC/MCDisassembler/MCRelocationInfo.cpp b/lib/MC/MCDisassembler/MCRelocationInfo.cpp
index 1612562497d9e9f725bc8c639b58ff599fa86f16..5805fd7007d2cb60306b305142c1796a933f3c77 100644
--- a/lib/MC/MCDisassembler/MCRelocationInfo.cpp
+++ b/lib/MC/MCDisassembler/MCRelocationInfo.cpp
@@ -1,4 +1,4 @@
-//==-- MCRelocationInfo.cpp ------------------------------------------------==//
+//===-- MCRelocationInfo.cpp ----------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -8,17 +8,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
-#include "llvm-c/Disassembler.h"
 #include "llvm/Support/TargetRegistry.h"
+#include "llvm-c/Disassembler.h"
 
 using namespace llvm;
 
-MCRelocationInfo::MCRelocationInfo(MCContext &Ctx)
-  : Ctx(Ctx) {
-}
+MCRelocationInfo::MCRelocationInfo(MCContext &Ctx) : Ctx(Ctx) {}
 
-MCRelocationInfo::~MCRelocationInfo() {
-}
+MCRelocationInfo::~MCRelocationInfo() = default;
 
 const MCExpr *
 MCRelocationInfo::createExprForCAPIVariantKind(const MCExpr *SubExpr,
diff --git a/lib/MC/MCDisassembler/MCSymbolizer.cpp b/lib/MC/MCDisassembler/MCSymbolizer.cpp
index c0f707d356c1e77d5d01d7bdade73383ed3783a6..78e611e3ddda947585fd5f254b4c28f1558f4e35 100644
--- a/lib/MC/MCDisassembler/MCSymbolizer.cpp
+++ b/lib/MC/MCDisassembler/MCSymbolizer.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCSymbolizer.cpp - MCSymbolizer class -----------*- C++ -*-===//
+//===-- llvm/MC/MCSymbolizer.cpp - MCSymbolizer class ---------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,5 +11,4 @@
 
 using namespace llvm;
 
-MCSymbolizer::~MCSymbolizer() {
-}
+MCSymbolizer::~MCSymbolizer() = default;
diff --git a/lib/MC/MCDwarf.cpp b/lib/MC/MCDwarf.cpp
index a7551a3283a32fe875b0a191df5a42c1c7595506..cc32e90ad36ee023ea83421094562468178890e8 100644
--- a/lib/MC/MCDwarf.cpp
+++ b/lib/MC/MCDwarf.cpp
@@ -7,27 +7,41 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCDwarf.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Hashing.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/None.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Config/config.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/EndianStream.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/LEB128.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <string>
+#include <utility>
+#include <vector>
 
 using namespace llvm;
 
@@ -592,7 +606,6 @@ static void EmitGenDwarfAranges(MCStreamer *MCOS,
   // And the pair of terminating zeros.
   Length += 2 * AddrSize;
 
-
   // Emit the header for this section.
   // The 4 byte length not including the 4 byte value for the length.
   MCOS->EmitIntValue(Length - 4, 4);
@@ -661,7 +674,14 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS,
   // The 2 byte DWARF version.
   MCOS->EmitIntValue(context.getDwarfVersion(), 2);
 
+  // The DWARF v5 header has unit type, address size, abbrev offset.
+  // Earlier versions have abbrev offset, address size.
   const MCAsmInfo &AsmInfo = *context.getAsmInfo();
+  int AddrSize = AsmInfo.getPointerSize();
+  if (context.getDwarfVersion() >= 5) {
+    MCOS->EmitIntValue(dwarf::DW_UT_compile, 1);
+    MCOS->EmitIntValue(AddrSize, 1);
+  }
   // The 4 byte offset to the debug abbrevs from the start of the .debug_abbrev,
   // it is at the start of that section so this is zero.
   if (AbbrevSectionSymbol == nullptr)
@@ -669,11 +689,8 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS,
   else
     MCOS->EmitSymbolValue(AbbrevSectionSymbol, 4,
                           AsmInfo.needsDwarfSectionOffsetDirective());
-
-  const MCAsmInfo *asmInfo = context.getAsmInfo();
-  int AddrSize = asmInfo->getPointerSize();
-  // The 1 byte size of an address.
-  MCOS->EmitIntValue(AddrSize, 1);
+  if (context.getDwarfVersion() <= 4)
+    MCOS->EmitIntValue(AddrSize, 1);
 
   // Second part: the compile_unit DIE.
 
@@ -885,7 +902,7 @@ void MCGenDwarfInfo::Emit(MCStreamer *MCOS) {
     }
   }
 
-  assert((RangesSectionSymbol != NULL) || !UseRangesSection);
+  assert((RangesSectionSymbol != nullptr) || !UseRangesSection);
 
   MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfARangesSection());
 
@@ -1003,6 +1020,7 @@ static void EmitPersonality(MCStreamer &streamer, const MCSymbol &symbol,
 }
 
 namespace {
+
 class FrameEmitterImpl {
   int CFAOffset = 0;
   int InitialCFAOffset = 0;
@@ -1050,10 +1068,10 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
     Streamer.EmitULEB128IntValue(Reg2);
     return;
   }
-  case MCCFIInstruction::OpWindowSave: {
+  case MCCFIInstruction::OpWindowSave:
     Streamer.EmitIntValue(dwarf::DW_CFA_GNU_window_save, 1);
     return;
-  }
+
   case MCCFIInstruction::OpUndefined: {
     unsigned Reg = Instr.getRegister();
     Streamer.EmitIntValue(dwarf::DW_CFA_undefined, 1);
@@ -1087,7 +1105,6 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
 
     return;
   }
-
   case MCCFIInstruction::OpDefCfaRegister: {
     unsigned Reg = Instr.getRegister();
     if (!IsEH)
@@ -1097,7 +1114,6 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
 
     return;
   }
-
   case MCCFIInstruction::OpOffset:
   case MCCFIInstruction::OpRelOffset: {
     const bool IsRelative =
@@ -1145,11 +1161,11 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
     Streamer.EmitIntValue(dwarf::DW_CFA_restore | Reg, 1);
     return;
   }
-  case MCCFIInstruction::OpGnuArgsSize: {
+  case MCCFIInstruction::OpGnuArgsSize:
     Streamer.EmitIntValue(dwarf::DW_CFA_GNU_args_size, 1);
     Streamer.EmitULEB128IntValue(Instr.getOffset());
     return;
-  }
+
   case MCCFIInstruction::OpEscape:
     Streamer.EmitBytes(Instr.getValues());
     return;
@@ -1444,10 +1460,12 @@ void FrameEmitterImpl::EmitFDE(const MCSymbol &cieStart,
 }
 
 namespace {
+
 struct CIEKey {
   static const CIEKey getEmptyKey() {
     return CIEKey(nullptr, 0, -1, false, false);
   }
+
   static const CIEKey getTombstoneKey() {
     return CIEKey(nullptr, -1, 0, false, false);
   }
@@ -1457,23 +1475,28 @@ struct CIEKey {
       : Personality(Personality), PersonalityEncoding(PersonalityEncoding),
         LsdaEncoding(LsdaEncoding), IsSignalFrame(IsSignalFrame),
         IsSimple(IsSimple) {}
+
   const MCSymbol *Personality;
   unsigned PersonalityEncoding;
   unsigned LsdaEncoding;
   bool IsSignalFrame;
   bool IsSimple;
 };
-} // anonymous namespace
+
+} // end anonymous namespace
 
 namespace llvm {
+
 template <> struct DenseMapInfo<CIEKey> {
   static CIEKey getEmptyKey() { return CIEKey::getEmptyKey(); }
   static CIEKey getTombstoneKey() { return CIEKey::getTombstoneKey(); }
+
   static unsigned getHashValue(const CIEKey &Key) {
     return static_cast<unsigned>(
         hash_combine(Key.Personality, Key.PersonalityEncoding, Key.LsdaEncoding,
                      Key.IsSignalFrame, Key.IsSimple));
   }
+
   static bool isEqual(const CIEKey &LHS, const CIEKey &RHS) {
     return LHS.Personality == RHS.Personality &&
            LHS.PersonalityEncoding == RHS.PersonalityEncoding &&
@@ -1482,7 +1505,8 @@ template <> struct DenseMapInfo<CIEKey> {
            LHS.IsSimple == RHS.IsSimple;
   }
 };
-} // namespace llvm
+
+} // end namespace llvm
 
 void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB,
                                bool IsEH) {
diff --git a/lib/MC/MCELFObjectTargetWriter.cpp b/lib/MC/MCELFObjectTargetWriter.cpp
index de645cac73709f86c73ee75cfcec55d282732bac..68fb5e7cbb3dd128023ec6103d7000554751e679 100644
--- a/lib/MC/MCELFObjectTargetWriter.cpp
+++ b/lib/MC/MCELFObjectTargetWriter.cpp
@@ -7,10 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ADT/STLExtras.h"
 #include "llvm/MC/MCELFObjectWriter.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCValue.h"
 
 using namespace llvm;
 
diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp
index 0ef1b2a8bdcaf9737ff7ed49595a3e20181b5ddb..c8e0223c0573b17c2193342d7b9dc8bc16fb6f5c 100644
--- a/lib/MC/MCELFStreamer.cpp
+++ b/lib/MC/MCELFStreamer.cpp
@@ -11,30 +11,31 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCELFStreamer.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
 #include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbolELF.h"
 #include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
 
 using namespace llvm;
 
@@ -42,9 +43,6 @@ bool MCELFStreamer::isBundleLocked() const {
   return getCurrentSectionOnly()->isBundleLocked();
 }
 
-MCELFStreamer::~MCELFStreamer() {
-}
-
 void MCELFStreamer::mergeFragment(MCDataFragment *DF,
                                   MCDataFragment *EF) {
   MCAssembler &Assembler = getAssembler();
@@ -95,11 +93,19 @@ void MCELFStreamer::InitSections(bool NoExecStack) {
     SwitchSection(Ctx.getAsmInfo()->getNonexecutableStackSection(Ctx));
 }
 
-void MCELFStreamer::EmitLabel(MCSymbol *S) {
+void MCELFStreamer::EmitLabel(MCSymbol *S, SMLoc Loc) {
   auto *Symbol = cast<MCSymbolELF>(S);
-  assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
+  MCObjectStreamer::EmitLabel(Symbol, Loc);
+
+  const MCSectionELF &Section =
+      static_cast<const MCSectionELF &>(*getCurrentSectionOnly());
+  if (Section.getFlags() & ELF::SHF_TLS)
+    Symbol->setType(ELF::STT_TLS);
+}
 
-  MCObjectStreamer::EmitLabel(Symbol);
+void MCELFStreamer::EmitLabel(MCSymbol *S, SMLoc Loc, MCFragment *F) {
+  auto *Symbol = cast<MCSymbolELF>(S);
+  MCObjectStreamer::EmitLabel(Symbol, Loc, F);
 
   const MCSectionELF &Section =
       static_cast<const MCSectionELF &>(*getCurrentSectionOnly());
@@ -147,17 +153,8 @@ void MCELFStreamer::ChangeSection(MCSection *Section,
   if (Grp)
     Asm.registerSymbol(*Grp);
 
-  this->MCObjectStreamer::ChangeSection(Section, Subsection);
-  MCContext &Ctx = getContext();
-  auto *Begin = cast_or_null<MCSymbolELF>(Section->getBeginSymbol());
-  if (!Begin) {
-    Begin = Ctx.getOrCreateSectionSymbol(*SectionELF);
-    Section->setBeginSymbol(Begin);
-  }
-  if (Begin->isUndefined()) {
-    Asm.registerSymbol(*Begin);
-    Begin->setType(ELF::STT_SECTION);
-  }
+  changeSectionImpl(Section, Subsection);
+  Asm.registerSymbol(*Section->getBeginSymbol());
 }
 
 void MCELFStreamer::EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {
@@ -361,13 +358,6 @@ void MCELFStreamer::EmitValueToAlignment(unsigned ByteAlignment,
                                          ValueSize, MaxBytesToEmit);
 }
 
-// Add a symbol for the file name of this module. They start after the
-// null symbol and don't count as normal symbol, i.e. a non-STT_FILE symbol
-// with the same name may appear.
-void MCELFStreamer::EmitFileDirective(StringRef Filename) {
-  getAssembler().addFileName(Filename);
-}
-
 void MCELFStreamer::EmitIdent(StringRef IdentString) {
   MCSection *Comment = getAssembler().getContext().getELFSection(
       ".comment", ELF::SHT_PROGBITS, ELF::SHF_MERGE | ELF::SHF_STRINGS, 1, "");
@@ -630,15 +620,6 @@ void MCELFStreamer::FinishImpl() {
   this->MCObjectStreamer::FinishImpl();
 }
 
-MCStreamer *llvm::createELFStreamer(MCContext &Context, MCAsmBackend &MAB,
-                                    raw_pwrite_stream &OS, MCCodeEmitter *CE,
-                                    bool RelaxAll) {
-  MCELFStreamer *S = new MCELFStreamer(Context, MAB, OS, CE);
-  if (RelaxAll)
-    S->getAssembler().setRelaxAll(true);
-  return S;
-}
-
 void MCELFStreamer::EmitThumbFunc(MCSymbol *Func) {
   llvm_unreachable("Generic ELF doesn't support this directive");
 }
@@ -647,22 +628,6 @@ void MCELFStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
   llvm_unreachable("ELF doesn't support this directive");
 }
 
-void MCELFStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) {
-  llvm_unreachable("ELF doesn't support this directive");
-}
-
-void MCELFStreamer::EmitCOFFSymbolStorageClass(int StorageClass) {
-  llvm_unreachable("ELF doesn't support this directive");
-}
-
-void MCELFStreamer::EmitCOFFSymbolType(int Type) {
-  llvm_unreachable("ELF doesn't support this directive");
-}
-
-void MCELFStreamer::EndCOFFSymbolDef() {
-  llvm_unreachable("ELF doesn't support this directive");
-}
-
 void MCELFStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
                                  uint64_t Size, unsigned ByteAlignment) {
   llvm_unreachable("ELF doesn't support this directive");
@@ -672,3 +637,12 @@ void MCELFStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
                                    uint64_t Size, unsigned ByteAlignment) {
   llvm_unreachable("ELF doesn't support this directive");
 }
+
+MCStreamer *llvm::createELFStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                    raw_pwrite_stream &OS, MCCodeEmitter *CE,
+                                    bool RelaxAll) {
+  MCELFStreamer *S = new MCELFStreamer(Context, MAB, OS, CE);
+  if (RelaxAll)
+    S->getAssembler().setRelaxAll(true);
+  return S;
+}
diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp
index f312b463d2efdca10242ac7c232150a14c605e91..8149aa27327caad72df79ef806040101ae73cd9e 100644
--- a/lib/MC/MCExpr.cpp
+++ b/lib/MC/MCExpr.cpp
@@ -7,28 +7,35 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCExpr.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "mcexpr"
 
 namespace {
 namespace stats {
+
 STATISTIC(MCExprEvaluate, "Number of MCExpr evaluations");
-}
-}
+
+} // end namespace stats
+} // end anonymous namespace
 
 void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI, bool InParens) const {
   switch (getKind()) {
@@ -44,7 +51,7 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI, bool InParens) const {
     // Parenthesize names that start with $ so that they don't look like
     // absolute names.
     bool UseParens =
-        !InParens && Sym.getName().size() && Sym.getName()[0] == '$';
+        !InParens && !Sym.getName().empty() && Sym.getName()[0] == '$';
     if (UseParens) {
       OS << '(';
       Sym.print(OS, MAI);
@@ -145,8 +152,8 @@ const MCBinaryExpr *MCBinaryExpr::create(Opcode Opc, const MCExpr *LHS,
 }
 
 const MCUnaryExpr *MCUnaryExpr::create(Opcode Opc, const MCExpr *Expr,
-                                       MCContext &Ctx) {
-  return new (Ctx) MCUnaryExpr(Opc, Expr);
+                                       MCContext &Ctx, SMLoc Loc) {
+  return new (Ctx) MCUnaryExpr(Opc, Expr, Loc);
 }
 
 const MCConstantExpr *MCConstantExpr::create(int64_t Value, MCContext &Ctx) {
@@ -208,6 +215,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
   case VK_SECREL: return "SECREL32";
   case VK_SIZE: return "SIZE";
   case VK_WEAKREF: return "WEAKREF";
+  case VK_X86_ABS8: return "ABS8";
   case VK_ARM_NONE: return "none";
   case VK_ARM_GOT_PREL: return "GOT_PREL";
   case VK_ARM_TARGET1: return "target1";
@@ -278,6 +286,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
   case VK_Hexagon_IE: return "IE";
   case VK_Hexagon_IE_GOT: return "IEGOT";
   case VK_WebAssembly_FUNCTION: return "FUNCTION";
+  case VK_WebAssembly_TYPEINDEX: return "TYPEINDEX";
   case VK_AMDGPU_GOTPCREL32_LO: return "gotpcrel32@lo";
   case VK_AMDGPU_GOTPCREL32_HI: return "gotpcrel32@hi";
   case VK_AMDGPU_REL32_LO: return "rel32@lo";
@@ -317,6 +326,7 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
     .Case("imgrel", VK_COFF_IMGREL32)
     .Case("secrel32", VK_SECREL)
     .Case("size", VK_SIZE)
+    .Case("abs8", VK_X86_ABS8)
     .Case("l", VK_PPC_LO)
     .Case("h", VK_PPC_HI)
     .Case("ha", VK_PPC_HA)
diff --git a/lib/MC/MCFragment.cpp b/lib/MC/MCFragment.cpp
index da6ee7a0730265de7537d055939e7dca8880665d..90b44177cf5e8a49864bbaa807dd937007921e1b 100644
--- a/lib/MC/MCFragment.cpp
+++ b/lib/MC/MCFragment.cpp
@@ -7,30 +7,29 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCFragment.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
 #include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/LEB128.h"
-#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+
 using namespace llvm;
 
-MCAsmLayout::MCAsmLayout(MCAssembler &Asm)
-  : Assembler(Asm), LastValidFragment()
- {
+MCAsmLayout::MCAsmLayout(MCAssembler &Asm) : Assembler(Asm) {
   // Compute the section layout order. Virtual sections must go last.
   for (MCSection &Sec : Asm)
     if (!Sec.isVirtualSection())
@@ -233,7 +232,7 @@ uint64_t llvm::computeBundlePadding(const MCAssembler &Assembler,
 
 void ilist_alloc_traits<MCFragment>::deleteNode(MCFragment *V) { V->destroy(); }
 
-MCFragment::~MCFragment() { }
+MCFragment::~MCFragment() = default;
 
 MCFragment::MCFragment(FragmentType Kind, bool HasInstructions,
                        uint8_t BundlePadding, MCSection *Parent)
@@ -294,8 +293,6 @@ void MCFragment::destroy() {
   }
 }
 
-/* *** */
-
 // Debugging methods
 
 namespace llvm {
@@ -307,11 +304,11 @@ raw_ostream &operator<<(raw_ostream &OS, const MCFixup &AF) {
   return OS;
 }
 
-}
+} // end namespace llvm
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 LLVM_DUMP_METHOD void MCFragment::dump() {
-  raw_ostream &OS = llvm::errs();
+  raw_ostream &OS = errs();
 
   OS << "<";
   switch (getKind()) {
@@ -449,7 +446,7 @@ LLVM_DUMP_METHOD void MCFragment::dump() {
 }
 
 LLVM_DUMP_METHOD void MCAssembler::dump() {
-  raw_ostream &OS = llvm::errs();
+  raw_ostream &OS = errs();
 
   OS << "<MCAssembler\n";
   OS << "  Sections:[\n    ";
diff --git a/lib/MC/MCInst.cpp b/lib/MC/MCInst.cpp
index 16bc597cf3a2567dc64e7826ee24b94679c4c78d..f6d1d3cffca03db261e0df51f31f5a965d275845 100644
--- a/lib/MC/MCInst.cpp
+++ b/lib/MC/MCInst.cpp
@@ -10,6 +10,7 @@
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
diff --git a/lib/MC/MCInstPrinter.cpp b/lib/MC/MCInstPrinter.cpp
index 23afe8054840a08627fc5f2f2be3fc7e92b43ae5..9121790959749cc2ae99e51ffde6a5dd324e159e 100644
--- a/lib/MC/MCInstPrinter.cpp
+++ b/lib/MC/MCInstPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- MCInstPrinter.cpp - Convert an MCInst to target assembly syntax ---===//
+//===- MCInstPrinter.cpp - Convert an MCInst to target assembly syntax ----===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,13 +7,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cinttypes>
+#include <cstdint>
+
 using namespace llvm;
 
 void llvm::dumpBytes(ArrayRef<uint8_t> bytes, raw_ostream &OS) {
@@ -25,8 +29,7 @@ void llvm::dumpBytes(ArrayRef<uint8_t> bytes, raw_ostream &OS) {
   }
 }
 
-MCInstPrinter::~MCInstPrinter() {
-}
+MCInstPrinter::~MCInstPrinter() = default;
 
 /// getOpcodeName - Return the name of the specified opcode enum (e.g.
 /// "MOV32ri") or empty if we can't resolve it.
@@ -68,7 +71,7 @@ StringRef MCInstPrinter::markup(StringRef a, StringRef b) const {
 // For asm-style hex (e.g. 0ffh) the first digit always has to be a number.
 static bool needsLeadingZero(uint64_t Value)
 {
-  while(Value)
+  while (Value)
   {
     uint64_t digit = (Value >> 60) & 0xf;
     if (digit != 0)
diff --git a/lib/MC/MCInstrAnalysis.cpp b/lib/MC/MCInstrAnalysis.cpp
index 2d8336d77ac7a7281854ab1b4e5d97006958cad2..566944c53548a61f623c12ea189eac6aed1e8672 100644
--- a/lib/MC/MCInstrAnalysis.cpp
+++ b/lib/MC/MCInstrAnalysis.cpp
@@ -1,4 +1,4 @@
-//===-- MCInstrAnalysis.cpp - InstrDesc target hooks ------------*- C++ -*-===//
+//===- MCInstrAnalysis.cpp - InstrDesc target hooks -----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrAnalysis.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include <cstdint>
+
 using namespace llvm;
 
 bool MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
diff --git a/lib/MC/MCLabel.cpp b/lib/MC/MCLabel.cpp
index d973fc93b98c72781b84f921cdeff1a83e7bf52b..db25a46fce186aedb0ca3158caee144b6c26eae0 100644
--- a/lib/MC/MCLabel.cpp
+++ b/lib/MC/MCLabel.cpp
@@ -1,4 +1,4 @@
-//===- lib/MC/MCLabel.cpp - MCLabel implementation ----------------------===//
+//===- lib/MC/MCLabel.cpp - MCLabel implementation ------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -8,8 +8,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/MCLabel.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
+
 using namespace llvm;
 
 void MCLabel::print(raw_ostream &OS) const {
diff --git a/lib/MC/MCLinkerOptimizationHint.cpp b/lib/MC/MCLinkerOptimizationHint.cpp
index f71fc7830129a672a222e1d1f61aa799cf2ab4e9..97f95418e05494d93693fa031ebf01d3a7250f26 100644
--- a/lib/MC/MCLinkerOptimizationHint.cpp
+++ b/lib/MC/MCLinkerOptimizationHint.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/MC/MCLinkerOptimizationHint.cpp ----- LOH handling -*- C++ -*-===//
+//===- llvm/MC/MCLinkerOptimizationHint.cpp ----- LOH handling ------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -9,9 +9,11 @@
 
 #include "llvm/MC/MCLinkerOptimizationHint.h"
 #include "llvm/MC/MCAsmLayout.h"
-#include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCMachObjectWriter.h"
 #include "llvm/Support/LEB128.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstddef>
+#include <cstdint>
 
 using namespace llvm;
 
@@ -41,14 +43,14 @@ void MCLOHDirective::emit(MachObjectWriter &ObjWriter,
 uint64_t MCLOHDirective::getEmitSize(const MachObjectWriter &ObjWriter,
                                      const MCAsmLayout &Layout) const {
   class raw_counting_ostream : public raw_ostream {
-    uint64_t Count;
+    uint64_t Count = 0;
 
     void write_impl(const char *, size_t size) override { Count += size; }
 
     uint64_t current_pos() const override { return Count; }
 
   public:
-    raw_counting_ostream() : Count(0) {}
+    raw_counting_ostream() = default;
     ~raw_counting_ostream() override { flush(); }
   };
 
diff --git a/lib/MC/MCMachOStreamer.cpp b/lib/MC/MCMachOStreamer.cpp
index 45a497240b4e65ae0578a38d7d2fcec8c961ad68..1e9ef4163256ae505421ba44381e655a70df6946 100644
--- a/lib/MC/MCMachOStreamer.cpp
+++ b/lib/MC/MCMachOStreamer.cpp
@@ -1,4 +1,4 @@
-//===-- MCMachOStreamer.cpp - MachO Streamer ------------------------------===//
+//===- MCMachOStreamer.cpp - MachO Streamer -------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,27 +7,35 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCStreamer.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCLinkerOptimizationHint.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCSymbolMachO.h"
 #include "llvm/MC/MCValue.h"
-#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TargetRegistry.h"
+#include <cassert>
+#include <vector>
 
 using namespace llvm;
 
@@ -70,7 +78,7 @@ public:
   /// @{
 
   void ChangeSection(MCSection *Sect, const MCExpr *Subsect) override;
-  void EmitLabel(MCSymbol *Symbol) override;
+  void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
   void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
   void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol) override;
   void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
@@ -83,18 +91,7 @@ public:
   void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override;
   void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                         unsigned ByteAlignment) override;
-  void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {
-    llvm_unreachable("macho doesn't support this directive");
-  }
-  void EmitCOFFSymbolStorageClass(int StorageClass) override {
-    llvm_unreachable("macho doesn't support this directive");
-  }
-  void EmitCOFFSymbolType(int Type) override {
-    llvm_unreachable("macho doesn't support this directive");
-  }
-  void EndCOFFSymbolDef() override {
-    llvm_unreachable("macho doesn't support this directive");
-  }
+
   void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                              unsigned ByteAlignment) override;
   void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
@@ -102,13 +99,6 @@ public:
   void EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
                       unsigned ByteAlignment = 0) override;
 
-  void EmitFileDirective(StringRef Filename) override {
-    // FIXME: Just ignore the .file; it isn't important enough to fail the
-    // entire assembly.
-
-    // report_fatal_error("unsupported directive: '.file'");
-  }
-
   void EmitIdent(StringRef IdentString) override {
     llvm_unreachable("macho doesn't support this directive");
   }
@@ -142,7 +132,8 @@ static bool canGoAfterDWARF(const MCSectionMachO &MSec) {
   if (SegName == "__TEXT" && SecName == "__eh_frame")
     return true;
 
-  if (SegName == "__DATA" && SecName == "__nl_symbol_ptr")
+  if (SegName == "__DATA" && (SecName == "__nl_symbol_ptr" ||
+                              SecName == "__thread_ptr"))
     return true;
 
   return false;
@@ -151,7 +142,7 @@ static bool canGoAfterDWARF(const MCSectionMachO &MSec) {
 void MCMachOStreamer::ChangeSection(MCSection *Section,
                                     const MCExpr *Subsection) {
   // Change the section normally.
-  bool Created = MCObjectStreamer::changeSectionImpl(Section, Subsection);
+  bool Created = changeSectionImpl(Section, Subsection);
   const MCSectionMachO &MSec = *cast<MCSectionMachO>(Section);
   StringRef SegName = MSec.getSegmentName();
   if (SegName == "__DWARF")
@@ -180,15 +171,13 @@ void MCMachOStreamer::EmitEHSymAttributes(const MCSymbol *Symbol,
     EmitSymbolAttribute(EHSymbol, MCSA_PrivateExtern);
 }
 
-void MCMachOStreamer::EmitLabel(MCSymbol *Symbol) {
-  assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
-
+void MCMachOStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
   // We have to create a new fragment if this is an atom defining symbol,
   // fragments cannot span atoms.
   if (getAssembler().isSymbolLinkerVisible(*Symbol))
     insert(new MCDataFragment());
 
-  MCObjectStreamer::EmitLabel(Symbol);
+  MCObjectStreamer::EmitLabel(Symbol, Loc);
 
   // This causes the reference type flag to be cleared. Darwin 'as' was "trying"
   // to clear the weak reference and weak definition bits too, but the
diff --git a/lib/MC/MCMachObjectTargetWriter.cpp b/lib/MC/MCMachObjectTargetWriter.cpp
index 4ffd6a78a61fdb78cb21b1cfc4021b4166e9c03c..8809a3c320f837773b54f23c72ea04d8a98fe905 100644
--- a/lib/MC/MCMachObjectTargetWriter.cpp
+++ b/lib/MC/MCMachObjectTargetWriter.cpp
@@ -1,4 +1,4 @@
-//===-- MCMachObjectTargetWriter.cpp - Mach-O Target Writer Subclass ------===//
+//===- MCMachObjectTargetWriter.cpp - Mach-O Target Writer Subclass -------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -16,4 +16,4 @@ MCMachObjectTargetWriter::MCMachObjectTargetWriter(bool Is64Bit_,
                                                    uint32_t CPUSubtype_)
     : Is64Bit(Is64Bit_), CPUType(CPUType_), CPUSubtype(CPUSubtype_) {}
 
-MCMachObjectTargetWriter::~MCMachObjectTargetWriter() {}
+MCMachObjectTargetWriter::~MCMachObjectTargetWriter() = default;
diff --git a/lib/MC/MCNullStreamer.cpp b/lib/MC/MCNullStreamer.cpp
index eb2d91254b34227031aadea5c910f40a4cd2e61a..d156f5d05a3160c0a2b6e87733d56a24285709cf 100644
--- a/lib/MC/MCNullStreamer.cpp
+++ b/lib/MC/MCNullStreamer.cpp
@@ -34,6 +34,10 @@ namespace {
     void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
                       uint64_t Size = 0, unsigned ByteAlignment = 0) override {}
     void EmitGPRel32Value(const MCExpr *Value) override {}
+    void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {}
+    void EmitCOFFSymbolStorageClass(int StorageClass) override {}
+    void EmitCOFFSymbolType(int Type) override {}
+    void EndCOFFSymbolDef() override {}
   };
 
 }
diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp
index 9238520cc59f40bcbbf1dd992b5aeb01445eff0d..9f94264684f947bcf4e8696be69762ff16eca3fa 100644
--- a/lib/MC/MCObjectFileInfo.cpp
+++ b/lib/MC/MCObjectFileInfo.cpp
@@ -16,7 +16,9 @@
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSectionWasm.h"
 #include "llvm/Support/COFF.h"
+#include "llvm/Support/ELF.h"
 
 using namespace llvm;
 
@@ -505,68 +507,75 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T) {
   COFFDebugSymbolsSection = nullptr;
   COFFDebugTypesSection = nullptr;
 
+  unsigned DebugSecType = ELF::SHT_PROGBITS;
+
+  // MIPS .debug_* sections should have the SHT_MIPS_DWARF section type to
+  // distinguish sections containing DWARF from those containing ECOFF debug
+  // info. Sections in the obsolete ECOFF format are marked SHT_PROGBITS.
+  if (T.getArch() == Triple::mips || T.getArch() == Triple::mipsel ||
+      T.getArch() == Triple::mips64 || T.getArch() == Triple::mips64el)
+    DebugSecType = ELF::SHT_MIPS_DWARF;
+
   // Debug Info Sections.
-  DwarfAbbrevSection = Ctx->getELFSection(".debug_abbrev", ELF::SHT_PROGBITS, 0,
-                                          "section_abbrev");
-  DwarfInfoSection =
-      Ctx->getELFSection(".debug_info", ELF::SHT_PROGBITS, 0, "section_info");
-  DwarfLineSection = Ctx->getELFSection(".debug_line", ELF::SHT_PROGBITS, 0);
-  DwarfFrameSection = Ctx->getELFSection(".debug_frame", ELF::SHT_PROGBITS, 0);
+  DwarfAbbrevSection =
+      Ctx->getELFSection(".debug_abbrev", DebugSecType, 0);
+  DwarfInfoSection = Ctx->getELFSection(".debug_info", DebugSecType, 0);
+  DwarfLineSection = Ctx->getELFSection(".debug_line", DebugSecType, 0);
+  DwarfFrameSection = Ctx->getELFSection(".debug_frame", DebugSecType, 0);
   DwarfPubNamesSection =
-      Ctx->getELFSection(".debug_pubnames", ELF::SHT_PROGBITS, 0);
+      Ctx->getELFSection(".debug_pubnames", DebugSecType, 0);
   DwarfPubTypesSection =
-      Ctx->getELFSection(".debug_pubtypes", ELF::SHT_PROGBITS, 0);
+      Ctx->getELFSection(".debug_pubtypes", DebugSecType, 0);
   DwarfGnuPubNamesSection =
-      Ctx->getELFSection(".debug_gnu_pubnames", ELF::SHT_PROGBITS, 0);
+      Ctx->getELFSection(".debug_gnu_pubnames", DebugSecType, 0);
   DwarfGnuPubTypesSection =
-      Ctx->getELFSection(".debug_gnu_pubtypes", ELF::SHT_PROGBITS, 0);
+      Ctx->getELFSection(".debug_gnu_pubtypes", DebugSecType, 0);
   DwarfStrSection =
-      Ctx->getELFSection(".debug_str", ELF::SHT_PROGBITS,
+      Ctx->getELFSection(".debug_str", DebugSecType,
                          ELF::SHF_MERGE | ELF::SHF_STRINGS, 1, "");
-  DwarfLocSection = Ctx->getELFSection(".debug_loc", ELF::SHT_PROGBITS, 0);
+  DwarfLocSection = Ctx->getELFSection(".debug_loc", DebugSecType, 0);
   DwarfARangesSection =
-      Ctx->getELFSection(".debug_aranges", ELF::SHT_PROGBITS, 0);
+      Ctx->getELFSection(".debug_aranges", DebugSecType, 0);
   DwarfRangesSection =
-      Ctx->getELFSection(".debug_ranges", ELF::SHT_PROGBITS, 0, "debug_range");
-  DwarfMacinfoSection = Ctx->getELFSection(".debug_macinfo", ELF::SHT_PROGBITS,
-                                           0, "debug_macinfo");
+      Ctx->getELFSection(".debug_ranges", DebugSecType, 0);
+  DwarfMacinfoSection =
+      Ctx->getELFSection(".debug_macinfo", DebugSecType, 0);
 
   // DWARF5 Experimental Debug Info
 
   // Accelerator Tables
   DwarfAccelNamesSection =
-      Ctx->getELFSection(".apple_names", ELF::SHT_PROGBITS, 0, "names_begin");
+      Ctx->getELFSection(".apple_names", ELF::SHT_PROGBITS, 0);
   DwarfAccelObjCSection =
-      Ctx->getELFSection(".apple_objc", ELF::SHT_PROGBITS, 0, "objc_begin");
-  DwarfAccelNamespaceSection = Ctx->getELFSection(
-      ".apple_namespaces", ELF::SHT_PROGBITS, 0, "namespac_begin");
+      Ctx->getELFSection(".apple_objc", ELF::SHT_PROGBITS, 0);
+  DwarfAccelNamespaceSection =
+      Ctx->getELFSection(".apple_namespaces", ELF::SHT_PROGBITS, 0);
   DwarfAccelTypesSection =
-      Ctx->getELFSection(".apple_types", ELF::SHT_PROGBITS, 0, "types_begin");
+      Ctx->getELFSection(".apple_types", ELF::SHT_PROGBITS, 0);
 
   // Fission Sections
   DwarfInfoDWOSection =
-      Ctx->getELFSection(".debug_info.dwo", ELF::SHT_PROGBITS, 0);
+      Ctx->getELFSection(".debug_info.dwo", DebugSecType, 0);
   DwarfTypesDWOSection =
-      Ctx->getELFSection(".debug_types.dwo", ELF::SHT_PROGBITS, 0);
+      Ctx->getELFSection(".debug_types.dwo", DebugSecType, 0);
   DwarfAbbrevDWOSection =
-      Ctx->getELFSection(".debug_abbrev.dwo", ELF::SHT_PROGBITS, 0);
+      Ctx->getELFSection(".debug_abbrev.dwo", DebugSecType, 0);
   DwarfStrDWOSection =
-      Ctx->getELFSection(".debug_str.dwo", ELF::SHT_PROGBITS,
+      Ctx->getELFSection(".debug_str.dwo", DebugSecType,
                          ELF::SHF_MERGE | ELF::SHF_STRINGS, 1, "");
   DwarfLineDWOSection =
-      Ctx->getELFSection(".debug_line.dwo", ELF::SHT_PROGBITS, 0);
+      Ctx->getELFSection(".debug_line.dwo", DebugSecType, 0);
   DwarfLocDWOSection =
-      Ctx->getELFSection(".debug_loc.dwo", ELF::SHT_PROGBITS, 0, "skel_loc");
+      Ctx->getELFSection(".debug_loc.dwo", DebugSecType, 0);
   DwarfStrOffDWOSection =
-      Ctx->getELFSection(".debug_str_offsets.dwo", ELF::SHT_PROGBITS, 0);
-  DwarfAddrSection =
-      Ctx->getELFSection(".debug_addr", ELF::SHT_PROGBITS, 0, "addr_sec");
+      Ctx->getELFSection(".debug_str_offsets.dwo", DebugSecType, 0);
+  DwarfAddrSection = Ctx->getELFSection(".debug_addr", DebugSecType, 0);
 
   // DWP Sections
   DwarfCUIndexSection =
-      Ctx->getELFSection(".debug_cu_index", ELF::SHT_PROGBITS, 0);
+      Ctx->getELFSection(".debug_cu_index", DebugSecType, 0);
   DwarfTUIndexSection =
-      Ctx->getELFSection(".debug_tu_index", ELF::SHT_PROGBITS, 0);
+      Ctx->getELFSection(".debug_tu_index", DebugSecType, 0);
 
   StackMapSection =
       Ctx->getELFSection(".llvm_stackmaps", ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
@@ -799,6 +808,30 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
                                         SectionKind::getReadOnly());
 }
 
+void MCObjectFileInfo::initWasmMCObjectFileInfo(const Triple &T) {
+  // Text and data sections. TODO: Set the section types and flags.
+  TextSection = Ctx->getWasmSection(".text", 0, 0);
+  DataSection = Ctx->getWasmSection(".data", 0, 0);
+
+  // DWARF debug sections. TODO: Set the section types and flags.
+  DwarfLineSection = Ctx->getWasmSection(".debug_line", 0, 0);
+  DwarfStrSection = Ctx->getWasmSection(".debug_str", 0, 0);
+  DwarfLocSection = Ctx->getWasmSection(".debug_loc", 0, 0);
+  DwarfAbbrevSection = Ctx->getWasmSection(".debug_abbrev", 0, 0, "section_abbrev");
+  DwarfARangesSection = Ctx->getWasmSection(".debug_aranges", 0, 0);
+  DwarfRangesSection = Ctx->getWasmSection(".debug_ranges", 0, 0, "debug_range");
+  DwarfMacinfoSection = Ctx->getWasmSection(".debug_macinfo", 0, 0, "debug_macinfo");
+  DwarfAddrSection = Ctx->getWasmSection(".debug_addr", 0, 0);
+  DwarfCUIndexSection = Ctx->getWasmSection(".debug_cu_index", 0, 0);
+  DwarfTUIndexSection = Ctx->getWasmSection(".debug_tu_index", 0, 0);
+  DwarfInfoSection = Ctx->getWasmSection(".debug_info", 0, 0, "section_info");
+  DwarfFrameSection = Ctx->getWasmSection(".debug_frame", 0, 0);
+  DwarfPubNamesSection = Ctx->getWasmSection(".debug_pubnames", 0, 0);
+  DwarfPubTypesSection = Ctx->getWasmSection(".debug_pubtypes", 0, 0);
+
+  // TODO: Define more sections.
+}
+
 void MCObjectFileInfo::InitMCObjectFileInfo(const Triple &TheTriple, bool PIC,
                                             CodeModel::Model cm,
                                             MCContext &ctx) {
@@ -844,7 +877,8 @@ void MCObjectFileInfo::InitMCObjectFileInfo(const Triple &TheTriple, bool PIC,
     initELFMCObjectFileInfo(TT);
     break;
   case Triple::Wasm:
-    report_fatal_error("Cannot initialize MC for wasm object file format yet.");
+    Env = IsWasm;
+    initWasmMCObjectFileInfo(TT);
     break;
   case Triple::UnknownObjectFormat:
     report_fatal_error("Cannot initialize MC for unknown object file format.");
diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp
index cae5c1f8d156452d1fce1dd6b9b691a8aebf49d1..726326be2ee1556056298d06789476b466267a6d 100644
--- a/lib/MC/MCObjectStreamer.cpp
+++ b/lib/MC/MCObjectStreamer.cpp
@@ -153,8 +153,8 @@ void MCObjectStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
   EmitLabel(Frame.End);
 }
 
-void MCObjectStreamer::EmitLabel(MCSymbol *Symbol) {
-  MCStreamer::EmitLabel(Symbol);
+void MCObjectStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
+  MCStreamer::EmitLabel(Symbol, Loc);
 
   getAssembler().registerSymbol(*Symbol);
 
@@ -171,6 +171,16 @@ void MCObjectStreamer::EmitLabel(MCSymbol *Symbol) {
   }
 }
 
+void MCObjectStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc, MCFragment *F) {
+  MCStreamer::EmitLabel(Symbol, Loc);
+  getAssembler().registerSymbol(*Symbol);
+  // Bind to F only when it is a data fragment; otherwise leave it pending.
+  if (dyn_cast_or_null<MCDataFragment>(F))
+    Symbol->setFragment(F);
+  else
+    PendingLabels.push_back(Symbol);
+}
+
 void MCObjectStreamer::EmitULEB128Value(const MCExpr *Value) {
   int64_t IntValue;
   if (Value->evaluateAsAbsolute(IntValue, getAssembler())) {
@@ -203,6 +213,7 @@ bool MCObjectStreamer::changeSectionImpl(MCSection *Section,
                                          const MCExpr *Subsection) {
   assert(Section && "Cannot switch to a null section!");
   flushPendingLabels(nullptr);
+  getContext().clearDwarfLocSeen();
 
   bool Created = getAssembler().registerSection(*Section);
 
@@ -490,8 +501,8 @@ void MCObjectStreamer::EmitGPRel32Value(const MCExpr *Value) {
   MCDataFragment *DF = getOrCreateDataFragment();
   flushPendingLabels(DF, DF->getContents().size());
 
-  DF->getFixups().push_back(MCFixup::create(DF->getContents().size(), 
-                                            Value, FK_GPRel_4));
+  DF->getFixups().push_back(
+      MCFixup::create(DF->getContents().size(), Value, FK_GPRel_4));
   DF->getContents().resize(DF->getContents().size() + 4, 0);
 }
 
@@ -500,8 +511,8 @@ void MCObjectStreamer::EmitGPRel64Value(const MCExpr *Value) {
   MCDataFragment *DF = getOrCreateDataFragment();
   flushPendingLabels(DF, DF->getContents().size());
 
-  DF->getFixups().push_back(MCFixup::create(DF->getContents().size(), 
-                                            Value, FK_GPRel_4));
+  DF->getFixups().push_back(
+      MCFixup::create(DF->getContents().size(), Value, FK_GPRel_4));
   DF->getContents().resize(DF->getContents().size() + 8, 0);
 }
 
@@ -572,6 +583,10 @@ void MCObjectStreamer::emitFill(const MCExpr &NumValues, int64_t Size,
   MCStreamer::emitFill(IntNumValues, Size, Expr);
 }
 
+void MCObjectStreamer::EmitFileDirective(StringRef Filename) {
+  getAssembler().addFileName(Filename);
+}
+
 void MCObjectStreamer::FinishImpl() {
   // If we are generating dwarf for assembly source files dump out the sections.
   if (getContext().getGenDwarfForAssembly())
diff --git a/lib/MC/MCObjectWriter.cpp b/lib/MC/MCObjectWriter.cpp
index e84f74ae81d6c7cd21d0d9ec65f48173101f8398..478b4e84e74ac5507b4054bd4c2e71be34d0aeb0 100644
--- a/lib/MC/MCObjectWriter.cpp
+++ b/lib/MC/MCObjectWriter.cpp
@@ -8,14 +8,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCFragment.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSymbol.h"
 
 using namespace llvm;
 
-MCObjectWriter::~MCObjectWriter() {
-}
+MCObjectWriter::~MCObjectWriter() = default;
 
 bool MCObjectWriter::isSymbolRefDifferenceFullyResolved(
     const MCAssembler &Asm, const MCSymbolRefExpr *A, const MCSymbolRefExpr *B,
@@ -51,5 +51,3 @@ bool MCObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
   // On ELF and COFF  A - B is absolute if A and B are in the same section.
   return &SecA == &SecB;
 }
-
-bool MCObjectWriter::isWeak(const MCSymbol &) const { return false; }
diff --git a/lib/MC/MCParser/AsmLexer.cpp b/lib/MC/MCParser/AsmLexer.cpp
index 87ecf9e0227fa639d2b310c78dc4127e13b54906..38dadfe62135561798c08790cf4bd708115fb121 100644
--- a/lib/MC/MCParser/AsmLexer.cpp
+++ b/lib/MC/MCParser/AsmLexer.cpp
@@ -11,12 +11,12 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCParser/AsmLexer.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCParser/AsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/Support/SMLoc.h"
 #include "llvm/Support/SaveAndRestore.h"
@@ -30,15 +30,11 @@
 
 using namespace llvm;
 
-AsmLexer::AsmLexer(const MCAsmInfo &MAI)
-    : MAI(MAI), CurPtr(nullptr), IsAtStartOfLine(true),
-      IsAtStartOfStatement(true), IsParsingMSInlineAsm(false),
-      IsPeeking(false) {
+AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
   AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@");
 }
 
-AsmLexer::~AsmLexer() {
-}
+AsmLexer::~AsmLexer() = default;
 
 void AsmLexer::setBuffer(StringRef Buf, const char *ptr) {
   CurBuf = Buf;
diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp
index f8744f5542e868cc9c1c50fb14ee859007a73a6d..e65ce9f0b9361bf69d61504eb9005cee653fdbf6 100644
--- a/lib/MC/MCParser/AsmParser.cpp
+++ b/lib/MC/MCParser/AsmParser.cpp
@@ -35,6 +35,7 @@
 #include "llvm/MC/MCParser/AsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
 #include "llvm/MC/MCParser/MCAsmParserUtils.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
 #include "llvm/MC/MCParser/MCTargetAsmParser.h"
@@ -42,6 +43,7 @@
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetOptions.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CommandLine.h"
@@ -55,6 +57,7 @@
 #include <algorithm>
 #include <cassert>
 #include <cctype>
+#include <climits>
 #include <cstddef>
 #include <cstdint>
 #include <deque>
@@ -67,7 +70,7 @@
 
 using namespace llvm;
 
-MCAsmParserSemaCallback::~MCAsmParserSemaCallback() {}
+MCAsmParserSemaCallback::~MCAsmParserSemaCallback() = default;
 
 static cl::opt<unsigned> AsmMacroMaxNestingDepth(
      "asm-macro-max-nesting-depth", cl::init(20), cl::Hidden,
@@ -82,10 +85,10 @@ typedef std::vector<MCAsmMacroArgument> MCAsmMacroArguments;
 struct MCAsmMacroParameter {
   StringRef Name;
   MCAsmMacroArgument Value;
-  bool Required;
-  bool Vararg;
+  bool Required = false;
+  bool Vararg = false;
 
-  MCAsmMacroParameter() : Required(false), Vararg(false) {}
+  MCAsmMacroParameter() = default;
 };
 
 typedef std::vector<MCAsmMacroParameter> MCAsmMacroParameters;
@@ -124,23 +127,20 @@ struct ParseStatementInfo {
   SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> ParsedOperands;
 
   /// \brief The opcode from the last parsed instruction.
-  unsigned Opcode;
+  unsigned Opcode = ~0U;
 
   /// \brief Was there an error parsing the inline assembly?
-  bool ParseError;
+  bool ParseError = false;
 
-  SmallVectorImpl<AsmRewrite> *AsmRewrites;
+  SmallVectorImpl<AsmRewrite> *AsmRewrites = nullptr;
 
-  ParseStatementInfo() : Opcode(~0U), ParseError(false), AsmRewrites(nullptr) {}
+  ParseStatementInfo() = default;
   ParseStatementInfo(SmallVectorImpl<AsmRewrite> *rewrites)
-    : Opcode(~0), ParseError(false), AsmRewrites(rewrites) {}
+    : AsmRewrites(rewrites) {}
 };
 
 /// \brief The concrete assembly parser instance.
 class AsmParser : public MCAsmParser {
-  AsmParser(const AsmParser &) = delete;
-  void operator=(const AsmParser &) = delete;
-
 private:
   AsmLexer Lexer;
   MCContext &Ctx;
@@ -199,17 +199,19 @@ private:
   unsigned LastQueryLine;
 
   /// AssemblerDialect. ~OU means unset value and use value provided by MAI.
-  unsigned AssemblerDialect;
+  unsigned AssemblerDialect = ~0U;
 
   /// \brief is Darwin compatibility enabled?
-  bool IsDarwin;
+  bool IsDarwin = false;
 
   /// \brief Are we parsing ms-style inline assembly?
-  bool ParsingInlineAsm;
+  bool ParsingInlineAsm = false;
 
 public:
   AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
-            const MCAsmInfo &MAI);
+            const MCAsmInfo &MAI, unsigned CB);
+  AsmParser(const AsmParser &) = delete;
+  AsmParser &operator=(const AsmParser &) = delete;
   ~AsmParser() override;
 
   bool Run(bool NoInitialTextSection, bool NoFinalize = false) override;
@@ -223,7 +225,6 @@ public:
     DirectiveKindMap[Directive] = DirectiveKindMap[Alias];
   }
 
-public:
   /// @name MCAsmParser Interface
   /// {
 
@@ -258,7 +259,7 @@ public:
 
   bool parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
                         unsigned &NumOutputs, unsigned &NumInputs,
-                        SmallVectorImpl<std::pair<void *,bool> > &OpDecls,
+                        SmallVectorImpl<std::pair<void *,bool>> &OpDecls,
                         SmallVectorImpl<std::string> &Constraints,
                         SmallVectorImpl<std::string> &Clobbers,
                         const MCInstrInfo *MII, const MCInstPrinter *IP,
@@ -572,11 +573,9 @@ extern MCAsmParserExtension *createCOFFAsmParser();
 enum { DEFAULT_ADDRSPACE = 0 };
 
 AsmParser::AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
-                     const MCAsmInfo &MAI)
+                     const MCAsmInfo &MAI, unsigned CB = 0)
     : Lexer(MAI), Ctx(Ctx), Out(Out), MAI(MAI), SrcMgr(SM),
-      PlatformParser(nullptr), CurBuffer(SM.getMainFileID()),
-      MacrosEnabledFlag(true), CppHashInfo(), AssemblerDialect(~0U),
-      IsDarwin(false), ParsingInlineAsm(false) {
+      CurBuffer(CB ? CB : SM.getMainFileID()), MacrosEnabledFlag(true) {
   HadError = false;
   // Save the old handler.
   SavedDiagHandler = SrcMgr.getDiagHandler();
@@ -597,6 +596,9 @@ AsmParser::AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
   case MCObjectFileInfo::IsELF:
     PlatformParser.reset(createELFAsmParser());
     break;
+  case MCObjectFileInfo::IsWasm:
+    llvm_unreachable("Wasm parsing not supported yet");
+    break;
   }
 
   PlatformParser->Initialize(*this);
@@ -608,6 +610,10 @@ AsmParser::AsmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
 AsmParser::~AsmParser() {
   assert((HadError || ActiveMacros.empty()) &&
          "Unexpected active macro instantiation!");
+
+  // Restore the saved diagnostics handler and context for use during
+  // finalization.
+  SrcMgr.setDiagHandler(SavedDiagHandler, SavedDiagContext);
 }
 
 void AsmParser::printMacroInstantiations() {
@@ -918,7 +924,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
     Lex(); // Eat the operator.
     if (parsePrimaryExpr(Res, EndLoc))
       return true;
-    Res = MCUnaryExpr::createLNot(Res, getContext());
+    Res = MCUnaryExpr::createLNot(Res, getContext(), FirstTokenLoc);
     return false;
   case AsmToken::Dollar:
   case AsmToken::At:
@@ -979,7 +985,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
     MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
 
     // Lookup the symbol variant if used.
-    if (Split.second.size()) {
+    if (!Split.second.empty()) {
       Variant = MCSymbolRefExpr::getVariantKindForName(Split.second);
       if (Variant != MCSymbolRefExpr::VK_Invalid) {
         SymbolName = Split.first;
@@ -1071,19 +1077,19 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
     Lex(); // Eat the operator.
     if (parsePrimaryExpr(Res, EndLoc))
       return true;
-    Res = MCUnaryExpr::createMinus(Res, getContext());
+    Res = MCUnaryExpr::createMinus(Res, getContext(), FirstTokenLoc);
     return false;
   case AsmToken::Plus:
     Lex(); // Eat the operator.
     if (parsePrimaryExpr(Res, EndLoc))
       return true;
-    Res = MCUnaryExpr::createPlus(Res, getContext());
+    Res = MCUnaryExpr::createPlus(Res, getContext(), FirstTokenLoc);
     return false;
   case AsmToken::Tilde:
     Lex(); // Eat the operator.
     if (parsePrimaryExpr(Res, EndLoc))
       return true;
-    Res = MCUnaryExpr::createNot(Res, getContext());
+    Res = MCUnaryExpr::createNot(Res, getContext(), FirstTokenLoc);
     return false;
   // MIPS unary expression operators. The lexer won't generate these tokens if
   // MCAsmInfo::HasMipsExpressions is false for the target.
@@ -1618,7 +1624,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
       if (ParsingInlineAsm && SI) {
         StringRef RewrittenLabel =
             SI->LookupInlineAsmLabel(IDVal, getSourceManager(), IDLoc, true);
-        assert(RewrittenLabel.size() &&
+        assert(!RewrittenLabel.empty() &&
                "We should have an internal name here.");
         Info.AsmRewrites->emplace_back(AOK_Label, IDLoc, IDVal.size(),
                                        RewrittenLabel);
@@ -1627,12 +1633,6 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
       Sym = getContext().getOrCreateSymbol(IDVal);
     } else
       Sym = Ctx.createDirectionalLocalSymbol(LocalLabelVal);
-
-    Sym->redefineIfPossible();
-
-    if (!Sym->isUndefined() || Sym->isVariable())
-      return Error(IDLoc, "invalid symbol redefinition");
-
     // End of Labels should be treated as end of line for lexing
     // purposes but that information is not available to the Lexer who
     // does not understand Labels. This may cause us to see a Hash
@@ -1651,7 +1651,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
 
     // Emit the label.
     if (!ParsingInlineAsm)
-      Out.EmitLabel(Sym);
+      Out.EmitLabel(Sym, IDLoc);
 
     // If we are generating dwarf for assembly source files then gather the
     // info to make a dwarf label entry for this label if needed.
@@ -1980,7 +1980,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
   if (ParsingInlineAsm && (IDVal == "align" || IDVal == "ALIGN"))
     return parseDirectiveMSAlign(IDLoc, Info);
 
-  if (ParsingInlineAsm && (IDVal == "even"))
+  if (ParsingInlineAsm && (IDVal == "even" || IDVal == "EVEN"))
     Info.AsmRewrites->emplace_back(AOK_EVEN, IDLoc, 4);
   if (checkForValidSection())
     return true;
@@ -2026,7 +2026,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
     // If we previously parsed a cpp hash file line comment then make sure the
     // current Dwarf File is for the CppHashFilename if not then emit the
     // Dwarf File table for it and adjust the line number for the .loc.
-    if (CppHashInfo.Filename.size()) {
+    if (!CppHashInfo.Filename.empty()) {
       unsigned FileNumber = getStreamer().EmitDwarfFileDirective(
           0, StringRef(), CppHashInfo.Filename);
       getContext().setGenDwarfFileNumber(FileNumber);
@@ -3874,6 +3874,12 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
     if (parseIdentifier(Parameter.Name))
       return TokError("expected identifier in '.macro' directive");
 
+    // Emit an error if two (or more) named parameters share the same name
+    for (const MCAsmMacroParameter& CurrParam : Parameters)
+      if (CurrParam.Name.equals(Parameter.Name))
+        return TokError("macro '" + Name + "' has multiple parameters"
+                        " named '" + Parameter.Name + "'");
+
     if (Lexer.is(AsmToken::Colon)) {
       Lex();  // consume ':'
 
@@ -4192,7 +4198,6 @@ bool AsmParser::parseDirectiveBundleUnlock() {
 /// parseDirectiveSpace
 /// ::= (.skip | .space) expression [ , expression ]
 bool AsmParser::parseDirectiveSpace(StringRef IDVal) {
-
   SMLoc NumBytesLoc = Lexer.getLoc();
   const MCExpr *NumBytes;
   if (checkForValidSection() || parseExpression(NumBytes))
@@ -4288,7 +4293,6 @@ bool AsmParser::parseDirectiveRealDCB(StringRef IDVal, const fltSemantics &Seman
 /// parseDirectiveDS
 /// ::= .ds.{b, d, l, p, s, w, x} expression
 bool AsmParser::parseDirectiveDS(StringRef IDVal, unsigned Size) {
-
   SMLoc NumValuesLoc = Lexer.getLoc();
   int64_t NumValues;
   if (checkForValidSection() || parseAbsoluteExpression(NumValues))
@@ -4417,6 +4421,7 @@ bool AsmParser::parseDirectiveComm(bool IsLocal) {
     return Error(Pow2AlignmentLoc, "invalid '.comm' or '.lcomm' directive "
                                    "alignment, can't be less than zero");
 
+  Sym->redefineIfPossible();
   if (!Sym->isUndefined())
     return Error(IDLoc, "invalid symbol redefinition");
 
@@ -5209,7 +5214,7 @@ static int rewritesSort(const AsmRewrite *AsmRewriteA,
 
 bool AsmParser::parseMSInlineAsm(
     void *AsmLoc, std::string &AsmString, unsigned &NumOutputs,
-    unsigned &NumInputs, SmallVectorImpl<std::pair<void *, bool> > &OpDecls,
+    unsigned &NumInputs, SmallVectorImpl<std::pair<void *, bool>> &OpDecls,
     SmallVectorImpl<std::string> &Constraints,
     SmallVectorImpl<std::string> &Clobbers, const MCInstrInfo *MII,
     const MCInstPrinter *IP, MCAsmParserSemaCallback &SI) {
@@ -5519,6 +5524,7 @@ bool parseAssignmentExpression(StringRef Name, bool allow_redef,
 
 /// \brief Create an MCAsmParser instance.
 MCAsmParser *llvm::createMCAsmParser(SourceMgr &SM, MCContext &C,
-                                     MCStreamer &Out, const MCAsmInfo &MAI) {
-  return new AsmParser(SM, C, Out, MAI);
+                                     MCStreamer &Out, const MCAsmInfo &MAI,
+                                     unsigned CB) {
+  return new AsmParser(SM, C, Out, MAI, CB);
 }
diff --git a/lib/MC/MCParser/COFFAsmParser.cpp b/lib/MC/MCParser/COFFAsmParser.cpp
index f4114795a92d29b58b62d8030fc36f8d7f12603f..bec62ccb2f7f784f390e0d47ed77a367115fe535 100644
--- a/lib/MC/MCParser/COFFAsmParser.cpp
+++ b/lib/MC/MCParser/COFFAsmParser.cpp
@@ -7,19 +7,27 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCParser/MCAsmParserExtension.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
-#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
 #include "llvm/MC/MCParser/MCTargetAsmParser.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSectionCOFF.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/SectionKind.h"
 #include "llvm/Support/COFF.h"
+#include "llvm/Support/SMLoc.h"
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <utility>
+
 using namespace llvm;
 
 namespace {
@@ -98,12 +106,14 @@ class COFFAsmParser : public MCAsmParserExtension {
                             | COFF::IMAGE_SCN_MEM_READ,
                               SectionKind::getText());
   }
+
   bool ParseSectionDirectiveData(StringRef, SMLoc) {
     return ParseSectionSwitch(".data", COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
                                            COFF::IMAGE_SCN_MEM_READ |
                                            COFF::IMAGE_SCN_MEM_WRITE,
                               SectionKind::getData());
   }
+
   bool ParseSectionDirectiveBSS(StringRef, SMLoc) {
     return ParseSectionSwitch(".bss",
                               COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA
@@ -141,8 +151,9 @@ class COFFAsmParser : public MCAsmParserExtension {
   bool ParseAtUnwindOrAtExcept(bool &unwind, bool &except);
   bool ParseSEHRegisterNumber(unsigned &RegNo);
   bool ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc);
+
 public:
-  COFFAsmParser() {}
+  COFFAsmParser() = default;
 };
 
 } // end annonomous namespace.
@@ -277,7 +288,7 @@ bool COFFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) {
     .Default(MCSA_Invalid);
   assert(Attr != MCSA_Invalid && "unexpected symbol attribute directive!");
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    for (;;) {
+    while (true) {
       StringRef Name;
 
       if (getParser().parseIdentifier(Name))
@@ -466,10 +477,11 @@ bool COFFAsmParser::ParseDirectiveSecRel32(StringRef, SMLoc) {
   if (getLexer().isNot(AsmToken::EndOfStatement))
     return TokError("unexpected token in directive");
 
-  if (Offset < 0 || Offset > UINT32_MAX)
-    return Error(OffsetLoc,
-                 "invalid '.secrel32' directive offset, can't be less "
-                 "than zero or greater than UINT32_MAX");
+  if (Offset < 0 || Offset > std::numeric_limits<uint32_t>::max())
+    return Error(
+        OffsetLoc,
+        "invalid '.secrel32' directive offset, can't be less "
+        "than zero or greater than std::numeric_limits<uint32_t>::max()");
 
   MCSymbol *Symbol = getContext().getOrCreateSymbol(SymbolID);
 
@@ -817,4 +829,4 @@ MCAsmParserExtension *createCOFFAsmParser() {
   return new COFFAsmParser;
 }
 
-}
+} // end namespace llvm
diff --git a/lib/MC/MCParser/DarwinAsmParser.cpp b/lib/MC/MCParser/DarwinAsmParser.cpp
index 94aa70ef0326aa7659badc4cdb8bccdc7fa854e9..73a7ad0500c372a4fc8a658e22bb4f80ed29a63a 100644
--- a/lib/MC/MCParser/DarwinAsmParser.cpp
+++ b/lib/MC/MCParser/DarwinAsmParser.cpp
@@ -7,22 +7,35 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
 #include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
 #include "llvm/MC/MCSectionMachO.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/SectionKind.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/MachO.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SMLoc.h"
 #include "llvm/Support/SourceMgr.h"
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <string>
+#include <system_error>
+#include <utility>
+
 using namespace llvm;
 
 namespace {
@@ -44,7 +57,7 @@ class DarwinAsmParser : public MCAsmParserExtension {
   SMLoc LastVersionMinDirective;
 
 public:
-  DarwinAsmParser() {}
+  DarwinAsmParser() = default;
 
   void Initialize(MCAsmParser &Parser) override {
     // Call the base implementation.
@@ -209,37 +222,47 @@ public:
   bool parseSectionDirectiveConst(StringRef, SMLoc) {
     return parseSectionSwitch("__TEXT", "__const");
   }
+
   bool parseSectionDirectiveStaticConst(StringRef, SMLoc) {
     return parseSectionSwitch("__TEXT", "__static_const");
   }
+
   bool parseSectionDirectiveCString(StringRef, SMLoc) {
     return parseSectionSwitch("__TEXT","__cstring",
                               MachO::S_CSTRING_LITERALS);
   }
+
   bool parseSectionDirectiveLiteral4(StringRef, SMLoc) {
     return parseSectionSwitch("__TEXT", "__literal4",
                               MachO::S_4BYTE_LITERALS, 4);
   }
+
   bool parseSectionDirectiveLiteral8(StringRef, SMLoc) {
     return parseSectionSwitch("__TEXT", "__literal8",
                               MachO::S_8BYTE_LITERALS, 8);
   }
+
   bool parseSectionDirectiveLiteral16(StringRef, SMLoc) {
     return parseSectionSwitch("__TEXT","__literal16",
                               MachO::S_16BYTE_LITERALS, 16);
   }
+
   bool parseSectionDirectiveConstructor(StringRef, SMLoc) {
     return parseSectionSwitch("__TEXT","__constructor");
   }
+
   bool parseSectionDirectiveDestructor(StringRef, SMLoc) {
     return parseSectionSwitch("__TEXT","__destructor");
   }
+
   bool parseSectionDirectiveFVMLibInit0(StringRef, SMLoc) {
     return parseSectionSwitch("__TEXT","__fvmlib_init0");
   }
+
   bool parseSectionDirectiveFVMLibInit1(StringRef, SMLoc) {
     return parseSectionSwitch("__TEXT","__fvmlib_init1");
   }
+
   bool parseSectionDirectiveSymbolStub(StringRef, SMLoc) {
     return parseSectionSwitch("__TEXT","__symbol_stub",
                               MachO::S_SYMBOL_STUBS |
@@ -247,144 +270,178 @@ public:
                               // FIXME: Different on PPC and ARM.
                               0, 16);
   }
+
   bool parseSectionDirectivePICSymbolStub(StringRef, SMLoc) {
     return parseSectionSwitch("__TEXT","__picsymbol_stub",
                               MachO::S_SYMBOL_STUBS |
                               MachO::S_ATTR_PURE_INSTRUCTIONS, 0, 26);
   }
+
   bool parseSectionDirectiveData(StringRef, SMLoc) {
     return parseSectionSwitch("__DATA", "__data");
   }
+
   bool parseSectionDirectiveStaticData(StringRef, SMLoc) {
     return parseSectionSwitch("__DATA", "__static_data");
   }
+
   bool parseSectionDirectiveNonLazySymbolPointers(StringRef, SMLoc) {
     return parseSectionSwitch("__DATA", "__nl_symbol_ptr",
                               MachO::S_NON_LAZY_SYMBOL_POINTERS, 4);
   }
+
   bool parseSectionDirectiveLazySymbolPointers(StringRef, SMLoc) {
     return parseSectionSwitch("__DATA", "__la_symbol_ptr",
                               MachO::S_LAZY_SYMBOL_POINTERS, 4);
   }
+
   bool parseSectionDirectiveThreadLocalVariablePointers(StringRef, SMLoc) {
     return parseSectionSwitch("__DATA", "__thread_ptr",
                               MachO::S_THREAD_LOCAL_VARIABLE_POINTERS, 4);
   }
+
   bool parseSectionDirectiveDyld(StringRef, SMLoc) {
     return parseSectionSwitch("__DATA", "__dyld");
   }
+
   bool parseSectionDirectiveModInitFunc(StringRef, SMLoc) {
     return parseSectionSwitch("__DATA", "__mod_init_func",
                               MachO::S_MOD_INIT_FUNC_POINTERS, 4);
   }
+
   bool parseSectionDirectiveModTermFunc(StringRef, SMLoc) {
     return parseSectionSwitch("__DATA", "__mod_term_func",
                               MachO::S_MOD_TERM_FUNC_POINTERS, 4);
   }
+
   bool parseSectionDirectiveConstData(StringRef, SMLoc) {
     return parseSectionSwitch("__DATA", "__const");
   }
+
   bool parseSectionDirectiveObjCClass(StringRef, SMLoc) {
     return parseSectionSwitch("__OBJC", "__class",
                               MachO::S_ATTR_NO_DEAD_STRIP);
   }
+
   bool parseSectionDirectiveObjCMetaClass(StringRef, SMLoc) {
     return parseSectionSwitch("__OBJC", "__meta_class",
                               MachO::S_ATTR_NO_DEAD_STRIP);
   }
+
   bool parseSectionDirectiveObjCCatClsMeth(StringRef, SMLoc) {
     return parseSectionSwitch("__OBJC", "__cat_cls_meth",
                               MachO::S_ATTR_NO_DEAD_STRIP);
   }
+
   bool parseSectionDirectiveObjCCatInstMeth(StringRef, SMLoc) {
     return parseSectionSwitch("__OBJC", "__cat_inst_meth",
                               MachO::S_ATTR_NO_DEAD_STRIP);
   }
+
   bool parseSectionDirectiveObjCProtocol(StringRef, SMLoc) {
     return parseSectionSwitch("__OBJC", "__protocol",
                               MachO::S_ATTR_NO_DEAD_STRIP);
   }
+
   bool parseSectionDirectiveObjCStringObject(StringRef, SMLoc) {
     return parseSectionSwitch("__OBJC", "__string_object",
                               MachO::S_ATTR_NO_DEAD_STRIP);
   }
+
   bool parseSectionDirectiveObjCClsMeth(StringRef, SMLoc) {
     return parseSectionSwitch("__OBJC", "__cls_meth",
                               MachO::S_ATTR_NO_DEAD_STRIP);
   }
+
   bool parseSectionDirectiveObjCInstMeth(StringRef, SMLoc) {
     return parseSectionSwitch("__OBJC", "__inst_meth",
                               MachO::S_ATTR_NO_DEAD_STRIP);
   }
+
   bool parseSectionDirectiveObjCClsRefs(StringRef, SMLoc) {
     return parseSectionSwitch("__OBJC", "__cls_refs",
                               MachO::S_ATTR_NO_DEAD_STRIP |
                               MachO::S_LITERAL_POINTERS, 4);
   }
+
   bool parseSectionDirectiveObjCMessageRefs(StringRef, SMLoc) {
     return parseSectionSwitch("__OBJC", "__message_refs",
                               MachO::S_ATTR_NO_DEAD_STRIP |
                               MachO::S_LITERAL_POINTERS, 4);
   }
+
   bool parseSectionDirectiveObjCSymbols(StringRef, SMLoc) {
     return parseSectionSwitch("__OBJC", "__symbols",
                               MachO::S_ATTR_NO_DEAD_STRIP);
   }
+
   bool parseSectionDirectiveObjCCategory(StringRef, SMLoc) {
     return parseSectionSwitch("__OBJC", "__category",
                               MachO::S_ATTR_NO_DEAD_STRIP);
   }
+
   bool parseSectionDirectiveObjCClassVars(StringRef, SMLoc) {
     return parseSectionSwitch("__OBJC", "__class_vars",
                               MachO::S_ATTR_NO_DEAD_STRIP);
   }
+
   bool parseSectionDirectiveObjCInstanceVars(StringRef, SMLoc) {
     return parseSectionSwitch("__OBJC", "__instance_vars",
                               MachO::S_ATTR_NO_DEAD_STRIP);
   }
+
   bool parseSectionDirectiveObjCModuleInfo(StringRef, SMLoc) {
     return parseSectionSwitch("__OBJC", "__module_info",
                               MachO::S_ATTR_NO_DEAD_STRIP);
   }
+
   bool parseSectionDirectiveObjCClassNames(StringRef, SMLoc) {
     return parseSectionSwitch("__TEXT", "__cstring",
                               MachO::S_CSTRING_LITERALS);
   }
+
   bool parseSectionDirectiveObjCMethVarTypes(StringRef, SMLoc) {
     return parseSectionSwitch("__TEXT", "__cstring",
                               MachO::S_CSTRING_LITERALS);
   }
+
   bool parseSectionDirectiveObjCMethVarNames(StringRef, SMLoc) {
     return parseSectionSwitch("__TEXT", "__cstring",
                               MachO::S_CSTRING_LITERALS);
   }
+
   bool parseSectionDirectiveObjCSelectorStrs(StringRef, SMLoc) {
     return parseSectionSwitch("__OBJC", "__selector_strs",
                               MachO::S_CSTRING_LITERALS);
   }
+
   bool parseSectionDirectiveTData(StringRef, SMLoc) {
     return parseSectionSwitch("__DATA", "__thread_data",
                               MachO::S_THREAD_LOCAL_REGULAR);
   }
+
   bool parseSectionDirectiveText(StringRef, SMLoc) {
     return parseSectionSwitch("__TEXT", "__text",
                               MachO::S_ATTR_PURE_INSTRUCTIONS);
   }
+
   bool parseSectionDirectiveTLV(StringRef, SMLoc) {
     return parseSectionSwitch("__DATA", "__thread_vars",
                               MachO::S_THREAD_LOCAL_VARIABLES);
   }
+
   bool parseSectionDirectiveIdent(StringRef, SMLoc) {
     // Darwin silently ignores the .ident directive.
     getParser().eatToEndOfStatement();
     return false;
   }
+
   bool parseSectionDirectiveThreadInitFunc(StringRef, SMLoc) {
     return parseSectionSwitch("__DATA", "__thread_init",
                          MachO::S_THREAD_LOCAL_INIT_FUNCTION_POINTERS);
   }
-  bool parseVersionMin(StringRef, SMLoc);
 
+  bool parseVersionMin(StringRef, SMLoc);
 };
 
 } // end anonymous namespace
@@ -526,7 +583,7 @@ bool DarwinAsmParser::parseDirectiveDumpOrLoad(StringRef Directive,
 ///  ::= .linker_option "string" ( , "string" )*
 bool DarwinAsmParser::parseDirectiveLinkerOption(StringRef IDVal, SMLoc) {
   SmallVector<std::string, 4> Args;
-  for (;;) {
+  while (true) {
     if (getLexer().isNot(AsmToken::String))
       return TokError("expected string in '" + Twine(IDVal) + "' directive");
 
@@ -604,7 +661,6 @@ bool DarwinAsmParser::parseDirectiveSection(StringRef, SMLoc) {
     return TokError("unexpected token in '.section' directive");
   Lex();
 
-
   StringRef Segment, Section;
   unsigned StubSize;
   unsigned TAA;
diff --git a/lib/MC/MCParser/ELFAsmParser.cpp b/lib/MC/MCParser/ELFAsmParser.cpp
index 8d7ba0d03362c06955b3fc255ff1c574053db04a..401011a027f42e8d2e123c1d1453e8e7908df549 100644
--- a/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/lib/MC/MCParser/ELFAsmParser.cpp
@@ -7,17 +7,29 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCParser/MCAsmParserExtension.h"
 #include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ELF.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/SMLoc.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+
 using namespace llvm;
 
 namespace {
@@ -142,9 +154,14 @@ private:
   bool ParseSectionName(StringRef &SectionName);
   bool ParseSectionArguments(bool IsPush, SMLoc loc);
   unsigned parseSunStyleSectionFlags();
+  bool maybeParseSectionType(StringRef &TypeName);
+  bool parseMergeSize(int64_t &Size);
+  bool parseGroup(StringRef &GroupName);
+  bool parseMetadataSym(MCSymbolELF *&Associated);
+  bool maybeParseUniqueID(int64_t &UniqueID);
 };
 
-}
+} // end anonymous namespace
 
 /// ParseDirectiveSymbolAttribute
 ///  ::= { ".local", ".weak", ... } [ identifier ( , identifier )* ]
@@ -158,7 +175,7 @@ bool ELFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) {
     .Default(MCSA_Invalid);
   assert(Attr != MCSA_Invalid && "unexpected symbol attribute directive!");
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    for (;;) {
+    while (true) {
       StringRef Name;
 
       if (getParser().parseIdentifier(Name))
@@ -230,8 +247,7 @@ bool ELFAsmParser::ParseSectionName(StringRef &SectionName) {
     return false;
   }
 
-  for (;;) {
-    
+  while (true) {
     SMLoc PrevLoc = getLexer().getLoc();
     if (getLexer().is(AsmToken::Comma) ||
       getLexer().is(AsmToken::EndOfStatement))
@@ -282,6 +298,9 @@ static unsigned parseSectionFlags(StringRef flagsStr, bool *UseLastGroup) {
     case 'w':
       flags |= ELF::SHF_WRITE;
       break;
+    case 'o':
+      flags |= ELF::SHF_LINK_ORDER;
+      break;
     case 'M':
       flags |= ELF::SHF_MERGE;
       break;
@@ -366,6 +385,97 @@ bool ELFAsmParser::ParseDirectiveSection(StringRef, SMLoc loc) {
   return ParseSectionArguments(/*IsPush=*/false, loc);
 }
 
+/// Parse an optional section type: a comma, then @type, %type or "type".
+/// No comma means no type and is not an error. Returns true on error.
+bool ELFAsmParser::maybeParseSectionType(StringRef &TypeName) {
+  MCAsmLexer &Lexer = getLexer();
+  if (Lexer.isNot(AsmToken::Comma))
+    return false;
+  Lex();
+  const bool Quoted = Lexer.is(AsmToken::String);
+  if (!Quoted && Lexer.isNot(AsmToken::At) && Lexer.isNot(AsmToken::Percent))
+    return TokError(Lexer.getAllowAtInIdentifier()
+                        ? "expected '@<type>', '%<type>' or \"<type>\""
+                        : "expected '%<type>' or \"<type>\"");
+  if (!Quoted)
+    Lex(); // Skip the '@' or '%' sigil.
+  if (Lexer.isNot(AsmToken::Integer))
+    return getParser().parseIdentifier(TypeName) &&
+           TokError("expected identifier in directive");
+  TypeName = getTok().getString(); // Numeric section type.
+  Lex();
+  return false;
+}
+
+/// Parse the ", <entsize>" operand required by a mergeable section.
+/// The entry size must be an absolute, positive value. Returns true on error.
+bool ELFAsmParser::parseMergeSize(int64_t &Size) {
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("expected the entry size");
+  Lex();
+  if (getParser().parseAbsoluteExpression(Size))
+    return true;
+  return Size <= 0 && TokError("entry size must be positive");
+}
+
+/// Parse the ", <group>" operand of a group section, plus an optional
+/// trailing ", comdat" linkage keyword. Returns true on error.
+bool ELFAsmParser::parseGroup(StringRef &GroupName) {
+  MCAsmLexer &Lexer = getLexer();
+  if (Lexer.isNot(AsmToken::Comma))
+    return TokError("expected group name");
+  Lex();
+  if (getParser().parseIdentifier(GroupName))
+    return true;
+  if (Lexer.isNot(AsmToken::Comma))
+    return false; // No linkage keyword given.
+  Lex();
+  StringRef Linkage;
+  if (getParser().parseIdentifier(Linkage))
+    return true;
+  return Linkage != "comdat" && TokError("Linkage must be 'comdat'");
+}
+
+/// Parse the ", <symbol>" operand naming a section's associated symbol.
+/// It must look up as an MCSymbolELF in a section. Returns true on error.
+bool ELFAsmParser::parseMetadataSym(MCSymbolELF *&Associated) {
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("expected metadata symbol");
+  Lex();
+  StringRef Name;
+  if (getParser().parseIdentifier(Name))
+    return true;
+  Associated = dyn_cast_or_null<MCSymbolELF>(getContext().lookupSymbol(Name));
+  const bool Usable = Associated && Associated->isInSection();
+  return !Usable && TokError("symbol is not in a section: " + Name);
+}
+
+/// Parse an optional ", unique, <id>" suffix; no comma is not an error.
+/// The id must be >= 0, fit in 32 bits and not be ~0U. Returns true on error.
+bool ELFAsmParser::maybeParseUniqueID(int64_t &UniqueID) {
+  if (getLexer().isNot(AsmToken::Comma))
+    return false;
+  Lex();
+  StringRef Keyword;
+  if (getParser().parseIdentifier(Keyword))
+    return TokError("expected identifier in directive");
+  if (Keyword != "unique")
+    return TokError("expected 'unique'");
+  if (getLexer().isNot(AsmToken::Comma))
+    return TokError("expected commma");
+  Lex();
+  if (getParser().parseAbsoluteExpression(UniqueID))
+    return true;
+  if (UniqueID < 0)
+    return TokError("unique id must be positive");
+  const bool TooLarge = !isUInt<32>(UniqueID) || UniqueID == ~0U;
+  return TooLarge && TokError("unique id is too large");
+}
+// True if SectionName starts with Prefix or equals Prefix minus its last char.
+static bool hasPrefix(StringRef SectionName, StringRef Prefix) {
+  return SectionName.startswith(Prefix) || SectionName == Prefix.drop_back();
+}
+
 bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
   StringRef SectionName;
 
@@ -379,14 +489,24 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
   const MCExpr *Subsection = nullptr;
   bool UseLastGroup = false;
   StringRef UniqueStr;
+  MCSymbolELF *Associated = nullptr;
   int64_t UniqueID = ~0;
 
   // Set the defaults first.
-  if (SectionName == ".fini" || SectionName == ".init" ||
-      SectionName == ".rodata")
+  if (hasPrefix(SectionName, ".rodata.") || SectionName == ".rodata1")
     Flags |= ELF::SHF_ALLOC;
-  if (SectionName == ".fini" || SectionName == ".init")
-    Flags |= ELF::SHF_EXECINSTR;
+  if (SectionName == ".fini" || SectionName == ".init" ||
+      hasPrefix(SectionName, ".text."))
+    Flags |= ELF::SHF_ALLOC | ELF::SHF_EXECINSTR;
+  if (hasPrefix(SectionName, ".data.") || SectionName == ".data1" ||
+      hasPrefix(SectionName, ".bss.") ||
+      hasPrefix(SectionName, ".init_array.") ||
+      hasPrefix(SectionName, ".fini_array.") ||
+      hasPrefix(SectionName, ".preinit_array."))
+    Flags |= ELF::SHF_ALLOC | ELF::SHF_WRITE;
+  if (hasPrefix(SectionName, ".tdata.") ||
+      hasPrefix(SectionName, ".tbss."))
+    Flags |= ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_TLS;
 
   if (getLexer().is(AsmToken::Comma)) {
     Lex();
@@ -422,65 +542,30 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
       return TokError("Section cannot specifiy a group name while also acting "
                       "as a member of the last group");
 
-    if (getLexer().isNot(AsmToken::Comma)) {
+    if (maybeParseSectionType(TypeName))
+      return true;
+
+    MCAsmLexer &L = getLexer();
+    if (TypeName.empty()) {
       if (Mergeable)
         return TokError("Mergeable section must specify the type");
       if (Group)
         return TokError("Group section must specify the type");
-    } else {
-      Lex();
-      if (getLexer().is(AsmToken::At) || getLexer().is(AsmToken::Percent) ||
-          getLexer().is(AsmToken::String)) {
-        if (!getLexer().is(AsmToken::String))
-          Lex();
-      } else
-        return TokError("expected '@<type>', '%<type>' or \"<type>\"");
-
-      if (getParser().parseIdentifier(TypeName))
-        return TokError("expected identifier in directive");
-
-      if (Mergeable) {
-        if (getLexer().isNot(AsmToken::Comma))
-          return TokError("expected the entry size");
-        Lex();
-        if (getParser().parseAbsoluteExpression(Size))
-          return true;
-        if (Size <= 0)
-          return TokError("entry size must be positive");
-      }
-
-      if (Group) {
-        if (getLexer().isNot(AsmToken::Comma))
-          return TokError("expected group name");
-        Lex();
-        if (getParser().parseIdentifier(GroupName))
-          return true;
-        if (getLexer().is(AsmToken::Comma)) {
-          Lex();
-          StringRef Linkage;
-          if (getParser().parseIdentifier(Linkage))
-            return true;
-          if (Linkage != "comdat")
-            return TokError("Linkage must be 'comdat'");
-        }
-      }
-      if (getLexer().is(AsmToken::Comma)) {
-        Lex();
-        if (getParser().parseIdentifier(UniqueStr))
-          return TokError("expected identifier in directive");
-        if (UniqueStr != "unique")
-          return TokError("expected 'unique'");
-        if (getLexer().isNot(AsmToken::Comma))
-          return TokError("expected commma");
-        Lex();
-        if (getParser().parseAbsoluteExpression(UniqueID))
-          return true;
-        if (UniqueID < 0)
-          return TokError("unique id must be positive");
-        if (!isUInt<32>(UniqueID) || UniqueID == ~0U)
-          return TokError("unique id is too large");
-      }
+      if (L.isNot(AsmToken::EndOfStatement))
+        return TokError("unexpected token in directive");
     }
+
+    if (Mergeable)
+      if (parseMergeSize(Size))
+        return true;
+    if (Group)
+      if (parseGroup(GroupName))
+        return true;
+    if (Flags & ELF::SHF_LINK_ORDER)
+      if (parseMetadataSym(Associated))
+        return true;
+    if (maybeParseUniqueID(UniqueID))
+      return true;
   }
 
 EndStmt:
@@ -493,11 +578,15 @@ EndStmt:
   if (TypeName.empty()) {
     if (SectionName.startswith(".note"))
       Type = ELF::SHT_NOTE;
-    else if (SectionName == ".init_array")
+    else if (hasPrefix(SectionName, ".init_array."))
       Type = ELF::SHT_INIT_ARRAY;
-    else if (SectionName == ".fini_array")
+    else if (hasPrefix(SectionName, ".bss."))
+      Type = ELF::SHT_NOBITS;
+    else if (hasPrefix(SectionName, ".tbss."))
+      Type = ELF::SHT_NOBITS;
+    else if (hasPrefix(SectionName, ".fini_array."))
       Type = ELF::SHT_FINI_ARRAY;
-    else if (SectionName == ".preinit_array")
+    else if (hasPrefix(SectionName, ".preinit_array."))
       Type = ELF::SHT_PREINIT_ARRAY;
   } else {
     if (TypeName == "init_array")
@@ -514,7 +603,7 @@ EndStmt:
       Type = ELF::SHT_NOTE;
     else if (TypeName == "unwind")
       Type = ELF::SHT_X86_64_UNWIND;
-    else
+    else if (TypeName.getAsInteger(0, Type))
       return TokError("unknown section type");
   }
 
@@ -528,8 +617,9 @@ EndStmt:
       }
   }
 
-  MCSection *ELFSection = getContext().getELFSection(SectionName, Type, Flags,
-                                                     Size, GroupName, UniqueID);
+  MCSection *ELFSection =
+      getContext().getELFSection(SectionName, Type, Flags, Size, GroupName,
+                                 UniqueID, Associated);
   getStreamer().SwitchSection(ELFSection, Subsection);
 
   if (getContext().getGenDwarfForAssembly()) {
@@ -677,6 +767,7 @@ bool ELFAsmParser::ParseDirectiveSymver(StringRef, SMLoc) {
   const MCExpr *Value = MCSymbolRefExpr::create(Sym, getContext());
 
   getStreamer().EmitAssignment(Alias, Value);
+  getStreamer().emitELFSymverDirective(Alias, Sym);
   return false;
 }
 
@@ -752,4 +843,4 @@ MCAsmParserExtension *createELFAsmParser() {
   return new ELFAsmParser;
 }
 
-}
+} // end namespace llvm
diff --git a/lib/MC/MCParser/MCAsmLexer.cpp b/lib/MC/MCParser/MCAsmLexer.cpp
index 63c0daba09a043eed5f71a6cb19b1a78dea6a9ec..f8fe78aece0cacd11c43581f330840ea1d3f244b 100644
--- a/lib/MC/MCParser/MCAsmLexer.cpp
+++ b/lib/MC/MCParser/MCAsmLexer.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmLexer.cpp - Abstract Asm Lexer Interface ---------------------===//
+//===- MCAsmLexer.cpp - Abstract Asm Lexer Interface ----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,19 +7,17 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
-#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/SMLoc.h"
 
 using namespace llvm;
 
-MCAsmLexer::MCAsmLexer()
-    : TokStart(nullptr), SkipSpace(true), IsAtStartOfStatement(true),
-      CommentConsumer(nullptr) {
+MCAsmLexer::MCAsmLexer() {
   CurTok.emplace_back(AsmToken::Space, StringRef());
 }
 
-MCAsmLexer::~MCAsmLexer() {
-}
+MCAsmLexer::~MCAsmLexer() = default;
 
 SMLoc MCAsmLexer::getLoc() const {
   return SMLoc::getFromPointer(TokStart);
diff --git a/lib/MC/MCParser/MCAsmParser.cpp b/lib/MC/MCParser/MCAsmParser.cpp
index 055bf5e8210017dc1266092e2d68fa5c95356b93..27b37f3e2dfbc930ab1cab8f3f5b42e029b5940c 100644
--- a/lib/MC/MCParser/MCAsmParser.cpp
+++ b/lib/MC/MCParser/MCAsmParser.cpp
@@ -7,22 +7,22 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
 #include "llvm/MC/MCParser/MCTargetAsmParser.h"
 #include "llvm/Support/Debug.h"
-#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/SMLoc.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
 using namespace llvm;
 
-MCAsmParser::MCAsmParser()
-    : TargetParser(nullptr), ShowParsedOperands(0), HadError(false),
-      PendingErrors() {}
+MCAsmParser::MCAsmParser() : ShowParsedOperands(0) {}
 
-MCAsmParser::~MCAsmParser() {
-}
+MCAsmParser::~MCAsmParser() = default;
 
 void MCAsmParser::setTargetParser(MCTargetAsmParser &P) {
   assert(!TargetParser && "Target parser is already initialized!");
@@ -121,7 +121,7 @@ bool MCAsmParser::addErrorSuffix(const Twine &Suffix) {
 bool MCAsmParser::parseMany(function_ref<bool()> parseOne, bool hasComma) {
   if (parseOptionalToken(AsmToken::EndOfStatement))
     return false;
-  while (1) {
+  while (true) {
     if (parseOne())
       return true;
     if (parseOptionalToken(AsmToken::EndOfStatement))
diff --git a/lib/MC/MCParser/MCAsmParserExtension.cpp b/lib/MC/MCParser/MCAsmParserExtension.cpp
index 3f25a14926b6a4b82c3f1ff034dbbce1097a9646..031f473dc5fe4bac44daf53786138c8bddbcca88 100644
--- a/lib/MC/MCParser/MCAsmParserExtension.cpp
+++ b/lib/MC/MCParser/MCAsmParserExtension.cpp
@@ -1,4 +1,4 @@
-//===-- MCAsmParserExtension.cpp - Asm Parser Hooks -----------------------===//
+//===- MCAsmParserExtension.cpp - Asm Parser Hooks ------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -8,14 +8,12 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/MCParser/MCAsmParserExtension.h"
+
 using namespace llvm;
 
-MCAsmParserExtension::MCAsmParserExtension() :
-  BracketExpressionsSupported(false) {
-}
+MCAsmParserExtension::MCAsmParserExtension() = default;
 
-MCAsmParserExtension::~MCAsmParserExtension() {
-}
+MCAsmParserExtension::~MCAsmParserExtension() = default;
 
 void MCAsmParserExtension::Initialize(MCAsmParser &Parser) {
   this->Parser = &Parser;
diff --git a/lib/MC/MCParser/MCTargetAsmParser.cpp b/lib/MC/MCParser/MCTargetAsmParser.cpp
index 14a22c6b8a2fc63374f313bea4191855c9f6c519..5f821443bb9649895ed9939c26c6b151c52da0c6 100644
--- a/lib/MC/MCParser/MCTargetAsmParser.cpp
+++ b/lib/MC/MCParser/MCTargetAsmParser.cpp
@@ -1,4 +1,4 @@
-//===-- MCTargetAsmParser.cpp - Target Assembly Parser ---------------------==//
+//===-- MCTargetAsmParser.cpp - Target Assembly Parser --------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,19 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCParser/MCTargetAsmParser.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+
 using namespace llvm;
 
 MCTargetAsmParser::MCTargetAsmParser(MCTargetOptions const &MCOptions,
                                      const MCSubtargetInfo &STI)
-  : AvailableFeatures(0), ParsingInlineAsm(false), MCOptions(MCOptions),
-    STI(&STI)
-{
-}
+  : MCOptions(MCOptions), STI(&STI) {}
 
-MCTargetAsmParser::~MCTargetAsmParser() {
-}
+MCTargetAsmParser::~MCTargetAsmParser() = default;
 
 MCSubtargetInfo &MCTargetAsmParser::copySTI() {
   MCSubtargetInfo &STICopy = getContext().getSubtargetCopy(getSTI());
diff --git a/lib/MC/MCRegisterInfo.cpp b/lib/MC/MCRegisterInfo.cpp
index ea117f3caa85ebe89eb07be5bb45fece942a43db..a75100a4876b7c015f0a81ab0b12c89a449b92a6 100644
--- a/lib/MC/MCRegisterInfo.cpp
+++ b/lib/MC/MCRegisterInfo.cpp
@@ -1,4 +1,4 @@
-//=== MC/MCRegisterInfo.cpp - Target Register Description -------*- C++ -*-===//
+//===- MC/MCRegisterInfo.cpp - Target Register Description ----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,9 +11,12 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/DenseMap.h"
 #include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/Support/Format.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
 
 using namespace llvm;
 
diff --git a/lib/MC/MCSection.cpp b/lib/MC/MCSection.cpp
index 7a42a2758e88f85d3883c1b9e368ab53715735e4..7986c01220434cfd714d1052190c1587c25316ec 100644
--- a/lib/MC/MCSection.cpp
+++ b/lib/MC/MCSection.cpp
@@ -7,17 +7,18 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCFragment.h"
+#include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
-using namespace llvm;
+#include <algorithm>
+#include <utility>
 
-//===----------------------------------------------------------------------===//
-// MCSection
-//===----------------------------------------------------------------------===//
+using namespace llvm;
 
 MCSection::MCSection(SectionVariant V, SectionKind K, MCSymbol *Begin)
     : Begin(Begin), BundleGroupBeforeFirstInst(false), HasInstructions(false),
@@ -31,8 +32,7 @@ MCSymbol *MCSection::getEndSymbol(MCContext &Ctx) {
 
 bool MCSection::hasEnded() const { return End && End->isInSection(); }
 
-MCSection::~MCSection() {
-}
+MCSection::~MCSection() = default;
 
 void MCSection::setBundleLockState(BundleLockStateType NewState) {
   if (NewState == NotBundleLocked) {
@@ -87,7 +87,7 @@ MCSection::getSubsectionInsertionPoint(unsigned Subsection) {
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
 LLVM_DUMP_METHOD void MCSection::dump() {
-  raw_ostream &OS = llvm::errs();
+  raw_ostream &OS = errs();
 
   OS << "<MCSection";
   OS << " Fragments:[\n      ";
diff --git a/lib/MC/MCSectionCOFF.cpp b/lib/MC/MCSectionCOFF.cpp
index 4e2df01c89c272aec147ddfb2385025e943c0fcb..f0709cbc25153647aa0fa63a16ce3e3ccb38f904 100644
--- a/lib/MC/MCSectionCOFF.cpp
+++ b/lib/MC/MCSectionCOFF.cpp
@@ -8,14 +8,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/MC/MCSectionCOFF.h"
-#include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/Support/COFF.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
 using namespace llvm;
 
-MCSectionCOFF::~MCSectionCOFF() {} // anchor.
+MCSectionCOFF::~MCSectionCOFF() = default; // anchor.
 
 // ShouldOmitSectionDirective - Decides whether a '.section' directive
 // should be printed before the section name
@@ -40,7 +40,6 @@ void MCSectionCOFF::setSelection(int Selection) const {
 void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
                                          raw_ostream &OS,
                                          const MCExpr *Subsection) const {
-
   // standard sections don't require the '.section'
   if (ShouldOmitSectionDirective(SectionName, MAI)) {
     OS << '\t' << getSectionName() << '\n';
@@ -94,7 +93,7 @@ void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
         OS << "newest,";
         break;
       default:
-        assert (0 && "unsupported COFF selection type");
+        assert(false && "unsupported COFF selection type");
         break;
     }
     assert(COMDATSymbol);
diff --git a/lib/MC/MCSectionELF.cpp b/lib/MC/MCSectionELF.cpp
index 422652e5ef50cf80f3e12992c5256c8ae56afcf5..78fe01cca24a3b1adc37b2133fb2b313d4230a0b 100644
--- a/lib/MC/MCSectionELF.cpp
+++ b/lib/MC/MCSectionELF.cpp
@@ -7,23 +7,23 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCSectionELF.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/MC/MCAsmInfo.h"
-#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSectionELF.h"
 #include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>
 
 using namespace llvm;
 
-MCSectionELF::~MCSectionELF() {} // anchor.
+MCSectionELF::~MCSectionELF() = default; // anchor.
 
 // Decides whether a '.section' directive
 // should be printed before the section name.
 bool MCSectionELF::ShouldOmitSectionDirective(StringRef Name,
                                               const MCAsmInfo &MAI) const {
-
   if (isUnique())
     return false;
 
@@ -56,7 +56,6 @@ static void printName(raw_ostream &OS, StringRef Name) {
 void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
                                         raw_ostream &OS,
                                         const MCExpr *Subsection) const {
-
   if (ShouldOmitSectionDirective(SectionName, MAI)) {
     OS << '\t' << getSectionName();
     if (Subsection) {
@@ -104,6 +103,8 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
     OS << 'S';
   if (Flags & ELF::SHF_TLS)
     OS << 'T';
+  if (Flags & ELF::SHF_LINK_ORDER)
+    OS << 'o';
 
   // If there are target-specific flags, print them.
   Triple::ArchType Arch = T.getArch();
@@ -142,6 +143,13 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
     OS << "progbits";
   else if (Type == ELF::SHT_X86_64_UNWIND)
     OS << "unwind";
+  else if (Type == ELF::SHT_MIPS_DWARF)
+    // Print the raw hex value of this section type, since there is
+    // no standard symbolic name for it.
+    OS << "0x7000001e";
+  else
+    report_fatal_error("unsupported type 0x" + Twine::utohexstr(Type) +
+                       " for section " + getSectionName());
 
   if (EntrySize) {
     assert(Flags & ELF::SHF_MERGE);
@@ -154,6 +162,12 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
     OS << ",comdat";
   }
 
+  if (Flags & ELF::SHF_LINK_ORDER) {
+    assert(AssociatedSymbol);
+    OS << ",";
+    printName(OS, AssociatedSymbol->getName());
+  }
+
   if (isUnique())
     OS << ",unique," << UniqueID;
 
diff --git a/lib/MC/MCSectionWasm.cpp b/lib/MC/MCSectionWasm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c61f28e129f575ceb0808cd01f093729bd69ff1a
--- /dev/null
+++ b/lib/MC/MCSectionWasm.cpp
@@ -0,0 +1,97 @@
+//===- lib/MC/MCSectionWasm.cpp - Wasm Code Section Representation --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCSectionWasm.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+MCSectionWasm::~MCSectionWasm() {} // Out-of-line anchor for the vtable.
+
+// Returns true when the '.section' directive should be omitted and only the
+// bare section name printed; the decision is delegated to MCAsmInfo.
+bool MCSectionWasm::ShouldOmitSectionDirective(StringRef Name,
+                                               const MCAsmInfo &MAI) const {
+  return MAI.shouldOmitSectionDirective(Name);
+}
+
+static void printName(raw_ostream &OS, StringRef Name) {
+  if (Name.find_first_not_of("0123456789_."
+                             "abcdefghijklmnopqrstuvwxyz"
+                             "ABCDEFGHIJKLMNOPQRSTUVWXYZ") == Name.npos) {
+    OS << Name; // Only [0-9A-Za-z_.] present: print without quoting.
+    return;
+  }
+  OS << '"';
+  for (const char *B = Name.begin(), *E = Name.end(); B < E; ++B) {
+    if (*B == '"') // Bare double quote: escape it
+      OS << "\\\"";
+    else if (*B != '\\') // Neither " nor backslash: copy through
+      OS << *B;
+    else if (B + 1 == E) // Trailing backslash: double it
+      OS << "\\\\";
+    else {
+      OS << B[0] << B[1]; // Backslash plus next char: copy the pair as-is
+      ++B; // Skip the char already emitted as B[1]
+    }
+  }
+  OS << '"';
+}
+
+void MCSectionWasm::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
+                                         raw_ostream &OS,
+                                         const MCExpr *Subsection) const {
+  // Note: the Triple parameter T is not used by this implementation.
+  if (ShouldOmitSectionDirective(SectionName, MAI)) { // Short form: name only.
+    OS << '\t' << getSectionName();
+    if (Subsection) {
+      OS << '\t';
+      Subsection->print(OS, &MAI);
+    }
+    OS << '\n';
+    return;
+  }
+
+  OS << "\t.section\t";
+  printName(OS, getSectionName());
+  OS << ",\""; // Opening quote of the flags string.
+
+  // TODO: Print section flags (the quoted string is emitted empty for now).
+
+  OS << '"'; // Closing quote of the flags string.
+
+  OS << ',';
+
+  // The type prefix is '@', or '%' when '@' starts a comment (e.g. on ARM).
+  if (MAI.getCommentString()[0] == '@')
+    OS << '%';
+  else
+    OS << '@';
+
+  // TODO: Print section type (only the '@'/'%' prefix is emitted for now).
+
+  if (isUnique())
+    OS << ",unique," << UniqueID;
+
+  OS << '\n';
+
+  if (Subsection) {
+    OS << "\t.subsection\t";
+    Subsection->print(OS, &MAI);
+    OS << '\n';
+  }
+}
+
+bool MCSectionWasm::UseCodeAlign() const { return false; } // Always false.
+
+bool MCSectionWasm::isVirtualSection() const { return false; } // Always false.
diff --git a/lib/MC/MCStreamer.cpp b/lib/MC/MCStreamer.cpp
index fb28f856f671b08518fcde5b836eddf12000ca63..b9c01c66f31d766830be6a42e309208efdec0c45 100644
--- a/lib/MC/MCStreamer.cpp
+++ b/lib/MC/MCStreamer.cpp
@@ -7,36 +7,44 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCStreamer.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCCodeView.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstPrinter.h"
 #include "llvm/MC/MCObjectFileInfo.h"
-#include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCWin64EH.h"
+#include "llvm/MC/MCWinEH.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/COFF.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/LEB128.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 #include <cstdlib>
-using namespace llvm;
+#include <cassert>
+#include <cstdint>
+#include <utility>
 
-// Pin the vtables to this file.
-MCTargetStreamer::~MCTargetStreamer() {}
+using namespace llvm;
 
 MCTargetStreamer::MCTargetStreamer(MCStreamer &S) : Streamer(S) {
   S.setTargetStreamer(this);
 }
 
+// Pin the vtables to this file.
+MCTargetStreamer::~MCTargetStreamer() = default;
+
 void MCTargetStreamer::emitLabel(MCSymbol *Symbol) {}
 
 void MCTargetStreamer::finish() {}
@@ -290,10 +298,17 @@ void MCStreamer::AssignFragment(MCSymbol *Symbol, MCFragment *Fragment) {
   SymbolOrdering[Symbol] = 1 + SymbolOrdering.size();
 }
 
-void MCStreamer::EmitLabel(MCSymbol *Symbol) {
+void MCStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
+  Symbol->redefineIfPossible();
+
+  if (!Symbol->isUndefined() || Symbol->isVariable())
+    return getContext().reportError(Loc, "invalid symbol redefinition");
+
   assert(!Symbol->isVariable() && "Cannot emit a variable symbol!");
   assert(getCurrentSectionOnly() && "Cannot emit before setting section!");
   assert(!Symbol->getFragment() && "Unexpected fragment on symbol data!");
+  assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
+
   Symbol->setFragment(&getCurrentSectionOnly()->getDummyFragment());
 
   MCTargetStreamer *TS = getTargetStreamer();
@@ -666,7 +681,7 @@ void MCStreamer::EmitWinCFISaveXMM(unsigned Register, unsigned Offset) {
 
 void MCStreamer::EmitWinCFIPushFrame(bool Code) {
   EnsureValidWinFrameInfo();
-  if (CurrentWinFrameInfo->Instructions.size() > 0)
+  if (!CurrentWinFrameInfo->Instructions.empty())
     report_fatal_error("If present, PushMachFrame must be the first UOP");
 
   MCSymbol *Label = EmitCFILabel();
@@ -792,12 +807,22 @@ void MCStreamer::emitAbsoluteSymbolDiff(const MCSymbol *Hi, const MCSymbol *Lo,
 void MCStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {}
 void MCStreamer::EmitThumbFunc(MCSymbol *Func) {}
 void MCStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {}
-void MCStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) {}
-void MCStreamer::EndCOFFSymbolDef() {}
+void MCStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) {
+  llvm_unreachable("this directive only supported on COFF targets");
+}
+void MCStreamer::EndCOFFSymbolDef() {
+  llvm_unreachable("this directive only supported on COFF targets");
+}
 void MCStreamer::EmitFileDirective(StringRef Filename) {}
-void MCStreamer::EmitCOFFSymbolStorageClass(int StorageClass) {}
-void MCStreamer::EmitCOFFSymbolType(int Type) {}
+void MCStreamer::EmitCOFFSymbolStorageClass(int StorageClass) {
+  llvm_unreachable("this directive only supported on COFF targets");
+}
+void MCStreamer::EmitCOFFSymbolType(int Type) {
+  llvm_unreachable("this directive only supported on COFF targets");
+}
 void MCStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) {}
+void MCStreamer::emitELFSymverDirective(MCSymbol *Alias,
+                                        const MCSymbol *Aliasee) {}
 void MCStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                                        unsigned ByteAlignment) {}
 void MCStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
diff --git a/lib/MC/MCSubtargetInfo.cpp b/lib/MC/MCSubtargetInfo.cpp
index 1b592504b1e434a305ab1156b5f0fe15edd726d1..777b4e3d6b676ee38be2b5fe7133afaf26173ef8 100644
--- a/lib/MC/MCSubtargetInfo.cpp
+++ b/lib/MC/MCSubtargetInfo.cpp
@@ -1,4 +1,4 @@
-//===-- MCSubtargetInfo.cpp - Subtarget Information -----------------------===//
+//===- MCSubtargetInfo.cpp - Subtarget Information ------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,13 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Triple.h"
 #include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/MC/MCSchedule.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/SubtargetFeature.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
+#include <cassert>
+#include <cstring>
 
 using namespace llvm;
 
diff --git a/lib/MC/MCSymbol.cpp b/lib/MC/MCSymbol.cpp
index ad303ef0218c6ad8e106f77af1d76df96373b556..cb262542b89f88f33f89bbd1b779659b2e5a2884 100644
--- a/lib/MC/MCSymbol.cpp
+++ b/lib/MC/MCSymbol.cpp
@@ -7,13 +7,19 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCSymbol.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFragment.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstddef>
+
 using namespace llvm;
 
 // Only the address of this fragment is ever actually used.
diff --git a/lib/MC/MCSymbolELF.cpp b/lib/MC/MCSymbolELF.cpp
index ec7ef447ff89f705c7d37fdfaf983fd5831d5f58..ffa8260d43420c77699b56b3ad447ff68eb4f70f 100644
--- a/lib/MC/MCSymbolELF.cpp
+++ b/lib/MC/MCSymbolELF.cpp
@@ -42,6 +42,8 @@ enum {
 
 void MCSymbolELF::setBinding(unsigned Binding) const {
   setIsBindingSet();
+  if (getType() == ELF::STT_SECTION && Binding != ELF::STB_LOCAL)
+    setType(ELF::STT_NOTYPE);
   unsigned Val;
   switch (Binding) {
   default:
@@ -93,6 +95,8 @@ unsigned MCSymbolELF::getBinding() const {
 
 void MCSymbolELF::setType(unsigned Type) const {
   unsigned Val;
+  if (Type == ELF::STT_SECTION && getBinding() != ELF::STB_LOCAL)
+    return;
   switch (Type) {
   default:
     llvm_unreachable("Unsupported Binding");
diff --git a/lib/MC/MCTargetOptions.cpp b/lib/MC/MCTargetOptions.cpp
index 419210537eea4e001f2eef6a0df1c722e3f13591..5d666b67fddbe858bda50fc29459cbedbac88ae4 100644
--- a/lib/MC/MCTargetOptions.cpp
+++ b/lib/MC/MCTargetOptions.cpp
@@ -1,4 +1,4 @@
-//===- lib/MC/MCTargetOptions.cpp - MC Target Options --------------------===//
+//===- lib/MC/MCTargetOptions.cpp - MC Target Options ---------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,19 +10,16 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCTargetOptions.h"
 
-namespace llvm {
+using namespace llvm;
 
 MCTargetOptions::MCTargetOptions()
     : SanitizeAddress(false), MCRelaxAll(false), MCNoExecStack(false),
       MCFatalWarnings(false), MCNoWarn(false), MCNoDeprecatedWarn(false),
-      MCSaveTempLabels(false),
-      MCUseDwarfDirectory(false), MCIncrementalLinkerCompatible(false),
-      MCPIECopyRelocations(false), ShowMCEncoding(false),
-      ShowMCInst(false), AsmVerbose(false),
-      PreserveAsmComments(true), DwarfVersion(0), ABIName() {}
+      MCSaveTempLabels(false), MCUseDwarfDirectory(false),
+      MCIncrementalLinkerCompatible(false), MCPIECopyRelocations(false),
+      ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false),
+      PreserveAsmComments(true) {}
 
 StringRef MCTargetOptions::getABIName() const {
   return ABIName;
 }
-
-} // end namespace llvm
diff --git a/lib/MC/MCWasmObjectTargetWriter.cpp b/lib/MC/MCWasmObjectTargetWriter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a09a17d7a124f60d7cf48250006227533ac81acc
--- /dev/null
+++ b/lib/MC/MCWasmObjectTargetWriter.cpp
@@ -0,0 +1,27 @@
+//===- MCWasmObjectTargetWriter.cpp - Wasm Target Writer Subclass ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWasmObjectWriter.h"
+
+using namespace llvm;
+
+MCWasmObjectTargetWriter::MCWasmObjectTargetWriter(bool Is64Bit_)
+    : Is64Bit(Is64Bit_) {} // Only records the 64-bit flag.
+
+bool MCWasmObjectTargetWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
+                                                       unsigned Type) const {
+  return false; // Default answer; Sym and Type are not inspected.
+}
+
+void MCWasmObjectTargetWriter::sortRelocs(
+    const MCAssembler &Asm, std::vector<WasmRelocationEntry> &Relocs) {
+} // Default is a no-op: Relocs keeps its existing order.
diff --git a/lib/MC/MCWasmStreamer.cpp b/lib/MC/MCWasmStreamer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..59b62b8d37c30447d1a1bc731dd7b7790aaa8e9f
--- /dev/null
+++ b/lib/MC/MCWasmStreamer.cpp
@@ -0,0 +1,216 @@
+//===- lib/MC/MCWasmStreamer.cpp - Wasm Object Output ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file assembles .s files and emits Wasm .o object files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCWasmStreamer.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCSectionWasm.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+MCWasmStreamer::~MCWasmStreamer() {} // Empty: nothing is released here.
+
+void MCWasmStreamer::mergeFragment(MCDataFragment *DF, MCDataFragment *EF) {
+  flushPendingLabels(DF, DF->getContents().size());
+  // Append EF's fixups to DF, shifting each offset by DF's current size.
+  for (unsigned i = 0, e = EF->getFixups().size(); i != e; ++i) {
+    EF->getFixups()[i].setOffset(EF->getFixups()[i].getOffset() +
+                                 DF->getContents().size());
+    DF->getFixups().push_back(EF->getFixups()[i]); // Now relative to DF.
+  }
+  DF->setHasInstructions(true);
+  DF->getContents().append(EF->getContents().begin(), EF->getContents().end());
+}
+
+void MCWasmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+  // Let the target do whatever target specific stuff it needs to do.
+  getAssembler().getBackend().handleAssemblerFlag(Flag);
+
+  // NOTE(review): no generic flag handling exists; every call ends up here.
+  llvm_unreachable("invalid assembler flag!");
+}
+
+void MCWasmStreamer::ChangeSection(MCSection *Section,
+                                   const MCExpr *Subsection) {
+  MCAssembler &Asm = getAssembler();
+  auto *SectionWasm = static_cast<const MCSectionWasm *>(Section);
+  const MCSymbol *Grp = SectionWasm->getGroup();
+  if (Grp) // Register the section's group symbol, if it has one.
+    Asm.registerSymbol(*Grp);
+  // The actual switch is delegated to the base class.
+  this->MCObjectStreamer::ChangeSection(Section, Subsection);
+}
+
+void MCWasmStreamer::EmitWeakReference(MCSymbol *Alias,
+                                       const MCSymbol *Symbol) {
+  getAssembler().registerSymbol(*Symbol);
+  const MCExpr *Value = MCSymbolRefExpr::create(
+      Symbol, MCSymbolRefExpr::VK_WEAKREF, getContext());
+  Alias->setVariableValue(Value); // Alias becomes a VK_WEAKREF ref to Symbol.
+}
+
+bool MCWasmStreamer::EmitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
+  assert(Attribute != MCSA_IndirectSymbol && "indirect symbols not supported");
+
+  auto *Symbol = cast<MCSymbolWasm>(S);
+
+  // Adding a symbol attribute always introduces the symbol. Note that the
+  // symbol is registered with the assembler here even when the attribute
+  // itself ends up being rejected below.
+  getAssembler().registerSymbol(*Symbol);
+
+  switch (Attribute) {
+  case MCSA_LazyReference:
+  case MCSA_Reference:
+  case MCSA_SymbolResolver:
+  case MCSA_PrivateExtern:
+  case MCSA_WeakDefinition:
+  case MCSA_WeakDefAutoPrivate:
+  case MCSA_Invalid:
+  case MCSA_IndirectSymbol: // Asserted above; rejected when asserts are off.
+    return false; // Attribute not handled by this streamer.
+  case MCSA_Global:
+    Symbol->setExternal(true);
+    break;
+  case MCSA_ELF_TypeFunction:
+    Symbol->setIsFunction(true);
+    break;
+  case MCSA_ELF_TypeObject:
+    Symbol->setIsFunction(false);
+    break;
+  default:
+    // Unrecognized directive.
+    return false;
+  }
+
+  return true; // The attribute was applied to Symbol.
+}
+
+void MCWasmStreamer::EmitCommonSymbol(MCSymbol *S, uint64_t Size,
+                                      unsigned ByteAlignment) {
+  llvm_unreachable("Common symbols are not yet implemented for Wasm");
+} // Not implemented yet: all arguments are ignored.
+
+void MCWasmStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
+  cast<MCSymbolWasm>(Symbol)->setSize(Value); // Stores Value on the symbol.
+}
+
+void MCWasmStreamer::EmitLocalCommonSymbol(MCSymbol *S, uint64_t Size,
+                                           unsigned ByteAlignment) {
+  llvm_unreachable("Local common symbols are not yet implemented for Wasm");
+} // Not implemented yet: all arguments are ignored.
+
+void MCWasmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
+                                   SMLoc Loc) {
+  MCObjectStreamer::EmitValueImpl(Value, Size, Loc); // Plain forward to base.
+}
+
+void MCWasmStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
+                                          unsigned ValueSize,
+                                          unsigned MaxBytesToEmit) {
+  MCObjectStreamer::EmitValueToAlignment(ByteAlignment, Value, ValueSize,
+                                         MaxBytesToEmit); // Plain forward.
+}
+
+void MCWasmStreamer::EmitIdent(StringRef IdentString) {
+  MCSection *Comment = getAssembler().getContext().getWasmSection(
+      ".comment", 0, 0);
+  PushSection(); // Paired with the PopSection() at the end.
+  SwitchSection(Comment);
+  if (!SeenIdent) {
+    EmitIntValue(0, 1); // Emitted once, before the first ident string.
+    SeenIdent = true;
+  }
+  EmitBytes(IdentString);
+  EmitIntValue(0, 1); // Emitted after every ident string.
+  PopSection();
+}
+
+void MCWasmStreamer::EmitInstToFragment(const MCInst &Inst,
+                                        const MCSubtargetInfo &STI) {
+  this->MCObjectStreamer::EmitInstToFragment(Inst, STI);
+} // Pure forwarder to the base class; adds no Wasm-specific behavior.
+
+void MCWasmStreamer::EmitInstToData(const MCInst &Inst,
+                                    const MCSubtargetInfo &STI) {
+  MCAssembler &Assembler = getAssembler();
+  SmallVector<MCFixup, 4> Fixups;
+  SmallString<256> Code;
+  raw_svector_ostream VecOS(Code); // Stream that writes into Code.
+  Assembler.getEmitter().encodeInstruction(Inst, VecOS, Fixups, STI);
+
+  // Append the encoded instruction to the current data fragment (or create a
+  // new such fragment if the current fragment is not a data fragment).
+  MCDataFragment *DF = getOrCreateDataFragment();
+
+  // Add the fixups (offsets shifted by the fragment's current size) and data.
+  for (unsigned i = 0, e = Fixups.size(); i != e; ++i) {
+    Fixups[i].setOffset(Fixups[i].getOffset() + DF->getContents().size());
+    DF->getFixups().push_back(Fixups[i]);
+  }
+  DF->setHasInstructions(true);
+  DF->getContents().append(Code.begin(), Code.end());
+}
+
+void MCWasmStreamer::FinishImpl() {
+  EmitFrames(nullptr);
+  // Then let the base class run its own finish step.
+  this->MCObjectStreamer::FinishImpl();
+}
+
+MCStreamer *llvm::createWasmStreamer(MCContext &Context, MCAsmBackend &MAB,
+                                     raw_pwrite_stream &OS, MCCodeEmitter *CE,
+                                     bool RelaxAll) {
+  MCWasmStreamer *S = new MCWasmStreamer(Context, MAB, OS, CE);
+  if (RelaxAll) // Only ever turns relax-all on; never forces it off.
+    S->getAssembler().setRelaxAll(true);
+  return S; // Allocated with new and returned as a raw pointer.
+}
+
+void MCWasmStreamer::EmitThumbFunc(MCSymbol *Func) {
+  llvm_unreachable("Generic Wasm doesn't support this directive");
+} // Unsupported on Wasm: Func is ignored.
+
+void MCWasmStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
+  llvm_unreachable("Wasm doesn't support this directive");
+} // Unsupported on Wasm: all arguments are ignored.
+
+void MCWasmStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
+                                  uint64_t Size, unsigned ByteAlignment) {
+  llvm_unreachable("Wasm doesn't support this directive");
+} // Unsupported on Wasm: all arguments are ignored.
+
+void MCWasmStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
+                                    uint64_t Size, unsigned ByteAlignment) {
+  llvm_unreachable("Wasm doesn't support this directive");
+} // Unsupported on Wasm: all arguments are ignored.
diff --git a/lib/MC/MachObjectWriter.cpp b/lib/MC/MachObjectWriter.cpp
index c4b35f5db9b4ed99be62d792f3e0fd0e45b7fd15..d9ccf0dd661f1307f2871a4a23ebdbf649e2e704 100644
--- a/lib/MC/MachObjectWriter.cpp
+++ b/lib/MC/MachObjectWriter.cpp
@@ -7,23 +7,36 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCMachObjectWriter.h"
-#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCDirectives.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCFragment.h"
+#include "llvm/MC/MCMachObjectWriter.h"
 #include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCSymbolMachO.h"
 #include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MachO.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <string>
+#include <utility>
 #include <vector>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "mc"
diff --git a/lib/MC/StringTableBuilder.cpp b/lib/MC/StringTableBuilder.cpp
index 1a501bcafc12d90a0509a7c08e9a8a2d3b9d935e..fbd7ba60bc90b20f239faec31f968a711a6a35dd 100644
--- a/lib/MC/StringTableBuilder.cpp
+++ b/lib/MC/StringTableBuilder.cpp
@@ -1,4 +1,4 @@
-//===-- StringTableBuilder.cpp - String table building utility ------------===//
+//===- StringTableBuilder.cpp - String table building utility -------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,18 +7,24 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/StringTableBuilder.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/CachedHashString.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/StringTableBuilder.h"
 #include "llvm/Support/COFF.h"
 #include "llvm/Support/Endian.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
-
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
+#include <utility>
 #include <vector>
 
 using namespace llvm;
 
-StringTableBuilder::~StringTableBuilder() {}
+StringTableBuilder::~StringTableBuilder() = default;
 
 void StringTableBuilder::initSize() {
   // Account for leading bytes in table so that offsets returned from add are
@@ -48,7 +54,7 @@ void StringTableBuilder::write(raw_ostream &OS) const {
   assert(isFinalized());
   SmallString<0> Data;
   Data.resize(getSize());
-  write((uint8_t *)&Data[0]);
+  write((uint8_t *)Data.data());
   OS << Data;
 }
 
diff --git a/lib/MC/SubtargetFeature.cpp b/lib/MC/SubtargetFeature.cpp
index a97cd1db693207f92d0e067dcec653ae42ceca65..51aaa4b0aa2593b20037beda8c34fa37e38cea1f 100644
--- a/lib/MC/SubtargetFeature.cpp
+++ b/lib/MC/SubtargetFeature.cpp
@@ -7,28 +7,31 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file implements the SubtargetFeature interface.
+/// \file Implements the SubtargetFeature interface.
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/SubtargetFeature.h"
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cassert>
-#include <cctype>
-#include <cstdlib>
-using namespace llvm;
+#include <cstddef>
+#include <cstring>
+#include <iterator>
+#include <string>
+#include <vector>
 
-//===----------------------------------------------------------------------===//
-//                          Static Helper Functions
-//===----------------------------------------------------------------------===//
+using namespace llvm;
 
-/// hasFlag - Determine if a feature has a flag; '+' or '-'
-///
+/// Determine if a feature has a flag; '+' or '-'
 static inline bool hasFlag(StringRef Feature) {
   assert(!Feature.empty() && "Empty string");
   // Get first character
@@ -37,14 +40,12 @@ static inline bool hasFlag(StringRef Feature) {
   return Ch == '+' || Ch =='-';
 }
 
-/// StripFlag - Return string stripped of flag.
-///
+/// Return string stripped of flag.
 static inline std::string StripFlag(StringRef Feature) {
   return hasFlag(Feature) ? Feature.substr(1) : Feature;
 }
 
-/// isEnabled - Return true if enable flag; '+'.
-///
+/// Return true if enable flag; '+'.
 static inline bool isEnabled(StringRef Feature) {
   assert(!Feature.empty() && "Empty string");
   // Get first character
@@ -53,15 +54,13 @@ static inline bool isEnabled(StringRef Feature) {
   return Ch == '+';
 }
 
-/// Split - Splits a string of comma separated items in to a vector of strings.
-///
+/// Splits a string of comma separated items in to a vector of strings.
 static void Split(std::vector<std::string> &V, StringRef S) {
   SmallVector<StringRef, 3> Tmp;
   S.split(Tmp, ',', -1, false /* KeepEmpty */);
   V.assign(Tmp.begin(), Tmp.end());
 }
 
-/// Adding features.
 void SubtargetFeatures::AddFeature(StringRef String, bool Enable) {
   // Don't add empty features.
   if (!String.empty())
@@ -81,8 +80,7 @@ static const SubtargetFeatureKV *Find(StringRef S,
   return F;
 }
 
-/// getLongestEntryLength - Return the length of the longest entry in the table.
-///
+/// Return the length of the longest entry in the table.
 static size_t getLongestEntryLength(ArrayRef<SubtargetFeatureKV> Table) {
   size_t MaxLen = 0;
   for (auto &I : Table)
@@ -91,7 +89,6 @@ static size_t getLongestEntryLength(ArrayRef<SubtargetFeatureKV> Table) {
 }
 
 /// Display help for feature choices.
-///
 static void Help(ArrayRef<SubtargetFeatureKV> CPUTable,
                  ArrayRef<SubtargetFeatureKV> FeatTable) {
   // Determine the length of the longest CPU and Feature entries.
@@ -114,58 +111,47 @@ static void Help(ArrayRef<SubtargetFeatureKV> CPUTable,
             "For example, llc -mcpu=mycpu -mattr=+feature1,-feature2\n";
 }
 
-//===----------------------------------------------------------------------===//
-//                    SubtargetFeatures Implementation
-//===----------------------------------------------------------------------===//
-
 SubtargetFeatures::SubtargetFeatures(StringRef Initial) {
   // Break up string into separate features
   Split(Features, Initial);
 }
 
-
 std::string SubtargetFeatures::getString() const {
   return join(Features.begin(), Features.end(), ",");
 }
 
-/// SetImpliedBits - For each feature that is (transitively) implied by this
-/// feature, set it.
-///
+/// For each feature that is (transitively) implied by this feature, set it.
 static
-void SetImpliedBits(FeatureBitset &Bits, const SubtargetFeatureKV *FeatureEntry,
+void SetImpliedBits(FeatureBitset &Bits, const SubtargetFeatureKV &FeatureEntry,
                     ArrayRef<SubtargetFeatureKV> FeatureTable) {
-  for (auto &FE : FeatureTable) {
-    if (FeatureEntry->Value == FE.Value) continue;
+  for (const SubtargetFeatureKV &FE : FeatureTable) {
+    if (FeatureEntry.Value == FE.Value) continue;
 
-    if ((FeatureEntry->Implies & FE.Value).any()) {
+    if ((FeatureEntry.Implies & FE.Value).any()) {
       Bits |= FE.Value;
-      SetImpliedBits(Bits, &FE, FeatureTable);
+      SetImpliedBits(Bits, FE, FeatureTable);
     }
   }
 }
 
-/// ClearImpliedBits - For each feature that (transitively) implies this
-/// feature, clear it.
-///
+/// For each feature that (transitively) implies this feature, clear it.
 static
-void ClearImpliedBits(FeatureBitset &Bits, 
-                      const SubtargetFeatureKV *FeatureEntry,
+void ClearImpliedBits(FeatureBitset &Bits,
+                      const SubtargetFeatureKV &FeatureEntry,
                       ArrayRef<SubtargetFeatureKV> FeatureTable) {
-  for (auto &FE : FeatureTable) {
-    if (FeatureEntry->Value == FE.Value) continue;
+  for (const SubtargetFeatureKV &FE : FeatureTable) {
+    if (FeatureEntry.Value == FE.Value) continue;
 
-    if ((FE.Implies & FeatureEntry->Value).any()) {
+    if ((FE.Implies & FeatureEntry.Value).any()) {
       Bits &= ~FE.Value;
-      ClearImpliedBits(Bits, &FE, FeatureTable);
+      ClearImpliedBits(Bits, FE, FeatureTable);
     }
   }
 }
 
-/// ToggleFeature - Toggle a feature and update the feature bits.
 void
 SubtargetFeatures::ToggleFeature(FeatureBitset &Bits, StringRef Feature,
                                  ArrayRef<SubtargetFeatureKV> FeatureTable) {
-
   // Find feature in table.
   const SubtargetFeatureKV *FeatureEntry =
       Find(StripFlag(Feature), FeatureTable);
@@ -174,23 +160,21 @@ SubtargetFeatures::ToggleFeature(FeatureBitset &Bits, StringRef Feature,
     if ((Bits & FeatureEntry->Value) == FeatureEntry->Value) {
       Bits &= ~FeatureEntry->Value;
       // For each feature that implies this, clear it.
-      ClearImpliedBits(Bits, FeatureEntry, FeatureTable);
+      ClearImpliedBits(Bits, *FeatureEntry, FeatureTable);
     } else {
       Bits |=  FeatureEntry->Value;
 
       // For each feature that this implies, set it.
-      SetImpliedBits(Bits, FeatureEntry, FeatureTable);
+      SetImpliedBits(Bits, *FeatureEntry, FeatureTable);
     }
   } else {
-    errs() << "'" << Feature
-           << "' is not a recognized feature for this target"
+    errs() << "'" << Feature << "' is not a recognized feature for this target"
            << " (ignoring feature)\n";
   }
 }
 
 void SubtargetFeatures::ApplyFeatureFlag(FeatureBitset &Bits, StringRef Feature,
                                     ArrayRef<SubtargetFeatureKV> FeatureTable) {
-
   assert(hasFlag(Feature));
 
   // Find feature in table.
@@ -203,37 +187,30 @@ void SubtargetFeatures::ApplyFeatureFlag(FeatureBitset &Bits, StringRef Feature,
       Bits |= FeatureEntry->Value;
 
       // For each feature that this implies, set it.
-      SetImpliedBits(Bits, FeatureEntry, FeatureTable);
+      SetImpliedBits(Bits, *FeatureEntry, FeatureTable);
     } else {
       Bits &= ~FeatureEntry->Value;
 
       // For each feature that implies this, clear it.
-      ClearImpliedBits(Bits, FeatureEntry, FeatureTable);
+      ClearImpliedBits(Bits, *FeatureEntry, FeatureTable);
     }
   } else {
-    errs() << "'" << Feature
-           << "' is not a recognized feature for this target"
+    errs() << "'" << Feature << "' is not a recognized feature for this target"
            << " (ignoring feature)\n";
   }
 }
 
-
-/// getFeatureBits - Get feature bits a CPU.
-///
 FeatureBitset
 SubtargetFeatures::getFeatureBits(StringRef CPU,
                                   ArrayRef<SubtargetFeatureKV> CPUTable,
                                   ArrayRef<SubtargetFeatureKV> FeatureTable) {
-
   if (CPUTable.empty() || FeatureTable.empty())
     return FeatureBitset();
 
-#ifndef NDEBUG
   assert(std::is_sorted(std::begin(CPUTable), std::end(CPUTable)) &&
          "CPU table is not sorted");
   assert(std::is_sorted(std::begin(FeatureTable), std::end(FeatureTable)) &&
          "CPU features table is not sorted");
-#endif
   // Resulting bits
   FeatureBitset Bits;
 
@@ -253,17 +230,16 @@ SubtargetFeatures::getFeatureBits(StringRef CPU,
       // Set the feature implied by this CPU feature, if any.
       for (auto &FE : FeatureTable) {
         if ((CPUEntry->Value & FE.Value).any())
-          SetImpliedBits(Bits, &FE, FeatureTable);
+          SetImpliedBits(Bits, FE, FeatureTable);
       }
     } else {
-      errs() << "'" << CPU
-             << "' is not a recognized processor for this target"
+      errs() << "'" << CPU << "' is not a recognized processor for this target"
              << " (ignoring processor)\n";
     }
   }
 
   // Iterate through each feature
-  for (auto &Feature : Features) {
+  for (const std::string &Feature : Features) {
     // Check for help
     if (Feature == "+help")
       Help(CPUTable, FeatureTable);
@@ -274,8 +250,6 @@ SubtargetFeatures::getFeatureBits(StringRef CPU,
   return Bits;
 }
 
-/// print - Print feature string.
-///
 void SubtargetFeatures::print(raw_ostream &OS) const {
   for (auto &F : Features)
     OS << F << " ";
@@ -283,20 +257,15 @@ void SubtargetFeatures::print(raw_ostream &OS) const {
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-/// dump - Dump feature info.
-///
 LLVM_DUMP_METHOD void SubtargetFeatures::dump() const {
   print(dbgs());
 }
 #endif
 
-/// Adds the default features for the specified target triple.
-///
-/// FIXME: This is an inelegant way of specifying the features of a
-/// subtarget. It would be better if we could encode this information
-/// into the IR. See <rdar://5972456>.
-///
 void SubtargetFeatures::getDefaultSubtargetFeatures(const Triple& Triple) {
+  // FIXME: This is an inelegant way of specifying the features of a
+  // subtarget. It would be better if we could encode this information
+  // into the IR. See <rdar://5972456>.
   if (Triple.getVendor() == Triple::Apple) {
     if (Triple.getArch() == Triple::ppc) {
       // powerpc-apple-*
diff --git a/lib/MC/WasmObjectWriter.cpp b/lib/MC/WasmObjectWriter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..159cc3b4def2a523e8a7afcd4013d912e4944608
--- /dev/null
+++ b/lib/MC/WasmObjectWriter.cpp
@@ -0,0 +1,1149 @@
+//===- lib/MC/WasmObjectWriter.cpp - Wasm File Writer ---------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Wasm object file writer information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionWasm.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWasmObjectWriter.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/StringSaver.h"
+#include "llvm/Support/Wasm.h"
+#include <vector>
+
+using namespace llvm;
+
+#undef DEBUG_TYPE
+#define DEBUG_TYPE "reloc-info"
+
+namespace {
+// For patching purposes, we need to remember where each section starts, both
+// for patching up the section size field, and for patching up references to
+// locations within the section.
+struct SectionBookkeeping {
+  // Where the size of the section is written.
+  uint64_t SizeOffset;
+  // Where the contents of the section starts (after the header).
+  uint64_t ContentsOffset;
+};
+
+class WasmObjectWriter : public MCObjectWriter {
+  /// Helper struct for containing some precomputed information on symbols.
+  struct WasmSymbolData {
+    const MCSymbolWasm *Symbol;
+    StringRef Name;
+
+    // Support lexicographic sorting.
+    bool operator<(const WasmSymbolData &RHS) const { return Name < RHS.Name; }
+  };
+
+  /// The target specific Wasm writer instance.
+  std::unique_ptr<MCWasmObjectTargetWriter> TargetObjectWriter;
+
+  // Relocations for fixing up references in the code section.
+  std::vector<WasmRelocationEntry> CodeRelocations;
+  // Relocations for fixing up references in the data section.
+  std::vector<WasmRelocationEntry> DataRelocations;
+  // Fixups for call_indirect type indices.
+  std::vector<WasmRelocationEntry> TypeIndexFixups;
+  // Index values to use for fixing up call_indirect type indices.
+  std::vector<uint32_t> TypeIndexFixupTypes;
+
+  // TargetObjectWriter wrappers.
+  bool is64Bit() const { return TargetObjectWriter->is64Bit(); }
+  unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                        const MCFixup &Fixup, bool IsPCRel) const {
+    return TargetObjectWriter->getRelocType(Ctx, Target, Fixup, IsPCRel);
+  }
+
+  void startSection(SectionBookkeeping &Section, unsigned SectionId,
+                    const char *Name = nullptr);
+  void endSection(SectionBookkeeping &Section);
+
+public:
+  WasmObjectWriter(MCWasmObjectTargetWriter *MOTW, raw_pwrite_stream &OS)
+      : MCObjectWriter(OS, /*IsLittleEndian=*/true), TargetObjectWriter(MOTW) {}
+
+private:
+  // Drop recorded relocations and fixups so a reused writer starts clean.
+  void reset() override {
+    CodeRelocations.clear();
+    DataRelocations.clear();
+    TypeIndexFixups.clear();
+    TypeIndexFixupTypes.clear();
+    MCObjectWriter::reset();
+  }
+
+  ~WasmObjectWriter() override;
+  void writeHeader(const MCAssembler &Asm);
+  void writeValueType(wasm::ValType Ty) {
+    encodeSLEB128(int32_t(Ty), getStream());
+  }
+
+  void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
+                        const MCFragment *Fragment, const MCFixup &Fixup,
+                        MCValue Target, bool &IsPCRel,
+                        uint64_t &FixedValue) override;
+
+  void executePostLayoutBinding(MCAssembler &Asm,
+                                const MCAsmLayout &Layout) override;
+
+  void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
+};
+} // end anonymous namespace
+
+WasmObjectWriter::~WasmObjectWriter() {}
+
+// Return the padding size to write a 32-bit value into a 5-byte ULEB128.
+static unsigned PaddingFor5ByteULEB128(uint32_t X) {
+  return X == 0 ? 4 : (4u - (31u - countLeadingZeros(X)) / 7u);
+}
+
+// Return the padding size to write a 32-bit value into a 5-byte SLEB128.
+static unsigned PaddingFor5ByteSLEB128(int32_t X) {
+  return 5 - getSLEB128Size(X);
+}
+
+// Write out a section header and a patchable section size field.
+void WasmObjectWriter::startSection(SectionBookkeeping &Section,
+                                    unsigned SectionId,
+                                    const char *Name) {
+  assert((Name != nullptr) == (SectionId == wasm::WASM_SEC_CUSTOM) &&
+         "Only custom sections can have names");
+
+  encodeULEB128(SectionId, getStream());
+
+  Section.SizeOffset = getStream().tell();
+
+  // The section size. We don't know the size yet, so reserve enough space
+  // for any 32-bit value; we'll patch it later.
+  encodeULEB128(UINT32_MAX, getStream());
+
+  // The position where the section starts, for measuring its size.
+  Section.ContentsOffset = getStream().tell();
+
+  // Custom sections in wasm also have a string identifier.
+  if (SectionId == wasm::WASM_SEC_CUSTOM) {
+    encodeULEB128(strlen(Name), getStream());
+    writeBytes(Name);
+  }
+}
+
+// Now that the section is complete and we know how big it is, patch up the
+// section size field at the start of the section.
+void WasmObjectWriter::endSection(SectionBookkeeping &Section) {
+  uint64_t Size = getStream().tell() - Section.ContentsOffset;
+  if (uint32_t(Size) != Size)
+    report_fatal_error("section size does not fit in a uint32_t");
+
+  unsigned Padding = PaddingFor5ByteULEB128(Size);
+
+  // Write the final section size to the payload_len field, which follows
+  // the section id byte.
+  uint8_t Buffer[16];
+  unsigned SizeLen = encodeULEB128(Size, Buffer, Padding);
+  assert(SizeLen == 5);
+  getStream().pwrite((char *)Buffer, SizeLen, Section.SizeOffset);
+}
+
+// Emit the Wasm header.
+void WasmObjectWriter::writeHeader(const MCAssembler &Asm) {
+  writeBytes(StringRef(wasm::WasmMagic, sizeof(wasm::WasmMagic)));
+  writeLE32(wasm::WasmVersion);
+}
+
+void WasmObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
+                                                const MCAsmLayout &Layout) {
+}
+
+void WasmObjectWriter::recordRelocation(MCAssembler &Asm,
+                                        const MCAsmLayout &Layout,
+                                        const MCFragment *Fragment,
+                                        const MCFixup &Fixup, MCValue Target,
+                                        bool &IsPCRel, uint64_t &FixedValue) {
+  MCSectionWasm &FixupSection = cast<MCSectionWasm>(*Fragment->getParent());
+  uint64_t C = Target.getConstant();
+  uint64_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+  MCContext &Ctx = Asm.getContext();
+
+  if (const MCSymbolRefExpr *RefB = Target.getSymB()) {
+    assert(RefB->getKind() == MCSymbolRefExpr::VK_None &&
+           "Should not have constructed this");
+
+    // Let A, B and C be the components of Target and R be the location of
+    // the fixup. If the fixup is not pcrel, we want to compute (A - B + C).
+    // If it is pcrel, we want to compute (A - B + C - R).
+
+    // In general, Wasm has no relocations for -B. It can only represent (A + C)
+    // or (A + C - R). If B = R + K and the relocation is not pcrel, we can
+    // replace B to implement it: (A - R - K + C)
+    if (IsPCRel) {
+      Ctx.reportError(
+          Fixup.getLoc(),
+          "No relocation available to represent this relative expression");
+      return;
+    }
+
+    const auto &SymB = cast<MCSymbolWasm>(RefB->getSymbol());
+
+    if (SymB.isUndefined()) {
+      Ctx.reportError(Fixup.getLoc(),
+                      Twine("symbol '") + SymB.getName() +
+                          "' can not be undefined in a subtraction expression");
+      return;
+    }
+
+    assert(!SymB.isAbsolute() && "Should have been folded");
+    const MCSection &SecB = SymB.getSection();
+    if (&SecB != &FixupSection) {
+      Ctx.reportError(Fixup.getLoc(),
+                      "Cannot represent a difference across sections");
+      return;
+    }
+
+    uint64_t SymBOffset = Layout.getSymbolOffset(SymB);
+    uint64_t K = SymBOffset - FixupOffset;
+    IsPCRel = true;
+    C -= K;
+  }
+
+  // We either rejected the fixup or folded B into C at this point.
+  const MCSymbolRefExpr *RefA = Target.getSymA();
+  const auto *SymA = RefA ? cast<MCSymbolWasm>(&RefA->getSymbol()) : nullptr;
+
+  bool ViaWeakRef = false;
+  if (SymA && SymA->isVariable()) {
+    const MCExpr *Expr = SymA->getVariableValue();
+    if (const auto *Inner = dyn_cast<MCSymbolRefExpr>(Expr)) {
+      if (Inner->getKind() == MCSymbolRefExpr::VK_WEAKREF) {
+        SymA = cast<MCSymbolWasm>(&Inner->getSymbol());
+        ViaWeakRef = true;
+      }
+    }
+  }
+
+  // Put any constant offset in an addend. Offsets can be negative, and
+  // LLVM expects wrapping, in contrast to wasm's immediates which can't
+  // be negative and don't wrap.
+  FixedValue = 0;
+
+  if (SymA) {
+    if (ViaWeakRef)
+      llvm_unreachable("weakref used in reloc not yet implemented");
+    else
+      SymA->setUsedInReloc();
+  }
+
+  if (RefA) {
+    if (RefA->getKind() == MCSymbolRefExpr::VK_WebAssembly_TYPEINDEX) {
+      assert(C == 0);
+      WasmRelocationEntry Rec(FixupOffset, SymA, C,
+                              wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB,
+                              &FixupSection);
+      TypeIndexFixups.push_back(Rec);
+      return;
+    }
+  }
+
+  unsigned Type = getRelocType(Ctx, Target, Fixup, IsPCRel);
+
+  WasmRelocationEntry Rec(FixupOffset, SymA, C, Type, &FixupSection);
+
+  if (FixupSection.hasInstructions())
+    CodeRelocations.push_back(Rec);
+  else
+    DataRelocations.push_back(Rec);
+}
+
+namespace {
+
+// The signature of a wasm function, in a struct capable of being used as a
+// DenseMap key.
+struct WasmFunctionType {
+  // Support empty and tombstone instances, needed by DenseMap.
+  enum { Plain, Empty, Tombstone } State;
+
+  // The return types of the function.
+  SmallVector<wasm::ValType, 1> Returns;
+
+  // The parameter types of the function.
+  SmallVector<wasm::ValType, 4> Params;
+
+  WasmFunctionType() : State(Plain) {}
+
+  bool operator==(const WasmFunctionType &Other) const {
+    return State == Other.State && Returns == Other.Returns &&
+           Params == Other.Params;
+  }
+};
+
+// Traits for using WasmFunctionType in a DenseMap.
+struct WasmFunctionTypeDenseMapInfo {
+  static WasmFunctionType getEmptyKey() {
+    WasmFunctionType FuncTy;
+    FuncTy.State = WasmFunctionType::Empty;
+    return FuncTy;
+  }
+  static WasmFunctionType getTombstoneKey() {
+    WasmFunctionType FuncTy;
+    FuncTy.State = WasmFunctionType::Tombstone;
+    return FuncTy;
+  }
+  static unsigned getHashValue(const WasmFunctionType &FuncTy) {
+    uintptr_t Value = FuncTy.State;
+    for (wasm::ValType Ret : FuncTy.Returns)
+      Value += DenseMapInfo<int32_t>::getHashValue(int32_t(Ret));
+    for (wasm::ValType Param : FuncTy.Params)
+      Value += DenseMapInfo<int32_t>::getHashValue(int32_t(Param));
+    return Value;
+  }
+  static bool isEqual(const WasmFunctionType &LHS,
+                      const WasmFunctionType &RHS) {
+    return LHS == RHS;
+  }
+};
+
+// A wasm import to be written into the import section.
+struct WasmImport {
+  StringRef ModuleName;
+  StringRef FieldName;
+  unsigned Kind;
+  int32_t Type;
+};
+
+// A wasm function to be written into the function section.
+struct WasmFunction {
+  int32_t Type;
+  const MCSymbolWasm *Sym;
+};
+
+// A wasm export to be written into the export section.
+struct WasmExport {
+  StringRef FieldName;
+  unsigned Kind;
+  uint32_t Index;
+};
+
+// A wasm global to be written into the global section.
+struct WasmGlobal {
+  wasm::ValType Type;
+  bool IsMutable;
+  bool HasImport;
+  uint64_t InitialValue;
+  uint32_t ImportIndex;
+};
+
+} // end anonymous namespace
+
+// Write X as an (unsigned) LEB value at offset Offset in Stream, padded
+// to allow patching.
+static void
+WritePatchableLEB(raw_pwrite_stream &Stream, uint32_t X, uint64_t Offset) {
+  uint8_t Buffer[5];
+  unsigned Padding = PaddingFor5ByteULEB128(X);
+  unsigned SizeLen = encodeULEB128(X, Buffer, Padding);
+  assert(SizeLen == 5);
+  Stream.pwrite((char *)Buffer, SizeLen, Offset);
+}
+
+// Write X as a signed LEB value at offset Offset in Stream, padded
+// to allow patching.
+static void
+WritePatchableSLEB(raw_pwrite_stream &Stream, int32_t X, uint64_t Offset) {
+  uint8_t Buffer[5];
+  unsigned Padding = PaddingFor5ByteSLEB128(X);
+  unsigned SizeLen = encodeSLEB128(X, Buffer, Padding);
+  assert(SizeLen == 5);
+  Stream.pwrite((char *)Buffer, SizeLen, Offset);
+}
+
+// Write X as a plain integer value at offset Offset in Stream.
+static void WriteI32(raw_pwrite_stream &Stream, uint32_t X, uint64_t Offset) {
+  uint8_t Buffer[4];
+  support::endian::write32le(Buffer, X);
+  Stream.pwrite((char *)Buffer, sizeof(Buffer), Offset);
+}
+
+// Compute a value to write into the code at the location covered
+// by RelEntry. This value isn't used by the static linker, since
+// we have addends; it just serves to make the code more readable
+// and to make standalone wasm modules directly usable.
+static uint32_t ProvisionalValue(const WasmRelocationEntry &RelEntry) {
+  const MCSymbolWasm *Sym = RelEntry.Symbol;
+
+  // For undefined symbols, use a hopefully invalid value.
+  if (!Sym->isDefined(false))
+    return UINT32_MAX;
+
+  MCSectionWasm &Section =
+    cast<MCSectionWasm>(RelEntry.Symbol->getSection(false));
+  uint64_t Address = Section.getSectionOffset() + RelEntry.Addend;
+
+  // Ignore overflow. LLVM allows address arithmetic to silently wrap.
+  uint32_t Value = Address;
+
+  return Value;
+}
+
+// Apply the portions of the relocation records that we can handle ourselves
+// directly.
+static void ApplyRelocations(
+    ArrayRef<WasmRelocationEntry> Relocations,
+    raw_pwrite_stream &Stream,
+    DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices,
+    uint64_t ContentsOffset)
+{
+  for (const WasmRelocationEntry &RelEntry : Relocations) {
+    uint64_t Offset = ContentsOffset +
+                      RelEntry.FixupSection->getSectionOffset() +
+                      RelEntry.Offset;
+    switch (RelEntry.Type) {
+    case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB: {
+      uint32_t Index = SymbolIndices[RelEntry.Symbol];
+      assert(RelEntry.Addend == 0);
+
+      WritePatchableLEB(Stream, Index, Offset);
+      break;
+    }
+    case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB: {
+      uint32_t Index = SymbolIndices[RelEntry.Symbol];
+      assert(RelEntry.Addend == 0);
+
+      WritePatchableSLEB(Stream, Index, Offset);
+      break;
+    }
+    case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB: {
+      uint32_t Value = ProvisionalValue(RelEntry);
+
+      WritePatchableSLEB(Stream, Value, Offset);
+      break;
+    }
+    case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_LEB: {
+      uint32_t Value = ProvisionalValue(RelEntry);
+
+      WritePatchableLEB(Stream, Value, Offset);
+      break;
+    }
+    case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32: {
+      uint32_t Index = SymbolIndices[RelEntry.Symbol];
+      assert(RelEntry.Addend == 0);
+
+      WriteI32(Stream, Index, Offset);
+      break;
+    }
+    case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32: {
+      uint32_t Value = ProvisionalValue(RelEntry);
+
+      WriteI32(Stream, Value, Offset);
+      break;
+    }
+    default:
+      break;
+    }
+  }
+}
+
+// Write out the portions of the relocation records that the linker will
+// need to handle: type, offset (including the fixup section's offset),
+// symbol index and, for the address relocation types, a signed addend.
+static void WriteRelocations(
+    ArrayRef<WasmRelocationEntry> Relocations, raw_pwrite_stream &Stream,
+    DenseMap<const MCSymbolWasm *, uint32_t> &SymbolIndices) {
+  // Bind by reference: copying every entry per iteration is needless work.
+  for (const WasmRelocationEntry &RelEntry : Relocations) {
+    encodeULEB128(RelEntry.Type, Stream);
+
+    uint64_t Offset = RelEntry.Offset +
+                      RelEntry.FixupSection->getSectionOffset();
+    uint32_t Index = SymbolIndices[RelEntry.Symbol];
+    int64_t Addend = RelEntry.Addend;
+
+    switch (RelEntry.Type) {
+    case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB:
+    case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
+    case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:
+      encodeULEB128(Offset, Stream);
+      encodeULEB128(Index, Stream);
+      assert(Addend == 0 && "addends not supported for functions");
+      break;
+    case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_LEB:
+    case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB:
+    case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32:
+      encodeULEB128(Offset, Stream);
+      encodeULEB128(Index, Stream);
+      encodeSLEB128(Addend, Stream);
+      break;
+    default:
+      llvm_unreachable("unsupported relocation type");
+    }
+  }
+}
+
+// Write out the type relocation records that the linker will
+// need to handle.
+static void WriteTypeRelocations(
+    ArrayRef<WasmRelocationEntry> TypeIndexFixups,
+    ArrayRef<uint32_t> TypeIndexFixupTypes,
+    raw_pwrite_stream &Stream)
+{
+  for (size_t i = 0, e = TypeIndexFixups.size(); i < e; ++i) {
+    const WasmRelocationEntry &Fixup = TypeIndexFixups[i];
+    uint32_t Type = TypeIndexFixupTypes[i];
+
+    assert(Fixup.Type == wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB);
+    assert(Fixup.Addend == 0);
+
+    uint64_t Offset = Fixup.Offset +
+                      Fixup.FixupSection->getSectionOffset();
+
+    encodeULEB128(Fixup.Type, Stream);
+    encodeULEB128(Offset, Stream);
+    encodeULEB128(Type, Stream);
+  }
+}
+
+void WasmObjectWriter::writeObject(MCAssembler &Asm,
+                                   const MCAsmLayout &Layout) {
+  MCContext &Ctx = Asm.getContext();
+  wasm::ValType PtrType = is64Bit() ? wasm::ValType::I64 : wasm::ValType::I32;
+
+  // Collect information from the available symbols.
+  DenseMap<WasmFunctionType, int32_t, WasmFunctionTypeDenseMapInfo>
+      FunctionTypeIndices;
+  SmallVector<WasmFunctionType, 4> FunctionTypes;
+  SmallVector<WasmFunction, 4> Functions;
+  SmallVector<uint32_t, 4> TableElems;
+  SmallVector<WasmGlobal, 4> Globals;
+  SmallVector<WasmImport, 4> Imports;
+  SmallVector<WasmExport, 4> Exports;
+  DenseMap<const MCSymbolWasm *, uint32_t> SymbolIndices;
+  SmallPtrSet<const MCSymbolWasm *, 4> IsAddressTaken;
+  unsigned NumFuncImports = 0;
+  unsigned NumGlobalImports = 0;
+  SmallVector<char, 0> DataBytes;
+  uint32_t StackPointerGlobal = 0;
+  bool HasStackPointer = false;
+
+  // Populate the IsAddressTaken set.
+  for (WasmRelocationEntry RelEntry : CodeRelocations) {
+    switch (RelEntry.Type) {
+    case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
+    case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB:
+      IsAddressTaken.insert(RelEntry.Symbol);
+      break;
+    default:
+      break;
+    }
+  }
+  for (WasmRelocationEntry RelEntry : DataRelocations) {
+    switch (RelEntry.Type) {
+    case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:
+    case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32:
+      IsAddressTaken.insert(RelEntry.Symbol);
+      break;
+    default:
+      break;
+    }
+  }
+
+  // Populate the Imports set.
+  for (const MCSymbol &S : Asm.symbols()) {
+    const auto &WS = static_cast<const MCSymbolWasm &>(S);
+    int32_t Type;
+
+    if (WS.isFunction()) {
+      // Prepare the function's type, if we haven't seen it yet.
+      WasmFunctionType F;
+      F.Returns = WS.getReturns();
+      F.Params = WS.getParams();
+      auto Pair =
+          FunctionTypeIndices.insert(std::make_pair(F, FunctionTypes.size()));
+      if (Pair.second)
+        FunctionTypes.push_back(F);
+
+      Type = Pair.first->second;
+    } else {
+      Type = int32_t(PtrType);
+    }
+
+    // If the symbol is not defined in this translation unit, import it.
+    if (!WS.isTemporary() && !WS.isDefined(/*SetUsed=*/false)) {
+      WasmImport Import;
+      Import.ModuleName = WS.getModuleName();
+      Import.FieldName = WS.getName();
+
+      if (WS.isFunction()) {
+        Import.Kind = wasm::WASM_EXTERNAL_FUNCTION;
+        Import.Type = Type;
+        SymbolIndices[&WS] = NumFuncImports;
+        ++NumFuncImports;
+      } else {
+        Import.Kind = wasm::WASM_EXTERNAL_GLOBAL;
+        Import.Type = Type;
+        SymbolIndices[&WS] = NumGlobalImports;
+        ++NumGlobalImports;
+      }
+
+      Imports.push_back(Import);
+    }
+  }
+
+  // In the special .global_variables section, we've encoded global
+  // variables used by the function. Translate them into the Globals
+  // list.
+  MCSectionWasm *GlobalVars = Ctx.getWasmSection(".global_variables", 0, 0);
+  if (!GlobalVars->getFragmentList().empty()) {
+    if (GlobalVars->getFragmentList().size() != 1)
+      report_fatal_error("only one .global_variables fragment supported");
+    const MCFragment &Frag = *GlobalVars->begin();
+    if (Frag.hasInstructions() || Frag.getKind() != MCFragment::FT_Data)
+      report_fatal_error("only data supported in .global_variables");
+    const MCDataFragment &DataFrag = cast<MCDataFragment>(Frag);
+    if (!DataFrag.getFixups().empty())
+      report_fatal_error("fixups not supported in .global_variables");
+    const SmallVectorImpl<char> &Contents = DataFrag.getContents();
+    for (const uint8_t *p = (const uint8_t *)Contents.data(),
+                     *end = (const uint8_t *)Contents.data() + Contents.size();
+         p != end; ) {
+      WasmGlobal G;
+      if (end - p < 3)
+        report_fatal_error("truncated global variable encoding");
+      G.Type = wasm::ValType(int8_t(*p++));
+      G.IsMutable = bool(*p++);
+      G.HasImport = bool(*p++);
+      if (G.HasImport) {
+        G.InitialValue = 0;
+
+        WasmImport Import;
+        // Bound each name by its nul; a bare-pointer StringRef would strlen().
+        const uint8_t *nul = (const uint8_t *)memchr(p, '\0', end - p);
+        if (!nul)
+          report_fatal_error("global module name must be nul-terminated");
+        Import.ModuleName = StringRef((const char *)p, nul - p);
+        p = nul + 1;
+        nul = (const uint8_t *)memchr(p, '\0', end - p);
+        if (!nul)
+          report_fatal_error("global base name must be nul-terminated");
+        Import.FieldName = StringRef((const char *)p, nul - p);
+        p = nul + 1;
+        Import.Kind = wasm::WASM_EXTERNAL_GLOBAL;
+        Import.Type = int32_t(G.Type);
+
+        G.ImportIndex = NumGlobalImports;
+        ++NumGlobalImports;
+
+        Imports.push_back(Import);
+      } else {
+        unsigned n;
+        G.InitialValue = decodeSLEB128(p, &n);
+        G.ImportIndex = 0;
+        if ((ptrdiff_t)n > end - p)
+          report_fatal_error("global initial value must be valid SLEB128");
+        p += n;
+      }
+      Globals.push_back(G);
+    }
+  }
+
+  // .stack_pointer carries the stack pointer index; read it as 32-bit LE.
+  MCSectionWasm *StackPtr = Ctx.getWasmSection(".stack_pointer", 0, 0);
+  if (!StackPtr->getFragmentList().empty()) {
+    if (StackPtr->getFragmentList().size() != 1)
+      report_fatal_error("only one .stack_pointer fragment supported");
+    const MCFragment &Frag = *StackPtr->begin();
+    if (Frag.hasInstructions() || Frag.getKind() != MCFragment::FT_Data)
+      report_fatal_error("only data supported in .stack_pointer");
+    const MCDataFragment &DataFrag = cast<MCDataFragment>(Frag);
+    if (!DataFrag.getFixups().empty())
+      report_fatal_error("fixups not supported in .stack_pointer");
+    const SmallVectorImpl<char> &Contents = DataFrag.getContents();
+    if (Contents.size() != 4)
+      report_fatal_error("only one entry supported in .stack_pointer");
+    HasStackPointer = true;
+    StackPointerGlobal =
+        NumGlobalImports + support::endian::read32le(Contents.data());
+  }
+
+  // Handle defined symbols.
+  for (const MCSymbol &S : Asm.symbols()) {
+    // Ignore unnamed temporary symbols, which aren't ever exported, imported,
+    // or used in relocations.
+    if (S.isTemporary() && S.getName().empty())
+      continue;
+    const auto &WS = static_cast<const MCSymbolWasm &>(S);
+    unsigned Index;
+    if (WS.isFunction()) {
+      // Prepare the function's type, if we haven't seen it yet.
+      WasmFunctionType F;
+      F.Returns = WS.getReturns();
+      F.Params = WS.getParams();
+      auto Pair =
+          FunctionTypeIndices.insert(std::make_pair(F, FunctionTypes.size()));
+      if (Pair.second)
+        FunctionTypes.push_back(F);
+
+      int32_t Type = Pair.first->second;
+
+      if (WS.isDefined(/*SetUsed=*/false)) {
+        // A definition. Take the next available index.
+        Index = NumFuncImports + Functions.size();
+
+        // Prepare the function.
+        WasmFunction Func;
+        Func.Type = Type;
+        Func.Sym = &WS;
+        SymbolIndices[&WS] = Index;
+        Functions.push_back(Func);
+      } else {
+        // An import; the index was assigned above.
+        Index = SymbolIndices.find(&WS)->second;
+      }
+
+      // If needed, prepare the function to be called indirectly.
+      if (IsAddressTaken.count(&WS))
+        TableElems.push_back(Index);
+    } else {
+      // For now, ignore temporary non-function symbols.
+      if (S.isTemporary())
+        continue;
+
+      if (WS.getOffset() != 0)
+        report_fatal_error("data sections must contain one variable each");
+      if (!WS.getSize())
+        report_fatal_error("data symbols must have a size set with .size");
+
+      int64_t Size = 0;
+      if (!WS.getSize()->evaluateAsAbsolute(Size, Layout))
+        report_fatal_error(".size expression must be evaluatable");
+
+      if (WS.isDefined(false)) {
+        MCSectionWasm &DataSection =
+            static_cast<MCSectionWasm &>(WS.getSection());
+
+        if (uint64_t(Size) != Layout.getSectionFileSize(&DataSection))
+          report_fatal_error("data sections must contain at most one variable");
+
+        DataBytes.resize(alignTo(DataBytes.size(), DataSection.getAlignment()));
+
+        DataSection.setSectionOffset(DataBytes.size());
+
+        for (const MCFragment &Frag : DataSection) {
+          if (Frag.hasInstructions())
+            report_fatal_error("only data supported in data sections");
+
+          if (const MCAlignFragment *Align = dyn_cast<MCAlignFragment>(&Frag)) {
+            if (Align->getValueSize() != 1)
+              report_fatal_error("only byte values supported for alignment");
+            // If nops are requested, use zeros, as this is the data section.
+            uint8_t Value = Align->hasEmitNops() ? 0 : Align->getValue();
+            uint64_t Size = std::min<uint64_t>(alignTo(DataBytes.size(),
+                                                       Align->getAlignment()),
+                                               DataBytes.size() +
+                                                   Align->getMaxBytesToEmit());
+            DataBytes.resize(Size, Value);
+          } else if (const MCFillFragment *Fill =
+                                              dyn_cast<MCFillFragment>(&Frag)) {
+            // Emit the fill's own length. The only `Size` visible in this
+            // branch is the enclosing symbol size, which is the wrong count.
+            DataBytes.insert(DataBytes.end(), Fill->getSize(),
+                             Fill->getValue());
+          } else {
+            const MCDataFragment &DataFrag = cast<MCDataFragment>(Frag);
+            const SmallVectorImpl<char> &Contents = DataFrag.getContents();
+            DataBytes.insert(DataBytes.end(), Contents.begin(), Contents.end());
+          }
+        }
+
+        // For each external global, prepare a corresponding wasm global
+        // holding its address.
+        if (WS.isExternal()) {
+          Index = NumGlobalImports + Globals.size();
+
+          WasmGlobal Global;
+          Global.Type = PtrType;
+          Global.IsMutable = false;
+          Global.HasImport = false;
+          Global.InitialValue = DataSection.getSectionOffset();
+          Global.ImportIndex = 0;
+          SymbolIndices[&WS] = Index;
+          Globals.push_back(Global);
+        }
+      }
+    }
+
+    // If the symbol is visible outside this translation unit, export it.
+    if (WS.isExternal()) {
+      assert(WS.isDefined(false));
+      WasmExport Export;
+      Export.FieldName = WS.getName();
+      Export.Index = Index;
+
+      if (WS.isFunction())
+        Export.Kind = wasm::WASM_EXTERNAL_FUNCTION;
+      else
+        Export.Kind = wasm::WASM_EXTERNAL_GLOBAL;
+
+      Exports.push_back(Export);
+    }
+  }
+
+  // Add types for indirect function calls.
+  for (const WasmRelocationEntry &Fixup : TypeIndexFixups) {
+    assert(Fixup.Addend == 0);
+    assert(Fixup.Type == wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB);
+
+    WasmFunctionType F;
+    F.Returns = Fixup.Symbol->getReturns();
+    F.Params = Fixup.Symbol->getParams();
+    auto Pair =
+        FunctionTypeIndices.insert(std::make_pair(F, FunctionTypes.size()));
+    if (Pair.second)
+      FunctionTypes.push_back(F);
+
+    TypeIndexFixupTypes.push_back(Pair.first->second);
+  }
+
+  // Write out the Wasm header.
+  writeHeader(Asm);
+
+  SectionBookkeeping Section;
+
+  // === Type Section =========================================================
+  if (!FunctionTypes.empty()) {
+    startSection(Section, wasm::WASM_SEC_TYPE);
+
+    encodeULEB128(FunctionTypes.size(), getStream());
+
+    for (WasmFunctionType &FuncTy : FunctionTypes) {
+      encodeSLEB128(wasm::WASM_TYPE_FUNC, getStream());
+      encodeULEB128(FuncTy.Params.size(), getStream());
+      for (wasm::ValType Ty : FuncTy.Params)
+        writeValueType(Ty);
+      encodeULEB128(FuncTy.Returns.size(), getStream());
+      for (wasm::ValType Ty : FuncTy.Returns)
+        writeValueType(Ty);
+    }
+
+    endSection(Section);
+  }
+
+  // === Import Section ========================================================
+  if (!Imports.empty()) {
+    startSection(Section, wasm::WASM_SEC_IMPORT);
+
+    encodeULEB128(Imports.size(), getStream());
+    for (const WasmImport &Import : Imports) {
+      StringRef ModuleName = Import.ModuleName;
+      encodeULEB128(ModuleName.size(), getStream());
+      writeBytes(ModuleName);
+
+      StringRef FieldName = Import.FieldName;
+      encodeULEB128(FieldName.size(), getStream());
+      writeBytes(FieldName);
+
+      encodeULEB128(Import.Kind, getStream());
+
+      switch (Import.Kind) {
+      case wasm::WASM_EXTERNAL_FUNCTION:
+        encodeULEB128(Import.Type, getStream());
+        break;
+      case wasm::WASM_EXTERNAL_GLOBAL:
+        encodeSLEB128(int32_t(Import.Type), getStream());
+        encodeULEB128(0, getStream()); // mutability
+        break;
+      default:
+        llvm_unreachable("unsupported import kind");
+      }
+    }
+
+    endSection(Section);
+  }
+
+  // === Function Section ======================================================
+  if (!Functions.empty()) {
+    startSection(Section, wasm::WASM_SEC_FUNCTION);
+
+    encodeULEB128(Functions.size(), getStream());
+    for (const WasmFunction &Func : Functions)
+      encodeULEB128(Func.Type, getStream());
+
+    endSection(Section);
+  }
+
+  // === Table Section =========================================================
+  // For now, always emit the table section, since indirect calls are not
+  // valid without it. In the future, we could perhaps be more clever and omit
+  // it if there are no indirect calls.
+  startSection(Section, wasm::WASM_SEC_TABLE);
+
+  // The number of tables, fixed to 1 for now.
+  encodeULEB128(1, getStream());
+
+  encodeSLEB128(wasm::WASM_TYPE_ANYFUNC, getStream());
+
+  encodeULEB128(0, getStream());                 // flags
+  encodeULEB128(TableElems.size(), getStream()); // initial
+
+  endSection(Section);
+
+  // === Memory Section ========================================================
+  // For now, always emit the memory section, since loads and stores are not
+  // valid without it. In the future, we could perhaps be more clever and omit
+  // it if there are no loads or stores.
+  startSection(Section, wasm::WASM_SEC_MEMORY);
+
+  encodeULEB128(1, getStream()); // number of memory spaces
+
+  encodeULEB128(0, getStream()); // flags
+  encodeULEB128(DataBytes.size(), getStream()); // initial
+
+  endSection(Section);
+
+  // === Global Section ========================================================
+  if (!Globals.empty()) {
+    startSection(Section, wasm::WASM_SEC_GLOBAL);
+
+    encodeULEB128(Globals.size(), getStream());
+    for (const WasmGlobal &Global : Globals) {
+      writeValueType(Global.Type);
+      write8(Global.IsMutable);
+
+      if (Global.HasImport) {
+        assert(Global.InitialValue == 0);
+        write8(wasm::WASM_OPCODE_GET_GLOBAL);
+        encodeULEB128(Global.ImportIndex, getStream());
+      } else {
+        assert(Global.ImportIndex == 0);
+        write8(wasm::WASM_OPCODE_I32_CONST);
+        encodeSLEB128(Global.InitialValue, getStream()); // offset
+      }
+      write8(wasm::WASM_OPCODE_END);
+    }
+
+    endSection(Section);
+  }
+
+  // === Export Section ========================================================
+  if (!Exports.empty()) {
+    startSection(Section, wasm::WASM_SEC_EXPORT);
+
+    encodeULEB128(Exports.size(), getStream());
+    for (const WasmExport &Export : Exports) {
+      encodeULEB128(Export.FieldName.size(), getStream());
+      writeBytes(Export.FieldName);
+
+      encodeSLEB128(Export.Kind, getStream());
+
+      encodeULEB128(Export.Index, getStream());
+    }
+
+    endSection(Section);
+  }
+
+#if 0 // TODO: Start Section
+  if (HaveStartFunction) {
+    // === Start Section =========================================================
+    startSection(Section, wasm::WASM_SEC_START);
+
+    encodeSLEB128(StartFunction, getStream());
+
+    endSection(Section);
+  }
+#endif
+
+  // === Elem Section ==========================================================
+  if (!TableElems.empty()) {
+    startSection(Section, wasm::WASM_SEC_ELEM);
+
+    encodeULEB128(1, getStream()); // number of "segments"
+    encodeULEB128(0, getStream()); // the table index
+
+    // init expr for starting offset
+    write8(wasm::WASM_OPCODE_I32_CONST);
+    encodeSLEB128(0, getStream());
+    write8(wasm::WASM_OPCODE_END);
+
+    encodeULEB128(TableElems.size(), getStream());
+    for (uint32_t Elem : TableElems)
+      encodeULEB128(Elem, getStream());
+
+    endSection(Section);
+  }
+
+  // === Code Section ==========================================================
+  if (!Functions.empty()) {
+    startSection(Section, wasm::WASM_SEC_CODE);
+
+    encodeULEB128(Functions.size(), getStream());
+
+    for (const WasmFunction &Func : Functions) {
+      MCSectionWasm &FuncSection =
+          static_cast<MCSectionWasm &>(Func.Sym->getSection());
+
+      if (Func.Sym->isVariable())
+        report_fatal_error("weak symbols not supported yet");
+
+      if (Func.Sym->getOffset() != 0)
+        report_fatal_error("function sections must contain one function each");
+
+      if (!Func.Sym->getSize())
+        report_fatal_error("function symbols must have a size set with .size");
+
+      int64_t Size = 0;
+      if (!Func.Sym->getSize()->evaluateAsAbsolute(Size, Layout))
+        report_fatal_error(".size expression must be evaluatable");
+
+      encodeULEB128(Size, getStream());
+
+      FuncSection.setSectionOffset(getStream().tell() -
+                                   Section.ContentsOffset);
+
+      Asm.writeSectionData(&FuncSection, Layout);
+    }
+
+    // Apply the type index fixups for call_indirect etc. instructions.
+    for (size_t i = 0, e = TypeIndexFixups.size(); i < e; ++i) {
+      uint32_t Type = TypeIndexFixupTypes[i];
+      unsigned Padding = PaddingFor5ByteULEB128(Type);
+
+      const WasmRelocationEntry &Fixup = TypeIndexFixups[i];
+      assert(Fixup.Addend == 0);
+      assert(Fixup.Type == wasm::R_WEBASSEMBLY_TYPE_INDEX_LEB);
+      uint64_t Offset = Fixup.Offset +
+                        Fixup.FixupSection->getSectionOffset();
+
+      uint8_t Buffer[16];
+      unsigned SizeLen = encodeULEB128(Type, Buffer, Padding);
+      assert(SizeLen == 5);
+      getStream().pwrite((char *)Buffer, SizeLen,
+                         Section.ContentsOffset + Offset);
+    }
+
+    // Apply fixups.
+    ApplyRelocations(CodeRelocations, getStream(), SymbolIndices,
+                     Section.ContentsOffset);
+
+    endSection(Section);
+  }
+
+  // === Data Section ==========================================================
+  if (!DataBytes.empty()) {
+    startSection(Section, wasm::WASM_SEC_DATA);
+
+    encodeULEB128(1, getStream()); // count
+    encodeULEB128(0, getStream()); // memory index
+    write8(wasm::WASM_OPCODE_I32_CONST);
+    encodeSLEB128(0, getStream()); // offset
+    write8(wasm::WASM_OPCODE_END);
+    encodeULEB128(DataBytes.size(), getStream()); // size
+    writeBytes(DataBytes); // data
+
+    // Apply fixups.
+    ApplyRelocations(DataRelocations, getStream(), SymbolIndices,
+                     Section.ContentsOffset);
+
+    endSection(Section);
+  }
+
+  // === Name Section ==========================================================
+  uint32_t TotalFunctions = NumFuncImports + Functions.size();
+  if (TotalFunctions != 0) {
+    startSection(Section, wasm::WASM_SEC_CUSTOM, "name");
+    SectionBookkeeping SubSection;
+    startSection(SubSection, wasm::WASM_NAMES_FUNCTION);
+
+    encodeULEB128(TotalFunctions, getStream());
+    uint32_t Index = 0;
+    for (const WasmImport &Import : Imports) {
+      if (Import.Kind == wasm::WASM_EXTERNAL_FUNCTION) {
+        encodeULEB128(Index, getStream());
+        encodeULEB128(Import.FieldName.size(), getStream());
+        writeBytes(Import.FieldName);
+        ++Index;
+      }
+    }
+    for (const WasmFunction &Func : Functions) {
+      encodeULEB128(Index, getStream());
+      encodeULEB128(Func.Sym->getName().size(), getStream());
+      writeBytes(Func.Sym->getName());
+      ++Index;
+    }
+
+    endSection(SubSection);
+    endSection(Section);
+  }
+
+  // See: https://github.com/WebAssembly/tool-conventions/blob/master/Linking.md
+  // for descriptions of the reloc sections.
+
+  // === Code Reloc Section ====================================================
+  if (!CodeRelocations.empty()) {
+    startSection(Section, wasm::WASM_SEC_CUSTOM, "reloc.CODE");
+
+    encodeULEB128(wasm::WASM_SEC_CODE, getStream());
+
+    encodeULEB128(CodeRelocations.size(), getStream());
+
+    WriteRelocations(CodeRelocations, getStream(), SymbolIndices);
+    WriteTypeRelocations(TypeIndexFixups, TypeIndexFixupTypes, getStream());
+
+    endSection(Section);
+  }
+
+  // === Data Reloc Section ====================================================
+  if (!DataRelocations.empty()) {
+    startSection(Section, wasm::WASM_SEC_CUSTOM, "reloc.DATA");
+
+    encodeULEB128(wasm::WASM_SEC_DATA, getStream());
+
+    encodeULEB128(DataRelocations.size(), getStream());
+
+    WriteRelocations(DataRelocations, getStream(), SymbolIndices);
+
+    endSection(Section);
+  }
+
+  // === Linking Metadata Section ==============================================
+  if (HasStackPointer) {
+    startSection(Section, wasm::WASM_SEC_CUSTOM, "linking");
+
+    encodeULEB128(1, getStream()); // count
+
+    encodeULEB128(wasm::WASM_STACK_POINTER, getStream()); // type
+    encodeULEB128(StackPointerGlobal, getStream()); // id
+
+    endSection(Section);
+  }
+
+  // TODO: Translate the .comment section to the output.
+
+  // TODO: Translate debug sections to the output.
+}
+
+MCObjectWriter *llvm::createWasmObjectWriter(MCWasmObjectTargetWriter *MOTW,
+                                             raw_pwrite_stream &OS) {
+  return new WasmObjectWriter(MOTW, OS);
+}
diff --git a/lib/MC/WinCOFFObjectWriter.cpp b/lib/MC/WinCOFFObjectWriter.cpp
index afc5c6a14d118e1ef855b0c2c6ae2149328b86a5..da8fe73f823bff7a222a113b4df44ad2818a5126 100644
--- a/lib/MC/WinCOFFObjectWriter.cpp
+++ b/lib/MC/WinCOFFObjectWriter.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/MC/WinCOFFObjectWriter.cpp -------------------------*- C++ -*-===//
+//===- llvm/MC/WinCOFFObjectWriter.cpp ------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,37 +11,49 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/MC/MCWinCOFFObjectWriter.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
-#include "llvm/Config/config.h"
 #include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCSymbolCOFF.h"
 #include "llvm/MC/MCValue.h"
+#include "llvm/MC/MCWinCOFFObjectWriter.h"
 #include "llvm/MC/StringTableBuilder.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/COFF.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/JamCRC.h"
-#include <cstdio>
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <cstring>
 #include <ctime>
+#include <memory>
+#include <string>
+#include <vector>
 
 using namespace llvm;
+using llvm::support::endian::write32le;
 
 #define DEBUG_TYPE "WinCOFFObjectWriter"
 
 namespace {
+
 typedef SmallString<COFF::NameSize> name;
 
 enum AuxiliaryType {
@@ -57,25 +69,24 @@ struct AuxSymbol {
   COFF::Auxiliary Aux;
 };
 
-class COFFSymbol;
 class COFFSection;
 
 class COFFSymbol {
 public:
-  COFF::symbol Data;
+  COFF::symbol Data = {};
 
   typedef SmallVector<AuxSymbol, 1> AuxiliarySymbols;
 
   name Name;
   int Index;
   AuxiliarySymbols Aux;
-  COFFSymbol *Other;
-  COFFSection *Section;
-  int Relocations;
+  COFFSymbol *Other = nullptr;
+  COFFSection *Section = nullptr;
+  int Relocations = 0;
+  const MCSymbol *MC = nullptr;
 
-  const MCSymbol *MC;
+  COFFSymbol(StringRef Name) : Name(Name) {}
 
-  COFFSymbol(StringRef name);
   void set_name_offset(uint32_t Offset);
 
   int64_t getIndex() const { return Index; }
@@ -89,9 +100,10 @@ public:
 // This class contains staging data for a COFF relocation entry.
 struct COFFRelocation {
   COFF::relocation Data;
-  COFFSymbol *Symb;
+  COFFSymbol *Symb = nullptr;
+
+  COFFRelocation() = default;
 
-  COFFRelocation() : Symb(nullptr) {}
   static size_t size() { return COFF::RelocationSize; }
 };
 
@@ -99,15 +111,15 @@ typedef std::vector<COFFRelocation> relocations;
 
 class COFFSection {
 public:
-  COFF::section Header;
+  COFF::section Header = {};
 
   std::string Name;
   int Number;
-  MCSectionCOFF const *MCSection;
-  COFFSymbol *Symbol;
+  MCSectionCOFF const *MCSection = nullptr;
+  COFFSymbol *Symbol = nullptr;
   relocations Relocations;
 
-  COFFSection(StringRef name);
+  COFFSection(StringRef Name) : Name(Name) {}
 };
 
 class WinCOFFObjectWriter : public MCObjectWriter {
@@ -121,7 +133,7 @@ public:
   std::unique_ptr<MCWinCOFFObjectTargetWriter> TargetObjectWriter;
 
   // Root level file contents.
-  COFF::header Header;
+  COFF::header Header = {};
   sections Sections;
   symbols Symbols;
   StringTableBuilder Strings{StringTableBuilder::WinCOFF};
@@ -149,9 +161,6 @@ public:
   COFFSymbol *GetOrCreateCOFFSymbol(const MCSymbol *Symbol);
   COFFSection *createSection(StringRef Name);
 
-  template <typename object_t, typename list_t>
-  object_t *createCOFFEntity(StringRef Name, list_t &List);
-
   void defineSection(MCSectionCOFF const &Sec);
 
   COFFSymbol *getLinkedSymbol(const MCSymbol &Symbol);
@@ -168,8 +177,12 @@ public:
   void WriteFileHeader(const COFF::header &Header);
   void WriteSymbol(const COFFSymbol &S);
   void WriteAuxiliarySymbols(const COFFSymbol::AuxiliarySymbols &S);
-  void writeSectionHeader(const COFF::section &S);
+  void writeSectionHeaders();
   void WriteRelocation(const COFF::relocation &R);
+  uint32_t writeSectionContents(MCAssembler &Asm, const MCAsmLayout &Layout,
+                                const MCSection &MCSec);
+  void writeSection(MCAssembler &Asm, const MCAsmLayout &Layout,
+                    const COFFSection &Sec, const MCSection &MCSec);
 
   // MCObjectWriter interface implementation.
 
@@ -181,45 +194,29 @@ public:
                                               const MCFragment &FB, bool InSet,
                                               bool IsPCRel) const override;
 
-  bool isWeak(const MCSymbol &Sym) const override;
-
   void recordRelocation(MCAssembler &Asm, const MCAsmLayout &Layout,
                         const MCFragment *Fragment, const MCFixup &Fixup,
                         MCValue Target, bool &IsPCRel,
                         uint64_t &FixedValue) override;
 
+  void createFileSymbols(MCAssembler &Asm);
+  void assignSectionNumbers();
+  void assignFileOffsets(MCAssembler &Asm, const MCAsmLayout &Layout);
+
   void writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) override;
 };
-}
 
-static inline void write_uint32_le(void *Data, uint32_t Value) {
-  support::endian::write<uint32_t, support::little, support::unaligned>(Data,
-                                                                        Value);
-}
+} // end anonymous namespace
 
 //------------------------------------------------------------------------------
 // Symbol class implementation
 
-COFFSymbol::COFFSymbol(StringRef name)
-    : Name(name.begin(), name.end()), Other(nullptr), Section(nullptr),
-      Relocations(0), MC(nullptr) {
-  memset(&Data, 0, sizeof(Data));
-}
-
 // In the case that the name does not fit within 8 bytes, the offset
 // into the string table is stored in the last 4 bytes instead, leaving
 // the first 4 bytes as 0.
 void COFFSymbol::set_name_offset(uint32_t Offset) {
-  write_uint32_le(Data.Name + 0, 0);
-  write_uint32_le(Data.Name + 4, Offset);
-}
-
-//------------------------------------------------------------------------------
-// Section class implementation
-
-COFFSection::COFFSection(StringRef name)
-    : Name(name), MCSection(nullptr), Symbol(nullptr) {
-  memset(&Header, 0, sizeof(Header));
+  write32le(Data.Name + 0, 0);
+  write32le(Data.Name + 4, Offset);
 }
 
 //------------------------------------------------------------------------------
@@ -228,115 +225,92 @@ COFFSection::COFFSection(StringRef name)
 WinCOFFObjectWriter::WinCOFFObjectWriter(MCWinCOFFObjectTargetWriter *MOTW,
                                          raw_pwrite_stream &OS)
     : MCObjectWriter(OS, true), TargetObjectWriter(MOTW) {
-  memset(&Header, 0, sizeof(Header));
-
   Header.Machine = TargetObjectWriter->getMachine();
 }
 
 COFFSymbol *WinCOFFObjectWriter::createSymbol(StringRef Name) {
-  return createCOFFEntity<COFFSymbol>(Name, Symbols);
+  Symbols.push_back(make_unique<COFFSymbol>(Name));
+  return Symbols.back().get();
 }
 
 COFFSymbol *WinCOFFObjectWriter::GetOrCreateCOFFSymbol(const MCSymbol *Symbol) {
-  symbol_map::iterator i = SymbolMap.find(Symbol);
-  if (i != SymbolMap.end())
-    return i->second;
-  COFFSymbol *RetSymbol =
-      createCOFFEntity<COFFSymbol>(Symbol->getName(), Symbols);
-  SymbolMap[Symbol] = RetSymbol;
-  return RetSymbol;
+  COFFSymbol *&Ret = SymbolMap[Symbol];
+  if (!Ret)
+    Ret = createSymbol(Symbol->getName());
+  return Ret;
 }
 
 COFFSection *WinCOFFObjectWriter::createSection(StringRef Name) {
-  return createCOFFEntity<COFFSection>(Name, Sections);
+  Sections.emplace_back(make_unique<COFFSection>(Name));
+  return Sections.back().get();
 }
 
-/// A template used to lookup or create a symbol/section, and initialize it if
-/// needed.
-template <typename object_t, typename list_t>
-object_t *WinCOFFObjectWriter::createCOFFEntity(StringRef Name, list_t &List) {
-  List.push_back(make_unique<object_t>(Name));
-
-  return List.back().get();
-}
-
-/// This function takes a section data object from the assembler
-/// and creates the associated COFF section staging object.
-void WinCOFFObjectWriter::defineSection(MCSectionCOFF const &Sec) {
-  COFFSection *coff_section = createSection(Sec.getSectionName());
-  COFFSymbol *coff_symbol = createSymbol(Sec.getSectionName());
-  if (Sec.getSelection() != COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
-    if (const MCSymbol *S = Sec.getCOMDATSymbol()) {
-      COFFSymbol *COMDATSymbol = GetOrCreateCOFFSymbol(S);
-      if (COMDATSymbol->Section)
-        report_fatal_error("two sections have the same comdat");
-      COMDATSymbol->Section = coff_section;
-    }
-  }
-
-  coff_section->Symbol = coff_symbol;
-  coff_symbol->Section = coff_section;
-  coff_symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_STATIC;
-
-  // In this case the auxiliary symbol is a Section Definition.
-  coff_symbol->Aux.resize(1);
-  memset(&coff_symbol->Aux[0], 0, sizeof(coff_symbol->Aux[0]));
-  coff_symbol->Aux[0].AuxType = ATSectionDefinition;
-  coff_symbol->Aux[0].Aux.SectionDefinition.Selection = Sec.getSelection();
-
-  coff_section->Header.Characteristics = Sec.getCharacteristics();
-
-  uint32_t &Characteristics = coff_section->Header.Characteristics;
+static uint32_t getAlignment(const MCSectionCOFF &Sec) {
   switch (Sec.getAlignment()) {
   case 1:
-    Characteristics |= COFF::IMAGE_SCN_ALIGN_1BYTES;
-    break;
+    return COFF::IMAGE_SCN_ALIGN_1BYTES;
   case 2:
-    Characteristics |= COFF::IMAGE_SCN_ALIGN_2BYTES;
-    break;
+    return COFF::IMAGE_SCN_ALIGN_2BYTES;
   case 4:
-    Characteristics |= COFF::IMAGE_SCN_ALIGN_4BYTES;
-    break;
+    return COFF::IMAGE_SCN_ALIGN_4BYTES;
   case 8:
-    Characteristics |= COFF::IMAGE_SCN_ALIGN_8BYTES;
-    break;
+    return COFF::IMAGE_SCN_ALIGN_8BYTES;
   case 16:
-    Characteristics |= COFF::IMAGE_SCN_ALIGN_16BYTES;
-    break;
+    return COFF::IMAGE_SCN_ALIGN_16BYTES;
   case 32:
-    Characteristics |= COFF::IMAGE_SCN_ALIGN_32BYTES;
-    break;
+    return COFF::IMAGE_SCN_ALIGN_32BYTES;
   case 64:
-    Characteristics |= COFF::IMAGE_SCN_ALIGN_64BYTES;
-    break;
+    return COFF::IMAGE_SCN_ALIGN_64BYTES;
   case 128:
-    Characteristics |= COFF::IMAGE_SCN_ALIGN_128BYTES;
-    break;
+    return COFF::IMAGE_SCN_ALIGN_128BYTES;
   case 256:
-    Characteristics |= COFF::IMAGE_SCN_ALIGN_256BYTES;
-    break;
+    return COFF::IMAGE_SCN_ALIGN_256BYTES;
   case 512:
-    Characteristics |= COFF::IMAGE_SCN_ALIGN_512BYTES;
-    break;
+    return COFF::IMAGE_SCN_ALIGN_512BYTES;
   case 1024:
-    Characteristics |= COFF::IMAGE_SCN_ALIGN_1024BYTES;
-    break;
+    return COFF::IMAGE_SCN_ALIGN_1024BYTES;
   case 2048:
-    Characteristics |= COFF::IMAGE_SCN_ALIGN_2048BYTES;
-    break;
+    return COFF::IMAGE_SCN_ALIGN_2048BYTES;
   case 4096:
-    Characteristics |= COFF::IMAGE_SCN_ALIGN_4096BYTES;
-    break;
+    return COFF::IMAGE_SCN_ALIGN_4096BYTES;
   case 8192:
-    Characteristics |= COFF::IMAGE_SCN_ALIGN_8192BYTES;
-    break;
-  default:
-    llvm_unreachable("unsupported section alignment");
+    return COFF::IMAGE_SCN_ALIGN_8192BYTES;
+  }
+  llvm_unreachable("unsupported section alignment");
+}
+
+/// This function takes a section data object from the assembler
+/// and creates the associated COFF section staging object.
+void WinCOFFObjectWriter::defineSection(const MCSectionCOFF &MCSec) {
+  COFFSection *Section = createSection(MCSec.getSectionName());
+  COFFSymbol *Symbol = createSymbol(MCSec.getSectionName());
+  Section->Symbol = Symbol;
+  Symbol->Section = Section;
+  Symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_STATIC;
+
+  // Create a COMDAT symbol if needed.
+  if (MCSec.getSelection() != COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE) {
+    if (const MCSymbol *S = MCSec.getCOMDATSymbol()) {
+      COFFSymbol *COMDATSymbol = GetOrCreateCOFFSymbol(S);
+      if (COMDATSymbol->Section)
+        report_fatal_error("two sections have the same comdat");
+      COMDATSymbol->Section = Section;
+    }
   }
 
+  // In this case the auxiliary symbol is a Section Definition.
+  Symbol->Aux.resize(1);
+  Symbol->Aux[0] = {};
+  Symbol->Aux[0].AuxType = ATSectionDefinition;
+  Symbol->Aux[0].Aux.SectionDefinition.Selection = MCSec.getSelection();
+
+  // Set section alignment.
+  Section->Header.Characteristics = MCSec.getCharacteristics();
+  Section->Header.Characteristics |= getAlignment(MCSec);
+
   // Bind internal COFF section to MC section.
-  coff_section->MCSection = &Sec;
-  SectionMap[&Sec] = coff_section;
+  Section->MCSection = &MCSec;
+  SectionMap[&MCSec] = Section;
 }
 
 static uint64_t getSymbolValue(const MCSymbol &Symbol,
@@ -368,25 +342,25 @@ COFFSymbol *WinCOFFObjectWriter::getLinkedSymbol(const MCSymbol &Symbol) {
 
 /// This function takes a symbol data object from the assembler
 /// and creates the associated COFF symbol staging object.
-void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &Symbol,
+void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &MCSym,
                                        MCAssembler &Assembler,
                                        const MCAsmLayout &Layout) {
-  COFFSymbol *coff_symbol = GetOrCreateCOFFSymbol(&Symbol);
-  const MCSymbol *Base = Layout.getBaseSymbol(Symbol);
+  COFFSymbol *Sym = GetOrCreateCOFFSymbol(&MCSym);
+  const MCSymbol *Base = Layout.getBaseSymbol(MCSym);
   COFFSection *Sec = nullptr;
   if (Base && Base->getFragment()) {
     Sec = SectionMap[Base->getFragment()->getParent()];
-    if (coff_symbol->Section && coff_symbol->Section != Sec)
+    if (Sym->Section && Sym->Section != Sec)
       report_fatal_error("conflicting sections for symbol");
   }
 
   COFFSymbol *Local = nullptr;
-  if (cast<MCSymbolCOFF>(Symbol).isWeakExternal()) {
-    coff_symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL;
+  if (cast<MCSymbolCOFF>(MCSym).isWeakExternal()) {
+    Sym->Data.StorageClass = COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL;
 
-    COFFSymbol *WeakDefault = getLinkedSymbol(Symbol);
+    COFFSymbol *WeakDefault = getLinkedSymbol(MCSym);
     if (!WeakDefault) {
-      std::string WeakName = (".weak." + Symbol.getName() + ".default").str();
+      std::string WeakName = (".weak." + MCSym.getName() + ".default").str();
       WeakDefault = createSymbol(WeakName);
       if (!Sec)
         WeakDefault->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
@@ -395,41 +369,41 @@ void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &Symbol,
       Local = WeakDefault;
     }
 
-    coff_symbol->Other = WeakDefault;
+    Sym->Other = WeakDefault;
 
     // Setup the Weak External auxiliary symbol.
-    coff_symbol->Aux.resize(1);
-    memset(&coff_symbol->Aux[0], 0, sizeof(coff_symbol->Aux[0]));
-    coff_symbol->Aux[0].AuxType = ATWeakExternal;
-    coff_symbol->Aux[0].Aux.WeakExternal.TagIndex = 0;
-    coff_symbol->Aux[0].Aux.WeakExternal.Characteristics =
+    Sym->Aux.resize(1);
+    memset(&Sym->Aux[0], 0, sizeof(Sym->Aux[0]));
+    Sym->Aux[0].AuxType = ATWeakExternal;
+    Sym->Aux[0].Aux.WeakExternal.TagIndex = 0;
+    Sym->Aux[0].Aux.WeakExternal.Characteristics =
         COFF::IMAGE_WEAK_EXTERN_SEARCH_LIBRARY;
   } else {
     if (!Base)
-      coff_symbol->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
+      Sym->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
     else
-      coff_symbol->Section = Sec;
-    Local = coff_symbol;
+      Sym->Section = Sec;
+    Local = Sym;
   }
 
   if (Local) {
-    Local->Data.Value = getSymbolValue(Symbol, Layout);
+    Local->Data.Value = getSymbolValue(MCSym, Layout);
 
-    const MCSymbolCOFF &SymbolCOFF = cast<MCSymbolCOFF>(Symbol);
+    const MCSymbolCOFF &SymbolCOFF = cast<MCSymbolCOFF>(MCSym);
     Local->Data.Type = SymbolCOFF.getType();
     Local->Data.StorageClass = SymbolCOFF.getClass();
 
     // If no storage class was specified in the streamer, define it here.
     if (Local->Data.StorageClass == COFF::IMAGE_SYM_CLASS_NULL) {
-      bool IsExternal = Symbol.isExternal() ||
-                        (!Symbol.getFragment() && !Symbol.isVariable());
+      bool IsExternal = MCSym.isExternal() ||
+                        (!MCSym.getFragment() && !MCSym.isVariable());
 
       Local->Data.StorageClass = IsExternal ? COFF::IMAGE_SYM_CLASS_EXTERNAL
                                             : COFF::IMAGE_SYM_CLASS_STATIC;
     }
   }
 
-  coff_symbol->MC = &Symbol;
+  Sym->MC = &MCSym;
 }
 
 // Maximum offsets for different string table entry encodings.
@@ -459,24 +433,25 @@ static void encodeBase64StringEntry(char *Buffer, uint64_t Value) {
 }
 
 void WinCOFFObjectWriter::SetSectionName(COFFSection &S) {
-  if (S.Name.size() > COFF::NameSize) {
-    uint64_t StringTableEntry = Strings.getOffset(S.Name);
-
-    if (StringTableEntry <= Max7DecimalOffset) {
-      SmallVector<char, COFF::NameSize> Buffer;
-      Twine('/').concat(Twine(StringTableEntry)).toVector(Buffer);
-      assert(Buffer.size() <= COFF::NameSize && Buffer.size() >= 2);
-
-      std::memcpy(S.Header.Name, Buffer.data(), Buffer.size());
-    } else if (StringTableEntry <= MaxBase64Offset) {
-      // Starting with 10,000,000, offsets are encoded as base64.
-      encodeBase64StringEntry(S.Header.Name, StringTableEntry);
-    } else {
-      report_fatal_error("COFF string table is greater than 64 GB.");
-    }
-  } else {
+  if (S.Name.size() <= COFF::NameSize) {
     std::memcpy(S.Header.Name, S.Name.c_str(), S.Name.size());
+    return;
   }
+
+  uint64_t StringTableEntry = Strings.getOffset(S.Name);
+  if (StringTableEntry <= Max7DecimalOffset) {
+    SmallVector<char, COFF::NameSize> Buffer;
+    Twine('/').concat(Twine(StringTableEntry)).toVector(Buffer);
+    assert(Buffer.size() <= COFF::NameSize && Buffer.size() >= 2);
+    std::memcpy(S.Header.Name, Buffer.data(), Buffer.size());
+    return;
+  }
+  if (StringTableEntry <= MaxBase64Offset) {
+    // Starting with 10,000,000, offsets are encoded as base64.
+    encodeBase64StringEntry(S.Header.Name, StringTableEntry);
+    return;
+  }
+  report_fatal_error("COFF string table is greater than 64 GB.");
 }
 
 void WinCOFFObjectWriter::SetSymbolName(COFFSymbol &S) {
@@ -583,18 +558,37 @@ void WinCOFFObjectWriter::WriteAuxiliarySymbols(
   }
 }
 
-void WinCOFFObjectWriter::writeSectionHeader(const COFF::section &S) {
-  writeBytes(StringRef(S.Name, COFF::NameSize));
-
-  writeLE32(S.VirtualSize);
-  writeLE32(S.VirtualAddress);
-  writeLE32(S.SizeOfRawData);
-  writeLE32(S.PointerToRawData);
-  writeLE32(S.PointerToRelocations);
-  writeLE32(S.PointerToLineNumbers);
-  writeLE16(S.NumberOfRelocations);
-  writeLE16(S.NumberOfLineNumbers);
-  writeLE32(S.Characteristics);
+// Write the section headers.
+void WinCOFFObjectWriter::writeSectionHeaders() {
+  // Section numbers must be monotonically increasing in the section
+  // header, but our Sections array is not sorted by section number,
+  // so make a copy of Sections and sort it.
+  std::vector<COFFSection *> Arr;
+  for (auto &Section : Sections)
+    Arr.push_back(Section.get());
+  std::sort(Arr.begin(), Arr.end(),
+            [](const COFFSection *A, const COFFSection *B) {
+              return A->Number < B->Number;
+            });
+
+  for (auto &Section : Arr) {
+    if (Section->Number == -1)
+      continue;
+
+    COFF::section &S = Section->Header;
+    if (Section->Relocations.size() >= 0xffff)
+      S.Characteristics |= COFF::IMAGE_SCN_LNK_NRELOC_OVFL;
+    writeBytes(StringRef(S.Name, COFF::NameSize));
+    writeLE32(S.VirtualSize);
+    writeLE32(S.VirtualAddress);
+    writeLE32(S.SizeOfRawData);
+    writeLE32(S.PointerToRawData);
+    writeLE32(S.PointerToRelocations);
+    writeLE32(S.PointerToLineNumbers);
+    writeLE16(S.NumberOfRelocations);
+    writeLE16(S.NumberOfLineNumbers);
+    writeLE32(S.Characteristics);
+  }
 }
 
 void WinCOFFObjectWriter::WriteRelocation(const COFF::relocation &R) {
@@ -603,6 +597,87 @@ void WinCOFFObjectWriter::WriteRelocation(const COFF::relocation &R) {
   writeLE16(R.Type);
 }
 
+// Write MCSec's contents to the output stream and return their CRC. This is
+// essentially "Asm.writeSectionData(&MCSec, Layout)", except the data is
+// staged in a buffer first so that the CRC can be computed over it.
+uint32_t WinCOFFObjectWriter::writeSectionContents(MCAssembler &Asm,
+                                                   const MCAsmLayout &Layout,
+                                                   const MCSection &MCSec) {
+  // Stage the section contents in a temporary buffer; the CRC is computed
+  // over this buffer, which is also what gets copied to the object file.
+  SmallVector<char, 128> Buf;
+  raw_svector_ostream VecOS(Buf);
+  raw_pwrite_stream &OldStream = getStream();
+
+  // Temporarily point the writer's stream at VecOS so that
+  // writeSectionData() fills Buf instead of the object file.
+  setStream(VecOS);
+  Asm.writeSectionData(&MCSec, Layout);
+
+  // Restore the stream that was active on entry.
+  setStream(OldStream);
+
+  // Copy the buffered section contents to the restored stream.
+  getStream() << Buf;
+
+  // Seed the CRC with 0. That is not how JamCRC is specified, but it
+  // aligns with the expected output.
+  JamCRC JC(/*Init=*/0);
+  JC.update(Buf);
+  return JC.getCRC();
+}
+
+// Write Sec's contents, record their CRC, then write Sec's relocations.
+void WinCOFFObjectWriter::writeSection(MCAssembler &Asm,
+                                       const MCAsmLayout &Layout,
+                                       const COFFSection &Sec,
+                                       const MCSection &MCSec) {
+  if (Sec.Number == -1)
+    return;
+
+  // Write the section contents.
+  if (Sec.Header.PointerToRawData != 0) {
+    assert(getStream().tell() <= Sec.Header.PointerToRawData &&
+           "Section::PointerToRawData is insane!");
+
+    unsigned PaddingSize = Sec.Header.PointerToRawData - getStream().tell();
+    assert(PaddingSize < 4 &&
+           "Should only need at most three bytes of padding!");
+    WriteZeros(PaddingSize);
+    uint32_t CRC = writeSectionContents(Asm, Layout, MCSec);
+
+    // Record the CRC via the mutable section that SectionMap holds for MCSec.
+    COFFSection *MappedSec = SectionMap[&MCSec];
+    COFFSymbol::AuxiliarySymbols &AuxSyms = MappedSec->Symbol->Aux;
+    assert(AuxSyms.size() == 1 && AuxSyms[0].AuxType == ATSectionDefinition);
+    AuxSymbol &SecDef = AuxSyms[0];
+    SecDef.Aux.SectionDefinition.CheckSum = CRC;
+  }
+
+  // Write relocations for this section.
+  if (Sec.Relocations.empty()) {
+    assert(Sec.Header.PointerToRelocations == 0 &&
+           "Section::PointerToRelocations is insane!");
+    return;
+  }
+
+  assert(getStream().tell() == Sec.Header.PointerToRelocations &&
+         "Section::PointerToRelocations is insane!");
+
+  if (Sec.Relocations.size() >= 0xffff) {
+    // In case of overflow, write actual relocation count as first
+    // relocation. Including the synthetic reloc itself (+ 1).
+    COFF::relocation R;
+    R.VirtualAddress = Sec.Relocations.size() + 1;
+    R.SymbolTableIndex = 0;
+    R.Type = 0;
+    WriteRelocation(R);
+  }
+
+  for (const auto &Relocation : Sec.Relocations)
+    WriteRelocation(Relocation.Data);
+}
+
 ////////////////////////////////////////////////////////////////////////////////
 // MCObjectWriter interface implementations
 
@@ -632,23 +707,6 @@ bool WinCOFFObjectWriter::isSymbolRefDifferenceFullyResolvedImpl(
                                                                 InSet, IsPCRel);
 }
 
-bool WinCOFFObjectWriter::isWeak(const MCSymbol &Sym) const {
-  if (!Sym.isExternal())
-    return false;
-
-  if (!Sym.isInSection())
-    return false;
-
-  const auto &Sec = cast<MCSectionCOFF>(Sym.getSection());
-  if (!Sec.getCOMDATSymbol())
-    return false;
-
-  // It looks like for COFF it is invalid to replace a reference to a global
-  // in a comdat with a reference to a local.
-  // FIXME: Add a specification reference if available.
-  return true;
-}
-
 void WinCOFFObjectWriter::recordRelocation(
     MCAssembler &Asm, const MCAsmLayout &Layout, const MCFragment *Fragment,
     const MCFixup &Fixup, MCValue Target, bool &IsPCRel, uint64_t &FixedValue) {
@@ -668,13 +726,13 @@ void WinCOFFObjectWriter::recordRelocation(
     return;
   }
 
-  MCSection *Section = Fragment->getParent();
+  MCSection *MCSec = Fragment->getParent();
 
   // Mark this symbol as requiring an entry in the symbol table.
-  assert(SectionMap.find(Section) != SectionMap.end() &&
+  assert(SectionMap.find(MCSec) != SectionMap.end() &&
          "Section must already have been defined in executePostLayoutBinding!");
 
-  COFFSection *coff_section = SectionMap[Section];
+  COFFSection *Sec = SectionMap[MCSec];
   const MCSymbolRefExpr *SymB = Target.getSymB();
   bool CrossSection = false;
 
@@ -796,46 +854,31 @@ void WinCOFFObjectWriter::recordRelocation(
     FixedValue = 0;
 
   if (TargetObjectWriter->recordRelocation(Fixup))
-    coff_section->Relocations.push_back(Reloc);
+    Sec->Relocations.push_back(Reloc);
 }
 
-void WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
-                                      const MCAsmLayout &Layout) {
-  size_t SectionsSize = Sections.size();
-  if (SectionsSize > static_cast<size_t>(INT32_MAX))
-    report_fatal_error(
-        "PE COFF object files can't have more than 2147483647 sections");
-
-  // Assign symbol and section indexes and offsets.
-  int32_t NumberOfSections = static_cast<int32_t>(SectionsSize);
-
-  UseBigObj = NumberOfSections > COFF::MaxNumberOfSections16;
-
-  // Assign section numbers.
-  size_t Number = 1;
-  for (const auto &Section : Sections) {
-    Section->Number = Number;
-    Section->Symbol->Data.SectionNumber = Number;
-    Section->Symbol->Aux[0].Aux.SectionDefinition.Number = Number;
-    ++Number;
-  }
-
-  Header.NumberOfSections = NumberOfSections;
-  Header.NumberOfSymbols = 0;
+// Return the current time, or UINT32_MAX if it is negative or over 32 bits.
+static std::time_t getTime() {
+  const std::time_t Now = time(nullptr);
+  const bool Fits = Now >= 0 && isUInt<32>(Now);
+  return Fits ? Now : static_cast<std::time_t>(UINT32_MAX);
+}
 
+// Create .file symbols.
+void WinCOFFObjectWriter::createFileSymbols(MCAssembler &Asm) {
   for (const std::string &Name : Asm.getFileNames()) {
     // round up to calculate the number of auxiliary symbols required
     unsigned SymbolSize = UseBigObj ? COFF::Symbol32Size : COFF::Symbol16Size;
     unsigned Count = (Name.size() + SymbolSize - 1) / SymbolSize;
 
-    COFFSymbol *file = createSymbol(".file");
-    file->Data.SectionNumber = COFF::IMAGE_SYM_DEBUG;
-    file->Data.StorageClass = COFF::IMAGE_SYM_CLASS_FILE;
-    file->Aux.resize(Count);
+    COFFSymbol *File = createSymbol(".file");
+    File->Data.SectionNumber = COFF::IMAGE_SYM_DEBUG;
+    File->Data.StorageClass = COFF::IMAGE_SYM_CLASS_FILE;
+    File->Aux.resize(Count);
 
     unsigned Offset = 0;
     unsigned Length = Name.size();
-    for (auto &Aux : file->Aux) {
+    for (auto &Aux : File->Aux) {
       Aux.AuxType = ATFile;
 
       if (Length > SymbolSize) {
@@ -850,6 +893,109 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
       Offset += SymbolSize;
     }
   }
+}
+
+static bool isAssociative(const COFFSection &Section) {
+  const auto &SecDef = Section.Symbol->Aux[0].Aux.SectionDefinition;
+  return SecDef.Selection == COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE;
+}
+
+void WinCOFFObjectWriter::assignSectionNumbers() {
+  size_t NextNumber = 1;
+  auto NumberMatching = [&](bool WantAssociative) {
+    for (const std::unique_ptr<COFFSection> &Section : Sections) {
+      if (isAssociative(*Section) != WantAssociative)
+        continue;
+      Section->Number = NextNumber;
+      Section->Symbol->Data.SectionNumber = NextNumber;
+      Section->Symbol->Aux[0].Aux.SectionDefinition.Number = NextNumber;
+      ++NextNumber;
+    }
+  };
+
+  // The Microsoft COFF spec does not explicitly request it, but MSVC link.exe
+  // as of 2017 cannot handle forward associative section references, so
+  // associative sections are numbered only after all the others.
+  NumberMatching(/*WantAssociative=*/false);
+  NumberMatching(/*WantAssociative=*/true);
+}
+
+// Assign file offsets to section data, relocations and the symbol table.
+void WinCOFFObjectWriter::assignFileOffsets(MCAssembler &Asm,
+                                            const MCAsmLayout &Layout) {
+  unsigned Offset = getInitialOffset();
+
+  Offset += UseBigObj ? COFF::Header32Size : COFF::Header16Size;
+  Offset += COFF::SectionSize * Header.NumberOfSections;
+
+  for (const auto &Section : Asm) {
+    COFFSection *Sec = SectionMap[&Section];
+
+    if (Sec->Number == -1) // Unnumbered sections take no file space.
+      continue;
+
+    Sec->Header.SizeOfRawData = Layout.getSectionAddressSize(&Section);
+
+    if (IsPhysicalSection(Sec)) {
+      // Align the section data to a four byte boundary.
+      Offset = alignTo(Offset, 4);
+      Sec->Header.PointerToRawData = Offset;
+
+      Offset += Sec->Header.SizeOfRawData;
+    }
+
+    if (!Sec->Relocations.empty()) {
+      bool RelocationsOverflow = Sec->Relocations.size() >= 0xffff;
+
+      if (RelocationsOverflow) {
+        // Signal overflow by setting NumberOfRelocations to max value. Actual
+        // size is found in reloc #0. Microsoft tools understand this.
+        Sec->Header.NumberOfRelocations = 0xffff;
+      } else {
+        Sec->Header.NumberOfRelocations = Sec->Relocations.size();
+      }
+      Sec->Header.PointerToRelocations = Offset;
+
+      if (RelocationsOverflow) {
+        // Reloc #0 will contain actual count, so make room for it.
+        Offset += COFF::RelocationSize;
+      }
+
+      Offset += COFF::RelocationSize * Sec->Relocations.size();
+
+      for (auto &Relocation : Sec->Relocations) {
+        assert(Relocation.Symb->getIndex() != -1);
+        Relocation.Data.SymbolTableIndex = Relocation.Symb->getIndex();
+      }
+    }
+
+    assert(Sec->Symbol->Aux.size() == 1 &&
+           "Section's symbol must have one aux!");
+    AuxSymbol &Aux = Sec->Symbol->Aux[0];
+    assert(Aux.AuxType == ATSectionDefinition &&
+           "Section's symbol's aux symbol must be a Section Definition!");
+    Aux.Aux.SectionDefinition.Length = Sec->Header.SizeOfRawData;
+    Aux.Aux.SectionDefinition.NumberOfRelocations =
+        Sec->Header.NumberOfRelocations;
+    Aux.Aux.SectionDefinition.NumberOfLinenumbers =
+        Sec->Header.NumberOfLineNumbers;
+  }
+
+  Header.PointerToSymbolTable = Offset; // Symbol table follows the sections.
+}
+
+void WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
+                                      const MCAsmLayout &Layout) {
+  if (Sections.size() > INT32_MAX)
+    report_fatal_error(
+        "PE COFF object files can't have more than 2147483647 sections");
+
+  UseBigObj = Sections.size() > COFF::MaxNumberOfSections16;
+  Header.NumberOfSections = Sections.size();
+  Header.NumberOfSymbols = 0;
+
+  assignSectionNumbers();
+  createFileSymbols(Asm);
 
   for (auto &Symbol : Symbols) {
     // Update section number & offset for symbols that have them.
@@ -912,78 +1058,12 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
     Section->Symbol->Aux[0].Aux.SectionDefinition.Number = Assoc->Number;
   }
 
-  // Assign file offsets to COFF object file structures.
-
-  unsigned offset = getInitialOffset();
-
-  if (UseBigObj)
-    offset += COFF::Header32Size;
-  else
-    offset += COFF::Header16Size;
-  offset += COFF::SectionSize * Header.NumberOfSections;
-
-  for (const auto &Section : Asm) {
-    COFFSection *Sec = SectionMap[&Section];
-
-    if (Sec->Number == -1)
-      continue;
-
-    Sec->Header.SizeOfRawData = Layout.getSectionAddressSize(&Section);
-
-    if (IsPhysicalSection(Sec)) {
-      // Align the section data to a four byte boundary.
-      offset = alignTo(offset, 4);
-      Sec->Header.PointerToRawData = offset;
-
-      offset += Sec->Header.SizeOfRawData;
-    }
-
-    if (Sec->Relocations.size() > 0) {
-      bool RelocationsOverflow = Sec->Relocations.size() >= 0xffff;
-
-      if (RelocationsOverflow) {
-        // Signal overflow by setting NumberOfRelocations to max value. Actual
-        // size is found in reloc #0. Microsoft tools understand this.
-        Sec->Header.NumberOfRelocations = 0xffff;
-      } else {
-        Sec->Header.NumberOfRelocations = Sec->Relocations.size();
-      }
-      Sec->Header.PointerToRelocations = offset;
-
-      if (RelocationsOverflow) {
-        // Reloc #0 will contain actual count, so make room for it.
-        offset += COFF::RelocationSize;
-      }
-
-      offset += COFF::RelocationSize * Sec->Relocations.size();
-
-      for (auto &Relocation : Sec->Relocations) {
-        assert(Relocation.Symb->getIndex() != -1);
-        Relocation.Data.SymbolTableIndex = Relocation.Symb->getIndex();
-      }
-    }
-
-    assert(Sec->Symbol->Aux.size() == 1 &&
-           "Section's symbol must have one aux!");
-    AuxSymbol &Aux = Sec->Symbol->Aux[0];
-    assert(Aux.AuxType == ATSectionDefinition &&
-           "Section's symbol's aux symbol must be a Section Definition!");
-    Aux.Aux.SectionDefinition.Length = Sec->Header.SizeOfRawData;
-    Aux.Aux.SectionDefinition.NumberOfRelocations =
-        Sec->Header.NumberOfRelocations;
-    Aux.Aux.SectionDefinition.NumberOfLinenumbers =
-        Sec->Header.NumberOfLineNumbers;
-  }
-
-  Header.PointerToSymbolTable = offset;
+  assignFileOffsets(Asm, Layout);
 
   // MS LINK expects to be able to use this timestamp to implement their
   // /INCREMENTAL feature.
   if (Asm.isIncrementalLinkerCompatible()) {
-    std::time_t Now = time(nullptr);
-    if (Now < 0 || !isUInt<32>(Now))
-      Now = UINT32_MAX;
-    Header.TimeDateStamp = Now;
+    Header.TimeDateStamp = getTime();
   } else {
     // Have deterministic output if /INCREMENTAL isn't needed. Also matches GNU.
     Header.TimeDateStamp = 0;
@@ -991,96 +1071,25 @@ void WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
 
   // Write it all to disk...
   WriteFileHeader(Header);
+  writeSectionHeaders();
 
-  {
-    sections::iterator i, ie;
-    MCAssembler::iterator j, je;
-
-    for (auto &Section : Sections) {
-      if (Section->Number != -1) {
-        if (Section->Relocations.size() >= 0xffff)
-          Section->Header.Characteristics |= COFF::IMAGE_SCN_LNK_NRELOC_OVFL;
-        writeSectionHeader(Section->Header);
-      }
-    }
-
-    SmallVector<char, 128> SectionContents;
-    for (i = Sections.begin(), ie = Sections.end(), j = Asm.begin(),
-        je = Asm.end();
-         (i != ie) && (j != je); ++i, ++j) {
-
-      if ((*i)->Number == -1)
-        continue;
-
-      if ((*i)->Header.PointerToRawData != 0) {
-        assert(getStream().tell() <= (*i)->Header.PointerToRawData &&
-               "Section::PointerToRawData is insane!");
-
-        unsigned SectionDataPadding =
-            (*i)->Header.PointerToRawData - getStream().tell();
-        assert(SectionDataPadding < 4 &&
-               "Should only need at most three bytes of padding!");
-
-        WriteZeros(SectionDataPadding);
-
-        // Save the contents of the section to a temporary buffer, we need this
-        // to CRC the data before we dump it into the object file.
-        SectionContents.clear();
-        raw_svector_ostream VecOS(SectionContents);
-        raw_pwrite_stream &OldStream = getStream();
-        // Redirect the output stream to our buffer.
-        setStream(VecOS);
-        // Fill our buffer with the section data.
-        Asm.writeSectionData(&*j, Layout);
-        // Reset the stream back to what it was before.
-        setStream(OldStream);
-
-        // Calculate our CRC with an initial value of '0', this is not how
-        // JamCRC is specified but it aligns with the expected output.
-        JamCRC JC(/*Init=*/0x00000000U);
-        JC.update(SectionContents);
-
-        // Write the section contents to the object file.
-        getStream() << SectionContents;
-
-        // Update the section definition auxiliary symbol to record the CRC.
-        COFFSection *Sec = SectionMap[&*j];
-        COFFSymbol::AuxiliarySymbols &AuxSyms = Sec->Symbol->Aux;
-        assert(AuxSyms.size() == 1 &&
-               AuxSyms[0].AuxType == ATSectionDefinition);
-        AuxSymbol &SecDef = AuxSyms[0];
-        SecDef.Aux.SectionDefinition.CheckSum = JC.getCRC();
-      }
-
-      if ((*i)->Relocations.size() > 0) {
-        assert(getStream().tell() == (*i)->Header.PointerToRelocations &&
-               "Section::PointerToRelocations is insane!");
-
-        if ((*i)->Relocations.size() >= 0xffff) {
-          // In case of overflow, write actual relocation count as first
-          // relocation. Including the synthetic reloc itself (+ 1).
-          COFF::relocation r;
-          r.VirtualAddress = (*i)->Relocations.size() + 1;
-          r.SymbolTableIndex = 0;
-          r.Type = 0;
-          WriteRelocation(r);
-        }
-
-        for (const auto &Relocation : (*i)->Relocations)
-          WriteRelocation(Relocation.Data);
-      } else
-        assert((*i)->Header.PointerToRelocations == 0 &&
-               "Section::PointerToRelocations is insane!");
-    }
-  }
+  // Write section contents.
+  sections::iterator I = Sections.begin();
+  sections::iterator IE = Sections.end();
+  MCAssembler::iterator J = Asm.begin();
+  MCAssembler::iterator JE = Asm.end();
+  for (; I != IE && J != JE; ++I, ++J)
+    writeSection(Asm, Layout, **I, *J);
 
   assert(getStream().tell() == Header.PointerToSymbolTable &&
          "Header::PointerToSymbolTable is insane!");
 
+  // Write a symbol table.
   for (auto &Symbol : Symbols)
     if (Symbol->getIndex() != -1)
       WriteSymbol(*Symbol);
 
+  // Write a string table, which completes the entire COFF file.
   Strings.write(getStream());
 }
 
diff --git a/lib/MC/WinCOFFStreamer.cpp b/lib/MC/WinCOFFStreamer.cpp
index 6383d8794030790894fddd3007b1f9731c27b228..c26d87f36f83d6edaa4f89b8347edad923976a76 100644
--- a/lib/MC/WinCOFFStreamer.cpp
+++ b/lib/MC/WinCOFFStreamer.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/MC/WinCOFFStreamer.cpp -----------------------------*- C++ -*-===//
+//===- llvm/MC/WinCOFFStreamer.cpp ----------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,32 +11,36 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAsmLayout.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
 #include "llvm/MC/MCObjectFileInfo.h"
 #include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCSectionCOFF.h"
-#include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbolCOFF.h"
-#include "llvm/MC/MCValue.h"
 #include "llvm/MC/MCWinCOFFStreamer.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/COFF.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SMLoc.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
 
 using namespace llvm;
 
 #define DEBUG_TYPE "WinCOFFStreamer"
 
-namespace llvm {
 MCWinCOFFStreamer::MCWinCOFFStreamer(MCContext &Context, MCAsmBackend &MAB,
                                      MCCodeEmitter &CE, raw_pwrite_stream &OS)
     : MCObjectStreamer(Context, MAB, OS, &CE), CurSymbol(nullptr) {}
@@ -75,10 +79,9 @@ void MCWinCOFFStreamer::InitSections(bool NoExecStack) {
   SwitchSection(getContext().getObjectFileInfo()->getTextSection());
 }
 
-void MCWinCOFFStreamer::EmitLabel(MCSymbol *S) {
+void MCWinCOFFStreamer::EmitLabel(MCSymbol *S, SMLoc Loc) {
   auto *Symbol = cast<MCSymbolCOFF>(S);
-  assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
-  MCObjectStreamer::EmitLabel(Symbol);
+  MCObjectStreamer::EmitLabel(Symbol, Loc);
 }
 
 void MCWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
@@ -275,10 +278,6 @@ void MCWinCOFFStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
   llvm_unreachable("not implemented");
 }
 
-void MCWinCOFFStreamer::EmitFileDirective(StringRef Filename) {
-  getAssembler().addFileName(Filename);
-}
-
 // TODO: Implement this if you want to emit .comment section in COFF obj files.
 void MCWinCOFFStreamer::EmitIdent(StringRef IdentString) {
   llvm_unreachable("not implemented");
@@ -295,5 +294,3 @@ void MCWinCOFFStreamer::FinishImpl() {
 void MCWinCOFFStreamer::Error(const Twine &Msg) const {
   getContext().reportError(SMLoc(), Msg);
 }
-}
-
diff --git a/lib/Object/ArchiveWriter.cpp b/lib/Object/ArchiveWriter.cpp
index f8e3c5a0a03f7f04ba14fe65ea291212776d0c82..5b233aab2018a7a3f365aefe9addd80b5089103a 100644
--- a/lib/Object/ArchiveWriter.cpp
+++ b/lib/Object/ArchiveWriter.cpp
@@ -122,12 +122,27 @@ static void printWithSpacePadding(raw_fd_ostream &OS, T Data, unsigned Size,
   }
 }
 
+static bool isBSDLike(object::Archive::Kind Kind) {
+  switch (Kind) {
+  case object::Archive::K_GNU:
+    return false;
+  case object::Archive::K_BSD:
+  case object::Archive::K_DARWIN:
+    return true;
+  case object::Archive::K_MIPS64:
+  case object::Archive::K_DARWIN64:
+  case object::Archive::K_COFF:
+    break; // Not handled here; falls out to the unreachable below.
+  }
+  llvm_unreachable("not supported for writing");
+}
+
 static void print32(raw_ostream &Out, object::Archive::Kind Kind,
                     uint32_t Val) {
-  if (Kind == object::Archive::K_GNU)
-    support::endian::Writer<support::big>(Out).write(Val);
-  else
+  if (isBSDLike(Kind))
     support::endian::Writer<support::little>(Out).write(Val);
+  else
+    support::endian::Writer<support::big>(Out).write(Val);
 }
 
 static void printRestOfMemberHeader(
@@ -178,7 +193,7 @@ printMemberHeader(raw_fd_ostream &Out, object::Archive::Kind Kind, bool Thin,
                   std::vector<unsigned>::iterator &StringMapIndexIter,
                   const sys::TimePoint<std::chrono::seconds> &ModTime,
                   unsigned UID, unsigned GID, unsigned Perms, unsigned Size) {
-  if (Kind == object::Archive::K_BSD)
+  if (isBSDLike(Kind))
     return printBSDMemberHeader(Out, Name, ModTime, UID, GID, Perms, Size);
   if (!useStringTable(Thin, Name))
     return printGNUSmallMemberHeader(Out, Name, ModTime, UID, GID, Perms, Size);
@@ -285,10 +300,10 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind,
 
     if (!HeaderStartOffset) {
       HeaderStartOffset = Out.tell();
-      if (Kind == object::Archive::K_GNU)
-        printGNUSmallMemberHeader(Out, "", now(Deterministic), 0, 0, 0, 0);
-      else
+      if (isBSDLike(Kind))
         printBSDMemberHeader(Out, "__.SYMDEF", now(Deterministic), 0, 0, 0, 0);
+      else
+        printGNUSmallMemberHeader(Out, "", now(Deterministic), 0, 0, 0, 0);
       BodyStartOffset = Out.tell();
       print32(Out, Kind, 0); // number of entries or bytes
     }
@@ -307,7 +322,7 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind,
         return EC;
       NameOS << '\0';
       MemberOffsetRefs.push_back(MemberNum);
-      if (Kind == object::Archive::K_BSD)
+      if (isBSDLike(Kind))
         print32(Out, Kind, NameOffset);
       print32(Out, Kind, 0); // member offset
     }
@@ -316,10 +331,21 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind,
   if (HeaderStartOffset == 0)
     return 0;
 
+  // ld64 prefers the cctools type archive which pads its string table to a
+  // boundary of sizeof(int32_t).
+  if (isBSDLike(Kind))
+    for (unsigned P = OffsetToAlignment(NameOS.tell(), sizeof(int32_t)); P--;)
+      NameOS << '\0';
+
   StringRef StringTable = NameOS.str();
-  if (Kind == object::Archive::K_BSD)
+  if (isBSDLike(Kind))
     print32(Out, Kind, StringTable.size()); // byte count of the string table
   Out << StringTable;
+  // If there are no symbols, emit an empty symbol table, to satisfy Solaris
+  // tools, older versions of which expect a symbol table in a non-empty
+  // archive, regardless of whether there are any symbols in it.
+  if (StringTable.size() == 0)
+    print32(Out, Kind, 0);
 
   // ld64 requires the next member header to start at an offset that is
   // 4 bytes aligned.
@@ -336,10 +362,10 @@ writeSymbolTable(raw_fd_ostream &Out, object::Archive::Kind Kind,
   // Patch up the number of symbols.
   Out.seek(BodyStartOffset);
   unsigned NumSyms = MemberOffsetRefs.size();
-  if (Kind == object::Archive::K_GNU)
-    print32(Out, Kind, NumSyms);
-  else
+  if (isBSDLike(Kind))
     print32(Out, Kind, NumSyms * 8);
+  else
+    print32(Out, Kind, NumSyms);
 
   Out.seek(Pos);
   return BodyStartOffset + 4;
@@ -351,8 +377,7 @@ llvm::writeArchive(StringRef ArcName,
                    bool WriteSymtab, object::Archive::Kind Kind,
                    bool Deterministic, bool Thin,
                    std::unique_ptr<MemoryBuffer> OldArchiveBuf) {
-  assert((!Thin || Kind == object::Archive::K_GNU) &&
-         "Only the gnu format has a thin mode");
+  assert((!Thin || !isBSDLike(Kind)) && "Only the gnu format has a thin mode");
   SmallString<128> TmpArchive;
   int TmpArchiveFD;
   if (auto EC = sys::fs::createUniqueFile(ArcName + ".temp-archive-%%%%%%%.a",
@@ -368,10 +393,6 @@ llvm::writeArchive(StringRef ArcName,
 
   std::vector<unsigned> MemberOffsetRefs;
 
-  std::vector<std::unique_ptr<MemoryBuffer>> Buffers;
-  std::vector<MemoryBufferRef> Members;
-  std::vector<sys::fs::file_status> NewMemberStatus;
-
   unsigned MemberReferenceOffset = 0;
   if (WriteSymtab) {
     ErrorOr<unsigned> MemberReferenceOffsetOrErr = writeSymbolTable(
@@ -382,25 +403,35 @@ llvm::writeArchive(StringRef ArcName,
   }
 
   std::vector<unsigned> StringMapIndexes;
-  if (Kind != object::Archive::K_BSD)
+  if (!isBSDLike(Kind))
     writeStringTable(Out, ArcName, NewMembers, StringMapIndexes, Thin);
 
   std::vector<unsigned>::iterator StringMapIndexIter = StringMapIndexes.begin();
   std::vector<unsigned> MemberOffset;
   for (const NewArchiveMember &M : NewMembers) {
     MemoryBufferRef File = M.Buf->getMemBufferRef();
+    unsigned Padding = 0;
 
     unsigned Pos = Out.tell();
     MemberOffset.push_back(Pos);
 
+    // ld64 expects the members to be 8-byte aligned for 64-bit content and at
+    // least 4-byte aligned for 32-bit content.  Opt for the larger encoding
+    // uniformly.  This matches the behaviour with cctools and ensures that ld64
+    // is happy with archives that we generate.
+    if (Kind == object::Archive::K_DARWIN)
+      Padding = OffsetToAlignment(M.Buf->getBufferSize(), 8);
+
     printMemberHeader(Out, Kind, Thin,
                       sys::path::filename(M.Buf->getBufferIdentifier()),
                       StringMapIndexIter, M.ModTime, M.UID, M.GID, M.Perms,
-                      M.Buf->getBufferSize());
+                      M.Buf->getBufferSize() + Padding);
 
     if (!Thin)
       Out << File.getBuffer();
 
+    while (Padding--)
+      Out << '\n';
     if (Out.tell() % 2)
       Out << '\n';
   }
@@ -408,7 +439,7 @@ llvm::writeArchive(StringRef ArcName,
   if (MemberReferenceOffset) {
     Out.seek(MemberReferenceOffset);
     for (unsigned MemberNum : MemberOffsetRefs) {
-      if (Kind == object::Archive::K_BSD)
+      if (isBSDLike(Kind))
         Out.seek(Out.tell() + 4); // skip over the string offset
       print32(Out, Kind, MemberOffset[MemberNum]);
     }
diff --git a/lib/Object/CMakeLists.txt b/lib/Object/CMakeLists.txt
index b895c3fcc0507eefb4f89ddca4332bfc708d1515..2007f560c166da1d8fa709e28dca86c5ccfcf20e 100644
--- a/lib/Object/CMakeLists.txt
+++ b/lib/Object/CMakeLists.txt
@@ -8,6 +8,7 @@ add_llvm_library(LLVMObject
   ELFObjectFile.cpp
   Error.cpp
   IRObjectFile.cpp
+  IRSymtab.cpp
   MachOObjectFile.cpp
   MachOUniversal.cpp
   ModuleSummaryIndexObjectFile.cpp
diff --git a/lib/Object/IRSymtab.cpp b/lib/Object/IRSymtab.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1ef2c4bb730325cbd07b652bd9a52d8f26111fa2
--- /dev/null
+++ b/lib/Object/IRSymtab.cpp
@@ -0,0 +1,228 @@
+//===- IRSymtab.cpp - implementation of IR symbol tables --------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/IRSymtab.h"
+#include "llvm/Analysis/ObjectUtils.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/StringTableBuilder.h"
+#include "llvm/Object/ModuleSymbolTable.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/StringSaver.h"
+
+using namespace llvm;
+using namespace irsymtab;
+
+namespace {
+
+/// Stores the temporary state that is required to build an IR symbol table.
+struct Builder {
+  SmallVector<char, 0> &Symtab;
+  SmallVector<char, 0> &Strtab;
+  Builder(SmallVector<char, 0> &Symtab, SmallVector<char, 0> &Strtab)
+      : Symtab(Symtab), Strtab(Strtab) {}
+
+  StringTableBuilder StrtabBuilder{StringTableBuilder::ELF};
+
+  BumpPtrAllocator Alloc;
+  StringSaver Saver{Alloc};
+
+  DenseMap<const Comdat *, unsigned> ComdatMap;
+  ModuleSymbolTable Msymtab;
+  SmallPtrSet<GlobalValue *, 8> Used;
+  Mangler Mang;
+  Triple TT;
+
+  std::vector<storage::Comdat> Comdats;
+  std::vector<storage::Module> Mods;
+  std::vector<storage::Symbol> Syms;
+  std::vector<storage::Uncommon> Uncommons;
+
+  std::string COFFLinkerOpts;
+  raw_string_ostream COFFLinkerOptsOS{COFFLinkerOpts};
+
+  void setStr(storage::Str &S, StringRef Value) {
+    S.Offset = StrtabBuilder.add(Value);
+  }
+  template <typename T>
+  void writeRange(storage::Range<T> &R, const std::vector<T> &Objs) {
+    R.Offset = Symtab.size();
+    R.Size = Objs.size();
+    Symtab.insert(Symtab.end(), reinterpret_cast<const char *>(Objs.data()),
+                  reinterpret_cast<const char *>(Objs.data() + Objs.size()));
+  }
+
+  Error addModule(Module *M);
+  Error addSymbol(ModuleSymbolTable::Symbol Sym);
+
+  Error build(ArrayRef<Module *> Mods);
+};
+
+Error Builder::addModule(Module *M) {
+  collectUsedGlobalVariables(*M, Used, /*CompilerUsed*/ false);
+  // Record the half-open range of Msymtab symbols contributed by M.
+  storage::Module Mod;
+  Mod.Begin = Msymtab.symbols().size();
+  Msymtab.addModule(M);
+  Mod.End = Msymtab.symbols().size();
+  Mods.push_back(Mod);
+
+  // Only COFF targets collect the "Linker Options" module flag strings.
+  if (!TT.isOSBinFormatCOFF())
+    return Error::success();
+  if (auto E = M->materializeMetadata())
+    return E;
+  Metadata *Val = M->getModuleFlag("Linker Options");
+  if (!Val)
+    return Error::success();
+  for (const MDOperand &MDOptions : cast<MDNode>(Val)->operands())
+    for (const MDOperand &MDOption : cast<MDNode>(MDOptions)->operands())
+      COFFLinkerOptsOS << " " << cast<MDString>(MDOption)->getString();
+  return Error::success();
+}
+
+Error Builder::addSymbol(ModuleSymbolTable::Symbol Msym) {
+  Syms.emplace_back();
+  storage::Symbol &Sym = Syms.back();
+  Sym = {};
+
+  Sym.UncommonIndex = -1;
+  storage::Uncommon *Unc = nullptr;
+  auto Uncommon = [&]() -> storage::Uncommon & {
+    if (Unc)
+      return *Unc;
+    Sym.UncommonIndex = Uncommons.size();
+    Uncommons.emplace_back();
+    Unc = &Uncommons.back();
+    *Unc = {};
+    setStr(Unc->COFFWeakExternFallbackName, "");
+    return *Unc;
+  };
+
+  SmallString<64> Name;
+  {
+    raw_svector_ostream OS(Name);
+    Msymtab.printSymbolName(OS, Msym);
+  }
+  setStr(Sym.Name, Saver.save(StringRef(Name)));
+
+  auto Flags = Msymtab.getSymbolFlags(Msym);
+  if (Flags & object::BasicSymbolRef::SF_Undefined)
+    Sym.Flags |= 1 << storage::Symbol::FB_undefined;
+  if (Flags & object::BasicSymbolRef::SF_Weak)
+    Sym.Flags |= 1 << storage::Symbol::FB_weak;
+  if (Flags & object::BasicSymbolRef::SF_Common)
+    Sym.Flags |= 1 << storage::Symbol::FB_common;
+  if (Flags & object::BasicSymbolRef::SF_Indirect)
+    Sym.Flags |= 1 << storage::Symbol::FB_indirect;
+  if (Flags & object::BasicSymbolRef::SF_Global)
+    Sym.Flags |= 1 << storage::Symbol::FB_global;
+  if (Flags & object::BasicSymbolRef::SF_FormatSpecific)
+    Sym.Flags |= 1 << storage::Symbol::FB_format_specific;
+
+  Sym.ComdatIndex = -1;
+  auto *GV = Msym.dyn_cast<GlobalValue *>();
+  if (!GV) {
+    setStr(Sym.IRName, "");
+    return Error::success();
+  }
+
+  setStr(Sym.IRName, GV->getName());
+
+  if (Used.count(GV))
+    Sym.Flags |= 1 << storage::Symbol::FB_used;
+  if (GV->isThreadLocal())
+    Sym.Flags |= 1 << storage::Symbol::FB_tls;
+  if (GV->hasGlobalUnnamedAddr())
+    Sym.Flags |= 1 << storage::Symbol::FB_unnamed_addr;
+  if (canBeOmittedFromSymbolTable(GV))
+    Sym.Flags |= 1 << storage::Symbol::FB_may_omit;
+  Sym.Flags |= unsigned(GV->getVisibility()) << storage::Symbol::FB_visibility;
+
+  if (Flags & object::BasicSymbolRef::SF_Common) {
+    Uncommon().CommonSize = GV->getParent()->getDataLayout().getTypeAllocSize(
+        GV->getType()->getElementType());
+    Uncommon().CommonAlign = GV->getAlignment();
+  }
+
+  const GlobalObject *Base = GV->getBaseObject();
+  if (!Base)
+    return make_error<StringError>("Unable to determine comdat of alias!",
+                                   inconvertibleErrorCode());
+  if (const Comdat *C = Base->getComdat()) {
+    auto P = ComdatMap.insert(std::make_pair(C, Comdats.size()));
+    Sym.ComdatIndex = P.first->second;
+
+    if (P.second) {
+      storage::Comdat Comdat;
+      setStr(Comdat.Name, C->getName());
+      Comdats.push_back(Comdat);
+    }
+  }
+
+  if (TT.isOSBinFormatCOFF()) {
+    emitLinkerFlagsForGlobalCOFF(COFFLinkerOptsOS, GV, TT, Mang);
+
+    if ((Flags & object::BasicSymbolRef::SF_Weak) &&
+        (Flags & object::BasicSymbolRef::SF_Indirect)) {
+      std::string FallbackName;
+      raw_string_ostream OS(FallbackName);
+      Msymtab.printSymbolName(
+          OS, cast<GlobalValue>(
+                  cast<GlobalAlias>(GV)->getAliasee()->stripPointerCasts()));
+      OS.flush();
+      setStr(Uncommon().COFFWeakExternFallbackName, Saver.save(FallbackName));
+    }
+  }
+
+  return Error::success();
+}
+
+Error Builder::build(ArrayRef<Module *> IRMods) {
+  storage::Header Hdr;
+
+  assert(!IRMods.empty());
+  setStr(Hdr.SourceFileName, IRMods[0]->getSourceFileName());
+  TT = Triple(IRMods[0]->getTargetTriple());
+
+  // This adds the symbols for each module to Msymtab.
+  for (auto *M : IRMods)
+    if (Error Err = addModule(M))
+      return Err;
+
+  for (ModuleSymbolTable::Symbol Msym : Msymtab.symbols())
+    if (Error Err = addSymbol(Msym))
+      return Err;
+
+  COFFLinkerOptsOS.flush();
+  setStr(Hdr.COFFLinkerOpts, COFFLinkerOpts);
+
+  // We are about to fill in the header's range fields, so reserve space for it
+  // and copy it in afterwards.
+  Symtab.resize(sizeof(storage::Header));
+  writeRange(Hdr.Modules, Mods);
+  writeRange(Hdr.Comdats, Comdats);
+  writeRange(Hdr.Symbols, Syms);
+  writeRange(Hdr.Uncommons, Uncommons);
+
+  *reinterpret_cast<storage::Header *>(Symtab.data()) = Hdr;
+
+  raw_svector_ostream OS(Strtab);
+  StrtabBuilder.finalizeInOrder();
+  StrtabBuilder.write(OS);
+
+  return Error::success();
+}
+
+} // anonymous namespace
+
+Error irsymtab::build(ArrayRef<Module *> Mods, SmallVector<char, 0> &Symtab,
+                      SmallVector<char, 0> &Strtab) {
+  return Builder(Symtab, Strtab).build(Mods);
+}
diff --git a/lib/Object/MachOObjectFile.cpp b/lib/Object/MachOObjectFile.cpp
index c0a7518d1e69f0cce8288cf2f34ef153266c5977..1753d2baaedd2181c72370f1eb6876b4fef164a8 100644
--- a/lib/Object/MachOObjectFile.cpp
+++ b/lib/Object/MachOObjectFile.cpp
@@ -106,13 +106,6 @@ static StringRef parseSegmentOrSectionName(const char *P) {
   return StringRef(P, 16);
 }
 
-// Helper to advance a section or symbol iterator multiple increments at a time.
-template<class T>
-static void advance(T &it, size_t Val) {
-  while (Val--)
-    ++it;
-}
-
 static unsigned getCPUType(const MachOObjectFile &O) {
   return O.getHeader().cputype;
 }
@@ -368,7 +361,7 @@ static Error parseSegmentLoadCommand(
                             CmdName + " extends past the end of the file");
     if (S.vmsize != 0 && S.filesize > S.vmsize)
       return malformedError("load command " + Twine(LoadCommandIndex) +
-                            " fileoff field in " + CmdName +
+                            " filesize field in " + CmdName +
                             " greater than vmsize field");
     IsPageZeroSegment |= StringRef("__PAGEZERO").equals(S.segname);
   } else
@@ -2272,6 +2265,10 @@ std::error_code MachOObjectFile::getLibraryShortNameByIndex(unsigned Index,
   return std::error_code();
 }
 
+uint32_t MachOObjectFile::getLibraryCount() const {
+  return Libraries.size();
+}
+
 section_iterator
 MachOObjectFile::getRelocationRelocatedSection(relocation_iterator Rel) const {
   DataRefImpl Sec;
@@ -2477,6 +2474,8 @@ Triple MachOObjectFile::getArchTriple(uint32_t CPUType, uint32_t CPUSubType,
   case MachO::CPU_TYPE_ARM64:
     switch (CPUSubType & ~MachO::CPU_SUBTYPE_MASK) {
     case MachO::CPU_SUBTYPE_ARM64_ALL:
+      if (McpuDefault)
+        *McpuDefault = "cyclone";
       if (ArchFlag)
         *ArchFlag = "arm64";
       return Triple("arm64-apple-darwin");
@@ -2749,10 +2748,11 @@ iterator_range<export_iterator> MachOObjectFile::exports() const {
   return exports(getDyldInfoExportsTrie());
 }
 
-MachORebaseEntry::MachORebaseEntry(ArrayRef<uint8_t> Bytes, bool is64Bit)
-    : Opcodes(Bytes), Ptr(Bytes.begin()), SegmentOffset(0), SegmentIndex(0),
-      RemainingLoopCount(0), AdvanceAmount(0), RebaseType(0),
-      PointerSize(is64Bit ? 8 : 4), Malformed(false), Done(false) {}
+MachORebaseEntry::MachORebaseEntry(Error *E, const MachOObjectFile *O,
+                                   ArrayRef<uint8_t> Bytes, bool is64Bit)
+    : E(E), O(O), Opcodes(Bytes), Ptr(Bytes.begin()), SegmentOffset(0),
+      SegmentIndex(-1), RemainingLoopCount(0), AdvanceAmount(0), RebaseType(0),
+      PointerSize(is64Bit ? 8 : 4), Done(false) {}
 
 void MachORebaseEntry::moveToFirst() {
   Ptr = Opcodes.begin();
@@ -2766,22 +2766,29 @@ void MachORebaseEntry::moveToEnd() {
 }
 
 void MachORebaseEntry::moveNext() {
+  ErrorAsOutParameter ErrAsOutParam(E);
   // If in the middle of some loop, move to next rebasing in loop.
   SegmentOffset += AdvanceAmount;
   if (RemainingLoopCount) {
     --RemainingLoopCount;
     return;
   }
+  // REBASE_OPCODE_DONE is only used for padding if we are not aligned to
+  // pointer size. Therefore it is possible to reach the end without ever having
+  // seen REBASE_OPCODE_DONE.
   if (Ptr == Opcodes.end()) {
     Done = true;
     return;
   }
   bool More = true;
-  while (More && !Malformed) {
+  while (More) {
     // Parse next opcode and set up next loop.
+    const uint8_t *OpcodeStart = Ptr;
     uint8_t Byte = *Ptr++;
     uint8_t ImmValue = Byte & MachO::REBASE_IMMEDIATE_MASK;
     uint8_t Opcode = Byte & MachO::REBASE_OPCODE_MASK;
+    uint32_t Count, Skip;
+    const char *error = nullptr;
     switch (Opcode) {
     case MachO::REBASE_OPCODE_DONE:
       More = false;
@@ -2791,6 +2798,13 @@ void MachORebaseEntry::moveNext() {
       break;
     case MachO::REBASE_OPCODE_SET_TYPE_IMM:
       RebaseType = ImmValue;
+      if (RebaseType > MachO::REBASE_TYPE_TEXT_PCREL32) {
+        *E = malformedError("for REBASE_OPCODE_SET_TYPE_IMM bad bind type: " +
+             Twine((int)RebaseType) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       DEBUG_WITH_TYPE(
           "mach-o-rebase",
           llvm::dbgs() << "REBASE_OPCODE_SET_TYPE_IMM: "
@@ -2798,7 +2812,23 @@ void MachORebaseEntry::moveNext() {
       break;
     case MachO::REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB:
       SegmentIndex = ImmValue;
-      SegmentOffset = readULEB128();
+      SegmentOffset = readULEB128(&error);
+      if (error) {
+        *E = malformedError("for REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+                                              true);
+      if (error) {
+        *E = malformedError("for REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       DEBUG_WITH_TYPE(
           "mach-o-rebase",
           llvm::dbgs() << "REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB: "
@@ -2807,22 +2837,80 @@ void MachORebaseEntry::moveNext() {
                        << "\n");
       break;
     case MachO::REBASE_OPCODE_ADD_ADDR_ULEB:
-      SegmentOffset += readULEB128();
+      SegmentOffset += readULEB128(&error);
+      if (error) {
+        *E = malformedError("for REBASE_OPCODE_ADD_ADDR_ULEB " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+                                              true);
+      if (error) {
+        *E = malformedError("for REBASE_OPCODE_ADD_ADDR_ULEB " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       DEBUG_WITH_TYPE("mach-o-rebase",
                       llvm::dbgs() << "REBASE_OPCODE_ADD_ADDR_ULEB: "
                                    << format("SegmentOffset=0x%06X",
                                              SegmentOffset) << "\n");
       break;
     case MachO::REBASE_OPCODE_ADD_ADDR_IMM_SCALED:
+      error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+                                              true);
+      if (error) {
+        *E = malformedError("for REBASE_OPCODE_ADD_ADDR_IMM_SCALED " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       SegmentOffset += ImmValue * PointerSize;
+      error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+                                              false);
+      if (error) {
+        *E = malformedError("for REBASE_OPCODE_ADD_ADDR_IMM_SCALED "
+             " (after adding immediate times the pointer size) " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       DEBUG_WITH_TYPE("mach-o-rebase",
                       llvm::dbgs() << "REBASE_OPCODE_ADD_ADDR_IMM_SCALED: "
                                    << format("SegmentOffset=0x%06X",
                                              SegmentOffset) << "\n");
       break;
     case MachO::REBASE_OPCODE_DO_REBASE_IMM_TIMES:
+      error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+                                              true);
+      if (error) {
+        *E = malformedError("for REBASE_OPCODE_DO_REBASE_IMM_TIMES " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       AdvanceAmount = PointerSize;
-      RemainingLoopCount = ImmValue - 1;
+      Skip = 0;
+      Count = ImmValue;
+      if (ImmValue != 0)
+        RemainingLoopCount = ImmValue - 1;
+      else
+        RemainingLoopCount = 0;
+      error = O->RebaseEntryCheckCountAndSkip(Count, Skip, PointerSize,
+                                              SegmentIndex, SegmentOffset);
+      if (error) {
+        *E = malformedError("for REBASE_OPCODE_DO_REBASE_IMM_TIMES " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       DEBUG_WITH_TYPE(
           "mach-o-rebase",
           llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_IMM_TIMES: "
@@ -2832,8 +2920,38 @@ void MachORebaseEntry::moveNext() {
                        << "\n");
       return;
     case MachO::REBASE_OPCODE_DO_REBASE_ULEB_TIMES:
+      error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+                                              true);
+      if (error) {
+        *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       AdvanceAmount = PointerSize;
-      RemainingLoopCount = readULEB128() - 1;
+      Skip = 0;
+      Count = readULEB128(&error);
+      if (error) {
+        *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      if (Count != 0)
+        RemainingLoopCount = Count - 1;
+      else
+        RemainingLoopCount = 0;
+      error = O->RebaseEntryCheckCountAndSkip(Count, Skip, PointerSize,
+                                              SegmentIndex, SegmentOffset);
+      if (error) {
+        *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       DEBUG_WITH_TYPE(
           "mach-o-rebase",
           llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_ULEB_TIMES: "
@@ -2843,8 +2961,35 @@ void MachORebaseEntry::moveNext() {
                        << "\n");
       return;
     case MachO::REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB:
-      AdvanceAmount = readULEB128() + PointerSize;
+      error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+                                              true);
+      if (error) {
+        *E = malformedError("for REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      Skip = readULEB128(&error);
+      if (error) {
+        *E = malformedError("for REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      AdvanceAmount = Skip + PointerSize;
+      Count = 1;
       RemainingLoopCount = 0;
+      error = O->RebaseEntryCheckCountAndSkip(Count, Skip, PointerSize,
+                                              SegmentIndex, SegmentOffset);
+      if (error) {
+        *E = malformedError("for REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       DEBUG_WITH_TYPE(
           "mach-o-rebase",
           llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB: "
@@ -2854,8 +2999,46 @@ void MachORebaseEntry::moveNext() {
                        << "\n");
       return;
     case MachO::REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB:
-      RemainingLoopCount = readULEB128() - 1;
-      AdvanceAmount = readULEB128() + PointerSize;
+      error = O->RebaseEntryCheckSegAndOffset(SegmentIndex, SegmentOffset,
+                                              true);
+      if (error) {
+        *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_"
+             "ULEB " + Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      Count = readULEB128(&error);
+      if (error) {
+        *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_"
+             "ULEB " + Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      if (Count != 0)
+        RemainingLoopCount = Count - 1;
+      else
+        RemainingLoopCount = 0;
+      Skip = readULEB128(&error);
+      if (error) {
+        *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_"
+             "ULEB " + Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      AdvanceAmount = Skip + PointerSize;
+
+      error = O->RebaseEntryCheckCountAndSkip(Count, Skip, PointerSize,
+                                              SegmentIndex, SegmentOffset);
+      if (error) {
+        *E = malformedError("for REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_"
+             "ULEB " + Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       DEBUG_WITH_TYPE(
           "mach-o-rebase",
           llvm::dbgs() << "REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB: "
@@ -2865,23 +3048,25 @@ void MachORebaseEntry::moveNext() {
                        << "\n");
       return;
     default:
-      Malformed = true;
+      *E = malformedError("bad rebase info (bad opcode value 0x" +
+           utohexstr(Opcode) + " for opcode at: 0x" +
+           utohexstr(OpcodeStart - Opcodes.begin()));
+      moveToEnd();
+      return;
     }
   }
 }
 
-uint64_t MachORebaseEntry::readULEB128() {
+uint64_t MachORebaseEntry::readULEB128(const char **error) {
   unsigned Count;
-  uint64_t Result = decodeULEB128(Ptr, &Count);
+  uint64_t Result = decodeULEB128(Ptr, &Count, Opcodes.end(), error);
   Ptr += Count;
-  if (Ptr > Opcodes.end()) {
+  if (Ptr > Opcodes.end()) // Keep Ptr within the opcode bytes.
     Ptr = Opcodes.end();
-    Malformed = true;
-  }
   return Result;
 }
 
-uint32_t MachORebaseEntry::segmentIndex() const { return SegmentIndex; }
+int32_t MachORebaseEntry::segmentIndex() const { return SegmentIndex; }
 
 uint64_t MachORebaseEntry::segmentOffset() const { return SegmentOffset; }
 
@@ -2897,6 +3082,24 @@ StringRef MachORebaseEntry::typeName() const {
   return "unknown";
 }
 
+// Segment name for this entry, looked up from its SegmentIndex. Only meant
+// for a rebase entry whose segment index has already been checked.
+StringRef MachORebaseEntry::segmentName() const {
+  return O->BindRebaseSegmentName(SegmentIndex);
+}
+
+// Section name for this entry's (SegmentIndex, SegmentOffset) pair. Only
+// meant for a rebase entry whose segment and offset have been checked.
+StringRef MachORebaseEntry::sectionName() const {
+  return O->BindRebaseSectionName(SegmentIndex, SegmentOffset);
+}
+
+// Address for this entry's (SegmentIndex, SegmentOffset) pair. Only meant
+// for a rebase entry whose segment and offset have already been checked.
+uint64_t MachORebaseEntry::address() const {
+  return O->BindRebaseAddress(SegmentIndex, SegmentOffset);
+}
+
 bool MachORebaseEntry::operator==(const MachORebaseEntry &Other) const {
 #ifdef EXPENSIVE_CHECKS
   assert(Opcodes == Other.Opcodes && "compare iterators of different files");
@@ -2909,25 +3112,29 @@ bool MachORebaseEntry::operator==(const MachORebaseEntry &Other) const {
 }
 
 iterator_range<rebase_iterator>
-MachOObjectFile::rebaseTable(ArrayRef<uint8_t> Opcodes, bool is64) {
-  MachORebaseEntry Start(Opcodes, is64);
+MachOObjectFile::rebaseTable(Error &Err, MachOObjectFile *O,
+                             ArrayRef<uint8_t> Opcodes, bool is64) {
+  if (O->BindRebaseSectionTable == nullptr) // Create the table only once.
+    O->BindRebaseSectionTable = llvm::make_unique<BindRebaseSegInfo>(O);
+  MachORebaseEntry Start(&Err, O, Opcodes, is64);
   Start.moveToFirst();
 
-  MachORebaseEntry Finish(Opcodes, is64);
+  MachORebaseEntry Finish(&Err, O, Opcodes, is64);
   Finish.moveToEnd();
 
   return make_range(rebase_iterator(Start), rebase_iterator(Finish));
 }
 
-iterator_range<rebase_iterator> MachOObjectFile::rebaseTable() const {
-  return rebaseTable(getDyldInfoRebaseOpcodes(), is64Bit());
+iterator_range<rebase_iterator> MachOObjectFile::rebaseTable(Error &Err) {
+  return rebaseTable(Err, this, getDyldInfoRebaseOpcodes(), is64Bit());
 }
 
-MachOBindEntry::MachOBindEntry(ArrayRef<uint8_t> Bytes, bool is64Bit, Kind BK)
-    : Opcodes(Bytes), Ptr(Bytes.begin()), SegmentOffset(0), SegmentIndex(0),
-      Ordinal(0), Flags(0), Addend(0), RemainingLoopCount(0), AdvanceAmount(0),
-      BindType(0), PointerSize(is64Bit ? 8 : 4),
-      TableKind(BK), Malformed(false), Done(false) {}
+MachOBindEntry::MachOBindEntry(Error *E, const MachOObjectFile *O,
+                               ArrayRef<uint8_t> Bytes, bool is64Bit, Kind BK)
+    : E(E), O(O), Opcodes(Bytes), Ptr(Bytes.begin()), SegmentOffset(0),
+      SegmentIndex(-1), LibraryOrdinalSet(false), Ordinal(0), Flags(0),
+      Addend(0), RemainingLoopCount(0), AdvanceAmount(0), BindType(0),
+      PointerSize(is64Bit ? 8 : 4), TableKind(BK), Done(false) {}
 
 void MachOBindEntry::moveToFirst() {
   Ptr = Opcodes.begin();
@@ -2941,24 +3148,31 @@ void MachOBindEntry::moveToEnd() {
 }
 
 void MachOBindEntry::moveNext() {
+  ErrorAsOutParameter ErrAsOutParam(E);
   // If in the middle of some loop, move to next binding in loop.
   SegmentOffset += AdvanceAmount;
   if (RemainingLoopCount) {
     --RemainingLoopCount;
     return;
   }
+  // BIND_OPCODE_DONE is only used for padding if we are not aligned to
+  // pointer size. Therefore it is possible to reach the end without ever having
+  // seen BIND_OPCODE_DONE.
   if (Ptr == Opcodes.end()) {
     Done = true;
     return;
   }
   bool More = true;
-  while (More && !Malformed) {
+  while (More) {
     // Parse next opcode and set up next loop.
+    const uint8_t *OpcodeStart = Ptr;
     uint8_t Byte = *Ptr++;
     uint8_t ImmValue = Byte & MachO::BIND_IMMEDIATE_MASK;
     uint8_t Opcode = Byte & MachO::BIND_OPCODE_MASK;
     int8_t SignExtended;
     const uint8_t *SymStart;
+    uint32_t Count, Skip;
+    const char *error = nullptr;
     switch (Opcode) {
     case MachO::BIND_OPCODE_DONE:
       if (TableKind == Kind::Lazy) {
@@ -2974,28 +3188,81 @@ void MachOBindEntry::moveNext() {
           break;
       }
       More = false;
-      Done = true;
       moveToEnd();
       DEBUG_WITH_TYPE("mach-o-bind", llvm::dbgs() << "BIND_OPCODE_DONE\n");
       break;
     case MachO::BIND_OPCODE_SET_DYLIB_ORDINAL_IMM:
+      if (TableKind == Kind::Weak) {
+        *E = malformedError("BIND_OPCODE_SET_DYLIB_ORDINAL_IMM not allowed in "
+             "weak bind table for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       Ordinal = ImmValue;
+      LibraryOrdinalSet = true;
+      if (ImmValue > O->getLibraryCount()) {
+        *E = malformedError("for BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB bad "
+             "library ordinal: " + Twine((int)ImmValue) + " (max " +
+             Twine((int)O->getLibraryCount()) + ") for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       DEBUG_WITH_TYPE(
           "mach-o-bind",
           llvm::dbgs() << "BIND_OPCODE_SET_DYLIB_ORDINAL_IMM: "
                        << "Ordinal=" << Ordinal << "\n");
       break;
     case MachO::BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB:
-      Ordinal = readULEB128();
+      if (TableKind == Kind::Weak) {
+        *E = malformedError("BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB not allowed in "
+             "weak bind table for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      Ordinal = readULEB128(&error);
+      LibraryOrdinalSet = true;
+      if (error) {
+        *E = malformedError("for BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      if (Ordinal > (int)O->getLibraryCount()) {
+        *E = malformedError("for BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB bad "
+             "library ordinal: " + Twine((int)Ordinal) + " (max " +
+             Twine((int)O->getLibraryCount()) + ") for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       DEBUG_WITH_TYPE(
           "mach-o-bind",
           llvm::dbgs() << "BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB: "
                        << "Ordinal=" << Ordinal << "\n");
       break;
     case MachO::BIND_OPCODE_SET_DYLIB_SPECIAL_IMM:
+      if (TableKind == Kind::Weak) {
+        *E = malformedError("BIND_OPCODE_SET_DYLIB_SPECIAL_IMM not allowed in "
+             "weak bind table for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       if (ImmValue) {
         SignExtended = MachO::BIND_OPCODE_MASK | ImmValue;
         Ordinal = SignExtended;
+        LibraryOrdinalSet = true;
+        if (Ordinal < MachO::BIND_SPECIAL_DYLIB_FLAT_LOOKUP) {
+          *E = malformedError("for BIND_OPCODE_SET_DYLIB_SPECIAL_IMM unknown "
+               "special ordinal: " + Twine((int)Ordinal) + " for opcode at: "
+               "0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+          moveToEnd();
+          return;
+        }
       } else
         Ordinal = 0;
       DEBUG_WITH_TYPE(
@@ -3006,9 +3273,16 @@ void MachOBindEntry::moveNext() {
     case MachO::BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM:
       Flags = ImmValue;
       SymStart = Ptr;
-      while (*Ptr) {
+      while ((Ptr < Opcodes.end()) && *Ptr) { // Check bound before reading.
         ++Ptr;
       }
+      if (Ptr == Opcodes.end()) {
+        *E = malformedError("for BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM "
+             "symbol name extends past opcodes for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       SymbolName = StringRef(reinterpret_cast<const char*>(SymStart),
                              Ptr-SymStart);
       ++Ptr;
@@ -3023,15 +3297,27 @@ void MachOBindEntry::moveNext() {
       break;
     case MachO::BIND_OPCODE_SET_TYPE_IMM:
       BindType = ImmValue;
+      if (ImmValue > MachO::BIND_TYPE_TEXT_PCREL32) {
+        *E = malformedError("for BIND_OPCODE_SET_TYPE_IMM bad bind type: " +
+             Twine((int)ImmValue) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       DEBUG_WITH_TYPE(
           "mach-o-bind",
           llvm::dbgs() << "BIND_OPCODE_SET_TYPE_IMM: "
                        << "BindType=" << (int)BindType << "\n");
       break;
     case MachO::BIND_OPCODE_SET_ADDEND_SLEB:
-      Addend = readSLEB128();
-      if (TableKind == Kind::Lazy)
-        Malformed = true;
+      Addend = readSLEB128(&error);
+      if (error) {
+        *E = malformedError("for BIND_OPCODE_SET_ADDEND_SLEB " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       DEBUG_WITH_TYPE(
           "mach-o-bind",
           llvm::dbgs() << "BIND_OPCODE_SET_ADDEND_SLEB: "
@@ -3039,7 +3325,22 @@ void MachOBindEntry::moveNext() {
       break;
     case MachO::BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB:
       SegmentIndex = ImmValue;
-      SegmentOffset = readULEB128();
+      SegmentOffset = readULEB128(&error);
+      if (error) {
+        *E = malformedError("for BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, true);
+      if (error) {
+        *E = malformedError("for BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       DEBUG_WITH_TYPE(
           "mach-o-bind",
           llvm::dbgs() << "BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB: "
@@ -3048,7 +3349,22 @@ void MachOBindEntry::moveNext() {
                        << "\n");
       break;
     case MachO::BIND_OPCODE_ADD_ADDR_ULEB:
-      SegmentOffset += readULEB128();
+      SegmentOffset += readULEB128(&error);
+      if (error) {
+        *E = malformedError("for BIND_OPCODE_ADD_ADDR_ULEB " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, true);
+      if (error) {
+        *E = malformedError("for BIND_OPCODE_ADD_ADDR_ULEB " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       DEBUG_WITH_TYPE("mach-o-bind",
                       llvm::dbgs() << "BIND_OPCODE_ADD_ADDR_ULEB: "
                                    << format("SegmentOffset=0x%06X",
@@ -3057,16 +3373,83 @@ void MachOBindEntry::moveNext() {
     case MachO::BIND_OPCODE_DO_BIND:
       AdvanceAmount = PointerSize;
       RemainingLoopCount = 0;
+      error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, true);
+      if (error) {
+        *E = malformedError("for BIND_OPCODE_DO_BIND " + Twine(error) +
+             " for opcode at: 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      if (SymbolName == StringRef()) {
+        *E = malformedError("for BIND_OPCODE_DO_BIND missing preceding "
+             "BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      if (!LibraryOrdinalSet && TableKind != Kind::Weak) {
+        *E = malformedError("for BIND_OPCODE_DO_BIND missing preceding "
+             "BIND_OPCODE_SET_DYLIB_ORDINAL_* for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       DEBUG_WITH_TYPE("mach-o-bind",
                       llvm::dbgs() << "BIND_OPCODE_DO_BIND: "
                                    << format("SegmentOffset=0x%06X",
                                              SegmentOffset) << "\n");
       return;
      case MachO::BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB:
-      AdvanceAmount = readULEB128() + PointerSize;
+      if (TableKind == Kind::Lazy) {
+        *E = malformedError("BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB not allowed in "
+             "lazy bind table for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, true);
+      if (error) {
+        *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      if (SymbolName == StringRef()) {
+        *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB missing "
+             "preceding BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM for opcode "
+             "at: 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      if (!LibraryOrdinalSet && TableKind != Kind::Weak) {
+        *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB missing "
+             "preceding BIND_OPCODE_SET_DYLIB_ORDINAL_* for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      AdvanceAmount = readULEB128(&error) + PointerSize;
+      if (error) {
+        *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      // Note, this is not really an error until the next bind but make no sense
+      // for a BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB to not be followed by another
+      // bind operation.
+      error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset +
+                                            AdvanceAmount, false);
+      if (error) {
+        *E = malformedError("for BIND_OPCODE_ADD_ADDR_ULEB (after adding "
+             "ULEB) " + Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       RemainingLoopCount = 0;
-      if (TableKind == Kind::Lazy)
-        Malformed = true;
       DEBUG_WITH_TYPE(
           "mach-o-bind",
           llvm::dbgs() << "BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB: "
@@ -3076,10 +3459,47 @@ void MachOBindEntry::moveNext() {
                        << "\n");
       return;
     case MachO::BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED:
+      if (TableKind == Kind::Lazy) {
+        *E = malformedError("BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED not "
+             "allowed in lazy bind table for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, true);
+      if (error) {
+        *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      if (SymbolName == StringRef()) {
+        *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED "
+             "missing preceding BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM for "
+             "opcode at: 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      if (!LibraryOrdinalSet && TableKind != Kind::Weak) {
+        *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED "
+             "missing preceding BIND_OPCODE_SET_DYLIB_ORDINAL_* for opcode "
+             "at: 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       AdvanceAmount = ImmValue * PointerSize + PointerSize;
       RemainingLoopCount = 0;
-      if (TableKind == Kind::Lazy)
-        Malformed = true;
+      error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset +
+                                            AdvanceAmount, false);
+      if (error) {
+        *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED "
+             " (after adding immediate times the pointer size) " +
+             Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
       DEBUG_WITH_TYPE("mach-o-bind",
                       llvm::dbgs()
                       << "BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED: "
@@ -3087,10 +3507,65 @@ void MachOBindEntry::moveNext() {
                                              SegmentOffset) << "\n");
       return;
     case MachO::BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB:
-      RemainingLoopCount = readULEB128() - 1;
-      AdvanceAmount = readULEB128() + PointerSize;
-      if (TableKind == Kind::Lazy)
-        Malformed = true;
+      if (TableKind == Kind::Lazy) {
+        *E = malformedError("BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB not "
+             "allowed in lazy bind table for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      Count = readULEB128(&error);
+      if (Count != 0)
+        RemainingLoopCount = Count - 1;
+      else
+        RemainingLoopCount = 0;
+      if (error) {
+        *E = malformedError("for BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB "
+                            " (count value) " + Twine(error) + " for opcode at"
+                            ": 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      Skip = readULEB128(&error);
+      AdvanceAmount = Skip + PointerSize;
+      if (error) {
+        *E = malformedError("for BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB "
+                            " (skip value) " + Twine(error) + " for opcode at"
+                            ": 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      error = O->BindEntryCheckSegAndOffset(SegmentIndex, SegmentOffset, true);
+      if (error) {
+        *E = malformedError("for BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB "
+             + Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      if (SymbolName == StringRef()) {
+        *E = malformedError("for BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB "
+             "missing preceding BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM for "
+             "opcode at: 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      if (!LibraryOrdinalSet && TableKind != Kind::Weak) {
+        *E = malformedError("for BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB "
+             "missing preceding BIND_OPCODE_SET_DYLIB_ORDINAL_* for opcode "
+             "at: 0x" + utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+        return;
+      }
+      error = O->BindEntryCheckCountAndSkip(Count, Skip, PointerSize,
+                                            SegmentIndex, SegmentOffset);
+      if (error) {
+        *E = malformedError("for BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB "
+             + Twine(error) + " for opcode at: 0x" +
+             utohexstr(OpcodeStart - Opcodes.begin()));
+        moveToEnd();
+	return;
+      }
       DEBUG_WITH_TYPE(
           "mach-o-bind",
           llvm::dbgs() << "BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB: "
@@ -3100,34 +3575,34 @@ void MachOBindEntry::moveNext() {
                        << "\n");
       return;
     default:
-      Malformed = true;
+      *E = malformedError("bad bind info (bad opcode value 0x" +
+           utohexstr(Opcode) + " for opcode at: 0x" +
+           utohexstr(OpcodeStart - Opcodes.begin()));
+      moveToEnd();
+      return;
     }
   }
 }
 
-uint64_t MachOBindEntry::readULEB128() {
+uint64_t MachOBindEntry::readULEB128(const char **error) {
   unsigned Count;
-  uint64_t Result = decodeULEB128(Ptr, &Count);
+  uint64_t Result = decodeULEB128(Ptr, &Count, Opcodes.end(), error);
   Ptr += Count;
-  if (Ptr > Opcodes.end()) {
+  if (Ptr > Opcodes.end()) // clamp: never leave Ptr past the opcode buffer
     Ptr = Opcodes.end();
-    Malformed = true;
-  }
   return Result;
 }
 
-int64_t MachOBindEntry::readSLEB128() {
+int64_t MachOBindEntry::readSLEB128(const char **error) {
   unsigned Count;
-  int64_t Result = decodeSLEB128(Ptr, &Count);
+  int64_t Result = decodeSLEB128(Ptr, &Count, Opcodes.end(), error);
   Ptr += Count;
-  if (Ptr > Opcodes.end()) {
+  if (Ptr > Opcodes.end()) // clamp: never leave Ptr past the opcode buffer
     Ptr = Opcodes.end();
-    Malformed = true;
-  }
   return Result;
 }
 
-uint32_t MachOBindEntry::segmentIndex() const { return SegmentIndex; }
+int32_t MachOBindEntry::segmentIndex() const { return SegmentIndex; }
 
 uint64_t MachOBindEntry::segmentOffset() const { return SegmentOffset; }
 
@@ -3151,6 +3626,24 @@ uint32_t MachOBindEntry::flags() const { return Flags; }
 
 int MachOBindEntry::ordinal() const { return Ordinal; }
 
+// Segment name for the SegIndex of an already-checked Mach-O Bind entry.
+StringRef MachOBindEntry::segmentName() const {
+  StringRef Name = O->BindRebaseSegmentName(SegmentIndex);
+  return Name;
+}
+
+// Section name for a checked Mach-O Bind entry's SegIndex,SegOffset pair.
+StringRef MachOBindEntry::sectionName() const {
+  StringRef Name = O->BindRebaseSectionName(SegmentIndex, SegmentOffset);
+  return Name;
+}
+
+// Address for a checked Mach-O Bind entry's SegIndex,SegOffset pair.
+uint64_t MachOBindEntry::address() const {
+  uint64_t Addr = O->BindRebaseAddress(SegmentIndex, SegmentOffset);
+  return Addr;
+}
+
 bool MachOBindEntry::operator==(const MachOBindEntry &Other) const {
 #ifdef EXPENSIVE_CHECKS
   assert(Opcodes == Other.Opcodes && "compare iterators of different files");
@@ -3162,30 +3655,149 @@ bool MachOBindEntry::operator==(const MachOBindEntry &Other) const {
          (Done == Other.Done);
 }
 
+// Build table of sections so SegIndex/SegOffset pairs can be translated.
+BindRebaseSegInfo::BindRebaseSegInfo(const object::MachOObjectFile *Obj) {
+  uint32_t CurSegIndex = Obj->hasPageZeroSegment() ? 1 : 0;
+  StringRef CurSegName;
+  uint64_t CurSegAddress = 0; // defined even if the first name is empty
+  for (const SectionRef &Section : Obj->sections()) {
+    SectionInfo Info;
+    Section.getName(Info.SectionName);
+    Info.Address = Section.getAddress();
+    Info.Size = Section.getSize();
+    Info.SegmentName =
+        Obj->getSectionFinalSegmentName(Section.getRawDataRefImpl());
+    if (!Info.SegmentName.equals(CurSegName)) { // first section of a segment
+      ++CurSegIndex;
+      CurSegName = Info.SegmentName;
+      CurSegAddress = Info.Address;
+    }
+    Info.SegmentIndex = CurSegIndex - 1;
+    Info.OffsetInSegment = Info.Address - CurSegAddress;
+    Info.SegmentStartAddress = CurSegAddress;
+    Sections.push_back(Info);
+  }
+  MaxSegIndex = CurSegIndex;
+}
+
+// Validate a SegIndex,SegOffset pair on behalf of the moveNext() routines of
+// MachOBindEntry and MachORebaseEntry.  Returns nullptr when some section of
+// the segment covers the offset, otherwise a string describing the problem.
+const char * BindRebaseSegInfo::checkSegAndOffset(int32_t SegIndex,
+                                                  uint64_t SegOffset,
+                                                  bool endInvalid) {
+  if (SegIndex == -1)
+    return "missing preceding *_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB";
+  if (SegIndex >= MaxSegIndex)
+    return "bad segIndex (too large)";
+  for (const SectionInfo &Sec : Sections) {
+    const auto SecEnd = Sec.OffsetInSegment + Sec.Size;
+    const bool InSegment = Sec.SegmentIndex == SegIndex;
+    const bool AtOrAfterStart = SegOffset >= Sec.OffsetInSegment;
+    // The one-past-the-end offset is accepted only when endInvalid is false.
+    const bool BeforeEnd = endInvalid ? SegOffset < SecEnd
+                                      : SegOffset <= SecEnd;
+    if (InSegment && AtOrAfterStart && BeforeEnd)
+      return nullptr;
+  }
+  return "bad segOffset, too large";
+}
+
+// For use in MachOBindEntry::moveNext() to validate a MachOBindEntry for
+// the BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB opcode and for use in
+// MachORebaseEntry::moveNext() to validate a MachORebaseEntry for
+// REBASE_OPCODE_DO_*_TIMES* opcodes.  The SegIndex and SegOffset must have
+// been already checked.  Returns nullptr on success, else an error string.
+const char * BindRebaseSegInfo::checkCountAndSkip(uint32_t Count, uint32_t Skip,
+                                                  uint8_t PointerSize,
+                                                  int32_t SegIndex,
+                                                  uint64_t SegOffset) {
+  const SectionInfo &SI = findSection(SegIndex, SegOffset);
+  uint64_t addr = SI.SegmentStartAddress + SegOffset;
+  if (addr >= SI.Address + SI.Size)
+    return "bad segOffset, too large";
+  uint64_t i = 0; // total distance advanced; computed in 64 bits throughout
+  if (Count > 1)
+    i = (uint64_t(Skip) + PointerSize) * (Count - 1); // no 32-bit wraparound
+  else if (Count == 1)
+    i = uint64_t(Skip) + PointerSize;
+  if (addr + i >= SI.Address + SI.Size) {
+    // For rebase opcodes they can step from one section to another.
+    uint64_t TrailingSegOffset = (addr + i) - SI.SegmentStartAddress;
+    const char *error = checkSegAndOffset(SegIndex, TrailingSegOffset, false);
+    if (error)
+      return "bad count and skip, too large";
+  }
+  return nullptr;
+}
+
+// Map the SegIndex of a checked Mach-O Bind or Rebase entry to its segment
+// name, taken from the first section recorded for that segment.  An index
+// that matches no section is treated as unreachable.
+StringRef BindRebaseSegInfo::segmentName(int32_t SegIndex) {
+  for (auto It = Sections.begin(), E = Sections.end(); It != E; ++It)
+    if (It->SegmentIndex == SegIndex)
+      return It->SegmentName;
+  llvm_unreachable("invalid SegIndex");
+}
+
+// Return the SectionInfo whose range holds a SegIndex,SegOffset pair taken
+// from a checked Mach-O Bind or Rebase entry.
+const BindRebaseSegInfo::SectionInfo &BindRebaseSegInfo::findSection(
+                                     int32_t SegIndex, uint64_t SegOffset) {
+  for (const SectionInfo &Sec : Sections) {
+    // A section matches when it is in the requested segment and SegOffset
+    // lies in [OffsetInSegment, OffsetInSegment + Size).
+    const bool SameSegment = Sec.SegmentIndex == SegIndex;
+    const bool StartsAtOrBefore = Sec.OffsetInSegment <= SegOffset;
+    const bool EndsAfter = SegOffset < (Sec.OffsetInSegment + Sec.Size);
+    if (SameSegment && StartsAtOrBefore && EndsAfter)
+      return Sec;
+  }
+  llvm_unreachable("SegIndex and SegOffset not in any section");
+}
+
+// Name of the section holding a checked Bind/Rebase SegIndex,SegOffset pair.
+StringRef BindRebaseSegInfo::sectionName(int32_t SegIndex,
+                                         uint64_t SegOffset) {
+  const SectionInfo &Sec = findSection(SegIndex, SegOffset);
+  return Sec.SectionName;
+}
+
+// Translate a checked Bind/Rebase SegIndex,SegOffset pair to an address:
+// the start address of the containing section's segment plus the offset.
+uint64_t BindRebaseSegInfo::address(uint32_t SegIndex, uint64_t OffsetInSeg) {
+  const auto SegStart = findSection(SegIndex, OffsetInSeg).SegmentStartAddress;
+  return SegStart + OffsetInSeg;
+}
+
 iterator_range<bind_iterator>
-MachOObjectFile::bindTable(ArrayRef<uint8_t> Opcodes, bool is64,
+MachOObjectFile::bindTable(Error &Err, MachOObjectFile *O,
+                           ArrayRef<uint8_t> Opcodes, bool is64,
                            MachOBindEntry::Kind BKind) {
-  MachOBindEntry Start(Opcodes, is64, BKind);
+  if (O->BindRebaseSectionTable == nullptr) // built lazily, on first use
+    O->BindRebaseSectionTable = llvm::make_unique<BindRebaseSegInfo>(O);
+  MachOBindEntry Start(&Err, O, Opcodes, is64, BKind);
   Start.moveToFirst();
 
-  MachOBindEntry Finish(Opcodes, is64, BKind);
+  MachOBindEntry Finish(&Err, O, Opcodes, is64, BKind); // end-of-range entry
   Finish.moveToEnd();
 
   return make_range(bind_iterator(Start), bind_iterator(Finish));
 }
 
-iterator_range<bind_iterator> MachOObjectFile::bindTable() const {
-  return bindTable(getDyldInfoBindOpcodes(), is64Bit(),
+iterator_range<bind_iterator> MachOObjectFile::bindTable(Error &Err) {
+  return bindTable(Err, this, getDyldInfoBindOpcodes(), is64Bit(),
                    MachOBindEntry::Kind::Regular);
 }
 
-iterator_range<bind_iterator> MachOObjectFile::lazyBindTable() const {
-  return bindTable(getDyldInfoLazyBindOpcodes(), is64Bit(),
+iterator_range<bind_iterator> MachOObjectFile::lazyBindTable(Error &Err) {
+  return bindTable(Err, this, getDyldInfoLazyBindOpcodes(), is64Bit(),
                    MachOBindEntry::Kind::Lazy);
 }
 
-iterator_range<bind_iterator> MachOObjectFile::weakBindTable() const {
-  return bindTable(getDyldInfoWeakBindOpcodes(), is64Bit(),
+iterator_range<bind_iterator> MachOObjectFile::weakBindTable(Error &Err) {
+  return bindTable(Err, this, getDyldInfoWeakBindOpcodes(), is64Bit(),
                    MachOBindEntry::Kind::Weak);
 }
 
diff --git a/lib/Object/ModuleSummaryIndexObjectFile.cpp b/lib/Object/ModuleSummaryIndexObjectFile.cpp
index 11ace84b9cebb6809a1e99db43ca7140fdc00f0d..de1ddab88fd4017859adb90b8b8c6e00baa5e09c 100644
--- a/lib/Object/ModuleSummaryIndexObjectFile.cpp
+++ b/lib/Object/ModuleSummaryIndexObjectFile.cpp
@@ -96,13 +96,18 @@ ModuleSummaryIndexObjectFile::create(MemoryBufferRef Object) {
 // Parse the module summary index out of an IR file and return the summary
 // index object if found, or nullptr if not.
 Expected<std::unique_ptr<ModuleSummaryIndex>>
-llvm::getModuleSummaryIndexForFile(StringRef Path) {
+llvm::getModuleSummaryIndexForFile(StringRef Path, StringRef Identifier) {
   ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
       MemoryBuffer::getFileOrSTDIN(Path);
   std::error_code EC = FileOrErr.getError();
   if (EC)
     return errorCodeToError(EC);
-  MemoryBufferRef BufferRef = (FileOrErr.get())->getMemBufferRef();
+  std::unique_ptr<MemoryBuffer> MemBuffer = std::move(FileOrErr.get());
+  // If Identifier is non-empty, use it as the buffer identifier, which
+  // will become the module path in the index.
+  if (Identifier.empty())
+    Identifier = MemBuffer->getBufferIdentifier();
+  MemoryBufferRef BufferRef(MemBuffer->getBuffer(), Identifier);
   if (IgnoreEmptyThinLTOIndexFile && !BufferRef.getBufferSize())
     return nullptr;
   Expected<std::unique_ptr<object::ModuleSummaryIndexObjectFile>> ObjOrErr =
diff --git a/lib/Object/ModuleSymbolTable.cpp b/lib/Object/ModuleSymbolTable.cpp
index 90488007ff59a116616baa6e54db9b66e00f56dc..9a935d8e08699dff37691ce50dcb82051b6091e6 100644
--- a/lib/Object/ModuleSymbolTable.cpp
+++ b/lib/Object/ModuleSymbolTable.cpp
@@ -43,27 +43,98 @@ void ModuleSymbolTable::addModule(Module *M) {
   else
     FirstMod = M;
 
-  for (Function &F : *M)
-    SymTab.push_back(&F);
-  for (GlobalVariable &GV : M->globals())
+  for (GlobalValue &GV : M->global_values())
     SymTab.push_back(&GV);
-  for (GlobalAlias &GA : M->aliases())
-    SymTab.push_back(&GA);
-
-  CollectAsmSymbols(Triple(M->getTargetTriple()), M->getModuleInlineAsm(),
-                    [this](StringRef Name, BasicSymbolRef::Flags Flags) {
-                      SymTab.push_back(new (AsmSymbols.Allocate())
-                                           AsmSymbol(Name, Flags));
-                    });
+
+  CollectAsmSymbols(*M, [this](StringRef Name, BasicSymbolRef::Flags Flags) {
+    SymTab.push_back(new (AsmSymbols.Allocate()) AsmSymbol(Name, Flags));
+  });
+}
+
+// Make every ELF .symver alias carry the same binding as the defined symbol
+// it aliases.  A binding recorded in the assembly wins; the linkage of a
+// matching IR global is the fallback.
+static void handleSymverAliases(const Module &M, RecordStreamer &Streamer) {
+  if (Streamer.symverAliases().empty())
+    return;
+
+  // Assembler-level names are mangled while IR names may not be, so first
+  // build a lookup table from mangled name to the GlobalValue that carries
+  // it.
+  Mangler Mang;
+  SmallString<64> NameBuf;
+  StringMap<const GlobalValue *> GVByMangledName;
+  auto RecordMangledName = [&](const GlobalValue &GV) {
+    if (!GV.hasName())
+      return;
+
+    NameBuf.clear();
+    NameBuf.reserve(GV.getName().size() + 1);
+    Mang.getNameWithPrefix(NameBuf, &GV, /*CannotUsePrivateLabel=*/false);
+    GVByMangledName[NameBuf] = &GV;
+  };
+  for (const Function &Fn : M)
+    RecordMangledName(Fn);
+  for (const GlobalVariable &Var : M.globals())
+    RecordMangledName(Var);
+  for (const GlobalAlias &Alias : M.aliases())
+    RecordMangledName(Alias);
+
+  // Visit every recorded .symver entry and work out the binding its aliases
+  // should receive.
+  for (auto &Entry : Streamer.symverAliases()) {
+    const MCSymbol *Aliasee = Entry.first;
+    MCSymbolAttr Binding = MCSA_Invalid;
+
+    // Prefer a binding that the assembly itself recorded for the aliasee:
+    // the global states map to MCSA_Global, the weak states to MCSA_Weak,
+    // and any other state leaves the binding undecided.
+    const RecordStreamer::State AsmState = Streamer.getSymbolState(Aliasee);
+    if (AsmState == RecordStreamer::Global ||
+        AsmState == RecordStreamer::DefinedGlobal)
+      Binding = MCSA_Global;
+    else if (AsmState == RecordStreamer::UndefinedWeak ||
+             AsmState == RecordStreamer::DefinedWeak)
+      Binding = MCSA_Weak;
+
+    // Otherwise fall back to the linkage of an IR definition of the aliasee,
+    // looked up by plain name first and by mangled name second.  An aliasee
+    // with no IR definition is skipped.
+    if (Binding == MCSA_Invalid) {
+      const GlobalValue *Def = M.getNamedValue(Aliasee->getName());
+      if (!Def) {
+        auto Found = GVByMangledName.find(Aliasee->getName());
+        if (Found == GVByMangledName.end())
+          continue;
+        Def = Found->second;
+      }
+      if (Def->hasExternalLinkage())
+        Binding = MCSA_Global;
+      else if (Def->hasLocalLinkage())
+        Binding = MCSA_Local;
+      else if (Def->isWeakForLinker())
+        Binding = MCSA_Weak;
+    }
+    // Nothing to propagate when no binding could be determined.
+    if (Binding == MCSA_Invalid)
+      continue;
+
+    // Apply the detected binding to every alias recorded for this
+    // aliasee.
+    for (auto &AliasSym : Entry.second)
+      Streamer.EmitSymbolAttribute(AliasSym, Binding);
+  }
+}
 
 void ModuleSymbolTable::CollectAsmSymbols(
-    const Triple &TT, StringRef InlineAsm,
+    const Module &M,
     function_ref<void(StringRef, BasicSymbolRef::Flags)> AsmSymbol) {
+  StringRef InlineAsm = M.getModuleInlineAsm();
   if (InlineAsm.empty())
     return;
 
   std::string Err;
+  const Triple TT(M.getTargetTriple());
   const Target *T = TargetRegistry::lookupTarget(TT.str(), Err);
   assert(T && T->hasMCAsmParser());
 
@@ -106,6 +177,8 @@ void ModuleSymbolTable::CollectAsmSymbols(
   if (Parser->Run(false))
     return;
 
+  handleSymverAliases(M, Streamer);
+
   for (auto &KV : Streamer) {
     StringRef Key = KV.first();
     RecordStreamer::State Value = KV.second;
diff --git a/lib/Object/RecordStreamer.cpp b/lib/Object/RecordStreamer.cpp
index 572b960bc85f0a2e755cc92a9fa4bfc52f5bf723..a5018443b87dc6c59e52dc07a824f9b2fef26864 100644
--- a/lib/Object/RecordStreamer.cpp
+++ b/lib/Object/RecordStreamer.cpp
@@ -1,4 +1,4 @@
-//===-- RecordStreamer.cpp - Record asm definde and used symbols ----------===//
+//===-- RecordStreamer.cpp - Record asm defined and used symbols ----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -82,7 +82,7 @@ void RecordStreamer::EmitInstruction(const MCInst &Inst,
   MCStreamer::EmitInstruction(Inst, STI);
 }
 
-void RecordStreamer::EmitLabel(MCSymbol *Symbol) {
+void RecordStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
-  MCStreamer::EmitLabel(Symbol);
+  MCStreamer::EmitLabel(Symbol, Loc); // forward Loc to the base streamer
   markDefined(*Symbol);
 }
@@ -110,3 +110,8 @@ void RecordStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                                       unsigned ByteAlignment) {
   markDefined(*Symbol);
 }
+
+void RecordStreamer::emitELFSymverDirective(MCSymbol *Alias,
+                                            const MCSymbol *Aliasee) {
+  SymverAliasMap[Aliasee].push_back(Alias); // append to the aliasee's list
+}
diff --git a/lib/Object/RecordStreamer.h b/lib/Object/RecordStreamer.h
index 617d8a43fbd26ccd12fb9e48222e866c0e652dbf..c3bd5b09a9bf55993a55bcea000f0846cd8694c1 100644
--- a/lib/Object/RecordStreamer.h
+++ b/lib/Object/RecordStreamer.h
@@ -20,6 +20,10 @@ public:
 
 private:
   StringMap<State> Symbols;
+  // Map of aliases created by .symver directives, saved so we can update
+  // their symbol binding after parsing completes. This maps from each
+  // aliasee to its list of aliases.
+  DenseMap<const MCSymbol *, std::vector<MCSymbol *>> SymverAliasMap;
   void markDefined(const MCSymbol &Symbol);
   void markGlobal(const MCSymbol &Symbol, MCSymbolAttr Attribute);
   void markUsed(const MCSymbol &Symbol);
@@ -31,13 +35,27 @@ public:
   const_iterator end();
   RecordStreamer(MCContext &Context);
   void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
-  void EmitLabel(MCSymbol *Symbol) override;
+  void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
   void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
   bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override;
   void EmitZerofill(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
                     unsigned ByteAlignment) override;
   void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
                         unsigned ByteAlignment) override;
+  /// Record .symver aliases for later processing.
+  void emitELFSymverDirective(MCSymbol *Alias,
+                              const MCSymbol *Aliasee) override;
+  /// Return a reference to the map from each .symver aliasee to its aliases.
+  DenseMap<const MCSymbol *, std::vector<MCSymbol *>> &symverAliases() {
+    return SymverAliasMap;
+  }
+  /// Look up the state recorded for Sym, or NeverSeen if it has no entry.
+  State getSymbolState(const MCSymbol *Sym) {
+    auto Found = Symbols.find(Sym->getName());
+    // A symbol that is absent from the table has never been seen.
+    const bool Known = Found != Symbols.end();
+    return Known ? Found->second : NeverSeen;
+  }
 };
 }
 #endif
diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp
index e61ae156a749b5386da9d062c0205358507f5e96..fc1dca35424e375d0ddb54258540311f9b7dfc86 100644
--- a/lib/Object/WasmObjectFile.cpp
+++ b/lib/Object/WasmObjectFile.cpp
@@ -1,4 +1,4 @@
-//===- WasmObjectFile.cpp - Wasm object file implementation -----*- C++ -*-===//
+//===- WasmObjectFile.cpp - Wasm object file implementation ---------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,12 +7,26 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/Error.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Object/SymbolicFile.h"
 #include "llvm/Object/Wasm.h"
 #include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/LEB128.h"
+#include "llvm/Support/Wasm.h"
+#include <algorithm>
+#include <cstdint>
+#include <system_error>
 
-namespace llvm {
-namespace object {
+using namespace llvm;
+using namespace object;
 
 Expected<std::unique_ptr<WasmObjectFile>>
 ObjectFile::createWasmObjectFile(MemoryBufferRef Buffer) {
@@ -24,34 +38,139 @@ ObjectFile::createWasmObjectFile(MemoryBufferRef Buffer) {
   return std::move(ObjectFile);
 }
 
-namespace {
+#define VARINT7_MAX ((1<<7)-1)
+#define VARINT7_MIN (-(1<<7))
+#define VARUINT7_MAX ((1<<7)-1) // largest value representable in 7 bits
+#define VARUINT1_MAX (1)
 
-uint32_t readUint32(const uint8_t *&Ptr) {
+static uint8_t readUint8(const uint8_t *&Ptr) { return *Ptr++; } // unchecked
+
+static uint32_t readUint32(const uint8_t *&Ptr) { // 4 bytes, little-endian
   uint32_t Result = support::endian::read32le(Ptr);
   Ptr += sizeof(Result);
   return Result;
 }
 
-uint64_t readULEB128(const uint8_t *&Ptr) {
+static int32_t readFloat32(const uint8_t *&Ptr) {
+  // Raw bit pattern of the float, decoded as little-endian like readUint32.
+  int32_t Result = static_cast<int32_t>(support::endian::read32le(Ptr));
+  Ptr += sizeof(Result);
+  return Result;
+}
+
+static int64_t readFloat64(const uint8_t *&Ptr) {
+  // Raw bit pattern of the double, decoded as little-endian like readUint32.
+  int64_t Result = static_cast<int64_t>(support::endian::read64le(Ptr));
+  Ptr += sizeof(Result);
+  return Result;
+}
+
+static uint64_t readULEB128(const uint8_t *&Ptr) { // no end-of-buffer bound
   unsigned Count;
   uint64_t Result = decodeULEB128(Ptr, &Count);
   Ptr += Count;
   return Result;
 }
 
-StringRef readString(const uint8_t *&Ptr) {
+static StringRef readString(const uint8_t *&Ptr) { // length-prefixed string
   uint32_t StringLen = readULEB128(Ptr);
   StringRef Return = StringRef(reinterpret_cast<const char *>(Ptr), StringLen);
   Ptr += StringLen;
   return Return;
 }
 
-Error readSection(wasm::WasmSection &Section, const uint8_t *&Ptr,
-                  const uint8_t *Start) {
+static int64_t readLEB128(const uint8_t *&Ptr) {
+  unsigned Count;
+  int64_t Result = decodeSLEB128(Ptr, &Count); // keep the decoded value signed
+  Ptr += Count;
+  return Result;
+}
+
+static uint8_t readVaruint1(const uint8_t *&Ptr) {
+  int64_t result = readLEB128(Ptr); // decoded with the signed LEB128 reader
+  assert(result <= VARUINT1_MAX && result >= 0); // no check under NDEBUG
+  return result;
+}
+
+static int8_t readVarint7(const uint8_t *&Ptr) {
+  int64_t result = readLEB128(Ptr); // signed LEB128
+  assert(result <= VARINT7_MAX && result >= VARINT7_MIN);
+  return result; // narrowed to int8_t; the range is only asserted
+}
+
+static uint8_t readVaruint7(const uint8_t *&Ptr) {
+  uint64_t result = readULEB128(Ptr); // unsigned LEB128
+  assert(result <= VARUINT7_MAX); // no check under NDEBUG
+  return result; // narrowed to uint8_t
+}
+
+static int32_t readVarint32(const uint8_t *&Ptr) {
+  int64_t result = readLEB128(Ptr); // signed LEB128
+  assert(result <= INT32_MAX && result >= INT32_MIN);
+  return result; // narrowed to int32_t; the range is only asserted
+}
+
+static uint32_t readVaruint32(const uint8_t *&Ptr) {
+  uint64_t result = readULEB128(Ptr); // unsigned LEB128
+  assert(result <= UINT32_MAX); // no check under NDEBUG
+  return result; // narrowed to uint32_t
+}
+
+static int64_t readVarint64(const uint8_t *&Ptr) {
+  return readLEB128(Ptr); // already int64_t, so there is no range assert
+}
+
+static uint8_t readOpcode(const uint8_t *&Ptr) {
+  return readUint8(Ptr); // an opcode is read as one raw byte
+}
+
+static Error readInitExpr(wasm::WasmInitExpr &Expr, const uint8_t *&Ptr) {
+  Expr.Opcode = readOpcode(Ptr); // one opcode, then its immediate, then END
+
+  switch (Expr.Opcode) {
+  case wasm::WASM_OPCODE_I32_CONST:
+    Expr.Value.Int32 = readVarint32(Ptr);
+    break;
+  case wasm::WASM_OPCODE_I64_CONST:
+    Expr.Value.Int64 = readVarint64(Ptr);
+    break;
+  case wasm::WASM_OPCODE_F32_CONST:
+    Expr.Value.Float32 = readFloat32(Ptr);
+    break;
+  case wasm::WASM_OPCODE_F64_CONST:
+    Expr.Value.Float64 = readFloat64(Ptr);
+    break;
+  case wasm::WASM_OPCODE_GET_GLOBAL:
+    Expr.Value.Global = readVaruint32(Ptr); // index is LEB128, not 4 bytes
+    break;
+  default:
+    return make_error<GenericBinaryError>("Invalid opcode in init_expr",
+                                          object_error::parse_failed);
+  }
+
+  uint8_t EndOpcode = readOpcode(Ptr); // the expression must be terminated
+  if (EndOpcode != wasm::WASM_OPCODE_END) {
+    return make_error<GenericBinaryError>("Invalid init_expr",
+                                          object_error::parse_failed);
+  }
+  return Error::success();
+}
+
+static wasm::WasmLimits readLimits(const uint8_t *&Ptr) {
+  wasm::WasmLimits Result;
+  Result.Flags = readVaruint1(Ptr);
+  Result.Initial = readVaruint32(Ptr);
+  Result.Maximum = // only encoded when the has-max flag is set; else zero
+      (Result.Flags & wasm::WASM_LIMITS_FLAG_HAS_MAX) ? readVaruint32(Ptr) : 0;
+  return Result;
+}
+
+static Error readSection(WasmSection &Section, const uint8_t *&Ptr,
+                         const uint8_t *Start) {
   // TODO(sbc): Avoid reading past EOF in the case of malformed files.
   Section.Offset = Ptr - Start;
-  Section.Type = readULEB128(Ptr);
-  uint32_t Size = readULEB128(Ptr);
+  Section.Type = readVaruint7(Ptr);
+  uint32_t Size = readVaruint32(Ptr);
   if (Size == 0)
     return make_error<StringError>("Zero length section",
                                    object_error::parse_failed);
@@ -59,10 +178,9 @@ Error readSection(wasm::WasmSection &Section, const uint8_t *&Ptr,
   Ptr += Size;
   return Error::success();
 }
-}
 
 WasmObjectFile::WasmObjectFile(MemoryBufferRef Buffer, Error &Err)
-    : ObjectFile(Binary::ID_Wasm, Buffer) {
+    : ObjectFile(Binary::ID_Wasm, Buffer), StartFunction(-1) {
   ErrorAsOutParameter ErrAsOutParam(&Err);
   Header.Magic = getData().substr(0, 4);
   if (Header.Magic != StringRef("\0asm", 4)) {
@@ -79,22 +197,388 @@ WasmObjectFile::WasmObjectFile(MemoryBufferRef Buffer, Error &Err)
   }
 
   const uint8_t *Eof = getPtr(getData().size());
-  wasm::WasmSection Sec;
+  WasmSection Sec;
   while (Ptr < Eof) {
     if ((Err = readSection(Sec, Ptr, getPtr(0))))
       return;
-    if (Sec.Type == wasm::WASM_SEC_CUSTOM) {
-      if ((Err =
-               parseCustomSection(Sec, Sec.Content.data(), Sec.Content.size())))
-        return;
-    }
+    if ((Err = parseSection(Sec)))
+      return;
+
     Sections.push_back(Sec);
   }
 }
 
-Error WasmObjectFile::parseCustomSection(wasm::WasmSection &Sec,
-                                         const uint8_t *Ptr, size_t Length) {
+Error WasmObjectFile::parseSection(WasmSection &Sec) {
+  const uint8_t* Start = Sec.Content.data();
+  const uint8_t* End = Start + Sec.Content.size(); // one past the payload
+  switch (Sec.Type) { // hand the payload to the parser for this section id
+  case wasm::WASM_SEC_CUSTOM: // the only parser that also receives Sec
+    return parseCustomSection(Sec, Start, End);
+  case wasm::WASM_SEC_TYPE:
+    return parseTypeSection(Start, End);
+  case wasm::WASM_SEC_IMPORT:
+    return parseImportSection(Start, End);
+  case wasm::WASM_SEC_FUNCTION:
+    return parseFunctionSection(Start, End);
+  case wasm::WASM_SEC_TABLE:
+    return parseTableSection(Start, End);
+  case wasm::WASM_SEC_MEMORY:
+    return parseMemorySection(Start, End);
+  case wasm::WASM_SEC_GLOBAL:
+    return parseGlobalSection(Start, End);
+  case wasm::WASM_SEC_EXPORT:
+    return parseExportSection(Start, End);
+  case wasm::WASM_SEC_START:
+    return parseStartSection(Start, End);
+  case wasm::WASM_SEC_ELEM:
+    return parseElemSection(Start, End);
+  case wasm::WASM_SEC_CODE:
+    return parseCodeSection(Start, End);
+  case wasm::WASM_SEC_DATA:
+    return parseDataSection(Start, End);
+  default: // any other section id is a parse failure
+    return make_error<GenericBinaryError>("Bad section type",
+                                          object_error::parse_failed);
+  }
+}
+
+Error WasmObjectFile::parseNameSection(const uint8_t *Ptr, const uint8_t *End) {
+  while (Ptr < End) { // one subsection (type, size, body) per iteration
+    uint8_t Type = readVarint7(Ptr);
+    uint32_t Size = readVaruint32(Ptr); // used only to skip unparsed bodies
+    switch (Type) {
+    case wasm::WASM_NAMES_FUNCTION: {
+      uint32_t Count = readVaruint32(Ptr);
+      while (Count--) {
+        /*uint32_t Index =*/readVaruint32(Ptr);
+        StringRef Name = readString(Ptr);
+        if (Name.size()) // empty names produce no symbol
+          Symbols.emplace_back(Name,
+                               WasmSymbol::SymbolType::DEBUG_FUNCTION_NAME);
+      }
+      break;
+    }
+    // Ignore local names for now
+    case wasm::WASM_NAMES_LOCAL:
+    default:
+      Ptr += Size;
+      break;
+    }
+  }
+  // The subsections must end exactly at End; overshooting is an error.
+  if (Ptr != End)
+    return make_error<GenericBinaryError>("Name section ended prematurely",
+                                          object_error::parse_failed);
+  return Error::success();
+}
+
+WasmSection* WasmObjectFile::findCustomSectionByName(StringRef Name) {
+  for (WasmSection& Candidate : Sections)
+    if (Candidate.Type == wasm::WASM_SEC_CUSTOM && Candidate.Name == Name)
+      return &Candidate; // first match wins
+  // No custom section held in Sections carries this name.
+  return nullptr;
+}
+
+WasmSection* WasmObjectFile::findSectionByType(uint32_t Type) {
+  // Callers must not pass the custom-section id; that is asserted here.
+  assert(Type != wasm::WASM_SEC_CUSTOM);
+  for (WasmSection& Candidate : Sections)
+    if (Candidate.Type == Type)
+      return &Candidate; // first section of this type wins
+  return nullptr;
+}
+
+Error WasmObjectFile::parseRelocSection(StringRef Name, const uint8_t *Ptr,
+                                        const uint8_t *End) {
+  uint8_t SectionCode = readVarint7(Ptr); // id of the section being relocated
+  WasmSection* Section = nullptr;
+  if (SectionCode == wasm::WASM_SEC_CUSTOM) {
+    StringRef Name = readString(Ptr); // NOTE(review): shadows parameter Name
+    Section = findCustomSectionByName(Name);
+  } else {
+    Section = findSectionByType(SectionCode);
+  }
+  if (!Section) // the target must already be present in Sections
+    return make_error<GenericBinaryError>("Invalid section code",
+                                          object_error::parse_failed);
+  uint32_t RelocCount = readVaruint32(Ptr);
+  while (RelocCount--) {
+    wasm::WasmRelocation Reloc;
+    memset(&Reloc, 0, sizeof(Reloc)); // Addend stays 0 unless read below
+    Reloc.Type = readVaruint32(Ptr);
+    Reloc.Offset = readVaruint32(Ptr);
+    Reloc.Index = readVaruint32(Ptr);
+    switch (Reloc.Type) {
+    case wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB:
+    case wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB:
+    case wasm::R_WEBASSEMBLY_TABLE_INDEX_I32:
+      break; // no addend is read for these types
+    case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_LEB:
+    case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB:
+    case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32:
+      Reloc.Addend = readVaruint32(Ptr);
+      break;
+    default:
+      return make_error<GenericBinaryError>("Bad relocation type",
+                                            object_error::parse_failed);
+    }
+    Section->Relocations.push_back(Reloc); // stored on the target section
+  }
+  if (Ptr != End)
+    return make_error<GenericBinaryError>("Reloc section ended prematurely",
+                                          object_error::parse_failed);
+  return Error::success();
+}
+
+Error WasmObjectFile::parseCustomSection(WasmSection &Sec,
+                                         const uint8_t *Ptr, const uint8_t *End) {
   Sec.Name = readString(Ptr);
+  if (Sec.Name == "name") { // function/local names
+    if (Error Err = parseNameSection(Ptr, End))
+      return Err;
+  } else if (Sec.Name.startswith("reloc.")) { // relocation entries
+    if (Error Err = parseRelocSection(Sec.Name, Ptr, End))
+      return Err;
+  } // any other custom section: only its name is recorded here
+  return Error::success();
+}
+
+Error WasmObjectFile::parseTypeSection(const uint8_t *Ptr, const uint8_t *End) {
+  uint32_t Count = readVaruint32(Ptr);
+  Signatures.reserve(Count);
+  while (Count--) {
+    wasm::WasmSignature Sig;
+    Sig.ReturnType = wasm::WASM_TYPE_NORESULT; // kept when ReturnCount is 0
+    int8_t Form = readVarint7(Ptr);
+    if (Form != wasm::WASM_TYPE_FUNC) { // only function types are accepted
+      return make_error<GenericBinaryError>("Invalid signature type",
+                                            object_error::parse_failed);
+    }
+    uint32_t ParamCount = readVaruint32(Ptr);
+    Sig.ParamTypes.reserve(ParamCount);
+    while (ParamCount--) {
+      uint32_t ParamType = readVarint7(Ptr);
+      Sig.ParamTypes.push_back(ParamType);
+    }
+    uint32_t ReturnCount = readVaruint32(Ptr);
+    if (ReturnCount) {
+      if (ReturnCount != 1) {
+        return make_error<GenericBinaryError>(
+            "Multiple return types not supported", object_error::parse_failed);
+      }
+      Sig.ReturnType = readVarint7(Ptr);
+    }
+    Signatures.push_back(Sig);
+  }
+  if (Ptr != End) // the entries must end exactly at End
+    return make_error<GenericBinaryError>("Type section ended prematurely",
+                                          object_error::parse_failed);
+  return Error::success();
+}
+
+Error WasmObjectFile::parseImportSection(const uint8_t *Ptr, const uint8_t *End) {
+  uint32_t Count = readVaruint32(Ptr);
+  Imports.reserve(Count);
+  while (Count--) {
+    wasm::WasmImport Im;
+    Im.Module = readString(Ptr);
+    Im.Field = readString(Ptr);
+    Im.Kind = readUint8(Ptr);
+    switch (Im.Kind) {
+    case wasm::WASM_EXTERNAL_FUNCTION: // signature index; adds a symbol
+      Im.SigIndex = readVaruint32(Ptr);
+      Symbols.emplace_back(Im.Field, WasmSymbol::SymbolType::FUNCTION_IMPORT);
+      break;
+    case wasm::WASM_EXTERNAL_GLOBAL: // type + mutability; adds a symbol
+      Im.GlobalType = readVarint7(Ptr);
+      Im.GlobalMutable = readVaruint1(Ptr);
+      Symbols.emplace_back(Im.Field, WasmSymbol::SymbolType::GLOBAL_IMPORT);
+      break;
+    default:
+      // TODO(sbc): Handle other kinds of imports
+      return make_error<GenericBinaryError>(
+          "Unexpected import kind", object_error::parse_failed);
+    }
+    Imports.push_back(Im); // reached only for the two kinds handled above
+  }
+  if (Ptr != End)
+    return make_error<GenericBinaryError>("Import section ended prematurely",
+                                          object_error::parse_failed);
+  return Error::success();
+}
+
+Error WasmObjectFile::parseFunctionSection(const uint8_t *Ptr, const uint8_t *End) {
+  // Payload: an entry count followed by that many varuint32 values.
+  const uint32_t NumEntries = readVaruint32(Ptr);
+  FunctionTypes.reserve(NumEntries);
+  for (uint32_t I = 0; I != NumEntries; ++I)
+    FunctionTypes.push_back(readVaruint32(Ptr));
+  if (Ptr == End)
+    return Error::success();
+  return make_error<GenericBinaryError>("Function section ended prematurely",
+                                        object_error::parse_failed);
+}
+
+Error WasmObjectFile::parseTableSection(const uint8_t *Ptr, const uint8_t *End) {
+  // Payload: a count, then per table an anyfunc element type and its limits.
+  const uint32_t NumTables = readVaruint32(Ptr);
+  Tables.reserve(NumTables);
+  for (uint32_t I = 0; I != NumTables; ++I) {
+    wasm::WasmTable Entry;
+    Entry.ElemType = readVarint7(Ptr);
+    if (Entry.ElemType != wasm::WASM_TYPE_ANYFUNC)
+      return make_error<GenericBinaryError>("Invalid table element type",
+                                            object_error::parse_failed);
+    Entry.Limits = readLimits(Ptr);
+    Tables.push_back(Entry);
+  }
+  if (Ptr == End)
+    return Error::success();
+  return make_error<GenericBinaryError>("Table section ended prematurely",
+                                        object_error::parse_failed);
+}
+
+Error WasmObjectFile::parseMemorySection(const uint8_t *Ptr, const uint8_t *End) {
+  // Payload: an entry count followed by that many limits records.
+  const uint32_t NumMemories = readVaruint32(Ptr);
+  Memories.reserve(NumMemories);
+  for (uint32_t I = 0; I != NumMemories; ++I)
+    Memories.push_back(readLimits(Ptr));
+  if (Ptr == End)
+    return Error::success();
+  return make_error<GenericBinaryError>("Memory section ended prematurely",
+                                        object_error::parse_failed);
+}
+
+Error WasmObjectFile::parseGlobalSection(const uint8_t *Ptr, const uint8_t *End) {
+  const uint32_t NumGlobals = readVaruint32(Ptr);
+  Globals.reserve(NumGlobals);
+  for (uint32_t I = 0; I != NumGlobals; ++I) { // type, mutability, init expr
+    wasm::WasmGlobal Entry;
+    Entry.Type = readVarint7(Ptr);
+    Entry.Mutable = readVaruint1(Ptr);
+    if (Error Err = readInitExpr(Entry.InitExpr, Ptr))
+      return Err;
+    Globals.push_back(Entry);
+  }
+  if (Ptr == End)
+    return Error::success();
+  return make_error<GenericBinaryError>("Global section ended prematurely",
+                                        object_error::parse_failed);
+}
+
+Error WasmObjectFile::parseExportSection(const uint8_t *Ptr, const uint8_t *End) {
+  uint32_t Count = readVaruint32(Ptr);
+  Exports.reserve(Count);
+  while (Count--) {
+    wasm::WasmExport Ex;
+    Ex.Name = readString(Ptr);
+    Ex.Kind = readUint8(Ptr);
+    Ex.Index = readVaruint32(Ptr);
+    Exports.push_back(Ex); // recorded before Kind is validated below
+    switch (Ex.Kind) { // function and global exports also add a symbol
+    case wasm::WASM_EXTERNAL_FUNCTION:
+      Symbols.emplace_back(Ex.Name, WasmSymbol::SymbolType::FUNCTION_EXPORT);
+      break;
+    case wasm::WASM_EXTERNAL_GLOBAL:
+      Symbols.emplace_back(Ex.Name, WasmSymbol::SymbolType::GLOBAL_EXPORT);
+      break;
+    default:
+      // TODO(sbc): Handle other kinds of exports
+      return make_error<GenericBinaryError>(
+          "Unexpected export kind", object_error::parse_failed);
+    }
+  }
+  if (Ptr != End)
+    return make_error<GenericBinaryError>("Export section ended prematurely",
+                                          object_error::parse_failed);
+  return Error::success();
+}
+
+Error WasmObjectFile::parseStartSection(const uint8_t *Ptr, const uint8_t *End) {
+  StartFunction = readVaruint32(Ptr); // index of the start function
+  if (StartFunction >= FunctionTypes.size()) // NOTE(review): ignores imports
+    return make_error<GenericBinaryError>("Invalid start function",
+                                          object_error::parse_failed);
+  return Error::success();
+}
+
+Error WasmObjectFile::parseCodeSection(const uint8_t *Ptr, const uint8_t *End) {
+  // The code section must carry one body per function-section entry.
+  uint32_t FunctionCount = readVaruint32(Ptr);
+  if (FunctionCount != FunctionTypes.size())
+    return make_error<GenericBinaryError>("Invalid function count",
+                                          object_error::parse_failed);
+
+  CodeSection = ArrayRef<uint8_t>(Ptr, End - Ptr);
+  while (FunctionCount--) {
+    wasm::WasmFunction Function;
+    uint32_t FunctionSize = readVaruint32(Ptr);
+    const uint8_t *FunctionEnd = Ptr + FunctionSize;
+    uint32_t NumLocalDecls = readVaruint32(Ptr);
+    Function.Locals.reserve(NumLocalDecls);
+    while (NumLocalDecls--) {
+      wasm::WasmLocalDecl Decl;
+      Decl.Count = readVaruint32(Ptr);
+      Decl.Type = readVarint7(Ptr);
+      Function.Locals.push_back(Decl);
+    }
+
+    // Locals past FunctionEnd, or a size past End, would wrap the body size.
+    if (Ptr > FunctionEnd || FunctionEnd > End)
+      return make_error<GenericBinaryError>("Invalid function size",
+                                            object_error::parse_failed);
+    Function.Body = ArrayRef<uint8_t>(Ptr, FunctionEnd - Ptr);
+    Ptr = FunctionEnd; // continue with the next function entry
+    Functions.push_back(Function);
+  }
+  if (Ptr != End)
+    return make_error<GenericBinaryError>("Code section ended prematurely",
+                                          object_error::parse_failed);
+  return Error::success();
+}
+
+Error WasmObjectFile::parseElemSection(const uint8_t *Ptr, const uint8_t *End) {
+  uint32_t Count = readVaruint32(Ptr);
+  ElemSegments.reserve(Count);
+  while (Count--) {
+    wasm::WasmElemSegment Segment;
+    Segment.TableIndex = readVaruint32(Ptr);
+    if (Segment.TableIndex != 0) { // only table 0 is accepted
+      return make_error<GenericBinaryError>("Invalid TableIndex",
+                                            object_error::parse_failed);
+    }
+    if (Error Err = readInitExpr(Segment.Offset, Ptr))
+      return Err;
+    uint32_t NumElems = readVaruint32(Ptr);
+    while (NumElems--) { // each element is stored as a varuint32
+      Segment.Functions.push_back(readVaruint32(Ptr));
+    }
+    ElemSegments.push_back(Segment);
+  }
+  if (Ptr != End)
+    return make_error<GenericBinaryError>("Elem section ended prematurely",
+                                          object_error::parse_failed);
+  return Error::success();
+}
+
+Error WasmObjectFile::parseDataSection(const uint8_t *Ptr, const uint8_t *End) {
+  uint32_t Count = readVaruint32(Ptr);
+  DataSegments.reserve(Count);
+  while (Count--) {
+    wasm::WasmDataSegment Segment;
+    Segment.Index = readVaruint32(Ptr);
+    if (Error Err = readInitExpr(Segment.Offset, Ptr))
+      return Err;
+    uint32_t Size = readVaruint32(Ptr);
+    Segment.Content = ArrayRef<uint8_t>(Ptr, Size); // refers to Ptr, no copy
+    Ptr += Size; // NOTE(review): Size is not checked against End here
+    DataSegments.push_back(Segment);
+  }
+  if (Ptr != End)
+    return make_error<GenericBinaryError>("Data section ended prematurely",
+                                          object_error::parse_failed);
   return Error::success();
 }
 
@@ -106,37 +590,48 @@ const wasm::WasmObjectHeader &WasmObjectFile::getHeader() const {
   return Header;
 }
 
-void WasmObjectFile::moveSymbolNext(DataRefImpl &Symb) const {
-  llvm_unreachable("not yet implemented");
-}
-
-std::error_code WasmObjectFile::printSymbolName(raw_ostream &OS,
-                                                DataRefImpl Symb) const {
-  llvm_unreachable("not yet implemented");
-  return object_error::invalid_symbol_index;
-}
+void WasmObjectFile::moveSymbolNext(DataRefImpl &Symb) const { Symb.d.a++; }
 
 uint32_t WasmObjectFile::getSymbolFlags(DataRefImpl Symb) const {
-  llvm_unreachable("not yet implemented");
-  return 0;
+  const WasmSymbol &Sym = getWasmSymbol(Symb);
+  switch (Sym.Type) {
+  case WasmSymbol::SymbolType::FUNCTION_IMPORT:
+    return object::SymbolRef::SF_Undefined | SymbolRef::SF_Executable;
+  case WasmSymbol::SymbolType::FUNCTION_EXPORT:
+    return object::SymbolRef::SF_Global | SymbolRef::SF_Executable;
+  case WasmSymbol::SymbolType::DEBUG_FUNCTION_NAME:
+    return object::SymbolRef::SF_Executable;
+  case WasmSymbol::SymbolType::GLOBAL_IMPORT:
+    return object::SymbolRef::SF_Undefined;
+  case WasmSymbol::SymbolType::GLOBAL_EXPORT:
+    return object::SymbolRef::SF_Global;
+  }
+  llvm_unreachable("Unknown WasmSymbol::SymbolType");
 }
 
 basic_symbol_iterator WasmObjectFile::symbol_begin() const {
-  return BasicSymbolRef(DataRefImpl(), this);
+  DataRefImpl Ref;
+  Ref.d.a = 0;
+  return BasicSymbolRef(Ref, this);
 }
 
 basic_symbol_iterator WasmObjectFile::symbol_end() const {
-  return BasicSymbolRef(DataRefImpl(), this);
+  DataRefImpl Ref;
+  Ref.d.a = Symbols.size();
+  return BasicSymbolRef(Ref, this);
+}
+
+const WasmSymbol &WasmObjectFile::getWasmSymbol(DataRefImpl Symb) const {
+  return Symbols[Symb.d.a];
 }
 
 Expected<StringRef> WasmObjectFile::getSymbolName(DataRefImpl Symb) const {
-  llvm_unreachable("not yet implemented");
-  return errorCodeToError(object_error::invalid_symbol_index);
+  const WasmSymbol &Sym = getWasmSymbol(Symb);
+  return Sym.Name;
 }
 
 Expected<uint64_t> WasmObjectFile::getSymbolAddress(DataRefImpl Symb) const {
-  llvm_unreachable("not yet implemented");
-  return errorCodeToError(object_error::invalid_symbol_index);
+  return (uint64_t)Symb.d.a;
 }
 
 uint64_t WasmObjectFile::getSymbolValueImpl(DataRefImpl Symb) const {
@@ -170,7 +665,7 @@ void WasmObjectFile::moveSectionNext(DataRefImpl &Sec) const { Sec.d.a++; }
 
 std::error_code WasmObjectFile::getSectionName(DataRefImpl Sec,
                                                StringRef &Res) const {
-  const wasm::WasmSection &S = Sections[Sec.d.a];
+  const WasmSection &S = Sections[Sec.d.a];
 #define ECase(X)                                                               \
   case wasm::WASM_SEC_##X:                                                     \
     Res = #X;                                                                  \
@@ -200,13 +695,13 @@ std::error_code WasmObjectFile::getSectionName(DataRefImpl Sec,
 uint64_t WasmObjectFile::getSectionAddress(DataRefImpl Sec) const { return 0; }
 
 uint64_t WasmObjectFile::getSectionSize(DataRefImpl Sec) const {
-  const wasm::WasmSection &S = Sections[Sec.d.a];
+  const WasmSection &S = Sections[Sec.d.a];
   return S.Content.size();
 }
 
 std::error_code WasmObjectFile::getSectionContents(DataRefImpl Sec,
                                                    StringRef &Res) const {
-  const wasm::WasmSection &S = Sections[Sec.d.a];
+  const WasmSection &S = Sections[Sec.d.a];
   // This will never fail since wasm sections can never be empty (user-sections
   // must have a name and non-user sections each have a defined structure).
   Res = StringRef(reinterpret_cast<const char *>(S.Content.data()),
@@ -223,13 +718,11 @@ bool WasmObjectFile::isSectionCompressed(DataRefImpl Sec) const {
 }
 
 bool WasmObjectFile::isSectionText(DataRefImpl Sec) const {
-  const wasm::WasmSection &S = Sections[Sec.d.a];
-  return S.Type == wasm::WASM_SEC_CODE;
+  return getWasmSection(Sec).Type == wasm::WASM_SEC_CODE;
 }
 
 bool WasmObjectFile::isSectionData(DataRefImpl Sec) const {
-  const wasm::WasmSection &S = Sections[Sec.d.a];
-  return S.Type == wasm::WASM_SEC_DATA;
+  return getWasmSection(Sec).Type == wasm::WASM_SEC_DATA;
 }
 
 bool WasmObjectFile::isSectionBSS(DataRefImpl Sec) const { return false; }
@@ -238,31 +731,28 @@ bool WasmObjectFile::isSectionVirtual(DataRefImpl Sec) const { return false; }
 
 bool WasmObjectFile::isSectionBitcode(DataRefImpl Sec) const { return false; }
 
-relocation_iterator WasmObjectFile::section_rel_begin(DataRefImpl Sec) const {
-  llvm_unreachable("not yet implemented");
-  RelocationRef Rel;
-  return relocation_iterator(Rel);
+relocation_iterator WasmObjectFile::section_rel_begin(DataRefImpl Ref) const {
+  DataRefImpl RelocRef;
+  RelocRef.d.a = Ref.d.a;
+  RelocRef.d.b = 0;
+  return relocation_iterator(RelocationRef(RelocRef, this));
 }
 
-relocation_iterator WasmObjectFile::section_rel_end(DataRefImpl Sec) const {
-  llvm_unreachable("not yet implemented");
-  RelocationRef Rel;
-  return relocation_iterator(Rel);
-}
-
-section_iterator WasmObjectFile::getRelocatedSection(DataRefImpl Sec) const {
-  llvm_unreachable("not yet implemented");
-  SectionRef Ref;
-  return section_iterator(Ref);
+relocation_iterator WasmObjectFile::section_rel_end(DataRefImpl Ref) const {
+  const WasmSection &Sec = getWasmSection(Ref);
+  DataRefImpl RelocRef;
+  RelocRef.d.a = Ref.d.a;
+  RelocRef.d.b = Sec.Relocations.size();
+  return relocation_iterator(RelocationRef(RelocRef, this));
 }
 
 void WasmObjectFile::moveRelocationNext(DataRefImpl &Rel) const {
-  llvm_unreachable("not yet implemented");
+  Rel.d.b++;
 }
 
-uint64_t WasmObjectFile::getRelocationOffset(DataRefImpl Rel) const {
-  llvm_unreachable("not yet implemented");
-  return 0;
+uint64_t WasmObjectFile::getRelocationOffset(DataRefImpl Ref) const {
+  const wasm::WasmRelocation &Rel = getWasmRelocation(Ref);
+  return Rel.Offset;
 }
 
 symbol_iterator WasmObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
@@ -271,14 +761,28 @@ symbol_iterator WasmObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
   return symbol_iterator(Ref);
 }
 
-uint64_t WasmObjectFile::getRelocationType(DataRefImpl Rel) const {
-  llvm_unreachable("not yet implemented");
-  return 0;
+uint64_t WasmObjectFile::getRelocationType(DataRefImpl Ref) const {
+  const wasm::WasmRelocation &Rel = getWasmRelocation(Ref);
+  return Rel.Type;
 }
 
 void WasmObjectFile::getRelocationTypeName(
-    DataRefImpl Rel, SmallVectorImpl<char> &Result) const {
-  llvm_unreachable("not yet implemented");
+    DataRefImpl Ref, SmallVectorImpl<char> &Result) const {
+  const wasm::WasmRelocation& Rel = getWasmRelocation(Ref);
+  StringRef Res = "Unknown";
+
+#define WASM_RELOC(name, value)  \
+  case wasm::name:              \
+    Res = #name;               \
+    break;
+
+  switch (Rel.Type) {
+#include "llvm/Support/WasmRelocs/WebAssembly.def"
+  }
+
+#undef WASM_RELOC
+
+  Result.append(Res.begin(), Res.end());
 }
 
 section_iterator WasmObjectFile::section_begin() const {
@@ -305,10 +809,25 @@ SubtargetFeatures WasmObjectFile::getFeatures() const {
 
 bool WasmObjectFile::isRelocatableObject() const { return false; }
 
-const wasm::WasmSection *
+const WasmSection &WasmObjectFile::getWasmSection(DataRefImpl Ref) const {
+  assert(Ref.d.a < Sections.size());
+  return Sections[Ref.d.a];
+}
+
+const WasmSection &
 WasmObjectFile::getWasmSection(const SectionRef &Section) const {
-  return &Sections[Section.getRawDataRefImpl().d.a];
+  return getWasmSection(Section.getRawDataRefImpl());
 }
 
-} // end namespace object
-} // end namespace llvm
+const wasm::WasmRelocation &
+WasmObjectFile::getWasmRelocation(const RelocationRef &Ref) const {
+  return getWasmRelocation(Ref.getRawDataRefImpl());
+}
+
+const wasm::WasmRelocation &
+WasmObjectFile::getWasmRelocation(DataRefImpl Ref) const {
+  assert(Ref.d.a < Sections.size());
+  const WasmSection& Sec = Sections[Ref.d.a];
+  assert(Ref.d.b < Sec.Relocations.size());
+  return Sec.Relocations[Ref.d.b];
+}
diff --git a/lib/ObjectYAML/CMakeLists.txt b/lib/ObjectYAML/CMakeLists.txt
index ab3939e17d758c794886a226e923840f1a2f4775..37f8fd7bce1a6d53b61d5be6b767e87e64043895 100644
--- a/lib/ObjectYAML/CMakeLists.txt
+++ b/lib/ObjectYAML/CMakeLists.txt
@@ -1,9 +1,11 @@
 add_llvm_library(LLVMObjectYAML
   COFFYAML.cpp
   DWARFEmitter.cpp
+  DWARFVisitor.cpp
   DWARFYAML.cpp
   ELFYAML.cpp
   MachOYAML.cpp
   ObjectYAML.cpp
+  WasmYAML.cpp
   YAML.cpp
   )
diff --git a/lib/ObjectYAML/DWARFEmitter.cpp b/lib/ObjectYAML/DWARFEmitter.cpp
index 1e2e960b9dc57463d7e5c9a33460b205a3a44cee..1aa1519b708ba02d30f3693e2731e8818081290d 100644
--- a/lib/ObjectYAML/DWARFEmitter.cpp
+++ b/lib/ObjectYAML/DWARFEmitter.cpp
@@ -19,19 +19,21 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/SwapByteOrder.h"
 
+#include "DWARFVisitor.h"
+
 #include <algorithm>
 
 using namespace llvm;
 
 template <typename T>
-void writeInteger(T Integer, raw_ostream &OS, bool IsLittleEndian) {
+static void writeInteger(T Integer, raw_ostream &OS, bool IsLittleEndian) {
   if (IsLittleEndian != sys::IsLittleEndianHost)
     sys::swapByteOrder(Integer);
   OS.write(reinterpret_cast<char *>(&Integer), sizeof(T));
 }
 
-void writeVariableSizedInteger(uint64_t Integer, size_t Size, raw_ostream &OS,
-                               bool IsLittleEndian) {
+static void writeVariableSizedInteger(uint64_t Integer, size_t Size,
+                                      raw_ostream &OS, bool IsLittleEndian) {
   if (8 == Size)
     writeInteger((uint64_t)Integer, OS, IsLittleEndian);
   else if (4 == Size)
@@ -44,12 +46,19 @@ void writeVariableSizedInteger(uint64_t Integer, size_t Size, raw_ostream &OS,
     assert(false && "Invalid integer write size.");
 }
 
-void ZeroFillBytes(raw_ostream &OS, size_t Size) {
+static void ZeroFillBytes(raw_ostream &OS, size_t Size) {
   std::vector<uint8_t> FillData;
   FillData.insert(FillData.begin(), Size, 0);
   OS.write(reinterpret_cast<char *>(FillData.data()), Size);
 }
 
+void writeInitialLength(const DWARFYAML::InitialLength &Length, raw_ostream &OS,
+                        bool IsLittleEndian) {
+  writeInteger((uint32_t)Length.TotalLength, OS, IsLittleEndian);
+  if (Length.isDWARF64()) // then the 64-bit length follows the 32-bit field
+    writeInteger((uint64_t)Length.TotalLength64, OS, IsLittleEndian);
+}
+
 void DWARFYAML::EmitDebugStr(raw_ostream &OS, const DWARFYAML::Data &DI) {
   for (auto Str : DI.DebugStrings) {
     OS.write(Str.data(), Str.size());
@@ -65,6 +74,8 @@ void DWARFYAML::EmitDebugAbbrev(raw_ostream &OS, const DWARFYAML::Data &DI) {
     for (auto Attr : AbbrevDecl.Attributes) {
       encodeULEB128(Attr.Attribute, OS);
       encodeULEB128(Attr.Form, OS);
+      if (Attr.Form == dwarf::DW_FORM_implicit_const)
+        encodeSLEB128(Attr.Value, OS);
     }
     encodeULEB128(0, OS);
     encodeULEB128(0, OS);
@@ -74,7 +85,7 @@ void DWARFYAML::EmitDebugAbbrev(raw_ostream &OS, const DWARFYAML::Data &DI) {
 void DWARFYAML::EmitDebugAranges(raw_ostream &OS, const DWARFYAML::Data &DI) {
   for (auto Range : DI.ARanges) {
     auto HeaderStart = OS.tell();
-    writeInteger((uint32_t)Range.Length, OS, DI.IsLittleEndian);
+    writeInitialLength(Range.Length, OS, DI.IsLittleEndian);
     writeInteger((uint16_t)Range.Version, OS, DI.IsLittleEndian);
     writeInteger((uint32_t)Range.CuOffset, OS, DI.IsLittleEndian);
     writeInteger((uint8_t)Range.AddrSize, OS, DI.IsLittleEndian);
@@ -97,7 +108,7 @@ void DWARFYAML::EmitDebugAranges(raw_ostream &OS, const DWARFYAML::Data &DI) {
 void DWARFYAML::EmitPubSection(raw_ostream &OS,
                                const DWARFYAML::PubSection &Sect,
                                bool IsLittleEndian) {
-  writeInteger((uint32_t)Sect.Length, OS, IsLittleEndian);
+  writeInitialLength(Sect.Length, OS, IsLittleEndian);
   writeInteger((uint16_t)Sect.Version, OS, IsLittleEndian);
   writeInteger((uint32_t)Sect.UnitOffset, OS, IsLittleEndian);
   writeInteger((uint32_t)Sect.UnitSize, OS, IsLittleEndian);
@@ -110,133 +121,75 @@ void DWARFYAML::EmitPubSection(raw_ostream &OS,
   }
 }
 
-void DWARFYAML::EmitDebugInfo(raw_ostream &OS, const DWARFYAML::Data &DI) {
-
-  for (auto CU : DI.CompileUnits) {
-    writeInteger((uint32_t)CU.Length, OS, DI.IsLittleEndian);
-    writeInteger((uint16_t)CU.Version, OS, DI.IsLittleEndian);
-    writeInteger((uint32_t)CU.AbbrOffset, OS, DI.IsLittleEndian);
-    writeInteger((uint8_t)CU.AddrSize, OS, DI.IsLittleEndian);
-
-    auto FirstAbbrevCode = CU.Entries[0].AbbrCode;
-
-    for (auto Entry : CU.Entries) {
-      encodeULEB128(Entry.AbbrCode, OS);
-      if (Entry.AbbrCode == 0u)
-        continue;
-      bool Indirect = false;
-      assert(Entry.AbbrCode - FirstAbbrevCode < DI.AbbrevDecls.size() &&
-             "Out of range AbbCode");
-      auto &Abbrev = DI.AbbrevDecls[Entry.AbbrCode - FirstAbbrevCode];
-
-      auto FormVal = Entry.Values.begin();
-      auto AbbrForm = Abbrev.Attributes.begin();
-      for (;
-           FormVal != Entry.Values.end() && AbbrForm != Abbrev.Attributes.end();
-           ++FormVal, ++AbbrForm) {
-        dwarf::Form Form = AbbrForm->Form;
-        do {
-          Indirect = false;
-          switch (Form) {
-          case dwarf::DW_FORM_addr:
-            writeVariableSizedInteger(FormVal->Value, CU.AddrSize, OS,
-                                      DI.IsLittleEndian);
-            break;
-          case dwarf::DW_FORM_ref_addr: {
-            // TODO: Handle DWARF32/DWARF64 after Line Table data is done
-            auto writeSize = CU.Version == 2 ? CU.AddrSize : 4;
-            writeVariableSizedInteger(FormVal->Value, writeSize, OS,
-                                      DI.IsLittleEndian);
-            break;
-          }
-          case dwarf::DW_FORM_exprloc:
-          case dwarf::DW_FORM_block:
-            encodeULEB128(FormVal->BlockData.size(), OS);
-            OS.write(reinterpret_cast<char *>(&FormVal->BlockData[0]),
-                     FormVal->BlockData.size());
-            break;
-          case dwarf::DW_FORM_block1: {
-            auto writeSize = FormVal->BlockData.size();
-            writeInteger((uint8_t)writeSize, OS, DI.IsLittleEndian);
-            OS.write(reinterpret_cast<char *>(&FormVal->BlockData[0]),
-                     FormVal->BlockData.size());
-            break;
-          }
-          case dwarf::DW_FORM_block2: {
-            auto writeSize = FormVal->BlockData.size();
-            writeInteger((uint16_t)writeSize, OS, DI.IsLittleEndian);
-            OS.write(reinterpret_cast<char *>(&FormVal->BlockData[0]),
-                     FormVal->BlockData.size());
-            break;
-          }
-          case dwarf::DW_FORM_block4: {
-            auto writeSize = FormVal->BlockData.size();
-            writeInteger((uint32_t)writeSize, OS, DI.IsLittleEndian);
-            OS.write(reinterpret_cast<char *>(&FormVal->BlockData[0]),
-                     FormVal->BlockData.size());
-            break;
-          }
-          case dwarf::DW_FORM_data1:
-          case dwarf::DW_FORM_ref1:
-          case dwarf::DW_FORM_flag:
-            writeInteger((uint8_t)FormVal->Value, OS, DI.IsLittleEndian);
-            break;
-          case dwarf::DW_FORM_data2:
-          case dwarf::DW_FORM_ref2:
-            writeInteger((uint16_t)FormVal->Value, OS, DI.IsLittleEndian);
-            break;
-          case dwarf::DW_FORM_data4:
-          case dwarf::DW_FORM_ref4:
-            writeInteger((uint32_t)FormVal->Value, OS, DI.IsLittleEndian);
-            break;
-          case dwarf::DW_FORM_data8:
-          case dwarf::DW_FORM_ref8:
-            writeInteger((uint64_t)FormVal->Value, OS, DI.IsLittleEndian);
-            break;
-          case dwarf::DW_FORM_sdata:
-            encodeSLEB128(FormVal->Value, OS);
-            break;
-          case dwarf::DW_FORM_udata:
-          case dwarf::DW_FORM_ref_udata:
-            encodeULEB128(FormVal->Value, OS);
-            break;
-          case dwarf::DW_FORM_string:
-            OS.write(FormVal->CStr.data(), FormVal->CStr.size());
-            OS.write('\0');
-            break;
-          case dwarf::DW_FORM_indirect:
-            encodeULEB128(FormVal->Value, OS);
-            Indirect = true;
-            Form = static_cast<dwarf::Form>((uint64_t)FormVal->Value);
-            ++FormVal;
-            break;
-          case dwarf::DW_FORM_strp:
-          case dwarf::DW_FORM_sec_offset:
-          case dwarf::DW_FORM_GNU_ref_alt:
-          case dwarf::DW_FORM_GNU_strp_alt:
-          case dwarf::DW_FORM_line_strp:
-          case dwarf::DW_FORM_strp_sup:
-          case dwarf::DW_FORM_ref_sup:
-            // TODO: Handle DWARF32/64
-            writeInteger((uint32_t)FormVal->Value, OS, DI.IsLittleEndian);
-            break;
-          case dwarf::DW_FORM_ref_sig8:
-            writeInteger((uint64_t)FormVal->Value, OS, DI.IsLittleEndian);
-            break;
-          case dwarf::DW_FORM_GNU_addr_index:
-          case dwarf::DW_FORM_GNU_str_index:
-            encodeULEB128(FormVal->Value, OS);
-            break;
-          default:
-            break;
-          }
-        } while (Indirect);
-      }
+/// \brief An extension of the DWARFYAML::ConstVisitor which writes compile
+/// units and DIEs to a stream.
+class DumpVisitor : public DWARFYAML::ConstVisitor {
+  raw_ostream &OS;
+
+protected:
+  virtual void onStartCompileUnit(const DWARFYAML::Unit &CU) {
+    writeInitialLength(CU.Length, OS, DebugInfo.IsLittleEndian);
+    writeInteger((uint16_t)CU.Version, OS, DebugInfo.IsLittleEndian);
+    if(CU.Version >= 5) { // v5+: unit type, address size, abbrev offset
+      writeInteger((uint8_t)CU.Type, OS, DebugInfo.IsLittleEndian);
+      writeInteger((uint8_t)CU.AddrSize, OS, DebugInfo.IsLittleEndian);
+      writeInteger((uint32_t)CU.AbbrOffset, OS, DebugInfo.IsLittleEndian);
+    }else { // earlier versions: abbrev offset, then address size
+      writeInteger((uint32_t)CU.AbbrOffset, OS, DebugInfo.IsLittleEndian);
+      writeInteger((uint8_t)CU.AddrSize, OS, DebugInfo.IsLittleEndian);
     }
+    
+  }
+  // Each DIE starts with its abbreviation code, ULEB128-encoded.
+  virtual void onStartDIE(const DWARFYAML::Unit &CU,
+                          const DWARFYAML::Entry &DIE) {
+    encodeULEB128(DIE.AbbrCode, OS);
+  }
+  // Fixed-width values are written in the byte order DebugInfo requests.
+  virtual void onValue(const uint8_t U) {
+    writeInteger(U, OS, DebugInfo.IsLittleEndian);
   }
+
+  virtual void onValue(const uint16_t U) {
+    writeInteger(U, OS, DebugInfo.IsLittleEndian);
+  }
+  virtual void onValue(const uint32_t U) {
+    writeInteger(U, OS, DebugInfo.IsLittleEndian);
+  }
+  virtual void onValue(const uint64_t U, const bool LEB = false) {
+    if (LEB) // LEB selects ULEB128 over a fixed 8-byte write
+      encodeULEB128(U, OS);
+    else
+      writeInteger(U, OS, DebugInfo.IsLittleEndian);
+  }
+
+  virtual void onValue(const int64_t S, const bool LEB = false) {
+    if (LEB) // LEB selects SLEB128 over a fixed 8-byte write
+      encodeSLEB128(S, OS);
+    else
+      writeInteger(S, OS, DebugInfo.IsLittleEndian);
+  }
+  // Strings are written followed by a terminating NUL byte.
+  virtual void onValue(const StringRef String) {
+    OS.write(String.data(), String.size());
+    OS.write('\0');
+  }
+  // Buffers are copied to the stream as-is.
+  virtual void onValue(const MemoryBufferRef MBR) {
+    OS.write(MBR.getBufferStart(), MBR.getBufferSize());
+  }
+
+public:
+  DumpVisitor(const DWARFYAML::Data &DI, raw_ostream &Out)
+      : DWARFYAML::ConstVisitor(DI), OS(Out) {}
+};
+
+void DWARFYAML::EmitDebugInfo(raw_ostream &OS, const DWARFYAML::Data &DI) {
+  DumpVisitor Visitor(DI, OS);
+  Visitor.traverseDebugInfo();
 }
 
-void EmitFileEntry(raw_ostream &OS, const DWARFYAML::File &File) {
+static void EmitFileEntry(raw_ostream &OS, const DWARFYAML::File &File) {
   OS.write(File.Name.data(), File.Name.size());
   OS.write('\0');
   encodeULEB128(File.DirIdx, OS);
@@ -245,13 +198,9 @@ void EmitFileEntry(raw_ostream &OS, const DWARFYAML::File &File) {
 }
 
 void DWARFYAML::EmitDebugLine(raw_ostream &OS, const DWARFYAML::Data &DI) {
-  for (const auto LineTable : DI.DebugLines) {
-    writeInteger((uint32_t)LineTable.TotalLength, OS, DI.IsLittleEndian);
-    uint64_t SizeOfPrologueLength = 4;
-    if (LineTable.TotalLength == UINT32_MAX) {
-      writeInteger((uint64_t)LineTable.TotalLength64, OS, DI.IsLittleEndian);
-      SizeOfPrologueLength = 8;
-    }
+  for (const auto &LineTable : DI.DebugLines) {
+    writeInitialLength(LineTable.Length, OS, DI.IsLittleEndian);
+    uint64_t SizeOfPrologueLength = LineTable.Length.isDWARF64() ? 8 : 4;
     writeInteger((uint16_t)LineTable.Version, OS, DI.IsLittleEndian);
     writeVariableSizedInteger(LineTable.PrologueLength, SizeOfPrologueLength,
                               OS, DI.IsLittleEndian);
@@ -333,9 +282,10 @@ void DWARFYAML::EmitDebugLine(raw_ostream &OS, const DWARFYAML::Data &DI) {
 
 typedef void (*EmitFuncType)(raw_ostream &, const DWARFYAML::Data &);
 
-void EmitDebugSectionImpl(
-    const DWARFYAML::Data &DI, EmitFuncType EmitFunc, StringRef Sec,
-    StringMap<std::unique_ptr<MemoryBuffer>> &OutputBuffers) {
+static void
+EmitDebugSectionImpl(const DWARFYAML::Data &DI, EmitFuncType EmitFunc,
+                     StringRef Sec,
+                     StringMap<std::unique_ptr<MemoryBuffer>> &OutputBuffers) {
   std::string Data;
   raw_string_ostream DebugInfoStream(Data);
   EmitFunc(DebugInfoStream, DI);
diff --git a/lib/ObjectYAML/DWARFVisitor.cpp b/lib/ObjectYAML/DWARFVisitor.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..36a9f7638bd44b53c992f39d5832511b8bf495f5
--- /dev/null
+++ b/lib/ObjectYAML/DWARFVisitor.cpp
@@ -0,0 +1,203 @@
+//===--- DWARFVisitor.cpp ---------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implements the traversal of DWARFYAML compile units and their DIEs, and the
+// dispatch of each attribute value to the visitor callbacks.
+//
+//===----------------------------------------------------------------------===//
+
+#include "DWARFVisitor.h"
+#include "llvm/ObjectYAML/DWARFYAML.h"
+
+using namespace llvm;
+
+/// Forward \p U to the fixed-width onValue() overload matching \p Size bytes.
+///
+/// Only sizes 1, 2, 4 and 8 are handled; any other size reaches
+/// llvm_unreachable.
+template <typename T>
+void DWARFYAML::VisitorImpl<T>::onVariableSizeValue(uint64_t U, unsigned Size) {
+  switch (Size) {
+  case 8:
+    onValue((uint64_t)U);
+    break;
+  case 4:
+    onValue((uint32_t)U);
+    break;
+  case 2:
+    onValue((uint16_t)U);
+    break;
+  case 1:
+    onValue((uint8_t)U);
+    break;
+  default:
+    llvm_unreachable("Invalid integer write size.");
+  }
+}
+
+/// Size in bytes of a section offset in \p Unit: 8 when the unit's initial
+/// length reports DWARF64, 4 otherwise.
+static unsigned getOffsetSize(const DWARFYAML::Unit &Unit) {
+  return Unit.Length.isDWARF64() ? 8 : 4;
+}
+
+/// Size in bytes used for a DW_FORM_ref_addr value in \p Unit: the address
+/// size for version 2 units, the offset size for every other version.
+static unsigned getRefSize(const DWARFYAML::Unit &Unit) {
+  if (Unit.Version == 2)
+    return Unit.AddrSize;
+  return getOffsetSize(Unit);
+}
+
+/// Wrap the raw bytes of \p Value's block data in an unnamed MemoryBufferRef.
+///
+/// Uses data() rather than &BlockData[0] so that an empty block is valid.
+static MemoryBufferRef getBlockBuffer(const DWARFYAML::FormValue &Value) {
+  return MemoryBufferRef(
+      StringRef(reinterpret_cast<const char *>(Value.BlockData.data()),
+                Value.BlockData.size()),
+      "");
+}
+
+/// Walk every compile unit in DebugInfo and, within it, every DIE in order.
+///
+/// For each attribute of a DIE, onForm() is invoked first and then the value
+/// is forwarded to the onValue() overload selected by the attribute's form.
+template <typename T> void DWARFYAML::VisitorImpl<T>::traverseDebugInfo() {
+  for (auto &Unit : DebugInfo.CompileUnits) {
+    onStartCompileUnit(Unit);
+    // "Entries" is optional in the YAML mapping, so a unit may have no DIEs;
+    // there is then no first entry to read an abbreviation code from.
+    if (Unit.Entries.empty()) {
+      onEndCompileUnit(Unit);
+      continue;
+    }
+    auto FirstAbbrevCode = Unit.Entries[0].AbbrCode;
+
+    for (auto &Entry : Unit.Entries) {
+      onStartDIE(Unit, Entry);
+      // Entries with abbreviation code 0 have their values skipped; note that
+      // onEndDIE() is not called for such entries.
+      if (Entry.AbbrCode == 0u)
+        continue;
+      // NOTE(review): the index below is not range-checked against
+      // AbbrevDecls; presumably the input is expected to be consistent.
+      auto &Abbrev = DebugInfo.AbbrevDecls[Entry.AbbrCode - FirstAbbrevCode];
+      auto FormVal = Entry.Values.begin();
+      auto AbbrForm = Abbrev.Attributes.begin();
+      for (;
+           FormVal != Entry.Values.end() && AbbrForm != Abbrev.Attributes.end();
+           ++FormVal, ++AbbrForm) {
+        onForm(*AbbrForm, *FormVal);
+        dwarf::Form Form = AbbrForm->Form;
+        bool Indirect;
+        do {
+          Indirect = false;
+          switch (Form) {
+          case dwarf::DW_FORM_addr:
+            onVariableSizeValue(FormVal->Value, Unit.AddrSize);
+            break;
+          case dwarf::DW_FORM_ref_addr:
+            onVariableSizeValue(FormVal->Value, getRefSize(Unit));
+            break;
+          case dwarf::DW_FORM_exprloc:
+          case dwarf::DW_FORM_block:
+            onValue((uint64_t)FormVal->BlockData.size(), true);
+            onValue(getBlockBuffer(*FormVal));
+            break;
+          case dwarf::DW_FORM_block1:
+            onValue((uint8_t)FormVal->BlockData.size());
+            onValue(getBlockBuffer(*FormVal));
+            break;
+          case dwarf::DW_FORM_block2:
+            onValue((uint16_t)FormVal->BlockData.size());
+            onValue(getBlockBuffer(*FormVal));
+            break;
+          case dwarf::DW_FORM_block4:
+            onValue((uint32_t)FormVal->BlockData.size());
+            onValue(getBlockBuffer(*FormVal));
+            break;
+          case dwarf::DW_FORM_data1:
+          case dwarf::DW_FORM_ref1:
+          case dwarf::DW_FORM_flag:
+          case dwarf::DW_FORM_strx1:
+          case dwarf::DW_FORM_addrx1:
+            onValue((uint8_t)FormVal->Value);
+            break;
+          case dwarf::DW_FORM_data2:
+          case dwarf::DW_FORM_ref2:
+          case dwarf::DW_FORM_strx2:
+          case dwarf::DW_FORM_addrx2:
+            onValue((uint16_t)FormVal->Value);
+            break;
+          case dwarf::DW_FORM_data4:
+          case dwarf::DW_FORM_ref4:
+          case dwarf::DW_FORM_ref_sup4:
+          case dwarf::DW_FORM_strx4:
+          case dwarf::DW_FORM_addrx4:
+            onValue((uint32_t)FormVal->Value);
+            break;
+          case dwarf::DW_FORM_data8:
+          case dwarf::DW_FORM_ref8:
+          case dwarf::DW_FORM_ref_sup8:
+            onValue((uint64_t)FormVal->Value);
+            break;
+          case dwarf::DW_FORM_sdata:
+            onValue((int64_t)FormVal->Value, true);
+            break;
+          case dwarf::DW_FORM_udata:
+          case dwarf::DW_FORM_ref_udata:
+            onValue((uint64_t)FormVal->Value, true);
+            break;
+          case dwarf::DW_FORM_string:
+            onValue(FormVal->CStr);
+            break;
+          case dwarf::DW_FORM_indirect: {
+            onValue((uint64_t)FormVal->Value, true);
+            Form = static_cast<dwarf::Form>((uint64_t)FormVal->Value);
+            // The value encoded with the actual form is the next entry. Only
+            // loop again if that entry exists; otherwise fall out and let the
+            // enclosing for loop advance FormVal to end().
+            auto NextVal = FormVal;
+            ++NextVal;
+            if (NextVal != Entry.Values.end()) {
+              FormVal = NextVal;
+              Indirect = true;
+            }
+            break;
+          }
+          case dwarf::DW_FORM_strp:
+          case dwarf::DW_FORM_sec_offset:
+          case dwarf::DW_FORM_GNU_ref_alt:
+          case dwarf::DW_FORM_GNU_strp_alt:
+          case dwarf::DW_FORM_line_strp:
+          case dwarf::DW_FORM_strp_sup:
+            onVariableSizeValue(FormVal->Value, getOffsetSize(Unit));
+            break;
+          case dwarf::DW_FORM_ref_sig8:
+            onValue((uint64_t)FormVal->Value);
+            break;
+          case dwarf::DW_FORM_GNU_addr_index:
+          case dwarf::DW_FORM_GNU_str_index:
+            onValue((uint64_t)FormVal->Value, true);
+            break;
+          default:
+            break;
+          }
+        } while (Indirect);
+      }
+      onEndDIE(Unit, Entry);
+    }
+    onEndCompileUnit(Unit);
+  }
+}
+
+// Explicitly instantiate the two template expansions.
+template class DWARFYAML::VisitorImpl<DWARFYAML::Data>;
+template class DWARFYAML::VisitorImpl<const DWARFYAML::Data>;
diff --git a/lib/ObjectYAML/DWARFVisitor.h b/lib/ObjectYAML/DWARFVisitor.h
new file mode 100644
index 0000000000000000000000000000000000000000..263e36220a05be65656eb6504e224a1656d64454
--- /dev/null
+++ b/lib/ObjectYAML/DWARFVisitor.h
@@ -0,0 +1,97 @@
+//===--- DWARFVisitor.h -----------------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_OBJECTYAML_DWARFVISITOR_H
+#define LLVM_OBJECTYAML_DWARFVISITOR_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/Support/MemoryBuffer.h"
+
+namespace llvm {
+
+namespace DWARFYAML {
+
+struct Data;
+struct Unit;
+struct Entry;
+struct FormValue;
+struct AttributeAbbrev;
+
+/// \brief A class that visits DWARFYAML Compile Units and DIEs in preorder.
+///
+/// Extensions of this class can either maintain const or non-const references
+/// to the DWARFYAML::Data object.
+template <typename T> class VisitorImpl {
+protected:
+  T &DebugInfo;
+
+  /// Visitor Functions
+  /// @{
+  virtual void onStartCompileUnit(Unit &CU) {}
+  virtual void onEndCompileUnit(Unit &CU) {}
+  virtual void onStartDIE(Unit &CU, Entry &DIE) {}
+  virtual void onEndDIE(Unit &CU, Entry &DIE) {}
+  virtual void onForm(AttributeAbbrev &AttAbbrev, FormValue &Value) {}
+  /// @}
+
+  /// Const Visitor Functions
+  /// @{
+  virtual void onStartCompileUnit(const Unit &CU) {}
+  virtual void onEndCompileUnit(const Unit &CU) {}
+  virtual void onStartDIE(const Unit &CU, const Entry &DIE) {}
+  virtual void onEndDIE(const Unit &CU, const Entry &DIE) {}
+  virtual void onForm(const AttributeAbbrev &AttAbbrev,
+                      const FormValue &Value) {}
+  /// @}
+
+  /// Value visitors
+  /// @{
+  virtual void onValue(const uint8_t U) {}
+  virtual void onValue(const uint16_t U) {}
+  virtual void onValue(const uint32_t U) {}
+  virtual void onValue(const uint64_t U, const bool LEB = false) {}
+  virtual void onValue(const int64_t S, const bool LEB = false) {}
+  virtual void onValue(const StringRef String) {}
+  virtual void onValue(const MemoryBufferRef MBR) {}
+  /// @}
+
+public:
+  VisitorImpl(T &DI) : DebugInfo(DI) {}
+
+  virtual ~VisitorImpl() {}
+
+  void traverseDebugInfo();
+
+private:
+  void onVariableSizeValue(uint64_t U, unsigned Size);
+};
+
+// Making the visitor instantiations extern and explicit in the cpp file. This
+// prevents them from being instantiated in every compile unit that uses the
+// visitors.
+extern template class VisitorImpl<DWARFYAML::Data>;
+extern template class VisitorImpl<const DWARFYAML::Data>;
+
+class Visitor : public VisitorImpl<Data> {
+public:
+  Visitor(Data &DI) : VisitorImpl<Data>(DI) {}
+};
+
+class ConstVisitor : public VisitorImpl<const Data> {
+public:
+  ConstVisitor(const Data &DI) : VisitorImpl<const Data>(DI) {}
+};
+
+} // namespace DWARFYAML
+} // namespace llvm
+
+#endif
diff --git a/lib/ObjectYAML/DWARFYAML.cpp b/lib/ObjectYAML/DWARFYAML.cpp
index 014e63fe7d34063ccd3ddb51df19964d417d9fc8..edb9545f14b131805ae6589aea6380607b83f2dc 100644
--- a/lib/ObjectYAML/DWARFYAML.cpp
+++ b/lib/ObjectYAML/DWARFYAML.cpp
@@ -54,6 +54,8 @@ void MappingTraits<DWARFYAML::AttributeAbbrev>::mapping(
     IO &IO, DWARFYAML::AttributeAbbrev &AttAbbrev) {
   IO.mapRequired("Attribute", AttAbbrev.Attribute);
   IO.mapRequired("Form", AttAbbrev.Form);
+  if(AttAbbrev.Form == dwarf::DW_FORM_implicit_const)
+    IO.mapRequired("Value", AttAbbrev.Value);
 }
 
 void MappingTraits<DWARFYAML::ARangeDescriptor>::mapping(
@@ -97,6 +99,8 @@ void MappingTraits<DWARFYAML::PubSection>::mapping(
 void MappingTraits<DWARFYAML::Unit>::mapping(IO &IO, DWARFYAML::Unit &Unit) {
   IO.mapRequired("Length", Unit.Length);
   IO.mapRequired("Version", Unit.Version);
+  if (Unit.Version >= 5)
+    IO.mapRequired("UnitType", Unit.Type);
   IO.mapRequired("AbbrOffset", Unit.AbbrOffset);
   IO.mapRequired("AddrSize", Unit.AddrSize);
   IO.mapOptional("Entries", Unit.Entries);
@@ -144,9 +148,7 @@ void MappingTraits<DWARFYAML::LineTableOpcode>::mapping(
 
 void MappingTraits<DWARFYAML::LineTable>::mapping(
     IO &IO, DWARFYAML::LineTable &LineTable) {
-  IO.mapRequired("TotalLength", LineTable.TotalLength);
-  if (LineTable.TotalLength == UINT32_MAX)
-    IO.mapRequired("TotalLength64", LineTable.TotalLength64);
+  IO.mapRequired("Length", LineTable.Length);
   IO.mapRequired("Version", LineTable.Version);
   IO.mapRequired("PrologueLength", LineTable.PrologueLength);
   IO.mapRequired("MinInstLength", LineTable.MinInstLength);
@@ -162,6 +164,13 @@ void MappingTraits<DWARFYAML::LineTable>::mapping(
   IO.mapRequired("Opcodes", LineTable.Opcodes);
 }
 
+void MappingTraits<DWARFYAML::InitialLength>::mapping(
+    IO &IO, DWARFYAML::InitialLength &InitialLength) {
+  IO.mapRequired("TotalLength", InitialLength.TotalLength);
+  if (InitialLength.isDWARF64())
+    IO.mapRequired("TotalLength64", InitialLength.TotalLength64);
+}
+
 } // namespace llvm::yaml
 
 } // namespace llvm
diff --git a/lib/ObjectYAML/ELFYAML.cpp b/lib/ObjectYAML/ELFYAML.cpp
index fe9af9f3ac767f90daa37e8496883f2b418b5ba1..3052901da45ca98c21e7f25613582a58eaaadf55 100644
--- a/lib/ObjectYAML/ELFYAML.cpp
+++ b/lib/ObjectYAML/ELFYAML.cpp
@@ -21,231 +21,229 @@ ELFYAML::Section::~Section() {}
 
 namespace yaml {
 
-void
-ScalarEnumerationTraits<ELFYAML::ELF_ET>::enumeration(IO &IO,
-                                                      ELFYAML::ELF_ET &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
-  ECase(ET_NONE)
-  ECase(ET_REL)
-  ECase(ET_EXEC)
-  ECase(ET_DYN)
-  ECase(ET_CORE)
+void ScalarEnumerationTraits<ELFYAML::ELF_ET>::enumeration(
+    IO &IO, ELFYAML::ELF_ET &Value) {
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+  ECase(ET_NONE);
+  ECase(ET_REL);
+  ECase(ET_EXEC);
+  ECase(ET_DYN);
+  ECase(ET_CORE);
 #undef ECase
   IO.enumFallback<Hex16>(Value);
 }
 
-void
-ScalarEnumerationTraits<ELFYAML::ELF_EM>::enumeration(IO &IO,
-                                                      ELFYAML::ELF_EM &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
-  ECase(EM_NONE)
-  ECase(EM_M32)
-  ECase(EM_SPARC)
-  ECase(EM_386)
-  ECase(EM_68K)
-  ECase(EM_88K)
-  ECase(EM_IAMCU)
-  ECase(EM_860)
-  ECase(EM_MIPS)
-  ECase(EM_S370)
-  ECase(EM_MIPS_RS3_LE)
-  ECase(EM_PARISC)
-  ECase(EM_VPP500)
-  ECase(EM_SPARC32PLUS)
-  ECase(EM_960)
-  ECase(EM_PPC)
-  ECase(EM_PPC64)
-  ECase(EM_S390)
-  ECase(EM_SPU)
-  ECase(EM_V800)
-  ECase(EM_FR20)
-  ECase(EM_RH32)
-  ECase(EM_RCE)
-  ECase(EM_ARM)
-  ECase(EM_ALPHA)
-  ECase(EM_SH)
-  ECase(EM_SPARCV9)
-  ECase(EM_TRICORE)
-  ECase(EM_ARC)
-  ECase(EM_H8_300)
-  ECase(EM_H8_300H)
-  ECase(EM_H8S)
-  ECase(EM_H8_500)
-  ECase(EM_IA_64)
-  ECase(EM_MIPS_X)
-  ECase(EM_COLDFIRE)
-  ECase(EM_68HC12)
-  ECase(EM_MMA)
-  ECase(EM_PCP)
-  ECase(EM_NCPU)
-  ECase(EM_NDR1)
-  ECase(EM_STARCORE)
-  ECase(EM_ME16)
-  ECase(EM_ST100)
-  ECase(EM_TINYJ)
-  ECase(EM_X86_64)
-  ECase(EM_PDSP)
-  ECase(EM_PDP10)
-  ECase(EM_PDP11)
-  ECase(EM_FX66)
-  ECase(EM_ST9PLUS)
-  ECase(EM_ST7)
-  ECase(EM_68HC16)
-  ECase(EM_68HC11)
-  ECase(EM_68HC08)
-  ECase(EM_68HC05)
-  ECase(EM_SVX)
-  ECase(EM_ST19)
-  ECase(EM_VAX)
-  ECase(EM_CRIS)
-  ECase(EM_JAVELIN)
-  ECase(EM_FIREPATH)
-  ECase(EM_ZSP)
-  ECase(EM_MMIX)
-  ECase(EM_HUANY)
-  ECase(EM_PRISM)
-  ECase(EM_AVR)
-  ECase(EM_FR30)
-  ECase(EM_D10V)
-  ECase(EM_D30V)
-  ECase(EM_V850)
-  ECase(EM_M32R)
-  ECase(EM_MN10300)
-  ECase(EM_MN10200)
-  ECase(EM_PJ)
-  ECase(EM_OPENRISC)
-  ECase(EM_ARC_COMPACT)
-  ECase(EM_XTENSA)
-  ECase(EM_VIDEOCORE)
-  ECase(EM_TMM_GPP)
-  ECase(EM_NS32K)
-  ECase(EM_TPC)
-  ECase(EM_SNP1K)
-  ECase(EM_ST200)
-  ECase(EM_IP2K)
-  ECase(EM_MAX)
-  ECase(EM_CR)
-  ECase(EM_F2MC16)
-  ECase(EM_MSP430)
-  ECase(EM_BLACKFIN)
-  ECase(EM_SE_C33)
-  ECase(EM_SEP)
-  ECase(EM_ARCA)
-  ECase(EM_UNICORE)
-  ECase(EM_EXCESS)
-  ECase(EM_DXP)
-  ECase(EM_ALTERA_NIOS2)
-  ECase(EM_CRX)
-  ECase(EM_XGATE)
-  ECase(EM_C166)
-  ECase(EM_M16C)
-  ECase(EM_DSPIC30F)
-  ECase(EM_CE)
-  ECase(EM_M32C)
-  ECase(EM_TSK3000)
-  ECase(EM_RS08)
-  ECase(EM_SHARC)
-  ECase(EM_ECOG2)
-  ECase(EM_SCORE7)
-  ECase(EM_DSP24)
-  ECase(EM_VIDEOCORE3)
-  ECase(EM_LATTICEMICO32)
-  ECase(EM_SE_C17)
-  ECase(EM_TI_C6000)
-  ECase(EM_TI_C2000)
-  ECase(EM_TI_C5500)
-  ECase(EM_MMDSP_PLUS)
-  ECase(EM_CYPRESS_M8C)
-  ECase(EM_R32C)
-  ECase(EM_TRIMEDIA)
-  ECase(EM_HEXAGON)
-  ECase(EM_8051)
-  ECase(EM_STXP7X)
-  ECase(EM_NDS32)
-  ECase(EM_ECOG1)
-  ECase(EM_ECOG1X)
-  ECase(EM_MAXQ30)
-  ECase(EM_XIMO16)
-  ECase(EM_MANIK)
-  ECase(EM_CRAYNV2)
-  ECase(EM_RX)
-  ECase(EM_METAG)
-  ECase(EM_MCST_ELBRUS)
-  ECase(EM_ECOG16)
-  ECase(EM_CR16)
-  ECase(EM_ETPU)
-  ECase(EM_SLE9X)
-  ECase(EM_L10M)
-  ECase(EM_K10M)
-  ECase(EM_AARCH64)
-  ECase(EM_AVR32)
-  ECase(EM_STM8)
-  ECase(EM_TILE64)
-  ECase(EM_TILEPRO)
-  ECase(EM_CUDA)
-  ECase(EM_TILEGX)
-  ECase(EM_CLOUDSHIELD)
-  ECase(EM_COREA_1ST)
-  ECase(EM_COREA_2ND)
-  ECase(EM_ARC_COMPACT2)
-  ECase(EM_OPEN8)
-  ECase(EM_RL78)
-  ECase(EM_VIDEOCORE5)
-  ECase(EM_78KOR)
-  ECase(EM_56800EX)
-  ECase(EM_AMDGPU)
-  ECase(EM_RISCV)
-  ECase(EM_LANAI)
-  ECase(EM_BPF)
+void ScalarEnumerationTraits<ELFYAML::ELF_EM>::enumeration(
+    IO &IO, ELFYAML::ELF_EM &Value) {
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+  ECase(EM_NONE);
+  ECase(EM_M32);
+  ECase(EM_SPARC);
+  ECase(EM_386);
+  ECase(EM_68K);
+  ECase(EM_88K);
+  ECase(EM_IAMCU);
+  ECase(EM_860);
+  ECase(EM_MIPS);
+  ECase(EM_S370);
+  ECase(EM_MIPS_RS3_LE);
+  ECase(EM_PARISC);
+  ECase(EM_VPP500);
+  ECase(EM_SPARC32PLUS);
+  ECase(EM_960);
+  ECase(EM_PPC);
+  ECase(EM_PPC64);
+  ECase(EM_S390);
+  ECase(EM_SPU);
+  ECase(EM_V800);
+  ECase(EM_FR20);
+  ECase(EM_RH32);
+  ECase(EM_RCE);
+  ECase(EM_ARM);
+  ECase(EM_ALPHA);
+  ECase(EM_SH);
+  ECase(EM_SPARCV9);
+  ECase(EM_TRICORE);
+  ECase(EM_ARC);
+  ECase(EM_H8_300);
+  ECase(EM_H8_300H);
+  ECase(EM_H8S);
+  ECase(EM_H8_500);
+  ECase(EM_IA_64);
+  ECase(EM_MIPS_X);
+  ECase(EM_COLDFIRE);
+  ECase(EM_68HC12);
+  ECase(EM_MMA);
+  ECase(EM_PCP);
+  ECase(EM_NCPU);
+  ECase(EM_NDR1);
+  ECase(EM_STARCORE);
+  ECase(EM_ME16);
+  ECase(EM_ST100);
+  ECase(EM_TINYJ);
+  ECase(EM_X86_64);
+  ECase(EM_PDSP);
+  ECase(EM_PDP10);
+  ECase(EM_PDP11);
+  ECase(EM_FX66);
+  ECase(EM_ST9PLUS);
+  ECase(EM_ST7);
+  ECase(EM_68HC16);
+  ECase(EM_68HC11);
+  ECase(EM_68HC08);
+  ECase(EM_68HC05);
+  ECase(EM_SVX);
+  ECase(EM_ST19);
+  ECase(EM_VAX);
+  ECase(EM_CRIS);
+  ECase(EM_JAVELIN);
+  ECase(EM_FIREPATH);
+  ECase(EM_ZSP);
+  ECase(EM_MMIX);
+  ECase(EM_HUANY);
+  ECase(EM_PRISM);
+  ECase(EM_AVR);
+  ECase(EM_FR30);
+  ECase(EM_D10V);
+  ECase(EM_D30V);
+  ECase(EM_V850);
+  ECase(EM_M32R);
+  ECase(EM_MN10300);
+  ECase(EM_MN10200);
+  ECase(EM_PJ);
+  ECase(EM_OPENRISC);
+  ECase(EM_ARC_COMPACT);
+  ECase(EM_XTENSA);
+  ECase(EM_VIDEOCORE);
+  ECase(EM_TMM_GPP);
+  ECase(EM_NS32K);
+  ECase(EM_TPC);
+  ECase(EM_SNP1K);
+  ECase(EM_ST200);
+  ECase(EM_IP2K);
+  ECase(EM_MAX);
+  ECase(EM_CR);
+  ECase(EM_F2MC16);
+  ECase(EM_MSP430);
+  ECase(EM_BLACKFIN);
+  ECase(EM_SE_C33);
+  ECase(EM_SEP);
+  ECase(EM_ARCA);
+  ECase(EM_UNICORE);
+  ECase(EM_EXCESS);
+  ECase(EM_DXP);
+  ECase(EM_ALTERA_NIOS2);
+  ECase(EM_CRX);
+  ECase(EM_XGATE);
+  ECase(EM_C166);
+  ECase(EM_M16C);
+  ECase(EM_DSPIC30F);
+  ECase(EM_CE);
+  ECase(EM_M32C);
+  ECase(EM_TSK3000);
+  ECase(EM_RS08);
+  ECase(EM_SHARC);
+  ECase(EM_ECOG2);
+  ECase(EM_SCORE7);
+  ECase(EM_DSP24);
+  ECase(EM_VIDEOCORE3);
+  ECase(EM_LATTICEMICO32);
+  ECase(EM_SE_C17);
+  ECase(EM_TI_C6000);
+  ECase(EM_TI_C2000);
+  ECase(EM_TI_C5500);
+  ECase(EM_MMDSP_PLUS);
+  ECase(EM_CYPRESS_M8C);
+  ECase(EM_R32C);
+  ECase(EM_TRIMEDIA);
+  ECase(EM_HEXAGON);
+  ECase(EM_8051);
+  ECase(EM_STXP7X);
+  ECase(EM_NDS32);
+  ECase(EM_ECOG1);
+  ECase(EM_ECOG1X);
+  ECase(EM_MAXQ30);
+  ECase(EM_XIMO16);
+  ECase(EM_MANIK);
+  ECase(EM_CRAYNV2);
+  ECase(EM_RX);
+  ECase(EM_METAG);
+  ECase(EM_MCST_ELBRUS);
+  ECase(EM_ECOG16);
+  ECase(EM_CR16);
+  ECase(EM_ETPU);
+  ECase(EM_SLE9X);
+  ECase(EM_L10M);
+  ECase(EM_K10M);
+  ECase(EM_AARCH64);
+  ECase(EM_AVR32);
+  ECase(EM_STM8);
+  ECase(EM_TILE64);
+  ECase(EM_TILEPRO);
+  ECase(EM_CUDA);
+  ECase(EM_TILEGX);
+  ECase(EM_CLOUDSHIELD);
+  ECase(EM_COREA_1ST);
+  ECase(EM_COREA_2ND);
+  ECase(EM_ARC_COMPACT2);
+  ECase(EM_OPEN8);
+  ECase(EM_RL78);
+  ECase(EM_VIDEOCORE5);
+  ECase(EM_78KOR);
+  ECase(EM_56800EX);
+  ECase(EM_AMDGPU);
+  ECase(EM_RISCV);
+  ECase(EM_LANAI);
+  ECase(EM_BPF);
 #undef ECase
 }
 
 void ScalarEnumerationTraits<ELFYAML::ELF_ELFCLASS>::enumeration(
     IO &IO, ELFYAML::ELF_ELFCLASS &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
   // Since the semantics of ELFCLASSNONE is "invalid", just don't accept it
   // here.
-  ECase(ELFCLASS32)
-  ECase(ELFCLASS64)
+  ECase(ELFCLASS32);
+  ECase(ELFCLASS64);
 #undef ECase
 }
 
 void ScalarEnumerationTraits<ELFYAML::ELF_ELFDATA>::enumeration(
     IO &IO, ELFYAML::ELF_ELFDATA &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
   // Since the semantics of ELFDATANONE is "invalid", just don't accept it
   // here.
-  ECase(ELFDATA2LSB)
-  ECase(ELFDATA2MSB)
+  ECase(ELFDATA2LSB);
+  ECase(ELFDATA2MSB);
 #undef ECase
 }
 
 void ScalarEnumerationTraits<ELFYAML::ELF_ELFOSABI>::enumeration(
     IO &IO, ELFYAML::ELF_ELFOSABI &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
-  ECase(ELFOSABI_NONE)
-  ECase(ELFOSABI_HPUX)
-  ECase(ELFOSABI_NETBSD)
-  ECase(ELFOSABI_GNU)
-  ECase(ELFOSABI_GNU)
-  ECase(ELFOSABI_HURD)
-  ECase(ELFOSABI_SOLARIS)
-  ECase(ELFOSABI_AIX)
-  ECase(ELFOSABI_IRIX)
-  ECase(ELFOSABI_FREEBSD)
-  ECase(ELFOSABI_TRU64)
-  ECase(ELFOSABI_MODESTO)
-  ECase(ELFOSABI_OPENBSD)
-  ECase(ELFOSABI_OPENVMS)
-  ECase(ELFOSABI_NSK)
-  ECase(ELFOSABI_AROS)
-  ECase(ELFOSABI_FENIXOS)
-  ECase(ELFOSABI_CLOUDABI)
-  ECase(ELFOSABI_C6000_ELFABI)
-  ECase(ELFOSABI_AMDGPU_HSA)
-  ECase(ELFOSABI_C6000_LINUX)
-  ECase(ELFOSABI_ARM)
-  ECase(ELFOSABI_STANDALONE)
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+  ECase(ELFOSABI_NONE);
+  ECase(ELFOSABI_HPUX);
+  ECase(ELFOSABI_NETBSD);
+  ECase(ELFOSABI_GNU);
+  ECase(ELFOSABI_GNU);
+  ECase(ELFOSABI_HURD);
+  ECase(ELFOSABI_SOLARIS);
+  ECase(ELFOSABI_AIX);
+  ECase(ELFOSABI_IRIX);
+  ECase(ELFOSABI_FREEBSD);
+  ECase(ELFOSABI_TRU64);
+  ECase(ELFOSABI_MODESTO);
+  ECase(ELFOSABI_OPENBSD);
+  ECase(ELFOSABI_OPENVMS);
+  ECase(ELFOSABI_NSK);
+  ECase(ELFOSABI_AROS);
+  ECase(ELFOSABI_FENIXOS);
+  ECase(ELFOSABI_CLOUDABI);
+  ECase(ELFOSABI_C6000_ELFABI);
+  ECase(ELFOSABI_AMDGPU_HSA);
+  ECase(ELFOSABI_C6000_LINUX);
+  ECase(ELFOSABI_ARM);
+  ECase(ELFOSABI_STANDALONE);
 #undef ECase
 }
 
@@ -253,92 +251,92 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
                                                  ELFYAML::ELF_EF &Value) {
   const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
   assert(Object && "The IO context is not initialized");
-#define BCase(X) IO.bitSetCase(Value, #X, ELF::X);
-#define BCaseMask(X, M) IO.maskedBitSetCase(Value, #X, ELF::X, ELF::M);
+#define BCase(X) IO.bitSetCase(Value, #X, ELF::X)
+#define BCaseMask(X, M) IO.maskedBitSetCase(Value, #X, ELF::X, ELF::M)
   switch (Object->Header.Machine) {
   case ELF::EM_ARM:
-    BCase(EF_ARM_SOFT_FLOAT)
-    BCase(EF_ARM_VFP_FLOAT)
-    BCaseMask(EF_ARM_EABI_UNKNOWN, EF_ARM_EABIMASK)
-    BCaseMask(EF_ARM_EABI_VER1, EF_ARM_EABIMASK)
-    BCaseMask(EF_ARM_EABI_VER2, EF_ARM_EABIMASK)
-    BCaseMask(EF_ARM_EABI_VER3, EF_ARM_EABIMASK)
-    BCaseMask(EF_ARM_EABI_VER4, EF_ARM_EABIMASK)
-    BCaseMask(EF_ARM_EABI_VER5, EF_ARM_EABIMASK)
+    BCase(EF_ARM_SOFT_FLOAT);
+    BCase(EF_ARM_VFP_FLOAT);
+    BCaseMask(EF_ARM_EABI_UNKNOWN, EF_ARM_EABIMASK);
+    BCaseMask(EF_ARM_EABI_VER1, EF_ARM_EABIMASK);
+    BCaseMask(EF_ARM_EABI_VER2, EF_ARM_EABIMASK);
+    BCaseMask(EF_ARM_EABI_VER3, EF_ARM_EABIMASK);
+    BCaseMask(EF_ARM_EABI_VER4, EF_ARM_EABIMASK);
+    BCaseMask(EF_ARM_EABI_VER5, EF_ARM_EABIMASK);
     break;
   case ELF::EM_MIPS:
-    BCase(EF_MIPS_NOREORDER)
-    BCase(EF_MIPS_PIC)
-    BCase(EF_MIPS_CPIC)
-    BCase(EF_MIPS_ABI2)
-    BCase(EF_MIPS_32BITMODE)
-    BCase(EF_MIPS_FP64)
-    BCase(EF_MIPS_NAN2008)
-    BCase(EF_MIPS_MICROMIPS)
-    BCase(EF_MIPS_ARCH_ASE_M16)
-    BCase(EF_MIPS_ARCH_ASE_MDMX)
-    BCaseMask(EF_MIPS_ABI_O32, EF_MIPS_ABI)
-    BCaseMask(EF_MIPS_ABI_O64, EF_MIPS_ABI)
-    BCaseMask(EF_MIPS_ABI_EABI32, EF_MIPS_ABI)
-    BCaseMask(EF_MIPS_ABI_EABI64, EF_MIPS_ABI)
-    BCaseMask(EF_MIPS_MACH_3900, EF_MIPS_MACH)
-    BCaseMask(EF_MIPS_MACH_4010, EF_MIPS_MACH)
-    BCaseMask(EF_MIPS_MACH_4100, EF_MIPS_MACH)
-    BCaseMask(EF_MIPS_MACH_4650, EF_MIPS_MACH)
-    BCaseMask(EF_MIPS_MACH_4120, EF_MIPS_MACH)
-    BCaseMask(EF_MIPS_MACH_4111, EF_MIPS_MACH)
-    BCaseMask(EF_MIPS_MACH_SB1, EF_MIPS_MACH)
-    BCaseMask(EF_MIPS_MACH_OCTEON, EF_MIPS_MACH)
-    BCaseMask(EF_MIPS_MACH_XLR, EF_MIPS_MACH)
-    BCaseMask(EF_MIPS_MACH_OCTEON2, EF_MIPS_MACH)
-    BCaseMask(EF_MIPS_MACH_OCTEON3, EF_MIPS_MACH)
-    BCaseMask(EF_MIPS_MACH_5400, EF_MIPS_MACH)
-    BCaseMask(EF_MIPS_MACH_5900, EF_MIPS_MACH)
-    BCaseMask(EF_MIPS_MACH_5500, EF_MIPS_MACH)
-    BCaseMask(EF_MIPS_MACH_9000, EF_MIPS_MACH)
-    BCaseMask(EF_MIPS_MACH_LS2E, EF_MIPS_MACH)
-    BCaseMask(EF_MIPS_MACH_LS2F, EF_MIPS_MACH)
-    BCaseMask(EF_MIPS_MACH_LS3A, EF_MIPS_MACH)
-    BCaseMask(EF_MIPS_ARCH_1, EF_MIPS_ARCH)
-    BCaseMask(EF_MIPS_ARCH_2, EF_MIPS_ARCH)
-    BCaseMask(EF_MIPS_ARCH_3, EF_MIPS_ARCH)
-    BCaseMask(EF_MIPS_ARCH_4, EF_MIPS_ARCH)
-    BCaseMask(EF_MIPS_ARCH_5, EF_MIPS_ARCH)
-    BCaseMask(EF_MIPS_ARCH_32, EF_MIPS_ARCH)
-    BCaseMask(EF_MIPS_ARCH_64, EF_MIPS_ARCH)
-    BCaseMask(EF_MIPS_ARCH_32R2, EF_MIPS_ARCH)
-    BCaseMask(EF_MIPS_ARCH_64R2, EF_MIPS_ARCH)
-    BCaseMask(EF_MIPS_ARCH_32R6, EF_MIPS_ARCH)
-    BCaseMask(EF_MIPS_ARCH_64R6, EF_MIPS_ARCH)
+    BCase(EF_MIPS_NOREORDER);
+    BCase(EF_MIPS_PIC);
+    BCase(EF_MIPS_CPIC);
+    BCase(EF_MIPS_ABI2);
+    BCase(EF_MIPS_32BITMODE);
+    BCase(EF_MIPS_FP64);
+    BCase(EF_MIPS_NAN2008);
+    BCase(EF_MIPS_MICROMIPS);
+    BCase(EF_MIPS_ARCH_ASE_M16);
+    BCase(EF_MIPS_ARCH_ASE_MDMX);
+    BCaseMask(EF_MIPS_ABI_O32, EF_MIPS_ABI);
+    BCaseMask(EF_MIPS_ABI_O64, EF_MIPS_ABI);
+    BCaseMask(EF_MIPS_ABI_EABI32, EF_MIPS_ABI);
+    BCaseMask(EF_MIPS_ABI_EABI64, EF_MIPS_ABI);
+    BCaseMask(EF_MIPS_MACH_3900, EF_MIPS_MACH);
+    BCaseMask(EF_MIPS_MACH_4010, EF_MIPS_MACH);
+    BCaseMask(EF_MIPS_MACH_4100, EF_MIPS_MACH);
+    BCaseMask(EF_MIPS_MACH_4650, EF_MIPS_MACH);
+    BCaseMask(EF_MIPS_MACH_4120, EF_MIPS_MACH);
+    BCaseMask(EF_MIPS_MACH_4111, EF_MIPS_MACH);
+    BCaseMask(EF_MIPS_MACH_SB1, EF_MIPS_MACH);
+    BCaseMask(EF_MIPS_MACH_OCTEON, EF_MIPS_MACH);
+    BCaseMask(EF_MIPS_MACH_XLR, EF_MIPS_MACH);
+    BCaseMask(EF_MIPS_MACH_OCTEON2, EF_MIPS_MACH);
+    BCaseMask(EF_MIPS_MACH_OCTEON3, EF_MIPS_MACH);
+    BCaseMask(EF_MIPS_MACH_5400, EF_MIPS_MACH);
+    BCaseMask(EF_MIPS_MACH_5900, EF_MIPS_MACH);
+    BCaseMask(EF_MIPS_MACH_5500, EF_MIPS_MACH);
+    BCaseMask(EF_MIPS_MACH_9000, EF_MIPS_MACH);
+    BCaseMask(EF_MIPS_MACH_LS2E, EF_MIPS_MACH);
+    BCaseMask(EF_MIPS_MACH_LS2F, EF_MIPS_MACH);
+    BCaseMask(EF_MIPS_MACH_LS3A, EF_MIPS_MACH);
+    BCaseMask(EF_MIPS_ARCH_1, EF_MIPS_ARCH);
+    BCaseMask(EF_MIPS_ARCH_2, EF_MIPS_ARCH);
+    BCaseMask(EF_MIPS_ARCH_3, EF_MIPS_ARCH);
+    BCaseMask(EF_MIPS_ARCH_4, EF_MIPS_ARCH);
+    BCaseMask(EF_MIPS_ARCH_5, EF_MIPS_ARCH);
+    BCaseMask(EF_MIPS_ARCH_32, EF_MIPS_ARCH);
+    BCaseMask(EF_MIPS_ARCH_64, EF_MIPS_ARCH);
+    BCaseMask(EF_MIPS_ARCH_32R2, EF_MIPS_ARCH);
+    BCaseMask(EF_MIPS_ARCH_64R2, EF_MIPS_ARCH);
+    BCaseMask(EF_MIPS_ARCH_32R6, EF_MIPS_ARCH);
+    BCaseMask(EF_MIPS_ARCH_64R6, EF_MIPS_ARCH);
     break;
   case ELF::EM_HEXAGON:
-    BCase(EF_HEXAGON_MACH_V2)
-    BCase(EF_HEXAGON_MACH_V3)
-    BCase(EF_HEXAGON_MACH_V4)
-    BCase(EF_HEXAGON_MACH_V5)
-    BCase(EF_HEXAGON_ISA_V2)
-    BCase(EF_HEXAGON_ISA_V3)
-    BCase(EF_HEXAGON_ISA_V4)
-    BCase(EF_HEXAGON_ISA_V5)
+    BCase(EF_HEXAGON_MACH_V2);
+    BCase(EF_HEXAGON_MACH_V3);
+    BCase(EF_HEXAGON_MACH_V4);
+    BCase(EF_HEXAGON_MACH_V5);
+    BCase(EF_HEXAGON_ISA_V2);
+    BCase(EF_HEXAGON_ISA_V3);
+    BCase(EF_HEXAGON_ISA_V4);
+    BCase(EF_HEXAGON_ISA_V5);
     break;
   case ELF::EM_AVR:
-    BCase(EF_AVR_ARCH_AVR1)
-    BCase(EF_AVR_ARCH_AVR2)
-    BCase(EF_AVR_ARCH_AVR25)
-    BCase(EF_AVR_ARCH_AVR3)
-    BCase(EF_AVR_ARCH_AVR31)
-    BCase(EF_AVR_ARCH_AVR35)
-    BCase(EF_AVR_ARCH_AVR4)
-    BCase(EF_AVR_ARCH_AVR51)
-    BCase(EF_AVR_ARCH_AVR6)
-    BCase(EF_AVR_ARCH_AVRTINY)
-    BCase(EF_AVR_ARCH_XMEGA1)
-    BCase(EF_AVR_ARCH_XMEGA2)
-    BCase(EF_AVR_ARCH_XMEGA3)
-    BCase(EF_AVR_ARCH_XMEGA4)
-    BCase(EF_AVR_ARCH_XMEGA5)
-    BCase(EF_AVR_ARCH_XMEGA6)
-    BCase(EF_AVR_ARCH_XMEGA7)
+    BCase(EF_AVR_ARCH_AVR1);
+    BCase(EF_AVR_ARCH_AVR2);
+    BCase(EF_AVR_ARCH_AVR25);
+    BCase(EF_AVR_ARCH_AVR3);
+    BCase(EF_AVR_ARCH_AVR31);
+    BCase(EF_AVR_ARCH_AVR35);
+    BCase(EF_AVR_ARCH_AVR4);
+    BCase(EF_AVR_ARCH_AVR51);
+    BCase(EF_AVR_ARCH_AVR6);
+    BCase(EF_AVR_ARCH_AVRTINY);
+    BCase(EF_AVR_ARCH_XMEGA1);
+    BCase(EF_AVR_ARCH_XMEGA2);
+    BCase(EF_AVR_ARCH_XMEGA3);
+    BCase(EF_AVR_ARCH_XMEGA4);
+    BCase(EF_AVR_ARCH_XMEGA5);
+    BCase(EF_AVR_ARCH_XMEGA6);
+    BCase(EF_AVR_ARCH_XMEGA7);
     break;
   case ELF::EM_AMDGPU:
   case ELF::EM_X86_64:
@@ -354,51 +352,51 @@ void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration(
     IO &IO, ELFYAML::ELF_SHT &Value) {
   const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
   assert(Object && "The IO context is not initialized");
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
-  ECase(SHT_NULL)
-  ECase(SHT_PROGBITS)
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+  ECase(SHT_NULL);
+  ECase(SHT_PROGBITS);
   // No SHT_SYMTAB. Use the top-level `Symbols` key instead.
   // FIXME: Issue a diagnostic with this information.
-  ECase(SHT_STRTAB)
-  ECase(SHT_RELA)
-  ECase(SHT_HASH)
-  ECase(SHT_DYNAMIC)
-  ECase(SHT_NOTE)
-  ECase(SHT_NOBITS)
-  ECase(SHT_REL)
-  ECase(SHT_SHLIB)
-  ECase(SHT_DYNSYM)
-  ECase(SHT_INIT_ARRAY)
-  ECase(SHT_FINI_ARRAY)
-  ECase(SHT_PREINIT_ARRAY)
-  ECase(SHT_GROUP)
-  ECase(SHT_SYMTAB_SHNDX)
-  ECase(SHT_LOOS)
-  ECase(SHT_GNU_ATTRIBUTES)
-  ECase(SHT_GNU_HASH)
-  ECase(SHT_GNU_verdef)
-  ECase(SHT_GNU_verneed)
-  ECase(SHT_GNU_versym)
-  ECase(SHT_HIOS)
-  ECase(SHT_LOPROC)
+  ECase(SHT_STRTAB);
+  ECase(SHT_RELA);
+  ECase(SHT_HASH);
+  ECase(SHT_DYNAMIC);
+  ECase(SHT_NOTE);
+  ECase(SHT_NOBITS);
+  ECase(SHT_REL);
+  ECase(SHT_SHLIB);
+  ECase(SHT_DYNSYM);
+  ECase(SHT_INIT_ARRAY);
+  ECase(SHT_FINI_ARRAY);
+  ECase(SHT_PREINIT_ARRAY);
+  ECase(SHT_GROUP);
+  ECase(SHT_SYMTAB_SHNDX);
+  ECase(SHT_LOOS);
+  ECase(SHT_GNU_ATTRIBUTES);
+  ECase(SHT_GNU_HASH);
+  ECase(SHT_GNU_verdef);
+  ECase(SHT_GNU_verneed);
+  ECase(SHT_GNU_versym);
+  ECase(SHT_HIOS);
+  ECase(SHT_LOPROC);
   switch (Object->Header.Machine) {
   case ELF::EM_ARM:
-    ECase(SHT_ARM_EXIDX)
-    ECase(SHT_ARM_PREEMPTMAP)
-    ECase(SHT_ARM_ATTRIBUTES)
-    ECase(SHT_ARM_DEBUGOVERLAY)
-    ECase(SHT_ARM_OVERLAYSECTION)
+    ECase(SHT_ARM_EXIDX);
+    ECase(SHT_ARM_PREEMPTMAP);
+    ECase(SHT_ARM_ATTRIBUTES);
+    ECase(SHT_ARM_DEBUGOVERLAY);
+    ECase(SHT_ARM_OVERLAYSECTION);
     break;
   case ELF::EM_HEXAGON:
-    ECase(SHT_HEX_ORDERED)
+    ECase(SHT_HEX_ORDERED);
     break;
   case ELF::EM_X86_64:
-    ECase(SHT_X86_64_UNWIND)
+    ECase(SHT_X86_64_UNWIND);
     break;
   case ELF::EM_MIPS:
-    ECase(SHT_MIPS_REGINFO)
-    ECase(SHT_MIPS_OPTIONS)
-    ECase(SHT_MIPS_ABIFLAGS)
+    ECase(SHT_MIPS_REGINFO);
+    ECase(SHT_MIPS_OPTIONS);
+    ECase(SHT_MIPS_ABIFLAGS);
     break;
   default:
     // Nothing to do.
@@ -410,43 +408,43 @@ void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration(
 void ScalarBitSetTraits<ELFYAML::ELF_SHF>::bitset(IO &IO,
                                                   ELFYAML::ELF_SHF &Value) {
   const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
-#define BCase(X) IO.bitSetCase(Value, #X, ELF::X);
-  BCase(SHF_WRITE)
-  BCase(SHF_ALLOC)
-  BCase(SHF_EXCLUDE)
-  BCase(SHF_EXECINSTR)
-  BCase(SHF_MERGE)
-  BCase(SHF_STRINGS)
-  BCase(SHF_INFO_LINK)
-  BCase(SHF_LINK_ORDER)
-  BCase(SHF_OS_NONCONFORMING)
-  BCase(SHF_GROUP)
-  BCase(SHF_TLS)
-  switch(Object->Header.Machine) {
+#define BCase(X) IO.bitSetCase(Value, #X, ELF::X)
+  BCase(SHF_WRITE);
+  BCase(SHF_ALLOC);
+  BCase(SHF_EXCLUDE);
+  BCase(SHF_EXECINSTR);
+  BCase(SHF_MERGE);
+  BCase(SHF_STRINGS);
+  BCase(SHF_INFO_LINK);
+  BCase(SHF_LINK_ORDER);
+  BCase(SHF_OS_NONCONFORMING);
+  BCase(SHF_GROUP);
+  BCase(SHF_TLS);
+  switch (Object->Header.Machine) {
   case ELF::EM_ARM:
-    BCase(SHF_ARM_PURECODE)
+    BCase(SHF_ARM_PURECODE);
     break;
   case ELF::EM_AMDGPU:
-    BCase(SHF_AMDGPU_HSA_GLOBAL)
-    BCase(SHF_AMDGPU_HSA_READONLY)
-    BCase(SHF_AMDGPU_HSA_CODE)
-    BCase(SHF_AMDGPU_HSA_AGENT)
+    BCase(SHF_AMDGPU_HSA_GLOBAL);
+    BCase(SHF_AMDGPU_HSA_READONLY);
+    BCase(SHF_AMDGPU_HSA_CODE);
+    BCase(SHF_AMDGPU_HSA_AGENT);
     break;
   case ELF::EM_HEXAGON:
-    BCase(SHF_HEX_GPREL)
+    BCase(SHF_HEX_GPREL);
     break;
   case ELF::EM_MIPS:
-    BCase(SHF_MIPS_NODUPES)
-    BCase(SHF_MIPS_NAMES)
-    BCase(SHF_MIPS_LOCAL)
-    BCase(SHF_MIPS_NOSTRIP)
-    BCase(SHF_MIPS_GPREL)
-    BCase(SHF_MIPS_MERGE)
-    BCase(SHF_MIPS_ADDR)
-    BCase(SHF_MIPS_STRING)
+    BCase(SHF_MIPS_NODUPES);
+    BCase(SHF_MIPS_NAMES);
+    BCase(SHF_MIPS_LOCAL);
+    BCase(SHF_MIPS_NOSTRIP);
+    BCase(SHF_MIPS_GPREL);
+    BCase(SHF_MIPS_MERGE);
+    BCase(SHF_MIPS_ADDR);
+    BCase(SHF_MIPS_STRING);
     break;
   case ELF::EM_X86_64:
-    BCase(SHF_X86_64_LARGE)
+    BCase(SHF_X86_64_LARGE);
     break;
   default:
     // Nothing to do.
@@ -457,25 +455,25 @@ void ScalarBitSetTraits<ELFYAML::ELF_SHF>::bitset(IO &IO,
 
 void ScalarEnumerationTraits<ELFYAML::ELF_STT>::enumeration(
     IO &IO, ELFYAML::ELF_STT &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
-  ECase(STT_NOTYPE)
-  ECase(STT_OBJECT)
-  ECase(STT_FUNC)
-  ECase(STT_SECTION)
-  ECase(STT_FILE)
-  ECase(STT_COMMON)
-  ECase(STT_TLS)
-  ECase(STT_GNU_IFUNC)
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+  ECase(STT_NOTYPE);
+  ECase(STT_OBJECT);
+  ECase(STT_FUNC);
+  ECase(STT_SECTION);
+  ECase(STT_FILE);
+  ECase(STT_COMMON);
+  ECase(STT_TLS);
+  ECase(STT_GNU_IFUNC);
 #undef ECase
 }
 
 void ScalarEnumerationTraits<ELFYAML::ELF_STV>::enumeration(
     IO &IO, ELFYAML::ELF_STV &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
-  ECase(STV_DEFAULT)
-  ECase(STV_INTERNAL)
-  ECase(STV_HIDDEN)
-  ECase(STV_PROTECTED)
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+  ECase(STV_DEFAULT);
+  ECase(STV_INTERNAL);
+  ECase(STV_HIDDEN);
+  ECase(STV_PROTECTED);
 #undef ECase
 }
 
@@ -483,13 +481,13 @@ void ScalarBitSetTraits<ELFYAML::ELF_STO>::bitset(IO &IO,
                                                   ELFYAML::ELF_STO &Value) {
   const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
   assert(Object && "The IO context is not initialized");
-#define BCase(X) IO.bitSetCase(Value, #X, ELF::X);
+#define BCase(X) IO.bitSetCase(Value, #X, ELF::X)
   switch (Object->Header.Machine) {
   case ELF::EM_MIPS:
-    BCase(STO_MIPS_OPTIONAL)
-    BCase(STO_MIPS_PLT)
-    BCase(STO_MIPS_PIC)
-    BCase(STO_MIPS_MICROMIPS)
+    BCase(STO_MIPS_OPTIONAL);
+    BCase(STO_MIPS_PLT);
+    BCase(STO_MIPS_PIC);
+    BCase(STO_MIPS_MICROMIPS);
     break;
   default:
     break; // Nothing to do
@@ -500,11 +498,11 @@ void ScalarBitSetTraits<ELFYAML::ELF_STO>::bitset(IO &IO,
 
 void ScalarEnumerationTraits<ELFYAML::ELF_RSS>::enumeration(
     IO &IO, ELFYAML::ELF_RSS &Value) {
-#define ECase(X) IO.enumCase(Value, #X, ELF::X);
-  ECase(RSS_UNDEF)
-  ECase(RSS_GP)
-  ECase(RSS_GP0)
-  ECase(RSS_LOC)
+#define ECase(X) IO.enumCase(Value, #X, ELF::X)
+  ECase(RSS_UNDEF);
+  ECase(RSS_GP);
+  ECase(RSS_GP0);
+  ECase(RSS_LOC);
 #undef ECase
 }
 
@@ -553,51 +551,51 @@ void ScalarEnumerationTraits<ELFYAML::ELF_REL>::enumeration(
 
 void ScalarEnumerationTraits<ELFYAML::MIPS_AFL_REG>::enumeration(
     IO &IO, ELFYAML::MIPS_AFL_REG &Value) {
-#define ECase(X) IO.enumCase(Value, #X, Mips::AFL_##X);
-  ECase(REG_NONE)
-  ECase(REG_32)
-  ECase(REG_64)
-  ECase(REG_128)
+#define ECase(X) IO.enumCase(Value, #X, Mips::AFL_##X)
+  ECase(REG_NONE);
+  ECase(REG_32);
+  ECase(REG_64);
+  ECase(REG_128);
 #undef ECase
 }
 
 void ScalarEnumerationTraits<ELFYAML::MIPS_ABI_FP>::enumeration(
     IO &IO, ELFYAML::MIPS_ABI_FP &Value) {
-#define ECase(X) IO.enumCase(Value, #X, Mips::Val_GNU_MIPS_ABI_##X);
-  ECase(FP_ANY)
-  ECase(FP_DOUBLE)
-  ECase(FP_SINGLE)
-  ECase(FP_SOFT)
-  ECase(FP_OLD_64)
-  ECase(FP_XX)
-  ECase(FP_64)
-  ECase(FP_64A)
+#define ECase(X) IO.enumCase(Value, #X, Mips::Val_GNU_MIPS_ABI_##X)
+  ECase(FP_ANY);
+  ECase(FP_DOUBLE);
+  ECase(FP_SINGLE);
+  ECase(FP_SOFT);
+  ECase(FP_OLD_64);
+  ECase(FP_XX);
+  ECase(FP_64);
+  ECase(FP_64A);
 #undef ECase
 }
 
 void ScalarEnumerationTraits<ELFYAML::MIPS_AFL_EXT>::enumeration(
     IO &IO, ELFYAML::MIPS_AFL_EXT &Value) {
-#define ECase(X) IO.enumCase(Value, #X, Mips::AFL_##X);
-  ECase(EXT_NONE)
-  ECase(EXT_XLR)
-  ECase(EXT_OCTEON2)
-  ECase(EXT_OCTEONP)
-  ECase(EXT_LOONGSON_3A)
-  ECase(EXT_OCTEON)
-  ECase(EXT_5900)
-  ECase(EXT_4650)
-  ECase(EXT_4010)
-  ECase(EXT_4100)
-  ECase(EXT_3900)
-  ECase(EXT_10000)
-  ECase(EXT_SB1)
-  ECase(EXT_4111)
-  ECase(EXT_4120)
-  ECase(EXT_5400)
-  ECase(EXT_5500)
-  ECase(EXT_LOONGSON_2E)
-  ECase(EXT_LOONGSON_2F)
-  ECase(EXT_OCTEON3)
+#define ECase(X) IO.enumCase(Value, #X, Mips::AFL_##X)
+  ECase(EXT_NONE);
+  ECase(EXT_XLR);
+  ECase(EXT_OCTEON2);
+  ECase(EXT_OCTEONP);
+  ECase(EXT_LOONGSON_3A);
+  ECase(EXT_OCTEON);
+  ECase(EXT_5900);
+  ECase(EXT_4650);
+  ECase(EXT_4010);
+  ECase(EXT_4100);
+  ECase(EXT_3900);
+  ECase(EXT_10000);
+  ECase(EXT_SB1);
+  ECase(EXT_4111);
+  ECase(EXT_4120);
+  ECase(EXT_5400);
+  ECase(EXT_5500);
+  ECase(EXT_LOONGSON_2E);
+  ECase(EXT_LOONGSON_2F);
+  ECase(EXT_OCTEON3);
 #undef ECase
 }
 
@@ -614,27 +612,27 @@ void ScalarEnumerationTraits<ELFYAML::MIPS_ISA>::enumeration(
 
 void ScalarBitSetTraits<ELFYAML::MIPS_AFL_ASE>::bitset(
     IO &IO, ELFYAML::MIPS_AFL_ASE &Value) {
-#define BCase(X) IO.bitSetCase(Value, #X, Mips::AFL_ASE_##X);
-  BCase(DSP)
-  BCase(DSPR2)
-  BCase(EVA)
-  BCase(MCU)
-  BCase(MDMX)
-  BCase(MIPS3D)
-  BCase(MT)
-  BCase(SMARTMIPS)
-  BCase(VIRT)
-  BCase(MSA)
-  BCase(MIPS16)
-  BCase(MICROMIPS)
-  BCase(XPA)
+#define BCase(X) IO.bitSetCase(Value, #X, Mips::AFL_ASE_##X)
+  BCase(DSP);
+  BCase(DSPR2);
+  BCase(EVA);
+  BCase(MCU);
+  BCase(MDMX);
+  BCase(MIPS3D);
+  BCase(MT);
+  BCase(SMARTMIPS);
+  BCase(VIRT);
+  BCase(MSA);
+  BCase(MIPS16);
+  BCase(MICROMIPS);
+  BCase(XPA);
 #undef BCase
 }
 
 void ScalarBitSetTraits<ELFYAML::MIPS_AFL_FLAGS1>::bitset(
     IO &IO, ELFYAML::MIPS_AFL_FLAGS1 &Value) {
-#define BCase(X) IO.bitSetCase(Value, #X, Mips::AFL_FLAGS1_##X);
-  BCase(ODDSPREG)
+#define BCase(X) IO.bitSetCase(Value, #X, Mips::AFL_FLAGS1_##X)
+  BCase(ODDSPREG);
 #undef BCase
 }
 
diff --git a/lib/ObjectYAML/ObjectYAML.cpp b/lib/ObjectYAML/ObjectYAML.cpp
index cbbaac6062a71e24969552321c0fcbb0a537d119..74581c1ecaacc597328d696cc7f27d25e1537662 100644
--- a/lib/ObjectYAML/ObjectYAML.cpp
+++ b/lib/ObjectYAML/ObjectYAML.cpp
@@ -43,6 +43,9 @@ void MappingTraits<YamlObjectFile>::mapping(IO &IO,
       ObjectFile.FatMachO.reset(new MachOYAML::UniversalBinary());
       MappingTraits<MachOYAML::UniversalBinary>::mapping(IO,
                                                          *ObjectFile.FatMachO);
+    } else if (IO.mapTag("!WASM")) {
+      ObjectFile.Wasm.reset(new WasmYAML::Object());
+      MappingTraits<WasmYAML::Object>::mapping(IO, *ObjectFile.Wasm);
     } else {
       Input &In = (Input &)IO;
       std::string Tag = In.getCurrentNode()->getRawTag();
diff --git a/lib/ObjectYAML/WasmYAML.cpp b/lib/ObjectYAML/WasmYAML.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3e1bed19d61fe7004f21253f83d94bdbffd6ea7b
--- /dev/null
+++ b/lib/ObjectYAML/WasmYAML.cpp
@@ -0,0 +1,357 @@
+//===- WasmYAML.cpp - Wasm YAMLIO implementation --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines classes for handling the YAML representation of wasm.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ObjectYAML/WasmYAML.h"
+#include "llvm/Object/Wasm.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/MipsABIFlags.h"
+
+namespace llvm {
+
+namespace WasmYAML {
+
+// Declared here rather than in the header to comply with:
+// http://llvm.org/docs/CodingStandards.html#provide-a-virtual-method-anchor-for-classes-in-headers
+Section::~Section() {}
+
+} // end namespace WasmYAML
+
+namespace yaml {
+
+void MappingTraits<WasmYAML::FileHeader>::mapping(
+    IO &IO, WasmYAML::FileHeader &FileHdr) {
+  IO.mapRequired("Version", FileHdr.Version);
+}
+
+void MappingTraits<WasmYAML::Object>::mapping(IO &IO,
+                                              WasmYAML::Object &Object) {
+  IO.setContext(&Object);
+  IO.mapTag("!WASM", true);
+  IO.mapRequired("FileHeader", Object.Header);
+  IO.mapOptional("Sections", Object.Sections);
+  IO.setContext(nullptr);
+}
+
+static void commonSectionMapping(IO &IO, WasmYAML::Section &Section) {
+  IO.mapRequired("Type", Section.Type);
+  IO.mapOptional("Relocations", Section.Relocations);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::CustomSection &Section) {
+  commonSectionMapping(IO, Section);
+  IO.mapRequired("Name", Section.Name);
+  IO.mapRequired("Payload", Section.Payload);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::TypeSection &Section) {
+  commonSectionMapping(IO, Section);
+  IO.mapOptional("Signatures", Section.Signatures);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::ImportSection &Section) {
+  commonSectionMapping(IO, Section);
+  IO.mapOptional("Imports", Section.Imports);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::FunctionSection &Section) {
+  commonSectionMapping(IO, Section);
+  IO.mapOptional("FunctionTypes", Section.FunctionTypes);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::TableSection &Section) {
+  commonSectionMapping(IO, Section);
+  IO.mapOptional("Tables", Section.Tables);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::MemorySection &Section) {
+  commonSectionMapping(IO, Section);
+  IO.mapOptional("Memories", Section.Memories);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::GlobalSection &Section) {
+  commonSectionMapping(IO, Section);
+  IO.mapOptional("Globals", Section.Globals);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::ExportSection &Section) {
+  commonSectionMapping(IO, Section);
+  IO.mapOptional("Exports", Section.Exports);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::StartSection &Section) {
+  commonSectionMapping(IO, Section);
+  IO.mapOptional("StartFunction", Section.StartFunction);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::ElemSection &Section) {
+  commonSectionMapping(IO, Section);
+  IO.mapOptional("Segments", Section.Segments);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::CodeSection &Section) {
+  commonSectionMapping(IO, Section);
+  IO.mapRequired("Functions", Section.Functions);
+}
+
+static void sectionMapping(IO &IO, WasmYAML::DataSection &Section) {
+  commonSectionMapping(IO, Section);
+  IO.mapRequired("Segments", Section.Segments);
+}
+
+void MappingTraits<std::unique_ptr<WasmYAML::Section>>::mapping(
+    IO &IO, std::unique_ptr<WasmYAML::Section> &Section) {
+  // A failed "Type" parse leaves this untouched; start from a defined value.
+  WasmYAML::SectionType SectionType = wasm::WASM_SEC_CUSTOM;
+  if (IO.outputting())
+    SectionType = Section->Type;
+  else
+    IO.mapRequired("Type", SectionType);
+  switch (SectionType) {
+  case wasm::WASM_SEC_CUSTOM:
+    if (!IO.outputting())
+      Section.reset(new WasmYAML::CustomSection());
+    sectionMapping(IO, *cast<WasmYAML::CustomSection>(Section.get()));
+    break;
+  case wasm::WASM_SEC_TYPE:
+    if (!IO.outputting())
+      Section.reset(new WasmYAML::TypeSection());
+    sectionMapping(IO, *cast<WasmYAML::TypeSection>(Section.get()));
+    break;
+  case wasm::WASM_SEC_IMPORT:
+    if (!IO.outputting())
+      Section.reset(new WasmYAML::ImportSection());
+    sectionMapping(IO, *cast<WasmYAML::ImportSection>(Section.get()));
+    break;
+  case wasm::WASM_SEC_FUNCTION:
+    if (!IO.outputting())
+      Section.reset(new WasmYAML::FunctionSection());
+    sectionMapping(IO, *cast<WasmYAML::FunctionSection>(Section.get()));
+    break;
+  case wasm::WASM_SEC_TABLE:
+    if (!IO.outputting())
+      Section.reset(new WasmYAML::TableSection());
+    sectionMapping(IO, *cast<WasmYAML::TableSection>(Section.get()));
+    break;
+  case wasm::WASM_SEC_MEMORY:
+    if (!IO.outputting())
+      Section.reset(new WasmYAML::MemorySection());
+    sectionMapping(IO, *cast<WasmYAML::MemorySection>(Section.get()));
+    break;
+  case wasm::WASM_SEC_GLOBAL:
+    if (!IO.outputting())
+      Section.reset(new WasmYAML::GlobalSection());
+    sectionMapping(IO, *cast<WasmYAML::GlobalSection>(Section.get()));
+    break;
+  case wasm::WASM_SEC_EXPORT:
+    if (!IO.outputting())
+      Section.reset(new WasmYAML::ExportSection());
+    sectionMapping(IO, *cast<WasmYAML::ExportSection>(Section.get()));
+    break;
+  case wasm::WASM_SEC_START:
+    if (!IO.outputting())
+      Section.reset(new WasmYAML::StartSection());
+    sectionMapping(IO, *cast<WasmYAML::StartSection>(Section.get()));
+    break;
+  case wasm::WASM_SEC_ELEM:
+    if (!IO.outputting())
+      Section.reset(new WasmYAML::ElemSection());
+    sectionMapping(IO, *cast<WasmYAML::ElemSection>(Section.get()));
+    break;
+  case wasm::WASM_SEC_CODE:
+    if (!IO.outputting())
+      Section.reset(new WasmYAML::CodeSection());
+    sectionMapping(IO, *cast<WasmYAML::CodeSection>(Section.get()));
+    break;
+  case wasm::WASM_SEC_DATA:
+    if (!IO.outputting())
+      Section.reset(new WasmYAML::DataSection());
+    sectionMapping(IO, *cast<WasmYAML::DataSection>(Section.get()));
+    break;
+  default:
+    llvm_unreachable("Unknown section type");
+  }
+}
+
+void ScalarEnumerationTraits<WasmYAML::SectionType>::enumeration(
+    IO &IO, WasmYAML::SectionType &Type) {
+#define ECase(X) IO.enumCase(Type, #X, wasm::WASM_SEC_##X)
+  ECase(CUSTOM);
+  ECase(TYPE);
+  ECase(IMPORT);
+  ECase(FUNCTION);
+  ECase(TABLE);
+  ECase(MEMORY);
+  ECase(GLOBAL);
+  ECase(EXPORT);
+  ECase(START);
+  ECase(ELEM);
+  ECase(CODE);
+  ECase(DATA);
+#undef ECase
+}
+
+void MappingTraits<WasmYAML::Signature>::mapping(
+    IO &IO, WasmYAML::Signature &Signature) {
+  IO.mapOptional("Index", Signature.Index);
+  IO.mapRequired("ReturnType", Signature.ReturnType);
+  IO.mapRequired("ParamTypes", Signature.ParamTypes);
+}
+
+void MappingTraits<WasmYAML::Table>::mapping(IO &IO, WasmYAML::Table &Table) {
+  IO.mapRequired("ElemType", Table.ElemType);
+  IO.mapRequired("Limits", Table.TableLimits);
+}
+
+void MappingTraits<WasmYAML::Function>::mapping(IO &IO,
+                                                WasmYAML::Function &Function) {
+  IO.mapRequired("Locals", Function.Locals);
+  IO.mapRequired("Body", Function.Body);
+}
+
+void MappingTraits<WasmYAML::Relocation>::mapping(
+    IO &IO, WasmYAML::Relocation &Relocation) {
+  IO.mapRequired("Type", Relocation.Type);
+  IO.mapRequired("Index", Relocation.Index);
+  IO.mapRequired("Offset", Relocation.Offset);
+  IO.mapRequired("Addend", Relocation.Addend);
+}
+
+void MappingTraits<WasmYAML::LocalDecl>::mapping(
+    IO &IO, WasmYAML::LocalDecl &LocalDecl) {
+  IO.mapRequired("Type", LocalDecl.Type);
+  IO.mapRequired("Count", LocalDecl.Count);
+}
+
+void MappingTraits<WasmYAML::Limits>::mapping(IO &IO,
+                                              WasmYAML::Limits &Limits) {
+  if (!IO.outputting() || Limits.Flags)
+    IO.mapOptional("Flags", Limits.Flags);
+  IO.mapRequired("Initial", Limits.Initial);
+  if (!IO.outputting() || Limits.Flags & wasm::WASM_LIMITS_FLAG_HAS_MAX)
+    IO.mapOptional("Maximum", Limits.Maximum);
+}
+
+void MappingTraits<WasmYAML::ElemSegment>::mapping(
+    IO &IO, WasmYAML::ElemSegment &Segment) {
+  IO.mapRequired("Offset", Segment.Offset);
+  IO.mapRequired("Functions", Segment.Functions);
+}
+
+void MappingTraits<WasmYAML::Import>::mapping(IO &IO,
+                                              WasmYAML::Import &Import) {
+  IO.mapRequired("Module", Import.Module);
+  IO.mapRequired("Field", Import.Field);
+  IO.mapRequired("Kind", Import.Kind);
+  if (Import.Kind == wasm::WASM_EXTERNAL_FUNCTION) {
+    IO.mapRequired("SigIndex", Import.SigIndex);
+  } else if (Import.Kind == wasm::WASM_EXTERNAL_GLOBAL) {
+    IO.mapRequired("GlobalType", Import.GlobalType);
+    IO.mapRequired("GlobalMutable", Import.GlobalMutable);
+  } else {
+    llvm_unreachable("unhandled import type");
+  }
+}
+
+void MappingTraits<WasmYAML::Export>::mapping(IO &IO,
+                                              WasmYAML::Export &Export) {
+  IO.mapRequired("Name", Export.Name);
+  IO.mapRequired("Kind", Export.Kind);
+  IO.mapRequired("Index", Export.Index);
+}
+
+void MappingTraits<WasmYAML::Global>::mapping(IO &IO,
+                                              WasmYAML::Global &Global) {
+  IO.mapRequired("Type", Global.Type);
+  IO.mapRequired("Mutable", Global.Mutable);
+  IO.mapRequired("InitExpr", Global.InitExpr);
+}
+
+void MappingTraits<wasm::WasmInitExpr>::mapping(IO &IO,
+                                                wasm::WasmInitExpr &Expr) {
+  WasmYAML::Opcode Op = Expr.Opcode;
+  IO.mapRequired("Opcode", Op);
+  Expr.Opcode = Op;
+  switch (Expr.Opcode) {
+  case wasm::WASM_OPCODE_I32_CONST:
+    IO.mapRequired("Value", Expr.Value.Int32);
+    break;
+  case wasm::WASM_OPCODE_I64_CONST:
+    IO.mapRequired("Value", Expr.Value.Int64);
+    break;
+  case wasm::WASM_OPCODE_F32_CONST:
+    IO.mapRequired("Value", Expr.Value.Float32);
+    break;
+  case wasm::WASM_OPCODE_F64_CONST:
+    IO.mapRequired("Value", Expr.Value.Float64);
+    break;
+  }
+}
+
+void MappingTraits<WasmYAML::DataSegment>::mapping(
+    IO &IO, WasmYAML::DataSegment &Segment) {
+  IO.mapRequired("Index", Segment.Index);
+  IO.mapRequired("Offset", Segment.Offset);
+  IO.mapRequired("Content", Segment.Content);
+}
+
+void ScalarEnumerationTraits<WasmYAML::ValueType>::enumeration(
+    IO &IO, WasmYAML::ValueType &Type) {
+#define ECase(X) IO.enumCase(Type, #X, wasm::WASM_TYPE_##X)
+  ECase(I32);
+  ECase(I64);
+  ECase(F32);
+  ECase(F64);
+  ECase(ANYFUNC);
+  ECase(FUNC);
+  ECase(NORESULT);
+#undef ECase
+}
+
+void ScalarEnumerationTraits<WasmYAML::ExportKind>::enumeration(
+    IO &IO, WasmYAML::ExportKind &Kind) {
+#define ECase(X) IO.enumCase(Kind, #X, wasm::WASM_EXTERNAL_##X)
+  ECase(FUNCTION);
+  ECase(TABLE);
+  ECase(MEMORY);
+  ECase(GLOBAL);
+#undef ECase
+}
+
+void ScalarEnumerationTraits<WasmYAML::Opcode>::enumeration(
+    IO &IO, WasmYAML::Opcode &Code) {
+#define ECase(X) IO.enumCase(Code, #X, wasm::WASM_OPCODE_##X)
+  ECase(END);
+  ECase(I32_CONST);
+  ECase(I64_CONST);
+  ECase(F64_CONST);
+  ECase(F32_CONST);
+  ECase(GET_GLOBAL);
+#undef ECase
+}
+
+void ScalarEnumerationTraits<WasmYAML::TableType>::enumeration(
+    IO &IO, WasmYAML::TableType &Type) {
+#define ECase(X) IO.enumCase(Type, #X, wasm::WASM_TYPE_##X)
+  ECase(ANYFUNC);
+#undef ECase
+}
+
+void ScalarEnumerationTraits<WasmYAML::RelocType>::enumeration(
+    IO &IO, WasmYAML::RelocType &Type) {
+#define WASM_RELOC(name, value) IO.enumCase(Type, #name, wasm::name);
+#include "llvm/Support/WasmRelocs/WebAssembly.def"
+#undef WASM_RELOC
+}
+
+} // end namespace yaml
+} // end namespace llvm
diff --git a/lib/Passes/PassBuilder.cpp b/lib/Passes/PassBuilder.cpp
index efdb54adf0300807892975b704aee7c64dccc4e3..0421946a32a69527f8b9c15a24bbbc4c75d23a95 100644
--- a/lib/Passes/PassBuilder.cpp
+++ b/lib/Passes/PassBuilder.cpp
@@ -39,6 +39,7 @@
 #include "llvm/Analysis/LoopAccessAnalysis.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/MemorySSA.h"
 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
 #include "llvm/Analysis/PostDominators.h"
@@ -61,6 +62,7 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Transforms/GCOVProfiler.h"
 #include "llvm/Transforms/IPO/AlwaysInliner.h"
+#include "llvm/Transforms/IPO/ArgumentPromotion.h"
 #include "llvm/Transforms/IPO/ConstantMerge.h"
 #include "llvm/Transforms/IPO/CrossDSOCFI.h"
 #include "llvm/Transforms/IPO/DeadArgumentElimination.h"
@@ -134,8 +136,8 @@
 #include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/LowerInvoke.h"
 #include "llvm/Transforms/Utils/Mem2Reg.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
 #include "llvm/Transforms/Utils/NameAnonGlobals.h"
+#include "llvm/Transforms/Utils/PredicateInfo.h"
 #include "llvm/Transforms/Utils/SimplifyInstructions.h"
 #include "llvm/Transforms/Utils/SymbolRewriter.h"
 #include "llvm/Transforms/Vectorize/LoopVectorize.h"
@@ -145,6 +147,9 @@
 
 using namespace llvm;
 
+static cl::opt<unsigned> MaxDevirtIterations("pm-max-devirt-iterations",
+                                             cl::ReallyHidden, cl::init(4));
+
 static Regex DefaultAliasRegex("^(default|lto-pre-link|lto)<(O[0123sz])>$");
 
 static bool isOptimizingForSize(PassBuilder::OptimizationLevel Level) {
@@ -329,8 +334,11 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   LPM2.addPass(IndVarSimplifyPass());
   LPM2.addPass(LoopIdiomRecognizePass());
   LPM2.addPass(LoopDeletionPass());
-  LPM2.addPass(LoopUnrollPass::createFull());
+  LPM2.addPass(LoopUnrollPass::createFull(Level));
 
+  // We provide the opt remark emitter pass for LICM to use. We only need to do
+  // this once as it is immutable.
+  FPM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
   FPM.addPass(createFunctionToLoopPassAdaptor(std::move(LPM1)));
   FPM.addPass(SimplifyCFGPass());
   FPM.addPass(InstCombinePass());
@@ -376,6 +384,56 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
   return FPM;
 }
 
+static void addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging,
+                              PassBuilder::OptimizationLevel Level,
+                              bool RunProfileGen, std::string ProfileGenFile,
+                              std::string ProfileUseFile) {
+  // Generally running simplification passes and the inliner with a high
+  // threshold results in smaller executables, but there may be cases where
+  // the size grows, so let's be conservative here and skip this simplification
+  // at -Os/Oz.
+  if (!isOptimizingForSize(Level)) {
+    InlineParams IP;
+
+    // In the old pass manager, this is a cl::opt. Should this still be one?
+    IP.DefaultThreshold = 75;
+
+    // FIXME: The hint threshold has the same value used by the regular inliner.
+    // This should probably be lowered after performance testing.
+    // FIXME: This comment is cargo-culted from the old pass manager; revisit.
+    IP.HintThreshold = 325;
+
+    CGSCCPassManager CGPipeline(DebugLogging);
+
+    CGPipeline.addPass(InlinerPass(IP));
+
+    FunctionPassManager FPM;
+    FPM.addPass(SROA());
+    FPM.addPass(EarlyCSEPass());    // Catch trivial redundancies.
+    FPM.addPass(SimplifyCFGPass()); // Merge & remove basic blocks.
+    FPM.addPass(InstCombinePass()); // Combine silly sequences.
+
+    // FIXME: Here the old pass manager inserts peephole extensions.
+    // Add them when they're supported.
+    CGPipeline.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
+
+    MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPipeline)));
+  }
+
+  if (RunProfileGen) {
+    MPM.addPass(PGOInstrumentationGen());
+
+    // Add the profile lowering pass.
+    InstrProfOptions Options;
+    if (!ProfileGenFile.empty())
+      Options.InstrProfileOutput = ProfileGenFile;
+    MPM.addPass(InstrProfiling(Options));
+  }
+
+  if (!ProfileUseFile.empty())
+    MPM.addPass(PGOInstrumentationUse(ProfileUseFile));
+}
+
 ModulePassManager
 PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
                                            bool DebugLogging) {
@@ -426,10 +484,20 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
   GlobalCleanupPM.addPass(SimplifyCFGPass());
   MPM.addPass(createModuleToFunctionPassAdaptor(std::move(GlobalCleanupPM)));
 
-  // FIXME: Enable this when cross-IR-unit analysis invalidation is working.
-#if 0
-  MPM.addPass(RequireAnalysisPass<GlobalsAA>());
-#endif
+  // Add all the requested passes for PGO Instrumentation, if requested.
+  if (PGOOpt) {
+    assert(PGOOpt->RunProfileGen || PGOOpt->SamplePGO ||
+           !PGOOpt->ProfileUseFile.empty());
+    addPGOInstrPasses(MPM, DebugLogging, Level, PGOOpt->RunProfileGen,
+                      PGOOpt->ProfileGenFile, PGOOpt->ProfileUseFile);
+  }
+
+  // Indirect call promotion that promotes intra-module targets only.
+  MPM.addPass(PGOIndirectCallPromotion(false, PGOOpt && PGOOpt->SamplePGO));
+
+  // Require the GlobalsAA analysis for the module so we can query it within
+  // the CGSCC pipeline.
+  MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>());
 
   // Now begin the main postorder CGSCC pipeline.
   // FIXME: The current CGSCC pipeline has its origins in the legacy pass
@@ -451,13 +519,24 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
   // Now deduce any function attributes based in the current code.
   MainCGPipeline.addPass(PostOrderFunctionAttrsPass());
 
+  // When at O3 add argument promotion to the pass pipeline.
+  // FIXME: It isn't at all clear why this should be limited to O3.
+  if (Level == O3)
+    MainCGPipeline.addPass(ArgumentPromotionPass());
+
   // Lastly, add the core function simplification pipeline nested inside the
   // CGSCC walk.
   MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
       buildFunctionSimplificationPipeline(Level, DebugLogging)));
 
+  // We wrap the CGSCC pipeline in a devirtualization repeater. This will try
+  // to detect when we devirtualize indirect calls and iterate the SCC passes
+  // in that case to try and catch knock-on inlining or function attrs
+  // opportunities. Then we add it to the module pipeline by walking the SCCs
+  // in postorder (or bottom-up).
   MPM.addPass(
-      createModuleToPostOrderCGSCCPassAdaptor(std::move(MainCGPipeline)));
+      createModuleToPostOrderCGSCCPassAdaptor(createDevirtSCCRepeatedPass(
+          std::move(MainCGPipeline), MaxDevirtIterations, DebugLogging)));
 
   // This ends the canonicalization and simplification phase of the pipeline.
   // At this point, we expect to have canonical and simple IR which we begin
@@ -472,17 +551,14 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
   // FIXME: Is this really an optimization rather than a canonicalization?
   MPM.addPass(ReversePostOrderFunctionAttrsPass());
 
-  // Recompute GloblasAA here prior to function passes. This is particularly
+  // Re-require GlobalsAA here prior to function passes. This is particularly
   // useful as the above will have inlined, DCE'ed, and function-attr
   // propagated everything. We should at this point have a reasonably minimal
   // and richly annotated call graph. By computing aliasing and mod/ref
   // information for all local globals here, the late loop passes and notably
   // the vectorizer will be able to use them to help recognize vectorizable
   // memory operations.
-  // FIXME: Enable this once analysis invalidation is fully supported.
-#if 0
-  MPM.addPass(Require<GlobalsAA>());
-#endif
+  MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>());
 
   FunctionPassManager OptimizePM(DebugLogging);
   OptimizePM.addPass(Float2IntPass());
@@ -530,8 +606,9 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
   // FIXME: It would be really good to use a loop-integrated instruction
   // combiner for cleanup here so that the unrolling and LICM can be pipelined
   // across the loop nests.
-  OptimizePM.addPass(createFunctionToLoopPassAdaptor(LoopUnrollPass::create()));
+  OptimizePM.addPass(createFunctionToLoopPassAdaptor(LoopUnrollPass::create(Level)));
   OptimizePM.addPass(InstCombinePass());
+  OptimizePM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
   OptimizePM.addPass(createFunctionToLoopPassAdaptor(LICMPass()));
 
   // Now that we've vectorized and unrolled loops, we may have more refined
@@ -589,7 +666,8 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
     // left by the earlier promotion pass that promotes intra-module targets.
     // This two-step promotion is to save the compile time. For LTO, it should
     // produce the same result as if we only do promotion here.
-    MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */));
+    MPM.addPass(PGOIndirectCallPromotion(true /* InLTO */,
+                                         PGOOpt && PGOOpt->SamplePGO));
 
     // Propagate constants at call sites into the functions they call.  This
     // opens opportunities for globalopt (and inlining) by substituting function
@@ -717,8 +795,8 @@ ModulePassManager PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
   // CFI is disabled.
   // Enable once we add support for the summary in the new PM.
 #if 0
-  MPM.addPass(LowerTypeTestsPass(Summary ? LowerTypeTestsSummaryAction::Export :
-                                           LowerTypeTestsSummaryAction::None,
+  MPM.addPass(LowerTypeTestsPass(Summary ? PassSummaryAction::Export :
+                                           PassSummaryAction::None,
                                 Summary));
 #endif
 
@@ -755,12 +833,8 @@ AAManager PassBuilder::buildDefaultAAPipeline() {
   // Add support for querying global aliasing information when available.
   // Because the `AAManager` is a function analysis and `GlobalsAA` is a module
   // analysis, all that the `AAManager` can do is query for any *cached*
-  // results from `GlobalsAA` through a readonly proxy..
-#if 0
-  // FIXME: Enable once the invalidation logic supports this. Currently, the
-  // `AAManager` will hold stale references to the module analyses.
+  // results from `GlobalsAA` through a readonly proxy.
   AA.registerModuleAnalysis<GlobalsAA>();
-#endif
 
   return AA;
 }
diff --git a/lib/Passes/PassRegistry.def b/lib/Passes/PassRegistry.def
index 5b2e63e850109aa3ef25b7232c92fe6a6ef37881..efd4c097a67572db99561aad063158e1651d8c7f 100644
--- a/lib/Passes/PassRegistry.def
+++ b/lib/Passes/PassRegistry.def
@@ -85,6 +85,7 @@ CGSCC_ANALYSIS("fam-proxy", FunctionAnalysisManagerCGSCCProxy())
 #ifndef CGSCC_PASS
 #define CGSCC_PASS(NAME, CREATE_PASS)
 #endif
+CGSCC_PASS("argpromotion", ArgumentPromotionPass())
 CGSCC_PASS("invalidate<all>", InvalidateAllAnalysesPass())
 CGSCC_PASS("function-attrs", PostOrderFunctionAttrsPass())
 CGSCC_PASS("inline", InlinerPass())
@@ -173,6 +174,7 @@ FUNCTION_PASS("loop-data-prefetch", LoopDataPrefetchPass())
 FUNCTION_PASS("loop-load-elim", LoopLoadEliminationPass())
 FUNCTION_PASS("loop-distribute", LoopDistributePass())
 FUNCTION_PASS("loop-vectorize", LoopVectorizePass())
+FUNCTION_PASS("pgo-memop-opt", PGOMemOPSizeOpt())
 FUNCTION_PASS("print", PrintFunctionPass(dbgs()))
 FUNCTION_PASS("print<assumptions>", AssumptionPrinterPass(dbgs()))
 FUNCTION_PASS("print<block-freq>", BlockFrequencyPrinterPass(dbgs()))
diff --git a/lib/ProfileData/Coverage/CoverageMapping.cpp b/lib/ProfileData/Coverage/CoverageMapping.cpp
index 6d907c7098e0e0332476985a9f1f67ec28259814..23999a5312c73c0457660828e611cbdab4befb6f 100644
--- a/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -1,4 +1,4 @@
-//=-- CoverageMapping.cpp - Code coverage mapping support ---------*- C++ -*-=//
+//===- CoverageMapping.cpp - Code coverage mapping support ------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,18 +12,32 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ProfileData/Coverage/CoverageMapping.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
 #include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ProfileData/Coverage/CoverageMapping.h"
 #include "llvm/ProfileData/Coverage/CoverageMappingReader.h"
 #include "llvm/ProfileData/InstrProfReader.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ManagedStatic.h"
-#include "llvm/Support/Path.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <string>
+#include <system_error>
+#include <utility>
+#include <vector>
 
 using namespace llvm;
 using namespace coverage;
@@ -59,7 +73,7 @@ void CounterExpressionBuilder::extractTerms(
 
 Counter CounterExpressionBuilder::simplify(Counter ExpressionTree) {
   // Gather constant terms.
-  llvm::SmallVector<std::pair<unsigned, int>, 32> Terms;
+  SmallVector<std::pair<unsigned, int>, 32> Terms;
   extractTerms(ExpressionTree, +1, Terms);
 
   // If there are no terms, this is just a zero. The algorithm below assumes at
@@ -120,8 +134,7 @@ Counter CounterExpressionBuilder::subtract(Counter LHS, Counter RHS) {
       get(CounterExpression(CounterExpression::Subtract, LHS, RHS)));
 }
 
-void CounterMappingContext::dump(const Counter &C,
-                                 llvm::raw_ostream &OS) const {
+void CounterMappingContext::dump(const Counter &C, raw_ostream &OS) const {
   switch (C.getKind()) {
   case Counter::Zero:
     OS << '0';
@@ -145,7 +158,7 @@ void CounterMappingContext::dump(const Counter &C,
     return;
   Expected<int64_t> Value = evaluate(C);
   if (auto E = Value.takeError()) {
-    llvm::consumeError(std::move(E));
+    consumeError(std::move(E));
     return;
   }
   OS << '[' << *Value << ']';
@@ -217,7 +230,7 @@ Error CoverageMapping::loadFunctionRecord(
   for (const auto &Region : Record.MappingRegions) {
     Expected<int64_t> ExecutionCount = Ctx.evaluate(Region.Count);
     if (auto E = ExecutionCount.takeError()) {
-      llvm::consumeError(std::move(E));
+      consumeError(std::move(E));
       return Error::success();
     }
     Function.pushRegion(Region, *ExecutionCount);
@@ -281,6 +294,7 @@ CoverageMapping::load(ArrayRef<StringRef> ObjectFilenames,
 }
 
 namespace {
+
 /// \brief Distributes functions into instantiation sets.
 ///
 /// An instantiation set is a collection of functions that have the same source
@@ -326,7 +340,7 @@ class SegmentBuilder {
       Segments.pop_back();
     DEBUG(dbgs() << "Segment at " << Line << ":" << Col);
     // Set this region's count.
-    if (Region.Kind != coverage::CounterMappingRegion::SkippedRegion) {
+    if (Region.Kind != CounterMappingRegion::SkippedRegion) {
       DEBUG(dbgs() << " with count " << Region.ExecutionCount);
       Segments.emplace_back(Line, Col, Region.ExecutionCount, IsRegionEntry);
     } else
@@ -380,10 +394,10 @@ class SegmentBuilder {
       // in combineRegions(). Because we accumulate counter values only from
       // regions of the same kind as the first region of the area, prefer
       // CodeRegion to ExpansionRegion and ExpansionRegion to SkippedRegion.
-      static_assert(coverage::CounterMappingRegion::CodeRegion <
-                            coverage::CounterMappingRegion::ExpansionRegion &&
-                        coverage::CounterMappingRegion::ExpansionRegion <
-                            coverage::CounterMappingRegion::SkippedRegion,
+      static_assert(CounterMappingRegion::CodeRegion <
+                            CounterMappingRegion::ExpansionRegion &&
+                        CounterMappingRegion::ExpansionRegion <
+                            CounterMappingRegion::SkippedRegion,
                     "Unexpected order of region kind values");
       return LHS.Kind < RHS.Kind;
     });
@@ -437,7 +451,8 @@ public:
     return Segments;
   }
 };
-}
+
+} // end anonymous namespace
 
 std::vector<StringRef> CoverageMapping::getUniqueSourceFiles() const {
   std::vector<StringRef> Filenames;
@@ -487,7 +502,7 @@ static bool isExpansion(const CountedRegion &R, unsigned FileID) {
 
 CoverageData CoverageMapping::getCoverageForFile(StringRef Filename) const {
   CoverageData FileCoverage(Filename);
-  std::vector<coverage::CountedRegion> Regions;
+  std::vector<CountedRegion> Regions;
 
   for (const auto &Function : Functions) {
     auto MainFileID = findMainViewFileID(Filename, Function);
@@ -533,7 +548,7 @@ CoverageMapping::getCoverageForFunction(const FunctionRecord &Function) const {
     return CoverageData();
 
   CoverageData FunctionCoverage(Function.Filenames[*MainFileID]);
-  std::vector<coverage::CountedRegion> Regions;
+  std::vector<CountedRegion> Regions;
   for (const auto &CR : Function.CountedRegions)
     if (CR.FileID == *MainFileID) {
       Regions.push_back(CR);
@@ -551,7 +566,7 @@ CoverageData CoverageMapping::getCoverageForExpansion(
     const ExpansionRecord &Expansion) const {
   CoverageData ExpansionCoverage(
       Expansion.Function.Filenames[Expansion.FileID]);
-  std::vector<coverage::CountedRegion> Regions;
+  std::vector<CountedRegion> Regions;
   for (const auto &CR : Expansion.Function.CountedRegions)
     if (CR.FileID == Expansion.FileID) {
       Regions.push_back(CR);
@@ -566,8 +581,7 @@ CoverageData CoverageMapping::getCoverageForExpansion(
   return ExpansionCoverage;
 }
 
-namespace {
-std::string getCoverageMapErrString(coveragemap_error Err) {
+static std::string getCoverageMapErrString(coveragemap_error Err) {
   switch (Err) {
   case coveragemap_error::success:
     return "Success";
@@ -585,6 +599,8 @@ std::string getCoverageMapErrString(coveragemap_error Err) {
   llvm_unreachable("A value of coveragemap_error has no message.");
 }
 
+namespace {
+
 // FIXME: This class is only here to support the transition to llvm::Error. It
 // will be removed once this transition is complete. Clients should prefer to
 // deal with the Error value directly, rather than converting to error_code.
@@ -594,6 +610,7 @@ class CoverageMappingErrorCategoryType : public std::error_category {
     return getCoverageMapErrString(static_cast<coveragemap_error>(IE));
   }
 };
+
 } // end anonymous namespace
 
 std::string CoverageMapError::message() const {
diff --git a/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/lib/ProfileData/Coverage/CoverageMappingReader.cpp
index a6c7031ccd3d204dd7672a2c63743ad8a64c3ae0..05c5b28d7a0765d7e90a2ff52a9f1ce0eef3e271 100644
--- a/lib/ProfileData/Coverage/CoverageMappingReader.cpp
+++ b/lib/ProfileData/Coverage/CoverageMappingReader.cpp
@@ -1,4 +1,4 @@
-//=-- CoverageMappingReader.cpp - Code coverage mapping reader ----*- C++ -*-=//
+//===- CoverageMappingReader.cpp - Code coverage mapping reader -*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,15 +12,34 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ProfileData/Coverage/CoverageMappingReader.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h" 
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/Error.h"
 #include "llvm/Object/MachOUniversal.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/ProfileData/Coverage/CoverageMappingReader.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <utility>
+#include <vector>
 
 using namespace llvm;
 using namespace coverage;
@@ -226,9 +245,8 @@ Error RawCoverageMappingReader::readMappingRegionsSubArray(
 }
 
 Error RawCoverageMappingReader::read() {
-
   // Read the virtual file mapping.
-  llvm::SmallVector<unsigned, 8> VirtualFileMapping;
+  SmallVector<unsigned, 8> VirtualFileMapping;
   uint64_t NumFileMappings;
   if (auto Err = readSize(NumFileMappings))
     return Err;
@@ -349,7 +367,10 @@ static Expected<bool> isCoverageMappingDummy(uint64_t Hash, StringRef Mapping) {
 }
 
 namespace {
+
 struct CovMapFuncRecordReader {
+  virtual ~CovMapFuncRecordReader() = default;
+
   // The interface to read coverage mapping function records for a module.
   //
   // \p Buf points to the buffer containing the \c CovHeader of the coverage
@@ -359,26 +380,24 @@ struct CovMapFuncRecordReader {
   // greater than \p End if not.
   virtual Expected<const char *> readFunctionRecords(const char *Buf,
                                                      const char *End) = 0;
-  virtual ~CovMapFuncRecordReader() {}
+
   template <class IntPtrT, support::endianness Endian>
   static Expected<std::unique_ptr<CovMapFuncRecordReader>>
-  get(coverage::CovMapVersion Version, InstrProfSymtab &P,
+  get(CovMapVersion Version, InstrProfSymtab &P,
       std::vector<BinaryCoverageReader::ProfileMappingRecord> &R,
       std::vector<StringRef> &F);
 };
 
 // A class for reading coverage mapping function records for a module.
-template <coverage::CovMapVersion Version, class IntPtrT,
-          support::endianness Endian>
+template <CovMapVersion Version, class IntPtrT, support::endianness Endian>
 class VersionedCovMapFuncRecordReader : public CovMapFuncRecordReader {
-  typedef typename coverage::CovMapTraits<
+  typedef typename CovMapTraits<
       Version, IntPtrT>::CovMapFuncRecordType FuncRecordType;
-  typedef typename coverage::CovMapTraits<Version, IntPtrT>::NameRefType
-      NameRefType;
+  typedef typename CovMapTraits<Version, IntPtrT>::NameRefType  NameRefType;
 
   // Maps function's name references to the indexes of their records
   // in \c Records.
-  llvm::DenseMap<NameRefType, size_t> FunctionRecords;
+  DenseMap<NameRefType, size_t> FunctionRecords;
   InstrProfSymtab &ProfileNames;
   std::vector<StringRef> &Filenames;
   std::vector<BinaryCoverageReader::ProfileMappingRecord> &Records;
@@ -432,14 +451,16 @@ public:
       std::vector<BinaryCoverageReader::ProfileMappingRecord> &R,
       std::vector<StringRef> &F)
       : ProfileNames(P), Filenames(F), Records(R) {}
-  ~VersionedCovMapFuncRecordReader() override {}
+
+  ~VersionedCovMapFuncRecordReader() override = default;
 
   Expected<const char *> readFunctionRecords(const char *Buf,
                                              const char *End) override {
     using namespace support;
+
     if (Buf + sizeof(CovMapHeader) > End)
       return make_error<CoverageMapError>(coveragemap_error::malformed);
-    auto CovHeader = reinterpret_cast<const coverage::CovMapHeader *>(Buf);
+    auto CovHeader = reinterpret_cast<const CovMapHeader *>(Buf);
     uint32_t NRecords = CovHeader->getNRecords<Endian>();
     uint32_t FilenamesSize = CovHeader->getFilenamesSize<Endian>();
     uint32_t CoverageSize = CovHeader->getCoverageSize<Endian>();
@@ -490,14 +511,16 @@ public:
     return Buf;
   }
 };
+
 } // end anonymous namespace
 
 template <class IntPtrT, support::endianness Endian>
 Expected<std::unique_ptr<CovMapFuncRecordReader>> CovMapFuncRecordReader::get(
-    coverage::CovMapVersion Version, InstrProfSymtab &P,
+    CovMapVersion Version, InstrProfSymtab &P,
     std::vector<BinaryCoverageReader::ProfileMappingRecord> &R,
     std::vector<StringRef> &F) {
   using namespace coverage;
+
   switch (Version) {
   case CovMapVersion::Version1:
     return llvm::make_unique<VersionedCovMapFuncRecordReader<
@@ -518,11 +541,12 @@ static Error readCoverageMappingData(
     std::vector<BinaryCoverageReader::ProfileMappingRecord> &Records,
     std::vector<StringRef> &Filenames) {
   using namespace coverage;
+
   // Read the records in the coverage data section.
   auto CovHeader =
-      reinterpret_cast<const coverage::CovMapHeader *>(Data.data());
+      reinterpret_cast<const CovMapHeader *>(Data.data());
   CovMapVersion Version = (CovMapVersion)CovHeader->getVersion<Endian>();
-  if (Version > coverage::CovMapVersion::CurrentVersion)
+  if (Version > CovMapVersion::CurrentVersion)
     return make_error<CoverageMapError>(coveragemap_error::unsupported_version);
   Expected<std::unique_ptr<CovMapFuncRecordReader>> ReaderExpected =
       CovMapFuncRecordReader::get<T, Endian>(Version, ProfileNames, Records,
@@ -538,6 +562,7 @@ static Error readCoverageMappingData(
   }
   return Error::success();
 }
+
 static const char *TestingFormatMagic = "llvmcovmtestdata";
 
 static Error loadTestingFormat(StringRef Data, InstrProfSymtab &ProfileNames,
@@ -595,21 +620,21 @@ static Error loadBinaryFormat(MemoryBufferRef ObjectBuffer,
                               StringRef &CoverageMapping,
                               uint8_t &BytesInAddress,
                               support::endianness &Endian, StringRef Arch) {
-  auto BinOrErr = object::createBinary(ObjectBuffer);
+  auto BinOrErr = createBinary(ObjectBuffer);
   if (!BinOrErr)
     return BinOrErr.takeError();
   auto Bin = std::move(BinOrErr.get());
   std::unique_ptr<ObjectFile> OF;
-  if (auto *Universal = dyn_cast<object::MachOUniversalBinary>(Bin.get())) {
+  if (auto *Universal = dyn_cast<MachOUniversalBinary>(Bin.get())) {
     // If we have a universal binary, try to look up the object for the
     // appropriate architecture.
     auto ObjectFileOrErr = Universal->getObjectForArch(Arch);
     if (!ObjectFileOrErr)
       return ObjectFileOrErr.takeError();
     OF = std::move(ObjectFileOrErr.get());
-  } else if (isa<object::ObjectFile>(Bin.get())) {
+  } else if (isa<ObjectFile>(Bin.get())) {
     // For any other object file, upcast and take ownership.
-    OF.reset(cast<object::ObjectFile>(Bin.release()));
+    OF.reset(cast<ObjectFile>(Bin.release()));
     // If we've asked for a particular arch, make sure they match.
     if (!Arch.empty() && OF->getArch() != Triple(Arch).getArch())
       return errorCodeToError(object_error::arch_not_found);
diff --git a/lib/ProfileData/Coverage/CoverageMappingWriter.cpp b/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
index 82356333b937969cc57665b7d990b4cf2908a2f9..f131be2cba4929b4697b035e1201f6dc033ce1e9 100644
--- a/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
+++ b/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
@@ -1,4 +1,4 @@
-//=-- CoverageMappingWriter.cpp - Code coverage mapping writer -------------=//
+//===- CoverageMappingWriter.cpp - Code coverage mapping writer -----------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,8 +12,15 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ProfileData/Coverage/CoverageMappingWriter.h"
 #include "llvm/Support/LEB128.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <limits>
+#include <vector>
 
 using namespace llvm;
 using namespace coverage;
@@ -27,14 +34,25 @@ void CoverageFilenamesSectionWriter::write(raw_ostream &OS) {
 }
 
 namespace {
+
 /// \brief Gather only the expressions that are used by the mapping
 /// regions in this function.
 class CounterExpressionsMinimizer {
   ArrayRef<CounterExpression> Expressions;
-  llvm::SmallVector<CounterExpression, 16> UsedExpressions;
+  SmallVector<CounterExpression, 16> UsedExpressions;
   std::vector<unsigned> AdjustedExpressionIDs;
 
 public:
+  CounterExpressionsMinimizer(ArrayRef<CounterExpression> Expressions,
+                              ArrayRef<CounterMappingRegion> MappingRegions)
+      : Expressions(Expressions) {
+    AdjustedExpressionIDs.resize(Expressions.size(), 0);
+    for (const auto &I : MappingRegions)
+      mark(I.Count);
+    for (const auto &I : MappingRegions)
+      gatherUsed(I.Count);
+  }
+
   void mark(Counter C) {
     if (!C.isExpression())
       return;
@@ -54,16 +72,6 @@ public:
     gatherUsed(E.RHS);
   }
 
-  CounterExpressionsMinimizer(ArrayRef<CounterExpression> Expressions,
-                              ArrayRef<CounterMappingRegion> MappingRegions)
-      : Expressions(Expressions) {
-    AdjustedExpressionIDs.resize(Expressions.size(), 0);
-    for (const auto &I : MappingRegions)
-      mark(I.Count);
-    for (const auto &I : MappingRegions)
-      gatherUsed(I.Count);
-  }
-
   ArrayRef<CounterExpression> getExpressions() const { return UsedExpressions; }
 
   /// \brief Adjust the given counter to correctly transition from the old
@@ -74,7 +82,8 @@ public:
     return C;
   }
 };
-}
+
+} // end anonymous namespace
 
 /// \brief Encode the counter.
 ///
diff --git a/lib/ProfileData/InstrProf.cpp b/lib/ProfileData/InstrProf.cpp
index ccc99ab430279b81152481dfba659e6a49355efd..0ec3fce4b2377da275fea8959ab7cacb730841af 100644
--- a/lib/ProfileData/InstrProf.cpp
+++ b/lib/ProfileData/InstrProf.cpp
@@ -1,4 +1,4 @@
-//=-- InstrProf.cpp - Instrumented profiling format support -----------------=//
+//===- InstrProf.cpp - Instrumented profiling format support --------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,29 +12,68 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
 #include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Compression.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/Path.h"
+#include "llvm/Support/SwapByteOrder.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstring>
+#include <cstdint>
+#include <memory>
+#include <string>
+#include <system_error>
+#include <utility>
+#include <vector>
 
 using namespace llvm;
 
 static cl::opt<bool> StaticFuncFullModulePrefix(
-    "static-func-full-module-prefix", cl::init(false),
+    "static-func-full-module-prefix", cl::init(true),
     cl::desc("Use full module build paths in the profile counter names for "
              "static functions."));
 
-namespace {
-std::string getInstrProfErrString(instrprof_error Err) {
+// This option is tailored to users that have different top-level directories
+// in profile-gen and profile-use compilation. Users need to specify the number
+// of levels to strip. A value larger than the number of directories in the
+// source file will strip all the directory names and only leave the basename.
+//
+// Note that the current ThinLTO module importing for indirect calls assumes
+// the source directory name is not stripped. A non-zero option value here
+// can potentially prevent some inter-module indirect-call promotions.
+static cl::opt<unsigned> StaticFuncStripDirNamePrefix(
+    "static-func-strip-dirname-prefix", cl::init(0),
+    cl::desc("Strip specified level of directory name from source path in "
+             "the profile counter name for static functions."));
+
+static std::string getInstrProfErrString(instrprof_error Err) {
   switch (Err) {
   case instrprof_error::success:
     return "Success";
@@ -76,15 +115,19 @@ std::string getInstrProfErrString(instrprof_error Err) {
   llvm_unreachable("A value of instrprof_error has no message.");
 }
 
+namespace {
+
 // FIXME: This class is only here to support the transition to llvm::Error. It
 // will be removed once this transition is complete. Clients should prefer to
 // deal with the Error value directly, rather than converting to error_code.
 class InstrProfErrorCategoryType : public std::error_category {
   const char *name() const noexcept override { return "llvm.instrprof"; }
+
   std::string message(int IE) const override {
     return getInstrProfErrString(static_cast<instrprof_error>(IE));
   }
 };
+
 } // end anonymous namespace
 
 static ManagedStatic<InstrProfErrorCategoryType> ErrorCategory;
@@ -133,6 +176,24 @@ std::string getPGOFuncName(StringRef RawFuncName,
   return GlobalValue::getGlobalIdentifier(RawFuncName, Linkage, FileName);
 }
 
+// Strip NumPrefix levels of directory names from PathNameStr. If the number of
+// directory separators is less than NumPrefix, strip all the directories and
+// leave only the base file name.
+static StringRef stripDirPrefix(StringRef PathNameStr, uint32_t NumPrefix) {
+  uint32_t Count = NumPrefix;
+  uint32_t Pos = 0, LastPos = 0;
+  for (auto & CI : PathNameStr) {
+    ++Pos;
+    if (llvm::sys::path::is_separator(CI)) {
+      LastPos = Pos;
+      --Count;
+    }
+    if (Count == 0)
+      break;
+  }
+  return PathNameStr.substr(LastPos);
+}
+
 // Return the PGOFuncName. This function has some special handling when called
 // in LTO optimization. The following only applies when calling in LTO passes
 // (when \c InLTO is true): LTO's internalization privatizes many global linkage
@@ -151,6 +212,8 @@ std::string getPGOFuncName(const Function &F, bool InLTO, uint64_t Version) {
     StringRef FileName = (StaticFuncFullModulePrefix
                               ? F.getParent()->getName()
                               : sys::path::filename(F.getParent()->getName()));
+    if (StaticFuncFullModulePrefix && StaticFuncStripDirNamePrefix != 0)
+      FileName = stripDirPrefix(FileName, StaticFuncStripDirNamePrefix);
     return getPGOFuncName(F.getName(), F.getLinkage(), FileName, Version);
   }
 
@@ -198,7 +261,6 @@ std::string getPGOFuncNameVarName(StringRef FuncName,
 GlobalVariable *createPGOFuncNameVar(Module &M,
                                      GlobalValue::LinkageTypes Linkage,
                                      StringRef PGOFuncName) {
-
   // We generally want to match the function's linkage, but available_externally
   // and extern_weak both have the wrong semantics, and anything that doesn't
   // need to link across compilation units doesn't need to be visible at all.
@@ -236,6 +298,17 @@ void InstrProfSymtab::create(Module &M, bool InLTO) {
     const std::string &PGOFuncName = getPGOFuncName(F, InLTO);
     addFuncName(PGOFuncName);
     MD5FuncMap.emplace_back(Function::getGUID(PGOFuncName), &F);
+    // In ThinLTO, a local function may have been promoted to global and had a
+    // suffix added to its name. We need to add the stripped function name to
+    // the symbol table so that we can find a match in the profile.
+    if (InLTO) {
+      auto pos = PGOFuncName.find('.');
+      if (pos != std::string::npos) {
+        const std::string &OtherFuncName = PGOFuncName.substr(0, pos);
+        addFuncName(OtherFuncName);
+        MD5FuncMap.emplace_back(Function::getGUID(OtherFuncName), &F);
+      }
+    }
   }
 
   finalizeSymtab();
@@ -243,7 +316,7 @@ void InstrProfSymtab::create(Module &M, bool InLTO) {
 
 Error collectPGOFuncNameStrings(const std::vector<std::string> &NameStrs,
                                 bool doCompression, std::string &Result) {
-  assert(NameStrs.size() && "No name data to emit");
+  assert(!NameStrs.empty() && "No name data to emit");
 
   uint8_t Header[16], *P = Header;
   std::string UncompressedNameStrings =
@@ -556,6 +629,7 @@ void ValueProfRecord::deserializeTo(InstrProfRecord &Record,
 void ValueProfRecord::swapBytes(support::endianness Old,
                                 support::endianness New) {
   using namespace support;
+
   if (Old == New)
     return;
 
@@ -592,6 +666,7 @@ void ValueProfData::deserializeTo(InstrProfRecord &Record,
 template <class T>
 static T swapToHostOrder(const unsigned char *&D, support::endianness Orig) {
   using namespace support;
+
   if (Orig == little)
     return endian::readNext<T, little, unaligned>(D);
   else
@@ -626,6 +701,7 @@ ValueProfData::getValueProfData(const unsigned char *D,
                                 const unsigned char *const BufferEnd,
                                 support::endianness Endianness) {
   using namespace support;
+
   if (D + sizeof(ValueProfData) > BufferEnd)
     return make_error<InstrProfError>(instrprof_error::truncated);
 
@@ -648,6 +724,7 @@ ValueProfData::getValueProfData(const unsigned char *D,
 
 void ValueProfData::swapBytesToHost(support::endianness Endianness) {
   using namespace support;
+
   if (Endianness == getHostEndianness())
     return;
 
@@ -663,6 +740,7 @@ void ValueProfData::swapBytesToHost(support::endianness Endianness) {
 
 void ValueProfData::swapBytesFromHost(support::endianness Endianness) {
   using namespace support;
+
   if (Endianness == getHostEndianness())
     return;
 
@@ -857,4 +935,26 @@ bool canRenameComdatFunc(const Function &F, bool CheckAddressTaken) {
   }
   return true;
 }
+
+// Parse the value profile options.
+void getMemOPSizeRangeFromOption(std::string MemOPSizeRange,
+                                 int64_t &RangeStart, int64_t &RangeLast) {
+  static const int64_t DefaultMemOPSizeRangeStart = 0;
+  static const int64_t DefaultMemOPSizeRangeLast = 8;
+  RangeStart = DefaultMemOPSizeRangeStart;
+  RangeLast = DefaultMemOPSizeRangeLast;
+
+  if (!MemOPSizeRange.empty()) {
+    auto Pos = MemOPSizeRange.find(":");
+    if (Pos != std::string::npos) {
+      if (Pos > 0)
+        RangeStart = atoi(MemOPSizeRange.substr(0, Pos).c_str());
+      if (Pos < MemOPSizeRange.size() - 1)
+        RangeLast = atoi(MemOPSizeRange.substr(Pos + 1).c_str());
+    } else
+      RangeLast = atoi(MemOPSizeRange.c_str());
+  }
+  assert(RangeLast >= RangeStart);
+}
+
 } // end namespace llvm
diff --git a/lib/ProfileData/InstrProfReader.cpp b/lib/ProfileData/InstrProfReader.cpp
index ad407f07957ff8211c7b94606a21a5a7a2574fe9..856f793363f7740814e4bf9311a96abc09f16fb9 100644
--- a/lib/ProfileData/InstrProfReader.cpp
+++ b/lib/ProfileData/InstrProfReader.cpp
@@ -1,4 +1,4 @@
-//=-- InstrProfReader.cpp - Instrumented profiling reader -------------------=//
+//===- InstrProfReader.cpp - Instrumented profiling reader ----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,9 +12,27 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/STLExtras.h"
-#include <cassert>
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/ProfileSummary.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ProfileData/InstrProfReader.h"
+#include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SwapByteOrder.h"
+#include <algorithm>
+#include <cctype>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <system_error>
+#include <utility>
+#include <vector>
 
 using namespace llvm;
 
@@ -78,7 +96,6 @@ IndexedInstrProfReader::create(const Twine &Path) {
   return IndexedInstrProfReader::create(std::move(BufferOrError.get()));
 }
 
-
 Expected<std::unique_ptr<IndexedInstrProfReader>>
 IndexedInstrProfReader::create(std::unique_ptr<MemoryBuffer> Buffer) {
   // Sanity check the buffer.
@@ -182,7 +199,7 @@ TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) {
         CHECK_LINE_END(Line);
         std::pair<StringRef, StringRef> VD = Line->rsplit(':');
         uint64_t TakenCount, Value;
-        if (VK == IPVK_IndirectCallTarget) {
+        if (ValueKind == IPVK_IndirectCallTarget) {
           Symtab->addFuncName(VD.first);
           Value = IndexedInstrProf::ComputeHash(VD.first);
         } else {
@@ -192,7 +209,8 @@ TextInstrProfReader::readValueProfileData(InstrProfRecord &Record) {
         CurrentValues.push_back({Value, TakenCount});
         Line++;
       }
-      Record.addValueData(VK, S, CurrentValues.data(), NumValueData, nullptr);
+      Record.addValueData(ValueKind, S, CurrentValues.data(), NumValueData,
+                          nullptr);
     }
   }
   return success();
@@ -232,7 +250,7 @@ Error TextInstrProfReader::readNextRecord(InstrProfRecord &Record) {
     return error(instrprof_error::malformed);
 
   // Read each counter and fill our internal storage with the values.
-  Record.Counts.clear();
+  Record.Clear();
   Record.Counts.reserve(NumCounters);
   for (uint64_t I = 0; I < NumCounters; ++I) {
     if (Line.is_at_end())
@@ -398,7 +416,6 @@ Error RawInstrProfReader<IntPtrT>::readRawCounts(
 template <class IntPtrT>
 Error RawInstrProfReader<IntPtrT>::readValueProfilingData(
     InstrProfRecord &Record) {
-
   Record.clearValueData();
   CurValueDataSize = 0;
   // Need to match the logic in value profile dumper code in compiler-rt:
@@ -454,9 +471,11 @@ Error RawInstrProfReader<IntPtrT>::readNextRecord(InstrProfRecord &Record) {
 }
 
 namespace llvm {
+
 template class RawInstrProfReader<uint32_t>;
 template class RawInstrProfReader<uint64_t>;
-}
+
+} // end namespace llvm
 
 InstrProfLookupTrait::hash_value_type
 InstrProfLookupTrait::ComputeHash(StringRef K) {
@@ -482,6 +501,8 @@ bool InstrProfLookupTrait::readValueProfilingData(
 
 data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D,
                                          offset_type N) {
+  using namespace support;
+
   // Check if the data is corrupt. If so, don't try to read it.
   if (N % sizeof(uint64_t))
     return data_type();
@@ -489,7 +510,6 @@ data_type InstrProfLookupTrait::ReadData(StringRef K, const unsigned char *D,
   DataBuffer.clear();
   std::vector<uint64_t> CounterBuffer;
 
-  using namespace support;
   const unsigned char *End = D + N;
   while (D < End) {
     // Read hash.
@@ -567,9 +587,10 @@ InstrProfReaderIndex<HashTableImpl>::InstrProfReaderIndex(
 }
 
 bool IndexedInstrProfReader::hasFormat(const MemoryBuffer &DataBuffer) {
+  using namespace support;
+
   if (DataBuffer.getBufferSize() < 8)
     return false;
-  using namespace support;
   uint64_t Magic =
       endian::read<uint64_t, little, aligned>(DataBuffer.getBufferStart());
   // Verify that it's magical.
@@ -581,6 +602,7 @@ IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version,
                                     const unsigned char *Cur) {
   using namespace IndexedInstrProf;
   using namespace support;
+
   if (Version >= IndexedInstrProf::Version4) {
     const IndexedInstrProf::Summary *SummaryInLE =
         reinterpret_cast<const IndexedInstrProf::Summary *>(Cur);
@@ -617,6 +639,7 @@ IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version,
   } else {
     // For older version of profile data, we need to compute on the fly:
     using namespace IndexedInstrProf;
+
     InstrProfSummaryBuilder Builder(ProfileSummaryBuilder::DefaultCutoffs);
     // FIXME: This only computes an empty summary. Need to call addRecord for
     // all InstrProfRecords to get the correct summary.
@@ -626,14 +649,14 @@ IndexedInstrProfReader::readSummary(IndexedInstrProf::ProfVersion Version,
 }
 
 Error IndexedInstrProfReader::readHeader() {
+  using namespace support;
+
   const unsigned char *Start =
       (const unsigned char *)DataBuffer->getBufferStart();
   const unsigned char *Cur = Start;
   if ((const unsigned char *)DataBuffer->getBufferEnd() - Cur < 24)
     return error(instrprof_error::truncated);
 
-  using namespace support;
-
   auto *Header = reinterpret_cast<const IndexedInstrProf::Header *>(Cur);
   Cur += sizeof(IndexedInstrProf::Header);
 
diff --git a/lib/ProfileData/InstrProfWriter.cpp b/lib/ProfileData/InstrProfWriter.cpp
index 029d75660a73ffdc18c27106f1d439cd90b7d880..6b7bd3b2fc0a043226bfdb012f234c4249ecaaa7 100644
--- a/lib/ProfileData/InstrProfWriter.cpp
+++ b/lib/ProfileData/InstrProfWriter.cpp
@@ -1,4 +1,4 @@
-//=-- InstrProfWriter.cpp - Instrumented profiling writer -------------------=//
+//===- InstrProfWriter.cpp - Instrumented profiling writer ----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,15 +12,21 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ProfileData/InstrProfWriter.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/IR/ProfileSummary.h"
+#include "llvm/ProfileData/InstrProf.h"
+#include "llvm/ProfileData/InstrProfWriter.h"
 #include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/EndianStream.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/OnDiskHashTable.h"
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
+#include <cstdint>
+#include <memory>
 #include <string>
 #include <tuple>
 #include <utility>
@@ -41,10 +47,9 @@ namespace llvm {
 // A wrapper class to abstract writer stream with support of bytes
 // back patching.
 class ProfOStream {
-
 public:
-  ProfOStream(llvm::raw_fd_ostream &FD) : IsFDOStream(true), OS(FD), LE(FD) {}
-  ProfOStream(llvm::raw_string_ostream &STR)
+  ProfOStream(raw_fd_ostream &FD) : IsFDOStream(true), OS(FD), LE(FD) {}
+  ProfOStream(raw_string_ostream &STR)
       : IsFDOStream(false), OS(STR), LE(STR) {}
 
   uint64_t tell() { return OS.tell(); }
@@ -55,15 +60,16 @@ public:
   // directly and it won't be reflected in the stream's internal buffer.
   void patch(PatchItem *P, int NItems) {
     using namespace support;
+
     if (IsFDOStream) {
-      llvm::raw_fd_ostream &FDOStream = static_cast<llvm::raw_fd_ostream &>(OS);
+      raw_fd_ostream &FDOStream = static_cast<raw_fd_ostream &>(OS);
       for (int K = 0; K < NItems; K++) {
         FDOStream.seek(P[K].Pos);
         for (int I = 0; I < P[K].N; I++)
           write(P[K].D[I]);
       }
     } else {
-      llvm::raw_string_ostream &SOStream =
+      raw_string_ostream &SOStream =
           static_cast<llvm::raw_string_ostream &>(OS);
       std::string &Data = SOStream.str(); // with flush
       for (int K = 0; K < NItems; K++) {
@@ -94,17 +100,19 @@ public:
   typedef uint64_t hash_value_type;
   typedef uint64_t offset_type;
 
-  support::endianness ValueProfDataEndianness;
+  support::endianness ValueProfDataEndianness = support::little;
   InstrProfSummaryBuilder *SummaryBuilder;
 
-  InstrProfRecordWriterTrait() : ValueProfDataEndianness(support::little) {}
+  InstrProfRecordWriterTrait() = default;
+
   static hash_value_type ComputeHash(key_type_ref K) {
     return IndexedInstrProf::ComputeHash(K);
   }
 
   static std::pair<offset_type, offset_type>
   EmitKeyDataLength(raw_ostream &Out, key_type_ref K, data_type_ref V) {
-    using namespace llvm::support;
+    using namespace support;
+
     endian::Writer<little> LE(Out);
 
     offset_type N = K.size();
@@ -130,7 +138,8 @@ public:
   }
 
   void EmitData(raw_ostream &Out, key_type_ref, data_type_ref V, offset_type) {
-    using namespace llvm::support;
+    using namespace support;
+
     endian::Writer<little> LE(Out);
     for (const auto &ProfileData : *V) {
       const InstrProfRecord &ProfRecord = ProfileData.second;
@@ -154,8 +163,7 @@ public:
 } // end namespace llvm
 
 InstrProfWriter::InstrProfWriter(bool Sparse)
-    : Sparse(Sparse), FunctionData(), ProfileKind(PF_Unknown),
-      InfoObj(new InstrProfRecordWriterTrait()) {}
+    : Sparse(Sparse), InfoObj(new InstrProfRecordWriterTrait()) {}
 
 InstrProfWriter::~InstrProfWriter() { delete InfoObj; }
 
@@ -208,7 +216,7 @@ bool InstrProfWriter::shouldEncodeData(const ProfilingData &PD) {
     return true;
   for (const auto &Func : PD) {
     const InstrProfRecord &IPR = Func.second;
-    if (any_of(IPR.Counts, [](uint64_t Count) { return Count > 0; }))
+    if (llvm::any_of(IPR.Counts, [](uint64_t Count) { return Count > 0; }))
       return true;
   }
   return false;
@@ -217,6 +225,7 @@ bool InstrProfWriter::shouldEncodeData(const ProfilingData &PD) {
 static void setSummary(IndexedInstrProf::Summary *TheSummary,
                        ProfileSummary &PS) {
   using namespace IndexedInstrProf;
+
   std::vector<ProfileSummaryEntry> &Res = PS.getDetailedSummary();
   TheSummary->NumSummaryFields = Summary::NumKinds;
   TheSummary->NumCutoffEntries = Res.size();
@@ -231,9 +240,10 @@ static void setSummary(IndexedInstrProf::Summary *TheSummary,
 }
 
 void InstrProfWriter::writeImpl(ProfOStream &OS) {
+  using namespace IndexedInstrProf;
+
   OnDiskChainedHashTableGenerator<InstrProfRecordWriterTrait> Generator;
 
-  using namespace IndexedInstrProf;
   InstrProfSummaryBuilder ISB(ProfileSummaryBuilder::DefaultCutoffs);
   InfoObj->SummaryBuilder = &ISB;
 
@@ -301,7 +311,7 @@ void InstrProfWriter::write(raw_fd_ostream &OS) {
 
 std::unique_ptr<MemoryBuffer> InstrProfWriter::writeBuffer() {
   std::string Data;
-  llvm::raw_string_ostream OS(Data);
+  raw_string_ostream OS(Data);
   ProfOStream POS(OS);
   // Write the hash table.
   writeImpl(POS);
diff --git a/lib/ProfileData/SampleProf.cpp b/lib/ProfileData/SampleProf.cpp
index 8fe85d69bb630d0177b5103d5c24beb9617f4327..8493acc2d95dda135aa9531a0985b81aa3973ebc 100644
--- a/lib/ProfileData/SampleProf.cpp
+++ b/lib/ProfileData/SampleProf.cpp
@@ -13,18 +13,25 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ProfileData/SampleProf.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/raw_ostream.h"
+#include <string>
+#include <system_error>
 
-using namespace llvm::sampleprof;
 using namespace llvm;
+using namespace sampleprof;
 
 namespace {
+
 // FIXME: This class is only here to support the transition to llvm::Error. It
 // will be removed once this transition is complete. Clients should prefer to
 // deal with the Error value directly, rather than converting to error_code.
 class SampleProfErrorCategoryType : public std::error_category {
   const char *name() const noexcept override { return "llvm.sampleprof"; }
+
   std::string message(int IE) const override {
     sampleprof_error E = static_cast<sampleprof_error>(IE);
     switch (E) {
@@ -54,7 +61,8 @@ class SampleProfErrorCategoryType : public std::error_category {
     llvm_unreachable("A value of sampleprof_error has no message.");
   }
 };
-}
+
+} // end anonymous namespace
 
 static ManagedStatic<SampleProfErrorCategoryType> ErrorCategory;
 
@@ -105,7 +113,7 @@ void FunctionSamples::print(raw_ostream &OS, unsigned Indent) const {
      << " sampled lines\n";
 
   OS.indent(Indent);
-  if (BodySamples.size() > 0) {
+  if (!BodySamples.empty()) {
     OS << "Samples collected in the function's body {\n";
     SampleSorter<LineLocation, SampleRecord> SortedBodySamples(BodySamples);
     for (const auto &SI : SortedBodySamples.get()) {
@@ -119,7 +127,7 @@ void FunctionSamples::print(raw_ostream &OS, unsigned Indent) const {
   }
 
   OS.indent(Indent);
-  if (CallsiteSamples.size() > 0) {
+  if (!CallsiteSamples.empty()) {
     OS << "Samples collected in inlined callsites {\n";
     SampleSorter<LineLocation, FunctionSamples> SortedCallsiteSamples(
         CallsiteSamples);
@@ -141,5 +149,5 @@ raw_ostream &llvm::sampleprof::operator<<(raw_ostream &OS,
 }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD void FunctionSamples::dump(void) const { print(dbgs(), 0); }
+LLVM_DUMP_METHOD void FunctionSamples::dump() const { print(dbgs(), 0); }
 #endif
diff --git a/lib/ProfileData/SampleProfReader.cpp b/lib/ProfileData/SampleProfReader.cpp
index af80b036a5bbf0e5a866168907453d3468dd0766..3a8d6190d2ca670592f744a8febdfc9855ede272 100644
--- a/lib/ProfileData/SampleProfReader.cpp
+++ b/lib/ProfileData/SampleProfReader.cpp
@@ -23,14 +23,25 @@
 #include "llvm/ProfileData/SampleProfReader.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/ProfileSummary.h"
+#include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/ProfileData/SampleProf.h"
 #include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/LEB128.h"
 #include "llvm/Support/LineIterator.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <limits>
+#include <memory>
+#include <system_error>
+#include <vector>
 
-using namespace llvm::sampleprof;
 using namespace llvm;
+using namespace sampleprof;
 
 /// \brief Dump the function profile for \p FName.
 ///
@@ -681,11 +692,9 @@ std::error_code SampleProfileReaderGCC::readOneFunctionProfile(
       if (!GcovBuffer.readInt64(TargetCount))
         return sampleprof_error::truncated;
 
-      if (Update) {
-        FunctionSamples &TargetProfile = Profiles[TargetName];
-        TargetProfile.addCalledTargetSamples(LineOffset, Discriminator,
-                                             TargetName, TargetCount);
-      }
+      if (Update)
+        FProfile->addCalledTargetSamples(LineOffset, Discriminator,
+                                         TargetName, TargetCount);
     }
   }
 
diff --git a/lib/ProfileData/SampleProfWriter.cpp b/lib/ProfileData/SampleProfWriter.cpp
index 4fa71288f8d9fbbe5f949805bc7b9d2faa35432a..e1d6d575631aecea5c01f2a2632625e37949fd8b 100644
--- a/lib/ProfileData/SampleProfWriter.cpp
+++ b/lib/ProfileData/SampleProfWriter.cpp
@@ -18,16 +18,23 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ProfileData/ProfileCommon.h"
+#include "llvm/ProfileData/SampleProf.h"
 #include "llvm/ProfileData/SampleProfWriter.h"
-#include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/LEB128.h"
-#include "llvm/Support/LineIterator.h"
-#include "llvm/Support/MemoryBuffer.h"
-#include "llvm/Support/Regex.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cstdint>
+#include <memory>
+#include <system_error>
+#include <utility>
+#include <vector>
 
-using namespace llvm::sampleprof;
 using namespace llvm;
+using namespace sampleprof;
 
 /// \brief Write samples to a text file.
 ///
diff --git a/lib/Support/APFloat.cpp b/lib/Support/APFloat.cpp
index 4dc9307f762338480e521a4f12ca48bddffd56dc..9778628911cd054acc5585ebcc4c9d53b2f29879 100644
--- a/lib/Support/APFloat.cpp
+++ b/lib/Support/APFloat.cpp
@@ -37,6 +37,10 @@
 
 using namespace llvm;
 
+// TODO: Remove these and use APInt qualified types directly.
+typedef APInt::WordType integerPart;
+const unsigned int integerPartWidth = APInt::APINT_BITS_PER_WORD;
+
 /// A macro used to combine two fcCategory enums into one key which can be used
 /// in a switch statement to classify how the interaction of two APFloat's
 /// categories affects an operation.
@@ -761,7 +765,7 @@ IEEEFloat &IEEEFloat::operator=(IEEEFloat &&rhs) {
 
 bool IEEEFloat::isDenormal() const {
   return isFiniteNonZero() && (exponent == semantics->minExponent) &&
-         (APInt::tcExtractBit(significandParts(), 
+         (APInt::tcExtractBit(significandParts(),
                               semantics->precision - 1) == 0);
 }
 
@@ -980,14 +984,14 @@ lostFraction IEEEFloat::multiplySignificand(const IEEEFloat &rhs,
   //     rhs = b23 . b22 ... b0 * 2^e2
   // the result of multiplication is:
   //   *this = c48 c47 c46 . c45 ... c0 * 2^(e1+e2)
-  // Note that there are three significant bits at the left-hand side of the 
+  // Note that there are three significant bits at the left-hand side of the
   // radix point: two for the multiplication, and an overflow bit for the
   // addition (that will always be zero at this point). Move the radix point
   // toward left by two bits, and adjust exponent accordingly.
   exponent += 2;
 
   if (addend && addend->isNonZero()) {
-    // The intermediate result of the multiplication has "2 * precision" 
+    // The intermediate result of the multiplication has "2 * precision"
     // signicant bit; adjust the addend to be consistent with mul result.
     //
     Significand savedSignificand = significand;
@@ -1039,7 +1043,7 @@ lostFraction IEEEFloat::multiplySignificand(const IEEEFloat &rhs,
   }
 
   // Convert the result having "2 * precision" significant-bits back to the one
-  // having "precision" significant-bits. First, move the radix point from 
+  // having "precision" significant-bits. First, move the radix point from
   // poision "2*precision - 1" to "precision - 1". The exponent need to be
   // adjusted by "2*precision - 1" - "precision - 1" = "precision".
   exponent -= precision + 1;
@@ -1716,9 +1720,10 @@ IEEEFloat::opStatus IEEEFloat::remainder(const IEEEFloat &rhs) {
   int parts = partCount();
   integerPart *x = new integerPart[parts];
   bool ignored;
-  fs = V.convertToInteger(x, parts * integerPartWidth, true,
-                          rmNearestTiesToEven, &ignored);
-  if (fs==opInvalidOp) {
+  fs = V.convertToInteger(makeMutableArrayRef(x, parts),
+                          parts * integerPartWidth, true, rmNearestTiesToEven,
+                          &ignored);
+  if (fs == opInvalidOp) {
     delete[] x;
     return fs;
   }
@@ -1739,43 +1744,20 @@ IEEEFloat::opStatus IEEEFloat::remainder(const IEEEFloat &rhs) {
   return fs;
 }
 
-/* Normalized llvm frem (C fmod).
-   This is not currently correct in all cases.  */
+/* Normalized llvm frem (C fmod). */
 IEEEFloat::opStatus IEEEFloat::mod(const IEEEFloat &rhs) {
   opStatus fs;
   fs = modSpecials(rhs);
 
-  if (isFiniteNonZero() && rhs.isFiniteNonZero()) {
-    IEEEFloat V = *this;
-    unsigned int origSign = sign;
-
-    fs = V.divide(rhs, rmNearestTiesToEven);
-    if (fs == opDivByZero)
-      return fs;
-
-    int parts = partCount();
-    integerPart *x = new integerPart[parts];
-    bool ignored;
-    fs = V.convertToInteger(x, parts * integerPartWidth, true,
-                            rmTowardZero, &ignored);
-    if (fs==opInvalidOp) {
-      delete[] x;
-      return fs;
-    }
-
-    fs = V.convertFromZeroExtendedInteger(x, parts * integerPartWidth, true,
-                                          rmNearestTiesToEven);
-    assert(fs==opOK);   // should always work
-
-    fs = V.multiply(rhs, rmNearestTiesToEven);
-    assert(fs==opOK || fs==opInexact);   // should not overflow or underflow
-
+  while (isFiniteNonZero() && rhs.isFiniteNonZero() &&
+         compareAbsoluteValue(rhs) != cmpLessThan) {
+    IEEEFloat V = scalbn(rhs, ilogb(*this) - ilogb(rhs), rmNearestTiesToEven);
+    if (compareAbsoluteValue(V) == cmpLessThan)
+      V = scalbn(V, -1, rmNearestTiesToEven);
+    V.sign = sign;
+  
     fs = subtract(V, rmNearestTiesToEven);
-    assert(fs==opOK || fs==opInexact);   // likewise
-
-    if (isZero())
-      sign = origSign;    // IEEE754 requires this
-    delete[] x;
+    assert(fs==opOK);
   }
   return fs;
 }
@@ -2051,7 +2033,7 @@ IEEEFloat::opStatus IEEEFloat::convert(const fltSemantics &toSemantics,
    Note that for conversions to integer type the C standard requires
    round-to-zero to always be used.  */
 IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
-    integerPart *parts, unsigned int width, bool isSigned,
+    MutableArrayRef<integerPart> parts, unsigned int width, bool isSigned,
     roundingMode rounding_mode, bool *isExact) const {
   lostFraction lost_fraction;
   const integerPart *src;
@@ -2064,9 +2046,10 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
     return opInvalidOp;
 
   dstPartsCount = partCountForBits(width);
+  assert(dstPartsCount <= parts.size() && "Integer too big");
 
   if (category == fcZero) {
-    APInt::tcSet(parts, 0, dstPartsCount);
+    APInt::tcSet(parts.data(), 0, dstPartsCount);
     // Negative zero can't be represented as an int.
     *isExact = !sign;
     return opOK;
@@ -2078,7 +2061,7 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
      the destination.  */
   if (exponent < 0) {
     /* Our absolute value is less than one; truncate everything.  */
-    APInt::tcSet(parts, 0, dstPartsCount);
+    APInt::tcSet(parts.data(), 0, dstPartsCount);
     /* For exponent -1 the integer bit represents .5, look at that.
        For smaller exponents leftmost truncated bit is 0. */
     truncatedBits = semantics->precision -1U - exponent;
@@ -2094,11 +2077,13 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
     if (bits < semantics->precision) {
       /* We truncate (semantics->precision - bits) bits.  */
       truncatedBits = semantics->precision - bits;
-      APInt::tcExtract(parts, dstPartsCount, src, bits, truncatedBits);
+      APInt::tcExtract(parts.data(), dstPartsCount, src, bits, truncatedBits);
     } else {
       /* We want at least as many bits as are available.  */
-      APInt::tcExtract(parts, dstPartsCount, src, semantics->precision, 0);
-      APInt::tcShiftLeft(parts, dstPartsCount, bits - semantics->precision);
+      APInt::tcExtract(parts.data(), dstPartsCount, src, semantics->precision,
+                       0);
+      APInt::tcShiftLeft(parts.data(), dstPartsCount,
+                         bits - semantics->precision);
       truncatedBits = 0;
     }
   }
@@ -2110,7 +2095,7 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
                                                   truncatedBits);
     if (lost_fraction != lfExactlyZero &&
         roundAwayFromZero(rounding_mode, lost_fraction, truncatedBits)) {
-      if (APInt::tcIncrement(parts, dstPartsCount))
+      if (APInt::tcIncrement(parts.data(), dstPartsCount))
         return opInvalidOp;     /* Overflow.  */
     }
   } else {
@@ -2118,7 +2103,7 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
   }
 
   /* Step 3: check if we fit in the destination.  */
-  unsigned int omsb = APInt::tcMSB(parts, dstPartsCount) + 1;
+  unsigned int omsb = APInt::tcMSB(parts.data(), dstPartsCount) + 1;
 
   if (sign) {
     if (!isSigned) {
@@ -2129,7 +2114,8 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
       /* It takes omsb bits to represent the unsigned integer value.
          We lose a bit for the sign, but care is needed as the
          maximally negative integer is a special case.  */
-      if (omsb == width && APInt::tcLSB(parts, dstPartsCount) + 1 != omsb)
+      if (omsb == width &&
+          APInt::tcLSB(parts.data(), dstPartsCount) + 1 != omsb)
         return opInvalidOp;
 
       /* This case can happen because of rounding.  */
@@ -2137,7 +2123,7 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
         return opInvalidOp;
     }
 
-    APInt::tcNegate (parts, dstPartsCount);
+    APInt::tcNegate (parts.data(), dstPartsCount);
   } else {
     if (omsb >= width + !isSigned)
       return opInvalidOp;
@@ -2159,11 +2145,10 @@ IEEEFloat::opStatus IEEEFloat::convertToSignExtendedInteger(
    the original value.  This is almost equivalent to result==opOK,
    except for negative zeroes.
 */
-IEEEFloat::opStatus IEEEFloat::convertToInteger(integerPart *parts,
-                                                unsigned int width,
-                                                bool isSigned,
-                                                roundingMode rounding_mode,
-                                                bool *isExact) const {
+IEEEFloat::opStatus
+IEEEFloat::convertToInteger(MutableArrayRef<integerPart> parts,
+                            unsigned int width, bool isSigned,
+                            roundingMode rounding_mode, bool *isExact) const {
   opStatus fs;
 
   fs = convertToSignExtendedInteger(parts, width, isSigned, rounding_mode,
@@ -2173,6 +2158,7 @@ IEEEFloat::opStatus IEEEFloat::convertToInteger(integerPart *parts,
     unsigned int bits, dstPartsCount;
 
     dstPartsCount = partCountForBits(width);
+    assert(dstPartsCount <= parts.size() && "Integer too big");
 
     if (category == fcNaN)
       bits = 0;
@@ -2181,9 +2167,9 @@ IEEEFloat::opStatus IEEEFloat::convertToInteger(integerPart *parts,
     else
       bits = width - isSigned;
 
-    APInt::tcSetLeastSignificantBits(parts, dstPartsCount, bits);
+    APInt::tcSetLeastSignificantBits(parts.data(), dstPartsCount, bits);
     if (sign && isSigned)
-      APInt::tcShiftLeft(parts, dstPartsCount, width - 1);
+      APInt::tcShiftLeft(parts.data(), dstPartsCount, width - 1);
   }
 
   return fs;
@@ -2472,7 +2458,7 @@ IEEEFloat::convertFromDecimalString(StringRef str, roundingMode rounding_mode) {
 
   // Test if we have a zero number allowing for strings with no null terminators
   // and zero decimals with non-zero exponents.
-  // 
+  //
   // We computed firstSigDigit by ignoring all zeros and dots. Thus if
   // D->firstSigDigit equals str.end(), every digit must be a zero and there can
   // be at most one dot. On the other hand, if we have a zero with a non-zero
@@ -4293,11 +4279,10 @@ APFloat::opStatus DoubleAPFloat::next(bool nextDown) {
   return Ret;
 }
 
-APFloat::opStatus DoubleAPFloat::convertToInteger(integerPart *Input,
-                                                  unsigned int Width,
-                                                  bool IsSigned,
-                                                  roundingMode RM,
-                                                  bool *IsExact) const {
+APFloat::opStatus
+DoubleAPFloat::convertToInteger(MutableArrayRef<integerPart> Input,
+                                unsigned int Width, bool IsSigned,
+                                roundingMode RM, bool *IsExact) const {
   assert(Semantics == &semPPCDoubleDouble && "Unexpected Semantics");
   return APFloat(semPPCDoubleDoubleLegacy, bitcastToAPInt())
       .convertToInteger(Input, Width, IsSigned, RM, IsExact);
@@ -4511,7 +4496,7 @@ APFloat::opStatus APFloat::convertToInteger(APSInt &result,
                                             bool *isExact) const {
   unsigned bitWidth = result.getBitWidth();
   SmallVector<uint64_t, 4> parts(result.getNumWords());
-  opStatus status = convertToInteger(parts.data(), bitWidth, result.isSigned(),
+  opStatus status = convertToInteger(parts, bitWidth, result.isSigned(),
                                      rounding_mode, isExact);
   // Keeps the original signed-ness.
   result = APInt(bitWidth, parts);
diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp
index 3c742303936a901eb714eeb69ac7631110ad05ef..00b340e3ee4afbbb5e40af87793372c63f2730aa 100644
--- a/lib/Support/APInt.cpp
+++ b/lib/Support/APInt.cpp
@@ -63,7 +63,7 @@ inline static unsigned getDigit(char cdigit, uint8_t radix) {
     r = cdigit - 'a';
     if (r <= radix - 11U)
       return r + 10;
-    
+
     radix = 10;
   }
 
@@ -76,14 +76,17 @@ inline static unsigned getDigit(char cdigit, uint8_t radix) {
 
 
 void APInt::initSlowCase(uint64_t val, bool isSigned) {
+  VAL = 0;
   pVal = getClearedMemory(getNumWords());
   pVal[0] = val;
   if (isSigned && int64_t(val) < 0)
     for (unsigned i = 1; i < getNumWords(); ++i)
       pVal[i] = -1ULL;
+  clearUnusedBits();
 }
 
 void APInt::initSlowCase(const APInt& that) {
+  VAL = 0;
   pVal = getMemory(getNumWords());
   memcpy(pVal, that.pVal, getNumWords() * APINT_WORD_SIZE);
 }
@@ -95,6 +98,7 @@ void APInt::initFromArray(ArrayRef<uint64_t> bigVal) {
     VAL = bigVal[0];
   else {
     // Get memory, cleared to 0
+    VAL = 0;
     pVal = getClearedMemory(getNumWords());
     // Calculate the number of words to copy
     unsigned words = std::min<unsigned>(bigVal.size(), getNumWords());
@@ -106,12 +110,12 @@ void APInt::initFromArray(ArrayRef<uint64_t> bigVal) {
 }
 
 APInt::APInt(unsigned numBits, ArrayRef<uint64_t> bigVal)
-  : BitWidth(numBits), VAL(0) {
+  : BitWidth(numBits) {
   initFromArray(bigVal);
 }
 
 APInt::APInt(unsigned numBits, unsigned numWords, const uint64_t bigVal[])
-  : BitWidth(numBits), VAL(0) {
+  : BitWidth(numBits) {
   initFromArray(makeArrayRef(bigVal, numWords));
 }
 
@@ -153,16 +157,6 @@ APInt& APInt::AssignSlowCase(const APInt& RHS) {
   return clearUnusedBits();
 }
 
-APInt& APInt::operator=(uint64_t RHS) {
-  if (isSingleWord())
-    VAL = RHS;
-  else {
-    pVal[0] = RHS;
-    memset(pVal+1, 0, (getNumWords() - 1) * APINT_WORD_SIZE);
-  }
-  return clearUnusedBits();
-}
-
 /// This method 'profiles' an APInt for use with FoldingSet.
 void APInt::Profile(FoldingSetNodeID& ID) const {
   ID.AddInteger(BitWidth);
@@ -232,21 +226,6 @@ APInt& APInt::operator--() {
   return clearUnusedBits();
 }
 
-/// This function adds the integer array x to the integer array Y and
-/// places the result in dest.
-/// @returns the carry out from the addition
-/// @brief General addition of 64-bit integer arrays
-static bool add(uint64_t *dest, const uint64_t *x, const uint64_t *y,
-                unsigned len) {
-  bool carry = false;
-  for (unsigned i = 0; i< len; ++i) {
-    uint64_t limit = std::min(x[i],y[i]); // must come first in case dest == x
-    dest[i] = x[i] + y[i] + carry;
-    carry = dest[i] < limit || (carry && dest[i] == limit);
-  }
-  return carry;
-}
-
 /// Adds the RHS APint to this APInt.
 /// @returns this, after addition of RHS.
 /// @brief Addition assignment operator.
@@ -254,9 +233,8 @@ APInt& APInt::operator+=(const APInt& RHS) {
   assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
   if (isSingleWord())
     VAL += RHS.VAL;
-  else {
-    add(pVal, pVal, RHS.pVal, getNumWords());
-  }
+  else
+    tcAdd(pVal, RHS.pVal, 0, getNumWords());
   return clearUnusedBits();
 }
 
@@ -268,20 +246,6 @@ APInt& APInt::operator+=(uint64_t RHS) {
   return clearUnusedBits();
 }
 
-/// Subtracts the integer array y from the integer array x
-/// @returns returns the borrow out.
-/// @brief Generalized subtraction of 64-bit integer arrays.
-static bool sub(uint64_t *dest, const uint64_t *x, const uint64_t *y,
-                unsigned len) {
-  bool borrow = false;
-  for (unsigned i = 0; i < len; ++i) {
-    uint64_t x_tmp = borrow ? x[i] - 1 : x[i];
-    borrow = y[i] > x_tmp || (borrow && x[i] == 0);
-    dest[i] = x_tmp - y[i];
-  }
-  return borrow;
-}
-
 /// Subtracts the RHS APInt from this APInt
 /// @returns this, after subtraction
 /// @brief Subtraction assignment operator.
@@ -290,7 +254,7 @@ APInt& APInt::operator-=(const APInt& RHS) {
   if (isSingleWord())
     VAL -= RHS.VAL;
   else
-    sub(pVal, pVal, RHS.pVal, getNumWords());
+    tcSubtract(pVal, RHS.pVal, 0, getNumWords());
   return clearUnusedBits();
 }
 
@@ -339,7 +303,7 @@ static uint64_t mul_1(uint64_t dest[], uint64_t x[], unsigned len, uint64_t y) {
 
 /// Multiplies integer array x by integer array y and stores the result into
 /// the integer array dest. Note that dest's size must be >= xlen + ylen.
-/// @brief Generalized multiplicate of integer arrays.
+/// @brief Generalized multiplication of integer arrays.
 static void mul(uint64_t dest[], uint64_t x[], unsigned xlen, uint64_t y[],
                 unsigned ylen) {
   dest[xlen] = mul_1(dest, x, xlen, y[0]);
@@ -412,67 +376,21 @@ APInt& APInt::operator*=(const APInt& RHS) {
   return *this;
 }
 
-APInt& APInt::operator&=(const APInt& RHS) {
-  assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
-  if (isSingleWord()) {
-    VAL &= RHS.VAL;
-    return *this;
-  }
-  unsigned numWords = getNumWords();
-  for (unsigned i = 0; i < numWords; ++i)
-    pVal[i] &= RHS.pVal[i];
+APInt& APInt::AndAssignSlowCase(const APInt& RHS) {
+  tcAnd(pVal, RHS.pVal, getNumWords());
   return *this;
 }
 
-APInt& APInt::operator|=(const APInt& RHS) {
-  assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
-  if (isSingleWord()) {
-    VAL |= RHS.VAL;
-    return *this;
-  }
-  unsigned numWords = getNumWords();
-  for (unsigned i = 0; i < numWords; ++i)
-    pVal[i] |= RHS.pVal[i];
+APInt& APInt::OrAssignSlowCase(const APInt& RHS) {
+  tcOr(pVal, RHS.pVal, getNumWords());
   return *this;
 }
 
-APInt& APInt::operator^=(const APInt& RHS) {
-  assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
-  if (isSingleWord()) {
-    VAL ^= RHS.VAL;
-    return *this;
-  }
-  unsigned numWords = getNumWords();
-  for (unsigned i = 0; i < numWords; ++i)
-    pVal[i] ^= RHS.pVal[i];
+APInt& APInt::XorAssignSlowCase(const APInt& RHS) {
+  tcXor(pVal, RHS.pVal, getNumWords());
   return *this;
 }
 
-APInt APInt::AndSlowCase(const APInt& RHS) const {
-  unsigned numWords = getNumWords();
-  uint64_t* val = getMemory(numWords);
-  for (unsigned i = 0; i < numWords; ++i)
-    val[i] = pVal[i] & RHS.pVal[i];
-  return APInt(val, getBitWidth());
-}
-
-APInt APInt::OrSlowCase(const APInt& RHS) const {
-  unsigned numWords = getNumWords();
-  uint64_t *val = getMemory(numWords);
-  for (unsigned i = 0; i < numWords; ++i)
-    val[i] = pVal[i] | RHS.pVal[i];
-  return APInt(val, getBitWidth());
-}
-
-APInt APInt::XorSlowCase(const APInt& RHS) const {
-  unsigned numWords = getNumWords();
-  uint64_t *val = getMemory(numWords);
-  for (unsigned i = 0; i < numWords; ++i)
-    val[i] = pVal[i] ^ RHS.pVal[i];
-
-  return APInt(val, getBitWidth());
-}
-
 APInt APInt::operator*(const APInt& RHS) const {
   assert(BitWidth == RHS.BitWidth && "Bit widths must be the same");
   if (isSingleWord())
@@ -507,11 +425,11 @@ bool APInt::ult(const APInt& RHS) const {
   if (n1 < n2)
     return true;
 
-  // If magnitude of RHS is greather than LHS, return false.
+  // If magnitude of RHS is greater than LHS, return false.
   if (n2 < n1)
     return false;
 
-  // If they bot fit in a word, just compare the low order word
+  // If they both fit in a word, just compare the low order word
   if (n1 <= APINT_BITS_PER_WORD && n2 <= APINT_BITS_PER_WORD)
     return pVal[0] < RHS.pVal[0];
 
@@ -541,7 +459,7 @@ bool APInt::slt(const APInt& RHS) const {
   if (lhsNeg != rhsNeg)
     return lhsNeg;
 
-  // Otherwise we can just use an unsigned comparision, because even negative
+  // Otherwise we can just use an unsigned comparison, because even negative
   // numbers compare correctly this way if both have the same signed-ness.
   return ult(RHS);
 }
@@ -553,6 +471,33 @@ void APInt::setBit(unsigned bitPosition) {
     pVal[whichWord(bitPosition)] |= maskBit(bitPosition);
 }
 
+void APInt::setBitsSlowCase(unsigned loBit, unsigned hiBit) {
+  unsigned loWord = whichWord(loBit);
+  unsigned hiWord = whichWord(hiBit);
+
+  // Start with a mask for the low word: ones at loBit and above, zeros below.
+  uint64_t loMask = UINT64_MAX << whichBit(loBit);
+
+  // If hiBit is not word-aligned, its word is only partly set: use a high mask.
+  unsigned hiShiftAmt = whichBit(hiBit);
+  if (hiShiftAmt != 0) {
+    // High mask: ones below hiBit's in-word position, zeros from there upward.
+    uint64_t hiMask = UINT64_MAX >> (APINT_BITS_PER_WORD - hiShiftAmt);
+    // If loWord and hiWord are equal, the range lies in one word, so intersect
+    // the two masks. Otherwise set the partial bits in hiWord right away.
+    if (hiWord == loWord)
+      loMask &= hiMask;
+    else
+      pVal[hiWord] |= hiMask;
+  }
+  // Apply the (possibly narrowed) mask to the low word.
+  pVal[loWord] |= loMask;
+
+  // Fill any words strictly between loWord and hiWord with all ones.
+  for (unsigned word = loWord + 1; word < hiWord; ++word)
+    pVal[word] = UINT64_MAX;
+}
+
 /// Set the given bit to 0 whose position is given as "bitPosition".
 /// @brief Set a given bit to 0.
 void APInt::clearBit(unsigned bitPosition) {
@@ -563,6 +508,10 @@ void APInt::clearBit(unsigned bitPosition) {
 }
 
 /// @brief Toggle every bit to its opposite value.
+void APInt::flipAllBitsSlowCase() {
+  tcComplement(pVal, getNumWords());
+  clearUnusedBits();
+}
 
 /// Toggle a given bit to its opposite value whose position is given
 /// as "bitPosition".
@@ -573,9 +522,104 @@ void APInt::flipBit(unsigned bitPosition) {
   else setBit(bitPosition);
 }
 
+void APInt::insertBits(const APInt &subBits, unsigned bitPosition) {
+  unsigned subBitWidth = subBits.getBitWidth();
+  assert(0 < subBitWidth && (subBitWidth + bitPosition) <= BitWidth &&
+         "Illegal bit insertion");
+
+  // subBits is as wide as this value, so insertion is a direct copy.
+  if (subBitWidth == BitWidth) {
+    *this = subBits;
+    return;
+  }
+
+  // Single word result: clear the target field in VAL, then OR subBits in.
+  if (isSingleWord()) {
+    uint64_t mask = UINT64_MAX >> (APINT_BITS_PER_WORD - subBitWidth);
+    VAL &= ~(mask << bitPosition);
+    VAL |= (subBits.VAL << bitPosition);
+    return;
+  }
+
+  unsigned loBit = whichBit(bitPosition);
+  unsigned loWord = whichWord(bitPosition);
+  unsigned hi1Word = whichWord(bitPosition + subBitWidth - 1);
+
+  // First and last inserted bit share a word: same clear-then-OR on that word.
+  if (loWord == hi1Word) {
+    uint64_t mask = UINT64_MAX >> (APINT_BITS_PER_WORD - subBitWidth);
+    pVal[loWord] &= ~(mask << loBit);
+    pVal[loWord] |= (subBits.VAL << loBit);
+    return;
+  }
+
+  // Insertion starts on a word boundary of the destination.
+  if (loBit == 0) {
+    // Copy the whole words of subBits directly into place.
+    unsigned numWholeSubWords = subBitWidth / APINT_BITS_PER_WORD;
+    memcpy(pVal + loWord, subBits.getRawData(),
+           numWholeSubWords * APINT_WORD_SIZE);
+
+    // Clear the low remainingBits of the last word and OR in subBits' tail.
+    unsigned remainingBits = subBitWidth % APINT_BITS_PER_WORD;
+    if (remainingBits != 0) {
+      uint64_t mask = UINT64_MAX >> (APINT_BITS_PER_WORD - remainingBits);
+      pVal[hi1Word] &= ~mask;
+      pVal[hi1Word] |= subBits.getWord(subBitWidth - 1);
+    }
+    return;
+  }
+
+  // General case - set/clear individual bits in dst based on src.
+  // TODO - there is scope for optimization here, but at the moment this code
+  // path is barely used so prefer readability over performance.
+  for (unsigned i = 0; i != subBitWidth; ++i) {
+    if (subBits[i])
+      setBit(bitPosition + i);
+    else
+      clearBit(bitPosition + i);
+  }
+}
+
+APInt APInt::extractBits(unsigned numBits, unsigned bitPosition) const {
+  // Return bits [bitPosition, bitPosition + numBits) as a numBits-wide APInt.
+  assert(numBits > 0 && "Can't extract zero bits");
+  assert(bitPosition < BitWidth && (numBits + bitPosition) <= BitWidth &&
+         "Illegal bit extraction");
+
+  if (isSingleWord())
+    return APInt(numBits, VAL >> bitPosition);
+
+  unsigned loBit = whichBit(bitPosition);
+  unsigned loWord = whichWord(bitPosition);
+  unsigned hiWord = whichWord(bitPosition + numBits - 1);
+
+  // Single word result extracting bits from a single word source.
+  if (loWord == hiWord)
+    return APInt(numBits, pVal[loWord] >> loBit);
+
+  // A word-aligned start lets the result be copied straight from the source.
+  if (loBit == 0)
+    return APInt(numBits, makeArrayRef(pVal + loWord, 1 + hiWord - loWord));
+
+  // General case - shift + copy source words directly into place.
+  APInt Result(numBits, 0);
+  unsigned NumSrcWords = getNumWords();
+  unsigned NumDstWords = Result.getNumWords();
+  // A single-word Result stores its value in VAL, so pVal must not be used.
+  uint64_t *DestPtr = Result.isSingleWord() ? &Result.VAL : Result.pVal;
+  for (unsigned word = 0; word < NumDstWords; ++word) {
+    uint64_t w0 = pVal[loWord + word];
+    uint64_t w1 =
+        (loWord + word + 1) < NumSrcWords ? pVal[loWord + word + 1] : 0;
+    DestPtr[word] = (w0 >> loBit) | (w1 << (APINT_BITS_PER_WORD - loBit));
+  }
+  return Result.clearUnusedBits();
+}
+
 unsigned APInt::getBitsNeeded(StringRef str, uint8_t radix) {
   assert(!str.empty() && "Invalid string length");
-  assert((radix == 10 || radix == 8 || radix == 16 || radix == 2 || 
+  assert((radix == 10 || radix == 8 || radix == 16 || radix == 2 ||
           radix == 36) &&
          "Radix should be 2, 8, 10, 16, or 36!");
 
@@ -600,7 +644,7 @@ unsigned APInt::getBitsNeeded(StringRef str, uint8_t radix) {
     return slen * 4 + isNegative;
 
   // FIXME: base 36
-  
+
   // This is grossly inefficient but accurate. We could probably do something
   // with a computation of roughly slen*64/20 and then adjust by the value of
   // the first few digits. But, I'm not sure how accurate that could be.
@@ -609,7 +653,7 @@ unsigned APInt::getBitsNeeded(StringRef str, uint8_t radix) {
   // be too large. This avoids the assertion in the constructor. This
   // calculation doesn't work appropriately for the numbers 0-9, so just use 4
   // bits in that case.
-  unsigned sufficient 
+  unsigned sufficient
     = radix == 10? (slen == 1 ? 4 : slen * 64/18)
                  : (slen == 1 ? 7 : slen * 16/3);
 
@@ -643,19 +687,20 @@ bool APInt::isSplat(unsigned SplatSizeInBits) const {
 
 /// This function returns the high "numBits" bits of this APInt.
 APInt APInt::getHiBits(unsigned numBits) const {
-  return APIntOps::lshr(*this, BitWidth - numBits);
+  return this->lshr(BitWidth - numBits);
 }
 
 /// This function returns the low "numBits" bits of this APInt.
 APInt APInt::getLoBits(unsigned numBits) const {
-  return APIntOps::lshr(APIntOps::shl(*this, BitWidth - numBits),
-                        BitWidth - numBits);
+  APInt Result(getLowBitsSet(BitWidth, numBits));
+  Result &= *this;
+  return Result;
 }
 
 unsigned APInt::countLeadingZerosSlowCase() const {
   unsigned Count = 0;
   for (int i = getNumWords()-1; i >= 0; --i) {
-    integerPart V = pVal[i];
+    uint64_t V = pVal[i];
     if (V == 0)
       Count += APINT_BITS_PER_WORD;
     else {
@@ -794,13 +839,11 @@ APInt APInt::reverseBits() const {
   return Reversed;
 }
 
-APInt llvm::APIntOps::GreatestCommonDivisor(const APInt& API1,
-                                            const APInt& API2) {
-  APInt A = API1, B = API2;
+APInt llvm::APIntOps::GreatestCommonDivisor(APInt A, APInt B) {
   while (!!B) {
-    APInt T = B;
-    B = APIntOps::urem(A, B);
-    A = T;
+    APInt R = A.urem(B);
+    A = std::move(B);
+    B = std::move(R);
   }
   return A;
 }
@@ -1240,8 +1283,21 @@ APInt APInt::shlSlowCase(unsigned shiftAmt) const {
   return Result;
 }
 
+// Reduce rotateAmt modulo BitWidth and return the result as an unsigned.
+static unsigned rotateModulo(unsigned BitWidth, const APInt &rotateAmt) {
+  unsigned rotBitWidth = rotateAmt.getBitWidth();
+  APInt rot = rotateAmt;
+  if (rotBitWidth < BitWidth) {
+    // Widen a narrow rotate amount first, so that the urem divisor built below
+    // doesn't wrap to 0, e.g. APInt(1, 32) would give APInt(1, 0).
+    rot = rotateAmt.zext(BitWidth);
+  }
+  rot = rot.urem(APInt(rot.getBitWidth(), BitWidth));
+  return rot.getLimitedValue(BitWidth);
+}
+
 APInt APInt::rotl(const APInt &rotateAmt) const {
-  return rotl((unsigned)rotateAmt.getLimitedValue(BitWidth));
+  return rotl(rotateModulo(BitWidth, rotateAmt));
 }
 
 APInt APInt::rotl(unsigned rotateAmt) const {
@@ -1252,7 +1308,7 @@ APInt APInt::rotl(unsigned rotateAmt) const {
 }
 
 APInt APInt::rotr(const APInt &rotateAmt) const {
-  return rotr((unsigned)rotateAmt.getLimitedValue(BitWidth));
+  return rotr(rotateModulo(BitWidth, rotateAmt));
 }
 
 APInt APInt::rotr(unsigned rotateAmt) const {
@@ -1614,7 +1670,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r,
   if (r) {
     // The value d is expressed by the "shift" value above since we avoided
     // multiplication by d by using a shift left. So, all we have to do is
-    // shift right here. In order to mak
+    // shift right here.
     if (shift) {
       unsigned carry = 0;
       DEBUG(dbgs() << "KnuthDiv: remainder:");
@@ -2010,7 +2066,7 @@ APInt APInt::sdiv_ov(const APInt &RHS, bool &Overflow) const {
 
 APInt APInt::smul_ov(const APInt &RHS, bool &Overflow) const {
   APInt Res = *this * RHS;
-  
+
   if (*this != 0 && RHS != 0)
     Overflow = Res.sdiv(RHS) != *this || Res.sdiv(*this) != RHS;
   else
@@ -2037,7 +2093,7 @@ APInt APInt::sshl_ov(const APInt &ShAmt, bool &Overflow) const {
     Overflow = ShAmt.uge(countLeadingZeros());
   else
     Overflow = ShAmt.uge(countLeadingOnes());
-  
+
   return *this << ShAmt;
 }
 
@@ -2057,7 +2113,7 @@ APInt APInt::ushl_ov(const APInt &ShAmt, bool &Overflow) const {
 void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
   // Check our assumptions here
   assert(!str.empty() && "Invalid string length");
-  assert((radix == 10 || radix == 8 || radix == 16 || radix == 2 || 
+  assert((radix == 10 || radix == 8 || radix == 16 || radix == 2 ||
           radix == 36) &&
          "Radix should be 2, 8, 10, 16, or 36!");
 
@@ -2082,9 +2138,8 @@ void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
   // Figure out if we can shift instead of multiply
   unsigned shift = (radix == 16 ? 4 : radix == 8 ? 3 : radix == 2 ? 1 : 0);
 
-  // Set up an APInt for the digit to add outside the loop so we don't
+  // Set up an APInt for the radix multiplier outside the loop so we don't
   // constantly construct/destruct it.
-  APInt apdigit(getBitWidth(), 0);
   APInt apradix(getBitWidth(), radix);
 
   // Enter digit traversal loop
@@ -2101,11 +2156,7 @@ void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
     }
 
     // Add in the digit we just interpreted
-    if (apdigit.isSingleWord())
-      apdigit.VAL = digit;
-    else
-      apdigit.pVal[0] = digit;
-    *this += apdigit;
+    *this += digit;
   }
   // If its negative, put it in two's complement form
   if (isNeg) {
@@ -2116,7 +2167,7 @@ void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
 
 void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
                      bool Signed, bool formatAsCLiteral) const {
-  assert((Radix == 10 || Radix == 8 || Radix == 16 || Radix == 2 || 
+  assert((Radix == 10 || Radix == 8 || Radix == 16 || Radix == 2 ||
           Radix == 36) &&
          "Radix should be 2, 8, 10, 16, or 36!");
 
@@ -2204,7 +2255,7 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
 
   // For the 2, 8 and 16 bit cases, we can just shift instead of divide
   // because the number of bits per digit (1, 3 and 4 respectively) divides
-  // equaly.  We just shift until the value is zero.
+  // equally.  We just shift until the value is zero.
   if (Radix == 2 || Radix == 8 || Radix == 16) {
     // Just shift tmp right for each digit width until it becomes zero
     unsigned ShiftAmt = (Radix == 16 ? 4 : (Radix == 8 ? 3 : 1));
@@ -2247,7 +2298,7 @@ LLVM_DUMP_METHOD void APInt::dump() const {
   this->toStringUnsigned(U);
   this->toStringSigned(S);
   dbgs() << "APInt(" << BitWidth << "b, "
-         << U << "u " << S << "s)";
+         << U << "u " << S << "s)\n";
 }
 #endif
 
@@ -2262,83 +2313,60 @@ void APInt::print(raw_ostream &OS, bool isSigned) const {
 
 // Assumed by lowHalf, highHalf, partMSB and partLSB.  A fairly safe
 // and unrestricting assumption.
-static_assert(integerPartWidth % 2 == 0, "Part width must be divisible by 2!");
+static_assert(APInt::APINT_BITS_PER_WORD % 2 == 0,
+              "Part width must be divisible by 2!");
 
 /* Some handy functions local to this file.  */
-namespace {
 
-  /* Returns the integer part with the least significant BITS set.
-     BITS cannot be zero.  */
-  static inline integerPart
-  lowBitMask(unsigned int bits)
-  {
-    assert(bits != 0 && bits <= integerPartWidth);
+/* Returns the integer part with the least significant BITS set.
+   BITS cannot be zero.  */
+static inline APInt::WordType lowBitMask(unsigned bits) {
+  assert(bits != 0 && bits <= APInt::APINT_BITS_PER_WORD);
 
-    return ~(integerPart) 0 >> (integerPartWidth - bits);
-  }
+  return ~(APInt::WordType) 0 >> (APInt::APINT_BITS_PER_WORD - bits);
+}
 
-  /* Returns the value of the lower half of PART.  */
-  static inline integerPart
-  lowHalf(integerPart part)
-  {
-    return part & lowBitMask(integerPartWidth / 2);
-  }
+/* Returns the value of the lower half of PART.  */
+static inline APInt::WordType lowHalf(APInt::WordType part) {
+  return part & lowBitMask(APInt::APINT_BITS_PER_WORD / 2);
+}
 
-  /* Returns the value of the upper half of PART.  */
-  static inline integerPart
-  highHalf(integerPart part)
-  {
-    return part >> (integerPartWidth / 2);
-  }
+/* Returns the value of the upper half of PART.  */
+static inline APInt::WordType highHalf(APInt::WordType part) {
+  return part >> (APInt::APINT_BITS_PER_WORD / 2);
+}
 
-  /* Returns the bit number of the most significant set bit of a part.
-     If the input number has no bits set -1U is returned.  */
-  static unsigned int
-  partMSB(integerPart value)
-  {
-    return findLastSet(value, ZB_Max);
-  }
+/* Returns the bit number of the most significant set bit of a part.
+   If the input number has no bits set -1U is returned.  */
+static unsigned partMSB(APInt::WordType value) {
+  return findLastSet(value, ZB_Max);
+}
 
-  /* Returns the bit number of the least significant set bit of a
-     part.  If the input number has no bits set -1U is returned.  */
-  static unsigned int
-  partLSB(integerPart value)
-  {
-    return findFirstSet(value, ZB_Max);
-  }
+/* Returns the bit number of the least significant set bit of a
+   part.  If the input number has no bits set -1U is returned.  */
+static unsigned partLSB(APInt::WordType value) {
+  return findFirstSet(value, ZB_Max);
 }
 
 /* Sets the least significant part of a bignum to the input value, and
    zeroes out higher parts.  */
-void
-APInt::tcSet(integerPart *dst, integerPart part, unsigned int parts)
-{
-  unsigned int i;
-
+void APInt::tcSet(WordType *dst, WordType part, unsigned parts) {
   assert(parts > 0);
 
   dst[0] = part;
-  for (i = 1; i < parts; i++)
+  for (unsigned i = 1; i < parts; i++)
     dst[i] = 0;
 }
 
 /* Assign one bignum to another.  */
-void
-APInt::tcAssign(integerPart *dst, const integerPart *src, unsigned int parts)
-{
-  unsigned int i;
-
-  for (i = 0; i < parts; i++)
+void APInt::tcAssign(WordType *dst, const WordType *src, unsigned parts) {
+  for (unsigned i = 0; i < parts; i++)
     dst[i] = src[i];
 }
 
 /* Returns true if a bignum is zero, false otherwise.  */
-bool
-APInt::tcIsZero(const integerPart *src, unsigned int parts)
-{
-  unsigned int i;
-
-  for (i = 0; i < parts; i++)
+bool APInt::tcIsZero(const WordType *src, unsigned parts) {
+  for (unsigned i = 0; i < parts; i++)
     if (src[i])
       return false;
 
@@ -2346,41 +2374,29 @@ APInt::tcIsZero(const integerPart *src, unsigned int parts)
 }
 
 /* Extract the given bit of a bignum; returns 0 or 1.  */
-int
-APInt::tcExtractBit(const integerPart *parts, unsigned int bit)
-{
-  return (parts[bit / integerPartWidth] &
-          ((integerPart) 1 << bit % integerPartWidth)) != 0;
+int APInt::tcExtractBit(const WordType *parts, unsigned bit) {
+  return (parts[whichWord(bit)] & maskBit(bit)) != 0;
 }
 
 /* Set the given bit of a bignum. */
-void
-APInt::tcSetBit(integerPart *parts, unsigned int bit)
-{
-  parts[bit / integerPartWidth] |= (integerPart) 1 << (bit % integerPartWidth);
+void APInt::tcSetBit(WordType *parts, unsigned bit) {
+  parts[whichWord(bit)] |= maskBit(bit);
 }
 
 /* Clears the given bit of a bignum. */
-void
-APInt::tcClearBit(integerPart *parts, unsigned int bit)
-{
-  parts[bit / integerPartWidth] &=
-    ~((integerPart) 1 << (bit % integerPartWidth));
+void APInt::tcClearBit(WordType *parts, unsigned bit) {
+  parts[whichWord(bit)] &= ~maskBit(bit);
 }
 
 /* Returns the bit number of the least significant set bit of a
    number.  If the input number has no bits set -1U is returned.  */
-unsigned int
-APInt::tcLSB(const integerPart *parts, unsigned int n)
-{
-  unsigned int i, lsb;
+unsigned APInt::tcLSB(const WordType *parts, unsigned n) {
+  for (unsigned i = 0; i < n; i++) {
+    if (parts[i] != 0) {
+      unsigned lsb = partLSB(parts[i]);
 
-  for (i = 0; i < n; i++) {
-      if (parts[i] != 0) {
-          lsb = partLSB(parts[i]);
-
-          return lsb + i * integerPartWidth;
-      }
+      return lsb + i * APINT_BITS_PER_WORD;
+    }
   }
 
   return -1U;
@@ -2388,18 +2404,14 @@ APInt::tcLSB(const integerPart *parts, unsigned int n)
 
 /* Returns the bit number of the most significant set bit of a number.
    If the input number has no bits set -1U is returned.  */
-unsigned int
-APInt::tcMSB(const integerPart *parts, unsigned int n)
-{
-  unsigned int msb;
-
+unsigned APInt::tcMSB(const WordType *parts, unsigned n) {
   do {
     --n;
 
     if (parts[n] != 0) {
-      msb = partMSB(parts[n]);
+      unsigned msb = partMSB(parts[n]);
 
-      return msb + n * integerPartWidth;
+      return msb + n * APINT_BITS_PER_WORD;
     }
   } while (n);
 
@@ -2411,31 +2423,28 @@ APInt::tcMSB(const integerPart *parts, unsigned int n)
    the least significant bit of DST.  All high bits above srcBITS in
    DST are zero-filled.  */
 void
-APInt::tcExtract(integerPart *dst, unsigned int dstCount,const integerPart *src,
-                 unsigned int srcBits, unsigned int srcLSB)
-{
-  unsigned int firstSrcPart, dstParts, shift, n;
-
-  dstParts = (srcBits + integerPartWidth - 1) / integerPartWidth;
+APInt::tcExtract(WordType *dst, unsigned dstCount, const WordType *src,
+                 unsigned srcBits, unsigned srcLSB) {
+  unsigned dstParts = (srcBits + APINT_BITS_PER_WORD - 1) / APINT_BITS_PER_WORD;
   assert(dstParts <= dstCount);
 
-  firstSrcPart = srcLSB / integerPartWidth;
+  unsigned firstSrcPart = srcLSB / APINT_BITS_PER_WORD;
   tcAssign (dst, src + firstSrcPart, dstParts);
 
-  shift = srcLSB % integerPartWidth;
+  unsigned shift = srcLSB % APINT_BITS_PER_WORD;
   tcShiftRight (dst, dstParts, shift);
 
-  /* We now have (dstParts * integerPartWidth - shift) bits from SRC
+  /* We now have (dstParts * APINT_BITS_PER_WORD - shift) bits from SRC
      in DST.  If this is less that srcBits, append the rest, else
      clear the high bits.  */
-  n = dstParts * integerPartWidth - shift;
+  unsigned n = dstParts * APINT_BITS_PER_WORD - shift;
   if (n < srcBits) {
-    integerPart mask = lowBitMask (srcBits - n);
+    WordType mask = lowBitMask (srcBits - n);
     dst[dstParts - 1] |= ((src[firstSrcPart + dstParts] & mask)
-                          << n % integerPartWidth);
+                          << n % APINT_BITS_PER_WORD);
   } else if (n > srcBits) {
-    if (srcBits % integerPartWidth)
-      dst[dstParts - 1] &= lowBitMask (srcBits % integerPartWidth);
+    if (srcBits % APINT_BITS_PER_WORD)
+      dst[dstParts - 1] &= lowBitMask (srcBits % APINT_BITS_PER_WORD);
   }
 
   /* Clear high parts.  */
@@ -2444,18 +2453,12 @@ APInt::tcExtract(integerPart *dst, unsigned int dstCount,const integerPart *src,
 }
 
 /* DST += RHS + C where C is zero or one.  Returns the carry flag.  */
-integerPart
-APInt::tcAdd(integerPart *dst, const integerPart *rhs,
-             integerPart c, unsigned int parts)
-{
-  unsigned int i;
-
+APInt::WordType APInt::tcAdd(WordType *dst, const WordType *rhs,
+                             WordType c, unsigned parts) {
   assert(c <= 1);
 
-  for (i = 0; i < parts; i++) {
-    integerPart l;
-
-    l = dst[i];
+  for (unsigned i = 0; i < parts; i++) {
+    WordType l = dst[i];
     if (c) {
       dst[i] += rhs[i] + 1;
       c = (dst[i] <= l);
@@ -2469,18 +2472,12 @@ APInt::tcAdd(integerPart *dst, const integerPart *rhs,
 }
 
 /* DST -= RHS + C where C is zero or one.  Returns the carry flag.  */
-integerPart
-APInt::tcSubtract(integerPart *dst, const integerPart *rhs,
-                  integerPart c, unsigned int parts)
-{
-  unsigned int i;
-
+APInt::WordType APInt::tcSubtract(WordType *dst, const WordType *rhs,
+                                  WordType c, unsigned parts) {
   assert(c <= 1);
 
-  for (i = 0; i < parts; i++) {
-    integerPart l;
-
-    l = dst[i];
+  for (unsigned i = 0; i < parts; i++) {
+    WordType l = dst[i];
     if (c) {
       dst[i] -= rhs[i] + 1;
       c = (dst[i] >= l);
@@ -2494,9 +2491,7 @@ APInt::tcSubtract(integerPart *dst, const integerPart *rhs,
 }
 
 /* Negate a bignum in-place.  */
-void
-APInt::tcNegate(integerPart *dst, unsigned int parts)
-{
+void APInt::tcNegate(WordType *dst, unsigned parts) {
   tcComplement(dst, parts);
   tcIncrement(dst, parts);
 }
@@ -2512,23 +2507,20 @@ APInt::tcNegate(integerPart *dst, unsigned int parts)
     DSTPARTS parts of the result, and if all of the omitted higher
     parts were zero return zero, otherwise overflow occurred and
     return one.  */
-int
-APInt::tcMultiplyPart(integerPart *dst, const integerPart *src,
-                      integerPart multiplier, integerPart carry,
-                      unsigned int srcParts, unsigned int dstParts,
-                      bool add)
-{
-  unsigned int i, n;
-
+int APInt::tcMultiplyPart(WordType *dst, const WordType *src,
+                          WordType multiplier, WordType carry,
+                          unsigned srcParts, unsigned dstParts,
+                          bool add) {
   /* Otherwise our writes of DST kill our later reads of SRC.  */
   assert(dst <= src || dst >= src + srcParts);
   assert(dstParts <= srcParts + 1);
 
   /* N loops; minimum of dstParts and srcParts.  */
-  n = dstParts < srcParts ? dstParts: srcParts;
+  unsigned n = dstParts < srcParts ? dstParts: srcParts;
 
+  unsigned i;
   for (i = 0; i < n; i++) {
-    integerPart low, mid, high, srcPart;
+    WordType low, mid, high, srcPart;
 
       /* [ LOW, HIGH ] = MULTIPLIER * SRC[i] + DST[i] + CARRY.
 
@@ -2540,7 +2532,7 @@ APInt::tcMultiplyPart(integerPart *dst, const integerPart *src,
 
     srcPart = src[i];
 
-    if (multiplier == 0 || srcPart == 0)        {
+    if (multiplier == 0 || srcPart == 0) {
       low = carry;
       high = 0;
     } else {
@@ -2549,14 +2541,14 @@ APInt::tcMultiplyPart(integerPart *dst, const integerPart *src,
 
       mid = lowHalf(srcPart) * highHalf(multiplier);
       high += highHalf(mid);
-      mid <<= integerPartWidth / 2;
+      mid <<= APINT_BITS_PER_WORD / 2;
       if (low + mid < low)
         high++;
       low += mid;
 
       mid = highHalf(srcPart) * lowHalf(multiplier);
       high += highHalf(mid);
-      mid <<= integerPartWidth / 2;
+      mid <<= APINT_BITS_PER_WORD / 2;
       if (low + mid < low)
         high++;
       low += mid;
@@ -2605,19 +2597,14 @@ APInt::tcMultiplyPart(integerPart *dst, const integerPart *src,
    is filled with the least significant parts of the result.  Returns
    one if overflow occurred, otherwise zero.  DST must be disjoint
    from both operands.  */
-int
-APInt::tcMultiply(integerPart *dst, const integerPart *lhs,
-                  const integerPart *rhs, unsigned int parts)
-{
-  unsigned int i;
-  int overflow;
-
+int APInt::tcMultiply(WordType *dst, const WordType *lhs,
+                      const WordType *rhs, unsigned parts) {
   assert(dst != lhs && dst != rhs);
 
-  overflow = 0;
+  int overflow = 0;
   tcSet(dst, 0, parts);
 
-  for (i = 0; i < parts; i++)
+  for (unsigned i = 0; i < parts; i++)
     overflow |= tcMultiplyPart(&dst[i], lhs, rhs[i], 0, parts,
                                parts - i, true);
 
@@ -2628,25 +2615,21 @@ APInt::tcMultiply(integerPart *dst, const integerPart *lhs,
    operands.  No overflow occurs.  DST must be disjoint from both
    operands.  Returns the number of parts required to hold the
    result.  */
-unsigned int
-APInt::tcFullMultiply(integerPart *dst, const integerPart *lhs,
-                      const integerPart *rhs, unsigned int lhsParts,
-                      unsigned int rhsParts)
-{
+unsigned APInt::tcFullMultiply(WordType *dst, const WordType *lhs,
+                               const WordType *rhs, unsigned lhsParts,
+                               unsigned rhsParts) {
   /* Put the narrower number on the LHS for less loops below.  */
   if (lhsParts > rhsParts) {
     return tcFullMultiply (dst, rhs, lhs, rhsParts, lhsParts);
   } else {
-    unsigned int n;
-
     assert(dst != lhs && dst != rhs);
 
     tcSet(dst, 0, rhsParts);
 
-    for (n = 0; n < lhsParts; n++)
-      tcMultiplyPart(&dst[n], rhs, lhs[n], 0, rhsParts, rhsParts + 1, true);
+    for (unsigned i = 0; i < lhsParts; i++)
+      tcMultiplyPart(&dst[i], rhs, lhs[i], 0, rhsParts, rhsParts + 1, true);
 
-    n = lhsParts + rhsParts;
+    unsigned n = lhsParts + rhsParts;
 
     return n - (dst[n - 1] == 0);
   }
@@ -2662,23 +2645,18 @@ APInt::tcFullMultiply(integerPart *dst, const integerPart *lhs,
    use by the routine; its contents need not be initialized and are
    destroyed.  LHS, REMAINDER and SCRATCH must be distinct.
 */
-int
-APInt::tcDivide(integerPart *lhs, const integerPart *rhs,
-                integerPart *remainder, integerPart *srhs,
-                unsigned int parts)
-{
-  unsigned int n, shiftCount;
-  integerPart mask;
-
+int APInt::tcDivide(WordType *lhs, const WordType *rhs,
+                    WordType *remainder, WordType *srhs,
+                    unsigned parts) {
   assert(lhs != remainder && lhs != srhs && remainder != srhs);
 
-  shiftCount = tcMSB(rhs, parts) + 1;
+  unsigned shiftCount = tcMSB(rhs, parts) + 1;
   if (shiftCount == 0)
     return true;
 
-  shiftCount = parts * integerPartWidth - shiftCount;
-  n = shiftCount / integerPartWidth;
-  mask = (integerPart) 1 << (shiftCount % integerPartWidth);
+  shiftCount = parts * APINT_BITS_PER_WORD - shiftCount;
+  unsigned n = shiftCount / APINT_BITS_PER_WORD;
+  WordType mask = (WordType) 1 << (shiftCount % APINT_BITS_PER_WORD);
 
   tcAssign(srhs, rhs, parts);
   tcShiftLeft(srhs, parts, shiftCount);
@@ -2701,7 +2679,7 @@ APInt::tcDivide(integerPart *lhs, const integerPart *rhs,
       shiftCount--;
       tcShiftRight(srhs, parts, 1);
       if ((mask >>= 1) == 0) {
-        mask = (integerPart) 1 << (integerPartWidth - 1);
+        mask = (WordType) 1 << (APINT_BITS_PER_WORD - 1);
         n--;
       }
   }
@@ -2711,18 +2689,14 @@ APInt::tcDivide(integerPart *lhs, const integerPart *rhs,
 
 /* Shift a bignum left COUNT bits in-place.  Shifted in bits are zero.
    There are no restrictions on COUNT.  */
-void
-APInt::tcShiftLeft(integerPart *dst, unsigned int parts, unsigned int count)
-{
+void APInt::tcShiftLeft(WordType *dst, unsigned parts, unsigned count) {
   if (count) {
-    unsigned int jump, shift;
-
     /* Jump is the inter-part jump; shift is is intra-part shift.  */
-    jump = count / integerPartWidth;
-    shift = count % integerPartWidth;
+    unsigned jump = count / APINT_BITS_PER_WORD;
+    unsigned shift = count % APINT_BITS_PER_WORD;
 
     while (parts > jump) {
-      integerPart part;
+      WordType part;
 
       parts--;
 
@@ -2732,7 +2706,7 @@ APInt::tcShiftLeft(integerPart *dst, unsigned int parts, unsigned int count)
       if (shift) {
         part <<= shift;
         if (parts >= jump + 1)
-          part |= dst[parts - jump - 1] >> (integerPartWidth - shift);
+          part |= dst[parts - jump - 1] >> (APINT_BITS_PER_WORD - shift);
       }
 
       dst[parts] = part;
@@ -2745,20 +2719,16 @@ APInt::tcShiftLeft(integerPart *dst, unsigned int parts, unsigned int count)
 
 /* Shift a bignum right COUNT bits in-place.  Shifted in bits are
    zero.  There are no restrictions on COUNT.  */
-void
-APInt::tcShiftRight(integerPart *dst, unsigned int parts, unsigned int count)
-{
+void APInt::tcShiftRight(WordType *dst, unsigned parts, unsigned count) {
   if (count) {
-    unsigned int i, jump, shift;
-
     /* Jump is the inter-part jump; shift is is intra-part shift.  */
-    jump = count / integerPartWidth;
-    shift = count % integerPartWidth;
+    unsigned jump = count / APINT_BITS_PER_WORD;
+    unsigned shift = count % APINT_BITS_PER_WORD;
 
     /* Perform the shift.  This leaves the most significant COUNT bits
        of the result at zero.  */
-    for (i = 0; i < parts; i++) {
-      integerPart part;
+    for (unsigned i = 0; i < parts; i++) {
+      WordType part;
 
       if (i + jump >= parts) {
         part = 0;
@@ -2767,7 +2737,7 @@ APInt::tcShiftRight(integerPart *dst, unsigned int parts, unsigned int count)
         if (shift) {
           part >>= shift;
           if (i + jump + 1 < parts)
-            part |= dst[i + jump + 1] << (integerPartWidth - shift);
+            part |= dst[i + jump + 1] << (APINT_BITS_PER_WORD - shift);
         }
       }
 
@@ -2777,70 +2747,46 @@ APInt::tcShiftRight(integerPart *dst, unsigned int parts, unsigned int count)
 }
 
 /* Bitwise and of two bignums.  */
-void
-APInt::tcAnd(integerPart *dst, const integerPart *rhs, unsigned int parts)
-{
-  unsigned int i;
-
-  for (i = 0; i < parts; i++)
+void APInt::tcAnd(WordType *dst, const WordType *rhs, unsigned parts) {
+  for (unsigned i = 0; i < parts; i++)
     dst[i] &= rhs[i];
 }
 
 /* Bitwise inclusive or of two bignums.  */
-void
-APInt::tcOr(integerPart *dst, const integerPart *rhs, unsigned int parts)
-{
-  unsigned int i;
-
-  for (i = 0; i < parts; i++)
+void APInt::tcOr(WordType *dst, const WordType *rhs, unsigned parts) {
+  for (unsigned i = 0; i < parts; i++)
     dst[i] |= rhs[i];
 }
 
 /* Bitwise exclusive or of two bignums.  */
-void
-APInt::tcXor(integerPart *dst, const integerPart *rhs, unsigned int parts)
-{
-  unsigned int i;
-
-  for (i = 0; i < parts; i++)
+void APInt::tcXor(WordType *dst, const WordType *rhs, unsigned parts) {
+  for (unsigned i = 0; i < parts; i++)
     dst[i] ^= rhs[i];
 }
 
 /* Complement a bignum in-place.  */
-void
-APInt::tcComplement(integerPart *dst, unsigned int parts)
-{
-  unsigned int i;
-
-  for (i = 0; i < parts; i++)
+void APInt::tcComplement(WordType *dst, unsigned parts) {
+  for (unsigned i = 0; i < parts; i++)
     dst[i] = ~dst[i];
 }
 
 /* Comparison (unsigned) of two bignums.  */
-int
-APInt::tcCompare(const integerPart *lhs, const integerPart *rhs,
-                 unsigned int parts)
-{
+int APInt::tcCompare(const WordType *lhs, const WordType *rhs,
+                     unsigned parts) {
   while (parts) {
-      parts--;
-      if (lhs[parts] == rhs[parts])
-        continue;
+    parts--;
+    if (lhs[parts] == rhs[parts])
+      continue;
 
-      if (lhs[parts] > rhs[parts])
-        return 1;
-      else
-        return -1;
-    }
+    return (lhs[parts] > rhs[parts]) ? 1 : -1;
+  }
 
   return 0;
 }
 
 /* Increment a bignum in-place, return the carry flag.  */
-integerPart
-APInt::tcIncrement(integerPart *dst, unsigned int parts)
-{
-  unsigned int i;
-
+APInt::WordType APInt::tcIncrement(WordType *dst, unsigned parts) {
+  unsigned i;
   for (i = 0; i < parts; i++)
     if (++dst[i] != 0)
       break;
@@ -2849,9 +2795,8 @@ APInt::tcIncrement(integerPart *dst, unsigned int parts)
 }
 
 /* Decrement a bignum in-place, return the borrow flag.  */
-integerPart
-APInt::tcDecrement(integerPart *dst, unsigned int parts) {
-  for (unsigned int i = 0; i < parts; i++) {
+APInt::WordType APInt::tcDecrement(WordType *dst, unsigned parts) {
+  for (unsigned i = 0; i < parts; i++) {
     // If the current word is non-zero, then the decrement has no effect on the
     // higher-order words of the integer and no borrow can occur. Exit early.
     if (dst[i]--)
@@ -2864,20 +2809,16 @@ APInt::tcDecrement(integerPart *dst, unsigned int parts) {
 
 /* Set the least significant BITS bits of a bignum, clear the
    rest.  */
-void
-APInt::tcSetLeastSignificantBits(integerPart *dst, unsigned int parts,
-                                 unsigned int bits)
-{
-  unsigned int i;
-
-  i = 0;
-  while (bits > integerPartWidth) {
-    dst[i++] = ~(integerPart) 0;
-    bits -= integerPartWidth;
+void APInt::tcSetLeastSignificantBits(WordType *dst, unsigned parts,
+                                      unsigned bits) {
+  unsigned i = 0;
+  while (bits > APINT_BITS_PER_WORD) {
+    dst[i++] = ~(WordType) 0;
+    bits -= APINT_BITS_PER_WORD;
   }
 
   if (bits)
-    dst[i++] = ~(integerPart) 0 >> (integerPartWidth - bits);
+    dst[i++] = ~(WordType) 0 >> (APINT_BITS_PER_WORD - bits);
 
   while (i < parts)
     dst[i++] = 0;
diff --git a/lib/Support/BinaryStreamError.cpp b/lib/Support/BinaryStreamError.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..60f5e21f041ab34aea836dc65966321cebff1374
--- /dev/null
+++ b/lib/Support/BinaryStreamError.cpp
@@ -0,0 +1,56 @@
+//===- BinaryStreamError.cpp - Error extensions for streams -----*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BinaryStreamError.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+char BinaryStreamError::ID = 0;
+
+BinaryStreamError::BinaryStreamError(stream_error_code C)
+    : BinaryStreamError(C, "") {}
+
+BinaryStreamError::BinaryStreamError(StringRef Context)
+    : BinaryStreamError(stream_error_code::unspecified, Context) {}
+
+BinaryStreamError::BinaryStreamError(stream_error_code C, StringRef Context)
+    : Code(C) {
+  ErrMsg = "Stream Error: ";
+  switch (C) {
+  case stream_error_code::unspecified:
+    ErrMsg += "An unspecified error has occurred.";
+    break;
+  case stream_error_code::stream_too_short:
+    ErrMsg += "The stream is too short to perform the requested operation.";
+    break;
+  case stream_error_code::invalid_array_size:
+    ErrMsg += "The buffer size is not a multiple of the array element size.";
+    break;
+  case stream_error_code::invalid_offset:
+    ErrMsg += "The specified offset is invalid for the current stream.";
+    break;
+  case stream_error_code::filesystem_error:
+    ErrMsg += "An I/O error occurred on the file system.";
+    break;
+  }
+
+  if (!Context.empty()) {
+    ErrMsg += "  ";
+    ErrMsg += Context;
+  }
+}
+
+void BinaryStreamError::log(raw_ostream &OS) const { OS << ErrMsg << "\n"; }
+
+StringRef BinaryStreamError::getErrorMessage() const { return ErrMsg; }
+
+std::error_code BinaryStreamError::convertToErrorCode() const {
+  return inconvertibleErrorCode();
+}
diff --git a/lib/Support/BinaryStreamReader.cpp b/lib/Support/BinaryStreamReader.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c7a2e0ddb179b8519cd5407384efa9df13ec8c96
--- /dev/null
+++ b/lib/Support/BinaryStreamReader.cpp
@@ -0,0 +1,95 @@
+//===- BinaryStreamReader.cpp - Reads objects from a binary stream --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BinaryStreamReader.h"
+
+#include "llvm/Support/BinaryStreamError.h"
+#include "llvm/Support/BinaryStreamRef.h"
+
+using namespace llvm;
+
+BinaryStreamReader::BinaryStreamReader(BinaryStreamRef S)
+    : Stream(S), Offset(0) {}
+
+Error BinaryStreamReader::readLongestContiguousChunk(
+    ArrayRef<uint8_t> &Buffer) {
+  if (auto EC = Stream.readLongestContiguousChunk(Offset, Buffer))
+    return EC;
+  Offset += Buffer.size();
+  return Error::success();
+}
+
+Error BinaryStreamReader::readBytes(ArrayRef<uint8_t> &Buffer, uint32_t Size) {
+  if (auto EC = Stream.readBytes(Offset, Size, Buffer))
+    return EC;
+  Offset += Size;
+  return Error::success();
+}
+
+Error BinaryStreamReader::readCString(StringRef &Dest) {
+  // TODO: Scanning the buffer returned by readLongestContiguousChunk for the
+  // null terminator would be more efficient than reading object by object.
+
+  // Pass 1: walk forward one char at a time, counting up to the terminator.
+  const uint32_t StartOffset = getOffset();
+  uint32_t NumChars = 0;
+  const char *Cur;
+  for (;;) {
+    if (auto EC = readObject(Cur))
+      return EC;
+    if (*Cur == '\0')
+      break;
+    ++NumChars;
+  }
+  // Remember where the scan ended, then rewind to where it began.
+  const uint32_t EndOffset = getOffset();
+  setOffset(StartOffset);
+
+  // Pass 2: fetch the counted chars (terminator excluded) as one reference.
+  if (auto EC = readFixedString(Dest, NumChars))
+    return EC;
+  // Restore the offset that the length scan ended on.
+  setOffset(EndOffset);
+  return Error::success();
+}
+
+Error BinaryStreamReader::readFixedString(StringRef &Dest, uint32_t Length) {
+  ArrayRef<uint8_t> Bytes;
+  if (auto EC = readBytes(Bytes, Length))
+    return EC;
+  Dest = StringRef(reinterpret_cast<const char *>(Bytes.begin()), Bytes.size());
+  return Error::success();
+}
+
+Error BinaryStreamReader::readStreamRef(BinaryStreamRef &Ref) {
+  return readStreamRef(Ref, bytesRemaining());
+}
+
+Error BinaryStreamReader::readStreamRef(BinaryStreamRef &Ref, uint32_t Length) {
+  if (bytesRemaining() < Length)
+    return make_error<BinaryStreamError>(stream_error_code::stream_too_short);
+  Ref = Stream.slice(Offset, Length);
+  Offset += Length;
+  return Error::success();
+}
+
+Error BinaryStreamReader::skip(uint32_t Amount) {
+  if (Amount > bytesRemaining())
+    return make_error<BinaryStreamError>(stream_error_code::stream_too_short);
+  Offset += Amount;
+  return Error::success();
+}
+
+uint8_t BinaryStreamReader::peek() const {
+  ArrayRef<uint8_t> Buffer;
+  auto EC = Stream.readBytes(Offset, 1, Buffer);
+  assert(!EC && "Cannot peek an empty buffer!");
+  llvm::consumeError(std::move(EC));
+  return Buffer[0];
+}
diff --git a/lib/Support/BinaryStreamWriter.cpp b/lib/Support/BinaryStreamWriter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d60b75642d0f148cd59a21f09333beb2da087943
--- /dev/null
+++ b/lib/Support/BinaryStreamWriter.cpp
@@ -0,0 +1,68 @@
+//===- BinaryStreamWriter.cpp - Writes objects to a BinaryStream ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BinaryStreamWriter.h"
+
+#include "llvm/Support/BinaryStreamError.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamRef.h"
+
+using namespace llvm;
+
+BinaryStreamWriter::BinaryStreamWriter(WritableBinaryStreamRef S)
+    : Stream(S), Offset(0) {}
+
+Error BinaryStreamWriter::writeBytes(ArrayRef<uint8_t> Buffer) {
+  if (auto EC = Stream.writeBytes(Offset, Buffer))
+    return EC;
+  Offset += Buffer.size();
+  return Error::success();
+}
+
+Error BinaryStreamWriter::writeCString(StringRef Str) {
+  if (auto EC = writeFixedString(Str))
+    return EC;
+  if (auto EC = writeObject('\0'))
+    return EC;
+
+  return Error::success();
+}
+
+Error BinaryStreamWriter::writeFixedString(StringRef Str) {
+  return writeBytes(ArrayRef<uint8_t>(Str.bytes_begin(), Str.bytes_end()));
+}
+
+Error BinaryStreamWriter::writeStreamRef(BinaryStreamRef Ref) {
+  return writeStreamRef(Ref, Ref.getLength());
+}
+
+Error BinaryStreamWriter::writeStreamRef(BinaryStreamRef Ref, uint32_t Length) {
+  BinaryStreamReader SrcReader(Ref.slice(0, Length));
+  // This is a bit tricky.  If we just call readBytes, we are requiring that it
+  // return us the entire stream as a contiguous buffer.  There is no guarantee
+  // this can be satisfied by returning a reference straight from the buffer, as
+  // an implementation may not store all data in a single contiguous buffer.  So
+  // we iterate over each contiguous chunk, writing each one in succession.
+  while (SrcReader.bytesRemaining() > 0) {
+    ArrayRef<uint8_t> Chunk;
+    if (auto EC = SrcReader.readLongestContiguousChunk(Chunk))
+      return EC;
+    if (auto EC = writeBytes(Chunk))
+      return EC;
+  }
+  return Error::success();
+}
+
+Error BinaryStreamWriter::padToAlignment(uint32_t Align) {
+  uint32_t NewOffset = alignTo(Offset, Align);
+  if (NewOffset > getLength())
+    return make_error<BinaryStreamError>(stream_error_code::stream_too_short);
+  Offset = NewOffset;
+  return Error::success();
+}
diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt
index 3301364cf2c6ad52849c40ff5832722f3aa8c4de..491614b4bf632c5e12bdd948578011d6897109ce 100644
--- a/lib/Support/CMakeLists.txt
+++ b/lib/Support/CMakeLists.txt
@@ -9,6 +9,9 @@ elseif( CMAKE_HOST_UNIX )
   if( HAVE_LIBDL )
     set(system_libs ${system_libs} ${CMAKE_DL_LIBS})
   endif()
+  if( HAVE_BACKTRACE )
+    set(system_libs ${system_libs} ${Backtrace_LIBRARIES})
+  endif()
   if(LLVM_ENABLE_TERMINFO)
     if(HAVE_TERMINFO)
       set(system_libs ${system_libs} ${TERMINFO_LIBS})
@@ -17,7 +20,7 @@ elseif( CMAKE_HOST_UNIX )
   if( LLVM_ENABLE_THREADS AND HAVE_LIBATOMIC )
     set(system_libs ${system_libs} atomic)
   endif()
-  set(system_libs ${system_libs} ${PTHREAD_LIB})
+  set(system_libs ${system_libs} ${LLVM_PTHREAD_LIB})
   if ( LLVM_ENABLE_ZLIB AND HAVE_LIBZ )
     set(system_libs ${system_libs} z)
   endif()
@@ -34,6 +37,9 @@ add_llvm_library(LLVMSupport
   ARMAttributeParser.cpp
   ARMWinEH.cpp
   Allocator.cpp
+  BinaryStreamError.cpp
+  BinaryStreamReader.cpp
+  BinaryStreamWriter.cpp
   BlockFrequency.cpp
   BranchProbability.cpp
   CachePruning.cpp
@@ -47,6 +53,7 @@ add_llvm_library(LLVMSupport
   CrashRecoveryContext.cpp
   DataExtractor.cpp
   Debug.cpp
+  DebugCounter.cpp
   DeltaAlgorithm.cpp
   DAGDeltaAlgorithm.cpp
   Dwarf.cpp
@@ -67,6 +74,7 @@ add_llvm_library(LLVMSupport
   LineIterator.cpp
   Locale.cpp
   LockFileManager.cpp
+  LowLevelType.cpp
   ManagedStatic.cpp
   MathExtras.cpp
   MemoryBuffer.cpp
@@ -135,7 +143,7 @@ add_llvm_library(LLVMSupport
   Windows
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/ADT
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/Support
-
+  ${Backtrace_INCLUDE_DIRS}
   LINK_LIBS ${system_libs}
   )
 
diff --git a/lib/Support/CachePruning.cpp b/lib/Support/CachePruning.cpp
index 3831625962ca7fb7e3f6279e3417e8bd9a2cff5a..aca1236395655839ca60fca32252113e4139e344 100644
--- a/lib/Support/CachePruning.cpp
+++ b/lib/Support/CachePruning.cpp
@@ -15,6 +15,7 @@
 
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
@@ -33,8 +34,75 @@ static void writeTimestampFile(StringRef TimestampFile) {
   raw_fd_ostream Out(TimestampFile.str(), EC, sys::fs::F_None);
 }
 
+static Expected<std::chrono::seconds> parseDuration(StringRef Duration) {
+  // Expected form: an unsigned integer followed by one unit letter (s/m/h).
+  if (Duration.empty())
+    return make_error<StringError>("Duration must not be empty",
+                                   inconvertibleErrorCode());
+
+  StringRef Digits = Duration.drop_back(); // everything but the unit letter
+  uint64_t Count;
+  if (Digits.getAsInteger(0, Count))
+    return make_error<StringError>("'" + Digits + "' not an integer",
+                                   inconvertibleErrorCode());
+
+  // The final character selects the unit; the result is returned as seconds.
+  const char Unit = Duration.back();
+  if (Unit == 's')
+    return std::chrono::seconds(Count);
+  if (Unit == 'm')
+    return std::chrono::minutes(Count);
+  if (Unit == 'h')
+    return std::chrono::hours(Count);
+  return make_error<StringError>("'" + Duration +
+                                     "' must end with one of 's', 'm' or 'h'",
+                                 inconvertibleErrorCode());
+}
+
+Expected<CachePruningPolicy>
+llvm::parseCachePruningPolicy(StringRef PolicyStr) {
+  CachePruningPolicy Policy;
+  std::pair<StringRef, StringRef> P = {"", PolicyStr};
+  while (!P.second.empty()) {
+    P = P.second.split(':');
+    // P.first is the next ':'-separated "key=value" entry; dispatch on key.
+    StringRef Key, Value;
+    std::tie(Key, Value) = P.first.split('=');
+    if (Key == "prune_interval") {
+      auto DurationOrErr = parseDuration(Value);
+      if (!DurationOrErr)
+        return DurationOrErr.takeError();
+      Policy.Interval = *DurationOrErr;
+    } else if (Key == "prune_after") {
+      auto DurationOrErr = parseDuration(Value);
+      if (!DurationOrErr)
+        return DurationOrErr.takeError();
+      Policy.Expiration = *DurationOrErr;
+    } else if (Key == "cache_size") {
+      if (!Value.endswith("%")) // Unlike back(), safe when Value is empty.
+        return make_error<StringError>("'" + Value + "' must be a percentage",
+                                       inconvertibleErrorCode());
+      StringRef SizeStr = Value.slice(0, Value.size() - 1);
+      uint64_t Size;
+      if (SizeStr.getAsInteger(0, Size))
+        return make_error<StringError>("'" + SizeStr + "' not an integer",
+                                       inconvertibleErrorCode());
+      if (Size > 100)
+        return make_error<StringError>("'" + SizeStr +
+                                           "' must be between 0 and 100",
+                                       inconvertibleErrorCode());
+      Policy.PercentageOfAvailableSpace = Size;
+    } else {
+      return make_error<StringError>("Unknown key: '" + Key + "'",
+                                     inconvertibleErrorCode());
+    }
+  }
+  // All entries parsed; fields whose key never appeared were not assigned.
+  return Policy;
+}
+
 /// Prune the cache of files that haven't been accessed in a long time.
-bool CachePruning::prune() {
+bool llvm::pruneCache(StringRef Path, CachePruningPolicy Policy) {
   using namespace std::chrono;
 
   if (Path.empty())
@@ -47,7 +115,11 @@ bool CachePruning::prune() {
   if (!isPathDir)
     return false;
 
-  if (Expiration == seconds(0) && PercentageOfAvailableSpace == 0) {
+  Policy.PercentageOfAvailableSpace =
+      std::min(Policy.PercentageOfAvailableSpace, 100u);
+
+  if (Policy.Expiration == seconds(0) &&
+      Policy.PercentageOfAvailableSpace == 0) {
     DEBUG(dbgs() << "No pruning settings set, exit early\n");
     // Nothing will be pruned, early exit
     return false;
@@ -67,12 +139,12 @@ bool CachePruning::prune() {
       return false;
     }
   } else {
-    if (Interval == seconds(0)) {
+    if (Policy.Interval == seconds(0)) {
       // Check whether the time stamp is older than our pruning interval.
       // If not, do nothing.
       const auto TimeStampModTime = FileStatus.getLastModificationTime();
       auto TimeStampAge = CurrentTime - TimeStampModTime;
-      if (TimeStampAge <= Interval) {
+      if (TimeStampAge <= Policy.Interval) {
         DEBUG(dbgs() << "Timestamp file too recent ("
                      << duration_cast<seconds>(TimeStampAge).count()
                      << "s old), do not prune.\n");
@@ -85,7 +157,7 @@ bool CachePruning::prune() {
     writeTimestampFile(TimestampFile);
   }
 
-  bool ShouldComputeSize = (PercentageOfAvailableSpace > 0);
+  bool ShouldComputeSize = (Policy.PercentageOfAvailableSpace > 0);
 
   // Keep track of space
   std::set<std::pair<uint64_t, std::string>> FileSizes;
@@ -108,8 +180,11 @@ bool CachePruning::prune() {
   // Walk all of the files within this directory.
   for (sys::fs::directory_iterator File(CachePathNative, EC), FileEnd;
        File != FileEnd && !EC; File.increment(EC)) {
-    // Do not touch the timestamp.
-    if (File->path() == TimestampFile)
+    // Ignore any files not beginning with the string "llvmcache-". This
+    // includes the timestamp file as well as any files created by the user.
+    // This acts as a safeguard against data loss if the user specifies the
+    // wrong directory as their cache directory.
+    if (!sys::path::filename(File->path()).startswith("llvmcache-"))
       continue;
 
     // Look at this file. If we can't stat it, there's nothing interesting
@@ -122,7 +197,7 @@ bool CachePruning::prune() {
     // If the file hasn't been used recently enough, delete it
     const auto FileAccessTime = FileStatus.getLastAccessedTime();
     auto FileAge = CurrentTime - FileAccessTime;
-    if (FileAge > Expiration) {
+    if (FileAge > Policy.Expiration) {
       DEBUG(dbgs() << "Remove " << File->path() << " ("
                    << duration_cast<seconds>(FileAge).count() << "s old)\n");
       sys::fs::remove(File->path());
@@ -143,9 +218,11 @@ bool CachePruning::prune() {
     auto AvailableSpace = TotalSize + SpaceInfo.free;
     auto FileAndSize = FileSizes.rbegin();
     DEBUG(dbgs() << "Occupancy: " << ((100 * TotalSize) / AvailableSpace)
-                 << "% target is: " << PercentageOfAvailableSpace << "\n");
+                 << "% target is: " << Policy.PercentageOfAvailableSpace
+                 << "\n");
     // Remove the oldest accessed files first, till we get below the threshold
-    while (((100 * TotalSize) / AvailableSpace) > PercentageOfAvailableSpace &&
+    while (((100 * TotalSize) / AvailableSpace) >
+               Policy.PercentageOfAvailableSpace &&
            FileAndSize != FileSizes.rend()) {
       // Remove the file.
       sys::fs::remove(FileAndSize->second);
diff --git a/lib/Support/Chrono.cpp b/lib/Support/Chrono.cpp
index cdadbd87997948dacc2c7151bb4acb71394f1d47..daccaf1fc103d58e4590afb32dcade88663649a7 100644
--- a/lib/Support/Chrono.cpp
+++ b/lib/Support/Chrono.cpp
@@ -16,6 +16,13 @@ namespace llvm {
 
 using namespace sys;
 
+const char llvm::detail::unit<std::ratio<3600>>::value[] = "h";
+const char llvm::detail::unit<std::ratio<60>>::value[] = "m";
+const char llvm::detail::unit<std::ratio<1>>::value[] = "s";
+const char llvm::detail::unit<std::milli>::value[] = "ms";
+const char llvm::detail::unit<std::micro>::value[] = "us";
+const char llvm::detail::unit<std::nano>::value[] = "ns";
+
 static inline struct tm getStructTM(TimePoint<> TP) {
   struct tm Storage;
   std::time_t OurTime = toTimeT(TP);
diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp
index 3889902eea54a208eb5bf209ac15381603d85cde..f4a9108b8544e5ea776b242a59ac4b2906acf696 100644
--- a/lib/Support/CommandLine.cpp
+++ b/lib/Support/CommandLine.cpp
@@ -123,7 +123,7 @@ public:
   void ResetAllOptionOccurrences();
 
   bool ParseCommandLineOptions(int argc, const char *const *argv,
-                               StringRef Overview, bool IgnoreErrors);
+                               StringRef Overview, raw_ostream *Errs = nullptr);
 
   void addLiteralOption(Option &Opt, SubCommand *SC, StringRef Name) {
     if (Opt.hasArgStr())
@@ -1013,9 +1013,9 @@ void cl::ParseEnvironmentOptions(const char *progName, const char *envVar,
 }
 
 bool cl::ParseCommandLineOptions(int argc, const char *const *argv,
-                                 StringRef Overview, bool IgnoreErrors) {
+                                 StringRef Overview, raw_ostream *Errs) {
   return GlobalParser->ParseCommandLineOptions(argc, argv, Overview,
-                                               IgnoreErrors);
+                                               Errs);
 }
 
 void CommandLineParser::ResetAllOptionOccurrences() {
@@ -1030,7 +1030,7 @@ void CommandLineParser::ResetAllOptionOccurrences() {
 bool CommandLineParser::ParseCommandLineOptions(int argc,
                                                 const char *const *argv,
                                                 StringRef Overview,
-                                                bool IgnoreErrors) {
+                                                raw_ostream *Errs) {
   assert(hasOptions() && "No options specified!");
 
   // Expand response files.
@@ -1045,6 +1045,9 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
   ProgramName = sys::path::filename(StringRef(argv[0]));
 
   ProgramOverview = Overview;
+  bool IgnoreErrors = Errs;
+  if (!Errs)
+    Errs = &errs();
   bool ErrorParsing = false;
 
   // Check out the positional arguments to collect information about them.
@@ -1097,15 +1100,14 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
         // not specified after an option that eats all extra arguments, or this
         // one will never get any!
         //
-        if (!IgnoreErrors) {
+        if (!IgnoreErrors)
           Opt->error("error - option can never match, because "
                      "another positional argument will match an "
                      "unbounded number of values, and this option"
                      " does not require a value!");
-          errs() << ProgramName << ": CommandLine Error: Option '"
-                 << Opt->ArgStr << "' is all messed up!\n";
-          errs() << PositionalOpts.size();
-        }
+        *Errs << ProgramName << ": CommandLine Error: Option '" << Opt->ArgStr
+              << "' is all messed up!\n";
+        *Errs << PositionalOpts.size();
         ErrorParsing = true;
       }
       UnboundedFound |= EatsUnboundedNumberOfValues(Opt);
@@ -1200,15 +1202,13 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
 
     if (!Handler) {
       if (SinkOpts.empty()) {
-        if (!IgnoreErrors) {
-          errs() << ProgramName << ": Unknown command line argument '"
-                 << argv[i] << "'.  Try: '" << argv[0] << " -help'\n";
-
-          if (NearestHandler) {
-            // If we know a near match, report it as well.
-            errs() << ProgramName << ": Did you mean '-" << NearestHandlerString
-                   << "'?\n";
-          }
+        *Errs << ProgramName << ": Unknown command line argument '" << argv[i]
+              << "'.  Try: '" << argv[0] << " -help'\n";
+
+        if (NearestHandler) {
+          // If we know a near match, report it as well.
+          *Errs << ProgramName << ": Did you mean '-" << NearestHandlerString
+                 << "'?\n";
         }
 
         ErrorParsing = true;
@@ -1231,22 +1231,18 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
 
   // Check and handle positional arguments now...
   if (NumPositionalRequired > PositionalVals.size()) {
-    if (!IgnoreErrors) {
-      errs() << ProgramName
+      *Errs << ProgramName
              << ": Not enough positional command line arguments specified!\n"
              << "Must specify at least " << NumPositionalRequired
              << " positional argument" << (NumPositionalRequired > 1 ? "s" : "")
              << ": See: " << argv[0] << " - help\n";
-    }
 
     ErrorParsing = true;
   } else if (!HasUnlimitedPositionals &&
              PositionalVals.size() > PositionalOpts.size()) {
-    if (!IgnoreErrors) {
-      errs() << ProgramName << ": Too many positional arguments specified!\n"
-             << "Can specify at most " << PositionalOpts.size()
-             << " positional arguments: See: " << argv[0] << " -help\n";
-    }
+    *Errs << ProgramName << ": Too many positional arguments specified!\n"
+          << "Can specify at most " << PositionalOpts.size()
+          << " positional arguments: See: " << argv[0] << " -help\n";
     ErrorParsing = true;
 
   } else if (!ConsumeAfterOpt) {
@@ -1404,8 +1400,8 @@ static StringRef getValueStr(const Option &O, StringRef DefaultMsg) {
 // Return the width of the option tag for printing...
 size_t alias::getOptionWidth() const { return ArgStr.size() + 6; }
 
-static void printHelpStr(StringRef HelpStr, size_t Indent,
-                         size_t FirstLineIndentedBy) {
+void Option::printHelpStr(StringRef HelpStr, size_t Indent,
+                                 size_t FirstLineIndentedBy) {
   std::pair<StringRef, StringRef> Split = HelpStr.split('\n');
   outs().indent(Indent - FirstLineIndentedBy) << " - " << Split.first << "\n";
   while (!Split.second.empty()) {
@@ -1448,7 +1444,7 @@ void basic_parser_impl::printOptionInfo(const Option &O,
   if (!ValName.empty())
     outs() << "=<" << getValueStr(O, ValName) << '>';
 
-  printHelpStr(O.HelpStr, GlobalWidth, getOptionWidth(O));
+  Option::printHelpStr(O.HelpStr, GlobalWidth, getOptionWidth(O));
 }
 
 void basic_parser_impl::printOptionName(const Option &O,
@@ -1587,7 +1583,7 @@ void generic_parser_base::printOptionInfo(const Option &O,
                                           size_t GlobalWidth) const {
   if (O.hasArgStr()) {
     outs() << "  -" << O.ArgStr;
-    printHelpStr(O.HelpStr, GlobalWidth, O.ArgStr.size() + 6);
+    Option::printHelpStr(O.HelpStr, GlobalWidth, O.ArgStr.size() + 6);
 
     for (unsigned i = 0, e = getNumOptions(); i != e; ++i) {
       size_t NumSpaces = GlobalWidth - getOption(i).size() - 8;
@@ -1600,7 +1596,7 @@ void generic_parser_base::printOptionInfo(const Option &O,
     for (unsigned i = 0, e = getNumOptions(); i != e; ++i) {
       auto Option = getOption(i);
       outs() << "    -" << Option;
-      printHelpStr(getDescription(i), GlobalWidth, Option.size() + 8);
+      Option::printHelpStr(getDescription(i), GlobalWidth, Option.size() + 8);
     }
   }
 }
@@ -1856,10 +1852,11 @@ public:
 
   // Helper function for printOptions().
   // It shall return a negative value if A's name should be lexicographically
-  // ordered before B's name. It returns a value greater equal zero otherwise.
+  // ordered before B's name. It returns a value greater than zero if B's name
+  // should be ordered before A's name, and it returns 0 otherwise.
   static int OptionCategoryCompare(OptionCategory *const *A,
                                    OptionCategory *const *B) {
-    return (*A)->getName() == (*B)->getName();
+    return (*A)->getName().compare((*B)->getName());
   }
 
   // Make sure we inherit our base class's operator=()
@@ -2182,5 +2179,6 @@ void cl::ResetAllOptionOccurrences() {
 
 void LLVMParseCommandLineOptions(int argc, const char *const *argv,
                                  const char *Overview) {
-  llvm::cl::ParseCommandLineOptions(argc, argv, StringRef(Overview), true);
+  llvm::cl::ParseCommandLineOptions(argc, argv, StringRef(Overview),
+                                    &llvm::nulls());
 }
diff --git a/lib/Support/DebugCounter.cpp b/lib/Support/DebugCounter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..29dae8a20f00f312d8f6e4f7db4292f5048897cc
--- /dev/null
+++ b/lib/Support/DebugCounter.cpp
@@ -0,0 +1,108 @@
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/Options.h"
+
+using namespace llvm;
+
+// This class overrides the default list implementation of printing so we
+// can pretty print the list of debug counter options.  This type of
+// dynamic option is pretty rare (basically this and pass lists).
+class DebugCounterList : public cl::list<std::string, DebugCounter> {
+private:
+  using Base = cl::list<std::string, DebugCounter>;
+
+public:
+  template <class... Mods>
+  explicit DebugCounterList(Mods &&... Ms) : Base(std::forward<Mods>(Ms)...) {}
+
+private:
+  void printOptionInfo(size_t GlobalWidth) const override {
+    // This is a variant of generic_parser_base::printOptionInfo.  Sadly,
+    // it's not easy to make it more usable.  We could get it to print these as
+    // options if we were a cl::opt and registered them, but lists don't have
+    // options, nor does the parser for std::string.  The other mechanisms for
+    // options are global and would pollute the global namespace with our
+    // counters.  Rather than go that route, we have just overridden the
+    // printing, which only a few things call anyway.
+    outs() << "  -" << ArgStr;
+    // All of the other options in CommandLine.cpp use ArgStr.size() + 6 for
+    // width, so we do the same.
+    Option::printHelpStr(HelpStr, GlobalWidth, ArgStr.size() + 6);
+    const auto &CounterInstance = DebugCounter::instance();
+    for (auto Name : CounterInstance) { // one "=<first> - <second>" row each
+      const auto Info =
+          CounterInstance.getCounterInfo(CounterInstance.getCounterId(Name));
+      size_t NumSpaces = GlobalWidth - Info.first.size() - 8;
+      outs() << "    =" << Info.first;
+      outs().indent(NumSpaces) << " -   " << Info.second << '\n';
+    }
+  }
+};
+
+// Create our command line option.
+static DebugCounterList DebugCounterOption(
+    "debug-counter",
+    cl::desc("Comma separated list of debug counter skip and count"),
+    cl::CommaSeparated, cl::ZeroOrMore, cl::location(DebugCounter::instance()));
+
+static ManagedStatic<DebugCounter> DC;
+
+DebugCounter &DebugCounter::instance() { return *DC; }
+
+// This is called by the command line parser when it sees a value for the
+// debug-counter option defined above.
+void DebugCounter::push_back(const std::string &Val) {
+  // Receives one element of the debug-counter option list at a time.
+  if (Val.empty())
+    return;
+
+  // Each element must look like <name>-skip=<n> or <name>-count=<n>.
+  const auto Split = StringRef(Val).split('=');
+  const StringRef Key = Split.first;
+  const StringRef ValueStr = Split.second;
+  if (ValueStr.empty()) {
+    errs() << "DebugCounter Error: " << Val << " does not have an = in it\n";
+    return;
+  }
+
+  // Parse the number first, so a bad number is reported before a bad suffix.
+  long CounterVal;
+  if (ValueStr.getAsInteger(0, CounterVal)) {
+    errs() << "DebugCounter Error: " << ValueStr
+           << " is not a number\n";
+    return;
+  }
+
+  // Decide which half of the stored pair this element sets.
+  const bool IsSkip = Key.endswith("-skip");
+  if (!IsSkip && !Key.endswith("-count")) {
+    errs() << "DebugCounter Error: " << Key
+           << " does not end with -skip or -count\n";
+    return;
+  }
+
+  // Drop the suffix (5 chars for "-skip", 6 for "-count") to get the name.
+  auto CounterName = Key.drop_back(IsSkip ? 5 : 6);
+  unsigned CounterID = RegisteredCounters.idFor(CounterName);
+  if (!CounterID) {
+    errs() << "DebugCounter Error: " << CounterName
+           << " is not a registered counter\n";
+    return;
+  }
+
+  // Insert {0, -1} for this ID, then overwrite only the selected half.
+  auto Res = Counters.insert({CounterID, {0, -1}});
+  if (IsSkip)
+    Res.first->second.first = CounterVal;
+  else
+    Res.first->second.second = CounterVal;
+}
+
+void DebugCounter::print(raw_ostream &OS) {
+  OS << "Counters and values:\n";
+  for (const auto &KV : Counters)
+    OS << left_justify(RegisteredCounters[KV.first], 32) << ": {"
+       << KV.second.first << "," << KV.second.second << "}\n";
+}
diff --git a/lib/Support/Dwarf.cpp b/lib/Support/Dwarf.cpp
index 8950e8c919a4fd624319427e3e675718e9efb5b7..f13da62e4a87cdbb6ea9fa33dfdc6617388d256d 100644
--- a/lib/Support/Dwarf.cpp
+++ b/lib/Support/Dwarf.cpp
@@ -304,6 +304,17 @@ StringRef llvm::dwarf::ApplePropertyString(unsigned Prop) {
   }
 }
 
+StringRef llvm::dwarf::UnitTypeString(unsigned UT) {
+  switch (UT) {
+  default:
+    return StringRef();
+#define HANDLE_DW_UT(ID, NAME)                                                 \
+  case DW_UT_##NAME:                                                           \
+    return "DW_UT_" #NAME;
+#include "llvm/Support/Dwarf.def"
+  }
+}
+
 StringRef llvm::dwarf::AtomTypeString(unsigned AT) {
   switch (AT) {
   case dwarf::DW_ATOM_null:
diff --git a/lib/Support/DynamicLibrary.cpp b/lib/Support/DynamicLibrary.cpp
index ced21e46afe80efc4abd19ec827a0574a69c1f6b..92ce6185306afdb9d899050dc02c69dc5507a306 100644
--- a/lib/Support/DynamicLibrary.cpp
+++ b/lib/Support/DynamicLibrary.cpp
@@ -9,8 +9,6 @@
 //
 //  This file implements the operating system DynamicLibrary concept.
 //
-// FIXME: This file leaks ExplicitSymbols and OpenedHandles!
-//
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/DynamicLibrary.h"
@@ -51,7 +49,7 @@ using namespace llvm::sys;
 //===          independent code.
 //===----------------------------------------------------------------------===//
 
-static DenseSet<void *> *OpenedHandles = nullptr;
+static llvm::ManagedStatic<DenseSet<void *> > OpenedHandles;
 
 DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
                                                    std::string *errMsg) {
@@ -70,9 +68,6 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
     handle = RTLD_DEFAULT;
 #endif
 
-  if (!OpenedHandles)
-    OpenedHandles = new DenseSet<void *>();
-
   // If we've already loaded this library, dlclose() the handle in order to
   // keep the internal refcount at +1.
   if (!OpenedHandles->insert(handle).second)
@@ -81,6 +76,18 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
   return DynamicLibrary(handle);
 }
 
+DynamicLibrary DynamicLibrary::addPermanentLibrary(void *handle,
+                                                   std::string *errMsg) {
+  SmartScopedLock<true> lock(*SymbolsMutex);
+  // Already-registered handle: set errMsg (if given), return DynamicLibrary().
+  if (!OpenedHandles->insert(handle).second) {
+    if (errMsg) *errMsg = "Library already loaded";
+    return DynamicLibrary();
+  }
+
+  return DynamicLibrary(handle);
+}
+
 void *DynamicLibrary::getAddressOfSymbol(const char *symbolName) {
   if (!isValid())
     return nullptr;
@@ -121,7 +128,7 @@ void* DynamicLibrary::SearchForAddressOfSymbol(const char *symbolName) {
 
 #if defined(HAVE_DLFCN_H) && defined(HAVE_DLOPEN)
   // Now search the libraries.
-  if (OpenedHandles) {
+  if (OpenedHandles.isConstructed()) {
     for (DenseSet<void *>::iterator I = OpenedHandles->begin(),
          E = OpenedHandles->end(); I != E; ++I) {
       //lt_ptr ptr = lt_dlsym(*I, symbolName);
diff --git a/lib/Support/FileOutputBuffer.cpp b/lib/Support/FileOutputBuffer.cpp
index 57e5a8d7871cd1887638bcb7d09ca7b5a09ba67e..731740d012d90887cce11f5ec2f914efb349c166 100644
--- a/lib/Support/FileOutputBuffer.cpp
+++ b/lib/Support/FileOutputBuffer.cpp
@@ -57,6 +57,8 @@ FileOutputBuffer::create(StringRef FilePath, size_t Size, unsigned Flags) {
         // FIXME: In posix, you use the access() call to check this.
       }
       break;
+    case sys::fs::file_type::directory_file:
+      return errc::is_a_directory;
     default:
       if (EC)
         return EC;
diff --git a/lib/Support/Host.cpp b/lib/Support/Host.cpp
index d1b40412a6fc64e73559b509698e48c0e965d671..ec51314fcbe1d3d47634db3ab584c4c2a813cad5 100644
--- a/lib/Support/Host.cpp
+++ b/lib/Support/Host.cpp
@@ -52,25 +52,218 @@
 
 using namespace llvm;
 
-#if defined(__linux__)
-static ssize_t LLVM_ATTRIBUTE_UNUSED readCpuInfo(void *Buf, size_t Size) {
-  // Note: We cannot mmap /proc/cpuinfo here and then process the resulting
-  // memory buffer because the 'file' has 0 size (it can be read from only
-  // as a stream).
-
-  int FD;
-  std::error_code EC = sys::fs::openFileForRead("/proc/cpuinfo", FD);
-  if (EC) {
-    DEBUG(dbgs() << "Unable to open /proc/cpuinfo: " << EC.message() << "\n");
-    return -1;
+static std::unique_ptr<llvm::MemoryBuffer>
+    LLVM_ATTRIBUTE_UNUSED getProcCpuinfoContent() {
+  llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
+      llvm::MemoryBuffer::getFileAsStream("/proc/cpuinfo"); // File reports size 0; stream it.
+  if (std::error_code EC = Text.getError()) {
+    llvm::errs() << "Can't read "
+                 << "/proc/cpuinfo: " << EC.message() << "\n";
+    return nullptr; // Not fatal: callers handle a null buffer themselves.
   }
-  int Ret = read(FD, Buf, Size);
-  int CloseStatus = close(FD);
-  if (CloseStatus)
-    return -1;
-  return Ret;
+  return std::move(*Text); // Transfer buffer ownership to the caller.
+}
+
+StringRef sys::detail::getHostCPUNameForPowerPC(
+    const StringRef &ProcCpuinfoContent) {
+  // Access to the Processor Version Register (PVR) on PowerPC is privileged,
+  // and so we must use an operating-system interface to determine the current
+  // processor type. On Linux, this is exposed through the /proc/cpuinfo file.
+  const char *generic = "generic";
+
+  // The cpu line is expected near the top (after the 'processor: 0' line),
+  // but the whole caller-provided content is scanned, not a fixed-size read.
+  StringRef::const_iterator CPUInfoStart = ProcCpuinfoContent.begin();
+  StringRef::const_iterator CPUInfoEnd = ProcCpuinfoContent.end();
+
+  StringRef::const_iterator CIP = CPUInfoStart;
+
+  StringRef::const_iterator CPUStart = 0; // Null until a cpu line is found.
+  size_t CPULen = 0;
+
+  // We need to find the first line which starts with cpu, spaces, and a colon.
+  // After the colon, there may be some additional spaces and then the cpu type.
+  while (CIP < CPUInfoEnd && CPUStart == 0) {
+    if (CIP < CPUInfoEnd && *CIP == '\n')
+      ++CIP;
+
+    if (CIP < CPUInfoEnd && *CIP == 'c') {
+      ++CIP;
+      if (CIP < CPUInfoEnd && *CIP == 'p') {
+        ++CIP;
+        if (CIP < CPUInfoEnd && *CIP == 'u') {
+          ++CIP;
+          while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t'))
+            ++CIP;
+
+          if (CIP < CPUInfoEnd && *CIP == ':') {
+            ++CIP;
+            while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t'))
+              ++CIP;
+
+            if (CIP < CPUInfoEnd) {
+              CPUStart = CIP;
+              while (CIP < CPUInfoEnd && (*CIP != ' ' && *CIP != '\t' &&
+                                          *CIP != ',' && *CIP != '\n'))
+                ++CIP;
+              CPULen = CIP - CPUStart;
+            }
+          }
+        }
+      }
+    }
+
+    if (CPUStart == 0)
+      while (CIP < CPUInfoEnd && *CIP != '\n')
+        ++CIP;
+  }
+
+  if (CPUStart == 0)
+    return generic;
+
+  return StringSwitch<const char *>(StringRef(CPUStart, CPULen))
+      .Case("604e", "604e")
+      .Case("604", "604")
+      .Case("7400", "7400")
+      .Case("7410", "7400")
+      .Case("7447", "7400")
+      .Case("7455", "7450")
+      .Case("G4", "g4")
+      .Case("POWER4", "970")
+      .Case("PPC970FX", "970")
+      .Case("PPC970MP", "970")
+      .Case("G5", "g5")
+      .Case("POWER5", "g5")
+      .Case("A2", "a2")
+      .Case("POWER6", "pwr6")
+      .Case("POWER7", "pwr7")
+      .Case("POWER8", "pwr8")
+      .Case("POWER8E", "pwr8")
+      .Case("POWER8NVL", "pwr8")
+      .Case("POWER9", "pwr9")
+      .Default(generic);
+}
+
+StringRef sys::detail::getHostCPUNameForARM(
+    const StringRef &ProcCpuinfoContent) {
+  // The cpuid register on arm is not accessible from user space. On Linux,
+  // it is exposed through the /proc/cpuinfo file.
+
+  // Split the whole content into lines; 32 is only the SmallVector's inline
+  // capacity, not a limit on how many lines are examined.
+  SmallVector<StringRef, 32> Lines;
+  ProcCpuinfoContent.split(Lines, "\n");
+
+  // Look for the CPU implementer and Hardware lines (the last match wins).
+  StringRef Implementer;
+  StringRef Hardware;
+  for (unsigned I = 0, E = Lines.size(); I != E; ++I) {
+    if (Lines[I].startswith("CPU implementer"))
+      Implementer = Lines[I].substr(15).ltrim("\t :");
+    if (Lines[I].startswith("Hardware"))
+      Hardware = Lines[I].substr(8).ltrim("\t :");
+  }
+
+  if (Implementer == "0x41") { // ARM Ltd.
+    // MSM8992/8994 may report the CPU part of whichever core the kernel runs on,
+    // which is nondeterministic and wrong, so always return cortex-a53 for them.
+    if (Hardware.endswith("MSM8994") || Hardware.endswith("MSM8996"))
+      return "cortex-a53"; // NOTE(review): comment says MSM8992, code MSM8996.
+
+
+    // Look for the CPU part line.
+    for (unsigned I = 0, E = Lines.size(); I != E; ++I)
+      if (Lines[I].startswith("CPU part"))
+        // The CPU part is a 3 digit hexadecimal number with a 0x prefix. The
+        // values correspond to the "Part number" in the CP15/c0 register. The
+        // contents are specified in the various processor manuals.
+        return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
+            .Case("0x926", "arm926ej-s")
+            .Case("0xb02", "mpcore")
+            .Case("0xb36", "arm1136j-s")
+            .Case("0xb56", "arm1156t2-s")
+            .Case("0xb76", "arm1176jz-s")
+            .Case("0xc08", "cortex-a8")
+            .Case("0xc09", "cortex-a9")
+            .Case("0xc0f", "cortex-a15")
+            .Case("0xc20", "cortex-m0")
+            .Case("0xc23", "cortex-m3")
+            .Case("0xc24", "cortex-m4")
+            .Case("0xd04", "cortex-a35")
+            .Case("0xd03", "cortex-a53")
+            .Case("0xd07", "cortex-a57")
+            .Case("0xd08", "cortex-a72")
+            .Case("0xd09", "cortex-a73")
+            .Default("generic");
+  }
+
+  if (Implementer == "0x51") // Qualcomm Technologies, Inc.
+    // Look for the CPU part line.
+    for (unsigned I = 0, E = Lines.size(); I != E; ++I)
+      if (Lines[I].startswith("CPU part"))
+        // The CPU part is a 3 digit hexadecimal number with a 0x prefix. The
+        // values correspond to the "Part number" in the CP15/c0 register. The
+        // contents are specified in the various processor manuals.
+        return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
+            .Case("0x06f", "krait") // APQ8064
+            .Case("0x201", "kryo")
+            .Case("0x205", "kryo")
+            .Default("generic");
+
+  return "generic";
+}
+
+StringRef sys::detail::getHostCPUNameForS390x(
+    const StringRef &ProcCpuinfoContent) {
+  // STIDP is a privileged operation, so use /proc/cpuinfo instead.
+
+  // The "processor 0:" line comes after a fair amount of other information,
+  // including a cache breakdown, so the whole content is split into lines.
+  SmallVector<StringRef, 32> Lines;
+  ProcCpuinfoContent.split(Lines, "\n");
+
+  // Look for the CPU features.
+  SmallVector<StringRef, 32> CPUFeatures;
+  for (unsigned I = 0, E = Lines.size(); I != E; ++I)
+    if (Lines[I].startswith("features")) {
+      size_t Pos = Lines[I].find(":");
+      if (Pos != StringRef::npos) {
+        Lines[I].drop_front(Pos + 1).split(CPUFeatures, ' ');
+        break;
+      }
+    }
+
+  // We need to check for the presence of vector support independently of
+  // the machine type, since we may only use the vector register set when
+  // supported by the kernel (and hypervisor).
+  bool HaveVectorSupport = false;
+  for (unsigned I = 0, E = CPUFeatures.size(); I != E; ++I) {
+    if (CPUFeatures[I] == "vx")
+      HaveVectorSupport = true;
+  }
+
+  // Now check the processor machine type.
+  for (unsigned I = 0, E = Lines.size(); I != E; ++I) {
+    if (Lines[I].startswith("processor ")) {
+      size_t Pos = Lines[I].find("machine = ");
+      if (Pos != StringRef::npos) {
+        Pos += sizeof("machine = ") - 1;
+        unsigned int Id;
+        if (!Lines[I].drop_front(Pos).getAsInteger(10, Id)) {
+          if (Id >= 2964 && HaveVectorSupport)
+            return "z13";
+          if (Id >= 2827)
+            return "zEC12";
+          if (Id >= 2817)
+            return "z196";
+        }
+      }
+      break;
+    }
+  }
+
+  return "generic";
 }
-#endif
 
 #if defined(__i386__) || defined(_M_IX86) || \
     defined(__x86_64__) || defined(_M_X64)
@@ -1020,201 +1213,21 @@ StringRef sys::getHostCPUName() {
 }
 #elif defined(__linux__) && (defined(__ppc__) || defined(__powerpc__))
 StringRef sys::getHostCPUName() {
-  // Access to the Processor Version Register (PVR) on PowerPC is privileged,
-  // and so we must use an operating-system interface to determine the current
-  // processor type. On Linux, this is exposed through the /proc/cpuinfo file.
-  const char *generic = "generic";
-
-  // The cpu line is second (after the 'processor: 0' line), so if this
-  // buffer is too small then something has changed (or is wrong).
-  char buffer[1024];
-  ssize_t CPUInfoSize = readCpuInfo(buffer, sizeof(buffer));
-  if (CPUInfoSize == -1)
-    return generic;
-
-  const char *CPUInfoStart = buffer;
-  const char *CPUInfoEnd = buffer + CPUInfoSize;
-
-  const char *CIP = CPUInfoStart;
-
-  const char *CPUStart = 0;
-  size_t CPULen = 0;
-
-  // We need to find the first line which starts with cpu, spaces, and a colon.
-  // After the colon, there may be some additional spaces and then the cpu type.
-  while (CIP < CPUInfoEnd && CPUStart == 0) {
-    if (CIP < CPUInfoEnd && *CIP == '\n')
-      ++CIP;
-
-    if (CIP < CPUInfoEnd && *CIP == 'c') {
-      ++CIP;
-      if (CIP < CPUInfoEnd && *CIP == 'p') {
-        ++CIP;
-        if (CIP < CPUInfoEnd && *CIP == 'u') {
-          ++CIP;
-          while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t'))
-            ++CIP;
-
-          if (CIP < CPUInfoEnd && *CIP == ':') {
-            ++CIP;
-            while (CIP < CPUInfoEnd && (*CIP == ' ' || *CIP == '\t'))
-              ++CIP;
-
-            if (CIP < CPUInfoEnd) {
-              CPUStart = CIP;
-              while (CIP < CPUInfoEnd && (*CIP != ' ' && *CIP != '\t' &&
-                                          *CIP != ',' && *CIP != '\n'))
-                ++CIP;
-              CPULen = CIP - CPUStart;
-            }
-          }
-        }
-      }
-    }
-
-    if (CPUStart == 0)
-      while (CIP < CPUInfoEnd && *CIP != '\n')
-        ++CIP;
-  }
-
-  if (CPUStart == 0)
-    return generic;
-
-  return StringSwitch<const char *>(StringRef(CPUStart, CPULen))
-      .Case("604e", "604e")
-      .Case("604", "604")
-      .Case("7400", "7400")
-      .Case("7410", "7400")
-      .Case("7447", "7400")
-      .Case("7455", "7450")
-      .Case("G4", "g4")
-      .Case("POWER4", "970")
-      .Case("PPC970FX", "970")
-      .Case("PPC970MP", "970")
-      .Case("G5", "g5")
-      .Case("POWER5", "g5")
-      .Case("A2", "a2")
-      .Case("POWER6", "pwr6")
-      .Case("POWER7", "pwr7")
-      .Case("POWER8", "pwr8")
-      .Case("POWER8E", "pwr8")
-      .Case("POWER8NVL", "pwr8")
-      .Case("POWER9", "pwr9")
-      .Default(generic);
+  std::unique_ptr<llvm::MemoryBuffer> P = getProcCpuinfoContent();
+  const StringRef& Content = P ? P->getBuffer() : "";
+  return detail::getHostCPUNameForPowerPC(Content);
 }
-#elif defined(__linux__) && defined(__arm__)
+#elif defined(__linux__) && (defined(__arm__) || defined(__aarch64__))
 StringRef sys::getHostCPUName() {
-  // The cpuid register on arm is not accessible from user space. On Linux,
-  // it is exposed through the /proc/cpuinfo file.
-
-  // Read 1024 bytes from /proc/cpuinfo, which should contain the CPU part line
-  // in all cases.
-  char buffer[1024];
-  ssize_t CPUInfoSize = readCpuInfo(buffer, sizeof(buffer));
-  if (CPUInfoSize == -1)
-    return "generic";
-
-  StringRef Str(buffer, CPUInfoSize);
-
-  SmallVector<StringRef, 32> Lines;
-  Str.split(Lines, "\n");
-
-  // Look for the CPU implementer line.
-  StringRef Implementer;
-  for (unsigned I = 0, E = Lines.size(); I != E; ++I)
-    if (Lines[I].startswith("CPU implementer"))
-      Implementer = Lines[I].substr(15).ltrim("\t :");
-
-  if (Implementer == "0x41") // ARM Ltd.
-    // Look for the CPU part line.
-    for (unsigned I = 0, E = Lines.size(); I != E; ++I)
-      if (Lines[I].startswith("CPU part"))
-        // The CPU part is a 3 digit hexadecimal number with a 0x prefix. The
-        // values correspond to the "Part number" in the CP15/c0 register. The
-        // contents are specified in the various processor manuals.
-        return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
-            .Case("0x926", "arm926ej-s")
-            .Case("0xb02", "mpcore")
-            .Case("0xb36", "arm1136j-s")
-            .Case("0xb56", "arm1156t2-s")
-            .Case("0xb76", "arm1176jz-s")
-            .Case("0xc08", "cortex-a8")
-            .Case("0xc09", "cortex-a9")
-            .Case("0xc0f", "cortex-a15")
-            .Case("0xc20", "cortex-m0")
-            .Case("0xc23", "cortex-m3")
-            .Case("0xc24", "cortex-m4")
-            .Default("generic");
-
-  if (Implementer == "0x51") // Qualcomm Technologies, Inc.
-    // Look for the CPU part line.
-    for (unsigned I = 0, E = Lines.size(); I != E; ++I)
-      if (Lines[I].startswith("CPU part"))
-        // The CPU part is a 3 digit hexadecimal number with a 0x prefix. The
-        // values correspond to the "Part number" in the CP15/c0 register. The
-        // contents are specified in the various processor manuals.
-        return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
-            .Case("0x06f", "krait") // APQ8064
-            .Default("generic");
-
-  return "generic";
+  std::unique_ptr<llvm::MemoryBuffer> P = getProcCpuinfoContent();
+  const StringRef& Content = P ? P->getBuffer() : "";
+  return detail::getHostCPUNameForARM(Content);
 }
 #elif defined(__linux__) && defined(__s390x__)
 StringRef sys::getHostCPUName() {
-  // STIDP is a privileged operation, so use /proc/cpuinfo instead.
-
-  // The "processor 0:" line comes after a fair amount of other information,
-  // including a cache breakdown, but this should be plenty.
-  char buffer[2048];
-  ssize_t CPUInfoSize = readCpuInfo(buffer, sizeof(buffer));
-  if (CPUInfoSize == -1)
-    return "generic";
-
-  StringRef Str(buffer, CPUInfoSize);
-  SmallVector<StringRef, 32> Lines;
-  Str.split(Lines, "\n");
-
-  // Look for the CPU features.
-  SmallVector<StringRef, 32> CPUFeatures;
-  for (unsigned I = 0, E = Lines.size(); I != E; ++I)
-    if (Lines[I].startswith("features")) {
-      size_t Pos = Lines[I].find(":");
-      if (Pos != StringRef::npos) {
-        Lines[I].drop_front(Pos + 1).split(CPUFeatures, ' ');
-        break;
-      }
-    }
-
-  // We need to check for the presence of vector support independently of
-  // the machine type, since we may only use the vector register set when
-  // supported by the kernel (and hypervisor).
-  bool HaveVectorSupport = false;
-  for (unsigned I = 0, E = CPUFeatures.size(); I != E; ++I) {
-    if (CPUFeatures[I] == "vx")
-      HaveVectorSupport = true;
-  }
-
-  // Now check the processor machine type.
-  for (unsigned I = 0, E = Lines.size(); I != E; ++I) {
-    if (Lines[I].startswith("processor ")) {
-      size_t Pos = Lines[I].find("machine = ");
-      if (Pos != StringRef::npos) {
-        Pos += sizeof("machine = ") - 1;
-        unsigned int Id;
-        if (!Lines[I].drop_front(Pos).getAsInteger(10, Id)) {
-          if (Id >= 2964 && HaveVectorSupport)
-            return "z13";
-          if (Id >= 2827)
-            return "zEC12";
-          if (Id >= 2817)
-            return "z196";
-        }
-      }
-      break;
-    }
-  }
-
-  return "generic";
+  std::unique_ptr<llvm::MemoryBuffer> P = getProcCpuinfoContent();
+  const StringRef& Content = P ? P->getBuffer() : "";
+  return detail::getHostCPUNameForS390x(Content);
 }
 #else
 StringRef sys::getHostCPUName() { return "generic"; }
@@ -1353,6 +1366,10 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
   Features["tbm"] = HasExtLeaf1 && ((ECX >> 21) & 1);
   Features["mwaitx"] = HasExtLeaf1 && ((ECX >> 29) & 1);
 
+  bool HasExtLeaf8 = MaxExtLevel >= 0x80000008 &&
+                     !getX86CpuIDAndInfoEx(0x80000008,0x0, &EAX, &EBX, &ECX, &EDX);
+  Features["clzero"] = HasExtLeaf8 && ((EBX >> 0) & 1);
+
   bool HasLeaf7 =
       MaxLevel >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x0, &EAX, &EBX, &ECX, &EDX);
 
@@ -1362,14 +1379,10 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
   Features["fsgsbase"] = HasLeaf7 && ((EBX >> 0) & 1);
   Features["sgx"] = HasLeaf7 && ((EBX >> 2) & 1);
   Features["bmi"] = HasLeaf7 && ((EBX >> 3) & 1);
-  Features["hle"] = HasLeaf7 && ((EBX >> 4) & 1);
   Features["bmi2"] = HasLeaf7 && ((EBX >> 8) & 1);
-  Features["invpcid"] = HasLeaf7 && ((EBX >> 10) & 1);
   Features["rtm"] = HasLeaf7 && ((EBX >> 11) & 1);
   Features["rdseed"] = HasLeaf7 && ((EBX >> 18) & 1);
   Features["adx"] = HasLeaf7 && ((EBX >> 19) & 1);
-  Features["smap"] = HasLeaf7 && ((EBX >> 20) & 1);
-  Features["pcommit"] = HasLeaf7 && ((EBX >> 22) & 1);
   Features["clflushopt"] = HasLeaf7 && ((EBX >> 23) & 1);
   Features["clwb"] = HasLeaf7 && ((EBX >> 24) & 1);
   Features["sha"] = HasLeaf7 && ((EBX >> 29) & 1);
@@ -1401,17 +1414,12 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
 }
 #elif defined(__linux__) && (defined(__arm__) || defined(__aarch64__))
 bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
-  // Read 1024 bytes from /proc/cpuinfo, which should contain the Features line
-  // in all cases.
-  char buffer[1024];
-  ssize_t CPUInfoSize = readCpuInfo(buffer, sizeof(buffer));
-  if (CPUInfoSize == -1)
+  std::unique_ptr<llvm::MemoryBuffer> P = getProcCpuinfoContent();
+  if (!P)
     return false;
 
-  StringRef Str(buffer, CPUInfoSize);
-
   SmallVector<StringRef, 32> Lines;
-  Str.split(Lines, "\n");
+  P->getBuffer().split(Lines, "\n");
 
   SmallVector<StringRef, 32> CPUFeatures;
 
diff --git a/lib/Support/LockFileManager.cpp b/lib/Support/LockFileManager.cpp
index 444aaa37c8c8360c08c0cdf44af06c794b2a2052..8be9879fbc2436d1f8962c1e91a76ebf7c2f2f20 100644
--- a/lib/Support/LockFileManager.cpp
+++ b/lib/Support/LockFileManager.cpp
@@ -304,9 +304,9 @@ LockFileManager::WaitForUnlockResult LockFileManager::waitForUnlock() {
   Interval.tv_sec = 0;
   Interval.tv_nsec = 1000000;
 #endif
-  // Don't wait more than five minutes per iteration. Total timeout for the file
-  // to appear is ~8.5 mins.
-  const unsigned MaxSeconds = 5*60;
+  // Don't wait more than 40s per iteration. Total timeout for the file
+  // to appear is ~1.5 minutes.
+  const unsigned MaxSeconds = 40;
   do {
     // Sleep for the designated interval, to allow the owning process time to
     // finish up and remove the lock file.
diff --git a/lib/Support/LowLevelType.cpp b/lib/Support/LowLevelType.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4290d69cd197d0645688b91b51be7e1b56f05527
--- /dev/null
+++ b/lib/Support/LowLevelType.cpp
@@ -0,0 +1,47 @@
+//===-- llvm/Support/LowLevelType.cpp -------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file implements the more header-heavy bits of the LLT class to
+/// avoid polluting users' namespaces.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/LowLevelTypeImpl.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+LLT::LLT(MVT VT) {
+  if (VT.isVector()) {
+    SizeInBits = VT.getVectorElementType().getSizeInBits(); // Per-element size.
+    ElementsOrAddrSpace = VT.getVectorNumElements();
+    Kind = ElementsOrAddrSpace == 1 ? Scalar : Vector; // <1 x T> is a scalar.
+  } else if (VT.isValid()) {
+    // Aggregates are no different from real scalars as far as GlobalISel is
+    // concerned.
+    Kind = Scalar;
+    SizeInBits = VT.getSizeInBits();
+    ElementsOrAddrSpace = 1;
+    assert(SizeInBits != 0 && "invalid zero-sized type");
+  } else {
+    Kind = Invalid;
+    SizeInBits = ElementsOrAddrSpace = 0;
+  }
+}
+
+void LLT::print(raw_ostream &OS) const {
+  if (isVector()) // e.g. <4 x s32>
+    OS << "<" << ElementsOrAddrSpace << " x s" << SizeInBits << ">";
+  else if (isPointer())
+    OS << "p" << getAddressSpace(); // e.g. p0
+  else if (isValid()) {
+    assert(isScalar() && "unexpected type");
+    OS << "s" << getScalarSizeInBits(); // e.g. s32
+  } else
+    llvm_unreachable("trying to print an invalid type");
+}
diff --git a/lib/Support/MD5.cpp b/lib/Support/MD5.cpp
index 942571eab0f3d9ea2d8ee1f29136b1858a73b4c3..bdbf1d677938350bfaed5e0fed74c7da94eafb6a 100644
--- a/lib/Support/MD5.cpp
+++ b/lib/Support/MD5.cpp
@@ -38,9 +38,13 @@
  */
 
 #include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/MD5.h"
 #include "llvm/Support/raw_ostream.h"
+#include <array>
+#include <cstdint>
 #include <cstring>
 
 // The basic MD5 functions.
@@ -68,7 +72,7 @@
        ((MD5_u32plus) ptr[(n) * 4 + 3] << 24))
 #define GET(n) (block[(n)])
 
-namespace llvm {
+using namespace llvm;
 
 /// \brief This processes one or more 64-byte data blocks, but does NOT update
 ///the bit counters.  There are no alignment requirements.
@@ -179,9 +183,7 @@ const uint8_t *MD5::body(ArrayRef<uint8_t> Data) {
   return ptr;
 }
 
-MD5::MD5()
-    : a(0x67452301), b(0xefcdab89), c(0x98badcfe), d(0x10325476), hi(0), lo(0) {
-}
+MD5::MD5() = default;
 
 /// Incrementally add the bytes in \p Data to the hash.
 void MD5::update(ArrayRef<uint8_t> Data) {
@@ -259,10 +261,16 @@ void MD5::final(MD5Result &Result) {
   support::endian::write32le(&Result[12], d);
 }
 
-void MD5::stringifyResult(MD5Result &Result, SmallString<32> &Str) {
+SmallString<32> MD5::MD5Result::digest() const {
+  SmallString<32> Str; // 16 bytes at two hex characters each.
   raw_svector_ostream Res(Str);
   for (int i = 0; i < 16; ++i)
-    Res << format("%.2x", Result[i]);
+    Res << format("%.2x", Bytes[i]); // Lowercase hex, zero-padded to width 2.
+  return Str;
+}
+
+void MD5::stringifyResult(MD5Result &Result, SmallString<32> &Str) {
+  Str = Result.digest(); // Thin wrapper: formatting lives in digest().
 }
 
 std::array<uint8_t, 16> MD5::hash(ArrayRef<uint8_t> Data) {
@@ -271,8 +279,5 @@ std::array<uint8_t, 16> MD5::hash(ArrayRef<uint8_t> Data) {
   MD5::MD5Result Res;
   Hash.final(Res);
 
-  std::array<uint8_t, 16> Arr;
-  memcpy(Arr.data(), Res, sizeof(Res));
-  return Arr;
-}
+  return Res;
 }
diff --git a/lib/Support/ManagedStatic.cpp b/lib/Support/ManagedStatic.cpp
index 7dd31315f90daaca8c6903f45a16c1a2e635b4a4..fb7cd070c42de56b83709da922c96cd64dc5cfb2 100644
--- a/lib/Support/ManagedStatic.cpp
+++ b/lib/Support/ManagedStatic.cpp
@@ -21,7 +21,7 @@ using namespace llvm;
 
 static const ManagedStaticBase *StaticList = nullptr;
 static sys::Mutex *ManagedStaticMutex = nullptr;
-LLVM_DEFINE_ONCE_FLAG(mutex_init_flag);
+static llvm::once_flag mutex_init_flag;
 
 static void initializeMutex() {
   ManagedStaticMutex = new sys::Mutex();
diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp
index a3a18c9283ce951297720a53924c4e853c802bca..227e792d83dc477a97bfdb396b030c8712be7fb8 100644
--- a/lib/Support/MemoryBuffer.cpp
+++ b/lib/Support/MemoryBuffer.cpp
@@ -103,7 +103,7 @@ public:
 
 static ErrorOr<std::unique_ptr<MemoryBuffer>>
 getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize, 
-           uint64_t Offset, bool RequiresNullTerminator, bool IsVolatileSize);
+           uint64_t Offset, bool RequiresNullTerminator, bool IsVolatile);
 
 std::unique_ptr<MemoryBuffer>
 MemoryBuffer::getMemBuffer(StringRef InputData, StringRef BufferName,
@@ -178,8 +178,8 @@ MemoryBuffer::getFileOrSTDIN(const Twine &Filename, int64_t FileSize,
 
 ErrorOr<std::unique_ptr<MemoryBuffer>>
 MemoryBuffer::getFileSlice(const Twine &FilePath, uint64_t MapSize, 
-                           uint64_t Offset) {
-  return getFileAux(FilePath, -1, MapSize, Offset, false, false);
+                           uint64_t Offset, bool IsVolatile) {
+  return getFileAux(FilePath, -1, MapSize, Offset, false, IsVolatile);
 }
 
 
@@ -254,19 +254,19 @@ getMemoryBufferForStream(int FD, const Twine &BufferName) {
 
 ErrorOr<std::unique_ptr<MemoryBuffer>>
 MemoryBuffer::getFile(const Twine &Filename, int64_t FileSize,
-                      bool RequiresNullTerminator, bool IsVolatileSize) {
+                      bool RequiresNullTerminator, bool IsVolatile) {
   return getFileAux(Filename, FileSize, FileSize, 0,
-                    RequiresNullTerminator, IsVolatileSize);
+                    RequiresNullTerminator, IsVolatile);
 }
 
 static ErrorOr<std::unique_ptr<MemoryBuffer>>
 getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
                 uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator,
-                bool IsVolatileSize);
+                bool IsVolatile);
 
 static ErrorOr<std::unique_ptr<MemoryBuffer>>
 getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize,
-           uint64_t Offset, bool RequiresNullTerminator, bool IsVolatileSize) {
+           uint64_t Offset, bool RequiresNullTerminator, bool IsVolatile) {
   int FD;
   std::error_code EC = sys::fs::openFileForRead(Filename, FD);
   if (EC)
@@ -274,7 +274,7 @@ getFileAux(const Twine &Filename, int64_t FileSize, uint64_t MapSize,
 
   ErrorOr<std::unique_ptr<MemoryBuffer>> Ret =
       getOpenFileImpl(FD, Filename, FileSize, MapSize, Offset,
-                      RequiresNullTerminator, IsVolatileSize);
+                      RequiresNullTerminator, IsVolatile);
   close(FD);
   return Ret;
 }
@@ -285,11 +285,11 @@ static bool shouldUseMmap(int FD,
                           off_t Offset,
                           bool RequiresNullTerminator,
                           int PageSize,
-                          bool IsVolatileSize) {
+                          bool IsVolatile) {
   // mmap may leave the buffer without null terminator if the file size changed
   // by the time the last page is mapped in, so avoid it if the file size is
   // likely to change.
-  if (IsVolatileSize)
+  if (IsVolatile)
     return false;
 
   // We don't use mmap for small files because this can severely fragment our
@@ -300,7 +300,6 @@ static bool shouldUseMmap(int FD,
   if (!RequiresNullTerminator)
     return true;
 
-
   // If we don't know the file size, use fstat to find out.  fstat on an open
   // file descriptor is cheaper than stat on a random path.
   // FIXME: this chunk of code is duplicated, but it avoids a fstat when
@@ -338,7 +337,7 @@ static bool shouldUseMmap(int FD,
 static ErrorOr<std::unique_ptr<MemoryBuffer>>
 getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
                 uint64_t MapSize, int64_t Offset, bool RequiresNullTerminator,
-                bool IsVolatileSize) {
+                bool IsVolatile) {
   static int PageSize = sys::Process::getPageSize();
 
   // Default is to map the full file.
@@ -365,7 +364,7 @@ getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
   }
 
   if (shouldUseMmap(FD, FileSize, MapSize, Offset, RequiresNullTerminator,
-                    PageSize, IsVolatileSize)) {
+                    PageSize, IsVolatile)) {
     std::error_code EC;
     std::unique_ptr<MemoryBuffer> Result(
         new (NamedBufferAlloc(Filename))
@@ -415,17 +414,16 @@ getOpenFileImpl(int FD, const Twine &Filename, uint64_t FileSize,
 
 ErrorOr<std::unique_ptr<MemoryBuffer>>
 MemoryBuffer::getOpenFile(int FD, const Twine &Filename, uint64_t FileSize,
-                          bool RequiresNullTerminator, bool IsVolatileSize) {
+                          bool RequiresNullTerminator, bool IsVolatile) {
   return getOpenFileImpl(FD, Filename, FileSize, FileSize, 0,
-                         RequiresNullTerminator, IsVolatileSize);
+                         RequiresNullTerminator, IsVolatile);
 }
 
 ErrorOr<std::unique_ptr<MemoryBuffer>>
 MemoryBuffer::getOpenFileSlice(int FD, const Twine &Filename, uint64_t MapSize,
-                               int64_t Offset) {
+                               int64_t Offset, bool IsVolatile) {
   assert(MapSize != uint64_t(-1));
-  return getOpenFileImpl(FD, Filename, -1, MapSize, Offset, false,
-                         /*IsVolatileSize*/ false);
+  return getOpenFileImpl(FD, Filename, -1, MapSize, Offset, false, IsVolatile);
 }
 
 ErrorOr<std::unique_ptr<MemoryBuffer>> MemoryBuffer::getSTDIN() {
diff --git a/lib/Support/Path.cpp b/lib/Support/Path.cpp
index 4bb035eeccca817b9e3b78a2a128adf52ca38578..9fd6652ce4b8c5fbddca3ee3bdb1cf111ec198f9 100644
--- a/lib/Support/Path.cpp
+++ b/lib/Support/Path.cpp
@@ -11,13 +11,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Support/Path.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/Support/COFF.h"
-#include "llvm/Support/MachO.h"
 #include "llvm/Support/Endian.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
-#include "llvm/Support/Path.h"
+#include "llvm/Support/MachO.h"
 #include "llvm/Support/Process.h"
 #include <cctype>
 #include <cstring>
@@ -34,16 +35,29 @@ using namespace llvm::support::endian;
 namespace {
   using llvm::StringRef;
   using llvm::sys::path::is_separator;
+  using llvm::sys::path::Style;
 
+  inline Style real_style(Style style) {
 #ifdef LLVM_ON_WIN32
-  const char *separators = "\\/";
-  const char preferred_separator = '\\';
+    return (style == Style::posix) ? Style::posix : Style::windows;
 #else
-  const char  separators = '/';
-  const char preferred_separator = '/';
+    return (style == Style::windows) ? Style::windows : Style::posix;
 #endif
+  }
 
-  StringRef find_first_component(StringRef path) {
+  inline const char *separators(Style style) {
+    if (real_style(style) == Style::windows)
+      return "\\/";
+    return "/";
+  }
+
+  inline char preferred_separator(Style style) {
+    if (real_style(style) == Style::windows)
+      return '\\';
+    return '/';
+  }
+
+  StringRef find_first_component(StringRef path, Style style) {
     // Look for this first component in the following order.
     // * empty (in this case we return an empty string)
     // * either C: or {//,\\}net.
@@ -53,96 +67,85 @@ namespace {
     if (path.empty())
       return path;
 
-#ifdef LLVM_ON_WIN32
-    // C:
-    if (path.size() >= 2 && std::isalpha(static_cast<unsigned char>(path[0])) &&
-        path[1] == ':')
-      return path.substr(0, 2);
-#endif
+    if (real_style(style) == Style::windows) {
+      // C:
+      if (path.size() >= 2 &&
+          std::isalpha(static_cast<unsigned char>(path[0])) && path[1] == ':')
+        return path.substr(0, 2);
+    }
 
     // //net
-    if ((path.size() > 2) &&
-        is_separator(path[0]) &&
-        path[0] == path[1] &&
-        !is_separator(path[2])) {
+    if ((path.size() > 2) && is_separator(path[0], style) &&
+        path[0] == path[1] && !is_separator(path[2], style)) {
       // Find the next directory separator.
-      size_t end = path.find_first_of(separators, 2);
+      size_t end = path.find_first_of(separators(style), 2);
       return path.substr(0, end);
     }
 
     // {/,\}
-    if (is_separator(path[0]))
+    if (is_separator(path[0], style))
       return path.substr(0, 1);
 
     // * {file,directory}name
-    size_t end = path.find_first_of(separators);
+    size_t end = path.find_first_of(separators(style));
     return path.substr(0, end);
   }
 
-  size_t filename_pos(StringRef str) {
-    if (str.size() == 2 &&
-        is_separator(str[0]) &&
-        str[0] == str[1])
+  size_t filename_pos(StringRef str, Style style) {
+    if (str.size() == 2 && is_separator(str[0], style) && str[0] == str[1])
       return 0;
 
-    if (str.size() > 0 && is_separator(str[str.size() - 1]))
+    if (str.size() > 0 && is_separator(str[str.size() - 1], style))
       return str.size() - 1;
 
-    size_t pos = str.find_last_of(separators, str.size() - 1);
+    size_t pos = str.find_last_of(separators(style), str.size() - 1);
 
-#ifdef LLVM_ON_WIN32
-    if (pos == StringRef::npos)
-      pos = str.find_last_of(':', str.size() - 2);
-#endif
+    if (real_style(style) == Style::windows) {
+      if (pos == StringRef::npos)
+        pos = str.find_last_of(':', str.size() - 2);
+    }
 
-    if (pos == StringRef::npos ||
-        (pos == 1 && is_separator(str[0])))
+    if (pos == StringRef::npos || (pos == 1 && is_separator(str[0], style)))
       return 0;
 
     return pos + 1;
   }
 
-  size_t root_dir_start(StringRef str) {
+  size_t root_dir_start(StringRef str, Style style) {
     // case "c:/"
-#ifdef LLVM_ON_WIN32
-    if (str.size() > 2 &&
-        str[1] == ':' &&
-        is_separator(str[2]))
-      return 2;
-#endif
+    if (real_style(style) == Style::windows) {
+      if (str.size() > 2 && str[1] == ':' && is_separator(str[2], style))
+        return 2;
+    }
 
     // case "//"
-    if (str.size() == 2 &&
-        is_separator(str[0]) &&
-        str[0] == str[1])
+    if (str.size() == 2 && is_separator(str[0], style) && str[0] == str[1])
       return StringRef::npos;
 
     // case "//net"
-    if (str.size() > 3 &&
-        is_separator(str[0]) &&
-        str[0] == str[1] &&
-        !is_separator(str[2])) {
-      return str.find_first_of(separators, 2);
+    if (str.size() > 3 && is_separator(str[0], style) && str[0] == str[1] &&
+        !is_separator(str[2], style)) {
+      return str.find_first_of(separators(style), 2);
     }
 
     // case "/"
-    if (str.size() > 0 && is_separator(str[0]))
+    if (str.size() > 0 && is_separator(str[0], style))
       return 0;
 
     return StringRef::npos;
   }
 
-  size_t parent_path_end(StringRef path) {
-    size_t end_pos = filename_pos(path);
+  size_t parent_path_end(StringRef path, Style style) {
+    size_t end_pos = filename_pos(path, style);
 
-    bool filename_was_sep = path.size() > 0 && is_separator(path[end_pos]);
+    bool filename_was_sep =
+        path.size() > 0 && is_separator(path[end_pos], style);
 
     // Skip separators except for root dir.
-    size_t root_dir_pos = root_dir_start(path.substr(0, end_pos));
+    size_t root_dir_pos = root_dir_start(path.substr(0, end_pos), style);
 
-    while(end_pos > 0 &&
-          (end_pos - 1) != root_dir_pos &&
-          is_separator(path[end_pos - 1]))
+    while (end_pos > 0 && (end_pos - 1) != root_dir_pos &&
+           is_separator(path[end_pos - 1], style))
       --end_pos;
 
     if (end_pos == 1 && root_dir_pos == 0 && filename_was_sep)
@@ -230,11 +233,12 @@ namespace llvm {
 namespace sys  {
 namespace path {
 
-const_iterator begin(StringRef path) {
+const_iterator begin(StringRef path, Style style) {
   const_iterator i;
   i.Path      = path;
-  i.Component = find_first_component(path);
+  i.Component = find_first_component(path, style);
   i.Position  = 0;
+  i.S = style;
   return i;
 }
 
@@ -259,27 +263,21 @@ const_iterator &const_iterator::operator++() {
 
   // Both POSIX and Windows treat paths that begin with exactly two separators
   // specially.
-  bool was_net = Component.size() > 2 &&
-    is_separator(Component[0]) &&
-    Component[1] == Component[0] &&
-    !is_separator(Component[2]);
+  bool was_net = Component.size() > 2 && is_separator(Component[0], S) &&
+                 Component[1] == Component[0] && !is_separator(Component[2], S);
 
   // Handle separators.
-  if (is_separator(Path[Position])) {
+  if (is_separator(Path[Position], S)) {
     // Root dir.
-    if (was_net
-#ifdef LLVM_ON_WIN32
+    if (was_net ||
         // c:/
-        || Component.endswith(":")
-#endif
-        ) {
+        (real_style(S) == Style::windows && Component.endswith(":"))) {
       Component = Path.substr(Position, 1);
       return *this;
     }
 
     // Skip extra separators.
-    while (Position != Path.size() &&
-           is_separator(Path[Position])) {
+    while (Position != Path.size() && is_separator(Path[Position], S)) {
       ++Position;
     }
 
@@ -292,7 +290,7 @@ const_iterator &const_iterator::operator++() {
   }
 
   // Find next component.
-  size_t end_pos = Path.find_first_of(separators, Position);
+  size_t end_pos = Path.find_first_of(separators(S), Position);
   Component = Path.slice(Position, end_pos);
 
   return *this;
@@ -306,10 +304,11 @@ ptrdiff_t const_iterator::operator-(const const_iterator &RHS) const {
   return Position - RHS.Position;
 }
 
-reverse_iterator rbegin(StringRef Path) {
+reverse_iterator rbegin(StringRef Path, Style style) {
   reverse_iterator I;
   I.Path = Path;
   I.Position = Path.size();
+  I.S = style;
   return ++I;
 }
 
@@ -324,10 +323,9 @@ reverse_iterator rend(StringRef Path) {
 reverse_iterator &reverse_iterator::operator++() {
   // If we're at the end and the previous char was a '/', return '.' unless
   // we are the root path.
-  size_t root_dir_pos = root_dir_start(Path);
-  if (Position == Path.size() &&
-      Path.size() > root_dir_pos + 1 &&
-      is_separator(Path[Position - 1])) {
+  size_t root_dir_pos = root_dir_start(Path, S);
+  if (Position == Path.size() && Path.size() > root_dir_pos + 1 &&
+      is_separator(Path[Position - 1], S)) {
     --Position;
     Component = ".";
     return *this;
@@ -336,13 +334,12 @@ reverse_iterator &reverse_iterator::operator++() {
   // Skip separators unless it's the root directory.
   size_t end_pos = Position;
 
-  while(end_pos > 0 &&
-        (end_pos - 1) != root_dir_pos &&
-        is_separator(Path[end_pos - 1]))
+  while (end_pos > 0 && (end_pos - 1) != root_dir_pos &&
+         is_separator(Path[end_pos - 1], S))
     --end_pos;
 
   // Find next separator.
-  size_t start_pos = filename_pos(Path.substr(0, end_pos));
+  size_t start_pos = filename_pos(Path.substr(0, end_pos), S);
   Component = Path.slice(start_pos, end_pos);
   Position = start_pos;
   return *this;
@@ -357,21 +354,15 @@ ptrdiff_t reverse_iterator::operator-(const reverse_iterator &RHS) const {
   return Position - RHS.Position;
 }
 
-StringRef root_path(StringRef path) {
-  const_iterator b = begin(path),
-                 pos = b,
-                 e = end(path);
+StringRef root_path(StringRef path, Style style) {
+  const_iterator b = begin(path, style), pos = b, e = end(path);
   if (b != e) {
-    bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
-    bool has_drive =
-#ifdef LLVM_ON_WIN32
-      b->endswith(":");
-#else
-      false;
-#endif
+    bool has_net =
+        b->size() > 2 && is_separator((*b)[0], style) && (*b)[1] == (*b)[0];
+    bool has_drive = (real_style(style) == Style::windows) && b->endswith(":");
 
     if (has_net || has_drive) {
-      if ((++pos != e) && is_separator((*pos)[0])) {
+      if ((++pos != e) && is_separator((*pos)[0], style)) {
         // {C:/,//net/}, so get the first two components.
         return path.substr(0, b->size() + pos->size());
       } else {
@@ -381,7 +372,7 @@ StringRef root_path(StringRef path) {
     }
 
     // POSIX style root directory.
-    if (is_separator((*b)[0])) {
+    if (is_separator((*b)[0], style)) {
       return *b;
     }
   }
@@ -389,17 +380,12 @@ StringRef root_path(StringRef path) {
   return StringRef();
 }
 
-StringRef root_name(StringRef path) {
-  const_iterator b = begin(path),
-                 e = end(path);
+StringRef root_name(StringRef path, Style style) {
+  const_iterator b = begin(path, style), e = end(path);
   if (b != e) {
-    bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
-    bool has_drive =
-#ifdef LLVM_ON_WIN32
-      b->endswith(":");
-#else
-      false;
-#endif
+    bool has_net =
+        b->size() > 2 && is_separator((*b)[0], style) && (*b)[1] == (*b)[0];
+    bool has_drive = (real_style(style) == Style::windows) && b->endswith(":");
 
     if (has_net || has_drive) {
       // just {C:,//net}, return the first component.
@@ -411,27 +397,21 @@ StringRef root_name(StringRef path) {
   return StringRef();
 }
 
-StringRef root_directory(StringRef path) {
-  const_iterator b = begin(path),
-                 pos = b,
-                 e = end(path);
+StringRef root_directory(StringRef path, Style style) {
+  const_iterator b = begin(path, style), pos = b, e = end(path);
   if (b != e) {
-    bool has_net = b->size() > 2 && is_separator((*b)[0]) && (*b)[1] == (*b)[0];
-    bool has_drive =
-#ifdef LLVM_ON_WIN32
-      b->endswith(":");
-#else
-      false;
-#endif
+    bool has_net =
+        b->size() > 2 && is_separator((*b)[0], style) && (*b)[1] == (*b)[0];
+    bool has_drive = (real_style(style) == Style::windows) && b->endswith(":");
 
     if ((has_net || has_drive) &&
         // {C:,//net}, skip to the next component.
-        (++pos != e) && is_separator((*pos)[0])) {
+        (++pos != e) && is_separator((*pos)[0], style)) {
       return *pos;
     }
 
     // POSIX style root directory.
-    if (!has_net && is_separator((*b)[0])) {
+    if (!has_net && is_separator((*b)[0], style)) {
       return *b;
     }
   }
@@ -440,15 +420,13 @@ StringRef root_directory(StringRef path) {
   return StringRef();
 }
 
-StringRef relative_path(StringRef path) {
-  StringRef root = root_path(path);
+StringRef relative_path(StringRef path, Style style) {
+  StringRef root = root_path(path, style);
   return path.substr(root.size());
 }
 
-void append(SmallVectorImpl<char> &path, const Twine &a,
-                                         const Twine &b,
-                                         const Twine &c,
-                                         const Twine &d) {
+void append(SmallVectorImpl<char> &path, Style style, const Twine &a,
+            const Twine &b, const Twine &c, const Twine &d) {
   SmallString<32> a_storage;
   SmallString<32> b_storage;
   SmallString<32> c_storage;
@@ -461,13 +439,15 @@ void append(SmallVectorImpl<char> &path, const Twine &a,
   if (!d.isTriviallyEmpty()) components.push_back(d.toStringRef(d_storage));
 
   for (auto &component : components) {
-    bool path_has_sep = !path.empty() && is_separator(path[path.size() - 1]);
-    bool component_has_sep = !component.empty() && is_separator(component[0]);
-    bool is_root_name = has_root_name(component);
+    bool path_has_sep =
+        !path.empty() && is_separator(path[path.size() - 1], style);
+    bool component_has_sep =
+        !component.empty() && is_separator(component[0], style);
+    bool is_root_name = has_root_name(component, style);
 
     if (path_has_sep) {
       // Strip separators from beginning of component.
-      size_t loc = component.find_first_not_of(separators);
+      size_t loc = component.find_first_not_of(separators(style));
       StringRef c = component.substr(loc);
 
       // Append it.
@@ -477,41 +457,47 @@ void append(SmallVectorImpl<char> &path, const Twine &a,
 
     if (!component_has_sep && !(path.empty() || is_root_name)) {
       // Add a separator.
-      path.push_back(preferred_separator);
+      path.push_back(preferred_separator(style));
     }
 
     path.append(component.begin(), component.end());
   }
 }
 
-void append(SmallVectorImpl<char> &path,
-            const_iterator begin, const_iterator end) {
+void append(SmallVectorImpl<char> &path, const Twine &a, const Twine &b,
+            const Twine &c, const Twine &d) {
+  append(path, Style::native, a, b, c, d);
+}
+
+void append(SmallVectorImpl<char> &path, const_iterator begin,
+            const_iterator end, Style style) {
   for (; begin != end; ++begin)
-    path::append(path, *begin);
+    path::append(path, style, *begin);
 }
 
-StringRef parent_path(StringRef path) {
-  size_t end_pos = parent_path_end(path);
+StringRef parent_path(StringRef path, Style style) {
+  size_t end_pos = parent_path_end(path, style);
   if (end_pos == StringRef::npos)
     return StringRef();
   else
     return path.substr(0, end_pos);
 }
 
-void remove_filename(SmallVectorImpl<char> &path) {
-  size_t end_pos = parent_path_end(StringRef(path.begin(), path.size()));
+void remove_filename(SmallVectorImpl<char> &path, Style style) {
+  size_t end_pos = parent_path_end(StringRef(path.begin(), path.size()), style);
   if (end_pos != StringRef::npos)
     path.set_size(end_pos);
 }
 
-void replace_extension(SmallVectorImpl<char> &path, const Twine &extension) {
+void replace_extension(SmallVectorImpl<char> &path, const Twine &extension,
+                       Style style) {
   StringRef p(path.begin(), path.size());
   SmallString<32> ext_storage;
   StringRef ext = extension.toStringRef(ext_storage);
 
   // Erase existing extension.
   size_t pos = p.find_last_of('.');
-  if (pos != StringRef::npos && pos >= filename_pos(p))
+  if (pos != StringRef::npos && pos >= filename_pos(p, style))
     path.set_size(pos);
 
   // Append '.' if needed.
@@ -523,8 +509,8 @@ void replace_extension(SmallVectorImpl<char> &path, const Twine &extension) {
 }
 
 void replace_path_prefix(SmallVectorImpl<char> &Path,
-                         const StringRef &OldPrefix,
-                         const StringRef &NewPrefix) {
+                         const StringRef &OldPrefix, const StringRef &NewPrefix,
+                         Style style) {
   if (OldPrefix.empty() && NewPrefix.empty())
     return;
 
@@ -540,53 +526,58 @@ void replace_path_prefix(SmallVectorImpl<char> &Path,
 
   StringRef RelPath = OrigPath.substr(OldPrefix.size());
   SmallString<256> NewPath;
-  path::append(NewPath, NewPrefix);
-  path::append(NewPath, RelPath);
+  path::append(NewPath, style, NewPrefix);
+  path::append(NewPath, style, RelPath);
   Path.swap(NewPath);
 }
 
-void native(const Twine &path, SmallVectorImpl<char> &result) {
+void native(const Twine &path, SmallVectorImpl<char> &result, Style style) {
   assert((!path.isSingleStringRef() ||
           path.getSingleStringRef().data() != result.data()) &&
          "path and result are not allowed to overlap!");
   // Clear result.
   result.clear();
   path.toVector(result);
-  native(result);
+  native(result, style);
 }
 
-void native(SmallVectorImpl<char> &Path) {
-#ifdef LLVM_ON_WIN32
-  std::replace(Path.begin(), Path.end(), '/', '\\');
-#else
-  for (auto PI = Path.begin(), PE = Path.end(); PI < PE; ++PI) {
-    if (*PI == '\\') {
-      auto PN = PI + 1;
-      if (PN < PE && *PN == '\\')
-        ++PI; // increment once, the for loop will move over the escaped slash
-      else
-        *PI = '/';
+void native(SmallVectorImpl<char> &Path, Style style) {
+  if (Path.empty())
+    return;
+  if (real_style(style) == Style::windows) {
+    std::replace(Path.begin(), Path.end(), '/', '\\');
+    if (Path[0] == '~' && (Path.size() == 1 || is_separator(Path[1], style))) {
+      SmallString<128> PathHome;
+      home_directory(PathHome);
+      PathHome.append(Path.begin() + 1, Path.end());
+      Path = PathHome;
+    }
+  } else {
+    for (auto PI = Path.begin(), PE = Path.end(); PI < PE; ++PI) {
+      if (*PI == '\\') {
+        auto PN = PI + 1;
+        if (PN < PE && *PN == '\\')
+          ++PI; // increment once, the for loop will move over the escaped slash
+        else
+          *PI = '/';
+      }
     }
   }
-#endif
 }
 
-std::string convert_to_slash(StringRef path) {
-#ifdef LLVM_ON_WIN32
+std::string convert_to_slash(StringRef path, Style style) {
+  if (real_style(style) != Style::windows)
+    return path;
+
   std::string s = path.str();
   std::replace(s.begin(), s.end(), '\\', '/');
   return s;
-#else
-  return path;
-#endif
 }
 
-StringRef filename(StringRef path) {
-  return *rbegin(path);
-}
+StringRef filename(StringRef path, Style style) { return *rbegin(path, style); }
 
-StringRef stem(StringRef path) {
-  StringRef fname = filename(path);
+StringRef stem(StringRef path, Style style) {
+  StringRef fname = filename(path, style);
   size_t pos = fname.find_last_of('.');
   if (pos == StringRef::npos)
     return fname;
@@ -598,8 +589,8 @@ StringRef stem(StringRef path) {
       return fname.substr(0, pos);
 }
 
-StringRef extension(StringRef path) {
-  StringRef fname = filename(path);
+StringRef extension(StringRef path, Style style) {
+  StringRef fname = filename(path, style);
   size_t pos = fname.find_last_of('.');
   if (pos == StringRef::npos)
     return StringRef();
@@ -611,110 +602,109 @@ StringRef extension(StringRef path) {
       return fname.substr(pos);
 }
 
-bool is_separator(char value) {
-  switch(value) {
-#ifdef LLVM_ON_WIN32
-    case '\\': // fall through
-#endif
-    case '/': return true;
-    default: return false;
-  }
+bool is_separator(char value, Style style) {
+  if (value == '/')
+    return true;
+  if (real_style(style) == Style::windows)
+    return value == '\\';
+  return false;
 }
 
-static const char preferred_separator_string[] = { preferred_separator, '\0' };
-
-StringRef get_separator() {
-  return preferred_separator_string;
+StringRef get_separator(Style style) {
+  if (real_style(style) == Style::windows)
+    return "\\";
+  return "/";
 }
 
-bool has_root_name(const Twine &path) {
+bool has_root_name(const Twine &path, Style style) {
   SmallString<128> path_storage;
   StringRef p = path.toStringRef(path_storage);
 
-  return !root_name(p).empty();
+  return !root_name(p, style).empty();
 }
 
-bool has_root_directory(const Twine &path) {
+bool has_root_directory(const Twine &path, Style style) {
   SmallString<128> path_storage;
   StringRef p = path.toStringRef(path_storage);
 
-  return !root_directory(p).empty();
+  return !root_directory(p, style).empty();
 }
 
-bool has_root_path(const Twine &path) {
+bool has_root_path(const Twine &path, Style style) {
   SmallString<128> path_storage;
   StringRef p = path.toStringRef(path_storage);
 
-  return !root_path(p).empty();
+  return !root_path(p, style).empty();
 }
 
-bool has_relative_path(const Twine &path) {
+bool has_relative_path(const Twine &path, Style style) {
   SmallString<128> path_storage;
   StringRef p = path.toStringRef(path_storage);
 
-  return !relative_path(p).empty();
+  return !relative_path(p, style).empty();
 }
 
-bool has_filename(const Twine &path) {
+bool has_filename(const Twine &path, Style style) {
   SmallString<128> path_storage;
   StringRef p = path.toStringRef(path_storage);
 
-  return !filename(p).empty();
+  return !filename(p, style).empty();
 }
 
-bool has_parent_path(const Twine &path) {
+bool has_parent_path(const Twine &path, Style style) {
   SmallString<128> path_storage;
   StringRef p = path.toStringRef(path_storage);
 
-  return !parent_path(p).empty();
+  return !parent_path(p, style).empty();
 }
 
-bool has_stem(const Twine &path) {
+bool has_stem(const Twine &path, Style style) {
   SmallString<128> path_storage;
   StringRef p = path.toStringRef(path_storage);
 
-  return !stem(p).empty();
+  return !stem(p, style).empty();
 }
 
-bool has_extension(const Twine &path) {
+bool has_extension(const Twine &path, Style style) {
   SmallString<128> path_storage;
   StringRef p = path.toStringRef(path_storage);
 
-  return !extension(p).empty();
+  return !extension(p, style).empty();
 }
 
-bool is_absolute(const Twine &path) {
+bool is_absolute(const Twine &path, Style style) {
   SmallString<128> path_storage;
   StringRef p = path.toStringRef(path_storage);
 
-  bool rootDir = has_root_directory(p),
-#ifdef LLVM_ON_WIN32
-       rootName = has_root_name(p);
-#else
-       rootName = true;
-#endif
+  bool rootDir = has_root_directory(p, style);
+  bool rootName =
+      (real_style(style) != Style::windows) || has_root_name(p, style);
 
   return rootDir && rootName;
 }
 
-bool is_relative(const Twine &path) { return !is_absolute(path); }
+bool is_relative(const Twine &path, Style style) {
+  return !is_absolute(path, style);
+}
 
-StringRef remove_leading_dotslash(StringRef Path) {
+StringRef remove_leading_dotslash(StringRef Path, Style style) {
   // Remove leading "./" (or ".//" or "././" etc.)
-  while (Path.size() > 2 && Path[0] == '.' && is_separator(Path[1])) {
+  while (Path.size() > 2 && Path[0] == '.' && is_separator(Path[1], style)) {
     Path = Path.substr(2);
-    while (Path.size() > 0 && is_separator(Path[0]))
+    while (Path.size() > 0 && is_separator(Path[0], style))
       Path = Path.substr(1);
   }
   return Path;
 }
 
-static SmallString<256> remove_dots(StringRef path, bool remove_dot_dot) {
+static SmallString<256> remove_dots(StringRef path, bool remove_dot_dot,
+                                    Style style) {
   SmallVector<StringRef, 16> components;
 
   // Skip the root path, then look for traversal in the components.
-  StringRef rel = path::relative_path(path);
-  for (StringRef C : llvm::make_range(path::begin(rel), path::end(rel))) {
+  StringRef rel = path::relative_path(path, style);
+  for (StringRef C :
+       llvm::make_range(path::begin(rel, style), path::end(rel))) {
     if (C == ".")
       continue;
     // Leading ".." will remain in the path unless it's at the root.
@@ -723,22 +713,23 @@ static SmallString<256> remove_dots(StringRef path, bool remove_dot_dot) {
         components.pop_back();
         continue;
       }
-      if (path::is_absolute(path))
+      if (path::is_absolute(path, style))
         continue;
     }
     components.push_back(C);
   }
 
-  SmallString<256> buffer = path::root_path(path);
+  SmallString<256> buffer = path::root_path(path, style);
   for (StringRef C : components)
-    path::append(buffer, C);
+    path::append(buffer, style, C);
   return buffer;
 }
 
-bool remove_dots(SmallVectorImpl<char> &path, bool remove_dot_dot) {
+bool remove_dots(SmallVectorImpl<char> &path, bool remove_dot_dot,
+                 Style style) {
   StringRef p(path.data(), path.size());
 
-  SmallString<256> result = remove_dots(p, remove_dot_dot);
+  SmallString<256> result = remove_dots(p, remove_dot_dot, style);
   if (result == path)
     return false;
 
@@ -776,7 +767,7 @@ createTemporaryFile(const Twine &Model, int &ResultFD,
                     llvm::SmallVectorImpl<char> &ResultPath, FSEntity Type) {
   SmallString<128> Storage;
   StringRef P = Model.toNullTerminatedStringRef(Storage);
-  assert(P.find_first_of(separators) == StringRef::npos &&
+  assert(P.find_first_of(separators(Style::native)) == StringRef::npos &&
          "Model must be a simple filename.");
   // Use P.begin() so that createUniqueEntity doesn't need to recreate Storage.
   return createUniqueEntity(P.begin(), ResultFD, ResultPath,
@@ -818,12 +809,9 @@ static std::error_code make_absolute(const Twine &current_directory,
                                      bool use_current_directory) {
   StringRef p(path.data(), path.size());
 
-  bool rootDirectory = path::has_root_directory(p),
-#ifdef LLVM_ON_WIN32
-       rootName = path::has_root_name(p);
-#else
-       rootName = true;
-#endif
+  bool rootDirectory = path::has_root_directory(p);
+  bool rootName =
+      (real_style(Style::native) != Style::windows) || path::has_root_name(p);
 
   // Already absolute.
   if (rootName && rootDirectory)
@@ -937,6 +925,36 @@ std::error_code copy_file(const Twine &From, const Twine &To) {
   return std::error_code();
 }
 
+ErrorOr<MD5::MD5Result> md5_contents(int FD) {
+  MD5 Hash;
+
+  constexpr size_t BufSize = 4096;
+  std::vector<uint8_t> Buf(BufSize);
+  int BytesRead = 0;
+  for (;;) {
+    BytesRead = read(FD, Buf.data(), BufSize);
+    if (BytesRead <= 0)
+      break;
+    Hash.update(makeArrayRef(Buf.data(), BytesRead));
+  }
+
+  if (BytesRead < 0)
+    return std::error_code(errno, std::generic_category());
+  MD5::MD5Result Result;
+  Hash.final(Result);
+  return Result;
+}
+
+ErrorOr<MD5::MD5Result> md5_contents(const Twine &Path) {
+  int FD;
+  if (auto EC = openFileForRead(Path, FD))
+    return EC;
+
+  auto Result = md5_contents(FD);
+  close(FD);
+  return Result;
+}
+
 bool exists(file_status status) {
   return status_known(status) && status.type() != file_type::file_not_found;
 }
@@ -945,6 +963,13 @@ bool status_known(file_status s) {
   return s.type() != file_type::status_error;
 }
 
+file_type get_file_type(const Twine &Path, bool Follow) {
+  file_status st;
+  if (status(Path, st, Follow))
+    return file_type::status_error;
+  return st.type();
+}
+
 bool is_directory(file_status status) {
   return status.type() == file_type::directory_file;
 }
@@ -969,6 +994,18 @@ std::error_code is_regular_file(const Twine &path, bool &result) {
   return std::error_code();
 }
 
+bool is_symlink_file(file_status status) {
+  return status.type() == file_type::symlink_file;
+}
+
+std::error_code is_symlink_file(const Twine &path, bool &result) {
+  file_status st;
+  if (std::error_code ec = status(path, st, false))
+    return ec;
+  result = is_symlink_file(st);
+  return std::error_code();
+}
+
 bool is_other(file_status status) {
   return exists(status) &&
          !is_regular_file(status) &&
@@ -1162,7 +1199,15 @@ std::error_code identify_magic(const Twine &Path, file_magic &Result) {
 }
 
 std::error_code directory_entry::status(file_status &result) const {
-  return fs::status(Path, result);
+  return fs::status(Path, result, FollowSymlinks);
+}
+
+ErrorOr<perms> getPermissions(const Twine &Path) {
+  file_status Status;
+  if (std::error_code EC = status(Path, Status))
+    return EC;
+
+  return Status.permissions();
 }
 
 } // end namespace fs
diff --git a/lib/Support/RWMutex.cpp b/lib/Support/RWMutex.cpp
index 3b6309cef21acc941deae3c3ead410fee98cfb72..6c9781c4e2d6d95af19ccea1d91e2275913b438a 100644
--- a/lib/Support/RWMutex.cpp
+++ b/lib/Support/RWMutex.cpp
@@ -13,7 +13,6 @@
 
 #include "llvm/Config/config.h"
 #include "llvm/Support/RWMutex.h"
-#include <cstring>
 
 //===----------------------------------------------------------------------===//
 //=== WARNING: Implementation here must contain only TRULY operating system
@@ -22,29 +21,31 @@
 
 #if !defined(LLVM_ENABLE_THREADS) || LLVM_ENABLE_THREADS == 0
 // Define all methods as no-ops if threading is explicitly disabled
-namespace llvm {
+
+using namespace llvm;
 using namespace sys;
-RWMutexImpl::RWMutexImpl() { }
-RWMutexImpl::~RWMutexImpl() { }
+
+RWMutexImpl::RWMutexImpl() = default;
+RWMutexImpl::~RWMutexImpl() = default;
+
 bool RWMutexImpl::reader_acquire() { return true; }
 bool RWMutexImpl::reader_release() { return true; }
 bool RWMutexImpl::writer_acquire() { return true; }
 bool RWMutexImpl::writer_release() { return true; }
-}
+
 #else
 
 #if defined(HAVE_PTHREAD_H) && defined(HAVE_PTHREAD_RWLOCK_INIT)
 
 #include <cassert>
+#include <cstdlib>
 #include <pthread.h>
-#include <stdlib.h>
 
-namespace llvm {
+using namespace llvm;
 using namespace sys;
 
 // Construct a RWMutex using pthread calls
 RWMutexImpl::RWMutexImpl()
-  : data_(nullptr)
 {
   // Declare the pthread_rwlock data structures
   pthread_rwlock_t* rwlock =
@@ -113,8 +114,6 @@ RWMutexImpl::writer_release()
   return errorcode == 0;
 }
 
-}
-
 #elif defined(LLVM_ON_UNIX)
 #include "Unix/RWMutex.inc"
 #elif defined( LLVM_ON_WIN32)
diff --git a/lib/Support/Signals.cpp b/lib/Support/Signals.cpp
index e5e38f59c0407b99e5f8ea330213c6749d7f64c2..57f36bf175b3a80956b53f4d8f7f3522eba53374 100644
--- a/lib/Support/Signals.cpp
+++ b/lib/Support/Signals.cpp
@@ -29,7 +29,6 @@
 #include <vector>
 
 namespace llvm {
-using namespace sys;
 
 //===----------------------------------------------------------------------===//
 //=== WARNING: Implementation here must contain only TRULY operating system
diff --git a/lib/Support/SourceMgr.cpp b/lib/Support/SourceMgr.cpp
index 4cb9b2ff2cdaea7f459a94025e927f56cf206f80..ca2391c10ff1009743668fbd29e55b2683c2162b 100644
--- a/lib/Support/SourceMgr.cpp
+++ b/lib/Support/SourceMgr.cpp
@@ -13,30 +13,43 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Support/SourceMgr.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/Locale.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/SourceMgr.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <string>
+#include <utility>
+
 using namespace llvm;
 
 static const size_t TabStop = 8;
 
 namespace {
+
   struct LineNoCacheTy {
     const char *LastQuery;
     unsigned LastQueryBufferID;
     unsigned LineNoOfQuery;
   };
-}
+
+} // end anonymous namespace
 
 static LineNoCacheTy *getCache(void *Ptr) {
   return (LineNoCacheTy*)Ptr;
 }
 
-
 SourceMgr::~SourceMgr() {
   // Delete the line # cache if allocated.
   if (LineNoCacheTy *Cache = getCache(LineNoCache))
@@ -132,12 +145,10 @@ void SourceMgr::PrintIncludeStack(SMLoc IncludeLoc, raw_ostream &OS) const {
      << ":" << FindLineNumber(IncludeLoc, CurBuf) << ":\n";
 }
 
-
 SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
                                    const Twine &Msg,
                                    ArrayRef<SMRange> Ranges,
                                    ArrayRef<SMFixIt> FixIts) const {
-
   // First thing to do: find the current buffer containing the specified
   // location to pull out the source line.
   SmallVector<std::pair<unsigned, unsigned>, 4> ColRanges;
@@ -223,7 +234,7 @@ void SourceMgr::PrintMessage(raw_ostream &OS, SMLoc Loc,
 void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
                              const Twine &Msg, ArrayRef<SMRange> Ranges,
                              ArrayRef<SMFixIt> FixIts, bool ShowColors) const {
-  PrintMessage(llvm::errs(), Loc, Kind, Msg, Ranges, FixIts, ShowColors);
+  PrintMessage(errs(), Loc, Kind, Msg, Ranges, FixIts, ShowColors);
 }
 
 //===----------------------------------------------------------------------===//
@@ -233,7 +244,7 @@ void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
 SMDiagnostic::SMDiagnostic(const SourceMgr &sm, SMLoc L, StringRef FN,
                            int Line, int Col, SourceMgr::DiagKind Kind,
                            StringRef Msg, StringRef LineStr,
-                           ArrayRef<std::pair<unsigned,unsigned> > Ranges,
+                           ArrayRef<std::pair<unsigned,unsigned>> Ranges,
                            ArrayRef<SMFixIt> Hints)
   : SM(&sm), Loc(L), Filename(FN), LineNo(Line), ColumnNo(Col), Kind(Kind),
     Message(Msg), LineContents(LineStr), Ranges(Ranges.vec()),
@@ -286,7 +297,7 @@ static void buildFixItLine(std::string &CaretLine, std::string &FixItLine,
     // FIXME: This assertion is intended to catch unintended use of multibyte
     // characters in fixits. If we decide to do this, we'll have to track
     // separate byte widths for the source and fixit lines.
-    assert((size_t)llvm::sys::locale::columnWidth(I->getText()) ==
+    assert((size_t)sys::locale::columnWidth(I->getText()) ==
            I->getText().size());
 
     // This relies on one byte per column in our fixit hints.
diff --git a/lib/Support/StringRef.cpp b/lib/Support/StringRef.cpp
index d81250e48ddee40b6f4a7b50e26e0116da7a85df..9b7cc1c1d182cfeddd9e98bd476cefb886a13778 100644
--- a/lib/Support/StringRef.cpp
+++ b/lib/Support/StringRef.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/APFloat.h"
 #include "llvm/ADT/APInt.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/edit_distance.h"
@@ -595,6 +596,18 @@ bool StringRef::getAsInteger(unsigned Radix, APInt &Result) const {
   return false;
 }
 
+bool StringRef::getAsDouble(double &Result, bool AllowInexact) const {
+  APFloat F(0.0);
+  APFloat::opStatus Status =
+      F.convertFromString(*this, APFloat::rmNearestTiesToEven);
+  if (Status != APFloat::opOK) {
+    if (!AllowInexact || Status != APFloat::opInexact)
+      return true;
+  }
+
+  Result = F.convertToDouble();
+  return false;
+}
 
 // Implementation of StringRef hashing.
 hash_code llvm::hash_value(StringRef S) {
diff --git a/lib/Support/TargetParser.cpp b/lib/Support/TargetParser.cpp
index 42fab671a251f791e896c013d7c0e8b6a1e7e279..639d2ece263a173b7a4355bc3d3c7929cc493097 100644
--- a/lib/Support/TargetParser.cpp
+++ b/lib/Support/TargetParser.cpp
@@ -448,6 +448,8 @@ bool llvm::AArch64::getExtensionFeatures(unsigned Extensions,
     Features.push_back("+spe");
   if (Extensions & AArch64::AEK_RAS)
     Features.push_back("+ras");
+  if (Extensions & AArch64::AEK_LSE)
+    Features.push_back("+lse");
 
   return true;
 }
@@ -725,6 +727,7 @@ unsigned llvm::ARM::parseArchProfile(StringRef Arch) {
   case ARM::AK_ARMV8R:
     return ARM::PK_R;
   case ARM::AK_ARMV7A:
+  case ARM::AK_ARMV7VE:
   case ARM::AK_ARMV7K:
   case ARM::AK_ARMV8A:
   case ARM::AK_ARMV8_1A:
@@ -761,6 +764,7 @@ unsigned llvm::ARM::parseArchVersion(StringRef Arch) {
   case ARM::AK_ARMV6M:
     return 6;
   case ARM::AK_ARMV7A:
+  case ARM::AK_ARMV7VE:
   case ARM::AK_ARMV7R:
   case ARM::AK_ARMV7M:
   case ARM::AK_ARMV7S:
diff --git a/lib/Support/Threading.cpp b/lib/Support/Threading.cpp
index 760f9e2c388ba0b2e35ca3dfe01340011ad35441..6a10b988d4648a9e95e42c2761b34f0717c52122 100644
--- a/lib/Support/Threading.cpp
+++ b/lib/Support/Threading.cpp
@@ -14,14 +14,20 @@
 
 #include "llvm/Support/Threading.h"
 #include "llvm/Config/config.h"
-#include "llvm/Support/Atomic.h"
 #include "llvm/Support/Host.h"
-#include "llvm/Support/Mutex.h"
-#include "llvm/Support/thread.h"
+
 #include <cassert>
+#include <errno.h>
+#include <stdlib.h>
+#include <string.h>
 
 using namespace llvm;
 
+//===----------------------------------------------------------------------===//
+//=== WARNING: Implementation here must contain only TRULY operating system
+//===          independent code.
+//===----------------------------------------------------------------------===//
+
 bool llvm::llvm_is_multithreaded() {
 #if LLVM_ENABLE_THREADS != 0
   return true;
@@ -30,100 +36,47 @@ bool llvm::llvm_is_multithreaded() {
 #endif
 }
 
-#if LLVM_ENABLE_THREADS != 0 && defined(HAVE_PTHREAD_H)
-#include <pthread.h>
-
-struct ThreadInfo {
-  void (*UserFn)(void *);
-  void *UserData;
-};
-static void *ExecuteOnThread_Dispatch(void *Arg) {
-  ThreadInfo *TI = reinterpret_cast<ThreadInfo*>(Arg);
-  TI->UserFn(TI->UserData);
-  return nullptr;
-}
-
-void llvm::llvm_execute_on_thread(void (*Fn)(void*), void *UserData,
+#if LLVM_ENABLE_THREADS == 0 ||                                                \
+    (!defined(LLVM_ON_WIN32) && !defined(HAVE_PTHREAD_H))
+// Support for non-Win32, non-pthread implementation.
+void llvm::llvm_execute_on_thread(void (*Fn)(void *), void *UserData,
                                   unsigned RequestedStackSize) {
-  ThreadInfo Info = { Fn, UserData };
-  pthread_attr_t Attr;
-  pthread_t Thread;
-
-  // Construct the attributes object.
-  if (::pthread_attr_init(&Attr) != 0)
-    return;
-
-  // Set the requested stack size, if given.
-  if (RequestedStackSize != 0) {
-    if (::pthread_attr_setstacksize(&Attr, RequestedStackSize) != 0)
-      goto error;
-  }
-
-  // Construct and execute the thread.
-  if (::pthread_create(&Thread, &Attr, ExecuteOnThread_Dispatch, &Info) != 0)
-    goto error;
-
-  // Wait for the thread and clean up.
-  ::pthread_join(Thread, nullptr);
-
- error:
-  ::pthread_attr_destroy(&Attr);
+  (void)RequestedStackSize;
+  Fn(UserData);
 }
-#elif LLVM_ENABLE_THREADS!=0 && defined(LLVM_ON_WIN32)
-#include "Windows/WindowsSupport.h"
-#include <process.h>
 
-// Windows will at times define MemoryFence.
-#ifdef MemoryFence
-#undef MemoryFence
-#endif
+unsigned llvm::heavyweight_hardware_concurrency() { return 1; }
 
-struct ThreadInfo {
-  void (*func)(void*);
-  void *param;
-};
+uint64_t llvm::get_threadid() { return 0; }
 
-static unsigned __stdcall ThreadCallback(void *param) {
-  struct ThreadInfo *info = reinterpret_cast<struct ThreadInfo *>(param);
-  info->func(info->param);
+uint32_t llvm::get_max_thread_name_length() { return 0; }
 
-  return 0;
-}
+void llvm::set_thread_name(const Twine &Name) {}
 
-void llvm::llvm_execute_on_thread(void (*Fn)(void*), void *UserData,
-                                  unsigned RequestedStackSize) {
-  struct ThreadInfo param = { Fn, UserData };
-
-  HANDLE hThread = (HANDLE)::_beginthreadex(NULL,
-                                            RequestedStackSize, ThreadCallback,
-                                            &param, 0, NULL);
+void llvm::get_thread_name(SmallVectorImpl<char> &Name) { Name.clear(); }
 
-  if (hThread) {
-    // We actually don't care whether the wait succeeds or fails, in
-    // the same way we don't care whether the pthread_join call succeeds
-    // or fails.  There's not much we could do if this were to fail. But
-    // on success, this call will wait until the thread finishes executing
-    // before returning.
-    (void)::WaitForSingleObject(hThread, INFINITE);
-    ::CloseHandle(hThread);
-  }
-}
 #else
-// Support for non-Win32, non-pthread implementation.
-void llvm::llvm_execute_on_thread(void (*Fn)(void*), void *UserData,
-                                  unsigned RequestedStackSize) {
-  (void) RequestedStackSize;
-  Fn(UserData);
-}
-
-#endif
 
+#include <thread>
 unsigned llvm::heavyweight_hardware_concurrency() {
-#if !LLVM_ENABLE_THREADS
-  return 1;
-#endif
+  // Since we can't get here unless LLVM_ENABLE_THREADS == 1, it is safe to use
+  // `std::thread` directly instead of `llvm::thread` (and indeed, doing so
+  // allows us to not define `thread` in the llvm namespace, which conflicts
+  // with some platforms such as FreeBSD whose headers also define a struct
+  // called `thread` in the global namespace which can cause ambiguity due to
+  // ADL.
   int NumPhysical = sys::getHostNumPhysicalCores();
   if (NumPhysical == -1)
-    return thread::hardware_concurrency();
+    return std::thread::hardware_concurrency();
   return NumPhysical;
 }
+
+// Include the platform-specific parts of this class.
+#ifdef LLVM_ON_UNIX
+#include "Unix/Threading.inc"
+#endif
+#ifdef LLVM_ON_WIN32
+#include "Windows/Threading.inc"
+#endif
+
+#endif
diff --git a/lib/Support/Timer.cpp b/lib/Support/Timer.cpp
index fbd73d0b6b3b640224ba64897eb061d526670b1d..8d68c6ae9682a6f1cb3e167add2cbb9027127475 100644
--- a/lib/Support/Timer.cpp
+++ b/lib/Support/Timer.cpp
@@ -72,22 +72,9 @@ std::unique_ptr<raw_fd_ostream> llvm::CreateInfoOutputFile() {
   return llvm::make_unique<raw_fd_ostream>(2, false); // stderr.
 }
 
-
-static TimerGroup *DefaultTimerGroup = nullptr;
 static TimerGroup *getDefaultTimerGroup() {
-  TimerGroup *tmp = DefaultTimerGroup;
-  sys::MemoryFence();
-  if (tmp) return tmp;
-
-  sys::SmartScopedLock<true> Lock(*TimerLock);
-  tmp = DefaultTimerGroup;
-  if (!tmp) {
-    tmp = new TimerGroup("misc", "Miscellaneous Ungrouped Timers");
-    sys::MemoryFence();
-    DefaultTimerGroup = tmp;
-  }
-
-  return tmp;
+  static TimerGroup DefaultTimerGroup("misc", "Miscellaneous Ungrouped Timers");
+  return &DefaultTimerGroup;
 }
 
 //===----------------------------------------------------------------------===//
@@ -309,7 +296,7 @@ void TimerGroup::PrintQueuedTimers(raw_ostream &OS) {
   // If this is not an collection of ungrouped times, print the total time.
   // Ungrouped timers don't really make sense to add up.  We still print the
   // TOTAL line to make the percentages make sense.
-  if (this != DefaultTimerGroup)
+  if (this != getDefaultTimerGroup())
     OS << format("  Total Execution Time: %5.4f seconds (%5.4f wall clock)\n",
                  Total.getProcessTime(), Total.getWallTime());
   OS << '\n';
diff --git a/lib/Support/Triple.cpp b/lib/Support/Triple.cpp
index 808c6dc406c0fdbbc6587a69b662128f87b8ae2b..64d5977e2ebd77f41d2f84ebae60077edac67436 100644
--- a/lib/Support/Triple.cpp
+++ b/lib/Support/Triple.cpp
@@ -551,6 +551,8 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) {
   case ARM::AK_ARMV7A:
   case ARM::AK_ARMV7R:
     return Triple::ARMSubArch_v7;
+  case ARM::AK_ARMV7VE:
+    return Triple::ARMSubArch_v7ve;
   case ARM::AK_ARMV7K:
     return Triple::ARMSubArch_v7k;
   case ARM::AK_ARMV7M:
@@ -1513,6 +1515,7 @@ StringRef Triple::getARMCPUForArch(StringRef MArch) const {
       return "strongarm";
     }
   case llvm::Triple::NaCl:
+  case llvm::Triple::OpenBSD:
     return "cortex-a8";
   default:
     switch (getEnvironment()) {
diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc
index ecc9a2ea8e2156f38f87d55e05e610417520cc04..93f8982196b3c5285d2742932a7e3762ed3b2f3d 100644
--- a/lib/Support/Unix/Path.inc
+++ b/lib/Support/Unix/Path.inc
@@ -48,6 +48,8 @@
 # endif
 #endif
 
+#include <pwd.h>
+
 #ifdef __APPLE__
 #include <mach-o/dyld.h>
 #include <sys/attr.h>
@@ -65,23 +67,41 @@
 #endif
 
 #include <sys/types.h>
-#if !defined(__APPLE__) && !defined(__OpenBSD__) && !defined(__ANDROID__)
+#if !defined(__APPLE__) && !defined(__OpenBSD__) && !defined(__FreeBSD__) &&   \
+    !defined(__linux__)
 #include <sys/statvfs.h>
 #define STATVFS statvfs
+#define FSTATVFS fstatvfs
 #define STATVFS_F_FRSIZE(vfs) vfs.f_frsize
 #else
-#ifdef __OpenBSD__
+#if defined(__OpenBSD__) || defined(__FreeBSD__)
 #include <sys/param.h>
 #include <sys/mount.h>
-#elif defined(__ANDROID__)
+#elif defined(__linux__)
+#if defined(HAVE_LINUX_MAGIC_H)
+#include <linux/magic.h>
+#else
+#if defined(HAVE_LINUX_NFS_FS_H)
+#include <linux/nfs_fs.h>
+#endif
+#if defined(HAVE_LINUX_SMB_H)
+#include <linux/smb.h>
+#endif
+#endif
 #include <sys/vfs.h>
 #else
 #include <sys/mount.h>
 #endif
 #define STATVFS statfs
+#define FSTATVFS fstatfs
 #define STATVFS_F_FRSIZE(vfs) static_cast<uint64_t>(vfs.f_bsize)
 #endif
 
+#if defined(__NetBSD__)
+#define STATVFS_F_FLAG(vfs) (vfs).f_flag
+#else
+#define STATVFS_F_FLAG(vfs) (vfs).f_flags
+#endif
 
 using namespace llvm;
 
@@ -180,7 +200,7 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
       if (getprogpath(exe_path, argv0))
         return exe_path;
   }
-#elif defined(HAVE_DLFCN_H)
+#elif defined(HAVE_DLFCN_H) && defined(HAVE_DLADDR)
   // Use dladdr to get executable path if available.
   Dl_info DLInfo;
   int err = dladdr(MainAddr, &DLInfo);
@@ -210,6 +230,10 @@ UniqueID file_status::getUniqueID() const {
   return UniqueID(fs_st_dev, fs_st_ino);
 }
 
+uint32_t file_status::getLinkCount() const {
+  return fs_st_nlinks;
+}
+
 ErrorOr<space_info> disk_space(const Twine &Path) {
   struct STATVFS Vfs;
   if (::STATVFS(Path.str().c_str(), &Vfs))
@@ -335,6 +359,51 @@ std::error_code remove(const Twine &path, bool IgnoreNonExisting) {
   return std::error_code();
 }
 
+static bool is_local_impl(struct STATVFS &Vfs) {
+#if defined(__linux__)
+#ifndef NFS_SUPER_MAGIC
+#define NFS_SUPER_MAGIC 0x6969
+#endif
+#ifndef SMB_SUPER_MAGIC
+#define SMB_SUPER_MAGIC 0x517B
+#endif
+#ifndef CIFS_MAGIC_NUMBER
+#define CIFS_MAGIC_NUMBER 0xFF534D42
+#endif
+  switch ((uint32_t)Vfs.f_type) {
+  case NFS_SUPER_MAGIC:
+  case SMB_SUPER_MAGIC:
+  case CIFS_MAGIC_NUMBER:
+    return false;
+  default:
+    return true;
+  }
+#elif defined(__CYGWIN__)
+  // Cygwin doesn't expose this information; would need to use Win32 API.
+  return false;
+#else
+  return !!(STATVFS_F_FLAG(Vfs) & MNT_LOCAL);
+#endif
+}
+
+std::error_code is_local(const Twine &Path, bool &Result) {
+  struct STATVFS Vfs;
+  if (::STATVFS(Path.str().c_str(), &Vfs))
+    return std::error_code(errno, std::generic_category());
+
+  Result = is_local_impl(Vfs);
+  return std::error_code();
+}
+
+std::error_code is_local(int FD, bool &Result) {
+  struct STATVFS Vfs;
+  if (::FSTATVFS(FD, &Vfs))
+    return std::error_code(errno, std::generic_category());
+
+  Result = is_local_impl(Vfs);
+  return std::error_code();
+}
+
 std::error_code rename(const Twine &from, const Twine &to) {
   // Get arguments.
   SmallString<128> from_storage;
@@ -415,6 +484,46 @@ std::error_code equivalent(const Twine &A, const Twine &B, bool &result) {
   return std::error_code();
 }
 
+static void expandTildeExpr(SmallVectorImpl<char> &Path) {
+  StringRef PathStr(Path.begin(), Path.size());
+  if (PathStr.empty() || !PathStr.startswith("~"))
+    return;
+
+  PathStr = PathStr.drop_front();
+  StringRef Expr =
+      PathStr.take_until([](char c) { return path::is_separator(c); });
+  StringRef Remainder = PathStr.substr(Expr.size() + 1);
+  SmallString<128> Storage;
+  if (Expr.empty()) {
+    // This is just ~/..., resolve it to the current user's home dir.
+    if (!path::home_directory(Storage) || Storage.empty()) {
+      // We couldn't get a usable (non-empty) home directory.  Just exit.
+      return;
+    }
+
+    // Overwrite the first character and insert the rest.
+    Path[0] = Storage[0];
+    Path.insert(Path.begin() + 1, Storage.begin() + 1, Storage.end());
+    return;
+  }
+
+  // This is a string of the form ~username/, look up this user's entry in the
+  // password database.
+  struct passwd *Entry = nullptr;
+  std::string User = Expr.str();
+  Entry = ::getpwnam(User.c_str());
+
+  if (!Entry) {
+    // Unable to look up the entry, just return back the original path.
+    return;
+  }
+
+  Storage = Remainder;
+  Path.clear();
+  Path.append(Entry->pw_dir, Entry->pw_dir + strlen(Entry->pw_dir));
+  llvm::sys::path::append(Path, Storage);
+}
+
 static std::error_code fillStatus(int StatRet, const struct stat &Status,
                              file_status &Result) {
   if (StatRet != 0) {
@@ -440,22 +549,23 @@ static std::error_code fillStatus(int StatRet, const struct stat &Status,
     Type = file_type::fifo_file;
   else if (S_ISSOCK(Status.st_mode))
     Type = file_type::socket_file;
+  else if (S_ISLNK(Status.st_mode))
+    Type = file_type::symlink_file;
 
-  perms Perms = static_cast<perms>(Status.st_mode);
-  Result =
-      file_status(Type, Perms, Status.st_dev, Status.st_ino, Status.st_atime,
-                  Status.st_mtime, Status.st_uid, Status.st_gid,
-                  Status.st_size);
+  perms Perms = static_cast<perms>(Status.st_mode) & all_perms;
+  Result = file_status(Type, Perms, Status.st_dev, Status.st_nlink,
+                       Status.st_ino, Status.st_atime, Status.st_mtime,
+                       Status.st_uid, Status.st_gid, Status.st_size);
 
   return std::error_code();
 }
 
-std::error_code status(const Twine &Path, file_status &Result) {
+std::error_code status(const Twine &Path, file_status &Result, bool Follow) {
   SmallString<128> PathStorage;
   StringRef P = Path.toNullTerminatedStringRef(PathStorage);
 
   struct stat Status;
-  int StatRet = ::stat(P.begin(), &Status);
+  int StatRet = (Follow ? ::stat : ::lstat)(P.begin(), &Status);
   return fillStatus(StatRet, Status, Result);
 }
 
@@ -465,6 +575,15 @@ std::error_code status(int FD, file_status &Result) {
   return fillStatus(StatRet, Status, Result);
 }
 
+std::error_code setPermissions(const Twine &Path, perms Permissions) {
+  SmallString<128> PathStorage;
+  StringRef P = Path.toNullTerminatedStringRef(PathStorage);
+
+  if (::chmod(P.begin(), Permissions))
+    return std::error_code(errno, std::generic_category());
+  return std::error_code();
+}
+
 std::error_code setLastModificationAndAccessTime(int FD, TimePoint<> Time) {
 #if defined(HAVE_FUTIMENS)
   timespec Times[2];
@@ -491,6 +610,26 @@ std::error_code mapped_file_region::init(int FD, uint64_t Offset,
 
   int flags = (Mode == readwrite) ? MAP_SHARED : MAP_PRIVATE;
   int prot = (Mode == readonly) ? PROT_READ : (PROT_READ | PROT_WRITE);
+#if defined(__APPLE__)
+  //----------------------------------------------------------------------
+  // Newer versions of MacOSX have a flag that will allow us to read from
+  // binaries whose code signature is invalid without crashing by using
+  // the MAP_RESILIENT_CODESIGN flag. Also if a file from removable media
+  // is mapped we can avoid crashing and return zeroes to any pages we try
+  // to read if the media becomes unavailable by using the
+  // MAP_RESILIENT_MEDIA flag.  These flags are only usable when mapping
+  // with PROT_READ, so take care not to specify them otherwise.
+  //----------------------------------------------------------------------
+  if (Mode == readonly) {
+#if defined(MAP_RESILIENT_CODESIGN)
+    flags |= MAP_RESILIENT_CODESIGN;
+#endif
+#if defined(MAP_RESILIENT_MEDIA)
+    flags |= MAP_RESILIENT_MEDIA;
+#endif
+  }
+#endif // #if defined (__APPLE__)
+
   Mapping = ::mmap(nullptr, Size, prot, flags, FD, Offset);
   if (Mapping == MAP_FAILED)
     return std::error_code(errno, std::generic_category());
@@ -536,7 +675,8 @@ int mapped_file_region::alignment() {
 }
 
 std::error_code detail::directory_iterator_construct(detail::DirIterState &it,
-                                                StringRef path){
+                                                     StringRef path,
+                                                     bool follow_symlinks) {
   SmallString<128> path_null(path);
   DIR *directory = ::opendir(path_null.c_str());
   if (!directory)
@@ -545,7 +685,7 @@ std::error_code detail::directory_iterator_construct(detail::DirIterState &it,
   it.IterationHandle = reinterpret_cast<intptr_t>(directory);
   // Add something for replace_filename to replace.
   path::append(path_null, ".");
-  it.CurrentEntry = directory_entry(path_null.str());
+  it.CurrentEntry = directory_entry(path_null.str(), follow_symlinks);
   return directory_iterator_increment(it);
 }
 
@@ -713,18 +853,85 @@ std::error_code getPathFromOpenFD(int FD, SmallVectorImpl<char> &ResultPath) {
   return std::error_code();
 }
 
+template <typename T>
+static std::error_code remove_directories_impl(const T &Entry,
+                                               bool IgnoreErrors) {
+  std::error_code EC;
+  directory_iterator Begin(Entry, EC, false);
+  directory_iterator End;
+  while (Begin != End) {
+    auto &Item = *Begin;
+    file_status st;
+    EC = Item.status(st);
+    if (EC && !IgnoreErrors)
+      return EC;
+
+    if (is_directory(st)) {
+      EC = remove_directories_impl(Item, IgnoreErrors);
+      if (EC && !IgnoreErrors)
+        return EC;
+    }
+
+    EC = fs::remove(Item.path(), true);
+    if (EC && !IgnoreErrors)
+      return EC;
+
+    Begin.increment(EC);
+    if (EC && !IgnoreErrors)
+      return EC;
+  }
+  return std::error_code();
+}
+
+std::error_code remove_directories(const Twine &path, bool IgnoreErrors) {
+  auto EC = remove_directories_impl(path, IgnoreErrors);
+  if (EC && !IgnoreErrors)
+    return EC;
+  EC = fs::remove(path, true);
+  if (EC && !IgnoreErrors)
+    return EC;
+  return std::error_code();
+}
+
+std::error_code real_path(const Twine &path, SmallVectorImpl<char> &dest,
+                          bool expand_tilde) {
+  dest.clear();
+  if (path.isTriviallyEmpty())
+    return std::error_code();
+
+  if (expand_tilde) {
+    SmallString<128> Storage;
+    path.toVector(Storage);
+    expandTildeExpr(Storage);
+    return real_path(Storage, dest, false);
+  }
+
+  int fd;
+  std::error_code EC = openFileForRead(path, fd, &dest);
+
+  if (EC)
+    return EC;
+  ::close(fd);
+  return std::error_code();
+}
+
 } // end namespace fs
 
 namespace path {
 
 bool home_directory(SmallVectorImpl<char> &result) {
-  if (char *RequestedDir = getenv("HOME")) {
-    result.clear();
-    result.append(RequestedDir, RequestedDir + strlen(RequestedDir));
-    return true;
+  char *RequestedDir = getenv("HOME");
+  if (!RequestedDir) {
+    struct passwd *pw = getpwuid(getuid());
+    if (pw && pw->pw_dir)
+      RequestedDir = pw->pw_dir;
   }
+  if (!RequestedDir)
+    return false;
 
-  return false;
+  result.clear();
+  result.append(RequestedDir, RequestedDir + strlen(RequestedDir));
+  return true;
 }
 
 static bool getDarwinConfDir(bool TempDir, SmallVectorImpl<char> &Result) {
diff --git a/lib/Support/Unix/Signals.inc b/lib/Support/Unix/Signals.inc
index 9752b70644c64a2c5a69a4e923e1bb8893507309..88ad21e9806ed7e3282ceb397c2c9355bb6a7d4a 100644
--- a/lib/Support/Unix/Signals.inc
+++ b/lib/Support/Unix/Signals.inc
@@ -25,8 +25,8 @@
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <string>
-#if HAVE_EXECINFO_H
-# include <execinfo.h>         // For backtrace().
+#ifdef HAVE_BACKTRACE
+# include BACKTRACE_HEADER         // For backtrace().
 #endif
 #if HAVE_SIGNAL_H
 #include <signal.h>
@@ -59,7 +59,7 @@ using namespace llvm;
 
 static RETSIGTYPE SignalHandler(int Sig);  // defined below.
 
-static ManagedStatic<SmartMutex<true> > SignalsMutex;
+static ManagedStatic<sys::SmartMutex<true> > SignalsMutex;
 
 /// InterruptFunction - The function to call if ctrl-c is pressed.
 static void (*InterruptFunction)() = nullptr;
@@ -149,11 +149,7 @@ static void CreateSigAltStack() {}
 #endif
 
 static void RegisterHandlers() {
-  // We need to dereference the signals mutex during handler registration so
-  // that we force its construction. This is to prevent the first use being
-  // during handling an actual signal because you can't safely call new in a
-  // signal handler.
-  *SignalsMutex;
+  sys::SmartScopedLock<true> Guard(*SignalsMutex);
 
   // If the handlers are already registered, we're done.
   if (NumRegisteredSignals != 0) return;
@@ -223,7 +219,7 @@ static RETSIGTYPE SignalHandler(int Sig) {
   sigprocmask(SIG_UNBLOCK, &SigMask, nullptr);
 
   {
-    unique_lock<SmartMutex<true>> Guard(*SignalsMutex);
+    unique_lock<sys::SmartMutex<true>> Guard(*SignalsMutex);
     RemoveFilesToRemove();
 
     if (std::find(std::begin(IntSigs), std::end(IntSigs), Sig)
@@ -412,7 +408,7 @@ void llvm::sys::PrintStackTrace(raw_ostream &OS) {
 
   if (printSymbolizedStackTrace(Argv0, StackTrace, depth, OS))
     return;
-#if HAVE_DLFCN_H && __GNUG__ && !defined(__CYGWIN__)
+#if HAVE_DLFCN_H && HAVE_DLADDR
   int width = 0;
   for (int i = 0; i < depth; ++i) {
     Dl_info dlinfo;
@@ -462,7 +458,7 @@ void llvm::sys::PrintStackTrace(raw_ostream &OS) {
 }
 
 static void PrintStackTraceSignalHandler(void *) {
-  PrintStackTrace(llvm::errs());
+  sys::PrintStackTrace(llvm::errs());
 }
 
 void llvm::sys::DisableSystemDialogsOnCrash() {}
diff --git a/lib/Support/Unix/Threading.inc b/lib/Support/Unix/Threading.inc
new file mode 100644
index 0000000000000000000000000000000000000000..407b194e1b6ae4fb02c387e4aa7f3d81ca44f474
--- /dev/null
+++ b/lib/Support/Unix/Threading.inc
@@ -0,0 +1,215 @@
+//===- Unix/Threading.inc - Unix Threading Implementation ----- -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Unix specific implementation of Threading functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+
+#if defined(__APPLE__)
+#include <mach/mach_init.h>
+#include <mach/mach_port.h>
+#endif
+
+#include <pthread.h>
+
+#if defined(__FreeBSD__)
+#include <pthread_np.h> // For pthread_getthreadid_np()
+#endif
+
+#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+#include <sys/sysctl.h>
+#include <sys/user.h>
+#include <errno.h>
+#include <unistd.h>
+#endif
+
+#if defined(__NetBSD__)
+#include <lwp.h>  // For _lwp_self()
+#endif
+
+#if defined(__linux__)
+#include <unistd.h> // For syscall()
+#include <sys/syscall.h>  // For syscall codes
+#endif
+
+namespace {
+  struct ThreadInfo {
+    void(*UserFn)(void *);
+    void *UserData;
+  };
+}
+
+static void *ExecuteOnThread_Dispatch(void *Arg) {
+  ThreadInfo *TI = reinterpret_cast<ThreadInfo*>(Arg);
+  TI->UserFn(TI->UserData);
+  return nullptr;
+}
+
+void llvm::llvm_execute_on_thread(void(*Fn)(void*), void *UserData,
+  unsigned RequestedStackSize) {
+  ThreadInfo Info = { Fn, UserData };
+  pthread_attr_t Attr;
+  pthread_t Thread;
+
+  // Construct the attributes object.
+  if (::pthread_attr_init(&Attr) != 0)
+    return;
+
+  // Set the requested stack size, if given.
+  if (RequestedStackSize != 0) {
+    if (::pthread_attr_setstacksize(&Attr, RequestedStackSize) != 0)
+      goto error;
+  }
+
+  // Construct and execute the thread.
+  if (::pthread_create(&Thread, &Attr, ExecuteOnThread_Dispatch, &Info) != 0)
+    goto error;
+
+  // Wait for the thread and clean up.
+  ::pthread_join(Thread, nullptr);
+
+error:
+  ::pthread_attr_destroy(&Attr);
+}
+
+
+uint64_t llvm::get_threadid() {
+#if defined(__APPLE__)
+  // Calling "mach_thread_self()" bumps the reference count on the thread
+  // port, so we need to deallocate it. mach_task_self() doesn't bump the ref
+  // count.
+  thread_port_t Self = mach_thread_self();
+  mach_port_deallocate(mach_task_self(), Self);
+  return Self;
+#elif defined(__FreeBSD__)
+  return uint64_t(pthread_getthreadid_np());
+#elif defined(__NetBSD__)
+  return uint64_t(_lwp_self());
+#elif defined(__ANDROID__)
+  return uint64_t(gettid());
+#elif defined(__linux__)
+  return uint64_t(syscall(SYS_gettid));
+#elif defined(LLVM_ON_WIN32)
+  return uint64_t(::GetCurrentThreadId());
+#else
+  return uint64_t(pthread_self());
+#endif
+}
+
+
+static constexpr uint32_t get_max_thread_name_length_impl() {
+#if defined(__NetBSD__)
+  return PTHREAD_MAX_NAMELEN_NP;
+#elif defined(__APPLE__)
+  return 64;
+#elif defined(__linux__)
+#if HAVE_PTHREAD_SETNAME_NP
+  return 16;
+#else
+  return 0;
+#endif
+#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+  return 16;
+#else
+  return 0;
+#endif
+}
+
+uint32_t llvm::get_max_thread_name_length() {
+  return get_max_thread_name_length_impl();
+}
+
+void llvm::set_thread_name(const Twine &Name) {
+  // Make sure the input is null terminated.
+  SmallString<64> Storage;
+  StringRef NameStr = Name.toNullTerminatedStringRef(Storage);
+
+  // Truncate from the beginning, not the end, if the specified name is too
+  // long.  This keeps the result null terminated, and the end of a long
+  // thread name is usually more unique than the beginning, since similar
+  // threads tend to share a common prefix.  The platform limits count the
+  // terminating null, so keep at most one character fewer than the limit.
+  if (get_max_thread_name_length() > 0)
+    NameStr = NameStr.take_back(get_max_thread_name_length() - 1);
+  (void)NameStr;
+#if defined(__linux__)
+#if (defined(__GLIBC__) && defined(_GNU_SOURCE)) || defined(__ANDROID__)
+#if HAVE_PTHREAD_SETNAME_NP
+  ::pthread_setname_np(::pthread_self(), NameStr.data());
+#endif
+#endif
+#elif defined(__FreeBSD__)
+  ::pthread_set_name_np(::pthread_self(), NameStr.data());
+#elif defined(__NetBSD__)
+  ::pthread_setname_np(::pthread_self(), "%s",
+    const_cast<char *>(NameStr.data()));
+#elif defined(__APPLE__)
+  ::pthread_setname_np(NameStr.data());
+#endif
+}
+
+void llvm::get_thread_name(SmallVectorImpl<char> &Name) {
+  Name.clear();
+
+#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+  int pid = ::getpid();
+  uint64_t tid = get_threadid();
+
+  struct kinfo_proc *kp = nullptr, *nkp;
+  size_t len = 0;
+  int error;
+  int ctl[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PID | KERN_PROC_INC_THREAD,
+    (int)pid };
+
+  while (1) {
+    error = sysctl(ctl, 4, kp, &len, nullptr, 0);
+    if (kp == nullptr || (error != 0 && errno == ENOMEM)) {
+      // Add extra space in case threads are added before next call.
+      len += sizeof(*kp) + len / 10;
+      nkp = (struct kinfo_proc *)realloc(kp, len);
+      if (nkp == nullptr) {
+        free(kp);
+        return;
+      }
+      kp = nkp;
+      continue;
+    }
+    if (error != 0)
+      len = 0;
+    break;
+  }
+
+  for (size_t i = 0; i < len / sizeof(*kp); i++) {
+    if (kp[i].ki_tid == (lwpid_t)tid) {
+      Name.append(kp[i].ki_tdname, kp[i].ki_tdname + strlen(kp[i].ki_tdname));
+      break;
+    }
+  }
+  free(kp);
+  return;
+#elif defined(__NetBSD__)
+  constexpr uint32_t len = get_max_thread_name_length_impl();
+  char buf[len];
+  // Leave Name empty rather than reading an unset buffer if the query fails.
+  if (0 == ::pthread_getname_np(::pthread_self(), buf, len))
+    Name.append(buf, buf + strlen(buf));
+#elif defined(__linux__)
+#if (defined(__GLIBC__) && defined(_GNU_SOURCE)) || defined(__ANDROID__)
+#if HAVE_PTHREAD_GETNAME_NP
+  constexpr uint32_t len = get_max_thread_name_length_impl();
+  char Buffer[len];
+  if (0 == ::pthread_getname_np(::pthread_self(), Buffer, len))
+    Name.append(Buffer, Buffer + strlen(Buffer));
+#endif
+#endif
+#endif
+}
diff --git a/lib/Support/Windows/DynamicLibrary.inc b/lib/Support/Windows/DynamicLibrary.inc
index 050689483deb403a615879517abd0517aecedfec..709499deeafa93c0fadbdc555fa48232850b45dc 100644
--- a/lib/Support/Windows/DynamicLibrary.inc
+++ b/lib/Support/Windows/DynamicLibrary.inc
@@ -24,7 +24,6 @@
 #endif
 
 namespace llvm {
-using namespace sys;
 
 //===----------------------------------------------------------------------===//
 //=== WARNING: Implementation here must contain only Win32 specific code
@@ -33,7 +32,7 @@ using namespace sys;
 
 typedef BOOL (WINAPI *fpEnumerateLoadedModules)(HANDLE,PENUMLOADED_MODULES_CALLBACK64,PVOID);
 static fpEnumerateLoadedModules fEnumerateLoadedModules;
-static DenseSet<HMODULE> *OpenedHandles;
+static llvm::ManagedStatic<DenseSet<HMODULE> > OpenedHandles;
 
 static bool loadDebugHelp(void) {
   HMODULE hLib = ::LoadLibraryW(L"Dbghelp.dll");
@@ -51,15 +50,13 @@ ELM_Callback(PCSTR ModuleName, DWORD64 ModuleBase,
   return TRUE;
 }
 
-DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
-                                                   std::string *errMsg) {
+sys::DynamicLibrary
+sys::DynamicLibrary::getPermanentLibrary(const char *filename,
+                                         std::string *errMsg) {
   SmartScopedLock<true> lock(*SymbolsMutex);
 
   if (!filename) {
     // When no file is specified, enumerate all DLLs and EXEs in the process.
-    if (OpenedHandles == 0)
-      OpenedHandles = new DenseSet<HMODULE>();
-
     if (!fEnumerateLoadedModules) {
       if (!loadDebugHelp()) {
         assert(false && "These APIs should always be available");
@@ -79,7 +76,7 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
     MakeErrMsg(errMsg, std::string(filename) + ": Can't convert to UTF-16");
     return DynamicLibrary();
   }
-  
+
   HMODULE a_handle = LoadLibraryW(filenameUnicode.data());
 
   if (a_handle == 0) {
@@ -87,9 +84,6 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
     return DynamicLibrary();
   }
 
-  if (OpenedHandles == 0)
-    OpenedHandles = new DenseSet<HMODULE>();
-
   // If we've already loaded this library, FreeLibrary() the handle in order to
   // keep the internal refcount at +1.
   if (!OpenedHandles->insert(a_handle).second)
@@ -98,6 +92,18 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
   return DynamicLibrary(a_handle);
 }
 
+sys::DynamicLibrary
+sys::DynamicLibrary::addPermanentLibrary(void *handle, std::string *errMsg) {
+  SmartScopedLock<true> lock(*SymbolsMutex);
+  // Record the handle; insertion succeeds only for a handle not seen before.
+  const bool IsNew = OpenedHandles->insert((HMODULE)handle).second;
+  if (IsNew)
+    return DynamicLibrary(handle);
+  // Already registered: report it and return a default-constructed library.
+  MakeErrMsg(errMsg, "Library already loaded");
+  return DynamicLibrary();
+}
+
 // Stack probing routines are in the support library (e.g. libgcc), but we don't
 // have dynamic linking on windows. Provide a hook.
 #define EXPLICIT_SYMBOL(SYM)                    \
@@ -123,7 +129,7 @@ DynamicLibrary DynamicLibrary::getPermanentLibrary(const char *filename,
 #undef INLINE_DEF_SYMBOL1
 #undef INLINE_DEF_SYMBOL2
 
-void* DynamicLibrary::SearchForAddressOfSymbol(const char* symbolName) {
+void *sys::DynamicLibrary::SearchForAddressOfSymbol(const char *symbolName) {
   SmartScopedLock<true> Lock(*SymbolsMutex);
 
   // First check symbols added via AddSymbol().
@@ -135,7 +141,7 @@ void* DynamicLibrary::SearchForAddressOfSymbol(const char* symbolName) {
   }
 
   // Now search the libraries.
-  if (OpenedHandles) {
+  if (OpenedHandles.isConstructed()) {
     for (DenseSet<HMODULE>::iterator I = OpenedHandles->begin(),
          E = OpenedHandles->end(); I != E; ++I) {
       FARPROC ptr = GetProcAddress((HMODULE)*I, symbolName);
@@ -171,7 +177,7 @@ void* DynamicLibrary::SearchForAddressOfSymbol(const char* symbolName) {
   return 0;
 }
 
-void *DynamicLibrary::getAddressOfSymbol(const char *symbolName) {
+void *sys::DynamicLibrary::getAddressOfSymbol(const char *symbolName) {
   if (!isValid())
     return NULL;
   if (Data == &OpenedHandles)
diff --git a/lib/Support/Windows/Mutex.inc b/lib/Support/Windows/Mutex.inc
index ab79d079122f1f1202099260305a7f11b0012bf8..0af145ec9a4e6c6c61dcf8fd98d9aa39912e744c 100644
--- a/lib/Support/Windows/Mutex.inc
+++ b/lib/Support/Windows/Mutex.inc
@@ -20,15 +20,14 @@
 #include "llvm/Support/Mutex.h"
 
 namespace llvm {
-using namespace sys;
 
-MutexImpl::MutexImpl(bool /*recursive*/)
+sys::MutexImpl::MutexImpl(bool /*recursive*/)
 {
   data_ = new CRITICAL_SECTION;
   InitializeCriticalSection((LPCRITICAL_SECTION)data_);
 }
 
-MutexImpl::~MutexImpl()
+sys::MutexImpl::~MutexImpl()
 {
   DeleteCriticalSection((LPCRITICAL_SECTION)data_);
   delete (LPCRITICAL_SECTION)data_;
@@ -36,21 +35,21 @@ MutexImpl::~MutexImpl()
 }
 
 bool
-MutexImpl::acquire()
+sys::MutexImpl::acquire()
 {
   EnterCriticalSection((LPCRITICAL_SECTION)data_);
   return true;
 }
 
 bool
-MutexImpl::release()
+sys::MutexImpl::release()
 {
   LeaveCriticalSection((LPCRITICAL_SECTION)data_);
   return true;
 }
 
 bool
-MutexImpl::tryacquire()
+sys::MutexImpl::tryacquire()
 {
   return TryEnterCriticalSection((LPCRITICAL_SECTION)data_);
 }
diff --git a/lib/Support/Windows/Path.inc b/lib/Support/Windows/Path.inc
index 3597b55109889aebae02e8f12908a16c303fee2b..b00d3905f6584c1f261e7b490a7c92fa624efb7c 100644
--- a/lib/Support/Windows/Path.inc
+++ b/lib/Support/Windows/Path.inc
@@ -26,6 +26,7 @@
 // These two headers must be included last, and make sure shlobj is required
 // after Windows.h to make sure it picks up our definition of _WIN32_WINNT
 #include "WindowsSupport.h"
+#include <shellapi.h>
 #include <shlobj.h>
 
 #undef max
@@ -178,6 +179,10 @@ TimePoint<> file_status::getLastModificationTime() const {
   return toTimePoint(Time);
 }
 
+uint32_t file_status::getLinkCount() const {
+  return NumLinks;
+}
+
 std::error_code current_path(SmallVectorImpl<char> &result) {
   SmallVector<wchar_t, MAX_PATH> cur_path;
   DWORD len = MAX_PATH;
@@ -277,6 +282,80 @@ std::error_code remove(const Twine &path, bool IgnoreNonExisting) {
   return std::error_code();
 }
 
+static std::error_code is_local_internal(SmallVectorImpl<wchar_t> &Path,
+                                         bool &Result) {
+  SmallVector<wchar_t, 128> VolumePath;
+  size_t Len = 128;
+  while (true) {
+    VolumePath.resize(Len);
+    BOOL Success =
+        ::GetVolumePathNameW(Path.data(), VolumePath.data(), VolumePath.size());
+
+    if (Success)
+      break;
+
+    DWORD Err = ::GetLastError();
+    if (Err != ERROR_INSUFFICIENT_BUFFER)
+      return mapWindowsError(Err);
+
+    Len *= 2;
+  }
+  // If the output buffer has exactly enough space for the path name, but not
+  // the null terminator, it will leave the output unterminated.  Push a null
+  // terminator onto the end to ensure that this never happens.
+  VolumePath.push_back(L'\0');
+  VolumePath.set_size(wcslen(VolumePath.data()));
+  const wchar_t *P = VolumePath.data();
+
+  UINT Type = ::GetDriveTypeW(P);
+  switch (Type) {
+  case DRIVE_FIXED:
+    Result = true;
+    return std::error_code();
+  case DRIVE_REMOTE:
+  case DRIVE_CDROM:
+  case DRIVE_RAMDISK:
+  case DRIVE_REMOVABLE:
+    Result = false;
+    return std::error_code();
+  default:
+    return make_error_code(errc::no_such_file_or_directory);
+  }
+  llvm_unreachable("Unreachable!");
+}
+
+std::error_code is_local(const Twine &path, bool &result) {
+  if (!llvm::sys::fs::exists(path) || !llvm::sys::path::has_root_path(path))
+    return make_error_code(errc::no_such_file_or_directory);
+
+  SmallString<128> Storage;
+  StringRef P = path.toStringRef(Storage);
+
+  // Convert to utf-16.
+  SmallVector<wchar_t, 128> WidePath;
+  if (std::error_code ec = widenPath(P, WidePath))
+    return ec;
+  return is_local_internal(WidePath, result);
+}
+
+// Reports via Result whether the volume holding the file open on FD is local.
+std::error_code is_local(int FD, bool &Result) {
+  SmallVector<wchar_t, 128> FinalPath;
+  HANDLE Handle = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
+  size_t Len = 128;
+  do {
+    FinalPath.reserve(Len);
+    // A too-small buffer yields the required size including the null, which
+    // exceeds the size passed in; success yields a strictly smaller length.
+    Len = ::GetFinalPathNameByHandleW(Handle, FinalPath.data(),
+                                      FinalPath.capacity(), VOLUME_NAME_NT);
+    if (Len == 0)
+      return mapWindowsError(::GetLastError());
+  } while (Len >= FinalPath.capacity());
+  FinalPath.set_size(Len);
+  return is_local_internal(FinalPath, Result);
+}
+
 std::error_code rename(const Twine &from, const Twine &to) {
   // Convert to utf-16.
   SmallVector<wchar_t, 128> wide_from;
@@ -455,13 +534,16 @@ static std::error_code getStatus(HANDLE FileHandle, file_status &Result) {
     file_type Type = (Info.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)
                          ? file_type::directory_file
                          : file_type::regular_file;
-    Result =
-        file_status(Type, Info.ftLastAccessTime.dwHighDateTime,
-                    Info.ftLastAccessTime.dwLowDateTime,
-                    Info.ftLastWriteTime.dwHighDateTime,
-                    Info.ftLastWriteTime.dwLowDateTime,
-                    Info.dwVolumeSerialNumber, Info.nFileSizeHigh,
-                    Info.nFileSizeLow, Info.nFileIndexHigh, Info.nFileIndexLow);
+    perms Permissions = (Info.dwFileAttributes & FILE_ATTRIBUTE_READONLY)
+                            ? (all_read | all_exe)
+                            : all_all;
+    Result = file_status(
+        Type, Permissions, Info.nNumberOfLinks,
+        Info.ftLastAccessTime.dwHighDateTime,
+        Info.ftLastAccessTime.dwLowDateTime,
+        Info.ftLastWriteTime.dwHighDateTime, Info.ftLastWriteTime.dwLowDateTime,
+        Info.dwVolumeSerialNumber, Info.nFileSizeHigh, Info.nFileSizeLow,
+        Info.nFileIndexHigh, Info.nFileIndexLow);
     return std::error_code();
   }
 
@@ -477,7 +559,7 @@ handle_status_error:
   return mapWindowsError(LastError);
 }
 
-std::error_code status(const Twine &path, file_status &result) {
+std::error_code status(const Twine &path, file_status &result, bool Follow) {
   SmallString<128> path_storage;
   SmallVector<wchar_t, 128> path_utf16;
 
@@ -494,28 +576,19 @@ std::error_code status(const Twine &path, file_status &result) {
   if (attr == INVALID_FILE_ATTRIBUTES)
     return getStatus(INVALID_HANDLE_VALUE, result);
 
+  DWORD Flags = FILE_FLAG_BACKUP_SEMANTICS;
   // Handle reparse points.
-  if (attr & FILE_ATTRIBUTE_REPARSE_POINT) {
-    ScopedFileHandle h(
-      ::CreateFileW(path_utf16.begin(),
-                    0, // Attributes only.
-                    FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
-                    NULL,
-                    OPEN_EXISTING,
-                    FILE_FLAG_BACKUP_SEMANTICS,
-                    0));
-    if (!h)
-      return getStatus(INVALID_HANDLE_VALUE, result);
-  }
+  if (!Follow && (attr & FILE_ATTRIBUTE_REPARSE_POINT))
+    Flags |= FILE_FLAG_OPEN_REPARSE_POINT;
 
   ScopedFileHandle h(
       ::CreateFileW(path_utf16.begin(), 0, // Attributes only.
                     FILE_SHARE_DELETE | FILE_SHARE_READ | FILE_SHARE_WRITE,
-                    NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, 0));
-    if (!h)
-      return getStatus(INVALID_HANDLE_VALUE, result);
+                    NULL, OPEN_EXISTING, Flags, 0));
+  if (!h)
+    return getStatus(INVALID_HANDLE_VALUE, result);
 
-    return getStatus(h, result);
+  return getStatus(h, result);
 }
 
 std::error_code status(int FD, file_status &Result) {
@@ -523,6 +596,37 @@ std::error_code status(int FD, file_status &Result) {
   return getStatus(FileHandle, Result);
 }
 
+// Sets or clears the read-only attribute of Path according to Permissions.
+std::error_code setPermissions(const Twine &Path, perms Permissions) {
+  SmallVector<wchar_t, 128> PathUTF16;
+  if (std::error_code EC = widenPath(Path, PathUTF16))
+    return EC;
+
+  const DWORD OldAttributes = ::GetFileAttributesW(PathUTF16.begin());
+  if (OldAttributes == INVALID_FILE_ATTRIBUTES)
+    return mapWindowsError(GetLastError());
+
+  // Only the read-only bit models permissions here.  Every unrelated
+  // attribute (e.g. FILE_ATTRIBUTE_HIDDEN) has to survive the update.
+  DWORD NewAttributes;
+  if (Permissions & all_write) {
+    NewAttributes = OldAttributes & ~FILE_ATTRIBUTE_READONLY;
+    // FILE_ATTRIBUTE_NORMAL stands in for "no attributes set".
+    if (NewAttributes == 0)
+      NewAttributes = FILE_ATTRIBUTE_NORMAL;
+  } else {
+    // FILE_ATTRIBUTE_NORMAL cannot be combined with other attributes, so it
+    // is dropped while the read-only bit is set.
+    NewAttributes =
+        (OldAttributes | FILE_ATTRIBUTE_READONLY) & ~FILE_ATTRIBUTE_NORMAL;
+  }
+
+  if (::SetFileAttributesW(PathUTF16.begin(), NewAttributes))
+    return std::error_code();
+
+  return mapWindowsError(GetLastError());
+}
+
 std::error_code setLastModificationAndAccessTime(int FD, TimePoint<> Time) {
   FILETIME FT = toFILETIME(Time);
   HANDLE FileHandle = reinterpret_cast<HANDLE>(_get_osfhandle(FD));
@@ -628,7 +732,8 @@ int mapped_file_region::alignment() {
 }
 
 std::error_code detail::directory_iterator_construct(detail::DirIterState &it,
-                                                StringRef path){
+                                                     StringRef path,
+                                                     bool follow_symlinks) {
   SmallVector<wchar_t, 128> path_utf16;
 
   if (std::error_code ec = widenPath(path, path_utf16))
@@ -673,7 +778,7 @@ std::error_code detail::directory_iterator_construct(detail::DirIterState &it,
   it.IterationHandle = intptr_t(FindHandle.take());
   SmallString<128> directory_entry_path(path);
   path::append(directory_entry_path, directory_entry_name_utf8);
-  it.CurrentEntry = directory_entry(directory_entry_path);
+  it.CurrentEntry = directory_entry(directory_entry_path, follow_symlinks);
 
   return std::error_code();
 }
@@ -713,6 +818,52 @@ std::error_code detail::directory_iterator_increment(detail::DirIterState &it) {
   return std::error_code();
 }
 
+// Writes the final path of the file open on H into RealPath as UTF-8, dropping
+// a leading "\\?\" prefix.  RealPath is cleared first.
+static std::error_code realPathFromHandle(HANDLE H,
+                                          SmallVectorImpl<char> &RealPath) {
+  RealPath.clear();
+  llvm::SmallVector<wchar_t, MAX_PATH> Buffer;
+  DWORD CountChars = ::GetFinalPathNameByHandleW(
+      H, Buffer.begin(), Buffer.capacity(), FILE_NAME_NORMALIZED);
+  if (CountChars >= Buffer.capacity()) {
+    // The buffer wasn't big enough, try again.  In this case the return value
+    // *does* indicate the size of the null terminator.
+    Buffer.reserve(CountChars);
+    CountChars = ::GetFinalPathNameByHandleW(
+        H, Buffer.data(), Buffer.capacity(), FILE_NAME_NORMALIZED);
+  }
+  // Zero means failure; a count that still does not fit means no path was
+  // written, so do not read the buffer.
+  if (CountChars == 0 || CountChars >= Buffer.capacity())
+    return mapWindowsError(GetLastError());
+  const wchar_t *Data = Buffer.data();
+  if (CountChars >= 4 && 0 == ::memcmp(Data, L"\\\\?\\", 8)) {
+    CountChars -= 4;
+    Data += 4;
+  }
+  // Convert the result from UTF-16 to UTF-8.
+  return UTF16ToUTF8(Data, CountChars, RealPath);
+}
+
+static std::error_code directoryRealPath(const Twine &Name,
+                                         SmallVectorImpl<char> &RealPath) {
+  SmallVector<wchar_t, 128> PathUTF16;
+
+  if (std::error_code EC = widenPath(Name, PathUTF16))
+    return EC;
+
+  HANDLE H =
+      ::CreateFileW(PathUTF16.begin(), GENERIC_READ,
+                    FILE_SHARE_READ | FILE_SHARE_WRITE | FILE_SHARE_DELETE,
+                    NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL);
+  if (H == INVALID_HANDLE_VALUE)
+    return mapWindowsError(GetLastError());
+  std::error_code EC = realPathFromHandle(H, RealPath);
+  ::CloseHandle(H);
+  return EC;
+}
+
 std::error_code openFileForRead(const Twine &Name, int &ResultFD,
                                 SmallVectorImpl<char> *RealPath) {
   SmallVector<wchar_t, 128> PathUTF16;
@@ -744,20 +895,8 @@ std::error_code openFileForRead(const Twine &Name, int &ResultFD,
   }
 
   // Fetch the real name of the file, if the user asked
-  if (RealPath) {
-    RealPath->clear();
-    wchar_t RealPathUTF16[MAX_PATH];
-    DWORD CountChars =
-      ::GetFinalPathNameByHandleW(H, RealPathUTF16, MAX_PATH,
-                                  FILE_NAME_NORMALIZED);
-    if (CountChars > 0 && CountChars < MAX_PATH) {
-      // Convert the result from UTF-16 to UTF-8.
-      SmallString<MAX_PATH> RealPathUTF8;
-      if (!UTF16ToUTF8(RealPathUTF16, CountChars, RealPathUTF8))
-        RealPath->append(RealPathUTF8.data(),
-                         RealPathUTF8.data() + strlen(RealPathUTF8.data()));
-    }
-  }
+  if (RealPath)
+    realPathFromHandle(H, *RealPath);
 
   ResultFD = FD;
   return std::error_code();
@@ -855,6 +994,81 @@ std::error_code getPathFromOpenFD(int FD, SmallVectorImpl<char> &ResultPath) {
 
   return windows::UTF16ToUTF8(TempPath.data(), CharCount, ResultPath);
 }
+
+// Deletes path via the shell; IgnoreErrors turns any failure into success.
+std::error_code remove_directories(const Twine &path, bool IgnoreErrors) {
+  // Convert to utf-16.  Without a valid wide path there is nothing safe to
+  // hand to the shell, so stop here even when errors are being ignored.
+  SmallVector<wchar_t, 128> Path16;
+  if (std::error_code EC = widenPath(path, Path16))
+    return IgnoreErrors ? std::error_code() : EC;
+
+  // SHFileOperation() accepts a list of paths, and so must be double null-
+  // terminated to indicate the end of the list.  The buffer is already null
+  // terminated, but since that null character is not considered part of the
+  // vector's size, pushing another one will just consume that byte.  So we
+  // need to push 2 null terminators.
+  Path16.push_back(0);
+  Path16.push_back(0);
+
+  SHFILEOPSTRUCTW shfos = {};
+  shfos.wFunc = FO_DELETE;
+  shfos.pFrom = Path16.data();
+  shfos.fFlags = FOF_NO_UI;
+  int result = ::SHFileOperationW(&shfos);
+  if (result != 0 && !IgnoreErrors)
+    return mapWindowsError(result);
+  return std::error_code();
+}
+
+// Replaces a leading bare '~' in Path with the home directory, in place.
+// Anything else, including "~name" forms, is left untouched.
+static void expandTildeExpr(SmallVectorImpl<char> &Path) {
+  const bool StartsWithTilde = !Path.empty() && Path[0] == '~';
+  if (!StartsWithTilde)
+    return;
+
+  // Whatever sits between the tilde and the first separator would be a user
+  // name; "~username/" expressions are not supported on Windows.
+  StringRef AfterTilde = StringRef(Path.begin(), Path.size()).drop_front();
+  StringRef User =
+      AfterTilde.take_until([](char C) { return path::is_separator(C); });
+  if (!User.empty())
+    return;
+
+  // Give up quietly when the home directory cannot be determined.
+  SmallString<128> Home;
+  if (!path::home_directory(Home))
+    return;
+
+  // The tilde's slot takes the first character; the remainder goes after it.
+  Path[0] = Home[0];
+  Path.insert(Path.begin() + 1, Home.begin() + 1, Home.end());
+}
+
+std::error_code real_path(const Twine &path, SmallVectorImpl<char> &dest,
+                          bool expand_tilde) {
+  dest.clear();
+  if (path.isTriviallyEmpty())
+    return std::error_code();
+
+  if (expand_tilde) {
+    SmallString<128> Storage;
+    path.toVector(Storage);
+    expandTildeExpr(Storage);
+    return real_path(Storage, dest, false);
+  }
+
+  if (is_directory(path))
+    return directoryRealPath(path, dest);
+
+  int fd;
+  if (std::error_code EC = llvm::sys::fs::openFileForRead(path, fd, &dest))
+    return EC;
+  ::close(fd);
+  return std::error_code();
+}
+
 } // end namespace fs
 
 namespace path {
diff --git a/lib/Support/Windows/Process.inc b/lib/Support/Windows/Process.inc
index 8d646b3217a0363ca0053946a01e90b4bbcb1ba9..18aef610d54af476a054109ac78a4de4f6cbeb3d 100644
--- a/lib/Support/Windows/Process.inc
+++ b/lib/Support/Windows/Process.inc
@@ -47,7 +47,6 @@
 #endif
 
 using namespace llvm;
-using namespace sys;
 
 // This function retrieves the page size using GetNativeSystemInfo() and is
 // present solely so it can be called once to initialize the self_process member
diff --git a/lib/Support/Windows/Program.inc b/lib/Support/Windows/Program.inc
index 78fc538bd9bfbf5466de40c0959f081747c90249..721167da5b151259da060e205f77fab4177806f0 100644
--- a/lib/Support/Windows/Program.inc
+++ b/lib/Support/Windows/Program.inc
@@ -29,7 +29,6 @@
 //===----------------------------------------------------------------------===//
 
 namespace llvm {
-using namespace sys;
 
 ProcessInfo::ProcessInfo() : ProcessHandle(0), Pid(0), ReturnCode(0) {}
 
diff --git a/lib/Support/Windows/RWMutex.inc b/lib/Support/Windows/RWMutex.inc
index 2d1d25f67b8aa646a009a1efa7588b46128e36a2..ac60c2fc05be8896841feb1f37a080890e4e8704 100644
--- a/lib/Support/Windows/RWMutex.inc
+++ b/lib/Support/Windows/RWMutex.inc
@@ -19,7 +19,6 @@
 #include "WindowsSupport.h"
 
 namespace llvm {
-using namespace sys;
 
 // Windows has slim read-writer lock support on Vista and higher, so we
 // will attempt to load the APIs.  If they exist, we will use them, and
@@ -73,7 +72,7 @@ static bool loadSRW() {
   return sHasSRW;
 }
 
-RWMutexImpl::RWMutexImpl() {
+sys::RWMutexImpl::RWMutexImpl() {
   if (loadSRW()) {
     data_ = calloc(1, sizeof(SRWLOCK));
     fpInitializeSRWLock(static_cast<PSRWLOCK>(data_));
@@ -83,14 +82,14 @@ RWMutexImpl::RWMutexImpl() {
   }
 }
 
-RWMutexImpl::~RWMutexImpl() {
+sys::RWMutexImpl::~RWMutexImpl() {
   if (!sHasSRW)
     DeleteCriticalSection(static_cast<LPCRITICAL_SECTION>(data_));
   // Nothing to do in the case of slim reader/writers except free the memory.
   free(data_);
 }
 
-bool RWMutexImpl::reader_acquire() {
+bool sys::RWMutexImpl::reader_acquire() {
   if (sHasSRW) {
     fpAcquireSRWLockShared(static_cast<PSRWLOCK>(data_));
   } else {
@@ -99,7 +98,7 @@ bool RWMutexImpl::reader_acquire() {
   return true;
 }
 
-bool RWMutexImpl::reader_release() {
+bool sys::RWMutexImpl::reader_release() {
   if (sHasSRW) {
     fpReleaseSRWLockShared(static_cast<PSRWLOCK>(data_));
   } else {
@@ -108,7 +107,7 @@ bool RWMutexImpl::reader_release() {
   return true;
 }
 
-bool RWMutexImpl::writer_acquire() {
+bool sys::RWMutexImpl::writer_acquire() {
   if (sHasSRW) {
     fpAcquireSRWLockExclusive(static_cast<PSRWLOCK>(data_));
   } else {
@@ -117,7 +116,7 @@ bool RWMutexImpl::writer_acquire() {
   return true;
 }
 
-bool RWMutexImpl::writer_release() {
+bool sys::RWMutexImpl::writer_release() {
   if (sHasSRW) {
     fpReleaseSRWLockExclusive(static_cast<PSRWLOCK>(data_));
   } else {
diff --git a/lib/Support/Windows/Signals.inc b/lib/Support/Windows/Signals.inc
index f739421eece463bba9ed34ec150e2d588dc12ad1..1ef51888baf39eaa5cee02b9bb7fac779db89e6e 100644
--- a/lib/Support/Windows/Signals.inc
+++ b/lib/Support/Windows/Signals.inc
@@ -776,7 +776,7 @@ static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep) {
   // the nasty sorts of crashes that aren't 100% reproducible from a set of
   // inputs (or in the event that the user is unable or unwilling to provide a
   // reproducible case).
-  if (!llvm::Process::AreCoreFilesPrevented()) {
+  if (!llvm::sys::Process::AreCoreFilesPrevented()) {
     MINIDUMP_EXCEPTION_INFORMATION ExceptionInfo;
     ExceptionInfo.ThreadId = ::GetCurrentThreadId();
     ExceptionInfo.ExceptionPointers = ep;
diff --git a/lib/Support/Windows/ThreadLocal.inc b/lib/Support/Windows/ThreadLocal.inc
index b9cb8ff9836ecf4c87b1bed5745328bfa5d3ac7d..8be1c3ecfbb908baf27fb4d221d47f602114e94d 100644
--- a/lib/Support/Windows/ThreadLocal.inc
+++ b/lib/Support/Windows/ThreadLocal.inc
@@ -20,33 +20,32 @@
 #include "llvm/Support/ThreadLocal.h"
 
 namespace llvm {
-using namespace sys;
 
-ThreadLocalImpl::ThreadLocalImpl() : data() {
+sys::ThreadLocalImpl::ThreadLocalImpl() : data() {
   static_assert(sizeof(DWORD) <= sizeof(data), "size too big");
   DWORD* tls = reinterpret_cast<DWORD*>(&data);
   *tls = TlsAlloc();
   assert(*tls != TLS_OUT_OF_INDEXES);
 }
 
-ThreadLocalImpl::~ThreadLocalImpl() {
+sys::ThreadLocalImpl::~ThreadLocalImpl() {
   DWORD* tls = reinterpret_cast<DWORD*>(&data);
   TlsFree(*tls);
 }
 
-void *ThreadLocalImpl::getInstance() {
+void *sys::ThreadLocalImpl::getInstance() {
   DWORD* tls = reinterpret_cast<DWORD*>(&data);
   return TlsGetValue(*tls);
 }
 
-void ThreadLocalImpl::setInstance(const void* d){
+void sys::ThreadLocalImpl::setInstance(const void* d){
   DWORD* tls = reinterpret_cast<DWORD*>(&data);
   int errorcode = TlsSetValue(*tls, const_cast<void*>(d));
   assert(errorcode != 0);
   (void)errorcode;
 }
 
-void ThreadLocalImpl::removeInstance() {
+void sys::ThreadLocalImpl::removeInstance() {
   setInstance(0);
 }
 
diff --git a/lib/Support/Windows/Threading.inc b/lib/Support/Windows/Threading.inc
new file mode 100644
index 0000000000000000000000000000000000000000..decb48887af22a7a08f2495c3713766315d4098a
--- /dev/null
+++ b/lib/Support/Windows/Threading.inc
@@ -0,0 +1,109 @@
+//===- Windows/Threading.inc - Win32 Threading Implementation - -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file provides the Win32 specific implementation of Threading functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
+
+#include "Windows/WindowsSupport.h"
+#include <process.h>
+
+// Windows will at times define MemoryFence.
+#ifdef MemoryFence
+#undef MemoryFence
+#endif
+
+namespace {
+  struct ThreadInfo {
+    void(*func)(void*);
+    void *param;
+  };
+}
+
+static unsigned __stdcall ThreadCallback(void *param) {
+  struct ThreadInfo *info = reinterpret_cast<struct ThreadInfo *>(param);
+  info->func(info->param);
+
+  return 0;
+}
+
+void llvm::llvm_execute_on_thread(void(*Fn)(void*), void *UserData,
+  unsigned RequestedStackSize) {
+  struct ThreadInfo param = { Fn, UserData };
+
+  HANDLE hThread = (HANDLE)::_beginthreadex(NULL,
+    RequestedStackSize, ThreadCallback,
+    &param, 0, NULL);
+
+  if (hThread) {
+    // We actually don't care whether the wait succeeds or fails, in
+    // the same way we don't care whether the pthread_join call succeeds
+    // or fails.  There's not much we could do if this were to fail. But
+    // on success, this call will wait until the thread finishes executing
+    // before returning.
+    (void)::WaitForSingleObject(hThread, INFINITE);
+    ::CloseHandle(hThread);
+  }
+}
+
+uint64_t llvm::get_threadid() {
+  return uint64_t(::GetCurrentThreadId());
+}
+
+uint32_t llvm::get_max_thread_name_length() { return 0; }
+
+#if defined(_MSC_VER)
+static void SetThreadName(DWORD Id, LPCSTR Name) {
+  constexpr DWORD MS_VC_EXCEPTION = 0x406D1388;
+
+#pragma pack(push, 8)
+  struct THREADNAME_INFO {
+    DWORD dwType;     // Must be 0x1000.
+    LPCSTR szName;    // Pointer to thread name
+    DWORD dwThreadId; // Thread ID (-1 == current thread)
+    DWORD dwFlags;    // Reserved.  Do not use.
+  };
+#pragma pack(pop)
+
+  THREADNAME_INFO info;
+  info.dwType = 0x1000;
+  info.szName = Name;
+  info.dwThreadId = Id;
+  info.dwFlags = 0;
+
+  __try {
+    ::RaiseException(MS_VC_EXCEPTION, 0, sizeof(info) / sizeof(ULONG_PTR),
+      (ULONG_PTR *)&info);
+  }
+  __except (EXCEPTION_EXECUTE_HANDLER) {
+  }
+}
+#endif
+
+void llvm::set_thread_name(const Twine &Name) {
+#if defined(_MSC_VER)
+  // Make sure the input is null terminated.
+  SmallString<64> Storage;
+  StringRef NameStr = Name.toNullTerminatedStringRef(Storage);
+  SetThreadName(::GetCurrentThreadId(), NameStr.data());
+#endif
+}
+
+void llvm::get_thread_name(SmallVectorImpl<char> &Name) {
+  // "Name" is not an inherent property of a thread on Windows.  In fact, when
+  // you "set" the name, you are only firing a one-time message to a debugger
+  // which it interprets as a program setting its threads' name.  We may be
+  // able to get fancy by creating a TLS entry when someone calls
+  // set_thread_name so that subsequent calls to get_thread_name return this
+  // value.
+  Name.clear();
+}
diff --git a/lib/Support/YAMLTraits.cpp b/lib/Support/YAMLTraits.cpp
index 9849b3aa1ce991f5ebce80ec918f27f8696de837..c410b1d5608605cdc67aa204b7f935e6c89478bd 100644
--- a/lib/Support/YAMLTraits.cpp
+++ b/lib/Support/YAMLTraits.cpp
@@ -398,17 +398,10 @@ bool Input::canElideEmptySequence() {
 //===----------------------------------------------------------------------===//
 
 Output::Output(raw_ostream &yout, void *context, int WrapColumn)
-    : IO(context),
-      Out(yout),
-      WrapColumn(WrapColumn),
-      Column(0),
-      ColumnAtFlowStart(0),
-      ColumnAtMapFlowStart(0),
-      NeedBitValueComma(false),
-      NeedFlowSequenceComma(false),
-      EnumerationMatchFound(false),
-      NeedsNewLine(false) {
-}
+    : IO(context), Out(yout), WrapColumn(WrapColumn), Column(0),
+      ColumnAtFlowStart(0), ColumnAtMapFlowStart(0), NeedBitValueComma(false),
+      NeedFlowSequenceComma(false), EnumerationMatchFound(false),
+      NeedsNewLine(false), WriteDefaultValues(false) {}
 
 Output::~Output() {
 }
@@ -462,7 +455,7 @@ std::vector<StringRef> Output::keys() {
 bool Output::preflightKey(const char *Key, bool Required, bool SameAsDefault,
                           bool &UseDefault, void *&) {
   UseDefault = false;
-  if (Required || !SameAsDefault) {
+  if (Required || !SameAsDefault || WriteDefaultValues) {
     auto State = StateStack.back();
     if (State == inFlowMapFirstKey || State == inFlowMapOtherKey) {
       flowKey(Key);
diff --git a/lib/Support/raw_ostream.cpp b/lib/Support/raw_ostream.cpp
index d073802db932095b308a89707a1157bf77b7fa5f..1abc8ed8683d50cc2a0e87ddc8d10a668d6d6cf4 100644
--- a/lib/Support/raw_ostream.cpp
+++ b/lib/Support/raw_ostream.cpp
@@ -465,8 +465,7 @@ void format_object_base::home() {
 static int getFD(StringRef Filename, std::error_code &EC,
                  sys::fs::OpenFlags Flags) {
   // Handle "-" as stdout. Note that when we do this, we consider ourself
-  // the owner of stdout. This means that we can do things like close the
-  // file descriptor when we're done and set the "binary" flag globally.
+  // the owner of stdout and may set the "binary" flag globally based on Flags.
   if (Filename == "-") {
     EC = std::error_code();
     // If user requested binary then put stdout into binary mode if
@@ -497,6 +496,13 @@ raw_fd_ostream::raw_fd_ostream(int fd, bool shouldClose, bool unbuffered)
     ShouldClose = false;
     return;
   }
+  // We do not want to close STDOUT as there may have been several uses of it
+  // such as the case: llc %s -o=- -pass-remarks-output=- -filetype=asm
+  // which cause multiple closes of STDOUT_FILENO and/or use-after-close of it.
+  // Using dup() in getFD doesn't work as we end up with original STDOUT_FILENO
+  // open anyhow.
+  if (FD <= STDERR_FILENO)
+    ShouldClose = false;
 
   // Get the starting position.
   off_t loc = ::lseek(FD, 0, SEEK_CUR);
diff --git a/lib/Target/AArch64/AArch64.h b/lib/Target/AArch64/AArch64.h
index fd106a8d9b0bcecaf973d7d7fc1874eeb431a8d6..b44b13e36e15d3f6336f585a6703c06cc0e8c280 100644
--- a/lib/Target/AArch64/AArch64.h
+++ b/lib/Target/AArch64/AArch64.h
@@ -22,8 +22,11 @@
 
 namespace llvm {
 
+class AArch64RegisterBankInfo;
+class AArch64Subtarget;
 class AArch64TargetMachine;
 class FunctionPass;
+class InstructionSelector;
 class MachineFunctionPass;
 
 FunctionPass *createAArch64DeadRegisterDefinitions();
@@ -45,6 +48,9 @@ FunctionPass *createAArch64A53Fix835769();
 FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
 
 FunctionPass *createAArch64CollectLOHPass();
+InstructionSelector *
+createAArch64InstructionSelector(const AArch64TargetMachine &,
+                                 AArch64Subtarget &, AArch64RegisterBankInfo &);
 
 void initializeAArch64A53Fix835769Pass(PassRegistry&);
 void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td
index cb6d8fc96688da34db7196789558115a6153d6d0..519ca28946830be06dc55f6eaff6acdcee3c6e85 100644
--- a/lib/Target/AArch64/AArch64.td
+++ b/lib/Target/AArch64/AArch64.td
@@ -27,7 +27,7 @@ def FeatureNEON : SubtargetFeature<"neon", "HasNEON", "true",
   "Enable Advanced SIMD instructions", [FeatureFPARMv8]>;
 
 def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
-  "Enable cryptographic instructions">;
+  "Enable cryptographic instructions", [FeatureNEON]>;
 
 def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true",
   "Enable ARMv8 CRC-32 checksum instructions">;
@@ -103,6 +103,14 @@ def FeatureArithmeticCbzFusion : SubtargetFeature<
     "arith-cbz-fusion", "HasArithmeticCbzFusion", "true",
     "CPU fuses arithmetic + cbz/cbnz operations">;
 
+def FeatureFuseAES : SubtargetFeature<
+    "fuse-aes", "HasFuseAES", "true",
+    "CPU fuses AES crypto operations">;
+
+def FeatureFuseLiterals : SubtargetFeature<
+    "fuse-literals", "HasFuseLiterals", "true",
+    "CPU fuses literal generation operations">;
+
 def FeatureDisableLatencySchedHeuristic : SubtargetFeature<
     "disable-latency-sched-heuristic", "DisableLatencySchedHeuristic", "true",
     "Disable latency scheduling heuristic">;
@@ -111,6 +119,16 @@ def FeatureUseRSqrt : SubtargetFeature<
     "use-reciprocal-square-root", "UseRSqrt", "true",
     "Use the reciprocal square root approximation">;
 
+def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
+                                        "NegativeImmediates", "false",
+                                        "Convert immediates and instructions "
+                                        "to their negated or complemented "
+                                        "equivalent when the immediate does "
+                                        "not fit in the encoding.">;
+
+def FeatureLSLFast : SubtargetFeature<
+    "lsl-fast", "HasLSLFast", "true",
+    "CPU has a fastpath logical shift of up to 3 places">;
 //===----------------------------------------------------------------------===//
 // Architectures.
 //
@@ -153,7 +171,8 @@ include "AArch64SchedCyclone.td"
 include "AArch64SchedFalkor.td"
 include "AArch64SchedKryo.td"
 include "AArch64SchedM1.td"
-include "AArch64SchedVulcan.td"
+include "AArch64SchedThunderX.td"
+include "AArch64SchedThunderX2T99.td"
 
 def ProcA35     : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
                                    "Cortex-A35 ARM processors", [
@@ -184,6 +203,8 @@ def ProcA57     : SubtargetFeature<"a57", "ARMProcFamily", "CortexA57",
                                    FeatureCrypto,
                                    FeatureCustomCheapAsMoveHandling,
                                    FeatureFPARMv8,
+                                   FeatureFuseAES,
+                                   FeatureFuseLiterals,
                                    FeatureNEON,
                                    FeaturePerfMon,
                                    FeaturePostRAScheduler,
@@ -230,6 +251,7 @@ def ProcExynosM1 : SubtargetFeature<"exynosm1", "ARMProcFamily", "ExynosM1",
                                      FeatureCrypto,
                                      FeatureCustomCheapAsMoveHandling,
                                      FeatureFPARMv8,
+                                     FeatureFuseAES,
                                      FeatureNEON,
                                      FeaturePerfMon,
                                      FeaturePostRAScheduler,
@@ -260,7 +282,8 @@ def ProcKryo    : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
                                    FeaturePerfMon,
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
-                                   FeatureZCZeroing
+                                   FeatureZCZeroing,
+                                   FeatureLSLFast
                                    ]>;
 
 def ProcFalkor  : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
@@ -274,19 +297,65 @@ def ProcFalkor  : SubtargetFeature<"falkor", "ARMProcFamily", "Falkor",
                                    FeaturePostRAScheduler,
                                    FeaturePredictableSelectIsExpensive,
                                    FeatureRDM,
-                                   FeatureZCZeroing
+                                   FeatureZCZeroing,
+                                   FeatureLSLFast
                                    ]>;
 
-def ProcVulcan  : SubtargetFeature<"vulcan", "ARMProcFamily", "Vulcan",
-                                   "Broadcom Vulcan processors", [
-                                   FeatureCRC,
-                                   FeatureCrypto,
-                                   FeatureFPARMv8,
-                                   FeatureArithmeticBccFusion,
-                                   FeatureNEON,
-                                   FeaturePostRAScheduler,
-                                   FeaturePredictableSelectIsExpensive,
-                                   HasV8_1aOps]>;
+def ProcThunderX2T99  : SubtargetFeature<"thunderx2t99", "ARMProcFamily",
+                                         "ThunderX2T99",
+                                         "Cavium ThunderX2 processors", [
+                                          FeatureCRC,
+                                          FeatureCrypto,
+                                          FeatureFPARMv8,
+                                          FeatureArithmeticBccFusion,
+                                          FeatureNEON,
+                                          FeaturePostRAScheduler,
+                                          FeaturePredictableSelectIsExpensive,
+                                          FeatureLSE,
+                                          HasV8_1aOps]>;
+
+def ProcThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX",
+                                    "Cavium ThunderX processors", [
+                                    FeatureCRC,
+                                    FeatureCrypto,
+                                    FeatureFPARMv8,
+                                    FeaturePerfMon,
+                                    FeaturePostRAScheduler,
+                                    FeaturePredictableSelectIsExpensive,
+                                    FeatureNEON]>;
+
+def ProcThunderXT88 : SubtargetFeature<"thunderxt88", "ARMProcFamily",
+                                       "ThunderXT88",
+                                       "Cavium ThunderX processors", [
+                                       FeatureCRC,
+                                       FeatureCrypto,
+                                       FeatureFPARMv8,
+                                       FeaturePerfMon,
+                                       FeaturePostRAScheduler,
+                                       FeaturePredictableSelectIsExpensive,
+                                       FeatureNEON]>;
+
+def ProcThunderXT81 : SubtargetFeature<"thunderxt81", "ARMProcFamily",
+                                       "ThunderXT81",
+                                       "Cavium ThunderX processors", [
+                                       FeatureCRC,
+                                       FeatureCrypto,
+                                       FeatureFPARMv8,
+                                       FeaturePerfMon,
+                                       FeaturePostRAScheduler,
+                                       FeaturePredictableSelectIsExpensive,
+                                       FeatureNEON]>;
+
+def ProcThunderXT83 : SubtargetFeature<"thunderxt83", "ARMProcFamily",
+                                       "ThunderXT83",
+                                       "Cavium ThunderX processors", [
+                                       FeatureCRC,
+                                       FeatureCrypto,
+                                       FeatureFPARMv8,
+                                       FeaturePerfMon,
+                                       FeaturePostRAScheduler,
+                                       FeaturePredictableSelectIsExpensive,
+                                       FeatureNEON]>;
 
 def : ProcessorModel<"generic", NoSchedModel, [
                      FeatureCRC,
@@ -309,7 +378,13 @@ def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM2]>;
 def : ProcessorModel<"exynos-m3", ExynosM1Model, [ProcExynosM2]>;
 def : ProcessorModel<"falkor", FalkorModel, [ProcFalkor]>;
 def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>;
-def : ProcessorModel<"vulcan", VulcanModel, [ProcVulcan]>;
+// Cavium ThunderX/ThunderX T8X processors.
+def : ProcessorModel<"thunderx", ThunderXT8XModel,  [ProcThunderX]>;
+def : ProcessorModel<"thunderxt88", ThunderXT8XModel,  [ProcThunderXT88]>;
+def : ProcessorModel<"thunderxt81", ThunderXT8XModel,  [ProcThunderXT81]>;
+def : ProcessorModel<"thunderxt83", ThunderXT8XModel,  [ProcThunderXT83]>;
+// Cavium ThunderX2T9X processors (formerly Broadcom Vulcan).
+def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>;
 
 //===----------------------------------------------------------------------===//
 // Assembly parser
diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp
index 4f5b2886b1a715dc8e3a0ca97c96aa7555bf5367..b2f55a7e1e09163351917b54da282ff191e1d7a5 100644
--- a/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/lib/Target/AArch64/AArch64CallLowering.cpp
@@ -1,4 +1,4 @@
-//===-- llvm/lib/Target/AArch64/AArch64CallLowering.cpp - Call lowering ---===//
+//===--- AArch64CallLowering.cpp - Call lowering --------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,15 +15,36 @@
 
 #include "AArch64CallLowering.h"
 #include "AArch64ISelLowering.h"
-
+#include "AArch64MachineFunctionInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
 #include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/LowLevelType.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/Argument.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Value.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <iterator>
+
 using namespace llvm;
 
 #ifndef LLVM_BUILD_GLOBAL_ISEL
@@ -31,13 +52,12 @@ using namespace llvm;
 #endif
 
 AArch64CallLowering::AArch64CallLowering(const AArch64TargetLowering &TLI)
-  : CallLowering(&TLI) {
-}
+  : CallLowering(&TLI) {}
 
 struct IncomingArgHandler : public CallLowering::ValueHandler {
   IncomingArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
                      CCAssignFn *AssignFn)
-      : ValueHandler(MIRBuilder, MRI, AssignFn) {}
+      : ValueHandler(MIRBuilder, MRI, AssignFn), StackUsed(0) {}
 
   unsigned getStackAddress(uint64_t Size, int64_t Offset,
                            MachinePointerInfo &MPO) override {
@@ -46,6 +66,7 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
     MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
     unsigned AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 64));
     MIRBuilder.buildFrameIndex(AddrReg, FI);
+    StackUsed = std::max(StackUsed, Size + Offset);
     return AddrReg;
   }
 
@@ -68,6 +89,8 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
   /// parameters (it's a basic-block live-in), and a call instruction
   /// (it's an implicit-def of the BL).
   virtual void markPhysRegUsed(unsigned PhysReg) = 0;
+
+  uint64_t StackUsed;
 };
 
 struct FormalArgHandler : public IncomingArgHandler {
@@ -113,7 +136,6 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
     MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
 
     MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
-    StackSize = std::max(StackSize, Size + Offset);
     return AddrReg;
   }
 
@@ -131,13 +153,18 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
     MIRBuilder.buildStore(ValVReg, Addr, *MMO);
   }
 
-  virtual bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
-                         CCValAssign::LocInfo LocInfo,
-                         const CallLowering::ArgInfo &Info,
-                         CCState &State) override {
+  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+                 CCValAssign::LocInfo LocInfo,
+                 const CallLowering::ArgInfo &Info,
+                 CCState &State) override {
+    bool Res;
     if (Info.IsFixed)
-      return AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
-    return  AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+      Res = AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+    else
+      Res = AssignFnVarArg(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State);
+
+    StackSize = State.getNextStackOffset();
+    return Res;
   }
 
   MachineInstrBuilder MIB;
@@ -169,19 +196,12 @@ void AArch64CallLowering::splitToValueTypes(
     // FIXME: set split flags if they're actually used (e.g. i128 on AAPCS).
     Type *SplitTy = SplitVT.getTypeForEVT(Ctx);
     SplitArgs.push_back(
-        ArgInfo{MRI.createGenericVirtualRegister(LLT{*SplitTy, DL}), SplitTy,
-                OrigArg.Flags, OrigArg.IsFixed});
+        ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*SplitTy, DL)),
+                SplitTy, OrigArg.Flags, OrigArg.IsFixed});
   }
 
-  SmallVector<uint64_t, 4> BitOffsets;
-  for (auto Offset : Offsets)
-    BitOffsets.push_back(Offset * 8);
-
-  SmallVector<unsigned, 8> SplitRegs;
-  for (auto I = &SplitArgs[FirstRegIdx]; I != SplitArgs.end(); ++I)
-    SplitRegs.push_back(I->Reg);
-
-  PerformArgSplit(SplitRegs, BitOffsets);
+  for (unsigned i = 0; i < Offsets.size(); ++i)
+    PerformArgSplit(SplitArgs[FirstRegIdx + i].Reg, Offsets[i] * 8);
 }
 
 bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
@@ -199,12 +219,12 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
     auto &DL = F.getParent()->getDataLayout();
 
     ArgInfo OrigArg{VReg, Val->getType()};
-    setArgFlags(OrigArg, AttributeSet::ReturnIndex, DL, F);
+    setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F);
 
     SmallVector<ArgInfo, 8> SplitArgs;
     splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
-                      [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
-                        MIRBuilder.buildExtract(Regs, Offsets, VReg);
+                      [&](unsigned Reg, uint64_t Offset) {
+                        MIRBuilder.buildExtract(Reg, VReg, Offset);
                       });
 
     OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFn, AssignFn);
@@ -218,7 +238,6 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
 bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
                                                const Function &F,
                                                ArrayRef<unsigned> VRegs) const {
-  auto &Args = F.getArgumentList();
   MachineFunction &MF = MIRBuilder.getMF();
   MachineBasicBlock &MBB = MIRBuilder.getMBB();
   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -226,13 +245,27 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
 
   SmallVector<ArgInfo, 8> SplitArgs;
   unsigned i = 0;
-  for (auto &Arg : Args) {
+  for (auto &Arg : F.args()) {
     ArgInfo OrigArg{VRegs[i], Arg.getType()};
     setArgFlags(OrigArg, i + 1, DL, F);
+    bool Split = false;
+    LLT Ty = MRI.getType(VRegs[i]);
+    unsigned Dst = VRegs[i];
+
     splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
-                      [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
-                        MIRBuilder.buildSequence(VRegs[i], Regs, Offsets);
+                      [&](unsigned Reg, uint64_t Offset) {
+                        if (!Split) {
+                          Split = true;
+                          Dst = MRI.createGenericVirtualRegister(Ty);
+                          MIRBuilder.buildUndef(Dst);
+                        }
+                        unsigned Tmp = MRI.createGenericVirtualRegister(Ty);
+                        MIRBuilder.buildInsert(Tmp, Dst, Reg, Offset);
+                        Dst = Tmp;
                       });
+
+    if (Dst != VRegs[i])
+      MIRBuilder.buildCopy(VRegs[i], Dst);
     ++i;
   }
 
@@ -247,6 +280,21 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
   if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
     return false;
 
+  if (F.isVarArg()) {
+    if (!MF.getSubtarget<AArch64Subtarget>().isTargetDarwin()) {
+      // FIXME: non-Darwin varargs need saveVarArgsRegisters reimplemented
+      // from AArch64ISelLowering; until then, report failure.
+      return false;
+    }
+
+    // We currently pass all varargs at 8-byte alignment.
+    uint64_t StackOffset = alignTo(Handler.StackUsed, 8);
+
+    auto &MFI = MIRBuilder.getMF().getFrameInfo();
+    AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+    FuncInfo->setVarArgsStackIndex(MFI.CreateFixedObject(4, StackOffset, true));
+  }
+
   // Move back to the end of the basic block.
   MIRBuilder.setMBB(MBB);
 
@@ -254,6 +302,7 @@ bool AArch64CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
 }
 
 bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+                                    CallingConv::ID CallConv,
                                     const MachineOperand &Callee,
                                     const ArgInfo &OrigRet,
                                     ArrayRef<ArgInfo> OrigArgs) const {
@@ -265,17 +314,17 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
   SmallVector<ArgInfo, 8> SplitArgs;
   for (auto &OrigArg : OrigArgs) {
     splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
-                      [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
-                        MIRBuilder.buildExtract(Regs, Offsets, OrigArg.Reg);
+                      [&](unsigned Reg, uint64_t Offset) {
+                        MIRBuilder.buildExtract(Reg, OrigArg.Reg, Offset);
                       });
   }
 
   // Find out which ABI gets to decide where things go.
   const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
   CCAssignFn *AssignFnFixed =
-      TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/false);
+      TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
   CCAssignFn *AssignFnVarArg =
-      TLI.CCAssignFnForCall(F.getCallingConv(), /*IsVarArg=*/true);
+      TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/true);
 
   auto CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);
 
@@ -318,11 +367,9 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
     SmallVector<uint64_t, 8> RegOffsets;
     SmallVector<unsigned, 8> SplitRegs;
     splitToValueTypes(OrigRet, SplitArgs, DL, MRI,
-                      [&](ArrayRef<unsigned> Regs, ArrayRef<uint64_t> Offsets) {
-                        std::copy(Offsets.begin(), Offsets.end(),
-                                  std::back_inserter(RegOffsets));
-                        std::copy(Regs.begin(), Regs.end(),
-                                  std::back_inserter(SplitRegs));
+                      [&](unsigned Reg, uint64_t Offset) {
+                        RegOffsets.push_back(Offset);
+                        SplitRegs.push_back(Reg);
                       });
 
     CallReturnHandler Handler(MIRBuilder, MRI, MIB, RetAssignFn);
diff --git a/lib/Target/AArch64/AArch64CallLowering.h b/lib/Target/AArch64/AArch64CallLowering.h
index d1453d0071f9cb12d1f482e6dab4ae07731d71b6..d96ce95c4de0b39d54b71a69110a33007ccbcf78 100644
--- a/lib/Target/AArch64/AArch64CallLowering.h
+++ b/lib/Target/AArch64/AArch64CallLowering.h
@@ -1,4 +1,4 @@
-//===-- llvm/lib/Target/AArch64/AArch64CallLowering.h - Call lowering -----===//
+//===--- AArch64CallLowering.h - Call lowering ------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,18 +12,20 @@
 ///
 //===----------------------------------------------------------------------===//
 
-#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING
-#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING
+#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H
+#define LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
-#include "llvm/CodeGen/ValueTypes.h"
+#include <cstdint>
+#include <functional>
 
 namespace llvm {
 
 class AArch64TargetLowering;
 
 class AArch64CallLowering: public CallLowering {
- public:
+public:
   AArch64CallLowering(const AArch64TargetLowering &TLI);
 
   bool lowerReturn(MachineIRBuilder &MIRBuiler, const Value *Val,
@@ -32,8 +34,8 @@ class AArch64CallLowering: public CallLowering {
   bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
                             ArrayRef<unsigned> VRegs) const override;
 
-  bool lowerCall(MachineIRBuilder &MIRBuilder, const MachineOperand &Callee,
-                 const ArgInfo &OrigRet,
+  bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
+                 const MachineOperand &Callee, const ArgInfo &OrigRet,
                  ArrayRef<ArgInfo> OrigArgs) const override;
 
 private:
@@ -44,13 +46,14 @@ private:
   typedef std::function<void(MachineIRBuilder &, int, CCValAssign &)>
       MemHandler;
 
-  typedef std::function<void(ArrayRef<unsigned>, ArrayRef<uint64_t>)>
-      SplitArgTy;
+  typedef std::function<void(unsigned, uint64_t)> SplitArgTy;
 
   void splitToValueTypes(const ArgInfo &OrigArgInfo,
                          SmallVectorImpl<ArgInfo> &SplitArgs,
                          const DataLayout &DL, MachineRegisterInfo &MRI,
                          const SplitArgTy &SplitArg) const;
 };
-} // End of namespace llvm;
-#endif
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AARCH64_AARCH64CALLLOWERING_H
diff --git a/lib/Target/AArch64/AArch64CallingConvention.td b/lib/Target/AArch64/AArch64CallingConvention.td
index 9058617768dd0205d1635fac0dc9493d1ca1d079..938779d23690dbaedadc61fe2b8533ea8f49bdef 100644
--- a/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/lib/Target/AArch64/AArch64CallingConvention.td
@@ -91,7 +91,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[
   CCIfType<[v2f32], CCBitConvertToType<v2i32>>,
   CCIfType<[v2f64, v4f32], CCBitConvertToType<v2i64>>,
 
-  CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X19], [W19]>>>,
+  CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X21], [W21]>>>,
 
   // Big endian vectors must be passed as if they were 1-element vectors so that
   // their lanes are in a consistent order.
@@ -138,8 +138,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[
   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i64], CCAssignToRegWithShadow<[X20], [W20]>>>,
 
-  // A SwiftError is passed in X19.
-  CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X19], [W19]>>>,
+  // A SwiftError is passed in X21.
+  CCIfSwiftError<CCIfType<[i64], CCAssignToRegWithShadow<[X21], [W21]>>>,
 
   CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
 
@@ -289,7 +289,7 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
 def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>;
 
 def CSR_AArch64_AAPCS_SwiftError
-    : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X19)>;
+    : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X21)>;
 
 // The function used by Darwin to obtain the address of a thread-local variable
 // guarantees more than a normal AAPCS function. x16 and x17 are used on the
diff --git a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 5b8a1e05d140c96570b456bbb3005803b456906c..d0c0956b87ca8fb66e6a2a79d043a525c144be55 100644
--- a/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -17,6 +17,7 @@
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "AArch64InstrInfo.h"
 #include "AArch64Subtarget.h"
+#include "Utils/AArch64BaseInfo.h"
 #include "llvm/CodeGen/LivePhysRegs.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -889,6 +890,18 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
     MI.eraseFromParent();
     return true;
   }
+  case AArch64::MOVbaseTLS: {
+    unsigned DstReg = MI.getOperand(0).getReg();
+    auto SysReg = AArch64SysReg::TPIDR_EL0;
+    MachineFunction *MF = MBB.getParent();
+    if (MF->getTarget().getTargetTriple().isOSFuchsia() &&
+        MF->getTarget().getCodeModel() == CodeModel::Kernel)
+      SysReg = AArch64SysReg::TPIDR_EL1;
+    BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::MRS), DstReg)
+        .addImm(SysReg);
+    MI.eraseFromParent();
+    return true;
+  }
 
   case AArch64::MOVi32imm:
     return expandMOVImm(MBB, MBBI, 32);
diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp
index 030bd9e28e549b37f278475b555954547406c83a..4e5e3e43a468798df05cc75bbc2d9eb948093ca7 100644
--- a/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/lib/Target/AArch64/AArch64FastISel.cpp
@@ -458,7 +458,7 @@ unsigned AArch64FastISel::materializeGV(const GlobalValue *GV) {
 
   // MachO still uses GOT for large code-model accesses, but ELF requires
   // movz/movk sequences, which FastISel doesn't handle yet.
-  if (TM.getCodeModel() != CodeModel::Small && !Subtarget->isTargetMachO())
+  if (!Subtarget->useSmallAddressing() && !Subtarget->isTargetMachO())
     return 0;
 
   unsigned char OpFlags = Subtarget->ClassifyGlobalReference(GV, TM);
@@ -3147,8 +3147,8 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
     return false;
 
   CodeModel::Model CM = TM.getCodeModel();
-  // Only support the small and large code model.
-  if (CM != CodeModel::Small && CM != CodeModel::Large)
+  // Only support the small-addressing and large code models.
+  if (CM != CodeModel::Large && !Subtarget->useSmallAddressing())
     return false;
 
   // FIXME: Add large code model support for ELF.
@@ -3199,7 +3199,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
 
   // Issue the call.
   MachineInstrBuilder MIB;
-  if (CM == CodeModel::Small) {
+  if (Subtarget->useSmallAddressing()) {
     const MCInstrDesc &II = TII.get(Addr.getReg() ? AArch64::BLR : AArch64::BL);
     MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II);
     if (Symbol)
@@ -3410,8 +3410,7 @@ bool AArch64FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
     MachineFrameInfo &MFI = FuncInfo.MF->getFrameInfo();
     MFI.setFrameAddressIsTaken(true);
 
-    const AArch64RegisterInfo *RegInfo =
-        static_cast<const AArch64RegisterInfo *>(Subtarget->getRegisterInfo());
+    const AArch64RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
     unsigned FramePtr = RegInfo->getFrameRegister(*(FuncInfo.MF));
     unsigned SrcReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
diff --git a/lib/Target/AArch64/AArch64FrameLowering.cpp b/lib/Target/AArch64/AArch64FrameLowering.cpp
index 5cce8f92db3afa7daba06d75972621235425fc37..550174b22a8960a9f693090e471dcce299cfe131 100644
--- a/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -266,14 +266,13 @@ static unsigned findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) {
   if (&MF->front() == MBB)
     return AArch64::X9;
 
-  const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
-  LivePhysRegs LiveRegs(&TRI);
+  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
+  const AArch64RegisterInfo *TRI = Subtarget.getRegisterInfo();
+  LivePhysRegs LiveRegs(TRI);
   LiveRegs.addLiveIns(*MBB);
 
   // Mark callee saved registers as used so we will not choose them.
-  const AArch64Subtarget &Subtarget = MF->getSubtarget<AArch64Subtarget>();
-  const AArch64RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
-  const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MF);
+  const MCPhysReg *CSRegs = TRI->getCalleeSavedRegs(MF);
   for (unsigned i = 0; CSRegs[i]; ++i)
     LiveRegs.addReg(CSRegs[i]);
 
@@ -883,7 +882,7 @@ static unsigned getPrologueDeath(MachineFunction &MF, unsigned Reg) {
 
 static bool produceCompactUnwindFrame(MachineFunction &MF) {
   const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
-  AttributeSet Attrs = MF.getFunction()->getAttributes();
+  AttributeList Attrs = MF.getFunction()->getAttributes();
   return Subtarget.isTargetMachO() &&
          !(Subtarget.getTargetLowering()->supportSwiftError() &&
            Attrs.hasAttrSomewhere(Attribute::SwiftError));
diff --git a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
index ea46a0a5da86634b7b38de95ddf69121cf6dd16f..8b1c9740d2adba748fc6bf4fcdb4b30d43438916 100644
--- a/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
+++ b/lib/Target/AArch64/AArch64GenRegisterBankInfo.def
@@ -37,57 +37,59 @@ RegisterBankInfo::PartialMapping AArch64GenRegisterBankInfo::PartMappings[]{
 // ValueMappings.
 RegisterBankInfo::ValueMapping AArch64GenRegisterBankInfo::ValMappings[]{
     /* BreakDown, NumBreakDowns */
+    // 0: invalid
+    {nullptr, 0},
     // 3-operands instructions (all binary operations should end up with one of
     // those mapping).
-    // 0: FPR 32-bit value. <-- This must match First3OpsIdx.
+    // 1: FPR 32-bit value. <-- This must match First3OpsIdx.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
-    // 3: FPR 64-bit value.
+    // 4: FPR 64-bit value.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
-    // 6: FPR 128-bit value.
+    // 7: FPR 128-bit value.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR128 - PMI_Min], 1},
-    // 9: FPR 256-bit value.
+    // 10: FPR 256-bit value.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR256 - PMI_Min], 1},
-    // 12: FPR 512-bit value.
+    // 13: FPR 512-bit value.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR512 - PMI_Min], 1},
-    // 15: GPR 32-bit value.
+    // 16: GPR 32-bit value.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
-    // 18: GPR 64-bit value. <-- This must match Last3OpsIdx.
+    // 19: GPR 64-bit value. <-- This must match Last3OpsIdx.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
     // Cross register bank copies.
-    // 21: FPR 32-bit value to GPR 32-bit value. <-- This must match
+    // 22: FPR 32-bit value to GPR 32-bit value. <-- This must match
     //                                               FirstCrossRegCpyIdx.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
-    // 23: FPR 64-bit value to GPR 64-bit value.
+    // 24: FPR 64-bit value to GPR 64-bit value.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
-    // 25: FPR 128-bit value to GPR 128-bit value (invalid)
+    // 26: FPR 128-bit value to GPR 128-bit value (invalid)
     {nullptr, 1},
     {nullptr, 1},
-    // 27: FPR 256-bit value to GPR 256-bit value (invalid)
+    // 28: FPR 256-bit value to GPR 256-bit value (invalid)
     {nullptr, 1},
     {nullptr, 1},
-    // 29: FPR 512-bit value to GPR 512-bit value (invalid)
+    // 30: FPR 512-bit value to GPR 512-bit value (invalid)
     {nullptr, 1},
     {nullptr, 1},
-    // 31: GPR 32-bit value to FPR 32-bit value.
+    // 32: GPR 32-bit value to FPR 32-bit value.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR32 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR32 - PMI_Min], 1},
-    // 33: GPR 64-bit value to FPR 64-bit value. <-- This must match
+    // 34: GPR 64-bit value to FPR 64-bit value. <-- This must match
     //                                               LastCrossRegCpyIdx.
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_GPR64 - PMI_Min], 1},
     {&AArch64GenRegisterBankInfo::PartMappings[PMI_FPR64 - PMI_Min], 1},
@@ -144,7 +146,7 @@ unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx,
       return 0;
     if (Size <= 64)
       return 1;
-    llvm_unreachable("Unexpected size");
+    return -1;
   }
   if (RBIdx == PMI_FirstFPR) {
     if (Size <= 32)
@@ -157,19 +159,22 @@ unsigned AArch64GenRegisterBankInfo::getRegBankBaseIdxOffset(unsigned RBIdx,
       return 3;
     if (Size <= 512)
       return 4;
-    llvm_unreachable("Unexpected size");
+    return -1;
   }
-  llvm_unreachable("Unexpected bank");
+  return -1;
 }
 
 const RegisterBankInfo::ValueMapping *
 AArch64GenRegisterBankInfo::getValueMapping(PartialMappingIdx RBIdx,
                                             unsigned Size) {
   assert(RBIdx != PartialMappingIdx::PMI_None && "No mapping needed for that");
-  unsigned ValMappingIdx = First3OpsIdx +
-                           (RBIdx - PartialMappingIdx::PMI_Min +
-                            getRegBankBaseIdxOffset(RBIdx, Size)) *
-                               ValueMappingIdx::DistanceBetweenRegBanks;
+  unsigned BaseIdxOffset = getRegBankBaseIdxOffset(RBIdx, Size);
+  if (BaseIdxOffset == -1u)
+    return &ValMappings[InvalidIdx];
+
+  unsigned ValMappingIdx =
+      First3OpsIdx + (RBIdx - PartialMappingIdx::PMI_Min + BaseIdxOffset) *
+                         ValueMappingIdx::DistanceBetweenRegBanks;
   assert(ValMappingIdx >= First3OpsIdx && ValMappingIdx <= Last3OpsIdx &&
          "Mapping out of bound");
 
diff --git a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index 3099383e5b323b33b9f6ba7d79fd962333d07382..ae01ea477bb9a0d4943851972f4c7f3dc59ee7e1 100644
--- a/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -328,11 +328,52 @@ static AArch64_AM::ShiftExtendType getShiftTypeForNode(SDValue N) {
   }
 }
 
+/// \brief Determine whether it is worth it to fold SHL into the addressing
+/// mode.
+static bool isWorthFoldingSHL(SDValue V) {
+  assert(V.getOpcode() == ISD::SHL && "invalid opcode");
+  // It is worth folding logical shift of up to three places.
+  auto *CSD = dyn_cast<ConstantSDNode>(V.getOperand(1));
+  if (!CSD)
+    return false;
+  unsigned ShiftVal = CSD->getZExtValue();
+  if (ShiftVal > 3)
+    return false;
+
+  // Check if this particular node is reused in any non-memory related
+  // operation.  If yes, do not try to fold this node into the address
+  // computation, since the computation will be kept.
+  const SDNode *Node = V.getNode();
+  for (SDNode *UI : Node->uses())
+    if (!isa<MemSDNode>(*UI))
+      for (SDNode *UII : UI->uses())
+        if (!isa<MemSDNode>(*UII))
+          return false;
+  return true;
+}
+
 /// \brief Determine whether it is worth to fold V into an extended register.
 bool AArch64DAGToDAGISel::isWorthFolding(SDValue V) const {
-  // it hurts if the value is used at least twice, unless we are optimizing
-  // for code size.
-  return ForCodeSize || V.hasOneUse();
+  // Trivial if we are optimizing for code size or if there is only
+  // one use of the value.
+  if (ForCodeSize || V.hasOneUse())
+    return true;
+  // If a subtarget has a fastpath LSL we can fold a logical shift into
+  // the addressing mode and save a cycle.
+  if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::SHL &&
+      isWorthFoldingSHL(V))
+    return true;
+  if (Subtarget->hasLSLFast() && V.getOpcode() == ISD::ADD) {
+    const SDValue LHS = V.getOperand(0);
+    const SDValue RHS = V.getOperand(1);
+    if (LHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(LHS))
+      return true;
+    if (RHS.getOpcode() == ISD::SHL && isWorthFoldingSHL(RHS))
+      return true;
+  }
+
+  // It hurts otherwise, since the value will be reused.
+  return false;
 }
 
 /// SelectShiftedRegister - Select a "shifted register" operand.  If the value
diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7b581a706fa223f4b3c904d40dfa45f51e1805c3..504cb5615b69bd04a13ece712d0fa31feae3677b 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -29,6 +29,7 @@
 #include "llvm/ADT/StringSwitch.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -554,8 +555,6 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
 
   setSchedulingPreference(Sched::Hybrid);
 
-  // Enable TBZ/TBNZ
-  MaskAndBranchFoldingIsLegal = true;
   EnableExtLdPromotion = true;
 
   // Set required alignment.
@@ -793,7 +792,7 @@ EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
 /// KnownZero/KnownOne bitsets.
 void AArch64TargetLowering::computeKnownBitsForTargetNode(
     const SDValue Op, APInt &KnownZero, APInt &KnownOne,
-    const SelectionDAG &DAG, unsigned Depth) const {
+    const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
   switch (Op.getOpcode()) {
   default:
     break;
@@ -2113,8 +2112,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
 
   Entry.Node = Arg;
   Entry.Ty = ArgTy;
-  Entry.isSExt = false;
-  Entry.isZExt = false;
+  Entry.IsSExt = false;
+  Entry.IsZExt = false;
   Args.push_back(Entry);
 
   const char *LibcallName =
@@ -2124,8 +2123,9 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
 
   StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr);
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
-    .setCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
+  CLI.setDebugLoc(dl)
+      .setChain(DAG.getEntryNode())
+      .setLibCallee(CallingConv::Fast, RetTy, Callee, std::move(Args));
 
   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
   return CallResult.first;
@@ -2231,19 +2231,13 @@ static SDValue skipExtensionForVectorMULL(SDNode *N, SelectionDAG &DAG) {
 }
 
 static bool isSignExtended(SDNode *N, SelectionDAG &DAG) {
-  if (N->getOpcode() == ISD::SIGN_EXTEND)
-    return true;
-  if (isExtendedBUILD_VECTOR(N, DAG, true))
-    return true;
-  return false;
+  return N->getOpcode() == ISD::SIGN_EXTEND ||
+         isExtendedBUILD_VECTOR(N, DAG, true);
 }
 
 static bool isZeroExtended(SDNode *N, SelectionDAG &DAG) {
-  if (N->getOpcode() == ISD::ZERO_EXTEND)
-    return true;
-  if (isExtendedBUILD_VECTOR(N, DAG, false))
-    return true;
-  return false;
+  return N->getOpcode() == ISD::ZERO_EXTEND ||
+         isExtendedBUILD_VECTOR(N, DAG, false);
 }
 
 static bool isAddSubSExt(SDNode *N, SelectionDAG &DAG) {
@@ -3155,7 +3149,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
     }
 
     if (VA.isRegLoc()) {
-      if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i64) {
+      if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
+          Outs[0].VT == MVT::i64) {
         assert(VA.getLocVT() == MVT::i64 &&
                "unexpected calling convention register assignment");
         assert(!Ins.empty() && Ins[0].VT == MVT::i64 &&
@@ -3577,7 +3572,7 @@ SDValue
 AArch64TargetLowering::LowerELFGlobalTLSAddress(SDValue Op,
                                                 SelectionDAG &DAG) const {
   assert(Subtarget->isTargetELF() && "This function expects an ELF target");
-  assert(getTargetMachine().getCodeModel() == CodeModel::Small &&
+  assert(Subtarget->useSmallAddressing() &&
          "ELF TLS only supported in small memory model");
   // Different choices can be made for the maximum size of the TLS area for a
   // module. For the small address model, the default TLS size is 16MiB and the
@@ -3678,7 +3673,7 @@ SDValue AArch64TargetLowering::LowerGlobalTLSAddress(SDValue Op,
                                                      SelectionDAG &DAG) const {
   if (Subtarget->isTargetDarwin())
     return LowerDarwinGlobalTLSAddress(Op, DAG);
-  else if (Subtarget->isTargetELF())
+  if (Subtarget->isTargetELF())
     return LowerELFGlobalTLSAddress(Op, DAG);
 
   llvm_unreachable("Unexpected platform trying to use TLS");
@@ -4515,7 +4510,12 @@ unsigned AArch64TargetLowering::getRegisterByName(const char* RegName, EVT VT,
                                                   SelectionDAG &DAG) const {
   unsigned Reg = StringSwitch<unsigned>(RegName)
                        .Case("sp", AArch64::SP)
+                       .Case("x18", AArch64::X18)
+                       .Case("w18", AArch64::W18)
                        .Default(0);
+  if ((Reg == AArch64::X18 || Reg == AArch64::W18) &&
+      !Subtarget->isX18Reserved())
+    Reg = 0;
   if (Reg)
     return Reg;
   report_fatal_error(Twine("Invalid register name \""
@@ -6592,19 +6592,13 @@ FailedModImm:
     SDValue Op0 = Op.getOperand(0);
     unsigned ElemSize = VT.getScalarSizeInBits();
     unsigned i = 0;
-    // For 32 and 64 bit types, use INSERT_SUBREG for lane zero to
+    // For 32 and 64 bit types, use SCALAR_TO_VECTOR for lane zero to
     // a) Avoid a RMW dependency on the full vector register, and
     // b) Allow the register coalescer to fold away the copy if the
-    //    value is already in an S or D register.
-    // Do not do this for UNDEF/LOAD nodes because we have better patterns
-    // for those avoiding the SCALAR_TO_VECTOR/BUILD_VECTOR.
-    if (!Op0.isUndef() && Op0.getOpcode() != ISD::LOAD &&
-        (ElemSize == 32 || ElemSize == 64)) {
-      unsigned SubIdx = ElemSize == 32 ? AArch64::ssub : AArch64::dsub;
-      MachineSDNode *N =
-          DAG.getMachineNode(TargetOpcode::INSERT_SUBREG, dl, VT, Vec, Op0,
-                             DAG.getTargetConstant(SubIdx, dl, MVT::i32));
-      Vec = SDValue(N, 0);
+    //    value is already in an S or D register, and we're forced to emit an
+    //    INSERT_SUBREG that we can't fold anywhere.
+    if (!Op0.isUndef() && (ElemSize == 32 || ElemSize == 64)) {
+      Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op0);
       ++i;
     }
     for (; i < NumElts; ++i) {
@@ -7248,6 +7242,33 @@ bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
   return NumBits == 32 || NumBits == 64;
 }
 
+/// A helper function for determining the number of interleaved accesses we
+/// will generate when lowering accesses of the given type.
+unsigned
+AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
+                                                 const DataLayout &DL) const {
+  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
+}
+
+bool AArch64TargetLowering::isLegalInterleavedAccessType(
+    VectorType *VecTy, const DataLayout &DL) const {
+
+  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
+  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+
+  // Ensure the number of vector elements is greater than 1.
+  if (VecTy->getNumElements() < 2)
+    return false;
+
+  // Ensure the element type is legal.
+  if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
+    return false;
+
+  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
+  // 128 will be split into multiple interleaved accesses.
+  return VecSize == 64 || VecSize % 128 == 0;
+}
+
 /// \brief Lower an interleaved load into a ldN intrinsic.
 ///
 /// E.g. Lower an interleaved load (Factor = 2):
@@ -7271,12 +7292,15 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
   const DataLayout &DL = LI->getModule()->getDataLayout();
 
   VectorType *VecTy = Shuffles[0]->getType();
-  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
 
-  // Skip if we do not have NEON and skip illegal vector types.
-  if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128))
+  // Skip if we do not have NEON and skip illegal vector types. We can
+  // "legalize" wide vector types into multiple interleaved accesses as long as
+  // the vector size in bits is a multiple of 128.
+  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
     return false;
 
+  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
+
   // A pointer vector can not be the return type of the ldN intrinsics. Need to
   // load integer vectors first and then convert to pointer vectors.
   Type *EltTy = VecTy->getVectorElementType();
@@ -7284,6 +7308,25 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
     VecTy =
         VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
 
+  IRBuilder<> Builder(LI);
+
+  // The base address of the load.
+  Value *BaseAddr = LI->getPointerOperand();
+
+  if (NumLoads > 1) {
+    // If we're going to generate more than one load, reset the sub-vector type
+    // to something legal.
+    VecTy = VectorType::get(VecTy->getVectorElementType(),
+                            VecTy->getVectorNumElements() / NumLoads);
+
+    // We will compute the pointer operand of each load from the original base
+    // address using GEPs. Cast the base address to a pointer to the scalar
+    // element type.
+    BaseAddr = Builder.CreateBitCast(
+        BaseAddr, VecTy->getVectorElementType()->getPointerTo(
+                      LI->getPointerAddressSpace()));
+  }
+
   Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
   Type *Tys[2] = {VecTy, PtrTy};
   static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
@@ -7292,39 +7335,49 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
   Function *LdNFunc =
       Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
 
-  IRBuilder<> Builder(LI);
-  Value *Ptr = Builder.CreateBitCast(LI->getPointerOperand(), PtrTy);
+  // Holds sub-vectors extracted from the load intrinsic return values. The
+  // sub-vectors are associated with the shufflevector instructions they will
+  // replace.
+  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
 
-  CallInst *LdN = Builder.CreateCall(LdNFunc, Ptr, "ldN");
+  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
 
-  // Replace uses of each shufflevector with the corresponding vector loaded
-  // by ldN.
-  for (unsigned i = 0; i < Shuffles.size(); i++) {
-    ShuffleVectorInst *SVI = Shuffles[i];
-    unsigned Index = Indices[i];
+    // If we're generating more than one load, compute the base address of
+    // subsequent loads as an offset from the previous.
+    if (LoadCount > 0)
+      BaseAddr = Builder.CreateConstGEP1_32(
+          BaseAddr, VecTy->getVectorNumElements() * Factor);
 
-    Value *SubVec = Builder.CreateExtractValue(LdN, Index);
+    CallInst *LdN = Builder.CreateCall(
+        LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
 
-    // Convert the integer vector to pointer vector if the element is pointer.
-    if (EltTy->isPointerTy())
-      SubVec = Builder.CreateIntToPtr(SubVec, SVI->getType());
+    // Extract and store the sub-vectors returned by the load intrinsic.
+    for (unsigned i = 0; i < Shuffles.size(); i++) {
+      ShuffleVectorInst *SVI = Shuffles[i];
+      unsigned Index = Indices[i];
 
-    SVI->replaceAllUsesWith(SubVec);
-  }
+      Value *SubVec = Builder.CreateExtractValue(LdN, Index);
 
-  return true;
-}
+      if (EltTy->isPointerTy()) // ldN returns integer lanes; cast them back.
+        SubVec = Builder.CreateIntToPtr(
+            SubVec, VectorType::get(EltTy, VecTy->getVectorNumElements()));
 
-/// \brief Get a mask consisting of sequential integers starting from \p Start.
-///
-/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
-static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
-                                   unsigned NumElts) {
-  SmallVector<Constant *, 16> Mask;
-  for (unsigned i = 0; i < NumElts; i++)
-    Mask.push_back(Builder.getInt32(Start + i));
+      SubVecs[SVI].push_back(SubVec);
+    }
+  }
 
-  return ConstantVector::get(Mask);
+  // Replace uses of the shufflevector instructions with the sub-vectors
+  // returned by the load intrinsic. If a shufflevector instruction is
+  // associated with more than one sub-vector, those sub-vectors will be
+  // concatenated into a single wide vector.
+  for (ShuffleVectorInst *SVI : Shuffles) {
+    auto &SubVec = SubVecs[SVI];
+    auto *WideVec =
+        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
+    SVI->replaceAllUsesWith(WideVec);
+  }
+
+  return true;
 }
 
 /// \brief Lower an interleaved store into a stN intrinsic.
@@ -7368,12 +7421,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
   VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
 
   const DataLayout &DL = SI->getModule()->getDataLayout();
-  unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
 
-  // Skip if we do not have NEON and skip illegal vector types.
-  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128))
+  // Skip if we do not have NEON and skip illegal vector types. We can
+  // "legalize" wide vector types into multiple interleaved accesses as long as
+  // the vector size in bits is a multiple of 128.
+  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
     return false;
 
+  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
+
   Value *Op0 = SVI->getOperand(0);
   Value *Op1 = SVI->getOperand(1);
   IRBuilder<> Builder(SI);
@@ -7393,6 +7449,25 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
     SubVecTy = VectorType::get(IntTy, LaneLen);
   }
 
+  // The base address of the store.
+  Value *BaseAddr = SI->getPointerOperand();
+
+  if (NumStores > 1) {
+    // If we're going to generate more than one store, reset the lane length
+    // and sub-vector type to something legal.
+    LaneLen /= NumStores;
+    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+
+    // We will compute the pointer operand of each store from the original base
+    // address using GEPs. Cast the base address to a pointer to the scalar
+    // element type.
+    BaseAddr = Builder.CreateBitCast(
+        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
+                      SI->getPointerAddressSpace()));
+  }
+
+  auto Mask = SVI->getShuffleMask();
+
   Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
   Type *Tys[2] = {SubVecTy, PtrTy};
   static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
@@ -7401,34 +7476,43 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
   Function *StNFunc =
       Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
 
-  SmallVector<Value *, 5> Ops;
+  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
 
-  // Split the shufflevector operands into sub vectors for the new stN call.
-  auto Mask = SVI->getShuffleMask();
-  for (unsigned i = 0; i < Factor; i++) {
-    if (Mask[i] >= 0) {
-      Ops.push_back(Builder.CreateShuffleVector(
-        Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen)));
-    } else {
-      unsigned StartMask = 0;
-      for (unsigned j = 1; j < LaneLen; j++) {
-        if (Mask[j*Factor + i] >= 0) {
-          StartMask = Mask[j*Factor + i] - j;
-          break;
+    SmallVector<Value *, 5> Ops;
+
+    // Split the shufflevector operands into sub vectors for the new stN call.
+    for (unsigned i = 0; i < Factor; i++) {
+      unsigned IdxI = StoreCount * LaneLen * Factor + i;
+      if (Mask[IdxI] >= 0) {
+        Ops.push_back(Builder.CreateShuffleVector(
+            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+      } else {
+        unsigned StartMask = 0;
+        for (unsigned j = 1; j < LaneLen; j++) { // scan lane i of this store
+          unsigned IdxJ = StoreCount * LaneLen * Factor + j * Factor + i;
+          if (Mask[IdxJ] >= 0) {
+            StartMask = Mask[IdxJ] - j;
+            break;
+          }
         }
+        // Note: Filling undef gaps with random elements is ok, since
+        // those elements were being written anyway (with undefs).
+        // In the case of all undefs we're defaulting to using elems from 0
+        // Note: StartMask cannot be negative, it's checked in
+        // isReInterleaveMask
+        Ops.push_back(Builder.CreateShuffleVector(
+            Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
       }
-      // Note: If all elements in a chunk are undefs, StartMask=0!
-      // Note: Filling undef gaps with random elements is ok, since
-      // those elements were being written anyway (with undefs).
-      // In the case of all undefs we're defaulting to using elems from 0
-      // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
-      Ops.push_back(Builder.CreateShuffleVector(
-        Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen)));
     }
-  }
 
-  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), PtrTy));
-  Builder.CreateCall(StNFunc, Ops);
+    // If we're generating more than one store, compute the base address of
+    // subsequent stores as an offset from the previous.
+    if (StoreCount > 0)
+      BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
+
+    Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
+    Builder.CreateCall(StNFunc, Ops);
+  }
   return true;
 }
 
@@ -7689,7 +7773,7 @@ SDValue
 AArch64TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
                                      SelectionDAG &DAG,
                                      std::vector<SDNode *> *Created) const {
-  AttributeSet Attr = DAG.getMachineFunction().getFunction()->getAttributes();
+  AttributeList Attr = DAG.getMachineFunction().getFunction()->getAttributes();
   if (isIntDivCheap(N->getValueType(0), Attr))
     return SDValue(N,0); // Lower SDIV as SDIV
 
@@ -8934,8 +9018,9 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
   // instructions (stp).
   SDLoc DL(&St);
   SDValue BasePtr = St.getBasePtr();
+  const MachinePointerInfo &PtrInfo = St.getPointerInfo();
   SDValue NewST1 =
-      DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, St.getPointerInfo(),
+      DAG.getStore(St.getChain(), DL, SplatVal, BasePtr, PtrInfo,
                    OrigAlignment, St.getMemOperand()->getFlags());
 
   unsigned Offset = EltOffset;
@@ -8944,7 +9029,7 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
     SDValue OffsetPtr = DAG.getNode(ISD::ADD, DL, MVT::i64, BasePtr,
                                     DAG.getConstant(Offset, DL, MVT::i64));
     NewST1 = DAG.getStore(NewST1.getValue(0), DL, SplatVal, OffsetPtr,
-                          St.getPointerInfo(), Alignment,
+                          PtrInfo.getWithOffset(Offset), Alignment,
                           St.getMemOperand()->getFlags());
     Offset += EltOffset;
   }
@@ -9265,7 +9350,7 @@ static SDValue performSTORECombine(SDNode *N,
   return SDValue();
 }
 
-  /// This function handles the log2-shuffle pattern produced by the
+/// This function handles the log2-shuffle pattern produced by the
 /// LoopVectorizer for the across vector reduction. It consists of
 /// log2(NumVectorElements) steps and, in each step, 2^(s) elements
 /// are reduced, where s is an induction variable from 0 to
@@ -10481,9 +10566,9 @@ void AArch64TargetLowering::ReplaceNodeResults(
 }
 
 bool AArch64TargetLowering::useLoadStackGuardNode() const {
-  if (!Subtarget->isTargetAndroid())
-    return true;
-  return TargetLowering::useLoadStackGuardNode();
+  if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia())
+    return TargetLowering::useLoadStackGuardNode();
+  return true;
 }
 
 unsigned AArch64TargetLowering::combineRepeatedFPDivisors() const {
@@ -10621,36 +10706,56 @@ bool AArch64TargetLowering::shouldNormalizeToSelectSequence(LLVMContext &,
   return false;
 }
 
-Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
-  if (!Subtarget->isTargetAndroid())
-    return TargetLowering::getIRStackGuard(IRB);
-
-  // Android provides a fixed TLS slot for the stack cookie. See the definition
-  // of TLS_SLOT_STACK_GUARD in
-  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
-  const unsigned TlsOffset = 0x28;
+static Value *UseTlsOffset(IRBuilder<> &IRB, unsigned Offset) {
   Module *M = IRB.GetInsertBlock()->getParent()->getParent();
   Function *ThreadPointerFunc =
       Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
   return IRB.CreatePointerCast(
-      IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
+      IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), Offset),
       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
 }
 
-Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
-  if (!Subtarget->isTargetAndroid())
-    return TargetLowering::getSafeStackPointerLocation(IRB);
+Value *AArch64TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+  // Android provides a fixed TLS slot for the stack cookie. See the definition
+  // of TLS_SLOT_STACK_GUARD in
+  // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
+  if (Subtarget->isTargetAndroid())
+    return UseTlsOffset(IRB, 0x28);
 
+  // Fuchsia is similar.
+  // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
+  if (Subtarget->isTargetFuchsia())
+    return UseTlsOffset(IRB, -0x10);
+
+  return TargetLowering::getIRStackGuard(IRB);
+}
+
+Value *AArch64TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
   // Android provides a fixed TLS slot for the SafeStack pointer. See the
   // definition of TLS_SLOT_SAFESTACK in
   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
-  const unsigned TlsOffset = 0x48;
-  Module *M = IRB.GetInsertBlock()->getParent()->getParent();
-  Function *ThreadPointerFunc =
-      Intrinsic::getDeclaration(M, Intrinsic::thread_pointer);
-  return IRB.CreatePointerCast(
-      IRB.CreateConstGEP1_32(IRB.CreateCall(ThreadPointerFunc), TlsOffset),
-      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(0));
+  if (Subtarget->isTargetAndroid())
+    return UseTlsOffset(IRB, 0x48);
+
+  // Fuchsia is similar.
+  // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
+  if (Subtarget->isTargetFuchsia())
+    return UseTlsOffset(IRB, -0x8);
+
+  return TargetLowering::getSafeStackPointerLocation(IRB);
+}
+
+bool AArch64TargetLowering::isMaskAndCmp0FoldingBeneficial(
+    const Instruction &AndI) const {
+  // Only sink the 'and' mask to the cmp use block if it is masking a single
+  // bit, since this is likely to fold the and/cmp/br into a single tbz
+  // instruction.  It may be beneficial to sink in other cases, but we would
+  // have to check that the cmp would not get folded into the br to form a cbz
+  // for these to be beneficial.
+  ConstantInt* Mask = dyn_cast<ConstantInt>(AndI.getOperand(1));
+  if (!Mask)
+    return false;
+  return Mask->getUniqueInteger().isPowerOf2();
 }
 
 void AArch64TargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
@@ -10700,7 +10805,7 @@ void AArch64TargetLowering::insertCopiesSplitCSR(
   }
 }
 
-bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
   // Integer division on AArch64 is expensive. However, when aggressively
   // optimizing for code size, we prefer to use a div instruction, as it is
   // usually smaller than the alternative sequence.
@@ -10709,6 +10814,14 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
   // size, because it will have to be scalarized, while the alternative code
   // sequence can be performed in vector form.
   bool OptSize =
-      Attr.hasAttribute(AttributeSet::FunctionIndex, Attribute::MinSize);
+      Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
   return OptSize && !VT.isVector();
 }
+
+unsigned
+AArch64TargetLowering::getVaListSizeInBits(const DataLayout &DL) const {
+  if (Subtarget->isTargetDarwin())
+    return getPointerTy(DL).getSizeInBits();
+
+  return 3 * getPointerTy(DL).getSizeInBits() + 2 * 32;
+}
diff --git a/lib/Target/AArch64/AArch64ISelLowering.h b/lib/Target/AArch64/AArch64ISelLowering.h
index 054ccc31674f58864e3cf24f58c6755de8bc9983..2ad6c8b23df8c3c9e510153fd38a9a5191dc372e 100644
--- a/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/lib/Target/AArch64/AArch64ISelLowering.h
@@ -251,7 +251,8 @@ public:
   /// Determine which of the bits specified in Mask are known to be either zero
   /// or one and return them in the KnownZero/KnownOne bitsets.
   void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero,
-                                     APInt &KnownOne, const SelectionDAG &DAG,
+                                     APInt &KnownOne, const APInt &DemandedElts,
+                                     const SelectionDAG &DAG,
                                      unsigned Depth = 0) const override;
 
   MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
@@ -402,7 +403,7 @@ public:
     return AArch64::X1;
   }
 
-  bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+  bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
 
   bool isCheapToSpeculateCttz() const override {
     return true;
@@ -412,6 +413,8 @@ public:
     return true;
   }
 
+  bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
   bool hasAndNotCompare(SDValue) const override {
     // 'bics'
     return true;
@@ -435,6 +438,20 @@ public:
     return true;
   }
 
+  /// Returns the size of the platform's va_list object.
+  unsigned getVaListSizeInBits(const DataLayout &DL) const override;
+
+  /// Returns true if \p VecTy is a legal interleaved access type. This
+  /// function checks the vector element type and the overall width of the
+  /// vector.
+  bool isLegalInterleavedAccessType(VectorType *VecTy,
+                                    const DataLayout &DL) const;
+
+  /// Returns the number of interleaved accesses that will be generated when
+  /// lowering accesses of the given type.
+  unsigned getNumInterleavedAccesses(VectorType *VecTy,
+                                     const DataLayout &DL) const;
+
 private:
   bool isExtFreeImpl(const Instruction *Ext) const override;
 
diff --git a/lib/Target/AArch64/AArch64InstrFormats.td b/lib/Target/AArch64/AArch64InstrFormats.td
index cfc4dd24e968b2e8e5b2aa7521b7654485f1d350..16be4432b160707e4bbfb8fa74572cbd32488ca9 100644
--- a/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/lib/Target/AArch64/AArch64InstrFormats.td
@@ -39,6 +39,9 @@ class AArch64Inst<Format f, string cstr> : Instruction {
   let Constraints = cstr;
 }
 
+class InstSubst<string Asm, dag Result, bit EmitPriority = 0>
+  : InstAlias<Asm, Result, EmitPriority>, Requires<[UseNegativeImmediates]>;
+
 // Pseudo instructions (don't have encoding information)
 class Pseudo<dag oops, dag iops, list<dag> pattern, string cstr = "">
     : AArch64Inst<PseudoFrm, cstr> {
@@ -257,6 +260,7 @@ def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
 class AsmImmRange<int Low, int High> : AsmOperandClass {
   let Name = "Imm" # Low # "_" # High;
   let DiagnosticType = "InvalidImm" # Low # "_" # High;
+  let PredicateMethod = "isImmInRange<" # Low # "," # High # ">";
 }
 
 def Imm1_8Operand : AsmImmRange<1, 8>;
@@ -264,6 +268,20 @@ def Imm1_16Operand : AsmImmRange<1, 16>;
 def Imm1_32Operand : AsmImmRange<1, 32>;
 def Imm1_64Operand : AsmImmRange<1, 64>;
 
+class BranchTarget<int N> : AsmOperandClass {
+  let Name = "BranchTarget" # N;
+  let DiagnosticType = "InvalidLabel";
+  let PredicateMethod = "isBranchTarget<" # N # ">";
+}
+
+class PCRelLabel<int N> : BranchTarget<N> {
+  let Name = "PCRelLabel" # N;
+}
+
+def BranchTarget14Operand : BranchTarget<14>;
+def BranchTarget26Operand : BranchTarget<26>;
+def PCRelLabel19Operand   : PCRelLabel<19>;
+
 def MovZSymbolG3AsmOperand : AsmOperandClass {
   let Name = "MovZSymbolG3";
   let RenderMethod = "addImmOperands";
@@ -500,7 +518,8 @@ def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
 }
 
 // imm0_255 predicate - True if the immediate is in the range [0,255].
-def Imm0_255Operand : AsmOperandClass { let Name = "Imm0_255"; }
+def Imm0_255Operand : AsmImmRange<0,255>;
+
 def imm0_255 : Operand<i32>, ImmLeaf<i32, [{
   return ((uint32_t)Imm) < 256;
 }]> {
@@ -673,6 +692,14 @@ def addsub_shifted_imm64 : addsub_shifted_imm<i64>;
 def addsub_shifted_imm32_neg : addsub_shifted_imm_neg<i32>;
 def addsub_shifted_imm64_neg : addsub_shifted_imm_neg<i64>;
 
+def gi_addsub_shifted_imm32 :
+    GIComplexOperandMatcher<s32, (ops i32imm, i32imm), "selectArithImmed">,
+    GIComplexPatternEquiv<addsub_shifted_imm32>;
+
+def gi_addsub_shifted_imm64 :
+    GIComplexOperandMatcher<s64, (ops i32imm, i32imm), "selectArithImmed">,
+    GIComplexPatternEquiv<addsub_shifted_imm64>;
+
 class neg_addsub_shifted_imm<ValueType Ty>
     : Operand<Ty>, ComplexPattern<Ty, 2, "SelectNegArithImmed", [imm]> {
   let PrintMethod = "printAddSubImm";
@@ -1094,10 +1121,6 @@ def inv_ccode : Operand<i32> {
 
 // Conditional branch target. 19-bit immediate. The low two bits of the target
 // offset are implied zero and so are not part of the immediate.
-def PCRelLabel19Operand : AsmOperandClass {
-  let Name = "PCRelLabel19";
-  let DiagnosticType = "InvalidLabel";
-}
 def am_brcond : Operand<OtherVT> {
   let EncoderMethod = "getCondBranchTargetOpValue";
   let DecoderMethod = "DecodePCRelLabel19";
@@ -1154,9 +1177,6 @@ multiclass CmpBranch<bit op, string asm, SDNode node> {
 //---
 // Test-and-branch target. 14-bit sign-extended immediate. The low two bits of
 // the target offset are implied zero and so are not part of the immediate.
-def BranchTarget14Operand : AsmOperandClass {
-  let Name = "BranchTarget14";
-}
 def am_tbrcond : Operand<OtherVT> {
   let EncoderMethod = "getTestBranchTargetOpValue";
   let PrintMethod = "printAlignedLabel";
@@ -1166,11 +1186,12 @@ def am_tbrcond : Operand<OtherVT> {
 // AsmOperand classes to emit (or not) special diagnostics
 def TBZImm0_31Operand : AsmOperandClass {
   let Name = "TBZImm0_31";
-  let PredicateMethod = "isImm0_31";
+  let PredicateMethod = "isImmInRange<0,31>";
   let RenderMethod = "addImm0_31Operands";
 }
 def TBZImm32_63Operand : AsmOperandClass {
   let Name = "Imm32_63";
+  let PredicateMethod = "isImmInRange<32,63>";
   let DiagnosticType = "InvalidImm0_63";
 }
 
@@ -1232,10 +1253,6 @@ multiclass TestBranch<bit op, string asm, SDNode node> {
 //---
 // Unconditional branch (immediate) instructions.
 //---
-def BranchTarget26Operand : AsmOperandClass {
-  let Name = "BranchTarget26";
-  let DiagnosticType = "InvalidLabel";
-}
 def am_b_target : Operand<OtherVT> {
   let EncoderMethod = "getBranchTargetOpValue";
   let PrintMethod = "printAlignedLabel";
@@ -1784,10 +1801,10 @@ multiclass AddSub<bit isSub, string mnemonic, string alias,
   }
 
   // add Rd, Rb, -imm -> sub Rd, Rn, imm
-  def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+  def : InstSubst<alias#"\t$Rd, $Rn, $imm",
                   (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32sp:$Rn,
                       addsub_shifted_imm32_neg:$imm), 0>;
-  def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+  def : InstSubst<alias#"\t$Rd, $Rn, $imm",
                   (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64sp:$Rn,
                        addsub_shifted_imm64_neg:$imm), 0>;
 
@@ -1859,10 +1876,10 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
   } // Defs = [NZCV]
 
   // Support negative immediates, e.g. adds Rd, Rn, -imm -> subs Rd, Rn, imm
-  def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+  def : InstSubst<alias#"\t$Rd, $Rn, $imm",
                   (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32sp:$Rn,
                       addsub_shifted_imm32_neg:$imm), 0>;
-  def : InstAlias<alias#"\t$Rd, $Rn, $imm",
+  def : InstSubst<alias#"\t$Rd, $Rn, $imm",
                   (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64sp:$Rn,
                        addsub_shifted_imm64_neg:$imm), 0>;
 
@@ -1883,9 +1900,9 @@ multiclass AddSubS<bit isSub, string mnemonic, SDNode OpNode, string cmp,
                   XZR, GPR64:$src1, GPR64:$src2, arith_shift64:$sh), 4>;
 
   // Support negative immediates, e.g. cmp Rn, -imm -> cmn Rn, imm
-  def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
+  def : InstSubst<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Wri")
                   WZR, GPR32sp:$src, addsub_shifted_imm32_neg:$imm), 0>;
-  def : InstAlias<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
+  def : InstSubst<cmpAlias#"\t$src, $imm", (!cast<Instruction>(NAME#"Xri")
                   XZR, GPR64sp:$src, addsub_shifted_imm64_neg:$imm), 0>;
 
   // Compare shorthands
@@ -2100,10 +2117,10 @@ multiclass LogicalImm<bits<2> opc, string mnemonic, SDNode OpNode,
     let Inst{31} = 1;
   }
 
-  def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+  def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
                   (!cast<Instruction>(NAME # "Wri") GPR32sp:$Rd, GPR32:$Rn,
                       logical_imm32_not:$imm), 0>;
-  def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+  def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
                   (!cast<Instruction>(NAME # "Xri") GPR64sp:$Rd, GPR64:$Rn,
                        logical_imm64_not:$imm), 0>;
 }
@@ -2122,10 +2139,10 @@ multiclass LogicalImmS<bits<2> opc, string mnemonic, SDNode OpNode,
   }
   } // end Defs = [NZCV]
 
-  def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+  def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
                   (!cast<Instruction>(NAME # "Wri") GPR32:$Rd, GPR32:$Rn,
                       logical_imm32_not:$imm), 0>;
-  def : InstAlias<Alias # "\t$Rd, $Rn, $imm",
+  def : InstSubst<Alias # "\t$Rd, $Rn, $imm",
                   (!cast<Instruction>(NAME # "Xri") GPR64:$Rd, GPR64:$Rn,
                        logical_imm64_not:$imm), 0>;
 }
@@ -2454,7 +2471,7 @@ class PrefetchUI<bits<2> sz, bit V, bits<2> opc, string asm, list<dag> pat>
 
 // Load literal address: 19-bit immediate. The low two bits of the target
 // offset are implied zero and so are not part of the immediate.
-def am_ldrlit : Operand<OtherVT> {
+def am_ldrlit : Operand<iPTR> {
   let EncoderMethod = "getLoadLiteralOpValue";
   let DecoderMethod = "DecodePCRelLabel19";
   let PrintMethod = "printAlignedLabel";
diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp
index 8e3b7658c3ec8c09e9f886e116de47af4b1f9ad1..41fc8eceab5c7f67152602fefe1e6eba0560a6b4 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "Utils/AArch64BaseInfo.h"
@@ -762,6 +763,17 @@ bool AArch64InstrInfo::isAsCheapAsAMove(const MachineInstr &MI) const {
   llvm_unreachable("Unknown opcode to check as cheap as a move!");
 }
 
+bool AArch64InstrInfo::isFalkorLSLFast(const MachineInstr &MI) const {
+  // With fewer than four operands there is no operand 3 to inspect.
+  if (MI.getNumOperands() < 4)
+    return false;
+  // Operand 3 is decoded below as a shifter: a shift type plus an amount.
+  const unsigned Shifter = MI.getOperand(3).getImm();
+  const bool IsLSL = AArch64_AM::getShiftType(Shifter) == AArch64_AM::LSL;
+  // Only an LSL by less than four counts as fast.
+  return IsLSL && AArch64_AM::getShiftValue(Shifter) < 4;
+}
+
 bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
                                              unsigned &SrcReg, unsigned &DstReg,
                                              unsigned &SubIdx) const {
@@ -1345,14 +1357,6 @@ bool AArch64InstrInfo::hasShiftedReg(const MachineInstr &MI) const {
   case AArch64::BICSXrs:
   case AArch64::BICWrs:
   case AArch64::BICXrs:
-  case AArch64::CRC32Brr:
-  case AArch64::CRC32CBrr:
-  case AArch64::CRC32CHrr:
-  case AArch64::CRC32CWrr:
-  case AArch64::CRC32CXrr:
-  case AArch64::CRC32Hrr:
-  case AArch64::CRC32Wrr:
-  case AArch64::CRC32Xrr:
   case AArch64::EONWrs:
   case AArch64::EONXrs:
   case AArch64::EORWrs:
@@ -1627,17 +1631,6 @@ bool AArch64InstrInfo::isUnscaledLdSt(MachineInstr &MI) const {
   return isUnscaledLdSt(MI.getOpcode());
 }
 
-bool AArch64InstrInfo::isTailCall(const MachineInstr &Inst) const
-{
-  switch (Inst.getOpcode()) {
-  case AArch64::TCRETURNdi:
-  case AArch64::TCRETURNri:
-    return true;
-  default:
-    return false;
-  }
-}
-
 // Is this a candidate for ld/st merging or pairing?  For example, we don't
 // touch volatiles or load/stores that have a hint to avoid pair formation.
 bool AArch64InstrInfo::isCandidateToMergeOrPair(MachineInstr &MI) const {
@@ -1702,16 +1695,59 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
   } else
     return false;
 
-  // Offset is calculated as the immediate operand multiplied by the scaling factor.
-  // Unscaled instructions have scaling factor set to 1.
+  // Get the scaling factor for the instruction and set the width for the 
+  // instruction.
   unsigned Scale = 0;
-  switch (LdSt.getOpcode()) {
+  int64_t Dummy1, Dummy2;
+
+  // If this returns false, then it's an instruction we don't want to handle.
+  if (!getMemOpInfo(LdSt.getOpcode(), Scale, Width, Dummy1, Dummy2))
+    return false;
+
+  // Compute the offset. Offset is calculated as the immediate operand
+  // multiplied by the scaling factor. Unscaled instructions have scaling factor
+  // set to 1.
+  if (LdSt.getNumExplicitOperands() == 3) {
+    BaseReg = LdSt.getOperand(1).getReg();
+    Offset = LdSt.getOperand(2).getImm() * Scale;
+  } else {
+    assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
+    BaseReg = LdSt.getOperand(2).getReg();
+    Offset = LdSt.getOperand(3).getImm() * Scale;
+  }
+  return true;
+}
+
+MachineOperand&
+AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
+  assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
+  MachineOperand &OfsOp = LdSt.getOperand(LdSt.getNumExplicitOperands()-1);
+  assert(OfsOp.isImm() && "Offset operand wasn't immediate.");
+  return OfsOp;
+}
+
+bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
+                                    unsigned &Width, int64_t &MinOffset,
+                                    int64_t &MaxOffset) const {
+  switch (Opcode) {
+  // Not a memory operation, or not one we want to handle.
   default:
+    Scale = Width = 0;
+    MinOffset = MaxOffset = 0;
     return false;
+  case AArch64::STRWpost:
+  case AArch64::LDRWpost:
+    Width = 32;
+    Scale = 4;
+    MinOffset = -256;
+    MaxOffset = 255;
+    break;
   case AArch64::LDURQi:
   case AArch64::STURQi:
     Width = 16;
     Scale = 1;
+    MinOffset = -256;
+    MaxOffset = 255;
     break;
   case AArch64::LDURXi:
   case AArch64::LDURDi:
@@ -1719,6 +1755,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
   case AArch64::STURDi:
     Width = 8;
     Scale = 1;
+    MinOffset = -256;
+    MaxOffset = 255;
     break;
   case AArch64::LDURWi:
   case AArch64::LDURSi:
@@ -1727,6 +1765,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
   case AArch64::STURSi:
     Width = 4;
     Scale = 1;
+    MinOffset = -256;
+    MaxOffset = 255;
     break;
   case AArch64::LDURHi:
   case AArch64::LDURHHi:
@@ -1736,6 +1776,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
   case AArch64::STURHHi:
     Width = 2;
     Scale = 1;
+    MinOffset = -256;
+    MaxOffset = 255;
     break;
   case AArch64::LDURBi:
   case AArch64::LDURBBi:
@@ -1745,6 +1787,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
   case AArch64::STURBBi:
     Width = 1;
     Scale = 1;
+    MinOffset = -256;
+    MaxOffset = 255;
     break;
   case AArch64::LDPQi:
   case AArch64::LDNPQi:
@@ -1752,10 +1796,14 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
   case AArch64::STNPQi:
     Scale = 16;
     Width = 32;
+    MinOffset = -64;
+    MaxOffset = 63;
     break;
   case AArch64::LDRQui:
   case AArch64::STRQui:
     Scale = Width = 16;
+    MinOffset = 0;
+    MaxOffset = 4095;
     break;
   case AArch64::LDPXi:
   case AArch64::LDPDi:
@@ -1767,12 +1815,16 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
   case AArch64::STNPDi:
     Scale = 8;
     Width = 16;
+    MinOffset = -64;
+    MaxOffset = 63;
     break;
   case AArch64::LDRXui:
   case AArch64::LDRDui:
   case AArch64::STRXui:
   case AArch64::STRDui:
     Scale = Width = 8;
+    MinOffset = 0;
+    MaxOffset = 4095;
     break;
   case AArch64::LDPWi:
   case AArch64::LDPSi:
@@ -1784,6 +1836,8 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
   case AArch64::STNPSi:
     Scale = 4;
     Width = 8;
+    MinOffset = -64;
+    MaxOffset = 63;
     break;
   case AArch64::LDRWui:
   case AArch64::LDRSui:
@@ -1791,29 +1845,27 @@ bool AArch64InstrInfo::getMemOpBaseRegImmOfsWidth(
   case AArch64::STRWui:
   case AArch64::STRSui:
     Scale = Width = 4;
+    MinOffset = 0;
+    MaxOffset = 4095;
     break;
   case AArch64::LDRHui:
   case AArch64::LDRHHui:
   case AArch64::STRHui:
   case AArch64::STRHHui:
     Scale = Width = 2;
+    MinOffset = 0;
+    MaxOffset = 4095;
     break;
   case AArch64::LDRBui:
   case AArch64::LDRBBui:
   case AArch64::STRBui:
   case AArch64::STRBBui:
     Scale = Width = 1;
+    MinOffset = 0;
+    MaxOffset = 4095;
     break;
   }
 
-  if (LdSt.getNumExplicitOperands() == 3) {
-    BaseReg = LdSt.getOperand(1).getReg();
-    Offset = LdSt.getOperand(2).getImm() * Scale;
-  } else {
-    assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
-    BaseReg = LdSt.getOperand(2).getReg();
-    Offset = LdSt.getOperand(3).getImm() * Scale;
-  }
   return true;
 }
 
@@ -1914,88 +1966,6 @@ bool AArch64InstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
   return Offset1 + 1 == Offset2;
 }
 
-bool AArch64InstrInfo::shouldScheduleAdjacent(
-    const MachineInstr &First, const MachineInstr &Second) const {
-  if (Subtarget.hasArithmeticBccFusion()) {
-    // Fuse CMN, CMP, TST followed by Bcc.
-    unsigned SecondOpcode = Second.getOpcode();
-    if (SecondOpcode == AArch64::Bcc) {
-      switch (First.getOpcode()) {
-      default:
-        return false;
-      case AArch64::ADDSWri:
-      case AArch64::ADDSWrr:
-      case AArch64::ADDSXri:
-      case AArch64::ADDSXrr:
-      case AArch64::ANDSWri:
-      case AArch64::ANDSWrr:
-      case AArch64::ANDSXri:
-      case AArch64::ANDSXrr:
-      case AArch64::SUBSWri:
-      case AArch64::SUBSWrr:
-      case AArch64::SUBSXri:
-      case AArch64::SUBSXrr:
-      case AArch64::BICSWrr:
-      case AArch64::BICSXrr:
-        return true;
-      case AArch64::ADDSWrs:
-      case AArch64::ADDSXrs:
-      case AArch64::ANDSWrs:
-      case AArch64::ANDSXrs:
-      case AArch64::SUBSWrs:
-      case AArch64::SUBSXrs:
-      case AArch64::BICSWrs:
-      case AArch64::BICSXrs:
-        // Shift value can be 0 making these behave like the "rr" variant...
-        return !hasShiftedReg(Second);
-      }
-    }
-  }
-  if (Subtarget.hasArithmeticCbzFusion()) {
-    // Fuse ALU operations followed by CBZ/CBNZ.
-    unsigned SecondOpcode = Second.getOpcode();
-    if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
-        SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX) {
-      switch (First.getOpcode()) {
-      default:
-        return false;
-      case AArch64::ADDWri:
-      case AArch64::ADDWrr:
-      case AArch64::ADDXri:
-      case AArch64::ADDXrr:
-      case AArch64::ANDWri:
-      case AArch64::ANDWrr:
-      case AArch64::ANDXri:
-      case AArch64::ANDXrr:
-      case AArch64::EORWri:
-      case AArch64::EORWrr:
-      case AArch64::EORXri:
-      case AArch64::EORXrr:
-      case AArch64::ORRWri:
-      case AArch64::ORRWrr:
-      case AArch64::ORRXri:
-      case AArch64::ORRXrr:
-      case AArch64::SUBWri:
-      case AArch64::SUBWrr:
-      case AArch64::SUBXri:
-      case AArch64::SUBXrr:
-        return true;
-      case AArch64::ADDWrs:
-      case AArch64::ADDXrs:
-      case AArch64::ANDWrs:
-      case AArch64::ANDXrs:
-      case AArch64::SUBWrs:
-      case AArch64::SUBXrs:
-      case AArch64::BICWrs:
-      case AArch64::BICXrs:
-        // Shift value can be 0 making these behave like the "rr" variant...
-        return !hasShiftedReg(Second);
-      }
-    }
-  }
-  return false;
-}
-
 MachineInstr *AArch64InstrInfo::emitFrameIndexDebugValue(
     MachineFunction &MF, int FrameIx, uint64_t Offset, const MDNode *Var,
     const MDNode *Expr, const DebugLoc &DL) const {
@@ -4297,3 +4267,199 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
       {MO_TLS, "aarch64-tls"}};
   return makeArrayRef(TargetFlags);
 }
+
+unsigned AArch64InstrInfo::getOutliningBenefit(size_t SequenceSize,
+                                               size_t Occurrences,
+                                               bool CanBeTailCall) const {
+  // Instruction count if every occurrence is left in place.
+  const unsigned InlineCost = SequenceSize * Occurrences;
+
+  // Instruction count after outlining:
+  //  * tail call: the sequence itself plus one branch per occurrence;
+  //  * otherwise: the sequence plus a return (+1), and per occurrence a
+  //    save of LR, a call and a restore of LR (3 * Occurrences).
+  const unsigned OutlinedCost =
+      CanBeTailCall ? SequenceSize + Occurrences
+                    : (SequenceSize + 1) + (3 * Occurrences);
+
+  // No benefit unless outlining strictly shrinks the code.
+  if (InlineCost <= OutlinedCost)
+    return 0;
+
+  // Number of instructions saved by outlining this sequence.
+  return InlineCost - OutlinedCost;
+}
+
+bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF) const {
+  return MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
+}
+
+/// Classify \p MI for the outliner. SP-relative loads/stores are Legal only if
+/// their scaled offset plus 16 stays within the instruction's scaled range.
+AArch64GenInstrInfo::MachineOutlinerInstrType
+AArch64InstrInfo::getOutliningType(MachineInstr &MI) const {
+  MachineFunction *MF = MI.getParent()->getParent();
+  AArch64FunctionInfo *FuncInfo = MF->getInfo<AArch64FunctionInfo>();
+
+  // Don't outline LOHs.
+  if (FuncInfo->getLOHRelated().count(&MI))
+    return MachineOutlinerInstrType::Illegal;
+
+  // Don't allow debug values to impact outlining type.
+  if (MI.isDebugValue() || MI.isIndirectDebugValue())
+    return MachineOutlinerInstrType::Invisible;
+
+  // Is this a terminator for a basic block?
+  if (MI.isTerminator()) {
+    // Is this the end of a function?
+    if (MI.getParent()->succ_empty())
+      return MachineOutlinerInstrType::Legal;
+
+    // It's not, so don't outline it.
+    return MachineOutlinerInstrType::Illegal;
+  }
+
+  // Don't outline positions.
+  if (MI.isPosition())
+    return MachineOutlinerInstrType::Illegal;
+
+  // Make sure none of the operands are un-outlinable.
+  for (const MachineOperand &MOP : MI.operands())
+    if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
+        MOP.isTargetIndex())
+      return MachineOutlinerInstrType::Illegal;
+
+  // Don't outline anything that uses the link register.
+  if (MI.modifiesRegister(AArch64::LR, &RI) ||
+      MI.readsRegister(AArch64::LR, &RI))
+    return MachineOutlinerInstrType::Illegal;
+
+  // Does this use the stack?
+  if (MI.modifiesRegister(AArch64::SP, &RI) ||
+      MI.readsRegister(AArch64::SP, &RI)) {
+    // Is it a memory operation?
+    if (MI.mayLoadOrStore()) {
+      unsigned Base;  // Filled with the base register of MI.
+      int64_t Offset; // Filled with the offset of MI.
+      unsigned DummyWidth;
+
+      // Does it allow us to offset the base register and is the base SP?
+      if (!getMemOpBaseRegImmOfsWidth(MI, Base, Offset, DummyWidth, &RI) ||
+          Base != AArch64::SP)
+        return MachineOutlinerInstrType::Illegal;
+
+      // Find the minimum/maximum offset for this instruction and check if
+      // fixing it up would be in range.
+      int64_t MinOffset, MaxOffset;
+      unsigned Scale;
+      getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
+
+      // Offset was already multiplied by Scale, while MinOffset/MaxOffset are
+      // unscaled immediates, so scale the bounds before comparing.
+      // TODO: We should really test what happens if an instruction overflows.
+      // This is tricky to test with IR tests, but when the outliner is moved
+      // to a MIR test, it really ought to be checked.
+      if (Offset + 16 < MinOffset * Scale || Offset + 16 > MaxOffset * Scale)
+        return MachineOutlinerInstrType::Illegal;
+
+      // It's in range, so we can outline it.
+      return MachineOutlinerInstrType::Legal;
+    }
+
+    // We can't fix it up, so don't outline it.
+    return MachineOutlinerInstrType::Illegal;
+  }
+
+  return MachineOutlinerInstrType::Legal;
+}
+
+void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
+  for (MachineInstr &Inst : MBB) {
+    // Only loads and stores are candidates for a fix-up.
+    if (!Inst.mayLoadOrStore())
+      continue;
+
+    // Skip anything that is not a base + immediate access based on SP.
+    unsigned BaseReg, Width;
+    int64_t OldOffset;
+    if (!getMemOpBaseRegImmOfsWidth(Inst, BaseReg, OldOffset, Width, &RI))
+      continue;
+    if (BaseReg != AArch64::SP)
+      continue;
+
+    MachineOperand &ImmOp = getMemOpBaseRegImmOfsOffsetOperand(Inst);
+    assert(ImmOp.isImm() && "Stack offset wasn't immediate!");
+    // Fetch the scale that converts the adjusted offset into an immediate.
+    unsigned Scale;
+    int64_t UnusedMin, UnusedMax;
+    getMemOpInfo(Inst.getOpcode(), Scale, Width, UnusedMin, UnusedMax);
+    assert(Scale != 0 && "Unexpected opcode!");
+
+    // We've pushed the return address to the stack, so add 16 to the offset.
+    // Overflow was already checked when the instruction was judged legal.
+    ImmOp.setImm((OldOffset + 16) / Scale);
+  }
+}
+
+void AArch64InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB,
+                                              MachineFunction &MF,
+                                              bool IsTailCall) const {
+  // A tail-call outlined function already ends in a return; nothing to do.
+  if (IsTailCall)
+    return;
+
+  // Not a tail call: build the return (RET with LR flagged undef) ourselves
+  // and append it to the end of the block.
+  MachineInstr *Return = BuildMI(MF, DebugLoc(), get(AArch64::RET))
+                             .addReg(AArch64::LR, RegState::Undef);
+  MBB.insert(MBB.end(), Return);
+
+  // Finally patch up the stack accesses in the block.
+  fixupPostOutline(MBB);
+}
+
+void AArch64InstrInfo::insertOutlinerPrologue(MachineBasicBlock &MBB,
+                                              MachineFunction &MF,
+                                              bool IsTailCall) const {}
+
+/// Insert into \p MBB at \p It a branch (tail call) or an LR-preserving BL to
+/// the global in \p M named like \p MF; returns the last inserted instruction.
+MachineBasicBlock::iterator AArch64InstrInfo::insertOutlinedCall(
+    Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
+    MachineFunction &MF, bool IsTailCall) const {
+  // Are we tail calling?
+  if (IsTailCall) {
+    // If yes, then we can just branch to the label.
+    It = MBB.insert(It,
+                    BuildMI(MF, DebugLoc(), get(AArch64::B))
+                        .addGlobalAddress(M.getNamedValue(MF.getName())));
+    return It;
+  }
+
+  // We're not tail calling, so we have to save LR before the call and restore
+  // it after.
+  MachineInstr *STRXpre = BuildMI(MF, DebugLoc(), get(AArch64::STRXpre))
+                              .addReg(AArch64::SP, RegState::Define)
+                              .addReg(AArch64::LR)
+                              .addReg(AArch64::SP)
+                              .addImm(-16);
+  It = MBB.insert(It, STRXpre);
+  It++;
+
+  // Insert the call.
+  It = MBB.insert(It,
+                  BuildMI(MF, DebugLoc(), get(AArch64::BL))
+                      .addGlobalAddress(M.getNamedValue(MF.getName())));
+  It++;
+
+  // Restore the link register. LR is written by the load, so it has to be
+  // added as a definition rather than as a use.
+  MachineInstr *LDRXpost = BuildMI(MF, DebugLoc(), get(AArch64::LDRXpost))
+                               .addReg(AArch64::SP, RegState::Define)
+                               .addReg(AArch64::LR, RegState::Define)
+                               .addReg(AArch64::SP)
+                               .addImm(16);
+  It = MBB.insert(It, LDRXpost);
+  return It;
+}
+
diff --git a/lib/Target/AArch64/AArch64InstrInfo.h b/lib/Target/AArch64/AArch64InstrInfo.h
index 8d9e8e3da1c5e864db5af718b18f88daf599e290..bacce441f6c57d8405d82de0078e4d035296df54 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/lib/Target/AArch64/AArch64InstrInfo.h
@@ -87,8 +87,6 @@ public:
   /// Return true if this is an unscaled load/store.
   bool isUnscaledLdSt(MachineInstr &MI) const;
 
-  bool isTailCall(const MachineInstr &Inst) const override;
-
   static bool isPairableLdStInst(const MachineInstr &MI) {
     switch (MI.getOpcode()) {
     default:
@@ -135,12 +133,19 @@ public:
                                   int64_t &Offset, unsigned &Width,
                                   const TargetRegisterInfo *TRI) const;
 
+  /// Return the immediate offset operand of the load/store \p LdSt.
+  MachineOperand &getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const;
+
+  /// \brief Returns true if opcode \p Opcode is a memory operation. If it is,
+  /// set \p Scale, \p Width, \p MinOffset, and \p MaxOffset accordingly.
+  ///
+  /// For unscaled instructions, \p Scale is set to 1.
+  bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width,
+                    int64_t &MinOffset, int64_t &MaxOffset) const;
+
   bool shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
                            unsigned NumLoads) const override;
 
-  bool shouldScheduleAdjacent(const MachineInstr &First,
-                              const MachineInstr &Second) const override;
-
   MachineInstr *emitFrameIndexDebugValue(MachineFunction &MF, int FrameIx,
                                          uint64_t Offset, const MDNode *Var,
                                          const MDNode *Expr,
@@ -247,7 +252,33 @@ public:
   ArrayRef<std::pair<unsigned, const char *>>
   getSerializableBitmaskMachineOperandTargetFlags() const override;
 
+  bool isFunctionSafeToOutlineFrom(MachineFunction &MF) const override;
+  unsigned getOutliningBenefit(size_t SequenceSize, size_t Occurrences,
+                               bool CanBeTailCall) const override;
+  AArch64GenInstrInfo::MachineOutlinerInstrType
+  getOutliningType(MachineInstr &MI) const override;
+  void insertOutlinerEpilogue(MachineBasicBlock &MBB,
+                              MachineFunction &MF,
+                              bool IsTailCall) const override;
+  void insertOutlinerPrologue(MachineBasicBlock &MBB,
+                              MachineFunction &MF,
+                              bool isTailCall) const override;
+  MachineBasicBlock::iterator
+  insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
+                     MachineBasicBlock::iterator &It,
+                     MachineFunction &MF,
+                     bool IsTailCall) const override;
+  /// Returns true if the instruction has a shift by immediate that can be
+  /// executed in one cycle less.
+  bool isFalkorLSLFast(const MachineInstr &MI) const;
 private:
+
+  /// \brief Sets the offsets on outlined instructions in \p MBB which use SP
+  /// so that they will be valid post-outlining.
+  ///
+  /// \param MBB A \p MachineBasicBlock in an outlined function.
+  void fixupPostOutline(MachineBasicBlock &MBB) const;
+
   void instantiateCondBranch(MachineBasicBlock &MBB, const DebugLoc &DL,
                              MachineBasicBlock *TBB,
                              ArrayRef<MachineOperand> Cond) const;
diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td
index 96251cf12401a9aa0b0c1327df722cd2e2701784..4449412532f30464189b7627c73e326f263eab24 100644
--- a/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/lib/Target/AArch64/AArch64InstrInfo.td
@@ -43,6 +43,11 @@ def IsBE             : Predicate<"!Subtarget->isLittleEndian()">;
 def UseAlternateSExtLoadCVTF32
     : Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
 
+def UseNegativeImmediates
+    : Predicate<"false">, AssemblerPredicate<"!FeatureNoNegativeImmediates",
+                                             "NegativeImmediates">;
+
+
 //===----------------------------------------------------------------------===//
 // AArch64-specific DAG Nodes.
 //
@@ -426,8 +431,10 @@ def MSRpstateImm1 : MSRpstateImm0_1;
 def MSRpstateImm4 : MSRpstateImm0_15;
 
 // The thread pointer (on Linux, at least, where this has been implemented) is
-// TPIDR_EL0.
-def : Pat<(AArch64threadpointer), (MRS 0xde82)>;
+// TPIDR_EL0.  Add pseudo op so we can mark it as not having any side effects.
+let hasSideEffects = 0 in
+def MOVbaseTLS : Pseudo<(outs GPR64:$dst), (ins),
+                       [(set GPR64:$dst, AArch64threadpointer)]>, Sched<[]>;
 
 // The cycle counter PMC register is PMCCNTR_EL0.
 let Predicates = [HasPerfMon] in
@@ -5031,7 +5038,7 @@ class SExtLoadi16CVTf64Pat<dag addrmode, dag INST>
                                0),
                              dsub)))>,
     Requires<[NotForCodeSize, UseAlternateSExtLoadCVTF32]>;
- 
+
 def : SExtLoadi16CVTf64Pat<(ro16.Wpat GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext),
                            (LDRHroW GPR64sp:$Rn, GPR32:$Rm, ro16.Wext:$ext)>;
 def : SExtLoadi16CVTf64Pat<(ro16.Xpat GPR64sp:$Rn, GPR64:$Rm, ro16.Xext:$ext),
diff --git a/lib/Target/AArch64/AArch64InstructionSelector.cpp b/lib/Target/AArch64/AArch64InstructionSelector.cpp
index 21d80ca9f812f9a98be1c78e2c0e64096b59069a..878dac6bff1e31002e992421d5e8bee215c8a153 100644
--- a/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/lib/Target/AArch64/AArch64InstructionSelector.cpp
@@ -12,17 +12,19 @@
 /// \todo This should be generated by TableGen.
 //===----------------------------------------------------------------------===//
 
-#include "AArch64InstructionSelector.h"
 #include "AArch64InstrInfo.h"
+#include "AArch64MachineFunctionInfo.h"
 #include "AArch64RegisterBankInfo.h"
 #include "AArch64RegisterInfo.h"
 #include "AArch64Subtarget.h"
 #include "AArch64TargetMachine.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/Debug.h"
@@ -36,13 +38,61 @@ using namespace llvm;
 #error "You shouldn't build this"
 #endif
 
+namespace {
+
+class AArch64InstructionSelector : public InstructionSelector {
+public:
+  AArch64InstructionSelector(const AArch64TargetMachine &TM,
+                             const AArch64Subtarget &STI,
+                             const AArch64RegisterBankInfo &RBI);
+
+  bool select(MachineInstr &I) const override;
+
+private:
+  /// tblgen-erated 'select' implementation, used as the initial selector for
+  /// the patterns that don't require complex C++.
+  bool selectImpl(MachineInstr &I) const;
+
+  bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
+                          MachineRegisterInfo &MRI) const;
+  bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
+                           MachineRegisterInfo &MRI) const;
+
+  bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
+                           MachineRegisterInfo &MRI) const;
+
+  bool selectArithImmed(MachineOperand &Root, MachineOperand &Result1,
+                        MachineOperand &Result2) const;
+
+  const AArch64TargetMachine &TM;
+  const AArch64Subtarget &STI;
+  const AArch64InstrInfo &TII;
+  const AArch64RegisterInfo &TRI;
+  const AArch64RegisterBankInfo &RBI;
+
+// We declare the temporaries used by selectImpl() in the class to minimize the
+// cost of constructing placeholder values.
+#define GET_GLOBALISEL_TEMPORARIES_DECL
+#include "AArch64GenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_DECL
+};
+
+} // end anonymous namespace
+
+#define GET_GLOBALISEL_IMPL
 #include "AArch64GenGlobalISel.inc"
+#undef GET_GLOBALISEL_IMPL
 
 AArch64InstructionSelector::AArch64InstructionSelector(
     const AArch64TargetMachine &TM, const AArch64Subtarget &STI,
     const AArch64RegisterBankInfo &RBI)
-  : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
-      TRI(*STI.getRegisterInfo()), RBI(RBI) {}
+    : InstructionSelector(), TM(TM), STI(STI), TII(*STI.getInstrInfo()),
+      TRI(*STI.getRegisterInfo()), RBI(RBI)
+#define GET_GLOBALISEL_TEMPORARIES_INIT
+#include "AArch64GenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_INIT
+{
+}
 
 // FIXME: This should be target-independent, inferred from the types declared
 // for each class in the bank.
@@ -440,6 +490,82 @@ static void changeFCMPPredToAArch64CC(CmpInst::Predicate P,
   }
 }
 
+bool AArch64InstructionSelector::selectCompareBranch(
+    MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
+
+  const unsigned CondReg = I.getOperand(0).getReg();
+  MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
+  MachineInstr *CCMI = MRI.getVRegDef(CondReg);
+  if (CCMI->getOpcode() != TargetOpcode::G_ICMP)
+    return false;
+
+  unsigned LHS = CCMI->getOperand(2).getReg();
+  unsigned RHS = CCMI->getOperand(3).getReg();
+  if (!getConstantVRegVal(RHS, MRI))
+    std::swap(RHS, LHS);
+
+  const auto RHSImm = getConstantVRegVal(RHS, MRI);
+  if (!RHSImm || *RHSImm != 0)
+    return false;
+
+  const RegisterBank &RB = *RBI.getRegBank(LHS, MRI, TRI);
+  if (RB.getID() != AArch64::GPRRegBankID)
+    return false;
+
+  const auto Pred = (CmpInst::Predicate)CCMI->getOperand(1).getPredicate();
+  if (Pred != CmpInst::ICMP_NE && Pred != CmpInst::ICMP_EQ)
+    return false;
+
+  const unsigned CmpWidth = MRI.getType(LHS).getSizeInBits();
+  unsigned CBOpc = 0;
+  if (CmpWidth <= 32)
+    CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZW : AArch64::CBNZW);
+  else if (CmpWidth == 64)
+    CBOpc = (Pred == CmpInst::ICMP_EQ ? AArch64::CBZX : AArch64::CBNZX);
+  else
+    return false;
+
+  auto MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(CBOpc))
+                 .addUse(LHS)
+                 .addMBB(DestMBB);
+
+  constrainSelectedInstRegOperands(*MIB.getInstr(), TII, TRI, RBI);
+  I.eraseFromParent();
+  return true;
+}
+
+bool AArch64InstructionSelector::selectVaStartAAPCS(
+    MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
+  return false;
+}
+
+bool AArch64InstructionSelector::selectVaStartDarwin(
+    MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
+  AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+  unsigned ListReg = I.getOperand(0).getReg();
+
+  unsigned ArgsAddrReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
+
+  auto MIB =
+      BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::ADDXri))
+          .addDef(ArgsAddrReg)
+          .addFrameIndex(FuncInfo->getVarArgsStackIndex())
+          .addImm(0)
+          .addImm(0);
+
+  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+
+  MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AArch64::STRXui))
+            .addUse(ArgsAddrReg)
+            .addUse(ListReg)
+            .addImm(0)
+            .addMemOperand(*I.memoperands_begin());
+
+  constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+  I.eraseFromParent();
+  return true;
+}
+
 bool AArch64InstructionSelector::select(MachineInstr &I) const {
   assert(I.getParent() && "Instruction should be in a basic block!");
   assert(I.getParent()->getParent() && "Instruction should be in a function!");
@@ -516,6 +642,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
     const unsigned CondReg = I.getOperand(0).getReg();
     MachineBasicBlock *DestMBB = I.getOperand(1).getMBB();
 
+    if (selectCompareBranch(I, MF, MRI))
+      return true;
+
     auto MIB = BuildMI(MBB, I, I.getDebugLoc(), TII.get(AArch64::TBNZW))
                    .addUse(CondReg)
                    .addImm(/*bit offset=*/0)
@@ -601,9 +730,12 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
       // FIXME: Is going through int64_t always correct?
       ImmOp.ChangeToImmediate(
           ImmOp.getFPImm()->getValueAPF().bitcastToAPInt().getZExtValue());
-    } else {
+    } else if (I.getOperand(1).isCImm()) {
       uint64_t Val = I.getOperand(1).getCImm()->getZExtValue();
       I.getOperand(1).ChangeToImmediate(Val);
+    } else if (I.getOperand(1).isImm()) {
+      uint64_t Val = I.getOperand(1).getImm();
+      I.getOperand(1).ChangeToImmediate(Val);
     }
 
     constrainSelectedInstRegOperands(I, TII, TRI, RBI);
@@ -658,10 +790,16 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
       return false;
     }
 
-#ifndef NDEBUG
-    // Sanity-check the pointer register.
+    auto &MemOp = **I.memoperands_begin();
+    if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
+      DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+      return false;
+    }
+
     const unsigned PtrReg = I.getOperand(1).getReg();
+#ifndef NDEBUG
     const RegisterBank &PtrRB = *RBI.getRegBank(PtrReg, MRI, TRI);
+    // Sanity-check the pointer register.
     assert(PtrRB.getID() == AArch64::GPRRegBankID &&
            "Load/Store pointer operand isn't a GPR");
     assert(MRI.getType(PtrReg).isPointer() &&
@@ -678,11 +816,46 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
 
     I.setDesc(TII.get(NewOpc));
 
-    I.addOperand(MachineOperand::CreateImm(0));
+    uint64_t Offset = 0;
+    auto *PtrMI = MRI.getVRegDef(PtrReg);
+
+    // Try to fold a GEP into our unsigned immediate addressing mode.
+    if (PtrMI->getOpcode() == TargetOpcode::G_GEP) {
+      if (auto COff = getConstantVRegVal(PtrMI->getOperand(2).getReg(), MRI)) {
+        int64_t Imm = *COff;
+        const unsigned Size = MemTy.getSizeInBits() / 8;
+        const unsigned Scale = Log2_32(Size);
+        if ((Imm & (Size - 1)) == 0 && Imm >= 0 && Imm < (0x1000 << Scale)) {
+          unsigned Ptr2Reg = PtrMI->getOperand(1).getReg();
+          I.getOperand(1).setReg(Ptr2Reg);
+          PtrMI = MRI.getVRegDef(Ptr2Reg);
+          Offset = Imm / Size;
+        }
+      }
+    }
+
+    // If we haven't folded anything into our addressing mode yet, try to fold
+    // a frame index into the base+offset.
+    if (!Offset && PtrMI->getOpcode() == TargetOpcode::G_FRAME_INDEX)
+      I.getOperand(1).ChangeToFrameIndex(PtrMI->getOperand(1).getIndex());
+
+    I.addOperand(MachineOperand::CreateImm(Offset));
+
+    // If we're storing a 0, use WZR/XZR.
+    if (auto CVal = getConstantVRegVal(ValReg, MRI)) {
+      if (*CVal == 0 && Opcode == TargetOpcode::G_STORE) {
+        if (I.getOpcode() == AArch64::STRWui)
+          I.getOperand(0).setReg(AArch64::WZR);
+        else if (I.getOpcode() == AArch64::STRXui)
+          I.getOperand(0).setReg(AArch64::XZR);
+      }
+    }
+
     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
   }
 
-  case TargetOpcode::G_MUL: {
+  case TargetOpcode::G_SMULH:
+  case TargetOpcode::G_UMULH: {
     // Reject the various things we don't support yet.
     if (unsupportedBinOp(I, RBI, MRI, TRI))
       return false;
@@ -691,33 +864,24 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
     const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
 
     if (RB.getID() != AArch64::GPRRegBankID) {
-      DEBUG(dbgs() << "G_MUL on bank: " << RB << ", expected: GPR\n");
+      DEBUG(dbgs() << "G_[SU]MULH on bank: " << RB << ", expected: GPR\n");
       return false;
     }
 
-    unsigned ZeroReg;
-    unsigned NewOpc;
-    if (Ty.isScalar() && Ty.getSizeInBits() <= 32) {
-      NewOpc = AArch64::MADDWrrr;
-      ZeroReg = AArch64::WZR;
-    } else if (Ty == LLT::scalar(64)) {
-      NewOpc = AArch64::MADDXrrr;
-      ZeroReg = AArch64::XZR;
-    } else {
-      DEBUG(dbgs() << "G_MUL has type: " << Ty << ", expected: "
-                   << LLT::scalar(32) << " or " << LLT::scalar(64) << '\n');
+    if (Ty != LLT::scalar(64)) {
+      DEBUG(dbgs() << "G_[SU]MULH has type: " << Ty
+                   << ", expected: " << LLT::scalar(64) << '\n');
       return false;
     }
 
+    unsigned NewOpc = I.getOpcode() == TargetOpcode::G_SMULH ? AArch64::SMULHrr
+                                                             : AArch64::UMULHrr;
     I.setDesc(TII.get(NewOpc));
 
-    I.addOperand(MachineOperand::CreateReg(ZeroReg, /*isDef=*/false));
-
     // Now that we selected an opcode, we need to constrain the register
     // operands to use appropriate classes.
     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
   }
-
   case TargetOpcode::G_FADD:
   case TargetOpcode::G_FSUB:
   case TargetOpcode::G_FMUL:
@@ -749,6 +913,17 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
     return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
   }
 
+  case TargetOpcode::G_PTR_MASK: {
+    uint64_t Align = I.getOperand(2).getImm();
+    if (Align >= 64 || Align == 0)
+      return false;
+
+    uint64_t Mask = ~((1ULL << Align) - 1);
+    I.setDesc(TII.get(AArch64::ANDXri));
+    I.getOperand(2).setImm(AArch64_AM::encodeLogicalImmediate(Mask, 64));
+
+    return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+  }
   case TargetOpcode::G_PTRTOINT:
   case TargetOpcode::G_TRUNC: {
     const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
@@ -1125,7 +1300,69 @@ bool AArch64InstructionSelector::select(MachineInstr &I) const {
     I.eraseFromParent();
     return true;
   }
+  case TargetOpcode::G_VASTART:
+    return STI.isTargetDarwin() ? selectVaStartDarwin(I, MF, MRI)
+                                : selectVaStartAAPCS(I, MF, MRI);
   }
 
   return false;
 }
+
+/// selectArithImmed - Select an immediate value that can be represented as
+/// a 12-bit value shifted left by either 0 or 12.  If so, return true with
+/// Result1 set to the 12-bit value and Result2 set to the shifter operand.
+bool AArch64InstructionSelector::selectArithImmed(
+    MachineOperand &Root, MachineOperand &Result1,
+    MachineOperand &Result2) const {
+  MachineInstr &MI = *Root.getParent();
+  MachineBasicBlock &MBB = *MI.getParent();
+  MachineFunction &MF = *MBB.getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  // This function is called from the addsub_shifted_imm ComplexPattern,
+  // which lists [imm] as the list of opcodes it's interested in. However,
+  // we still need to check whether the operand is actually an immediate
+  // here because the ComplexPattern opcode list is only used in
+  // root-level opcode matching.
+  uint64_t Immed;
+  if (Root.isImm())
+    Immed = Root.getImm();
+  else if (Root.isCImm())
+    Immed = Root.getCImm()->getZExtValue();
+  else if (Root.isReg()) {
+    MachineInstr *Def = MRI.getVRegDef(Root.getReg());
+    if (Def->getOpcode() != TargetOpcode::G_CONSTANT)
+      return false;
+    MachineOperand &Op1 = Def->getOperand(1);
+    if (!Op1.isCImm() || Op1.getCImm()->getBitWidth() > 64)
+      return false;
+    Immed = Op1.getCImm()->getZExtValue();
+  } else
+    return false;
+
+  unsigned ShiftAmt;
+
+  if (Immed >> 12 == 0) {
+    ShiftAmt = 0;
+  } else if ((Immed & 0xfff) == 0 && Immed >> 24 == 0) {
+    ShiftAmt = 12;
+    Immed = Immed >> 12;
+  } else
+    return false;
+
+  unsigned ShVal = AArch64_AM::getShifterImm(AArch64_AM::LSL, ShiftAmt);
+  Result1.ChangeToImmediate(Immed);
+  Result1.clearParent();
+  Result2.ChangeToImmediate(ShVal);
+  Result2.clearParent();
+  return true;
+}
+
+namespace llvm {
+InstructionSelector *
+createAArch64InstructionSelector(const AArch64TargetMachine &TM,
+                                 AArch64Subtarget &Subtarget,
+                                 AArch64RegisterBankInfo &RBI) {
+  return new AArch64InstructionSelector(TM, Subtarget, RBI);
+}
+}
diff --git a/lib/Target/AArch64/AArch64InstructionSelector.h b/lib/Target/AArch64/AArch64InstructionSelector.h
deleted file mode 100644
index 2c6e5a912fb77d1d714c6076a10f181b33819d78..0000000000000000000000000000000000000000
--- a/lib/Target/AArch64/AArch64InstructionSelector.h
+++ /dev/null
@@ -1,49 +0,0 @@
-//===- AArch64InstructionSelector --------------------------------*- C++ -*-==//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-/// \file
-/// This file declares the targeting of the InstructionSelector class for
-/// AArch64.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H
-#define LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H
-
-#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
-
-namespace llvm {
-
-class AArch64InstrInfo;
-class AArch64RegisterBankInfo;
-class AArch64RegisterInfo;
-class AArch64Subtarget;
-class AArch64TargetMachine;
-
-class AArch64InstructionSelector : public InstructionSelector {
-public:
-  AArch64InstructionSelector(const AArch64TargetMachine &TM,
-                             const AArch64Subtarget &STI,
-                             const AArch64RegisterBankInfo &RBI);
-
-  bool select(MachineInstr &I) const override;
-
-private:
-  /// tblgen-erated 'select' implementation, used as the initial selector for
-  /// the patterns that don't require complex C++.
-  bool selectImpl(MachineInstr &I) const;
-
-  const AArch64TargetMachine &TM;
-  const AArch64Subtarget &STI;
-  const AArch64InstrInfo &TII;
-  const AArch64RegisterInfo &TRI;
-  const AArch64RegisterBankInfo &RBI;
-};
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_AARCH64_AARCH64INSTRUCTIONSELECTOR_H
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
index 86037c97731c4c457e6e0fada7153fa6621e2230..6e6daf8122951526b1c0973a964358985ce66f99 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.cpp
@@ -13,7 +13,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64LegalizerInfo.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/Target/TargetOpcodes.h"
@@ -52,6 +55,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
   for (auto Ty : {s1, s8, s16, s32})
     setAction({G_GEP, 1, Ty}, WidenScalar);
 
+  setAction({G_PTR_MASK, p0}, Legal);
+
   for (unsigned BinOp : {G_LSHR, G_ASHR, G_SDIV, G_UDIV}) {
     for (auto Ty : {s32, s64})
       setAction({BinOp, Ty}, Legal);
@@ -64,7 +69,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
     for (auto Ty : { s1, s8, s16, s32, s64 })
       setAction({BinOp, Ty}, Lower);
 
-  for (unsigned Op : {G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_SMULO, G_UMULO}) {
+  for (unsigned Op : {G_SMULO, G_UMULO})
+      setAction({Op, s64}, Lower);
+
+  for (unsigned Op : {G_UADDE, G_USUBE, G_SADDO, G_SSUBO, G_SMULH, G_UMULH}) {
     for (auto Ty : { s32, s64 })
       setAction({Op, Ty}, Legal);
 
@@ -75,8 +83,21 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
     for (auto Ty : {s32, s64})
       setAction({BinOp, Ty}, Legal);
 
-  setAction({G_FREM, s32}, Libcall);
-  setAction({G_FREM, s64}, Libcall);
+  for (unsigned BinOp : {G_FREM, G_FPOW}) {
+    setAction({BinOp, s32}, Libcall);
+    setAction({BinOp, s64}, Libcall);
+  }
+
+  for (auto Ty : {s32, s64, p0}) {
+    setAction({G_INSERT, Ty}, Legal);
+    setAction({G_INSERT, 1, Ty}, Legal);
+  }
+  for (auto Ty : {s1, s8, s16}) {
+    setAction({G_INSERT, Ty}, WidenScalar);
+    setAction({G_INSERT, 1, Ty}, Legal);
+    // FIXME: Can't widen the sources because that violates the constraints on
+    // G_INSERT (It seems entirely reasonable that inputs shouldn't overlap).
+  }
 
   for (unsigned MemOp : {G_LOAD, G_STORE}) {
     for (auto Ty : {s8, s16, s32, s64, p0, v2s32})
@@ -170,7 +191,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
   setAction({G_BRINDIRECT, p0}, Legal);
 
   // Select
-  for (auto Ty : {s1, s8, s16, s32, s64, p0})
+  for (auto Ty : {s1, s8, s16})
+    setAction({G_SELECT, Ty}, WidenScalar);
+
+  for (auto Ty : {s32, s64, p0})
     setAction({G_SELECT, Ty}, Legal);
 
   setAction({G_SELECT, 1, s1}, Legal);
@@ -210,5 +234,82 @@ AArch64LegalizerInfo::AArch64LegalizerInfo() {
     setAction({G_BITCAST, 1, LLT::vector(32/EltSize, EltSize)}, Legal);
   }
 
+  setAction({G_VASTART, p0}, Legal);
+
+  // va_list must be a pointer, but most sized types are pretty easy to handle
+  // as the destination.
+  setAction({G_VAARG, 1, p0}, Legal);
+
+  for (auto Ty : {s8, s16, s32, s64, p0})
+    setAction({G_VAARG, Ty}, Custom);
+
   computeTables();
 }
+
+bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI,
+                                          MachineRegisterInfo &MRI,
+                                          MachineIRBuilder &MIRBuilder) const {
+  switch (MI.getOpcode()) {
+  default:
+    // No idea what to do.
+    return false;
+  case TargetOpcode::G_VAARG:
+    return legalizeVaArg(MI, MRI, MIRBuilder);
+  }
+
+  llvm_unreachable("expected switch to return");
+}
+
+bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
+                                         MachineRegisterInfo &MRI,
+                                         MachineIRBuilder &MIRBuilder) const {
+  MIRBuilder.setInstr(MI);
+  MachineFunction &MF = MIRBuilder.getMF();
+  unsigned Align = MI.getOperand(2).getImm();
+  unsigned Dst = MI.getOperand(0).getReg();
+  unsigned ListPtr = MI.getOperand(1).getReg();
+
+  LLT PtrTy = MRI.getType(ListPtr);
+  LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
+
+  const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
+  unsigned List = MRI.createGenericVirtualRegister(PtrTy);
+  MIRBuilder.buildLoad(
+      List, ListPtr,
+      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
+                               PtrSize, /* Align = */ PtrSize));
+
+  unsigned DstPtr;
+  if (Align > PtrSize) {
+    // Realign the list to the actual required alignment.
+    unsigned AlignMinus1 = MRI.createGenericVirtualRegister(IntPtrTy);
+    MIRBuilder.buildConstant(AlignMinus1, Align - 1);
+
+    unsigned ListTmp = MRI.createGenericVirtualRegister(PtrTy);
+    MIRBuilder.buildGEP(ListTmp, List, AlignMinus1);
+
+    DstPtr = MRI.createGenericVirtualRegister(PtrTy);
+    MIRBuilder.buildPtrMask(DstPtr, ListTmp, Log2_64(Align));
+  } else
+    DstPtr = List;
+
+  uint64_t ValSize = MRI.getType(Dst).getSizeInBits() / 8;
+  MIRBuilder.buildLoad(
+      Dst, DstPtr,
+      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
+                               ValSize, std::max(Align, PtrSize)));
+
+  unsigned SizeReg = MRI.createGenericVirtualRegister(IntPtrTy);
+  MIRBuilder.buildConstant(SizeReg, alignTo(ValSize, PtrSize));
+
+  unsigned NewList = MRI.createGenericVirtualRegister(PtrTy);
+  MIRBuilder.buildGEP(NewList, DstPtr, SizeReg);
+
+  MIRBuilder.buildStore(
+      NewList, ListPtr,
+      *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOStore,
+                               PtrSize, /* Align = */ PtrSize));
+
+  MI.eraseFromParent();
+  return true;
+}
diff --git a/lib/Target/AArch64/AArch64LegalizerInfo.h b/lib/Target/AArch64/AArch64LegalizerInfo.h
index feacbef9f147b002ddb69800ff719180488308a3..42d4ac130c5c83f3d9891bbc1fc3eec99e3c56b5 100644
--- a/lib/Target/AArch64/AArch64LegalizerInfo.h
+++ b/lib/Target/AArch64/AArch64LegalizerInfo.h
@@ -25,6 +25,13 @@ class LLVMContext;
 class AArch64LegalizerInfo : public LegalizerInfo {
 public:
   AArch64LegalizerInfo();
+
+  bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
+                      MachineIRBuilder &MIRBuilder) const override;
+
+private:
+  bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI,
+                     MachineIRBuilder &MIRBuilder) const;
 };
 } // End llvm namespace.
 #endif
diff --git a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 443a14f7068bd1f2dca64c09016349ea42e33b2c..976498aa70d6d87d00f6b2d843739925d6246c5d 100644
--- a/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -93,6 +93,7 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
     initializeAArch64LoadStoreOptPass(*PassRegistry::getPassRegistry());
   }
 
+  AliasAnalysis *AA;
   const AArch64InstrInfo *TII;
   const TargetRegisterInfo *TRI;
   const AArch64Subtarget *Subtarget;
@@ -100,6 +101,11 @@ struct AArch64LoadStoreOpt : public MachineFunctionPass {
   // Track which registers have been modified and used.
   BitVector ModifiedRegs, UsedRegs;
 
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AAResultsWrapperPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
   // Scan the instructions looking for a load/store that can be combined
   // with the current instruction into a load/store pair.
   // Return the matching instruction if one is found, else MBB->end().
@@ -866,9 +872,11 @@ AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
               .addImm(Imms);
     }
   }
-  StoreI->clearRegisterKills(StRt, TRI);
 
-  (void)BitExtMI;
+  // Clear kill flags between store and load.
+  for (MachineInstr &MI : make_range(StoreI->getIterator(),
+                                     BitExtMI->getIterator()))
+    MI.clearRegisterKills(StRt, TRI);
 
   DEBUG(dbgs() << "Promoting load by replacing :\n    ");
   DEBUG(StoreI->print(dbgs()));
@@ -934,7 +942,7 @@ static int alignTo(int Num, int PowOf2) {
 }
 
 static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
-                     const AArch64InstrInfo *TII) {
+                     AliasAnalysis *AA) {
   // One of the instructions must modify memory.
   if (!MIa.mayStore() && !MIb.mayStore())
     return false;
@@ -943,14 +951,14 @@ static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
   if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
     return false;
 
-  return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
+  return MIa.mayAlias(AA, MIb, /*UseTBAA*/false);
 }
 
 static bool mayAlias(MachineInstr &MIa,
                      SmallVectorImpl<MachineInstr *> &MemInsns,
-                     const AArch64InstrInfo *TII) {
+                     AliasAnalysis *AA) {
   for (MachineInstr *MIb : MemInsns)
-    if (mayAlias(MIa, *MIb, TII))
+    if (mayAlias(MIa, *MIb, AA))
       return true;
 
   return false;
@@ -1008,7 +1016,7 @@ bool AArch64LoadStoreOpt::findMatchingStore(
       return false;
 
     // If we encounter a store aliased with the load, return early.
-    if (MI.mayStore() && mayAlias(LoadMI, MI, TII))
+    if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
       return false;
   } while (MBBI != B && Count < Limit);
   return false;
@@ -1178,7 +1186,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
         // first.
         if (!ModifiedRegs[getLdStRegOp(MI).getReg()] &&
             !(MI.mayLoad() && UsedRegs[getLdStRegOp(MI).getReg()]) &&
-            !mayAlias(MI, MemInsns, TII)) {
+            !mayAlias(MI, MemInsns, AA)) {
           Flags.setMergeForward(false);
           return MBBI;
         }
@@ -1189,7 +1197,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
         // into the second.
         if (!ModifiedRegs[getLdStRegOp(FirstMI).getReg()] &&
             !(MayLoad && UsedRegs[getLdStRegOp(FirstMI).getReg()]) &&
-            !mayAlias(FirstMI, MemInsns, TII)) {
+            !mayAlias(FirstMI, MemInsns, AA)) {
           Flags.setMergeForward(true);
           return MBBI;
         }
@@ -1732,6 +1740,7 @@ bool AArch64LoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   Subtarget = &static_cast<const AArch64Subtarget &>(Fn.getSubtarget());
   TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
   TRI = Subtarget->getRegisterInfo();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 
   // Resize the modified and used register bitfield trackers.  We do this once
   // per function and then clear the bitfield each time we optimize a load or
diff --git a/lib/Target/AArch64/AArch64MacroFusion.cpp b/lib/Target/AArch64/AArch64MacroFusion.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a6926a6700e187299d9de480eb9b4a6e1625bdab
--- /dev/null
+++ b/lib/Target/AArch64/AArch64MacroFusion.cpp
@@ -0,0 +1,272 @@
+//===- AArch64MacroFusion.cpp - AArch64 Macro Fusion ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// \file This file contains the AArch64 implementation of the DAG scheduling
+// mutation to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MacroFusion.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define DEBUG_TYPE "misched"
+
+STATISTIC(NumFused, "Number of instr pairs fused");
+
+using namespace llvm;
+
+static cl::opt<bool> EnableMacroFusion("aarch64-misched-fusion", cl::Hidden,
+  cl::desc("Enable scheduling for macro fusion."), cl::init(true));
+
+namespace {
+
+/// \brief Verify that the instr pair, FirstMI and SecondMI, should be fused
+/// together.  Given an anchor instr, when the other instr is unspecified, then
+/// check if the anchor instr may be part of a fused pair at all.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+                                   const TargetSubtargetInfo &TSI,
+                                   const MachineInstr *FirstMI,
+                                   const MachineInstr *SecondMI) {
+  assert((FirstMI || SecondMI) && "At least one instr must be specified");
+
+  const AArch64InstrInfo &II = static_cast<const AArch64InstrInfo&>(TII);
+  const AArch64Subtarget &ST = static_cast<const AArch64Subtarget&>(TSI);
+
+  // Assume wildcards for unspecified instrs.
+  unsigned FirstOpcode =
+    FirstMI ? FirstMI->getOpcode()
+	    : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END);
+  unsigned SecondOpcode =
+    SecondMI ? SecondMI->getOpcode()
+             : static_cast<unsigned>(AArch64::INSTRUCTION_LIST_END);
+
+  if (ST.hasArithmeticBccFusion())
+    // Fuse CMN, CMP, TST followed by Bcc.
+    if (SecondOpcode == AArch64::Bcc)
+      switch (FirstOpcode) {
+      default:
+        return false;
+      case AArch64::ADDSWri:
+      case AArch64::ADDSWrr:
+      case AArch64::ADDSXri:
+      case AArch64::ADDSXrr:
+      case AArch64::ANDSWri:
+      case AArch64::ANDSWrr:
+      case AArch64::ANDSXri:
+      case AArch64::ANDSXrr:
+      case AArch64::SUBSWri:
+      case AArch64::SUBSWrr:
+      case AArch64::SUBSXri:
+      case AArch64::SUBSXrr:
+      case AArch64::BICSWrr:
+      case AArch64::BICSXrr:
+        return true;
+      case AArch64::ADDSWrs:
+      case AArch64::ADDSXrs:
+      case AArch64::ANDSWrs:
+      case AArch64::ANDSXrs:
+      case AArch64::SUBSWrs:
+      case AArch64::SUBSXrs:
+      case AArch64::BICSWrs:
+      case AArch64::BICSXrs:
+        // Shift value can be 0 making these behave like the "rr" variant...
+        return !II.hasShiftedReg(*FirstMI);
+      case AArch64::INSTRUCTION_LIST_END:
+        return true;
+      }
+
+  if (ST.hasArithmeticCbzFusion())
+    // Fuse ALU operations followed by CBZ/CBNZ.
+    if (SecondOpcode == AArch64::CBNZW || SecondOpcode == AArch64::CBNZX ||
+        SecondOpcode == AArch64::CBZW || SecondOpcode == AArch64::CBZX)
+      switch (FirstOpcode) {
+      default:
+        return false;
+      case AArch64::ADDWri:
+      case AArch64::ADDWrr:
+      case AArch64::ADDXri:
+      case AArch64::ADDXrr:
+      case AArch64::ANDWri:
+      case AArch64::ANDWrr:
+      case AArch64::ANDXri:
+      case AArch64::ANDXrr:
+      case AArch64::EORWri:
+      case AArch64::EORWrr:
+      case AArch64::EORXri:
+      case AArch64::EORXrr:
+      case AArch64::ORRWri:
+      case AArch64::ORRWrr:
+      case AArch64::ORRXri:
+      case AArch64::ORRXrr:
+      case AArch64::SUBWri:
+      case AArch64::SUBWrr:
+      case AArch64::SUBXri:
+      case AArch64::SUBXrr:
+        return true;
+      case AArch64::ADDWrs:
+      case AArch64::ADDXrs:
+      case AArch64::ANDWrs:
+      case AArch64::ANDXrs:
+      case AArch64::SUBWrs:
+      case AArch64::SUBXrs:
+      case AArch64::BICWrs:
+      case AArch64::BICXrs:
+        // Shift value can be 0 making these behave like the "rr" variant...
+        return !II.hasShiftedReg(*FirstMI);
+      case AArch64::INSTRUCTION_LIST_END:
+        return true;
+      }
+
+  if (ST.hasFuseAES())
+    // Fuse AES crypto operations.
+    switch(FirstOpcode) {
+    // AES encode.
+    case AArch64::AESErr:
+      return SecondOpcode == AArch64::AESMCrr ||
+             SecondOpcode == AArch64::INSTRUCTION_LIST_END;
+    // AES decode.
+    case AArch64::AESDrr:
+      return SecondOpcode == AArch64::AESIMCrr ||
+             SecondOpcode == AArch64::INSTRUCTION_LIST_END;
+    }
+
+  if (ST.hasFuseLiterals())
+    // Fuse literal generation operations.
+    switch (FirstOpcode) {
+    // PC relative address.
+    case AArch64::ADRP:
+      return SecondOpcode == AArch64::ADDXri ||
+             SecondOpcode == AArch64::INSTRUCTION_LIST_END;
+    // 32 bit immediate.
+    case AArch64::MOVZWi:
+      return (SecondOpcode == AArch64::MOVKWi &&
+              SecondMI->getOperand(3).getImm() == 16) ||
+             SecondOpcode == AArch64::INSTRUCTION_LIST_END;
+    // Lower half of 64 bit immediate.
+    case AArch64::MOVZXi:
+      return (SecondOpcode == AArch64::MOVKXi &&
+              SecondMI->getOperand(3).getImm() == 16) ||
+             SecondOpcode == AArch64::INSTRUCTION_LIST_END;
+    // Upper half of 64 bit immediate.
+    case AArch64::MOVKXi:
+      return FirstMI->getOperand(3).getImm() == 32 &&
+             ((SecondOpcode == AArch64::MOVKXi &&
+               SecondMI->getOperand(3).getImm() == 48) ||
+              SecondOpcode == AArch64::INSTRUCTION_LIST_END);
+    }
+
+  return false;
+}
+
+/// \brief Implement the fusion of instr pairs in the scheduling DAG,
+/// anchored at the instr in AnchorSU.
+static bool scheduleAdjacentImpl(ScheduleDAGMI *DAG, SUnit &AnchorSU) {
+  const MachineInstr *AnchorMI = AnchorSU.getInstr();
+  if (!AnchorMI || AnchorMI->isPseudo() || AnchorMI->isTransient())
+    return false;
+
+  // If the anchor instr is the ExitSU, then consider its predecessors;
+  // otherwise, its successors.
+  bool Preds = (&AnchorSU == &DAG->ExitSU);
+  SmallVectorImpl<SDep> &AnchorDeps = Preds ? AnchorSU.Preds : AnchorSU.Succs;
+
+  const MachineInstr *FirstMI = Preds ? nullptr : AnchorMI;
+  const MachineInstr *SecondMI = Preds ? AnchorMI : nullptr;
+
+  // Check if the anchor instr may be fused.
+  if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(),
+                              FirstMI, SecondMI))
+    return false;
+
+  // Explore for fusion candidates among the dependencies of the anchor instr.
+  for (SDep &Dep : AnchorDeps) {
+    // Ignore dependencies that don't enforce ordering.
+    if (Dep.isWeak())
+      continue;
+
+    SUnit &DepSU = *Dep.getSUnit();
+    // Ignore the ExitSU if the dependents are successors.
+    if (!Preds && &DepSU == &DAG->ExitSU)
+      continue;
+
+    const MachineInstr *DepMI = DepSU.getInstr();
+    if (!DepMI || DepMI->isPseudo() || DepMI->isTransient())
+      continue;
+
+    FirstMI = Preds ? DepMI : AnchorMI;
+    SecondMI = Preds ? AnchorMI : DepMI;
+    if (!shouldScheduleAdjacent(*DAG->TII, DAG->MF.getSubtarget(),
+                                FirstMI, SecondMI))
+      continue;
+
+    // Create a single weak edge between the adjacent instrs. The only effect is
+    // to cause bottom-up scheduling to heavily prioritize the clustered instrs.
+    SUnit &FirstSU = Preds ? DepSU : AnchorSU;
+    SUnit &SecondSU = Preds ? AnchorSU : DepSU;
+    DAG->addEdge(&SecondSU, SDep(&FirstSU, SDep::Cluster));
+
+    // Adjust the latency between the anchor instr and its
+    // predecessors/successors.
+    for (SDep &IDep : AnchorDeps)
+      if (IDep.getSUnit() == &DepSU)
+        IDep.setLatency(0);
+
+    // Adjust the latency between the dependent instr and its
+    // successors/predecessors.
+    for (SDep &IDep : Preds ? DepSU.Succs : DepSU.Preds)
+      if (IDep.getSUnit() == &AnchorSU)
+        IDep.setLatency(0);
+
+    DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse ";
+          FirstSU.print(dbgs(), DAG); dbgs() << " - ";
+          SecondSU.print(dbgs(), DAG); dbgs() << " /  ";
+          dbgs() << DAG->TII->getName(FirstMI->getOpcode()) << " - " <<
+                    DAG->TII->getName(SecondMI->getOpcode()) << '\n'; );
+
+    ++NumFused;
+    return true;
+  }
+
+  return false;
+}
+
+/// \brief Post-process the DAG to create cluster edges between instrs that may
+/// be fused by the processor into a single operation.
+class AArch64MacroFusion : public ScheduleDAGMutation {
+public:
+  AArch64MacroFusion() {}
+
+  void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+void AArch64MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
+  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+
+  // For each of the SUnits in the scheduling block, try to fuse the instr in it
+  // with one in its successors.
+  for (SUnit &ISU : DAG->SUnits)
+    scheduleAdjacentImpl(DAG, ISU);
+
+  // Try to fuse the instr in the ExitSU with one in its predecessors.
+  scheduleAdjacentImpl(DAG, DAG->ExitSU);
+}
+
+} // end namespace
+
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation () {
+  return EnableMacroFusion ? make_unique<AArch64MacroFusion>() : nullptr;
+}
+
+} // end namespace llvm
diff --git a/lib/Target/AArch64/AArch64MacroFusion.h b/lib/Target/AArch64/AArch64MacroFusion.h
new file mode 100644
index 0000000000000000000000000000000000000000..e5efedd9fbfd9a5ec67b69deadffcaa93b0621c9
--- /dev/null
+++ b/lib/Target/AArch64/AArch64MacroFusion.h
@@ -0,0 +1,29 @@
+//===- AArch64MacroFusion.h - AArch64 Macro Fusion ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the AArch64 definition of the DAG scheduling mutation
+// to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+
+//===----------------------------------------------------------------------===//
+// AArch64MacroFusion - DAG post-processing to encourage fusion of macro ops.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+/// Note that you have to add:
+///   DAG.addMutation(createAArch64MacroFusionDAGMutation());
+/// to AArch64PassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation> createAArch64MacroFusionDAGMutation();
+
+} // llvm
diff --git a/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
index 45fd358a99bef96057f7682def7be4615bb53bd0..f3c8e7e9bdc2b4002eed03d3b7d4df20c0dcbccb 100644
--- a/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
+++ b/lib/Target/AArch64/AArch64RedundantCopyElimination.cpp
@@ -12,13 +12,14 @@
 //    CBZW %W0, <BB#2>
 //  BB#2:
 //    %W0 = COPY %WZR
-// This pass should be run after register allocation.
+// Similarly, this pass also handles non-zero copies.
+//  BB#0:
+//    cmp x0, #1
+//    b.eq .LBB0_1
+//  .LBB0_1:
+//    orr x0, xzr, #0x1
 //
-// FIXME: This should be extended to handle any constant other than zero. E.g.,
-//   cmp w0, #1
-//     b.eq .BB1
-//   BB1:
-//     mov w0, #1
+// This pass should be run after register allocation.
 //
 // FIXME: This could also be extended to check the whole dominance subtree below
 // the comparison if the compile time regression is acceptable.
@@ -26,6 +27,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/iterator_range.h"
@@ -43,6 +45,7 @@ namespace {
 class AArch64RedundantCopyElimination : public MachineFunctionPass {
   const MachineRegisterInfo *MRI;
   const TargetRegisterInfo *TRI;
+  BitVector ClobberedRegs;
 
 public:
   static char ID;
@@ -50,6 +53,16 @@ public:
     initializeAArch64RedundantCopyEliminationPass(
         *PassRegistry::getPassRegistry());
   }
+
+  struct RegImm {
+    MCPhysReg Reg;
+    int32_t Imm;
+    RegImm(MCPhysReg Reg, int32_t Imm) : Reg(Reg), Imm(Imm) {}
+  };
+
+  Optional<RegImm> knownRegValInBlock(MachineInstr &CondBr,
+                                      MachineBasicBlock *MBB,
+                                      MachineBasicBlock::iterator &FirstUse);
   bool optimizeCopy(MachineBasicBlock *MBB);
   bool runOnMachineFunction(MachineFunction &MF) override;
   MachineFunctionProperties getRequiredProperties() const override {
@@ -66,14 +79,120 @@ char AArch64RedundantCopyElimination::ID = 0;
 INITIALIZE_PASS(AArch64RedundantCopyElimination, "aarch64-copyelim",
                 "AArch64 redundant copy elimination pass", false, false)
 
-static bool guaranteesZeroRegInBlock(MachineInstr &MI, MachineBasicBlock *MBB) {
-  unsigned Opc = MI.getOpcode();
+/// Remember what registers the specified instruction modifies.
+static void trackRegDefs(const MachineInstr &MI, BitVector &ClobberedRegs,
+                         const TargetRegisterInfo *TRI) {
+  for (const MachineOperand &MO : MI.operands()) {
+    if (MO.isRegMask()) {
+      ClobberedRegs.setBitsNotInMask(MO.getRegMask());
+      continue;
+    }
+
+    if (!MO.isReg())
+      continue;
+    unsigned Reg = MO.getReg();
+    if (!Reg)
+      continue;
+    if (!MO.isDef())
+      continue;
+
+    for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
+      ClobberedRegs.set(*AI);
+  }
+}
+
+/// It's possible to determine the value of a register based on a dominating
+/// condition.  To do so, this function checks to see if the basic block \p MBB
+/// is the target to which a conditional branch \p CondBr jumps and whose
+/// equality comparison is against a constant.  If so, return a known physical
+/// register and constant value pair.  Otherwise, return None.
+Optional<AArch64RedundantCopyElimination::RegImm>
+AArch64RedundantCopyElimination::knownRegValInBlock(
+    MachineInstr &CondBr, MachineBasicBlock *MBB,
+    MachineBasicBlock::iterator &FirstUse) {
+  unsigned Opc = CondBr.getOpcode();
+
   // Check if the current basic block is the target block to which the
   // CBZ/CBNZ instruction jumps when its Wt/Xt is zero.
-  return ((Opc == AArch64::CBZW || Opc == AArch64::CBZX) &&
-          MBB == MI.getOperand(1).getMBB()) ||
-         ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) &&
-          MBB != MI.getOperand(1).getMBB());
+  if (((Opc == AArch64::CBZW || Opc == AArch64::CBZX) &&
+       MBB == CondBr.getOperand(1).getMBB()) ||
+      ((Opc == AArch64::CBNZW || Opc == AArch64::CBNZX) &&
+       MBB != CondBr.getOperand(1).getMBB())) {
+    FirstUse = CondBr;
+    return RegImm(CondBr.getOperand(0).getReg(), 0);
+  }
+
+  // Otherwise, must be a conditional branch.
+  if (Opc != AArch64::Bcc)
+    return None;
+
+  // Must be an equality check (i.e., == or !=).
+  AArch64CC::CondCode CC = (AArch64CC::CondCode)CondBr.getOperand(0).getImm();
+  if (CC != AArch64CC::EQ && CC != AArch64CC::NE)
+    return None;
+
+  MachineBasicBlock *BrTarget = CondBr.getOperand(1).getMBB();
+  if ((CC == AArch64CC::EQ && BrTarget != MBB) ||
+      (CC == AArch64CC::NE && BrTarget == MBB))
+    return None;
+
+  // Stop if we get to the beginning of PredMBB.
+  MachineBasicBlock *PredMBB = *MBB->pred_begin();
+  assert(PredMBB == CondBr.getParent() &&
+         "Conditional branch not in predecessor block!");
+  if (CondBr == PredMBB->begin())
+    return None;
+
+  // Registers clobbered in PredMBB between CondBr instruction and current
+  // instruction being checked in loop.
+  ClobberedRegs.reset();
+
+  // Find compare instruction that sets NZCV used by CondBr.
+  MachineBasicBlock::reverse_iterator RIt = CondBr.getReverseIterator();
+  for (MachineInstr &PredI : make_range(std::next(RIt), PredMBB->rend())) {
+
+    // Track clobbered registers.
+    trackRegDefs(PredI, ClobberedRegs, TRI);
+
+    bool IsCMN = false;
+    switch (PredI.getOpcode()) {
+    default:
+      break;
+
+    // CMN is an alias for ADDS with a dead destination register.
+    case AArch64::ADDSWri:
+    case AArch64::ADDSXri:
+      IsCMN = true;
+    // CMP is an alias for SUBS with a dead destination register.
+    case AArch64::SUBSWri:
+    case AArch64::SUBSXri: {
+      MCPhysReg SrcReg = PredI.getOperand(1).getReg();
+
+      // Must not be a symbolic immediate.
+      if (!PredI.getOperand(2).isImm())
+        return None;
+
+      // The src register must not be modified between the cmp and conditional
+      // branch.  This includes a self-clobbering compare.
+      if (ClobberedRegs[SrcReg])
+        return None;
+
+      // We've found the Cmp that sets NZCV.
+      int32_t KnownImm = PredI.getOperand(2).getImm();
+      int32_t Shift = PredI.getOperand(3).getImm();
+      KnownImm <<= Shift;
+      if (IsCMN)
+        KnownImm = -KnownImm;
+      FirstUse = PredI;
+      return RegImm(SrcReg, KnownImm);
+    }
+    }
+
+    // Bail if we see an instruction that defines NZCV that we don't handle.
+    if (PredI.definesRegister(AArch64::NZCV))
+      return None;
+  }
+  return None;
 }
 
 bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) {
@@ -87,78 +206,181 @@ bool AArch64RedundantCopyElimination::optimizeCopy(MachineBasicBlock *MBB) {
   if (PredMBB->succ_size() != 2)
     return false;
 
-  MachineBasicBlock::iterator CompBr = PredMBB->getLastNonDebugInstr();
-  if (CompBr == PredMBB->end())
+  MachineBasicBlock::iterator CondBr = PredMBB->getLastNonDebugInstr();
+  if (CondBr == PredMBB->end())
     return false;
 
-  ++CompBr;
+  // Keep track of the earliest point in the PredMBB block where kill markers
+  // need to be removed if a COPY is removed.
+  MachineBasicBlock::iterator FirstUse;
+  // After calling knownRegValInBlock, FirstUse will either point to a CBZ/CBNZ
+  // or a compare (i.e., SUBS).  In the latter case, we must take care when
+  // updating FirstUse when scanning for COPY instructions.  In particular, if
+  // there's a COPY in between the compare and branch the COPY should not
+  // update FirstUse.
+  bool SeenFirstUse = false;
+  // Registers that contain a known value at the start of MBB.
+  SmallVector<RegImm, 4> KnownRegs;
+
+  MachineBasicBlock::iterator Itr = std::next(CondBr);
   do {
-    --CompBr;
-    if (guaranteesZeroRegInBlock(*CompBr, MBB))
-      break;
-  } while (CompBr != PredMBB->begin() && CompBr->isTerminator());
+    --Itr;
 
-  // We've not found a CBZ/CBNZ, time to bail out.
-  if (!guaranteesZeroRegInBlock(*CompBr, MBB))
-    return false;
+    Optional<RegImm> KnownRegImm = knownRegValInBlock(*Itr, MBB, FirstUse);
+    if (KnownRegImm == None)
+      continue;
 
-  unsigned TargetReg = CompBr->getOperand(0).getReg();
-  if (!TargetReg)
-    return false;
-  assert(TargetRegisterInfo::isPhysicalRegister(TargetReg) &&
-         "Expect physical register");
+    KnownRegs.push_back(*KnownRegImm);
+
+    // Reset the clobber list, which is used by knownRegValInBlock.
+    ClobberedRegs.reset();
+
+    // Look backward in PredMBB for COPYs from the known reg to find other
+    // registers that are known to be a constant value.
+    for (auto PredI = Itr;; --PredI) {
+      if (FirstUse == PredI)
+        SeenFirstUse = true;
+
+      if (PredI->isCopy()) {
+        MCPhysReg CopyDstReg = PredI->getOperand(0).getReg();
+        MCPhysReg CopySrcReg = PredI->getOperand(1).getReg();
+        for (auto &KnownReg : KnownRegs) {
+          if (ClobberedRegs[KnownReg.Reg])
+            continue;
+          // If we have X = COPY Y, and Y is known to be zero, then now X is
+          // known to be zero.
+          if (CopySrcReg == KnownReg.Reg && !ClobberedRegs[CopyDstReg]) {
+            KnownRegs.push_back(RegImm(CopyDstReg, KnownReg.Imm));
+            if (SeenFirstUse)
+              FirstUse = PredI;
+            break;
+          }
+          // If we have X = COPY Y, and X is known to be zero, then now Y is
+          // known to be zero.
+          if (CopyDstReg == KnownReg.Reg && !ClobberedRegs[CopySrcReg]) {
+            KnownRegs.push_back(RegImm(CopySrcReg, KnownReg.Imm));
+            if (SeenFirstUse)
+              FirstUse = PredI;
+            break;
+          }
+        }
+      }
+
+      // Stop if we get to the beginning of PredMBB.
+      if (PredI == PredMBB->begin())
+        break;
+
+      trackRegDefs(*PredI, ClobberedRegs, TRI);
+      // Stop if all of the known-value regs have been clobbered.
+      if (all_of(KnownRegs, [&](RegImm KnownReg) {
+            return ClobberedRegs[KnownReg.Reg];
+          }))
+        break;
+    }
+    break;
 
-  // Remember all registers aliasing with TargetReg.
-  SmallSetVector<unsigned, 8> TargetRegs;
-  for (MCRegAliasIterator AI(TargetReg, TRI, true); AI.isValid(); ++AI)
-    TargetRegs.insert(*AI);
+  } while (Itr != PredMBB->begin() && Itr->isTerminator());
+
+  // We've not found any registers with a known value, time to bail out.
+  if (KnownRegs.empty())
+    return false;
 
   bool Changed = false;
+  // UsedKnownRegs is the set of KnownRegs that have had uses added to MBB.
+  SmallSetVector<unsigned, 4> UsedKnownRegs;
   MachineBasicBlock::iterator LastChange = MBB->begin();
-  unsigned SmallestDef = TargetReg;
-  // Remove redundant Copy instructions unless TargetReg is modified.
+  // Remove redundant Copy instructions unless KnownReg is modified.
   for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;) {
     MachineInstr *MI = &*I;
     ++I;
-    if (MI->isCopy() && MI->getOperand(0).isReg() &&
-        MI->getOperand(1).isReg()) {
-
-      unsigned DefReg = MI->getOperand(0).getReg();
-      unsigned SrcReg = MI->getOperand(1).getReg();
-
-      if ((SrcReg == AArch64::XZR || SrcReg == AArch64::WZR) &&
-          !MRI->isReserved(DefReg) &&
-          (TargetReg == DefReg || TRI->isSuperRegister(DefReg, TargetReg))) {
-        DEBUG(dbgs() << "Remove redundant Copy : ");
-        DEBUG((MI)->print(dbgs()));
-
-        MI->eraseFromParent();
-        Changed = true;
-        LastChange = I;
-        NumCopiesRemoved++;
-        SmallestDef =
-            TRI->isSubRegister(SmallestDef, DefReg) ? DefReg : SmallestDef;
-        continue;
+    bool RemovedMI = false;
+    bool IsCopy = MI->isCopy();
+    bool IsMoveImm = MI->isMoveImmediate();
+    if (IsCopy || IsMoveImm) {
+      MCPhysReg DefReg = MI->getOperand(0).getReg();
+      MCPhysReg SrcReg = IsCopy ? MI->getOperand(1).getReg() : 0;
+      int64_t SrcImm = IsMoveImm ? MI->getOperand(1).getImm() : 0;
+      if (!MRI->isReserved(DefReg) &&
+          ((IsCopy && (SrcReg == AArch64::XZR || SrcReg == AArch64::WZR)) ||
+           IsMoveImm)) {
+        for (RegImm &KnownReg : KnownRegs) {
+          if (KnownReg.Reg != DefReg &&
+              !TRI->isSuperRegister(DefReg, KnownReg.Reg))
+            continue;
+
+          // For a copy, the known value must be a zero.
+          if (IsCopy && KnownReg.Imm != 0)
+            continue;
+
+          if (IsMoveImm) {
+            // For a move immediate, the known immediate must match the source
+            // immediate.
+            if (KnownReg.Imm != SrcImm)
+              continue;
+
+            // Don't remove a move immediate that implicitly defines the upper
+            // bits when only the lower 32 bits are known.
+            MCPhysReg CmpReg = KnownReg.Reg;
+            if (any_of(MI->implicit_operands(), [CmpReg](MachineOperand &O) {
+                  return !O.isDead() && O.isReg() && O.isDef() &&
+                         O.getReg() != CmpReg;
+                }))
+              continue;
+          }
+
+          if (IsCopy)
+            DEBUG(dbgs() << "Remove redundant Copy : " << *MI);
+          else
+            DEBUG(dbgs() << "Remove redundant Move : " << *MI);
+
+          MI->eraseFromParent();
+          Changed = true;
+          LastChange = I;
+          NumCopiesRemoved++;
+          UsedKnownRegs.insert(KnownReg.Reg);
+          RemovedMI = true;
+          break;
+        }
       }
     }
 
-    if (MI->modifiesRegister(TargetReg, TRI))
+    // Skip to the next instruction if we removed the COPY/MovImm.
+    if (RemovedMI)
+      continue;
+
+    // Remove any regs the MI clobbers from the KnownRegs set.
+    for (unsigned RI = 0; RI < KnownRegs.size();)
+      if (MI->modifiesRegister(KnownRegs[RI].Reg, TRI)) {
+        std::swap(KnownRegs[RI], KnownRegs[KnownRegs.size() - 1]);
+        KnownRegs.pop_back();
+        // Don't increment RI since we need to now check the swapped-in
+        // KnownRegs[RI].
+      } else {
+        ++RI;
+      }
+
+    // Continue until the KnownRegs set is empty.
+    if (KnownRegs.empty())
       break;
   }
 
   if (!Changed)
     return false;
 
-  // Otherwise, we have to fixup the use-def chain, starting with the
-  // CBZ/CBNZ. Conservatively mark as much as we can live.
-  CompBr->clearRegisterKills(SmallestDef, TRI);
+  // Add newly used regs to the block's live-in list if they aren't there
+  // already.
+  for (MCPhysReg KnownReg : UsedKnownRegs)
+    if (!MBB->isLiveIn(KnownReg))
+      MBB->addLiveIn(KnownReg);
 
-  if (none_of(TargetRegs, [&](unsigned Reg) { return MBB->isLiveIn(Reg); }))
-    MBB->addLiveIn(TargetReg);
-
-  // Clear any kills of TargetReg between CompBr and the last removed COPY.
+  // Clear kills in the range where changes were made.  This is conservative,
+  // but should be okay since kill markers are being phased out.
+  DEBUG(dbgs() << "Clearing kill flags.\n\tFirstUse: " << *FirstUse
+               << "\tLastChange: " << *LastChange);
+  for (MachineInstr &MMI : make_range(FirstUse, PredMBB->end()))
+    MMI.clearKillInfo();
   for (MachineInstr &MMI : make_range(MBB->begin(), LastChange))
-    MMI.clearRegisterKills(SmallestDef, TRI);
+    MMI.clearKillInfo();
 
   return true;
 }
@@ -169,6 +391,11 @@ bool AArch64RedundantCopyElimination::runOnMachineFunction(
     return false;
   TRI = MF.getSubtarget().getRegisterInfo();
   MRI = &MF.getRegInfo();
+
+  // Resize the clobber register bitfield tracker.  We do this once per
+  // function and then clear the bitfield each time we optimize a copy.
+  ClobberedRegs.resize(TRI->getNumRegs());
+
   bool Changed = false;
   for (MachineBasicBlock &MBB : MF)
     Changed |= optimizeCopy(&MBB);
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
index 2568f11bd24e5179e341eaa2e11b1bb7f0ce4186..20a5979f9b4b7115966426587083ea121c11c481 100644
--- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
@@ -1,4 +1,4 @@
-//===- AArch64RegisterBankInfo.cpp -------------------------------*- C++ -*-==//
+//===- AArch64RegisterBankInfo.cpp ----------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -13,13 +13,21 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64RegisterBankInfo.h"
-#include "AArch64InstrInfo.h" // For XXXRegClassID.
+#include "AArch64InstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/LowLevelType.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetOpcodes.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+#include <cassert>
 
 #define GET_TARGET_REGBANK_IMPL
 #include "AArch64GenRegisterBank.inc"
@@ -95,7 +103,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
     assert(                                                                    \
         checkPartialMap(PartialMappingIdx::Idx, ValStartIdx, ValLength, RB) && \
         #Idx " is incorrectly initialized");                                   \
-  } while (0)
+  } while (false)
 
   CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR);
   CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR);
@@ -112,7 +120,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
                              PartialMappingIdx::PMI_First##RBName, Size,       \
                              Offset) &&                                        \
            #RBName #Size " " #Offset " is incorrectly initialized");           \
-  } while (0)
+  } while (false)
 
 #define CHECK_VALUEMAP(RBName, Size) CHECK_VALUEMAP_IMPL(RBName, Size, 0)
 
@@ -131,7 +139,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
     CHECK_VALUEMAP_IMPL(RBName, Size, 0);                                      \
     CHECK_VALUEMAP_IMPL(RBName, Size, 1);                                      \
     CHECK_VALUEMAP_IMPL(RBName, Size, 2);                                      \
-  } while (0)
+  } while (false)
 
   CHECK_VALUEMAP_3OPS(GPR, 32);
   CHECK_VALUEMAP_3OPS(GPR, 64);
@@ -159,7 +167,7 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
            Map[1].NumBreakDowns == 1 && #RBNameSrc #Size                       \
            " Src is incorrectly initialized");                                 \
                                                                                \
-  } while (0)
+  } while (false)
 
   CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 32);
   CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 32);
@@ -338,13 +346,12 @@ void AArch64RegisterBankInfo::applyMappingImpl(
   switch (OpdMapper.getMI().getOpcode()) {
   case TargetOpcode::G_OR:
   case TargetOpcode::G_BITCAST:
-  case TargetOpcode::G_LOAD: {
+  case TargetOpcode::G_LOAD:
     // Those ID must match getInstrAlternativeMappings.
     assert((OpdMapper.getInstrMapping().getID() >= 1 &&
             OpdMapper.getInstrMapping().getID() <= 4) &&
            "Don't know how to handle that ID");
     return applyDefaultMapping(OpdMapper);
-  }
   default:
     llvm_unreachable("Don't know how to handle that operation");
   }
@@ -494,21 +501,18 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
   // fine-tune the computed mapping.
   switch (Opc) {
   case TargetOpcode::G_SITOFP:
-  case TargetOpcode::G_UITOFP: {
+  case TargetOpcode::G_UITOFP:
     OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
     break;
-  }
   case TargetOpcode::G_FPTOSI:
-  case TargetOpcode::G_FPTOUI: {
+  case TargetOpcode::G_FPTOUI:
     OpRegBankIdx = {PMI_FirstGPR, PMI_FirstFPR};
     break;
-  }
-  case TargetOpcode::G_FCMP: {
+  case TargetOpcode::G_FCMP:
     OpRegBankIdx = {PMI_FirstGPR,
                     /* Predicate */ PMI_None, PMI_FirstFPR, PMI_FirstFPR};
     break;
-  }
-  case TargetOpcode::G_BITCAST: {
+  case TargetOpcode::G_BITCAST:
     // This is going to be a cross register bank copy and this is expensive.
     if (OpRegBankIdx[0] != OpRegBankIdx[1])
       Cost = copyCost(
@@ -516,8 +520,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
           *AArch64GenRegisterBankInfo::PartMappings[OpRegBankIdx[1]].RegBank,
           OpSize[0]);
     break;
-  }
-  case TargetOpcode::G_LOAD: {
+  case TargetOpcode::G_LOAD:
     // Loading in vector unit is slightly more expensive.
     // This is actually only true for the LD1R and co instructions,
     // but anyway for the fast mode this number does not matter and
@@ -526,16 +529,22 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
     // FIXME: Should be derived from the scheduling model.
     if (OpRegBankIdx[0] >= PMI_FirstFPR)
       Cost = 2;
-  }
+    break;
   }
 
   // Finally construct the computed mapping.
   RegisterBankInfo::InstructionMapping Mapping =
       InstructionMapping{DefaultMappingID, Cost, nullptr, NumOperands};
   SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands);
-  for (unsigned Idx = 0; Idx < NumOperands; ++Idx)
-    if (MI.getOperand(Idx).isReg())
-      OpdsMapping[Idx] = getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]);
+  for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
+    if (MI.getOperand(Idx).isReg()) {
+      auto Mapping = getValueMapping(OpRegBankIdx[Idx], OpSize[Idx]);
+      if (!Mapping->isValid())
+        return InstructionMapping();
+
+      OpdsMapping[Idx] = Mapping;
+    }
+  }
 
   Mapping.setOperandsMapping(getOperandsMapping(OpdsMapping));
   return Mapping;
diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.h b/lib/Target/AArch64/AArch64RegisterBankInfo.h
index bc609f422ac60d96e6d1d9d8b2d108d9b0724761..0a795a42c0b1a1824ebc4f397e143f22893e9ef6 100644
--- a/lib/Target/AArch64/AArch64RegisterBankInfo.h
+++ b/lib/Target/AArch64/AArch64RegisterBankInfo.h
@@ -47,11 +47,12 @@ protected:
   static PartialMappingIdx BankIDToCopyMapIdx[];
 
   enum ValueMappingIdx {
-    First3OpsIdx = 0,
-    Last3OpsIdx = 18,
+    InvalidIdx = 0,
+    First3OpsIdx = 1,
+    Last3OpsIdx = 19,
     DistanceBetweenRegBanks = 3,
-    FirstCrossRegCpyIdx = 21,
-    LastCrossRegCpyIdx = 33,
+    FirstCrossRegCpyIdx = 22,
+    LastCrossRegCpyIdx = 34,
     DistanceBetweenCrossRegCpy = 2
   };
 
diff --git a/lib/Target/AArch64/AArch64RegisterInfo.cpp b/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 98fad71aa18a5dcb206c34cc251231f50183d5f2..baf15ac540cfbf7636e767c7b397fbf92bfd3483 100644
--- a/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -118,25 +118,17 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
 
   // FIXME: avoid re-calculating this every time.
   BitVector Reserved(getNumRegs());
-  markSuperRegs(Reserved, AArch64::SP);
-  markSuperRegs(Reserved, AArch64::XZR);
   markSuperRegs(Reserved, AArch64::WSP);
   markSuperRegs(Reserved, AArch64::WZR);
 
-  if (TFI->hasFP(MF) || TT.isOSDarwin()) {
-    markSuperRegs(Reserved, AArch64::FP);
+  if (TFI->hasFP(MF) || TT.isOSDarwin())
     markSuperRegs(Reserved, AArch64::W29);
-  }
 
-  if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved()) {
-    markSuperRegs(Reserved, AArch64::X18); // Platform register
-    markSuperRegs(Reserved, AArch64::W18);
-  }
+  if (MF.getSubtarget<AArch64Subtarget>().isX18Reserved())
+    markSuperRegs(Reserved, AArch64::W18); // Platform register
 
-  if (hasBasePointer(MF)) {
-    markSuperRegs(Reserved, AArch64::X19);
+  if (hasBasePointer(MF))
     markSuperRegs(Reserved, AArch64::W19);
-  }
 
   assert(checkAllSuperRegsMarked(Reserved));
   return Reserved;
diff --git a/lib/Target/AArch64/AArch64SchedA53.td b/lib/Target/AArch64/AArch64SchedA53.td
index 93ca079275c8c75cf822601d9135165a5bfeef5f..18d000ace94c6b9c2e2ef5fbc9fd05b82684a107 100644
--- a/lib/Target/AArch64/AArch64SchedA53.td
+++ b/lib/Target/AArch64/AArch64SchedA53.td
@@ -13,7 +13,7 @@
 
 // ===---------------------------------------------------------------------===//
 // The following definitions describe the simpler per-operand machine model.
-// This works with MachineScheduler. See MCSchedModel.h for details.
+// This works with MachineScheduler. See MCSchedule.h for details.
 
 // Cortex-A53 machine model for scheduling and other instruction cost heuristics.
 def CortexA53Model : SchedMachineModel {
diff --git a/lib/Target/AArch64/AArch64SchedA57.td b/lib/Target/AArch64/AArch64SchedA57.td
index 99c48d0146e473959b7f252cb92a77e925d1f71a..303398ea0b7f3b524b9419a3aecf45c34412642f 100644
--- a/lib/Target/AArch64/AArch64SchedA57.td
+++ b/lib/Target/AArch64/AArch64SchedA57.td
@@ -162,7 +162,9 @@ def : InstRW<[A57Write_2cyc_1M],    (instregex "BFM")>;
 // Cryptography Extensions
 // -----------------------------------------------------------------------------
 
-def : InstRW<[A57Write_3cyc_1W], (instregex "^AES")>;
+def A57ReadAES  : SchedReadAdvance<3, [A57Write_3cyc_1W]>;
+def : InstRW<[A57Write_3cyc_1W], (instregex "^AES[DE]")>;
+def : InstRW<[A57Write_3cyc_1W, A57ReadAES], (instregex "^AESI?MC")>;
 def : InstRW<[A57Write_6cyc_2V], (instregex "^SHA1SU0")>;
 def : InstRW<[A57Write_3cyc_1W], (instregex "^SHA1(H|SU1)")>;
 def : InstRW<[A57Write_6cyc_2W], (instregex "^SHA1[CMP]")>;
diff --git a/lib/Target/AArch64/AArch64SchedFalkor.td b/lib/Target/AArch64/AArch64SchedFalkor.td
index 19a6d6f2a1ad490de8f8b0c020d18422cf687f9d..eec089087fe0ad6c5d1ca32153418e3f4b085f98 100644
--- a/lib/Target/AArch64/AArch64SchedFalkor.td
+++ b/lib/Target/AArch64/AArch64SchedFalkor.td
@@ -17,10 +17,112 @@
 // instruction cost model.
 
 def FalkorModel : SchedMachineModel {
-  let IssueWidth = 4;          // 4-wide issue for expanded uops.
+  let IssueWidth = 8;          // 8 uops are dispatched per cycle.
   let MicroOpBufferSize = 128; // Out-of-order with temporary unified issue buffer.
   let LoopMicroOpBufferSize = 16;
   let LoadLatency = 3;         // Optimistic load latency.
   let MispredictPenalty = 11;  // Minimum branch misprediction penalty.
-  let CompleteModel = 0;
+  let CompleteModel = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available on Falkor.
+
+let SchedModel = FalkorModel in {
+
+  def FalkorUnitB    : ProcResource<1>; // Branch
+  def FalkorUnitLD   : ProcResource<1>; // Load pipe
+  def FalkorUnitSD   : ProcResource<1>; // Store data
+  def FalkorUnitST   : ProcResource<1>; // Store pipe
+  def FalkorUnitX    : ProcResource<1>; // Complex arithmetic
+  def FalkorUnitY    : ProcResource<1>; // Simple arithmetic
+  def FalkorUnitZ    : ProcResource<1>; // Simple arithmetic
+
+  def FalkorUnitVSD  : ProcResource<1>; // Vector store data
+  def FalkorUnitVX   : ProcResource<1>; // Vector X-pipe
+  def FalkorUnitVY   : ProcResource<1>; // Vector Y-pipe
+
+  def FalkorUnitGTOV : ProcResource<1>; // Scalar to Vector
+  def FalkorUnitVTOG : ProcResource<1>; // Vector to Scalar
+
+  // Define the resource groups.
+  def FalkorUnitXY   : ProcResGroup<[FalkorUnitX, FalkorUnitY]>;
+  def FalkorUnitXYZ  : ProcResGroup<[FalkorUnitX, FalkorUnitY, FalkorUnitZ]>;
+  def FalkorUnitXYZB : ProcResGroup<[FalkorUnitX, FalkorUnitY, FalkorUnitZ,
+                                     FalkorUnitB]>;
+  def FalkorUnitZB   : ProcResGroup<[FalkorUnitZ, FalkorUnitB]>;
+  def FalkorUnitVXVY : ProcResGroup<[FalkorUnitVX, FalkorUnitVY]>;
+
+}
+
+//===----------------------------------------------------------------------===//
+// Map the target-defined scheduler read/write resources and latency for
+// Falkor.
+
+let SchedModel = FalkorModel in {
+
+def : WriteRes<WriteImm,   [FalkorUnitXYZ]> { let Latency = 1; }
+def : WriteRes<WriteI,     [FalkorUnitXYZ]> { let Latency = 1; }
+def : WriteRes<WriteISReg, [FalkorUnitVXVY, FalkorUnitVXVY]>
+      { let Latency = 1; let NumMicroOps = 2; }
+def : WriteRes<WriteIEReg, [FalkorUnitXYZ, FalkorUnitXYZ]>
+      { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes<WriteExtr,  [FalkorUnitXYZ, FalkorUnitXYZ]>
+      { let Latency = 2; let NumMicroOps = 2; }
+def : WriteRes<WriteIS,    [FalkorUnitXYZ]> { let Latency = 1; }
+def : WriteRes<WriteID32,  [FalkorUnitX, FalkorUnitZ]>
+      { let Latency = 8; let NumMicroOps = 2; }
+def : WriteRes<WriteID64,  [FalkorUnitX, FalkorUnitZ]>
+      { let Latency = 16; let NumMicroOps = 2; }
+def : WriteRes<WriteIM32,  [FalkorUnitX]> { let Latency = 4; }
+def : WriteRes<WriteIM64,  [FalkorUnitX]> { let Latency = 5; }
+def : WriteRes<WriteBr,    [FalkorUnitB]> { let Latency = 1; }
+def : WriteRes<WriteBrReg, [FalkorUnitB]> { let Latency = 1; }
+def : WriteRes<WriteLD,    [FalkorUnitLD]> { let Latency = 3; }
+def : WriteRes<WriteST,    [FalkorUnitLD, FalkorUnitST, FalkorUnitSD]>
+      { let Latency = 3; let NumMicroOps = 3; }
+def : WriteRes<WriteSTP,   [FalkorUnitST, FalkorUnitSD]>
+      { let Latency = 0; let NumMicroOps = 2; }
+def : WriteRes<WriteAdr,   [FalkorUnitXYZ]> { let Latency = 5; }
+def : WriteRes<WriteLDIdx, [FalkorUnitLD]> { let Latency = 5; }
+def : WriteRes<WriteSTIdx, [FalkorUnitLD, FalkorUnitST, FalkorUnitSD]>
+      { let Latency = 4; let NumMicroOps = 3; }
+def : WriteRes<WriteF,     [FalkorUnitVXVY, FalkorUnitVXVY]>
+      { let Latency = 3; let NumMicroOps = 2; }
+def : WriteRes<WriteFCmp,  [FalkorUnitVXVY]> { let Latency = 2; }
+def : WriteRes<WriteFCvt,  [FalkorUnitVXVY]> { let Latency = 4; }
+def : WriteRes<WriteFCopy, [FalkorUnitVXVY]> { let Latency = 4; }
+def : WriteRes<WriteFImm,  [FalkorUnitVXVY]> { let Latency = 4; }
+def : WriteRes<WriteFMul,  [FalkorUnitVXVY, FalkorUnitVXVY]>
+      { let Latency = 6; let NumMicroOps = 2; }
+def : WriteRes<WriteFDiv,  [FalkorUnitVXVY, FalkorUnitVXVY]>
+      { let Latency = 12; let NumMicroOps = 2; } // Fragent -1 / NoRSV +1
+def : WriteRes<WriteV,     [FalkorUnitVXVY]> { let Latency = 6; }
+def : WriteRes<WriteVLD,   [FalkorUnitLD]> { let Latency = 3; }
+def : WriteRes<WriteVST,   [FalkorUnitST, FalkorUnitVSD]>
+      { let Latency = 0; let NumMicroOps = 2; }
+
+def : WriteRes<WriteSys,     []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint,    []> { let Latency = 1; }
+
+def : WriteRes<WriteLDHi,    []> { let Latency = 3; }
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// No forwarding logic is modelled yet.
+def : ReadAdvance<ReadI,       0>;
+def : ReadAdvance<ReadISReg,   0>;
+def : ReadAdvance<ReadIEReg,   0>;
+def : ReadAdvance<ReadIM,      0>;
+def : ReadAdvance<ReadIMA,     0>;
+def : ReadAdvance<ReadID,      0>;
+def : ReadAdvance<ReadExtrHi,  0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD,     0>;
+
+// Detailed Refinements
+// -----------------------------------------------------------------------------
+include "AArch64SchedFalkorDetails.td"
+
 }
diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
new file mode 100644
index 0000000000000000000000000000000000000000..6bce4ef6b652bfa13f05f357755b25ae614e51ca
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td
@@ -0,0 +1,523 @@
+//==- AArch64SchedFalkorDetails.td - Falkor Scheduling Defs -*- tablegen -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the uop and latency details for the machine model for the
+// Qualcomm Falkor subtarget.
+//
+//===----------------------------------------------------------------------===//
+
+include "AArch64SchedFalkorWriteRes.td"
+
+//===----------------------------------------------------------------------===//
+// Specialize the coarse model by associating instruction groups with the
+// subtarget-defined types. As the model is refined, this will override most
+// of the earlier mappings.
+
+// Miscellaneous
+// -----------------------------------------------------------------------------
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+// SIMD Floating-point Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instregex "^(FABS|FNEG)(v2f32|v4f16)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT))(v2f32|v4f16|v2i16p|v2i32p)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instregex "^FAC(GE|GT)(16|32|64)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instregex "^FCM(EQ|GE|GT)(16|32|64|v2f32|v4f16|v2i32|v4i16)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instregex "^FCM(EQ|LE|GE|GT|LT)(v1i16|v1i32|v1i64|v2i32|v4i16)rz$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f32|v4f16)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^F(MAX|MIN)(NM)?V(v4i16|v4i32|v8i16)v$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^(FABD|FADD|FSUB)(v2f32|v4f16)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^FADDP(v2i16p|v2i32p|v2i64p|v2f32|v4f16)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_4cyc],   (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc],   (instrs FCVTXNv1i64)>;
+def : InstRW<[FalkorWr_1VXVY_4cyc],   (instregex "^FCVTZ(S|U)(v2i32|v4i16)(_shift)?$")>;
+
+def : InstRW<[FalkorWr_1VXVY_5cyc],   (instregex "^(FMUL|FMULX)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>;
+def : InstRW<[FalkorWr_1VXVY_5cyc],   (instrs FMULX16, FMULX32)>;
+
+def : InstRW<[FalkorWr_1VXVY_6cyc],   (instregex "^(FMUL|FMULX)v1i64_indexed$")>;
+def : InstRW<[FalkorWr_1VXVY_6cyc],   (instrs FMULX64)>;
+
+def : InstRW<[FalkorWr_2VXVY_1cyc],   (instregex "^(FABS|FNEG)(v2f64|v4f32|v8f16)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_2cyc],   (instregex "^(F(MAX|MIN)(NM)?P?|FAC(GE|GT)|FCM(EQ|GE|GT))(v2f64|v4f32|v8f16|v2i64p)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc],   (instregex "^FCM(EQ|LE|GE|GT|LT)(v2i64|v4i32|v8i16)rz$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc],   (instregex "^FRINT(A|I|M|N|P|X|Z)(v2f64|v4f32|v8f16)$")>;
+
+def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^(FDIV|FSQRT)(v2f32|v4f16)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_3cyc],   (instregex "^(FABD|FADD(P)?|FSUB)(v2f64|v4f32|v8f16)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc],   (instregex "^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4f32|v8f16)$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc],   (instregex "^(FCVTL|FCVTL2)(v2i32|v4i16|v4i32|v8i16)$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc],   (instregex "^FCVTZ(S|U)(v2i64|v4i32|v8i16)(_shift)?$")>;
+
+def : InstRW<[FalkorWr_2VXVY_5cyc],   (instregex "^(FMUL|FMULX)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_6cyc],   (instregex "^(FMUL|FMULX)v2i64_indexed$")>;
+
+def : InstRW<[FalkorWr_3VXVY_4cyc],   (instregex "^(FCVTX?N|FCVTX?N2)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>;
+
+def : InstRW<[FalkorWr_3VXVY_5cyc],   (instregex "^(FCVTX?N|FCVTX?N2)(v2i32|v4i16|v4i32|v8i16|v4f32)$")>;
+
+def : InstRW<[FalkorWr_2VX_2VY_2cyc], (instregex "^(FDIV|FSQRT)(v2f64|v4f32|v8f16)$")>; // NOTE(review): write is named 2cyc while the v2f32/v4f16 forms use a 10cyc write -- confirm.
+
+def : InstRW<[FalkorWr_1VXVY_4cyc, FalkorReadVMA],(instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc, FalkorReadVMA],(instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>;
+
+def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^FML(A|S)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>;
+def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^FML(A|S)v1i64_indexed$")>;
+def : InstRW<[FalkorWr_2VXVY_5cyc, FalkorReadFMA],(instregex "^FML(A|S)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>;
+def : InstRW<[FalkorWr_2VXVY_6cyc, FalkorReadFMA],(instregex "^FML(A|S)v2i64_indexed$")>;
+// SIMD Integer Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instregex "^ADD(v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instrs ADDPv2i64p)>;
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instregex "^(AND|ORR|ORN|BIC|EOR)v8i8$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instregex "^(BIC|ORR)(v2i32|v4i16)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instregex "^NEG(v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instregex "^SUB(v1i64|v2i32|v4i16|v8i8)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instregex "^(S|U)(ADDLP|HADD|HSUB|SHL)(v2i32|v4i16|v8i8)(_v.*)?$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instregex "^(S|U)SHLv1i64$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instregex "^(S|U)SHR(v2i32|v4i16|v8i8)_shift$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instregex "^((S|U)?(MAX|MIN)P?|ABS|ADDP)(v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instregex "^CM(EQ|GE|HS|GT|HI)(v1i64|v2i32|v4i16|v8i8)$")>; // sole mapping for the 64-bit CMxx forms
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instregex "^CM(EQ|LE|GE|GT|LT)(v1i64|v2i32|v4i16|v8i8)rz$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instregex "^CMTST(v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instrs PMULv8i8)>;
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instregex "^SHL(v2i32|v4i16|v8i8)_shift$")>;
+
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^SQNEG(v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^(S|U)R?SRA(d|(v2i32|v4i16|v8i8)_shift)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^(S|U)(ABD|ADALP)(v8i8|v4i16|v2i32)(_v.*)?$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^(S|U)ADDLVv4i16v$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^(S|U)QADD(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^(S|U)QSHLU?(d|s|h|b|(v8i8|v4i16|v2i32)_shift)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^(S|U)(QSHL|RSHL|QRSHL)(v1i8|v1i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^(SQR?SHRN|UQR?SHRN|SQR?SHRUN)(s|h|b)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^(S|U)QSUB(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^(S|U)RHADD(v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^(S|U)RSHR(v2i32|v4i16|v8i8)_shift$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^(SU|US)QADD(v1i8|v1i16|v2i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^(S|U)?(MAX|MIN)V(v4i16v|v4i32v)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instrs ADDVv4i16v)>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^S(L|R)I(d|(v8i8|v4i16|v2i32)_shift)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^SQABS(v1i8|v1i16|v1i32|v1i64|v2i32|v4i16|v8i8)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^SQNEG(v1i8|v1i16|v1i32|v1i64)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_4cyc],   (instregex "^(S|U)ADDLVv8i8v$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc],   (instregex "^(S|U)?(MAX|MIN)V(v8i8v|v8i16v)$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc],   (instrs ADDVv8i8v)>;
+def : InstRW<[FalkorWr_1VXVY_4cyc],   (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc],   (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc],   (instregex "^SQDMULL(i16|i32)$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc],   (instregex "^SQRDML(A|S)?H(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>;
+
+def : InstRW<[FalkorWr_1VXVY_5cyc],   (instregex "^(S|U)?(MAX|MIN)Vv16i8v$")>;
+
+def : InstRW<[FalkorWr_2VXVY_3cyc],   (instrs ADDVv4i32v)>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc],   (instrs ADDVv8i16v)>;
+def : InstRW<[FalkorWr_2VXVY_4cyc],   (instregex "^(ADD|SUB)HNv.*$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc],   (instregex "^(S|U)ABA(v2i32|v4i16|v8i8)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_5cyc],   (instrs ADDVv16i8v)>;
+
+def : InstRW<[FalkorWr_2VXVY_6cyc],   (instregex "^(SQR?SHRN|UQR?SHRN|SQR?SHRUN)(v8i8|v16i8|v4i16|v8i16|v2i32|v4i32)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_6cyc],   (instregex "^R(ADD|SUB)HNv.*$")>;
+
+def : InstRW<[FalkorWr_2VXVY_1cyc],   (instregex "^ADD(v16i8|v8i16|v4i32|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc],   (instrs ADDPv2i64)>; // sz==11
+def : InstRW<[FalkorWr_2VXVY_1cyc],   (instregex "^(AND|ORR|ORN|BIC|EOR)v16i8$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc],   (instregex "^(BIC|ORR)(v8i16|v4i32)$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc],   (instregex "^(NEG|SUB)(v16i8|v8i16|v4i32|v2i64)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_2cyc],   (instregex "^(S|U)ADDLv.*$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc],   (instregex "^(S|U)(ADDLP|HADD|HSUB|SHL)(v16i8|v2i64|v4i32|v8i16)(_v.*)?$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc],   (instregex "^(S|U)SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc],   (instregex "^(S|U)SHR(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc],   (instregex "^(S|U)SUBLv.*$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc],   (instregex "^((S|U)?(MAX|MIN)P?|ABS)(v16i8|v2i64|v4i32|v8i16)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc],   (instregex "^ADDP(v4i32|v8i16|v16i8)$")>; // sz!=11
+def : InstRW<[FalkorWr_2VXVY_2cyc],   (instregex "^CM(EQ|GE|HS|GT|HI)(v16i8|v2i64|v4i32|v8i16)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc],   (instregex "^CM(EQ|LE|GE|GT|LT)(v16i8|v2i64|v4i32|v8i16)rz$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc],   (instregex "^(CMTST|PMUL)(v16i8|v2i64|v4i32|v8i16)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc],   (instregex "^PMULL2?(v8i8|v16i8)$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc],   (instregex "^SHL(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_2cyc],   (instregex "^SHLL(v16i8|v8i16|v4i32|v8i8|v4i16|v2i32)(_shift)?$")>;
+
+def : InstRW<[FalkorWr_2VXVY_3cyc],   (instregex "^(S|U)R?SRA(v2i64|v4i32|v8i16|v16i8)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc],   (instregex "^(S|U)ABD(v16i8|v8i16|v4i32|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc],   (instregex "^(S|U)ABDLv.*$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc],   (instregex "^(S|U)(ADALP|QADD)(v16i8|v8i16|v4i32|v2i64)(_v.*)?$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc],   (instregex "^(S|U)QSHLU?(v2i64|v4i32|v8i16|v16i8)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc],   (instregex "^(S|U)(QSHL|RSHL|QRSHL|QSUB|RHADD)(v16i8|v8i16|v4i32|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc],   (instregex "^(S|U)RSHR(v2i64|v4i32|v8i16|v16i8)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc],   (instregex "^(SU|US)QADD(v16i8|v8i16|v4i32|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc],   (instregex "^PMULL2?(v1i64|v2i64)$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc],   (instregex "^S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift$")>;
+def : InstRW<[FalkorWr_2VXVY_3cyc],   (instregex "^SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)$")>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc],   (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc],   (instregex "^SQDMULLv.*$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc],   (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>;
+
+def : InstRW<[FalkorWr_3VXVY_3cyc],   (instregex "^(S|U)ADDLVv4i32v$")>;
+
+def : InstRW<[FalkorWr_3VXVY_5cyc],   (instregex "^(S|U)ADDLVv8i16v$")>;
+
+def : InstRW<[FalkorWr_3VXVY_6cyc],   (instregex "^(S|U)ADDLVv16i8v$")>;
+
+def : InstRW<[FalkorWr_4VXVY_2cyc],   (instregex "^(S|U)(ADD|SUB)Wv.*$")>;
+
+def : InstRW<[FalkorWr_4VXVY_3cyc],   (instregex "^(S|U)ABALv.*$")>;
+
+def : InstRW<[FalkorWr_4VXVY_4cyc],   (instregex "^(S|U)ABA(v16i8|v8i16|v4i32)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_4cyc, FalkorReadVMA],(instregex "^SQD(MLAL|MLSL)(i16|i32)$")>;
+def : InstRW<[FalkorWr_2VXVY_4cyc, FalkorReadVMA],(instregex "^SQD(MLAL|MLSL)v.*$")>;
+// SIMD Load Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[WriteVLD],                               (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>;
+def : InstRW<[WriteVLD],                               (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[WriteVLD],                               (instrs LD2i64)>;
+def : InstRW<[WriteVLD, WriteAdr],                     (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))_POST$")>;
+def : InstRW<[WriteVLD, WriteAdr],                     (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[WriteVLD, WriteAdr],                     (instrs LD2i64_POST)>;
+
+def : InstRW<[FalkorWr_1LD_1VXVY_4cyc],                (instregex "^LD1i(8|16|32)$")>;
+def : InstRW<[FalkorWr_1LD_1VXVY_4cyc, WriteAdr],      (instregex "^LD1i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_1LD_1none_3cyc],                (instregex "^LD1Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc],                (instregex "^LD2Twov(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc],                (instregex "^LD2Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr],      (instregex "^LD1Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr],      (instregex "^LD2Twov(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteAdr],      (instregex "^LD2Rv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_3cyc],                      (instregex "^LD1Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_2LD_3cyc],                      (instregex "^LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_2LD_3cyc],                      (instregex "^LD2Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_2LD_3cyc],                      (instrs LD3i64)>;
+def : InstRW<[FalkorWr_2LD_3cyc],                      (instrs LD4i64)>;
+def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr],            (instregex "^LD1Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr],            (instregex "^LD2Twov(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr],            (instregex "^LD2Rv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr],            (instrs LD3i64_POST)>;
+def : InstRW<[FalkorWr_2LD_3cyc, WriteAdr],            (instrs LD4i64_POST)>;
+
+def : InstRW<[FalkorWr_1LD_2VXVY_4cyc],                (instregex "^LD2i(8|16|32)$")>;
+def : InstRW<[FalkorWr_1LD_2VXVY_4cyc, WriteAdr],      (instregex "^LD2i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_1none_3cyc],                (instregex "^LD1Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_2LD_1none_3cyc],                (instregex "^LD3Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_2LD_1none_3cyc, WriteAdr],      (instregex "^LD1Threev(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_1none_3cyc, WriteAdr],      (instregex "^LD3Rv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_3LD_3cyc],                      (instregex "^LD1Threev(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_3LD_3cyc],                      (instrs LD3Threev2d)>;
+def : InstRW<[FalkorWr_3LD_3cyc],                      (instregex "^LD3Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr],            (instregex "^LD1Threev(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr],            (instrs LD3Threev2d_POST)>;
+def : InstRW<[FalkorWr_3LD_3cyc, WriteAdr],            (instregex "^LD3Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[FalkorWr_1LD_3VXVY_4cyc],                (instregex "^LD3i(8|16|32)$")>;
+def : InstRW<[FalkorWr_1LD_3VXVY_4cyc, WriteAdr],      (instregex "^LD3i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2none_3cyc],                (instregex "^LD1Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_2LD_2none_3cyc],                (instregex "^LD4Rv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_2LD_2none_3cyc, WriteAdr],      (instregex "^LD1Fourv(8b|4h|2s|1d)_POST$")>;
+def : InstRW<[FalkorWr_2LD_2none_3cyc, WriteAdr],      (instregex "^LD4Rv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_4LD_3cyc],                      (instregex "^LD1Fourv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_4LD_3cyc],                      (instrs LD4Fourv2d)>;
+def : InstRW<[FalkorWr_4LD_3cyc],                      (instregex "^LD4Rv(16b|8h|4s|2d)$")>;
+def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr],            (instregex "^LD1Fourv(16b|8h|4s|2d)_POST$")>;
+def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr],            (instrs LD4Fourv2d_POST)>;
+def : InstRW<[FalkorWr_4LD_3cyc, WriteAdr],            (instregex "^LD4Rv(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[FalkorWr_1LD_4VXVY_4cyc],                (instregex "^LD4i(8|16|32)$")>;
+def : InstRW<[FalkorWr_1LD_4VXVY_4cyc, WriteAdr],      (instregex "^LD4i(8|16|32)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc],          (instregex "^LD3Threev(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_2LD_2VXVY_1none_4cyc, WriteAdr],(instregex "^LD3Threev(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc],          (instregex "^LD4Fourv(8b|4h|2s|1d)$")>;
+def : InstRW<[FalkorWr_2LD_2VXVY_2none_4cyc, WriteAdr],(instregex "^LD4Fourv(8b|4h|2s|1d)_POST$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc],      (instregex "^LD3Threev(16b|8h|4s)$")>;
+def : InstRW<[FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc],      (instregex "^LD4Fourv(16b|8h|4s)$")>;
+
+def : InstRW<[FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc, WriteAdr],(instregex "^LD3Threev(16b|8h|4s)_POST$")>;
+def : InstRW<[FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc, WriteAdr],(instregex "^LD4Fourv(16b|8h|4s)_POST$")>;
+
+// Arithmetic and Logical Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_ADD],          (instregex "^ADD(S)?(W|X)r(s|x)$")>;
+def : InstRW<[FalkorWr_2XYZ_2cyc],    (instregex "^SUB(S)?(W|X)r(s|x)$")>;
+
+// SIMD Miscellaneous Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1GTOV_1cyc],   (instregex "^DUP(v8i8|v4i16|v2i32)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instregex "^DUP(v16i8|v8i16)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_1GTOV_1cyc],   (instregex "^INSv(i8|i16)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc],   (instregex "^(S|U)MOVv.*$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instregex "^(BIF|BIT|BSL)v8i8$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instrs EXTv8i8)>;
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instregex "^(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instrs TBLv8i8One)>;
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instrs NOTv8i8)>;
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instregex "^REV(16|32|64)v.*$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instregex "^(TRN1|TRN2|ZIP1|UZP1|UZP2|ZIP2|XTN|XTN2)(v2i32|v2i64|v4i16|v4i32|v8i8|v8i16|v16i8)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instregex "^(CLS|CLZ|CNT|RBIT)(v4i32|v8i16|v16i8)$")>;
+
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^(S|U)QXTU?Nv.*$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv1i32, FRSQRTEv1i64, FRECPEv2f32, FRSQRTEv2f32)>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instrs FRECPXv1i32, FRECPXv1i64)>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instrs URECPEv2i32, URSQRTEv2i32)>;
+
+def : InstRW<[FalkorWr_1VXVY_5cyc],   (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>;
+
+def : InstRW<[FalkorWr_1VXVY_6cyc],   (instrs FRECPS64, FRSQRTS64)>;
+
+def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc],(instregex "^INSv(i32|i64)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_2GTOV_1cyc],   (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc],   (instrs EXTv16i8)>;
+def : InstRW<[FalkorWr_2VXVY_1cyc],   (instregex "^(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc],   (instrs NOTv16i8)>;
+def : InstRW<[FalkorWr_2VXVY_1cyc],   (instrs TBLv16i8One)>;
+
+def : InstRW<[FalkorWr_2VXVY_3cyc],   (instrs FRECPEv2f64, FRECPEv4f32, FRSQRTEv2f64, FRSQRTEv4f32)>;
+def : InstRW<[FalkorWr_2VXVY_3cyc],   (instrs URECPEv4i32, URSQRTEv4i32)>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc],   (instrs TBLv8i8Two)>;
+def : InstRW<[FalkorWr_2VXVY_4cyc],   (instregex "^TBX(v8|v16)i8One$")>;
+
+def : InstRW<[FalkorWr_2VXVY_5cyc],   (instrs FRECPSv4f32, FRSQRTSv4f32)>;
+
+def : InstRW<[FalkorWr_2VXVY_6cyc],   (instrs FRECPSv2f64, FRSQRTSv2f64)>;
+
+def : InstRW<[FalkorWr_3VXVY_5cyc],   (instregex "^TBL(v8i8Three|v16i8Two)$")>;
+def : InstRW<[FalkorWr_3VXVY_5cyc],   (instregex "^TBX(v8i8Two|v16i8Two)$")>;
+
+def : InstRW<[FalkorWr_4VXVY_6cyc],   (instregex "^TBL(v8i8Four|v16i8Three)$")>;
+def : InstRW<[FalkorWr_4VXVY_6cyc],   (instregex "^TBX(v8i8Three|v16i8Three)$")>;
+
+def : InstRW<[FalkorWr_5VXVY_7cyc],   (instrs TBLv16i8Four)>;
+def : InstRW<[FalkorWr_5VXVY_7cyc],   (instregex "^TBX(v8i8Four|v16i8Four)$")>;
+
+// SIMD Store Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[WriteVST],                                                        (instregex "^ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64)|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))$")>;
+def : InstRW<[WriteVST],                                                        (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))$")>;
+def : InstRW<[WriteVST, WriteAdr],                                              (instregex "^ST1(One(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64)|One(v16b|v8h|v4s|v2d)|Two(v8b|v4h|v2s|v1d))_POST$")>; // every post-indexed form also carries WriteAdr
+def : InstRW<[WriteVST, WriteAdr],                                              (instregex "^ST2(Two(v8b|v4h|v2s|v1d)|(i8|i16|i32|i64))_POST$")>;
+
+def : InstRW<[WriteVST, WriteVST],                                              (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))$")>;
+def : InstRW<[WriteVST, WriteVST],                                              (instregex "^ST2Two(v16b|v8h|v4s|v2d)$")>;
+def : InstRW<[WriteVST, WriteVST],                                              (instregex "^ST3(i8|i16|i32|i64)$")>;
+def : InstRW<[WriteVST, WriteVST],                                              (instregex "^ST4(i8|i16|i32|i64)$")>;
+def : InstRW<[WriteVST, WriteVST, WriteAdr],                                    (instregex "^ST1(Two(v16b|v8h|v4s|v2d)|(Three|Four)(v8b|v4h|v2s|v1d))_POST$")>;
+def : InstRW<[WriteVST, WriteVST, WriteAdr],                                    (instregex "^ST2Two(v16b|v8h|v4s|v2d)_POST$")>;
+def : InstRW<[WriteVST, WriteVST, WriteAdr],                                    (instregex "^ST3(i8|i16|i32|i64)_POST$")>;
+def : InstRW<[WriteVST, WriteVST, WriteAdr],                                    (instregex "^ST4(i8|i16|i32|i64)_POST$")>;
+
+def : InstRW<[WriteV, WriteVST, WriteVST],                                      (instregex "^ST3Three(v8b|v4h|v2s|v1d)$")>;
+def : InstRW<[WriteV, WriteVST, WriteVST, WriteAdr],                            (instregex "^ST3Three(v8b|v4h|v2s|v1d)_POST$")>;
+
+def : InstRW<[WriteVST, WriteVST, WriteVST],                                    (instregex "^ST1Three(v16b|v8h|v4s|v2d)$")>;
+def : InstRW<[WriteVST, WriteVST, WriteVST],                                    (instrs ST3Threev2d)>;
+def : InstRW<[WriteVST, WriteVST, WriteVST, WriteAdr],                          (instregex "^ST1Three(v16b|v8h|v4s|v2d)_POST$")>;
+def : InstRW<[WriteVST, WriteVST, WriteVST, WriteAdr],                          (instrs ST3Threev2d_POST)>;
+
+def : InstRW<[WriteV, WriteV, WriteVST, WriteVST],                              (instregex "^ST4Four(v8b|v4h|v2s|v1d)$")>;
+def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteAdr],                    (instregex "^ST4Four(v8b|v4h|v2s|v1d)_POST$")>;
+
+def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST],                          (instregex "^ST1Four(v16b|v8h|v4s|v2d)$")>;
+def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST],                          (instrs ST4Fourv2d)>;
+def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr],                (instregex "^ST1Four(v16b|v8h|v4s|v2d)_POST$")>;
+def : InstRW<[WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr],                (instrs ST4Fourv2d_POST)>;
+
+def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST],          (instregex "^ST3Three(v16b|v8h|v4s)$")>;
+def : InstRW<[WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr],(instregex "^ST3Three(v16b|v8h|v4s)_POST$")>;
+
+def : InstRW<[WriteV, WriteV, WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST],          (instregex "^ST4Four(v16b|v8h|v4s)$")>;
+def : InstRW<[WriteV, WriteV, WriteV, WriteV, WriteVST, WriteVST, WriteVST, WriteVST, WriteAdr],(instregex "^ST4Four(v16b|v8h|v4s)_POST$")>;
+
+// Branch Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1none_0cyc],   (instrs B)>;
+def : InstRW<[FalkorWr_1Z_0cyc],      (instregex "^(BR|RET|(CBZ|CBNZ|TBZ|TBNZ)(W|X))$")>;
+def : InstRW<[FalkorWr_1ZB_0cyc],     (instrs Bcc)>;
+def : InstRW<[FalkorWr_1XYZB_0cyc],   (instrs BL)>;
+def : InstRW<[FalkorWr_1Z_1XY_0cyc],  (instrs BLR)>;
+
+// Cryptography Extensions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instrs SHA1Hrr)>;
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instrs AESIMCrr, AESMCrr)>;
+def : InstRW<[FalkorWr_2VXVY_3cyc],   (instrs AESDrr, AESErr)>;
+def : InstRW<[FalkorWr_2VXVY_2cyc],   (instrs SHA1SU0rrr, SHA1SU1rr, SHA256SU0rr)>;
+def : InstRW<[FalkorWr_1VX_1VY_4cyc], (instregex "^SHA1(C|M|P)rrr$")>;
+def : InstRW<[FalkorWr_1VX_1VY_5cyc], (instrs SHA256H2rrr, SHA256Hrrr)>;
+def : InstRW<[FalkorWr_4VXVY_3cyc],   (instrs SHA256SU1rrr)>;
+
+// FP Load Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[WriteLD],               (instregex "^LDR((Q|D|S|H|B)ui|(Q|D|S)l)$")>;
+def : InstRW<[WriteLD, WriteAdr],     (instregex "^LDR(Q|D|S|H|B)(post|pre)$")>;
+def : InstRW<[WriteLD],               (instregex "^LDUR(Q|D|S|H|B)i$")>;
+def : InstRW<[FalkorWr_LDR],          (instregex "^LDR(Q|D|H|S|B)ro(W|X)$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi],(instrs LDNPQi)>;
+def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi],(instrs LDPQi)>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi],(instregex "^LDNP(D|S)i$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi],(instregex "^LDP(D|S)i$")>;
+def : InstRW<[FalkorWr_1LD_1none_3cyc, WriteLDHi, WriteAdr],(instregex "^LDP(D|S)(pre|post)$")>;
+def : InstRW<[FalkorWr_2LD_3cyc, WriteLDHi, WriteAdr],(instregex "^LDPQ(pre|post)$")>;
+
+// FP Data Processing Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instregex "^FCCMP(E)?(H|S|D)rr$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instregex "^FCMP(E)?(H|S|D)r(r|i)$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc],   (instregex "^FCVT(A|M|N|P)(S|U)U(W|X)(H|S|D)r$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instregex "^(FABS|FNEG)(H|S|D)r$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instregex "^FCSEL(H|S|D)rrr$")>;
+
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instregex "^F(MAX|MIN)(NM)?(H|S|D)rr$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instregex "^F(MAX|MIN)(NM)?Pv2i(16|32|64)p$")>;
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instrs FCVTHSr, FCVTHDr)>;
+def : InstRW<[FalkorWr_1VXVY_2cyc],   (instregex "^FRINT(A|I|M|N|P|X|Z)(H|S|D)r$")>;
+
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^FABD(16|32|64)$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instregex "^(FADD|FSUB)(H|S|D)rr$")>;
+def : InstRW<[FalkorWr_1VXVY_3cyc],   (instrs FCVTSHr, FCVTDHr)>;
+
+def : InstRW<[FalkorWr_1VXVY_4cyc],   (instrs FCVTSDr, FCVTDSr)>;
+
+def : InstRW<[FalkorWr_1VXVY_5cyc],   (instregex "^F(N)?MUL(H|S)rr$")>;
+
+def : InstRW<[FalkorWr_1VXVY_6cyc],   (instregex "^F(N)?MULDrr$")>;
+
+def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^FDIV(H|S|D)rr$")>;
+def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(H|S|D)r$")>; // NOTE(review): FSQRT uses a write named 2cyc while FDIV above uses 10cyc -- confirm.
+
+def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>;
+def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)Drrr$")>;
+// FP Miscellaneous Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_FMOV],         (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc],   (instregex "^FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>;
+def : InstRW<[FalkorWr_1VTOG_1cyc],   (instregex "^FMOV(WH|WS|XH|XD|XDHigh)r$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc],   (instregex "^FMOV(Hi|Hr|S0|Si|Sr|D0|Di|Dr|v.*_ns)$")>;
+
+def : InstRW<[FalkorWr_1GTOV_4cyc],   (instregex "^(S|U)CVTF(S|U)(W|X)(D|S)ri$")>;
+def : InstRW<[FalkorWr_1VXVY_4cyc],   (instregex "^(S|U)CVTF(v1i16|v1i32|v2i32|v1i64|v4i16|v2f32|v4f16|d|s)(_shift)?$")>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc],   (instregex "^(S|U)CVTF(v2i64|v4i32|v8i16|v2f64|v4f32|v8f16)(_shift)?$")>;
+
+
+// Load Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1ST_0cyc],     (instrs PRFMui, PRFMl)>;
+def : InstRW<[FalkorWr_1ST_0cyc],     (instrs PRFUMi)>;
+
+def : InstRW<[WriteLD, WriteLDHi],    (instregex "^LDNP(W|X)i$")>;
+def : InstRW<[WriteLD, WriteLDHi],    (instregex "^LDP(W|X)i$")>;
+def : InstRW<[FalkorWr_1LD_3cyc],     (instregex "^LDR(B|H|W|X)ui$")>;
+def : InstRW<[WriteLD, WriteAdr],     (instregex "^LDR(B|H|W|X)(post|pre)$")>;
+def : InstRW<[FalkorWr_1LD_3cyc],     (instregex "^LDR(W|X)l$")>;
+def : InstRW<[FalkorWr_1LD_3cyc],     (instregex "^LDTR(B|H|W|X)i$")>;
+def : InstRW<[FalkorWr_1LD_3cyc],     (instregex "^LDUR(B|H|W|X)i$")>;
+
+def : InstRW<[FalkorWr_1LD_4cyc],     (instregex "^LDRS(BW|BX|HW|HX|W)ui$")>;
+def : InstRW<[FalkorWr_1LD_4cyc],     (instrs LDRSWl)>;
+def : InstRW<[FalkorWr_1LD_4cyc],     (instregex "^LDTRS(BW|BX|HW|HX|W)i$")>;
+def : InstRW<[FalkorWr_1LD_4cyc],     (instregex "^LDURS(BW|BX|HW|HX|W)i$")>;
+
+def : InstRW<[FalkorWr_PRFM],         (instregex "^PRFMro(W|X)$")>;
+def : InstRW<[FalkorWr_LDR],          (instregex "^LDR(B|H|W|X)ro(W|X)$")>;
+
+def : InstRW<[FalkorWr_LDRS],         (instregex "^LDRS(BW|BX|HW|HX|W)ro(W|X)$")>;
+
+def : InstRW<[FalkorWr_1LD_4cyc, WriteAdr],(instregex "^LDRS(BW|BX|HW|HX|W)(post|pre)$")>;
+def : InstRW<[WriteLD, WriteLDHi, WriteAdr],(instregex "^LDP(W|X)(post|pre)$")>;
+def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi],(instrs LDPSWi)>;
+def : InstRW<[FalkorWr_1LD_4cyc, WriteLDHi, WriteAdr],(instregex "^LDPSW(post|pre)$")>;
+// Miscellaneous Data-Processing Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1XYZ_1cyc],    (instregex "^(S|U)?BFM(W|X)ri$")>;
+def : InstRW<[FalkorWr_1X_2cyc],      (instregex "^CRC32.*$")>;
+def : InstRW<[FalkorWr_1XYZ_2cyc],    (instregex "^(CLS|CLZ|RBIT|REV|REV16|REV32)(W|X)r$")>;
+def : InstRW<[FalkorWr_2XYZ_2cyc],    (instregex "^EXTR(W|X)rri$")>;
+
+// Divide and Multiply Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1X_4cyc],      (instregex "^(S|U)M(ADD|SUB)Lrrr$")>;
+def : InstRW<[FalkorWr_1X_4cyc],      (instregex "^M(ADD|SUB)Wrrr$")>;
+
+def : InstRW<[FalkorWr_1X_5cyc],      (instregex "^(S|U)MULHrr$")>;
+def : InstRW<[FalkorWr_1X_5cyc],      (instregex "^M(ADD|SUB)Xrrr$")>;
+
+def : InstRW<[FalkorWr_1X_1Z_8cyc],   (instregex "^(S|U)DIVWr$")>;
+def : InstRW<[FalkorWr_1X_1Z_16cyc],  (instregex "^(S|U)DIVXr$")>;
+
+def : InstRW<[FalkorWr_2VXVY_4cyc],   (instregex "^(S|U)(MLAL|MLSL|MULL)v.*$")>;
+
+// Move and Shift Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1XYZ_1cyc],    (instregex "^(LSLV|LSRV|ASRV|RORV|MOVK)(W|X).*")>;
+def : InstRW<[FalkorWr_1XYZB_1cyc],   (instregex "^ADRP?$")>;
+def : InstRW<[FalkorWr_1XYZB_1cyc],   (instregex "^MOVN(W|X)i$")>;
+def : InstRW<[FalkorWr_MOVZ],         (instregex "^MOVZ(W|X)i$")>;
+
+// Other Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[FalkorWr_1LD_0cyc],     (instrs CLREX, DMB, DSB)>;
+def : InstRW<[FalkorWr_1none_0cyc],   (instrs BRK, DCPS1, DCPS2, DCPS3, HINT, HLT, HVC, ISB, SMC, SVC)>;
+def : InstRW<[FalkorWr_1ST_0cyc],     (instrs SYSxt, SYSLxt)>;
+def : InstRW<[FalkorWr_1Z_0cyc],      (instrs MSRpstateImm1, MSRpstateImm4)>;
+
+def : InstRW<[FalkorWr_1LD_3cyc],     (instregex "^(LDAR(B|H|W|X)|LDAXP(W|X)|LDAXR(B|H|W|X)|LDXP(W|X)|LDXR(B|H|W|X))$")>;
+def : InstRW<[FalkorWr_1LD_3cyc],     (instrs MRS)>;
+
+def : InstRW<[FalkorWr_1LD_1Z_3cyc],  (instrs DRPS)>;
+
+def : InstRW<[FalkorWr_1SD_1ST_0cyc], (instrs MSR)>;
+def : InstRW<[WriteVST],              (instrs STNPDi, STNPSi)>;
+def : InstRW<[WriteSTP],               (instrs STNPWi, STNPXi)>;
+def : InstRW<[FalkorWr_2LD_1Z_3cyc],  (instrs ERET)>;
+
+def : InstRW<[WriteST],               (instregex "^LDC.*$")>; // NOTE(review): an "LDC"-prefixed pattern is given a store write -- confirm the intended opcodes.
+def : InstRW<[WriteST],               (instregex "^STLR(B|H|W|X)$")>;
+def : InstRW<[WriteST],               (instregex "^STXP(W|X)$")>;
+def : InstRW<[WriteST],               (instregex "^STXR(B|H|W|X)$")>;
+
+def : InstRW<[WriteSTX],              (instregex "^STLXP(W|X)$")>;
+def : InstRW<[WriteSTX],              (instregex "^STLXR(B|H|W|X)$")>;
+def : InstRW<[WriteVST, WriteVST],    (instrs STNPQi)>;
+
+// Store Instructions
+// -----------------------------------------------------------------------------
+def : InstRW<[WriteVST],              (instregex "^STP(D|S)(i|post|pre)$")>;
+def : InstRW<[WriteST],               (instregex "^STP(W|X)(i|post|pre)$")>;
+def : InstRW<[WriteST],               (instregex "^STR(Q|D|S|BB|HH)ui$")>;
+def : InstRW<[WriteST],               (instregex "^STUR(Q|D|S|BB|HH)i$")>;
+def : InstRW<[WriteST],               (instregex "^STR(B|H|W|X)(post|pre|ui)$")>;
+def : InstRW<[WriteST],               (instregex "^STTR(B|H|W|X)i$")>;
+def : InstRW<[WriteST],               (instregex "^STUR(B|H|W|X)i$")>;
+
+def : InstRW<[WriteST, WriteAdr],     (instregex "^STR(B|H|W|X)ro(W|X)$")>;
+
+def : InstRW<[WriteVST, WriteVST],    (instregex "^STPQ(i|post|pre)$")>;
diff --git a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td
new file mode 100644
index 0000000000000000000000000000000000000000..9cdb4be4246bc8c3daa0cc08a79fb7ac6f4be949
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td
@@ -0,0 +1,361 @@
+//=- AArch64SchedFalkorWriteRes.td - Falkor Write Res ---*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Contains all of the Falkor specific SchedWriteRes types. The approach
+// below is to define a generic SchedWriteRes for every combination of
+// latency and microOps. The naming convention is to use a prefix, one field
+// for latency, and one or more microOp count/type designators.
+//   Prefix: FalkorWr
+//   MicroOp Count/Types: #(B|X|Y|Z|LD|ST|SD|VX|VY|VSD)
+//   Latency: #cyc
+//
+// e.g. FalkorWr_1Z_6SD_4VX_6cyc means there are 11 micro-ops to be issued
+//      down one Z pipe, six SD pipes, four VX pipes and the total latency is
+//      six cycles.
+//
+// Contains all of the Falkor specific ReadAdvance types for forwarding logic.
+//
+// Contains all of the Falkor specific WriteVariant types for immediate zero
+// and LSLFast.
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Define 1 micro-op types
+
+
+def FalkorWr_1X_2cyc    : SchedWriteRes<[FalkorUnitX]>   { let Latency = 2; }
+def FalkorWr_1X_4cyc    : SchedWriteRes<[FalkorUnitX]>   { let Latency = 4; }
+def FalkorWr_1X_5cyc    : SchedWriteRes<[FalkorUnitX]>   { let Latency = 5; }
+def FalkorWr_1Z_0cyc    : SchedWriteRes<[FalkorUnitZ]>   { let Latency = 0; }
+def FalkorWr_1ZB_0cyc   : SchedWriteRes<[FalkorUnitZB]>  { let Latency = 0; }
+def FalkorWr_1LD_3cyc   : SchedWriteRes<[FalkorUnitLD]>  { let Latency = 3; }
+def FalkorWr_1LD_4cyc   : SchedWriteRes<[FalkorUnitLD]>  { let Latency = 4; }
+def FalkorWr_1XYZ_1cyc  : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 1; }
+def FalkorWr_1XYZ_2cyc  : SchedWriteRes<[FalkorUnitXYZ]> { let Latency = 2; }
+def FalkorWr_1XYZB_0cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 0; }
+def FalkorWr_1XYZB_1cyc : SchedWriteRes<[FalkorUnitXYZB]>{ let Latency = 1; }
+def FalkorWr_1none_0cyc : SchedWriteRes<[]>              { let Latency = 0; }
+
+def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; }
+def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; }
+def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; }
+def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; }
+def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; }
+def FalkorWr_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; }
+
+def FalkorWr_1LD_0cyc   : SchedWriteRes<[FalkorUnitLD]>  { let Latency = 0; }
+def FalkorWr_1ST_0cyc   : SchedWriteRes<[FalkorUnitST]>  { let Latency = 0; }
+def FalkorWr_1ST_3cyc   : SchedWriteRes<[FalkorUnitST]>  { let Latency = 3; }
+
+def FalkorWr_1GTOV_1cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 1; }
+def FalkorWr_1GTOV_4cyc : SchedWriteRes<[FalkorUnitGTOV]>{ let Latency = 4; }
+def FalkorWr_1VTOG_1cyc : SchedWriteRes<[FalkorUnitVTOG]>{ let Latency = 1; }
+
+//===----------------------------------------------------------------------===//
+// Define 2 micro-op types
+
+def FalkorWr_2VXVY_1cyc   : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_2cyc   : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_3cyc   : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_4cyc   : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_5cyc   : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+def FalkorWr_2VXVY_6cyc   : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> {
+  let Latency = 6;
+  let NumMicroOps = 2;
+}
+
+def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+def FalkorWr_1XYZ_1LD_4cyc  : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+def FalkorWr_2LD_3cyc   : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_5cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_2cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_4cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+
+def FalkorWr_1VX_1VY_10cyc : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY]> {
+  let Latency = 10;
+  let NumMicroOps = 2;
+}
+
+def FalkorWr_1GTOV_1VXVY_2cyc : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitVXVY]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+def FalkorWr_2GTOV_1cyc    : SchedWriteRes<[FalkorUnitGTOV, FalkorUnitGTOV]> {
+  let Latency = 1;
+  let NumMicroOps = 2;
+}
+
+def FalkorWr_1XYZ_1ST_4cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitST]> {
+  let Latency = 4;
+  let NumMicroOps = 2;
+}
+def FalkorWr_1XYZ_1LD_5cyc: SchedWriteRes<[FalkorUnitXYZ, FalkorUnitLD]> {
+  let Latency = 5;
+  let NumMicroOps = 2;
+}
+
+def FalkorWr_2XYZ_2cyc   : SchedWriteRes<[FalkorUnitXYZ, FalkorUnitXYZ]> {
+  let Latency = 2;
+  let NumMicroOps = 2;
+}
+
+def FalkorWr_1Z_1XY_0cyc : SchedWriteRes<[FalkorUnitZ, FalkorUnitXY]> {
+  let Latency = 0;
+  let NumMicroOps = 2;
+}
+
+def FalkorWr_1X_1Z_8cyc  : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> {
+  let Latency = 8; let NumMicroOps = 2; // one X micro-op plus one Z micro-op
+  let ResourceCycles = [2, 8];          // X is held 2 cycles, Z is held 8
+}
+
+def FalkorWr_1X_1Z_16cyc : SchedWriteRes<[FalkorUnitX, FalkorUnitZ]> {
+  let Latency = 16; let NumMicroOps = 2; // one X micro-op plus one Z micro-op
+  let ResourceCycles = [2, 16];          // X is held 2 cycles, Z is held 16
+}
+
+def FalkorWr_1LD_1Z_3cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitZ]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+
+def FalkorWr_1LD_1none_3cyc : SchedWriteRes<[FalkorUnitLD]> {
+  let Latency = 3;
+  let NumMicroOps = 2;
+}
+
+def FalkorWr_1SD_1ST_0cyc: SchedWriteRes<[FalkorUnitSD, FalkorUnitST]> {
+  let Latency = 0;
+  let NumMicroOps = 2;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 3 micro-op types
+
+def FalkorWr_3VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+                                         FalkorUnitVXVY]> {
+  let Latency = 3; let NumMicroOps = 3; // one VXVY resource per micro-op
+}
+def FalkorWr_3VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+                                         FalkorUnitVXVY]> {
+  let Latency = 4; let NumMicroOps = 3; // one VXVY resource per micro-op
+}
+def FalkorWr_3VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+                                         FalkorUnitVXVY]> {
+  let Latency = 5; let NumMicroOps = 3; // one VXVY resource per micro-op
+}
+def FalkorWr_3VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+                                         FalkorUnitVXVY]> {
+  let Latency = 6; let NumMicroOps = 3; // one VXVY resource per micro-op
+}
+
+def FalkorWr_1LD_2VXVY_4cyc  : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY,
+                                              FalkorUnitVXVY]> {
+  let Latency = 4; let NumMicroOps = 3; // one LD plus two VXVY micro-ops
+}
+def FalkorWr_2LD_1none_3cyc  : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+}
+def FalkorWr_3LD_3cyc        : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+                                              FalkorUnitLD]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+}
+
+def FalkorWr_2LD_1Z_3cyc     : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+                                             FalkorUnitZ]> {
+  let Latency = 3;
+  let NumMicroOps = 3;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 4 micro-op types
+
+def FalkorWr_2VX_2VY_2cyc  : SchedWriteRes<[FalkorUnitVX, FalkorUnitVY,
+                                            FalkorUnitVX, FalkorUnitVY]> {
+  let Latency = 2;
+  let NumMicroOps = 4;
+}
+
+def FalkorWr_4VXVY_2cyc    : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+                                            FalkorUnitVXVY, FalkorUnitVXVY]> {
+  let Latency = 2;
+  let NumMicroOps = 4;
+}
+def FalkorWr_4VXVY_3cyc    : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+                                            FalkorUnitVXVY, FalkorUnitVXVY]> {
+  let Latency = 3;
+  let NumMicroOps = 4;
+}
+def FalkorWr_4VXVY_4cyc    : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+                                            FalkorUnitVXVY, FalkorUnitVXVY]> {
+  let Latency = 4;
+  let NumMicroOps = 4;
+}
+def FalkorWr_4VXVY_6cyc    : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+                                            FalkorUnitVXVY, FalkorUnitVXVY]> {
+  let Latency = 6;
+  let NumMicroOps = 4;
+}
+
+def FalkorWr_4LD_3cyc      : SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+                                            FalkorUnitLD, FalkorUnitLD]> {
+  let Latency = 3;
+  let NumMicroOps = 4;
+}
+
+def FalkorWr_1LD_3VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY,
+                                            FalkorUnitVXVY, FalkorUnitVXVY]> {
+  let Latency = 4;
+  let NumMicroOps = 4;
+}
+
+def FalkorWr_2LD_2none_3cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD]> {
+  let Latency = 3;
+  let NumMicroOps = 4;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 5 micro-op types
+
+def FalkorWr_1LD_4VXVY_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY,
+                                            FalkorUnitVXVY, FalkorUnitVXVY,
+                                            FalkorUnitVXVY]> {
+  let Latency = 4;
+  let NumMicroOps = 5;
+}
+def FalkorWr_2LD_2VXVY_1none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+                                            FalkorUnitVXVY, FalkorUnitVXVY]> {
+  let Latency = 4;
+  let NumMicroOps = 5;
+}
+def FalkorWr_5VXVY_7cyc    : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY,
+                                            FalkorUnitVXVY, FalkorUnitVXVY,
+                                            FalkorUnitVXVY]> {
+  let Latency = 7;
+  let NumMicroOps = 5;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 6 micro-op types
+
+def FalkorWr_2LD_2VXVY_2none_4cyc: SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+                                            FalkorUnitVXVY, FalkorUnitVXVY]> {
+  let Latency = 4;
+  let NumMicroOps = 6;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 8 micro-op types
+
+def FalkorWr_2LD_2VXVY_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, FalkorUnitLD,
+                                             FalkorUnitVXVY, FalkorUnitVXVY,
+                                             FalkorUnitLD, FalkorUnitLD,
+                                             FalkorUnitVXVY, FalkorUnitVXVY]> {
+  let Latency = 4;
+  let NumMicroOps = 8;
+}
+
+//===----------------------------------------------------------------------===//
+// Define 9 micro-op types
+
+def FalkorWr_2LD_2VXVY_2LD_1XYZ_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD,
+                                             FalkorUnitLD, FalkorUnitVXVY,
+                                             FalkorUnitVXVY, FalkorUnitLD,
+                                             FalkorUnitLD, FalkorUnitXYZ,
+                                             FalkorUnitVXVY, FalkorUnitVXVY]> {
+  let Latency = 4;
+  let NumMicroOps = 9;
+}
+
+def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD,
+                                             FalkorUnitLD, FalkorUnitVXVY,
+                                             FalkorUnitVXVY, FalkorUnitXYZ,
+                                             FalkorUnitLD, FalkorUnitLD,
+                                             FalkorUnitVXVY, FalkorUnitVXVY]> {
+  let Latency = 4;
+  let NumMicroOps = 9;
+}
+
+// Forwarding logic is modeled for vector multiply and accumulate
+// -----------------------------------------------------------------------------
+def FalkorReadVMA : SchedReadAdvance<2, [FalkorWr_1VXVY_4cyc,
+                                         FalkorWr_2VXVY_4cyc]>;
+def FalkorReadFMA : SchedReadAdvance<3, [FalkorWr_1VXVY_5cyc,
+                                         FalkorWr_1VXVY_6cyc,
+                                         FalkorWr_2VXVY_5cyc,
+                                         FalkorWr_2VXVY_6cyc]>;
+
+// SchedPredicates and WriteVariants for Immediate Zero and LSLFast
+// -----------------------------------------------------------------------------
+def FalkorImmZPred    : SchedPredicate<[{TII->isGPRZero(*MI)}]>;
+def FalkorLSLFastPred : SchedPredicate<[{TII->isFalkorLSLFast(*MI)}]>; 
+
+def FalkorWr_FMOV  : SchedWriteVariant<[
+                       SchedVar<FalkorImmZPred, [FalkorWr_1none_0cyc]>,
+                       SchedVar<NoSchedPred,    [FalkorWr_1GTOV_1cyc]>]>;
+
+def FalkorWr_MOVZ  : SchedWriteVariant<[
+                       SchedVar<FalkorImmZPred, [FalkorWr_1none_0cyc]>,
+                       SchedVar<NoSchedPred,    [FalkorWr_1XYZB_1cyc]>]>;
+
+def FalkorWr_LDR   : SchedWriteVariant<[
+                       SchedVar<FalkorLSLFastPred, [FalkorWr_1LD_3cyc]>,
+                       SchedVar<NoSchedPred,       [FalkorWr_1XYZ_1LD_4cyc]>]>;
+
+def FalkorWr_ADD   : SchedWriteVariant<[
+                       SchedVar<FalkorLSLFastPred, [FalkorWr_1XYZ_1cyc]>,
+                       SchedVar<FalkorImmZPred,    [FalkorWr_1XYZ_1cyc]>,
+                       SchedVar<NoSchedPred,       [FalkorWr_2XYZ_2cyc]>]>;
+
+def FalkorWr_PRFM  : SchedWriteVariant<[
+                       SchedVar<FalkorLSLFastPred, [FalkorWr_1ST_3cyc]>,
+                       SchedVar<NoSchedPred,       [FalkorWr_1XYZ_1ST_4cyc]>]>;
+
+def FalkorWr_LDRS  : SchedWriteVariant<[
+                       SchedVar<FalkorLSLFastPred, [FalkorWr_1LD_4cyc]>,
+                       SchedVar<NoSchedPred,       [FalkorWr_1XYZ_1LD_5cyc]>]>;
diff --git a/lib/Target/AArch64/AArch64SchedM1.td b/lib/Target/AArch64/AArch64SchedM1.td
index 14d6891253facf6856eb19d2f5fd5594cc611ab9..3fbbc0be682d739c4cf60a633565519fd5d8d1c0 100644
--- a/lib/Target/AArch64/AArch64SchedM1.td
+++ b/lib/Target/AArch64/AArch64SchedM1.td
@@ -366,7 +366,8 @@ def : InstRW<[M1WriteNALU1],  (instregex "^ZIP[12]v")>;
 // Cryptography instructions.
 def M1WriteAES : SchedWriteRes<[M1UnitNCRYPT]> { let Latency = 1; }
 def M1ReadAES  : SchedReadAdvance<1, [M1WriteAES]>;
-def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AES")>;
+def : InstRW<[M1WriteAES], (instregex "^AES[DE]")>;
+def : InstRW<[M1WriteAES, M1ReadAES], (instregex "^AESI?MC")>;
 
 def : InstRW<[M1WriteNCRYPT1], (instregex "^PMUL")>;
 def : InstRW<[M1WriteNCRYPT1], (instregex "^SHA1(H|SU)")>;
diff --git a/lib/Target/AArch64/AArch64SchedThunderX.td b/lib/Target/AArch64/AArch64SchedThunderX.td
new file mode 100644
index 0000000000000000000000000000000000000000..9a0cb702518daf5e37d58d1b057bbe9d3f7b6a4a
--- /dev/null
+++ b/lib/Target/AArch64/AArch64SchedThunderX.td
@@ -0,0 +1,352 @@
+//==- AArch64SchedThunderX.td - Cavium ThunderX T8X Scheduling Definitions -*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the ARM ThunderX T8X
+// (T88, T81, T83) processors.
+// Loosely based on Cortex-A53 which is somewhat similar.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler. See llvm/MC/MCSchedule.h for details.
+
+// Cavium ThunderX T8X scheduling machine model.
+def ThunderXT8XModel : SchedMachineModel {
+  let IssueWidth = 2;         // 2 micro-ops dispatched per cycle.
+  let MicroOpBufferSize = 0;  // ThunderX T88/T81/T83 are in-order.
+  let LoadLatency = 3;        // Optimistic load latency.
+  let MispredictPenalty = 8;  // Branch mispredict penalty.
+  let PostRAScheduler = 1;    // Use PostRA scheduler.
+  let CompleteModel = 1;
+}
+
+// Modeling each pipeline with BufferSize == 0 since T8X is in-order.
+def THXT8XUnitALU    : ProcResource<2> { let BufferSize = 0; } // Int ALU
+def THXT8XUnitMAC    : ProcResource<1> { let BufferSize = 0; } // Int MAC
+def THXT8XUnitDiv    : ProcResource<1> { let BufferSize = 0; } // Int Division
+def THXT8XUnitLdSt   : ProcResource<1> { let BufferSize = 0; } // Load/Store
+def THXT8XUnitBr     : ProcResource<1> { let BufferSize = 0; } // Branch
+def THXT8XUnitFPALU  : ProcResource<1> { let BufferSize = 0; } // FP ALU
+def THXT8XUnitFPMDS  : ProcResource<1> { let BufferSize = 0; } // FP Mul/Div/Sqrt
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types mapping the ProcResources and
+// latencies.
+
+let SchedModel = ThunderXT8XModel in {
+
+// ALU
+def : WriteRes<WriteImm, [THXT8XUnitALU]> { let Latency = 1; }
+def : WriteRes<WriteI, [THXT8XUnitALU]> { let Latency = 1; }
+def : WriteRes<WriteISReg, [THXT8XUnitALU]> { let Latency = 2; }
+def : WriteRes<WriteIEReg, [THXT8XUnitALU]> { let Latency = 2; }
+def : WriteRes<WriteIS, [THXT8XUnitALU]> { let Latency = 2; }
+def : WriteRes<WriteExtr, [THXT8XUnitALU]> { let Latency = 2; }
+
+// MAC
+def : WriteRes<WriteIM32, [THXT8XUnitMAC]> {
+  let Latency = 4;
+  let ResourceCycles = [1];
+}
+
+def : WriteRes<WriteIM64, [THXT8XUnitMAC]> {
+  let Latency = 4;
+  let ResourceCycles = [1];
+}
+
+// Div
+def : WriteRes<WriteID32, [THXT8XUnitDiv]> {
+  let Latency = 12;
+  let ResourceCycles = [6];
+}
+
+def : WriteRes<WriteID64, [THXT8XUnitDiv]> {
+  let Latency = 14;
+  let ResourceCycles = [8];
+}
+
+// Load
+def : WriteRes<WriteLD, [THXT8XUnitLdSt]> { let Latency = 3; }
+def : WriteRes<WriteLDIdx, [THXT8XUnitLdSt]> { let Latency = 3; }
+def : WriteRes<WriteLDHi, [THXT8XUnitLdSt]> { let Latency = 3; }
+
+// Vector Load
+def : WriteRes<WriteVLD, [THXT8XUnitLdSt]> {
+  let Latency = 8;
+  let ResourceCycles = [3];
+}
+
+def THXT8XWriteVLD1 : SchedWriteRes<[THXT8XUnitLdSt]> {
+  let Latency = 6;
+  let ResourceCycles = [1];
+}
+
+def THXT8XWriteVLD2 : SchedWriteRes<[THXT8XUnitLdSt]> {
+  let Latency = 11;
+  let ResourceCycles = [7];
+}
+
+def THXT8XWriteVLD3 : SchedWriteRes<[THXT8XUnitLdSt]> {
+  let Latency = 12;
+  let ResourceCycles = [8];
+}
+
+def THXT8XWriteVLD4 : SchedWriteRes<[THXT8XUnitLdSt]> {
+  let Latency = 13;
+  let ResourceCycles = [9];
+}
+
+def THXT8XWriteVLD5 : SchedWriteRes<[THXT8XUnitLdSt]> {
+  let Latency = 13;
+  let ResourceCycles = [9];
+}
+
+// Pre/Post Indexing
+def : WriteRes<WriteAdr, []> { let Latency = 0; }
+
+// Store
+def : WriteRes<WriteST, [THXT8XUnitLdSt]> { let Latency = 1; }
+def : WriteRes<WriteSTP, [THXT8XUnitLdSt]> { let Latency = 1; }
+def : WriteRes<WriteSTIdx, [THXT8XUnitLdSt]> { let Latency = 1; }
+def : WriteRes<WriteSTX, [THXT8XUnitLdSt]> { let Latency = 1; }
+
+// Vector Store
+def : WriteRes<WriteVST, [THXT8XUnitLdSt]>;
+def THXT8XWriteVST1 : SchedWriteRes<[THXT8XUnitLdSt]>;
+
+def THXT8XWriteVST2 : SchedWriteRes<[THXT8XUnitLdSt]> {
+  let Latency = 10;
+  let ResourceCycles = [9];
+}
+
+def THXT8XWriteVST3 : SchedWriteRes<[THXT8XUnitLdSt]> {
+  let Latency = 11;
+  let ResourceCycles = [10];
+}
+
+def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }
+
+// Branch
+def : WriteRes<WriteBr, [THXT8XUnitBr]>;
+def THXT8XWriteBR : SchedWriteRes<[THXT8XUnitBr]>;
+def : WriteRes<WriteBrReg, [THXT8XUnitBr]>;
+def THXT8XWriteBRR : SchedWriteRes<[THXT8XUnitBr]>;
+def THXT8XWriteRET : SchedWriteRes<[THXT8XUnitALU]>;
+def : WriteRes<WriteSys, [THXT8XUnitBr]>;
+def : WriteRes<WriteBarrier, [THXT8XUnitBr]>;
+def : WriteRes<WriteHint, [THXT8XUnitBr]>;
+
+// FP ALU
+def : WriteRes<WriteF, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCmp, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCvt, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFCopy, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteFImm, [THXT8XUnitFPALU]> { let Latency = 6; }
+def : WriteRes<WriteV, [THXT8XUnitFPALU]> { let Latency = 6; }
+
+// FP Mul, Div, Sqrt
+def : WriteRes<WriteFMul, [THXT8XUnitFPMDS]> { let Latency = 6; }
+def : WriteRes<WriteFDiv, [THXT8XUnitFPMDS]> {
+  let Latency = 22;
+  let ResourceCycles = [19];
+}
+
+def THXT8XWriteFMAC : SchedWriteRes<[THXT8XUnitFPMDS]> { let Latency = 10; }
+
+def THXT8XWriteFDivSP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+  let Latency = 12;
+  let ResourceCycles = [9];
+}
+
+def THXT8XWriteFDivDP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+  let Latency = 22;
+  let ResourceCycles = [19];
+}
+
+def THXT8XWriteFSqrtSP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+  let Latency = 17;
+  let ResourceCycles = [14];
+}
+
+def THXT8XWriteFSqrtDP : SchedWriteRes<[THXT8XUnitFPMDS]> {
+  let Latency = 31;
+  let ResourceCycles = [28];
+}
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedRead types.
+
+// These reads are advanced by a fixed number of cycles for every producer.
+def : ReadAdvance<ReadExtrHi, 1>;
+def : ReadAdvance<ReadAdrBase, 2>;
+def : ReadAdvance<ReadVLD, 2>;
+
+// FIXME: This needs more targeted benchmarking.
+// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable
+//       operands are needed one cycle later if and only if they are to be
+//       shifted. Otherwise, they too are needed two cycles later. This same
+//       ReadAdvance applies to Extended registers as well, even though there is
+//       a separate SchedPredicate for them.
+def : ReadAdvance<ReadI, 2, [WriteImm, WriteI,
+                             WriteISReg, WriteIEReg, WriteIS,
+                             WriteID32, WriteID64,
+                             WriteIM32, WriteIM64]>;
+def THXT8XReadShifted : SchedReadAdvance<1, [WriteImm, WriteI,
+                                          WriteISReg, WriteIEReg, WriteIS,
+                                          WriteID32, WriteID64,
+                                          WriteIM32, WriteIM64]>;
+def THXT8XReadNotShifted : SchedReadAdvance<2, [WriteImm, WriteI,
+                                             WriteISReg, WriteIEReg, WriteIS,
+                                             WriteID32, WriteID64,
+                                             WriteIM32, WriteIM64]>;
+def THXT8XReadISReg : SchedReadVariant<[
+	SchedVar<RegShiftedPred, [THXT8XReadShifted]>,
+	SchedVar<NoSchedPred, [THXT8XReadNotShifted]>]>;
+def : SchedAlias<ReadISReg, THXT8XReadISReg>;
+
+def THXT8XReadIEReg : SchedReadVariant<[
+	SchedVar<RegExtendedPred, [THXT8XReadShifted]>,
+	SchedVar<NoSchedPred, [THXT8XReadNotShifted]>]>;
+def : SchedAlias<ReadIEReg, THXT8XReadIEReg>;
+
+// MAC - Operands are generally needed one cycle later in the MAC pipe.
+//       Accumulator operands are needed two cycles later.
+def : ReadAdvance<ReadIM, 1, [WriteImm,WriteI,
+                              WriteISReg, WriteIEReg, WriteIS,
+                              WriteID32, WriteID64,
+                              WriteIM32, WriteIM64]>;
+def : ReadAdvance<ReadIMA, 2, [WriteImm, WriteI,
+                               WriteISReg, WriteIEReg, WriteIS,
+                               WriteID32, WriteID64,
+                               WriteIM32, WriteIM64]>;
+
+// Div
+def : ReadAdvance<ReadID, 1, [WriteImm, WriteI,
+                              WriteISReg, WriteIEReg, WriteIS,
+                              WriteID32, WriteID64,
+                              WriteIM32, WriteIM64]>;
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific InstRW.
+
+//---
+// Branch ('$'-anchored: instregex is a prefix match, "^B" would catch BIC etc.)
+//---
+def : InstRW<[THXT8XWriteBR], (instregex "^B$")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^BL$")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^Bcc$")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^CBNZ")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^CBZ")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^TBNZ")>;
+def : InstRW<[THXT8XWriteBR], (instregex "^TBZ")>;
+def : InstRW<[THXT8XWriteBRR], (instregex "^BR$")>;
+def : InstRW<[THXT8XWriteBRR], (instregex "^BLR$")>;
+
+//---
+// Ret
+//---
+def : InstRW<[THXT8XWriteRET], (instregex "^RET")>;
+
+//---
+// Miscellaneous
+//---
+def : InstRW<[WriteI], (instrs COPY)>;
+
+//---
+// Vector Loads
+//---
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD1i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD2i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>;
+def : InstRW<[THXT8XWriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD3i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[THXT8XWriteVLD3], (instregex "LD3Threev(2d)$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[THXT8XWriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD4i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[THXT8XWriteVLD4], (instregex "LD4Fourv(2d)$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[THXT8XWriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>;
+
+//---
+// Vector Stores
+//---
+def : InstRW<[THXT8XWriteVST1], (instregex "ST1i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVST1], (instregex "ST2i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVST2], (instregex "ST3i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST3Threev(2d)$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>;
+
+def : InstRW<[THXT8XWriteVST2], (instregex "ST4i(8|16|32|64)$")>;
+def : InstRW<[THXT8XWriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>;
+def : InstRW<[THXT8XWriteVST2], (instregex "ST4Fourv(2d)$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>;
+def : InstRW<[THXT8XWriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>;
+def : InstRW<[THXT8XWriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>;
+
+//---
+// Floating Point MAC, DIV, SQRT
+//---
+def : InstRW<[THXT8XWriteFMAC], (instregex "^FN?M(ADD|SUB).*")>;
+def : InstRW<[THXT8XWriteFMAC], (instregex "^FML(A|S).*")>;
+def : InstRW<[THXT8XWriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[THXT8XWriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[THXT8XWriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[THXT8XWriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[THXT8XWriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[THXT8XWriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+
+}
diff --git a/lib/Target/AArch64/AArch64SchedVulcan.td b/lib/Target/AArch64/AArch64SchedThunderX2T99.td
similarity index 64%
rename from lib/Target/AArch64/AArch64SchedVulcan.td
rename to lib/Target/AArch64/AArch64SchedThunderX2T99.td
index 35a40c314bf4f93da2a6877c076aef82c6832a0f..3654eeca530a09e919e24ac18bbc1e47b2ce3126 100644
--- a/lib/Target/AArch64/AArch64SchedVulcan.td
+++ b/lib/Target/AArch64/AArch64SchedThunderX2T99.td
@@ -1,4 +1,4 @@
-//=- AArch64SchedVulcan.td - Vulcan Scheduling Defs ----------*- tablegen -*-=//
+//=- AArch64SchedThunderX2T99.td - Cavium ThunderX2 T99 Scheduling ---*- tablegen -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,23 +6,23 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-// 1. Introduction
 //
-// This file defines the machine model for Broadcom Vulcan to support
-// instruction scheduling and other instruction cost heuristics.
+// This file defines the scheduling model for Cavium ThunderX2T99
+// processors.
+// Based on Broadcom Vulcan.
 //
 //===----------------------------------------------------------------------===//
 
 //===----------------------------------------------------------------------===//
 // 2. Pipeline Description.
 
-def VulcanModel : SchedMachineModel {
+def ThunderX2T99Model : SchedMachineModel {
   let IssueWidth            =   4; // 4 micro-ops dispatched at a time.
   let MicroOpBufferSize     = 180; // 180 entries in micro-op re-order buffer.
   let LoadLatency           =   4; // Optimistic load latency.
   let MispredictPenalty     =  12; // Extra cycles for mispredicted branch.
   // Determined via a mix of micro-arch details and experimentation.
-  let LoopMicroOpBufferSize =  32; 
+  let LoopMicroOpBufferSize =  32;
   let PostRAScheduler       =   1; // Using PostRA sched.
   let CompleteModel         =   1;
 }
@@ -30,155 +30,155 @@ def VulcanModel : SchedMachineModel {
 // Define the issue ports.
 
 // Port 0: ALU, FP/SIMD.
-def VulcanP0 : ProcResource<1>;
+def THX2T99P0 : ProcResource<1>;
 
 // Port 1: ALU, FP/SIMD, integer mul/div.
-def VulcanP1 : ProcResource<1>;
+def THX2T99P1 : ProcResource<1>;
 
 // Port 2: ALU, Branch.
-def VulcanP2 : ProcResource<1>;
+def THX2T99P2 : ProcResource<1>;
 
 // Port 3: Store data.
-def VulcanP3 : ProcResource<1>;
+def THX2T99P3 : ProcResource<1>;
 
 // Port 4: Load/store.
-def VulcanP4 : ProcResource<1>;
+def THX2T99P4 : ProcResource<1>;
 
 // Port 5: Load/store.
-def VulcanP5 : ProcResource<1>;
+def THX2T99P5 : ProcResource<1>;
 
-let SchedModel = VulcanModel in {
+let SchedModel = ThunderX2T99Model in {
 
 // Define groups for the functional units on each issue port.  Each group
 // created will be used by a WriteRes later on.
 //
 // NOTE: Some groups only contain one member.  This is a way to create names for
 // the various functional units that share a single issue port.  For example,
-// "VulcanI1" for ALU ops on port 1 and "VulcanF1" for FP ops on port 1.
+// "THX2T99I1" for ALU ops on port 1 and "THX2T99F1" for FP ops on port 1.
 
 // Integer divide and multiply micro-ops only on port 1.
-def VulcanI1 : ProcResGroup<[VulcanP1]>;
+def THX2T99I1 : ProcResGroup<[THX2T99P1]>;
 
 // Branch micro-ops only on port 2.
-def VulcanI2 : ProcResGroup<[VulcanP2]>;
+def THX2T99I2 : ProcResGroup<[THX2T99P2]>;
 
 // ALU micro-ops on ports 0, 1, and 2.
-def VulcanI012 : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2]>;
+def THX2T99I012 : ProcResGroup<[THX2T99P0, THX2T99P1, THX2T99P2]>;
 
 // Crypto FP/SIMD micro-ops only on port 1.
-def VulcanF1 : ProcResGroup<[VulcanP1]>;
+def THX2T99F1 : ProcResGroup<[THX2T99P1]>;
 
 // FP/SIMD micro-ops on ports 0 and 1.
-def VulcanF01 : ProcResGroup<[VulcanP0, VulcanP1]>;
+def THX2T99F01 : ProcResGroup<[THX2T99P0, THX2T99P1]>;
 
 // Store data micro-ops only on port 3.
-def VulcanSD : ProcResGroup<[VulcanP3]>;
+def THX2T99SD : ProcResGroup<[THX2T99P3]>;
 
 // Load/store micro-ops on ports 4 and 5.
-def VulcanLS01 : ProcResGroup<[VulcanP4, VulcanP5]>;
+def THX2T99LS01 : ProcResGroup<[THX2T99P4, THX2T99P5]>;
 
 // 60 entry unified scheduler.
-def VulcanAny : ProcResGroup<[VulcanP0, VulcanP1, VulcanP2,
-                              VulcanP3, VulcanP4, VulcanP5]> {
+def THX2T99Any : ProcResGroup<[THX2T99P0, THX2T99P1, THX2T99P2,
+                              THX2T99P3, THX2T99P4, THX2T99P5]> {
   let BufferSize=60;
 }
 
 // Define commonly used write types for InstRW specializations.
-// All definitions follow the format: VulcanWrite_<NumCycles>Cyc_<Resources>.
+// All definitions follow the format: THX2T99Write_<NumCycles>Cyc_<Resources>.
 
 // 3 cycles on I1.
-def VulcanWrite_3Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 3; }
+def THX2T99Write_3Cyc_I1 : SchedWriteRes<[THX2T99I1]> { let Latency = 3; }
 
 // 4 cycles on I1.
-def VulcanWrite_4Cyc_I1 : SchedWriteRes<[VulcanI1]> { let Latency = 4; }
+def THX2T99Write_4Cyc_I1 : SchedWriteRes<[THX2T99I1]> { let Latency = 4; }
 
 // 1 cycle on I0, I1, or I2.
-def VulcanWrite_1Cyc_I012 : SchedWriteRes<[VulcanI012]> { let Latency = 1; }
+def THX2T99Write_1Cyc_I012 : SchedWriteRes<[THX2T99I012]> { let Latency = 1; }
 
 // 5 cycles on F1.
-def VulcanWrite_5Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 5; }
+def THX2T99Write_5Cyc_F1 : SchedWriteRes<[THX2T99F1]> { let Latency = 5; }
 
 // 7 cycles on F1.
-def VulcanWrite_7Cyc_F1 : SchedWriteRes<[VulcanF1]> { let Latency = 7; }
+def THX2T99Write_7Cyc_F1 : SchedWriteRes<[THX2T99F1]> { let Latency = 7; }
 
 // 4 cycles on F0 or F1.
-def VulcanWrite_4Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 4; }
+def THX2T99Write_4Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 4; }
 
 // 5 cycles on F0 or F1.
-def VulcanWrite_5Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 5; }
+def THX2T99Write_5Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 5; }
 
 // 6 cycles on F0 or F1.
-def VulcanWrite_6Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 6; }
+def THX2T99Write_6Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 6; }
 
 // 7 cycles on F0 or F1.
-def VulcanWrite_7Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 7; }
+def THX2T99Write_7Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 7; }
 
 // 8 cycles on F0 or F1.
-def VulcanWrite_8Cyc_F01 : SchedWriteRes<[VulcanF01]> { let Latency = 8; }
+def THX2T99Write_8Cyc_F01 : SchedWriteRes<[THX2T99F01]> { let Latency = 8; }
 
 // 16 cycles on F0 or F1.
-def VulcanWrite_16Cyc_F01 : SchedWriteRes<[VulcanF01]> {
+def THX2T99Write_16Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
   let Latency = 16;
   let ResourceCycles = [8];
 }
 
 // 23 cycles on F0 or F1.
-def VulcanWrite_23Cyc_F01 : SchedWriteRes<[VulcanF01]> {
+def THX2T99Write_23Cyc_F01 : SchedWriteRes<[THX2T99F01]> {
   let Latency = 23;
   let ResourceCycles = [11];
 }
 
 // 1 cycles on LS0 or LS1.
-def VulcanWrite_1Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 1; }
+def THX2T99Write_1Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 1; }
 
 // 4 cycles on LS0 or LS1.
-def VulcanWrite_4Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 4; }
+def THX2T99Write_4Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 4; }
 
 // 5 cycles on LS0 or LS1.
-def VulcanWrite_5Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 5; }
+def THX2T99Write_5Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 5; }
 
 // 6 cycles on LS0 or LS1.
-def VulcanWrite_6Cyc_LS01 : SchedWriteRes<[VulcanLS01]> { let Latency = 6; }
+def THX2T99Write_6Cyc_LS01 : SchedWriteRes<[THX2T99LS01]> { let Latency = 6; }
 
 // 5 cycles on LS0 or LS1 and I0, I1, or I2.
-def VulcanWrite_5Cyc_LS01_I012 : SchedWriteRes<[VulcanLS01, VulcanI012]> {
+def THX2T99Write_5Cyc_LS01_I012 : SchedWriteRes<[THX2T99LS01, THX2T99I012]> {
   let Latency = 5;
   let NumMicroOps = 2;
 }
 
 // 5 cycles on LS0 or LS1 and 2 of I0, I1, or I2.
-def VulcanWrite_6Cyc_LS01_I012_I012 : 
-  SchedWriteRes<[VulcanLS01, VulcanI012, VulcanI012]> {
+def THX2T99Write_6Cyc_LS01_I012_I012 : 
+  SchedWriteRes<[THX2T99LS01, THX2T99I012, THX2T99I012]> {
   let Latency = 6;
   let NumMicroOps = 3;
 }
 
 // 1 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_1Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+def THX2T99Write_1Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
   let Latency = 1;
   let NumMicroOps = 2;
 }
 
 // 5 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_5Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+def THX2T99Write_5Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
   let Latency = 5;
   let NumMicroOps = 2;
 }
 
 // 6 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_6Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+def THX2T99Write_6Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
   let Latency = 6;
   let NumMicroOps = 2;
 }
 
 // 7 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_7Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+def THX2T99Write_7Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
   let Latency = 7;
   let NumMicroOps = 2;
 }
 
 // 8 cycles on LS0 or LS1 and F0 or F1.
-def VulcanWrite_8Cyc_LS01_F01 : SchedWriteRes<[VulcanLS01, VulcanF01]> {
+def THX2T99Write_8Cyc_LS01_F01 : SchedWriteRes<[THX2T99LS01, THX2T99F01]> {
   let Latency = 8;
   let NumMicroOps = 2;
 }
@@ -202,7 +202,7 @@ def : ReadAdvance<ReadVLD,     0>;
 //===----------------------------------------------------------------------===//
 // 3. Instruction Tables.
 
-let SchedModel = VulcanModel in {
+let SchedModel = ThunderX2T99Model in {
 
 //---
 // 3.1 Branch Instructions
@@ -211,7 +211,7 @@ let SchedModel = VulcanModel in {
 // Branch, immed
 // Branch and link, immed
 // Compare and branch
-def : WriteRes<WriteBr,      [VulcanI2]> { let Latency = 1; }
+def : WriteRes<WriteBr,      [THX2T99I2]> { let Latency = 1; }
 
 def : WriteRes<WriteSys,     []> { let Latency = 1; }
 def : WriteRes<WriteBarrier, []> { let Latency = 1; }
@@ -222,7 +222,7 @@ def : WriteRes<WriteAtomic,  []> { let Unsupported = 1; }
 // Branch, register
 // Branch and link, register != LR
 // Branch and link, register = LR
-def : WriteRes<WriteBrReg,   [VulcanI2]> { let Latency = 1; }
+def : WriteRes<WriteBrReg,   [THX2T99I2]> { let Latency = 1; }
 
 //---
 // 3.2 Arithmetic and Logical Instructions
@@ -233,25 +233,25 @@ def : WriteRes<WriteBrReg,   [VulcanI2]> { let Latency = 1; }
 // Conditional compare
 // Conditional select
 // Address generation
-def : WriteRes<WriteI,       [VulcanI012]> { let Latency = 1; }
+def : WriteRes<WriteI,       [THX2T99I012]> { let Latency = 1; }
 def : InstRW<[WriteI], (instrs COPY)>;
 
 // ALU, extend and/or shift
-def : WriteRes<WriteISReg,   [VulcanI012]> {
+def : WriteRes<WriteISReg,   [THX2T99I012]> {
   let Latency = 2;
   let ResourceCycles = [2];
 }
 
-def : WriteRes<WriteIEReg,   [VulcanI012]> {
+def : WriteRes<WriteIEReg,   [THX2T99I012]> {
   let Latency = 2;
   let ResourceCycles = [2];
 }
 
 // Move immed
-def : WriteRes<WriteImm,     [VulcanI012]> { let Latency = 1; }
+def : WriteRes<WriteImm,     [THX2T99I012]> { let Latency = 1; }
 
 // Variable shift
-def : WriteRes<WriteIS,      [VulcanI012]> { let Latency = 1; }
+def : WriteRes<WriteIS,      [THX2T99I012]> { let Latency = 1; }
 
 //---
 // 3.4 Divide and Multiply Instructions
@@ -259,33 +259,33 @@ def : WriteRes<WriteIS,      [VulcanI012]> { let Latency = 1; }
 
 // Divide, W-form
 // Latency range of 13-23.  Take the average.
-def : WriteRes<WriteID32,    [VulcanI1]> {
+def : WriteRes<WriteID32,    [THX2T99I1]> {
   let Latency = 18;
   let ResourceCycles = [18];
 }
 
 // Divide, X-form
 // Latency range of 13-39.  Take the average.
-def : WriteRes<WriteID64,    [VulcanI1]> {
+def : WriteRes<WriteID64,    [THX2T99I1]> {
   let Latency = 26;
   let ResourceCycles = [26];
 }
 
 // Multiply accumulate, W-form
-def : WriteRes<WriteIM32,    [VulcanI012]> { let Latency = 5; }
+def : WriteRes<WriteIM32,    [THX2T99I012]> { let Latency = 5; }
 
 // Multiply accumulate, X-form
-def : WriteRes<WriteIM64,    [VulcanI012]> { let Latency = 5; }
+def : WriteRes<WriteIM64,    [THX2T99I012]> { let Latency = 5; }
 
 // Bitfield extract, two reg
-def : WriteRes<WriteExtr,    [VulcanI012]> { let Latency = 1; }
+def : WriteRes<WriteExtr,    [THX2T99I012]> { let Latency = 1; }
 
 // Bitfield move, basic
 // Bitfield move, insert
 // NOTE: Handled by WriteIS.
 
 // Count leading
-def : InstRW<[VulcanWrite_3Cyc_I1], (instregex "^CLS(W|X)r$",
+def : InstRW<[THX2T99Write_3Cyc_I1], (instregex "^CLS(W|X)r$",
                                                "^CLZ(W|X)r$")>;
 
 // Reverse bits/bytes
@@ -300,13 +300,13 @@ def : InstRW<[VulcanWrite_3Cyc_I1], (instregex "^CLS(W|X)r$",
 // Load register, unscaled immed
 // Load register, immed unprivileged
 // Load register, unsigned immed
-def : WriteRes<WriteLD,      [VulcanLS01]> { let Latency = 4; }
+def : WriteRes<WriteLD,      [THX2T99LS01]> { let Latency = 4; }
 
 // Load register, immed post-index
 // NOTE: Handled by WriteLD, WriteI.
 // Load register, immed pre-index
 // NOTE: Handled by WriteLD, WriteAdr.
-def : WriteRes<WriteAdr,     [VulcanI012]> { let Latency = 1; }
+def : WriteRes<WriteAdr,     [THX2T99I012]> { let Latency = 1; }
 
 // Load register offset, basic
 // Load register, register offset, scale by 4/8
@@ -314,15 +314,15 @@ def : WriteRes<WriteAdr,     [VulcanI012]> { let Latency = 1; }
 // Load register offset, extend
 // Load register, register offset, extend, scale by 4/8
 // Load register, register offset, extend, scale by 2
-def VulcanWriteLDIdx : SchedWriteVariant<[
-  SchedVar<ScaledIdxPred, [VulcanWrite_6Cyc_LS01_I012_I012]>,
-  SchedVar<NoSchedPred,   [VulcanWrite_5Cyc_LS01_I012]>]>;
-def : SchedAlias<WriteLDIdx, VulcanWriteLDIdx>;
+def THX2T99WriteLDIdx : SchedWriteVariant<[
+  SchedVar<ScaledIdxPred, [THX2T99Write_6Cyc_LS01_I012_I012]>,
+  SchedVar<NoSchedPred,   [THX2T99Write_5Cyc_LS01_I012]>]>;
+def : SchedAlias<WriteLDIdx, THX2T99WriteLDIdx>;
 
-def VulcanReadAdrBase : SchedReadVariant<[
+def THX2T99ReadAdrBase : SchedReadVariant<[
   SchedVar<ScaledIdxPred, [ReadDefault]>,
   SchedVar<NoSchedPred,   [ReadDefault]>]>;
-def : SchedAlias<ReadAdrBase, VulcanReadAdrBase>;
+def : SchedAlias<ReadAdrBase, THX2T99ReadAdrBase>;
 
 // Load pair, immed offset, normal
 // Load pair, immed offset, signed words, base != SP
@@ -347,7 +347,7 @@ def : WriteRes<WriteLDHi,    []> {
 // Store register, unscaled immed
 // Store register, immed unprivileged
 // Store register, unsigned immed
-def : WriteRes<WriteST,      [VulcanLS01, VulcanSD]> {
+def : WriteRes<WriteST,      [THX2T99LS01, THX2T99SD]> {
   let Latency = 1;
   let NumMicroOps = 2;
 }
@@ -364,14 +364,14 @@ def : WriteRes<WriteST,      [VulcanLS01, VulcanSD]> {
 // Store register, register offset, extend
 // Store register, register offset, extend, scale by 4/8
 // Store register, register offset, extend, scale by 1
-def : WriteRes<WriteSTIdx, [VulcanLS01, VulcanSD, VulcanI012]> {
+def : WriteRes<WriteSTIdx, [THX2T99LS01, THX2T99SD, THX2T99I012]> {
   let Latency = 1;
   let NumMicroOps = 3;
 }
 
 // Store pair, immed offset, W-form
 // Store pair, immed offset, X-form
-def : WriteRes<WriteSTP,     [VulcanLS01, VulcanSD]> {
+def : WriteRes<WriteSTP,     [THX2T99LS01, THX2T99SD]> {
   let Latency = 1;
   let NumMicroOps = 2;
 }
@@ -389,35 +389,35 @@ def : WriteRes<WriteSTP,     [VulcanLS01, VulcanSD]> {
 // FP absolute value
 // FP min/max
 // FP negate
-def : WriteRes<WriteF,       [VulcanF01]> { let Latency = 5; }
+def : WriteRes<WriteF,       [THX2T99F01]> { let Latency = 5; }
 
 // FP arithmetic
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADD", "^FSUB")>;
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FADD", "^FSUB")>;
 
 // FP compare
-def : WriteRes<WriteFCmp,    [VulcanF01]> { let Latency = 5; }
+def : WriteRes<WriteFCmp,    [THX2T99F01]> { let Latency = 5; }
 
 // FP divide, S-form
 // FP square root, S-form
-def : WriteRes<WriteFDiv,    [VulcanF01]> {
+def : WriteRes<WriteFDiv,    [THX2T99F01]> {
   let Latency = 16;
   let ResourceCycles = [8];
 }
 
 // FP divide, D-form
 // FP square root, D-form
-def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVDrr, FSQRTDr)>;
+def : InstRW<[THX2T99Write_23Cyc_F01], (instrs FDIVDrr, FSQRTDr)>;
 
 // FP multiply
 // FP multiply accumulate
-def : WriteRes<WriteFMul, [VulcanF01]> { let Latency = 6; }
+def : WriteRes<WriteFMul, [THX2T99F01]> { let Latency = 6; }
 
 // FP round to integral
-def : InstRW<[VulcanWrite_7Cyc_F01],
+def : InstRW<[THX2T99Write_7Cyc_F01],
             (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>;
 
 // FP select
-def : InstRW<[VulcanWrite_4Cyc_F01], (instregex "^FCSEL")>;
+def : InstRW<[THX2T99Write_4Cyc_F01], (instregex "^FCSEL")>;
 
 //---
 // 3.9 FP Miscellaneous Instructions
@@ -426,16 +426,16 @@ def : InstRW<[VulcanWrite_4Cyc_F01], (instregex "^FCSEL")>;
 // FP convert, from vec to vec reg
 // FP convert, from gen to vec reg
 // FP convert, from vec to gen reg
-def : WriteRes<WriteFCvt, [VulcanF01]> { let Latency = 7; }
+def : WriteRes<WriteFCvt, [THX2T99F01]> { let Latency = 7; }
 
 // FP move, immed
 // FP move, register
-def : WriteRes<WriteFImm, [VulcanF01]> { let Latency = 4; }
+def : WriteRes<WriteFImm, [THX2T99F01]> { let Latency = 4; }
 
 // FP transfer, from gen to vec reg
 // FP transfer, from vec to gen reg
-def : WriteRes<WriteFCopy, [VulcanF01]> { let Latency = 4; }
-def : InstRW<[VulcanWrite_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
+def : WriteRes<WriteFCopy, [THX2T99F01]> { let Latency = 4; }
+def : InstRW<[THX2T99Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
 
 //---
 // 3.12 ASIMD Integer Instructions
@@ -470,39 +470,39 @@ def : InstRW<[VulcanWrite_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
 // ASIMD shift by register, basic, Q-form
 // ASIMD shift by register, complex, D-form
 // ASIMD shift by register, complex, Q-form
-def : WriteRes<WriteV, [VulcanF01]> { let Latency = 7; }
+def : WriteRes<WriteV, [THX2T99F01]> { let Latency = 7; }
 
 // ASIMD arith, reduce, 4H/4S
 // ASIMD arith, reduce, 8B/8H
 // ASIMD arith, reduce, 16B
-def : InstRW<[VulcanWrite_5Cyc_F01], 
+def : InstRW<[THX2T99Write_5Cyc_F01], 
             (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>;
 
 // ASIMD logical (MOV, MVN, ORN, ORR)
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ORRv", "^ORNv", "^NOTv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^ORRv", "^ORNv", "^NOTv")>;
 
 // ASIMD polynomial (8x8) multiply long
-def : InstRW<[VulcanWrite_5Cyc_F01], (instrs PMULLv8i8, PMULLv16i8)>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instrs PMULLv8i8, PMULLv16i8)>;
 
 //---
 // 3.13 ASIMD Floating-point Instructions
 //---
 
 // ASIMD FP absolute value
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FABSv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FABSv")>;
 
 // ASIMD FP arith, normal, D-form
 // ASIMD FP arith, normal, Q-form
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FABDv", "^FADDv", "^FSUBv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FABDv", "^FADDv", "^FSUBv")>;
 
 // ASIMD FP arith,pairwise, D-form
 // ASIMD FP arith, pairwise, Q-form
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FADDPv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FADDPv")>;
 
 // ASIMD FP compare, D-form
 // ASIMD FP compare, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>;
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv",
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FACGEv", "^FACGTv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv",
                                                 "^FCMGTv", "^FCMLEv",
                                                 "^FCMLTv")>;
 
@@ -513,42 +513,42 @@ def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FCMEQv", "^FCMGEv",
 // NOTE: Handled by WriteV.
 
 // ASIMD FP divide, D-form, F32
-def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv2f32)>;
+def : InstRW<[THX2T99Write_16Cyc_F01], (instrs FDIVv2f32)>;
 
 // ASIMD FP divide, Q-form, F32
-def : InstRW<[VulcanWrite_16Cyc_F01], (instrs FDIVv4f32)>;
+def : InstRW<[THX2T99Write_16Cyc_F01], (instrs FDIVv4f32)>;
 
 // ASIMD FP divide, Q-form, F64
-def : InstRW<[VulcanWrite_23Cyc_F01], (instrs FDIVv2f64)>;
+def : InstRW<[THX2T99Write_23Cyc_F01], (instrs FDIVv2f64)>;
 
 // ASIMD FP max/min, normal, D-form
 // ASIMD FP max/min, normal, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXv", "^FMAXNMv",
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXv", "^FMAXNMv",
                                                 "^FMINv", "^FMINNMv")>;
 
 // ASIMD FP max/min, pairwise, D-form
 // ASIMD FP max/min, pairwise, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXPv", "^FMAXNMPv",
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXPv", "^FMAXNMPv",
                                                 "^FMINPv", "^FMINNMPv")>;
 
 // ASIMD FP max/min, reduce
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv",
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMAXVv", "^FMAXNMVv",
                                                 "^FMINVv", "^FMINNMVv")>;
 
 // ASIMD FP multiply, D-form, FZ
 // ASIMD FP multiply, D-form, no FZ
 // ASIMD FP multiply, Q-form, FZ
 // ASIMD FP multiply, Q-form, no FZ
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FMULv", "^FMULXv")>;
 
 // ASIMD FP multiply accumulate, Dform, FZ
 // ASIMD FP multiply accumulate, Dform, no FZ
 // ASIMD FP multiply accumulate, Qform, FZ
 // ASIMD FP multiply accumulate, Qform, no FZ
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FMLAv", "^FMLSv")>;
 
 // ASIMD FP negate
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FNEGv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FNEGv")>;
 
 // ASIMD FP round, D-form
 // ASIMD FP round, Q-form
@@ -559,39 +559,39 @@ def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FNEGv")>;
 //--
 
 // ASIMD bit reverse
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^RBITv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^RBITv")>;
 
 // ASIMD bitwise insert, D-form
 // ASIMD bitwise insert, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^BIFv", "^BITv", "^BSLv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^BIFv", "^BITv", "^BSLv")>;
 
 // ASIMD count, D-form
 // ASIMD count, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^CLSv", "^CLZv", "^CNTv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^CLSv", "^CLZv", "^CNTv")>;
 
 // ASIMD duplicate, gen reg
 // ASIMD duplicate, element
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^DUPv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^DUPv")>;
 
 // ASIMD extract
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^EXTv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^EXTv")>;
 
 // ASIMD extract narrow
 // ASIMD extract narrow, saturating
 // NOTE: Handled by WriteV.
 
 // ASIMD insert, element to element
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>;
 
 // ASIMD move, integer immed
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^MOVIv", "^MOVIDv")>;
 
 // ASIMD move, FP immed
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^FMOVv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^FMOVv")>;
 
 // ASIMD reciprocal estimate, D-form
 // ASIMD reciprocal estimate, Q-form
-def : InstRW<[VulcanWrite_5Cyc_F01], 
+def : InstRW<[THX2T99Write_5Cyc_F01], 
             (instregex "^FRECPEv", "^FRECPXv", "^URECPEv",
                                    "^FRSQRTEv", "^URSQRTEv")>;
 
@@ -599,31 +599,31 @@ def : InstRW<[VulcanWrite_5Cyc_F01],
 // ASIMD reciprocal step, D-form, no FZ
 // ASIMD reciprocal step, Q-form, FZ
 // ASIMD reciprocal step, Q-form, no FZ
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^FRECPSv", "^FRSQRTSv")>;
 
 // ASIMD reverse
-def : InstRW<[VulcanWrite_5Cyc_F01], 
+def : InstRW<[THX2T99Write_5Cyc_F01], 
             (instregex "^REV16v", "^REV32v", "^REV64v")>;
 
 // ASIMD table lookup, D-form
 // ASIMD table lookup, Q-form
-def : InstRW<[VulcanWrite_8Cyc_F01], (instregex "^TBLv", "^TBXv")>;
+def : InstRW<[THX2T99Write_8Cyc_F01], (instregex "^TBLv", "^TBXv")>;
 
 // ASIMD transfer, element to word or word
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^UMOVv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^UMOVv")>;
 
 // ASIMD transfer, element to gen reg
-def : InstRW<[VulcanWrite_6Cyc_F01], (instregex "^SMOVv", "^UMOVv")>;
+def : InstRW<[THX2T99Write_6Cyc_F01], (instregex "^SMOVv", "^UMOVv")>;
 
 // ASIMD transfer gen reg to element
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^INSv")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^INSv")>;
 
 // ASIMD transpose
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^TRN1v", "^TRN2v",
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^TRN1v", "^TRN2v",
                                                 "^UZP1v", "^UZP2v")>;
 
 // ASIMD unzip/zip
-def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>;
+def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>;
 
 //--
 // 3.15 ASIMD Load Instructions 
@@ -631,114 +631,114 @@ def : InstRW<[VulcanWrite_5Cyc_F01], (instregex "^ZIP1v", "^ZIP2v")>;
 
 // ASIMD load, 1 element, multiple, 1 reg, D-form
 // ASIMD load, 1 element, multiple, 1 reg, Q-form
-def : InstRW<[VulcanWrite_4Cyc_LS01], 
+def : InstRW<[THX2T99Write_4Cyc_LS01], 
             (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr], 
+def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr], 
             (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 1 element, multiple, 2 reg, D-form
 // ASIMD load, 1 element, multiple, 2 reg, Q-form
-def : InstRW<[VulcanWrite_4Cyc_LS01], 
+def : InstRW<[THX2T99Write_4Cyc_LS01], 
             (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_4Cyc_LS01, WriteAdr], 
+def : InstRW<[THX2T99Write_4Cyc_LS01, WriteAdr], 
             (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 1 element, multiple, 3 reg, D-form
 // ASIMD load, 1 element, multiple, 3 reg, Q-form
-def : InstRW<[VulcanWrite_5Cyc_LS01], 
+def : InstRW<[THX2T99Write_5Cyc_LS01], 
             (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01, WriteAdr], 
+def : InstRW<[THX2T99Write_5Cyc_LS01, WriteAdr], 
             (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 1 element, multiple, 4 reg, D-form
 // ASIMD load, 1 element, multiple, 4 reg, Q-form
-def : InstRW<[VulcanWrite_6Cyc_LS01], 
+def : InstRW<[THX2T99Write_6Cyc_LS01], 
             (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_6Cyc_LS01, WriteAdr], 
+def : InstRW<[THX2T99Write_6Cyc_LS01, WriteAdr], 
             (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 1 element, one lane, B/H/S
 // ASIMD load, 1 element, one lane, D
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], 
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD1i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], 
             (instregex "^LD1i(8|16|32|64)_POST$")>;
 
 // ASIMD load, 1 element, all lanes, D-form, B/H/S
 // ASIMD load, 1 element, all lanes, D-form, D
 // ASIMD load, 1 element, all lanes, Q-form
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01], 
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01], 
             (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], 
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], 
             (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 2 element, multiple, D-form, B/H/S
 // ASIMD load, 2 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01], 
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01], 
             (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], 
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], 
             (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 2 element, one lane, B/H
 // ASIMD load, 2 element, one lane, S
 // ASIMD load, 2 element, one lane, D
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], 
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01], (instregex "^LD2i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], 
             (instregex "^LD2i(8|16|32|64)_POST$")>;
 
 // ASIMD load, 2 element, all lanes, D-form, B/H/S
 // ASIMD load, 2 element, all lanes, D-form, D
 // ASIMD load, 2 element, all lanes, Q-form
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01], 
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01], 
             (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_5Cyc_LS01_F01, WriteAdr], 
+def : InstRW<[THX2T99Write_5Cyc_LS01_F01, WriteAdr], 
             (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 3 element, multiple, D-form, B/H/S
 // ASIMD load, 3 element, multiple, Q-form, B/H/S
 // ASIMD load, 3 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_8Cyc_LS01_F01], 
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01], 
             (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr], 
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr], 
             (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 3 element, one lone, B/H
 // ASIMD load, 3 element, one lane, S
 // ASIMD load, 3 element, one lane, D
-def : InstRW<[VulcanWrite_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr], 
+def : InstRW<[THX2T99Write_7Cyc_LS01_F01], (instregex "^LD3i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr], 
             (instregex "^LD3i(8|16|32|64)_POST$")>;
 
 // ASIMD load, 3 element, all lanes, D-form, B/H/S
 // ASIMD load, 3 element, all lanes, D-form, D
 // ASIMD load, 3 element, all lanes, Q-form, B/H/S
 // ASIMD load, 3 element, all lanes, Q-form, D
-def : InstRW<[VulcanWrite_7Cyc_LS01_F01], 
+def : InstRW<[THX2T99Write_7Cyc_LS01_F01], 
             (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_7Cyc_LS01_F01, WriteAdr], 
+def : InstRW<[THX2T99Write_7Cyc_LS01_F01, WriteAdr], 
             (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 4 element, multiple, D-form, B/H/S
 // ASIMD load, 4 element, multiple, Q-form, B/H/S
 // ASIMD load, 4 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_8Cyc_LS01_F01], 
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01], 
             (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_8Cyc_LS01_F01, WriteAdr], 
+def : InstRW<[THX2T99Write_8Cyc_LS01_F01, WriteAdr], 
             (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
 
 // ASIMD load, 4 element, one lane, B/H
 // ASIMD load, 4 element, one lane, S
 // ASIMD load, 4 element, one lane, D
-def : InstRW<[VulcanWrite_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr], 
+def : InstRW<[THX2T99Write_6Cyc_LS01_F01], (instregex "^LD4i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr], 
             (instregex "^LD4i(8|16|32|64)_POST$")>;
 
 // ASIMD load, 4 element, all lanes, D-form, B/H/S
 // ASIMD load, 4 element, all lanes, D-form, D
 // ASIMD load, 4 element, all lanes, Q-form, B/H/S
 // ASIMD load, 4 element, all lanes, Q-form, D
-def : InstRW<[VulcanWrite_6Cyc_LS01_F01], 
+def : InstRW<[THX2T99Write_6Cyc_LS01_F01], 
             (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr], 
+def : InstRW<[THX2T99Write_6Cyc_LS01_F01, WriteAdr], 
             (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 //--
@@ -747,82 +747,82 @@ def : InstRW<[VulcanWrite_6Cyc_LS01_F01, WriteAdr],
 
 // ASIMD store, 1 element, multiple, 1 reg, D-form
 // ASIMD store, 1 element, multiple, 1 reg, Q-form
-def : InstRW<[VulcanWrite_1Cyc_LS01], 
+def : InstRW<[THX2T99Write_1Cyc_LS01], 
             (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], 
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], 
             (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store, 1 element, multiple, 2 reg, D-form
 // ASIMD store, 1 element, multiple, 2 reg, Q-form
-def : InstRW<[VulcanWrite_1Cyc_LS01], 
+def : InstRW<[THX2T99Write_1Cyc_LS01], 
             (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], 
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], 
             (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store, 1 element, multiple, 3 reg, D-form
 // ASIMD store, 1 element, multiple, 3 reg, Q-form
-def : InstRW<[VulcanWrite_1Cyc_LS01], 
+def : InstRW<[THX2T99Write_1Cyc_LS01], 
             (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], 
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], 
             (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store, 1 element, multiple, 4 reg, D-form
 // ASIMD store, 1 element, multiple, 4 reg, Q-form
-def : InstRW<[VulcanWrite_1Cyc_LS01], 
+def : InstRW<[THX2T99Write_1Cyc_LS01], 
             (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01, WriteAdr], 
+def : InstRW<[THX2T99Write_1Cyc_LS01, WriteAdr], 
             (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store, 1 element, one lane, B/H/S
 // ASIMD store, 1 element, one lane, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01], 
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01], 
             (instregex "^ST1i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], 
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], 
             (instregex "^ST1i(8|16|32|64)_POST$")>;
 
 // ASIMD store, 2 element, multiple, D-form, B/H/S
 // ASIMD store, 2 element, multiple, Q-form, B/H/S
 // ASIMD store, 2 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01], 
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01], 
             (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], 
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], 
             (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store, 2 element, one lane, B/H/S
 // ASIMD store, 2 element, one lane, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01], 
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01], 
             (instregex "^ST2i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], 
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], 
             (instregex "^ST2i(8|16|32|64)_POST$")>;
 
 // ASIMD store, 3 element, multiple, D-form, B/H/S
 // ASIMD store, 3 element, multiple, Q-form, B/H/S
 // ASIMD store, 3 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01], 
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01], 
             (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], 
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], 
             (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store, 3 element, one lane, B/H
 // ASIMD store, 3 element, one lane, S
 // ASIMD store, 3 element, one lane, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], 
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST3i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], 
             (instregex "^ST3i(8|16|32|64)_POST$")>;
 
 // ASIMD store, 4 element, multiple, D-form, B/H/S
 // ASIMD store, 4 element, multiple, Q-form, B/H/S
 // ASIMD store, 4 element, multiple, Q-form, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01], 
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01], 
             (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], 
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], 
             (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
 
 // ASIMD store, 4 element, one lane, B/H
 // ASIMD store, 4 element, one lane, S
 // ASIMD store, 4 element, one lane, D
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>;
-def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr], 
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01], (instregex "^ST4i(8|16|32|64)$")>;
+def : InstRW<[THX2T99Write_1Cyc_LS01_F01, WriteAdr], 
             (instregex "^ST4i(8|16|32|64)_POST$")>;
 
 //--
@@ -830,23 +830,23 @@ def : InstRW<[VulcanWrite_1Cyc_LS01_F01, WriteAdr],
 //--
 
 // Crypto AES ops
-def : InstRW<[VulcanWrite_5Cyc_F1], (instregex "^AES")>;
+def : InstRW<[THX2T99Write_5Cyc_F1], (instregex "^AES")>;
 
 // Crypto polynomial (64x64) multiply long
-def : InstRW<[VulcanWrite_5Cyc_F1], (instrs PMULLv1i64, PMULLv2i64)>;
+def : InstRW<[THX2T99Write_5Cyc_F1], (instrs PMULLv1i64, PMULLv2i64)>;
 
 // Crypto SHA1 xor ops
 // Crypto SHA1 schedule acceleration ops
 // Crypto SHA256 schedule acceleration op (1 u-op)
 // Crypto SHA256 schedule acceleration op (2 u-ops)
 // Crypto SHA256 hash acceleration ops
-def : InstRW<[VulcanWrite_7Cyc_F1], (instregex "^SHA")>;
+def : InstRW<[THX2T99Write_7Cyc_F1], (instregex "^SHA")>;
 
 //--
 // 3.18 CRC
 //--
 
 // CRC checksum ops
-def : InstRW<[VulcanWrite_4Cyc_I1], (instregex "^CRC32")>;
+def : InstRW<[THX2T99Write_4Cyc_I1], (instregex "^CRC32")>;
 
-} // SchedModel = VulcanModel
+} // SchedModel = ThunderX2T99Model
diff --git a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index 66a8f332513a738d3ac42faa3276624e2c8207ad..7f5507371fa0341bf4af04dc999db3dc3469724c 100644
--- a/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -42,10 +42,12 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
     Entry.Node = Size;
     Args.push_back(Entry);
     TargetLowering::CallLoweringInfo CLI(DAG);
-    CLI.setDebugLoc(dl).setChain(Chain)
-      .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
-                 DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args))
-      .setDiscardResult();
+    CLI.setDebugLoc(dl)
+        .setChain(Chain)
+        .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+                      DAG.getExternalSymbol(bzeroEntry, IntPtr),
+                      std::move(Args))
+        .setDiscardResult();
     std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
     return CallResult.second;
   }
@@ -53,7 +55,5 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
 }
 bool AArch64SelectionDAGInfo::generateFMAsInMachineCombiner(
     CodeGenOpt::Level OptLevel) const {
-  if (OptLevel >= CodeGenOpt::Aggressive)
-    return true;
-  return false;
+  return OptLevel >= CodeGenOpt::Aggressive;
 }
diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp
index 03e01329e0367604f32984441caaee241d2e1256..b3aba4781db8954aa33b845cfedf7bb1e30e0b6b 100644
--- a/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -81,8 +81,22 @@ void AArch64Subtarget::initializeProperties() {
     MinPrefetchStride = 1024;
     MaxPrefetchIterationsAhead = 11;
     break;
-  case Vulcan:
+  case ThunderX2T99:
+    CacheLineSize = 64;
+    PrefFunctionAlignment = 3;
+    PrefLoopAlignment = 2;
     MaxInterleaveFactor = 4;
+    PrefetchDistance = 128;
+    MinPrefetchStride = 1024;
+    MaxPrefetchIterationsAhead = 4;
+    break;
+  case ThunderX:
+  case ThunderXT88:
+  case ThunderXT81:
+  case ThunderXT83:
+    CacheLineSize = 128;
+    PrefFunctionAlignment = 3;
+    PrefLoopAlignment = 2;
     break;
   case CortexA35: break;
   case CortexA53: break;
@@ -133,9 +147,9 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV,
   if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
     return AArch64II::MO_GOT;
 
-  // The small code mode's direct accesses use ADRP, which cannot necessarily
-  // produce the value 0 (if the code is above 4GB).
-  if (TM.getCodeModel() == CodeModel::Small && GV->hasExternalWeakLinkage())
+  // The small code model's direct accesses use ADRP, which cannot
+  // necessarily produce the value 0 (if the code is above 4GB).
+  if (useSmallAddressing() && GV->hasExternalWeakLinkage())
     return AArch64II::MO_GOT;
 
   return AArch64II::MO_NO_FLAG;
diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h
index 48bcdbc82d09cbce3b4b0263664686e4bc2d7f39..40ad9185012cbfb114835470addd290dc271efaf 100644
--- a/lib/Target/AArch64/AArch64Subtarget.h
+++ b/lib/Target/AArch64/AArch64Subtarget.h
@@ -45,7 +45,11 @@ public:
     ExynosM1,
     Falkor,
     Kryo,
-    Vulcan
+    ThunderX2T99,
+    ThunderX,
+    ThunderXT81,
+    ThunderXT83,
+    ThunderXT88
   };
 
 protected:
@@ -65,6 +69,7 @@ protected:
   bool HasPerfMon = false;
   bool HasFullFP16 = false;
   bool HasSPE = false;
+  bool HasLSLFast = false;
 
   // HasZeroCycleRegMove - Has zero-cycle register mov instructions.
   bool HasZeroCycleRegMove = false;
@@ -74,6 +79,10 @@ protected:
 
   // StrictAlign - Disallow unaligned memory accesses.
   bool StrictAlign = false;
+
+  // NegativeImmediates - transform instructions with negative immediates
+  bool NegativeImmediates = true;
+
   bool UseAA = false;
   bool PredictableSelectIsExpensive = false;
   bool BalanceFPOps = false;
@@ -84,6 +93,8 @@ protected:
   bool UseAlternateSExtLoadCVTF32Pattern = false;
   bool HasArithmeticBccFusion = false;
   bool HasArithmeticCbzFusion = false;
+  bool HasFuseAES = false;
+  bool HasFuseLiterals = false;
   bool DisableLatencySchedHeuristic = false;
   bool UseRSqrt = false;
   uint8_t MaxInterleaveFactor = 2;
@@ -197,6 +208,8 @@ public:
   }
   bool hasArithmeticBccFusion() const { return HasArithmeticBccFusion; }
   bool hasArithmeticCbzFusion() const { return HasArithmeticCbzFusion; }
+  bool hasFuseAES() const { return HasFuseAES; }
+  bool hasFuseLiterals() const { return HasFuseLiterals; }
   bool useRSqrt() const { return UseRSqrt; }
   unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
   unsigned getVectorInsertExtractBaseCost() const {
@@ -220,6 +233,7 @@ public:
   bool hasPerfMon() const { return HasPerfMon; }
   bool hasFullFP16() const { return HasFullFP16; }
   bool hasSPE() const { return HasSPE; }
+  bool hasLSLFast() const { return HasLSLFast; }
 
   bool isLittleEndian() const { return IsLittle; }
 
@@ -228,6 +242,7 @@ public:
   bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
   bool isTargetWindows() const { return TargetTriple.isOSWindows(); }
   bool isTargetAndroid() const { return TargetTriple.isAndroid(); }
+  bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); }
 
   bool isTargetCOFF() const { return TargetTriple.isOSBinFormatCOFF(); }
   bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
@@ -235,9 +250,17 @@ public:
 
   bool useAA() const override { return UseAA; }
 
-  /// getMaxInlineSizeThreshold - Returns the maximum memset / memcpy size
-  /// that still makes it profitable to inline the call.
-  unsigned getMaxInlineSizeThreshold() const { return 64; }
+  bool useSmallAddressing() const {
+    switch (TLInfo.getTargetMachine().getCodeModel()) {
+      case CodeModel::Kernel:
+        // Kernel is currently allowed only for Fuchsia targets,
+        // where it is the same as Small for almost all purposes.
+      case CodeModel::Small:
+        return true;
+      default:
+        return false;
+    }
+  }
 
   /// ParseSubtargetFeatures - Parses features string setting specified
   /// subtarget options.  Definition of function is auto generated by tblgen.
diff --git a/lib/Target/AArch64/AArch64SystemOperands.td b/lib/Target/AArch64/AArch64SystemOperands.td
index a3736c0868fb7745e247d8669492285c95a2cf5a..7c5dcb0853ebaeb34ac74558385233f50fc885de 100644
--- a/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/lib/Target/AArch64/AArch64SystemOperands.td
@@ -18,35 +18,37 @@ include "llvm/TableGen/SearchableTable.td"
 // AT (address translate) instruction options.
 //===----------------------------------------------------------------------===//
 
-class AT<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+class AT<string name, bits<3> op1, bits<4> crn, bits<4> crm,
          bits<3> op2> : SearchableTable {
   let SearchableFields = ["Name", "Encoding"];
   let EnumValueField = "Encoding";
 
   string Name = name;
-  bits<16> Encoding;
-  let Encoding{15-14} = op0;
+  bits<14> Encoding;
   let Encoding{13-11} = op1;
   let Encoding{10-7} = crn;
   let Encoding{6-3} = crm;
   let Encoding{2-0} = op2;
+  code Requires = [{ {} }];
 }
 
-def : AT<"S1E1R",  0b01, 0b000, 0b0111, 0b1000, 0b000>;
-def : AT<"S1E2R",  0b01, 0b100, 0b0111, 0b1000, 0b000>;
-def : AT<"S1E3R",  0b01, 0b110, 0b0111, 0b1000, 0b000>;
-def : AT<"S1E1W",  0b01, 0b000, 0b0111, 0b1000, 0b001>;
-def : AT<"S1E2W",  0b01, 0b100, 0b0111, 0b1000, 0b001>;
-def : AT<"S1E3W",  0b01, 0b110, 0b0111, 0b1000, 0b001>;
-def : AT<"S1E0R",  0b01, 0b000, 0b0111, 0b1000, 0b010>;
-def : AT<"S1E0W",  0b01, 0b000, 0b0111, 0b1000, 0b011>;
-def : AT<"S12E1R", 0b01, 0b100, 0b0111, 0b1000, 0b100>;
-def : AT<"S12E1W", 0b01, 0b100, 0b0111, 0b1000, 0b101>;
-def : AT<"S12E0R", 0b01, 0b100, 0b0111, 0b1000, 0b110>;
-def : AT<"S12E0W", 0b01, 0b100, 0b0111, 0b1000, 0b111>;
-def : AT<"S1E1RP", 0b01, 0b000, 0b0111, 0b1001, 0b000>;
-def : AT<"S1E1WP", 0b01, 0b000, 0b0111, 0b1001, 0b001>;
-
+def : AT<"S1E1R",  0b000, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E2R",  0b100, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E3R",  0b110, 0b0111, 0b1000, 0b000>;
+def : AT<"S1E1W",  0b000, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E2W",  0b100, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E3W",  0b110, 0b0111, 0b1000, 0b001>;
+def : AT<"S1E0R",  0b000, 0b0111, 0b1000, 0b010>;
+def : AT<"S1E0W",  0b000, 0b0111, 0b1000, 0b011>;
+def : AT<"S12E1R", 0b100, 0b0111, 0b1000, 0b100>;
+def : AT<"S12E1W", 0b100, 0b0111, 0b1000, 0b101>;
+def : AT<"S12E0R", 0b100, 0b0111, 0b1000, 0b110>;
+def : AT<"S12E0W", 0b100, 0b0111, 0b1000, 0b111>;
+
+let Requires = [{ {AArch64::HasV8_2aOps} }] in {
+def : AT<"S1E1RP", 0b000, 0b0111, 0b1001, 0b000>;
+def : AT<"S1E1WP", 0b000, 0b0111, 0b1001, 0b001>;
+}
 
 //===----------------------------------------------------------------------===//
 // DMB/DSB (data barrier) instruction options.
@@ -77,28 +79,31 @@ def : DB<"sy",    0xf>;
 // DC (data cache maintenance) instruction options.
 //===----------------------------------------------------------------------===//
 
-class DC<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+class DC<string name, bits<3> op1, bits<4> crn, bits<4> crm,
          bits<3> op2> : SearchableTable {
   let SearchableFields = ["Name", "Encoding"];
   let EnumValueField = "Encoding";
 
   string Name = name;
-  bits<16> Encoding;
-  let Encoding{15-14} = op0;
+  bits<14> Encoding;
   let Encoding{13-11} = op1;
   let Encoding{10-7} = crn;
   let Encoding{6-3} = crm;
   let Encoding{2-0} = op2;
+  code Requires = [{ {} }];
 }
 
-def : DC<"ZVA",   0b01, 0b011, 0b0111, 0b0100, 0b001>;
-def : DC<"IVAC",  0b01, 0b000, 0b0111, 0b0110, 0b001>;
-def : DC<"ISW",   0b01, 0b000, 0b0111, 0b0110, 0b010>;
-def : DC<"CVAC",  0b01, 0b011, 0b0111, 0b1010, 0b001>;
-def : DC<"CSW",   0b01, 0b000, 0b0111, 0b1010, 0b010>;
-def : DC<"CVAU",  0b01, 0b011, 0b0111, 0b1011, 0b001>;
-def : DC<"CIVAC", 0b01, 0b011, 0b0111, 0b1110, 0b001>;
-def : DC<"CISW",  0b01, 0b000, 0b0111, 0b1110, 0b010>;
+def : DC<"ZVA",   0b011, 0b0111, 0b0100, 0b001>;
+def : DC<"IVAC",  0b000, 0b0111, 0b0110, 0b001>;
+def : DC<"ISW",   0b000, 0b0111, 0b0110, 0b010>;
+def : DC<"CVAC",  0b011, 0b0111, 0b1010, 0b001>;
+def : DC<"CSW",   0b000, 0b0111, 0b1010, 0b010>;
+def : DC<"CVAU",  0b011, 0b0111, 0b1011, 0b001>;
+def : DC<"CIVAC", 0b011, 0b0111, 0b1110, 0b001>;
+def : DC<"CISW",  0b000, 0b0111, 0b1110, 0b010>;
+
+let Requires = [{ {AArch64::HasV8_2aOps} }] in
+def : DC<"CVAP",  0b011, 0b0111, 0b1100, 0b001>;
 
 //===----------------------------------------------------------------------===//
 // IC (instruction cache maintenance) instruction options.
@@ -120,7 +125,7 @@ class IC<string name, bits<3> op1, bits<4> crn, bits<4> crm, bits<3> op2,
 
 def : IC<"IALLUIS", 0b000, 0b0111, 0b0001, 0b000, 0>;
 def : IC<"IALLU",   0b000, 0b0111, 0b0101, 0b000, 0>;
-def : IC<"IVAU",    0b000, 0b0111, 0b0001, 0b000, 1>;
+def : IC<"IVAU",    0b011, 0b0111, 0b0101, 0b001, 1>;
 
 //===----------------------------------------------------------------------===//
 // ISB (instruction-fetch barrier) instruction options.
@@ -213,14 +218,13 @@ def : PSB<"csync", 0x11>;
 // TLBI (translation lookaside buffer invalidate) instruction options.
 //===----------------------------------------------------------------------===//
 
-class TLBI<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
+class TLBI<string name, bits<3> op1, bits<4> crn, bits<4> crm,
              bits<3> op2, bit needsreg = 1> : SearchableTable {
   let SearchableFields = ["Name", "Encoding"];
   let EnumValueField = "Encoding";
 
   string Name = name;
-  bits<16> Encoding;
-  let Encoding{15-14} = op0;
+  bits<14> Encoding;
   let Encoding{13-11} = op1;
   let Encoding{10-7} = crn;
   let Encoding{6-3} = crm;
@@ -228,38 +232,38 @@ class TLBI<string name, bits<2> op0, bits<3> op1, bits<4> crn, bits<4> crm,
   bit NeedsReg = needsreg;
 }
 
-def : TLBI<"IPAS2E1IS",    0b01, 0b100, 0b1000, 0b0000, 0b001>;
-def : TLBI<"IPAS2LE1IS",   0b01, 0b100, 0b1000, 0b0000, 0b101>;
-def : TLBI<"VMALLE1IS",    0b01, 0b000, 0b1000, 0b0011, 0b000, 0>;
-def : TLBI<"ALLE2IS",      0b01, 0b100, 0b1000, 0b0011, 0b000, 0>;
-def : TLBI<"ALLE3IS",      0b01, 0b110, 0b1000, 0b0011, 0b000, 0>;
-def : TLBI<"VAE1IS",       0b01, 0b000, 0b1000, 0b0011, 0b001>;
-def : TLBI<"VAE2IS",       0b01, 0b100, 0b1000, 0b0011, 0b001>;
-def : TLBI<"VAE3IS",       0b01, 0b110, 0b1000, 0b0011, 0b001>;
-def : TLBI<"ASIDE1IS",     0b01, 0b000, 0b1000, 0b0011, 0b010>;
-def : TLBI<"VAAE1IS",      0b01, 0b000, 0b1000, 0b0011, 0b011>;
-def : TLBI<"ALLE1IS",      0b01, 0b100, 0b1000, 0b0011, 0b100, 0>;
-def : TLBI<"VALE1IS",      0b01, 0b000, 0b1000, 0b0011, 0b101>;
-def : TLBI<"VALE2IS",      0b01, 0b100, 0b1000, 0b0011, 0b101>;
-def : TLBI<"VALE3IS",      0b01, 0b110, 0b1000, 0b0011, 0b101>;
-def : TLBI<"VMALLS12E1IS", 0b01, 0b100, 0b1000, 0b0011, 0b110, 0>;
-def : TLBI<"VAALE1IS",     0b01, 0b000, 0b1000, 0b0011, 0b111>;
-def : TLBI<"IPAS2E1",      0b01, 0b100, 0b1000, 0b0100, 0b001>;
-def : TLBI<"IPAS2LE1",     0b01, 0b100, 0b1000, 0b0100, 0b101>;
-def : TLBI<"VMALLE1",      0b01, 0b000, 0b1000, 0b0111, 0b000, 0>;
-def : TLBI<"ALLE2",        0b01, 0b100, 0b1000, 0b0111, 0b000, 0>;
-def : TLBI<"ALLE3",        0b01, 0b110, 0b1000, 0b0111, 0b000, 0>;
-def : TLBI<"VAE1",         0b01, 0b000, 0b1000, 0b0111, 0b001>;
-def : TLBI<"VAE2",         0b01, 0b100, 0b1000, 0b0111, 0b001>;
-def : TLBI<"VAE3",         0b01, 0b110, 0b1000, 0b0111, 0b001>;
-def : TLBI<"ASIDE1",       0b01, 0b000, 0b1000, 0b0111, 0b010>;
-def : TLBI<"VAAE1",        0b01, 0b000, 0b1000, 0b0111, 0b011>;
-def : TLBI<"ALLE1",        0b01, 0b100, 0b1000, 0b0111, 0b100, 0>;
-def : TLBI<"VALE1",        0b01, 0b000, 0b1000, 0b0111, 0b101>;
-def : TLBI<"VALE2",        0b01, 0b100, 0b1000, 0b0111, 0b101>;
-def : TLBI<"VALE3",        0b01, 0b110, 0b1000, 0b0111, 0b101>;
-def : TLBI<"VMALLS12E1",   0b01, 0b100, 0b1000, 0b0111, 0b110, 0>;
-def : TLBI<"VAALE1",       0b01, 0b000, 0b1000, 0b0111, 0b111>;
+def : TLBI<"IPAS2E1IS",    0b100, 0b1000, 0b0000, 0b001>;
+def : TLBI<"IPAS2LE1IS",   0b100, 0b1000, 0b0000, 0b101>;
+def : TLBI<"VMALLE1IS",    0b000, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"ALLE2IS",      0b100, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"ALLE3IS",      0b110, 0b1000, 0b0011, 0b000, 0>;
+def : TLBI<"VAE1IS",       0b000, 0b1000, 0b0011, 0b001>;
+def : TLBI<"VAE2IS",       0b100, 0b1000, 0b0011, 0b001>;
+def : TLBI<"VAE3IS",       0b110, 0b1000, 0b0011, 0b001>;
+def : TLBI<"ASIDE1IS",     0b000, 0b1000, 0b0011, 0b010>;
+def : TLBI<"VAAE1IS",      0b000, 0b1000, 0b0011, 0b011>;
+def : TLBI<"ALLE1IS",      0b100, 0b1000, 0b0011, 0b100, 0>;
+def : TLBI<"VALE1IS",      0b000, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VALE2IS",      0b100, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VALE3IS",      0b110, 0b1000, 0b0011, 0b101>;
+def : TLBI<"VMALLS12E1IS", 0b100, 0b1000, 0b0011, 0b110, 0>;
+def : TLBI<"VAALE1IS",     0b000, 0b1000, 0b0011, 0b111>;
+def : TLBI<"IPAS2E1",      0b100, 0b1000, 0b0100, 0b001>;
+def : TLBI<"IPAS2LE1",     0b100, 0b1000, 0b0100, 0b101>;
+def : TLBI<"VMALLE1",      0b000, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"ALLE2",        0b100, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"ALLE3",        0b110, 0b1000, 0b0111, 0b000, 0>;
+def : TLBI<"VAE1",         0b000, 0b1000, 0b0111, 0b001>;
+def : TLBI<"VAE2",         0b100, 0b1000, 0b0111, 0b001>;
+def : TLBI<"VAE3",         0b110, 0b1000, 0b0111, 0b001>;
+def : TLBI<"ASIDE1",       0b000, 0b1000, 0b0111, 0b010>;
+def : TLBI<"VAAE1",        0b000, 0b1000, 0b0111, 0b011>;
+def : TLBI<"ALLE1",        0b100, 0b1000, 0b0111, 0b100, 0>;
+def : TLBI<"VALE1",        0b000, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VALE2",        0b100, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VALE3",        0b110, 0b1000, 0b0111, 0b101>;
+def : TLBI<"VMALLS12E1",   0b100, 0b1000, 0b0111, 0b110, 0>;
+def : TLBI<"VAALE1",       0b000, 0b1000, 0b0111, 0b111>;
 
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AArch64/AArch64TargetMachine.cpp b/lib/Target/AArch64/AArch64TargetMachine.cpp
index 9e93833c0a0958247813e55772b74898d9de1fe4..dcc51bf023299b64231c44ea1a3a4aecf65d6762 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -12,8 +12,8 @@
 
 #include "AArch64.h"
 #include "AArch64CallLowering.h"
-#include "AArch64InstructionSelector.h"
 #include "AArch64LegalizerInfo.h"
+#include "AArch64MacroFusion.h"
 #ifdef LLVM_BUILD_GLOBAL_ISEL
 #include "AArch64RegisterBankInfo.h"
 #endif
@@ -117,7 +117,7 @@ EnableA53Fix835769("aarch64-fix-cortex-a53-835769", cl::Hidden,
 static cl::opt<bool>
     EnableAddressTypePromotion("aarch64-enable-type-promotion", cl::Hidden,
                                cl::desc("Enable the type promotion pass"),
-                               cl::init(true));
+                               cl::init(false));
 
 static cl::opt<bool>
     EnableGEPOpt("aarch64-enable-gep-opt", cl::Hidden,
@@ -138,6 +138,11 @@ static cl::opt<bool>
                            cl::desc("Enable the loop data prefetch pass"),
                            cl::init(true));
 
+static cl::opt<int> EnableGlobalISelAtO(
+    "aarch64-enable-global-isel-at-O", cl::Hidden,
+    cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"),
+    cl::init(-1));
+
 extern "C" void LLVMInitializeAArch64Target() {
   // Register the target.
   RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
@@ -280,7 +285,8 @@ AArch64TargetMachine::getSubtargetImpl(const Function &F) const {
     // FIXME: At this point, we can't rely on Subtarget having RBI.
     // It's awkward to mix passing RBI and the Subtarget; should we pass
     // TII/TRI as well?
-    GISel->InstSelector.reset(new AArch64InstructionSelector(*this, *I, *RBI));
+    GISel->InstSelector.reset(
+        createAArch64InstructionSelector(*this, *I, *RBI));
 
     GISel->RegBankInfo.reset(RBI);
 #endif
@@ -325,10 +331,24 @@ public:
     ScheduleDAGMILive *DAG = createGenericSchedLive(C);
     DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
     DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
-    DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
+    DAG->addMutation(createAArch64MacroFusionDAGMutation());
     return DAG;
   }
 
+  ScheduleDAGInstrs *
+  createPostMachineScheduler(MachineSchedContext *C) const override {
+    const AArch64Subtarget &ST = C->MF->getSubtarget<AArch64Subtarget>();
+    if (ST.hasFuseLiterals()) {
+      // Run the Macro Fusion after RA again since literals are expanded from
+      // pseudos then (see addPreSched2()).
+      ScheduleDAGMI *DAG = createGenericSchedPostRA(C);
+      DAG->addMutation(createAArch64MacroFusionDAGMutation());
+      return DAG;
+    }
+
+    return nullptr;
+  }
+
   void addIRPasses()  override;
   bool addPreISel() override;
   bool addInstSelector() override;
@@ -343,6 +363,8 @@ public:
   void addPostRegAlloc() override;
   void addPreSched2() override;
   void addPreEmitPass() override;
+
+  bool isGlobalISelEnabled() const override;
 };
 
 } // end anonymous namespace
@@ -452,6 +474,10 @@ bool AArch64PassConfig::addGlobalInstructionSelect() {
 }
 #endif
 
+bool AArch64PassConfig::isGlobalISelEnabled() const {
+  return TM->getOptLevel() <= EnableGlobalISelAtO;
+}
+
 bool AArch64PassConfig::addILPOpts() {
   if (EnableCondOpt)
     addPass(createAArch64ConditionOptimizerPass());
diff --git a/lib/Target/AArch64/AArch64TargetMachine.h b/lib/Target/AArch64/AArch64TargetMachine.h
index 6fa5e83957e1bee05643bae84ea8938f19507b6e..2c75a3258c1cb2004a81885721c75ddac1cab5d3 100644
--- a/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/lib/Target/AArch64/AArch64TargetMachine.h
@@ -21,6 +21,8 @@
 
 namespace llvm {
 
+class AArch64RegisterBankInfo;
+
 class AArch64TargetMachine : public LLVMTargetMachine {
 protected:
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index b8833e5a5552d09545a739ac9941f5cbdc726658..4d59da0c646d26b41751f0eaa51cca0f86cf53a1 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -176,7 +176,8 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) {
   return TTI::PSK_Software;
 }
 
-int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                                     const Instruction *I) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
@@ -436,7 +437,7 @@ int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
 }
 
 int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
-                                       Type *CondTy) {
+                                       Type *CondTy, const Instruction *I) {
 
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   // We don't lower some vector selects well that are wider than the register
@@ -463,11 +464,12 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
         return Entry->Cost;
     }
   }
-  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
 }
 
 int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
-                                    unsigned Alignment, unsigned AddressSpace) {
+                                    unsigned Alignment, unsigned AddressSpace,
+                                    const Instruction *I) {
   auto LT = TLI->getTypeLegalizationCost(DL, Ty);
 
   if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
@@ -505,12 +507,14 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
 
   if (Factor <= TLI->getMaxSupportedInterleaveFactor()) {
     unsigned NumElts = VecTy->getVectorNumElements();
-    Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
-    unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
+    auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
 
     // ldN/stN only support legal vector types of size 64 or 128 in bits.
-    if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
-      return Factor;
+    // Accesses having vector types that are a multiple of 128 bits can be
+    // matched to more than one ldN/stN instruction.
+    if (NumElts % Factor == 0 &&
+        TLI->isLegalInterleavedAccessType(SubVecTy, DL))
+      return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
@@ -594,8 +598,6 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
   case Intrinsic::aarch64_neon_ld4:
     Info.ReadMem = true;
     Info.WriteMem = false;
-    Info.IsSimple = true;
-    Info.NumMemRefs = 1;
     Info.PtrVal = Inst->getArgOperand(0);
     break;
   case Intrinsic::aarch64_neon_st2:
@@ -603,8 +605,6 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
   case Intrinsic::aarch64_neon_st4:
     Info.ReadMem = false;
     Info.WriteMem = true;
-    Info.IsSimple = true;
-    Info.NumMemRefs = 1;
     Info.PtrVal = Inst->getArgOperand(Inst->getNumArgOperands() - 1);
     break;
   }
@@ -628,6 +628,38 @@ bool AArch64TTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
   return true;
 }
 
+/// See if \p I should be considered for address type promotion. We check if \p
+/// I is a sext with right type and used in memory accesses. If it is used in a
+/// "complex" getelementptr, we allow it to be promoted without finding other
+/// sext instructions that sign extended the same initial value. A getelementptr
+/// is considered as "complex" if it has more than 2 operands.
+bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
+    const Instruction &I, bool &AllowPromotionWithoutCommonHeader) {
+  bool Considerable = false;
+  AllowPromotionWithoutCommonHeader = false;
+  if (!isa<SExtInst>(&I))
+    return false;
+  Type *ConsideredSExtType =
+      Type::getInt64Ty(I.getParent()->getParent()->getContext());
+  if (I.getType() != ConsideredSExtType)
+    return false;
+  // See if the sext is the one with the right type and used in at least one
+  // GetElementPtrInst.
+  for (const User *U : I.users()) {
+    if (const GetElementPtrInst *GEPInst = dyn_cast<GetElementPtrInst>(U)) {
+      Considerable = true;
+      // A getelementptr is considered as "complex" if it has more than 2
+      // operands. We will promote a SExt used in such complex GEP as we
+      // expect some computation to be merged if they are done on 64 bits.
+      if (GEPInst->getNumOperands() > 2) {
+        AllowPromotionWithoutCommonHeader = true;
+        break;
+      }
+    }
+  }
+  return Considerable;
+}
+
 unsigned AArch64TTIImpl::getCacheLineSize() {
   return ST->getCacheLineSize();
 }
diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h
index a7f7fb9457aa6aaa99c88783895e3d7f4a090f4b..e37c003e064c56f548c22790f1dfa51d0c183f70 100644
--- a/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -86,7 +86,8 @@ public:
 
   unsigned getMaxInterleaveFactor(unsigned VF);
 
-  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                       const Instruction *I = nullptr);
 
   int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
                                unsigned Index);
@@ -103,10 +104,11 @@ public:
 
   int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr);
 
-  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                         const Instruction *I = nullptr);
 
   int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                      unsigned AddressSpace);
+                      unsigned AddressSpace, const Instruction *I = nullptr);
 
   int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
 
@@ -121,6 +123,10 @@ public:
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
                                  unsigned AddressSpace);
 
+  bool
+  shouldConsiderAddressTypePromotion(const Instruction &I,
+                                     bool &AllowPromotionWithoutCommonHeader);
+
   unsigned getCacheLineSize();
 
   unsigned getPrefetchDistance();
diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index b86a283b40d493c6839558c54865445ac7b69d6f..cbab68979c56783aa6d55957c4a09430dcd9ad14 100644
--- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -74,6 +74,7 @@ private:
   SMLoc getLoc() const { return getParser().getTok().getLoc(); }
 
   bool parseSysAlias(StringRef Name, SMLoc NameLoc, OperandVector &Operands);
+  void createSysAlias(uint16_t Encoding, OperandVector &Operands, SMLoc S);
   AArch64CC::CondCode parseCondCodeString(StringRef Cond);
   bool parseCondCode(OperandVector &Operands, bool invertCondCode);
   unsigned matchRegisterNameAlias(StringRef Name, bool isVector);
@@ -537,154 +538,15 @@ public:
     return (Val % Scale) == 0 && Val >= 0 && (Val / Scale) < 0x1000;
   }
 
-  bool isImm0_1() const {
+  template <int N, int M>
+  bool isImmInRange() const {
     if (!isImm())
       return false;
     const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
     if (!MCE)
       return false;
     int64_t Val = MCE->getValue();
-    return (Val >= 0 && Val < 2);
-  }
-
-  bool isImm0_7() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 0 && Val < 8);
-  }
-
-  bool isImm1_8() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val > 0 && Val < 9);
-  }
-
-  bool isImm0_15() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 0 && Val < 16);
-  }
-
-  bool isImm1_16() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val > 0 && Val < 17);
-  }
-
-  bool isImm0_31() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 0 && Val < 32);
-  }
-
-  bool isImm1_31() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 1 && Val < 32);
-  }
-
-  bool isImm1_32() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 1 && Val < 33);
-  }
-
-  bool isImm0_63() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 0 && Val < 64);
-  }
-
-  bool isImm1_63() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 1 && Val < 64);
-  }
-
-  bool isImm1_64() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 1 && Val < 65);
-  }
-
-  bool isImm0_127() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 0 && Val < 128);
-  }
-
-  bool isImm0_255() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 0 && Val < 256);
-  }
-
-  bool isImm0_65535() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 0 && Val < 65536);
-  }
-
-  bool isImm32_63() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return false;
-    int64_t Val = MCE->getValue();
-    return (Val >= 32 && Val < 64);
+    return (Val >= N && Val <= M);
   }
 
   bool isLogicalImm32() const {
@@ -804,31 +666,8 @@ public:
     return AArch64_AM::isAdvSIMDModImmType10(MCE->getValue());
   }
 
-  bool isBranchTarget26() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return true;
-    int64_t Val = MCE->getValue();
-    if (Val & 0x3)
-      return false;
-    return (Val >= -(0x2000000 << 2) && Val <= (0x1ffffff << 2));
-  }
-
-  bool isPCRelLabel19() const {
-    if (!isImm())
-      return false;
-    const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
-    if (!MCE)
-      return true;
-    int64_t Val = MCE->getValue();
-    if (Val & 0x3)
-      return false;
-    return (Val >= -(0x40000 << 2) && Val <= (0x3ffff << 2));
-  }
-
-  bool isBranchTarget14() const {
+  template<int N>
+  bool isBranchTarget() const {
     if (!isImm())
       return false;
     const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(getImm());
@@ -837,7 +676,8 @@ public:
     int64_t Val = MCE->getValue();
     if (Val & 0x3)
       return false;
-    return (Val >= -(0x2000 << 2) && Val <= (0x1fff << 2));
+    assert(N > 0 && "Branch target immediate cannot be 0 bits!");
+    return (Val >= -((1<<(N-1)) << 2) && Val <= (((1<<(N-1))-1) << 2));
   }
 
   bool
@@ -2494,6 +2334,35 @@ AArch64AsmParser::tryParseOptionalShiftExtend(OperandVector &Operands) {
   return MatchOperand_Success;
 }
 
+static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
+  if (FBS[AArch64::HasV8_1aOps])
+    Str += "ARMv8.1a";
+  else if (FBS[AArch64::HasV8_2aOps])
+    Str += "ARMv8.2a";
+  else
+    Str += "(unknown)";
+}
+
+void AArch64AsmParser::createSysAlias(uint16_t Encoding, OperandVector &Operands,
+                                      SMLoc S) {
+  const uint16_t Op2 = Encoding & 7;
+  const uint16_t Cm = (Encoding & 0x78) >> 3;
+  const uint16_t Cn = (Encoding & 0x780) >> 7;
+  const uint16_t Op1 = (Encoding & 0x3800) >> 11;
+
+  const MCExpr *Expr = MCConstantExpr::create(Op1, getContext());
+
+  Operands.push_back(
+      AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));
+  Operands.push_back(
+      AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext()));
+  Operands.push_back(
+      AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext()));
+  Expr = MCConstantExpr::create(Op2, getContext());
+  Operands.push_back(
+      AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));
+}
+
 /// parseSysAlias - The IC, DC, AT, and TLBI instructions are simple aliases for
 /// the SYS instruction. Parse them specially so that we create a SYS MCInst.
 bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
@@ -2510,228 +2379,48 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
   StringRef Op = Tok.getString();
   SMLoc S = Tok.getLoc();
 
-  const MCExpr *Expr = nullptr;
-
-#define SYS_ALIAS(op1, Cn, Cm, op2)                                            \
-  do {                                                                         \
-    Expr = MCConstantExpr::create(op1, getContext());                          \
-    Operands.push_back(                                                        \
-        AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));           \
-    Operands.push_back(                                                        \
-        AArch64Operand::CreateSysCR(Cn, S, getLoc(), getContext()));           \
-    Operands.push_back(                                                        \
-        AArch64Operand::CreateSysCR(Cm, S, getLoc(), getContext()));           \
-    Expr = MCConstantExpr::create(op2, getContext());                          \
-    Operands.push_back(                                                        \
-        AArch64Operand::CreateImm(Expr, S, getLoc(), getContext()));           \
-  } while (false)
-
   if (Mnemonic == "ic") {
-    if (!Op.compare_lower("ialluis")) {
-      // SYS #0, C7, C1, #0
-      SYS_ALIAS(0, 7, 1, 0);
-    } else if (!Op.compare_lower("iallu")) {
-      // SYS #0, C7, C5, #0
-      SYS_ALIAS(0, 7, 5, 0);
-    } else if (!Op.compare_lower("ivau")) {
-      // SYS #3, C7, C5, #1
-      SYS_ALIAS(3, 7, 5, 1);
-    } else {
+    const AArch64IC::IC *IC = AArch64IC::lookupICByName(Op);
+    if (!IC)
       return TokError("invalid operand for IC instruction");
+    else if (!IC->haveFeatures(getSTI().getFeatureBits())) {
+      std::string Str("IC " + std::string(IC->Name) + " requires ");
+      setRequiredFeatureString(IC->getRequiredFeatures(), Str);
+      return TokError(Str.c_str());
     }
+    createSysAlias(IC->Encoding, Operands, S);
   } else if (Mnemonic == "dc") {
-    if (!Op.compare_lower("zva")) {
-      // SYS #3, C7, C4, #1
-      SYS_ALIAS(3, 7, 4, 1);
-    } else if (!Op.compare_lower("ivac")) {
-      // SYS #3, C7, C6, #1
-      SYS_ALIAS(0, 7, 6, 1);
-    } else if (!Op.compare_lower("isw")) {
-      // SYS #0, C7, C6, #2
-      SYS_ALIAS(0, 7, 6, 2);
-    } else if (!Op.compare_lower("cvac")) {
-      // SYS #3, C7, C10, #1
-      SYS_ALIAS(3, 7, 10, 1);
-    } else if (!Op.compare_lower("csw")) {
-      // SYS #0, C7, C10, #2
-      SYS_ALIAS(0, 7, 10, 2);
-    } else if (!Op.compare_lower("cvau")) {
-      // SYS #3, C7, C11, #1
-      SYS_ALIAS(3, 7, 11, 1);
-    } else if (!Op.compare_lower("civac")) {
-      // SYS #3, C7, C14, #1
-      SYS_ALIAS(3, 7, 14, 1);
-    } else if (!Op.compare_lower("cisw")) {
-      // SYS #0, C7, C14, #2
-      SYS_ALIAS(0, 7, 14, 2);
-    } else if (!Op.compare_lower("cvap")) {
-      if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
-        // SYS #3, C7, C12, #1
-        SYS_ALIAS(3, 7, 12, 1);
-      } else {
-        return TokError("DC CVAP requires ARMv8.2a");
-      }
-    } else {
+    const AArch64DC::DC *DC = AArch64DC::lookupDCByName(Op);
+    if (!DC)
       return TokError("invalid operand for DC instruction");
+    else if (!DC->haveFeatures(getSTI().getFeatureBits())) {
+      std::string Str("DC " + std::string(DC->Name) + " requires ");
+      setRequiredFeatureString(DC->getRequiredFeatures(), Str);
+      return TokError(Str.c_str());
     }
+    createSysAlias(DC->Encoding, Operands, S);
   } else if (Mnemonic == "at") {
-    if (!Op.compare_lower("s1e1r")) {
-      // SYS #0, C7, C8, #0
-      SYS_ALIAS(0, 7, 8, 0);
-    } else if (!Op.compare_lower("s1e2r")) {
-      // SYS #4, C7, C8, #0
-      SYS_ALIAS(4, 7, 8, 0);
-    } else if (!Op.compare_lower("s1e3r")) {
-      // SYS #6, C7, C8, #0
-      SYS_ALIAS(6, 7, 8, 0);
-    } else if (!Op.compare_lower("s1e1w")) {
-      // SYS #0, C7, C8, #1
-      SYS_ALIAS(0, 7, 8, 1);
-    } else if (!Op.compare_lower("s1e2w")) {
-      // SYS #4, C7, C8, #1
-      SYS_ALIAS(4, 7, 8, 1);
-    } else if (!Op.compare_lower("s1e3w")) {
-      // SYS #6, C7, C8, #1
-      SYS_ALIAS(6, 7, 8, 1);
-    } else if (!Op.compare_lower("s1e0r")) {
-      // SYS #0, C7, C8, #3
-      SYS_ALIAS(0, 7, 8, 2);
-    } else if (!Op.compare_lower("s1e0w")) {
-      // SYS #0, C7, C8, #3
-      SYS_ALIAS(0, 7, 8, 3);
-    } else if (!Op.compare_lower("s12e1r")) {
-      // SYS #4, C7, C8, #4
-      SYS_ALIAS(4, 7, 8, 4);
-    } else if (!Op.compare_lower("s12e1w")) {
-      // SYS #4, C7, C8, #5
-      SYS_ALIAS(4, 7, 8, 5);
-    } else if (!Op.compare_lower("s12e0r")) {
-      // SYS #4, C7, C8, #6
-      SYS_ALIAS(4, 7, 8, 6);
-    } else if (!Op.compare_lower("s12e0w")) {
-      // SYS #4, C7, C8, #7
-      SYS_ALIAS(4, 7, 8, 7);
-    } else if (!Op.compare_lower("s1e1rp")) {
-      if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
-        // SYS #0, C7, C9, #0
-        SYS_ALIAS(0, 7, 9, 0);
-      } else {
-        return TokError("AT S1E1RP requires ARMv8.2a");
-      }
-    } else if (!Op.compare_lower("s1e1wp")) {
-      if (getSTI().getFeatureBits()[AArch64::HasV8_2aOps]) {
-        // SYS #0, C7, C9, #1
-        SYS_ALIAS(0, 7, 9, 1);
-      } else {
-        return TokError("AT S1E1WP requires ARMv8.2a");
-      }
-    } else {
+    const AArch64AT::AT *AT = AArch64AT::lookupATByName(Op);
+    if (!AT)
       return TokError("invalid operand for AT instruction");
+    else if (!AT->haveFeatures(getSTI().getFeatureBits())) {
+      std::string Str("AT " + std::string(AT->Name) + " requires ");
+      setRequiredFeatureString(AT->getRequiredFeatures(), Str);
+      return TokError(Str.c_str());
     }
+    createSysAlias(AT->Encoding, Operands, S);
   } else if (Mnemonic == "tlbi") {
-    if (!Op.compare_lower("vmalle1is")) {
-      // SYS #0, C8, C3, #0
-      SYS_ALIAS(0, 8, 3, 0);
-    } else if (!Op.compare_lower("alle2is")) {
-      // SYS #4, C8, C3, #0
-      SYS_ALIAS(4, 8, 3, 0);
-    } else if (!Op.compare_lower("alle3is")) {
-      // SYS #6, C8, C3, #0
-      SYS_ALIAS(6, 8, 3, 0);
-    } else if (!Op.compare_lower("vae1is")) {
-      // SYS #0, C8, C3, #1
-      SYS_ALIAS(0, 8, 3, 1);
-    } else if (!Op.compare_lower("vae2is")) {
-      // SYS #4, C8, C3, #1
-      SYS_ALIAS(4, 8, 3, 1);
-    } else if (!Op.compare_lower("vae3is")) {
-      // SYS #6, C8, C3, #1
-      SYS_ALIAS(6, 8, 3, 1);
-    } else if (!Op.compare_lower("aside1is")) {
-      // SYS #0, C8, C3, #2
-      SYS_ALIAS(0, 8, 3, 2);
-    } else if (!Op.compare_lower("vaae1is")) {
-      // SYS #0, C8, C3, #3
-      SYS_ALIAS(0, 8, 3, 3);
-    } else if (!Op.compare_lower("alle1is")) {
-      // SYS #4, C8, C3, #4
-      SYS_ALIAS(4, 8, 3, 4);
-    } else if (!Op.compare_lower("vale1is")) {
-      // SYS #0, C8, C3, #5
-      SYS_ALIAS(0, 8, 3, 5);
-    } else if (!Op.compare_lower("vaale1is")) {
-      // SYS #0, C8, C3, #7
-      SYS_ALIAS(0, 8, 3, 7);
-    } else if (!Op.compare_lower("vmalle1")) {
-      // SYS #0, C8, C7, #0
-      SYS_ALIAS(0, 8, 7, 0);
-    } else if (!Op.compare_lower("alle2")) {
-      // SYS #4, C8, C7, #0
-      SYS_ALIAS(4, 8, 7, 0);
-    } else if (!Op.compare_lower("vale2is")) {
-      // SYS #4, C8, C3, #5
-      SYS_ALIAS(4, 8, 3, 5);
-    } else if (!Op.compare_lower("vale3is")) {
-      // SYS #6, C8, C3, #5
-      SYS_ALIAS(6, 8, 3, 5);
-    } else if (!Op.compare_lower("alle3")) {
-      // SYS #6, C8, C7, #0
-      SYS_ALIAS(6, 8, 7, 0);
-    } else if (!Op.compare_lower("vae1")) {
-      // SYS #0, C8, C7, #1
-      SYS_ALIAS(0, 8, 7, 1);
-    } else if (!Op.compare_lower("vae2")) {
-      // SYS #4, C8, C7, #1
-      SYS_ALIAS(4, 8, 7, 1);
-    } else if (!Op.compare_lower("vae3")) {
-      // SYS #6, C8, C7, #1
-      SYS_ALIAS(6, 8, 7, 1);
-    } else if (!Op.compare_lower("aside1")) {
-      // SYS #0, C8, C7, #2
-      SYS_ALIAS(0, 8, 7, 2);
-    } else if (!Op.compare_lower("vaae1")) {
-      // SYS #0, C8, C7, #3
-      SYS_ALIAS(0, 8, 7, 3);
-    } else if (!Op.compare_lower("alle1")) {
-      // SYS #4, C8, C7, #4
-      SYS_ALIAS(4, 8, 7, 4);
-    } else if (!Op.compare_lower("vale1")) {
-      // SYS #0, C8, C7, #5
-      SYS_ALIAS(0, 8, 7, 5);
-    } else if (!Op.compare_lower("vale2")) {
-      // SYS #4, C8, C7, #5
-      SYS_ALIAS(4, 8, 7, 5);
-    } else if (!Op.compare_lower("vale3")) {
-      // SYS #6, C8, C7, #5
-      SYS_ALIAS(6, 8, 7, 5);
-    } else if (!Op.compare_lower("vaale1")) {
-      // SYS #0, C8, C7, #7
-      SYS_ALIAS(0, 8, 7, 7);
-    } else if (!Op.compare_lower("ipas2e1")) {
-      // SYS #4, C8, C4, #1
-      SYS_ALIAS(4, 8, 4, 1);
-    } else if (!Op.compare_lower("ipas2le1")) {
-      // SYS #4, C8, C4, #5
-      SYS_ALIAS(4, 8, 4, 5);
-    } else if (!Op.compare_lower("ipas2e1is")) {
-      // SYS #4, C8, C4, #1
-      SYS_ALIAS(4, 8, 0, 1);
-    } else if (!Op.compare_lower("ipas2le1is")) {
-      // SYS #4, C8, C4, #5
-      SYS_ALIAS(4, 8, 0, 5);
-    } else if (!Op.compare_lower("vmalls12e1")) {
-      // SYS #4, C8, C7, #6
-      SYS_ALIAS(4, 8, 7, 6);
-    } else if (!Op.compare_lower("vmalls12e1is")) {
-      // SYS #4, C8, C3, #6
-      SYS_ALIAS(4, 8, 3, 6);
-    } else {
+    const AArch64TLBI::TLBI *TLBI = AArch64TLBI::lookupTLBIByName(Op);
+    if (!TLBI)
       return TokError("invalid operand for TLBI instruction");
+    else if (!TLBI->haveFeatures(getSTI().getFeatureBits())) {
+      std::string Str("TLBI " + std::string(TLBI->Name) + " requires ");
+      setRequiredFeatureString(TLBI->getRequiredFeatures(), Str);
+      return TokError(Str.c_str());
     }
+    createSysAlias(TLBI->Encoding, Operands, S);
   }
 
-#undef SYS_ALIAS
-
   Parser.Lex(); // Eat operand.
 
   bool ExpectRegister = (Op.lower().find("all") == StringRef::npos);
@@ -2744,12 +2433,10 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc,
     HasRegister = true;
   }
 
-  if (ExpectRegister && !HasRegister) {
+  if (ExpectRegister && !HasRegister)
     return TokError("specified " + Mnemonic + " op requires a register");
-  }
-  else if (!ExpectRegister && HasRegister) {
+  else if (!ExpectRegister && HasRegister)
     return TokError("specified " + Mnemonic + " op does not use a register");
-  }
 
   if (parseToken(AsmToken::EndOfStatement, "unexpected token in argument list"))
     return true;
@@ -2884,7 +2571,6 @@ bool AArch64AsmParser::tryParseVectorRegister(OperandVector &Operands) {
 
 /// parseRegister - Parse a non-vector register operand.
 bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
-  MCAsmParser &Parser = getParser();
   SMLoc S = getLoc();
   // Try for a vector register.
   if (!tryParseVectorRegister(Operands))
@@ -2897,30 +2583,6 @@ bool AArch64AsmParser::parseRegister(OperandVector &Operands) {
   Operands.push_back(
       AArch64Operand::CreateReg(Reg, false, S, getLoc(), getContext()));
 
-  // A small number of instructions (FMOVXDhighr, for example) have "[1]"
-  // as a string token in the instruction itself.
-  SMLoc LBracS = getLoc();
-  const AsmToken &Tok = Parser.getTok();
-  if (parseOptionalToken(AsmToken::LBrac)) {
-    if (Tok.is(AsmToken::Integer)) {
-      SMLoc IntS = getLoc();
-      int64_t Val = Tok.getIntVal();
-      if (Val == 1) {
-        Parser.Lex();
-        SMLoc RBracS = getLoc();
-        if (parseOptionalToken(AsmToken::RBrac)) {
-          Operands.push_back(
-              AArch64Operand::CreateToken("[", false, LBracS, getContext()));
-          Operands.push_back(
-              AArch64Operand::CreateToken("1", false, IntS, getContext()));
-          Operands.push_back(
-              AArch64Operand::CreateToken("]", false, RBracS, getContext()));
-          return false;
-        }
-      }
-    }
-  }
-
   return false;
 }
 
@@ -3696,6 +3358,8 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode) {
     return Error(Loc, "immediate must be an integer in range [0, 63].");
   case Match_InvalidImm0_127:
     return Error(Loc, "immediate must be an integer in range [0, 127].");
+  case Match_InvalidImm0_255:
+    return Error(Loc, "immediate must be an integer in range [0, 255].");
   case Match_InvalidImm0_65535:
     return Error(Loc, "immediate must be an integer in range [0, 65535].");
   case Match_InvalidImm1_8:
@@ -4120,6 +3784,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   case Match_InvalidImm0_31:
   case Match_InvalidImm0_63:
   case Match_InvalidImm0_127:
+  case Match_InvalidImm0_255:
   case Match_InvalidImm0_65535:
   case Match_InvalidImm1_8:
   case Match_InvalidImm1_16:
diff --git a/lib/Target/AArch64/CMakeLists.txt b/lib/Target/AArch64/CMakeLists.txt
index ae4fbf15a76ff40085306184fb8bb1483016c1ee..6d0930c358f1d60c2781aa5dae7c366804e5515b 100644
--- a/lib/Target/AArch64/CMakeLists.txt
+++ b/lib/Target/AArch64/CMakeLists.txt
@@ -56,6 +56,7 @@ add_llvm_target(AArch64CodeGen
   AArch64ISelLowering.cpp
   AArch64InstrInfo.cpp
   AArch64LoadStoreOptimizer.cpp
+  AArch64MacroFusion.cpp
   AArch64MCInstLower.cpp
   AArch64PromoteConstant.cpp
   AArch64PBQPRegAlloc.cpp
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
index b4f85204714f1a5ee4810e6785e7dfe7e90f963c..41ae70f85e584119f118022b3f8aa40642fc119e 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp
@@ -16,12 +16,20 @@
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <string>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "asm-printer"
@@ -451,8 +459,8 @@ static const LdStNInstrDesc LdStNInstInfo[] = {
   { AArch64::LD3i64,            "ld3",  ".d",     1, true,  0  },
   { AArch64::LD3i8_POST,        "ld3",  ".b",     2, true,  3  },
   { AArch64::LD3i16_POST,       "ld3",  ".h",     2, true,  6  },
-  { AArch64::LD3i32_POST,       "ld3",  ".s",     2, true,  12  },
-  { AArch64::LD3i64_POST,       "ld3",  ".d",     2, true,  24  },
+  { AArch64::LD3i32_POST,       "ld3",  ".s",     2, true,  12 },
+  { AArch64::LD3i64_POST,       "ld3",  ".d",     2, true,  24 },
   { AArch64::LD3Rv16b,          "ld3r", ".16b",   0, false, 0  },
   { AArch64::LD3Rv8h,           "ld3r", ".8h",    0, false, 0  },
   { AArch64::LD3Rv4s,           "ld3r", ".4s",    0, false, 0  },
@@ -731,7 +739,6 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI,
   assert(Opcode == AArch64::SYSxt && "Invalid opcode for SYS alias!");
 #endif
 
-  const char *Asm = nullptr;
   const MCOperand &Op1 = MI->getOperand(0);
   const MCOperand &Cn = MI->getOperand(1);
   const MCOperand &Cm = MI->getOperand(2);
@@ -742,230 +749,74 @@ bool AArch64InstPrinter::printSysAlias(const MCInst *MI,
   unsigned CmVal = Cm.getImm();
   unsigned Op2Val = Op2.getImm();
 
+  uint16_t Encoding = Op2Val;
+  Encoding |= CmVal << 3;
+  Encoding |= CnVal << 7;
+  Encoding |= Op1Val << 11;
+
+  bool NeedsReg;
+  std::string Ins;
+  std::string Name;
+
   if (CnVal == 7) {
     switch (CmVal) {
-    default:
-      break;
-
+    default: return false;
     // IC aliases
-    case 1:
-      if (Op1Val == 0 && Op2Val == 0)
-        Asm = "ic\tialluis";
-      break;
-    case 5:
-      if (Op1Val == 0 && Op2Val == 0)
-        Asm = "ic\tiallu";
-      else if (Op1Val == 3 && Op2Val == 1)
-        Asm = "ic\tivau";
-      break;
-
+    case 1: case 5: {
+      const AArch64IC::IC *IC = AArch64IC::lookupICByEncoding(Encoding);
+      if (!IC || !IC->haveFeatures(STI.getFeatureBits()))
+        return false;
+
+      NeedsReg = IC->NeedsReg;
+      Ins = "ic\t";
+      Name = std::string(IC->Name);
+    }
+    break;
     // DC aliases
-    case 4:
-      if (Op1Val == 3 && Op2Val == 1)
-        Asm = "dc\tzva";
-      break;
-    case 6:
-      if (Op1Val == 0 && Op2Val == 1)
-        Asm = "dc\tivac";
-      if (Op1Val == 0 && Op2Val == 2)
-        Asm = "dc\tisw";
-      break;
-    case 10:
-      if (Op1Val == 3 && Op2Val == 1)
-        Asm = "dc\tcvac";
-      else if (Op1Val == 0 && Op2Val == 2)
-        Asm = "dc\tcsw";
-      break;
-    case 11:
-      if (Op1Val == 3 && Op2Val == 1)
-        Asm = "dc\tcvau";
-      break;
-    case 12:
-      if (Op1Val == 3 && Op2Val == 1 &&
-          (STI.getFeatureBits()[AArch64::HasV8_2aOps]))
-        Asm = "dc\tcvap";
-      break;
-    case 14:
-      if (Op1Val == 3 && Op2Val == 1)
-        Asm = "dc\tcivac";
-      else if (Op1Val == 0 && Op2Val == 2)
-        Asm = "dc\tcisw";
-      break;
-
+    case 4: case 6: case 10: case 11: case 12: case 14:
+    {
+      const AArch64DC::DC *DC = AArch64DC::lookupDCByEncoding(Encoding);
+      if (!DC || !DC->haveFeatures(STI.getFeatureBits()))
+        return false;
+
+      NeedsReg = true;
+      Ins = "dc\t";
+      Name = std::string(DC->Name);
+    }
+    break;
     // AT aliases
-    case 8:
-      switch (Op1Val) {
-      default:
-        break;
-      case 0:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "at\ts1e1r"; break;
-        case 1: Asm = "at\ts1e1w"; break;
-        case 2: Asm = "at\ts1e0r"; break;
-        case 3: Asm = "at\ts1e0w"; break;
-        }
-        break;
-      case 4:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "at\ts1e2r"; break;
-        case 1: Asm = "at\ts1e2w"; break;
-        case 4: Asm = "at\ts12e1r"; break;
-        case 5: Asm = "at\ts12e1w"; break;
-        case 6: Asm = "at\ts12e0r"; break;
-        case 7: Asm = "at\ts12e0w"; break;
-        }
-        break;
-      case 6:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "at\ts1e3r"; break;
-        case 1: Asm = "at\ts1e3w"; break;
-        }
-        break;
-      }
-      break;
-    case 9:
-      switch (Op1Val) {
-      default:
-        break;
-      case 0:
-        if (STI.getFeatureBits()[AArch64::HasV8_2aOps]) {
-          switch (Op2Val) {
-          default:
-            break;
-          case 0: Asm = "at\ts1e1rp"; break;
-          case 1: Asm = "at\ts1e1wp"; break;
-          }
-        }
-        break;
-      }
+    case 8: case 9: {
+      const AArch64AT::AT *AT = AArch64AT::lookupATByEncoding(Encoding);
+      if (!AT || !AT->haveFeatures(STI.getFeatureBits()))
+        return false;
+
+      NeedsReg = true;
+      Ins = "at\t";
+      Name = std::string(AT->Name);
+    }
+    break;
     }
   } else if (CnVal == 8) {
     // TLBI aliases
-    switch (CmVal) {
-    default:
-      break;
-    case 3:
-      switch (Op1Val) {
-      default:
-        break;
-      case 0:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "tlbi\tvmalle1is"; break;
-        case 1: Asm = "tlbi\tvae1is"; break;
-        case 2: Asm = "tlbi\taside1is"; break;
-        case 3: Asm = "tlbi\tvaae1is"; break;
-        case 5: Asm = "tlbi\tvale1is"; break;
-        case 7: Asm = "tlbi\tvaale1is"; break;
-        }
-        break;
-      case 4:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "tlbi\talle2is"; break;
-        case 1: Asm = "tlbi\tvae2is"; break;
-        case 4: Asm = "tlbi\talle1is"; break;
-        case 5: Asm = "tlbi\tvale2is"; break;
-        case 6: Asm = "tlbi\tvmalls12e1is"; break;
-        }
-        break;
-      case 6:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "tlbi\talle3is"; break;
-        case 1: Asm = "tlbi\tvae3is"; break;
-        case 5: Asm = "tlbi\tvale3is"; break;
-        }
-        break;
-      }
-      break;
-    case 0:
-      switch (Op1Val) {
-      default:
-        break;
-      case 4:
-        switch (Op2Val) {
-        default:
-          break;
-        case 1: Asm = "tlbi\tipas2e1is"; break;
-        case 5: Asm = "tlbi\tipas2le1is"; break;
-        }
-        break;
-      }
-      break;
-    case 4:
-      switch (Op1Val) {
-      default:
-        break;
-      case 4:
-        switch (Op2Val) {
-        default:
-          break;
-        case 1: Asm = "tlbi\tipas2e1"; break;
-        case 5: Asm = "tlbi\tipas2le1"; break;
-        }
-        break;
-      }
-      break;
-    case 7:
-      switch (Op1Val) {
-      default:
-        break;
-      case 0:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "tlbi\tvmalle1"; break;
-        case 1: Asm = "tlbi\tvae1"; break;
-        case 2: Asm = "tlbi\taside1"; break;
-        case 3: Asm = "tlbi\tvaae1"; break;
-        case 5: Asm = "tlbi\tvale1"; break;
-        case 7: Asm = "tlbi\tvaale1"; break;
-        }
-        break;
-      case 4:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "tlbi\talle2"; break;
-        case 1: Asm = "tlbi\tvae2"; break;
-        case 4: Asm = "tlbi\talle1"; break;
-        case 5: Asm = "tlbi\tvale2"; break;
-        case 6: Asm = "tlbi\tvmalls12e1"; break;
-        }
-        break;
-      case 6:
-        switch (Op2Val) {
-        default:
-          break;
-        case 0: Asm = "tlbi\talle3"; break;
-        case 1: Asm = "tlbi\tvae3";  break;
-        case 5: Asm = "tlbi\tvale3"; break;
-        }
-        break;
-      }
-      break;
-    }
+    const AArch64TLBI::TLBI *TLBI = AArch64TLBI::lookupTLBIByEncoding(Encoding);
+    if (!TLBI || !TLBI->haveFeatures(STI.getFeatureBits()))
+      return false;
+
+    NeedsReg = TLBI->NeedsReg;
+    Ins = "tlbi\t";
+    Name = std::string(TLBI->Name);
   }
+  else
+    return false;
 
-  if (Asm) {
-    unsigned Reg = MI->getOperand(4).getReg();
+  std::string Str = Ins + Name;
+  std::transform(Str.begin(), Str.end(), Str.begin(), ::tolower);
 
-    O << '\t' << Asm;
-    if (StringRef(Asm).lower().find("all") == StringRef::npos)
-      O << ", " << getRegisterName(Reg);
-  }
+  O << '\t' << Str;
+  if (NeedsReg)
+    O << ", " << getRegisterName(MI->getOperand(4).getReg());
 
-  return Asm != nullptr;
+  return true;
 }
 
 void AArch64InstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
diff --git a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
index 65dca99ed04e7799e64997bcf47d94f25e1f8ebf..a45258cb97b7e7396b8d36272982dac8325f3c24 100644
--- a/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
+++ b/lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h
@@ -15,6 +15,7 @@
 #define LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
 
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCInstPrinter.h"
 
 namespace llvm {
@@ -37,9 +38,11 @@ public:
                                        unsigned PrintMethodIdx,
                                        const MCSubtargetInfo &STI,
                                        raw_ostream &O);
+
   virtual StringRef getRegName(unsigned RegNo) const {
     return getRegisterName(RegNo);
   }
+
   static const char *getRegisterName(unsigned RegNo,
                                      unsigned AltIdx = AArch64::NoRegAltName);
 
@@ -177,12 +180,15 @@ public:
                                unsigned PrintMethodIdx,
                                const MCSubtargetInfo &STI,
                                raw_ostream &O) override;
+
   StringRef getRegName(unsigned RegNo) const override {
     return getRegisterName(RegNo);
   }
+
   static const char *getRegisterName(unsigned RegNo,
                                      unsigned AltIdx = AArch64::NoRegAltName);
 };
-}
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AARCH64_INSTPRINTER_AARCH64INSTPRINTER_H
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 14c0327f5fa8df07c28ea148e030aa0f43165000..ebf05ae303ddda4e2fa5d9b980085d5757158afb 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -73,7 +73,7 @@ public:
   }
 
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value, bool IsPCRel) const override;
+                  uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
 
   bool mayNeedRelaxation(const MCInst &Inst) const override;
   bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
@@ -138,15 +138,15 @@ static unsigned AdrImmBits(unsigned Value) {
 }
 
 static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
-                                 MCContext *Ctx) {
+                                 MCContext &Ctx) {
   unsigned Kind = Fixup.getKind();
   int64_t SignedValue = static_cast<int64_t>(Value);
   switch (Kind) {
   default:
     llvm_unreachable("Unknown fixup kind!");
   case AArch64::fixup_aarch64_pcrel_adr_imm21:
-    if (Ctx && (SignedValue > 2097151 || SignedValue < -2097152))
-      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+    if (SignedValue > 2097151 || SignedValue < -2097152)
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
     return AdrImmBits(Value & 0x1fffffULL);
   case AArch64::fixup_aarch64_pcrel_adrp_imm21:
     return AdrImmBits((Value & 0x1fffff000ULL) >> 12);
@@ -154,66 +154,65 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
   case AArch64::fixup_aarch64_pcrel_branch19:
     // Signed 21-bit immediate
     if (SignedValue > 2097151 || SignedValue < -2097152)
-      if (Ctx) Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
-    if (Ctx && (Value & 0x3))
-      Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+    if (Value & 0x3)
+      Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
     // Low two bits are not encoded.
     return (Value >> 2) & 0x7ffff;
   case AArch64::fixup_aarch64_add_imm12:
   case AArch64::fixup_aarch64_ldst_imm12_scale1:
     // Unsigned 12-bit immediate
-    if (Ctx && Value >= 0x1000)
-      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+    if (Value >= 0x1000)
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
     return Value;
   case AArch64::fixup_aarch64_ldst_imm12_scale2:
     // Unsigned 12-bit immediate which gets multiplied by 2
-    if (Ctx && (Value >= 0x2000))
-      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
-    if (Ctx && (Value & 0x1))
-      Ctx->reportError(Fixup.getLoc(), "fixup must be 2-byte aligned");
+    if (Value >= 0x2000)
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+    if (Value & 0x1)
+      Ctx.reportError(Fixup.getLoc(), "fixup must be 2-byte aligned");
     return Value >> 1;
   case AArch64::fixup_aarch64_ldst_imm12_scale4:
     // Unsigned 12-bit immediate which gets multiplied by 4
-    if (Ctx && (Value >= 0x4000))
-      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
-    if (Ctx && (Value & 0x3))
-      Ctx->reportError(Fixup.getLoc(), "fixup must be 4-byte aligned");
+    if (Value >= 0x4000)
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+    if (Value & 0x3)
+      Ctx.reportError(Fixup.getLoc(), "fixup must be 4-byte aligned");
     return Value >> 2;
   case AArch64::fixup_aarch64_ldst_imm12_scale8:
     // Unsigned 12-bit immediate which gets multiplied by 8
-    if (Ctx && (Value >= 0x8000))
-      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
-    if (Ctx && (Value & 0x7))
-      Ctx->reportError(Fixup.getLoc(), "fixup must be 8-byte aligned");
+    if (Value >= 0x8000)
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+    if (Value & 0x7)
+      Ctx.reportError(Fixup.getLoc(), "fixup must be 8-byte aligned");
     return Value >> 3;
   case AArch64::fixup_aarch64_ldst_imm12_scale16:
     // Unsigned 12-bit immediate which gets multiplied by 16
-    if (Ctx && (Value >= 0x10000))
-      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
-    if (Ctx && (Value & 0xf))
-      Ctx->reportError(Fixup.getLoc(), "fixup must be 16-byte aligned");
+    if (Value >= 0x10000)
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
+    if (Value & 0xf)
+      Ctx.reportError(Fixup.getLoc(), "fixup must be 16-byte aligned");
     return Value >> 4;
   case AArch64::fixup_aarch64_movw:
-    if (Ctx)
-      Ctx->reportError(Fixup.getLoc(),
-                       "no resolvable MOVZ/MOVK fixups supported yet");
+    Ctx.reportError(Fixup.getLoc(),
+                    "no resolvable MOVZ/MOVK fixups supported yet");
     return Value;
   case AArch64::fixup_aarch64_pcrel_branch14:
     // Signed 16-bit immediate
-    if (Ctx && (SignedValue > 32767 || SignedValue < -32768))
-      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+    if (SignedValue > 32767 || SignedValue < -32768)
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
     // Low two bits are not encoded (4-byte alignment assumed).
-    if (Ctx && (Value & 0x3))
-      Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+    if (Value & 0x3)
+      Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
     return (Value >> 2) & 0x3fff;
   case AArch64::fixup_aarch64_pcrel_branch26:
   case AArch64::fixup_aarch64_pcrel_call26:
     // Signed 28-bit immediate
-    if (Ctx && (SignedValue > 134217727 || SignedValue < -134217728))
-      Ctx->reportError(Fixup.getLoc(), "fixup value out of range");
+    if (SignedValue > 134217727 || SignedValue < -134217728)
+      Ctx.reportError(Fixup.getLoc(), "fixup value out of range");
     // Low two bits are not encoded (4-byte alignment assumed).
-    if (Ctx && (Value & 0x3))
-      Ctx->reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
+    if (Value & 0x3)
+      Ctx.reportError(Fixup.getLoc(), "fixup not sufficiently aligned");
     return (Value >> 2) & 0x3ffffff;
   case FK_Data_1:
   case FK_Data_2:
@@ -264,13 +263,13 @@ unsigned AArch64AsmBackend::getFixupKindContainereSizeInBytes(unsigned Kind) con
 
 void AArch64AsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
                                    unsigned DataSize, uint64_t Value,
-                                   bool IsPCRel) const {
+                                   bool IsPCRel, MCContext &Ctx) const {
   unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
   if (!Value)
     return; // Doesn't change encoding.
   MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
   // Apply any target-specific value adjustments.
-  Value = adjustFixupValue(Fixup, Value, nullptr);
+  Value = adjustFixupValue(Fixup, Value, Ctx);
 
   // Shift the value into position.
   Value <<= Info.TargetOffset;
@@ -521,17 +520,6 @@ public:
 
     return CompactUnwindEncoding;
   }
-
-  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
-                         const MCFixup &Fixup, const MCFragment *DF,
-                         const MCValue &Target, uint64_t &Value,
-                         bool &IsResolved) override {
-    // Try to get the encoded value for the fixup as-if we're mapping it into
-    // the instruction. This allows adjustFixupValue() to issue a diagnostic
-    // if the value is invalid.
-    if (IsResolved)
-      (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
-  }
 };
 
 } // end anonymous namespace
@@ -575,12 +563,6 @@ void ELFAArch64AsmBackend::processFixupValue(
   // to the linker -- a relocation!
   if ((uint32_t)Fixup.getKind() == AArch64::fixup_aarch64_pcrel_adrp_imm21)
     IsResolved = false;
-
-  // Try to get the encoded value for the fixup as-if we're mapping it into
-  // the instruction. This allows adjustFixupValue() to issue a diagnostic
-  // if the value is invalid.
-  if (IsResolved)
-    (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
 }
 
 }
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index 685907a2178e3c5925c816d090897d0e1f11a934..5903e1e36d453fdec4828e0c708540bb395707d4 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -14,27 +14,23 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64TargetStreamer.h"
-#include "llvm/MC/MCELFStreamer.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/MC/MCAsmBackend.h"
-#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCSection.h"
-#include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbolELF.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/Debug.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ELF.h"
-#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -180,6 +176,7 @@ private:
   DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
   ElfMappingSymbol LastEMS;
 };
+
 } // end anonymous namespace
 
 AArch64ELFStreamer &AArch64TargetELFStreamer::getStreamer() {
@@ -191,6 +188,7 @@ void AArch64TargetELFStreamer::emitInst(uint32_t Inst) {
 }
 
 namespace llvm {
+
 MCTargetStreamer *createAArch64AsmTargetStreamer(MCStreamer &S,
                                                  formatted_raw_ostream &OS,
                                                  MCInstPrinter *InstPrint,
@@ -214,4 +212,5 @@ createAArch64ObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
     return new AArch64TargetELFStreamer(S);
   return nullptr;
 }
-}
+
+} // end namespace llvm
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index e9d38d3dcf1084d90bf63b41ba799b7102fbcd40..f710065d9bc75677ac2c8840ab1a0f36d8e69cfd 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -84,9 +84,14 @@ static void adjustCodeGenOpts(const Triple &TT, Reloc::Model RM,
   // no matter how far away they are.
   else if (CM == CodeModel::JITDefault)
     CM = CodeModel::Large;
-  else if (CM != CodeModel::Small && CM != CodeModel::Large)
-    report_fatal_error(
-        "Only small and large code models are allowed on AArch64");
+  else if (CM != CodeModel::Small && CM != CodeModel::Large) {
+    if (!TT.isOSFuchsia())
+      report_fatal_error(
+          "Only small and large code models are allowed on AArch64");
+    else if (CM != CodeModel::Kernel)
+      report_fatal_error(
+          "Only small, kernel, and large code models are allowed on AArch64");
+  }
 }
 
 static MCInstPrinter *createAArch64MCInstPrinter(const Triple &T,
diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index 53a68527ee8e4b05b3068830df791c9d006bb44f..3d296ba4806b90157d48e664ef51523b8d8459c4 100644
--- a/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -16,14 +16,22 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFragment.h"
 #include "llvm/MC/MCMachObjectWriter.h"
+#include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCSymbol.h"
 #include "llvm/MC/MCValue.h"
-#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/MachO.h"
+#include "llvm/Support/MathExtras.h"
+#include <cassert>
+#include <cstdint>
+
 using namespace llvm;
 
 namespace {
+
 class AArch64MachObjectWriter : public MCMachObjectTargetWriter {
   bool getAArch64FixupKindMachOInfo(const MCFixup &Fixup, unsigned &RelocType,
                                   const MCSymbolRefExpr *Sym,
@@ -38,7 +46,8 @@ public:
                         const MCFixup &Fixup, MCValue Target,
                         uint64_t &FixedValue) override;
 };
-}
+
+} // end anonymous namespace
 
 bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
     const MCFixup &Fixup, unsigned &RelocType, const MCSymbolRefExpr *Sym,
@@ -51,18 +60,18 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
     return false;
 
   case FK_Data_1:
-    Log2Size = llvm::Log2_32(1);
+    Log2Size = Log2_32(1);
     return true;
   case FK_Data_2:
-    Log2Size = llvm::Log2_32(2);
+    Log2Size = Log2_32(2);
     return true;
   case FK_Data_4:
-    Log2Size = llvm::Log2_32(4);
+    Log2Size = Log2_32(4);
     if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
       RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
     return true;
   case FK_Data_8:
-    Log2Size = llvm::Log2_32(8);
+    Log2Size = Log2_32(8);
     if (Sym->getKind() == MCSymbolRefExpr::VK_GOT)
       RelocType = unsigned(MachO::ARM64_RELOC_POINTER_TO_GOT);
     return true;
@@ -72,7 +81,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
   case AArch64::fixup_aarch64_ldst_imm12_scale4:
   case AArch64::fixup_aarch64_ldst_imm12_scale8:
   case AArch64::fixup_aarch64_ldst_imm12_scale16:
-    Log2Size = llvm::Log2_32(4);
+    Log2Size = Log2_32(4);
     switch (Sym->getKind()) {
     default:
       return false;
@@ -87,14 +96,13 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
       return true;
     }
   case AArch64::fixup_aarch64_pcrel_adrp_imm21:
-    Log2Size = llvm::Log2_32(4);
+    Log2Size = Log2_32(4);
     // This encompasses the relocation for the whole 21-bit value.
     switch (Sym->getKind()) {
-    default: {
+    default:
       Asm.getContext().reportError(Fixup.getLoc(),
                                    "ADR/ADRP relocations must be GOT relative");
       return false;
-    }
     case MCSymbolRefExpr::VK_PAGE:
       RelocType = unsigned(MachO::ARM64_RELOC_PAGE21);
       return true;
@@ -108,7 +116,7 @@ bool AArch64MachObjectWriter::getAArch64FixupKindMachOInfo(
     return true;
   case AArch64::fixup_aarch64_pcrel_branch26:
   case AArch64::fixup_aarch64_pcrel_call26:
-    Log2Size = llvm::Log2_32(4);
+    Log2Size = Log2_32(4);
     RelocType = unsigned(MachO::ARM64_RELOC_BRANCH26);
     return true;
   }
diff --git a/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index dcc39176031c582345727b5f46360384bbe82272..5d76681cd97b0d7f43dae0fcd71e93c441a56630 100644
--- a/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -266,82 +266,86 @@ inline static unsigned getNZCVToSatisfyCondCode(CondCode Code) {
 }
 } // end namespace AArch64CC
 
+struct SysAlias {
+  const char *Name;
+  uint16_t Encoding;
+  FeatureBitset FeaturesRequired;
+
+  SysAlias (const char *N, uint16_t E) : Name(N), Encoding(E) {};
+  SysAlias (const char *N, uint16_t E, FeatureBitset F) :
+    Name(N), Encoding(E), FeaturesRequired(F) {};
+
+  bool haveFeatures(FeatureBitset ActiveFeatures) const {
+    return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
+  }
+
+  FeatureBitset getRequiredFeatures() const { return FeaturesRequired; }
+};
+
+struct SysAliasReg : SysAlias {
+  bool NeedsReg;
+  SysAliasReg(const char *N, uint16_t E, bool R) : SysAlias(N, E), NeedsReg(R) {};
+};
+
 namespace AArch64AT{
-  struct AT {
-    const char *Name;
-    uint16_t Encoding;
+  struct AT : SysAlias {
+    using SysAlias::SysAlias;
   };
-
   #define GET_AT_DECL
   #include "AArch64GenSystemOperands.inc"
-
 }
+
 namespace AArch64DB {
-  struct DB {
-    const char *Name;
-    uint16_t Encoding;
+  struct DB : SysAlias {
+    using SysAlias::SysAlias;
   };
-
   #define GET_DB_DECL
   #include "AArch64GenSystemOperands.inc"
 }
 
 namespace  AArch64DC {
-  struct DC {
-    const char *Name;
-    uint16_t Encoding;
+  struct DC : SysAlias {
+    using SysAlias::SysAlias;
   };
-
   #define GET_DC_DECL
   #include "AArch64GenSystemOperands.inc"
 }
 
 namespace  AArch64IC {
-  struct IC {
-    const char *Name;
-    uint16_t Encoding;
-    bool NeedsReg;
+  struct IC : SysAliasReg {
+    using SysAliasReg::SysAliasReg;
   };
   #define GET_IC_DECL
   #include "AArch64GenSystemOperands.inc"
 }
 
 namespace  AArch64ISB {
-  struct ISB {
-    const char *Name;
-    uint16_t Encoding;
+  struct ISB : SysAlias {
+    using SysAlias::SysAlias;
   };
   #define GET_ISB_DECL
   #include "AArch64GenSystemOperands.inc"
 }
 
 namespace AArch64PRFM {
-  struct PRFM {
-    const char *Name;
-    uint16_t Encoding;
+  struct PRFM : SysAlias {
+    using SysAlias::SysAlias;
   };
   #define GET_PRFM_DECL
   #include "AArch64GenSystemOperands.inc"
 }
 
 namespace AArch64PState {
-  struct PState {
-    const char *Name;
-    uint16_t Encoding;
-    FeatureBitset FeaturesRequired;
-
-    bool haveFeatures(FeatureBitset ActiveFeatures) const {
-      return (FeaturesRequired & ActiveFeatures) == FeaturesRequired;
-    }
+  struct PState : SysAlias{
+    using SysAlias::SysAlias;
   };
   #define GET_PSTATE_DECL
   #include "AArch64GenSystemOperands.inc"
 }
 
 namespace AArch64PSBHint {
-  struct PSB {
-    const char *Name;
-    uint16_t Encoding;
+  struct PSB : SysAlias {
+    using SysAlias::SysAlias;
   };
   #define GET_PSB_DECL
   #include "AArch64GenSystemOperands.inc"
@@ -451,10 +455,8 @@ namespace AArch64SysReg {
 }
 
 namespace AArch64TLBI {
-  struct TLBI {
-    const char *Name;
-    uint16_t Encoding;
-    bool NeedsReg;
+  struct TLBI : SysAliasReg {
+    using SysAliasReg::SysAliasReg;
   };
   #define GET_TLBI_DECL
   #include "AArch64GenSystemOperands.inc"
diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h
index 0916bdd2fcb68cf65c3d9ae4e7eff1f31580ab90..6725fb37cab029945fd151d881081c747a9f3695 100644
--- a/lib/Target/AMDGPU/AMDGPU.h
+++ b/lib/Target/AMDGPU/AMDGPU.h
@@ -11,6 +11,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
 
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/Target/TargetMachine.h"
 
 namespace llvm {
@@ -23,6 +24,7 @@ class Pass;
 class Target;
 class TargetMachine;
 class PassRegistry;
+class Module;
 
 // R600 Passes
 FunctionPass *createR600VectorRegMerger(TargetMachine &tm);
@@ -37,6 +39,7 @@ FunctionPass *createAMDGPUCFGStructurizerPass();
 FunctionPass *createSITypeRewriter();
 FunctionPass *createSIAnnotateControlFlowPass();
 FunctionPass *createSIFoldOperandsPass();
+FunctionPass *createSIPeepholeSDWAPass();
 FunctionPass *createSILowerI1CopiesPass();
 FunctionPass *createSIShrinkInstructionsPass();
 FunctionPass *createSILoadStoreOptimizerPass(TargetMachine &tm);
@@ -45,15 +48,23 @@ FunctionPass *createSIFixControlFlowLiveIntervalsPass();
 FunctionPass *createSIFixSGPRCopiesPass();
 FunctionPass *createSIDebuggerInsertNopsPass();
 FunctionPass *createSIInsertWaitsPass();
+FunctionPass *createSIInsertWaitcntsPass();
 FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr);
 
-ModulePass *createAMDGPUAnnotateKernelFeaturesPass();
+ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nullptr);
 void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &);
 extern char &AMDGPUAnnotateKernelFeaturesID;
 
+ModulePass *createAMDGPULowerIntrinsicsPass();
+void initializeAMDGPULowerIntrinsicsPass(PassRegistry &);
+extern char &AMDGPULowerIntrinsicsID;
+
 void initializeSIFoldOperandsPass(PassRegistry &);
 extern char &SIFoldOperandsID;
 
+void initializeSIPeepholeSDWAPass(PassRegistry &);
+extern char &SIPeepholeSDWAID;
+
 void initializeSIShrinkInstructionsPass(PassRegistry&);
 extern char &SIShrinkInstructionsID;
 
@@ -89,7 +100,7 @@ extern char &AMDGPUPromoteAllocaID;
 Pass *createAMDGPUStructurizeCFGPass();
 FunctionPass *createAMDGPUISelDag(TargetMachine &TM,
                                   CodeGenOpt::Level OptLevel);
-ModulePass *createAMDGPUAlwaysInlinePass();
+ModulePass *createAMDGPUAlwaysInlinePass(bool GlobalOpt = true);
 ModulePass *createAMDGPUOpenCLImageTypeLoweringPass();
 FunctionPass *createAMDGPUAnnotateUniformValues();
 
@@ -115,6 +126,15 @@ extern char &SIDebuggerInsertNopsID;
 void initializeSIInsertWaitsPass(PassRegistry&);
 extern char &SIInsertWaitsID;
 
+void initializeSIInsertWaitcntsPass(PassRegistry&);
+extern char &SIInsertWaitcntsID;
+
+void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
+extern char &AMDGPUUnifyDivergentExitNodesID;
+
+ImmutablePass *createAMDGPUAAWrapperPass();
+void initializeAMDGPUAAWrapperPassPass(PassRegistry&);
+
 Target &getTheAMDGPUTarget();
 Target &getTheGCNTarget();
 
@@ -136,43 +156,53 @@ enum TargetIndex {
 /// however on the GPU, each address space points to
 /// a separate piece of memory that is unique from other
 /// memory locations.
-namespace AMDGPUAS {
-enum AddressSpaces : unsigned {
-  PRIVATE_ADDRESS  = 0, ///< Address space for private memory.
-  GLOBAL_ADDRESS   = 1, ///< Address space for global memory (RAT0, VTX0).
-  CONSTANT_ADDRESS = 2, ///< Address space for constant memory (VTX2)
-  LOCAL_ADDRESS    = 3, ///< Address space for local memory.
-  FLAT_ADDRESS     = 4, ///< Address space for flat memory.
-  REGION_ADDRESS   = 5, ///< Address space for region memory.
-  PARAM_D_ADDRESS  = 6, ///< Address space for direct addressible parameter memory (CONST0)
-  PARAM_I_ADDRESS  = 7, ///< Address space for indirect addressible parameter memory (VTX1)
+struct AMDGPUAS {
+  // The following address space values depend on the triple environment.
+  unsigned PRIVATE_ADDRESS;  ///< Address space for private memory.
+  unsigned FLAT_ADDRESS;     ///< Address space for flat memory.
+  unsigned REGION_ADDRESS;   ///< Address space for region memory.
+
+  // The maximum value for flat, generic, local, private, constant and region.
+  const static unsigned MAX_COMMON_ADDRESS = 5;
+
+  const static unsigned GLOBAL_ADDRESS   = 1;  ///< Address space for global memory (RAT0, VTX0).
+  const static unsigned CONSTANT_ADDRESS = 2;  ///< Address space for constant memory (VTX2)
+  const static unsigned LOCAL_ADDRESS    = 3;  ///< Address space for local memory.
+  const static unsigned PARAM_D_ADDRESS  = 6;  ///< Address space for direct addressable parameter memory (CONST0)
+  const static unsigned PARAM_I_ADDRESS  = 7;  ///< Address space for indirect addressable parameter memory (VTX1)
 
   // Do not re-order the CONSTANT_BUFFER_* enums.  Several places depend on this
   // order to be able to dynamically index a constant buffer, for example:
   //
   // ConstantBufferAS = CONSTANT_BUFFER_0 + CBIdx
 
-  CONSTANT_BUFFER_0 = 8,
-  CONSTANT_BUFFER_1 = 9,
-  CONSTANT_BUFFER_2 = 10,
-  CONSTANT_BUFFER_3 = 11,
-  CONSTANT_BUFFER_4 = 12,
-  CONSTANT_BUFFER_5 = 13,
-  CONSTANT_BUFFER_6 = 14,
-  CONSTANT_BUFFER_7 = 15,
-  CONSTANT_BUFFER_8 = 16,
-  CONSTANT_BUFFER_9 = 17,
-  CONSTANT_BUFFER_10 = 18,
-  CONSTANT_BUFFER_11 = 19,
-  CONSTANT_BUFFER_12 = 20,
-  CONSTANT_BUFFER_13 = 21,
-  CONSTANT_BUFFER_14 = 22,
-  CONSTANT_BUFFER_15 = 23,
+  const static unsigned CONSTANT_BUFFER_0 = 8;
+  const static unsigned CONSTANT_BUFFER_1 = 9;
+  const static unsigned CONSTANT_BUFFER_2 = 10;
+  const static unsigned CONSTANT_BUFFER_3 = 11;
+  const static unsigned CONSTANT_BUFFER_4 = 12;
+  const static unsigned CONSTANT_BUFFER_5 = 13;
+  const static unsigned CONSTANT_BUFFER_6 = 14;
+  const static unsigned CONSTANT_BUFFER_7 = 15;
+  const static unsigned CONSTANT_BUFFER_8 = 16;
+  const static unsigned CONSTANT_BUFFER_9 = 17;
+  const static unsigned CONSTANT_BUFFER_10 = 18;
+  const static unsigned CONSTANT_BUFFER_11 = 19;
+  const static unsigned CONSTANT_BUFFER_12 = 20;
+  const static unsigned CONSTANT_BUFFER_13 = 21;
+  const static unsigned CONSTANT_BUFFER_14 = 22;
+  const static unsigned CONSTANT_BUFFER_15 = 23;
 
   // Some places use this if the address space can't be determined.
-  UNKNOWN_ADDRESS_SPACE = ~0u
+  const static unsigned UNKNOWN_ADDRESS_SPACE = ~0u;
 };
 
-} // namespace AMDGPUAS
+namespace llvm {
+namespace AMDGPU {
+AMDGPUAS getAMDGPUAS(const Module &M);
+AMDGPUAS getAMDGPUAS(const TargetMachine &TM);
+AMDGPUAS getAMDGPUAS(Triple T);
+} // namespace AMDGPU
+} // namespace llvm
 
 #endif
diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td
index aed8ce17b8c933d157dd34b1e66b1ec7673ef6f3..2c7a2d8962d02067b2e4f0fef40cc711083d743b 100644
--- a/lib/Target/AMDGPU/AMDGPU.td
+++ b/lib/Target/AMDGPU/AMDGPU.td
@@ -67,12 +67,24 @@ def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access",
   "Support unaligned global loads and stores"
 >;
 
+def FeatureTrapHandler: SubtargetFeature<"trap-handler",
+  "TrapHandler",
+  "true",
+  "Trap handler support"
+>;
+
 def FeatureUnalignedScratchAccess : SubtargetFeature<"unaligned-scratch-access",
   "UnalignedScratchAccess",
   "true",
   "Support unaligned scratch loads and stores"
 >;
 
+def FeatureApertureRegs : SubtargetFeature<"aperture-regs",
+  "HasApertureRegs",
+  "true",
+  "Has Memory Aperture Base and Size Registers"
+>;
+
 // XNACK is disabled if SH_MEM_CONFIG.ADDRESS_MODE = GPUVM on chips that support
 // XNACK. The current default kernel driver setting is:
 // - graphics ring: XNACK disabled
@@ -154,6 +166,12 @@ def FeatureCIInsts : SubtargetFeature<"ci-insts",
   "Additional intstructions for CI+"
 >;
 
+def FeatureGFX9Insts : SubtargetFeature<"gfx9-insts",
+  "GFX9Insts",
+  "true",
+  "Additional intstructions for GFX9+"
+>;
+
 def FeatureSMemRealTime : SubtargetFeature<"s-memrealtime",
   "HasSMemRealTime",
   "true",
@@ -172,6 +190,12 @@ def Feature16BitInsts : SubtargetFeature<"16-bit-insts",
   "Has i16/f16 instructions"
 >;
 
+def FeatureVOP3P : SubtargetFeature<"vop3p",
+  "HasVOP3PInsts",
+  "true",
+  "Has VOP3P packed instructions"
+>;
+
 def FeatureMovrel : SubtargetFeature<"movrel",
   "HasMovrel",
   "true",
@@ -239,6 +263,12 @@ def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals",
   [FeatureFP64FP16Denormals]
 >;
 
+def FeatureDX10Clamp : SubtargetFeature<"dx10-clamp",
+  "DX10Clamp",
+  "true",
+  "clamp modifier clamps NaNs to 0.0"
+>;
+
 def FeatureFPExceptions : SubtargetFeature<"fp-exceptions",
   "FPExceptions",
   "true",
@@ -371,6 +401,15 @@ def FeatureVolcanicIslands : SubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
   ]
 >;
 
+def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9",
+  [FeatureFP64, FeatureLocalMemorySize65536,
+   FeatureWavefrontSize64, FeatureFlatAddressSpace, FeatureGCN,
+   FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts,
+   FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm,
+   FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode
+  ]
+>;
+
 class SubtargetFeatureISAVersion <int Major, int Minor, int Stepping,
                                   list<SubtargetFeature> Implies>
                                  : SubtargetFeature <
@@ -423,6 +462,9 @@ def FeatureISAVersion8_1_0 : SubtargetFeatureISAVersion <8,1,0,
    FeatureLDSBankCount16,
    FeatureXNACK]>;
 
+def FeatureISAVersion9_0_0 : SubtargetFeatureISAVersion <9,0,0,[]>;
+def FeatureISAVersion9_0_1 : SubtargetFeatureISAVersion <9,0,1,[]>;
+
 //===----------------------------------------------------------------------===//
 // Debugger related subtarget features.
 //===----------------------------------------------------------------------===//
@@ -528,14 +570,21 @@ def isVI : Predicate <
   "Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
   AssemblerPredicate<"FeatureGCN3Encoding">;
 
+def isGFX9 : Predicate <
+  "Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
+  AssemblerPredicate<"FeatureGFX9Insts">;
+
+// TODO: Either the name should be changed or we should simply use IsCI!
 def isCIVI : Predicate <
-  "Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS || "
-  "Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS"
->, AssemblerPredicate<"FeatureCIInsts">;
+  "Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,
+  AssemblerPredicate<"FeatureCIInsts">;
 
 def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">;
 
-def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">;
+def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
+  AssemblerPredicate<"Feature16BitInsts">;
+def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
+  AssemblerPredicate<"FeatureVOP3P">;
 
 def HasSDWA : Predicate<"Subtarget->hasSDWA()">,
   AssemblerPredicate<"FeatureSDWA">;
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f1fde96eeb69c6b4fec58176b8b959fe33dda5b5
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -0,0 +1,147 @@
+//===- AMDGPUAliasAnalysis ---------------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This is the AMDGPU address space based alias analysis pass.
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUAliasAnalysis.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/Passes.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-aa"
+
+// Register this pass...
+char AMDGPUAAWrapperPass::ID = 0;
+INITIALIZE_PASS(AMDGPUAAWrapperPass, "amdgpu-aa",
+                "AMDGPU Address space based Alias Analysis", false, true)
+
+ImmutablePass *llvm::createAMDGPUAAWrapperPass() {
+  return new AMDGPUAAWrapperPass();
+}
+
+void AMDGPUAAWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+}
+
+// Must match the table in getAliasResult.
+AMDGPUAAResult::ASAliasRulesTy::ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Arch_)
+  : Arch(Arch_), AS(AS_) {
+  // These arrays are indexed by address space value,
+  // i.e. enum elements 0 to 5.
+  static const AliasResult ASAliasRulesPrivIsZero[6][6] = {
+  /*             Private    Global    Constant  Group     Flat      Region*/
+  /* Private  */ {MayAlias, NoAlias , NoAlias , NoAlias , MayAlias, NoAlias},
+  /* Global   */ {NoAlias , MayAlias, NoAlias , NoAlias , MayAlias, NoAlias},
+  /* Constant */ {NoAlias , NoAlias , MayAlias, NoAlias , MayAlias, NoAlias},
+  /* Group    */ {NoAlias , NoAlias , NoAlias , MayAlias, MayAlias, NoAlias},
+  /* Flat     */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
+  /* Region   */ {NoAlias , NoAlias , NoAlias , NoAlias , MayAlias, MayAlias}
+  };
+  static const AliasResult ASAliasRulesGenIsZero[6][6] = {
+  /*             Flat       Global    Constant  Group     Region    Private */
+  /* Flat     */ {MayAlias, MayAlias, MayAlias, MayAlias, MayAlias, MayAlias},
+  /* Global   */ {MayAlias, MayAlias, NoAlias , NoAlias , NoAlias , NoAlias},
+  /* Constant */ {MayAlias, NoAlias , MayAlias, NoAlias , NoAlias,  NoAlias},
+  /* Group    */ {MayAlias, NoAlias , NoAlias , MayAlias, NoAlias , NoAlias},
+  /* Region   */ {MayAlias, NoAlias , NoAlias , NoAlias,  MayAlias, NoAlias},
+  /* Private  */ {MayAlias, NoAlias , NoAlias , NoAlias , NoAlias , MayAlias}
+  };
+  assert(AS.MAX_COMMON_ADDRESS <= 5);
+  if (AS.FLAT_ADDRESS == 0) {
+    assert(AS.GLOBAL_ADDRESS   == 1 &&
+           AS.REGION_ADDRESS   == 4 &&
+           AS.LOCAL_ADDRESS    == 3 &&
+           AS.CONSTANT_ADDRESS == 2 &&
+           AS.PRIVATE_ADDRESS  == 5);
+    ASAliasRules = &ASAliasRulesGenIsZero;
+  } else {
+    assert(AS.PRIVATE_ADDRESS  == 0 &&
+           AS.GLOBAL_ADDRESS   == 1 &&
+           AS.CONSTANT_ADDRESS == 2 &&
+           AS.LOCAL_ADDRESS    == 3 &&
+           AS.FLAT_ADDRESS     == 4 &&
+           AS.REGION_ADDRESS   == 5);
+    ASAliasRules = &ASAliasRulesPrivIsZero;
+  }
+}
+
+AliasResult AMDGPUAAResult::ASAliasRulesTy::getAliasResult(unsigned AS1,
+    unsigned AS2) const {
+  if (AS1 > AS.MAX_COMMON_ADDRESS || AS2 > AS.MAX_COMMON_ADDRESS) {
+    if (Arch == Triple::amdgcn)
+      report_fatal_error("Pointer address space out of range");
+    return AS1 == AS2 ? MayAlias : NoAlias;
+  }
+
+  return (*ASAliasRules)[AS1][AS2];
+}
+
+AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
+                                  const MemoryLocation &LocB) {
+  unsigned asA = LocA.Ptr->getType()->getPointerAddressSpace();
+  unsigned asB = LocB.Ptr->getType()->getPointerAddressSpace();
+
+  AliasResult Result = ASAliasRules.getAliasResult(asA, asB);
+  if (Result == NoAlias) return Result;
+
+  // Forward the query to the next alias analysis.
+  return AAResultBase::alias(LocA, LocB);
+}
+
+bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
+                                            bool OrLocal) {
+  const Value *Base = GetUnderlyingObject(Loc.Ptr, DL);
+
+  if (Base->getType()->getPointerAddressSpace() == AS.CONSTANT_ADDRESS) {
+    return true;
+  }
+
+  if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Base)) {
+    if (GV->isConstant())
+      return true;
+  } else if (const Argument *Arg = dyn_cast<Argument>(Base)) {
+    const Function *F = Arg->getParent();
+
+    // Only assume constant memory for arguments on kernels.
+    switch (F->getCallingConv()) {
+    default:
+      return AAResultBase::pointsToConstantMemory(Loc, OrLocal);
+    case CallingConv::AMDGPU_VS:
+    case CallingConv::AMDGPU_GS:
+    case CallingConv::AMDGPU_PS:
+    case CallingConv::AMDGPU_CS:
+    case CallingConv::AMDGPU_KERNEL:
+    case CallingConv::SPIR_KERNEL:
+      break;
+    }
+
+    unsigned ArgNo = Arg->getArgNo();
+    /* On an argument, ReadOnly attribute indicates that the function does
+       not write through this pointer argument, even though it may write
+       to the memory that the pointer points to.
+       On an argument, ReadNone attribute indicates that the function does
+       not dereference that pointer argument, even though it may read or write
+       the memory that the pointer points to if accessed through other pointers.
+     */
+    if (F->getAttributes().hasAttribute(ArgNo + 1, Attribute::NoAlias) &&
+          (F->getAttributes().hasAttribute(ArgNo + 1, Attribute::ReadNone) ||
+           F->getAttributes().hasAttribute(ArgNo + 1, Attribute::ReadOnly))) {
+      return true;
+    }
+  }
+  return AAResultBase::pointsToConstantMemory(Loc, OrLocal);
+}
diff --git a/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
new file mode 100644
index 0000000000000000000000000000000000000000..5f8ed9b1f9a3e99df50e18f3f68757e9f870544e
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
@@ -0,0 +1,102 @@
+//===- AMDGPUAliasAnalysis ---------------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This is the AMDGPU address space based alias analysis pass.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H
+#define LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H
+
+#include "AMDGPU.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+
+namespace llvm {
+
+/// A simple AA result that uses pointer address spaces to answer queries.
+class AMDGPUAAResult : public AAResultBase<AMDGPUAAResult> {
+  friend AAResultBase<AMDGPUAAResult>;
+
+  const DataLayout &DL;
+  AMDGPUAS AS;
+
+public:
+  explicit AMDGPUAAResult(const DataLayout &DL, Triple T) : AAResultBase(),
+    DL(DL), AS(AMDGPU::getAMDGPUAS(T)), ASAliasRules(AS, T.getArch()) {}
+  AMDGPUAAResult(AMDGPUAAResult &&Arg)
+      : AAResultBase(std::move(Arg)), DL(Arg.DL), AS(Arg.AS),
+        ASAliasRules(Arg.ASAliasRules){}
+
+  /// Handle invalidation events from the new pass manager.
+  ///
+  /// By definition, this result is stateless and so remains valid.
+  bool invalidate(Function &, const PreservedAnalyses &) { return false; }
+
+  AliasResult alias(const MemoryLocation &LocA, const MemoryLocation &LocB);
+  bool pointsToConstantMemory(const MemoryLocation &Loc, bool OrLocal);
+
+private:
+  bool Aliases(const MDNode *A, const MDNode *B) const;
+  bool PathAliases(const MDNode *A, const MDNode *B) const;
+
+  class ASAliasRulesTy {
+  public:
+    ASAliasRulesTy(AMDGPUAS AS_, Triple::ArchType Arch_);
+    AliasResult getAliasResult(unsigned AS1, unsigned AS2) const;
+  private:
+    Triple::ArchType Arch;
+    AMDGPUAS AS;
+    const AliasResult (*ASAliasRules)[6][6];
+  } ASAliasRules;
+};
+
+/// Analysis pass providing a never-invalidated alias analysis result.
+class AMDGPUAA : public AnalysisInfoMixin<AMDGPUAA> {
+  friend AnalysisInfoMixin<AMDGPUAA>;
+  static char PassID;
+
+public:
+  typedef AMDGPUAAResult Result;
+
+  AMDGPUAAResult run(Function &F, AnalysisManager<Function> &AM) {
+    return AMDGPUAAResult(F.getParent()->getDataLayout(),
+        Triple(F.getParent()->getTargetTriple()));
+  }
+};
+
+/// Legacy wrapper pass to provide the AMDGPUAAResult object.
+class AMDGPUAAWrapperPass : public ImmutablePass {
+  std::unique_ptr<AMDGPUAAResult> Result;
+
+public:
+  static char ID;
+
+  AMDGPUAAWrapperPass() : ImmutablePass(ID) {
+    initializeAMDGPUAAWrapperPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  AMDGPUAAResult &getResult() { return *Result; }
+  const AMDGPUAAResult &getResult() const { return *Result; }
+
+  bool doInitialization(Module &M) override {
+    Result.reset(new AMDGPUAAResult(M.getDataLayout(),
+        Triple(M.getTargetTriple())));
+    return false;
+  }
+  bool doFinalization(Module &M) override {
+    Result.reset();
+    return false;
+  }
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+}
+#endif // LLVM_ANALYSIS_AMDGPUALIASANALYSIS_H
diff --git a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index 067a16a2af7f32c7294eaf518f3b37cfc39ab36b..1d03714874e284277aea2b137a4d4ac3d2dfd49c 100644
--- a/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -24,8 +24,10 @@ namespace {
 class AMDGPUAlwaysInline : public ModulePass {
   static char ID;
 
+  bool GlobalOpt;
+
 public:
-  AMDGPUAlwaysInline() : ModulePass(ID) { }
+  AMDGPUAlwaysInline(bool GlobalOpt) : ModulePass(ID), GlobalOpt(GlobalOpt) { }
   bool runOnModule(Module &M) override;
   StringRef getPassName() const override { return "AMDGPU Always Inline Pass"; }
 };
@@ -45,8 +47,10 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
     }
   }
 
-  for (GlobalAlias* A : AliasesToRemove) {
-    A->eraseFromParent();
+  if (GlobalOpt) {
+    for (GlobalAlias* A : AliasesToRemove) {
+      A->eraseFromParent();
+    }
   }
 
   for (Function &F : M) {
@@ -70,6 +74,6 @@ bool AMDGPUAlwaysInline::runOnModule(Module &M) {
   return false;
 }
 
-ModulePass *llvm::createAMDGPUAlwaysInlinePass() {
-  return new AMDGPUAlwaysInline();
+ModulePass *llvm::createAMDGPUAlwaysInlinePass(bool GlobalOpt) {
+  return new AMDGPUAlwaysInline(GlobalOpt);
 }
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index 85c76737b8039c7332fbe6ec71ecede14c72073d..3d8db7cd8af55a75fe1493cc2a2885fb29ca760c 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Instructions.h"
@@ -26,7 +27,9 @@ namespace {
 
 class AMDGPUAnnotateKernelFeatures : public ModulePass {
 private:
-  static bool hasAddrSpaceCast(const Function &F);
+  const TargetMachine *TM;
+  AMDGPUAS AS;
+  static bool hasAddrSpaceCast(const Function &F, AMDGPUAS AS);
 
   void addAttrToCallers(Function *Intrin, StringRef AttrName);
   bool addAttrsForIntrinsics(Module &M, ArrayRef<StringRef[2]>);
@@ -34,7 +37,8 @@ private:
 public:
   static char ID;
 
-  AMDGPUAnnotateKernelFeatures() : ModulePass(ID) { }
+  AMDGPUAnnotateKernelFeatures(const TargetMachine *TM_ = nullptr) :
+                               ModulePass(ID), TM(TM_) {}
   bool runOnModule(Module &M) override;
   StringRef getPassName() const override {
     return "AMDGPU Annotate Kernel Features";
@@ -45,10 +49,11 @@ public:
     ModulePass::getAnalysisUsage(AU);
   }
 
-  static bool visitConstantExpr(const ConstantExpr *CE);
+  static bool visitConstantExpr(const ConstantExpr *CE, AMDGPUAS AS);
   static bool visitConstantExprsRecursively(
     const Constant *EntryC,
-    SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
+    SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
+    AMDGPUAS AS);
 };
 
 }
@@ -62,18 +67,20 @@ INITIALIZE_PASS(AMDGPUAnnotateKernelFeatures, DEBUG_TYPE,
 
 
 // The queue ptr is only needed when casting to flat, not from it.
-static bool castRequiresQueuePtr(unsigned SrcAS) {
-  return SrcAS == AMDGPUAS::LOCAL_ADDRESS || SrcAS == AMDGPUAS::PRIVATE_ADDRESS;
+static bool castRequiresQueuePtr(unsigned SrcAS, const AMDGPUAS &AS) {
+  return SrcAS == AS.LOCAL_ADDRESS || SrcAS == AS.PRIVATE_ADDRESS;
 }
 
-static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
-  return castRequiresQueuePtr(ASC->getSrcAddressSpace());
+static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC,
+    const AMDGPUAS &AS) {
+  return castRequiresQueuePtr(ASC->getSrcAddressSpace(), AS);
 }
 
-bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
+bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE,
+    AMDGPUAS AS) {
   if (CE->getOpcode() == Instruction::AddrSpaceCast) {
     unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
-    return castRequiresQueuePtr(SrcAS);
+    return castRequiresQueuePtr(SrcAS, AS);
   }
 
   return false;
@@ -81,7 +88,8 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
 
 bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
   const Constant *EntryC,
-  SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {
+  SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
+  AMDGPUAS AS) {
 
   if (!ConstantExprVisited.insert(EntryC).second)
     return false;
@@ -94,7 +102,7 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
 
     // Check this constant expression.
     if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
-      if (visitConstantExpr(CE))
+      if (visitConstantExpr(CE, AS))
         return true;
     }
 
@@ -115,13 +123,14 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
 }
 
 // Return true if an addrspacecast is used that requires the queue ptr.
-bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) {
+bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F,
+    AMDGPUAS AS) {
   SmallPtrSet<const Constant *, 8> ConstantExprVisited;
 
   for (const BasicBlock &BB : F) {
     for (const Instruction &I : BB) {
       if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
-        if (castRequiresQueuePtr(ASC))
+        if (castRequiresQueuePtr(ASC, AS))
           return true;
       }
 
@@ -130,7 +139,7 @@ bool AMDGPUAnnotateKernelFeatures::hasAddrSpaceCast(const Function &F) {
         if (!OpC)
           continue;
 
-        if (visitConstantExprsRecursively(OpC, ConstantExprVisited))
+        if (visitConstantExprsRecursively(OpC, ConstantExprVisited, AS))
           return true;
       }
     }
@@ -170,6 +179,7 @@ bool AMDGPUAnnotateKernelFeatures::addAttrsForIntrinsics(
 
 bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
   Triple TT(M.getTargetTriple());
+  AS = AMDGPU::getAMDGPUAS(M);
 
   static const StringRef IntrinsicToAttr[][2] = {
     // .x omitted
@@ -191,7 +201,8 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
     { "llvm.amdgcn.dispatch.ptr", "amdgpu-dispatch-ptr" },
     { "llvm.amdgcn.queue.ptr", "amdgpu-queue-ptr" },
     { "llvm.amdgcn.dispatch.id", "amdgpu-dispatch-id" },
-	{ "llvm.trap", "amdgpu-queue-ptr" }
+    { "llvm.trap", "amdgpu-queue-ptr" },
+    { "llvm.debugtrap", "amdgpu-queue-ptr" }
   };
 
   // TODO: We should not add the attributes if the known compile time workgroup
@@ -210,7 +221,9 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
       if (F.hasFnAttribute("amdgpu-queue-ptr"))
         continue;
 
-      if (hasAddrSpaceCast(F))
+      bool HasApertureRegs =
+        TM && TM->getSubtarget<AMDGPUSubtarget>(F).hasApertureRegs();
+      if (!HasApertureRegs && hasAddrSpaceCast(F, AS))
         F.addFnAttr("amdgpu-queue-ptr");
     }
   }
@@ -218,6 +231,6 @@ bool AMDGPUAnnotateKernelFeatures::runOnModule(Module &M) {
   return Changed;
 }
 
-ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass() {
-  return new AMDGPUAnnotateKernelFeatures();
+ModulePass *llvm::createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM) {
+  return new AMDGPUAnnotateKernelFeatures(TM);
 }
diff --git a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index c011be6fa1692735ad32956ff3c457b977e82f78..91b3649f5c39da89dd579ea9a0a706a912b93b54 100644
--- a/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -37,6 +37,7 @@ class AMDGPUAnnotateUniformValues : public FunctionPass,
   LoopInfo *LI;
   DenseMap<Value*, GetElementPtrInst*> noClobberClones;
   bool isKernelFunc;
+  AMDGPUAS AMDGPUASI;
 
 public:
   static char ID;
@@ -130,8 +131,8 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
   Value *Ptr = I.getPointerOperand();
   if (!DA->isUniform(Ptr))
     return;
-  auto isGlobalLoad = [](LoadInst &Load)->bool {
-    return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+  auto isGlobalLoad = [&](LoadInst &Load)->bool {
+    return Load.getPointerAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;
   };
   // We're tracking up to the Function boundaries
   // We cannot go beyond because of FunctionPass restrictions
@@ -166,6 +167,7 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
 }
 
 bool AMDGPUAnnotateUniformValues::doInitialization(Module &M) {
+  AMDGPUASI = AMDGPU::getAMDGPUAS(M);
   return false;
 }
 
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index d1bc438a6c556cc70910d1500138a933ee8ccdd8..0446655830d1f6a6060d5eb5c55192f2ed887bf5 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -17,11 +17,11 @@
 //
 
 #include "AMDGPUAsmPrinter.h"
+#include "AMDGPUTargetMachine.h"
 #include "MCTargetDesc/AMDGPUTargetStreamer.h"
 #include "InstPrinter/AMDGPUInstPrinter.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "AMDGPU.h"
-#include "AMDKernelCodeT.h"
 #include "AMDGPUSubtarget.h"
 #include "R600Defines.h"
 #include "R600MachineFunctionInfo.h"
@@ -93,28 +93,40 @@ extern "C" void LLVMInitializeAMDGPUAsmPrinter() {
 
 AMDGPUAsmPrinter::AMDGPUAsmPrinter(TargetMachine &TM,
                                    std::unique_ptr<MCStreamer> Streamer)
-  : AsmPrinter(TM, std::move(Streamer)) {}
+  : AsmPrinter(TM, std::move(Streamer)) {
+    AMDGPUASI = static_cast<AMDGPUTargetMachine*>(&TM)->getAMDGPUAS();
+  }
 
 StringRef AMDGPUAsmPrinter::getPassName() const {
   return "AMDGPU Assembly Printer";
 }
 
+const MCSubtargetInfo* AMDGPUAsmPrinter::getSTI() const {
+  return TM.getMCSubtargetInfo();
+}
+
+AMDGPUTargetStreamer& AMDGPUAsmPrinter::getTargetStreamer() const {
+  return static_cast<AMDGPUTargetStreamer&>(*OutStreamer->getTargetStreamer());
+}
+
 void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
   if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
     return;
 
-  AMDGPUTargetStreamer *TS =
-      static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+  AMDGPU::IsaInfo::IsaVersion ISA =
+      AMDGPU::IsaInfo::getIsaVersion(getSTI()->getFeatureBits());
 
-  TS->EmitDirectiveHSACodeObjectVersion(2, 1);
+  getTargetStreamer().EmitDirectiveHSACodeObjectVersion(2, 1);
+  getTargetStreamer().EmitDirectiveHSACodeObjectISA(
+      ISA.Major, ISA.Minor, ISA.Stepping, "AMD", "AMDGPU");
+  getTargetStreamer().EmitStartOfCodeObjectMetadata(M);
+}
 
-  const MCSubtargetInfo *STI = TM.getMCSubtargetInfo();
-  AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(STI->getFeatureBits());
-  TS->EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor, ISA.Stepping,
-                                    "AMD", "AMDGPU");
+void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
+  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
+    return;
 
-  // Emit runtime metadata.
-  TS->EmitRuntimeMetadata(M);
+  getTargetStreamer().EmitEndOfCodeObjectMetadata();
 }
 
 bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
@@ -131,25 +143,32 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
   return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
 }
 
-
 void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
   SIProgramInfo KernelInfo;
+  amd_kernel_code_t KernelCode;
   if (STM.isAmdCodeObjectV2(*MF)) {
     getSIProgramInfo(KernelInfo, *MF);
-    EmitAmdKernelCodeT(*MF, KernelInfo);
+    getAmdKernelCode(KernelCode, KernelInfo, *MF);
+
+    OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
+    getTargetStreamer().EmitAMDKernelCodeT(KernelCode);
   }
+
+  if (TM.getTargetTriple().getOS() != Triple::AMDHSA)
+    return;
+  getTargetStreamer().EmitKernelCodeObjectMetadata(*MF->getFunction(),
+                                                   KernelCode);
 }
 
 void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
   const SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   const AMDGPUSubtarget &STM = MF->getSubtarget<AMDGPUSubtarget>();
-  if (MFI->isKernel() && STM.isAmdCodeObjectV2(*MF)) {
-    AMDGPUTargetStreamer *TS =
-        static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
+  if (MFI->isEntryFunction() && STM.isAmdCodeObjectV2(*MF)) {
     SmallString<128> SymbolName;
     getNameWithPrefix(SymbolName, MF->getFunction()),
-    TS->EmitAMDGPUSymbolType(SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
+    getTargetStreamer().EmitAMDGPUSymbolType(
+        SymbolName, ELF::STT_AMDGPU_HSA_KERNEL);
   }
 
   AsmPrinter::EmitFunctionEntryLabel();
@@ -158,7 +177,7 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
 void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
 
   // Group segment variables aren't emitted in HSA.
-  if (AMDGPU::isGroupSegment(GV))
+  if (AMDGPU::isGroupSegment(GV, AMDGPUASI))
     return;
 
   AsmPrinter::EmitGlobalVariable(GV);
@@ -242,6 +261,9 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
       OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:USER_SGPR: " +
                                   Twine(G_00B84C_USER_SGPR(KernelInfo.ComputePGMRSrc2)),
                                   false);
+      OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TRAP_HANDLER: " +
+                                  Twine(G_00B84C_TRAP_HANDLER(KernelInfo.ComputePGMRSrc2)),
+                                  false);
       OutStreamer->emitRawComment(" COMPUTE_PGM_RSRC2:TGID_X_EN: " +
                                   Twine(G_00B84C_TGID_X_EN(KernelInfo.ComputePGMRSrc2)),
                                   false);
@@ -377,6 +399,10 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
         case AMDGPU::EXEC_HI:
         case AMDGPU::SCC:
         case AMDGPU::M0:
+        case AMDGPU::SRC_SHARED_BASE:
+        case AMDGPU::SRC_SHARED_LIMIT:
+        case AMDGPU::SRC_PRIVATE_BASE:
+        case AMDGPU::SRC_PRIVATE_LIMIT:
           continue;
 
         case AMDGPU::VCC:
@@ -473,33 +499,20 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
       ExtraSGPRs = 6;
   }
 
-  // Record first reserved register and reserved register count fields, and
-  // update max register counts if "amdgpu-debugger-reserve-regs" attribute was
-  // requested.
-  ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0;
-  ProgInfo.ReservedVGPRCount = RI->getNumDebuggerReservedVGPRs(STM);
-
-  // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
-  // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
-  // attribute was requested.
-  if (STM.debuggerEmitPrologue()) {
-    ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
-      RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
-    ProgInfo.DebuggerPrivateSegmentBufferSGPR =
-      RI->getHWRegIndex(MFI->getScratchRSrcReg());
-  }
+  unsigned ExtraVGPRs = STM.getReservedNumVGPRs(MF);
 
   // Check the addressable register limit before we add ExtraSGPRs.
   if (STM.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS &&
       !STM.hasSGPRInitBug()) {
-    unsigned MaxAddressableNumSGPRs = STM.getMaxNumSGPRs();
+    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
     if (MaxSGPR + 1 > MaxAddressableNumSGPRs) {
       // This can happen due to a compiler bug or when using inline asm.
       LLVMContext &Ctx = MF.getFunction()->getContext();
       DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
                                        "addressable scalar registers",
                                        MaxSGPR + 1, DS_Error,
-                                       DK_ResourceLimit, MaxAddressableNumSGPRs);
+                                       DK_ResourceLimit,
+                                       MaxAddressableNumSGPRs);
       Ctx.diagnose(Diag);
       MaxSGPR = MaxAddressableNumSGPRs - 1;
     }
@@ -507,41 +520,43 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
 
   // Account for extra SGPRs and VGPRs reserved for debugger use.
   MaxSGPR += ExtraSGPRs;
-  MaxVGPR += RI->getNumDebuggerReservedVGPRs(STM);
+  MaxVGPR += ExtraVGPRs;
 
   // We found the maximum register index. They start at 0, so add one to get the
   // number of registers.
-  ProgInfo.NumVGPR = MaxVGPR + 1;
   ProgInfo.NumSGPR = MaxSGPR + 1;
+  ProgInfo.NumVGPR = MaxVGPR + 1;
 
   // Adjust number of registers used to meet default/requested minimum/maximum
   // number of waves per execution unit request.
   ProgInfo.NumSGPRsForWavesPerEU = std::max(
-    ProgInfo.NumSGPR, RI->getMinNumSGPRs(STM, MFI->getMaxWavesPerEU()));
+    ProgInfo.NumSGPR, STM.getMinNumSGPRs(MFI->getMaxWavesPerEU()));
   ProgInfo.NumVGPRsForWavesPerEU = std::max(
-    ProgInfo.NumVGPR, RI->getMinNumVGPRs(MFI->getMaxWavesPerEU()));
+    ProgInfo.NumVGPR, STM.getMinNumVGPRs(MFI->getMaxWavesPerEU()));
 
   if (STM.getGeneration() <= AMDGPUSubtarget::SEA_ISLANDS ||
       STM.hasSGPRInitBug()) {
-    unsigned MaxNumSGPRs = STM.getMaxNumSGPRs();
-    if (ProgInfo.NumSGPR > MaxNumSGPRs) {
-      // This can happen due to a compiler bug or when using inline asm to use the
-      // registers which are usually reserved for vcc etc.
-
+    unsigned MaxAddressableNumSGPRs = STM.getAddressableNumSGPRs();
+    if (ProgInfo.NumSGPR > MaxAddressableNumSGPRs) {
+      // This can happen due to a compiler bug or when using inline asm to use
+      // the registers which are usually reserved for vcc etc.
       LLVMContext &Ctx = MF.getFunction()->getContext();
       DiagnosticInfoResourceLimit Diag(*MF.getFunction(),
                                        "scalar registers",
                                        ProgInfo.NumSGPR, DS_Error,
-                                       DK_ResourceLimit, MaxNumSGPRs);
+                                       DK_ResourceLimit,
+                                       MaxAddressableNumSGPRs);
       Ctx.diagnose(Diag);
-      ProgInfo.NumSGPR = MaxNumSGPRs;
-      ProgInfo.NumSGPRsForWavesPerEU = MaxNumSGPRs;
+      ProgInfo.NumSGPR = MaxAddressableNumSGPRs;
+      ProgInfo.NumSGPRsForWavesPerEU = MaxAddressableNumSGPRs;
     }
   }
 
   if (STM.hasSGPRInitBug()) {
-    ProgInfo.NumSGPR = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
-    ProgInfo.NumSGPRsForWavesPerEU = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+    ProgInfo.NumSGPR =
+        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
+    ProgInfo.NumSGPRsForWavesPerEU =
+        AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
   }
 
   if (MFI->NumUserSGPRs > STM.getMaxNumUserSGPRs()) {
@@ -560,13 +575,27 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
 
   // SGPRBlocks is actual number of SGPR blocks minus 1.
   ProgInfo.SGPRBlocks = alignTo(ProgInfo.NumSGPRsForWavesPerEU,
-                                RI->getSGPRAllocGranule());
-  ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / RI->getSGPRAllocGranule() - 1;
+                                STM.getSGPREncodingGranule());
+  ProgInfo.SGPRBlocks = ProgInfo.SGPRBlocks / STM.getSGPREncodingGranule() - 1;
 
   // VGPRBlocks is actual number of VGPR blocks minus 1.
   ProgInfo.VGPRBlocks = alignTo(ProgInfo.NumVGPRsForWavesPerEU,
-                                RI->getVGPRAllocGranule());
-  ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / RI->getVGPRAllocGranule() - 1;
+                                STM.getVGPREncodingGranule());
+  ProgInfo.VGPRBlocks = ProgInfo.VGPRBlocks / STM.getVGPREncodingGranule() - 1;
+
+  // Record first reserved VGPR and number of reserved VGPRs.
+  ProgInfo.ReservedVGPRFirst = STM.debuggerReserveRegs() ? MaxVGPR + 1 : 0;
+  ProgInfo.ReservedVGPRCount = STM.getReservedNumVGPRs(MF);
+
+  // Update DebuggerWavefrontPrivateSegmentOffsetSGPR and
+  // DebuggerPrivateSegmentBufferSGPR fields if "amdgpu-debugger-emit-prologue"
+  // attribute was requested.
+  if (STM.debuggerEmitPrologue()) {
+    ProgInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR =
+      RI->getHWRegIndex(MFI->getScratchWaveOffsetReg());
+    ProgInfo.DebuggerPrivateSegmentBufferSGPR =
+      RI->getHWRegIndex(MFI->getScratchRSrcReg());
+  }
 
   // Set the value to initialize FP_ROUND and FP_DENORM parts of the mode
   // register.
@@ -575,7 +604,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.IEEEMode = STM.enableIEEEBit(MF);
 
   // Make clamp modifier on NaN input returns 0.
-  ProgInfo.DX10Clamp = 1;
+  ProgInfo.DX10Clamp = STM.enableDX10Clamp();
 
   const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
   ProgInfo.ScratchSize = FrameInfo.getStackSize();
@@ -630,6 +659,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
   ProgInfo.ComputePGMRSrc2 =
       S_00B84C_SCRATCH_EN(ProgInfo.ScratchBlocks > 0) |
       S_00B84C_USER_SGPR(MFI->getNumUserSGPRs()) |
+      S_00B84C_TRAP_HANDLER(STM.isTrapHandlerEnabled()) |
       S_00B84C_TGID_X_EN(MFI->hasWorkGroupIDX()) |
       S_00B84C_TGID_Y_EN(MFI->hasWorkGroupIDY()) |
       S_00B84C_TGID_Z_EN(MFI->hasWorkGroupIDZ()) |
@@ -683,7 +713,7 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
     OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
     OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(KernelInfo.LDSBlocks), 4);
     OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
-    OutStreamer->EmitIntValue(MFI->PSInputEna, 4);
+    OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
     OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
     OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
   }
@@ -708,97 +738,88 @@ static amd_element_byte_size_t getElementByteSizeValue(unsigned Size) {
   }
 }
 
-void AMDGPUAsmPrinter::EmitAmdKernelCodeT(const MachineFunction &MF,
-                                         const SIProgramInfo &KernelInfo) const {
+void AMDGPUAsmPrinter::getAmdKernelCode(amd_kernel_code_t &Out,
+                                        const SIProgramInfo &KernelInfo,
+                                        const MachineFunction &MF) const {
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   const SISubtarget &STM = MF.getSubtarget<SISubtarget>();
-  amd_kernel_code_t header;
 
-  AMDGPU::initDefaultAMDKernelCodeT(header, STM.getFeatureBits());
+  AMDGPU::initDefaultAMDKernelCodeT(Out, STM.getFeatureBits());
 
-  header.compute_pgm_resource_registers =
+  Out.compute_pgm_resource_registers =
       KernelInfo.ComputePGMRSrc1 |
       (KernelInfo.ComputePGMRSrc2 << 32);
-  header.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
-
+  Out.code_properties = AMD_CODE_PROPERTY_IS_PTR64;
 
-  AMD_HSA_BITS_SET(header.code_properties,
+  AMD_HSA_BITS_SET(Out.code_properties,
                    AMD_CODE_PROPERTY_PRIVATE_ELEMENT_SIZE,
                    getElementByteSizeValue(STM.getMaxPrivateElementSize()));
 
   if (MFI->hasPrivateSegmentBuffer()) {
-    header.code_properties |=
+    Out.code_properties |=
       AMD_CODE_PROPERTY_ENABLE_SGPR_PRIVATE_SEGMENT_BUFFER;
   }
 
   if (MFI->hasDispatchPtr())
-    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
 
   if (MFI->hasQueuePtr())
-    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
+    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_QUEUE_PTR;
 
   if (MFI->hasKernargSegmentPtr())
-    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
+    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_KERNARG_SEGMENT_PTR;
 
   if (MFI->hasDispatchID())
-    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
+    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_ID;
 
   if (MFI->hasFlatScratchInit())
-    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
-
-  // TODO: Private segment size
+    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_FLAT_SCRATCH_INIT;
 
   if (MFI->hasGridWorkgroupCountX()) {
-    header.code_properties |=
+    Out.code_properties |=
       AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_X;
   }
 
   if (MFI->hasGridWorkgroupCountY()) {
-    header.code_properties |=
+    Out.code_properties |=
       AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Y;
   }
 
   if (MFI->hasGridWorkgroupCountZ()) {
-    header.code_properties |=
+    Out.code_properties |=
       AMD_CODE_PROPERTY_ENABLE_SGPR_GRID_WORKGROUP_COUNT_Z;
   }
 
   if (MFI->hasDispatchPtr())
-    header.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
+    Out.code_properties |= AMD_CODE_PROPERTY_ENABLE_SGPR_DISPATCH_PTR;
 
   if (STM.debuggerSupported())
-    header.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
+    Out.code_properties |= AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED;
 
   if (STM.isXNACKEnabled())
-    header.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
+    Out.code_properties |= AMD_CODE_PROPERTY_IS_XNACK_SUPPORTED;
 
   // FIXME: Should use getKernArgSize
-  header.kernarg_segment_byte_size =
+  Out.kernarg_segment_byte_size =
     STM.getKernArgSegmentSize(MF, MFI->getABIArgOffset());
-  header.wavefront_sgpr_count = KernelInfo.NumSGPR;
-  header.workitem_vgpr_count = KernelInfo.NumVGPR;
-  header.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
-  header.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
-  header.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
-  header.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
+  Out.wavefront_sgpr_count = KernelInfo.NumSGPR;
+  Out.workitem_vgpr_count = KernelInfo.NumVGPR;
+  Out.workitem_private_segment_byte_size = KernelInfo.ScratchSize;
+  Out.workgroup_group_segment_byte_size = KernelInfo.LDSSize;
+  Out.reserved_vgpr_first = KernelInfo.ReservedVGPRFirst;
+  Out.reserved_vgpr_count = KernelInfo.ReservedVGPRCount;
 
   // These alignment values are specified in powers of two, so alignment =
   // 2^n.  The minimum alignment is 2^4 = 16.
-  header.kernarg_segment_alignment = std::max((size_t)4,
+  Out.kernarg_segment_alignment = std::max((size_t)4,
       countTrailingZeros(MFI->getMaxKernArgAlign()));
 
   if (STM.debuggerEmitPrologue()) {
-    header.debug_wavefront_private_segment_offset_sgpr =
+    Out.debug_wavefront_private_segment_offset_sgpr =
       KernelInfo.DebuggerWavefrontPrivateSegmentOffsetSGPR;
-    header.debug_private_segment_buffer_sgpr =
+    Out.debug_private_segment_buffer_sgpr =
       KernelInfo.DebuggerPrivateSegmentBufferSGPR;
   }
-
-  AMDGPUTargetStreamer *TS =
-      static_cast<AMDGPUTargetStreamer *>(OutStreamer->getTargetStreamer());
-
-  OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
-  TS->EmitAMDKernelCodeT(header);
 }
 
 bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
diff --git a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index 99fa157a3ec31dbd1d5c53b4da2d9dcffc08fb50..13425c8b2a0f5954617e1abb623aa489139d4e31 100644
--- a/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -15,6 +15,8 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUASMPRINTER_H
 
+#include "AMDKernelCodeT.h"
+#include "AMDGPU.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include <cstddef>
@@ -26,6 +28,7 @@
 
 namespace llvm {
 
+class AMDGPUTargetStreamer;
 class MCOperand;
 
 class AMDGPUAsmPrinter final : public AsmPrinter {
@@ -88,6 +91,8 @@ private:
   };
 
   void getSIProgramInfo(SIProgramInfo &Out, const MachineFunction &MF) const;
+  void getAmdKernelCode(amd_kernel_code_t &Out, const SIProgramInfo &KernelInfo,
+                        const MachineFunction &MF) const;
   void findNumUsedRegistersSI(const MachineFunction &MF,
                               unsigned &NumSGPR,
                               unsigned &NumVGPR) const;
@@ -96,21 +101,28 @@ private:
   /// can correctly setup the GPU state.
   void EmitProgramInfoR600(const MachineFunction &MF);
   void EmitProgramInfoSI(const MachineFunction &MF, const SIProgramInfo &KernelInfo);
-  void EmitAmdKernelCodeT(const MachineFunction &MF,
-                          const SIProgramInfo &KernelInfo) const;
 
 public:
   explicit AMDGPUAsmPrinter(TargetMachine &TM,
                             std::unique_ptr<MCStreamer> Streamer);
 
-  bool runOnMachineFunction(MachineFunction &MF) override;
-
   StringRef getPassName() const override;
 
+  const MCSubtargetInfo* getSTI() const;
+
+  AMDGPUTargetStreamer& getTargetStreamer() const;
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
   /// \brief Wrapper for MCInstLowering.lowerOperand() for the tblgen'erated
   /// pseudo lowering.
   bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const;
 
+  /// \brief Lower the specified LLVM Constant to an MCExpr.
+  /// The base AsmPrinter::lowerConstant does not know how to lower
+  /// addrspacecast, therefore such constants are lowered by this function.
+  const MCExpr *lowerConstant(const Constant *CV) override;
+
   /// \brief tblgen'erated driver function for lowering simple MI->MC pseudo
   /// instructions.
   bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
@@ -127,6 +139,8 @@ public:
 
   void EmitStartOfAsmFile(Module &M) override;
 
+  void EmitEndOfAsmFile(Module &M) override;
+
   bool isBlockOnlyReachableByFallthrough(
     const MachineBasicBlock *MBB) const override;
 
@@ -137,6 +151,7 @@ public:
 protected:
   std::vector<std::string> DisasmLines, HexLines;
   size_t DisasmLineMaxLen;
+  AMDGPUAS AMDGPUASI;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index ae5fb358154244b57131e9ace20723a0edece7c8..e67ae092fddae9c5006eae11bcdca356464b22cd 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -31,7 +31,7 @@ using namespace llvm;
 #endif
 
 AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
-  : CallLowering(&TLI) {
+  : CallLowering(&TLI), AMDGPUASI(TLI.getAMDGPUAS()) {
 }
 
 bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
@@ -49,8 +49,8 @@ unsigned AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &MIRBuilder,
   MachineRegisterInfo &MRI = MF.getRegInfo();
   const Function &F = *MF.getFunction();
   const DataLayout &DL = F.getParent()->getDataLayout();
-  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
-  LLT PtrType(*PtrTy, DL);
+  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS);
+  LLT PtrType = getLLTForType(*PtrTy, DL);
   unsigned DstReg = MRI.createGenericVirtualRegister(PtrType);
   unsigned KernArgSegmentPtr =
       TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
@@ -70,7 +70,7 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &MIRBuilder,
   MachineFunction &MF = MIRBuilder.getMF();
   const Function &F = *MF.getFunction();
   const DataLayout &DL = F.getParent()->getDataLayout();
-  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
+  PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUASI.CONSTANT_ADDRESS);
   MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
   unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
   unsigned Align = DL.getABITypeAlignment(ParamTy);
diff --git a/lib/Target/AMDGPU/AMDGPUCallLowering.h b/lib/Target/AMDGPU/AMDGPUCallLowering.h
index b5f3fa5617b896488ff29ba5652d288ab3e529c6..09bdf8ffcde7b1e1a7ad123d129c5e925be696d6 100644
--- a/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -15,6 +15,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUCALLLOWERING_H
 
+#include "AMDGPU.h"
 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
 
 namespace llvm {
@@ -22,6 +23,7 @@ namespace llvm {
 class AMDGPUTargetLowering;
 
 class AMDGPUCallLowering: public CallLowering {
+  AMDGPUAS AMDGPUASI;
 
   unsigned lowerParameterPtr(MachineIRBuilder &MIRBuilder, Type *ParamTy,
                              unsigned Offset) const;
diff --git a/lib/Target/AMDGPU/AMDGPUCallingConv.td b/lib/Target/AMDGPU/AMDGPUCallingConv.td
index 47dfa499206811a6841c0241a08535f01f75818a..d308f718aae130f3e4fc5815ba9c86ae7212cb25 100644
--- a/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -17,7 +17,7 @@ class CCIfNotInReg<CCAction A> : CCIf<"!ArgFlags.isInReg()", A> {}
 // Calling convention for SI
 def CC_SI : CallingConv<[
 
-  CCIfInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+  CCIfInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[
     SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
     SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
@@ -25,17 +25,13 @@ def CC_SI : CallingConv<[
     SGPR32, SGPR33, SGPR34, SGPR35, SGPR36, SGPR37, SGPR38, SGPR39
   ]>>>,
 
-  CCIfInReg<CCIfType<[i64] , CCAssignToRegWithShadow<
-    [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14,
-      SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30,
-      SGPR32, SGPR34, SGPR36, SGPR38 ],
-    [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15,
-      SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31,
-      SGPR33, SGPR35, SGPR37, SGPR39 ]
-  >>>,
+  // We have no way of referring to the generated register tuples
+  // here, so use a custom function.
+  CCIfInReg<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>,
+  CCIfByVal<CCIfType<[i64], CCCustom<"allocateSGPRTuple">>>,
 
   // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
-  CCIfNotInReg<CCIfType<[f32, i32] , CCAssignToReg<[
+  CCIfNotInReg<CCIfType<[f32, i32, f16] , CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -53,17 +49,7 @@ def CC_SI : CallingConv<[
     VGPR112, VGPR113, VGPR114, VGPR115, VGPR116, VGPR117, VGPR118, VGPR119,
     VGPR120, VGPR121, VGPR122, VGPR123, VGPR124, VGPR125, VGPR126, VGPR127,
     VGPR128, VGPR129, VGPR130, VGPR131, VGPR132, VGPR133, VGPR134, VGPR135
-  ]>>>,
-
-  CCIfByVal<CCIfType<[i64] , CCAssignToRegWithShadow<
-    [ SGPR0, SGPR2, SGPR4, SGPR6, SGPR8, SGPR10, SGPR12, SGPR14,
-      SGPR16, SGPR18, SGPR20, SGPR22, SGPR24, SGPR26, SGPR28, SGPR30,
-      SGPR32, SGPR34, SGPR36, SGPR38 ],
-    [ SGPR1, SGPR3, SGPR5, SGPR7, SGPR9, SGPR11, SGPR13, SGPR15,
-      SGPR17, SGPR19, SGPR21, SGPR23, SGPR25, SGPR27, SGPR29, SGPR31,
-      SGPR33, SGPR35, SGPR37, SGPR39 ]
-  >>>
-
+  ]>>>
 ]>;
 
 def RetCC_SI : CallingConv<[
@@ -76,7 +62,7 @@ def RetCC_SI : CallingConv<[
   ]>>,
 
   // 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
-  CCIfType<[f32] , CCAssignToReg<[
+  CCIfType<[f32, f16] , CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
diff --git a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 96131d461515b14c3aaea42bda8181eff59e1bd3..e19314fe0a6c83f01843f565274f4759021fcc63 100644
--- a/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -14,7 +14,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPU.h"
-#include "AMDGPUIntrinsicInfo.h"
 #include "AMDGPUSubtarget.h"
 #include "AMDGPUTargetMachine.h"
 #include "llvm/ADT/StringRef.h"
@@ -59,8 +58,6 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
   /// binary operation \p V.
   ///
   /// \returns Binary operation \p V.
-  Value *copyFlags(const BinaryOperator &I, Value *V) const;
-
   /// \returns \p T's base element bit width.
   unsigned getBaseElementBitWidth(const Type *T) const;
 
@@ -156,21 +153,6 @@ public:
 
 } // end anonymous namespace
 
-Value *AMDGPUCodeGenPrepare::copyFlags(
-    const BinaryOperator &I, Value *V) const {
-  BinaryOperator *BinOp = dyn_cast<BinaryOperator>(V);
-  if (!BinOp) // Possibly constant expression.
-    return V;
-
-  if (isa<OverflowingBinaryOperator>(BinOp)) {
-    BinOp->setHasNoSignedWrap(I.hasNoSignedWrap());
-    BinOp->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
-  } else if (isa<PossiblyExactOperator>(BinOp))
-    BinOp->setIsExact(I.isExact());
-
-  return V;
-}
-
 unsigned AMDGPUCodeGenPrepare::getBaseElementBitWidth(const Type *T) const {
   assert(needsPromotionToI32(T) && "T does not need promotion to i32");
 
@@ -198,12 +180,48 @@ bool AMDGPUCodeGenPrepare::isSigned(const SelectInst &I) const {
 }
 
 bool AMDGPUCodeGenPrepare::needsPromotionToI32(const Type *T) const {
-  if (T->isIntegerTy() && T->getIntegerBitWidth() > 1 &&
-      T->getIntegerBitWidth() <= 16)
+  const IntegerType *IntTy = dyn_cast<IntegerType>(T);
+  if (IntTy && IntTy->getBitWidth() > 1 && IntTy->getBitWidth() <= 16)
+    return true;
+
+  if (const VectorType *VT = dyn_cast<VectorType>(T)) {
+    // TODO: The set of packed operations is more limited, so may want to
+    // promote some anyway.
+    if (ST->hasVOP3PInsts())
+      return false;
+
+    return needsPromotionToI32(VT->getElementType());
+  }
+
+  return false;
+}
+
+// Return true if the op promoted to i32 should have nsw set.
+static bool promotedOpIsNSW(const Instruction &I) {
+  switch (I.getOpcode()) {
+  case Instruction::Shl:
+  case Instruction::Add:
+  case Instruction::Sub:
     return true;
-  if (!T->isVectorTy())
+  case Instruction::Mul:
+    return I.hasNoUnsignedWrap();
+  default:
     return false;
-  return needsPromotionToI32(cast<VectorType>(T)->getElementType());
+  }
+}
+
+// Return true if the op promoted to i32 should have nuw set.
+static bool promotedOpIsNUW(const Instruction &I) {
+  switch (I.getOpcode()) {
+  case Instruction::Shl:
+  case Instruction::Add:
+  case Instruction::Mul:
+    return true;
+  case Instruction::Sub:
+    return I.hasNoUnsignedWrap();
+  default:
+    return false;
+  }
 }
 
 bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
@@ -230,7 +248,19 @@ bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
     ExtOp0 = Builder.CreateZExt(I.getOperand(0), I32Ty);
     ExtOp1 = Builder.CreateZExt(I.getOperand(1), I32Ty);
   }
-  ExtRes = copyFlags(I, Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1));
+
+  ExtRes = Builder.CreateBinOp(I.getOpcode(), ExtOp0, ExtOp1);
+  if (Instruction *Inst = dyn_cast<Instruction>(ExtRes)) {
+    if (promotedOpIsNSW(cast<Instruction>(I)))
+      Inst->setHasNoSignedWrap();
+
+    if (promotedOpIsNUW(cast<Instruction>(I)))
+      Inst->setHasNoUnsignedWrap();
+
+    if (const auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I))
+      Inst->setIsExact(ExactOp->isExact());
+  }
+
   TruncRes = Builder.CreateTrunc(ExtRes, I.getType());
 
   I.replaceAllUsesWith(TruncRes);
@@ -358,9 +388,7 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
   Builder.setFastMathFlags(FMF);
   Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
 
-  const AMDGPUIntrinsicInfo *II = TM->getIntrinsicInfo();
-  Function *Decl
-    = II->getDeclaration(Mod, AMDGPUIntrinsic::amdgcn_fdiv_fast, {});
+  Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
 
   Value *Num = FDiv.getOperand(0);
   Value *Den = FDiv.getOperand(1);
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
index 805fb7102a353dfaeafdbd6d97fa44cf2fd2b153..e32ca9653b3a1f314d2b0cbba9dcbd98bcd9a638 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.cpp
@@ -12,11 +12,6 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUFrameLowering.h"
-#include "AMDGPURegisterInfo.h"
-#include "AMDGPUSubtarget.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/Support/MathExtras.h"
 
 using namespace llvm;
 AMDGPUFrameLowering::AMDGPUFrameLowering(StackDirection D, unsigned StackAl,
@@ -69,34 +64,3 @@ unsigned AMDGPUFrameLowering::getStackWidth(const MachineFunction &MF) const {
   // T1.W = stack[1].w
   return 1;
 }
-
-/// \returns The number of registers allocated for \p FI.
-int AMDGPUFrameLowering::getFrameIndexReference(const MachineFunction &MF,
-                                                int FI,
-                                                unsigned &FrameReg) const {
-  const MachineFrameInfo &MFI = MF.getFrameInfo();
-  const AMDGPURegisterInfo *RI
-    = MF.getSubtarget<AMDGPUSubtarget>().getRegisterInfo();
-
-  // Fill in FrameReg output argument.
-  FrameReg = RI->getFrameRegister(MF);
-
-  // Start the offset at 2 so we don't overwrite work group information.
-  // XXX: We should only do this when the shader actually uses this
-  // information.
-  unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4);
-  int UpperBound = FI == -1 ? MFI.getNumObjects() : FI;
-
-  for (int i = MFI.getObjectIndexBegin(); i < UpperBound; ++i) {
-    OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(i));
-    OffsetBytes += MFI.getObjectSize(i);
-    // Each register holds 4 bytes, so we must always align the offset to at
-    // least 4 bytes, so that 2 frame objects won't share the same register.
-    OffsetBytes = alignTo(OffsetBytes, 4);
-  }
-
-  if (FI != -1)
-    OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(FI));
-
-  return OffsetBytes / (getStackWidth(MF) * 4);
-}
diff --git a/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 5d51351a00d27c735d67ebc55e34376e3a6bde42..8e187c7e56c197cfbcb94c331d810a8655686705 100644
--- a/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -34,9 +34,6 @@ public:
   /// values to the stack.
   unsigned getStackWidth(const MachineFunction &MF) const;
 
-  int getFrameIndexReference(const MachineFunction &MF, int FI,
-                             unsigned &FrameReg) const override;
-
   bool hasFP(const MachineFunction &MF) const override {
     return false;
   }
diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 774810e2e01a8c72f8784619a701f8ffcc96005f..fc3ea67fad01a2d5edf07a3b3cc54bddc54fde05 100644
--- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -67,10 +67,13 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel {
   // Subtarget - Keep a pointer to the AMDGPU Subtarget around so that we can
   // make the right decision when generating code for different targets.
   const AMDGPUSubtarget *Subtarget;
+  AMDGPUAS AMDGPUASI;
 
 public:
   explicit AMDGPUDAGToDAGISel(TargetMachine &TM, CodeGenOpt::Level OptLevel)
-      : SelectionDAGISel(TM, OptLevel) {}
+      : SelectionDAGISel(TM, OptLevel){
+    AMDGPUASI = AMDGPU::getAMDGPUAS(TM);
+  }
   ~AMDGPUDAGToDAGISel() override = default;
 
   bool runOnMachineFunction(MachineFunction &MF) override;
@@ -80,6 +83,7 @@ public:
 
 private:
   SDValue foldFrameIndex(SDValue N) const;
+  bool isNoNanSrc(SDValue N) const;
   bool isInlineImmediate(const SDNode *N) const;
   bool FoldOperand(SDValue &Src, SDValue &Sel, SDValue &Neg, SDValue &Abs,
                    const R600InstrInfo *TII);
@@ -143,6 +147,8 @@ private:
   bool SelectSMRDBufferImm32(SDValue Addr, SDValue &Offset) const;
   bool SelectSMRDBufferSgpr(SDValue Addr, SDValue &Offset) const;
   bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
+
+  bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3NoMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
   bool SelectVOP3Mods0(SDValue In, SDValue &Src, SDValue &SrcMods,
@@ -156,6 +162,13 @@ private:
                                  SDValue &Clamp,
                                  SDValue &Omod) const;
 
+  bool SelectVOP3OMods(SDValue In, SDValue &Src,
+                       SDValue &Clamp, SDValue &Omod) const;
+
+  bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
+  bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
+                        SDValue &Clamp) const;
+
   void SelectADD_SUB_I64(SDNode *N);
   void SelectUADDO_USUBO(SDNode *N);
   void SelectDIV_SCALE(SDNode *N);
@@ -188,6 +201,17 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
   return SelectionDAGISel::runOnMachineFunction(MF);
 }
 
+bool AMDGPUDAGToDAGISel::isNoNanSrc(SDValue N) const {
+  if (TM.Options.NoNaNsFPMath)
+    return true;
+
+  // TODO: Move into isKnownNeverNaN
+  if (const auto *BO = dyn_cast<BinaryWithFlagsSDNode>(N))
+    return BO->Flags.hasNoNaNs();
+
+  return CurDAG->isKnownNeverNaN(N);
+}
+
 bool AMDGPUDAGToDAGISel::isInlineImmediate(const SDNode *N) const {
   const SIInstrInfo *TII
     = static_cast<const SISubtarget *>(Subtarget)->getInstrInfo();
@@ -251,7 +275,7 @@ const TargetRegisterClass *AMDGPUDAGToDAGISel::getOperandRegClass(SDNode *N,
 
 SDNode *AMDGPUDAGToDAGISel::glueCopyToM0(SDNode *N) const {
   if (Subtarget->getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS ||
-      cast<MemSDNode>(N)->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+      cast<MemSDNode>(N)->getAddressSpace() != AMDGPUASI.LOCAL_ADDRESS)
     return N;
 
   const SITargetLowering& Lowering =
@@ -291,6 +315,20 @@ static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
   llvm_unreachable("invalid vector size");
 }
 
+static bool getConstantValue(SDValue N, uint32_t &Out) {
+  if (const ConstantSDNode *C = dyn_cast<ConstantSDNode>(N)) {
+    Out = C->getAPIntValue().getZExtValue();
+    return true;
+  }
+
+  if (const ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(N)) {
+    Out = C->getValueAPF().bitcastToAPInt().getZExtValue();
+    return true;
+  }
+
+  return false;
+}
+
 void AMDGPUDAGToDAGISel::Select(SDNode *N) {
   unsigned int Opc = N->getOpcode();
   if (N->isMachineOpcode()) {
@@ -342,7 +380,24 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
     EVT VT = N->getValueType(0);
     unsigned NumVectorElts = VT.getVectorNumElements();
     EVT EltVT = VT.getVectorElementType();
+
+    if (VT == MVT::v2i16 || VT == MVT::v2f16) {
+      if (Opc == ISD::BUILD_VECTOR) {
+        uint32_t LHSVal, RHSVal;
+        if (getConstantValue(N->getOperand(0), LHSVal) &&
+            getConstantValue(N->getOperand(1), RHSVal)) {
+          uint32_t K = LHSVal | (RHSVal << 16);
+          CurDAG->SelectNodeTo(N, AMDGPU::S_MOV_B32, VT,
+                               CurDAG->getTargetConstant(K, SDLoc(N), MVT::i32));
+          return;
+        }
+      }
+
+      break;
+    }
+
     assert(EltVT.bitsEq(MVT::i32));
+
     if (Subtarget->getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
       RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
     } else {
@@ -537,9 +592,9 @@ bool AMDGPUDAGToDAGISel::isConstantLoad(const MemSDNode *N, int CbId) const {
   if (!N->readMem())
     return false;
   if (CbId == -1)
-    return N->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+    return N->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS;
 
-  return N->getAddressSpace() == AMDGPUAS::CONSTANT_BUFFER_0 + CbId;
+  return N->getAddressSpace() == AMDGPUASI.CONSTANT_BUFFER_0 + CbId;
 }
 
 bool AMDGPUDAGToDAGISel::isUniformBr(const SDNode *N) const {
@@ -1487,7 +1542,7 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
 void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
   MemSDNode *Mem = cast<MemSDNode>(N);
   unsigned AS = Mem->getAddressSpace();
-  if (AS == AMDGPUAS::FLAT_ADDRESS) {
+  if (AS == AMDGPUASI.FLAT_ADDRESS) {
     SelectCode(N);
     return;
   }
@@ -1551,7 +1606,6 @@ void AMDGPUDAGToDAGISel::SelectATOMIC_CMP_SWAP(SDNode *N) {
 bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
                                         SDValue &SrcMods) const {
   unsigned Mods = 0;
-
   Src = In;
 
   if (Src.getOpcode() == ISD::FNEG) {
@@ -1565,10 +1619,15 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods(SDValue In, SDValue &Src,
   }
 
   SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
-
   return true;
 }
 
+bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
+                                             SDValue &SrcMods) const {
+  SelectVOP3Mods(In, Src, SrcMods);
+  return isNoNanSrc(Src);
+}
+
 bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src,
                                          SDValue &SrcMods) const {
   bool Res = SelectVOP3Mods(In, Src, SrcMods);
@@ -1613,6 +1672,50 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods0Clamp0OMod(SDValue In, SDValue &Src,
   return SelectVOP3Mods(In, Src, SrcMods);
 }
 
+bool AMDGPUDAGToDAGISel::SelectVOP3OMods(SDValue In, SDValue &Src,
+                                         SDValue &Clamp, SDValue &Omod) const {
+  Src = In;
+
+  SDLoc DL(In);
+  // FIXME: Handle Clamp and Omod
+  Clamp = CurDAG->getTargetConstant(0, DL, MVT::i32);
+  Omod = CurDAG->getTargetConstant(0, DL, MVT::i32);
+
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
+                                         SDValue &SrcMods) const {
+  unsigned Mods = 0;
+  Src = In;
+
+  // FIXME: Look for fneg on separate components
+  if (Src.getOpcode() == ISD::FNEG) {
+    Mods |= (SISrcMods::NEG | SISrcMods::NEG_HI);
+    Src = Src.getOperand(0);
+  }
+
+  // Packed instructions do not have abs modifiers.
+
+  // FIXME: Handle abs/neg of individual components.
+  // FIXME: Handle swizzling with op_sel
+  Mods |= SISrcMods::OP_SEL_1;
+
+  SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32);
+  return true;
+}
+
+bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src,
+                                          SDValue &SrcMods,
+                                          SDValue &Clamp) const {
+  SDLoc SL(In);
+
+  // FIXME: Handle clamp and op_sel
+  Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);
+
+  return SelectVOP3PMods(In, Src, SrcMods);
+}
+
 void AMDGPUDAGToDAGISel::PostprocessISelDAG() {
   const AMDGPUTargetLowering& Lowering =
     *static_cast<const AMDGPUTargetLowering*>(getTargetLowering());
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index e7f15bcb0818b05741ac5e5803cd86b920610fe6..c0f336e082bd2b71e554ad95ef58f419daace437 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -44,6 +44,37 @@ static bool allocateKernArg(unsigned ValNo, MVT ValVT, MVT LocVT,
   return true;
 }
 
+static bool allocateCCRegs(unsigned ValNo, MVT ValVT, MVT LocVT,
+                           CCValAssign::LocInfo LocInfo,
+                           ISD::ArgFlagsTy ArgFlags, CCState &State,
+                           const TargetRegisterClass *RC,
+                           unsigned NumRegs) {
+  ArrayRef<MCPhysReg> RegList = makeArrayRef(RC->begin(), NumRegs);
+  unsigned RegResult = State.AllocateReg(RegList);
+  if (RegResult == AMDGPU::NoRegister)
+    return false;
+
+  State.addLoc(CCValAssign::getReg(ValNo, ValVT, RegResult, LocVT, LocInfo));
+  return true;
+}
+
+static bool allocateSGPRTuple(unsigned ValNo, MVT ValVT, MVT LocVT,
+                              CCValAssign::LocInfo LocInfo,
+                              ISD::ArgFlagsTy ArgFlags, CCState &State) {
+  switch (LocVT.SimpleTy) {
+  case MVT::i64:
+  case MVT::f64:
+  case MVT::v2i32:
+  case MVT::v2f32: {
+    // Up to SGPR0-SGPR39
+    return allocateCCRegs(ValNo, ValVT, LocVT, LocInfo, ArgFlags, State,
+                          &AMDGPU::SGPR_64RegClass, 20);
+  }
+  default:
+    return false;
+  }
+}
+
 #include "AMDGPUGenCallingConv.inc"
 
 // Find a larger type to do a load / store of a vector with.
@@ -59,6 +90,7 @@ EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
 AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
                                            const AMDGPUSubtarget &STI)
     : TargetLowering(TM), Subtarget(&STI) {
+  AMDGPUASI = AMDGPU::getAMDGPUAS(TM);
   // Lower floating point store/load to integer store/load to reduce the number
   // of patterns in tablegen.
   setOperationAction(ISD::LOAD, MVT::f32, Promote);
@@ -212,10 +244,6 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   // This is totally unsupported, just custom lower to produce an error.
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
 
-  // We need to custom lower some of the intrinsics
-  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
-  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
-
   // Library functions.  These default to Expand, but we have instructions
   // for them.
   setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
@@ -271,6 +299,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
 
   setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
   setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
+  setOperationAction(ISD::FP_TO_FP16, MVT::f32, Custom);
 
   const MVT ScalarIntVTs[] = { MVT::i32, MVT::i64 };
   for (MVT VT : ScalarIntVTs) {
@@ -461,10 +490,11 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   // N > 4 stores on the same chain.
   GatherAllAliasesMaxDepth = 16;
 
-  // FIXME: Need to really handle these.
-  MaxStoresPerMemcpy  = 4096;
-  MaxStoresPerMemmove = 4096;
-  MaxStoresPerMemset  = 4096;
+  // memcpy/memmove/memset are expanded in the IR, so we shouldn't need to worry
+  // about these during lowering.
+  MaxStoresPerMemcpy  = 0xffffffff;
+  MaxStoresPerMemmove = 0xffffffff;
+  MaxStoresPerMemset  = 0xffffffff;
 
   setTargetDAGCombine(ISD::BITCAST);
   setTargetDAGCombine(ISD::SHL);
@@ -479,12 +509,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::FADD);
   setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::FNEG);
+  setTargetDAGCombine(ISD::FABS);
 }
 
 //===----------------------------------------------------------------------===//
 // Target Information
 //===----------------------------------------------------------------------===//
 
+LLVM_READNONE
 static bool fnegFoldsIntoOp(unsigned Opc) {
   switch (Opc) {
   case ISD::FADD:
@@ -492,6 +524,8 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
   case ISD::FMUL:
   case ISD::FMA:
   case ISD::FMAD:
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
   case ISD::FSIN:
   case ISD::FTRUNC:
   case ISD::FRINT:
@@ -500,12 +534,67 @@ static bool fnegFoldsIntoOp(unsigned Opc) {
   case AMDGPUISD::RCP_LEGACY:
   case AMDGPUISD::SIN_HW:
   case AMDGPUISD::FMUL_LEGACY:
+  case AMDGPUISD::FMIN_LEGACY:
+  case AMDGPUISD::FMAX_LEGACY:
     return true;
   default:
     return false;
   }
 }
 
+/// \returns true if the operation will definitely need to use a 64-bit
+/// encoding, and thus will use a VOP3 encoding regardless of the source
+/// modifiers.
+LLVM_READONLY
+static bool opMustUseVOP3Encoding(const SDNode *N, MVT VT) {
+  return N->getNumOperands() > 2 || VT == MVT::f64;
+}
+
+// Returns false for nodes known not to accept source modifiers. Most FP
+// instructions do accept them, so any other node is assumed to as well.
+LLVM_READONLY
+static bool hasSourceMods(const SDNode *N) {
+  if (isa<MemSDNode>(N))
+    return false; // No memory node takes source modifiers.
+
+  switch (N->getOpcode()) {
+  case ISD::CopyToReg:
+  case ISD::SELECT:
+  case ISD::FDIV:
+  case ISD::FREM:
+  case ISD::INLINEASM:
+  case AMDGPUISD::INTERP_P1:
+  case AMDGPUISD::INTERP_P2:
+  case AMDGPUISD::DIV_SCALE:
+    return false;
+  default: // Optimistic; this list could be refined slightly.
+    return true;
+  }
+}
+
+static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 4) {
+  // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, so a
+  // source modifier is truly free for them. For the remaining users, a source
+  // modifier may force the larger VOP3 encoding and thus increase code size.
+  // Give up once more than CostThreshold such users are seen, so code size
+  // only grows when we expect to save on the instruction count.
+  unsigned NumMayIncreaseSize = 0;
+  MVT VT = N->getValueType(0).getScalarType().getSimpleVT();
+
+  // XXX - Should this limit number of uses to check?
+  for (const SDNode *U : N->uses()) {
+    if (!hasSourceMods(U))
+      return false;
+
+    if (!opMustUseVOP3Encoding(U, VT)) {
+      if (++NumMayIncreaseSize > CostThreshold)
+        return false;
+    }
+  }
+
+  return true;
+}
+
 MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
   return MVT::i32;
 }
@@ -584,12 +673,17 @@ bool AMDGPUTargetLowering::isCheapToSpeculateCtlz() const {
 
 bool AMDGPUTargetLowering::isFAbsFree(EVT VT) const {
   assert(VT.isFloatingPoint());
-  return VT == MVT::f32 || VT == MVT::f64 || (Subtarget->has16BitInsts() &&
-                                              VT == MVT::f16);
+
+  // Packed operations do not have a fabs modifier.
+  return VT == MVT::f32 || VT == MVT::f64 ||
+         (Subtarget->has16BitInsts() && VT == MVT::f16);
 }
 
 bool AMDGPUTargetLowering::isFNegFree(EVT VT) const {
-  return isFAbsFree(VT);
+  assert(VT.isFloatingPoint());
+  return VT == MVT::f32 || VT == MVT::f64 ||
+         (Subtarget->has16BitInsts() && VT == MVT::f16) ||
+         (Subtarget->hasVOP3PInsts() && VT == MVT::v2f16);
 }
 
 bool AMDGPUTargetLowering:: storeOfVectorConstantIsCheap(EVT MemVT,
@@ -773,11 +867,6 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
   }
 }
 
-void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
-                              const SmallVectorImpl<ISD::InputArg> &Ins) const {
-  State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
-}
-
 void AMDGPUTargetLowering::AnalyzeReturn(CCState &State,
                            const SmallVectorImpl<ISD::OutputArg> &Outs) const {
 
@@ -797,6 +886,24 @@ AMDGPUTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
 // Target specific lowering
 //===---------------------------------------------------------------------===//
 
+/// Selects the CCAssignFn for calling convention \p CC; \p IsVarArg is unused.
+CCAssignFn *AMDGPUTargetLowering::CCAssignFnForCall(CallingConv::ID CC,
+                                                    bool IsVarArg) {
+  switch (CC) {
+  case CallingConv::C:
+  case CallingConv::AMDGPU_KERNEL:
+  case CallingConv::SPIR_KERNEL:
+    return CC_AMDGPU_Kernel;
+  case CallingConv::AMDGPU_VS:
+  case CallingConv::AMDGPU_GS:
+  case CallingConv::AMDGPU_PS:
+  case CallingConv::AMDGPU_CS:
+    return CC_AMDGPU;
+  default: // No assignment function exists for any other convention.
+    report_fatal_error("Unsupported calling convention.");
+  }
+}
+
 SDValue AMDGPUTargetLowering::LowerCall(CallLoweringInfo &CLI,
                                         SmallVectorImpl<SDValue> &InVals) const {
   SDValue Callee = CLI.Callee;
@@ -845,7 +952,6 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
   case ISD::SIGN_EXTEND_INREG: return LowerSIGN_EXTEND_INREG(Op, DAG);
   case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR: return LowerEXTRACT_SUBVECTOR(Op, DAG);
-  case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG);
   case ISD::UDIVREM: return LowerUDIVREM(Op, DAG);
   case ISD::SDIVREM: return LowerSDIVREM(Op, DAG);
   case ISD::FREM: return LowerFREM(Op, DAG);
@@ -901,19 +1007,16 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
   GlobalAddressSDNode *G = cast<GlobalAddressSDNode>(Op);
   const GlobalValue *GV = G->getGlobal();
 
-  switch (G->getAddressSpace()) {
-  case AMDGPUAS::LOCAL_ADDRESS: {
+  if  (G->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS) {
     // XXX: What does the value of G->getOffset() mean?
     assert(G->getOffset() == 0 &&
          "Do not know what to do with an non-zero offset");
 
     // TODO: We could emit code to handle the initialization somewhere.
-    if (hasDefinedInitializer(GV))
-      break;
-
-    unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
-    return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
-  }
+    if (!hasDefinedInitializer(GV)) {
+      unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
+      return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
+    }
   }
 
   const Function &Fn = *DAG.getMachineFunction().getFunction();
@@ -945,41 +1048,12 @@ SDValue AMDGPUTargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
   return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Args);
 }
 
-SDValue AMDGPUTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
-    SelectionDAG &DAG) const {
-  unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
-  SDLoc DL(Op);
-  EVT VT = Op.getValueType();
-
-  switch (IntrinsicID) {
-    default: return Op;
-    case AMDGPUIntrinsic::AMDGPU_clamp: // Legacy name.
-      return DAG.getNode(AMDGPUISD::CLAMP, DL, VT,
-                         Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
-
-    case AMDGPUIntrinsic::AMDGPU_bfe_i32:
-      return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
-                         Op.getOperand(1),
-                         Op.getOperand(2),
-                         Op.getOperand(3));
-
-    case AMDGPUIntrinsic::AMDGPU_bfe_u32:
-      return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
-                         Op.getOperand(1),
-                         Op.getOperand(2),
-                         Op.getOperand(3));
-  }
-}
-
 /// \brief Generate Min/Max node
-SDValue AMDGPUTargetLowering::CombineFMinMaxLegacy(const SDLoc &DL, EVT VT,
+SDValue AMDGPUTargetLowering::combineFMinMaxLegacy(const SDLoc &DL, EVT VT,
                                                    SDValue LHS, SDValue RHS,
                                                    SDValue True, SDValue False,
                                                    SDValue CC,
                                                    DAGCombinerInfo &DCI) const {
-  if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
-    return SDValue();
-
   if (!(LHS == True && RHS == False) && !(LHS == False && RHS == True))
     return SDValue();
 
@@ -1237,7 +1311,10 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
   SDValue fqneg = DAG.getNode(ISD::FNEG, DL, FltVT, fq);
 
   // float fr = mad(fqneg, fb, fa);
-  SDValue fr = DAG.getNode(ISD::FMAD, DL, FltVT, fqneg, fb, fa);
+  unsigned OpCode = Subtarget->hasFP32Denormals() ?
+                    (unsigned)AMDGPUISD::FMAD_FTZ :
+                    (unsigned)ISD::FMAD;
+  SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
 
   // int iq = (int)fq;
   SDValue iq = DAG.getNode(ToInt, DL, IntVT, fq);
@@ -1671,32 +1748,37 @@ SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) con
 }
 
 // XXX - May require not supporting f32 denormals?
-SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const {
+
+// Don't handle v2f16. The extra instructions to scalarize and repack around the
+// compare and vselect end up producing worse code than scalarizing the whole
+// operation.
+SDValue AMDGPUTargetLowering::LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const {
   SDLoc SL(Op);
   SDValue X = Op.getOperand(0);
+  EVT VT = Op.getValueType();
 
-  SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X);
+  SDValue T = DAG.getNode(ISD::FTRUNC, SL, VT, X);
 
   // TODO: Should this propagate fast-math-flags?
 
-  SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T);
+  SDValue Diff = DAG.getNode(ISD::FSUB, SL, VT, X, T);
 
-  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff);
+  SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, VT, Diff);
 
-  const SDValue Zero = DAG.getConstantFP(0.0, SL, MVT::f32);
-  const SDValue One = DAG.getConstantFP(1.0, SL, MVT::f32);
-  const SDValue Half = DAG.getConstantFP(0.5, SL, MVT::f32);
+  const SDValue Zero = DAG.getConstantFP(0.0, SL, VT);
+  const SDValue One = DAG.getConstantFP(1.0, SL, VT);
+  const SDValue Half = DAG.getConstantFP(0.5, SL, VT);
 
-  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X);
+  SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, VT, One, X);
 
   EVT SetCCVT =
-      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::f32);
+      getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
 
   SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE);
 
-  SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero);
+  SDValue Sel = DAG.getNode(ISD::SELECT, SL, VT, Cmp, SignOne, Zero);
 
-  return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel);
+  return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
 }
 
 SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
@@ -1759,8 +1841,8 @@ SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const
 SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
   EVT VT = Op.getValueType();
 
-  if (VT == MVT::f32)
-    return LowerFROUND32(Op, DAG);
+  if (VT == MVT::f32 || VT == MVT::f16)
+    return LowerFROUND32_16(Op, DAG);
 
   if (VT == MVT::f64)
     return LowerFROUND64(Op, DAG);
@@ -2039,15 +2121,19 @@ SDValue AMDGPUTargetLowering::LowerFP64_TO_INT(SDValue Op, SelectionDAG &DAG,
 }
 
 SDValue AMDGPUTargetLowering::LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) const {
+  SDLoc DL(Op);
+  SDValue N0 = Op.getOperand(0);
+
+  // Convert to target node to get known bits
+  if (N0.getValueType() == MVT::f32)
+    return DAG.getNode(AMDGPUISD::FP_TO_FP16, DL, Op.getValueType(), N0);
 
   if (getTargetMachine().Options.UnsafeFPMath) {
     // There is a generic expand for FP_TO_FP16 with unsafe fast math.
     return SDValue();
   }
 
-  SDLoc DL(Op);
-  SDValue N0 = Op.getOperand(0);
-  assert (N0.getSimpleValueType() == MVT::f64);
+  assert(N0.getSimpleValueType() == MVT::f64);
 
   // f64 -> f16 conversion using round-to-nearest-even rounding mode.
   const unsigned ExpMask = 0x7ff;
@@ -2388,6 +2474,28 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
                       SN->getBasePtr(), SN->getMemOperand());
 }
 
+SDValue AMDGPUTargetLowering::performClampCombine(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  ConstantFPSDNode *CSrc = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+  if (!CSrc)
+    return SDValue(); // Only a constant FP operand is folded here.
+
+  const APFloat &F = CSrc->getValueAPF();
+  APFloat Zero = APFloat::getZero(F.getSemantics());
+  APFloat::cmpResult Cmp0 = F.compare(Zero);
+  if (Cmp0 == APFloat::cmpLessThan || // Below 0.0, or NaN under DX10 clamp.
+      (Cmp0 == APFloat::cmpUnordered && Subtarget->enableDX10Clamp())) {
+    return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
+  }
+
+  APFloat One(F.getSemantics(), "1.0");
+  APFloat::cmpResult Cmp1 = F.compare(One);
+  if (Cmp1 == APFloat::cmpGreaterThan) // Above 1.0 folds to 1.0.
+    return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
+
+  return SDValue(CSrc, 0); // Otherwise the constant itself is the result.
+}
+
 /// Split the 64-bit value \p LHS into two 32-bit components, and perform the
 /// binary operation \p Opc to it with the corresponding constant operands.
 SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
@@ -2830,20 +2938,41 @@ SDValue AMDGPUTargetLowering::performSelectCombine(SDNode *N,
       SDValue NewCond = DAG.getSetCC(SL, Cond.getValueType(), LHS, RHS, NewCC);
       return DAG.getNode(ISD::SELECT, SL, VT, NewCond, False, True);
     }
-  }
 
-  if (VT == MVT::f32 && Cond.hasOneUse()) {
-    SDValue MinMax
-      = CombineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
-    // Revisit this node so we can catch min3/max3/med3 patterns.
-    //DCI.AddToWorklist(MinMax.getNode());
-    return MinMax;
+    if (VT == MVT::f32 && Subtarget->hasFminFmaxLegacy()) {
+      SDValue MinMax
+        = combineFMinMaxLegacy(SDLoc(N), VT, LHS, RHS, True, False, CC, DCI);
+      // Revisit this node so we can catch min3/max3/med3 patterns.
+      //DCI.AddToWorklist(MinMax.getNode());
+      return MinMax;
+    }
   }
 
   // There's no reason to not do this if the condition has other uses.
   return performCtlzCombine(SDLoc(N), Cond, True, False, DCI);
 }
 
+static bool isConstantFPZero(SDValue N) {
+  if (const ConstantFPSDNode *C = isConstOrConstSplatFP(N))
+    return C->isZero() && !C->isNegative(); // Matches +0.0 but not -0.0.
+  return false;
+}
+
+static unsigned inverseMinMax(unsigned Opc) { // Maps min to max and back.
+  switch (Opc) {
+  case ISD::FMAXNUM:
+    return ISD::FMINNUM;
+  case ISD::FMINNUM:
+    return ISD::FMAXNUM;
+  case AMDGPUISD::FMAX_LEGACY:
+    return AMDGPUISD::FMIN_LEGACY;
+  case AMDGPUISD::FMIN_LEGACY:
+    return  AMDGPUISD::FMAX_LEGACY;
+  default: // Callers must pass one of the four opcodes above.
+    llvm_unreachable("invalid min/max opcode");
+  }
+}
+
 SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
                                                  DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -2856,10 +2985,16 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
   // the other uses cannot, give up. This both prevents unprofitable
   // transformations and infinite loops: we won't repeatedly try to fold around
   // a negate that has no 'good' form.
-  //
-  // TODO: Check users can fold
-  if (fnegFoldsIntoOp(Opc) && !N0.hasOneUse())
-    return SDValue();
+  if (N0.hasOneUse()) {
+    // This may be able to fold into the source, but at a code size cost. Don't
+    // fold if the fold into the user is free.
+    if (allUsesHaveSourceMods(N, 0))
+      return SDValue();
+  } else {
+    if (fnegFoldsIntoOp(Opc) &&
+        (allUsesHaveSourceMods(N) || !allUsesHaveSourceMods(N0.getNode())))
+      return SDValue();
+  }
 
   SDLoc SL(N);
   switch (Opc) {
@@ -2932,6 +3067,33 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
       DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
     return Res;
   }
+  case ISD::FMAXNUM:
+  case ISD::FMINNUM:
+  case AMDGPUISD::FMAX_LEGACY:
+  case AMDGPUISD::FMIN_LEGACY: {
+    // fneg (fmaxnum x, y) -> fminnum (fneg x), (fneg y)
+    // fneg (fminnum x, y) -> fmaxnum (fneg x), (fneg y)
+    // fneg (fmax_legacy x, y) -> fmin_legacy (fneg x), (fneg y)
+    // fneg (fmin_legacy x, y) -> fmax_legacy (fneg x), (fneg y)
+
+    SDValue LHS = N0.getOperand(0);
+    SDValue RHS = N0.getOperand(1);
+
+    // 0 doesn't have a negated inline immediate.
+    // TODO: Shouldn't fold 1/2pi either, and should be generalized to other
+    // operations.
+    if (isConstantFPZero(RHS))
+      return SDValue();
+
+    SDValue NegLHS = DAG.getNode(ISD::FNEG, SL, VT, LHS);
+    SDValue NegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+    unsigned Opposite = inverseMinMax(Opc);
+
+    SDValue Res = DAG.getNode(Opposite, SL, VT, NegLHS, NegRHS, N0->getFlags());
+    if (!N0.hasOneUse())
+      DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res));
+    return Res;
+  }
   case ISD::FP_EXTEND:
   case ISD::FTRUNC:
   case ISD::FRINT:
@@ -2971,6 +3133,45 @@ SDValue AMDGPUTargetLowering::performFNegCombine(SDNode *N,
     SDValue Neg = DAG.getNode(ISD::FNEG, SL, CvtSrc.getValueType(), CvtSrc);
     return DAG.getNode(ISD::FP_ROUND, SL, VT, Neg, N0.getOperand(1));
   }
+  case ISD::FP16_TO_FP: {
+    // v_cvt_f32_f16 supports source modifiers on pre-VI targets without legal
+    // f16, but legalization of f16 fneg ends up pulling it out of the source.
+    // Put the fneg back as a legal source operation that can be matched later.
+    SDLoc SL(N);
+
+    SDValue Src = N0.getOperand(0);
+    EVT SrcVT = Src.getValueType();
+
+    // fneg (fp16_to_fp x) -> fp16_to_fp (xor x, 0x8000)
+    SDValue IntFNeg = DAG.getNode(ISD::XOR, SL, SrcVT, Src,
+                                  DAG.getConstant(0x8000, SL, SrcVT));
+    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFNeg);
+  }
+  default:
+    return SDValue();
+  }
+}
+
+SDValue AMDGPUTargetLowering::performFAbsCombine(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  SelectionDAG &DAG = DCI.DAG;
+  SDValue N0 = N->getOperand(0);
+
+  if (!N0.hasOneUse())
+    return SDValue(); // Leave it alone if the source has other users.
+
+  switch (N0.getOpcode()) {
+  case ISD::FP16_TO_FP: {
+    assert(!Subtarget->has16BitInsts() && "should only see if f16 is illegal");
+    SDLoc SL(N);
+    SDValue Src = N0.getOperand(0);
+    EVT SrcVT = Src.getValueType();
+
+    // fabs (fp16_to_fp x) -> fp16_to_fp (and x, 0x7fff)
+    SDValue IntFAbs = DAG.getNode(ISD::AND, SL, SrcVT, Src,
+                                  DAG.getConstant(0x7fff, SL, SrcVT));
+    return DAG.getNode(ISD::FP16_TO_FP, SL, N->getValueType(0), IntFAbs);
+  }
   default:
     return SDValue();
   }
 }
     return performSelectCombine(N, DCI);
   case ISD::FNEG:
     return performFNegCombine(N, DCI);
+  case ISD::FABS:
+    return performFAbsCombine(N, DCI);
   case AMDGPUISD::BFE_I32:
   case AMDGPUISD::BFE_U32: {
     assert(!N->getValueType(0).isVector() &&
@@ -3171,6 +3374,18 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
     return performLoadCombine(N, DCI);
   case ISD::STORE:
     return performStoreCombine(N, DCI);
+  case AMDGPUISD::CLAMP:
+    return performClampCombine(N, DCI);
+  case AMDGPUISD::RCP: {
+    if (const auto *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0))) {
+      // XXX - Should this flush denormals?
+      const APFloat &Val = CFP->getValueAPF();
+      APFloat One(Val.getSemantics(), "1.0");
+      return DAG.getConstantFP(One / Val, SDLoc(N), N->getValueType(0));
+    }
+
+    break;
+  }
   }
   return SDValue();
 }
@@ -3213,13 +3428,17 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   switch ((AMDGPUISD::NodeType)Opcode) {
   case AMDGPUISD::FIRST_NUMBER: break;
   // AMDIL DAG nodes
-  NODE_NAME_CASE(CALL);
   NODE_NAME_CASE(UMUL);
   NODE_NAME_CASE(BRANCH_COND);
 
   // AMDGPU DAG nodes
+  NODE_NAME_CASE(IF)
+  NODE_NAME_CASE(ELSE)
+  NODE_NAME_CASE(LOOP)
+  NODE_NAME_CASE(CALL)
+  NODE_NAME_CASE(RET_FLAG)
+  NODE_NAME_CASE(RETURN_TO_EPILOG)
   NODE_NAME_CASE(ENDPGM)
-  NODE_NAME_CASE(RETURN)
   NODE_NAME_CASE(DWORDADDR)
   NODE_NAME_CASE(FRACT)
   NODE_NAME_CASE(SETCC)
@@ -3244,6 +3463,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(DIV_SCALE)
   NODE_NAME_CASE(DIV_FMAS)
   NODE_NAME_CASE(DIV_FIXUP)
+  NODE_NAME_CASE(FMAD_FTZ)
   NODE_NAME_CASE(TRIG_PREOP)
   NODE_NAME_CASE(RCP)
   NODE_NAME_CASE(RSQ)
@@ -3277,7 +3497,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(CONST_ADDRESS)
   NODE_NAME_CASE(REGISTER_LOAD)
   NODE_NAME_CASE(REGISTER_STORE)
-  NODE_NAME_CASE(LOAD_INPUT)
   NODE_NAME_CASE(SAMPLE)
   NODE_NAME_CASE(SAMPLEB)
   NODE_NAME_CASE(SAMPLED)
@@ -3286,6 +3505,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(CVT_F32_UBYTE1)
   NODE_NAME_CASE(CVT_F32_UBYTE2)
   NODE_NAME_CASE(CVT_F32_UBYTE3)
+  NODE_NAME_CASE(CVT_PKRTZ_F16_F32)
+  NODE_NAME_CASE(FP_TO_FP16)
+  NODE_NAME_CASE(FP16_ZEXT)
   NODE_NAME_CASE(BUILD_VERTICAL_VECTOR)
   NODE_NAME_CASE(CONST_DATA_PTR)
   NODE_NAME_CASE(PC_ADD_REL_OFFSET)
@@ -3350,13 +3572,11 @@ SDValue AMDGPUTargetLowering::getRecipEstimate(SDValue Operand,
 }
 
 void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
-  const SDValue Op,
-  APInt &KnownZero,
-  APInt &KnownOne,
-  const SelectionDAG &DAG,
-  unsigned Depth) const {
+    const SDValue Op, APInt &KnownZero, APInt &KnownOne,
+    const APInt &DemandedElts, const SelectionDAG &DAG, unsigned Depth) const {
 
-  KnownZero = KnownOne = APInt(KnownOne.getBitWidth(), 0); // Don't know anything.
+  unsigned BitWidth = KnownZero.getBitWidth();
+  KnownZero = KnownOne = APInt(BitWidth, 0); // Don't know anything.
 
   APInt KnownZero2;
   APInt KnownOne2;
@@ -3377,21 +3597,27 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
     if (!CWidth)
       return;
 
-    unsigned BitWidth = 32;
     uint32_t Width = CWidth->getZExtValue() & 0x1f;
 
     if (Opc == AMDGPUISD::BFE_U32)
-      KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - Width);
+      KnownZero = APInt::getHighBitsSet(32, 32 - Width);
 
     break;
   }
+  case AMDGPUISD::FP_TO_FP16:
+  case AMDGPUISD::FP16_ZEXT: {
+    unsigned BitWidth = KnownZero.getBitWidth();
+
+    // High bits are zero.
+    KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - 16);
+    break;
+  }
   }
 }
 
 unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
-  SDValue Op,
-  const SelectionDAG &DAG,
-  unsigned Depth) const {
+    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+    unsigned Depth) const {
   switch (Op.getOpcode()) {
   case AMDGPUISD::BFE_I32: {
     ConstantSDNode *Width = dyn_cast<ConstantSDNode>(Op.getOperand(2));
@@ -3415,7 +3641,9 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
   case AMDGPUISD::CARRY:
   case AMDGPUISD::BORROW:
     return 31;
-
+  case AMDGPUISD::FP_TO_FP16:
+  case AMDGPUISD::FP16_ZEXT:
+    return 16;
   default:
     return 1;
   }
diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h
index f323c786e1da06dd19a0a21a256c0c4a186116f7..d6aa0ba92bf7816e1bfd6f8c502b089d5e557fe0 100644
--- a/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -16,6 +16,8 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUISELLOWERING_H
 
+#include "AMDGPU.h"
+#include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/Target/TargetLowering.h"
 
 namespace llvm {
@@ -34,10 +36,10 @@ private:
 
 protected:
   const AMDGPUSubtarget *Subtarget;
+  AMDGPUAS AMDGPUASI;
 
   SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
   /// \brief Split a vector store into multiple scalar stores.
   /// \returns The resulting chain.
 
@@ -47,7 +49,7 @@ protected:
   SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
 
-  SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerFROUND32_16(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
@@ -70,6 +72,7 @@ protected:
   bool shouldCombineMemoryType(EVT VT) const;
   SDValue performLoadCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performStoreCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performClampCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   SDValue splitBinaryBitConstantOpImpl(DAGCombinerInfo &DCI, const SDLoc &SL,
                                        unsigned Opc, SDValue LHS,
@@ -85,6 +88,7 @@ protected:
                              SDValue RHS, DAGCombinerInfo &DCI) const;
   SDValue performSelectCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFNegCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performFAbsCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   static EVT getEquivalentMemType(LLVMContext &Context, EVT VT);
 
@@ -111,8 +115,6 @@ protected:
                                     SmallVectorImpl<SDValue> &Results) const;
   void analyzeFormalArgumentsCompute(CCState &State,
                               const SmallVectorImpl<ISD::InputArg> &Ins) const;
-  void AnalyzeFormalArguments(CCState &State,
-                              const SmallVectorImpl<ISD::InputArg> &Ins) const;
   void AnalyzeReturn(CCState &State,
                      const SmallVectorImpl<ISD::OutputArg> &Outs) const;
 
@@ -158,6 +160,7 @@ public:
   bool isCheapToSpeculateCttz() const override;
   bool isCheapToSpeculateCtlz() const override;
 
+  static CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool IsVarArg);
   SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                       const SmallVectorImpl<ISD::OutputArg> &Outs,
                       const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
@@ -174,7 +177,7 @@ public:
                           SmallVectorImpl<SDValue> &Results,
                           SelectionDAG &DAG) const override;
 
-  SDValue CombineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS,
+  SDValue combineFMinMaxLegacy(const SDLoc &DL, EVT VT, SDValue LHS,
                                SDValue RHS, SDValue True, SDValue False,
                                SDValue CC, DAGCombinerInfo &DCI) const;
 
@@ -198,10 +201,12 @@ public:
   void computeKnownBitsForTargetNode(const SDValue Op,
                                      APInt &KnownZero,
                                      APInt &KnownOne,
+                                     const APInt &DemandedElts,
                                      const SelectionDAG &DAG,
                                      unsigned Depth = 0) const override;
 
-  unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const SelectionDAG &DAG,
+  unsigned ComputeNumSignBitsForTargetNode(SDValue Op, const APInt &DemandedElts,
+                                           const SelectionDAG &DAG,
                                            unsigned Depth = 0) const override;
 
   /// \brief Helper function that adds Reg to the LiveIn list of the DAG's
@@ -222,6 +227,10 @@ public:
   /// type of implicit parameter.
   uint32_t getImplicitParameterOffset(const AMDGPUMachineFunction *MFI,
                                       const ImplicitParameter Param) const;
+
+  AMDGPUAS getAMDGPUAS() const {
+    return AMDGPUASI;
+  }
 };
 
 namespace AMDGPUISD {
@@ -229,15 +238,34 @@ namespace AMDGPUISD {
 enum NodeType : unsigned {
   // AMDIL ISD Opcodes
   FIRST_NUMBER = ISD::BUILTIN_OP_END,
-  CALL,        // Function call based on a single integer
   UMUL,        // 32bit unsigned multiplication
   BRANCH_COND,
   // End AMDIL ISD Opcodes
+
+  // Function call.
+  CALL,
+
+  // Masked control flow nodes.
+  IF,
+  ELSE,
+  LOOP,
+
+  // A uniform kernel return that terminates the wavefront.
   ENDPGM,
-  RETURN,
+
+  // Return to a shader part's epilog code.
+  RETURN_TO_EPILOG,
+
+  // Return with values from a non-entry function.
+  RET_FLAG,
+
   DWORDADDR,
   FRACT,
+
+  /// CLAMP value between 0.0 and 1.0. NaN clamped to 0, following clamp output
+  /// modifier behavior with dx10_enable.
   CLAMP,
+
   // This is SETCC with the full mask result which is used for a compare with a
   // result bit per item in the wavefront.
   SETCC,
@@ -265,6 +293,9 @@ enum NodeType : unsigned {
   DIV_SCALE,
   DIV_FMAS,
   DIV_FIXUP,
+  // For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is
+  // treated as an illegal operation.
+  FMAD_FTZ,
   TRIG_PREOP, // 1 ULP max error for f64
 
   // RCP, RSQ - For f32, 1 ULP max error, no denormal handling.
@@ -301,7 +332,6 @@ enum NodeType : unsigned {
   CONST_ADDRESS,
   REGISTER_LOAD,
   REGISTER_STORE,
-  LOAD_INPUT,
   SAMPLE,
   SAMPLEB,
   SAMPLED,
@@ -312,6 +342,18 @@ enum NodeType : unsigned {
   CVT_F32_UBYTE1,
   CVT_F32_UBYTE2,
   CVT_F32_UBYTE3,
+
+  // Convert two float 32 numbers into a single register holding two packed f16
+  // with round to zero.
+  CVT_PKRTZ_F16_F32,
+
+  // Same as the standard node, except the high bits of the resulting integer
+  // are known 0.
+  FP_TO_FP16,
+
+  // Wrapper around fp16 results that are known to zero the high bits.
+  FP16_ZEXT,
+
   /// This node is for VLIW targets and it is used to represent a vector
   /// that is stored in consecutive registers with the same channel.
   /// For example:
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index e4dc6599e156e29da26484ac9993e936017924bf..a01f5d37c7c1645b535e8abf86f2ab6bf8c10c74 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -30,7 +30,7 @@ using namespace llvm;
 void AMDGPUInstrInfo::anchor() {}
 
 AMDGPUInstrInfo::AMDGPUInstrInfo(const AMDGPUSubtarget &ST)
-  : AMDGPUGenInstrInfo(-1, -1), ST(ST) {}
+  : AMDGPUGenInstrInfo(-1, -1), ST(ST), AMDGPUASI(ST.getAMDGPUAS()) {}
 
 // FIXME: This behaves strangely. If, for example, you have 32 load + stores,
 // the first 16 loads will be interleaved with the stores, and the next 16 will
@@ -86,6 +86,7 @@ static SIEncodingFamily subtargetEncodingFamily(const AMDGPUSubtarget &ST) {
   case AMDGPUSubtarget::SEA_ISLANDS:
     return SIEncodingFamily::SI;
   case AMDGPUSubtarget::VOLCANIC_ISLANDS:
+  case AMDGPUSubtarget::GFX9:
     return SIEncodingFamily::VI;
 
   // FIXME: This should never be called for r600 GPUs.
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index bd8e389639f5d897f3f9589cee21ce2bfa20569d..12caa5118342a13a74490c808d81667a43cf33bd 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -16,11 +16,11 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRINFO_H
 
+#include "AMDGPU.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
 
 #define GET_INSTRINFO_HEADER
-#define GET_INSTRINFO_ENUM
 #include "AMDGPUGenInstrInfo.inc"
 
 namespace llvm {
@@ -35,6 +35,8 @@ private:
   const AMDGPUSubtarget &ST;
 
   virtual void anchor();
+protected:
+  AMDGPUAS AMDGPUASI;
 
 public:
   explicit AMDGPUInstrInfo(const AMDGPUSubtarget &st);
diff --git a/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index cfef17572773efe52501d664736c5a6b6d7bd723..56f060984f0840f134652c76bbd547f4265982ab 100644
--- a/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -31,6 +31,10 @@ def AMDGPUFPClassOp : SDTypeProfile<1, 2,
   [SDTCisInt<0>, SDTCisFP<1>, SDTCisInt<2>]
 >;
 
+def AMDGPUFPPackOp : SDTypeProfile<1, 2,
+  [SDTCisFP<1>, SDTCisSameAs<1, 2>]
+>;
+
 def AMDGPUDivScaleOp : SDTypeProfile<2, 3,
   [SDTCisFP<0>, SDTCisInt<1>, SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>, SDTCisSameAs<0, 4>]
 >;
@@ -42,10 +46,38 @@ def AMDGPUFmasOp : SDTypeProfile<1, 4,
 
 def AMDGPUKillSDT : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
 
+def AMDGPUIfOp : SDTypeProfile<1, 2,
+  [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, OtherVT>]
+>;
+
+def AMDGPUElseOp : SDTypeProfile<1, 2,
+  [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, OtherVT>]
+>;
+
+def AMDGPULoopOp : SDTypeProfile<0, 2,
+  [SDTCisVT<0, i64>, SDTCisVT<1, OtherVT>]
+>;
+
+def AMDGPUBreakOp : SDTypeProfile<1, 1,
+  [SDTCisVT<0, i64>, SDTCisVT<1, i64>]
+>;
+
+def AMDGPUIfBreakOp : SDTypeProfile<1, 2,
+  [SDTCisVT<0, i64>, SDTCisVT<1, i1>, SDTCisVT<2, i64>]
+>;
+
+def AMDGPUElseBreakOp : SDTypeProfile<1, 2,
+  [SDTCisVT<0, i64>, SDTCisVT<1, i64>, SDTCisVT<2, i64>]
+>;
+
 //===----------------------------------------------------------------------===//
 // AMDGPU DAG Nodes
 //
 
+def AMDGPUif : SDNode<"AMDGPUISD::IF", AMDGPUIfOp, [SDNPHasChain]>;
+def AMDGPUelse : SDNode<"AMDGPUISD::ELSE", AMDGPUElseOp, [SDNPHasChain]>;
+def AMDGPUloop : SDNode<"AMDGPUISD::LOOP", AMDGPULoopOp, [SDNPHasChain]>;
+
 def AMDGPUconstdata_ptr : SDNode<
   "AMDGPUISD::CONST_DATA_PTR", SDTypeProfile <1, 1, [SDTCisVT<0, iPTR>,
                                                      SDTCisVT<0, iPTR>]>
@@ -78,6 +110,11 @@ def AMDGPUrsq_clamp : SDNode<"AMDGPUISD::RSQ_CLAMP", SDTFPUnaryOp>;
 
 def AMDGPUldexp : SDNode<"AMDGPUISD::LDEXP", AMDGPULdExpOp>;
 
+def AMDGPUpkrtz_f16_f32 : SDNode<"AMDGPUISD::CVT_PKRTZ_F16_F32", AMDGPUFPPackOp>;
+def AMDGPUfp_to_f16 : SDNode<"AMDGPUISD::FP_TO_FP16" , SDTFPToIntOp>;
+def AMDGPUfp16_zext : SDNode<"AMDGPUISD::FP16_ZEXT" , SDTFPToIntOp>;
+
+
 def AMDGPUfp_class : SDNode<"AMDGPUISD::FP_CLASS", AMDGPUFPClassOp>;
 
 // out = max(a, b) a and b are floats, where a nan comparison fails.
@@ -92,17 +129,7 @@ def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp,
   [SDNPCommutative, SDNPAssociative]
 >;
 
-def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPTernaryOp, []>;
-
-// out = max(a, b) a and b are signed ints
-def AMDGPUsmax : SDNode<"AMDGPUISD::SMAX", SDTIntBinOp,
-  [SDNPCommutative, SDNPAssociative]
->;
-
-// out = max(a, b) a and b are unsigned ints
-def AMDGPUumax : SDNode<"AMDGPUISD::UMAX", SDTIntBinOp,
-  [SDNPCommutative, SDNPAssociative]
->;
+def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
 
 // out = min(a, b) a and b are floats, where a nan comparison fails.
 def AMDGPUfmin_legacy : SDNode<"AMDGPUISD::FMIN_LEGACY", SDTFPBinOp,
@@ -194,6 +221,8 @@ def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp>;
 // Denominator, src2 = Numerator).
 def AMDGPUdiv_fixup : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>;
 
+def AMDGPUfmad_ftz : SDNode<"AMDGPUISD::FMAD_FTZ", SDTFPTernaryOp>;
+
 // Look Up 2.0 / pi src0 with segment select src1[4:0]
 def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>;
 
@@ -334,5 +363,9 @@ def IL_brcond      : SDNode<"AMDGPUISD::BRANCH_COND", SDTIL_BRCond, [SDNPHasChai
 def AMDGPUendpgm : SDNode<"AMDGPUISD::ENDPGM", SDTNone,
     [SDNPHasChain, SDNPOptInGlue]>;
 
-def AMDGPUreturn : SDNode<"AMDGPUISD::RETURN", SDTNone,
+def AMDGPUreturn_to_epilog : SDNode<"AMDGPUISD::RETURN_TO_EPILOG", SDTNone,
     [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTNone,
+  [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]
+>;
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index 6e5e9825eb8f8968c805138aead667164a5d6935..8867ed689a31160c5134aef19f169049436158c4 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -33,7 +33,7 @@ using namespace llvm;
 AMDGPUInstructionSelector::AMDGPUInstructionSelector(
     const SISubtarget &STI, const AMDGPURegisterBankInfo &RBI)
     : InstructionSelector(), TII(*STI.getInstrInfo()),
-      TRI(*STI.getRegisterInfo()), RBI(RBI) {}
+      TRI(*STI.getRegisterInfo()), RBI(RBI), AMDGPUASI(STI.getAMDGPUAS()) {}
 
 MachineOperand
 AMDGPUInstructionSelector::getSubOperand64(MachineOperand &MO,
@@ -84,13 +84,19 @@ bool AMDGPUInstructionSelector::selectG_ADD(MachineInstr &I) const {
 
   DebugLoc DL = I.getDebugLoc();
 
+  MachineOperand Lo1(getSubOperand64(I.getOperand(1), AMDGPU::sub0));
+  MachineOperand Lo2(getSubOperand64(I.getOperand(2), AMDGPU::sub0));
+
   BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADD_U32), DstLo)
-          .add(getSubOperand64(I.getOperand(1), AMDGPU::sub0))
-          .add(getSubOperand64(I.getOperand(2), AMDGPU::sub0));
+          .add(Lo1)
+          .add(Lo2);
+
+  MachineOperand Hi1(getSubOperand64(I.getOperand(1), AMDGPU::sub1));
+  MachineOperand Hi2(getSubOperand64(I.getOperand(2), AMDGPU::sub1));
 
   BuildMI(*BB, &I, DL, TII.get(AMDGPU::S_ADDC_U32), DstHi)
-          .add(getSubOperand64(I.getOperand(1), AMDGPU::sub1))
-          .add(getSubOperand64(I.getOperand(2), AMDGPU::sub1));
+          .add(Hi1)
+          .add(Hi2);
 
   BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), I.getOperand(0).getReg())
           .addReg(DstLo)
@@ -285,7 +291,7 @@ bool AMDGPUInstructionSelector::selectSMRD(MachineInstr &I,
   if (!I.hasOneMemOperand())
     return false;
 
-  if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+  if ((*I.memoperands_begin())->getAddrSpace() != AMDGPUASI.CONSTANT_ADDRESS)
     return false;
 
   if (!isInstrUniform(I))
diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 783f1408b3d0cd35494b45eaf9718350d16a5bab..c87102e55dfb0d0b15d40ace3586e1f7b2144500 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUINSTRUCTIONSELECTOR_H
 
+#include "AMDGPU.h"
 #include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
@@ -35,7 +36,6 @@ public:
                             const AMDGPURegisterBankInfo &RBI);
 
   bool select(MachineInstr &I) const override;
-
 private:
   struct GEPInfo {
     const MachineInstr &GEP;
@@ -59,6 +59,8 @@ private:
   const SIInstrInfo &TII;
   const SIRegisterInfo &TRI;
   const AMDGPURegisterBankInfo &RBI;
+protected:
+  AMDGPUAS AMDGPUASI;
 };
 
 } // End llvm namespace.
diff --git a/lib/Target/AMDGPU/AMDGPUInstructions.td b/lib/Target/AMDGPU/AMDGPUInstructions.td
index 606b6cea2e4f85fec97c8d1c668b8ce8efab4475..b8d681298dee9f7116fc0732b67477423dc8808c 100644
--- a/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -71,6 +71,49 @@ def u8imm : Operand<i8> {
 //===--------------------------------------------------------------------===//
 def brtarget   : Operand<OtherVT>;
 
+//===----------------------------------------------------------------------===//
+// Misc. PatFrags
+//===----------------------------------------------------------------------===//
+
+class HasOneUseUnaryOp<SDPatternOperator op> : PatFrag<
+  (ops node:$src0),
+  (op $src0),
+  [{ return N->hasOneUse(); }]
+>;
+
+class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
+  (ops node:$src0, node:$src1),
+  (op $src0, $src1),
+  [{ return N->hasOneUse(); }]
+>;
+
+class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
+  (ops node:$src0, node:$src1, node:$src2),
+  (op $src0, $src1, $src2),
+  [{ return N->hasOneUse(); }]
+>;
+
+def trunc_oneuse : HasOneUseUnaryOp<trunc>;
+
+let Properties = [SDNPCommutative, SDNPAssociative] in {
+def smax_oneuse : HasOneUseBinOp<smax>;
+def smin_oneuse : HasOneUseBinOp<smin>;
+def umax_oneuse : HasOneUseBinOp<umax>;
+def umin_oneuse : HasOneUseBinOp<umin>;
+def fminnum_oneuse : HasOneUseBinOp<fminnum>;
+def fmaxnum_oneuse : HasOneUseBinOp<fmaxnum>;
+def and_oneuse : HasOneUseBinOp<and>;
+def or_oneuse : HasOneUseBinOp<or>;
+def xor_oneuse : HasOneUseBinOp<xor>;
+} // Properties = [SDNPCommutative, SDNPAssociative]
+
+def sub_oneuse : HasOneUseBinOp<sub>;
+
+def srl_oneuse : HasOneUseBinOp<srl>;
+def shl_oneuse : HasOneUseBinOp<shl>;
+
+def select_oneuse : HasOneUseTernaryOp<select>;
+
 //===----------------------------------------------------------------------===//
 // PatLeafs for floating-point comparisons
 //===----------------------------------------------------------------------===//
@@ -156,28 +199,12 @@ def COND_NULL : PatLeaf <
 >;
 
 
-//===----------------------------------------------------------------------===//
-// Misc. PatFrags
-//===----------------------------------------------------------------------===//
-
-class HasOneUseBinOp<SDPatternOperator op> : PatFrag<
-  (ops node:$src0, node:$src1),
-  (op $src0, $src1),
-  [{ return N->hasOneUse(); }]
->;
-
-class HasOneUseTernaryOp<SDPatternOperator op> : PatFrag<
-  (ops node:$src0, node:$src1, node:$src2),
-  (op $src0, $src1, $src2),
-  [{ return N->hasOneUse(); }]
->;
-
 //===----------------------------------------------------------------------===//
 // Load/Store Pattern Fragments
 //===----------------------------------------------------------------------===//
 
 class PrivateMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
-  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS;
 }]>;
 
 class PrivateLoad <SDPatternOperator op> : PrivateMemOp <
@@ -195,7 +222,7 @@ def truncstorei16_private : PrivateStore <truncstorei16>;
 def store_private : PrivateStore <store>;
 
 class GlobalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
-  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;
 }]>;
 
 // Global address space loads
@@ -215,7 +242,7 @@ def global_store_atomic : GlobalStore<atomic_store>;
 
 
 class ConstantMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
-  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS;
 }]>;
 
 // Constant address space loads
@@ -226,7 +253,7 @@ class ConstantLoad <SDPatternOperator op> : ConstantMemOp <
 def constant_load : ConstantLoad<load>;
 
 class LocalMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
-  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
 }]>;
 
 // Local address space loads
@@ -239,7 +266,7 @@ class LocalStore <SDPatternOperator op> : LocalMemOp <
 >;
 
 class FlatMemOp <dag ops, dag frag> : PatFrag <ops, frag, [{
-  return cast<MemSDNode>(N)->getAddressSPace() == AMDGPUAS::FLAT_ADDRESS;
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.FLAT_ADDRESS;
 }]>;
 
 class FlatLoad <SDPatternOperator op> : FlatMemOp <
@@ -321,7 +348,7 @@ def local_store_aligned8bytes : Aligned8Bytes <
 class local_binary_atomic_op<SDNode atomic_op> :
   PatFrag<(ops node:$ptr, node:$value),
     (atomic_op node:$ptr, node:$value), [{
-  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
 }]>;
 
 
@@ -339,7 +366,7 @@ def atomic_load_umax_local : local_binary_atomic_op<atomic_load_umax>;
 
 def mskor_global : PatFrag<(ops node:$val, node:$ptr),
                             (AMDGPUstore_mskor node:$val, node:$ptr), [{
-  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+  return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;
 }]>;
 
 multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> {
@@ -349,7 +376,7 @@ multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> {
     (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
       AtomicSDNode *AN = cast<AtomicSDNode>(N);
       return AN->getMemoryVT() == MVT::i32 &&
-             AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+             AN->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
   }]>;
 
   def _64_local : PatFrag<
@@ -357,7 +384,7 @@ multiclass AtomicCmpSwapLocal <SDNode cmp_swap_node> {
     (cmp_swap_node node:$ptr, node:$cmp, node:$swap), [{
       AtomicSDNode *AN = cast<AtomicSDNode>(N);
       return AN->getMemoryVT() == MVT::i64 &&
-             AN->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+             AN->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
   }]>;
 }
 
@@ -367,17 +394,17 @@ multiclass global_binary_atomic_op<SDNode atomic_op> {
   def "" : PatFrag<
         (ops node:$ptr, node:$value),
         (atomic_op node:$ptr, node:$value),
-        [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+        [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>;
 
   def _noret : PatFrag<
         (ops node:$ptr, node:$value),
         (atomic_op node:$ptr, node:$value),
-        [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
+        [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
 
   def _ret : PatFrag<
         (ops node:$ptr, node:$value),
         (atomic_op node:$ptr, node:$value),
-        [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
+        [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
 }
 
 defm atomic_swap_global : global_binary_atomic_op<atomic_swap>;
@@ -395,22 +422,22 @@ defm atomic_xor_global : global_binary_atomic_op<atomic_load_xor>;
 def AMDGPUatomic_cmp_swap_global : PatFrag<
         (ops node:$ptr, node:$value),
         (AMDGPUatomic_cmp_swap node:$ptr, node:$value),
-        [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+        [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>;
 
 def atomic_cmp_swap_global : PatFrag<
       (ops node:$ptr, node:$cmp, node:$value),
       (atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
-      [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;}]>;
+      [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS;}]>;
 
 def atomic_cmp_swap_global_noret : PatFrag<
       (ops node:$ptr, node:$cmp, node:$value),
       (atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
-      [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
+      [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (SDValue(N, 0).use_empty());}]>;
 
 def atomic_cmp_swap_global_ret : PatFrag<
       (ops node:$ptr, node:$cmp, node:$value),
       (atomic_cmp_swap node:$ptr, node:$cmp, node:$value),
-      [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
+      [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS && (!SDValue(N, 0).use_empty());}]>;
 
 //===----------------------------------------------------------------------===//
 // Misc Pattern Fragments
@@ -422,6 +449,7 @@ int PI = 0x40490fdb;
 int TWO_PI_INV = 0x3e22f983;
 int FP_UINT_MAX_PLUS_1 = 0x4f800000;    // 1 << 32 in floating point encoding
 int FP16_ONE = 0x3C00;
+int V2FP16_ONE = 0x3C003C00;
 int FP32_ONE = 0x3f800000;
 int FP32_NEG_ONE = 0xbf800000;
 int FP64_ONE = 0x3ff0000000000000;
@@ -452,7 +480,7 @@ class CLAMP <RegisterClass rc> : AMDGPUShaderInst <
   (outs rc:$dst),
   (ins rc:$src0),
   "CLAMP $dst, $src0",
-  [(set f32:$dst, (AMDGPUclamp f32:$src0, (f32 FP_ZERO), (f32 FP_ONE)))]
+  [(set f32:$dst, (AMDGPUclamp f32:$src0))]
 >;
 
 class FABS <RegisterClass rc> : AMDGPUShaderInst <
@@ -608,10 +636,22 @@ def IMMPopCount : SDNodeXForm<imm, [{
                                    MVT::i32);
 }]>;
 
-class BFEPattern <Instruction BFE, Instruction MOV> : Pat <
-  (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)),
-  (BFE $src, $rshift, (MOV (i32 (IMMPopCount $mask))))
->;
+multiclass BFEPattern <Instruction UBFE, Instruction SBFE, Instruction MOV> {
+  def : Pat <
+    (i32 (and (i32 (srl i32:$src, i32:$rshift)), IMMZeroBasedBitfieldMask:$mask)),
+    (UBFE $src, $rshift, (MOV (i32 (IMMPopCount $mask))))
+  >;
+
+  def : Pat <
+    (srl (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
+    (UBFE $src, (i32 0), $width)
+  >;
+
+  def : Pat <
+    (sra (shl_oneuse i32:$src, (sub 32, i32:$width)), (sub 32, i32:$width)),
+    (SBFE $src, (i32 0), $width)
+  >;
+}
 
 // rotr pattern
 class ROTRPattern <Instruction BIT_ALIGN> : Pat <
@@ -624,23 +664,13 @@ class ROTRPattern <Instruction BIT_ALIGN> : Pat <
 class IntMed3Pat<Instruction med3Inst,
                  SDPatternOperator max,
                  SDPatternOperator max_oneuse,
-                 SDPatternOperator min_oneuse> : Pat<
-  (max (min_oneuse i32:$src0, i32:$src1),
-       (min_oneuse (max_oneuse i32:$src0, i32:$src1), i32:$src2)),
+                 SDPatternOperator min_oneuse,
+                 ValueType vt = i32> : Pat<
+  (max (min_oneuse vt:$src0, vt:$src1),
+       (min_oneuse (max_oneuse vt:$src0, vt:$src1), vt:$src2)),
   (med3Inst $src0, $src1, $src2)
 >;
 
-let Properties = [SDNPCommutative, SDNPAssociative] in {
-def smax_oneuse : HasOneUseBinOp<smax>;
-def smin_oneuse : HasOneUseBinOp<smin>;
-def umax_oneuse : HasOneUseBinOp<umax>;
-def umin_oneuse : HasOneUseBinOp<umin>;
-} // Properties = [SDNPCommutative, SDNPAssociative]
-
-def sub_oneuse : HasOneUseBinOp<sub>;
-
-def select_oneuse : HasOneUseTernaryOp<select>;
-
 // Special conversion patterns
 
 def cvt_rpi_i32_f32 : PatFrag <
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
index 8e3471bd20835ced0f153fc9329e7a61ad3b4a5a..86dc9bd9ea74a30b73d94ab139e6ba04b9d37c53 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsicInfo.cpp
@@ -54,14 +54,7 @@ std::string AMDGPUIntrinsicInfo::getName(unsigned IntrID, Type **Tys,
 FunctionType *AMDGPUIntrinsicInfo::getType(LLVMContext &Context, unsigned ID,
                                            ArrayRef<Type*> Tys) const {
   // FIXME: Re-use Intrinsic::getType machinery
-  switch (ID) {
-  case AMDGPUIntrinsic::amdgcn_fdiv_fast: {
-    Type *F32Ty = Type::getFloatTy(Context);
-    return FunctionType::get(F32Ty, { F32Ty, F32Ty }, false);
-  }
-  default:
-    llvm_unreachable("unhandled intrinsic");
-  }
+  llvm_unreachable("unhandled intrinsic");
 }
 
 unsigned AMDGPUIntrinsicInfo::lookupName(const char *NameData,
@@ -97,8 +90,8 @@ Function *AMDGPUIntrinsicInfo::getDeclaration(Module *M, unsigned IntrID,
   Function *F
     = cast<Function>(M->getOrInsertFunction(getName(IntrID, Tys), FTy));
 
-  AttributeSet AS = getAttributes(M->getContext(),
-                                  static_cast<AMDGPUIntrinsic::ID>(IntrID));
+  AttributeList AS =
+      getAttributes(M->getContext(), static_cast<AMDGPUIntrinsic::ID>(IntrID));
   F->setAttributes(AS);
   return F;
 }
diff --git a/lib/Target/AMDGPU/AMDGPUIntrinsics.td b/lib/Target/AMDGPU/AMDGPUIntrinsics.td
index ceae0b57539581d5f51d82b9ba83f4c55f59fc85..18c9bd933af27a3f64449fccfd95187551228910 100644
--- a/lib/Target/AMDGPU/AMDGPUIntrinsics.td
+++ b/lib/Target/AMDGPU/AMDGPUIntrinsics.td
@@ -12,25 +12,8 @@
 //===----------------------------------------------------------------------===//
 
 let TargetPrefix = "AMDGPU", isTarget = 1 in {
-  def int_AMDGPU_clamp : Intrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
-
   def int_AMDGPU_kill : Intrinsic<[], [llvm_float_ty], []>;
   def int_AMDGPU_kilp : Intrinsic<[], [], []>;
-
-  // Deprecated in favor of llvm.amdgcn.sffbh
-  def int_AMDGPU_flbit_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>;
-
-  // Deprecated in favor of separate int_amdgcn_cube* intrinsics.
-  def int_AMDGPU_cube : Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty], [IntrNoMem]>;
-
-  // Deprecated in favor of expanded bit operations
-  def int_AMDGPU_bfe_i32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_AMDGPU_bfe_u32 : Intrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-
-  // Deprecated in favor of llvm.amdgcn.rsq
-  def int_AMDGPU_rsq : Intrinsic<
-    [llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]
-  >;
 }
 
 include "SIIntrinsics.td"
diff --git a/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5721ea41e3bdba8f8ec09bf40c0d45dab313de8c
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -0,0 +1,123 @@
+//===-- AMDGPULowerIntrinsics.cpp -----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+
+#define DEBUG_TYPE "amdgpu-lower-intrinsics"
+
+using namespace llvm;
+
+namespace {
+
+const unsigned MaxStaticSize = 1024;
+
+// Module pass that expands qualifying memcpy/memmove/memset calls into loops.
+class AMDGPULowerIntrinsics : public ModulePass {
+public:
+  static char ID;
+
+  AMDGPULowerIntrinsics() : ModulePass(ID) {}
+
+  bool runOnModule(Module &M) override;
+  StringRef getPassName() const override { return "AMDGPU Lower Intrinsics"; }
+};
+
+}
+
+char AMDGPULowerIntrinsics::ID = 0;
+
+char &llvm::AMDGPULowerIntrinsicsID = AMDGPULowerIntrinsics::ID;
+
+INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE,
+                "Lower intrinsics", false, false)
+
+// Expand when the size is not a constant or is larger than MaxStaticSize.
+// TODO: Refine using the estimated number of accesses (alignment splitting).
+static bool shouldExpandOperationWithSize(Value *Size) {
+  auto *KnownSize = dyn_cast<ConstantInt>(Size);
+  return KnownSize ? KnownSize->getZExtValue() > MaxStaticSize : true;
+}
+
+// Walks the users of the memory intrinsic declaration F and replaces each call
+// whose length passes shouldExpandOperationWithSize() with an explicit loop.
+// Returns true if at least one call was rewritten.
+static bool expandMemIntrinsicUses(Function &F) {
+  const Intrinsic::ID IID = F.getIntrinsicID();
+  bool MadeChange = false;
+
+  auto UI = F.user_begin();
+  auto UE = F.user_end();
+  while (UI != UE) {
+    // Step to the next user before touching this one: the call may be erased
+    // below, which would otherwise invalidate the iterator.
+    auto *Call = cast<Instruction>(*UI);
+    ++UI;
+
+    // Calls that fail the size check are skipped and left untouched.
+    if (IID == Intrinsic::memcpy) {
+      auto *Copy = cast<MemCpyInst>(Call);
+      if (!shouldExpandOperationWithSize(Copy->getLength()))
+        continue;
+
+      expandMemCpyAsLoop(Copy);
+      Copy->eraseFromParent();
+      MadeChange = true;
+    } else if (IID == Intrinsic::memmove) {
+      auto *Move = cast<MemMoveInst>(Call);
+      if (!shouldExpandOperationWithSize(Move->getLength()))
+        continue;
+
+      expandMemMoveAsLoop(Move);
+      Move->eraseFromParent();
+      MadeChange = true;
+    } else if (IID == Intrinsic::memset) {
+      auto *Set = cast<MemSetInst>(Call);
+      if (!shouldExpandOperationWithSize(Set->getLength()))
+        continue;
+
+      expandMemSetAsLoop(Set);
+      Set->eraseFromParent();
+      MadeChange = true;
+    }
+    // Any other intrinsic ID: nothing to do for this user.
+  }
+
+  return MadeChange;
+}
+
+// Scans the module's function declarations for the memcpy, memmove and memset
+// intrinsics and expands their qualifying calls. Returns true on any change.
+bool AMDGPULowerIntrinsics::runOnModule(Module &M) {
+  bool MadeChange = false;
+
+  for (Function &Decl : M) {
+    // Only declarations are considered; functions with bodies are skipped.
+    if (!Decl.isDeclaration())
+      continue;
+
+    const Intrinsic::ID IID = Decl.getIntrinsicID();
+    const bool IsMemIntrinsic = IID == Intrinsic::memcpy ||
+                                IID == Intrinsic::memmove ||
+                                IID == Intrinsic::memset;
+    // Check the intrinsic kind first so other declarations are never expanded.
+    if (IsMemIntrinsic && expandMemIntrinsicUses(Decl))
+      MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
+ModulePass *llvm::createAMDGPULowerIntrinsicsPass() {
+  return new AMDGPULowerIntrinsics();
+}
diff --git a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index 140fd4fc988b29f77fce9ebb94bb8aa7b22e2763..14ee1c81f8fa7cfbccff0a4ed6381b9fab0920de 100644
--- a/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -151,6 +151,28 @@ bool AMDGPUAsmPrinter::lowerOperand(const MachineOperand &MO,
   return MCInstLowering.lowerOperand(MO, MCOp);
 }
 
+const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
+  // Clang emits an addrspacecast of a null pointer for null pointers in the
+  // private and local address spaces; such a cast is folded here into the
+  // destination address space's null value. Everything else is delegated.
+  const auto *Cast = dyn_cast<ConstantExpr>(CV);
+  if (!Cast || Cast->getOpcode() != Instruction::AddrSpaceCast)
+    return AsmPrinter::lowerConstant(CV);
+
+  // TargetMachine has no llvm-style cast support, so a static_cast is used;
+  // TM is always an AMDGPUTargetMachine or a class derived from it.
+  auto *GPUTM = static_cast<AMDGPUTargetMachine *>(&TM);
+
+  const Constant *Src = Cast->getOperand(0);
+  unsigned SrcAS = Src->getType()->getPointerAddressSpace();
+  // Only a null source whose own null representation is 0 is rewritten.
+  if (!Src->isNullValue() || GPUTM->getNullPointerValue(SrcAS) != 0)
+    return AsmPrinter::lowerConstant(CV);
+
+  unsigned DstAS = Cast->getType()->getPointerAddressSpace();
+  return MCConstantExpr::create(GPUTM->getNullPointerValue(DstAS), OutContext);
+}
+
 void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   if (emitPseudoExpansionLowering(*OutStreamer, MI))
     return;
@@ -173,8 +195,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       ++I;
     }
   } else {
-    // We don't want SI_MASK_BRANCH/SI_RETURN encoded. They are placeholder
-    // terminator instructions and should only be printed as comments.
+    // We don't want SI_MASK_BRANCH/SI_RETURN_TO_EPILOG encoded. They are
+    // placeholder terminator instructions and should only be printed as
+    // comments.
     if (MI->getOpcode() == AMDGPU::SI_MASK_BRANCH) {
       if (isVerbose()) {
         SmallVector<char, 16> BBStr;
@@ -190,9 +213,9 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
       return;
     }
 
-    if (MI->getOpcode() == AMDGPU::SI_RETURN) {
+    if (MI->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
       if (isVerbose())
-        OutStreamer->emitRawComment(" return");
+        OutStreamer->emitRawComment(" return to shader part epilog");
       return;
     }
 
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 40c3327a98db2a2e88103e838697c281989ba9c2..27fe639e3d4bb208e61d85633afd17ed5bcf332e 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -12,6 +12,20 @@
 
 using namespace llvm;
 
+// Returns true when CC is one of the kernel or shader calling conventions
+// (AMDGPU_KERNEL, SPIR_KERNEL, AMDGPU_VS, AMDGPU_GS, AMDGPU_PS, AMDGPU_CS).
+static bool isEntryFunctionCC(CallingConv::ID CC) {
+  const bool IsKernel = CC == CallingConv::AMDGPU_KERNEL ||
+                        CC == CallingConv::SPIR_KERNEL;
+
+  const bool IsShader = CC == CallingConv::AMDGPU_VS ||
+                        CC == CallingConv::AMDGPU_GS ||
+                        CC == CallingConv::AMDGPU_PS ||
+                        CC == CallingConv::AMDGPU_CS;
+
+  return IsKernel || IsShader;
+}
+
 AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
   MachineFunctionInfo(),
   LocalMemoryObjects(),
@@ -19,8 +33,8 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
   MaxKernArgAlign(0),
   LDSSize(0),
   ABIArgOffset(0),
-  IsKernel(MF.getFunction()->getCallingConv() == CallingConv::AMDGPU_KERNEL ||
-           MF.getFunction()->getCallingConv() == CallingConv::SPIR_KERNEL) {
+  IsEntryFunction(isEntryFunctionCC(MF.getFunction()->getCallingConv())),
+  NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
   // FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
   // except reserved size is not correctly aligned.
 }
diff --git a/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 5d0640b816f369b12253f4d4eaa817a6aefc4225..8bfeb67ad4ecdcb040df0182ead2f26986ef93c1 100644
--- a/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -30,7 +30,11 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
   /// Start of implicit kernel args
   unsigned ABIArgOffset;
 
-  bool IsKernel;
+  // Kernels + shaders, i.e. functions called by the driver and not called
+  // by other functions.
+  bool IsEntryFunction;
+
+  bool NoSignedZerosFPMath;
 
 public:
   AMDGPUMachineFunction(const MachineFunction &MF);
@@ -66,8 +70,12 @@ public:
     return LDSSize;
   }
 
-  bool isKernel() const {
-    return IsKernel;
+  bool isEntryFunction() const {
+    return IsEntryFunction;
+  }
+
+  bool hasNoSignedZerosFPMath() const {
+    return NoSignedZerosFPMath;
   }
 
   unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV);
diff --git a/lib/Target/AMDGPU/AMDGPUPTNote.h b/lib/Target/AMDGPU/AMDGPUPTNote.h
index 05df863e7f74bb3bdf1f6759cf074bde57dd427e..71b9ab699b96fe83f2c444988d3f64630a2ff728 100644
--- a/lib/Target/AMDGPU/AMDGPUPTNote.h
+++ b/lib/Target/AMDGPU/AMDGPUPTNote.h
@@ -19,7 +19,7 @@
 
 namespace AMDGPU {
 
-namespace PT_NOTE {
+namespace ElfNote {
 
 const char SectionName[] = ".note";
 
@@ -33,9 +33,7 @@ enum NoteType{
     NT_AMDGPU_HSA_PRODUCER = 4,
     NT_AMDGPU_HSA_PRODUCER_OPTIONS = 5,
     NT_AMDGPU_HSA_EXTENSION = 6,
-    NT_AMDGPU_HSA_RUNTIME_METADATA_V_1 = 7, // deprecated since 12/14/16.
-    NT_AMDGPU_HSA_RUNTIME_METADATA_V_2 = 8,
-    NT_AMDGPU_HSA_RUNTIME_METADATA = NT_AMDGPU_HSA_RUNTIME_METADATA_V_2,
+    NT_AMDGPU_HSA_CODE_OBJECT_METADATA = 10,
     NT_AMDGPU_HSA_HLDEBUG_DEBUG = 101,
     NT_AMDGPU_HSA_HLDEBUG_TARGET = 102
 };
diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index b928e88719269cc3cbc83e0970eb99d5cc25a225..96bc53d06cd9531b785b41eac7b06a13a0c2e5bd 100644
--- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -72,6 +72,7 @@ private:
   Module *Mod = nullptr;
   const DataLayout *DL = nullptr;
   MDNode *MaxWorkGroupSizeRange = nullptr;
+  AMDGPUAS AS;
 
   // FIXME: This should be per-kernel.
   uint32_t LocalMemLimit = 0;
@@ -154,6 +155,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
   const AMDGPUSubtarget &ST = TM->getSubtarget<AMDGPUSubtarget>(F);
   if (!ST.isPromoteAllocaEnabled())
     return false;
+  AS = AMDGPU::getAMDGPUAS(*F.getParent());
 
   FunctionType *FTy = F.getFunctionType();
 
@@ -162,7 +164,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
   // we cannot use local memory in the pass.
   for (Type *ParamTy : FTy->params()) {
     PointerType *PtrTy = dyn_cast<PointerType>(ParamTy);
-    if (PtrTy && PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+    if (PtrTy && PtrTy->getAddressSpace() == AS.LOCAL_ADDRESS) {
       LocalMemLimit = 0;
       DEBUG(dbgs() << "Function has local memory argument. Promoting to "
                       "local memory disabled.\n");
@@ -179,7 +181,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
   // Check how much local memory is being used by global objects
   CurrentLocalMemUsage = 0;
   for (GlobalVariable &GV : Mod->globals()) {
-    if (GV.getType()->getAddressSpace() != AMDGPUAS::LOCAL_ADDRESS)
+    if (GV.getType()->getAddressSpace() != AS.LOCAL_ADDRESS)
       continue;
 
     for (const User *U : GV.users()) {
@@ -204,7 +206,8 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
     }
   }
 
-  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage);
+  unsigned MaxOccupancy = ST.getOccupancyWithLocalMemSize(CurrentLocalMemUsage,
+                                                          F);
 
   // Restrict local memory usage so that we don't drastically reduce occupancy,
   // unless it is already significantly reduced.
@@ -225,7 +228,7 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
 
   // Round up to the next tier of usage.
   unsigned MaxSizeWithWaveCount
-    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy);
+    = ST.getMaxLocalMemSizeWithWaveCount(MaxOccupancy, F);
 
   // Program is possibly broken by using more local mem than available.
   if (CurrentLocalMemUsage > MaxSizeWithWaveCount)
@@ -308,15 +311,15 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
     = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
 
   CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
-  DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
-  DispatchPtr->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+  DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
+  DispatchPtr->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
 
   // Size of the dispatch packet struct.
-  DispatchPtr->addDereferenceableAttr(AttributeSet::ReturnIndex, 64);
+  DispatchPtr->addDereferenceableAttr(AttributeList::ReturnIndex, 64);
 
   Type *I32Ty = Type::getInt32Ty(Mod->getContext());
   Value *CastDispatchPtr = Builder.CreateBitCast(
-    DispatchPtr, PointerType::get(I32Ty, AMDGPUAS::CONSTANT_ADDRESS));
+    DispatchPtr, PointerType::get(I32Ty, AS.CONSTANT_ADDRESS));
 
   // We could do a single 64-bit load here, but it's likely that the basic
   // 32-bit and extract sequence is already present, and it is probably easier
@@ -412,7 +415,7 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
   }
 }
 
-static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca, AMDGPUAS AS) {
   ArrayType *AllocaTy = dyn_cast<ArrayType>(Alloca->getAllocatedType());
 
   DEBUG(dbgs() << "Alloca candidate for vectorization\n");
@@ -467,7 +470,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
     IRBuilder<> Builder(Inst);
     switch (Inst->getOpcode()) {
     case Instruction::Load: {
-      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+      Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
       Value *Ptr = Inst->getOperand(0);
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
 
@@ -479,7 +482,7 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
       break;
     }
     case Instruction::Store: {
-      Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
+      Type *VecPtrTy = VectorTy->getPointerTo(AS.PRIVATE_ADDRESS);
 
       Value *Ptr = Inst->getOperand(1);
       Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
@@ -672,7 +675,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
 
   DEBUG(dbgs() << "Trying to promote " << I << '\n');
 
-  if (tryPromoteAllocaToVector(&I)) {
+  if (tryPromoteAllocaToVector(&I, AS)) {
     DEBUG(dbgs() << " alloca is not a candidate for vectorization.\n");
     return;
   }
@@ -733,7 +736,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
       Twine(F->getName()) + Twine('.') + I.getName(),
       nullptr,
       GlobalVariable::NotThreadLocal,
-      AMDGPUAS::LOCAL_ADDRESS);
+      AS.LOCAL_ADDRESS);
   GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
   GV->setAlignment(I.getAlignment());
 
@@ -766,7 +769,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
       if (ICmpInst *CI = dyn_cast<ICmpInst>(V)) {
         Value *Src0 = CI->getOperand(0);
         Type *EltTy = Src0->getType()->getPointerElementType();
-        PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+        PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS);
 
         if (isa<ConstantPointerNull>(CI->getOperand(0)))
           CI->setOperand(0, ConstantPointerNull::get(NewTy));
@@ -783,7 +786,7 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
         continue;
 
       Type *EltTy = V->getType()->getPointerElementType();
-      PointerType *NewTy = PointerType::get(EltTy, AMDGPUAS::LOCAL_ADDRESS);
+      PointerType *NewTy = PointerType::get(EltTy, AS.LOCAL_ADDRESS);
 
       // FIXME: It doesn't really make sense to try to do this for all
       // instructions.
@@ -851,11 +854,11 @@ void AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I) {
       Type *SrcTy = Src->getType()->getPointerElementType();
       Function *ObjectSize = Intrinsic::getDeclaration(Mod,
         Intrinsic::objectsize,
-        { Intr->getType(), PointerType::get(SrcTy, AMDGPUAS::LOCAL_ADDRESS) }
+        { Intr->getType(), PointerType::get(SrcTy, AS.LOCAL_ADDRESS) }
       );
 
-      CallInst *NewCall
-        = Builder.CreateCall(ObjectSize, { Src, Intr->getOperand(1) });
+      CallInst *NewCall = Builder.CreateCall(
+          ObjectSize, {Src, Intr->getOperand(1), Intr->getOperand(2)});
       Intr->replaceAllUsesWith(NewCall);
       Intr->eraseFromParent();
       continue;
diff --git a/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
index ef51aad95dce87d4acc996078b44f1555b8012c1..22b1663821d96840ed86b1363f1535f8e261e575 100644
--- a/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ b/lib/Target/AMDGPU/AMDGPURegisterInfo.h
@@ -16,10 +16,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H
 
-#include "llvm/Target/TargetRegisterInfo.h"
-
 #define GET_REGINFO_HEADER
-#define GET_REGINFO_ENUM
 #include "AMDGPUGenRegisterInfo.inc"
 
 namespace llvm {
diff --git a/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h b/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
deleted file mode 100644
index c62b96b85c68facc478bb7227586c558eae3427c..0000000000000000000000000000000000000000
--- a/lib/Target/AMDGPU/AMDGPURuntimeMetadata.h
+++ /dev/null
@@ -1,203 +0,0 @@
-//===-- AMDGPURuntimeMetadata.h - AMDGPU Runtime Metadata -------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// Enums and structure types used by runtime metadata.
-///
-/// Runtime requests certain information (metadata) about kernels to be able
-/// to execute the kernels and answer the queries about the kernels.
-/// The metadata is represented as a note element in the .note ELF section of a
-/// binary (code object). The desc field of the note element is a YAML string
-/// consisting of key-value pairs. Each key is a string. Each value can be
-/// an integer, a string, or an YAML sequence. There are 3 levels of YAML maps.
-/// At the beginning of the YAML string is the module level YAML map. A
-/// kernel-level YAML map is in the amd.Kernels sequence. A
-/// kernel-argument-level map is in the amd.Args sequence.
-///
-/// The format should be kept backward compatible. New enum values and bit
-/// fields should be appended at the end. It is suggested to bump up the
-/// revision number whenever the format changes and document the change
-/// in the revision in this header.
-///
-//
-//===----------------------------------------------------------------------===//
-//
-#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
-#define LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
-
-#include <cstdint>
-#include <vector>
-#include <string>
-
-namespace AMDGPU {
-namespace RuntimeMD {
-
-  // Version and revision of runtime metadata
-  const unsigned char MDVersion   = 2;
-  const unsigned char MDRevision  = 0;
-
-  // Name of keys for runtime metadata.
-  namespace KeyName {
-
-    const char MDVersion[]                = "amd.MDVersion";            // Runtime metadata version
-    const char Language[]                 = "amd.Language";             // Language
-    const char LanguageVersion[]          = "amd.LanguageVersion";      // Language version
-    const char Kernels[]                  = "amd.Kernels";              // Kernels
-    const char KernelName[]               = "amd.KernelName";           // Kernel name
-    const char Args[]                     = "amd.Args";                 // Kernel arguments
-    const char ArgSize[]                  = "amd.ArgSize";              // Kernel arg size
-    const char ArgAlign[]                 = "amd.ArgAlign";             // Kernel arg alignment
-    const char ArgTypeName[]              = "amd.ArgTypeName";          // Kernel type name
-    const char ArgName[]                  = "amd.ArgName";              // Kernel name
-    const char ArgKind[]                  = "amd.ArgKind";              // Kernel argument kind
-    const char ArgValueType[]             = "amd.ArgValueType";         // Kernel argument value type
-    const char ArgAddrQual[]              = "amd.ArgAddrQual";          // Kernel argument address qualifier
-    const char ArgAccQual[]               = "amd.ArgAccQual";           // Kernel argument access qualifier
-    const char ArgIsConst[]               = "amd.ArgIsConst";           // Kernel argument is const qualified
-    const char ArgIsRestrict[]            = "amd.ArgIsRestrict";        // Kernel argument is restrict qualified
-    const char ArgIsVolatile[]            = "amd.ArgIsVolatile";        // Kernel argument is volatile qualified
-    const char ArgIsPipe[]                = "amd.ArgIsPipe";            // Kernel argument is pipe qualified
-    const char ReqdWorkGroupSize[]        = "amd.ReqdWorkGroupSize";    // Required work group size
-    const char WorkGroupSizeHint[]        = "amd.WorkGroupSizeHint";    // Work group size hint
-    const char VecTypeHint[]              = "amd.VecTypeHint";          // Vector type hint
-    const char KernelIndex[]              = "amd.KernelIndex";          // Kernel index for device enqueue
-    const char NoPartialWorkGroups[]      = "amd.NoPartialWorkGroups";  // No partial work groups
-    const char PrintfInfo[]               = "amd.PrintfInfo";           // Prinf function call information
-    const char ArgActualAcc[]             = "amd.ArgActualAcc";         // The actual kernel argument access qualifier
-    const char ArgPointeeAlign[]          = "amd.ArgPointeeAlign";      // Alignment of pointee type
-
-  } // end namespace KeyName
-
-  namespace KernelArg {
-
-    enum Kind : uint8_t {
-      ByValue                 = 0,
-      GlobalBuffer            = 1,
-      DynamicSharedPointer    = 2,
-      Sampler                 = 3,
-      Image                   = 4,
-      Pipe                    = 5,
-      Queue                   = 6,
-      HiddenGlobalOffsetX     = 7,
-      HiddenGlobalOffsetY     = 8,
-      HiddenGlobalOffsetZ     = 9,
-      HiddenNone              = 10,
-      HiddenPrintfBuffer      = 11,
-      HiddenDefaultQueue      = 12,
-      HiddenCompletionAction  = 13,
-    };
-
-    enum ValueType : uint16_t {
-      Struct  = 0,
-      I8      = 1,
-      U8      = 2,
-      I16     = 3,
-      U16     = 4,
-      F16     = 5,
-      I32     = 6,
-      U32     = 7,
-      F32     = 8,
-      I64     = 9,
-      U64     = 10,
-      F64     = 11,
-    };
-
-    // Avoid using 'None' since it conflicts with a macro in X11 header file.
-    enum AccessQualifer : uint8_t {
-      AccNone    = 0,
-      ReadOnly   = 1,
-      WriteOnly  = 2,
-      ReadWrite  = 3,
-    };
-
-    enum AddressSpaceQualifer : uint8_t {
-      Private    = 0,
-      Global     = 1,
-      Constant   = 2,
-      Local      = 3,
-      Generic    = 4,
-      Region     = 5,
-    };
-
-  } // end namespace KernelArg
-
-  // Invalid values are used to indicate an optional key should not be emitted.
-  const uint8_t INVALID_ADDR_QUAL     = 0xff;
-  const uint8_t INVALID_ACC_QUAL      = 0xff;
-  const uint32_t INVALID_KERNEL_INDEX = ~0U;
-
-  namespace KernelArg {
-
-    // In-memory representation of kernel argument information.
-    struct Metadata {
-      uint32_t Size = 0;
-      uint32_t Align = 0;
-      uint32_t PointeeAlign = 0;
-      uint8_t Kind = 0;
-      uint16_t ValueType = 0;
-      std::string TypeName;
-      std::string Name;
-      uint8_t AddrQual = INVALID_ADDR_QUAL;
-      uint8_t AccQual = INVALID_ACC_QUAL;
-      uint8_t IsVolatile = 0;
-      uint8_t IsConst = 0;
-      uint8_t IsRestrict = 0;
-      uint8_t IsPipe = 0;
-
-      Metadata() = default;
-    };
-
-  } // end namespace KernelArg
-
-  namespace Kernel {
-
-    // In-memory representation of kernel information.
-    struct Metadata {
-      std::string Name;
-      std::string Language;
-      std::vector<uint8_t> LanguageVersion;
-      std::vector<uint32_t> ReqdWorkGroupSize;
-      std::vector<uint32_t> WorkGroupSizeHint;
-      std::string VecTypeHint;
-      uint32_t KernelIndex = INVALID_KERNEL_INDEX;
-      uint8_t NoPartialWorkGroups = 0;
-      std::vector<KernelArg::Metadata> Args;
-
-      Metadata() = default;
-    };
-
-  } // end namespace Kernel
-
-  namespace Program {
-
-    // In-memory representation of program information.
-    struct Metadata {
-      std::vector<uint8_t> MDVersionSeq;
-      std::vector<std::string> PrintfInfo;
-      std::vector<Kernel::Metadata> Kernels;
-
-      explicit Metadata() = default;
-
-      // Construct from an YAML string.
-      explicit Metadata(const std::string &YAML);
-
-      // Convert to YAML string.
-      std::string toYAML();
-
-      // Convert from YAML string.
-      static Metadata fromYAML(const std::string &S);
-    };
-
-  } //end namespace Program
-
-} // end namespace RuntimeMD
-} // end namespace AMDGPU
-
-#endif // LLVM_LIB_TARGET_AMDGPU_AMDGPURUNTIMEMETADATA_H
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index c85d2159bdb78e5f60496e9b103ebe76e9abfe56..695d51a5353264d810a78078a8c7c7ff507b10ef 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUSubtarget.h"
+#include "SIMachineFunctionInfo.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/Target/TargetFrameLowering.h"
@@ -22,7 +23,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "amdgpu-subtarget"
 
-#define GET_SUBTARGETINFO_ENUM
 #define GET_SUBTARGETINFO_TARGET_DESC
 #define GET_SUBTARGETINFO_CTOR
 #include "AMDGPUGenSubtargetInfo.inc"
@@ -41,9 +41,9 @@ AMDGPUSubtarget::initializeSubtargetDependencies(const Triple &TT,
   // for SI has the unhelpful behavior that it unsets everything else if you
   // disable it.
 
-  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+load-store-opt,");
+  SmallString<256> FullFS("+promote-alloca,+fp64-fp16-denormals,+dx10-clamp,+load-store-opt,");
   if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
-    FullFS += "+flat-for-global,+unaligned-buffer-access,";
+    FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
 
   FullFS += FS;
 
@@ -88,11 +88,14 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     FP32Denormals(false),
     FP64FP16Denormals(false),
     FPExceptions(false),
+    DX10Clamp(false),
     FlatForGlobal(false),
     UnalignedScratchAccess(false),
     UnalignedBufferAccess(false),
 
+    HasApertureRegs(false),
     EnableXNACK(false),
+    TrapHandler(false),
     DebuggerInsertNops(false),
     DebuggerReserveRegs(false),
     DebuggerEmitPrologue(false),
@@ -109,9 +112,11 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
     GCN1Encoding(false),
     GCN3Encoding(false),
     CIInsts(false),
+    GFX9Insts(false),
     SGPRInitBug(false),
     HasSMemRealTime(false),
     Has16BitInsts(false),
+    HasVOP3PInsts(false),
     HasMovrel(false),
     HasVGPRIndexMode(false),
     HasScalarStores(false),
@@ -129,65 +134,30 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
 
     FeatureDisable(false),
     InstrItins(getInstrItineraryForCPU(GPU)) {
+  AS = AMDGPU::getAMDGPUAS(TT);
   initializeSubtargetDependencies(TT, GPU, FS);
 }
 
-// FIXME: These limits are for SI. Did they change with the larger maximum LDS
-// size?
-unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves) const {
-  switch (NWaves) {
-  case 10:
-    return 1638;
-  case 9:
-    return 1820;
-  case 8:
-    return 2048;
-  case 7:
-    return 2340;
-  case 6:
-    return 2730;
-  case 5:
-    return 3276;
-  case 4:
-    return 4096;
-  case 3:
-    return 5461;
-  case 2:
-    return 8192;
-  default:
+unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
+  const Function &F) const {
+  if (NWaves <= 1) // Also guards the division below against NWaves == 0.
     return getLocalMemorySize();
-  }
+  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
+  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+  unsigned MaxWaves = getMaxWavesPerEU();
+  return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
 }
 
-unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes) const {
-  if (Bytes <= 1638)
-    return 10;
-
-  if (Bytes <= 1820)
-    return 9;
-
-  if (Bytes <= 2048)
-    return 8;
-
-  if (Bytes <= 2340)
-    return 7;
-
-  if (Bytes <= 2730)
-    return 6;
-
-  if (Bytes <= 3276)
-    return 5;
-
-  if (Bytes <= 4096)
-    return 4;
-
-  if (Bytes <= 5461)
-    return 3;
-
-  if (Bytes <= 8192)
-    return 2;
-
-  return 1;
+unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
+  const Function &F) const {
+  unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
+  unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
+  unsigned MaxWaves = getMaxWavesPerEU();
+  unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
+  unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
+  NumWaves = std::min(NumWaves, MaxWaves);
+  NumWaves = std::max(NumWaves, 1u);
+  return NumWaves;
 }
 
 std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
@@ -225,7 +195,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getFlatWorkGroupSizes(
 std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
   const Function &F) const {
   // Default minimum/maximum number of waves per execution unit.
-  std::pair<unsigned, unsigned> Default(1, 0);
+  std::pair<unsigned, unsigned> Default(1, getMaxWavesPerEU());
 
   // Default/requested minimum/maximum flat work group sizes.
   std::pair<unsigned, unsigned> FlatWorkGroupSizes = getFlatWorkGroupSizes(F);
@@ -306,7 +276,7 @@ bool SISubtarget::isVGPRSpillingEnabled(const Function& F) const {
 }
 
 unsigned SISubtarget::getKernArgSegmentSize(const MachineFunction &MF,
-					    unsigned ExplicitArgBytes) const {
+                                            unsigned ExplicitArgBytes) const {
   unsigned ImplicitBytes = getImplicitArgNumBytes(MF);
   if (ImplicitBytes == 0)
     return ExplicitArgBytes;
@@ -360,12 +330,100 @@ unsigned SISubtarget::getOccupancyWithNumVGPRs(unsigned VGPRs) const {
   return 1;
 }
 
-unsigned SISubtarget::getMaxNumSGPRs() const {
+unsigned SISubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+  if (MFI.hasFlatScratchInit()) {
+    if (getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order).
+    if (getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
+      return 4; // FLAT_SCRATCH, VCC (in that order).
+  }
+
+  if (isXNACKEnabled())
+    return 4; // XNACK, VCC (in that order).
+  return 2; // VCC.
+}
+
+unsigned SISubtarget::getMaxNumSGPRs(const MachineFunction &MF) const {
+  const Function &F = *MF.getFunction();
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+
+  // Compute maximum number of SGPRs function can use using default/requested
+  // minimum number of waves per execution unit.
+  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
+  unsigned MaxNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, false);
+  unsigned MaxAddressableNumSGPRs = getMaxNumSGPRs(WavesPerEU.first, true);
+
+  // Check if maximum number of SGPRs was explicitly requested using
+  // "amdgpu-num-sgpr" attribute.
+  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
+    unsigned Requested = AMDGPU::getIntegerAttribute(
+      F, "amdgpu-num-sgpr", MaxNumSGPRs);
+
+    // Make sure requested value does not violate subtarget's specifications.
+    if (Requested && (Requested <= getReservedNumSGPRs(MF)))
+      Requested = 0;
+
+    // If more SGPRs are required to support the input user/system SGPRs,
+    // increase to accommodate them.
+    //
+    // FIXME: This really ends up using the requested number of SGPRs + number
+    // of reserved special registers in total. Theoretically you could re-use
+    // the last input registers for these special registers, but this would
+    // require a lot of complexity to deal with the weird aliasing.
+    unsigned InputNumSGPRs = MFI.getNumPreloadedSGPRs();
+    if (Requested && Requested < InputNumSGPRs)
+      Requested = InputNumSGPRs;
+
+    // Make sure requested value is compatible with values implied by
+    // default/requested minimum/maximum number of waves per execution unit.
+    if (Requested && Requested > getMaxNumSGPRs(WavesPerEU.first, false))
+      Requested = 0;
+    if (WavesPerEU.second &&
+        Requested && Requested < getMinNumSGPRs(WavesPerEU.second))
+      Requested = 0;
+
+    if (Requested)
+      MaxNumSGPRs = Requested;
+  }
+
   if (hasSGPRInitBug())
-    return SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
+    MaxNumSGPRs = AMDGPU::IsaInfo::FIXED_NUM_SGPRS_FOR_INIT_BUG;
 
-  if (getGeneration() >= VOLCANIC_ISLANDS)
-    return 102;
+  return std::min(MaxNumSGPRs - getReservedNumSGPRs(MF),
+                  MaxAddressableNumSGPRs);
+}
+
+unsigned SISubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
+  const Function &F = *MF.getFunction();
+  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
+
+  // Compute maximum number of VGPRs function can use using default/requested
+  // minimum number of waves per execution unit.
+  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
+  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
+
+  // Check if maximum number of VGPRs was explicitly requested using
+  // "amdgpu-num-vgpr" attribute.
+  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
+    unsigned Requested = AMDGPU::getIntegerAttribute(
+      F, "amdgpu-num-vgpr", MaxNumVGPRs);
+
+    // Make sure requested value does not violate subtarget's specifications.
+    if (Requested && Requested <= getReservedNumVGPRs(MF))
+      Requested = 0;
+
+    // Make sure requested value is compatible with values implied by
+    // default/requested minimum/maximum number of waves per execution unit.
+    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
+      Requested = 0;
+    if (WavesPerEU.second &&
+        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
+      Requested = 0;
+
+    if (Requested)
+      MaxNumVGPRs = Requested;
+  }
 
-  return 104;
+  return MaxNumVGPRs - getReservedNumVGPRs(MF);
 }
diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 485f6f29de49679fe2b43ae3711497616beabf50..c61a2ff818fc13c3623a978de09efbcc5e61c812 100644
--- a/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -22,6 +22,7 @@
 #include "SIInstrInfo.h"
 #include "SIISelLowering.h"
 #include "SIFrameLowering.h"
+#include "SIMachineFunctionInfo.h"
 #include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
@@ -51,6 +52,7 @@ public:
     SOUTHERN_ISLANDS,
     SEA_ISLANDS,
     VOLCANIC_ISLANDS,
+    GFX9,
   };
 
   enum {
@@ -64,6 +66,28 @@ public:
     ISAVersion8_0_3,
     ISAVersion8_0_4,
     ISAVersion8_1_0,
+    ISAVersion9_0_0,
+    ISAVersion9_0_1
+  };
+
+  enum TrapHandlerAbi {
+    TrapHandlerAbiNone = 0,
+    TrapHandlerAbiHsa = 1
+  };
+
+  enum TrapID {
+    TrapIDHardwareReserved = 0,
+    TrapIDHSADebugTrap = 1,
+    TrapIDLLVMTrap = 2,
+    TrapIDLLVMDebugTrap = 3,
+    TrapIDDebugBreakpoint = 7,
+    TrapIDDebugReserved8 = 8,
+    TrapIDDebugReservedFE = 0xfe,
+    TrapIDDebugReservedFF = 0xff
+  };
+
+  enum TrapRegValues {
+    LLVMTrapHandlerRegValue = 1
   };
 
 protected:
@@ -84,10 +108,13 @@ protected:
   bool FP32Denormals;
   bool FP64FP16Denormals;
   bool FPExceptions;
+  bool DX10Clamp;
   bool FlatForGlobal;
   bool UnalignedScratchAccess;
   bool UnalignedBufferAccess;
+  bool HasApertureRegs;
   bool EnableXNACK;
+  bool TrapHandler;
   bool DebuggerInsertNops;
   bool DebuggerReserveRegs;
   bool DebuggerEmitPrologue;
@@ -106,9 +133,11 @@ protected:
   bool GCN1Encoding;
   bool GCN3Encoding;
   bool CIInsts;
+  bool GFX9Insts;
   bool SGPRInitBug;
   bool HasSMemRealTime;
   bool Has16BitInsts;
+  bool HasVOP3PInsts;
   bool HasMovrel;
   bool HasVGPRIndexMode;
   bool HasScalarStores;
@@ -128,6 +157,7 @@ protected:
 
   InstrItineraryData InstrItins;
   SelectionDAGTargetInfo TSInfo;
+  AMDGPUAS AS;
 
 public:
   AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
@@ -185,10 +215,18 @@ public:
     return MaxPrivateElementSize;
   }
 
+  AMDGPUAS getAMDGPUAS() const {
+    return AS;
+  }
+
   bool has16BitInsts() const {
     return Has16BitInsts;
   }
 
+  bool hasVOP3PInsts() const {
+    return HasVOP3PInsts;
+  }
+
   bool hasHWFP64() const {
     return FP64;
   }
@@ -244,6 +282,10 @@ public:
     return (getGeneration() >= EVERGREEN);
   }
 
+  bool hasMed3_16() const {
+    return getGeneration() >= GFX9;
+  }
+
   bool hasCARRY() const {
     return (getGeneration() >= EVERGREEN);
   }
@@ -256,6 +298,10 @@ public:
     return CaymanISA;
   }
 
+  TrapHandlerAbi getTrapHandlerAbi() const {
+    return isAmdHsaOS() ? TrapHandlerAbiHsa : TrapHandlerAbiNone;
+  }
+
   bool isPromoteAllocaEnabled() const {
     return EnablePromoteAlloca;
   }
@@ -268,17 +314,19 @@ public:
     return DumpCode;
   }
 
-  bool enableIEEEBit(const MachineFunction &MF) const {
-    return AMDGPU::isCompute(MF.getFunction()->getCallingConv());
-  }
-
   /// Return the amount of LDS that can be used that will not restrict the
   /// occupancy lower than WaveCount.
-  unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount) const;
+  unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
+                                           const Function &) const;
 
   /// Inverse of getMaxLocalMemWithWaveCount. Return the maximum wavecount if
   /// the given LDS memory size is the only constraint.
-  unsigned getOccupancyWithLocalMemSize(uint32_t Bytes) const;
+  unsigned getOccupancyWithLocalMemSize(uint32_t Bytes, const Function &) const;
+
+  unsigned getOccupancyWithLocalMemSize(const MachineFunction &MF) const {
+    const auto *MFI = MF.getInfo<SIMachineFunctionInfo>();
+    return getOccupancyWithLocalMemSize(MFI->getLDSSize(), *MF.getFunction());
+  }
 
   bool hasFP16Denormals() const {
     return FP64FP16Denormals;
@@ -296,6 +344,14 @@ public:
     return FPExceptions;
   }
 
+  bool enableDX10Clamp() const {
+    return DX10Clamp;
+  }
+
+  bool enableIEEEBit(const MachineFunction &MF) const {
+    return AMDGPU::isCompute(MF.getFunction()->getCallingConv());
+  }
+
   bool useFlatForGlobal() const {
     return FlatForGlobal;
   }
@@ -308,6 +364,14 @@ public:
     return UnalignedScratchAccess;
   }
 
+  bool hasApertureRegs() const {
+    return HasApertureRegs;
+  }
+
+  bool isTrapHandlerEnabled() const {
+    return TrapHandler;
+  }
+
   bool isXNACKEnabled() const {
     return EnableXNACK;
   }
@@ -329,6 +393,10 @@ public:
     return isAmdHsaOS() || isMesaKernel(MF);
   }
 
+  bool hasFminFmaxLegacy() const {
+    return getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
+  }
+
   /// \brief Returns the offset in bytes from the start of the input buffer
   ///        of the first explicit kernel argument.
   unsigned getExplicitKernelArgOffset(const MachineFunction &MF) const {
@@ -360,72 +428,71 @@ public:
     return true;
   }
 
+  void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;}
+  bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;}
+
   /// \returns Number of execution units per compute unit supported by the
   /// subtarget.
   unsigned getEUsPerCU() const {
-    return 4;
+    return AMDGPU::IsaInfo::getEUsPerCU(getFeatureBits());
   }
 
   /// \returns Maximum number of work groups per compute unit supported by the
-  /// subtarget and limited by given flat work group size.
+  /// subtarget and limited by given \p FlatWorkGroupSize.
   unsigned getMaxWorkGroupsPerCU(unsigned FlatWorkGroupSize) const {
-    if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
-      return 8;
-    return getWavesPerWorkGroup(FlatWorkGroupSize) == 1 ? 40 : 16;
+    return AMDGPU::IsaInfo::getMaxWorkGroupsPerCU(getFeatureBits(),
+                                                  FlatWorkGroupSize);
   }
 
   /// \returns Maximum number of waves per compute unit supported by the
   /// subtarget without any kind of limitation.
   unsigned getMaxWavesPerCU() const {
-    return getMaxWavesPerEU() * getEUsPerCU();
+    return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits());
   }
 
   /// \returns Maximum number of waves per compute unit supported by the
-  /// subtarget and limited by given flat work group size.
+  /// subtarget and limited by given \p FlatWorkGroupSize.
   unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
-    return getWavesPerWorkGroup(FlatWorkGroupSize);
+    return AMDGPU::IsaInfo::getMaxWavesPerCU(getFeatureBits(),
+                                             FlatWorkGroupSize);
   }
 
   /// \returns Minimum number of waves per execution unit supported by the
   /// subtarget.
   unsigned getMinWavesPerEU() const {
-    return 1;
+    return AMDGPU::IsaInfo::getMinWavesPerEU(getFeatureBits());
   }
 
   /// \returns Maximum number of waves per execution unit supported by the
   /// subtarget without any kind of limitation.
   unsigned getMaxWavesPerEU() const {
-    if (getGeneration() < AMDGPUSubtarget::SOUTHERN_ISLANDS)
-      return 8;
-    // FIXME: Need to take scratch memory into account.
-    return 10;
+    return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits());
   }
 
   /// \returns Maximum number of waves per execution unit supported by the
-  /// subtarget and limited by given flat work group size.
+  /// subtarget and limited by given \p FlatWorkGroupSize.
   unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const {
-    return alignTo(getMaxWavesPerCU(FlatWorkGroupSize), getEUsPerCU()) /
-      getEUsPerCU();
+    return AMDGPU::IsaInfo::getMaxWavesPerEU(getFeatureBits(),
+                                             FlatWorkGroupSize);
   }
 
   /// \returns Minimum flat work group size supported by the subtarget.
   unsigned getMinFlatWorkGroupSize() const {
-    return 1;
+    return AMDGPU::IsaInfo::getMinFlatWorkGroupSize(getFeatureBits());
   }
 
   /// \returns Maximum flat work group size supported by the subtarget.
   unsigned getMaxFlatWorkGroupSize() const {
-    return 2048;
+    return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(getFeatureBits());
   }
 
-  /// \returns Number of waves per work group given the flat work group size.
+  /// \returns Number of waves per work group supported by the subtarget and
+  /// limited by given \p FlatWorkGroupSize.
   unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
-    return alignTo(FlatWorkGroupSize, getWavefrontSize()) / getWavefrontSize();
+    return AMDGPU::IsaInfo::getWavesPerWorkGroup(getFeatureBits(),
+                                                 FlatWorkGroupSize);
   }
 
-  void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b;}
-  bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal;}
-
   /// \returns Subtarget's default pair of minimum/maximum flat work group sizes
   /// for function \p F, or minimum/maximum flat work group sizes explicitly
   /// requested using "amdgpu-flat-work-group-size" attribute attached to
@@ -487,13 +554,6 @@ public:
 };
 
 class SISubtarget final : public AMDGPUSubtarget {
-public:
-  enum {
-    // The closed Vulkan driver sets 96, which limits the wave count to 8 but
-    // doesn't spill SGPRs as much as when 80 is set.
-    FIXED_SGPR_COUNT_FOR_INIT_BUG = 96
-  };
-
 private:
   SIInstrInfo InstrInfo;
   SIFrameLowering FrameLowering;
@@ -570,6 +630,10 @@ public:
     return HasVGPRIndexMode;
   }
 
+  bool useVGPRIndexMode(bool UserEnable) const {
+    return !hasMovrel() || (UserEnable && hasVGPRIndexMode());
+  }
+
   bool hasScalarCompareEq64() const {
     return getGeneration() >= VOLCANIC_ISLANDS;
   }
@@ -623,6 +687,14 @@ public:
     return getGeneration() != AMDGPUSubtarget::SOUTHERN_ISLANDS;
   }
 
+  bool hasSMovFedHazard() const {
+    return getGeneration() >= AMDGPUSubtarget::GFX9;
+  }
+
+  bool hasReadM0Hazard() const {
+    return getGeneration() >= AMDGPUSubtarget::GFX9;
+  }
+
   unsigned getKernArgSegmentSize(const MachineFunction &MF, unsigned ExplictArgBytes) const;
 
   /// Return the maximum number of waves per SIMD for kernels using \p SGPRs SGPRs
@@ -634,10 +706,107 @@ public:
   /// \returns True if waitcnt instruction is needed before barrier instruction,
   /// false otherwise.
   bool needWaitcntBeforeBarrier() const {
-    return true;
+    return getGeneration() < GFX9;
+  }
+
+  /// \returns true if the flat_scratch register should be initialized with the
+  /// pointer to the wave's scratch memory rather than a size and offset.
+  bool flatScratchIsPointer() const {
+    return getGeneration() >= GFX9;
+  }
+
+  /// \returns SGPR allocation granularity supported by the subtarget.
+  unsigned getSGPRAllocGranule() const {
+    return AMDGPU::IsaInfo::getSGPRAllocGranule(getFeatureBits());
+  }
+
+  /// \returns SGPR encoding granularity supported by the subtarget.
+  unsigned getSGPREncodingGranule() const {
+    return AMDGPU::IsaInfo::getSGPREncodingGranule(getFeatureBits());
+  }
+
+  /// \returns Total number of SGPRs supported by the subtarget.
+  unsigned getTotalNumSGPRs() const {
+    return AMDGPU::IsaInfo::getTotalNumSGPRs(getFeatureBits());
+  }
+
+  /// \returns Addressable number of SGPRs supported by the subtarget.
+  unsigned getAddressableNumSGPRs() const {
+    return AMDGPU::IsaInfo::getAddressableNumSGPRs(getFeatureBits());
+  }
+
+  /// \returns Minimum number of SGPRs that meets the given number of waves per
+  /// execution unit requirement supported by the subtarget.
+  unsigned getMinNumSGPRs(unsigned WavesPerEU) const {
+    return AMDGPU::IsaInfo::getMinNumSGPRs(getFeatureBits(), WavesPerEU);
   }
 
-  unsigned getMaxNumSGPRs() const;
+  /// \returns Maximum number of SGPRs that meets the given number of waves per
+  /// execution unit requirement supported by the subtarget.
+  unsigned getMaxNumSGPRs(unsigned WavesPerEU, bool Addressable) const {
+    return AMDGPU::IsaInfo::getMaxNumSGPRs(getFeatureBits(), WavesPerEU,
+                                           Addressable);
+  }
+
+  /// \returns Reserved number of SGPRs for given function \p MF.
+  unsigned getReservedNumSGPRs(const MachineFunction &MF) const;
+
+  /// \returns Maximum number of SGPRs that meets number of waves per execution
+  /// unit requirement for function \p MF, or number of SGPRs explicitly
+  /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
+  ///
+  /// \returns Value that meets number of waves per execution unit requirement
+  /// if explicitly requested value cannot be converted to integer, violates
+  /// subtarget's specifications, or does not meet number of waves per execution
+  /// unit requirement.
+  unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
+
+  /// \returns VGPR allocation granularity supported by the subtarget.
+  unsigned getVGPRAllocGranule() const {
+    return AMDGPU::IsaInfo::getVGPRAllocGranule(getFeatureBits());
+  }
+
+  /// \returns VGPR encoding granularity supported by the subtarget.
+  unsigned getVGPREncodingGranule() const {
+    return AMDGPU::IsaInfo::getVGPREncodingGranule(getFeatureBits());
+  }
+
+  /// \returns Total number of VGPRs supported by the subtarget.
+  unsigned getTotalNumVGPRs() const {
+    return AMDGPU::IsaInfo::getTotalNumVGPRs(getFeatureBits());
+  }
+
+  /// \returns Addressable number of VGPRs supported by the subtarget.
+  unsigned getAddressableNumVGPRs() const {
+    return AMDGPU::IsaInfo::getAddressableNumVGPRs(getFeatureBits());
+  }
+
+  /// \returns Minimum number of VGPRs that meets given number of waves per
+  /// execution unit requirement supported by the subtarget.
+  unsigned getMinNumVGPRs(unsigned WavesPerEU) const {
+    return AMDGPU::IsaInfo::getMinNumVGPRs(getFeatureBits(), WavesPerEU);
+  }
+
+  /// \returns Maximum number of VGPRs that meets given number of waves per
+  /// execution unit requirement supported by the subtarget.
+  unsigned getMaxNumVGPRs(unsigned WavesPerEU) const {
+    return AMDGPU::IsaInfo::getMaxNumVGPRs(getFeatureBits(), WavesPerEU);
+  }
+
+  /// \returns Reserved number of VGPRs for given function \p MF.
+  unsigned getReservedNumVGPRs(const MachineFunction &MF) const {
+    return debuggerReserveRegs() ? 4 : 0;
+  }
+
+  /// \returns Maximum number of VGPRs that meets number of waves per execution
+  /// unit requirement for function \p MF, or number of VGPRs explicitly
+  /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
+  ///
+  /// \returns Value that meets number of waves per execution unit requirement
+  /// if explicitly requested value cannot be converted to integer, violates
+  /// subtarget's specifications, or does not meet number of waves per execution
+  /// unit requirement.
+  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 345dc6b4fbf3e246c159f83b2ecb7a086339a3b5..1e7ef584d6e29d8180ce9a58ae8273ff99ca5093 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -15,6 +15,7 @@
 
 #include "AMDGPUTargetMachine.h"
 #include "AMDGPU.h"
+#include "AMDGPUAliasAnalysis.h"
 #include "AMDGPUCallLowering.h"
 #include "AMDGPUInstructionSelector.h"
 #include "AMDGPULegalizerInfo.h"
@@ -23,6 +24,7 @@
 #endif
 #include "AMDGPUTargetObjectFile.h"
 #include "AMDGPUTargetTransformInfo.h"
+#include "GCNIterativeScheduler.h"
 #include "GCNSchedStrategy.h"
 #include "R600MachineScheduler.h"
 #include "SIMachineScheduler.h"
@@ -93,6 +95,29 @@ static cl::opt<bool> InternalizeSymbols(
   cl::init(false),
   cl::Hidden);
 
+// Option to inline all early.
+static cl::opt<bool> EarlyInlineAll(
+  "amdgpu-early-inline-all",
+  cl::desc("Inline all functions early"),
+  cl::init(false),
+  cl::Hidden);
+
+static cl::opt<bool> EnableSDWAPeephole(
+  "amdgpu-sdwa-peephole",
+  cl::desc("Enable SDWA peepholer"),
+  cl::init(true));
+
+// Enable address space based alias analysis
+static cl::opt<bool> EnableAMDGPUAliasAnalysis("enable-amdgpu-aa", cl::Hidden,
+  cl::desc("Enable AMDGPU Alias Analysis"),
+  cl::init(true));
+
+// Option to enable new waitcnt insertion pass.
+static cl::opt<bool> EnableSIInsertWaitcntsPass(
+  "enable-si-insert-waitcnts",
+  cl::desc("Use new waitcnt insertion pass"),
+  cl::init(false));
+
 extern "C" void LLVMInitializeAMDGPUTarget() {
   // Register the target
   RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -103,21 +128,26 @@ extern "C" void LLVMInitializeAMDGPUTarget() {
   initializeSIFixSGPRCopiesPass(*PR);
   initializeSIFixVGPRCopiesPass(*PR);
   initializeSIFoldOperandsPass(*PR);
+  initializeSIPeepholeSDWAPass(*PR);
   initializeSIShrinkInstructionsPass(*PR);
   initializeSIFixControlFlowLiveIntervalsPass(*PR);
   initializeSILoadStoreOptimizerPass(*PR);
   initializeAMDGPUAnnotateKernelFeaturesPass(*PR);
   initializeAMDGPUAnnotateUniformValuesPass(*PR);
+  initializeAMDGPULowerIntrinsicsPass(*PR);
   initializeAMDGPUPromoteAllocaPass(*PR);
   initializeAMDGPUCodeGenPreparePass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowPass(*PR);
   initializeSIInsertWaitsPass(*PR);
+  initializeSIInsertWaitcntsPass(*PR);
   initializeSIWholeQuadModePass(*PR);
   initializeSILowerControlFlowPass(*PR);
   initializeSIInsertSkipsPass(*PR);
   initializeSIDebuggerInsertNopsPass(*PR);
   initializeSIOptimizeExecMaskingPass(*PR);
+  initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
+  initializeAMDGPUAAWrapperPassPass(*PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -135,13 +165,26 @@ static ScheduleDAGInstrs *createSIMachineScheduler(MachineSchedContext *C) {
 static ScheduleDAGInstrs *
 createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
   ScheduleDAGMILive *DAG =
-      new ScheduleDAGMILive(C,
-                            llvm::make_unique<GCNMaxOccupancySchedStrategy>(C));
+    new GCNScheduleDAGMILive(C, make_unique<GCNMaxOccupancySchedStrategy>(C));
   DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
   DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
   return DAG;
 }
 
+static ScheduleDAGInstrs *
+createIterativeGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
+  auto DAG = new GCNIterativeScheduler(C,
+    GCNIterativeScheduler::SCHEDULE_LEGACYMAXOCCUPANCY);
+  DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+  DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+  return DAG;
+}
+
+static ScheduleDAGInstrs *createMinRegScheduler(MachineSchedContext *C) {
+  return new GCNIterativeScheduler(C,
+    GCNIterativeScheduler::SCHEDULE_MINREGFORCED);
+}
+
 static MachineSchedRegistry
 R600SchedRegistry("r600", "Run R600's custom scheduler",
                    createR600MachineScheduler);
@@ -155,6 +198,16 @@ GCNMaxOccupancySchedRegistry("gcn-max-occupancy",
                              "Run GCN scheduler to maximize occupancy",
                              createGCNMaxOccupancyMachineScheduler);
 
+static MachineSchedRegistry
+IterativeGCNMaxOccupancySchedRegistry("gcn-max-occupancy-experimental",
+  "Run GCN scheduler to maximize occupancy (experimental)",
+  createIterativeGCNMaxOccupancyMachineScheduler);
+
+static MachineSchedRegistry
+GCNMinRegSchedRegistry("gcn-minreg",
+  "Run GCN iterative scheduler for minimal register usage (experimental)",
+  createMinRegScheduler);
+
 static StringRef computeDataLayout(const Triple &TT) {
   if (TT.getArch() == Triple::r600) {
     // 32-bit pointers.
@@ -164,9 +217,14 @@ static StringRef computeDataLayout(const Triple &TT) {
 
   // 32-bit private, local, and region pointers. 64-bit global, constant and
   // flat.
-  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
+  if (TT.getEnvironmentName() == "amdgiz" ||
+      TT.getEnvironmentName() == "amdgizcl")
+    return "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:32:32-p5:32:32"
          "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
-         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
+         "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5";
+  return "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32"
+      "-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128"
+      "-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64";
 }
 
 LLVM_READNONE
@@ -196,6 +254,7 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
   : LLVMTargetMachine(T, computeDataLayout(TT), TT, getGPUOrDefault(TT, CPU),
                       FS, Options, getEffectiveRelocModel(RM), CM, OptLevel),
     TLOF(createTLOF(getTargetTriple())) {
+  AS = AMDGPU::getAMDGPUAS(TT);
   initAsmInfo();
 }
 
@@ -215,13 +274,31 @@ StringRef AMDGPUTargetMachine::getFeatureString(const Function &F) const {
     FSAttr.getValueAsString();
 }
 
+static ImmutablePass *createAMDGPUExternalAAWrapperPass() {
+  return createExternalAAWrapperPass([](Pass &P, Function &, AAResults &AAR) {
+      if (auto *WrapperPass = P.getAnalysisIfAvailable<AMDGPUAAWrapperPass>())
+        AAR.addAAResult(WrapperPass->getResult());
+      });
+}
+
 void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
+  Builder.DivergentTarget = true;
+
   bool Internalize = InternalizeSymbols &&
                      (getOptLevel() > CodeGenOpt::None) &&
                      (getTargetTriple().getArch() == Triple::amdgcn);
+  bool EarlyInline = EarlyInlineAll &&
+                     (getOptLevel() > CodeGenOpt::None);
+  bool AMDGPUAA = EnableAMDGPUAliasAnalysis && getOptLevel() > CodeGenOpt::None;
+
   Builder.addExtension(
     PassManagerBuilder::EP_ModuleOptimizerEarly,
-    [Internalize](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+    [Internalize, EarlyInline, AMDGPUAA](const PassManagerBuilder &,
+                                         legacy::PassManagerBase &PM) {
+      if (AMDGPUAA) {
+        PM.add(createAMDGPUAAWrapperPass());
+        PM.add(createAMDGPUExternalAAWrapperPass());
+      }
       PM.add(createAMDGPUUnifyMetadataPass());
       if (Internalize) {
         PM.add(createInternalizePass([=](const GlobalValue &GV) -> bool {
@@ -244,6 +321,17 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
         }));
         PM.add(createGlobalDCEPass());
       }
+      if (EarlyInline)
+        PM.add(createAMDGPUAlwaysInlinePass(false));
+  });
+
+  Builder.addExtension(
+    PassManagerBuilder::EP_EarlyAsPossible,
+    [AMDGPUAA](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+      if (AMDGPUAA) {
+        PM.add(createAMDGPUAAWrapperPass());
+        PM.add(createAMDGPUExternalAAWrapperPass());
+      }
   });
 }
 
@@ -472,6 +560,8 @@ void AMDGPUPassConfig::addIRPasses() {
   disablePass(&FuncletLayoutID);
   disablePass(&PatchableFunctionID);
 
+  addPass(createAMDGPULowerIntrinsicsPass());
+
   // Function calls are not supported, so make sure we inline everything.
   addPass(createAMDGPUAlwaysInlinePass());
   addPass(createAlwaysInlinerLegacyPass());
@@ -495,12 +585,22 @@ void AMDGPUPassConfig::addIRPasses() {
   addPass(createAMDGPUOpenCLImageTypeLoweringPass());
 
   if (TM.getOptLevel() > CodeGenOpt::None) {
+    addPass(createInferAddressSpacesPass());
     addPass(createAMDGPUPromoteAlloca(&TM));
 
     if (EnableSROA)
       addPass(createSROAPass());
 
     addStraightLineScalarOptimizationPasses();
+
+    if (EnableAMDGPUAliasAnalysis) {
+      // Register the address-space based alias analysis together with the
+      // external wrapper that adds its result to the AAResults aggregation.
+      // The wrapper comes from the same helper adjustPassManager() uses, so
+      // both pipelines hook the analysis up identically.
+      addPass(createAMDGPUAAWrapperPass());
+      addPass(createAMDGPUExternalAAWrapperPass());
+    }
   }
 
   TargetPassConfig::addIRPasses();
@@ -595,7 +695,12 @@ bool GCNPassConfig::addPreISel() {
 
   // FIXME: We need to run a pass to propagate the attributes when calls are
   // supported.
-  addPass(&AMDGPUAnnotateKernelFeaturesID);
+  const AMDGPUTargetMachine &TM = getAMDGPUTargetMachine();
+  addPass(createAMDGPUAnnotateKernelFeaturesPass(&TM));
+
+  // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit
+  // regions formed by them.
+  addPass(&AMDGPUUnifyDivergentExitNodesID);
   addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
   addPass(createSinkingPass());
   addPass(createSITypeRewriter());
@@ -618,6 +723,11 @@ void GCNPassConfig::addMachineSSAOptimization() {
   addPass(&SIFoldOperandsID);
   addPass(&DeadMachineInstructionElimID);
   addPass(&SILoadStoreOptimizerID);
+  addPass(createSIShrinkInstructionsPass());
+  if (EnableSDWAPeephole) {
+    addPass(&SIPeepholeSDWAID);
+    addPass(&DeadMachineInstructionElimID);
+  }
 }
 
 bool GCNPassConfig::addILPOpts() {
@@ -659,7 +769,6 @@ bool GCNPassConfig::addGlobalInstructionSelect() {
 #endif
 
 void GCNPassConfig::addPreRegAlloc() {
-  addPass(createSIShrinkInstructionsPass());
   addPass(createSIWholeQuadModePass());
 }
 
@@ -708,7 +817,10 @@ void GCNPassConfig::addPreEmitPass() {
   // cases.
   addPass(&PostRAHazardRecognizerID);
 
-  addPass(createSIInsertWaitsPass());
+  if (EnableSIInsertWaitcntsPass)
+    addPass(createSIInsertWaitcntsPass());
+  else
+    addPass(createSIInsertWaitsPass());
   addPass(createSIShrinkInstructionsPass());
   addPass(&SIInsertSkipsPassID);
   addPass(createSIDebuggerInsertNopsPass());
@@ -718,3 +830,4 @@ void GCNPassConfig::addPreEmitPass() {
 TargetPassConfig *GCNTargetMachine::createPassConfig(PassManagerBase &PM) {
   return new GCNPassConfig(this, PM);
 }
+
diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index e6981943f494b9ef38736f3b6c2aafbdecbdccd9..934bf7f31bab45353c909d3263c04b4306d2d9cd 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -35,6 +35,7 @@ class AMDGPUTargetMachine : public LLVMTargetMachine {
 protected:
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
   AMDGPUIntrinsicInfo IntrinsicInfo;
+  AMDGPUAS AS;
 
   StringRef getGPUName(const Function &F) const;
   StringRef getFeatureString(const Function &F) const;
@@ -57,8 +58,18 @@ public:
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
   }
+  AMDGPUAS getAMDGPUAS() const {
+    return AS;
+  }
 
   void adjustPassManager(PassManagerBuilder &) override;
+  /// Get the integer value of a null pointer in the given address space.
+  uint64_t getNullPointerValue(unsigned AddrSpace) const {
+    if (AddrSpace == AS.LOCAL_ADDRESS || AddrSpace == AS.REGION_ADDRESS)
+      return -1;
+    return 0;
+  }
+
 };
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
index 1fddc88a705afe32bdac9ea4af004e6f2083bfaa..c96761c0b04ecf9bda9306bc3b28255ffd502e91 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.cpp
@@ -7,6 +7,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "AMDGPUTargetMachine.h"
 #include "AMDGPUTargetObjectFile.h"
 #include "AMDGPU.h"
 #include "llvm/MC/MCContext.h"
@@ -22,7 +23,8 @@ using namespace llvm;
 
 MCSection *AMDGPUTargetObjectFile::SelectSectionForGlobal(
     const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
-  if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GO) &&
+  auto AS = static_cast<const AMDGPUTargetMachine*>(&TM)->getAMDGPUAS();
+  if (Kind.isReadOnly() && AMDGPU::isReadOnlySegment(GO, AS) &&
       AMDGPU::shouldEmitConstantsToTextSection(TM.getTargetTriple()))
     return TextSection;
 
diff --git a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
index de327786dff6da087a913e3bd2ce995eff24339f..ca6210f692989a6830d76920a922fe942121593c 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -16,6 +16,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H
 #define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H
 
+#include "AMDGPU.h"
 #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
 #include "llvm/Target/TargetMachine.h"
 
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 9fb68a38415f8c9234ec801e1caa712e4e7c8842..f6d33740a4ff5df700c0341f5c26e560a1af6875 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -29,6 +29,39 @@ using namespace llvm;
 
 #define DEBUG_TYPE "AMDGPUtti"
 
+static cl::opt<unsigned> UnrollThresholdPrivate(
+  "amdgpu-unroll-threshold-private",
+  cl::desc("Unroll threshold for AMDGPU if private memory used in a loop"),
+  cl::init(2500), cl::Hidden);
+
+static cl::opt<unsigned> UnrollThresholdLocal(
+  "amdgpu-unroll-threshold-local",
+  cl::desc("Unroll threshold for AMDGPU if local memory used in a loop"),
+  cl::init(1000), cl::Hidden);
+
+static cl::opt<unsigned> UnrollThresholdIf(
+  "amdgpu-unroll-threshold-if",
+  cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
+  cl::init(150), cl::Hidden);
+
+/// \returns true if \p Cond is an instruction inside \p L whose operands reach,
+/// within a bounded recursion depth, a PHI not contained in any sub-loop of L.
+static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
+                              unsigned Depth = 0) {
+  const Instruction *I = dyn_cast<Instruction>(Cond);
+  if (!I || !L->contains(I))
+    return false;
+
+  for (const Value *V : I->operand_values()) {
+    if (const PHINode *PHI = dyn_cast<PHINode>(V)) {
+      if (none_of(L->getSubLoops(), [PHI](const Loop* SubLoop) {
+                  return SubLoop->contains(PHI); }))
+        return true;
+    } else if (Depth < 10 && dependsOnLocalPhi(L, V, Depth+1))
+      return true;
+  }
+  return false;
+}
 
 void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
                                             TTI::UnrollingPreferences &UP) {
@@ -38,29 +71,115 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L,
 
   // TODO: Do we want runtime unrolling?
 
+  // Maximum alloca size that can fit in registers. Reserve 16 registers.
+  const unsigned MaxAlloca = (256 - 16) * 4;
+  unsigned ThresholdPrivate = UnrollThresholdPrivate;
+  unsigned ThresholdLocal = UnrollThresholdLocal;
+  unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
+  AMDGPUAS ASST = ST->getAMDGPUAS();
   for (const BasicBlock *BB : L->getBlocks()) {
     const DataLayout &DL = BB->getModule()->getDataLayout();
+    unsigned LocalGEPsSeen = 0;
+
+    if (any_of(L->getSubLoops(), [BB](const Loop* SubLoop) {
+               return SubLoop->contains(BB); }))
+        continue; // Block belongs to an inner loop.
+
     for (const Instruction &I : *BB) {
+
+      // Unroll a loop which contains an "if" statement whose condition
+      // is defined by a PHI belonging to the loop. This may help to eliminate
+      // if region and potentially even PHI itself, saving on both divergence
+      // and registers used for the PHI.
+      // Add a small bonus for each of such "if" statements.
+      if (const BranchInst *Br = dyn_cast<BranchInst>(&I)) {
+        if (UP.Threshold < MaxBoost && Br->isConditional()) {
+          if (L->isLoopExiting(Br->getSuccessor(0)) ||
+              L->isLoopExiting(Br->getSuccessor(1)))
+            continue;
+          if (dependsOnLocalPhi(L, Br->getCondition())) {
+            UP.Threshold += UnrollThresholdIf;
+            DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
+                         << " for loop:\n" << *L << " due to " << *Br << '\n');
+            if (UP.Threshold >= MaxBoost)
+              return;
+          }
+        }
+        continue;
+      }
+
       const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I);
-      if (!GEP || GEP->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS)
+      if (!GEP)
+        continue;
+
+      unsigned AS = GEP->getAddressSpace();
+      unsigned Threshold = 0;
+      if (AS == ASST.PRIVATE_ADDRESS)
+        Threshold = ThresholdPrivate;
+      else if (AS == ASST.LOCAL_ADDRESS)
+        Threshold = ThresholdLocal;
+      else
+        continue;
+
+      if (UP.Threshold >= Threshold)
         continue;
 
-      const Value *Ptr = GEP->getPointerOperand();
-      const AllocaInst *Alloca =
-          dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
-      if (Alloca) {
-        // We want to do whatever we can to limit the number of alloca
-        // instructions that make it through to the code generator.  allocas
-        // require us to use indirect addressing, which is slow and prone to
-        // compiler bugs.  If this loop does an address calculation on an
-        // alloca ptr, then we want to use a higher than normal loop unroll
-        // threshold. This will give SROA a better chance to eliminate these
-        // allocas.
-        //
-        // Don't use the maximum allowed value here as it will make some
-        // programs way too big.
-        UP.Threshold = 800;
+      if (AS == ASST.PRIVATE_ADDRESS) {
+        const Value *Ptr = GEP->getPointerOperand();
+        const AllocaInst *Alloca =
+            dyn_cast<AllocaInst>(GetUnderlyingObject(Ptr, DL));
+        if (!Alloca || !Alloca->isStaticAlloca())
+          continue;
+        Type *Ty = Alloca->getAllocatedType();
+        unsigned AllocaSize = Ty->isSized() ? DL.getTypeAllocSize(Ty) : 0;
+        if (AllocaSize > MaxAlloca)
+          continue;
+      } else if (AS == ASST.LOCAL_ADDRESS) {
+        LocalGEPsSeen++;
+        // Inhibit unroll for local memory if we have seen addressing not to
+        // a variable, most likely we will be unable to combine it.
+        // Do not unroll too deep inner loops for local memory to give a chance
+        // to unroll an outer loop for a more important reason.
+        if (LocalGEPsSeen > 1 || L->getLoopDepth() > 2 ||
+            (!isa<GlobalVariable>(GEP->getPointerOperand()) &&
+             !isa<Argument>(GEP->getPointerOperand())))
+          continue;
       }
+
+      // Check if GEP depends on a value defined by this loop itself.
+      bool HasLoopDef = false;
+      for (const Value *Op : GEP->operands()) {
+        const Instruction *Inst = dyn_cast<Instruction>(Op);
+        if (!Inst || L->isLoopInvariant(Op))
+          continue;
+
+        if (any_of(L->getSubLoops(), [Inst](const Loop* SubLoop) {
+             return SubLoop->contains(Inst); }))
+          continue;
+        HasLoopDef = true;
+        break;
+      }
+      if (!HasLoopDef)
+        continue;
+
+      // We want to do whatever we can to limit the number of alloca
+      // instructions that make it through to the code generator.  allocas
+      // require us to use indirect addressing, which is slow and prone to
+      // compiler bugs.  If this loop does an address calculation on an
+      // alloca ptr, then we want to use a higher than normal loop unroll
+      // threshold. This will give SROA a better chance to eliminate these
+      // allocas.
+      //
+      // We also want to have more unrolling for local memory to let ds
+      // instructions with different offsets combine.
+      //
+      // Don't use the maximum allowed value here as it will make some
+      // programs way too big.
+      UP.Threshold = Threshold;
+      DEBUG(dbgs() << "Set unroll threshold " << Threshold << " for loop:\n"
+                   << *L << " due to " << *GEP << '\n');
+      if (UP.Threshold >= MaxBoost)
+        return;
     }
   }
 }
@@ -81,28 +200,56 @@ unsigned AMDGPUTTIImpl::getRegisterBitWidth(bool Vector) {
 }
 
 unsigned AMDGPUTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
-  switch (AddrSpace) {
-  case AMDGPUAS::GLOBAL_ADDRESS:
-  case AMDGPUAS::CONSTANT_ADDRESS:
-  case AMDGPUAS::FLAT_ADDRESS:
+  AMDGPUAS AS = ST->getAMDGPUAS();
+  if (AddrSpace == AS.GLOBAL_ADDRESS ||
+      AddrSpace == AS.CONSTANT_ADDRESS ||
+      AddrSpace == AS.FLAT_ADDRESS)
     return 128;
-  case AMDGPUAS::LOCAL_ADDRESS:
-  case AMDGPUAS::REGION_ADDRESS:
+  if (AddrSpace == AS.LOCAL_ADDRESS ||
+      AddrSpace == AS.REGION_ADDRESS)
     return 64;
-  case AMDGPUAS::PRIVATE_ADDRESS:
+  if (AddrSpace == AS.PRIVATE_ADDRESS)
     return 8 * ST->getMaxPrivateElementSize();
-  default:
-    if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
-        (AddrSpace == AMDGPUAS::PARAM_D_ADDRESS ||
-         AddrSpace == AMDGPUAS::PARAM_I_ADDRESS ||
-         (AddrSpace >= AMDGPUAS::CONSTANT_BUFFER_0 &&
-          AddrSpace <= AMDGPUAS::CONSTANT_BUFFER_15)))
-      return 128;
-    llvm_unreachable("unhandled address space");
+
+  if (ST->getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS &&
+      (AddrSpace == AS.PARAM_D_ADDRESS ||
+      AddrSpace == AS.PARAM_I_ADDRESS ||
+      (AddrSpace >= AS.CONSTANT_BUFFER_0 &&
+      AddrSpace <= AS.CONSTANT_BUFFER_15)))
+    return 128;
+  llvm_unreachable("unhandled address space");
+}
+
+bool AMDGPUTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+                                               unsigned Alignment,
+                                               unsigned AddrSpace) const {
+  // We allow vectorization of flat stores, even though we may need to decompose
+  // them later if they may access private memory. We don't have enough context
+  // here, and legalization can handle it.
+  if (AddrSpace == ST->getAMDGPUAS().PRIVATE_ADDRESS) {
+    return (Alignment >= 4 || ST->hasUnalignedScratchAccess()) &&
+      ChainSizeInBytes <= ST->getMaxPrivateElementSize();
   }
+  return true;
+}
+
+bool AMDGPUTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+                                                unsigned Alignment,
+                                                unsigned AddrSpace) const {
+  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
+}
+
+bool AMDGPUTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+                                                 unsigned Alignment,
+                                                 unsigned AddrSpace) const {
+  return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
 }
 
 unsigned AMDGPUTTIImpl::getMaxInterleaveFactor(unsigned VF) {
+  // Disable unrolling if the loop is not vectorized.
+  if (VF == 1)
+    return 1;
+
   // Semi-arbitrary large amount.
   return 64;
 }
@@ -228,16 +375,8 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
   }
 }
 
-static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
-                                          const IntrinsicInst *I) {
+static bool isIntrinsicSourceOfDivergence(const IntrinsicInst *I) {
   switch (I->getIntrinsicID()) {
-  default:
-    return false;
-  case Intrinsic::not_intrinsic:
-    // This means we have an intrinsic that isn't defined in
-    // IntrinsicsAMDGPU.td
-    break;
-
   case Intrinsic::amdgcn_workitem_id_x:
   case Intrinsic::amdgcn_workitem_id_y:
   case Intrinsic::amdgcn_workitem_id_z:
@@ -278,15 +417,8 @@ static bool isIntrinsicSourceOfDivergence(const TargetIntrinsicInfo *TII,
   case Intrinsic::amdgcn_ps_live:
   case Intrinsic::amdgcn_ds_swizzle:
     return true;
-  }
-
-  StringRef Name = I->getCalledFunction()->getName();
-  switch (TII->lookupName((const char *)Name.bytes_begin(), Name.size())) {
   default:
     return false;
-  case AMDGPUIntrinsic::SI_fs_interp:
-  case AMDGPUIntrinsic::SI_fs_constant:
-    return true;
   }
 }
 
@@ -321,7 +453,7 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
   // All other loads are not divergent, because if threads issue loads with the
   // same arguments, they will always get the same result.
   if (const LoadInst *Load = dyn_cast<LoadInst>(V))
-    return Load->getPointerAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS;
+    return Load->getPointerAddressSpace() == ST->getAMDGPUAS().PRIVATE_ADDRESS;
 
   // Atomics are divergent because they are executed sequentially: when an
   // atomic operation refers to the same address in each thread, then each
@@ -330,10 +462,8 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const {
   if (isa<AtomicRMWInst>(V) || isa<AtomicCmpXchgInst>(V))
     return true;
 
-  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
-    const TargetMachine &TM = getTLI()->getTargetMachine();
-    return isIntrinsicSourceOfDivergence(TM.getIntrinsicInfo(), Intrinsic);
-  }
+  if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V))
+    return isIntrinsicSourceOfDivergence(Intrinsic);
 
   // Assume all function calls are a source of divergence.
   if (isa<CallInst>(V) || isa<InvokeInst>(V))
diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 996c9053ff0c6f0aec688d8e18cfb84ad1b76951..71d6306bc1a5ce67b4fc5208a38cac9697084d0b 100644
--- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -78,6 +78,17 @@ public:
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
   unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
+
+  bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
+                                  unsigned Alignment,
+                                  unsigned AddrSpace) const;
+  bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
+                                   unsigned Alignment,
+                                   unsigned AddrSpace) const;
+  bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
+                                    unsigned Alignment,
+                                    unsigned AddrSpace) const;
+
   unsigned getMaxInterleaveFactor(unsigned VF);
 
   int getArithmeticInstrCost(
@@ -98,7 +109,8 @@ public:
     // don't use flat addressing.
     if (IsGraphicsShader)
       return -1;
-    return ST->hasFlatAddressSpace() ? AMDGPUAS::FLAT_ADDRESS : -1;
+    return ST->hasFlatAddressSpace() ?
+      ST->getAMDGPUAS().FLAT_ADDRESS : ST->getAMDGPUAS().UNKNOWN_ADDRESS_SPACE;
   }
 
   unsigned getVectorSplitCost() { return 0; }
diff --git a/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..309913f87fb6962b0445335e1421ecaf1a6e22bb
--- /dev/null
+++ b/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -0,0 +1,225 @@
+//===- AMDGPUUnifyDivergentExitNodes.cpp ----------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a variant of the UnifyFunctionExitNodes pass. Rather than ensuring
+// there is at most one ret and one unreachable instruction, it ensures there is
+// at most one divergent exiting block.
+//
+// StructurizeCFG can't deal with multi-exit regions formed by branches to
+// multiple return nodes. It is not desirable to structurize regions with
+// uniform branches, so unifying those to the same return block as divergent
+// branches inhibits use of scalar branching. It still can't deal with the case
+// where one branch goes to return, and one unreachable. Replace unreachable in
+// this case with a return.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-unify-divergent-exit-nodes"
+
+namespace {
+
+class AMDGPUUnifyDivergentExitNodes : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid
+  AMDGPUUnifyDivergentExitNodes() : FunctionPass(ID) {
+    initializeAMDGPUUnifyDivergentExitNodesPass(*PassRegistry::getPassRegistry());
+  }
+
+  // Declares the analyses this pass requires and the ones it preserves.
+  void getAnalysisUsage(AnalysisUsage &AU) const override;
+  bool runOnFunction(Function &F) override;
+};
+
+}
+
+char AMDGPUUnifyDivergentExitNodes::ID = 0;
+INITIALIZE_PASS_BEGIN(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
+                     "Unify divergent function exit nodes", false, false)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
+INITIALIZE_PASS_END(AMDGPUUnifyDivergentExitNodes, DEBUG_TYPE,
+                    "Unify divergent function exit nodes", false, false)
+
+char &llvm::AMDGPUUnifyDivergentExitNodesID = AMDGPUUnifyDivergentExitNodes::ID;
+
+void AMDGPUUnifyDivergentExitNodes::getAnalysisUsage(AnalysisUsage &AU) const{
+  // TODO: Preserve dominator tree.
+  AU.addRequired<PostDominatorTreeWrapperPass>();
+  // Divergence info decides which exit blocks are reached non-uniformly.
+  AU.addRequired<DivergenceAnalysis>();
+
+  // No divergent values are changed, only blocks and branch edges.
+  AU.addPreserved<DivergenceAnalysis>();
+
+  // We preserve the non-critical-edgeness property
+  AU.addPreservedID(BreakCriticalEdgesID);
+
+  // LowerSwitch is also marked as preserved.
+  AU.addPreservedID(LowerSwitchID);
+  FunctionPass::getAnalysisUsage(AU);
+  // TTI is forwarded to SimplifyCFG when the return blocks are merged.
+  AU.addRequired<TargetTransformInfoWrapperPass>();
+}
+
+/// \returns true if \p BB is reachable through only uniform branches.
+/// Walks the CFG backwards from \p BB, testing each transitive predecessor once.
+static bool isUniformlyReached(const DivergenceAnalysis &DA,
+                               BasicBlock &BB) {
+  SmallVector<BasicBlock *, 8> Stack;
+  SmallPtrSet<BasicBlock *, 8> Visited;
+
+  for (BasicBlock *Pred : predecessors(&BB))
+    if (Visited.insert(Pred).second) // Skip duplicate edges into BB.
+      Stack.push_back(Pred);
+
+  while (!Stack.empty()) {
+    BasicBlock *Top = Stack.pop_back_val();
+    if (!DA.isUniform(Top->getTerminator()))
+      return false;
+
+    for (BasicBlock *Pred : predecessors(Top))
+      if (Visited.insert(Pred).second)
+        Stack.push_back(Pred);
+  }
+
+  return true;
+}
+
+static BasicBlock *unifyReturnBlockSet(Function &F,
+                                       ArrayRef<BasicBlock *> ReturningBlocks,
+                                       const TargetTransformInfo &TTI,
+                                       StringRef Name) {
+  // Insert a new basic block named \p Name into the function, add a PHI node
+  // (if the function returns a value), and convert all of the return
+  // instructions in \p ReturningBlocks into unconditional branches to it.
+  // The newly created block is returned.
+  BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
+
+  PHINode *PN = nullptr;
+  if (F.getReturnType()->isVoidTy()) {
+    ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
+  } else {
+    // If the function doesn't return void... add a PHI node to the block...
+    PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
+                         "UnifiedRetVal");
+    NewRetBlock->getInstList().push_back(PN);
+    ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+  }
+
+  // Loop over all of the blocks, replacing the return instruction with an
+  // unconditional branch.
+  //
+  for (BasicBlock *BB : ReturningBlocks) {
+    // Add an incoming element to the PHI node for every return instruction that
+    // is merging into this new block...
+    if (PN)
+      PN->addIncoming(BB->getTerminator()->getOperand(0), BB);
+
+    BB->getInstList().pop_back();  // Remove the return insn
+    BranchInst::Create(NewRetBlock, BB);
+  }
+
+  for (BasicBlock *BB : ReturningBlocks) {
+    // Cleanup possible branch to unconditional branch to the return.
+    SimplifyCFG(BB, TTI, 2);
+  }
+
+  return NewRetBlock;
+}
+
+bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
+  auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+  if (PDT.getRoots().size() <= 1)
+    return false;
+
+  DivergenceAnalysis &DA = getAnalysis<DivergenceAnalysis>();
+
+  // Collect the exiting blocks (post-dominator tree roots) that are reached
+  // through a divergent branch, split by the kind of terminator they end in.
+  bool Changed = false; // Whether the IR has been modified so far.
+  SmallVector<BasicBlock *, 4> ReturningBlocks;
+  SmallVector<BasicBlock *, 4> UnreachableBlocks;
+
+  for (BasicBlock *BB : PDT.getRoots()) {
+    if (isa<ReturnInst>(BB->getTerminator())) {
+      if (!isUniformlyReached(DA, *BB))
+        ReturningBlocks.push_back(BB);
+    } else if (isa<UnreachableInst>(BB->getTerminator())) {
+      if (!isUniformlyReached(DA, *BB))
+        UnreachableBlocks.push_back(BB);
+    }
+  }
+
+  if (!UnreachableBlocks.empty()) {
+    BasicBlock *UnreachableBlock = nullptr;
+
+    if (UnreachableBlocks.size() == 1) {
+      UnreachableBlock = UnreachableBlocks.front();
+    } else {
+      UnreachableBlock = BasicBlock::Create(F.getContext(),
+                                            "UnifiedUnreachableBlock", &F);
+      new UnreachableInst(F.getContext(), UnreachableBlock);
+
+      for (BasicBlock *BB : UnreachableBlocks) {
+        BB->getInstList().pop_back();  // Remove the unreachable inst.
+        BranchInst::Create(UnreachableBlock, BB);
+      }
+      Changed = true; // Blocks were rewired even if no return gets unified.
+    }
+
+    if (!ReturningBlocks.empty()) {
+      // Don't create a new unreachable inst if we have a return. The
+      // structurizer/annotator can't handle the multiple exits
+      Type *RetTy = F.getReturnType();
+      Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
+      UnreachableBlock->getInstList().pop_back();  // Remove the unreachable inst.
+
+      Function *UnreachableIntrin =
+        Intrinsic::getDeclaration(F.getParent(), Intrinsic::amdgcn_unreachable);
+
+      // Insert a call to an intrinsic tracking that this is an unreachable
+      // point, in case we want to kill the active lanes or something later.
+      CallInst::Create(UnreachableIntrin, {}, "", UnreachableBlock);
+
+      // Don't create a scalar trap. We would only want to trap if this code was
+      // really reached, but a scalar trap would happen even if no lanes
+      // actually reached here.
+      ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock);
+      ReturningBlocks.push_back(UnreachableBlock);
+    }
+  }
+
+  // Now handle return blocks.
+  if (ReturningBlocks.empty())
+    return Changed; // No blocks return
+
+  if (ReturningBlocks.size() == 1)
+    return Changed; // Already has a single return block
+
+  const TargetTransformInfo &TTI
+    = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+
+  unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
+  return true;
+}
diff --git a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
index 71ed299cc389aeec5e8f4544c04c075c78560278..1a393845a82244626aff42168b494584b4ff3e2a 100644
--- a/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
+++ b/lib/Target/AMDGPU/AMDILCFGStructurizer.cpp
@@ -99,16 +99,6 @@ DEBUG( \
 
 #define INVALIDSCCNUM -1
 
-template<class NodeT>
-void ReverseVector(SmallVectorImpl<NodeT *> &Src) {
-  size_t sz = Src.size();
-  for (size_t i = 0; i < sz/2; ++i) {
-    NodeT *t = Src[i];
-    Src[i] = Src[sz - i - 1];
-    Src[sz - i - 1] = t;
-  }
-}
-
 //===----------------------------------------------------------------------===//
 //
 // supporting data structure for CFGStructurizer
@@ -911,11 +901,8 @@ void AMDGPUCFGStructurizer::orderBlocks(MachineFunction *MF) {
     }
   }
 
-  //walk through all the block in func to check for unreachable
-  typedef GraphTraits<MachineFunction *> GTM;
-  auto It = GTM::nodes_begin(MF), E = GTM::nodes_end(MF);
-  for (; It != E; ++It) {
-    MachineBasicBlock *MBB = *It;
+  // walk through all the block in func to check for unreachable
+  for (auto *MBB : nodes(MF)) {
     SccNum = getSCCNum(MBB);
     if (SccNum == INVALIDSCCNUM)
       dbgs() << "unreachable block BB" << MBB->getNumber() << "\n";
@@ -1081,13 +1068,9 @@ int AMDGPUCFGStructurizer::mergeLoop(MachineLoop *LoopRep) {
   MachineBasicBlock *ExitBlk = *ExitBlks.begin();
   assert(ExitBlk && "Loop has several exit block");
   MBBVector LatchBlks;
-  typedef GraphTraits<Inverse<MachineBasicBlock*>> InvMBBTraits;
-  InvMBBTraits::ChildIteratorType PI = InvMBBTraits::child_begin(LoopHeader),
-      PE = InvMBBTraits::child_end(LoopHeader);
-  for (; PI != PE; PI++) {
-    if (LoopRep->contains(*PI))
-      LatchBlks.push_back(*PI);
-  }
+  for (auto *LB : inverse_children<MachineBasicBlock*>(LoopHeader))
+    if (LoopRep->contains(LB))
+      LatchBlks.push_back(LB);
 
   for (unsigned i = 0, e = ExitingMBBs.size(); i < e; ++i)
     mergeLoopbreakBlock(ExitingMBBs[i], ExitBlk);
diff --git a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index f892f7b951780f06105a2808a7568c4fb12101b0..961f7186f3731d6b84855edfe57f25a69fcf9aa6 100644
--- a/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -80,7 +80,7 @@ class AMDGPUOperand : public MCParsedAsmOperand {
   const AMDGPUAsmParser *AsmParser;
 
 public:
-  AMDGPUOperand(enum KindTy Kind_, const AMDGPUAsmParser *AsmParser_)
+  AMDGPUOperand(KindTy Kind_, const AMDGPUAsmParser *AsmParser_)
     : MCParsedAsmOperand(), Kind(Kind_), AsmParser(AsmParser_) {}
 
   typedef std::unique_ptr<AMDGPUOperand> Ptr;
@@ -157,7 +157,11 @@ public:
     ImmTySendMsg,
     ImmTyInterpSlot,
     ImmTyInterpAttr,
-    ImmTyAttrChan
+    ImmTyAttrChan,
+    ImmTyOpSel,
+    ImmTyOpSelHi,
+    ImmTyNegLo,
+    ImmTyNegHi
   };
 
   struct TokOp {
@@ -294,6 +298,10 @@ public:
   bool isInterpSlot() const { return isImmTy(ImmTyInterpSlot); }
   bool isInterpAttr() const { return isImmTy(ImmTyInterpAttr); }
   bool isAttrChan() const { return isImmTy(ImmTyAttrChan); }
+  bool isOpSel() const { return isImmTy(ImmTyOpSel); }
+  bool isOpSelHi() const { return isImmTy(ImmTyOpSelHi); }
+  bool isNegLo() const { return isImmTy(ImmTyNegLo); }
+  bool isNegHi() const { return isImmTy(ImmTyNegHi); }
 
   bool isMod() const {
     return isClampSI() || isOModSI();
@@ -313,6 +321,10 @@ public:
     return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i16);
   }
 
+  bool isSCSrcV2B16() const {
+    return isSCSrcB16();
+  }
+
   bool isSCSrcB32() const {
     return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::i32);
   }
@@ -325,6 +337,10 @@ public:
     return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f16);
   }
 
+  bool isSCSrcV2F16() const {
+    return isSCSrcF16();
+  }
+
   bool isSCSrcF32() const {
     return isRegOrInlineNoMods(AMDGPU::SReg_32RegClassID, MVT::f32);
   }
@@ -341,6 +357,11 @@ public:
     return isSCSrcB16() || isLiteralImm(MVT::i16);
   }
 
+  bool isSSrcV2B16() const {
+    llvm_unreachable("cannot happen");
+    return isSSrcB16();
+  }
+
   bool isSSrcB64() const {
     // TODO: Find out how SALU supports extension of 32-bit literals to 64 bits.
     // See isVSrc64().
@@ -359,6 +380,11 @@ public:
     return isSCSrcB16() || isLiteralImm(MVT::f16);
   }
 
+  bool isSSrcV2F16() const {
+    llvm_unreachable("cannot happen");
+    return isSSrcF16();
+  }
+
   bool isVCSrcB32() const {
     return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i32);
   }
@@ -371,6 +397,10 @@ public:
     return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::i16);
   }
 
+  bool isVCSrcV2B16() const {
+    return isVCSrcB16();
+  }
+
   bool isVCSrcF32() const {
     return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f32);
   }
@@ -383,6 +413,10 @@ public:
     return isRegOrInlineNoMods(AMDGPU::VS_32RegClassID, MVT::f16);
   }
 
+  bool isVCSrcV2F16() const {
+    return isVCSrcF16();
+  }
+
   bool isVSrcB32() const {
     return isVCSrcF32() || isLiteralImm(MVT::i32);
   }
@@ -395,6 +429,11 @@ public:
     return isVCSrcF16() || isLiteralImm(MVT::i16);
   }
 
+  bool isVSrcV2B16() const {
+    llvm_unreachable("cannot happen");
+    return isVSrcB16();
+  }
+
   bool isVSrcF32() const {
     return isVCSrcF32() || isLiteralImm(MVT::f32);
   }
@@ -407,6 +446,11 @@ public:
     return isVCSrcF16() || isLiteralImm(MVT::f16);
   }
 
+  bool isVSrcV2F16() const {
+    llvm_unreachable("cannot happen");
+    return isVSrcF16();
+  }
+
   bool isKImmFP32() const {
     return isLiteralImm(MVT::f32);
   }
@@ -456,7 +500,7 @@ public:
     return Imm.Val;
   }
 
-  enum ImmTy getImmTy() const {
+  ImmTy getImmTy() const {
     assert(isImm());
     return Imm.Type;
   }
@@ -498,9 +542,11 @@ public:
     return getModifiers().hasIntModifiers();
   }
 
+  uint64_t applyInputFPModifiers(uint64_t Val, unsigned Size) const;
+
   void addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers = true) const;
 
-  void addLiteralImmOperand(MCInst &Inst, int64_t Val) const;
+  void addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyModifiers) const;
 
   template <unsigned Bitwidth>
   void addKImmFPOperands(MCInst &Inst, unsigned N) const;
@@ -607,6 +653,10 @@ public:
     case ImmTyInterpSlot: OS << "InterpSlot"; break;
     case ImmTyInterpAttr: OS << "InterpAttr"; break;
     case ImmTyAttrChan: OS << "AttrChan"; break;
+    case ImmTyOpSel: OS << "OpSel"; break;
+    case ImmTyOpSelHi: OS << "OpSelHi"; break;
+    case ImmTyNegLo: OS << "NegLo"; break;
+    case ImmTyNegHi: OS << "NegHi"; break;
     }
   }
 
@@ -633,7 +683,7 @@ public:
 
   static AMDGPUOperand::Ptr CreateImm(const AMDGPUAsmParser *AsmParser,
                                       int64_t Val, SMLoc Loc,
-                                      enum ImmTy Type = ImmTyNone,
+                                      ImmTy Type = ImmTyNone,
                                       bool IsFPImm = false) {
     auto Op = llvm::make_unique<AMDGPUOperand>(Immediate, AsmParser);
     Op->Imm.Val = Val;
@@ -756,7 +806,7 @@ private:
   bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor);
   bool ParseDirectiveHSACodeObjectVersion();
   bool ParseDirectiveHSACodeObjectISA();
-  bool ParseDirectiveRuntimeMetadata();
+  bool ParseDirectiveCodeObjectMetadata();
   bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header);
   bool ParseDirectiveAMDKernelCodeT();
   bool ParseSectionDirectiveHSAText();
@@ -767,41 +817,52 @@ private:
   bool ParseSectionDirectiveHSADataGlobalAgent();
   bool ParseSectionDirectiveHSADataGlobalProgram();
   bool ParseSectionDirectiveHSARodataReadonlyAgent();
-  bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth, RegisterKind RegKind, unsigned Reg1, unsigned RegNum);
-  bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg, unsigned& RegNum, unsigned& RegWidth, unsigned *DwordRegIndex);
-  void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands, bool IsAtomic, bool IsAtomicReturn);
+  bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth,
+                             RegisterKind RegKind, unsigned Reg1,
+                             unsigned RegNum);
+  bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg,
+                           unsigned& RegNum, unsigned& RegWidth,
+                           unsigned *DwordRegIndex);
+  void cvtMubufImpl(MCInst &Inst, const OperandVector &Operands,
+                    bool IsAtomic, bool IsAtomicReturn);
+  void cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
+                 bool IsGdsHardcoded);
 
 public:
   enum AMDGPUMatchResultTy {
     Match_PreferE32 = FIRST_TARGET_MATCH_RESULT_TY
   };
 
+  typedef std::map<AMDGPUOperand::ImmTy, unsigned> OptionalImmIndexMap;
+
   AMDGPUAsmParser(const MCSubtargetInfo &STI, MCAsmParser &_Parser,
                const MCInstrInfo &MII,
                const MCTargetOptions &Options)
       : MCTargetAsmParser(Options, STI), MII(MII), Parser(_Parser) {
     MCAsmParserExtension::Initialize(Parser);
 
-    if (getSTI().getFeatureBits().none()) {
+    if (getFeatureBits().none()) {
       // Set default features.
       copySTI().ToggleFeature("SOUTHERN_ISLANDS");
     }
 
-    setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+    setAvailableFeatures(ComputeAvailableFeatures(getFeatureBits()));
 
     {
       // TODO: make those pre-defined variables read-only.
       // Currently there is none suitable machinery in the core llvm-mc for this.
       // MCSymbol::isRedefinable is intended for another purpose, and
       // AsmParser::parseDirectiveSet() cannot be specialized for specific target.
-      AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits());
+      AMDGPU::IsaInfo::IsaVersion ISA =
+          AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
       MCContext &Ctx = getContext();
-      MCSymbol *Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
-      Sym->setVariableValue(MCConstantExpr::create(Isa.Major, Ctx));
+      MCSymbol *Sym =
+          Ctx.getOrCreateSymbol(Twine(".option.machine_version_major"));
+      Sym->setVariableValue(MCConstantExpr::create(ISA.Major, Ctx));
       Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_minor"));
-      Sym->setVariableValue(MCConstantExpr::create(Isa.Minor, Ctx));
+      Sym->setVariableValue(MCConstantExpr::create(ISA.Minor, Ctx));
       Sym = Ctx.getOrCreateSymbol(Twine(".option.machine_version_stepping"));
-      Sym->setVariableValue(MCConstantExpr::create(Isa.Stepping, Ctx));
+      Sym->setVariableValue(MCConstantExpr::create(ISA.Stepping, Ctx));
     }
     KernelScope.initialize(getContext());
   }
@@ -819,7 +880,7 @@ public:
   }
 
   bool hasInv2PiInlineImm() const {
-    return getSTI().getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
+    return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
   }
 
   bool hasSGPR102_SGPR103() const {
@@ -841,6 +902,10 @@ public:
     return &MII;
   }
 
+  const FeatureBitset &getFeatureBits() const {
+    return getSTI().getFeatureBits();
+  }
+
   void setForcedEncodingSize(unsigned Size) { ForcedEncodingSize = Size; }
   void setForcedDPP(bool ForceDPP_) { ForcedDPP = ForceDPP_; }
   void setForcedSDWA(bool ForceSDWA_) { ForcedSDWA = ForceSDWA_; }
@@ -868,19 +933,28 @@ public:
   //bool ProcessInstruction(MCInst &Inst);
 
   OperandMatchResultTy parseIntWithPrefix(const char *Prefix, int64_t &Int);
+
   OperandMatchResultTy
   parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
-                     enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
+                     AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
                      bool (*ConvertResult)(int64_t &) = nullptr);
+
+  OperandMatchResultTy parseOperandArrayWithPrefix(
+    const char *Prefix,
+    OperandVector &Operands,
+    AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone,
+    bool (*ConvertResult)(int64_t&) = nullptr);
+
   OperandMatchResultTy
   parseNamedBit(const char *Name, OperandVector &Operands,
-                enum AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone);
+                AMDGPUOperand::ImmTy ImmTy = AMDGPUOperand::ImmTyNone);
   OperandMatchResultTy parseStringWithPrefix(StringRef Prefix,
                                              StringRef &Value);
 
-  OperandMatchResultTy parseImm(OperandVector &Operands);
+  bool parseAbsoluteExpr(int64_t &Val, bool AbsMod = false);
+  OperandMatchResultTy parseImm(OperandVector &Operands, bool AbsMod = false);
   OperandMatchResultTy parseReg(OperandVector &Operands);
-  OperandMatchResultTy parseRegOrImm(OperandVector &Operands);
+  OperandMatchResultTy parseRegOrImm(OperandVector &Operands, bool AbsMod = false);
   OperandMatchResultTy parseRegOrImmWithFPInputMods(OperandVector &Operands, bool AllowImm = true);
   OperandMatchResultTy parseRegOrImmWithIntInputMods(OperandVector &Operands, bool AllowImm = true);
   OperandMatchResultTy parseRegWithFPInputMods(OperandVector &Operands);
@@ -888,7 +962,8 @@ public:
   OperandMatchResultTy parseVReg32OrOff(OperandVector &Operands);
 
   void cvtDSOffset01(MCInst &Inst, const OperandVector &Operands);
-  void cvtDS(MCInst &Inst, const OperandVector &Operands);
+  void cvtDS(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, false); }
+  void cvtDSGds(MCInst &Inst, const OperandVector &Operands) { cvtDSImpl(Inst, Operands, true); }
   void cvtExp(MCInst &Inst, const OperandVector &Operands);
 
   bool parseCnt(int64_t &IntVal);
@@ -908,6 +983,12 @@ private:
   void errorExpTgt();
   OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val);
 
+  bool validateOperandLimitations(const MCInst &Inst);
+  bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
+  bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
+  unsigned findImplicitSGPRReadInVOP(const MCInst &Inst) const;
+  bool isSGPR(unsigned Reg);
+
 public:
   OperandMatchResultTy parseOptionalOperand(OperandVector &Operands);
 
@@ -937,7 +1018,13 @@ public:
 
   void cvtId(MCInst &Inst, const OperandVector &Operands);
   void cvtVOP3_2_mod(MCInst &Inst, const OperandVector &Operands);
+
+  void cvtVOP3Impl(MCInst &Inst,
+                   const OperandVector &Operands,
+                   OptionalImmIndexMap &OptionalIdx);
   void cvtVOP3(MCInst &Inst, const OperandVector &Operands);
+  void cvtVOP3OMod(MCInst &Inst, const OperandVector &Operands);
+  void cvtVOP3P(MCInst &Inst, const OperandVector &Operands);
 
   void cvtMIMG(MCInst &Inst, const OperandVector &Operands);
   void cvtMIMGAtomic(MCInst &Inst, const OperandVector &Operands);
@@ -985,6 +1072,30 @@ static const fltSemantics *getFltSemantics(MVT VT) {
   return getFltSemantics(VT.getSizeInBits() / 8);
 }
 
+static const fltSemantics *getOpFltSemantics(uint8_t OperandType) {
+  switch (OperandType) {
+  case AMDGPU::OPERAND_REG_IMM_INT32:
+  case AMDGPU::OPERAND_REG_IMM_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+    return &APFloat::IEEEsingle(); // 32-bit operand types
+  case AMDGPU::OPERAND_REG_IMM_INT64:
+  case AMDGPU::OPERAND_REG_IMM_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
+    return &APFloat::IEEEdouble(); // 64-bit operand types
+  case AMDGPU::OPERAND_REG_IMM_INT16:
+  case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+    return &APFloat::IEEEhalf(); // 16-bit and V2 16-bit operand types
+  default:
+    llvm_unreachable("unsupported fp type");
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // Operand
 //===----------------------------------------------------------------------===//
@@ -1030,7 +1141,7 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const {
 
     if (type.getScalarSizeInBits() == 16) {
       return AMDGPU::isInlinableLiteral16(
-        static_cast<int32_t>(FPLiteral.bitcastToAPInt().getZExtValue()),
+        static_cast<int16_t>(FPLiteral.bitcastToAPInt().getZExtValue()),
         AsmParser->hasInv2PiInlineImm());
     }
 
@@ -1066,6 +1177,13 @@ bool AMDGPUOperand::isLiteralImm(MVT type) const {
   if (!Imm.IsFPImm) {
     // We got int literal token.
 
+    if (type == MVT::f64 && hasFPModifiers()) {
+      // Cannot apply fp modifiers to int literals preserving the same semantics
+      // for VOP1/2/C and VOP3 because of integer truncation. To avoid ambiguity,
+      // disable these cases.
+      return false;
+    }
+
     unsigned Size = type.getSizeInBits();
     if (Size == 64)
       Size = 32;
@@ -1095,40 +1213,57 @@ bool AMDGPUOperand::isRegClass(unsigned RCID) const {
   return isRegKind() && AsmParser->getMRI()->getRegClass(RCID).contains(getReg());
 }
 
-void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const {
-  int64_t Val = Imm.Val;
-  if (isImmTy(ImmTyNone) && ApplyModifiers && Imm.Mods.hasFPModifiers() && Imm.Mods.Neg) {
-    // Apply modifiers to immediate value. Only negate can get here
-    if (Imm.IsFPImm) {
-      APFloat F(BitsToDouble(Val));
-      F.changeSign();
-      Val = F.bitcastToAPInt().getZExtValue();
-    } else {
-      Val = -Val;
-    }
+uint64_t AMDGPUOperand::applyInputFPModifiers(uint64_t Val, unsigned Size) const
+{
+  assert(isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers());
+  assert(Size == 2 || Size == 4 || Size == 8); // Size is in bytes.
+
+  const uint64_t FpSignMask = (1ULL << (Size * 8 - 1)); // Top bit of the value.
+
+  if (Imm.Mods.Abs) {
+    Val &= ~FpSignMask; // abs: clear the sign bit.
+  }
+  if (Imm.Mods.Neg) {
+    Val ^= FpSignMask; // neg: flip the sign bit (applied after abs).
-  }
 
+  return Val;
+}
+
+void AMDGPUOperand::addImmOperands(MCInst &Inst, unsigned N, bool ApplyModifiers) const {
+  // FP modifiers are only folded into plain (ImmTyNone) literal source operands.
   if (AMDGPU::isSISrcOperand(AsmParser->getMII()->get(Inst.getOpcode()),
                              Inst.getNumOperands())) {
-    addLiteralImmOperand(Inst, Val);
+    addLiteralImmOperand(Inst, Imm.Val,
+                         ApplyModifiers &&
+                         isImmTy(ImmTyNone) && Imm.Mods.hasFPModifiers());
   } else {
-    Inst.addOperand(MCOperand::createImm(Val));
+    assert(!isImmTy(ImmTyNone) || !hasModifiers());
+    Inst.addOperand(MCOperand::createImm(Imm.Val));
   }
 }
 
-void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
+void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val, bool ApplyModifiers) const {
   const auto& InstDesc = AsmParser->getMII()->get(Inst.getOpcode());
   auto OpNum = Inst.getNumOperands();
   // Check that this operand accepts literals
   assert(AMDGPU::isSISrcOperand(InstDesc, OpNum));
 
-  auto OpSize = AMDGPU::getOperandSize(InstDesc, OpNum); // expected operand size
+  if (ApplyModifiers) {
+    assert(AMDGPU::isSISrcFPOperand(InstDesc, OpNum));
+    const unsigned Size = Imm.IsFPImm ? sizeof(double) : getOperandSize(InstDesc, OpNum);
+    Val = applyInputFPModifiers(Val, Size);
+  }
+
+  APInt Literal(64, Val);
+  uint8_t OpTy = InstDesc.OpInfo[OpNum].OperandType;
 
   if (Imm.IsFPImm) { // We got fp literal token
-    APInt Literal(64, Val);
-
-    switch (OpSize) {
-    case 8:
+    switch (OpTy) {
+    case AMDGPU::OPERAND_REG_IMM_INT64:
+    case AMDGPU::OPERAND_REG_IMM_FP64:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP64: {
       if (AMDGPU::isInlinableLiteral64(Literal.getZExtValue(),
                                        AsmParser->hasInv2PiInlineImm())) {
         Inst.addOperand(MCOperand::createImm(Literal.getZExtValue()));
@@ -1152,17 +1287,32 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
       // unclear how we should encode them. This case should be checked earlier
       // in predicate methods (isLiteralImm())
       llvm_unreachable("fp literal in 64-bit integer instruction.");
-
-    case 4:
-    case 2: {
+    }
+    case AMDGPU::OPERAND_REG_IMM_INT32:
+    case AMDGPU::OPERAND_REG_IMM_FP32:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
+    case AMDGPU::OPERAND_REG_IMM_INT16:
+    case AMDGPU::OPERAND_REG_IMM_FP16:
+    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+    case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+    case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+    case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
       bool lost;
       APFloat FPLiteral(APFloat::IEEEdouble(), Literal);
       // Convert literal to single precision
-      FPLiteral.convert(*getFltSemantics(OpSize),
+      FPLiteral.convert(*getOpFltSemantics(OpTy),
                         APFloat::rmNearestTiesToEven, &lost);
       // We allow precision lost but not overflow or underflow. This should be
       // checked earlier in isLiteralImm()
-      Inst.addOperand(MCOperand::createImm(FPLiteral.bitcastToAPInt().getZExtValue()));
+
+      uint64_t ImmVal = FPLiteral.bitcastToAPInt().getZExtValue();
+      if (OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 ||
+          OpTy == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) {
+        ImmVal |= (ImmVal << 16);
+      }
+
+      Inst.addOperand(MCOperand::createImm(ImmVal));
       return;
     }
     default:
@@ -1175,8 +1325,11 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
    // We got int literal token.
   // Only sign extend inline immediates.
   // FIXME: No errors on truncation
-  switch (OpSize) {
-  case 4:
+  switch (OpTy) {
+  case AMDGPU::OPERAND_REG_IMM_INT32:
+  case AMDGPU::OPERAND_REG_IMM_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
     if (isInt<32>(Val) &&
         AMDGPU::isInlinableLiteral32(static_cast<int32_t>(Val),
                                      AsmParser->hasInv2PiInlineImm())) {
@@ -1186,18 +1339,23 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
 
     Inst.addOperand(MCOperand::createImm(Val & 0xffffffff));
     return;
-
-  case 8:
-    if (AMDGPU::isInlinableLiteral64(Val,
-                                     AsmParser->hasInv2PiInlineImm())) {
+  }
+  case AMDGPU::OPERAND_REG_IMM_INT64:
+  case AMDGPU::OPERAND_REG_IMM_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP64: {
+    if (AMDGPU::isInlinableLiteral64(Val, AsmParser->hasInv2PiInlineImm())) {
       Inst.addOperand(MCOperand::createImm(Val));
       return;
     }
 
     Inst.addOperand(MCOperand::createImm(Lo_32(Val)));
     return;
-
-  case 2:
+  }
+  case AMDGPU::OPERAND_REG_IMM_INT16:
+  case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
     if (isInt<16>(Val) &&
         AMDGPU::isInlinableLiteral16(static_cast<int16_t>(Val),
                                      AsmParser->hasInv2PiInlineImm())) {
@@ -1207,7 +1365,18 @@ void AMDGPUOperand::addLiteralImmOperand(MCInst &Inst, int64_t Val) const {
 
     Inst.addOperand(MCOperand::createImm(Val & 0xffff));
     return;
+  }
+  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+    auto LiteralVal = static_cast<uint16_t>(Literal.getLoBits(16).getZExtValue());
+    assert(AMDGPU::isInlinableLiteral16(LiteralVal,
+                                        AsmParser->hasInv2PiInlineImm()));
 
+    uint32_t ImmVal = static_cast<uint32_t>(LiteralVal) << 16 |
+                      static_cast<uint32_t>(LiteralVal);
+    Inst.addOperand(MCOperand::createImm(ImmVal));
+    return;
+  }
   default:
     llvm_unreachable("invalid operand size");
   }
@@ -1489,8 +1658,33 @@ std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
   return AMDGPUOperand::CreateReg(this, Reg, StartLoc, EndLoc, false);
 }
 
+bool
+AMDGPUAsmParser::parseAbsoluteExpr(int64_t &Val, bool AbsMod) {
+  if (AbsMod && getLexer().peekTok().is(AsmToken::Pipe) &&
+      (getLexer().getKind() == AsmToken::Integer ||
+       getLexer().getKind() == AsmToken::Real)) {
+
+    // This is a workaround for handling operands like these:
+    //     |1.0|
+    //     |-1|
+    // This syntax is not compatible with syntax of standard
+    // MC expressions (due to the trailing '|').
+
+    SMLoc EndLoc;
+    const MCExpr *Expr;
+
+    if (getParser().parsePrimaryExpr(Expr, EndLoc)) {
+      return true;
+    }
+
+    return !Expr->evaluateAsAbsolute(Val);
+  }
+
+  return getParser().parseAbsoluteExpression(Val);
+}
+
 OperandMatchResultTy
-AMDGPUAsmParser::parseImm(OperandVector &Operands) {
+AMDGPUAsmParser::parseImm(OperandVector &Operands, bool AbsMod) {
   // TODO: add syntactic sugar for 1/(2*PI)
   bool Minus = false;
   if (getLexer().getKind() == AsmToken::Minus) {
@@ -1502,7 +1696,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands) {
   switch(getLexer().getKind()) {
   case AsmToken::Integer: {
     int64_t IntVal;
-    if (getParser().parseAbsoluteExpression(IntVal))
+    if (parseAbsoluteExpr(IntVal, AbsMod))
       return MatchOperand_ParseFail;
     if (Minus)
       IntVal *= -1;
@@ -1511,7 +1705,7 @@ AMDGPUAsmParser::parseImm(OperandVector &Operands) {
   }
   case AsmToken::Real: {
     int64_t IntVal;
-    if (getParser().parseAbsoluteExpression(IntVal))
+    if (parseAbsoluteExpr(IntVal, AbsMod))
       return MatchOperand_ParseFail;
 
     APFloat F(BitsToDouble(IntVal));
@@ -1539,8 +1733,8 @@ AMDGPUAsmParser::parseReg(OperandVector &Operands) {
 }
 
 OperandMatchResultTy
-AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) {
-  auto res = parseImm(Operands);
+AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands, bool AbsMod) {
+  auto res = parseImm(Operands, AbsMod);
   if (res != MatchOperand_NoMatch) {
     return res;
   }
@@ -1551,14 +1745,44 @@ AMDGPUAsmParser::parseRegOrImm(OperandVector &Operands) {
 OperandMatchResultTy
 AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands,
                                               bool AllowImm) {
-  // XXX: During parsing we can't determine if minus sign means
-  // negate-modifier or negative immediate value.
-  // By default we suppose it is modifier.
-  bool Negate = false, Abs = false, Abs2 = false;
+  bool Negate = false, Negate2 = false, Abs = false, Abs2 = false;
 
   if (getLexer().getKind()== AsmToken::Minus) {
+    const AsmToken NextToken = getLexer().peekTok();
+
+    // Disable ambiguous constructs like '--1' etc. Should use neg(-1) instead.
+    if (NextToken.is(AsmToken::Minus)) {
+      Error(Parser.getTok().getLoc(), "invalid syntax, expected 'neg' modifier");
+      return MatchOperand_ParseFail;
+    }
+
+    // '-' followed by an integer literal N should be interpreted as integer
+    // negation rather than a floating-point NEG modifier applied to N.
+    // Besides being counter-intuitive, such use of floating-point NEG modifier
+    // results in different meaning of integer literals used with VOP1/2/C
+    // and VOP3, for example:
+    //    v_exp_f32_e32 v5, -1 // VOP1: src0 = 0xFFFFFFFF
+    //    v_exp_f32_e64 v5, -1 // VOP3: src0 = 0x80000001
+    // Negative fp literals should be handled likewise for uniformity.
+    if (!NextToken.is(AsmToken::Integer) && !NextToken.is(AsmToken::Real)) {
+      Parser.Lex();
+      Negate = true;
+    }
+  }
+
+  if (getLexer().getKind() == AsmToken::Identifier &&
+      Parser.getTok().getString() == "neg") {
+    if (Negate) {
+      Error(Parser.getTok().getLoc(), "expected register or immediate");
+      return MatchOperand_ParseFail;
+    }
+    Parser.Lex();
+    Negate2 = true;
+    if (getLexer().isNot(AsmToken::LParen)) {
+      Error(Parser.getTok().getLoc(), "expected left paren after neg");
+      return MatchOperand_ParseFail;
+    }
     Parser.Lex();
-    Negate = true;
   }
 
   if (getLexer().getKind() == AsmToken::Identifier &&
@@ -1583,7 +1807,7 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands,
 
   OperandMatchResultTy Res;
   if (AllowImm) {
-    Res = parseRegOrImm(Operands);
+    Res = parseRegOrImm(Operands, Abs);
   } else {
     Res = parseReg(Operands);
   }
@@ -1592,9 +1816,6 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands,
   }
 
   AMDGPUOperand::Modifiers Mods;
-  if (Negate) {
-    Mods.Neg = true;
-  }
   if (Abs) {
     if (getLexer().getKind() != AsmToken::Pipe) {
       Error(Parser.getTok().getLoc(), "expected vertical bar");
@@ -1612,6 +1833,17 @@ AMDGPUAsmParser::parseRegOrImmWithFPInputMods(OperandVector &Operands,
     Mods.Abs = true;
   }
 
+  if (Negate) {
+    Mods.Neg = true;
+  } else if (Negate2) {
+    if (getLexer().isNot(AsmToken::RParen)) {
+      Error(Parser.getTok().getLoc(), "expected closing parentheses");
+      return MatchOperand_ParseFail;
+    }
+    Parser.Lex();
+    Mods.Neg = true;
+  }
+
   if (Mods.hasFPModifiers()) {
     AMDGPUOperand &Op = static_cast<AMDGPUOperand &>(*Operands.back());
     Op.setModifiers(Mods);
@@ -1749,6 +1981,128 @@ ArrayRef<unsigned> AMDGPUAsmParser::getMatchedVariants() const {
   return makeArrayRef(Variants);
 }
 
+unsigned AMDGPUAsmParser::findImplicitSGPRReadInVOP(const MCInst &Inst) const {
+  const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+  const unsigned Num = Desc.getNumImplicitUses();
+  for (unsigned i = 0; i < Num; ++i) {
+    unsigned Reg = Desc.ImplicitUses[i];
+    switch (Reg) {
+    case AMDGPU::FLAT_SCR:
+    case AMDGPU::VCC:
+    case AMDGPU::M0:
+      return Reg;
+    default:
+      break;
+    }
+  }
+  return AMDGPU::NoRegister;
+}
+
+bool AMDGPUAsmParser::isSGPR(unsigned Reg) {
+  const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+  const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
+  const unsigned FirstSubReg = TRI->getSubReg(Reg, 1);
+  return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) ||
+         Reg == AMDGPU::SCC;
+}
+
+// NB: This code is correct only when used to check constant
+// bus limitations because GFX7 supports no f16 inline constants.
+// Note that there are no cases when a GFX7 opcode violates
+// constant bus limitations due to the use of an f16 constant.
+bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
+                                       unsigned OpIdx) const {
+  const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+
+  if (!AMDGPU::isSISrcOperand(Desc, OpIdx)) {
+    return false;
+  }
+
+  const MCOperand &MO = Inst.getOperand(OpIdx);
+
+  int64_t Val = MO.getImm();
+  auto OpSize = AMDGPU::getOperandSize(Desc, OpIdx);
+
+  switch (OpSize) { // expected operand size
+  case 8:
+    return AMDGPU::isInlinableLiteral64(Val, hasInv2PiInlineImm());
+  case 4:
+    return AMDGPU::isInlinableLiteral32(Val, hasInv2PiInlineImm());
+  case 2: {
+    const unsigned OperandType = Desc.OpInfo[OpIdx].OperandType;
+    if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 ||
+        OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16) {
+      return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm());
+    } else {
+      return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm());
+    }
+  }
+  default:
+    llvm_unreachable("invalid operand size");
+  }
+}
+
+bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) {
+  const MCOperand &MO = Inst.getOperand(OpIdx);
+  if (MO.isImm()) {
+    return !isInlineConstant(Inst, OpIdx);
+  }
+  return !MO.isReg() || isSGPR(mc2PseudoReg(MO.getReg()));
+}
+
+bool AMDGPUAsmParser::validateOperandLimitations(const MCInst &Inst) {
+  const unsigned Opcode = Inst.getOpcode();
+  const MCInstrDesc &Desc = MII.get(Opcode);
+  unsigned ConstantBusUseCount = 0;
+
+  if (Desc.TSFlags &
+      (SIInstrFlags::VOPC |
+       SIInstrFlags::VOP1 | SIInstrFlags::VOP2 |
+       SIInstrFlags::VOP3 | SIInstrFlags::VOP3P)) {
+
+    // Check special imm operands (used by madmk, etc)
+    if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1) {
+      ++ConstantBusUseCount;
+    }
+
+    unsigned SGPRUsed = findImplicitSGPRReadInVOP(Inst);
+    if (SGPRUsed != AMDGPU::NoRegister) {
+      ++ConstantBusUseCount;
+    }
+
+    const int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
+    const int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
+    const int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+
+    const int OpIndices[] = { Src0Idx, Src1Idx, Src2Idx };
+
+    for (int OpIdx : OpIndices) {
+      if (OpIdx == -1) break;
+
+      const MCOperand &MO = Inst.getOperand(OpIdx);
+      if (usesConstantBus(Inst, OpIdx)) {
+        if (MO.isReg()) {
+          const unsigned Reg = mc2PseudoReg(MO.getReg());
+          // Pairs of registers with partial intersections like these
+          //   s0, s[0:1]
+          //   flat_scratch_lo, flat_scratch
+          //   flat_scratch_lo, flat_scratch_hi
+          // are theoretically valid but they are disabled anyway.
+          // Note that this code mimics SIInstrInfo::verifyInstruction
+          if (Reg != SGPRUsed) {
+            ++ConstantBusUseCount;
+          }
+          SGPRUsed = Reg;
+        } else { // Expression or a literal
+          ++ConstantBusUseCount;
+        }
+      }
+    }
+  }
+
+  return ConstantBusUseCount <= 1;
+}
+
 bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                               OperandVector &Operands,
                                               MCStreamer &Out,
@@ -1781,6 +2135,10 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   switch (Result) {
   default: break;
   case Match_Success:
+    if (!validateOperandLimitations(Inst)) {
+      return Error(IDLoc,
+                   "invalid operand (violates constant bus restrictions)");
+    }
     Inst.setLoc(IDLoc);
     Out.EmitInstruction(Inst, getSTI());
     return false;
@@ -1859,9 +2217,10 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
   // If this directive has no arguments, then use the ISA version for the
   // targeted GPU.
   if (getLexer().is(AsmToken::EndOfStatement)) {
-    AMDGPU::IsaVersion Isa = AMDGPU::getIsaVersion(getSTI().getFeatureBits());
-    getTargetStreamer().EmitDirectiveHSACodeObjectISA(Isa.Major, Isa.Minor,
-                                                      Isa.Stepping,
+    AMDGPU::IsaInfo::IsaVersion ISA =
+        AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
+    getTargetStreamer().EmitDirectiveHSACodeObjectISA(ISA.Major, ISA.Minor,
+                                                      ISA.Stepping,
                                                       "AMD", "AMDGPU");
     return false;
   }
@@ -1901,42 +2260,45 @@ bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() {
   return false;
 }
 
-bool AMDGPUAsmParser::ParseDirectiveRuntimeMetadata() {
-  std::string Metadata;
-  raw_string_ostream MS(Metadata);
+bool AMDGPUAsmParser::ParseDirectiveCodeObjectMetadata() {
+  std::string YamlString;
+  raw_string_ostream YamlStream(YamlString);
 
   getLexer().setSkipSpace(false);
 
   bool FoundEnd = false;
   while (!getLexer().is(AsmToken::Eof)) {
     while (getLexer().is(AsmToken::Space)) {
-      MS << ' ';
+      YamlStream << getLexer().getTok().getString();
       Lex();
     }
 
     if (getLexer().is(AsmToken::Identifier)) {
       StringRef ID = getLexer().getTok().getIdentifier();
-      if (ID == ".end_amdgpu_runtime_metadata") {
+      if (ID == AMDGPU::CodeObject::MetadataAssemblerDirectiveEnd) {
         Lex();
         FoundEnd = true;
         break;
       }
     }
 
-    MS << Parser.parseStringToEndOfStatement()
-       << getContext().getAsmInfo()->getSeparatorString();
+    YamlStream << Parser.parseStringToEndOfStatement()
+               << getContext().getAsmInfo()->getSeparatorString();
 
     Parser.eatToEndOfStatement();
   }
 
   getLexer().setSkipSpace(true);
 
-  if (getLexer().is(AsmToken::Eof) && !FoundEnd)
-    return TokError("expected directive .end_amdgpu_runtime_metadata not found");
+  if (getLexer().is(AsmToken::Eof) && !FoundEnd) {
+    return TokError(
+        "expected directive .end_amdgpu_code_object_metadata not found");
+  }
 
-  MS.flush();
+  YamlStream.flush();
 
-  getTargetStreamer().EmitRuntimeMetadata(Metadata);
+  if (!getTargetStreamer().EmitCodeObjectMetadata(YamlString))
+    return Error(getParser().getTok().getLoc(), "invalid code object metadata");
 
   return false;
 }
@@ -1954,7 +2316,7 @@ bool AMDGPUAsmParser::ParseAMDKernelCodeTValue(StringRef ID,
 
 bool AMDGPUAsmParser::ParseDirectiveAMDKernelCodeT() {
   amd_kernel_code_t Header;
-  AMDGPU::initDefaultAMDKernelCodeT(Header, getSTI().getFeatureBits());
+  AMDGPU::initDefaultAMDKernelCodeT(Header, getFeatureBits());
 
   while (true) {
     // Lex EndOfStatement.  This is in a while loop, because lexing a comment
@@ -2048,8 +2410,8 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) {
   if (IDVal == ".hsa_code_object_isa")
     return ParseDirectiveHSACodeObjectISA();
 
-  if (IDVal == ".amdgpu_runtime_metadata")
-    return ParseDirectiveRuntimeMetadata();
+  if (IDVal == AMDGPU::CodeObject::MetadataAssemblerDirectiveBegin)
+    return ParseDirectiveCodeObjectMetadata();
 
   if (IDVal == ".amd_kernel_code_t")
     return ParseDirectiveAMDKernelCodeT();
@@ -2235,7 +2597,7 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, int64_t &Int) {
 
 OperandMatchResultTy
 AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
-                                    enum AMDGPUOperand::ImmTy ImmTy,
+                                    AMDGPUOperand::ImmTy ImmTy,
                                     bool (*ConvertResult)(int64_t&)) {
   SMLoc S = Parser.getTok().getLoc();
   int64_t Value = 0;
@@ -2252,9 +2614,59 @@ AMDGPUAsmParser::parseIntWithPrefix(const char *Prefix, OperandVector &Operands,
   return MatchOperand_Success;
 }
 
+OperandMatchResultTy AMDGPUAsmParser::parseOperandArrayWithPrefix(
+  const char *Prefix,
+  OperandVector &Operands,
+  AMDGPUOperand::ImmTy ImmTy,
+  bool (*ConvertResult)(int64_t&)) {
+  StringRef Name = Parser.getTok().getString();
+  if (!Name.equals(Prefix))
+    return MatchOperand_NoMatch;
+
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::Colon))
+    return MatchOperand_ParseFail;
+
+  Parser.Lex();
+  if (getLexer().isNot(AsmToken::LBrac))
+    return MatchOperand_ParseFail;
+  Parser.Lex();
+
+  unsigned Val = 0;
+  SMLoc S = Parser.getTok().getLoc();
+
+  // FIXME: How to verify the number of elements matches the number of src
+  // operands?
+  for (int I = 0; I < 3; ++I) {
+    if (I != 0) {
+      if (getLexer().is(AsmToken::RBrac))
+        break;
+
+      if (getLexer().isNot(AsmToken::Comma))
+        return MatchOperand_ParseFail;
+      Parser.Lex();
+    }
+
+    if (getLexer().isNot(AsmToken::Integer))
+      return MatchOperand_ParseFail;
+
+    int64_t Op;
+    if (getParser().parseAbsoluteExpression(Op))
+      return MatchOperand_ParseFail;
+
+    if (Op != 0 && Op != 1)
+      return MatchOperand_ParseFail;
+    Val |= (Op << I);
+  }
+
+  Parser.Lex();
+  Operands.push_back(AMDGPUOperand::CreateImm(this, Val, S, ImmTy));
+  return MatchOperand_Success;
+}
+
 OperandMatchResultTy
 AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
-                               enum AMDGPUOperand::ImmTy ImmTy) {
+                               AMDGPUOperand::ImmTy ImmTy) {
   int64_t Bit = 0;
   SMLoc S = Parser.getTok().getLoc();
 
@@ -2284,11 +2696,11 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
   return MatchOperand_Success;
 }
 
-typedef std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalImmIndexMap;
-
-void addOptionalImmOperand(MCInst& Inst, const OperandVector& Operands,
-                           OptionalImmIndexMap& OptionalIdx,
-                           enum AMDGPUOperand::ImmTy ImmT, int64_t Default = 0) {
+static void addOptionalImmOperand(
+  MCInst& Inst, const OperandVector& Operands,
+  AMDGPUAsmParser::OptionalImmIndexMap& OptionalIdx,
+  AMDGPUOperand::ImmTy ImmT,
+  int64_t Default = 0) {
   auto i = OptionalIdx.find(ImmT);
   if (i != OptionalIdx.end()) {
     unsigned Idx = i->second;
@@ -2350,9 +2762,9 @@ void AMDGPUAsmParser::cvtDSOffset01(MCInst &Inst,
   Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0
 }
 
-void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) {
-  std::map<enum AMDGPUOperand::ImmTy, unsigned> OptionalIdx;
-  bool GDSOnly = false;
+void AMDGPUAsmParser::cvtDSImpl(MCInst &Inst, const OperandVector &Operands,
+                                bool IsGdsHardcoded) {
+  OptionalImmIndexMap OptionalIdx;
 
   for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
     AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
@@ -2364,7 +2776,7 @@ void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) {
     }
 
     if (Op.isToken() && Op.getToken() == "gds") {
-      GDSOnly = true;
+      IsGdsHardcoded = true;
       continue;
     }
 
@@ -2373,7 +2785,7 @@ void AMDGPUAsmParser::cvtDS(MCInst &Inst, const OperandVector &Operands) {
   }
 
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOffset);
-  if (!GDSOnly) {
+  if (!IsGdsHardcoded) {
     addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGDS);
   }
   Inst.addOperand(MCOperand::createReg(AMDGPU::M0)); // m0
@@ -2446,13 +2858,14 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
   if (getLexer().is(AsmToken::Amp) || getLexer().is(AsmToken::Comma))
     Parser.Lex();
 
-  IsaVersion IV = getIsaVersion(getSTI().getFeatureBits());
+  AMDGPU::IsaInfo::IsaVersion ISA =
+      AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
   if (CntName == "vmcnt")
-    IntVal = encodeVmcnt(IV, IntVal, CntVal);
+    IntVal = encodeVmcnt(ISA, IntVal, CntVal);
   else if (CntName == "expcnt")
-    IntVal = encodeExpcnt(IV, IntVal, CntVal);
+    IntVal = encodeExpcnt(ISA, IntVal, CntVal);
   else if (CntName == "lgkmcnt")
-    IntVal = encodeLgkmcnt(IV, IntVal, CntVal);
+    IntVal = encodeLgkmcnt(ISA, IntVal, CntVal);
   else
     return true;
 
@@ -2461,8 +2874,9 @@ bool AMDGPUAsmParser::parseCnt(int64_t &IntVal) {
 
 OperandMatchResultTy
 AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
-  IsaVersion IV = getIsaVersion(getSTI().getFeatureBits());
-  int64_t Waitcnt = getWaitcntBitMask(IV);
+  AMDGPU::IsaInfo::IsaVersion ISA =
+      AMDGPU::IsaInfo::getIsaVersion(getFeatureBits());
+  int64_t Waitcnt = getWaitcntBitMask(ISA);
   SMLoc S = Parser.getTok().getLoc();
 
   switch(getLexer().getKind()) {
@@ -2484,7 +2898,8 @@ AMDGPUAsmParser::parseSWaitCntOps(OperandVector &Operands) {
   return MatchOperand_Success;
 }
 
-bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset, int64_t &Width) {
+bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset,
+                                          int64_t &Width) {
   using namespace llvm::AMDGPU::Hwreg;
 
   if (Parser.getTok().getString() != "hwreg")
@@ -2545,8 +2960,7 @@ bool AMDGPUAsmParser::parseHwregConstruct(OperandInfoTy &HwReg, int64_t &Offset,
   return false;
 }
 
-OperandMatchResultTy
-AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
+OperandMatchResultTy AMDGPUAsmParser::parseHwreg(OperandVector &Operands) {
   using namespace llvm::AMDGPU::Hwreg;
 
   int64_t Imm16Val = 0;
@@ -3195,6 +3609,10 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
   {"src1_sel",   AMDGPUOperand::ImmTySdwaSrc1Sel, false, nullptr},
   {"dst_unused", AMDGPUOperand::ImmTySdwaDstUnused, false, nullptr},
   {"vm", AMDGPUOperand::ImmTyExpVM, true, nullptr},
+  {"op_sel", AMDGPUOperand::ImmTyOpSel, false, nullptr},
+  {"op_sel_hi", AMDGPUOperand::ImmTyOpSelHi, false, nullptr},
+  {"neg_lo", AMDGPUOperand::ImmTyNegLo, false, nullptr},
+  {"neg_hi", AMDGPUOperand::ImmTyNegHi, false, nullptr}
 };
 
 OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operands) {
@@ -3211,6 +3629,12 @@ OperandMatchResultTy AMDGPUAsmParser::parseOptionalOperand(OperandVector &Operan
       res = parseSDWASel(Operands, Op.Name, Op.Type);
     } else if (Op.Type == AMDGPUOperand::ImmTySdwaDstUnused) {
       res = parseSDWADstUnused(Operands);
+    } else if (Op.Type == AMDGPUOperand::ImmTyOpSel ||
+               Op.Type == AMDGPUOperand::ImmTyOpSelHi ||
+               Op.Type == AMDGPUOperand::ImmTyNegLo ||
+               Op.Type == AMDGPUOperand::ImmTyNegHi) {
+      res = parseOperandArrayWithPrefix(Op.Name, Operands, Op.Type,
+                                        Op.ConvertResult);
     } else {
       res = parseIntWithPrefix(Op.Name, Operands, Op.Type, Op.ConvertResult);
     }
@@ -3266,8 +3690,8 @@ static bool isRegOrImmWithInputMods(const MCInstrDesc &Desc, unsigned OpNum) {
       && Desc.getOperandConstraint(OpNum + 1, MCOI::OperandConstraint::TIED_TO) == -1;
 }
 
-void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
-  OptionalImmIndexMap OptionalIdx;
+void AMDGPUAsmParser::cvtVOP3Impl(MCInst &Inst, const OperandVector &Operands,
+                                  OptionalImmIndexMap &OptionalIdx) {
   unsigned I = 1;
   const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
   for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
@@ -3278,12 +3702,20 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
     AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
     if (isRegOrImmWithInputMods(Desc, Inst.getNumOperands())) {
       Op.addRegOrImmWithFPInputModsOperands(Inst, 2);
-    } else if (Op.isImm()) {
+    } else if (Op.isImmModifier()) {
       OptionalIdx[Op.getImmTy()] = I;
+    } else if (Op.isRegOrImm()) {
+      Op.addRegOrImmOperands(Inst, 1);
     } else {
       llvm_unreachable("unhandled operand type");
     }
   }
+}
+
+void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
+  OptionalImmIndexMap OptionalIdx;
+
+  cvtVOP3Impl(Inst, Operands, OptionalIdx);
 
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
   addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
@@ -3308,6 +3740,96 @@ void AMDGPUAsmParser::cvtVOP3(MCInst &Inst, const OperandVector &Operands) {
   }
 }
 
+void AMDGPUAsmParser::cvtVOP3OMod(MCInst &Inst, const OperandVector &Operands) {
+  OptionalImmIndexMap OptionalIdx;
+
+  unsigned I = 1;
+  const MCInstrDesc &Desc = MII.get(Inst.getOpcode());
+  for (unsigned J = 0; J < Desc.getNumDefs(); ++J) {
+    ((AMDGPUOperand &)*Operands[I++]).addRegOperands(Inst, 1);
+  }
+
+  for (unsigned E = Operands.size(); I != E; ++I) {
+    AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[I]);
+    if (Op.isMod()) {
+      OptionalIdx[Op.getImmTy()] = I;
+    } else {
+      Op.addRegOrImmOperands(Inst, 1);
+    }
+  }
+
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyClampSI);
+  addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyOModSI);
+}
+
+void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) {
+  OptionalImmIndexMap OptIdx;
+
+  cvtVOP3Impl(Inst, Operands, OptIdx);
+
+  // FIXME: This is messy. Parse the modifiers as if it was a normal VOP3
+  // instruction, and then figure out where to actually put the modifiers
+  int Opc = Inst.getOpcode();
+
+  if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::clamp) != -1) {
+    addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyClampSI);
+  }
+
+  addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSel);
+  addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyOpSelHi, -1);
+
+  int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo);
+  if (NegLoIdx != -1) {
+    addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo);
+    addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi);
+  }
+
+  const int Ops[] = { AMDGPU::OpName::src0,
+                      AMDGPU::OpName::src1,
+                      AMDGPU::OpName::src2 };
+  const int ModOps[] = { AMDGPU::OpName::src0_modifiers,
+                         AMDGPU::OpName::src1_modifiers,
+                         AMDGPU::OpName::src2_modifiers };
+
+  int OpSelIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel);
+  int OpSelHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::op_sel_hi);
+
+  unsigned OpSel = Inst.getOperand(OpSelIdx).getImm();
+  unsigned OpSelHi = Inst.getOperand(OpSelHiIdx).getImm();
+  unsigned NegLo = 0;
+  unsigned NegHi = 0;
+
+  if (NegLoIdx != -1) {
+    int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi);
+    NegLo = Inst.getOperand(NegLoIdx).getImm();
+    NegHi = Inst.getOperand(NegHiIdx).getImm();
+  }
+
+  for (int J = 0; J < 3; ++J) {
+    int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]);
+    if (OpIdx == -1)
+      break;
+
+    uint32_t ModVal = 0;
+
+    if ((OpSel & (1 << J)) != 0)
+      ModVal |= SISrcMods::OP_SEL_0;
+
+    if ((OpSelHi & (1 << J)) != 0)
+      ModVal |= SISrcMods::OP_SEL_1;
+
+    if ((NegLo & (1 << J)) != 0)
+      ModVal |= SISrcMods::NEG;
+
+    if ((NegHi & (1 << J)) != 0)
+      ModVal |= SISrcMods::NEG_HI;
+
+    int ModIdx = AMDGPU::getNamedOperandIdx(Opc, ModOps[J]);
+
+    Inst.getOperand(ModIdx).setImm(ModVal);
+  }
+}
+
 //===----------------------------------------------------------------------===//
 // dpp
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/AMDGPU/BUFInstructions.td b/lib/Target/AMDGPU/BUFInstructions.td
index 45a7fe6d3439587707cf12becb61c2cc0ef7d6a5..a6609f0725ab6a8fea26984ff779c1e6bc679979 100644
--- a/lib/Target/AMDGPU/BUFInstructions.td
+++ b/lib/Target/AMDGPU/BUFInstructions.td
@@ -21,8 +21,8 @@ def MUBUFIntrinsicVOffset : ComplexPattern<i32, 3, "SelectMUBUFIntrinsicVOffset"
 class MubufLoad <SDPatternOperator op> : PatFrag <
   (ops node:$ptr), (op node:$ptr), [{
   auto const AS = cast<MemSDNode>(N)->getAddressSpace();
-  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
-         AS == AMDGPUAS::CONSTANT_ADDRESS;
+  return AS == AMDGPUASI.GLOBAL_ADDRESS ||
+         AS == AMDGPUASI.CONSTANT_ADDRESS;
 }]>;
 
 def mubuf_load          : MubufLoad <load>;
@@ -705,12 +705,6 @@ def BUFFER_WBINVL1_VOL : MUBUF_Invalidate <"buffer_wbinvl1_vol",
 
 let Predicates = [isGCN] in {
 
-// int_SI_vs_load_input
-def : Pat<
-  (SIload_input v4i32:$tlst, imm:$attr_offset, i32:$buf_idx_vgpr),
-  (BUFFER_LOAD_FORMAT_XYZW_IDXEN $buf_idx_vgpr, $tlst, (i32 0), imm:$attr_offset, 0, 0, 0)
->;
-
 // Offset in an 32-bit VGPR
 def : Pat <
   (SIload_constant v4i32:$sbase, i32:$voff),
diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt
index 65853bb6a51a1c6d5c79a2710b35de81e1d45361..7c0ef4aeac3c7105937557856cce146bf795ab62 100644
--- a/lib/Target/AMDGPU/CMakeLists.txt
+++ b/lib/Target/AMDGPU/CMakeLists.txt
@@ -36,6 +36,7 @@ endif()
 
 add_llvm_target(AMDGPUCodeGen
   AMDILCFGStructurizer.cpp
+  AMDGPUAliasAnalysis.cpp
   AMDGPUAlwaysInlinePass.cpp
   AMDGPUAnnotateKernelFeatures.cpp
   AMDGPUAnnotateUniformValues.cpp
@@ -45,6 +46,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUTargetObjectFile.cpp
   AMDGPUIntrinsicInfo.cpp
   AMDGPUISelDAGToDAG.cpp
+  AMDGPULowerIntrinsics.cpp
   AMDGPUMCInstLower.cpp
   AMDGPUMachineFunction.cpp
   AMDGPUUnifyMetadata.cpp
@@ -56,6 +58,7 @@ add_llvm_target(AMDGPUCodeGen
   AMDGPUInstrInfo.cpp
   AMDGPUPromoteAlloca.cpp
   AMDGPURegisterInfo.cpp
+  AMDGPUUnifyDivergentExitNodes.cpp
   GCNHazardRecognizer.cpp
   GCNSchedStrategy.cpp
   R600ClauseMergePass.cpp
@@ -79,6 +82,7 @@ add_llvm_target(AMDGPUCodeGen
   SIFrameLowering.cpp
   SIInsertSkips.cpp
   SIInsertWaits.cpp
+  SIInsertWaitcnts.cpp
   SIInstrInfo.cpp
   SIISelLowering.cpp
   SILoadStoreOptimizer.cpp
@@ -87,10 +91,14 @@ add_llvm_target(AMDGPUCodeGen
   SIMachineFunctionInfo.cpp
   SIMachineScheduler.cpp
   SIOptimizeExecMasking.cpp
+  SIPeepholeSDWA.cpp
   SIRegisterInfo.cpp
   SIShrinkInstructions.cpp
   SITypeRewriter.cpp
   SIWholeQuadMode.cpp
+  GCNIterativeScheduler.cpp
+  GCNMinRegStrategy.cpp
+  GCNRegPressure.cpp
   ${GLOBAL_ISEL_BUILD_FILES}
   )
 
diff --git a/lib/Target/AMDGPU/DSInstructions.td b/lib/Target/AMDGPU/DSInstructions.td
index a077001df6bd18c8cc706c16e40bea9d8a3a1e31..65dcd27ae7a026386fa89de82b2fc932b0a24cfb 100644
--- a/lib/Target/AMDGPU/DSInstructions.td
+++ b/lib/Target/AMDGPU/DSInstructions.td
@@ -143,6 +143,20 @@ class DS_1A2D_RET<string opName,
   let hasPostISelHook = 1;
 }
 
+class DS_1A2D_Off8_RET<string opName,
+                       RegisterClass rc = VGPR_32,
+                       RegisterClass src = rc>
+: DS_Pseudo<opName,
+  (outs rc:$vdst),
+  (ins VGPR_32:$addr, src:$data0, src:$data1, offset0:$offset0, offset1:$offset1, gds:$gds),
+  "$vdst, $addr, $data0, $data1$offset0$offset1$gds"> {
+
+  let has_offset = 0;
+  let AsmMatchConverter = "cvtDSOffset01";
+
+  let hasPostISelHook = 1;
+}
+
 class DS_1A_RET<string opName, RegisterClass rc = VGPR_32>
 : DS_Pseudo<opName,
   (outs rc:$vdst),
@@ -174,6 +188,7 @@ class DS_1A_RET_GDS <string opName> : DS_Pseudo<opName,
   let has_data1 = 0;
   let has_gds = 0;
   let gdsValue = 1;
+  let AsmMatchConverter = "cvtDSGds";
 }
 
 class DS_0A_RET <string opName> : DS_Pseudo<opName,
@@ -202,20 +217,46 @@ class DS_1A <string opName> : DS_Pseudo<opName,
   let has_data1 = 0;
 }
 
-class DS_1A_GDS <string opName> : DS_Pseudo<opName,
-  (outs),
-  (ins VGPR_32:$addr),
-  "$addr gds"> {
+class DS_GWS <string opName, dag ins, string asmOps>
+: DS_Pseudo<opName, (outs), ins, asmOps> {
+
+  let has_vdst  = 0;
+  let has_addr  = 0;
+  let has_data0 = 0;
+  let has_data1 = 0;
+
+  let has_gds   = 0;
+  let gdsValue  = 1;
+  let AsmMatchConverter = "cvtDSGds";
+}
+
+class DS_GWS_0D <string opName>
+: DS_GWS<opName,
+  (ins offset:$offset, gds:$gds), "$offset gds">;
 
-  let has_vdst    = 0;
-  let has_data0   = 0;
-  let has_data1   = 0;
-  let has_offset  = 0;
+class DS_GWS_1D <string opName>
+: DS_GWS<opName,
+  (ins VGPR_32:$data0, offset:$offset, gds:$gds), "$data0$offset gds"> {
+
+  let has_data0 = 1;
+}
+
+class DS_VOID <string opName> : DS_Pseudo<opName,
+  (outs), (ins), ""> {
+  let mayLoad = 0;
+  let mayStore = 0;
+  let hasSideEffects = 1;
+  let UseNamedOperandTable = 0;
+  let AsmMatchConverter = "";
+
+  let has_vdst = 0;
+  let has_addr = 0;
+  let has_data0 = 0;
+  let has_data1 = 0;
+  let has_offset = 0;
   let has_offset0 = 0;
   let has_offset1 = 0;
-
-  let has_gds     = 0;
-  let gdsValue    = 1;
+  let has_gds = 0;
 }
 
 class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag>
@@ -226,6 +267,8 @@ class DS_1A1D_PERMUTE <string opName, SDPatternOperator node = null_frag>
   [(set i32:$vdst,
    (node (DS1Addr1Offset i32:$addr, i16:$offset), i32:$data0))] > {
 
+  let LGKM_CNT = 0;
+
   let mayLoad = 0;
   let mayStore = 0;
   let isConvergent = 1;
@@ -324,9 +367,9 @@ def DS_MAX_RTN_F32    : DS_1A1D_RET <"ds_max_rtn_f32">,
 
 def DS_WRXCHG_RTN_B32      : DS_1A1D_RET<"ds_wrxchg_rtn_b32">,
                              AtomicNoRet<"", 1>;
-def DS_WRXCHG2_RTN_B32     : DS_1A2D_RET<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>,
+def DS_WRXCHG2_RTN_B32     : DS_1A2D_Off8_RET<"ds_wrxchg2_rtn_b32", VReg_64, VGPR_32>,
                              AtomicNoRet<"", 1>;
-def DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_RET<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>,
+def DS_WRXCHG2ST64_RTN_B32 : DS_1A2D_Off8_RET<"ds_wrxchg2st64_rtn_b32", VReg_64, VGPR_32>,
                              AtomicNoRet<"", 1>;
 
 def DS_ADD_RTN_U64    : DS_1A1D_RET<"ds_add_rtn_u64", VReg_64>,
@@ -365,17 +408,17 @@ def DS_MAX_RTN_F64    : DS_1A1D_RET<"ds_max_rtn_f64", VReg_64>,
                         AtomicNoRet<"ds_max_f64", 1>;
 
 def DS_WRXCHG_RTN_B64      : DS_1A1D_RET<"ds_wrxchg_rtn_b64", VReg_64>,
-                             AtomicNoRet<"ds_wrxchg_b64", 1>;
-def DS_WRXCHG2_RTN_B64     : DS_1A2D_RET<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>,
-                             AtomicNoRet<"ds_wrxchg2_b64", 1>;
-def DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_RET<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>,
-                             AtomicNoRet<"ds_wrxchg2st64_b64", 1>;
-
-def DS_GWS_INIT       : DS_1A_GDS<"ds_gws_init">;
-def DS_GWS_SEMA_V     : DS_1A_GDS<"ds_gws_sema_v">;
-def DS_GWS_SEMA_BR    : DS_1A_GDS<"ds_gws_sema_br">;
-def DS_GWS_SEMA_P     : DS_1A_GDS<"ds_gws_sema_p">;
-def DS_GWS_BARRIER    : DS_1A_GDS<"ds_gws_barrier">;
+                             AtomicNoRet<"", 1>;
+def DS_WRXCHG2_RTN_B64     : DS_1A2D_Off8_RET<"ds_wrxchg2_rtn_b64", VReg_128, VReg_64>,
+                             AtomicNoRet<"", 1>;
+def DS_WRXCHG2ST64_RTN_B64 : DS_1A2D_Off8_RET<"ds_wrxchg2st64_rtn_b64", VReg_128, VReg_64>,
+                             AtomicNoRet<"", 1>;
+
+def DS_GWS_INIT       : DS_GWS_1D<"ds_gws_init">;
+def DS_GWS_SEMA_V     : DS_GWS_0D<"ds_gws_sema_v">;
+def DS_GWS_SEMA_BR    : DS_GWS_1D<"ds_gws_sema_br">;
+def DS_GWS_SEMA_P     : DS_GWS_0D<"ds_gws_sema_p">;
+def DS_GWS_BARRIER    : DS_GWS_1D<"ds_gws_barrier">;
 
 def DS_ADD_SRC2_U32   : DS_1A<"ds_add_src2_u32">;
 def DS_SUB_SRC2_U32   : DS_1A<"ds_sub_src2_u32">;
@@ -386,7 +429,7 @@ def DS_MIN_SRC2_I32   : DS_1A<"ds_min_src2_i32">;
 def DS_MAX_SRC2_I32   : DS_1A<"ds_max_src2_i32">;
 def DS_MIN_SRC2_U32   : DS_1A<"ds_min_src2_u32">;
 def DS_MAX_SRC2_U32   : DS_1A<"ds_max_src2_u32">;
-def DS_AND_SRC2_B32   : DS_1A<"ds_and_src_b32">;
+def DS_AND_SRC2_B32   : DS_1A<"ds_and_src2_b32">;
 def DS_OR_SRC2_B32    : DS_1A<"ds_or_src2_b32">;
 def DS_XOR_SRC2_B32   : DS_1A<"ds_xor_src2_b32">;
 def DS_MIN_SRC2_F32   : DS_1A<"ds_min_src2_f32">;
@@ -429,30 +472,34 @@ def DS_READ2_B64     : DS_1A_Off8_RET<"ds_read2_b64", VReg_128>;
 def DS_READ2ST64_B64 : DS_1A_Off8_RET<"ds_read2st64_b64", VReg_128>;
 }
 
-let SubtargetPredicate = isSICI in {
 def DS_CONSUME       : DS_0A_RET<"ds_consume">;
 def DS_APPEND        : DS_0A_RET<"ds_append">;
 def DS_ORDERED_COUNT : DS_1A_RET_GDS<"ds_ordered_count">;
-}
 
 //===----------------------------------------------------------------------===//
 // Instruction definitions for CI and newer.
 //===----------------------------------------------------------------------===//
-// Remaining instructions:
-// DS_NOP
-// DS_GWS_SEMA_RELEASE_ALL
-// DS_WRAP_RTN_B32
-// DS_CNDXCHG32_RTN_B64
-// DS_WRITE_B96
-// DS_WRITE_B128
-// DS_CONDXCHG32_RTN_B128
-// DS_READ_B96
-// DS_READ_B128
 
 let SubtargetPredicate = isCIVI in {
 
-def DS_WRAP_RTN_F32 : DS_1A1D_RET <"ds_wrap_rtn_f32">,
-                      AtomicNoRet<"ds_wrap_f32", 1>;
+def DS_WRAP_RTN_B32 : DS_1A2D_RET<"ds_wrap_rtn_b32">, AtomicNoRet<"", 1>;
+
+def DS_CONDXCHG32_RTN_B64 : DS_1A1D_RET<"ds_condxchg32_rtn_b64", VReg_64>,
+                            AtomicNoRet<"", 1>;
+
+def DS_GWS_SEMA_RELEASE_ALL : DS_GWS_0D<"ds_gws_sema_release_all">;
+
+let mayStore = 0 in {
+def DS_READ_B96 : DS_1A_RET<"ds_read_b96", VReg_96>;
+def DS_READ_B128: DS_1A_RET<"ds_read_b128", VReg_128>;
+} // End mayStore = 0
+
+let mayLoad = 0 in {
+def DS_WRITE_B96 : DS_1A1D_NORET<"ds_write_b96", VReg_96>;
+def DS_WRITE_B128 : DS_1A1D_NORET<"ds_write_b128", VReg_128>;
+} // End mayLoad = 0
+
+def DS_NOP : DS_VOID<"ds_nop">;
 
 } // let SubtargetPredicate = isCIVI
 
@@ -623,6 +670,7 @@ def DS_CMPST_B32_si       : DS_Real_si<0x10, DS_CMPST_B32>;
 def DS_CMPST_F32_si       : DS_Real_si<0x11, DS_CMPST_F32>;
 def DS_MIN_F32_si         : DS_Real_si<0x12, DS_MIN_F32>;
 def DS_MAX_F32_si         : DS_Real_si<0x13, DS_MAX_F32>;
+def DS_NOP_si             : DS_Real_si<0x14, DS_NOP>;
 def DS_GWS_INIT_si        : DS_Real_si<0x19, DS_GWS_INIT>;
 def DS_GWS_SEMA_V_si      : DS_Real_si<0x1a, DS_GWS_SEMA_V>;
 def DS_GWS_SEMA_BR_si     : DS_Real_si<0x1b, DS_GWS_SEMA_BR>;
@@ -651,8 +699,10 @@ def DS_CMPST_RTN_F32_si   : DS_Real_si<0x31, DS_CMPST_RTN_F32>;
 def DS_MIN_RTN_F32_si     : DS_Real_si<0x32, DS_MIN_RTN_F32>;
 def DS_MAX_RTN_F32_si     : DS_Real_si<0x33, DS_MAX_RTN_F32>;
 
-// FIXME: this instruction is actually CI/VI
-def DS_WRAP_RTN_F32_si    : DS_Real_si<0x34, DS_WRAP_RTN_F32>;
+// These instructions are CI/VI only
+def DS_WRAP_RTN_B32_si    : DS_Real_si<0x34, DS_WRAP_RTN_B32>;
+def DS_CONDXCHG32_RTN_B64_si   : DS_Real_si<0x7e, DS_CONDXCHG32_RTN_B64>;
+def DS_GWS_SEMA_RELEASE_ALL_si : DS_Real_si<0x18, DS_GWS_SEMA_RELEASE_ALL>;
 
 def DS_SWIZZLE_B32_si     : DS_Real_si<0x35, DS_SWIZZLE_B32>;
 def DS_READ_B32_si        : DS_Real_si<0x36, DS_READ_B32>;
@@ -744,6 +794,10 @@ def DS_WRITE_SRC2_B64_si  : DS_Real_si<0xcd, DS_WRITE_SRC2_B64>;
 
 def DS_MIN_SRC2_F64_si    : DS_Real_si<0xd2, DS_MIN_SRC2_F64>;
 def DS_MAX_SRC2_F64_si    : DS_Real_si<0xd3, DS_MAX_SRC2_F64>;
+def DS_WRITE_B96_si       : DS_Real_si<0xde, DS_WRITE_B96>;
+def DS_WRITE_B128_si      : DS_Real_si<0xdf, DS_WRITE_B128>;
+def DS_READ_B96_si        : DS_Real_si<0xfe, DS_READ_B96>;
+def DS_READ_B128_si       : DS_Real_si<0xff, DS_READ_B128>;
 
 //===----------------------------------------------------------------------===//
 // VIInstructions.td
@@ -787,12 +841,13 @@ def DS_CMPST_B32_vi       : DS_Real_vi<0x10, DS_CMPST_B32>;
 def DS_CMPST_F32_vi       : DS_Real_vi<0x11, DS_CMPST_F32>;
 def DS_MIN_F32_vi         : DS_Real_vi<0x12, DS_MIN_F32>;
 def DS_MAX_F32_vi         : DS_Real_vi<0x13, DS_MAX_F32>;
+def DS_NOP_vi             : DS_Real_vi<0x14, DS_NOP>;
 def DS_ADD_F32_vi         : DS_Real_vi<0x15, DS_ADD_F32>;
-def DS_GWS_INIT_vi        : DS_Real_vi<0x19, DS_GWS_INIT>;
-def DS_GWS_SEMA_V_vi      : DS_Real_vi<0x1a, DS_GWS_SEMA_V>;
-def DS_GWS_SEMA_BR_vi     : DS_Real_vi<0x1b, DS_GWS_SEMA_BR>;
-def DS_GWS_SEMA_P_vi      : DS_Real_vi<0x1c, DS_GWS_SEMA_P>;
-def DS_GWS_BARRIER_vi     : DS_Real_vi<0x1d, DS_GWS_BARRIER>;
+def DS_GWS_INIT_vi        : DS_Real_vi<0x99, DS_GWS_INIT>;
+def DS_GWS_SEMA_V_vi      : DS_Real_vi<0x9a, DS_GWS_SEMA_V>;
+def DS_GWS_SEMA_BR_vi     : DS_Real_vi<0x9b, DS_GWS_SEMA_BR>;
+def DS_GWS_SEMA_P_vi      : DS_Real_vi<0x9c, DS_GWS_SEMA_P>;
+def DS_GWS_BARRIER_vi     : DS_Real_vi<0x9d, DS_GWS_BARRIER>;
 def DS_WRITE_B8_vi        : DS_Real_vi<0x1e, DS_WRITE_B8>;
 def DS_WRITE_B16_vi       : DS_Real_vi<0x1f, DS_WRITE_B16>;
 def DS_ADD_RTN_U32_vi     : DS_Real_vi<0x20, DS_ADD_RTN_U32>;
@@ -815,7 +870,7 @@ def DS_CMPST_RTN_B32_vi   : DS_Real_vi<0x30, DS_CMPST_RTN_B32>;
 def DS_CMPST_RTN_F32_vi   : DS_Real_vi<0x31, DS_CMPST_RTN_F32>;
 def DS_MIN_RTN_F32_vi     : DS_Real_vi<0x32, DS_MIN_RTN_F32>;
 def DS_MAX_RTN_F32_vi     : DS_Real_vi<0x33, DS_MAX_RTN_F32>;
-def DS_WRAP_RTN_F32_vi    : DS_Real_vi<0x34, DS_WRAP_RTN_F32>;
+def DS_WRAP_RTN_B32_vi    : DS_Real_vi<0x34, DS_WRAP_RTN_B32>;
 def DS_ADD_RTN_F32_vi     : DS_Real_vi<0x35, DS_ADD_RTN_F32>;
 def DS_READ_B32_vi        : DS_Real_vi<0x36, DS_READ_B32>;
 def DS_READ2_B32_vi       : DS_Real_vi<0x37, DS_READ2_B32>;
@@ -824,6 +879,9 @@ def DS_READ_I8_vi         : DS_Real_vi<0x39, DS_READ_I8>;
 def DS_READ_U8_vi         : DS_Real_vi<0x3a, DS_READ_U8>;
 def DS_READ_I16_vi        : DS_Real_vi<0x3b, DS_READ_I16>;
 def DS_READ_U16_vi        : DS_Real_vi<0x3c, DS_READ_U16>;
+def DS_CONSUME_vi         : DS_Real_vi<0xbd, DS_CONSUME>;
+def DS_APPEND_vi          : DS_Real_vi<0xbe, DS_APPEND>;
+def DS_ORDERED_COUNT_vi   : DS_Real_vi<0xbf, DS_ORDERED_COUNT>;
 def DS_SWIZZLE_B32_vi     : DS_Real_vi<0x3d, DS_SWIZZLE_B32>;
 def DS_PERMUTE_B32_vi     : DS_Real_vi<0x3e, DS_PERMUTE_B32>;
 def DS_BPERMUTE_B32_vi    : DS_Real_vi<0x3f, DS_BPERMUTE_B32>;
@@ -865,6 +923,8 @@ def DS_MSKOR_RTN_B64_vi   : DS_Real_vi<0x6c, DS_MSKOR_RTN_B64>;
 def DS_WRXCHG_RTN_B64_vi  : DS_Real_vi<0x6d, DS_WRXCHG_RTN_B64>;
 def DS_WRXCHG2_RTN_B64_vi : DS_Real_vi<0x6e, DS_WRXCHG2_RTN_B64>;
 def DS_WRXCHG2ST64_RTN_B64_vi : DS_Real_vi<0x6f, DS_WRXCHG2ST64_RTN_B64>;
+def DS_CONDXCHG32_RTN_B64_vi   : DS_Real_vi<0x7e, DS_CONDXCHG32_RTN_B64>;
+def DS_GWS_SEMA_RELEASE_ALL_vi : DS_Real_vi<0x98, DS_GWS_SEMA_RELEASE_ALL>;
 def DS_CMPST_RTN_B64_vi   : DS_Real_vi<0x70, DS_CMPST_RTN_B64>;
 def DS_CMPST_RTN_F64_vi   : DS_Real_vi<0x71, DS_CMPST_RTN_F64>;
 def DS_MIN_RTN_F64_vi     : DS_Real_vi<0x72, DS_MIN_RTN_F64>;
@@ -904,3 +964,7 @@ def DS_XOR_SRC2_B64_vi    : DS_Real_vi<0xcb, DS_XOR_SRC2_B64>;
 def DS_WRITE_SRC2_B64_vi  : DS_Real_vi<0xcd, DS_WRITE_SRC2_B64>;
 def DS_MIN_SRC2_F64_vi    : DS_Real_vi<0xd2, DS_MIN_SRC2_F64>;
 def DS_MAX_SRC2_F64_vi    : DS_Real_vi<0xd3, DS_MAX_SRC2_F64>;
+def DS_WRITE_B96_vi       : DS_Real_vi<0xde, DS_WRITE_B96>;
+def DS_WRITE_B128_vi      : DS_Real_vi<0xdf, DS_WRITE_B128>;
+def DS_READ_B96_vi        : DS_Real_vi<0xfe, DS_READ_B96>;
+def DS_READ_B128_vi       : DS_Real_vi<0xff, DS_READ_B128>;
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 2247cad7bb517a2fff89df6cdc47819e44441ef4..4fb03b62bba9a50caf7151875dceed15e5dc4a5a 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -22,6 +22,7 @@
 #include "AMDGPURegisterInfo.h"
 #include "SIDefines.h"
 #include "Utils/AMDGPUBaseInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCFixedLenDisassembler.h"
@@ -97,9 +98,13 @@ static DecodeStatus decodeOperand_VSrc16(MCInst &Inst,
   return addOperand(Inst, DAsm->decodeOperand_VSrc16(Imm));
 }
 
-#define GET_SUBTARGETINFO_ENUM
-#include "AMDGPUGenSubtargetInfo.inc"
-#undef GET_SUBTARGETINFO_ENUM
+static DecodeStatus decodeOperand_VSrcV216(MCInst &Inst,
+                                         unsigned Imm,
+                                         uint64_t Addr,
+                                         const void *Decoder) {
+  auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+  return addOperand(Inst, DAsm->decodeOperand_VSrcV216(Imm));
+}
 
 #include "AMDGPUGenDisassemblerTables.inc"
 
@@ -138,7 +143,8 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
   CommentStream = &CS;
 
   // ToDo: AMDGPUDisassembler supports only VI ISA.
-  assert(AMDGPU::isVI(STI) && "Can disassemble only VI ISA.");
+  if (!STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding])
+    report_fatal_error("Disassembly not yet supported for subtarget");
 
   const unsigned MaxInstBytesNum = (std::min)((size_t)8, Bytes_.size());
   Bytes = Bytes_.slice(0, MaxInstBytesNum);
@@ -179,6 +185,17 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     Res = tryDecodeInst(DecoderTableAMDGPU64, MI, QW, Address);
   } while (false);
 
+  if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
+              MI.getOpcode() == AMDGPU::V_MAC_F32_e64_si ||
+              MI.getOpcode() == AMDGPU::V_MAC_F16_e64_vi)) {
+    // Insert dummy unused src2_modifiers.
+    int Src2ModIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(),
+                                                AMDGPU::OpName::src2_modifiers);
+    auto I = MI.begin();
+    std::advance(I, Src2ModIdx);
+    MI.insert(I, MCOperand::createImm(0));
+  }
+
   Size = Res ? (MaxInstBytesNum - Bytes.size()) : 0;
   return Res;
 }
@@ -263,6 +280,10 @@ MCOperand AMDGPUDisassembler::decodeOperand_VSrc16(unsigned Val) const {
   return decodeSrcOp(OPW16, Val);
 }
 
+MCOperand AMDGPUDisassembler::decodeOperand_VSrcV216(unsigned Val) const {
+  return decodeSrcOp(OPWV216, Val);
+}
+
 MCOperand AMDGPUDisassembler::decodeOperand_VGPR_32(unsigned Val) const {
   // Some instructions have operand restrictions beyond what the encoding
   // allows. Some ordinarily VSrc_32 operands are VGPR_32, so clear the extra
@@ -423,6 +444,7 @@ MCOperand AMDGPUDisassembler::decodeFPImmed(OpWidthTy Width, unsigned Imm) {
   case OPW64:
     return MCOperand::createImm(getInlineImmVal64(Imm));
   case OPW16:
+  case OPWV216:
     return MCOperand::createImm(getInlineImmVal16(Imm));
   default:
     llvm_unreachable("implement me");
@@ -436,6 +458,7 @@ unsigned AMDGPUDisassembler::getVgprClassId(const OpWidthTy Width) const {
   default: // fall
   case OPW32:
   case OPW16:
+  case OPWV216:
     return VGPR_32RegClassID;
   case OPW64: return VReg_64RegClassID;
   case OPW128: return VReg_128RegClassID;
@@ -449,6 +472,7 @@ unsigned AMDGPUDisassembler::getSgprClassId(const OpWidthTy Width) const {
   default: // fall
   case OPW32:
   case OPW16:
+  case OPWV216:
     return SGPR_32RegClassID;
   case OPW64: return SGPR_64RegClassID;
   case OPW128: return SGPR_128RegClassID;
@@ -462,6 +486,7 @@ unsigned AMDGPUDisassembler::getTtmpClassId(const OpWidthTy Width) const {
   default: // fall
   case OPW32:
   case OPW16:
+  case OPWV216:
     return TTMP_32RegClassID;
   case OPW64: return TTMP_64RegClassID;
   case OPW128: return TTMP_128RegClassID;
@@ -497,6 +522,7 @@ MCOperand AMDGPUDisassembler::decodeSrcOp(const OpWidthTy Width, unsigned Val) c
   switch (Width) {
   case OPW32:
   case OPW16:
+  case OPWV216:
     return decodeSpecialReg32(Val);
   case OPW64:
     return decodeSpecialReg64(Val);
@@ -522,6 +548,11 @@ MCOperand AMDGPUDisassembler::decodeSpecialReg32(unsigned Val) const {
   case 124: return createRegOperand(M0);
   case 126: return createRegOperand(EXEC_LO);
   case 127: return createRegOperand(EXEC_HI);
+  case 235: return createRegOperand(SRC_SHARED_BASE);
+  case 236: return createRegOperand(SRC_SHARED_LIMIT);
+  case 237: return createRegOperand(SRC_PRIVATE_BASE);
+  case 238: return createRegOperand(SRC_PRIVATE_LIMIT);
+    // TODO: SRC_POPS_EXITING_WAVE_ID
     // ToDo: no support for vccz register
   case 251: break;
     // ToDo: no support for execz register
diff --git a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
index ee5883a984e0caafc97b1c21543c9d3387e9ba28..d50665187e10ba49aba71a5b8d27c3efeade312c 100644
--- a/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
+++ b/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.h
@@ -67,6 +67,7 @@ public:
   MCOperand decodeOperand_VS_32(unsigned Val) const;
   MCOperand decodeOperand_VS_64(unsigned Val) const;
   MCOperand decodeOperand_VSrc16(unsigned Val) const;
+  MCOperand decodeOperand_VSrcV216(unsigned Val) const;
 
   MCOperand decodeOperand_VReg_64(unsigned Val) const;
   MCOperand decodeOperand_VReg_96(unsigned Val) const;
@@ -85,6 +86,7 @@ public:
     OPW64,
     OPW128,
     OPW16,
+    OPWV216,
     OPW_LAST_,
     OPW_FIRST_ = OPW32
   };
diff --git a/lib/Target/AMDGPU/EvergreenInstructions.td b/lib/Target/AMDGPU/EvergreenInstructions.td
index 10d32482a60871f3fb520c11eac7785c10cbb4f0..5480110d83153b329a6b5b526436cba8b0c3497f 100644
--- a/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -388,7 +388,7 @@ def BFE_INT_eg : R600_3OP <0x5, "BFE_INT",
   VecALU
 >;
 
-def : BFEPattern <BFE_UINT_eg, MOV_IMM_I32>;
+defm : BFEPattern <BFE_UINT_eg, BFE_INT_eg, MOV_IMM_I32>;
 
 def BFI_INT_eg : R600_3OP <0x06, "BFI_INT",
   [(set i32:$dst, (AMDGPUbfi i32:$src0, i32:$src1, i32:$src2))],
@@ -438,7 +438,7 @@ defm CUBE_eg : CUBE_Common<0xC0>;
 def ADDC_UINT : R600_2OP_Helper <0x52, "ADDC_UINT", AMDGPUcarry>;
 def SUBB_UINT : R600_2OP_Helper <0x53, "SUBB_UINT", AMDGPUborrow>;
 
-def FLT32_TO_FLT16 : R600_1OP_Helper <0xA2, "FLT32_TO_FLT16", fp_to_f16, VecALU>;
+def FLT32_TO_FLT16 : R600_1OP_Helper <0xA2, "FLT32_TO_FLT16", AMDGPUfp_to_f16, VecALU>;
 def FLT16_TO_FLT32 : R600_1OP_Helper <0xA3, "FLT16_TO_FLT32", f16_to_fp, VecALU>;
 def BCNT_INT : R600_1OP_Helper <0xAA, "BCNT_INT", ctpop, VecALU>;
 def FFBH_UINT : R600_1OP_Helper <0xAB, "FFBH_UINT", AMDGPUffbh_u32, VecALU>;
diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td
index 849fb8ad50f504c5e896e58c94d1b069318a4f0b..b0ac0e689a0b6d8c72fef67188efb0cc5f4e5345 100644
--- a/lib/Target/AMDGPU/FLATInstructions.td
+++ b/lib/Target/AMDGPU/FLATInstructions.td
@@ -136,7 +136,7 @@ multiclass FLAT_Atomic_Pseudo<
 class flat_binary_atomic_op<SDNode atomic_op> : PatFrag<
   (ops node:$ptr, node:$value),
   (atomic_op node:$ptr, node:$value),
-  [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUAS::FLAT_ADDRESS;}]
+  [{return cast<MemSDNode>(N)->getAddressSpace() == AMDGPUASI.FLAT_ADDRESS;}]
 >;
 
 def atomic_cmp_swap_flat : flat_binary_atomic_op<AMDGPUatomic_cmp_swap>;
@@ -284,16 +284,16 @@ defm FLAT_ATOMIC_FMAX_X2     : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
 class flat_ld <SDPatternOperator ld> : PatFrag<(ops node:$ptr),
                                                (ld node:$ptr), [{
   auto const AS = cast<MemSDNode>(N)->getAddressSpace();
-  return AS == AMDGPUAS::FLAT_ADDRESS ||
-         AS == AMDGPUAS::GLOBAL_ADDRESS ||
-         AS == AMDGPUAS::CONSTANT_ADDRESS;
+  return AS == AMDGPUASI.FLAT_ADDRESS ||
+         AS == AMDGPUASI.GLOBAL_ADDRESS ||
+         AS == AMDGPUASI.CONSTANT_ADDRESS;
 }]>;
 
 class flat_st <SDPatternOperator st> : PatFrag<(ops node:$val, node:$ptr),
                                                (st node:$val, node:$ptr), [{
   auto const AS = cast<MemSDNode>(N)->getAddressSpace();
-  return AS == AMDGPUAS::FLAT_ADDRESS ||
-         AS == AMDGPUAS::GLOBAL_ADDRESS;
+  return AS == AMDGPUASI.FLAT_ADDRESS ||
+         AS == AMDGPUASI.GLOBAL_ADDRESS;
 }]>;
 
 def atomic_flat_load   : flat_ld <atomic_load>;
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 16b0b788318e8a9c2c0c6d404919f1a48a7e8d6f..80fc4ac9d2a3e922a714f437b449a334957c06a5 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -39,7 +39,8 @@ using namespace llvm;
 GCNHazardRecognizer::GCNHazardRecognizer(const MachineFunction &MF) :
   CurrCycleInstr(nullptr),
   MF(MF),
-  ST(MF.getSubtarget<SISubtarget>()) {
+  ST(MF.getSubtarget<SISubtarget>()),
+  TII(*ST.getInstrInfo()) {
   MaxLookAhead = 5;
 }
 
@@ -71,6 +72,18 @@ static bool isRFE(unsigned Opcode) {
   return Opcode == AMDGPU::S_RFE_B64;
 }
 
+// Returns true when Opcode is one of the four scalar relative-move
+// opcodes: S_MOVRELS_B32, S_MOVRELS_B64, S_MOVRELD_B32 or
+// S_MOVRELD_B64.
+static bool isSMovRel(unsigned Opcode) {
+  const bool IsMovRelS = Opcode == AMDGPU::S_MOVRELS_B32 ||
+                         Opcode == AMDGPU::S_MOVRELS_B64;
+  const bool IsMovRelD = Opcode == AMDGPU::S_MOVRELD_B32 ||
+                         Opcode == AMDGPU::S_MOVRELD_B64;
+
+  return IsMovRelS || IsMovRelD;
+}
+
 static unsigned getHWReg(const SIInstrInfo *TII, const MachineInstr &RegInstr) {
   const MachineOperand *RegOp = TII->getNamedOperand(RegInstr,
                                                      AMDGPU::OpName::simm16);
@@ -108,6 +121,13 @@ GCNHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
   if (isRFE(MI->getOpcode()) && checkRFEHazards(MI) > 0)
     return NoopHazard;
 
+  if ((TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode())) &&
+      checkReadM0Hazards(MI) > 0)
+    return NoopHazard;
+
+  if (checkAnyInstHazards(MI) > 0)
+    return NoopHazard;
+
   return NoHazard;
 }
 
@@ -116,11 +136,13 @@ unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
 }
 
 unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
+  int WaitStates = std::max(0, checkAnyInstHazards(MI));
+
   if (SIInstrInfo::isSMRD(*MI))
-    return std::max(0, checkSMRDHazards(MI));
+    return std::max(WaitStates, checkSMRDHazards(MI));
 
   if (SIInstrInfo::isVALU(*MI)) {
-    int WaitStates = std::max(0, checkVALUHazards(MI));
+      WaitStates = std::max(WaitStates, checkVALUHazards(MI));
 
     if (SIInstrInfo::isVMEM(*MI))
       WaitStates = std::max(WaitStates, checkVMEMHazards(MI));
@@ -134,19 +156,25 @@ unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
     if (isRWLane(MI->getOpcode()))
       WaitStates = std::max(WaitStates, checkRWLaneHazards(MI));
 
+    if (TII.isVINTRP(*MI))
+      WaitStates = std::max(WaitStates, checkReadM0Hazards(MI));
+
     return WaitStates;
   }
 
   if (isSGetReg(MI->getOpcode()))
-    return std::max(0, checkGetRegHazards(MI));
+    return std::max(WaitStates, checkGetRegHazards(MI));
 
   if (isSSetReg(MI->getOpcode()))
-    return std::max(0, checkSetRegHazards(MI));
+    return std::max(WaitStates, checkSetRegHazards(MI));
 
   if (isRFE(MI->getOpcode()))
-    return std::max(0, checkRFEHazards(MI));
+    return std::max(WaitStates, checkRFEHazards(MI));
 
-  return 0;
+  if (TII.isVINTRP(*MI) || isSMovRel(MI->getOpcode()))
+    return std::max(WaitStates, checkReadM0Hazards(MI));
+
+  return WaitStates;
 }
 
 void GCNHazardRecognizer::EmitNoop() {
@@ -159,8 +187,7 @@ void GCNHazardRecognizer::AdvanceCycle() {
   if (!CurrCycleInstr)
     return;
 
-  const SIInstrInfo *TII = ST.getInstrInfo();
-  unsigned NumWaitStates = TII->getNumWaitStates(*CurrCycleInstr);
+  unsigned NumWaitStates = TII.getNumWaitStates(*CurrCycleInstr);
 
   // Keep track of emitted instructions
   EmittedInstrs.push_front(CurrCycleInstr);
@@ -290,7 +317,6 @@ int GCNHazardRecognizer::checkSMEMSoftClauseHazards(MachineInstr *SMEM) {
 
 int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
-  const SIInstrInfo *TII = ST.getInstrInfo();
   int WaitStatesNeeded = 0;
 
   WaitStatesNeeded = checkSMEMSoftClauseHazards(SMRD);
@@ -302,7 +328,7 @@ int GCNHazardRecognizer::checkSMRDHazards(MachineInstr *SMRD) {
   // A read of an SGPR by SMRD instruction requires 4 wait states when the
   // SGPR was written by a VALU instruction.
   int SmrdSgprWaitStates = 4;
-  auto IsHazardDefFn = [TII] (MachineInstr *MI) { return TII->isVALU(*MI); };
+  auto IsHazardDefFn = [this] (MachineInstr *MI) { return TII.isVALU(*MI); };
 
   for (const MachineOperand &Use : SMRD->uses()) {
     if (!Use.isReg())
@@ -508,3 +534,42 @@ int GCNHazardRecognizer::checkRFEHazards(MachineInstr *RFE) {
   int WaitStatesNeeded = getWaitStatesSinceSetReg(IsHazardFn);
   return RFEWaitStates - WaitStatesNeeded;
 }
+
+int GCNHazardRecognizer::checkAnyInstHazards(MachineInstr *MI) {
+  // Nothing to check for debug values or on subtargets without the hazard.
+  if (MI->isDebugValue() || !ST.hasSMovFedHazard())
+    return 0;
+
+  const SIRegisterInfo *RegInfo = ST.getRegisterInfo();
+  const auto &MRI = MF.getRegInfo();
+
+  // Matches the defining instruction that triggers the hazard.
+  auto IsMovFed = [] (MachineInstr *Def) {
+    return Def->getOpcode() == AMDGPU::S_MOV_FED_B32;
+  };
+
+  // Any instruction reading a non-VGPR register needs this many wait states
+  // after a write to that register by s_mov_fed_b32.
+  const int MovFedWaitStates = 1;
+  int Needed = 0;
+  for (const MachineOperand &Op : MI->uses()) {
+    if (!Op.isReg() || RegInfo->isVGPR(MRI, Op.getReg()))
+      continue;
+    const int Since = getWaitStatesSinceDef(Op.getReg(), IsMovFed);
+    Needed = std::max(Needed, MovFedWaitStates - Since);
+  }
+
+  return Needed;
+}
+
+int GCNHazardRecognizer::checkReadM0Hazards(MachineInstr *MI) {
+  if (!ST.hasReadM0Hazard())
+    return 0;
+
+  // One wait state is needed after an SALU instruction writes M0.
+  int SMovRelWaitStates = 1;
+  auto IsHazardFn = [this] (MachineInstr *Def) {
+    return TII.isSALU(*Def);
+  };
+  return SMovRelWaitStates - getWaitStatesSinceDef(AMDGPU::M0, IsHazardFn);
+}
diff --git a/lib/Target/AMDGPU/GCNHazardRecognizer.h b/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 0ab82ff4635b0e1355f3b05cec1f7d99ea6202c0..5680c3de6a1a323b4a9d797b199eff9e13d61c7e 100644
--- a/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -34,6 +34,7 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   std::list<MachineInstr*> EmittedInstrs;
   const MachineFunction &MF;
   const SISubtarget &ST;
+  const SIInstrInfo &TII;
 
   int getWaitStatesSince(function_ref<bool(MachineInstr *)> IsHazard);
   int getWaitStatesSinceDef(unsigned Reg,
@@ -52,6 +53,8 @@ class GCNHazardRecognizer final : public ScheduleHazardRecognizer {
   int checkVALUHazards(MachineInstr *VALU);
   int checkRWLaneHazards(MachineInstr *RWLane);
   int checkRFEHazards(MachineInstr *RFE);
+  int checkAnyInstHazards(MachineInstr *MI);
+  int checkReadM0Hazards(MachineInstr *SMovRel);
 public:
   GCNHazardRecognizer(const MachineFunction &MF);
   // We can only issue one instruction per cycle.
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..3bb5c9bc22b7dbd1a40ccb0d4a5ca416a665463d
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -0,0 +1,528 @@
+//===--------------------- GCNIterativeScheduler.cpp - --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Implements GCNIterativeScheduler and its region scheduling helpers.
+//===----------------------------------------------------------------------===//
+
+#include "GCNIterativeScheduler.h"
+#include "GCNSchedStrategy.h"
+#include "SIMachineFunctionInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+namespace llvm {
+  std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots,
+    const ScheduleDAG &DAG);
+}
+
+// Shim accessors: give generic code one way to obtain the MachineInstr
+// whether the schedule container holds instructions or SUnits.
+static inline MachineInstr *getMachineInstr(MachineInstr *MI) { return MI; }
+
+static inline MachineInstr *getMachineInstr(const SUnit &SU) {
+  return SU.getInstr();
+}
+static inline MachineInstr *getMachineInstr(const SUnit *SU) {
+  return getMachineInstr(*SU);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD
+static void printRegion(raw_ostream &OS,
+                        MachineBasicBlock::iterator Begin,
+                        MachineBasicBlock::iterator End,
+                        const LiveIntervals *LIS, // may be null
+                        unsigned MaxInstNum =
+                          std::numeric_limits<unsigned>::max()) {
+  auto BB = Begin->getParent();
+  OS << BB->getParent()->getName() << ":BB#" << BB->getNumber()
+     << ' ' << BB->getName() << ":\n";
+  auto I = Begin;
+  MaxInstNum = std::max(MaxInstNum, 1u); // always print at least one inst
+  for (; I != End && MaxInstNum; ++I, --MaxInstNum) {
+    if (!I->isDebugValue() && LIS)
+      OS << LIS->getInstructionIndex(*I);
+    OS << '\t' << *I;
+  }
+  if (I != End) { // truncated: elide the middle, still print the last inst
+    OS << "\t...\n";
+    I = std::prev(End);
+    if (!I->isDebugValue() && LIS)
+      OS << LIS->getInstructionIndex(*I);
+    OS << '\t' << *I;
+  }
+  if (End != BB->end()) { // print boundary inst if present
+    OS << "----\n";
+    if (LIS) OS << LIS->getInstructionIndex(*End) << '\t';
+    OS << *End;
+  }
+}
+
+LLVM_DUMP_METHOD
+static void printLivenessInfo(raw_ostream &OS,
+                              MachineBasicBlock::iterator Begin,
+                              MachineBasicBlock::iterator End,
+                              const LiveIntervals *LIS) {
+  const auto BB = Begin->getParent();
+  const auto &MRI = BB->getParent()->getRegInfo();
+
+  const auto LiveIns = getLiveRegsBefore(*Begin, *LIS);
+  OS << "LIn RP: ";
+  getRegPressure(MRI, LiveIns).print(OS);
+
+  const auto BottomMI = End == BB->end() ? std::prev(End) : End;
+  const auto LiveOuts = getLiveRegsAfter(*BottomMI, *LIS);
+  OS << "LOt RP: ";
+  getRegPressure(MRI, LiveOuts).print(OS);
+}
+
+LLVM_DUMP_METHOD
+void GCNIterativeScheduler::printRegions(raw_ostream &OS) const {
+  const auto &ST = MF.getSubtarget<SISubtarget>();
+  for (const auto R : Regions) {
+    OS << "Region to schedule ";
+    printRegion(OS, R->Begin, R->End, LIS, 1);
+    printLivenessInfo(OS, R->Begin, R->End, LIS);
+    OS << "Max RP: ";
+    R->MaxPressure.print(OS, &ST);
+  }
+}
+
+LLVM_DUMP_METHOD
+void GCNIterativeScheduler::printSchedResult(raw_ostream &OS,
+                                             const Region *R,
+                                             const GCNRegPressure &RP) const {
+  OS << "\nAfter scheduling ";
+  printRegion(OS, R->Begin, R->End, LIS);
+  printSchedRP(OS, R->MaxPressure, RP);
+  OS << '\n';
+}
+
+LLVM_DUMP_METHOD
+void GCNIterativeScheduler::printSchedRP(raw_ostream &OS,
+                                         const GCNRegPressure &Before,
+                                         const GCNRegPressure &After) const {
+  const auto &ST = MF.getSubtarget<SISubtarget>();
+  OS << "RP before: ";
+  Before.print(OS, &ST);
+  OS << "RP after:  ";
+  After.print(OS, &ST);
+}
+
+#endif
+
+// Scoped helper: enters region R and builds its DAG; leaves it on destruction
+class GCNIterativeScheduler::BuildDAG {
+  GCNIterativeScheduler &Sch;
+  SmallVector<SUnit*, 8> TopRoots;
+public:
+  BuildDAG(const Region &R, GCNIterativeScheduler &_Sch)
+    : Sch(_Sch) {
+    auto BB = R.Begin->getParent();
+    Sch.BaseClass::startBlock(BB);
+    Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);
+
+    Sch.buildSchedGraph(Sch.AA, nullptr, nullptr, nullptr,
+                        /*TrackLaneMask*/true);
+    Sch.Topo.InitDAGTopologicalSorting();
+
+    SmallVector<SUnit*, 8> BotRoots; // only needed as an out-argument
+    Sch.findRootsAndBiasEdges(TopRoots, BotRoots);
+  }
+  ~BuildDAG() { // undo startBlock/enterRegion from the constructor
+    Sch.BaseClass::exitRegion();
+    Sch.BaseClass::finishBlock();
+  }
+  ArrayRef<const SUnit*> getTopRoots() const {
+    return TopRoots;
+  }
+};
+
+class GCNIterativeScheduler::OverrideLegacyStrategy {
+  GCNIterativeScheduler &Sch;
+  Region &Rgn;
+  std::unique_ptr<MachineSchedStrategy> SaveSchedImpl; // restored in dtor
+  GCNRegPressure SaveMaxRP;
+public:
+  OverrideLegacyStrategy(Region &R,
+                         MachineSchedStrategy &OverrideStrategy,
+                         GCNIterativeScheduler &_Sch)
+    : Sch(_Sch)
+    , Rgn(R)
+    , SaveSchedImpl(std::move(_Sch.SchedImpl))
+    , SaveMaxRP(R.MaxPressure) {
+    Sch.SchedImpl.reset(&OverrideStrategy); // borrowed; released in dtor
+    auto BB = R.Begin->getParent();
+    Sch.BaseClass::startBlock(BB);
+    Sch.BaseClass::enterRegion(BB, R.Begin, R.End, R.NumRegionInstrs);
+  }
+  ~OverrideLegacyStrategy() {
+    Sch.BaseClass::exitRegion();
+    Sch.BaseClass::finishBlock();
+    Sch.SchedImpl.release(); // drop the borrowed pointer without deleting
+    Sch.SchedImpl = std::move(SaveSchedImpl);
+  }
+  void schedule() {
+    assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End);
+    DEBUG(dbgs() << "\nScheduling ";
+      printRegion(dbgs(), Rgn.Begin, Rgn.End, Sch.LIS, 2));
+    Sch.BaseClass::schedule();
+
+    // Unfortunately placeDebugValues incorrectly modifies RegionEnd, restore
+    Sch.RegionEnd = Rgn.End;
+    //assert(Rgn.End == Sch.RegionEnd);
+    Rgn.Begin = Sch.RegionBegin;
+    Rgn.MaxPressure.clear();
+  }
+  void restoreOrder() {
+    assert(Sch.RegionBegin == Rgn.Begin && Sch.RegionEnd == Rgn.End);
+    // DAG SUnits are stored using original region's order
+    // so just use SUnits as the restoring schedule
+    Sch.scheduleRegion(Rgn, Sch.SUnits, SaveMaxRP);
+  }
+};
+
+// Do-nothing strategy: just a stub handed to the base class constructor
+class SchedStrategyStub : public MachineSchedStrategy {
+public:
+  bool shouldTrackPressure() const override { return false; }
+  bool shouldTrackLaneMasks() const override { return false; }
+  void initialize(ScheduleDAGMI *DAG) override {}
+  SUnit *pickNode(bool &IsTopNode) override { return nullptr; } // never picks
+  void schedNode(SUnit *SU, bool IsTopNode) override {}
+  void releaseTopNode(SUnit *SU) override {}
+  void releaseBottomNode(SUnit *SU) override {}
+};
+
+GCNIterativeScheduler::GCNIterativeScheduler(MachineSchedContext *C,
+                                             StrategyKind S)
+  : BaseClass(C, llvm::make_unique<SchedStrategyStub>()) // placeholder impl
+  , Context(C)
+  , Strategy(S) // selects the algorithm run by finalizeSchedule()
+  , UPTracker(*LIS) {
+}
+
+// returns max pressure for a region, computed bottom-up via UPTracker
+GCNRegPressure
+GCNIterativeScheduler::getRegionPressure(MachineBasicBlock::iterator Begin,
+                                         MachineBasicBlock::iterator End)
+  const {
+  // For the purpose of pressure tracking bottom inst of the region should
+  // be also processed. End is either BB end, BB terminator inst or sched
+  // boundary inst.
+  auto const BBEnd = Begin->getParent()->end();
+  auto const BottomMI = End == BBEnd ? std::prev(End) : End;
+
+  // scheduleRegions walks bottom to top, so it's likely we just get next
+  // instruction to track
+  auto AfterBottomMI = std::next(BottomMI);
+  if (AfterBottomMI == BBEnd ||
+      &*AfterBottomMI != UPTracker.getLastTrackedMI()) {
+    UPTracker.reset(*BottomMI); // cannot continue previous tracking: restart
+  } else {
+    assert(UPTracker.isValid()); // continue from where the tracker stopped
+  }
+
+  for (auto I = BottomMI; I != Begin; --I) // BottomMI up to, not incl., Begin
+    UPTracker.recede(*I);
+
+  UPTracker.recede(*Begin); // ...and finally Begin itself
+
+  assert(UPTracker.isValid() ||
+         (dbgs() << "Tracked region ",
+          printRegion(dbgs(), Begin, End, LIS), false)); // dump, then fail
+  return UPTracker.moveMaxPressure();
+}
+
+// returns max pressure for a tentative schedule (IR order is not consulted)
+template <typename Range> GCNRegPressure
+GCNIterativeScheduler::getSchedulePressure(const Region &R,
+                                           Range &&Schedule) const {
+  auto const BBEnd = R.Begin->getParent()->end();
+  GCNUpwardRPTracker RPTracker(*LIS); // local tracker, UPTracker is untouched
+  if (R.End != BBEnd) {
+    // R.End points to the boundary instruction but the
+    // schedule doesn't include it
+    RPTracker.reset(*R.End);
+    RPTracker.recede(*R.End);
+  } else {
+    // R.End doesn't point to the boundary instruction
+    RPTracker.reset(*std::prev(BBEnd));
+  }
+  for (auto I = Schedule.end(), B = Schedule.begin(); I != B;) { // back to front
+    RPTracker.recede(*getMachineInstr(*--I));
+  }
+  return RPTracker.moveMaxPressure();
+}
+
+void GCNIterativeScheduler::enterRegion(MachineBasicBlock *BB, // overridden
+                                        MachineBasicBlock::iterator Begin,
+                                        MachineBasicBlock::iterator End,
+                                        unsigned NumRegionInstrs) {
+  BaseClass::enterRegion(BB, Begin, End, NumRegionInstrs);
+  if (NumRegionInstrs > 2) { // regions of at most 2 instrs are not recorded
+    Regions.push_back(
+      new (Alloc.Allocate()) // placement-new into memory owned by Alloc
+      Region { Begin, End, NumRegionInstrs,
+               getRegionPressure(Begin, End), nullptr }); // no BestSchedule
+  }
+}
+
+void GCNIterativeScheduler::schedule() { // overridden
+  // No reordering here (finalizeSchedule() does it); only debug output
+  DEBUG(
+    printLivenessInfo(dbgs(), RegionBegin, RegionEnd, LIS);
+    if (!Regions.empty() && Regions.back()->Begin == RegionBegin) { // recorded
+      dbgs() << "Max RP: ";
+      Regions.back()->MaxPressure.print(dbgs(), &MF.getSubtarget<SISubtarget>());
+    }
+    dbgs() << '\n';
+  );
+}
+
+void GCNIterativeScheduler::finalizeSchedule() { // overridden
+  if (Regions.empty()) // nothing was recorded by enterRegion()
+    return;
+  switch (Strategy) { // dispatch on the kind chosen at construction
+  case SCHEDULE_MINREGONLY: scheduleMinReg(); break;
+  case SCHEDULE_MINREGFORCED: scheduleMinReg(true); break;
+  case SCHEDULE_LEGACYMAXOCCUPANCY: scheduleLegacyMaxOccupancy(); break;
+  }
+}
+
+// Detach schedule from SUnits and interleave it with debug values.
+// Returned schedule becomes independent of DAG state.
+std::vector<MachineInstr*>
+GCNIterativeScheduler::detachSchedule(ScheduleRef Schedule) const {
+  std::vector<MachineInstr*> Res;
+  Res.reserve(Schedule.size() * 2); // room for one extra entry per SUnit
+
+  if (FirstDbgValue) // if set, it becomes the first entry of the result
+    Res.push_back(FirstDbgValue);
+
+  const auto DbgB = DbgValues.begin(), DbgE = DbgValues.end();
+  for (auto SU : Schedule) {
+    Res.push_back(SU->getInstr());
+    const auto &D = std::find_if(DbgB, DbgE, [SU](decltype(*DbgB) &P) {
+      return P.second == SU->getInstr(); // pair whose .second is this instr
+    });
+    if (D != DbgE)
+      Res.push_back(D->first); // its .first is emitted right after the instr
+  }
+  return Res;
+}
+
+void GCNIterativeScheduler::setBestSchedule(Region &R, // replaces any previous
+                                            ScheduleRef Schedule,
+                                            const GCNRegPressure &MaxRP) {
+  R.BestSchedule.reset( // stored detached, together with the given pressure
+    new TentativeSchedule{ detachSchedule(Schedule), MaxRP });
+}
+
+void GCNIterativeScheduler::scheduleBest(Region &R) { // apply R.BestSchedule
+  assert(R.BestSchedule.get() && "No schedule specified");
+  scheduleRegion(R, R.BestSchedule->Schedule, R.BestSchedule->MaxPressure);
+  R.BestSchedule.reset(); // consumed: the saved schedule is discarded
+}
+
+// minimal required region scheduler: reorders R's instructions to follow
+// Schedule; works for ranges of SUnit*, SUnit or (detached) MachineInstr*
+template <typename Range>
+void GCNIterativeScheduler::scheduleRegion(Region &R, Range &&Schedule,
+                                           const GCNRegPressure &MaxRP) {
+  assert(RegionBegin == R.Begin && RegionEnd == R.End);
+  assert(LIS != nullptr);
+#ifndef NDEBUG
+  const auto SchedMaxRP = getSchedulePressure(R, Schedule);
+#endif
+  auto BB = R.Begin->getParent();
+  auto Top = R.Begin; // position where the next scheduled instr must end up
+  for (const auto &I : Schedule) {
+    auto MI = getMachineInstr(I);
+    if (MI != &*Top) { // not in place yet: move it in front of Top
+      BB->remove(MI);
+      BB->insert(Top, MI);
+      if (!MI->isDebugValue())
+        LIS->handleMove(*MI, true);
+    }
+    if (!MI->isDebugValue()) {
+      // Reset read - undef flags and update them later.
+      for (auto &Op : MI->operands())
+        if (Op.isReg() && Op.isDef())
+          Op.setIsUndef(false);
+
+      RegisterOperands RegOpers;
+      RegOpers.collect(*MI, *TRI, MRI, /*ShouldTrackLaneMasks*/true,
+                                       /*IgnoreDead*/false);
+      // Adjust liveness and add missing dead+read-undef flags.
+      auto SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+      RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
+    }
+    Top = std::next(MI->getIterator());
+  }
+  RegionBegin = getMachineInstr(Schedule.front());
+
+  // Schedule consisting of MachineInstr* is considered 'detached' and already
+  // interleaved with debug values; decay strips the reference decltype adds
+  if (!std::is_same<typename std::decay<decltype(*Schedule.begin())>::type,
+                    MachineInstr*>::value) {
+    placeDebugValues();
+    // Unfortunately placeDebugValues incorrectly modifies RegionEnd, restore
+    RegionEnd = R.End;
+  }
+
+  R.Begin = RegionBegin; // keep the region record in sync with the new order
+  R.MaxPressure = MaxRP;
+
+#ifndef NDEBUG
+  const auto RegionMaxRP = getRegionPressure(R);
+  const auto &ST = MF.getSubtarget<SISubtarget>();
+#endif
+  assert((SchedMaxRP == RegionMaxRP && (MaxRP.empty() || SchedMaxRP == MaxRP))
+  || (dbgs() << "Max RP mismatch!!!\n"
+                "RP for schedule (calculated): ",
+      SchedMaxRP.print(dbgs(), &ST),
+      dbgs() << "RP for schedule (reported): ",
+      MaxRP.print(dbgs(), &ST),
+      dbgs() << "RP after scheduling: ",
+      RegionMaxRP.print(dbgs(), &ST),
+      false));
+}
+
+// Sort recorded regions by pressure - highest at the front
+void GCNIterativeScheduler::sortRegionsByPressure(unsigned TargetOcc) {
+  const auto &ST = MF.getSubtarget<SISubtarget>();
+  std::sort(Regions.begin(), Regions.end(),
+    [&ST, TargetOcc](const Region *R1, const Region *R2) {
+    return R2->MaxPressure.less(ST, R1->MaxPressure, TargetOcc); // R1 before R2
+  });
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Legacy MaxOccupancy Strategy
+
+// Tries to increase occupancy applying minreg scheduler for a sequence of
+// most demanding regions. Obtained schedules are saved as BestSchedule for a
+// region.
+// TargetOcc is the best achievable occupancy for a kernel.
+// Returns better occupancy on success or current occupancy on fail.
+// BestSchedules aren't deleted on fail.
+unsigned GCNIterativeScheduler::tryMaximizeOccupancy(unsigned TargetOcc) {
+  // TODO: assert Regions are sorted descending by pressure
+  const auto &ST = MF.getSubtarget<SISubtarget>();
+  const auto Occ = Regions.front()->MaxPressure.getOccupancy(ST);
+  DEBUG(dbgs() << "Trying to improve occupancy, target = " << TargetOcc
+               << ", current = " << Occ << '\n');
+
+  auto NewOcc = TargetOcc;
+  for (auto R : Regions) {
+    if (R->MaxPressure.getOccupancy(ST) >= NewOcc) // already good enough: stop
+      break;
+
+    DEBUG(printRegion(dbgs(), R->Begin, R->End, LIS, 3);
+          printLivenessInfo(dbgs(), R->Begin, R->End, LIS));
+
+    BuildDAG DAG(*R, *this); // DAG lives until the end of this iteration
+    const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
+    const auto MaxRP = getSchedulePressure(*R, MinSchedule);
+    DEBUG(dbgs() << "Occupancy improvement attempt:\n";
+          printSchedRP(dbgs(), R->MaxPressure, MaxRP));
+
+    NewOcc = std::min(NewOcc, MaxRP.getOccupancy(ST));
+    if (NewOcc <= Occ) // no gain over current occupancy: give up
+      break;
+
+    setBestSchedule(*R, MinSchedule, MaxRP); // saved only, not applied here
+  }
+  DEBUG(dbgs() << "New occupancy = " << NewOcc
+               << ", prev occupancy = " << Occ << '\n');
+  return std::max(NewOcc, Occ);
+}
+
+void GCNIterativeScheduler::scheduleLegacyMaxOccupancy( // needs Regions != {}
+  bool TryMaximizeOccupancy) {
+  const auto &ST = MF.getSubtarget<SISubtarget>();
+  auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
+
+  sortRegionsByPressure(TgtOcc);
+  auto Occ = Regions.front()->MaxPressure.getOccupancy(ST); // front = highest RP
+
+  if (TryMaximizeOccupancy && Occ < TgtOcc)
+    Occ = tryMaximizeOccupancy(TgtOcc); // may leave BestSchedule on regions
+
+  // This is really weird but for some magic scheduling regions twice
+  // gives performance improvement
+  const int NumPasses = Occ < TgtOcc ? 2 : 1;
+
+  TgtOcc = std::min(Occ, TgtOcc); // never aim above what was achieved
+  DEBUG(dbgs() << "Scheduling using default scheduler, "
+                  "target occupancy = " << TgtOcc << '\n');
+  GCNMaxOccupancySchedStrategy LStrgy(Context);
+
+  for (int I = 0; I < NumPasses; ++I) {
+    // running first pass with TargetOccupancy = 0 mimics previous scheduling
+    // approach and is a performance magic
+    LStrgy.setTargetOccupancy(I == 0 ? 0 : TgtOcc);
+    for (auto R : Regions) {
+      OverrideLegacyStrategy Ovr(*R, LStrgy, *this); // scoped to this region
+
+      Ovr.schedule();
+      const auto RP = getRegionPressure(*R); // pressure of the new IR order
+      DEBUG(printSchedRP(dbgs(), R->MaxPressure, RP));
+
+      if (RP.getOccupancy(ST) < TgtOcc) { // result is worse than the target
+        DEBUG(dbgs() << "Didn't fit into target occupancy O" << TgtOcc);
+        if (R->BestSchedule.get() &&
+            R->BestSchedule->MaxPressure.getOccupancy(ST) >= TgtOcc) {
+          DEBUG(dbgs() << ", scheduling minimal register\n");
+          scheduleBest(*R); // fall back to the saved minreg schedule
+        } else {
+          DEBUG(dbgs() << ", restoring\n");
+          Ovr.restoreOrder(); // fall back to the order before Ovr.schedule()
+          assert(R->MaxPressure.getOccupancy(ST) >= TgtOcc);
+        }
+      }
+    }
+  }
+}
+
+///////////////////////////////////////////////////////////////////////////////
+// Minimal Register Strategy
+
+void GCNIterativeScheduler::scheduleMinReg(bool force) { // force: never stop early
+  const auto &ST = MF.getSubtarget<SISubtarget>();
+  const auto TgtOcc = ST.getOccupancyWithLocalMemSize(MF);
+  sortRegionsByPressure(TgtOcc);
+
+  auto MaxPressure = Regions.front()->MaxPressure; // running reference pressure
+  for (auto R : Regions) {
+    if (!force && R->MaxPressure.less(ST, MaxPressure, TgtOcc))
+      break; // this region is already below the reference: stop
+
+    BuildDAG DAG(*R, *this);
+    const auto MinSchedule = makeMinRegSchedule(DAG.getTopRoots(), *this);
+
+    const auto RP = getSchedulePressure(*R, MinSchedule);
+    DEBUG(if (R->MaxPressure.less(ST, RP, TgtOcc)) {
+      dbgs() << "\nWarning: Pressure becomes worse after minreg!";
+      printSchedRP(dbgs(), R->MaxPressure, RP);
+    });
+
+    if (!force && MaxPressure.less(ST, RP, TgtOcc))
+      break; // new schedule would exceed the reference: leave region as is
+
+    scheduleRegion(*R, MinSchedule, RP); // commit the minreg order to the IR
+    DEBUG(printSchedResult(dbgs(), R, RP));
+
+    MaxPressure = RP; // the next region is compared against this result
+  }
+}
diff --git a/lib/Target/AMDGPU/GCNIterativeScheduler.h b/lib/Target/AMDGPU/GCNIterativeScheduler.h
new file mode 100644
index 0000000000000000000000000000000000000000..df3afce21ebc3a26087720479d28325ba454ea64
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNIterativeScheduler.h
@@ -0,0 +1,118 @@
+//===--------- GCNIterativeScheduler.h - GCN Scheduler -*- C++ -*----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
+#define LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
+
+#include "GCNRegPressure.h"
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+class GCNIterativeScheduler : public ScheduleDAGMILive {
+  typedef ScheduleDAGMILive BaseClass;
+public:
+  enum StrategyKind { // selects what finalizeSchedule() runs
+    SCHEDULE_MINREGONLY, // scheduleMinReg()
+    SCHEDULE_MINREGFORCED, // scheduleMinReg(true)
+    SCHEDULE_LEGACYMAXOCCUPANCY // scheduleLegacyMaxOccupancy()
+  };
+
+  GCNIterativeScheduler(MachineSchedContext *C,
+                        StrategyKind S);
+
+  void schedule() override; // debug output only, no reordering
+
+  void enterRegion(MachineBasicBlock *BB, // also records regions of > 2 instrs
+                   MachineBasicBlock::iterator Begin,
+                   MachineBasicBlock::iterator End,
+                   unsigned RegionInstrs) override;
+
+  void finalizeSchedule() override; // runs the selected strategy on Regions
+
+protected:
+
+  typedef ArrayRef<const SUnit*> ScheduleRef; // schedule still tied to a DAG
+
+  struct TentativeSchedule { // detached schedule plus its max pressure
+    std::vector<MachineInstr*> Schedule;
+    GCNRegPressure MaxPressure;
+  };
+
+  struct Region {
+    // Fields except for BestSchedule are supposed to reflect current IR state
+    // `const` fields are to emphasize they shouldn't change for any schedule.
+    MachineBasicBlock::iterator Begin;
+    // End is either a boundary instruction or end of basic block
+    const MachineBasicBlock::iterator End;
+    const unsigned NumRegionInstrs;
+    GCNRegPressure MaxPressure;
+
+    // best schedule for the region so far (not scheduled yet)
+    std::unique_ptr<TentativeSchedule> BestSchedule;
+  };
+
+  SpecificBumpPtrAllocator<Region> Alloc; // storage for the Region objects
+  std::vector<Region*> Regions; // pointers into Alloc, filled by enterRegion()
+
+  MachineSchedContext *Context;
+  const StrategyKind Strategy;
+  mutable GCNUpwardRPTracker UPTracker; // mutable: used by const getRegionPressure
+
+  class BuildDAG; // defined in the .cpp
+  class OverrideLegacyStrategy; // defined in the .cpp
+
+  template <typename Range>
+  GCNRegPressure getSchedulePressure(const Region &R,
+                                     Range &&Schedule) const;
+
+  GCNRegPressure getRegionPressure(MachineBasicBlock::iterator Begin,
+                                   MachineBasicBlock::iterator End) const;
+
+  GCNRegPressure getRegionPressure(const Region &R) const { // convenience form
+    return getRegionPressure(R.Begin, R.End);
+  }
+
+  void setBestSchedule(Region &R, // stores a detached copy in R.BestSchedule
+                       ScheduleRef Schedule,
+                       const GCNRegPressure &MaxRP = GCNRegPressure());
+
+  void scheduleBest(Region &R); // applies and then clears R.BestSchedule
+
+  std::vector<MachineInstr*> detachSchedule(ScheduleRef Schedule) const;
+
+  void sortRegionsByPressure(unsigned TargetOcc); // highest pressure first
+
+  template <typename Range>
+  void scheduleRegion(Region &R, Range &&Schedule, // reorders the IR of R
+                      const GCNRegPressure &MaxRP = GCNRegPressure());
+
+  unsigned tryMaximizeOccupancy(unsigned TargetOcc = // returns new occupancy
+                                std::numeric_limits<unsigned>::max());
+
+  void scheduleLegacyMaxOccupancy(bool TryMaximizeOccupancy = true);
+  void scheduleMinReg(bool force = false);
+
+  void printRegions(raw_ostream &OS) const; // debug printing helpers below
+  void printSchedResult(raw_ostream &OS,
+                        const Region *R,
+                        const GCNRegPressure &RP) const;
+  void printSchedRP(raw_ostream &OS,
+                    const GCNRegPressure &Before,
+                    const GCNRegPressure &After) const;
+};
+
+} // End namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
diff --git a/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c6d0f217995089df9becf6b1d4584e6e256ec89a
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -0,0 +1,266 @@
+//===----------------------- GCNMinRegStrategy.cpp - ----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ScheduleDAG.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+class GCNMinRegScheduler { // list scheduler driven by a prioritized ready queue
+  struct Candidate : ilist_node<Candidate> { // ready-queue entry
+    const SUnit *SU;
+    int Priority; // compared first by pickCandidate(), larger wins
+
+    Candidate(const SUnit *SU_, int Priority_ = 0)
+      : SU(SU_), Priority(Priority_) {}
+  };
+
+  SpecificBumpPtrAllocator<Candidate> Alloc; // backing storage for Candidates
+  typedef simple_ilist<Candidate> Queue;
+  Queue RQ; // Ready queue
+
+  std::vector<unsigned> NumPreds; // by NodeNum; max() marks "scheduled"
+
+  bool isScheduled(const SUnit *SU) const { // true once setIsScheduled ran
+    assert(!SU->isBoundaryNode());
+    return NumPreds[SU->NodeNum] == std::numeric_limits<unsigned>::max();
+  }
+
+  void setIsScheduled(const SUnit *SU)  { // overwrite counter with sentinel
+    assert(!SU->isBoundaryNode());
+    NumPreds[SU->NodeNum] = std::numeric_limits<unsigned>::max();
+  }
+
+  unsigned getNumPreds(const SUnit *SU) const { // only for unscheduled nodes
+    assert(!SU->isBoundaryNode());
+    assert(NumPreds[SU->NodeNum] != std::numeric_limits<unsigned>::max());
+    return NumPreds[SU->NodeNum];
+  }
+
+  unsigned decNumPreds(const SUnit *SU) { // returns the decremented count
+    assert(!SU->isBoundaryNode());
+    assert(NumPreds[SU->NodeNum] != std::numeric_limits<unsigned>::max());
+    return --NumPreds[SU->NodeNum];
+  }
+
+  void initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits);
+
+  int getReadySuccessors(const SUnit *SU) const;
+  int getNotReadySuccessors(const SUnit *SU) const;
+
+  template <typename Calc>
+  unsigned findMax(unsigned Num, Calc C); // moves maxima to the front of RQ
+
+  Candidate* pickCandidate(); // returns &RQ.front() after tie-breaking
+
+  void bumpPredsPriority(const SUnit *SchedSU, int Priority);
+  void releaseSuccessors(const SUnit* SU, int Priority);
+
+public:
+  std::vector<const SUnit*> schedule(ArrayRef<const SUnit*> TopRoots,
+                                     const ScheduleDAG &DAG);
+};
+
+void GCNMinRegScheduler::initNumPreds(const decltype(ScheduleDAG::SUnits) &SUnits) {
+  NumPreds.resize(SUnits.size()); // one counter per SUnit, same indexing
+  for (unsigned I = 0; I < SUnits.size(); ++I)
+    NumPreds[I] = SUnits[I].NumPredsLeft; // seed from the DAG's own counts
+}
+
+int GCNMinRegScheduler::getReadySuccessors(const SUnit *SU) const { // see below
+  int NumSchedSuccs = 0; // successors whose other preds are all scheduled
+  for (const auto &SDep : SU->Succs) { // by reference: no per-edge SDep copy
+    bool wouldBeScheduled = true;
+    for (const auto &PDep : SDep.getSUnit()->Preds) {
+      auto PSU = PDep.getSUnit();
+      assert(!PSU->isBoundaryNode());
+      if (PSU != SU && !isScheduled(PSU)) { // another pred is still pending
+        wouldBeScheduled = false;
+        break;
+      }
+    }
+    NumSchedSuccs += wouldBeScheduled ? 1 : 0;
+  }
+  return NumSchedSuccs;
+}
+
+int GCNMinRegScheduler::getNotReadySuccessors(const SUnit *SU) const {
+  return SU->Succs.size() - getReadySuccessors(SU); // all minus the ready ones
+}
+
+template <typename Calc> // Calc: metric computed for a Candidate
+unsigned GCNMinRegScheduler::findMax(unsigned Num, Calc C) {
+  assert(!RQ.empty() && Num <= RQ.size());
+  typedef decltype(C(*RQ.begin())) T;
+  T Max = std::numeric_limits<T>::min();
+  unsigned NumMax = 0; // number of candidates sharing the current Max
+  for (auto I = RQ.begin(); Num; --Num) { // examines the first Num entries
+    T Cur = C(*I);
+    if (Cur >= Max) {
+      if (Cur > Max) { // new maximum: restart the count
+        Max = Cur;
+        NumMax = 1;
+      } else
+        ++NumMax;
+      auto &Cand = *I++; // advance before unlinking the current node
+      RQ.remove(Cand);
+      RQ.push_front(Cand); // maxima (and ties) are moved to the queue front
+      continue;
+    }
+    ++I;
+  }
+  return NumMax;
+}
+
+GCNMinRegScheduler::Candidate* GCNMinRegScheduler::pickCandidate() {
+  do { // cascade of tie-breakers; each findMax narrows the front of RQ
+    unsigned Num = RQ.size();
+    if (Num == 1) break; // single candidate: nothing to choose
+
+    DEBUG(dbgs() << "\nSelecting max priority candidates among " << Num << '\n');
+    Num = findMax(Num, [=](const Candidate &C) { return C.Priority; });
+    if (Num == 1) break; // unique winner is now at the front of RQ
+
+    DEBUG(dbgs() << "\nSelecting min non-ready producing candidate among "
+                 << Num << '\n');
+    Num = findMax(Num, [=](const Candidate &C) {
+      auto SU = C.SU;
+      int Res = getNotReadySuccessors(SU);
+      DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would left non-ready "
+                   << Res << " successors, metric = " << -Res << '\n');
+      return -Res; // negated so that findMax selects the minimum
+    });
+    if (Num == 1) break;
+
+    DEBUG(dbgs() << "\nSelecting most producing candidate among "
+                 << Num << '\n');
+    Num = findMax(Num, [=](const Candidate &C) {
+      auto SU = C.SU;
+      auto Res = getReadySuccessors(SU);
+      DEBUG(dbgs() << "SU(" << SU->NodeNum << ") would make ready "
+                   << Res << " successors, metric = " << Res << '\n');
+      return Res;
+    });
+    if (Num == 1) break;
+
+    Num = Num ? Num : RQ.size(); // fallback to the whole queue if Num were 0
+    DEBUG(dbgs() << "\nCan't find best candidate, selecting in program order among "
+                 << Num << '\n');
+    Num = findMax(Num, [=](const Candidate &C) { return -(int64_t)C.SU->NodeNum; });
+    assert(Num == 1); // a tie on NodeNum is not expected
+  } while (false);
+
+  return &RQ.front(); // caller is responsible for removing it from RQ
+}
+
+void GCNMinRegScheduler::bumpPredsPriority(const SUnit *SchedSU, int Priority) {
+  SmallPtrSet<const SUnit*, 32> Set; // nodes whose priority will be set
+  for (const auto &S : SchedSU->Succs) {
+    if (S.getSUnit()->isBoundaryNode() || isScheduled(S.getSUnit()) ||
+        S.getKind() != SDep::Data)
+      continue; // only unscheduled data successors are considered
+    for (const auto &P : S.getSUnit()->Preds) {
+      auto PSU = P.getSUnit();
+      assert(!PSU->isBoundaryNode());
+      if (PSU != SchedSU && !isScheduled(PSU)) {
+        Set.insert(PSU); // other pending producer of that successor
+      }
+    }
+  }
+  SmallVector<const SUnit*, 32> Worklist(Set.begin(), Set.end());
+  while (!Worklist.empty()) { // extend Set with transitive unscheduled preds
+    auto SU = Worklist.pop_back_val();
+    assert(!SU->isBoundaryNode());
+    for (const auto &P : SU->Preds) {
+      if (!P.getSUnit()->isBoundaryNode() && !isScheduled(P.getSUnit()) &&
+          Set.insert(P.getSUnit()).second) // .second: newly inserted
+        Worklist.push_back(P.getSUnit());
+    }
+  }
+  DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum
+               << ")'s non-ready successors of " << Priority
+               << " priority in ready queue: ");
+  const auto SetEnd = Set.end();
+  for (auto &C : RQ) { // only candidates already in the ready queue change
+    if (Set.find(C.SU) != SetEnd) {
+      C.Priority = Priority;
+      DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')');
+    }
+  }
+  DEBUG(dbgs() << '\n');
+}
+
+void GCNMinRegScheduler::releaseSuccessors(const SUnit* SU, int Priority) {
+  for (const auto &S : SU->Succs) {
+    auto SuccSU = S.getSUnit();
+    if (S.isWeak()) // weak edges do not take part in the pred counting
+      continue;
+    assert(SuccSU->isBoundaryNode() || getNumPreds(SuccSU) > 0);
+    if (!SuccSU->isBoundaryNode() && decNumPreds(SuccSU) == 0) // became ready
+      RQ.push_front(*new (Alloc.Allocate()) Candidate(SuccSU, Priority));
+  }
+}
+
+std::vector<const SUnit*> // SUnits of DAG in the order they were picked
+GCNMinRegScheduler::schedule(ArrayRef<const SUnit*> TopRoots,
+                             const ScheduleDAG &DAG) {
+  const auto &SUnits = DAG.SUnits;
+  std::vector<const SUnit*> Schedule;
+  Schedule.reserve(SUnits.size());
+
+  initNumPreds(SUnits);
+
+  int StepNo = 0; // also used as the priority of newly queued candidates
+
+  for (auto SU : TopRoots) {
+    RQ.push_back(*new (Alloc.Allocate()) Candidate(SU, StepNo));
+  }
+  releaseSuccessors(&DAG.EntrySU, StepNo);
+
+  while (!RQ.empty()) {
+    DEBUG(
+      dbgs() << "\n=== Picking candidate, Step = " << StepNo << "\n"
+                "Ready queue:";
+      for (auto &C : RQ)
+        dbgs() << ' ' << C.SU->NodeNum << "(P" << C.Priority << ')';
+      dbgs() << '\n';
+    );
+
+    auto C = pickCandidate();
+    assert(C);
+    RQ.remove(*C); // unlinked only; the node's memory stays owned by Alloc
+    auto SU = C->SU;
+    DEBUG(dbgs() << "Selected "; SU->dump(&DAG));
+
+    releaseSuccessors(SU, StepNo);
+    Schedule.push_back(SU);
+    setIsScheduled(SU);
+
+    if (getReadySuccessors(SU) == 0) // SU made no successor ready
+      bumpPredsPriority(SU, StepNo);
+
+    ++StepNo;
+  }
+  assert(SUnits.size() == Schedule.size()); // every SUnit must be scheduled
+
+  return Schedule;
+}
+
+namespace llvm { // entry point: runs a fresh GCNMinRegScheduler on DAG
+std::vector<const SUnit*> makeMinRegSchedule(ArrayRef<const SUnit*> TopRoots,
+                                             const ScheduleDAG &DAG) {
+  GCNMinRegScheduler S; // one-shot instance, no state kept between calls
+  return S.schedule(TopRoots, DAG);
+}
+}
diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4ecfa118fb27517e277f974f246a3925d06643da
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -0,0 +1,355 @@
+//===------------------------- GCNRegPressure.cpp - -----------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNRegPressure.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "misched"
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD // Debug dump to dbgs() of virtual registers live at SI.
+void llvm::printLivesAt(SlotIndex SI,
+                        const LiveIntervals &LIS,
+                        const MachineRegisterInfo &MRI) {
+  dbgs() << "Live regs at " << SI << ": "
+         << *LIS.getInstructionFromIndex(SI);
+  unsigned Num = 0; // number of live (sub)ranges printed
+  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+    const unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+    if (MRI.reg_nodbg_empty(Reg)) // skip registers without non-debug uses/defs
+      continue;
+    const auto &LI = LIS.getInterval(Reg);
+    if (LI.hasSubRanges()) {
+      bool firstTime = true; // print the register name once per register
+      for (const auto &S : LI.subranges()) {
+        if (!S.liveAt(SI)) continue;
+        if (firstTime) {
+          dbgs() << "  " << PrintReg(Reg, MRI.getTargetRegisterInfo())
+                 << '\n';
+          firstTime = false;
+        }
+        dbgs() << "  " << S << '\n';
+        ++Num;
+      }
+    } else if (LI.liveAt(SI)) { // no subranges: test the main range
+      dbgs() << "  " << LI << '\n';
+      ++Num;
+    }
+  }
+  if (!Num) dbgs() << "  <none>\n";
+}
+
+static bool isEqual(const GCNRPTracker::LiveRegSet &S1, // same keys and masks?
+                    const GCNRPTracker::LiveRegSet &S2) {
+  if (S1.size() != S2.size())
+    return false;
+
+  for (const auto &P : S1) { // equal sizes, so checking S1 against S2 suffices
+    auto I = S2.find(P.first);
+    if (I == S2.end() || I->second != P.second)
+      return false;
+  }
+  return true;
+}
+
+static GCNRPTracker::LiveRegSet // copy of LR without empty-mask entries
+stripEmpty(const GCNRPTracker::LiveRegSet &LR) {
+  GCNRPTracker::LiveRegSet Res;
+  for (const auto &P : LR) {
+    if (P.second.any()) // keep only entries with at least one lane set
+      Res.insert(P);
+  }
+  return Res;
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// GCNRegPressure
+
+unsigned GCNRegPressure::getRegKind(unsigned Reg, // virtual registers only
+                                    const MachineRegisterInfo &MRI) {
+  assert(TargetRegisterInfo::isVirtualRegister(Reg));
+  const auto RC = MRI.getRegClass(Reg);
+  auto STI = static_cast<const SIRegisterInfo*>(MRI.getTargetRegisterInfo());
+  return STI->isSGPRClass(RC) ? // class size 4 -> *32 kind, otherwise tuple
+    (RC->getSize() == 4 ? SGPR32 : SGPR_TUPLE) :
+    (RC->getSize() == 4 ? VGPR32 : VGPR_TUPLE);
+}
+
+void GCNRegPressure::inc(unsigned Reg, // update for Reg: PrevMask -> NewMask
+                         LaneBitmask PrevMask,
+                         LaneBitmask NewMask,
+                         const MachineRegisterInfo &MRI) {
+  if (NewMask == PrevMask) // nothing changed
+    return;
+
+  int Sign = 1;
+  if (NewMask < PrevMask) { // shrinking mask: same math with negative sign
+    std::swap(NewMask, PrevMask);
+    Sign = -1;
+  }
+#ifndef NDEBUG
+  const auto MaxMask = MRI.getMaxLaneMaskForVReg(Reg);
+#endif
+  switch (auto Kind = getRegKind(Reg, MRI)) {
+  case SGPR32:
+  case VGPR32:
+    assert(PrevMask.none() && NewMask == MaxMask); // all-or-nothing liveness
+    Value[Kind] += Sign;
+    break;
+
+  case SGPR_TUPLE:
+  case VGPR_TUPLE:
+    assert(NewMask < MaxMask || NewMask == MaxMask);
+    assert(PrevMask < NewMask);
+
+    Value[Kind == SGPR_TUPLE ? SGPR32 : VGPR32] += // one per newly set lane bit
+      Sign * countPopulation((~PrevMask & NewMask).getAsInteger());
+
+    if (PrevMask.none()) { // tuple weight changes only on empty <-> non-empty
+      assert(NewMask.any());
+      Value[Kind] += Sign * MRI.getPressureSets(Reg).getWeight();
+    }
+    break;
+
+  default: llvm_unreachable("Unknown register kind");
+  }
+}
+
+bool GCNRegPressure::less(const SISubtarget &ST, // true if *this is "less" than O
+                          const GCNRegPressure& O,
+                          unsigned MaxOccupancy) const {
+  const auto SGPROcc = std::min(MaxOccupancy, // occupancies are capped
+                                ST.getOccupancyWithNumSGPRs(getSGRPNum()));
+  const auto VGPROcc = std::min(MaxOccupancy,
+                                ST.getOccupancyWithNumVGPRs(getVGRPNum()));
+  const auto OtherSGPROcc = std::min(MaxOccupancy,
+                                ST.getOccupancyWithNumSGPRs(O.getSGRPNum()));
+  const auto OtherVGPROcc = std::min(MaxOccupancy,
+                                ST.getOccupancyWithNumVGPRs(O.getVGRPNum()));
+
+  const auto Occ = std::min(SGPROcc, VGPROcc);
+  const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc);
+  if (Occ != OtherOcc)
+    return Occ > OtherOcc; // higher occupancy counts as lower pressure
+
+  bool SGPRImportant = SGPROcc < VGPROcc; // SGPRs are the limiting kind
+  const bool OtherSGPRImportant = OtherSGPROcc < OtherVGPROcc;
+
+  // if both pressures disagree on what is more important compare vgprs
+  if (SGPRImportant != OtherSGPRImportant) {
+    SGPRImportant = false;
+  }
+
+  // compare large regs pressure
+  bool SGPRFirst = SGPRImportant;
+  for (int I = 2; I > 0; --I, SGPRFirst = !SGPRFirst) { // important kind first
+    if (SGPRFirst) {
+      auto SW = getSGPRTuplesWeight();
+      auto OtherSW = O.getSGPRTuplesWeight();
+      if (SW != OtherSW)
+        return SW < OtherSW;
+    } else {
+      auto VW = getVGPRTuplesWeight();
+      auto OtherVW = O.getVGPRTuplesWeight();
+      if (VW != OtherVW)
+        return VW < OtherVW;
+    }
+  }
+  return SGPRImportant ? (getSGRPNum() < O.getSGRPNum()): // final tie-break
+                         (getVGRPNum() < O.getVGRPNum());
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD // One-line dump; occupancy figures only when ST is non-null.
+void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const {
+  OS << "VGPRs: " << getVGRPNum();
+  if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGRPNum()) << ')';
+  OS << ", SGPRs: " << getSGRPNum();
+  if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGRPNum()) << ')';
+  OS << ", LVGPR WT: " << getVGPRTuplesWeight()
+     << ", LSGPR WT: " << getSGPRTuplesWeight();
+  if (ST) OS << " -> Occ: " << getOccupancy(*ST);
+  OS << '\n'; // the output always ends with a newline
+}
+#endif
+
+///////////////////////////////////////////////////////////////////////////////
+// GCNRPTracker
+
+// Returns the lanes of virtual register Reg that LIS reports live at SI.
+LaneBitmask llvm::getLiveLaneMask(unsigned Reg, SlotIndex SI,
+                                  const LiveIntervals &LIS,
+                                  const MachineRegisterInfo &MRI) {
+  assert(!MRI.reg_nodbg_empty(Reg));
+  const auto &Interval = LIS.getInterval(Reg);
+  // Without subranges the register is live either as a whole or not at all.
+  if (!Interval.hasSubRanges())
+    return Interval.liveAt(SI) ? MRI.getMaxLaneMaskForVReg(Reg) : LaneBitmask();
+  LaneBitmask Lanes;
+  for (const auto &SubRange : Interval.subranges()) {
+    if (!SubRange.liveAt(SI))
+      continue;
+    Lanes |= SubRange.LaneMask;
+    assert(Lanes < MRI.getMaxLaneMaskForVReg(Reg) ||
+           Lanes == MRI.getMaxLaneMaskForVReg(Reg));
+  }
+  return Lanes;
+}
+
+// Collects every virtual register that has at least one lane live at SI.
+GCNRPTracker::LiveRegSet llvm::getLiveRegs(
+    SlotIndex SI, const LiveIntervals &LIS, const MachineRegisterInfo &MRI) {
+  GCNRPTracker::LiveRegSet Result;
+  for (unsigned Idx = 0, End = MRI.getNumVirtRegs(); Idx != End; ++Idx) {
+    const auto VReg = TargetRegisterInfo::index2VirtReg(Idx);
+    if (MRI.reg_nodbg_empty(VReg))
+      continue; // getLiveLaneMask() asserts on such registers
+    const auto Lanes = getLiveLaneMask(VReg, SI, LIS, MRI);
+    if (Lanes.any())
+      Result[VReg] = Lanes;
+  }
+  return Result;
+}
+
+void GCNUpwardRPTracker::reset(const MachineInstr &MI) {
+  MRI = &MI.getParent()->getParent()->getRegInfo();
+  LiveRegs = getLiveRegsAfter(MI, LIS); // LIS liveness at MI's dead slot
+  MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs);
+}
+
+LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const {
+  // Test isReg() first, matching the operand checks in recede().
+  assert(MO.isReg() && MO.isDef() &&
+         TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+  // We don't rely on the read-undef flag because during tentative schedule
+  // tracking it isn't set correctly yet. This still works since the use mask
+  // has already been tracked using LIS.
+  if (auto SubReg = MO.getSubReg())
+    return MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
+  return MRI->getMaxLaneMaskForVReg(MO.getReg()); // whole register defined
+}
+
+LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const {
+  // Test isReg() first, matching the operand checks in recede().
+  assert(MO.isReg() && MO.isUse() &&
+         TargetRegisterInfo::isVirtualRegister(MO.getReg()));
+  if (auto SubReg = MO.getSubReg())
+    return MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
+
+  auto MaxMask = MRI->getMaxLaneMaskForVReg(MO.getReg());
+  if (MaxMask.getAsInteger() == 1) // cannot have subregs
+    return MaxMask;
+
+  // For a tentative schedule LIS isn't updated yet but livemask should remain
+  // the same on any schedule. Subreg defs can be reordered but they all must
+  // dominate uses anyway.
+  auto SI = LIS.getInstructionIndex(*MO.getParent()).getBaseIndex();
+  return getLiveLaneMask(MO.getReg(), SI, LIS, *MRI);
+}
+
+void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
+  assert(MRI && "call reset first");
+
+  LastTrackedMI = &MI; // recorded even for debug values; read by isValid()
+
+  if (MI.isDebugValue())
+    return;
+
+  // Process all defs first to ensure early clobbers are handled correctly,
+  // iterating over operands() to catch implicit defs as well.
+  for (const auto &MO : MI.operands()) {
+    if (!MO.isReg() || !MO.isDef() ||
+      !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+      continue;
+
+    auto Reg = MO.getReg();
+    auto &LiveMask = LiveRegs[Reg];
+    auto PrevMask = LiveMask;
+    LiveMask &= ~getDefRegMask(MO); // lanes defined by MI are dead above it
+    CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
+  }
+
+  // Then all uses: lanes read by MI are live above it.
+  for (const auto &MO : MI.uses()) {
+    if (!MO.isReg() || !MO.readsReg() ||
+      !TargetRegisterInfo::isVirtualRegister(MO.getReg()))
+      continue;
+
+    auto Reg = MO.getReg();
+    auto &LiveMask = LiveRegs[Reg];
+    auto PrevMask = LiveMask;
+    LiveMask |= getUsedRegMask(MO);
+    CurPressure.inc(Reg, PrevMask, LiveMask, *MRI);
+  }
+
+  MaxPressure = max(MaxPressure, CurPressure); // per-kind maximum
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD // Prints to dbgs() how the two live register sets differ.
+static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR,
+                           const GCNRPTracker::LiveRegSet &TrackedLR,
+                           const TargetRegisterInfo *TRI) {
+  for (auto const &P : TrackedLR) { // tracked regs missing or differing in LIS
+    auto I = LISLR.find(P.first);
+    if (I == LISLR.end()) {
+      dbgs() << "  " << PrintReg(P.first, TRI)
+             << ":L" << PrintLaneMask(P.second)
+             << " isn't found in LIS reported set\n";
+    }
+    else if (I->second != P.second) {
+      dbgs() << "  " << PrintReg(P.first, TRI)
+        << " masks doesn't match: LIS reported "
+        << PrintLaneMask(I->second)
+        << ", tracked "
+        << PrintLaneMask(P.second)
+        << '\n';
+    }
+  }
+  for (auto const &P : LISLR) { // LIS-reported regs absent from the tracked set
+    auto I = TrackedLR.find(P.first);
+    if (I == TrackedLR.end()) {
+      dbgs() << "  " << PrintReg(P.first, TRI)
+             << ":L" << PrintLaneMask(P.second)
+             << " isn't found in tracked set\n";
+    }
+  }
+}
+
+bool GCNUpwardRPTracker::isValid() const {
+  const auto &SI = LIS.getInstructionIndex(*LastTrackedMI).getBaseIndex();
+  const auto LISLR = llvm::getLiveRegs(SI, LIS, *MRI); // liveness per LIS
+  const auto TrackedLR = stripEmpty(LiveRegs); // presumably drops empty masks
+
+  if (!isEqual(LISLR, TrackedLR)) { // first check: the live register sets
+    dbgs() << "\nGCNUpwardRPTracker error: Tracked and"
+              " LIS reported livesets mismatch:\n";
+    printLivesAt(SI, LIS, *MRI);
+    reportMismatch(LISLR, TrackedLR, MRI->getTargetRegisterInfo());
+    return false;
+  }
+
+  auto LISPressure = getRegPressure(*MRI, LISLR); // second check: pressure
+  if (LISPressure != CurPressure) {
+    dbgs() << "GCNUpwardRPTracker error: Pressure sets different\nTracked: ";
+    CurPressure.print(dbgs());
+    dbgs() << "LIS rpt: ";
+    LISPressure.print(dbgs());
+    return false;
+  }
+  return true;
+}
+
+#endif
diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h
new file mode 100644
index 0000000000000000000000000000000000000000..82e76a7bfddccfc5e2e55a29b9c84e11484fb7f7
--- /dev/null
+++ b/lib/Target/AMDGPU/GCNRegPressure.h
@@ -0,0 +1,170 @@
+//===---------------------- GCNRegPressure.h -*- C++ -*--------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
+#define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
+
+#include "AMDGPUSubtarget.h"
+
+#include <limits>
+
+namespace llvm {
+
+struct GCNRegPressure {
+  enum RegKind { // indices into Value[]
+    SGPR32,      // read by getSGRPNum()
+    SGPR_TUPLE,  // read by getSGPRTuplesWeight()
+    VGPR32,      // read by getVGRPNum()
+    VGPR_TUPLE,  // read by getVGPRTuplesWeight()
+    TOTAL_KINDS  // number of kinds, i.e. the size of Value[]
+  };
+
+  GCNRegPressure() {
+    clear();
+  }
+
+  bool empty() const { return getSGRPNum() == 0 && getVGRPNum() == 0; }
+
+  void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); } // zero all
+
+  unsigned getSGRPNum() const { return Value[SGPR32]; }
+  unsigned getVGRPNum() const { return Value[VGPR32]; }
+
+  unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; }
+  unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; }
+
+  unsigned getOccupancy(const SISubtarget &ST) const { // min of SGPR/VGPR occ.
+    return std::min(ST.getOccupancyWithNumSGPRs(getSGRPNum()),
+                    ST.getOccupancyWithNumVGPRs(getVGRPNum()));
+  }
+
+  void inc(unsigned Reg, // defined out of line; not visible in this header
+           LaneBitmask PrevMask,
+           LaneBitmask NewMask,
+           const MachineRegisterInfo &MRI);
+
+  bool higherOccupancy(const SISubtarget &ST, const GCNRegPressure& O) const {
+    return getOccupancy(ST) > O.getOccupancy(ST);
+  }
+
+  bool less(const SISubtarget &ST, const GCNRegPressure& O,
+    unsigned MaxOccupancy = std::numeric_limits<unsigned>::max()) const;
+
+  bool operator==(const GCNRegPressure &O) const {
+    return std::equal(&Value[0], &Value[TOTAL_KINDS], O.Value); // every kind
+  }
+
+  bool operator!=(const GCNRegPressure &O) const {
+    return !(*this == O);
+  }
+
+  void print(raw_ostream &OS, const SISubtarget *ST=nullptr) const;
+  void dump() const { print(dbgs()); } // prints without a subtarget
+
+private:
+  unsigned Value[TOTAL_KINDS]; // one counter per RegKind
+
+  static unsigned getRegKind(unsigned Reg, const MachineRegisterInfo &MRI);
+
+  friend GCNRegPressure max(const GCNRegPressure &P1,
+                            const GCNRegPressure &P2); // reads Value[] directly
+};
+
+inline GCNRegPressure max(const GCNRegPressure &P1, const GCNRegPressure &P2) {
+  GCNRegPressure Res = P1; // start from P1, then raise each kind to P2's value
+  for (unsigned Kind = 0; Kind != GCNRegPressure::TOTAL_KINDS; ++Kind)
+    Res.Value[Kind] = std::max(Res.Value[Kind], P2.Value[Kind]);
+  return Res;
+}
+
+class GCNRPTracker {
+public:
+  typedef DenseMap<unsigned, LaneBitmask> LiveRegSet; // reg -> live lane mask
+
+protected:
+  LiveRegSet LiveRegs;
+  GCNRegPressure CurPressure, MaxPressure;
+  const MachineInstr *LastTrackedMI = nullptr;
+  mutable const MachineRegisterInfo *MRI = nullptr;
+  GCNRPTracker() {} // protected: only constructible through derived trackers
+public:
+  // Live regs for the current state.
+  const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; }
+  const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; }
+
+  // Returns a copy of MaxPressure and clears the stored value.
+  decltype(MaxPressure) moveMaxPressure() {
+    auto Res = MaxPressure;
+    MaxPressure.clear();
+    return Res;
+  }
+  decltype(LiveRegs) moveLiveRegs() { // hands the set over via std::move
+    return std::move(LiveRegs);
+  }
+};
+
+class GCNUpwardRPTracker : public GCNRPTracker {
+  const LiveIntervals &LIS;
+  LaneBitmask getDefRegMask(const MachineOperand &MO) const;
+  LaneBitmask getUsedRegMask(const MachineOperand &MO) const;
+public:
+  GCNUpwardRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {}
+  // Resets the tracker to the point just below MI,
+  // filling the live register set at that point from LIS.
+  void reset(const MachineInstr &MI);
+
+  // Moves the tracker to the state just above MI.
+  void recede(const MachineInstr &MI);
+
+  // Checks whether the tracker's state after receding MI matches the live
+  // registers and pressure reported by LIS.
+  bool isValid() const;
+};
+
+LaneBitmask getLiveLaneMask(unsigned Reg,
+                            SlotIndex SI,
+                            const LiveIntervals &LIS,
+                            const MachineRegisterInfo &MRI);
+
+GCNRPTracker::LiveRegSet getLiveRegs(SlotIndex SI,
+                                     const LiveIntervals &LIS,
+                                     const MachineRegisterInfo &MRI);
+
+inline GCNRPTracker::LiveRegSet getLiveRegsAfter(const MachineInstr &MI,
+                                                 const LiveIntervals &LIS) {
+  const SlotIndex DeadSlot = LIS.getInstructionIndex(MI).getDeadSlot();
+  return getLiveRegs(DeadSlot, LIS, MI.getParent()->getParent()->getRegInfo());
+}
+
+inline GCNRPTracker::LiveRegSet getLiveRegsBefore(const MachineInstr &MI,
+                                                  const LiveIntervals &LIS) {
+  const SlotIndex BaseSlot = LIS.getInstructionIndex(MI).getBaseIndex();
+  return getLiveRegs(BaseSlot, LIS, MI.getParent()->getParent()->getRegInfo());
+}
+
+template <typename Range>
+GCNRegPressure getRegPressure(const MachineRegisterInfo &MRI,
+                              Range &&LiveRegs) {
+  GCNRegPressure Total; // starts at zero: the constructor calls clear()
+  for (const auto &RegAndMask : LiveRegs)
+    Total.inc(RegAndMask.first, LaneBitmask::getNone(), RegAndMask.second, MRI);
+  return Total;
+}
+
+void printLivesAt(SlotIndex SI,
+                  const LiveIntervals &LIS,
+                  const MachineRegisterInfo &MRI);
+
+} // End namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index e714eeffad008ee1210a84767a82549765c2c254..ea305a92fc60df4829cfc3da558e1c78ca72d8f2 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -18,6 +18,7 @@
 #include "SIMachineFunctionInfo.h"
 #include "SIRegisterInfo.h"
 #include "llvm/CodeGen/RegisterClassInfo.h"
+#include "llvm/Support/MathExtras.h"
 
 #define DEBUG_TYPE "misched"
 
@@ -25,7 +26,7 @@ using namespace llvm;
 
 GCNMaxOccupancySchedStrategy::GCNMaxOccupancySchedStrategy(
     const MachineSchedContext *C) :
-    GenericScheduler(C) { }
+    GenericScheduler(C), TargetOccupancy(0), MF(nullptr) { }
 
 static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
                             const MachineFunction &MF) {
@@ -35,18 +36,46 @@ static unsigned getMaxWaves(unsigned SGPRs, unsigned VGPRs,
   unsigned MinRegOccupancy = std::min(ST.getOccupancyWithNumSGPRs(SGPRs),
                                       ST.getOccupancyWithNumVGPRs(VGPRs));
   return std::min(MinRegOccupancy,
-                  ST.getOccupancyWithLocalMemSize(MFI->getLDSSize()));
+                  ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
+                                                  *MF.getFunction()));
+}
+
+void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
+  GenericScheduler::initialize(DAG);
+
+  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
+
+  MF = &DAG->MF;
+
+  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
+
+  // FIXME: The margin is needed because some passes that run after
+  // scheduling and before regalloc increase register pressure.
+  const int ErrorMargin = 3;
+
+  SGPRExcessLimit = Context->RegClassInfo
+    ->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass) - ErrorMargin;
+  VGPRExcessLimit = Context->RegClassInfo
+    ->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass) - ErrorMargin;
+  if (TargetOccupancy) { // non-zero only after setTargetOccupancy()
+    SGPRCriticalLimit = ST.getMaxNumSGPRs(TargetOccupancy, true);
+    VGPRCriticalLimit = ST.getMaxNumVGPRs(TargetOccupancy);
+  } else { // no target occupancy set: use the pressure set limits
+    SGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF,
+                                                    SRI->getSGPRPressureSet());
+    VGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF,
+                                                    SRI->getVGPRPressureSet());
+  }
+
+  SGPRCriticalLimit -= ErrorMargin; // the margin applies in both cases
+  VGPRCriticalLimit -= ErrorMargin;
 }
 
 void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU,
                                      bool AtTop, const RegPressureTracker &RPTracker,
                                      const SIRegisterInfo *SRI,
-                                     int SGPRPressure,
-                                     int VGPRPressure,
-                                     int SGPRExcessLimit,
-                                     int VGPRExcessLimit,
-                                     int SGPRCriticalLimit,
-                                     int VGPRCriticalLimit) {
+                                     unsigned SGPRPressure,
+                                     unsigned VGPRPressure) {
 
   Cand.SU = SU;
   Cand.AtTop = AtTop;
@@ -66,8 +95,8 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
     TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
   }
 
-  int NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()];
-  int NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()];
+  unsigned NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()];
+  unsigned NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()];
 
   // If two instructions increase the pressure of different register sets
   // by the same amount, the generic scheduler will prefer to schedule the
@@ -77,7 +106,7 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
   // only for VGPRs or only for SGPRs.
 
   // FIXME: Better heuristics to determine whether to prefer SGPRs or VGPRs.
-  const int MaxVGPRPressureInc = 16;
+  const unsigned MaxVGPRPressureInc = 16;
   bool ShouldTrackVGPRs = VGPRPressure + MaxVGPRPressureInc >= VGPRExcessLimit;
   bool ShouldTrackSGPRs = !ShouldTrackVGPRs && SGPRPressure >= SGPRExcessLimit;
 
@@ -86,11 +115,6 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
   // to increase the likelihood we don't go over the limits.  We should improve
   // the analysis to look through dependencies to find the path with the least
   // register pressure.
-  // FIXME: This is also necessary, because some passes that run after
-  // scheduling and before regalloc increase register pressure.
-  const int ErrorMargin = 3;
-  VGPRExcessLimit -= ErrorMargin;
-  SGPRExcessLimit -= ErrorMargin;
 
   // We only need to update the RPDelata for instructions that increase
   // register pressure.  Instructions that decrease or keep reg pressure
@@ -111,9 +135,6 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
   // register pressure is 'CRITICAL', increading SGPR and VGPR pressure both
   // has the same cost, so we don't need to prefer one over the other.
 
-  VGPRCriticalLimit -= ErrorMargin;
-  SGPRCriticalLimit -= ErrorMargin;
-
   int SGPRDelta = NewSGPRPressure - SGPRCriticalLimit;
   int VGPRDelta = NewVGPRPressure - VGPRCriticalLimit;
 
@@ -134,27 +155,16 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
                                          const CandPolicy &ZonePolicy,
                                          const RegPressureTracker &RPTracker,
                                          SchedCandidate &Cand) {
-  const SISubtarget &ST = DAG->MF.getSubtarget<SISubtarget>();
   const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
   ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
   unsigned SGPRPressure = Pressure[SRI->getSGPRPressureSet()];
   unsigned VGPRPressure = Pressure[SRI->getVGPRPressureSet()];
-  unsigned SGPRExcessLimit =
-      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::SGPR_32RegClass);
-  unsigned VGPRExcessLimit =
-      Context->RegClassInfo->getNumAllocatableRegs(&AMDGPU::VGPR_32RegClass);
-  unsigned MaxWaves = getMaxWaves(SGPRPressure, VGPRPressure, DAG->MF);
-  unsigned SGPRCriticalLimit = SRI->getMaxNumSGPRs(ST, MaxWaves, true);
-  unsigned VGPRCriticalLimit = SRI->getMaxNumVGPRs(MaxWaves);
-
   ReadyQueue &Q = Zone.Available;
   for (SUnit *SU : Q) {
 
     SchedCandidate TryCand(ZonePolicy);
     initCandidate(TryCand, SU, Zone.isTop(), RPTracker, SRI,
-                  SGPRPressure, VGPRPressure,
-                  SGPRExcessLimit, VGPRExcessLimit,
-                  SGPRCriticalLimit, VGPRCriticalLimit);
+                  SGPRPressure, VGPRPressure);
     // Pass SchedBoundary only when comparing nodes from the same boundary.
     SchedBoundary *ZoneArg = Cand.AtTop == TryCand.AtTop ? &Zone : nullptr;
     GenericScheduler::tryCandidate(Cand, TryCand, ZoneArg);
@@ -167,16 +177,6 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
   }
 }
 
-static int getBidirectionalReasonRank(GenericSchedulerBase::CandReason Reason) {
-  switch (Reason) {
-  default:
-    return Reason;
-  case GenericSchedulerBase::RegCritical:
-  case GenericSchedulerBase::RegExcess:
-    return -Reason;
- }
-}
-
 // This function is mostly cut and pasted from
 // GenericScheduler::pickNodeBidirectional()
 SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
@@ -224,9 +224,9 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
   // Pick best from BotCand and TopCand.
   DEBUG(
     dbgs() << "Top Cand: ";
-    traceCandidate(BotCand);
-    dbgs() << "Bot Cand: ";
     traceCandidate(TopCand);
+    dbgs() << "Bot Cand: ";
+    traceCandidate(BotCand);
   );
   SchedCandidate Cand;
   if (TopCand.Reason == BotCand.Reason) {
@@ -249,9 +249,7 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
     } else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
       Cand = BotCand;
     } else {
-      int TopRank = getBidirectionalReasonRank(TopCand.Reason);
-      int BotRank = getBidirectionalReasonRank(BotCand.Reason);
-      if (TopRank > BotRank) {
+      if (BotCand.Reason > TopCand.Reason) {
         Cand = TopCand;
       } else {
         Cand = BotCand;
@@ -310,3 +308,255 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNode(bool &IsTopNode) {
   DEBUG(dbgs() << "Scheduling SU(" << SU->NodeNum << ") " << *SU->getInstr());
   return SU;
 }
+
+GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
+                        std::unique_ptr<MachineSchedStrategy> S) :
+  ScheduleDAGMILive(C, std::move(S)),
+  ST(MF.getSubtarget<SISubtarget>()),
+  MFI(*MF.getInfo<SIMachineFunctionInfo>()),
+  StartingOccupancy(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(),
+                                                    *MF.getFunction())),
+  MinOccupancy(StartingOccupancy), Stage(0) { // Stage 0 is the first pass
+
+  DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
+}
+
+void GCNScheduleDAGMILive::schedule() {
+  std::vector<MachineInstr*> Unsched; // pre-scheduling order, for a revert
+  Unsched.reserve(NumRegionInstrs);
+  for (auto &I : *this)
+    Unsched.push_back(&I);
+
+  std::pair<unsigned, unsigned> PressureBefore;
+  if (LIS) {
+    DEBUG(dbgs() << "Pressure before scheduling:\n");
+    discoverLiveIns();
+    PressureBefore = getRealRegPressure();
+  }
+
+  ScheduleDAGMILive::schedule();
+  if (Stage == 0) // regions are recorded on the first pass only
+    Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
+
+  if (!LIS)
+    return; // no liveness info: the result cannot be checked
+
+  // Check the results of scheduling.
+  GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
+  DEBUG(dbgs() << "Pressure after scheduling:\n");
+  auto PressureAfter = getRealRegPressure();
+  LiveIns.clear();
+
+  if (PressureAfter.first <= S.SGPRCriticalLimit &&
+      PressureAfter.second <= S.VGPRCriticalLimit) {
+    DEBUG(dbgs() << "Pressure in desired limits, done.\n");
+    return;
+  }
+  unsigned WavesAfter = getMaxWaves(PressureAfter.first,
+                                    PressureAfter.second, MF);
+  unsigned WavesBefore = getMaxWaves(PressureBefore.first,
+                                      PressureBefore.second, MF);
+  DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore <<
+                  ", after " << WavesAfter << ".\n");
+
+  // We could not keep the current target occupancy because of the region just
+  // scheduled. Record the new occupancy for the next scheduling cycle.
+  unsigned NewOccupancy = std::max(WavesAfter, WavesBefore);
+  if (NewOccupancy < MinOccupancy) {
+    MinOccupancy = NewOccupancy;
+    DEBUG(dbgs() << "Occupancy lowered for the function to "
+                 << MinOccupancy << ".\n");
+  }
+
+  if (WavesAfter >= WavesBefore)
+    return; // occupancy did not drop: keep the new schedule
+
+  DEBUG(dbgs() << "Attempting to revert scheduling.\n");
+  RegionEnd = RegionBegin; // re-place instructions in their original order
+  for (MachineInstr *MI : Unsched) {
+    if (MI->getIterator() != RegionEnd) {
+      BB->remove(MI);
+      BB->insert(RegionEnd, MI);
+      LIS->handleMove(*MI, true);
+    }
+    // Reset read-undef flags and update them later.
+    for (auto &Op : MI->operands())
+      if (Op.isReg() && Op.isDef())
+        Op.setIsUndef(false);
+    RegisterOperands RegOpers;
+    RegOpers.collect(*MI, *TRI, MRI, ShouldTrackLaneMasks, false);
+    if (ShouldTrackLaneMasks) {
+      // Adjust liveness and add missing dead+read-undef flags.
+      SlotIndex SlotIdx = LIS->getInstructionIndex(*MI).getRegSlot();
+      RegOpers.adjustLaneLiveness(*LIS, MRI, SlotIdx, MI);
+    } else {
+      // Adjust for missing dead-def flags.
+      RegOpers.detectDeadDefs(*MI, *LIS);
+    }
+    RegionEnd = MI->getIterator();
+    ++RegionEnd;
+    DEBUG(dbgs() << "Scheduling " << *MI);
+  }
+  RegionBegin = Unsched.front()->getIterator();
+  if (Stage == 0) // keep the recorded region in sync with the reverted order
+    Regions.back() = std::make_pair(RegionBegin, RegionEnd);
+
+  placeDebugValues();
+}
+
+static inline void setMask(const MachineRegisterInfo &MRI,
+                           const SIRegisterInfo *SRI, unsigned Reg,
+                           LaneBitmask &PrevMask, LaneBitmask NewMask,
+                           unsigned &SGPRs, unsigned &VGPRs) {
+  int NewRegs = countPopulation(NewMask.getAsInteger()) - // lane count delta,
+                countPopulation(PrevMask.getAsInteger()); // negative on shrink
+  if (SRI->isSGPRReg(MRI, Reg))
+    SGPRs += NewRegs;
+  if (SRI->isVGPR(MRI, Reg))
+    VGPRs += NewRegs;
+  assert ((int)SGPRs >= 0 && (int)VGPRs >= 0);
+  PrevMask = NewMask; // PrevMask is an in/out parameter: store the new mask
+}
+
+void GCNScheduleDAGMILive::discoverLiveIns() {
+  unsigned SGPRs = 0; // live-in lane counts, accumulated by setMask()
+  unsigned VGPRs = 0;
+
+  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
+  SlotIndex SI = LIS->getInstructionIndex(*begin()).getBaseIndex();
+  assert (SI.isValid());
+
+  DEBUG(dbgs() << "Region live-ins:");
+  for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) {
+    unsigned Reg = TargetRegisterInfo::index2VirtReg(I);
+    if (MRI.reg_nodbg_empty(Reg))
+      continue;
+    const LiveInterval &LI = LIS->getInterval(Reg);
+    LaneBitmask LaneMask = LaneBitmask::getNone();
+    if (LI.hasSubRanges()) { // union the lanes of the subranges live at SI
+      for (const auto &S : LI.subranges())
+        if (S.liveAt(SI))
+          LaneMask |= S.LaneMask;
+    } else if (LI.liveAt(SI)) { // no subranges: the whole register is live
+      LaneMask = MRI.getMaxLaneMaskForVReg(Reg);
+    }
+
+    if (LaneMask.any()) {
+      setMask(MRI, SRI, Reg, LiveIns[Reg], LaneMask, SGPRs, VGPRs);
+
+      DEBUG(dbgs() << ' ' << PrintVRegOrUnit(Reg, SRI) << ':'
+                   << PrintLaneMask(LiveIns[Reg]));
+    }
+  }
+
+  LiveInPressure = std::make_pair(SGPRs, VGPRs); // first SGPR, then VGPR
+
+  DEBUG(dbgs() << "\nLive-in pressure:\nSGPR = " << SGPRs
+               << "\nVGPR = " << VGPRs << '\n');
+}
+
+std::pair<unsigned, unsigned>
+GCNScheduleDAGMILive::getRealRegPressure() const {
+  unsigned SGPRs, MaxSGPRs, VGPRs, MaxVGPRs;
+  SGPRs = MaxSGPRs = LiveInPressure.first; // start from the region live-ins
+  VGPRs = MaxVGPRs = LiveInPressure.second;
+
+  const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
+  DenseMap<unsigned, LaneBitmask> LiveRegs(LiveIns); // working copy
+
+  for (const MachineInstr &MI : *this) {
+    if (MI.isDebugValue())
+      continue;
+    SlotIndex SI = LIS->getInstructionIndex(MI).getBaseIndex();
+    assert (SI.isValid());
+
+    // Remove dead registers or mask bits.
+    for (auto &It : LiveRegs) {
+      if (It.second.none())
+        continue; // no live lanes left for this register
+      const LiveInterval &LI = LIS->getInterval(It.first);
+      if (LI.hasSubRanges()) {
+        for (const auto &S : LI.subranges())
+          if (!S.liveAt(SI))
+            setMask(MRI, SRI, It.first, It.second, It.second & ~S.LaneMask,
+                    SGPRs, VGPRs);
+      } else if (!LI.liveAt(SI)) {
+        setMask(MRI, SRI, It.first, It.second, LaneBitmask::getNone(),
+                SGPRs, VGPRs);
+      }
+    }
+
+    // Add new registers or mask bits.
+    for (const auto &MO : MI.defs()) {
+      if (!MO.isReg())
+        continue;
+      unsigned Reg = MO.getReg();
+      if (!TargetRegisterInfo::isVirtualRegister(Reg))
+        continue;
+      unsigned SubRegIdx = MO.getSubReg();
+      LaneBitmask LaneMask = SubRegIdx != 0
+                             ? TRI->getSubRegIndexLaneMask(SubRegIdx)
+                             : MRI.getMaxLaneMaskForVReg(Reg);
+      LaneBitmask &LM = LiveRegs[Reg];
+      setMask(MRI, SRI, Reg, LM, LM | LaneMask, SGPRs, VGPRs);
+    }
+    MaxSGPRs = std::max(MaxSGPRs, SGPRs); // track the peak after each MI
+    MaxVGPRs = std::max(MaxVGPRs, VGPRs);
+  }
+
+  DEBUG(dbgs() << "Real region's register pressure:\nSGPR = " << MaxSGPRs
+               << "\nVGPR = " << MaxVGPRs << '\n');
+
+  return std::make_pair(MaxSGPRs, MaxVGPRs);
+}
+
+void GCNScheduleDAGMILive::finalizeSchedule() {
+  // Retry function scheduling if the resulting occupancy we found is lower
+  // than the one used for first pass scheduling. This will give more freedom
+  // to schedule low register pressure blocks.
+  // Code is partially copied from MachineSchedulerBase::scheduleRegions().
+
+  if (!LIS || StartingOccupancy <= MinOccupancy)
+    return;
+
+  DEBUG(dbgs() << "Retrying function scheduling with lowest recorded occupancy "
+               << MinOccupancy << ".\n");
+
+  Stage++; // schedule() records regions only while Stage == 0
+  GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl;
+  S.setTargetOccupancy(MinOccupancy);
+
+  MachineBasicBlock *MBB = nullptr;
+  for (auto Region : Regions) {
+    RegionBegin = Region.first;
+    RegionEnd = Region.second;
+
+    if (RegionBegin->getParent() != MBB) { // entering a different basic block
+      if (MBB) finishBlock();
+      MBB = RegionBegin->getParent();
+      startBlock(MBB);
+    }
+
+    unsigned NumRegionInstrs = std::distance(begin(), end());
+    enterRegion(MBB, begin(), end(), NumRegionInstrs);
+
+    // Skip empty scheduling regions (0 or 1 schedulable instructions).
+    if (begin() == end() || begin() == std::prev(end())) {
+      exitRegion();
+      continue;
+    }
+    DEBUG(dbgs() << "********** MI Scheduling **********\n");
+    DEBUG(dbgs() << MF.getName()
+          << ":BB#" << MBB->getNumber() << " " << MBB->getName()
+          << "\n  From: " << *begin() << "    To: ";
+          if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
+          else dbgs() << "End";
+          dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+
+    schedule();
+
+    exitRegion();
+  }
+  finishBlock();
+  LiveIns.shrink_and_clear();
+}
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h
index 4cfc0cea81fb4a77a2446b73775927892e2f18d7..15af232704ffa091dae67ec27581e8b3a06b7555 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -18,13 +18,16 @@
 
 namespace llvm {
 
+class SIMachineFunctionInfo;
 class SIRegisterInfo;
+class SISubtarget;
 
 /// This is a minimal scheduler strategy.  The main difference between this
 /// and the GenericScheduler is that GCNSchedStrategy uses different
 /// heuristics to determine excess/critical pressure sets.  Its goal is to
 /// maximize kernel occupancy (i.e. maximum number of waves per simd).
 class GCNMaxOccupancySchedStrategy : public GenericScheduler {
+  friend class GCNScheduleDAGMILive;
 
   SUnit *pickNodeBidirectional(bool &IsTopNode);
 
@@ -35,18 +38,65 @@ class GCNMaxOccupancySchedStrategy : public GenericScheduler {
   void initCandidate(SchedCandidate &Cand, SUnit *SU,
                      bool AtTop, const RegPressureTracker &RPTracker,
                      const SIRegisterInfo *SRI,
-                     int SGPRPressure, int VGPRPressure,
-                     int SGPRExcessLimit, int VGPRExcessLimit,
-                     int SGPRCriticalLimit, int VGPRCriticalLimit);
+                     unsigned SGPRPressure, unsigned VGPRPressure);
 
-  void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand,
-                    SchedBoundary *Zone, const SIRegisterInfo *SRI,
-                    unsigned SGPRPressure, unsigned VGPRPressure);
+  unsigned SGPRExcessLimit;
+  unsigned VGPRExcessLimit;
+  unsigned SGPRCriticalLimit;
+  unsigned VGPRCriticalLimit;
+
+  unsigned TargetOccupancy;
+
+  MachineFunction *MF;
 
 public:
   GCNMaxOccupancySchedStrategy(const MachineSchedContext *C);
 
   SUnit *pickNode(bool &IsTopNode) override;
+
+  void initialize(ScheduleDAGMI *DAG) override;
+
+  void setTargetOccupancy(unsigned Occ) { TargetOccupancy = Occ; }
+};
+
+class GCNScheduleDAGMILive : public ScheduleDAGMILive {
+
+  const SISubtarget &ST;
+
+  const SIMachineFunctionInfo &MFI;
+
+  // Occupancy target at the beginning of function scheduling cycle.
+  unsigned StartingOccupancy;
+
+  // Minimal real occupancy recorded for the function.
+  unsigned MinOccupancy;
+
+  // Scheduling stage number.
+  unsigned Stage;
+
+  // Vector of regions recorded for later rescheduling.
+  SmallVector<std::pair<MachineBasicBlock::iterator,
+                        MachineBasicBlock::iterator>, 32> Regions;
+
+  // Region live-ins.
+  DenseMap<unsigned, LaneBitmask> LiveIns;
+
+  // Number of live-ins to the current region, first SGPR then VGPR.
+  std::pair<unsigned, unsigned> LiveInPressure;
+
+  // Collect current region live-ins.
+  void discoverLiveIns();
+
+  // Return current region pressure. First value is SGPR number, second is VGPR.
+  std::pair<unsigned, unsigned> getRealRegPressure() const;
+
+public:
+  GCNScheduleDAGMILive(MachineSchedContext *C,
+                       std::unique_ptr<MachineSchedStrategy> S);
+
+  void schedule() override;
+
+  void finalizeSchedule() override;
 };
 
 } // End namespace llvm
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
index 7172a0aa7167b4fb82207981625026d2882dc2c1..a817ff3cbaf09697f35938a8bf5958c878486a17 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.cpp
@@ -113,7 +113,7 @@ void AMDGPUInstPrinter::printOffset(const MCInst *MI, unsigned OpNo,
                                     raw_ostream &O) {
   uint16_t Imm = MI->getOperand(OpNo).getImm();
   if (Imm != 0) {
-    O << " offset:";
+    O << ((OpNo == 0)? "offset:" : " offset:");
     printU16ImmDecOperand(MI, OpNo, O);
   }
 }
@@ -375,6 +375,14 @@ void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
     O << formatHex(static_cast<uint64_t>(Imm));
 }
 
+void AMDGPUInstPrinter::printImmediateV216(uint32_t Imm,
+                                           const MCSubtargetInfo &STI,
+                                           raw_ostream &O) {
+  uint16_t Lo16 = static_cast<uint16_t>(Imm);
+  assert(Lo16 == static_cast<uint16_t>(Imm >> 16));
+  printImmediate16(Lo16, STI, O);
+}
+
 void AMDGPUInstPrinter::printImmediate32(uint32_t Imm,
                                          const MCSubtargetInfo &STI,
                                          raw_ostream &O) {
@@ -489,6 +497,10 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
     case AMDGPU::OPERAND_REG_IMM_FP16:
       printImmediate16(Op.getImm(), STI, O);
       break;
+    case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+    case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+      printImmediateV216(Op.getImm(), STI, O);
+      break;
     case MCOI::OPERAND_UNKNOWN:
     case MCOI::OPERAND_PCREL:
       O << formatDec(Op.getImm());
@@ -531,13 +543,34 @@ void AMDGPUInstPrinter::printOperandAndFPInputMods(const MCInst *MI,
                                                    const MCSubtargetInfo &STI,
                                                    raw_ostream &O) {
   unsigned InputModifiers = MI->getOperand(OpNo).getImm();
-  if (InputModifiers & SISrcMods::NEG)
-    O << '-';
+
+  // Use 'neg(...)' instead of '-' to avoid ambiguity.
+  // This is important for integer literals because
+  // -1 is not the same value as neg(1).
+  bool NegMnemo = false;
+
+  if (InputModifiers & SISrcMods::NEG) {
+    if (OpNo + 1 < MI->getNumOperands() &&
+        (InputModifiers & SISrcMods::ABS) == 0) {
+      const MCOperand &Op = MI->getOperand(OpNo + 1);
+      NegMnemo = Op.isImm() || Op.isFPImm();
+    }
+    if (NegMnemo) {
+      O << "neg(";
+    } else {
+      O << '-';
+    }
+  }
+
   if (InputModifiers & SISrcMods::ABS)
     O << '|';
   printOperand(MI, OpNo + 1, STI, O);
   if (InputModifiers & SISrcMods::ABS)
     O << '|';
+
+  if (NegMnemo) {
+    O << ')';
+  }
 }
 
 void AMDGPUInstPrinter::printOperandAndIntInputMods(const MCInst *MI,
@@ -672,11 +705,19 @@ template <unsigned N>
 void AMDGPUInstPrinter::printExpSrcN(const MCInst *MI, unsigned OpNo,
                                      const MCSubtargetInfo &STI,
                                      raw_ostream &O) {
-  int EnIdx = AMDGPU::getNamedOperandIdx(MI->getOpcode(), AMDGPU::OpName::en);
+  unsigned Opc = MI->getOpcode();
+  int EnIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::en);
   unsigned En = MI->getOperand(EnIdx).getImm();
 
-  // FIXME: What do we do with compr? The meaning of en changes depending on if
-  // compr is set.
+  int ComprIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::compr);
+
+  // If compr is set, print as src0, src0, src1, src1
+  if (MI->getOperand(ComprIdx).getImm()) {
+    if (N == 1 || N == 2)
+      --OpNo;
+    else if (N == 3)
+      OpNo -= 2;
+  }
 
   if (En & (1 << N))
     printRegOperand(MI->getOperand(OpNo).getReg(), O, MRI);
@@ -730,6 +771,71 @@ void AMDGPUInstPrinter::printExpTgt(const MCInst *MI, unsigned OpNo,
   }
 }
 
+static bool allOpsDefaultValue(const int* Ops, int NumOps, int Mod) {
+  int DefaultValue = (Mod == SISrcMods::OP_SEL_1);
+
+  for (int I = 0; I < NumOps; ++I) {
+    if (!!(Ops[I] & Mod) != DefaultValue)
+      return false;
+  }
+
+  return true;
+}
+
+static void printPackedModifier(const MCInst *MI, StringRef Name, unsigned Mod,
+                                raw_ostream &O) {
+  unsigned Opc = MI->getOpcode();
+  int NumOps = 0;
+  int Ops[3];
+
+  for (int OpName : { AMDGPU::OpName::src0_modifiers,
+                      AMDGPU::OpName::src1_modifiers,
+                      AMDGPU::OpName::src2_modifiers }) {
+    int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName);
+    if (Idx == -1)
+      break;
+
+    Ops[NumOps++] = MI->getOperand(Idx).getImm();
+  }
+
+  if (allOpsDefaultValue(Ops, NumOps, Mod))
+    return;
+
+  O << Name;
+  for (int I = 0; I < NumOps; ++I) {
+    if (I != 0)
+      O << ',';
+
+    O << !!(Ops[I] & Mod);
+  }
+
+  O << ']';
+}
+
+void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned,
+                                   const MCSubtargetInfo &STI,
+                                   raw_ostream &O) {
+  printPackedModifier(MI, " op_sel:[", SISrcMods::OP_SEL_0, O);
+}
+
+void AMDGPUInstPrinter::printOpSelHi(const MCInst *MI, unsigned OpNo,
+                                     const MCSubtargetInfo &STI,
+                                     raw_ostream &O) {
+  printPackedModifier(MI, " op_sel_hi:[", SISrcMods::OP_SEL_1, O);
+}
+
+void AMDGPUInstPrinter::printNegLo(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI,
+                                   raw_ostream &O) {
+  printPackedModifier(MI, " neg_lo:[", SISrcMods::NEG, O);
+}
+
+void AMDGPUInstPrinter::printNegHi(const MCInst *MI, unsigned OpNo,
+                                   const MCSubtargetInfo &STI,
+                                   raw_ostream &O) {
+  printPackedModifier(MI, " neg_hi:[", SISrcMods::NEG_HI, O);
+}
+
 void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum,
                                         const MCSubtargetInfo &STI,
                                         raw_ostream &O) {
@@ -1057,27 +1163,28 @@ void AMDGPUInstPrinter::printSendMsg(const MCInst *MI, unsigned OpNo,
 void AMDGPUInstPrinter::printWaitFlag(const MCInst *MI, unsigned OpNo,
                                       const MCSubtargetInfo &STI,
                                       raw_ostream &O) {
-  IsaVersion IV = getIsaVersion(STI.getFeatureBits());
+  AMDGPU::IsaInfo::IsaVersion ISA =
+      AMDGPU::IsaInfo::getIsaVersion(STI.getFeatureBits());
 
   unsigned SImm16 = MI->getOperand(OpNo).getImm();
   unsigned Vmcnt, Expcnt, Lgkmcnt;
-  decodeWaitcnt(IV, SImm16, Vmcnt, Expcnt, Lgkmcnt);
+  decodeWaitcnt(ISA, SImm16, Vmcnt, Expcnt, Lgkmcnt);
 
   bool NeedSpace = false;
 
-  if (Vmcnt != getVmcntBitMask(IV)) {
+  if (Vmcnt != getVmcntBitMask(ISA)) {
     O << "vmcnt(" << Vmcnt << ')';
     NeedSpace = true;
   }
 
-  if (Expcnt != getExpcntBitMask(IV)) {
+  if (Expcnt != getExpcntBitMask(ISA)) {
     if (NeedSpace)
       O << ' ';
     O << "expcnt(" << Expcnt << ')';
     NeedSpace = true;
   }
 
-  if (Lgkmcnt != getLgkmcntBitMask(IV)) {
+  if (Lgkmcnt != getLgkmcntBitMask(ISA)) {
     if (NeedSpace)
       O << ' ';
     O << "lgkmcnt(" << Lgkmcnt << ')';
diff --git a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
index a6d348ff0f1211d832d665d19c54ba41ce9c97a7..c0b8e5c510893be223ddf280ffe1fff581b01d7d 100644
--- a/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
+++ b/lib/Target/AMDGPU/InstPrinter/AMDGPUInstPrinter.h
@@ -90,6 +90,8 @@ private:
                    raw_ostream &O);
   void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
                         raw_ostream &O);
+  void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI,
+                          raw_ostream &O);
   void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
                         raw_ostream &O);
   void printImmediate64(uint64_t Imm, const MCSubtargetInfo &STI,
@@ -117,6 +119,14 @@ private:
                         const MCSubtargetInfo &STI, raw_ostream &O);
   void printSDWADstUnused(const MCInst *MI, unsigned OpNo,
                           const MCSubtargetInfo &STI, raw_ostream &O);
+  void printOpSel(const MCInst *MI, unsigned OpNo,
+                  const MCSubtargetInfo &STI, raw_ostream &O);
+  void printOpSelHi(const MCInst *MI, unsigned OpNo,
+                  const MCSubtargetInfo &STI, raw_ostream &O);
+  void printNegLo(const MCInst *MI, unsigned OpNo,
+                  const MCSubtargetInfo &STI, raw_ostream &O);
+  void printNegHi(const MCInst *MI, unsigned OpNo,
+                  const MCSubtargetInfo &STI, raw_ostream &O);
   void printInterpSlot(const MCInst *MI, unsigned OpNo,
                        const MCSubtargetInfo &STI, raw_ostream &O);
   void printInterpAttr(const MCInst *MI, unsigned OpNo,
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index ffb92aae599e0670dcbcc6c9764da99bceed6031..f3266fe82955c5a349ea0378a5036643ab46d39b 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -37,7 +37,7 @@ public:
                          bool &IsResolved) override;
 
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value, bool IsPCRel) const override;
+                  uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
   bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
                             const MCRelaxableFragment *DF,
                             const MCAsmLayout &Layout) const override {
@@ -131,7 +131,7 @@ void AMDGPUAsmBackend::processFixupValue(const MCAssembler &Asm,
 
 void AMDGPUAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
                                   unsigned DataSize, uint64_t Value,
-                                  bool IsPCRel) const {
+                                  bool IsPCRel, MCContext &Ctx) const {
   if (!Value)
     return; // Doesn't change encoding.
 
@@ -164,7 +164,20 @@ const MCFixupKindInfo &AMDGPUAsmBackend::getFixupKindInfo(
 }
 
 bool AMDGPUAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
-  OW->WriteZeros(Count);
+  // If the count is not 4-byte aligned, we must be writing data into the text
+  // section (otherwise we have unaligned instructions, and thus have far
+  // bigger problems), so just write zeros instead.
+  OW->WriteZeros(Count % 4);
+
+  // We are properly aligned, so write NOPs as requested.
+  Count /= 4;
+
+  // FIXME: R600 support.
+  // s_nop 0
+  const uint32_t Encoded_S_NOP_0 = 0xbf800000;
+
+  for (uint64_t I = 0; I != Count; ++I)
+    OW->write32(Encoded_S_NOP_0);
 
   return true;
 }
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadata.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadata.h
new file mode 100644
index 0000000000000000000000000000000000000000..816e8c744b27d1ba9a5c71b1ec0f36cb7ae02141
--- /dev/null
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadata.h
@@ -0,0 +1,422 @@
+//===--- AMDGPUCodeObjectMetadata.h -----------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Code Object Metadata definitions and in-memory
+/// representations.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATA_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATA_H
+
+#include <cstdint>
+#include <string>
+#include <system_error>
+#include <vector>
+
+namespace llvm {
+namespace AMDGPU {
+
+//===----------------------------------------------------------------------===//
+// Code Object Metadata.
+//===----------------------------------------------------------------------===//
+namespace CodeObject {
+
+/// \brief Code object metadata major version.
+constexpr uint32_t MetadataVersionMajor = 1;
+/// \brief Code object metadata minor version.
+constexpr uint32_t MetadataVersionMinor = 0;
+
+/// \brief Code object metadata beginning assembler directive.
+constexpr char MetadataAssemblerDirectiveBegin[] =
+    ".amdgpu_code_object_metadata";
+/// \brief Code object metadata ending assembler directive.
+constexpr char MetadataAssemblerDirectiveEnd[] =
+    ".end_amdgpu_code_object_metadata";
+
+/// \brief Access qualifiers.
+enum class AccessQualifier : uint8_t {
+  Default   = 0,
+  ReadOnly  = 1,
+  WriteOnly = 2,
+  ReadWrite = 3,
+  Unknown   = 0xff
+};
+
+/// \brief Address space qualifiers.
+enum class AddressSpaceQualifier : uint8_t {
+  Private  = 0,
+  Global   = 1,
+  Constant = 2,
+  Local    = 3,
+  Generic  = 4,
+  Region   = 5,
+  Unknown  = 0xff
+};
+
+/// \brief Value kinds.
+enum class ValueKind : uint8_t {
+  ByValue                = 0,
+  GlobalBuffer           = 1,
+  DynamicSharedPointer   = 2,
+  Sampler                = 3,
+  Image                  = 4,
+  Pipe                   = 5,
+  Queue                  = 6,
+  HiddenGlobalOffsetX    = 7,
+  HiddenGlobalOffsetY    = 8,
+  HiddenGlobalOffsetZ    = 9,
+  HiddenNone             = 10,
+  HiddenPrintfBuffer     = 11,
+  HiddenDefaultQueue     = 12,
+  HiddenCompletionAction = 13,
+  Unknown                = 0xff
+};
+
+/// \brief Value types.
+enum class ValueType : uint8_t {
+  Struct  = 0,
+  I8      = 1,
+  U8      = 2,
+  I16     = 3,
+  U16     = 4,
+  F16     = 5,
+  I32     = 6,
+  U32     = 7,
+  F32     = 8,
+  I64     = 9,
+  U64     = 10,
+  F64     = 11,
+  Unknown = 0xff
+};
+
+//===----------------------------------------------------------------------===//
+// Kernel Metadata.
+//===----------------------------------------------------------------------===//
+namespace Kernel {
+
+//===----------------------------------------------------------------------===//
+// Kernel Attributes Metadata.
+//===----------------------------------------------------------------------===//
+namespace Attrs {
+
+namespace Key {
+/// \brief Key for Kernel::Attrs::Metadata::mReqdWorkGroupSize.
+constexpr char ReqdWorkGroupSize[] = "ReqdWorkGroupSize";
+/// \brief Key for Kernel::Attrs::Metadata::mWorkGroupSizeHint.
+constexpr char WorkGroupSizeHint[] = "WorkGroupSizeHint";
+/// \brief Key for Kernel::Attrs::Metadata::mVecTypeHint.
+constexpr char VecTypeHint[] = "VecTypeHint";
+} // end namespace Key
+
+/// \brief In-memory representation of kernel attributes metadata.
+struct Metadata final {
+  /// \brief 'reqd_work_group_size' attribute. Optional.
+  std::vector<uint32_t> mReqdWorkGroupSize = std::vector<uint32_t>();
+  /// \brief 'work_group_size_hint' attribute. Optional.
+  std::vector<uint32_t> mWorkGroupSizeHint = std::vector<uint32_t>();
+  /// \brief 'vec_type_hint' attribute. Optional.
+  std::string mVecTypeHint = std::string();
+
+  /// \brief Default constructor.
+  Metadata() = default;
+
+  /// \returns True if kernel attributes metadata is empty, false otherwise.
+  bool empty() const {
+    return mReqdWorkGroupSize.empty() &&
+           mWorkGroupSizeHint.empty() &&
+           mVecTypeHint.empty();
+  }
+
+  /// \returns True if kernel attributes metadata is not empty, false otherwise.
+  bool notEmpty() const {
+    return !empty();
+  }
+};
+
+} // end namespace Attrs
+
+//===----------------------------------------------------------------------===//
+// Kernel Argument Metadata.
+//===----------------------------------------------------------------------===//
+namespace Arg {
+
+namespace Key {
+/// \brief Key for Kernel::Arg::Metadata::mSize.
+constexpr char Size[] = "Size";
+/// \brief Key for Kernel::Arg::Metadata::mAlign.
+constexpr char Align[] = "Align";
+/// \brief Key for Kernel::Arg::Metadata::mValueKind.
+constexpr char ValueKind[] = "ValueKind";
+/// \brief Key for Kernel::Arg::Metadata::mValueType.
+constexpr char ValueType[] = "ValueType";
+/// \brief Key for Kernel::Arg::Metadata::mPointeeAlign.
+constexpr char PointeeAlign[] = "PointeeAlign";
+/// \brief Key for Kernel::Arg::Metadata::mAccQual.
+constexpr char AccQual[] = "AccQual";
+/// \brief Key for Kernel::Arg::Metadata::mAddrSpaceQual.
+constexpr char AddrSpaceQual[] = "AddrSpaceQual";
+/// \brief Key for Kernel::Arg::Metadata::mIsConst.
+constexpr char IsConst[] = "IsConst";
+/// \brief Key for Kernel::Arg::Metadata::mIsPipe.
+constexpr char IsPipe[] = "IsPipe";
+/// \brief Key for Kernel::Arg::Metadata::mIsRestrict.
+constexpr char IsRestrict[] = "IsRestrict";
+/// \brief Key for Kernel::Arg::Metadata::mIsVolatile.
+constexpr char IsVolatile[] = "IsVolatile";
+/// \brief Key for Kernel::Arg::Metadata::mName.
+constexpr char Name[] = "Name";
+/// \brief Key for Kernel::Arg::Metadata::mTypeName.
+constexpr char TypeName[] = "TypeName";
+} // end namespace Key
+
+/// \brief In-memory representation of kernel argument metadata.
+struct Metadata final {
+  /// \brief Size in bytes. Required.
+  uint32_t mSize = 0;
+  /// \brief Alignment in bytes. Required.
+  uint32_t mAlign = 0;
+  /// \brief Value kind. Required.
+  ValueKind mValueKind = ValueKind::Unknown;
+  /// \brief Value type. Required.
+  ValueType mValueType = ValueType::Unknown;
+  /// \brief Pointee alignment in bytes. Optional.
+  uint32_t mPointeeAlign = 0;
+  /// \brief Access qualifier. Optional.
+  AccessQualifier mAccQual = AccessQualifier::Unknown;
+  /// \brief Address space qualifier. Optional.
+  AddressSpaceQualifier mAddrSpaceQual = AddressSpaceQualifier::Unknown;
+  /// \brief True if 'const' qualifier is specified. Optional.
+  bool mIsConst = false;
+  /// \brief True if 'pipe' qualifier is specified. Optional.
+  bool mIsPipe = false;
+  /// \brief True if 'restrict' qualifier is specified. Optional.
+  bool mIsRestrict = false;
+  /// \brief True if 'volatile' qualifier is specified. Optional.
+  bool mIsVolatile = false;
+  /// \brief Name. Optional.
+  std::string mName = std::string();
+  /// \brief Type name. Optional.
+  std::string mTypeName = std::string();
+
+  /// \brief Default constructor.
+  Metadata() = default;
+};
+
+} // end namespace Arg
+
+//===----------------------------------------------------------------------===//
+// Kernel Code Properties Metadata.
+//===----------------------------------------------------------------------===//
+namespace CodeProps {
+
+namespace Key {
+/// \brief Key for Kernel::CodeProps::Metadata::mKernargSegmentSize.
+constexpr char KernargSegmentSize[] = "KernargSegmentSize";
+/// \brief Key for Kernel::CodeProps::Metadata::mWorkgroupGroupSegmentSize.
+constexpr char WorkgroupGroupSegmentSize[] = "WorkgroupGroupSegmentSize";
+/// \brief Key for Kernel::CodeProps::Metadata::mWorkitemPrivateSegmentSize.
+constexpr char WorkitemPrivateSegmentSize[] = "WorkitemPrivateSegmentSize";
+/// \brief Key for Kernel::CodeProps::Metadata::mWavefrontNumSGPRs.
+constexpr char WavefrontNumSGPRs[] = "WavefrontNumSGPRs";
+/// \brief Key for Kernel::CodeProps::Metadata::mWorkitemNumVGPRs.
+constexpr char WorkitemNumVGPRs[] = "WorkitemNumVGPRs";
+/// \brief Key for Kernel::CodeProps::Metadata::mKernargSegmentAlign.
+constexpr char KernargSegmentAlign[] = "KernargSegmentAlign";
+/// \brief Key for Kernel::CodeProps::Metadata::mGroupSegmentAlign.
+constexpr char GroupSegmentAlign[] = "GroupSegmentAlign";
+/// \brief Key for Kernel::CodeProps::Metadata::mPrivateSegmentAlign.
+constexpr char PrivateSegmentAlign[] = "PrivateSegmentAlign";
+/// \brief Key for Kernel::CodeProps::Metadata::mWavefrontSize.
+constexpr char WavefrontSize[] = "WavefrontSize";
+} // end namespace Key
+
+/// \brief In-memory representation of kernel code properties metadata.
+struct Metadata final {
+  /// \brief Size in bytes of the kernarg segment memory. Kernarg segment memory
+  /// holds the values of the arguments to the kernel. Optional.
+  uint64_t mKernargSegmentSize = 0;
+  /// \brief Size in bytes of the group segment memory required by a workgroup.
+  /// This value does not include any dynamically allocated group segment memory
+  /// that may be added when the kernel is dispatched. Optional.
+  uint32_t mWorkgroupGroupSegmentSize = 0;
+  /// \brief Size in bytes of the private segment memory required by a workitem.
+  /// Private segment memory includes arg, spill and private segments. Optional.
+  uint32_t mWorkitemPrivateSegmentSize = 0;
+  /// \brief Total number of SGPRs used by a wavefront. Optional.
+  uint16_t mWavefrontNumSGPRs = 0;
+  /// \brief Total number of VGPRs used by a workitem. Optional.
+  uint16_t mWorkitemNumVGPRs = 0;
+  /// \brief Maximum byte alignment of variables used by the kernel in the
+  /// kernarg memory segment. Expressed as a power of two. Optional.
+  uint8_t mKernargSegmentAlign = 0;
+  /// \brief Maximum byte alignment of variables used by the kernel in the
+  /// group memory segment. Expressed as a power of two. Optional.
+  uint8_t mGroupSegmentAlign = 0;
+  /// \brief Maximum byte alignment of variables used by the kernel in the
+  /// private memory segment. Expressed as a power of two. Optional.
+  uint8_t mPrivateSegmentAlign = 0;
+  /// \brief Wavefront size. Expressed as a power of two. Optional.
+  uint8_t mWavefrontSize = 0;
+
+  /// \brief Default constructor.
+  Metadata() = default;
+
+  /// \returns True if kernel code properties metadata is empty, false
+  /// otherwise.
+  bool empty() const {
+    return !notEmpty();
+  }
+
+  /// \returns True if kernel code properties metadata is not empty, false
+  /// otherwise.
+  bool notEmpty() const {
+    return mKernargSegmentSize || mWorkgroupGroupSegmentSize ||
+           mWorkitemPrivateSegmentSize || mWavefrontNumSGPRs ||
+           mWorkitemNumVGPRs || mKernargSegmentAlign || mGroupSegmentAlign ||
+           mPrivateSegmentAlign || mWavefrontSize;
+  }
+};
+
+} // end namespace CodeProps
+
+//===----------------------------------------------------------------------===//
+// Kernel Debug Properties Metadata.
+//===----------------------------------------------------------------------===//
+namespace DebugProps {
+
+namespace Key {
+/// \brief Key for Kernel::DebugProps::Metadata::mDebuggerABIVersion.
+constexpr char DebuggerABIVersion[] = "DebuggerABIVersion";
+/// \brief Key for Kernel::DebugProps::Metadata::mReservedNumVGPRs.
+constexpr char ReservedNumVGPRs[] = "ReservedNumVGPRs";
+/// \brief Key for Kernel::DebugProps::Metadata::mReservedFirstVGPR.
+constexpr char ReservedFirstVGPR[] = "ReservedFirstVGPR";
+/// \brief Key for Kernel::DebugProps::Metadata::mPrivateSegmentBufferSGPR.
+constexpr char PrivateSegmentBufferSGPR[] = "PrivateSegmentBufferSGPR";
+/// \brief Key for
+///     Kernel::DebugProps::Metadata::mWavefrontPrivateSegmentOffsetSGPR.
+constexpr char WavefrontPrivateSegmentOffsetSGPR[] =
+    "WavefrontPrivateSegmentOffsetSGPR";
+} // end namespace Key
+
+/// \brief In-memory representation of kernel debug properties metadata.
+struct Metadata final {
+  /// \brief Debugger ABI version. Optional.
+  std::vector<uint32_t> mDebuggerABIVersion = std::vector<uint32_t>();
+  /// \brief Consecutive number of VGPRs reserved for debugger use. Must be 0 if
+  /// mDebuggerABIVersion is not set. Optional.
+  uint16_t mReservedNumVGPRs = 0;
+  /// \brief First fixed VGPR reserved. Must be uint16_t(-1) if
+  /// mDebuggerABIVersion is not set or mReservedNumVGPRs is 0. Optional.
+  uint16_t mReservedFirstVGPR = uint16_t(-1);
+  /// \brief Fixed SGPR of the first of 4 SGPRs used to hold the scratch V# used
+  /// for the entire kernel execution. Must be uint16_t(-1) if
+  /// mDebuggerABIVersion is not set or SGPR not used or not known. Optional.
+  uint16_t mPrivateSegmentBufferSGPR = uint16_t(-1);
+  /// \brief Fixed SGPR used to hold the wave scratch offset for the entire
+  /// kernel execution. Must be uint16_t(-1) if mDebuggerABIVersion is not set
+  /// or SGPR is not used or not known. Optional.
+  uint16_t mWavefrontPrivateSegmentOffsetSGPR = uint16_t(-1);
+
+  /// \brief Default constructor.
+  Metadata() = default;
+
+  /// \returns True if kernel debug properties metadata is empty, false
+  /// otherwise.
+  bool empty() const {
+    return !notEmpty();
+  }
+
+  /// \returns True if kernel debug properties metadata is not empty, false
+  /// otherwise.
+  bool notEmpty() const {
+    return !mDebuggerABIVersion.empty();
+  }
+};
+
+} // end namespace DebugProps
+
+namespace Key {
+/// \brief Key for Kernel::Metadata::mName.
+constexpr char Name[] = "Name";
+/// \brief Key for Kernel::Metadata::mLanguage.
+constexpr char Language[] = "Language";
+/// \brief Key for Kernel::Metadata::mLanguageVersion.
+constexpr char LanguageVersion[] = "LanguageVersion";
+/// \brief Key for Kernel::Metadata::mAttrs.
+constexpr char Attrs[] = "Attrs";
+/// \brief Key for Kernel::Metadata::mArgs.
+constexpr char Args[] = "Args";
+/// \brief Key for Kernel::Metadata::mCodeProps.
+constexpr char CodeProps[] = "CodeProps";
+/// \brief Key for Kernel::Metadata::mDebugProps.
+constexpr char DebugProps[] = "DebugProps";
+} // end namespace Key
+
+/// \brief In-memory representation of kernel metadata.
+struct Metadata final {
+  /// \brief Name. Required.
+  std::string mName = std::string();
+  /// \brief Language. Optional.
+  std::string mLanguage = std::string();
+  /// \brief Language version. Optional.
+  std::vector<uint32_t> mLanguageVersion = std::vector<uint32_t>();
+  /// \brief Attributes metadata. Optional.
+  Attrs::Metadata mAttrs = Attrs::Metadata();
+  /// \brief Arguments metadata. Optional.
+  std::vector<Arg::Metadata> mArgs = std::vector<Arg::Metadata>();
+  /// \brief Code properties metadata. Optional.
+  CodeProps::Metadata mCodeProps = CodeProps::Metadata();
+  /// \brief Debug properties metadata. Optional.
+  DebugProps::Metadata mDebugProps = DebugProps::Metadata();
+
+  /// \brief Default constructor.
+  Metadata() = default;
+};
+
+} // end namespace Kernel
+
+namespace Key {
+/// \brief Key for CodeObject::Metadata::mVersion.
+constexpr char Version[] = "Version";
+/// \brief Key for CodeObject::Metadata::mPrintf.
+constexpr char Printf[] = "Printf";
+/// \brief Key for CodeObject::Metadata::mKernels.
+constexpr char Kernels[] = "Kernels";
+} // end namespace Key
+
+/// \brief In-memory representation of code object metadata.
+struct Metadata final {
+  /// \brief Code object metadata version. Required.
+  std::vector<uint32_t> mVersion = std::vector<uint32_t>();
+  /// \brief Printf metadata. Optional.
+  std::vector<std::string> mPrintf = std::vector<std::string>();
+  /// \brief Kernels metadata. Optional.
+  std::vector<Kernel::Metadata> mKernels = std::vector<Kernel::Metadata>();
+
+  /// \brief Default constructor.
+  Metadata() = default;
+
+  /// \brief Converts \p YamlString to \p CodeObjectMetadata.
+  static std::error_code fromYamlString(std::string YamlString,
+                                        Metadata &CodeObjectMetadata);
+
+  /// \brief Converts \p CodeObjectMetadata to \p YamlString.
+  static std::error_code toYamlString(Metadata CodeObjectMetadata,
+                                      std::string &YamlString);
+};
+
+} // end namespace CodeObject
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATA_H
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..59571a48a962b4d7ca5aac762a78ebb944de306c
--- /dev/null
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.cpp
@@ -0,0 +1,620 @@
+//===--- AMDGPUCodeObjectMetadataStreamer.cpp -------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Code Object Metadata Streamer.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUCodeObjectMetadataStreamer.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/YAMLTraits.h"
+
+using namespace llvm::AMDGPU;
+using namespace llvm::AMDGPU::CodeObject;
+
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(std::string)
+LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Arg::Metadata)
+LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Metadata)
+
+namespace llvm {
+
+static cl::opt<bool> DumpCodeObjectMetadata(
+    "amdgpu-dump-comd",
+    cl::desc("Dump AMDGPU Code Object Metadata"));
+static cl::opt<bool> VerifyCodeObjectMetadata(
+    "amdgpu-verify-comd",
+    cl::desc("Verify AMDGPU Code Object Metadata"));
+
+namespace yaml {
+
+template <>
+struct ScalarEnumerationTraits<AccessQualifier> {
+  static void enumeration(IO &YIO, AccessQualifier &EN) {
+    YIO.enumCase(EN, "Default", AccessQualifier::Default);
+    YIO.enumCase(EN, "ReadOnly", AccessQualifier::ReadOnly);
+    YIO.enumCase(EN, "WriteOnly", AccessQualifier::WriteOnly);
+    YIO.enumCase(EN, "ReadWrite", AccessQualifier::ReadWrite);
+  }
+};
+
+template <>
+struct ScalarEnumerationTraits<AddressSpaceQualifier> {
+  static void enumeration(IO &YIO, AddressSpaceQualifier &EN) {
+    YIO.enumCase(EN, "Private", AddressSpaceQualifier::Private);
+    YIO.enumCase(EN, "Global", AddressSpaceQualifier::Global);
+    YIO.enumCase(EN, "Constant", AddressSpaceQualifier::Constant);
+    YIO.enumCase(EN, "Local", AddressSpaceQualifier::Local);
+    YIO.enumCase(EN, "Generic", AddressSpaceQualifier::Generic);
+    YIO.enumCase(EN, "Region", AddressSpaceQualifier::Region);
+  }
+};
+
+template <>
+struct ScalarEnumerationTraits<ValueKind> {
+  static void enumeration(IO &YIO, ValueKind &EN) {
+    YIO.enumCase(EN, "ByValue", ValueKind::ByValue);
+    YIO.enumCase(EN, "GlobalBuffer", ValueKind::GlobalBuffer);
+    YIO.enumCase(EN, "DynamicSharedPointer", ValueKind::DynamicSharedPointer);
+    YIO.enumCase(EN, "Sampler", ValueKind::Sampler);
+    YIO.enumCase(EN, "Image", ValueKind::Image);
+    YIO.enumCase(EN, "Pipe", ValueKind::Pipe);
+    YIO.enumCase(EN, "Queue", ValueKind::Queue);
+    YIO.enumCase(EN, "HiddenGlobalOffsetX", ValueKind::HiddenGlobalOffsetX);
+    YIO.enumCase(EN, "HiddenGlobalOffsetY", ValueKind::HiddenGlobalOffsetY);
+    YIO.enumCase(EN, "HiddenGlobalOffsetZ", ValueKind::HiddenGlobalOffsetZ);
+    YIO.enumCase(EN, "HiddenNone", ValueKind::HiddenNone);
+    YIO.enumCase(EN, "HiddenPrintfBuffer", ValueKind::HiddenPrintfBuffer);
+    YIO.enumCase(EN, "HiddenDefaultQueue", ValueKind::HiddenDefaultQueue);
+    YIO.enumCase(EN, "HiddenCompletionAction",
+                 ValueKind::HiddenCompletionAction);
+  }
+};
+
+template <>
+struct ScalarEnumerationTraits<ValueType> {
+  static void enumeration(IO &YIO, ValueType &EN) {
+    YIO.enumCase(EN, "Struct", ValueType::Struct);
+    YIO.enumCase(EN, "I8", ValueType::I8);
+    YIO.enumCase(EN, "U8", ValueType::U8);
+    YIO.enumCase(EN, "I16", ValueType::I16);
+    YIO.enumCase(EN, "U16", ValueType::U16);
+    YIO.enumCase(EN, "F16", ValueType::F16);
+    YIO.enumCase(EN, "I32", ValueType::I32);
+    YIO.enumCase(EN, "U32", ValueType::U32);
+    YIO.enumCase(EN, "F32", ValueType::F32);
+    YIO.enumCase(EN, "I64", ValueType::I64);
+    YIO.enumCase(EN, "U64", ValueType::U64);
+    YIO.enumCase(EN, "F64", ValueType::F64);
+  }
+};
+
+template <>
+struct MappingTraits<Kernel::Attrs::Metadata> {
+  static void mapping(IO &YIO, Kernel::Attrs::Metadata &MD) {
+    YIO.mapOptional(Kernel::Attrs::Key::ReqdWorkGroupSize,
+                    MD.mReqdWorkGroupSize, std::vector<uint32_t>());
+    YIO.mapOptional(Kernel::Attrs::Key::WorkGroupSizeHint,
+                    MD.mWorkGroupSizeHint, std::vector<uint32_t>());
+    YIO.mapOptional(Kernel::Attrs::Key::VecTypeHint,
+                    MD.mVecTypeHint, std::string());
+  }
+};
+
+template <>
+struct MappingTraits<Kernel::Arg::Metadata> {
+  static void mapping(IO &YIO, Kernel::Arg::Metadata &MD) {
+    YIO.mapRequired(Kernel::Arg::Key::Size, MD.mSize);
+    YIO.mapRequired(Kernel::Arg::Key::Align, MD.mAlign);
+    YIO.mapRequired(Kernel::Arg::Key::ValueKind, MD.mValueKind);
+    YIO.mapRequired(Kernel::Arg::Key::ValueType, MD.mValueType);
+    YIO.mapOptional(Kernel::Arg::Key::PointeeAlign, MD.mPointeeAlign,
+                    uint32_t(0));
+    YIO.mapOptional(Kernel::Arg::Key::AccQual, MD.mAccQual,
+                    AccessQualifier::Unknown);
+    YIO.mapOptional(Kernel::Arg::Key::AddrSpaceQual, MD.mAddrSpaceQual,
+                    AddressSpaceQualifier::Unknown);
+    YIO.mapOptional(Kernel::Arg::Key::IsConst, MD.mIsConst, false);
+    YIO.mapOptional(Kernel::Arg::Key::IsPipe, MD.mIsPipe, false);
+    YIO.mapOptional(Kernel::Arg::Key::IsRestrict, MD.mIsRestrict, false);
+    YIO.mapOptional(Kernel::Arg::Key::IsVolatile, MD.mIsVolatile, false);
+    YIO.mapOptional(Kernel::Arg::Key::Name, MD.mName, std::string());
+    YIO.mapOptional(Kernel::Arg::Key::TypeName, MD.mTypeName, std::string());
+  }
+};
+
+template <>
+struct MappingTraits<Kernel::CodeProps::Metadata> {
+  static void mapping(IO &YIO, Kernel::CodeProps::Metadata &MD) {
+    YIO.mapOptional(Kernel::CodeProps::Key::KernargSegmentSize,
+                    MD.mKernargSegmentSize, uint64_t(0));
+    YIO.mapOptional(Kernel::CodeProps::Key::WorkgroupGroupSegmentSize,
+                    MD.mWorkgroupGroupSegmentSize, uint32_t(0));
+    YIO.mapOptional(Kernel::CodeProps::Key::WorkitemPrivateSegmentSize,
+                    MD.mWorkitemPrivateSegmentSize, uint32_t(0));
+    YIO.mapOptional(Kernel::CodeProps::Key::WavefrontNumSGPRs,
+                    MD.mWavefrontNumSGPRs, uint16_t(0));
+    YIO.mapOptional(Kernel::CodeProps::Key::WorkitemNumVGPRs,
+                    MD.mWorkitemNumVGPRs, uint16_t(0));
+    YIO.mapOptional(Kernel::CodeProps::Key::KernargSegmentAlign,
+                    MD.mKernargSegmentAlign, uint8_t(0));
+    YIO.mapOptional(Kernel::CodeProps::Key::GroupSegmentAlign,
+                    MD.mGroupSegmentAlign, uint8_t(0));
+    YIO.mapOptional(Kernel::CodeProps::Key::PrivateSegmentAlign,
+                    MD.mPrivateSegmentAlign, uint8_t(0));
+    YIO.mapOptional(Kernel::CodeProps::Key::WavefrontSize,
+                    MD.mWavefrontSize, uint8_t(0));
+  }
+};
+
+template <>
+struct MappingTraits<Kernel::DebugProps::Metadata> {
+  static void mapping(IO &YIO, Kernel::DebugProps::Metadata &MD) {
+    YIO.mapOptional(Kernel::DebugProps::Key::DebuggerABIVersion,
+                    MD.mDebuggerABIVersion, std::vector<uint32_t>());
+    YIO.mapOptional(Kernel::DebugProps::Key::ReservedNumVGPRs,
+                    MD.mReservedNumVGPRs, uint16_t(0));
+    YIO.mapOptional(Kernel::DebugProps::Key::ReservedFirstVGPR,
+                    MD.mReservedFirstVGPR, uint16_t(-1));
+    YIO.mapOptional(Kernel::DebugProps::Key::PrivateSegmentBufferSGPR,
+                    MD.mPrivateSegmentBufferSGPR, uint16_t(-1));
+    YIO.mapOptional(Kernel::DebugProps::Key::WavefrontPrivateSegmentOffsetSGPR,
+                    MD.mWavefrontPrivateSegmentOffsetSGPR, uint16_t(-1));
+  }
+};
+
+template <>
+struct MappingTraits<Kernel::Metadata> {
+  static void mapping(IO &YIO, Kernel::Metadata &MD) {
+    YIO.mapRequired(Kernel::Key::Name, MD.mName);
+    YIO.mapOptional(Kernel::Key::Language, MD.mLanguage, std::string());
+    YIO.mapOptional(Kernel::Key::LanguageVersion, MD.mLanguageVersion,
+                    std::vector<uint32_t>());
+    if (!MD.mAttrs.empty() || !YIO.outputting())
+      YIO.mapOptional(Kernel::Key::Attrs, MD.mAttrs);
+    if (!MD.mArgs.empty() || !YIO.outputting())
+      YIO.mapOptional(Kernel::Key::Args, MD.mArgs);
+    if (!MD.mCodeProps.empty() || !YIO.outputting())
+      YIO.mapOptional(Kernel::Key::CodeProps, MD.mCodeProps);
+    if (!MD.mDebugProps.empty() || !YIO.outputting())
+      YIO.mapOptional(Kernel::Key::DebugProps, MD.mDebugProps);
+  }
+};
+
+template <>
+struct MappingTraits<CodeObject::Metadata> {
+  static void mapping(IO &YIO, CodeObject::Metadata &MD) {
+    YIO.mapRequired(Key::Version, MD.mVersion);
+    YIO.mapOptional(Key::Printf, MD.mPrintf, std::vector<std::string>());
+    if (!MD.mKernels.empty() || !YIO.outputting())
+      YIO.mapOptional(Key::Kernels, MD.mKernels);
+  }
+};
+
+} // end namespace yaml
+
+namespace AMDGPU {
+
+/* static */
+std::error_code CodeObject::Metadata::fromYamlString(
+    std::string YamlString, CodeObject::Metadata &CodeObjectMetadata) {
+  yaml::Input YamlInput(YamlString);
+  YamlInput >> CodeObjectMetadata;
+  return YamlInput.error();
+}
+
+/* static */
+std::error_code CodeObject::Metadata::toYamlString(
+    CodeObject::Metadata CodeObjectMetadata, std::string &YamlString) {
+  raw_string_ostream YamlStream(YamlString);
+  yaml::Output YamlOutput(YamlStream, nullptr, std::numeric_limits<int>::max());
+  YamlOutput << CodeObjectMetadata;
+  return std::error_code();
+}
+
+namespace CodeObject {
+
+void MetadataStreamer::dump(StringRef YamlString) const {
+  errs() << "AMDGPU Code Object Metadata:\n" << YamlString << '\n';
+}
+
+void MetadataStreamer::verify(StringRef YamlString) const {
+  // Round-trip check: parse the YAML, print it again, and compare the text.
+  errs() << "AMDGPU Code Object Metadata Parser Test: ";
+
+  CodeObject::Metadata Parsed;
+  std::string Reprinted;
+  if (Metadata::fromYamlString(YamlString, Parsed) ||
+      Metadata::toYamlString(Parsed, Reprinted)) {
+    // Either direction of the conversion reported an error.
+    errs() << "FAIL\n";
+    return;
+  }
+
+  const bool RoundTrips = YamlString == Reprinted;
+  errs() << (RoundTrips ? "PASS" : "FAIL") << '\n';
+  if (RoundTrips)
+    return;
+
+  errs() << "Original input: " << YamlString << '\n'
+         << "Produced output: " << Reprinted << '\n';
+}
+
+AccessQualifier MetadataStreamer::getAccessQualifier(StringRef AccQual) const {
+  // Empty input means Unknown; any unrecognized text means Default.
+  if (!AccQual.empty())
+    return StringSwitch<AccessQualifier>(AccQual)
+               .Case("read_only",  AccessQualifier::ReadOnly)
+               .Case("write_only", AccessQualifier::WriteOnly)
+               .Case("read_write", AccessQualifier::ReadWrite)
+               .Default(AccessQualifier::Default);
+  return AccessQualifier::Unknown;
+}
+
+AddressSpaceQualifier MetadataStreamer::getAddressSpaceQualifer(
+    unsigned AddressSpace) const {
+  if (AddressSpace == AMDGPUASI.PRIVATE_ADDRESS)
+    return AddressSpaceQualifier::Private;
+  if (AddressSpace == AMDGPUASI.GLOBAL_ADDRESS)
+    return AddressSpaceQualifier::Global;
+  if (AddressSpace == AMDGPUASI.CONSTANT_ADDRESS)
+    return AddressSpaceQualifier::Constant;
+  if (AddressSpace == AMDGPUASI.LOCAL_ADDRESS)
+    return AddressSpaceQualifier::Local;
+  if (AddressSpace == AMDGPUASI.FLAT_ADDRESS)
+    return AddressSpaceQualifier::Generic;
+  if (AddressSpace == AMDGPUASI.REGION_ADDRESS)
+    return AddressSpaceQualifier::Region;
+
+  llvm_unreachable("Unknown address space qualifier");
+}
+
+/// Classifies a kernel argument: "pipe" in \p TypeQual wins, then a known
+/// \p BaseTypeName, else the pointer address space of \p Ty (or ByValue).
+ValueKind MetadataStreamer::getValueKind(Type *Ty, StringRef TypeQual,
+                                         StringRef BaseTypeName) const {
+  if (TypeQual.find("pipe") != StringRef::npos)
+    return ValueKind::Pipe;
+  // One .Case per image type; adjacent literals without a comma concatenate.
+  return StringSwitch<ValueKind>(BaseTypeName)
+             .Case("sampler_t", ValueKind::Sampler)
+             .Case("queue_t", ValueKind::Queue)
+             .Case("image1d_t", ValueKind::Image)
+             .Case("image1d_array_t", ValueKind::Image)
+             .Case("image1d_buffer_t", ValueKind::Image)
+             .Case("image2d_t", ValueKind::Image)
+             .Case("image2d_array_t", ValueKind::Image)
+             .Case("image2d_array_depth_t", ValueKind::Image)
+             .Case("image2d_array_msaa_t", ValueKind::Image)
+             .Case("image2d_array_msaa_depth_t", ValueKind::Image)
+             .Case("image2d_depth_t", ValueKind::Image)
+             .Case("image2d_msaa_t", ValueKind::Image)
+             .Case("image2d_msaa_depth_t", ValueKind::Image)
+             .Case("image3d_t", ValueKind::Image)
+             .Default(!isa<PointerType>(Ty) ? ValueKind::ByValue :
+                      (Ty->getPointerAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ?
+                           ValueKind::DynamicSharedPointer :
+                           ValueKind::GlobalBuffer));
+}
+
+ValueType MetadataStreamer::getValueType(Type *Ty, StringRef TypeName) const {
+  switch (Ty->getTypeID()) {
+  case Type::IntegerTyID: {
+    auto Signed = !TypeName.startswith("u");
+    switch (Ty->getIntegerBitWidth()) {
+    case 8:
+      return Signed ? ValueType::I8 : ValueType::U8;
+    case 16:
+      return Signed ? ValueType::I16 : ValueType::U16;
+    case 32:
+      return Signed ? ValueType::I32 : ValueType::U32;
+    case 64:
+      return Signed ? ValueType::I64 : ValueType::U64;
+    default:
+      return ValueType::Struct;
+    }
+  }
+  case Type::HalfTyID:
+    return ValueType::F16;
+  case Type::FloatTyID:
+    return ValueType::F32;
+  case Type::DoubleTyID:
+    return ValueType::F64;
+  case Type::PointerTyID:
+    return getValueType(Ty->getPointerElementType(), TypeName);
+  case Type::VectorTyID:
+    return getValueType(Ty->getVectorElementType(), TypeName);
+  default:
+    return ValueType::Struct;
+  }
+}
+
+std::string MetadataStreamer::getTypeName(Type *Ty, bool Signed) const {
+  switch (Ty->getTypeID()) {
+  case Type::IntegerTyID: {
+    if (!Signed)
+      return (Twine('u') + getTypeName(Ty, true)).str();
+
+    auto BitWidth = Ty->getIntegerBitWidth();
+    switch (BitWidth) {
+    case 8:
+      return "char";
+    case 16:
+      return "short";
+    case 32:
+      return "int";
+    case 64:
+      return "long";
+    default:
+      return (Twine('i') + Twine(BitWidth)).str();
+    }
+  }
+  case Type::HalfTyID:
+    return "half";
+  case Type::FloatTyID:
+    return "float";
+  case Type::DoubleTyID:
+    return "double";
+  case Type::VectorTyID: {
+    auto VecTy = cast<VectorType>(Ty);
+    auto ElTy = VecTy->getElementType();
+    auto NumElements = VecTy->getVectorNumElements();
+    return (Twine(getTypeName(ElTy, Signed)) + Twine(NumElements)).str();
+  }
+  default:
+    return "unknown";
+  }
+}
+
+std::vector<uint32_t> MetadataStreamer::getWorkGroupDimensions(
+    MDNode *Node) const {
+  // Only a three-operand node yields sizes; anything else gives an empty list.
+  std::vector<uint32_t> Sizes;
+  if (Node->getNumOperands() == 3) {
+    for (auto &Operand : Node->operands())
+      Sizes.push_back(mdconst::extract<ConstantInt>(Operand)->getZExtValue());
+  }
+  return Sizes;
+}
+
+void MetadataStreamer::emitVersion() {
+  // The version is recorded as two entries: major first, then minor.
+  auto &Ver = CodeObjectMetadata.mVersion;
+  Ver.push_back(MetadataVersionMajor);
+  Ver.push_back(MetadataVersionMinor);
+}
+
+void MetadataStreamer::emitPrintf(const Module &Mod) {
+  // Without the "llvm.printf.fmts" named metadata there is nothing to record.
+  auto Formats = Mod.getNamedMetadata("llvm.printf.fmts");
+  if (!Formats)
+    return;
+
+  for (auto Entry : Formats->operands())
+    if (Entry->getNumOperands())
+      CodeObjectMetadata.mPrintf.push_back(
+          cast<MDString>(Entry->getOperand(0))->getString());
+}
+
+void MetadataStreamer::emitKernelLanguage(const Function &Func) {
+  auto &Kernel = CodeObjectMetadata.mKernels.back();
+
+  // TODO: What about other languages?
+  auto Node = Func.getParent()->getNamedMetadata("opencl.ocl.version");
+  if (!Node || !Node->getNumOperands())
+    return;
+  auto Op0 = Node->getOperand(0);
+  if (Op0->getNumOperands() <= 1)
+    return;
+
+  Kernel.mLanguage = "OpenCL C";
+  Kernel.mLanguageVersion.push_back(
+      mdconst::extract<ConstantInt>(Op0->getOperand(0))->getZExtValue());
+  Kernel.mLanguageVersion.push_back(
+      mdconst::extract<ConstantInt>(Op0->getOperand(1))->getZExtValue());
+}
+
+void MetadataStreamer::emitKernelAttrs(const Function &Func) {
+  auto &Attrs = CodeObjectMetadata.mKernels.back().mAttrs;
+
+  if (auto Node = Func.getMetadata("reqd_work_group_size"))
+    Attrs.mReqdWorkGroupSize = getWorkGroupDimensions(Node);
+  if (auto Node = Func.getMetadata("work_group_size_hint"))
+    Attrs.mWorkGroupSizeHint = getWorkGroupDimensions(Node);
+  if (auto Node = Func.getMetadata("vec_type_hint")) {
+    Attrs.mVecTypeHint = getTypeName(
+        cast<ValueAsMetadata>(Node->getOperand(0))->getType(),
+        mdconst::extract<ConstantInt>(Node->getOperand(1))->getZExtValue());
+  }
+}
+
+void MetadataStreamer::emitKernelArgs(const Function &Func) {
+  for (auto &Arg : Func.args())
+    emitKernelArg(Arg);
+
+  // TODO: What about other languages?
+  if (!Func.getParent()->getNamedMetadata("opencl.ocl.version"))
+    return;
+
+  auto &DL = Func.getParent()->getDataLayout();
+  auto Int64Ty = Type::getInt64Ty(Func.getContext());
+
+  emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetX);
+  emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetY);
+  emitKernelArg(DL, Int64Ty, ValueKind::HiddenGlobalOffsetZ);
+
+  if (!Func.getParent()->getNamedMetadata("llvm.printf.fmts"))
+    return;
+
+  auto Int8PtrTy = Type::getInt8PtrTy(Func.getContext(),
+                                      AMDGPUASI.GLOBAL_ADDRESS);
+  emitKernelArg(DL, Int8PtrTy, ValueKind::HiddenPrintfBuffer);
+}
+
+void MetadataStreamer::emitKernelArg(const Argument &Arg) {
+  auto Func = Arg.getParent();
+  auto ArgNo = Arg.getArgNo();
+  const MDNode *Node;
+
+  StringRef TypeQual;
+  Node = Func->getMetadata("kernel_arg_type_qual");
+  if (Node && ArgNo < Node->getNumOperands())
+    TypeQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+  StringRef BaseTypeName;
+  Node = Func->getMetadata("kernel_arg_base_type");
+  if (Node && ArgNo < Node->getNumOperands())
+    BaseTypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+  StringRef AccQual;
+  Node = Func->getMetadata("kernel_arg_access_qual");
+  if (Node && ArgNo < Node->getNumOperands())
+    AccQual = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+  StringRef Name;
+  Node = Func->getMetadata("kernel_arg_name");
+  if (Node && ArgNo < Node->getNumOperands())
+    Name = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+  StringRef TypeName;
+  Node = Func->getMetadata("kernel_arg_type");
+  if (Node && ArgNo < Node->getNumOperands())
+    TypeName = cast<MDString>(Node->getOperand(ArgNo))->getString();
+
+  emitKernelArg(Func->getParent()->getDataLayout(), Arg.getType(),
+                getValueKind(Arg.getType(), TypeQual, BaseTypeName), TypeQual,
+                BaseTypeName, AccQual, Name, TypeName);
+}
+
+void MetadataStreamer::emitKernelArg(const DataLayout &DL, Type *Ty,
+                                     ValueKind ValueKind, StringRef TypeQual,
+                                     StringRef BaseTypeName, StringRef AccQual,
+                                     StringRef Name, StringRef TypeName) {
+  // Append a fresh entry to the current (last) kernel and fill it in place.
+  auto &Args = CodeObjectMetadata.mKernels.back().mArgs;
+  Args.push_back(Kernel::Arg::Metadata());
+  auto &Arg = Args.back();
+
+  Arg.mSize = DL.getTypeAllocSize(Ty);
+  Arg.mAlign = DL.getABITypeAlignment(Ty);
+  Arg.mValueKind = ValueKind;
+  Arg.mValueType = getValueType(Ty, BaseTypeName);
+  Arg.mAccQual = getAccessQualifier(AccQual);
+  Arg.mName = Name;
+  Arg.mTypeName = TypeName;
+
+  // Pointers also record address space and, if local and sized, pointee align.
+  if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
+    Arg.mAddrSpaceQual = getAddressSpaceQualifer(PtrTy->getAddressSpace());
+    auto ElTy = PtrTy->getElementType();
+    if (PtrTy->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS && ElTy->isSized())
+      Arg.mPointeeAlign = DL.getABITypeAlignment(ElTy);
+  }
+
+  // Each space-separated word of TypeQual switches on the matching flag.
+  SmallVector<StringRef, 1> Quals;
+  TypeQual.split(Quals, " ", -1, false);
+  for (StringRef Qual : Quals) {
+    bool *Flag = StringSwitch<bool *>(Qual)
+                     .Case("const",    &Arg.mIsConst)
+                     .Case("pipe",     &Arg.mIsPipe)
+                     .Case("restrict", &Arg.mIsRestrict)
+                     .Case("volatile", &Arg.mIsVolatile)
+                     .Default(nullptr);
+    if (Flag)
+      *Flag = true;
+  }
+}
+
+void MetadataStreamer::emitKernelCodeProps(
+    const amd_kernel_code_t &KernelCode) {
+  auto &CodeProps = CodeObjectMetadata.mKernels.back().mCodeProps;
+
+  CodeProps.mKernargSegmentSize = KernelCode.kernarg_segment_byte_size;
+  CodeProps.mWorkgroupGroupSegmentSize =
+      KernelCode.workgroup_group_segment_byte_size;
+  CodeProps.mWorkitemPrivateSegmentSize =
+      KernelCode.workitem_private_segment_byte_size;
+  CodeProps.mWavefrontNumSGPRs = KernelCode.wavefront_sgpr_count;
+  CodeProps.mWorkitemNumVGPRs = KernelCode.workitem_vgpr_count;
+  CodeProps.mKernargSegmentAlign = KernelCode.kernarg_segment_alignment;
+  CodeProps.mGroupSegmentAlign = KernelCode.group_segment_alignment;
+  CodeProps.mPrivateSegmentAlign = KernelCode.private_segment_alignment;
+  CodeProps.mWavefrontSize = KernelCode.wavefront_size;
+}
+
+void MetadataStreamer::emitKernelDebugProps(
+    const amd_kernel_code_t &KernelCode) {
+  if (!(KernelCode.code_properties & AMD_CODE_PROPERTY_IS_DEBUG_SUPPORTED))
+    return;
+
+  auto &DebugProps = CodeObjectMetadata.mKernels.back().mDebugProps;
+
+  // FIXME: Need to pass down debugger ABI version through features. This is ok
+  // for now because we only have one version.
+  DebugProps.mDebuggerABIVersion.push_back(1);
+  DebugProps.mDebuggerABIVersion.push_back(0);
+  DebugProps.mReservedNumVGPRs = KernelCode.reserved_vgpr_count;
+  DebugProps.mReservedFirstVGPR = KernelCode.reserved_vgpr_first;
+  DebugProps.mPrivateSegmentBufferSGPR =
+      KernelCode.debug_private_segment_buffer_sgpr;
+  DebugProps.mWavefrontPrivateSegmentOffsetSGPR =
+      KernelCode.debug_wavefront_private_segment_offset_sgpr;
+}
+
+void MetadataStreamer::begin(const Module &Mod) {
+  AMDGPUASI = getAMDGPUAS(Mod);
+  emitVersion();
+  emitPrintf(Mod);
+}
+
+void MetadataStreamer::emitKernel(const Function &Func,
+                                  const amd_kernel_code_t &KernelCode) {
+  if (Func.getCallingConv() != CallingConv::AMDGPU_KERNEL)
+    return;
+
+  CodeObjectMetadata.mKernels.push_back(Kernel::Metadata());
+  auto &Kernel = CodeObjectMetadata.mKernels.back();
+
+  Kernel.mName = Func.getName();
+  emitKernelLanguage(Func);
+  emitKernelAttrs(Func);
+  emitKernelArgs(Func);
+  emitKernelCodeProps(KernelCode);
+  emitKernelDebugProps(KernelCode);
+}
+
+ErrorOr<std::string> MetadataStreamer::toYamlString() {
+  // Serialize first; the dump and verify steps below are optional diagnostics
+  // enabled by -amdgpu-dump-comd and -amdgpu-verify-comd respectively.
+  std::string Yaml;
+  if (auto EC = Metadata::toYamlString(CodeObjectMetadata, Yaml))
+    return EC;
+  if (DumpCodeObjectMetadata)
+    dump(Yaml);
+  if (VerifyCodeObjectMetadata)
+    verify(Yaml);
+  return Yaml;
+}
+
+ErrorOr<std::string> MetadataStreamer::toYamlString(StringRef YamlString) {
+  if (auto Error = Metadata::fromYamlString(YamlString, CodeObjectMetadata))
+    return Error;
+
+  return toYamlString();
+}
+
+} // end namespace CodeObject
+} // end namespace AMDGPU
+} // end namespace llvm
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d4c51763f63d8387e3f23cd0fec9082288d69de
--- /dev/null
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUCodeObjectMetadataStreamer.h
@@ -0,0 +1,99 @@
+//===--- AMDGPUCodeObjectMetadataStreamer.h ---------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief AMDGPU Code Object Metadata Streamer.
+///
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H
+#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H
+
+#include "AMDGPU.h"
+#include "AMDGPUCodeObjectMetadata.h"
+#include "AMDKernelCodeT.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/ErrorOr.h"
+
+namespace llvm {
+
+class Argument;
+class DataLayout;
+class Function;
+class MDNode;
+class Module;
+class Type;
+
+namespace AMDGPU {
+namespace CodeObject {
+
+class MetadataStreamer final {
+private:
+  Metadata CodeObjectMetadata;
+  AMDGPUAS AMDGPUASI;
+  /// Prints \p YamlString to errs() under a fixed heading.
+  void dump(StringRef YamlString) const;
+  /// Parses \p YamlString, prints it back, and reports PASS/FAIL on errs().
+  void verify(StringRef YamlString) const;
+  /// Maps a qualifier string; empty is Unknown, unrecognized is Default.
+  AccessQualifier getAccessQualifier(StringRef AccQual) const;
+  /// Maps \p AddressSpace to a qualifier by comparing against AMDGPUASI.
+  AddressSpaceQualifier getAddressSpaceQualifer(unsigned AddressSpace) const;
+  /// Picks the value kind from \p TypeQual, \p BaseTypeName and \p Ty.
+  ValueKind getValueKind(Type *Ty, StringRef TypeQual,
+                         StringRef BaseTypeName) const;
+  /// Value type of \p Ty; a 'u' prefix in \p TypeName selects unsigned.
+  ValueType getValueType(Type *Ty, StringRef TypeName) const;
+  /// Builds a type name for \p Ty such as "uint4"; "unknown" if unsupported.
+  std::string getTypeName(Type *Ty, bool Signed) const;
+  /// Returns the operands of \p Node as integers, or empty unless there are 3.
+  std::vector<uint32_t> getWorkGroupDimensions(MDNode *Node) const;
+  /// Appends the metadata major and minor version to mVersion.
+  void emitVersion();
+  /// Copies format strings from "llvm.printf.fmts" metadata into mPrintf.
+  void emitPrintf(const Module &Mod);
+  /// Sets language and version of the last kernel from "opencl.ocl.version".
+  void emitKernelLanguage(const Function &Func);
+  /// Records reqd_work_group_size, work_group_size_hint and vec_type_hint.
+  void emitKernelAttrs(const Function &Func);
+  /// Emits each argument, then hidden ones if "opencl.ocl.version" exists.
+  void emitKernelArgs(const Function &Func);
+  /// Looks up the kernel_arg_* metadata for \p Arg and emits the argument.
+  void emitKernelArg(const Argument &Arg);
+  /// Appends one argument entry to the last kernel in CodeObjectMetadata.
+  void emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind,
+                     StringRef TypeQual = "", StringRef BaseTypeName = "",
+                     StringRef AccQual = "", StringRef Name = "",
+                     StringRef TypeName = "");
+  /// Copies code properties (sizes, counts, alignments) from \p KernelCode.
+  void emitKernelCodeProps(const amd_kernel_code_t &KernelCode);
+  /// Fills debug properties if \p KernelCode has the debug-supported bit set.
+  void emitKernelDebugProps(const amd_kernel_code_t &KernelCode);
+
+public:
+  MetadataStreamer() = default;
+  ~MetadataStreamer() = default;
+  /// Sets AMDGPUASI from \p Mod, then emits version and printf metadata.
+  void begin(const Module &Mod);
+  /// Currently does nothing.
+  void end() {}
+  /// Appends metadata for \p Func; does nothing unless it is an AMDGPU_KERNEL.
+  void emitKernel(const Function &Func, const amd_kernel_code_t &KernelCode);
+  /// Serializes the collected metadata to YAML; may also dump and verify it.
+  ErrorOr<std::string> toYamlString();
+  /// Parses \p YamlString into the collected metadata, then serializes it.
+  ErrorOr<std::string> toYamlString(StringRef YamlString);
+};
+
+} // end namespace CodeObject
+} // end namespace AMDGPU
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUCODEOBJECTMETADATASTREAMER_H
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index 1847d7a67328ecd0ef3f6874825054f6572166f5..073d19422e863cef76a5ee503028fb8239ecf5e8 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -1,16 +1,20 @@
-//===-- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------==//
+//===- AMDGPUELFObjectWriter.cpp - AMDGPU ELF Writer ----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
 // This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
-/// \file
 //===----------------------------------------------------------------------===//
 
 #include "AMDGPUMCTargetDesc.h"
 #include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
 
 using namespace llvm;
 
@@ -19,20 +23,21 @@ namespace {
 class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter {
 public:
   AMDGPUELFObjectWriter(bool Is64Bit, bool HasRelocationAddend);
+
 protected:
   unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
                         const MCFixup &Fixup, bool IsPCRel) const override;
 };
 
 
-} // End anonymous namespace
+} // end anonymous namespace
 
 AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit,
                                              bool HasRelocationAddend)
   : MCELFObjectTargetWriter(Is64Bit,
                             ELF::ELFOSABI_AMDGPU_HSA,
                             ELF::EM_AMDGPU,
-                            HasRelocationAddend) { }
+                            HasRelocationAddend) {}
 
 unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
                                              const MCValue &Target,
@@ -77,7 +82,6 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
   llvm_unreachable("unhandled relocation type");
 }
 
-
 MCObjectWriter *llvm::createAMDGPUELFObjectWriter(bool Is64Bit,
                                                   bool HasRelocationAddend,
                                                   raw_pwrite_stream &OS) {
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index 548bad56e174a2be371d284e77b9726a46a0e98a..f80b5f3a6dba2b1abd7861c1162aa7be17f22a83 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -54,11 +54,17 @@ MCObjectWriter *createAMDGPUELFObjectWriter(bool Is64Bit,
 
 #define GET_REGINFO_ENUM
 #include "AMDGPUGenRegisterInfo.inc"
+#undef GET_REGINFO_ENUM
 
 #define GET_INSTRINFO_ENUM
+#define GET_INSTRINFO_OPERAND_ENUM
 #include "AMDGPUGenInstrInfo.inc"
+#undef GET_INSTRINFO_OPERAND_ENUM
+#undef GET_INSTRINFO_ENUM
+
 
 #define GET_SUBTARGETINFO_ENUM
 #include "AMDGPUGenSubtargetInfo.inc"
+#undef GET_SUBTARGETINFO_ENUM
 
 #endif
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp
deleted file mode 100644
index 926a43cf458a7bf6642d8947d5fc562558d3e1fa..0000000000000000000000000000000000000000
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.cpp
+++ /dev/null
@@ -1,417 +0,0 @@
-//===-- AMDGPURuntimeMD.cpp - Generates runtime metadata ------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-///
-/// Generates AMDGPU runtime metadata for YAML mapping.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPU.h"
-#include "AMDGPURuntimeMetadata.h"
-#include "MCTargetDesc/AMDGPURuntimeMD.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/IR/Constants.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DerivedTypes.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-#include "llvm/Support/Casting.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Support/YAMLTraits.h"
-#include <cassert>
-#include <cstdint>
-#include <limits>
-#include <vector>
-
-using namespace llvm;
-using namespace ::AMDGPU::RuntimeMD;
-
-static cl::opt<bool>
-DumpRuntimeMD("amdgpu-dump-rtmd",
-              cl::desc("Dump AMDGPU runtime metadata"));
-
-static cl::opt<bool>
-CheckRuntimeMDParser("amdgpu-check-rtmd-parser", cl::Hidden,
-                     cl::desc("Check AMDGPU runtime metadata YAML parser"));
-
-LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint8_t)
-LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t)
-LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(std::string)
-LLVM_YAML_IS_SEQUENCE_VECTOR(Kernel::Metadata)
-LLVM_YAML_IS_SEQUENCE_VECTOR(KernelArg::Metadata)
-
-namespace llvm {
-namespace yaml {
-
-template <> struct MappingTraits<KernelArg::Metadata> {
-  static void mapping(IO &YamlIO, KernelArg::Metadata &A) {
-    YamlIO.mapRequired(KeyName::ArgSize, A.Size);
-    YamlIO.mapRequired(KeyName::ArgAlign, A.Align);
-    YamlIO.mapOptional(KeyName::ArgPointeeAlign, A.PointeeAlign, 0U);
-    YamlIO.mapRequired(KeyName::ArgKind, A.Kind);
-    YamlIO.mapRequired(KeyName::ArgValueType, A.ValueType);
-    YamlIO.mapOptional(KeyName::ArgTypeName, A.TypeName, std::string());
-    YamlIO.mapOptional(KeyName::ArgName, A.Name, std::string());
-    YamlIO.mapOptional(KeyName::ArgAddrQual, A.AddrQual, INVALID_ADDR_QUAL);
-    YamlIO.mapOptional(KeyName::ArgAccQual, A.AccQual, INVALID_ACC_QUAL);
-    YamlIO.mapOptional(KeyName::ArgIsVolatile, A.IsVolatile, uint8_t(0));
-    YamlIO.mapOptional(KeyName::ArgIsConst, A.IsConst, uint8_t(0));
-    YamlIO.mapOptional(KeyName::ArgIsRestrict, A.IsRestrict, uint8_t(0));
-    YamlIO.mapOptional(KeyName::ArgIsPipe, A.IsPipe, uint8_t(0));
-  }
-  static const bool flow = true;
-};
-
-template <> struct MappingTraits<Kernel::Metadata> {
-  static void mapping(IO &YamlIO, Kernel::Metadata &K) {
-    YamlIO.mapRequired(KeyName::KernelName, K.Name);
-    YamlIO.mapOptional(KeyName::Language, K.Language, std::string());
-    YamlIO.mapOptional(KeyName::LanguageVersion, K.LanguageVersion);
-    YamlIO.mapOptional(KeyName::ReqdWorkGroupSize, K.ReqdWorkGroupSize);
-    YamlIO.mapOptional(KeyName::WorkGroupSizeHint, K.WorkGroupSizeHint);
-    YamlIO.mapOptional(KeyName::VecTypeHint, K.VecTypeHint, std::string());
-    YamlIO.mapOptional(KeyName::KernelIndex, K.KernelIndex,
-        INVALID_KERNEL_INDEX);
-    YamlIO.mapOptional(KeyName::NoPartialWorkGroups, K.NoPartialWorkGroups,
-        uint8_t(0));
-    YamlIO.mapRequired(KeyName::Args, K.Args);
-  }
-  static const bool flow = true;
-};
-
-template <> struct MappingTraits<Program::Metadata> {
-  static void mapping(IO &YamlIO, Program::Metadata &Prog) {
-    YamlIO.mapRequired(KeyName::MDVersion, Prog.MDVersionSeq);
-    YamlIO.mapOptional(KeyName::PrintfInfo, Prog.PrintfInfo);
-    YamlIO.mapOptional(KeyName::Kernels, Prog.Kernels);
-  }
-  static const bool flow = true;
-};
-
-} // end namespace yaml
-} // end namespace llvm
-
-// Get a vector of three integer values from MDNode \p Node;
-static std::vector<uint32_t> getThreeInt32(MDNode *Node) {
-  assert(Node->getNumOperands() == 3);
-  std::vector<uint32_t> V;
-  for (const MDOperand &Op : Node->operands()) {
-    const ConstantInt *CI = mdconst::extract<ConstantInt>(Op);
-    V.push_back(CI->getZExtValue());
-  }
-  return V;
-}
-
-static std::string getOCLTypeName(Type *Ty, bool Signed) {
-  switch (Ty->getTypeID()) {
-  case Type::HalfTyID:
-    return "half";
-  case Type::FloatTyID:
-    return "float";
-  case Type::DoubleTyID:
-    return "double";
-  case Type::IntegerTyID: {
-    if (!Signed)
-      return (Twine('u') + getOCLTypeName(Ty, true)).str();
-    unsigned BW = Ty->getIntegerBitWidth();
-    switch (BW) {
-    case 8:
-      return "char";
-    case 16:
-      return "short";
-    case 32:
-      return "int";
-    case 64:
-      return "long";
-    default:
-      return (Twine('i') + Twine(BW)).str();
-    }
-  }
-  case Type::VectorTyID: {
-    VectorType *VecTy = cast<VectorType>(Ty);
-    Type *EleTy = VecTy->getElementType();
-    unsigned Size = VecTy->getVectorNumElements();
-    return (Twine(getOCLTypeName(EleTy, Signed)) + Twine(Size)).str();
-  }
-  default:
-    return "unknown";
-  }
-}
-
-static KernelArg::ValueType getRuntimeMDValueType(
-  Type *Ty, StringRef TypeName) {
-  switch (Ty->getTypeID()) {
-  case Type::HalfTyID:
-    return KernelArg::F16;
-  case Type::FloatTyID:
-    return KernelArg::F32;
-  case Type::DoubleTyID:
-    return KernelArg::F64;
-  case Type::IntegerTyID: {
-    bool Signed = !TypeName.startswith("u");
-    switch (Ty->getIntegerBitWidth()) {
-    case 8:
-      return Signed ? KernelArg::I8 : KernelArg::U8;
-    case 16:
-      return Signed ? KernelArg::I16 : KernelArg::U16;
-    case 32:
-      return Signed ? KernelArg::I32 : KernelArg::U32;
-    case 64:
-      return Signed ? KernelArg::I64 : KernelArg::U64;
-    default:
-      // Runtime does not recognize other integer types. Report as struct type.
-      return KernelArg::Struct;
-    }
-  }
-  case Type::VectorTyID:
-    return getRuntimeMDValueType(Ty->getVectorElementType(), TypeName);
-  case Type::PointerTyID:
-    return getRuntimeMDValueType(Ty->getPointerElementType(), TypeName);
-  default:
-    return KernelArg::Struct;
-  }
-}
-
-static KernelArg::AddressSpaceQualifer getRuntimeAddrSpace(
-    AMDGPUAS::AddressSpaces A) {
-  switch (A) {
-  case AMDGPUAS::GLOBAL_ADDRESS:
-    return KernelArg::Global;
-  case AMDGPUAS::CONSTANT_ADDRESS:
-    return KernelArg::Constant;
-  case AMDGPUAS::LOCAL_ADDRESS:
-    return KernelArg::Local;
-  case AMDGPUAS::FLAT_ADDRESS:
-    return KernelArg::Generic;
-  case AMDGPUAS::REGION_ADDRESS:
-    return KernelArg::Region;
-  default:
-    return KernelArg::Private;
-  }
-}
-
-static KernelArg::Metadata getRuntimeMDForKernelArg(const DataLayout &DL,
-    Type *T, KernelArg::Kind Kind, StringRef BaseTypeName = "",
-    StringRef TypeName = "", StringRef ArgName = "", StringRef TypeQual = "",
-    StringRef AccQual = "") {
-  KernelArg::Metadata Arg;
-
-  // Set ArgSize and ArgAlign.
-  Arg.Size = DL.getTypeAllocSize(T);
-  Arg.Align = DL.getABITypeAlignment(T);
-  if (auto PT = dyn_cast<PointerType>(T)) {
-    auto ET = PT->getElementType();
-    if (PT->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && ET->isSized())
-      Arg.PointeeAlign = DL.getABITypeAlignment(ET);
-  }
-
-  // Set ArgTypeName.
-  Arg.TypeName = TypeName;
-
-  // Set ArgName.
-  Arg.Name = ArgName;
-
-  // Set ArgIsVolatile, ArgIsRestrict, ArgIsConst and ArgIsPipe.
-  SmallVector<StringRef, 1> SplitQ;
-  TypeQual.split(SplitQ, " ", -1, false /* Drop empty entry */);
-
-  for (StringRef KeyName : SplitQ) {
-    auto *P = StringSwitch<uint8_t *>(KeyName)
-      .Case("volatile", &Arg.IsVolatile)
-      .Case("restrict", &Arg.IsRestrict)
-      .Case("const",    &Arg.IsConst)
-      .Case("pipe",     &Arg.IsPipe)
-      .Default(nullptr);
-    if (P)
-      *P = 1;
-  }
-
-  // Set ArgKind.
-  Arg.Kind = Kind;
-
-  // Set ArgValueType.
-  Arg.ValueType = getRuntimeMDValueType(T, BaseTypeName);
-
-  // Set ArgAccQual.
-  if (!AccQual.empty()) {
-    Arg.AccQual = StringSwitch<KernelArg::AccessQualifer>(AccQual)
-      .Case("read_only",  KernelArg::ReadOnly)
-      .Case("write_only", KernelArg::WriteOnly)
-      .Case("read_write", KernelArg::ReadWrite)
-      .Default(KernelArg::AccNone);
-  }
-
-  // Set ArgAddrQual.
-  if (auto *PT = dyn_cast<PointerType>(T)) {
-    Arg.AddrQual = getRuntimeAddrSpace(static_cast<AMDGPUAS::AddressSpaces>(
-        PT->getAddressSpace()));
-  }
-
-  return Arg;
-}
-
-static Kernel::Metadata getRuntimeMDForKernel(const Function &F) {
-  Kernel::Metadata Kernel;
-  Kernel.Name = F.getName();
-  auto &M = *F.getParent();
-
-  // Set Language and LanguageVersion.
-  if (auto MD = M.getNamedMetadata("opencl.ocl.version")) {
-    if (MD->getNumOperands() != 0) {
-      auto Node = MD->getOperand(0);
-      if (Node->getNumOperands() > 1) {
-        Kernel.Language = "OpenCL C";
-        uint16_t Major = mdconst::extract<ConstantInt>(Node->getOperand(0))
-                         ->getZExtValue();
-        uint16_t Minor = mdconst::extract<ConstantInt>(Node->getOperand(1))
-                         ->getZExtValue();
-        Kernel.LanguageVersion.push_back(Major);
-        Kernel.LanguageVersion.push_back(Minor);
-      }
-    }
-  }
-
-  const DataLayout &DL = F.getParent()->getDataLayout();
-  for (auto &Arg : F.args()) {
-    unsigned I = Arg.getArgNo();
-    Type *T = Arg.getType();
-    auto TypeName = dyn_cast<MDString>(F.getMetadata(
-        "kernel_arg_type")->getOperand(I))->getString();
-    auto BaseTypeName = cast<MDString>(F.getMetadata(
-        "kernel_arg_base_type")->getOperand(I))->getString();
-    StringRef ArgName;
-    if (auto ArgNameMD = F.getMetadata("kernel_arg_name"))
-      ArgName = cast<MDString>(ArgNameMD->getOperand(I))->getString();
-    auto TypeQual = cast<MDString>(F.getMetadata(
-        "kernel_arg_type_qual")->getOperand(I))->getString();
-    auto AccQual = cast<MDString>(F.getMetadata(
-        "kernel_arg_access_qual")->getOperand(I))->getString();
-    KernelArg::Kind Kind;
-    if (TypeQual.find("pipe") != StringRef::npos)
-      Kind = KernelArg::Pipe;
-    else Kind = StringSwitch<KernelArg::Kind>(BaseTypeName)
-      .Case("sampler_t", KernelArg::Sampler)
-      .Case("queue_t",   KernelArg::Queue)
-      .Cases("image1d_t", "image1d_array_t", "image1d_buffer_t",
-             "image2d_t" , "image2d_array_t",  KernelArg::Image)
-      .Cases("image2d_depth_t", "image2d_array_depth_t",
-             "image2d_msaa_t", "image2d_array_msaa_t",
-             "image2d_msaa_depth_t",  KernelArg::Image)
-      .Cases("image2d_array_msaa_depth_t", "image3d_t",
-             KernelArg::Image)
-      .Default(isa<PointerType>(T) ?
-                   (T->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ?
-                   KernelArg::DynamicSharedPointer :
-                   KernelArg::GlobalBuffer) :
-                   KernelArg::ByValue);
-    Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, T, Kind,
-        BaseTypeName, TypeName, ArgName, TypeQual, AccQual));
-  }
-
-  // Emit hidden kernel arguments for OpenCL kernels.
-  if (F.getParent()->getNamedMetadata("opencl.ocl.version")) {
-    auto Int64T = Type::getInt64Ty(F.getContext());
-    Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T,
-        KernelArg::HiddenGlobalOffsetX));
-    Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T,
-        KernelArg::HiddenGlobalOffsetY));
-    Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int64T,
-        KernelArg::HiddenGlobalOffsetZ));
-    if (F.getParent()->getNamedMetadata("llvm.printf.fmts")) {
-      auto Int8PtrT = Type::getInt8PtrTy(F.getContext(),
-          KernelArg::Global);
-      Kernel.Args.emplace_back(getRuntimeMDForKernelArg(DL, Int8PtrT,
-          KernelArg::HiddenPrintfBuffer));
-    }
-  }
-
-  // Set ReqdWorkGroupSize, WorkGroupSizeHint, and VecTypeHint.
-  if (auto RWGS = F.getMetadata("reqd_work_group_size"))
-    Kernel.ReqdWorkGroupSize = getThreeInt32(RWGS);
-
-  if (auto WGSH = F.getMetadata("work_group_size_hint"))
-    Kernel.WorkGroupSizeHint = getThreeInt32(WGSH);
-
-  if (auto VTH = F.getMetadata("vec_type_hint"))
-    Kernel.VecTypeHint = getOCLTypeName(cast<ValueAsMetadata>(
-      VTH->getOperand(0))->getType(), mdconst::extract<ConstantInt>(
-      VTH->getOperand(1))->getZExtValue());
-
-  return Kernel;
-}
-
-Program::Metadata::Metadata(const std::string &YAML) {
-  yaml::Input Input(YAML);
-  Input >> *this;
-}
-
-std::string Program::Metadata::toYAML() {
-  std::string Text;
-  raw_string_ostream Stream(Text);
-  yaml::Output Output(Stream, nullptr,
-                      std::numeric_limits<int>::max() /* do not wrap line */);
-  Output << *this;
-  return Stream.str();
-}
-
-Program::Metadata Program::Metadata::fromYAML(const std::string &S) {
-  return Program::Metadata(S);
-}
-
-// Check if the YAML string can be parsed.
-static void checkRuntimeMDYAMLString(const std::string &YAML) {
-  auto P = Program::Metadata::fromYAML(YAML);
-  auto S = P.toYAML();
-  errs() << "AMDGPU runtime metadata parser test "
-         << (YAML == S ? "passes" : "fails") << ".\n";
-  if (YAML != S) {
-    errs() << "First output: " << YAML << '\n'
-           << "Second output: " << S << '\n';
-  }
-}
-
-std::string llvm::getRuntimeMDYAMLString(Module &M) {
-  Program::Metadata Prog;
-  Prog.MDVersionSeq.push_back(MDVersion);
-  Prog.MDVersionSeq.push_back(MDRevision);
-
-  // Set PrintfInfo.
-  if (auto MD = M.getNamedMetadata("llvm.printf.fmts")) {
-    for (unsigned I = 0; I < MD->getNumOperands(); ++I) {
-      auto Node = MD->getOperand(I);
-      if (Node->getNumOperands() > 0)
-        Prog.PrintfInfo.push_back(cast<MDString>(Node->getOperand(0))
-            ->getString());
-    }
-  }
-
-  // Set Kernels.
-  for (auto &F: M.functions()) {
-    if (!F.getMetadata("kernel_arg_type"))
-      continue;
-    Prog.Kernels.emplace_back(getRuntimeMDForKernel(F));
-  }
-
-  auto YAML = Prog.toYAML();
-
-  if (DumpRuntimeMD)
-    errs() << "AMDGPU runtime metadata:\n" << YAML << '\n';
-
-  if (CheckRuntimeMDParser)
-    checkRuntimeMDYAMLString(YAML);
-
-  return YAML;
-}
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h
deleted file mode 100644
index a92fdd4bebc28fd20fd171f33279b5c22dca06ff..0000000000000000000000000000000000000000
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPURuntimeMD.h
+++ /dev/null
@@ -1,26 +0,0 @@
-//===- AMDGPURuntimeMD.h - Generate runtime metadata ---------------*- C++ -*-//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file declares functions for generating runtime metadata.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMD_H
-#define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPURUNTIMEMD_H
-
-#include <string>
-
-namespace llvm {
-class Module;
-
-// Get runtime metadata as YAML string.
-std::string getRuntimeMDYAMLString(Module &M);
-
-}
-#endif
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index 9be6d5160c6c938235b931c81ab6c0f859641ef0..8dc863f723e2e260ef57f61b888786c124769612 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -27,7 +27,6 @@
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/FormattedStream.h"
-#include "AMDGPURuntimeMD.h"
 
 namespace llvm {
 #include "AMDGPUPTNote.h"
@@ -36,9 +35,27 @@ namespace llvm {
 using namespace llvm;
 using namespace llvm::AMDGPU;
 
+//===----------------------------------------------------------------------===//
+// AMDGPUTargetStreamer
+//===----------------------------------------------------------------------===//
+
 AMDGPUTargetStreamer::AMDGPUTargetStreamer(MCStreamer &S)
     : MCTargetStreamer(S) {}
 
+void AMDGPUTargetStreamer::EmitStartOfCodeObjectMetadata(const Module &Mod) {
+  CodeObjectMetadataStreamer.begin(Mod);
+}
+
+void AMDGPUTargetStreamer::EmitKernelCodeObjectMetadata(
+    const Function &Func, const amd_kernel_code_t &KernelCode) {
+  CodeObjectMetadataStreamer.emitKernel(Func, KernelCode);
+}
+
+void AMDGPUTargetStreamer::EmitEndOfCodeObjectMetadata() {
+  CodeObjectMetadataStreamer.end();
+  EmitCodeObjectMetadata(CodeObjectMetadataStreamer.toYamlString().get());
+}
+
 //===----------------------------------------------------------------------===//
 // AMDGPUTargetAsmStreamer
 //===----------------------------------------------------------------------===//
@@ -93,16 +110,16 @@ void AMDGPUTargetAsmStreamer::EmitAMDGPUHsaProgramScopeGlobal(
   OS << "\t.amdgpu_hsa_program_global " << GlobalName << '\n';
 }
 
-void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(Module &M) {
-  OS << "\t.amdgpu_runtime_metadata\n";
-  OS << getRuntimeMDYAMLString(M);
-  OS << "\n\t.end_amdgpu_runtime_metadata\n";
-}
+bool AMDGPUTargetAsmStreamer::EmitCodeObjectMetadata(StringRef YamlString) {
+  auto VerifiedYamlString = CodeObjectMetadataStreamer.toYamlString(YamlString);
+  if (!VerifiedYamlString)
+    return false;
+
+  OS << '\t' << AMDGPU::CodeObject::MetadataAssemblerDirectiveBegin << '\n';
+  OS << VerifiedYamlString.get();
+  OS << '\t' << AMDGPU::CodeObject::MetadataAssemblerDirectiveEnd << '\n';
 
-void AMDGPUTargetAsmStreamer::EmitRuntimeMetadata(StringRef Metadata) {
-  OS << "\t.amdgpu_runtime_metadata";
-  OS << Metadata;
-  OS << "\t.end_amdgpu_runtime_metadata\n";
+  return true;
 }
 
 //===----------------------------------------------------------------------===//
@@ -117,20 +134,20 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() {
 }
 
 void AMDGPUTargetELFStreamer::EmitAMDGPUNote(
-    const MCExpr *DescSZ, PT_NOTE::NoteType Type,
+    const MCExpr *DescSZ, ElfNote::NoteType Type,
     function_ref<void(MCELFStreamer &)> EmitDesc) {
   auto &S = getStreamer();
   auto &Context = S.getContext();
 
-  auto NameSZ = sizeof(PT_NOTE::NoteName);
+  auto NameSZ = sizeof(ElfNote::NoteName);
 
   S.PushSection();
   S.SwitchSection(Context.getELFSection(
-    PT_NOTE::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC));
+    ElfNote::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC));
   S.EmitIntValue(NameSZ, 4);                                  // namesz
   S.EmitValue(DescSZ, 4);                                     // descz
-  S.EmitIntValue(Type, 4); // type
-  S.EmitBytes(StringRef(PT_NOTE::NoteName, NameSZ));          // name
+  S.EmitIntValue(Type, 4);                                    // type
+  S.EmitBytes(StringRef(ElfNote::NoteName, NameSZ));          // name
   S.EmitValueToAlignment(4, 0, 1, 0);                         // padding 0
   EmitDesc(S);                                                // desc
   S.EmitValueToAlignment(4, 0, 1, 0);                         // padding 0
@@ -143,7 +160,7 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(uint32_t Major,
 
   EmitAMDGPUNote(
     MCConstantExpr::create(8, getContext()),
-    PT_NOTE::NT_AMDGPU_HSA_CODE_OBJECT_VERSION,
+    ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION,
     [&](MCELFStreamer &OS){
       OS.EmitIntValue(Major, 4);
       OS.EmitIntValue(Minor, 4);
@@ -159,14 +176,14 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
                                                        StringRef ArchName) {
   uint16_t VendorNameSize = VendorName.size() + 1;
   uint16_t ArchNameSize = ArchName.size() + 1;
-  
+
   unsigned DescSZ = sizeof(VendorNameSize) + sizeof(ArchNameSize) +
     sizeof(Major) + sizeof(Minor) + sizeof(Stepping) +
     VendorNameSize + ArchNameSize;
 
   EmitAMDGPUNote(
     MCConstantExpr::create(DescSZ, getContext()),
-    PT_NOTE::NT_AMDGPU_HSA_ISA,
+    ElfNote::NT_AMDGPU_HSA_ISA,
     [&](MCELFStreamer &OS) {
       OS.EmitIntValue(VendorNameSize, 2);
       OS.EmitIntValue(ArchNameSize, 2);
@@ -215,7 +232,11 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUHsaProgramScopeGlobal(
   Symbol->setBinding(ELF::STB_GLOBAL);
 }
 
-void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(StringRef Metadata) {
+bool AMDGPUTargetELFStreamer::EmitCodeObjectMetadata(StringRef YamlString) {
+  auto VerifiedYamlString = CodeObjectMetadataStreamer.toYamlString(YamlString);
+  if (!VerifiedYamlString)
+    return false;
+
   // Create two labels to mark the beginning and end of the desc field
   // and a MCExpr to calculate the size of the desc field.
   auto &Context = getContext();
@@ -227,15 +248,13 @@ void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(StringRef Metadata) {
 
   EmitAMDGPUNote(
     DescSZ,
-    PT_NOTE::NT_AMDGPU_HSA_RUNTIME_METADATA,
+    ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_METADATA,
     [&](MCELFStreamer &OS) {
       OS.EmitLabel(DescBegin);
-      OS.EmitBytes(Metadata);
+      OS.EmitBytes(VerifiedYamlString.get());
       OS.EmitLabel(DescEnd);
     }
   );
-}
 
-void AMDGPUTargetELFStreamer::EmitRuntimeMetadata(Module &M) {
-  EmitRuntimeMetadata(getRuntimeMDYAMLString(M));
+  return true;
 }
diff --git a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 6407f5e3ee7a1f5f818002fa639c5e9db1cecb40..5c588bbded9c0b34158ac51f3ea8df7ca89f6d3d 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -10,6 +10,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
 #define LLVM_LIB_TARGET_AMDGPU_MCTARGETDESC_AMDGPUTARGETSTREAMER_H
 
+#include "AMDGPUCodeObjectMetadataStreamer.h"
 #include "AMDKernelCodeT.h"
 #include "llvm/MC/MCStreamer.h"
 
@@ -26,6 +27,7 @@ class Type;
 
 class AMDGPUTargetStreamer : public MCTargetStreamer {
 protected:
+  AMDGPU::CodeObject::MetadataStreamer CodeObjectMetadataStreamer;
   MCContext &getContext() const { return Streamer.getContext(); }
 
 public:
@@ -46,12 +48,18 @@ public:
 
   virtual void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) = 0;
 
-  virtual void EmitRuntimeMetadata(Module &M) = 0;
+  virtual void EmitStartOfCodeObjectMetadata(const Module &Mod);
 
-  virtual void EmitRuntimeMetadata(StringRef Metadata) = 0;
+  virtual void EmitKernelCodeObjectMetadata(
+      const Function &Func, const amd_kernel_code_t &KernelCode);
+
+  virtual void EmitEndOfCodeObjectMetadata();
+
+  /// \returns True on success, false on failure.
+  virtual bool EmitCodeObjectMetadata(StringRef YamlString) = 0;
 };
 
-class AMDGPUTargetAsmStreamer : public AMDGPUTargetStreamer {
+class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer {
   formatted_raw_ostream &OS;
 public:
   AMDGPUTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
@@ -70,15 +78,15 @@ public:
 
   void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override;
 
-  void EmitRuntimeMetadata(Module &M) override;
-
-  void EmitRuntimeMetadata(StringRef Metadata) override;
+  /// \returns True on success, false on failure.
+  bool EmitCodeObjectMetadata(StringRef YamlString) override;
 };
 
-class AMDGPUTargetELFStreamer : public AMDGPUTargetStreamer {
+class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
   MCStreamer &Streamer;
 
-  void EmitAMDGPUNote(const MCExpr *DescSize, AMDGPU::PT_NOTE::NoteType Type,
+  void EmitAMDGPUNote(const MCExpr *DescSize,
+                      AMDGPU::ElfNote::NoteType Type,
                       function_ref<void(MCELFStreamer &)> EmitDesc);
 
 public:
@@ -101,9 +109,8 @@ public:
 
   void EmitAMDGPUHsaProgramScopeGlobal(StringRef GlobalName) override;
 
-  void EmitRuntimeMetadata(Module &M) override;
-
-  void EmitRuntimeMetadata(StringRef Metadata) override;
+  /// \returns True on success, false on failure.
+  bool EmitCodeObjectMetadata(StringRef YamlString) override;
 };
 
 }
diff --git a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
index 8a6d00ce69ed59b210252435ff619b7facabc408..09e3efad10af1aea0d6291abd5b588808a15b35d 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/AMDGPU/MCTargetDesc/CMakeLists.txt
@@ -1,13 +1,12 @@
-
 add_llvm_library(LLVMAMDGPUDesc
   AMDGPUAsmBackend.cpp
+  AMDGPUCodeObjectMetadataStreamer.cpp
   AMDGPUELFObjectWriter.cpp
   AMDGPUELFStreamer.cpp
+  AMDGPUMCAsmInfo.cpp
   AMDGPUMCCodeEmitter.cpp
   AMDGPUMCTargetDesc.cpp
-  AMDGPUMCAsmInfo.cpp
-  AMDGPURuntimeMD.cpp
   AMDGPUTargetStreamer.cpp
   R600MCCodeEmitter.cpp
   SIMCCodeEmitter.cpp
-  )
+)
diff --git a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index 0c5bb0648a1615484abaa3e2ca040cadaf3a2872..bda0928036fdeccea63121580aecf0cb19acd007 100644
--- a/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -220,13 +220,35 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
     Imm = MO.getImm();
   }
 
-  switch (AMDGPU::getOperandSize(OpInfo)) {
-  case 4:
+  switch (OpInfo.OperandType) {
+  case AMDGPU::OPERAND_REG_IMM_INT32:
+  case AMDGPU::OPERAND_REG_IMM_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP32:
     return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
-  case 8:
+
+  case AMDGPU::OPERAND_REG_IMM_INT64:
+  case AMDGPU::OPERAND_REG_IMM_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP64:
     return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
-  case 2:
+
+  case AMDGPU::OPERAND_REG_IMM_INT16:
+  case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+    // FIXME: Is this correct? What do inline immediates do on SI for an f16
+    // source operand, given that SI does not have f16 support?
     return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
+
+  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+    uint16_t Lo16 = static_cast<uint16_t>(Imm);
+    assert(Lo16 == static_cast<uint16_t>(Imm >> 16));
+    uint32_t Encoding = getLit16Encoding(Lo16, STI);
+    assert(Encoding != 255 && "packed constants can only be inline immediates");
+    return Encoding;
+  }
   default:
     llvm_unreachable("invalid operand size");
   }
diff --git a/lib/Target/AMDGPU/MIMGInstructions.td b/lib/Target/AMDGPU/MIMGInstructions.td
index 46803e555711b255668b170d338b4c1b6c1d563b..a515eecc222afbd3d9a978949e68a7ad6dc1964a 100644
--- a/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/lib/Target/AMDGPU/MIMGInstructions.td
@@ -475,106 +475,6 @@ class ImageAtomicCmpSwapPattern<MIMG opcode, ValueType vt> : Pat <
     sub0)
 >;
 
-// ======= SI Image Intrinsics ================
-
-// Image load
-defm : ImagePatterns<int_SI_image_load, "IMAGE_LOAD">;
-defm : ImagePatterns<int_SI_image_load_mip, "IMAGE_LOAD_MIP">;
-def : ImagePattern<int_SI_getresinfo, IMAGE_GET_RESINFO_V4_V1, i32>;
-
-// Basic sample
-defm : SampleRawPatterns<int_SI_image_sample,           "IMAGE_SAMPLE">;
-defm : SampleRawPatterns<int_SI_image_sample_cl,        "IMAGE_SAMPLE_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_d,         "IMAGE_SAMPLE_D">;
-defm : SampleRawPatterns<int_SI_image_sample_d_cl,      "IMAGE_SAMPLE_D_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_l,         "IMAGE_SAMPLE_L">;
-defm : SampleRawPatterns<int_SI_image_sample_b,         "IMAGE_SAMPLE_B">;
-defm : SampleRawPatterns<int_SI_image_sample_b_cl,      "IMAGE_SAMPLE_B_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_lz,        "IMAGE_SAMPLE_LZ">;
-defm : SampleRawPatterns<int_SI_image_sample_cd,        "IMAGE_SAMPLE_CD">;
-defm : SampleRawPatterns<int_SI_image_sample_cd_cl,     "IMAGE_SAMPLE_CD_CL">;
-
-// Sample with comparison
-defm : SampleRawPatterns<int_SI_image_sample_c,         "IMAGE_SAMPLE_C">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cl,      "IMAGE_SAMPLE_C_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_c_d,       "IMAGE_SAMPLE_C_D">;
-defm : SampleRawPatterns<int_SI_image_sample_c_d_cl,    "IMAGE_SAMPLE_C_D_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_c_l,       "IMAGE_SAMPLE_C_L">;
-defm : SampleRawPatterns<int_SI_image_sample_c_b,       "IMAGE_SAMPLE_C_B">;
-defm : SampleRawPatterns<int_SI_image_sample_c_b_cl,    "IMAGE_SAMPLE_C_B_CL">;
-defm : SampleRawPatterns<int_SI_image_sample_c_lz,      "IMAGE_SAMPLE_C_LZ">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cd,      "IMAGE_SAMPLE_C_CD">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl,   "IMAGE_SAMPLE_C_CD_CL">;
-
-// Sample with offsets
-defm : SampleRawPatterns<int_SI_image_sample_o,         "IMAGE_SAMPLE_O">;
-defm : SampleRawPatterns<int_SI_image_sample_cl_o,      "IMAGE_SAMPLE_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_d_o,       "IMAGE_SAMPLE_D_O">;
-defm : SampleRawPatterns<int_SI_image_sample_d_cl_o,    "IMAGE_SAMPLE_D_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_l_o,       "IMAGE_SAMPLE_L_O">;
-defm : SampleRawPatterns<int_SI_image_sample_b_o,       "IMAGE_SAMPLE_B_O">;
-defm : SampleRawPatterns<int_SI_image_sample_b_cl_o,    "IMAGE_SAMPLE_B_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_lz_o,      "IMAGE_SAMPLE_LZ_O">;
-defm : SampleRawPatterns<int_SI_image_sample_cd_o,      "IMAGE_SAMPLE_CD_O">;
-defm : SampleRawPatterns<int_SI_image_sample_cd_cl_o,   "IMAGE_SAMPLE_CD_CL_O">;
-
-// Sample with comparison and offsets
-defm : SampleRawPatterns<int_SI_image_sample_c_o,       "IMAGE_SAMPLE_C_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cl_o,    "IMAGE_SAMPLE_C_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_d_o,     "IMAGE_SAMPLE_C_D_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_d_cl_o,  "IMAGE_SAMPLE_C_D_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_l_o,     "IMAGE_SAMPLE_C_L_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_b_o,     "IMAGE_SAMPLE_C_B_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_b_cl_o,  "IMAGE_SAMPLE_C_B_CL_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_lz_o,    "IMAGE_SAMPLE_C_LZ_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cd_o,    "IMAGE_SAMPLE_C_CD_O">;
-defm : SampleRawPatterns<int_SI_image_sample_c_cd_cl_o, "IMAGE_SAMPLE_C_CD_CL_O">;
-
-// Gather opcodes
-// Only the variants which make sense are defined.
-def : SampleRawPattern<int_SI_gather4,           IMAGE_GATHER4_V4_V2,        v2i32>;
-def : SampleRawPattern<int_SI_gather4,           IMAGE_GATHER4_V4_V4,        v4i32>;
-def : SampleRawPattern<int_SI_gather4_cl,        IMAGE_GATHER4_CL_V4_V4,     v4i32>;
-def : SampleRawPattern<int_SI_gather4_l,         IMAGE_GATHER4_L_V4_V4,      v4i32>;
-def : SampleRawPattern<int_SI_gather4_b,         IMAGE_GATHER4_B_V4_V4,      v4i32>;
-def : SampleRawPattern<int_SI_gather4_b_cl,      IMAGE_GATHER4_B_CL_V4_V4,   v4i32>;
-def : SampleRawPattern<int_SI_gather4_b_cl,      IMAGE_GATHER4_B_CL_V4_V8,   v8i32>;
-def : SampleRawPattern<int_SI_gather4_lz,        IMAGE_GATHER4_LZ_V4_V2,     v2i32>;
-def : SampleRawPattern<int_SI_gather4_lz,        IMAGE_GATHER4_LZ_V4_V4,     v4i32>;
-
-def : SampleRawPattern<int_SI_gather4_c,         IMAGE_GATHER4_C_V4_V4,      v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_cl,      IMAGE_GATHER4_C_CL_V4_V4,   v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_cl,      IMAGE_GATHER4_C_CL_V4_V8,   v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_l,       IMAGE_GATHER4_C_L_V4_V4,    v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_l,       IMAGE_GATHER4_C_L_V4_V8,    v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_b,       IMAGE_GATHER4_C_B_V4_V4,    v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_b,       IMAGE_GATHER4_C_B_V4_V8,    v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_b_cl,    IMAGE_GATHER4_C_B_CL_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_lz,      IMAGE_GATHER4_C_LZ_V4_V4,   v4i32>;
-
-def : SampleRawPattern<int_SI_gather4_o,         IMAGE_GATHER4_O_V4_V4,      v4i32>;
-def : SampleRawPattern<int_SI_gather4_cl_o,      IMAGE_GATHER4_CL_O_V4_V4,   v4i32>;
-def : SampleRawPattern<int_SI_gather4_cl_o,      IMAGE_GATHER4_CL_O_V4_V8,   v8i32>;
-def : SampleRawPattern<int_SI_gather4_l_o,       IMAGE_GATHER4_L_O_V4_V4,    v4i32>;
-def : SampleRawPattern<int_SI_gather4_l_o,       IMAGE_GATHER4_L_O_V4_V8,    v8i32>;
-def : SampleRawPattern<int_SI_gather4_b_o,       IMAGE_GATHER4_B_O_V4_V4,    v4i32>;
-def : SampleRawPattern<int_SI_gather4_b_o,       IMAGE_GATHER4_B_O_V4_V8,    v8i32>;
-def : SampleRawPattern<int_SI_gather4_b_cl_o,    IMAGE_GATHER4_B_CL_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_lz_o,      IMAGE_GATHER4_LZ_O_V4_V4,   v4i32>;
-
-def : SampleRawPattern<int_SI_gather4_c_o,       IMAGE_GATHER4_C_O_V4_V4,    v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_o,       IMAGE_GATHER4_C_O_V4_V8,    v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_cl_o,    IMAGE_GATHER4_C_CL_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_l_o,     IMAGE_GATHER4_C_L_O_V4_V8,  v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_b_o,     IMAGE_GATHER4_C_B_O_V4_V8,  v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_b_cl_o,  IMAGE_GATHER4_C_B_CL_O_V4_V8, v8i32>;
-def : SampleRawPattern<int_SI_gather4_c_lz_o,    IMAGE_GATHER4_C_LZ_O_V4_V4, v4i32>;
-def : SampleRawPattern<int_SI_gather4_c_lz_o,    IMAGE_GATHER4_C_LZ_O_V4_V8, v8i32>;
-
-def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V1, i32>;
-def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V2, v2i32>;
-def : SampleRawPattern<int_SI_getlod, IMAGE_GET_LOD_V4_V4, v4i32>;
-
 // ======= amdgcn Image Intrinsics ==============
 
 // Image load
diff --git a/lib/Target/AMDGPU/Processors.td b/lib/Target/AMDGPU/Processors.td
index 3c07cc76b9a14d8e21b4a29ac4de36e81732db80..0e4eda982139d3c5950969a80fb0c26dcdbb9566 100644
--- a/lib/Target/AMDGPU/Processors.td
+++ b/lib/Target/AMDGPU/Processors.td
@@ -187,3 +187,10 @@ def : ProcessorModel<"gfx810", SIQuarterSpeedModel,
   [FeatureISAVersion8_1_0]
 >;
 
+def : ProcessorModel<"gfx900",   SIQuarterSpeedModel,
+  [FeatureGFX9, FeatureISAVersion9_0_0, FeatureLDSBankCount32]
+>;
+
+def : ProcessorModel<"gfx901",   SIQuarterSpeedModel,
+  [FeatureGFX9, FeatureXNACK, FeatureISAVersion9_0_1, FeatureLDSBankCount32]
+>;
diff --git a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
index ceb5870a3494a0428db6fed06794cb9e8274b37c..03fc1aff5ec1596a70de05b413da71939b3f0672 100644
--- a/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
+++ b/lib/Target/AMDGPU/R600EmitClauseMarkers.cpp
@@ -196,6 +196,7 @@ private:
                         MachineBasicBlock::iterator Def,
                         MachineBasicBlock::iterator BBEnd) {
     const R600RegisterInfo &TRI = TII->getRegisterInfo();
+    //TODO: change this to defs?
     for (MachineInstr::const_mop_iterator
            MOI = Def->operands_begin(),
            MOE = Def->operands_end(); MOI != MOE; ++MOI) {
@@ -218,15 +219,17 @@ private:
         if (AluInstCount >= TII->getMaxAlusPerClause())
           return false;
 
+        // TODO: Is this true? kill flag appears to work OK below
         // Register kill flags have been cleared by the time we get to this
         // pass, but it is safe to assume that all uses of this register
         // occur in the same basic block as its definition, because
         // it is illegal for the scheduler to schedule them in
         // different blocks.
-        if (UseI->findRegisterUseOperandIdx(MOI->getReg()))
+        if (UseI->readsRegister(MOI->getReg()))
           LastUseCount = AluInstCount;
 
-        if (UseI != Def && UseI->findRegisterDefOperandIdx(MOI->getReg()) != -1)
+        // Exit early if the current use kills the register
+        if (UseI != Def && UseI->killsRegister(MOI->getReg()))
           break;
       }
       if (LastUseCount)
@@ -321,9 +324,11 @@ public:
       if (I != MBB.end() && I->getOpcode() == AMDGPU::CF_ALU)
         continue; // BB was already parsed
       for (MachineBasicBlock::iterator E = MBB.end(); I != E;) {
-        if (isALU(*I))
-          I = MakeALUClause(MBB, I);
-        else
+        if (isALU(*I)) {
+          auto next = MakeALUClause(MBB, I);
+          assert(next != I);
+          I = next;
+        } else
           ++I;
       }
     }
diff --git a/lib/Target/AMDGPU/R600FrameLowering.cpp b/lib/Target/AMDGPU/R600FrameLowering.cpp
index 5813786abe0164d11777c7095a64b38952cb942b..1f01ad732e00acc05a0cefe3e63fa05a7c578a8c 100644
--- a/lib/Target/AMDGPU/R600FrameLowering.cpp
+++ b/lib/Target/AMDGPU/R600FrameLowering.cpp
@@ -8,7 +8,43 @@
 //==-----------------------------------------------------------------------===//
 
 #include "R600FrameLowering.h"
+#include "AMDGPUSubtarget.h"
+#include "R600RegisterInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/Support/MathExtras.h"
 
 using namespace llvm;
 
 R600FrameLowering::~R600FrameLowering() = default;
+
+/// \returns The offset of \p FI in units of (getStackWidth(MF) * 4) bytes.
+int R600FrameLowering::getFrameIndexReference(const MachineFunction &MF,
+                                              int FI,
+                                              unsigned &FrameReg) const {
+  const MachineFrameInfo &MFI = MF.getFrameInfo();
+  const R600RegisterInfo *RI
+    = MF.getSubtarget<R600Subtarget>().getRegisterInfo();
+
+  // Fill in the FrameReg output argument from the register info.
+  FrameReg = RI->getFrameRegister(MF);
+
+  // Start the offset at 2 so we don't overwrite work group information.
+  // FIXME: Only do this when the shader actually uses this information.
+  // FI == -1 sums over every frame object instead of stopping before FI.
+  unsigned OffsetBytes = 2 * (getStackWidth(MF) * 4);
+  int UpperBound = FI == -1 ? MFI.getNumObjects() : FI;
+
+  for (int i = MFI.getObjectIndexBegin(); i < UpperBound; ++i) {
+    OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(i));
+    OffsetBytes += MFI.getObjectSize(i);
+    // Each register holds 4 bytes, so always round the running offset up to
+    // a multiple of 4; then 2 frame objects never share the same register.
+    OffsetBytes = alignTo(OffsetBytes, 4);
+  }
+
+  if (FI != -1)
+    OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(FI));
+
+  return OffsetBytes / (getStackWidth(MF) * 4);
+}
diff --git a/lib/Target/AMDGPU/R600FrameLowering.h b/lib/Target/AMDGPU/R600FrameLowering.h
index 874435f35ce45784874fc55f7bb867ff4b5523f2..142f70967edace578391cfaea7f1d2334b7feb2c 100644
--- a/lib/Target/AMDGPU/R600FrameLowering.h
+++ b/lib/Target/AMDGPU/R600FrameLowering.h
@@ -25,6 +25,8 @@ public:
                     MachineBasicBlock &MBB) const override {}
   void emitEpilogue(MachineFunction &MF,
                     MachineBasicBlock &MBB) const override {}
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const override;
 };
 
 } // end namespace llvm
diff --git a/lib/Target/AMDGPU/R600ISelLowering.cpp b/lib/Target/AMDGPU/R600ISelLowering.cpp
index c31af80258f9949793fdbc79259521c92c3b1968..3590a9b05e1d04773bc6cb8d08e5cb319801cb52 100644
--- a/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -226,6 +226,10 @@ R600TargetLowering::R600TargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::ATOMIC_LOAD, MVT::i32, Expand);
   setOperationAction(ISD::ATOMIC_STORE, MVT::i32, Expand);
 
+  // We need to custom lower some of the intrinsics
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
+
   setSchedulingPreference(Sched::Source);
 
   setTargetDAGCombine(ISD::FP_ROUND);
@@ -495,8 +499,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
                          cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
     EVT VT = Op.getValueType();
     SDLoc DL(Op);
-    switch(IntrinsicID) {
-    default: return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+    switch (IntrinsicID) {
     case AMDGPUIntrinsic::r600_tex:
     case AMDGPUIntrinsic::r600_texc: {
       unsigned TextureOp;
@@ -557,7 +560,7 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
     }
 
     case Intrinsic::r600_implicitarg_ptr: {
-      MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUAS::PARAM_I_ADDRESS);
+      MVT PtrVT = getPointerTy(DAG.getDataLayout(), AMDGPUASI.PARAM_I_ADDRESS);
       uint32_t ByteOffset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
       return DAG.getConstant(ByteOffset, DL, PtrVT);
     }
@@ -604,6 +607,8 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
 
     case Intrinsic::r600_recipsqrt_clamped:
       return DAG.getNode(AMDGPUISD::RSQ_CLAMP, DL, VT, Op.getOperand(1));
+    default:
+      return Op;
     }
 
     // break out of case ISD::INTRINSIC_WO_CHAIN in switch(Op.getOpcode())
@@ -707,12 +712,12 @@ SDValue R600TargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                                SDValue Op,
                                                SelectionDAG &DAG) const {
   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
-  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS)
+  if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS)
     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
 
   const DataLayout &DL = DAG.getDataLayout();
   const GlobalValue *GV = GSD->getGlobal();
-  MVT ConstPtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
+  MVT ConstPtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
 
   SDValue GA = DAG.getTargetGlobalAddress(GV, SDLoc(GSD), ConstPtrVT);
   return DAG.getNode(AMDGPUISD::CONST_DATA_PTR, SDLoc(GSD), ConstPtrVT, GA);
@@ -869,7 +874,7 @@ SDValue R600TargetLowering::LowerImplicitParameter(SelectionDAG &DAG, EVT VT,
                                                    unsigned DwordOffset) const {
   unsigned ByteOffset = DwordOffset * 4;
   PointerType * PtrType = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
-                                      AMDGPUAS::CONSTANT_BUFFER_0);
+                                      AMDGPUASI.CONSTANT_BUFFER_0);
 
   // We shouldn't be using an offset wider than 16-bits for implicit parameters.
   assert(isInt<16>(ByteOffset));
@@ -916,7 +921,7 @@ SDValue R600TargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const
 
   if (VT == MVT::f32) {
     DAGCombinerInfo DCI(DAG, AfterLegalizeVectorOps, true, nullptr);
-    SDValue MinMax = CombineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
+    SDValue MinMax = combineFMinMaxLegacy(DL, VT, LHS, RHS, True, False, CC, DCI);
     if (MinMax)
       return MinMax;
   }
@@ -1107,7 +1112,7 @@ SDValue R600TargetLowering::lowerPrivateTruncStore(StoreSDNode *Store,
   //TODO: Who creates the i8 stores?
   assert(Store->isTruncatingStore()
          || Store->getValue().getValueType() == MVT::i8);
-  assert(Store->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS);
+  assert(Store->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS);
 
   SDValue Mask;
   if (Store->getMemoryVT() == MVT::i8) {
@@ -1205,9 +1210,10 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(Op);
 
   // Neither LOCAL nor PRIVATE can do vectors at the moment
-  if ((AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::PRIVATE_ADDRESS) &&
+  if ((AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.PRIVATE_ADDRESS) &&
       VT.isVector()) {
-    if ((AS == AMDGPUAS::PRIVATE_ADDRESS) && StoreNode->isTruncatingStore()) {
+    if ((AS == AMDGPUASI.PRIVATE_ADDRESS) &&
+         StoreNode->isTruncatingStore()) {
       // Add an extra level of chain to isolate this vector
       SDValue NewChain = DAG.getNode(AMDGPUISD::DUMMY_CHAIN, DL, MVT::Other, Chain);
       // TODO: can the chain be replaced without creating a new store?
@@ -1230,7 +1236,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   SDValue DWordAddr = DAG.getNode(ISD::SRL, DL, PtrVT, Ptr,
                                   DAG.getConstant(2, DL, PtrVT));
 
-  if (AS == AMDGPUAS::GLOBAL_ADDRESS) {
+  if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
     // It is beneficial to create MSKOR here instead of combiner to avoid
     // artificial dependencies introduced by RMW
     if (StoreNode->isTruncatingStore()) {
@@ -1283,7 +1289,7 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   }
 
   // GLOBAL_ADDRESS has been handled above, LOCAL_ADDRESS allows all sizes
-  if (AS != AMDGPUAS::PRIVATE_ADDRESS)
+  if (AS != AMDGPUASI.PRIVATE_ADDRESS)
     return SDValue();
 
   if (MemVT.bitsLT(MVT::i32))
@@ -1302,39 +1308,39 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
 
 // return (512 + (kc_bank << 12)
 static int
-ConstantAddressBlock(unsigned AddressSpace) {
+ConstantAddressBlock(unsigned AddressSpace, AMDGPUAS AMDGPUASI) {
   switch (AddressSpace) {
-  case AMDGPUAS::CONSTANT_BUFFER_0:
+  case AMDGPUASI.CONSTANT_BUFFER_0:
     return 512;
-  case AMDGPUAS::CONSTANT_BUFFER_1:
+  case AMDGPUASI.CONSTANT_BUFFER_1:
     return 512 + 4096;
-  case AMDGPUAS::CONSTANT_BUFFER_2:
+  case AMDGPUASI.CONSTANT_BUFFER_2:
     return 512 + 4096 * 2;
-  case AMDGPUAS::CONSTANT_BUFFER_3:
+  case AMDGPUASI.CONSTANT_BUFFER_3:
     return 512 + 4096 * 3;
-  case AMDGPUAS::CONSTANT_BUFFER_4:
+  case AMDGPUASI.CONSTANT_BUFFER_4:
     return 512 + 4096 * 4;
-  case AMDGPUAS::CONSTANT_BUFFER_5:
+  case AMDGPUASI.CONSTANT_BUFFER_5:
     return 512 + 4096 * 5;
-  case AMDGPUAS::CONSTANT_BUFFER_6:
+  case AMDGPUASI.CONSTANT_BUFFER_6:
     return 512 + 4096 * 6;
-  case AMDGPUAS::CONSTANT_BUFFER_7:
+  case AMDGPUASI.CONSTANT_BUFFER_7:
     return 512 + 4096 * 7;
-  case AMDGPUAS::CONSTANT_BUFFER_8:
+  case AMDGPUASI.CONSTANT_BUFFER_8:
     return 512 + 4096 * 8;
-  case AMDGPUAS::CONSTANT_BUFFER_9:
+  case AMDGPUASI.CONSTANT_BUFFER_9:
     return 512 + 4096 * 9;
-  case AMDGPUAS::CONSTANT_BUFFER_10:
+  case AMDGPUASI.CONSTANT_BUFFER_10:
     return 512 + 4096 * 10;
-  case AMDGPUAS::CONSTANT_BUFFER_11:
+  case AMDGPUASI.CONSTANT_BUFFER_11:
     return 512 + 4096 * 11;
-  case AMDGPUAS::CONSTANT_BUFFER_12:
+  case AMDGPUASI.CONSTANT_BUFFER_12:
     return 512 + 4096 * 12;
-  case AMDGPUAS::CONSTANT_BUFFER_13:
+  case AMDGPUASI.CONSTANT_BUFFER_13:
     return 512 + 4096 * 13;
-  case AMDGPUAS::CONSTANT_BUFFER_14:
+  case AMDGPUASI.CONSTANT_BUFFER_14:
     return 512 + 4096 * 14;
-  case AMDGPUAS::CONSTANT_BUFFER_15:
+  case AMDGPUASI.CONSTANT_BUFFER_15:
     return 512 + 4096 * 15;
   default:
     return -1;
@@ -1402,7 +1408,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   EVT MemVT = LoadNode->getMemoryVT();
   ISD::LoadExtType ExtType = LoadNode->getExtensionType();
 
-  if (AS == AMDGPUAS::PRIVATE_ADDRESS &&
+  if (AS == AMDGPUASI.PRIVATE_ADDRESS &&
       ExtType != ISD::NON_EXTLOAD && MemVT.bitsLT(MVT::i32)) {
     return lowerPrivateExtLoad(Op, DAG);
   }
@@ -1412,13 +1418,14 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   SDValue Chain = LoadNode->getChain();
   SDValue Ptr = LoadNode->getBasePtr();
 
-  if ((LoadNode->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
-      LoadNode->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) &&
+  if ((LoadNode->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS ||
+      LoadNode->getAddressSpace() == AMDGPUASI.PRIVATE_ADDRESS) &&
       VT.isVector()) {
       return scalarizeVectorLoad(LoadNode, DAG);
   }
 
-  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace());
+  int ConstantBlock = ConstantAddressBlock(LoadNode->getAddressSpace(),
+      AMDGPUASI);
   if (ConstantBlock > -1 &&
       ((LoadNode->getExtensionType() == ISD::NON_EXTLOAD) ||
        (LoadNode->getExtensionType() == ISD::ZEXTLOAD))) {
@@ -1450,7 +1457,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
           DAG.getNode(ISD::SRL, DL, MVT::i32, Ptr,
                       DAG.getConstant(4, DL, MVT::i32)),
                       DAG.getConstant(LoadNode->getAddressSpace() -
-                                      AMDGPUAS::CONSTANT_BUFFER_0, DL, MVT::i32)
+                                      AMDGPUASI.CONSTANT_BUFFER_0, DL, MVT::i32)
           );
     }
 
@@ -1486,7 +1493,7 @@ SDValue R600TargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
     return DAG.getMergeValues(MergedValues, DL);
   }
 
-  if (LoadNode->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS) {
+  if (LoadNode->getAddressSpace() != AMDGPUASI.PRIVATE_ADDRESS) {
     return SDValue();
   }
 
@@ -1540,7 +1547,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
   SmallVector<ISD::InputArg, 8> LocalIns;
 
   if (AMDGPU::isShader(CallConv)) {
-    AnalyzeFormalArguments(CCInfo, Ins);
+    CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
   } else {
     analyzeFormalArgumentsCompute(CCInfo, Ins);
   }
@@ -1563,7 +1570,7 @@ SDValue R600TargetLowering::LowerFormalArguments(
     }
 
     PointerType *PtrTy = PointerType::get(VT.getTypeForEVT(*DAG.getContext()),
-                                          AMDGPUAS::CONSTANT_BUFFER_0);
+                                          AMDGPUASI.CONSTANT_BUFFER_0);
 
     // i64 isn't a legal type, so the register type used ends up as i32, which
     // isn't expected here. It attempts to create this sextload, but it ends up
diff --git a/lib/Target/AMDGPU/R600InstrInfo.cpp b/lib/Target/AMDGPU/R600InstrInfo.cpp
index 3cd983056631f2bc443fb249a268c9cba1e613d8..2422d57269eb96500c7a5fda5918fa98a137837b 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -869,7 +869,7 @@ bool R600InstrInfo::isPredicated(const MachineInstr &MI) const {
   }
 }
 
-bool R600InstrInfo::isPredicable(MachineInstr &MI) const {
+bool R600InstrInfo::isPredicable(const MachineInstr &MI) const {
   // XXX: KILL* instructions can be predicated, but they must be the last
   // instruction in a clause, so this means any instructions after them cannot
   // be predicated.  Until we have proper support for instruction clauses in the
@@ -880,7 +880,7 @@ bool R600InstrInfo::isPredicable(MachineInstr &MI) const {
   } else if (MI.getOpcode() == AMDGPU::CF_ALU) {
     // If the clause start in the middle of MBB then the MBB has more
     // than a single clause, unable to predicate several clauses.
-    if (MI.getParent()->begin() != MachineBasicBlock::iterator(MI))
+    if (MI.getParent()->begin() != MachineBasicBlock::const_iterator(MI))
       return false;
     // TODO: We don't support KC merging atm
     return MI.getOperand(3).getImm() == 0 && MI.getOperand(4).getImm() == 0;
@@ -893,7 +893,7 @@ bool R600InstrInfo::isPredicable(MachineInstr &MI) const {
 
 bool
 R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &MBB,
-                                   unsigned NumCyles,
+                                   unsigned NumCycles,
                                    unsigned ExtraPredCycles,
                                    BranchProbability Probability) const{
   return true;
@@ -912,7 +912,7 @@ R600InstrInfo::isProfitableToIfCvt(MachineBasicBlock &TMBB,
 
 bool
 R600InstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
-                                         unsigned NumCyles,
+                                         unsigned NumCycles,
                                          BranchProbability Probability)
                                          const {
   return true;
diff --git a/lib/Target/AMDGPU/R600InstrInfo.h b/lib/Target/AMDGPU/R600InstrInfo.h
index a280052dbd4a3f4b9c1f335a2e18764b7b9426d7..3b828006807e37ef0cc748d55a2e97e4db1200ac 100644
--- a/lib/Target/AMDGPU/R600InstrInfo.h
+++ b/lib/Target/AMDGPU/R600InstrInfo.h
@@ -177,12 +177,12 @@ public:
 
   bool isPredicated(const MachineInstr &MI) const override;
 
-  bool isPredicable(MachineInstr &MI) const override;
+  bool isPredicable(const MachineInstr &MI) const override;
 
-  bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+  bool isProfitableToDupForIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
                                  BranchProbability Probability) const override;
 
-  bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCyles,
+  bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
                            unsigned ExtraPredCycles,
                            BranchProbability Probability) const override ;
 
diff --git a/lib/Target/AMDGPU/R600Instructions.td b/lib/Target/AMDGPU/R600Instructions.td
index 9210e66b0fe76e6ad54d75a353c76e0fca94ccf2..bac557ba989ef798ef954f106c4bbacfefbd820d 100644
--- a/lib/Target/AMDGPU/R600Instructions.td
+++ b/lib/Target/AMDGPU/R600Instructions.td
@@ -316,7 +316,7 @@ class VTX_READ <string name, dag outs, list<dag> pattern>
 class LoadParamFrag <PatFrag load_type> : PatFrag <
   (ops node:$ptr), (load_type node:$ptr),
   [{ return isConstantLoad(cast<LoadSDNode>(N), 0) ||
-            (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::PARAM_I_ADDRESS); }]
+            (cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUASI.PARAM_I_ADDRESS); }]
 >;
 
 def vtx_id3_az_extloadi8 : LoadParamFrag<az_extloadi8>;
@@ -326,8 +326,8 @@ def vtx_id3_load : LoadParamFrag<load>;
 class LoadVtxId1 <PatFrag load> : PatFrag <
   (ops node:$ptr), (load node:$ptr), [{
   const MemSDNode *LD = cast<MemSDNode>(N);
-  return LD->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
-         (LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+  return LD->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
+         (LD->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
            !isa<GlobalValue>(GetUnderlyingObject(
            LD->getMemOperand()->getValue(), CurDAG->getDataLayout())));
 }]>;
@@ -339,7 +339,7 @@ def vtx_id1_load : LoadVtxId1 <load>;
 class LoadVtxId2 <PatFrag load> : PatFrag <
   (ops node:$ptr), (load node:$ptr), [{
   const MemSDNode *LD = cast<MemSDNode>(N);
-  return LD->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+  return LD->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
          isa<GlobalValue>(GetUnderlyingObject(
          LD->getMemOperand()->getValue(), CurDAG->getDataLayout()));
 }]>;
@@ -1013,7 +1013,7 @@ multiclass CUBE_Common <bits<11> inst> {
     (outs R600_Reg128:$dst),
     (ins R600_Reg128:$src0),
     "CUBE $dst $src0",
-    [(set v4f32:$dst, (int_AMDGPU_cube v4f32:$src0))],
+    [(set v4f32:$dst, (int_r600_cube v4f32:$src0))],
     VecALU
   > {
     let isPseudo = 1;
diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index d70f52e0f295279bf248287cf19d699955b80ffd..b7e62075244b8147e79b0d760500c06527d86000 100644
--- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -22,6 +22,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Pass.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 
 using namespace llvm;
@@ -34,15 +35,6 @@ namespace {
 typedef std::pair<BasicBlock *, Value *> StackEntry;
 typedef SmallVector<StackEntry, 16> StackVector;
 
-// Intrinsic names the control flow is annotated with
-static const char *const IfIntrinsic = "llvm.amdgcn.if";
-static const char *const ElseIntrinsic = "llvm.amdgcn.else";
-static const char *const BreakIntrinsic = "llvm.amdgcn.break";
-static const char *const IfBreakIntrinsic = "llvm.amdgcn.if.break";
-static const char *const ElseBreakIntrinsic = "llvm.amdgcn.else.break";
-static const char *const LoopIntrinsic = "llvm.amdgcn.loop";
-static const char *const EndCfIntrinsic = "llvm.amdgcn.end.cf";
-
 class SIAnnotateControlFlow : public FunctionPass {
   DivergenceAnalysis *DA;
 
@@ -56,13 +48,13 @@ class SIAnnotateControlFlow : public FunctionPass {
   UndefValue *BoolUndef;
   Constant *Int64Zero;
 
-  Constant *If;
-  Constant *Else;
-  Constant *Break;
-  Constant *IfBreak;
-  Constant *ElseBreak;
-  Constant *Loop;
-  Constant *EndCf;
+  Function *If;
+  Function *Else;
+  Function *Break;
+  Function *IfBreak;
+  Function *ElseBreak;
+  Function *Loop;
+  Function *EndCf;
 
   DominatorTree *DT;
   StackVector Stack;
@@ -86,7 +78,8 @@ class SIAnnotateControlFlow : public FunctionPass {
   void insertElse(BranchInst *Term);
 
   Value *handleLoopCondition(Value *Cond, PHINode *Broken,
-                             llvm::Loop *L, BranchInst *Term);
+                             llvm::Loop *L, BranchInst *Term,
+                             SmallVectorImpl<WeakVH> &LoopPhiConditions);
 
   void handleLoop(BranchInst *Term);
 
@@ -118,6 +111,7 @@ public:
 
 INITIALIZE_PASS_BEGIN(SIAnnotateControlFlow, DEBUG_TYPE,
                       "Annotate SI Control Flow", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
 INITIALIZE_PASS_END(SIAnnotateControlFlow, DEBUG_TYPE,
                     "Annotate SI Control Flow", false, false)
@@ -138,30 +132,13 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) {
   BoolUndef = UndefValue::get(Boolean);
   Int64Zero = ConstantInt::get(Int64, 0);
 
-  If = M.getOrInsertFunction(
-    IfIntrinsic, ReturnStruct, Boolean, (Type *)nullptr);
-
-  Else = M.getOrInsertFunction(
-    ElseIntrinsic, ReturnStruct, Int64, (Type *)nullptr);
-
-  Break = M.getOrInsertFunction(
-    BreakIntrinsic, Int64, Int64, (Type *)nullptr);
-  cast<Function>(Break)->setDoesNotAccessMemory();
-
-  IfBreak = M.getOrInsertFunction(
-    IfBreakIntrinsic, Int64, Boolean, Int64, (Type *)nullptr);
-  cast<Function>(IfBreak)->setDoesNotAccessMemory();;
-
-  ElseBreak = M.getOrInsertFunction(
-    ElseBreakIntrinsic, Int64, Int64, Int64, (Type *)nullptr);
-  cast<Function>(ElseBreak)->setDoesNotAccessMemory();
-
-  Loop = M.getOrInsertFunction(
-    LoopIntrinsic, Boolean, Int64, (Type *)nullptr);
-
-  EndCf = M.getOrInsertFunction(
-    EndCfIntrinsic, Void, Int64, (Type *)nullptr);
-
+  If = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if);
+  Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else);
+  Break = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_break);
+  IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break);
+  ElseBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else_break);
+  Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop);
+  EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf);
   return false;
 }
 
@@ -208,15 +185,16 @@ bool SIAnnotateControlFlow::isElse(PHINode *Phi) {
 
 // \brief Erase "Phi" if it is not used any more
 void SIAnnotateControlFlow::eraseIfUnused(PHINode *Phi) {
-  if (!Phi->hasNUsesOrMore(1))
-    Phi->eraseFromParent();
+  if (llvm::RecursivelyDeleteDeadPHINode(Phi)) {
+    DEBUG(dbgs() << "Erased unused condition phi\n");
+  }
 }
 
 /// \brief Open a new "If" block
 void SIAnnotateControlFlow::openIf(BranchInst *Term) {
-  if (isUniform(Term)) {
+  if (isUniform(Term))
     return;
-  }
+
   Value *Ret = CallInst::Create(If, Term->getCondition(), "", Term);
   Term->setCondition(ExtractValueInst::Create(Ret, 0, "", Term));
   push(Term->getSuccessor(1), ExtractValueInst::Create(Ret, 1, "", Term));
@@ -233,8 +211,10 @@ void SIAnnotateControlFlow::insertElse(BranchInst *Term) {
 }
 
 /// \brief Recursively handle the condition leading to a loop
-Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
-                                             llvm::Loop *L, BranchInst *Term) {
+Value *SIAnnotateControlFlow::handleLoopCondition(
+  Value *Cond, PHINode *Broken,
+  llvm::Loop *L, BranchInst *Term,
+  SmallVectorImpl<WeakVH> &LoopPhiConditions) {
 
   // Only search through PHI nodes which are inside the loop.  If we try this
   // with PHI nodes that are outside of the loop, we end up inserting new PHI
@@ -245,7 +225,7 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
   if ((Phi = dyn_cast<PHINode>(Cond)) && L->contains(Phi)) {
 
     BasicBlock *Parent = Phi->getParent();
-    PHINode *NewPhi = PHINode::Create(Int64, 0, "", &Parent->front());
+    PHINode *NewPhi = PHINode::Create(Int64, 0, "loop.phi", &Parent->front());
     Value *Ret = NewPhi;
 
     // Handle all non-constant incoming values first
@@ -258,14 +238,14 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
       }
 
       Phi->setIncomingValue(i, BoolFalse);
-      Value *PhiArg = handleLoopCondition(Incoming, Broken, L, Term);
+      Value *PhiArg = handleLoopCondition(Incoming, Broken, L,
+                                          Term, LoopPhiConditions);
       NewPhi->addIncoming(PhiArg, From);
     }
 
     BasicBlock *IDom = DT->getNode(Parent)->getIDom()->getBlock();
 
     for (unsigned i = 0, e = Phi->getNumIncomingValues(); i != e; ++i) {
-
       Value *Incoming = Phi->getIncomingValue(i);
       if (Incoming != BoolTrue)
         continue;
@@ -295,14 +275,17 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
           continue;
         }
       }
+
       TerminatorInst *Insert = From->getTerminator();
       Value *PhiArg = CallInst::Create(Break, Broken, "", Insert);
       NewPhi->setIncomingValue(i, PhiArg);
     }
-    eraseIfUnused(Phi);
+
+    LoopPhiConditions.push_back(WeakVH(Phi));
     return Ret;
+  }
 
-  } else if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
+  if (Instruction *Inst = dyn_cast<Instruction>(Cond)) {
     BasicBlock *Parent = Inst->getParent();
     Instruction *Insert;
     if (L->contains(Inst)) {
@@ -310,46 +293,55 @@ Value *SIAnnotateControlFlow::handleLoopCondition(Value *Cond, PHINode *Broken,
     } else {
       Insert = L->getHeader()->getFirstNonPHIOrDbgOrLifetime();
     }
+
     Value *Args[] = { Cond, Broken };
     return CallInst::Create(IfBreak, Args, "", Insert);
+  }
 
-  // Insert IfBreak before TERM for constant COND.
-  } else if (isa<ConstantInt>(Cond)) {
-    Value *Args[] = { Cond, Broken };
-    return CallInst::Create(IfBreak, Args, "", Term);
+  // Constant COND: insert IfBreak before TERM if true, else in the loop header.
+  if (isa<Constant>(Cond)) {
+    Instruction *Insert = Cond == BoolTrue ?
+      Term : L->getHeader()->getTerminator();
 
-  } else {
-    llvm_unreachable("Unhandled loop condition!");
+    Value *Args[] = { Cond, Broken };
+    return CallInst::Create(IfBreak, Args, "", Insert);
   }
-  return nullptr;
+
+  llvm_unreachable("Unhandled loop condition!");
 }
 
 /// \brief Handle a back edge (loop)
 void SIAnnotateControlFlow::handleLoop(BranchInst *Term) {
-  if (isUniform(Term)) {
+  if (isUniform(Term))
     return;
-  }
 
   BasicBlock *BB = Term->getParent();
   llvm::Loop *L = LI->getLoopFor(BB);
   if (!L)
     return;
+
   BasicBlock *Target = Term->getSuccessor(1);
-  PHINode *Broken = PHINode::Create(Int64, 0, "", &Target->front());
+  PHINode *Broken = PHINode::Create(Int64, 0, "phi.broken", &Target->front());
 
+  SmallVector<WeakVH, 8> LoopPhiConditions;
   Value *Cond = Term->getCondition();
   Term->setCondition(BoolTrue);
-  Value *Arg = handleLoopCondition(Cond, Broken, L, Term);
+  Value *Arg = handleLoopCondition(Cond, Broken, L, Term, LoopPhiConditions);
 
-  for (pred_iterator PI = pred_begin(Target), PE = pred_end(Target);
-       PI != PE; ++PI) {
+  for (BasicBlock *Pred : predecessors(Target))
+    Broken->addIncoming(Pred == BB ? Arg : Int64Zero, Pred);
+
+  Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
 
-    Broken->addIncoming(*PI == BB ? Arg : Int64Zero, *PI);
+  for (WeakVH Val : reverse(LoopPhiConditions)) {
+    if (PHINode *Cond = cast_or_null<PHINode>(Val))
+      eraseIfUnused(Cond);
   }
 
-  Term->setCondition(CallInst::Create(Loop, Arg, "", Term));
   push(Term->getSuccessor(0), Arg);
-}/// \brief Close the last opened control flow
+}
+
+/// \brief Close the last opened control flow
 void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
   llvm::Loop *L = LI->getLoopFor(BB);
 
@@ -359,59 +351,62 @@ void SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) {
     // We can't insert an EndCF call into a loop header, because it will
     // get executed on every iteration of the loop, when it should be
     // executed only once before the loop.
-    SmallVector <BasicBlock*, 8> Latches;
+    SmallVector <BasicBlock *, 8> Latches;
     L->getLoopLatches(Latches);
 
-    std::vector<BasicBlock*> Preds;
-    for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
-      if (!is_contained(Latches, *PI))
-        Preds.push_back(*PI);
+    SmallVector<BasicBlock *, 2> Preds;
+    for (BasicBlock *Pred : predecessors(BB)) {
+      if (!is_contained(Latches, Pred))
+        Preds.push_back(Pred);
     }
+
     BB = llvm::SplitBlockPredecessors(BB, Preds, "endcf.split", DT, LI, false);
   }
 
   Value *Exec = popSaved();
-  if (!isa<UndefValue>(Exec))
-    CallInst::Create(EndCf, Exec, "", &*BB->getFirstInsertionPt());
+  Instruction *FirstInsertionPt = &*BB->getFirstInsertionPt();
+  if (!isa<UndefValue>(Exec) && !isa<UnreachableInst>(FirstInsertionPt))
+    CallInst::Create(EndCf, Exec, "", FirstInsertionPt);
 }
 
 /// \brief Annotate the control flow with intrinsics so the backend can
 /// recognize if/then/else and loops.
 bool SIAnnotateControlFlow::runOnFunction(Function &F) {
-
   DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
   LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   DA = &getAnalysis<DivergenceAnalysis>();
 
   for (df_iterator<BasicBlock *> I = df_begin(&F.getEntryBlock()),
        E = df_end(&F.getEntryBlock()); I != E; ++I) {
-
-    BranchInst *Term = dyn_cast<BranchInst>((*I)->getTerminator());
+    BasicBlock *BB = *I;
+    BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
 
     if (!Term || Term->isUnconditional()) {
-      if (isTopOfStack(*I))
-        closeControlFlow(*I);
+      if (isTopOfStack(BB))
+        closeControlFlow(BB);
 
       continue;
     }
 
     if (I.nodeVisited(Term->getSuccessor(1))) {
-      if (isTopOfStack(*I))
-        closeControlFlow(*I);
+      if (isTopOfStack(BB))
+        closeControlFlow(BB);
 
       handleLoop(Term);
       continue;
     }
 
-    if (isTopOfStack(*I)) {
+    if (isTopOfStack(BB)) {
       PHINode *Phi = dyn_cast<PHINode>(Term->getCondition());
-      if (Phi && Phi->getParent() == *I && isElse(Phi)) {
+      if (Phi && Phi->getParent() == BB && isElse(Phi)) {
         insertElse(Term);
         eraseIfUnused(Phi);
         continue;
       }
-      closeControlFlow(*I);
+
+      closeControlFlow(BB);
     }
+
     openIf(Term);
   }
 
diff --git a/lib/Target/AMDGPU/SIDefines.h b/lib/Target/AMDGPU/SIDefines.h
index ff4e32147184a5185eb76ca583e37e4e0f6fbf54..3dd372b328668658d0a071871520a0cf0df139e0 100644
--- a/lib/Target/AMDGPU/SIDefines.h
+++ b/lib/Target/AMDGPU/SIDefines.h
@@ -36,6 +36,7 @@ enum : uint64_t {
 
  // TODO: Should this be spilt into VOP3 a and b?
   VOP3 = 1 << 10,
+  VOP3P = 1 << 12,
 
   VINTRP = 1 << 13,
   SDWA = 1 << 14,
@@ -65,8 +66,8 @@ enum : uint64_t {
   SOPK_ZEXT = UINT64_C(1) << 38,
   SCALAR_STORE = UINT64_C(1) << 39,
   FIXED_SIZE = UINT64_C(1) << 40,
-  VOPAsmPrefer32Bit = UINT64_C(1) << 41
-
+  VOPAsmPrefer32Bit = UINT64_C(1) << 41,
+  HasFPClamp = UINT64_C(1) << 42
 };
 
 // v_cmp_class_* etc. use a 10-bit mask for what operation is checked.
@@ -102,12 +103,14 @@ namespace AMDGPU {
     OPERAND_REG_INLINE_C_FP16,
     OPERAND_REG_INLINE_C_FP32,
     OPERAND_REG_INLINE_C_FP64,
+    OPERAND_REG_INLINE_C_V2FP16,
+    OPERAND_REG_INLINE_C_V2INT16,
 
     OPERAND_REG_IMM_FIRST = OPERAND_REG_IMM_INT32,
     OPERAND_REG_IMM_LAST = OPERAND_REG_IMM_FP16,
 
     OPERAND_REG_INLINE_C_FIRST = OPERAND_REG_INLINE_C_INT16,
-    OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_FP64,
+    OPERAND_REG_INLINE_C_LAST = OPERAND_REG_INLINE_C_V2INT16,
 
     OPERAND_SRC_FIRST = OPERAND_REG_IMM_INT32,
     OPERAND_SRC_LAST = OPERAND_REG_INLINE_C_LAST,
@@ -125,9 +128,12 @@ namespace AMDGPU {
 // NEG and SEXT share same bit-mask because they can't be set simultaneously.
 namespace SISrcMods {
   enum {
-   NEG = 1 << 0,  // Floating-point negate modifier
-   ABS = 1 << 1,  // Floating-point absolute modifier
-   SEXT = 1 << 0  // Integer sign-extend modifier
+   NEG = 1 << 0,   // Floating-point negate modifier
+   ABS = 1 << 1,   // Floating-point absolute modifier
+   SEXT = 1 << 0,  // Integer sign-extend modifier
+   NEG_HI = ABS,   // Floating-point negate high packed component modifier.
+   OP_SEL_0 = 1 << 2,
+   OP_SEL_1 = 1 << 3
   };
 }
 
@@ -242,6 +248,7 @@ enum Id { // HwRegCode, (6) [5:0]
   ID_LDS_ALLOC = 6,
   ID_IB_STS = 7,
   ID_SYMBOLIC_LAST_ = 8,
+  ID_MEM_BASES = 15,
   ID_SHIFT_ = 0,
   ID_WIDTH_ = 6,
   ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
@@ -251,14 +258,20 @@ enum Offset { // Offset, (5) [10:6]
   OFFSET_DEFAULT_ = 0,
   OFFSET_SHIFT_ = 6,
   OFFSET_WIDTH_ = 5,
-  OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_)
+  OFFSET_MASK_ = (((1 << OFFSET_WIDTH_) - 1) << OFFSET_SHIFT_),
+
+  OFFSET_SRC_SHARED_BASE = 16,
+  OFFSET_SRC_PRIVATE_BASE = 0
 };
 
 enum WidthMinusOne { // WidthMinusOne, (5) [15:11]
   WIDTH_M1_DEFAULT_ = 31,
   WIDTH_M1_SHIFT_ = 11,
   WIDTH_M1_WIDTH_ = 5,
-  WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_)
+  WIDTH_M1_MASK_ = (((1 << WIDTH_M1_WIDTH_) - 1) << WIDTH_M1_SHIFT_),
+
+  WIDTH_M1_SRC_SHARED_BASE = 15,
+  WIDTH_M1_SRC_PRIVATE_BASE = 15
 };
 
 } // namespace Hwreg
@@ -300,6 +313,9 @@ enum DstUnused {
 #define   S_00B84C_USER_SGPR(x)                                       (((x) & 0x1F) << 1)
 #define   G_00B84C_USER_SGPR(x)                                       (((x) >> 1) & 0x1F)
 #define   C_00B84C_USER_SGPR                                          0xFFFFFFC1
+#define   S_00B84C_TRAP_HANDLER(x)                                    (((x) & 0x1) << 6)
+#define   G_00B84C_TRAP_HANDLER(x)                                    (((x) >> 6) & 0x1)
+#define   C_00B84C_TRAP_HANDLER                                       0xFFFFFFBF
 #define   S_00B84C_TGID_X_EN(x)                                       (((x) & 0x1) << 7)
 #define   G_00B84C_TGID_X_EN(x)                                       (((x) >> 7) & 0x1)
 #define   C_00B84C_TGID_X_EN                                          0xFFFFFF7F
@@ -387,7 +403,6 @@ enum DstUnused {
 
 #define R_SPILLED_SGPRS         0x4
 #define R_SPILLED_VGPRS         0x8
-
 } // End namespace llvm
 
 #endif
diff --git a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 43cb15f502cd00c8aea5933eae800939588789c9..34cd6f704a12f51e0fb00bb3c8899c6784ce2eff 100644
--- a/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -198,6 +198,10 @@ static bool foldVGPRCopyIntoRegSequence(MachineInstr &MI,
   if (!CopyUse.isCopy())
     return false;
 
+  // It is illegal to have vreg inputs to a physreg defining reg_sequence.
+  if (TargetRegisterInfo::isPhysicalRegister(CopyUse.getOperand(0).getReg()))
+    return false;
+
   const TargetRegisterClass *SrcRC, *DstRC;
   std::tie(SrcRC, DstRC) = getCopyRegClasses(CopyUse, *TRI, MRI);
 
diff --git a/lib/Target/AMDGPU/SIFoldOperands.cpp b/lib/Target/AMDGPU/SIFoldOperands.cpp
index a5c0d4923d6b3b702d8a77fc69aa93a47b2bb882..d63414735b95a3b7b83110e769ca9d9353ec04c5 100644
--- a/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -12,6 +12,7 @@
 #include "AMDGPU.h"
 #include "AMDGPUSubtarget.h"
 #include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
 #include "llvm/CodeGen/LiveIntervalAnalysis.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -66,6 +67,7 @@ public:
   MachineRegisterInfo *MRI;
   const SIInstrInfo *TII;
   const SIRegisterInfo *TRI;
+  const SISubtarget *ST;
 
   void foldOperand(MachineOperand &OpToFold,
                    MachineInstr *UseMI,
@@ -75,6 +77,12 @@ public:
 
   void foldInstOperand(MachineInstr &MI, MachineOperand &OpToFold) const;
 
+  const MachineOperand *isClamp(const MachineInstr &MI) const;
+  bool tryFoldClamp(MachineInstr &MI);
+
+  std::pair<const MachineOperand *, int> isOMod(const MachineInstr &MI) const;
+  bool tryFoldOMod(MachineInstr &MI);
+
 public:
   SIFoldOperands() : MachineFunctionPass(ID) {
     initializeSIFoldOperandsPass(*PassRegistry::getPassRegistry());
@@ -131,27 +139,6 @@ FunctionPass *llvm::createSIFoldOperandsPass() {
   return new SIFoldOperands();
 }
 
-static bool isSafeToFold(const MachineInstr &MI) {
-  switch (MI.getOpcode()) {
-  case AMDGPU::V_MOV_B32_e32:
-  case AMDGPU::V_MOV_B32_e64:
-  case AMDGPU::V_MOV_B64_PSEUDO: {
-    // If there are additional implicit register operands, this may be used for
-    // register indexing so the source register operand isn't simply copied.
-    unsigned NumOps = MI.getDesc().getNumOperands() +
-      MI.getDesc().getNumImplicitUses();
-
-    return MI.getNumOperands() == NumOps;
-  }
-  case AMDGPU::S_MOV_B32:
-  case AMDGPU::S_MOV_B64:
-  case AMDGPU::COPY:
-    return true;
-  default:
-    return false;
-  }
-}
-
 static bool updateOperand(FoldCandidate &Fold,
                           const TargetRegisterInfo &TRI) {
   MachineInstr *MI = Fold.UseMI;
@@ -359,8 +346,6 @@ void SIFoldOperands::foldOperand(
   const TargetRegisterClass *FoldRC =
     TRI->getRegClass(FoldDesc.OpInfo[0].RegClass);
 
-  APInt Imm(TII->operandBitWidth(FoldDesc.OpInfo[1].OperandType),
-            OpToFold.getImm());
 
   // Split 64-bit constants into 32-bits for folding.
   if (UseOp.getSubReg() && AMDGPU::getRegBitWidth(FoldRC->getID()) == 64) {
@@ -370,21 +355,25 @@ void SIFoldOperands::foldOperand(
       MRI->getRegClass(UseReg) :
       TRI->getPhysRegClass(UseReg);
 
-    assert(Imm.getBitWidth() == 64);
-
     if (AMDGPU::getRegBitWidth(UseRC->getID()) != 64)
       return;
 
+    APInt Imm(64, OpToFold.getImm());
     if (UseOp.getSubReg() == AMDGPU::sub0) {
       Imm = Imm.getLoBits(32);
     } else {
       assert(UseOp.getSubReg() == AMDGPU::sub1);
       Imm = Imm.getHiBits(32);
     }
+
+    MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
+    tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+    return;
   }
 
-  MachineOperand ImmOp = MachineOperand::CreateImm(Imm.getSExtValue());
-  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &ImmOp, TII);
+
+
+  tryAddToFoldList(FoldList, UseMI, UseOpIdx, &OpToFold, TII);
 }
 
 static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
@@ -581,6 +570,32 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
   return false;
 }
 
+// Fold a V_CNDMASK with identical src0/src1 into COPY or getMovOpc(false).
+static bool tryFoldInst(const SIInstrInfo *TII,
+                        MachineInstr *MI) {
+  unsigned Opc = MI->getOpcode();
+
+  if (Opc == AMDGPU::V_CNDMASK_B32_e32    ||
+      Opc == AMDGPU::V_CNDMASK_B32_e64    ||
+      Opc == AMDGPU::V_CNDMASK_B64_PSEUDO) {
+    const MachineOperand *Src0 = TII->getNamedOperand(*MI, AMDGPU::OpName::src0);
+    const MachineOperand *Src1 = TII->getNamedOperand(*MI, AMDGPU::OpName::src1);
+    if (Src1->isIdenticalTo(*Src0)) {
+      DEBUG(dbgs() << "Folded " << *MI << " into ");
+      int Src2Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2);
+      if (Src2Idx != -1)
+        MI->RemoveOperand(Src2Idx);
+      MI->RemoveOperand(AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1));
+      mutateCopyOp(*MI, TII->get(Src0->isReg() ? (unsigned)AMDGPU::COPY
+                                               : getMovOpc(false)));
+      DEBUG(dbgs() << *MI << '\n');
+      return true;
+    }
+  }
+
+  return false;
+}
+
 void SIFoldOperands::foldInstOperand(MachineInstr &MI,
                                      MachineOperand &OpToFold) const {
   // We need mutate the operands of new mov instructions to add implicit
@@ -682,20 +697,213 @@ void SIFoldOperands::foldInstOperand(MachineInstr &MI,
       }
       DEBUG(dbgs() << "Folded source from " << MI << " into OpNo " <<
             static_cast<int>(Fold.UseOpNo) << " of " << *Fold.UseMI << '\n');
+      tryFoldInst(TII, Fold.UseMI);
     }
   }
 }
 
+// Return src0 if MI is a clamping max of a register with itself, else nullptr.
+const MachineOperand *SIFoldOperands::isClamp(const MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  case AMDGPU::V_MAX_F32_e64:
+  case AMDGPU::V_MAX_F16_e64:
+  case AMDGPU::V_MAX_F64: {
+    if (!TII->getNamedOperand(MI, AMDGPU::OpName::clamp)->getImm())
+      return nullptr;
+    // Both sources must be the same whole register; max(x, y) is not a clamp.
+    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+    if (!Src0->isReg() || !Src1->isReg() || Src0->getReg() != Src1->getReg() ||
+        Src0->getSubReg() != Src1->getSubReg() ||
+        Src0->getSubReg() != AMDGPU::NoSubRegister)
+      return nullptr;
+
+    // Can't fold up if we have modifiers.
+    if (TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
+        TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+      return nullptr;
+    return Src0;
+  }
+  default:
+    return nullptr;
+  }
+}
+
+// A clamp reads its source register twice within one instruction, so walk the
+// non-debug using instructions (use_instr_nodbg) instead of the use operands.
+static bool hasOneNonDBGUseInst(const MachineRegisterInfo &MRI, unsigned Reg) {
+  auto UseIt = MRI.use_instr_nodbg_begin(Reg);
+  const auto UseEnd = MRI.use_instr_nodbg_end();
+
+  // No using instruction at all is accepted, exactly like a single one.
+  if (UseIt == UseEnd)
+    return true;
+
+  return ++UseIt == UseEnd;
+}
+
+bool SIFoldOperands::tryFoldClamp(MachineInstr &MI) {
+  const MachineOperand *ClampSrc = isClamp(MI);
+  if (!ClampSrc || !hasOneNonDBGUseInst(*MRI, ClampSrc->getReg()))
+    return false;
+
+  MachineInstr *Def = MRI->getVRegDef(ClampSrc->getReg());
+  if (!TII->hasFPClamp(*Def))
+    return false;
+  MachineOperand *DefClamp = TII->getNamedOperand(*Def, AMDGPU::OpName::clamp);
+  if (!DefClamp)
+    return false;
+
+  DEBUG(dbgs() << "Folding clamp " << *DefClamp << " into " << *Def << '\n');
+
+  // Clamp is applied after omod, so it is OK if omod is set.
+  DefClamp->setImm(1);
+  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
+  MI.eraseFromParent();
+  return true;
+}
+
+static int getOModValue(unsigned Opc, int64_t Val) {
+  switch (Opc) {
+  case AMDGPU::V_MUL_F32_e64: {
+    switch (static_cast<uint32_t>(Val)) {
+    case 0x3f000000: // 0.5
+      return SIOutMods::DIV2;
+    case 0x40000000: // 2.0
+      return SIOutMods::MUL2;
+    case 0x40800000: // 4.0
+      return SIOutMods::MUL4;
+    default:
+      return SIOutMods::NONE;
+    }
+  }
+  case AMDGPU::V_MUL_F16_e64: {
+    switch (static_cast<uint16_t>(Val)) {
+    case 0x3800: // 0.5
+      return SIOutMods::DIV2;
+    case 0x4000: // 2.0
+      return SIOutMods::MUL2;
+    case 0x4400: // 4.0
+      return SIOutMods::MUL4;
+    default:
+      return SIOutMods::NONE;
+    }
+  }
+  default:
+    llvm_unreachable("invalid mul opcode");
+  }
+}
+
+// FIXME: Does this really not support denormals with f16?
+// FIXME: Does this need to check IEEE mode bit? SNaNs are generally not
+// handled, so will anything other than that break?
+std::pair<const MachineOperand *, int>
+SIFoldOperands::isOMod(const MachineInstr &MI) const {
+  unsigned Op = MI.getOpcode();
+  switch (Op) {
+  case AMDGPU::V_MUL_F32_e64:
+  case AMDGPU::V_MUL_F16_e64: {
+    // If output denormals are enabled, omod is ignored.
+    if ((Op == AMDGPU::V_MUL_F32_e64 && ST->hasFP32Denormals()) ||
+        (Op == AMDGPU::V_MUL_F16_e64 && ST->hasFP16Denormals()))
+      return std::make_pair(nullptr, SIOutMods::NONE);
+
+    const MachineOperand *RegOp = nullptr;
+    const MachineOperand *ImmOp = nullptr;
+    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+    if (Src0->isImm()) {
+      ImmOp = Src0;
+      RegOp = Src1;
+    } else if (Src1->isImm()) {
+      ImmOp = Src1;
+      RegOp = Src0;
+    } else
+      return std::make_pair(nullptr, SIOutMods::NONE);
+
+    int OMod = getOModValue(Op, ImmOp->getImm());
+    if (OMod == SIOutMods::NONE ||
+        TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) ||
+        TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) ||
+        TII->hasModifiersSet(MI, AMDGPU::OpName::omod) ||
+        TII->hasModifiersSet(MI, AMDGPU::OpName::clamp))
+      return std::make_pair(nullptr, SIOutMods::NONE);
+
+    return std::make_pair(RegOp, OMod);
+  }
+  case AMDGPU::V_ADD_F32_e64:
+  case AMDGPU::V_ADD_F16_e64: {
+    // If output denormals are enabled, omod is ignored.
+    if ((Op == AMDGPU::V_ADD_F32_e64 && ST->hasFP32Denormals()) ||
+        (Op == AMDGPU::V_ADD_F16_e64 && ST->hasFP16Denormals()))
+      return std::make_pair(nullptr, SIOutMods::NONE);
+
+    // Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
+    const MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+    const MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+
+    if (Src0->isReg() && Src1->isReg() && Src0->getReg() == Src1->getReg() &&
+        Src0->getSubReg() == Src1->getSubReg() &&
+        !TII->hasModifiersSet(MI, AMDGPU::OpName::src0_modifiers) &&
+        !TII->hasModifiersSet(MI, AMDGPU::OpName::src1_modifiers) &&
+        !TII->hasModifiersSet(MI, AMDGPU::OpName::clamp) &&
+        !TII->hasModifiersSet(MI, AMDGPU::OpName::omod))
+      return std::make_pair(Src0, SIOutMods::MUL2);
+
+    return std::make_pair(nullptr, SIOutMods::NONE);
+  }
+  default:
+    return std::make_pair(nullptr, SIOutMods::NONE);
+  }
+}
+
+// FIXME: Does this need to check IEEE bit on function?
+bool SIFoldOperands::tryFoldOMod(MachineInstr &MI) {
+  const MachineOperand *RegOp;
+  int OMod;
+  std::tie(RegOp, OMod) = isOMod(MI);
+  if (OMod == SIOutMods::NONE || !RegOp->isReg() ||
+      RegOp->getSubReg() != AMDGPU::NoSubRegister ||
+      !hasOneNonDBGUseInst(*MRI, RegOp->getReg()))
+    return false;
+
+  MachineInstr *Def = MRI->getVRegDef(RegOp->getReg());
+  MachineOperand *DefOMod = TII->getNamedOperand(*Def, AMDGPU::OpName::omod);
+  if (!DefOMod || DefOMod->getImm() != SIOutMods::NONE)
+    return false;
+
+  // Clamp is applied after omod. If the source already has clamp set, don't
+  // fold it.
+  if (TII->hasModifiersSet(*Def, AMDGPU::OpName::clamp))
+    return false;
+
+  DEBUG(dbgs() << "Folding omod " << MI << " into " << *Def << '\n');
+
+  DefOMod->setImm(OMod);
+  MRI->replaceRegWith(MI.getOperand(0).getReg(), Def->getOperand(0).getReg());
+  MI.eraseFromParent();
+  return true;
+}
+
 bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(*MF.getFunction()))
     return false;
 
-  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
-
   MRI = &MF.getRegInfo();
-  TII = ST.getInstrInfo();
+  ST = &MF.getSubtarget<SISubtarget>();
+  TII = ST->getInstrInfo();
   TRI = &TII->getRegisterInfo();
 
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  // omod is ignored by hardware if IEEE bit is enabled. omod also does not
+  // correctly handle signed zeros.
+  //
+  // TODO: Check nsz on instructions when fast math flags are preserved to MI
+  // level.
+  bool IsIEEEMode = ST->enableIEEEBit(MF) || !MFI->hasNoSignedZerosFPMath();
+
   for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
        BI != BE; ++BI) {
 
@@ -705,8 +913,13 @@ bool SIFoldOperands::runOnMachineFunction(MachineFunction &MF) {
       Next = std::next(I);
       MachineInstr &MI = *I;
 
-      if (!isSafeToFold(MI))
+      tryFoldInst(TII, &MI);
+
+      if (!TII->isFoldableCopy(MI)) {
+        if (IsIEEEMode || !tryFoldOMod(MI))
+          tryFoldClamp(MI);
         continue;
+      }
 
       MachineOperand &OpToFold = MI.getOperand(1);
       bool FoldingImm = OpToFold.isImm() || OpToFold.isFI();
diff --git a/lib/Target/AMDGPU/SIFrameLowering.cpp b/lib/Target/AMDGPU/SIFrameLowering.cpp
index 0b57155158805399e3473b14bbc40ff599774656..abe6af9a6d3fcbbed51c2547b583b72234aa1c21 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -21,22 +21,24 @@
 using namespace llvm;
 
 
-static ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF,
-                                         const SIRegisterInfo *TRI) {
+static ArrayRef<MCPhysReg> getAllSGPR128(const SISubtarget &ST,
+                                         const MachineFunction &MF) {
   return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
-                      TRI->getMaxNumSGPRs(MF) / 4);
+                      ST.getMaxNumSGPRs(MF) / 4);
 }
 
-static ArrayRef<MCPhysReg> getAllSGPRs(const MachineFunction &MF,
-                                       const SIRegisterInfo *TRI) {
+static ArrayRef<MCPhysReg> getAllSGPRs(const SISubtarget &ST,
+                                       const MachineFunction &MF) {
   return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
-                      TRI->getMaxNumSGPRs(MF));
+                      ST.getMaxNumSGPRs(MF));
 }
 
-void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII,
-                                          const SIRegisterInfo* TRI,
+void SIFrameLowering::emitFlatScratchInit(const SISubtarget &ST,
                                           MachineFunction &MF,
                                           MachineBasicBlock &MBB) const {
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo* TRI = &TII->getRegisterInfo();
+
   // We don't need this if we only have spills since there is no user facing
   // scratch.
 
@@ -59,16 +61,28 @@ void SIFrameLowering::emitFlatScratchInit(const SIInstrInfo *TII,
   MRI.addLiveIn(FlatScratchInitReg);
   MBB.addLiveIn(FlatScratchInitReg);
 
-  // Copy the size in bytes.
-  unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
-  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
-    .addReg(FlatScrInitHi, RegState::Kill);
-
   unsigned FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
+  unsigned FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
 
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
 
+  // Do a 64-bit pointer add.
+  if (ST.flatScratchIsPointer()) {
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), AMDGPU::FLAT_SCR_LO)
+      .addReg(FlatScrInitLo)
+      .addReg(ScratchWaveOffsetReg);
+    BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), AMDGPU::FLAT_SCR_HI)
+      .addReg(FlatScrInitHi)
+      .addImm(0);
+
+    return;
+  }
+
+  // Copy the size in bytes.
+  BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), AMDGPU::FLAT_SCR_LO)
+    .addReg(FlatScrInitHi, RegState::Kill);
+
   // Add wave offset in bytes to private base offset.
   // See comment in AMDKernelCodeT.h for enable_sgpr_flat_scratch_init.
   BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), FlatScrInitLo)
@@ -111,16 +125,15 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
   MachineRegisterInfo &MRI = MF.getRegInfo();
 
   unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
-  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(MF, TRI);
+  ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
   AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
 
-  // Skip the last 2 elements because the last one is reserved for VCC, and
-  // this is the 2nd to last element already.
+  // Skip the last N reserved elements because they should have already been
+  // reserved for VCC etc.
   for (MCPhysReg Reg : AllSGPR128s) {
     // Pick the first unallocated one. Make sure we don't clobber the other
     // reserved input we needed.
     if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
-      //assert(MRI.isAllocatable(Reg));
       MRI.replaceRegWith(ScratchRsrcReg, Reg);
       MFI->setScratchRSrcReg(Reg);
       return Reg;
@@ -143,10 +156,9 @@ unsigned SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
 
   unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
   MachineRegisterInfo &MRI = MF.getRegInfo();
-
   unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
 
-  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(MF, TRI);
+  ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
   if (NumPreloaded > AllSGPRs.size())
     return ScratchWaveOffsetReg;
 
@@ -190,6 +202,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
   // Emit debugger prologue if "amdgpu-debugger-emit-prologue" attribute was
   // specified.
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  auto AMDGPUASI = ST.getAMDGPUAS();
   if (ST.debuggerEmitPrologue())
     emitDebuggerPrologue(MF, MBB);
 
@@ -229,7 +242,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
   // emitted after frame indices are eliminated.
 
   if (MF.getFrameInfo().hasStackObjects() && MFI->hasFlatScratchInit())
-    emitFlatScratchInit(TII, TRI, MF, MBB);
+    emitFlatScratchInit(ST, MF, MBB);
 
   // We need to insert initialization of the scratch resource descriptor.
   unsigned PreloadedScratchWaveOffsetReg = TRI->getPreloadedValue(
@@ -328,7 +341,7 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
 
         PointerType *PtrTy =
           PointerType::get(Type::getInt64Ty(MF.getFunction()->getContext()),
-                           AMDGPUAS::CONSTANT_ADDRESS);
+                           AMDGPUASI.CONSTANT_ADDRESS);
         MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
         auto MMO = MF.getMachineMemOperand(PtrInfo,
                                            MachineMemOperand::MOLoad |
@@ -371,6 +384,24 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
 
 }
 
+// Returns true when isDeadObjectIndex holds for every object index of MFI.
+static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
+  const int End = MFI.getObjectIndexEnd();
+  int Idx = MFI.getObjectIndexBegin();
+  // Advance past dead objects; stopping early means a live one was found.
+  while (Idx != End && MFI.isDeadObjectIndex(Idx))
+    ++Idx;
+  return Idx == End;
+}
+
+// Reports the frame register through FrameReg and returns FI's object offset.
+int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+                                            unsigned &FrameReg) const {
+  const SISubtarget &Subtarget = MF.getSubtarget<SISubtarget>();
+  FrameReg = Subtarget.getRegisterInfo()->getFrameRegister(MF);
+  return MF.getFrameInfo().getObjectOffset(FI);
+}
+
 void SIFrameLowering::processFunctionBeforeFrameFinalized(
   MachineFunction &MF,
   RegScavenger *RS) const {
@@ -379,15 +410,66 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
   if (!MFI.hasStackObjects())
     return;
 
-  bool MayNeedScavengingEmergencySlot = MFI.hasStackObjects();
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIInstrInfo *TII = ST.getInstrInfo();
+  const SIRegisterInfo &TRI = TII->getRegisterInfo();
+  SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+  bool AllSGPRSpilledToVGPRs = false;
+
+  if (TRI.spillSGPRToVGPR() && FuncInfo->hasSpilledSGPRs()) {
+    AllSGPRSpilledToVGPRs = true;
+
+    // Process all SGPR spills before frame offsets are finalized. Ideally SGPRs
+    // are spilled to VGPRs, in which case we can eliminate the stack usage.
+    //
+    // XXX - This operates under the assumption that only other SGPR spills are
+    // users of the frame index. I'm not 100% sure this is correct. The
+    // StackColoring pass has a comment saying a future improvement would be to
+    // merge allocas with spill slots, but for now according to
+    // MachineFrameInfo isSpillSlot can't alias any other object.
+    for (MachineBasicBlock &MBB : MF) {
+      MachineBasicBlock::iterator Next;
+      for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
+        MachineInstr &MI = *I;
+        Next = std::next(I);
+
+        if (TII->isSGPRSpill(MI)) {
+          int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex();
+          if (FuncInfo->allocateSGPRSpillToVGPR(MF, FI)) {
+            bool Spilled = TRI.eliminateSGPRToVGPRSpillFrameIndex(MI, FI, RS);
+            (void)Spilled;
+            assert(Spilled && "failed to spill SGPR to VGPR when allocated");
+          } else
+            AllSGPRSpilledToVGPRs = false;
+        }
+      }
+    }
 
-  assert((RS || !MayNeedScavengingEmergencySlot) &&
-         "RegScavenger required if spilling");
+    FuncInfo->removeSGPRToVGPRFrameIndices(MFI);
+  }
 
-  if (MayNeedScavengingEmergencySlot) {
-    int ScavengeFI = MFI.CreateStackObject(
-      AMDGPU::SGPR_32RegClass.getSize(),
-      AMDGPU::SGPR_32RegClass.getAlignment(), false);
+  // FIXME: The other checks should be redundant with allStackObjectsAreDead,
+  // but currently hasNonSpillStackObjects is set only from source
+  // allocas. Stack temps produced from legalization are not counted currently.
+  if (FuncInfo->hasNonSpillStackObjects() || FuncInfo->hasSpilledVGPRs() ||
+      !AllSGPRSpilledToVGPRs || !allStackObjectsAreDead(MFI)) {
+    assert(RS && "RegScavenger required if spilling");
+
+    // We force this to be at offset 0 so no user object ever has 0 as an
+    // address, so we may use 0 as an invalid pointer value. This is because
+    // LLVM assumes 0 is an invalid pointer in address space 0. Because alloca
+    // is required to be address space 0, we are forced to accept this for
+    // now. Ideally we could have the stack in another address space with 0 as a
+    // valid pointer, and -1 as the null value.
+    //
+    // This will also waste additional space when user stack objects require > 4
+    // byte alignment.
+    //
+    // The main cost here is losing the offset for addressing modes. However
+    // this also ensures we shouldn't need a register for the offset when
+    // emergency scavenging.
+    int ScavengeFI = MFI.CreateFixedObject(
+      AMDGPU::SGPR_32RegClass.getSize(), 0, false);
     RS->addScavengingFrameIndex(ScavengeFI);
   }
 }
diff --git a/lib/Target/AMDGPU/SIFrameLowering.h b/lib/Target/AMDGPU/SIFrameLowering.h
index 7657b4e03864db8ba62ec7042cacb515b0170c25..1bfc08093da224d761293e2ebcb520053ed00035 100644
--- a/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/lib/Target/AMDGPU/SIFrameLowering.h
@@ -30,14 +30,15 @@ public:
                     MachineBasicBlock &MBB) const override;
   void emitEpilogue(MachineFunction &MF,
                     MachineBasicBlock &MBB) const override;
+  int getFrameIndexReference(const MachineFunction &MF, int FI,
+                             unsigned &FrameReg) const override;
 
   void processFunctionBeforeFrameFinalized(
     MachineFunction &MF,
     RegScavenger *RS = nullptr) const override;
 
 private:
-  void emitFlatScratchInit(const SIInstrInfo *TII,
-                           const SIRegisterInfo* TRI,
+  void emitFlatScratchInit(const SISubtarget &ST,
                            MachineFunction &MF,
                            MachineBasicBlock &MBB) const;
 
diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp
index 89d3a72fae4cdf873b05490ad52aa8efd8c783e8..eda825d8c6eee356eda7caf88a6e24b1362c037e 100644
--- a/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -19,6 +19,7 @@
 
 #include "AMDGPU.h"
 #include "AMDGPUIntrinsicInfo.h"
+#include "AMDGPUTargetMachine.h"
 #include "AMDGPUSubtarget.h"
 #include "SIDefines.h"
 #include "SIISelLowering.h"
@@ -60,6 +61,7 @@
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/CodeGen.h"
@@ -68,7 +70,6 @@
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetCallingConv.h"
-#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 #include <cassert>
@@ -126,6 +127,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     addRegisterClass(MVT::f16, &AMDGPU::SReg_32_XM0RegClass);
   }
 
+  if (Subtarget->hasVOP3PInsts()) {
+    addRegisterClass(MVT::v2i16, &AMDGPU::SReg_32_XM0RegClass);
+    addRegisterClass(MVT::v2f16, &AMDGPU::SReg_32_XM0RegClass);
+  }
+
   computeRegisterProperties(STI.getRegisterInfo());
 
   // We need to custom lower vector stores from local memory
@@ -183,9 +189,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i16, Custom);
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::Other, Custom);
 
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
+  setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v2f16, Custom);
+
   setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
+
+  setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2i16, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::v2f16, Custom);
 
@@ -201,7 +212,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
 
   // We only support LOAD/STORE and vector manipulation ops for vectors
   // with > 4 elements.
-  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32, MVT::v2i64, MVT::v2f64}) {
+  for (MVT VT : {MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
+        MVT::v2i64, MVT::v2f64}) {
     for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
       switch (Op) {
       case ISD::LOAD:
@@ -276,6 +288,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   // On SI this is s_memtime and s_memrealtime on VI.
   setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
   setOperationAction(ISD::TRAP, MVT::Other, Legal);
+  setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
 
   setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
   setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
@@ -356,6 +369,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::FP_TO_UINT, MVT::f16, Promote);
     setOperationAction(ISD::SINT_TO_FP, MVT::f16, Promote);
     setOperationAction(ISD::UINT_TO_FP, MVT::f16, Promote);
+    setOperationAction(ISD::FROUND, MVT::f16, Custom);
 
     // F16 - VOP2 Actions.
     setOperationAction(ISD::BR_CC, MVT::f16, Expand);
@@ -370,6 +384,85 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::FMAD, MVT::f16, Legal);
   }
 
+  if (Subtarget->hasVOP3PInsts()) {
+    for (MVT VT : {MVT::v2i16, MVT::v2f16}) {
+      for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
+        switch (Op) {
+        case ISD::LOAD:
+        case ISD::STORE:
+        case ISD::BUILD_VECTOR:
+        case ISD::BITCAST:
+        case ISD::EXTRACT_VECTOR_ELT:
+        case ISD::INSERT_VECTOR_ELT:
+        case ISD::INSERT_SUBVECTOR:
+        case ISD::EXTRACT_SUBVECTOR:
+        case ISD::SCALAR_TO_VECTOR:
+          break;
+        case ISD::CONCAT_VECTORS:
+          setOperationAction(Op, VT, Custom);
+          break;
+        default:
+          setOperationAction(Op, VT, Expand);
+          break;
+        }
+      }
+    }
+
+    // XXX - Do these do anything? Vector constants turn into build_vector.
+    setOperationAction(ISD::Constant, MVT::v2i16, Legal);
+    setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
+
+    setOperationAction(ISD::STORE, MVT::v2i16, Promote);
+    AddPromotedToType(ISD::STORE, MVT::v2i16, MVT::i32);
+    setOperationAction(ISD::STORE, MVT::v2f16, Promote);
+    AddPromotedToType(ISD::STORE, MVT::v2f16, MVT::i32);
+
+    setOperationAction(ISD::LOAD, MVT::v2i16, Promote);
+    AddPromotedToType(ISD::LOAD, MVT::v2i16, MVT::i32);
+    setOperationAction(ISD::LOAD, MVT::v2f16, Promote);
+    AddPromotedToType(ISD::LOAD, MVT::v2f16, MVT::i32);
+
+    setOperationAction(ISD::AND, MVT::v2i16, Promote);
+    AddPromotedToType(ISD::AND, MVT::v2i16, MVT::i32);
+    setOperationAction(ISD::OR, MVT::v2i16, Promote);
+    AddPromotedToType(ISD::OR, MVT::v2i16, MVT::i32);
+    setOperationAction(ISD::XOR, MVT::v2i16, Promote);
+    AddPromotedToType(ISD::XOR, MVT::v2i16, MVT::i32);
+    setOperationAction(ISD::SELECT, MVT::v2i16, Promote);
+    AddPromotedToType(ISD::SELECT, MVT::v2i16, MVT::i32);
+    setOperationAction(ISD::SELECT, MVT::v2f16, Promote);
+    AddPromotedToType(ISD::SELECT, MVT::v2f16, MVT::i32);
+
+    setOperationAction(ISD::ADD, MVT::v2i16, Legal);
+    setOperationAction(ISD::SUB, MVT::v2i16, Legal);
+    setOperationAction(ISD::MUL, MVT::v2i16, Legal);
+    setOperationAction(ISD::SHL, MVT::v2i16, Legal);
+    setOperationAction(ISD::SRL, MVT::v2i16, Legal);
+    setOperationAction(ISD::SRA, MVT::v2i16, Legal);
+    setOperationAction(ISD::SMIN, MVT::v2i16, Legal);
+    setOperationAction(ISD::UMIN, MVT::v2i16, Legal);
+    setOperationAction(ISD::SMAX, MVT::v2i16, Legal);
+    setOperationAction(ISD::UMAX, MVT::v2i16, Legal);
+
+    setOperationAction(ISD::FADD, MVT::v2f16, Legal);
+    setOperationAction(ISD::FNEG, MVT::v2f16, Legal);
+    setOperationAction(ISD::FMUL, MVT::v2f16, Legal);
+    setOperationAction(ISD::FMA, MVT::v2f16, Legal);
+    setOperationAction(ISD::FMINNUM, MVT::v2f16, Legal);
+    setOperationAction(ISD::FMAXNUM, MVT::v2f16, Legal);
+
+    // This isn't really legal, but this avoids the legalizer unrolling it (and
+    // allows matching fneg (fabs x) patterns)
+    setOperationAction(ISD::FABS, MVT::v2f16, Legal);
+
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2i16, Custom);
+    setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+
+    setOperationAction(ISD::ZERO_EXTEND, MVT::v2i32, Expand);
+    setOperationAction(ISD::SIGN_EXTEND, MVT::v2i32, Expand);
+    setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Expand);
+  }
+
   setTargetDAGCombine(ISD::FADD);
   setTargetDAGCombine(ISD::FSUB);
   setTargetDAGCombine(ISD::FMINNUM);
@@ -385,6 +478,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
   setTargetDAGCombine(ISD::SINT_TO_FP);
   setTargetDAGCombine(ISD::UINT_TO_FP);
   setTargetDAGCombine(ISD::FCANONICALIZE);
+  setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
+  setTargetDAGCombine(ISD::ZERO_EXTEND);
 
   // All memory operations. Some folding on the pointer operand is done to help
   // matching the constant offsets in the addressing modes.
@@ -417,30 +512,49 @@ const SISubtarget *SITargetLowering::getSubtarget() const {
 // TargetLowering queries
 //===----------------------------------------------------------------------===//
 
+bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
+                                          EVT) const {
+  // SI has some legal vector types, but no legal vector operations. Say no
+  // shuffles are legal in order to prefer scalarizing some vector operations.
+  return false;
+}
+
 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &CI,
                                           unsigned IntrID) const {
   switch (IntrID) {
   case Intrinsic::amdgcn_atomic_inc:
-  case Intrinsic::amdgcn_atomic_dec:
+  case Intrinsic::amdgcn_atomic_dec: {
     Info.opc = ISD::INTRINSIC_W_CHAIN;
     Info.memVT = MVT::getVT(CI.getType());
     Info.ptrVal = CI.getOperand(0);
     Info.align = 0;
-    Info.vol = false;
+
+    const ConstantInt *Vol = dyn_cast<ConstantInt>(CI.getOperand(4));
+    Info.vol = !Vol || !Vol->isNullValue();
     Info.readMem = true;
     Info.writeMem = true;
     return true;
+  }
   default:
     return false;
   }
 }
 
-bool SITargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &,
-                                          EVT) const {
-  // SI has some legal vector types, but no legal vector operations. Say no
-  // shuffles are legal in order to prefer scalarizing some vector operations.
-  return false;
+bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II,
+                                            SmallVectorImpl<Value*> &Ops,
+                                            Type *&AccessTy) const {
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::amdgcn_atomic_inc:
+  case Intrinsic::amdgcn_atomic_dec: {
+    Value *Ptr = II->getArgOperand(0);
+    AccessTy = II->getType();
+    Ops.push_back(Ptr);
+    return true;
+  }
+  default:
+    return false;
+  }
 }
 
 bool SITargetLowering::isLegalFlatAddressingMode(const AddrMode &AM) const {
@@ -491,8 +605,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
   if (AM.BaseGV)
     return false;
 
-  switch (AS) {
-  case AMDGPUAS::GLOBAL_ADDRESS:
+  if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
     if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
       // Assume the we will use FLAT for all global memory accesses
       // on VI.
@@ -507,8 +620,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
     }
 
     return isLegalMUBUFAddressingMode(AM);
-
-  case AMDGPUAS::CONSTANT_ADDRESS:
+  } else if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
     // If the offset isn't a multiple of 4, it probably isn't going to be
     // correctly aligned.
     // FIXME: Can we get the real alignment here?
@@ -531,7 +643,7 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
       // in 8-bits, it can use a smaller encoding.
       if (!isUInt<32>(AM.BaseOffs / 4))
         return false;
-    } else if (Subtarget->getGeneration() == SISubtarget::VOLCANIC_ISLANDS) {
+    } else if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
       // On VI, these use the SMEM format and the offset is 20-bit in bytes.
       if (!isUInt<20>(AM.BaseOffs))
         return false;
@@ -546,11 +658,10 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
 
     return false;
 
-  case AMDGPUAS::PRIVATE_ADDRESS:
+  } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
     return isLegalMUBUFAddressingMode(AM);
-
-  case AMDGPUAS::LOCAL_ADDRESS:
-  case AMDGPUAS::REGION_ADDRESS:
+  } else if (AS == AMDGPUASI.LOCAL_ADDRESS ||
+             AS == AMDGPUASI.REGION_ADDRESS) {
     // Basic, single offset DS instructions allow a 16-bit unsigned immediate
     // field.
     // XXX - If doing a 4-byte aligned 8-byte type access, we effectively have
@@ -565,17 +676,15 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
       return true;
 
     return false;
-
-  case AMDGPUAS::FLAT_ADDRESS:
-  case AMDGPUAS::UNKNOWN_ADDRESS_SPACE:
+  } else if (AS == AMDGPUASI.FLAT_ADDRESS ||
+             AS == AMDGPUASI.UNKNOWN_ADDRESS_SPACE) {
     // For an unknown address space, this usually means that this is for some
     // reason being used for pure arithmetic, and not based on some addressing
     // computation. We don't have instructions that compute pointers with any
     // addressing modes, so treat them as having no offset like flat
     // instructions.
     return isLegalFlatAddressingMode(AM);
-
-  default:
+  } else {
     llvm_unreachable("unhandled address space");
   }
 }
@@ -596,8 +705,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
     return false;
   }
 
-  if (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
-      AddrSpace == AMDGPUAS::REGION_ADDRESS) {
+  if (AddrSpace == AMDGPUASI.LOCAL_ADDRESS ||
+      AddrSpace == AMDGPUASI.REGION_ADDRESS) {
     // ds_read/write_b64 require 8-byte alignment, but we can do a 4 byte
     // aligned, 8 byte access in a single operation using ds_read2/write2_b32
     // with adjacent offsets.
@@ -612,8 +721,8 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
   // will access scratch.  If we had access to the IR function, then we
   // could determine if any private memory was used in the function.
   if (!Subtarget->hasUnalignedScratchAccess() &&
-      (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
-       AddrSpace == AMDGPUAS::FLAT_ADDRESS)) {
+      (AddrSpace == AMDGPUASI.PRIVATE_ADDRESS ||
+       AddrSpace == AMDGPUASI.FLAT_ADDRESS)) {
     return false;
   }
 
@@ -621,7 +730,7 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
     // If we have an uniform constant load, it still requires using a slow
     // buffer instruction if unaligned.
     if (IsFast) {
-      *IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS) ?
+      *IsFast = (AddrSpace == AMDGPUASI.CONSTANT_ADDRESS) ?
         (Align % 4 == 0) : true;
     }
 
@@ -661,15 +770,16 @@ EVT SITargetLowering::getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
   return MVT::Other;
 }
 
-static bool isFlatGlobalAddrSpace(unsigned AS) {
-  return AS == AMDGPUAS::GLOBAL_ADDRESS ||
-         AS == AMDGPUAS::FLAT_ADDRESS ||
-         AS == AMDGPUAS::CONSTANT_ADDRESS;
+static bool isFlatGlobalAddrSpace(unsigned AS, AMDGPUAS AMDGPUASI) {
+  return AS == AMDGPUASI.GLOBAL_ADDRESS ||
+         AS == AMDGPUASI.FLAT_ADDRESS ||
+         AS == AMDGPUASI.CONSTANT_ADDRESS;
 }
 
 bool SITargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
                                            unsigned DestAS) const {
-  return isFlatGlobalAddrSpace(SrcAS) && isFlatGlobalAddrSpace(DestAS);
+  return isFlatGlobalAddrSpace(SrcAS, AMDGPUASI) &&
+         isFlatGlobalAddrSpace(DestAS, AMDGPUASI);
 }
 
 bool SITargetLowering::isMemOpHasNoClobberedMemOperand(const SDNode *N) const {
@@ -683,7 +793,7 @@ bool SITargetLowering::isCheapAddrSpaceCast(unsigned SrcAS,
                                             unsigned DestAS) const {
   // Flat -> private/local is a simple truncate.
   // Flat -> global is no-op
-  if (SrcAS == AMDGPUAS::FLAT_ADDRESS)
+  if (SrcAS == AMDGPUASI.FLAT_ADDRESS)
     return true;
 
   return isNoopAddrSpaceCast(SrcAS, DestAS);
@@ -735,40 +845,28 @@ bool SITargetLowering::isTypeDesirableForOp(unsigned Op, EVT VT) const {
   return TargetLowering::isTypeDesirableForOp(Op, VT);
 }
 
-SDValue SITargetLowering::LowerParameterPtr(SelectionDAG &DAG,
-                                            const SDLoc &SL, SDValue Chain,
-                                            unsigned Offset) const {
+SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
+                                                   const SDLoc &SL,
+                                                   SDValue Chain,
+                                                   uint64_t Offset) const {
   const DataLayout &DL = DAG.getDataLayout();
   MachineFunction &MF = DAG.getMachineFunction();
   const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
-  unsigned InputPtrReg = TRI->getPreloadedValue(MF, SIRegisterInfo::KERNARG_SEGMENT_PTR);
+  unsigned InputPtrReg = TRI->getPreloadedValue(MF,
+                                                SIRegisterInfo::KERNARG_SEGMENT_PTR);
 
   MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
-  MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
+  MVT PtrVT = getPointerTy(DL, AMDGPUASI.CONSTANT_ADDRESS);
   SDValue BasePtr = DAG.getCopyFromReg(Chain, SL,
                                        MRI.getLiveInVirtReg(InputPtrReg), PtrVT);
   return DAG.getNode(ISD::ADD, SL, PtrVT, BasePtr,
                      DAG.getConstant(Offset, SL, PtrVT));
 }
 
-SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
-                                         const SDLoc &SL, SDValue Chain,
-                                         unsigned Offset, bool Signed,
+SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
+                                         const SDLoc &SL, SDValue Val,
+                                         bool Signed,
                                          const ISD::InputArg *Arg) const {
-  const DataLayout &DL = DAG.getDataLayout();
-  Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
-  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
-  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
-
-  unsigned Align = DL.getABITypeAlignment(Ty);
-
-  SDValue Ptr = LowerParameterPtr(DAG, SL, Chain, Offset);
-  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
-                             MachineMemOperand::MONonTemporal |
-                             MachineMemOperand::MODereferenceable |
-                             MachineMemOperand::MOInvariant);
-
-  SDValue Val = Load;
   if (Arg && (Arg->Flags.isSExt() || Arg->Flags.isZExt()) &&
       VT.bitsLT(MemVT)) {
     unsigned Opc = Arg->Flags.isZExt() ? ISD::AssertZext : ISD::AssertSext;
@@ -782,371 +880,434 @@ SDValue SITargetLowering::LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
   else
     Val = DAG.getZExtOrTrunc(Val, SL, VT);
 
-  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
+  return Val;
 }
 
-SDValue SITargetLowering::LowerFormalArguments(
-    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
-    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
-    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
-  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
-
-  MachineFunction &MF = DAG.getMachineFunction();
-  FunctionType *FType = MF.getFunction()->getFunctionType();
-  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
-  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+SDValue SITargetLowering::lowerKernargMemParameter(
+  SelectionDAG &DAG, EVT VT, EVT MemVT,
+  const SDLoc &SL, SDValue Chain,
+  uint64_t Offset, bool Signed,
+  const ISD::InputArg *Arg) const {
+  const DataLayout &DL = DAG.getDataLayout();
+  Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
+  PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
+  MachinePointerInfo PtrInfo(UndefValue::get(PtrTy));
 
-  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
-    const Function *Fn = MF.getFunction();
-    DiagnosticInfoUnsupported NoGraphicsHSA(
-        *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
-    DAG.getContext()->diagnose(NoGraphicsHSA);
-    return DAG.getEntryNode();
-  }
+  unsigned Align = DL.getABITypeAlignment(Ty);
 
-  // Create stack objects that are used for emitting debugger prologue if
-  // "amdgpu-debugger-emit-prologue" attribute was specified.
-  if (ST.debuggerEmitPrologue())
-    createDebuggerPrologueStackObjects(MF);
+  SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
+  SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
+                             MachineMemOperand::MONonTemporal |
+                             MachineMemOperand::MODereferenceable |
+                             MachineMemOperand::MOInvariant);
 
-  SmallVector<ISD::InputArg, 16> Splits;
-  BitVector Skipped(Ins.size());
+  SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
+  return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
+}
 
-  for (unsigned i = 0, e = Ins.size(), PSInputNum = 0; i != e; ++i) {
-    const ISD::InputArg &Arg = Ins[i];
+static void processShaderInputArgs(SmallVectorImpl<ISD::InputArg> &Splits,
+                                   CallingConv::ID CallConv,
+                                   ArrayRef<ISD::InputArg> Ins,
+                                   BitVector &Skipped,
+                                   FunctionType *FType,
+                                   SIMachineFunctionInfo *Info) {
+  for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
+    const ISD::InputArg &Arg = Ins[I];
 
-    // First check if it's a PS input addr
+    // First check if it's a PS input addr.
     if (CallConv == CallingConv::AMDGPU_PS && !Arg.Flags.isInReg() &&
         !Arg.Flags.isByVal() && PSInputNum <= 15) {
 
       if (!Arg.Used && !Info->isPSInputAllocated(PSInputNum)) {
-        // We can safely skip PS inputs
-        Skipped.set(i);
+        // We can safely skip PS inputs.
+        Skipped.set(I);
         ++PSInputNum;
         continue;
       }
 
       Info->markPSInputAllocated(PSInputNum);
       if (Arg.Used)
-        Info->PSInputEna |= 1 << PSInputNum;
+        Info->markPSInputEnabled(PSInputNum);
 
       ++PSInputNum;
     }
 
-    if (AMDGPU::isShader(CallConv)) {
-      // Second split vertices into their elements
-      if (Arg.VT.isVector()) {
-        ISD::InputArg NewArg = Arg;
-        NewArg.Flags.setSplit();
-        NewArg.VT = Arg.VT.getVectorElementType();
-
-        // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
-        // three or five element vertex only needs three or five registers,
-        // NOT four or eight.
-        Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
-        unsigned NumElements = ParamType->getVectorNumElements();
-
-        for (unsigned j = 0; j != NumElements; ++j) {
-          Splits.push_back(NewArg);
-          NewArg.PartOffset += NewArg.VT.getStoreSize();
-        }
-      } else {
-        Splits.push_back(Arg);
+    // Second split vertices into their elements.
+    if (Arg.VT.isVector()) {
+      ISD::InputArg NewArg = Arg;
+      NewArg.Flags.setSplit();
+      NewArg.VT = Arg.VT.getVectorElementType();
+
+      // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
+      // three or five element vertex only needs three or five registers,
+      // NOT four or eight.
+      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+      unsigned NumElements = ParamType->getVectorNumElements();
+
+      for (unsigned J = 0; J != NumElements; ++J) {
+        Splits.push_back(NewArg);
+        NewArg.PartOffset += NewArg.VT.getStoreSize();
       }
+    } else {
+      Splits.push_back(Arg);
     }
   }
+}
 
-  SmallVector<CCValAssign, 16> ArgLocs;
-  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
-                 *DAG.getContext());
+// Allocate special inputs passed in VGPRs.
+static void allocateSpecialInputVGPRs(CCState &CCInfo,
+                                      MachineFunction &MF,
+                                      const SIRegisterInfo &TRI,
+                                      SIMachineFunctionInfo &Info) {
+  if (Info.hasWorkItemIDX()) {
+    unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
+    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+    CCInfo.AllocateReg(Reg);
+  }
 
-  // At least one interpolation mode must be enabled or else the GPU will hang.
-  //
-  // Check PSInputAddr instead of PSInputEna. The idea is that if the user set
-  // PSInputAddr, the user wants to enable some bits after the compilation
-  // based on run-time states. Since we can't know what the final PSInputEna
-  // will look like, so we shouldn't do anything here and the user should take
-  // responsibility for the correct programming.
-  //
-  // Otherwise, the following restrictions apply:
-  // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
-  // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
-  //   enabled too.
-  if (CallConv == CallingConv::AMDGPU_PS &&
-      ((Info->getPSInputAddr() & 0x7F) == 0 ||
-       ((Info->getPSInputAddr() & 0xF) == 0 && Info->isPSInputAllocated(11)))) {
-    CCInfo.AllocateReg(AMDGPU::VGPR0);
-    CCInfo.AllocateReg(AMDGPU::VGPR1);
-    Info->markPSInputAllocated(0);
-    Info->PSInputEna |= 1;
-  }
-
-  if (!AMDGPU::isShader(CallConv)) {
-    assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
-  } else {
-    assert(!Info->hasDispatchPtr() &&
-           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
-           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
-           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
-           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
-           !Info->hasWorkItemIDZ());
+  if (Info.hasWorkItemIDY()) {
+    unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
+    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+    CCInfo.AllocateReg(Reg);
   }
 
-  if (Info->hasPrivateMemoryInputPtr()) {
-    unsigned PrivateMemoryPtrReg = Info->addPrivateMemoryPtr(*TRI);
-    MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SReg_64RegClass);
+  if (Info.hasWorkItemIDZ()) {
+    unsigned Reg = TRI.getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
+    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
+    CCInfo.AllocateReg(Reg);
+  }
+}
+
+// Allocate special inputs passed in user SGPRs.
+static void allocateHSAUserSGPRs(CCState &CCInfo,
+                                 MachineFunction &MF,
+                                 const SIRegisterInfo &TRI,
+                                 SIMachineFunctionInfo &Info) {
+  if (Info.hasPrivateMemoryInputPtr()) {
+    unsigned PrivateMemoryPtrReg = Info.addPrivateMemoryPtr(TRI);
+    MF.addLiveIn(PrivateMemoryPtrReg, &AMDGPU::SGPR_64RegClass);
     CCInfo.AllocateReg(PrivateMemoryPtrReg);
   }
 
   // FIXME: How should these inputs interact with inreg / custom SGPR inputs?
-  if (Info->hasPrivateSegmentBuffer()) {
-    unsigned PrivateSegmentBufferReg = Info->addPrivateSegmentBuffer(*TRI);
-    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SReg_128RegClass);
+  if (Info.hasPrivateSegmentBuffer()) {
+    unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
+    MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
     CCInfo.AllocateReg(PrivateSegmentBufferReg);
   }
 
-  if (Info->hasDispatchPtr()) {
-    unsigned DispatchPtrReg = Info->addDispatchPtr(*TRI);
+  if (Info.hasDispatchPtr()) {
+    unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
     MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
     CCInfo.AllocateReg(DispatchPtrReg);
   }
 
-  if (Info->hasQueuePtr()) {
-    unsigned QueuePtrReg = Info->addQueuePtr(*TRI);
+  if (Info.hasQueuePtr()) {
+    unsigned QueuePtrReg = Info.addQueuePtr(TRI);
     MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
     CCInfo.AllocateReg(QueuePtrReg);
   }
 
-  if (Info->hasKernargSegmentPtr()) {
-    unsigned InputPtrReg = Info->addKernargSegmentPtr(*TRI);
+  if (Info.hasKernargSegmentPtr()) {
+    unsigned InputPtrReg = Info.addKernargSegmentPtr(TRI);
     MF.addLiveIn(InputPtrReg, &AMDGPU::SGPR_64RegClass);
     CCInfo.AllocateReg(InputPtrReg);
   }
 
-  if (Info->hasDispatchID()) {
-    unsigned DispatchIDReg = Info->addDispatchID(*TRI);
+  if (Info.hasDispatchID()) {
+    unsigned DispatchIDReg = Info.addDispatchID(TRI);
     MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
     CCInfo.AllocateReg(DispatchIDReg);
   }
 
-  if (Info->hasFlatScratchInit()) {
-    unsigned FlatScratchInitReg = Info->addFlatScratchInit(*TRI);
+  if (Info.hasFlatScratchInit()) {
+    unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
     MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
     CCInfo.AllocateReg(FlatScratchInitReg);
   }
 
-  if (!AMDGPU::isShader(CallConv))
-    analyzeFormalArgumentsCompute(CCInfo, Ins);
-  else
-    AnalyzeFormalArguments(CCInfo, Splits);
-
-  SmallVector<SDValue, 16> Chains;
-
-  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
-    const ISD::InputArg &Arg = Ins[i];
-    if (Skipped[i]) {
-      InVals.push_back(DAG.getUNDEF(Arg.VT));
-      continue;
-    }
-
-    CCValAssign &VA = ArgLocs[ArgIdx++];
-    MVT VT = VA.getLocVT();
-
-    if (VA.isMemLoc()) {
-      VT = Ins[i].VT;
-      EVT MemVT = VA.getLocVT();
-      const unsigned Offset = Subtarget->getExplicitKernelArgOffset(MF) +
-                              VA.getLocMemOffset();
-      // The first 36 bytes of the input buffer contains information about
-      // thread group and global sizes.
-      SDValue Arg = LowerParameter(DAG, VT, MemVT,  DL, Chain,
-                                   Offset, Ins[i].Flags.isSExt(),
-                                   &Ins[i]);
-      Chains.push_back(Arg.getValue(1));
-
-      auto *ParamTy =
-        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
-      if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
-          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
-        // On SI local pointers are just offsets into LDS, so they are always
-        // less than 16-bits.  On CI and newer they could potentially be
-        // real pointers, so we can't guarantee their size.
-        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
-                          DAG.getValueType(MVT::i16));
-      }
-
-      InVals.push_back(Arg);
-      Info->setABIArgOffset(Offset + MemVT.getStoreSize());
-      continue;
-    }
-    assert(VA.isRegLoc() && "Parameter must be in a register!");
-
-    unsigned Reg = VA.getLocReg();
-
-    if (VT == MVT::i64) {
-      // For now assume it is a pointer
-      Reg = TRI->getMatchingSuperReg(Reg, AMDGPU::sub0,
-                                     &AMDGPU::SGPR_64RegClass);
-      Reg = MF.addLiveIn(Reg, &AMDGPU::SGPR_64RegClass);
-      SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
-      InVals.push_back(Copy);
-      continue;
-    }
-
-    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
-
-    Reg = MF.addLiveIn(Reg, RC);
-    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
-
-    if (Arg.VT.isVector()) {
-      // Build a vector from the registers
-      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
-      unsigned NumElements = ParamType->getVectorNumElements();
-
-      SmallVector<SDValue, 4> Regs;
-      Regs.push_back(Val);
-      for (unsigned j = 1; j != NumElements; ++j) {
-        Reg = ArgLocs[ArgIdx++].getLocReg();
-        Reg = MF.addLiveIn(Reg, RC);
-
-        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
-        Regs.push_back(Copy);
-      }
-
-      // Fill up the missing vector elements
-      NumElements = Arg.VT.getVectorNumElements() - NumElements;
-      Regs.append(NumElements, DAG.getUNDEF(VT));
-
-      InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
-      continue;
-    }
-
-    InVals.push_back(Val);
-  }
-
   // TODO: Add GridWorkGroupCount user SGPRs when used. For now with HSA we read
   // these from the dispatch pointer.
+}
 
-  // Start adding system SGPRs.
-  if (Info->hasWorkGroupIDX()) {
-    unsigned Reg = Info->addWorkGroupIDX();
+// Allocate special input registers that are initialized per-wave.
+static void allocateSystemSGPRs(CCState &CCInfo,
+                                MachineFunction &MF,
+                                SIMachineFunctionInfo &Info,
+                                bool IsShader) {
+  if (Info.hasWorkGroupIDX()) {
+    unsigned Reg = Info.addWorkGroupIDX();
     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
     CCInfo.AllocateReg(Reg);
   }
 
-  if (Info->hasWorkGroupIDY()) {
-    unsigned Reg = Info->addWorkGroupIDY();
+  if (Info.hasWorkGroupIDY()) {
+    unsigned Reg = Info.addWorkGroupIDY();
     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
     CCInfo.AllocateReg(Reg);
   }
 
-  if (Info->hasWorkGroupIDZ()) {
-    unsigned Reg = Info->addWorkGroupIDZ();
+  if (Info.hasWorkGroupIDZ()) {
+    unsigned Reg = Info.addWorkGroupIDZ();
     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
     CCInfo.AllocateReg(Reg);
   }
 
-  if (Info->hasWorkGroupInfo()) {
-    unsigned Reg = Info->addWorkGroupInfo();
+  if (Info.hasWorkGroupInfo()) {
+    unsigned Reg = Info.addWorkGroupInfo();
     MF.addLiveIn(Reg, &AMDGPU::SReg_32_XM0RegClass);
     CCInfo.AllocateReg(Reg);
   }
 
-  if (Info->hasPrivateSegmentWaveByteOffset()) {
+  if (Info.hasPrivateSegmentWaveByteOffset()) {
     // Scratch wave offset passed in system SGPR.
     unsigned PrivateSegmentWaveByteOffsetReg;
 
-    if (AMDGPU::isShader(CallConv)) {
+    if (IsShader) {
       PrivateSegmentWaveByteOffsetReg = findFirstFreeSGPR(CCInfo);
-      Info->setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
+      Info.setPrivateSegmentWaveByteOffset(PrivateSegmentWaveByteOffsetReg);
     } else
-      PrivateSegmentWaveByteOffsetReg = Info->addPrivateSegmentWaveByteOffset();
+      PrivateSegmentWaveByteOffsetReg = Info.addPrivateSegmentWaveByteOffset();
 
     MF.addLiveIn(PrivateSegmentWaveByteOffsetReg, &AMDGPU::SGPR_32RegClass);
     CCInfo.AllocateReg(PrivateSegmentWaveByteOffsetReg);
   }
+}
 
+static void reservePrivateMemoryRegs(const TargetMachine &TM,
+                                     MachineFunction &MF,
+                                     const SIRegisterInfo &TRI,
+                                     SIMachineFunctionInfo &Info) {
   // Now that we've figured out where the scratch register inputs are, see if
   // should reserve the arguments and use them directly.
   bool HasStackObjects = MF.getFrameInfo().hasStackObjects();
+
   // Record that we know we have non-spill stack objects so we don't need to
   // check all stack objects later.
   if (HasStackObjects)
-    Info->setHasNonSpillStackObjects(true);
+    Info.setHasNonSpillStackObjects(true);
 
   // Everything live out of a block is spilled with fast regalloc, so it's
   // almost certain that spilling will be required.
-  if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
+  if (TM.getOptLevel() == CodeGenOpt::None)
     HasStackObjects = true;
 
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   if (ST.isAmdCodeObjectV2(MF)) {
     if (HasStackObjects) {
       // If we have stack objects, we unquestionably need the private buffer
       // resource. For the Code Object V2 ABI, this will be the first 4 user
       // SGPR inputs. We can reserve those and use them directly.
 
-      unsigned PrivateSegmentBufferReg = TRI->getPreloadedValue(
+      unsigned PrivateSegmentBufferReg = TRI.getPreloadedValue(
         MF, SIRegisterInfo::PRIVATE_SEGMENT_BUFFER);
-      Info->setScratchRSrcReg(PrivateSegmentBufferReg);
+      Info.setScratchRSrcReg(PrivateSegmentBufferReg);
 
-      unsigned PrivateSegmentWaveByteOffsetReg = TRI->getPreloadedValue(
+      unsigned PrivateSegmentWaveByteOffsetReg = TRI.getPreloadedValue(
         MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-      Info->setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
+      Info.setScratchWaveOffsetReg(PrivateSegmentWaveByteOffsetReg);
     } else {
       unsigned ReservedBufferReg
-        = TRI->reservedPrivateSegmentBufferReg(MF);
+        = TRI.reservedPrivateSegmentBufferReg(MF);
       unsigned ReservedOffsetReg
-        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
+        = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
 
       // We tentatively reserve the last registers (skipping the last two
       // which may contain VCC). After register allocation, we'll replace
       // these with the ones immediately after those which were really
       // allocated. In the prologue copies will be inserted from the argument
       // to these reserved registers.
-      Info->setScratchRSrcReg(ReservedBufferReg);
-      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+      Info.setScratchRSrcReg(ReservedBufferReg);
+      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
     }
   } else {
-    unsigned ReservedBufferReg = TRI->reservedPrivateSegmentBufferReg(MF);
+    unsigned ReservedBufferReg = TRI.reservedPrivateSegmentBufferReg(MF);
 
     // Without HSA, relocations are used for the scratch pointer and the
     // buffer resource setup is always inserted in the prologue. Scratch wave
     // offset is still in an input SGPR.
-    Info->setScratchRSrcReg(ReservedBufferReg);
+    Info.setScratchRSrcReg(ReservedBufferReg);
 
     if (HasStackObjects) {
-      unsigned ScratchWaveOffsetReg = TRI->getPreloadedValue(
+      unsigned ScratchWaveOffsetReg = TRI.getPreloadedValue(
         MF, SIRegisterInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-      Info->setScratchWaveOffsetReg(ScratchWaveOffsetReg);
+      Info.setScratchWaveOffsetReg(ScratchWaveOffsetReg);
     } else {
       unsigned ReservedOffsetReg
-        = TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
-      Info->setScratchWaveOffsetReg(ReservedOffsetReg);
+        = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
+      Info.setScratchWaveOffsetReg(ReservedOffsetReg);
     }
   }
+}
 
-  if (Info->hasWorkItemIDX()) {
-    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_X);
-    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
-    CCInfo.AllocateReg(Reg);
+SDValue SITargetLowering::LowerFormalArguments(
+    SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+    const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+  const SIRegisterInfo *TRI = getSubtarget()->getRegisterInfo();
+
+  MachineFunction &MF = DAG.getMachineFunction();
+  FunctionType *FType = MF.getFunction()->getFunctionType();
+  SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+  if (Subtarget->isAmdHsaOS() && AMDGPU::isShader(CallConv)) {
+    const Function *Fn = MF.getFunction();
+    DiagnosticInfoUnsupported NoGraphicsHSA(
+        *Fn, "unsupported non-compute shaders with HSA", DL.getDebugLoc());
+    DAG.getContext()->diagnose(NoGraphicsHSA);
+    return DAG.getEntryNode();
   }
 
-  if (Info->hasWorkItemIDY()) {
-    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Y);
-    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
-    CCInfo.AllocateReg(Reg);
+  // Create stack objects that are used for emitting debugger prologue if
+  // "amdgpu-debugger-emit-prologue" attribute was specified.
+  if (ST.debuggerEmitPrologue())
+    createDebuggerPrologueStackObjects(MF);
+
+  SmallVector<ISD::InputArg, 16> Splits;
+  SmallVector<CCValAssign, 16> ArgLocs;
+  BitVector Skipped(Ins.size());
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
+
+  bool IsShader = AMDGPU::isShader(CallConv);
+  bool IsKernel = AMDGPU::isKernel(CallConv);
+  bool IsEntryFunc = AMDGPU::isEntryFunctionCC(CallConv);
+
+  if (IsShader) {
+    processShaderInputArgs(Splits, CallConv, Ins, Skipped, FType, Info);
+
+    // At least one interpolation mode must be enabled or else the GPU will
+    // hang.
+    //
+    // Check PSInputAddr instead of PSInputEnable. The idea is that if the user
+    // set PSInputAddr, the user wants to enable some bits after the compilation
+    // based on run-time states. Since we can't know what the final PSInputEna
+    // will look like, we shouldn't do anything here and the user should take
+    // responsibility for the correct programming.
+    //
+    // Otherwise, the following restrictions apply:
+    // - At least one of PERSP_* (0xF) or LINEAR_* (0x70) must be enabled.
+    // - If POS_W_FLOAT (11) is enabled, at least one of PERSP_* must be
+    //   enabled too.
+    if (CallConv == CallingConv::AMDGPU_PS &&
+        ((Info->getPSInputAddr() & 0x7F) == 0 ||
+         ((Info->getPSInputAddr() & 0xF) == 0 &&
+          Info->isPSInputAllocated(11)))) {
+      CCInfo.AllocateReg(AMDGPU::VGPR0);
+      CCInfo.AllocateReg(AMDGPU::VGPR1);
+      Info->markPSInputAllocated(0);
+      Info->markPSInputEnabled(0);
+    }
+
+    assert(!Info->hasDispatchPtr() &&
+           !Info->hasKernargSegmentPtr() && !Info->hasFlatScratchInit() &&
+           !Info->hasWorkGroupIDX() && !Info->hasWorkGroupIDY() &&
+           !Info->hasWorkGroupIDZ() && !Info->hasWorkGroupInfo() &&
+           !Info->hasWorkItemIDX() && !Info->hasWorkItemIDY() &&
+           !Info->hasWorkItemIDZ());
+  } else {
+    assert(!IsKernel || (Info->hasWorkGroupIDX() && Info->hasWorkItemIDX()));
   }
 
-  if (Info->hasWorkItemIDZ()) {
-    unsigned Reg = TRI->getPreloadedValue(MF, SIRegisterInfo::WORKITEM_ID_Z);
-    MF.addLiveIn(Reg, &AMDGPU::VGPR_32RegClass);
-    CCInfo.AllocateReg(Reg);
+  if (IsEntryFunc) {
+    allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
+    allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
   }
 
-  if (Chains.empty())
-    return Chain;
+  if (IsKernel) {
+    analyzeFormalArgumentsCompute(CCInfo, Ins);
+  } else {
+    CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, isVarArg);
+    CCInfo.AnalyzeFormalArguments(Splits, AssignFn);
+  }
 
-  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  SmallVector<SDValue, 16> Chains;
+
+  for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+    const ISD::InputArg &Arg = Ins[i];
+    if (Skipped[i]) {
+      InVals.push_back(DAG.getUNDEF(Arg.VT));
+      continue;
+    }
+
+    CCValAssign &VA = ArgLocs[ArgIdx++];
+    MVT VT = VA.getLocVT();
+
+    if (IsEntryFunc && VA.isMemLoc()) {
+      VT = Ins[i].VT;
+      EVT MemVT = VA.getLocVT();
+
+      const uint64_t Offset = Subtarget->getExplicitKernelArgOffset(MF) +
+        VA.getLocMemOffset();
+      Info->setABIArgOffset(Offset + MemVT.getStoreSize());
+
+      // The first 36 bytes of the input buffer contain information about
+      // thread group and global sizes.
+      SDValue Arg = lowerKernargMemParameter(
+        DAG, VT, MemVT, DL, Chain, Offset, Ins[i].Flags.isSExt(), &Ins[i]);
+      Chains.push_back(Arg.getValue(1));
+
+      auto *ParamTy =
+        dyn_cast<PointerType>(FType->getParamType(Ins[i].getOrigArgIndex()));
+      if (Subtarget->getGeneration() == SISubtarget::SOUTHERN_ISLANDS &&
+          ParamTy && ParamTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
+        // On SI, local pointers are just offsets into LDS, so they always fit
+        // in 16 bits.  On CI and newer they could potentially be real
+        // pointers, so we can't guarantee their size.
+        Arg = DAG.getNode(ISD::AssertZext, DL, Arg.getValueType(), Arg,
+                          DAG.getValueType(MVT::i16));
+      }
+
+      InVals.push_back(Arg);
+      continue;
+    }
+
+    if (VA.isMemLoc())
+      report_fatal_error("memloc not supported with calling convention");
+
+    assert(VA.isRegLoc() && "Parameter must be in a register!");
+
+    unsigned Reg = VA.getLocReg();
+    const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
+
+    Reg = MF.addLiveIn(Reg, RC);
+    SDValue Val = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+
+    if (Arg.VT.isVector()) {
+      // Build a vector from the registers
+      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
+      unsigned NumElements = ParamType->getVectorNumElements();
+
+      SmallVector<SDValue, 4> Regs;
+      Regs.push_back(Val);
+      for (unsigned j = 1; j != NumElements; ++j) {
+        Reg = ArgLocs[ArgIdx++].getLocReg();
+        Reg = MF.addLiveIn(Reg, RC);
+
+        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
+        Regs.push_back(Copy);
+      }
+
+      // Fill up the missing vector elements
+      NumElements = Arg.VT.getVectorNumElements() - NumElements;
+      Regs.append(NumElements, DAG.getUNDEF(VT));
+
+      InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
+      continue;
+    }
+
+    InVals.push_back(Val);
+  }
+
+  // Start adding system SGPRs.
+  if (IsEntryFunc)
+    allocateSystemSGPRs(CCInfo, MF, *Info, IsShader);
+
+  reservePrivateMemoryRegs(getTargetMachine(), MF, *TRI, *Info);
+
+  return Chains.empty() ? Chain :
+    DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
 }
 
 SDValue
@@ -1237,7 +1398,7 @@ SITargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   if (Flag.getNode())
     RetOps.push_back(Flag);
 
-  unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN;
+  unsigned Opc = Info->returnsVoid() ? AMDGPUISD::ENDPGM : AMDGPUISD::RETURN_TO_EPILOG;
   return DAG.getNode(Opc, DL, MVT::Other, RetOps);
 }
 
@@ -1532,11 +1693,12 @@ static bool setM0ToIndexFromSGPR(const SIInstrInfo *TII,
   }
 
   if (Offset == 0) {
-    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0).add(*Idx);
+    BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+      .add(*Idx);
   } else {
     BuildMI(*MBB, I, DL, TII->get(AMDGPU::S_ADD_I32), AMDGPU::M0)
-        .add(*Idx)
-        .addImm(Offset);
+      .add(*Idx)
+      .addImm(Offset);
   }
 
   return true;
@@ -1561,7 +1723,7 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
   std::tie(SubReg, Offset)
     = computeIndirectRegAndOffset(TRI, VecRC, SrcReg, Offset);
 
-  bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;
+  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
 
   if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, true)) {
     MachineBasicBlock::iterator I(&MI);
@@ -1663,7 +1825,7 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
   std::tie(SubReg, Offset) = computeIndirectRegAndOffset(TRI, VecRC,
                                                          SrcVec->getReg(),
                                                          Offset);
-  bool UseGPRIdxMode = ST.hasVGPRIndexMode() && EnableVGPRIndexMode;
+  bool UseGPRIdxMode = ST.useVGPRIndexMode(EnableVGPRIndexMode);
 
   if (Idx->getReg() == AMDGPU::NoRegister) {
     MachineBasicBlock::iterator I(&MI);
@@ -1779,29 +1941,50 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
   }
 
   switch (MI.getOpcode()) {
-   case AMDGPU::S_TRAP_PSEUDO: {
-	DebugLoc DL = MI.getDebugLoc();
-	BuildMI(*BB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), AMDGPU::VGPR0)
-     .addImm(1);
+  case AMDGPU::S_TRAP_PSEUDO: {
+    const DebugLoc &DL = MI.getDebugLoc();
+    const int TrapType = MI.getOperand(0).getImm();
 
-    MachineFunction *MF = BB->getParent();
-    SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
-    unsigned UserSGPR = Info->getQueuePtrUserSGPR();
-    assert(UserSGPR != AMDGPU::NoRegister);
+    if (Subtarget->getTrapHandlerAbi() == SISubtarget::TrapHandlerAbiHsa &&
+        Subtarget->isTrapHandlerEnabled()) {
 
-    if (!BB->isLiveIn(UserSGPR))
-      BB->addLiveIn(UserSGPR);
+      MachineFunction *MF = BB->getParent();
+      SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
+      unsigned UserSGPR = Info->getQueuePtrUserSGPR();
+      assert(UserSGPR != AMDGPU::NoRegister);
 
-    BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::SGPR0_SGPR1)
-     .addReg(UserSGPR);
-    BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_TRAP)).addImm(0x1)
-     .addReg(AMDGPU::VGPR0, RegState::Implicit)
-     .addReg(AMDGPU::SGPR0_SGPR1, RegState::Implicit);
+      if (!BB->isLiveIn(UserSGPR))
+        BB->addLiveIn(UserSGPR);
+
+      BuildMI(*BB, MI, DL, TII->get(AMDGPU::COPY), AMDGPU::SGPR0_SGPR1)
+        .addReg(UserSGPR);
+      BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_TRAP))
+        .addImm(TrapType)
+        .addReg(AMDGPU::SGPR0_SGPR1, RegState::Implicit);
+    } else {
+      switch (TrapType) {
+      case SISubtarget::TrapIDLLVMTrap:
+        BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_ENDPGM));
+        break;
+      case SISubtarget::TrapIDLLVMDebugTrap: {
+        DiagnosticInfoUnsupported NoTrap(*MF->getFunction(),
+                                         "debugtrap handler not supported",
+                                         DL,
+                                         DS_Warning);
+        LLVMContext &C = MF->getFunction()->getContext();
+        C.diagnose(NoTrap);
+        BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_NOP))
+          .addImm(0);
+        break;
+      }
+      default:
+        llvm_unreachable("unsupported trap handler type!");
+      }
+    }
 
     MI.eraseFromParent();
     return BB;
   }
-
   case AMDGPU::SI_INIT_M0:
     BuildMI(*BB, MI.getIterator(), MI.getDebugLoc(),
             TII->get(AMDGPU::S_MOV_B32), AMDGPU::M0)
@@ -1991,6 +2174,23 @@ void SITargetLowering::ReplaceNodeResults(SDNode *N,
       Results.push_back(Res);
     return;
   }
+  case ISD::INTRINSIC_WO_CHAIN: {
+    unsigned IID = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
+    switch (IID) {
+    case Intrinsic::amdgcn_cvt_pkrtz: {
+      SDValue Src0 = N->getOperand(1);
+      SDValue Src1 = N->getOperand(2);
+      SDLoc SL(N);
+      SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, SL, MVT::i32,
+                                Src0, Src1);
+
+      Results.push_back(DAG.getNode(ISD::BITCAST, SL, MVT::v2f16, Cvt));
+      return;
+    }
+    default:
+      break;
+    }
+  }
   default:
     break;
   }
@@ -2012,31 +2212,25 @@ static SDNode *findUser(SDValue Value, unsigned Opcode) {
   return nullptr;
 }
 
-bool SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
+unsigned SITargetLowering::isCFIntrinsic(const SDNode *Intr) const {
   if (Intr->getOpcode() == ISD::INTRINSIC_W_CHAIN) {
     switch (cast<ConstantSDNode>(Intr->getOperand(1))->getZExtValue()) {
-    case AMDGPUIntrinsic::amdgcn_if:
-    case AMDGPUIntrinsic::amdgcn_else:
-    case AMDGPUIntrinsic::amdgcn_end_cf:
-    case AMDGPUIntrinsic::amdgcn_loop:
-      return true;
-    default:
-      return false;
-    }
-  }
-
-  if (Intr->getOpcode() == ISD::INTRINSIC_WO_CHAIN) {
-    switch (cast<ConstantSDNode>(Intr->getOperand(0))->getZExtValue()) {
-    case AMDGPUIntrinsic::amdgcn_break:
-    case AMDGPUIntrinsic::amdgcn_if_break:
-    case AMDGPUIntrinsic::amdgcn_else_break:
-      return true;
+    case Intrinsic::amdgcn_if:
+      return AMDGPUISD::IF;
+    case Intrinsic::amdgcn_else:
+      return AMDGPUISD::ELSE;
+    case Intrinsic::amdgcn_loop:
+      return AMDGPUISD::LOOP;
+    case Intrinsic::amdgcn_end_cf:
+      llvm_unreachable("should not occur");
     default:
-      return false;
+      return 0;
     }
   }
 
-  return false;
+  // break, if_break, else_break are all only used as inputs to loop, not
+  // directly as branch conditions.
+  return 0;
 }
 
 void SITargetLowering::createDebuggerPrologueStackObjects(
@@ -2067,13 +2261,13 @@ void SITargetLowering::createDebuggerPrologueStackObjects(
 
 bool SITargetLowering::shouldEmitFixup(const GlobalValue *GV) const {
   const Triple &TT = getTargetMachine().getTargetTriple();
-  return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+  return GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
          AMDGPU::shouldEmitConstantsToTextSection(TT);
 }
 
 bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
-  return (GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
-              GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) &&
+  return (GV->getType()->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
+              GV->getType()->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
          !shouldEmitFixup(GV) &&
          !getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
 }
@@ -2111,7 +2305,8 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
   // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
   // =>     t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
 
-  if (!isCFIntrinsic(Intr)) {
+  unsigned CFNode = isCFIntrinsic(Intr);
+  if (CFNode == 0) {
     // This is a uniform branch so we don't need to legalize.
     return BRCOND;
   }
@@ -2129,15 +2324,13 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
   if (HaveChain)
     Ops.push_back(BRCOND.getOperand(0));
 
-  Ops.append(Intr->op_begin() + (HaveChain ?  1 : 0), Intr->op_end());
+  Ops.append(Intr->op_begin() + (HaveChain ?  2 : 1), Intr->op_end());
   Ops.push_back(Target);
 
   ArrayRef<EVT> Res(Intr->value_begin() + 1, Intr->value_end());
 
   // build the new intrinsic call
-  SDNode *Result = DAG.getNode(
-    Res.size() > 1 ? ISD::INTRINSIC_W_CHAIN : ISD::INTRINSIC_VOID, DL,
-    DAG.getVTList(Res), Ops).getNode();
+  SDNode *Result = DAG.getNode(CFNode, DL, DAG.getVTList(Res), Ops).getNode();
 
   if (!HaveChain) {
     SDValue Ops[] =  {
@@ -2209,9 +2402,28 @@ SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getNode(ISD::BITCAST, DL, MVT::f16, Trunc);;
 }
 
-SDValue SITargetLowering::getSegmentAperture(unsigned AS,
+SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
                                              SelectionDAG &DAG) const {
-  SDLoc SL;
+  // FIXME: Use inline constants (src_{shared, private}_base) instead.
+  if (Subtarget->hasApertureRegs()) {
+    unsigned Offset = AS == AMDGPUASI.LOCAL_ADDRESS ?
+        AMDGPU::Hwreg::OFFSET_SRC_SHARED_BASE :
+        AMDGPU::Hwreg::OFFSET_SRC_PRIVATE_BASE;
+    unsigned WidthM1 = AS == AMDGPUASI.LOCAL_ADDRESS ?
+        AMDGPU::Hwreg::WIDTH_M1_SRC_SHARED_BASE :
+        AMDGPU::Hwreg::WIDTH_M1_SRC_PRIVATE_BASE;
+    unsigned Encoding =
+        AMDGPU::Hwreg::ID_MEM_BASES << AMDGPU::Hwreg::ID_SHIFT_ |
+        Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
+        WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
+
+    SDValue EncodingImm = DAG.getTargetConstant(Encoding, DL, MVT::i16);
+    SDValue ApertureReg = SDValue(
+        DAG.getMachineNode(AMDGPU::S_GETREG_B32, DL, MVT::i32, EncodingImm), 0);
+    SDValue ShiftAmount = DAG.getTargetConstant(WidthM1 + 1, DL, MVT::i32);
+    return DAG.getNode(ISD::SHL, DL, MVT::i32, ApertureReg, ShiftAmount);
+  }
+
   MachineFunction &MF = DAG.getMachineFunction();
   SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
   unsigned UserSGPR = Info->getQueuePtrUserSGPR();
@@ -2222,19 +2434,19 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS,
 
   // Offset into amd_queue_t for group_segment_aperture_base_hi /
   // private_segment_aperture_base_hi.
-  uint32_t StructOffset = (AS == AMDGPUAS::LOCAL_ADDRESS) ? 0x40 : 0x44;
+  uint32_t StructOffset = (AS == AMDGPUASI.LOCAL_ADDRESS) ? 0x40 : 0x44;
 
-  SDValue Ptr = DAG.getNode(ISD::ADD, SL, MVT::i64, QueuePtr,
-                            DAG.getConstant(StructOffset, SL, MVT::i64));
+  SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, QueuePtr,
+                            DAG.getConstant(StructOffset, DL, MVT::i64));
 
   // TODO: Use custom target PseudoSourceValue.
   // TODO: We should use the value from the IR intrinsic call, but it might not
   // be available and how do we get it?
   Value *V = UndefValue::get(PointerType::get(Type::getInt8Ty(*DAG.getContext()),
-                                              AMDGPUAS::CONSTANT_ADDRESS));
+                                              AMDGPUASI.CONSTANT_ADDRESS));
 
   MachinePointerInfo PtrInfo(V, StructOffset);
-  return DAG.getLoad(MVT::i32, SL, QueuePtr.getValue(1), Ptr, PtrInfo,
+  return DAG.getLoad(MVT::i32, DL, QueuePtr.getValue(1), Ptr, PtrInfo,
                      MinAlign(64, StructOffset),
                      MachineMemOperand::MODereferenceable |
                          MachineMemOperand::MOInvariant);
@@ -2246,15 +2458,19 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
   const AddrSpaceCastSDNode *ASC = cast<AddrSpaceCastSDNode>(Op);
 
   SDValue Src = ASC->getOperand(0);
-
-  // FIXME: Really support non-0 null pointers.
-  SDValue SegmentNullPtr = DAG.getConstant(-1, SL, MVT::i32);
   SDValue FlatNullPtr = DAG.getConstant(0, SL, MVT::i64);
 
+  const AMDGPUTargetMachine &TM =
+    static_cast<const AMDGPUTargetMachine &>(getTargetMachine());
+
   // flat -> local/private
-  if (ASC->getSrcAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
-    if (ASC->getDestAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
-        ASC->getDestAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+  if (ASC->getSrcAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
+    unsigned DestAS = ASC->getDestAddressSpace();
+
+    if (DestAS == AMDGPUASI.LOCAL_ADDRESS ||
+        DestAS == AMDGPUASI.PRIVATE_ADDRESS) {
+      unsigned NullVal = TM.getNullPointerValue(DestAS);
+      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
       SDValue NonNull = DAG.getSetCC(SL, MVT::i1, Src, FlatNullPtr, ISD::SETNE);
       SDValue Ptr = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
 
@@ -2264,13 +2480,18 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
   }
 
   // local/private -> flat
-  if (ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) {
-    if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
-        ASC->getSrcAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS) {
+  if (ASC->getDestAddressSpace() == AMDGPUASI.FLAT_ADDRESS) {
+    unsigned SrcAS = ASC->getSrcAddressSpace();
+
+    if (SrcAS == AMDGPUASI.LOCAL_ADDRESS ||
+        SrcAS == AMDGPUASI.PRIVATE_ADDRESS) {
+      unsigned NullVal = TM.getNullPointerValue(SrcAS);
+      SDValue SegmentNullPtr = DAG.getConstant(NullVal, SL, MVT::i32);
+
       SDValue NonNull
         = DAG.getSetCC(SL, MVT::i1, Src, SegmentNullPtr, ISD::SETNE);
 
-      SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), DAG);
+      SDValue Aperture = getSegmentAperture(ASC->getSrcAddressSpace(), SL, DAG);
       SDValue CvtPtr
         = DAG.getNode(ISD::BUILD_VECTOR, SL, MVT::v2i32, Src, Aperture);
 
@@ -2363,8 +2584,8 @@ SDValue SITargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
 bool
 SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
   // We can fold offsets for anything that doesn't require a GOT relocation.
-  return (GA->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
-              GA->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS) &&
+  return (GA->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS ||
+              GA->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS) &&
          !shouldEmitGOTReloc(GA->getGlobal());
 }
 
@@ -2415,8 +2636,8 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                              SelectionDAG &DAG) const {
   GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
 
-  if (GSD->getAddressSpace() != AMDGPUAS::CONSTANT_ADDRESS &&
-      GSD->getAddressSpace() != AMDGPUAS::GLOBAL_ADDRESS)
+  if (GSD->getAddressSpace() != AMDGPUASI.CONSTANT_ADDRESS &&
+      GSD->getAddressSpace() != AMDGPUASI.GLOBAL_ADDRESS)
     return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
 
   SDLoc DL(GSD);
@@ -2433,7 +2654,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
                                             SIInstrInfo::MO_GOTPCREL32);
 
   Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
-  PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
+  PointerType *PtrTy = PointerType::get(Ty, AMDGPUASI.CONSTANT_ADDRESS);
   const DataLayout &DataLayout = DAG.getDataLayout();
   unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
   // FIXME: Use a PseudoSourceValue once those can be assigned an address space.
@@ -2465,8 +2686,8 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
                                                  MVT VT,
                                                  unsigned Offset) const {
   SDLoc SL(Op);
-  SDValue Param = LowerParameter(DAG, MVT::i32, MVT::i32, SL,
-                                 DAG.getEntryNode(), Offset, false);
+  SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
+                                           DAG.getEntryNode(), Offset, false);
   // The local size values will have the hi 16-bits as zero.
   return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
                      DAG.getValueType(VT));
@@ -2524,7 +2745,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   }
   case Intrinsic::amdgcn_implicitarg_ptr: {
     unsigned offset = getImplicitParameterOffset(MFI, FIRST_IMPLICIT);
-    return LowerParameterPtr(DAG, DL, DAG.getEntryNode(), offset);
+    return lowerKernArgParameterPtr(DAG, DL, DAG.getEntryNode(), offset);
   }
   case Intrinsic::amdgcn_kernarg_segment_ptr: {
     unsigned Reg
@@ -2538,7 +2759,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   case Intrinsic::amdgcn_rcp:
     return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1));
   case Intrinsic::amdgcn_rsq:
-  case AMDGPUIntrinsic::AMDGPU_rsq: // Legacy name
     return DAG.getNode(AMDGPUISD::RSQ, DL, VT, Op.getOperand(1));
   case Intrinsic::amdgcn_rsq_legacy:
     if (Subtarget->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
@@ -2567,38 +2787,38 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
-    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::NGROUPS_X, false);
+    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                                    SI::KernelInputOffsets::NGROUPS_X, false);
   case Intrinsic::r600_read_ngroups_y:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
-    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::NGROUPS_Y, false);
+    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                                    SI::KernelInputOffsets::NGROUPS_Y, false);
   case Intrinsic::r600_read_ngroups_z:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
-    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::NGROUPS_Z, false);
+    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                                    SI::KernelInputOffsets::NGROUPS_Z, false);
   case Intrinsic::r600_read_global_size_x:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
-    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
+    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_X, false);
   case Intrinsic::r600_read_global_size_y:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
-    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
+    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_Y, false);
   case Intrinsic::r600_read_global_size_z:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
 
-    return LowerParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
-                          SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
+    return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
+                                    SI::KernelInputOffsets::GLOBAL_SIZE_Z, false);
   case Intrinsic::r600_read_local_size_x:
     if (Subtarget->isAmdHsaOS())
       return emitNonHSAIntrinsicError(DAG, DL, VT);
@@ -2655,42 +2875,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return DAG.getMemIntrinsicNode(AMDGPUISD::LOAD_CONSTANT, DL,
                                    Op->getVTList(), Ops, VT, MMO);
   }
-  case AMDGPUIntrinsic::amdgcn_fdiv_fast:
+  case Intrinsic::amdgcn_fdiv_fast:
     return lowerFDIV_FAST(Op, DAG);
-  case AMDGPUIntrinsic::SI_vs_load_input:
-    return DAG.getNode(AMDGPUISD::LOAD_INPUT, DL, VT,
-                       Op.getOperand(1),
-                       Op.getOperand(2),
-                       Op.getOperand(3));
-
-  case AMDGPUIntrinsic::SI_fs_constant: {
-    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
-    SDValue Glue = M0.getValue(1);
-    return DAG.getNode(AMDGPUISD::INTERP_MOV, DL, MVT::f32,
-                       DAG.getConstant(2, DL, MVT::i32), // P0
-                       Op.getOperand(1), Op.getOperand(2), Glue);
-  }
-  case AMDGPUIntrinsic::SI_packf16:
-    if (Op.getOperand(1).isUndef() && Op.getOperand(2).isUndef())
-      return DAG.getUNDEF(MVT::i32);
-    return Op;
-  case AMDGPUIntrinsic::SI_fs_interp: {
-    SDValue IJ = Op.getOperand(4);
-    SDValue I = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
-                            DAG.getConstant(0, DL, MVT::i32));
-    SDValue J = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, IJ,
-                            DAG.getConstant(1, DL, MVT::i32));
-    I = DAG.getNode(ISD::BITCAST, DL, MVT::f32, I);
-    J = DAG.getNode(ISD::BITCAST, DL, MVT::f32, J);
-    SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(3));
-    SDValue Glue = M0.getValue(1);
-    SDValue P1 = DAG.getNode(AMDGPUISD::INTERP_P1, DL,
-                             DAG.getVTList(MVT::f32, MVT::Glue),
-                             I, Op.getOperand(1), Op.getOperand(2), Glue);
-    Glue = SDValue(P1.getNode(), 1);
-    return DAG.getNode(AMDGPUISD::INTERP_P2, DL, MVT::f32, P1, J,
-                             Op.getOperand(1), Op.getOperand(2), Glue);
-  }
   case Intrinsic::amdgcn_interp_mov: {
     SDValue M0 = copyToM0(DAG, DAG.getEntryNode(), DL, Op.getOperand(4));
     SDValue Glue = M0.getValue(1);
@@ -2771,10 +2957,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   }
   case Intrinsic::amdgcn_icmp: {
     const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
-    int CondCode = CD->getSExtValue();
+    if (!CD)
+      return DAG.getUNDEF(VT);
 
+    int CondCode = CD->getSExtValue();
     if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
-        CondCode >= ICmpInst::Predicate::BAD_ICMP_PREDICATE)
+        CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
       return DAG.getUNDEF(VT);
 
     ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
@@ -2784,10 +2972,12 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
   }
   case Intrinsic::amdgcn_fcmp: {
     const auto *CD = dyn_cast<ConstantSDNode>(Op.getOperand(3));
-    int CondCode = CD->getSExtValue();
+    if (!CD)
+      return DAG.getUNDEF(VT);
 
-    if (CondCode <= FCmpInst::Predicate::FCMP_FALSE ||
-        CondCode >= FCmpInst::Predicate::FCMP_TRUE)
+    int CondCode = CD->getSExtValue();
+    if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
+        CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE)
       return DAG.getUNDEF(VT);
 
     FCmpInst::Predicate IcInput = static_cast<FCmpInst::Predicate>(CondCode);
@@ -2795,14 +2985,29 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
     return DAG.getNode(AMDGPUISD::SETCC, DL, VT, Op.getOperand(1),
                        Op.getOperand(2), DAG.getCondCode(CCOpcode));
   }
+  case Intrinsic::amdgcn_fmed3:
+    return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
+                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
   case Intrinsic::amdgcn_fmul_legacy:
     return DAG.getNode(AMDGPUISD::FMUL_LEGACY, DL, VT,
                        Op.getOperand(1), Op.getOperand(2));
   case Intrinsic::amdgcn_sffbh:
-  case AMDGPUIntrinsic::AMDGPU_flbit_i32: // Legacy name.
     return DAG.getNode(AMDGPUISD::FFBH_I32, DL, VT, Op.getOperand(1));
+  case Intrinsic::amdgcn_sbfe:
+    return DAG.getNode(AMDGPUISD::BFE_I32, DL, VT,
+                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+  case Intrinsic::amdgcn_ubfe:
+    return DAG.getNode(AMDGPUISD::BFE_U32, DL, VT,
+                       Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+  case Intrinsic::amdgcn_cvt_pkrtz: {
+    // FIXME: Stop adding cast if v2f16 legal.
+    EVT VT = Op.getValueType();
+    SDValue Node = DAG.getNode(AMDGPUISD::CVT_PKRTZ_F16_F32, DL, MVT::i32,
+                               Op.getOperand(1), Op.getOperand(2));
+    return DAG.getNode(ISD::BITCAST, DL, VT, Node);
+  }
   default:
-    return AMDGPUTargetLowering::LowerOperation(Op, DAG);
+    return Op;
   }
 }
 
@@ -2850,6 +3055,64 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
 
     return DAG.getMemIntrinsicNode(Opc, DL, Op->getVTList(), Ops, IntVT, MMO);
   }
+  // Basic sample.
+  case Intrinsic::amdgcn_image_sample:
+  case Intrinsic::amdgcn_image_sample_cl:
+  case Intrinsic::amdgcn_image_sample_d:
+  case Intrinsic::amdgcn_image_sample_d_cl:
+  case Intrinsic::amdgcn_image_sample_l:
+  case Intrinsic::amdgcn_image_sample_b:
+  case Intrinsic::amdgcn_image_sample_b_cl:
+  case Intrinsic::amdgcn_image_sample_lz:
+  case Intrinsic::amdgcn_image_sample_cd:
+  case Intrinsic::amdgcn_image_sample_cd_cl:
+
+  // Sample with comparison.
+  case Intrinsic::amdgcn_image_sample_c:
+  case Intrinsic::amdgcn_image_sample_c_cl:
+  case Intrinsic::amdgcn_image_sample_c_d:
+  case Intrinsic::amdgcn_image_sample_c_d_cl:
+  case Intrinsic::amdgcn_image_sample_c_l:
+  case Intrinsic::amdgcn_image_sample_c_b:
+  case Intrinsic::amdgcn_image_sample_c_b_cl:
+  case Intrinsic::amdgcn_image_sample_c_lz:
+  case Intrinsic::amdgcn_image_sample_c_cd:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl:
+
+  // Sample with offsets.
+  case Intrinsic::amdgcn_image_sample_o:
+  case Intrinsic::amdgcn_image_sample_cl_o:
+  case Intrinsic::amdgcn_image_sample_d_o:
+  case Intrinsic::amdgcn_image_sample_d_cl_o:
+  case Intrinsic::amdgcn_image_sample_l_o:
+  case Intrinsic::amdgcn_image_sample_b_o:
+  case Intrinsic::amdgcn_image_sample_b_cl_o:
+  case Intrinsic::amdgcn_image_sample_lz_o:
+  case Intrinsic::amdgcn_image_sample_cd_o:
+  case Intrinsic::amdgcn_image_sample_cd_cl_o:
+
+  // Sample with comparison and offsets.
+  case Intrinsic::amdgcn_image_sample_c_o:
+  case Intrinsic::amdgcn_image_sample_c_cl_o:
+  case Intrinsic::amdgcn_image_sample_c_d_o:
+  case Intrinsic::amdgcn_image_sample_c_d_cl_o:
+  case Intrinsic::amdgcn_image_sample_c_l_o:
+  case Intrinsic::amdgcn_image_sample_c_b_o:
+  case Intrinsic::amdgcn_image_sample_c_b_cl_o:
+  case Intrinsic::amdgcn_image_sample_c_lz_o:
+  case Intrinsic::amdgcn_image_sample_c_cd_o:
+  case Intrinsic::amdgcn_image_sample_c_cd_cl_o:
+
+  case Intrinsic::amdgcn_image_getlod: {
+    // Replace dmask with everything disabled with undef.
+    const ConstantSDNode *DMask = dyn_cast<ConstantSDNode>(Op.getOperand(5));
+    if (!DMask || DMask->isNullValue()) {
+      SDValue Undef = DAG.getUNDEF(Op.getValueType());
+      return DAG.getMergeValues({ Undef, Op.getOperand(0) }, SDLoc(Op));
+    }
+
+    return SDValue();
+  }
   default:
     return SDValue();
   }
@@ -2863,7 +3126,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
   unsigned IntrinsicID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
 
   switch (IntrinsicID) {
-      case Intrinsic::amdgcn_exp: {
+  case Intrinsic::amdgcn_exp: {
     const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
     const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
     const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
@@ -2911,16 +3174,12 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
   }
   case Intrinsic::amdgcn_s_sendmsg:
-  case AMDGPUIntrinsic::SI_sendmsg: {
-    Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
-    SDValue Glue = Chain.getValue(1);
-    return DAG.getNode(AMDGPUISD::SENDMSG, DL, MVT::Other, Chain,
-                       Op.getOperand(2), Glue);
-  }
   case Intrinsic::amdgcn_s_sendmsghalt: {
+    unsigned NodeOp = (IntrinsicID == Intrinsic::amdgcn_s_sendmsg) ?
+      AMDGPUISD::SENDMSG : AMDGPUISD::SENDMSGHALT;
     Chain = copyToM0(DAG, Chain, DL, Op.getOperand(3));
     SDValue Glue = Chain.getValue(1);
-    return DAG.getNode(AMDGPUISD::SENDMSGHALT, DL, MVT::Other, Chain,
+    return DAG.getNode(NodeOp, DL, MVT::Other, Chain,
                        Op.getOperand(2), Glue);
   }
   case AMDGPUIntrinsic::SI_tbuffer_store: {
@@ -2963,31 +3222,19 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
     SDValue Cast = DAG.getNode(ISD::BITCAST, DL, MVT::i32, Src);
     return DAG.getNode(AMDGPUISD::KILL, DL, MVT::Other, Chain, Cast);
   }
-  case AMDGPUIntrinsic::SI_export: { // Legacy intrinsic.
-    const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(2));
-    const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(3));
-    const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(4));
-    const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(5));
-    const ConstantSDNode *Compr = cast<ConstantSDNode>(Op.getOperand(6));
-
-    const SDValue Ops[] = {
-      Chain,
-      DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8),
-      DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8),
-      Op.getOperand(7),  // src0
-      Op.getOperand(8),  // src1
-      Op.getOperand(9),  // src2
-      Op.getOperand(10), // src3
-      DAG.getTargetConstant(Compr->getZExtValue(), DL, MVT::i1),
-      DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
-    };
-
-    unsigned Opc = Done->isNullValue() ?
-      AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
-    return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
-  }
-  default:
+  case Intrinsic::amdgcn_s_barrier: {
+    if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
+      const MachineFunction &MF = DAG.getMachineFunction();
+      const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+      unsigned WGSize = ST.getFlatWorkGroupSizes(*MF.getFunction()).second;
+      if (WGSize <= ST.getWavefrontSize())
+        return SDValue(DAG.getMachineNode(AMDGPU::WAVE_BARRIER, DL, MVT::Other,
+                                          Op.getOperand(0)), 0);
+    }
     return SDValue();
+  };
+  default:
+    return Op;
   }
 }
 
@@ -3036,21 +3283,20 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   // If there is a possibilty that flat instruction access scratch memory
   // then we need to use the same legalization rules we use for private.
-  if (AS == AMDGPUAS::FLAT_ADDRESS)
+  if (AS == AMDGPUASI.FLAT_ADDRESS)
     AS = MFI->hasFlatScratchInit() ?
-         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
+         AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
 
   unsigned NumElements = MemVT.getVectorNumElements();
-  switch (AS) {
-  case AMDGPUAS::CONSTANT_ADDRESS:
+  if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
     if (isMemOpUniform(Load))
       return SDValue();
     // Non-uniform loads will be selected to MUBUF instructions, so they
     // have the same legalization requirements as global and private
     // loads.
     //
-    LLVM_FALLTHROUGH;
-  case AMDGPUAS::GLOBAL_ADDRESS:
+  }
+  if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS) {
     if (Subtarget->getScalarizeGlobalBehavior() && isMemOpUniform(Load) &&
                   isMemOpHasNoClobberedMemOperand(Load))
       return SDValue();
@@ -3058,13 +3304,15 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
     // have the same legalization requirements as global and private
     // loads.
     //
-    LLVM_FALLTHROUGH;
-  case AMDGPUAS::FLAT_ADDRESS:
+  }
+  if (AS == AMDGPUASI.CONSTANT_ADDRESS || AS == AMDGPUASI.GLOBAL_ADDRESS ||
+      AS == AMDGPUASI.FLAT_ADDRESS) {
     if (NumElements > 4)
       return SplitVectorLoad(Op, DAG);
     // v4 loads are supported for private and global memory.
     return SDValue();
-  case AMDGPUAS::PRIVATE_ADDRESS:
+  }
+  if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
     // Depending on the setting of the private_element_size field in the
     // resource descriptor, we can only make private accesses up to a certain
     // size.
@@ -3083,7 +3331,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
     default:
       llvm_unreachable("unsupported private_element_size");
     }
-  case AMDGPUAS::LOCAL_ADDRESS:
+  } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
     if (NumElements > 2)
       return SplitVectorLoad(Op, DAG);
 
@@ -3092,9 +3340,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
 
     // If properly aligned, if we split we might be able to use ds_read_b64.
     return SplitVectorLoad(Op, DAG);
-  default:
-    return SDValue();
   }
+  return SDValue();
 }
 
 SDValue SITargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
@@ -3463,18 +3710,17 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
   SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
   // If there is a possibilty that flat instruction access scratch memory
   // then we need to use the same legalization rules we use for private.
-  if (AS == AMDGPUAS::FLAT_ADDRESS)
+  if (AS == AMDGPUASI.FLAT_ADDRESS)
     AS = MFI->hasFlatScratchInit() ?
-         AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
+         AMDGPUASI.PRIVATE_ADDRESS : AMDGPUASI.GLOBAL_ADDRESS;
 
   unsigned NumElements = VT.getVectorNumElements();
-  switch (AS) {
-  case AMDGPUAS::GLOBAL_ADDRESS:
-  case AMDGPUAS::FLAT_ADDRESS:
+  if (AS == AMDGPUASI.GLOBAL_ADDRESS ||
+      AS == AMDGPUASI.FLAT_ADDRESS) {
     if (NumElements > 4)
       return SplitVectorStore(Op, DAG);
     return SDValue();
-  case AMDGPUAS::PRIVATE_ADDRESS: {
+  } else if (AS == AMDGPUASI.PRIVATE_ADDRESS) {
     switch (Subtarget->getMaxPrivateElementSize()) {
     case 4:
       return scalarizeVectorStore(Store, DAG);
@@ -3489,8 +3735,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
     default:
       llvm_unreachable("unsupported private_element_size");
     }
-  }
-  case AMDGPUAS::LOCAL_ADDRESS: {
+  } else if (AS == AMDGPUASI.LOCAL_ADDRESS) {
     if (NumElements > 2)
       return SplitVectorStore(Op, DAG);
 
@@ -3499,8 +3744,7 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
 
     // If properly aligned, if we split we might be able to use ds_write_b64.
     return SplitVectorStore(Op, DAG);
-  }
-  default:
+  } else {
     llvm_unreachable("unhandled address space");
   }
 }
@@ -3531,7 +3775,7 @@ SDValue SITargetLowering::LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) co
   unsigned AS = AtomicNode->getAddressSpace();
 
   // No custom lowering required for local address space
-  if (!isFlatGlobalAddrSpace(AS))
+  if (!isFlatGlobalAddrSpace(AS, AMDGPUASI))
     return Op;
 
   // Non-local address space requires custom lowering for atomic compare
@@ -3588,26 +3832,26 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
 /// the immediate offsets of a memory instruction for the given address space.
 static bool canFoldOffset(unsigned OffsetSize, unsigned AS,
                           const SISubtarget &STI) {
-  switch (AS) {
-  case AMDGPUAS::GLOBAL_ADDRESS:
+  auto AMDGPUASI = STI.getAMDGPUAS();
+  if (AS == AMDGPUASI.GLOBAL_ADDRESS) {
     // MUBUF instructions a 12-bit offset in bytes.
     return isUInt<12>(OffsetSize);
-  case AMDGPUAS::CONSTANT_ADDRESS:
+  }
+  if (AS == AMDGPUASI.CONSTANT_ADDRESS) {
     // SMRD instructions have an 8-bit offset in dwords on SI and
     // a 20-bit offset in bytes on VI.
     if (STI.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
       return isUInt<20>(OffsetSize);
     else
       return (OffsetSize % 4 == 0) && isUInt<8>(OffsetSize / 4);
-  case AMDGPUAS::LOCAL_ADDRESS:
-  case AMDGPUAS::REGION_ADDRESS:
+  }
+  if (AS == AMDGPUASI.LOCAL_ADDRESS ||
+      AS == AMDGPUASI.REGION_ADDRESS) {
     // The single offset versions have a 16-bit offset in bytes.
     return isUInt<16>(OffsetSize);
-  case AMDGPUAS::PRIVATE_ADDRESS:
-  // Indirect register addressing does not use any offsets.
-  default:
-    return false;
   }
+  // Indirect register addressing does not use any offsets.
+  return false;
 }
 
 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
@@ -3665,7 +3909,7 @@ SDValue SITargetLowering::performMemSDNodeCombine(MemSDNode *N,
 
   // TODO: We could also do this for multiplies.
   unsigned AS = N->getAddressSpace();
-  if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUAS::PRIVATE_ADDRESS) {
+  if (Ptr.getOpcode() == ISD::SHL && AS != AMDGPUASI.PRIVATE_ADDRESS) {
     SDValue NewPtr = performSHLPtrCombine(Ptr.getNode(), AS, DCI);
     if (NewPtr) {
       SmallVector<SDValue, 8> NewOps(N->op_begin(), N->op_end());
@@ -3865,6 +4109,88 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
   return SDValue();
 }
 
+// Returns true if \p Opc is one of the operations listed below, which will be
+// lowered with a final instruction that zeros the high result bits.
+// XXX - probably only need to list legal operations.
+static bool fp16SrcZerosHighBits(unsigned Opc) {
+  switch (Opc) {
+  case ISD::FADD:
+  case ISD::FSUB:
+  case ISD::FMUL:
+  case ISD::FDIV:
+  case ISD::FREM:
+  case ISD::FMA:
+  case ISD::FMAD:
+  case ISD::FCANONICALIZE:
+  case ISD::FP_ROUND:
+  case ISD::UINT_TO_FP:
+  case ISD::SINT_TO_FP:
+  case ISD::FABS:
+    // Fabs is lowered to a bit operation, but that operation is an AND, which
+    // clears the high bits anyway.
+  case ISD::FSQRT:
+  case ISD::FSIN:
+  case ISD::FCOS:
+  case ISD::FPOWI:
+  case ISD::FPOW:
+  case ISD::FLOG:
+  case ISD::FLOG2:
+  case ISD::FLOG10:
+  case ISD::FEXP:
+  case ISD::FEXP2:
+  case ISD::FCEIL:
+  case ISD::FTRUNC:
+  case ISD::FRINT:
+  case ISD::FNEARBYINT:
+  case ISD::FROUND:
+  case ISD::FFLOOR:
+  case ISD::FMINNUM:
+  case ISD::FMAXNUM:
+  case AMDGPUISD::FRACT:
+  case AMDGPUISD::CLAMP:
+  case AMDGPUISD::COS_HW:
+  case AMDGPUISD::SIN_HW:
+  case AMDGPUISD::FMIN3:
+  case AMDGPUISD::FMAX3:
+  case AMDGPUISD::FMED3:
+  case AMDGPUISD::FMAD_FTZ:
+  case AMDGPUISD::RCP:
+  case AMDGPUISD::RSQ:
+  case AMDGPUISD::LDEXP:
+    return true;
+  default:
+    // fcopysign, select and others may be lowered to 32-bit bit operations
+    // which don't zero the high bits.
+    return false;
+  }
+}
+
+SDValue SITargetLowering::performZeroExtendCombine(SDNode *N,
+                                                   DAGCombinerInfo &DCI) const {
+  if (!Subtarget->has16BitInsts() ||
+      DCI.getDAGCombineLevel() < AfterLegalizeDAG)
+    return SDValue(); // Requires 16-bit insts and a post-legalization combine.
+
+  EVT VT = N->getValueType(0);
+  if (VT != MVT::i32)
+    return SDValue(); // Only i32 results are handled.
+
+  SDValue Src = N->getOperand(0);
+  if (Src.getValueType() != MVT::i16)
+    return SDValue(); // Only i16 sources are handled.
+
+  // (i32 zext (i16 (bitcast f16:$src))) -> fp16_zext $src
+  // FIXME: It is not universally true that the high bits are zeroed on gfx9.
+  if (Src.getOpcode() == ISD::BITCAST) {
+    SDValue BCSrc = Src.getOperand(0);
+    if (BCSrc.getValueType() == MVT::f16 &&
+        fp16SrcZerosHighBits(BCSrc.getOpcode()))
+      return DCI.DAG.getNode(AMDGPUISD::FP16_ZEXT, SDLoc(N), VT, BCSrc);
+  }
+
+  return SDValue(); // The pattern did not match; no replacement.
+}
+
 SDValue SITargetLowering::performClassCombine(SDNode *N,
                                               DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
@@ -3886,7 +4212,7 @@ SDValue SITargetLowering::performClassCombine(SDNode *N,
 SDValue SITargetLowering::performFCanonicalizeCombine(
   SDNode *N,
   DAGCombinerInfo &DCI) const {
-  ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(N->getOperand(0));
+  ConstantFPSDNode *CFP = isConstOrConstSplatFP(N->getOperand(0));
   if (!CFP)
     return SDValue();
 
@@ -3896,13 +4222,14 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
   // Flush denormals to 0 if not enabled.
   if (C.isDenormal()) {
     EVT VT = N->getValueType(0);
-    if (VT == MVT::f32 && !Subtarget->hasFP32Denormals())
+    EVT SVT = VT.getScalarType();
+    if (SVT == MVT::f32 && !Subtarget->hasFP32Denormals())
       return DAG.getConstantFP(0.0, SDLoc(N), VT);
 
-    if (VT == MVT::f64 && !Subtarget->hasFP64Denormals())
+    if (SVT == MVT::f64 && !Subtarget->hasFP64Denormals())
       return DAG.getConstantFP(0.0, SDLoc(N), VT);
 
-    if (VT == MVT::f16 && !Subtarget->hasFP16Denormals())
+    if (SVT == MVT::f16 && !Subtarget->hasFP16Denormals())
       return DAG.getConstantFP(0.0, SDLoc(N), VT);
   }
 
@@ -3922,7 +4249,7 @@ SDValue SITargetLowering::performFCanonicalizeCombine(
       return DAG.getConstantFP(CanonicalQNaN, SDLoc(N), VT);
   }
 
-  return SDValue(CFP, 0);
+  return N->getOperand(0);
 }
 
 static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
@@ -3944,8 +4271,9 @@ static unsigned minMaxOpcToMin3Max3Opc(unsigned Opc) {
   }
 }
 
-static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
-                                        SDValue Op0, SDValue Op1, bool Signed) {
+SDValue SITargetLowering::performIntMed3ImmCombine(
+  SelectionDAG &DAG, const SDLoc &SL,
+  SDValue Op0, SDValue Op1, bool Signed) const {
   ConstantSDNode *K1 = dyn_cast<ConstantSDNode>(Op1);
   if (!K1)
     return SDValue();
@@ -3963,23 +4291,22 @@ static SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
   }
 
   EVT VT = K0->getValueType(0);
+  unsigned Med3Opc = Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3;
+  if (VT == MVT::i32 || (VT == MVT::i16 && Subtarget->hasMed3_16())) {
+    return DAG.getNode(Med3Opc, SL, VT,
+                       Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+  }
 
+  // If there isn't a 16-bit med3 operation, convert to 32-bit.
   MVT NVT = MVT::i32;
   unsigned ExtOp = Signed ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
 
-  SDValue Tmp1, Tmp2, Tmp3;
-  Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
-  Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
-  Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
+  SDValue Tmp1 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(0));
+  SDValue Tmp2 = DAG.getNode(ExtOp, SL, NVT, Op0->getOperand(1));
+  SDValue Tmp3 = DAG.getNode(ExtOp, SL, NVT, Op1);
 
-  if (VT == MVT::i16) {
-    Tmp1 = DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, NVT,
-                       Tmp1, Tmp2, Tmp3);
-
-    return DAG.getNode(ISD::TRUNCATE, SL, VT, Tmp1);
-  } else
-    return DAG.getNode(Signed ? AMDGPUISD::SMED3 : AMDGPUISD::UMED3, SL, VT,
-                       Op0.getOperand(0), SDValue(K0, 0), SDValue(K1, 0));
+  SDValue Med3 = DAG.getNode(Med3Opc, SL, NVT, Tmp1, Tmp2, Tmp3);
+  return DAG.getNode(ISD::TRUNCATE, SL, VT, Med3);
 }
 
 static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
@@ -3989,8 +4316,10 @@ static bool isKnownNeverSNan(SelectionDAG &DAG, SDValue Op) {
   return DAG.isKnownNeverNaN(Op);
 }
 
-static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
-                                       SDValue Op0, SDValue Op1) {
+SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
+                                                  const SDLoc &SL,
+                                                  SDValue Op0,
+                                                  SDValue Op1) const {
   ConstantFPSDNode *K1 = dyn_cast<ConstantFPSDNode>(Op1);
   if (!K1)
     return SDValue();
@@ -4004,6 +4333,20 @@ static SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
   if (Cmp == APFloat::cmpGreaterThan)
     return SDValue();
 
+  // TODO: Check IEEE bit enabled?
+  EVT VT = K0->getValueType(0);
+  if (Subtarget->enableDX10Clamp()) {
+    // If dx10_clamp is enabled, NaNs clamp to 0.0. This is the same as the
+    // hardware fmed3 behavior converting to a min.
+    // FIXME: Should this be allowing -0.0?
+    if (K1->isExactlyValue(1.0) && K0->isExactlyValue(0.0))
+      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Op0.getOperand(0));
+  }
+
+  // med3 for f16 is only available on gfx9+.
+  if (VT == MVT::f64 || (VT == MVT::f16 && !Subtarget->hasMed3_16()))
+    return SDValue();
+
   // This isn't safe with signaling NaNs because in IEEE mode, min/max on a
   // signaling NaN gives a quiet NaN. The quiet NaN input to the min would then
   // give the other result, which is different from med3 with a NaN input.
@@ -4019,6 +4362,7 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   SelectionDAG &DAG = DCI.DAG;
 
+  EVT VT = N->getValueType(0);
   unsigned Opc = N->getOpcode();
   SDValue Op0 = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
@@ -4026,7 +4370,9 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
   // Only do this if the inner op has one use since this will just increases
   // register pressure for no benefit.
 
-  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY) {
+
+  if (Opc != AMDGPUISD::FMIN_LEGACY && Opc != AMDGPUISD::FMAX_LEGACY &&
+      VT != MVT::f64) {
     // max(max(a, b), c) -> max3(a, b, c)
     // min(min(a, b), c) -> min3(a, b, c)
     if (Op0.getOpcode() == Opc && Op0.hasOneUse()) {
@@ -4068,7 +4414,9 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
   if (((Opc == ISD::FMINNUM && Op0.getOpcode() == ISD::FMAXNUM) ||
        (Opc == AMDGPUISD::FMIN_LEGACY &&
         Op0.getOpcode() == AMDGPUISD::FMAX_LEGACY)) &&
-      N->getValueType(0) == MVT::f32 && Op0.hasOneUse()) {
+      (VT == MVT::f32 || VT == MVT::f64 ||
+       (VT == MVT::f16 && Subtarget->has16BitInsts())) &&
+      Op0.hasOneUse()) {
     if (SDValue Res = performFPMed3ImmCombine(DAG, SDLoc(N), Op0, Op1))
       return Res;
   }
@@ -4076,6 +4424,69 @@ SDValue SITargetLowering::performMinMaxCombine(SDNode *N,
   return SDValue();
 }
 
+static bool isClampZeroToOne(SDValue A, SDValue B) {
+  if (ConstantFPSDNode *CA = dyn_cast<ConstantFPSDNode>(A)) {
+    if (ConstantFPSDNode *CB = dyn_cast<ConstantFPSDNode>(B)) {
+      // Matches {0.0, 1.0} in either order. FIXME: Should this allow -0.0?
+      return (CA->isExactlyValue(0.0) && CB->isExactlyValue(1.0)) ||
+             (CA->isExactlyValue(1.0) && CB->isExactlyValue(0.0));
+    }
+  }
+
+  return false; // At least one operand is not an FP constant.
+}
+
+// FIXME: Should only worry about snans for version with chain.
+SDValue SITargetLowering::performFMed3Combine(SDNode *N,
+                                              DAGCombinerInfo &DCI) const {
+  EVT VT = N->getValueType(0);
+  // v_med3_f32 and v_max_f32 behave identically wrt denorms, exceptions and
+  // NaNs. With a NaN input, the order of the operands may change the result.
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc SL(N);
+
+  SDValue Src0 = N->getOperand(0);
+  SDValue Src1 = N->getOperand(1);
+  SDValue Src2 = N->getOperand(2);
+
+  if (isClampZeroToOne(Src0, Src1)) {
+    // const_a, const_b, x -> clamp is safe in all cases, including signaling
+    // NaNs.
+    // FIXME: Should this be allowing -0.0?
+    return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src2);
+  }
+
+  // FIXME: dx10_clamp behavior assumed in instcombine. Should we really bother
+  // handling no dx10-clamp?
+  if (Subtarget->enableDX10Clamp()) {
+    // If NaNs are clamped to 0, we are free to reorder the inputs.
+    // Move FP constants toward the later operands before matching the clamp.
+    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
+      std::swap(Src0, Src1);
+
+    if (isa<ConstantFPSDNode>(Src1) && !isa<ConstantFPSDNode>(Src2))
+      std::swap(Src1, Src2);
+
+    if (isa<ConstantFPSDNode>(Src0) && !isa<ConstantFPSDNode>(Src1))
+      std::swap(Src0, Src1);
+
+    if (isClampZeroToOne(Src1, Src2))
+      return DAG.getNode(AMDGPUISD::CLAMP, SL, VT, Src0);
+  }
+
+  return SDValue(); // No clamp pattern matched.
+}
+
+SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
+                                                 DAGCombinerInfo &DCI) const {
+  SDValue Src0 = N->getOperand(0);
+  SDValue Src1 = N->getOperand(1);
+  if (Src0.isUndef() && Src1.isUndef())
+    return DCI.DAG.getUNDEF(N->getValueType(0)); // undef, undef -> undef
+  return SDValue(); // Otherwise leave the node unchanged.
+}
+
 unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG,
                                           const SDNode *N0,
                                           const SDNode *N1) const {
@@ -4106,7 +4517,6 @@ SDValue SITargetLowering::performFAddCombine(SDNode *N,
 
   SelectionDAG &DAG = DCI.DAG;
   EVT VT = N->getValueType(0);
-  assert(!VT.isVector());
 
   SDLoc SL(N);
   SDValue LHS = N->getOperand(0);
@@ -4285,7 +4695,6 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   case AMDGPUISD::FMIN_LEGACY:
   case AMDGPUISD::FMAX_LEGACY: {
     if (DCI.getDAGCombineLevel() >= AfterLegalizeDAG &&
-        N->getValueType(0) != MVT::f64 &&
         getTargetMachine().getOptLevel() > CodeGenOpt::None)
       return performMinMaxCombine(N, DCI);
     break;
@@ -4318,6 +4727,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
     return performOrCombine(N, DCI);
   case ISD::XOR:
     return performXorCombine(N, DCI);
+  case ISD::ZERO_EXTEND:
+    return performZeroExtendCombine(N, DCI);
   case AMDGPUISD::FP_CLASS:
     return performClassCombine(N, DCI);
   case ISD::FCANONICALIZE:
@@ -4342,6 +4753,28 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
   case AMDGPUISD::CVT_F32_UBYTE2:
   case AMDGPUISD::CVT_F32_UBYTE3:
     return performCvtF32UByteNCombine(N, DCI);
+  case AMDGPUISD::FMED3:
+    return performFMed3Combine(N, DCI);
+  case AMDGPUISD::CVT_PKRTZ_F16_F32:
+    return performCvtPkRTZCombine(N, DCI);
+  case ISD::SCALAR_TO_VECTOR: {
+    SelectionDAG &DAG = DCI.DAG;
+    EVT VT = N->getValueType(0);
+
+    // v2i16 (scalar_to_vector i16:x) -> v2i16 (bitcast (any_extend i16:x))
+    if (VT == MVT::v2i16 || VT == MVT::v2f16) {
+      SDLoc SL(N);
+      SDValue Src = N->getOperand(0);
+      EVT EltVT = Src.getValueType();
+      if (EltVT == MVT::f16)
+        Src = DAG.getNode(ISD::BITCAST, SL, MVT::i16, Src);
+
+      SDValue Ext = DAG.getNode(ISD::ANY_EXTEND, SL, MVT::i32, Src);
+      return DAG.getNode(ISD::BITCAST, SL, VT, Ext);
+    }
+
+    break;
+  }
   }
   return AMDGPUTargetLowering::PerformDAGCombine(N, DCI);
 }
@@ -4370,6 +4803,10 @@ void SITargetLowering::adjustWritemask(MachineSDNode *&Node,
   for (SDNode::use_iterator I = Node->use_begin(), E = Node->use_end();
        I != E; ++I) {
 
+    // Don't look at users of the chain.
+    if (I.getUse().getResNo() != 0)
+      continue;
+
     // Abort if we can't understand the usage
     if (!I->isMachineOpcode() ||
         I->getMachineOpcode() != TargetOpcode::EXTRACT_SUBREG)
@@ -4667,6 +5104,8 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
         return std::make_pair(0U, &AMDGPU::SReg_128RegClass);
       case 256:
         return std::make_pair(0U, &AMDGPU::SReg_256RegClass);
+      case 512:
+        return std::make_pair(0U, &AMDGPU::SReg_512RegClass);
       }
 
     case 'v':
diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h
index 037b6f730c5bf8cb9029c13b111923ef1ccd661a..452ee684ef614750510c3f49a28a82eef0028b05 100644
--- a/lib/Target/AMDGPU/SIISelLowering.h
+++ b/lib/Target/AMDGPU/SIISelLowering.h
@@ -21,11 +21,13 @@
 namespace llvm {
 
 class SITargetLowering final : public AMDGPUTargetLowering {
-  SDValue LowerParameterPtr(SelectionDAG &DAG, const SDLoc &SL, SDValue Chain,
-                            unsigned Offset) const;
-  SDValue LowerParameter(SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL,
-                         SDValue Chain, unsigned Offset, bool Signed,
-                         const ISD::InputArg *Arg = nullptr) const;
+  SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
+                                   SDValue Chain, uint64_t Offset) const;
+  SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
+                                   const SDLoc &SL, SDValue Chain,
+                                   uint64_t Offset, bool Signed,
+                                   const ISD::InputArg *Arg = nullptr) const;
+
   SDValue LowerGlobalAddress(AMDGPUMachineFunction *MFI, SDValue Op,
                              SelectionDAG &DAG) const override;
   SDValue lowerImplicitZextParam(SelectionDAG &DAG, SDValue Op,
@@ -55,10 +57,16 @@ class SITargetLowering final : public AMDGPUTargetLowering {
                             const SDLoc &DL,
                             EVT VT) const;
 
+  SDValue convertArgType(
+    SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Val,
+    bool Signed, const ISD::InputArg *Arg = nullptr) const;
+
   /// \brief Custom lowering for ISD::FP_ROUND for MVT::f16.
   SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
 
-  SDValue getSegmentAperture(unsigned AS, SelectionDAG &DAG) const;
+  SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
+                             SelectionDAG &DAG) const;
+
   SDValue lowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
@@ -81,10 +89,17 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue performAndCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performOrCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performXorCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performZeroExtendCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performClassCombine(SDNode *N, DAGCombinerInfo &DCI) const;
   SDValue performFCanonicalizeCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
+  SDValue performFPMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+                                  SDValue Op0, SDValue Op1) const;
+  SDValue performIntMed3ImmCombine(SelectionDAG &DAG, const SDLoc &SL,
+                                   SDValue Op0, SDValue Op1, bool Signed) const;
   SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const;
+  SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
   unsigned getFusedOpcode(const SelectionDAG &DAG,
                           const SDNode *N0, const SDNode *N1) const;
@@ -96,7 +111,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   bool isLegalFlatAddressingMode(const AddrMode &AM) const;
   bool isLegalMUBUFAddressingMode(const AddrMode &AM) const;
 
-  bool isCFIntrinsic(const SDNode *Intr) const;
+  unsigned isCFIntrinsic(const SDNode *Intr) const;
 
   void createDebuggerPrologueStackObjects(MachineFunction &MF) const;
 
@@ -117,11 +132,15 @@ public:
 
   const SISubtarget *getSubtarget() const;
 
+  bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
+                          EVT /*VT*/) const override;
+
   bool getTgtMemIntrinsic(IntrinsicInfo &, const CallInst &,
                           unsigned IntrinsicID) const override;
 
-  bool isShuffleMaskLegal(const SmallVectorImpl<int> &/*Mask*/,
-                          EVT /*VT*/) const override;
+  bool getAddrModeArguments(IntrinsicInst * /*I*/,
+                            SmallVectorImpl<Value*> &/*Ops*/,
+                            Type *&/*AccessTy*/) const override;
 
   bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
                              unsigned AS) const override;
diff --git a/lib/Target/AMDGPU/SIInsertSkips.cpp b/lib/Target/AMDGPU/SIInsertSkips.cpp
index 9d6feaa94fb7196bdd375700ad76278a6a54b144..ba346d2fad02cc71e5b32f5b79681c376973ce07 100644
--- a/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -318,14 +318,14 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
         MI.eraseFromParent();
         break;
 
-      case AMDGPU::SI_RETURN:
+      case AMDGPU::SI_RETURN_TO_EPILOG:
         // FIXME: Should move somewhere else
         assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
 
         // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
         // because external bytecode will be appended at the end.
         if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
-          // SI_RETURN is not the last instruction. Add an empty block at
+          // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at
           // the end and jump there.
           if (!EmptyMBBAtEnd) {
             EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
diff --git a/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c2a3e62aa82747124e1424afdbc337fb5e552686
--- /dev/null
+++ b/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -0,0 +1,1863 @@
+//===-- SIInsertWaitcnts.cpp - Insert Wait Instructions -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// \brief Insert wait instructions for memory reads and writes.
+///
+/// Memory reads and writes are issued asynchronously, so we need to insert
+/// S_WAITCNT instructions when we want to access any of their results or
+/// overwrite any register that's used asynchronously.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+
+#define DEBUG_TYPE "si-insert-waitcnts"
+
+using namespace llvm;
+
+namespace {
+
+// Class of object that encapsulates latest instruction counter score
+// associated with the operand.  Used for determining whether
+// s_waitcnt instruction needs to be emitted.
+
+#define CNT_MASK(t) (1u << (t)) // Bit mask with only bit 't' set.
+// Wait counters tracked here; the values index the per-counter score arrays.
+enum InstCounterType { VM_CNT = 0, LGKM_CNT, EXP_CNT, NUM_INST_CNTS };
+// Half-open [first, second) range of score slots; {-1, -1} means "none".
+typedef std::pair<signed, signed> RegInterval;
+
+struct {
+  int32_t VmcntMax;    // Limit reported by getWaitCountMax(VM_CNT).
+  int32_t ExpcntMax;   // Limit reported by getWaitCountMax(EXP_CNT).
+  int32_t LgkmcntMax;  // Limit reported by getWaitCountMax(LGKM_CNT).
+  int32_t NumVGPRsMax; // presumably the VGPR count limit -- TODO confirm.
+  int32_t NumSGPRsMax; // presumably the SGPR count limit -- TODO confirm.
+} HardwareLimits;
+
+struct {
+  unsigned VGPR0; // Encoding of the first VGPR; maps to score slot 0.
+  unsigned VGPRL; // Encoding of the last VGPR (inclusive upper bound).
+  unsigned SGPR0; // Encoding of the first SGPR.
+  unsigned SGPRL; // presumably the last SGPR encoding -- TODO confirm.
+} RegisterEncoding;
+
+enum WaitEventType {
+  VMEM_ACCESS,      // vector-memory read & write -> VM_CNT
+  LDS_ACCESS,       // lds read & write -> LGKM_CNT
+  GDS_ACCESS,       // gds read & write -> LGKM_CNT
+  SQ_MESSAGE,       // send message -> LGKM_CNT
+  SMEM_ACCESS,      // scalar-memory read & write -> LGKM_CNT
+  EXP_GPR_LOCK,     // export holding on its data src -> EXP_CNT
+  GDS_GPR_LOCK,     // GDS holding on its data and addr src -> EXP_CNT
+  EXP_POS_ACCESS,   // write to export position -> EXP_CNT
+  EXP_PARAM_ACCESS, // write to export parameter -> EXP_CNT
+  VMW_GPR_LOCK,     // vector-memory write holding on its data src -> EXP_CNT
+  NUM_WAIT_EVENTS,  // number of event kinds; sizes the EventUBs array
+};
+
+// The mapping is:
+//  0                .. SQ_MAX_PGM_VGPRS-1               real VGPRs
+//  SQ_MAX_PGM_VGPRS .. NUM_ALL_VGPRS-1                  extra VGPR-like slots
+//  NUM_ALL_VGPRS    .. NUM_ALL_VGPRS+SQ_MAX_PGM_SGPRS-1 real SGPRs
+// We reserve a fixed number of VGPR slots in the scoring tables for
+// special tokens like SCMEM_LDS (needed for buffer load to LDS).
+enum RegisterMapping {
+  SQ_MAX_PGM_VGPRS = 256, // Maximum programmable VGPRs across all targets.
+  SQ_MAX_PGM_SGPRS = 256, // Maximum programmable SGPRs across all targets.
+  NUM_EXTRA_VGPRS = 1,    // A reserved slot for DS.
+  EXTRA_VGPR_LDS = 0,     // Offset of the DS slot past SQ_MAX_PGM_VGPRS.
+  NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
+};
+
+#define ForAllWaitEventType(w) /* w visits each WaitEventType in order */      \
+  for (enum WaitEventType w = (enum WaitEventType)0;                           \
+       (w) < (enum WaitEventType)NUM_WAIT_EVENTS;                              \
+       (w) = (enum WaitEventType)((w) + 1))
+
+// This is a per-basic-block object that maintains current score brackets
+// of each wait-counter, and a per-register scoreboard for each wait-counter.
+// We also maintain the latest score for every event type that can change the
+// waitcnt in order to know if there are multiple types of events within
+// the brackets. When multiple types of event happen in the bracket,
+// wait-count may get decreased out of order, therefore we need to put in
+// "s_waitcnt 0" before use.
+class BlockWaitcntBrackets {
+public:
+  static int32_t getWaitCountMax(InstCounterType T) {
+    switch (T) {
+    case VM_CNT:
+      return HardwareLimits.VmcntMax;
+    case LGKM_CNT:
+      return HardwareLimits.LgkmcntMax;
+    case EXP_CNT:
+      return HardwareLimits.ExpcntMax;
+    default:
+      break;
+    }
+    return 0; // Unknown counter: report no capacity.
+  }
+
+  void setScoreLB(InstCounterType T, int32_t Val) {
+    assert(T < NUM_INST_CNTS);
+    if (T >= NUM_INST_CNTS) // Still guards builds where assert is disabled.
+      return;
+    ScoreLBs[T] = Val;
+  }
+
+  void setScoreUB(InstCounterType T, int32_t Val) {
+    assert(T < NUM_INST_CNTS);
+    if (T >= NUM_INST_CNTS)
+      return;
+    ScoreUBs[T] = Val;
+    if (T == EXP_CNT) { // Keep the EXP bracket no wider than ExpcntMax.
+      int32_t UB = (int)(ScoreUBs[T] - getWaitCountMax(EXP_CNT));
+      if (ScoreLBs[T] < UB)
+        ScoreLBs[T] = UB;
+    }
+  }
+
+  int32_t getScoreLB(InstCounterType T) const {
+    assert(T < NUM_INST_CNTS);
+    if (T >= NUM_INST_CNTS)
+      return 0;
+    return ScoreLBs[T];
+  }
+
+  int32_t getScoreUB(InstCounterType T) const {
+    assert(T < NUM_INST_CNTS);
+    if (T >= NUM_INST_CNTS)
+      return 0;
+    return ScoreUBs[T];
+  }
+
+  // Mapping from event to counter.
+  InstCounterType eventCounter(WaitEventType E) {
+    switch (E) {
+    case VMEM_ACCESS:
+      return VM_CNT;
+    case LDS_ACCESS:
+    case GDS_ACCESS:
+    case SQ_MESSAGE:
+    case SMEM_ACCESS:
+      return LGKM_CNT;
+    case EXP_GPR_LOCK:
+    case GDS_GPR_LOCK:
+    case VMW_GPR_LOCK:
+    case EXP_POS_ACCESS:
+    case EXP_PARAM_ACCESS:
+      return EXP_CNT;
+    default:
+      llvm_unreachable("unhandled event type");
+    }
+    return NUM_INST_CNTS;
+  }
+
+  void setRegScore(int GprNo, InstCounterType T, int32_t Val) {
+    if (GprNo < NUM_ALL_VGPRS) {
+      if (GprNo > VgprUB) {
+        VgprUB = GprNo;
+      }
+      VgprScores[T][GprNo] = Val;
+    } else { // SGPR slot: only an LGKM_CNT score is stored.
+      assert(T == LGKM_CNT);
+      if (GprNo - NUM_ALL_VGPRS > SgprUB) {
+        SgprUB = GprNo - NUM_ALL_VGPRS;
+      }
+      SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
+    }
+  }
+
+  int32_t getRegScore(int GprNo, InstCounterType T) const {
+    if (GprNo < NUM_ALL_VGPRS) {
+      return VgprScores[T][GprNo];
+    }
+    return SgprScores[GprNo - NUM_ALL_VGPRS]; // T is ignored for SGPR slots.
+  }
+
+  void clear() {
+    memset(ScoreLBs, 0, sizeof(ScoreLBs));
+    memset(ScoreUBs, 0, sizeof(ScoreUBs));
+    memset(EventUBs, 0, sizeof(EventUBs));
+    // VgprScores is one contiguous 2-D array, so a single memset clears the
+    // rows of every counter.
+    // NOTE(review): LastFlat, VgprUB and SgprUB are not reset -- confirm.
+    memset(VgprScores, 0, sizeof(VgprScores));
+    memset(SgprScores, 0, sizeof(SgprScores));
+  }
+
+  RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
+                             const MachineRegisterInfo *MRI,
+                             const SIRegisterInfo *TRI, unsigned OpNo,
+                             bool Def) const;
+
+  void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
+                   const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
+                   unsigned OpNo, int32_t Val);
+
+  void setWaitAtBeginning() { WaitAtBeginning = true; }
+  void clearWaitAtBeginning() { WaitAtBeginning = false; }
+  bool getWaitAtBeginning() const { return WaitAtBeginning; }
+  void setEventUB(enum WaitEventType W, int32_t Val) { EventUBs[W] = Val; }
+  int32_t getMaxVGPR() const { return VgprUB; }
+  int32_t getMaxSGPR() const { return SgprUB; }
+  int32_t getEventUB(enum WaitEventType W) const {
+    assert(W < NUM_WAIT_EVENTS);
+    return EventUBs[W];
+  }
+  bool counterOutOfOrder(InstCounterType T);
+  unsigned int updateByWait(InstCounterType T, int ScoreToWait);
+  void updateByEvent(const SIInstrInfo *TII, const SIRegisterInfo *TRI,
+                     const MachineRegisterInfo *MRI, WaitEventType E,
+                     MachineInstr &MI);
+
+  // Every scalar member starts cleared. RevisitLoop is listed explicitly so
+  // that getRevisitLoop() never reads an indeterminate value.
+  BlockWaitcntBrackets()
+      : WaitAtBeginning(false), RevisitLoop(false), ValidLoop(false),
+        MixedExpTypes(false), LoopRegion(nullptr), PostOrder(0),
+        Waitcnt(nullptr), VgprUB(0), SgprUB(0) {
+    memset(VgprScores, 0, sizeof(VgprScores));
+  }
+  ~BlockWaitcntBrackets() = default;
+
+  bool hasPendingSMEM() const { // SMEM score inside the LGKM bracket?
+    return (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
+            EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]);
+  }
+
+  bool hasPendingFlat() const { // Last FLAT score inside either bracket?
+    return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
+             LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
+            (LastFlat[VM_CNT] > ScoreLBs[VM_CNT] &&
+             LastFlat[VM_CNT] <= ScoreUBs[VM_CNT]));
+  }
+
+  void setPendingFlat() { // Snapshot the current upper bounds as "last FLAT".
+    LastFlat[VM_CNT] = ScoreUBs[VM_CNT];
+    LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
+  }
+
+  int pendingFlat(InstCounterType Ct) const { return LastFlat[Ct]; }
+
+  void setLastFlat(InstCounterType Ct, int Val) { LastFlat[Ct] = Val; }
+
+  bool getRevisitLoop() const { return RevisitLoop; }
+  void setRevisitLoop(bool RevisitLoopIn) { RevisitLoop = RevisitLoopIn; }
+
+  void setPostOrder(int32_t PostOrderIn) { PostOrder = PostOrderIn; }
+  int32_t getPostOrder() const { return PostOrder; }
+
+  void setWaitcnt(MachineInstr *WaitcntIn) { Waitcnt = WaitcntIn; }
+  void clearWaitcnt() { Waitcnt = nullptr; }
+  MachineInstr *getWaitcnt() const { return Waitcnt; }
+
+  bool mixedExpTypes() const { return MixedExpTypes; }
+  void setMixedExpTypes(bool MixedExpTypesIn) {
+    MixedExpTypes = MixedExpTypesIn;
+  }
+
+  void print(raw_ostream &);
+  void dump() { print(dbgs()); }
+
+private:
+  bool WaitAtBeginning;
+  bool RevisitLoop;
+  bool ValidLoop;
+  bool MixedExpTypes;
+  MachineLoop *LoopRegion;
+  int32_t PostOrder;
+  MachineInstr *Waitcnt;
+  int32_t ScoreLBs[NUM_INST_CNTS] = {0};
+  int32_t ScoreUBs[NUM_INST_CNTS] = {0};
+  int32_t EventUBs[NUM_WAIT_EVENTS] = {0};
+  // Remember the last flat memory operation.
+  int32_t LastFlat[NUM_INST_CNTS] = {0};
+  // wait_cnt scores for every vgpr.
+  // Keep track of the VgprUB and SgprUB to make merge at join efficient.
+  int32_t VgprUB;
+  int32_t SgprUB;
+  int32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
+  // Wait cnt scores for every sgpr, only lgkmcnt is relevant.
+  int32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
+};
+
+// This is a per-loop-region object that records waitcnt status at the end of
+// loop footer from the previous iteration. We also maintain an iteration
+// count to track the number of times the loop has been visited. When it
+// doesn't converge naturally, we force convergence by inserting s_waitcnt 0
+// at the end of the loop footer.
+class LoopWaitcntData {
+public:
+  void incIterCnt() { IterCnt++; }
+  void resetIterCnt() { IterCnt = 0; }
+  int32_t getIterCnt() const { return IterCnt; }
+
+  LoopWaitcntData() : LfWaitcnt(nullptr), IterCnt(0) {}
+  ~LoopWaitcntData() = default;
+
+  void setWaitcnt(MachineInstr *WaitcntIn) { LfWaitcnt = WaitcntIn; }
+  MachineInstr *getWaitcnt() const { return LfWaitcnt; }
+
+  // Writes the iteration count through LLVM's DEBUG() output machinery.
+  void print() {
+    DEBUG(dbgs() << "  iteration " << IterCnt << '\n';);
+  }
+
+private:
+  // s_waitcnt added at the end of loop footer to stabilize wait scores
+  // at the end of the loop footer.
+  MachineInstr *LfWaitcnt;
+  // Number of iterations the loop has been visited, not including the initial
+  // walk over.
+  int32_t IterCnt;
+};
+
+class SIInsertWaitcnts : public MachineFunctionPass {
+  // Machine-function pass holding the per-block and per-loop waitcnt state.
+private:
+  const SISubtarget *ST;
+  const SIInstrInfo *TII;
+  const SIRegisterInfo *TRI;
+  const MachineRegisterInfo *MRI;
+  const MachineLoopInfo *MLI;
+  AMDGPU::IsaInfo::IsaVersion IV;
+  AMDGPUAS AMDGPUASI;
+
+  DenseSet<MachineBasicBlock *> BlockVisitedSet;
+  DenseSet<MachineInstr *> CompilerGeneratedWaitcntSet;
+  DenseSet<MachineInstr *> VCCZBugHandledSet;
+
+  DenseMap<MachineBasicBlock *, std::unique_ptr<BlockWaitcntBrackets>>
+      BlockWaitcntBracketsMap; // Owns the bracket object stored per block.
+
+  DenseSet<MachineBasicBlock *> BlockWaitcntProcessedSet;
+
+  DenseMap<MachineLoop *, std::unique_ptr<LoopWaitcntData>> LoopWaitcntDataMap;
+
+  std::vector<std::unique_ptr<BlockWaitcntBrackets>> KillWaitBrackets;
+
+public:
+  static char ID;
+
+  SIInsertWaitcnts()
+      : MachineFunctionPass(ID), ST(nullptr), TII(nullptr), TRI(nullptr),
+        MRI(nullptr), MLI(nullptr) {} // IV and AMDGPUASI: default-initialized.
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  StringRef getPassName() const override {
+    return "SI insert wait instructions";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();              // Declares the CFG as preserved.
+    AU.addRequired<MachineLoopInfo>(); // Loop info is a required analysis.
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  void addKillWaitBracket(BlockWaitcntBrackets *Bracket) {
+    // The waitcnt information is copied because it changes as the block is
+    // traversed.
+    KillWaitBrackets.push_back(make_unique<BlockWaitcntBrackets>(*Bracket));
+  }
+
+  MachineInstr *generateSWaitCntInstBefore(MachineInstr &MI,
+                                           BlockWaitcntBrackets *ScoreBrackets);
+  void updateEventWaitCntAfter(MachineInstr &Inst,
+                               BlockWaitcntBrackets *ScoreBrackets);
+  void mergeInputScoreBrackets(MachineBasicBlock &Block);
+  MachineBasicBlock *loopBottom(const MachineLoop *Loop);
+  void insertWaitcntInBlock(MachineFunction &MF, MachineBasicBlock &Block);
+  void insertWaitcntBeforeCF(MachineBasicBlock &Block, MachineInstr *Inst);
+};
+
+} // End anonymous namespace.
+// Maps operand OpNo of MI to its [first, second) score slots, or {-1, -1}.
+RegInterval BlockWaitcntBrackets::getRegInterval(const MachineInstr *MI,
+                                                 const SIInstrInfo *TII,
+                                                 const MachineRegisterInfo *MRI,
+                                                 const SIRegisterInfo *TRI,
+                                                 unsigned OpNo,
+                                                 bool Def) const {
+  const MachineOperand &Op = MI->getOperand(OpNo);
+  if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
+      (Def && !Op.isDef()))
+    return {-1, -1}; // Non-register, non-allocatable, or a non-def when Def.
+
+  // A use via a PW operand does not need a waitcnt.
+  // A partial write is not a WAW.
+  assert(!Op.getSubReg() || !Op.isUndef());
+
+  RegInterval Result;
+  const MachineRegisterInfo &MRIA = *MRI;
+
+  unsigned Reg = TRI->getEncodingValue(Op.getReg()); // Hardware encoding.
+
+  if (TRI->isVGPR(MRIA, Op.getReg())) {
+    assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
+    Result.first = Reg - RegisterEncoding.VGPR0; // VGPR slots start at 0.
+    assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
+  } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
+    assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
+    Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
+    assert(Result.first >= NUM_ALL_VGPRS &&
+           Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
+  }
+  // TODO: Handle TTMP
+  // else if (TRI->isTTMP(MRIA, Op.getReg())) ...
+  else
+    return {-1, -1};
+
+  const MachineInstr &MIA = *MI;
+  const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
+  unsigned Size = RC->getSize();
+  Result.second = Result.first + (Size / 4); // One slot per 4 units of Size.
+
+  return Result;
+}
+// Stamps score Val on every EXP_CNT slot covered by operand OpNo of MI.
+void BlockWaitcntBrackets::setExpScore(const MachineInstr *MI,
+                                       const SIInstrInfo *TII,
+                                       const SIRegisterInfo *TRI,
+                                       const MachineRegisterInfo *MRI,
+                                       unsigned OpNo, int32_t Val) {
+  const RegInterval Slots = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
+  DEBUG({
+    const MachineOperand &MO = MI->getOperand(OpNo);
+    assert(TRI->isVGPR(*MRI, MO.getReg()));
+  });
+  for (signed Slot = Slots.first, End = Slots.second; Slot < End; ++Slot) {
+    setRegScore(Slot, EXP_CNT, Val);
+  }
+}
+// Records event E of Inst: bumps E's counter, then scores the registers.
+void BlockWaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
+                                         const SIRegisterInfo *TRI,
+                                         const MachineRegisterInfo *MRI,
+                                         WaitEventType E, MachineInstr &Inst) {
+  const MachineRegisterInfo &MRIA = *MRI;
+  InstCounterType T = eventCounter(E);
+  int32_t CurrScore = getScoreUB(T) + 1;
+  // EventUB and ScoreUB need to be updated regardless of whether this event
+  // changes the score of a register or not.
+  // Examples include vm_cnt for buffer-store or lgkm_cnt for send-message.
+  EventUBs[E] = CurrScore;
+  setScoreUB(T, CurrScore);
+
+  if (T == EXP_CNT) {
+    // Check for mixed export types. If they are mixed, then a waitcnt exp(0)
+    // is required.
+    if (!MixedExpTypes) {
+      MixedExpTypes = counterOutOfOrder(EXP_CNT);
+    }
+
+    // Put score on the source vgprs. If this is a store, just use those
+    // specific register(s).
+    if (TII->isDS(Inst) && (Inst.mayStore() || Inst.mayLoad())) {
+      // All GDS operations must protect their address register (same as
+      // export.)
+      if (Inst.getOpcode() != AMDGPU::DS_APPEND &&
+          Inst.getOpcode() != AMDGPU::DS_CONSUME) {
+        setExpScore(
+            &Inst, TII, TRI, MRI,
+            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::addr),
+            CurrScore);
+      }
+      if (Inst.mayStore()) {
+        setExpScore(
+            &Inst, TII, TRI, MRI,
+            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data0),
+            CurrScore);
+        if (AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+                                       AMDGPU::OpName::data1) != -1) {
+          setExpScore(&Inst, TII, TRI, MRI,
+                      AMDGPU::getNamedOperandIdx(Inst.getOpcode(),
+                                                 AMDGPU::OpName::data1),
+                      CurrScore);
+        }
+      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1 &&
+                 Inst.getOpcode() != AMDGPU::DS_GWS_INIT &&
+                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_V &&
+                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_BR &&
+                 Inst.getOpcode() != AMDGPU::DS_GWS_SEMA_P &&
+                 Inst.getOpcode() != AMDGPU::DS_GWS_BARRIER &&
+                 Inst.getOpcode() != AMDGPU::DS_APPEND &&
+                 Inst.getOpcode() != AMDGPU::DS_CONSUME &&
+                 Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
+        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+          const MachineOperand &Op = Inst.getOperand(I);
+          if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
+            setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
+          }
+        }
+      }
+    } else if (TII->isFLAT(Inst)) {
+      if (Inst.mayStore()) {
+        setExpScore(
+            &Inst, TII, TRI, MRI,
+            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+            CurrScore);
+      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+        setExpScore(
+            &Inst, TII, TRI, MRI,
+            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+            CurrScore);
+      }
+    } else if (TII->isMIMG(Inst)) {
+      if (Inst.mayStore()) {
+        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
+      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+        setExpScore(
+            &Inst, TII, TRI, MRI,
+            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+            CurrScore);
+      }
+    } else if (TII->isMTBUF(Inst)) {
+      if (Inst.mayStore()) {
+        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
+      }
+    } else if (TII->isMUBUF(Inst)) {
+      if (Inst.mayStore()) {
+        setExpScore(&Inst, TII, TRI, MRI, 0, CurrScore);
+      } else if (AMDGPU::getAtomicNoRetOp(Inst.getOpcode()) != -1) {
+        setExpScore(
+            &Inst, TII, TRI, MRI,
+            AMDGPU::getNamedOperandIdx(Inst.getOpcode(), AMDGPU::OpName::data),
+            CurrScore);
+      }
+    } else {
+      if (TII->isEXP(Inst)) {
+        // For export the destination registers are really temps that
+        // can be used as the actual source after export patching, so
+        // we need to treat them like sources and set the EXP_CNT
+        // score.
+        for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+          MachineOperand &DefMO = Inst.getOperand(I);
+          if (DefMO.isReg() && DefMO.isDef() &&
+              TRI->isVGPR(MRIA, DefMO.getReg())) {
+            setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
+                        CurrScore);
+          }
+        }
+      }
+      for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+        MachineOperand &MO = Inst.getOperand(I);
+        if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
+          setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
+        }
+      }
+    }
+#if 0 // TODO: check if this is handled by MUBUF code above.
+  } else if (Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORD ||
+	     Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX2 ||
+	     Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
+    MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
+    unsigned OpNo;//TODO: find the OpNo for this operand;
+    RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
+    for (signed RegNo = Interval.first; RegNo < Interval.second;
+	 ++RegNo) {
+      setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
+    }
+#endif
+  } else {
+    // Match the score to the destination registers.
+    for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
+      RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
+      if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
+        continue; // SGPR slots carry no VM_CNT score.
+      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+        setRegScore(RegNo, T, CurrScore);
+      }
+    }
+    if (TII->isDS(Inst) && Inst.mayStore()) {
+      setRegScore(SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS, T, CurrScore);
+    }
+  }
+}
+// Prints, per counter, the bracket width and each register score above LB.
+void BlockWaitcntBrackets::print(raw_ostream &OS) {
+  OS << '\n';
+  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+       T = (enum InstCounterType)(T + 1)) {
+    int LB = getScoreLB(T);
+    int UB = getScoreUB(T);
+
+    switch (T) {
+    case VM_CNT:
+      OS << "    VM_CNT(" << UB - LB << "): ";
+      break;
+    case LGKM_CNT:
+      OS << "    LGKM_CNT(" << UB - LB << "): ";
+      break;
+    case EXP_CNT:
+      OS << "    EXP_CNT(" << UB - LB << "): ";
+      break;
+    default:
+      OS << "    UNKNOWN(" << UB - LB << "): ";
+      break;
+    }
+
+    if (LB < UB) {
+      // Print vgpr scores.
+      for (int J = 0; J <= getMaxVGPR(); J++) {
+        int RegScore = getRegScore(J, T);
+        if (RegScore <= LB)
+          continue; // At or below the lower bound: nothing pending.
+        int RelScore = RegScore - LB - 1; // Zero-based offset past LB.
+        if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
+          OS << RelScore << ":v" << J << " ";
+        } else {
+          OS << RelScore << ":ds ";
+        }
+      }
+      // Also need to print sgpr scores for lgkm_cnt.
+      if (T == LGKM_CNT) {
+        for (int J = 0; J <= getMaxSGPR(); J++) {
+          int RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
+          if (RegScore <= LB)
+            continue;
+          int RelScore = RegScore - LB - 1;
+          OS << RelScore << ":s" << J << " ";
+        }
+      }
+    }
+    OS << '\n';
+  }
+  OS << '\n';
+  return;
+}
+// Returns CNT_MASK(T) if waiting on ScoreToWait is needed, otherwise 0.
+unsigned int BlockWaitcntBrackets::updateByWait(InstCounterType T,
+                                                int ScoreToWait) {
+  unsigned int NeedWait = 0;
+  if (ScoreToWait == -1) {
+    // The score to wait is unknown. This implies that it was not encountered
+    // during the path of the CFG walk done during the current traversal but
+    // may be seen on a different path. Emit an s_wait counter with a
+    // conservative value of 0 for the counter.
+    NeedWait = CNT_MASK(T);
+    setScoreLB(T, getScoreUB(T));
+    return NeedWait;
+  }
+
+  // If the score of src_operand falls within the bracket, we need an
+  // s_waitcnt instruction.
+  const int32_t LB = getScoreLB(T);
+  const int32_t UB = getScoreUB(T);
+  if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
+    if (T == VM_CNT && hasPendingFlat()) {
+      // If there is a pending FLAT operation, and this is a VM waitcnt,
+      // then we need to force a waitcnt 0 for VM.
+      NeedWait = CNT_MASK(T);
+      setScoreLB(T, getScoreUB(T));
+    } else if (counterOutOfOrder(T)) {
+      // Counter can get decremented out-of-order when there are multiple
+      // types of event in the bracket. Also emit an s_wait counter with a
+      // conservative value of 0 for the counter.
+      NeedWait = CNT_MASK(T);
+      setScoreLB(T, getScoreUB(T));
+    } else {
+      NeedWait = CNT_MASK(T);
+      setScoreLB(T, ScoreToWait); // In-order case: wait only up to the score.
+    }
+  }
+
+  return NeedWait;
+}
+
+// When there are multiple types of event in the bracket of a counter,
+// the decrement may go out of order; returns true in that case.
+bool BlockWaitcntBrackets::counterOutOfOrder(InstCounterType T) {
+  switch (T) {
+  case VM_CNT:
+    return false; // VM_CNT is always treated as in order here.
+  case LGKM_CNT: {
+    if (EventUBs[SMEM_ACCESS] > ScoreLBs[LGKM_CNT] &&
+        EventUBs[SMEM_ACCESS] <= ScoreUBs[LGKM_CNT]) {
+      // Scalar memory read always can go out of order.
+      return true;
+    }
+    int NumEventTypes = 0; // Event kinds whose score lies in (LB, UB].
+    if (EventUBs[LDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
+        EventUBs[LDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
+      NumEventTypes++;
+    }
+    if (EventUBs[GDS_ACCESS] > ScoreLBs[LGKM_CNT] &&
+        EventUBs[GDS_ACCESS] <= ScoreUBs[LGKM_CNT]) {
+      NumEventTypes++;
+    }
+    if (EventUBs[SQ_MESSAGE] > ScoreLBs[LGKM_CNT] &&
+        EventUBs[SQ_MESSAGE] <= ScoreUBs[LGKM_CNT]) {
+      NumEventTypes++;
+    }
+    if (NumEventTypes <= 1) {
+      return false;
+    }
+    break;
+  }
+  case EXP_CNT: {
+    // If there has been a mixture of export types, then a waitcnt exp(0) is
+    // required.
+    if (MixedExpTypes)
+      return true;
+    int NumEventTypes = 0; // Event kinds whose score lies in (LB, UB].
+    if (EventUBs[EXP_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
+        EventUBs[EXP_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
+      NumEventTypes++;
+    }
+    if (EventUBs[GDS_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
+        EventUBs[GDS_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
+      NumEventTypes++;
+    }
+    if (EventUBs[VMW_GPR_LOCK] > ScoreLBs[EXP_CNT] &&
+        EventUBs[VMW_GPR_LOCK] <= ScoreUBs[EXP_CNT]) {
+      NumEventTypes++;
+    }
+    if (EventUBs[EXP_PARAM_ACCESS] > ScoreLBs[EXP_CNT] &&
+        EventUBs[EXP_PARAM_ACCESS] <= ScoreUBs[EXP_CNT]) {
+      NumEventTypes++;
+    }
+
+    if (EventUBs[EXP_POS_ACCESS] > ScoreLBs[EXP_CNT] &&
+        EventUBs[EXP_POS_ACCESS] <= ScoreUBs[EXP_CNT]) {
+      NumEventTypes++;
+    }
+
+    if (NumEventTypes <= 1) {
+      return false;
+    }
+    break;
+  }
+  default:
+    break;
+  }
+  return true; // Several event kinds pending, or an unrecognized counter.
+}
+
+INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
+                      false)
+INITIALIZE_PASS_END(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
+                    false)
+// NOTE(review): no INITIALIZE_PASS_DEPENDENCY for MachineLoopInfo -- confirm.
+char SIInsertWaitcnts::ID = 0;
+// Reference to the pass ID, published in the llvm namespace.
+char &llvm::SIInsertWaitcntsID = SIInsertWaitcnts::ID;
+// Factory: returns a newly heap-allocated SIInsertWaitcnts instance.
+FunctionPass *llvm::createSIInsertWaitcntsPass() {
+  return new SIInsertWaitcnts();
+}
+// True for S_CBRANCH_VCCZ / S_CBRANCH_VCCNZ whose operand 1 is not undef.
+static bool readsVCCZ(const MachineInstr &MI) {
+  unsigned Opc = MI.getOpcode();
+  return (Opc == AMDGPU::S_CBRANCH_VCCNZ || Opc == AMDGPU::S_CBRANCH_VCCZ) &&
+         !MI.getOperand(1).isUndef();
+}
+
+///  \brief Generate s_waitcnt instruction to be placed before cur_Inst.
+///  Instructions of a given type are returned in order,
+///  but instructions of different types can complete out of order.
+///  We rely on this in-order completion
+///  and simply assign a score to the memory access instructions.
+///  We keep track of the active "score bracket" to determine
+///  if an access of a memory read requires an s_waitcnt
+///  and if so what the value of each counter is.
+///  The "score bracket" is bound by the lower bound and upper bound
+///  scores (*_score_LB and *_score_ub respectively).
+MachineInstr *SIInsertWaitcnts::generateSWaitCntInstBefore(
+    MachineInstr &MI, BlockWaitcntBrackets *ScoreBrackets) {
+  // To emit, or not to emit - that's the question!
+  // Start with an assumption that there is no need to emit.
+  unsigned int EmitSwaitcnt = 0;
+  // s_waitcnt instruction to return; default is NULL.
+  MachineInstr *SWaitInst = nullptr;
+  // No need to wait before phi. If a phi-move exists, then the wait should
+  // have been inserted before the move. If a phi-move does not exist, then
+  // wait should be inserted before the real use. The same is true for
+  // sc-merge. It is not a coincidence that all these cases correspond to the
+  // instructions that are skipped in the assembling loop.
+  bool NeedLineMapping = false; // TODO: Check on this.
+  if (MI.isDebugValue() &&
+      // TODO: any other opcode?
+      !NeedLineMapping) {
+    return SWaitInst;
+  }
+
+  // See if an s_waitcnt is forced at block entry, or is needed at
+  // program end.
+  if (ScoreBrackets->getWaitAtBeginning()) {
+    // Note that we have already cleared the state, so we don't need to update
+    // it.
+    ScoreBrackets->clearWaitAtBeginning();
+    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+         T = (enum InstCounterType)(T + 1)) {
+      EmitSwaitcnt |= CNT_MASK(T);
+      ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
+    }
+  }
+
+  // See if this instruction has a forced S_WAITCNT VM.
+  // TODO: Handle other cases of NeedsWaitcntVmBefore()
+  else if (MI.getOpcode() == AMDGPU::BUFFER_WBINVL1 ||
+           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_SC ||
+           MI.getOpcode() == AMDGPU::BUFFER_WBINVL1_VOL) {
+    EmitSwaitcnt |=
+        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+  }
+
+  // All waits must be resolved at call return.
+  // NOTE: this could be improved with knowledge of all call sites or
+  //   with knowledge of the called routines.
+  if (MI.getOpcode() == AMDGPU::RETURN ||
+      MI.getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) {
+    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+         T = (enum InstCounterType)(T + 1)) {
+      if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
+        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
+        EmitSwaitcnt |= CNT_MASK(T);
+      }
+    }
+  }
+  // Resolve vm waits before gs-done.
+  else if ((MI.getOpcode() == AMDGPU::S_SENDMSG ||
+            MI.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
+           ((MI.getOperand(0).getImm() & AMDGPU::SendMsg::ID_MASK_) ==
+            AMDGPU::SendMsg::ID_GS_DONE)) {
+    if (ScoreBrackets->getScoreUB(VM_CNT) > ScoreBrackets->getScoreLB(VM_CNT)) {
+      ScoreBrackets->setScoreLB(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+      EmitSwaitcnt |= CNT_MASK(VM_CNT);
+    }
+  }
+#if 0 // TODO: the following blocks of logic when we have fence.
+  else if (MI.getOpcode() == SC_FENCE) {
+    const unsigned int group_size =
+      context->shader_info->GetMaxThreadGroupSize();
+    // group_size == 0 means thread group size is unknown at compile time
+    const bool group_is_multi_wave =
+      (group_size == 0 || group_size > target_info->GetWaveFrontSize());
+    const bool fence_is_global = !((SCInstInternalMisc*)Inst)->IsGroupFence();
+
+    for (unsigned int i = 0; i < Inst->NumSrcOperands(); i++) {
+      SCRegType src_type = Inst->GetSrcType(i);
+      switch (src_type) {
+        case SCMEM_LDS:
+          if (group_is_multi_wave ||
+	      context->OptFlagIsOn(OPT_R1100_LDSMEM_FENCE_CHICKEN_BIT)) {
+            EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
+                               ScoreBrackets->getScoreUB(LGKM_CNT));
+            // LDS may have to wait for VM_CNT after buffer load to LDS
+            if (target_info->HasBufferLoadToLDS()) {
+              EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
+                                 ScoreBrackets->getScoreUB(VM_CNT));
+            }
+          }
+          break;
+
+        case SCMEM_GDS:
+          if (group_is_multi_wave || fence_is_global) {
+            EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
+			       ScoreBrackets->getScoreUB(EXP_CNT));
+            EmitSwaitcnt |= ScoreBrackets->updateByWait(LGKM_CNT,
+			       ScoreBrackets->getScoreUB(LGKM_CNT));
+          }
+          break;
+
+        case SCMEM_UAV:
+        case SCMEM_TFBUF:
+        case SCMEM_RING:
+        case SCMEM_SCATTER:
+          if (group_is_multi_wave || fence_is_global) {
+            EmitSwaitcnt |= ScoreBrackets->updateByWait(EXP_CNT,
+			       ScoreBrackets->getScoreUB(EXP_CNT));
+            EmitSwaitcnt |= ScoreBrackets->updateByWait(VM_CNT,
+			       ScoreBrackets->getScoreUB(VM_CNT));
+          }
+          break;
+
+        case SCMEM_SCRATCH:
+        default:
+          break;
+      }
+    }
+  }
+#endif
+
+  // Export & GDS instructions do not read the EXEC mask until after the export
+  // is granted (which can occur well after the instruction is issued).
+  // The shader program must flush all EXP operations on the export-count
+  // before overwriting the EXEC mask.
+  else {
+    if (MI.modifiesRegister(AMDGPU::EXEC, TRI)) {
+      // Export and GDS are tracked individually, either may trigger a waitcnt
+      // for EXEC.
+      EmitSwaitcnt |= ScoreBrackets->updateByWait(
+          EXP_CNT, ScoreBrackets->getEventUB(EXP_GPR_LOCK));
+      EmitSwaitcnt |= ScoreBrackets->updateByWait(
+          EXP_CNT, ScoreBrackets->getEventUB(EXP_PARAM_ACCESS));
+      EmitSwaitcnt |= ScoreBrackets->updateByWait(
+          EXP_CNT, ScoreBrackets->getEventUB(EXP_POS_ACCESS));
+      EmitSwaitcnt |= ScoreBrackets->updateByWait(
+          EXP_CNT, ScoreBrackets->getEventUB(GDS_GPR_LOCK));
+    }
+
+#if 0 // TODO: the following code to handle CALL.
+    // The argument passing for CALLs should suffice for VM_CNT and LGKM_CNT.
+    // However, there is a problem with EXP_CNT, because the call cannot
+    // easily tell if a register is used in the function, and if it did, then
+    // the referring instruction would have to have an S_WAITCNT, which is
+    // dependent on all call sites. So Instead, force S_WAITCNT for EXP_CNTs
+    // before the call.
+    if (MI.getOpcode() == SC_CALL) {
+      if (ScoreBrackets->getScoreUB(EXP_CNT) >
+	  ScoreBrackets->getScoreLB(EXP_CNT)) {
+        ScoreBrackets->setScoreLB(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
+        EmitSwaitcnt |= CNT_MASK(EXP_CNT);
+      }
+    }
+#endif
+
+    // Look at the source operands of every instruction to see if
+    // any of them results from a previous memory operation that affects
+    // its current usage. If so, an s_waitcnt instruction needs to be
+    // emitted.
+    // If the source operand was defined by a load, add the s_waitcnt
+    // instruction.
+    for (const MachineMemOperand *Memop : MI.memoperands()) {
+      unsigned AS = Memop->getAddrSpace();
+      if (AS != AMDGPUASI.LOCAL_ADDRESS)
+        continue;
+      unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+      // VM_CNT is only relevant to vgpr or LDS.
+      EmitSwaitcnt |= ScoreBrackets->updateByWait(
+          VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+    }
+    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+      const MachineOperand &Op = MI.getOperand(I);
+      const MachineRegisterInfo &MRIA = *MRI;
+      RegInterval Interval =
+          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, false);
+      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+        if (TRI->isVGPR(MRIA, Op.getReg())) {
+          // VM_CNT is only relevant to vgpr or LDS.
+          EmitSwaitcnt |= ScoreBrackets->updateByWait(
+              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+        }
+        EmitSwaitcnt |= ScoreBrackets->updateByWait(
+            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
+      }
+    }
+    // End of for loop that looks at all source operands to decide vm_wait_cnt
+    // and lgk_wait_cnt.
+
+    // Two cases are handled for destination operands:
+    // 1) If the destination operand was defined by a load, add the s_waitcnt
+    // instruction to guarantee the right WAW order.
+    // 2) If a destination operand that was used by a recent export/store ins,
+    // add s_waitcnt on exp_cnt to guarantee the WAR order.
+    if (MI.mayStore()) {
+      for (const MachineMemOperand *Memop : MI.memoperands()) {
+        unsigned AS = Memop->getAddrSpace();
+        if (AS != AMDGPUASI.LOCAL_ADDRESS)
+          continue;
+        unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
+        EmitSwaitcnt |= ScoreBrackets->updateByWait(
+            VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+        EmitSwaitcnt |= ScoreBrackets->updateByWait(
+            EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
+      }
+    }
+    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
+      MachineOperand &Def = MI.getOperand(I);
+      const MachineRegisterInfo &MRIA = *MRI;
+      RegInterval Interval =
+          ScoreBrackets->getRegInterval(&MI, TII, MRI, TRI, I, true);
+      for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+        if (TRI->isVGPR(MRIA, Def.getReg())) {
+          EmitSwaitcnt |= ScoreBrackets->updateByWait(
+              VM_CNT, ScoreBrackets->getRegScore(RegNo, VM_CNT));
+          EmitSwaitcnt |= ScoreBrackets->updateByWait(
+              EXP_CNT, ScoreBrackets->getRegScore(RegNo, EXP_CNT));
+        }
+        EmitSwaitcnt |= ScoreBrackets->updateByWait(
+            LGKM_CNT, ScoreBrackets->getRegScore(RegNo, LGKM_CNT));
+      }
+    } // End of for loop that looks at all dest operands.
+  }
+
+  // TODO: Tie force zero to a compiler triage option.
+  bool ForceZero = false;
+
+  // Check to see if this is an S_BARRIER, and if an implicit S_WAITCNT 0
+  // occurs before the instruction. Doing it here prevents any additional
+  // S_WAITCNTs from being emitted if the instruction was marked as
+  // requiring a WAITCNT beforehand.
+  if (MI.getOpcode() == AMDGPU::S_BARRIER && ST->needWaitcntBeforeBarrier()) {
+    EmitSwaitcnt |=
+        ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+    EmitSwaitcnt |= ScoreBrackets->updateByWait(
+        EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
+    EmitSwaitcnt |= ScoreBrackets->updateByWait(
+        LGKM_CNT, ScoreBrackets->getScoreUB(LGKM_CNT));
+  }
+
+  // TODO: Remove this work-around, enable the assert for Bug 457939
+  //       after fixing the scheduler. Also, the Shader Compiler code is
+  //       independent of target.
+  if (readsVCCZ(MI) && ST->getGeneration() <= SISubtarget::SEA_ISLANDS) {
+    if (ScoreBrackets->getScoreLB(LGKM_CNT) <
+            ScoreBrackets->getScoreUB(LGKM_CNT) &&
+        ScoreBrackets->hasPendingSMEM()) {
+      // Wait on everything, not just LGKM.  vccz reads usually come from
+      // terminators, and we always wait on everything at the end of the
+      // block, so if we only wait on LGKM here, we might end up with
+      // another s_waitcnt inserted right after this if there are non-LGKM
+      // instructions still outstanding.
+      ForceZero = true;
+      EmitSwaitcnt = true;
+    }
+  }
+
+  // Does this operand processing indicate s_wait counter update?
+  if (EmitSwaitcnt) {
+    int CntVal[NUM_INST_CNTS];
+
+    bool UseDefaultWaitcntStrategy = true;
+    if (ForceZero) {
+      // Force all waitcnts to 0.
+      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+           T = (enum InstCounterType)(T + 1)) {
+        ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
+      }
+      CntVal[VM_CNT] = 0;
+      CntVal[EXP_CNT] = 0;
+      CntVal[LGKM_CNT] = 0;
+      UseDefaultWaitcntStrategy = false;
+    }
+
+    if (UseDefaultWaitcntStrategy) {
+      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+           T = (enum InstCounterType)(T + 1)) {
+        if (EmitSwaitcnt & CNT_MASK(T)) {
+          int Delta =
+              ScoreBrackets->getScoreUB(T) - ScoreBrackets->getScoreLB(T);
+          int MaxDelta = ScoreBrackets->getWaitCountMax(T);
+          if (Delta >= MaxDelta) {
+            Delta = -1;
+            if (T != EXP_CNT) {
+              ScoreBrackets->setScoreLB(
+                  T, ScoreBrackets->getScoreUB(T) - MaxDelta);
+            }
+            EmitSwaitcnt &= ~CNT_MASK(T);
+          }
+          CntVal[T] = Delta;
+        } else {
+          // If we are not waiting for a particular counter then encode
+          // it as -1 which means "don't care."
+          CntVal[T] = -1;
+        }
+      }
+    }
+
+    // If we are not waiting on any counter we can skip the wait altogether.
+    if (EmitSwaitcnt != 0) {
+      MachineInstr *OldWaitcnt = ScoreBrackets->getWaitcnt();
+      int Imm = (!OldWaitcnt) ? 0 : OldWaitcnt->getOperand(0).getImm();
+      if (!OldWaitcnt || (AMDGPU::decodeVmcnt(IV, Imm) !=
+                          (CntVal[VM_CNT] & AMDGPU::getVmcntBitMask(IV))) ||
+          (AMDGPU::decodeExpcnt(IV, Imm) !=
+           (CntVal[EXP_CNT] & AMDGPU::getExpcntBitMask(IV))) ||
+          (AMDGPU::decodeLgkmcnt(IV, Imm) !=
+           (CntVal[LGKM_CNT] & AMDGPU::getLgkmcntBitMask(IV)))) {
+        MachineLoop *ContainingLoop = MLI->getLoopFor(MI.getParent());
+        if (ContainingLoop) {
+          MachineBasicBlock *TBB = ContainingLoop->getTopBlock();
+          BlockWaitcntBrackets *ScoreBracket =
+              BlockWaitcntBracketsMap[TBB].get();
+          if (!ScoreBracket) {
+            assert(BlockVisitedSet.find(TBB) == BlockVisitedSet.end());
+            BlockWaitcntBracketsMap[TBB] = make_unique<BlockWaitcntBrackets>();
+            ScoreBracket = BlockWaitcntBracketsMap[TBB].get();
+          }
+          ScoreBracket->setRevisitLoop(true);
+          DEBUG(dbgs() << "set-revisit: block"
+                       << ContainingLoop->getTopBlock()->getNumber() << '\n';);
+        }
+      }
+
+      // Update an existing waitcount, or make a new one.
+      MachineFunction &MF = *MI.getParent()->getParent();
+      if (OldWaitcnt && OldWaitcnt->getOpcode() != AMDGPU::S_WAITCNT) {
+        SWaitInst = OldWaitcnt;
+      } else {
+        SWaitInst = MF.CreateMachineInstr(TII->get(AMDGPU::S_WAITCNT),
+                                          MI.getDebugLoc());
+        CompilerGeneratedWaitcntSet.insert(SWaitInst);
+      }
+
+      const MachineOperand &Op =
+          MachineOperand::CreateImm(AMDGPU::encodeWaitcnt(
+              IV, CntVal[VM_CNT], CntVal[EXP_CNT], CntVal[LGKM_CNT]));
+      SWaitInst->addOperand(MF, Op);
+
+      if (CntVal[EXP_CNT] == 0) {
+        ScoreBrackets->setMixedExpTypes(false);
+      }
+    }
+  }
+
+  return SWaitInst;
+}
+
+/// Place \p Waitcnt at the end of \p MBB, ahead of the last instruction when
+/// that instruction is a branch.
+///
+/// An empty block, or a block whose last instruction is not a branch, simply
+/// gets \p Waitcnt appended.
+void SIInsertWaitcnts::insertWaitcntBeforeCF(MachineBasicBlock &MBB,
+                                             MachineInstr *Waitcnt) {
+  bool EndsWithBranch = false;
+  MachineBasicBlock::iterator Last = MBB.end();
+  if (!MBB.empty()) {
+    // Step back from end() onto the final instruction of the block.
+    --Last;
+    EndsWithBranch = Last->isBranch();
+  }
+
+  if (EndsWithBranch)
+    MBB.insert(Last, Waitcnt);
+  else
+    MBB.push_back(Waitcnt);
+}
+
+/// Record in \p ScoreBrackets the wait events that \p Inst generates.
+///
+/// Looks at the kind of instruction and, if it is a memory access (or one of
+/// the message / export / memtime opcodes handled below), updates the
+/// upper bound of the appropriate counter's bracket and the destination
+/// operand scores through BlockWaitcntBrackets::updateByEvent. Instructions
+/// that match none of the cases leave \p ScoreBrackets untouched.
+///
+/// \param Inst           Instruction that has just been processed.
+/// \param ScoreBrackets  Counter state of the current block; updated in place.
+void SIInsertWaitcnts::updateEventWaitCntAfter(
+    MachineInstr &Inst, BlockWaitcntBrackets *ScoreBrackets) {
+  // TODO: Use the (TSFlags & SIInstrFlags::LGKM_CNT) property everywhere.
+  if (TII->isDS(Inst) && (Inst.mayLoad() || Inst.mayStore())) {
+    // getNamedOperand() yields null when the instruction carries no operand
+    // of that name, so do not dereference it blindly. A DS access without a
+    // gds operand cannot be a GDS access and is accounted as LDS.
+    const MachineOperand *GDS =
+        TII->getNamedOperand(Inst, AMDGPU::OpName::gds);
+    if (GDS && GDS->getImm() != 0) {
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_ACCESS, Inst);
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, GDS_GPR_LOCK, Inst);
+    } else {
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
+    }
+  } else if (TII->isFLAT(Inst)) {
+    assert(Inst.mayLoad() || Inst.mayStore());
+    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+    ScoreBrackets->updateByEvent(TII, TRI, MRI, LDS_ACCESS, Inst);
+
+    // This is a flat memory operation. Check whether any of its memory
+    // operands is in the local or the flat address space.
+    bool FoundLDSMem = false;
+    for (const MachineMemOperand *Memop : Inst.memoperands()) {
+      unsigned AS = Memop->getAddrSpace();
+      if (AS == AMDGPUASI.LOCAL_ADDRESS || AS == AMDGPUASI.FLAT_ADDRESS) {
+        FoundLDSMem = true;
+        break; // One match is enough; the remaining operands cannot undo it.
+      }
+    }
+
+    // This is a flat memory operation, so note it - it will require
+    // that both the VM and LGKM be flushed to zero if it is pending when
+    // a VM or LGKM dependency occurs.
+    if (FoundLDSMem) {
+      ScoreBrackets->setPendingFlat();
+    }
+  } else if (SIInstrInfo::isVMEM(Inst) &&
+             // TODO: get a better carve out.
+             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1 &&
+             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_SC &&
+             Inst.getOpcode() != AMDGPU::BUFFER_WBINVL1_VOL) {
+    ScoreBrackets->updateByEvent(TII, TRI, MRI, VMEM_ACCESS, Inst);
+    // Stores and no-return atomics additionally lock their source GPRs.
+    if ( // TODO: assumed yes -- target_info->MemWriteNeedsExpWait() &&
+        (Inst.mayStore() || AMDGPU::getAtomicNoRetOp(Inst.getOpcode()))) {
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, VMW_GPR_LOCK, Inst);
+    }
+  } else if (TII->isSMRD(Inst)) {
+    ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+  } else {
+    switch (Inst.getOpcode()) {
+    case AMDGPU::S_SENDMSG:
+    case AMDGPU::S_SENDMSGHALT:
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, SQ_MESSAGE, Inst);
+      break;
+    case AMDGPU::EXP:
+    case AMDGPU::EXP_DONE: {
+      // The export target selects the event: 32..63 are recorded as
+      // parameter exports, 12..15 as position exports, and everything else
+      // as a GPR lock.
+      int Imm = TII->getNamedOperand(Inst, AMDGPU::OpName::tgt)->getImm();
+      if (Imm >= 32 && Imm <= 63)
+        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_PARAM_ACCESS, Inst);
+      else if (Imm >= 12 && Imm <= 15)
+        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_POS_ACCESS, Inst);
+      else
+        ScoreBrackets->updateByEvent(TII, TRI, MRI, EXP_GPR_LOCK, Inst);
+      break;
+    }
+    case AMDGPU::S_MEMTIME:
+    case AMDGPU::S_MEMREALTIME:
+      ScoreBrackets->updateByEvent(TII, TRI, MRI, SMEM_ACCESS, Inst);
+      break;
+    default:
+      break;
+    }
+  }
+}
+
+/// Initialize the score brackets of \p Block from the brackets of its inputs.
+///
+/// The inputs are every predecessor that has already been visited and, when
+/// \p Block has no successors, every bracket saved in KillWaitBrackets. The
+/// merge runs in two steps:
+///  1. For each counter, find the largest number of pending operations
+///     (UB - LB) and the largest pending-flat span over all inputs.
+///  2. Reset the block's brackets to [0, MaxPending] and re-base the register
+///     scores and event upper bounds of every input onto that bracket,
+///     keeping the maximum per register / event.
+///
+/// Predecessors that have not been visited yet are skipped individually;
+/// they must not stop the scan, otherwise a visited predecessor that happens
+/// to follow them in the predecessor list would be left out of the merge.
+///
+/// \param Block  Block whose entry in BlockWaitcntBracketsMap is rebuilt. The
+///               entry must already exist.
+void SIInsertWaitcnts::mergeInputScoreBrackets(MachineBasicBlock &Block) {
+  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
+  int32_t MaxPending[NUM_INST_CNTS] = {0};
+  int32_t MaxFlat[NUM_INST_CNTS] = {0};
+  bool MixedExpTypes = false;
+
+  // Clear the score bracket state.
+  ScoreBrackets->clear();
+
+  // IMPORTANT NOTE: If iterative handling of loops is added, the code will
+  // need to handle single BBs with backedges to themselves. This means that
+  // they will need to retain and not clear their initial state.
+
+  // Kill brackets only feed blocks without successors.
+  // TODO: Is SC Block->IsMainExit() same as Block.succ_empty()?
+  const bool MergeKills = Block.succ_empty() && !KillWaitBrackets.empty();
+
+  // Fold the pending spans of one input into MaxPending / MaxFlat and pick up
+  // its mixed-export-types flag.
+  auto AccumulatePending = [&](BlockWaitcntBrackets *Src) {
+    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+         T = (enum InstCounterType)(T + 1)) {
+      int Span = Src->getScoreUB(T) - Src->getScoreLB(T);
+      MaxPending[T] = std::max(MaxPending[T], Span);
+      Span = Src->pendingFlat(T) - Src->getScoreLB(T);
+      MaxFlat[T] = std::max(MaxFlat[T], Span);
+    }
+
+    MixedExpTypes |= Src->mixedExpTypes();
+
+    // Special handling for GDS_GPR_LOCK and EXP_GPR_LOCK.
+    int GDSSpan = Src->getEventUB(GDS_GPR_LOCK) - Src->getScoreLB(EXP_CNT);
+    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], GDSSpan);
+    int EXPSpan = Src->getEventUB(EXP_GPR_LOCK) - Src->getScoreLB(EXP_CNT);
+    MaxPending[EXP_CNT] = std::max(MaxPending[EXP_CNT], EXPSpan);
+  };
+
+  // Compute the number of pending elements on block entry. Predecessors that
+  // are not visited yet, or that are flagged to wait at their beginning,
+  // contribute nothing here.
+  for (MachineBasicBlock *Pred : Block.predecessors()) {
+    if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end())
+      continue;
+    BlockWaitcntBrackets *PredScoreBrackets =
+        BlockWaitcntBracketsMap[Pred].get();
+    if (PredScoreBrackets->getWaitAtBeginning())
+      continue;
+    AccumulatePending(PredScoreBrackets);
+  }
+
+  // Also handle kills for exit block.
+  if (MergeKills) {
+    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++)
+      AccumulatePending(&*KillWaitBrackets[I]);
+  }
+
+#if 0
+  // LC does not (unlike) add a waitcnt at beginning. Leaving it as marker.
+  // TODO: how does LC distinguish between function entry and main entry?
+  // If this is the entry to a function, force a wait.
+  MachineBasicBlock &Entry = Block.getParent()->front();
+  if (Entry.getNumber() == Block.getNumber()) {
+    ScoreBrackets->setWaitAtBeginning();
+    return;
+  }
+#endif
+
+  // Now set the current Block's brackets to the largest ending bracket.
+  for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+       T = (enum InstCounterType)(T + 1)) {
+    ScoreBrackets->setScoreUB(T, MaxPending[T]);
+    ScoreBrackets->setScoreLB(T, 0);
+    ScoreBrackets->setLastFlat(T, MaxFlat[T]);
+  }
+
+  ScoreBrackets->setMixedExpTypes(MixedExpTypes);
+
+  // Re-base the register scores and event upper bounds of one input onto the
+  // new bracket. Only scores above the input's lower bound are still pending
+  // and therefore carried over; each target keeps the maximum seen so far.
+  auto MergeScoreboard = [&](BlockWaitcntBrackets *Src) {
+    // Merge the gpr_reg_score information.
+    for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+         T = (enum InstCounterType)(T + 1)) {
+      int SrcLB = Src->getScoreLB(T);
+      int SrcUB = Src->getScoreUB(T);
+      if (SrcLB >= SrcUB)
+        continue;
+      // Shift that aligns the input's upper bound with the new upper bound.
+      int SrcScale = MaxPending[T] - SrcUB;
+      // Merge vgpr scores.
+      for (int J = 0; J <= Src->getMaxVGPR(); J++) {
+        int SrcRegScore = Src->getRegScore(J, T);
+        if (SrcRegScore <= SrcLB)
+          continue;
+        int NewRegScore = SrcScale + SrcRegScore;
+        ScoreBrackets->setRegScore(
+            J, T, std::max(ScoreBrackets->getRegScore(J, T), NewRegScore));
+      }
+      // Also need to merge sgpr scores for lgkm_cnt.
+      if (T == LGKM_CNT) {
+        for (int J = 0; J <= Src->getMaxSGPR(); J++) {
+          int SrcRegScore = Src->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
+          if (SrcRegScore <= SrcLB)
+            continue;
+          int NewRegScore = SrcScale + SrcRegScore;
+          ScoreBrackets->setRegScore(
+              J + NUM_ALL_VGPRS, LGKM_CNT,
+              std::max(ScoreBrackets->getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT),
+                       NewRegScore));
+        }
+      }
+    }
+
+    // Also merge the WaitEvent information.
+    ForAllWaitEventType(W) {
+      enum InstCounterType T = Src->eventCounter(W);
+      int SrcEventUB = Src->getEventUB(W);
+      if (SrcEventUB > Src->getScoreLB(T)) {
+        int NewEventUB = MaxPending[T] + SrcEventUB - Src->getScoreUB(T);
+        if (NewEventUB > 0) {
+          ScoreBrackets->setEventUB(
+              W, std::max(ScoreBrackets->getEventUB(W), NewEventUB));
+        }
+      }
+    }
+  };
+
+  // Set the register scoreboard from every visited predecessor.
+  for (MachineBasicBlock *Pred : Block.predecessors()) {
+    if (BlockVisitedSet.find(Pred) == BlockVisitedSet.end())
+      continue;
+
+    BlockWaitcntBrackets *PredScoreBrackets =
+        BlockWaitcntBracketsMap[Pred].get();
+    MergeScoreboard(PredScoreBrackets);
+
+    // Special case handling of GDS_GPR_LOCK and EXP_GPR_LOCK. Merge this for
+    // the sequencing predecessors, because changes to EXEC require waitcnts
+    // due to the delayed nature of these operations.
+    int PredGDSUB = PredScoreBrackets->getEventUB(GDS_GPR_LOCK);
+    if (PredGDSUB > PredScoreBrackets->getScoreLB(EXP_CNT)) {
+      int NewGDSUB = MaxPending[EXP_CNT] + PredGDSUB -
+                     PredScoreBrackets->getScoreUB(EXP_CNT);
+      if (NewGDSUB > 0) {
+        ScoreBrackets->setEventUB(
+            GDS_GPR_LOCK,
+            std::max(ScoreBrackets->getEventUB(GDS_GPR_LOCK), NewGDSUB));
+      }
+    }
+    int PredEXPUB = PredScoreBrackets->getEventUB(EXP_GPR_LOCK);
+    if (PredEXPUB > PredScoreBrackets->getScoreLB(EXP_CNT)) {
+      int NewEXPUB = MaxPending[EXP_CNT] + PredEXPUB -
+                     PredScoreBrackets->getScoreUB(EXP_CNT);
+      if (NewEXPUB > 0) {
+        ScoreBrackets->setEventUB(
+            EXP_GPR_LOCK,
+            std::max(ScoreBrackets->getEventUB(EXP_GPR_LOCK), NewEXPUB));
+      }
+    }
+  }
+
+  // Set the register scoreboard from the kill brackets as well.
+  if (MergeKills) {
+    for (unsigned int I = 0; I < KillWaitBrackets.size(); I++)
+      MergeScoreboard(&*KillWaitBrackets[I]);
+  }
+}
+
+/// Find the block of \p Loop that carries the highest block number.
+///
+/// Unlike MachineLoop::getBottomBlock, this also gives an answer for loops
+/// whose blocks are not laid out contiguously. The header is returned when
+/// no other block of the loop has a higher number.
+MachineBasicBlock *SIInsertWaitcnts::loopBottom(const MachineLoop *Loop) {
+  MachineBasicBlock *Candidate = Loop->getHeader();
+  int HighestNumber = Candidate->getNumber();
+  for (MachineBasicBlock *Member : Loop->blocks()) {
+    const int Number = Member->getNumber();
+    if (Number <= HighestNumber)
+      continue;
+    Candidate = Member;
+    HighestNumber = Number;
+  }
+  return Candidate;
+}
+
+// Generate s_waitcnt instructions where needed.
+void SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
+                                            MachineBasicBlock &Block) {
+  // Initialize the state information.
+  mergeInputScoreBrackets(Block);
+
+  BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&Block].get();
+
+  DEBUG({
+    dbgs() << "Block" << Block.getNumber();
+    ScoreBrackets->dump();
+  });
+
+  bool InsertNOP = false;
+
+  // Walk over the instructions.
+  for (MachineBasicBlock::iterator Iter = Block.begin(), E = Block.end();
+       Iter != E;) {
+    MachineInstr &Inst = *Iter;
+    // Remove any previously existing waitcnts.
+    if (Inst.getOpcode() == AMDGPU::S_WAITCNT) {
+      // TODO: Register the old waitcnt and optimize the following waitcnts.
+      // Leaving the previously existing waitcnts is conservatively correct.
+      if (CompilerGeneratedWaitcntSet.find(&Inst) ==
+          CompilerGeneratedWaitcntSet.end())
+        ++Iter;
+      else {
+        ScoreBrackets->setWaitcnt(&Inst);
+        ++Iter;
+        Inst.removeFromParent();
+      }
+      continue;
+    }
+
+    // Kill instructions generate a conditional branch to the endmain block.
+    // Merge the current waitcnt state into the endmain block information.
+    // TODO: Are there other flavors of KILL instruction?
+    if (Inst.getOpcode() == AMDGPU::KILL) {
+      addKillWaitBracket(ScoreBrackets);
+    }
+
+    bool VCCZBugWorkAround = false;
+    if (readsVCCZ(Inst) &&
+        (VCCZBugHandledSet.find(&Inst) == VCCZBugHandledSet.end())) {
+      if (ScoreBrackets->getScoreLB(LGKM_CNT) <
+              ScoreBrackets->getScoreUB(LGKM_CNT) &&
+          ScoreBrackets->hasPendingSMEM()) {
+        if (ST->getGeneration() <= SISubtarget::SEA_ISLANDS)
+          VCCZBugWorkAround = true;
+      }
+    }
+
+    // Generate an s_waitcnt instruction to be placed before
+    // cur_Inst, if needed.
+    MachineInstr *SWaitInst = generateSWaitCntInstBefore(Inst, ScoreBrackets);
+
+    if (SWaitInst) {
+      Block.insert(Inst, SWaitInst);
+      if (ScoreBrackets->getWaitcnt() != SWaitInst) {
+        DEBUG(dbgs() << "insertWaitcntInBlock\n"
+                     << "Old Instr: " << Inst << '\n'
+                     << "New Instr: " << *SWaitInst << '\n';);
+      }
+    }
+
+    updateEventWaitCntAfter(Inst, ScoreBrackets);
+
+#if 0 // TODO: implement resource type check controlled by options with ub = LB.
+    // If this instruction generates a S_SETVSKIP because it is an
+    // indexed resource, and we are on Tahiti, then it will also force
+    // an S_WAITCNT vmcnt(0)
+    if (RequireCheckResourceType(Inst, context)) {
+      // Force the score to as if an S_WAITCNT vmcnt(0) is emitted.
+      ScoreBrackets->setScoreLB(VM_CNT,
+				   ScoreBrackets->getScoreUB(VM_CNT));
+    }
+#endif
+
+    ScoreBrackets->clearWaitcnt();
+
+    if (SWaitInst) {
+      DEBUG({ SWaitInst->print(dbgs() << '\n'); });
+    }
+    DEBUG({
+      Inst.print(dbgs());
+      ScoreBrackets->dump();
+    });
+
+    // Check to see if this is a GWS instruction. If so, and if this is CI or
+    // VI, then the generated code sequence will include an S_WAITCNT 0.
+    // TODO: Are these the only GWS instructions?
+    if (Inst.getOpcode() == AMDGPU::DS_GWS_INIT ||
+        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_V ||
+        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_BR ||
+        Inst.getOpcode() == AMDGPU::DS_GWS_SEMA_P ||
+        Inst.getOpcode() == AMDGPU::DS_GWS_BARRIER) {
+      // TODO: && context->target_info->GwsRequiresMemViolTest() ) {
+      ScoreBrackets->updateByWait(VM_CNT, ScoreBrackets->getScoreUB(VM_CNT));
+      ScoreBrackets->updateByWait(EXP_CNT, ScoreBrackets->getScoreUB(EXP_CNT));
+      ScoreBrackets->updateByWait(LGKM_CNT,
+                                  ScoreBrackets->getScoreUB(LGKM_CNT));
+    }
+
+    // TODO: Remove this work-around after fixing the scheduler and enable the
+    // assert above.
+    if (VCCZBugWorkAround) {
+      // Restore the vccz bit.  Any time a value is written to vcc, the vcc
+      // bit is updated, so we can restore the bit by reading the value of
+      // vcc and then writing it back to the register.
+      BuildMI(Block, Inst, Inst.getDebugLoc(), TII->get(AMDGPU::S_MOV_B64),
+              AMDGPU::VCC)
+          .addReg(AMDGPU::VCC);
+      VCCZBugHandledSet.insert(&Inst);
+    }
+
+    if (ST->getGeneration() >= SISubtarget::VOLCANIC_ISLANDS) {
+
+      // This avoids a s_nop after a waitcnt has just been inserted.
+      if (!SWaitInst && InsertNOP) {
+        BuildMI(Block, Inst, DebugLoc(), TII->get(AMDGPU::S_NOP)).addImm(0);
+      }
+      InsertNOP = false;
+
+      // Any occurrence of consecutive VMEM or SMEM instructions forms a VMEM
+      // or SMEM clause, respectively.
+      //
+      // The temporary workaround is to break the clauses with S_NOP.
+      //
+      // The proper solution would be to allocate registers such that all source
+      // and destination registers don't overlap, e.g. this is illegal:
+      //   r0 = load r2
+      //   r2 = load r0
+      bool IsSMEM = false;
+      bool IsVMEM = false;
+      if (TII->isSMRD(Inst))
+        IsSMEM = true;
+      else if (TII->usesVM_CNT(Inst))
+        IsVMEM = true;
+
+      ++Iter;
+      if (Iter == E)
+        break;
+
+      MachineInstr &Next = *Iter;
+
+      // TODO: How about consecutive SMEM instructions?
+      //       The comment above says to break the clause but the code does not.
+      // if ((TII->isSMRD(next) && isSMEM) ||
+      if (!IsSMEM && TII->usesVM_CNT(Next) && IsVMEM &&
+          // TODO: Enable this check when hasSoftClause is upstreamed.
+          // ST->hasSoftClauses() &&
+          ST->isXNACKEnabled()) {
+        // Insert a NOP to break the clause.
+        InsertNOP = true;
+        continue;
+      }
+
+      // There must be "S_NOP 0" between an instruction writing M0 and
+      // S_SENDMSG.
+      if ((Next.getOpcode() == AMDGPU::S_SENDMSG ||
+           Next.getOpcode() == AMDGPU::S_SENDMSGHALT) &&
+          Inst.definesRegister(AMDGPU::M0))
+        InsertNOP = true;
+
+      continue;
+    }
+
+    ++Iter;
+  }
+
+  // Check if we need to force convergence at loop footer.
+  MachineLoop *ContainingLoop = MLI->getLoopFor(&Block);
+  if (ContainingLoop && loopBottom(ContainingLoop) == &Block) {
+    LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
+    WaitcntData->print();
+    DEBUG(dbgs() << '\n';);
+
+    // The iterative waitcnt insertion algorithm aims for optimal waitcnt
+    // placement and doesn't always guarantee convergence for a loop. Each
+    // loop should take at most 2 iterations for it to converge naturally.
+    // When this max is reached and result doesn't converge, we force
+    // convergence by inserting a s_waitcnt at the end of loop footer.
+    if (WaitcntData->getIterCnt() > 2) {
+      // To ensure convergence, need to make wait events at loop footer be no
+      // more than those from the previous iteration.
+      // As a simplification, instead of tracking individual scores and
+      // generating the precise wait count, just wait on 0.
+      bool HasPending = false;
+      MachineInstr *SWaitInst = WaitcntData->getWaitcnt();
+      for (enum InstCounterType T = VM_CNT; T < NUM_INST_CNTS;
+           T = (enum InstCounterType)(T + 1)) {
+        if (ScoreBrackets->getScoreUB(T) > ScoreBrackets->getScoreLB(T)) {
+          ScoreBrackets->setScoreLB(T, ScoreBrackets->getScoreUB(T));
+          HasPending = true;
+        }
+      }
+
+      if (HasPending) {
+        if (!SWaitInst) {
+          SWaitInst = Block.getParent()->CreateMachineInstr(
+              TII->get(AMDGPU::S_WAITCNT), DebugLoc());
+          CompilerGeneratedWaitcntSet.insert(SWaitInst);
+          const MachineOperand &Op = MachineOperand::CreateImm(0);
+          SWaitInst->addOperand(MF, Op);
+#if 0 // TODO: Format the debug output
+          OutputTransformBanner("insertWaitcntInBlock",0,"Create:",context);
+          OutputTransformAdd(SWaitInst, context);
+#endif
+        }
+#if 0 // TODO: ??
+        _DEV( REPORTED_STATS->force_waitcnt_converge = 1; )
+#endif
+      }
+
+      if (SWaitInst) {
+        DEBUG({
+          SWaitInst->print(dbgs());
+          dbgs() << "\nAdjusted score board:";
+          ScoreBrackets->dump();
+        });
+
+        // Add this waitcnt to the block. It is either newly created or
+        // created in previous iterations and added back since block traversal
+        // always removes waitcnt.
+        insertWaitcntBeforeCF(Block, SWaitInst);
+        WaitcntData->setWaitcnt(SWaitInst);
+      }
+    }
+  }
+}
+
+bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
+  ST = &MF.getSubtarget<SISubtarget>();
+  TII = ST->getInstrInfo();
+  TRI = &TII->getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  MLI = &getAnalysis<MachineLoopInfo>();
+  IV = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
+  AMDGPUASI = ST->getAMDGPUAS();
+  // Record the per-counter bit masks as the hardware counter limits.
+  HardwareLimits.VmcntMax = AMDGPU::getVmcntBitMask(IV);
+  HardwareLimits.ExpcntMax = AMDGPU::getExpcntBitMask(IV);
+  HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
+
+  HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
+  HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
+  assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
+  assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
+
+  RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
+  RegisterEncoding.VGPRL =
+      RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
+  RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
+  RegisterEncoding.SGPRL =
+      RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
+
+  // Walk over the blocks in reverse post-order, inserting s_waitcnt where
+  // needed. Loops may be re-walked (I is moved back) until they converge.
+  ReversePostOrderTraversal<MachineFunction *> RPOT(&MF);
+  bool Modified = false;
+  for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
+           I = RPOT.begin(),
+           E = RPOT.end(), J = RPOT.begin();
+       I != E;) {
+    MachineBasicBlock &MBB = **I;
+
+    BlockVisitedSet.insert(&MBB);
+
+    BlockWaitcntBrackets *ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
+    if (!ScoreBrackets) {
+      BlockWaitcntBracketsMap[&MBB] = make_unique<BlockWaitcntBrackets>();
+      ScoreBrackets = BlockWaitcntBracketsMap[&MBB].get();
+    }
+    ScoreBrackets->setPostOrder(MBB.getNumber());
+    MachineLoop *ContainingLoop = MLI->getLoopFor(&MBB);
+    if (ContainingLoop && LoopWaitcntDataMap[ContainingLoop] == nullptr)
+      LoopWaitcntDataMap[ContainingLoop] = make_unique<LoopWaitcntData>();
+
+    // If we are walking into the block from before the loop, then guarantee
+    // at least 1 re-walk over the loop to propagate the information, even if
+    // no S_WAITCNT instructions were generated.
+    if (ContainingLoop && ContainingLoop->getTopBlock() == &MBB && J < I &&
+        (BlockWaitcntProcessedSet.find(&MBB) ==
+         BlockWaitcntProcessedSet.end())) {
+      BlockWaitcntBracketsMap[&MBB]->setRevisitLoop(true);
+      DEBUG(dbgs() << "set-revisit: block"
+                   << ContainingLoop->getTopBlock()->getNumber() << '\n';);
+    }
+
+    // Walk over the instructions.
+    insertWaitcntInBlock(MF, MBB);
+    Modified = true; // The walk may have inserted or removed instructions.
+    // Flag that waitcnts have been processed at least once.
+    BlockWaitcntProcessedSet.insert(&MBB);
+
+    // See if we want to revisit the loop.
+    if (ContainingLoop && loopBottom(ContainingLoop) == &MBB) {
+      MachineBasicBlock *EntryBB = ContainingLoop->getTopBlock();
+      BlockWaitcntBrackets *EntrySB = BlockWaitcntBracketsMap[EntryBB].get();
+      if (EntrySB && EntrySB->getRevisitLoop()) {
+        EntrySB->setRevisitLoop(false);
+        J = I;
+        int32_t PostOrder = EntrySB->getPostOrder();
+        // TODO: Avoid this loop. Find another way to set I.
+        for (ReversePostOrderTraversal<MachineFunction *>::rpo_iterator
+                 X = RPOT.begin(),
+                 Y = RPOT.end();
+             X != Y; ++X) {
+          MachineBasicBlock &MBBX = **X;
+          if (MBBX.getNumber() == PostOrder) {
+            I = X;
+            break;
+          }
+        }
+        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
+        WaitcntData->incIterCnt();
+        DEBUG(dbgs() << "revisit: block" << EntryBB->getNumber() << '\n';);
+        continue;
+      } else {
+        LoopWaitcntData *WaitcntData = LoopWaitcntDataMap[ContainingLoop].get();
+        // Loop converged, reset iteration count. If this loop gets revisited,
+        // it must be from an outer loop, the counter will restart, this will
+        // ensure we don't force convergence on such revisits.
+        WaitcntData->resetIterCnt();
+      }
+    }
+
+    J = I;
+    ++I;
+  }
+
+  SmallVector<MachineBasicBlock *, 4> EndPgmBlocks;
+
+  bool HaveScalarStores = false;
+  // Find the program-ending blocks and detect whether scalar stores exist.
+  for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
+       ++BI) {
+
+    MachineBasicBlock &MBB = *BI;
+
+    for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
+         ++I) {
+
+      if (!HaveScalarStores && TII->isScalarStore(*I))
+        HaveScalarStores = true;
+
+      if (I->getOpcode() == AMDGPU::S_ENDPGM ||
+          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
+        EndPgmBlocks.push_back(&MBB);
+    }
+  }
+
+  if (HaveScalarStores) {
+    // If scalar writes are used, the cache must be flushed or else the next
+    // wave to reuse the same scratch memory can be clobbered.
+    //
+    // Insert s_dcache_wb at wave termination points if there were any scalar
+    // stores, and only if the cache hasn't already been flushed. This could be
+    // improved by looking across blocks for flushes in postdominating blocks
+    // from the stores but an explicitly requested flush is probably very rare.
+    for (MachineBasicBlock *MBB : EndPgmBlocks) {
+      bool SeenDCacheWB = false;
+
+      for (MachineBasicBlock::iterator I = MBB->begin(), E = MBB->end(); I != E;
+           ++I) {
+
+        if (I->getOpcode() == AMDGPU::S_DCACHE_WB)
+          SeenDCacheWB = true;
+        else if (TII->isScalarStore(*I))
+          SeenDCacheWB = false;
+
+        // FIXME: It would be better to insert this before a waitcnt if any.
+        if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
+             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) &&
+            !SeenDCacheWB) {
+          Modified = true;
+          BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
+        }
+      }
+    }
+  }
+
+  return Modified;
+}
diff --git a/lib/Target/AMDGPU/SIInsertWaits.cpp b/lib/Target/AMDGPU/SIInsertWaits.cpp
index c814e55e844fd7df82447dc35658e6b2139fd4cd..47257ce16ceb3367b88f3a4ab28a1c4cc29d139d 100644
--- a/lib/Target/AMDGPU/SIInsertWaits.cpp
+++ b/lib/Target/AMDGPU/SIInsertWaits.cpp
@@ -47,7 +47,6 @@
 #define DEBUG_TYPE "si-insert-waits"
 
 using namespace llvm;
-using namespace llvm::AMDGPU;
 
 namespace {
 
@@ -76,7 +75,7 @@ private:
   const SIInstrInfo *TII = nullptr;
   const SIRegisterInfo *TRI = nullptr;
   const MachineRegisterInfo *MRI;
-  IsaVersion IV;
+  AMDGPU::IsaInfo::IsaVersion ISA;
 
   /// \brief Constant zero value
   static const Counters ZeroCounts;
@@ -427,10 +426,10 @@ bool SIInsertWaits::insertWait(MachineBasicBlock &MBB,
 
   // Build the wait instruction
   BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
-    .addImm(encodeWaitcnt(IV,
-                          Counts.Named.VM,
-                          Counts.Named.EXP,
-                          Counts.Named.LGKM));
+    .addImm(AMDGPU::encodeWaitcnt(ISA,
+                                  Counts.Named.VM,
+                                  Counts.Named.EXP,
+                                  Counts.Named.LGKM));
 
   LastOpcodeType = OTHER;
   LastInstWritesM0 = false;
@@ -458,9 +457,9 @@ void SIInsertWaits::handleExistingWait(MachineBasicBlock::iterator I) {
   unsigned Imm = I->getOperand(0).getImm();
   Counters Counts, WaitOn;
 
-  Counts.Named.VM = decodeVmcnt(IV, Imm);
-  Counts.Named.EXP = decodeExpcnt(IV, Imm);
-  Counts.Named.LGKM = decodeLgkmcnt(IV, Imm);
+  Counts.Named.VM = AMDGPU::decodeVmcnt(ISA, Imm);
+  Counts.Named.EXP = AMDGPU::decodeExpcnt(ISA, Imm);
+  Counts.Named.LGKM = AMDGPU::decodeLgkmcnt(ISA, Imm);
 
   for (unsigned i = 0; i < 3; ++i) {
     if (Counts.Array[i] <= LastIssued.Array[i])
@@ -525,6 +524,16 @@ void SIInsertWaits::handleSendMsg(MachineBasicBlock &MBB,
   }
 }
 
+/// Determine whether \p MBB has exactly one successor, and that successor
+/// immediately follows it in layout and has \p MBB as its only predecessor.
+static bool hasTrivialSuccessor(const MachineBasicBlock &MBB) {
+  if (MBB.succ_size() == 1) {
+    const MachineBasicBlock *OnlySucc = *MBB.succ_begin();
+    return OnlySucc->pred_size() == 1 && MBB.isLayoutSuccessor(OnlySucc);
+  }
+  return false;
+}
+
 // FIXME: Insert waits listed in Table 4.2 "Required User-Inserted Wait States"
 // around other non-memory instructions.
 bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
@@ -534,12 +543,12 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
   TII = ST->getInstrInfo();
   TRI = &TII->getRegisterInfo();
   MRI = &MF.getRegInfo();
-  IV = getIsaVersion(ST->getFeatureBits());
+  ISA = AMDGPU::IsaInfo::getIsaVersion(ST->getFeatureBits());
   const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
 
-  HardwareLimits.Named.VM = getVmcntBitMask(IV);
-  HardwareLimits.Named.EXP = getExpcntBitMask(IV);
-  HardwareLimits.Named.LGKM = getLgkmcntBitMask(IV);
+  HardwareLimits.Named.VM = AMDGPU::getVmcntBitMask(ISA);
+  HardwareLimits.Named.EXP = AMDGPU::getExpcntBitMask(ISA);
+  HardwareLimits.Named.LGKM = AMDGPU::getLgkmcntBitMask(ISA);
 
   WaitedOn = ZeroCounts;
   DelayedWaitOn = ZeroCounts;
@@ -639,12 +648,14 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
       handleSendMsg(MBB, I);
 
       if (I->getOpcode() == AMDGPU::S_ENDPGM ||
-          I->getOpcode() == AMDGPU::SI_RETURN)
+          I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG)
         EndPgmBlocks.push_back(&MBB);
     }
 
-    // Wait for everything at the end of the MBB
-    Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
+    // Wait for everything at the end of the MBB. If there is only one
+    // successor, we can defer this until the uses there.
+    if (!hasTrivialSuccessor(MBB))
+      Changes |= insertWait(MBB, MBB.getFirstTerminator(), LastIssued);
   }
 
   if (HaveScalarStores) {
@@ -668,7 +679,7 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
 
         // FIXME: It would be better to insert this before a waitcnt if any.
         if ((I->getOpcode() == AMDGPU::S_ENDPGM ||
-             I->getOpcode() == AMDGPU::SI_RETURN) && !SeenDCacheWB) {
+             I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG) && !SeenDCacheWB) {
           Changes = true;
           BuildMI(*MBB, I, I->getDebugLoc(), TII->get(AMDGPU::S_DCACHE_WB));
         }
@@ -679,5 +690,19 @@ bool SIInsertWaits::runOnMachineFunction(MachineFunction &MF) {
   for (MachineInstr *I : RemoveMI)
     I->eraseFromParent();
 
+  if (!MFI->isEntryFunction()) {
+    // Wait for any outstanding memory operations that the input registers may
+    // depend on. We can't track them and it's better to do the wait after the
+    // costly call sequence.
+
+    // TODO: Could insert earlier and schedule more liberally with operations
+    // that only use caller preserved registers.
+    MachineBasicBlock &EntryBB = MF.front();
+    BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
+      .addImm(0);
+
+    Changes = true;
+  }
+
   return Changes;
 }
diff --git a/lib/Target/AMDGPU/SIInstrFormats.td b/lib/Target/AMDGPU/SIInstrFormats.td
index 5523ec142ba736048b0290b793e4853f9eb73665..b83a1fe187eb7523da475887f1ec35b0b2469b61 100644
--- a/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/lib/Target/AMDGPU/SIInstrFormats.td
@@ -31,6 +31,7 @@ class InstSI <dag outs, dag ins, string asm = "",
   field bit VOP2 = 0;
   field bit VOPC = 0;
   field bit VOP3 = 0;
+  field bit VOP3P = 0;
   field bit VINTRP = 0;
   field bit SDWA = 0;
   field bit DPP = 0;
@@ -78,6 +79,10 @@ class InstSI <dag outs, dag ins, string asm = "",
   // is unable to infer the encoding from the operands.
   field bit VOPAsmPrefer32Bit = 0;
 
+  // This bit indicates that this has a floating point result type, so
+  // the clamp modifier has floating point semantics.
+  field bit FPClamp = 0;
+
   // These need to be kept in sync with the enum in SIInstrFlags.
   let TSFlags{0} = SALU;
   let TSFlags{1} = VALU;
@@ -92,6 +97,7 @@ class InstSI <dag outs, dag ins, string asm = "",
   let TSFlags{8} = VOP2;
   let TSFlags{9} = VOPC;
   let TSFlags{10} = VOP3;
+  let TSFlags{12} = VOP3P;
 
   let TSFlags{13} = VINTRP;
   let TSFlags{14} = SDWA;
@@ -120,6 +126,7 @@ class InstSI <dag outs, dag ins, string asm = "",
   let TSFlags{39} = ScalarStore;
   let TSFlags{40} = FixedSize;
   let TSFlags{41} = VOPAsmPrefer32Bit;
+  let TSFlags{42} = FPClamp;
 
   let SchedRW = [Write32Bit];
 
@@ -131,19 +138,19 @@ class InstSI <dag outs, dag ins, string asm = "",
   let AsmVariantName = AMDGPUAsmVariants.Default;
 }
 
-class PseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
-  : InstSI<outs, ins, "", pattern> {
+class PseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = "">
+  : InstSI<outs, ins, asm, pattern> {
   let isPseudo = 1;
   let isCodeGenOnly = 1;
 }
 
-class SPseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
-  : PseudoInstSI<outs, ins, pattern> {
+class SPseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = "">
+  : PseudoInstSI<outs, ins, pattern, asm> {
   let SALU = 1;
 }
 
-class VPseudoInstSI<dag outs, dag ins, list<dag> pattern = []>
-  : PseudoInstSI<outs, ins, pattern> {
+class VPseudoInstSI<dag outs, dag ins, list<dag> pattern = [], string asm = "">
+  : PseudoInstSI<outs, ins, pattern, asm> {
   let VALU = 1;
   let Uses = [EXEC];
 }
diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp
index ca1fa94d81203be5ffe780554e3c806a5322c6f7..b1e4c9a7aaa226d4060e62c2888fc1d98ccd9cba 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -21,6 +21,7 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/MC/MCInstrDesc.h"
@@ -36,7 +37,7 @@ BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
                  cl::desc("Restrict range of branch instructions (DEBUG)"));
 
 SIInstrInfo::SIInstrInfo(const SISubtarget &ST)
-  : AMDGPUInstrInfo(ST), RI(), ST(ST) {}
+  : AMDGPUInstrInfo(ST), RI(ST), ST(ST) {}
 
 //===----------------------------------------------------------------------===//
 // TargetInstrInfo callbacks
@@ -315,7 +316,8 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
   const MachineOperand *SecondDst = nullptr;
 
   if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
-      (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt))) {
+      (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
+      (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
     FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
     SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
   } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
@@ -346,6 +348,21 @@ bool SIInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt,
   return (NumLoads * DstRC->getSize()) <= LoadClusterThreshold;
 }
 
+static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
+                              MachineBasicBlock::iterator MI,
+                              const DebugLoc &DL, unsigned DestReg,
+                              unsigned SrcReg, bool KillSrc) {
+  // Report an error diagnostic on the function, then emit SI_ILLEGAL_COPY.
+  const Function &Fn = *MBB.getParent()->getFunction();
+  DiagnosticInfoUnsupported Diag(Fn, "illegal SGPR to VGPR copy", DL,
+                                 DS_Error);
+  Fn.getContext().diagnose(Diag);
+
+  const unsigned SrcFlags = getKillRegState(KillSrc);
+  BuildMI(MBB, MI, DL, TII->get(AMDGPU::SI_ILLEGAL_COPY), DestReg)
+    .addReg(SrcReg, SrcFlags);
+}
+
 void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
                               MachineBasicBlock::iterator MI,
                               const DebugLoc &DL, unsigned DestReg,
@@ -369,7 +386,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
       return;
     }
 
-    assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+    if (!AMDGPU::SReg_32RegClass.contains(SrcReg)) {
+      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
+      return;
+    }
+
     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg)
             .addReg(SrcReg, getKillRegState(KillSrc));
     return;
@@ -391,7 +412,11 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
       return;
     }
 
-    assert(AMDGPU::SReg_64RegClass.contains(SrcReg));
+    if (!AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
+      return;
+    }
+
     BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg)
             .addReg(SrcReg, getKillRegState(KillSrc));
     return;
@@ -415,8 +440,14 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
       Opcode = AMDGPU::S_MOV_B32;
       EltSize = 4;
     }
+
+    if (!RI.isSGPRClass(RI.getPhysRegClass(SrcReg))) {
+      reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
+      return;
+    }
   }
 
+
   ArrayRef<int16_t> SubIndices = RI.getRegSplitParts(RC, EltSize);
   bool Forward = RI.getHWRegIndex(DestReg) <= RI.getHWRegIndex(SrcReg);
 
@@ -1487,6 +1518,27 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
   }
 }
 
+bool SIInstrInfo::isFoldableCopy(const MachineInstr &MI) const {
+  const unsigned Opc = MI.getOpcode();
+  // Scalar moves and generic copies are always accepted.
+  if (Opc == AMDGPU::S_MOV_B32 || Opc == AMDGPU::S_MOV_B64 ||
+      Opc == AMDGPU::COPY)
+    return true;
+
+  // Any opcode other than the vector moves below is rejected.
+  if (Opc != AMDGPU::V_MOV_B32_e32 && Opc != AMDGPU::V_MOV_B32_e64 &&
+      Opc != AMDGPU::V_MOV_B64_PSEUDO)
+    return false;
+
+  // A vector move carrying extra implicit register operands may be used for
+  // register indexing, in which case the source is not simply copied. Only
+  // accept it when the operand count matches the instruction description.
+  const MCInstrDesc &Desc = MI.getDesc();
+  const unsigned ExpectedOps =
+      Desc.getNumOperands() + Desc.getNumImplicitUses();
+  return MI.getNumOperands() == ExpectedOps;
+}
+
 static void removeModOperands(MachineInstr &MI) {
   unsigned Opc = MI.getOpcode();
   int Src0ModIdx = AMDGPU::getNamedOperandIdx(Opc,
@@ -1536,15 +1588,10 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
 
   if (Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64 ||
       Opc == AMDGPU::V_MAD_F16 || Opc == AMDGPU::V_MAC_F16_e64) {
-    bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
-
-    // Don't fold if we are using source modifiers. The new VOP2 instructions
-    // don't have them.
-    if (hasModifiersSet(UseMI, AMDGPU::OpName::src0_modifiers) ||
-        hasModifiersSet(UseMI, AMDGPU::OpName::src1_modifiers) ||
-        hasModifiersSet(UseMI, AMDGPU::OpName::src2_modifiers)) {
+    // Don't fold if we are using source or output modifiers. The new VOP2
+    // instructions don't have them.
+    if (hasAnyModifiersSet(UseMI))
       return false;
-    }
 
     const MachineOperand &ImmOp = DefMI.getOperand(1);
 
@@ -1557,6 +1604,7 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
     if (isInlineConstant(UseMI, *Src0, ImmOp))
       return false;
 
+    bool IsF32 = Opc == AMDGPU::V_MAD_F32 || Opc == AMDGPU::V_MAC_F32_e64;
     MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1);
     MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2);
 
@@ -1769,20 +1817,26 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineFunction::iterator &MBB,
 
   const MachineOperand *Dst = getNamedOperand(MI, AMDGPU::OpName::vdst);
   const MachineOperand *Src0 = getNamedOperand(MI, AMDGPU::OpName::src0);
+  const MachineOperand *Src0Mods =
+    getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
   const MachineOperand *Src1 = getNamedOperand(MI, AMDGPU::OpName::src1);
+  const MachineOperand *Src1Mods =
+    getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
   const MachineOperand *Src2 = getNamedOperand(MI, AMDGPU::OpName::src2);
+  const MachineOperand *Clamp = getNamedOperand(MI, AMDGPU::OpName::clamp);
+  const MachineOperand *Omod = getNamedOperand(MI, AMDGPU::OpName::omod);
 
   return BuildMI(*MBB, MI, MI.getDebugLoc(),
                  get(IsF16 ? AMDGPU::V_MAD_F16 : AMDGPU::V_MAD_F32))
       .add(*Dst)
-      .addImm(0) // Src0 mods
+      .addImm(Src0Mods ? Src0Mods->getImm() : 0)
       .add(*Src0)
-      .addImm(0) // Src1 mods
+      .addImm(Src1Mods ? Src1Mods->getImm() : 0)
       .add(*Src1)
       .addImm(0) // Src mods
       .add(*Src2)
-      .addImm(0)  // clamp
-      .addImm(0); // omod
+      .addImm(Clamp ? Clamp->getImm() : 0)
+      .addImm(Omod ? Omod->getImm() : 0);
 }
 
 // It's not generally safe to move VALU instructions across these since it will
@@ -1823,7 +1877,8 @@ bool SIInstrInfo::isInlineConstant(const APInt &Imm) const {
     return AMDGPU::isInlinableLiteral64(Imm.getSExtValue(),
                                         ST.hasInv2PiInlineImm());
   case 16:
-    return AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
+    return ST.has16BitInsts() &&
+           AMDGPU::isInlinableLiteral16(Imm.getSExtValue(),
                                         ST.hasInv2PiInlineImm());
   default:
     llvm_unreachable("invalid bitwidth");
@@ -1841,24 +1896,43 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
   // would be for any 32-bit integer operand, but would not be for a 64-bit one.
 
   int64_t Imm = MO.getImm();
-  switch (operandBitWidth(OperandType)) {
-  case 32: {
+  switch (OperandType) {
+  case AMDGPU::OPERAND_REG_IMM_INT32:
+  case AMDGPU::OPERAND_REG_IMM_FP32:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT32:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP32: {
     int32_t Trunc = static_cast<int32_t>(Imm);
     return Trunc == Imm &&
            AMDGPU::isInlinableLiteral32(Trunc, ST.hasInv2PiInlineImm());
   }
-  case 64: {
+  case AMDGPU::OPERAND_REG_IMM_INT64:
+  case AMDGPU::OPERAND_REG_IMM_FP64:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT64:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP64: {
     return AMDGPU::isInlinableLiteral64(MO.getImm(),
                                         ST.hasInv2PiInlineImm());
   }
-  case 16: {
+  case AMDGPU::OPERAND_REG_IMM_INT16:
+  case AMDGPU::OPERAND_REG_IMM_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_FP16: {
     if (isInt<16>(Imm) || isUInt<16>(Imm)) {
+      // A few special case instructions have 16-bit operands on subtargets
+      // where 16-bit instructions are not legal.
+      // TODO: Do the 32-bit immediates work? We shouldn't really need to handle
+      // constants in these cases
       int16_t Trunc = static_cast<int16_t>(Imm);
-      return AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
+      return ST.has16BitInsts() &&
+             AMDGPU::isInlinableLiteral16(Trunc, ST.hasInv2PiInlineImm());
     }
 
     return false;
   }
+  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16: {
+    uint32_t Trunc = static_cast<uint32_t>(Imm);
+    return  AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
+  }
   default:
     llvm_unreachable("invalid bitwidth");
   }
@@ -1937,6 +2011,14 @@ bool SIInstrInfo::hasModifiersSet(const MachineInstr &MI,
   return Mods && Mods->getImm();
 }
 
+bool SIInstrInfo::hasAnyModifiersSet(const MachineInstr &MI) const {
+  namespace Op = AMDGPU::OpName; // src0/1/2 modifiers, then clamp and omod.
+  return hasModifiersSet(MI, Op::src0_modifiers) ||
+         hasModifiersSet(MI, Op::src1_modifiers) ||
+         hasModifiersSet(MI, Op::src2_modifiers) ||
+         hasModifiersSet(MI, Op::clamp) || hasModifiersSet(MI, Op::omod);
+}
+
 bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
                                   const MachineOperand &MO,
                                   const MCOperandInfo &OpInfo) const {
@@ -3106,6 +3188,14 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
     case AMDGPU::S_BFE_U64:
     case AMDGPU::S_BFM_B64:
       llvm_unreachable("Moving this op to VALU not implemented");
+
+    case AMDGPU::S_PACK_LL_B32_B16:
+    case AMDGPU::S_PACK_LH_B32_B16:
+    case AMDGPU::S_PACK_HH_B32_B16: {
+      movePackToVALU(Worklist, MRI, Inst);
+      Inst.eraseFromParent();
+      continue;
+    }
     }
 
     if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
@@ -3163,12 +3253,15 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst) const {
     bool HasDst = Inst.getOperand(0).isReg() && Inst.getOperand(0).isDef();
     unsigned NewDstReg = AMDGPU::NoRegister;
     if (HasDst) {
+      unsigned DstReg = Inst.getOperand(0).getReg();
+      if (TargetRegisterInfo::isPhysicalRegister(DstReg))
+        continue;
+
       // Update the destination register class.
       const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);
       if (!NewDstRC)
         continue;
 
-      unsigned DstReg = Inst.getOperand(0).getReg();
       if (Inst.isCopy() &&
           TargetRegisterInfo::isVirtualRegister(Inst.getOperand(1).getReg()) &&
           NewDstRC == RI.getRegClassForReg(MRI, Inst.getOperand(1).getReg())) {
@@ -3456,6 +3549,82 @@ void SIInstrInfo::addUsersToMoveToVALUWorklist(
   }
 }
 
+void SIInstrInfo::movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
+                                 MachineRegisterInfo &MRI,
+                                 MachineInstr &Inst) const {
+  unsigned ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+  MachineBasicBlock *MBB = Inst.getParent();
+  MachineOperand &Src0 = Inst.getOperand(1);
+  MachineOperand &Src1 = Inst.getOperand(2);
+  const DebugLoc &DL = Inst.getDebugLoc();
+  // Expand the scalar pack into VALU instructions that define ResultReg.
+  switch (Inst.getOpcode()) {
+  case AMDGPU::S_PACK_LL_B32_B16: { // Result = (Src1 << 16) | (Src0 & 0xffff)
+    // v_pack_b32_f16 flushes denormals if not enabled. Use it if the default
+    // is to leave them untouched.
+    // XXX: Does this do anything to NaNs?
+    if (ST.hasFP16Denormals()) {
+      BuildMI(*MBB, Inst, DL, get(AMDGPU::V_PACK_B32_F16), ResultReg)
+        .addImm(0)  // src0_modifiers
+        .add(Src0)  // src0
+        .addImm(0)  // src1_modifiers
+        .add(Src1)  // src1
+        .addImm(0)  // clamp
+        .addImm(0); // omod
+    } else {
+      unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+      unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+      // FIXME: Can do a lot better if we know the high bits of src0 or src1 are
+      // 0.
+      BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
+        .addImm(0xffff);
+
+      BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_B32_e64), TmpReg)
+        .addReg(ImmReg, RegState::Kill)
+        .add(Src0);
+
+      BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHL_OR_B32), ResultReg)
+        .add(Src1)
+        .addImm(16)
+        .addReg(TmpReg, RegState::Kill);
+    }
+
+    break;
+  }
+  case AMDGPU::S_PACK_LH_B32_B16: { // Low half from Src0, high half from Src1.
+    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
+      .addImm(0xffff);
+    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_BFI_B32), ResultReg)
+      .addReg(ImmReg, RegState::Kill)
+      .add(Src0)
+      .add(Src1);
+    break;
+  }
+  case AMDGPU::S_PACK_HH_B32_B16: { // (Src0 >> 16) | (Src1 & 0xffff0000)
+    unsigned ImmReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
+      .addImm(16)
+      .add(Src0);
+    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_MOV_B32_e32), ImmReg)
+      .addImm(0xffff0000); // Mask that keeps only the high half of Src1.
+    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_AND_OR_B32), ResultReg)
+      .add(Src1)
+      .addReg(ImmReg, RegState::Kill)
+      .addReg(TmpReg, RegState::Kill);
+    break;
+  }
+  default:
+    llvm_unreachable("unhandled s_pack_* instruction");
+  }
+
+  MachineOperand &Dest = Inst.getOperand(0);
+  MRI.replaceRegWith(Dest.getReg(), ResultReg);
+  addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+}
+
 void SIInstrInfo::addSCCDefUsersToVALUWorklist(
     MachineInstr &SCCDefInst, SmallVectorImpl<MachineInstr *> &Worklist) const {
   // This assumes that all the users of SCC are in the same block
@@ -3578,10 +3747,13 @@ MachineOperand *SIInstrInfo::getNamedOperand(MachineInstr &MI,
 uint64_t SIInstrInfo::getDefaultRsrcDataFormat() const {
   uint64_t RsrcDataFormat = AMDGPU::RSRC_DATA_FORMAT;
   if (ST.isAmdHsaOS()) {
-    RsrcDataFormat |= (1ULL << 56);
+    // Set ATC = 1. GFX9 doesn't have this bit.
+    if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS)
+      RsrcDataFormat |= (1ULL << 56);
 
-    if (ST.getGeneration() >= SISubtarget::VOLCANIC_ISLANDS)
-      // Set MTYPE = 2
+    // Set MTYPE = 2 (MTYPE_UC = uncached). GFX9 doesn't have this.
+    // BTW, it disables TC L2 and therefore decreases performance.
+    if (ST.getGeneration() == SISubtarget::VOLCANIC_ISLANDS)
       RsrcDataFormat |= (2ULL << 59);
   }
 
@@ -3593,11 +3765,14 @@ uint64_t SIInstrInfo::getScratchRsrcWords23() const {
                     AMDGPU::RSRC_TID_ENABLE |
                     0xffffffff; // Size;
 
-  uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
+  // GFX9 doesn't have ELEMENT_SIZE.
+  if (ST.getGeneration() <= SISubtarget::VOLCANIC_ISLANDS) {
+    uint64_t EltSizeValue = Log2_32(ST.getMaxPrivateElementSize()) - 1;
+    Rsrc23 |= EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT;
+  }
 
-  Rsrc23 |= (EltSizeValue << AMDGPU::RSRC_ELEMENT_SIZE_SHIFT) |
-            // IndexStride = 64
-            (UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT);
+  // IndexStride = 64.
+  Rsrc23 |= UINT64_C(3) << AMDGPU::RSRC_INDEX_STRIDE_SHIFT;
 
   // If TID_ENABLE is set, DATA_FORMAT specifies stride bits [14:17].
   // Clear them unless we want a huge stride.
@@ -3626,7 +3801,7 @@ unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
     return AMDGPU::NoRegister;
 
   assert(!MI.memoperands_empty() &&
-         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUAS::PRIVATE_ADDRESS);
+         (*MI.memoperands_begin())->getAddrSpace() == AMDGPUASI.PRIVATE_ADDRESS);
 
   FrameIndex = Addr->getIndex();
   return getNamedOperand(MI, AMDGPU::OpName::vdata)->getReg();
@@ -3682,16 +3857,11 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
   if (DescSize != 0 && DescSize != 4)
     return DescSize;
 
-  if (Opc == AMDGPU::WAVE_BARRIER)
-    return 0;
-
   // 4-byte instructions may have a 32-bit literal encoded after them. Check
   // operands that coud ever be literals.
   if (isVALU(MI) || isSALU(MI)) {
-    if (isFixedSize(MI)) {
-      assert(DescSize == 4);
+    if (isFixedSize(MI))
       return DescSize;
-    }
 
     int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
     if (Src0Idx == -1)
@@ -3714,7 +3884,6 @@ unsigned SIInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
     return 4;
 
   switch (Opc) {
-  case AMDGPU::SI_MASK_BRANCH:
   case TargetOpcode::IMPLICIT_DEF:
   case TargetOpcode::KILL:
   case TargetOpcode::DBG_VALUE:
@@ -3739,7 +3908,7 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const {
     return true;
 
   for (const MachineMemOperand *MMO : MI.memoperands()) {
-    if (MMO->getAddrSpace() == AMDGPUAS::FLAT_ADDRESS)
+    if (MMO->getAddrSpace() == AMDGPUASI.FLAT_ADDRESS)
       return true;
   }
   return false;
diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h
index 5f53fd189173c48b896a1d3ccb676c988d8ed35e..18099abc1019374a9d9f4334d9af28f5f90a9a8b 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/lib/Target/AMDGPU/SIInstrInfo.h
@@ -69,6 +69,9 @@ private:
                             MachineInstr &Inst) const;
   void splitScalar64BitBFE(SmallVectorImpl<MachineInstr *> &Worklist,
                            MachineInstr &Inst) const;
+  void movePackToVALU(SmallVectorImpl<MachineInstr *> &Worklist,
+                      MachineRegisterInfo &MRI,
+                      MachineInstr &Inst) const;
 
   void addUsersToMoveToVALUWorklist(
     unsigned Reg, MachineRegisterInfo &MRI,
@@ -219,6 +222,8 @@ public:
   areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb,
                                   AliasAnalysis *AA = nullptr) const override;
 
+  bool isFoldableCopy(const MachineInstr &MI) const;
+
   bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
                      MachineRegisterInfo *MRI) const final;
 
@@ -440,6 +445,22 @@ public:
     return get(Opcode).TSFlags & SIInstrFlags::DPP;
   }
 
+  static bool isVOP3P(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::VOP3P;
+  }
+
+  bool isVOP3P(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::VOP3P;
+  }
+
+  static bool isVINTRP(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::VINTRP;
+  }
+
+  bool isVINTRP(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::VINTRP;
+  }
+
   static bool isScalarUnit(const MachineInstr &MI) {
     return MI.getDesc().TSFlags & (SIInstrFlags::SALU | SIInstrFlags::SMRD);
   }
@@ -474,6 +495,14 @@ public:
     return get(Opcode).TSFlags & SIInstrFlags::FIXED_SIZE;
   }
 
+  static bool hasFPClamp(const MachineInstr &MI) {
+    return MI.getDesc().TSFlags & SIInstrFlags::HasFPClamp;
+  }
+
+  bool hasFPClamp(uint16_t Opcode) const {
+    return get(Opcode).TSFlags & SIInstrFlags::HasFPClamp;
+  }
+
   bool isVGPRCopy(const MachineInstr &MI) const {
     assert(MI.isCopy());
     unsigned Dest = MI.getOperand(0).getReg();
@@ -482,28 +511,6 @@ public:
     return !RI.isSGPRReg(MRI, Dest);
   }
 
-  static int operandBitWidth(uint8_t OperandType) {
-    switch (OperandType) {
-    case AMDGPU::OPERAND_REG_IMM_INT32:
-    case AMDGPU::OPERAND_REG_IMM_FP32:
-    case AMDGPU::OPERAND_REG_INLINE_C_INT32:
-    case AMDGPU::OPERAND_REG_INLINE_C_FP32:
-      return 32;
-    case AMDGPU::OPERAND_REG_IMM_INT64:
-    case AMDGPU::OPERAND_REG_IMM_FP64:
-    case AMDGPU::OPERAND_REG_INLINE_C_INT64:
-    case AMDGPU::OPERAND_REG_INLINE_C_FP64:
-      return 64;
-    case AMDGPU::OPERAND_REG_INLINE_C_INT16:
-    case AMDGPU::OPERAND_REG_INLINE_C_FP16:
-    case AMDGPU::OPERAND_REG_IMM_INT16:
-    case AMDGPU::OPERAND_REG_IMM_FP16:
-      return 16;
-    default:
-      llvm_unreachable("unexpected operand type");
-    }
-  }
-
   bool isInlineConstant(const APInt &Imm) const;
 
   bool isInlineConstant(const MachineOperand &MO, uint8_t OperandType) const;
@@ -591,6 +598,7 @@ public:
 
   bool hasModifiersSet(const MachineInstr &MI,
                        unsigned OpName) const;
+  bool hasAnyModifiersSet(const MachineInstr &MI) const;
 
   bool verifyInstruction(const MachineInstr &MI,
                          StringRef &ErrInfo) const override;
@@ -762,6 +770,9 @@ namespace AMDGPU {
   LLVM_READONLY
   int getVOPe32(uint16_t Opcode);
 
+  LLVM_READONLY
+  int getSDWAOp(uint16_t Opcode);
+
   LLVM_READONLY
   int getCommuteRev(uint16_t Opcode);
 
diff --git a/lib/Target/AMDGPU/SIInstrInfo.td b/lib/Target/AMDGPU/SIInstrInfo.td
index 6f4746b7559a3170cb08465a3abc7036a19d70d3..c6daf743f3ac1e4157b377152fb5b7153013421e 100644
--- a/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/lib/Target/AMDGPU/SIInstrInfo.td
@@ -71,11 +71,6 @@ def SIbuffer_load : SDNode <"AMDGPUISD::BUFFER_LOAD", SDTBufferLoad,
 def SIbuffer_load_format : SDNode <"AMDGPUISD::BUFFER_LOAD_FORMAT", SDTBufferLoad,
                             [SDNPMemOperand, SDNPHasChain, SDNPMayLoad]>;
 
-def SIload_input : SDNode<"AMDGPUISD::LOAD_INPUT",
-  SDTypeProfile<1, 3, [SDTCisVT<0, v4f32>, SDTCisVT<1, v4i32>, SDTCisVT<2, i16>,
-                       SDTCisVT<3, i32>]>
->;
-
 class SDSample<string opcode> : SDNode <opcode,
   SDTypeProfile<1, 4, [SDTCisVT<0, v4f32>, SDTCisVT<2, v8i32>,
                        SDTCisVT<3, v4i32>, SDTCisVT<4, i32>]>
@@ -107,7 +102,7 @@ def SIld_local : SDNode <"ISD::LOAD", SDTLoad,
 >;
 
 def si_ld_local : PatFrag <(ops node:$ptr), (SIld_local node:$ptr), [{
-  return cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+  return cast<LoadSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
 }]>;
 
 def si_load_local : PatFrag <(ops node:$ptr), (si_ld_local node:$ptr), [{
@@ -144,7 +139,7 @@ def SIst_local : SDNode <"ISD::STORE", SDTStore,
 
 def si_st_local : PatFrag <
   (ops node:$val, node:$ptr), (SIst_local node:$val, node:$ptr), [{
-  return cast<StoreSDNode>(N)->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+  return cast<StoreSDNode>(N)->getAddressSpace() == AMDGPUASI.LOCAL_ADDRESS;
 }]>;
 
 def si_store_local : PatFrag <
@@ -196,6 +191,21 @@ def si_uniform_br_scc : PatFrag <
   return isCBranchSCC(N);
 }]>;
 
+def lshr_rev : PatFrag <
+  (ops node:$src1, node:$src0),
+  (srl $src0, $src1)
+>;
+
+def ashr_rev : PatFrag <
+  (ops node:$src1, node:$src0),
+  (sra $src0, $src1)
+>;
+
+def lshl_rev : PatFrag <
+  (ops node:$src1, node:$src0),
+  (shl $src0, $src1)
+>;
+
 multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0> {
 
   def _glue : SDNode <
@@ -458,6 +468,12 @@ class NamedOperandU32<string Name, AsmOperandClass MatchClass> : Operand<i32> {
   let ParserMatchClass = MatchClass;
 }
 
+class NamedOperandU32Default0<string Name, AsmOperandClass MatchClass> :
+  OperandWithDefaultOps<i32, (ops (i32 0))> {
+  let PrintMethod = "print"#Name;
+  let ParserMatchClass = MatchClass;
+}
+
 let OperandType = "OPERAND_IMMEDIATE" in {
 
 def offen : NamedOperandBit<"Offen", NamedMatchClass<"Offen">>;
@@ -495,6 +511,11 @@ def src0_sel : NamedOperandU32<"SDWASrc0Sel", NamedMatchClass<"SDWASrc0Sel">>;
 def src1_sel : NamedOperandU32<"SDWASrc1Sel", NamedMatchClass<"SDWASrc1Sel">>;
 def dst_unused : NamedOperandU32<"SDWADstUnused", NamedMatchClass<"SDWADstUnused">>;
 
+def op_sel : NamedOperandU32Default0<"OpSel", NamedMatchClass<"OpSel">>;
+def op_sel_hi : NamedOperandU32Default0<"OpSelHi", NamedMatchClass<"OpSelHi">>;
+def neg_lo : NamedOperandU32Default0<"NegLo", NamedMatchClass<"NegLo">>;
+def neg_hi : NamedOperandU32Default0<"NegHi", NamedMatchClass<"NegHi">>;
+
 def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>;
 
 def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> {
@@ -534,6 +555,7 @@ class FPInputModsMatchClass <int opSize> : AsmOperandClass {
   let ParserMethod = "parseRegOrImmWithFPInputMods";
   let PredicateMethod = "isRegOrImmWithFP"#opSize#"InputMods";
 }
+
 def FP16InputModsMatchClass : FPInputModsMatchClass<16>;
 def FP32InputModsMatchClass : FPInputModsMatchClass<32>;
 def FP64InputModsMatchClass : FPInputModsMatchClass<64>;
@@ -586,6 +608,33 @@ def IntVRegInputMods : InputMods <IntVRegInputModsMatchClass> {
   let PrintMethod = "printOperandAndIntInputMods";
 }
 
+class PackedFPInputModsMatchClass <int opSize> : AsmOperandClass {
+  let Name = "PackedFP"#opSize#"InputMods";
+  let ParserMethod = "parseRegOrImm";
+  let PredicateMethod = "isRegOrImm";
+  // TODO: use a dedicated "isPackedFP"#opSize#"InputMods" predicate? Confirm.
+}
+
+class PackedIntInputModsMatchClass <int opSize> : AsmOperandClass {
+  let Name = "PackedInt"#opSize#"InputMods";
+  let ParserMethod = "parseRegOrImm";
+  let PredicateMethod = "isRegOrImm";
+  // TODO: use a dedicated "isPackedInt"#opSize#"InputMods" predicate? Confirm.
+}
+
+def PackedF16InputModsMatchClass : PackedFPInputModsMatchClass<16>;
+def PackedI16InputModsMatchClass : PackedIntInputModsMatchClass<16>;
+
+class PackedFPInputMods <PackedFPInputModsMatchClass matchClass> : InputMods <matchClass> {
+  // TODO: presumably needs its own PrintMethod ("printPackedFPInputMods").
+}
+
+class PackedIntInputMods <PackedIntInputModsMatchClass matchClass> : InputMods <matchClass> {
+  // TODO: presumably needs its own PrintMethod ("printPackedIntInputMods").
+}
+
+def PackedF16InputMods : PackedFPInputMods<PackedF16InputModsMatchClass>;
+def PackedI16InputMods : PackedIntInputMods<PackedI16InputModsMatchClass>;
 
 //===----------------------------------------------------------------------===//
 // Complex patterns
@@ -602,6 +651,14 @@ def VOP3Mods0Clamp : ComplexPattern<untyped, 3, "SelectVOP3Mods0Clamp">;
 def VOP3Mods0Clamp0OMod : ComplexPattern<untyped, 4, "SelectVOP3Mods0Clamp0OMod">;
 def VOP3Mods  : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
 def VOP3NoMods : ComplexPattern<untyped, 2, "SelectVOP3NoMods">;
+// VOP3Mods, but the input source is known to never be NaN.
+def VOP3Mods_nnan : ComplexPattern<fAny, 2, "SelectVOP3Mods_NNaN">;
+
+def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">;
+
+def VOP3PMods  : ComplexPattern<untyped, 2, "SelectVOP3PMods">;
+def VOP3PMods0 : ComplexPattern<untyped, 3, "SelectVOP3PMods0">;
+
 
 //===----------------------------------------------------------------------===//
 // SI assembler operands
@@ -613,19 +670,32 @@ def SIOperand {
   int FLAT_SCR = 0x68;
 }
 
+// This should be kept in sync with SISrcMods enum
 def SRCMODS {
   int NONE = 0;
   int NEG = 1;
+  int ABS = 2;
+  int NEG_ABS = 3;
+
+  int NEG_HI = ABS;
+  int OP_SEL_0 = 4;
+  int OP_SEL_1 = 8;
 }
 
 def DSTCLAMP {
   int NONE = 0;
+  int ENABLE = 1;
 }
 
 def DSTOMOD {
   int NONE = 0;
 }
 
+def TRAPID {
+  int LLVM_TRAP = 2;       // simm16 of S_TRAP_PSEUDO selected for (trap).
+  int LLVM_DEBUG_TRAP = 3; // simm16 of S_TRAP_PSEUDO selected for (debugtrap).
+}
+
 //===----------------------------------------------------------------------===//
 //
 // SI Instruction multiclass helpers.
@@ -718,12 +788,34 @@ class getVALUDstForVT<ValueType VT> {
 // instructions for the given VT.
 class getVOPSrc0ForVT<ValueType VT> {
   bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+             !if(!eq(VT.Value, v2f16.Value), 1,
              !if(!eq(VT.Value, f32.Value), 1,
              !if(!eq(VT.Value, f64.Value), 1,
-             0)));
-  RegisterOperand ret = !if(isFP,
-                            !if(!eq(VT.Size, 64), VSrc_f64, !if(!eq(VT.Size, 16), VSrc_f16, VSrc_f32)),
-                            !if(!eq(VT.Size, 64), VSrc_b64, !if(!eq(VT.Size, 16), VSrc_b16, VSrc_b32)));
+             0))));
+
+  RegisterOperand ret =
+    !if(isFP,
+      !if(!eq(VT.Size, 64),
+         VSrc_f64,
+         !if(!eq(VT.Value, f16.Value),
+            VSrc_f16,
+            !if(!eq(VT.Value, v2f16.Value),
+               VCSrc_v2f16,
+               VSrc_f32
+            )
+         )
+       ),
+       !if(!eq(VT.Size, 64),
+          VSrc_b64,
+          !if(!eq(VT.Value, i16.Value),
+             VSrc_b16,
+             !if(!eq(VT.Value, v2i16.Value),
+                VCSrc_v2b16,
+                VSrc_b32
+             )
+          )
+       )
+    );
 }
 
 // Returns the vreg register class to use for source operand given VT
@@ -737,25 +829,38 @@ class getVregSrcForVT<ValueType VT> {
 // given VT.
 class getVOP3SrcForVT<ValueType VT> {
   bit isFP = !if(!eq(VT.Value, f16.Value), 1,
+             !if(!eq(VT.Value, v2f16.Value), 1,
              !if(!eq(VT.Value, f32.Value), 1,
              !if(!eq(VT.Value, f64.Value), 1,
-             0)));
+             0))));
   RegisterOperand ret =
   !if(!eq(VT.Size, 128),
-      VSrc_128,
-    !if(!eq(VT.Size, 64),
+     VSrc_128,
+     !if(!eq(VT.Size, 64),
         !if(isFP,
-            VCSrc_f64,
-            VCSrc_b64),
+           VCSrc_f64,
+           VCSrc_b64),
         !if(!eq(VT.Value, i1.Value),
-            SCSrc_b64,
-            !if(isFP,
-                !if(!eq(VT.Size, 16), VCSrc_f16, VCSrc_f32),
-                !if(!eq(VT.Size, 16), VCSrc_b16, VCSrc_b32)
-            )
-         )
-	   )
-     );
+           SCSrc_b64,
+           !if(isFP,
+              !if(!eq(VT.Value, f16.Value),
+                 VCSrc_f16,
+                 !if(!eq(VT.Value, v2f16.Value),
+                    VCSrc_v2f16,
+                    VCSrc_f32
+                 )
+              ),
+              !if(!eq(VT.Value, i16.Value),
+                 VCSrc_b16,
+                 !if(!eq(VT.Value, v2i16.Value),
+                    VCSrc_v2b16,
+                    VCSrc_b32
+                 )
+              )
+           )
+        )
+     )
+  );
 }
 
 // Returns 1 if the source arguments have modifiers, 0 if they do not.
@@ -765,7 +870,8 @@ class isFloatType<ValueType SrcVT> {
     !if(!eq(SrcVT.Value, f16.Value), 1,
     !if(!eq(SrcVT.Value, f32.Value), 1,
     !if(!eq(SrcVT.Value, f64.Value), 1,
-    0)));
+    !if(!eq(SrcVT.Value, v2f16.Value), 1,
+    0))));
 }
 
 class isIntType<ValueType SrcVT> {
@@ -776,6 +882,23 @@ class isIntType<ValueType SrcVT> {
     0)));
 }
 
+class isPackedType<ValueType SrcVT> {
+  bit ret =
+    !if(!eq(SrcVT.Value, v2i16.Value), 1,
+      !if(!eq(SrcVT.Value, v2f16.Value), 1, 0)
+    );
+}
+
+// Float or packed int
+class isModifierType<ValueType SrcVT> {
+  bit ret =
+    !if(!eq(SrcVT.Value, f16.Value), 1,
+    !if(!eq(SrcVT.Value, f32.Value), 1,
+    !if(!eq(SrcVT.Value, f64.Value), 1,
+    !if(!eq(SrcVT.Value, v2f16.Value), 1,
+    !if(!eq(SrcVT.Value, v2i16.Value), 1,
+    0)))));
+}
 
 // Return type of input modifiers operand for specified input operand
 class getSrcMod <ValueType VT> {
@@ -783,6 +906,7 @@ class getSrcMod <ValueType VT> {
                !if(!eq(VT.Value, f32.Value), 1,
                !if(!eq(VT.Value, f64.Value), 1,
                0)));
+  bit isPacked = isPackedType<VT>.ret;
   Operand ret =  !if(!eq(VT.Size, 64),
                      !if(isFP, FP64InputMods, Int64InputMods),
                        !if(isFP,
@@ -813,8 +937,8 @@ class getIns32 <RegisterOperand Src0RC, RegisterClass Src1RC, int NumSrcArgs> {
 // Returns the input arguments for VOP3 instructions for the given SrcVT.
 class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
                 RegisterOperand Src2RC, int NumSrcArgs,
-                bit HasModifiers, Operand Src0Mod, Operand Src1Mod,
-                Operand Src2Mod> {
+                bit HasModifiers, bit HasOMod,
+                Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
 
   dag ret =
     !if (!eq(NumSrcArgs, 0),
@@ -833,9 +957,13 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
     !if (!eq(NumSrcArgs, 2),
       !if (!eq(HasModifiers, 1),
         // VOP 2 with modifiers
-        (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
-             Src1Mod:$src1_modifiers, Src1RC:$src1,
-             clampmod:$clamp, omod:$omod)
+        !if( !eq(HasOMod, 1),
+          (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+               Src1Mod:$src1_modifiers, Src1RC:$src1,
+               clampmod:$clamp, omod:$omod),
+           (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+               Src1Mod:$src1_modifiers, Src1RC:$src1,
+               clampmod:$clamp))
       /* else */,
         // VOP2 without modifiers
         (ins Src0RC:$src0, Src1RC:$src1)
@@ -843,16 +971,57 @@ class getIns64 <RegisterOperand Src0RC, RegisterOperand Src1RC,
     /* NumSrcArgs == 3 */,
       !if (!eq(HasModifiers, 1),
         // VOP3 with modifiers
-        (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
-             Src1Mod:$src1_modifiers, Src1RC:$src1,
-             Src2Mod:$src2_modifiers, Src2RC:$src2,
-             clampmod:$clamp, omod:$omod)
+        !if (!eq(HasOMod, 1),
+          (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+               Src1Mod:$src1_modifiers, Src1RC:$src1,
+               Src2Mod:$src2_modifiers, Src2RC:$src2,
+               clampmod:$clamp, omod:$omod),
+          (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+               Src1Mod:$src1_modifiers, Src1RC:$src1,
+               Src2Mod:$src2_modifiers, Src2RC:$src2,
+               clampmod:$clamp))
       /* else */,
         // VOP3 without modifiers
         (ins Src0RC:$src0, Src1RC:$src1, Src2RC:$src2)
       /* endif */ ))));
 }
 
+/// XXX - src1 may only allow VGPRs?
+
+// The modifiers (except clamp) are dummy operands for the benefit of
+// printing and parsing. They defer their values to looking at the
+// srcN_modifiers for what to print.
+class getInsVOP3P <RegisterOperand Src0RC, RegisterOperand Src1RC,
+                   RegisterOperand Src2RC, int NumSrcArgs,
+                   bit HasClamp,
+                   Operand Src0Mod, Operand Src1Mod, Operand Src2Mod> {
+  dag ret = !if (!eq(NumSrcArgs, 2),
+    !if (HasClamp,
+      (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+           Src1Mod:$src1_modifiers, Src1RC:$src1,
+           clampmod:$clamp,
+           op_sel:$op_sel, op_sel_hi:$op_sel_hi,
+           neg_lo:$neg_lo, neg_hi:$neg_hi),
+      (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+           Src1Mod:$src1_modifiers, Src1RC:$src1,
+           op_sel:$op_sel, op_sel_hi:$op_sel_hi,
+           neg_lo:$neg_lo, neg_hi:$neg_hi)),
+    // else NumSrcArgs == 3
+    !if (HasClamp,
+      (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+           Src1Mod:$src1_modifiers, Src1RC:$src1,
+           Src2Mod:$src2_modifiers, Src2RC:$src2,
+           clampmod:$clamp,
+           op_sel:$op_sel, op_sel_hi:$op_sel_hi,
+           neg_lo:$neg_lo, neg_hi:$neg_hi),
+      (ins Src0Mod:$src0_modifiers, Src0RC:$src0,
+           Src1Mod:$src1_modifiers, Src1RC:$src1,
+           Src2Mod:$src2_modifiers, Src2RC:$src2,
+           op_sel:$op_sel, op_sel_hi:$op_sel_hi,
+           neg_lo:$neg_lo, neg_hi:$neg_hi))
+  );
+}
+
 class getInsDPP <RegisterClass Src0RC, RegisterClass Src1RC, int NumSrcArgs,
                  bit HasModifiers, Operand Src0Mod, Operand Src1Mod> {
 
@@ -936,7 +1105,8 @@ class getAsm32 <bit HasDst, int NumSrcArgs, ValueType DstVT = i32> {
 
 // Returns the assembly string for the inputs and outputs of a VOP3
 // instruction.
-class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
+class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers,
+                bit HasOMod, ValueType DstVT = i32> {
   string dst = !if(!eq(DstVT.Size, 1), "$sdst", "$vdst"); // use $sdst for VOPC
   string src0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,");
   string src1 = !if(!eq(NumSrcArgs, 1), "",
@@ -946,7 +1116,26 @@ class getAsm64 <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT =
   string ret =
   !if(!eq(HasModifiers, 0),
       getAsm32<HasDst, NumSrcArgs, DstVT>.ret,
-      dst#", "#src0#src1#src2#"$clamp"#"$omod");
+      dst#", "#src0#src1#src2#"$clamp"#!if(HasOMod, "$omod", ""));
+}
+
+// Returns the assembly string for the inputs and outputs of a VOP3P
+// instruction.
+class getAsmVOP3P <bit HasDst, int NumSrcArgs, bit HasModifiers,
+                   bit HasClamp, ValueType DstVT = i32> {
+  string dst = " $vdst";
+  string src0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,");
+  string src1 = !if(!eq(NumSrcArgs, 1), "",
+                   !if(!eq(NumSrcArgs, 2), " $src1",
+                                           " $src1,"));
+  string src2 = !if(!eq(NumSrcArgs, 3), " $src2", "");
+
+  string mods = !if(HasModifiers, "$neg_lo$neg_hi", "");
+  string clamp = !if(HasClamp, "$clamp", "");
+
+  // Each modifier is printed as an array of bits for each operand, so
+  // all operands are printed as part of src0_modifiers.
+  string ret = dst#", "#src0#src1#src2#"$op_sel$op_sel_hi"#mods#clamp;
 }
 
 class getAsmDPP <bit HasDst, int NumSrcArgs, bit HasModifiers, ValueType DstVT = i32> {
@@ -1058,7 +1247,7 @@ class VOPProfile <list<ValueType> _ArgVT> {
   field bit HasSrc2 = !if(!eq(Src2VT.Value, untyped.Value), 0, 1);
 
   // TODO: Modifiers logic is somewhat adhoc here, to be refined later
-  field bit HasModifiers = isFloatType<Src0VT>.ret;
+  field bit HasModifiers = isModifierType<Src0VT>.ret;
 
   field bit HasSrc0FloatMods = isFloatType<Src0VT>.ret;
   field bit HasSrc1FloatMods = isFloatType<Src1VT>.ret;
@@ -1072,12 +1261,20 @@ class VOPProfile <list<ValueType> _ArgVT> {
   field bit HasSrc1Mods = !if(HasModifiers, BitOr<HasSrc1FloatMods, HasSrc1IntMods>.ret, 0);
   field bit HasSrc2Mods = !if(HasModifiers, BitOr<HasSrc2FloatMods, HasSrc2IntMods>.ret, 0);
 
-  field bit HasOMod = HasModifiers;
   field bit HasClamp = HasModifiers;
   field bit HasSDWAClamp = HasSrc0;
+  field bit HasFPClamp = BitAnd<isFloatType<DstVT>.ret, HasClamp>.ret;
+
+  field bit IsPacked = isPackedType<Src0VT>.ret;
+  field bit HasOpSel = IsPacked;
+  field bit HasOMod = !if(HasOpSel, 0, HasModifiers);
 
   field bit HasExt = getHasExt<NumSrcArgs, DstVT, Src0VT, Src1VT>.ret;
 
+  field Operand Src0PackedMod = !if(HasSrc0FloatMods, PackedF16InputMods, PackedI16InputMods);
+  field Operand Src1PackedMod = !if(HasSrc1FloatMods, PackedF16InputMods, PackedI16InputMods);
+  field Operand Src2PackedMod = !if(HasSrc2FloatMods, PackedF16InputMods, PackedI16InputMods);
+
   field dag Outs = !if(HasDst,(outs DstRC:$vdst),(outs));
 
   // VOP3b instructions are a special case with a second explicit
@@ -1089,7 +1286,12 @@ class VOPProfile <list<ValueType> _ArgVT> {
 
   field dag Ins32 = getIns32<Src0RC32, Src1RC32, NumSrcArgs>.ret;
   field dag Ins64 = getIns64<Src0RC64, Src1RC64, Src2RC64, NumSrcArgs,
-                             HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret;
+                             HasModifiers, HasOMod, Src0Mod, Src1Mod,
+                             Src2Mod>.ret;
+  field dag InsVOP3P = getInsVOP3P<Src0RC64, Src1RC64, Src2RC64,
+                                   NumSrcArgs, HasClamp,
+                                   Src0PackedMod, Src1PackedMod, Src2PackedMod>.ret;
+
   field dag InsDPP = getInsDPP<Src0DPP, Src1DPP, NumSrcArgs,
                                HasModifiers, Src0ModDPP, Src1ModDPP>.ret;
   field dag InsSDWA = getInsSDWA<Src0SDWA, Src1SDWA, NumSrcArgs,
@@ -1097,7 +1299,8 @@ class VOPProfile <list<ValueType> _ArgVT> {
                                  DstVT>.ret;
 
   field string Asm32 = getAsm32<HasDst, NumSrcArgs, DstVT>.ret;
-  field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
+  field string Asm64 = getAsm64<HasDst, NumSrcArgs, HasModifiers, HasOMod, DstVT>.ret;
+  field string AsmVOP3P = getAsmVOP3P<HasDst, NumSrcArgs, HasModifiers, HasClamp, DstVT>.ret;
   field string AsmDPP = getAsmDPP<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
   field string AsmSDWA = getAsmSDWA<HasDst, NumSrcArgs, HasModifiers, DstVT>.ret;
 }
@@ -1113,11 +1316,18 @@ def VOP_I16_F16 : VOPProfile <[i16, f16, untyped, untyped]>;
 def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
 def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
 def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
-def VOP_I16_I16_I16 : VOPProfile <[i32, i32, i32, untyped]>;
+def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>;
 
-def VOP_I16_I16_I16_I16 : VOPProfile <[i32, i32, i32, i32, untyped]>;
+def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
 def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
 
+def VOP_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, untyped]>;
+def VOP_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, untyped]>;
+def VOP_B32_F16_F16 : VOPProfile <[i32, f16, f16, untyped]>;
+
+def VOP_V2F16_V2F16_V2F16_V2F16 : VOPProfile <[v2f16, v2f16, v2f16, v2f16]>;
+def VOP_V2I16_V2I16_V2I16_V2I16 : VOPProfile <[v2i16, v2i16, v2i16, v2i16]>;
+
 def VOP_NONE : VOPProfile <[untyped, untyped, untyped, untyped]>;
 
 def VOP_F32_F32 : VOPProfile <[f32, f32, untyped, untyped]>;
@@ -1129,6 +1339,8 @@ def VOP_F64_I32 : VOPProfile <[f64, i32, untyped, untyped]>;
 def VOP_I32_F32 : VOPProfile <[i32, f32, untyped, untyped]>;
 def VOP_I32_F64 : VOPProfile <[i32, f64, untyped, untyped]>;
 def VOP_I32_I32 : VOPProfile <[i32, i32, untyped, untyped]>;
+def VOP_F16_F32 : VOPProfile <[f16, f32, untyped, untyped]>;
+def VOP_F32_F16 : VOPProfile <[f32, f16, untyped, untyped]>;
 
 def VOP_F32_F32_F16 : VOPProfile <[f32, f32, f16, untyped]>;
 def VOP_F32_F32_F32 : VOPProfile <[f32, f32, f32, untyped]>;
@@ -1138,6 +1350,7 @@ def VOP_F64_F64_I32 : VOPProfile <[f64, f64, i32, untyped]>;
 def VOP_I32_F32_F32 : VOPProfile <[i32, f32, f32, untyped]>;
 def VOP_I32_F32_I32 : VOPProfile <[i32, f32, i32, untyped]>;
 def VOP_I32_I32_I32 : VOPProfile <[i32, i32, i32, untyped]>;
+def VOP_V2F16_F32_F32 : VOPProfile <[v2f16, f32, f32, untyped]>;
 
 def VOP_I64_I64_I32 : VOPProfile <[i64, i64, i32, untyped]>;
 def VOP_I64_I32_I64 : VOPProfile <[i64, i32, i64, untyped]>;
@@ -1225,6 +1438,15 @@ def getVOPe32 : InstrMapping {
   let ValueCols = [["4", "0"]];
 }
 
+// Maps ordinary instructions to their SDWA counterparts
+def getSDWAOp : InstrMapping {
+  let FilterClass = "VOP";
+  let RowFields = ["OpName"];
+  let ColFields = ["AsmVariantName"];
+  let KeyCol = ["Default"];
+  let ValueCols = [["SDWA"]];
+}
+
 def getMaskedMIMGOp : InstrMapping {
   let FilterClass = "MIMG_Mask";
   let RowFields = ["Op"];
diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td
index 06516b24f329f60a3f99dad05b3f09b06f3a0b55..2f89503e129a3abdb1260e1cc33b8feeaca4474e 100644
--- a/lib/Target/AMDGPU/SIInstructions.td
+++ b/lib/Target/AMDGPU/SIInstructions.td
@@ -111,8 +111,7 @@ def V_MOV_B64_PSEUDO : VPseudoInstSI <(outs VReg_64:$vdst),
                                       (ins VSrc_b64:$src0)>;
 } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
 
-def S_TRAP_PSEUDO : VPseudoInstSI <(outs), (ins),
-  [(trap)]> {
+def S_TRAP_PSEUDO : SPseudoInstSI <(outs), (ins i16imm:$simm16)> {
   let hasSideEffects = 1;
   let SALU = 1;
   let usesCustomInserter = 1;
@@ -153,6 +152,8 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
   let mayStore = 1;
   let isBarrier = 1;
   let isConvergent = 1;
+  let FixedSize = 1;
+  let Size = 0;
 }
 
 // SI pseudo instructions. These are used by the CFG structurizer pass
@@ -160,48 +161,44 @@ def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
 
 // Dummy terminator instruction to use after control flow instructions
 // replaced with exec mask operations.
-def SI_MASK_BRANCH : PseudoInstSI <
+def SI_MASK_BRANCH : VPseudoInstSI <
   (outs), (ins brtarget:$target)> {
   let isBranch = 0;
   let isTerminator = 1;
   let isBarrier = 0;
-  let Uses = [EXEC];
   let SchedRW = [];
   let hasNoSchedulingInfo = 1;
+  let FixedSize = 1;
+  let Size = 0;
 }
 
 let isTerminator = 1 in {
 
 def SI_IF: CFPseudoInstSI <
   (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target),
-  [(set i64:$dst, (int_amdgcn_if i1:$vcc, bb:$target))], 1, 1> {
+  [(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> {
   let Constraints = "";
   let Size = 12;
-  let mayLoad = 1;
-  let mayStore = 1;
   let hasSideEffects = 1;
 }
 
 def SI_ELSE : CFPseudoInstSI <
-  (outs SReg_64:$dst), (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
+  (outs SReg_64:$dst),
+  (ins SReg_64:$src, brtarget:$target, i1imm:$execfix), [], 1, 1> {
   let Constraints = "$src = $dst";
   let Size = 12;
-  let mayStore = 1;
-  let mayLoad = 1;
   let hasSideEffects = 1;
 }
 
 def SI_LOOP : CFPseudoInstSI <
   (outs), (ins SReg_64:$saved, brtarget:$target),
-  [(int_amdgcn_loop i64:$saved, bb:$target)], 1, 1> {
+  [(AMDGPUloop i64:$saved, bb:$target)], 1, 1> {
   let Size = 8;
-  let isBranch = 1;
+  let isBranch = 0;
   let hasSideEffects = 1;
-  let mayLoad = 1;
-  let mayStore = 1;
 }
 
-} // End isBranch = 1, isTerminator = 1
+} // End isTerminator = 1
 
 def SI_END_CF : CFPseudoInstSI <
   (outs), (ins SReg_64:$saved),
@@ -209,9 +206,9 @@ def SI_END_CF : CFPseudoInstSI <
   let Size = 4;
   let isAsCheapAsAMove = 1;
   let isReMaterializable = 1;
-  let mayLoad = 1;
-  let mayStore = 1;
   let hasSideEffects = 1;
+  let mayLoad = 1; // FIXME: Should not need memory flags
+  let mayStore = 1;
 }
 
 def SI_BREAK : CFPseudoInstSI <
@@ -251,6 +248,10 @@ def SI_KILL_TERMINATOR : SPseudoInstSI <
   let isTerminator = 1;
 }
 
+def SI_ILLEGAL_COPY : SPseudoInstSI <
+  (outs unknown:$dst), (ins unknown:$src),
+  [], " ; illegal copy $src to $dst">;
+
 } // End Uses = [EXEC], Defs = [EXEC,VCC]
 
 // Branch on undef scc. Used to avoid intermediate copy from
@@ -266,6 +267,14 @@ def SI_PS_LIVE : PseudoInstSI <
   let SALU = 1;
 }
 
+def SI_MASKED_UNREACHABLE : SPseudoInstSI <(outs), (ins),
+  [(int_amdgcn_unreachable)],
+  "; divergent unreachable"> {
+  let Size = 0;
+  let hasNoSchedulingInfo = 1;
+  let FixedSize = 1;
+}
+
 // Used as an isel pseudo to directly emit initialization with an
 // s_mov_b32 rather than a copy of another initialized
 // register. MachineCSE skips copies, and we don't want to have to
@@ -277,12 +286,12 @@ def SI_INIT_M0 : SPseudoInstSI <(outs), (ins SSrc_b32:$src)> {
   let isReMaterializable = 1;
 }
 
-def SI_RETURN : SPseudoInstSI <
-  (outs), (ins variable_ops), [(AMDGPUreturn)]> {
+// Return for returning shaders to a shader variant epilog.
+def SI_RETURN_TO_EPILOG : SPseudoInstSI <
+  (outs), (ins variable_ops), [(AMDGPUreturn_to_epilog)]> {
   let isTerminator = 1;
   let isBarrier = 1;
   let isReturn = 1;
-  let hasSideEffects = 1;
   let hasNoSchedulingInfo = 1;
   let DisableWQM = 1;
 }
@@ -390,9 +399,18 @@ def SI_PC_ADD_REL_OFFSET : SPseudoInstSI <
 } // End SubtargetPredicate = isGCN
 
 let Predicates = [isGCN] in {
+def : Pat<
+  (trap),
+  (S_TRAP_PSEUDO TRAPID.LLVM_TRAP)
+>;
 
 def : Pat<
-  (int_amdgcn_else i64:$src, bb:$target),
+  (debugtrap),
+  (S_TRAP_PSEUDO TRAPID.LLVM_DEBUG_TRAP)
+>;
+
+def : Pat<
+  (AMDGPUelse i64:$src, bb:$target),
   (SI_ELSE $src, $target, 0)
 >;
 
@@ -430,9 +448,26 @@ def : Pat <
 
 } // End Predicates = [UnsafeFPMath]
 
+
+// f16_to_fp patterns
 def : Pat <
-  (f32 (fpextend f16:$src)),
-  (V_CVT_F32_F16_e32 $src)
+  (f32 (f16_to_fp i32:$src0)),
+  (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : Pat <
+  (f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
+  (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : Pat <
+  (f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
+  (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : Pat <
+  (f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
+  (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
 >;
 
 def : Pat <
@@ -440,9 +475,10 @@ def : Pat <
   (V_CVT_F64_F32_e32 (V_CVT_F32_F16_e32 $src))
 >;
 
+// fp_to_fp16 patterns
 def : Pat <
-  (f16 (fpround f32:$src)),
-  (V_CVT_F16_F32_e32 $src)
+  (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)))),
+  (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, $clamp, $omod)
 >;
 
 def : Pat <
@@ -482,6 +518,16 @@ multiclass FMADPat <ValueType vt, Instruction inst> {
 defm : FMADPat <f16, V_MAC_F16_e64>;
 defm : FMADPat <f32, V_MAC_F32_e64>;
 
+class FMADModsPat<Instruction inst, SDPatternOperator mad_opr> : Pat<
+  (f32 (mad_opr (VOP3Mods f32:$src0, i32:$src0_mod),
+  (VOP3Mods f32:$src1, i32:$src1_mod),
+  (VOP3Mods f32:$src2, i32:$src2_mod))),
+  (inst $src0_mod, $src0, $src1_mod, $src1,
+  $src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz>;
+
 multiclass SelectPat <ValueType vt, Instruction inst> {
   def : Pat <
     (vt (select i1:$src0, vt:$src1, vt:$src2)),
@@ -580,6 +626,16 @@ def : BitConvert <i32, f32, VGPR_32>;
 def : BitConvert <f32, i32, VGPR_32>;
 def : BitConvert <i32, f32, SReg_32>;
 def : BitConvert <f32, i32, SReg_32>;
+def : BitConvert <v2i16, i32, SReg_32>;
+def : BitConvert <i32, v2i16, SReg_32>;
+def : BitConvert <v2f16, i32, SReg_32>;
+def : BitConvert <i32, v2f16, SReg_32>;
+def : BitConvert <v2i16, v2f16, SReg_32>;
+def : BitConvert <v2f16, v2i16, SReg_32>;
+def : BitConvert <v2f16, f32, SReg_32>;
+def : BitConvert <f32, v2f16, SReg_32>;
+def : BitConvert <v2i16, f32, SReg_32>;
+def : BitConvert <f32, v2i16, SReg_32>;
 
 // 64-bit bitcast
 def : BitConvert <i64, f64, VReg_64>;
@@ -621,12 +677,20 @@ def : BitConvert <v16f32, v16i32, VReg_512>;
 /********** Src & Dst modifiers **********/
 /********** =================== **********/
 
-def : Pat <
-  (AMDGPUclamp (VOP3Mods0Clamp f32:$src0, i32:$src0_modifiers, i32:$omod),
-               (f32 FP_ZERO), (f32 FP_ONE)),
-  (V_ADD_F32_e64 $src0_modifiers, $src0, 0, (i32 0), 1, $omod)
+
+// If denormals are not enabled, it only impacts the compare of the
+// inputs. The output result is not flushed.
+class ClampPat<Instruction inst, ValueType vt> : Pat <
+  (vt (AMDGPUclamp
+        (VOP3Mods0Clamp vt:$src0, i32:$src0_modifiers, i32:$omod))),
+  (inst i32:$src0_modifiers, vt:$src0,
+        i32:$src0_modifiers, vt:$src0, DSTCLAMP.ENABLE, $omod)
 >;
 
+def : ClampPat<V_MAX_F32_e64, f32>;
+def : ClampPat<V_MAX_F64, f64>;
+def : ClampPat<V_MAX_F16_e64, f16>;
+
 /********** ================================ **********/
 /********** Floating point absolute/negative **********/
 /********** ================================ **********/
@@ -725,6 +789,25 @@ def : Pat <
   (S_OR_B32 $src, (S_MOV_B32 (i32 0x00008000))) // Set sign bit
 >;
 
+def : Pat <
+  (fneg v2f16:$src),
+  (V_XOR_B32_e64 (S_MOV_B32 (i32 0x80008000)), $src)
+>;
+
+def : Pat <
+  (fabs v2f16:$src),
+  (V_AND_B32_e64 (S_MOV_B32 (i32 0x7fff7fff)), $src)
+>;
+
+// This is really (fneg (fabs v2f16:$src))
+//
+// fabs is not reported as free because there is no modifier for it in
+// VOP3P instructions, so it is turned into the bit op.
+def : Pat <
+  (fneg (v2f16 (bitconvert (and_oneuse i32:$src, 0x7fff7fff)))),
+  (S_OR_B32 (S_MOV_B32 (i32 0x80008000)), $src) // Set sign bit
+>;
+
 /********** ================== **********/
 /********** Immediate Patterns **********/
 /********** ================== **********/
@@ -791,27 +874,6 @@ def : Pat <
 
 def : POW_Common <V_LOG_F32_e32, V_EXP_F32_e32, V_MUL_LEGACY_F32_e32>;
 
-def : Pat <
-  (int_AMDGPU_cube v4f32:$src),
-  (REG_SEQUENCE VReg_128,
-    (V_CUBETC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
-                  0 /* src1_modifiers */, (f32 (EXTRACT_SUBREG $src, sub1)),
-                  0 /* src2_modifiers */, (f32 (EXTRACT_SUBREG $src, sub2)),
-                  0 /* clamp */, 0 /* omod */), sub0,
-    (V_CUBESC_F32 0 /* src0_modifiers */, (f32 (EXTRACT_SUBREG $src, sub0)),
-                  0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
-                  0 /* src2_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
-                  0 /* clamp */, 0 /* omod */), sub1,
-    (V_CUBEMA_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
-                  0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
-                  0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
-                  0 /* clamp */, 0 /* omod */), sub2,
-    (V_CUBEID_F32 0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub0)),
-                  0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub1)),
-                  0 /* src1_modifiers */,(f32 (EXTRACT_SUBREG $src, sub2)),
-                  0 /* clamp */, 0 /* omod */), sub3)
->;
-
 def : Pat <
   (i32 (sext i1:$src0)),
   (V_CNDMASK_B32_e64 (i32 0), (i32 -1), $src0)
@@ -1018,6 +1080,11 @@ def : Pat <
 //===----------------------------------------------------------------------===//
 // Miscellaneous Patterns
 //===----------------------------------------------------------------------===//
+def : Pat <
+  (i32 (AMDGPUfp16_zext f16:$src)),
+  (COPY $src)
+>;
+
 
 def : Pat <
   (i32 (trunc i64:$a)),
@@ -1029,6 +1096,11 @@ def : Pat <
   (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
 >;
 
+def : Pat <
+  (i1 (trunc i16:$a)),
+  (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1), $a), (i32 1))
+>;
+
 def : Pat <
   (i1 (trunc i64:$a)),
   (V_CMP_EQ_U32_e64 (S_AND_B32 (i32 1),
@@ -1056,24 +1128,29 @@ multiclass BFMPatterns <ValueType vt, InstSI BFM, InstSI MOV> {
 
 defm : BFMPatterns <i32, S_BFM_B32, S_MOV_B32>;
 // FIXME: defm : BFMPatterns <i64, S_BFM_B64, S_MOV_B64>;
+defm : BFEPattern <V_BFE_U32, V_BFE_I32, S_MOV_B32>;
 
-def : BFEPattern <V_BFE_U32, S_MOV_B32>;
+def : Pat<
+  (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
+  (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src, 0, 0)
+>;
 
 def : Pat<
-  (fcanonicalize f16:$src),
-  (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), 0, $src, 0, 0)
+  (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
+  (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src, 0, 0)
 >;
 
 def : Pat<
-  (fcanonicalize f32:$src),
-  (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), 0, $src, 0, 0)
+  (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
+  (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src, 0, 0)
 >;
 
 def : Pat<
-  (fcanonicalize f64:$src),
-  (V_MUL_F64 0, CONST.FP64_ONE, 0, $src, 0, 0)
+  (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
+  (V_PK_MUL_F16 SRCMODS.OP_SEL_1, (i32 CONST.V2FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
 >;
 
+
 // Allow integer inputs
 class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : Pat<
   (node (i8 timm:$tgt), (i8 timm:$en), vt:$src0, vt:$src1, vt:$src2, vt:$src3, (i1 timm:$compr), (i1 timm:$vm)),
@@ -1083,6 +1160,40 @@ class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : Pat<
 def : ExpPattern<AMDGPUexport, i32, EXP>;
 def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
 
+def : Pat <
+  (v2i16 (build_vector i16:$src0, i16:$src1)),
+  (v2i16 (S_PACK_LL_B32_B16 $src0, $src1))
+>;
+
+// With multiple uses of the shift, this will duplicate the shift and
+// increase register pressure.
+def : Pat <
+  (v2i16 (build_vector i16:$src0, (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
+  (v2i16 (S_PACK_LH_B32_B16 i16:$src0, i32:$src1))
+>;
+
+def : Pat <
+  (v2i16 (build_vector (i16 (trunc (srl_oneuse i32:$src0, (i32 16)))),
+                       (i16 (trunc (srl_oneuse i32:$src1, (i32 16)))))),
+  (v2i16 (S_PACK_HH_B32_B16 $src0, $src1))
+>;
+
+// TODO: Should source modifiers be matched to v_pack_b32_f16?
+def : Pat <
+  (v2f16 (build_vector f16:$src0, f16:$src1)),
+  (v2f16 (S_PACK_LL_B32_B16 $src0, $src1))
+>;
+
+// def : Pat <
+//   (v2f16 (scalar_to_vector f16:$src0)),
+//   (COPY $src0)
+// >;
+
+// def : Pat <
+//   (v2i16 (scalar_to_vector i16:$src0)),
+//   (COPY $src0)
+// >;
+
 //===----------------------------------------------------------------------===//
 // Fract Patterns
 //===----------------------------------------------------------------------===//
@@ -1120,12 +1231,6 @@ def : Pat <
 // Miscellaneous Optimization Patterns
 //============================================================================//
 
-def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
-
-def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
-def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
-
-
 // Undo sub x, c -> add x, -c canonicalization since c is more likely
 // an inline immediate than -c.
 // TODO: Also do for 64-bit.
@@ -1134,6 +1239,31 @@ def : Pat<
   (S_SUB_I32 $src0, NegSubInlineConst32:$src1)
 >;
 
+def : SHA256MaPattern <V_BFI_B32, V_XOR_B32_e64>;
+
+def : IntMed3Pat<V_MED3_I32, smax, smax_oneuse, smin_oneuse>;
+def : IntMed3Pat<V_MED3_U32, umax, umax_oneuse, umin_oneuse>;
+
+// This matches 16 permutations of
+// max(min(x, y), min(max(x, y), z))
+class FPMed3Pat<ValueType vt,
+                Instruction med3Inst> : Pat<
+  (fmaxnum (fminnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+                           (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+           (fminnum_oneuse (fmaxnum_oneuse (VOP3Mods_nnan vt:$src0, i32:$src0_mods),
+                                           (VOP3Mods_nnan vt:$src1, i32:$src1_mods)),
+                           (vt (VOP3Mods_nnan vt:$src2, i32:$src2_mods)))),
+  (med3Inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
+>;
+
+def : FPMed3Pat<f32, V_MED3_F32>;
+
+let Predicates = [isGFX9] in {
+def : FPMed3Pat<f16, V_MED3_F16>;
+def : IntMed3Pat<V_MED3_I16, smax, smax_oneuse, smin_oneuse, i16>;
+def : IntMed3Pat<V_MED3_U16, umax, umax_oneuse, umin_oneuse, i16>;
+} // End Predicates = [isGFX9]
+
 //============================================================================//
 // Assembler aliases
 //============================================================================//
diff --git a/lib/Target/AMDGPU/SIIntrinsics.td b/lib/Target/AMDGPU/SIIntrinsics.td
index 5da375468713c86722698d4894402694b9991a2c..7b7cf1635050bdf18f7129593d8c52b684b93132 100644
--- a/lib/Target/AMDGPU/SIIntrinsics.td
+++ b/lib/Target/AMDGPU/SIIntrinsics.td
@@ -14,23 +14,7 @@
 
 
 let TargetPrefix = "SI", isTarget = 1 in {
-  def int_SI_packf16 : Intrinsic <[llvm_i32_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>;
-
-  def int_SI_export : Intrinsic <[],
-    [llvm_i32_ty,   // en
-    llvm_i32_ty,    // vm   (FIXME: should be i1)
-    llvm_i32_ty,    // done (FIXME: should be i1)
-    llvm_i32_ty,    // tgt
-    llvm_i32_ty,    // compr (FIXME: should be i1)
-    llvm_float_ty,  // src0
-    llvm_float_ty,  // src1
-    llvm_float_ty,  // src2
-    llvm_float_ty], // src3
-    []
-  >;
-
   def int_SI_load_const : Intrinsic <[llvm_float_ty], [llvm_anyint_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_SI_vs_load_input : Intrinsic <[llvm_v4f32_ty], [llvm_anyint_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]> ;
 
   // Fully-flexible TBUFFER_STORE_FORMAT_* except for the ADDR64 bit, which is not exposed
   def int_SI_tbuffer_store : Intrinsic <
@@ -64,146 +48,4 @@ let TargetPrefix = "SI", isTarget = 1 in {
      llvm_i32_ty],    // tfe(imm)
     [IntrReadMem, IntrArgMemOnly]>;
 
-  def int_SI_sendmsg : Intrinsic <[], [llvm_i32_ty, llvm_i32_ty], []>;
-
-  // Fully-flexible SAMPLE instruction.
-  class SampleRaw : Intrinsic <
-    [llvm_v4f32_ty],    // vdata(VGPR)
-    [llvm_anyint_ty,    // vaddr(VGPR)
-     llvm_v8i32_ty,     // rsrc(SGPR)
-     llvm_v4i32_ty,     // sampler(SGPR)
-     llvm_i32_ty,       // dmask(imm)
-     llvm_i32_ty,       // unorm(imm)
-     llvm_i32_ty,       // r128(imm)
-     llvm_i32_ty,       // da(imm)
-     llvm_i32_ty,       // glc(imm)
-     llvm_i32_ty,       // slc(imm)
-     llvm_i32_ty,       // tfe(imm)
-     llvm_i32_ty],      // lwe(imm)
-    [IntrNoMem]>;
-
-  // Image instruction without a sampler.
-  class Image : Intrinsic <
-    [llvm_v4f32_ty],    // vdata(VGPR)
-    [llvm_anyint_ty,    // vaddr(VGPR)
-     llvm_v8i32_ty,     // rsrc(SGPR)
-     llvm_i32_ty,       // dmask(imm)
-     llvm_i32_ty,       // unorm(imm)
-     llvm_i32_ty,       // r128(imm)
-     llvm_i32_ty,       // da(imm)
-     llvm_i32_ty,       // glc(imm)
-     llvm_i32_ty,       // slc(imm)
-     llvm_i32_ty,       // tfe(imm)
-     llvm_i32_ty],      // lwe(imm)
-    [IntrNoMem]>;
-
-  // Basic sample
-  def int_SI_image_sample : SampleRaw;
-  def int_SI_image_sample_cl : SampleRaw;
-  def int_SI_image_sample_d : SampleRaw;
-  def int_SI_image_sample_d_cl : SampleRaw;
-  def int_SI_image_sample_l : SampleRaw;
-  def int_SI_image_sample_b : SampleRaw;
-  def int_SI_image_sample_b_cl : SampleRaw;
-  def int_SI_image_sample_lz : SampleRaw;
-  def int_SI_image_sample_cd : SampleRaw;
-  def int_SI_image_sample_cd_cl : SampleRaw;
-
-  // Sample with comparison
-  def int_SI_image_sample_c : SampleRaw;
-  def int_SI_image_sample_c_cl : SampleRaw;
-  def int_SI_image_sample_c_d : SampleRaw;
-  def int_SI_image_sample_c_d_cl : SampleRaw;
-  def int_SI_image_sample_c_l : SampleRaw;
-  def int_SI_image_sample_c_b : SampleRaw;
-  def int_SI_image_sample_c_b_cl : SampleRaw;
-  def int_SI_image_sample_c_lz : SampleRaw;
-  def int_SI_image_sample_c_cd : SampleRaw;
-  def int_SI_image_sample_c_cd_cl : SampleRaw;
-
-  // Sample with offsets
-  def int_SI_image_sample_o : SampleRaw;
-  def int_SI_image_sample_cl_o : SampleRaw;
-  def int_SI_image_sample_d_o : SampleRaw;
-  def int_SI_image_sample_d_cl_o : SampleRaw;
-  def int_SI_image_sample_l_o : SampleRaw;
-  def int_SI_image_sample_b_o : SampleRaw;
-  def int_SI_image_sample_b_cl_o : SampleRaw;
-  def int_SI_image_sample_lz_o : SampleRaw;
-  def int_SI_image_sample_cd_o : SampleRaw;
-  def int_SI_image_sample_cd_cl_o : SampleRaw;
-
-  // Sample with comparison and offsets
-  def int_SI_image_sample_c_o : SampleRaw;
-  def int_SI_image_sample_c_cl_o : SampleRaw;
-  def int_SI_image_sample_c_d_o : SampleRaw;
-  def int_SI_image_sample_c_d_cl_o : SampleRaw;
-  def int_SI_image_sample_c_l_o : SampleRaw;
-  def int_SI_image_sample_c_b_o : SampleRaw;
-  def int_SI_image_sample_c_b_cl_o : SampleRaw;
-  def int_SI_image_sample_c_lz_o : SampleRaw;
-  def int_SI_image_sample_c_cd_o : SampleRaw;
-  def int_SI_image_sample_c_cd_cl_o : SampleRaw;
-
-  // Basic gather4
-  def int_SI_gather4 : SampleRaw;
-  def int_SI_gather4_cl : SampleRaw;
-  def int_SI_gather4_l : SampleRaw;
-  def int_SI_gather4_b : SampleRaw;
-  def int_SI_gather4_b_cl : SampleRaw;
-  def int_SI_gather4_lz : SampleRaw;
-
-  // Gather4 with comparison
-  def int_SI_gather4_c : SampleRaw;
-  def int_SI_gather4_c_cl : SampleRaw;
-  def int_SI_gather4_c_l : SampleRaw;
-  def int_SI_gather4_c_b : SampleRaw;
-  def int_SI_gather4_c_b_cl : SampleRaw;
-  def int_SI_gather4_c_lz : SampleRaw;
-
-  // Gather4 with offsets
-  def int_SI_gather4_o : SampleRaw;
-  def int_SI_gather4_cl_o : SampleRaw;
-  def int_SI_gather4_l_o : SampleRaw;
-  def int_SI_gather4_b_o : SampleRaw;
-  def int_SI_gather4_b_cl_o : SampleRaw;
-  def int_SI_gather4_lz_o : SampleRaw;
-
-  // Gather4 with comparison and offsets
-  def int_SI_gather4_c_o : SampleRaw;
-  def int_SI_gather4_c_cl_o : SampleRaw;
-  def int_SI_gather4_c_l_o : SampleRaw;
-  def int_SI_gather4_c_b_o : SampleRaw;
-  def int_SI_gather4_c_b_cl_o : SampleRaw;
-  def int_SI_gather4_c_lz_o : SampleRaw;
-
-  def int_SI_getlod : SampleRaw;
-
-  // Image instrinsics.
-  def int_SI_image_load : Image;
-  def int_SI_image_load_mip : Image;
-  def int_SI_getresinfo : Image;
-
-  /* Interpolation Intrinsics */
-
-  def int_SI_fs_constant : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_SI_fs_interp : Intrinsic <[llvm_float_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_v2i32_ty], [IntrNoMem]>;
 } // End TargetPrefix = "SI", isTarget = 1
-
-let TargetPrefix = "amdgcn", isTarget = 1 in {
-  // Emit 2.5 ulp, no denormal division. Should only be inserted by
-  // pass based on !fpmath metadata.
-  def int_amdgcn_fdiv_fast : Intrinsic<
-    [llvm_float_ty], [llvm_float_ty], [IntrNoMem]
-  >;
-
-  /* Control flow Intrinsics */
-
-  def int_amdgcn_if : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_empty_ty], [IntrConvergent]>;
-  def int_amdgcn_else : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>;
-  def int_amdgcn_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
-  def int_amdgcn_if_break : Intrinsic<[llvm_i64_ty], [llvm_i1_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
-  def int_amdgcn_else_break : Intrinsic<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty], [IntrNoMem, IntrConvergent]>;
-  def int_amdgcn_loop : Intrinsic<[], [llvm_i64_ty, llvm_empty_ty], [IntrConvergent]>;
-  def int_amdgcn_end_cf : Intrinsic<[], [llvm_i64_ty], [IntrConvergent]>;
-}
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index ecd46b95ca6fe85d102246266b945f96b6e36613..8e612d2ddfdae85f40b3d73712b79465609a38b7 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -20,12 +20,6 @@
 
 using namespace llvm;
 
-static cl::opt<bool> EnableSpillSGPRToVGPR(
-  "amdgpu-spill-sgpr-to-vgpr",
-  cl::desc("Enable spilling VGPRs to SGPRs"),
-  cl::ReallyHidden,
-  cl::init(true));
-
 SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   : AMDGPUMachineFunction(MF),
     TIDReg(AMDGPU::NoRegister),
@@ -47,13 +41,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     WorkGroupInfoSystemSGPR(AMDGPU::NoRegister),
     PrivateSegmentWaveByteOffsetSystemSGPR(AMDGPU::NoRegister),
     PSInputAddr(0),
+    PSInputEnable(0),
     ReturnsVoid(true),
     FlatWorkGroupSizes(0, 0),
     WavesPerEU(0, 0),
     DebuggerWorkGroupIDStackObjectIndices({{0, 0, 0}}),
     DebuggerWorkItemIDStackObjectIndices({{0, 0, 0}}),
     LDSWaveSpillSize(0),
-    PSInputEna(0),
     NumUserSGPRs(0),
     NumSystemSGPRs(0),
     HasSpilledSGPRs(false),
@@ -81,34 +75,48 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
     PrivateMemoryInputPtr(false) {
   const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
   const Function *F = MF.getFunction();
+  FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
+  WavesPerEU = ST.getWavesPerEU(*F);
 
-  PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
+  // Non-entry functions have no special inputs for now.
+  // TODO: Return early for non-entry CCs.
 
-  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  CallingConv::ID CC = F->getCallingConv();
+  if (CC == CallingConv::AMDGPU_PS)
+    PSInputAddr = AMDGPU::getInitialPSInputAddr(*F);
 
-  if (!AMDGPU::isShader(F->getCallingConv())) {
+  if (AMDGPU::isKernel(CC)) {
     KernargSegmentPtr = true;
     WorkGroupIDX = true;
     WorkItemIDX = true;
   }
 
-  if (F->hasFnAttribute("amdgpu-work-group-id-y") || ST.debuggerEmitPrologue())
+  if (ST.debuggerEmitPrologue()) {
+    // Enable everything.
     WorkGroupIDY = true;
-
-  if (F->hasFnAttribute("amdgpu-work-group-id-z") || ST.debuggerEmitPrologue())
     WorkGroupIDZ = true;
-
-  if (F->hasFnAttribute("amdgpu-work-item-id-y") || ST.debuggerEmitPrologue())
     WorkItemIDY = true;
-
-  if (F->hasFnAttribute("amdgpu-work-item-id-z") || ST.debuggerEmitPrologue())
     WorkItemIDZ = true;
+  } else {
+    if (F->hasFnAttribute("amdgpu-work-group-id-y"))
+      WorkGroupIDY = true;
+
+    if (F->hasFnAttribute("amdgpu-work-group-id-z"))
+      WorkGroupIDZ = true;
+
+    if (F->hasFnAttribute("amdgpu-work-item-id-y"))
+      WorkItemIDY = true;
+
+    if (F->hasFnAttribute("amdgpu-work-item-id-z"))
+      WorkItemIDZ = true;
+  }
 
   // X, XY, and XYZ are the only supported combinations, so make sure Y is
   // enabled if Z is.
   if (WorkItemIDZ)
     WorkItemIDY = true;
 
+  const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
   bool MaySpill = ST.isVGPRSpillingEnabled(*F);
   bool HasStackObjects = FrameInfo.hasStackObjects();
 
@@ -135,12 +143,8 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
   // We don't need to worry about accessing spills with flat instructions.
   // TODO: On VI where we must use flat for global, we should be able to omit
   // this if it is never used for generic access.
-  if (HasStackObjects && ST.getGeneration() >= SISubtarget::SEA_ISLANDS &&
-      ST.isAmdHsaOS())
+  if (HasStackObjects && ST.hasFlatAddressSpace() && ST.isAmdHsaOS())
     FlatScratchInit = true;
-
-  FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(*F);
-  WavesPerEU = ST.getWavesPerEU(*F);
 }
 
 unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
@@ -193,45 +197,60 @@ unsigned SIMachineFunctionInfo::addPrivateMemoryPtr(const SIRegisterInfo &TRI) {
   return PrivateMemoryPtrUserSGPR;
 }
 
-SIMachineFunctionInfo::SpilledReg SIMachineFunctionInfo::getSpilledReg (
-                                                       MachineFunction *MF,
-                                                       unsigned FrameIndex,
-                                                       unsigned SubIdx) {
-  if (!EnableSpillSGPRToVGPR)
-    return SpilledReg();
-
-  const SISubtarget &ST = MF->getSubtarget<SISubtarget>();
-  const SIRegisterInfo *TRI = ST.getRegisterInfo();
-
-  MachineFrameInfo &FrameInfo = MF->getFrameInfo();
-  MachineRegisterInfo &MRI = MF->getRegInfo();
-  int64_t Offset = FrameInfo.getObjectOffset(FrameIndex);
-  Offset += SubIdx * 4;
-
-  unsigned LaneVGPRIdx = Offset / (64 * 4);
-  unsigned Lane = (Offset / 4) % 64;
-
-  struct SpilledReg Spill;
-  Spill.Lane = Lane;
-
-  if (!LaneVGPRs.count(LaneVGPRIdx)) {
-    unsigned LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass,
-                                                *MF);
+/// Reserve a slice of a VGPR to support spilling for FrameIndex \p FI.
+bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
+                                                    int FI) {
+  std::vector<SpilledReg> &SpillLanes = SGPRToVGPRSpills[FI];
 
-    if (LaneVGPR == AMDGPU::NoRegister)
-      // We have no VGPRs left for spilling SGPRs.
-      return Spill;
+  // This has already been allocated.
+  if (!SpillLanes.empty())
+    return true;
 
-    LaneVGPRs[LaneVGPRIdx] = LaneVGPR;
-
-    // Add this register as live-in to all blocks to avoid machine verifer
-    // complaining about use of an undefined physical register.
-    for (MachineFunction::iterator BI = MF->begin(), BE = MF->end();
-         BI != BE; ++BI) {
-      BI->addLiveIn(LaneVGPR);
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  unsigned WaveSize = ST.getWavefrontSize();
+
+  unsigned Size = FrameInfo.getObjectSize(FI);
+  assert(Size >= 4 && Size <= 64 && "invalid sgpr spill size");
+  assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");
+
+  int NumLanes = Size / 4;
+
+  // Make sure to handle the case where a wide SGPR spill may span between two
+  // VGPRs.
+  for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
+    unsigned LaneVGPR;
+    unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);
+
+    if (VGPRIndex == 0) {
+      LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
+      if (LaneVGPR == AMDGPU::NoRegister) {
+        // We have no VGPRs left for spilling SGPRs. Reset because we won't
+        // partially spill the SGPR to VGPRs.
+        SGPRToVGPRSpills.erase(FI);
+        NumVGPRSpillLanes -= I;
+        return false;
+      }
+
+      SpillVGPRs.push_back(LaneVGPR);
+
+      // Add this register as live-in to all blocks to avoid machine verifier
+      // complaining about use of an undefined physical register.
+      for (MachineBasicBlock &BB : MF)
+        BB.addLiveIn(LaneVGPR);
+    } else {
+      LaneVGPR = SpillVGPRs.back();
     }
+
+    SpillLanes.push_back(SpilledReg(LaneVGPR, VGPRIndex));
   }
 
-  Spill.VGPR = LaneVGPRs[LaneVGPRIdx];
-  return Spill;
+  return true;
+}
+
+void SIMachineFunctionInfo::removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI) {
+  for (auto &R : SGPRToVGPRSpills)
+    MFI.RemoveStackObject(R.first);
 }
diff --git a/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index dc1f22ae60d77527856e407c51adc683d2fa845f..a84f3e274f82aa22d5350860a02df2fae9d63ef1 100644
--- a/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -16,6 +16,7 @@
 
 #include "AMDGPUMachineFunction.h"
 #include "SIRegisterInfo.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -112,6 +113,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
 
   // Graphics info.
   unsigned PSInputAddr;
+  unsigned PSInputEnable;
+
   bool ReturnsVoid;
 
   // A pair of default/requested minimum/maximum flat work group sizes.
@@ -133,8 +136,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
 public:
   // FIXME: Make private
   unsigned LDSWaveSpillSize;
-  unsigned PSInputEna;
-  std::map<unsigned, unsigned> LaneVGPRs;
   unsigned ScratchOffsetReg;
   unsigned NumUserSGPRs;
   unsigned NumSystemSGPRs;
@@ -195,12 +196,29 @@ public:
     bool hasReg() { return VGPR != AMDGPU::NoRegister;}
   };
 
-  // SIMachineFunctionInfo definition
+private:
+  // SGPR->VGPR spilling support.
+  typedef std::pair<unsigned, unsigned> SpillRegMask;
+
+  // Track VGPR + wave index for each subregister of the SGPR spilled to
+  // frameindex key.
+  DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills;
+  unsigned NumVGPRSpillLanes = 0;
+  SmallVector<unsigned, 2> SpillVGPRs;
+
+public:
 
   SIMachineFunctionInfo(const MachineFunction &MF);
 
-  SpilledReg getSpilledReg(MachineFunction *MF, unsigned FrameIndex,
-                           unsigned SubIdx);
+  ArrayRef<SpilledReg> getSGPRToVGPRSpills(int FrameIndex) const {
+    auto I = SGPRToVGPRSpills.find(FrameIndex);
+    return (I == SGPRToVGPRSpills.end()) ?
+      ArrayRef<SpilledReg>() : makeArrayRef(I->second);
+  }
+
+  bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
+  void removeSGPRToVGPRFrameIndices(MachineFrameInfo &MFI);
+
   bool hasCalculatedTID() const { return TIDReg != AMDGPU::NoRegister; };
   unsigned getTIDReg() const { return TIDReg; };
   void setTIDReg(unsigned Reg) { TIDReg = Reg; }
@@ -405,6 +423,10 @@ public:
     return PSInputAddr;
   }
 
+  unsigned getPSInputEnable() const {
+    return PSInputEnable;
+  }
+
   bool isPSInputAllocated(unsigned Index) const {
     return PSInputAddr & (1 << Index);
   }
@@ -413,6 +435,10 @@ public:
     PSInputAddr |= 1 << Index;
   }
 
+  void markPSInputEnabled(unsigned Index) {
+    PSInputEnable |= 1 << Index;
+  }
+
   bool returnsVoid() const {
     return ReturnsVoid;
   }
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.cpp b/lib/Target/AMDGPU/SIMachineScheduler.cpp
index da86bbf9dd2a2c90f670933e0c5e0d25a28454a1..9d4e677400e69580e431303c84a03fe6f82ea940 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -539,21 +539,30 @@ void SIScheduleBlock::addPred(SIScheduleBlock *Pred) {
   Preds.push_back(Pred);
 
   assert(none_of(Succs,
-                 [=](SIScheduleBlock *S) { return PredID == S->getID(); }) &&
+                 [=](std::pair<SIScheduleBlock*,
+                     SIScheduleBlockLinkKind> S) {
+                   return PredID == S.first->getID();
+                    }) &&
          "Loop in the Block Graph!");
 }
 
-void SIScheduleBlock::addSucc(SIScheduleBlock *Succ) {
+void SIScheduleBlock::addSucc(SIScheduleBlock *Succ,
+                              SIScheduleBlockLinkKind Kind) {
   unsigned SuccID = Succ->getID();
 
   // Check if not already predecessor.
-  for (SIScheduleBlock* S : Succs) {
-    if (SuccID == S->getID())
+  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> &S : Succs) {
+    if (SuccID == S.first->getID()) {
+      if (S.second == SIScheduleBlockLinkKind::NoData &&
+          Kind == SIScheduleBlockLinkKind::Data)
+        S.second = Kind;
       return;
+    }
   }
   if (Succ->isHighLatencyBlock())
     ++NumHighLatencySuccessors;
-  Succs.push_back(Succ);
+  Succs.push_back(std::make_pair(Succ, Kind));
+
   assert(none_of(Preds,
                  [=](SIScheduleBlock *P) { return SuccID == P->getID(); }) &&
          "Loop in the Block Graph!");
@@ -573,8 +582,10 @@ void SIScheduleBlock::printDebug(bool full) {
   }
 
   dbgs() << "\nSuccessors:\n";
-  for (SIScheduleBlock* S : Succs) {
-    S->printDebug(false);
+  for (std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind> S : Succs) {
+    if (S.second == SIScheduleBlockLinkKind::Data)
+      dbgs() << "(Data Dep) ";
+    S.first->printDebug(false);
   }
 
   if (Scheduled) {
@@ -651,11 +662,21 @@ void SIScheduleBlockCreator::colorHighLatenciesAlone() {
   }
 }
 
+static bool
+hasDataDependencyPred(const SUnit &SU, const SUnit &FromSU) {
+  for (const auto &PredDep : SU.Preds) {
+    if (PredDep.getSUnit() == &FromSU &&
+        PredDep.getKind() == llvm::SDep::Data)
+      return true;
+  }
+  return false;
+}
+
 void SIScheduleBlockCreator::colorHighLatenciesGroups() {
   unsigned DAGSize = DAG->SUnits.size();
   unsigned NumHighLatencies = 0;
   unsigned GroupSize;
-  unsigned Color = NextReservedID;
+  int Color = NextReservedID;
   unsigned Count = 0;
   std::set<unsigned> FormingGroup;
 
@@ -675,35 +696,102 @@ void SIScheduleBlockCreator::colorHighLatenciesGroups() {
   else
     GroupSize = 4;
 
-  for (unsigned i = 0, e = DAGSize; i != e; ++i) {
-    SUnit *SU = &DAG->SUnits[i];
-    if (DAG->IsHighLatencySU[SU->NodeNum]) {
+  for (unsigned SUNum : DAG->TopDownIndex2SU) {
+    const SUnit &SU = DAG->SUnits[SUNum];
+    if (DAG->IsHighLatencySU[SU.NodeNum]) {
       unsigned CompatibleGroup = true;
-      unsigned ProposedColor = Color;
+      int ProposedColor = Color;
+      std::vector<int> AdditionalElements;
+
+      // We don't want to put in the same block
+      // two high latency instructions that depend
+      // on each other.
+      // One way would be to check canAddEdge
+      // in both directions, but that currently is not
+      // enough because there the high latency order is
+      // enforced (via links).
+      // Instead, look at the dependencies between the
+      // high latency instructions and deduce if it is
+      // a data dependency or not.
       for (unsigned j : FormingGroup) {
-        // TODO: Currently CompatibleGroup will always be false,
-        // because the graph enforces the load order. This
-        // can be fixed, but as keeping the load order is often
-        // good for performance that causes a performance hit (both
-        // the default scheduler and this scheduler).
-        // When this scheduler determines a good load order,
-        // this can be fixed.
-        if (!DAG->canAddEdge(SU, &DAG->SUnits[j]) ||
-            !DAG->canAddEdge(&DAG->SUnits[j], SU))
+        bool HasSubGraph;
+        std::vector<int> SubGraph;
+        // By construction (topological order), if SU and
+        // DAG->SUnits[j] are linked, DAG->SUnits[j] is necessarily
+        // in the parent graph of SU.
+#ifndef NDEBUG
+        SubGraph = DAG->GetTopo()->GetSubGraph(SU, DAG->SUnits[j],
+                                               HasSubGraph);
+        assert(!HasSubGraph);
+#endif
+        SubGraph = DAG->GetTopo()->GetSubGraph(DAG->SUnits[j], SU,
+                                               HasSubGraph);
+        if (!HasSubGraph)
+          continue; // No dependencies between each other
+        else if (SubGraph.size() > 5) {
+          // Too many elements would be required to be added to the block.
           CompatibleGroup = false;
+          break;
+        }
+        else {
+          // Check the type of dependency
+          for (unsigned k : SubGraph) {
+            // If in the path to join the two instructions,
+            // there is another high latency instruction,
+            // or instructions colored for another block
+            // abort the merge.
+            if (DAG->IsHighLatencySU[k] ||
+                (CurrentColoring[k] != ProposedColor &&
+                 CurrentColoring[k] != 0)) {
+              CompatibleGroup = false;
+              break;
+            }
+            // If one of the SU in the subgraph depends on the result of SU j,
+            // there'll be a data dependency.
+            if (hasDataDependencyPred(DAG->SUnits[k], DAG->SUnits[j])) {
+              CompatibleGroup = false;
+              break;
+            }
+          }
+          if (!CompatibleGroup)
+            break;
+          // Same check for the SU
+          if (hasDataDependencyPred(SU, DAG->SUnits[j])) {
+            CompatibleGroup = false;
+            break;
+          }
+          // Add all the required instructions to the block
+          // These cannot live in another block (because they
+          // depend (order dependency) on one of the
+          // instruction in the block, and are required for the
+          // high latency instruction we add.
+          AdditionalElements.insert(AdditionalElements.end(),
+                                    SubGraph.begin(), SubGraph.end());
+        }
       }
-      if (!CompatibleGroup || ++Count == GroupSize) {
+      if (CompatibleGroup) {
+        FormingGroup.insert(SU.NodeNum);
+        for (unsigned j : AdditionalElements)
+          CurrentColoring[j] = ProposedColor;
+        CurrentColoring[SU.NodeNum] = ProposedColor;
+        ++Count;
+      }
+      // Found one incompatible instruction,
+      // or has filled a big enough group.
+      // -> start a new one.
+      if (!CompatibleGroup) {
         FormingGroup.clear();
         Color = ++NextReservedID;
-        if (!CompatibleGroup) {
-          ProposedColor = Color;
-          FormingGroup.insert(SU->NodeNum);
-        }
+        ProposedColor = Color;
+        FormingGroup.insert(SU.NodeNum);
+        CurrentColoring[SU.NodeNum] = ProposedColor;
+        Count = 0;
+      } else if (Count == GroupSize) {
+        FormingGroup.clear();
+        Color = ++NextReservedID;
+        ProposedColor = Color;
         Count = 0;
-      } else {
-        FormingGroup.insert(SU->NodeNum);
       }
-      CurrentColoring[SU->NodeNum] = ProposedColor;
     }
   }
 }
@@ -835,6 +923,17 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
   unsigned DAGSize = DAG->SUnits.size();
   std::vector<int> PendingColoring = CurrentColoring;
 
+  assert(DAGSize >= 1 &&
+         CurrentBottomUpReservedDependencyColoring.size() == DAGSize &&
+         CurrentTopDownReservedDependencyColoring.size() == DAGSize);
+  // If there is no reserved block at all, do nothing. We don't want
+  // everything in one block.
+  if (*std::max_element(CurrentBottomUpReservedDependencyColoring.begin(),
+                        CurrentBottomUpReservedDependencyColoring.end()) == 0 &&
+      *std::max_element(CurrentTopDownReservedDependencyColoring.begin(),
+                        CurrentTopDownReservedDependencyColoring.end()) == 0)
+    return;
+
   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
     SUnit *SU = &DAG->SUnits[SUNum];
     std::set<unsigned> SUColors;
@@ -856,6 +955,9 @@ void SIScheduleBlockCreator::colorEndsAccordingToDependencies() {
         SUColors.insert(CurrentColoring[Succ->NodeNum]);
       SUColorsPending.insert(PendingColoring[Succ->NodeNum]);
     }
+    // If there is only one child/parent block, and that block
+    // is not among the ones we are removing in this path, then
+    // merge the instruction to that block
     if (SUColors.size() == 1 && SUColorsPending.size() == 1)
       PendingColoring[SU->NodeNum] = *SUColors.begin();
     else // TODO: Attribute new colors depending on color
@@ -974,12 +1076,7 @@ void SIScheduleBlockCreator::colorMergeIfPossibleSmallGroupsToNextGroup() {
   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
     SUnit *SU = &DAG->SUnits[SUNum];
     unsigned color = CurrentColoring[SU->NodeNum];
-    std::map<unsigned, unsigned>::iterator Pos = ColorCount.find(color);
-      if (Pos != ColorCount.end()) {
-        ++ColorCount[color];
-      } else {
-        ColorCount[color] = 1;
-      }
+     ++ColorCount[color];
   }
 
   for (unsigned SUNum : DAG->BottomUpIndex2SU) {
@@ -1087,7 +1184,8 @@ void SIScheduleBlockCreator::createBlocksForVariant(SISchedulerBlockCreatorVaria
       if (SuccDep.isWeak() || Succ->NodeNum >= DAGSize)
         continue;
       if (Node2CurrentBlock[Succ->NodeNum] != SUID)
-        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]]);
+        CurrentBlocks[SUID]->addSucc(CurrentBlocks[Node2CurrentBlock[Succ->NodeNum]],
+                                     SuccDep.isCtrl() ? NoData : Data);
     }
     for (SDep& PredDep : SU->Preds) {
       SUnit *Pred = PredDep.getSUnit();
@@ -1281,10 +1379,8 @@ void SIScheduleBlockCreator::fillStats() {
       Block->Height = 0;
     else {
       unsigned Height = 0;
-      for (SIScheduleBlock *Succ : Block->getSuccs()) {
-        if (Height < Succ->Height + 1)
-          Height = Succ->Height + 1;
-      }
+      for (const auto &Succ : Block->getSuccs())
+        Height = std::max(Height, Succ.first->Height + 1); // longest path
       Block->Height = Height;
     }
   }
@@ -1331,13 +1427,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
         continue;
 
       int PredID = BlocksStruct.TopDownIndex2Block[topoInd];
-      std::map<unsigned, unsigned>::iterator RegPos =
-        LiveOutRegsNumUsages[PredID].find(Reg);
-      if (RegPos != LiveOutRegsNumUsages[PredID].end()) {
-        ++LiveOutRegsNumUsages[PredID][Reg];
-      } else {
-        LiveOutRegsNumUsages[PredID][Reg] = 1;
-      }
+      ++LiveOutRegsNumUsages[PredID][Reg];
     }
   }
 
@@ -1361,6 +1451,24 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
   std::set<unsigned> InRegs = DAG->getInRegs();
   addLiveRegs(InRegs);
 
+  // Increase LiveOutRegsNumUsages for blocks
+  // producing registers consumed in another
+  // scheduling region.
+  for (unsigned Reg : DAG->getOutRegs()) {
+    for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
+      // Do reverse traversal
+      int ID = BlocksStruct.TopDownIndex2Block[Blocks.size()-1-i];
+      SIScheduleBlock *Block = Blocks[ID];
+      const std::set<unsigned> &OutRegs = Block->getOutRegs();
+
+      if (OutRegs.find(Reg) == OutRegs.end())
+        continue;
+
+      ++LiveOutRegsNumUsages[ID][Reg];
+      break;
+    }
+  }
+
   // Fill LiveRegsConsumers for regs that were already
   // defined before scheduling.
   for (unsigned i = 0, e = Blocks.size(); i != e; ++i) {
@@ -1377,12 +1485,8 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
         }
       }
 
-      if (!Found) {
-        if (LiveRegsConsumers.find(Reg) == LiveRegsConsumers.end())
-          LiveRegsConsumers[Reg] = 1;
-        else
-          ++LiveRegsConsumers[Reg];
-      }
+      if (!Found)
+        ++LiveRegsConsumers[Reg];
     }
   }
 
@@ -1403,6 +1507,7 @@ SIScheduleBlockScheduler::SIScheduleBlockScheduler(SIScheduleDAGMI *DAG,
     for (SIScheduleBlock* Block : BlocksScheduled) {
       dbgs() << ' ' << Block->getID();
     }
+    dbgs() << '\n';
   );
 }
 
@@ -1464,8 +1569,8 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
                         VregCurrentUsage, SregCurrentUsage);
   if (VregCurrentUsage > maxVregUsage)
     maxVregUsage = VregCurrentUsage;
-  if (VregCurrentUsage > maxSregUsage)
-    maxSregUsage = VregCurrentUsage;
+  if (SregCurrentUsage > maxSregUsage)
+    maxSregUsage = SregCurrentUsage;
   DEBUG(
     dbgs() << "Picking New Blocks\n";
     dbgs() << "Available: ";
@@ -1556,17 +1661,13 @@ void SIScheduleBlockScheduler::decreaseLiveRegs(SIScheduleBlock *Block,
 }
 
 void SIScheduleBlockScheduler::releaseBlockSuccs(SIScheduleBlock *Parent) {
-  for (SIScheduleBlock* Block : Parent->getSuccs()) {
-    --BlockNumPredsLeft[Block->getID()];
-    if (BlockNumPredsLeft[Block->getID()] == 0) {
-      ReadyBlocks.push_back(Block);
-    }
-    // TODO: Improve check. When the dependency between the high latency
-    // instructions and the instructions of the other blocks are WAR or WAW
-    // there will be no wait triggered. We would like these cases to not
-    // update LastPosHighLatencyParentScheduled.
-    if (Parent->isHighLatencyBlock())
-      LastPosHighLatencyParentScheduled[Block->getID()] = NumBlockScheduled;
+  for (const auto &Block : Parent->getSuccs()) {
+    if (--BlockNumPredsLeft[Block.first->getID()] == 0)
+      ReadyBlocks.push_back(Block.first);
+
+    if (Parent->isHighLatencyBlock() &&
+        Block.second == SIScheduleBlockLinkKind::Data)
+      LastPosHighLatencyParentScheduled[Block.first->getID()] = NumBlockScheduled;
   }
 }
 
@@ -1578,12 +1679,10 @@ void SIScheduleBlockScheduler::blockScheduled(SIScheduleBlock *Block) {
        LiveOutRegsNumUsages[Block->getID()].begin(),
        E = LiveOutRegsNumUsages[Block->getID()].end(); RegI != E; ++RegI) {
     std::pair<unsigned, unsigned> RegP = *RegI;
-    if (LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end())
-      LiveRegsConsumers[RegP.first] = RegP.second;
-    else {
-      assert(LiveRegsConsumers[RegP.first] == 0);
-      LiveRegsConsumers[RegP.first] += RegP.second;
-    }
+    // We produce this register, thus it must not be previously alive.
+    assert(LiveRegsConsumers.find(RegP.first) == LiveRegsConsumers.end() ||
+           LiveRegsConsumers[RegP.first] == 0);
+    LiveRegsConsumers[RegP.first] += RegP.second;
   }
   if (LastPosHighLatencyParentScheduled[Block->getID()] >
         (unsigned)LastPosWaitedHighLatency)
@@ -1825,7 +1924,9 @@ void SIScheduleDAGMI::schedule()
   // if VGPR usage is extremely high, try other good performing variants
   // which could lead to lower VGPR usage
   if (Best.MaxVGPRUsage > 180) {
-    std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
+    static const std::pair<SISchedulerBlockCreatorVariant,
+                           SISchedulerBlockSchedulerVariant>
+        Variants[] = {
       { LatenciesAlone, BlockRegUsageLatency },
 //      { LatenciesAlone, BlockRegUsage },
       { LatenciesGrouped, BlockLatencyRegUsage },
@@ -1844,7 +1945,9 @@ void SIScheduleDAGMI::schedule()
   // if VGPR usage is still extremely high, we may spill. Try other variants
   // which are less performing, but that could lead to lower VGPR usage.
   if (Best.MaxVGPRUsage > 200) {
-    std::vector<std::pair<SISchedulerBlockCreatorVariant, SISchedulerBlockSchedulerVariant>> Variants = {
+    static const std::pair<SISchedulerBlockCreatorVariant,
+                           SISchedulerBlockSchedulerVariant>
+        Variants[] = {
 //      { LatenciesAlone, BlockRegUsageLatency },
       { LatenciesAlone, BlockRegUsage },
 //      { LatenciesGrouped, BlockLatencyRegUsage },
diff --git a/lib/Target/AMDGPU/SIMachineScheduler.h b/lib/Target/AMDGPU/SIMachineScheduler.h
index 2dc4b346de7921ab8760f1c8cfa8fd67d8774842..122d0f67ca8c7bc15e4f37e969a0e2f71b119aa7 100644
--- a/lib/Target/AMDGPU/SIMachineScheduler.h
+++ b/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -54,6 +54,11 @@ struct SISchedulerCandidate {
 class SIScheduleDAGMI;
 class SIScheduleBlockCreator;
 
+enum SIScheduleBlockLinkKind {
+  NoData,
+  Data
+};
+
 class SIScheduleBlock {
   SIScheduleDAGMI *DAG;
   SIScheduleBlockCreator *BC;
@@ -92,7 +97,8 @@ class SIScheduleBlock {
   unsigned ID;
 
   std::vector<SIScheduleBlock*> Preds;  // All blocks predecessors.
-  std::vector<SIScheduleBlock*> Succs;  // All blocks successors.
+  // All blocks successors, and the kind of link
+  std::vector<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>> Succs;
   unsigned NumHighLatencySuccessors = 0;
 
 public:
@@ -112,10 +118,11 @@ public:
 
   // Add block pred, which has instruction predecessor of SU.
   void addPred(SIScheduleBlock *Pred);
-  void addSucc(SIScheduleBlock *Succ);
+  void addSucc(SIScheduleBlock *Succ, SIScheduleBlockLinkKind Kind);
 
   const std::vector<SIScheduleBlock*>& getPreds() const { return Preds; }
-  const std::vector<SIScheduleBlock*>& getSuccs() const { return Succs; }
+  ArrayRef<std::pair<SIScheduleBlock*, SIScheduleBlockLinkKind>>
+    getSuccs() const { return Succs; }
 
   unsigned Height;  // Maximum topdown path length to block without outputs
   unsigned Depth;   // Maximum bottomup path length to block without inputs
@@ -449,6 +456,7 @@ public:
   LiveIntervals *getLIS() { return LIS; }
   MachineRegisterInfo *getMRI() { return &MRI; }
   const TargetRegisterInfo *getTRI() { return TRI; }
+  ScheduleDAGTopologicalSort *GetTopo() { return &Topo; }
   SUnit& getEntrySU() { return EntrySU; }
   SUnit& getExitSU() { return ExitSU; }
 
@@ -467,6 +475,14 @@ public:
     return InRegs;
   }
 
+  std::set<unsigned> getOutRegs() {
+    std::set<unsigned> OutRegs;
+    for (const auto &RegMaskPair : RPTracker.getPressure().LiveOutRegs) {
+      OutRegs.insert(RegMaskPair.RegUnit);
+    }
+    return OutRegs;
+  };
+
   unsigned getVGPRSetID() const { return VGPRSetID; }
   unsigned getSGPRSetID() const { return SGPRSetID; }
 
diff --git a/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e02c2e3240e849409404ca4f0999405b90781776
--- /dev/null
+++ b/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -0,0 +1,713 @@
+//===-- SIPeepholeSDWA.cpp - Peephole optimization for SDWA instructions --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass tries to apply several peephole SDWA patterns.
+///
+/// E.g. original:
+///   V_LSHRREV_B32_e32 %vreg0, 16, %vreg1
+///   V_ADD_I32_e32 %vreg2, %vreg0, %vreg3
+///   V_LSHLREV_B32_e32 %vreg4, 16, %vreg2
+///
+/// Replace:
+///   V_ADD_I32_sdwa %vreg4, %vreg1, %vreg3
+///       dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+///
+//===----------------------------------------------------------------------===//
+
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include <unordered_map>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-peephole-sdwa"
+
+STATISTIC(NumSDWAPatternsFound, "Number of SDWA patterns found.");
+STATISTIC(NumSDWAInstructionsPeepholed,
+          "Number of instruction converted to SDWA.");
+
+namespace {
+
+class SDWAOperand;
+
+class SIPeepholeSDWA : public MachineFunctionPass {
+private:
+  MachineRegisterInfo *MRI;
+  const SIRegisterInfo *TRI;
+  const SIInstrInfo *TII;
+
+  std::unordered_map<MachineInstr *, std::unique_ptr<SDWAOperand>> SDWAOperands;
+
+  Optional<int64_t> foldToImm(const MachineOperand &Op) const;
+
+public:
+  static char ID;
+
+  typedef SmallVector<std::unique_ptr<SDWAOperand>, 4> SDWAOperandsVector;
+
+  SIPeepholeSDWA() : MachineFunctionPass(ID) {
+    initializeSIPeepholeSDWAPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  void matchSDWAOperands(MachineFunction &MF);
+  bool convertToSDWA(MachineInstr &MI, const SDWAOperandsVector &SDWAOperands);
+
+  StringRef getPassName() const override { return "SI Peephole SDWA"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+
+class SDWAOperand {
+private:
+  MachineOperand *Target; // Operand that would be used in converted instruction
+  MachineOperand *Replaced; // Operand that would be replaced by Target
+
+public:
+  SDWAOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp)
+      : Target(TargetOp), Replaced(ReplacedOp) {
+    assert(Target->isReg());
+    assert(Replaced->isReg());
+  }
+
+  virtual ~SDWAOperand() {}
+
+  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) = 0;
+  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) = 0;
+
+  MachineOperand *getTargetOperand() const { return Target; }
+  MachineOperand *getReplacedOperand() const { return Replaced; }
+  MachineInstr *getParentInst() const { return Target->getParent(); }
+  MachineRegisterInfo *getMRI() const {
+    return &getParentInst()->getParent()->getParent()->getRegInfo();
+  }
+};
+
+using namespace AMDGPU::SDWA;
+
+class SDWASrcOperand : public SDWAOperand {
+private:
+  SdwaSel SrcSel;
+  bool Abs;
+  bool Neg;
+  bool Sext;
+
+public:
+  SDWASrcOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
+                 SdwaSel SrcSel_ = DWORD, bool Abs_ = false, bool Neg_ = false,
+                 bool Sext_ = false)
+      : SDWAOperand(TargetOp, ReplacedOp), SrcSel(SrcSel_), Abs(Abs_),
+        Neg(Neg_), Sext(Sext_) {}
+
+  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+
+  SdwaSel getSrcSel() const { return SrcSel; }
+  bool getAbs() const { return Abs; }
+  bool getNeg() const { return Neg; }
+  bool getSext() const { return Sext; }
+
+  uint64_t getSrcMods() const;
+};
+
+class SDWADstOperand : public SDWAOperand {
+private:
+  SdwaSel DstSel;
+  DstUnused DstUn;
+
+public:
+  SDWADstOperand(MachineOperand *TargetOp, MachineOperand *ReplacedOp,
+                 SdwaSel DstSel_ = DWORD, DstUnused DstUn_ = UNUSED_PAD)
+      : SDWAOperand(TargetOp, ReplacedOp), DstSel(DstSel_), DstUn(DstUn_) {}
+
+  virtual MachineInstr *potentialToConvert(const SIInstrInfo *TII) override;
+  virtual bool convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) override;
+
+  SdwaSel getDstSel() const { return DstSel; }
+  DstUnused getDstUnused() const { return DstUn; }
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIPeepholeSDWA, DEBUG_TYPE, "SI Peephole SDWA", false, false)
+
+char SIPeepholeSDWA::ID = 0;
+
+char &llvm::SIPeepholeSDWAID = SIPeepholeSDWA::ID;
+
+FunctionPass *llvm::createSIPeepholeSDWAPass() {
+  return new SIPeepholeSDWA();
+}
+
+#ifndef NDEBUG
+
+static raw_ostream& operator<<(raw_ostream &OS, const SdwaSel &Sel) {
+  switch(Sel) {
+  case BYTE_0: OS << "BYTE_0"; break;
+  case BYTE_1: OS << "BYTE_1"; break;
+  case BYTE_2: OS << "BYTE_2"; break;
+  case BYTE_3: OS << "BYTE_3"; break;
+  case WORD_0: OS << "WORD_0"; break;
+  case WORD_1: OS << "WORD_1"; break;
+  case DWORD:  OS << "DWORD"; break;
+  }
+  return OS;
+}
+
+static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
+  switch(Un) {
+  case UNUSED_PAD: OS << "UNUSED_PAD"; break;
+  case UNUSED_SEXT: OS << "UNUSED_SEXT"; break;
+  case UNUSED_PRESERVE: OS << "UNUSED_PRESERVE"; break;
+  }
+  return OS;
+}
+
+static raw_ostream& operator<<(raw_ostream &OS, const SDWASrcOperand &Src) {
+  OS << "SDWA src: " << *Src.getTargetOperand()
+     << " src_sel:" << Src.getSrcSel()
+     << " abs:" << Src.getAbs() << " neg:" << Src.getNeg()
+     << " sext:" << Src.getSext() << '\n';
+  return OS;
+}
+
+static raw_ostream& operator<<(raw_ostream &OS, const SDWADstOperand &Dst) {
+  OS << "SDWA dst: " << *Dst.getTargetOperand()
+     << " dst_sel:" << Dst.getDstSel()
+     << " dst_unused:" << Dst.getDstUnused() << '\n';
+  return OS;
+}
+
+#endif
+
+static void copyRegOperand(MachineOperand &To, const MachineOperand &From) {
+  assert(To.isReg() && From.isReg());
+  To.setReg(From.getReg());
+  To.setSubReg(From.getSubReg());
+  To.setIsUndef(From.isUndef());
+  if (To.isUse()) {
+    To.setIsKill(From.isKill());
+  } else {
+    To.setIsDead(From.isDead());
+  }
+}
+
+static bool isSameReg(const MachineOperand &LHS, const MachineOperand &RHS) {
+  return LHS.isReg() &&
+         RHS.isReg() &&
+         LHS.getReg() == RHS.getReg() &&
+         LHS.getSubReg() == RHS.getSubReg();
+}
+
+static bool isSubregOf(const MachineOperand &SubReg,
+                       const MachineOperand &SuperReg,
+                       const TargetRegisterInfo *TRI) {
+  
+  if (!SuperReg.isReg() || !SubReg.isReg())
+    return false;
+
+  if (isSameReg(SuperReg, SubReg))
+    return true;
+
+  if (SuperReg.getReg() != SubReg.getReg())
+    return false;
+
+  LaneBitmask SuperMask = TRI->getSubRegIndexLaneMask(SuperReg.getSubReg());
+  LaneBitmask SubMask = TRI->getSubRegIndexLaneMask(SubReg.getSubReg());
+  SuperMask |= ~SubMask;
+  return SuperMask.all();
+}
+
+uint64_t SDWASrcOperand::getSrcMods() const {
+  uint64_t Mods = 0;
+  if (Abs || Neg) {
+    assert(!Sext &&
+           "Float and integer src modifiers can't be set simulteniously");
+    Mods |= Abs ? SISrcMods::ABS : 0;
+    Mods |= Neg ? SISrcMods::NEG : 0;
+  } else if (Sext) {
+    Mods |= SISrcMods::SEXT;
+  }
+
+  return Mods;
+}
+
+MachineInstr *SDWASrcOperand::potentialToConvert(const SIInstrInfo *TII) {
+  // For an SDWA src operand, the potential instruction is the one that uses
+  // the register defined by the parent instruction.
+  MachineRegisterInfo *MRI = getMRI();
+  MachineOperand *Replaced = getReplacedOperand();
+  assert(Replaced->isReg());
+
+  MachineInstr *PotentialMI = nullptr;
+  for (MachineOperand &PotentialMO : MRI->use_operands(Replaced->getReg())) {
+    // If this is a use of another subreg of the dst reg then do nothing
+    if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
+      continue;
+
+    // If there exists a use of a superreg of dst then we should not combine
+    // this operand
+    if (!isSameReg(PotentialMO, *Replaced))
+      return nullptr;
+
+    // Check that PotentialMI is the only instruction that uses dst reg
+    if (PotentialMI == nullptr) {
+      PotentialMI = PotentialMO.getParent();
+    } else if (PotentialMI != PotentialMO.getParent()) {
+      return nullptr;
+    }
+  }
+
+  return PotentialMI;
+}
+
+bool SDWASrcOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
+  // Find operand in instruction that matches source operand and replace it with
+  // target operand. Set corresponding src_sel
+
+  MachineOperand *Src = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  MachineOperand *SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src0_sel);
+  MachineOperand *SrcMods =
+      TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers);
+  assert(Src && Src->isReg());
+  if (!isSameReg(*Src, *getReplacedOperand())) {
+    // If this is not src0 then it should be src1
+    Src = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+    SrcSel = TII->getNamedOperand(MI, AMDGPU::OpName::src1_sel);
+    SrcMods = TII->getNamedOperand(MI, AMDGPU::OpName::src1_modifiers);
+
+    assert(Src && Src->isReg());
+
+    if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+         MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
+        !isSameReg(*Src, *getReplacedOperand())) {
+      // In case of v_mac_f16/32_sdwa this pass can try to apply src operand to
+      // src2. This is not allowed.
+      return false;
+    }
+
+    assert(isSameReg(*Src, *getReplacedOperand()) && SrcSel && SrcMods);
+  }
+  copyRegOperand(*Src, *getTargetOperand());
+  SrcSel->setImm(getSrcSel());
+  SrcMods->setImm(getSrcMods());
+  getTargetOperand()->setIsKill(false);
+  return true;
+}
+
+MachineInstr *SDWADstOperand::potentialToConvert(const SIInstrInfo *TII) {
+  // For an SDWA dst operand, the potential instruction is the one that
+  // defines the register that this operand uses.
+  MachineRegisterInfo *MRI = getMRI();
+  MachineInstr *ParentMI = getParentInst();
+  MachineOperand *Replaced = getReplacedOperand();
+  assert(Replaced->isReg());
+
+  for (MachineOperand &PotentialMO : MRI->def_operands(Replaced->getReg())) {
+    if (!isSubregOf(*Replaced, PotentialMO, MRI->getTargetRegisterInfo()))
+      continue;
+
+    if (!isSameReg(*Replaced, PotentialMO))
+      return nullptr;
+
+    // Check that ParentMI is the only instruction that uses replaced register
+    for (MachineOperand &UseMO : MRI->use_operands(PotentialMO.getReg())) {
+      if (isSubregOf(UseMO, PotentialMO, MRI->getTargetRegisterInfo()) &&
+          UseMO.getParent() != ParentMI) {
+        return nullptr;
+      }
+    }
+
+    // Due to SSA this should be the only def of replaced register; return it
+    return PotentialMO.getParent();
+  }
+
+  return nullptr;
+}
+
+bool SDWADstOperand::convertToSDWA(MachineInstr &MI, const SIInstrInfo *TII) {
+  // Replace vdst operand in MI with target operand. Set dst_sel and dst_unused
+
+  if ((MI.getOpcode() == AMDGPU::V_MAC_F16_sdwa ||
+       MI.getOpcode() == AMDGPU::V_MAC_F32_sdwa) &&
+      getDstSel() != AMDGPU::SDWA::DWORD) {
+    // v_mac_f16/32_sdwa allow dst_sel to be equal only to DWORD
+    return false;
+  }
+
+  MachineOperand *Operand = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+  assert(Operand &&
+         Operand->isReg() &&
+         isSameReg(*Operand, *getReplacedOperand()));
+  copyRegOperand(*Operand, *getTargetOperand());
+  MachineOperand *DstSel= TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel);
+  assert(DstSel);
+  DstSel->setImm(getDstSel());
+  MachineOperand *DstUnused= TII->getNamedOperand(MI, AMDGPU::OpName::dst_unused);
+  assert(DstUnused);
+  DstUnused->setImm(getDstUnused());
+
+  // Remove original instruction  because it would conflict with our new
+  // instruction by register definition
+  getParentInst()->eraseFromParent();
+  return true;
+}
+
+Optional<int64_t> SIPeepholeSDWA::foldToImm(const MachineOperand &Op) const {
+  if (Op.isImm()) {
+    return Op.getImm();
+  }
+
+  // If this is not immediate then it can be copy of immediate value, e.g.:
+  // %vreg1<def> = S_MOV_B32 255;
+  if (Op.isReg()) {
+    for (const MachineOperand &Def : MRI->def_operands(Op.getReg())) {
+      if (!isSameReg(Op, Def))
+        continue;
+
+      const MachineInstr *DefInst = Def.getParent();
+      if (!TII->isFoldableCopy(*DefInst))
+        return None;
+
+      const MachineOperand &Copied = DefInst->getOperand(1);
+      if (!Copied.isImm())
+        return None;
+
+      return Copied.getImm();
+    }
+  }
+
+  return None;
+}
+
+void SIPeepholeSDWA::matchSDWAOperands(MachineFunction &MF) {
+  for (MachineBasicBlock &MBB : MF) {
+    for (MachineInstr &MI : MBB) {
+      unsigned Opcode = MI.getOpcode();
+      switch (Opcode) {
+      case AMDGPU::V_LSHRREV_B32_e32:
+      case AMDGPU::V_ASHRREV_I32_e32:
+      case AMDGPU::V_LSHLREV_B32_e32: {
+        // from: v_lshrrev_b32_e32 v1, 16/24, v0
+        // to SDWA src:v0 src_sel:WORD_1/BYTE_3
+
+        // from: v_ashrrev_i32_e32 v1, 16/24, v0
+        // to SDWA src:v0 src_sel:WORD_1/BYTE_3 sext:1
+
+        // from: v_lshlrev_b32_e32 v1, 16/24, v0
+        // to SDWA dst:v1 dst_sel:WORD_1/BYTE_3 dst_unused:UNUSED_PAD
+        MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+        auto Imm = foldToImm(*Src0);
+        if (!Imm)
+          break;
+
+        if (*Imm != 16 && *Imm != 24)
+          break;
+
+        MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+        MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+        if (TRI->isPhysicalRegister(Src1->getReg()) ||
+            TRI->isPhysicalRegister(Dst->getReg()))
+          break;
+
+        if (Opcode == AMDGPU::V_LSHLREV_B32_e32) {
+          auto SDWADst = make_unique<SDWADstOperand>(
+              Dst, Src1, *Imm == 16 ? WORD_1 : BYTE_3, UNUSED_PAD);
+          DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
+          SDWAOperands[&MI] = std::move(SDWADst);
+          ++NumSDWAPatternsFound;
+        } else {
+          auto SDWASrc = make_unique<SDWASrcOperand>(
+              Src1, Dst, *Imm == 16 ? WORD_1 : BYTE_3, false, false,
+              Opcode == AMDGPU::V_LSHRREV_B32_e32 ? false : true);
+          DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+          SDWAOperands[&MI] = std::move(SDWASrc);
+          ++NumSDWAPatternsFound;
+        }
+        break;
+      }
+
+      case AMDGPU::V_LSHRREV_B16_e32:
+      case AMDGPU::V_ASHRREV_I16_e32:
+      case AMDGPU::V_LSHLREV_B16_e32: {
+        // from: v_lshrrev_b16_e32 v1, 8, v0
+        // to SDWA src:v0 src_sel:BYTE_1
+
+        // from: v_ashrrev_i16_e32 v1, 8, v0
+        // to SDWA src:v0 src_sel:BYTE_1 sext:1
+
+        // from: v_lshlrev_b16_e32 v1, 8, v0
+        // to SDWA dst:v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD
+        MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+        auto Imm = foldToImm(*Src0);
+        if (!Imm || *Imm != 8)
+          break;
+
+        MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+        MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+
+        if (TRI->isPhysicalRegister(Src1->getReg()) ||
+            TRI->isPhysicalRegister(Dst->getReg()))
+          break;
+
+        if (Opcode == AMDGPU::V_LSHLREV_B16_e32) {
+          auto SDWADst =
+              make_unique<SDWADstOperand>(Dst, Src1, BYTE_1, UNUSED_PAD);
+          DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWADst << '\n');
+          SDWAOperands[&MI] = std::move(SDWADst);
+          ++NumSDWAPatternsFound;
+        } else {
+          auto SDWASrc = make_unique<SDWASrcOperand>(
+              Src1, Dst, BYTE_1, false, false,
+              Opcode == AMDGPU::V_LSHRREV_B16_e32 ? false : true);
+          DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+          SDWAOperands[&MI] = std::move(SDWASrc);
+          ++NumSDWAPatternsFound;
+        }
+        break;
+      }
+
+      case AMDGPU::V_BFE_I32:
+      case AMDGPU::V_BFE_U32: {
+        // e.g.:
+        // from: v_bfe_u32 v1, v0, 8, 8
+        // to SDWA src:v0 src_sel:BYTE_1
+
+        // offset | width | src_sel
+        // ------------------------
+        // 0      | 8     | BYTE_0
+        // 0      | 16    | WORD_0
+        // 0      | 32    | DWORD ?
+        // 8      | 8     | BYTE_1
+        // 16     | 8     | BYTE_2
+        // 16     | 16    | WORD_1
+        // 24     | 8     | BYTE_3
+
+        MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+        auto Offset = foldToImm(*Src1);
+        if (!Offset)
+          break;
+
+        MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+        auto Width = foldToImm(*Src2);
+        if (!Width)
+          break;
+
+        SdwaSel SrcSel = DWORD;
+
+        if (*Offset == 0 && *Width == 8)
+          SrcSel = BYTE_0;
+        else if (*Offset == 0 && *Width == 16)
+          SrcSel = WORD_0;
+        else if (*Offset == 0 && *Width == 32)
+          SrcSel = DWORD;
+        else if (*Offset == 8 && *Width == 8)
+          SrcSel = BYTE_1;
+        else if (*Offset == 16 && *Width == 8)
+          SrcSel = BYTE_2;
+        else if (*Offset == 16 && *Width == 16)
+          SrcSel = WORD_1;
+        else if (*Offset == 24 && *Width == 8)
+          SrcSel = BYTE_3;
+        else
+          break;
+
+        MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+        MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+      
+        if (TRI->isPhysicalRegister(Src0->getReg()) ||
+            TRI->isPhysicalRegister(Dst->getReg()))
+          break;
+
+        auto SDWASrc = make_unique<SDWASrcOperand>(
+            Src0, Dst, SrcSel, false, false,
+            Opcode == AMDGPU::V_BFE_U32 ? false : true);
+        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+        SDWAOperands[&MI] = std::move(SDWASrc);
+        ++NumSDWAPatternsFound;
+        break;
+      }
+      case AMDGPU::V_AND_B32_e32: {
+        // e.g.:
+        // from: v_and_b32_e32 v1, 0x0000ffff/0x000000ff, v0
+        // to SDWA src:v0 src_sel:WORD_0/BYTE_0
+
+        MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+        auto Imm = foldToImm(*Src0);
+        if (!Imm)
+          break;
+
+        if (*Imm != 0x0000ffff && *Imm != 0x000000ff)
+          break;
+
+        MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+        MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+      
+        if (TRI->isPhysicalRegister(Src1->getReg()) ||
+            TRI->isPhysicalRegister(Dst->getReg()))
+          break;
+
+        auto SDWASrc = make_unique<SDWASrcOperand>(
+            Src1, Dst, *Imm == 0x0000ffff ? WORD_0 : BYTE_0);
+        DEBUG(dbgs() << "Match: " << MI << "To: " << *SDWASrc << '\n');
+        SDWAOperands[&MI] = std::move(SDWASrc);
+        ++NumSDWAPatternsFound;
+        break;
+      }
+      }
+    }
+  }
+}
+
+/// Replace \p MI with its SDWA form using \p SDWAOperands; true on success.
+bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
+                                   const SDWAOperandsVector &SDWAOperands) {
+  // Check if this instruction can be converted to SDWA:
+  // 1. Does this opcode support SDWA?
+  if (AMDGPU::getSDWAOp(MI.getOpcode()) == -1)
+    return false;
+
+  // 2. Are all explicit operands registers, and are those registers VGPRs?
+  for (const MachineOperand &Operand : MI.explicit_operands()) {
+    if (!Operand.isReg() || !TRI->isVGPR(*MRI, Operand.getReg()))
+      return false;
+  }
+
+  // Convert to SDWA.
+  int SDWAOpcode = AMDGPU::getSDWAOp(MI.getOpcode());
+  assert(SDWAOpcode != -1);
+
+  const MCInstrDesc &SDWADesc = TII->get(SDWAOpcode);
+
+  // Create the SDWA version of MI (inserted before MI) and fill its operands.
+  MachineInstrBuilder SDWAInst =
+    BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
+
+  // Copy dst; if it is present in the original, it must also exist in SDWA.
+  MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
+  if (Dst) {
+    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::vdst) != -1);
+    SDWAInst.add(*Dst);
+  } else {
+    assert(TII->isVOPC(MI));
+  }
+
+  // Copy src0, initialize src0_modifiers. All SDWA instructions have src0 and
+  // src0_modifiers (except for v_nop_sdwa, but it can't get here).
+  MachineOperand *Src0 = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+  assert(
+    Src0 &&
+    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0) != -1 &&
+    AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_modifiers) != -1);
+  SDWAInst.addImm(0); // src0_modifiers
+  SDWAInst.add(*Src0);
+
+  // Copy src1 if present, initialize src1_modifiers.
+  MachineOperand *Src1 = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
+  if (Src1) {
+    assert(
+      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1) != -1 &&
+      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_modifiers) != -1);
+    SDWAInst.addImm(0); // src1_modifiers
+    SDWAInst.add(*Src1);
+  } else {
+    assert(TII->isVOP1(MI));
+  }
+
+  if (SDWAOpcode == AMDGPU::V_MAC_F16_sdwa ||
+      SDWAOpcode == AMDGPU::V_MAC_F32_sdwa) {
+    // v_mac_f16/32 has an additional src2 operand tied to vdst.
+    MachineOperand *Src2 = TII->getNamedOperand(MI, AMDGPU::OpName::src2);
+    assert(Src2);
+    SDWAInst.add(*Src2);
+  }
+
+  // Initialize clamp.
+  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::clamp) != -1);
+  SDWAInst.addImm(0); // clamp
+
+  // Initialize dst_sel and dst_unused if present.
+  if (Dst) {
+    assert(
+      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_sel) != -1 &&
+      AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::dst_unused) != -1);
+    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+    SDWAInst.addImm(AMDGPU::SDWA::DstUnused::UNUSED_PAD);
+  }
+
+  // Initialize src0_sel.
+  assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src0_sel) != -1);
+  SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+
+  // Initialize src1_sel if present.
+  if (Src1) {
+    assert(AMDGPU::getNamedOperandIdx(SDWAOpcode, AMDGPU::OpName::src1_sel) != -1);
+    SDWAInst.addImm(AMDGPU::SDWA::SdwaSel::DWORD);
+  }
+
+  // Apply all SDWA operand patterns; succeed if at least one was applied.
+  bool Converted = false;
+  for (auto &Operand : SDWAOperands) {
+    Converted |= Operand->convertToSDWA(*SDWAInst, TII);
+  }
+  if (!Converted) {
+    SDWAInst->eraseFromParent(); // nothing applied: discard the new instruction
+    return false;
+  }
+
+  DEBUG(dbgs() << "Convert instruction:" << MI
+               << "Into:" << *SDWAInst << '\n');
+  ++NumSDWAInstructionsPeepholed;
+
+  MI.eraseFromParent(); // the SDWA instruction built above takes MI's place
+  return true;
+}
+
+/// Run the SDWA peephole on \p MF; returns true if the function was changed.
+bool SIPeepholeSDWA::runOnMachineFunction(MachineFunction &MF) {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+  // TODO: Add support for SDWA on gfx9
+  if (!ST.hasSDWA() || !AMDGPU::isVI(ST))
+    return false;
+
+  MRI = &MF.getRegInfo();
+  TRI = ST.getRegisterInfo();
+  TII = ST.getInstrInfo();
+
+  std::unordered_map<MachineInstr *, SDWAOperandsVector> PotentialMatches;
+
+  matchSDWAOperands(MF);
+
+  // Group matched operands by the instruction potentialToConvert() returns.
+  for (auto &OperandPair : SDWAOperands) {
+    auto &Operand = OperandPair.second;
+    MachineInstr *PotentialMI = Operand->potentialToConvert(TII);
+    if (PotentialMI)
+      PotentialMatches[PotentialMI].push_back(std::move(Operand));
+  }
+
+  // Track whether any instruction was rewritten so the pass can report it.
+  bool Changed = false;
+  for (auto &PotentialPair : PotentialMatches)
+    Changed |= convertToSDWA(*PotentialPair.first, PotentialPair.second);
+
+  SDWAOperands.clear();
+  return Changed;
+}
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.cpp b/lib/Target/AMDGPU/SIRegisterInfo.cpp
index f5df0f8d1f8a6d8e108ce7e36f918622fa72a2e2..39324cbbcc02427a508be61d3607d3d70e2a0b27 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -24,12 +24,6 @@
 
 using namespace llvm;
 
-static cl::opt<bool> EnableSpillSGPRToSMEM(
-  "amdgpu-spill-sgpr-to-smem",
-  cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
-  cl::init(false));
-
-
 static bool hasPressureSet(const int *PSets, unsigned PSetID) {
   for (unsigned i = 0; PSets[i] != -1; ++i) {
     if (PSets[i] == (int)PSetID)
@@ -49,9 +43,28 @@ void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
   }
 }
 
-SIRegisterInfo::SIRegisterInfo() : AMDGPURegisterInfo(),
-                                   SGPRPressureSets(getNumRegPressureSets()),
-                                   VGPRPressureSets(getNumRegPressureSets()) {
+static cl::opt<bool> EnableSpillSGPRToSMEM(
+  "amdgpu-spill-sgpr-to-smem",
+  cl::desc("Use scalar stores to spill SGPRs if supported by subtarget"),
+  cl::init(false));
+
+static cl::opt<bool> EnableSpillSGPRToVGPR(
+  "amdgpu-spill-sgpr-to-vgpr",
+  cl::desc("Enable spilling SGPRs to VGPRs"),
+  cl::ReallyHidden,
+  cl::init(true));
+
+SIRegisterInfo::SIRegisterInfo(const SISubtarget &ST) :
+  AMDGPURegisterInfo(),
+  SGPRPressureSets(getNumRegPressureSets()),
+  VGPRPressureSets(getNumRegPressureSets()),
+  SpillSGPRToVGPR(false),
+  SpillSGPRToSMEM(false) {
+  if (EnableSpillSGPRToSMEM && ST.hasScalarStores())
+    SpillSGPRToSMEM = true;
+  else if (EnableSpillSGPRToVGPR)
+    SpillSGPRToVGPR = true;
+
   unsigned NumRegPressureSets = getNumRegPressureSets();
 
   SGPRSetID = NumRegPressureSets;
@@ -97,14 +110,18 @@ void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) co
 
 unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
   const MachineFunction &MF) const {
-  unsigned BaseIdx = alignDown(getMaxNumSGPRs(MF), 4) - 4;
+
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
   unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
   return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SReg_128RegClass);
 }
 
 unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
   const MachineFunction &MF) const {
-  unsigned RegCount = getMaxNumSGPRs(MF);
+
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  unsigned RegCount = ST.getMaxNumSGPRs(MF);
   unsigned Reg;
 
   // Try to place it in a hole after PrivateSegmentbufferReg.
@@ -129,6 +146,12 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   reserveRegisterTuples(Reserved, AMDGPU::EXEC);
   reserveRegisterTuples(Reserved, AMDGPU::FLAT_SCR);
 
+  // Reserve the memory aperture registers.
+  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_BASE);
+  reserveRegisterTuples(Reserved, AMDGPU::SRC_SHARED_LIMIT);
+  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_BASE);
+  reserveRegisterTuples(Reserved, AMDGPU::SRC_PRIVATE_LIMIT);
+
   // Reserve Trap Handler registers - support is not implemented in Codegen.
   reserveRegisterTuples(Reserved, AMDGPU::TBA);
   reserveRegisterTuples(Reserved, AMDGPU::TMA);
@@ -139,14 +162,16 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
   reserveRegisterTuples(Reserved, AMDGPU::TTMP8_TTMP9);
   reserveRegisterTuples(Reserved, AMDGPU::TTMP10_TTMP11);
 
-  unsigned MaxNumSGPRs = getMaxNumSGPRs(MF);
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+
+  unsigned MaxNumSGPRs = ST.getMaxNumSGPRs(MF);
   unsigned TotalNumSGPRs = AMDGPU::SGPR_32RegClass.getNumRegs();
   for (unsigned i = MaxNumSGPRs; i < TotalNumSGPRs; ++i) {
     unsigned Reg = AMDGPU::SGPR_32RegClass.getRegister(i);
     reserveRegisterTuples(Reserved, Reg);
   }
 
-  unsigned MaxNumVGPRs = getMaxNumVGPRs(MF);
+  unsigned MaxNumVGPRs = ST.getMaxNumVGPRs(MF);
   unsigned TotalNumVGPRs = AMDGPU::VGPR_32RegClass.getNumRegs();
   for (unsigned i = MaxNumVGPRs; i < TotalNumVGPRs; ++i) {
     unsigned Reg = AMDGPU::VGPR_32RegClass.getRegister(i);
@@ -545,11 +570,20 @@ static std::pair<unsigned, unsigned> getSpillEltSize(unsigned SuperRegSize,
                       AMDGPU::S_BUFFER_LOAD_DWORD_SGPR};
 }
 
-void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
+bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
                                int Index,
-                               RegScavenger *RS) const {
+                               RegScavenger *RS,
+                               bool OnlyToVGPR) const {
   MachineBasicBlock *MBB = MI->getParent();
   MachineFunction *MF = MBB->getParent();
+  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+
+  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
+    = MFI->getSGPRToVGPRSpills(Index);
+  bool SpillToVGPR = !VGPRSpills.empty();
+  if (OnlyToVGPR && !SpillToVGPR)
+    return false;
+
   MachineRegisterInfo &MRI = MF->getRegInfo();
   const SISubtarget &ST =  MF->getSubtarget<SISubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
@@ -558,10 +592,11 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
   bool IsKill = MI->getOperand(0).isKill();
   const DebugLoc &DL = MI->getDebugLoc();
 
-  SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
 
-  bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
+  bool SpillToSMEM = spillSGPRToSMEM();
+  if (SpillToSMEM && OnlyToVGPR)
+    return false;
 
   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
 
@@ -634,9 +669,9 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
       continue;
     }
 
-    struct SIMachineFunctionInfo::SpilledReg Spill =
-      MFI->getSpilledReg(MF, Index, i);
-    if (Spill.hasReg()) {
+    if (SpillToVGPR) {
+      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
+
       BuildMI(*MBB, MI, DL,
               TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
               Spill.VGPR)
@@ -647,6 +682,10 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
       // frame index, we should delete the frame index when all references to
       // it are fixed.
     } else {
+      // XXX - Can the VGPR spill fail for some subregisters but not others?
+      if (OnlyToVGPR)
+        return false;
+
       // Spill SGPR to a frame index.
       // TODO: Should VI try to spill to VGPR and then spill to SMEM?
       unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -690,22 +729,33 @@ void SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
 
   MI->eraseFromParent();
   MFI->addToSpilledSGPRs(NumSubRegs);
+  return true;
 }
 
-void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
+bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
                                  int Index,
-                                 RegScavenger *RS) const {
+                                 RegScavenger *RS,
+                                 bool OnlyToVGPR) const {
   MachineFunction *MF = MI->getParent()->getParent();
   MachineRegisterInfo &MRI = MF->getRegInfo();
   MachineBasicBlock *MBB = MI->getParent();
   SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+
+  ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
+    = MFI->getSGPRToVGPRSpills(Index);
+  bool SpillToVGPR = !VGPRSpills.empty();
+  if (OnlyToVGPR && !SpillToVGPR)
+    return false;
+
   MachineFrameInfo &FrameInfo = MF->getFrameInfo();
   const SISubtarget &ST =  MF->getSubtarget<SISubtarget>();
   const SIInstrInfo *TII = ST.getInstrInfo();
   const DebugLoc &DL = MI->getDebugLoc();
 
   unsigned SuperReg = MI->getOperand(0).getReg();
-  bool SpillToSMEM = ST.hasScalarStores() && EnableSpillSGPRToSMEM;
+  bool SpillToSMEM = spillSGPRToSMEM();
+  if (SpillToSMEM && OnlyToVGPR)
+    return false;
 
   assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
 
@@ -773,10 +823,8 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
       continue;
     }
 
-    SIMachineFunctionInfo::SpilledReg Spill
-      = MFI->getSpilledReg(MF, Index, i);
-
-    if (Spill.hasReg()) {
+    if (SpillToVGPR) {
+      SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
       auto MIB =
         BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
                 SubReg)
@@ -786,6 +834,9 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
       if (NumSubRegs > 1)
         MIB.addReg(SuperReg, RegState::ImplicitDefine);
     } else {
+      if (OnlyToVGPR)
+        return false;
+
       // Restore SGPR from a stack slot.
       // FIXME: We should use S_LOAD_DWORD here for VI.
       unsigned TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
@@ -820,6 +871,32 @@ void SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
   }
 
   MI->eraseFromParent();
+  return true;
+}
+
+/// Special case of eliminateFrameIndex for SGPR spill save/restore pseudos.
+/// Returns true if the SGPR was spilled to a VGPR, so the stack slot can be
+/// safely eliminated once all other users are handled.
+bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
+  MachineBasicBlock::iterator MI,
+  int FI,
+  RegScavenger *RS) const {
+  switch (MI->getOpcode()) {
+  case AMDGPU::SI_SPILL_S512_SAVE:
+  case AMDGPU::SI_SPILL_S256_SAVE:
+  case AMDGPU::SI_SPILL_S128_SAVE:
+  case AMDGPU::SI_SPILL_S64_SAVE:
+  case AMDGPU::SI_SPILL_S32_SAVE:
+    return spillSGPR(MI, FI, RS, /*OnlyToVGPR=*/true);
+  case AMDGPU::SI_SPILL_S512_RESTORE:
+  case AMDGPU::SI_SPILL_S256_RESTORE:
+  case AMDGPU::SI_SPILL_S128_RESTORE:
+  case AMDGPU::SI_SPILL_S64_RESTORE:
+  case AMDGPU::SI_SPILL_S32_RESTORE:
+    return restoreSGPR(MI, FI, RS, /*OnlyToVGPR=*/true);
+  default:
+    llvm_unreachable("not an SGPR spill instruction");
+  }
+}
 
 void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
@@ -1156,210 +1233,6 @@ SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
   return AMDGPU::NoRegister;
 }
 
-unsigned SIRegisterInfo::getTotalNumSGPRs(const SISubtarget &ST) const {
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
-    return 800;
-  return 512;
-}
-
-unsigned SIRegisterInfo::getNumAddressableSGPRs(const SISubtarget &ST) const {
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
-    return 102;
-  return 104;
-}
-
-unsigned SIRegisterInfo::getNumReservedSGPRs(const SISubtarget &ST,
-                                             const SIMachineFunctionInfo &MFI) const {
-  if (MFI.hasFlatScratchInit()) {
-    if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
-      return 6; // FLAT_SCRATCH, XNACK, VCC (in that order)
-
-    if (ST.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS)
-      return 4; // FLAT_SCRATCH, VCC (in that order)
-  }
-
-  if (ST.isXNACKEnabled())
-    return 4; // XNACK, VCC (in that order)
-
-  return 2; // VCC.
-}
-
-unsigned SIRegisterInfo::getMinNumSGPRs(const SISubtarget &ST,
-                                        unsigned WavesPerEU) const {
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    switch (WavesPerEU) {
-      case 0:  return 0;
-      case 10: return 0;
-      case 9:  return 0;
-      case 8:  return 81;
-      default: return 97;
-    }
-  } else {
-    switch (WavesPerEU) {
-      case 0:  return 0;
-      case 10: return 0;
-      case 9:  return 49;
-      case 8:  return 57;
-      case 7:  return 65;
-      case 6:  return 73;
-      case 5:  return 81;
-      default: return 97;
-    }
-  }
-}
-
-unsigned SIRegisterInfo::getMaxNumSGPRs(const SISubtarget &ST,
-                                        unsigned WavesPerEU,
-                                        bool Addressable) const {
-  if (ST.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS) {
-    switch (WavesPerEU) {
-      case 0:  return 80;
-      case 10: return 80;
-      case 9:  return 80;
-      case 8:  return 96;
-      default: return Addressable ? getNumAddressableSGPRs(ST) : 112;
-    }
-  } else {
-    switch (WavesPerEU) {
-      case 0:  return 48;
-      case 10: return 48;
-      case 9:  return 56;
-      case 8:  return 64;
-      case 7:  return 72;
-      case 6:  return 80;
-      case 5:  return 96;
-      default: return getNumAddressableSGPRs(ST);
-    }
-  }
-}
-
-unsigned SIRegisterInfo::getMaxNumSGPRs(const MachineFunction &MF) const {
-  const Function &F = *MF.getFunction();
-
-  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
-  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
-
-  // Compute maximum number of SGPRs function can use using default/requested
-  // minimum number of waves per execution unit.
-  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
-  unsigned MaxNumSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, false);
-  unsigned MaxNumAddressableSGPRs = getMaxNumSGPRs(ST, WavesPerEU.first, true);
-
-  // Check if maximum number of SGPRs was explicitly requested using
-  // "amdgpu-num-sgpr" attribute.
-  if (F.hasFnAttribute("amdgpu-num-sgpr")) {
-    unsigned Requested = AMDGPU::getIntegerAttribute(
-      F, "amdgpu-num-sgpr", MaxNumSGPRs);
-
-    // Make sure requested value does not violate subtarget's specifications.
-    if (Requested && (Requested <= getNumReservedSGPRs(ST, MFI)))
-      Requested = 0;
-
-    // If more SGPRs are required to support the input user/system SGPRs,
-    // increase to accommodate them.
-    //
-    // FIXME: This really ends up using the requested number of SGPRs + number
-    // of reserved special registers in total. Theoretically you could re-use
-    // the last input registers for these special registers, but this would
-    // require a lot of complexity to deal with the weird aliasing.
-    unsigned NumInputSGPRs = MFI.getNumPreloadedSGPRs();
-    if (Requested && Requested < NumInputSGPRs)
-      Requested = NumInputSGPRs;
-
-    // Make sure requested value is compatible with values implied by
-    // default/requested minimum/maximum number of waves per execution unit.
-    if (Requested && Requested > getMaxNumSGPRs(ST, WavesPerEU.first, false))
-      Requested = 0;
-    if (WavesPerEU.second &&
-        Requested && Requested < getMinNumSGPRs(ST, WavesPerEU.second))
-      Requested = 0;
-
-    if (Requested)
-      MaxNumSGPRs = Requested;
-  }
-
-  if (ST.hasSGPRInitBug())
-    MaxNumSGPRs = SISubtarget::FIXED_SGPR_COUNT_FOR_INIT_BUG;
-
-  return std::min(MaxNumSGPRs - getNumReservedSGPRs(ST, MFI),
-                  MaxNumAddressableSGPRs);
-}
-
-unsigned SIRegisterInfo::getNumDebuggerReservedVGPRs(
-  const SISubtarget &ST) const {
-  if (ST.debuggerReserveRegs())
-    return 4;
-  return 0;
-}
-
-unsigned SIRegisterInfo::getMinNumVGPRs(unsigned WavesPerEU) const {
-  switch (WavesPerEU) {
-    case 0:  return 0;
-    case 10: return 0;
-    case 9:  return 25;
-    case 8:  return 29;
-    case 7:  return 33;
-    case 6:  return 37;
-    case 5:  return 41;
-    case 4:  return 49;
-    case 3:  return 65;
-    case 2:  return 85;
-    default: return 129;
-  }
-}
-
-unsigned SIRegisterInfo::getMaxNumVGPRs(unsigned WavesPerEU) const {
-  switch (WavesPerEU) {
-    case 0:  return 24;
-    case 10: return 24;
-    case 9:  return 28;
-    case 8:  return 32;
-    case 7:  return 36;
-    case 6:  return 40;
-    case 5:  return 48;
-    case 4:  return 64;
-    case 3:  return 84;
-    case 2:  return 128;
-    default: return getTotalNumVGPRs();
-  }
-}
-
-unsigned SIRegisterInfo::getMaxNumVGPRs(const MachineFunction &MF) const {
-  const Function &F = *MF.getFunction();
-
-  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
-  const SIMachineFunctionInfo &MFI = *MF.getInfo<SIMachineFunctionInfo>();
-
-  // Compute maximum number of VGPRs function can use using default/requested
-  // minimum number of waves per execution unit.
-  std::pair<unsigned, unsigned> WavesPerEU = MFI.getWavesPerEU();
-  unsigned MaxNumVGPRs = getMaxNumVGPRs(WavesPerEU.first);
-
-  // Check if maximum number of VGPRs was explicitly requested using
-  // "amdgpu-num-vgpr" attribute.
-  if (F.hasFnAttribute("amdgpu-num-vgpr")) {
-    unsigned Requested = AMDGPU::getIntegerAttribute(
-      F, "amdgpu-num-vgpr", MaxNumVGPRs);
-
-    // Make sure requested value does not violate subtarget's specifications.
-    if (Requested && Requested <= getNumDebuggerReservedVGPRs(ST))
-      Requested = 0;
-
-    // Make sure requested value is compatible with values implied by
-    // default/requested minimum/maximum number of waves per execution unit.
-    if (Requested && Requested > getMaxNumVGPRs(WavesPerEU.first))
-      Requested = 0;
-    if (WavesPerEU.second &&
-        Requested && Requested < getMinNumVGPRs(WavesPerEU.second))
-      Requested = 0;
-
-    if (Requested)
-      MaxNumVGPRs = Requested;
-  }
-
-  return MaxNumVGPRs - getNumDebuggerReservedVGPRs(ST);
-}
-
 ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
                                                    unsigned EltSize) const {
   if (EltSize == 4) {
@@ -1496,3 +1369,42 @@ bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
 
   return NewSize <= DstSize || NewSize <= SrcSize;
 }
+
+/// Pressure limit for \p RC: min of the occupancy-based and per-function caps.
+unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
+                                             MachineFunction &MF) const {
+  const SISubtarget &ST = MF.getSubtarget<SISubtarget>();
+  const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+  unsigned Occupancy = ST.getOccupancyWithLocalMemSize(MFI->getLDSSize(),
+                                                       *MF.getFunction());
+  switch (RC->getID()) {
+  default: // every other register class defers to the base implementation
+    return AMDGPURegisterInfo::getRegPressureLimit(RC, MF);
+  case AMDGPU::VGPR_32RegClassID:
+    return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
+  case AMDGPU::SGPR_32RegClassID:
+    return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
+  }
+}
+
+/// Limit for pressure set \p Idx; VGPR/SGPR sets reuse getRegPressureLimit.
+unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
+                                                unsigned Idx) const {
+  // getRegPressureLimit takes a non-const MachineFunction, hence the cast.
+  auto &MutableMF = const_cast<MachineFunction &>(MF);
+  if (Idx == getVGPRPressureSet())
+    return getRegPressureLimit(&AMDGPU::VGPR_32RegClass, MutableMF);
+  if (Idx == getSGPRPressureSet())
+    return getRegPressureLimit(&AMDGPU::SGPR_32RegClass, MutableMF);
+
+  return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx);
+}
+
+/// Pressure sets of \p RegUnit; units belonging to M0 report an empty list.
+const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
+  static const int Empty[] = { -1 }; // holds only the -1 list terminator
+  if (hasRegUnit(AMDGPU::M0, RegUnit))
+    return Empty;
+  return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit);
+}
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.h b/lib/Target/AMDGPU/SIRegisterInfo.h
index 29c72b6a8f801262a63a4a4db627abcd98edd73a..679ed229758a0eef73f9e928942c43878d9e5706 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -21,8 +21,8 @@
 
 namespace llvm {
 
-class SISubtarget;
 class MachineRegisterInfo;
+class SISubtarget;
 class SIMachineFunctionInfo;
 
 class SIRegisterInfo final : public AMDGPURegisterInfo {
@@ -31,13 +31,22 @@ private:
   unsigned VGPRSetID;
   BitVector SGPRPressureSets;
   BitVector VGPRPressureSets;
+  bool SpillSGPRToVGPR;
+  bool SpillSGPRToSMEM;
 
   void reserveRegisterTuples(BitVector &, unsigned Reg) const;
   void classifyPressureSet(unsigned PSetID, unsigned Reg,
                            BitVector &PressureSets) const;
-
 public:
-  SIRegisterInfo();
+  SIRegisterInfo(const SISubtarget &ST);
+
+  /// True if SGPRs are spilled to VGPRs; set in the constructor from
+  /// -amdgpu-spill-sgpr-to-vgpr unless SMEM spilling was selected instead.
+  bool spillSGPRToVGPR() const { return SpillSGPRToVGPR; }
+
+  /// True if SGPRs are spilled with scalar stores; needs both
+  /// -amdgpu-spill-sgpr-to-smem and a subtarget with scalar stores.
+  bool spillSGPRToSMEM() const { return SpillSGPRToSMEM; }
 
   /// Return the end register initially reserved for the scratch buffer in case
   /// spilling is needed.
@@ -78,16 +87,22 @@ public:
   const TargetRegisterClass *getPointerRegClass(
     const MachineFunction &MF, unsigned Kind = 0) const override;
 
-  void spillSGPR(MachineBasicBlock::iterator MI,
-                 int FI, RegScavenger *RS) const;
+  /// If \p OnlyToVGPR is true, this only succeeds when spilling to a VGPR.
+  bool spillSGPR(MachineBasicBlock::iterator MI,
+                 int FI, RegScavenger *RS,
+                 bool OnlyToVGPR = false) const;
 
-  void restoreSGPR(MachineBasicBlock::iterator MI,
-                   int FI, RegScavenger *RS) const;
+  bool restoreSGPR(MachineBasicBlock::iterator MI,
+                   int FI, RegScavenger *RS,
+                   bool OnlyToVGPR = false) const;
 
   void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
                            unsigned FIOperandNum,
                            RegScavenger *RS) const override;
 
+  bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI,
+                                          int FI, RegScavenger *RS) const;
+
   unsigned getHWRegIndex(unsigned Reg) const {
     return getEncodingValue(Reg) & 0xff;
   }
@@ -195,72 +210,6 @@ public:
     return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID);
   }
 
-  /// \returns SGPR allocation granularity supported by the subtarget.
-  unsigned getSGPRAllocGranule() const {
-    return 8;
-  }
-
-  /// \returns Total number of SGPRs supported by the subtarget.
-  unsigned getTotalNumSGPRs(const SISubtarget &ST) const;
-
-  /// \returns Number of addressable SGPRs supported by the subtarget.
-  unsigned getNumAddressableSGPRs(const SISubtarget &ST) const;
-
-  /// \returns Number of reserved SGPRs supported by the subtarget.
-  unsigned getNumReservedSGPRs(const SISubtarget &ST,
-                               const SIMachineFunctionInfo &MFI) const;
-
-  /// \returns Minimum number of SGPRs that meets given number of waves per
-  /// execution unit requirement for given subtarget.
-  unsigned getMinNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU) const;
-
-  /// \returns Maximum number of SGPRs that meets given number of waves per
-  /// execution unit requirement for given subtarget.
-  unsigned getMaxNumSGPRs(const SISubtarget &ST, unsigned WavesPerEU,
-                          bool Addressable) const;
-
-  /// \returns Maximum number of SGPRs that meets number of waves per execution
-  /// unit requirement for function \p MF, or number of SGPRs explicitly
-  /// requested using "amdgpu-num-sgpr" attribute attached to function \p MF.
-  ///
-  /// \returns Value that meets number of waves per execution unit requirement
-  /// if explicitly requested value cannot be converted to integer, violates
-  /// subtarget's specifications, or does not meet number of waves per execution
-  /// unit requirement.
-  unsigned getMaxNumSGPRs(const MachineFunction &MF) const;
-
-  /// \returns VGPR allocation granularity supported by the subtarget.
-  unsigned getVGPRAllocGranule() const {
-    return 4;
-  }
-
-  /// \returns Total number of VGPRs supported by the subtarget.
-  unsigned getTotalNumVGPRs() const {
-    return 256;
-  }
-
-  /// \returns Number of reserved VGPRs for debugger use supported by the
-  /// subtarget.
-  unsigned getNumDebuggerReservedVGPRs(const SISubtarget &ST) const;
-
-  /// \returns Minimum number of SGPRs that meets given number of waves per
-  /// execution unit requirement.
-  unsigned getMinNumVGPRs(unsigned WavesPerEU) const;
-
-  /// \returns Maximum number of VGPRs that meets given number of waves per
-  /// execution unit requirement.
-  unsigned getMaxNumVGPRs(unsigned WavesPerEU) const;
-
-  /// \returns Maximum number of VGPRs that meets number of waves per execution
-  /// unit requirement for function \p MF, or number of VGPRs explicitly
-  /// requested using "amdgpu-num-vgpr" attribute attached to function \p MF.
-  ///
-  /// \returns Value that meets number of waves per execution unit requirement
-  /// if explicitly requested value cannot be converted to integer, violates
-  /// subtarget's specifications, or does not meet number of waves per execution
-  /// unit requirement.
-  unsigned getMaxNumVGPRs(const MachineFunction &MF) const;
-
   ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
                                      unsigned EltSize) const;
 
@@ -271,6 +220,14 @@ public:
                       unsigned DstSubReg,
                       const TargetRegisterClass *NewRC) const override;
 
+  unsigned getRegPressureLimit(const TargetRegisterClass *RC,
+                               MachineFunction &MF) const override;
+
+  unsigned getRegPressureSetLimit(const MachineFunction &MF,
+                                  unsigned Idx) const override;
+
+  const int *getRegUnitPressureSets(unsigned RegUnit) const override;
+
 private:
   void buildSpillLoadStore(MachineBasicBlock::iterator MI,
                            unsigned LoadStoreOp,
diff --git a/lib/Target/AMDGPU/SIRegisterInfo.td b/lib/Target/AMDGPU/SIRegisterInfo.td
index 31e714b9f6b92a4d52426043c3898334e13b5563..fc808011cd88950ebdc3beb9c4324938f9f13118 100644
--- a/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -44,6 +44,11 @@ def EXEC : RegisterWithSubRegs<"EXEC", [EXEC_LO, EXEC_HI]>,
 def SCC : SIReg<"scc", 253>;
 def M0 : SIReg <"m0", 124>;
 
+def SRC_SHARED_BASE : SIReg<"src_shared_base", 235>;
+def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>;
+def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>;
+def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>;
+
 // Trap handler registers
 def TBA_LO : SIReg<"tba_lo", 108>;
 def TBA_HI : SIReg<"tba_hi", 109>;
@@ -128,7 +133,7 @@ def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> {
 // TODO: Do we need to set DwarfRegAlias on register tuples?
 
 // SGPR 32-bit registers
-def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
                             (add (sequence "SGPR%u", 0, 103))> {
   // Give all SGPR classes higher priority than VGPR classes, because
   // we want to spill SGPRs to VGPRs.
@@ -179,7 +184,7 @@ def SGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
                                (add (decimate (shl SGPR_32, 15), 4))]>;
 
 // Trap handler TMP 32-bit registers
-def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32], 32,
+def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
                             (add (sequence "TTMP%u", 0, 11))> {
   let isAllocatable = 0;
 }
@@ -197,7 +202,8 @@ def TTMP_128Regs : RegisterTuples<[sub0, sub1, sub2, sub3],
                                (add (decimate (shl TTMP_32, 3), 4))]>;
 
 // VGPR 32-bit registers
-def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+// i16/f16 only on VI+
+def VGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
                             (add (sequence "VGPR%u", 0, 255))> {
   let AllocationPriority = 1;
   let Size = 32;
@@ -258,19 +264,20 @@ def VGPR_512 : RegisterTuples<[sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7,
 
 // Subset of SReg_32 without M0 for SMRD instructions and alike.
 // See comments in SIInstructions.td for more info.
-def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
   (add SGPR_32, VCC_LO, VCC_HI, FLAT_SCR_LO, FLAT_SCR_HI,
-   TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI)> {
+   TTMP_32, TMA_LO, TMA_HI, TBA_LO, TBA_HI, SRC_SHARED_BASE, SRC_SHARED_LIMIT,
+   SRC_PRIVATE_BASE, SRC_PRIVATE_LIMIT)> {
   let AllocationPriority = 7;
 }
 
-def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
   (add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
   let AllocationPriority = 7;
 }
 
 // Register class for all scalar registers (SGPRs + Special Registers)
-def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
   (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI)> {
   let AllocationPriority = 7;
 }
@@ -319,7 +326,7 @@ def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256)> {
   let AllocationPriority = 11;
 }
 
-def SReg_512 : RegisterClass<"AMDGPU", [v64i8, v16i32], 32, (add SGPR_512)> {
+def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32, (add SGPR_512)> {
   // Requires 8 s_mov_b64 to copy
   let CopyCost = 8;
   let AllocationPriority = 12;
@@ -366,7 +373,7 @@ def VReg_1 : RegisterClass<"AMDGPU", [i1], 32, (add VGPR_32)> {
   let Size = 32;
 }
 
-def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16], 32,
+def VS_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
                           (add VGPR_32, SReg_32)> {
   let isAllocatable = 0;
 }
@@ -417,6 +424,18 @@ multiclass SIRegOperand <string rc, string MatchName, string opType> {
       let OperandType = opType#"_FP64";
       let ParserMatchClass = RegImmMatcher<MatchName#"F64">;
     }
+
+    def _v2b16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+      let OperandType = opType#"_V2INT16";
+      let ParserMatchClass = RegImmMatcher<MatchName#"V2B16">;
+      let DecoderMethod = "decodeOperand_VSrcV216";
+    }
+
+    def _v2f16 : RegisterOperand<!cast<RegisterClass>(rc#"_32")> {
+      let OperandType = opType#"_V2FP16";
+      let ParserMatchClass = RegImmMatcher<MatchName#"V2F16">;
+      let DecoderMethod = "decodeOperand_VSrcV216";
+    }
   }
 }
 
diff --git a/lib/Target/AMDGPU/SMInstructions.td b/lib/Target/AMDGPU/SMInstructions.td
index 5dfae3f8f3f809b6853da2dd7eee3807194ef1d8..5b840a14dbc337b42048594882121cfc03b84100 100644
--- a/lib/Target/AMDGPU/SMInstructions.td
+++ b/lib/Target/AMDGPU/SMInstructions.td
@@ -226,9 +226,9 @@ def S_MEMREALTIME   : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>
 def smrd_load : PatFrag <(ops node:$ptr), (load node:$ptr), [{
   auto Ld = cast<LoadSDNode>(N);
   return Ld->getAlignment() >= 4  &&
-    ((Ld->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS &&
+    ((Ld->getAddressSpace() == AMDGPUASI.CONSTANT_ADDRESS &&
     static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N)) ||
-    (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
+    (Subtarget->getScalarizeGlobalBehavior() && Ld->getAddressSpace() == AMDGPUASI.GLOBAL_ADDRESS &&
     static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpUniform(N) &&
     static_cast<const SITargetLowering *>(getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)));
 }]>;
diff --git a/lib/Target/AMDGPU/SOPInstructions.td b/lib/Target/AMDGPU/SOPInstructions.td
index 73cd5774128e3816be6bd3a93229c5ca4c4bd39a..597d9ba668dc0a65ea158039ffe5023d3dd80f09 100644
--- a/lib/Target/AMDGPU/SOPInstructions.td
+++ b/lib/Target/AMDGPU/SOPInstructions.td
@@ -82,6 +82,12 @@ class SOP1_0_32 <string opName, list<dag> pattern = []> : SOP1_Pseudo <
   let has_sdst = 0;
 }
 
+class SOP1_0_32R <string opName, list<dag> pattern = []> : SOP1_Pseudo <
+  opName, (outs), (ins SReg_32:$src0),
+  "$src0", pattern> {
+  let has_sdst = 0;
+}
+
 class SOP1_64 <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
   opName, (outs SReg_64:$sdst), (ins SSrc_b64:$src0),
   "$sdst, $src0", pattern
@@ -210,7 +216,7 @@ def S_MOVRELD_B32 : SOP1_32 <"s_movreld_b32">;
 def S_MOVRELD_B64 : SOP1_64 <"s_movreld_b64">;
 } // End Uses = [M0]
 
-def S_CBRANCH_JOIN : SOP1_1  <"s_cbranch_join">;
+def S_CBRANCH_JOIN : SOP1_0_32R <"s_cbranch_join">;
 def S_MOV_REGRD_B32 : SOP1_32 <"s_mov_regrd_b32">;
 let Defs = [SCC] in {
 def S_ABS_I32 : SOP1_32 <"s_abs_i32">;
@@ -438,6 +444,22 @@ let Defs = [SCC] in {
 def S_ABSDIFF_I32 : SOP2_32 <"s_absdiff_i32">;
 } // End Defs = [SCC]
 
+let SubtargetPredicate = isVI in {
+  def S_RFE_RESTORE_B64 : SOP2_Pseudo <
+    "s_rfe_restore_b64", (outs),
+    (ins SSrc_b64:$src0, SSrc_b32:$src1),
+    "$src0, $src1"
+  > {
+    let hasSideEffects = 1;
+    let has_sdst = 0;
+  }
+}
+
+let SubtargetPredicate = isGFX9 in {
+  def S_PACK_LL_B32_B16 : SOP2_32<"s_pack_ll_b32_b16">;
+  def S_PACK_LH_B32_B16 : SOP2_32<"s_pack_lh_b32_b16">;
+  def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">;
+}
 
 //===----------------------------------------------------------------------===//
 // SOPK Instructions
@@ -751,6 +773,14 @@ def S_ENDPGM : SOPP <0x00000001, (ins), "s_endpgm",
   let isReturn = 1;
 }
 
+let SubtargetPredicate = isVI in {
+def S_ENDPGM_SAVED : SOPP <0x0000001B, (ins), "s_endpgm_saved"> {
+  let simm16 = 0;
+  let isBarrier = 1;
+  let isReturn = 1;
+}
+}
+
 let isBranch = 1, SchedRW = [WriteBranch] in {
 def S_BRANCH : SOPP <
   0x00000002, (ins sopp_brtarget:$simm16), "s_branch $simm16",
@@ -792,6 +822,25 @@ def S_CBRANCH_EXECNZ : SOPP <
 >;
 } // End Uses = [EXEC]
 
+def S_CBRANCH_CDBGSYS : SOPP <
+  0x00000017, (ins sopp_brtarget:$simm16),
+  "s_cbranch_cdbgsys $simm16"
+>;
+
+def S_CBRANCH_CDBGSYS_AND_USER : SOPP <
+  0x0000001A, (ins sopp_brtarget:$simm16),
+  "s_cbranch_cdbgsys_and_user $simm16"
+>;
+
+def S_CBRANCH_CDBGSYS_OR_USER : SOPP <
+  0x00000019, (ins sopp_brtarget:$simm16),
+  "s_cbranch_cdbgsys_or_user $simm16"
+>;
+
+def S_CBRANCH_CDBGUSER : SOPP <
+  0x00000018, (ins sopp_brtarget:$simm16),
+  "s_cbranch_cdbguser $simm16"
+>;
 
 } // End isBranch = 1
 } // End isTerminator = 1
@@ -806,9 +855,18 @@ def S_BARRIER : SOPP <0x0000000a, (ins), "s_barrier",
   let isConvergent = 1;
 }
 
+let SubtargetPredicate = isVI in {
+def S_WAKEUP : SOPP <0x00000003, (ins), "s_wakeup"> {
+  let simm16 = 0;
+  let mayLoad = 1;
+  let mayStore = 1;
+}
+}
+
 let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
 def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16">;
 def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
+def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">;
 
 // On SI the documentation says sleep for approximately 64 * low 2
 // bits, consistent with the reported maximum of 448. On VI the
@@ -1207,6 +1265,10 @@ def S_BFE_U64_vi           : SOP2_Real_vi <0x27, S_BFE_U64>;
 def S_BFE_I64_vi           : SOP2_Real_vi <0x28, S_BFE_I64>;
 def S_CBRANCH_G_FORK_vi    : SOP2_Real_vi <0x29, S_CBRANCH_G_FORK>;
 def S_ABSDIFF_I32_vi       : SOP2_Real_vi <0x2a, S_ABSDIFF_I32>;
+def S_PACK_LL_B32_B16_vi   : SOP2_Real_vi <0x32, S_PACK_LL_B32_B16>;
+def S_PACK_LH_B32_B16_vi   : SOP2_Real_vi <0x33, S_PACK_LH_B32_B16>;
+def S_PACK_HH_B32_B16_vi   : SOP2_Real_vi <0x34, S_PACK_HH_B32_B16>;
+def S_RFE_RESTORE_B64_vi   : SOP2_Real_vi <0x2b, S_RFE_RESTORE_B64>;
 
 def S_MOVK_I32_vi          : SOPK_Real_vi <0x00, S_MOVK_I32>;
 def S_CMOVK_I32_vi         : SOPK_Real_vi <0x01, S_CMOVK_I32>;
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 70ed40ec3b0af42d1408c64abff860269c255d12..86095a8e1142f2310670508e9cb911cdb14bdb30 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -1,4 +1,4 @@
-//===-- AMDGPUBaseInfo.cpp - AMDGPU Base encoding information--------------===//
+//===- AMDGPUBaseInfo.cpp - AMDGPU Base encoding information --------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -6,34 +6,42 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-#include "AMDGPUBaseInfo.h"
+
 #include "AMDGPU.h"
+#include "AMDGPUBaseInfo.h"
 #include "SIDefines.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/IR/Attributes.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Instruction.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
 #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <utility>
 
-#define GET_SUBTARGETINFO_ENUM
-#include "AMDGPUGenSubtargetInfo.inc"
-#undef GET_SUBTARGETINFO_ENUM
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
 
-#define GET_REGINFO_ENUM
-#include "AMDGPUGenRegisterInfo.inc"
-#undef GET_REGINFO_ENUM
 
 #define GET_INSTRINFO_NAMED_OPS
-#define GET_INSTRINFO_ENUM
 #include "AMDGPUGenInstrInfo.inc"
 #undef GET_INSTRINFO_NAMED_OPS
-#undef GET_INSTRINFO_ENUM
 
 namespace {
 
@@ -58,11 +66,11 @@ unsigned unpackBits(unsigned Src, unsigned Shift, unsigned Width) {
   return (Src & getBitMask(Shift, Width)) >> Shift;
 }
 
-/// \returns Vmcnt bit shift.
-unsigned getVmcntBitShift() { return 0; }
+/// \returns Vmcnt bit shift (lower bits).
+unsigned getVmcntBitShiftLo() { return 0; }
 
-/// \returns Vmcnt bit width.
-unsigned getVmcntBitWidth() { return 4; }
+/// \returns Vmcnt bit width (lower bits).
+unsigned getVmcntBitWidthLo() { return 4; }
 
 /// \returns Expcnt bit shift.
 unsigned getExpcntBitShift() { return 4; }
@@ -76,52 +84,224 @@ unsigned getLgkmcntBitShift() { return 8; }
 /// \returns Lgkmcnt bit width.
 unsigned getLgkmcntBitWidth() { return 4; }
 
-} // anonymous namespace
+/// \returns Vmcnt bit shift (higher bits).
+unsigned getVmcntBitShiftHi() { return 14; }
+
+/// \returns Vmcnt bit width (higher bits).
+unsigned getVmcntBitWidthHi() { return 2; }
+
+} // end anonymous namespace
 
 namespace llvm {
 namespace AMDGPU {
 
-IsaVersion getIsaVersion(const FeatureBitset &Features) {
+namespace IsaInfo {
 
+IsaVersion getIsaVersion(const FeatureBitset &Features) {
+  // CI.
   if (Features.test(FeatureISAVersion7_0_0))
     return {7, 0, 0};
-
   if (Features.test(FeatureISAVersion7_0_1))
     return {7, 0, 1};
-
   if (Features.test(FeatureISAVersion7_0_2))
     return {7, 0, 2};
 
+  // VI.
   if (Features.test(FeatureISAVersion8_0_0))
     return {8, 0, 0};
-
   if (Features.test(FeatureISAVersion8_0_1))
     return {8, 0, 1};
-
   if (Features.test(FeatureISAVersion8_0_2))
     return {8, 0, 2};
-
   if (Features.test(FeatureISAVersion8_0_3))
     return {8, 0, 3};
-
   if (Features.test(FeatureISAVersion8_0_4))
     return {8, 0, 4};
-
   if (Features.test(FeatureISAVersion8_1_0))
     return {8, 1, 0};
 
-  return {0, 0, 0};
+  // GFX9.
+  if (Features.test(FeatureISAVersion9_0_0))
+    return {9, 0, 0};
+  if (Features.test(FeatureISAVersion9_0_1))
+    return {9, 0, 1};
+
+  if (!Features.test(FeatureGCN) || Features.test(FeatureSouthernIslands))
+    return {0, 0, 0};
+  return {7, 0, 0};
+}
+
+unsigned getWavefrontSize(const FeatureBitset &Features) {
+  if (Features.test(FeatureWavefrontSize16))
+    return 16;
+  if (Features.test(FeatureWavefrontSize32))
+    return 32;
+  // Neither of the wavefront-size features tested above is set.
+  return 64;
+}
+
+unsigned getLocalMemorySize(const FeatureBitset &Features) {
+  if (Features.test(FeatureLocalMemorySize32768))
+    return 32768;
+  if (Features.test(FeatureLocalMemorySize65536))
+    return 65536;
+  // Neither of the local-memory-size features tested above is set.
+  return 0;
+}
+
+unsigned getEUsPerCU(const FeatureBitset &Features) {
+  return 4;
+}
+
+unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features,
+                               unsigned FlatWorkGroupSize) {
+  if (!Features.test(FeatureGCN))
+    return 8;
+  // Zero work items would yield zero waves and a division by zero below.
+  assert(FlatWorkGroupSize != 0 && "flat work group size must be non-zero");
+  unsigned N = getWavesPerWorkGroup(Features, FlatWorkGroupSize);
+  // Single-wave groups return 40; larger ones 40 / N, capped at 16.
+  return N == 1 ? 40 : std::min(40 / N, 16u);
+}
+
+unsigned getMaxWavesPerCU(const FeatureBitset &Features) {
+  return getMaxWavesPerEU(Features) * getEUsPerCU(Features);
+}
+
+unsigned getMaxWavesPerCU(const FeatureBitset &Features,
+                          unsigned FlatWorkGroupSize) {
+  return getWavesPerWorkGroup(Features, FlatWorkGroupSize);
+}
+
+unsigned getMinWavesPerEU(const FeatureBitset &Features) {
+  return 1;
+}
+
+unsigned getMaxWavesPerEU(const FeatureBitset &Features) {
+  if (!Features.test(FeatureGCN))
+    return 8;
+  // FIXME: Need to take scratch memory into account.
+  return 10;
+}
+
+unsigned getMaxWavesPerEU(const FeatureBitset &Features,
+                          unsigned FlatWorkGroupSize) {
+  const unsigned EUs = getEUsPerCU(Features); // Divisor for the round-up.
+  return alignTo(getMaxWavesPerCU(Features, FlatWorkGroupSize), EUs) / EUs;
+}
+
+unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features) {
+  return 1;
+}
+
+unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features) {
+  return 2048;
+}
+
+unsigned getWavesPerWorkGroup(const FeatureBitset &Features,
+                              unsigned FlatWorkGroupSize) {
+  const unsigned WaveSize = getWavefrontSize(Features);
+  return alignTo(FlatWorkGroupSize, WaveSize) / WaveSize; // Ceiling division.
+}
+
+unsigned getSGPRAllocGranule(const FeatureBitset &Features) {
+  IsaVersion Version = getIsaVersion(Features);
+  if (Version.Major >= 8)
+    return 16;
+  return 8;
+}
+
+unsigned getSGPREncodingGranule(const FeatureBitset &Features) {
+  return 8;
+}
+
+unsigned getTotalNumSGPRs(const FeatureBitset &Features) {
+  IsaVersion Version = getIsaVersion(Features);
+  if (Version.Major >= 8)
+    return 800;
+  return 512;
+}
+
+unsigned getAddressableNumSGPRs(const FeatureBitset &Features) {
+  if (Features.test(FeatureSGPRInitBug))
+    return FIXED_NUM_SGPRS_FOR_INIT_BUG;
+
+  IsaVersion Version = getIsaVersion(Features);
+  if (Version.Major >= 8)
+    return 102;
+  return 104;
+}
+
+unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
+  assert(WavesPerEU != 0);
+  // At or above the maximum wave count no minimum is imposed.
+  if (WavesPerEU >= getMaxWavesPerEU(Features))
+    return 0;
+  unsigned MinNumSGPRs =
+      alignDown(getTotalNumSGPRs(Features) / (WavesPerEU + 1),
+                getSGPRAllocGranule(Features)) + 1;
+  return std::min(MinNumSGPRs, getAddressableNumSGPRs(Features));
+}
+
+unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU,
+                        bool Addressable) {
+  assert(WavesPerEU != 0);
+  // Result: granule-aligned per-wave share of all SGPRs, capped below.
+  IsaVersion Version = getIsaVersion(Features);
+  unsigned MaxNumSGPRs = alignDown(getTotalNumSGPRs(Features) / WavesPerEU,
+                                   getSGPRAllocGranule(Features));
+  unsigned AddressableNumSGPRs = getAddressableNumSGPRs(Features);
+  if (Version.Major >= 8 && !Addressable)
+    AddressableNumSGPRs = 112;
+  return std::min(MaxNumSGPRs, AddressableNumSGPRs);
+}
+
+unsigned getVGPRAllocGranule(const FeatureBitset &Features) {
+  return 4;
+}
+
+unsigned getVGPREncodingGranule(const FeatureBitset &Features) {
+  return getVGPRAllocGranule(Features);
+}
+
+unsigned getTotalNumVGPRs(const FeatureBitset &Features) {
+  return 256;
+}
+
+unsigned getAddressableNumVGPRs(const FeatureBitset &Features) {
+  return getTotalNumVGPRs(Features);
 }
 
+unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
+  assert(WavesPerEU != 0);
+  // At or above the maximum wave count no minimum is imposed.
+  if (WavesPerEU >= getMaxWavesPerEU(Features))
+    return 0;
+  unsigned MinNumVGPRs =
+      alignDown(getTotalNumVGPRs(Features) / (WavesPerEU + 1),
+                getVGPRAllocGranule(Features)) + 1;
+  return std::min(MinNumVGPRs, getAddressableNumVGPRs(Features));
+}
+
+unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU) {
+  assert(WavesPerEU != 0);
+  // Per-wave share of all VGPRs, rounded down to the allocation granule.
+  unsigned MaxNumVGPRs = alignDown(getTotalNumVGPRs(Features) / WavesPerEU,
+                                   getVGPRAllocGranule(Features));
+  unsigned AddressableNumVGPRs = getAddressableNumVGPRs(Features);
+  return std::min(MaxNumVGPRs, AddressableNumVGPRs);
+}
+
+} // end namespace IsaInfo
+
 void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
                                const FeatureBitset &Features) {
-
-  IsaVersion ISA = getIsaVersion(Features);
+  IsaInfo::IsaVersion ISA = IsaInfo::getIsaVersion(Features);
 
   memset(&Header, 0, sizeof(Header));
 
   Header.amd_kernel_code_version_major = 1;
-  Header.amd_kernel_code_version_minor = 0;
+  Header.amd_kernel_code_version_minor = 1;
   Header.amd_machine_kind = 1; // AMD_MACHINE_KIND_AMDGPU
   Header.amd_machine_version_major = ISA.Major;
   Header.amd_machine_version_minor = ISA.Minor;
@@ -168,16 +348,16 @@ MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx) {
                            ELF::SHF_AMDGPU_HSA_AGENT);
 }
 
-bool isGroupSegment(const GlobalValue *GV) {
-  return GV->getType()->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS;
+bool isGroupSegment(const GlobalValue *GV, AMDGPUAS AS) {
+  return GV->getType()->getAddressSpace() == AS.LOCAL_ADDRESS;
 }
 
-bool isGlobalSegment(const GlobalValue *GV) {
-  return GV->getType()->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
+bool isGlobalSegment(const GlobalValue *GV, AMDGPUAS AS) {
+  return GV->getType()->getAddressSpace() == AS.GLOBAL_ADDRESS;
 }
 
-bool isReadOnlySegment(const GlobalValue *GV) {
-  return GV->getType()->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS;
+bool isReadOnlySegment(const GlobalValue *GV, AMDGPUAS AS) {
+  return GV->getType()->getAddressSpace() == AS.CONSTANT_ADDRESS;
 }
 
 bool shouldEmitConstantsToTextSection(const Triple &TT) {
@@ -215,7 +395,7 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F,
     return Default;
   }
   if (Strs.second.trim().getAsInteger(0, Ints.second)) {
-    if (!OnlyFirstRequired || Strs.second.trim().size()) {
+    if (!OnlyFirstRequired || !Strs.second.trim().empty()) {
       Ctx.emitError("can't parse second integer attribute " + Name);
       return Default;
     }
@@ -224,57 +404,84 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F,
   return Ints;
 }
 
-unsigned getWaitcntBitMask(IsaVersion Version) {
-  unsigned Vmcnt = getBitMask(getVmcntBitShift(), getVmcntBitWidth());
-  unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth());
-  unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth());
-  return Vmcnt | Expcnt | Lgkmcnt;
-}
+unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version) {
+  unsigned VmcntLo = (1 << getVmcntBitWidthLo()) - 1;
+  if (Version.Major < 9)
+    return VmcntLo;
 
-unsigned getVmcntBitMask(IsaVersion Version) {
-  return (1 << getVmcntBitWidth()) - 1;
+  unsigned VmcntHi = ((1 << getVmcntBitWidthHi()) - 1) << getVmcntBitWidthLo();
+  return VmcntLo | VmcntHi;
 }
 
-unsigned getExpcntBitMask(IsaVersion Version) {
+unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version) {
   return (1 << getExpcntBitWidth()) - 1;
 }
 
-unsigned getLgkmcntBitMask(IsaVersion Version) {
+unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version) {
   return (1 << getLgkmcntBitWidth()) - 1;
 }
 
-unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt) {
-  return unpackBits(Waitcnt, getVmcntBitShift(), getVmcntBitWidth());
+unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version) {
+  unsigned VmcntLo = getBitMask(getVmcntBitShiftLo(), getVmcntBitWidthLo());
+  unsigned Expcnt = getBitMask(getExpcntBitShift(), getExpcntBitWidth());
+  unsigned Lgkmcnt = getBitMask(getLgkmcntBitShift(), getLgkmcntBitWidth());
+  unsigned Waitcnt = VmcntLo | Expcnt | Lgkmcnt;
+  if (Version.Major < 9)
+    return Waitcnt;
+  // From ISA major version 9 on, the high vmcnt field is part of the mask.
+  unsigned VmcntHi = getBitMask(getVmcntBitShiftHi(), getVmcntBitWidthHi());
+  return Waitcnt | VmcntHi;
+}
+
+unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
+  unsigned VmcntLo =
+      unpackBits(Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo());
+  if (Version.Major < 9)
+    return VmcntLo;
+  // Reassemble the value with the high field placed above the low bits.
+  unsigned VmcntHi =
+      unpackBits(Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi());
+  VmcntHi <<= getVmcntBitWidthLo();
+  return VmcntLo | VmcntHi;
 }
 
-unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt) {
+unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
   return unpackBits(Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
 }
 
-unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt) {
+unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt) {
   return unpackBits(Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
 }
 
-void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt,
+void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
                    unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt) {
   Vmcnt = decodeVmcnt(Version, Waitcnt);
   Expcnt = decodeExpcnt(Version, Waitcnt);
   Lgkmcnt = decodeLgkmcnt(Version, Waitcnt);
 }
 
-unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt) {
-  return packBits(Vmcnt, Waitcnt, getVmcntBitShift(), getVmcntBitWidth());
+unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+                     unsigned Vmcnt) {
+  Waitcnt =
+      packBits(Vmcnt, Waitcnt, getVmcntBitShiftLo(), getVmcntBitWidthLo());
+  if (Version.Major < 9)
+    return Waitcnt;
+  // The bits of Vmcnt above the low field go into the separate high field.
+  Vmcnt >>= getVmcntBitWidthLo();
+  return packBits(Vmcnt, Waitcnt, getVmcntBitShiftHi(), getVmcntBitWidthHi());
 }
 
-unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt) {
+unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+                      unsigned Expcnt) {
   return packBits(Expcnt, Waitcnt, getExpcntBitShift(), getExpcntBitWidth());
 }
 
-unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt) {
+unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+                       unsigned Lgkmcnt) {
   return packBits(Lgkmcnt, Waitcnt, getLgkmcntBitShift(), getLgkmcntBitWidth());
 }
 
-unsigned encodeWaitcnt(IsaVersion Version,
+unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version,
                        unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt) {
   unsigned Waitcnt = getWaitcntBitMask(Version);
   Waitcnt = encodeVmcnt(Version, Waitcnt, Vmcnt);
@@ -303,6 +510,10 @@ bool isCompute(CallingConv::ID cc) {
   return !isShader(cc) || cc == CallingConv::AMDGPU_CS;
 }
 
+bool isEntryFunctionCC(CallingConv::ID CC) {
+  return true; // The result does not currently depend on \p CC.
+}
+
 bool isSI(const MCSubtargetInfo &STI) {
   return STI.getFeatureBits()[AMDGPU::FeatureSouthernIslands];
 }
@@ -334,15 +545,34 @@ unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI) {
   return Reg;
 }
 
+/// Maps the subtarget-specific FLAT_SCR registers (the _ci and _vi variants
+/// of FLAT_SCR, FLAT_SCR_LO and FLAT_SCR_HI) to their common counterparts.
+/// \returns \p Reg unchanged for any other register.
+unsigned mc2PseudoReg(unsigned Reg) {
+  switch (Reg) {
+  case AMDGPU::FLAT_SCR_ci:
+  case AMDGPU::FLAT_SCR_vi:
+    return AMDGPU::FLAT_SCR;
+  case AMDGPU::FLAT_SCR_LO_ci:
+  case AMDGPU::FLAT_SCR_LO_vi:
+    return AMDGPU::FLAT_SCR_LO;
+  case AMDGPU::FLAT_SCR_HI_ci:
+  case AMDGPU::FLAT_SCR_HI_vi:
+    return AMDGPU::FLAT_SCR_HI;
+  default:
+    return Reg;
+  }
+}
+
 bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo) {
-  assert(OpNo <= Desc.NumOperands);
+  assert(OpNo < Desc.NumOperands);
   unsigned OpType = Desc.OpInfo[OpNo].OperandType;
   return OpType >= AMDGPU::OPERAND_SRC_FIRST &&
          OpType <= AMDGPU::OPERAND_SRC_LAST;
 }
 
 bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
-  assert(OpNo <= Desc.NumOperands);
+  assert(OpNo < Desc.NumOperands);
   unsigned OpType = Desc.OpInfo[OpNo].OperandType;
   switch (OpType) {
   case AMDGPU::OPERAND_REG_IMM_FP32:
@@ -351,6 +581,7 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
   case AMDGPU::OPERAND_REG_INLINE_C_FP32:
   case AMDGPU::OPERAND_REG_INLINE_C_FP64:
   case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
     return true;
   default:
     return false;
@@ -358,7 +589,7 @@ bool isSISrcFPOperand(const MCInstrDesc &Desc, unsigned OpNo) {
 }
 
 bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
-  assert(OpNo <= Desc.NumOperands);
+  assert(OpNo < Desc.NumOperands);
   unsigned OpType = Desc.OpInfo[OpNo].OperandType;
   return OpType >= AMDGPU::OPERAND_REG_INLINE_C_FIRST &&
          OpType <= AMDGPU::OPERAND_REG_INLINE_C_LAST;
@@ -402,7 +633,7 @@ unsigned getRegBitWidth(const MCRegisterClass &RC) {
 
 unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
                            unsigned OpNo) {
-  assert(OpNo <= Desc.NumOperands);
+  assert(OpNo < Desc.NumOperands);
   unsigned RCID = Desc.OpInfo[OpNo].RegClass;
   return getRegBitWidth(MRI->getRegClass(RCID)) / 8;
 }
@@ -469,6 +700,14 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
          Val == 0x3118;   // 1/2pi
 }
 
+bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) {
+  assert(HasInv2Pi);
+  // Inlinable only when both halves are the same inlinable 16-bit value.
+  int16_t Lo16 = static_cast<int16_t>(Literal);
+  int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
+  return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi);
+}
+
 bool isUniformMMO(const MachineMemOperand *MMO) {
   const Value *Ptr = MMO->getValue();
   // UndefValue means this is a load of a kernel input.  These are uniform.
@@ -495,6 +734,58 @@ bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
   return isSI(ST) || isCI(ST) ? isUInt<8>(EncodedOffset) :
                                 isUInt<20>(EncodedOffset);
 }
+} // end namespace AMDGPU
+
+} // end namespace llvm
+
+const unsigned AMDGPUAS::MAX_COMMON_ADDRESS;
+const unsigned AMDGPUAS::GLOBAL_ADDRESS;
+const unsigned AMDGPUAS::LOCAL_ADDRESS;
+const unsigned AMDGPUAS::PARAM_D_ADDRESS;
+const unsigned AMDGPUAS::PARAM_I_ADDRESS;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_0;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_1;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_2;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_3;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_4;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_5;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_6;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_7;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_8;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_9;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_10;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_11;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_12;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_13;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_14;
+const unsigned AMDGPUAS::CONSTANT_BUFFER_15;
+const unsigned AMDGPUAS::UNKNOWN_ADDRESS_SPACE;
+
+namespace llvm {
+namespace AMDGPU {
 
-} // End namespace AMDGPU
-} // End namespace llvm
+/// \returns Address space numbering chosen by the environment name of \p T.
+AMDGPUAS getAMDGPUAS(Triple T) {
+  auto Env = T.getEnvironmentName();
+  AMDGPUAS AS;
+  if (Env == "amdgiz" || Env == "amdgizcl") {
+    AS.FLAT_ADDRESS     = 0;
+    AS.PRIVATE_ADDRESS  = 5;
+    AS.REGION_ADDRESS   = 4;
+  } else {
+    AS.FLAT_ADDRESS     = 4;
+    AS.PRIVATE_ADDRESS  = 0;
+    AS.REGION_ADDRESS   = 5;
+  }
+  return AS;
+}
+
+AMDGPUAS getAMDGPUAS(const TargetMachine &M) {
+  return getAMDGPUAS(M.getTargetTriple());
+}
+
+AMDGPUAS getAMDGPUAS(const Module &M) {
+  return getAMDGPUAS(Triple(M.getTargetTriple()));
+}
+} // end namespace AMDGPU
+} // end namespace llvm
diff --git a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index 10944d2fee618093202057b97ce1da9029a99a29..d6c836eb748b1521c9edfd24031d76b1c74111e9 100644
--- a/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -1,4 +1,4 @@
-//===-- AMDGPUBaseInfo.h - Top level definitions for AMDGPU -----*- C++ -*-===//
+//===- AMDGPUBaseInfo.h - Top level definitions for AMDGPU ------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,14 +10,16 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
 #define LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
 
+#include "AMDGPU.h"
 #include "AMDKernelCodeT.h"
-#include "llvm/IR/CallingConv.h"
-
 #include "SIDefines.h"
-
-#define GET_INSTRINFO_OPERAND_ENUM
-#include "AMDGPUGenInstrInfo.inc"
-#undef GET_INSTRINFO_OPERAND_ENUM
+#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cstdint>
+#include <utility>
 
 namespace llvm {
 
@@ -26,24 +28,125 @@ class Function;
 class GlobalValue;
 class MachineMemOperand;
 class MCContext;
-class MCInstrDesc;
 class MCRegisterClass;
 class MCRegisterInfo;
 class MCSection;
 class MCSubtargetInfo;
+class Triple;
 
 namespace AMDGPU {
+namespace IsaInfo {
 
-LLVM_READONLY
-int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);
+enum {
+  // The closed Vulkan driver sets 96, which limits the wave count to 8 but
+  // doesn't spill SGPRs as much as when 80 is set.
+  FIXED_NUM_SGPRS_FOR_INIT_BUG = 96
+};
 
+/// \brief Instruction set architecture version.
 struct IsaVersion {
   unsigned Major;
   unsigned Minor;
   unsigned Stepping;
 };
 
+/// \returns Isa version for given subtarget \p Features.
 IsaVersion getIsaVersion(const FeatureBitset &Features);
+
+/// \returns Wavefront size for given subtarget \p Features.
+unsigned getWavefrontSize(const FeatureBitset &Features);
+
+/// \returns Local memory size in bytes for given subtarget \p Features.
+unsigned getLocalMemorySize(const FeatureBitset &Features);
+
+/// \returns Number of execution units per compute unit for given subtarget \p
+/// Features.
+unsigned getEUsPerCU(const FeatureBitset &Features);
+
+/// \returns Maximum number of work groups per compute unit for given subtarget
+/// \p Features and limited by given \p FlatWorkGroupSize.
+unsigned getMaxWorkGroupsPerCU(const FeatureBitset &Features,
+                               unsigned FlatWorkGroupSize);
+
+/// \returns Maximum number of waves per compute unit for given subtarget \p
+/// Features without any kind of limitation.
+unsigned getMaxWavesPerCU(const FeatureBitset &Features);
+
+/// \returns Maximum number of waves per compute unit for given subtarget \p
+/// Features and limited by given \p FlatWorkGroupSize.
+unsigned getMaxWavesPerCU(const FeatureBitset &Features,
+                          unsigned FlatWorkGroupSize);
+
+/// \returns Minimum number of waves per execution unit for given subtarget \p
+/// Features.
+unsigned getMinWavesPerEU(const FeatureBitset &Features);
+
+/// \returns Maximum number of waves per execution unit for given subtarget \p
+/// Features without any kind of limitation.
+unsigned getMaxWavesPerEU(const FeatureBitset &Features);
+
+/// \returns Maximum number of waves per execution unit for given subtarget \p
+/// Features and limited by given \p FlatWorkGroupSize.
+unsigned getMaxWavesPerEU(const FeatureBitset &Features,
+                          unsigned FlatWorkGroupSize);
+
+/// \returns Minimum flat work group size for given subtarget \p Features.
+unsigned getMinFlatWorkGroupSize(const FeatureBitset &Features);
+
+/// \returns Maximum flat work group size for given subtarget \p Features.
+unsigned getMaxFlatWorkGroupSize(const FeatureBitset &Features);
+
+/// \returns Number of waves per work group for given subtarget \p Features and
+/// limited by given \p FlatWorkGroupSize.
+unsigned getWavesPerWorkGroup(const FeatureBitset &Features,
+                              unsigned FlatWorkGroupSize);
+
+/// \returns SGPR allocation granularity for given subtarget \p Features.
+unsigned getSGPRAllocGranule(const FeatureBitset &Features);
+
+/// \returns SGPR encoding granularity for given subtarget \p Features.
+unsigned getSGPREncodingGranule(const FeatureBitset &Features);
+
+/// \returns Total number of SGPRs for given subtarget \p Features.
+unsigned getTotalNumSGPRs(const FeatureBitset &Features);
+
+/// \returns Addressable number of SGPRs for given subtarget \p Features.
+unsigned getAddressableNumSGPRs(const FeatureBitset &Features);
+
+/// \returns Minimum number of SGPRs that meets the given number of waves per
+/// execution unit requirement for given subtarget \p Features.
+unsigned getMinNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
+
+/// \returns Maximum number of SGPRs that meets the given number of waves per
+/// execution unit requirement for given subtarget \p Features.
+unsigned getMaxNumSGPRs(const FeatureBitset &Features, unsigned WavesPerEU,
+                        bool Addressable);
+
+/// \returns VGPR allocation granularity for given subtarget \p Features.
+unsigned getVGPRAllocGranule(const FeatureBitset &Features);
+
+/// \returns VGPR encoding granularity for given subtarget \p Features.
+unsigned getVGPREncodingGranule(const FeatureBitset &Features);
+
+/// \returns Total number of VGPRs for given subtarget \p Features.
+unsigned getTotalNumVGPRs(const FeatureBitset &Features);
+
+/// \returns Addressable number of VGPRs for given subtarget \p Features.
+unsigned getAddressableNumVGPRs(const FeatureBitset &Features);
+
+/// \returns Minimum number of VGPRs that meets the given number of waves per
+/// execution unit requirement for given subtarget \p Features.
+unsigned getMinNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
+
+/// \returns Maximum number of VGPRs that meets the given number of waves per
+/// execution unit requirement for given subtarget \p Features.
+unsigned getMaxNumVGPRs(const FeatureBitset &Features, unsigned WavesPerEU);
+
+} // end namespace IsaInfo
+
+LLVM_READONLY
+int16_t getNamedOperandIdx(uint16_t Opcode, uint16_t NamedIdx);
+
 void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
                                const FeatureBitset &Features);
 MCSection *getHSATextSection(MCContext &Ctx);
@@ -54,9 +157,9 @@ MCSection *getHSADataGlobalProgramSection(MCContext &Ctx);
 
 MCSection *getHSARodataReadonlyAgentSection(MCContext &Ctx);
 
-bool isGroupSegment(const GlobalValue *GV);
-bool isGlobalSegment(const GlobalValue *GV);
-bool isReadOnlySegment(const GlobalValue *GV);
+bool isGroupSegment(const GlobalValue *GV, AMDGPUAS AS);
+bool isGlobalSegment(const GlobalValue *GV, AMDGPUAS AS);
+bool isReadOnlySegment(const GlobalValue *GV, AMDGPUAS AS);
 
 /// \returns True if constants should be emitted to .text section for given
 /// target triple \p TT, false otherwise.
@@ -84,64 +187,89 @@ std::pair<int, int> getIntegerPairAttribute(const Function &F,
                                             std::pair<int, int> Default,
                                             bool OnlyFirstRequired = false);
 
-/// \returns Waitcnt bit mask for given isa \p Version.
-unsigned getWaitcntBitMask(IsaVersion Version);
-
 /// \returns Vmcnt bit mask for given isa \p Version.
-unsigned getVmcntBitMask(IsaVersion Version);
+unsigned getVmcntBitMask(const IsaInfo::IsaVersion &Version);
 
 /// \returns Expcnt bit mask for given isa \p Version.
-unsigned getExpcntBitMask(IsaVersion Version);
+unsigned getExpcntBitMask(const IsaInfo::IsaVersion &Version);
 
 /// \returns Lgkmcnt bit mask for given isa \p Version.
-unsigned getLgkmcntBitMask(IsaVersion Version);
+unsigned getLgkmcntBitMask(const IsaInfo::IsaVersion &Version);
+
+/// \returns Waitcnt bit mask for given isa \p Version.
+unsigned getWaitcntBitMask(const IsaInfo::IsaVersion &Version);
 
 /// \returns Decoded Vmcnt from given \p Waitcnt for given isa \p Version.
-unsigned decodeVmcnt(IsaVersion Version, unsigned Waitcnt);
+unsigned decodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
 
 /// \returns Decoded Expcnt from given \p Waitcnt for given isa \p Version.
-unsigned decodeExpcnt(IsaVersion Version, unsigned Waitcnt);
+unsigned decodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
 
 /// \returns Decoded Lgkmcnt from given \p Waitcnt for given isa \p Version.
-unsigned decodeLgkmcnt(IsaVersion Version, unsigned Waitcnt);
+unsigned decodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt);
 
 /// \brief Decodes Vmcnt, Expcnt and Lgkmcnt from given \p Waitcnt for given isa
 /// \p Version, and writes decoded values into \p Vmcnt, \p Expcnt and
 /// \p Lgkmcnt respectively.
 ///
 /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are decoded as follows:
-///     \p Vmcnt = \p Waitcnt[3:0]
+///     \p Vmcnt = \p Waitcnt[3:0]                      (pre-gfx9 only)
+///     \p Vmcnt = \p Waitcnt[3:0] | \p Waitcnt[15:14]  (gfx9+ only)
 ///     \p Expcnt = \p Waitcnt[6:4]
 ///     \p Lgkmcnt = \p Waitcnt[11:8]
-void decodeWaitcnt(IsaVersion Version, unsigned Waitcnt,
+void decodeWaitcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
                    unsigned &Vmcnt, unsigned &Expcnt, unsigned &Lgkmcnt);
 
 /// \returns \p Waitcnt with encoded \p Vmcnt for given isa \p Version.
-unsigned encodeVmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Vmcnt);
+unsigned encodeVmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+                     unsigned Vmcnt);
 
 /// \returns \p Waitcnt with encoded \p Expcnt for given isa \p Version.
-unsigned encodeExpcnt(IsaVersion Version, unsigned Waitcnt, unsigned Expcnt);
+unsigned encodeExpcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+                      unsigned Expcnt);
 
 /// \returns \p Waitcnt with encoded \p Lgkmcnt for given isa \p Version.
-unsigned encodeLgkmcnt(IsaVersion Version, unsigned Waitcnt, unsigned Lgkmcnt);
+unsigned encodeLgkmcnt(const IsaInfo::IsaVersion &Version, unsigned Waitcnt,
+                       unsigned Lgkmcnt);
 
 /// \brief Encodes \p Vmcnt, \p Expcnt and \p Lgkmcnt into Waitcnt for given isa
 /// \p Version.
 ///
 /// \details \p Vmcnt, \p Expcnt and \p Lgkmcnt are encoded as follows:
-///     Waitcnt[3:0]  = \p Vmcnt
-///     Waitcnt[6:4]  = \p Expcnt
-///     Waitcnt[11:8] = \p Lgkmcnt
+///     Waitcnt[3:0]   = \p Vmcnt       (pre-gfx9 only)
+///     Waitcnt[3:0]   = \p Vmcnt[3:0]  (gfx9+ only)
+///     Waitcnt[6:4]   = \p Expcnt
+///     Waitcnt[11:8]  = \p Lgkmcnt
+///     Waitcnt[15:14] = \p Vmcnt[5:4]  (gfx9+ only)
 ///
 /// \returns Waitcnt with encoded \p Vmcnt, \p Expcnt and \p Lgkmcnt for given
 /// isa \p Version.
-unsigned encodeWaitcnt(IsaVersion Version,
+unsigned encodeWaitcnt(const IsaInfo::IsaVersion &Version,
                        unsigned Vmcnt, unsigned Expcnt, unsigned Lgkmcnt);
 
 unsigned getInitialPSInputAddr(const Function &F);
 
-bool isShader(CallingConv::ID cc);
-bool isCompute(CallingConv::ID cc);
+LLVM_READNONE
+bool isShader(CallingConv::ID CC);
+
+LLVM_READNONE
+bool isCompute(CallingConv::ID CC);
+
+LLVM_READNONE
+bool isEntryFunctionCC(CallingConv::ID CC);
+
+// FIXME: Remove this when calling conventions are cleaned up.
+LLVM_READNONE
+inline bool isKernel(CallingConv::ID CC) {
+  switch (CC) {
+  case CallingConv::C:
+  case CallingConv::AMDGPU_KERNEL:
+  case CallingConv::SPIR_KERNEL:
+    return true;
+  default:
+    return false;
+  }
+}
 
 bool isSI(const MCSubtargetInfo &STI);
 bool isCI(const MCSubtargetInfo &STI);
@@ -151,6 +279,10 @@ bool isVI(const MCSubtargetInfo &STI);
 /// \p STI otherwise return \p Reg.
 unsigned getMCReg(unsigned Reg, const MCSubtargetInfo &STI);
 
+/// \brief Convert hardware register \p Reg to a pseudo register
+LLVM_READNONE
+unsigned mc2PseudoReg(unsigned Reg);
+
 /// \brief Can this operand also contain immediate values?
 bool isSISrcOperand(const MCInstrDesc &Desc, unsigned OpNo);
 
@@ -189,6 +321,8 @@ inline unsigned getOperandSize(const MCOperandInfo &OpInfo) {
   case AMDGPU::OPERAND_REG_IMM_FP16:
   case AMDGPU::OPERAND_REG_INLINE_C_INT16:
   case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+  case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
     return 2;
 
   default:
@@ -211,6 +345,9 @@ bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi);
 LLVM_READNONE
 bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);
 
+LLVM_READNONE
+bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi);
+
 bool isUniformMMO(const MachineMemOperand *MMO);
 
 /// \returns The encoding that will be used for \p ByteOffset in the SMRD
@@ -225,4 +362,4 @@ bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
 } // end namespace AMDGPU
 } // end namespace llvm
 
-#endif
+#endif // LLVM_LIB_TARGET_AMDGPU_UTILS_AMDGPUBASEINFO_H
diff --git a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
index c55eaab077d1f7af8630b2d6da19526a3bd8e29e..991408c81c922566be519fd80d1830b0441b7528 100644
--- a/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
+++ b/lib/Target/AMDGPU/Utils/AMDKernelCodeTInfo.h
@@ -87,7 +87,7 @@ COMPPGM1(enable_ieee_mode,                compute_pgm_rsrc1_ieee_mode,      IEEE
 // TODO: cdbg_user
 COMPPGM2(enable_sgpr_private_segment_wave_byte_offset, compute_pgm_rsrc2_scratch_en, SCRATCH_EN),
 COMPPGM2(user_sgpr_count,                 compute_pgm_rsrc2_user_sgpr,      USER_SGPR),
-// TODO: enable_trap_handler
+COMPPGM2(enable_trap_handler,             compute_pgm_rsrc2_trap_handler,   TRAP_HANDLER),
 COMPPGM2(enable_sgpr_workgroup_id_x,      compute_pgm_rsrc2_tgid_x_en,      TGID_X_EN),
 COMPPGM2(enable_sgpr_workgroup_id_y,      compute_pgm_rsrc2_tgid_y_en,      TGID_Y_EN),
 COMPPGM2(enable_sgpr_workgroup_id_z,      compute_pgm_rsrc2_tgid_z_en,      TGID_Z_EN),
diff --git a/lib/Target/AMDGPU/VOP1Instructions.td b/lib/Target/AMDGPU/VOP1Instructions.td
index a15b9ceff2f43a110cb7289265027a75783d27d5..1febc6bf8ec2081bab1c495c2765ebb9138ea52f 100644
--- a/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/lib/Target/AMDGPU/VOP1Instructions.td
@@ -23,18 +23,18 @@ class VOP1e <bits<8> op, VOPProfile P> : Enc32 {
 
 class VOP1_SDWAe <bits<8> op, VOPProfile P> : VOP_SDWAe <P> {
   bits<8> vdst;
-  
+
   let Inst{8-0}   = 0xf9; // sdwa
   let Inst{16-9}  = op;
   let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
   let Inst{31-25} = 0x3f; // encoding
 }
 
-class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
+class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1Only = 0> :
   InstSI <P.Outs32, P.Ins32, "", pattern>,
   VOP <opName>,
-  SIMCInstr <opName#"_e32", SIEncodingFamily.NONE>,
-  MnemonicAlias<opName#"_e32", opName> {
+  SIMCInstr <!if(VOP1Only, opName, opName#"_e32"), SIEncodingFamily.NONE>,
+  MnemonicAlias<!if(VOP1Only, opName, opName#"_e32"), opName> {
 
   let isPseudo = 1;
   let isCodeGenOnly = 1;
@@ -75,6 +75,8 @@ class VOP1_Real <VOP1_Pseudo ps, int EncodingFamily> :
   let Constraints        = ps.Constraints;
   let DisableEncoding    = ps.DisableEncoding;
   let TSFlags            = ps.TSFlags;
+  let UseNamedOperandTable = ps.UseNamedOperandTable;
+  let Uses                 = ps.Uses;
 }
 
 class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -83,10 +85,17 @@ class VOP1_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
 }
 
 class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
-  list<dag> ret = !if(P.HasModifiers,
-    [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
-                                i32:$src0_modifiers, i1:$clamp, i32:$omod))))],
-    [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]);
+  list<dag> ret =
+    !if(P.HasModifiers,
+        [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
+                                              i32:$src0_modifiers,
+                                              i1:$clamp, i32:$omod))))],
+        !if(P.HasOMod,
+            [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0,
+                                                  i1:$clamp, i32:$omod))))],
+            [(set P.DstVT:$vdst, (node P.Src0VT:$src0))]
+        )
+    );
 }
 
 multiclass VOP1Inst <string opName, VOPProfile P,
@@ -96,6 +105,23 @@ multiclass VOP1Inst <string opName, VOPProfile P,
   def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
 }
 
+// Special profile for instructions which have clamp
+// and output modifiers (but have no input modifiers)
+class VOPProfileI2F<ValueType dstVt, ValueType srcVt> :
+  VOPProfile<[dstVt, srcVt, untyped, untyped]> {
+
+  let Ins64 = (ins Src0RC64:$src0, clampmod:$clamp, omod:$omod);
+  let Asm64 = "$vdst, $src0$clamp$omod";
+
+  let HasModifiers = 0;
+  let HasClamp = 1;
+  let HasOMod = 1;
+}
+
+def VOP1_F64_I32 : VOPProfileI2F <f64, i32>;
+def VOP1_F32_I32 : VOPProfileI2F <f32, i32>;
+def VOP1_F16_I16 : VOPProfileI2F <f16, i16>;
+
 //===----------------------------------------------------------------------===//
 // VOP1 Instructions
 //===----------------------------------------------------------------------===//
@@ -142,24 +168,24 @@ def V_READFIRSTLANE_B32 :
 
 let SchedRW = [WriteQuarterRate32] in {
 defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>;
-defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP_F64_I32, sint_to_fp>;
-defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP_F32_I32, sint_to_fp>;
-defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP_F32_I32, uint_to_fp>;
+defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>;
+defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>;
+defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>;
 defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
 defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
-defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_I32_F32, fp_to_f16>;
-defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_I32, f16_to_fp>;
+defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>;
+defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>;
 defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
 defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
-defm V_CVT_OFF_F32_I4 : VOP1Inst  <"v_cvt_off_f32_i4", VOP_F32_I32>;
+defm V_CVT_OFF_F32_I4 : VOP1Inst  <"v_cvt_off_f32_i4", VOP1_F32_I32>;
 defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
 defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>;
-defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP_F32_I32, AMDGPUcvt_f32_ubyte0>;
-defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP_F32_I32, AMDGPUcvt_f32_ubyte1>;
-defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP_F32_I32, AMDGPUcvt_f32_ubyte2>;
-defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP_F32_I32, AMDGPUcvt_f32_ubyte3>;
+defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP1_F32_I32, AMDGPUcvt_f32_ubyte0>;
+defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP1_F32_I32, AMDGPUcvt_f32_ubyte1>;
+defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP1_F32_I32, AMDGPUcvt_f32_ubyte2>;
+defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP1_F32_I32, AMDGPUcvt_f32_ubyte3>;
 defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>;
-defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP_F64_I32, uint_to_fp>;
+defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>;
 } // End SchedRW = [WriteQuarterRate32]
 
 defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>;
@@ -237,7 +263,7 @@ def VOP_MOVRELD : VOPProfile<[untyped, i32, untyped, untyped]> {
                      src0_sel:$src0_sel);
 
   let Asm32 = getAsm32<1, 1>.ret;
-  let Asm64 = getAsm64<1, 1, 0>.ret;
+  let Asm64 = getAsm64<1, 1, 0, 1>.ret;
   let AsmDPP = getAsmDPP<1, 1, 0>.ret;
   let AsmSDWA = getAsmSDWA<1, 1, 0>.ret;
 
@@ -258,11 +284,14 @@ defm V_MOVRELS_B32 : VOP1Inst <"v_movrels_b32", VOP_I32_VI32_NO_EXT>;
 defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_NO_EXT<VOP_I32_I32>>;
 } // End Uses = [M0, EXEC]
 
+let SchedRW = [WriteQuarterRate32] in {
+defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
+} // End SchedRW = [WriteQuarterRate32]
+
 // These instruction only exist on SI and CI
 let SubtargetPredicate = isSICI in {
 
 let SchedRW = [WriteQuarterRate32] in {
-defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
 defm V_LOG_CLAMP_F32 : VOP1Inst <"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>;
 defm V_RCP_CLAMP_F32 : VOP1Inst <"v_rcp_clamp_f32", VOP_F32_F32>;
 defm V_RCP_LEGACY_F32 : VOP1Inst <"v_rcp_legacy_f32", VOP_F32_F32, AMDGPUrcp_legacy>;
@@ -297,8 +326,8 @@ defm V_EXP_LEGACY_F32 : VOP1Inst <"v_exp_legacy_f32", VOP_F32_F32>;
 
 let SubtargetPredicate = isVI in {
 
-defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP_F16_I16, uint_to_fp>;
-defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP_F16_I16, sint_to_fp>;
+defm V_CVT_F16_U16 : VOP1Inst <"v_cvt_f16_u16", VOP1_F16_I16, uint_to_fp>;
+defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
 defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
 defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
 defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
@@ -326,12 +355,31 @@ def : Pat<
 >;
 
 def : Pat<
-    (i16 (fp_to_f16 f32:$src)),
+    (i16 (AMDGPUfp_to_f16 f32:$src)),
     (V_CVT_F16_F32_e32 $src)
 >;
 
 }
 
+def VOP_SWAP_I32 : VOPProfile<[i32, i32, i32, untyped]> {
+  let Outs32 = (outs VGPR_32:$vdst, VGPR_32:$vdst1);
+  let Ins32 = (ins VGPR_32:$src0, VGPR_32:$src1);
+  let Outs64 = Outs32;
+  let Asm32 = " $vdst, $src0";
+  let Asm64 = "";
+  let Ins64 = (ins);
+}
+
+let SubtargetPredicate = isGFX9 in {
+  let Constraints = "$vdst = $src1, $vdst1 = $src0",
+      DisableEncoding="$vdst1,$src1",
+      SchedRW = [Write64Bit, Write64Bit] in {
+// Never VOP3, so VOP1Only = 1. Takes as long as 2 v_mov_b32s
+def V_SWAP_B32 : VOP1_Pseudo <"v_swap_b32", VOP_SWAP_I32, [], 1>;
+} // End Constraints, DisableEncoding, SchedRW
+
+} // End SubtargetPredicate = isGFX9
+
 //===----------------------------------------------------------------------===//
 // Target
 //===----------------------------------------------------------------------===//
@@ -453,6 +501,14 @@ class VOP1_DPP <bits<8> op, VOP1_Pseudo ps, VOPProfile P = ps.Pfl> :
   let Inst{31-25} = 0x3f; //encoding
 }
 
+multiclass VOP1Only_Real_vi <bits<10> op> {
+  let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
+    def _vi :
+      VOP1_Real<!cast<VOP1_Pseudo>(NAME), SIEncodingFamily.VI>,
+      VOP1e<op{7-0}, !cast<VOP1_Pseudo>(NAME).Pfl>;
+  }
+}
+
 multiclass VOP1_Real_vi <bits<10> op> {
   let AssemblerPredicates = [isVI], DecoderNamespace = "VI" in {
     def _e32_vi :
@@ -480,6 +536,7 @@ defm V_CVT_F32_I32       : VOP1_Real_vi <0x5>;
 defm V_CVT_F32_U32       : VOP1_Real_vi <0x6>;
 defm V_CVT_U32_F32       : VOP1_Real_vi <0x7>;
 defm V_CVT_I32_F32       : VOP1_Real_vi <0x8>;
+defm V_MOV_FED_B32       : VOP1_Real_vi <0x9>;
 defm V_CVT_F16_F32       : VOP1_Real_vi <0xa>;
 defm V_CVT_F32_F16       : VOP1_Real_vi <0xb>;
 defm V_CVT_RPI_I32_F32   : VOP1_Real_vi <0xc>;
@@ -547,7 +604,7 @@ defm V_RNDNE_F16         : VOP1_Real_vi <0x47>;
 defm V_FRACT_F16         : VOP1_Real_vi <0x48>;
 defm V_SIN_F16           : VOP1_Real_vi <0x49>;
 defm V_COS_F16           : VOP1_Real_vi <0x4a>;
-
+defm V_SWAP_B32          : VOP1Only_Real_vi <0x51>;
 
 // Copy of v_mov_b32 with $vdst as a use operand for use with VGPR
 // indexing mode. vdst can't be treated as a def for codegen purposes,
@@ -607,12 +664,6 @@ def : Pat<
   (COPY $src)
 >;
 
-def : Pat<
-  (i1 (trunc i16:$src)),
-  (COPY $src)
->;
-
-
 def : Pat <
   (i16 (trunc i64:$src)),
   (EXTRACT_SUBREG $src, sub0)
diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td
index cd5f044ecd5388f74f0dc7eb38d3e64206cb3980..2281f338ab45ea571be028682ab0da191acf4be5 100644
--- a/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/lib/Target/AMDGPU/VOP2Instructions.td
@@ -40,7 +40,7 @@ class VOP2_MADKe <bits<6> op, VOPProfile P> : Enc64 {
 class VOP2_SDWAe <bits<6> op, VOPProfile P> : VOP_SDWAe <P> {
   bits<8> vdst;
   bits<8> src1;
-  
+
   let Inst{8-0}   = 0xf9; // sdwa
   let Inst{16-9}  = !if(P.HasSrc1, src1{7-0}, 0);
   let Inst{24-17} = !if(P.EmitDst, vdst{7-0}, 0);
@@ -93,6 +93,8 @@ class VOP2_Real <VOP2_Pseudo ps, int EncodingFamily> :
   let Constraints        = ps.Constraints;
   let DisableEncoding    = ps.DisableEncoding;
   let TSFlags            = ps.TSFlags;
+  let UseNamedOperandTable = ps.UseNamedOperandTable;
+  let Uses                 = ps.Uses;
 }
 
 class VOP2_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -133,7 +135,7 @@ multiclass VOP2bInst <string opName,
     let Uses = !if(useSGPRInput, [VCC, EXEC], [EXEC]), Defs = [VCC] in {
       def _e32 : VOP2_Pseudo <opName, P>,
                  Commutable_REV<revOp#"_e32", !eq(revOp, opName)>;
-      
+
       def _sdwa : VOP2_SDWA_Pseudo <opName, P>;
     }
 
@@ -179,10 +181,12 @@ class VOP_MADMK <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
 def VOP_MADMK_F16 : VOP_MADMK <f16>;
 def VOP_MADMK_F32 : VOP_MADMK <f32>;
 
+// FIXME: Remove src2_modifiers. It isn't used, so is wasting memory
+// and processing time but it makes it easier to convert to mad.
 class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
   let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, VGPR_32:$src2);
   let Ins64 = getIns64<Src0RC64, Src1RC64, RegisterOperand<VGPR_32>, 3,
-                       HasModifiers, Src0Mod, Src1Mod, Src2Mod>.ret;
+                       HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret;
   let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0,
                     Src1ModDPP:$src1_modifiers, Src1DPP:$src1,
                     VGPR_32:$src2, // stub argument
@@ -194,6 +198,7 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
                      clampmod:$clamp, dst_sel:$dst_sel, dst_unused:$dst_unused,
                      src0_sel:$src0_sel, src1_sel:$src1_sel);
   let Asm32 = getAsm32<1, 2, vt>.ret;
+  let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, vt>.ret;
   let AsmDPP = getAsmDPP<1, 2, HasModifiers, vt>.ret;
   let AsmSDWA = getAsmSDWA<1, 2, HasModifiers, vt>.ret;
   let HasSrc2 = 0;
@@ -204,13 +209,13 @@ class VOP_MAC <ValueType vt> : VOPProfile <[vt, vt, vt, vt]> {
 def VOP_MAC_F16 : VOP_MAC <f16> {
   // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
   // 'not a string initializer' error.
-  let Asm64 = getAsm64<1, 2, HasModifiers, f16>.ret;
+  let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, f16>.ret;
 }
 
 def VOP_MAC_F32 : VOP_MAC <f32> {
   // FIXME: Move 'Asm64' definition to VOP_MAC, and use 'vt'. Currently it gives
   // 'not a string initializer' error.
-  let Asm64 = getAsm64<1, 2, HasModifiers, f32>.ret;
+  let Asm64 = getAsm64<1, 2, HasModifiers, HasOMod, f32>.ret;
 }
 
 // Write out to vcc or arbitrary SGPR.
@@ -280,7 +285,7 @@ def VOP_READLANE : VOPProfile<[i32, i32, i32]> {
 def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> {
   let Outs32 = (outs VGPR_32:$vdst);
   let Outs64 = Outs32;
-  let Ins32 = (ins SReg_32:$src0, SCSrc_b32:$src1);
+  let Ins32 = (ins SCSrc_b32:$src0, SCSrc_b32:$src1);
   let Ins64 = Ins32;
   let Asm32 = " $vdst, $src0, $src1";
   let Asm64 = Asm32;
@@ -354,7 +359,7 @@ defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_F32_F32_I32, AMDGPUldexp>;
 defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_I32_F32_I32>; // TODO: set "Uses = dst"
 defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_I32_F32_F32>;
 defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_I32_F32_F32>;
-defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, int_SI_packf16>;
+defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_I32_F32_F32, AMDGPUpkrtz_f16_f32>;
 defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_I32_I32_I32>;
 defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_I32_I32_I32>;
 
@@ -574,7 +579,10 @@ defm V_SUBB_U32           : VOP2be_Real_e32e64_si <0x29>;
 defm V_SUBBREV_U32        : VOP2be_Real_e32e64_si <0x2a>;
 
 defm V_READLANE_B32       : VOP2_Real_si <0x01>;
+
+let InOperandList = (ins SSrc_b32:$src0, SCSrc_b32:$src1) in {
 defm V_WRITELANE_B32      : VOP2_Real_si <0x02>;
+}
 
 defm V_MAC_LEGACY_F32     : VOP2_Real_e32e64_si <0x6>;
 defm V_MIN_LEGACY_F32     : VOP2_Real_e32e64_si <0xd>;
@@ -654,7 +662,7 @@ multiclass Base_VOP2_Real_e32e64_vi <bits<6> op> :
   VOP2_Real_e64_vi<{0, 1, 0, 0, op{5-0}}>;
 
 } // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
- 
+
 multiclass VOP2_SDWA_Real <bits<6> op> {
   def _sdwa_vi :
     VOP_SDWA_Real <!cast<VOP2_SDWA_Pseudo>(NAME#"_sdwa")>,
diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td
index c2a4d4ba99b133be21d5d25eb2871f40535c28e3..217a0748885320c139ff7ad117b5632a8581ade6 100644
--- a/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/lib/Target/AMDGPU/VOP3Instructions.td
@@ -29,6 +29,26 @@ class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
                   ret1));
 }
 
+class getVOP3PModPat<VOPProfile P, SDPatternOperator node> {
+  list<dag> ret3 = [(set P.DstVT:$vdst,
+    (node (P.Src0VT !if(P.HasClamp, (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
+                                    (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))),
+          (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)),
+          (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers))))];
+
+  list<dag> ret2 = [(set P.DstVT:$vdst,
+    (node !if(P.HasClamp, (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
+                          (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))),
+          (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers))))];
+
+  list<dag> ret1 = [(set P.DstVT:$vdst,
+    (node (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
+
+  list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
+                  !if(!eq(P.NumSrcArgs, 2), ret2,
+                  ret1));
+}
+
 class getVOP3Pat<VOPProfile P, SDPatternOperator node> {
   list<dag> ret3 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1, P.Src2VT:$src2))];
   list<dag> ret2 = [(set P.DstVT:$vdst, (node P.Src0VT:$src0, P.Src1VT:$src1))];
@@ -86,6 +106,14 @@ def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> {
   let DstRC = RegisterOperand<VReg_64>;
 }
 
+def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
+  // FIXME: Hack to stop printing _e64
+  let DstRC = RegisterOperand<VReg_64>;
+
+  let Outs64 = (outs DstRC:$vdst, SReg_64:$sdst);
+  let Asm64 = " $vdst, $sdst, $src0, $src1, $src2";
+}
+
 //===----------------------------------------------------------------------===//
 // VOP3 Instructions
 //===----------------------------------------------------------------------===//
@@ -209,10 +237,8 @@ def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I3
 def V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile<VOP_V4I32_I64_I32_V4I32>, int_amdgcn_mqsad_u32_u8>;
 
 let isCommutable = 1 in {
-def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3_Profile<VOP_I64_I32_I32_I64>>;
-
-// XXX - Does this set VCC?
-def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3_Profile<VOP_I64_I32_I32_I64>>;
+def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
+def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
 } // End isCommutable = 1
 
 } // End SubtargetPredicate = isCIVI
@@ -234,12 +260,14 @@ def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16>>;
 
 }  // End isCommutable = 1
 
+def V_PERM_B32 : VOP3Inst <"v_perm_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+
 } // End SubtargetPredicate = isVI
 
 let Predicates = [isVI] in {
 
-multiclass Tenary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
-                            Instruction inst, SDPatternOperator op3> {
+multiclass Ternary_i16_Pats <SDPatternOperator op1, SDPatternOperator op2,
+                             Instruction inst, SDPatternOperator op3> {
 def : Pat<
   (op2 (op1 i16:$src0, i16:$src1), i16:$src2),
   (inst i16:$src0, i16:$src1, i16:$src2)
@@ -258,11 +286,26 @@ def : Pat<
 >;
 }
 
-defm: Tenary_i16_Pats<mul, add, V_MAD_U16, zext>;
-defm: Tenary_i16_Pats<mul, add, V_MAD_I16, sext>;
+defm: Ternary_i16_Pats<mul, add, V_MAD_U16, zext>;
+defm: Ternary_i16_Pats<mul, add, V_MAD_I16, sext>;
 
 } // End Predicates = [isVI]
 
+let SubtargetPredicate = isGFX9 in {
+def V_PACK_B32_F16 : VOP3Inst <"v_pack_b32_f16", VOP3_Profile<VOP_B32_F16_F16>>;
+def V_LSHL_ADD_U32 : VOP3Inst <"v_lshl_add_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_ADD_LSHL_U32 : VOP3Inst <"v_add_lshl_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_ADD3_U32 : VOP3Inst <"v_add3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_LSHL_OR_B32 : VOP3Inst <"v_lshl_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_AND_OR_B32 : VOP3Inst <"v_and_or_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_OR3_B32 : VOP3Inst <"v_or3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+
+def V_XAD_U32 : VOP3Inst <"v_xad_u32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
+def V_MED3_F16 : VOP3Inst <"v_med3_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, AMDGPUfmed3>;
+def V_MED3_I16 : VOP3Inst <"v_med3_i16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUsmed3>;
+def V_MED3_U16 : VOP3Inst <"v_med3_u16", VOP3_Profile<VOP_I16_I16_I16_I16>, AMDGPUumed3>;
+}
+
 
 //===----------------------------------------------------------------------===//
 // Target
@@ -351,11 +394,19 @@ multiclass VOP3_Real_ci<bits<9> op> {
   }
 }
 
+multiclass VOP3be_Real_ci<bits<9> op> {
+  def _ci : VOP3_Real<!cast<VOP3_Pseudo>(NAME), SIEncodingFamily.SI>,
+            VOP3be_si <op, !cast<VOP3_Pseudo>(NAME).Pfl> {
+    let AssemblerPredicates = [isCIOnly];
+    let DecoderNamespace = "CI";
+  }
+}
+
 defm V_MQSAD_U16_U8     : VOP3_Real_ci <0x172>;
 defm V_QSAD_PK_U16_U8   : VOP3_Real_ci <0x172>;
-defm V_MQSAD_U32_U8     : VOP3_Real_ci <0x174>;
-defm V_MAD_U64_U32      : VOP3_Real_ci <0x176>;
-defm V_MAD_I64_I32      : VOP3_Real_ci <0x177>;
+defm V_MQSAD_U32_U8     : VOP3_Real_ci <0x175>;
+defm V_MAD_U64_U32      : VOP3be_Real_ci <0x176>;
+defm V_MAD_I64_I32      : VOP3be_Real_ci <0x177>;
 
 //===----------------------------------------------------------------------===//
 // VI
@@ -376,8 +427,8 @@ multiclass VOP3be_Real_vi<bits<10> op> {
 } // End AssemblerPredicates = [isVI], DecoderNamespace = "VI"
 
 defm V_MQSAD_U16_U8     : VOP3_Real_vi <0x172>;
-defm V_MAD_U64_U32      : VOP3_Real_vi <0x176>;
-defm V_MAD_I64_I32      : VOP3_Real_vi <0x177>;
+defm V_MAD_U64_U32      : VOP3be_Real_vi <0x1E8>;
+defm V_MAD_I64_I32      : VOP3be_Real_vi <0x1E9>;
 
 defm V_MAD_LEGACY_F32   : VOP3_Real_vi <0x1c0>;
 defm V_MAD_F32          : VOP3_Real_vi <0x1c1>;
@@ -424,6 +475,8 @@ defm V_MAD_F16          : VOP3_Real_vi <0x1ea>;
 defm V_MAD_U16          : VOP3_Real_vi <0x1eb>;
 defm V_MAD_I16          : VOP3_Real_vi <0x1ec>;
 
+defm V_PERM_B32         : VOP3_Real_vi <0x1ed>;
+
 defm V_FMA_F16          : VOP3_Real_vi <0x1ee>;
 defm V_DIV_FIXUP_F16    : VOP3_Real_vi <0x1ef>;
 
@@ -449,3 +502,16 @@ defm V_LSHLREV_B64      : VOP3_Real_vi <0x28f>;
 defm V_LSHRREV_B64      : VOP3_Real_vi <0x290>;
 defm V_ASHRREV_I64      : VOP3_Real_vi <0x291>;
 defm V_TRIG_PREOP_F64   : VOP3_Real_vi <0x292>;
+
+defm V_LSHL_ADD_U32 : VOP3_Real_vi <0x1fd>;
+defm V_ADD_LSHL_U32 : VOP3_Real_vi <0x1fe>;
+defm V_ADD3_U32 : VOP3_Real_vi <0x1ff>;
+defm V_LSHL_OR_B32 : VOP3_Real_vi <0x200>;
+defm V_AND_OR_B32 : VOP3_Real_vi <0x201>;
+defm V_OR3_B32 : VOP3_Real_vi <0x202>;
+defm V_PACK_B32_F16 : VOP3_Real_vi <0x2a0>;
+
+defm V_XAD_U32 : VOP3_Real_vi <0x1f3>;
+defm V_MED3_F16 : VOP3_Real_vi <0x1fa>;
+defm V_MED3_I16 : VOP3_Real_vi <0x1fb>;
+defm V_MED3_U16 : VOP3_Real_vi <0x1fc>;
diff --git a/lib/Target/AMDGPU/VOP3PInstructions.td b/lib/Target/AMDGPU/VOP3PInstructions.td
new file mode 100644
index 0000000000000000000000000000000000000000..96d343099132c0a37af7fa80acc3efc135d3a7a5
--- /dev/null
+++ b/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -0,0 +1,82 @@
+//===-- VOP3PInstructions.td - Vector Instruction Definitions -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// VOP3P Classes
+//===----------------------------------------------------------------------===//
+
+class VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> :
+  VOP3P_Pseudo<OpName, P,
+    !if(P.HasModifiers, getVOP3PModPat<P, node>.ret, getVOP3Pat<P, node>.ret)
+>;
+
+// Non-packed instructions that use the VOP3P encoding. i.e. where
+// omod/abs are used.
+class VOP3_VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> :
+  VOP3P_Pseudo<OpName, P,
+    !if(P.HasModifiers, getVOP3ModPat<P, node>.ret, getVOP3Pat<P, node>.ret)
+>;
+
+let isCommutable = 1 in {
+def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
+def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
+def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
+def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum>;
+def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum>;
+
+def V_PK_ADD_U16 : VOP3PInst<"v_pk_add_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, add>;
+def V_PK_ADD_I16 : VOP3PInst<"v_pk_add_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>>;
+def V_PK_SUB_I16 : VOP3PInst<"v_pk_sub_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, sub>;
+def V_PK_MUL_LO_U16 : VOP3PInst<"v_pk_mul_lo_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, mul>;
+
+def V_PK_MIN_I16 : VOP3PInst<"v_pk_min_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smin>;
+def V_PK_MIN_U16 : VOP3PInst<"v_pk_min_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umin>;
+def V_PK_MAX_I16 : VOP3PInst<"v_pk_max_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, smax>;
+def V_PK_MAX_U16 : VOP3PInst<"v_pk_max_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, umax>;
+}
+
+def V_PK_LSHLREV_B16 : VOP3PInst<"v_pk_lshlrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshl_rev>;
+def V_PK_ASHRREV_I16 : VOP3PInst<"v_pk_ashrrev_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, ashr_rev>;
+def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I16_V2I16>, lshr_rev>;
+
+// XXX - Commutable?
+def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+def V_MAD_MIXLO_F16 : VOP3_VOP3PInst<"v_mad_mixlo_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>;
+def V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3_Profile<VOP_F16_F16_F16_F16>>;
+
+
+multiclass VOP3P_Real_vi<bits<10> op> {
+  def _vi : VOP3P_Real<!cast<VOP3P_Pseudo>(NAME), SIEncodingFamily.VI>,
+            VOP3Pe <op, !cast<VOP3P_Pseudo>(NAME).Pfl> {
+    let AssemblerPredicates = [HasVOP3PInsts];
+    let DecoderNamespace = "VI";
+  }
+}
+
+defm V_PK_MUL_LO_U16 : VOP3P_Real_vi <0x381>;
+defm V_PK_ADD_I16 : VOP3P_Real_vi <0x382>;
+defm V_PK_SUB_I16 : VOP3P_Real_vi <0x383>;
+defm V_PK_LSHLREV_B16 : VOP3P_Real_vi <0x384>;
+defm V_PK_LSHRREV_B16 : VOP3P_Real_vi <0x385>;
+defm V_PK_ASHRREV_I16 : VOP3P_Real_vi <0x386>;
+defm V_PK_MAX_I16 : VOP3P_Real_vi <0x387>;
+defm V_PK_MIN_I16 : VOP3P_Real_vi <0x388>;
+
+defm V_PK_ADD_U16 : VOP3P_Real_vi <0x38a>;
+defm V_PK_MAX_U16 : VOP3P_Real_vi <0x38c>;
+defm V_PK_MIN_U16 : VOP3P_Real_vi <0x38d>;
+defm V_PK_FMA_F16 : VOP3P_Real_vi <0x38e>;
+defm V_PK_ADD_F16 : VOP3P_Real_vi <0x38f>;
+defm V_PK_MUL_F16 : VOP3P_Real_vi <0x390>;
+defm V_PK_MIN_F16 : VOP3P_Real_vi <0x391>;
+defm V_PK_MAX_F16 : VOP3P_Real_vi <0x392>;
+
+defm V_MAD_MIX_F32 : VOP3P_Real_vi <0x3a0>;
+defm V_MAD_MIXLO_F16 : VOP3P_Real_vi <0x3a1>;
+defm V_MAD_MIXHI_F16 : VOP3P_Real_vi <0x3a2>;
diff --git a/lib/Target/AMDGPU/VOPCInstructions.td b/lib/Target/AMDGPU/VOPCInstructions.td
index bb05fb7bae7fa163f5a12377fa9f7a89627d9eba..a3550a63677bac9e6ec842c9077fcff6f0b12855 100644
--- a/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/lib/Target/AMDGPU/VOPCInstructions.td
@@ -93,6 +93,8 @@ class VOPC_Real <VOPC_Pseudo ps, int EncodingFamily> :
   let Constraints        = ps.Constraints;
   let DisableEncoding    = ps.DisableEncoding;
   let TSFlags            = ps.TSFlags;
+  let UseNamedOperandTable = ps.UseNamedOperandTable;
+  let Uses                 = ps.Uses;
 }
 
 class VOPC_SDWA_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
@@ -561,7 +563,7 @@ multiclass VOPC_CLASS_F16 <string opName> :
   VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 0>;
 
 multiclass VOPCX_CLASS_F16 <string opName> :
-  VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 1>;
+  VOPC_Class_Pseudos <opName, VOPC_I1_F16_I32, 1>;
 
 multiclass VOPC_CLASS_F32 <string opName> :
   VOPC_Class_Pseudos <opName, VOPC_I1_F32_I32, 0>;
diff --git a/lib/Target/AMDGPU/VOPInstructions.td b/lib/Target/AMDGPU/VOPInstructions.td
index 184c5bc9bb59568439fefd293b32fb1b728ea29c..69906c419db3b9e6773bc57012fe8b5c1422dede 100644
--- a/lib/Target/AMDGPU/VOPInstructions.td
+++ b/lib/Target/AMDGPU/VOPInstructions.td
@@ -68,8 +68,9 @@ class VOP3Common <dag outs, dag ins, string asm = "",
   let hasPostISelHook = 1;
 }
 
-class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3Only = 0> :
-  InstSI <P.Outs64, P.Ins64, "", pattern>,
+class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
+                   bit VOP3Only = 0, bit isVOP3P = 0> :
+  InstSI <P.Outs64, !if(!and(isVOP3P, P.IsPacked), P.InsVOP3P, P.Ins64), "", pattern>,
   VOP <opName>,
   SIMCInstr<opName#"_e64", SIEncodingFamily.NONE>,
   MnemonicAlias<opName#"_e64", opName> {
@@ -79,7 +80,7 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3On
   let UseNamedOperandTable = 1;
 
   string Mnemonic = opName;
-  string AsmOperands = P.Asm64;
+  string AsmOperands = !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64);
 
   let Size = 8;
   let mayLoad = 0;
@@ -100,23 +101,34 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP3On
 
   let VOP3 = 1;
   let VALU = 1;
+  let FPClamp = P.HasFPClamp;
   let Uses = [EXEC];
 
   let AsmVariantName = AMDGPUAsmVariants.VOP3;
   let AsmMatchConverter =
     !if(!eq(VOP3Only,1),
-        "cvtVOP3",
-        !if(!eq(P.HasModifiers, 1), "cvtVOP3_2_mod", ""));
+        !if(!and(P.IsPacked, isVOP3P), "cvtVOP3P", "cvtVOP3"),
+        !if(!eq(P.HasModifiers, 1),
+            "cvtVOP3_2_mod",
+            !if(!eq(P.HasOMod, 1), "cvtVOP3OMod", "")
+        )
+    );
 
   VOPProfile Pfl = P;
 }
 
+class VOP3P_Pseudo <string opName, VOPProfile P, list<dag> pattern = []> :
+  VOP3_Pseudo<opName, P, pattern, 1, 1> {
+  let VOP3P = 1;
+}
+
 class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
   InstSI <ps.OutOperandList, ps.InOperandList, ps.Mnemonic # ps.AsmOperands, []>,
   SIMCInstr <ps.PseudoInstr, EncodingFamily> {
 
   let isPseudo = 0;
   let isCodeGenOnly = 0;
+  let UseNamedOperandTable = 1;
 
   let Constraints     = ps.Constraints;
   let DisableEncoding = ps.DisableEncoding;
@@ -128,8 +140,15 @@ class VOP3_Real <VOP3_Pseudo ps, int EncodingFamily> :
   let Constraints        = ps.Constraints;
   let DisableEncoding    = ps.DisableEncoding;
   let TSFlags            = ps.TSFlags;
+  let UseNamedOperandTable = ps.UseNamedOperandTable;
+  let Uses                 = ps.Uses;
 }
 
+// XXX - Is there any reason to distinguish this from regular VOP3
+// here?
+class VOP3P_Real<VOP3P_Pseudo ps, int EncodingFamily> :
+  VOP3_Real<ps, EncodingFamily>;
+
 class VOP3a<VOPProfile P> : Enc64 {
   bits<2> src0_modifiers;
   bits<9> src0;
@@ -197,6 +216,42 @@ class VOP3be <VOPProfile P> : Enc64 {
   let Inst{63}    = !if(P.HasSrc2Mods, src2_modifiers{0}, 0);
 }
 
+class VOP3Pe <bits<10> op, VOPProfile P> : Enc64 {
+  bits<8> vdst;
+  // neg, neg_hi, op_sel put in srcN_modifiers
+  bits<4> src0_modifiers;
+  bits<9> src0;
+  bits<4> src1_modifiers;
+  bits<9> src1;
+  bits<4> src2_modifiers;
+  bits<9> src2;
+  bits<1> clamp;
+
+  let Inst{7-0} = vdst;
+  let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0
+  let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1
+  let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2
+
+  let Inst{11} = !if(P.HasOpSel, src0_modifiers{2}, 0); // op_sel(0)
+  let Inst{12} = !if(P.HasOpSel, src1_modifiers{2}, 0); // op_sel(1)
+  let Inst{13} = !if(P.HasOpSel, src2_modifiers{2}, 0); // op_sel(2)
+
+  let Inst{14} = !if(P.HasOpSel, src2_modifiers{3}, 0); // op_sel_hi(2)
+
+  let Inst{15} = !if(P.HasClamp, clamp{0}, 0);
+
+  let Inst{25-16} = op;
+  let Inst{31-26} = 0x34; //encoding
+  let Inst{40-32} = !if(P.HasSrc0, src0, 0);
+  let Inst{49-41} = !if(P.HasSrc1, src1, 0);
+  let Inst{58-50} = !if(P.HasSrc2, src2, 0);
+  let Inst{59}    = !if(P.HasOpSel, src0_modifiers{3}, 0); // op_sel_hi(0)
+  let Inst{60}    = !if(P.HasOpSel, src1_modifiers{3}, 0); // op_sel_hi(1)
+  let Inst{61}    = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); // neg (lo)
+  let Inst{62}    = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); // neg (lo)
+  let Inst{63}    = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); // neg (lo)
+}
+
 class VOP3be_si <bits<9> op, VOPProfile P> : VOP3be<P> {
   let Inst{25-17} = op;
 }
@@ -250,7 +305,7 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
   VOP <opName>,
   SIMCInstr <opName#"_sdwa", SIEncodingFamily.NONE>,
   MnemonicAlias <opName#"_sdwa", opName> {
-  
+
   let isPseudo = 1;
   let isCodeGenOnly = 1;
   let UseNamedOperandTable = 1;
@@ -261,13 +316,13 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
   let Size = 8;
   let mayLoad = 0;
   let mayStore = 0;
-  let hasSideEffects = 0;  
+  let hasSideEffects = 0;
 
   let VALU = 1;
   let SDWA = 1;
   let Uses = [EXEC];
-  
-  let SubtargetPredicate = HasSDWA;
+
+  let SubtargetPredicate = !if(P.HasExt, HasSDWA, DisableInst);
   let AssemblerPredicate = !if(P.HasExt, HasSDWA, DisableInst);
   let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.SDWA,
                                      AMDGPUAsmVariants.Disable);
@@ -348,3 +403,4 @@ include "VOPCInstructions.td"
 include "VOP1Instructions.td"
 include "VOP2Instructions.td"
 include "VOP3Instructions.td"
+include "VOP3PInstructions.td"
diff --git a/lib/Target/ARM/ARM.h b/lib/Target/ARM/ARM.h
index 76227bc28e27a543912226fe2b7f13b66037eb61..39f7988200ea8904e09e531dd5c46ae62e8e3ba3 100644
--- a/lib/Target/ARM/ARM.h
+++ b/lib/Target/ARM/ARM.h
@@ -53,6 +53,7 @@ std::vector<BasicBlockInfo> computeAllBlockSizes(MachineFunction *MF);
 
 void initializeARMLoadStoreOptPass(PassRegistry &);
 void initializeARMPreAllocLoadStoreOptPass(PassRegistry &);
+void initializeARMConstantIslandsPass(PassRegistry &);
 
 } // end namespace llvm
 
diff --git a/lib/Target/ARM/ARM.td b/lib/Target/ARM/ARM.td
index 2a090faeee6a0d02eb3411510e33f833258de6c8..57f9d1c6b6109e297c71f757e1d2504084c09d56 100644
--- a/lib/Target/ARM/ARM.td
+++ b/lib/Target/ARM/ARM.td
@@ -72,8 +72,6 @@ def FeatureHWDiv  : SubtargetFeature<"hwdiv", "HasHardwareDivide", "true",
 def FeatureHWDivARM  : SubtargetFeature<"hwdiv-arm",
                                         "HasHardwareDivideInARM", "true",
                                       "Enable divide instructions in ARM mode">;
-def FeatureT2XtPk : SubtargetFeature<"t2xtpk", "HasT2ExtractPack", "true",
-                                 "Enable Thumb2 extract and pack instructions">;
 def FeatureDB     : SubtargetFeature<"db", "HasDataBarrier", "true",
                                    "Has data barrier (dmb / dsb) instructions">;
 def FeatureV7Clrex : SubtargetFeature<"v7clrex", "HasV7Clrex", "true",
@@ -263,6 +261,12 @@ def FeatureNoMovt : SubtargetFeature<"no-movt", "NoMovt", "true",
                                      "Don't use movt/movw pairs for 32-bit "
                                      "imms">;
 
+def FeatureNoNegativeImmediates : SubtargetFeature<"no-neg-immediates",
+                                        "NegativeImmediates", "false",
+                                        "Convert immediates and instructions "
+                                        "to their negated or complemented "
+                                        "equivalent when the immediate does "
+                                        "not fit in the encoding.">;
 
 //===----------------------------------------------------------------------===//
 // ARM ISAa.
@@ -297,8 +301,7 @@ def HasV7Ops    : SubtargetFeature<"v7", "HasV7Ops", "true",
                                     FeatureV7Clrex]>;
 def HasV8Ops    : SubtargetFeature<"v8", "HasV8Ops", "true",
                                    "Support ARM v8 instructions",
-                                   [HasV7Ops, FeatureAcquireRelease,
-                                    FeatureT2XtPk]>;
+                                   [HasV7Ops, FeatureAcquireRelease]>;
 def HasV8_1aOps : SubtargetFeature<"v8.1a", "HasV8_1aOps", "true",
                                    "Support ARM v8.1a instructions",
                                    [HasV8Ops]>;
@@ -342,7 +345,9 @@ def ProcA73     : SubtargetFeature<"a73", "ARMProcFamily", "CortexA73",
                                    "Cortex-A73 ARM processors", []>;
 
 def ProcKrait   : SubtargetFeature<"krait", "ARMProcFamily", "Krait",
-                                   "Qualcomm ARM processors", []>;
+                                   "Qualcomm Krait processors", []>;
+def ProcKryo    : SubtargetFeature<"kryo", "ARMProcFamily", "Kryo",
+                                   "Qualcomm Kryo processors", []>;
 def ProcSwift   : SubtargetFeature<"swift", "ARMProcFamily", "Swift",
                                    "Swift ARM processors", []>;
 
@@ -393,8 +398,7 @@ def ARMv5tej  : Architecture<"armv5tej",  "ARMv5tej", [HasV5TEOps]>;
 def ARMv6     : Architecture<"armv6",     "ARMv6",    [HasV6Ops]>;
 
 def ARMv6t2   : Architecture<"armv6t2",   "ARMv6t2",  [HasV6T2Ops,
-                                                       FeatureDSP,
-                                                       FeatureT2XtPk]>;
+                                                       FeatureDSP]>;
 
 def ARMv6k    : Architecture<"armv6k",    "ARMv6k",   [HasV6KOps]>;
 
@@ -415,15 +419,22 @@ def ARMv7a    : Architecture<"armv7-a",   "ARMv7a",   [HasV7Ops,
                                                        FeatureNEON,
                                                        FeatureDB,
                                                        FeatureDSP,
-                                                       FeatureAClass,
-                                                       FeatureT2XtPk]>;
+                                                       FeatureAClass]>;
+
+def ARMv7ve   : Architecture<"armv7ve",   "ARMv7ve",  [HasV7Ops,
+                                                       FeatureNEON,
+                                                       FeatureDB,
+                                                       FeatureDSP,
+                                                       FeatureTrustZone,
+                                                       FeatureMP,
+                                                       FeatureVirtualization,
+                                                       FeatureAClass]>;
 
 def ARMv7r    : Architecture<"armv7-r",   "ARMv7r",   [HasV7Ops,
                                                        FeatureDB,
                                                        FeatureDSP,
                                                        FeatureHWDiv,
-                                                       FeatureRClass,
-                                                       FeatureT2XtPk]>;
+                                                       FeatureRClass]>;
 
 def ARMv7m    : Architecture<"armv7-m",   "ARMv7m",   [HasV7Ops,
                                                        FeatureThumb2,
@@ -438,8 +449,7 @@ def ARMv7em   : Architecture<"armv7e-m",  "ARMv7em",  [HasV7Ops,
                                                        FeatureDB,
                                                        FeatureHWDiv,
                                                        FeatureMClass,
-                                                       FeatureDSP,
-                                                       FeatureT2XtPk]>;
+                                                       FeatureDSP]>;
 
 def ARMv8a    : Architecture<"armv8-a",   "ARMv8a",   [HasV8Ops,
                                                        FeatureAClass,
@@ -481,9 +491,6 @@ def ARMv82a   : Architecture<"armv8.2-a", "ARMv82a",  [HasV8_2aOps,
 def ARMv8r    : Architecture<"armv8-r",   "ARMv8r",   [HasV8Ops,
                                                        FeatureRClass,
                                                        FeatureDB,
-                                                       FeatureHWDiv,
-                                                       FeatureHWDivARM,
-                                                       FeatureT2XtPk,
                                                        FeatureDSP,
                                                        FeatureCRC,
                                                        FeatureMP,
@@ -603,8 +610,6 @@ def : ProcessorModel<"cortex-a7",   CortexA8Model,      [ARMv7a, ProcA7,
                                                          FeatureVMLxForwarding,
                                                          FeatureMP,
                                                          FeatureVFP4,
-                                                         FeatureHWDiv,
-                                                         FeatureHWDivARM,
                                                          FeatureVirtualization]>;
 
 def : ProcessorModel<"cortex-a8",   CortexA8Model,      [ARMv7a, ProcA8,
@@ -636,8 +641,6 @@ def : ProcessorModel<"cortex-a12",  CortexA9Model,      [ARMv7a, ProcA12,
                                                          FeatureTrustZone,
                                                          FeatureVMLxForwarding,
                                                          FeatureVFP4,
-                                                         FeatureHWDiv,
-                                                         FeatureHWDivARM,
                                                          FeatureAvoidPartialCPSR,
                                                          FeatureVirtualization,
                                                          FeatureMP]>;
@@ -651,8 +654,6 @@ def : ProcessorModel<"cortex-a15",  CortexA9Model,      [ARMv7a, ProcA15,
                                                          FeatureVFP4,
                                                          FeatureMP,
                                                          FeatureCheckVLDnAlign,
-                                                         FeatureHWDiv,
-                                                         FeatureHWDivARM,
                                                          FeatureAvoidPartialCPSR,
                                                          FeatureVirtualization]>;
 
@@ -663,8 +664,6 @@ def : ProcessorModel<"cortex-a17",  CortexA9Model,      [ARMv7a, ProcA17,
                                                          FeatureMP,
                                                          FeatureVMLxForwarding,
                                                          FeatureVFP4,
-                                                         FeatureHWDiv,
-                                                         FeatureHWDivARM,
                                                          FeatureAvoidPartialCPSR,
                                                          FeatureVirtualization]>;
 
@@ -759,6 +758,15 @@ def : ProcNoItin<"cortex-m7",                           [ARMv7em,
                                                          FeatureFPARMv8,
                                                          FeatureD16]>;
 
+def : ProcNoItin<"cortex-m23",                          [ARMv8mBaseline,
+                                                         FeatureNoMovt]>;
+
+def : ProcNoItin<"cortex-m33",                          [ARMv8mMainline,
+                                                         FeatureDSP,
+                                                         FeatureFPARMv8,
+                                                         FeatureD16,
+                                                         FeatureVFPOnlySP]>;
+
 def : ProcNoItin<"cortex-a32",                           [ARMv8a,
                                                          FeatureHWDiv,
                                                          FeatureHWDivARM,
@@ -829,6 +837,12 @@ def : ProcNoItin<"exynos-m3",                           [ARMv8a, ProcExynosM1,
                                                          FeatureCrypto,
                                                          FeatureCRC]>;
 
+def : ProcNoItin<"kryo",                                [ARMv8a, ProcKryo,
+                                                         FeatureHWDiv,
+                                                         FeatureHWDivARM,
+                                                         FeatureCrypto,
+                                                         FeatureCRC]>;
+
 def : ProcessorModel<"cortex-r52", CortexR52Model,      [ARMv8r, ProcR52,
                                                          FeatureFPAO]>;
 
@@ -838,6 +852,8 @@ def : ProcessorModel<"cortex-r52", CortexR52Model,      [ARMv8r, ProcR52,
 
 include "ARMRegisterInfo.td"
 
+include "ARMRegisterBanks.td"
+
 include "ARMCallingConv.td"
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/ARM/ARMAsmPrinter.cpp b/lib/Target/ARM/ARMAsmPrinter.cpp
index d77717e3ba2624f008ef226d3dffc37cc734b00d..eb0d410b596be7926fffef317d7b2617aed5f2ae 100644
--- a/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -1142,6 +1142,11 @@ void ARMAsmPrinter::EmitJumpTableInsts(const MachineInstr *MI) {
   const MachineOperand &MO1 = MI->getOperand(1);
   unsigned JTI = MO1.getIndex();
 
+  // Make sure the Thumb jump table is 4-byte aligned. This will be a nop for
+  // ARM mode tables.
+  EmitAlignment(2);
+
+  // Emit a label for the jump table.
   MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
   OutStreamer->EmitLabel(JTISymbol);
 
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 54b2c83fbdaeac47594c7710ed9b21aa66ead23a..4f5711ca9a796bf6a18af072b97175f8f31762f4 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -597,7 +597,7 @@ static bool isEligibleForITBlock(const MachineInstr *MI) {
 /// isPredicable - Return true if the specified instruction can be predicated.
 /// By default, this returns true for every instruction with a
 /// PredicateOperand.
-bool ARMBaseInstrInfo::isPredicable(MachineInstr &MI) const {
+bool ARMBaseInstrInfo::isPredicable(const MachineInstr &MI) const {
   if (!MI.isPredicable())
     return false;
 
@@ -607,7 +607,7 @@ bool ARMBaseInstrInfo::isPredicable(MachineInstr &MI) const {
   if (!isEligibleForITBlock(&MI))
     return false;
 
-  ARMFunctionInfo *AFI =
+  const ARMFunctionInfo *AFI =
       MI.getParent()->getParent()->getInfo<ARMFunctionInfo>();
 
   if (AFI->isThumb2Function()) {
@@ -623,7 +623,7 @@ bool ARMBaseInstrInfo::isPredicable(MachineInstr &MI) const {
 
 namespace llvm {
 
-template <> bool IsCPSRDead<MachineInstr>(MachineInstr *MI) {
+template <> bool IsCPSRDead<MachineInstr>(const MachineInstr *MI) {
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
     if (!MO.isReg() || MO.isUndef() || MO.isUse())
@@ -1789,25 +1789,17 @@ isProfitableToIfCvt(MachineBasicBlock &MBB,
       }
     }
   }
-
-  // Attempt to estimate the relative costs of predication versus branching.
-  // Here we scale up each component of UnpredCost to avoid precision issue when
-  // scaling NumCycles by Probability.
-  const unsigned ScalingUpFactor = 1024;
-  unsigned UnpredCost = Probability.scale(NumCycles * ScalingUpFactor);
-  UnpredCost += ScalingUpFactor; // The branch itself
-  UnpredCost += Subtarget.getMispredictionPenalty() * ScalingUpFactor / 10;
-
-  return (NumCycles + ExtraPredCycles) * ScalingUpFactor <= UnpredCost;
+  return isProfitableToIfCvt(MBB, NumCycles, ExtraPredCycles,
+                             MBB, 0, 0, Probability);
 }
 
 bool ARMBaseInstrInfo::
-isProfitableToIfCvt(MachineBasicBlock &TMBB,
+isProfitableToIfCvt(MachineBasicBlock &,
                     unsigned TCycles, unsigned TExtra,
-                    MachineBasicBlock &FMBB,
+                    MachineBasicBlock &,
                     unsigned FCycles, unsigned FExtra,
                     BranchProbability Probability) const {
-  if (!TCycles || !FCycles)
+  if (!TCycles)
     return false;
 
   // Attempt to estimate the relative costs of predication versus branching.
@@ -2036,6 +2028,16 @@ static const AddSubFlagsOpcodePair AddSubFlagsOpcodeMap[] = {
   {ARM::RSBSrsi, ARM::RSBrsi},
   {ARM::RSBSrsr, ARM::RSBrsr},
 
+  {ARM::tADDSi3, ARM::tADDi3},
+  {ARM::tADDSi8, ARM::tADDi8},
+  {ARM::tADDSrr, ARM::tADDrr},
+  {ARM::tADCS, ARM::tADC},
+
+  {ARM::tSUBSi3, ARM::tSUBi3},
+  {ARM::tSUBSi8, ARM::tSUBi8},
+  {ARM::tSUBSrr, ARM::tSUBrr},
+  {ARM::tSBCS, ARM::tSBC},
+
   {ARM::t2ADDSri, ARM::t2ADDri},
   {ARM::t2ADDSrr, ARM::t2ADDrr},
   {ARM::t2ADDSrs, ARM::t2ADDrs},
@@ -4709,19 +4711,6 @@ bool ARMBaseInstrInfo::hasNOP() const {
   return Subtarget.getFeatureBits()[ARM::HasV6KOps];
 }
 
-bool ARMBaseInstrInfo::isTailCall(const MachineInstr &Inst) const
-{
-  switch (Inst.getOpcode()) {
-  case ARM::TAILJMPd:
-  case ARM::TAILJMPr:
-  case ARM::TCRETURNdi:
-  case ARM::TCRETURNri:
-    return true;
-  default:
-    return false;
-  }
-}
-
 bool ARMBaseInstrInfo::isSwiftFastImmShift(const MachineInstr *MI) const {
   if (MI->getNumOperands() < 4)
     return true;
diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h
index c9ef1829e3f629e821dbbfeb410d8984c635e9c0..23777b821f9f360703bb452e646e85cd583a0287 100644
--- a/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -109,8 +109,6 @@ public:
     getNoopForMachoTarget(NopInst);
   }
 
-  bool isTailCall(const MachineInstr &Inst) const override;
-
   // Return the non-pre/post incrementing version of 'Opc'. Return 0
   // if there is not such an opcode.
   virtual unsigned getUnindexedOpcode(unsigned Opc) const = 0;
@@ -163,7 +161,7 @@ public:
   bool DefinesPredicate(MachineInstr &MI,
                         std::vector<MachineOperand> &Pred) const override;
 
-  bool isPredicable(MachineInstr &MI) const override;
+  bool isPredicable(const MachineInstr &MI) const override;
 
   /// GetInstSize - Returns the size of the specified MachineInstr.
   ///
diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp
index 5a2ded2e4b7f54e01c702e7361816bc4c287b000..94b317a8f9863ea419714011e0642b96f722f436 100644
--- a/lib/Target/ARM/ARMCallLowering.cpp
+++ b/lib/Target/ARM/ARMCallLowering.cpp
@@ -19,6 +19,7 @@
 #include "ARMISelLowering.h"
 #include "ARMSubtarget.h"
 
+#include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 
@@ -33,23 +34,45 @@ ARMCallLowering::ARMCallLowering(const ARMTargetLowering &TLI)
 
 static bool isSupportedType(const DataLayout &DL, const ARMTargetLowering &TLI,
                             Type *T) {
-  EVT VT = TLI.getValueType(DL, T);
-  if (!VT.isSimple() || !VT.isInteger() || VT.isVector())
+  EVT VT = TLI.getValueType(DL, T, true);
+  if (!VT.isSimple() || VT.isVector())
     return false;
 
   unsigned VTSize = VT.getSimpleVT().getSizeInBits();
+
+  if (VTSize == 64)
+    // FIXME: Support i64 too
+    return VT.isFloatingPoint();
+
   return VTSize == 1 || VTSize == 8 || VTSize == 16 || VTSize == 32;
 }
 
 namespace {
-struct FuncReturnHandler : public CallLowering::ValueHandler {
-  FuncReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
-                    MachineInstrBuilder &MIB, CCAssignFn *AssignFn)
-    : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+/// Helper class for values going out through an ABI boundary (used for handling
+/// function return values and call parameters).
+struct OutgoingValueHandler : public CallLowering::ValueHandler {
+  OutgoingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+                       MachineInstrBuilder &MIB, CCAssignFn *AssignFn)
+      : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB), StackSize(0) {}
 
   unsigned getStackAddress(uint64_t Size, int64_t Offset,
                            MachinePointerInfo &MPO) override {
-    llvm_unreachable("Don't know how to get a stack address yet");
+    assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
+           "Unsupported size");
+
+    LLT p0 = LLT::pointer(0, 32);
+    LLT s32 = LLT::scalar(32);
+    unsigned SPReg = MRI.createGenericVirtualRegister(p0);
+    MIRBuilder.buildCopy(SPReg, ARM::SP);
+
+    unsigned OffsetReg = MRI.createGenericVirtualRegister(s32);
+    MIRBuilder.buildConstant(OffsetReg, Offset);
+
+    unsigned AddrReg = MRI.createGenericVirtualRegister(p0);
+    MIRBuilder.buildGEP(AddrReg, SPReg, OffsetReg);
+
+    MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
+    return AddrReg;
   }
 
   void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
@@ -57,8 +80,8 @@ struct FuncReturnHandler : public CallLowering::ValueHandler {
     assert(VA.isRegLoc() && "Value shouldn't be assigned to reg");
     assert(VA.getLocReg() == PhysReg && "Assigning to the wrong reg?");
 
-    assert(VA.getValVT().getSizeInBits() <= 32 && "Unsupported value size");
-    assert(VA.getLocVT().getSizeInBits() == 32 && "Unsupported location size");
+    assert(VA.getValVT().getSizeInBits() <= 64 && "Unsupported value size");
+    assert(VA.getLocVT().getSizeInBits() <= 64 && "Unsupported location size");
 
     unsigned ExtReg = extendRegister(ValVReg, VA);
     MIRBuilder.buildCopy(PhysReg, ExtReg);
@@ -67,13 +90,82 @@ struct FuncReturnHandler : public CallLowering::ValueHandler {
 
   void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
                             MachinePointerInfo &MPO, CCValAssign &VA) override {
-    llvm_unreachable("Don't know how to assign a value to an address yet");
+    assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
+           "Unsupported size");
+
+    unsigned ExtReg = extendRegister(ValVReg, VA);
+    auto MMO = MIRBuilder.getMF().getMachineMemOperand(
+        MPO, MachineMemOperand::MOStore, VA.getLocVT().getStoreSize(),
+        /* Alignment */ 0);
+    MIRBuilder.buildStore(ExtReg, Addr, *MMO);
+  }
+
+  unsigned assignCustomValue(const CallLowering::ArgInfo &Arg,
+                             ArrayRef<CCValAssign> VAs) override {
+    // An f64 needing custom handling travels as two 32-bit halves, each in
+    // its own register; VAs[0] and VAs[1] describe those two locations.
+    CCValAssign FirstVA = VAs[0];
+    assert(FirstVA.needsCustom() && "Value doesn't need custom handling");
+    assert(FirstVA.getValVT() == MVT::f64 && "Unsupported type");
+    CCValAssign SecondVA = VAs[1];
+    assert(SecondVA.needsCustom() && "Value doesn't need custom handling");
+    assert(SecondVA.getValVT() == MVT::f64 && "Unsupported type");
+    assert(FirstVA.getValNo() == SecondVA.getValNo() &&
+           "Values belong to different arguments");
+    assert(FirstVA.isRegLoc() && "Value should be in reg");
+    assert(SecondVA.isRegLoc() && "Value should be in reg");
+
+    // Carve the 64-bit value into the halves at bit offsets 0 and 32.
+    const LLT S32 = LLT::scalar(32);
+    unsigned LowHalf = MRI.createGenericVirtualRegister(S32);
+    unsigned HighHalf = MRI.createGenericVirtualRegister(S32);
+    MIRBuilder.buildExtract(LowHalf, Arg.Reg, 0);
+    MIRBuilder.buildExtract(HighHalf, Arg.Reg, 32);
+
+    // Non-little-endian subtargets take the halves in the opposite order.
+    if (!MIRBuilder.getMF().getSubtarget<ARMSubtarget>().isLittle())
+      std::swap(LowHalf, HighHalf);
+
+    assignValueToReg(LowHalf, FirstVA.getLocReg(), FirstVA);
+    assignValueToReg(HighHalf, SecondVA.getLocReg(), SecondVA);
+    return 1;
+  }
+
+  bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
+                 CCValAssign::LocInfo LocInfo,
+                 const CallLowering::ArgInfo &Info, CCState &State) override {
+    if (AssignFn(ValNo, ValVT, LocVT, LocInfo, Info.Flags, State))
+      return true;
+    // Track the largest stack offset the CC state has reached so far.
+    const uint64_t NextOffset = State.getNextStackOffset();
+    StackSize = StackSize < NextOffset ? NextOffset : StackSize;
+    return false;
+  }
 
   MachineInstrBuilder &MIB;
+  uint64_t StackSize;
 };
 } // End anonymous namespace.
 
+void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg,
+                                        SmallVectorImpl<ArgInfo> &SplitArgs,
+                                        const DataLayout &DL,
+                                        MachineRegisterInfo &MRI) const {
+  // Ask the target lowering which value types OrigArg's IR type maps to.
+  SmallVector<EVT, 4> ValueVTs;
+  SmallVector<uint64_t, 4> ValueOffsets;
+  ComputeValueVTs(*getTLI<ARMTargetLowering>(), DL, OrigArg.Ty, ValueVTs,
+                  &ValueOffsets, 0);
+
+  // Only types that map to exactly one value type are handled for now.
+  assert(ValueVTs.size() == 1 && "Unsupported type");
+
+  // The argument is always re-emitted, even though nothing is split, so that
+  // its type becomes the computed value type (e.g. pointer type -> integer).
+  Type *NewTy = ValueVTs[0].getTypeForEVT(OrigArg.Ty->getContext());
+  SplitArgs.emplace_back(OrigArg.Reg, NewTy, OrigArg.Flags, OrigArg.IsFixed);
+}
+
 /// Lower the return value for the already existing \p Ret. This assumes that
 /// \p MIRBuilder's insertion point is correct.
 bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder,
@@ -91,14 +183,16 @@ bool ARMCallLowering::lowerReturnVal(MachineIRBuilder &MIRBuilder,
   if (!isSupportedType(DL, TLI, Val->getType()))
     return false;
 
+  SmallVector<ArgInfo, 4> SplitVTs;
+  ArgInfo RetInfo(VReg, Val->getType());
+  setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
+  splitToValueTypes(RetInfo, SplitVTs, DL, MF.getRegInfo());
+
   CCAssignFn *AssignFn =
       TLI.CCAssignFnForReturn(F.getCallingConv(), F.isVarArg());
 
-  ArgInfo RetInfo(VReg, Val->getType());
-  setArgFlags(RetInfo, AttributeSet::ReturnIndex, DL, F);
-
-  FuncReturnHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret, AssignFn);
-  return handleAssignments(MIRBuilder, RetInfo, RetHandler);
+  OutgoingValueHandler RetHandler(MIRBuilder, MF.getRegInfo(), Ret, AssignFn);
+  return handleAssignments(MIRBuilder, SplitVTs, RetHandler);
 }
 
 bool ARMCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
@@ -115,14 +209,17 @@ bool ARMCallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
 }
 
 namespace {
-struct FormalArgHandler : public CallLowering::ValueHandler {
-  FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
-                   CCAssignFn AssignFn)
+/// Helper class for values coming in through an ABI boundary (used for handling
+/// formal arguments and call return values).
+struct IncomingValueHandler : public CallLowering::ValueHandler {
+  IncomingValueHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+                       CCAssignFn AssignFn)
       : ValueHandler(MIRBuilder, MRI, AssignFn) {}
 
   unsigned getStackAddress(uint64_t Size, int64_t Offset,
                            MachinePointerInfo &MPO) override {
-    assert((Size == 1 || Size == 2 || Size == 4) && "Unsupported size");
+    assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
+           "Unsupported size");
 
     auto &MFI = MIRBuilder.getMF().getFrameInfo();
 
@@ -138,12 +235,13 @@ struct FormalArgHandler : public CallLowering::ValueHandler {
 
   void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
                             MachinePointerInfo &MPO, CCValAssign &VA) override {
-    assert((Size == 1 || Size == 2 || Size == 4) && "Unsupported size");
+    assert((Size == 1 || Size == 2 || Size == 4 || Size == 8) &&
+           "Unsupported size");
 
     if (VA.getLocInfo() == CCValAssign::SExt ||
         VA.getLocInfo() == CCValAssign::ZExt) {
-      // If the argument is zero- or sign-extended by the caller, its size
-      // becomes 4 bytes, so that's what we should load.
+      // If the value is zero- or sign-extended, its size becomes 4 bytes, so
+      // that's what we should load.
       Size = 4;
       assert(MRI.getType(ValVReg).isScalar() && "Only scalars supported atm");
       MRI.setType(ValVReg, LLT::scalar(32));
@@ -159,13 +257,60 @@ struct FormalArgHandler : public CallLowering::ValueHandler {
     assert(VA.isRegLoc() && "Value shouldn't be assigned to reg");
     assert(VA.getLocReg() == PhysReg && "Assigning to the wrong reg?");
 
-    assert(VA.getValVT().getSizeInBits() <= 32 && "Unsupported value size");
-    assert(VA.getLocVT().getSizeInBits() == 32 && "Unsupported location size");
+    assert(VA.getValVT().getSizeInBits() <= 64 && "Unsupported value size");
+    assert(VA.getLocVT().getSizeInBits() <= 64 && "Unsupported location size");
 
-    // The caller should handle all necesary extensions.
-    MIRBuilder.getMBB().addLiveIn(PhysReg);
+    // The necessary extensions are handled on the other side of the ABI
+    // boundary.
+    markPhysRegUsed(PhysReg);
     MIRBuilder.buildCopy(ValVReg, PhysReg);
   }
+
+  unsigned assignCustomValue(const ARMCallLowering::ArgInfo &Arg,
+                             ArrayRef<CCValAssign> VAs) override {
+    CCValAssign VA = VAs[0];
+    assert(VA.needsCustom() && "Value doesn't need custom handling");
+    assert(VA.getValVT() == MVT::f64 && "Unsupported type");
+
+    CCValAssign NextVA = VAs[1];
+    assert(NextVA.needsCustom() && "Value doesn't need custom handling");
+    assert(NextVA.getValVT() == MVT::f64 && "Unsupported type");
+
+    assert(VA.getValNo() == NextVA.getValNo() &&
+           "Values belong to different arguments");
+
+    assert(VA.isRegLoc() && "Value should be in reg");
+    assert(NextVA.isRegLoc() && "Value should be in reg");
+    // Each of the two register locations is copied into its own s32 vreg.
+    unsigned NewRegs[] = {MRI.createGenericVirtualRegister(LLT::scalar(32)),
+                          MRI.createGenericVirtualRegister(LLT::scalar(32))};
+
+    assignValueToReg(NewRegs[0], VA.getLocReg(), VA);
+    assignValueToReg(NewRegs[1], NextVA.getLocReg(), NextVA);
+    // Swap the two halves on subtargets that are not little-endian.
+    bool IsLittle = MIRBuilder.getMF().getSubtarget<ARMSubtarget>().isLittle();
+    if (!IsLittle)
+      std::swap(NewRegs[0], NewRegs[1]);
+    // Build Arg.Reg out of the two halves, using indices 0 and 32.
+    MIRBuilder.buildSequence(Arg.Reg, NewRegs, {0, 32});
+
+    return 1;
+  }
+
+  /// Marking a physical register as used is different between formal
+  /// parameters, where it's a basic block live-in, and call returns, where it's
+  /// an implicit-def of the call instruction.
+  virtual void markPhysRegUsed(unsigned PhysReg) = 0;
+};
+
+struct FormalArgHandler : public IncomingValueHandler {
+  FormalArgHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+                   CCAssignFn AssignFn)
+      : IncomingValueHandler(MIRBuilder, MRI, AssignFn) {}
+
+  void markPhysRegUsed(unsigned PhysReg) override {
+    MIRBuilder.getMBB().addLiveIn(PhysReg);
+  }
 };
 } // End anonymous namespace
 
@@ -179,14 +324,16 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
   if (F.isVarArg())
     return false;
 
-  auto DL = MIRBuilder.getMF().getDataLayout();
+  auto &MF = MIRBuilder.getMF();
+  auto DL = MF.getDataLayout();
   auto &TLI = *getTLI<ARMTargetLowering>();
 
-  if (TLI.getSubtarget()->isThumb())
+  auto Subtarget = TLI.getSubtarget();
+
+  if (Subtarget->isThumb())
     return false;
 
-  auto &Args = F.getArgumentList();
-  for (auto &Arg : Args)
+  for (auto &Arg : F.args())
     if (!isSupportedType(DL, TLI, Arg.getType()))
       return false;
 
@@ -195,10 +342,10 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
 
   SmallVector<ArgInfo, 8> ArgInfos;
   unsigned Idx = 0;
-  for (auto &Arg : Args) {
+  for (auto &Arg : F.args()) {
     ArgInfo AInfo(VRegs[Idx], Arg.getType());
     setArgFlags(AInfo, Idx + 1, DL, F);
-    ArgInfos.push_back(AInfo);
+    splitToValueTypes(AInfo, ArgInfos, DL, MF.getRegInfo());
     Idx++;
   }
 
@@ -206,3 +353,82 @@ bool ARMCallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
                               AssignFn);
   return handleAssignments(MIRBuilder, ArgInfos, ArgHandler);
 }
+
+namespace {
+struct CallReturnHandler : public IncomingValueHandler {
+  CallReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+                    MachineInstrBuilder MIB, CCAssignFn *AssignFn)
+      : IncomingValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+  // Records PhysReg as an implicit def on the instruction held in MIB.
+  void markPhysRegUsed(unsigned PhysReg) override {
+    MIB.addDef(PhysReg, RegState::Implicit);
+  }
+  // Builder for the instruction that receives the implicit defs.
+  MachineInstrBuilder MIB;
+};
+} // End anonymous namespace.
+
+bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
+                                CallingConv::ID CallConv,
+                                const MachineOperand &Callee,
+                                const ArgInfo &OrigRet,
+                                ArrayRef<ArgInfo> OrigArgs) const {
+  MachineFunction &MF = MIRBuilder.getMF();
+  const auto &TLI = *getTLI<ARMTargetLowering>();
+  const auto &DL = MF.getDataLayout();
+  const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  // Give up on subtargets that generate long calls.
+  if (MF.getSubtarget<ARMSubtarget>().genLongCalls())
+    return false;
+  // Open the call sequence; its operands are filled in at the end.
+  auto CallSeqStart = MIRBuilder.buildInstr(ARM::ADJCALLSTACKDOWN);
+
+  // Create the call instruction now so that the argument handler can attach
+  // operands to it, but don't insert it into the block yet.
+  auto MIB = MIRBuilder.buildInstrNoInsert(ARM::BLX).add(Callee).addRegMask(
+      TRI->getCallPreservedMask(MF, CallConv));
+  // Reject unsupported or non-fixed arguments; convert the remaining ones.
+  SmallVector<ArgInfo, 8> ArgInfos;
+  for (auto Arg : OrigArgs) {
+    if (!isSupportedType(DL, TLI, Arg.Ty))
+      return false;
+
+    if (!Arg.IsFixed)
+      return false;
+
+    splitToValueTypes(Arg, ArgInfos, DL, MRI);
+  }
+  // Assign every argument using the assignment function for this call CC.
+  auto ArgAssignFn = TLI.CCAssignFnForCall(CallConv, /*IsVarArg=*/false);
+  OutgoingValueHandler ArgHandler(MIRBuilder, MRI, MIB, ArgAssignFn);
+  if (!handleAssignments(MIRBuilder, ArgInfos, ArgHandler))
+    return false;
+
+  // Now we can add the actual call instruction to the correct basic block.
+  MIRBuilder.insertInstr(MIB);
+  // A non-void result is received using the return assignment function.
+  if (!OrigRet.Ty->isVoidTy()) {
+    if (!isSupportedType(DL, TLI, OrigRet.Ty))
+      return false;
+
+    ArgInfos.clear();
+    splitToValueTypes(OrigRet, ArgInfos, DL, MRI);
+
+    auto RetAssignFn = TLI.CCAssignFnForReturn(CallConv, /*IsVarArg=*/false);
+    CallReturnHandler RetHandler(MIRBuilder, MRI, MIB, RetAssignFn);
+    if (!handleAssignments(MIRBuilder, ArgInfos, RetHandler))
+      return false;
+  }
+
+  // ArgHandler.StackSize is final once all arguments have been assigned, so
+  // the ADJCALLSTACKDOWN built earlier can be given its operands now.
+  CallSeqStart.addImm(ArgHandler.StackSize).add(predOps(ARMCC::AL));
+
+  MIRBuilder.buildInstr(ARM::ADJCALLSTACKUP)
+      .addImm(ArgHandler.StackSize)
+      .addImm(0)
+      .add(predOps(ARMCC::AL));
+
+  return true;
+}
diff --git a/lib/Target/ARM/ARMCallLowering.h b/lib/Target/ARM/ARMCallLowering.h
index 6a1b886b501fcf151445136a4d262950ffa66fbb..6404c7a2689eedc60ab8ca6de8f67428827f6573 100644
--- a/lib/Target/ARM/ARMCallLowering.h
+++ b/lib/Target/ARM/ARMCallLowering.h
@@ -34,9 +34,19 @@ public:
   bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
                             ArrayRef<unsigned> VRegs) const override;
 
+  bool lowerCall(MachineIRBuilder &MIRBuilder, CallingConv::ID CallConv,
+                 const MachineOperand &Callee, const ArgInfo &OrigRet,
+                 ArrayRef<ArgInfo> OrigArgs) const override;
+
 private:
   bool lowerReturnVal(MachineIRBuilder &MIRBuilder, const Value *Val,
                       unsigned VReg, MachineInstrBuilder &Ret) const;
+
+  /// Split an argument into one or more arguments that the CC lowering can cope
+  /// with (e.g. replace pointers with integers).
+  void splitToValueTypes(const ArgInfo &OrigArg,
+                         SmallVectorImpl<ArgInfo> &SplitArgs,
+                         const DataLayout &DL, MachineRegisterInfo &MRI) const;
 };
 } // End of namespace llvm
 #endif
diff --git a/lib/Target/ARM/ARMCallingConv.td b/lib/Target/ARM/ARMCallingConv.td
index 9c278a52a7ff7565224355ecf411a3aa6790146d..7a7b7fede7c832fbe1c00ea17ce07378526d6fb9 100644
--- a/lib/Target/ARM/ARMCallingConv.td
+++ b/lib/Target/ARM/ARMCallingConv.td
@@ -26,8 +26,8 @@ def CC_ARM_APCS : CallingConv<[
   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
 
-  // A SwiftError is passed in R6.
-  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+  // A SwiftError is passed in R8.
+  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R8]>>>,
 
   // Handle all vector types as either f64 or v2f64.
   CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
@@ -51,8 +51,8 @@ def RetCC_ARM_APCS : CallingConv<[
   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
 
-  // A SwiftError is returned in R6.
-  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+  // A SwiftError is returned in R8.
+  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R8]>>>,
 
   // Handle all vector types as either f64 or v2f64.
   CCIfType<[v1i64, v2i32, v4i16, v8i8, v2f32], CCBitConvertToType<f64>>,
@@ -166,8 +166,8 @@ def CC_ARM_AAPCS : CallingConv<[
   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
 
-  // A SwiftError is passed in R6.
-  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+  // A SwiftError is passed in R8.
+  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R8]>>>,
 
   CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
   CCIfType<[f32], CCBitConvertToType<i32>>,
@@ -182,8 +182,8 @@ def RetCC_ARM_AAPCS : CallingConv<[
   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
 
-  // A SwiftError is returned in R6.
-  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+  // A SwiftError is returned in R8.
+  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R8]>>>,
 
   CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
   CCIfType<[f32], CCBitConvertToType<i32>>,
@@ -206,8 +206,8 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
 
-  // A SwiftError is passed in R6.
-  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+  // A SwiftError is passed in R8.
+  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R8]>>>,
 
   // HFAs are passed in a contiguous block of registers, or on the stack
   CCIfConsecutiveRegs<CCCustom<"CC_ARM_AAPCS_Custom_Aggregate">>,
@@ -227,8 +227,8 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[
   // Pass SwiftSelf in a callee saved register.
   CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
 
-  // A SwiftError is returned in R6.
-  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R6]>>>,
+  // A SwiftError is returned in R8.
+  CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R8]>>>,
 
   CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
   CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
@@ -267,8 +267,8 @@ def CSR_AAPCS_ThisReturn : CalleeSavedRegs<(add LR, R11, R10, R9, R8, R7, R6,
 // Also save R7-R4 first to match the stack frame fixed spill areas.
 def CSR_iOS : CalleeSavedRegs<(add LR, R7, R6, R5, R4, (sub CSR_AAPCS, R9))>;
 
-// R6 is used to pass swifterror, remove it from CSR.
-def CSR_iOS_SwiftError : CalleeSavedRegs<(sub CSR_iOS, R6)>;
+// R8 is used to pass swifterror, remove it from CSR.
+def CSR_iOS_SwiftError : CalleeSavedRegs<(sub CSR_iOS, R8)>;
 
 def CSR_iOS_ThisReturn : CalleeSavedRegs<(add LR, R7, R6, R5, R4,
                                          (sub CSR_AAPCS_ThisReturn, R9))>;
diff --git a/lib/Target/ARM/ARMConstantIslandPass.cpp b/lib/Target/ARM/ARMConstantIslandPass.cpp
index 46262fd138c58a298fe2e3bb7b0b3f02f0befa1f..23722f1b7f3ff50b0abb398e33aff04a26760aa0 100644
--- a/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -56,6 +56,8 @@ using namespace llvm;
 
 #define DEBUG_TYPE "arm-cp-islands"
 
+#define ARM_CP_ISLANDS_OPT_NAME \
+  "ARM constant island placement and branch shortening pass"
 STATISTIC(NumCPEs,       "Number of constpool entries");
 STATISTIC(NumSplit,      "Number of uncond branches inserted");
 STATISTIC(NumCBrFixed,   "Number of cond branches fixed");
@@ -230,7 +232,7 @@ namespace {
     }
 
     StringRef getPassName() const override {
-      return "ARM constant island placement and branch shortening pass";
+      return ARM_CP_ISLANDS_OPT_NAME;
     }
 
   private:
@@ -803,6 +805,7 @@ initializeFunctionInfo(const std::vector<MachineInstr*> &CPEMIs) {
           case ARM::LDRcp:
           case ARM::t2LDRpci:
           case ARM::t2LDRHpci:
+          case ARM::t2LDRBpci:
             Bits = 12;  // +-offset_12
             NegOk = true;
             break;
@@ -1739,6 +1742,13 @@ bool ARMConstantIslands::undoLRSpillRestore() {
       MI->eraseFromParent();
       MadeChange = true;
     }
+    // MI may have been erased just above, so it is only inspected on 'else'.
+    else if (MI->getOpcode() == ARM::tPUSH &&
+             MI->getOperand(2).getReg() == ARM::LR &&
+             MI->getNumExplicitOperands() == 3) {
+      MI->eraseFromParent(); // Just remove the push.
+      MadeChange = true;
+    }
   }
   return MadeChange;
 }
@@ -2007,6 +2017,54 @@ static bool jumpTableFollowsTB(MachineInstr *JTMI, MachineInstr *CPEMI) {
          &*MBB->begin() == CPEMI;
 }
 
+static void RemoveDeadAddBetweenLEAAndJT(MachineInstr *LEAMI,
+                                         MachineInstr *JumpMI,
+                                         unsigned &DeadSize) {
+  // The jump table now indexes off PC, so a t2ADDrs that used to compute its
+  // entry register may be dead. Erase the last such add between LEAMI and
+  // JumpMI, provided nothing after it touches that register.
+  const unsigned EntryReg = JumpMI->getOperand(0).getReg();
+  MachineInstr *LastAdd = nullptr;
+
+  // Remember the last t2ADDrs that defines EntryReg.
+  MachineBasicBlock::iterator Scan(LEAMI);
+  for (++Scan; &*Scan != JumpMI; ++Scan) {
+    MachineInstr &Cand = *Scan;
+    if (Cand.getOpcode() == ARM::t2ADDrs &&
+        Cand.getOperand(0).getReg() == EntryReg)
+      LastAdd = &Cand;
+  }
+
+  if (!LastAdd)
+    return;
+
+  // Bail out if any later instruction reads or writes EntryReg.
+  MachineBasicBlock::iterator Rest(LastAdd);
+  for (++Rest; &*Rest != JumpMI; ++Rest) {
+    for (const MachineOperand &MO : Rest->operands()) {
+      // A register operand is either a def or a use, so any mention of
+      // EntryReg (ignoring the null register) blocks the removal.
+      if (MO.isReg() && MO.getReg() && MO.getReg() == EntryReg)
+        return;
+    }
+  }
+
+  DEBUG(dbgs() << "Removing Dead Add: " << *LastAdd);
+  LastAdd->eraseFromParent();
+  // Account for the erased add in the caller's DeadSize tally.
+  DeadSize += 4;
+}
+
+static bool registerDefinedBetween(unsigned Reg,
+                                   MachineBasicBlock::iterator From,
+                                   MachineBasicBlock::iterator To,
+                                   const TargetRegisterInfo *TRI) {
+  // Walk the half-open range [From, To) looking for a write to Reg.
+  for (; From != To; ++From)
+    if (From->modifiesRegister(Reg, TRI)) return true;
+  return false;
+}
+
 /// optimizeThumb2JumpTables - Use tbb / tbh instructions to generate smaller
 /// jumptables when it's possible.
 bool ARMConstantIslands::optimizeThumb2JumpTables() {
@@ -2084,6 +2142,12 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
       IdxReg = Shift->getOperand(2).getReg();
       unsigned ShiftedIdxReg = Shift->getOperand(0).getReg();
 
+      // It's important that IdxReg is live until the actual TBB/TBH. Most of
+      // the range is checked later, but the LEA might still clobber it and not
+      // actually get removed.
+      if (BaseReg == IdxReg && !jumpTableFollowsTB(MI, User.CPEMI))
+        continue;
+
       MachineInstr *Load = User.MI->getNextNode();
       if (Load->getOpcode() != ARM::tLDRr)
         continue;
@@ -2093,6 +2157,7 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
         continue;
 
       // If we're in PIC mode, there should be another ADD following.
+      auto *TRI = STI->getRegisterInfo();
       if (isPositionIndependentOrROPI) {
         MachineInstr *Add = Load->getNextNode();
         if (Add->getOpcode() != ARM::tADDrr ||
@@ -2102,21 +2167,26 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
           continue;
         if (Add->getOperand(0).getReg() != MI->getOperand(0).getReg())
           continue;
-
+        if (registerDefinedBetween(IdxReg, Add->getNextNode(), MI, TRI))
+          // IdxReg gets redefined in the middle of the sequence.
+          continue;
         Add->eraseFromParent();
         DeadSize += 2;
       } else {
         if (Load->getOperand(0).getReg() != MI->getOperand(0).getReg())
           continue;
+        if (registerDefinedBetween(IdxReg, Load->getNextNode(), MI, TRI))
+          // IdxReg gets redefined in the middle of the sequence.
+          continue;
       }
-      
+
       // Now safe to delete the load and lsl. The LEA will be removed later.
       CanDeleteLEA = true;
       Shift->eraseFromParent();
       Load->eraseFromParent();
       DeadSize += 4;
     }
-    
+
     DEBUG(dbgs() << "Shrink JT: " << *MI);
     MachineInstr *CPEMI = User.CPEMI;
     unsigned Opc = ByteOk ? ARM::t2TBB_JT : ARM::t2TBH_JT;
@@ -2140,7 +2210,10 @@ bool ARMConstantIslands::optimizeThumb2JumpTables() {
       NewJTMI->getOperand(0).setReg(ARM::PC);
       NewJTMI->getOperand(0).setIsKill(false);
 
-      if (CanDeleteLEA)  {
+      if (CanDeleteLEA) {
+        if (isThumb2)
+          RemoveDeadAddBetweenLEAAndJT(User.MI, MI, DeadSize);
+
         User.MI->eraseFromParent();
         DeadSize += isThumb2 ? 4 : 2;
 
@@ -2283,3 +2356,6 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
 FunctionPass *llvm::createARMConstantIslandPass() {
   return new ARMConstantIslands();
 }
+
+INITIALIZE_PASS(ARMConstantIslands, "arm-cp-islands", ARM_CP_ISLANDS_OPT_NAME,
+                false, false)
diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 78a9144bd321407764cf682592c54324e2f99f99..e0aecff2633b4fddf1111e1f8fea508a7e22ab8b 100644
--- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -661,6 +661,7 @@ static bool IsAnAddressOperand(const MachineOperand &MO) {
     return false;
   case MachineOperand::MO_IntrinsicID:
   case MachineOperand::MO_Predicate:
+  case MachineOperand::MO_Placeholder:
     llvm_unreachable("should not exist post-isel");
   }
   llvm_unreachable("unhandled machine operand type");
diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp
index 3f510aa1b6580553968cf51b958079467f8eb427..6638edfa05c3b523baf47acfb81a931372899c8f 100644
--- a/lib/Target/ARM/ARMFastISel.cpp
+++ b/lib/Target/ARM/ARMFastISel.cpp
@@ -187,7 +187,7 @@ class ARMFastISel final : public FastISel {
     bool isTypeLegal(Type *Ty, MVT &VT);
     bool isLoadTypeLegal(Type *Ty, MVT &VT);
     bool ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
-                    bool isZExt);
+                    bool isZExt, bool isEquality);
     bool ARMEmitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
                      unsigned Alignment = 0, bool isZExt = true,
                      bool allocReg = true);
@@ -1256,7 +1256,8 @@ bool ARMFastISel::SelectBranch(const Instruction *I) {
       if (ARMPred == ARMCC::AL) return false;
 
       // Emit the compare.
-      if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+      if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
+                      CI->isEquality()))
         return false;
 
       unsigned BrOpc = isThumb2 ? ARM::t2Bcc : ARM::Bcc;
@@ -1343,14 +1344,16 @@ bool ARMFastISel::SelectIndirectBr(const Instruction *I) {
 }
 
 bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
-                             bool isZExt) {
+                             bool isZExt, bool isEquality) {
   Type *Ty = Src1Value->getType();
   EVT SrcEVT = TLI.getValueType(DL, Ty, true);
   if (!SrcEVT.isSimple()) return false;
   MVT SrcVT = SrcEVT.getSimpleVT();
 
-  bool isFloat = (Ty->isFloatTy() || Ty->isDoubleTy());
-  if (isFloat && !Subtarget->hasVFP2())
+  if (Ty->isFloatTy() && !Subtarget->hasVFP2())
+    return false;
+
+  if (Ty->isDoubleTy() && (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()))
     return false;
 
   // Check to see if the 2nd operand is a constant that we can encode directly
@@ -1389,10 +1392,18 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value,
     // TODO: Verify compares.
     case MVT::f32:
       isICmp = false;
-      CmpOpc = UseImm ? ARM::VCMPEZS : ARM::VCMPES;
+      // Equality comparisons shouldn't raise Invalid on unordered inputs.
+      if (isEquality)
+        CmpOpc = UseImm ? ARM::VCMPZS : ARM::VCMPS;
+      else
+        CmpOpc = UseImm ? ARM::VCMPEZS : ARM::VCMPES;
       break;
     case MVT::f64:
       isICmp = false;
+      // Equality comparisons shouldn't raise Invalid on unordered inputs.
+      if (isEquality)
+        CmpOpc = UseImm ? ARM::VCMPZD : ARM::VCMPD;
+      else
-      CmpOpc = UseImm ? ARM::VCMPEZD : ARM::VCMPED;
+        CmpOpc = UseImm ? ARM::VCMPEZD : ARM::VCMPED;
       break;
     case MVT::i1:
@@ -1469,7 +1480,8 @@ bool ARMFastISel::SelectCmp(const Instruction *I) {
   if (ARMPred == ARMCC::AL) return false;
 
   // Emit the compare.
-  if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned()))
+  if (!ARMEmitCmp(CI->getOperand(0), CI->getOperand(1), CI->isUnsigned(),
+                  CI->isEquality()))
     return false;
 
   // Now set a register based on the comparison. Explicitly set the predicates
@@ -1491,7 +1503,7 @@ bool ARMFastISel::SelectCmp(const Instruction *I) {
 
 bool ARMFastISel::SelectFPExt(const Instruction *I) {
   // Make sure we have VFP and that we're extending float to double.
-  if (!Subtarget->hasVFP2()) return false;
+  if (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()) return false;
 
   Value *V = I->getOperand(0);
   if (!I->getType()->isDoubleTy() ||
@@ -1510,7 +1522,7 @@ bool ARMFastISel::SelectFPExt(const Instruction *I) {
 
 bool ARMFastISel::SelectFPTrunc(const Instruction *I) {
   // Make sure we have VFP and that we're truncating double to float.
-  if (!Subtarget->hasVFP2()) return false;
+  if (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()) return false;
 
   Value *V = I->getOperand(0);
   if (!(I->getType()->isFloatTy() &&
@@ -1561,7 +1573,8 @@ bool ARMFastISel::SelectIToFP(const Instruction *I, bool isSigned) {
 
   unsigned Opc;
   if (Ty->isFloatTy()) Opc = isSigned ? ARM::VSITOS : ARM::VUITOS;
-  else if (Ty->isDoubleTy()) Opc = isSigned ? ARM::VSITOD : ARM::VUITOD;
+  else if (Ty->isDoubleTy() && !Subtarget->isFPOnlySP())
+    Opc = isSigned ? ARM::VSITOD : ARM::VUITOD;
   else return false;
 
   unsigned ResultReg = createResultReg(TLI.getRegClassFor(DstVT));
@@ -1586,7 +1599,8 @@ bool ARMFastISel::SelectFPToI(const Instruction *I, bool isSigned) {
   unsigned Opc;
   Type *OpTy = I->getOperand(0)->getType();
   if (OpTy->isFloatTy()) Opc = isSigned ? ARM::VTOSIZS : ARM::VTOUIZS;
-  else if (OpTy->isDoubleTy()) Opc = isSigned ? ARM::VTOSIZD : ARM::VTOUIZD;
+  else if (OpTy->isDoubleTy() && !Subtarget->isFPOnlySP())
+    Opc = isSigned ? ARM::VTOSIZD : ARM::VTOUIZD;
   else return false;
 
   // f64->s32/u32 or f32->s32/u32 both need an intermediate f32 reg.
@@ -1790,8 +1804,9 @@ bool ARMFastISel::SelectBinaryFPOp(const Instruction *I, unsigned ISDOpcode) {
   // if we have them.
   // FIXME: It'd be nice to use NEON instructions.
   Type *Ty = I->getType();
-  bool isFloat = (Ty->isDoubleTy() || Ty->isFloatTy());
-  if (isFloat && !Subtarget->hasVFP2())
+  if (Ty->isFloatTy() && !Subtarget->hasVFP2())
+    return false;
+  if (Ty->isDoubleTy() && (!Subtarget->hasVFP2() || Subtarget->isFPOnlySP()))
     return false;
 
   unsigned Opc;
diff --git a/lib/Target/ARM/ARMFeatures.h b/lib/Target/ARM/ARMFeatures.h
index 0c910ab6130f2d4167f9eb367a33675ef9a62f4c..8c0df4c2cbf9498a888ebb9a5453e4c7190ca455 100644
--- a/lib/Target/ARM/ARMFeatures.h
+++ b/lib/Target/ARM/ARMFeatures.h
@@ -19,10 +19,10 @@
 namespace llvm {
 
 template<typename InstrType> // could be MachineInstr or MCInst
-bool IsCPSRDead(InstrType *Instr);
+bool IsCPSRDead(const InstrType *Instr);
 
 template<typename InstrType> // could be MachineInstr or MCInst
-inline bool isV8EligibleForIT(InstrType *Instr) {
+inline bool isV8EligibleForIT(const InstrType *Instr) {
   switch (Instr->getOpcode()) {
   default:
     return false;
diff --git a/lib/Target/ARM/ARMFrameLowering.cpp b/lib/Target/ARM/ARMFrameLowering.cpp
index 63c608dde95a46a7138ef42c7d6636ed42faeae0..37be22bed54087b05a36a3c47e470c3b4f6d6e97 100644
--- a/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/lib/Target/ARM/ARMFrameLowering.cpp
@@ -1700,13 +1700,14 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
   //        worth the effort and added fragility?
   unsigned EstimatedStackSize =
       MFI.estimateStackSize(MF) + 4 * (NumGPRSpills + NumFPRSpills);
-  if (hasFP(MF)) {
+  bool HasFP = hasFP(MF);
+  if (HasFP) {
     if (AFI->hasStackFrame())
       EstimatedStackSize += 4;
   } else {
     // If FP is not used, SP will be used to access arguments, so count the
     // size of arguments into the estimation.
-    EstimatedStackSize += MF.getInfo<ARMFunctionInfo>()->getArgumentStackSize();
+    EstimatedStackSize += AFI->getArgumentStackSize();
   }
   EstimatedStackSize += 16; // For possible paddings.
 
@@ -1717,7 +1718,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
   if (BigStack || !CanEliminateFrame || RegInfo->cannotEliminateFrame(MF)) {
     AFI->setHasStackFrame(true);
 
-    if (hasFP(MF)) {
+    if (HasFP) {
       SavedRegs.set(FramePtr);
       // If the frame pointer is required by the ABI, also spill LR so that we
       // emit a complete frame record.
@@ -1788,7 +1789,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
       }
 
       // r7 can be used if it is not being used as the frame pointer.
-      if (!hasFP(MF)) {
+      if (!HasFP) {
         if (SavedRegs.test(ARM::R7)) {
           --RegDeficit;
           DEBUG(dbgs() << "%R7 is saved low register, RegDeficit = "
diff --git a/lib/Target/ARM/ARMISelDAGToDAG.cpp b/lib/Target/ARM/ARMISelDAGToDAG.cpp
index c3e9591d5c7050e8b39002f02525d83a199f562f..b07b4e1f5cfbdc43df0c4d4ab4c1892ca4f0555f 100644
--- a/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -244,11 +244,8 @@ private:
 
   bool tryInlineAsm(SDNode *N);
 
-  void SelectConcatVector(SDNode *N);
   void SelectCMPZ(SDNode *N, bool &SwitchEQNEToPLMI);
 
-  bool trySMLAWSMULW(SDNode *N);
-
   void SelectCMP_SWAP(SDNode *N);
 
   /// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
@@ -2559,141 +2556,6 @@ bool ARMDAGToDAGISel::tryABSOp(SDNode *N){
   return false;
 }
 
-static bool SearchSignedMulShort(SDValue SignExt, unsigned *Opc, SDValue &Src1,
-                                 bool Accumulate) {
-  // For SM*WB, we need to some form of sext.
-  // For SM*WT, we need to search for (sra X, 16)
-  // Src1 then gets set to X.
-  if ((SignExt.getOpcode() == ISD::SIGN_EXTEND ||
-       SignExt.getOpcode() == ISD::SIGN_EXTEND_INREG ||
-       SignExt.getOpcode() == ISD::AssertSext) &&
-       SignExt.getValueType() == MVT::i32) {
-
-    *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
-    Src1 = SignExt.getOperand(0);
-    return true;
-  }
-
-  if (SignExt.getOpcode() != ISD::SRA)
-    return false;
-
-  ConstantSDNode *SRASrc1 = dyn_cast<ConstantSDNode>(SignExt.getOperand(1));
-  if (!SRASrc1 || SRASrc1->getZExtValue() != 16)
-    return false;
-
-  SDValue Op0 = SignExt.getOperand(0);
-
-  // The sign extend operand for SM*WB could be generated by a shl and ashr.
-  if (Op0.getOpcode() == ISD::SHL) {
-    SDValue SHL = Op0;
-    ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
-    if (!SHLSrc1 || SHLSrc1->getZExtValue() != 16)
-      return false;
-
-    *Opc = Accumulate ? ARM::SMLAWB : ARM::SMULWB;
-    Src1 = Op0.getOperand(0);
-    return true;
-  }
-  *Opc = Accumulate ? ARM::SMLAWT : ARM::SMULWT;
-  Src1 = SignExt.getOperand(0);
-  return true;
-}
-
-static bool SearchSignedMulLong(SDValue OR, unsigned *Opc, SDValue &Src0,
-                                SDValue &Src1, bool Accumulate) {
-  // First we look for:
-  // (add (or (srl ?, 16), (shl ?, 16)))
-  if (OR.getOpcode() != ISD::OR)
-    return false;
-
-  SDValue SRL = OR.getOperand(0);
-  SDValue SHL = OR.getOperand(1);
-
-  if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
-    SRL = OR.getOperand(1);
-    SHL = OR.getOperand(0);
-    if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL)
-      return false;
-  }
-
-  ConstantSDNode *SRLSrc1 = dyn_cast<ConstantSDNode>(SRL.getOperand(1));
-  ConstantSDNode *SHLSrc1 = dyn_cast<ConstantSDNode>(SHL.getOperand(1));
-  if (!SRLSrc1 || !SHLSrc1 || SRLSrc1->getZExtValue() != 16 ||
-      SHLSrc1->getZExtValue() != 16)
-    return false;
-
-  // The first operands to the shifts need to be the two results from the
-  // same smul_lohi node.
-  if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
-       SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
-    return false;
-
-  SDNode *SMULLOHI = SRL.getOperand(0).getNode();
-  if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
-      SHL.getOperand(0) != SDValue(SMULLOHI, 1))
-    return false;
-
-  // Now we have:
-  // (add (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16)))
-  // For SMLAW[B|T] smul_lohi will take a 32-bit and a 16-bit arguments.
-  // For SMLAWB the 16-bit value will signed extended somehow.
-  // For SMLAWT only the SRA is required.
-
-  // Check both sides of SMUL_LOHI
-  if (SearchSignedMulShort(SMULLOHI->getOperand(0), Opc, Src1, Accumulate)) {
-    Src0 = SMULLOHI->getOperand(1);
-  } else if (SearchSignedMulShort(SMULLOHI->getOperand(1), Opc, Src1,
-                                  Accumulate)) {
-    Src0 = SMULLOHI->getOperand(0);
-  } else {
-    return false;
-  }
-  return true;
-}
-
-bool ARMDAGToDAGISel::trySMLAWSMULW(SDNode *N) {
-  if (!Subtarget->hasV6Ops() ||
-      (Subtarget->isThumb() && !Subtarget->hasThumb2()))
-    return false;
-
-  SDLoc dl(N);
-  SDValue Src0 = N->getOperand(0);
-  SDValue Src1 = N->getOperand(1);
-  SDValue A, B;
-  unsigned Opc = 0;
-
-  if (N->getOpcode() == ISD::ADD) {
-    if (Src0.getOpcode() != ISD::OR && Src1.getOpcode() != ISD::OR)
-      return false;
-
-    SDValue Acc;
-    if (SearchSignedMulLong(Src0, &Opc, A, B, true)) {
-      Acc = Src1;
-    } else if (SearchSignedMulLong(Src1, &Opc, A, B, true)) {
-      Acc = Src0;
-    } else {
-      return false;
-    }
-    if (Opc == 0)
-      return false;
-
-    SDValue Ops[] = { A, B, Acc, getAL(CurDAG, dl),
-                      CurDAG->getRegister(0, MVT::i32) };
-    CurDAG->SelectNodeTo(N, Opc, MVT::i32, MVT::Other, Ops);
-    return true;
-  } else if (N->getOpcode() == ISD::OR &&
-             SearchSignedMulLong(SDValue(N, 0), &Opc, A, B, false)) {
-    if (Opc == 0)
-      return false;
-
-    SDValue Ops[] = { A, B, getAL(CurDAG, dl),
-                      CurDAG->getRegister(0, MVT::i32)};
-    CurDAG->SelectNodeTo(N, Opc, MVT::i32, Ops);
-    return true;
-  }
-  return false;
-}
-
 /// We've got special pseudo-instructions for these
 void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
   unsigned Opcode;
@@ -2722,15 +2584,6 @@ void ARMDAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
   CurDAG->RemoveDeadNode(N);
 }
 
-void ARMDAGToDAGISel::SelectConcatVector(SDNode *N) {
-  // The only time a CONCAT_VECTORS operation can have legal types is when
-  // two 64-bit vectors are concatenated to a 128-bit vector.
-  EVT VT = N->getValueType(0);
-  if (!VT.is128BitVector() || N->getNumOperands() != 2)
-    llvm_unreachable("unexpected CONCAT_VECTORS");
-  ReplaceNode(N, createDRegPairNode(VT, N->getOperand(0), N->getOperand(1)));
-}
-
 static Optional<std::pair<unsigned, unsigned>>
 getContiguousRangeOfSetBits(const APInt &A) {
   unsigned FirstOne = A.getBitWidth() - A.countLeadingZeros() - 1;
@@ -2822,11 +2675,6 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
 
   switch (N->getOpcode()) {
   default: break;
-  case ISD::ADD:
-  case ISD::OR:
-    if (trySMLAWSMULW(N))
-      return;
-    break;
   case ISD::WRITE_REGISTER:
     if (tryWriteRegister(N))
       return;
@@ -3042,49 +2890,6 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
 
     break;
   }
-  case ARMISD::VMOVRRD:
-    ReplaceNode(N, CurDAG->getMachineNode(ARM::VMOVRRD, dl, MVT::i32, MVT::i32,
-                                          N->getOperand(0), getAL(CurDAG, dl),
-                                          CurDAG->getRegister(0, MVT::i32)));
-    return;
-  case ISD::UMUL_LOHI: {
-    if (Subtarget->isThumb1Only())
-      break;
-    if (Subtarget->isThumb()) {
-      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
-                        getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
-      ReplaceNode(
-          N, CurDAG->getMachineNode(ARM::t2UMULL, dl, MVT::i32, MVT::i32, Ops));
-      return;
-    } else {
-      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
-                        getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
-                        CurDAG->getRegister(0, MVT::i32) };
-      ReplaceNode(N, CurDAG->getMachineNode(
-                         Subtarget->hasV6Ops() ? ARM::UMULL : ARM::UMULLv5, dl,
-                         MVT::i32, MVT::i32, Ops));
-      return;
-    }
-  }
-  case ISD::SMUL_LOHI: {
-    if (Subtarget->isThumb1Only())
-      break;
-    if (Subtarget->isThumb()) {
-      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
-                        getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32) };
-      ReplaceNode(
-          N, CurDAG->getMachineNode(ARM::t2SMULL, dl, MVT::i32, MVT::i32, Ops));
-      return;
-    } else {
-      SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
-                        getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
-                        CurDAG->getRegister(0, MVT::i32) };
-      ReplaceNode(N, CurDAG->getMachineNode(
-                         Subtarget->hasV6Ops() ? ARM::SMULL : ARM::SMULLv5, dl,
-                         MVT::i32, MVT::i32, Ops));
-      return;
-    }
-  }
   case ARMISD::UMAAL: {
     unsigned Opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL;
     SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
@@ -3095,38 +2900,6 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     return;
   }
   case ARMISD::UMLAL:{
-    // UMAAL is similar to UMLAL but it adds two 32-bit values to the
-    // 64-bit multiplication result.
-    if (Subtarget->hasV6Ops() && Subtarget->hasDSP() &&
-        N->getOperand(2).getOpcode() == ARMISD::ADDC &&
-        N->getOperand(3).getOpcode() == ARMISD::ADDE) {
-
-      SDValue Addc = N->getOperand(2);
-      SDValue Adde = N->getOperand(3);
-
-      if (Adde.getOperand(2).getNode() == Addc.getNode()) {
-
-        ConstantSDNode *Op0 = dyn_cast<ConstantSDNode>(Adde.getOperand(0));
-        ConstantSDNode *Op1 = dyn_cast<ConstantSDNode>(Adde.getOperand(1));
-
-        if (Op0 && Op1 && Op0->getZExtValue() == 0 && Op1->getZExtValue() == 0)
-        {
-          // Select UMAAL instead: UMAAL RdLo, RdHi, Rn, Rm
-          // RdLo = one operand to be added, lower 32-bits of res
-          // RdHi = other operand to be added, upper 32-bits of res
-          // Rn = first multiply operand
-          // Rm = second multiply operand
-          SDValue Ops[] = { N->getOperand(0), N->getOperand(1),
-                            Addc.getOperand(0), Addc.getOperand(1),
-                            getAL(CurDAG, dl),
-                            CurDAG->getRegister(0, MVT::i32) };
-          unsigned opc = Subtarget->isThumb() ? ARM::t2UMAAL : ARM::UMAAL;
-          CurDAG->SelectNodeTo(N, opc, MVT::i32, MVT::i32, Ops);
-          return;
-        }
-      }
-    }
-
     if (Subtarget->isThumb()) {
       SDValue Ops[] = { N->getOperand(0), N->getOperand(1), N->getOperand(2),
                         N->getOperand(3), getAL(CurDAG, dl),
@@ -3277,26 +3050,23 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
       int64_t Addend = -C->getSExtValue();
 
       SDNode *Add = nullptr;
-      // In T2 mode, ADDS can be better than CMN if the immediate fits in a
+      // ADDS can be better than CMN if the immediate fits in a
       // 16-bit ADDS, which means either [0,256) for tADDi8 or [0,8) for tADDi3.
       // Outside that range we can just use a CMN which is 32-bit but has a
       // 12-bit immediate range.
-      if (Subtarget->isThumb2() && Addend < 1<<8) {
-        SDValue Ops[] = { X, CurDAG->getTargetConstant(Addend, dl, MVT::i32),
-                          getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
-                          CurDAG->getRegister(0, MVT::i32) };
-        Add = CurDAG->getMachineNode(ARM::t2ADDri, dl, MVT::i32, Ops);
-      } else if (!Subtarget->isThumb2() && Addend < 1<<8) {
-        // FIXME: Add T1 tADDi8 code.
-        SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32), X,
-                         CurDAG->getTargetConstant(Addend, dl, MVT::i32),
-                         getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)};
-        Add = CurDAG->getMachineNode(ARM::tADDi8, dl, MVT::i32, Ops);
-      } else if (!Subtarget->isThumb2() && Addend < 1<<3) {
-        SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32), X,
-                         CurDAG->getTargetConstant(Addend, dl, MVT::i32),
-                         getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)};
-        Add = CurDAG->getMachineNode(ARM::tADDi3, dl, MVT::i32, Ops);
+      if (Addend < 1<<8) {
+        if (Subtarget->isThumb2()) {
+          SDValue Ops[] = { X, CurDAG->getTargetConstant(Addend, dl, MVT::i32),
+                            getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32),
+                            CurDAG->getRegister(0, MVT::i32) };
+          Add = CurDAG->getMachineNode(ARM::t2ADDri, dl, MVT::i32, Ops);
+        } else {
+          unsigned Opc = (Addend < 1<<3) ? ARM::tADDi3 : ARM::tADDi8;
+          SDValue Ops[] = {CurDAG->getRegister(ARM::CPSR, MVT::i32), X,
+                           CurDAG->getTargetConstant(Addend, dl, MVT::i32),
+                           getAL(CurDAG, dl), CurDAG->getRegister(0, MVT::i32)};
+          Add = CurDAG->getMachineNode(Opc, dl, MVT::i32, Ops);
+        }
       }
       if (Add) {
         SDValue Ops2[] = {SDValue(Add, 0), CurDAG->getConstant(0, dl, MVT::i32)};
@@ -4013,10 +3783,6 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
     return;
   }
 
-  case ISD::CONCAT_VECTORS:
-    SelectConcatVector(N);
-    return;
-
   case ISD::ATOMIC_CMP_SWAP:
     SelectCMP_SWAP(N);
     return;
@@ -4123,11 +3889,10 @@ static inline int getMClassRegisterSYSmValueMask(StringRef RegString) {
 // The flags here are common to those allowed for apsr in the A class cores and
 // those allowed for the special registers in the M class cores. Returns a
 // value representing which flags were present, -1 if invalid.
-static inline int getMClassFlagsMask(StringRef Flags, bool hasDSP) {
-  if (Flags.empty())
-    return 0x2 | (int)hasDSP;
-
+static inline int getMClassFlagsMask(StringRef Flags) {
   return StringSwitch<int>(Flags)
+          .Case("", 0x2) // no flags means nzcvq for psr registers, and 0x2 is
+                         // correct when flags are not permitted
           .Case("g", 0x1)
           .Case("nzcvq", 0x2)
           .Case("nzcvqg", 0x3)
@@ -4170,7 +3935,7 @@ static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead,
   }
 
   // We know we are now handling a write so need to get the mask for the flags.
-  int Mask = getMClassFlagsMask(Flags, Subtarget->hasDSP());
+  int Mask = getMClassFlagsMask(Flags);
 
   // Only apsr, iapsr, eapsr, xpsr can have flags. The other register values
   // shouldn't have flags present.
@@ -4185,10 +3950,7 @@ static int getMClassRegisterMask(StringRef Reg, StringRef Flags, bool IsRead,
   // The register was valid so need to put the mask in the correct place
   // (the flags need to be in bits 11-10) and combine with the SYSmvalue to
   // construct the operand for the instruction node.
-  if (SYSmvalue < 0x4)
-    return SYSmvalue | Mask << 10;
-
-  return SYSmvalue;
+  return SYSmvalue | Mask << 10;
 }
 
 static int getARClassRegisterMask(StringRef Reg, StringRef Flags) {
@@ -4201,7 +3963,7 @@ static int getARClassRegisterMask(StringRef Reg, StringRef Flags) {
     // The flags permitted for apsr are the same flags that are allowed in
     // M class registers. We get the flag value and then shift the flags into
     // the correct place to combine with the mask.
-    Mask = getMClassFlagsMask(Flags, true);
+    Mask = getMClassFlagsMask(Flags);
     if (Mask == -1)
       return -1;
     return Mask << 2;
diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp
index 2306b76cc57af742bb761d10ed86ecf2622c0f75..e697c8ca5339e873f3e55d7f0fb378f138317c0e 100644
--- a/lib/Target/ARM/ARMISelLowering.cpp
+++ b/lib/Target/ARM/ARMISelLowering.cpp
@@ -38,6 +38,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/CodeGen/CallingConvLower.h"
 #include "llvm/CodeGen/ISDOpcodes.h"
 #include "llvm/CodeGen/IntrinsicLowering.h"
@@ -135,23 +136,6 @@ static cl::opt<unsigned> ConstpoolPromotionMaxTotal(
     cl::desc("Maximum size of ALL constants to promote into a constant pool"),
     cl::init(128));
 
-namespace {
-
-  class ARMCCState : public CCState {
-  public:
-    ARMCCState(CallingConv::ID CC, bool isVarArg, MachineFunction &MF,
-               SmallVectorImpl<CCValAssign> &locs, LLVMContext &C,
-               ParmContext PC)
-        : CCState(CC, isVarArg, MF, locs, C) {
-      assert(((PC == Call) || (PC == Prologue)) &&
-             "ARMCCState users must specify whether their context is call"
-             "or prologue generation.");
-      CallOrPrologue = PC;
-    }
-  };
-
-} // end anonymous namespace
-
 // The APCS parameter registers.
 static const MCPhysReg GPRArgRegs[] = {
   ARM::R0, ARM::R1, ARM::R2, ARM::R3
@@ -740,10 +724,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
     }
   }
 
-  // ARM and Thumb2 support UMLAL/SMLAL.
-  if (!Subtarget->isThumb1Only())
-    setTargetDAGCombine(ISD::ADDC);
-
   if (Subtarget->isFPOnlySP()) {
     // When targeting a floating-point unit with only single-precision
     // operations, f64 is legal for the few double-precision instructions which
@@ -842,13 +822,10 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::SRL,       MVT::i64, Custom);
   setOperationAction(ISD::SRA,       MVT::i64, Custom);
 
-  if (!Subtarget->isThumb1Only()) {
-    // FIXME: We should do this for Thumb1 as well.
-    setOperationAction(ISD::ADDC,    MVT::i32, Custom);
-    setOperationAction(ISD::ADDE,    MVT::i32, Custom);
-    setOperationAction(ISD::SUBC,    MVT::i32, Custom);
-    setOperationAction(ISD::SUBE,    MVT::i32, Custom);
-  }
+  setOperationAction(ISD::ADDC,      MVT::i32, Custom);
+  setOperationAction(ISD::ADDE,      MVT::i32, Custom);
+  setOperationAction(ISD::SUBC,      MVT::i32, Custom);
+  setOperationAction(ISD::SUBE,      MVT::i32, Custom);
 
   if (!Subtarget->isThumb1Only() && Subtarget->hasV6T2Ops())
     setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
@@ -1360,6 +1337,12 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case ARMISD::UMAAL:         return "ARMISD::UMAAL";
   case ARMISD::UMLAL:         return "ARMISD::UMLAL";
   case ARMISD::SMLAL:         return "ARMISD::SMLAL";
+  case ARMISD::SMLALBB:       return "ARMISD::SMLALBB";
+  case ARMISD::SMLALBT:       return "ARMISD::SMLALBT";
+  case ARMISD::SMLALTB:       return "ARMISD::SMLALTB";
+  case ARMISD::SMLALTT:       return "ARMISD::SMLALTT";
+  case ARMISD::SMULWB:        return "ARMISD::SMULWB";
+  case ARMISD::SMULWT:        return "ARMISD::SMULWT";
   case ARMISD::BUILD_VECTOR:  return "ARMISD::BUILD_VECTOR";
   case ARMISD::BFI:           return "ARMISD::BFI";
   case ARMISD::VORRIMM:       return "ARMISD::VORRIMM";
@@ -1469,6 +1452,40 @@ Sched::Preference ARMTargetLowering::getSchedulingPreference(SDNode *N) const {
 // Lowering Code
 //===----------------------------------------------------------------------===//
 
+static bool isSRL16(const SDValue &Op) {
+  if (Op.getOpcode() != ISD::SRL)
+    return false;
+  if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+    return Const->getZExtValue() == 16;
+  return false;
+}
+
+static bool isSRA16(const SDValue &Op) {
+  if (Op.getOpcode() != ISD::SRA)
+    return false;
+  if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+    return Const->getZExtValue() == 16;
+  return false;
+}
+
+static bool isSHL16(const SDValue &Op) {
+  if (Op.getOpcode() != ISD::SHL)
+    return false;
+  if (auto Const = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+    return Const->getZExtValue() == 16;
+  return false;
+}
+
+// Check for a signed 16-bit value. We special-case SRA because doing so makes
+// it simpler to also look for SRAs that aren't sign-extending a smaller
+// value. Without the check, we'd need to take extra care with the order of
+// checks for some operations.
+static bool isS16(const SDValue &Op, SelectionDAG &DAG) {
+  if (isSRA16(Op))
+    return isSHL16(Op.getOperand(0));
+  return DAG.ComputeNumSignBits(Op) == 17;
+}
+
 /// IntCCToARMCC - Convert a DAG integer condition code to an ARM CC
 static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
   switch (CC) {
@@ -1488,22 +1505,34 @@ static ARMCC::CondCodes IntCCToARMCC(ISD::CondCode CC) {
 
 /// FPCCToARMCC - Convert a DAG fp condition code to an ARM CC.
 static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
-                        ARMCC::CondCodes &CondCode2) {
+                        ARMCC::CondCodes &CondCode2, bool &InvalidOnQNaN) {
   CondCode2 = ARMCC::AL;
+  InvalidOnQNaN = true;
   switch (CC) {
   default: llvm_unreachable("Unknown FP condition!");
   case ISD::SETEQ:
-  case ISD::SETOEQ: CondCode = ARMCC::EQ; break;
+  case ISD::SETOEQ:
+    CondCode = ARMCC::EQ;
+    InvalidOnQNaN = false;
+    break;
   case ISD::SETGT:
   case ISD::SETOGT: CondCode = ARMCC::GT; break;
   case ISD::SETGE:
   case ISD::SETOGE: CondCode = ARMCC::GE; break;
   case ISD::SETOLT: CondCode = ARMCC::MI; break;
   case ISD::SETOLE: CondCode = ARMCC::LS; break;
-  case ISD::SETONE: CondCode = ARMCC::MI; CondCode2 = ARMCC::GT; break;
+  case ISD::SETONE:
+    CondCode = ARMCC::MI;
+    CondCode2 = ARMCC::GT;
+    InvalidOnQNaN = false;
+    break;
   case ISD::SETO:   CondCode = ARMCC::VC; break;
   case ISD::SETUO:  CondCode = ARMCC::VS; break;
-  case ISD::SETUEQ: CondCode = ARMCC::EQ; CondCode2 = ARMCC::VS; break;
+  case ISD::SETUEQ:
+    CondCode = ARMCC::EQ;
+    CondCode2 = ARMCC::VS;
+    InvalidOnQNaN = false;
+    break;
   case ISD::SETUGT: CondCode = ARMCC::HI; break;
   case ISD::SETUGE: CondCode = ARMCC::PL; break;
   case ISD::SETLT:
@@ -1511,7 +1540,10 @@ static void FPCCToARMCC(ISD::CondCode CC, ARMCC::CondCodes &CondCode,
   case ISD::SETLE:
   case ISD::SETULE: CondCode = ARMCC::LE; break;
   case ISD::SETNE:
-  case ISD::SETUNE: CondCode = ARMCC::NE; break;
+  case ISD::SETUNE:
+    CondCode = ARMCC::NE;
+    InvalidOnQNaN = false;
+    break;
   }
 }
 
@@ -1604,8 +1636,8 @@ SDValue ARMTargetLowering::LowerCallResult(
 
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
-  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
-                    *DAG.getContext(), Call);
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
   CCInfo.AnalyzeCallResult(Ins, CCAssignFnForReturn(CallConv, isVarArg));
 
   // Copy all of the result registers out of their specified physreg.
@@ -1765,8 +1797,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // Analyze operands of the call, assigning locations to each operand.
   SmallVector<CCValAssign, 16> ArgLocs;
-  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
-                    *DAG.getContext(), Call);
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
   CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CallConv, isVarArg));
 
   // Get a count of how many bytes are to be pushed on the stack.
@@ -1842,7 +1874,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                          StackPtr, MemOpChains, Flags);
       }
     } else if (VA.isRegLoc()) {
-      if (realArgIdx == 0 && Flags.isReturned() && Outs[0].VT == MVT::i32) {
+      if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
+          Outs[0].VT == MVT::i32) {
         assert(VA.getLocVT() == MVT::i32 &&
                "unexpected calling convention register assignment");
         assert(!Ins.empty() && Ins[0].VT == MVT::i32 &&
@@ -2142,10 +2175,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 /// this.
 void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
                                     unsigned Align) const {
-  assert((State->getCallOrPrologue() == Prologue ||
-          State->getCallOrPrologue() == Call) &&
-         "unhandled ParmContext");
-
   // Byval (as with any stack) slots are always at least 4 byte aligned.
   Align = std::max(Align, 4U);
 
@@ -2314,7 +2343,7 @@ ARMTargetLowering::IsEligibleForTailCallOptimization(SDValue Callee,
     // Check if stack adjustment is needed. For now, do not do this if any
     // argument is passed on the stack.
     SmallVector<CCValAssign, 16> ArgLocs;
-    ARMCCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C, Call);
+    CCState CCInfo(CalleeCC, isVarArg, MF, ArgLocs, C);
     CCInfo.AnalyzeCallOperands(Outs, CCAssignFnForCall(CalleeCC, isVarArg));
     if (CCInfo.getNextStackOffset()) {
       // Check if the arguments are already laid out in the right way as
@@ -2416,8 +2445,8 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   SmallVector<CCValAssign, 16> RVLocs;
 
   // CCState - Info about the registers and stack slots.
-  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
-                    *DAG.getContext(), Call);
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
+                 *DAG.getContext());
 
   // Analyze outgoing return values.
   CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
@@ -2844,9 +2873,9 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
 
   // FIXME: is there useful debug info available here?
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(dl).setChain(Chain)
-    .setCallee(CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
-               DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
+  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
+      CallingConv::C, Type::getInt32Ty(*DAG.getContext()),
+      DAG.getExternalSymbol("__tls_get_addr", PtrVT), std::move(Args));
 
   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
   return CallResult.first;
@@ -3034,7 +3063,8 @@ static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG,
   unsigned RequiredPadding = 4 - (Size % 4);
   bool PaddingPossible =
     RequiredPadding == 4 || (CDAInit && CDAInit->isString());
-  if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize)
+  if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize ||
+      Size == 0)
     return SDValue();
 
   unsigned PaddedSize = Size + ((RequiredPadding == 4) ? 0 : RequiredPadding);
@@ -3081,17 +3111,20 @@ static SDValue promoteToConstantPool(const GlobalValue *GV, SelectionDAG &DAG,
   return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
 }
 
+static bool isReadOnly(const GlobalValue *GV) {
+  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
+    GV = GA->getBaseObject();
+  return (isa<GlobalVariable>(GV) && cast<GlobalVariable>(GV)->isConstant()) ||
+         isa<Function>(GV);
+}
+
 SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
                                                  SelectionDAG &DAG) const {
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   SDLoc dl(Op);
   const GlobalValue *GV = cast<GlobalAddressSDNode>(Op)->getGlobal();
   const TargetMachine &TM = getTargetMachine();
-  if (const GlobalAlias *GA = dyn_cast<GlobalAlias>(GV))
-    GV = GA->getBaseObject();
-  bool IsRO =
-      (isa<GlobalVariable>(GV) && cast<GlobalVariable>(GV)->isConstant()) ||
-      isa<Function>(GV);
+  bool IsRO = isReadOnly(GV);
 
   // promoteToConstantPool only if not generating XO text section
   if (TM.shouldAssumeDSOLocal(*GV->getParent(), GV) && !Subtarget->genExecuteOnly())
@@ -3131,15 +3164,22 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
     return Result;
   } else if (Subtarget->isRWPI() && !IsRO) {
     // SB-relative.
-    ARMConstantPoolValue *CPV =
-      ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
-    SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
-    CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
-    SDValue G = DAG.getLoad(
-        PtrVT, dl, DAG.getEntryNode(), CPAddr,
-        MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+    SDValue RelAddr;
+    if (Subtarget->useMovt(DAG.getMachineFunction())) {
+      ++NumMovwMovt;
+      SDValue G = DAG.getTargetGlobalAddress(GV, dl, PtrVT, 0, ARMII::MO_SBREL);
+      RelAddr = DAG.getNode(ARMISD::Wrapper, dl, PtrVT, G);
+    } else { // use literal pool for address constant
+      ARMConstantPoolValue *CPV =
+        ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
+      SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+      CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
+      RelAddr = DAG.getLoad(
+          PtrVT, dl, DAG.getEntryNode(), CPAddr,
+          MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
+    }
     SDValue SB = DAG.getCopyFromReg(DAG.getEntryNode(), dl, ARM::R9, PtrVT);
-    SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, G);
+    SDValue Result = DAG.getNode(ISD::ADD, dl, PtrVT, SB, RelAddr);
     return Result;
   }
 
@@ -3513,8 +3553,8 @@ SDValue ARMTargetLowering::LowerFormalArguments(
 
   // Assign locations to all of the incoming arguments.
   SmallVector<CCValAssign, 16> ArgLocs;
-  ARMCCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
-                    *DAG.getContext(), Prologue);
+  CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
+                 *DAG.getContext());
   CCInfo.AnalyzeFormalArguments(Ins, CCAssignFnForCall(CallConv, isVarArg));
 
   SmallVector<SDValue, 16> ArgValues;
@@ -3784,13 +3824,15 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
 
 /// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
 SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
-                                     SelectionDAG &DAG, const SDLoc &dl) const {
+                                     SelectionDAG &DAG, const SDLoc &dl,
+                                     bool InvalidOnQNaN) const {
   assert(!Subtarget->isFPOnlySP() || RHS.getValueType() != MVT::f64);
   SDValue Cmp;
+  SDValue C = DAG.getConstant(InvalidOnQNaN, dl, MVT::i32);
   if (!isFloatingPointZero(RHS))
-    Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
+    Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS, C);
   else
-    Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
+    Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS, C);
   return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
 }
 
@@ -3807,10 +3849,12 @@ ARMTargetLowering::duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const {
   Cmp = Cmp.getOperand(0);
   Opc = Cmp.getOpcode();
   if (Opc == ARMISD::CMPFP)
-    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),Cmp.getOperand(1));
+    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
+                      Cmp.getOperand(1), Cmp.getOperand(2));
   else {
     assert(Opc == ARMISD::CMPFPw0 && "unexpected operand of FMSTAT");
-    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0));
+    Cmp = DAG.getNode(Opc, DL, MVT::Glue, Cmp.getOperand(0),
+                      Cmp.getOperand(1));
   }
   return DAG.getNode(ARMISD::FMSTAT, DL, MVT::Glue, Cmp);
 }
@@ -4210,7 +4254,8 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   }
 
   ARMCC::CondCodes CondCode, CondCode2;
-  FPCCToARMCC(CC, CondCode, CondCode2);
+  bool InvalidOnQNaN;
+  FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
 
   // Try to generate VMAXNM/VMINNM on ARMv8.
   if (Subtarget->hasFPARMv8() && (TrueVal.getValueType() == MVT::f32 ||
@@ -4229,13 +4274,13 @@ SDValue ARMTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
   }
 
   SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
-  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
   SDValue Result = getCMOV(dl, VT, FalseVal, TrueVal, ARMcc, CCR, Cmp, DAG);
   if (CondCode2 != ARMCC::AL) {
     SDValue ARMcc2 = DAG.getConstant(CondCode2, dl, MVT::i32);
     // FIXME: Needs another CMP because flag can have but one use.
-    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl);
+    SDValue Cmp2 = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
     Result = getCMOV(dl, VT, Result, TrueVal, ARMcc2, CCR, Cmp2, DAG);
   }
   return Result;
@@ -4396,10 +4441,11 @@ SDValue ARMTargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
   }
 
   ARMCC::CondCodes CondCode, CondCode2;
-  FPCCToARMCC(CC, CondCode, CondCode2);
+  bool InvalidOnQNaN;
+  FPCCToARMCC(CC, CondCode, CondCode2, InvalidOnQNaN);
 
   SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
-  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl);
+  SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, InvalidOnQNaN);
   SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
   SDVTList VTList = DAG.getVTList(MVT::Other, MVT::Glue);
   SDValue Ops[] = { Chain, Dest, ARMcc, CCR, Cmp };
@@ -4901,9 +4947,10 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
   // The formula we use to implement this is (((FPSCR + 1 << 22) >> 22) & 3)
   // so that the shift + and get folded into a bitfield extract.
   SDLoc dl(Op);
-  SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i32,
-                              DAG.getConstant(Intrinsic::arm_get_fpscr, dl,
-                                              MVT::i32));
+  SDValue Ops[] = { DAG.getEntryNode(),
+                    DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) };
+
+  SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops);
   SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
                                   DAG.getConstant(1U << 22, dl, MVT::i32));
   SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
@@ -6942,8 +6989,19 @@ static SDValue SkipExtensionForVMULL(SDNode *N, SelectionDAG &DAG) {
                                         N->getValueType(0),
                                         N->getOpcode());
 
-  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N))
-    return SkipLoadExtensionForVMULL(LD, DAG);
+  if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
+    assert((ISD::isSEXTLoad(LD) || ISD::isZEXTLoad(LD)) &&
+           "Expected extending load");
+
+    SDValue newLoad = SkipLoadExtensionForVMULL(LD, DAG);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), newLoad.getValue(1));
+    unsigned Opcode = ISD::isSEXTLoad(LD) ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+    SDValue extLoad =
+        DAG.getNode(Opcode, SDLoc(newLoad), LD->getValueType(0), newLoad);
+    DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 0), extLoad);
+
+    return newLoad;
+  }
 
   // Otherwise, the value must be a BUILD_VECTOR.  For v2i64, it will
   // have been legalized as a BITCAST from v4i32.
@@ -7304,9 +7362,9 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
     ArgListEntry Entry;
     Entry.Node = SRet;
     Entry.Ty = RetTy->getPointerTo();
-    Entry.isSExt = false;
-    Entry.isZExt = false;
-    Entry.isSRet = true;
+    Entry.IsSExt = false;
+    Entry.IsZExt = false;
+    Entry.IsSRet = true;
     Args.push_back(Entry);
     RetTy = Type::getVoidTy(*DAG.getContext());
   }
@@ -7314,8 +7372,8 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
   ArgListEntry Entry;
   Entry.Node = Arg;
   Entry.Ty = ArgTy;
-  Entry.isSExt = false;
-  Entry.isZExt = false;
+  Entry.IsSExt = false;
+  Entry.IsZExt = false;
   Args.push_back(Entry);
 
   const char *LibcallName =
@@ -7526,12 +7584,12 @@ static SDValue LowerFPOWI(SDValue Op, const ARMSubtarget &Subtarget,
 
   Entry.Node = Val;
   Entry.Ty = Val.getValueType().getTypeForEVT(*DAG.getContext());
-  Entry.isZExt = true;
+  Entry.IsZExt = true;
   Args.push_back(Entry);
 
   Entry.Node = Exponent;
   Entry.Ty = Exponent.getValueType().getTypeForEVT(*DAG.getContext());
-  Entry.isZExt = true;
+  Entry.IsZExt = true;
   Args.push_back(Entry);
 
   Type *LCRTy = Val.getValueType().getTypeForEVT(*DAG.getContext());
@@ -9046,19 +9104,45 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
 
   // Rename pseudo opcodes.
   unsigned NewOpc = convertAddSubFlagsOpcode(MI.getOpcode());
+  unsigned ccOutIdx;
   if (NewOpc) {
     const ARMBaseInstrInfo *TII = Subtarget->getInstrInfo();
     MCID = &TII->get(NewOpc);
 
-    assert(MCID->getNumOperands() == MI.getDesc().getNumOperands() + 1 &&
-           "converted opcode should be the same except for cc_out");
+    assert(MCID->getNumOperands() ==
+           MI.getDesc().getNumOperands() + 5 - MI.getDesc().getSize()
+        && "converted opcode should be the same except for cc_out"
+           " (and, on Thumb1, pred)");
 
     MI.setDesc(*MCID);
 
     // Add the optional cc_out operand
     MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/true));
-  }
-  unsigned ccOutIdx = MCID->getNumOperands() - 1;
+
+    // On Thumb1, move all input operands to the end, then add the predicate
+    if (Subtarget->isThumb1Only()) {
+      for (unsigned c = MCID->getNumOperands() - 4; c--;) {
+        MI.addOperand(MI.getOperand(1));
+        MI.RemoveOperand(1);
+      }
+
+      // Restore the ties
+      for (unsigned i = MI.getNumOperands(); i--;) {
+        const MachineOperand& op = MI.getOperand(i);
+        if (op.isReg() && op.isUse()) {
+          int DefIdx = MCID->getOperandConstraint(i, MCOI::TIED_TO);
+          if (DefIdx != -1)
+            MI.tieOperands(DefIdx, i);
+        }
+      }
+
+      MI.addOperand(MachineOperand::CreateImm(ARMCC::AL));
+      MI.addOperand(MachineOperand::CreateReg(0, /*isDef=*/false));
+      ccOutIdx = 1;
+    } else
+      ccOutIdx = MCID->getNumOperands() - 1;
+  } else
+    ccOutIdx = MCID->getNumOperands() - 1;
 
   // Any ARM instruction that sets the 's' bit should specify an optional
   // "cc_out" operand in the last operand position.
@@ -9089,7 +9173,9 @@ void ARMTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
   if (deadCPSR) {
     assert(!MI.getOperand(ccOutIdx).getReg() &&
            "expect uninitialized optional cc_out operand");
-    return;
+    // Thumb1 instructions must have the S bit even if the CPSR is dead.
+    if (!Subtarget->isThumb1Only())
+      return;
   }
 
   // If this instruction was defined with an optional CPSR def and its dag node
@@ -9151,7 +9237,7 @@ static bool isConditionalZeroOrAllOnes(SDNode *N, bool AllOnes,
     SDLoc dl(N);
     EVT VT = N->getValueType(0);
     CC = N->getOperand(0);
-    if (CC.getValueType() != MVT::i1)
+    if (CC.getValueType() != MVT::i1 || CC.getOpcode() != ISD::SETCC)
       return false;
     Invert = !AllOnes;
     if (AllOnes)
@@ -9427,7 +9513,88 @@ static SDValue findMUL_LOHI(SDValue V) {
   return SDValue();
 }
 
-static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
+static SDValue AddCombineTo64BitSMLAL16(SDNode *AddcNode, SDNode *AddeNode,
+                                        TargetLowering::DAGCombinerInfo &DCI,
+                                        const ARMSubtarget *Subtarget) {
+
+  if (Subtarget->isThumb()) {
+    if (!Subtarget->hasDSP())
+      return SDValue();
+  } else if (!Subtarget->hasV5TEOps())
+    return SDValue();
+
+  // SMLALBB, SMLALBT, SMLALTB, SMLALTT multiply two 16-bit values and
+  // accumulate the product into a 64-bit value. The 16-bit values will
+  // be sign extended somehow or SRA'd into 32-bit values:
+  // (addc (mul 16bit, 16bit), lo), (adde (sra (mul ...), 31), hi)
+  SDValue Mul = AddcNode->getOperand(0);
+  SDValue Lo = AddcNode->getOperand(1);
+  if (Mul.getOpcode() != ISD::MUL) {
+    Lo = AddcNode->getOperand(0);
+    Mul = AddcNode->getOperand(1);
+    if (Mul.getOpcode() != ISD::MUL)
+      return SDValue();
+  }
+
+  SDValue SRA = AddeNode->getOperand(0);
+  SDValue Hi = AddeNode->getOperand(1);
+  if (SRA.getOpcode() != ISD::SRA) {
+    SRA = AddeNode->getOperand(1);
+    Hi = AddeNode->getOperand(0);
+    if (SRA.getOpcode() != ISD::SRA)
+      return SDValue();
+  }
+  if (auto Const = dyn_cast<ConstantSDNode>(SRA.getOperand(1))) {
+    if (Const->getZExtValue() != 31)
+      return SDValue();
+  } else
+    return SDValue();
+
+  if (SRA.getOperand(0) != Mul)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc dl(AddcNode);
+  unsigned Opcode = 0;
+  SDValue Op0;
+  SDValue Op1;
+
+  if (isS16(Mul.getOperand(0), DAG) && isS16(Mul.getOperand(1), DAG)) {
+    Opcode = ARMISD::SMLALBB;
+    Op0 = Mul.getOperand(0);
+    Op1 = Mul.getOperand(1);
+  } else if (isS16(Mul.getOperand(0), DAG) && isSRA16(Mul.getOperand(1))) {
+    Opcode = ARMISD::SMLALBT;
+    Op0 = Mul.getOperand(0);
+    Op1 = Mul.getOperand(1).getOperand(0);
+  } else if (isSRA16(Mul.getOperand(0)) && isS16(Mul.getOperand(1), DAG)) {
+    Opcode = ARMISD::SMLALTB;
+    Op0 = Mul.getOperand(0).getOperand(0);
+    Op1 = Mul.getOperand(1);
+  } else if (isSRA16(Mul.getOperand(0)) && isSRA16(Mul.getOperand(1))) {
+    Opcode = ARMISD::SMLALTT;
+    Op0 = Mul->getOperand(0).getOperand(0);
+    Op1 = Mul->getOperand(1).getOperand(0);
+  }
+
+  if (!Op0 || !Op1)
+    return SDValue();
+
+  SDValue SMLAL = DAG.getNode(Opcode, dl, DAG.getVTList(MVT::i32, MVT::i32),
+                              Op0, Op1, Lo, Hi);
+  // Replace the ADDs' nodes uses by the MLA node's values.
+  SDValue HiMLALResult(SMLAL.getNode(), 1);
+  SDValue LoMLALResult(SMLAL.getNode(), 0);
+
+  DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(AddeNode, 0), HiMLALResult);
+
+  // Return original node to notify the driver to stop replacing.
+  SDValue resNode(AddcNode, 0);
+  return resNode;
+}
+
+static SDValue AddCombineTo64bitMLAL(SDNode *AddeNode,
                                      TargetLowering::DAGCombinerInfo &DCI,
                                      const ARMSubtarget *Subtarget) {
   // Look for multiply add opportunities.
@@ -9444,7 +9611,17 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
   //                  \      /
   //                    ADDC   <- hiAdd
   //
-  assert(AddcNode->getOpcode() == ISD::ADDC && "Expect an ADDC");
+  assert(AddeNode->getOpcode() == ARMISD::ADDE && "Expect an ADDE");
+
+  assert(AddeNode->getNumOperands() == 3 &&
+         AddeNode->getOperand(2).getValueType() == MVT::i32 &&
+         "ADDE node has the wrong inputs");
+
+  // Check that we have a glued ADDC node.
+  SDNode* AddcNode = AddeNode->getOperand(2).getNode();
+  if (AddcNode->getOpcode() != ARMISD::ADDC)
+    return SDValue();
+
   SDValue AddcOp0 = AddcNode->getOperand(0);
   SDValue AddcOp1 = AddcNode->getOperand(1);
 
@@ -9456,29 +9633,13 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
          AddcNode->getValueType(0) == MVT::i32 &&
          "Expect ADDC with two result values. First: i32");
 
-  // Check that we have a glued ADDC node.
-  if (AddcNode->getValueType(1) != MVT::Glue)
-    return SDValue();
-
-  // Check that the ADDC adds the low result of the S/UMUL_LOHI.
+  // Check that the ADDC adds the low result of the S/UMUL_LOHI. If not, it
+  // may be an SMLAL which multiplies two 16-bit values.
   if (AddcOp0->getOpcode() != ISD::UMUL_LOHI &&
       AddcOp0->getOpcode() != ISD::SMUL_LOHI &&
       AddcOp1->getOpcode() != ISD::UMUL_LOHI &&
       AddcOp1->getOpcode() != ISD::SMUL_LOHI)
-    return SDValue();
-
-  // Look for the glued ADDE.
-  SDNode* AddeNode = AddcNode->getGluedUser();
-  if (!AddeNode)
-    return SDValue();
-
-  // Make sure it is really an ADDE.
-  if (AddeNode->getOpcode() != ISD::ADDE)
-    return SDValue();
-
-  assert(AddeNode->getNumOperands() == 3 &&
-         AddeNode->getOperand(2).getValueType() == MVT::Glue &&
-         "ADDE node has the wrong inputs");
+    return AddCombineTo64BitSMLAL16(AddcNode, AddeNode, DCI, Subtarget);
 
   // Check for the triangle shape.
   SDValue AddeOp0 = AddeNode->getOperand(0);
@@ -9553,38 +9714,25 @@ static SDValue AddCombineTo64bitMLAL(SDNode *AddcNode,
   DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), LoMLALResult);
 
   // Return original node to notify the driver to stop replacing.
-  SDValue resNode(AddcNode, 0);
-  return resNode;
+  return SDValue(AddeNode, 0);
 }
 
-static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode,
+static SDValue AddCombineTo64bitUMAAL(SDNode *AddeNode,
                                       TargetLowering::DAGCombinerInfo &DCI,
                                       const ARMSubtarget *Subtarget) {
   // UMAAL is similar to UMLAL except that it adds two unsigned values.
   // While trying to combine for the other MLAL nodes, first search for the
-  // chance to use UMAAL. Check if Addc uses another addc node which can first
-  // be combined into a UMLAL. The other pattern is AddcNode being combined
-  // into an UMLAL and then using another addc is handled in ISelDAGToDAG.
-
-  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP() ||
-      (Subtarget->isThumb() && !Subtarget->hasThumb2()))
-    return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
-
-  SDNode *PrevAddc = nullptr;
-  if (AddcNode->getOperand(0).getOpcode() == ISD::ADDC)
-    PrevAddc = AddcNode->getOperand(0).getNode();
-  else if (AddcNode->getOperand(1).getOpcode() == ISD::ADDC)
-    PrevAddc = AddcNode->getOperand(1).getNode();
-
-  // If there's no addc chains, just return a search for any MLAL.
-  if (PrevAddc == nullptr)
-    return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
-
-  // Try to convert the addc operand to an MLAL and if that fails try to
-  // combine AddcNode.
-  SDValue MLAL = AddCombineTo64bitMLAL(PrevAddc, DCI, Subtarget);
-  if (MLAL != SDValue(PrevAddc, 0))
-    return AddCombineTo64bitMLAL(AddcNode, DCI, Subtarget);
+  // chance to use UMAAL. Check if Addc uses a node which has already
+  // been combined into a UMLAL. The other pattern is UMLAL using Addc/Adde
+  // as the addend, and it's handled in PerformUMLALCombine.
+
+  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
+    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
+
+  // Check that we have a glued ADDC node.
+  SDNode* AddcNode = AddeNode->getOperand(2).getNode();
+  if (AddcNode->getOpcode() != ARMISD::ADDC)
+    return SDValue();
 
   // Find the converted UMAAL or quit if it doesn't exist.
   SDNode *UmlalNode = nullptr;
@@ -9596,29 +9744,18 @@ static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode,
     UmlalNode = AddcNode->getOperand(1).getNode();
     AddHi = AddcNode->getOperand(0);
   } else {
-    return SDValue();
+    return AddCombineTo64bitMLAL(AddeNode, DCI, Subtarget);
   }
 
   // The ADDC should be glued to an ADDE node, which uses the same UMLAL as
   // the ADDC as well as Zero.
-  auto *Zero = dyn_cast<ConstantSDNode>(UmlalNode->getOperand(3));
-
-  if (!Zero || Zero->getZExtValue() != 0)
+  if (!isNullConstant(UmlalNode->getOperand(3)))
     return SDValue();
 
-  // Check that we have a glued ADDC node.
-  if (AddcNode->getValueType(1) != MVT::Glue)
-    return SDValue();
-
-  // Look for the glued ADDE.
-  SDNode* AddeNode = AddcNode->getGluedUser();
-  if (!AddeNode)
-    return SDValue();
-
-  if ((AddeNode->getOperand(0).getNode() == Zero &&
+  if ((isNullConstant(AddeNode->getOperand(0)) &&
        AddeNode->getOperand(1).getNode() == UmlalNode) ||
       (AddeNode->getOperand(0).getNode() == UmlalNode &&
-       AddeNode->getOperand(1).getNode() == Zero)) {
+       isNullConstant(AddeNode->getOperand(1)))) {
 
     SelectionDAG &DAG = DCI.DAG;
     SDValue Ops[] = { UmlalNode->getOperand(0), UmlalNode->getOperand(1),
@@ -9631,18 +9768,84 @@ static SDValue AddCombineTo64bitUMAAL(SDNode *AddcNode,
     DAG.ReplaceAllUsesOfValueWith(SDValue(AddcNode, 0), SDValue(UMAAL.getNode(), 0));
 
     // Return original node to notify the driver to stop replacing.
-    return SDValue(AddcNode, 0);
+    return SDValue(AddeNode, 0);
   }
   return SDValue();
 }
 
-/// PerformADDCCombine - Target-specific dag combine transform from
-/// ISD::ADDC, ISD::ADDE, and ISD::MUL_LOHI to MLAL or
-/// ISD::ADDC, ISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
-static SDValue PerformADDCCombine(SDNode *N,
-                                 TargetLowering::DAGCombinerInfo &DCI,
-                                 const ARMSubtarget *Subtarget) {
-  if (Subtarget->isThumb1Only()) return SDValue();
+static SDValue PerformUMLALCombine(SDNode *N, SelectionDAG &DAG,
+                                   const ARMSubtarget *Subtarget) {
+  if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
+    return SDValue();
+
+  // Check that we have a pair of ADDC and ADDE as operands.
+  // Both addends of the ADDE must be zero.
+  SDNode* AddcNode = N->getOperand(2).getNode();
+  SDNode* AddeNode = N->getOperand(3).getNode();
+  if ((AddcNode->getOpcode() == ARMISD::ADDC) &&
+      (AddeNode->getOpcode() == ARMISD::ADDE) &&
+      isNullConstant(AddeNode->getOperand(0)) &&
+      isNullConstant(AddeNode->getOperand(1)) &&
+      (AddeNode->getOperand(2).getNode() == AddcNode))
+    return DAG.getNode(ARMISD::UMAAL, SDLoc(N),
+                       DAG.getVTList(MVT::i32, MVT::i32),
+                       {N->getOperand(0), N->getOperand(1),
+                        AddcNode->getOperand(0), AddcNode->getOperand(1)});
+  else
+    return SDValue();
+}
+
+static SDValue PerformAddcSubcCombine(SDNode *N, SelectionDAG &DAG,
+                                      const ARMSubtarget *Subtarget) {
+  if (Subtarget->isThumb1Only()) {
+    SDValue RHS = N->getOperand(1);
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
+      int32_t imm = C->getSExtValue();
+      if (imm < 0 && imm > INT_MIN) {
+        SDLoc DL(N);
+        RHS = DAG.getConstant(-imm, DL, MVT::i32);
+        unsigned Opcode = (N->getOpcode() == ARMISD::ADDC) ? ARMISD::SUBC
+                                                           : ARMISD::ADDC;
+        return DAG.getNode(Opcode, DL, N->getVTList(), N->getOperand(0), RHS);
+      }
+    }
+  }
+  return SDValue();
+}
+
+static SDValue PerformAddeSubeCombine(SDNode *N, SelectionDAG &DAG,
+                                      const ARMSubtarget *Subtarget) {
+  if (Subtarget->isThumb1Only()) {
+    SDValue RHS = N->getOperand(1);
+    if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(RHS)) {
+      int64_t imm = C->getSExtValue();
+      if (imm < 0) {
+        SDLoc DL(N);
+
+        // The with-carry-in form matches bitwise not instead of the negation.
+        // Effectively, the inverse interpretation of the carry flag already
+        // accounts for part of the negation.
+        RHS = DAG.getConstant(~imm, DL, MVT::i32);
+
+        unsigned Opcode = (N->getOpcode() == ARMISD::ADDE) ? ARMISD::SUBE
+                                                           : ARMISD::ADDE;
+        return DAG.getNode(Opcode, DL, N->getVTList(),
+                           N->getOperand(0), RHS, N->getOperand(2));
+      }
+    }
+  }
+  return SDValue();
+}
+
+/// PerformADDECombine - Target-specific dag combine transform from
+/// ARMISD::ADDC, ARMISD::ADDE, and ISD::MUL_LOHI to MLAL or
+/// ARMISD::ADDC, ARMISD::ADDE and ARMISD::UMLAL to ARMISD::UMAAL
+static SDValue PerformADDECombine(SDNode *N,
+                                  TargetLowering::DAGCombinerInfo &DCI,
+                                  const ARMSubtarget *Subtarget) {
+  // Only ARM and Thumb2 support UMLAL/SMLAL.
+  if (Subtarget->isThumb1Only())
+    return PerformAddeSubeCombine(N, DCI.DAG, Subtarget);
 
   // Only perform the checks after legalize when the pattern is available.
   if (DCI.isBeforeLegalize()) return SDValue();
@@ -9877,6 +10080,67 @@ static SDValue PerformANDCombine(SDNode *N,
   return SDValue();
 }
 
+// Try combining OR nodes to SMULWB, SMULWT.
+static SDValue PerformORCombineToSMULWBT(SDNode *OR,
+                                         TargetLowering::DAGCombinerInfo &DCI,
+                                         const ARMSubtarget *Subtarget) {
+  if (!Subtarget->hasV6Ops() ||
+      (Subtarget->isThumb() &&
+       (!Subtarget->hasThumb2() || !Subtarget->hasDSP())))
+    return SDValue();
+
+  SDValue SRL = OR->getOperand(0);
+  SDValue SHL = OR->getOperand(1);
+
+  if (SRL.getOpcode() != ISD::SRL || SHL.getOpcode() != ISD::SHL) {
+    SRL = OR->getOperand(1);
+    SHL = OR->getOperand(0);
+  }
+  if (!isSRL16(SRL) || !isSHL16(SHL))
+    return SDValue();
+
+  // The first operands to the shifts need to be the two results from the
+  // same smul_lohi node.
+  if ((SRL.getOperand(0).getNode() != SHL.getOperand(0).getNode()) ||
+       SRL.getOperand(0).getOpcode() != ISD::SMUL_LOHI)
+    return SDValue();
+
+  SDNode *SMULLOHI = SRL.getOperand(0).getNode();
+  if (SRL.getOperand(0) != SDValue(SMULLOHI, 0) ||
+      SHL.getOperand(0) != SDValue(SMULLOHI, 1))
+    return SDValue();
+
+  // Now we have:
+  // (or (srl (smul_lohi ?, ?), 16), (shl (smul_lohi ?, ?), 16))
+  // For SMULW[B|T] smul_lohi will take a 32-bit and a 16-bit argument.
+  // For SMULWB the 16-bit value will be sign extended somehow.
+  // For SMULWT only the SRA is required.
+  // Check both sides of SMUL_LOHI
+  SDValue OpS16 = SMULLOHI->getOperand(0);
+  SDValue OpS32 = SMULLOHI->getOperand(1);
+
+  SelectionDAG &DAG = DCI.DAG;
+  if (!isS16(OpS16, DAG) && !isSRA16(OpS16)) {
+    OpS16 = OpS32;
+    OpS32 = SMULLOHI->getOperand(0);
+  }
+
+  SDLoc dl(OR);
+  unsigned Opcode = 0;
+  if (isS16(OpS16, DAG))
+    Opcode = ARMISD::SMULWB;
+  else if (isSRA16(OpS16)) {
+    Opcode = ARMISD::SMULWT;
+    OpS16 = OpS16->getOperand(0);
+  }
+  else
+    return SDValue();
+
+  SDValue Res = DAG.getNode(Opcode, dl, MVT::i32, OpS32, OpS16);
+  DAG.ReplaceAllUsesOfValueWith(SDValue(OR, 0), Res);
+  return SDValue(OR, 0);
+}
+
 /// PerformORCombine - Target-specific dag combine xforms for ISD::OR
 static SDValue PerformORCombine(SDNode *N,
                                 TargetLowering::DAGCombinerInfo &DCI,
@@ -9914,6 +10178,8 @@ static SDValue PerformORCombine(SDNode *N,
     // fold (or (select cc, 0, c), x) -> (select cc, x, (or, x, c))
     if (SDValue Result = combineSelectAndUseCommutative(N, false, DCI))
       return Result;
+    if (SDValue Result = PerformORCombineToSMULWBT(N, DCI, Subtarget))
+      return Result;
   }
 
   // The code below optimizes (or (and X, Y), Z).
@@ -10022,7 +10288,7 @@ static SDValue PerformORCombine(SDNode *N,
         (Mask == ~Mask2)) {
       // The pack halfword instruction works better for masks that fit it,
       // so use that when it's available.
-      if (Subtarget->hasT2ExtractPack() &&
+      if (Subtarget->hasDSP() &&
           (Mask == 0xffff || Mask == 0xffff0000))
         return SDValue();
       // 2a
@@ -10038,7 +10304,7 @@ static SDValue PerformORCombine(SDNode *N,
                (~Mask == Mask2)) {
       // The pack halfword instruction works better for masks that fit it,
       // so use that when it's available.
-      if (Subtarget->hasT2ExtractPack() &&
+      if (Subtarget->hasDSP() &&
           (Mask2 == 0xffff || Mask2 == 0xffff0000))
         return SDValue();
       // 2b
@@ -11440,8 +11706,8 @@ static void computeKnownBits(SelectionDAG &DAG, SDValue Op, APInt &KnownZero,
   if (Op.getOpcode() == ARMISD::CMOV) {
     APInt KZ2(KnownZero.getBitWidth(), 0);
     APInt KO2(KnownOne.getBitWidth(), 0);
-    computeKnownBits(DAG, Op.getOperand(1), KnownZero, KnownOne);
-    computeKnownBits(DAG, Op.getOperand(2), KZ2, KO2);
+    computeKnownBits(DAG, Op.getOperand(0), KnownZero, KnownOne);
+    computeKnownBits(DAG, Op.getOperand(1), KZ2, KO2);
 
     KnownZero &= KZ2;
     KnownOne &= KO2;
@@ -11671,13 +11937,17 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
   switch (N->getOpcode()) {
   default: break;
-  case ISD::ADDC:       return PerformADDCCombine(N, DCI, Subtarget);
+  case ARMISD::ADDE:    return PerformADDECombine(N, DCI, Subtarget);
+  case ARMISD::UMLAL:   return PerformUMLALCombine(N, DCI.DAG, Subtarget);
   case ISD::ADD:        return PerformADDCombine(N, DCI, Subtarget);
   case ISD::SUB:        return PerformSUBCombine(N, DCI);
   case ISD::MUL:        return PerformMULCombine(N, DCI, Subtarget);
   case ISD::OR:         return PerformORCombine(N, DCI, Subtarget);
   case ISD::XOR:        return PerformXORCombine(N, DCI, Subtarget);
   case ISD::AND:        return PerformANDCombine(N, DCI, Subtarget);
+  case ARMISD::ADDC:
+  case ARMISD::SUBC:    return PerformAddcSubcCombine(N, DCI.DAG, Subtarget);
+  case ARMISD::SUBE:    return PerformAddeSubeCombine(N, DCI.DAG, Subtarget);
   case ARMISD::BFI:     return PerformBFICombine(N, DCI);
   case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
   case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
@@ -11709,6 +11979,56 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
     return PerformVLDCombine(N, DCI);
   case ARMISD::BUILD_VECTOR:
     return PerformARMBUILD_VECTORCombine(N, DCI);
+  case ARMISD::SMULWB: {
+    unsigned BitWidth = N->getValueType(0).getSizeInBits();
+    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
+    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
+      return SDValue();
+    break;
+  }
+  case ARMISD::SMULWT: {
+    unsigned BitWidth = N->getValueType(0).getSizeInBits();
+    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
+    if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
+      return SDValue();
+    break;
+  }
+  case ARMISD::SMLALBB: {
+    unsigned BitWidth = N->getValueType(0).getSizeInBits();
+    APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
+    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
+        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
+      return SDValue();
+    break;
+  }
+  case ARMISD::SMLALBT: {
+    unsigned LowWidth = N->getOperand(0).getValueType().getSizeInBits();
+    APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
+    unsigned HighWidth = N->getOperand(1).getValueType().getSizeInBits();
+    APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
+    if ((SimplifyDemandedBits(N->getOperand(0), LowMask, DCI)) ||
+        (SimplifyDemandedBits(N->getOperand(1), HighMask, DCI)))
+      return SDValue();
+    break;
+  }
+  case ARMISD::SMLALTB: {
+    unsigned HighWidth = N->getOperand(0).getValueType().getSizeInBits();
+    APInt HighMask = APInt::getHighBitsSet(HighWidth, 16);
+    unsigned LowWidth = N->getOperand(1).getValueType().getSizeInBits();
+    APInt LowMask = APInt::getLowBitsSet(LowWidth, 16);
+    if ((SimplifyDemandedBits(N->getOperand(0), HighMask, DCI)) ||
+        (SimplifyDemandedBits(N->getOperand(1), LowMask, DCI)))
+      return SDValue();
+    break;
+  }
+  case ARMISD::SMLALTT: {
+    unsigned BitWidth = N->getValueType(0).getSizeInBits();
+    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 16);
+    if ((SimplifyDemandedBits(N->getOperand(0), DemandedMask, DCI)) ||
+        (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI)))
+      return SDValue();
+    break;
+  }
   case ISD::INTRINSIC_VOID:
   case ISD::INTRINSIC_W_CHAIN:
     switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
@@ -12296,6 +12616,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
 void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                       APInt &KnownZero,
                                                       APInt &KnownOne,
+                                                      const APInt &DemandedElts,
                                                       const SelectionDAG &DAG,
                                                       unsigned Depth) const {
   unsigned BitWidth = KnownOne.getBitWidth();
@@ -12704,8 +13025,8 @@ static TargetLowering::ArgListTy getDivRemArgList(
     Type *ArgTy = ArgVT.getTypeForEVT(*Context);
     Entry.Node = N->getOperand(i);
     Entry.Ty = ArgTy;
-    Entry.isSExt = isSigned;
-    Entry.isZExt = !isSigned;
+    Entry.IsSExt = isSigned;
+    Entry.IsZExt = !isSigned;
     Args.push_back(Entry);
   }
   if (Subtarget->isTargetWindows() && Args.size() >= 2)
@@ -13270,6 +13591,39 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilder<> &Builder, Value *Val,
               Addr});
 }
 
+/// A helper function for determining the number of interleaved accesses we
+/// will generate when lowering accesses of the given type.
+unsigned
+ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
+                                             const DataLayout &DL) const {
+  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
+}
+
+bool ARMTargetLowering::isLegalInterleavedAccessType(
+    VectorType *VecTy, const DataLayout &DL) const {
+
+  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
+  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+
+  // Ensure the vector doesn't have f16 elements. Even though we could do an
+  // i16 vldN, we can't hold the f16 vectors and will end up converting via
+  // f32.
+  if (VecTy->getElementType()->isHalfTy())
+    return false;
+
+  // Ensure the number of vector elements is greater than 1.
+  if (VecTy->getNumElements() < 2)
+    return false;
+
+  // Ensure the element type is legal.
+  if (ElSize != 8 && ElSize != 16 && ElSize != 32)
+    return false;
+
+  // Ensure the total vector size is 64 or a multiple of 128. Types larger than
+  // 128 will be split into multiple interleaved accesses.
+  return VecSize == 64 || VecSize % 128 == 0;
+}
+
 /// \brief Lower an interleaved load into a vldN intrinsic.
 ///
 /// E.g. Lower an interleaved load (Factor = 2):
@@ -13294,64 +13648,97 @@ bool ARMTargetLowering::lowerInterleavedLoad(
   Type *EltTy = VecTy->getVectorElementType();
 
   const DataLayout &DL = LI->getModule()->getDataLayout();
-  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
-  bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
 
-  // Skip if we do not have NEON and skip illegal vector types and vector types
-  // with i64/f64 elements (vldN doesn't support i64/f64 elements).
-  if (!Subtarget->hasNEON() || (VecSize != 64 && VecSize != 128) || EltIs64Bits)
+  // Skip if we do not have NEON and skip illegal vector types. We can
+  // "legalize" wide vector types into multiple interleaved accesses as long as
+  // the vector types are divisible by 128.
+  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
     return false;
 
+  unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
+
   // A pointer vector can not be the return type of the ldN intrinsics. Need to
   // load integer vectors first and then convert to pointer vectors.
   if (EltTy->isPointerTy())
     VecTy =
         VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
 
+  IRBuilder<> Builder(LI);
+
+  // The base address of the load.
+  Value *BaseAddr = LI->getPointerOperand();
+
+  if (NumLoads > 1) {
+    // If we're going to generate more than one load, reset the sub-vector type
+    // to something legal.
+    VecTy = VectorType::get(VecTy->getVectorElementType(),
+                            VecTy->getVectorNumElements() / NumLoads);
+
+    // We will compute the pointer operand of each load from the original base
+    // address using GEPs. Cast the base address to a pointer to the scalar
+    // element type.
+    BaseAddr = Builder.CreateBitCast(
+        BaseAddr, VecTy->getVectorElementType()->getPointerTo(
+                      LI->getPointerAddressSpace()));
+  }
+
+  assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
+
+  Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
+  Type *Tys[] = {VecTy, Int8Ptr};
   static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                             Intrinsic::arm_neon_vld3,
                                             Intrinsic::arm_neon_vld4};
+  Function *VldnFunc =
+      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
 
-  IRBuilder<> Builder(LI);
-  SmallVector<Value *, 2> Ops;
+  // Holds sub-vectors extracted from the load intrinsic return values. The
+  // sub-vectors are associated with the shufflevector instructions they will
+  // replace.
+  DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
 
-  Type *Int8Ptr = Builder.getInt8PtrTy(LI->getPointerAddressSpace());
-  Ops.push_back(Builder.CreateBitCast(LI->getPointerOperand(), Int8Ptr));
-  Ops.push_back(Builder.getInt32(LI->getAlignment()));
+  for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
 
-  Type *Tys[] = { VecTy, Int8Ptr };
-  Function *VldnFunc =
-      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
-  CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
+    // If we're generating more than one load, compute the base address of
+    // subsequent loads as an offset from the previous.
+    if (LoadCount > 0)
+      BaseAddr = Builder.CreateConstGEP1_32(
+          BaseAddr, VecTy->getVectorNumElements() * Factor);
 
-  // Replace uses of each shufflevector with the corresponding vector loaded
-  // by ldN.
-  for (unsigned i = 0; i < Shuffles.size(); i++) {
-    ShuffleVectorInst *SV = Shuffles[i];
-    unsigned Index = Indices[i];
+    SmallVector<Value *, 2> Ops;
+    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+    Ops.push_back(Builder.getInt32(LI->getAlignment()));
 
-    Value *SubVec = Builder.CreateExtractValue(VldN, Index);
+    CallInst *VldN = Builder.CreateCall(VldnFunc, Ops, "vldN");
 
-    // Convert the integer vector to pointer vector if the element is pointer.
-    if (EltTy->isPointerTy())
-      SubVec = Builder.CreateIntToPtr(SubVec, SV->getType());
+    // Replace uses of each shufflevector with the corresponding vector loaded
+    // by ldN.
+    for (unsigned i = 0; i < Shuffles.size(); i++) {
+      ShuffleVectorInst *SV = Shuffles[i];
+      unsigned Index = Indices[i];
 
-    SV->replaceAllUsesWith(SubVec);
-  }
+      Value *SubVec = Builder.CreateExtractValue(VldN, Index);
 
-  return true;
-}
+      if (EltTy->isPointerTy()) // int sub-vector -> pointer sub-vector
+        SubVec = Builder.CreateIntToPtr(SubVec, VectorType::get(
+            SV->getType()->getElementType(), VecTy->getVectorNumElements()));
 
-/// \brief Get a mask consisting of sequential integers starting from \p Start.
-///
-/// I.e. <Start, Start + 1, ..., Start + NumElts - 1>
-static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned Start,
-                                   unsigned NumElts) {
-  SmallVector<Constant *, 16> Mask;
-  for (unsigned i = 0; i < NumElts; i++)
-    Mask.push_back(Builder.getInt32(Start + i));
+      SubVecs[SV].push_back(SubVec);
+    }
+  }
 
-  return ConstantVector::get(Mask);
+  // Replace uses of the shufflevector instructions with the sub-vectors
+  // returned by the load intrinsic. If a shufflevector instruction is
+  // associated with more than one sub-vector, those sub-vectors will be
+  // concatenated into a single wide vector.
+  for (ShuffleVectorInst *SVI : Shuffles) {
+    auto &SubVec = SubVecs[SVI];
+    auto *WideVec =
+        SubVec.size() > 1 ? concatenateVectors(Builder, SubVec) : SubVec[0];
+    SVI->replaceAllUsesWith(WideVec);
+  }
+
+  return true;
 }
 
 /// \brief Lower an interleaved store into a vstN intrinsic.
@@ -13395,15 +13782,15 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
   VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
 
   const DataLayout &DL = SI->getModule()->getDataLayout();
-  unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
-  bool EltIs64Bits = DL.getTypeSizeInBits(EltTy) == 64;
 
-  // Skip if we do not have NEON and skip illegal vector types and vector types
-  // with i64/f64 elements (vstN doesn't support i64/f64 elements).
-  if (!Subtarget->hasNEON() || (SubVecSize != 64 && SubVecSize != 128) ||
-      EltIs64Bits)
+  // Skip if we do not have NEON and skip illegal vector types. We can
+  // "legalize" wide vector types into multiple interleaved accesses as long as
+  // the vector types are divisible by 128.
+  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
     return false;
 
+  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
+
   Value *Op0 = SVI->getOperand(0);
   Value *Op1 = SVI->getOperand(1);
   IRBuilder<> Builder(SI);
@@ -13422,44 +13809,75 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
     SubVecTy = VectorType::get(IntTy, LaneLen);
   }
 
+  // The base address of the store.
+  Value *BaseAddr = SI->getPointerOperand();
+
+  if (NumStores > 1) {
+    // If we're going to generate more than one store, reset the lane length
+    // and sub-vector type to something legal.
+    LaneLen /= NumStores;
+    SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+
+    // We will compute the pointer operand of each store from the original base
+    // address using GEPs. Cast the base address to a pointer to the scalar
+    // element type.
+    BaseAddr = Builder.CreateBitCast(
+        BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
+                      SI->getPointerAddressSpace()));
+  }
+
+  assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
+
+  auto Mask = SVI->getShuffleMask();
+
+  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
+  Type *Tys[] = {Int8Ptr, SubVecTy};
   static const Intrinsic::ID StoreInts[3] = {Intrinsic::arm_neon_vst2,
                                              Intrinsic::arm_neon_vst3,
                                              Intrinsic::arm_neon_vst4};
-  SmallVector<Value *, 6> Ops;
 
-  Type *Int8Ptr = Builder.getInt8PtrTy(SI->getPointerAddressSpace());
-  Ops.push_back(Builder.CreateBitCast(SI->getPointerOperand(), Int8Ptr));
+  for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
 
-  Type *Tys[] = { Int8Ptr, SubVecTy };
-  Function *VstNFunc = Intrinsic::getDeclaration(
-      SI->getModule(), StoreInts[Factor - 2], Tys);
+    // If we're generating more than one store, we compute the base address of
+    // subsequent stores as an offset from the previous.
+    if (StoreCount > 0)
+      BaseAddr = Builder.CreateConstGEP1_32(BaseAddr, LaneLen * Factor);
 
-  // Split the shufflevector operands into sub vectors for the new vstN call.
-  auto Mask = SVI->getShuffleMask();
-  for (unsigned i = 0; i < Factor; i++) {
-    if (Mask[i] >= 0) {
-      Ops.push_back(Builder.CreateShuffleVector(
-        Op0, Op1, getSequentialMask(Builder, Mask[i], LaneLen)));
-    } else {
-      unsigned StartMask = 0;
-      for (unsigned j = 1; j < LaneLen; j++) {
-        if (Mask[j*Factor + i] >= 0) {
-          StartMask = Mask[j*Factor + i] - j;
-          break;
+    SmallVector<Value *, 6> Ops;
+    Ops.push_back(Builder.CreateBitCast(BaseAddr, Int8Ptr));
+
+    Function *VstNFunc =
+        Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
+
+    // Split the shufflevector operands into sub vectors for the new vstN call.
+    for (unsigned i = 0; i < Factor; i++) {
+      unsigned IdxI = StoreCount * LaneLen * Factor + i;
+      if (Mask[IdxI] >= 0) {
+        Ops.push_back(Builder.CreateShuffleVector(
+            Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+      } else {
+        unsigned StartMask = 0;
+        for (unsigned j = 1; j < LaneLen; j++) {
+          unsigned IdxJ = (StoreCount * LaneLen + j) * Factor + i;
+          if (Mask[IdxJ] >= 0) { // first defined lane of field i in this store
+            StartMask = Mask[IdxJ] - j;
+            break;
+          }
         }
+        // Note: If all elements in a chunk are undefs, StartMask=0!
+        // Note: Filling undef gaps with random elements is ok, since
+        // those elements were being written anyway (with undefs).
+        // In the case of all undefs we're defaulting to using elems from 0
+        // Note: StartMask cannot be negative, it's checked in
+        // isReInterleaveMask
+        Ops.push_back(Builder.CreateShuffleVector(
+            Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
       }
-      // Note: If all elements in a chunk are undefs, StartMask=0!
-      // Note: Filling undef gaps with random elements is ok, since
-      // those elements were being written anyway (with undefs).
-      // In the case of all undefs we're defaulting to using elems from 0
-      // Note: StartMask cannot be negative, it's checked in isReInterleaveMask
-      Ops.push_back(Builder.CreateShuffleVector(
-        Op0, Op1, getSequentialMask(Builder, StartMask, LaneLen)));
     }
-  }
 
-  Ops.push_back(Builder.getInt32(SI->getAlignment()));
-  Builder.CreateCall(VstNFunc, Ops);
+    Ops.push_back(Builder.getInt32(SI->getAlignment()));
+    Builder.CreateCall(VstNFunc, Ops);
+  }
   return true;
 }
 
diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h
index 84c6eb845bb898cea953b9f804a611604a5202f6..70a0b1380ec98bfdfb637ea2769cbfbaa5965877 100644
--- a/lib/Target/ARM/ARMISelLowering.h
+++ b/lib/Target/ARM/ARMISelLowering.h
@@ -175,9 +175,15 @@ class InstrItineraryData;
       VMULLs,       // ...signed
       VMULLu,       // ...unsigned
 
+      SMULWB,       // Signed multiply word by half word, bottom
+      SMULWT,       // Signed multiply word by half word, top
       UMLAL,        // 64bit Unsigned Accumulate Multiply
       SMLAL,        // 64bit Signed Accumulate Multiply
       UMAAL,        // 64-bit Unsigned Accumulate Accumulate Multiply
+      SMLALBB,      // 64-bit signed accumulate multiply bottom, bottom 16
+      SMLALBT,      // 64-bit signed accumulate multiply bottom, top 16
+      SMLALTB,      // 64-bit signed accumulate multiply top, bottom 16
+      SMLALTT,      // 64-bit signed accumulate multiply top, top 16
 
       // Operands of the standard BUILD_VECTOR node are not legalized, which
       // is fine if BUILD_VECTORs are always lowered to shuffles or other
@@ -346,6 +352,7 @@ class InstrItineraryData;
 
     void computeKnownBitsForTargetNode(const SDValue Op, APInt &KnownZero,
                                        APInt &KnownOne,
+                                       const APInt &DemandedElts,
                                        const SelectionDAG &DAG,
                                        unsigned Depth) const override;
 
@@ -500,9 +507,18 @@ class InstrItineraryData;
     bool canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
                                    unsigned &Cost) const override;
 
+    bool canMergeStoresTo(EVT MemVT) const override {
+      // Do not merge to larger than i32.
+      return (MemVT.getSizeInBits() <= 32);
+    }
+
     bool isCheapToSpeculateCttz() const override;
     bool isCheapToSpeculateCtlz() const override;
 
+    bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
+      return VT.isScalarInteger();
+    }
+
     bool supportSwiftError() const override {
       return true;
     }
@@ -514,6 +530,17 @@ class InstrItineraryData;
     CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool isVarArg) const;
     CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC, bool isVarArg) const;
 
+    /// Returns true if \p VecTy is a legal interleaved access type. This
+    /// function checks the vector element type and the overall width of the
+    /// vector.
+    bool isLegalInterleavedAccessType(VectorType *VecTy,
+                                      const DataLayout &DL) const;
+
+    /// Returns the number of interleaved accesses that will be generated when
+    /// lowering accesses of the given type.
+    unsigned getNumInterleavedAccesses(VectorType *VecTy,
+                                       const DataLayout &DL) const;
+
   protected:
     std::pair<const TargetRegisterClass *, uint8_t>
     findRepresentativeClass(const TargetRegisterInfo *TRI,
@@ -698,7 +725,7 @@ class InstrItineraryData;
     SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
                       SDValue &ARMcc, SelectionDAG &DAG, const SDLoc &dl) const;
     SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
-                      const SDLoc &dl) const;
+                      const SDLoc &dl, bool InvalidOnQNaN) const;
     SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const;
 
     SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td
index 488439fc24e0da58b524b477d0825283c45bf919..1bbe7f0d275eda27b8eefa806f52f4173025b58a 100644
--- a/lib/Target/ARM/ARMInstrFormats.td
+++ b/lib/Target/ARM/ARMInstrFormats.td
@@ -184,7 +184,7 @@ def s_cc_out : OptionalDefOperand<OtherVT, (ops CCR), (ops (i32 CPSR))> {
 
 // ARM special operands for disassembly only.
 //
-def SetEndAsmOperand : ImmAsmOperand {
+def SetEndAsmOperand : ImmAsmOperand<0,1> {
   let Name = "SetEndImm";
   let ParserMethod = "parseSetEndImm";
 }
@@ -221,25 +221,25 @@ def banked_reg : Operand<i32> {
 //     16       imm6<5:4> = '01', 16 - <imm> is encoded in imm6<3:0>
 //     32       imm6<5> = '1', 32 - <imm> is encoded in imm6<4:0>
 //     64       64 - <imm> is encoded in imm6<5:0>
-def shr_imm8_asm_operand : ImmAsmOperand { let Name = "ShrImm8"; }
+def shr_imm8_asm_operand : ImmAsmOperand<1,8> { let Name = "ShrImm8"; }
 def shr_imm8  : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 8; }]> {
   let EncoderMethod = "getShiftRight8Imm";
   let DecoderMethod = "DecodeShiftRight8Imm";
   let ParserMatchClass = shr_imm8_asm_operand;
 }
-def shr_imm16_asm_operand : ImmAsmOperand { let Name = "ShrImm16"; }
+def shr_imm16_asm_operand : ImmAsmOperand<1,16> { let Name = "ShrImm16"; }
 def shr_imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 16; }]> {
   let EncoderMethod = "getShiftRight16Imm";
   let DecoderMethod = "DecodeShiftRight16Imm";
   let ParserMatchClass = shr_imm16_asm_operand;
 }
-def shr_imm32_asm_operand : ImmAsmOperand { let Name = "ShrImm32"; }
+def shr_imm32_asm_operand : ImmAsmOperand<1,32> { let Name = "ShrImm32"; }
 def shr_imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 32; }]> {
   let EncoderMethod = "getShiftRight32Imm";
   let DecoderMethod = "DecodeShiftRight32Imm";
   let ParserMatchClass = shr_imm32_asm_operand;
 }
-def shr_imm64_asm_operand : ImmAsmOperand { let Name = "ShrImm64"; }
+def shr_imm64_asm_operand : ImmAsmOperand<1,64> { let Name = "ShrImm64"; }
 def shr_imm64 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm <= 64; }]> {
   let EncoderMethod = "getShiftRight64Imm";
   let DecoderMethod = "DecodeShiftRight64Imm";
@@ -261,10 +261,19 @@ def const_pool_asm_imm : Operand<i32> {
 // Note: When EmitPriority == 1, the alias will be used for printing
 class ARMInstAlias<string Asm, dag Result, bit EmitPriority = 0>
       : InstAlias<Asm, Result, EmitPriority>, Requires<[IsARM]>;
+class ARMInstSubst<string Asm, dag Result, bit EmitPriority = 0>
+      : InstAlias<Asm, Result, EmitPriority>,
+        Requires<[IsARM,UseNegativeImmediates]>;
 class  tInstAlias<string Asm, dag Result, bit EmitPriority = 0>
       : InstAlias<Asm, Result, EmitPriority>, Requires<[IsThumb]>;
+class  tInstSubst<string Asm, dag Result, bit EmitPriority = 0>
+      : InstAlias<Asm, Result, EmitPriority>,
+        Requires<[IsThumb,UseNegativeImmediates]>;
 class t2InstAlias<string Asm, dag Result, bit EmitPriority = 0>
       : InstAlias<Asm, Result, EmitPriority>, Requires<[IsThumb2]>;
+class t2InstSubst<string Asm, dag Result, bit EmitPriority = 0>
+      : InstAlias<Asm, Result, EmitPriority>,
+        Requires<[IsThumb2,UseNegativeImmediates]>;
 class VFP2InstAlias<string Asm, dag Result, bit EmitPriority = 0>
       : InstAlias<Asm, Result, EmitPriority>, Requires<[HasVFP2]>;
 class VFP2DPInstAlias<string Asm, dag Result, bit EmitPriority = 0>
@@ -948,7 +957,7 @@ class ADivA1I<bits<3> opcod, dag oops, dag iops,
 }
 
 // PKH instructions
-def PKHLSLAsmOperand : ImmAsmOperand {
+def PKHLSLAsmOperand : ImmAsmOperand<0,31> {
   let Name = "PKHLSLImm";
   let ParserMethod = "parsePKHLSLImm";
 }
@@ -1013,9 +1022,6 @@ class Thumb2DSPPat<dag pattern, dag result> : Pat<pattern, result> {
 class Thumb2DSPMulPat<dag pattern, dag result> : Pat<pattern, result> {
   list<Predicate> Predicates = [IsThumb2, UseMulOps, HasDSP];
 }
-class Thumb2ExtractPat<dag pattern, dag result> : Pat<pattern, result> {
-  list<Predicate> Predicates = [IsThumb2, HasT2ExtractPack];
-}
 //===----------------------------------------------------------------------===//
 // Thumb Instruction Format Definitions.
 //
diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td
index c47393990e977cdc252f0460883eb0a51003924f..cc0e7d4d9c3595fb4c3d5f21b9c792246e3cd8f7 100644
--- a/lib/Target/ARM/ARMInstrInfo.td
+++ b/lib/Target/ARM/ARMInstrInfo.td
@@ -51,6 +51,8 @@ def SDT_ARMAnd     : SDTypeProfile<1, 2,
                                     SDTCisVT<2, i32>]>;
 
 def SDT_ARMCmp     : SDTypeProfile<0, 2, [SDTCisSameAs<0, 1>]>;
+def SDT_ARMFCmp    : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>,
+                                          SDTCisVT<2, i32>]>;
 
 def SDT_ARMPICAdd  : SDTypeProfile<1, 2, [SDTCisSameAs<0, 1>,
                                           SDTCisPtrTy<1>, SDTCisVT<2, i32>]>;
@@ -90,6 +92,13 @@ def SDTBinaryArithWithFlagsInOut : SDTypeProfile<2, 3,
                                              SDTCisVT<1, i32>,
                                              SDTCisVT<4, i32>]>;
 
+def SDT_LongMac  : SDTypeProfile<2, 4, [SDTCisVT<0, i32>,
+                                        SDTCisSameAs<0, 1>,
+                                        SDTCisSameAs<0, 2>,
+                                        SDTCisSameAs<0, 3>,
+                                        SDTCisSameAs<0, 4>,
+                                        SDTCisSameAs<0, 5>]>;
+
 // Node definitions.
 def ARMWrapper       : SDNode<"ARMISD::Wrapper",     SDTIntUnaryOp>;
 def ARMWrapperPIC    : SDNode<"ARMISD::WrapperPIC",  SDTIntUnaryOp>;
@@ -181,6 +190,13 @@ def ARMmemcopy : SDNode<"ARMISD::MEMCPY", SDT_ARMMEMCPY,
                         [SDNPHasChain, SDNPInGlue, SDNPOutGlue,
                          SDNPMayStore, SDNPMayLoad]>;
 
+def ARMsmulwb       : SDNode<"ARMISD::SMULWB", SDTIntBinOp, []>;
+def ARMsmulwt       : SDNode<"ARMISD::SMULWT", SDTIntBinOp, []>;
+def ARMsmlalbb      : SDNode<"ARMISD::SMLALBB", SDT_LongMac, []>;
+def ARMsmlalbt      : SDNode<"ARMISD::SMLALBT", SDT_LongMac, []>;
+def ARMsmlaltb      : SDNode<"ARMISD::SMLALTB", SDT_LongMac, []>;
+def ARMsmlaltt      : SDNode<"ARMISD::SMLALTT", SDT_LongMac, []>;
+
 //===----------------------------------------------------------------------===//
 // ARM Instruction Predicate Definitions.
 //
@@ -247,9 +263,6 @@ def HasDivide        : Predicate<"Subtarget->hasDivide()">,
                                  AssemblerPredicate<"FeatureHWDiv", "divide in THUMB">;
 def HasDivideInARM   : Predicate<"Subtarget->hasDivideInARMMode()">,
                                  AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">;
-def HasT2ExtractPack : Predicate<"Subtarget->hasT2ExtractPack()">,
-                                 AssemblerPredicate<"FeatureT2XtPk",
-                                                     "pack/extract">;
 def HasDSP           : Predicate<"Subtarget->hasDSP()">,
                                  AssemblerPredicate<"FeatureDSP", "dsp">;
 def HasDB            : Predicate<"Subtarget->hasDataBarrier()">,
@@ -298,6 +311,11 @@ def UseNaClTrap      : Predicate<"Subtarget->useNaClTrap()">,
                                  AssemblerPredicate<"FeatureNaClTrap", "NaCl">;
 def DontUseNaClTrap  : Predicate<"!Subtarget->useNaClTrap()">;
 
+def UseNegativeImmediates :
+  Predicate<"false">,
+            AssemblerPredicate<"!FeatureNoNegativeImmediates",
+                               "NegativeImmediates">;
+
 // FIXME: Eventually this will be just "hasV6T2Ops".
 def UseMovt          : Predicate<"Subtarget->useMovt(*MF)">;
 def DontUseMovt      : Predicate<"!Subtarget->useMovt(*MF)">;
@@ -423,7 +441,16 @@ def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
 //
 
 // Immediate operands with a shared generic asm render method.
-class ImmAsmOperand : AsmOperandClass { let RenderMethod = "addImmOperands"; }
+class ImmAsmOperand<int Low, int High> : AsmOperandClass {
+  let RenderMethod = "addImmOperands";
+  let PredicateMethod = "isImmediate<" # Low # "," # High # ">";
+  let DiagnosticType = "ImmRange" # Low # "_" # High;
+}
+
+class ImmAsmOperandMinusOne<int Low, int High> : AsmOperandClass {
+  let PredicateMethod = "isImmediate<" # Low # "," # High # ">";
+  let DiagnosticType = "ImmRange" # Low # "_" # High;
+}
 
 // Operands that are part of a memory addressing mode.
 class MemOperand : Operand<i32> { let OperandType = "OPERAND_MEMORY"; }
@@ -645,35 +672,45 @@ def arm_i32imm : PatLeaf<(imm), [{
 }]>;
 
 /// imm0_1 predicate - Immediate in the range [0,1].
-def Imm0_1AsmOperand: ImmAsmOperand { let Name = "Imm0_1"; }
+def Imm0_1AsmOperand: ImmAsmOperand<0,1> { let Name = "Imm0_1"; }
 def imm0_1 : Operand<i32> { let ParserMatchClass = Imm0_1AsmOperand; }
 
 /// imm0_3 predicate - Immediate in the range [0,3].
-def Imm0_3AsmOperand: ImmAsmOperand { let Name = "Imm0_3"; }
+def Imm0_3AsmOperand: ImmAsmOperand<0,3> { let Name = "Imm0_3"; }
 def imm0_3 : Operand<i32> { let ParserMatchClass = Imm0_3AsmOperand; }
 
 /// imm0_7 predicate - Immediate in the range [0,7].
-def Imm0_7AsmOperand: ImmAsmOperand { let Name = "Imm0_7"; }
+def Imm0_7AsmOperand: ImmAsmOperand<0,7> {
+  let Name = "Imm0_7";
+}
 def imm0_7 : Operand<i32>, ImmLeaf<i32, [{
   return Imm >= 0 && Imm < 8;
 }]> {
   let ParserMatchClass = Imm0_7AsmOperand;
 }
 
+/// imm8_255 predicate - Immediate in the range [8,255].
+def Imm8_255AsmOperand: ImmAsmOperand<8,255> { let Name = "Imm8_255"; }
+def imm8_255 : Operand<i32>, ImmLeaf<i32, [{
+  return Imm >= 8 && Imm < 256;
+}]> {
+  let ParserMatchClass = Imm8_255AsmOperand;
+}
+
 /// imm8 predicate - Immediate is exactly 8.
-def Imm8AsmOperand: ImmAsmOperand { let Name = "Imm8"; }
+def Imm8AsmOperand: ImmAsmOperand<8,8> { let Name = "Imm8"; }
 def imm8 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 8; }]> {
   let ParserMatchClass = Imm8AsmOperand;
 }
 
 /// imm16 predicate - Immediate is exactly 16.
-def Imm16AsmOperand: ImmAsmOperand { let Name = "Imm16"; }
+def Imm16AsmOperand: ImmAsmOperand<16,16> { let Name = "Imm16"; }
 def imm16 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 16; }]> {
   let ParserMatchClass = Imm16AsmOperand;
 }
 
 /// imm32 predicate - Immediate is exactly 32.
-def Imm32AsmOperand: ImmAsmOperand { let Name = "Imm32"; }
+def Imm32AsmOperand: ImmAsmOperand<32,32> { let Name = "Imm32"; }
 def imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 32; }]> {
   let ParserMatchClass = Imm32AsmOperand;
 }
@@ -681,25 +718,25 @@ def imm32 : Operand<i32>, ImmLeaf<i32, [{ return Imm == 32; }]> {
 def imm8_or_16 : ImmLeaf<i32, [{ return Imm == 8 || Imm == 16;}]>;
 
 /// imm1_7 predicate - Immediate in the range [1,7].
-def Imm1_7AsmOperand: ImmAsmOperand { let Name = "Imm1_7"; }
+def Imm1_7AsmOperand: ImmAsmOperand<1,7> { let Name = "Imm1_7"; }
 def imm1_7 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 8; }]> {
   let ParserMatchClass = Imm1_7AsmOperand;
 }
 
 /// imm1_15 predicate - Immediate in the range [1,15].
-def Imm1_15AsmOperand: ImmAsmOperand { let Name = "Imm1_15"; }
+def Imm1_15AsmOperand: ImmAsmOperand<1,15> { let Name = "Imm1_15"; }
 def imm1_15 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 16; }]> {
   let ParserMatchClass = Imm1_15AsmOperand;
 }
 
 /// imm1_31 predicate - Immediate in the range [1,31].
-def Imm1_31AsmOperand: ImmAsmOperand { let Name = "Imm1_31"; }
+def Imm1_31AsmOperand: ImmAsmOperand<1,31> { let Name = "Imm1_31"; }
 def imm1_31 : Operand<i32>, ImmLeaf<i32, [{ return Imm > 0 && Imm < 32; }]> {
   let ParserMatchClass = Imm1_31AsmOperand;
 }
 
 /// imm0_15 predicate - Immediate in the range [0,15].
-def Imm0_15AsmOperand: ImmAsmOperand {
+def Imm0_15AsmOperand: ImmAsmOperand<0,15> {
   let Name = "Imm0_15";
   let DiagnosticType = "ImmRange0_15";
 }
@@ -710,7 +747,7 @@ def imm0_15 : Operand<i32>, ImmLeaf<i32, [{
 }
 
 /// imm0_31 predicate - True if the 32-bit immediate is in the range [0,31].
-def Imm0_31AsmOperand: ImmAsmOperand { let Name = "Imm0_31"; }
+def Imm0_31AsmOperand: ImmAsmOperand<0,31> { let Name = "Imm0_31"; }
 def imm0_31 : Operand<i32>, ImmLeaf<i32, [{
   return Imm >= 0 && Imm < 32;
 }]> {
@@ -718,15 +755,15 @@ def imm0_31 : Operand<i32>, ImmLeaf<i32, [{
 }
 
 /// imm0_32 predicate - True if the 32-bit immediate is in the range [0,32].
-def Imm0_32AsmOperand: ImmAsmOperand { let Name = "Imm0_32"; }
+def Imm0_32AsmOperand: ImmAsmOperand<0,32> { let Name = "Imm0_32"; }
 def imm0_32 : Operand<i32>, ImmLeaf<i32, [{
-  return Imm >= 0 && Imm < 32;
+  return Imm >= 0 && Imm < 33;
 }]> {
   let ParserMatchClass = Imm0_32AsmOperand;
 }
 
 /// imm0_63 predicate - True if the 32-bit immediate is in the range [0,63].
-def Imm0_63AsmOperand: ImmAsmOperand { let Name = "Imm0_63"; }
+def Imm0_63AsmOperand: ImmAsmOperand<0,63> { let Name = "Imm0_63"; }
 def imm0_63 : Operand<i32>, ImmLeaf<i32, [{
   return Imm >= 0 && Imm < 64;
 }]> {
@@ -734,7 +771,7 @@ def imm0_63 : Operand<i32>, ImmLeaf<i32, [{
 }
 
 /// imm0_239 predicate - Immediate in the range [0,239].
-def Imm0_239AsmOperand : ImmAsmOperand {
+def Imm0_239AsmOperand : ImmAsmOperand<0,239> {
   let Name = "Imm0_239";
   let DiagnosticType = "ImmRange0_239";
 }
@@ -743,13 +780,13 @@ def imm0_239 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 240; }]> {
 }
 
 /// imm0_255 predicate - Immediate in the range [0,255].
-def Imm0_255AsmOperand : ImmAsmOperand { let Name = "Imm0_255"; }
+def Imm0_255AsmOperand : ImmAsmOperand<0,255> { let Name = "Imm0_255"; }
 def imm0_255 : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm < 256; }]> {
   let ParserMatchClass = Imm0_255AsmOperand;
 }
 
-/// imm0_65535 - An immediate is in the range [0.65535].
-def Imm0_65535AsmOperand: ImmAsmOperand { let Name = "Imm0_65535"; }
+/// imm0_65535 - An immediate is in the range [0,65535].
+def Imm0_65535AsmOperand: ImmAsmOperand<0,65535> { let Name = "Imm0_65535"; }
 def imm0_65535 : Operand<i32>, ImmLeaf<i32, [{
   return Imm >= 0 && Imm < 65536;
 }]> {
@@ -767,19 +804,23 @@ def imm0_65535_neg : Operand<i32>, ImmLeaf<i32, [{
 // FIXME: This really needs a Thumb version separate from the ARM version.
 // While the range is the same, and can thus use the same match class,
 // the encoding is different so it should have a different encoder method.
-def Imm0_65535ExprAsmOperand: ImmAsmOperand { let Name = "Imm0_65535Expr"; }
+def Imm0_65535ExprAsmOperand: AsmOperandClass {
+  let Name = "Imm0_65535Expr";
+  let RenderMethod = "addImmOperands";
+}
+
 def imm0_65535_expr : Operand<i32> {
   let EncoderMethod = "getHiLo16ImmOpValue";
   let ParserMatchClass = Imm0_65535ExprAsmOperand;
 }
 
-def Imm256_65535ExprAsmOperand: ImmAsmOperand { let Name = "Imm256_65535Expr"; }
+def Imm256_65535ExprAsmOperand: ImmAsmOperand<256,65535> { let Name = "Imm256_65535Expr"; }
 def imm256_65535_expr : Operand<i32> {
   let ParserMatchClass = Imm256_65535ExprAsmOperand;
 }
 
 /// imm24b - True if the 32-bit immediate is encodable in 24 bits.
-def Imm24bitAsmOperand: ImmAsmOperand { let Name = "Imm24bit"; }
+def Imm24bitAsmOperand: ImmAsmOperand<0,0xffffff> { let Name = "Imm24bit"; }
 def imm24b : Operand<i32>, ImmLeaf<i32, [{
   return Imm >= 0 && Imm <= 0xffffff;
 }]> {
@@ -808,7 +849,9 @@ def imm1_32_XFORM: SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, SDLoc(N),
                                    MVT::i32);
 }]>;
-def Imm1_32AsmOperand: AsmOperandClass { let Name = "Imm1_32"; }
+def Imm1_32AsmOperand: ImmAsmOperandMinusOne<1,32> {
+  let Name = "Imm1_32";
+}
 def imm1_32 : Operand<i32>, PatLeaf<(imm), [{
    uint64_t Imm = N->getZExtValue();
    return Imm > 0 && Imm <= 32;
@@ -822,7 +865,7 @@ def imm1_16_XFORM: SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant((int)N->getZExtValue() - 1, SDLoc(N),
                                    MVT::i32);
 }]>;
-def Imm1_16AsmOperand: AsmOperandClass { let Name = "Imm1_16"; }
+def Imm1_16AsmOperand: ImmAsmOperandMinusOne<1,16> { let Name = "Imm1_16"; }
 def imm1_16 : Operand<i32>, PatLeaf<(imm), [{ return Imm > 0 && Imm <= 16; }],
     imm1_16_XFORM> {
   let PrintMethod = "printImmPlusOneOperand";
@@ -3850,6 +3893,7 @@ def  MVNi  : AsI1<0b1111, (outs GPR:$Rd), (ins mod_imm:$imm), DPFrm,
   let Inst{11-0} = imm;
 }
 
+let AddedComplexity = 1 in
 def : ARMPat<(and   GPR:$src, mod_imm_not:$imm),
              (BICri GPR:$src, mod_imm_not:$imm)>;
 
@@ -3899,7 +3943,8 @@ def MUL : AsMul1I32<0b0000000, (outs GPRnopc:$Rd),
                     (ins GPRnopc:$Rn, GPRnopc:$Rm),
                     IIC_iMUL32, "mul", "\t$Rd, $Rn, $Rm",
                   [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))]>,
-                  Requires<[IsARM, HasV6]> {
+                  Requires<[IsARM, HasV6]>,
+         Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
   let Inst{15-12} = 0b0000;
   let Unpredictable{15-12} = 0b1111;
 }
@@ -3910,14 +3955,16 @@ def MULv5: ARMPseudoExpand<(outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm,
                            4, IIC_iMUL32,
                [(set GPRnopc:$Rd, (mul GPRnopc:$Rn, GPRnopc:$Rm))],
                (MUL GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, cc_out:$s)>,
-               Requires<[IsARM, NoV6, UseMulOps]>;
+               Requires<[IsARM, NoV6, UseMulOps]>,
+           Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
 }
 
 def MLA  : AsMul1I32<0b0000001, (outs GPRnopc:$Rd),
                      (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra),
                      IIC_iMAC32, "mla", "\t$Rd, $Rn, $Rm, $Ra",
         [(set GPRnopc:$Rd, (add (mul GPRnopc:$Rn, GPRnopc:$Rm), GPRnopc:$Ra))]>,
-                     Requires<[IsARM, HasV6, UseMulOps]> {
+                     Requires<[IsARM, HasV6, UseMulOps]>,
+        Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]> {
   bits<4> Ra;
   let Inst{15-12} = Ra;
 }
@@ -3928,12 +3975,14 @@ def MLAv5: ARMPseudoExpand<(outs GPRnopc:$Rd),
                             pred:$p, cc_out:$s), 4, IIC_iMAC32,
          [(set GPRnopc:$Rd, (add (mul GPRnopc:$Rn, GPRnopc:$Rm), GPRnopc:$Ra))],
   (MLA GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$Ra, pred:$p, cc_out:$s)>,
-                           Requires<[IsARM, NoV6]>;
+                           Requires<[IsARM, NoV6]>,
+           Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
 
 def MLS  : AMul1I<0b0000011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
                    IIC_iMAC32, "mls", "\t$Rd, $Rn, $Rm, $Ra",
                    [(set GPR:$Rd, (sub GPR:$Ra, (mul GPR:$Rn, GPR:$Rm)))]>,
-                   Requires<[IsARM, HasV6T2, UseMulOps]> {
+                   Requires<[IsARM, HasV6T2, UseMulOps]>,
+          Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]> {
   bits<4> Rd;
   bits<4> Rm;
   bits<4> Rn;
@@ -3949,26 +3998,38 @@ let hasSideEffects = 0 in {
 let isCommutable = 1 in {
 def SMULL : AsMul1I64<0b0000110, (outs GPR:$RdLo, GPR:$RdHi),
                                  (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
-                    "smull", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
-                    Requires<[IsARM, HasV6]>;
+                    "smull", "\t$RdLo, $RdHi, $Rn, $Rm",
+                    [(set GPR:$RdLo, GPR:$RdHi,
+                          (smullohi GPR:$Rn, GPR:$Rm))]>,
+                    Requires<[IsARM, HasV6]>,
+           Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]>;
 
 def UMULL : AsMul1I64<0b0000100, (outs GPR:$RdLo, GPR:$RdHi),
                                  (ins GPR:$Rn, GPR:$Rm), IIC_iMUL64,
-                    "umull", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
-                    Requires<[IsARM, HasV6]>;
+                    "umull", "\t$RdLo, $RdHi, $Rn, $Rm",
+                    [(set GPR:$RdLo, GPR:$RdHi,
+                          (umullohi GPR:$Rn, GPR:$Rm))]>,
+                    Requires<[IsARM, HasV6]>,
+           Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]>;
 
 let Constraints = "@earlyclobber $RdLo,@earlyclobber $RdHi" in {
 def SMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
                             (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
-                            4, IIC_iMUL64, [],
+                            4, IIC_iMUL64,
+                            [(set GPR:$RdLo, GPR:$RdHi,
+                                  (smullohi GPR:$Rn, GPR:$Rm))],
           (SMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
-                           Requires<[IsARM, NoV6]>;
+                           Requires<[IsARM, NoV6]>,
+              Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]>;
 
 def UMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
                             (ins GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s),
-                            4, IIC_iMUL64, [],
+                            4, IIC_iMUL64,
+                            [(set GPR:$RdLo, GPR:$RdHi,
+                                  (umullohi GPR:$Rn, GPR:$Rm))],
           (UMULL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, pred:$p, cc_out:$s)>,
-                           Requires<[IsARM, NoV6]>;
+                           Requires<[IsARM, NoV6]>,
+             Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]>;
 }
 }
 
@@ -3976,17 +4037,20 @@ def UMULLv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
 def SMLAL : AsMla1I64<0b0000111, (outs GPR:$RdLo, GPR:$RdHi),
                         (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
                     "smlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
-         RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
+         RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>,
+           Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
 def UMLAL : AsMla1I64<0b0000101, (outs GPR:$RdLo, GPR:$RdHi),
                         (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi), IIC_iMAC64,
                     "umlal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
-         RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>;
+         RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>,
+            Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
 
 def UMAAL : AMul1I <0b0000010, (outs GPR:$RdLo, GPR:$RdHi),
                                (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
                                IIC_iMAC64,
                     "umaal", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
-         RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]> {
+         RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">, Requires<[IsARM, HasV6]>,
+            Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]> {
   bits<4> RdLo;
   bits<4> RdHi;
   bits<4> Rm;
@@ -4004,13 +4068,15 @@ def SMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
                               4, IIC_iMAC64, [],
              (SMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
                            pred:$p, cc_out:$s)>,
-                           Requires<[IsARM, NoV6]>;
+                           Requires<[IsARM, NoV6]>,
+              Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
 def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
                 (ins GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi, pred:$p, cc_out:$s),
                               4, IIC_iMAC64, [],
              (UMLAL GPR:$RdLo, GPR:$RdHi, GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi,
                            pred:$p, cc_out:$s)>,
-                           Requires<[IsARM, NoV6]>;
+                           Requires<[IsARM, NoV6]>,
+              Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
 }
 
 } // hasSideEffects
@@ -4019,13 +4085,15 @@ def UMLALv5 : ARMPseudoExpand<(outs GPR:$RdLo, GPR:$RdHi),
 def SMMUL : AMul2I <0b0111010, 0b0001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
                IIC_iMUL32, "smmul", "\t$Rd, $Rn, $Rm",
                [(set GPR:$Rd, (mulhs GPR:$Rn, GPR:$Rm))]>,
-            Requires<[IsARM, HasV6]> {
+            Requires<[IsARM, HasV6]>,
+            Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
   let Inst{15-12} = 0b1111;
 }
 
 def SMMULR : AMul2I <0b0111010, 0b0011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
                IIC_iMUL32, "smmulr", "\t$Rd, $Rn, $Rm", []>,
-            Requires<[IsARM, HasV6]> {
+            Requires<[IsARM, HasV6]>,
+             Sched<[WriteMUL32, ReadMUL, ReadMUL]>  {
   let Inst{15-12} = 0b1111;
 }
 
@@ -4033,57 +4101,67 @@ def SMMLA : AMul2Ia <0b0111010, 0b0001, (outs GPR:$Rd),
                (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
                IIC_iMAC32, "smmla", "\t$Rd, $Rn, $Rm, $Ra",
                [(set GPR:$Rd, (add (mulhs GPR:$Rn, GPR:$Rm), GPR:$Ra))]>,
-            Requires<[IsARM, HasV6, UseMulOps]>;
+            Requires<[IsARM, HasV6, UseMulOps]>,
+            Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
 
 def SMMLAR : AMul2Ia <0b0111010, 0b0011, (outs GPR:$Rd),
                (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
                IIC_iMAC32, "smmlar", "\t$Rd, $Rn, $Rm, $Ra", []>,
-            Requires<[IsARM, HasV6]>;
+            Requires<[IsARM, HasV6]>,
+             Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
 
 def SMMLS : AMul2Ia <0b0111010, 0b1101, (outs GPR:$Rd),
                (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
                IIC_iMAC32, "smmls", "\t$Rd, $Rn, $Rm, $Ra", []>,
-            Requires<[IsARM, HasV6, UseMulOps]>;
+            Requires<[IsARM, HasV6, UseMulOps]>,
+            Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
 
 def SMMLSR : AMul2Ia <0b0111010, 0b1111, (outs GPR:$Rd),
                (ins GPR:$Rn, GPR:$Rm, GPR:$Ra),
                IIC_iMAC32, "smmlsr", "\t$Rd, $Rn, $Rm, $Ra", []>,
-            Requires<[IsARM, HasV6]>;
+            Requires<[IsARM, HasV6]>,
+             Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
 
 multiclass AI_smul<string opc> {
   def BB : AMulxyI<0b0001011, 0b00, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
               IIC_iMUL16, !strconcat(opc, "bb"), "\t$Rd, $Rn, $Rm",
               [(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16),
                                       (sext_inreg GPR:$Rm, i16)))]>,
-           Requires<[IsARM, HasV5TE]>;
+           Requires<[IsARM, HasV5TE]>,
+           Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
 
   def BT : AMulxyI<0b0001011, 0b10, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
               IIC_iMUL16, !strconcat(opc, "bt"), "\t$Rd, $Rn, $Rm",
               [(set GPR:$Rd, (mul (sext_inreg GPR:$Rn, i16),
                                       (sra GPR:$Rm, (i32 16))))]>,
-           Requires<[IsARM, HasV5TE]>;
+           Requires<[IsARM, HasV5TE]>,
+           Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
 
   def TB : AMulxyI<0b0001011, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
               IIC_iMUL16, !strconcat(opc, "tb"), "\t$Rd, $Rn, $Rm",
               [(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)),
                                       (sext_inreg GPR:$Rm, i16)))]>,
-           Requires<[IsARM, HasV5TE]>;
+           Requires<[IsARM, HasV5TE]>,
+           Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
 
   def TT : AMulxyI<0b0001011, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
               IIC_iMUL16, !strconcat(opc, "tt"), "\t$Rd, $Rn, $Rm",
               [(set GPR:$Rd, (mul (sra GPR:$Rn, (i32 16)),
                                       (sra GPR:$Rm, (i32 16))))]>,
-            Requires<[IsARM, HasV5TE]>;
+            Requires<[IsARM, HasV5TE]>,
+           Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
 
   def WB : AMulxyI<0b0001001, 0b01, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
               IIC_iMUL16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm",
-              []>,
-           Requires<[IsARM, HasV5TE]>;
+              [(set GPR:$Rd, (ARMsmulwb GPR:$Rn, GPR:$Rm))]>,
+           Requires<[IsARM, HasV5TE]>,
+           Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
 
   def WT : AMulxyI<0b0001001, 0b11, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm),
               IIC_iMUL16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm",
-              []>,
-            Requires<[IsARM, HasV5TE]>;
+              [(set GPR:$Rd, (ARMsmulwt GPR:$Rn, GPR:$Rm))]>,
+            Requires<[IsARM, HasV5TE]>,
+           Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
 }
 
 
@@ -4095,7 +4173,8 @@ multiclass AI_smla<string opc> {
               [(set GPRnopc:$Rd, (add GPR:$Ra,
                                (mul (sext_inreg GPRnopc:$Rn, i16),
                                        (sext_inreg GPRnopc:$Rm, i16))))]>,
-           Requires<[IsARM, HasV5TE, UseMulOps]>;
+           Requires<[IsARM, HasV5TE, UseMulOps]>,
+           Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
 
   def BT : AMulxyIa<0b0001000, 0b10, (outs GPRnopc:$Rd),
               (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
@@ -4103,7 +4182,8 @@ multiclass AI_smla<string opc> {
               [(set GPRnopc:$Rd,
                     (add GPR:$Ra, (mul (sext_inreg GPRnopc:$Rn, i16),
                                           (sra GPRnopc:$Rm, (i32 16)))))]>,
-           Requires<[IsARM, HasV5TE, UseMulOps]>;
+           Requires<[IsARM, HasV5TE, UseMulOps]>,
+           Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
 
   def TB : AMulxyIa<0b0001000, 0b01, (outs GPRnopc:$Rd),
               (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
@@ -4111,7 +4191,8 @@ multiclass AI_smla<string opc> {
               [(set GPRnopc:$Rd,
                     (add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)),
                                           (sext_inreg GPRnopc:$Rm, i16))))]>,
-           Requires<[IsARM, HasV5TE, UseMulOps]>;
+           Requires<[IsARM, HasV5TE, UseMulOps]>,
+           Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
 
   def TT : AMulxyIa<0b0001000, 0b11, (outs GPRnopc:$Rd),
               (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
@@ -4119,19 +4200,24 @@ multiclass AI_smla<string opc> {
              [(set GPRnopc:$Rd,
                    (add GPR:$Ra, (mul (sra GPRnopc:$Rn, (i32 16)),
                                          (sra GPRnopc:$Rm, (i32 16)))))]>,
-            Requires<[IsARM, HasV5TE, UseMulOps]>;
+            Requires<[IsARM, HasV5TE, UseMulOps]>,
+            Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
 
   def WB : AMulxyIa<0b0001001, 0b00, (outs GPRnopc:$Rd),
               (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
               IIC_iMAC16, !strconcat(opc, "wb"), "\t$Rd, $Rn, $Rm, $Ra",
-              []>,
-           Requires<[IsARM, HasV5TE, UseMulOps]>;
+              [(set GPRnopc:$Rd,
+                    (add GPR:$Ra, (ARMsmulwb GPRnopc:$Rn, GPRnopc:$Rm)))]>,
+           Requires<[IsARM, HasV5TE, UseMulOps]>,
+           Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
 
   def WT : AMulxyIa<0b0001001, 0b10, (outs GPRnopc:$Rd),
               (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
               IIC_iMAC16, !strconcat(opc, "wt"), "\t$Rd, $Rn, $Rm, $Ra",
-              []>,
-            Requires<[IsARM, HasV5TE, UseMulOps]>;
+              [(set GPRnopc:$Rd,
+                    (add GPR:$Ra, (ARMsmulwt GPRnopc:$Rn, GPRnopc:$Rm)))]>,
+           Requires<[IsARM, HasV5TE, UseMulOps]>,
+           Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
   }
 }
 
@@ -4139,25 +4225,28 @@ defm SMUL : AI_smul<"smul">;
 defm SMLA : AI_smla<"smla">;
 
 // Halfword multiply accumulate long: SMLAL<x><y>.
-def SMLALBB : AMulxyI64<0b0001010, 0b00, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
-                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
-                      IIC_iMAC64, "smlalbb", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
-              Requires<[IsARM, HasV5TE]>;
-
-def SMLALBT : AMulxyI64<0b0001010, 0b10, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
-                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
-                      IIC_iMAC64, "smlalbt", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
-              Requires<[IsARM, HasV5TE]>;
-
-def SMLALTB : AMulxyI64<0b0001010, 0b01, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
-                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
-                      IIC_iMAC64, "smlaltb", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
-              Requires<[IsARM, HasV5TE]>;
-
-def SMLALTT : AMulxyI64<0b0001010, 0b11, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
-                      (ins GPRnopc:$Rn, GPRnopc:$Rm),
-                      IIC_iMAC64, "smlaltt", "\t$RdLo, $RdHi, $Rn, $Rm", []>,
-              Requires<[IsARM, HasV5TE]>;
+class SMLAL<bits<2> opc1, string asm>
+ : AMulxyI64<0b0001010, opc1,
+        (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
+        (ins GPRnopc:$Rn, GPRnopc:$Rm, GPRnopc:$RLo, GPRnopc:$RHi),
+        IIC_iMAC64, asm, "\t$RdLo, $RdHi, $Rn, $Rm", []>,
+        RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">,
+        Requires<[IsARM, HasV5TE]>,
+        Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
+
+def SMLALBB : SMLAL<0b00, "smlalbb">;
+def SMLALBT : SMLAL<0b10, "smlalbt">;
+def SMLALTB : SMLAL<0b01, "smlaltb">;
+def SMLALTT : SMLAL<0b11, "smlaltt">;
+
+def : ARMV5TEPat<(ARMsmlalbb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+                 (SMLALBB $Rn, $Rm, $RLo, $RHi)>;
+def : ARMV5TEPat<(ARMsmlalbt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+                 (SMLALBT $Rn, $Rm, $RLo, $RHi)>;
+def : ARMV5TEPat<(ARMsmlaltb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+                 (SMLALTB $Rn, $Rm, $RLo, $RHi)>;
+def : ARMV5TEPat<(ARMsmlaltt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+                 (SMLALTT $Rn, $Rm, $RLo, $RHi)>;
 
 // Helper class for AI_smld.
 class AMulDualIbase<bit long, bit sub, bit swap, dag oops, dag iops,
@@ -4203,19 +4292,23 @@ multiclass AI_smld<bit sub, string opc> {
 
   def D : AMulDualIa<0, sub, 0, (outs GPRnopc:$Rd),
                   (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
-                  NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm, $Ra">;
+                  NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm, $Ra">,
+          Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
 
   def DX: AMulDualIa<0, sub, 1, (outs GPRnopc:$Rd),
                   (ins GPRnopc:$Rn, GPRnopc:$Rm, GPR:$Ra),
-                  NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm, $Ra">;
+                  NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm, $Ra">,
+          Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>;
 
   def LD: AMulDualI64<1, sub, 0, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
                   (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary,
-                  !strconcat(opc, "ld"), "\t$RdLo, $RdHi, $Rn, $Rm">;
+                  !strconcat(opc, "ld"), "\t$RdLo, $RdHi, $Rn, $Rm">,
+          Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
 
   def LDX : AMulDualI64<1, sub, 1, (outs GPRnopc:$RdLo, GPRnopc:$RdHi),
                   (ins GPRnopc:$Rn, GPRnopc:$Rm), NoItinerary,
-                  !strconcat(opc, "ldx"),"\t$RdLo, $RdHi, $Rn, $Rm">;
+                  !strconcat(opc, "ldx"),"\t$RdLo, $RdHi, $Rn, $Rm">,
+          Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
 
 }
 
@@ -4225,9 +4318,11 @@ defm SMLS : AI_smld<1, "smls">;
 multiclass AI_sdml<bit sub, string opc> {
 
   def D:AMulDualI<0, sub, 0, (outs GPRnopc:$Rd), (ins GPRnopc:$Rn, GPRnopc:$Rm),
-                  NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm">;
+                  NoItinerary, !strconcat(opc, "d"), "\t$Rd, $Rn, $Rm">,
+        Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
   def DX:AMulDualI<0, sub, 1, (outs GPRnopc:$Rd),(ins GPRnopc:$Rn, GPRnopc:$Rm),
-                  NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm">;
+                  NoItinerary, !strconcat(opc, "dx"), "\t$Rd, $Rn, $Rm">,
+         Sched<[WriteMUL32, ReadMUL, ReadMUL]>;
 }
 
 defm SMUA : AI_sdml<0, "smua">;
@@ -4239,12 +4334,14 @@ defm SMUS : AI_sdml<1, "smus">;
 def SDIV : ADivA1I<0b001, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
                    "sdiv", "\t$Rd, $Rn, $Rm",
                    [(set GPR:$Rd, (sdiv GPR:$Rn, GPR:$Rm))]>,
-           Requires<[IsARM, HasDivideInARM]>;
+           Requires<[IsARM, HasDivideInARM]>,
+           Sched<[WriteDIV]>;
 
 def UDIV : ADivA1I<0b011, (outs GPR:$Rd), (ins GPR:$Rn, GPR:$Rm), IIC_iDIV,
                    "udiv", "\t$Rd, $Rn, $Rm",
                    [(set GPR:$Rd, (udiv GPR:$Rn, GPR:$Rm))]>,
-           Requires<[IsARM, HasDivideInARM]>;
+           Requires<[IsARM, HasDivideInARM]>,
+           Sched<[WriteDIV]>;
 
 //===----------------------------------------------------------------------===//
 //  Misc. Arithmetic Instructions.
@@ -4831,14 +4928,15 @@ let AddedComplexity = 8 in {
   def : ARMPat<(atomic_store_release_32 addr_offset_none:$addr, GPR:$val), (STL  GPR:$val, addr_offset_none:$addr)>;
 }
 
-// SWP/SWPB are deprecated in V6/V7.
+// SWP/SWPB are deprecated in V6/V7 and optional in v7VE.
+// FIXME: Use InstAlias to generate LDREX/STREX pairs instead.
 let mayLoad = 1, mayStore = 1 in {
 def SWP : AIswp<0, (outs GPRnopc:$Rt),
                 (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swp", []>,
-                Requires<[PreV8]>;
+                Requires<[IsARM,PreV8]>;
 def SWPB: AIswp<1, (outs GPRnopc:$Rt),
                 (ins GPRnopc:$Rt2, addr_offset_none:$addr), "swpb", []>,
-                Requires<[PreV8]>;
+                Requires<[IsARM,PreV8]>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -4850,7 +4948,7 @@ def CDP : ABI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
             NoItinerary, "cdp", "\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
             [(int_arm_cdp imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
                           imm:$CRm, imm:$opc2)]>,
-            Requires<[PreV8]> {
+            Requires<[IsARM,PreV8]> {
   bits<4> opc1;
   bits<4> CRn;
   bits<4> CRd;
@@ -4872,7 +4970,7 @@ def CDP2 : ABXI<0b1110, (outs), (ins p_imm:$cop, imm0_15:$opc1,
                NoItinerary, "cdp2\t$cop, $opc1, $CRd, $CRn, $CRm, $opc2",
                [(int_arm_cdp2 imm:$cop, imm:$opc1, imm:$CRd, imm:$CRn,
                               imm:$CRm, imm:$opc2)]>,
-               Requires<[PreV8]> {
+               Requires<[IsARM,PreV8]> {
   let Inst{31-28} = 0b1111;
   bits<4> opc1;
   bits<4> CRn;
@@ -5048,13 +5146,13 @@ multiclass LdSt2Cop<bit load, bit Dbit, string asm, list<dag> pattern> {
 
 defm LDC   : LdStCop <1, 0, "ldc", [(int_arm_ldc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
 defm LDCL  : LdStCop <1, 1, "ldcl", [(int_arm_ldcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm LDC2  : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
-defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
+defm LDC2  : LdSt2Cop<1, 0, "ldc2", [(int_arm_ldc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
+defm LDC2L : LdSt2Cop<1, 1, "ldc2l", [(int_arm_ldc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
 
 defm STC   : LdStCop <0, 0, "stc", [(int_arm_stc imm:$cop, imm:$CRd, addrmode5:$addr)]>;
 defm STCL  : LdStCop <0, 1, "stcl", [(int_arm_stcl imm:$cop, imm:$CRd, addrmode5:$addr)]>;
-defm STC2  : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
-defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[PreV8]>;
+defm STC2  : LdSt2Cop<0, 0, "stc2", [(int_arm_stc2 imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
+defm STC2L : LdSt2Cop<0, 1, "stc2l", [(int_arm_stc2l imm:$cop, imm:$CRd, addrmode5:$addr)]>, Requires<[IsARM,PreV8]>;
 
 //===----------------------------------------------------------------------===//
 // Move between coprocessor and ARM core register.
@@ -5132,7 +5230,7 @@ def MCR2 : MovRCopro2<"mcr2", 0 /* from ARM core register to coprocessor */,
                            c_imm:$CRm, imm0_7:$opc2),
                       [(int_arm_mcr2 imm:$cop, imm:$opc1, GPR:$Rt, imm:$CRn,
                                      imm:$CRm, imm:$opc2)]>,
-                      Requires<[PreV8]>;
+                      Requires<[IsARM,PreV8]>;
 def : ARMInstAlias<"mcr2 $cop, $opc1, $Rt, $CRn, $CRm",
                    (MCR2 p_imm:$cop, imm0_7:$opc1, GPR:$Rt, c_imm:$CRn,
                          c_imm:$CRm, 0)>;
@@ -5140,7 +5238,7 @@ def MRC2 : MovRCopro2<"mrc2", 1 /* from coprocessor to ARM core register */,
                       (outs GPRwithAPSR:$Rt),
                       (ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
                            imm0_7:$opc2), []>,
-                      Requires<[PreV8]>;
+                      Requires<[IsARM,PreV8]>;
 def : ARMInstAlias<"mrc2 $cop, $opc1, $Rt, $CRn, $CRm",
                    (MRC2 GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
                          c_imm:$CRm, 0)>;
@@ -5183,7 +5281,7 @@ class MovRRCopro2<string opc, bit direction, dag oops, dag iops,
                   list<dag> pattern = []>
   : ABXI<0b1100, oops, iops, NoItinerary,
          !strconcat(opc, "\t$cop, $opc1, $Rt, $Rt2, $CRm"), pattern>,
-    Requires<[PreV8]> {
+    Requires<[IsARM,PreV8]> {
   let Inst{31-28} = 0b1111;
   let Inst{23-21} = 0b010;
   let Inst{20} = direction;
@@ -5525,20 +5623,26 @@ def : ARMPat<(extloadi16 addrmodepc:$addr), (PICLDRH addrmodepc:$addr)>;
 
 // smul* and smla*
 def : ARMV5TEPat<(mul sext_16_node:$a, sext_16_node:$b),
-                 (SMULBB GPR:$a, GPR:$b)>;
+                 (SMULBB GPR:$a, GPR:$b)>,
+      Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
 def : ARMV5TEPat<(mul sext_16_node:$a, (sra GPR:$b, (i32 16))),
-                 (SMULBT GPR:$a, GPR:$b)>;
+                 (SMULBT GPR:$a, GPR:$b)>,
+      Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
 def : ARMV5TEPat<(mul (sra GPR:$a, (i32 16)), sext_16_node:$b),
-                (SMULTB GPR:$a, GPR:$b)>;
+                (SMULTB GPR:$a, GPR:$b)>,
+      Sched<[WriteMUL16, ReadMUL, ReadMUL]>;
 def : ARMV5MOPat<(add GPR:$acc,
                        (mul sext_16_node:$a, sext_16_node:$b)),
-                 (SMLABB GPR:$a, GPR:$b, GPR:$acc)>;
+                 (SMLABB GPR:$a, GPR:$b, GPR:$acc)>,
+      Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
 def : ARMV5MOPat<(add GPR:$acc,
                        (mul sext_16_node:$a, (sra GPR:$b, (i32 16)))),
-                 (SMLABT GPR:$a, GPR:$b, GPR:$acc)>;
+                 (SMLABT GPR:$a, GPR:$b, GPR:$acc)>,
+      Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
 def : ARMV5MOPat<(add GPR:$acc,
                        (mul (sra GPR:$a, (i32 16)), sext_16_node:$b)),
-                 (SMLATB GPR:$a, GPR:$b, GPR:$acc)>;
+                 (SMLATB GPR:$a, GPR:$b, GPR:$acc)>,
+      Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>;
 
 // Pre-v7 uses MCR for synchronization barriers.
 def : ARMPat<(ARMMemBarrierMCR GPR:$zero), (MCR 15, 0, GPR:$zero, 7, 10, 5)>,
@@ -5717,33 +5821,49 @@ def : MnemonicAlias<"usubaddx", "usax">;
 
 // "mov Rd, mod_imm_not" can be handled via "mvn" in assembly, just like
 // for isel.
-def : ARMInstAlias<"mov${s}${p} $Rd, $imm",
+def : ARMInstSubst<"mov${s}${p} $Rd, $imm",
                    (MVNi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
-def : ARMInstAlias<"mvn${s}${p} $Rd, $imm",
+def : ARMInstSubst<"mvn${s}${p} $Rd, $imm",
                    (MOVi rGPR:$Rd, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
 // Same for AND <--> BIC
-def : ARMInstAlias<"bic${s}${p} $Rd, $Rn, $imm",
+def : ARMInstSubst<"bic${s}${p} $Rd, $Rn, $imm",
                    (ANDri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm,
                           pred:$p, cc_out:$s)>;
-def : ARMInstAlias<"bic${s}${p} $Rdn, $imm",
+def : ARMInstSubst<"bic${s}${p} $Rdn, $imm",
                    (ANDri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm,
                           pred:$p, cc_out:$s)>;
-def : ARMInstAlias<"and${s}${p} $Rd, $Rn, $imm",
+def : ARMInstSubst<"and${s}${p} $Rd, $Rn, $imm",
                    (BICri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm,
                           pred:$p, cc_out:$s)>;
-def : ARMInstAlias<"and${s}${p} $Rdn, $imm",
+def : ARMInstSubst<"and${s}${p} $Rdn, $imm",
                    (BICri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm,
                           pred:$p, cc_out:$s)>;
 
 // Likewise, "add Rd, mod_imm_neg" -> sub
-def : ARMInstAlias<"add${s}${p} $Rd, $Rn, $imm",
+def : ARMInstSubst<"add${s}${p} $Rd, $Rn, $imm",
                  (SUBri GPR:$Rd, GPR:$Rn, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
-def : ARMInstAlias<"add${s}${p} $Rd, $imm",
+def : ARMInstSubst<"add${s}${p} $Rd, $imm",
                  (SUBri GPR:$Rd, GPR:$Rd, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
+// Likewise, "sub Rd, mod_imm_neg" -> add
+def : ARMInstSubst<"sub${s}${p} $Rd, $Rn, $imm",
+                 (ADDri GPR:$Rd, GPR:$Rn, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstSubst<"sub${s}${p} $Rd, $imm",
+                 (ADDri GPR:$Rd, GPR:$Rd, mod_imm_neg:$imm, pred:$p, cc_out:$s)>;
+
+// Same for ADC <--> SBC via mod_imm_not
+def : ARMInstSubst<"adc${s}${p} $Rd, $Rn, $imm",
+                 (SBCri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstSubst<"adc${s}${p} $Rdn, $imm",
+                 (SBCri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstSubst<"sbc${s}${p} $Rd, $Rn, $imm",
+                 (ADCri GPR:$Rd, GPR:$Rn, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
+def : ARMInstSubst<"sbc${s}${p} $Rdn, $imm",
+                 (ADCri GPR:$Rdn, GPR:$Rdn, mod_imm_not:$imm, pred:$p, cc_out:$s)>;
+
 // Same for CMP <--> CMN via mod_imm_neg
-def : ARMInstAlias<"cmp${p} $Rd, $imm",
+def : ARMInstSubst<"cmp${p} $Rd, $imm",
                    (CMNri rGPR:$Rd, mod_imm_neg:$imm, pred:$p)>;
-def : ARMInstAlias<"cmn${p} $Rd, $imm",
+def : ARMInstSubst<"cmn${p} $Rd, $imm",
                    (CMPri rGPR:$Rd, mod_imm_neg:$imm, pred:$p)>;
 
 // The shifter forms of the MOV instruction are aliased to the ASR, LSL,
diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td
index b5fa8e999e2a9d908913b85ee9432b15f43b8559..681e235d78f08026631425e741afd3d1a8fb646e 100644
--- a/lib/Target/ARM/ARMInstrNEON.td
+++ b/lib/Target/ARM/ARMInstrNEON.td
@@ -7139,6 +7139,17 @@ let Predicates = [IsBE] in {
                         (f64 (IMPLICIT_DEF)), (i32 0)))), dsub_0)), dsub_0))>;
 }
 
+def : Pat<(v2i64 (concat_vectors DPR:$Dn, DPR:$Dm)),
+          (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
+def : Pat<(v4i32 (concat_vectors DPR:$Dn, DPR:$Dm)),
+          (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
+def : Pat<(v8i16 (concat_vectors DPR:$Dn, DPR:$Dm)),
+          (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
+def : Pat<(v16i8 (concat_vectors DPR:$Dn, DPR:$Dm)),
+          (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
+def : Pat<(v4f32 (concat_vectors DPR:$Dn, DPR:$Dm)),
+          (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
+
 //===----------------------------------------------------------------------===//
 // Assembler aliases
 //
diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td
index b0bfae4d30435f8553af6813451e2f205b43ff0f..f2f426e867014f040840b9f4331459a28a7dc9e8 100644
--- a/lib/Target/ARM/ARMInstrThumb.td
+++ b/lib/Target/ARM/ARMInstrThumb.td
@@ -19,7 +19,7 @@ def imm_sr_XFORM: SDNodeXForm<imm, [{
   unsigned Imm = N->getZExtValue();
   return CurDAG->getTargetConstant((Imm == 32 ? 0 : Imm), SDLoc(N), MVT::i32);
 }]>;
-def ThumbSRImmAsmOperand: AsmOperandClass { let Name = "ImmThumbSR"; }
+def ThumbSRImmAsmOperand: ImmAsmOperand<1,32> { let Name = "ImmThumbSR"; }
 def imm_sr : Operand<i32>, PatLeaf<(imm), [{
   uint64_t Imm = N->getZExtValue();
   return Imm > 0 && Imm <= 32;
@@ -28,22 +28,31 @@ def imm_sr : Operand<i32>, PatLeaf<(imm), [{
   let ParserMatchClass = ThumbSRImmAsmOperand;
 }
 
-def imm_comp_XFORM : SDNodeXForm<imm, [{
-  return CurDAG->getTargetConstant(~((uint32_t)N->getZExtValue()), SDLoc(N),
-                                   MVT::i32);
-}]>;
-
 def imm0_7_neg : PatLeaf<(i32 imm), [{
   return (uint32_t)-N->getZExtValue() < 8;
 }], imm_neg_XFORM>;
 
+def ThumbModImmNeg1_7AsmOperand : AsmOperandClass { let Name = "ThumbModImmNeg1_7"; }
+def mod_imm1_7_neg : Operand<i32>, PatLeaf<(imm), [{
+    unsigned Value = -(unsigned)N->getZExtValue();
+    return 0 < Value && Value < 8;
+  }], imm_neg_XFORM> {
+  let ParserMatchClass = ThumbModImmNeg1_7AsmOperand;
+}
+
+def ThumbModImmNeg8_255AsmOperand : AsmOperandClass { let Name = "ThumbModImmNeg8_255"; }
+def mod_imm8_255_neg : Operand<i32>, PatLeaf<(imm), [{
+    unsigned Value = -(unsigned)N->getZExtValue();
+    return 7 < Value && Value < 256;
+  }], imm_neg_XFORM> {
+  let ParserMatchClass = ThumbModImmNeg8_255AsmOperand;
+}
+
+
 def imm0_255_comp : PatLeaf<(i32 imm), [{
   return ~((uint32_t)N->getZExtValue()) < 256;
 }]>;
 
-def imm8_255 : ImmLeaf<i32, [{
-  return Imm >= 8 && Imm < 256;
-}]>;
 def imm8_255_neg : PatLeaf<(i32 imm), [{
   unsigned Val = -N->getZExtValue();
   return Val >= 8 && Val < 256;
@@ -407,9 +416,9 @@ def tSUBspi : T1pIt<(outs GPRsp:$Rdn), (ins GPRsp:$Rn, t_imm0_508s4:$imm),
   let DecoderMethod = "DecodeThumbAddSPImm";
 }
 
-def : tInstAlias<"add${p} sp, $imm",
+def : tInstSubst<"add${p} sp, $imm",
                  (tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>;
-def : tInstAlias<"add${p} sp, sp, $imm",
+def : tInstSubst<"add${p} sp, sp, $imm",
                  (tSUBspi SP, t_imm0_508s4_neg:$imm, pred:$p)>;
 
 // Can optionally specify SP as a three operand instruction.
@@ -910,7 +919,7 @@ let isAdd = 1 in {
   def tADC :                      // A8.6.2
     T1sItDPEncode<0b0101, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm), IIC_iALUr,
                   "adc", "\t$Rdn, $Rm",
-                  [(set tGPR:$Rdn, (adde tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
+                  []>, Sched<[WriteALU]>;
 
   // Add immediate
   def tADDi3 :                    // A8.6.4 T1
@@ -938,6 +947,43 @@ let isAdd = 1 in {
                   "add", "\t$Rd, $Rn, $Rm",
                   [(set tGPR:$Rd, (add tGPR:$Rn, tGPR:$Rm))]>, Sched<[WriteALU]>;
 
+  /// Similar to the above except these set the 's' bit so the
+  /// instruction modifies the CPSR register.
+  ///
+  /// These opcodes will be converted to the real non-S opcodes by
+  /// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
+  let hasPostISelHook = 1, Defs = [CPSR] in {
+    let isCommutable = 1 in
+    def tADCS : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                            2, IIC_iALUr,
+                            [(set tGPR:$Rdn, CPSR, (ARMadde tGPR:$Rn, tGPR:$Rm,
+                                                            CPSR))]>,
+                Requires<[IsThumb1Only]>,
+                Sched<[WriteALU]>;
+
+    def tADDSi3 : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
+                              2, IIC_iALUi,
+                              [(set tGPR:$Rd, CPSR, (ARMaddc tGPR:$Rm,
+                                                             imm0_7:$imm3))]>,
+                  Requires<[IsThumb1Only]>,
+                  Sched<[WriteALU]>;
+
+    def tADDSi8 : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, imm0_255:$imm8),
+                              2, IIC_iALUi,
+                              [(set tGPR:$Rdn, CPSR, (ARMaddc tGPR:$Rn,
+                                                      imm8_255:$imm8))]>,
+                  Requires<[IsThumb1Only]>,
+                  Sched<[WriteALU]>;
+
+    let isCommutable = 1 in
+    def tADDSrr : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
+                              2, IIC_iALUr,
+                              [(set tGPR:$Rd, CPSR, (ARMaddc tGPR:$Rn,
+                                                             tGPR:$Rm))]>,
+                  Requires<[IsThumb1Only]>,
+                  Sched<[WriteALU]>;
+  }
+
   let hasSideEffects = 0 in
   def tADDhirr : T1pIt<(outs GPR:$Rdn), (ins GPR:$Rn, GPR:$Rm), IIC_iALUr,
                        "add", "\t$Rdn, $Rm", []>,
@@ -951,6 +997,12 @@ let isAdd = 1 in {
   }
 }
 
+def : tInstSubst<"sub${s}${p} $rd, $rn, $imm",
+                 (tADDi3 tGPR:$rd, s_cc_out:$s, tGPR:$rn, mod_imm1_7_neg:$imm, pred:$p)>;
+def : tInstSubst<"sub${s}${p} $rdn, $imm",
+                 (tADDi8 tGPR:$rdn, s_cc_out:$s, mod_imm8_255_neg:$imm, pred:$p)>;
+
+
 // AND register
 let isCommutable = 1 in
 def tAND :                      // A8.6.12
@@ -1197,7 +1249,7 @@ def tSBC :                      // A8.6.151
   T1sItDPEncode<0b0110, (outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
                 IIC_iALUr,
                 "sbc", "\t$Rdn, $Rm",
-                [(set tGPR:$Rdn, (sube tGPR:$Rn, tGPR:$Rm))]>,
+                []>,
                 Sched<[WriteALU]>;
 
 // Subtract immediate
@@ -1218,6 +1270,14 @@ def tSUBi8 :                    // A8.6.210 T2
                     [(set tGPR:$Rdn, (add tGPR:$Rn, imm8_255_neg:$imm8))]>,
                     Sched<[WriteALU]>;
 
+def : tInstSubst<"add${s}${p} $rd, $rn, $imm",
+                 (tSUBi3 tGPR:$rd, s_cc_out:$s, tGPR:$rn, mod_imm1_7_neg:$imm, pred:$p)>;
+
+
+def : tInstSubst<"add${s}${p} $rdn, $imm",
+                 (tSUBi8 tGPR:$rdn, s_cc_out:$s, mod_imm8_255_neg:$imm, pred:$p)>;
+
+
 // Subtract register
 def tSUBrr :                    // A8.6.212
   T1sIGenEncode<0b01101, (outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
@@ -1226,6 +1286,41 @@ def tSUBrr :                    // A8.6.212
                 [(set tGPR:$Rd, (sub tGPR:$Rn, tGPR:$Rm))]>,
                 Sched<[WriteALU]>;
 
+/// Similar to the above except these set the 's' bit so the
+/// instruction modifies the CPSR register.
+///
+/// These opcodes will be converted to the real non-S opcodes by
+/// AdjustInstrPostInstrSelection after giving them an optional CPSR operand.
+let hasPostISelHook = 1, Defs = [CPSR] in {
+  def tSBCS : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, tGPR:$Rm),
+                          2, IIC_iALUr,
+                          [(set tGPR:$Rdn, CPSR, (ARMsube tGPR:$Rn, tGPR:$Rm,
+                                                          CPSR))]>,
+              Requires<[IsThumb1Only]>,
+              Sched<[WriteALU]>;
+
+  def tSUBSi3 : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rm, imm0_7:$imm3),
+                            2, IIC_iALUi,
+                            [(set tGPR:$Rd, CPSR, (ARMsubc tGPR:$Rm,
+                                                           imm0_7:$imm3))]>,
+                Requires<[IsThumb1Only]>,
+                Sched<[WriteALU]>;
+
+  def tSUBSi8 : tPseudoInst<(outs tGPR:$Rdn), (ins tGPR:$Rn, imm0_255:$imm8),
+                            2, IIC_iALUi,
+                            [(set tGPR:$Rdn, CPSR, (ARMsubc tGPR:$Rn,
+                                                            imm8_255:$imm8))]>,
+                Requires<[IsThumb1Only]>,
+                Sched<[WriteALU]>;
+
+  def tSUBSrr : tPseudoInst<(outs tGPR:$Rd), (ins tGPR:$Rn, tGPR:$Rm),
+                            2, IIC_iALUr,
+                            [(set tGPR:$Rd, CPSR, (ARMsubc tGPR:$Rn,
+                                                           tGPR:$Rm))]>,
+                Requires<[IsThumb1Only]>,
+                Sched<[WriteALU]>;
+}
+
 // Sign-extend byte
 def tSXTB :                     // A8.6.222
   T1pIMiscEncode<{0,0,1,0,0,1,?}, (outs tGPR:$Rd), (ins tGPR:$Rm),
@@ -1386,22 +1481,6 @@ def : T1Pat<(ARMcmpZ tGPR:$Rn, imm0_255:$imm8),
 def : T1Pat<(ARMcmpZ tGPR:$Rn, tGPR:$Rm),
             (tCMPr   tGPR:$Rn, tGPR:$Rm)>;
 
-// Add with carry
-def : T1Pat<(addc   tGPR:$lhs, imm0_7:$rhs),
-            (tADDi3 tGPR:$lhs, imm0_7:$rhs)>;
-def : T1Pat<(addc   tGPR:$lhs, imm8_255:$rhs),
-            (tADDi8 tGPR:$lhs, imm8_255:$rhs)>;
-def : T1Pat<(addc   tGPR:$lhs, tGPR:$rhs),
-            (tADDrr tGPR:$lhs, tGPR:$rhs)>;
-
-// Subtract with carry
-def : T1Pat<(addc   tGPR:$lhs, imm0_7_neg:$rhs),
-            (tSUBi3 tGPR:$lhs, imm0_7_neg:$rhs)>;
-def : T1Pat<(addc   tGPR:$lhs, imm8_255_neg:$rhs),
-            (tSUBi8 tGPR:$lhs, imm8_255_neg:$rhs)>;
-def : T1Pat<(subc   tGPR:$lhs, tGPR:$rhs),
-            (tSUBrr tGPR:$lhs, tGPR:$rhs)>;
-
 // Bswap 16 with load/store
 def : T1Pat<(srl (bswap (extloadi16 t_addrmode_is2:$addr)), (i32 16)),
             (tREV16 (tLDRHi t_addrmode_is2:$addr))>;
@@ -1547,7 +1626,7 @@ def : T1Pat<(i32 thumb_immshifted:$src),
                     (thumb_immshifted_shamt imm:$src))>;
 
 def : T1Pat<(i32 imm0_255_comp:$src),
-            (tMVN (tMOVi8 (imm_comp_XFORM imm:$src)))>;
+            (tMVN (tMOVi8 (imm_not_XFORM imm:$src)))>;
 
 def : T1Pat<(i32 imm256_510:$src),
             (tADDi8 (tMOVi8 255),
diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td
index 603d66403e65e0e9002ddec8373e96682c134a39..f5b673b78ad711aba64fd668d6adef10bda62a17 100644
--- a/lib/Target/ARM/ARMInstrThumb2.td
+++ b/lib/Target/ARM/ARMInstrThumb2.td
@@ -76,7 +76,11 @@ def t2_so_imm_notSext16_XFORM : SDNodeXForm<imm, [{
 // t2_so_imm - Match a 32-bit immediate operand, which is an
 // 8-bit immediate rotated by an arbitrary number of bits, or an 8-bit
 // immediate splatted into multiple bytes of the word.
-def t2_so_imm_asmoperand : ImmAsmOperand { let Name = "T2SOImm"; }
+def t2_so_imm_asmoperand : AsmOperandClass {
+  let Name = "T2SOImm";
+  let RenderMethod = "addImmOperands";
+
+}
 def t2_so_imm : Operand<i32>, ImmLeaf<i32, [{
     return ARM_AM::getT2SOImmVal(Imm) != -1;
   }]> {
@@ -110,15 +114,14 @@ def t2_so_imm_notSext : Operand<i32>, PatLeaf<(imm), [{
 
 // t2_so_imm_neg - Match an immediate that is a negation of a t2_so_imm.
 def t2_so_imm_neg_asmoperand : AsmOperandClass { let Name = "T2SOImmNeg"; }
-def t2_so_imm_neg : Operand<i32>, PatLeaf<(imm), [{
-  int64_t Value = -(int)N->getZExtValue();
-  return Value && ARM_AM::getT2SOImmVal(Value) != -1;
+def t2_so_imm_neg : Operand<i32>, ImmLeaf<i32, [{
+  return Imm && ARM_AM::getT2SOImmVal(-(uint32_t)Imm) != -1;
 }], t2_so_imm_neg_XFORM> {
   let ParserMatchClass = t2_so_imm_neg_asmoperand;
 }
 
-/// imm0_4095 predicate - True if the 32-bit immediate is in the range [0.4095].
-def imm0_4095_asmoperand: ImmAsmOperand { let Name = "Imm0_4095"; }
+/// imm0_4095 predicate - True if the 32-bit immediate is in the range [0,4095].
+def imm0_4095_asmoperand: ImmAsmOperand<0,4095> { let Name = "Imm0_4095"; }
 def imm0_4095 : Operand<i32>, ImmLeaf<i32, [{
   return Imm >= 0 && Imm < 4096;
 }]> {
@@ -139,7 +142,7 @@ def imm1_255_neg : PatLeaf<(i32 imm), [{
 
 def imm0_255_not : PatLeaf<(i32 imm), [{
   return (uint32_t)(~N->getZExtValue()) < 255;
-}], imm_comp_XFORM>;
+}], imm_not_XFORM>;
 
 def lo5AllOne : PatLeaf<(i32 imm), [{
   // Returns true if all low 5-bits are 1.
@@ -538,7 +541,8 @@ class T2FourReg<dag oops, dag iops, InstrItinClass itin,
 class T2MulLong<bits<3> opc22_20, bits<4> opc7_4,
                 string opc, list<dag> pattern>
   : T2I<(outs rGPR:$RdLo, rGPR:$RdHi), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL64,
-         opc, "\t$RdLo, $RdHi, $Rn, $Rm", pattern> {
+         opc, "\t$RdLo, $RdHi, $Rn, $Rm", pattern>,
+    Sched<[WriteMUL64Lo, WriteMUL64Hi, ReadMUL, ReadMUL]> {
   bits<4> RdLo;
   bits<4> RdHi;
   bits<4> Rn;
@@ -556,7 +560,8 @@ class T2MlaLong<bits<3> opc22_20, bits<4> opc7_4, string opc>
   : T2I<(outs rGPR:$RdLo, rGPR:$RdHi),
         (ins rGPR:$Rn, rGPR:$Rm, rGPR:$RLo, rGPR:$RHi), IIC_iMAC64,
         opc, "\t$RdLo, $RdHi, $Rn, $Rm", []>,
-        RegConstraint<"$RLo = $RdLo, $RHi = $RdHi"> {
+        RegConstraint<"$RLo = $RdLo, $RHi = $RdHi">,
+    Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]> {
   bits<4> RdLo;
   bits<4> RdHi;
   bits<4> Rn;
@@ -977,7 +982,8 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
                   PatFrag opnode> {
   def i12 : T2Ii12<(outs target:$Rt), (ins t2addrmode_imm12:$addr), iii,
                    opc, ".w\t$Rt, $addr",
-                   [(set target:$Rt, (opnode t2addrmode_imm12:$addr))]> {
+                   [(set target:$Rt, (opnode t2addrmode_imm12:$addr))]>,
+            Sched<[WriteLd]> {
     bits<4> Rt;
     bits<17> addr;
     let Inst{31-25} = 0b1111100;
@@ -993,7 +999,8 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
   }
   def i8  : T2Ii8 <(outs target:$Rt), (ins t2addrmode_negimm8:$addr), iii,
                    opc, "\t$Rt, $addr",
-                   [(set target:$Rt, (opnode t2addrmode_negimm8:$addr))]> {
+                   [(set target:$Rt, (opnode t2addrmode_negimm8:$addr))]>,
+            Sched<[WriteLd]> {
     bits<4> Rt;
     bits<13> addr;
     let Inst{31-27} = 0b11111;
@@ -1015,7 +1022,8 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
   }
   def s   : T2Iso <(outs target:$Rt), (ins t2addrmode_so_reg:$addr), iis,
                    opc, ".w\t$Rt, $addr",
-                   [(set target:$Rt, (opnode t2addrmode_so_reg:$addr))]> {
+                   [(set target:$Rt, (opnode t2addrmode_so_reg:$addr))]>,
+            Sched<[WriteLd]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-25} = 0b00;
     let Inst{24} = signed;
@@ -1039,7 +1047,8 @@ multiclass T2I_ld<bit signed, bits<2> opcod, string opc,
   // from the PC.
   def pci : T2Ipc <(outs target:$Rt), (ins t2ldrlabel:$addr), iii,
                    opc, ".w\t$Rt, $addr",
-                   [(set target:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]> {
+                   [(set target:$Rt, (opnode (ARMWrapper tconstpool:$addr)))]>,
+            Sched<[WriteLd]> {
     let isReMaterializable = 1;
     let Inst{31-27} = 0b11111;
     let Inst{26-25} = 0b00;
@@ -1065,7 +1074,8 @@ multiclass T2I_st<bits<2> opcod, string opc,
                   PatFrag opnode> {
   def i12 : T2Ii12<(outs), (ins target:$Rt, t2addrmode_imm12:$addr), iii,
                    opc, ".w\t$Rt, $addr",
-                   [(opnode target:$Rt, t2addrmode_imm12:$addr)]> {
+                   [(opnode target:$Rt, t2addrmode_imm12:$addr)]>,
+            Sched<[WriteST]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0001;
     let Inst{22-21} = opcod;
@@ -1082,7 +1092,8 @@ multiclass T2I_st<bits<2> opcod, string opc,
   }
   def i8  : T2Ii8 <(outs), (ins target:$Rt, t2addrmode_negimm8:$addr), iii,
                    opc, "\t$Rt, $addr",
-                   [(opnode target:$Rt, t2addrmode_negimm8:$addr)]> {
+                   [(opnode target:$Rt, t2addrmode_negimm8:$addr)]>,
+            Sched<[WriteST]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0000;
     let Inst{22-21} = opcod;
@@ -1102,7 +1113,8 @@ multiclass T2I_st<bits<2> opcod, string opc,
   }
   def s   : T2Iso <(outs), (ins target:$Rt, t2addrmode_so_reg:$addr), iis,
                    opc, ".w\t$Rt, $addr",
-                   [(opnode target:$Rt, t2addrmode_so_reg:$addr)]> {
+                   [(opnode target:$Rt, t2addrmode_so_reg:$addr)]>,
+            Sched<[WriteST]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0000;
     let Inst{22-21} = opcod;
@@ -1121,28 +1133,10 @@ multiclass T2I_st<bits<2> opcod, string opc,
 
 /// T2I_ext_rrot - A unary operation with two forms: one whose operand is a
 /// register and one whose operand is a register rotated by 8/16/24.
-class T2I_ext_rrot<bits<3> opcod, string opc, PatFrag opnode>
-  : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
-             opc, ".w\t$Rd, $Rm$rot",
-             [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
-             Requires<[IsThumb2]> {
-   let Inst{31-27} = 0b11111;
-   let Inst{26-23} = 0b0100;
-   let Inst{22-20} = opcod;
-   let Inst{19-16} = 0b1111; // Rn
-   let Inst{15-12} = 0b1111;
-   let Inst{7} = 1;
-
-   bits<2> rot;
-   let Inst{5-4} = rot{1-0}; // rotate
-}
-
-// UXTB16 - Requres T2ExtractPack, does not need the .w qualifier.
-class T2I_ext_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode>
-  : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot),
-             IIC_iEXTr, opc, "\t$Rd, $Rm$rot",
-            [(set rGPR:$Rd, (opnode (rotr rGPR:$Rm, rot_imm:$rot)))]>,
-          Requires<[HasT2ExtractPack, IsThumb2]> {
+class T2I_ext_rrot_base<bits<3> opcod, dag iops, dag oops,
+                        string opc, string oprs,
+                        list<dag> pattern>
+  : T2TwoReg<iops, oops, IIC_iEXTr, opc, oprs, pattern> {
   bits<2> rot;
   let Inst{31-27} = 0b11111;
   let Inst{26-23} = 0b0100;
@@ -1150,46 +1144,34 @@ class T2I_ext_rrot_uxtb16<bits<3> opcod, string opc, PatFrag opnode>
   let Inst{19-16} = 0b1111; // Rn
   let Inst{15-12} = 0b1111;
   let Inst{7} = 1;
-  let Inst{5-4} = rot;
-}
-
-// SXTB16 - Requres T2ExtractPack, does not need the .w qualifier, no pattern
-// supported yet.
-class T2I_ext_rrot_sxtb16<bits<3> opcod, string opc>
-  : T2TwoReg<(outs rGPR:$Rd), (ins rGPR:$Rm, rot_imm:$rot), IIC_iEXTr,
-             opc, "\t$Rd, $Rm$rot", []>,
-          Requires<[IsThumb2, HasT2ExtractPack]> {
-  bits<2> rot;
-  let Inst{31-27} = 0b11111;
-  let Inst{26-23} = 0b0100;
-  let Inst{22-20} = opcod;
-  let Inst{19-16} = 0b1111; // Rn
-  let Inst{15-12} = 0b1111;
-  let Inst{7} = 1;
-  let Inst{5-4} = rot;
-}
+  let Inst{5-4} = rot; // rotate
+}
+
+class T2I_ext_rrot<bits<3> opcod, string opc>
+  : T2I_ext_rrot_base<opcod,
+                      (outs rGPR:$Rd),
+                      (ins rGPR:$Rm, rot_imm:$rot),
+                      opc, ".w\t$Rd, $Rm$rot", []>,
+                      Requires<[IsThumb2]>,
+                      Sched<[WriteALU, ReadALU]>;
+
+// UXTB16, SXTB16 - Requires HasDSP, does not need the .w qualifier.
+class T2I_ext_rrot_xtb16<bits<3> opcod, string opc>
+  : T2I_ext_rrot_base<opcod,
+                      (outs rGPR:$Rd),
+                      (ins rGPR:$Rm, rot_imm:$rot),
+                      opc, "\t$Rd, $Rm$rot", []>,
+                      Requires<[HasDSP, IsThumb2]>,
+                      Sched<[WriteALU, ReadALU]>;
 
 /// T2I_exta_rrot - A binary operation with two forms: one whose operand is a
 /// register and one whose operand is a register rotated by 8/16/24.
-class T2I_exta_rrot<bits<3> opcod, string opc, PatFrag opnode>
+class T2I_exta_rrot<bits<3> opcod, string opc>
   : T2ThreeReg<(outs rGPR:$Rd),
                (ins rGPR:$Rn, rGPR:$Rm, rot_imm:$rot),
-               IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot",
-             [(set rGPR:$Rd, (opnode rGPR:$Rn, (rotr rGPR:$Rm,rot_imm:$rot)))]>,
-           Requires<[HasT2ExtractPack, IsThumb2]> {
-  bits<2> rot;
-  let Inst{31-27} = 0b11111;
-  let Inst{26-23} = 0b0100;
-  let Inst{22-20} = opcod;
-  let Inst{15-12} = 0b1111;
-  let Inst{7} = 1;
-  let Inst{5-4} = rot;
-}
-
-class T2I_exta_rrot_np<bits<3> opcod, string opc>
-  : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm,rot_imm:$rot),
                IIC_iEXTAsr, opc, "\t$Rd, $Rn, $Rm$rot", []>,
-               Requires<[HasT2ExtractPack, IsThumb2]> {
+               Requires<[HasDSP, IsThumb2]>,
+               Sched<[WriteALU, ReadALU]> {
   bits<2> rot;
   let Inst{31-27} = 0b11111;
   let Inst{26-23} = 0b0100;
@@ -1279,7 +1261,8 @@ let mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1 in {
 // Load doubleword
 def t2LDRDi8  : T2Ii8s4<1, 0, 1, (outs rGPR:$Rt, rGPR:$Rt2),
                         (ins t2addrmode_imm8s4:$addr),
-                        IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>;
+                        IIC_iLoad_d_i, "ldrd", "\t$Rt, $Rt2, $addr", "", []>,
+                 Sched<[WriteLd]>;
 } // mayLoad = 1, hasSideEffects = 0, hasExtraDefRegAllocReq = 1
 
 // zextload i1 -> zextload i8
@@ -1333,17 +1316,20 @@ let mayLoad = 1, hasSideEffects = 0 in {
 def t2LDR_PRE  : T2Ipreldst<0, 0b10, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
                             (ins t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_iu,
-                            "ldr", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+                            "ldr", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>,
+                 Sched<[WriteLd]>;
 
 def t2LDR_POST : T2Ipostldst<0, 0b10, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                           (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
                           AddrModeT2_i8, IndexModePost, IIC_iLoad_iu,
-                          "ldr", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+                          "ldr", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>,
+                  Sched<[WriteLd]>;
 
 def t2LDRB_PRE : T2Ipreldst<0, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
                             (ins t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
-                            "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+                            "ldrb", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>,
+                 Sched<[WriteLd]>;
 
 def t2LDRB_POST : T2Ipostldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                           (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
@@ -1353,41 +1339,45 @@ def t2LDRB_POST : T2Ipostldst<0, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
 def t2LDRH_PRE : T2Ipreldst<0, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
                             (ins t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
-                            "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>;
+                            "ldrh", "\t$Rt, $addr!", "$addr.base = $Rn_wb", []>,
+                Sched<[WriteLd]>;
 
 def t2LDRH_POST : T2Ipostldst<0, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                           (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
                           AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
-                          "ldrh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+                          "ldrh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>,
+                  Sched<[WriteLd]>;
 
 def t2LDRSB_PRE : T2Ipreldst<1, 0b00, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
                             (ins t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
                             "ldrsb", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
-                            []>;
+                            []>, Sched<[WriteLd]>;
 
 def t2LDRSB_POST : T2Ipostldst<1, 0b00, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                           (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
                           AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
-                          "ldrsb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+                          "ldrsb", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>,
+                   Sched<[WriteLd]>;
 
 def t2LDRSH_PRE : T2Ipreldst<1, 0b01, 1, 1, (outs GPR:$Rt, GPR:$Rn_wb),
                             (ins t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iLoad_bh_iu,
                             "ldrsh", "\t$Rt, $addr!", "$addr.base = $Rn_wb",
-                            []>;
+                            []>, Sched<[WriteLd]>;
 
 def t2LDRSH_POST : T2Ipostldst<1, 0b01, 1, 0, (outs GPR:$Rt, GPR:$Rn_wb),
                           (ins addr_offset_none:$Rn, t2am_imm8_offset:$offset),
                           AddrModeT2_i8, IndexModePost, IIC_iLoad_bh_iu,
-                          "ldrsh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>;
+                          "ldrsh", "\t$Rt, $Rn$offset", "$Rn = $Rn_wb", []>,
+                  Sched<[WriteLd]>;
 } // mayLoad = 1, hasSideEffects = 0
 
 // LDRT, LDRBT, LDRHT, LDRSBT, LDRSHT all have offset mode (PUW=0b110).
 // Ref: A8.6.57 LDR (immediate, Thumb) Encoding T4
 class T2IldT<bit signed, bits<2> type, string opc, InstrItinClass ii>
   : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_posimm8:$addr), ii, opc,
-          "\t$Rt, $addr", []> {
+          "\t$Rt, $addr", []>, Sched<[WriteLd]> {
   bits<4> Rt;
   bits<13> addr;
   let Inst{31-27} = 0b11111;
@@ -1431,11 +1421,14 @@ class T2Ildacq<bits<4> bits23_20, bits<2> bit54, dag oops, dag iops,
 }
 
 def t2LDA : T2Ildacq<0b1101, 0b10, (outs rGPR:$Rt),
-                     (ins addr_offset_none:$addr), "lda", "\t$Rt, $addr", []>;
+                     (ins addr_offset_none:$addr), "lda", "\t$Rt, $addr", []>,
+            Sched<[WriteLd]>;
 def t2LDAB : T2Ildacq<0b1101, 0b00, (outs rGPR:$Rt),
-                      (ins addr_offset_none:$addr), "ldab", "\t$Rt, $addr", []>;
+                      (ins addr_offset_none:$addr), "ldab", "\t$Rt, $addr", []>,
+            Sched<[WriteLd]>;
 def t2LDAH : T2Ildacq<0b1101, 0b01, (outs rGPR:$Rt),
-                      (ins addr_offset_none:$addr), "ldah", "\t$Rt, $addr", []>;
+                      (ins addr_offset_none:$addr), "ldah", "\t$Rt, $addr", []>,
+            Sched<[WriteLd]>;
 
 // Store
 defm t2STR :T2I_st<0b10,"str", IIC_iStore_i, IIC_iStore_si, GPR, store>;
@@ -1448,7 +1441,8 @@ defm t2STRH:T2I_st<0b01,"strh", IIC_iStore_bh_i, IIC_iStore_bh_si,
 let mayStore = 1, hasSideEffects = 0, hasExtraSrcRegAllocReq = 1 in
 def t2STRDi8 : T2Ii8s4<1, 0, 0, (outs),
                        (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4:$addr),
-               IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>;
+               IIC_iStore_d_r, "strd", "\t$Rt, $Rt2, $addr", "", []>,
+               Sched<[WriteST]>;
 
 // Indexed stores
 
@@ -1457,19 +1451,22 @@ def t2STR_PRE  : T2Ipreldst<0, 0b10, 0, 1, (outs GPRnopc:$Rn_wb),
                             (ins GPRnopc:$Rt, t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
                             "str", "\t$Rt, $addr!",
-                            "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+                            "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>,
+                 Sched<[WriteST]>;
 
 def t2STRH_PRE  : T2Ipreldst<0, 0b01, 0, 1, (outs GPRnopc:$Rn_wb),
                             (ins rGPR:$Rt, t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iStore_iu,
                         "strh", "\t$Rt, $addr!",
-                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>,
+                  Sched<[WriteST]>;
 
 def t2STRB_PRE  : T2Ipreldst<0, 0b00, 0, 1, (outs GPRnopc:$Rn_wb),
                             (ins rGPR:$Rt, t2addrmode_imm8_pre:$addr),
                             AddrModeT2_i8, IndexModePre, IIC_iStore_bh_iu,
                         "strb", "\t$Rt, $addr!",
-                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>;
+                        "$addr.base = $Rn_wb,@earlyclobber $Rn_wb", []>,
+            Sched<[WriteST]>;
 } // mayStore = 1, hasSideEffects = 0
 
 def t2STR_POST : T2Ipostldst<0, 0b10, 0, 0, (outs GPRnopc:$Rn_wb),
@@ -1480,7 +1477,8 @@ def t2STR_POST : T2Ipostldst<0, 0b10, 0, 0, (outs GPRnopc:$Rn_wb),
                           "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
              [(set GPRnopc:$Rn_wb,
                   (post_store GPRnopc:$Rt, addr_offset_none:$Rn,
-                              t2am_imm8_offset:$offset))]>;
+                              t2am_imm8_offset:$offset))]>,
+            Sched<[WriteST]>;
 
 def t2STRH_POST : T2Ipostldst<0, 0b01, 0, 0, (outs GPRnopc:$Rn_wb),
                             (ins rGPR:$Rt, addr_offset_none:$Rn,
@@ -1490,7 +1488,8 @@ def t2STRH_POST : T2Ipostldst<0, 0b01, 0, 0, (outs GPRnopc:$Rn_wb),
                          "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
        [(set GPRnopc:$Rn_wb,
              (post_truncsti16 rGPR:$Rt, addr_offset_none:$Rn,
-                              t2am_imm8_offset:$offset))]>;
+                              t2am_imm8_offset:$offset))]>,
+            Sched<[WriteST]>;
 
 def t2STRB_POST : T2Ipostldst<0, 0b00, 0, 0, (outs GPRnopc:$Rn_wb),
                             (ins rGPR:$Rt, addr_offset_none:$Rn,
@@ -1500,7 +1499,8 @@ def t2STRB_POST : T2Ipostldst<0, 0b00, 0, 0, (outs GPRnopc:$Rn_wb),
                          "$Rn = $Rn_wb,@earlyclobber $Rn_wb",
         [(set GPRnopc:$Rn_wb,
               (post_truncsti8 rGPR:$Rt, addr_offset_none:$Rn,
-                              t2am_imm8_offset:$offset))]>;
+                              t2am_imm8_offset:$offset))]>,
+            Sched<[WriteST]>;
 
 // Pseudo-instructions for pattern matching the pre-indexed stores. We can't
 // put the patterns on the instruction definitions directly as ISel wants
@@ -1513,17 +1513,20 @@ def t2STR_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
                (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
                4, IIC_iStore_ru,
       [(set GPRnopc:$Rn_wb,
-            (pre_store rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+            (pre_store rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>,
+            Sched<[WriteST]>;
 def t2STRB_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
                (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
                4, IIC_iStore_ru,
       [(set GPRnopc:$Rn_wb,
-            (pre_truncsti8 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+            (pre_truncsti8 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>,
+            Sched<[WriteST]>;
 def t2STRH_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
                (ins rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset, pred:$p),
                4, IIC_iStore_ru,
       [(set GPRnopc:$Rn_wb,
-            (pre_truncsti16 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>;
+            (pre_truncsti16 rGPR:$Rt, GPRnopc:$Rn, t2am_imm8_offset:$offset))]>,
+            Sched<[WriteST]>;
 }
 
 // STRT, STRBT, STRHT all have offset mode (PUW=0b110) and are for disassembly
@@ -1531,7 +1534,7 @@ def t2STRH_preidx: t2PseudoInst<(outs GPRnopc:$Rn_wb),
 // Ref: A8.6.193 STR (immediate, Thumb) Encoding T4
 class T2IstT<bits<2> type, string opc, InstrItinClass ii>
   : T2Ii8<(outs rGPR:$Rt), (ins t2addrmode_imm8:$addr), ii, opc,
-          "\t$Rt, $addr", []> {
+          "\t$Rt, $addr", []>, Sched<[WriteST]> {
   let Inst{31-27} = 0b11111;
   let Inst{26-25} = 0b00;
   let Inst{24} = 0; // not signed
@@ -1557,7 +1560,8 @@ def t2STRHT  : T2IstT<0b01, "strht", IIC_iStore_bh_i>;
 let mayLoad = 1 in
 def t2LDRD_PRE  : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
                  (ins t2addrmode_imm8s4_pre:$addr), IIC_iLoad_d_ru,
-                 "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []> {
+                 "ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []>,
+                 Sched<[WriteLd]> {
   let DecoderMethod = "DecodeT2LDRDPreInstruction";
 }
 
@@ -1565,13 +1569,13 @@ let mayLoad = 1 in
 def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
                  (ins addr_offset_none:$addr, t2am_imm8s4_offset:$imm),
                  IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr$imm",
-                 "$addr.base = $wb", []>;
+                 "$addr.base = $wb", []>, Sched<[WriteLd]>;
 
 let mayStore = 1 in
 def t2STRD_PRE  : T2Ii8s4<1, 1, 0, (outs GPR:$wb),
                  (ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4_pre:$addr),
                  IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!",
-                 "$addr.base = $wb", []> {
+                 "$addr.base = $wb", []>, Sched<[WriteST]> {
   let DecoderMethod = "DecodeT2STRDPreInstruction";
 }
 
@@ -1580,12 +1584,13 @@ def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb),
                  (ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr,
                       t2am_imm8s4_offset:$imm),
                  IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr$imm",
-                 "$addr.base = $wb", []>;
+                 "$addr.base = $wb", []>, Sched<[WriteST]>;
 
 class T2Istrrel<bits<2> bit54, dag oops, dag iops,
                 string opc, string asm, list<dag> pattern>
   : Thumb2I<oops, iops, AddrModeNone, 4, NoItinerary, opc,
-            asm, "", pattern>, Requires<[IsThumb, HasAcquireRelease]> {
+            asm, "", pattern>, Requires<[IsThumb, HasAcquireRelease]>,
+    Sched<[WriteST]> {
   bits<4> Rt;
   bits<4> addr;
 
@@ -1861,7 +1866,7 @@ defm t2STM : thumb2_st_mult<"stm", IIC_iStore_m, IIC_iStore_mu, 0>;
 //
 
 let hasSideEffects = 0 in
-def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPR:$Rm), IIC_iMOVr,
+def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPRnopc:$Rm), IIC_iMOVr,
                    "mov", ".w\t$Rd, $Rm", []>, Sched<[WriteALU]> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
@@ -1870,11 +1875,11 @@ def t2MOVr : T2sTwoReg<(outs GPRnopc:$Rd), (ins GPR:$Rm), IIC_iMOVr,
   let Inst{14-12} = 0b000;
   let Inst{7-4} = 0b0000;
 }
-def : t2InstAlias<"mov${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+def : t2InstAlias<"mov${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm,
                                                 pred:$p, zero_reg)>;
-def : t2InstAlias<"movs${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+def : t2InstAlias<"movs${p}.w $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm,
                                                  pred:$p, CPSR)>;
-def : t2InstAlias<"movs${p} $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPR:$Rm,
+def : t2InstAlias<"movs${p} $Rd, $Rm", (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm,
                                                pred:$p, CPSR)>;
 
 // AddedComplexity to ensure isel tries t2MOVi before t2MOVi16.
@@ -1926,10 +1931,11 @@ def t2MOVi16 : T2I<(outs rGPR:$Rd), (ins imm0_65535_expr:$imm), IIC_iMOVi,
 
 def : InstAlias<"mov${p} $Rd, $imm",
                 (t2MOVi16 rGPR:$Rd, imm256_65535_expr:$imm, pred:$p), 0>,
-                Requires<[IsThumb, HasV8MBaseline]>;
+                Requires<[IsThumb, HasV8MBaseline]>, Sched<[WriteALU]>;
 
 def t2MOVi16_ga_pcrel : PseudoInst<(outs rGPR:$Rd),
-                                (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>;
+                                (ins i32imm:$addr, pclabel:$id), IIC_iMOVi, []>,
+                        Sched<[WriteALU]>;
 
 let Constraints = "$src = $Rd" in {
 def t2MOVTi16 : T2I<(outs rGPR:$Rd),
@@ -1969,31 +1975,39 @@ def : T2Pat<(or rGPR:$src, 0xffff0000), (t2MOVTi16 rGPR:$src, 0xffff)>;
 
 // Sign extenders
 
-def t2SXTB  : T2I_ext_rrot<0b100, "sxtb",
-                              UnOpFrag<(sext_inreg node:$Src, i8)>>;
-def t2SXTH  : T2I_ext_rrot<0b000, "sxth",
-                              UnOpFrag<(sext_inreg node:$Src, i16)>>;
-def t2SXTB16 : T2I_ext_rrot_sxtb16<0b010, "sxtb16">;
+def t2SXTB  : T2I_ext_rrot<0b100, "sxtb">;
+def t2SXTH  : T2I_ext_rrot<0b000, "sxth">;
+def t2SXTB16 : T2I_ext_rrot_xtb16<0b010, "sxtb16">;
+
+def t2SXTAB : T2I_exta_rrot<0b100, "sxtab">;
+def t2SXTAH : T2I_exta_rrot<0b000, "sxtah">;
+def t2SXTAB16 : T2I_exta_rrot<0b010, "sxtab16">;
+
+def : T2Pat<(sext_inreg (rotr rGPR:$Rn, rot_imm:$rot), i8),
+            (t2SXTB rGPR:$Rn, rot_imm:$rot)>;
+def : T2Pat<(sext_inreg (rotr rGPR:$Rn, rot_imm:$rot), i16),
+            (t2SXTH rGPR:$Rn, rot_imm:$rot)>;
+def : Thumb2DSPPat<(add rGPR:$Rn,
+                            (sext_inreg (rotr rGPR:$Rm, rot_imm:$rot), i8)),
+            (t2SXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2DSPPat<(add rGPR:$Rn,
+                            (sext_inreg (rotr rGPR:$Rm, rot_imm:$rot), i16)),
+            (t2SXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
 
-def t2SXTAB : T2I_exta_rrot<0b100, "sxtab",
-                        BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS, i8))>>;
-def t2SXTAH : T2I_exta_rrot<0b000, "sxtah",
-                        BinOpFrag<(add node:$LHS, (sext_inreg node:$RHS,i16))>>;
-def t2SXTAB16 : T2I_exta_rrot_np<0b010, "sxtab16">;
 
 // A simple right-shift can also be used in most cases (the exception is the
 // SXTH operations with a rotate of 24: there the non-contiguous bits are
 // relevant).
-def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+def : Thumb2DSPPat<(add rGPR:$Rn, (sext_inreg
                                         (srl rGPR:$Rm, rot_imm:$rot), i8)),
                        (t2SXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
-def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+def : Thumb2DSPPat<(add rGPR:$Rn, (sext_inreg
                                         (srl rGPR:$Rm, imm8_or_16:$rot), i16)),
                        (t2SXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
-def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+def : Thumb2DSPPat<(add rGPR:$Rn, (sext_inreg
                                         (rotr rGPR:$Rm, (i32 24)), i16)),
                        (t2SXTAH rGPR:$Rn, rGPR:$Rm, (i32 3))>;
-def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
+def : Thumb2DSPPat<(add rGPR:$Rn, (sext_inreg
                                         (or (srl rGPR:$Rm, (i32 24)),
                                               (shl rGPR:$Rm, (i32 8))), i16)),
                        (t2SXTAH rGPR:$Rn, rGPR:$Rm, (i32 3))>;
@@ -2001,12 +2015,16 @@ def : Thumb2ExtractPat<(add rGPR:$Rn, (sext_inreg
 // Zero extenders
 
 let AddedComplexity = 16 in {
-def t2UXTB   : T2I_ext_rrot<0b101, "uxtb",
-                               UnOpFrag<(and node:$Src, 0x000000FF)>>;
-def t2UXTH   : T2I_ext_rrot<0b001, "uxth",
-                               UnOpFrag<(and node:$Src, 0x0000FFFF)>>;
-def t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16",
-                                   UnOpFrag<(and node:$Src, 0x00FF00FF)>>;
+def t2UXTB   : T2I_ext_rrot<0b101, "uxtb">;
+def t2UXTH   : T2I_ext_rrot<0b001, "uxth">;
+def t2UXTB16 : T2I_ext_rrot_xtb16<0b011, "uxtb16">;
+
+def : Thumb2DSPPat<(and (rotr rGPR:$Rm, rot_imm:$rot), 0x000000FF),
+                       (t2UXTB rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2DSPPat<(and (rotr rGPR:$Rm, rot_imm:$rot), 0x0000FFFF),
+                       (t2UXTH rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2DSPPat<(and (rotr rGPR:$Rm, rot_imm:$rot), 0x00FF00FF),
+                       (t2UXTB16 rGPR:$Rm, rot_imm:$rot)>;
 
 // FIXME: This pattern incorrectly assumes the shl operator is a rotate.
 //        The transformation should probably be done as a combiner action
@@ -2014,21 +2032,25 @@ def t2UXTB16 : T2I_ext_rrot_uxtb16<0b011, "uxtb16",
 //        eight bits of the source into the lower eight bits of the result.
 //def : T2Pat<(and (shl rGPR:$Src, (i32 8)), 0xFF00FF),
 //            (t2UXTB16 rGPR:$Src, 3)>,
-//          Requires<[HasT2ExtractPack, IsThumb2]>;
+//          Requires<[HasDSP, IsThumb2]>;
 def : T2Pat<(and (srl rGPR:$Src, (i32 8)), 0xFF00FF),
             (t2UXTB16 rGPR:$Src, 1)>,
-        Requires<[HasT2ExtractPack, IsThumb2]>;
+        Requires<[HasDSP, IsThumb2]>;
 
-def t2UXTAB : T2I_exta_rrot<0b101, "uxtab",
-                           BinOpFrag<(add node:$LHS, (and node:$RHS, 0x00FF))>>;
-def t2UXTAH : T2I_exta_rrot<0b001, "uxtah",
-                           BinOpFrag<(add node:$LHS, (and node:$RHS, 0xFFFF))>>;
-def t2UXTAB16 : T2I_exta_rrot_np<0b011, "uxtab16">;
+def t2UXTAB : T2I_exta_rrot<0b101, "uxtab">;
+def t2UXTAH : T2I_exta_rrot<0b001, "uxtah">;
+def t2UXTAB16 : T2I_exta_rrot<0b011, "uxtab16">;
 
-def : Thumb2ExtractPat<(add rGPR:$Rn, (and (srl rGPR:$Rm, rot_imm:$rot),
+def : Thumb2DSPPat<(add rGPR:$Rn, (and (rotr rGPR:$Rm, rot_imm:$rot),
+                                            0x00FF)),
+                       (t2UXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2DSPPat<(add rGPR:$Rn, (and (rotr rGPR:$Rm, rot_imm:$rot),
+                                            0xFFFF)),
+                       (t2UXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
+def : Thumb2DSPPat<(add rGPR:$Rn, (and (srl rGPR:$Rm, rot_imm:$rot),
                                            0xFF)),
                        (t2UXTAB rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
-def : Thumb2ExtractPat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot),
+def : Thumb2DSPPat<(add rGPR:$Rn, (and (srl rGPR:$Rm, imm8_or_16:$rot),
                                             0xFFFF)),
                        (t2UXTAH rGPR:$Rn, rGPR:$Rm, rot_imm:$rot)>;
 }
@@ -2060,6 +2082,19 @@ defm t2ADC  : T2I_adde_sube_irs<0b1010, "adc", ARMadde, 1>;
 defm t2SBC  : T2I_adde_sube_irs<0b1011, "sbc", ARMsube>;
 }
 
+def : t2InstSubst<"adc${s}${p} $rd, $rn, $imm",
+                 (t2SBCri rGPR:$rd, rGPR:$rn, t2_so_imm_not:$imm, pred:$p, s_cc_out:$s)>;
+def : t2InstSubst<"sbc${s}${p} $rd, $rn, $imm",
+                 (t2ADCri rGPR:$rd, rGPR:$rn, t2_so_imm_not:$imm, pred:$p, s_cc_out:$s)>;
+
+def : t2InstSubst<"add${s}${p}.w $rd, $rn, $imm",
+                 (t2SUBri GPRnopc:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>;
+def : t2InstSubst<"addw${p} $rd, $rn, $imm", // 12-bit immediate form: use the negated-imm12 operand
+                 (t2SUBri12 GPRnopc:$rd, GPR:$rn, imm0_4095_neg:$imm, pred:$p)>;
+def : t2InstSubst<"sub${s}${p}.w $rd, $rn, $imm",
+                 (t2ADDri GPRnopc:$rd, GPRnopc:$rn, t2_so_imm_neg:$imm, pred:$p, s_cc_out:$s)>;
+def : t2InstSubst<"subw${p} $rd, $rn, $imm", // 12-bit immediate form: use the negated-imm12 operand
+                 (t2ADDri12 GPRnopc:$rd, GPR:$rn, imm0_4095_neg:$imm, pred:$p)>;
 // RSB
 defm t2RSB  : T2I_rbin_irs  <0b1110, "rsb", sub>;
 
@@ -2230,70 +2265,52 @@ def t2USADA8  : T2FourReg_mac<0, 0b111, 0b0000, (outs rGPR:$Rd),
           Requires<[IsThumb2, HasDSP]>;
 
 // Signed/Unsigned saturate.
-class T2SatI<dag oops, dag iops, InstrItinClass itin,
-           string opc, string asm, list<dag> pattern>
-  : T2I<oops, iops, itin, opc, asm, pattern> {
+class T2SatI<dag iops, string opc, string asm>
+  : T2I<(outs rGPR:$Rd), iops, NoItinerary, opc, asm, []> {
   bits<4> Rd;
   bits<4> Rn;
   bits<5> sat_imm;
-  bits<7> sh;
+  bits<6> sh;
 
-  let Inst{11-8}  = Rd;
+  let Inst{31-24} = 0b11110011;
+  let Inst{21} = sh{5};
+  let Inst{20} = 0;
   let Inst{19-16} = Rn;
-  let Inst{4-0}   = sat_imm;
-  let Inst{21}    = sh{5};
+  let Inst{15} = 0;
   let Inst{14-12} = sh{4-2};
-  let Inst{7-6}   = sh{1-0};
+  let Inst{11-8}  = Rd;
+  let Inst{7-6} = sh{1-0};
+  let Inst{5} = 0;
+  let Inst{4-0}   = sat_imm;
 }
 
-def t2SSAT: T2SatI<
-              (outs rGPR:$Rd),
-              (ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
-              NoItinerary, "ssat", "\t$Rd, $sat_imm, $Rn$sh", []>,
-              Requires<[IsThumb2]> {
-  let Inst{31-27} = 0b11110;
-  let Inst{25-22} = 0b1100;
-  let Inst{20} = 0;
-  let Inst{15} = 0;
+def t2SSAT: T2SatI<(ins imm1_32:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
+                   "ssat", "\t$Rd, $sat_imm, $Rn$sh">,
+                   Requires<[IsThumb2]> {
+  let Inst{23-22} = 0b00;
   let Inst{5}  = 0;
 }
 
-def t2SSAT16: T2SatI<
-                (outs rGPR:$Rd), (ins imm1_16:$sat_imm, rGPR:$Rn), NoItinerary,
-                "ssat16", "\t$Rd, $sat_imm, $Rn", []>,
-                Requires<[IsThumb2, HasDSP]> {
-  let Inst{31-27} = 0b11110;
-  let Inst{25-22} = 0b1100;
-  let Inst{20} = 0;
-  let Inst{15} = 0;
-  let Inst{21} = 1;        // sh = '1'
-  let Inst{14-12} = 0b000; // imm3 = '000'
-  let Inst{7-6} = 0b00;    // imm2 = '00'
-  let Inst{5-4} = 0b00;
+def t2SSAT16: T2SatI<(ins imm1_16:$sat_imm, rGPR:$Rn),
+                     "ssat16", "\t$Rd, $sat_imm, $Rn">,
+                     Requires<[IsThumb2, HasDSP]> {
+  let Inst{23-22} = 0b00;
+  let sh = 0b100000;
+  let Inst{4} = 0;
 }
 
-def t2USAT: T2SatI<
-               (outs rGPR:$Rd),
-               (ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
-                NoItinerary, "usat", "\t$Rd, $sat_imm, $Rn$sh", []>,
-                Requires<[IsThumb2]> {
-  let Inst{31-27} = 0b11110;
-  let Inst{25-22} = 0b1110;
-  let Inst{20} = 0;
-  let Inst{15} = 0;
+def t2USAT: T2SatI<(ins imm0_31:$sat_imm, rGPR:$Rn, t2_shift_imm:$sh),
+                    "usat", "\t$Rd, $sat_imm, $Rn$sh">,
+                    Requires<[IsThumb2]> {
+  let Inst{23-22} = 0b10;
 }
 
-def t2USAT16: T2SatI<(outs rGPR:$Rd), (ins imm0_15:$sat_imm, rGPR:$Rn),
-                     NoItinerary,
-                     "usat16", "\t$Rd, $sat_imm, $Rn", []>,
+def t2USAT16: T2SatI<(ins imm0_15:$sat_imm, rGPR:$Rn),
+                     "usat16", "\t$Rd, $sat_imm, $Rn">,
                      Requires<[IsThumb2, HasDSP]> {
-  let Inst{31-22} = 0b1111001110;
-  let Inst{20} = 0;
-  let Inst{15} = 0;
-  let Inst{21} = 1;        // sh = '1'
-  let Inst{14-12} = 0b000; // imm3 = '000'
-  let Inst{7-6} = 0b00;    // imm2 = '00'
-  let Inst{5-4} = 0b00;
+  let Inst{23-22} = 0b10;
+  let sh = 0b100000;
+  let Inst{4} = 0;
 }
 
 def : T2Pat<(int_arm_ssat GPR:$a, imm1_32:$pos), (t2SSAT imm1_32:$pos, GPR:$a, 0)>;
@@ -2305,11 +2322,18 @@ def : T2Pat<(ARMssatnoshift GPRnopc:$Rn, imm0_31:$imm),
 //  Shift and rotate Instructions.
 //
 
-defm t2LSL  : T2I_sh_ir<0b00, "lsl", imm0_31, shl>;
+defm t2LSL  : T2I_sh_ir<0b00, "lsl", imm1_31, shl>;
 defm t2LSR  : T2I_sh_ir<0b01, "lsr", imm_sr,  srl>;
 defm t2ASR  : T2I_sh_ir<0b10, "asr", imm_sr,  sra>;
 defm t2ROR  : T2I_sh_ir<0b11, "ror", imm0_31, rotr>;
 
+// LSL #0 is actually MOV, and permits a slightly different set of registers
+// than LSL with a non-zero shift.
+def : t2InstAlias<"lsl${s}${p} $Rd, $Rm, #0",
+                  (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm, pred:$p, cc_out:$s)>;
+def : t2InstAlias<"lsl${s}${p}.w $Rd, $Rm, #0",
+                  (t2MOVr GPRnopc:$Rd, GPRnopc:$Rm, pred:$p, cc_out:$s)>;
+
 // (rotr x, (and y, 0x...1f)) ==> (ROR x, y)
 def : T2Pat<(rotr rGPR:$lhs, (and rGPR:$rhs, lo5AllOne)),
             (t2RORrr rGPR:$lhs, rGPR:$rhs)>;
@@ -2547,7 +2571,8 @@ def : T2Pat<(t2_so_imm_not:$src),
 let isCommutable = 1 in
 def t2MUL: T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
                 "mul", "\t$Rd, $Rn, $Rm",
-                [(set rGPR:$Rd, (mul rGPR:$Rn, rGPR:$Rm))]> {
+                [(set rGPR:$Rd, (mul rGPR:$Rn, rGPR:$Rm))]>,
+           Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
   let Inst{31-27} = 0b11111;
   let Inst{26-23} = 0b0110;
   let Inst{22-20} = 0b000;
@@ -2558,7 +2583,8 @@ def t2MUL: T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
 class T2FourRegMLA<bits<4> op7_4, string opc, list<dag> pattern>
   : T2FourReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
                opc, "\t$Rd, $Rn, $Rm, $Ra", pattern>,
-               Requires<[IsThumb2, UseMulOps]> {
+               Requires<[IsThumb2, UseMulOps]>,
+    Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]>  {
   let Inst{31-27} = 0b11111;
   let Inst{26-23} = 0b0110;
   let Inst{22-20} = 0b000;
@@ -2575,8 +2601,12 @@ def t2MLS: T2FourRegMLA<0b0001, "mls",
 // Extra precision multiplies with low / high results
 let hasSideEffects = 0 in {
 let isCommutable = 1 in {
-def t2SMULL : T2MulLong<0b000, 0b0000, "smull", []>;
-def t2UMULL : T2MulLong<0b010, 0b0000, "umull", []>;
+def t2SMULL : T2MulLong<0b000, 0b0000, "smull",
+                        [(set rGPR:$RdLo, rGPR:$RdHi,
+                              (smullohi rGPR:$Rn, rGPR:$Rm))]>;
+def t2UMULL : T2MulLong<0b010, 0b0000, "umull",
+                        [(set rGPR:$RdLo, rGPR:$RdHi,
+                              (umullohi rGPR:$Rn, rGPR:$Rm))]>;
 } // isCommutable
 
 // Multiply + accumulate
@@ -2592,7 +2622,8 @@ class T2SMMUL<bits<4> op7_4, string opc, list<dag> pattern>
   : T2ThreeReg<(outs rGPR:$Rd),
                (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL32,
                opc, "\t$Rd, $Rn, $Rm", pattern>,
-               Requires<[IsThumb2, HasDSP]> {
+               Requires<[IsThumb2, HasDSP]>,
+    Sched<[WriteMUL32, ReadMUL, ReadMUL]> {
   let Inst{31-27} = 0b11111;
   let Inst{26-23} = 0b0110;
   let Inst{22-20} = 0b101;
@@ -2607,7 +2638,8 @@ class T2FourRegSMMLA<bits<3> op22_20, bits<4> op7_4, string opc,
                      list<dag> pattern>
   : T2FourReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMAC32,
               opc, "\t$Rd, $Rn, $Rm, $Ra", pattern>,
-              Requires<[IsThumb2, HasDSP, UseMulOps]> {
+              Requires<[IsThumb2, HasDSP, UseMulOps]>,
+    Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]> {
   let Inst{31-27} = 0b11111;
   let Inst{26-23} = 0b0110;
   let Inst{22-20} = op22_20;
@@ -2624,7 +2656,8 @@ class T2ThreeRegSMUL<bits<3> op22_20, bits<2> op5_4, string opc,
                      list<dag> pattern>
   : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iMUL16, opc,
                "\t$Rd, $Rn, $Rm", pattern>,
-    Requires<[IsThumb2, HasDSP]> {
+    Requires<[IsThumb2, HasDSP]>,
+    Sched<[WriteMUL16, ReadMUL, ReadMUL]> {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0110;
     let Inst{22-20} = op22_20;
@@ -2645,8 +2678,10 @@ def t2SMULTB : T2ThreeRegSMUL<0b001, 0b10, "smultb",
 def t2SMULTT : T2ThreeRegSMUL<0b001, 0b11, "smultt",
              [(set rGPR:$Rd, (mul (sra rGPR:$Rn, (i32 16)),
                                    (sra rGPR:$Rm, (i32 16))))]>;
-def t2SMULWB : T2ThreeRegSMUL<0b011, 0b00, "smulwb", []>;
-def t2SMULWT : T2ThreeRegSMUL<0b011, 0b01, "smulwt", []>;
+def t2SMULWB : T2ThreeRegSMUL<0b011, 0b00, "smulwb",
+             [(set rGPR:$Rd, (ARMsmulwb rGPR:$Rn, rGPR:$Rm))]>;
+def t2SMULWT : T2ThreeRegSMUL<0b011, 0b01, "smulwt",
+             [(set rGPR:$Rd, (ARMsmulwt rGPR:$Rn, rGPR:$Rm))]>;
 
 def : Thumb2DSPPat<(mul sext_16_node:$Rm, sext_16_node:$Rn),
                    (t2SMULBB rGPR:$Rm, rGPR:$Rn)>;
@@ -2659,7 +2694,8 @@ class T2FourRegSMLA<bits<3> op22_20, bits<2> op5_4, string opc,
                     list<dag> pattern>
   : T2FourReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm, rGPR:$Ra), IIC_iMUL16,
                opc, "\t$Rd, $Rn, $Rm, $Ra", pattern>,
-    Requires<[IsThumb2, HasDSP, UseMulOps]> {
+    Requires<[IsThumb2, HasDSP, UseMulOps]>,
+    Sched<[WriteMAC16, ReadMUL, ReadMUL, ReadMAC]>  {
     let Inst{31-27} = 0b11111;
     let Inst{26-23} = 0b0110;
     let Inst{22-20} = op22_20;
@@ -2680,8 +2716,10 @@ def t2SMLATB : T2FourRegSMLA<0b001, 0b10, "smlatb",
 def t2SMLATT : T2FourRegSMLA<0b001, 0b11, "smlatt",
              [(set rGPR:$Rd, (add rGPR:$Ra, (mul (sra rGPR:$Rn, (i32 16)),
                                                  (sra rGPR:$Rm, (i32 16)))))]>;
-def t2SMLAWB : T2FourRegSMLA<0b011, 0b00, "smlawb", []>;
-def t2SMLAWT : T2FourRegSMLA<0b011, 0b01, "smlawt", []>;
+def t2SMLAWB : T2FourRegSMLA<0b011, 0b00, "smlawb",
+             [(set rGPR:$Rd, (add rGPR:$Ra, (ARMsmulwb rGPR:$Rn, rGPR:$Rm)))]>;
+def t2SMLAWT : T2FourRegSMLA<0b011, 0b01, "smlawt",
+             [(set rGPR:$Rd, (add rGPR:$Ra, (ARMsmulwt rGPR:$Rn, rGPR:$Rm)))]>;
 
 def : Thumb2DSPMulPat<(add rGPR:$Ra, (mul sext_16_node:$Rn, sext_16_node:$Rm)),
                       (t2SMLABB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>;
@@ -2692,25 +2730,32 @@ def : Thumb2DSPMulPat<(add rGPR:$Ra,
                         (mul (sra rGPR:$Rn, (i32 16)), sext_16_node:$Rm)),
                       (t2SMLATB rGPR:$Rn, rGPR:$Rm, rGPR:$Ra)>;
 
-class T2SMLAL<bits<3> op22_20, bits<4> op7_4, string opc, list<dag> pattern>
-  : T2FourReg_mac<1, op22_20, op7_4,
-                  (outs rGPR:$Ra, rGPR:$Rd),
-                  (ins rGPR:$Rn, rGPR:$Rm),
-                  IIC_iMAC64, opc, "\t$Ra, $Rd, $Rn, $Rm", []>,
-                  Requires<[IsThumb2, HasDSP]>;
-
 // Halfword multiple accumulate long: SMLAL<x><y>
-def t2SMLALBB : T2SMLAL<0b100, 0b1000, "smlalbb", []>;
-def t2SMLALBT : T2SMLAL<0b100, 0b1001, "smlalbt", []>;
-def t2SMLALTB : T2SMLAL<0b100, 0b1010, "smlaltb", []>;
-def t2SMLALTT : T2SMLAL<0b100, 0b1011, "smlaltt", []>;
+def t2SMLALBB : T2MlaLong<0b100, 0b1000, "smlalbb">,
+                          Requires<[IsThumb2, HasDSP]>;
+def t2SMLALBT : T2MlaLong<0b100, 0b1001, "smlalbt">,
+                          Requires<[IsThumb2, HasDSP]>;
+def t2SMLALTB : T2MlaLong<0b100, 0b1010, "smlaltb">,
+                          Requires<[IsThumb2, HasDSP]>;
+def t2SMLALTT : T2MlaLong<0b100, 0b1011, "smlaltt">,
+                          Requires<[IsThumb2, HasDSP]>;
+
+def : Thumb2DSPPat<(ARMsmlalbb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+                   (t2SMLALBB $Rn, $Rm, $RLo, $RHi)>;
+def : Thumb2DSPPat<(ARMsmlalbt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+                   (t2SMLALBT $Rn, $Rm, $RLo, $RHi)>;
+def : Thumb2DSPPat<(ARMsmlaltb GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+                   (t2SMLALTB $Rn, $Rm, $RLo, $RHi)>;
+def : Thumb2DSPPat<(ARMsmlaltt GPR:$Rn, GPR:$Rm, GPR:$RLo, GPR:$RHi),
+                   (t2SMLALTT $Rn, $Rm, $RLo, $RHi)>;
 
 class T2DualHalfMul<bits<3> op22_20, bits<4> op7_4, string opc>
   : T2ThreeReg_mac<0, op22_20, op7_4,
                    (outs rGPR:$Rd),
                    (ins rGPR:$Rn, rGPR:$Rm),
                    IIC_iMAC32, opc, "\t$Rd, $Rn, $Rm", []>,
-                   Requires<[IsThumb2, HasDSP]> {
+                   Requires<[IsThumb2, HasDSP]>,
+   Sched<[WriteMAC32, ReadMUL, ReadMUL, ReadMAC]> {
   let Inst{15-12} = 0b1111;
 }
 
@@ -2737,7 +2782,8 @@ class T2DualHalfMulAddLong<bits<3> op22_20, bits<4> op7_4, string opc>
                   (outs rGPR:$Ra, rGPR:$Rd),
                   (ins rGPR:$Rn, rGPR:$Rm),
                   IIC_iMAC64, opc, "\t$Ra, $Rd, $Rn, $Rm", []>,
-                  Requires<[IsThumb2, HasDSP]>;
+                  Requires<[IsThumb2, HasDSP]>,
+    Sched<[WriteMAC64Lo, WriteMAC64Hi, ReadMUL, ReadMUL, ReadMAC, ReadMAC]>;
 
 def t2SMLALD  : T2DualHalfMulAddLong<0b100, 0b1100, "smlald">;
 def t2SMLALDX : T2DualHalfMulAddLong<0b100, 0b1101, "smlaldx">;
@@ -2751,7 +2797,8 @@ def t2SMLSLDX : T2DualHalfMulAddLong<0b101, 0b1101, "smlsldx">;
 def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
                  "sdiv", "\t$Rd, $Rn, $Rm",
                  [(set rGPR:$Rd, (sdiv rGPR:$Rn, rGPR:$Rm))]>,
-                 Requires<[HasDivide, IsThumb, HasV8MBaseline]> {
+                 Requires<[HasDivide, IsThumb, HasV8MBaseline]>,
+             Sched<[WriteDIV]> {
   let Inst{31-27} = 0b11111;
   let Inst{26-21} = 0b011100;
   let Inst{20} = 0b1;
@@ -2762,7 +2809,8 @@ def t2SDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
 def t2UDIV : T2ThreeReg<(outs rGPR:$Rd), (ins rGPR:$Rn, rGPR:$Rm), IIC_iDIV,
                  "udiv", "\t$Rd, $Rn, $Rm",
                  [(set rGPR:$Rd, (udiv rGPR:$Rn, rGPR:$Rm))]>,
-                 Requires<[HasDivide, IsThumb, HasV8MBaseline]> {
+                 Requires<[HasDivide, IsThumb, HasV8MBaseline]>,
+             Sched<[WriteDIV]> {
   let Inst{31-27} = 0b11111;
   let Inst{26-21} = 0b011101;
   let Inst{20} = 0b1;
@@ -2819,7 +2867,7 @@ def t2PKHBT : T2ThreeReg<
                   [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF),
                                       (and (shl rGPR:$Rm, pkh_lsl_amt:$sh),
                                            0xFFFF0000)))]>,
-                  Requires<[HasT2ExtractPack, IsThumb2]>,
+                  Requires<[HasDSP, IsThumb2]>,
                   Sched<[WriteALUsi, ReadALU]> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
@@ -2835,10 +2883,10 @@ def t2PKHBT : T2ThreeReg<
 // Alternate cases for PKHBT where identities eliminate some nodes.
 def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (and rGPR:$src2, 0xFFFF0000)),
             (t2PKHBT rGPR:$src1, rGPR:$src2, 0)>,
-            Requires<[HasT2ExtractPack, IsThumb2]>;
+            Requires<[HasDSP, IsThumb2]>;
 def : T2Pat<(or (and rGPR:$src1, 0xFFFF), (shl rGPR:$src2, imm16_31:$sh)),
             (t2PKHBT rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
-            Requires<[HasT2ExtractPack, IsThumb2]>;
+            Requires<[HasDSP, IsThumb2]>;
 
 // Note: Shifts of 1-15 bits will be transformed to srl instead of sra and
 // will match the pattern below.
@@ -2848,7 +2896,7 @@ def t2PKHTB : T2ThreeReg<
                   [(set rGPR:$Rd, (or (and rGPR:$Rn, 0xFFFF0000),
                                        (and (sra rGPR:$Rm, pkh_asr_amt:$sh),
                                             0xFFFF)))]>,
-                  Requires<[HasT2ExtractPack, IsThumb2]>,
+                  Requires<[HasDSP, IsThumb2]>,
                   Sched<[WriteALUsi, ReadALU]> {
   let Inst{31-27} = 0b11101;
   let Inst{26-25} = 0b01;
@@ -2867,14 +2915,14 @@ def t2PKHTB : T2ThreeReg<
 // pkhtb src1, src2, asr (17..31).
 def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (srl rGPR:$src2, imm16:$sh)),
             (t2PKHTB rGPR:$src1, rGPR:$src2, imm16:$sh)>,
-            Requires<[HasT2ExtractPack, IsThumb2]>;
+            Requires<[HasDSP, IsThumb2]>;
 def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000), (sra rGPR:$src2, imm16_31:$sh)),
             (t2PKHTB rGPR:$src1, rGPR:$src2, imm16_31:$sh)>,
-            Requires<[HasT2ExtractPack, IsThumb2]>;
+            Requires<[HasDSP, IsThumb2]>;
 def : T2Pat<(or (and rGPR:$src1, 0xFFFF0000),
                 (and (srl rGPR:$src2, imm1_15:$sh), 0xFFFF)),
             (t2PKHTB rGPR:$src1, rGPR:$src2, imm1_15:$sh)>,
-            Requires<[HasT2ExtractPack, IsThumb2]>;
+            Requires<[HasDSP, IsThumb2]>;
 
 //===----------------------------------------------------------------------===//
 // CRC32 Instructions
@@ -4216,13 +4264,13 @@ def : T2Pat<(and rGPR:$Rm, 0x000000FF), (t2UXTB rGPR:$Rm, 0)>,
 def : T2Pat<(and rGPR:$Rm, 0x0000FFFF), (t2UXTH rGPR:$Rm, 0)>,
            Requires<[IsThumb2]>;
 def : T2Pat<(and rGPR:$Rm, 0x00FF00FF), (t2UXTB16 rGPR:$Rm, 0)>,
-           Requires<[HasT2ExtractPack, IsThumb2]>;
+           Requires<[HasDSP, IsThumb2]>;
 def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0x00FF)),
             (t2UXTAB rGPR:$Rn, rGPR:$Rm, 0)>,
-           Requires<[HasT2ExtractPack, IsThumb2]>;
+           Requires<[HasDSP, IsThumb2]>;
 def : T2Pat<(add rGPR:$Rn, (and rGPR:$Rm, 0xFFFF)),
             (t2UXTAH rGPR:$Rn, rGPR:$Rm, 0)>,
-           Requires<[HasT2ExtractPack, IsThumb2]>;
+           Requires<[HasDSP, IsThumb2]>;
 }
 
 def : T2Pat<(sext_inreg rGPR:$Src, i8),  (t2SXTB rGPR:$Src, 0)>,
@@ -4231,10 +4279,10 @@ def : T2Pat<(sext_inreg rGPR:$Src, i16), (t2SXTH rGPR:$Src, 0)>,
            Requires<[IsThumb2]>;
 def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i8)),
             (t2SXTAB rGPR:$Rn, rGPR:$Rm, 0)>,
-           Requires<[HasT2ExtractPack, IsThumb2]>;
+           Requires<[HasDSP, IsThumb2]>;
 def : T2Pat<(add rGPR:$Rn, (sext_inreg rGPR:$Rm, i16)),
             (t2SXTAH rGPR:$Rn, rGPR:$Rm, 0)>,
-           Requires<[HasT2ExtractPack, IsThumb2]>;
+           Requires<[HasDSP, IsThumb2]>;
 
 // Atomic load/store patterns
 def : T2Pat<(atomic_load_8   t2addrmode_imm12:$addr),
@@ -4325,26 +4373,26 @@ def : t2InstAlias<"add${s}${p} $Rdn, $ShiftedRm",
                            pred:$p, cc_out:$s)>;
 
 // add w/ negative immediates is just a sub.
-def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+def : t2InstSubst<"add${s}${p} $Rd, $Rn, $imm",
         (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p,
                  cc_out:$s)>;
-def : t2InstAlias<"add${p} $Rd, $Rn, $imm",
+def : t2InstSubst<"add${p} $Rd, $Rn, $imm",
            (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
-def : t2InstAlias<"add${s}${p} $Rdn, $imm",
+def : t2InstSubst<"add${s}${p} $Rdn, $imm",
       (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p,
                cc_out:$s)>;
-def : t2InstAlias<"add${p} $Rdn, $imm",
+def : t2InstSubst<"add${p} $Rdn, $imm",
            (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
 
-def : t2InstAlias<"add${s}${p}.w $Rd, $Rn, $imm",
+def : t2InstSubst<"add${s}${p}.w $Rd, $Rn, $imm",
         (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm, pred:$p,
                  cc_out:$s)>;
-def : t2InstAlias<"addw${p} $Rd, $Rn, $imm",
+def : t2InstSubst<"addw${p} $Rd, $Rn, $imm",
            (t2SUBri12 GPRnopc:$Rd, GPR:$Rn, imm0_4095_neg:$imm, pred:$p)>;
-def : t2InstAlias<"add${s}${p}.w $Rdn, $imm",
+def : t2InstSubst<"add${s}${p}.w $Rdn, $imm",
       (t2SUBri GPRnopc:$Rdn, GPRnopc:$Rdn, t2_so_imm_neg:$imm, pred:$p,
                cc_out:$s)>;
-def : t2InstAlias<"addw${p} $Rdn, $imm",
+def : t2InstSubst<"addw${p} $Rdn, $imm",
            (t2SUBri12 GPRnopc:$Rdn, GPRnopc:$Rdn, imm0_4095_neg:$imm, pred:$p)>;
 
 
@@ -4431,10 +4479,10 @@ def : t2InstAlias<"mvn${s}${p} $Rd, $ShiftedRm",
 // input operands swapped when the shift amount is zero (i.e., unspecified).
 def : InstAlias<"pkhbt${p} $Rd, $Rn, $Rm",
                 (t2PKHBT rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
-            Requires<[HasT2ExtractPack, IsThumb2]>;
+            Requires<[HasDSP, IsThumb2]>;
 def : InstAlias<"pkhtb${p} $Rd, $Rn, $Rm",
                 (t2PKHBT rGPR:$Rd, rGPR:$Rm, rGPR:$Rn, 0, pred:$p), 0>,
-            Requires<[HasT2ExtractPack, IsThumb2]>;
+            Requires<[HasDSP, IsThumb2]>;
 
 // PUSH/POP aliases for STM/LDM
 def : t2InstAlias<"push${p}.w $regs", (t2STMDB_UPD SP, pred:$p, reglist:$regs)>;
@@ -4513,16 +4561,16 @@ def : t2InstAlias<"strh${p} $Rt, $addr",
 // Extend instruction optional rotate operand.
 def : InstAlias<"sxtab${p} $Rd, $Rn, $Rm",
               (t2SXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
-              Requires<[HasT2ExtractPack, IsThumb2]>;
+              Requires<[HasDSP, IsThumb2]>;
 def : InstAlias<"sxtah${p} $Rd, $Rn, $Rm",
               (t2SXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
-              Requires<[HasT2ExtractPack, IsThumb2]>;
+              Requires<[HasDSP, IsThumb2]>;
 def : InstAlias<"sxtab16${p} $Rd, $Rn, $Rm",
               (t2SXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
-              Requires<[HasT2ExtractPack, IsThumb2]>;
+              Requires<[HasDSP, IsThumb2]>;
 def : InstAlias<"sxtb16${p} $Rd, $Rm",
               (t2SXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p), 0>,
-              Requires<[HasT2ExtractPack, IsThumb2]>;
+              Requires<[HasDSP, IsThumb2]>;
 
 def : t2InstAlias<"sxtb${p} $Rd, $Rm",
                 (t2SXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
@@ -4535,16 +4583,16 @@ def : t2InstAlias<"sxth${p}.w $Rd, $Rm",
 
 def : InstAlias<"uxtab${p} $Rd, $Rn, $Rm",
               (t2UXTAB rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
-              Requires<[HasT2ExtractPack, IsThumb2]>;
+              Requires<[HasDSP, IsThumb2]>;
 def : InstAlias<"uxtah${p} $Rd, $Rn, $Rm",
               (t2UXTAH rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
-              Requires<[HasT2ExtractPack, IsThumb2]>;
+              Requires<[HasDSP, IsThumb2]>;
 def : InstAlias<"uxtab16${p} $Rd, $Rn, $Rm",
               (t2UXTAB16 rGPR:$Rd, rGPR:$Rn, rGPR:$Rm, 0, pred:$p), 0>,
-              Requires<[HasT2ExtractPack, IsThumb2]>;
+              Requires<[HasDSP, IsThumb2]>;
 def : InstAlias<"uxtb16${p} $Rd, $Rm",
               (t2UXTB16 rGPR:$Rd, rGPR:$Rm, 0, pred:$p), 0>,
-              Requires<[HasT2ExtractPack, IsThumb2]>;
+              Requires<[HasDSP, IsThumb2]>;
 
 def : t2InstAlias<"uxtb${p} $Rd, $Rm",
                 (t2UXTB rGPR:$Rd, rGPR:$Rm, 0, pred:$p)>;
@@ -4560,7 +4608,7 @@ def : t2InstAlias<"uxtb${p} $Rd, $Rm$rot",
                   (t2UXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
 def : InstAlias<"uxtb16${p} $Rd, $Rm$rot",
                 (t2UXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p), 0>,
-                Requires<[HasT2ExtractPack, IsThumb2]>;
+                Requires<[HasDSP, IsThumb2]>;
 def : t2InstAlias<"uxth${p} $Rd, $Rm$rot",
                   (t2UXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
 
@@ -4568,41 +4616,41 @@ def : t2InstAlias<"sxtb${p} $Rd, $Rm$rot",
                   (t2SXTB rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
 def : InstAlias<"sxtb16${p} $Rd, $Rm$rot",
                 (t2SXTB16 rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p), 0>,
-                Requires<[HasT2ExtractPack, IsThumb2]>;
+                Requires<[HasDSP, IsThumb2]>;
 def : t2InstAlias<"sxth${p} $Rd, $Rm$rot",
                   (t2SXTH rGPR:$Rd, rGPR:$Rm, rot_imm:$rot, pred:$p)>;
 
 
 // "mov Rd, t2_so_imm_not" can be handled via "mvn" in assembly, just like
 // for isel.
-def : t2InstAlias<"mov${p} $Rd, $imm",
+def : t2InstSubst<"mov${p} $Rd, $imm",
                   (t2MVNi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>;
-def : t2InstAlias<"mvn${p} $Rd, $imm",
-                  (t2MOVi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>;
+def : t2InstSubst<"mvn${s}${p} $Rd, $imm",
+                  (t2MOVi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, s_cc_out:$s)>;
 // Same for AND <--> BIC
-def : t2InstAlias<"bic${s}${p} $Rd, $Rn, $imm",
+def : t2InstSubst<"bic${s}${p} $Rd, $Rn, $imm",
                   (t2ANDri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
                            pred:$p, cc_out:$s)>;
-def : t2InstAlias<"bic${s}${p} $Rdn, $imm",
+def : t2InstSubst<"bic${s}${p} $Rdn, $imm",
                   (t2ANDri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
                            pred:$p, cc_out:$s)>;
-def : t2InstAlias<"and${s}${p} $Rd, $Rn, $imm",
+def : t2InstSubst<"and${s}${p} $Rd, $Rn, $imm",
                   (t2BICri rGPR:$Rd, rGPR:$Rn, t2_so_imm_not:$imm,
                            pred:$p, cc_out:$s)>;
-def : t2InstAlias<"and${s}${p} $Rdn, $imm",
+def : t2InstSubst<"and${s}${p} $Rdn, $imm",
                   (t2BICri rGPR:$Rdn, rGPR:$Rdn, t2_so_imm_not:$imm,
                            pred:$p, cc_out:$s)>;
 // Likewise, "add Rd, t2_so_imm_neg" -> sub
-def : t2InstAlias<"add${s}${p} $Rd, $Rn, $imm",
+def : t2InstSubst<"add${s}${p} $Rd, $Rn, $imm",
                   (t2SUBri GPRnopc:$Rd, GPRnopc:$Rn, t2_so_imm_neg:$imm,
                            pred:$p, cc_out:$s)>;
-def : t2InstAlias<"add${s}${p} $Rd, $imm",
+def : t2InstSubst<"add${s}${p} $Rd, $imm",
                   (t2SUBri GPRnopc:$Rd, GPRnopc:$Rd, t2_so_imm_neg:$imm,
                            pred:$p, cc_out:$s)>;
 // Same for CMP <--> CMN via t2_so_imm_neg
-def : t2InstAlias<"cmp${p} $Rd, $imm",
+def : t2InstSubst<"cmp${p} $Rd, $imm",
                   (t2CMNri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>;
-def : t2InstAlias<"cmn${p} $Rd, $imm",
+def : t2InstSubst<"cmn${p} $Rd, $imm",
                   (t2CMPri rGPR:$Rd, t2_so_imm_neg:$imm, pred:$p)>;
 
 
@@ -4616,6 +4664,8 @@ def : t2InstAlias<"neg${s}${p} $Rd, $Rm",
 
 // MOV so_reg assembler pseudos. InstAlias isn't expressive enough for
 // these, unfortunately.
+// FIXME: LSL #0 in the shift should allow SP to be used as either the
+// source or destination (but not both).
 def t2MOVsi: t2AsmPseudo<"mov${p} $Rd, $shift",
                          (ins rGPR:$Rd, t2_so_reg:$shift, pred:$p)>;
 def t2MOVSsi: t2AsmPseudo<"movs${p} $Rd, $shift",
diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td
index 4142e8db574d0478209f7d926560df32c4360f1c..0f225156d4cac4b4d51b50a0bfd4bc2010a4a7b5 100644
--- a/lib/Target/ARM/ARMInstrVFP.td
+++ b/lib/Target/ARM/ARMInstrVFP.td
@@ -11,14 +11,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-def SDT_CMPFP0  : SDTypeProfile<0, 1, [SDTCisFP<0>]>;
+def SDT_CMPFP0  : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisVT<1, i32>]>;
 def SDT_VMOVDRR : SDTypeProfile<1, 2, [SDTCisVT<0, f64>, SDTCisVT<1, i32>,
                                        SDTCisSameAs<1, 2>]>;
+def SDT_VMOVRRD : SDTypeProfile<2, 1, [SDTCisVT<0, i32>, SDTCisSameAs<0, 1>,
+                                       SDTCisVT<2, f64>]>;
 
 def arm_fmstat : SDNode<"ARMISD::FMSTAT",  SDTNone, [SDNPInGlue, SDNPOutGlue]>;
-def arm_cmpfp  : SDNode<"ARMISD::CMPFP",   SDT_ARMCmp, [SDNPOutGlue]>;
+def arm_cmpfp  : SDNode<"ARMISD::CMPFP",   SDT_ARMFCmp, [SDNPOutGlue]>;
 def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>;
 def arm_fmdrr  : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>;
+def arm_fmrrd  : SDNode<"ARMISD::VMOVRRD", SDT_VMOVRRD>;
 
 //===----------------------------------------------------------------------===//
 // Operand Definitions.
@@ -516,12 +519,12 @@ let Defs = [FPSCR_NZCV] in {
 def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0,
                   (outs), (ins DPR:$Dd, DPR:$Dm),
                   IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm",
-                  [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm))]>;
+                  [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm), (i32 1))]>;
 
 def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
                   (outs), (ins SPR:$Sd, SPR:$Sm),
                   IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm",
-                  [(arm_cmpfp SPR:$Sd, SPR:$Sm)]> {
+                  [(arm_cmpfp SPR:$Sd, SPR:$Sm, (i32 1))]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
   let D = VFPNeonA8Domain;
@@ -532,17 +535,15 @@ def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0,
                   IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm",
                   []>;
 
-
-// FIXME: Verify encoding after integrated assembler is working.
 def VCMPD  : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
                   (outs), (ins DPR:$Dd, DPR:$Dm),
                   IIC_fpCMP64, "vcmp", ".f64\t$Dd, $Dm",
-                  [/* For disassembly only; pattern left blank */]>;
+                  [(arm_cmpfp DPR:$Dd, (f64 DPR:$Dm), (i32 0))]>;
 
 def VCMPS  : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
                   (outs), (ins SPR:$Sd, SPR:$Sm),
                   IIC_fpCMP32, "vcmp", ".f32\t$Sd, $Sm",
-                  [/* For disassembly only; pattern left blank */]> {
+                  [(arm_cmpfp SPR:$Sd, SPR:$Sm, (i32 0))]> {
   // Some single precision VFP instructions may be executed on both NEON and
   // VFP pipelines on A8.
   let D = VFPNeonA8Domain;
@@ -581,7 +582,7 @@ let Defs = [FPSCR_NZCV] in {
 def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
                    (outs), (ins DPR:$Dd),
                    IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0",
-                   [(arm_cmpfp0 (f64 DPR:$Dd))]> {
+                   [(arm_cmpfp0 (f64 DPR:$Dd), (i32 1))]> {
   let Inst{3-0} = 0b0000;
   let Inst{5}   = 0;
 }
@@ -589,7 +590,7 @@ def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
 def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
                    (outs), (ins SPR:$Sd),
                    IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0",
-                   [(arm_cmpfp0 SPR:$Sd)]> {
+                   [(arm_cmpfp0 SPR:$Sd, (i32 1))]> {
   let Inst{3-0} = 0b0000;
   let Inst{5}   = 0;
 
@@ -606,11 +607,10 @@ def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0,
   let Inst{5}   = 0;
 }
 
-// FIXME: Verify encoding after integrated assembler is working.
 def VCMPZD  : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
                    (outs), (ins DPR:$Dd),
                    IIC_fpCMP64, "vcmp", ".f64\t$Dd, #0",
-                   [/* For disassembly only; pattern left blank */]> {
+                   [(arm_cmpfp0 (f64 DPR:$Dd), (i32 0))]> {
   let Inst{3-0} = 0b0000;
   let Inst{5}   = 0;
 }
@@ -618,7 +618,7 @@ def VCMPZD  : ADuI<0b11101, 0b11, 0b0101, 0b01, 0,
 def VCMPZS  : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
                    (outs), (ins SPR:$Sd),
                    IIC_fpCMP32, "vcmp", ".f32\t$Sd, #0",
-                   [/* For disassembly only; pattern left blank */]> {
+                   [(arm_cmpfp0 SPR:$Sd, (i32 0))]> {
   let Inst{3-0} = 0b0000;
   let Inst{5}   = 0;
 
@@ -680,7 +680,6 @@ def VCVTSD  : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
 
 // Between half, single and double-precision.  For disassembly only.
 
-// FIXME: Verify encoding after integrated assembler is working.
 def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
                  /* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm",
                  [/* For disassembly only; pattern left blank */]>,
@@ -1058,7 +1057,7 @@ let hasSideEffects = 0 in {
 def VMOVRRD  : AVConv3I<0b11000101, 0b1011,
                         (outs GPR:$Rt, GPR:$Rt2), (ins DPR:$Dm),
                         IIC_fpMOVDI, "vmov", "\t$Rt, $Rt2, $Dm",
-                 [/* FIXME: Can't write pattern for multiple result instr*/]>,
+                 [(set GPR:$Rt, GPR:$Rt2, (arm_fmrrd DPR:$Dm))]>,
                Sched<[WriteFPMOV]> {
   // Instruction operands.
   bits<5> Dm;
@@ -1513,7 +1512,6 @@ def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
 
 // And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
 let Uses = [FPSCR] in {
-// FIXME: Verify encoding after integrated assembler is working.
 def VTOSIRD : AVConv1IsD_Encode<0b11101, 0b11, 0b1101, 0b1011,
                                 (outs SPR:$Sd), (ins DPR:$Dm),
                                 IIC_fpCVTDI, "vcvtr", ".s32.f64\t$Sd, $Dm",
diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp
index 538060d4334b250751fcc98c11d8d690a136aeea..8d224d6a70fa8ecb0a8c2940d4ad8c8e929eb8a6 100644
--- a/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -54,9 +54,21 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
            DstSize <= SrcSize)) &&
          "Copy with different width?!");
 
-  assert(RegBank->getID() == ARM::GPRRegBankID && "Unsupported reg bank");
+  assert((RegBank->getID() == ARM::GPRRegBankID ||
+          RegBank->getID() == ARM::FPRRegBankID) &&
+         "Unsupported reg bank");
+
   const TargetRegisterClass *RC = &ARM::GPRRegClass;
 
+  if (RegBank->getID() == ARM::FPRRegBankID) {
+    if (DstSize == 32)
+      RC = &ARM::SPRRegClass;
+    else if (DstSize == 64)
+      RC = &ARM::DPRRegClass;
+    else
+      llvm_unreachable("Unsupported destination size");
+  }
+
   // No need to constrain SrcReg. It will get constrained when
   // we hit another of its uses or its defs.
   // Copies do not have constraints.
@@ -68,13 +80,101 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
   return true;
 }
 
+static bool selectFAdd(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII,
+                       MachineRegisterInfo &MRI) {
+  assert(TII.getSubtarget().hasVFP2() && "Can't select fp add without vfp");
+
+  LLT Ty = MRI.getType(MIB->getOperand(0).getReg());
+  unsigned ValSize = Ty.getSizeInBits();
+
+  if (ValSize == 32) {
+    if (TII.getSubtarget().useNEONForSinglePrecisionFP())
+      return false;
+    MIB->setDesc(TII.get(ARM::VADDS));
+  } else {
+    assert(ValSize == 64 && "Unsupported size for floating point value");
+    if (TII.getSubtarget().isFPOnlySP())
+      return false;
+    MIB->setDesc(TII.get(ARM::VADDD));
+  }
+  MIB.add(predOps(ARMCC::AL));
+
+  return true;
+}
+
+static bool selectSequence(MachineInstrBuilder &MIB,
+                           const ARMBaseInstrInfo &TII,
+                           MachineRegisterInfo &MRI,
+                           const TargetRegisterInfo &TRI,
+                           const RegisterBankInfo &RBI) {
+  assert(TII.getSubtarget().hasVFP2() && "Can't select sequence without VFP");
+
+  // We only support G_SEQUENCE as a way to stick together two scalar GPRs
+  // into one DPR.
+  unsigned VReg0 = MIB->getOperand(0).getReg();
+  (void)VReg0;
+  assert(MRI.getType(VReg0).getSizeInBits() == 64 &&
+         RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::FPRRegBankID &&
+         "Unsupported operand for G_SEQUENCE");
+  unsigned VReg1 = MIB->getOperand(1).getReg();
+  (void)VReg1;
+  assert(MRI.getType(VReg1).getSizeInBits() == 32 &&
+         RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID &&
+         "Unsupported operand for G_SEQUENCE");
+  unsigned VReg2 = MIB->getOperand(3).getReg();
+  (void)VReg2;
+  assert(MRI.getType(VReg2).getSizeInBits() == 32 &&
+         RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::GPRRegBankID &&
+         "Unsupported operand for G_SEQUENCE");
+
+  // Remove the operands corresponding to the offsets.
+  MIB->RemoveOperand(4);
+  MIB->RemoveOperand(2);
+
+  MIB->setDesc(TII.get(ARM::VMOVDRR));
+  MIB.add(predOps(ARMCC::AL));
+
+  return true;
+}
+
+static bool selectExtract(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII,
+                          MachineRegisterInfo &MRI,
+                          const TargetRegisterInfo &TRI,
+                          const RegisterBankInfo &RBI) {
+  assert(TII.getSubtarget().hasVFP2() && "Can't select extract without VFP");
+
+  // We only support G_EXTRACT as a way to break up one DPR into two GPRs.
+  unsigned VReg0 = MIB->getOperand(0).getReg();
+  (void)VReg0;
+  assert(MRI.getType(VReg0).getSizeInBits() == 32 &&
+         RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::GPRRegBankID &&
+         "Unsupported operand for G_EXTRACT");
+  unsigned VReg1 = MIB->getOperand(1).getReg();
+  (void)VReg1;
+  assert(MRI.getType(VReg1).getSizeInBits() == 64 &&
+         RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::FPRRegBankID &&
+         "Unsupported operand for G_EXTRACT");
+  assert(MIB->getOperand(2).getImm() % 32 == 0 &&
+         "Unsupported operand for G_EXTRACT");
+
+  // Convert the bit offset into the 32-bit lane index expected by VGETLNi32.
+  MIB->getOperand(2).setImm(MIB->getOperand(2).getImm() / 32);
+
+  MIB->setDesc(TII.get(ARM::VGETLNi32));
+  MIB.add(predOps(ARMCC::AL));
+
+  return true;
+}
+
 /// Select the opcode for simple extensions (that translate to a single SXT/UXT
 /// instruction). Extension operations more complicated than that should not
-/// invoke this.
+/// invoke this. Returns the original opcode if it doesn't know how to select a
+/// better one.
 static unsigned selectSimpleExtOpc(unsigned Opc, unsigned Size) {
   using namespace TargetOpcode;
 
-  assert((Size == 8 || Size == 16) && "Unsupported size");
+  if (Size != 8 && Size != 16)
+    return Opc;
 
   if (Opc == G_SEXT)
     return Size == 8 ? ARM::SXTB : ARM::SXTH;
@@ -82,23 +182,42 @@ static unsigned selectSimpleExtOpc(unsigned Opc, unsigned Size) {
   if (Opc == G_ZEXT)
     return Size == 8 ? ARM::UXTB : ARM::UXTH;
 
-  llvm_unreachable("Unsupported opcode");
+  return Opc;
 }
 
-/// Select the opcode for simple loads. For types smaller than 32 bits, the
-/// value will be zero extended.
-static unsigned selectLoadOpCode(unsigned Size) {
-  switch (Size) {
-  case 1:
-  case 8:
-    return ARM::LDRBi12;
-  case 16:
-    return ARM::LDRH;
-  case 32:
-    return ARM::LDRi12;
+/// Select the opcode for simple loads and stores. For types smaller than 32
+/// bits, the value will be zero extended. Returns the original opcode if it
+/// doesn't know how to select a better one.
+static unsigned selectLoadStoreOpCode(unsigned Opc, unsigned RegBank,
+                                      unsigned Size) {
+  bool isStore = Opc == TargetOpcode::G_STORE;
+
+  if (RegBank == ARM::GPRRegBankID) {
+    switch (Size) {
+    case 1:
+    case 8:
+      return isStore ? ARM::STRBi12 : ARM::LDRBi12;
+    case 16:
+      return isStore ? ARM::STRH : ARM::LDRH;
+    case 32:
+      return isStore ? ARM::STRi12 : ARM::LDRi12;
+    default:
+      return Opc;
+    }
   }
 
-  llvm_unreachable("Unsupported size");
+  if (RegBank == ARM::FPRRegBankID) {
+    switch (Size) {
+    case 32:
+      return isStore ? ARM::VSTRS : ARM::VLDRS;
+    case 64:
+      return isStore ? ARM::VSTRD : ARM::VLDRD;
+    default:
+      return Opc;
+    }
+  }
+
+  return Opc;
 }
 
 bool ARMInstructionSelector::select(MachineInstr &I) const {
@@ -163,6 +282,8 @@ bool ARMInstructionSelector::select(MachineInstr &I) const {
     case 8:
     case 16: {
       unsigned NewOpc = selectSimpleExtOpc(I.getOpcode(), SrcSize);
+      if (NewOpc == I.getOpcode())
+        return false;
       I.setDesc(TII.get(NewOpc));
       MIB.addImm(0).add(predOps(ARMCC::AL));
       break;
@@ -174,31 +295,70 @@ bool ARMInstructionSelector::select(MachineInstr &I) const {
     break;
   }
   case G_ADD:
+  case G_GEP:
     I.setDesc(TII.get(ARM::ADDrr));
     MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
     break;
+  case G_FADD:
+    if (!selectFAdd(MIB, TII, MRI))
+      return false;
+    break;
   case G_FRAME_INDEX:
     // Add 0 to the given frame index and hope it will eventually be folded into
     // the user(s).
     I.setDesc(TII.get(ARM::ADDri));
     MIB.addImm(0).add(predOps(ARMCC::AL)).add(condCodeOp());
     break;
+  case G_CONSTANT: {
+    unsigned Reg = I.getOperand(0).getReg();
+    if (MRI.getType(Reg).getSizeInBits() != 32)
+      return false;
+
+    assert(RBI.getRegBank(Reg, MRI, TRI)->getID() == ARM::GPRRegBankID &&
+           "Expected constant to live in a GPR");
+    I.setDesc(TII.get(ARM::MOVi));
+    MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
+    break;
+  }
+  case G_STORE:
   case G_LOAD: {
-    LLT ValTy = MRI.getType(I.getOperand(0).getReg());
+    const auto &MemOp = **I.memoperands_begin();
+    if (MemOp.getOrdering() != AtomicOrdering::NotAtomic) {
+      DEBUG(dbgs() << "Atomic load/store not supported yet\n");
+      return false;
+    }
+
+    unsigned Reg = I.getOperand(0).getReg();
+    unsigned RegBank = RBI.getRegBank(Reg, MRI, TRI)->getID();
+
+    LLT ValTy = MRI.getType(Reg);
     const auto ValSize = ValTy.getSizeInBits();
 
-    if (ValSize != 32 && ValSize != 16 && ValSize != 8 && ValSize != 1)
+    assert((ValSize != 64 || TII.getSubtarget().hasVFP2()) &&
+           "Don't know how to load/store 64-bit value without VFP");
+
+    const auto NewOpc = selectLoadStoreOpCode(I.getOpcode(), RegBank, ValSize);
+    if (NewOpc == G_LOAD || NewOpc == G_STORE)
       return false;
 
-    const auto NewOpc = selectLoadOpCode(ValSize);
     I.setDesc(TII.get(NewOpc));
 
-    if (NewOpc == ARM::LDRH)
+    if (NewOpc == ARM::LDRH || NewOpc == ARM::STRH)
       // LDRH has a funny addressing mode (there's already a FIXME for it).
       MIB.addReg(0);
     MIB.addImm(0).add(predOps(ARMCC::AL));
     break;
   }
+  case G_SEQUENCE: {
+    if (!selectSequence(MIB, TII, MRI, TRI, RBI))
+      return false;
+    break;
+  }
+  case G_EXTRACT: {
+    if (!selectExtract(MIB, TII, MRI, TRI, RBI))
+      return false;
+    break;
+  }
   default:
     return false;
   }
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp
index f12a4f70251783ea77a1fd40907fdb58f4a83927..994bbd673dd87145354cab57de0a65ade1fc59cc 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -12,6 +12,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "ARMLegalizerInfo.h"
+#include "ARMSubtarget.h"
 #include "llvm/CodeGen/ValueTypes.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Type.h"
@@ -23,7 +24,7 @@ using namespace llvm;
 #error "You shouldn't build this"
 #endif
 
-ARMLegalizerInfo::ARMLegalizerInfo() {
+ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) {
   using namespace TargetOpcode;
 
   const LLT p0 = LLT::pointer(0, 32);
@@ -32,12 +33,15 @@ ARMLegalizerInfo::ARMLegalizerInfo() {
   const LLT s8 = LLT::scalar(8);
   const LLT s16 = LLT::scalar(16);
   const LLT s32 = LLT::scalar(32);
+  const LLT s64 = LLT::scalar(64);
 
   setAction({G_FRAME_INDEX, p0}, Legal);
 
-  for (auto Ty : {s1, s8, s16, s32})
-    setAction({G_LOAD, Ty}, Legal);
-  setAction({G_LOAD, 1, p0}, Legal);
+  for (unsigned Op : {G_LOAD, G_STORE}) {
+    for (auto Ty : {s1, s8, s16, s32, p0})
+      setAction({Op, Ty}, Legal);
+    setAction({Op, 1, p0}, Legal);
+  }
 
   for (auto Ty : {s1, s8, s16, s32})
     setAction({G_ADD, Ty}, Legal);
@@ -48,5 +52,25 @@ ARMLegalizerInfo::ARMLegalizerInfo() {
       setAction({Op, 1, Ty}, Legal);
   }
 
+  setAction({G_GEP, p0}, Legal);
+  setAction({G_GEP, 1, s32}, Legal);
+
+  setAction({G_CONSTANT, s32}, Legal);
+
+  if (!ST.useSoftFloat() && ST.hasVFP2()) {
+    setAction({G_FADD, s32}, Legal);
+    setAction({G_FADD, s64}, Legal);
+
+    setAction({G_LOAD, s64}, Legal);
+    setAction({G_STORE, s64}, Legal);
+  } else {
+    for (auto Ty : {s32, s64})
+      setAction({G_FADD, Ty}, Libcall);
+  }
+
+  for (unsigned Op : {G_FREM, G_FPOW})
+    for (auto Ty : {s32, s64})
+      setAction({Op, Ty}, Libcall);
+
   computeTables();
 }
diff --git a/lib/Target/ARM/ARMLegalizerInfo.h b/lib/Target/ARM/ARMLegalizerInfo.h
index ca3eea81271bc1afab8c03b91c97029d6fd5d61b..0b8a608a6bdea077b811bb9c5d13d747bcc026cc 100644
--- a/lib/Target/ARM/ARMLegalizerInfo.h
+++ b/lib/Target/ARM/ARMLegalizerInfo.h
@@ -18,12 +18,12 @@
 
 namespace llvm {
 
-class LLVMContext;
+class ARMSubtarget;
 
 /// This class provides the information for the target register banks.
 class ARMLegalizerInfo : public LegalizerInfo {
 public:
-  ARMLegalizerInfo();
+  ARMLegalizerInfo(const ARMSubtarget &ST);
 };
 } // End llvm namespace.
 #endif
diff --git a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index c87f70397b71292a3a409c91c4e273a5423520a9..72fcf7cd6a4fdd967c46d726065f5aa785670592 100644
--- a/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -609,13 +609,12 @@ MachineInstr *ARMLoadStoreOpt::CreateLoadStoreMulti(
   // Exception: If the base register is in the input reglist, Thumb1 LDM is
   // non-writeback.
   // It's also not possible to merge an STR of the base register in Thumb1.
-  if (isThumb1 && isi32Load(Opcode) && ContainsReg(Regs, Base)) {
+  if (isThumb1 && ContainsReg(Regs, Base)) {
     assert(Base != ARM::SP && "Thumb1 does not allow SP in register list");
-    if (Opcode == ARM::tLDRi) {
+    if (Opcode == ARM::tLDRi)
       Writeback = false;
-    } else if (Opcode == ARM::tSTRi) {
+    else if (Opcode == ARM::tSTRi)
       return nullptr;
-    }
   }
 
   ARM_AM::AMSubMode Mode = ARM_AM::ia;
@@ -1962,6 +1961,7 @@ namespace {
     static char ID;
     ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
 
+    AliasAnalysis *AA;
     const DataLayout *TD;
     const TargetInstrInfo *TII;
     const TargetRegisterInfo *TRI;
@@ -1975,6 +1975,11 @@ namespace {
       return ARM_PREALLOC_LOAD_STORE_OPT_NAME;
     }
 
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<AAResultsWrapperPass>();
+      MachineFunctionPass::getAnalysisUsage(AU);
+    }
+
   private:
     bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
                           unsigned &NewOpc, unsigned &EvenReg,
@@ -2004,6 +2009,7 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
   TRI = STI->getRegisterInfo();
   MRI = &Fn.getRegInfo();
   MF  = &Fn;
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
 
   bool Modified = false;
   for (MachineBasicBlock &MFI : Fn)
@@ -2017,28 +2023,19 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
                                       MachineBasicBlock::iterator E,
                                       SmallPtrSetImpl<MachineInstr*> &MemOps,
                                       SmallSet<unsigned, 4> &MemRegs,
-                                      const TargetRegisterInfo *TRI) {
+                                      const TargetRegisterInfo *TRI,
+                                      AliasAnalysis *AA) {
   // Are there stores / loads / calls between them?
-  // FIXME: This is overly conservative. We should make use of alias information
-  // some day.
   SmallSet<unsigned, 4> AddedRegPressure;
   while (++I != E) {
     if (I->isDebugValue() || MemOps.count(&*I))
       continue;
     if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects())
       return false;
-    if (isLd && I->mayStore())
-      return false;
-    if (!isLd) {
-      if (I->mayLoad())
-        return false;
-      // It's not safe to move the first 'str' down.
-      // str r1, [r0]
-      // strh r5, [r0]
-      // str r4, [r0, #+4]
-      if (I->mayStore())
-        return false;
-    }
+    if (I->mayStore() || (!isLd && I->mayLoad()))
+      for (MachineInstr *MemOp : MemOps)
+        if (I->mayAlias(AA, *MemOp, /*UseTBAA*/ false))
+          return false;
     for (unsigned j = 0, NumOps = I->getNumOperands(); j != NumOps; ++j) {
       MachineOperand &MO = I->getOperand(j);
       if (!MO.isReg())
@@ -2162,33 +2159,40 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
     unsigned LastBytes = 0;
     unsigned NumMove = 0;
     for (int i = Ops.size() - 1; i >= 0; --i) {
+      // Make sure each operation has the same kind.
       MachineInstr *Op = Ops[i];
-      unsigned Loc = MI2LocMap[Op];
-      if (Loc <= FirstLoc) {
-        FirstLoc = Loc;
-        FirstOp = Op;
-      }
-      if (Loc >= LastLoc) {
-        LastLoc = Loc;
-        LastOp = Op;
-      }
-
       unsigned LSMOpcode
         = getLoadStoreMultipleOpcode(Op->getOpcode(), ARM_AM::ia);
       if (LastOpcode && LSMOpcode != LastOpcode)
         break;
 
+      // Check that we have a contiguous set of offsets.
       int Offset = getMemoryOpOffset(*Op);
       unsigned Bytes = getLSMultipleTransferSize(Op);
       if (LastBytes) {
         if (Bytes != LastBytes || Offset != (LastOffset + (int)Bytes))
           break;
       }
+
+      // Don't try to reschedule too many instructions.
+      if (NumMove == 8) // FIXME: Tune this limit.
+        break;
+
+      // Found a mergeable instruction; save information about it.
+      ++NumMove;
       LastOffset = Offset;
       LastBytes = Bytes;
       LastOpcode = LSMOpcode;
-      if (++NumMove == 8) // FIXME: Tune this limit.
-        break;
+
+      unsigned Loc = MI2LocMap[Op];
+      if (Loc <= FirstLoc) {
+        FirstLoc = Loc;
+        FirstOp = Op;
+      }
+      if (Loc >= LastLoc) {
+        LastLoc = Loc;
+        LastOp = Op;
+      }
     }
 
     if (NumMove <= 1)
@@ -2196,7 +2200,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
     else {
       SmallPtrSet<MachineInstr*, 4> MemOps;
       SmallSet<unsigned, 4> MemRegs;
-      for (int i = NumMove-1; i >= 0; --i) {
+      for (size_t i = Ops.size() - NumMove, e = Ops.size(); i != e; ++i) {
         MemOps.insert(Ops[i]);
         MemRegs.insert(Ops[i]->getOperand(0).getReg());
       }
@@ -2206,7 +2210,7 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
       bool DoMove = (LastLoc - FirstLoc) <= NumMove*4; // FIXME: Tune this.
       if (DoMove)
         DoMove = IsSafeAndProfitableToMove(isLd, Base, FirstOp, LastOp,
-                                           MemOps, MemRegs, TRI);
+                                           MemOps, MemRegs, TRI, AA);
       if (!DoMove) {
         for (unsigned i = 0; i != NumMove; ++i)
           Ops.pop_back();
diff --git a/lib/Target/ARM/ARMMCInstLower.cpp b/lib/Target/ARM/ARMMCInstLower.cpp
index bbfd2a62464fc9748abe2777f3a842fd4e7a92e4..0fd98268723ab49c4823b5148ebd3cb35c4ea1d8 100644
--- a/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/lib/Target/ARM/ARMMCInstLower.cpp
@@ -38,8 +38,12 @@ using namespace llvm;
 
 MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
                                       const MCSymbol *Symbol) {
+  MCSymbolRefExpr::VariantKind SymbolVariant = MCSymbolRefExpr::VK_None;
+  if (MO.getTargetFlags() & ARMII::MO_SBREL)
+    SymbolVariant = MCSymbolRefExpr::VK_ARM_SBREL;
+
   const MCExpr *Expr =
-      MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+      MCSymbolRefExpr::create(Symbol, SymbolVariant, OutContext);
   switch (MO.getTargetFlags() & ARMII::MO_OPTION_MASK) {
   default:
     llvm_unreachable("Unknown target flag on symbol operand");
@@ -47,12 +51,12 @@ MCOperand ARMAsmPrinter::GetSymbolRef(const MachineOperand &MO,
     break;
   case ARMII::MO_LO16:
     Expr =
-        MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+        MCSymbolRefExpr::create(Symbol, SymbolVariant, OutContext);
     Expr = ARMMCExpr::createLower16(Expr, OutContext);
     break;
   case ARMII::MO_HI16:
     Expr =
-        MCSymbolRefExpr::create(Symbol, MCSymbolRefExpr::VK_None, OutContext);
+        MCSymbolRefExpr::create(Symbol, SymbolVariant, OutContext);
     Expr = ARMMCExpr::createUpper16(Expr, OutContext);
     break;
   }
diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp
index ee2daa756f09ce2c0b750acf8e7cee0471216c62..08f3da7388684cdebe20e1d285817719d432bad7 100644
--- a/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -13,11 +13,15 @@
 
 #include "ARMRegisterBankInfo.h"
 #include "ARMInstrInfo.h" // For the register classes
+#include "ARMSubtarget.h"
 #include "llvm/CodeGen/GlobalISel/RegisterBank.h"
 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 
+#define GET_TARGET_REGBANK_IMPL
+#include "ARMGenRegisterBank.inc"
+
 using namespace llvm;
 
 #ifndef LLVM_BUILD_GLOBAL_ISEL
@@ -29,46 +33,109 @@ using namespace llvm;
 // into an ARMGenRegisterBankInfo.def (similar to AArch64).
 namespace llvm {
 namespace ARM {
-const uint32_t GPRCoverageData[] = {
-    // Classes 0-31
-    (1u << ARM::GPRRegClassID) | (1u << ARM::GPRwithAPSRRegClassID) |
-        (1u << ARM::GPRnopcRegClassID) | (1u << ARM::rGPRRegClassID) |
-        (1u << ARM::hGPRRegClassID) | (1u << ARM::tGPRRegClassID) |
-        (1u << ARM::GPRnopc_and_hGPRRegClassID) |
-        (1u << ARM::hGPR_and_rGPRRegClassID) | (1u << ARM::tcGPRRegClassID) |
-        (1u << ARM::tGPR_and_tcGPRRegClassID) | (1u << ARM::GPRspRegClassID) |
-        (1u << ARM::hGPR_and_tcGPRRegClassID),
-    // Classes 32-63
-    0,
-    // Classes 64-96
-    0,
-    // FIXME: Some of the entries below this point can be safely removed once
-    // this is tablegenerated. It's only needed because of the hardcoded
-    // register class limit.
-    // Classes 97-128
-    0,
-    // Classes 129-160
-    0,
-    // Classes 161-192
-    0,
-    // Classes 193-224
-    0,
+enum PartialMappingIdx {
+  PMI_GPR,
+  PMI_SPR,
+  PMI_DPR,
+  PMI_Min = PMI_GPR,
+};
+
+RegisterBankInfo::PartialMapping PartMappings[]{
+    // GPR Partial Mapping
+    {0, 32, GPRRegBank},
+    // SPR Partial Mapping
+    {0, 32, FPRRegBank},
+    // DPR Partial Mapping
+    {0, 64, FPRRegBank},
 };
 
-// FIXME: The 200 will be replaced by the number of register classes when this is
-//        tablegenerated.
-RegisterBank GPRRegBank(ARM::GPRRegBankID, "GPRB", 32, ARM::GPRCoverageData, 200);
-RegisterBank *RegBanks[] = {&GPRRegBank};
+#ifndef NDEBUG
+static bool checkPartMapping(const RegisterBankInfo::PartialMapping &PM,
+                             unsigned Start, unsigned Length,
+                             unsigned RegBankID) {
+  return PM.StartIdx == Start && PM.Length == Length &&
+         PM.RegBank->getID() == RegBankID;
+}
+
+static void checkPartialMappings() {
+  assert(
+      checkPartMapping(PartMappings[PMI_GPR - PMI_Min], 0, 32, GPRRegBankID) &&
+      "Wrong mapping for GPR");
+  assert(
+      checkPartMapping(PartMappings[PMI_SPR - PMI_Min], 0, 32, FPRRegBankID) &&
+      "Wrong mapping for SPR");
+  assert(
+      checkPartMapping(PartMappings[PMI_DPR - PMI_Min], 0, 64, FPRRegBankID) &&
+      "Wrong mapping for DPR");
+}
+#endif
 
-RegisterBankInfo::PartialMapping GPRPartialMapping{0, 32, GPRRegBank};
+enum ValueMappingIdx {
+  InvalidIdx = 0,
+  GPR3OpsIdx = 1,
+  SPR3OpsIdx = 4,
+  DPR3OpsIdx = 7,
+};
 
 RegisterBankInfo::ValueMapping ValueMappings[] = {
-    {&GPRPartialMapping, 1}, {&GPRPartialMapping, 1}, {&GPRPartialMapping, 1}};
+    // invalid
+    {nullptr, 0},
+    // 3 ops in GPRs
+    {&PartMappings[PMI_GPR - PMI_Min], 1},
+    {&PartMappings[PMI_GPR - PMI_Min], 1},
+    {&PartMappings[PMI_GPR - PMI_Min], 1},
+    // 3 ops in SPRs
+    {&PartMappings[PMI_SPR - PMI_Min], 1},
+    {&PartMappings[PMI_SPR - PMI_Min], 1},
+    {&PartMappings[PMI_SPR - PMI_Min], 1},
+    // 3 ops in DPRs
+    {&PartMappings[PMI_DPR - PMI_Min], 1},
+    {&PartMappings[PMI_DPR - PMI_Min], 1},
+    {&PartMappings[PMI_DPR - PMI_Min], 1}};
+
+#ifndef NDEBUG
+static bool checkValueMapping(const RegisterBankInfo::ValueMapping &VM,
+                              RegisterBankInfo::PartialMapping *BreakDown) {
+  return VM.NumBreakDowns == 1 && VM.BreakDown == BreakDown;
+}
+
+static void checkValueMappings() {
+  assert(checkValueMapping(ValueMappings[GPR3OpsIdx],
+                           &PartMappings[PMI_GPR - PMI_Min]) &&
+         "Wrong value mapping for 3 GPR ops instruction");
+  assert(checkValueMapping(ValueMappings[GPR3OpsIdx + 1],
+                           &PartMappings[PMI_GPR - PMI_Min]) &&
+         "Wrong value mapping for 3 GPR ops instruction");
+  assert(checkValueMapping(ValueMappings[GPR3OpsIdx + 2],
+                           &PartMappings[PMI_GPR - PMI_Min]) &&
+         "Wrong value mapping for 3 GPR ops instruction");
+
+  assert(checkValueMapping(ValueMappings[SPR3OpsIdx],
+                           &PartMappings[PMI_SPR - PMI_Min]) &&
+         "Wrong value mapping for 3 SPR ops instruction");
+  assert(checkValueMapping(ValueMappings[SPR3OpsIdx + 1],
+                           &PartMappings[PMI_SPR - PMI_Min]) &&
+         "Wrong value mapping for 3 SPR ops instruction");
+  assert(checkValueMapping(ValueMappings[SPR3OpsIdx + 2],
+                           &PartMappings[PMI_SPR - PMI_Min]) &&
+         "Wrong value mapping for 3 SPR ops instruction");
+
+  assert(checkValueMapping(ValueMappings[DPR3OpsIdx],
+                           &PartMappings[PMI_DPR - PMI_Min]) &&
+         "Wrong value mapping for 3 DPR ops instruction");
+  assert(checkValueMapping(ValueMappings[DPR3OpsIdx + 1],
+                           &PartMappings[PMI_DPR - PMI_Min]) &&
+         "Wrong value mapping for 3 DPR ops instruction");
+  assert(checkValueMapping(ValueMappings[DPR3OpsIdx + 2],
+                           &PartMappings[PMI_DPR - PMI_Min]) &&
+         "Wrong value mapping for 3 DPR ops instruction");
+}
+#endif
 } // end namespace arm
 } // end namespace llvm
 
 ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI)
-    : RegisterBankInfo(ARM::RegBanks, ARM::NumRegisterBanks) {
+    : ARMGenRegisterBankInfo() {
   static bool AlreadyInit = false;
   // We have only one set of register banks, whatever the subtarget
   // is. Therefore, the initialization of the RegBanks table should be
@@ -99,6 +166,11 @@ ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI)
   assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPR_and_tcGPRRegClassID)) &&
          "Subclass not added?");
   assert(RBGPR.getSize() == 32 && "GPRs should hold up to 32-bit");
+
+#ifndef NDEBUG
+  ARM::checkPartialMappings();
+  ARM::checkValueMappings();
+#endif
 }
 
 const RegisterBank &ARMRegisterBankInfo::getRegBankFromRegClass(
@@ -108,8 +180,15 @@ const RegisterBank &ARMRegisterBankInfo::getRegBankFromRegClass(
   switch (RC.getID()) {
   case GPRRegClassID:
   case GPRnopcRegClassID:
+  case GPRspRegClassID:
   case tGPR_and_tcGPRRegClassID:
+  case tGPRRegClassID:
     return getRegBank(ARM::GPRRegBankID);
+  case SPR_8RegClassID:
+  case SPRRegClassID:
+  case DPR_8RegClassID:
+  case DPRRegClassID:
+    return getRegBank(ARM::FPRRegBankID);
   default:
     llvm_unreachable("Unsupported register kind");
   }
@@ -131,25 +210,83 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
 
   using namespace TargetOpcode;
 
+  const MachineFunction &MF = *MI.getParent()->getParent();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+
   unsigned NumOperands = MI.getNumOperands();
-  const ValueMapping *OperandsMapping = &ARM::ValueMappings[0];
+  const ValueMapping *OperandsMapping = &ARM::ValueMappings[ARM::GPR3OpsIdx];
 
   switch (Opc) {
   case G_ADD:
-  case G_LOAD:
   case G_SEXT:
   case G_ZEXT:
+  case G_GEP:
     // FIXME: We're abusing the fact that everything lives in a GPR for now; in
     // the real world we would use different mappings.
-    OperandsMapping = &ARM::ValueMappings[0];
+    OperandsMapping = &ARM::ValueMappings[ARM::GPR3OpsIdx];
+    break;
+  case G_LOAD:
+  case G_STORE:
+    OperandsMapping =
+        Ty.getSizeInBits() == 64
+            ? getOperandsMapping({&ARM::ValueMappings[ARM::DPR3OpsIdx],
+                                  &ARM::ValueMappings[ARM::GPR3OpsIdx]})
+            : &ARM::ValueMappings[ARM::GPR3OpsIdx];
+    break;
+  case G_FADD:
+    assert((Ty.getSizeInBits() == 32 || Ty.getSizeInBits() == 64) &&
+           "Unsupported size for G_FADD");
+    OperandsMapping = Ty.getSizeInBits() == 64
+                          ? &ARM::ValueMappings[ARM::DPR3OpsIdx]
+                          : &ARM::ValueMappings[ARM::SPR3OpsIdx];
     break;
+  case G_CONSTANT:
   case G_FRAME_INDEX:
-    OperandsMapping = getOperandsMapping({&ARM::ValueMappings[0], nullptr});
+    OperandsMapping =
+        getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr});
     break;
+  case G_SEQUENCE: {
+    // We only support G_SEQUENCE for creating a double precision floating point
+    // value out of two GPRs.
+    LLT Ty1 = MRI.getType(MI.getOperand(1).getReg());
+    LLT Ty2 = MRI.getType(MI.getOperand(3).getReg());
+    if (Ty.getSizeInBits() != 64 || Ty1.getSizeInBits() != 32 ||
+        Ty2.getSizeInBits() != 32)
+      return InstructionMapping{};
+    OperandsMapping =
+        getOperandsMapping({&ARM::ValueMappings[ARM::DPR3OpsIdx],
+                            &ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr,
+                            &ARM::ValueMappings[ARM::GPR3OpsIdx], nullptr});
+    break;
+  }
+  case G_EXTRACT: {
+    // We only support G_EXTRACT for splitting a double precision floating point
+    // value into two GPRs.
+    LLT Ty1 = MRI.getType(MI.getOperand(1).getReg());
+    if (Ty.getSizeInBits() != 32 || Ty1.getSizeInBits() != 64 ||
+        MI.getOperand(2).getImm() % 32 != 0)
+      return InstructionMapping{};
+    OperandsMapping = getOperandsMapping({&ARM::ValueMappings[ARM::GPR3OpsIdx],
+                                          &ARM::ValueMappings[ARM::DPR3OpsIdx],
+                                          nullptr, nullptr});
+    break;
+  }
   default:
     return InstructionMapping{};
   }
 
+#ifndef NDEBUG
+  for (unsigned i = 0; i < NumOperands; i++) {
+    for (const auto &Mapping : OperandsMapping[i]) {
+      assert(
+          (Mapping.RegBank->getID() != ARM::FPRRegBankID ||
+           MF.getSubtarget<ARMSubtarget>().hasVFP2()) &&
+          "Trying to use floating point register bank on target without vfp");
+    }
+  }
+#endif
+
   return InstructionMapping{DefaultMappingID, /*Cost=*/1, OperandsMapping,
                             NumOperands};
 }
diff --git a/lib/Target/ARM/ARMRegisterBankInfo.h b/lib/Target/ARM/ARMRegisterBankInfo.h
index 773920ee57a7e4d4d8e0f85c3b0f1b1c744a9c93..5222c1e6389f054b1d10dbfd4d1dcf59f805b05a 100644
--- a/lib/Target/ARM/ARMRegisterBankInfo.h
+++ b/lib/Target/ARM/ARMRegisterBankInfo.h
@@ -16,19 +16,20 @@
 
 #include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
 
+#define GET_REGBANK_DECLARATIONS
+#include "ARMGenRegisterBank.inc"
+
 namespace llvm {
 
 class TargetRegisterInfo;
 
-namespace ARM {
-enum {
-  GPRRegBankID = 0, // General purpose registers
-  NumRegisterBanks,
+class ARMGenRegisterBankInfo : public RegisterBankInfo {
+#define GET_TARGET_REGBANK_CLASS
+#include "ARMGenRegisterBank.inc"
 };
-} // end namespace ARM
 
 /// This class provides the information for the target register banks.
-class ARMRegisterBankInfo final : public RegisterBankInfo {
+class ARMRegisterBankInfo final : public ARMGenRegisterBankInfo {
 public:
   ARMRegisterBankInfo(const TargetRegisterInfo &TRI);
 
diff --git a/lib/Target/ARM/ARMRegisterBanks.td b/lib/Target/ARM/ARMRegisterBanks.td
new file mode 100644
index 0000000000000000000000000000000000000000..7cd2d60d36a4b616a49d1512f7ba49eaf8ba40d8
--- /dev/null
+++ b/lib/Target/ARM/ARMRegisterBanks.td
@@ -0,0 +1,14 @@
+//=- ARMRegisterBanks.td - Describe the ARM Banks ------------*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+def GPRRegBank : RegisterBank<"GPRB", [GPR, GPRwithAPSR]>;
+def FPRRegBank : RegisterBank<"FPRB", [SPR, DPR]>;
diff --git a/lib/Target/ARM/ARMSchedule.td b/lib/Target/ARM/ARMSchedule.td
index f8dacd2877919cf2ee286e1aa158ee05f1e6c2a4..87eb4c2b9074d308fba1747d993ebb1ce2704152 100644
--- a/lib/Target/ARM/ARMSchedule.td
+++ b/lib/Target/ARM/ARMSchedule.td
@@ -72,12 +72,27 @@ def WriteCMP : SchedWrite;
 def WriteCMPsi : SchedWrite;
 def WriteCMPsr : SchedWrite;
 
-// Division.
-def WriteDiv : SchedWrite;
+// Multiplies.
+def WriteMUL16   : SchedWrite; // 16-bit multiply.
+def WriteMUL32   : SchedWrite; // 32-bit multiply.
+def WriteMUL64Lo : SchedWrite; // 64-bit result. Low reg.
+def WriteMUL64Hi : SchedWrite; // 64-bit result. High reg.
+def ReadMUL  : SchedRead;
 
-// Loads.
+// Multiply-accumulates.
+def WriteMAC16   : SchedWrite; // 16-bit mac.
+def WriteMAC32   : SchedWrite; // 32-bit mac.
+def WriteMAC64Lo : SchedWrite; // 64-bit mac. Low reg.
+def WriteMAC64Hi : SchedWrite; // 64-bit mac. High reg.
+def ReadMAC : SchedRead;
+
+// Divisions.
+def WriteDIV : SchedWrite;
+
+// Loads/Stores.
 def WriteLd : SchedWrite;
 def WritePreLd : SchedWrite;
+def WriteST : SchedWrite;
 
 // Branches.
 def WriteBr : SchedWrite;
diff --git a/lib/Target/ARM/ARMScheduleA9.td b/lib/Target/ARM/ARMScheduleA9.td
index a683e05f875e504f22b69309c9a23a156d6fc0ab..8fb8a2a3b6d2df6ad971b7a21a76ff3fbf4a2b83 100644
--- a/lib/Target/ARM/ARMScheduleA9.td
+++ b/lib/Target/ARM/ARMScheduleA9.td
@@ -1944,6 +1944,16 @@ def A9WriteMHi : SchedWriteRes<[A9UnitMul]> { let Latency = 5;
 def A9WriteM16   : SchedWriteRes<[A9UnitMul]> { let Latency = 3; }
 def A9WriteM16Hi : SchedWriteRes<[A9UnitMul]> { let Latency = 4;
                                                 let NumMicroOps = 0; }
+def : SchedAlias<WriteMUL16, A9WriteM16>;
+def : SchedAlias<WriteMUL32, A9WriteM>;
+def : SchedAlias<WriteMUL64Lo, A9WriteM>;
+def : SchedAlias<WriteMUL64Hi, A9WriteMHi>;
+def : SchedAlias<WriteMAC16, A9WriteM16>;
+def : SchedAlias<WriteMAC32, A9WriteM>;
+def : SchedAlias<WriteMAC64Lo, A9WriteM>;
+def : SchedAlias<WriteMAC64Hi, A9WriteMHi>;
+def : ReadAdvance<ReadMUL, 0>;
+def : ReadAdvance<ReadMAC, 0>;
 
 // Floating-point
 // Only one FP or AGU instruction may issue per cycle. We model this
@@ -1953,6 +1963,7 @@ def A9WriteFMov   : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 1; }
 def A9WriteFMulS  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 5; }
 def A9WriteFMulD  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 6; }
 def A9WriteFMAS   : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 8; }
+
 def A9WriteFMAD   : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 9; }
 def A9WriteFDivS  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 15; }
 def A9WriteFDivD  : SchedWriteRes<[A9UnitFP, A9UnitAGU]> { let Latency = 25; }
@@ -1992,6 +2003,7 @@ def A9WriteAdr : SchedWriteRes<[A9UnitAGU]> { let NumMicroOps = 0; }
 
 // Load Integer.
 def A9WriteL : SchedWriteRes<[A9UnitLS]> { let Latency = 3; }
+def : SchedAlias<WriteLd, A9WriteL>;
 // Load the upper 32-bits using the same micro-op.
 def A9WriteLHi : SchedWriteRes<[]> { let Latency = 3;
                                      let NumMicroOps = 0; }
@@ -2471,6 +2483,7 @@ def : SchedAlias<WriteALUsr, A9WriteALUsr>;
 def : SchedAlias<WriteALUSsr, A9WriteALUsr>;
 def : SchedAlias<ReadALU, A9ReadALU>;
 def : SchedAlias<ReadALUsr, A9ReadALU>;
+def : SchedAlias<WriteST, A9WriteS>;
 
 // ===---------------------------------------------------------------------===//
 // Floating-point. Map target defined SchedReadWrite to processor specific ones
@@ -2545,7 +2558,7 @@ def : InstRW<[A9WriteLb],
       "LDRH", "LDRSH", "LDRSB")>;
 def : InstRW<[A9WriteLbsi], (instregex "LDRrs")>;
 
-def : WriteRes<WriteDiv, []> { let Latency = 0; }
+def : WriteRes<WriteDIV, []> { let Latency = 0; }
 
 def : WriteRes<WriteBr, [A9UnitB]>;
 def : WriteRes<WriteBrL, [A9UnitB]>;
diff --git a/lib/Target/ARM/ARMScheduleR52.td b/lib/Target/ARM/ARMScheduleR52.td
index 3e684ed971301f303aa921d302961254eafb7fd8..537e5da9669f39f873be0c562eeb06dc4e143b58 100644
--- a/lib/Target/ARM/ARMScheduleR52.td
+++ b/lib/Target/ARM/ARMScheduleR52.td
@@ -70,15 +70,13 @@ def : WriteRes<WriteCMP, [R52UnitALU]> { let Latency = 0; }
 def : WriteRes<WriteCMPsi, [R52UnitALU]> { let Latency = 0; }
 def : WriteRes<WriteCMPsr, [R52UnitALU]> { let Latency = 0; }
 
+// Multiply - aliased to sub-target specific later
+
 // Div - may stall 0-9 cycles depending on input (i.e. WRI+(0-9)/2)
-def : WriteRes<WriteDiv, [R52UnitDiv]> {
-  let Latency = 8; let ResourceCycles = [8]; // not pipelined
+def : WriteRes<WriteDIV, [R52UnitDiv]> {
+  let Latency = 8; let ResourceCycles = [8]; // non-pipelined
 }
 
-// Loads
-def : WriteRes<WriteLd, [R52UnitLd]> { let Latency = 4; }
-def : WriteRes<WritePreLd, [R52UnitLd]> { let Latency = 4; }
-
 // Branches  - LR written in Late EX2
 def : WriteRes<WriteBr, [R52UnitB]> { let Latency = 0; }
 def : WriteRes<WriteBrL, [R52UnitB]> { let Latency = 0; }
@@ -90,7 +88,8 @@ def : WriteRes<WriteNoop, []> { let Latency = 0; let NumMicroOps = 0; }
 // Integer pipeline by-passes
 def : ReadAdvance<ReadALU, 1>;   // Operand needed in EX1 stage
 def : ReadAdvance<ReadALUsr, 0>; // Shift operands needed in ISS
-
+def : ReadAdvance<ReadMUL, 0>;
+def : ReadAdvance<ReadMAC, 0>;
 
 // Floating-point. Map target-defined SchedReadWrites to subtarget
 def : WriteRes<WriteFPMUL32, [R52UnitFPMUL]> { let Latency = 6; }
@@ -124,7 +123,6 @@ def : WriteRes<WriteFPSQRT64, [R52UnitDiv]> { let Latency = 17; }
 def : ReadAdvance<ReadFPMUL, 1>; // mul operand read in F1
 def : ReadAdvance<ReadFPMAC, 1>; // fp-mac operand read in F1
 
-
 //===----------------------------------------------------------------------===//
 // Subtarget-specific SchedReadWrites.
 
@@ -139,6 +137,9 @@ def : ReadAdvance<R52Read_F2, 2>;
 
 // Cortex-R52 specific SchedWrites for use with InstRW
 def R52WriteMAC        : SchedWriteRes<[R52UnitMAC]> { let Latency = 4; }
+def R52WriteMACHi      : SchedWriteRes<[R52UnitMAC]> {
+  let Latency = 4; let NumMicroOps = 0;
+}
 def R52WriteDIV        : SchedWriteRes<[R52UnitDiv]> {
   let Latency = 8; let ResourceCycles = [8]; // not pipelined
 }
@@ -153,6 +154,19 @@ def R52WriteALU_WRI    : SchedWriteRes<[R52UnitALU]> { let Latency = 4; }
 def R52WriteNoRSRC_EX2 : SchedWriteRes<[]> { let Latency = 3; }
 def R52WriteNoRSRC_WRI : SchedWriteRes<[]> { let Latency = 4; }
 
+// Alias generics to sub-target specific
+def : SchedAlias<WriteMUL16, R52WriteMAC>;
+def : SchedAlias<WriteMUL32, R52WriteMAC>;
+def : SchedAlias<WriteMUL64Lo, R52WriteMAC>;
+def : SchedAlias<WriteMUL64Hi, R52WriteMACHi>;
+def : SchedAlias<WriteMAC16, R52WriteMAC>;
+def : SchedAlias<WriteMAC32, R52WriteMAC>;
+def : SchedAlias<WriteMAC64Lo, R52WriteMAC>;
+def : SchedAlias<WriteMAC64Hi, R52WriteMACHi>;
+def : SchedAlias<WritePreLd, R52WriteLd>;
+def : SchedAlias<WriteLd, R52WriteLd>;
+def : SchedAlias<WriteST, R52WriteST>;
+
 def R52WriteFPALU_F3   : SchedWriteRes<[R52UnitFPALU]> { let Latency = 4; }
 def R52Write2FPALU_F3  : SchedWriteRes<[R52UnitFPALU, R52UnitFPALU]> {
   let Latency = 4;
@@ -266,7 +280,7 @@ def : InstRW<[R52WriteMAC, R52Read_ISS, R52Read_ISS, R52Read_ISS],
       "t2SMLSLD", "t2SMLSLDX", "t2UMAAL")>;
 
 def : InstRW <[R52WriteDIV, R52Read_ISS, R52Read_ISS],
-      (instregex "SDIV", "UDIV", "t2SDIV", "t2UDIV")>;
+      (instregex "t2SDIV", "t2UDIV")>;
 
 // Loads (except POST) with SHL > 2, or ror, require 2 extra cycles.
 // However, that's non-trivial to specify, so we keep it uniform
@@ -325,15 +339,6 @@ def : InstRW<[R52WriteCC, R52Read_ISS], (instregex "TST")>;
 def : InstRW<[R52WriteLd], (instregex "MRS", "MRSbanked")>;
 def : InstRW<[R52WriteLd, R52Read_EX1], (instregex "MSR", "MSRbanked")>;
 
-//def : InstRW<[R52WriteLd, R52Read_ISS], (instregex "^LDRB?(_PRE_IMM|_POST_IMM)", "LDRrs")>;
-//def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_ISS], (instregex "^LDRB?_PRE_REG", "LDRB?rr")>;
-//def : InstRW<[R52WriteLd, R52Read_ISS, R52Read_ISS], (instregex "^LDRB?_POST_REG")>;
-
-//def : InstRW<[R52WriteST, R52Read_ISS], (instregex "STRi12", "PICSTR")>;
-//def : InstRW<[R52WriteST, R52WriteAdr, R52Read_ISS, R52Read_EX2], (instregex "t2STRB?_PRE_REG", "STRB?_PRE_REG")>;
-//def : InstRW<[R52WriteST, R52WriteAdr, R52Read_ISS, R52Read_EX2], (instregex "t2STRB?_POST_REG", "STRB?_POST_REG")>;
-
-
 // Integer Load, Multiple.
 foreach Lat = 3-25 in {
   def R52WriteILDM#Lat#Cy : SchedWriteRes<[R52UnitLd]> {
@@ -712,16 +717,19 @@ def R52WriteVLD2Mem  : SchedWriteRes<[R52UnitLd]> {
   let Latency = 6;
   let NumMicroOps = 3;
   let ResourceCycles = [2];
+  let SingleIssue = 1;
 }
 def R52WriteVLD3Mem  : SchedWriteRes<[R52UnitLd]> {
   let Latency = 7;
   let NumMicroOps = 5;
   let ResourceCycles = [3];
+  let SingleIssue = 1;
 }
 def R52WriteVLD4Mem  : SchedWriteRes<[R52UnitLd]> {
   let Latency = 8;
   let NumMicroOps = 7;
   let ResourceCycles = [4];
+  let SingleIssue = 1;
 }
 def R52WriteVST1Mem  : SchedWriteRes<[R52UnitLd]> {
   let Latency = 5;
diff --git a/lib/Target/ARM/ARMScheduleSwift.td b/lib/Target/ARM/ARMScheduleSwift.td
index 6c7573b3890523f958d1783347d7aa8c27012ebc..dc041c6c6006bd3dd82f0c5c467bcee987468c4f 100644
--- a/lib/Target/ARM/ARMScheduleSwift.td
+++ b/lib/Target/ARM/ARMScheduleSwift.td
@@ -133,6 +133,8 @@ let SchedModel = SwiftModel in {
   def : SchedAlias<WriteALUSsr, SwiftWriteALUSsr>;
   def : ReadAdvance<ReadALU, 0>;
   def : SchedAlias<ReadALUsr, SwiftReadAdvanceALUsr>;
+  def : SchedAlias<WriteLd, SwiftWriteP2ThreeCycle>;
+  def : SchedAlias<WriteST, SwiftWriteP2>;
 
 
   def SwiftChooseShiftKindP01OneOrTwoCycle : SchedWriteVariant<[
@@ -166,10 +168,10 @@ let SchedModel = SwiftModel in {
   def : InstRW<[SwiftWriteP01OneCycle2x_load],
         (instregex "MOV_ga_pcrel_ldr", "t2MOV_ga_pcrel_ldr")>;
 
-  def SwiftWriteP0TwoCyleTwoUops : WriteSequence<[SwiftWriteP0OneCycle], 2>;
+  def SwiftWriteP0TwoCycleTwoUops : WriteSequence<[SwiftWriteP0OneCycle], 2>;
 
   def SwiftPredP0OneOrTwoCycle : SchedWriteVariant<[
-    SchedVar<IsPredicatedPred, [ SwiftWriteP0TwoCyleTwoUops ]>,
+    SchedVar<IsPredicatedPred, [ SwiftWriteP0TwoCycleTwoUops ]>,
     SchedVar<NoSchedPred,     [ SwiftWriteP0OneCycle ]>
   ]>;
 
@@ -282,6 +284,18 @@ let SchedModel = SwiftModel in {
     let ResourceCycles = [2, 3];
   }
 
+  // Alias generic SchedReadWrites to sub-target specific ones
+  def : SchedAlias<WriteMUL16, SwiftWriteP0FourCycle>;
+  def : SchedAlias<WriteMUL32, SwiftWriteP0FourCycle>;
+  def : SchedAlias<WriteMUL64Lo, SwiftP0P0P01FiveCycle>;
+  def : SchedAlias<WriteMUL64Hi, SwiftWrite5Cycle>;
+  def : SchedAlias<WriteMAC16, SwiftPredP0P01FourFiveCycle>;
+  def : SchedAlias<WriteMAC32, SwiftPredP0P01FourFiveCycle>;
+  def : SchedAlias<WriteMAC64Lo, SwiftWrite5Cycle>;
+  def : SchedAlias<WriteMAC64Hi, Swift2P03P01FiveCycle>;
+  def : ReadAdvance<ReadMUL, 0>;
+  def : SchedAlias<ReadMAC, SwiftReadAdvanceFourCyclesPred>;
+
   // 4.2.15 Integer Multiply Accumulate, Long
   // 4.2.16 Integer Multiply Accumulate, Dual
   // 4.2.17 Integer Multiply Accumulate Accumulate, Long
@@ -300,7 +314,7 @@ let SchedModel = SwiftModel in {
     let ResourceCycles = [1, 14];
   }
   // 4.2.18 Integer Divide
-  def : WriteRes<WriteDiv, [SwiftUnitDiv]>; // Workaround.
+  def : WriteRes<WriteDIV, [SwiftUnitDiv]>; // Workaround.
   def : InstRW <[SwiftDiv],
         (instregex "SDIV", "UDIV", "t2SDIV", "t2UDIV")>;
 
@@ -310,7 +324,7 @@ let SchedModel = SwiftModel in {
     let Latency = 3;
     let NumMicroOps = 2;
   }
-  def SwiftWriteP2P01FourCyle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
+  def SwiftWriteP2P01FourCycle : SchedWriteRes<[SwiftUnitP2, SwiftUnitP01]> {
     let Latency = 4;
     let NumMicroOps = 2;
   }
@@ -343,7 +357,7 @@ let SchedModel = SwiftModel in {
         "tLDR(r|i|spi|pci|pciASM)")>;
   def : InstRW<[SwiftWriteP2ThreeCycle],
         (instregex "LDRH$",  "PICLDR$", "PICLDR(H|B)$", "LDRcp$")>;
-  def : InstRW<[SwiftWriteP2P01FourCyle],
+  def : InstRW<[SwiftWriteP2P01FourCycle],
         (instregex "PICLDRS(H|B)$", "t2LDRS(H|B)(i|r|p|s)", "LDRS(H|B)$",
         "t2LDRpci_pic", "tLDRS(B|H)")>;
   def : InstRW<[SwiftWriteP2P01ThreeCycle,  SwiftWrBackOne],
diff --git a/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index 3b99762f7157a353ed375f44a8fe733c0d062b66..33dcf9b8fef02dac43f2bae53d68134359ab53ea 100644
--- a/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -95,7 +95,7 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
 
     Entry.Node = Src; 
     Entry.Ty = Type::getInt32Ty(*DAG.getContext());
-    Entry.isSExt = false;
+    Entry.IsSExt = false;
     Args.push_back(Entry);
   } else {
     Entry.Node = Src;
@@ -114,11 +114,11 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl)
       .setChain(Chain)
-      .setCallee(
-           TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
-           DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
-                                 TLI->getPointerTy(DAG.getDataLayout())),
-           std::move(Args))
+      .setLibCallee(
+          TLI->getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()),
+          DAG.getExternalSymbol(FunctionNames[AEABILibcall][AlignVariant],
+                                TLI->getPointerTy(DAG.getDataLayout())),
+          std::move(Args))
       .setDiscardResult();
   std::pair<SDValue,SDValue> CallResult = TLI->LowerCallTo(CLI);
   
@@ -198,17 +198,18 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
     return Chain;
 
   // Issue loads / stores for the trailing (1 - 3) bytes.
+  auto getRemainingValueType = [](unsigned BytesLeft) {
+    return (BytesLeft >= 2) ? MVT::i16 : MVT::i8;
+  };
+  auto getRemainingSize = [](unsigned BytesLeft) {
+    return (BytesLeft >= 2) ? 2 : 1;
+  };
+
   unsigned BytesLeftSave = BytesLeft;
   i = 0;
   while (BytesLeft) {
-    if (BytesLeft >= 2) {
-      VT = MVT::i16;
-      VTSize = 2;
-    } else {
-      VT = MVT::i8;
-      VTSize = 1;
-    }
-
+    VT = getRemainingValueType(BytesLeft);
+    VTSize = getRemainingSize(BytesLeft);
     Loads[i] = DAG.getLoad(VT, dl, Chain,
                            DAG.getNode(ISD::ADD, dl, MVT::i32, Src,
                                        DAG.getConstant(SrcOff, dl, MVT::i32)),
@@ -224,14 +225,8 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
   i = 0;
   BytesLeft = BytesLeftSave;
   while (BytesLeft) {
-    if (BytesLeft >= 2) {
-      VT = MVT::i16;
-      VTSize = 2;
-    } else {
-      VT = MVT::i8;
-      VTSize = 1;
-    }
-
+    VT = getRemainingValueType(BytesLeft);
+    VTSize = getRemainingSize(BytesLeft);
     TFOps[i] = DAG.getStore(Chain, dl, Loads[i],
                             DAG.getNode(ISD::ADD, dl, MVT::i32, Dst,
                                         DAG.getConstant(DstOff, dl, MVT::i32)),
diff --git a/lib/Target/ARM/ARMSubtarget.cpp b/lib/Target/ARM/ARMSubtarget.cpp
index 5596478a3b2be287765e6ed1067d9a20270deec6..b8a708a20a9556343956de5dbfe51ab1ba84d0f2 100644
--- a/lib/Target/ARM/ARMSubtarget.cpp
+++ b/lib/Target/ARM/ARMSubtarget.cpp
@@ -202,12 +202,12 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
   // support in the assembler and linker to be used. This would need to be
   // fixed to fully support tail calls in Thumb1.
   //
-  // Doing this is tricky, since the LDM/POP instruction on Thumb doesn't take
-  // LR.  This means if we need to reload LR, it takes an extra instructions,
-  // which outweighs the value of the tail call; but here we don't know yet
-  // whether LR is going to be used.  Probably the right approach is to
-  // generate the tail call here and turn it back into CALL/RET in
-  // emitEpilogue if LR is used.
+  // For ARMv8-M, we /do/ implement tail calls.  Doing this is tricky for v8-M
+  // baseline, since the LDM/POP instruction on Thumb doesn't take LR.  This
+  // means if we need to reload LR, it takes extra instructions, which outweighs
+  // the value of the tail call; but here we don't know yet whether LR is going
+  // to be used. We generate the tail call here and turn it back into CALL/RET
+  // in emitEpilogue if LR is used.
 
   // Thumb1 PIC calls to external symbols use BX, so they can be tail calls,
   // but we need to make sure there are enough registers; the only valid
@@ -276,6 +276,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
   case CortexM3:
   case ExynosM1:
   case CortexR52:
+  case Kryo:
     break;
   case Krait:
     PreISelOperandLatencyAdjustment = 1;
diff --git a/lib/Target/ARM/ARMSubtarget.h b/lib/Target/ARM/ARMSubtarget.h
index cc172ea68c38d4b27dbe16abe3702c03adaf5e36..40993fc0aa8acdea1ca8d93c4eb7a0b587dbcfd0 100644
--- a/lib/Target/ARM/ARMSubtarget.h
+++ b/lib/Target/ARM/ARMSubtarget.h
@@ -41,18 +41,66 @@ class StringRef;
 class ARMSubtarget : public ARMGenSubtargetInfo {
 protected:
   enum ARMProcFamilyEnum {
-    Others, CortexA5, CortexA7, CortexA8, CortexA9, CortexA12, CortexA15,
-    CortexA17, CortexR4, CortexR4F, CortexR5, CortexR7, CortexR52, CortexM3,
-    CortexA32, CortexA35, CortexA53, CortexA57, CortexA72, CortexA73,
-    Krait, Swift, ExynosM1
+    Others,
+
+    CortexA12,
+    CortexA15,
+    CortexA17,
+    CortexA32,
+    CortexA35,
+    CortexA5,
+    CortexA53,
+    CortexA57,
+    CortexA7,
+    CortexA72,
+    CortexA73,
+    CortexA8,
+    CortexA9,
+    CortexM3,
+    CortexR4,
+    CortexR4F,
+    CortexR5,
+    CortexR52,
+    CortexR7,
+    ExynosM1,
+    Krait,
+    Kryo,
+    Swift
   };
   enum ARMProcClassEnum {
-    None, AClass, RClass, MClass
+    None,
+
+    AClass,
+    MClass,
+    RClass
   };
   enum ARMArchEnum {
-    ARMv2, ARMv2a, ARMv3, ARMv3m, ARMv4, ARMv4t, ARMv5, ARMv5t, ARMv5te,
-    ARMv5tej, ARMv6, ARMv6k, ARMv6kz, ARMv6t2, ARMv6m, ARMv6sm, ARMv7a, ARMv7r,
-    ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a, ARMv8mMainline, ARMv8mBaseline,
+    ARMv2,
+    ARMv2a,
+    ARMv3,
+    ARMv3m,
+    ARMv4,
+    ARMv4t,
+    ARMv5,
+    ARMv5t,
+    ARMv5te,
+    ARMv5tej,
+    ARMv6,
+    ARMv6k,
+    ARMv6kz,
+    ARMv6m,
+    ARMv6sm,
+    ARMv6t2,
+    ARMv7a,
+    ARMv7em,
+    ARMv7m,
+    ARMv7r,
+    ARMv7ve,
+    ARMv81a,
+    ARMv82a,
+    ARMv8a,
+    ARMv8mBaseline,
+    ARMv8mMainline,
     ARMv8r
   };
 
@@ -166,10 +214,6 @@ protected:
   /// HasHardwareDivideInARM - True if subtarget supports [su]div in ARM mode
   bool HasHardwareDivideInARM = false;
 
-  /// HasT2ExtractPack - True if subtarget supports thumb2 extract/pack
-  /// instructions.
-  bool HasT2ExtractPack = false;
-
   /// HasDataBarrier - True if the subtarget supports DMB / DSB data barrier
   /// instructions.
   bool HasDataBarrier = false;
@@ -308,6 +352,10 @@ protected:
   /// UseSjLjEH - If true, the target uses SjLj exception handling (e.g. iOS).
   bool UseSjLjEH = false;
 
+  /// Implicitly convert an instruction to a different one if its immediates
+  /// cannot be encoded. For example, ADD r0, r1, #FFFFFFFF -> SUB r0, r1, #1.
+  bool NegativeImmediates = true;
+
   /// stackAlignment - The minimum alignment known to hold of the stack frame on
   /// entry to the function and which must be maintained by every function.
   unsigned stackAlignment = 4;
@@ -461,7 +509,6 @@ public:
 
   bool hasDivide() const { return HasHardwareDivide; }
   bool hasDivideInARMMode() const { return HasHardwareDivideInARM; }
-  bool hasT2ExtractPack() const { return HasT2ExtractPack; }
   bool hasDataBarrier() const { return HasDataBarrier; }
   bool hasV7Clrex() const { return HasV7Clrex; }
   bool hasAcquireRelease() const { return HasAcquireRelease; }
diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp
index 4ee070ebb7a9ad94b3979b5bb462c29c03c49518..b8dadb331ecf6033e4436036d796f155452ae3be 100644
--- a/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/lib/Target/ARM/ARMTargetMachine.cpp
@@ -25,6 +25,7 @@
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/ExecutionDepsFix.h"
 #include "llvm/CodeGen/GlobalISel/CallLowering.h"
 #include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
@@ -76,6 +77,10 @@ static cl::opt<cl::boolOrDefault>
 EnableGlobalMerge("arm-global-merge", cl::Hidden,
                   cl::desc("Enable the global merge pass"));
 
+namespace llvm {
+  void initializeARMExecutionDepsFixPass(PassRegistry&);
+}
+
 extern "C" void LLVMInitializeARMTarget() {
   // Register the target.
   RegisterTargetMachine<ARMLETargetMachine> X(getTheARMLETarget());
@@ -87,6 +92,8 @@ extern "C" void LLVMInitializeARMTarget() {
   initializeGlobalISel(Registry);
   initializeARMLoadStoreOptPass(Registry);
   initializeARMPreAllocLoadStoreOptPass(Registry);
+  initializeARMConstantIslandsPass(Registry);
+  initializeARMExecutionDepsFixPass(Registry);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -325,7 +332,7 @@ ARMBaseTargetMachine::getSubtargetImpl(const Function &F) const {
 #else
     ARMGISelActualAccessor *GISel = new ARMGISelActualAccessor();
     GISel->CallLoweringInfo.reset(new ARMCallLowering(*I->getTargetLowering()));
-    GISel->Legalizer.reset(new ARMLegalizerInfo());
+    GISel->Legalizer.reset(new ARMLegalizerInfo(*I));
 
     auto *RBI = new ARMRegisterBankInfo(*I->getRegisterInfo());
 
@@ -440,8 +447,21 @@ public:
   void addPreEmitPass() override;
 };
 
+class ARMExecutionDepsFix : public ExecutionDepsFix {
+public:
+  static char ID;
+  ARMExecutionDepsFix() : ExecutionDepsFix(ID, ARM::DPRRegClass) {}
+  StringRef getPassName() const override {
+    return "ARM Execution Dependency Fix";
+  }
+};
+char ARMExecutionDepsFix::ID;
+
 } // end anonymous namespace
 
+INITIALIZE_PASS(ARMExecutionDepsFix, "arm-execution-deps-fix",
+                "ARM Execution Dependency Fix", false, false)
+
 TargetPassConfig *ARMBaseTargetMachine::createPassConfig(PassManagerBase &PM) {
   return new ARMPassConfig(this, PM);
 }
@@ -535,7 +555,7 @@ void ARMPassConfig::addPreSched2() {
     if (EnableARMLoadStoreOpt)
       addPass(createARMLoadStoreOptimizationPass());
 
-    addPass(createExecutionDependencyFixPass(&ARM::DPRRegClass));
+    addPass(new ARMExecutionDepsFix());
   }
 
   // Expand some pseudo instructions into multiple instructions to allow
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.cpp b/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 2b6b36bc3e6831b550c56f441e10da51a07391f6..8eb9dbf5f9de69791dafd346625941a5e958fd83 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -92,7 +92,8 @@ int ARMTTIImpl::getIntImmCost(unsigned Opcode, unsigned Idx, const APInt &Imm,
 }
 
 
-int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                                 const Instruction *I) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
@@ -310,7 +311,8 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
   return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
 }
 
-int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                                   const Instruction *I) {
 
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   // On NEON a a vector select gets lowered to vbsl.
@@ -335,7 +337,7 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
     return LT.first;
   }
 
-  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
 }
 
 int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
@@ -504,7 +506,7 @@ int ARMTTIImpl::getArithmeticInstrCost(
 }
 
 int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                                unsigned AddressSpace) {
+                                unsigned AddressSpace, const Instruction *I) {
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
 
   if (Src->isVectorTy() && Alignment != 16 &&
@@ -529,12 +531,14 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
 
   if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits) {
     unsigned NumElts = VecTy->getVectorNumElements();
-    Type *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
-    unsigned SubVecSize = DL.getTypeSizeInBits(SubVecTy);
+    auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
 
     // vldN/vstN only support legal vector types of size 64 or 128 in bits.
-    if (NumElts % Factor == 0 && (SubVecSize == 64 || SubVecSize == 128))
-      return Factor;
+    // Accesses having vector types that are a multiple of 128 bits can be
+    // matched to more than one vldN/vstN instruction.
+    if (NumElts % Factor == 0 &&
+        TLI->isLegalInterleavedAccessType(SubVecTy, DL))
+      return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
diff --git a/lib/Target/ARM/ARMTargetTransformInfo.h b/lib/Target/ARM/ARMTargetTransformInfo.h
index 5ee1f7c4b0e4a7a85c7f441dd84dbdc8822364c7..7de0543dfa5e0f438f34750ecb78da02e56c3c19 100644
--- a/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -94,9 +94,11 @@ public:
 
   int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
 
-  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                       const Instruction *I = nullptr);
 
-  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                         const Instruction *I = nullptr);
 
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
 
@@ -114,7 +116,7 @@ public:
       ArrayRef<const Value *> Args = ArrayRef<const Value *>());
 
   int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                      unsigned AddressSpace);
+                      unsigned AddressSpace, const Instruction *I = nullptr);
 
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
                                  ArrayRef<unsigned> Indices, unsigned Alignment,
diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index c243a2d359798716426815a5f0578bb9e55e425b..f421d3ac1693b03d67ef2e1e97d4e986e4f007b0 100644
--- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -915,40 +915,37 @@ public:
     int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue()));
     return Val != -1;
   }
-  bool isFBits16() const {
+
+  template<int64_t N, int64_t M>
+  bool isImmediate() const {
     if (!isImm()) return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     if (!CE) return false;
     int64_t Value = CE->getValue();
-    return Value >= 0 && Value <= 16;
+    return Value >= N && Value <= M;
   }
-  bool isFBits32() const {
+  template<int64_t N, int64_t M>
+  bool isImmediateS4() const {
     if (!isImm()) return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
     if (!CE) return false;
     int64_t Value = CE->getValue();
-    return Value >= 1 && Value <= 32;
+    return ((Value & 3) == 0) && Value >= N && Value <= M;
+  }
+  bool isFBits16() const {
+    return isImmediate<0, 16>(); // bounds are inclusive: fbits in [0,16]
+  }
+  bool isFBits32() const {
+    return isImmediate<1, 32>(); // bounds are inclusive: fbits in [1,32]
   }
   bool isImm8s4() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return ((Value & 3) == 0) && Value >= -1020 && Value <= 1020;
+    return isImmediateS4<-1020, 1020>();
   }
   bool isImm0_1020s4() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return ((Value & 3) == 0) && Value >= 0 && Value <= 1020;
+    return isImmediateS4<0, 1020>();
   }
   bool isImm0_508s4() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return ((Value & 3) == 0) && Value >= 0 && Value <= 508;
+    return isImmediateS4<0, 508>();
   }
   bool isImm0_508s4Neg() const {
     if (!isImm()) return false;
@@ -958,27 +955,6 @@ public:
     // explicitly exclude zero. we want that to use the normal 0_508 version.
     return ((Value & 3) == 0) && Value > 0 && Value <= 508;
   }
-  bool isImm0_239() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value >= 0 && Value < 240;
-  }
-  bool isImm0_255() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value >= 0 && Value < 256;
-  }
-  bool isImm0_4095() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value >= 0 && Value < 4096;
-  }
   bool isImm0_4095Neg() const {
     if (!isImm()) return false;
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
@@ -986,145 +962,17 @@ public:
     int64_t Value = -CE->getValue();
     return Value > 0 && Value < 4096;
   }
-  bool isImm0_1() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value >= 0 && Value < 2;
-  }
-  bool isImm0_3() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value >= 0 && Value < 4;
-  }
   bool isImm0_7() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value >= 0 && Value < 8;
-  }
-  bool isImm0_15() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value >= 0 && Value < 16;
-  }
-  bool isImm0_31() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value >= 0 && Value < 32;
-  }
-  bool isImm0_63() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value >= 0 && Value < 64;
-  }
-  bool isImm8() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value == 8;
-  }
-  bool isImm16() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value == 16;
-  }
-  bool isImm32() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value == 32;
-  }
-  bool isShrImm8() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value > 0 && Value <= 8;
-  }
-  bool isShrImm16() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value > 0 && Value <= 16;
-  }
-  bool isShrImm32() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value > 0 && Value <= 32;
-  }
-  bool isShrImm64() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value > 0 && Value <= 64;
-  }
-  bool isImm1_7() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value > 0 && Value < 8;
-  }
-  bool isImm1_15() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value > 0 && Value < 16;
-  }
-  bool isImm1_31() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value > 0 && Value < 32;
+    return isImmediate<0, 7>();
   }
   bool isImm1_16() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value > 0 && Value < 17;
+    return isImmediate<1, 16>();
   }
   bool isImm1_32() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value > 0 && Value < 33;
-  }
-  bool isImm0_32() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value >= 0 && Value < 33;
+    return isImmediate<1, 32>();
   }
-  bool isImm0_65535() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value >= 0 && Value < 65536;
+  bool isImm8_255() const {
+    return isImmediate<8, 255>();
   }
   bool isImm256_65535Expr() const {
     if (!isImm()) return false;
@@ -1145,32 +993,16 @@ public:
     return Value >= 0 && Value < 65536;
   }
   bool isImm24bit() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value >= 0 && Value <= 0xffffff;
+    return isImmediate<0, 0xffffff>(); // upper bound is inclusive
   }
   bool isImmThumbSR() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value > 0 && Value < 33;
+    return isImmediate<1, 32>(); // inclusive bounds: shift amount in [1,32]
   }
   bool isPKHLSLImm() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value >= 0 && Value < 32;
+    return isImmediate<0, 31>(); // inclusive bounds: LSL amount in [0,31]
   }
   bool isPKHASRImm() const {
-    if (!isImm()) return false;
-    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    if (!CE) return false;
-    int64_t Value = CE->getValue();
-    return Value > 0 && Value <= 32;
+    return isImmediate<1, 32>(); // inclusive bounds: ASR amount in [1,32]
   }
   bool isAdrLabel() const {
     // If we have an immediate that's not a constant, treat it as a label
@@ -1245,6 +1077,20 @@ public:
     return ARM_AM::getSOImmVal(Value) == -1 &&
       ARM_AM::getSOImmVal(-Value) != -1;
   }
+  bool isThumbModImmNeg1_7() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int32_t Value = -(int32_t)CE->getValue();
+    return 0 < Value && Value < 8;
+  }
+  bool isThumbModImmNeg8_255() const {
+    if (!isImm()) return false;
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    if (!CE) return false;
+    int32_t Value = -(int32_t)CE->getValue();
+    return 7 < Value && Value < 256;
+  }
   bool isConstantPoolImm() const { return Kind == k_ConstantPoolImmediate; }
   bool isBitfield() const { return Kind == k_BitfieldDescriptor; }
   bool isPostIdxRegShifted() const { return Kind == k_PostIndexRegister; }
@@ -2035,6 +1881,20 @@ public:
     Inst.addOperand(MCOperand::createImm(Enc));
   }
 
+  void addThumbModImmNeg8_255Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    uint32_t Val = -CE->getValue();
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
+  void addThumbModImmNeg1_7Operands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+    uint32_t Val = -CE->getValue();
+    Inst.addOperand(MCOperand::createImm(Val));
+  }
+
   void addBitfieldOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     // Munge the lsb/width into a bitfield mask.
@@ -2141,7 +2001,7 @@ public:
     // The operand is actually a t2_so_imm, but we have its bitwise
     // negation in the assembly source, so twiddle it here.
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    Inst.addOperand(MCOperand::createImm(~CE->getValue()));
+    Inst.addOperand(MCOperand::createImm(~(uint32_t)CE->getValue()));
   }
 
   void addT2SOImmNegOperands(MCInst &Inst, unsigned N) const {
@@ -2149,7 +2009,7 @@ public:
     // The operand is actually a t2_so_imm, but we have its
     // negation in the assembly source, so twiddle it here.
     const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
-    Inst.addOperand(MCOperand::createImm(-CE->getValue()));
+    Inst.addOperand(MCOperand::createImm(-(uint32_t)CE->getValue()));
   }
 
   void addImm0_4095NegOperands(MCInst &Inst, unsigned N) const {
@@ -4330,7 +4190,7 @@ ARMAsmParser::parseMSRMaskOperand(OperandVector &Operands) {
 
       // If some specific flag is already set, it means that some letter is
       // present more than once, this is not acceptable.
-      if (FlagsVal == ~0U || (FlagsVal & Flag))
+      if (Flag == ~0U || (FlagsVal & Flag))
         return MatchOperand_NoMatch;
       FlagsVal |= Flag;
     }
@@ -5484,7 +5344,8 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
   enum {
     COFF = (1 << MCObjectFileInfo::IsCOFF),
     ELF = (1 << MCObjectFileInfo::IsELF),
-    MACHO = (1 << MCObjectFileInfo::IsMachO)
+    MACHO = (1 << MCObjectFileInfo::IsMachO),
+    WASM = (1 << MCObjectFileInfo::IsWasm),
   };
   static const struct PrefixEntry {
     const char *Spelling;
@@ -5518,6 +5379,9 @@ bool ARMAsmParser::parsePrefix(ARMMCExpr::VariantKind &RefKind) {
   case MCObjectFileInfo::IsCOFF:
     CurrentFormat = COFF;
     break;
+  case MCObjectFileInfo::IsWasm:
+    CurrentFormat = WASM;
+    break;
   }
 
   if (~Prefix->SupportedFormats & CurrentFormat) {
@@ -6301,10 +6165,6 @@ bool ARMAsmParser::validatetLDMRegList(const MCInst &Inst,
   else if (ListContainsPC && ListContainsLR)
     return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
                  "PC and LR may not be in the register list simultaneously");
-  else if (inITBlock() && !lastInITBlock() && ListContainsPC)
-    return Error(Operands[ListNo + HasWritebackToken]->getStartLoc(),
-                 "instruction must be outside of IT block or the last "
-                 "instruction in an IT block");
   return false;
 }
 
@@ -6366,6 +6226,12 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     return Warning(Loc, "predicated instructions should be in IT block");
   }
 
+  // PC-setting instructions in an IT block, but not the last instruction of
+  // the block, are UNPREDICTABLE.
+  if (inExplicitITBlock() && !lastInITBlock() && isITBlockTerminator(Inst)) {
+    return Error(Loc, "instruction must be outside of IT block or the last instruction in an IT block");
+  }
+
   const unsigned Opcode = Inst.getOpcode();
   switch (Opcode) {
   case ARM::LDRD:
@@ -6676,6 +6542,7 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
     break;
   }
   case ARM::MOVi16:
+  case ARM::MOVTi16:
   case ARM::t2MOVi16:
   case ARM::t2MOVTi16:
     {
@@ -8232,7 +8099,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
   case ARM::t2LSRri:
   case ARM::t2ASRri: {
     if (isARMLowRegister(Inst.getOperand(0).getReg()) &&
-        Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() &&
+        isARMLowRegister(Inst.getOperand(1).getReg()) &&
         Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) &&
         !(static_cast<ARMOperand &>(*Operands[3]).isToken() &&
           static_cast<ARMOperand &>(*Operands[3]).getToken() == ".w")) {
@@ -8307,23 +8174,38 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
       isNarrow = true;
     MCInst TmpInst;
     unsigned newOpc;
-    switch(ARM_AM::getSORegShOp(Inst.getOperand(2).getImm())) {
-    default: llvm_unreachable("unexpected opcode!");
-    case ARM_AM::asr: newOpc = isNarrow ? ARM::tASRri : ARM::t2ASRri; break;
-    case ARM_AM::lsr: newOpc = isNarrow ? ARM::tLSRri : ARM::t2LSRri; break;
-    case ARM_AM::lsl: newOpc = isNarrow ? ARM::tLSLri : ARM::t2LSLri; break;
-    case ARM_AM::ror: newOpc = ARM::t2RORri; isNarrow = false; break;
-    case ARM_AM::rrx: isNarrow = false; newOpc = ARM::t2RRX; break;
-    }
+    unsigned Shift = ARM_AM::getSORegShOp(Inst.getOperand(2).getImm());
     unsigned Amount = ARM_AM::getSORegOffset(Inst.getOperand(2).getImm());
+    bool isMov = false;
+    // MOV rd, rm, LSL #0 is actually a MOV instruction
+    if (Shift == ARM_AM::lsl && Amount == 0) {
+      isMov = true;
+      // The 16-bit encoding of MOV rd, rm, LSL #N is explicitly encoding T2 of
+      // MOV (register) in the ARMv8-A and ARMv8-M manuals, and immediate 0 is
+      // unpredictable in an IT block so the 32-bit encoding T3 has to be used
+      // instead.
+      if (inITBlock()) {
+        isNarrow = false;
+      }
+      newOpc = isNarrow ? ARM::tMOVSr : ARM::t2MOVr;
+    } else {
+      switch(Shift) {
+      default: llvm_unreachable("unexpected opcode!");
+      case ARM_AM::asr: newOpc = isNarrow ? ARM::tASRri : ARM::t2ASRri; break;
+      case ARM_AM::lsr: newOpc = isNarrow ? ARM::tLSRri : ARM::t2LSRri; break;
+      case ARM_AM::lsl: newOpc = isNarrow ? ARM::tLSLri : ARM::t2LSLri; break;
+      case ARM_AM::ror: newOpc = ARM::t2RORri; isNarrow = false; break;
+      case ARM_AM::rrx: isNarrow = false; newOpc = ARM::t2RRX; break;
+      }
+    }
     if (Amount == 32) Amount = 0;
     TmpInst.setOpcode(newOpc);
     TmpInst.addOperand(Inst.getOperand(0)); // Rd
-    if (isNarrow)
+    if (isNarrow && !isMov)
       TmpInst.addOperand(MCOperand::createReg(
           Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0));
     TmpInst.addOperand(Inst.getOperand(1)); // Rn
-    if (newOpc != ARM::t2RRX)
+    if (newOpc != ARM::t2RRX && !isMov)
       TmpInst.addOperand(MCOperand::createImm(Amount));
     TmpInst.addOperand(Inst.getOperand(3)); // CondCode
     TmpInst.addOperand(Inst.getOperand(4));
@@ -8918,6 +8800,9 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
     if (isThumbTwo() && Inst.getOperand(OpNo).getReg() == ARM::CPSR &&
         inITBlock())
       return Match_RequiresNotITBlock;
+    // LSL with zero immediate is not allowed in an IT block
+    if (Opc == ARM::tLSLri && Inst.getOperand(3).getImm() == 0 && inITBlock())
+      return Match_RequiresNotITBlock;
   } else if (isThumbOne()) {
     // Some high-register supporting Thumb1 encodings only allow both registers
     // to be from r0-r7 when in Thumb2.
@@ -8932,6 +8817,22 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
       return Match_RequiresV6;
   }
 
+  // Before ARMv8 the rules for when SP is allowed in t2MOVr are more complex
+  // than the loop below can handle, so it uses the GPRnopc register class and
+  // we do SP handling here.
+  if (Opc == ARM::t2MOVr && !hasV8Ops())
+  {
+    // SP as both source and destination is not allowed
+    if (Inst.getOperand(0).getReg() == ARM::SP &&
+        Inst.getOperand(1).getReg() == ARM::SP)
+      return Match_RequiresV8;
+    // When flags-setting SP as either source or destination is not allowed
+    if (Inst.getOperand(4).getReg() == ARM::CPSR &&
+        (Inst.getOperand(0).getReg() == ARM::SP ||
+         Inst.getOperand(1).getReg() == ARM::SP))
+      return Match_RequiresV8;
+  }
+
   for (unsigned I = 0; I < MCID.NumOperands; ++I)
     if (MCID.OpInfo[I].RegClass == ARM::rGPRRegClassID) {
       // rGPRRegClass excludes PC, and also excluded SP before ARMv8
@@ -8945,7 +8846,7 @@ unsigned ARMAsmParser::checkTargetMatchPredicate(MCInst &Inst) {
 }
 
 namespace llvm {
-template <> inline bool IsCPSRDead<MCInst>(MCInst *Instr) {
+template <> inline bool IsCPSRDead<MCInst>(const MCInst *Instr) {
   return true; // In an assembly source, no need to second-guess
 }
 }
@@ -8975,6 +8876,7 @@ bool ARMAsmParser::isITBlockTerminator(MCInst &Inst) const {
   // operands. We only care about Thumb instructions here, as ARM instructions
   // obviously can't be in an IT block.
   switch (Inst.getOpcode()) {
+  case ARM::tLDMIA:
   case ARM::t2LDMIA:
   case ARM::t2LDMIA_UPD:
   case ARM::t2LDMDB:
@@ -9088,6 +8990,13 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   MatchResult = MatchInstruction(Operands, Inst, ErrorInfo, MatchingInlineAsm,
                                  PendConditionalInstruction, Out);
 
+  SMLoc ErrorLoc = IDLoc; // default to IDLoc if ErrorInfo names no operand
+  if (ErrorInfo < Operands.size()) {
+    ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
+    if (ErrorLoc == SMLoc())
+      ErrorLoc = IDLoc;
+  }
+
   switch (MatchResult) {
   case Match_Success:
     // Context sensitive operand constraints aren't handled by the matcher,
@@ -9177,16 +9086,52 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
     return Error(IDLoc, "instruction variant requires ARMv8 or later");
   case Match_RequiresFlagSetting:
     return Error(IDLoc, "no flag-preserving variant of this instruction available");
-  case Match_ImmRange0_15: {
-    SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
-    if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+  case Match_ImmRange0_1:
+    return Error(ErrorLoc, "immediate operand must be in the range [0,1]");
+  case Match_ImmRange0_3:
+    return Error(ErrorLoc, "immediate operand must be in the range [0,3]");
+  case Match_ImmRange0_7:
+    return Error(ErrorLoc, "immediate operand must be in the range [0,7]");
+  case Match_ImmRange0_15:
     return Error(ErrorLoc, "immediate operand must be in the range [0,15]");
-  }
-  case Match_ImmRange0_239: {
-    SMLoc ErrorLoc = ((ARMOperand &)*Operands[ErrorInfo]).getStartLoc();
-    if (ErrorLoc == SMLoc()) ErrorLoc = IDLoc;
+  case Match_ImmRange0_31:
+    return Error(ErrorLoc, "immediate operand must be in the range [0,31]");
+  case Match_ImmRange0_32:
+    return Error(ErrorLoc, "immediate operand must be in the range [0,32]");
+  case Match_ImmRange0_63:
+    return Error(ErrorLoc, "immediate operand must be in the range [0,63]");
+  case Match_ImmRange0_239:
     return Error(ErrorLoc, "immediate operand must be in the range [0,239]");
-  }
+  case Match_ImmRange0_255:
+    return Error(ErrorLoc, "immediate operand must be in the range [0,255]");
+  case Match_ImmRange0_4095:
+    return Error(ErrorLoc, "immediate operand must be in the range [0,4095]");
+  case Match_ImmRange0_65535:
+    return Error(ErrorLoc, "immediate operand must be in the range [0,65535]");
+  case Match_ImmRange1_7:
+    return Error(ErrorLoc, "immediate operand must be in the range [1,7]");
+  case Match_ImmRange1_8:
+    return Error(ErrorLoc, "immediate operand must be in the range [1,8]");
+  case Match_ImmRange1_15:
+    return Error(ErrorLoc, "immediate operand must be in the range [1,15]");
+  case Match_ImmRange1_16:
+    return Error(ErrorLoc, "immediate operand must be in the range [1,16]");
+  case Match_ImmRange1_31:
+    return Error(ErrorLoc, "immediate operand must be in the range [1,31]");
+  case Match_ImmRange1_32:
+    return Error(ErrorLoc, "immediate operand must be in the range [1,32]");
+  case Match_ImmRange1_64:
+    return Error(ErrorLoc, "immediate operand must be in the range [1,64]");
+  case Match_ImmRange8_8:
+    return Error(ErrorLoc, "immediate operand must be 8.");
+  case Match_ImmRange16_16:
+    return Error(ErrorLoc, "immediate operand must be 16.");
+  case Match_ImmRange32_32:
+    return Error(ErrorLoc, "immediate operand must be 32.");
+  case Match_ImmRange256_65535:
+    return Error(ErrorLoc, "immediate operand must be in the range [256,65535]");
+  case Match_ImmRange0_16777215:
+    return Error(ErrorLoc, "immediate operand must be in the range [0,0xffffff]");
   case Match_AlignedMemoryRequiresNone:
   case Match_DupAlignedMemoryRequiresNone:
   case Match_AlignedMemoryRequires16:
diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt
index 0c57a3e3166b7eccdee2b359869662b9e40e093e..1062c79432011a431f5aa1956ac2131c01aa721f 100644
--- a/lib/Target/ARM/CMakeLists.txt
+++ b/lib/Target/ARM/CMakeLists.txt
@@ -1,5 +1,6 @@
 set(LLVM_TARGET_DEFINITIONS ARM.td)
 
+tablegen(LLVM ARMGenRegisterBank.inc -gen-register-bank)
 tablegen(LLVM ARMGenRegisterInfo.inc -gen-register-info)
 tablegen(LLVM ARMGenInstrInfo.inc -gen-instr-info)
 tablegen(LLVM ARMGenMCCodeEmitter.inc -gen-emitter)
diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index 6b4b6171e43390ca661f56d1eadf71edd049c205..e812d32cc76f6f6928b574ad9b91862aa22df5d9 100644
--- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -462,65 +462,28 @@ DecodeStatus ARMDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     return checkDecodedInstruction(MI, Size, Address, OS, CS, Insn, Result);
   }
 
-  // VFP and NEON instructions, similarly, are shared between ARM
-  // and Thumb modes.
-  Result = decodeInstruction(DecoderTableVFP32, MI, Insn, Address, this, STI);
-  if (Result != MCDisassembler::Fail) {
-    Size = 4;
-    return Result;
-  }
-
-  Result = decodeInstruction(DecoderTableVFPV832, MI, Insn, Address, this, STI);
-  if (Result != MCDisassembler::Fail) {
-    Size = 4;
-    return Result;
-  }
-
-  Result =
-      decodeInstruction(DecoderTableNEONData32, MI, Insn, Address, this, STI);
-  if (Result != MCDisassembler::Fail) {
-    Size = 4;
-    // Add a fake predicate operand, because we share these instruction
-    // definitions with Thumb2 where these instructions are predicable.
-    if (!DecodePredicateOperand(MI, 0xE, Address, this))
-      return MCDisassembler::Fail;
-    return Result;
-  }
-
-  Result = decodeInstruction(DecoderTableNEONLoadStore32, MI, Insn, Address,
-                             this, STI);
-  if (Result != MCDisassembler::Fail) {
-    Size = 4;
-    // Add a fake predicate operand, because we share these instruction
-    // definitions with Thumb2 where these instructions are predicable.
-    if (!DecodePredicateOperand(MI, 0xE, Address, this))
-      return MCDisassembler::Fail;
-    return Result;
-  }
-
-  Result =
-      decodeInstruction(DecoderTableNEONDup32, MI, Insn, Address, this, STI);
-  if (Result != MCDisassembler::Fail) {
-    Size = 4;
-    // Add a fake predicate operand, because we share these instruction
-    // definitions with Thumb2 where these instructions are predicable.
-    if (!DecodePredicateOperand(MI, 0xE, Address, this))
-      return MCDisassembler::Fail;
-    return Result;
-  }
+  struct DecodeTable {
+    const uint8_t *P;
+    bool DecodePred;
+  };
 
-  Result =
-      decodeInstruction(DecoderTablev8NEON32, MI, Insn, Address, this, STI);
-  if (Result != MCDisassembler::Fail) {
-    Size = 4;
-    return Result;
-  }
+  const DecodeTable Tables[] = {
+      {DecoderTableVFP32, false},      {DecoderTableVFPV832, false},
+      {DecoderTableNEONData32, true},  {DecoderTableNEONLoadStore32, true},
+      {DecoderTableNEONDup32, true},   {DecoderTablev8NEON32, false},
+      {DecoderTablev8Crypto32, false},
+  };
 
-  Result =
-      decodeInstruction(DecoderTablev8Crypto32, MI, Insn, Address, this, STI);
-  if (Result != MCDisassembler::Fail) {
-    Size = 4;
-    return Result;
+  for (auto Table : Tables) {
+    Result = decodeInstruction(Table.P, MI, Insn, Address, this, STI);
+    if (Result != MCDisassembler::Fail) {
+      Size = 4;
+      // Add a fake predicate operand, because we share these instruction
+      // definitions with Thumb2 where these instructions are predicable.
+      if (Table.DecodePred && !DecodePredicateOperand(MI, 0xE, Address, this))
+        return MCDisassembler::Fail;
+      return Result;
+    }
   }
 
   Size = 0;
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
index 3667952d44c0c4fd42f74bbf880e1be514b40741..57b91366a0858e6021699b4d60133489c0cf039d 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.cpp
@@ -20,7 +20,15 @@
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "asm-printer"
@@ -73,7 +81,6 @@ void ARMInstPrinter::printInst(const MCInst *MI, raw_ostream &O,
   unsigned Opcode = MI->getOpcode();
 
   switch (Opcode) {
-
   // Check for MOVs and print canonical forms, instead.
   case ARM::MOVsr: {
     // FIXME: Thumb variants?
diff --git a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
index 9d80eed84dc266c05d6c2b18c939cd74c3f107b4..86873a3a6ccbbd5e21b1655d64a4164ad87c0ee3 100644
--- a/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
+++ b/lib/Target/ARM/InstPrinter/ARMInstPrinter.h
@@ -235,4 +235,4 @@ public:
 
 } // end namespace llvm
 
-#endif
+#endif // LLVM_LIB_TARGET_ARM_INSTPRINTER_ARMINSTPRINTER_H
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 469c47d236dab08a604740fbe7287cc7afeb0319..40bf545e83224ab32e3218b1bfef8abc343b682b 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -357,13 +357,13 @@ static uint32_t joinHalfWords(uint32_t FirstHalf, uint32_t SecondHalf,
 }
 
 unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
-                                         bool IsPCRel, MCContext *Ctx,
+                                         bool IsPCRel, MCContext &Ctx,
                                          bool IsLittleEndian,
                                          bool IsResolved) const {
   unsigned Kind = Fixup.getKind();
   switch (Kind) {
   default:
-    if (Ctx) Ctx->reportError(Fixup.getLoc(), "bad relocation fixup type");
+    Ctx.reportError(Fixup.getLoc(), "bad relocation fixup type");
     return 0;
   case FK_Data_1:
   case FK_Data_2:
@@ -413,8 +413,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
       Value = -Value;
       isAdd = false;
     }
-    if (Ctx && Value >= 4096) {
-      Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+    if (Value >= 4096) {
+      Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
       return 0;
     }
     Value |= isAdd << 23;
@@ -434,8 +434,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
       Value = -Value;
       opc = 2; // 0b0010
     }
-    if (Ctx && ARM_AM::getSOImmVal(Value) == -1) {
-      Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+    if (ARM_AM::getSOImmVal(Value) == -1) {
+      Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
       return 0;
     }
     // Encode the immediate and shift the opcode into place.
@@ -542,8 +542,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     //
     // Note that the halfwords are stored high first, low second; so we need
     // to transpose the fixup value here to map properly.
-    if (Ctx && Value % 4 != 0) {
-      Ctx->reportError(Fixup.getLoc(), "misaligned ARM call destination");
+    if (Value % 4 != 0) {
+      Ctx.reportError(Fixup.getLoc(), "misaligned ARM call destination");
       return 0;
     }
 
@@ -569,10 +569,10 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
   case ARM::fixup_arm_thumb_cp:
     // On CPUs supporting Thumb2, this will be relaxed to an ldr.w, otherwise we
     // could have an error on our hands.
-    if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2] && IsResolved) {
+    if (!STI->getFeatureBits()[ARM::FeatureThumb2] && IsResolved) {
       const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
       if (FixupDiagnostic) {
-        Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
+        Ctx.reportError(Fixup.getLoc(), FixupDiagnostic);
         return 0;
       }
     }
@@ -582,8 +582,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     // CB instructions can only branch to offsets in [4, 126] in multiples of 2
     // so ensure that the raw value LSB is zero and it lies in [2, 130].
     // An offset of 2 will be relaxed to a NOP.
-    if (Ctx && ((int64_t)Value < 2 || Value > 0x82 || Value & 1)) {
-      Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+    if ((int64_t)Value < 2 || Value > 0x82 || Value & 1) {
+      Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
       return 0;
     }
     // Offset by 4 and don't encode the lower bit, which is always 0.
@@ -593,21 +593,21 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
   }
   case ARM::fixup_arm_thumb_br:
     // Offset by 4 and don't encode the lower bit, which is always 0.
-    if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2] &&
-               !STI->getFeatureBits()[ARM::HasV8MBaselineOps]) {
+    if (!STI->getFeatureBits()[ARM::FeatureThumb2] &&
+        !STI->getFeatureBits()[ARM::HasV8MBaselineOps]) {
       const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
       if (FixupDiagnostic) {
-        Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
+        Ctx.reportError(Fixup.getLoc(), FixupDiagnostic);
         return 0;
       }
     }
     return ((Value - 4) >> 1) & 0x7ff;
   case ARM::fixup_arm_thumb_bcc:
     // Offset by 4 and don't encode the lower bit, which is always 0.
-    if (Ctx && !STI->getFeatureBits()[ARM::FeatureThumb2]) {
+    if (!STI->getFeatureBits()[ARM::FeatureThumb2]) {
       const char *FixupDiagnostic = reasonForFixupRelaxation(Fixup, Value);
       if (FixupDiagnostic) {
-        Ctx->reportError(Fixup.getLoc(), FixupDiagnostic);
+        Ctx.reportError(Fixup.getLoc(), FixupDiagnostic);
         return 0;
       }
     }
@@ -621,8 +621,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
       isAdd = false;
     }
     // The value has the low 4 bits encoded in [3:0] and the high 4 in [11:8].
-    if (Ctx && Value >= 256) {
-      Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+    if (Value >= 256) {
+      Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
       return 0;
     }
     Value = (Value & 0xf) | ((Value & 0xf0) << 4);
@@ -642,8 +642,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     }
     // These values don't encode the low two bits since they're always zero.
     Value >>= 2;
-    if (Ctx && Value >= 256) {
-      Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+    if (Value >= 256) {
+      Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
       return 0;
     }
     Value |= isAdd << 23;
@@ -668,13 +668,13 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
       isAdd = false;
     }
     // These values don't encode the low bit since it's always zero.
-    if (Ctx && (Value & 1)) {
-      Ctx->reportError(Fixup.getLoc(), "invalid value for this fixup");
+    if (Value & 1) {
+      Ctx.reportError(Fixup.getLoc(), "invalid value for this fixup");
       return 0;
     }
     Value >>= 1;
-    if (Ctx && Value >= 256) {
-      Ctx->reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
+    if (Value >= 256) {
+      Ctx.reportError(Fixup.getLoc(), "out of range pc-relative fixup value");
       return 0;
     }
     Value |= isAdd << 23;
@@ -688,8 +688,8 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
   }
   case ARM::fixup_arm_mod_imm:
     Value = ARM_AM::getSOImmVal(Value);
-    if (Ctx && Value >> 12) {
-      Ctx->reportError(Fixup.getLoc(), "out of range immediate fixup value");
+    if (Value >> 12) {
+      Ctx.reportError(Fixup.getLoc(), "out of range immediate fixup value");
       return 0;
     }
     return Value;
@@ -738,12 +738,6 @@ void ARMAsmBackend::processFixupValue(const MCAssembler &Asm,
             (unsigned)Fixup.getKind() == ARM::fixup_arm_uncondbl ||
             (unsigned)Fixup.getKind() == ARM::fixup_arm_condbl))
     IsResolved = false;
-
-  // Try to get the encoded value for the fixup as-if we're mapping it into
-  // the instruction. This allows adjustFixupValue() to issue a diagnostic
-  // if the value aren't invalid.
-  (void)adjustFixupValue(Fixup, Value, false, &Asm.getContext(),
-                         IsLittleEndian, IsResolved);
 }
 
 /// getFixupKindNumBytes - The number of bytes the fixup may change.
@@ -847,11 +841,10 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) {
 }
 
 void ARMAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
-                               unsigned DataSize, uint64_t Value,
-                               bool IsPCRel) const {
+                               unsigned DataSize, uint64_t Value, bool IsPCRel,
+                               MCContext &Ctx) const {
   unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
-  Value =
-      adjustFixupValue(Fixup, Value, IsPCRel, nullptr, IsLittleEndian, true);
+  Value = adjustFixupValue(Fixup, Value, IsPCRel, Ctx, IsLittleEndian, true);
   if (!Value)
     return; // Doesn't change encoding.
 
diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
index 84caaacc47d3a096c37236fc023bdb28f03e76f2..2ddedb5d61059af4992e7f8f6cdf7dc930e2c79b 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -46,11 +46,11 @@ public:
                          bool &IsResolved) override;
 
   unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value, bool IsPCRel,
-                            MCContext *Ctx, bool IsLittleEndian,
+                            MCContext &Ctx, bool IsLittleEndian,
                             bool IsResolved) const;
 
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value, bool IsPCRel) const override;
+                  uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
 
   unsigned getRelaxedOpcode(unsigned Op) const;
 
diff --git a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index 088b4205ed624582dd3e52d19b8b7987e4f36c30..92e553f21f14386ac1db621c21163197665f6c70 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -291,7 +291,11 @@ namespace ARMII {
 
     /// MO_OPTION_MASK - Most flags are mutually exclusive; this mask selects
     /// just that part of the flag set.
-    MO_OPTION_MASK = 0x1f,
+    MO_OPTION_MASK = 0x0f,
+
+    /// MO_SBREL - On a symbol operand, this represents a static base relative
+    /// relocation. Used in movw and movt instructions.
+    MO_SBREL = 0x10,
 
     /// MO_DLLIMPORT - On a symbol operand, this represents that the reference
     /// to the symbol is for an import stub.  This is used for DLL import
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 2c947541322052de3e302e1a3f17bc66b477deaf..e1fa2457182024b4c50d2b38856c72dedd6e05e3 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -9,6 +9,7 @@
 
 #include "MCTargetDesc/ARMFixupKinds.h"
 #include "MCTargetDesc/ARMMCTargetDesc.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCFixup.h"
@@ -25,9 +26,8 @@ namespace {
   class ARMELFObjectWriter : public MCELFObjectTargetWriter {
     enum { DefaultEABIVersion = 0x05000000U };
 
-    unsigned GetRelocTypeInner(const MCValue &Target,
-                               const MCFixup &Fixup,
-                               bool IsPCRel) const;
+    unsigned GetRelocTypeInner(const MCValue &Target, const MCFixup &Fixup,
+                               bool IsPCRel, MCContext &Ctx) const;
 
   public:
     ARMELFObjectWriter(uint8_t OSABI);
@@ -69,19 +69,20 @@ bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
 unsigned ARMELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
                                           const MCFixup &Fixup,
                                           bool IsPCRel) const {
-  return GetRelocTypeInner(Target, Fixup, IsPCRel);
+  return GetRelocTypeInner(Target, Fixup, IsPCRel, Ctx);
 }
 
 unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
                                                const MCFixup &Fixup,
-                                               bool IsPCRel) const  {
+                                               bool IsPCRel,
+                                               MCContext &Ctx) const {
   MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
 
   unsigned Type = 0;
   if (IsPCRel) {
     switch ((unsigned)Fixup.getKind()) {
     default:
-      report_fatal_error("unsupported relocation on symbol");
+      Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol");
       return ELF::R_ARM_NONE;
     case FK_Data_4:
       switch (Modifier) {
@@ -160,7 +161,7 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
   } else {
     switch ((unsigned)Fixup.getKind()) {
     default:
-      report_fatal_error("unsupported relocation on symbol");
+      Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol");
       return ELF::R_ARM_NONE;
     case FK_Data_1:
       switch (Modifier) {
@@ -269,10 +270,26 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
       }
       break;
     case ARM::fixup_t2_movt_hi16:
-      Type = ELF::R_ARM_THM_MOVT_ABS;
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_ARM_THM_MOVT_ABS;
+        break;
+      case MCSymbolRefExpr::VK_ARM_SBREL:
+        Type = ELF:: R_ARM_THM_MOVT_BREL;
+        break;
+      }
       break;
     case ARM::fixup_t2_movw_lo16:
-      Type = ELF::R_ARM_THM_MOVW_ABS_NC;
+      switch (Modifier) {
+      default: llvm_unreachable("Unsupported Modifier");
+      case MCSymbolRefExpr::VK_None:
+        Type = ELF::R_ARM_THM_MOVW_ABS_NC;
+        break;
+      case MCSymbolRefExpr::VK_ARM_SBREL:
+        Type = ELF:: R_ARM_THM_MOVW_BREL_NC;
+        break;
+      }
       break;
     }
   }
diff --git a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index 7e56ba39eef0c49b0ce97a38530a4cfb367a9d9c..774a0b3771b14a6266016e912851514710369bfb 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -464,13 +464,14 @@ public:
   void emitUnwindRaw(int64_t Offset, const SmallVectorImpl<uint8_t> &Opcodes);
 
   void ChangeSection(MCSection *Section, const MCExpr *Subsection) override {
-    // We have to keep track of the mapping symbol state of any sections we
-    // use. Each one should start off as EMS_None, which is provided as the
-    // default constructor by DenseMap::lookup.
-    LastMappingSymbols[getPreviousSection().first] = LastEMS;
-    LastEMS = LastMappingSymbols.lookup(Section);
-
+    LastMappingSymbols[getPreviousSection().first] = std::move(LastEMSInfo);
     MCELFStreamer::ChangeSection(Section, Subsection);
+    auto LastMappingSymbol = LastMappingSymbols.find(Section);
+    if (LastMappingSymbol != LastMappingSymbols.end()) {
+      LastEMSInfo = std::move(LastMappingSymbol->second);
+      return;
+    }
+    LastEMSInfo.reset(new ElfMappingSymbolInfo(SMLoc(), nullptr, 0));
   }
 
   /// This function is the one used to emit instruction data into the ELF
@@ -532,15 +533,25 @@ public:
     MCELFStreamer::EmitBytes(Data);
   }
 
+  void FlushPendingMappingSymbol() {
+    if (!LastEMSInfo->hasInfo())
+      return;
+    ElfMappingSymbolInfo *EMS = LastEMSInfo.get();
+    EmitMappingSymbol("$d", EMS->Loc, EMS->F, EMS->Offset);
+    EMS->resetInfo();
+  }
+
   /// This is one of the functions used to emit data into an ELF section, so the
   /// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
   /// necessary.
   void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
-    if (const MCSymbolRefExpr *SRE = dyn_cast_or_null<MCSymbolRefExpr>(Value))
+    if (const MCSymbolRefExpr *SRE = dyn_cast_or_null<MCSymbolRefExpr>(Value)) {
       if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_SBREL && !(Size == 4)) {
         getContext().reportError(Loc, "relocated expression must be 32-bit");
         return;
       }
+      getOrCreateDataFragment();
+    }
 
     EmitDataMappingSymbol();
     MCELFStreamer::EmitValueImpl(Value, Size, Loc);
@@ -573,22 +584,54 @@ private:
     EMS_Data
   };
 
+  struct ElfMappingSymbolInfo {
+    explicit ElfMappingSymbolInfo(SMLoc Loc, MCFragment *F, uint64_t O)
+        : Loc(Loc), F(F), Offset(O), State(EMS_None) {}
+    void resetInfo() {
+      F = nullptr;
+      Offset = 0;
+    }
+    bool hasInfo() { return F != nullptr; }
+    SMLoc Loc;
+    MCFragment *F;
+    uint64_t Offset;
+    ElfMappingSymbol State;
+  };
+
   void EmitDataMappingSymbol() {
-    if (LastEMS == EMS_Data) return;
+    if (LastEMSInfo->State == EMS_Data)
+      return;
+    else if (LastEMSInfo->State == EMS_None) {
+      // This is a tentative symbol; it won't really be emitted until it's
+      // actually needed.
+      ElfMappingSymbolInfo *EMS = LastEMSInfo.get();
+      auto *DF = dyn_cast_or_null<MCDataFragment>(getCurrentFragment());
+      if (!DF)
+        return;
+      EMS->Loc = SMLoc();
+      EMS->F = getCurrentFragment();
+      EMS->Offset = DF->getContents().size();
+      LastEMSInfo->State = EMS_Data;
+      return;
+    }
     EmitMappingSymbol("$d");
-    LastEMS = EMS_Data;
+    LastEMSInfo->State = EMS_Data;
   }
 
   void EmitThumbMappingSymbol() {
-    if (LastEMS == EMS_Thumb) return;
+    if (LastEMSInfo->State == EMS_Thumb)
+      return;
+    FlushPendingMappingSymbol();
     EmitMappingSymbol("$t");
-    LastEMS = EMS_Thumb;
+    LastEMSInfo->State = EMS_Thumb;
   }
 
   void EmitARMMappingSymbol() {
-    if (LastEMS == EMS_ARM) return;
+    if (LastEMSInfo->State == EMS_ARM)
+      return;
+    FlushPendingMappingSymbol();
     EmitMappingSymbol("$a");
-    LastEMS = EMS_ARM;
+    LastEMSInfo->State = EMS_ARM;
   }
 
   void EmitMappingSymbol(StringRef Name) {
@@ -601,6 +644,17 @@ private:
     Symbol->setExternal(false);
   }
 
+  void EmitMappingSymbol(StringRef Name, SMLoc Loc, MCFragment *F,
+                         uint64_t Offset) {
+    auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
+        Name + "." + Twine(MappingSymbolCounter++)));
+    EmitLabel(Symbol, Loc, F);
+    Symbol->setType(ELF::STT_NOTYPE);
+    Symbol->setBinding(ELF::STB_LOCAL);
+    Symbol->setExternal(false);
+    Symbol->setOffset(Offset);
+  }
+
   void EmitThumbFunc(MCSymbol *Func) override {
     getAssembler().setIsThumbFunc(Func);
     EmitSymbolAttribute(Func, MCSA_ELF_TypeFunction);
@@ -626,8 +680,10 @@ private:
   bool IsThumb;
   int64_t MappingSymbolCounter = 0;
 
-  DenseMap<const MCSection *, ElfMappingSymbol> LastMappingSymbols;
-  ElfMappingSymbol LastEMS = EMS_None;
+  DenseMap<const MCSection *, std::unique_ptr<ElfMappingSymbolInfo>>
+      LastMappingSymbols;
+
+  std::unique_ptr<ElfMappingSymbolInfo> LastEMSInfo;
 
   // ARM Exception Handling Frame Information
   MCSymbol *ExTab;
@@ -1138,9 +1194,9 @@ inline void ARMELFStreamer::SwitchToEHSection(StringRef Prefix,
   const MCSymbolELF *Group = FnSection.getGroup();
   if (Group)
     Flags |= ELF::SHF_GROUP;
-  MCSectionELF *EHSection =
-      getContext().getELFSection(EHSecName, Type, Flags, 0, Group,
-                                 FnSection.getUniqueID(), nullptr, &FnSection);
+  MCSectionELF *EHSection = getContext().getELFSection(
+      EHSecName, Type, Flags, 0, Group, FnSection.getUniqueID(),
+      static_cast<const MCSymbolELF *>(&Fn));
 
   assert(EHSection && "Failed to get the required EH section");
 
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 9e4d202321e6c2f28168b706d37a8ce91072662f..477755157040da50190aef773312706cd41338bc 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -260,18 +260,37 @@ public:
       return false;
 
     int64_t Imm = Inst.getOperand(0).getImm();
-    // FIXME: This is not right for thumb.
     Target = Addr+Imm+8; // In ARM mode the PC is always off by 8 bytes.
     return true;
   }
 };
 
+class ThumbMCInstrAnalysis : public ARMMCInstrAnalysis {
+public:
+  ThumbMCInstrAnalysis(const MCInstrInfo *Info) : ARMMCInstrAnalysis(Info) {}
+
+  bool evaluateBranch(const MCInst &Inst, uint64_t Addr,
+                      uint64_t Size, uint64_t &Target) const override {
+    // We only handle PCRel branches for now.
+    if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType!=MCOI::OPERAND_PCREL)
+      return false;
+
+    int64_t Imm = Inst.getOperand(0).getImm();
+    Target = Addr+Imm+4; // In Thumb mode the PC is always off by 4 bytes.
+    return true;
+  }
+};
+
 }
 
 static MCInstrAnalysis *createARMMCInstrAnalysis(const MCInstrInfo *Info) {
   return new ARMMCInstrAnalysis(Info);
 }
 
+static MCInstrAnalysis *createThumbMCInstrAnalysis(const MCInstrInfo *Info) {
+  return new ThumbMCInstrAnalysis(Info);
+}
+
 // Force static initialization.
 extern "C" void LLVMInitializeARMTargetMC() {
   for (Target *T : {&getTheARMLETarget(), &getTheARMBETarget(),
@@ -289,9 +308,6 @@ extern "C" void LLVMInitializeARMTargetMC() {
     TargetRegistry::RegisterMCSubtargetInfo(*T,
                                             ARM_MC::createARMMCSubtargetInfo);
 
-    // Register the MC instruction analyzer.
-    TargetRegistry::RegisterMCInstrAnalysis(*T, createARMMCInstrAnalysis);
-
     TargetRegistry::RegisterELFStreamer(*T, createELFStreamer);
     TargetRegistry::RegisterCOFFStreamer(*T, createARMWinCOFFStreamer);
     TargetRegistry::RegisterMachOStreamer(*T, createARMMachOStreamer);
@@ -313,6 +329,12 @@ extern "C" void LLVMInitializeARMTargetMC() {
     TargetRegistry::RegisterMCRelocationInfo(*T, createARMMCRelocationInfo);
   }
 
+  // Register the MC instruction analyzers (ARM and Thumb variants).
+  for (Target *T : {&getTheARMLETarget(), &getTheARMBETarget()})
+    TargetRegistry::RegisterMCInstrAnalysis(*T, createARMMCInstrAnalysis);
+  for (Target *T : {&getTheThumbLETarget(), &getTheThumbBETarget()})
+    TargetRegistry::RegisterMCInstrAnalysis(*T, createThumbMCInstrAnalysis);
+
   // Register the MC Code Emitter
   for (Target *T : {&getTheARMLETarget(), &getTheThumbLETarget()})
     TargetRegistry::RegisterMCCodeEmitter(*T, createARMLEMCCodeEmitter);
diff --git a/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
index 482bcf902518687aa18ed5e8fefdbd470e74fbe9..34c770440e1ba5799b71a2a70c1dc54c5fc72b77 100644
--- a/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
+++ b/lib/Target/ARM/MCTargetDesc/ARMMachORelocationInfo.cpp
@@ -1,4 +1,4 @@
-//===-- ARMMachORelocationInfo.cpp ----------------------------------------===//
+//===- ARMMachORelocationInfo.cpp -----------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,17 +7,17 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MCTargetDesc/ARMMCTargetDesc.h"
 #include "ARMMCExpr.h"
-#include "llvm-c/Disassembler.h"
+#include "MCTargetDesc/ARMMCTargetDesc.h"
 #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCDisassembler/MCRelocationInfo.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm-c/Disassembler.h"
 
 using namespace llvm;
-using namespace object;
 
 namespace {
+
 class ARMMachORelocationInfo : public MCRelocationInfo {
 public:
   ARMMachORelocationInfo(MCContext &Ctx) : MCRelocationInfo(Ctx) {}
@@ -35,7 +35,8 @@ public:
     }
   }
 };
-} // End unnamed namespace
+
+} // end anonymous namespace
 
 /// createARMMachORelocationInfo - Construct an ARM Mach-O RelocationInfo.
 MCRelocationInfo *llvm::createARMMachORelocationInfo(MCContext &Ctx) {
diff --git a/lib/Target/ARM/Thumb1FrameLowering.cpp b/lib/Target/ARM/Thumb1FrameLowering.cpp
index a2e60299c883aaba2750ab5a377acca896e2ee9d..fc083b98395b00ae9ea22ff127921c490b53abbf 100644
--- a/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -888,6 +888,16 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
         // ARMv4T requires BX, see emitEpilogue
         if (!STI.hasV5TOps())
           continue;
+        // Tail call optimization failed; change TCRETURN to a tBL/tBLXr
+        if (MI->getOpcode() == ARM::TCRETURNdi ||
+            MI->getOpcode() == ARM::TCRETURNri) {
+          unsigned Opcode = MI->getOpcode() == ARM::TCRETURNdi
+                            ? ARM::tBL : ARM::tBLXr;
+          MachineInstrBuilder BL = BuildMI(MF, DL, TII.get(Opcode));
+          BL.add(predOps(ARMCC::AL));
+          BL.add(MI->getOperand(0));
+          MBB.insert(MI, &*BL);
+        }
         Reg = ARM::PC;
         (*MIB).setDesc(TII.get(ARM::tPOP_RET));
         if (MI != MBB.end())
diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp
index f1d6fa68d3a251d60e982fdd59cec8cc5b03fd2a..27bff4d75acf4c2724a1e03509c12e9271526102 100644
--- a/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -54,11 +54,17 @@ void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
         .addReg(SrcReg, getKillRegState(KillSrc))
         .add(predOps(ARMCC::AL));
   else {
-    // FIXME: The performance consequences of this are going to be atrocious.
-    // Some things to try that should be better:
-    //   * 'mov hi, $src; mov $dst, hi', with hi as either r10 or r11
-    //   * 'movs $dst, $src' if cpsr isn't live
-    // See: http://lists.llvm.org/pipermail/llvm-dev/2014-August/075998.html
+    // FIXME: Can also use 'mov hi, $src; mov $dst, hi',
+    // with hi as either r10 or r11.
+
+    const TargetRegisterInfo *RegInfo = st.getRegisterInfo();
+    if (MBB.computeRegisterLiveness(RegInfo, ARM::CPSR, I)
+        == MachineBasicBlock::LQR_Dead) {
+      BuildMI(MBB, I, DL, get(ARM::tMOVSr), DestReg)
+          .addReg(SrcReg, getKillRegState(KillSrc))
+          ->addRegisterDead(ARM::CPSR, RegInfo);
+      return;
+    }
 
     // 'MOV lo, lo' is unpredictable on < v6, so use the stack to do it
     BuildMI(MBB, I, DL, get(ARM::tPUSH))
diff --git a/lib/Target/AVR/AVRAsmPrinter.cpp b/lib/Target/AVR/AVRAsmPrinter.cpp
index 4afdd3a0ec089f8fde95fe5418775d11a5abeb93..50bb50b44f27b80dad552d22bdd88852c6e32db7 100644
--- a/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -130,7 +130,8 @@ bool AVRAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNum,
     }
   }
 
-  printOperand(MI, OpNum, O);
+  if (Error)
+    printOperand(MI, OpNum, O);
 
   return false;
 }
diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp
index 07fc3f6890b8dc097d6aa57903ffac092eab4794..0b95d3819399c11b660d841d13e686ef4fe1448f 100644
--- a/lib/Target/AVR/AVRISelLowering.cpp
+++ b/lib/Target/AVR/AVRISelLowering.cpp
@@ -48,6 +48,8 @@ AVRTargetLowering::AVRTargetLowering(AVRTargetMachine &tm)
   setOperationAction(ISD::GlobalAddress, MVT::i16, Custom);
   setOperationAction(ISD::BlockAddress, MVT::i16, Custom);
 
+  setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
+  setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i8, Expand);
   setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i16, Expand);
 
@@ -311,7 +313,7 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
   unsigned Opcode = Op->getOpcode();
   assert((Opcode == ISD::SDIVREM || Opcode == ISD::UDIVREM) &&
          "Invalid opcode for Div/Rem lowering");
-  bool isSigned = (Opcode == ISD::SDIVREM);
+  bool IsSigned = (Opcode == ISD::SDIVREM);
   EVT VT = Op->getValueType(0);
   Type *Ty = VT.getTypeForEVT(*DAG.getContext());
 
@@ -320,16 +322,16 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
   default:
     llvm_unreachable("Unexpected request for libcall!");
   case MVT::i8:
-    LC = isSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8;
+    LC = IsSigned ? RTLIB::SDIVREM_I8 : RTLIB::UDIVREM_I8;
     break;
   case MVT::i16:
-    LC = isSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16;
+    LC = IsSigned ? RTLIB::SDIVREM_I16 : RTLIB::UDIVREM_I16;
     break;
   case MVT::i32:
-    LC = isSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32;
+    LC = IsSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32;
     break;
   case MVT::i64:
-    LC = isSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64;
+    LC = IsSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64;
     break;
   }
 
@@ -340,8 +342,8 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
   for (SDValue const &Value : Op->op_values()) {
     Entry.Node = Value;
     Entry.Ty = Value.getValueType().getTypeForEVT(*DAG.getContext());
-    Entry.isSExt = isSigned;
-    Entry.isZExt = !isSigned;
+    Entry.IsSExt = IsSigned;
+    Entry.IsZExt = !IsSigned;
     Args.push_back(Entry);
   }
 
@@ -354,10 +356,10 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl)
       .setChain(InChain)
-      .setCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
+      .setLibCallee(getLibcallCallingConv(LC), RetTy, Callee, std::move(Args))
       .setInRegister()
-      .setSExtResult(isSigned)
-      .setZExtResult(!isSigned);
+      .setSExtResult(IsSigned)
+      .setZExtResult(!IsSigned);
 
   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
   return CallInfo.first;
@@ -932,6 +934,12 @@ static void analyzeStandardArguments(TargetLowering::CallLoweringInfo *CLI,
   bool UsesStack = false;
   for (unsigned i = 0, pos = 0, e = Args.size(); i != e; ++i) {
     unsigned Size = Args[i];
+
+    // If we have a zero-sized argument, don't attempt to lower it.
+    // AVR-GCC does not support zero-sized arguments and so we need not
+    // worry about ABI compatibility.
+    if (Size == 0) continue;
+
     MVT LocVT = (IsCall) ? (*Outs)[pos].VT : (*Ins)[pos].VT;
 
     // If we have plenty of regs to pass the whole argument do it.
@@ -1373,7 +1381,7 @@ AVRTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
   // Don't emit the ret/reti instruction when the naked attribute is present in
   // the function being compiled.
   if (MF.getFunction()->getAttributes().hasAttribute(
-          AttributeSet::FunctionIndex, Attribute::Naked)) {
+          AttributeList::FunctionIndex, Attribute::Naked)) {
     return Chain;
   }
 
@@ -1975,4 +1983,3 @@ unsigned AVRTargetLowering::getRegisterByName(const char *RegName,
 }
 
 } // end of namespace llvm
-
diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td
index bc66379ab70841039aed78379c95392619e221be..693d80a1c06fbf95d915d46b903e08f65f9027be 100644
--- a/lib/Target/AVR/AVRInstrInfo.td
+++ b/lib/Target/AVR/AVRInstrInfo.td
@@ -694,7 +694,7 @@ Defs = [SREG] in
 }
 
 //===----------------------------------------------------------------------===//
-// One's/Two's Compliment
+// One's/Two's Complement
 //===----------------------------------------------------------------------===//
 let Constraints = "$src = $rd",
 Defs = [SREG] in
@@ -1718,7 +1718,7 @@ Defs = [SREG] in
                      (implicit SREG)]>;
 
   // CBR Rd, K
-  // Alias for `ANDI Rd, COM(K)` where COM(K) is the compliment of K.
+  // Alias for `ANDI Rd, COM(K)` where COM(K) is the complement of K.
   // FIXME: This uses the 'complement' encoder. We need it to also use the
   // imm_ldi8 encoder. This will cause no fixups to be created on this instruction.
   def CBRRdK : FRdK<0b0111,
diff --git a/lib/Target/AVR/AVRInstrumentFunctions.cpp b/lib/Target/AVR/AVRInstrumentFunctions.cpp
index 5553dc2da31b50964cecf502fe8ae6551a25d3fb..e7fca74e170190d65ede8ff28456d3309871519f 100644
--- a/lib/Target/AVR/AVRInstrumentFunctions.cpp
+++ b/lib/Target/AVR/AVRInstrumentFunctions.cpp
@@ -96,7 +96,7 @@ static void BuildSignatureCall(StringRef SymName, BasicBlock &BB, Function &F) {
   Value *FunctionName = CreateStringPtr(BB, F.getName());
 
   Value *Args[] = {FunctionName,
-                   ConstantInt::get(I16, F.getArgumentList().size())};
+                   ConstantInt::get(I16, F.arg_size())};
   CallInst::Create(Fn, Args, "", &BB);
 }
 
diff --git a/lib/Target/AVR/AVRMCInstLower.cpp b/lib/Target/AVR/AVRMCInstLower.cpp
index 342fe558813a866abffcaecb937b4580b99660ce..475dda420e8927ca794dd75a7cd2c1297214e7f1 100644
--- a/lib/Target/AVR/AVRMCInstLower.cpp
+++ b/lib/Target/AVR/AVRMCInstLower.cpp
@@ -56,7 +56,7 @@ void AVRMCInstLower::lowerInstruction(const MachineInstr &MI, MCInst &OutMI) con
 
     switch (MO.getType()) {
     default:
-      MI.dump();
+      MI.print(errs());
       llvm_unreachable("unknown operand type");
     case MachineOperand::MO_Register:
       // Ignore all implicit register operands.
diff --git a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
index 081d8b5740efc07302436d0f1cb0f7457c9c5c60..5c3b45ac2328504d87bd2fed6162c94353469987 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -335,7 +335,7 @@ MCObjectWriter *AVRAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
 
 void AVRAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
                                unsigned DataSize, uint64_t Value,
-                               bool IsPCRel) const {
+                               bool IsPCRel, MCContext &Ctx) const {
   if (Value == 0)
     return; // Doesn't change encoding.
 
diff --git a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
index 7ff4b8f350f6137198e2170dfbd254f8f7589fd2..f2be2494684a27d6c715a24b7dddb0945bf93ee9 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
+++ b/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
@@ -41,7 +41,7 @@ public:
   MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
 
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value, bool IsPCRel) const override;
+                  uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
 
   const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
 
diff --git a/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp b/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
index 481de320b22fef1c0a47dfc20ad53338cdbf92f6..7137548210058ff00fb6273406588e604671b33e 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRELFStreamer.cpp
@@ -1,5 +1,7 @@
 #include "AVRELFStreamer.h"
 
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/SubtargetFeature.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/FormattedStream.h"
 
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
index cca3bcc4968ab0cf593793bad371e90483a2f373..9f2ee8cf80356599eb9031c4d3bfa673c5ee5fcf 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
@@ -23,6 +23,7 @@ AVRMCAsmInfo::AVRMCAsmInfo(const Triple &TT) {
   CommentString = ";";
   PrivateGlobalPrefix = ".L";
   UsesELFSectionDirectiveForBSS = true;
+  UseIntegratedAssembler = true;
 }
 
 } // end of namespace llvm
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp b/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
index e6dc8868c705a6921abb6e9af1a8fa0cc0e79f1f..c3d43ebb407ecde1817ad874e58af77a85aff009 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.cpp
@@ -25,6 +25,7 @@
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/raw_ostream.h"
 
 #define DEBUG_TYPE "mccodeemitter"
diff --git a/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h b/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
index 5fa425c296a5eb40b1b1913dda1110792879462c..4cee8d904c9d16bcc15d37053a5f105910926c80 100644
--- a/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
+++ b/lib/Target/AVR/MCTargetDesc/AVRMCCodeEmitter.h
@@ -63,7 +63,7 @@ private:
                        SmallVectorImpl<MCFixup> &Fixups,
                        const MCSubtargetInfo &STI) const;
 
-  /// Takes the compliment of a number (~0 - val).
+  /// Takes the complement of a number (~0 - val).
   unsigned encodeComplement(const MCInst &MI, unsigned OpNo,
                             SmallVectorImpl<MCFixup> &Fixups,
                             const MCSubtargetInfo &STI) const;
diff --git a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index afc321ea2c34e4a9374f34832ae6811a3128f3ef..1f355171ebd3f5b0207b4433f59f5e23d5c66813 100644
--- a/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -28,7 +28,7 @@ public:
   ~BPFAsmBackend() override = default;
 
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value, bool IsPCRel) const override;
+                  uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
 
   MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
 
@@ -62,8 +62,8 @@ bool BPFAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
 }
 
 void BPFAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
-                               unsigned DataSize, uint64_t Value,
-                               bool IsPCRel) const {
+                               unsigned DataSize, uint64_t Value, bool IsPCRel,
+                               MCContext &Ctx) const {
   if (Fixup.getKind() == FK_SecRel_4 || Fixup.getKind() == FK_SecRel_8) {
     assert(Value == 0);
   } else if (Fixup.getKind() == FK_Data_4 || Fixup.getKind() == FK_Data_8) {
diff --git a/lib/Target/CMakeLists.txt b/lib/Target/CMakeLists.txt
index 044db10fb3fa0ab4844837bb128d661f693a679d..1e6abfacb79217465db19499a127268273f5d865 100644
--- a/lib/Target/CMakeLists.txt
+++ b/lib/Target/CMakeLists.txt
@@ -17,3 +17,9 @@ foreach(t ${LLVM_TARGETS_TO_BUILD})
   message(STATUS "Targeting ${t}")
   add_subdirectory(${t})
 endforeach()
+
+# Currently, libraries under lib/ are not allowed to reference targets directly.
+# This property is used to enforce that convention. The convention matters
+# because the logic in llvm_map_components_to_libnames depends on the order in
+# which the target libraries are created.
+set_property(GLOBAL PROPERTY LLVM_TARGETS_CONFIGURED On)
diff --git a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index becc086c81b0a9638cf733e0ffdfc14deca9ce2f..4bbc36a86e5b76950baca77e4d77ac5779ca45cf 100644
--- a/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -63,21 +63,25 @@ using namespace llvm;
 static cl::opt<bool> EnableFutureRegs("mfuture-regs",
                                       cl::desc("Enable future registers"));
 
-static cl::opt<bool> WarnMissingParenthesis("mwarn-missing-parenthesis",
-cl::desc("Warn for missing parenthesis around predicate registers"),
-cl::init(true));
-static cl::opt<bool> ErrorMissingParenthesis("merror-missing-parenthesis",
-cl::desc("Error for missing parenthesis around predicate registers"),
-cl::init(false));
-static cl::opt<bool> WarnSignedMismatch("mwarn-sign-mismatch",
-cl::desc("Warn for mismatching a signed and unsigned value"),
-cl::init(true));
-static cl::opt<bool> WarnNoncontigiousRegister("mwarn-noncontigious-register",
-cl::desc("Warn for register names that arent contigious"),
-cl::init(true));
-static cl::opt<bool> ErrorNoncontigiousRegister("merror-noncontigious-register",
-cl::desc("Error for register names that aren't contigious"),
-cl::init(false));
+static cl::opt<bool> WarnMissingParenthesis(
+    "mwarn-missing-parenthesis",
+    cl::desc("Warn for missing parenthesis around predicate registers"),
+    cl::init(true));
+static cl::opt<bool> ErrorMissingParenthesis(
+    "merror-missing-parenthesis",
+    cl::desc("Error for missing parenthesis around predicate registers"),
+    cl::init(false));
+static cl::opt<bool> WarnSignedMismatch(
+    "mwarn-sign-mismatch",
+    cl::desc("Warn for mismatching a signed and unsigned value"),
+    cl::init(true));
+static cl::opt<bool> WarnNoncontigiousRegister(
+    "mwarn-noncontigious-register",
+    cl::desc("Warn for register names that arent contigious"), cl::init(true));
+static cl::opt<bool> ErrorNoncontigiousRegister(
+    "merror-noncontigious-register",
+    cl::desc("Error for register names that aren't contigious"),
+    cl::init(false));
 
 namespace {
 
@@ -123,9 +127,11 @@ class HexagonAsmParser : public MCTargetAsmParser {
 
   bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
                                OperandVector &Operands, MCStreamer &Out,
-                               uint64_t &ErrorInfo, bool MatchingInlineAsm) override;
+                               uint64_t &ErrorInfo,
+                               bool MatchingInlineAsm) override;
 
-  unsigned validateTargetOperandClass(MCParsedAsmOperand &Op, unsigned Kind) override;
+  unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+                                      unsigned Kind) override;
   bool OutOfRange(SMLoc IDLoc, long long Val, long long Max);
   int processInstruction(MCInst &Inst, OperandVector const &Operands,
                          SMLoc IDLoc);
@@ -168,11 +174,10 @@ public:
   bool parseInstruction(OperandVector &Operands);
   bool implicitExpressionLocation(OperandVector &Operands);
   bool parseExpressionOrOperand(OperandVector &Operands);
-  bool parseExpression(MCExpr const *& Expr);
+  bool parseExpression(MCExpr const *&Expr);
 
   bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
-                        SMLoc NameLoc, OperandVector &Operands) override
-  {
+                        SMLoc NameLoc, OperandVector &Operands) override {
     llvm_unreachable("Unimplemented");
   }
 
@@ -289,45 +294,63 @@ public:
     return false;
   }
 
-  bool isf32Ext() const { return false; }
-  bool iss32_0Imm() const { return CheckImmRange(32, 0, true, true, false); }
+  bool isa30_2Imm() const { return CheckImmRange(30, 2, true, true, true); }
+  bool isb30_2Imm() const { return CheckImmRange(30, 2, true, true, true); }
+  bool isb15_2Imm() const { return CheckImmRange(15, 2, true, true, false); }
+  bool isb13_2Imm() const { return CheckImmRange(13, 2, true, true, false); }
+
+  bool ism32_0Imm() const { return true; }
+
+  bool isf32Imm() const { return false; }
+  bool isf64Imm() const { return false; }
+  bool iss32_0Imm() const { return true; }
+  bool iss31_1Imm() const { return true; }
+  bool iss30_2Imm() const { return true; }
+  bool iss29_3Imm() const { return true; }
   bool iss23_2Imm() const { return CheckImmRange(23, 2, true, true, false); }
+  bool iss10_0Imm() const { return CheckImmRange(10, 0, true, false, false); }
+  bool iss10_6Imm() const { return CheckImmRange(10, 6, true, false, false); }
+  bool iss9_0Imm() const { return CheckImmRange(9, 0, true, false, false); }
   bool iss8_0Imm() const { return CheckImmRange(8, 0, true, false, false); }
   bool iss8_0Imm64() const { return CheckImmRange(8, 0, true, true, false); }
   bool iss7_0Imm() const { return CheckImmRange(7, 0, true, false, false); }
   bool iss6_0Imm() const { return CheckImmRange(6, 0, true, false, false); }
+  bool iss6_3Imm() const { return CheckImmRange(6, 3, true, false, false); }
   bool iss4_0Imm() const { return CheckImmRange(4, 0, true, false, false); }
   bool iss4_1Imm() const { return CheckImmRange(4, 1, true, false, false); }
   bool iss4_2Imm() const { return CheckImmRange(4, 2, true, false, false); }
   bool iss4_3Imm() const { return CheckImmRange(4, 3, true, false, false); }
-  bool iss4_6Imm() const { return CheckImmRange(4, 0, true, false, false); }
-  bool iss3_6Imm() const { return CheckImmRange(3, 0, true, false, false); }
   bool iss3_0Imm() const { return CheckImmRange(3, 0, true, false, false); }
 
   bool isu64_0Imm() const { return CheckImmRange(64, 0, false, true, true); }
-  bool isu32_0Imm() const { return CheckImmRange(32, 0, false, true, false); }
+  bool isu32_0Imm() const { return true; }
+  bool isu31_1Imm() const { return true; }
+  bool isu30_2Imm() const { return true; }
+  bool isu29_3Imm() const { return true; }
   bool isu26_6Imm() const { return CheckImmRange(26, 6, false, true, false); }
   bool isu16_0Imm() const { return CheckImmRange(16, 0, false, true, false); }
   bool isu16_1Imm() const { return CheckImmRange(16, 1, false, true, false); }
   bool isu16_2Imm() const { return CheckImmRange(16, 2, false, true, false); }
   bool isu16_3Imm() const { return CheckImmRange(16, 3, false, true, false); }
   bool isu11_3Imm() const { return CheckImmRange(11, 3, false, false, false); }
-  bool isu6_1Imm() const { return CheckImmRange(6, 1, false, false, false); }
-  bool isu6_2Imm() const { return CheckImmRange(6, 2, false, false, false); }
-  bool isu6_3Imm() const { return CheckImmRange(6, 3, false, false, false); }
   bool isu10_0Imm() const { return CheckImmRange(10, 0, false, false, false); }
   bool isu9_0Imm() const { return CheckImmRange(9, 0, false, false, false); }
   bool isu8_0Imm() const { return CheckImmRange(8, 0, false, false, false); }
   bool isu7_0Imm() const { return CheckImmRange(7, 0, false, false, false); }
   bool isu6_0Imm() const { return CheckImmRange(6, 0, false, false, false); }
+  bool isu6_1Imm() const { return CheckImmRange(6, 1, false, false, false); }
+  bool isu6_2Imm() const { return CheckImmRange(6, 2, false, false, false); }
+  bool isu6_3Imm() const { return CheckImmRange(6, 3, false, false, false); }
   bool isu5_0Imm() const { return CheckImmRange(5, 0, false, false, false); }
+  bool isu5_2Imm() const { return CheckImmRange(5, 2, false, false, false); }
+  bool isu5_3Imm() const { return CheckImmRange(5, 3, false, false, false); }
   bool isu4_0Imm() const { return CheckImmRange(4, 0, false, false, false); }
+  bool isu4_2Imm() const { return CheckImmRange(4, 2, false, false, false); }
   bool isu3_0Imm() const { return CheckImmRange(3, 0, false, false, false); }
+  bool isu3_1Imm() const { return CheckImmRange(3, 1, false, false, false); }
   bool isu2_0Imm() const { return CheckImmRange(2, 0, false, false, false); }
   bool isu1_0Imm() const { return CheckImmRange(1, 0, false, false, false); }
 
-  bool ism6_0Imm() const { return CheckImmRange(6, 0, false, false, false); }
-  bool isn8_0Imm() const { return CheckImmRange(8, 0, false, false, false); }
   bool isn1Const() const {
     if (!isImm())
       return false;
@@ -336,35 +359,18 @@ public:
       return false;
     return Value == -1;
   }
-
-  bool iss16_0Ext() const { return CheckImmRange(16 + 26, 0, true, true, true); }
-  bool iss12_0Ext() const { return CheckImmRange(12 + 26, 0, true, true, true); }
-  bool iss10_0Ext() const { return CheckImmRange(10 + 26, 0, true, true, true); }
-  bool iss9_0Ext() const { return CheckImmRange(9 + 26, 0, true, true, true); }
-  bool iss8_0Ext() const { return CheckImmRange(8 + 26, 0, true, true, true); }
-  bool iss7_0Ext() const { return CheckImmRange(7 + 26, 0, true, true, true); }
-  bool iss6_0Ext() const { return CheckImmRange(6 + 26, 0, true, true, true); }
-  bool iss11_0Ext() const {
+  bool iss11_0Imm() const {
     return CheckImmRange(11 + 26, 0, true, true, true);
   }
-  bool iss11_1Ext() const {
+  bool iss11_1Imm() const {
     return CheckImmRange(11 + 26, 1, true, true, true);
   }
-  bool iss11_2Ext() const {
+  bool iss11_2Imm() const {
     return CheckImmRange(11 + 26, 2, true, true, true);
   }
-  bool iss11_3Ext() const {
+  bool iss11_3Imm() const {
     return CheckImmRange(11 + 26, 3, true, true, true);
   }
-
-  bool isu7_0Ext() const { return CheckImmRange(7 + 26, 0, false, true, true); }
-  bool isu8_0Ext() const { return CheckImmRange(8 + 26, 0, false, true, true); }
-  bool isu9_0Ext() const { return CheckImmRange(9 + 26, 0, false, true, true); }
-  bool isu10_0Ext() const { return CheckImmRange(10 + 26, 0, false, true, true); }
-  bool isu6_0Ext() const { return CheckImmRange(6 + 26, 0, false, true, true); }
-  bool isu6_1Ext() const { return CheckImmRange(6 + 26, 1, false, true, true); }
-  bool isu6_2Ext() const { return CheckImmRange(6 + 26, 2, false, true, true); }
-  bool isu6_3Ext() const { return CheckImmRange(6 + 26, 3, false, true, true); }
   bool isu32_0MustExt() const { return isImm(); }
 
   void addRegOperands(MCInst &Inst, unsigned N) const {
@@ -392,188 +398,10 @@ public:
     Inst.addOperand(MCOperand::createExpr(Expr));
   }
 
-  void addf32ExtOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-
-  void adds32_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
-  void adds23_2ImmOperands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
-  void adds8_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
-  void adds8_0Imm64Operands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
-  void adds6_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
-  void adds4_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
-  void adds4_1ImmOperands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
-  void adds4_2ImmOperands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
-  void adds4_3ImmOperands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
-  void adds3_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
-
-  void addu64_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu32_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu26_6ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu16_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu16_1ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu16_2ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu16_3ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu11_3ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu10_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu9_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu8_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu7_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu6_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu6_1ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu6_2ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu6_3ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu5_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu4_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu3_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu2_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu1_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-
-  void addm6_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addn8_0ImmOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-
-  void adds16_0ExtOperands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
-  void adds12_0ExtOperands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
-  void adds10_0ExtOperands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
-  void adds9_0ExtOperands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
-  void adds8_0ExtOperands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
-  void adds6_0ExtOperands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
-  void adds11_0ExtOperands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
-  void adds11_1ExtOperands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
-  void adds11_2ExtOperands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
-  void adds11_3ExtOperands(MCInst &Inst, unsigned N) const {
-    addSignedImmOperands(Inst, N);
-  }
   void addn1ConstOperands(MCInst &Inst, unsigned N) const {
     addImmOperands(Inst, N);
   }
 
-  void addu7_0ExtOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu8_0ExtOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu9_0ExtOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu10_0ExtOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu6_0ExtOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu6_1ExtOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu6_2ExtOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu6_3ExtOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-  void addu32_0MustExtOperands(MCInst &Inst, unsigned N) const {
-    addImmOperands(Inst, N);
-  }
-
-  void adds4_6ImmOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *CE =
-        dyn_cast<MCConstantExpr>(&HexagonMCInstrInfo::getExpr(*getImm()));
-    Inst.addOperand(MCOperand::createImm(CE->getValue() * 64));
-  }
-
-  void adds3_6ImmOperands(MCInst &Inst, unsigned N) const {
-    assert(N == 1 && "Invalid number of operands!");
-    const MCConstantExpr *CE =
-        dyn_cast<MCConstantExpr>(&HexagonMCInstrInfo::getExpr(*getImm()));
-    Inst.addOperand(MCOperand::createImm(CE->getValue() * 64));
-  }
-
   StringRef getToken() const {
     assert(Kind == Token && "Invalid access!");
     return StringRef(Tok.Data, Tok.Length);
@@ -749,10 +577,6 @@ bool HexagonAsmParser::matchBundleOptions() {
       HexagonMCInstrInfo::setInnerLoop(MCB);
     else if (Option.compare_lower("endloop1") == 0)
       HexagonMCInstrInfo::setOuterLoop(MCB);
-    else if (Option.compare_lower("mem_noshuf") == 0)
-      HexagonMCInstrInfo::setMemReorderDisabled(MCB);
-    else if (Option.compare_lower("mem_shuf") == 0)
-      HexagonMCInstrInfo::setMemStoreReorderEnabled(MCB);
     else
       return true;
     Lex();
@@ -770,8 +594,7 @@ void HexagonAsmParser::canonicalizeImmediates(MCInst &MCI) {
       int64_t Value (I.getImm());
       NewInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(
           MCConstantExpr::create(Value, getContext()), getContext())));
-    }
-    else {
+    } else {
       if (I.isExpr() && cast<HexagonMCExpr>(I.getExpr())->signMismatch() &&
           WarnSignedMismatch)
         Warning (MCI.getLoc(), "Signed/Unsigned mismatch");
@@ -1066,6 +889,9 @@ bool HexagonAsmParser::ParseDirectiveComm(bool IsLocal, SMLoc Loc) {
 
 // validate register against architecture
 bool HexagonAsmParser::RegisterMatchesArch(unsigned MatchNum) const {
+  // A register in the V62 class matches only when ArchV62 is enabled.
+  if (HexagonMCRegisterClasses[Hexagon::V62RegsRegClassID].contains(MatchNum))
+    return getSTI().getFeatureBits()[Hexagon::ArchV62];
   return true;
 }
 
@@ -1171,11 +997,15 @@ bool HexagonAsmParser::parseOperand(OperandVector &Operands) {
 bool HexagonAsmParser::isLabel(AsmToken &Token) {
   MCAsmLexer &Lexer = getLexer();
   AsmToken const &Second = Lexer.getTok();
-  AsmToken Third = Lexer.peekTok();  
+  AsmToken Third = Lexer.peekTok();
   StringRef String = Token.getString();
   if (Token.is(AsmToken::TokenKind::LCurly) ||
       Token.is(AsmToken::TokenKind::RCurly))
     return false;
+  // special case for parsing vwhist256:sat
+  if (String.lower() == "vwhist256" && Second.is(AsmToken::Colon) &&
+      Third.getString().lower() == "sat")
+    return false;
   if (!Token.is(AsmToken::TokenKind::Identifier))
     return true;
   if (!matchRegister(String.lower()))
@@ -1756,8 +1586,8 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
           TmpInst.setOpcode(Hexagon::L2_loadrdgp);
 
         TmpInst.addOperand(MO_0);
-        TmpInst.addOperand(
-            MCOperand::createExpr(MCSymbolRefExpr::create(Sym, getContext())));
+        TmpInst.addOperand(MCOperand::createExpr(HexagonMCExpr::create(
+          MCSymbolRefExpr::create(Sym, getContext()), getContext())));
         Inst = TmpInst;
       }
     }
@@ -2142,6 +1972,67 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
     Inst = TmpInst;
     break;
   }
+  case Hexagon::PS_loadrubabs:
+    if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(1).getExpr()))
+      Inst.setOpcode(Hexagon::L2_loadrubgp);
+    break;
+  case Hexagon::PS_loadrbabs:
+    if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(1).getExpr()))
+      Inst.setOpcode(Hexagon::L2_loadrbgp);
+    break;
+  case Hexagon::PS_loadruhabs:
+    if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(1).getExpr()))
+      Inst.setOpcode(Hexagon::L2_loadruhgp);
+    break;
+  case Hexagon::PS_loadrhabs:
+    if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(1).getExpr()))
+      Inst.setOpcode(Hexagon::L2_loadrhgp);
+    break;
+  case Hexagon::PS_loadriabs:
+    if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(1).getExpr()))
+      Inst.setOpcode(Hexagon::L2_loadrigp);
+    break;
+  case Hexagon::PS_loadrdabs:
+    if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(1).getExpr()))
+      Inst.setOpcode(Hexagon::L2_loadrdgp);
+    break;
+  case Hexagon::PS_storerbabs:
+    if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+      Inst.setOpcode(Hexagon::S2_storerbgp);
+    break;
+  case Hexagon::PS_storerhabs:
+    if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+      Inst.setOpcode(Hexagon::S2_storerhgp);
+    break;
+  case Hexagon::PS_storerfabs:
+    if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+      Inst.setOpcode(Hexagon::S2_storerfgp);
+    break;
+  case Hexagon::PS_storeriabs:
+    if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+      Inst.setOpcode(Hexagon::S2_storerigp);
+    break;
+  case Hexagon::PS_storerdabs:
+    if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+      Inst.setOpcode(Hexagon::S2_storerdgp);
+    break;
+  case Hexagon::PS_storerbnewabs:
+    if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+      Inst.setOpcode(Hexagon::S2_storerbnewgp);
+    break;
+  case Hexagon::PS_storerhnewabs:
+    if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+      Inst.setOpcode(Hexagon::S2_storerhnewgp);
+    break;
+  case Hexagon::PS_storerinewabs:
+    if (!HexagonMCInstrInfo::mustExtend(*Inst.getOperand(0).getExpr()))
+      Inst.setOpcode(Hexagon::S2_storerinewgp);
+    break;
+  case Hexagon::A2_zxtb: {
+    Inst.setOpcode(Hexagon::A2_andir);
+    Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(255, Context)));
+    break;
+  }
   } // switch
 
   return Match_Success;
diff --git a/lib/Target/Hexagon/BitTracker.cpp b/lib/Target/Hexagon/BitTracker.cpp
index 963fb99ce09b5f621d311f4c5e4bc3e74b37cffb..61d3630ac095a5ed1c1ae24f734addec36bc302e 100644
--- a/lib/Target/Hexagon/BitTracker.cpp
+++ b/lib/Target/Hexagon/BitTracker.cpp
@@ -317,6 +317,15 @@ bool BT::RegisterCell::operator== (const RegisterCell &RC) const {
   return true;
 }
 
+// Redirects every bit that is a reference into register 0 so that it refers
+// to the bit at the same index in register R, then returns this cell.
+BT::RegisterCell &BT::RegisterCell::regify(unsigned R) {
+  for (unsigned Idx = 0, W = width(); Idx != W; ++Idx)
+    if (Bits[Idx].Type == BitValue::Ref && Bits[Idx].RefI.Reg == 0)
+      Bits[Idx].RefI = BitRef(R, Idx);
+  return *this;
+}
+
 uint16_t BT::MachineEvaluator::getRegBitWidth(const RegisterRef &RR) const {
   // The general problem is with finding a register class that corresponds
   // to a given reference reg:sub. There can be several such classes, and
@@ -378,12 +387,7 @@ void BT::MachineEvaluator::putCell(const RegisterRef &RR, RegisterCell RC,
     return;
   assert(RR.Sub == 0 && "Unexpected sub-register in definition");
   // Eliminate all ref-to-reg-0 bit values: replace them with "self".
-  for (unsigned i = 0, n = RC.width(); i < n; ++i) {
-    const BitValue &V = RC[i];
-    if (V.Type == BitValue::Ref && V.RefI.Reg == 0)
-      RC[i].RefI = BitRef(RR.Reg, i);
-  }
-  M[RR.Reg] = RC;
+  M[RR.Reg] = RC.regify(RR.Reg);
 }
 
 // Check if the cell represents a compile-time integer value.
diff --git a/lib/Target/Hexagon/BitTracker.h b/lib/Target/Hexagon/BitTracker.h
index 48c5f2266acfabc2b9de0a1bf6c219d80ae8247d..a547b34e852f6e28268f7d0255389dd7879b48ce 100644
--- a/lib/Target/Hexagon/BitTracker.h
+++ b/lib/Target/Hexagon/BitTracker.h
@@ -283,6 +283,9 @@ struct BitTracker::RegisterCell {
     return !operator==(RC);
   }
 
+  // Replace the ref-to-reg-0 bit values with the given register.
+  RegisterCell &regify(unsigned R);
+
   // Generate a "ref" cell for the corresponding register. In the resulting
   // cell each bit will be described as being the same as the corresponding
   // bit in register Reg (i.e. the cell is "defined" by register Reg).
diff --git a/lib/Target/Hexagon/CMakeLists.txt b/lib/Target/Hexagon/CMakeLists.txt
index 3e2a82297979a325839af3c2773081f1e276187e..2f3dd3326fccdff160d6097a0353fdd7a1acefd9 100644
--- a/lib/Target/Hexagon/CMakeLists.txt
+++ b/lib/Target/Hexagon/CMakeLists.txt
@@ -60,10 +60,9 @@ add_llvm_target(HexagonCodeGen
   RDFGraph.cpp
   RDFLiveness.cpp
   RDFRegisters.cpp
-  )
+)
 
 add_subdirectory(AsmParser)
 add_subdirectory(TargetInfo)
 add_subdirectory(MCTargetDesc)
 add_subdirectory(Disassembler)
-
diff --git a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index c05fbc1d77567a944e5f67605742eca54ace7315..ae15ed0e924055c73b6666d0a00a42e6946c8172 100644
--- a/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -57,11 +57,38 @@ public:
                               ArrayRef<uint8_t> Bytes, uint64_t Address,
                               raw_ostream &VStream,
                               raw_ostream &CStream) const override;
-
-  void adjustExtendedInstructions(MCInst &MCI, MCInst const &MCB) const;
   void addSubinstOperands(MCInst *MI, unsigned opcode, unsigned inst) const;
 };
 
+namespace {
+  // Merges the bundle's extender bits with the 6-bit field taken from Value.
+  // Returns Value as-is if no extender is found or MI.size() is not the index
+  // reported by getExtendableOp.
+  uint32_t fullValue(MCInstrInfo const &MCII, MCInst &MCB, MCInst &MI,
+                     int64_t Value) {
+    MCInst const *Ext = HexagonMCInstrInfo::extenderForIndex(
+      MCB, HexagonMCInstrInfo::bundleSize(MCB));
+    if (!Ext || MI.size() != HexagonMCInstrInfo::getExtendableOp(MCII, MI))
+      return Value;
+    unsigned Shift = HexagonMCInstrInfo::getExtentAlignment(MCII, MI);
+    uint32_t Low = static_cast<uint32_t>(Value >> Shift) & 0x3f;
+    int64_t ExtBits;
+    bool Success = Ext->getOperand(0).getExpr()->evaluateAsAbsolute(ExtBits);
+    assert(Success); (void)Success;
+    return static_cast<uint32_t>(ExtBits) | Low;
+  }
+  HexagonDisassembler const &disassembler(void const *Decoder) {
+    return *static_cast<HexagonDisassembler const *>(Decoder);
+  }
+  // Decodes a T-bit signed field (plus any extender) into a constant on MI.
+  template <size_t T>
+  void signedDecoder(MCInst &MI, unsigned tmp, const void *Decoder) {
+    HexagonDisassembler const &D = disassembler(Decoder);
+    int64_t Wide = SignExtend64<32>(fullValue(
+        *D.MCII, **D.CurrentBundle, MI, SignExtend64<T>(tmp)));
+    HexagonMCInstrInfo::addConstant(MI, Wide, D.getContext());
+  }
+}
 } // end anonymous namespace
 
 // Forward declare these because the auto-generated code will reference them.
@@ -70,6 +97,10 @@ public:
 static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo,
                                                uint64_t Address,
                                                const void *Decoder);
+static DecodeStatus DecodeGeneralSubRegsRegisterClass(MCInst &Inst,
+                                                      unsigned RegNo,
+                                                      uint64_t Address,
+                                                      const void *Decoder);
 static DecodeStatus DecodeIntRegsLow8RegisterClass(MCInst &Inst, unsigned RegNo,
                                                    uint64_t Address,
                                                    const void *Decoder);
@@ -79,6 +110,9 @@ static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo,
 static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo,
                                                   uint64_t Address,
                                                   const void *Decoder);
+static DecodeStatus
+DecodeGeneralDoubleLow8RegsRegisterClass(MCInst &Inst, unsigned RegNo,
+                                         uint64_t Address, const void *Decoder);
 static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo,
                                                   uint64_t Address,
                                                   const void *Decoder);
@@ -98,31 +132,10 @@ static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
                                                  uint64_t Address,
                                                  const void *Decoder);
 
-static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn);
-static DecodeStatus decodeImmext(MCInst &MI, uint32_t insn,
-                                 void const *Decoder);
-
-static unsigned GetSubinstOpcode(unsigned IClass, unsigned inst, unsigned &op,
-                                 raw_ostream &os);
-
-static unsigned getRegFromSubinstEncoding(unsigned encoded_reg);
-
 static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp,
                                        uint64_t Address, const void *Decoder);
-static DecodeStatus s16_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
-                                  const void *Decoder);
-static DecodeStatus s12_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
-                                  const void *Decoder);
-static DecodeStatus s11_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
-                                    const void *Decoder);
-static DecodeStatus s11_1ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
-                                    const void *Decoder);
-static DecodeStatus s11_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
-                                    const void *Decoder);
-static DecodeStatus s11_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
-                                    const void *Decoder);
-static DecodeStatus s10_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
-                                  const void *Decoder);
+static DecodeStatus s32_0ImmDecoder(MCInst &MI, unsigned tmp,
+                                    uint64_t /*Address*/, const void *Decoder);
 static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
                                  const void *Decoder);
 static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
@@ -135,13 +148,12 @@ static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
                                    const void *Decoder);
 static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
                                    const void *Decoder);
-static DecodeStatus s4_6ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
-                                   const void *Decoder);
-static DecodeStatus s3_6ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
+static DecodeStatus s3_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
                                    const void *Decoder);
 static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
                                     const void *Decoder);
 
+#include "HexagonDepDecoders.h"
 #include "HexagonGenDisassemblerTables.inc"
 
 static MCDisassembler *createHexagonDisassembler(const Target &T,
@@ -175,20 +187,31 @@ DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
     Size += HEXAGON_INSTR_SIZE;
     Bytes = Bytes.slice(HEXAGON_INSTR_SIZE);
   }
-  if(Result == MCDisassembler::Fail)
+  if (Result == MCDisassembler::Fail)
     return Result;
-  HexagonMCChecker Checker (*MCII, STI, MI, MI, *getContext().getRegisterInfo());
-  if(!Checker.check())
+  if (Size > HEXAGON_MAX_PACKET_SIZE)
+    return MCDisassembler::Fail;
+  HexagonMCChecker Checker(*MCII, STI, MI, MI, *getContext().getRegisterInfo());
+  if (!Checker.check())
     return MCDisassembler::Fail;
   return MCDisassembler::Success;
 }
 
-static HexagonDisassembler const &disassembler(void const *Decoder) {
-  return *static_cast<HexagonDisassembler const *>(Decoder);
+namespace {
+void adjustDuplex(MCInst &MI, MCContext &Context) { // Run on each decoded duplex half.
+  switch (MI.getOpcode()) { // Only the two opcodes below receive an extra operand.
+  case Hexagon::SA1_setin1: // A constant -1 expression is inserted at operand index 1.
+    MI.insert(MI.begin() + 1, // NOTE(review): presumably the -1 is implied by the opcode, not encoded - confirm.
+              MCOperand::createExpr(MCConstantExpr::create(-1, Context)));
+    break;
+  case Hexagon::SA1_dec: // A constant -1 expression is inserted at operand index 2.
+    MI.insert(MI.begin() + 2,
+              MCOperand::createExpr(MCConstantExpr::create(-1, Context)));
+    break;
+  default: // Every other opcode is left exactly as decoded.
+    break;
+  }
 }
-
-static MCContext &contextFromDecoder(void const *Decoder) {
-  return disassembler(Decoder).getContext();
 }
 
 DecodeStatus HexagonDisassembler::getSingleInstruction(
@@ -196,8 +219,7 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
     raw_ostream &os, raw_ostream &cs, bool &Complete) const {
   assert(Bytes.size() >= HEXAGON_INSTR_SIZE);
 
-  uint32_t Instruction =
-      (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | (Bytes[0] << 0);
+  uint32_t Instruction = support::endian::read32le(Bytes.data());
 
   auto BundleSize = HexagonMCInstrInfo::bundleSize(MCB);
   if ((Instruction & HexagonII::INST_PARSE_MASK) ==
@@ -210,103 +232,92 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
       return DecodeStatus::Fail;
   }
 
-  DecodeStatus Result = DecodeStatus::Success;
+  MCInst const *Extender = HexagonMCInstrInfo::extenderForIndex(
+      MCB, HexagonMCInstrInfo::bundleSize(MCB));
+
+  DecodeStatus Result = DecodeStatus::Fail;
   if ((Instruction & HexagonII::INST_PARSE_MASK) ==
       HexagonII::INST_PARSE_DUPLEX) {
-    // Determine the instruction class of each instruction in the duplex.
-    unsigned duplexIClass, IClassLow, IClassHigh;
-
+    unsigned duplexIClass;
+    uint8_t const *DecodeLow, *DecodeHigh;
     duplexIClass = ((Instruction >> 28) & 0xe) | ((Instruction >> 13) & 0x1);
     switch (duplexIClass) {
     default:
       return MCDisassembler::Fail;
     case 0:
-      IClassLow = HexagonII::HSIG_L1;
-      IClassHigh = HexagonII::HSIG_L1;
+      DecodeLow = DecoderTableSUBINSN_L132;
+      DecodeHigh = DecoderTableSUBINSN_L132;
       break;
     case 1:
-      IClassLow = HexagonII::HSIG_L2;
-      IClassHigh = HexagonII::HSIG_L1;
+      DecodeLow = DecoderTableSUBINSN_L232;
+      DecodeHigh = DecoderTableSUBINSN_L132;
       break;
     case 2:
-      IClassLow = HexagonII::HSIG_L2;
-      IClassHigh = HexagonII::HSIG_L2;
+      DecodeLow = DecoderTableSUBINSN_L232;
+      DecodeHigh = DecoderTableSUBINSN_L232;
       break;
     case 3:
-      IClassLow = HexagonII::HSIG_A;
-      IClassHigh = HexagonII::HSIG_A;
+      DecodeLow = DecoderTableSUBINSN_A32;
+      DecodeHigh = DecoderTableSUBINSN_A32;
       break;
     case 4:
-      IClassLow = HexagonII::HSIG_L1;
-      IClassHigh = HexagonII::HSIG_A;
+      DecodeLow = DecoderTableSUBINSN_L132;
+      DecodeHigh = DecoderTableSUBINSN_A32;
       break;
     case 5:
-      IClassLow = HexagonII::HSIG_L2;
-      IClassHigh = HexagonII::HSIG_A;
+      DecodeLow = DecoderTableSUBINSN_L232;
+      DecodeHigh = DecoderTableSUBINSN_A32;
       break;
     case 6:
-      IClassLow = HexagonII::HSIG_S1;
-      IClassHigh = HexagonII::HSIG_A;
+      DecodeLow = DecoderTableSUBINSN_S132;
+      DecodeHigh = DecoderTableSUBINSN_A32;
       break;
     case 7:
-      IClassLow = HexagonII::HSIG_S2;
-      IClassHigh = HexagonII::HSIG_A;
+      DecodeLow = DecoderTableSUBINSN_S232;
+      DecodeHigh = DecoderTableSUBINSN_A32;
       break;
     case 8:
-      IClassLow = HexagonII::HSIG_S1;
-      IClassHigh = HexagonII::HSIG_L1;
+      DecodeLow = DecoderTableSUBINSN_S132;
+      DecodeHigh = DecoderTableSUBINSN_L132;
       break;
     case 9:
-      IClassLow = HexagonII::HSIG_S1;
-      IClassHigh = HexagonII::HSIG_L2;
+      DecodeLow = DecoderTableSUBINSN_S132;
+      DecodeHigh = DecoderTableSUBINSN_L232;
       break;
     case 10:
-      IClassLow = HexagonII::HSIG_S1;
-      IClassHigh = HexagonII::HSIG_S1;
+      DecodeLow = DecoderTableSUBINSN_S132;
+      DecodeHigh = DecoderTableSUBINSN_S132;
       break;
     case 11:
-      IClassLow = HexagonII::HSIG_S2;
-      IClassHigh = HexagonII::HSIG_S1;
+      DecodeLow = DecoderTableSUBINSN_S232;
+      DecodeHigh = DecoderTableSUBINSN_S132;
       break;
     case 12:
-      IClassLow = HexagonII::HSIG_S2;
-      IClassHigh = HexagonII::HSIG_L1;
+      DecodeLow = DecoderTableSUBINSN_S232;
+      DecodeHigh = DecoderTableSUBINSN_L132;
       break;
     case 13:
-      IClassLow = HexagonII::HSIG_S2;
-      IClassHigh = HexagonII::HSIG_L2;
+      DecodeLow = DecoderTableSUBINSN_S232;
+      DecodeHigh = DecoderTableSUBINSN_L232;
       break;
     case 14:
-      IClassLow = HexagonII::HSIG_S2;
-      IClassHigh = HexagonII::HSIG_S2;
+      DecodeLow = DecoderTableSUBINSN_S232;
+      DecodeHigh = DecoderTableSUBINSN_S232;
       break;
     }
-
-    // Set the MCInst to be a duplex instruction. Which one doesn't matter.
-    MI.setOpcode(Hexagon::DuplexIClass0);
-
-    // Decode each instruction in the duplex.
-    // Create an MCInst for each instruction.
-    unsigned instLow = Instruction & 0x1fff;
-    unsigned instHigh = (Instruction >> 16) & 0x1fff;
-    unsigned opLow;
-    if (GetSubinstOpcode(IClassLow, instLow, opLow, os) !=
-        MCDisassembler::Success)
-      return MCDisassembler::Fail;
-    unsigned opHigh;
-    if (GetSubinstOpcode(IClassHigh, instHigh, opHigh, os) !=
-        MCDisassembler::Success)
-      return MCDisassembler::Fail;
+    MI.setOpcode(Hexagon::DuplexIClass0 + duplexIClass);
     MCInst *MILow = new (getContext()) MCInst;
-    MILow->setOpcode(opLow);
     MCInst *MIHigh = new (getContext()) MCInst;
-    MIHigh->setOpcode(opHigh);
-    addSubinstOperands(MILow, opLow, instLow);
-    addSubinstOperands(MIHigh, opHigh, instHigh);
-    // see ConvertToSubInst() in
-    // lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
-
-    // Add the duplex instruction MCInsts as operands to the passed in MCInst.
+    Result = decodeInstruction(DecodeLow, *MILow, Instruction & 0x1fff, Address,
+                               this, STI);
+    if (Result != DecodeStatus::Success)
+      return DecodeStatus::Fail;
+    adjustDuplex(*MILow, getContext());
+    Result = decodeInstruction(
+        DecodeHigh, *MIHigh, (Instruction >> 16) & 0x1fff, Address, this, STI);
+    if (Result != DecodeStatus::Success)
+      return DecodeStatus::Fail;
+    adjustDuplex(*MIHigh, getContext());
     MCOperand OPLow = MCOperand::createInst(MILow);
     MCOperand OPHigh = MCOperand::createInst(MIHigh);
     MI.addOperand(OPLow);
@@ -316,34 +327,23 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
     if ((Instruction & HexagonII::INST_PARSE_MASK) ==
         HexagonII::INST_PARSE_PACKET_END)
       Complete = true;
-    // Calling the auto-generated decoder function.
-    Result =
-        decodeInstruction(DecoderTable32, MI, Instruction, Address, this, STI);
 
-    // If a, "standard" insn isn't found check special cases.
-    if (MCDisassembler::Success != Result ||
-        MI.getOpcode() == Hexagon::A4_ext) {
-      Result = decodeImmext(MI, Instruction, this);
-      if (MCDisassembler::Success != Result) {
-        Result = decodeSpecial(MI, Instruction);
-      }
-    } else {
-      // If the instruction is a compound instruction, register values will
-      // follow the duplex model, so the register values in the MCInst are
-      // incorrect. If the instruction is a compound, loop through the
-      // operands and change registers appropriately.
-      if (HexagonMCInstrInfo::getType(*MCII, MI) == HexagonII::TypeCOMPOUND) {
-        for (MCInst::iterator i = MI.begin(), last = MI.end(); i < last; ++i) {
-          if (i->isReg()) {
-            unsigned reg = i->getReg() - Hexagon::R0;
-            i->setReg(getRegFromSubinstEncoding(reg));
-          }
-        }
-      }
-    }
+    if (Extender != nullptr)
+      Result = decodeInstruction(DecoderTableMustExtend32, MI, Instruction,
+                                 Address, this, STI);
+
+    if (Result != MCDisassembler::Success)
+      Result = decodeInstruction(DecoderTable32, MI, Instruction, Address, this,
+                                 STI);
+
+    if (Result != MCDisassembler::Success &&
+        STI.getFeatureBits()[Hexagon::ExtensionHVX])
+      Result = decodeInstruction(DecoderTableEXT_mmvec32, MI, Instruction,
+                                 Address, this, STI);
+
   }
 
-  switch(MI.getOpcode()) {
+  switch (MI.getOpcode()) {
   case Hexagon::J4_cmpeqn1_f_jumpnv_nt:
   case Hexagon::J4_cmpeqn1_f_jumpnv_t:
   case Hexagon::J4_cmpeqn1_fp0_jump_nt:
@@ -368,7 +368,8 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
   case Hexagon::J4_cmpgtn1_tp0_jump_t:
   case Hexagon::J4_cmpgtn1_tp1_jump_nt:
   case Hexagon::J4_cmpgtn1_tp1_jump_t:
-    MI.insert(MI.begin() + 1, MCOperand::createExpr(MCConstantExpr::create(-1, getContext())));
+    MI.insert(MI.begin() + 1,
+              MCOperand::createExpr(MCConstantExpr::create(-1, getContext())));
     break;
   default:
     break;
@@ -423,13 +424,10 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
       return MCDisassembler::Fail;
   }
 
-  adjustExtendedInstructions(MI, MCB);
-  MCInst const *Extender =
-    HexagonMCInstrInfo::extenderForIndex(MCB,
-                                         HexagonMCInstrInfo::bundleSize(MCB));
-  if(Extender != nullptr) {
-    MCInst const & Inst = HexagonMCInstrInfo::isDuplex(*MCII, MI) ?
-                          *MI.getOperand(1).getInst() : MI;
+  if (Extender != nullptr) {
+    MCInst const &Inst = HexagonMCInstrInfo::isDuplex(*MCII, MI)
+                             ? *MI.getOperand(1).getInst()
+                             : MI;
     if (!HexagonMCInstrInfo::isExtendable(*MCII, Inst) &&
         !HexagonMCInstrInfo::isExtended(*MCII, Inst))
       return MCDisassembler::Fail;
@@ -437,68 +435,6 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(
   return Result;
 }
 
-void HexagonDisassembler::adjustExtendedInstructions(MCInst &MCI,
-                                                     MCInst const &MCB) const {
-  if (!HexagonMCInstrInfo::hasExtenderForIndex(
-          MCB, HexagonMCInstrInfo::bundleSize(MCB))) {
-    unsigned opcode;
-    // This code is used by the disassembler to disambiguate between GP
-    // relative and absolute addressing instructions since they both have
-    // same encoding bits. However, an absolute addressing instruction must
-    // follow an immediate extender. Disassembler alwaus select absolute
-    // addressing instructions first and uses this code to change them into
-    // GP relative instruction in the absence of the corresponding immediate
-    // extender.
-    switch (MCI.getOpcode()) {
-    case Hexagon::PS_storerbabs:
-      opcode = Hexagon::S2_storerbgp;
-      break;
-    case Hexagon::PS_storerhabs:
-      opcode = Hexagon::S2_storerhgp;
-      break;
-    case Hexagon::PS_storerfabs:
-      opcode = Hexagon::S2_storerfgp;
-      break;
-    case Hexagon::PS_storeriabs:
-      opcode = Hexagon::S2_storerigp;
-      break;
-    case Hexagon::PS_storerbnewabs:
-      opcode = Hexagon::S2_storerbnewgp;
-      break;
-    case Hexagon::PS_storerhnewabs:
-      opcode = Hexagon::S2_storerhnewgp;
-      break;
-    case Hexagon::PS_storerinewabs:
-      opcode = Hexagon::S2_storerinewgp;
-      break;
-    case Hexagon::PS_storerdabs:
-      opcode = Hexagon::S2_storerdgp;
-      break;
-    case Hexagon::PS_loadrbabs:
-      opcode = Hexagon::L2_loadrbgp;
-      break;
-    case Hexagon::PS_loadrubabs:
-      opcode = Hexagon::L2_loadrubgp;
-      break;
-    case Hexagon::PS_loadrhabs:
-      opcode = Hexagon::L2_loadrhgp;
-      break;
-    case Hexagon::PS_loadruhabs:
-      opcode = Hexagon::L2_loadruhgp;
-      break;
-    case Hexagon::PS_loadriabs:
-      opcode = Hexagon::L2_loadrigp;
-      break;
-    case Hexagon::PS_loadrdabs:
-      opcode = Hexagon::L2_loadrdgp;
-      break;
-    default:
-      opcode = MCI.getOpcode();
-    }
-    MCI.setOpcode(opcode);
-  }
-}
-
 static DecodeStatus DecodeRegisterClass(MCInst &Inst, unsigned RegNo,
                                         ArrayRef<MCPhysReg> Table) {
   if (RegNo < Table.size()) {
@@ -530,6 +466,20 @@ static DecodeStatus DecodeIntRegsRegisterClass(MCInst &Inst, unsigned RegNo,
   return DecodeRegisterClass(Inst, RegNo, IntRegDecoderTable);
 }
 
+static DecodeStatus DecodeGeneralSubRegsRegisterClass(MCInst &Inst,
+                                                      unsigned RegNo,
+                                                      uint64_t Address, // not read in this function
+                                                      const void *Decoder) { // Decoder is not read either
+  static const MCPhysReg GeneralSubRegDecoderTable[] = { // 16 entries
+      Hexagon::R0,  Hexagon::R1,  Hexagon::R2,  Hexagon::R3,
+      Hexagon::R4,  Hexagon::R5,  Hexagon::R6,  Hexagon::R7,
+      Hexagon::R16, Hexagon::R17, Hexagon::R18, Hexagon::R19,
+      Hexagon::R20, Hexagon::R21, Hexagon::R22, Hexagon::R23,
+  };
+  // RegNo indexes the table: 0-7 select R0-R7, 8-15 select R16-R23.
+  return DecodeRegisterClass(Inst, RegNo, GeneralSubRegDecoderTable);
+}
+
 static DecodeStatus DecodeVectorRegsRegisterClass(MCInst &Inst, unsigned RegNo,
                                                   uint64_t /*Address*/,
                                                   const void *Decoder) {
@@ -557,6 +507,15 @@ static DecodeStatus DecodeDoubleRegsRegisterClass(MCInst &Inst, unsigned RegNo,
   return DecodeRegisterClass(Inst, RegNo >> 1, DoubleRegDecoderTable);
 }
 
+static DecodeStatus DecodeGeneralDoubleLow8RegsRegisterClass(
+    MCInst &Inst, unsigned RegNo, uint64_t /*Address*/, const void *Decoder) {
+  static const MCPhysReg GeneralDoubleLow8RegDecoderTable[] = { // 8 entries
+      Hexagon::D0, Hexagon::D1, Hexagon::D2,  Hexagon::D3,
+      Hexagon::D8, Hexagon::D9, Hexagon::D10, Hexagon::D11};
+  // RegNo indexes the table as-is (no ">> 1"): 0-3 select D0-D3, 4-7 select D8-D11.
+  return DecodeRegisterClass(Inst, RegNo, GeneralDoubleLow8RegDecoderTable);
+}
+
 static DecodeStatus DecodeVecDblRegsRegisterClass(MCInst &Inst, unsigned RegNo,
                                                   uint64_t /*Address*/,
                                                   const void *Decoder) {
@@ -590,17 +549,23 @@ static DecodeStatus DecodeVecPredRegsRegisterClass(MCInst &Inst, unsigned RegNo,
 static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
                                                uint64_t /*Address*/,
                                                const void *Decoder) {
+  using namespace Hexagon;
   static const MCPhysReg CtrlRegDecoderTable[] = {
-    Hexagon::SA0, Hexagon::LC0, Hexagon::SA1, Hexagon::LC1,
-    Hexagon::P3_0, Hexagon::C5, Hexagon::C6, Hexagon::C7,
-    Hexagon::USR, Hexagon::PC, Hexagon::UGP, Hexagon::GP,
-    Hexagon::CS0, Hexagon::CS1, Hexagon::UPCL, Hexagon::UPC
+    /*  0 */  SA0,        LC0,        SA1,        LC1,
+    /*  4 */  P3_0,       C5,         C6,         C7,
+    /*  8 */  USR,        PC,         UGP,        GP,
+    /* 12 */  CS0,        CS1,        UPCYCLELO,  UPCYCLEHI,
+    /* 16 */  FRAMELIMIT, FRAMEKEY,   PKTCOUNTLO, PKTCOUNTHI,
+    /* 20 */  0,          0,          0,          0,
+    /* 24 */  0,          0,          0,          0,
+    /* 28 */  0,          0,          UTIMERLO,   UTIMERHI
   };
 
   if (RegNo >= array_lengthof(CtrlRegDecoderTable))
     return MCDisassembler::Fail;
 
-  if (CtrlRegDecoderTable[RegNo] == Hexagon::NoRegister)
+  static_assert(NoRegister == 0, "Expecting NoRegister to be 0");
+  if (CtrlRegDecoderTable[RegNo] == NoRegister)
     return MCDisassembler::Fail;
 
   unsigned Register = CtrlRegDecoderTable[RegNo];
@@ -611,20 +576,23 @@ static DecodeStatus DecodeCtrRegsRegisterClass(MCInst &Inst, unsigned RegNo,
 static DecodeStatus DecodeCtrRegs64RegisterClass(MCInst &Inst, unsigned RegNo,
                                                  uint64_t /*Address*/,
                                                  const void *Decoder) {
+  using namespace Hexagon;
   static const MCPhysReg CtrlReg64DecoderTable[] = {
-      Hexagon::C1_0,   Hexagon::NoRegister,
-      Hexagon::C3_2,   Hexagon::NoRegister,
-      Hexagon::C7_6,   Hexagon::NoRegister,
-      Hexagon::C9_8,   Hexagon::NoRegister,
-      Hexagon::C11_10, Hexagon::NoRegister,
-      Hexagon::CS,     Hexagon::NoRegister,
-      Hexagon::UPC,    Hexagon::NoRegister
+    /*  0 */  C1_0,       0,          C3_2,       0,
+    /*  4 */  C5_4,       0,          C7_6,       0,
+    /*  8 */  C9_8,       0,          C11_10,     0,
+    /* 12 */  CS,         0,          UPCYCLE,    0,
+    /* 16 */  C17_16,     0,          PKTCOUNT,   0,
+    /* 20 */  0,          0,          0,          0,
+    /* 24 */  0,          0,          0,          0,
+    /* 28 */  0,          0,          UTIMER,     0
   };
 
   if (RegNo >= array_lengthof(CtrlReg64DecoderTable))
     return MCDisassembler::Fail;
 
-  if (CtrlReg64DecoderTable[RegNo] == Hexagon::NoRegister)
+  static_assert(NoRegister == 0, "Expecting NoRegister to be 0");
+  if (CtrlReg64DecoderTable[RegNo] == NoRegister)
     return MCDisassembler::Fail;
 
   unsigned Register = CtrlReg64DecoderTable[RegNo];
@@ -650,132 +618,23 @@ static DecodeStatus DecodeModRegsRegisterClass(MCInst &Inst, unsigned RegNo,
   return MCDisassembler::Success;
 }
 
-static uint32_t fullValue(MCInstrInfo const &MCII, MCInst &MCB, MCInst &MI,
-                          int64_t Value) {
-  MCInst const *Extender = HexagonMCInstrInfo::extenderForIndex(
-    MCB, HexagonMCInstrInfo::bundleSize(MCB));
-  if(!Extender || MI.size() != HexagonMCInstrInfo::getExtendableOp(MCII, MI))
-    return Value;
-  unsigned Alignment = HexagonMCInstrInfo::getExtentAlignment(MCII, MI);
-  uint32_t Lower6 = static_cast<uint32_t>(Value >> Alignment) & 0x3f;
-  int64_t Bits;
-  bool Success = Extender->getOperand(0).getExpr()->evaluateAsAbsolute(Bits);
-  assert(Success);(void)Success;
-  uint32_t Upper26 = static_cast<uint32_t>(Bits);
-  uint32_t Operand = Upper26 | Lower6;
-  return Operand;
-}
-
-template <size_t T>
-static void signedDecoder(MCInst &MI, unsigned tmp, const void *Decoder) {
-  HexagonDisassembler const &Disassembler = disassembler(Decoder);
-  int64_t FullValue = fullValue(*Disassembler.MCII,
-                                **Disassembler.CurrentBundle,
-                                MI, SignExtend64<T>(tmp));
-  int64_t Extended = SignExtend64<32>(FullValue);
-  HexagonMCInstrInfo::addConstant(MI, Extended,
-                                  Disassembler.getContext());
-}
-
 static DecodeStatus unsignedImmDecoder(MCInst &MI, unsigned tmp,
                                        uint64_t /*Address*/,
                                        const void *Decoder) {
   HexagonDisassembler const &Disassembler = disassembler(Decoder);
-  int64_t FullValue = fullValue(*Disassembler.MCII,
-                                **Disassembler.CurrentBundle,
-                                MI, tmp);
+  int64_t FullValue = // tmp is handed to fullValue() unmodified (no sign extension here)
+      fullValue(*Disassembler.MCII, **Disassembler.CurrentBundle, MI, tmp);
   assert(FullValue >= 0 && "Negative in unsigned decoder");
   HexagonMCInstrInfo::addConstant(MI, FullValue, Disassembler.getContext());
   return MCDisassembler::Success;
 }
 
-static DecodeStatus s16_0ImmDecoder(MCInst &MI, unsigned tmp,
-                                  uint64_t /*Address*/, const void *Decoder) {
-  signedDecoder<16>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus s12_0ImmDecoder(MCInst &MI, unsigned tmp,
-                                  uint64_t /*Address*/, const void *Decoder) {
-  signedDecoder<12>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus s11_0ImmDecoder(MCInst &MI, unsigned tmp,
-                                    uint64_t /*Address*/, const void *Decoder) {
-  signedDecoder<11>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus s11_1ImmDecoder(MCInst &MI, unsigned tmp,
-                                    uint64_t /*Address*/, const void *Decoder) {
-  HexagonMCInstrInfo::addConstant(MI, SignExtend64<12>(tmp), contextFromDecoder(Decoder));
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus s11_2ImmDecoder(MCInst &MI, unsigned tmp,
+static DecodeStatus s32_0ImmDecoder(MCInst &MI, unsigned tmp, // Adds a signed constant operand to MI.
                                     uint64_t /*Address*/, const void *Decoder) {
-  signedDecoder<13>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus s11_3ImmDecoder(MCInst &MI, unsigned tmp,
-                                    uint64_t /*Address*/, const void *Decoder) {
-  signedDecoder<14>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus s10_0ImmDecoder(MCInst &MI, unsigned tmp,
-                                  uint64_t /*Address*/, const void *Decoder) {
-  signedDecoder<10>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp, uint64_t /*Address*/,
-                                 const void *Decoder) {
-  signedDecoder<8>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp,
-                                   uint64_t /*Address*/, const void *Decoder) {
-  signedDecoder<6>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp,
-                                   uint64_t /*Address*/, const void *Decoder) {
-  signedDecoder<4>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp,
-                                   uint64_t /*Address*/, const void *Decoder) {
-  signedDecoder<5>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp,
-                                   uint64_t /*Address*/, const void *Decoder) {
-  signedDecoder<6>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp,
-                                   uint64_t /*Address*/, const void *Decoder) {
-  signedDecoder<7>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus s4_6ImmDecoder(MCInst &MI, unsigned tmp,
-                                   uint64_t /*Address*/, const void *Decoder) {
-  signedDecoder<10>(MI, tmp, Decoder);
-  return MCDisassembler::Success;
-}
-
-static DecodeStatus s3_6ImmDecoder(MCInst &MI, unsigned tmp,
-                                   uint64_t /*Address*/, const void *Decoder) {
-  signedDecoder<19>(MI, tmp, Decoder);
+  HexagonDisassembler const &Disassembler = disassembler(Decoder);
+  unsigned Bits = HexagonMCInstrInfo::getExtentBits(*Disassembler.MCII, MI); // NOTE(review): brtargetDecoder guards Bits == 0; no guard here - confirm Bits is never 0.
+  tmp = SignExtend64(tmp, Bits); // Sign-extend from Bits wide; the result is stored back into the unsigned tmp.
+  signedDecoder<32>(MI, tmp, Decoder); // signedDecoder<32> then sign-extends tmp from 32 bits and adds the constant.
   return MCDisassembler::Success;
 }
 
@@ -787,838 +646,13 @@ static DecodeStatus brtargetDecoder(MCInst &MI, unsigned tmp, uint64_t Address,
   // r13_2 is not extendable, so if there are no extent bits, it's r13_2
   if (Bits == 0)
     Bits = 15;
-  uint32_t FullValue = fullValue(*Disassembler.MCII,
-                                **Disassembler.CurrentBundle,
-                                MI, SignExtend64(tmp, Bits));
+  uint32_t FullValue =
+      fullValue(*Disassembler.MCII, **Disassembler.CurrentBundle, MI,
+                SignExtend64(tmp, Bits));
   int64_t Extended = SignExtend64<32>(FullValue) + Address;
-  if (!Disassembler.tryAddingSymbolicOperand(MI, Extended, Address, true,
-                                              0, 4))
+  if (!Disassembler.tryAddingSymbolicOperand(MI, Extended, Address, true, 0, 4))
     HexagonMCInstrInfo::addConstant(MI, Extended, Disassembler.getContext());
   return MCDisassembler::Success;
 }
 
-// Addressing mode dependent load store opcode map.
-//   - If an insn is preceded by an extender the address is absolute.
-//      - memw(##symbol) = r0
-//   - If an insn is not preceded by an extender the address is GP relative.
-//      - memw(gp + #symbol) = r0
-// Please note that the instructions must be ordered in the descending order
-// of their opcode.
-// HexagonII::INST_ICLASS_ST
-static const unsigned int StoreConditionalOpcodeData[][2] = {
-    {S4_pstorerdfnew_abs, 0xafc02084},
-    {S4_pstorerdtnew_abs, 0xafc02080},
-    {S4_pstorerdf_abs, 0xafc00084},
-    {S4_pstorerdt_abs, 0xafc00080},
-    {S4_pstorerinewfnew_abs, 0xafa03084},
-    {S4_pstorerinewtnew_abs, 0xafa03080},
-    {S4_pstorerhnewfnew_abs, 0xafa02884},
-    {S4_pstorerhnewtnew_abs, 0xafa02880},
-    {S4_pstorerbnewfnew_abs, 0xafa02084},
-    {S4_pstorerbnewtnew_abs, 0xafa02080},
-    {S4_pstorerinewf_abs, 0xafa01084},
-    {S4_pstorerinewt_abs, 0xafa01080},
-    {S4_pstorerhnewf_abs, 0xafa00884},
-    {S4_pstorerhnewt_abs, 0xafa00880},
-    {S4_pstorerbnewf_abs, 0xafa00084},
-    {S4_pstorerbnewt_abs, 0xafa00080},
-    {S4_pstorerifnew_abs, 0xaf802084},
-    {S4_pstoreritnew_abs, 0xaf802080},
-    {S4_pstorerif_abs, 0xaf800084},
-    {S4_pstorerit_abs, 0xaf800080},
-    {S4_pstorerhfnew_abs, 0xaf402084},
-    {S4_pstorerhtnew_abs, 0xaf402080},
-    {S4_pstorerhf_abs, 0xaf400084},
-    {S4_pstorerht_abs, 0xaf400080},
-    {S4_pstorerbfnew_abs, 0xaf002084},
-    {S4_pstorerbtnew_abs, 0xaf002080},
-    {S4_pstorerbf_abs, 0xaf000084},
-    {S4_pstorerbt_abs, 0xaf000080}};
-// HexagonII::INST_ICLASS_LD
-
-// HexagonII::INST_ICLASS_LD_ST_2
-static unsigned int LoadStoreOpcodeData[][2] = {{PS_loadrdabs, 0x49c00000},
-                                                {PS_loadriabs, 0x49800000},
-                                                {PS_loadruhabs, 0x49600000},
-                                                {PS_loadrhabs, 0x49400000},
-                                                {PS_loadrubabs, 0x49200000},
-                                                {PS_loadrbabs, 0x49000000},
-                                                {PS_storerdabs, 0x48c00000},
-                                                {PS_storerinewabs, 0x48a01000},
-                                                {PS_storerhnewabs, 0x48a00800},
-                                                {PS_storerbnewabs, 0x48a00000},
-                                                {PS_storeriabs, 0x48800000},
-                                                {PS_storerfabs, 0x48600000},
-                                                {PS_storerhabs, 0x48400000},
-                                                {PS_storerbabs, 0x48000000}};
-static const size_t NumCondS = array_lengthof(StoreConditionalOpcodeData);
-static const size_t NumLS = array_lengthof(LoadStoreOpcodeData);
-
-static DecodeStatus decodeSpecial(MCInst &MI, uint32_t insn) {
-  unsigned MachineOpcode = 0;
-  unsigned LLVMOpcode = 0;
-
-  if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_ST) {
-    for (size_t i = 0; i < NumCondS; ++i) {
-      if ((insn & StoreConditionalOpcodeData[i][1]) ==
-          StoreConditionalOpcodeData[i][1]) {
-        MachineOpcode = StoreConditionalOpcodeData[i][1];
-        LLVMOpcode = StoreConditionalOpcodeData[i][0];
-        break;
-      }
-    }
-  }
-  if ((insn & HexagonII::INST_ICLASS_MASK) == HexagonII::INST_ICLASS_LD_ST_2) {
-    for (size_t i = 0; i < NumLS; ++i) {
-      if ((insn & LoadStoreOpcodeData[i][1]) == LoadStoreOpcodeData[i][1]) {
-        MachineOpcode = LoadStoreOpcodeData[i][1];
-        LLVMOpcode = LoadStoreOpcodeData[i][0];
-        break;
-      }
-    }
-  }
-
-  if (MachineOpcode) {
-    unsigned Value = 0;
-    unsigned shift = 0;
-    MI.setOpcode(LLVMOpcode);
-    // Remove the parse bits from the insn.
-    insn &= ~HexagonII::INST_PARSE_MASK;
-
-    switch (LLVMOpcode) {
-    default:
-      return MCDisassembler::Fail;
-      break;
-
-    case Hexagon::S4_pstorerdf_abs:
-    case Hexagon::S4_pstorerdt_abs:
-    case Hexagon::S4_pstorerdfnew_abs:
-    case Hexagon::S4_pstorerdtnew_abs:
-      // op: Pv
-      Value = insn & UINT64_C(3);
-      DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
-      // op: u6
-      Value = (insn >> 12) & UINT64_C(48);
-      Value |= (insn >> 3) & UINT64_C(15);
-      MI.addOperand(MCOperand::createImm(Value));
-      // op: Rtt
-      Value = (insn >> 8) & UINT64_C(31);
-      DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
-      break;
-
-    case Hexagon::S4_pstorerbnewf_abs:
-    case Hexagon::S4_pstorerbnewt_abs:
-    case Hexagon::S4_pstorerbnewfnew_abs:
-    case Hexagon::S4_pstorerbnewtnew_abs:
-    case Hexagon::S4_pstorerhnewf_abs:
-    case Hexagon::S4_pstorerhnewt_abs:
-    case Hexagon::S4_pstorerhnewfnew_abs:
-    case Hexagon::S4_pstorerhnewtnew_abs:
-    case Hexagon::S4_pstorerinewf_abs:
-    case Hexagon::S4_pstorerinewt_abs:
-    case Hexagon::S4_pstorerinewfnew_abs:
-    case Hexagon::S4_pstorerinewtnew_abs:
-      // op: Pv
-      Value = insn & UINT64_C(3);
-      DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
-      // op: u6
-      Value = (insn >> 12) & UINT64_C(48);
-      Value |= (insn >> 3) & UINT64_C(15);
-      MI.addOperand(MCOperand::createImm(Value));
-      // op: Nt
-      Value = (insn >> 8) & UINT64_C(7);
-      DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
-      break;
-
-    case Hexagon::S4_pstorerbf_abs:
-    case Hexagon::S4_pstorerbt_abs:
-    case Hexagon::S4_pstorerbfnew_abs:
-    case Hexagon::S4_pstorerbtnew_abs:
-    case Hexagon::S4_pstorerhf_abs:
-    case Hexagon::S4_pstorerht_abs:
-    case Hexagon::S4_pstorerhfnew_abs:
-    case Hexagon::S4_pstorerhtnew_abs:
-    case Hexagon::S4_pstorerif_abs:
-    case Hexagon::S4_pstorerit_abs:
-    case Hexagon::S4_pstorerifnew_abs:
-    case Hexagon::S4_pstoreritnew_abs:
-      // op: Pv
-      Value = insn & UINT64_C(3);
-      DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
-      // op: u6
-      Value = (insn >> 12) & UINT64_C(48);
-      Value |= (insn >> 3) & UINT64_C(15);
-      MI.addOperand(MCOperand::createImm(Value));
-      // op: Rt
-      Value = (insn >> 8) & UINT64_C(31);
-      DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
-      break;
-
-    case Hexagon::L4_ploadrdf_abs:
-    case Hexagon::L4_ploadrdt_abs:
-    case Hexagon::L4_ploadrdfnew_abs:
-    case Hexagon::L4_ploadrdtnew_abs:
-      // op: Rdd
-      Value = insn & UINT64_C(31);
-      DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
-      // op: Pt
-      Value = ((insn >> 9) & UINT64_C(3));
-      DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
-      // op: u6
-      Value = ((insn >> 15) & UINT64_C(62));
-      Value |= ((insn >> 8) & UINT64_C(1));
-      MI.addOperand(MCOperand::createImm(Value));
-      break;
-
-    case Hexagon::L4_ploadrbf_abs:
-    case Hexagon::L4_ploadrbt_abs:
-    case Hexagon::L4_ploadrbfnew_abs:
-    case Hexagon::L4_ploadrbtnew_abs:
-    case Hexagon::L4_ploadrhf_abs:
-    case Hexagon::L4_ploadrht_abs:
-    case Hexagon::L4_ploadrhfnew_abs:
-    case Hexagon::L4_ploadrhtnew_abs:
-    case Hexagon::L4_ploadrubf_abs:
-    case Hexagon::L4_ploadrubt_abs:
-    case Hexagon::L4_ploadrubfnew_abs:
-    case Hexagon::L4_ploadrubtnew_abs:
-    case Hexagon::L4_ploadruhf_abs:
-    case Hexagon::L4_ploadruht_abs:
-    case Hexagon::L4_ploadruhfnew_abs:
-    case Hexagon::L4_ploadruhtnew_abs:
-    case Hexagon::L4_ploadrif_abs:
-    case Hexagon::L4_ploadrit_abs:
-    case Hexagon::L4_ploadrifnew_abs:
-    case Hexagon::L4_ploadritnew_abs:
-      // op: Rd
-      Value = insn & UINT64_C(31);
-      DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
-      // op: Pt
-      Value = (insn >> 9) & UINT64_C(3);
-      DecodePredRegsRegisterClass(MI, Value, 0, nullptr);
-      // op: u6
-      Value = (insn >> 15) & UINT64_C(62);
-      Value |= (insn >> 8) & UINT64_C(1);
-      MI.addOperand(MCOperand::createImm(Value));
-      break;
-
-    // op: g16_2
-    case (Hexagon::PS_loadriabs):
-      ++shift;
-    // op: g16_1
-    case Hexagon::PS_loadrhabs:
-    case Hexagon::PS_loadruhabs:
-      ++shift;
-    // op: g16_0
-    case Hexagon::PS_loadrbabs:
-    case Hexagon::PS_loadrubabs:
-      // op: Rd
-      Value |= insn & UINT64_C(31);
-      DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
-      Value = (insn >> 11) & UINT64_C(49152);
-      Value |= (insn >> 7) & UINT64_C(15872);
-      Value |= (insn >> 5) & UINT64_C(511);
-      MI.addOperand(MCOperand::createImm(Value << shift));
-      break;
-
-    case Hexagon::PS_loadrdabs:
-      Value = insn & UINT64_C(31);
-      DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
-      Value = (insn >> 11) & UINT64_C(49152);
-      Value |= (insn >> 7) & UINT64_C(15872);
-      Value |= (insn >> 5) & UINT64_C(511);
-      MI.addOperand(MCOperand::createImm(Value << 3));
-      break;
-
-    case Hexagon::PS_storerdabs:
-      // op: g16_3
-      Value = (insn >> 11) & UINT64_C(49152);
-      Value |= (insn >> 7) & UINT64_C(15872);
-      Value |= (insn >> 5) & UINT64_C(256);
-      Value |= insn & UINT64_C(255);
-      MI.addOperand(MCOperand::createImm(Value << 3));
-      // op: Rtt
-      Value = (insn >> 8) & UINT64_C(31);
-      DecodeDoubleRegsRegisterClass(MI, Value, 0, nullptr);
-      break;
-
-    // op: g16_2
-    case Hexagon::PS_storerinewabs:
-      ++shift;
-    // op: g16_1
-    case Hexagon::PS_storerhnewabs:
-      ++shift;
-    // op: g16_0
-    case Hexagon::PS_storerbnewabs:
-      Value = (insn >> 11) & UINT64_C(49152);
-      Value |= (insn >> 7) & UINT64_C(15872);
-      Value |= (insn >> 5) & UINT64_C(256);
-      Value |= insn & UINT64_C(255);
-      MI.addOperand(MCOperand::createImm(Value << shift));
-      // op: Nt
-      Value = (insn >> 8) & UINT64_C(7);
-      DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
-      break;
-
-    // op: g16_2
-    case Hexagon::PS_storeriabs:
-      ++shift;
-    // op: g16_1
-    case Hexagon::PS_storerhabs:
-    case Hexagon::PS_storerfabs:
-      ++shift;
-    // op: g16_0
-    case Hexagon::PS_storerbabs:
-      Value = (insn >> 11) & UINT64_C(49152);
-      Value |= (insn >> 7) & UINT64_C(15872);
-      Value |= (insn >> 5) & UINT64_C(256);
-      Value |= insn & UINT64_C(255);
-      MI.addOperand(MCOperand::createImm(Value << shift));
-      // op: Rt
-      Value = (insn >> 8) & UINT64_C(31);
-      DecodeIntRegsRegisterClass(MI, Value, 0, nullptr);
-      break;
-    }
-    return MCDisassembler::Success;
-  }
-  return MCDisassembler::Fail;
-}
-
-static DecodeStatus decodeImmext(MCInst &MI, uint32_t insn,
-                                 void const *Decoder) {
-  // Instruction Class for a constant a extender: bits 31:28 = 0x0000
-  if ((~insn & 0xf0000000) == 0xf0000000) {
-    unsigned Value;
-    // 27:16 High 12 bits of 26-bit extender.
-    Value = (insn & 0x0fff0000) << 4;
-    // 13:0 Low 14 bits of 26-bit extender.
-    Value |= ((insn & 0x3fff) << 6);
-    MI.setOpcode(Hexagon::A4_ext);
-    HexagonMCInstrInfo::addConstant(MI, Value, contextFromDecoder(Decoder));
-    return MCDisassembler::Success;
-  }
-  return MCDisassembler::Fail;
-}
-
-// These values are from HexagonGenMCCodeEmitter.inc and HexagonIsetDx.td
-enum subInstBinaryValues {
-  SA1_addi_BITS = 0x0000,
-  SA1_addi_MASK = 0x1800,
-  SA1_addrx_BITS = 0x1800,
-  SA1_addrx_MASK = 0x1f00,
-  SA1_addsp_BITS = 0x0c00,
-  SA1_addsp_MASK = 0x1c00,
-  SA1_and1_BITS = 0x1200,
-  SA1_and1_MASK = 0x1f00,
-  SA1_clrf_BITS = 0x1a70,
-  SA1_clrf_MASK = 0x1e70,
-  SA1_clrfnew_BITS = 0x1a50,
-  SA1_clrfnew_MASK = 0x1e70,
-  SA1_clrt_BITS = 0x1a60,
-  SA1_clrt_MASK = 0x1e70,
-  SA1_clrtnew_BITS = 0x1a40,
-  SA1_clrtnew_MASK = 0x1e70,
-  SA1_cmpeqi_BITS = 0x1900,
-  SA1_cmpeqi_MASK = 0x1f00,
-  SA1_combine0i_BITS = 0x1c00,
-  SA1_combine0i_MASK = 0x1d18,
-  SA1_combine1i_BITS = 0x1c08,
-  SA1_combine1i_MASK = 0x1d18,
-  SA1_combine2i_BITS = 0x1c10,
-  SA1_combine2i_MASK = 0x1d18,
-  SA1_combine3i_BITS = 0x1c18,
-  SA1_combine3i_MASK = 0x1d18,
-  SA1_combinerz_BITS = 0x1d08,
-  SA1_combinerz_MASK = 0x1d08,
-  SA1_combinezr_BITS = 0x1d00,
-  SA1_combinezr_MASK = 0x1d08,
-  SA1_dec_BITS = 0x1300,
-  SA1_dec_MASK = 0x1f00,
-  SA1_inc_BITS = 0x1100,
-  SA1_inc_MASK = 0x1f00,
-  SA1_seti_BITS = 0x0800,
-  SA1_seti_MASK = 0x1c00,
-  SA1_setin1_BITS = 0x1a00,
-  SA1_setin1_MASK = 0x1e40,
-  SA1_sxtb_BITS = 0x1500,
-  SA1_sxtb_MASK = 0x1f00,
-  SA1_sxth_BITS = 0x1400,
-  SA1_sxth_MASK = 0x1f00,
-  SA1_tfr_BITS = 0x1000,
-  SA1_tfr_MASK = 0x1f00,
-  SA1_zxtb_BITS = 0x1700,
-  SA1_zxtb_MASK = 0x1f00,
-  SA1_zxth_BITS = 0x1600,
-  SA1_zxth_MASK = 0x1f00,
-  SL1_loadri_io_BITS = 0x0000,
-  SL1_loadri_io_MASK = 0x1000,
-  SL1_loadrub_io_BITS = 0x1000,
-  SL1_loadrub_io_MASK = 0x1000,
-  SL2_deallocframe_BITS = 0x1f00,
-  SL2_deallocframe_MASK = 0x1fc0,
-  SL2_jumpr31_BITS = 0x1fc0,
-  SL2_jumpr31_MASK = 0x1fc4,
-  SL2_jumpr31_f_BITS = 0x1fc5,
-  SL2_jumpr31_f_MASK = 0x1fc7,
-  SL2_jumpr31_fnew_BITS = 0x1fc7,
-  SL2_jumpr31_fnew_MASK = 0x1fc7,
-  SL2_jumpr31_t_BITS = 0x1fc4,
-  SL2_jumpr31_t_MASK = 0x1fc7,
-  SL2_jumpr31_tnew_BITS = 0x1fc6,
-  SL2_jumpr31_tnew_MASK = 0x1fc7,
-  SL2_loadrb_io_BITS = 0x1000,
-  SL2_loadrb_io_MASK = 0x1800,
-  SL2_loadrd_sp_BITS = 0x1e00,
-  SL2_loadrd_sp_MASK = 0x1f00,
-  SL2_loadrh_io_BITS = 0x0000,
-  SL2_loadrh_io_MASK = 0x1800,
-  SL2_loadri_sp_BITS = 0x1c00,
-  SL2_loadri_sp_MASK = 0x1e00,
-  SL2_loadruh_io_BITS = 0x0800,
-  SL2_loadruh_io_MASK = 0x1800,
-  SL2_return_BITS = 0x1f40,
-  SL2_return_MASK = 0x1fc4,
-  SL2_return_f_BITS = 0x1f45,
-  SL2_return_f_MASK = 0x1fc7,
-  SL2_return_fnew_BITS = 0x1f47,
-  SL2_return_fnew_MASK = 0x1fc7,
-  SL2_return_t_BITS = 0x1f44,
-  SL2_return_t_MASK = 0x1fc7,
-  SL2_return_tnew_BITS = 0x1f46,
-  SL2_return_tnew_MASK = 0x1fc7,
-  SS1_storeb_io_BITS = 0x1000,
-  SS1_storeb_io_MASK = 0x1000,
-  SS1_storew_io_BITS = 0x0000,
-  SS1_storew_io_MASK = 0x1000,
-  SS2_allocframe_BITS = 0x1c00,
-  SS2_allocframe_MASK = 0x1e00,
-  SS2_storebi0_BITS = 0x1200,
-  SS2_storebi0_MASK = 0x1f00,
-  SS2_storebi1_BITS = 0x1300,
-  SS2_storebi1_MASK = 0x1f00,
-  SS2_stored_sp_BITS = 0x0a00,
-  SS2_stored_sp_MASK = 0x1e00,
-  SS2_storeh_io_BITS = 0x0000,
-  SS2_storeh_io_MASK = 0x1800,
-  SS2_storew_sp_BITS = 0x0800,
-  SS2_storew_sp_MASK = 0x1e00,
-  SS2_storewi0_BITS = 0x1000,
-  SS2_storewi0_MASK = 0x1f00,
-  SS2_storewi1_BITS = 0x1100,
-  SS2_storewi1_MASK = 0x1f00
-};
 
-static unsigned GetSubinstOpcode(unsigned IClass, unsigned inst, unsigned &op,
-                                 raw_ostream &os) {
-  switch (IClass) {
-  case HexagonII::HSIG_L1:
-    if ((inst & SL1_loadri_io_MASK) == SL1_loadri_io_BITS)
-      op = Hexagon::SL1_loadri_io;
-    else if ((inst & SL1_loadrub_io_MASK) == SL1_loadrub_io_BITS)
-      op = Hexagon::SL1_loadrub_io;
-    else {
-      os << "<unknown subinstruction>";
-      return MCDisassembler::Fail;
-    }
-    break;
-  case HexagonII::HSIG_L2:
-    if ((inst & SL2_deallocframe_MASK) == SL2_deallocframe_BITS)
-      op = Hexagon::SL2_deallocframe;
-    else if ((inst & SL2_jumpr31_MASK) == SL2_jumpr31_BITS)
-      op = Hexagon::SL2_jumpr31;
-    else if ((inst & SL2_jumpr31_f_MASK) == SL2_jumpr31_f_BITS)
-      op = Hexagon::SL2_jumpr31_f;
-    else if ((inst & SL2_jumpr31_fnew_MASK) == SL2_jumpr31_fnew_BITS)
-      op = Hexagon::SL2_jumpr31_fnew;
-    else if ((inst & SL2_jumpr31_t_MASK) == SL2_jumpr31_t_BITS)
-      op = Hexagon::SL2_jumpr31_t;
-    else if ((inst & SL2_jumpr31_tnew_MASK) == SL2_jumpr31_tnew_BITS)
-      op = Hexagon::SL2_jumpr31_tnew;
-    else if ((inst & SL2_loadrb_io_MASK) == SL2_loadrb_io_BITS)
-      op = Hexagon::SL2_loadrb_io;
-    else if ((inst & SL2_loadrd_sp_MASK) == SL2_loadrd_sp_BITS)
-      op = Hexagon::SL2_loadrd_sp;
-    else if ((inst & SL2_loadrh_io_MASK) == SL2_loadrh_io_BITS)
-      op = Hexagon::SL2_loadrh_io;
-    else if ((inst & SL2_loadri_sp_MASK) == SL2_loadri_sp_BITS)
-      op = Hexagon::SL2_loadri_sp;
-    else if ((inst & SL2_loadruh_io_MASK) == SL2_loadruh_io_BITS)
-      op = Hexagon::SL2_loadruh_io;
-    else if ((inst & SL2_return_MASK) == SL2_return_BITS)
-      op = Hexagon::SL2_return;
-    else if ((inst & SL2_return_f_MASK) == SL2_return_f_BITS)
-      op = Hexagon::SL2_return_f;
-    else if ((inst & SL2_return_fnew_MASK) == SL2_return_fnew_BITS)
-      op = Hexagon::SL2_return_fnew;
-    else if ((inst & SL2_return_t_MASK) == SL2_return_t_BITS)
-      op = Hexagon::SL2_return_t;
-    else if ((inst & SL2_return_tnew_MASK) == SL2_return_tnew_BITS)
-      op = Hexagon::SL2_return_tnew;
-    else {
-      os << "<unknown subinstruction>";
-      return MCDisassembler::Fail;
-    }
-    break;
-  case HexagonII::HSIG_A:
-    if ((inst & SA1_addi_MASK) == SA1_addi_BITS)
-      op = Hexagon::SA1_addi;
-    else if ((inst & SA1_addrx_MASK) == SA1_addrx_BITS)
-      op = Hexagon::SA1_addrx;
-    else if ((inst & SA1_addsp_MASK) == SA1_addsp_BITS)
-      op = Hexagon::SA1_addsp;
-    else if ((inst & SA1_and1_MASK) == SA1_and1_BITS)
-      op = Hexagon::SA1_and1;
-    else if ((inst & SA1_clrf_MASK) == SA1_clrf_BITS)
-      op = Hexagon::SA1_clrf;
-    else if ((inst & SA1_clrfnew_MASK) == SA1_clrfnew_BITS)
-      op = Hexagon::SA1_clrfnew;
-    else if ((inst & SA1_clrt_MASK) == SA1_clrt_BITS)
-      op = Hexagon::SA1_clrt;
-    else if ((inst & SA1_clrtnew_MASK) == SA1_clrtnew_BITS)
-      op = Hexagon::SA1_clrtnew;
-    else if ((inst & SA1_cmpeqi_MASK) == SA1_cmpeqi_BITS)
-      op = Hexagon::SA1_cmpeqi;
-    else if ((inst & SA1_combine0i_MASK) == SA1_combine0i_BITS)
-      op = Hexagon::SA1_combine0i;
-    else if ((inst & SA1_combine1i_MASK) == SA1_combine1i_BITS)
-      op = Hexagon::SA1_combine1i;
-    else if ((inst & SA1_combine2i_MASK) == SA1_combine2i_BITS)
-      op = Hexagon::SA1_combine2i;
-    else if ((inst & SA1_combine3i_MASK) == SA1_combine3i_BITS)
-      op = Hexagon::SA1_combine3i;
-    else if ((inst & SA1_combinerz_MASK) == SA1_combinerz_BITS)
-      op = Hexagon::SA1_combinerz;
-    else if ((inst & SA1_combinezr_MASK) == SA1_combinezr_BITS)
-      op = Hexagon::SA1_combinezr;
-    else if ((inst & SA1_dec_MASK) == SA1_dec_BITS)
-      op = Hexagon::SA1_dec;
-    else if ((inst & SA1_inc_MASK) == SA1_inc_BITS)
-      op = Hexagon::SA1_inc;
-    else if ((inst & SA1_seti_MASK) == SA1_seti_BITS)
-      op = Hexagon::SA1_seti;
-    else if ((inst & SA1_setin1_MASK) == SA1_setin1_BITS)
-      op = Hexagon::SA1_setin1;
-    else if ((inst & SA1_sxtb_MASK) == SA1_sxtb_BITS)
-      op = Hexagon::SA1_sxtb;
-    else if ((inst & SA1_sxth_MASK) == SA1_sxth_BITS)
-      op = Hexagon::SA1_sxth;
-    else if ((inst & SA1_tfr_MASK) == SA1_tfr_BITS)
-      op = Hexagon::SA1_tfr;
-    else if ((inst & SA1_zxtb_MASK) == SA1_zxtb_BITS)
-      op = Hexagon::SA1_zxtb;
-    else if ((inst & SA1_zxth_MASK) == SA1_zxth_BITS)
-      op = Hexagon::SA1_zxth;
-    else {
-      os << "<unknown subinstruction>";
-      return MCDisassembler::Fail;
-    }
-    break;
-  case HexagonII::HSIG_S1:
-    if ((inst & SS1_storeb_io_MASK) == SS1_storeb_io_BITS)
-      op = Hexagon::SS1_storeb_io;
-    else if ((inst & SS1_storew_io_MASK) == SS1_storew_io_BITS)
-      op = Hexagon::SS1_storew_io;
-    else {
-      os << "<unknown subinstruction>";
-      return MCDisassembler::Fail;
-    }
-    break;
-  case HexagonII::HSIG_S2:
-    if ((inst & SS2_allocframe_MASK) == SS2_allocframe_BITS)
-      op = Hexagon::SS2_allocframe;
-    else if ((inst & SS2_storebi0_MASK) == SS2_storebi0_BITS)
-      op = Hexagon::SS2_storebi0;
-    else if ((inst & SS2_storebi1_MASK) == SS2_storebi1_BITS)
-      op = Hexagon::SS2_storebi1;
-    else if ((inst & SS2_stored_sp_MASK) == SS2_stored_sp_BITS)
-      op = Hexagon::SS2_stored_sp;
-    else if ((inst & SS2_storeh_io_MASK) == SS2_storeh_io_BITS)
-      op = Hexagon::SS2_storeh_io;
-    else if ((inst & SS2_storew_sp_MASK) == SS2_storew_sp_BITS)
-      op = Hexagon::SS2_storew_sp;
-    else if ((inst & SS2_storewi0_MASK) == SS2_storewi0_BITS)
-      op = Hexagon::SS2_storewi0;
-    else if ((inst & SS2_storewi1_MASK) == SS2_storewi1_BITS)
-      op = Hexagon::SS2_storewi1;
-    else {
-      os << "<unknown subinstruction>";
-      return MCDisassembler::Fail;
-    }
-    break;
-  default:
-    os << "<unknown>";
-    return MCDisassembler::Fail;
-  }
-  return MCDisassembler::Success;
-}
-
-static unsigned getRegFromSubinstEncoding(unsigned encoded_reg) {
-  if (encoded_reg < 8)
-    return Hexagon::R0 + encoded_reg;
-  else if (encoded_reg < 16)
-    return Hexagon::R0 + encoded_reg + 8;
-
-  // patently false value
-  return Hexagon::NoRegister;
-}
-
-static unsigned getDRegFromSubinstEncoding(unsigned encoded_dreg) {
-  if (encoded_dreg < 4)
-    return Hexagon::D0 + encoded_dreg;
-  else if (encoded_dreg < 8)
-    return Hexagon::D0 + encoded_dreg + 4;
-
-  // patently false value
-  return Hexagon::NoRegister;
-}
-
-void HexagonDisassembler::addSubinstOperands(MCInst *MI, unsigned opcode,
-                                             unsigned inst) const {
-  int64_t operand;
-  MCOperand Op;
-  switch (opcode) {
-  case Hexagon::SL2_deallocframe:
-  case Hexagon::SL2_jumpr31:
-  case Hexagon::SL2_jumpr31_f:
-  case Hexagon::SL2_jumpr31_fnew:
-  case Hexagon::SL2_jumpr31_t:
-  case Hexagon::SL2_jumpr31_tnew:
-  case Hexagon::SL2_return:
-  case Hexagon::SL2_return_f:
-  case Hexagon::SL2_return_fnew:
-  case Hexagon::SL2_return_t:
-  case Hexagon::SL2_return_tnew:
-    // no operands for these instructions
-    break;
-  case Hexagon::SS2_allocframe:
-    // u 8-4{5_3}
-    operand = ((inst & 0x1f0) >> 4) << 3;
-    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
-    break;
-  case Hexagon::SL1_loadri_io:
-    // Rd 3-0, Rs 7-4, u 11-8{4_2}
-    operand = getRegFromSubinstEncoding(inst & 0xf);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = (inst & 0xf00) >> 6;
-    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
-    break;
-  case Hexagon::SL1_loadrub_io:
-    // Rd 3-0, Rs 7-4, u 11-8
-    operand = getRegFromSubinstEncoding(inst & 0xf);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = (inst & 0xf00) >> 8;
-    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
-    break;
-  case Hexagon::SL2_loadrb_io:
-    // Rd 3-0, Rs 7-4, u 10-8
-    operand = getRegFromSubinstEncoding(inst & 0xf);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = (inst & 0x700) >> 8;
-    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
-    break;
-  case Hexagon::SL2_loadrh_io:
-  case Hexagon::SL2_loadruh_io:
-    // Rd 3-0, Rs 7-4, u 10-8{3_1}
-    operand = getRegFromSubinstEncoding(inst & 0xf);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = ((inst & 0x700) >> 8) << 1;
-    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
-    break;
-  case Hexagon::SL2_loadrd_sp:
-    // Rdd 2-0, u 7-3{5_3}
-    operand = getDRegFromSubinstEncoding(inst & 0x7);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = ((inst & 0x0f8) >> 3) << 3;
-    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
-    break;
-  case Hexagon::SL2_loadri_sp:
-    // Rd 3-0, u 8-4{5_2}
-    operand = getRegFromSubinstEncoding(inst & 0xf);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = ((inst & 0x1f0) >> 4) << 2;
-    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
-    break;
-  case Hexagon::SA1_addi:
-    // Rx 3-0 (x2), s7 10-4
-    operand = getRegFromSubinstEncoding(inst & 0xf);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    MI->addOperand(Op);
-    operand = SignExtend64<7>((inst & 0x7f0) >> 4);
-    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
-    break;
-  case Hexagon::SA1_addrx:
-    // Rx 3-0 (x2), Rs 7-4
-    operand = getRegFromSubinstEncoding(inst & 0xf);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    MI->addOperand(Op);
-    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    break;
-  case Hexagon::SA1_and1:
-  case Hexagon::SA1_dec:
-  case Hexagon::SA1_inc:
-  case Hexagon::SA1_sxtb:
-  case Hexagon::SA1_sxth:
-  case Hexagon::SA1_tfr:
-  case Hexagon::SA1_zxtb:
-  case Hexagon::SA1_zxth:
-    // Rd 3-0, Rs 7-4
-    operand = getRegFromSubinstEncoding(inst & 0xf);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    break;
-  case Hexagon::SA1_addsp:
-    // Rd 3-0, u 9-4{6_2}
-    operand = getRegFromSubinstEncoding(inst & 0xf);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = ((inst & 0x3f0) >> 4) << 2;
-    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
-    break;
-  case Hexagon::SA1_seti:
-    // Rd 3-0, u 9-4
-    operand = getRegFromSubinstEncoding(inst & 0xf);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = (inst & 0x3f0) >> 4;
-    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
-    break;
-  case Hexagon::SA1_clrf:
-  case Hexagon::SA1_clrfnew:
-  case Hexagon::SA1_clrt:
-  case Hexagon::SA1_clrtnew:
-  case Hexagon::SA1_setin1:
-    // Rd 3-0
-    operand = getRegFromSubinstEncoding(inst & 0xf);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    if (opcode == Hexagon::SA1_setin1)
-      break;
-    MI->addOperand(MCOperand::createReg(Hexagon::P0));
-    break;
-  case Hexagon::SA1_cmpeqi:
-    // Rs 7-4, u 1-0
-    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = inst & 0x3;
-    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
-    break;
-  case Hexagon::SA1_combine0i:
-  case Hexagon::SA1_combine1i:
-  case Hexagon::SA1_combine2i:
-  case Hexagon::SA1_combine3i:
-    // Rdd 2-0, u 6-5
-    operand = getDRegFromSubinstEncoding(inst & 0x7);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = (inst & 0x060) >> 5;
-    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
-    break;
-  case Hexagon::SA1_combinerz:
-  case Hexagon::SA1_combinezr:
-    // Rdd 2-0, Rs 7-4
-    operand = getDRegFromSubinstEncoding(inst & 0x7);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    break;
-  case Hexagon::SS1_storeb_io:
-    // Rs 7-4, u 11-8, Rt 3-0
-    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = (inst & 0xf00) >> 8;
-    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
-    operand = getRegFromSubinstEncoding(inst & 0xf);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    break;
-  case Hexagon::SS1_storew_io:
-    // Rs 7-4, u 11-8{4_2}, Rt 3-0
-    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = ((inst & 0xf00) >> 8) << 2;
-    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
-    operand = getRegFromSubinstEncoding(inst & 0xf);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    break;
-  case Hexagon::SS2_storebi0:
-  case Hexagon::SS2_storebi1:
-    // Rs 7-4, u 3-0
-    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = inst & 0xf;
-    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
-    break;
-  case Hexagon::SS2_storewi0:
-  case Hexagon::SS2_storewi1:
-    // Rs 7-4, u 3-0{4_2}
-    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = (inst & 0xf) << 2;
-    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
-    break;
-  case Hexagon::SS2_stored_sp:
-    // s 8-3{6_3}, Rtt 2-0
-    operand = SignExtend64<9>(((inst & 0x1f8) >> 3) << 3);
-    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
-    operand = getDRegFromSubinstEncoding(inst & 0x7);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    break;
-  case Hexagon::SS2_storeh_io:
-    // Rs 7-4, u 10-8{3_1}, Rt 3-0
-    operand = getRegFromSubinstEncoding((inst & 0xf0) >> 4);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    operand = ((inst & 0x700) >> 8) << 1;
-    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
-    operand = getRegFromSubinstEncoding(inst & 0xf);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    break;
-  case Hexagon::SS2_storew_sp:
-    // u 8-4{5_2}, Rd 3-0
-    operand = ((inst & 0x1f0) >> 4) << 2;
-    HexagonMCInstrInfo::addConstant(*MI, operand, getContext());
-    operand = getRegFromSubinstEncoding(inst & 0xf);
-    Op = MCOperand::createReg(operand);
-    MI->addOperand(Op);
-    break;
-  default:
-    // don't crash with an invalid subinstruction
-    // llvm_unreachable("Invalid subinstruction in duplex instruction");
-    break;
-  }
-}
diff --git a/lib/Target/Hexagon/Hexagon.td b/lib/Target/Hexagon/Hexagon.td
index 0b2b46387b6ade9f5bd31e0fdbf49939179a17b5..4767165141a3c4fb1065dbd79f987ec9a61b3276 100644
--- a/lib/Target/Hexagon/Hexagon.td
+++ b/lib/Target/Hexagon/Hexagon.td
@@ -22,14 +22,12 @@ include "llvm/Target/Target.td"
 //===----------------------------------------------------------------------===//
 
 // Hexagon Architectures
-def ArchV4:  SubtargetFeature<"v4",  "HexagonArchVersion", "V4",  "Hexagon V4">;
-def ArchV5:  SubtargetFeature<"v5",  "HexagonArchVersion", "V5",  "Hexagon V5">;
-def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "V55", "Hexagon V55">;
-def ArchV60: SubtargetFeature<"v60", "HexagonArchVersion", "V60", "Hexagon V60">;
+include "HexagonDepArch.td"
 
-def FeatureHVX: SubtargetFeature<"hvx", "UseHVXOps", "true",
+// Hexagon ISA Extensions
+def ExtensionHVX: SubtargetFeature<"hvx", "UseHVXOps", "true",
       "Hexagon HVX instructions">;
-def FeatureHVXDbl: SubtargetFeature<"hvx-double", "UseHVXDblOps", "true",
+def ExtensionHVXDbl: SubtargetFeature<"hvx-double", "UseHVXDblOps", "true",
       "Hexagon HVX Double instructions">;
 def FeatureLongCalls: SubtargetFeature<"long-calls", "UseLongCalls", "true",
       "Use constant-extended calls">;
@@ -37,19 +35,14 @@ def FeatureLongCalls: SubtargetFeature<"long-calls", "UseLongCalls", "true",
 //===----------------------------------------------------------------------===//
 // Hexagon Instruction Predicate Definitions.
 //===----------------------------------------------------------------------===//
-def HasV5T             : Predicate<"HST->hasV5TOps()">;
-def NoV5T              : Predicate<"!HST->hasV5TOps()">;
-def HasV55T            : Predicate<"HST->hasV55TOps()">,
-                         AssemblerPredicate<"ArchV55">;
-def HasV60T            : Predicate<"HST->hasV60TOps()">,
-                         AssemblerPredicate<"ArchV60">;
+
 def UseMEMOP           : Predicate<"HST->useMemOps()">;
 def IEEERndNearV5T     : Predicate<"HST->modeIEEERndNear()">;
 def UseHVXDbl          : Predicate<"HST->useHVXDblOps()">,
-                         AssemblerPredicate<"FeatureHVXDbl">;
+                         AssemblerPredicate<"ExtensionHVXDbl">;
 def UseHVXSgl          : Predicate<"HST->useHVXSglOps()">;
 def UseHVX             : Predicate<"HST->useHVXSglOps() ||HST->useHVXDblOps()">,
-                         AssemblerPredicate<"FeatureHVX">;
+                         AssemblerPredicate<"ExtensionHVX">;
 
 //===----------------------------------------------------------------------===//
 // Classes used for relation maps.
@@ -81,7 +74,7 @@ class IntrinsicsRel;
 def getPredOpcode : InstrMapping {
   let FilterClass = "PredRel";
   // Instructions with the same BaseOpcode and isNVStore values form a row.
-  let RowFields = ["BaseOpcode", "isNVStore", "PNewValue", "isNT"];
+  let RowFields = ["BaseOpcode", "isNVStore", "PNewValue", "isBrTaken", "isNT"];
   // Instructions with the same predicate sense form a column.
   let ColFields = ["PredSense"];
   // The key column is the unpredicated instructions.
@@ -132,7 +125,7 @@ def getPredNewOpcode : InstrMapping {
 //
 def getPredOldOpcode : InstrMapping {
   let FilterClass = "PredNewRel";
-  let RowFields = ["BaseOpcode", "PredSense", "isNVStore"];
+  let RowFields = ["BaseOpcode", "PredSense", "isNVStore", "isBrTaken"];
   let ColFields = ["PNewValue"];
   let KeyCol = ["new"];
   let ValueCols = [[""]];
@@ -248,11 +241,18 @@ def getRealHWInstr : InstrMapping {
 //===----------------------------------------------------------------------===//
 include "HexagonSchedule.td"
 include "HexagonRegisterInfo.td"
-include "HexagonCallingConv.td"
-include "HexagonInstrInfo.td"
+include "HexagonOperands.td"
+include "HexagonDepOperands.td"
+include "HexagonDepITypes.td"
+include "HexagonInstrFormats.td"
+include "HexagonDepInstrFormats.td"
+include "HexagonDepInstrInfo.td"
+include "HexagonPseudo.td"
 include "HexagonPatterns.td"
+include "HexagonDepMappings.td"
 include "HexagonIntrinsics.td"
 include "HexagonIntrinsicsDerived.td"
+include "HexagonMapAsm2IntrinV62.gen.td"
 
 def HexagonInstrInfo : InstrInfo;
 
@@ -271,7 +271,9 @@ def : Proc<"hexagonv5",  HexagonModelV4,
 def : Proc<"hexagonv55", HexagonModelV55,
            [ArchV4, ArchV5, ArchV55]>;
 def : Proc<"hexagonv60", HexagonModelV60,
-           [ArchV4, ArchV5, ArchV55, ArchV60, FeatureHVX]>;
+           [ArchV4, ArchV5, ArchV55, ArchV60, ExtensionHVX]>;
+def : Proc<"hexagonv62", HexagonModelV62,
+           [ArchV4, ArchV5, ArchV55, ArchV60, ArchV62, ExtensionHVX]>;
 
 //===----------------------------------------------------------------------===//
 // Declare the target which we are implementing
diff --git a/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index 54db5ad4374b04cf662c1209dd8f9d91503650ac..fda23f8f6b05f17a443090e38bb261d0e04470a8 100644
--- a/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -261,10 +261,34 @@ static MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI,
   return Sym;
 }
 
+static MCInst ScaleVectorOffset(MCInst &Inst, unsigned OpNo,
+                                unsigned VectorSize, MCContext &Ctx) { // Returns a copy of Inst with the constant in operand OpNo divided by VectorSize.
+  MCInst T; // Result; Inst itself is not modified.
+  T.setOpcode(Inst.getOpcode());
+  for (unsigned i = 0, n = Inst.getNumOperands(); i != n; ++i) {
+    if (i != OpNo) { // Every operand other than OpNo is copied unchanged.
+      T.addOperand(Inst.getOperand(i));
+      continue;
+    }
+    MCOperand &ImmOp = Inst.getOperand(i); // The operand to rescale.
+    const auto *HE = static_cast<const HexagonMCExpr*>(ImmOp.getExpr()); // Unchecked cast: the operand must hold a HexagonMCExpr.
+    int32_t V = cast<MCConstantExpr>(HE->getExpr())->getValue(); // The wrapped expression must be an MCConstantExpr.
+    auto *NewCE = MCConstantExpr::create(V / int32_t(VectorSize), Ctx); // Signed 32-bit integer division.
+    auto *NewHE = HexagonMCExpr::create(NewCE, Ctx); // Re-wrap the scaled constant the same way the input was wrapped.
+    T.addOperand(MCOperand::createExpr(NewHE));
+  }
+  return T;
+}
+
 void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
                                                   const MachineInstr &MI) {
   MCInst &MappedInst = static_cast <MCInst &>(Inst);
   const MCRegisterInfo *RI = OutStreamer->getContext().getRegisterInfo();
+  const MachineFunction &MF = *MI.getParent()->getParent();
+  const auto &HST = MF.getSubtarget<HexagonSubtarget>();
+  unsigned VectorSize = HST.useHVXSglOps()
+                            ? Hexagon::VectorRegsRegClass.getSize()
+                            : Hexagon::VectorRegs128BRegClass.getSize();
 
   switch (Inst.getOpcode()) {
   default: return;
@@ -282,6 +306,36 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
     break;
   }
 
+  case Hexagon::A2_tfrf: {
+    Inst.setOpcode(Hexagon::A2_paddif);
+    Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+    break;
+  }
+
+  case Hexagon::A2_tfrt: {
+    Inst.setOpcode(Hexagon::A2_paddit);
+    Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+    break;
+  }
+
+  case Hexagon::A2_tfrfnew: {
+    Inst.setOpcode(Hexagon::A2_paddifnew);
+    Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+    break;
+  }
+
+  case Hexagon::A2_tfrtnew: {
+    Inst.setOpcode(Hexagon::A2_padditnew);
+    Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(0, OutContext)));
+    break;
+  }
+
+  case Hexagon::A2_zxtb: {
+    Inst.setOpcode(Hexagon::A2_andir);
+    Inst.addOperand(MCOperand::createExpr(MCConstantExpr::create(255, OutContext)));
+    break;
+  }
+
   // "$dst = CONST64(#$src1)",
   case Hexagon::CONST64:
     if (!OutStreamer->hasRawTextSupport()) {
@@ -376,6 +430,9 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
     Rs.setReg(getHexagonRegisterPair(Rs.getReg(), RI));
     return;
   }
+  case Hexagon::PS_call_nr:
+    Inst.setOpcode(Hexagon::J2_call);
+    break;
   case Hexagon::S5_asrhub_rnd_sat_goodsyntax: {
     MCOperand &MO = MappedInst.getOperand(2);
     int64_t Imm;
@@ -564,6 +621,181 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
     return;
   }
 
+  case Hexagon::V6_vL32Ub_pi:
+  case Hexagon::V6_vL32b_cur_pi:
+  case Hexagon::V6_vL32b_nt_cur_pi:
+  case Hexagon::V6_vL32b_pi:
+  case Hexagon::V6_vL32b_nt_pi:
+  case Hexagon::V6_vL32b_nt_tmp_pi:
+  case Hexagon::V6_vL32b_tmp_pi:
+  case Hexagon::V6_vL32Ub_pi_128B:
+  case Hexagon::V6_vL32b_cur_pi_128B:
+  case Hexagon::V6_vL32b_nt_cur_pi_128B:
+  case Hexagon::V6_vL32b_pi_128B:
+  case Hexagon::V6_vL32b_nt_pi_128B:
+  case Hexagon::V6_vL32b_nt_tmp_pi_128B:
+  case Hexagon::V6_vL32b_tmp_pi_128B:
+    MappedInst = ScaleVectorOffset(Inst, 3, VectorSize, OutContext);
+    return;
+
+  case Hexagon::V6_vL32Ub_ai:
+  case Hexagon::V6_vL32b_ai:
+  case Hexagon::V6_vL32b_cur_ai:
+  case Hexagon::V6_vL32b_nt_ai:
+  case Hexagon::V6_vL32b_nt_cur_ai:
+  case Hexagon::V6_vL32b_nt_tmp_ai:
+  case Hexagon::V6_vL32b_tmp_ai:
+  case Hexagon::V6_vL32Ub_ai_128B:
+  case Hexagon::V6_vL32b_ai_128B:
+  case Hexagon::V6_vL32b_cur_ai_128B:
+  case Hexagon::V6_vL32b_nt_ai_128B:
+  case Hexagon::V6_vL32b_nt_cur_ai_128B:
+  case Hexagon::V6_vL32b_nt_tmp_ai_128B:
+  case Hexagon::V6_vL32b_tmp_ai_128B:
+    MappedInst = ScaleVectorOffset(Inst, 2, VectorSize, OutContext);
+    return;
+
+  case Hexagon::V6_vS32Ub_pi:
+  case Hexagon::V6_vS32b_new_pi:
+  case Hexagon::V6_vS32b_nt_new_pi:
+  case Hexagon::V6_vS32b_nt_pi:
+  case Hexagon::V6_vS32b_pi:
+  case Hexagon::V6_vS32Ub_pi_128B:
+  case Hexagon::V6_vS32b_new_pi_128B:
+  case Hexagon::V6_vS32b_nt_new_pi_128B:
+  case Hexagon::V6_vS32b_nt_pi_128B:
+  case Hexagon::V6_vS32b_pi_128B:
+    MappedInst = ScaleVectorOffset(Inst, 2, VectorSize, OutContext);
+    return;
+
+  case Hexagon::V6_vS32Ub_ai:
+  case Hexagon::V6_vS32b_ai:
+  case Hexagon::V6_vS32b_new_ai:
+  case Hexagon::V6_vS32b_nt_ai:
+  case Hexagon::V6_vS32b_nt_new_ai:
+  case Hexagon::V6_vS32Ub_ai_128B:
+  case Hexagon::V6_vS32b_ai_128B:
+  case Hexagon::V6_vS32b_new_ai_128B:
+  case Hexagon::V6_vS32b_nt_ai_128B:
+  case Hexagon::V6_vS32b_nt_new_ai_128B:
+    MappedInst = ScaleVectorOffset(Inst, 1, VectorSize, OutContext);
+    return;
+
+  case Hexagon::V6_vL32b_cur_npred_pi:
+  case Hexagon::V6_vL32b_cur_pred_pi:
+  case Hexagon::V6_vL32b_npred_pi:
+  case Hexagon::V6_vL32b_nt_cur_npred_pi:
+  case Hexagon::V6_vL32b_nt_cur_pred_pi:
+  case Hexagon::V6_vL32b_nt_npred_pi:
+  case Hexagon::V6_vL32b_nt_pred_pi:
+  case Hexagon::V6_vL32b_nt_tmp_npred_pi:
+  case Hexagon::V6_vL32b_nt_tmp_pred_pi:
+  case Hexagon::V6_vL32b_pred_pi:
+  case Hexagon::V6_vL32b_tmp_npred_pi:
+  case Hexagon::V6_vL32b_tmp_pred_pi:
+  case Hexagon::V6_vL32b_cur_npred_pi_128B:
+  case Hexagon::V6_vL32b_cur_pred_pi_128B:
+  case Hexagon::V6_vL32b_npred_pi_128B:
+  case Hexagon::V6_vL32b_nt_cur_npred_pi_128B:
+  case Hexagon::V6_vL32b_nt_cur_pred_pi_128B:
+  case Hexagon::V6_vL32b_nt_npred_pi_128B:
+  case Hexagon::V6_vL32b_nt_pred_pi_128B:
+  case Hexagon::V6_vL32b_nt_tmp_npred_pi_128B:
+  case Hexagon::V6_vL32b_nt_tmp_pred_pi_128B:
+  case Hexagon::V6_vL32b_pred_pi_128B:
+  case Hexagon::V6_vL32b_tmp_npred_pi_128B:
+  case Hexagon::V6_vL32b_tmp_pred_pi_128B:
+    MappedInst = ScaleVectorOffset(Inst, 4, VectorSize, OutContext);
+    return;
+
+  case Hexagon::V6_vL32b_cur_npred_ai:
+  case Hexagon::V6_vL32b_cur_pred_ai:
+  case Hexagon::V6_vL32b_npred_ai:
+  case Hexagon::V6_vL32b_nt_cur_npred_ai:
+  case Hexagon::V6_vL32b_nt_cur_pred_ai:
+  case Hexagon::V6_vL32b_nt_npred_ai:
+  case Hexagon::V6_vL32b_nt_pred_ai:
+  case Hexagon::V6_vL32b_nt_tmp_npred_ai:
+  case Hexagon::V6_vL32b_nt_tmp_pred_ai:
+  case Hexagon::V6_vL32b_pred_ai:
+  case Hexagon::V6_vL32b_tmp_npred_ai:
+  case Hexagon::V6_vL32b_tmp_pred_ai:
+  case Hexagon::V6_vL32b_cur_npred_ai_128B:
+  case Hexagon::V6_vL32b_cur_pred_ai_128B:
+  case Hexagon::V6_vL32b_npred_ai_128B:
+  case Hexagon::V6_vL32b_nt_cur_npred_ai_128B:
+  case Hexagon::V6_vL32b_nt_cur_pred_ai_128B:
+  case Hexagon::V6_vL32b_nt_npred_ai_128B:
+  case Hexagon::V6_vL32b_nt_pred_ai_128B:
+  case Hexagon::V6_vL32b_nt_tmp_npred_ai_128B:
+  case Hexagon::V6_vL32b_nt_tmp_pred_ai_128B:
+  case Hexagon::V6_vL32b_pred_ai_128B:
+  case Hexagon::V6_vL32b_tmp_npred_ai_128B:
+  case Hexagon::V6_vL32b_tmp_pred_ai_128B:
+    MappedInst = ScaleVectorOffset(Inst, 3, VectorSize, OutContext);
+    return;
+
+  case Hexagon::V6_vS32Ub_npred_pi:
+  case Hexagon::V6_vS32Ub_pred_pi:
+  case Hexagon::V6_vS32b_new_npred_pi:
+  case Hexagon::V6_vS32b_new_pred_pi:
+  case Hexagon::V6_vS32b_npred_pi:
+  case Hexagon::V6_vS32b_nqpred_pi:
+  case Hexagon::V6_vS32b_nt_new_npred_pi:
+  case Hexagon::V6_vS32b_nt_new_pred_pi:
+  case Hexagon::V6_vS32b_nt_npred_pi:
+  case Hexagon::V6_vS32b_nt_nqpred_pi:
+  case Hexagon::V6_vS32b_nt_pred_pi:
+  case Hexagon::V6_vS32b_nt_qpred_pi:
+  case Hexagon::V6_vS32b_pred_pi:
+  case Hexagon::V6_vS32b_qpred_pi:
+  case Hexagon::V6_vS32Ub_npred_pi_128B:
+  case Hexagon::V6_vS32Ub_pred_pi_128B:
+  case Hexagon::V6_vS32b_new_npred_pi_128B:
+  case Hexagon::V6_vS32b_new_pred_pi_128B:
+  case Hexagon::V6_vS32b_npred_pi_128B:
+  case Hexagon::V6_vS32b_nqpred_pi_128B:
+  case Hexagon::V6_vS32b_nt_new_npred_pi_128B:
+  case Hexagon::V6_vS32b_nt_new_pred_pi_128B:
+  case Hexagon::V6_vS32b_nt_npred_pi_128B:
+  case Hexagon::V6_vS32b_nt_nqpred_pi_128B:
+  case Hexagon::V6_vS32b_nt_pred_pi_128B:
+  case Hexagon::V6_vS32b_nt_qpred_pi_128B:
+  case Hexagon::V6_vS32b_pred_pi_128B:
+  case Hexagon::V6_vS32b_qpred_pi_128B:
+    MappedInst = ScaleVectorOffset(Inst, 3, VectorSize, OutContext);
+    return;
+
+  case Hexagon::V6_vS32Ub_npred_ai:
+  case Hexagon::V6_vS32Ub_pred_ai:
+  case Hexagon::V6_vS32b_new_npred_ai:
+  case Hexagon::V6_vS32b_new_pred_ai:
+  case Hexagon::V6_vS32b_npred_ai:
+  case Hexagon::V6_vS32b_nqpred_ai:
+  case Hexagon::V6_vS32b_nt_new_npred_ai:
+  case Hexagon::V6_vS32b_nt_new_pred_ai:
+  case Hexagon::V6_vS32b_nt_npred_ai:
+  case Hexagon::V6_vS32b_nt_nqpred_ai:
+  case Hexagon::V6_vS32b_nt_pred_ai:
+  case Hexagon::V6_vS32b_nt_qpred_ai:
+  case Hexagon::V6_vS32b_pred_ai:
+  case Hexagon::V6_vS32b_qpred_ai:
+  case Hexagon::V6_vS32Ub_npred_ai_128B:
+  case Hexagon::V6_vS32Ub_pred_ai_128B:
+  case Hexagon::V6_vS32b_new_npred_ai_128B:
+  case Hexagon::V6_vS32b_new_pred_ai_128B:
+  case Hexagon::V6_vS32b_npred_ai_128B:
+  case Hexagon::V6_vS32b_nqpred_ai_128B:
+  case Hexagon::V6_vS32b_nt_new_npred_ai_128B:
+  case Hexagon::V6_vS32b_nt_new_pred_ai_128B:
+  case Hexagon::V6_vS32b_nt_npred_ai_128B:
+  case Hexagon::V6_vS32b_nt_nqpred_ai_128B:
+  case Hexagon::V6_vS32b_nt_pred_ai_128B:
+  case Hexagon::V6_vS32b_nt_qpred_ai_128B:
+  case Hexagon::V6_vS32b_pred_ai_128B:
+  case Hexagon::V6_vS32b_qpred_ai_128B:
+    MappedInst = ScaleVectorOffset(Inst, 2, VectorSize, OutContext);
+    return;
   }
 }
 
@@ -578,13 +810,9 @@ void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   if (MI->isBundle()) {
     const MachineBasicBlock* MBB = MI->getParent();
     MachineBasicBlock::const_instr_iterator MII = MI->getIterator();
-    unsigned IgnoreCount = 0;
 
     for (++MII; MII != MBB->instr_end() && MII->isInsideBundle(); ++MII)
-      if (MII->getOpcode() == TargetOpcode::DBG_VALUE ||
-          MII->getOpcode() == TargetOpcode::IMPLICIT_DEF)
-        ++IgnoreCount;
-      else
+      if (!MII->isDebugValue() && !MII->isImplicitDef())
         HexagonLowerToMC(MCII, &*MII, MCB, *this);
   }
   else
diff --git a/lib/Target/Hexagon/HexagonBitSimplify.cpp b/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 063aaa7af3da56ec4b1c6b1074b0add0d540644a..61f290ca98d70beb18a22ad530ecc0b4aa9cbdcb 100644
--- a/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -46,6 +46,17 @@ using namespace llvm;
 
 static cl::opt<bool> PreserveTiedOps("hexbit-keep-tied", cl::Hidden,
   cl::init(true), cl::desc("Preserve subregisters in tied operands"));
+static cl::opt<bool> GenExtract("hexbit-extract", cl::Hidden, // Checked first in simplifyExtractLow.
+  cl::init(true), cl::desc("Generate extract instructions"));
+static cl::opt<bool> GenBitSplit("hexbit-bitsplit", cl::Hidden, // Checked first in genBitSplit.
+  cl::init(true), cl::desc("Generate bitsplit instructions"));
+
+static cl::opt<unsigned> MaxExtract("hexbit-max-extract", cl::Hidden, // Upper bound compared against CountExtract.
+  cl::init(UINT_MAX)); // Defaults to UINT_MAX.
+static unsigned CountExtract = 0; // Incremented by simplifyExtractLow; file-static, never reset in this file section.
+static cl::opt<unsigned> MaxBitSplit("hexbit-max-bitsplit", cl::Hidden, // Upper bound compared against CountBitSplit.
+  cl::init(UINT_MAX)); // Defaults to UINT_MAX.
+static unsigned CountBitSplit = 0; // Incremented by genBitSplit; file-static, never reset in this file section.
 
 namespace llvm {
 
@@ -249,8 +260,6 @@ INITIALIZE_PASS_END(HexagonBitSimplify, "hexbit",
 
 bool HexagonBitSimplify::visitBlock(MachineBasicBlock &B, Transformation &T,
       RegisterSet &AVs) {
-  MachineDomTreeNode *N = MDT->getNode(&B);
-  typedef GraphTraits<MachineDomTreeNode*> GTN;
   bool Changed = false;
 
   if (T.TopDown)
@@ -262,10 +271,9 @@ bool HexagonBitSimplify::visitBlock(MachineBasicBlock &B, Transformation &T,
   RegisterSet NewAVs = AVs;
   NewAVs.insert(Defs);
 
-  for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) {
-    MachineBasicBlock *SB = (*I)->getBlock();
-    Changed |= visitBlock(*SB, T, NewAVs);
-  }
+  for (auto *DTN : children<MachineDomTreeNode*>(MDT->getNode(&B)))
+    Changed |= visitBlock(*(DTN->getBlock()), T, NewAVs);
+
   if (!T.TopDown)
     Changed |= T.processBlock(B, AVs);
 
@@ -984,9 +992,9 @@ bool DeadCodeElimination::isDead(unsigned R) const {
 
 bool DeadCodeElimination::runOnNode(MachineDomTreeNode *N) {
   bool Changed = false;
-  typedef GraphTraits<MachineDomTreeNode*> GTN;
-  for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I)
-    Changed |= runOnNode(*I);
+
+  for (auto *DTN : children<MachineDomTreeNode*>(N))
+    Changed |= runOnNode(DTN);
 
   MachineBasicBlock *B = N->getBlock();
   std::vector<MachineInstr*> Instrs;
@@ -1736,10 +1744,11 @@ namespace {
 // This is by no means complete
   class BitSimplification : public Transformation {
   public:
-    BitSimplification(BitTracker &bt, const HexagonInstrInfo &hii,
-        const HexagonRegisterInfo &hri, MachineRegisterInfo &mri,
-        MachineFunction &mf)
-      : Transformation(true), HII(hii), HRI(hri), MRI(mri), MF(mf), BT(bt) {}
+    BitSimplification(BitTracker &bt, const MachineDominatorTree &mdt,
+        const HexagonInstrInfo &hii, const HexagonRegisterInfo &hri,
+        MachineRegisterInfo &mri, MachineFunction &mf)
+      : Transformation(true), MDT(mdt), HII(hii), HRI(hri), MRI(mri),
+        MF(mf), BT(bt) {}
 
     bool processBlock(MachineBasicBlock &B, const RegisterSet &AVs) override;
 
@@ -1766,9 +1775,18 @@ namespace {
           const BitTracker::RegisterCell &RC);
     bool genExtractLow(MachineInstr *MI, BitTracker::RegisterRef RD,
           const BitTracker::RegisterCell &RC);
+    bool genBitSplit(MachineInstr *MI, BitTracker::RegisterRef RD,
+          const BitTracker::RegisterCell &RC, const RegisterSet &AVs);
     bool simplifyTstbit(MachineInstr *MI, BitTracker::RegisterRef RD,
           const BitTracker::RegisterCell &RC);
+    bool simplifyExtractLow(MachineInstr *MI, BitTracker::RegisterRef RD,
+          const BitTracker::RegisterCell &RC, const RegisterSet &AVs);
+
+    // Cache of created instructions to avoid creating duplicates.
+    // XXX Currently only used by genBitSplit.
+    std::vector<MachineInstr*> NewMIs;
 
+    const MachineDominatorTree &MDT;
     const HexagonInstrInfo &HII;
     const HexagonRegisterInfo &HRI;
     MachineRegisterInfo &MRI;
@@ -2150,6 +2168,146 @@ bool BitSimplification::genExtractLow(MachineInstr *MI,
   return false;
 }
 
+bool BitSimplification::genBitSplit(MachineInstr *MI, // Tries to pair RD with a register from AVs so both come from one A4_bitspliti.
+      BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC, // RC is the bit-tracker cell passed in for RD.
+      const RegisterSet &AVs) { // Returns true iff RD.Reg and the partner were replaced.
+  if (!GenBitSplit) // Turned off through the hexbit-bitsplit option.
+    return false;
+  if (CountBitSplit >= MaxBitSplit) // Limit from the hexbit-max-bitsplit option.
+    return false;
+
+  unsigned Opc = MI->getOpcode();
+  switch (Opc) { // MI is already a bitsplit: nothing to do.
+    case Hexagon::A4_bitsplit:
+    case Hexagon::A4_bitspliti:
+      return false;
+  }
+
+  unsigned W = RC.width();
+  if (W != 32) // Only 32-bit cells are considered.
+    return false;
+
+  auto ctlz = [] (const BitTracker::RegisterCell &C) -> unsigned { // Counts the high-order bits of C that are known zeros.
+    unsigned Z = C.width();
+    while (Z > 0 && C[Z-1].is(0))
+      --Z;
+    return C.width() - Z;
+  };
+
+  // Count the number of leading zeros in the target RC.
+  unsigned Z = ctlz(RC);
+  if (Z == 0 || Z == W) // Need at least one known-zero top bit and at least one other bit.
+    return false;
+
+  // A simplistic analysis: assume the source register (the one being split)
+  // is fully unknown, and that all its bits are self-references.
+  const BitTracker::BitValue &B0 = RC[0];
+  if (B0.Type != BitTracker::BitValue::Ref)
+    return false;
+
+  unsigned SrcR = B0.RefI.Reg; // Register that bit 0 of RC refers to.
+  unsigned SrcSR = 0; // Subregister of SrcR to read; set below for DoubleRegs sources.
+  unsigned Pos = B0.RefI.Pos; // Bit position in SrcR where RD's field starts.
+
+  // All the non-zero bits should be consecutive bits from the same register.
+  for (unsigned i = 1; i < W-Z; ++i) {
+    const BitTracker::BitValue &V = RC[i];
+    if (V.Type != BitTracker::BitValue::Ref)
+      return false;
+    if (V.RefI.Reg != SrcR || V.RefI.Pos != Pos+i)
+      return false;
+  }
+
+  // Now, find the other bitfield among AVs.
+  for (unsigned S = AVs.find_first(); S; S = AVs.find_next(S)) {
+    // The number of leading zeros here should be the number of trailing
+    // non-zeros in RC.
+    if (!BT.has(S))
+      continue;
+    const BitTracker::RegisterCell &SC = BT.lookup(S);
+    if (SC.width() != W || ctlz(SC) != W-Z)
+      continue;
+    // The Z lower bits should now match SrcR.
+    const BitTracker::BitValue &S0 = SC[0];
+    if (S0.Type != BitTracker::BitValue::Ref || S0.RefI.Reg != SrcR)
+      continue;
+    unsigned P = S0.RefI.Pos; // Bit position in SrcR where S's field starts.
+
+    if (Pos <= P && (Pos + W-Z) != P) // RD's field comes first: S's field must start right after it.
+      continue;
+    if (P < Pos && (P + Z) != Pos) // S's field comes first: RD's field must start right after it.
+      continue;
+    // The starting bitfield position must be at a subregister boundary.
+    if (std::min(P, Pos) != 0 && std::min(P, Pos) != 32)
+      continue;
+
+    unsigned I;
+    for (I = 1; I < Z; ++I) { // Bits 1..Z-1 of SC must continue the run from SrcR that starts at P.
+      const BitTracker::BitValue &V = SC[I];
+      if (V.Type != BitTracker::BitValue::Ref)
+        break;
+      if (V.RefI.Reg != SrcR || V.RefI.Pos != P+I)
+        break;
+    }
+    if (I != Z) // The loop above stopped early: not a match.
+      continue;
+
+    // Generate bitsplit where S is defined.
+    CountBitSplit++; // Counted even when validateReg below rejects this candidate.
+    MachineInstr *DefS = MRI.getVRegDef(S);
+    assert(DefS != nullptr);
+    DebugLoc DL = DefS->getDebugLoc();
+    MachineBasicBlock &B = *DefS->getParent();
+    auto At = DefS->isPHI() ? B.getFirstNonPHI() // Insertion point: DefS itself, or the first non-PHI if DefS is a PHI.
+                            : MachineBasicBlock::iterator(DefS);
+    if (MRI.getRegClass(SrcR)->getID() == Hexagon::DoubleRegsRegClassID)
+      SrcSR = (std::min(Pos, P) == 32) ? Hexagon::isub_hi : Hexagon::isub_lo; // Pick the half of SrcR that holds both fields.
+    if (!validateReg({SrcR,SrcSR}, Hexagon::A4_bitspliti, 1))
+      continue;
+    unsigned ImmOp = Pos <= P ? W-Z : Z; // Width of whichever field starts first.
+
+    // Find an existing bitsplit instruction if one already exists.
+    unsigned NewR = 0;
+    for (MachineInstr *In : NewMIs) { // NewMIs holds the bitsplits created by earlier calls.
+      if (In->getOpcode() != Hexagon::A4_bitspliti)
+        continue;
+      MachineOperand &Op1 = In->getOperand(1);
+      if (Op1.getReg() != SrcR || Op1.getSubReg() != SrcSR)
+        continue;
+      if (In->getOperand(2).getImm() != ImmOp)
+        continue;
+      // Check if the target register is available here.
+      MachineOperand &Op0 = In->getOperand(0);
+      MachineInstr *DefI = MRI.getVRegDef(Op0.getReg());
+      assert(DefI != nullptr);
+      if (!MDT.dominates(DefI, &*At))
+        continue;
+
+      // Found one that can be reused.
+      assert(Op0.getSubReg() == 0);
+      NewR = Op0.getReg();
+      break;
+    }
+    if (!NewR) { // Nothing reusable: emit a new A4_bitspliti at At and remember it.
+      NewR = MRI.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
+      auto NewBS = BuildMI(B, At, DL, HII.get(Hexagon::A4_bitspliti), NewR)
+                      .addReg(SrcR, 0, SrcSR)
+                      .addImm(ImmOp);
+      NewMIs.push_back(NewBS);
+    }
+    if (Pos <= P) { // The field that starts first maps to isub_lo of NewR, the other to isub_hi.
+      HBS::replaceRegWithSub(RD.Reg, NewR, Hexagon::isub_lo, MRI);
+      HBS::replaceRegWithSub(S,      NewR, Hexagon::isub_hi, MRI);
+    } else {
+      HBS::replaceRegWithSub(S,      NewR, Hexagon::isub_lo, MRI);
+      HBS::replaceRegWithSub(RD.Reg, NewR, Hexagon::isub_hi, MRI);
+    }
+    return true;
+  }
+
+  return false;
+}
+
 // Check for tstbit simplification opportunity, where the bit being checked
 // can be tracked back to another register. For example:
 //   vreg2 = S2_lsr_i_r  vreg1, 5
@@ -2211,6 +2369,201 @@ bool BitSimplification::simplifyTstbit(MachineInstr *MI,
   return false;
 }
 
+// Detect whether RD is a bitfield extract (sign- or zero-extended) of some
+// register from the AVs set. If so, build the corresponding instruction at
+// the location of MI, replace RD.Reg with its result, and return true.
+// The intent is to recognize instruction sequences that are equivalent to
+// an extract operation, such as a shift left followed by a shift right.
+bool BitSimplification::simplifyExtractLow(MachineInstr *MI,
+      BitTracker::RegisterRef RD, const BitTracker::RegisterCell &RC,
+      const RegisterSet &AVs) {
+  if (!GenExtract) // Turned off through the hexbit-extract option.
+    return false;
+  if (CountExtract >= MaxExtract) // Limit from the hexbit-max-extract option.
+    return false;
+  CountExtract++; // NOTE(review): counts every attempt that passes the limit check, not only successful rewrites.
+
+  unsigned W = RC.width(); // W is decremented below while scanning the extension bits.
+  unsigned RW = W; // RW keeps the full width of the destination cell.
+  unsigned Len; // Width of the candidate bitfield.
+  bool Signed; // True if the field is treated as sign-extended.
+
+  // The code is mostly class-independent, except for the part that generates
+  // the extract instruction, and establishes the source register (in case it
+  // needs to use a subregister).
+  const TargetRegisterClass *FRC = HBS::getFinalVRegClass(RD, MRI);
+  if (FRC != &Hexagon::IntRegsRegClass && FRC != &Hexagon::DoubleRegsRegClass)
+    return false;
+  assert(RD.Sub == 0);
+
+  // Observation:
+  // If the cell has a form of 00..0xx..x with k zeros and n remaining
+  // bits, this could be an extractu of the n bits, but it could also be
+  // an extractu of a longer field which happens to have 0s in the top
+  // bit positions.
+  // The same logic applies to sign-extended fields.
+  //
+  // Do not check for the extended extracts, since it would expand the
+  // search space quite a bit. The search may be expensive as it is.
+
+  const BitTracker::BitValue &TopV = RC[W-1]; // Most significant bit of the cell.
+
+  // Eliminate candidates that have self-referential bits, since they
+  // cannot be extracts from other registers. Also, skip registers that
+  // have compile-time constant values.
+  bool IsConst = true;
+  for (unsigned I = 0; I != W; ++I) {
+    const BitTracker::BitValue &V = RC[I];
+    if (V.Type == BitTracker::BitValue::Ref && V.RefI.Reg == RD.Reg)
+      return false;
+    IsConst = IsConst && (V.is(0) || V.is(1));
+  }
+  if (IsConst)
+    return false;
+
+  if (TopV.is(0) || TopV.is(1)) { // Top bit is a known constant: strip the run of equal constant bits.
+    bool S = TopV.is(1);
+    for (--W; W > 0 && RC[W-1].is(S); --W)
+      ;
+    Len = W;
+    Signed = S; // A run of 1s is handled as a sign extension, a run of 0s as a zero extension.
+    // The sign bit must be a part of the field being extended.
+    if (Signed)
+      ++Len;
+  } else {
+    // This could still be a sign-extended extract.
+    assert(TopV.Type == BitTracker::BitValue::Ref);
+    if (TopV.RefI.Reg == RD.Reg || TopV.RefI.Pos == W-1)
+      return false;
+    for (--W; W > 0 && RC[W-1] == TopV; --W)
+      ;
+    // The top bits of RC are copies of TopV. One occurrence of TopV will
+    // be a part of the field.
+    Len = W + 1;
+    Signed = true;
+  }
+
+  // This would be just a copy. It should be handled elsewhere.
+  if (Len == RW)
+    return false;
+
+  DEBUG({
+    dbgs() << __func__ << " on reg: " << PrintReg(RD.Reg, &HRI, RD.Sub)
+           << ", MI: " << *MI;
+    dbgs() << "Cell: " << RC << '\n';
+    dbgs() << "Expected bitfield size: " << Len << " bits, "
+           << (Signed ? "sign" : "zero") << "-extended\n";
+  });
+
+  bool Changed = false;
+
+  for (unsigned R = AVs.find_first(); R != 0; R = AVs.find_next(R)) { // Try each available register as the source.
+    if (!BT.has(R))
+      continue;
+    const BitTracker::RegisterCell &SC = BT.lookup(R);
+    unsigned SW = SC.width();
+
+    // The source can be longer than the destination, as long as its size is
+    // a multiple of the size of the destination. Also, we would need to be
+    // able to refer to the subregister in the source that would be of the
+    // same size as the destination, but only check the sizes here.
+    if (SW < RW || (SW % RW) != 0)
+      continue;
+
+    // The field can start at any offset in SC as long as it contains Len
+    // bits and does not cross subregister boundary (if the source register
+    // is longer than the destination).
+    unsigned Off = 0;
+    while (Off <= SW-Len) {
+      unsigned OE = (Off+Len)/RW;
+      if (OE != Off/RW) {
+        // The assumption here is that if the source (R) is longer than the
+        // destination, then R is a sequence of words of size RW, and each
+        // such word in R can be accessed via a subregister.
+        // If the field would cross a word boundary, advance to the next word.
+        // NOTE(review): OE uses the one-past-the-end bit (Off+Len), so a field
+        // ending exactly on a word boundary is skipped as well -- confirm.
+        Off = OE*RW;
+        continue;
+      }
+      if (HBS::isEqual(RC, 0, SC, Off, Len)) // Low Len bits of RC match SC starting at Off.
+        break;
+      ++Off;
+    }
+
+    if (Off > SW-Len) // The scan ran off the end without a match.
+      continue;
+
+    // Found match.
+    unsigned ExtOpc = 0;
+    if (Off == 0) { // Field starts at bit 0: try the dedicated extend/mask opcodes first.
+      if (Len == 8)
+        ExtOpc = Signed ? Hexagon::A2_sxtb : Hexagon::A2_zxtb;
+      else if (Len == 16)
+        ExtOpc = Signed ? Hexagon::A2_sxth : Hexagon::A2_zxth;
+      else if (Len < 10 && !Signed)
+        ExtOpc = Hexagon::A2_andir; // The mask (1u << Len) - 1 is added as the immediate below.
+    }
+    if (ExtOpc == 0) { // Otherwise use a generic extract, chosen by signedness and destination width.
+      ExtOpc =
+          Signed ? (RW == 32 ? Hexagon::S4_extract  : Hexagon::S4_extractp)
+                 : (RW == 32 ? Hexagon::S2_extractu : Hexagon::S2_extractup);
+    }
+    unsigned SR = 0;
+    // This only recognizes isub_lo and isub_hi.
+    if (RW != SW && RW*2 != SW)
+      continue;
+    if (RW != SW)
+      SR = (Off/RW == 0) ? Hexagon::isub_lo : Hexagon::isub_hi;
+    Off = Off % RW; // From here on Off is relative to the selected subregister.
+
+    if (!validateReg({R,SR}, ExtOpc, 1))
+      continue;
+
+    // Don't generate the same instruction as the one being optimized.
+    if (MI->getOpcode() == ExtOpc) {
+      // All possible ExtOpc's have the source in operand(1).
+      const MachineOperand &SrcOp = MI->getOperand(1);
+      if (SrcOp.getReg() == R)
+        continue;
+    }
+
+    DebugLoc DL = MI->getDebugLoc();
+    MachineBasicBlock &B = *MI->getParent();
+    unsigned NewR = MRI.createVirtualRegister(FRC);
+    auto At = MI->isPHI() ? B.getFirstNonPHI() // Insertion point: MI itself, or the first non-PHI if MI is a PHI.
+                          : MachineBasicBlock::iterator(MI);
+    auto MIB = BuildMI(B, At, DL, HII.get(ExtOpc), NewR)
+                  .addReg(R, 0, SR);
+    switch (ExtOpc) { // Append the immediates the chosen opcode needs.
+      case Hexagon::A2_sxtb:
+      case Hexagon::A2_zxtb:
+      case Hexagon::A2_sxth:
+      case Hexagon::A2_zxth:
+        break;
+      case Hexagon::A2_andir:
+        MIB.addImm((1u << Len) - 1);
+        break;
+      case Hexagon::S4_extract:
+      case Hexagon::S2_extractu:
+      case Hexagon::S4_extractp:
+      case Hexagon::S2_extractup:
+        MIB.addImm(Len)
+           .addImm(Off);
+        break;
+      default:
+        llvm_unreachable("Unexpected opcode");
+    }
+
+    HBS::replaceReg(RD.Reg, NewR, MRI);
+    BT.put(BitTracker::RegisterRef(NewR), RC); // Record RC as the bit-tracker cell of the new register.
+    Changed = true;
+    break; // Stop at the first source register that yields a rewrite.
+  }
+
+  return Changed;
+}
+
 bool BitSimplification::processBlock(MachineBasicBlock &B,
       const RegisterSet &AVs) {
   if (!BT.reached(&B))
@@ -2248,12 +2601,15 @@ bool BitSimplification::processBlock(MachineBasicBlock &B,
 
     if (FRC->getID() == Hexagon::DoubleRegsRegClassID) {
       bool T = genPackhl(MI, RD, RC);
+      T = T || simplifyExtractLow(MI, RD, RC, AVB);
       Changed |= T;
       continue;
     }
 
     if (FRC->getID() == Hexagon::IntRegsRegClassID) {
-      bool T = genExtractHalf(MI, RD, RC);
+      bool T = genBitSplit(MI, RD, RC, AVB);
+      T = T || simplifyExtractLow(MI, RD, RC, AVB);
+      T = T || genExtractHalf(MI, RD, RC);
       T = T || genCombineHalf(MI, RD, RC);
       T = T || genExtractLow(MI, RD, RC);
       Changed |= T;
@@ -2314,7 +2670,7 @@ bool HexagonBitSimplify::runOnMachineFunction(MachineFunction &MF) {
 
   BT.run();
   RegisterSet ABS;  // Available registers for BS.
-  BitSimplification BitS(BT, HII, HRI, MRI, MF);
+  BitSimplification BitS(BT, *MDT, HII, HRI, MRI, MF);
   Changed |= visitBlock(Entry, BitS, ABS);
 
   Changed = DeadCodeElimination(MF, *MDT).run() || Changed;
diff --git a/lib/Target/Hexagon/HexagonBitTracker.cpp b/lib/Target/Hexagon/HexagonBitTracker.cpp
index 436f88dcd450a8c731198d7d200b0154684f337d..90ccecb6629acbd0713e5d00206d834201f735b0 100644
--- a/lib/Target/Hexagon/HexagonBitTracker.cpp
+++ b/lib/Target/Hexagon/HexagonBitTracker.cpp
@@ -74,7 +74,7 @@ HexagonEvaluator::HexagonEvaluator(const HexagonRegisterInfo &tri,
     // Module::AnyPointerSize.
     if (Width == 0 || Width > 64)
       break;
-    AttributeSet Attrs = F.getAttributes();
+    AttributeList Attrs = F.getAttributes();
     if (Attrs.hasAttribute(AttrIdx, Attribute::ByVal))
       continue;
     InPhysReg = getNextPhysReg(InPhysReg, Width);
@@ -272,6 +272,9 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI,
   // cases below.
   uint16_t W0 = (Reg[0].Reg != 0) ? getRegBitWidth(Reg[0]) : 0;
 
+  // Register id of the 0th operand. It can be 0.
+  unsigned Reg0 = Reg[0].Reg;
+
   switch (Opc) {
     // Transfer immediate:
 
@@ -792,6 +795,17 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI,
     case A2_zxth:
       return rr0(eZXT(rc(1), 16), Outputs);
 
+    // Saturations
+
+    case A2_satb:
+      return rr0(eSXT(RegisterCell::self(0, W0).regify(Reg0), 8), Outputs);
+    case A2_sath:
+      return rr0(eSXT(RegisterCell::self(0, W0).regify(Reg0), 16), Outputs);
+    case A2_satub:
+      return rr0(eZXT(RegisterCell::self(0, W0).regify(Reg0), 8), Outputs);
+    case A2_satuh:
+      return rr0(eZXT(RegisterCell::self(0, W0).regify(Reg0), 16), Outputs);
+
     // Bit count:
 
     case S2_cl0:
diff --git a/lib/Target/Hexagon/HexagonBlockRanges.cpp b/lib/Target/Hexagon/HexagonBlockRanges.cpp
index 52d1b1c65cd4b2142c8804140fb607a7f6613f6f..721cf0417289b93cc394a7df1f1b570744afaeeb 100644
--- a/lib/Target/Hexagon/HexagonBlockRanges.cpp
+++ b/lib/Target/Hexagon/HexagonBlockRanges.cpp
@@ -306,6 +306,8 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap,
     LastUse[R] = LastDef[R] = IndexType::None;
   };
 
+  RegisterSet Defs, Clobbers;
+
   for (auto &In : B) {
     if (In.isDebugValue())
       continue;
@@ -324,19 +326,67 @@ void HexagonBlockRanges::computeInitialLiveRanges(InstrIndexMap &IndexMap,
           closeRange(S);
       }
     }
-    // Process defs.
+    // Process defs and clobbers.
+    Defs.clear();
+    Clobbers.clear();
     for (auto &Op : In.operands()) {
       if (!Op.isReg() || !Op.isDef() || Op.isUndef())
         continue;
       RegisterRef R = { Op.getReg(), Op.getSubReg() };
-      if (TargetRegisterInfo::isPhysicalRegister(R.Reg) && Reserved[R.Reg])
-        continue;
       for (auto S : expandToSubRegs(R, MRI, TRI)) {
-        if (LastDef[S] != IndexType::None || LastUse[S] != IndexType::None)
-          closeRange(S);
-        LastDef[S] = Index;
+        if (TargetRegisterInfo::isPhysicalRegister(S.Reg) && Reserved[S.Reg])
+          continue;
+        if (Op.isDead())
+          Clobbers.insert(S);
+        else
+          Defs.insert(S);
       }
     }
+
+    for (auto &Op : In.operands()) {
+      if (!Op.isRegMask())
+        continue;
+      const uint32_t *BM = Op.getRegMask();
+      for (unsigned PR = 1, N = TRI.getNumRegs(); PR != N; ++PR) {
+        // Skip registers that have subregisters. A register is preserved
+        // iff its bit is set in the regmask, so if R1:0 was preserved, both
+        // R1 and R0 would also be present.
+        if (MCSubRegIterator(PR, &TRI, false).isValid())
+          continue;
+        if (Reserved[PR])
+          continue;
+        if (BM[PR/32] & (1u << (PR%32)))
+          continue;
+        RegisterRef R = { PR, 0 };
+        if (!Defs.count(R))
+          Clobbers.insert(R);
+      }
+    }
+    // Defs and clobbers can overlap, e.g.
+    // %D0<def,dead> = COPY %vreg5, %R0<imp-def>, %R1<imp-def>
+    for (RegisterRef R : Defs)
+      Clobbers.erase(R);
+
+    // Update maps for defs.
+    for (RegisterRef S : Defs) {
+      // Defs should already be expanded into subregs.
+      assert(!TargetRegisterInfo::isPhysicalRegister(S.Reg) ||
+             !MCSubRegIterator(S.Reg, &TRI, false).isValid());
+      if (LastDef[S] != IndexType::None || LastUse[S] != IndexType::None)
+        closeRange(S);
+      LastDef[S] = Index;
+    }
+    // Update maps for clobbers.
+    for (RegisterRef S : Clobbers) {
+      // Clobbers should already be expanded into subregs.
+      assert(!TargetRegisterInfo::isPhysicalRegister(S.Reg) ||
+             !MCSubRegIterator(S.Reg, &TRI, false).isValid());
+      if (LastDef[S] != IndexType::None || LastUse[S] != IndexType::None)
+        closeRange(S);
+      // Create a single-instruction range.
+      LastDef[S] = LastUse[S] = Index;
+      closeRange(S);
+    }
   }
 
   // Collect live-on-exit.
diff --git a/lib/Target/Hexagon/HexagonCallingConv.td b/lib/Target/Hexagon/HexagonCallingConv.td
deleted file mode 100644
index e61b2a7a58ac1907f1742d6ad7b95679f4a07803..0000000000000000000000000000000000000000
--- a/lib/Target/Hexagon/HexagonCallingConv.td
+++ /dev/null
@@ -1,35 +0,0 @@
-//===- HexagonCallingConv.td - Calling Conventions Hexagon -*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This describes the calling conventions for the Hexagon architectures.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Return Value Calling Conventions
-//===----------------------------------------------------------------------===//
-
-// Hexagon 32-bit C return-value convention.
-def RetCC_Hexagon32 : CallingConv<[
-  CCIfType<[i32, f32], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>,
-  CCIfType<[i64, f64], CCAssignToReg<[D0, D1, D2]>>,
-
-  // Alternatively, they are assigned to the stack in 4-byte aligned units.
-  CCAssignToStack<4, 4>
-]>;
-
-// Hexagon 32-bit C Calling convention.
-def CC_Hexagon32 : CallingConv<[
-  // All arguments get passed in integer registers if there is space.
-  CCIfType<[f32, i32, i16, i8], CCAssignToReg<[R0, R1, R2, R3, R4, R5]>>,
-  CCIfType<[f64, i64], CCAssignToReg<[D0, D1, D2]>>,
-
-  // Alternatively, they are assigned to the stack in 4-byte aligned units.
-  CCAssignToStack<4, 4>
-]>;
diff --git a/lib/Target/Hexagon/HexagonCommonGEP.cpp b/lib/Target/Hexagon/HexagonCommonGEP.cpp
index 489da6be923def822f4d954ed305cece5558d4a5..a07ba77e6f3e1b33d724bbeafb72df9b56a5d6f1 100644
--- a/lib/Target/Hexagon/HexagonCommonGEP.cpp
+++ b/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -315,11 +315,8 @@ void HexagonCommonGEP::getBlockTraversalOrder(BasicBlock *Root,
   // visited".
 
   Order.push_back(Root);
-  DomTreeNode *DTN = DT->getNode(Root);
-  typedef GraphTraits<DomTreeNode*> GTN;
-  typedef GTN::ChildIteratorType Iter;
-  for (Iter I = GTN::child_begin(DTN), E = GTN::child_end(DTN); I != E; ++I)
-    getBlockTraversalOrder((*I)->getBlock(), Order);
+  for (auto *DTN : children<DomTreeNode*>(DT->getNode(Root)))
+    getBlockTraversalOrder(DTN->getBlock(), Order);
 }
 
 bool HexagonCommonGEP::isHandledGepForm(GetElementPtrInst *GepI) {
@@ -1235,11 +1232,8 @@ void HexagonCommonGEP::removeDeadCode() {
 
   for (unsigned i = 0; i < BO.size(); ++i) {
     BasicBlock *B = cast<BasicBlock>(BO[i]);
-    DomTreeNode *N = DT->getNode(B);
-    typedef GraphTraits<DomTreeNode*> GTN;
-    typedef GTN::ChildIteratorType Iter;
-    for (Iter I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I)
-      BO.push_back((*I)->getBlock());
+    for (auto DTN : children<DomTreeNode*>(DT->getNode(B)))
+      BO.push_back(DTN->getBlock());
   }
 
   for (unsigned i = BO.size(); i > 0; --i) {
diff --git a/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index 36080997ec6bae6f5befaef7c322ac22d4109a0f..5f375f8dc74284b3e2c2f276eb1612115823aa95 100644
--- a/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -440,17 +440,21 @@ HexagonCopyToCombine::findPotentialNewifiableTFRs(MachineBasicBlock &BB) {
 
     // Put instructions that last defined integer or double registers into the
     // map.
-    for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
-      MachineOperand &Op = MI.getOperand(I);
-      if (!Op.isReg() || !Op.isDef() || !Op.getReg())
-        continue;
-      unsigned Reg = Op.getReg();
-      if (Hexagon::DoubleRegsRegClass.contains(Reg)) {
-        for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs) {
-          LastDef[*SubRegs] = &MI;
-        }
-      } else if (Hexagon::IntRegsRegClass.contains(Reg))
-        LastDef[Reg] = &MI;
+    for (MachineOperand &Op : MI.operands()) {
+      if (Op.isReg()) {
+        if (!Op.isDef() || !Op.getReg())
+          continue;
+        unsigned Reg = Op.getReg();
+        if (Hexagon::DoubleRegsRegClass.contains(Reg)) {
+          for (MCSubRegIterator SubRegs(Reg, TRI); SubRegs.isValid(); ++SubRegs)
+            LastDef[*SubRegs] = &MI;
+        } else if (Hexagon::IntRegsRegClass.contains(Reg))
+          LastDef[Reg] = &MI;
+      } else if (Op.isRegMask()) {
+        for (unsigned Reg : Hexagon::IntRegsRegClass)
+          if (Op.clobbersPhysReg(Reg))
+            LastDef[Reg] = &MI;
+      }
     }
   }
 }
diff --git a/lib/Target/Hexagon/HexagonDepArch.h b/lib/Target/Hexagon/HexagonDepArch.h
new file mode 100644
index 0000000000000000000000000000000000000000..1009aa39cefb91fd386933bdab2d827194951230
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepArch.h
@@ -0,0 +1,10 @@
+//===--- HexagonDepArch.h -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+enum HexagonArchEnum { V4,V5,V55,V60,V62 };
diff --git a/lib/Target/Hexagon/HexagonDepArch.td b/lib/Target/Hexagon/HexagonDepArch.td
new file mode 100644
index 0000000000000000000000000000000000000000..5b1d02c136f02a811fb614f76960342725341939
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepArch.td
@@ -0,0 +1,19 @@
+//===--- HexagonDepArch.td ------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def ArchV62: SubtargetFeature<"v62", "HexagonArchVersion", "V62", "Enable Hexagon V62 architecture">;
+def HasV62T : Predicate<"HST->hasV62TOps()">, AssemblerPredicate<"ArchV62">;
+def ArchV60: SubtargetFeature<"v60", "HexagonArchVersion", "V60", "Enable Hexagon V60 architecture">;
+def HasV60T : Predicate<"HST->hasV60TOps()">, AssemblerPredicate<"ArchV60">;
+def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "V55", "Enable Hexagon V55 architecture">;
+def HasV55T : Predicate<"HST->hasV55TOps()">, AssemblerPredicate<"ArchV55">;
+def ArchV4: SubtargetFeature<"v4", "HexagonArchVersion", "V4", "Enable Hexagon V4 architecture">;
+def HasV4T : Predicate<"HST->hasV4TOps()">, AssemblerPredicate<"ArchV4">;
+def ArchV5: SubtargetFeature<"v5", "HexagonArchVersion", "V5", "Enable Hexagon V5 architecture">;
+def HasV5T : Predicate<"HST->hasV5TOps()">, AssemblerPredicate<"ArchV5">;
diff --git a/lib/Target/Hexagon/HexagonDepDecoders.h b/lib/Target/Hexagon/HexagonDepDecoders.h
new file mode 100644
index 0000000000000000000000000000000000000000..aa9787ecf0c8485b2f5303096573469b50c13a4a
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepDecoders.h
@@ -0,0 +1,64 @@
+//===--- HexagonDepDecoders.h ---------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<4>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s29_3ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<14>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<8>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<7>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s31_1ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<12>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s3_0ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<3>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s30_2ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<13>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<6>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s6_3ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<9>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<5>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
+static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp,
+    uint64_t, const void *Decoder) {
+  signedDecoder<6>(MI, tmp, Decoder);
+  return MCDisassembler::Success;
+}
diff --git a/lib/Target/Hexagon/HexagonDepITypes.h b/lib/Target/Hexagon/HexagonDepITypes.h
new file mode 100644
index 0000000000000000000000000000000000000000..f8ae39a379942446717fa73c3194d5ed5ab769c6
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepITypes.h
@@ -0,0 +1,53 @@
+//===--- HexagonDepITypes.h -----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+namespace HexagonII {
+enum Type {
+  TypeALU32_2op = 0,
+  TypeALU32_3op = 1,
+  TypeALU32_ADDI = 2,
+  TypeALU64 = 3,
+  TypeCJ = 4,
+  TypeCOPROC_VMEM = 5,
+  TypeCR = 7,
+  TypeCVI_HIST = 10,
+  TypeCVI_VA = 16,
+  TypeCVI_VA_DV = 17,
+  TypeCVI_VINLANESAT = 18,
+  TypeCVI_VM_CUR_LD = 19,
+  TypeCVI_VM_LD = 20,
+  TypeCVI_VM_NEW_ST = 21,
+  TypeCVI_VM_ST = 22,
+  TypeCVI_VM_STU = 23,
+  TypeCVI_VM_TMP_LD = 24,
+  TypeCVI_VM_VP_LDU = 25,
+  TypeCVI_VP = 26,
+  TypeCVI_VP_VS = 27,
+  TypeCVI_VS = 28,
+  TypeCVI_VX = 30,
+  TypeCVI_VX_DV = 31,
+  TypeDUPLEX = 32,
+  TypeENDLOOP = 33,
+  TypeEXTENDER = 34,
+  TypeJ = 35,
+  TypeLD = 36,
+  TypeM = 37,
+  TypeMAPPING = 38,
+  TypeNCJ = 39,
+  TypePSEUDO = 40,
+  TypeST = 41,
+  TypeSUBINSN = 42,
+  TypeS_2op = 43,
+  TypeS_3op = 44,
+  TypeV2LDST = 47,
+  TypeV4LDST = 48
+};
+}
+}
diff --git a/lib/Target/Hexagon/HexagonDepITypes.td b/lib/Target/Hexagon/HexagonDepITypes.td
new file mode 100644
index 0000000000000000000000000000000000000000..f1d689ce12f4319d73725c6eb7254dfe99b5b8fa
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepITypes.td
@@ -0,0 +1,48 @@
+//===--- HexagonDepITypes.td ----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class IType<bits<6> t> { bits<6> Value = t; }
+def TypeALU32_2op : IType<0>;
+def TypeALU32_3op : IType<1>;
+def TypeALU32_ADDI : IType<2>;
+def TypeALU64 : IType<3>;
+def TypeCJ : IType<4>;
+def TypeCOPROC_VMEM : IType<5>;
+def TypeCR : IType<7>;
+def TypeCVI_HIST : IType<10>;
+def TypeCVI_VA : IType<16>;
+def TypeCVI_VA_DV : IType<17>;
+def TypeCVI_VINLANESAT : IType<18>;
+def TypeCVI_VM_CUR_LD : IType<19>;
+def TypeCVI_VM_LD : IType<20>;
+def TypeCVI_VM_NEW_ST : IType<21>;
+def TypeCVI_VM_ST : IType<22>;
+def TypeCVI_VM_STU : IType<23>;
+def TypeCVI_VM_TMP_LD : IType<24>;
+def TypeCVI_VM_VP_LDU : IType<25>;
+def TypeCVI_VP : IType<26>;
+def TypeCVI_VP_VS : IType<27>;
+def TypeCVI_VS : IType<28>;
+def TypeCVI_VX : IType<30>;
+def TypeCVI_VX_DV : IType<31>;
+def TypeDUPLEX : IType<32>;
+def TypeENDLOOP : IType<33>;
+def TypeEXTENDER : IType<34>;
+def TypeJ : IType<35>;
+def TypeLD : IType<36>;
+def TypeM : IType<37>;
+def TypeMAPPING : IType<38>;
+def TypeNCJ : IType<39>;
+def TypePSEUDO : IType<40>;
+def TypeST : IType<41>;
+def TypeSUBINSN : IType<42>;
+def TypeS_2op : IType<43>;
+def TypeS_3op : IType<44>;
+def TypeV2LDST : IType<47>;
+def TypeV4LDST : IType<48>;
diff --git a/lib/Target/Hexagon/HexagonDepInstrFormats.td b/lib/Target/Hexagon/HexagonDepInstrFormats.td
new file mode 100644
index 0000000000000000000000000000000000000000..d7a99f48803bf728df9061384b76b327af4c0e02
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepInstrFormats.td
@@ -0,0 +1,4182 @@
+//===--- HexagonDepInstrFormats.td ----------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+class Enc_12122225 : OpcodeHexagon {
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vx32;
+  let Inst{7-3} = Vx32{4-0};
+  bits <3> Qd8;
+  let Inst{2-0} = Qd8{2-0};
+}
+class Enc_16626097 : OpcodeHexagon {
+  bits <2> Qs4;
+  let Inst{6-5} = Qs4{1-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Vv32;
+  let Inst{12-8} = Vv32{4-0};
+  bits <5> Vw32;
+  let Inst{4-0} = Vw32{4-0};
+}
+class Enc_13397056 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{10-8} = Ii{2-0};
+  bits <2> Qv4;
+  let Inst{12-11} = Qv4{1-0};
+  bits <5> Vs32;
+  let Inst{4-0} = Vs32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7315939 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+  bits <6> n1;
+  let Inst{28-28} = n1{5-5};
+  let Inst{24-22} = n1{4-2};
+  let Inst{13-13} = n1{1-1};
+  let Inst{8-8} = n1{0-0};
+}
+class Enc_15275738 : OpcodeHexagon {
+  bits <12> Ii;
+  let Inst{26-25} = Ii{11-10};
+  let Inst{13-5} = Ii{9-1};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_12822813 : OpcodeHexagon {
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <5> Rxx32;
+  let Inst{4-0} = Rxx32{4-0};
+  bits <2> Pe4;
+  let Inst{6-5} = Pe4{1-0};
+}
+class Enc_10282127 : OpcodeHexagon {
+  bits <7> Ii;
+  let Inst{12-7} = Ii{6-1};
+  bits <8> II;
+  let Inst{13-13} = II{7-7};
+  let Inst{6-0} = II{6-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+}
+class Enc_14264243 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+  bits <4> Rt16;
+  let Inst{11-8} = Rt16{3-0};
+}
+class Enc_6778937 : OpcodeHexagon {
+  bits <5> Rxx32;
+  let Inst{20-16} = Rxx32{4-0};
+  bits <0> sgp10;
+}
+class Enc_5480539 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{12-8} = Vv32{4-0};
+  bits <3> Rt8;
+  let Inst{2-0} = Rt8{2-0};
+  bits <5> Vxx32;
+  let Inst{7-3} = Vxx32{4-0};
+}
+class Enc_11422009 : OpcodeHexagon {
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vy32;
+  let Inst{12-8} = Vy32{4-0};
+  bits <5> Vx32;
+  let Inst{4-0} = Vx32{4-0};
+}
+class Enc_16357011 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{8-4} = Vv32{4-0};
+  bits <5> Vt32;
+  let Inst{13-9} = Vt32{4-0};
+  bits <4> Vdd16;
+  let Inst{3-0} = Vdd16{3-0};
+}
+class Enc_4975051 : OpcodeHexagon {
+  bits <19> Ii;
+  let Inst{26-25} = Ii{18-17};
+  let Inst{20-16} = Ii{16-12};
+  let Inst{13-5} = Ii{11-3};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_14786238 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Rtt32;
+  let Inst{20-16} = Rtt32{4-0};
+  bits <5> Vx32;
+  let Inst{7-3} = Vx32{4-0};
+}
+class Enc_15472748 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_6773159 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{12-7} = Ii{5-0};
+  bits <5> II;
+  let Inst{4-0} = II{4-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+}
+class Enc_12535811 : OpcodeHexagon {
+  bits <2> Qv4;
+  let Inst{23-22} = Qv4{1-0};
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vx32;
+  let Inst{4-0} = Vx32{4-0};
+}
+class Enc_14007201 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{12-5} = Ii{7-0};
+  bits <8> II;
+  let Inst{22-16} = II{7-1};
+  let Inst{13-13} = II{0-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_2577026 : OpcodeHexagon {
+  bits <3> Qt8;
+  let Inst{2-0} = Qt8{2-0};
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{12-8} = Vv32{4-0};
+  bits <5> Vdd32;
+  let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_7305764 : OpcodeHexagon {
+  bits <5> II;
+  let Inst{12-8} = II{4-0};
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+}
+class Enc_11682941 : OpcodeHexagon {
+  bits <19> Ii;
+  let Inst{26-25} = Ii{18-17};
+  let Inst{20-16} = Ii{16-12};
+  let Inst{13-13} = Ii{11-11};
+  let Inst{7-0} = Ii{10-3};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_16376009 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{8-5} = Ii{5-2};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_13249928 : OpcodeHexagon {
+  bits <9> Ii;
+  let Inst{13-5} = Ii{8-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
+}
+class Enc_1971351 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{8-5} = Ii{4-1};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Ryy32;
+  let Inst{4-0} = Ryy32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_13715847 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{17-16} = Ii{5-4};
+  let Inst{6-3} = Ii{3-0};
+  bits <2> Pv4;
+  let Inst{1-0} = Pv4{1-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_13303422 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{8-5} = Ii{4-1};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_14574598 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{13-8} = Ii{5-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
+}
+class Enc_13094118 : OpcodeHexagon {
+  bits <5> Css32;
+  let Inst{20-16} = Css32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_4231995 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{13-8} = Ii{5-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_844699 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+  bits <4> n1;
+  let Inst{28-28} = n1{3-3};
+  let Inst{24-22} = n1{2-0};
+}
+class Enc_8752140 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{8-5} = Ii{5-2};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7978128 : OpcodeHexagon {
+  bits <1> Ii;
+  let Inst{8-8} = Ii{0-0};
+  bits <2> Qv4;
+  let Inst{23-22} = Qv4{1-0};
+}
+class Enc_10492541 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{6-3} = Ii{5-2};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_0 : OpcodeHexagon {
+}
+class Enc_15733946 : OpcodeHexagon {
+  bits <2> Pv4;
+  let Inst{12-11} = Pv4{1-0};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Vs32;
+  let Inst{4-0} = Vs32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_738356 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{13-13} = Ii{3-3};
+  let Inst{10-8} = Ii{2-0};
+  bits <2> Pv4;
+  let Inst{12-11} = Pv4{1-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+}
+class Enc_14400220 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{9-5} = Ii{4-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
+}
+class Enc_15194851 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <2> Pu4;
+  let Inst{6-5} = Pu4{1-0};
+  bits <5> Rx32;
+  let Inst{4-0} = Rx32{4-0};
+}
+class Enc_14172170 : OpcodeHexagon {
+  bits <1> Ii;
+  let Inst{5-5} = Ii{0-0};
+  bits <5> Vuu32;
+  let Inst{12-8} = Vuu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vdd32;
+  let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_10065510 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{6-3} = Ii{5-2};
+  bits <2> Pv4;
+  let Inst{1-0} = Pv4{1-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_14998517 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <3> Ns8;
+  let Inst{18-16} = Ns8{2-0};
+  bits <3> n1;
+  let Inst{29-29} = n1{2-2};
+  let Inst{26-25} = n1{1-0};
+}
+class Enc_16657398 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{17-16} = Ii{5-4};
+  let Inst{6-3} = Ii{3-0};
+  bits <2> Pv4;
+  let Inst{1-0} = Pv4{1-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+}
+class Enc_14620934 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+}
+class Enc_10075393 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{13-13} = Ii{3-3};
+  let Inst{10-8} = Ii{2-0};
+  bits <2> Pv4;
+  let Inst{12-11} = Pv4{1-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vs32;
+  let Inst{4-0} = Vs32{4-0};
+}
+class Enc_8638014 : OpcodeHexagon {
+  bits <16> Ii;
+  let Inst{21-21} = Ii{15-15};
+  let Inst{13-8} = Ii{14-9};
+  let Inst{2-0} = Ii{8-6};
+  bits <5> Vss32;
+  let Inst{7-3} = Vss32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_13261538 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{7-5} = Ii{2-0};
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{20-16} = Vv32{4-0};
+  bits <5> Vdd32;
+  let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_8990840 : OpcodeHexagon {
+  bits <13> Ii;
+  let Inst{26-25} = Ii{12-11};
+  let Inst{13-5} = Ii{10-2};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_5974204 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <5> Vvv32;
+  let Inst{12-8} = Vvv32{4-0};
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+}
+class Enc_4711514 : OpcodeHexagon {
+  bits <2> Qu4;
+  let Inst{9-8} = Qu4{1-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+}
+class Enc_11492529 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{6-3} = Ii{4-1};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9277990 : OpcodeHexagon {
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_6690615 : OpcodeHexagon {
+  bits <7> Ii;
+  let Inst{8-4} = Ii{6-2};
+  bits <4> Rt16;
+  let Inst{3-0} = Rt16{3-0};
+}
+class Enc_1220199 : OpcodeHexagon {
+  bits <2> Qv4;
+  let Inst{23-22} = Qv4{1-0};
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+}
+class Enc_7785569 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+  bits <6> n1;
+  let Inst{28-28} = n1{5-5};
+  let Inst{25-22} = n1{4-1};
+  let Inst{8-8} = n1{0-0};
+}
+class Enc_2880796 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{12-8} = Ii{4-0};
+  bits <5> II;
+  let Inst{22-21} = II{4-3};
+  let Inst{7-5} = II{2-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rx32;
+  let Inst{4-0} = Rx32{4-0};
+}
+class Enc_6858527 : OpcodeHexagon {
+  bits <2> Qs4;
+  let Inst{6-5} = Qs4{1-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Vv32;
+  let Inst{4-0} = Vv32{4-0};
+}
+class Enc_11863656 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Rtt32;
+  let Inst{20-16} = Rtt32{4-0};
+  bits <5> Vx32;
+  let Inst{4-0} = Vx32{4-0};
+}
+class Enc_151014 : OpcodeHexagon {
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+  bits <2> Px4;
+  let Inst{6-5} = Px4{1-0};
+}
+class Enc_10333841 : OpcodeHexagon {
+  bits <16> Ii;
+  let Inst{21-21} = Ii{15-15};
+  let Inst{13-8} = Ii{14-9};
+  let Inst{2-0} = Ii{8-6};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+}
+class Enc_14044877 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{13-13} = Ii{5-5};
+  let Inst{7-3} = Ii{4-0};
+  bits <2> Pv4;
+  let Inst{1-0} = Pv4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+}
+class Enc_13691337 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{20-16} = Vv32{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+  bits <2> Qx4;
+  let Inst{6-5} = Qx4{1-0};
+}
+class Enc_3817033 : OpcodeHexagon {
+  bits <5> Vuu32;
+  let Inst{20-16} = Vuu32{4-0};
+  bits <3> Qt8;
+  let Inst{10-8} = Qt8{2-0};
+  bits <5> Vdd32;
+  let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_3540372 : OpcodeHexagon {
+  bits <5> Rtt32;
+  let Inst{20-16} = Rtt32{4-0};
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+}
+class Enc_5200852 : OpcodeHexagon {
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_15949334 : OpcodeHexagon {
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_3831744 : OpcodeHexagon {
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
+}
+class Enc_8280533 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{7-5} = Ii{2-0};
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{20-16} = Vv32{4-0};
+  bits <5> Vx32;
+  let Inst{4-0} = Vx32{4-0};
+}
+class Enc_10969213 : OpcodeHexagon {
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Vvv32;
+  let Inst{12-8} = Vvv32{4-0};
+  bits <5> Vw32;
+  let Inst{4-0} = Vw32{4-0};
+}
+class Enc_3974695 : OpcodeHexagon {
+  bits <7> Ii;
+  let Inst{10-4} = Ii{6-0};
+  bits <4> Rx16;
+  let Inst{3-0} = Rx16{3-0};
+}
+class Enc_7255914 : OpcodeHexagon {
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7212930 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{8-5} = Ii{4-1};
+  bits <2> Pt4;
+  let Inst{10-9} = Pt4{1-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_12781442 : OpcodeHexagon {
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <2> Qd4;
+  let Inst{1-0} = Qd4{1-0};
+}
+class Enc_799555 : OpcodeHexagon {
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+}
+class Enc_11083408 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{23-19} = Vv32{4-0};
+  bits <3> Rt8;
+  let Inst{18-16} = Rt8{2-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+}
+class Enc_900013 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+}
+class Enc_9487067 : OpcodeHexagon {
+  bits <12> Ii;
+  let Inst{19-16} = Ii{11-8};
+  let Inst{12-5} = Ii{7-0};
+  bits <2> Pu4;
+  let Inst{22-21} = Pu4{1-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_16014536 : OpcodeHexagon {
+  bits <10> Ii;
+  let Inst{21-21} = Ii{9-9};
+  let Inst{13-5} = Ii{8-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
+}
+class Enc_12419313 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+  bits <4> n1;
+  let Inst{28-28} = n1{3-3};
+  let Inst{24-23} = n1{2-1};
+  let Inst{13-13} = n1{0-0};
+}
+class Enc_5503430 : OpcodeHexagon {
+  bits <5> Vuu32;
+  let Inst{12-8} = Vuu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vdd32;
+  let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_14767681 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{23-19} = Vv32{4-0};
+  bits <3> Rt8;
+  let Inst{18-16} = Rt8{2-0};
+  bits <5> Vdd32;
+  let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_9093094 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{12-5} = Ii{7-0};
+  bits <8> II;
+  let Inst{22-16} = II{7-1};
+  let Inst{13-13} = II{0-0};
+  bits <2> Pu4;
+  let Inst{24-23} = Pu4{1-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_11542684 : OpcodeHexagon {
+  bits <16> Ii;
+  let Inst{27-21} = Ii{15-9};
+  let Inst{13-5} = Ii{8-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_8877260 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{23-19} = Vv32{4-0};
+  bits <3> Rt8;
+  let Inst{18-16} = Rt8{2-0};
+  bits <5> Vx32;
+  let Inst{4-0} = Vx32{4-0};
+}
+class Enc_1737833 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{13-13} = Ii{5-5};
+  let Inst{7-3} = Ii{4-0};
+  bits <2> Pv4;
+  let Inst{1-0} = Pv4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+}
+class Enc_255516 : OpcodeHexagon {
+  bits <5> Vuu32;
+  let Inst{20-16} = Vuu32{4-0};
+  bits <5> Vv32;
+  let Inst{12-8} = Vv32{4-0};
+  bits <5> Vdd32;
+  let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_10721363 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{13-13} = Ii{1-1};
+  let Inst{7-7} = Ii{0-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_7076358 : OpcodeHexagon {
+  bits <5> Zdd8;
+  let Inst{4-0} = Zdd8{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_11930928 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{12-8} = Ii{4-0};
+  bits <5> II;
+  let Inst{22-21} = II{4-3};
+  let Inst{7-5} = II{2-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_2410156 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{12-8} = Ii{4-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rx32;
+  let Inst{4-0} = Rx32{4-0};
+}
+class Enc_6735062 : OpcodeHexagon {
+  bits <2> Ps4;
+  let Inst{17-16} = Ps4{1-0};
+  bits <2> Pt4;
+  let Inst{9-8} = Pt4{1-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_7965855 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{12-8} = Vv32{4-0};
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+}
+class Enc_5202340 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vyy32;
+  let Inst{4-0} = Vyy32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_10568534 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{12-5} = Ii{7-0};
+  bits <2> Pu4;
+  let Inst{22-21} = Pu4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_16730127 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{7-5} = Ii{2-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_11224149 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{13-13} = Ii{7-7};
+  let Inst{7-3} = Ii{6-2};
+  bits <2> Pv4;
+  let Inst{1-0} = Pv4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+}
+class Enc_9772987 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{13-13} = Ii{1-1};
+  let Inst{7-7} = Ii{0-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Ru32;
+  let Inst{12-8} = Ru32{4-0};
+  bits <5> Rtt32;
+  let Inst{4-0} = Rtt32{4-0};
+}
+class Enc_9238139 : OpcodeHexagon {
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Zdd8;
+  let Inst{4-0} = Zdd8{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_2082775 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{11-8} = Ii{3-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_5790679 : OpcodeHexagon {
+  bits <9> Ii;
+  let Inst{12-8} = Ii{8-4};
+  let Inst{4-3} = Ii{3-2};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+}
+class Enc_9305257 : OpcodeHexagon {
+  bits <5> Zu8;
+  let Inst{12-8} = Zu8{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+}
+class Enc_3735566 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{10-8} = Ii{2-0};
+  bits <2> Pv4;
+  let Inst{12-11} = Pv4{1-0};
+  bits <3> Os8;
+  let Inst{2-0} = Os8{2-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_12654528 : OpcodeHexagon {
+  bits <2> Qs4;
+  let Inst{6-5} = Qs4{1-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Vvv32;
+  let Inst{4-0} = Vvv32{4-0};
+}
+class Enc_15290236 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{20-16} = Vv32{4-0};
+  bits <5> Vdd32;
+  let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_11139981 : OpcodeHexagon {
+  bits <2> Ps4;
+  let Inst{17-16} = Ps4{1-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_15546666 : OpcodeHexagon {
+  bits <9> Ii;
+  let Inst{10-8} = Ii{8-6};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_486163 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{13-13} = Ii{1-1};
+  let Inst{7-7} = Ii{0-0};
+  bits <6> II;
+  let Inst{11-8} = II{5-2};
+  let Inst{6-5} = II{1-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_2079016 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{1-0} = Ii{1-0};
+  bits <4> Rs16;
+  let Inst{7-4} = Rs16{3-0};
+}
+class Enc_10095813 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Rtt32;
+  let Inst{20-16} = Rtt32{4-0};
+  bits <5> Vdd32;
+  let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_13133322 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <5> Vx32;
+  let Inst{7-3} = Vx32{4-0};
+}
+class Enc_9422954 : OpcodeHexagon {
+  bits <2> Pu4;
+  let Inst{9-8} = Pu4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_10642833 : OpcodeHexagon {
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Vs32;
+  let Inst{7-3} = Vs32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_14989332 : OpcodeHexagon {
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Vv32;
+  let Inst{4-0} = Vv32{4-0};
+}
+class Enc_10263630 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{12-8} = Vv32{4-0};
+  bits <3> Rt8;
+  let Inst{2-0} = Rt8{2-0};
+  bits <5> Vx32;
+  let Inst{7-3} = Vx32{4-0};
+}
+class Enc_13937564 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{13-13} = Ii{3-3};
+  let Inst{10-8} = Ii{2-0};
+  bits <2> Pv4;
+  let Inst{12-11} = Pv4{1-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <3> Os8;
+  let Inst{2-0} = Os8{2-0};
+}
+class Enc_7171569 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{7-5} = Ii{2-0};
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{20-16} = Vv32{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+}
+class Enc_2702036 : OpcodeHexagon {
+  bits <10> Ii;
+  let Inst{21-21} = Ii{9-9};
+  let Inst{13-5} = Ii{8-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_1928953 : OpcodeHexagon {
+  bits <2> Pu4;
+  let Inst{9-8} = Pu4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+}
+class Enc_5853469 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+  bits <2> Pe4;
+  let Inst{6-5} = Pe4{1-0};
+}
+class Enc_7692963 : OpcodeHexagon {
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rx32;
+  let Inst{4-0} = Rx32{4-0};
+}
+class Enc_15140689 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <3> Ns8;
+  let Inst{18-16} = Ns8{2-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+}
+class Enc_748676 : OpcodeHexagon {
+  bits <12> Ii;
+  let Inst{26-25} = Ii{11-10};
+  let Inst{13-13} = Ii{9-9};
+  let Inst{7-0} = Ii{8-1};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+}
+class Enc_3372766 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{8-5} = Ii{4-1};
+  bits <5> Ryy32;
+  let Inst{4-0} = Ryy32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7900405 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{6-3} = Ii{5-2};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_11930027 : OpcodeHexagon {
+  bits <12> Ii;
+  let Inst{26-25} = Ii{11-10};
+  let Inst{13-5} = Ii{9-1};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Ryy32;
+  let Inst{4-0} = Ryy32{4-0};
+}
+class Enc_971574 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{22-21} = Ii{5-4};
+  let Inst{13-13} = Ii{3-3};
+  let Inst{7-5} = Ii{2-0};
+  bits <6> II;
+  let Inst{23-23} = II{5-5};
+  let Inst{4-0} = II{4-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{12-8} = Rd32{4-0};
+}
+class Enc_13453446 : OpcodeHexagon {
+  bits <24> Ii;
+  let Inst{24-16} = Ii{23-15};
+  let Inst{13-1} = Ii{14-2};
+}
+class Enc_6356866 : OpcodeHexagon {
+  bits <10> Ii;
+  let Inst{21-21} = Ii{9-9};
+  let Inst{13-5} = Ii{8-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rx32;
+  let Inst{4-0} = Rx32{4-0};
+}
+class Enc_16246706 : OpcodeHexagon {
+  bits <5> Vdd32;
+  let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_5326450 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{6-3} = Ii{3-0};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_11687333 : OpcodeHexagon {
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_2771456 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{12-8} = Ii{4-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_11282123 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{12-7} = Ii{5-0};
+  bits <8> II;
+  let Inst{13-13} = II{7-7};
+  let Inst{6-0} = II{6-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+}
+class Enc_518319 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{20-16} = Ii{5-1};
+  let Inst{5-5} = Ii{0-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_16104442 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Rtt32;
+  let Inst{20-16} = Rtt32{4-0};
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+}
+class Enc_7912540 : OpcodeHexagon {
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rxx32;
+  let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_15560488 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{10-8} = Ii{2-0};
+  bits <2> Pv4;
+  let Inst{12-11} = Pv4{1-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7581852 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{13-13} = Ii{1-1};
+  let Inst{7-7} = Ii{0-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_10030031 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+}
+class Enc_3915770 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{6-3} = Ii{3-0};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_4075554 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_11326438 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{6-3} = Ii{5-2};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_4050532 : OpcodeHexagon {
+  bits <16> Ii;
+  let Inst{26-25} = Ii{15-14};
+  let Inst{20-16} = Ii{13-9};
+  let Inst{13-13} = Ii{8-8};
+  let Inst{7-0} = Ii{7-0};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+}
+class Enc_14461004 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{26-25} = Ii{10-9};
+  let Inst{13-5} = Ii{8-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_13344657 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{20-16} = Ii{5-1};
+  let Inst{8-8} = Ii{0-0};
+  bits <2> Pt4;
+  let Inst{10-9} = Pt4{1-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_13114546 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{13-13} = Ii{1-1};
+  let Inst{5-5} = Ii{0-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rxx32;
+  let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_14530015 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+  bits <6> n1;
+  let Inst{28-28} = n1{5-5};
+  let Inst{25-23} = n1{4-2};
+  let Inst{13-13} = n1{1-1};
+  let Inst{8-8} = n1{0-0};
+}
+class Enc_5967898 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{12-7} = Ii{5-0};
+  bits <6> II;
+  let Inst{13-13} = II{5-5};
+  let Inst{4-0} = II{4-0};
+  bits <2> Pv4;
+  let Inst{6-5} = Pv4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+}
+class Enc_15450971 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+  bits <6> n1;
+  let Inst{28-28} = n1{5-5};
+  let Inst{25-22} = n1{4-1};
+  let Inst{13-13} = n1{0-0};
+}
+class Enc_15536400 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{3-0} = Ii{5-2};
+  bits <4> Rs16;
+  let Inst{7-4} = Rs16{3-0};
+}
+class Enc_1291652 : OpcodeHexagon {
+  bits <1> Ii;
+  let Inst{8-8} = Ii{0-0};
+}
+class Enc_5636753 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+}
+class Enc_5757366 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{13-13} = Ii{3-3};
+  let Inst{10-8} = Ii{2-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vs32;
+  let Inst{4-0} = Vs32{4-0};
+}
+class Enc_9752128 : OpcodeHexagon {
+  bits <7> Ii;
+  let Inst{8-5} = Ii{6-3};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_13618890 : OpcodeHexagon {
+  bits <17> Ii;
+  let Inst{26-25} = Ii{16-15};
+  let Inst{20-16} = Ii{14-10};
+  let Inst{13-13} = Ii{9-9};
+  let Inst{7-0} = Ii{8-1};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+}
+class Enc_5890213 : OpcodeHexagon {
+  bits <5> Vuu32;
+  let Inst{12-8} = Vuu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vx32;
+  let Inst{4-0} = Vx32{4-0};
+}
+class Enc_5582416 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{13-13} = Ii{1-1};
+  let Inst{7-7} = Ii{0-0};
+  bits <6> II;
+  let Inst{11-8} = II{5-2};
+  let Inst{6-5} = II{1-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_13536408 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{3-0} = Ii{3-0};
+  bits <4> Rs16;
+  let Inst{7-4} = Rs16{3-0};
+}
+class Enc_9773189 : OpcodeHexagon {
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Ru32;
+  let Inst{4-0} = Ru32{4-0};
+  bits <5> Rxx32;
+  let Inst{12-8} = Rxx32{4-0};
+}
+class Enc_2152247 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{13-13} = Ii{3-3};
+  let Inst{10-8} = Ii{2-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <3> Os8;
+  let Inst{2-0} = Os8{2-0};
+}
+class Enc_12848507 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{13-13} = Ii{1-1};
+  let Inst{6-6} = Ii{0-0};
+  bits <6> II;
+  let Inst{5-0} = II{5-0};
+  bits <5> Ru32;
+  let Inst{20-16} = Ru32{4-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_16279406 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{13-13} = Ii{3-3};
+  let Inst{10-8} = Ii{2-0};
+  bits <2> Qv4;
+  let Inst{12-11} = Qv4{1-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vs32;
+  let Inst{4-0} = Vs32{4-0};
+}
+class Enc_1734121 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{10-8} = Ii{3-1};
+  bits <4> Rs16;
+  let Inst{7-4} = Rs16{3-0};
+  bits <4> Rt16;
+  let Inst{3-0} = Rt16{3-0};
+}
+class Enc_766909 : OpcodeHexagon {
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+  bits <2> Pe4;
+  let Inst{6-5} = Pe4{1-0};
+}
+class Enc_4527648 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
+}
+class Enc_8849208 : OpcodeHexagon {
+  bits <7> Ii;
+  let Inst{12-7} = Ii{6-1};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{4-0} = Rt32{4-0};
+}
+class Enc_9894557 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{13-8} = Ii{5-0};
+  bits <6> II;
+  let Inst{23-21} = II{5-3};
+  let Inst{7-5} = II{2-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_4109168 : OpcodeHexagon {
+  bits <2> Qv4;
+  let Inst{23-22} = Qv4{1-0};
+}
+class Enc_14560494 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{10-8} = Ii{2-0};
+  bits <2> Pv4;
+  let Inst{12-11} = Pv4{1-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9773167 : OpcodeHexagon {
+  bits <7> Ii;
+  let Inst{12-7} = Ii{6-1};
+  bits <5> II;
+  let Inst{4-0} = II{4-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+}
+class Enc_1898420 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <3> Ns8;
+  let Inst{18-16} = Ns8{2-0};
+}
+class Enc_11498120 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <2> Qd4;
+  let Inst{1-0} = Qd4{1-0};
+}
+class Enc_15459921 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{10-8} = Ii{2-0};
+  bits <2> Pv4;
+  let Inst{12-11} = Pv4{1-0};
+  bits <5> Vs32;
+  let Inst{4-0} = Vs32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_10058269 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vx32;
+  let Inst{4-0} = Vx32{4-0};
+}
+class Enc_10197700 : OpcodeHexagon {
+  bits <5> Vuu32;
+  let Inst{20-16} = Vuu32{4-0};
+  bits <5> Vvv32;
+  let Inst{12-8} = Vvv32{4-0};
+  bits <3> Rt8;
+  let Inst{2-0} = Rt8{2-0};
+  bits <5> Vdd32;
+  let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_12608570 : OpcodeHexagon {
+  bits <17> Ii;
+  let Inst{26-25} = Ii{16-15};
+  let Inst{20-16} = Ii{14-10};
+  let Inst{13-5} = Ii{9-1};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_4804090 : OpcodeHexagon {
+  bits <6> Ss64;
+  let Inst{21-16} = Ss64{5-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_14973146 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{12-8} = Vv32{4-0};
+  bits <3> Qd8;
+  let Inst{5-3} = Qd8{2-0};
+}
+class Enc_5718302 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+  bits <2> Pe4;
+  let Inst{6-5} = Pe4{1-0};
+}
+class Enc_2103742 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{12-8} = Ii{4-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
+}
+class Enc_7564330 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{12-8} = Vv32{4-0};
+  bits <3> Rt8;
+  let Inst{2-0} = Rt8{2-0};
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+}
+class Enc_2176383 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{9-4} = Ii{5-0};
+  bits <4> Rd16;
+  let Inst{3-0} = Rd16{3-0};
+}
+class Enc_7736768 : OpcodeHexagon {
+  bits <12> Ii;
+  let Inst{26-25} = Ii{11-10};
+  let Inst{13-13} = Ii{9-9};
+  let Inst{7-0} = Ii{8-1};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+}
+class Enc_13189194 : OpcodeHexagon {
+  bits <1> Ii;
+  let Inst{5-5} = Ii{0-0};
+  bits <5> Vuu32;
+  let Inst{12-8} = Vuu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vxx32;
+  let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_5154851 : OpcodeHexagon {
+  bits <5> Rtt32;
+  let Inst{20-16} = Rtt32{4-0};
+  bits <5> Vdd32;
+  let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_1329520 : OpcodeHexagon {
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Cdd32;
+  let Inst{4-0} = Cdd32{4-0};
+}
+class Enc_14057553 : OpcodeHexagon {
+  bits <16> Ii;
+  let Inst{21-21} = Ii{15-15};
+  let Inst{13-8} = Ii{14-9};
+  let Inst{2-0} = Ii{8-6};
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9223889 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rx32;
+  let Inst{4-0} = Rx32{4-0};
+}
+class Enc_10979813 : OpcodeHexagon {
+  bits <7> Ii;
+  let Inst{13-13} = Ii{6-6};
+  let Inst{7-3} = Ii{5-1};
+  bits <2> Pv4;
+  let Inst{1-0} = Pv4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+}
+class Enc_13490067 : OpcodeHexagon {
+  bits <3> Qt8;
+  let Inst{2-0} = Qt8{2-0};
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{12-8} = Vv32{4-0};
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+}
+class Enc_10076500 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{13-13} = Ii{1-1};
+  let Inst{6-6} = Ii{0-0};
+  bits <6> II;
+  let Inst{5-0} = II{5-0};
+  bits <5> Ru32;
+  let Inst{20-16} = Ru32{4-0};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+}
+class Enc_163381 : OpcodeHexagon {
+  bits <14> Ii;
+  let Inst{26-25} = Ii{13-12};
+  let Inst{13-5} = Ii{11-3};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_10328975 : OpcodeHexagon {
+  bits <2> Pt4;
+  let Inst{9-8} = Pt4{1-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_14939491 : OpcodeHexagon {
+  bits <4> Rs16;
+  let Inst{7-4} = Rs16{3-0};
+  bits <4> Rd16;
+  let Inst{3-0} = Rd16{3-0};
+}
+class Enc_8891794 : OpcodeHexagon {
+  bits <2> Pt4;
+  let Inst{9-8} = Pt4{1-0};
+  bits <2> Ps4;
+  let Inst{17-16} = Ps4{1-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
+}
+class Enc_7723767 : OpcodeHexagon {
+  bits <5> Vuu32;
+  let Inst{12-8} = Vuu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+}
+class Enc_2639299 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+  bits <4> Rd16;
+  let Inst{11-8} = Rd16{3-0};
+}
+class Enc_11552785 : OpcodeHexagon {
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <2> Pu4;
+  let Inst{6-5} = Pu4{1-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_11849200 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{12-7} = Ii{5-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{4-0} = Rt32{4-0};
+}
+class Enc_14868535 : OpcodeHexagon {
+  bits <17> Ii;
+  let Inst{23-22} = Ii{16-15};
+  let Inst{20-16} = Ii{14-10};
+  let Inst{13-13} = Ii{9-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <2> Pu4;
+  let Inst{9-8} = Pu4{1-0};
+}
+class Enc_48594 : OpcodeHexagon {
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6608821 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{13-13} = Ii{3-3};
+  let Inst{10-8} = Ii{2-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <3> Os8;
+  let Inst{2-0} = Os8{2-0};
+}
+class Enc_11049656 : OpcodeHexagon {
+  bits <9> Ii;
+  let Inst{13-13} = Ii{8-8};
+  let Inst{7-3} = Ii{7-3};
+  bits <2> Pv4;
+  let Inst{1-0} = Pv4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_117962 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{23-21} = Ii{7-5};
+  let Inst{13-13} = Ii{4-4};
+  let Inst{7-5} = Ii{3-1};
+  let Inst{3-3} = Ii{0-0};
+  bits <5> II;
+  let Inst{12-8} = II{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_5900401 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{6-3} = Ii{3-0};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_36641 : OpcodeHexagon {
+  bits <5> Vuu32;
+  let Inst{12-8} = Vuu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+}
+class Enc_9626139 : OpcodeHexagon {
+  bits <2> Pu4;
+  let Inst{6-5} = Pu4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_11971407 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{7-5} = Ii{2-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_9852473 : OpcodeHexagon {
+  bits <13> Ii;
+  let Inst{26-25} = Ii{12-11};
+  let Inst{13-5} = Ii{10-2};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_6495334 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{22-21} = Ii{5-4};
+  let Inst{13-13} = Ii{3-3};
+  let Inst{7-5} = Ii{2-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Ru32;
+  let Inst{4-0} = Ru32{4-0};
+  bits <5> Rd32;
+  let Inst{12-8} = Rd32{4-0};
+}
+class Enc_1186018 : OpcodeHexagon {
+  bits <17> Ii;
+  let Inst{26-25} = Ii{16-15};
+  let Inst{20-16} = Ii{14-10};
+  let Inst{13-13} = Ii{9-9};
+  let Inst{7-0} = Ii{8-1};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+}
+class Enc_15999208 : OpcodeHexagon {
+  bits <18> Ii;
+  let Inst{26-25} = Ii{17-16};
+  let Inst{20-16} = Ii{15-11};
+  let Inst{13-13} = Ii{10-10};
+  let Inst{7-0} = Ii{9-2};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+}
+class Enc_11477246 : OpcodeHexagon {
+  bits <6> II;
+  let Inst{5-0} = II{5-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Re32;
+  let Inst{20-16} = Re32{4-0};
+}
+class Enc_7971062 : OpcodeHexagon {
+  bits <16> Ii;
+  let Inst{23-22} = Ii{15-14};
+  let Inst{20-16} = Ii{13-9};
+  let Inst{13-5} = Ii{8-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_4327792 : OpcodeHexagon {
+  bits <5> Vuu32;
+  let Inst{12-8} = Vuu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vxx32;
+  let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_10326434 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{6-3} = Ii{4-1};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_1572239 : OpcodeHexagon {
+  bits <2> Qt4;
+  let Inst{6-5} = Qt4{1-0};
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{20-16} = Vv32{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+}
+class Enc_6372758 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{8-5} = Ii{3-0};
+  bits <5> Ryy32;
+  let Inst{4-0} = Ryy32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_15793331 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{12-8} = Vv32{4-0};
+  bits <5> Vx32;
+  let Inst{7-3} = Vx32{4-0};
+}
+class Enc_11424254 : OpcodeHexagon {
+  bits <2> Qt4;
+  let Inst{6-5} = Qt4{1-0};
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{20-16} = Vv32{4-0};
+  bits <5> Vdd32;
+  let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_4983213 : OpcodeHexagon {
+  bits <14> Ii;
+  let Inst{10-0} = Ii{13-3};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+}
+class Enc_16035138 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+}
+class Enc_8225953 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{13-13} = Ii{7-7};
+  let Inst{7-3} = Ii{6-2};
+  bits <2> Pv4;
+  let Inst{1-0} = Pv4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+}
+class Enc_4397470 : OpcodeHexagon {
+  bits <5> II;
+  let Inst{12-8} = II{4-0};
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <3> Ns8;
+  let Inst{18-16} = Ns8{2-0};
+}
+class Enc_1004392 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{12-8} = Vv32{4-0};
+  bits <5> Vxx32;
+  let Inst{7-3} = Vxx32{4-0};
+}
+class Enc_16319737 : OpcodeHexagon {
+  bits <14> Ii;
+  let Inst{26-25} = Ii{13-12};
+  let Inst{13-13} = Ii{11-11};
+  let Inst{7-0} = Ii{10-3};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_2296022 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{10-8} = Ii{2-0};
+  bits <5> Vs32;
+  let Inst{4-0} = Vs32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9664427 : OpcodeHexagon {
+  bits <5> Vuu32;
+  let Inst{20-16} = Vuu32{4-0};
+  bits <5> Vvv32;
+  let Inst{12-8} = Vvv32{4-0};
+  bits <3> Qss8;
+  let Inst{2-0} = Qss8{2-0};
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+}
+class Enc_877823 : OpcodeHexagon {
+  bits <6> II;
+  let Inst{11-8} = II{5-2};
+  let Inst{6-5} = II{1-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+  bits <5> Re32;
+  let Inst{20-16} = Re32{4-0};
+}
+class Enc_1589406 : OpcodeHexagon {
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <3> Os8;
+  let Inst{2-0} = Os8{2-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6900405 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{6-3} = Ii{4-1};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_14150875 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+  bits <5> n1;
+  let Inst{28-28} = n1{4-4};
+  let Inst{25-22} = n1{3-0};
+}
+class Enc_15707793 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Gd32;
+  let Inst{4-0} = Gd32{4-0};
+}
+class Enc_14689096 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{13-13} = Ii{1-1};
+  let Inst{6-6} = Ii{0-0};
+  bits <6> II;
+  let Inst{5-0} = II{5-0};
+  bits <5> Ru32;
+  let Inst{20-16} = Ru32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+}
+class Enc_9915754 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{6-3} = Ii{5-2};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7470998 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{20-16} = Vv32{4-0};
+  bits <2> Qx4;
+  let Inst{1-0} = Qx4{1-0};
+}
+class Enc_11471622 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vdd32;
+  let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_14363183 : OpcodeHexagon {
+  bits <2> Qv4;
+  let Inst{23-22} = Qv4{1-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+}
+class Enc_15816255 : OpcodeHexagon {
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_5321335 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{12-8} = Vv32{4-0};
+  bits <3> Rt8;
+  let Inst{2-0} = Rt8{2-0};
+  bits <4> Vdd16;
+  let Inst{7-4} = Vdd16{3-0};
+}
+class Enc_12702821 : OpcodeHexagon {
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <5> Rxx32;
+  let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_449439 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{26-25} = Ii{10-9};
+  let Inst{13-5} = Ii{8-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Ryy32;
+  let Inst{4-0} = Ryy32{4-0};
+}
+class Enc_2054304 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <6> Sd64;
+  let Inst{5-0} = Sd64{5-0};
+}
+class Enc_236434 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{22-21} = Ii{5-4};
+  let Inst{13-13} = Ii{3-3};
+  let Inst{7-5} = Ii{2-0};
+  bits <5> Ru32;
+  let Inst{4-0} = Ru32{4-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{12-8} = Rd32{4-0};
+}
+class Enc_5598813 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{8-5} = Ii{3-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_8409782 : OpcodeHexagon {
+  bits <13> Ii;
+  let Inst{26-25} = Ii{12-11};
+  let Inst{13-13} = Ii{10-10};
+  let Inst{7-0} = Ii{9-2};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+}
+class Enc_15182416 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{20-16} = Ii{5-1};
+  let Inst{8-8} = Ii{0-0};
+  bits <2> Pt4;
+  let Inst{10-9} = Pt4{1-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_4501395 : OpcodeHexagon {
+  bits <7> Ii;
+  let Inst{6-3} = Ii{6-3};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6039436 : OpcodeHexagon {
+  bits <3> Qtt8;
+  let Inst{2-0} = Qtt8{2-0};
+  bits <5> Vuu32;
+  let Inst{20-16} = Vuu32{4-0};
+  bits <5> Vvv32;
+  let Inst{12-8} = Vvv32{4-0};
+  bits <5> Vdd32;
+  let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_476163 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <3> Rt8;
+  let Inst{2-0} = Rt8{2-0};
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+  bits <5> Vy32;
+  let Inst{12-8} = Vy32{4-0};
+}
+class Enc_11281763 : OpcodeHexagon {
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Vs32;
+  let Inst{4-0} = Vs32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9929262 : OpcodeHexagon {
+  bits <16> Ii;
+  let Inst{21-21} = Ii{15-15};
+  let Inst{13-8} = Ii{14-9};
+  let Inst{2-0} = Ii{8-6};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vs32;
+  let Inst{7-3} = Vs32{4-0};
+}
+class Enc_13174858 : OpcodeHexagon {
+  bits <16> Ii;
+  let Inst{21-21} = Ii{15-15};
+  let Inst{13-8} = Ii{14-9};
+  let Inst{2-0} = Ii{8-6};
+  bits <5> Vs32;
+  let Inst{7-3} = Vs32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_8437395 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{13-13} = Ii{3-3};
+  let Inst{10-8} = Ii{2-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+}
+class Enc_16578332 : OpcodeHexagon {
+  bits <9> Ii;
+  let Inst{10-8} = Ii{8-6};
+  bits <5> Zdd8;
+  let Inst{4-0} = Zdd8{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_12829314 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+}
+class Enc_9744403 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{13-9} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{8-4} = Vv32{4-0};
+  bits <4> Vdd16;
+  let Inst{3-0} = Vdd16{3-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_10968391 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+  bits <7> n1;
+  let Inst{28-28} = n1{6-6};
+  let Inst{25-22} = n1{5-2};
+  let Inst{13-13} = n1{1-1};
+  let Inst{8-8} = n1{0-0};
+}
+class Enc_64199 : OpcodeHexagon {
+  bits <7> Ii;
+  let Inst{8-4} = Ii{6-2};
+  bits <4> Rd16;
+  let Inst{3-0} = Rd16{3-0};
+}
+class Enc_11039423 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{10-8} = Ii{2-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6730375 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <3> Ns8;
+  let Inst{18-16} = Ns8{2-0};
+}
+class Enc_16213761 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{23-19} = Vv32{4-0};
+  bits <3> Rt8;
+  let Inst{18-16} = Rt8{2-0};
+  bits <5> Vxx32;
+  let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_13204995 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{11-8} = Ii{3-0};
+  bits <4> Rs16;
+  let Inst{7-4} = Rs16{3-0};
+  bits <4> Rt16;
+  let Inst{3-0} = Rt16{3-0};
+}
+class Enc_13338314 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{13-13} = Ii{3-3};
+  let Inst{10-8} = Ii{2-0};
+  bits <2> Pv4;
+  let Inst{12-11} = Pv4{1-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+}
+class Enc_9920336 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{13-13} = Ii{1-1};
+  let Inst{7-7} = Ii{0-0};
+  bits <2> Pv4;
+  let Inst{6-5} = Pv4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Ru32;
+  let Inst{12-8} = Ru32{4-0};
+  bits <5> Rtt32;
+  let Inst{4-0} = Rtt32{4-0};
+}
+class Enc_15380240 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <3> Rt8;
+  let Inst{2-0} = Rt8{2-0};
+  bits <5> Vdd32;
+  let Inst{7-3} = Vdd32{4-0};
+  bits <5> Vy32;
+  let Inst{12-8} = Vy32{4-0};
+}
+class Enc_3296020 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{10-8} = Ii{2-0};
+  bits <5> Vs32;
+  let Inst{4-0} = Vs32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_2428539 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+  bits <4> n1;
+  let Inst{28-28} = n1{3-3};
+  let Inst{24-23} = n1{2-1};
+  let Inst{8-8} = n1{0-0};
+}
+class Enc_10039393 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{10-8} = Ii{2-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9372046 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{13-13} = Ii{3-3};
+  let Inst{10-8} = Ii{2-0};
+  bits <2> Pv4;
+  let Inst{12-11} = Pv4{1-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <3> Os8;
+  let Inst{2-0} = Os8{2-0};
+}
+class Enc_2901241 : OpcodeHexagon {
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_16145290 : OpcodeHexagon {
+  bits <2> Ps4;
+  let Inst{6-5} = Ps4{1-0};
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{20-16} = Vv32{4-0};
+  bits <5> Vdd32;
+  let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_13783220 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Rtt32;
+  let Inst{20-16} = Rtt32{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+}
+class Enc_12261611 : OpcodeHexagon {
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Ryy32;
+  let Inst{4-0} = Ryy32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6135183 : OpcodeHexagon {
+  bits <4> Rs16;
+  let Inst{7-4} = Rs16{3-0};
+  bits <4> Rx16;
+  let Inst{3-0} = Rx16{3-0};
+}
+class Enc_5523416 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{13-8} = Ii{5-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_13472494 : OpcodeHexagon {
+  bits <10> Ii;
+  let Inst{21-21} = Ii{9-9};
+  let Inst{13-5} = Ii{8-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_16303398 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{8-5} = Ii{3-0};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_3494181 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{7-5} = Ii{2-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_13983714 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{20-16} = Vv32{4-0};
+  bits <2> Qd4;
+  let Inst{1-0} = Qd4{1-0};
+}
+class Enc_931653 : OpcodeHexagon {
+  bits <7> Ii;
+  let Inst{8-5} = Ii{6-3};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7622936 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <3> Rt8;
+  let Inst{2-0} = Rt8{2-0};
+  bits <5> Vxx32;
+  let Inst{7-3} = Vxx32{4-0};
+  bits <5> Vy32;
+  let Inst{12-8} = Vy32{4-0};
+}
+class Enc_8773155 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{12-7} = Ii{7-2};
+  bits <5> II;
+  let Inst{4-0} = II{4-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+}
+class Enc_5401217 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+  bits <3> n1;
+  let Inst{28-28} = n1{2-2};
+  let Inst{24-23} = n1{1-0};
+}
+class Enc_6736678 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{12-5} = Ii{7-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
+}
+class Enc_3457570 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{7-5} = Ii{2-0};
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{20-16} = Vv32{4-0};
+  bits <5> Vxx32;
+  let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_3813442 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{6-3} = Ii{4-1};
+  bits <2> Pv4;
+  let Inst{1-0} = Pv4{1-0};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_3135259 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{10-8} = Ii{2-0};
+  bits <4> Rs16;
+  let Inst{7-4} = Rs16{3-0};
+  bits <4> Rd16;
+  let Inst{3-0} = Rd16{3-0};
+}
+class Enc_5486172 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{13-13} = Ii{1-1};
+  let Inst{7-7} = Ii{0-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Ru32;
+  let Inst{12-8} = Ru32{4-0};
+  bits <3> Nt8;
+  let Inst{2-0} = Nt8{2-0};
+}
+class Enc_11081334 : OpcodeHexagon {
+  bits <16> Ii;
+  let Inst{21-21} = Ii{15-15};
+  let Inst{13-8} = Ii{14-9};
+  let Inst{2-0} = Ii{8-6};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vss32;
+  let Inst{7-3} = Vss32{4-0};
+}
+class Enc_9470751 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{13-13} = Ii{3-3};
+  let Inst{10-8} = Ii{2-0};
+  bits <2> Pv4;
+  let Inst{12-11} = Pv4{1-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vs32;
+  let Inst{4-0} = Vs32{4-0};
+}
+class Enc_2683366 : OpcodeHexagon {
+  bits <3> Quu8;
+  let Inst{10-8} = Quu8{2-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <3> Qdd8;
+  let Inst{5-3} = Qdd8{2-0};
+}
+class Enc_15830826 : OpcodeHexagon {
+  bits <14> Ii;
+  let Inst{10-0} = Ii{13-3};
+}
+class Enc_4967902 : OpcodeHexagon {
+  bits <7> Ii;
+  let Inst{12-7} = Ii{6-1};
+  bits <6> II;
+  let Inst{13-13} = II{5-5};
+  let Inst{4-0} = II{4-0};
+  bits <2> Pv4;
+  let Inst{6-5} = Pv4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+}
+class Enc_14287645 : OpcodeHexagon {
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_8324216 : OpcodeHexagon {
+  bits <2> Ps4;
+  let Inst{17-16} = Ps4{1-0};
+  bits <2> Pt4;
+  let Inst{9-8} = Pt4{1-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
+}
+class Enc_913538 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <3> Qd8;
+  let Inst{5-3} = Qd8{2-0};
+}
+class Enc_16311032 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <5> Rx32;
+  let Inst{4-0} = Rx32{4-0};
+}
+class Enc_9864697 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{12-5} = Ii{7-0};
+  bits <6> II;
+  let Inst{20-16} = II{5-1};
+  let Inst{13-13} = II{0-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_11205051 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{11-8} = Ii{5-2};
+  bits <4> Rs16;
+  let Inst{7-4} = Rs16{3-0};
+  bits <4> Rt16;
+  let Inst{3-0} = Rt16{3-0};
+}
+class Enc_5611087 : OpcodeHexagon {
+  bits <7> Ii;
+  let Inst{8-5} = Ii{6-3};
+  bits <2> Pt4;
+  let Inst{10-9} = Pt4{1-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_10915758 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{6-3} = Ii{4-1};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_8943121 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_1539665 : OpcodeHexagon {
+  bits <5> Cs32;
+  let Inst{20-16} = Cs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_8479583 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <3> Ns8;
+  let Inst{18-16} = Ns8{2-0};
+  bits <5> n1;
+  let Inst{29-29} = n1{4-4};
+  let Inst{26-25} = n1{3-2};
+  let Inst{23-23} = n1{1-1};
+  let Inst{13-13} = n1{0-0};
+}
+class Enc_313333 : OpcodeHexagon {
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vx32;
+  let Inst{4-0} = Vx32{4-0};
+}
+class Enc_11544269 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <3> Ns8;
+  let Inst{18-16} = Ns8{2-0};
+  bits <4> n1;
+  let Inst{29-29} = n1{3-3};
+  let Inst{26-25} = n1{2-1};
+  let Inst{13-13} = n1{0-0};
+}
+class Enc_9018141 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Cd32;
+  let Inst{4-0} = Cd32{4-0};
+}
+class Enc_6152036 : OpcodeHexagon {
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Gdd32;
+  let Inst{4-0} = Gdd32{4-0};
+}
+class Enc_1954437 : OpcodeHexagon {
+  bits <6> Sss64;
+  let Inst{21-16} = Sss64{5-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_3742184 : OpcodeHexagon {
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_1835415 : OpcodeHexagon {
+  bits <7> Ii;
+  let Inst{10-5} = Ii{6-1};
+  bits <2> Pt4;
+  let Inst{12-11} = Pt4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_1085466 : OpcodeHexagon {
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vdd32;
+  let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_13150110 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{26-25} = Ii{10-9};
+  let Inst{13-13} = Ii{8-8};
+  let Inst{7-0} = Ii{7-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+}
+class Enc_6772177 : OpcodeHexagon {
+  bits <5> Zu8;
+  let Inst{12-8} = Zu8{4-0};
+  bits <5> Zd8;
+  let Inst{4-0} = Zd8{4-0};
+}
+class Enc_6616512 : OpcodeHexagon {
+  bits <16> Ii;
+  let Inst{21-21} = Ii{15-15};
+  let Inst{13-8} = Ii{14-9};
+  let Inst{2-0} = Ii{8-6};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vdd32;
+  let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_1886960 : OpcodeHexagon {
+  bits <16> Ii;
+  let Inst{26-25} = Ii{15-14};
+  let Inst{20-16} = Ii{13-9};
+  let Inst{13-5} = Ii{8-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_2835415 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{10-5} = Ii{7-2};
+  bits <2> Pt4;
+  let Inst{12-11} = Pt4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_14024197 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Rtt32;
+  let Inst{20-16} = Rtt32{4-0};
+  bits <5> Vxx32;
+  let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_12297800 : OpcodeHexagon {
+  bits <18> Ii;
+  let Inst{26-25} = Ii{17-16};
+  let Inst{20-16} = Ii{15-11};
+  let Inst{13-13} = Ii{10-10};
+  let Inst{7-0} = Ii{9-2};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+}
+class Enc_7254313 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{13-13} = Ii{1-1};
+  let Inst{7-7} = Ii{0-0};
+  bits <2> Pv4;
+  let Inst{6-5} = Pv4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_677558 : OpcodeHexagon {
+  bits <9> Ii;
+  let Inst{10-5} = Ii{8-3};
+  bits <2> Pt4;
+  let Inst{12-11} = Pt4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_6223403 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{20-16} = Vv32{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+}
+class Enc_674613 : OpcodeHexagon {
+  bits <5> Vuu32;
+  let Inst{20-16} = Vuu32{4-0};
+  bits <5> Vdd32;
+  let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_16479122 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{7-3} = Ii{7-3};
+  bits <3> Rdd8;
+  let Inst{2-0} = Rdd8{2-0};
+}
+class Enc_11704059 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+}
+class Enc_9165078 : OpcodeHexagon {
+  bits <9> Ii;
+  let Inst{8-3} = Ii{8-3};
+  bits <3> Rtt8;
+  let Inst{2-0} = Rtt8{2-0};
+}
+class Enc_15376009 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{8-5} = Ii{4-1};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_8838398 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{21-21} = Ii{3-3};
+  let Inst{7-5} = Ii{2-0};
+  bits <6> II;
+  let Inst{13-8} = II{5-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rx32;
+  let Inst{4-0} = Rx32{4-0};
+}
+class Enc_2328527 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{20-16} = Vv32{4-0};
+  bits <5> Vx32;
+  let Inst{4-0} = Vx32{4-0};
+}
+class Enc_1451363 : OpcodeHexagon {
+  bits <4> Rd16;
+  let Inst{3-0} = Rd16{3-0};
+}
+class Enc_4030179 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_13770697 : OpcodeHexagon {
+  bits <5> Ru32;
+  let Inst{4-0} = Ru32{4-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Ry32;
+  let Inst{12-8} = Ry32{4-0};
+}
+class Enc_12212978 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{8-5} = Ii{3-0};
+  bits <2> Pt4;
+  let Inst{10-9} = Pt4{1-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_12665927 : OpcodeHexagon {
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Vdd32;
+  let Inst{7-3} = Vdd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_2082956 : OpcodeHexagon {
+  bits <32> Ii;
+  let Inst{27-16} = Ii{31-20};
+  let Inst{13-0} = Ii{19-6};
+}
+class Enc_220949 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+  bits <5> n1;
+  let Inst{28-28} = n1{4-4};
+  let Inst{25-23} = n1{3-1};
+  let Inst{13-13} = n1{0-0};
+}
+class Enc_9939385 : OpcodeHexagon {
+  bits <9> Ii;
+  let Inst{12-8} = Ii{8-4};
+  let Inst{4-3} = Ii{3-2};
+  bits <10> II;
+  let Inst{20-16} = II{9-5};
+  let Inst{7-5} = II{4-2};
+  let Inst{1-0} = II{1-0};
+}
+class Enc_2117024 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{12-8} = Ii{7-3};
+  let Inst{4-2} = Ii{2-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_8390029 : OpcodeHexagon {
+  bits <5> Vuu32;
+  let Inst{20-16} = Vuu32{4-0};
+  bits <5> Vv32;
+  let Inst{12-8} = Vv32{4-0};
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+}
+class Enc_10989558 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+}
+class Enc_5972412 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{20-16} = Vv32{4-0};
+  bits <5> Vxx32;
+  let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_12851489 : OpcodeHexagon {
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Vss32;
+  let Inst{7-3} = Vss32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9554661 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{12-7} = Ii{5-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_4202401 : OpcodeHexagon {
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6091631 : OpcodeHexagon {
+  bits <2> Qs4;
+  let Inst{9-8} = Qs4{1-0};
+  bits <2> Qt4;
+  let Inst{23-22} = Qt4{1-0};
+  bits <2> Qd4;
+  let Inst{1-0} = Qd4{1-0};
+}
+class Enc_10157519 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
+}
+class Enc_4835423 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{10-5} = Ii{5-0};
+  bits <2> Pt4;
+  let Inst{12-11} = Pt4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_14046916 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{13-13} = Ii{1-1};
+  let Inst{7-7} = Ii{0-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Ru32;
+  let Inst{12-8} = Ru32{4-0};
+  bits <5> Rt32;
+  let Inst{4-0} = Rt32{4-0};
+}
+class Enc_2921694 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
+}
+class Enc_8732960 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{12-8} = Ii{7-3};
+  let Inst{4-2} = Ii{2-0};
+}
+class Enc_5338033 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+  bits <5> n1;
+  let Inst{28-28} = n1{4-4};
+  let Inst{24-22} = n1{3-1};
+  let Inst{13-13} = n1{0-0};
+}
+class Enc_6956613 : OpcodeHexagon {
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_2153798 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vxx32;
+  let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_16210172 : OpcodeHexagon {
+  bits <3> Qt8;
+  let Inst{10-8} = Qt8{2-0};
+  bits <3> Qd8;
+  let Inst{5-3} = Qd8{2-0};
+}
+class Enc_5023792 : OpcodeHexagon {
+  bits <5> Vuu32;
+  let Inst{12-8} = Vuu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vdd32;
+  let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_1244745 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{13-13} = Ii{3-3};
+  let Inst{10-8} = Ii{2-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+}
+class Enc_10002182 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{26-25} = Ii{10-9};
+  let Inst{13-13} = Ii{8-8};
+  let Inst{7-0} = Ii{7-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+}
+class Enc_12492533 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{6-3} = Ii{3-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_1774350 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{17-16} = Ii{5-4};
+  let Inst{6-3} = Ii{3-0};
+  bits <2> Pv4;
+  let Inst{1-0} = Pv4{1-0};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+}
+class Enc_2703240 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{13-13} = Ii{3-3};
+  let Inst{10-8} = Ii{2-0};
+  bits <2> Qv4;
+  let Inst{12-11} = Qv4{1-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vs32;
+  let Inst{4-0} = Vs32{4-0};
+}
+class Enc_6975103 : OpcodeHexagon {
+  bits <2> Ps4;
+  let Inst{17-16} = Ps4{1-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
+}
+class Enc_9789480 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <5> Vv32;
+  let Inst{12-8} = Vv32{4-0};
+  bits <5> Vdd32;
+  let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_12244921 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{10-8} = Ii{2-0};
+  bits <3> Os8;
+  let Inst{2-0} = Os8{2-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_8674673 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <3> Ns8;
+  let Inst{18-16} = Ns8{2-0};
+  bits <5> n1;
+  let Inst{29-29} = n1{4-4};
+  let Inst{26-25} = n1{3-2};
+  let Inst{23-22} = n1{1-0};
+}
+class Enc_8514936 : OpcodeHexagon {
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_13455308 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{12-5} = Ii{7-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
+}
+class Enc_10188026 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{13-8} = Ii{5-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_3158657 : OpcodeHexagon {
+  bits <2> Pv4;
+  let Inst{12-11} = Pv4{1-0};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_10597934 : OpcodeHexagon {
+  bits <4> Rs16;
+  let Inst{7-4} = Rs16{3-0};
+  bits <4> Rd16;
+  let Inst{3-0} = Rd16{3-0};
+  bits <2> n1;
+  let Inst{9-8} = n1{1-0};
+}
+class Enc_10612292 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <2> Qx4;
+  let Inst{1-0} = Qx4{1-0};
+}
+class Enc_5178985 : OpcodeHexagon {
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <2> Pu4;
+  let Inst{6-5} = Pu4{1-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_3967902 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{12-7} = Ii{7-2};
+  bits <6> II;
+  let Inst{13-13} = II{5-5};
+  let Inst{4-0} = II{4-0};
+  bits <2> Pv4;
+  let Inst{6-5} = Pv4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+}
+class Enc_2462143 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{12-5} = Ii{7-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_9849208 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{12-7} = Ii{7-2};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{4-0} = Rt32{4-0};
+}
+class Enc_12618352 : OpcodeHexagon {
+  bits <5> Rtt32;
+  let Inst{20-16} = Rtt32{4-0};
+  bits <5> Vx32;
+  let Inst{7-3} = Vx32{4-0};
+}
+class Enc_7303598 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{13-13} = Ii{1-1};
+  let Inst{7-7} = Ii{0-0};
+  bits <6> II;
+  let Inst{11-8} = II{5-2};
+  let Inst{6-5} = II{1-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Ryy32;
+  let Inst{4-0} = Ryy32{4-0};
+}
+class Enc_13823098 : OpcodeHexagon {
+  bits <5> Gss32;
+  let Inst{20-16} = Gss32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_16388420 : OpcodeHexagon {
+  bits <2> Qs4;
+  let Inst{6-5} = Qs4{1-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Vvv32;
+  let Inst{12-8} = Vvv32{4-0};
+  bits <5> Vw32;
+  let Inst{4-0} = Vw32{4-0};
+}
+class Enc_8328140 : OpcodeHexagon {
+  bits <16> Ii;
+  let Inst{21-21} = Ii{15-15};
+  let Inst{13-8} = Ii{14-9};
+  let Inst{2-0} = Ii{8-6};
+  bits <5> Vdd32;
+  let Inst{7-3} = Vdd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_1793896 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{13-13} = Ii{1-1};
+  let Inst{7-7} = Ii{0-0};
+  bits <2> Pv4;
+  let Inst{6-5} = Pv4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_4944558 : OpcodeHexagon {
+  bits <2> Qu4;
+  let Inst{9-8} = Qu4{1-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vx32;
+  let Inst{4-0} = Vx32{4-0};
+}
+class Enc_13211717 : OpcodeHexagon {
+  bits <5> Vuu32;
+  let Inst{12-8} = Vuu32{4-0};
+  bits <5> Vvv32;
+  let Inst{20-16} = Vvv32{4-0};
+  bits <5> Vdd32;
+  let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_8170340 : OpcodeHexagon {
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vx32;
+  let Inst{7-3} = Vx32{4-0};
+  bits <3> Qdd8;
+  let Inst{2-0} = Qdd8{2-0};
+}
+class Enc_14071773 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_8605375 : OpcodeHexagon {
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_12711252 : OpcodeHexagon {
+  bits <2> Pv4;
+  let Inst{9-8} = Pv4{1-0};
+}
+class Enc_8202458 : OpcodeHexagon {
+  bits <2> Pu4;
+  let Inst{6-5} = Pu4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_8577055 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+  bits <5> n1;
+  let Inst{28-28} = n1{4-4};
+  let Inst{25-23} = n1{3-1};
+  let Inst{8-8} = n1{0-0};
+}
+class Enc_1409050 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rxx32;
+  let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_7466005 : OpcodeHexagon {
+  bits <5> Gs32;
+  let Inst{20-16} = Gs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_2380082 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{12-8} = Ii{4-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_10067774 : OpcodeHexagon {
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_11000933 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{13-13} = Ii{1-1};
+  let Inst{7-7} = Ii{0-0};
+  bits <2> Pv4;
+  let Inst{6-5} = Pv4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Ru32;
+  let Inst{12-8} = Ru32{4-0};
+  bits <3> Nt8;
+  let Inst{2-0} = Nt8{2-0};
+}
+class Enc_13201267 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{12-8} = Ii{4-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_1989309 : OpcodeHexagon {
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Vvv32;
+  let Inst{4-0} = Vvv32{4-0};
+}
+class Enc_9082775 : OpcodeHexagon {
+  bits <10> Ii;
+  let Inst{21-21} = Ii{9-9};
+  let Inst{13-5} = Ii{8-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_8065534 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{6-3} = Ii{3-0};
+  bits <2> Pv4;
+  let Inst{1-0} = Pv4{1-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_4631106 : OpcodeHexagon {
+  bits <2> Ps4;
+  let Inst{17-16} = Ps4{1-0};
+  bits <2> Pt4;
+  let Inst{9-8} = Pt4{1-0};
+  bits <2> Pu4;
+  let Inst{7-6} = Pu4{1-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
+}
+class Enc_11065510 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{6-3} = Ii{4-1};
+  bits <2> Pv4;
+  let Inst{1-0} = Pv4{1-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6673186 : OpcodeHexagon {
+  bits <13> Ii;
+  let Inst{26-25} = Ii{12-11};
+  let Inst{13-13} = Ii{10-10};
+  let Inst{7-0} = Ii{9-2};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+}
+class Enc_8498433 : OpcodeHexagon {
+  bits <2> Pv4;
+  let Inst{12-11} = Pv4{1-0};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <3> Os8;
+  let Inst{2-0} = Os8{2-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_4395009 : OpcodeHexagon {
+  bits <7> Ii;
+  bits <2> Pv4;
+  let Inst{12-11} = Pv4{1-0};
+  bits <5> Vs32;
+  let Inst{4-0} = Vs32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_10926598 : OpcodeHexagon {
+  bits <5> Vuu32;
+  let Inst{12-8} = Vuu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vxx32;
+  let Inst{7-3} = Vxx32{4-0};
+}
+class Enc_7606379 : OpcodeHexagon {
+  bits <2> Pu4;
+  let Inst{6-5} = Pu4{1-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_8131399 : OpcodeHexagon {
+  bits <6> II;
+  let Inst{5-0} = II{5-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <5> Re32;
+  let Inst{20-16} = Re32{4-0};
+}
+class Enc_11522288 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{12-5} = Ii{7-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rx32;
+  let Inst{4-0} = Rx32{4-0};
+}
+class Enc_114098 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{13-13} = Ii{1-1};
+  let Inst{5-5} = Ii{0-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_5654851 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{12-8} = Ii{4-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_12023037 : OpcodeHexagon {
+  bits <2> Ps4;
+  let Inst{6-5} = Ps4{1-0};
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+}
+class Enc_176263 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{9-4} = Ii{7-2};
+  bits <4> Rd16;
+  let Inst{3-0} = Rd16{3-0};
+}
+class Enc_6130414 : OpcodeHexagon {
+  bits <16> Ii;
+  let Inst{23-22} = Ii{15-14};
+  let Inst{13-0} = Ii{13-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_631197 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{13-8} = Ii{5-0};
+  bits <6> II;
+  let Inst{23-21} = II{5-3};
+  let Inst{7-5} = II{2-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rxx32;
+  let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_16214129 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+}
+class Enc_8333157 : OpcodeHexagon {
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_4834775 : OpcodeHexagon {
+  bits <6> II;
+  let Inst{13-8} = II{5-0};
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rd16;
+  let Inst{19-16} = Rd16{3-0};
+}
+class Enc_16601956 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_15946706 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{6-5} = Ii{1-0};
+  bits <3> Rdd8;
+  let Inst{2-0} = Rdd8{2-0};
+}
+class Enc_6923828 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{13-13} = Ii{3-3};
+  let Inst{10-8} = Ii{2-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vs32;
+  let Inst{4-0} = Vs32{4-0};
+}
+class Enc_1332717 : OpcodeHexagon {
+  bits <2> Pu4;
+  let Inst{6-5} = Pu4{1-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_1786883 : OpcodeHexagon {
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <6> Sdd64;
+  let Inst{5-0} = Sdd64{5-0};
+}
+class Enc_14303394 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{8-5} = Ii{5-2};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_9282127 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{12-7} = Ii{7-2};
+  bits <8> II;
+  let Inst{13-13} = II{7-7};
+  let Inst{6-0} = II{6-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+}
+class Enc_2813446 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{6-3} = Ii{3-0};
+  bits <2> Pv4;
+  let Inst{1-0} = Pv4{1-0};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_364753 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <3> Ns8;
+  let Inst{18-16} = Ns8{2-0};
+  bits <4> n1;
+  let Inst{29-29} = n1{3-3};
+  let Inst{26-25} = n1{2-1};
+  let Inst{23-23} = n1{0-0};
+}
+class Enc_12477789 : OpcodeHexagon {
+  bits <15> Ii;
+  let Inst{21-21} = Ii{14-14};
+  let Inst{13-13} = Ii{13-13};
+  let Inst{11-1} = Ii{12-2};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+}
+class Enc_44555 : OpcodeHexagon {
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+}
+class Enc_8497723 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{13-8} = Ii{5-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rxx32;
+  let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_4359901 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <3> Ns8;
+  let Inst{18-16} = Ns8{2-0};
+  bits <4> n1;
+  let Inst{29-29} = n1{3-3};
+  let Inst{26-25} = n1{2-1};
+  let Inst{22-22} = n1{0-0};
+}
+class Enc_11271630 : OpcodeHexagon {
+  bits <7> Ii;
+  let Inst{6-3} = Ii{6-3};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_10501894 : OpcodeHexagon {
+  bits <4> Rs16;
+  let Inst{7-4} = Rs16{3-0};
+  bits <3> Rdd8;
+  let Inst{2-0} = Rdd8{2-0};
+}
+class Enc_9768377 : OpcodeHexagon {
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vd32;
+  let Inst{4-0} = Vd32{4-0};
+}
+class Enc_16268019 : OpcodeHexagon {
+  bits <5> Vuu32;
+  let Inst{20-16} = Vuu32{4-0};
+  bits <5> Vvv32;
+  let Inst{12-8} = Vvv32{4-0};
+  bits <5> Vdd32;
+  let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_8814718 : OpcodeHexagon {
+  bits <18> Ii;
+  let Inst{26-25} = Ii{17-16};
+  let Inst{20-16} = Ii{15-11};
+  let Inst{13-5} = Ii{10-2};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_6212930 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{8-5} = Ii{5-2};
+  bits <2> Pt4;
+  let Inst{10-9} = Pt4{1-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_5462762 : OpcodeHexagon {
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Vv32;
+  let Inst{12-8} = Vv32{4-0};
+  bits <5> Vw32;
+  let Inst{4-0} = Vw32{4-0};
+}
+class Enc_6154421 : OpcodeHexagon {
+  bits <7> Ii;
+  let Inst{13-13} = Ii{6-6};
+  let Inst{7-3} = Ii{5-1};
+  bits <2> Pv4;
+  let Inst{1-0} = Pv4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+}
+class Enc_8940892 : OpcodeHexagon {
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_3531000 : OpcodeHexagon {
+  bits <7> Ii;
+  let Inst{11-5} = Ii{6-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
+}
+class Enc_14311138 : OpcodeHexagon {
+  bits <5> Vuu32;
+  let Inst{20-16} = Vuu32{4-0};
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+}
+class Enc_2216485 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{22-21} = Ii{5-4};
+  let Inst{13-13} = Ii{3-3};
+  let Inst{7-5} = Ii{2-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_12395768 : OpcodeHexagon {
+  bits <16> Ii;
+  let Inst{26-25} = Ii{15-14};
+  let Inst{20-16} = Ii{13-9};
+  let Inst{13-13} = Ii{8-8};
+  let Inst{7-0} = Ii{7-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+}
+class Enc_11047413 : OpcodeHexagon {
+  bits <6> II;
+  let Inst{11-8} = II{5-2};
+  let Inst{6-5} = II{1-0};
+  bits <5> Ryy32;
+  let Inst{4-0} = Ryy32{4-0};
+  bits <5> Re32;
+  let Inst{20-16} = Re32{4-0};
+}
+class Enc_1256611 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_7884306 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{8-4} = Ii{7-3};
+}
+class Enc_11244923 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{10-8} = Ii{2-0};
+  bits <3> Os8;
+  let Inst{2-0} = Os8{2-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_8612939 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <3> Ns8;
+  let Inst{18-16} = Ns8{2-0};
+  bits <5> n1;
+  let Inst{29-29} = n1{4-4};
+  let Inst{26-25} = n1{3-2};
+  let Inst{22-22} = n1{1-1};
+  let Inst{13-13} = n1{0-0};
+}
+class Enc_16355964 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{12-5} = Ii{7-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_12616482 : OpcodeHexagon {
+  bits <6> II;
+  let Inst{11-8} = II{5-2};
+  let Inst{6-5} = II{1-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+  bits <5> Re32;
+  let Inst{20-16} = Re32{4-0};
+}
+class Enc_5915771 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+  bits <5> n1;
+  let Inst{28-28} = n1{4-4};
+  let Inst{24-22} = n1{3-1};
+  let Inst{8-8} = n1{0-0};
+}
+class Enc_14459927 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{10-8} = Ii{2-0};
+  bits <2> Pv4;
+  let Inst{12-11} = Pv4{1-0};
+  bits <5> Vs32;
+  let Inst{4-0} = Vs32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_7504828 : OpcodeHexagon {
+  bits <10> Ii;
+  let Inst{21-21} = Ii{9-9};
+  let Inst{13-5} = Ii{8-0};
+  bits <5> Ru32;
+  let Inst{4-0} = Ru32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_14209223 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <5> Vdd32;
+  let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_3931661 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{8-5} = Ii{5-2};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_13606251 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{11-8} = Ii{5-2};
+  bits <4> Rs16;
+  let Inst{7-4} = Rs16{3-0};
+  bits <4> Rd16;
+  let Inst{3-0} = Rd16{3-0};
+}
+class Enc_11475992 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vdd32;
+  let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_13133231 : OpcodeHexagon {
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_9959498 : OpcodeHexagon {
+  bits <8> Ii;
+  let Inst{22-21} = Ii{7-6};
+  let Inst{13-13} = Ii{5-5};
+  let Inst{7-5} = Ii{4-2};
+  bits <5> Ru32;
+  let Inst{4-0} = Ru32{4-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rd32;
+  let Inst{12-8} = Rd32{4-0};
+}
+class Enc_8919369 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+  bits <5> n1;
+  let Inst{28-28} = n1{4-4};
+  let Inst{24-23} = n1{3-2};
+  let Inst{13-13} = n1{1-1};
+  let Inst{8-8} = n1{0-0};
+}
+class Enc_2968094 : OpcodeHexagon {
+  bits <7> Ii;
+  let Inst{11-5} = Ii{6-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
+}
+class Enc_4813442 : OpcodeHexagon {
+  bits <6> Ii;
+  let Inst{6-3} = Ii{5-2};
+  bits <2> Pv4;
+  let Inst{1-0} = Pv4{1-0};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_4684887 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <4> Rs16;
+  let Inst{19-16} = Rs16{3-0};
+  bits <4> n1;
+  let Inst{28-28} = n1{3-3};
+  let Inst{25-23} = n1{2-0};
+}
+class Enc_15606259 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{11-8} = Ii{3-0};
+  bits <4> Rs16;
+  let Inst{7-4} = Rs16{3-0};
+  bits <4> Rd16;
+  let Inst{3-0} = Rd16{3-0};
+}
+class Enc_2268028 : OpcodeHexagon {
+  bits <3> Qtt8;
+  let Inst{10-8} = Qtt8{2-0};
+  bits <3> Qdd8;
+  let Inst{5-3} = Qdd8{2-0};
+}
+class Enc_13430430 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Rt32;
+  let Inst{20-16} = Rt32{4-0};
+  bits <5> Vd32;
+  let Inst{7-3} = Vd32{4-0};
+  bits <3> Qxx8;
+  let Inst{2-0} = Qxx8{2-0};
+}
+class Enc_13336212 : OpcodeHexagon {
+  bits <4> Rd16;
+  let Inst{3-0} = Rd16{3-0};
+  bits <1> n1;
+  let Inst{9-9} = n1{0-0};
+}
+class Enc_15008287 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{20-16} = Vu32{4-0};
+  bits <3> Rt8;
+  let Inst{2-0} = Rt8{2-0};
+  bits <5> Vx32;
+  let Inst{7-3} = Vx32{4-0};
+  bits <5> Vy32;
+  let Inst{12-8} = Vy32{4-0};
+}
+class Enc_4897205 : OpcodeHexagon {
+  bits <2> Qs4;
+  let Inst{9-8} = Qs4{1-0};
+  bits <2> Qd4;
+  let Inst{1-0} = Qd4{1-0};
+}
+class Enc_8038806 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{11-8} = Ii{3-0};
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_12669374 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vxx32;
+  let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_971347 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{8-5} = Ii{3-0};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Ryy32;
+  let Inst{4-0} = Ryy32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_1997594 : OpcodeHexagon {
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Rdd32;
+  let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_11940513 : OpcodeHexagon {
+  bits <2> Ii;
+  let Inst{13-13} = Ii{1-1};
+  let Inst{7-7} = Ii{0-0};
+  bits <2> Pv4;
+  let Inst{6-5} = Pv4{1-0};
+  bits <5> Rs32;
+  let Inst{20-16} = Rs32{4-0};
+  bits <5> Ru32;
+  let Inst{12-8} = Ru32{4-0};
+  bits <5> Rt32;
+  let Inst{4-0} = Rt32{4-0};
+}
+class Enc_2735552 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{10-8} = Ii{2-0};
+  bits <2> Pv4;
+  let Inst{12-11} = Pv4{1-0};
+  bits <3> Os8;
+  let Inst{2-0} = Os8{2-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_16410950 : OpcodeHexagon {
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <5> Vs32;
+  let Inst{7-3} = Vs32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6226085 : OpcodeHexagon {
+  bits <5> Ii;
+  let Inst{12-8} = Ii{4-0};
+  bits <5> II;
+  let Inst{22-21} = II{4-3};
+  let Inst{7-5} = II{2-0};
+  bits <5> Rd32;
+  let Inst{4-0} = Rd32{4-0};
+}
+class Enc_14193700 : OpcodeHexagon {
+  bits <6> II;
+  let Inst{5-0} = II{5-0};
+  bits <3> Nt8;
+  let Inst{10-8} = Nt8{2-0};
+  bits <5> Re32;
+  let Inst{20-16} = Re32{4-0};
+}
+class Enc_15763937 : OpcodeHexagon {
+  bits <11> Ii;
+  let Inst{21-20} = Ii{10-9};
+  let Inst{7-1} = Ii{8-2};
+  bits <3> Ns8;
+  let Inst{18-16} = Ns8{2-0};
+  bits <6> n1;
+  let Inst{29-29} = n1{5-5};
+  let Inst{26-25} = n1{4-3};
+  let Inst{23-22} = n1{2-1};
+  let Inst{13-13} = n1{0-0};
+}
+class Enc_2492727 : OpcodeHexagon {
+  bits <5> Rss32;
+  let Inst{20-16} = Rss32{4-0};
+  bits <5> Rt32;
+  let Inst{12-8} = Rt32{4-0};
+  bits <2> Pd4;
+  let Inst{1-0} = Pd4{1-0};
+}
+class Enc_13425035 : OpcodeHexagon {
+  bits <2> Qv4;
+  let Inst{12-11} = Qv4{1-0};
+  bits <1> Mu2;
+  let Inst{13-13} = Mu2{0-0};
+  bits <5> Vs32;
+  let Inst{4-0} = Vs32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_4135257 : OpcodeHexagon {
+  bits <4> Ii;
+  let Inst{10-8} = Ii{3-1};
+  bits <4> Rs16;
+  let Inst{7-4} = Rs16{3-0};
+  bits <4> Rd16;
+  let Inst{3-0} = Rd16{3-0};
+}
+class Enc_14631806 : OpcodeHexagon {
+  bits <5> Vu32;
+  let Inst{12-8} = Vu32{4-0};
+  bits <5> Vdd32;
+  let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_12397062 : OpcodeHexagon {
+  bits <3> Ii;
+  let Inst{10-8} = Ii{2-0};
+  bits <2> Qv4;
+  let Inst{12-11} = Qv4{1-0};
+  bits <5> Vs32;
+  let Inst{4-0} = Vs32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
+class Enc_11959851 : OpcodeHexagon {
+  bits <7> Ii;
+  let Inst{6-3} = Ii{6-3};
+  bits <2> Pv4;
+  let Inst{1-0} = Pv4{1-0};
+  bits <5> Rtt32;
+  let Inst{12-8} = Rtt32{4-0};
+  bits <5> Rx32;
+  let Inst{20-16} = Rx32{4-0};
+}
diff --git a/lib/Target/Hexagon/HexagonDepInstrInfo.td b/lib/Target/Hexagon/HexagonDepInstrInfo.td
new file mode 100644
index 0000000000000000000000000000000000000000..2bfde9acaea9cd7f426ea4775ce645ade49753dd
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepInstrInfo.td
@@ -0,0 +1,45573 @@
+//===--- HexagonDepInstrInfo.td -------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def A2_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = abs($Rs32)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_absp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = abs($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10000000100;
+}
+def A2_abssat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = abs($Rs32):sat",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_add : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = add($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_14071773, PredNewRel, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_add";
+let InputType = "reg";
+let BaseOpcode = "A2_add";
+let isCommutable = 1;
+let isPredicable = 1;
+}
+def A2_addh_h16_hh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.h,$Rs32.h):<<16",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_addh_h16_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.h,$Rs32.l):<<16",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_addh_h16_lh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.h):<<16",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_addh_h16_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.l):<<16",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_addh_h16_sat_hh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.h,$Rs32.h):sat:<<16",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_addh_h16_sat_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.h,$Rs32.l):sat:<<16",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_addh_h16_sat_lh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.h):sat:<<16",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_addh_h16_sat_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.l):sat:<<16",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_addh_l16_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.h)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_addh_l16_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.l)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_addh_l16_sat_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.h):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_addh_l16_sat_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = add($Rt32.l,$Rs32.l):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_addi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = add($Rs32,#$Ii)",
+ALU32_ADDI_tc_1_SLOT0123, TypeALU32_ADDI>, Enc_11542684, PredNewRel, ImmRegRel {
+let Inst{31-28} = 0b1011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_add";
+let InputType = "imm";
+let BaseOpcode = "A2_addi";
+let isPredicable = 1;
+let isAdd = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def A2_addp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = add($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+let isCommutable = 1;
+let isAdd = 1;
+}
+def A2_addpsat : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = add($Rss32,$Rtt32):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+let Defs = [USR_OVF];
+let isCommutable = 1;
+}
+def A2_addsat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = add($Rs32,$Rt32):sat",
+ALU32_3op_tc_2_SLOT0123, TypeALU32_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A2_addsp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"$Rdd32 = add($Rs32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64> {
+let isPseudo = 1;
+}
+def A2_addsph : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = add($Rss32,$Rtt32):raw:hi",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+}
+def A2_addspl : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = add($Rss32,$Rtt32):raw:lo",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+}
+def A2_and : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = and($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_14071773, PredNewRel, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_and";
+let InputType = "reg";
+let BaseOpcode = "A2_and";
+let isCommutable = 1;
+let isPredicable = 1;
+}
+def A2_andir : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = and($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_13472494, ImmRegRel {
+let Inst{31-22} = 0b0111011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_and";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def A2_andp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = and($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011111;
+let isCommutable = 1;
+}
+def A2_aslh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = aslh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_4075554, PredNewRel {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01110000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_aslh";
+let isPredicable = 1;
+}
+def A2_asrh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = asrh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_4075554, PredNewRel {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01110000001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_asrh";
+let isPredicable = 1;
+}
+def A2_combine_hh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = combine($Rt32.h,$Rs32.h)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A2_combine_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = combine($Rt32.h,$Rs32.l)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A2_combine_lh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = combine($Rt32.l,$Rs32.h)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A2_combine_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = combine($Rt32.l,$Rs32.l)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A2_combineii : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins s32_0Imm:$Ii, s8_0Imm:$II),
+"$Rdd32 = combine(#$Ii,#$II)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_14007201 {
+let Inst{31-23} = 0b011111000;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+let isMoveImm = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A2_combinew : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = combine($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_1997594, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110101000;
+let InputType = "reg";
+let BaseOpcode = "A2_combinew";
+let isPredicable = 1;
+}
+def A2_max : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = max($Rs32,$Rt32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101110;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_maxp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = max($Rss32,$Rtt32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+}
+def A2_maxu : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = maxu($Rs32,$Rt32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101110;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_maxup : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = maxu($Rss32,$Rtt32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+}
+def A2_min : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = min($Rt32,$Rs32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_minp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = min($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+}
+def A2_minu : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = minu($Rt32,$Rs32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_minup : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = minu($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+}
+def A2_neg : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = neg($Rs32)",
+PSEUDO, TypeALU32_2op> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_negp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = neg($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10000000100;
+}
+def A2_negsat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = neg($Rs32):sat",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_nop : HInst<
+(outs),
+(ins),
+"nop",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_0 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-16} = 0b0111111100000000;
+}
+def A2_not : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = not($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_notp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = not($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000000100;
+}
+def A2_or : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = or($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_14071773, PredNewRel, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_or";
+let InputType = "reg";
+let BaseOpcode = "A2_or";
+let isCommutable = 1;
+let isPredicable = 1;
+}
+def A2_orir : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = or($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_13472494, ImmRegRel {
+let Inst{31-22} = 0b0111011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_or";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def A2_orp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = or($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011111;
+let isCommutable = 1;
+}
+def A2_paddf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4) $Rd32 = add($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel, ImmRegRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_add";
+let InputType = "reg";
+let BaseOpcode = "A2_add";
+}
+def A2_paddfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4.new) $Rd32 = add($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel, ImmRegRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_add";
+let InputType = "reg";
+let BaseOpcode = "A2_add";
+}
+def A2_paddif : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
+"if (!$Pu4) $Rd32 = add($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_10568534, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b011101001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_add";
+let InputType = "imm";
+let BaseOpcode = "A2_addi";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A2_paddifnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
+"if (!$Pu4.new) $Rd32 = add($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_10568534, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b1;
+let Inst{31-23} = 0b011101001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_add";
+let InputType = "imm";
+let BaseOpcode = "A2_addi";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A2_paddit : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
+"if ($Pu4) $Rd32 = add($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_10568534, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b011101000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_add";
+let InputType = "imm";
+let BaseOpcode = "A2_addi";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A2_padditnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
+"if ($Pu4.new) $Rd32 = add($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_10568534, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b1;
+let Inst{31-23} = 0b011101000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_add";
+let InputType = "imm";
+let BaseOpcode = "A2_addi";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A2_paddt : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4) $Rd32 = add($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel, ImmRegRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111011000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_add";
+let InputType = "reg";
+let BaseOpcode = "A2_add";
+}
+def A2_paddtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4.new) $Rd32 = add($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel, ImmRegRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111011000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_add";
+let InputType = "reg";
+let BaseOpcode = "A2_add";
+}
+def A2_pandf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4) $Rd32 = and($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111001000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_and";
+}
+def A2_pandfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4.new) $Rd32 = and($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111001000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_and";
+}
+def A2_pandt : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4) $Rd32 = and($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111001000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_and";
+}
+def A2_pandtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4.new) $Rd32 = and($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111001000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_and";
+}
+def A2_porf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4) $Rd32 = or($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_or";
+}
+def A2_porfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4.new) $Rd32 = or($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_or";
+}
+def A2_port : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4) $Rd32 = or($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111001001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_or";
+}
+def A2_portnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4.new) $Rd32 = or($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111001001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_or";
+}
+def A2_psubf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = sub($Rt32,$Rs32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_1332717, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111011001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sub";
+}
+def A2_psubfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = sub($Rt32,$Rs32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_1332717, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111011001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_sub";
+}
+def A2_psubt : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = sub($Rt32,$Rs32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_1332717, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111011001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sub";
+}
+def A2_psubtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = sub($Rt32,$Rs32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_1332717, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111011001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_sub";
+}
+def A2_pxorf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4) $Rd32 = xor($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111001011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_xor";
+}
+def A2_pxorfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4.new) $Rd32 = xor($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111001011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_xor";
+}
+def A2_pxort : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4) $Rd32 = xor($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111001011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_xor";
+}
+def A2_pxortnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4.new) $Rd32 = xor($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111001011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_xor";
+}
+def A2_roundsat : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = round($Rss32):sat",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = sat($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001000110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_satb : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = satb($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_sath : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = sath($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_satub : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = satub($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_satuh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = satuh($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_sub : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32,$Rs32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8605375, PredNewRel, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_sub";
+let InputType = "reg";
+let BaseOpcode = "A2_sub";
+let isPredicable = 1;
+}
+def A2_subh_h16_hh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.h,$Rs32.h):<<16",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_subh_h16_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.h,$Rs32.l):<<16",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_subh_h16_lh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.h):<<16",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_subh_h16_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.l):<<16",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_subh_h16_sat_hh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.h,$Rs32.h):sat:<<16",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_subh_h16_sat_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.h,$Rs32.l):sat:<<16",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_subh_h16_sat_lh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.h):sat:<<16",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_subh_h16_sat_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.l):sat:<<16",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_subh_l16_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.h)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_subh_l16_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.l)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_subh_l16_sat_hl : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.h):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_subh_l16_sat_ll : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32.l,$Rs32.l):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def A2_subp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = sub($Rtt32,$Rss32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+}
+def A2_subri : HInst<
+(outs IntRegs:$Rd32),
+(ins s32_0Imm:$Ii, IntRegs:$Rs32),
+"$Rd32 = sub(#$Ii,$Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_13472494, PredNewRel, ImmRegRel {
+let Inst{31-22} = 0b0111011001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_sub";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def A2_subsat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32,$Rs32):sat",
+ALU32_3op_tc_2_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+let InputType = "reg";
+}
+def A2_svaddh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vaddh($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A2_svaddhs : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vaddh($Rs32,$Rt32):sat",
+ALU32_3op_tc_2_SLOT0123, TypeALU32_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A2_svadduhs : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vadduh($Rs32,$Rt32):sat",
+ALU32_3op_tc_2_SLOT0123, TypeALU32_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A2_svavgh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vavgh($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A2_svavghs : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vavgh($Rs32,$Rt32):rnd",
+ALU32_3op_tc_2_SLOT0123, TypeALU32_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A2_svnavgh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = vnavgh($Rt32,$Rs32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A2_svsubh : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = vsubh($Rt32,$Rs32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A2_svsubhs : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = vsubh($Rt32,$Rs32):sat",
+ALU32_3op_tc_2_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+let InputType = "reg";
+}
+def A2_svsubuhs : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = vsubuh($Rt32,$Rs32):sat",
+ALU32_3op_tc_2_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+let InputType = "reg";
+}
+def A2_swiz : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = swiz($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_sxtb : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = sxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_4075554, PredNewRel {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01110000101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sxtb";
+let isPredicable = 1;
+}
+def A2_sxth : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = sxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_4075554, PredNewRel {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01110000111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sxth";
+let isPredicable = 1;
+}
+def A2_sxtw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = sxtw($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4030179 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10000100010;
+}
+def A2_tfr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = $Rs32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_4075554, PredNewRel {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01110000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+let BaseOpcode = "A2_tfr";
+let isPredicable = 1;
+}
+def A2_tfrcrr : HInst<
+(outs IntRegs:$Rd32),
+(ins CtrRegs:$Cs32),
+"$Rd32 = $Cs32",
+CR_tc_3x_SLOT3, TypeCR>, Enc_1539665 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01101010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_tfrf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = $Rs32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel, ImmRegRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_tfr";
+let InputType = "reg";
+let BaseOpcode = "A2_tfr";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_tfrfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = $Rs32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel, ImmRegRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_tfr";
+let InputType = "reg";
+let BaseOpcode = "A2_tfr";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_tfrih : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, u16_0Imm:$Ii),
+"$Rx32.h = #$Ii",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_6130414 {
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def A2_tfril : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, u16_0Imm:$Ii),
+"$Rx32.l = #$Ii",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_6130414 {
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def A2_tfrp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = $Rss32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel {
+let BaseOpcode = "A2_tfrp";
+let isPredicable = 1;
+let isPseudo = 1;
+}
+def A2_tfrpf : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, DoubleRegs:$Rss32),
+"if (!$Pu4) $Rdd32 = $Rss32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let BaseOpcode = "A2_tfrp";
+let isPseudo = 1;
+}
+def A2_tfrpfnew : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, DoubleRegs:$Rss32),
+"if (!$Pu4.new) $Rdd32 = $Rss32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_tfrp";
+let isPseudo = 1;
+}
+def A2_tfrpi : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins s8_0Imm:$Ii),
+"$Rdd32 = #$Ii",
+ALU64_tc_1_SLOT23, TypeALU64> {
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+let isMoveImm = 1;
+let isPseudo = 1;
+}
+def A2_tfrpt : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, DoubleRegs:$Rss32),
+"if ($Pu4) $Rdd32 = $Rss32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel {
+let isPredicated = 1;
+let BaseOpcode = "A2_tfrp";
+let isPseudo = 1;
+}
+def A2_tfrptnew : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, DoubleRegs:$Rss32),
+"if ($Pu4.new) $Rdd32 = $Rss32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel {
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_tfrp";
+let isPseudo = 1;
+}
+def A2_tfrrcr : HInst<
+(outs CtrRegs:$Cd32),
+(ins IntRegs:$Rs32),
+"$Cd32 = $Rs32",
+CR_tc_3x_SLOT3, TypeCR>, Enc_9018141 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01100010001;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A2_tfrsi : HInst<
+(outs IntRegs:$Rd32),
+(ins s32_0Imm:$Ii),
+"$Rd32 = #$Ii",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_7971062, PredNewRel, ImmRegRel {
+let Inst{21-21} = 0b0;
+let Inst{31-24} = 0b01111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_tfr";
+let InputType = "imm";
+let BaseOpcode = "A2_tfrsi";
+let isPredicable = 1;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+let isMoveImm = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def A2_tfrt : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = $Rs32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel, ImmRegRel {
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_tfr";
+let InputType = "reg";
+let BaseOpcode = "A2_tfr";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_tfrtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = $Rs32",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel, ImmRegRel {
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_tfr";
+let InputType = "reg";
+let BaseOpcode = "A2_tfr";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_vabsh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vabsh($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000000010;
+}
+def A2_vabshsat : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vabsh($Rss32):sat",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10000000010;
+let Defs = [USR_OVF];
+}
+def A2_vabsw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vabsw($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10000000010;
+}
+def A2_vabswsat : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vabsw($Rss32):sat",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10000000010;
+let Defs = [USR_OVF];
+}
+def A2_vaddb_map : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddb($Rss32,$Rtt32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_vaddh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddh($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+}
+def A2_vaddhs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddh($Rss32,$Rtt32):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+let Defs = [USR_OVF];
+}
+def A2_vaddub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddub($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 { // non-saturating add: same itinerary as A2_vaddh / A2_vaddw / A2_vsubub; tc_2 is used by the :sat forms
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+}
+def A2_vaddubs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddub($Rss32,$Rtt32):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+let Defs = [USR_OVF];
+}
+def A2_vadduhs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vadduh($Rss32,$Rtt32):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+let Defs = [USR_OVF];
+}
+def A2_vaddw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddw($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+}
+def A2_vaddws : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vaddw($Rss32,$Rtt32):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011000;
+let Defs = [USR_OVF];
+}
+def A2_vavgh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgh($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+}
+def A2_vavghcr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgh($Rss32,$Rtt32):crnd",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+let prefersSlot3 = 1;
+}
+def A2_vavghr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgh($Rss32,$Rtt32):rnd",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+}
+def A2_vavgub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgub($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+}
+def A2_vavgubr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgub($Rss32,$Rtt32):rnd",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+}
+def A2_vavguh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavguh($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+}
+def A2_vavguhr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavguh($Rss32,$Rtt32):rnd",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011010;
+}
+def A2_vavguw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavguw($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+}
+def A2_vavguwr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavguw($Rss32,$Rtt32):rnd",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+}
+def A2_vavgw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgw($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+}
+def A2_vavgwcr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgw($Rss32,$Rtt32):crnd",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+let prefersSlot3 = 1;
+}
+def A2_vavgwr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vavgw($Rss32,$Rtt32):rnd",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011011;
+}
+def A2_vcmpbeq : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmpb.eq($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b110000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmpbgtu : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmpb.gtu($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b111000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmpheq : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmph.eq($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b011000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmphgt : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmph.gt($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmphgtu : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmph.gtu($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b101000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmpweq : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmpw.eq($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmpwgt : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmpw.gt($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b001000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vcmpwgtu : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmpw.gtu($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b010000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010000;
+}
+def A2_vconj : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vconj($Rss32):sat",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10000000100;
+let Defs = [USR_OVF];
+}
+def A2_vmaxb : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vmaxb($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+}
+def A2_vmaxh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vmaxh($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+}
+def A2_vmaxub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vmaxub($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+}
+def A2_vmaxuh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vmaxuh($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+}
+def A2_vmaxuw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vmaxuw($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+}
+def A2_vmaxw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vmaxw($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+}
+def A2_vminb : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vminb($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011110;
+}
+def A2_vminh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vminh($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+}
+def A2_vminub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vminub($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+}
+def A2_vminuh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vminuh($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+}
+def A2_vminuw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vminuw($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+}
+def A2_vminw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vminw($Rtt32,$Rss32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011101;
+}
+def A2_vnavgh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vnavgh($Rtt32,$Rss32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011100;
+}
+def A2_vnavghcr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vnavgh($Rtt32,$Rss32):crnd:sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vnavghr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vnavgh($Rtt32,$Rss32):rnd:sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vnavgw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vnavgw($Rtt32,$Rss32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011100;
+}
+def A2_vnavgwcr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vnavgw($Rtt32,$Rss32):crnd:sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vnavgwr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vnavgw($Rtt32,$Rss32):rnd:sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A2_vraddub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vraddub($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+}
+def A2_vraddub_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vraddub($Rss32,$Rtt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A2_vrsadub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrsadub($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+}
+def A2_vrsadub_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrsadub($Rss32,$Rtt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A2_vsubb_map : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vsubb($Rss32,$Rtt32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_vsubh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubh($Rtt32,$Rss32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+}
+def A2_vsubhs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubh($Rtt32,$Rss32):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+let Defs = [USR_OVF];
+}
+def A2_vsubub : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubub($Rtt32,$Rss32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+}
+def A2_vsububs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubub($Rtt32,$Rss32):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+let Defs = [USR_OVF];
+}
+def A2_vsubuhs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubuh($Rtt32,$Rss32):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+let Defs = [USR_OVF];
+}
+def A2_vsubw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubw($Rtt32,$Rss32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+}
+def A2_vsubws : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vsubw($Rtt32,$Rss32):sat",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011001;
+let Defs = [USR_OVF];
+}
+def A2_xor : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = xor($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_14071773, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+let BaseOpcode = "A2_xor";
+let isCommutable = 1;
+let isPredicable = 1;
+}
+def A2_xorp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = xor($Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011111;
+let isCommutable = 1;
+}
+def A2_zxtb : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = zxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, PredNewRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_zxtb";
+let isPredicable = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def A2_zxth : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = zxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_4075554, PredNewRel {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01110000110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_zxth";
+let isPredicable = 1;
+}
+def A4_addp_c : HInst<
+(outs DoubleRegs:$Rdd32, PredRegs:$Px4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, PredRegs:$Px4in),
+"$Rdd32 = add($Rss32,$Rtt32,$Px4):carry",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_151014 {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000010110;
+let isPredicateLate = 1;
+let Constraints = "$Px4 = $Px4in";
+}
+def A4_andn : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = and($Rt32,~$Rs32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A4_andnp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = and($Rtt32,~$Rss32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011111;
+}
+def A4_bitsplit : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = bitsplit($Rs32,$Rt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010100001;
+}
+def A4_bitspliti : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rdd32 = bitsplit($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_5654851 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001000110;
+}
+def A4_boundscheck : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"$Pd4 = boundscheck($Rs32,$Rtt32)",
+M_tc_3x_SLOT23, TypeALU64> {
+let isPseudo = 1;
+}
+def A4_boundscheck_hi : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = boundscheck($Rss32,$Rtt32):raw:hi",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b101000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11010010000;
+}
+def A4_boundscheck_lo : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = boundscheck($Rss32,$Rtt32):raw:lo",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11010010000;
+}
+def A4_cmpbeq : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmpb.eq($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b110000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111110;
+let CextOpcode = "A4_cmpbeq";
+let InputType = "reg";
+let isCommutable = 1;
+let isCompare = 1;
+}
+def A4_cmpbeqi : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u8_0Imm:$Ii),
+"$Pd4 = cmpb.eq($Rs32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_6736678, ImmRegRel {
+let Inst{4-2} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011101000;
+let CextOpcode = "A4_cmpbeq";
+let InputType = "imm";
+let isCommutable = 1;
+let isCompare = 1;
+}
+def A4_cmpbgt : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmpb.gt($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b010000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111110;
+let CextOpcode = "A4_cmpbgt";
+let InputType = "reg";
+let isCompare = 1;
+}
+def A4_cmpbgti : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s8_0Imm:$Ii),
+"$Pd4 = cmpb.gt($Rs32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_6736678, ImmRegRel {
+let Inst{4-2} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011101001;
+let CextOpcode = "A4_cmpbgt";
+let InputType = "imm";
+let isCompare = 1;
+}
+def A4_cmpbgtu : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmpb.gtu($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b111000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111110;
+let CextOpcode = "A4_cmpbgtu";
+let InputType = "reg";
+let isCompare = 1;
+}
+def A4_cmpbgtui : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Pd4 = cmpb.gtu($Rs32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3531000, ImmRegRel {
+let Inst{4-2} = 0b000;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b11011101010;
+let CextOpcode = "A4_cmpbgtu";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 0;
+}
+def A4_cmpheq : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmph.eq($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b011000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111110;
+let CextOpcode = "A4_cmpheq";
+let InputType = "reg";
+let isCommutable = 1;
+let isCompare = 1;
+}
+def A4_cmpheqi : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Pd4 = cmph.eq($Rs32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_6736678, ImmRegRel {
+let Inst{4-2} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011101000;
+let CextOpcode = "A4_cmpheq";
+let InputType = "imm";
+let isCommutable = 1;
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A4_cmphgt : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmph.gt($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111110;
+let CextOpcode = "A4_cmphgt";
+let InputType = "reg";
+let isCompare = 1;
+}
+def A4_cmphgti : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Pd4 = cmph.gt($Rs32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_6736678, ImmRegRel {
+let Inst{4-2} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011101001;
+let CextOpcode = "A4_cmphgt";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A4_cmphgtu : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmph.gtu($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b101000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111110;
+let CextOpcode = "A4_cmphgtu";
+let InputType = "reg";
+let isCompare = 1;
+}
+def A4_cmphgtui : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Pd4 = cmph.gtu($Rs32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3531000, ImmRegRel {
+let Inst{4-2} = 0b010;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b11011101010;
+let CextOpcode = "A4_cmphgtu";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 0;
+}
+def A4_combineii : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins s8_0Imm:$Ii, u32_0Imm:$II),
+"$Rdd32 = combine(#$Ii,#$II)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9864697 {
+let Inst{31-21} = 0b01111100100;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def A4_combineir : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins s32_0Imm:$Ii, IntRegs:$Rs32),
+"$Rdd32 = combine(#$Ii,$Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_2462143 {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b01110011001;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A4_combineri : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rdd32 = combine($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_2462143 {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b01110011000;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A4_cround_ri : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = cround($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A4_cround_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = cround($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A4_ext : HInst<
+(outs),
+(ins u26_6Imm:$Ii),
+"immext(#$Ii)",
+EXTENDER_tc_1_SLOT0123, TypeEXTENDER>, Enc_2082956 {
+let Inst{31-28} = 0b0000;
+}
+def A4_modwrapu : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = modwrap($Rs32,$Rt32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_14071773 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A4_orn : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = or($Rt32,~$Rs32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8605375 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def A4_ornp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = or($Rtt32,~$Rss32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_11687333 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010011111;
+}
+def A4_paslhf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = aslh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b01110000000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_aslh";
+}
+def A4_paslhfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = aslh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1011;
+let Inst{31-21} = 0b01110000000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_aslh";
+}
+def A4_paslht : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = aslh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b01110000000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_aslh";
+}
+def A4_paslhtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = aslh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b01110000000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_aslh";
+}
+def A4_pasrhf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = asrh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b01110000001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_asrh";
+}
+def A4_pasrhfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = asrh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1011;
+let Inst{31-21} = 0b01110000001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_asrh";
+}
+def A4_pasrht : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = asrh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b01110000001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_asrh";
+}
+def A4_pasrhtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = asrh($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b01110000001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_asrh";
+}
+def A4_psxtbf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = sxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b01110000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sxtb";
+}
+def A4_psxtbfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = sxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1011;
+let Inst{31-21} = 0b01110000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_sxtb";
+}
+def A4_psxtbt : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = sxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b01110000101;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sxtb";
+}
+def A4_psxtbtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = sxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b01110000101;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_sxtb";
+}
+def A4_psxthf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = sxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b01110000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sxth";
+}
+def A4_psxthfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = sxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1011;
+let Inst{31-21} = 0b01110000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_sxth";
+}
+def A4_psxtht : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = sxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b01110000111;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_sxth";
+}
+def A4_psxthtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = sxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b01110000111;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_sxth";
+}
+def A4_pzxtbf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = zxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b01110000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_zxtb";
+}
+def A4_pzxtbfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = zxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1011;
+let Inst{31-21} = 0b01110000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_zxtb";
+}
+def A4_pzxtbt : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = zxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b01110000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_zxtb";
+}
+def A4_pzxtbtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = zxtb($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b01110000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_zxtb";
+}
+def A4_pzxthf : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) $Rd32 = zxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b01110000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_zxth";
+}
+def A4_pzxthfnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) $Rd32 = zxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1011;
+let Inst{31-21} = 0b01110000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_zxth";
+}
+def A4_pzxtht : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) $Rd32 = zxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b01110000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let BaseOpcode = "A2_zxth";
+}
+def A4_pzxthtnew : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) $Rd32 = zxth($Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9422954, PredNewRel {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b01110000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_zxth";
+}
+def A4_rcmpeq : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = cmp.eq($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_14071773, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A4_rcmpeq";
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A4_rcmpeqi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = cmp.eq($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_16355964, ImmRegRel {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b01110011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A4_rcmpeqi";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A4_rcmpneq : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = !cmp.eq($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_14071773, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A4_rcmpneq";
+let InputType = "reg";
+let isCommutable = 1;
+}
+def A4_rcmpneqi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = !cmp.eq($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_16355964, ImmRegRel {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b01110011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A4_rcmpeqi";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def A4_round_ri : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = round($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A4_round_ri_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = round($Rs32,#$Ii):sat",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A4_round_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = round($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def A4_round_rr_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = round($Rs32,$Rt32):sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A4_subp_c : HInst<
+(outs DoubleRegs:$Rdd32, PredRegs:$Px4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, PredRegs:$Px4in),
+"$Rdd32 = sub($Rss32,$Rtt32,$Px4):carry",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_151014 {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000010111;
+let isPredicateLate = 1;
+let Constraints = "$Px4 = $Px4in";
+}
+def A4_tfrcpp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins CtrRegs64:$Css32),
+"$Rdd32 = $Css32",
+CR_tc_3x_SLOT3, TypeCR>, Enc_13094118 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01101000000;
+}
+def A4_tfrpcp : HInst<
+(outs CtrRegs64:$Cdd32),
+(ins DoubleRegs:$Rss32),
+"$Cdd32 = $Rss32",
+CR_tc_3x_SLOT3, TypeCR>, Enc_1329520 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b01100011001;
+}
+def A4_tlbmatch : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Pd4 = tlbmatch($Rss32,$Rt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_2492727 {
+let Inst{7-2} = 0b011000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11010010000;
+let isPredicateLate = 1;
+}
+def A4_vcmpbeq_any : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = any8(vcmpb.eq($Rss32,$Rtt32))",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11010010000;
+}
+def A4_vcmpbeqi : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, u8_0Imm:$Ii),
+"$Pd4 = vcmpb.eq($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_13455308 {
+let Inst{4-2} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011100000;
+}
+def A4_vcmpbgt : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = vcmpb.gt($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b010000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11010010000;
+}
+def A4_vcmpbgti : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
+"$Pd4 = vcmpb.gt($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_13455308 {
+let Inst{4-2} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011100001;
+}
+def A4_vcmpbgtui : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, u7_0Imm:$Ii),
+"$Pd4 = vcmpb.gtu($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_2968094 {
+let Inst{4-2} = 0b000;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b11011100010;
+}
+def A4_vcmpheqi : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
+"$Pd4 = vcmph.eq($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_13455308 {
+let Inst{4-2} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011100000;
+}
+def A4_vcmphgti : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
+"$Pd4 = vcmph.gt($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_13455308 {
+let Inst{4-2} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011100001;
+}
+def A4_vcmphgtui : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, u7_0Imm:$Ii),
+"$Pd4 = vcmph.gtu($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_2968094 {
+let Inst{4-2} = 0b010;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b11011100010;
+}
+def A4_vcmpweqi : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
+"$Pd4 = vcmpw.eq($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_13455308 {
+let Inst{4-2} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011100000;
+}
+def A4_vcmpwgti : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
+"$Pd4 = vcmpw.gt($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_13455308 {
+let Inst{4-2} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11011100001;
+}
+def A4_vcmpwgtui : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, u7_0Imm:$Ii),
+"$Pd4 = vcmpw.gtu($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_2968094 {
+let Inst{4-2} = 0b100;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b11011100010;
+}
+def A4_vrmaxh : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrmaxh($Rss32,$Ru32)",
+S_3op_tc_3_SLOT23, TypeS_3op>, Enc_9773189 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrmaxuh : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrmaxuh($Rss32,$Ru32)",
+S_3op_tc_3_SLOT23, TypeS_3op>, Enc_9773189 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrmaxuw : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrmaxuw($Rss32,$Ru32)",
+S_3op_tc_3_SLOT23, TypeS_3op>, Enc_9773189 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrmaxw : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrmaxw($Rss32,$Ru32)",
+S_3op_tc_3_SLOT23, TypeS_3op>, Enc_9773189 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrminh : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrminh($Rss32,$Ru32)",
+S_3op_tc_3_SLOT23, TypeS_3op>, Enc_9773189 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrminuh : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrminuh($Rss32,$Ru32)",
+S_3op_tc_3_SLOT23, TypeS_3op>, Enc_9773189 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrminuw : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrminuw($Rss32,$Ru32)",
+S_3op_tc_3_SLOT23, TypeS_3op>, Enc_9773189 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A4_vrminw : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
+"$Rxx32 = vrminw($Rss32,$Ru32)",
+S_3op_tc_3_SLOT23, TypeS_3op>, Enc_9773189 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A5_ACS : HInst<
+(outs DoubleRegs:$Rxx32, PredRegs:$Pe4),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32,$Pe4 = vacsh($Rss32,$Rtt32)",
+M_tc_3stall_SLOT23, TypeM>, Enc_12822813, Requires<[HasV55T]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010101;
+let isPredicateLate = 1;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def A5_vaddhubs : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vaddhub($Rss32,$Rtt32):sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9277990, Requires<[HasV5T]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def A6_vminub_RdP : HInst<
+(outs DoubleRegs:$Rdd32, PredRegs:$Pe4),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32,$Pe4 = vminub($Rtt32,$Rss32)",
+M_tc_2_SLOT23, TypeM>, Enc_766909, Requires<[HasV62T]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010111;
+let isPredicateLate = 1;
+let prefersSlot3 = 1;
+}
+def C2_all8 : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4),
+"$Pd4 = all8($Ps4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_6975103 {
+let Inst{13-2} = 0b000000000000;
+let Inst{31-18} = 0b01101011101000;
+}
+def C2_and : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Pt4, PredRegs:$Ps4),
+"$Pd4 = and($Pt4,$Ps4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_8891794 {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011000000;
+}
+def C2_andn : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Pt4, PredRegs:$Ps4),
+"$Pd4 = and($Pt4,!$Ps4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_8891794 {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011011000;
+}
+def C2_any8 : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4),
+"$Pd4 = any8($Ps4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_6975103 {
+let Inst{13-2} = 0b000000000000;
+let Inst{31-18} = 0b01101011100000;
+}
+def C2_bitsclr : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = bitsclr($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111100;
+}
+def C2_bitsclri : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u6_0Imm:$Ii),
+"$Pd4 = bitsclr($Rs32,#$Ii)",
+S_2op_tc_2early_SLOT23, TypeS_2op>, Enc_14574598 {
+let Inst{7-2} = 0b000000;
+let Inst{31-21} = 0b10000101100;
+}
+def C2_bitsset : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = bitsset($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111010;
+}
+def C2_ccombinewf : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4) $Rdd32 = combine($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8202458, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let BaseOpcode = "A2_combinew";
+}
+def C2_ccombinewnewf : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pu4.new) $Rdd32 = combine($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8202458, PredNewRel {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_combinew";
+}
+def C2_ccombinewnewt : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4.new) $Rdd32 = combine($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8202458, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11111101000;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let BaseOpcode = "A2_combinew";
+}
+def C2_ccombinewt : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pu4) $Rdd32 = combine($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_8202458, PredNewRel {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11111101000;
+let isPredicated = 1;
+let BaseOpcode = "A2_combinew";
+}
+def C2_cmoveif : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii),
+"if (!$Pu4) $Rd32 = #$Ii",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9487067, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b0;
+let Inst{20-20} = 0b0;
+let Inst{31-23} = 0b011111101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_tfr";
+let InputType = "imm";
+let BaseOpcode = "A2_tfrsi";
+let isMoveImm = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 0;
+}
+def C2_cmoveit : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii),
+"if ($Pu4) $Rd32 = #$Ii",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9487067, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b0;
+let Inst{20-20} = 0b0;
+let Inst{31-23} = 0b011111100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "A2_tfr";
+let InputType = "imm";
+let BaseOpcode = "A2_tfrsi";
+let isMoveImm = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 0;
+}
+def C2_cmovenewif : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii),
+"if (!$Pu4.new) $Rd32 = #$Ii",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9487067, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b1;
+let Inst{20-20} = 0b0;
+let Inst{31-23} = 0b011111101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_tfr";
+let InputType = "imm";
+let BaseOpcode = "A2_tfrsi";
+let isMoveImm = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 0;
+}
+def C2_cmovenewit : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii),
+"if ($Pu4.new) $Rd32 = #$Ii",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9487067, PredNewRel, ImmRegRel {
+let Inst{13-13} = 0b1;
+let Inst{20-20} = 0b0;
+let Inst{31-23} = 0b011111100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPredicatedNew = 1;
+let CextOpcode = "A2_tfr";
+let InputType = "imm";
+let BaseOpcode = "A2_tfrsi";
+let isMoveImm = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 0;
+}
+def C2_cmpeq : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmp.eq($Rs32,$Rt32)",
+ALU32_3op_tc_2early_SLOT0123, TypeALU32_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110010000;
+let CextOpcode = "C2_cmpeq";
+let InputType = "reg";
+let isCommutable = 1;
+let isCompare = 1;
+}
+def C2_cmpeqi : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Pd4 = cmp.eq($Rs32,#$Ii)",
+ALU32_2op_tc_2early_SLOT0123, TypeALU32_2op>, Enc_16014536, ImmRegRel {
+let Inst{4-2} = 0b000;
+let Inst{31-22} = 0b0111010100;
+let CextOpcode = "C2_cmpeq";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def C2_cmpeqp : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = cmp.eq($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010100;
+let isCommutable = 1;
+let isCompare = 1;
+}
+def C2_cmpgei : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s8_0Imm:$Ii),
+"$Pd4 = cmp.ge($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op> {
+let isCompare = 1;
+let isPseudo = 1;
+}
+def C2_cmpgeui : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u8_0Imm:$Ii),
+"$Pd4 = cmp.geu($Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op> {
+let isCompare = 1;
+let isPseudo = 1;
+}
+def C2_cmpgt : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmp.gt($Rs32,$Rt32)",
+ALU32_3op_tc_2early_SLOT0123, TypeALU32_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110010010;
+let CextOpcode = "C2_cmpgt";
+let InputType = "reg";
+let isCompare = 1;
+}
+def C2_cmpgti : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Pd4 = cmp.gt($Rs32,#$Ii)",
+ALU32_2op_tc_2early_SLOT0123, TypeALU32_2op>, Enc_16014536, ImmRegRel {
+let Inst{4-2} = 0b000;
+let Inst{31-22} = 0b0111010101;
+let CextOpcode = "C2_cmpgt";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def C2_cmpgtp : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = cmp.gt($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b010000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010100;
+let isCompare = 1;
+}
+def C2_cmpgtu : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmp.gtu($Rs32,$Rt32)",
+ALU32_3op_tc_2early_SLOT0123, TypeALU32_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110010011;
+let CextOpcode = "C2_cmpgtu";
+let InputType = "reg";
+let isCompare = 1;
+}
+def C2_cmpgtui : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Pd4 = cmp.gtu($Rs32,#$Ii)",
+ALU32_2op_tc_2early_SLOT0123, TypeALU32_2op>, Enc_13249928, ImmRegRel {
+let Inst{4-2} = 0b000;
+let Inst{31-21} = 0b01110101100;
+let CextOpcode = "C2_cmpgtu";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 0;
+}
+def C2_cmpgtup : HInst<
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = cmp.gtu($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744 {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010100;
+let isCompare = 1;
+}
+def C2_cmplt : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmp.lt($Rs32,$Rt32)",
+PSEUDO, TypeALU32_3op> {
+let isCompare = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def C2_cmpltu : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = cmp.ltu($Rs32,$Rt32)",
+PSEUDO, TypeALU32_3op> {
+let isCompare = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def C2_mask : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4),
+"$Rdd32 = mask($Pt4)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_10328975 {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b0000;
+let Inst{31-16} = 0b1000011000000000;
+}
+def C2_mux : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mux($Pu4,$Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_9626139 {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "reg";
+}
+def C2_muxii : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii, s8_0Imm:$II),
+"$Rd32 = mux($Pu4,#$Ii,#$II)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_9093094 {
+let Inst{31-25} = 0b0111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def C2_muxir : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = mux($Pu4,$Rs32,#$Ii)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_10568534 {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def C2_muxri : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii, IntRegs:$Rs32),
+"$Rd32 = mux($Pu4,#$Ii,$Rs32)",
+ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, Enc_10568534 {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def C2_not : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4),
+"$Pd4 = not($Ps4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_6975103 {
+let Inst{13-2} = 0b000000000000;
+let Inst{31-18} = 0b01101011110000;
+}
+def C2_or : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Pt4, PredRegs:$Ps4),
+"$Pd4 = or($Pt4,$Ps4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_8891794 {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011001000;
+}
+def C2_orn : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Pt4, PredRegs:$Ps4),
+"$Pd4 = or($Pt4,!$Ps4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_8891794 {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011111000;
+}
+def C2_pxfer_map : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4),
+"$Pd4 = $Ps4",
+S_2op_tc_1_SLOT23, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def C2_tfrpr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Ps4),
+"$Rd32 = $Ps4",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_11139981 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-18} = 0b10001001010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def C2_tfrrp : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32),
+"$Pd4 = $Rs32",
+S_2op_tc_2early_SLOT23, TypeS_2op>, Enc_4527648 {
+let Inst{13-2} = 0b000000000000;
+let Inst{31-21} = 0b10000101010;
+}
+def C2_vitpack : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Ps4, PredRegs:$Pt4),
+"$Rd32 = vitpack($Ps4,$Pt4)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_6735062 {
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b10001001000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def C2_vmux : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pu4, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmux($Pu4,$Rss32,$Rtt32)",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_7606379 {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010001000;
+}
+def C2_xor : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4),
+"$Pd4 = xor($Ps4,$Pt4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_8324216 {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011010000;
+}
+def C4_addipc : HInst<
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii),
+"$Rd32 = add(pc,#$Ii)",
+CR_tc_2_SLOT3, TypeCR>, Enc_9554661 {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0110101001001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def C4_and_and : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = and($Ps4,and($Pt4,$Pu4))",
+CR_tc_2early_SLOT23, TypeCR>, Enc_4631106 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011000100;
+}
+def C4_and_andn : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = and($Ps4,and($Pt4,!$Pu4))",
+CR_tc_2early_SLOT23, TypeCR>, Enc_4631106 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011100100;
+}
+def C4_and_or : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = and($Ps4,or($Pt4,$Pu4))",
+CR_tc_2early_SLOT23, TypeCR>, Enc_4631106 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011001100;
+}
+def C4_and_orn : HInst<
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = and($Ps4,or($Pt4,!$Pu4))",
+CR_tc_2early_SLOT23, TypeCR>, Enc_4631106 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011101100;
+}
+def C4_cmplte : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = !cmp.gt($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110010010;
+let CextOpcode = "C4_cmplte";
+let InputType = "reg";
+let isCompare = 1;
+}
+def C4_cmpltei : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Pd4 = !cmp.gt($Rs32,#$Ii)",
+ALU32_2op_tc_2early_SLOT0123, TypeALU32_2op>, Enc_16014536, ImmRegRel {
+let Inst{4-2} = 0b100;
+let Inst{31-22} = 0b0111010101;
+let CextOpcode = "C4_cmplte";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def C4_cmplteu : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = !cmp.gtu($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110010011;
+let CextOpcode = "C4_cmplteu";
+let InputType = "reg";
+let isCompare = 1;
+}
+def C4_cmplteui : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Pd4 = !cmp.gtu($Rs32,#$Ii)",
+ALU32_2op_tc_2early_SLOT0123, TypeALU32_2op>, Enc_13249928, ImmRegRel {
+let Inst{4-2} = 0b100;
+let Inst{31-21} = 0b01110101100;
+let CextOpcode = "C4_cmplteu";
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 0;
+}
+def C4_cmpneq : HInst<  // Register-register compare into a predicate; printed as the negated cmp.eq form.
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = !cmp.eq($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_10157519, ImmRegRel {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110010000;
+let CextOpcode = "C4_cmpneq";  // same CextOpcode string as the immediate form C4_cmpneqi
+let InputType = "reg";
+let isCommutable = 1;  // only this compare in the C4_cmp* group sets isCommutable
+let isCompare = 1;
+}
+def C4_cmpneqi : HInst<  // Register-immediate compare into a predicate; printed as the negated cmp.eq form.
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Pd4 = !cmp.eq($Rs32,#$Ii)",
+ALU32_2op_tc_2early_SLOT0123, TypeALU32_2op>, Enc_16014536, ImmRegRel {
+let Inst{4-2} = 0b100;
+let Inst{31-22} = 0b0111010100;
+let CextOpcode = "C4_cmpneq";  // same CextOpcode string as the register form C4_cmpneq
+let InputType = "imm";
+let isCompare = 1;
+let isExtendable = 1;
+let opExtendable = 2;  // presumably selects $Ii with outs counted first - TODO confirm
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def C4_fastcorner9 : HInst<  // Predicate result computed from two predicate inputs via fastcorner9.
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4),
+"$Pd4 = fastcorner9($Ps4,$Pt4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_8324216 {
+let Inst{7-2} = 0b100100;
+let Inst{13-10} = 0b1000;
+let Inst{31-18} = 0b01101011000000;
+}
+def C4_fastcorner9_not : HInst<  // Negated-result variant of fastcorner9 (asm prints "!fastcorner9").
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4),
+"$Pd4 = !fastcorner9($Ps4,$Pt4)",
+CR_tc_2early_SLOT23, TypeCR>, Enc_8324216 {
+let Inst{7-2} = 0b100100;
+let Inst{13-10} = 0b1000;
+let Inst{31-18} = 0b01101011000100;  // only this field differs from C4_fastcorner9
+}
+def C4_nbitsclr : HInst<  // Predicate result from the negated bitsclr of two integer registers.
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = !bitsclr($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111101;
+}
+def C4_nbitsclri : HInst<  // Predicate result from the negated bitsclr of a register and a u6 immediate.
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u6_0Imm:$Ii),
+"$Pd4 = !bitsclr($Rs32,#$Ii)",
+S_2op_tc_2early_SLOT23, TypeS_2op>, Enc_14574598 {
+let Inst{7-2} = 0b000000;
+let Inst{31-21} = 0b10000101101;
+}
+def C4_nbitsset : HInst<  // Predicate result from the negated bitsset of two integer registers.
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = !bitsset($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111011;
+}
+def C4_or_and : HInst<  // Three-input predicate compound: Ps OR (Pt AND Pu).
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = or($Ps4,and($Pt4,$Pu4))",
+CR_tc_2early_SLOT23, TypeCR>, Enc_4631106 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011010100;
+}
+def C4_or_andn : HInst<  // Three-input predicate compound: Ps OR (Pt AND NOT Pu).
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = or($Ps4,and($Pt4,!$Pu4))",
+CR_tc_2early_SLOT23, TypeCR>, Enc_4631106 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011110100;
+}
+def C4_or_or : HInst<  // Three-input predicate compound: Ps OR (Pt OR Pu).
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = or($Ps4,or($Pt4,$Pu4))",
+CR_tc_2early_SLOT23, TypeCR>, Enc_4631106 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011011100;
+}
+def C4_or_orn : HInst<  // Three-input predicate compound: Ps OR (Pt OR NOT Pu).
+(outs PredRegs:$Pd4),
+(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
+"$Pd4 = or($Ps4,or($Pt4,!$Pu4))",
+CR_tc_2early_SLOT23, TypeCR>, Enc_4631106 {
+let Inst{5-2} = 0b0000;
+let Inst{13-10} = 0b0000;
+let Inst{31-18} = 0b01101011111100;
+}
+def F2_conv_d2df : HInst<  // convert_d2df: DoubleRegs source to DoubleRegs result; gated on HasV5T.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = convert_d2df($Rss32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_13133231, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000011;
+let Inst{31-21} = 0b10000000111;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_d2sf : HInst<  // convert_d2sf: DoubleRegs source to IntRegs result; gated on HasV5T.
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_d2sf($Rss32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_3742184, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000010;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_df2d : HInst<  // convert_df2d: DoubleRegs source to DoubleRegs result; gated on HasV5T.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = convert_df2d($Rss32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_13133231, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10000000111;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_df2d_chop : HInst<  // ":chop"-suffixed variant of convert_df2d; gated on HasV5T.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = convert_df2d($Rss32):chop",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_13133231, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000110;  // F2_conv_df2d uses 0b000000000 with the same Inst{31-21}
+let Inst{31-21} = 0b10000000111;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_df2sf : HInst<  // convert_df2sf: DoubleRegs source to IntRegs result; gated on HasV5T.
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_df2sf($Rss32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_3742184, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000000;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_df2ud : HInst<  // convert_df2ud: DoubleRegs source to DoubleRegs result; gated on HasV5T.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = convert_df2ud($Rss32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_13133231, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10000000111;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_df2ud_chop : HInst<  // ":chop"-suffixed variant of convert_df2ud; gated on HasV5T.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = convert_df2ud($Rss32):chop",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_13133231, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000111;  // F2_conv_df2ud uses 0b000000001 with the same Inst{31-21}
+let Inst{31-21} = 0b10000000111;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_df2uw : HInst<  // convert_df2uw: DoubleRegs source to IntRegs result; gated on HasV5T.
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_df2uw($Rss32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_3742184, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000011;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_df2uw_chop : HInst<  // ":chop"-suffixed variant of convert_df2uw; gated on HasV5T.
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_df2uw($Rss32):chop",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_3742184, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000101;  // F2_conv_df2uw uses 0b10001000011 with the same Inst{13-5}
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_df2w : HInst<  // convert_df2w: DoubleRegs source to IntRegs result; gated on HasV5T.
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_df2w($Rss32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_3742184, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000100;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_df2w_chop : HInst<  // ":chop"-suffixed variant of convert_df2w; gated on HasV5T.
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_df2w($Rss32):chop",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_3742184, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000111;  // F2_conv_df2w uses 0b10001000100 with the same Inst{13-5}
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_sf2d : HInst<  // convert_sf2d: IntRegs source to DoubleRegs result; gated on HasV5T.
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_sf2d($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4030179, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_sf2d_chop : HInst<  // ":chop"-suffixed variant of convert_sf2d; gated on HasV5T.
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_sf2d($Rs32):chop",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4030179, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000110;  // F2_conv_sf2d uses 0b000000100 with the same Inst{31-21}
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_sf2df : HInst<  // convert_sf2df: IntRegs source to DoubleRegs result; gated on HasV5T.
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_sf2df($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4030179, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_sf2ud : HInst<  // convert_sf2ud: IntRegs source to DoubleRegs result; gated on HasV5T.
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_sf2ud($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4030179, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000011;
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_sf2ud_chop : HInst<  // ":chop"-suffixed variant of convert_sf2ud; gated on HasV5T.
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_sf2ud($Rs32):chop",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4030179, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000101;  // F2_conv_sf2ud uses 0b000000011 with the same Inst{31-21}
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_sf2uw : HInst<  // convert_sf2uw: IntRegs source to IntRegs result; gated on HasV5T.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = convert_sf2uw($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4075554, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001011011;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_sf2uw_chop : HInst<  // ":chop"-suffixed variant of convert_sf2uw; gated on HasV5T.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = convert_sf2uw($Rs32):chop",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4075554, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;  // F2_conv_sf2uw uses 0b000000000 with the same Inst{31-21}
+let Inst{31-21} = 0b10001011011;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_sf2w : HInst<  // convert_sf2w: IntRegs source to IntRegs result; gated on HasV5T.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = convert_sf2w($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4075554, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001011100;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_sf2w_chop : HInst<  // ":chop"-suffixed variant of convert_sf2w; gated on HasV5T.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = convert_sf2w($Rs32):chop",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4075554, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;  // F2_conv_sf2w uses 0b000000000 with the same Inst{31-21}
+let Inst{31-21} = 0b10001011100;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_ud2df : HInst<  // convert_ud2df: DoubleRegs source to DoubleRegs result; gated on HasV5T.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = convert_ud2df($Rss32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_13133231, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10000000111;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_ud2sf : HInst<  // convert_ud2sf: DoubleRegs source to IntRegs result; gated on HasV5T.
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = convert_ud2sf($Rss32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_3742184, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10001000001;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_uw2df : HInst<  // convert_uw2df: IntRegs source to DoubleRegs result; gated on HasV5T.
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_uw2df($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4030179, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_uw2sf : HInst<  // convert_uw2sf: IntRegs source to IntRegs result; gated on HasV5T.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = convert_uw2sf($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4075554, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001011001;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_w2df : HInst<  // convert_w2df: IntRegs source to DoubleRegs result; gated on HasV5T.
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = convert_w2df($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4030179, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10000100100;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_conv_w2sf : HInst<  // convert_w2sf: IntRegs source to IntRegs result; gated on HasV5T.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = convert_w2sf($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4075554, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001011010;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_dfclass : HInst<  // dfclass: predicate result from a DoubleRegs value and a u5 immediate; gated on HasV5T.
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
+"$Pd4 = dfclass($Rss32,#$Ii)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_14400220, Requires<[HasV5T]> {
+let Inst{4-2} = 0b100;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b11011100100;
+let isFP = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_dfcmpeq : HInst<  // dfcmp.eq: compare two DoubleRegs operands into a predicate; gated on HasV5T.
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = dfcmp.eq($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744, Requires<[HasV5T]> {
+let Inst{7-2} = 0b000000;  // the dfcmp.* records here share Inst{31-21} and differ in this field
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010111;
+let isFP = 1;
+let Uses = [USR];  // USR is listed as a used register
+let isCompare = 1;
+}
+def F2_dfcmpge : HInst<  // dfcmp.ge: compare two DoubleRegs operands into a predicate; gated on HasV5T.
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = dfcmp.ge($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744, Requires<[HasV5T]> {
+let Inst{7-2} = 0b010000;  // the dfcmp.* records here share Inst{31-21} and differ in this field
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010111;
+let isFP = 1;
+let Uses = [USR];  // USR is listed as a used register
+let isCompare = 1;
+}
+def F2_dfcmpgt : HInst<  // dfcmp.gt: compare two DoubleRegs operands into a predicate; gated on HasV5T.
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = dfcmp.gt($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744, Requires<[HasV5T]> {
+let Inst{7-2} = 0b001000;  // the dfcmp.* records here share Inst{31-21} and differ in this field
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010111;
+let isFP = 1;
+let Uses = [USR];  // USR is listed as a used register
+let isCompare = 1;
+}
+def F2_dfcmpuo : HInst<  // dfcmp.uo: compare two DoubleRegs operands into a predicate; gated on HasV5T.
+(outs PredRegs:$Pd4),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Pd4 = dfcmp.uo($Rss32,$Rtt32)",
+ALU64_tc_2early_SLOT23, TypeALU64>, Enc_3831744, Requires<[HasV5T]> {
+let Inst{7-2} = 0b011000;  // the dfcmp.* records here share Inst{31-21} and differ in this field
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010010111;
+let isFP = 1;
+let Uses = [USR];  // USR is listed as a used register
+let isCompare = 1;
+}
+def F2_dfimm_n : HInst<  // dfmake with the ":neg" suffix: DoubleRegs result from a u10 immediate; gated on HasV5T.
+(outs DoubleRegs:$Rdd32),
+(ins u10_0Imm:$Ii),
+"$Rdd32 = dfmake(#$Ii):neg",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_2702036, Requires<[HasV5T]> {
+let Inst{20-16} = 0b00000;
+let Inst{31-22} = 0b1101100101;  // F2_dfimm_p uses 0b1101100100
+let prefersSlot3 = 1;
+}
+def F2_dfimm_p : HInst<  // dfmake with the ":pos" suffix: DoubleRegs result from a u10 immediate; gated on HasV5T.
+(outs DoubleRegs:$Rdd32),
+(ins u10_0Imm:$Ii),
+"$Rdd32 = dfmake(#$Ii):pos",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_2702036, Requires<[HasV5T]> {
+let Inst{20-16} = 0b00000;
+let Inst{31-22} = 0b1101100100;  // F2_dfimm_n uses 0b1101100101
+let prefersSlot3 = 1;
+}
+def F2_sfadd : HInst<  // sfadd of two IntRegs operands into an IntRegs result; gated on HasV5T.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sfadd($Rs32,$Rt32)",
+M_tc_3or4x_SLOT23, TypeM>, Enc_14071773, Requires<[HasV5T]> {
+let Inst{7-5} = 0b000;  // F2_sfsub shares Inst{31-21} and uses 0b001 here
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011000;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+let isCommutable = 1;
+}
+def F2_sfclass : HInst<  // sfclass: predicate result from an IntRegs value and a u5 immediate; gated on HasV5T.
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Pd4 = sfclass($Rs32,#$Ii)",
+S_2op_tc_2early_SLOT23, TypeS_2op>, Enc_2103742, Requires<[HasV5T]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10000101111;
+let isFP = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_sfcmpeq : HInst<  // sfcmp.eq: compare two IntRegs operands into a predicate; gated on HasV5T.
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = sfcmp.eq($Rs32,$Rt32)",
+ALU64_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, Requires<[HasV5T]> {
+let Inst{7-2} = 0b011000;  // the sfcmp.* records here share Inst{31-21} and differ in this field
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111111;
+let isFP = 1;
+let Uses = [USR];  // USR is listed as a used register
+let isCompare = 1;
+}
+def F2_sfcmpge : HInst<  // sfcmp.ge: compare two IntRegs operands into a predicate; gated on HasV5T.
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = sfcmp.ge($Rs32,$Rt32)",
+ALU64_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, Requires<[HasV5T]> {
+let Inst{7-2} = 0b000000;  // the sfcmp.* records here share Inst{31-21} and differ in this field
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111111;
+let isFP = 1;
+let Uses = [USR];  // USR is listed as a used register
+let isCompare = 1;
+}
+def F2_sfcmpgt : HInst<  // sfcmp.gt: compare two IntRegs operands into a predicate; gated on HasV5T.
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = sfcmp.gt($Rs32,$Rt32)",
+ALU64_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, Requires<[HasV5T]> {
+let Inst{7-2} = 0b100000;  // the sfcmp.* records here share Inst{31-21} and differ in this field
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111111;
+let isFP = 1;
+let Uses = [USR];  // USR is listed as a used register
+let isCompare = 1;
+}
+def F2_sfcmpuo : HInst<  // sfcmp.uo: compare two IntRegs operands into a predicate; gated on HasV5T.
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = sfcmp.uo($Rs32,$Rt32)",
+ALU64_tc_2early_SLOT23, TypeS_3op>, Enc_10157519, Requires<[HasV5T]> {
+let Inst{7-2} = 0b001000;  // the sfcmp.* records here share Inst{31-21} and differ in this field
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111111;
+let isFP = 1;
+let Uses = [USR];  // USR is listed as a used register
+let isCompare = 1;
+}
+def F2_sffixupd : HInst<  // sffixupd of two IntRegs operands into an IntRegs result; gated on HasV5T. No Uses list is set.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sffixupd($Rs32,$Rt32)",
+M_tc_3or4x_SLOT23, TypeM>, Enc_14071773, Requires<[HasV5T]> {
+let Inst{7-5} = 0b001;  // F2_sffixupn shares Inst{31-21} and uses 0b000 here
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011110;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+}
+def F2_sffixupn : HInst<  // sffixupn of two IntRegs operands into an IntRegs result; gated on HasV5T. No Uses list is set.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sffixupn($Rs32,$Rt32)",
+M_tc_3or4x_SLOT23, TypeM>, Enc_14071773, Requires<[HasV5T]> {
+let Inst{7-5} = 0b000;  // F2_sffixupd shares Inst{31-21} and uses 0b001 here
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011110;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+}
+def F2_sffixupr : HInst<  // sffixupr of one IntRegs operand into an IntRegs result; gated on HasV5T. No Uses list is set.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = sffixupr($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_4075554, Requires<[HasV5T]> {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001011101;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+}
+def F2_sffma : HInst<  // Accumulating form "$Rx32 += sfmpy(...)"; gated on HasV5T.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += sfmpy($Rs32,$Rt32)",
+M_tc_3or4x_acc_SLOT23, TypeM>, Enc_9223889, Requires<[HasV5T]> {
+let Inst{7-5} = 0b100;  // sffma/sffma_lib/sffms/sffms_lib share Inst{31-21} and differ in this field
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rx32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+let Constraints = "$Rx32 = $Rx32in";  // ties the accumulator input to the output operand
+}
+def F2_sffma_lib : HInst<  // ":lib"-suffixed variant of the accumulating "$Rx32 += sfmpy(...)"; gated on HasV5T.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += sfmpy($Rs32,$Rt32):lib",
+M_tc_3or4x_acc_SLOT23, TypeM>, Enc_9223889, Requires<[HasV5T]> {
+let Inst{7-5} = 0b110;  // sffma/sffma_lib/sffms/sffms_lib share Inst{31-21} and differ in this field
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rx32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+let Constraints = "$Rx32 = $Rx32in";  // ties the accumulator input to the output operand
+}
+def F2_sffma_sc : HInst<  // ":scale" variant of "$Rx32 += sfmpy(...)" taking an extra predicate operand $Pu4; gated on HasV5T.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32, PredRegs:$Pu4),
+"$Rx32 += sfmpy($Rs32,$Rt32,$Pu4):scale",
+M_tc_3or4x_acc_SLOT23, TypeM>, Enc_15194851, Requires<[HasV5T]> {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111011;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rx32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+let Constraints = "$Rx32 = $Rx32in";  // ties the accumulator input to the output operand
+}
+def F2_sffms : HInst<  // Subtracting-accumulate form "$Rx32 -= sfmpy(...)"; gated on HasV5T.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= sfmpy($Rs32,$Rt32)",
+M_tc_3or4x_acc_SLOT23, TypeM>, Enc_9223889, Requires<[HasV5T]> {
+let Inst{7-5} = 0b101;  // sffma/sffma_lib/sffms/sffms_lib share Inst{31-21} and differ in this field
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rx32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+let Constraints = "$Rx32 = $Rx32in";  // ties the accumulator input to the output operand
+}
+def F2_sffms_lib : HInst<  // ":lib"-suffixed variant of "$Rx32 -= sfmpy(...)"; gated on HasV5T.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= sfmpy($Rs32,$Rt32):lib",
+M_tc_3or4x_acc_SLOT23, TypeM>, Enc_9223889, Requires<[HasV5T]> {
+let Inst{7-5} = 0b111;  // sffma/sffma_lib/sffms/sffms_lib share Inst{31-21} and differ in this field
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rx32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+let Constraints = "$Rx32 = $Rx32in";  // ties the accumulator input to the output operand
+}
+def F2_sfimm_n : HInst<  // sfmake with the ":neg" suffix: IntRegs result from a u10 immediate; gated on HasV5T.
+(outs IntRegs:$Rd32),
+(ins u10_0Imm:$Ii),
+"$Rd32 = sfmake(#$Ii):neg",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_9082775, Requires<[HasV5T]> {
+let Inst{20-16} = 0b00000;
+let Inst{31-22} = 0b1101011001;  // F2_sfimm_p uses 0b1101011000
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let prefersSlot3 = 1;
+}
+def F2_sfimm_p : HInst<  // sfmake with the ":pos" suffix: IntRegs result from a u10 immediate; gated on HasV5T.
+(outs IntRegs:$Rd32),
+(ins u10_0Imm:$Ii),
+"$Rd32 = sfmake(#$Ii):pos",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_9082775, Requires<[HasV5T]> {
+let Inst{20-16} = 0b00000;
+let Inst{31-22} = 0b1101011000;  // F2_sfimm_n uses 0b1101011001
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let prefersSlot3 = 1;
+}
+def F2_sfinvsqrta : HInst<  // sfinvsqrta: produces both an IntRegs result and a predicate result; gated on HasV5T.
+(outs IntRegs:$Rd32, PredRegs:$Pe4),
+(ins IntRegs:$Rs32),
+"$Rd32,$Pe4 = sfinvsqrta($Rs32)",
+S_2op_tc_3or4x_SLOT23, TypeS_2op>, Enc_5718302, Requires<[HasV5T]> {
+let Inst{13-7} = 0b0000000;
+let Inst{31-21} = 0b10001011111;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0 is $Rd32, the first of the two outputs
+let isFP = 1;
+let isPredicateLate = 1;  // set only on the records that also output $Pe4 (this one and F2_sfrecipa)
+let prefersSlot3 = 1;
+}
+def F2_sfmax : HInst<  // sfmax of two IntRegs operands into an IntRegs result; gated on HasV5T.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sfmax($Rs32,$Rt32)",
+M_tc_2_SLOT23, TypeM>, Enc_14071773, Requires<[HasV5T]> {
+let Inst{7-5} = 0b000;  // F2_sfmin shares Inst{31-21} and uses 0b001 here
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011100;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_sfmin : HInst<  // sfmin of two IntRegs operands into an IntRegs result; gated on HasV5T.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sfmin($Rs32,$Rt32)",
+M_tc_2_SLOT23, TypeM>, Enc_14071773, Requires<[HasV5T]> {
+let Inst{7-5} = 0b001;  // F2_sfmax shares Inst{31-21} and uses 0b000 here
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011100;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def F2_sfmpy : HInst<  // sfmpy of two IntRegs operands into an IntRegs result; gated on HasV5T.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sfmpy($Rs32,$Rt32)",
+M_tc_3or4x_SLOT23, TypeM>, Enc_14071773, Requires<[HasV5T]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011010;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+let isCommutable = 1;
+}
+def F2_sfrecipa : HInst<  // sfrecipa: produces both an IntRegs result and a predicate result; gated on HasV5T.
+(outs IntRegs:$Rd32, PredRegs:$Pe4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32,$Pe4 = sfrecipa($Rs32,$Rt32)",
+M_tc_3or4x_SLOT23, TypeM>, Enc_5853469, Requires<[HasV5T]> {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011111;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0 is $Rd32, the first of the two outputs
+let isFP = 1;
+let isPredicateLate = 1;  // set only on the records that also output $Pe4 (this one and F2_sfinvsqrta)
+let prefersSlot3 = 1;
+}
+def F2_sfsub : HInst<  // sfsub of two IntRegs operands into an IntRegs result; gated on HasV5T. Not marked commutable.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = sfsub($Rs32,$Rt32)",
+M_tc_3or4x_SLOT23, TypeM>, Enc_14071773, Requires<[HasV5T]> {
+let Inst{7-5} = 0b001;  // F2_sfadd shares Inst{31-21} and uses 0b000 here
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101011000;
+let hasNewValue = 1;
+let opNewValue = 0;  // index 0; $Rd32 is the only output operand
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];  // USR is listed as a used register
+}
+def J2_call : HInst<  // Unconditional direct call to an immediate target; predicable base of the J2_call group.
+(outs),
+(ins a30_2Imm:$Ii),
+"call $Ii",
+J_tc_2early_SLOT23, TypeJ>, Enc_13453446, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{31-25} = 0b0101101;
+let isCall = 1;
+let prefersSlot3 = 1;
+let Uses = [R29];
+let Defs = [PC, R31];  // PC and R31 are listed as defined registers
+let BaseOpcode = "J2_call";  // J2_callt and J2_callf use the same BaseOpcode string
+let isPredicable = 1;
+let hasSideEffects = 1;
+let isExtendable = 1;
+let opExtendable = 0;  // $Ii is the only operand
+let isExtentSigned = 1;
+let opExtentBits = 24;
+let opExtentAlign = 2;
+}
+def J2_callf : HInst<  // Direct call predicated on $Pu4 being false ("if (!$Pu4) call").
+(outs),
+(ins PredRegs:$Pu4, a30_2Imm:$Ii),
+"if (!$Pu4) call $Ii",
+J_tc_2early_SLOT23, TypeJ>, Enc_14868535, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b000;
+let Inst{21-21} = 0b1;  // J2_callt has 0b0 here; the other fixed bits match
+let Inst{31-24} = 0b01011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isCall = 1;
+let prefersSlot3 = 1;
+let Uses = [R29];
+let Defs = [PC, R31];  // PC and R31 are listed as defined registers
+let BaseOpcode = "J2_call";
+let hasSideEffects = 1;
+let isTaken = Inst{12};  // bit 12 is fixed to 0 by Inst{12-10} above
+let isExtendable = 1;
+let opExtendable = 1;  // index 1 is $Ii in the ins list (there are no outs)
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_callr : HInst<  // Unconditional indirect call through register $Rs32.
+(outs),
+(ins IntRegs:$Rs32),
+"callr $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_11704059 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b01010000101;
+let cofMax1 = 1;
+let isCall = 1;
+let prefersSlot3 = 1;
+let Uses = [R29];
+let Defs = [PC, R31];  // PC and R31 are listed as defined registers
+let hasSideEffects = 1;
+}
+def J2_callrf : HInst<  // Indirect call through $Rs32 predicated on $Pu4 being false.
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) callr $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953 {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b01010001001;  // J2_callrt uses 0b01010001000
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let cofMax1 = 1;
+let isCall = 1;
+let prefersSlot3 = 1;
+let Uses = [R29];
+let Defs = [PC, R31];  // PC and R31 are listed as defined registers
+let hasSideEffects = 1;
+let isTaken = Inst{12};  // bit 12 is fixed to 0 by Inst{13-10} above
+}
+def J2_callrt : HInst<  // Indirect call through $Rs32 predicated on $Pu4 being true.
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) callr $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953 {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b01010001000;  // J2_callrf uses 0b01010001001
+let isPredicated = 1;
+let cofMax1 = 1;
+let isCall = 1;
+let prefersSlot3 = 1;
+let Uses = [R29];
+let Defs = [PC, R31];  // PC and R31 are listed as defined registers
+let hasSideEffects = 1;
+let isTaken = Inst{12};  // bit 12 is fixed to 0 by Inst{13-10} above
+}
+def J2_callt : HInst<  // Direct call predicated on $Pu4 being true ("if ($Pu4) call").
+(outs),
+(ins PredRegs:$Pu4, a30_2Imm:$Ii),
+"if ($Pu4) call $Ii",
+J_tc_2early_SLOT23, TypeJ>, Enc_14868535, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b000;
+let Inst{21-21} = 0b0;  // J2_callf has 0b1 here; the other fixed bits match
+let Inst{31-24} = 0b01011101;
+let isPredicated = 1;
+let isCall = 1;
+let prefersSlot3 = 1;
+let Uses = [R29];
+let Defs = [PC, R31];  // PC and R31 are listed as defined registers
+let BaseOpcode = "J2_call";
+let hasSideEffects = 1;
+let isTaken = Inst{12};  // bit 12 is fixed to 0 by Inst{12-10} above
+let isExtendable = 1;
+let opExtendable = 1;  // index 1 is $Ii in the ins list (there are no outs)
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_endloop0 : HInst<  // Pseudo "endloop0": no operands and no encoding bits, only register Uses/Defs.
+(outs),
+(ins),
+"endloop0",
+PSEUDO, TypeJ> {
+let Uses = [LC0, SA0];
+let Defs = [LC0, P3, PC, USR];
+let isPseudo = 1;
+}
+def J2_endloop01 : HInst<  // Pseudo "endloop01": Uses/Defs cover both the LC0/SA0 and LC1/SA1 register sets.
+(outs),
+(ins),
+"endloop01",
+PSEUDO, TypeJ> {
+let Uses = [LC0, LC1, SA0, SA1];
+let Defs = [LC0, LC1, P3, PC, USR];
+let isPseudo = 1;
+}
+def J2_endloop1 : HInst<  // Pseudo "endloop1": unlike J2_endloop0, Defs does not list P3 or USR.
+(outs),
+(ins),
+"endloop1",
+PSEUDO, TypeJ> {
+let Uses = [LC1, SA1];
+let Defs = [LC1, PC];
+let isPseudo = 1;
+}
+def J2_jump : HInst<  // Unconditional direct jump to an immediate target; predicable base of the J2_jump group.
+(outs),
+(ins b30_2Imm:$Ii),
+"jump $Ii",
+J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT, TypeJ>, Enc_13453446, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{31-25} = 0b0101100;
+let isTerminator = 1;
+let isBranch = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";  // the J2_jumpf*/J2_jumpt* records use the same BaseOpcode string
+let isBarrier = 1;  // set only on the unconditional forms (J2_jump, J2_jumpr) in this group
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 0;  // $Ii is the only operand
+let isExtentSigned = 1;
+let opExtentBits = 24;
+let opExtentAlign = 2;
+}
+def J2_jumpf : HInst<  // Direct jump predicated on $Pu4 being false, with the ":nt" hint.
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if (!$Pu4) jump:nt $Ii",
+J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT, TypeJ>, Enc_14868535, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b000;
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};  // bit 12 is fixed to 0 above, matching ":nt" in the asm string
+let isExtendable = 1;
+let opExtendable = 1;  // index 1 is $Ii in the ins list (there are no outs)
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_jumpf_nopred_map : HInst<  // Mapping pseudo for "if (!$Pu4) jump" written without a :t/:nt hint; gated on HasV60T.
+(outs),
+(ins PredRegs:$Pu4, b15_2Imm:$Ii),
+"if (!$Pu4) jump $Ii",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def J2_jumpfnew : HInst<  // Direct jump predicated on $Pu4.new being false, with the ":nt" hint.
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if (!$Pu4.new) jump:nt $Ii",
+J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT, TypeJ>, Enc_14868535, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b010;  // J2_jumpf (no ".new") uses 0b000 here
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};  // bit 12 is fixed to 0 above, matching ":nt" in the asm string
+let isExtendable = 1;
+let opExtendable = 1;  // index 1 is $Ii in the ins list (there are no outs)
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_jumpfnewpt : HInst<  // Direct jump predicated on $Pu4.new being false, with the ":t" hint.
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if (!$Pu4.new) jump:t $Ii",
+J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT, TypeJ>, Enc_14868535, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b110;  // J2_jumpfnew (":nt") uses 0b010 here
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};  // bit 12 is fixed to 1 above, matching ":t" in the asm string
+let isExtendable = 1;
+let opExtendable = 1;  // index 1 is $Ii in the ins list (there are no outs)
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_jumpfpt : HInst<  // Direct jump predicated on $Pu4 being false, with the ":t" hint; gated on HasV60T.
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if (!$Pu4) jump:t $Ii",
+J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT, TypeJ>, Enc_14868535, Requires<[HasV60T]>, PredNewRel {
+let Inst{0-0} = 0b0;
+let Inst{12-10} = 0b100;  // J2_jumpf (":nt") uses 0b000 here
+let Inst{21-21} = 0b1;
+let Inst{31-24} = 0b01011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let Defs = [PC];
+let InputType = "imm";
+let BaseOpcode = "J2_jump";
+let isTaken = Inst{12};  // bit 12 is fixed to 1 above, matching ":t" in the asm string
+let isExtendable = 1;
+let opExtendable = 1;  // index 1 is $Ii in the ins list (there are no outs)
+let isExtentSigned = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+}
+def J2_jumpr : HInst<  // Unconditional indirect jump through register $Rs32; predicable base of the J2_jumpr group.
+(outs),
+(ins IntRegs:$Rs32),
+"jumpr $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_11704059, PredNewRel {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b01010010100;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";  // the J2_jumprf*/J2_jumprt* records use the same BaseOpcode string
+let isBarrier = 1;  // set only on the unconditional forms (J2_jump, J2_jumpr) in this group
+let isPredicable = 1;
+}
+def J2_jumprf : HInst<  // Indirect jump through $Rs32 predicated on $Pu4 being false, with the ":nt" hint.
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) jumpr:nt $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b01010011011;  // the predicated-true J2_jumprt uses 0b01010011010
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};  // bit 12 is fixed to 0 above, matching ":nt" in the asm string
+}
+def J2_jumprf_nopred_map : HInst<  // Mapping pseudo for "if (!$Pu4) jumpr" written without a :t/:nt hint; gated on HasV60T.
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) jumpr $Rs32",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def J2_jumprfnew : HInst<  // Indirect jump through $Rs32 predicated on $Pu4.new being false, with the ":nt" hint.
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) jumpr:nt $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0010;  // J2_jumprf (no ".new") uses 0b0000 here
+let Inst{31-21} = 0b01010011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};  // bit 12 is fixed to 0 above, matching ":nt" in the asm string
+}
+def J2_jumprfnewpt : HInst<  // Indirect jump through $Rs32 predicated on $Pu4.new being false, with the ":t" hint.
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4.new) jumpr:t $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0110;  // J2_jumprfnew (":nt") uses 0b0010 here
+let Inst{31-21} = 0b01010011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};  // bit 12 is fixed to 1 above, matching ":t" in the asm string
+}
+def J2_jumprfpt : HInst<  // Indirect jump through $Rs32 predicated on $Pu4 being false, with the ":t" hint; gated on HasV60T.
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if (!$Pu4) jumpr:t $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953, Requires<[HasV60T]>, PredNewRel {
+let Inst{7-0} = 0b00000000;
+let Inst{13-10} = 0b0100;  // J2_jumprf (":nt") uses 0b0000 here
+let Inst{31-21} = 0b01010011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let Defs = [PC];
+let InputType = "reg";
+let BaseOpcode = "J2_jumpr";
+let isTaken = Inst{12};  // bit 12 is fixed to 1 above, matching ":t" in the asm string
+}
+def J2_jumprgtez : HInst<  // Direct jump taken when $Rs32>=#0, with the ":nt" hint; the condition is a register test, not a predicate operand.
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32>=#0) jump:nt $Ii",
+CR_tc_2early_SLOT3, TypeCR>, Enc_12477789 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b0;
+let Inst{31-22} = 0b0110000101;
+let isPredicated = 1;  // flagged predicated even though no PredRegs operand is listed
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};  // bit 12 is fixed to 0 above, matching ":nt" in the asm string
+}
+def J2_jumprgtezpt : HInst<  // Direct jump taken when $Rs32>=#0, with the ":t" hint; the condition is a register test, not a predicate operand.
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32>=#0) jump:t $Ii",
+CR_tc_2early_SLOT3, TypeCR>, Enc_12477789 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b1;
+let Inst{31-22} = 0b0110000101;
+let isPredicated = 1;  // flagged predicated even though no PredRegs operand is listed
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};  // bit 12 is fixed to 1 above, matching ":t" in the asm string
+}
+def J2_jumprltez : HInst<  // Direct jump taken when $Rs32<=#0, with the ":nt" hint; the condition is a register test, not a predicate operand.
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32<=#0) jump:nt $Ii",
+CR_tc_2early_SLOT3, TypeCR>, Enc_12477789 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b0;
+let Inst{31-22} = 0b0110000111;
+let isPredicated = 1;  // flagged predicated even though no PredRegs operand is listed
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};  // bit 12 is fixed to 0 above, matching ":nt" in the asm string
+}
+def J2_jumprltezpt : HInst<  // Direct jump taken when $Rs32<=#0, with the ":t" hint; the condition is a register test, not a predicate operand.
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32<=#0) jump:t $Ii",
+CR_tc_2early_SLOT3, TypeCR>, Enc_12477789 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b1;
+let Inst{31-22} = 0b0110000111;
+let isPredicated = 1;  // flagged predicated even though no PredRegs operand is listed
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};  // bit 12 is fixed to 1 above, matching ":t" in the asm string
+}
+def J2_jumprnz : HInst<  // Direct jump taken when $Rs32==#0, with the ":nt" hint. NOTE(review): the name ends in "nz" but the asm tests ==#0 - confirm against the ISA manual.
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32==#0) jump:nt $Ii",
+CR_tc_2early_SLOT3, TypeCR>, Enc_12477789 {
+let Inst{0-0} = 0b0;
+let Inst{12-12} = 0b0;
+let Inst{31-22} = 0b0110000110;
+let isPredicated = 1;  // flagged predicated even though no PredRegs operand is listed
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Defs = [PC];
+let isTaken = Inst{12};  // bit 12 is fixed to 0 above, matching ":nt" in the asm string
+}
+def J2_jumprnzpt : HInst< // jump:t to $Ii when $Rs32 == 0 (per asm string). NOTE(review): name suffix is nz -- confirm.
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32==#0) jump:t $Ii",
+CR_tc_2early_SLOT3, TypeCR>, Enc_12477789 {
+let Defs = [PC];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let Inst{31-22} = 0b0110000110;
+let Inst{12-12} = 0b1;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{12};
+}
+def J2_jumprt : HInst< // jumpr:nt through $Rs32 when $Pu4 holds; Inst{12} is 0.
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) jumpr:nt $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953, PredNewRel {
+let BaseOpcode = "J2_jumpr";
+let InputType = "reg";
+let Defs = [PC];
+let isBranch = 1;
+let isIndirectBranch = 1;
+let isTerminator = 1;
+let cofMax1 = 1;
+let isPredicated = 1;
+let Inst{31-21} = 0b01010011010;
+let Inst{13-10} = 0b0000;
+let Inst{7-0} = 0b00000000;
+let isTaken = Inst{12};
+}
+def J2_jumprt_nopred_map : HInst< // Pseudo (TypeMAPPING) for the hint-less spelling of the predicated jumpr; sets no Inst bits.
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) jumpr $Rs32",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> {
+let isCodeGenOnly = 1;
+let isPseudo = 1;
+}
+def J2_jumprtnew : HInst< // jumpr:nt through $Rs32 when $Pu4.new holds; Inst{12} is 0.
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) jumpr:nt $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953, PredNewRel {
+let BaseOpcode = "J2_jumpr";
+let InputType = "reg";
+let Defs = [PC];
+let isBranch = 1;
+let isIndirectBranch = 1;
+let isTerminator = 1;
+let cofMax1 = 1;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let Inst{31-21} = 0b01010011010;
+let Inst{13-10} = 0b0010;
+let Inst{7-0} = 0b00000000;
+let isTaken = Inst{12};
+}
+def J2_jumprtnewpt : HInst< // jumpr:t through $Rs32 when $Pu4.new holds; Inst{12} is 1.
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4.new) jumpr:t $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953, PredNewRel {
+let BaseOpcode = "J2_jumpr";
+let InputType = "reg";
+let Defs = [PC];
+let isBranch = 1;
+let isIndirectBranch = 1;
+let isTerminator = 1;
+let cofMax1 = 1;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let Inst{31-21} = 0b01010011010;
+let Inst{13-10} = 0b0110;
+let Inst{7-0} = 0b00000000;
+let isTaken = Inst{12};
+}
+def J2_jumprtpt : HInst< // jumpr:t through $Rs32 when $Pu4 holds; gated on HasV60T; Inst{12} is 1.
+(outs),
+(ins PredRegs:$Pu4, IntRegs:$Rs32),
+"if ($Pu4) jumpr:t $Rs32",
+J_tc_2early_SLOT2, TypeJ>, Enc_1928953, Requires<[HasV60T]>, PredNewRel {
+let BaseOpcode = "J2_jumpr";
+let InputType = "reg";
+let Defs = [PC];
+let isBranch = 1;
+let isIndirectBranch = 1;
+let isTerminator = 1;
+let cofMax1 = 1;
+let isPredicated = 1;
+let Inst{31-21} = 0b01010011010;
+let Inst{13-10} = 0b0100;
+let Inst{7-0} = 0b00000000;
+let isTaken = Inst{12};
+}
+def J2_jumprz : HInst< // jump:nt to $Ii when $Rs32 != 0 (per asm string). NOTE(review): name suffix is z -- confirm.
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32!=#0) jump:nt $Ii",
+CR_tc_2early_SLOT3, TypeCR>, Enc_12477789 {
+let Defs = [PC];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let Inst{31-22} = 0b0110000100;
+let Inst{12-12} = 0b0;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{12};
+}
+def J2_jumprzpt : HInst< // jump:t to $Ii when $Rs32 != 0 (per asm string). NOTE(review): name suffix is z -- confirm.
+(outs),
+(ins IntRegs:$Rs32, b13_2Imm:$Ii),
+"if ($Rs32!=#0) jump:t $Ii",
+CR_tc_2early_SLOT3, TypeCR>, Enc_12477789 {
+let Defs = [PC];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let Inst{31-22} = 0b0110000100;
+let Inst{12-12} = 0b1;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{12};
+}
+def J2_jumpt : HInst< // jump:nt to $Ii when $Pu4 holds; extendable immediate; Inst{12} is 0.
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if ($Pu4) jump:nt $Ii",
+J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT, TypeJ>, Enc_14868535, PredNewRel {
+let BaseOpcode = "J2_jump";
+let InputType = "imm";
+let Defs = [PC];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-24} = 0b01011100;
+let Inst{21-21} = 0b0;
+let Inst{12-10} = 0b000;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{12};
+}
+def J2_jumpt_nopred_map : HInst< // Pseudo (TypeMAPPING) for the hint-less spelling of the predicated jump; sets no Inst bits.
+(outs),
+(ins PredRegs:$Pu4, b15_2Imm:$Ii),
+"if ($Pu4) jump $Ii",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> {
+let isCodeGenOnly = 1;
+let isPseudo = 1;
+}
+def J2_jumptnew : HInst< // jump:nt to $Ii when $Pu4.new holds; extendable immediate; Inst{12} is 0.
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if ($Pu4.new) jump:nt $Ii",
+J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT, TypeJ>, Enc_14868535, PredNewRel {
+let BaseOpcode = "J2_jump";
+let InputType = "imm";
+let Defs = [PC];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-24} = 0b01011100;
+let Inst{21-21} = 0b0;
+let Inst{12-10} = 0b010;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{12};
+}
+def J2_jumptnewpt : HInst< // jump:t to $Ii when $Pu4.new holds; extendable immediate; Inst{12} is 1.
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if ($Pu4.new) jump:t $Ii",
+J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT, TypeJ>, Enc_14868535, PredNewRel {
+let BaseOpcode = "J2_jump";
+let InputType = "imm";
+let Defs = [PC];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-24} = 0b01011100;
+let Inst{21-21} = 0b0;
+let Inst{12-10} = 0b110;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{12};
+}
+def J2_jumptpt : HInst< // jump:t to $Ii when $Pu4 holds; gated on HasV60T; Inst{12} is 1.
+(outs),
+(ins PredRegs:$Pu4, b30_2Imm:$Ii),
+"if ($Pu4) jump:t $Ii",
+J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT, TypeJ>, Enc_14868535, Requires<[HasV60T]>, PredNewRel {
+let BaseOpcode = "J2_jump";
+let InputType = "imm";
+let Defs = [PC];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let opExtentBits = 17;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-24} = 0b01011100;
+let Inst{21-21} = 0b0;
+let Inst{12-10} = 0b100;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{12};
+}
+def J2_loop0i : HInst< // loop0 with immediate operand #$II; lists LC0, SA0 and USR as Defs.
+(outs),
+(ins b30_2Imm:$Ii, u10_0Imm:$II),
+"loop0($Ii,#$II)",
+CR_tc_3x_SLOT3, TypeCR>, Enc_9939385 {
+let Defs = [LC0, SA0, USR];
+let isExtendable = 1;
+let opExtendable = 0;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-21} = 0b01101001000;
+let Inst{13-13} = 0b0;
+let Inst{2-2} = 0b0;
+}
+def J2_loop0r : HInst< // loop0 with register operand $Rs32; lists LC0, SA0 and USR as Defs.
+(outs),
+(ins b30_2Imm:$Ii, IntRegs:$Rs32),
+"loop0($Ii,$Rs32)",
+CR_tc_3x_SLOT3, TypeCR>, Enc_5790679 {
+let Defs = [LC0, SA0, USR];
+let isExtendable = 1;
+let opExtendable = 0;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-21} = 0b01100000000;
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+let Inst{2-0} = 0b000;
+}
+def J2_loop1i : HInst< // loop1 with immediate operand #$II; lists LC1 and SA1 as Defs.
+(outs),
+(ins b30_2Imm:$Ii, u10_0Imm:$II),
+"loop1($Ii,#$II)",
+CR_tc_3x_SLOT3, TypeCR>, Enc_9939385 {
+let Defs = [LC1, SA1];
+let isExtendable = 1;
+let opExtendable = 0;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-21} = 0b01101001001;
+let Inst{13-13} = 0b0;
+let Inst{2-2} = 0b0;
+}
+def J2_loop1r : HInst< // loop1 with register operand $Rs32; lists LC1 and SA1 as Defs.
+(outs),
+(ins b30_2Imm:$Ii, IntRegs:$Rs32),
+"loop1($Ii,$Rs32)",
+CR_tc_3x_SLOT3, TypeCR>, Enc_5790679 {
+let Defs = [LC1, SA1];
+let isExtendable = 1;
+let opExtendable = 0;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-21} = 0b01100000001;
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+let Inst{2-0} = 0b000;
+}
+def J2_pause : HInst< // pause(#$Ii) with an 8-bit unsigned immediate; flagged isSolo.
+(outs),
+(ins u8_0Imm:$Ii),
+"pause(#$Ii)",
+J_tc_2early_SLOT2, TypeJ>, Enc_8732960 {
+let isSolo = 1;
+let Inst{31-16} = 0b0101010001000000;
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+let Inst{1-0} = 0b00;
+}
+def J2_ploop1si : HInst< // p3 = sp1loop0 with immediate operand #$II; Defs include P3.
+(outs),
+(ins b30_2Imm:$Ii, u10_0Imm:$II),
+"p3 = sp1loop0($Ii,#$II)",
+CR_tc_2early_SLOT3, TypeCR>, Enc_9939385 {
+let Defs = [LC0, P3, SA0, USR];
+let isPredicateLate = 1;
+let isExtendable = 1;
+let opExtendable = 0;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-21} = 0b01101001101;
+let Inst{13-13} = 0b0;
+let Inst{2-2} = 0b0;
+}
+def J2_ploop1sr : HInst< // p3 = sp1loop0 with register operand $Rs32; Defs include P3.
+(outs),
+(ins b30_2Imm:$Ii, IntRegs:$Rs32),
+"p3 = sp1loop0($Ii,$Rs32)",
+CR_tc_2early_SLOT3, TypeCR>, Enc_5790679 {
+let Defs = [LC0, P3, SA0, USR];
+let isPredicateLate = 1;
+let isExtendable = 1;
+let opExtendable = 0;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-21} = 0b01100000101;
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+let Inst{2-0} = 0b000;
+}
+def J2_ploop2si : HInst< // p3 = sp2loop0 with immediate operand #$II; Defs include P3.
+(outs),
+(ins b30_2Imm:$Ii, u10_0Imm:$II),
+"p3 = sp2loop0($Ii,#$II)",
+CR_tc_2early_SLOT3, TypeCR>, Enc_9939385 {
+let Defs = [LC0, P3, SA0, USR];
+let isPredicateLate = 1;
+let isExtendable = 1;
+let opExtendable = 0;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-21} = 0b01101001110;
+let Inst{13-13} = 0b0;
+let Inst{2-2} = 0b0;
+}
+def J2_ploop2sr : HInst< // p3 = sp2loop0 with register operand $Rs32; Defs include P3.
+(outs),
+(ins b30_2Imm:$Ii, IntRegs:$Rs32),
+"p3 = sp2loop0($Ii,$Rs32)",
+CR_tc_2early_SLOT3, TypeCR>, Enc_5790679 {
+let Defs = [LC0, P3, SA0, USR];
+let isPredicateLate = 1;
+let isExtendable = 1;
+let opExtendable = 0;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-21} = 0b01100000110;
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+let Inst{2-0} = 0b000;
+}
+def J2_ploop3si : HInst< // p3 = sp3loop0 with immediate operand #$II; Defs include P3.
+(outs),
+(ins b30_2Imm:$Ii, u10_0Imm:$II),
+"p3 = sp3loop0($Ii,#$II)",
+CR_tc_2early_SLOT3, TypeCR>, Enc_9939385 {
+let Defs = [LC0, P3, SA0, USR];
+let isPredicateLate = 1;
+let isExtendable = 1;
+let opExtendable = 0;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-21} = 0b01101001111;
+let Inst{13-13} = 0b0;
+let Inst{2-2} = 0b0;
+}
+def J2_ploop3sr : HInst< // p3 = sp3loop0 with register operand $Rs32; Defs include P3.
+(outs),
+(ins b30_2Imm:$Ii, IntRegs:$Rs32),
+"p3 = sp3loop0($Ii,$Rs32)",
+CR_tc_2early_SLOT3, TypeCR>, Enc_5790679 {
+let Defs = [LC0, P3, SA0, USR];
+let isPredicateLate = 1;
+let isExtendable = 1;
+let opExtendable = 0;
+let opExtentBits = 9;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-21} = 0b01100000111;
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+let Inst{2-0} = 0b000;
+}
+def J2_trap0 : HInst< // trap0(#$Ii) with an 8-bit unsigned immediate; flagged isSolo.
+(outs),
+(ins u8_0Imm:$Ii),
+"trap0(#$Ii)",
+J_tc_2early_SLOT2, TypeJ>, Enc_8732960 {
+let isSolo = 1;
+let Inst{31-16} = 0b0101010000000000;
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+let Inst{1-0} = 0b00;
+}
+def J4_cmpeq_f_jumpnv_nt : HInst< // New-value jump:nt taken when cmp.eq($Ns8.new,$Rt32) is false; Inst{13} is 0.
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (!cmp.eq($Ns8.new,$Rt32)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let BaseOpcode = "J4_cmpeqr";
+let Defs = [PC];
+let isBranch = 1;
+let isTerminator = 1;
+let cofMax1 = 1;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isNewValue = 1;
+let opNewValue = 0;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0010000001;
+let Inst{19-19} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeq_f_jumpnv_t : HInst< // New-value jump:t taken when cmp.eq($Ns8.new,$Rt32) is false; Inst{13} is 1.
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (!cmp.eq($Ns8.new,$Rt32)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let BaseOpcode = "J4_cmpeqr";
+let Defs = [PC];
+let isBranch = 1;
+let isTerminator = 1;
+let cofMax1 = 1;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isNewValue = 1;
+let opNewValue = 0;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0010000001;
+let Inst{19-19} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeq_fp0_jump_nt : HInst< // Compound: p0 = cmp.eq($Rs16,$Rt16), then jump:nt if !p0.new; Inst{13} is 0.
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,$Rt16); if (!p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let BaseOpcode = "J4_cmpeqp0";
+let Defs = [P0, PC];
+let Uses = [P0];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001010001;
+let Inst{13-12} = 0b00;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeq_fp0_jump_t : HInst< // Compound: p0 = cmp.eq($Rs16,$Rt16), then jump:t if !p0.new; Inst{13} is 1.
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,$Rt16); if (!p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let BaseOpcode = "J4_cmpeqp0";
+let Defs = [P0, PC];
+let Uses = [P0];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001010001;
+let Inst{13-12} = 0b10;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeq_fp1_jump_nt : HInst< // Compound: p1 = cmp.eq($Rs16,$Rt16), then jump:nt if !p1.new; Inst{13} is 0.
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,$Rt16); if (!p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let BaseOpcode = "J4_cmpeqp1";
+let Defs = [P1, PC];
+let Uses = [P1];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001010001;
+let Inst{13-12} = 0b01;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeq_fp1_jump_t : HInst< // Compound: p1 = cmp.eq($Rs16,$Rt16), then jump:t if !p1.new; Inst{13} is 1.
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,$Rt16); if (!p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let BaseOpcode = "J4_cmpeqp1";
+let Defs = [P1, PC];
+let Uses = [P1];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001010001;
+let Inst{13-12} = 0b11;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeq_t_jumpnv_nt : HInst< // New-value jump:nt taken when cmp.eq($Ns8.new,$Rt32) is true; Inst{13} is 0.
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (cmp.eq($Ns8.new,$Rt32)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let BaseOpcode = "J4_cmpeqr";
+let Defs = [PC];
+let isBranch = 1;
+let isTerminator = 1;
+let cofMax1 = 1;
+let isPredicated = 1;
+let isNewValue = 1;
+let opNewValue = 0;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0010000000;
+let Inst{19-19} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeq_t_jumpnv_t : HInst< // New-value jump:t taken when cmp.eq($Ns8.new,$Rt32) is true; Inst{13} is 1.
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (cmp.eq($Ns8.new,$Rt32)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let BaseOpcode = "J4_cmpeqr";
+let Defs = [PC];
+let isBranch = 1;
+let isTerminator = 1;
+let cofMax1 = 1;
+let isPredicated = 1;
+let isNewValue = 1;
+let opNewValue = 0;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0010000000;
+let Inst{19-19} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeq_tp0_jump_nt : HInst< // Compound: p0 = cmp.eq($Rs16,$Rt16), then jump:nt if p0.new; Inst{13} is 0.
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,$Rt16); if (p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let BaseOpcode = "J4_cmpeqp0";
+let Defs = [P0, PC];
+let Uses = [P0];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001010000;
+let Inst{13-12} = 0b00;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeq_tp0_jump_t : HInst< // Compound: p0 = cmp.eq($Rs16,$Rt16), then jump:t if p0.new; Inst{13} is 1.
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,$Rt16); if (p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let BaseOpcode = "J4_cmpeqp0";
+let Defs = [P0, PC];
+let Uses = [P0];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001010000;
+let Inst{13-12} = 0b10;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeq_tp1_jump_nt : HInst< // Compound: p1 = cmp.eq($Rs16,$Rt16), then jump:nt if p1.new; Inst{13} is 0.
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,$Rt16); if (p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let BaseOpcode = "J4_cmpeqp1";
+let Defs = [P1, PC];
+let Uses = [P1];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001010000;
+let Inst{13-12} = 0b01;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeq_tp1_jump_t : HInst< // Compound: p1 = cmp.eq($Rs16,$Rt16), then jump:t if p1.new; Inst{13} is 1.
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,$Rt16); if (p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let BaseOpcode = "J4_cmpeqp1";
+let Defs = [P1, PC];
+let Uses = [P1];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001010000;
+let Inst{13-12} = 0b11;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqi_f_jumpnv_nt : HInst< // New-value jump:nt taken when cmp.eq($Ns8.new,#$II) is false; Inst{13} is 0.
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (!cmp.eq($Ns8.new,#$II)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let BaseOpcode = "J4_cmpeqi";
+let Defs = [PC];
+let isBranch = 1;
+let isTerminator = 1;
+let cofMax1 = 1;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isNewValue = 1;
+let opNewValue = 0;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0010010001;
+let Inst{19-19} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqi_f_jumpnv_t : HInst< // New-value jump:t taken when cmp.eq($Ns8.new,#$II) is false; Inst{13} is 1.
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (!cmp.eq($Ns8.new,#$II)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let BaseOpcode = "J4_cmpeqi";
+let Defs = [PC];
+let isBranch = 1;
+let isTerminator = 1;
+let cofMax1 = 1;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isNewValue = 1;
+let opNewValue = 0;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0010010001;
+let Inst{19-19} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqi_fp0_jump_nt : HInst< // Compound: p0 = cmp.eq($Rs16,#$II), then jump:nt if !p0.new; Inst{13} is 0.
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$II); if (!p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let BaseOpcode = "J4_cmpeqip0";
+let Defs = [P0, PC];
+let Uses = [P0];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001000001;
+let Inst{13-13} = 0b0;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqi_fp0_jump_t : HInst< // Compound: p0 = cmp.eq($Rs16,#$II), then jump:t if !p0.new; Inst{13} is 1.
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$II); if (!p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let BaseOpcode = "J4_cmpeqip0";
+let Defs = [P0, PC];
+let Uses = [P0];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001000001;
+let Inst{13-13} = 0b1;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqi_fp1_jump_nt : HInst< // Compound: p1 = cmp.eq($Rs16,#$II), then jump:nt if !p1.new; Inst{13} is 0.
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$II); if (!p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let BaseOpcode = "J4_cmpeqip1";
+let Defs = [P1, PC];
+let Uses = [P1];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001001001;
+let Inst{13-13} = 0b0;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqi_fp1_jump_t : HInst< // Compound: p1 = cmp.eq($Rs16,#$II), then jump:t if !p1.new; Inst{13} is 1.
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$II); if (!p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let BaseOpcode = "J4_cmpeqip1";
+let Defs = [P1, PC];
+let Uses = [P1];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001001001;
+let Inst{13-13} = 0b1;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqi_t_jumpnv_nt : HInst< // New-value jump:nt taken when cmp.eq($Ns8.new,#$II) is true; Inst{13} is 0.
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (cmp.eq($Ns8.new,#$II)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let BaseOpcode = "J4_cmpeqi";
+let Defs = [PC];
+let isBranch = 1;
+let isTerminator = 1;
+let cofMax1 = 1;
+let isPredicated = 1;
+let isNewValue = 1;
+let opNewValue = 0;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0010010000;
+let Inst{19-19} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqi_t_jumpnv_t : HInst< // New-value jump:t taken when cmp.eq($Ns8.new,#$II) is true; Inst{13} is 1.
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (cmp.eq($Ns8.new,#$II)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let BaseOpcode = "J4_cmpeqi";
+let Defs = [PC];
+let isBranch = 1;
+let isTerminator = 1;
+let cofMax1 = 1;
+let isPredicated = 1;
+let isNewValue = 1;
+let opNewValue = 0;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0010010000;
+let Inst{19-19} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqi_tp0_jump_nt : HInst< // Compound: p0 = cmp.eq($Rs16,#$II), then jump:nt if p0.new; Inst{13} is 0.
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$II); if (p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let BaseOpcode = "J4_cmpeqip0";
+let Defs = [P0, PC];
+let Uses = [P0];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001000000;
+let Inst{13-13} = 0b0;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqi_tp0_jump_t : HInst< // Compound: p0 = cmp.eq($Rs16,#$II), then jump:t if p0.new; Inst{13} is 1.
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$II); if (p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let BaseOpcode = "J4_cmpeqip0";
+let Defs = [P0, PC];
+let Uses = [P0];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001000000;
+let Inst{13-13} = 0b1;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqi_tp1_jump_nt : HInst< // Compound: p1 = cmp.eq($Rs16,#$II), then jump:nt if p1.new; Inst{13} is 0.
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$II); if (p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let BaseOpcode = "J4_cmpeqip1";
+let Defs = [P1, PC];
+let Uses = [P1];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001001000;
+let Inst{13-13} = 0b0;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqi_tp1_jump_t : HInst< // Compound: p1 = cmp.eq($Rs16,#$II), then jump:t if p1.new; Inst{13} is 1.
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$II); if (p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let BaseOpcode = "J4_cmpeqip1";
+let Defs = [P1, PC];
+let Uses = [P1];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001001000;
+let Inst{13-13} = 0b1;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqn1_f_jumpnv_nt : HInst< // New-value jump:nt taken when cmp.eq($Ns8.new,#$n1) is false; Inst{13} is 0.
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (!cmp.eq($Ns8.new,#$n1)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4359901, PredRel {
+let BaseOpcode = "J4_cmpeqn1r";
+let Defs = [PC];
+let isBranch = 1;
+let isTerminator = 1;
+let cofMax1 = 1;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isNewValue = 1;
+let opNewValue = 0;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0010011001;
+let Inst{19-19} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqn1_f_jumpnv_t : HInst< // New-value jump:t taken when cmp.eq($Ns8.new,#$n1) is false; Inst{13} is 1.
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (!cmp.eq($Ns8.new,#$n1)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_8612939, PredRel {
+let BaseOpcode = "J4_cmpeqn1r";
+let Defs = [PC];
+let isBranch = 1;
+let isTerminator = 1;
+let cofMax1 = 1;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isNewValue = 1;
+let opNewValue = 0;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0010011001;
+let Inst{19-19} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqn1_fp0_jump_nt : HInst< // Compound: p0 = cmp.eq($Rs16,#$n1), then jump:nt if !p0.new; Inst{13} is 0.
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$n1); if (!p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_844699, PredRel {
+let BaseOpcode = "J4_cmpeqn1p0";
+let Defs = [P0, PC];
+let Uses = [P0];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001000111;
+let Inst{13-8} = 0b000000;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqn1_fp0_jump_t : HInst< // Compound: p0 = cmp.eq($Rs16,#$n1), then jump:t if !p0.new; Inst{13} is 1.
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$n1); if (!p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_5338033, PredRel {
+let BaseOpcode = "J4_cmpeqn1p0";
+let Defs = [P0, PC];
+let Uses = [P0];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001000111;
+let Inst{13-8} = 0b100000;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqn1_fp1_jump_nt : HInst< // Compound: p1 = cmp.eq($Rs16,#$n1), then jump:nt if !p1.new; Inst{13} is 0.
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$n1); if (!p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14150875, PredRel {
+let BaseOpcode = "J4_cmpeqn1p1";
+let Defs = [P1, PC];
+let Uses = [P1];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001001111;
+let Inst{13-8} = 0b000000;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqn1_fp1_jump_t : HInst< // Compound: p1 = cmp.eq($Rs16,#$n1), then jump:t if !p1.new; Inst{13} is 1.
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$n1); if (!p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_15450971, PredRel {
+let BaseOpcode = "J4_cmpeqn1p1";
+let Defs = [P1, PC];
+let Uses = [P1];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001001111;
+let Inst{13-8} = 0b100000;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqn1_t_jumpnv_nt : HInst< // New-value jump:nt taken when cmp.eq($Ns8.new,#$n1) is true; Inst{13} is 0.
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (cmp.eq($Ns8.new,#$n1)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_14998517, PredRel {
+let BaseOpcode = "J4_cmpeqn1r";
+let Defs = [PC];
+let isBranch = 1;
+let isTerminator = 1;
+let cofMax1 = 1;
+let isPredicated = 1;
+let isNewValue = 1;
+let opNewValue = 0;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0010011000;
+let Inst{19-19} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqn1_t_jumpnv_t : HInst< // New-value jump:t taken when cmp.eq($Ns8.new,#$n1) is true; Inst{13} is 1.
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (cmp.eq($Ns8.new,#$n1)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_11544269, PredRel {
+let BaseOpcode = "J4_cmpeqn1r";
+let Defs = [PC];
+let isBranch = 1;
+let isTerminator = 1;
+let cofMax1 = 1;
+let isPredicated = 1;
+let isNewValue = 1;
+let opNewValue = 0;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0010011000;
+let Inst{19-19} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqn1_tp0_jump_nt : HInst< // Compound: p0 = cmp.eq($Rs16,#$n1), then jump:nt if p0.new; Inst{13} is 0.
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$n1); if (p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_5401217, PredRel {
+let BaseOpcode = "J4_cmpeqn1p0";
+let Defs = [P0, PC];
+let Uses = [P0];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001000110;
+let Inst{13-8} = 0b000000;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqn1_tp0_jump_t : HInst< // Compound: p0 = cmp.eq($Rs16,#$n1), then jump:t if p0.new; Inst{13} is 1.
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.eq($Rs16,#$n1); if (p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_12419313, PredRel {
+let BaseOpcode = "J4_cmpeqn1p0";
+let Defs = [P0, PC];
+let Uses = [P0];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001000110;
+let Inst{13-8} = 0b100000;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqn1_tp1_jump_nt : HInst< // Compound: p1 = cmp.eq($Rs16,#$n1), then jump:nt if p1.new; Inst{13} is 0.
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$n1); if (p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_4684887, PredRel {
+let BaseOpcode = "J4_cmpeqn1p1";
+let Defs = [P1, PC];
+let Uses = [P1];
+let isBranch = 1;
+let isTerminator = 1;
+let isPredicated = 1;
+let isPredicatedNew = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let isExtentSigned = 1;
+let Inst{31-22} = 0b0001001110;
+let Inst{13-8} = 0b000000;
+let Inst{0-0} = 0b0;
+let isTaken = Inst{13};
+}
+def J4_cmpeqn1_tp1_jump_t : HInst< // compound form: p1 = cmp.eq($Rs16,#$n1), then jump:t if p1.new is true
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.eq($Rs16,#$n1); if (p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_220949, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000; // bit 13 = 1 (jump:t)
+let Inst{31-22} = 0b0001001110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p1.new
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 and PC
+let BaseOpcode = "J4_cmpeqn1p1";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_f_jumpnv_nt : HInst< // new-value compare-and-jump: jump:nt when cmp.gt($Ns8.new,$Rt32) is false
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (!cmp.gt($Ns8.new,$Rt32)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0; // bit 13 = 0 (jump:nt)
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000011;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!cmp.gt(...))
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtr";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgt_f_jumpnv_t : HInst< // new-value compare-and-jump: jump:t when cmp.gt($Ns8.new,$Rt32) is false
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (!cmp.gt($Ns8.new,$Rt32)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1; // bit 13 = 1 (jump:t)
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000011;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!cmp.gt(...))
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtr";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgt_fp0_jump_nt : HInst< // compound form: p0 = cmp.gt($Rs16,$Rt16), then jump:nt if p0.new is false
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,$Rt16); if (!p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b00; // bit 13 = 0 (jump:nt), bit 12 = 0 (p0 variant)
+let Inst{31-22} = 0b0001010011;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!p0.new)
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p0.new
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 and PC
+let BaseOpcode = "J4_cmpgtp0";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_fp0_jump_t : HInst< // compound form: p0 = cmp.gt($Rs16,$Rt16), then jump:t if p0.new is false
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,$Rt16); if (!p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b10; // bit 13 = 1 (jump:t), bit 12 = 0 (p0 variant)
+let Inst{31-22} = 0b0001010011;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!p0.new)
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p0.new
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 and PC
+let BaseOpcode = "J4_cmpgtp0";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_fp1_jump_nt : HInst< // compound form: p1 = cmp.gt($Rs16,$Rt16), then jump:nt if p1.new is false
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,$Rt16); if (!p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b01; // bit 13 = 0 (jump:nt), bit 12 = 1 (p1 variant)
+let Inst{31-22} = 0b0001010011;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!p1.new)
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p1.new
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 and PC
+let BaseOpcode = "J4_cmpgtp1";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_fp1_jump_t : HInst< // compound form: p1 = cmp.gt($Rs16,$Rt16), then jump:t if p1.new is false
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,$Rt16); if (!p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b11; // bit 13 = 1 (jump:t), bit 12 = 1 (p1 variant)
+let Inst{31-22} = 0b0001010011;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!p1.new)
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p1.new
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 and PC
+let BaseOpcode = "J4_cmpgtp1";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_t_jumpnv_nt : HInst< // new-value compare-and-jump: jump:nt when cmp.gt($Ns8.new,$Rt32) is true
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (cmp.gt($Ns8.new,$Rt32)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0; // bit 13 = 0 (jump:nt)
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtr";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgt_t_jumpnv_t : HInst< // new-value compare-and-jump: jump:t when cmp.gt($Ns8.new,$Rt32) is true
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (cmp.gt($Ns8.new,$Rt32)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1; // bit 13 = 1 (jump:t)
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtr";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgt_tp0_jump_nt : HInst< // compound form: p0 = cmp.gt($Rs16,$Rt16), then jump:nt if p0.new is true
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,$Rt16); if (p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b00; // bit 13 = 0 (jump:nt), bit 12 = 0 (p0 variant)
+let Inst{31-22} = 0b0001010010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p0.new
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 and PC
+let BaseOpcode = "J4_cmpgtp0";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_tp0_jump_t : HInst< // compound form: p0 = cmp.gt($Rs16,$Rt16), then jump:t if p0.new is true
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,$Rt16); if (p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b10; // bit 13 = 1 (jump:t), bit 12 = 0 (p0 variant)
+let Inst{31-22} = 0b0001010010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p0.new
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 and PC
+let BaseOpcode = "J4_cmpgtp0";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_tp1_jump_nt : HInst< // compound form: p1 = cmp.gt($Rs16,$Rt16), then jump:nt if p1.new is true
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,$Rt16); if (p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b01; // bit 13 = 0 (jump:nt), bit 12 = 1 (p1 variant)
+let Inst{31-22} = 0b0001010010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p1.new
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 and PC
+let BaseOpcode = "J4_cmpgtp1";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgt_tp1_jump_t : HInst< // compound form: p1 = cmp.gt($Rs16,$Rt16), then jump:t if p1.new is true
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,$Rt16); if (p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b11; // bit 13 = 1 (jump:t), bit 12 = 1 (p1 variant)
+let Inst{31-22} = 0b0001010010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p1.new
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 and PC
+let BaseOpcode = "J4_cmpgtp1";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_f_jumpnv_nt : HInst< // new-value compare-and-jump: jump:nt when cmp.gt($Ns8.new,#$II) is false
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (!cmp.gt($Ns8.new,#$II)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0; // bit 13 = 0 (jump:nt)
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010011;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!cmp.gt(...))
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtir";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgti_f_jumpnv_t : HInst< // new-value compare-and-jump: jump:t when cmp.gt($Ns8.new,#$II) is false
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (!cmp.gt($Ns8.new,#$II)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1; // bit 13 = 1 (jump:t)
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010011;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!cmp.gt(...))
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtir";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgti_fp0_jump_nt : HInst< // compound form: p0 = cmp.gt($Rs16,#$II), then jump:nt if p0.new is false
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$II); if (!p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0; // bit 13 = 0 (jump:nt)
+let Inst{31-22} = 0b0001000011;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!p0.new)
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p0.new
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 and PC
+let BaseOpcode = "J4_cmpgtip0";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_fp0_jump_t : HInst< // compound form: p0 = cmp.gt($Rs16,#$II), then jump:t if p0.new is false
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$II); if (!p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1; // bit 13 = 1 (jump:t)
+let Inst{31-22} = 0b0001000011;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!p0.new)
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p0.new
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 and PC
+let BaseOpcode = "J4_cmpgtip0";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_fp1_jump_nt : HInst< // compound form: p1 = cmp.gt($Rs16,#$II), then jump:nt if p1.new is false
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$II); if (!p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0; // bit 13 = 0 (jump:nt)
+let Inst{31-22} = 0b0001001011;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!p1.new)
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p1.new
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 and PC
+let BaseOpcode = "J4_cmpgtip1";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_fp1_jump_t : HInst< // compound form: p1 = cmp.gt($Rs16,#$II), then jump:t if p1.new is false
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$II); if (!p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1; // bit 13 = 1 (jump:t)
+let Inst{31-22} = 0b0001001011;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!p1.new)
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p1.new
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 and PC
+let BaseOpcode = "J4_cmpgtip1";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_t_jumpnv_nt : HInst< // new-value compare-and-jump: jump:nt when cmp.gt($Ns8.new,#$II) is true
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (cmp.gt($Ns8.new,#$II)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0; // bit 13 = 0 (jump:nt)
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtir";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgti_t_jumpnv_t : HInst< // new-value compare-and-jump: jump:t when cmp.gt($Ns8.new,#$II) is true
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (cmp.gt($Ns8.new,#$II)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1; // bit 13 = 1 (jump:t)
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtir";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgti_tp0_jump_nt : HInst< // compound form: p0 = cmp.gt($Rs16,#$II), then jump:nt if p0.new is true
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$II); if (p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0; // bit 13 = 0 (jump:nt)
+let Inst{31-22} = 0b0001000010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p0.new
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 and PC
+let BaseOpcode = "J4_cmpgtip0";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_tp0_jump_t : HInst< // compound form: p0 = cmp.gt($Rs16,#$II), then jump:t if p0.new is true
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$II); if (p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1; // bit 13 = 1 (jump:t)
+let Inst{31-22} = 0b0001000010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p0.new
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 and PC
+let BaseOpcode = "J4_cmpgtip0";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_tp1_jump_nt : HInst< // compound form: p1 = cmp.gt($Rs16,#$II), then jump:nt if p1.new is true
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$II); if (p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0; // bit 13 = 0 (jump:nt)
+let Inst{31-22} = 0b0001001010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p1.new
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 and PC
+let BaseOpcode = "J4_cmpgtip1";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgti_tp1_jump_t : HInst< // compound form: p1 = cmp.gt($Rs16,#$II), then jump:t if p1.new is true
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$II); if (p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1; // bit 13 = 1 (jump:t)
+let Inst{31-22} = 0b0001001010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p1.new
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 and PC
+let BaseOpcode = "J4_cmpgtip1";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_f_jumpnv_nt : HInst< // new-value compare-and-jump: jump:nt when cmp.gt($Ns8.new,#$n1) is false
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (!cmp.gt($Ns8.new,#$n1)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_8674673, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000; // bit 13 = 0 (jump:nt)
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!cmp.gt(...))
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtn1r";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgtn1_f_jumpnv_t : HInst< // new-value compare-and-jump: jump:t when cmp.gt($Ns8.new,#$n1) is false
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (!cmp.gt($Ns8.new,#$n1)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15763937, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000; // bit 13 = 1 (jump:t)
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!cmp.gt(...))
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtn1r";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgtn1_fp0_jump_nt : HInst< // compound form: p0 = cmp.gt($Rs16,#$n1), then jump:nt if p0.new is false
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$n1); if (!p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_5915771, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000001; // bit 13 = 0 (jump:nt)
+let Inst{31-22} = 0b0001000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!p0.new)
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p0.new
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 and PC
+let BaseOpcode = "J4_cmpgtn1p0";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_fp0_jump_t : HInst< // compound form: p0 = cmp.gt($Rs16,#$n1), then jump:t if p0.new is false
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$n1); if (!p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7315939, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100001; // bit 13 = 1 (jump:t)
+let Inst{31-22} = 0b0001000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!p0.new)
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p0.new
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 and PC
+let BaseOpcode = "J4_cmpgtn1p0";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_fp1_jump_nt : HInst< // compound form: p1 = cmp.gt($Rs16,#$n1), then jump:nt if p1.new is false
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$n1); if (!p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7785569, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000001; // bit 13 = 0 (jump:nt)
+let Inst{31-22} = 0b0001001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!p1.new)
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p1.new
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 and PC
+let BaseOpcode = "J4_cmpgtn1p1";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_fp1_jump_t : HInst< // compound form: p1 = cmp.gt($Rs16,#$n1), then jump:t if p1.new is false
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$n1); if (!p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_10968391, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100001; // bit 13 = 1 (jump:t)
+let Inst{31-22} = 0b0001001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!p1.new)
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p1.new
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 and PC
+let BaseOpcode = "J4_cmpgtn1p1";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_t_jumpnv_nt : HInst< // new-value compare-and-jump: jump:nt when cmp.gt($Ns8.new,#$n1) is true
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (cmp.gt($Ns8.new,#$n1)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_364753, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000; // bit 13 = 0 (jump:nt)
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtn1r";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgtn1_t_jumpnv_t : HInst< // new-value compare-and-jump: jump:t when cmp.gt($Ns8.new,#$n1) is true
+(outs),
+(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
+"if (cmp.gt($Ns8.new,#$n1)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_8479583, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000; // bit 13 = 1 (jump:t)
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010011010;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtn1r";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgtn1_tp0_jump_nt : HInst< // compound form: p0 = cmp.gt($Rs16,#$n1), then jump:nt if p0.new is true
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$n1); if (p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_2428539, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000001; // bit 13 = 0 (jump:nt)
+let Inst{31-22} = 0b0001000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p0.new
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 and PC
+let BaseOpcode = "J4_cmpgtn1p0";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_tp0_jump_t : HInst< // compound form: p0 = cmp.gt($Rs16,#$n1), then jump:t if p0.new is true
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p0 = cmp.gt($Rs16,#$n1); if (p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_8919369, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100001; // bit 13 = 1 (jump:t)
+let Inst{31-22} = 0b0001000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p0.new
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 and PC
+let BaseOpcode = "J4_cmpgtn1p0";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_tp1_jump_nt : HInst< // compound form: p1 = cmp.gt($Rs16,#$n1), then jump:nt if p1.new is true
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$n1); if (p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_8577055, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000001; // bit 13 = 0 (jump:nt)
+let Inst{31-22} = 0b0001001110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p1.new
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 and PC
+let BaseOpcode = "J4_cmpgtn1p1";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtn1_tp1_jump_t : HInst< // compound form: p1 = cmp.gt($Rs16,#$n1), then jump:t if p1.new is true
+(outs),
+(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
+"p1 = cmp.gt($Rs16,#$n1); if (p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14530015, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100001; // bit 13 = 1 (jump:t)
+let Inst{31-22} = 0b0001001110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p1.new
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 and PC
+let BaseOpcode = "J4_cmpgtn1p1";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_f_jumpnv_nt : HInst< // new-value compare-and-jump: jump:nt when cmp.gtu($Ns8.new,$Rt32) is false
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (!cmp.gtu($Ns8.new,$Rt32)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0; // bit 13 = 0 (jump:nt)
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!cmp.gtu(...))
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtur";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgtu_f_jumpnv_t : HInst< // new-value compare-and-jump: jump:t when cmp.gtu($Ns8.new,$Rt32) is false
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (!cmp.gtu($Ns8.new,$Rt32)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1; // bit 13 = 1 (jump:t)
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!cmp.gtu(...))
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtur";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgtu_fp0_jump_nt : HInst< // compound form: p0 = cmp.gtu($Rs16,$Rt16), then jump:nt if p0.new is false
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,$Rt16); if (!p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b00; // bit 13 = 0 (jump:nt), bit 12 = 0 (p0 variant)
+let Inst{31-22} = 0b0001010101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!p0.new)
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p0.new
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 and PC
+let BaseOpcode = "J4_cmpgtup0";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_fp0_jump_t : HInst< // compound form: p0 = cmp.gtu($Rs16,$Rt16), then jump:t if p0.new is false
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,$Rt16); if (!p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b10; // bit 13 = 1 (jump:t), bit 12 = 0 (p0 variant)
+let Inst{31-22} = 0b0001010101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!p0.new)
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p0.new
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 and PC
+let BaseOpcode = "J4_cmpgtup0";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_fp1_jump_nt : HInst< // compound form: p1 = cmp.gtu($Rs16,$Rt16), then jump:nt if p1.new is false
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,$Rt16); if (!p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b01; // bit 13 = 0 (jump:nt), bit 12 = 1 (p1 variant)
+let Inst{31-22} = 0b0001010101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!p1.new)
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p1.new
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 and PC
+let BaseOpcode = "J4_cmpgtup1";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_fp1_jump_t : HInst< // compound form: p1 = cmp.gtu($Rs16,$Rt16), then jump:t if p1.new is false
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,$Rt16); if (!p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b11; // bit 13 = 1 (jump:t), bit 12 = 1 (p1 variant)
+let Inst{31-22} = 0b0001010101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!p1.new)
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p1.new
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 and PC
+let BaseOpcode = "J4_cmpgtup1";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_t_jumpnv_nt : HInst< // new-value compare-and-jump: jump:nt when cmp.gtu($Ns8.new,$Rt32) is true
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (cmp.gtu($Ns8.new,$Rt32)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0; // bit 13 = 0 (jump:nt)
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtur";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgtu_t_jumpnv_t : HInst< // new-value compare-and-jump: jump:t when cmp.gtu($Ns8.new,$Rt32) is true
+(outs),
+(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
+"if (cmp.gtu($Ns8.new,$Rt32)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_15140689, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1; // bit 13 = 1 (jump:t)
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtur";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgtu_tp0_jump_nt : HInst< // compound form: p0 = cmp.gtu($Rs16,$Rt16), then jump:nt if p0.new is true
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,$Rt16); if (p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b00; // bit 13 = 0 (jump:nt), bit 12 = 0 (p0 variant)
+let Inst{31-22} = 0b0001010100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p0.new
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 and PC
+let BaseOpcode = "J4_cmpgtup0";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_tp0_jump_t : HInst< // compound form: p0 = cmp.gtu($Rs16,$Rt16), then jump:t if p0.new is true
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,$Rt16); if (p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b10; // bit 13 = 1 (jump:t), bit 12 = 0 (p0 variant)
+let Inst{31-22} = 0b0001010100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p0.new
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 and PC
+let BaseOpcode = "J4_cmpgtup0";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_tp1_jump_nt : HInst< // compound form: p1 = cmp.gtu($Rs16,$Rt16), then jump:nt if p1.new is true
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,$Rt16); if (p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b01; // bit 13 = 0 (jump:nt), bit 12 = 1 (p1 variant)
+let Inst{31-22} = 0b0001010100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p1.new
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 and PC
+let BaseOpcode = "J4_cmpgtup1";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtu_tp1_jump_t : HInst< // compound form: p1 = cmp.gtu($Rs16,$Rt16), then jump:t if p1.new is true
+(outs),
+(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,$Rt16); if (p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_14264243, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b11; // bit 13 = 1 (jump:t), bit 12 = 1 (p1 variant)
+let Inst{31-22} = 0b0001010100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1; // the jump tests p1.new
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 and PC
+let BaseOpcode = "J4_cmpgtup1";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_f_jumpnv_nt : HInst< // new-value compare-and-jump: jump:nt when cmp.gtu($Ns8.new,#$II) is false
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (!cmp.gtu($Ns8.new,#$II)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0; // bit 13 = 0 (jump:nt)
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!cmp.gtu(...))
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtuir";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgtui_f_jumpnv_t : HInst< // new-value compare-and-jump: jump:t when cmp.gtu($Ns8.new,#$II) is false
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (!cmp.gtu($Ns8.new,#$II)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1; // bit 13 = 1 (jump:t)
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm condition is negated: if (!cmp.gtu(...))
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtuir";
+let isTaken = Inst{13}; // taken flag is read from encoding bit 13
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs is empty)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgtui_fp0_jump_nt : HInst< // Compound compare-and-jump: p0 = cmp.gtu(Rs16,#II), then jump:nt when p0.new is false.
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,#$II); if (!p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm string tests !p0.new
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 (compare result in the asm string) and PC (the jump)
+let BaseOpcode = "J4_cmpgtuip0";
+let isTaken = Inst{13}; // bit 13 is 0 in this record, which is the :nt variant
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_fp0_jump_t : HInst< // Compound compare-and-jump: p0 = cmp.gtu(Rs16,#II), then jump:t when p0.new is false.
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,#$II); if (!p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm string tests !p0.new
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 (compare result in the asm string) and PC (the jump)
+let BaseOpcode = "J4_cmpgtuip0";
+let isTaken = Inst{13}; // bit 13 is 1 in this record, which is the :t variant
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_fp1_jump_nt : HInst< // Compound compare-and-jump: p1 = cmp.gtu(Rs16,#II), then jump:nt when p1.new is false.
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,#$II); if (!p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm string tests !p1.new
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 (compare result in the asm string) and PC (the jump)
+let BaseOpcode = "J4_cmpgtuip1";
+let isTaken = Inst{13}; // bit 13 is 0 in this record, which is the :nt variant
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_fp1_jump_t : HInst< // Compound compare-and-jump: p1 = cmp.gtu(Rs16,#II), then jump:t when p1.new is false.
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,#$II); if (!p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm string tests !p1.new
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 (compare result in the asm string) and PC (the jump)
+let BaseOpcode = "J4_cmpgtuip1";
+let isTaken = Inst{13}; // bit 13 is 1 in this record, which is the :t variant
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_t_jumpnv_nt : HInst< // New-value compare-and-jump: jump:nt when cmp.gtu(Ns8.new,#II) is true.
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (cmp.gtu($Ns8.new,#$II)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtuir";
+let isTaken = Inst{13}; // bit 13 is 0 in this record, which is the :nt variant
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgtui_t_jumpnv_t : HInst< // New-value compare-and-jump: jump:t when cmp.gtu(Ns8.new,#II) is true.
+(outs),
+(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
+"if (cmp.gtu($Ns8.new,#$II)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_4397470, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpgtuir";
+let isTaken = Inst{13}; // bit 13 is 1 in this record, which is the :t variant
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpgtui_tp0_jump_nt : HInst< // Compound compare-and-jump: p0 = cmp.gtu(Rs16,#II), then jump:nt when p0.new is true.
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,#$II); if (p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 (compare result in the asm string) and PC (the jump)
+let BaseOpcode = "J4_cmpgtuip0";
+let isTaken = Inst{13}; // bit 13 is 0 in this record, which is the :nt variant
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_tp0_jump_t : HInst< // Compound compare-and-jump: p0 = cmp.gtu(Rs16,#II), then jump:t when p0.new is true.
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p0 = cmp.gtu($Rs16,#$II); if (p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001000100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 (compare result in the asm string) and PC (the jump)
+let BaseOpcode = "J4_cmpgtuip0";
+let isTaken = Inst{13}; // bit 13 is 1 in this record, which is the :t variant
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_tp1_jump_nt : HInst< // Compound compare-and-jump: p1 = cmp.gtu(Rs16,#II), then jump:nt when p1.new is true.
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,#$II); if (p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-22} = 0b0001001100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 (compare result in the asm string) and PC (the jump)
+let BaseOpcode = "J4_cmpgtuip1";
+let isTaken = Inst{13}; // bit 13 is 0 in this record, which is the :nt variant
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmpgtui_tp1_jump_t : HInst< // Compound compare-and-jump: p1 = cmp.gtu(Rs16,#II), then jump:t when p1.new is true.
+(outs),
+(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
+"p1 = cmp.gtu($Rs16,#$II); if (p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_7305764, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-22} = 0b0001001100;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 (compare result in the asm string) and PC (the jump)
+let BaseOpcode = "J4_cmpgtuip1";
+let isTaken = Inst{13}; // bit 13 is 1 in this record, which is the :t variant
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_cmplt_f_jumpnv_nt : HInst< // New-value compare-and-jump: jump:nt when cmp.gt(Rt32,Ns8.new) is false; the new-value register is the second compare operand.
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (!cmp.gt($Rt32,$Ns8.new)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_6730375, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm string negates the condition
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltr";
+let isTaken = Inst{13}; // bit 13 is 0 in this record, which is the :nt variant
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1; // operand 1 is $Ns8, printed as $Ns8.new
+}
+def J4_cmplt_f_jumpnv_t : HInst< // New-value compare-and-jump: jump:t when cmp.gt(Rt32,Ns8.new) is false; the new-value register is the second compare operand.
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (!cmp.gt($Rt32,$Ns8.new)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_6730375, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm string negates the condition
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltr";
+let isTaken = Inst{13}; // bit 13 is 1 in this record, which is the :t variant
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1; // operand 1 is $Ns8, printed as $Ns8.new
+}
+def J4_cmplt_t_jumpnv_nt : HInst< // New-value compare-and-jump: jump:nt when cmp.gt(Rt32,Ns8.new) is true; the new-value register is the second compare operand.
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (cmp.gt($Rt32,$Ns8.new)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_6730375, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltr";
+let isTaken = Inst{13}; // bit 13 is 0 in this record, which is the :nt variant
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1; // operand 1 is $Ns8, printed as $Ns8.new
+}
+def J4_cmplt_t_jumpnv_t : HInst< // New-value compare-and-jump: jump:t when cmp.gt(Rt32,Ns8.new) is true; the new-value register is the second compare operand.
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (cmp.gt($Rt32,$Ns8.new)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_6730375, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltr";
+let isTaken = Inst{13}; // bit 13 is 1 in this record, which is the :t variant
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1; // operand 1 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpltu_f_jumpnv_nt : HInst< // New-value compare-and-jump: jump:nt when cmp.gtu(Rt32,Ns8.new) is false; the new-value register is the second compare operand.
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (!cmp.gtu($Rt32,$Ns8.new)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_6730375, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm string negates the condition
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltur";
+let isTaken = Inst{13}; // bit 13 is 0 in this record, which is the :nt variant
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1; // operand 1 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpltu_f_jumpnv_t : HInst< // New-value compare-and-jump: jump:t when cmp.gtu(Rt32,Ns8.new) is false; the new-value register is the second compare operand.
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (!cmp.gtu($Rt32,$Ns8.new)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_6730375, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm string negates the condition
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltur";
+let isTaken = Inst{13}; // bit 13 is 1 in this record, which is the :t variant
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1; // operand 1 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpltu_t_jumpnv_nt : HInst< // New-value compare-and-jump: jump:nt when cmp.gtu(Rt32,Ns8.new) is true; the new-value register is the second compare operand.
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (cmp.gtu($Rt32,$Ns8.new)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_6730375, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010001000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltur";
+let isTaken = Inst{13}; // bit 13 is 0 in this record, which is the :nt variant
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1; // operand 1 is $Ns8, printed as $Ns8.new
+}
+def J4_cmpltu_t_jumpnv_t : HInst< // New-value compare-and-jump: jump:t when cmp.gtu(Rt32,Ns8.new) is true; the new-value register is the second compare operand.
+(outs),
+(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (cmp.gtu($Rt32,$Ns8.new)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_6730375, PredRel {
+let Inst{0-0} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010001000;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let BaseOpcode = "J4_cmpltur";
+let isTaken = Inst{13}; // bit 13 is 1 in this record, which is the :t variant
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 1; // operand 1 is $Ns8, printed as $Ns8.new
+}
+def J4_hintjumpr : HInst< // hintjr(Rs32): single register operand, flagged as an indirect branch and terminator.
+(outs),
+(ins IntRegs:$Rs32),
+"hintjr($Rs32)",
+J_tc_2early_SLOT2, TypeJ>, Enc_11704059 {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-21} = 0b01010010101;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let isBranch = 1;
+let cofMax1 = 1; // NOTE(review): unlike the other J4 branches in this file, no Defs = [PC] is listed here - confirm intended
+}
+def J4_jumpseti : HInst< // Compound: Rd16 = #II, then an unpredicated jump to $Ii.
+(outs GeneralSubRegs:$Rd16),
+(ins u6_0Imm:$II, b30_2Imm:$Ii),
+"$Rd16 = #$II ; jump $Ii",
+COMPOUND, TypeCJ>, Enc_4834775 {
+let Inst{0-0} = 0b0;
+let Inst{31-22} = 0b0001011000;
+let hasNewValue = 1;
+let opNewValue = 0; // operand 0 is the output $Rd16
+let isTerminator = 1;
+let isBranch = 1;
+let Defs = [PC];
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs are counted first: $Rd16, $II, $Ii)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_jumpsetr : HInst< // Compound: Rd16 = Rs16, then an unpredicated jump to $Ii.
+(outs GeneralSubRegs:$Rd16),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"$Rd16 = $Rs16 ; jump $Ii",
+COMPOUND, TypeCJ>, Enc_2639299 {
+let Inst{0-0} = 0b0;
+let Inst{13-12} = 0b00;
+let Inst{31-22} = 0b0001011100;
+let hasNewValue = 1;
+let opNewValue = 0; // operand 0 is the output $Rd16
+let isTerminator = 1;
+let isBranch = 1;
+let Defs = [PC];
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs are counted first: $Rd16, $Rs16, $Ii)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_f_jumpnv_nt : HInst< // New-value test-and-jump: jump:nt when tstbit(Ns8.new,#0) is false.
+(outs),
+(ins IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (!tstbit($Ns8.new,#0)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_1898420 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010111;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm string negates the condition
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let isTaken = Inst{13}; // bit 13 (top bit of Inst{13-8}) is 0 here, which is the :nt variant
+let isExtendable = 1;
+let opExtendable = 1; // operand 1 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_tstbit0_f_jumpnv_t : HInst< // New-value test-and-jump: jump:t when tstbit(Ns8.new,#0) is false.
+(outs),
+(ins IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (!tstbit($Ns8.new,#0)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_1898420 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010111;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm string negates the condition
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let isTaken = Inst{13}; // bit 13 (top bit of Inst{13-8}) is 1 here, which is the :t variant
+let isExtendable = 1;
+let opExtendable = 1; // operand 1 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_tstbit0_fp0_jump_nt : HInst< // Compound test-and-jump: p0 = tstbit(Rs16,#0), then jump:nt when p0.new is false.
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p0 = tstbit($Rs16,#0); if (!p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_12829314 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000011;
+let Inst{31-22} = 0b0001000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm string tests !p0.new
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 (tstbit result in the asm string) and PC (the jump)
+let isTaken = Inst{13}; // bit 13 (top bit of Inst{13-8}) is 0 here, which is the :nt variant
+let isExtendable = 1;
+let opExtendable = 1; // operand 1 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_fp0_jump_t : HInst< // Compound test-and-jump: p0 = tstbit(Rs16,#0), then jump:t when p0.new is false.
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p0 = tstbit($Rs16,#0); if (!p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_12829314 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100011;
+let Inst{31-22} = 0b0001000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm string tests !p0.new
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 (tstbit result in the asm string) and PC (the jump)
+let isTaken = Inst{13}; // bit 13 (top bit of Inst{13-8}) is 1 here, which is the :t variant
+let isExtendable = 1;
+let opExtendable = 1; // operand 1 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_fp1_jump_nt : HInst< // Compound test-and-jump: p1 = tstbit(Rs16,#0), then jump:nt when p1.new is false.
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p1 = tstbit($Rs16,#0); if (!p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_12829314 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000011;
+let Inst{31-22} = 0b0001001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm string tests !p1.new
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 (tstbit result in the asm string) and PC (the jump)
+let isTaken = Inst{13}; // bit 13 (top bit of Inst{13-8}) is 0 here, which is the :nt variant
+let isExtendable = 1;
+let opExtendable = 1; // operand 1 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_fp1_jump_t : HInst< // Compound test-and-jump: p1 = tstbit(Rs16,#0), then jump:t when p1.new is false.
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p1 = tstbit($Rs16,#0); if (!p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_12829314 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100011;
+let Inst{31-22} = 0b0001001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // the asm string tests !p1.new
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 (tstbit result in the asm string) and PC (the jump)
+let isTaken = Inst{13}; // bit 13 (top bit of Inst{13-8}) is 1 here, which is the :t variant
+let isExtendable = 1;
+let opExtendable = 1; // operand 1 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_t_jumpnv_nt : HInst< // New-value test-and-jump: jump:nt when tstbit(Ns8.new,#0) is true.
+(outs),
+(ins IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (tstbit($Ns8.new,#0)) jump:nt $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_1898420 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let isTaken = Inst{13}; // bit 13 (top bit of Inst{13-8}) is 0 here, which is the :nt variant
+let isExtendable = 1;
+let opExtendable = 1; // operand 1 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_tstbit0_t_jumpnv_t : HInst< // New-value test-and-jump: jump:t when tstbit(Ns8.new,#0) is true.
+(outs),
+(ins IntRegs:$Ns8, b30_2Imm:$Ii),
+"if (tstbit($Ns8.new,#0)) jump:t $Ii",
+NCJ_tc_3or4stall_SLOT0, TypeNCJ>, Enc_1898420 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100000;
+let Inst{19-19} = 0b0;
+let Inst{31-22} = 0b0010010110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let cofMax1 = 1;
+let isNewValue = 1;
+let Defs = [PC];
+let isTaken = Inst{13}; // bit 13 (top bit of Inst{13-8}) is 1 here, which is the :t variant
+let isExtendable = 1;
+let opExtendable = 1; // operand 1 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+let opNewValue = 0; // operand 0 is $Ns8, printed as $Ns8.new
+}
+def J4_tstbit0_tp0_jump_nt : HInst< // Compound test-and-jump: p0 = tstbit(Rs16,#0), then jump:nt when p0.new is true.
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p0 = tstbit($Rs16,#0); if (p0.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_12829314 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000011;
+let Inst{31-22} = 0b0001000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 (tstbit result in the asm string) and PC (the jump)
+let isTaken = Inst{13}; // bit 13 (top bit of Inst{13-8}) is 0 here, which is the :nt variant
+let isExtendable = 1;
+let opExtendable = 1; // operand 1 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_tp0_jump_t : HInst< // Compound test-and-jump: p0 = tstbit(Rs16,#0), then jump:t when p0.new is true.
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p0 = tstbit($Rs16,#0); if (p0.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_12829314 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100011;
+let Inst{31-22} = 0b0001000110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P0];
+let Defs = [P0, PC]; // implicit defs: P0 (tstbit result in the asm string) and PC (the jump)
+let isTaken = Inst{13}; // bit 13 (top bit of Inst{13-8}) is 1 here, which is the :t variant
+let isExtendable = 1;
+let opExtendable = 1; // operand 1 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_tp1_jump_nt : HInst< // Compound test-and-jump: p1 = tstbit(Rs16,#0), then jump:nt when p1.new is true.
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p1 = tstbit($Rs16,#0); if (p1.new) jump:nt $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_12829314 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b000011;
+let Inst{31-22} = 0b0001001110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 (tstbit result in the asm string) and PC (the jump)
+let isTaken = Inst{13}; // bit 13 (top bit of Inst{13-8}) is 0 here, which is the :nt variant
+let isExtendable = 1;
+let opExtendable = 1; // operand 1 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def J4_tstbit0_tp1_jump_t : HInst< // Compound test-and-jump: p1 = tstbit(Rs16,#0), then jump:t when p1.new is true.
+(outs),
+(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
+"p1 = tstbit($Rs16,#0); if (p1.new) jump:t $Ii",
+COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>, Enc_12829314 {
+let Inst{0-0} = 0b0;
+let Inst{13-8} = 0b100011;
+let Inst{31-22} = 0b0001001110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isBranch = 1;
+let isPredicatedNew = 1;
+let Uses = [P1];
+let Defs = [P1, PC]; // implicit defs: P1 (tstbit result in the asm string) and PC (the jump)
+let isTaken = Inst{13}; // bit 13 (top bit of Inst{13-8}) is 1 here, which is the :t variant
+let isExtendable = 1;
+let opExtendable = 1; // operand 1 is $Ii
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 2;
+}
+def L2_deallocframe : HInst< // deallocframe: no explicit operands; modelled as a doubleword load with implicit register effects only.
+(outs),
+(ins),
+"deallocframe",
+LD_tc_ld_SLOT01, TypeLD>, Enc_0 {
+let Inst{4-0} = 0b11110; // 0b11110 = 30
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10010000000;
+let Inst{20-16} = 0b11110; // 0b11110 = 30; presumably the R30 register field - TODO confirm against the encoding class
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let Uses = [R30]; // implicit use
+let Defs = [R29, R30, R31]; // implicit defs
+}
+def L2_loadalignb_io : HInst< // memb_fifo byte load from Rs32 + #Ii into the read-modify-write pair Ryy32.
+(outs DoubleRegs:$Ryy32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Ryy32 = memb_fifo($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_449439 {
+let Inst{24-21} = 0b0100;
+let Inst{31-27} = 0b10010;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 3; // operand 3 is $Ii (outs are counted first: $Ryy32, $Ryy32in, $Rs32, $Ii)
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 0;
+let Constraints = "$Ryy32 = $Ryy32in"; // ties the destination pair to its incoming value
+}
+def L2_loadalignb_pbr : HInst< // memb_fifo byte load using the Rx32++Mu2:brev addressing form; Rx32 is also an output.
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Ryy32 = memb_fifo($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12261611 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011110100;
+let accessSize = ByteAccess; // NOTE(review): no addrMode is set here, unlike the _pr/_pci/_pcr/_pi siblings (PostInc) - confirm intended
+let mayLoad = 1;
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in"; // ties both outputs to their *in inputs
+}
+def L2_loadalignb_pci : HInst< // memb_fifo byte load using the Rx32++#Ii:circ(Mu2) addressing form; Rx32 is also an output.
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2),
+"$Ryy32 = memb_fifo($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_971347 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011000100;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [CS]; // implicit use of CS, present on the :circ forms
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in"; // ties both outputs to their *in inputs
+}
+def L2_loadalignb_pcr : HInst< // memb_fifo byte load using the Rx32++I:circ(Mu2) addressing form; Rx32 is also an output.
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Ryy32 = memb_fifo($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12261611 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011000100;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [CS]; // implicit use of CS, present on the :circ forms
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in"; // ties both outputs to their *in inputs
+}
+def L2_loadalignb_pi : HInst< // memb_fifo byte load using the Rx32++#Ii post-increment form; Rx32 is also an output.
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"$Ryy32 = memb_fifo($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_6372758 {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011010100;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in"; // ties both outputs to their *in inputs
+}
+def L2_loadalignb_pr : HInst< // memb_fifo byte load using the Rx32++Mu2 post-increment-by-register form; Rx32 is also an output.
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Ryy32 = memb_fifo($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12261611 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011100100;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in"; // ties both outputs to their *in inputs
+}
+def L2_loadalignb_zomap : HInst< // Pseudo (TypeMAPPING) for the offset-less spelling memb_fifo(Rs32); presumably maps to the _io form with #0 - TODO confirm.
+(outs DoubleRegs:$Ryy32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rs32),
+"$Ryy32 = memb_fifo($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1; // no Inst bits are assigned in this record
+let isCodeGenOnly = 1;
+let Constraints = "$Ryy32 = $Ryy32in"; // ties the destination pair to its incoming value
+}
+def L2_loadalignh_io : HInst< // memh_fifo halfword load from Rs32 + #Ii into the read-modify-write pair Ryy32.
+(outs DoubleRegs:$Ryy32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rs32, s31_1Imm:$Ii),
+"$Ryy32 = memh_fifo($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_11930027 {
+let Inst{24-21} = 0b0010;
+let Inst{31-27} = 0b10010;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 3; // operand 3 is $Ii (outs are counted first: $Ryy32, $Ryy32in, $Rs32, $Ii)
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+let Constraints = "$Ryy32 = $Ryy32in"; // ties the destination pair to its incoming value
+}
+def L2_loadalignh_pbr : HInst< // memh_fifo halfword load using the Rx32++Mu2:brev addressing form; Rx32 is also an output.
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Ryy32 = memh_fifo($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12261611 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011110010;
+let accessSize = HalfWordAccess; // NOTE(review): no addrMode is set here, unlike the _pr/_pci/_pcr/_pi siblings (PostInc) - confirm intended
+let mayLoad = 1;
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in"; // ties both outputs to their *in inputs
+}
+def L2_loadalignh_pci : HInst< // memh_fifo halfword load using the Rx32++#Ii:circ(Mu2) addressing form; Rx32 is also an output.
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
+"$Ryy32 = memh_fifo($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_1971351 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011000010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS]; // implicit use of CS, present on the :circ forms
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in"; // ties both outputs to their *in inputs
+}
+def L2_loadalignh_pcr : HInst< // memh_fifo halfword load using the Rx32++I:circ(Mu2) addressing form; Rx32 is also an output.
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Ryy32 = memh_fifo($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12261611 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011000010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS]; // implicit use of CS, present on the :circ forms
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in"; // ties both outputs to their *in inputs
+}
+def L2_loadalignh_pi : HInst< // memh_fifo halfword load using the Rx32++#Ii post-increment form; Rx32 is also an output.
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"$Ryy32 = memh_fifo($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_3372766 {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011010010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in"; // ties both outputs to their *in inputs
+}
+def L2_loadalignh_pr : HInst< // memh_fifo halfword load using the Rx32++Mu2 post-increment-by-register form; Rx32 is also an output.
+(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Ryy32 = memh_fifo($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12261611 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011100010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Ryy32 = $Ryy32in, $Rx32 = $Rx32in"; // ties both outputs to their *in inputs
+}
+def L2_loadalignh_zomap : HInst< // Pseudo (TypeMAPPING) for the offset-less spelling memh_fifo(Rs32); presumably maps to the _io form with #0 - TODO confirm.
+(outs DoubleRegs:$Ryy32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rs32),
+"$Ryy32 = memh_fifo($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1; // no Inst bits are assigned in this record
+let isCodeGenOnly = 1;
+let Constraints = "$Ryy32 = $Ryy32in"; // ties the destination pair to its incoming value
+}
+def L2_loadbsw2_io : HInst< // membh load from Rs32 + #Ii into the single register Rd32 (halfword access).
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii),
+"$Rd32 = membh($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_15275738 {
+let Inst{24-21} = 0b0001;
+let Inst{31-27} = 0b10010;
+let hasNewValue = 1;
+let opNewValue = 0; // operand 0 is the output $Rd32
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs are counted first: $Rd32, $Rs32, $Ii)
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+}
+def L2_loadbsw2_pbr : HInst< // membh load into Rd32 using the Rx32++Mu2:brev addressing form; Rx32 is also an output.
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = membh($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011110001;
+let hasNewValue = 1;
+let opNewValue = 0; // operand 0 is the output $Rd32
+let accessSize = HalfWordAccess; // NOTE(review): no addrMode is set here, unlike the _pr/_pci/_pcr/_pi siblings (PostInc) - confirm intended
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in"; // ties the updated address register to its input
+}
+def L2_loadbsw2_pci : HInst< // membh load into Rd32 using the Rx32++#Ii:circ(Mu2) addressing form; Rx32 is also an output.
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = membh($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13303422 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011000001;
+let hasNewValue = 1;
+let opNewValue = 0; // operand 0 is the output $Rd32
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS]; // implicit use of CS, present on the :circ forms
+let Constraints = "$Rx32 = $Rx32in"; // ties the updated address register to its input
+}
+def L2_loadbsw2_pcr : HInst< // membh load into Rd32 using the Rx32++I:circ(Mu2) addressing form; Rx32 is also an output.
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = membh($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011000001;
+let hasNewValue = 1;
+let opNewValue = 0; // operand 0 is the output $Rd32
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS]; // implicit use of CS, present on the :circ forms
+let Constraints = "$Rx32 = $Rx32in"; // ties the updated address register to its input
+}
+def L2_loadbsw2_pi : HInst< // membh load into Rd32 using the Rx32++#Ii post-increment form; Rx32 is also an output.
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii),
+"$Rd32 = membh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_15376009 {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011010001;
+let hasNewValue = 1;
+let opNewValue = 0; // operand 0 is the output $Rd32
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in"; // ties the updated address register to its input
+}
+def L2_loadbsw2_pr : HInst< // membh load into Rd32 using the Rx32++Mu2 post-increment-by-register form; Rx32 is also an output.
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = membh($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011100001;
+let hasNewValue = 1;
+let opNewValue = 0; // operand 0 is the output $Rd32
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in"; // ties the updated address register to its input
+}
+def L2_loadbsw2_zomap : HInst< // Pseudo (TypeMAPPING) for the offset-less spelling Rd32 = membh(Rs32); presumably maps to the _io form with #0 - TODO confirm.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = membh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0; // operand 0 is the output $Rd32
+let isPseudo = 1; // no Inst bits are assigned in this record
+let isCodeGenOnly = 1;
+}
+def L2_loadbsw4_io : HInst< // membh load from Rs32 + #Ii into the register pair Rdd32 (word access).
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, s30_2Imm:$Ii),
+"$Rdd32 = membh($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_9852473 {
+let Inst{24-21} = 0b0111;
+let Inst{31-27} = 0b10010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs are counted first: $Rdd32, $Rs32, $Ii)
+let isExtentSigned = 1;
+let opExtentBits = 13;
+let opExtentAlign = 2;
+}
+def L2_loadbsw4_pbr : HInst< // membh load into the pair Rdd32 using the Rx32++Mu2:brev addressing form; Rx32 is also an output.
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = membh($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_2901241 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011110111;
+let accessSize = WordAccess; // NOTE(review): no addrMode is set here, unlike the _pr/_pci/_pcr/_pi siblings (PostInc) - confirm intended
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in"; // ties the updated address register to its input
+}
+def L2_loadbsw4_pci : HInst< // membh load into the pair Rdd32 using the Rx32++#Ii:circ(Mu2) addressing form; Rx32 is also an output.
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2),
+"$Rdd32 = membh($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_3931661 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011000111;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [CS]; // implicit use of CS, present on the :circ forms
+let Constraints = "$Rx32 = $Rx32in"; // ties the updated address register to its input
+}
+def L2_loadbsw4_pcr : HInst< // membh load into the pair Rdd32 using the Rx32++I:circ(Mu2) addressing form; Rx32 is also an output.
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = membh($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_2901241 {
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011000111;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [CS]; // implicit use of CS, present on the :circ forms
+let Constraints = "$Rx32 = $Rx32in"; // ties the updated address register to its input
+}
+def L2_loadbsw4_pi : HInst< // membh load into the pair Rdd32 using the Rx32++#Ii post-increment form; Rx32 is also an output.
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii),
+"$Rdd32 = membh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_8752140 {
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011010111;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in"; // ties the updated address register to its input
+}
+def L2_loadbsw4_pr : HInst< // membh load into the pair Rdd32 using the Rx32++Mu2 post-increment-by-register form; Rx32 is also an output.
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = membh($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_2901241 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011100111;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in"; // ties the updated address register to its input
+}
+def L2_loadbsw4_zomap : HInst< // Pseudo (TypeMAPPING) for the offset-less spelling Rdd32 = membh(Rs32); presumably maps to the _io form with #0 - TODO confirm.
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = membh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1; // no Inst bits are assigned in this record
+let isCodeGenOnly = 1;
+}
+def L2_loadbzw2_io : HInst< // memubh load from Rs32 + #Ii into the single register Rd32 (halfword access).
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii),
+"$Rd32 = memubh($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_15275738 {
+let Inst{24-21} = 0b0011;
+let Inst{31-27} = 0b10010;
+let hasNewValue = 1;
+let opNewValue = 0; // operand 0 is the output $Rd32
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 2; // operand 2 is $Ii (outs are counted first: $Rd32, $Rs32, $Ii)
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+}
+def L2_loadbzw2_pbr : HInst< // memubh load into Rd32 using the Rx32++Mu2:brev addressing form; Rx32 is also an output.
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memubh($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011110011;
+let hasNewValue = 1;
+let opNewValue = 0; // operand 0 is the output $Rd32
+let accessSize = HalfWordAccess; // NOTE(review): no addrMode is set here, unlike the _pr/_pci/_pcr/_pi siblings (PostInc) - confirm intended
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in"; // ties the updated address register to its input
+}
+def L2_loadbzw2_pci : HInst< // memubh load into Rd32 using the Rx32++#Ii:circ(Mu2) addressing form; Rx32 is also an output.
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = memubh($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13303422 {
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011000011;
+let hasNewValue = 1;
+let opNewValue = 0; // operand 0 is the output $Rd32
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS]; // implicit use of CS, present on the :circ forms
+let Constraints = "$Rx32 = $Rx32in"; // ties the updated address register to its input
+}
+def L2_loadbzw2_pcr : HInst< // memubh, `++I:circ($Mu2)` form; lists CS in Uses
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memubh($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011000011;
+}
+def L2_loadbzw2_pi : HInst< // memubh, $Rx32 post-incremented by immediate $Ii
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii),
+"$Rd32 = memubh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_15376009 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011010011;
+}
+def L2_loadbzw2_pr : HInst< // memubh, $Rx32 post-incremented by $Mu2
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memubh($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011100011;
+}
+def L2_loadbzw2_zomap : HInst< // pseudo, codegen-only record for the offset-less memubh spelling
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memubh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def L2_loadbzw4_io : HInst< // memubh into a register pair, base + immediate offset (extendable operand 2)
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, s30_2Imm:$Ii),
+"$Rdd32 = memubh($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_9852473 {
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 13;
+let opExtentAlign = 2;
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b10010;
+}
+def L2_loadbzw4_pbr : HInst< // memubh into a register pair, `:brev` form; no addrMode assigned
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = memubh($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_2901241 {
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011110101;
+}
+def L2_loadbzw4_pci : HInst< // memubh into a register pair, `:circ` form with immediate increment
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2),
+"$Rdd32 = memubh($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_3931661 {
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011000101;
+}
+def L2_loadbzw4_pcr : HInst< // memubh into a register pair, `++I:circ($Mu2)` form
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = memubh($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_2901241 {
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011000101;
+}
+def L2_loadbzw4_pi : HInst< // memubh into a register pair, $Rx32 post-incremented by immediate
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii),
+"$Rdd32 = memubh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_8752140 {
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011010101;
+}
+def L2_loadbzw4_pr : HInst< // memubh into a register pair, $Rx32 post-incremented by $Mu2
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = memubh($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_2901241 {
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011100101;
+}
+def L2_loadbzw4_zomap : HInst< // pseudo, codegen-only record for the offset-less memubh pair spelling
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = memubh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isCodeGenOnly = 1;
+let isPseudo = 1;
+}
+def L2_loadrb_io : HInst< // memb, base + immediate offset; predicable, extendable operand 2
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = memb($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_14461004, AddrModeRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L2_loadrb_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 0;
+let Inst{24-21} = 0b1000;
+let Inst{31-27} = 0b10010;
+}
+def L2_loadrb_pbr : HInst< // memb, `:brev` form; no addrMode is assigned in this record
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memb($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011111000;
+}
+def L2_loadrb_pci : HInst< // memb, `:circ` form with immediate increment; lists CS in Uses
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = memb($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_16303398 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011001000;
+}
+def L2_loadrb_pcr : HInst< // memb, `++I:circ($Mu2)` form; lists CS in Uses
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memb($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011001000;
+}
+def L2_loadrb_pi : HInst< // memb, $Rx32 post-incremented by immediate; predicable base form
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii),
+"$Rd32 = memb($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_5598813, PredNewRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrb_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011011000;
+}
+def L2_loadrb_pr : HInst< // memb, $Rx32 post-incremented by $Mu2
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memb($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011101000;
+}
+def L2_loadrb_zomap : HInst< // pseudo, codegen-only record for the offset-less memb spelling
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memb($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def L2_loadrbgp : HInst< // memb, gp + immediate form; lists GP in Uses
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii),
+"$Rd32 = memb(gp+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1886960, AddrModeRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [GP];
+let BaseOpcode = "L4_loadrb_abs";
+let isPredicable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+let Inst{24-21} = 0b1000;
+let Inst{31-27} = 0b01001;
+}
+def L2_loadrd_io : HInst< // memd, base + immediate offset; predicable, extendable operand 2
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, s29_3Imm:$Ii),
+"$Rdd32 = memd($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_163381, AddrModeRel {
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L2_loadrd_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 14;
+let opExtentAlign = 3;
+let Inst{24-21} = 0b1110;
+let Inst{31-27} = 0b10010;
+}
+def L2_loadrd_pbr : HInst< // memd, `:brev` form; no addrMode is assigned in this record
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = memd($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_2901241 {
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011111110;
+}
+def L2_loadrd_pci : HInst< // memd, `:circ` form with immediate increment; lists CS in Uses
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_3Imm:$Ii, ModRegs:$Mu2),
+"$Rdd32 = memd($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_931653 {
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011001110;
+}
+def L2_loadrd_pcr : HInst< // memd, `++I:circ($Mu2)` form; lists CS in Uses
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = memd($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_2901241 {
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011001110;
+}
+def L2_loadrd_pi : HInst< // memd, $Rx32 post-incremented by immediate; predicable base form
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_3Imm:$Ii),
+"$Rdd32 = memd($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_9752128, PredNewRel {
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrd_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011011110;
+}
+def L2_loadrd_pr : HInst< // memd, $Rx32 post-incremented by $Mu2
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rdd32 = memd($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_2901241 {
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011101110;
+}
+def L2_loadrd_zomap : HInst< // pseudo, codegen-only record for the offset-less memd spelling
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = memd($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isCodeGenOnly = 1;
+let isPseudo = 1;
+}
+def L2_loadrdgp : HInst< // memd, gp + immediate form; lists GP in Uses
+(outs DoubleRegs:$Rdd32),
+(ins u29_3Imm:$Ii),
+"$Rdd32 = memd(gp+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4975051, AddrModeRel {
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let Uses = [GP];
+let BaseOpcode = "L4_loadrd_abs";
+let isPredicable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 19;
+let opExtentAlign = 3;
+let Inst{24-21} = 0b1110;
+let Inst{31-27} = 0b01001;
+}
+def L2_loadrh_io : HInst< // memh, base + immediate offset; predicable, extendable operand 2
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii),
+"$Rd32 = memh($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_15275738, AddrModeRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L2_loadrh_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+let Inst{24-21} = 0b1010;
+let Inst{31-27} = 0b10010;
+}
+def L2_loadrh_pbr : HInst< // memh, `:brev` form; no addrMode is assigned in this record
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memh($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011111010;
+}
+def L2_loadrh_pci : HInst< // memh, `:circ` form with immediate increment; lists CS in Uses
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = memh($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13303422 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011001010;
+}
+def L2_loadrh_pcr : HInst< // memh, `++I:circ($Mu2)` form; lists CS in Uses
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memh($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011001010;
+}
+def L2_loadrh_pi : HInst< // memh, $Rx32 post-incremented by immediate; predicable base form
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii),
+"$Rd32 = memh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_15376009, PredNewRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrh_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011011010;
+}
+def L2_loadrh_pr : HInst< // memh, $Rx32 post-incremented by $Mu2
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memh($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011101010;
+}
+def L2_loadrh_zomap : HInst< // pseudo, codegen-only record for the offset-less memh spelling
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def L2_loadrhgp : HInst< // memh, gp + immediate form; lists GP in Uses
+(outs IntRegs:$Rd32),
+(ins u31_1Imm:$Ii),
+"$Rd32 = memh(gp+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_12608570, AddrModeRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [GP];
+let BaseOpcode = "L4_loadrh_abs";
+let isPredicable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+let Inst{24-21} = 0b1010;
+let Inst{31-27} = 0b01001;
+}
+def L2_loadri_io : HInst< // memw, base + immediate offset; predicable, extendable operand 2
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s30_2Imm:$Ii),
+"$Rd32 = memw($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_8990840, AddrModeRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L2_loadri_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 13;
+let opExtentAlign = 2;
+let Inst{24-21} = 0b1100;
+let Inst{31-27} = 0b10010;
+}
+def L2_loadri_pbr : HInst< // memw, `:brev` form; no addrMode is assigned in this record
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memw($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011111100;
+}
+def L2_loadri_pci : HInst< // memw, `:circ` form with immediate increment; lists CS in Uses
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = memw($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_14303394 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011001100;
+}
+def L2_loadri_pcr : HInst< // memw, `++I:circ($Mu2)` form; lists CS in Uses
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memw($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011001100;
+}
+def L2_loadri_pi : HInst< // memw, $Rx32 post-incremented by immediate; predicable base form
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii),
+"$Rd32 = memw($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_16376009, PredNewRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadri_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011011100;
+}
+def L2_loadri_pr : HInst< // memw, $Rx32 post-incremented by $Mu2
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memw($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011101100;
+}
+def L2_loadri_zomap : HInst< // pseudo, codegen-only record for the offset-less memw spelling
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memw($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def L2_loadrigp : HInst< // memw, gp + immediate form; lists GP in Uses
+(outs IntRegs:$Rd32),
+(ins u30_2Imm:$Ii),
+"$Rd32 = memw(gp+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_8814718, AddrModeRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let Uses = [GP];
+let BaseOpcode = "L4_loadri_abs";
+let isPredicable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 18;
+let opExtentAlign = 2;
+let Inst{24-21} = 0b1100;
+let Inst{31-27} = 0b01001;
+}
+def L2_loadrub_io : HInst< // memub, base + immediate offset; predicable, extendable operand 2
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = memub($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_14461004, AddrModeRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L2_loadrub_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 0;
+let Inst{24-21} = 0b1001;
+let Inst{31-27} = 0b10010;
+}
+def L2_loadrub_pbr : HInst< // memub, `:brev` form; no addrMode is assigned in this record
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memub($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011111001;
+}
+def L2_loadrub_pci : HInst< // memub, `:circ` form with immediate increment; lists CS in Uses
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = memub($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_16303398 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011001001;
+}
+def L2_loadrub_pcr : HInst< // memub, `++I:circ($Mu2)` form; lists CS in Uses
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memub($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011001001;
+}
+def L2_loadrub_pi : HInst< // memub, $Rx32 post-incremented by immediate; predicable base form
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii),
+"$Rd32 = memub($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_5598813, PredNewRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrub_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011011001;
+}
+def L2_loadrub_pr : HInst< // memub, $Rx32 post-incremented by $Mu2
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memub($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011101001;
+}
+def L2_loadrub_zomap : HInst< // pseudo, codegen-only record for the offset-less memub spelling
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memub($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def L2_loadrubgp : HInst< // memub, gp + immediate form; lists GP in Uses
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii),
+"$Rd32 = memub(gp+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1886960, AddrModeRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let Uses = [GP];
+let BaseOpcode = "L4_loadrub_abs";
+let isPredicable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+let Inst{24-21} = 0b1001;
+let Inst{31-27} = 0b01001;
+}
+def L2_loadruh_io : HInst< // memuh, base + immediate offset; predicable, extendable operand 2
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii),
+"$Rd32 = memuh($Rs32+#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_15275738, AddrModeRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L2_loadruh_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+let Inst{24-21} = 0b1011;
+let Inst{31-27} = 0b10010;
+}
+def L2_loadruh_pbr : HInst< // memuh, `:brev` form; no addrMode is assigned in this record
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memuh($Rx32++$Mu2:brev)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011111011;
+}
+def L2_loadruh_pci : HInst< // memuh, `:circ` form with immediate increment; lists CS in Uses
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
+"$Rd32 = memuh($Rx32++#$Ii:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13303422 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-9} = 0b0000;
+let Inst{31-21} = 0b10011001011;
+}
+def L2_loadruh_pcr : HInst< // memuh, `++I:circ($Mu2)` form; lists CS in Uses
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memuh($Rx32++I:circ($Mu2))",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [CS];
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00010000;
+let Inst{31-21} = 0b10011001011;
+}
+def L2_loadruh_pi : HInst< // memuh, $Rx32 post-incremented by immediate; predicable base form
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii),
+"$Rd32 = memuh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_15376009, PredNewRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadruh_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{13-9} = 0b00000;
+let Inst{31-21} = 0b10011011011;
+}
+def L2_loadruh_pr : HInst< // memuh, $Rx32 post-incremented by $Mu2
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Rd32 = memuh($Rx32++$Mu2)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_48594 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b10011101011;
+}
+def L2_loadruh_zomap : HInst< // pseudo, codegen-only record for the offset-less memuh spelling
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memuh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def L2_loadruhgp : HInst< // memuh, gp + immediate form; lists GP in Uses
+(outs IntRegs:$Rd32),
+(ins u31_1Imm:$Ii),
+"$Rd32 = memuh(gp+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_12608570, AddrModeRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let Uses = [GP];
+let BaseOpcode = "L4_loadruh_abs";
+let isPredicable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+let Inst{24-21} = 0b1011;
+let Inst{31-27} = 0b01001;
+}
+def L2_loadw_locked : HInst< // memw_locked load; uses the SLOT0 itinerary and sets isSoloAX
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = memw_locked($Rs32)",
+LD_tc_ld_SLOT0, TypeLD>, Enc_4075554 {
+let hasNewValue = 1;
+let opNewValue = 0;
+let accessSize = WordAccess;
+let isSoloAX = 1;
+let mayLoad = 1;
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10010010000;
+}
+def L2_ploadrbf_io : HInst< // memb base+offset, executed if (!$Pt4); extendable operand 3
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memb($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4835423, AddrModeRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L2_loadrb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000101000;
+}
+def L2_ploadrbf_pi : HInst< // memb post-increment by immediate, executed if (!$Pt4)
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memb($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_12212978, PredNewRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrb_pi";
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011011000;
+}
+def L2_ploadrbf_zomap : HInst< // pseudo, codegen-only record for offset-less `if (!$Pt4)` memb
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4) $Rd32 = memb($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def L2_ploadrbfnew_io : HInst< // memb base+offset, executed if (!$Pt4.new); extendable operand 3
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memb($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4835423, AddrModeRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L2_loadrb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000111000;
+}
+def L2_ploadrbfnew_pi : HInst< // memb post-increment by immediate, executed if (!$Pt4.new)
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memb($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_12212978, PredNewRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrb_pi";
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011011000;
+}
+def L2_ploadrbfnew_zomap : HInst< // pseudo, codegen-only record for offset-less `if (!$Pt4.new)` memb
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4.new) $Rd32 = memb($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def L2_ploadrbt_io : HInst< // memb base+offset, executed if ($Pt4); extendable operand 3
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memb($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4835423, AddrModeRel {
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L2_loadrb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000001000;
+}
+def L2_ploadrbt_pi : HInst< // memb post-increment by immediate, executed if ($Pt4)
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memb($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_12212978, PredNewRel {
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrb_pi";
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011011000;
+}
+def L2_ploadrbt_zomap : HInst< // pseudo, codegen-only record for offset-less `if ($Pt4)` memb
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4) $Rd32 = memb($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def L2_ploadrbtnew_io : HInst< // memb base+offset, executed if ($Pt4.new); extendable operand 3
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memb($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4835423, AddrModeRel {
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L2_loadrb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000011000;
+}
+def L2_ploadrbtnew_pi : HInst< // memb post-increment by immediate, executed if ($Pt4.new)
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memb($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_12212978, PredNewRel {
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrb_pi";
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011011000;
+}
+def L2_ploadrbtnew_zomap : HInst< // pseudo, codegen-only record for offset-less `if ($Pt4.new)` memb
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4.new) $Rd32 = memb($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def L2_ploadrdf_io : HInst< // memd base+offset, executed if (!$Pt4); extendable operand 3
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
+"if (!$Pt4) $Rdd32 = memd($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_677558, AddrModeRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L2_loadrd_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000101110;
+}
+def L2_ploadrdf_pi : HInst< // memd post-increment by immediate, executed if (!$Pt4)
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
+"if (!$Pt4) $Rdd32 = memd($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_5611087, PredNewRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011011110;
+}
+def L2_ploadrdf_zomap : HInst< // pseudo, codegen-only record for offset-less `if (!$Pt4)` memd
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4) $Rdd32 = memd($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isCodeGenOnly = 1;
+let isPseudo = 1;
+}
+def L2_ploadrdfnew_io : HInst< // memd base+offset, executed if (!$Pt4.new); extendable operand 3
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
+"if (!$Pt4.new) $Rdd32 = memd($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_677558, AddrModeRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L2_loadrd_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000111110;
+}
+def L2_ploadrdfnew_pi : HInst< // memd post-increment by immediate, executed if (!$Pt4.new)
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
+"if (!$Pt4.new) $Rdd32 = memd($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_5611087, PredNewRel {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011011110;
+}
+def L2_ploadrdfnew_zomap : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4.new) $Rdd32 = memd($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrdt_io : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
+"if ($Pt4) $Rdd32 = memd($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_677558, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000001110;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L2_loadrd_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def L2_ploadrdt_pi : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
+"if ($Pt4) $Rdd32 = memd($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_5611087, PredNewRel {
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011011110;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrdt_zomap : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4) $Rdd32 = memd($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrdtnew_io : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
+"if ($Pt4.new) $Rdd32 = memd($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_677558, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000011110;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L2_loadrd_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def L2_ploadrdtnew_pi : HInst<
+(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
+"if ($Pt4.new) $Rdd32 = memd($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_5611087, PredNewRel {
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011011110;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrdtnew_zomap : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4.new) $Rdd32 = memd($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrhf_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if (!$Pt4) $Rd32 = memh($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000101010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L2_loadrh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadrhf_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if (!$Pt4) $Rd32 = memh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_7212930, PredNewRel {
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011011010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrhf_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4) $Rd32 = memh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrhfnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memh($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000111010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L2_loadrh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadrhfnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_7212930, PredNewRel {
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011011010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrhfnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4.new) $Rd32 = memh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrht_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if ($Pt4) $Rd32 = memh($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000001010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L2_loadrh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadrht_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if ($Pt4) $Rd32 = memh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_7212930, PredNewRel {
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011011010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrht_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4) $Rd32 = memh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrhtnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memh($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000011010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L2_loadrh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadrhtnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_7212930, PredNewRel {
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011011010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrhtnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4.new) $Rd32 = memh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrif_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
+"if (!$Pt4) $Rd32 = memw($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_2835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000101100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L2_loadri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L2_ploadrif_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
+"if (!$Pt4) $Rd32 = memw($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_6212930, PredNewRel {
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadri_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrif_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4) $Rd32 = memw($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrifnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memw($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_2835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000111100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L2_loadri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L2_ploadrifnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memw($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_6212930, PredNewRel {
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadri_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrifnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4.new) $Rd32 = memw($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrit_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
+"if ($Pt4) $Rd32 = memw($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_2835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000001100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L2_loadri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L2_ploadrit_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
+"if ($Pt4) $Rd32 = memw($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_6212930, PredNewRel {
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadri_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrit_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4) $Rd32 = memw($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadritnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memw($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_2835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L2_loadri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L2_ploadritnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memw($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_6212930, PredNewRel {
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadri_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadritnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4.new) $Rd32 = memw($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrubf_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memub($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4835423, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000101001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L2_loadrub_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrubf_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memub($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_12212978, PredNewRel {
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011011001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrub_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrubf_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4) $Rd32 = memub($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrubfnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memub($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4835423, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000111001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L2_loadrub_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrubfnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memub($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_12212978, PredNewRel {
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011011001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrub_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrubfnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4.new) $Rd32 = memub($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrubt_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memub($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4835423, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000001001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L2_loadrub_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrubt_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memub($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_12212978, PredNewRel {
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011011001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrub_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrubt_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4) $Rd32 = memub($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadrubtnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memub($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4835423, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000011001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L2_loadrub_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L2_ploadrubtnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memub($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_12212978, PredNewRel {
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011011001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadrub_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadrubtnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4.new) $Rd32 = memub($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadruhf_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if (!$Pt4) $Rd32 = memuh($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000101011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L2_loadruh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadruhf_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if (!$Pt4) $Rd32 = memuh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_7212930, PredNewRel {
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadruh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadruhf_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4) $Rd32 = memuh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadruhfnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memuh($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000111011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L2_loadruh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadruhfnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memuh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_7212930, PredNewRel {
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadruh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadruhfnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if (!$Pt4.new) $Rd32 = memuh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadruht_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if ($Pt4) $Rd32 = memuh($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000001011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L2_loadruh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadruht_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if ($Pt4) $Rd32 = memuh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_7212930, PredNewRel {
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011011011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadruh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadruht_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4) $Rd32 = memuh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L2_ploadruhtnew_io : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memuh($Rs32+#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1835415, AddrModeRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01000011011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L2_loadruh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L2_ploadruhtnew_pi : HInst<
+(outs IntRegs:$Rd32, IntRegs:$Rx32),
+(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memuh($Rx32++#$Ii)",
+LD_tc_ld_pi_SLOT01, TypeLD>, Enc_7212930, PredNewRel {
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011011011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let BaseOpcode = "L2_loadruh_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def L2_ploadruhtnew_zomap : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, IntRegs:$Rs32),
+"if ($Pt4.new) $Rd32 = memuh($Rs32)",
+PSEUDO, TypeMAPPING> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_add_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rs32+#$Ii) += $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_11849200 {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_add_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memb($Rs32) += $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_add_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+#$Ii) += $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_8849208 {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_add_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memh($Rs32) += $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_add_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"memw($Rs32+#$Ii) += $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_9849208 {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_add_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw($Rs32) += $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_and_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rs32+#$Ii) &= $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_11849200 {
+let Inst{6-5} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_and_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memb($Rs32) &= $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_and_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+#$Ii) &= $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_8849208 {
+let Inst{6-5} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_and_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memh($Rs32) &= $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_and_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"memw($Rs32+#$Ii) &= $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_9849208 {
+let Inst{6-5} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_and_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw($Rs32) &= $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_iadd_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
+"memb($Rs32+#$Ii) += #$II",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_6773159 {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_iadd_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memb($Rs32) += #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_iadd_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
+"memh($Rs32+#$Ii) += #$II",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_9773167 {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_iadd_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memh($Rs32) += #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_iadd_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
+"memw($Rs32+#$Ii) += #$II",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_8773155 {
+let Inst{6-5} = 0b00;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_iadd_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memw($Rs32) += #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_iand_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
+"memb($Rs32+#$Ii) = clrbit(#$II)",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_6773159 {
+let Inst{6-5} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_iand_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memb($Rs32) = clrbit(#$II)",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_iand_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
+"memh($Rs32+#$Ii) = clrbit(#$II)",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_9773167 {
+let Inst{6-5} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_iand_memoph_zomap : HInst< // Offset-less `memh($Rs32) = clrbit(#$II)` spelling of the clrbit memory-op.
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memh($Rs32) = clrbit(#$II)",
+PSEUDO, TypeMAPPING> { // No Enc_* parent and no Inst bits are given for this record.
+let isPseudo = 1;
+let isCodeGenOnly = 1; // NOTE(review): presumably resolved to L4_iand_memoph_io with #0 -- confirm.
+}
+def L4_iand_memopw_io : HInst< // Memory-op `memw($Rs32+#$Ii) = clrbit(#$II)` on a base+immediate-offset word.
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
+"memw($Rs32+#$Ii) = clrbit(#$II)",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_8773155 {
+let Inst{6-5} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1; // Flagged as both a store and a load (read-modify-write of the addressed word).
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1; // Operand 1 is the offset $Ii (the outs list is empty).
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_iand_memopw_zomap : HInst< // Offset-less `memw($Rs32) = clrbit(#$II)` spelling of the clrbit memory-op.
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memw($Rs32) = clrbit(#$II)",
+PSEUDO, TypeMAPPING> { // No Enc_* parent and no Inst bits are given for this record.
+let isPseudo = 1;
+let isCodeGenOnly = 1; // NOTE(review): presumably resolved to L4_iand_memopw_io with #0 -- confirm.
+}
+def L4_ior_memopb_io : HInst< // Memory-op `memb($Rs32+#$Ii) = setbit(#$II)` on a base+immediate-offset byte.
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
+"memb($Rs32+#$Ii) = setbit(#$II)",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_6773159 {
+let Inst{6-5} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1; // Flagged as both a store and a load (read-modify-write of the addressed byte).
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1; // Operand 1 is the offset $Ii (the outs list is empty).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ior_memopb_zomap : HInst< // Offset-less `memb($Rs32) = setbit(#$II)` spelling of the setbit memory-op.
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memb($Rs32) = setbit(#$II)",
+PSEUDO, TypeMAPPING> { // No Enc_* parent and no Inst bits are given for this record.
+let isPseudo = 1;
+let isCodeGenOnly = 1; // NOTE(review): presumably resolved to L4_ior_memopb_io with #0 -- confirm.
+}
+def L4_ior_memoph_io : HInst< // Memory-op `memh($Rs32+#$Ii) = setbit(#$II)` on a base+immediate-offset halfword.
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
+"memh($Rs32+#$Ii) = setbit(#$II)",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_9773167 {
+let Inst{6-5} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1; // Flagged as both a store and a load (read-modify-write of the addressed halfword).
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1; // Operand 1 is the offset $Ii (the outs list is empty).
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_ior_memoph_zomap : HInst< // Offset-less `memh($Rs32) = setbit(#$II)` spelling of the setbit memory-op.
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memh($Rs32) = setbit(#$II)",
+PSEUDO, TypeMAPPING> { // No Enc_* parent and no Inst bits are given for this record.
+let isPseudo = 1;
+let isCodeGenOnly = 1; // NOTE(review): presumably resolved to L4_ior_memoph_io with #0 -- confirm.
+}
+def L4_ior_memopw_io : HInst< // Memory-op `memw($Rs32+#$Ii) = setbit(#$II)` on a base+immediate-offset word.
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
+"memw($Rs32+#$Ii) = setbit(#$II)",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_8773155 {
+let Inst{6-5} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1; // Flagged as both a store and a load (read-modify-write of the addressed word).
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1; // Operand 1 is the offset $Ii (the outs list is empty).
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_ior_memopw_zomap : HInst< // Offset-less `memw($Rs32) = setbit(#$II)` spelling of the setbit memory-op.
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memw($Rs32) = setbit(#$II)",
+PSEUDO, TypeMAPPING> { // No Enc_* parent and no Inst bits are given for this record.
+let isPseudo = 1;
+let isCodeGenOnly = 1; // NOTE(review): presumably resolved to L4_ior_memopw_io with #0 -- confirm.
+}
+def L4_isub_memopb_io : HInst< // Memory-op `memb($Rs32+#$Ii) -= #$II` on a base+immediate-offset byte.
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
+"memb($Rs32+#$Ii) -= #$II",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_6773159 {
+let Inst{6-5} = 0b01;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1; // Flagged as both a store and a load (read-modify-write of the addressed byte).
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1; // Operand 1 is the offset $Ii (the outs list is empty).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_isub_memopb_zomap : HInst< // Offset-less `memb($Rs32) -= #$II` spelling of the subtract-immediate memory-op.
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memb($Rs32) -= #$II",
+PSEUDO, TypeMAPPING> { // No Enc_* parent and no Inst bits are given for this record.
+let isPseudo = 1;
+let isCodeGenOnly = 1; // NOTE(review): presumably resolved to L4_isub_memopb_io with #0 -- confirm.
+}
+def L4_isub_memoph_io : HInst< // Memory-op `memh($Rs32+#$Ii) -= #$II` on a base+immediate-offset halfword.
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
+"memh($Rs32+#$Ii) -= #$II",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_9773167 {
+let Inst{6-5} = 0b01;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1; // Flagged as both a store and a load (read-modify-write of the addressed halfword).
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1; // Operand 1 is the offset $Ii (the outs list is empty).
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_isub_memoph_zomap : HInst< // Offset-less `memh($Rs32) -= #$II` spelling of the subtract-immediate memory-op.
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memh($Rs32) -= #$II",
+PSEUDO, TypeMAPPING> { // No Enc_* parent and no Inst bits are given for this record.
+let isPseudo = 1;
+let isCodeGenOnly = 1; // NOTE(review): presumably resolved to L4_isub_memoph_io with #0 -- confirm.
+}
+def L4_isub_memopw_io : HInst< // Memory-op `memw($Rs32+#$Ii) -= #$II` on a base+immediate-offset word.
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
+"memw($Rs32+#$Ii) -= #$II",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_8773155 {
+let Inst{6-5} = 0b01;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111111010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1; // Flagged as both a store and a load (read-modify-write of the addressed word).
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1; // Operand 1 is the offset $Ii (the outs list is empty).
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_isub_memopw_zomap : HInst< // Offset-less `memw($Rs32) -= #$II` spelling of the subtract-immediate memory-op.
+(outs),
+(ins IntRegs:$Rs32, u5_0Imm:$II),
+"memw($Rs32) -= #$II",
+PSEUDO, TypeMAPPING> { // No Enc_* parent and no Inst bits are given for this record.
+let isPseudo = 1;
+let isCodeGenOnly = 1; // NOTE(review): presumably resolved to L4_isub_memopw_io with #0 -- confirm.
+}
+def L4_loadalignb_ap : HInst< // `$Ryy32 = memb_fifo($Re32=#$II)`: AbsoluteSet-addressed load; $Re32 is a second output.
+(outs DoubleRegs:$Ryy32, IntRegs:$Re32),
+(ins DoubleRegs:$Ryy32in, u32_0Imm:$II),
+"$Ryy32 = memb_fifo($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_11047413 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011010100;
+let hasNewValue = 1;
+let opNewValue = 1; // Operand 1 is $Re32.
+let addrMode = AbsoluteSet;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 3; // Operand 3 is $II (two outs, then $Ryy32in).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let Constraints = "$Ryy32 = $Ryy32in"; // Ties the result pair to the $Ryy32in input operand.
+}
+def L4_loadalignb_ur : HInst< // `$Ryy32 = memb_fifo($Rt32<<#$Ii+#$II)`: load addressed by shifted register plus immediate.
+(outs DoubleRegs:$Ryy32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Ryy32 = memb_fifo($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_7303598 {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011100100;
+let addrMode = BaseLongOffset;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let InputType = "imm";
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 4; // Operand 4 is $II (one out, then $Ryy32in, $Rt32, $Ii).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let Constraints = "$Ryy32 = $Ryy32in"; // Ties the result pair to the $Ryy32in input operand.
+}
+def L4_loadalignh_ap : HInst< // `$Ryy32 = memh_fifo($Re32=#$II)`: AbsoluteSet-addressed load; $Re32 is a second output.
+(outs DoubleRegs:$Ryy32, IntRegs:$Re32),
+(ins DoubleRegs:$Ryy32in, u32_0Imm:$II),
+"$Ryy32 = memh_fifo($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_11047413 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011010010;
+let hasNewValue = 1;
+let opNewValue = 1; // Operand 1 is $Re32.
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 3; // Operand 3 is $II (two outs, then $Ryy32in).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let Constraints = "$Ryy32 = $Ryy32in"; // Ties the result pair to the $Ryy32in input operand.
+}
+def L4_loadalignh_ur : HInst< // `$Ryy32 = memh_fifo($Rt32<<#$Ii+#$II)`: load addressed by shifted register plus immediate.
+(outs DoubleRegs:$Ryy32),
+(ins DoubleRegs:$Ryy32in, IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Ryy32 = memh_fifo($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_7303598 {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011100010;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let InputType = "imm";
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 4; // Operand 4 is $II (one out, then $Ryy32in, $Rt32, $Ii).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let Constraints = "$Ryy32 = $Ryy32in"; // Ties the result pair to the $Ryy32in input operand.
+}
+def L4_loadbsw2_ap : HInst< // `$Rd32 = membh($Re32=#$II)`: AbsoluteSet-addressed load; $Re32 is a second output.
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = membh($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12616482 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011010001;
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let hasNewValue2 = 1;
+let opNewValue2 = 1; // Operand 1 is $Re32.
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 2; // Operand 2 is $II (after the two outs).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbsw2_ur : HInst< // `$Rd32 = membh($Rt32<<#$Ii+#$II)`: load addressed by shifted register plus immediate.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = membh($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_486163 {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011100001;
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let InputType = "imm";
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 3; // Operand 3 is $II (one out, then $Rt32, $Ii).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbsw4_ap : HInst< // `$Rdd32 = membh($Re32=#$II)`: AbsoluteSet-addressed load into a register pair; $Re32 is a second output.
+(outs DoubleRegs:$Rdd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rdd32 = membh($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_877823 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011010111;
+let hasNewValue = 1;
+let opNewValue = 1; // Operand 1 is $Re32.
+let addrMode = AbsoluteSet;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 2; // Operand 2 is $II (after the two outs).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbsw4_ur : HInst< // `$Rdd32 = membh($Rt32<<#$Ii+#$II)`: load addressed by shifted register plus immediate.
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rdd32 = membh($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_5582416 {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011100111;
+let addrMode = BaseLongOffset;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let InputType = "imm";
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 3; // Operand 3 is $II (one out, then $Rt32, $Ii).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbzw2_ap : HInst< // `$Rd32 = memubh($Re32=#$II)`: AbsoluteSet-addressed load; $Re32 is a second output.
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = memubh($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12616482 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011010011;
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let hasNewValue2 = 1;
+let opNewValue2 = 1; // Operand 1 is $Re32.
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 2; // Operand 2 is $II (after the two outs).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbzw2_ur : HInst< // `$Rd32 = memubh($Rt32<<#$Ii+#$II)`: load addressed by shifted register plus immediate.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = memubh($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_486163 {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011100011;
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let InputType = "imm";
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 3; // Operand 3 is $II (one out, then $Rt32, $Ii).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbzw4_ap : HInst< // `$Rdd32 = memubh($Re32=#$II)`: AbsoluteSet-addressed load into a register pair; $Re32 is a second output.
+(outs DoubleRegs:$Rdd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rdd32 = memubh($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_877823 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011010101;
+let hasNewValue = 1;
+let opNewValue = 1; // Operand 1 is $Re32.
+let addrMode = AbsoluteSet;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 2; // Operand 2 is $II (after the two outs).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadbzw4_ur : HInst< // `$Rdd32 = memubh($Rt32<<#$Ii+#$II)`: load addressed by shifted register plus immediate.
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rdd32 = memubh($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_5582416 {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011100101;
+let addrMode = BaseLongOffset;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let InputType = "imm";
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 3; // Operand 3 is $II (one out, then $Rt32, $Ii).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadd_locked : HInst< // `$Rdd32 = memd_locked($Rs32)`: doubleword load into a register pair from the address in $Rs32.
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = memd_locked($Rs32)",
+LD_tc_ld_SLOT0, TypeLD>, Enc_4030179 { // Uses the SLOT0 load itinerary, unlike the SLOT01 loads around it.
+let Inst{13-5} = 0b010000000;
+let Inst{31-21} = 0b10010010000;
+let accessSize = DoubleWordAccess;
+let isSoloAX = 1; // NOTE(review): presumably a packet co-issue restriction -- confirm against the HInst class.
+let mayLoad = 1;
+}
+def L4_loadrb_ap : HInst< // `$Rd32 = memb($Re32=#$II)`: AbsoluteSet-addressed byte load; $Re32 is a second output.
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = memb($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12616482 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011011000;
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let hasNewValue2 = 1;
+let opNewValue2 = 1; // Operand 1 is $Re32.
+let addrMode = AbsoluteSet;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 2; // Operand 2 is $II (after the two outs).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrb_rr : HInst< // `$Rd32 = memb($Rs32+$Rt32<<#$Ii)`: byte load addressed by base plus shifted index register.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rd32 = memb($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_10721363, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111010000;
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrb_rr"; // NOTE(review): CextOpcode/InputType/BaseOpcode look like keys for the AddrModeRel/ImmRegShl relations -- confirm.
+let isPredicable = 1; // The L4_ploadrb{t,f}[new]_rr records in this file carry the same BaseOpcode.
+}
+def L4_loadrb_ur : HInst< // `$Rd32 = memb($Rt32<<#$Ii+#$II)`: byte load addressed by shifted register plus immediate.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = memb($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_486163, AddrModeRel, ImmRegShl {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011101000;
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = BaseLongOffset;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let InputType = "imm";
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 3; // Operand 3 is $II (one out, then $Rt32, $Ii).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrd_ap : HInst< // `$Rdd32 = memd($Re32=#$II)`: AbsoluteSet-addressed doubleword load; $Re32 is a second output.
+(outs DoubleRegs:$Rdd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rdd32 = memd($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_877823 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011011110;
+let hasNewValue = 1;
+let opNewValue = 1; // Operand 1 is $Re32.
+let addrMode = AbsoluteSet;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 2; // Operand 2 is $II (after the two outs).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrd_rr : HInst< // `$Rdd32 = memd($Rs32+$Rt32<<#$Ii)`: doubleword load addressed by base plus shifted index register.
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_7581852, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111010110;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrd_rr"; // NOTE(review): CextOpcode/InputType/BaseOpcode look like keys for the AddrModeRel/ImmRegShl relations -- confirm.
+let isPredicable = 1; // The L4_ploadrd{t,f}[new]_rr records in this file carry the same BaseOpcode.
+}
+def L4_loadrd_ur : HInst< // `$Rdd32 = memd($Rt32<<#$Ii+#$II)`: doubleword load addressed by shifted register plus immediate.
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rdd32 = memd($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_5582416, AddrModeRel, ImmRegShl {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011101110;
+let addrMode = BaseLongOffset;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let InputType = "imm";
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 3; // Operand 3 is $II (one out, then $Rt32, $Ii).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrh_ap : HInst< // `$Rd32 = memh($Re32=#$II)`: AbsoluteSet-addressed halfword load; $Re32 is a second output.
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = memh($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12616482 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011011010;
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let hasNewValue2 = 1;
+let opNewValue2 = 1; // Operand 1 is $Re32.
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 2; // Operand 2 is $II (after the two outs).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrh_rr : HInst< // `$Rd32 = memh($Rs32+$Rt32<<#$Ii)`: halfword load addressed by base plus shifted index register.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rd32 = memh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_10721363, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111010010;
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrh_rr"; // NOTE(review): CextOpcode/InputType/BaseOpcode look like keys for the AddrModeRel/ImmRegShl relations -- confirm.
+let isPredicable = 1;
+}
+def L4_loadrh_ur : HInst< // `$Rd32 = memh($Rt32<<#$Ii+#$II)`: halfword load addressed by shifted register plus immediate.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = memh($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_486163, AddrModeRel, ImmRegShl {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011101010;
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let InputType = "imm";
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 3; // Operand 3 is $II (one out, then $Rt32, $Ii).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadri_ap : HInst< // `$Rd32 = memw($Re32=#$II)`: AbsoluteSet-addressed word load; $Re32 is a second output.
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = memw($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12616482 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011011100;
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let hasNewValue2 = 1;
+let opNewValue2 = 1; // Operand 1 is $Re32.
+let addrMode = AbsoluteSet;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 2; // Operand 2 is $II (after the two outs).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadri_rr : HInst< // `$Rd32 = memw($Rs32+$Rt32<<#$Ii)`: word load addressed by base plus shifted index register.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rd32 = memw($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_10721363, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111010100;
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let InputType = "reg";
+let BaseOpcode = "L4_loadri_rr"; // NOTE(review): CextOpcode/InputType/BaseOpcode look like keys for the AddrModeRel/ImmRegShl relations -- confirm.
+let isPredicable = 1;
+}
+def L4_loadri_ur : HInst< // `$Rd32 = memw($Rt32<<#$Ii+#$II)`: word load addressed by shifted register plus immediate.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = memw($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_486163, AddrModeRel, ImmRegShl {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011101100;
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = BaseLongOffset;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let InputType = "imm";
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 3; // Operand 3 is $II (one out, then $Rt32, $Ii).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrub_ap : HInst< // `$Rd32 = memub($Re32=#$II)`: AbsoluteSet-addressed byte load; $Re32 is a second output.
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = memub($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12616482 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011011001;
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let hasNewValue2 = 1;
+let opNewValue2 = 1; // Operand 1 is $Re32.
+let addrMode = AbsoluteSet;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 2; // Operand 2 is $II (after the two outs).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadrub_rr : HInst< // `$Rd32 = memub($Rs32+$Rt32<<#$Ii)`: byte load addressed by base plus shifted index register.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rd32 = memub($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_10721363, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111010001;
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrub_rr"; // NOTE(review): CextOpcode/InputType/BaseOpcode look like keys for the AddrModeRel/ImmRegShl relations -- confirm.
+let isPredicable = 1;
+}
+def L4_loadrub_ur : HInst< // `$Rd32 = memub($Rt32<<#$Ii+#$II)`: byte load addressed by shifted register plus immediate.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = memub($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_486163, AddrModeRel, ImmRegShl {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011101001;
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = BaseLongOffset;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let InputType = "imm";
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 3; // Operand 3 is $II (one out, then $Rt32, $Ii).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadruh_ap : HInst< // `$Rd32 = memuh($Re32=#$II)`: AbsoluteSet-addressed halfword load; $Re32 is a second output.
+(outs IntRegs:$Rd32, IntRegs:$Re32),
+(ins u32_0Imm:$II),
+"$Rd32 = memuh($Re32=#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_12616482 {
+let Inst{7-7} = 0b0;
+let Inst{13-12} = 0b01;
+let Inst{31-21} = 0b10011011011;
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let hasNewValue2 = 1;
+let opNewValue2 = 1; // Operand 1 is $Re32.
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 2; // Operand 2 is $II (after the two outs).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_loadruh_rr : HInst< // `$Rd32 = memuh($Rs32+$Rt32<<#$Ii)`: halfword load addressed by base plus shifted index register.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_10721363, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111010011;
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadruh_rr"; // NOTE(review): CextOpcode/InputType/BaseOpcode look like keys for the AddrModeRel/ImmRegShl relations -- confirm.
+let isPredicable = 1;
+}
+def L4_loadruh_ur : HInst< // `$Rd32 = memuh($Rt32<<#$Ii+#$II)`: halfword load addressed by shifted register plus immediate.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
+"$Rd32 = memuh($Rt32<<#$Ii+#$II)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_486163, AddrModeRel, ImmRegShl {
+let Inst{12-12} = 0b1;
+let Inst{31-21} = 0b10011101011;
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let InputType = "imm";
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 3; // Operand 3 is $II (one out, then $Rt32, $Ii).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_or_memopb_io : HInst< // Memory-op `memb($Rs32+#$Ii) |= $Rt32` on a base+immediate-offset byte.
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rs32+#$Ii) |= $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_11849200 {
+let Inst{6-5} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1; // Flagged as both a store and a load (read-modify-write of the addressed byte).
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1; // Operand 1 is the offset $Ii (the outs list is empty).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_or_memopb_zomap : HInst< // Offset-less `memb($Rs32) |= $Rt32` spelling of the OR-register memory-op.
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memb($Rs32) |= $Rt32",
+PSEUDO, TypeMAPPING> { // No Enc_* parent and no Inst bits are given for this record.
+let isPseudo = 1;
+let isCodeGenOnly = 1; // NOTE(review): presumably resolved to L4_or_memopb_io with #0 -- confirm.
+}
+def L4_or_memoph_io : HInst< // Memory-op `memh($Rs32+#$Ii) |= $Rt32` on a base+immediate-offset halfword.
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+#$Ii) |= $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_8849208 {
+let Inst{6-5} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1; // Flagged as both a store and a load (read-modify-write of the addressed halfword).
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1; // Operand 1 is the offset $Ii (the outs list is empty).
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_or_memoph_zomap : HInst< // Offset-less `memh($Rs32) |= $Rt32` spelling of the OR-register memory-op.
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memh($Rs32) |= $Rt32",
+PSEUDO, TypeMAPPING> { // No Enc_* parent and no Inst bits are given for this record.
+let isPseudo = 1;
+let isCodeGenOnly = 1; // NOTE(review): presumably resolved to L4_or_memoph_io with #0 -- confirm.
+}
+def L4_or_memopw_io : HInst< // Memory-op `memw($Rs32+#$Ii) |= $Rt32` on a base+immediate-offset word.
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"memw($Rs32+#$Ii) |= $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_9849208 {
+let Inst{6-5} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1; // Flagged as both a store and a load (read-modify-write of the addressed word).
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1; // Operand 1 is the offset $Ii (the outs list is empty).
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_or_memopw_zomap : HInst< // Offset-less `memw($Rs32) |= $Rt32` spelling of the OR-register memory-op.
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw($Rs32) |= $Rt32",
+PSEUDO, TypeMAPPING> { // No Enc_* parent and no Inst bits are given for this record.
+let isPseudo = 1;
+let isCodeGenOnly = 1; // NOTE(review): presumably resolved to L4_or_memopw_io with #0 -- confirm.
+}
+def L4_ploadrbf_abs : HInst< // `if (!$Pt4) $Rd32 = memb(#$Ii)`: absolute-addressed byte load, false-sense predicate.
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memb(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011111000;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // Matches the `!` on $Pt4 in the asm string.
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L4_loadrb_abs"; // Same BaseOpcode as the other L4_ploadrb*_abs records.
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 2; // Operand 2 is $Ii (one out, then $Pt4).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrbf_rr : HInst< // `if (!$Pv4) $Rd32 = memb($Rs32+$Rt32<<#$Ii)`: base+shifted-index byte load, false-sense predicate.
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110001000;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // Matches the `!` on $Pv4 in the asm string.
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrb_rr"; // Same BaseOpcode as the unpredicated L4_loadrb_rr record.
+}
+def L4_ploadrbfnew_abs : HInst< // `if (!$Pt4.new) $Rd32 = memb(#$Ii)`: absolute-addressed byte load, false-sense `.new` predicate.
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memb(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011111000;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // Matches the `!` on $Pt4 in the asm string.
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let isPredicatedNew = 1; // Matches the `.new` suffix on $Pt4 in the asm string.
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L4_loadrb_abs"; // Same BaseOpcode as the other L4_ploadrb*_abs records.
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 2; // Operand 2 is $Ii (one out, then $Pt4).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrbfnew_rr : HInst< // `if (!$Pv4.new) $Rd32 = memb($Rs32+$Rt32<<#$Ii)`: base+shifted-index byte load, false-sense `.new` predicate.
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4.new) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // Matches the `!` on $Pv4 in the asm string.
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1; // Matches the `.new` suffix on $Pv4 in the asm string.
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrb_rr"; // Same BaseOpcode as the unpredicated L4_loadrb_rr record.
+}
+def L4_ploadrbt_abs : HInst< // `if ($Pt4) $Rd32 = memb(#$Ii)`: absolute-addressed byte load, true-sense predicate.
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memb(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011111000;
+let isPredicated = 1; // isPredicatedFalse is not set here: the asm string has no `!`.
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L4_loadrb_abs"; // Same BaseOpcode as the other L4_ploadrb*_abs records.
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 2; // Operand 2 is $Ii (one out, then $Pt4).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrbt_rr : HInst< // `if ($Pv4) $Rd32 = memb($Rs32+$Rt32<<#$Ii)`: base+shifted-index byte load, true-sense predicate.
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110000000;
+let isPredicated = 1; // isPredicatedFalse is not set here: the asm string has no `!`.
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrb_rr"; // Same BaseOpcode as the unpredicated L4_loadrb_rr record.
+}
+def L4_ploadrbtnew_abs : HInst< // `if ($Pt4.new) $Rd32 = memb(#$Ii)`: absolute-addressed byte load, true-sense `.new` predicate.
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memb(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011111000;
+let isPredicated = 1; // isPredicatedFalse is not set here: the asm string has no `!`.
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let isPredicatedNew = 1; // Matches the `.new` suffix on $Pt4 in the asm string.
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L4_loadrb_abs"; // Same BaseOpcode as the other L4_ploadrb*_abs records.
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 2; // Operand 2 is $Ii (one out, then $Pt4).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrbtnew_rr : HInst< // `if ($Pv4.new) $Rd32 = memb($Rs32+$Rt32<<#$Ii)`: base+shifted-index byte load, true-sense `.new` predicate.
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4.new) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110010000;
+let isPredicated = 1; // isPredicatedFalse is not set here: the asm string has no `!`.
+let hasNewValue = 1;
+let opNewValue = 0; // Operand 0 is $Rd32.
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1; // Matches the `.new` suffix on $Pv4 in the asm string.
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrb_rr"; // Same BaseOpcode as the unpredicated L4_loadrb_rr record.
+}
+def L4_ploadrdf_abs : HInst< // `if (!$Pt4) $Rdd32 = memd(#$Ii)`: absolute-addressed doubleword load, false-sense predicate.
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4) $Rdd32 = memd(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_15182416, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011111110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // Matches the `!` on $Pt4 in the asm string.
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L4_loadrd_abs"; // Same BaseOpcode as the other L4_ploadrd*_abs records.
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 2; // Operand 2 is $Ii (one out, then $Pt4).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrdf_rr : HInst< // `if (!$Pv4) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)`: base+shifted-index doubleword load, false-sense predicate.
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_7254313, AddrModeRel {
+let Inst{31-21} = 0b00110001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // Matches the `!` on $Pv4 in the asm string.
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrd_rr"; // Same BaseOpcode as the unpredicated L4_loadrd_rr record.
+}
+def L4_ploadrdfnew_abs : HInst< // `if (!$Pt4.new) $Rdd32 = memd(#$Ii)`: absolute-addressed doubleword load, false-sense `.new` predicate.
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rdd32 = memd(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_15182416, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011111110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // Matches the `!` on $Pt4 in the asm string.
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1; // Matches the `.new` suffix on $Pt4 in the asm string.
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L4_loadrd_abs"; // Same BaseOpcode as the other L4_ploadrd*_abs records.
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 2; // Operand 2 is $Ii (one out, then $Pt4).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrdfnew_rr : HInst< // `if (!$Pv4.new) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)`: base+shifted-index doubleword load, false-sense `.new` predicate.
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4.new) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_7254313, AddrModeRel {
+let Inst{31-21} = 0b00110011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // Matches the `!` on $Pv4 in the asm string.
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1; // Matches the `.new` suffix on $Pv4 in the asm string.
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrd_rr"; // Same BaseOpcode as the unpredicated L4_loadrd_rr record.
+}
+def L4_ploadrdt_abs : HInst< // `if ($Pt4) $Rdd32 = memd(#$Ii)`: absolute-addressed doubleword load, true-sense predicate.
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4) $Rdd32 = memd(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_15182416, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011111110;
+let isPredicated = 1; // isPredicatedFalse is not set here: the asm string has no `!`.
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L4_loadrd_abs"; // Same BaseOpcode as the other L4_ploadrd*_abs records.
+let DecoderNamespace = "MustExtend"; // NOTE(review): with isExtended = 1, presumably the immediate always carries an extender -- confirm.
+let isExtendable = 1;
+let opExtendable = 2; // Operand 2 is $Ii (one out, then $Pt4).
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrdt_rr : HInst< // `if ($Pv4) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)`: base+shifted-index doubleword load, true-sense predicate.
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_7254313, AddrModeRel {
+let Inst{31-21} = 0b00110000110;
+let isPredicated = 1; // isPredicatedFalse is not set here: the asm string has no `!`.
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrd_rr"; // Same BaseOpcode as the unpredicated L4_loadrd_rr record.
+}
+def L4_ploadrdtnew_abs : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rdd32 = memd(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_15182416, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011111110;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L4_loadrd_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrdtnew_rr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4.new) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_7254313, AddrModeRel {
+let Inst{31-21} = 0b00110010110;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrd_rr";
+}
+def L4_ploadrhf_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memh(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011111010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L4_loadrh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrhf_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110001010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrh_rr";
+}
+def L4_ploadrhfnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memh(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011111010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L4_loadrh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrhfnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4.new) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110011010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrh_rr";
+}
+def L4_ploadrht_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memh(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011111010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L4_loadrh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrht_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110000010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrh_rr";
+}
+def L4_ploadrhtnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memh(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011111010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L4_loadrh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrhtnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4.new) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110010010;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrh_rr";
+}
+def L4_ploadrif_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memw(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011111100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L4_loadri_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrif_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let InputType = "reg";
+let BaseOpcode = "L4_loadri_rr";
+}
+def L4_ploadrifnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memw(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011111100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L4_loadri_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrifnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4.new) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let InputType = "reg";
+let BaseOpcode = "L4_loadri_rr";
+}
+def L4_ploadrit_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memw(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011111100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L4_loadri_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrit_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let InputType = "reg";
+let BaseOpcode = "L4_loadri_rr";
+}
+def L4_ploadritnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memw(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011111100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L4_loadri_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadritnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4.new) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110010100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let InputType = "reg";
+let BaseOpcode = "L4_loadri_rr";
+}
+def L4_ploadrubf_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memub(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011111001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L4_loadrub_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrubf_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110001001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrub_rr";
+}
+def L4_ploadrubfnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memub(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011111001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L4_loadrub_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrubfnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4.new) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110011001;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrub_rr";
+}
+def L4_ploadrubt_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memub(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011111001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L4_loadrub_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrubt_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110000001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrub_rr";
+}
+def L4_ploadrubtnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memub(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011111001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L4_loadrub_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadrubtnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4.new) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110010001;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let InputType = "reg";
+let BaseOpcode = "L4_loadrub_rr";
+}
+def L4_ploadruhf_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4) $Rd32 = memuh(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10011111011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L4_loadruh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadruhf_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110001011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadruh_rr";
+}
+def L4_ploadruhfnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if (!$Pt4.new) $Rd32 = memuh(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b111;
+let Inst{31-21} = 0b10011111011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L4_loadruh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadruhfnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if (!$Pv4.new) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadruh_rr";
+}
+def L4_ploadruht_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4) $Rd32 = memuh(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10011111011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L4_loadruh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadruht_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110000011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadruh_rr";
+}
+def L4_ploadruhtnew_abs : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pt4, u32_0Imm:$Ii),
+"if ($Pt4.new) $Rd32 = memuh(#$Ii)",
+LD_tc_ld_SLOT01, TypeLD>, Enc_13344657, AddrModeRel {
+let Inst{7-5} = 0b100;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10011111011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L4_loadruh_abs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_ploadruhtnew_rr : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"if ($Pv4.new) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
+V4LDST_tc_ld_SLOT01, TypeLD>, Enc_1793896, AddrModeRel {
+let Inst{31-21} = 0b00110010011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let InputType = "reg";
+let BaseOpcode = "L4_loadruh_rr";
+}
+def L4_return : HInst<
+(outs),
+(ins),
+"dealloc_return",
+LD_tc_3or4stall_SLOT0, TypeLD>, Enc_0, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let isReturn = 1;
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isBarrier = 1;
+let isPredicable = 1;
+let isTaken = 1;
+}
+def L4_return_f : HInst<
+(outs),
+(ins PredRegs:$Pv4),
+"if (!$Pv4) dealloc_return",
+LD_tc_3or4stall_SLOT0, TypeLD>, Enc_12711252, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1100;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let isReturn = 1;
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isTaken = Inst{12};
+}
+def L4_return_fnew_pnt : HInst<
+(outs),
+(ins PredRegs:$Pv4),
+"if (!$Pv4.new) dealloc_return:nt",
+LD_tc_3or4stall_SLOT0, TypeLD>, Enc_12711252, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1010;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let isReturn = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isTaken = Inst{12};
+}
+def L4_return_fnew_pt : HInst<
+(outs),
+(ins PredRegs:$Pv4),
+"if (!$Pv4.new) dealloc_return:t",
+LD_tc_3or4stall_SLOT0, TypeLD>, Enc_12711252, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b1110;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let isReturn = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isTaken = Inst{12};
+}
+def L4_return_t : HInst<
+(outs),
+(ins PredRegs:$Pv4),
+"if ($Pv4) dealloc_return",
+LD_tc_3or4stall_SLOT0, TypeLD>, Enc_12711252, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b0100;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let isReturn = 1;
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isTaken = Inst{12};
+}
+def L4_return_tnew_pnt : HInst<
+(outs),
+(ins PredRegs:$Pv4),
+"if ($Pv4.new) dealloc_return:nt",
+LD_tc_3or4stall_SLOT0, TypeLD>, Enc_12711252, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b0010;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let isReturn = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isTaken = Inst{12};
+}
+def L4_return_tnew_pt : HInst<
+(outs),
+(ins PredRegs:$Pv4),
+"if ($Pv4.new) dealloc_return:t",
+LD_tc_3or4stall_SLOT0, TypeLD>, Enc_12711252, PredNewRel {
+let Inst{4-0} = 0b11110;
+let Inst{7-5} = 0b000;
+let Inst{13-10} = 0b0110;
+let Inst{31-21} = 0b10010110000;
+let Inst{20-16} = 0b11110;
+let isPredicated = 1;
+let isTerminator = 1;
+let isIndirectBranch = 1;
+let accessSize = DoubleWordAccess;
+let cofMax1 = 1;
+let isReturn = 1;
+let isPredicatedNew = 1;
+let mayLoad = 1;
+let Uses = [R30];
+let Defs = [PC, R29, R30, R31];
+let BaseOpcode = "L4_return";
+let isTaken = Inst{12};
+}
+def L4_sub_memopb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rs32+#$Ii) -= $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_11849200 {
+let Inst{6-5} = 0b01;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def L4_sub_memopb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memb($Rs32) -= $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_sub_memoph_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+#$Ii) -= $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_8849208 {
+let Inst{6-5} = 0b01;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def L4_sub_memoph_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memh($Rs32) -= $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def L4_sub_memopw_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"memw($Rs32+#$Ii) -= $Rt32",
+V4LDST_tc_st_SLOT0, TypeV4LDST>, Enc_9849208 {
+let Inst{6-5} = 0b01;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00111110010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let mayLoad = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def L4_sub_memopw_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw($Rs32) -= $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def M2_acci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += add($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889, ImmRegRel {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M2_acci";
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_accii : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rx32 += add($Rs32,#$Ii)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_11522288, ImmRegRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M2_acci";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_cmaci_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += cmpyi($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cmacr_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += cmpyr($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cmacs_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += cmpy($Rs32,$Rt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cmacs_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += cmpy($Rs32,$Rt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cmacsc_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += cmpy($Rs32,$Rt32*):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cmacsc_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += cmpy($Rs32,$Rt32*):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cmpyi_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = cmpyi($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101000;
+let prefersSlot3 = 1;
+}
+def M2_cmpyr_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = cmpyr($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101000;
+let prefersSlot3 = 1;
+}
+def M2_cmpyrs_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = cmpy($Rs32,$Rt32):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpyrs_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = cmpy($Rs32,$Rt32):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpyrsc_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = cmpy($Rs32,$Rt32*):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpyrsc_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = cmpy($Rs32,$Rt32*):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpys_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = cmpy($Rs32,$Rt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpys_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = cmpy($Rs32,$Rt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpysc_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = cmpy($Rs32,$Rt32*):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cmpysc_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = cmpy($Rs32,$Rt32*):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_cnacs_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= cmpy($Rs32,$Rt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cnacs_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= cmpy($Rs32,$Rt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cnacsc_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= cmpy($Rs32,$Rt32*):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_cnacsc_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= cmpy($Rs32,$Rt32*):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_dpmpyss_acc_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_dpmpyss_nac_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_dpmpyss_rnd_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32):rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_dpmpyss_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101000;
+let prefersSlot3 = 1;
+}
+def M2_dpmpyuu_acc_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_dpmpyuu_nac_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_dpmpyuu_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101010;
+let prefersSlot3 = 1;
+}
+def M2_hmmpyh_rs1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32.h):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_hmmpyh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32.h):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_hmmpyl_rs1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32.l):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_hmmpyl_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32.l):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_maci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyi($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M2_maci";
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_macsin : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Rx32 -= mpyi($Rs32,#$Ii)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_11522288 {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_macsip : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Rx32 += mpyi($Rs32,#$Ii)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_11522288, ImmRegRel {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M2_maci";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mmachs_rs0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywoh($Rss32,$Rtt32):rnd:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmachs_rs1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywoh($Rss32,$Rtt32):<<1:rnd:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmachs_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywoh($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmachs_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywoh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacls_rs0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweh($Rss32,$Rtt32):rnd:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacls_rs1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweh($Rss32,$Rtt32):<<1:rnd:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacls_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweh($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacls_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacuhs_rs0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywouh($Rss32,$Rtt32):rnd:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010011;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacuhs_rs1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywouh($Rss32,$Rtt32):<<1:rnd:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacuhs_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywouh($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmacuhs_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpywouh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmaculs_rs0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweuh($Rss32,$Rtt32):rnd:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010011;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmaculs_rs1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweuh($Rss32,$Rtt32):<<1:rnd:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmaculs_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweuh($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmaculs_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyweuh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mmpyh_rs0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywoh($Rss32,$Rtt32):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyh_rs1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywoh($Rss32,$Rtt32):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywoh($Rss32,$Rtt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywoh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyl_rs0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweh($Rss32,$Rtt32):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyl_rs1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweh($Rss32,$Rtt32):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyl_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweh($Rss32,$Rtt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyl_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyuh_rs0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywouh($Rss32,$Rtt32):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000011;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyuh_rs1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywouh($Rss32,$Rtt32):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyuh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywouh($Rss32,$Rtt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyuh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpywouh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyul_rs0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweuh($Rss32,$Rtt32):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000011;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyul_rs1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweuh($Rss32,$Rtt32):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyul_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweuh($Rss32,$Rtt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mmpyul_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyweuh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_acc_hh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_hh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_hl_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_hl_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_lh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_lh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_ll_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_ll_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_hh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.h):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_hh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.h):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_hl_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.l):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_hl_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.h,$Rt32.l):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_lh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.h):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_lh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.h):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_ll_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.l):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_acc_sat_ll_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32.l,$Rt32.l):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_hh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_hh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_hl_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_hl_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_lh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_lh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_ll_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_ll_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_nac_hh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_hh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_hl_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_hl_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_lh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_lh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_ll_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_ll_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_hh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.h):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_hh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.h):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_hl_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.l):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_hl_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.h,$Rt32.l):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_lh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.h):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_lh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.h):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_ll_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.l):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_nac_sat_ll_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32.l,$Rt32.l):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpy_rnd_hh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_hh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):<<1:rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_hl_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_hl_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):<<1:rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_lh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_lh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):<<1:rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_ll_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_rnd_ll_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):<<1:rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_sat_hh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_hh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_hl_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_hl_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_lh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_lh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_ll_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_ll_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_hh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_hh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.h):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_hl_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_hl_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.h,$Rt32.l):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_lh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_lh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.h):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_ll_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_sat_rnd_ll_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32.l,$Rt32.l):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpy_up : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_up_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpy_up_s1_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpy($Rs32,$Rt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_mpyd_acc_hh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.h,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_hh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_hl_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.h,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_hl_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_lh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.l,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_lh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_ll_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.l,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_acc_ll_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpy($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_hh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.h)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100000;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_hh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100100;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_hl_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.l)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100000;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_hl_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100100;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_lh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.h)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100000;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_lh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100100;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_ll_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.l)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100000;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_ll_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100100;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_nac_hh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.h,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_hh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_hl_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.h,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_hl_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_lh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.l,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_lh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_ll_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.l,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_nac_ll_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpy($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyd_rnd_hh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.h):rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100001;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_hh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.h):<<1:rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100101;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_hl_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.l):rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100001;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_hl_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.h,$Rt32.l):<<1:rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100101;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_lh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.h):rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100001;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_lh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.h):<<1:rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100101;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_ll_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.l):rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100001;
+let prefersSlot3 = 1;
+}
+def M2_mpyd_rnd_ll_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpy($Rs32.l,$Rt32.l):<<1:rnd",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100101;
+let prefersSlot3 = 1;
+}
+def M2_mpyi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyi($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M2_mpyi";
+let InputType = "reg";
+}
+def M2_mpysin : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u8_0Imm:$Ii),
+"$Rd32 = -mpyi($Rs32,#$Ii)",
+M_tc_3x_SLOT23, TypeM>, Enc_16355964 {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100000100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpysip : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Rd32 = +mpyi($Rs32,#$Ii)",
+M_tc_3x_SLOT23, TypeM>, Enc_16355964 {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def M2_mpysmi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, m32_0Imm:$Ii),
+"$Rd32 = mpyi($Rs32,#$Ii)",
+M_tc_3x_SLOT23, TypeM>, ImmRegRel {
+let hasNewValue = 1;
+let opNewValue = 0;
+let CextOpcode = "M2_mpyi";
+let InputType = "imm";
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 9;
+let opExtentAlign = 0;
+}
+def M2_mpysu_up : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpysu($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_acc_hh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.h,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_hh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_hl_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.h,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_hl_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_lh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.l,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_lh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_ll_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.l,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_acc_ll_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpyu($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_hh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.h,$Rt32.h)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_hh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_hl_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.h,$Rt32.l)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_hl_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_lh_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.l,$Rt32.h)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_lh_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_ll_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.l,$Rt32.l)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_ll_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyu_nac_hh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.h,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_hh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_hl_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.h,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_hl_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_lh_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.l,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_lh_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_ll_s0 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.l,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_nac_ll_s1 : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpyu($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_mpyu_up : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyu($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_acc_hh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.h,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_hh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_hl_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.h,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_hl_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_lh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.l,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_lh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_ll_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.l,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_acc_ll_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += mpyu($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_hh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.h,$Rt32.h)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100010;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_hh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100110;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_hl_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.h,$Rt32.l)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100010;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_hl_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100110;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_lh_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.l,$Rt32.h)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100010;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_lh_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100110;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_ll_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.l,$Rt32.l)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100010;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_ll_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = mpyu($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100100110;
+let prefersSlot3 = 1;
+}
+def M2_mpyud_nac_hh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.h,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_hh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.h,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110111;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_hl_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.h,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_hl_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.h,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110111;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_lh_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.l,$Rt32.h)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_lh_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.l,$Rt32.h):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110111;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_ll_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.l,$Rt32.l)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyud_nac_ll_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 -= mpyu($Rs32.l,$Rt32.l):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100110111;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_mpyui : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = mpyui($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def M2_nacci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= add($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_naccii : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rx32 -= add($Rs32,#$Ii)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_11522288 {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100010100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_subacc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rx32 += sub($Rt32,$Rs32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_7692963 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M2_vabsdiffh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vabsdiffh($Rtt32,$Rss32)",
+M_tc_2_SLOT23, TypeM>, Enc_11687333 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000011;
+let prefersSlot3 = 1;
+}
+def M2_vabsdiffw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vabsdiffw($Rtt32,$Rss32)",
+M_tc_2_SLOT23, TypeM>, Enc_11687333 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000001;
+let prefersSlot3 = 1;
+}
+def M2_vcmac_s0_sat_i : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vcmpyi($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vcmac_s0_sat_r : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vcmpyr($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vcmpy_s0_sat_i : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vcmpyi($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vcmpy_s0_sat_r : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vcmpyr($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vcmpy_s1_sat_i : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vcmpyi($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vcmpy_s1_sat_r : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vcmpyr($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vdmacs_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vdmpy($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vdmacs_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vdmpy($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vdmpyrs_s0 : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vdmpy($Rss32,$Rtt32):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_9277990 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vdmpyrs_s1 : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vdmpy($Rss32,$Rtt32):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_9277990 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vdmpys_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vdmpy($Rss32,$Rtt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vdmpys_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vdmpy($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmac2 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpyh($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2es : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyeh($Rss32,$Rtt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2es_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyeh($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2es_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vmpyeh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2s_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpyh($Rs32,$Rt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2s_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpyh($Rs32,$Rt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2su_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpyhsu($Rs32,$Rt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111011;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmac2su_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpyhsu($Rs32,$Rt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vmpy2es_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyeh($Rss32,$Rtt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2es_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vmpyeh($Rss32,$Rtt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2s_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vmpyh($Rs32,$Rt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2s_s0pack : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vmpyh($Rs32,$Rt32):rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2s_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vmpyh($Rs32,$Rt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2s_s1pack : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = vmpyh($Rs32,$Rt32):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_14071773 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101101101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2su_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vmpyhsu($Rs32,$Rt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101000;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vmpy2su_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vmpyhsu($Rs32,$Rt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101100;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vraddh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vraddh($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_9277990 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_vradduh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vradduh($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_9277990 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M2_vrcmaci_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrcmpyi($Rss32,$Rtt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmaci_s0c : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrcmpyi($Rss32,$Rtt32*)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmacr_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrcmpyr($Rss32,$Rtt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmacr_s0c : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrcmpyr($Rss32,$Rtt32*)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmpyi_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrcmpyi($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+}
+def M2_vrcmpyi_s0c : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrcmpyi($Rss32,$Rtt32*)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+}
+def M2_vrcmpyr_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrcmpyr($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+}
+def M2_vrcmpyr_s0c : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrcmpyr($Rss32,$Rtt32*)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000011;
+let prefersSlot3 = 1;
+}
+def M2_vrcmpys_acc_s1 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 += vrcmpys($Rss32,$Rt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM> {
+let isPseudo = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmpys_acc_s1_h : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrcmpys($Rss32,$Rtt32):<<1:sat:raw:hi",
+M_tc_3x_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmpys_acc_s1_l : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrcmpys($Rss32,$Rtt32):<<1:sat:raw:lo",
+M_tc_3x_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrcmpys_s1 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vrcmpys($Rss32,$Rt32):<<1:sat",
+M_tc_3x_SLOT23, TypeM> {
+let isPseudo = 1;
+}
+def M2_vrcmpys_s1_h : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrcmpys($Rss32,$Rtt32):<<1:sat:raw:hi",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vrcmpys_s1_l : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrcmpys($Rss32,$Rtt32):<<1:sat:raw:lo",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000111;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vrcmpys_s1rp : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rd32 = vrcmpys($Rss32,$Rt32):<<1:rnd:sat",
+M_tc_3x_SLOT23, TypeM> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+}
+def M2_vrcmpys_s1rp_h : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vrcmpys($Rss32,$Rtt32):<<1:rnd:sat:raw:hi",
+M_tc_3x_SLOT23, TypeM>, Enc_9277990 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vrcmpys_s1rp_l : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = vrcmpys($Rss32,$Rtt32):<<1:rnd:sat:raw:lo",
+M_tc_3x_SLOT23, TypeM>, Enc_9277990 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M2_vrmac_s0 : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpyh($Rss32,$Rtt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M2_vrmpy_s0 : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpyh($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000000;
+let prefersSlot3 = 1;
+}
+def M2_xor_xacc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 ^= xor($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_and_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= and($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_and_andn : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= and($Rs32,~$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_and_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= or($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_and_xor : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= xor($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_cmpyi_wh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rd32 = cmpyiwh($Rss32,$Rt32):<<1:rnd:sat",
+S_3op_tc_3x_SLOT23, TypeS_3op>, Enc_14287645 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M4_cmpyi_whc : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rd32 = cmpyiwh($Rss32,$Rt32*):<<1:rnd:sat",
+S_3op_tc_3x_SLOT23, TypeS_3op>, Enc_14287645, Requires<[HasV5T]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M4_cmpyr_wh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rd32 = cmpyrwh($Rss32,$Rt32):<<1:rnd:sat",
+S_3op_tc_3x_SLOT23, TypeS_3op>, Enc_14287645 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M4_cmpyr_whc : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rd32 = cmpyrwh($Rss32,$Rt32*):<<1:rnd:sat",
+S_3op_tc_3x_SLOT23, TypeS_3op>, Enc_14287645, Requires<[HasV5T]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M4_mac_up_s1_sat : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += mpy($Rs32,$Rt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_mpyri_addi : HInst<
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii, IntRegs:$Rs32, u6_0Imm:$II),
+"$Rd32 = add(#$Ii,mpyi($Rs32,#$II))",
+ALU64_tc_3x_SLOT23, TypeALU64>, Enc_971574, ImmRegRel {
+let Inst{31-24} = 0b11011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M4_mpyri_addr";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def M4_mpyri_addr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Ru32, IntRegs:$Rs32, u32_0Imm:$Ii),
+"$Rd32 = add($Ru32,mpyi($Rs32,#$Ii))",
+ALU64_tc_3x_SLOT23, TypeALU64>, Enc_236434, ImmRegRel {
+let Inst{31-23} = 0b110111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M4_mpyri_addr";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def M4_mpyri_addr_u2 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Ru32, u6_2Imm:$Ii, IntRegs:$Rs32),
+"$Rd32 = add($Ru32,mpyi(#$Ii,$Rs32))",
+ALU64_tc_3x_SLOT23, TypeALU64>, Enc_9959498 {
+let Inst{31-23} = 0b110111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def M4_mpyrr_addi : HInst<
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = add(#$Ii,mpyi($Rs32,$Rt32))",
+ALU64_tc_3x_SLOT23, TypeALU64>, Enc_2216485, ImmRegRel {
+let Inst{31-23} = 0b110101110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M4_mpyrr_addr";
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def M4_mpyrr_addr : HInst<
+(outs IntRegs:$Ry32),
+(ins IntRegs:$Ru32, IntRegs:$Ry32in, IntRegs:$Rs32),
+"$Ry32 = add($Ru32,mpyi($Ry32in,$Rs32))",
+M_tc_3x_SLOT23, TypeM>, Enc_13770697, ImmRegRel {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let CextOpcode = "M4_mpyrr_addr";
+let InputType = "reg";
+let Constraints = "$Ry32 = $Ry32in";
+}
+def M4_nac_up_s1_sat : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= mpy($Rs32,$Rt32):<<1:sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_or_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= and($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_or_andn : HInst< // Asm: Rx |= and(Rs,~Rt). Output $Rx32 is tied to $Rx32in.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= and($Rs32,~$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_or_or : HInst< // Asm: Rx |= or(Rs,Rt). Output $Rx32 is tied to $Rx32in.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= or($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_or_xor : HInst< // Asm: Rx |= xor(Rs,Rt). Output $Rx32 is tied to $Rx32in.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= xor($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_pmpyw : HInst< // Asm: Rdd = pmpyw(Rs,Rt). Two IntRegs sources, DoubleRegs result.
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = pmpyw($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101010;
+let prefersSlot3 = 1;
+}
+def M4_pmpyw_acc : HInst< // Asm: Rxx ^= pmpyw(Rs,Rt). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 ^= pmpyw($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_1409050 { // NOTE(review): M4_vpmpyh_acc uses M_tc_3x_acc_SLOT23 for the same operand shape - confirm this itinerary is intended
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M4_vpmpyh : HInst< // Asm: Rdd = vpmpyh(Rs,Rt). Two IntRegs sources, DoubleRegs result.
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vpmpyh($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101110;
+let prefersSlot3 = 1;
+}
+def M4_vpmpyh_acc : HInst< // Asm: Rxx ^= vpmpyh(Rs,Rt). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 ^= vpmpyh($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M4_vrmpyeh_acc_s0 : HInst< // Asm: Rxx += vrmpyweh(Rss,Rtt). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpyweh($Rss32,$Rtt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M4_vrmpyeh_acc_s1 : HInst< // Asm: Rxx += vrmpyweh(Rss,Rtt):<<1. Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpyweh($Rss32,$Rtt32):<<1",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M4_vrmpyeh_s0 : HInst< // Asm: Rdd = vrmpyweh(Rss,Rtt). DoubleRegs sources and result.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpyweh($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let prefersSlot3 = 1;
+}
+def M4_vrmpyeh_s1 : HInst< // Asm: Rdd = vrmpyweh(Rss,Rtt):<<1. DoubleRegs sources and result.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpyweh($Rss32,$Rtt32):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000110;
+let prefersSlot3 = 1;
+}
+def M4_vrmpyoh_acc_s0 : HInst< // Asm: Rxx += vrmpywoh(Rss,Rtt). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpywoh($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_12702821 { // NOTE(review): M4_vrmpyeh_acc_s0 uses M_tc_3x_acc_SLOT23 - confirm this itinerary is intended
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M4_vrmpyoh_acc_s1 : HInst< // Asm: Rxx += vrmpywoh(Rss,Rtt):<<1. Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpywoh($Rss32,$Rtt32):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_12702821 { // NOTE(review): M4_vrmpyeh_acc_s1 uses M_tc_3x_acc_SLOT23 - confirm this itinerary is intended
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010111;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M4_vrmpyoh_s0 : HInst< // Asm: Rdd = vrmpywoh(Rss,Rtt). DoubleRegs sources and result.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpywoh($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000001;
+let prefersSlot3 = 1;
+}
+def M4_vrmpyoh_s1 : HInst< // Asm: Rdd = vrmpywoh(Rss,Rtt):<<1. DoubleRegs sources and result.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpywoh($Rss32,$Rtt32):<<1",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+}
+def M4_xor_and : HInst< // Asm: Rx ^= and(Rs,Rt). Output $Rx32 is tied to $Rx32in.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 ^= and($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_xor_andn : HInst< // Asm: Rx ^= and(Rs,~Rt). Output $Rx32 is tied to $Rx32in.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 ^= and($Rs32,~$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_xor_or : HInst< // Asm: Rx ^= or(Rs,Rt). Output $Rx32 is tied to $Rx32in.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 ^= or($Rs32,$Rt32)",
+M_tc_2_acc_SLOT23, TypeM>, Enc_9223889 {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "reg";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def M4_xor_xacc : HInst< // Asm: Rxx ^= xor(Rss,Rtt). Output tied to $Rxx32in; declared with the S_3op itinerary and type.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 ^= xor($Rss32,$Rtt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_12702821 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001010100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M5_vdmacbsu : HInst< // Asm: Rxx += vdmpybsu(Rss,Rtt):sat. Requires HasV5T; output tied to $Rxx32in; USR_OVF in Defs.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vdmpybsu($Rss32,$Rtt32):sat",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821, Requires<[HasV5T]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010001;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M5_vdmpybsu : HInst< // Asm: Rdd = vdmpybsu(Rss,Rtt):sat. Requires HasV5T; USR_OVF listed in Defs.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vdmpybsu($Rss32,$Rtt32):sat",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157, Requires<[HasV5T]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M5_vmacbsu : HInst< // Asm: Rxx += vmpybsu(Rs,Rt). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpybsu($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M5_vmacbuu : HInst< // Asm: Rxx += vmpybu(Rs,Rt). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rxx32 += vmpybu($Rs32,$Rt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_1409050 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100111100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M5_vmpybsu : HInst< // Asm: Rdd = vmpybsu(Rs,Rt). Two IntRegs sources, DoubleRegs result.
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vmpybsu($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101010;
+let prefersSlot3 = 1;
+}
+def M5_vmpybuu : HInst< // Asm: Rdd = vmpybu(Rs,Rt). Two IntRegs sources, DoubleRegs result.
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = vmpybu($Rs32,$Rt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_1997594 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11100101100;
+let prefersSlot3 = 1;
+}
+def M5_vrmacbsu : HInst< // Asm: Rxx += vrmpybsu(Rss,Rtt). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpybsu($Rss32,$Rtt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M5_vrmacbuu : HInst< // Asm: Rxx += vrmpybu(Rss,Rtt). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vrmpybu($Rss32,$Rtt32)",
+M_tc_3x_acc_SLOT23, TypeM>, Enc_12702821 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M5_vrmpybsu : HInst< // Asm: Rdd = vrmpybsu(Rss,Rtt). DoubleRegs sources and result.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpybsu($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000110;
+let prefersSlot3 = 1;
+}
+def M5_vrmpybuu : HInst< // Asm: Rdd = vrmpybu(Rss,Rtt). DoubleRegs sources and result.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vrmpybu($Rss32,$Rtt32)",
+M_tc_3x_SLOT23, TypeM>, Enc_8333157 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000100;
+let prefersSlot3 = 1;
+}
+def M6_vabsdiffb : HInst< // Asm: Rdd = vabsdiffb(Rtt,Rss). Requires HasV62T; note $Rtt32 precedes $Rss32 in the operand list.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vabsdiffb($Rtt32,$Rss32)",
+M_tc_2_SLOT23, TypeM>, Enc_11687333, Requires<[HasV62T]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000111;
+let prefersSlot3 = 1;
+}
+def M6_vabsdiffub : HInst< // Asm: Rdd = vabsdiffub(Rtt,Rss). Requires HasV62T; note $Rtt32 precedes $Rss32 in the operand list.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = vabsdiffub($Rtt32,$Rss32)",
+M_tc_2_SLOT23, TypeM>, Enc_11687333, Requires<[HasV62T]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+}
+def PS_loadrbabs : HInst< // Asm: Rd = memb(#Ii). Load with addrMode Absolute and ByteAccess size; decoded in the "MustExtend" namespace.
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii),
+"$Rd32 = memb(#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1886960, AddrModeRel {
+let Inst{24-21} = 0b1000;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrb";
+let BaseOpcode = "L4_loadrb_abs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1; // NOTE(review): same field is already set above in this def; redundant, same value
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def PS_loadrdabs : HInst< // Asm: Rdd = memd(#Ii). Load with addrMode Absolute and DoubleWordAccess size; decoded in the "MustExtend" namespace.
+(outs DoubleRegs:$Rdd32),
+(ins u29_3Imm:$Ii),
+"$Rdd32 = memd(#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_4975051, AddrModeRel {
+let Inst{24-21} = 0b1110;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrd";
+let BaseOpcode = "L4_loadrd_abs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1; // NOTE(review): same field is already set above in this def; redundant, same value
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 19;
+let opExtentAlign = 3;
+}
+def PS_loadrhabs : HInst< // Asm: Rd = memh(#Ii). Load with addrMode Absolute and HalfWordAccess size; decoded in the "MustExtend" namespace.
+(outs IntRegs:$Rd32),
+(ins u31_1Imm:$Ii),
+"$Rd32 = memh(#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_12608570, AddrModeRel {
+let Inst{24-21} = 0b1010;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrh";
+let BaseOpcode = "L4_loadrh_abs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1; // NOTE(review): same field is already set above in this def; redundant, same value
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
+def PS_loadriabs : HInst< // Asm: Rd = memw(#Ii). Load with addrMode Absolute and WordAccess size; decoded in the "MustExtend" namespace.
+(outs IntRegs:$Rd32),
+(ins u30_2Imm:$Ii),
+"$Rd32 = memw(#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_8814718, AddrModeRel {
+let Inst{24-21} = 0b1100;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadri";
+let BaseOpcode = "L4_loadri_abs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1; // NOTE(review): same field is already set above in this def; redundant, same value
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 18;
+let opExtentAlign = 2;
+}
+def PS_loadrubabs : HInst< // Asm: Rd = memub(#Ii). Load with addrMode Absolute and ByteAccess size; decoded in the "MustExtend" namespace.
+(outs IntRegs:$Rd32),
+(ins u32_0Imm:$Ii),
+"$Rd32 = memub(#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_1886960, AddrModeRel {
+let Inst{24-21} = 0b1001;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadrub";
+let BaseOpcode = "L4_loadrub_abs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1; // NOTE(review): same field is already set above in this def; redundant, same value
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def PS_loadruhabs : HInst< // Asm: Rd = memuh(#Ii). Load with addrMode Absolute and HalfWordAccess size; decoded in the "MustExtend" namespace.
+(outs IntRegs:$Rd32),
+(ins u31_1Imm:$Ii),
+"$Rd32 = memuh(#$Ii)",
+V2LDST_tc_ld_SLOT01, TypeV2LDST>, Enc_12608570, AddrModeRel {
+let Inst{24-21} = 0b1011;
+let Inst{31-27} = 0b01001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayLoad = 1;
+let CextOpcode = "L2_loadruh";
+let BaseOpcode = "L4_loadruh_abs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1; // NOTE(review): same field is already set above in this def; redundant, same value
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
+def PS_storerbabs : HInst< // Asm: memb(#Ii) = Rt. Store with addrMode Absolute and ByteAccess size; marked isNVStorable.
+(outs),
+(ins u32_0Imm:$Ii, IntRegs:$Rt32),
+"memb(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeV2LDST>, Enc_12395768, AddrModeRel {
+let Inst{24-21} = 0b0000;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let isPredicable = 1;
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1; // NOTE(review): same field is already set above in this def; redundant, same value
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def PS_storerbnewabs : HInst< // Asm: memb(#Ii) = Nt.new. Absolute ByteAccess store flagged isNVStore/isNewValue; shares BaseOpcode with PS_storerbabs.
+(outs),
+(ins u32_0Imm:$Ii, IntRegs:$Nt8),
+"memb(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeV2LDST>, Enc_4050532, AddrModeRel {
+let Inst{12-11} = 0b00;
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1; // NOTE(review): same field is already set above in this def; redundant, same value
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+let opNewValue = 1; // presumably the index of $Nt8 in the operand list - confirm
+}
+def PS_storerdabs : HInst< // Asm: memd(#Ii) = Rtt. Store with addrMode Absolute and DoubleWordAccess size.
+(outs),
+(ins u29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"memd(#$Ii) = $Rtt32",
+ST_tc_st_SLOT01, TypeV2LDST>, Enc_11682941, AddrModeRel {
+let Inst{24-21} = 0b0110;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerdabs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1; // NOTE(review): same field is already set above in this def; redundant, same value
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 19;
+let opExtentAlign = 3;
+}
+def PS_storerfabs : HInst< // Asm: memh(#Ii) = Rt.h. Store with addrMode Absolute and HalfWordAccess size.
+(outs),
+(ins u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh(#$Ii) = $Rt32.h",
+ST_tc_st_SLOT01, TypeV2LDST>, Enc_1186018, AddrModeRel {
+let Inst{24-21} = 0b0011;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerfabs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1; // NOTE(review): same field is already set above in this def; redundant, same value
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
+def PS_storerhabs : HInst< // Asm: memh(#Ii) = Rt. Store with addrMode Absolute and HalfWordAccess size; marked isNVStorable.
+(outs),
+(ins u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeV2LDST>, Enc_1186018, AddrModeRel {
+let Inst{24-21} = 0b0010;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let isPredicable = 1;
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1; // NOTE(review): same field is already set above in this def; redundant, same value
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
+def PS_storerhnewabs : HInst< // Asm: memh(#Ii) = Nt.new. Absolute HalfWordAccess store flagged isNVStore/isNewValue; shares BaseOpcode with PS_storerhabs.
+(outs),
+(ins u31_1Imm:$Ii, IntRegs:$Nt8),
+"memh(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeV2LDST>, Enc_13618890, AddrModeRel {
+let Inst{12-11} = 0b01;
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1; // NOTE(review): same field is already set above in this def; redundant, same value
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+let opNewValue = 1; // presumably the index of $Nt8 in the operand list - confirm
+}
+def PS_storeriabs : HInst< // Asm: memw(#Ii) = Rt. Store with addrMode Absolute and WordAccess size; marked isNVStorable.
+(outs),
+(ins u30_2Imm:$Ii, IntRegs:$Rt32),
+"memw(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeV2LDST>, Enc_15999208, AddrModeRel {
+let Inst{24-21} = 0b0100;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let isPredicable = 1;
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1; // NOTE(review): same field is already set above in this def; redundant, same value
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 18;
+let opExtentAlign = 2;
+}
+def PS_storerinewabs : HInst< // Asm: memw(#Ii) = Nt.new. Absolute WordAccess store flagged isNVStore/isNewValue; shares BaseOpcode with PS_storeriabs.
+(outs),
+(ins u30_2Imm:$Ii, IntRegs:$Nt8),
+"memw(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeV2LDST>, Enc_12297800, AddrModeRel {
+let Inst{12-11} = 0b10;
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b01001;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let isPredicable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtended = 1; // NOTE(review): same field is already set above in this def; redundant, same value
+let opExtendable = 0;
+let isExtentSigned = 0;
+let opExtentBits = 18;
+let opExtentAlign = 2;
+let opNewValue = 1; // presumably the index of $Nt8 in the operand list - confirm
+}
+def S2_addasl_rrri : HInst< // Asm: Rd = addasl(Rt,Rs,#Ii). Two IntRegs sources plus a u3_0Imm immediate.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32, u3_0Imm:$Ii),
+"$Rd32 = addasl($Rt32,$Rs32,#$Ii)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_3494181 {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_allocframe : HInst< // Asm: allocframe(#Ii). No explicit register operands; implicit Uses/Defs are listed below.
+(outs),
+(ins u11_3Imm:$Ii),
+"allocframe(#$Ii)",
+ST_tc_ld_SLOT0, TypeST>, Enc_15830826 {
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b10100000100;
+let Inst{20-16} = 0b11101; // 0b11101 == 29; presumably the fixed R29 base-register field - confirm
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let Uses = [R29, R30, R31]; // implicit register reads
+let Defs = [R29, R30]; // implicit register writes
+}
+def S2_asl_i_p : HInst< // Asm: Rdd = asl(Rss,#Ii). DoubleRegs source/result with a u6_0Imm shift amount.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rdd32 = asl($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4231995 {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b10000000000;
+}
+def S2_asl_i_p_acc : HInst< // Asm: Rxx += asl(Rss,#Ii). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 += asl($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_i_p_and : HInst< // Asm: Rxx &= asl(Rss,#Ii). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 &= asl($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_i_p_nac : HInst< // Asm: Rxx -= asl(Rss,#Ii). Output tied to $Rxx32in; differs from S2_asl_i_p_acc only in Inst{7-5}.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 -= asl($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_i_p_or : HInst< // Asm: Rxx |= asl(Rss,#Ii). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 |= asl($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_i_p_xacc : HInst< // Asm: Rxx ^= asl(Rss,#Ii). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 ^= asl($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b10000010100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_i_r : HInst< // Asm: Rd = asl(Rs,#Ii). IntRegs source/result with a u5_0Imm shift amount.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = asl($Rs32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_asl_i_r_acc : HInst< // Asm: Rx += asl(Rs,#Ii). Output $Rx32 is tied to $Rx32in.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 += asl($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_i_r_and : HInst< // Asm: Rx &= asl(Rs,#Ii). Output $Rx32 is tied to $Rx32in.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 &= asl($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_i_r_nac : HInst< // Asm: Rx -= asl(Rs,#Ii). Output tied to $Rx32in; differs from S2_asl_i_r_acc only in Inst{7-5}.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 -= asl($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_i_r_or : HInst< // Asm: Rx |= asl(Rs,#Ii). Output $Rx32 is tied to $Rx32in.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 |= asl($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_i_r_sat : HInst< // Asm: Rd = asl(Rs,#Ii):sat. USR_OVF listed in Defs.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = asl($Rs32,#$Ii):sat",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_asl_i_r_xacc : HInst< // Asm: Rx ^= asl(Rs,#Ii). Output $Rx32 is tied to $Rx32in.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 ^= asl($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_i_vh : HInst< // Asm: Rdd = vaslh(Rss,#Ii). DoubleRegs source/result with a u4_0Imm shift amount.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+"$Rdd32 = vaslh($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2082775 {
+let Inst{7-5} = 0b010;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b10000000100;
+}
+def S2_asl_i_vw : HInst< // Asm: Rdd = vaslw(Rss,#Ii). DoubleRegs source/result with a u5_0Imm shift amount.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
+"$Rdd32 = vaslw($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13201267 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10000000010;
+}
+def S2_asl_r_p : HInst< // Asm: Rdd = asl(Rss,Rt). DoubleRegs source/result; shift amount comes from IntRegs $Rt32.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = asl($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011100;
+}
+def S2_asl_r_p_acc : HInst< // Asm: Rxx += asl(Rss,Rt). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 += asl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_r_p_and : HInst< // Asm: Rxx &= asl(Rss,Rt). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 &= asl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_r_p_nac : HInst< // Asm: Rxx -= asl(Rss,Rt). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 -= asl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_r_p_or : HInst< // Asm: Rxx |= asl(Rss,Rt). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 |= asl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_r_p_xor : HInst< // Asm: Rxx ^= asl(Rss,Rt). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 ^= asl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asl_r_r : HInst< // Asm: Rd = asl(Rs,Rt). IntRegs source/result; shift amount comes from IntRegs $Rt32.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = asl($Rs32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_asl_r_r_acc : HInst< // Asm: Rx += asl(Rs,Rt). Output $Rx32 is tied to $Rx32in.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += asl($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_r_r_and : HInst< // Asm: Rx &= asl(Rs,Rt). Output $Rx32 is tied to $Rx32in.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= asl($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_r_r_nac : HInst< // Asm: Rx -= asl(Rs,Rt). Output $Rx32 is tied to $Rx32in.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= asl($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_r_r_or : HInst< // Asm: Rx |= asl(Rs,Rt). Output $Rx32 is tied to $Rx32in.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= asl($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asl_r_r_sat : HInst< // Asm: Rd = asl(Rs,Rt):sat. USR_OVF listed in Defs.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = asl($Rs32,$Rt32):sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_asl_r_vh : HInst< // Asm: Rdd = vaslh(Rss,Rt). DoubleRegs source/result; shift amount comes from IntRegs $Rt32.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vaslh($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011010;
+}
+def S2_asl_r_vw : HInst< // Asm: Rdd = vaslw(Rss,Rt). DoubleRegs source/result; shift amount comes from IntRegs $Rt32.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vaslw($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011000;
+}
+def S2_asr_i_p : HInst< // Asm: Rdd = asr(Rss,#Ii). Same Inst{31-21} as S2_asl_i_p; the two differ in Inst{7-5}.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rdd32 = asr($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4231995 {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b10000000000;
+}
+def S2_asr_i_p_acc : HInst< // Asm: Rxx += asr(Rss,#Ii). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 += asr($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b100;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_i_p_and : HInst< // Asm: Rxx &= asr(Rss,#Ii). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 &= asr($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_i_p_nac : HInst< // Asm: Rxx -= asr(Rss,#Ii). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 -= asr($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_i_p_or : HInst< // Asm: Rxx |= asr(Rss,#Ii). Output $Rxx32 is tied to $Rxx32in.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 |= asr($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b100;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_i_p_rnd : HInst< // Asm: Rdd = asr(Rss,#Ii):rnd. Requires HasV5T.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rdd32 = asr($Rss32,#$Ii):rnd",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4231995, Requires<[HasV5T]> {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b10000000110;
+let prefersSlot3 = 1;
+}
+def S2_asr_i_p_rnd_goodsyntax : HInst< // Asm: Rdd = asrrnd(Rss,#Ii). Pseudo (isPseudo = 1): no Enc_* class or Inst bits; requires HasV5T.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rdd32 = asrrnd($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Requires<[HasV5T]> {
+let isPseudo = 1;
+}
+def S2_asr_i_r : HInst< // Asm: Rd = asr(Rs,#Ii). Same Inst{31-21} as S2_asl_i_r; the two differ in Inst{7-5}.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = asr($Rs32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_asr_i_r_acc : HInst< // Asm: Rx += asr(Rs,#Ii). Output $Rx32 is tied to $Rx32in.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 += asr($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_i_r_and : HInst< // Asm: Rx &= asr(Rs,#Ii). Output $Rx32 is tied to $Rx32in.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 &= asr($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_i_r_nac : HInst< // Asm: Rx -= asr(Rs,#Ii). Output $Rx32 is tied to $Rx32in.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 -= asr($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_i_r_or : HInst< // Asm: Rx |= asr(Rs,#Ii). Output $Rx32 is tied to $Rx32in.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 |= asr($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_i_r_rnd : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = asr($Rs32,#$Ii):rnd",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_asr_i_r_rnd_goodsyntax : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = asrrnd($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+}
+def S2_asr_i_svw_trun : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
+"$Rd32 = vasrw($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2380082 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001000110;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_asr_i_vh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+"$Rdd32 = vasrh($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2082775 {
+let Inst{7-5} = 0b000;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b10000000100;
+}
+def S2_asr_i_vw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
+"$Rdd32 = vasrw($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13201267 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10000000010;
+}
+def S2_asr_r_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = asr($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011100;
+}
+def S2_asr_r_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 += asr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_r_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 &= asr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_r_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 -= asr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_r_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 |= asr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_r_p_xor : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 ^= asr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_asr_r_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = asr($Rs32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_asr_r_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += asr($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_r_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= asr($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_r_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= asr($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_r_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= asr($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_asr_r_r_sat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = asr($Rs32,$Rt32):sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_asr_r_svw_trun : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rd32 = vasrw($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_14287645 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_asr_r_vh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vasrh($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011010;
+}
+def S2_asr_r_vw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vasrw($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011000;
+}
+def S2_brev : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = brev($Rs32)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_brevp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = brev($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10000000110;
+}
+def S2_cabacdecbin : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = decbin($Rss32,$Rtt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001110;
+let isPredicateLate = 1;
+let prefersSlot3 = 1;
+let Defs = [P0];
+}
+def S2_cl0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = cl0($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_cl0p : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = cl0($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10001000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_cl1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = cl1($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_cl1p : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = cl1($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_clb : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = clb($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_clbnorm : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = normamt($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_clbp : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = clb($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_clrbit_i : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = clrbit($Rs32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_clrbit_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = clrbit($Rs32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_ct0 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = ct0($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_ct0p : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = ct0($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10001000111;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_ct1 : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = ct1($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_ct1p : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = ct1($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001000111;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_deinterleave : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = deinterleave($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000000110;
+}
+def S2_extractu : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii, u5_0Imm:$II),
+"$Rd32 = extractu($Rs32,#$Ii,#$II)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_11930928 {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b100011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_extractu_rp : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"$Rd32 = extractu($Rs32,$Rtt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_15472748 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_extractup : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii, u6_0Imm:$II),
+"$Rdd32 = extractu($Rss32,#$Ii,#$II)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_9894557 {
+let Inst{31-24} = 0b10000001;
+let prefersSlot3 = 1;
+}
+def S2_extractup_rp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = extractu($Rss32,$Rtt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001000;
+let prefersSlot3 = 1;
+}
+def S2_insert : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii, u5_0Imm:$II),
+"$Rx32 = insert($Rs32,#$Ii,#$II)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2880796 {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b100011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_insert_rp : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"$Rx32 = insert($Rs32,$Rtt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_16311032 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_insertp : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii, u6_0Imm:$II),
+"$Rxx32 = insert($Rss32,#$Ii,#$II)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_631197 {
+let Inst{31-24} = 0b10000011;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_insertp_rp : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 = insert($Rss32,$Rtt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_12702821 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001010000;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_interleave : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = interleave($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10000000110;
+}
+def S2_lfsp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = lfs($Rss32,$Rtt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001100;
+let prefersSlot3 = 1;
+}
+def S2_lsl_r_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = lsl($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011100;
+}
+def S2_lsl_r_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 += lsl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsl_r_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 &= lsl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsl_r_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 -= lsl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsl_r_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 |= lsl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsl_r_p_xor : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 ^= lsl($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsl_r_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = lsl($Rs32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_lsl_r_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += lsl($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsl_r_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= lsl($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsl_r_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= lsl($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsl_r_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= lsl($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsl_r_vh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vlslh($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011010;
+}
+def S2_lsl_r_vw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vlslw($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011000;
+}
+def S2_lsr_i_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rdd32 = lsr($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4231995 {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b10000000000;
+}
+def S2_lsr_i_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 += lsr($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b101;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_i_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 &= lsr($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_i_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 -= lsr($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b10000010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_i_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 |= lsr($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b101;
+let Inst{31-21} = 0b10000010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_i_p_xacc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rxx32 ^= lsr($Rss32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8497723 {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b10000010100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_i_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = lsr($Rs32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_lsr_i_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 += lsr($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_i_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 &= lsr($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_i_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 -= lsr($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_i_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 |= lsr($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_i_r_xacc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rx32 ^= lsr($Rs32,#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_2410156 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_i_vh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+"$Rdd32 = vlsrh($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2082775 {
+let Inst{7-5} = 0b001;
+let Inst{13-12} = 0b00;
+let Inst{31-21} = 0b10000000100;
+}
+def S2_lsr_i_vw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
+"$Rdd32 = vlsrw($Rss32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13201267 {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10000000010;
+}
+def S2_lsr_r_p : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = lsr($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011100;
+}
+def S2_lsr_r_p_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 += lsr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_r_p_and : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 &= lsr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_r_p_nac : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 -= lsr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_r_p_or : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 |= lsr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_r_p_xor : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 ^= lsr($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001011011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def S2_lsr_r_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = lsr($Rs32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110010;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_lsr_r_r_acc : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 += lsr($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_r_r_and : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 &= lsr($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_r_r_nac : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 -= lsr($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_r_r_or : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rx32 |= lsr($Rs32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_9223889 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_lsr_r_vh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vlsrh($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011010;
+}
+def S2_lsr_r_vw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vlsrw($Rss32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011000;
+}
+def S2_packhl : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = packhl($Rs32,$Rt32)",
+ALU32_3op_tc_1_SLOT0123, TypeALU32_3op>, Enc_1997594 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11110101100;
+let InputType = "reg";
+}
+def S2_parityp : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = parity($Rss32,$Rtt32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_9277990 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S2_pstorerbf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memb($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_14044877, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000100000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S2_pstorerbf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memb($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_8065534, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4) memb($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerbfnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memb($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_8065534, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbnewf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memb($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_1737833, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b01000100101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+def S2_pstorerbnewf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memb($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_2813446, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerb_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbnewf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if (!$Pv4) memb($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S2_pstorerbnewfnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memb($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_2813446, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerb_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbnewt_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memb($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_1737833, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b01000000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+def S2_pstorerbnewt_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memb($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_2813446, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerb_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbnewt_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if ($Pv4) memb($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S2_pstorerbnewtnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memb($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_2813446, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b100;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerb_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbt_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memb($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_14044877, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000000000;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S2_pstorerbt_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memb($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_8065534, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011000;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerbt_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4) memb($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerbtnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memb($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_8065534, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011000;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerdf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4) memd($Rs32+#$Ii) = $Rtt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_11049656, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000100110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "imm";
+let BaseOpcode = "S2_storerd_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def S2_pstorerdf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4) memd($Rx32++#$Ii) = $Rtt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11959851, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerdf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"if (!$Pv4) memd($Rs32) = $Rtt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerdfnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4.new) memd($Rx32++#$Ii) = $Rtt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11959851, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerdt_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4) memd($Rs32+#$Ii) = $Rtt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_11049656, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000000110;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "imm";
+let BaseOpcode = "S2_storerd_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def S2_pstorerdt_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4) memd($Rx32++#$Ii) = $Rtt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11959851, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011110;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerdt_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"if ($Pv4) memd($Rs32) = $Rtt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerdtnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4.new) memd($Rx32++#$Ii) = $Rtt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11959851, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011110;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerd_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerff_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rs32+#$Ii) = $Rt32.h",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_10979813, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000100011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "imm";
+let BaseOpcode = "S2_storerf_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S2_pstorerff_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rx32++#$Ii) = $Rt32.h",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11065510, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerf_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerff_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rs32) = $Rt32.h",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerffnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rx32++#$Ii) = $Rt32.h",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11065510, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerf_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerft_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh($Rs32+#$Ii) = $Rt32.h",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_10979813, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000000011;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "imm";
+let BaseOpcode = "S2_storerf_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S2_pstorerft_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh($Rx32++#$Ii) = $Rt32.h",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11065510, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011011;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerf_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerft_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4) memh($Rs32) = $Rt32.h",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerftnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rx32++#$Ii) = $Rt32.h",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11065510, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011011;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerf_pi";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerhf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_10979813, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000100010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S2_pstorerhf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11065510, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerhf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerhfnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11065510, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerhnewf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memh($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_6154421, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b01000100101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+let opNewValue = 3;
+}
+def S2_pstorerhnewf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memh($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_3813442, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerh_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerhnewf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if (!$Pv4) memh($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S2_pstorerhnewfnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memh($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_3813442, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerh_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerhnewt_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memh($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_6154421, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b01000000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+let opNewValue = 3;
+}
+def S2_pstorerhnewt_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memh($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_3813442, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerh_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerhnewt_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if ($Pv4) memh($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S2_pstorerhnewtnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memh($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_3813442, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b101;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerh_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerht_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_10979813, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000000010;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S2_pstorerht_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11065510, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011010;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerht_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4) memh($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerhtnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11065510, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011010;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerif_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memw($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_8225953, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000100100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def S2_pstorerif_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memw($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_10065510, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerif_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4) memw($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstorerifnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memw($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_10065510, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeri_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerinewf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memw($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_11224149, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b01000100101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+let opNewValue = 3;
+}
+def S2_pstorerinewf_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memw($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_4813442, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeri_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerinewf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if (!$Pv4) memw($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S2_pstorerinewfnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memw($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_4813442, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeri_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerinewt_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memw($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_11224149, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b01000000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+let opNewValue = 3;
+}
+def S2_pstorerinewt_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memw($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_4813442, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeri_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerinewt_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if ($Pv4) memw($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S2_pstorerinewtnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memw($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_4813442, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b110;
+let Inst{31-21} = 0b10101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeri_pi";
+let opNewValue = 4;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerit_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memw($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_8225953, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000000100;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def S2_pstorerit_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memw($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_10065510, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011100;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_pstorerit_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4) memw($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_pstoreritnew_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memw($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_10065510, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b10101011100;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_pi";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_setbit_i : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = setbit($Rs32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_setbit_r : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = setbit($Rs32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_shuffeb : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = shuffeb($Rss32,$Rtt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001000;
+}
+def S2_shuffeh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = shuffeh($Rss32,$Rtt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001000;
+}
+def S2_shuffob : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = shuffob($Rtt32,$Rss32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_11687333 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001000;
+}
+def S2_shuffoh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
+"$Rdd32 = shuffoh($Rtt32,$Rss32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_11687333 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001100;
+}
+def S2_storerb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rs32+#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_13150110, AddrModeRel {
+let Inst{24-21} = 0b1000;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isPredicable = 1;
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 0;
+}
+def S2_storerb_pbr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memb($Rx32++$Mu2:brev) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914, AddrModeRel {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101111000;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pbr";
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerb_pci : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
+"memb($Rx32++#$Ii:circ($Mu2)) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_3915770 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{31-21} = 0b10101001000;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let Uses = [CS];
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerb_pcr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memb($Rx32++I:circ($Mu2)) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914 {
+let Inst{7-0} = 0b00000010;
+let Inst{31-21} = 0b10101001000;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let Uses = [CS];
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerb_pi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_12492533, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011000;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_pi";
+let isPredicable = 1;
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerb_pr : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memb($Rx32++$Mu2) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914 {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101101000;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S2_storerb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memb($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_storerbgp : HInst< // Byte store of $Rt32 to gp+#$Ii.
+(outs),
+(ins u32_0Imm:$Ii, IntRegs:$Rt32),
+"memb(gp+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_12395768, AddrModeRel {
+let Inst{24-21} = 0b0000;
+let Inst{31-27} = 0b01001;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let Uses = [GP]; // implicit use of GP
+let BaseOpcode = "S2_storerbabs"; // note: names the "abs" family, not this def
+let isPredicable = 1;
+let isNVStorable = 1;
+let opExtendable = 0; // index of $Ii in the operand list
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def S2_storerbnew_io : HInst< // Byte store of $Nt8.new to $Rs32+#$Ii (base + immediate offset).
+(outs),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Nt8),
+"memb($Rs32+#$Ii) = $Nt8.new",
+ST_tc_st_SLOT0, TypeST>, Enc_10002182, AddrModeRel {
+let Inst{12-11} = 0b00;
+let Inst{24-21} = 0b1101;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 1; // index of $Ii in the operand list
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 0;
+let opNewValue = 2; // index of $Nt8 in the operand list
+}
+def S2_storerbnew_pbr : HInst< // Byte store of $Nt8.new using the $Rx32++$Mu2:brev form; no addrMode is set in this record.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memb($Rx32++$Mu2:brev) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10067774, AddrModeRel {
+let Inst{7-0} = 0b00000000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b10101111101;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "S2_storerb_pbr";
+let opNewValue = 3; // index of $Nt8 in (outs, ins)
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerbnew_pci : HInst< // Byte store of $Nt8.new, post-increment by #$Ii with the :circ($Mu2) modifier.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, IntRegs:$Nt8),
+"memb($Rx32++#$Ii:circ($Mu2)) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_5326450 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b10101001101;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let Uses = [CS]; // implicit use of CS
+let opNewValue = 4; // index of $Nt8 in (outs, ins)
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerbnew_pcr : HInst< // Byte store of $Nt8.new, post-increment using the I:circ($Mu2) form.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memb($Rx32++I:circ($Mu2)) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10067774 {
+let Inst{7-0} = 0b00000010;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b10101001101;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let Uses = [CS]; // implicit use of CS
+let opNewValue = 3; // index of $Nt8 in (outs, ins)
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerbnew_pi : HInst< // Byte store of $Nt8.new, post-increment by immediate #$Ii.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
+"memb($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_5900401, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b10101011101;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "S2_storerb_pi";
+let isPredicable = 1;
+let isNVStorable = 1; // NOTE(review): set on a .new store; S2_storerinew_pi omits it - confirm this is intended
+let opNewValue = 3; // index of $Nt8 in (outs, ins)
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerbnew_pr : HInst< // Byte store of $Nt8.new, post-increment by modifier register $Mu2.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memb($Rx32++$Mu2) = $Nt8.new",
+ST_tc_st_SLOT0, TypeST>, Enc_10067774 {
+let Inst{7-0} = 0b00000000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b10101101101;
+let addrMode = PostInc;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let opNewValue = 3; // index of $Nt8 in (outs, ins)
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerbnew_zomap : HInst< // Pseudo (TypeMAPPING) for the offset-less syntax memb($Rs32) = $Nt8.new; no Enc_ class or Inst bits.
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Nt8),
+"memb($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 1; // index of $Nt8 in the operand list
+}
+def S2_storerbnewgp : HInst< // Byte store of $Nt8.new to gp+#$Ii.
+(outs),
+(ins u32_0Imm:$Ii, IntRegs:$Nt8),
+"memb(gp+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_4050532, AddrModeRel {
+let Inst{12-11} = 0b00;
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b01001;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let Uses = [GP]; // implicit use of GP
+let BaseOpcode = "S2_storerbabs"; // note: names the "abs" family, not this def
+let isPredicable = 1;
+let opExtendable = 0; // index of $Ii in the operand list
+let isExtentSigned = 0;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+let opNewValue = 1; // index of $Nt8 in the operand list
+}
+def S2_storerd_io : HInst< // Doubleword store of $Rtt32 to $Rs32+#$Ii (base + immediate offset).
+(outs),
+(ins IntRegs:$Rs32, s29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"memd($Rs32+#$Ii) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16319737, AddrModeRel {
+let Inst{24-21} = 0b1110;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "imm";
+let BaseOpcode = "S2_storerd_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 1; // index of $Ii in the operand list
+let isExtentSigned = 1;
+let opExtentBits = 14;
+let opExtentAlign = 3;
+}
+def S2_storerd_pbr : HInst< // Doubleword store of $Rtt32 using the $Rx32++$Mu2:brev form; no addrMode is set in this record.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, DoubleRegs:$Rtt32),
+"memd($Rx32++$Mu2:brev) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_15816255 {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101111110;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerd_pci : HInst< // Doubleword store of $Rtt32, post-increment by #$Ii with the :circ($Mu2) modifier.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_3Imm:$Ii, ModRegs:$Mu2, DoubleRegs:$Rtt32),
+"memd($Rx32++#$Ii:circ($Mu2)) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_4501395 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{31-21} = 0b10101001110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let Uses = [CS]; // implicit use of CS
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerd_pcr : HInst< // Doubleword store of $Rtt32, post-increment using the I:circ($Mu2) form.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, DoubleRegs:$Rtt32),
+"memd($Rx32++I:circ($Mu2)) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_15816255 {
+let Inst{7-0} = 0b00000010;
+let Inst{31-21} = 0b10101001110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let Uses = [CS]; // implicit use of CS
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerd_pi : HInst< // Doubleword store of $Rtt32, post-increment by immediate #$Ii.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
+"memd($Rx32++#$Ii) = $Rtt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11271630, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd"; // NOTE(review): the b/h/w _pi siblings set no CextOpcode - confirm
+let BaseOpcode = "S2_storerd_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerd_pr : HInst< // Doubleword store of $Rtt32, post-increment by modifier register $Mu2.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, DoubleRegs:$Rtt32),
+"memd($Rx32++$Mu2) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_15816255 {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101101110;
+let addrMode = PostInc;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerd_zomap : HInst< // Pseudo (TypeMAPPING) for the offset-less syntax memd($Rs32) = $Rtt32; no Enc_ class or Inst bits.
+(outs),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"memd($Rs32) = $Rtt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_storerdgp : HInst< // Doubleword store of $Rtt32 to gp+#$Ii.
+(outs),
+(ins u29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"memd(gp+#$Ii) = $Rtt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_11682941, AddrModeRel {
+let Inst{24-21} = 0b0110;
+let Inst{31-27} = 0b01001;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let Uses = [GP]; // implicit use of GP
+let BaseOpcode = "S2_storerdabs"; // note: names the "abs" family, not this def
+let isPredicable = 1;
+let opExtendable = 0; // index of $Ii in the operand list
+let isExtentSigned = 0;
+let opExtentBits = 19;
+let opExtentAlign = 3;
+}
+def S2_storerf_io : HInst< // Halfword store of $Rt32.h to $Rs32+#$Ii (base + immediate offset).
+(outs),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+#$Ii) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_7736768, AddrModeRel {
+let Inst{24-21} = 0b1011;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "imm";
+let BaseOpcode = "S2_storerf_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 1; // index of $Ii in the operand list
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+}
+def S2_storerf_pbr : HInst< // Halfword store of $Rt32.h using the $Rx32++$Mu2:brev form; no addrMode is set in this record.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++$Mu2:brev) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914 {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101111011;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerf_pci : HInst< // Halfword store of $Rt32.h, post-increment by #$Ii with the :circ($Mu2) modifier.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++#$Ii:circ($Mu2)) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_10915758 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{31-21} = 0b10101001011;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Uses = [CS]; // implicit use of CS
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerf_pcr : HInst< // Halfword store of $Rt32.h, post-increment using the I:circ($Mu2) form.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++I:circ($Mu2)) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914 {
+let Inst{7-0} = 0b00000010;
+let Inst{31-21} = 0b10101001011;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Uses = [CS]; // implicit use of CS
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerf_pi : HInst< // Halfword store of $Rt32.h, post-increment by immediate #$Ii.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rx32++#$Ii) = $Rt32.h",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11492529, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011011;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf"; // NOTE(review): the b/h/w _pi siblings set no CextOpcode - confirm
+let BaseOpcode = "S2_storerf_pi";
+let isPredicable = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerf_pr : HInst< // Halfword store of $Rt32.h, post-increment by modifier register $Mu2.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++$Mu2) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914 {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101101011;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerf_zomap : HInst< // Pseudo (TypeMAPPING) for the offset-less syntax memh($Rs32) = $Rt32.h; no Enc_ class or Inst bits.
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memh($Rs32) = $Rt32.h",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_storerfgp : HInst< // Halfword store of $Rt32.h to gp+#$Ii.
+(outs),
+(ins u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh(gp+#$Ii) = $Rt32.h",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_1186018, AddrModeRel {
+let Inst{24-21} = 0b0011;
+let Inst{31-27} = 0b01001;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Uses = [GP]; // implicit use of GP
+let BaseOpcode = "S2_storerfabs"; // note: names the "abs" family, not this def
+let isPredicable = 1;
+let opExtendable = 0; // index of $Ii in the operand list
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
+def S2_storerh_io : HInst< // Halfword store of $Rt32 to $Rs32+#$Ii (base + immediate offset).
+(outs),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7736768, AddrModeRel {
+let Inst{24-21} = 0b1010;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io"; // same BaseOpcode as S2_storerhnew_io
+let isPredicable = 1;
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 1; // index of $Ii in the operand list
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+}
+def S2_storerh_pbr : HInst< // Halfword store of $Rt32 using the $Rx32++$Mu2:brev form; no addrMode is set in this record.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++$Mu2:brev) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914, AddrModeRel {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101111010;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pbr"; // same BaseOpcode as S2_storerhnew_pbr
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerh_pci : HInst< // Halfword store of $Rt32, post-increment by #$Ii with the :circ($Mu2) modifier.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++#$Ii:circ($Mu2)) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_10915758 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{31-21} = 0b10101001010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Uses = [CS]; // implicit use of CS
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerh_pcr : HInst< // Halfword store of $Rt32, post-increment using the I:circ($Mu2) form.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++I:circ($Mu2)) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914 {
+let Inst{7-0} = 0b00000010;
+let Inst{31-21} = 0b10101001010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Uses = [CS]; // implicit use of CS
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerh_pi : HInst< // Halfword store of $Rt32, post-increment by immediate #$Ii.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_11492529, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_pi"; // same BaseOpcode as S2_storerhnew_pi
+let isPredicable = 1;
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerh_pr : HInst< // Halfword store of $Rt32, post-increment by modifier register $Mu2.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memh($Rx32++$Mu2) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914 {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101101010;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerh_zomap : HInst< // Pseudo (TypeMAPPING) for the offset-less syntax memh($Rs32) = $Rt32; no Enc_ class or Inst bits.
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memh($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_storerhgp : HInst< // Halfword store of $Rt32 to gp+#$Ii.
+(outs),
+(ins u31_1Imm:$Ii, IntRegs:$Rt32),
+"memh(gp+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_1186018, AddrModeRel {
+let Inst{24-21} = 0b0010;
+let Inst{31-27} = 0b01001;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let Uses = [GP]; // implicit use of GP
+let BaseOpcode = "S2_storerhabs"; // note: names the "abs" family, not this def
+let isPredicable = 1;
+let isNVStorable = 1;
+let opExtendable = 0; // index of $Ii in the operand list
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+}
+def S2_storerhnew_io : HInst< // Halfword store of $Nt8.new to $Rs32+#$Ii (base + immediate offset).
+(outs),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Nt8),
+"memh($Rs32+#$Ii) = $Nt8.new",
+ST_tc_st_SLOT0, TypeST>, Enc_748676, AddrModeRel {
+let Inst{12-11} = 0b01;
+let Inst{24-21} = 0b1101;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 1; // index of $Ii in the operand list
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+let opNewValue = 2; // index of $Nt8 in the operand list
+}
+def S2_storerhnew_pbr : HInst< // Halfword store of $Nt8.new using the $Rx32++$Mu2:brev form; no addrMode is set in this record.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memh($Rx32++$Mu2:brev) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10067774, AddrModeRel {
+let Inst{7-0} = 0b00000000;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b10101111101;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "S2_storerh_pbr";
+let opNewValue = 3; // index of $Nt8 in (outs, ins)
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerhnew_pci : HInst< // Halfword store of $Nt8.new, post-increment by #$Ii with the :circ($Mu2) modifier.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2, IntRegs:$Nt8),
+"memh($Rx32++#$Ii:circ($Mu2)) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10326434 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b10101001101;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let Uses = [CS]; // implicit use of CS
+let opNewValue = 4; // index of $Nt8 in (outs, ins)
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerhnew_pcr : HInst< // Halfword store of $Nt8.new, post-increment using the I:circ($Mu2) form.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memh($Rx32++I:circ($Mu2)) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10067774 {
+let Inst{7-0} = 0b00000010;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b10101001101;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let Uses = [CS]; // implicit use of CS
+let opNewValue = 3; // index of $Nt8 in (outs, ins)
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerhnew_pi : HInst< // Halfword store of $Nt8.new, post-increment by immediate #$Ii.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
+"memh($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_6900405, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b001;
+let Inst{31-21} = 0b10101011101;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "S2_storerh_pi";
+let isNVStorable = 1; // NOTE(review): set on a .new store; S2_storerinew_pi omits it - confirm this is intended
+let isPredicable = 1;
+let opNewValue = 3; // index of $Nt8 in (outs, ins)
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerhnew_pr : HInst< // Halfword store of $Nt8.new, post-increment by modifier register $Mu2.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memh($Rx32++$Mu2) = $Nt8.new",
+ST_tc_st_SLOT0, TypeST>, Enc_10067774 {
+let Inst{7-0} = 0b00000000;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b10101101101;
+let addrMode = PostInc;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let opNewValue = 3; // index of $Nt8 in (outs, ins)
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerhnew_zomap : HInst< // Pseudo (TypeMAPPING) for the offset-less syntax memh($Rs32) = $Nt8.new; no Enc_ class or Inst bits.
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Nt8),
+"memh($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 1; // index of $Nt8 in the operand list
+}
+def S2_storerhnewgp : HInst< // Halfword store of $Nt8.new to gp+#$Ii.
+(outs),
+(ins u31_1Imm:$Ii, IntRegs:$Nt8),
+"memh(gp+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_13618890, AddrModeRel {
+let Inst{12-11} = 0b01;
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b01001;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let Uses = [GP]; // implicit use of GP
+let BaseOpcode = "S2_storerhabs"; // note: names the "abs" family, not this def
+let isPredicable = 1;
+let opExtendable = 0; // index of $Ii in the operand list
+let isExtentSigned = 0;
+let opExtentBits = 17;
+let opExtentAlign = 1;
+let opNewValue = 1; // index of $Nt8 in the operand list
+}
+def S2_storeri_io : HInst< // Word store of $Rt32 to $Rs32+#$Ii (base + immediate offset).
+(outs),
+(ins IntRegs:$Rs32, s30_2Imm:$Ii, IntRegs:$Rt32),
+"memw($Rs32+#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_6673186, AddrModeRel {
+let Inst{24-21} = 0b1100;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io"; // same BaseOpcode as S2_storerinew_io
+let isPredicable = 1;
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 1; // index of $Ii in the operand list
+let isExtentSigned = 1;
+let opExtentBits = 13;
+let opExtentAlign = 2;
+}
+def S2_storeri_pbr : HInst< // Word store of $Rt32 using the $Rx32++$Mu2:brev form; no addrMode is set in this record.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memw($Rx32++$Mu2:brev) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914, AddrModeRel {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101111100;
+let accessSize = WordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_pbr"; // same BaseOpcode as S2_storerinew_pbr
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storeri_pci : HInst< // Word store of $Rt32, post-increment by #$Ii with the :circ($Mu2) modifier.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
+"memw($Rx32++#$Ii:circ($Mu2)) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_9915754 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{31-21} = 0b10101001100;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayStore = 1;
+let Uses = [CS]; // implicit use of CS
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storeri_pcr : HInst< // Word store of $Rt32, post-increment using the I:circ($Mu2) form.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memw($Rx32++I:circ($Mu2)) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914 {
+let Inst{7-0} = 0b00000010;
+let Inst{31-21} = 0b10101001100;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayStore = 1;
+let Uses = [CS]; // implicit use of CS
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storeri_pi : HInst< // Word store of $Rt32, post-increment by immediate #$Ii.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
+"memw($Rx32++#$Ii) = $Rt32",
+ST_tc_st_pi_SLOT01, TypeST>, Enc_10492541, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011100;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_pi"; // same BaseOpcode as S2_storerinew_pi
+let isPredicable = 1;
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storeri_pr : HInst< // Word store of $Rt32, post-increment by modifier register $Mu2.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
+"memw($Rx32++$Mu2) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_7255914 {
+let Inst{7-0} = 0b00000000;
+let Inst{31-21} = 0b10101101100;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let mayStore = 1;
+let isNVStorable = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storeri_zomap : HInst< // Pseudo (TypeMAPPING) for the offset-less syntax memw($Rs32) = $Rt32; no Enc_ class or Inst bits.
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S2_storerigp : HInst< // Word store of $Rt32 to gp+#$Ii.
+(outs),
+(ins u30_2Imm:$Ii, IntRegs:$Rt32),
+"memw(gp+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_15999208, AddrModeRel {
+let Inst{24-21} = 0b0100;
+let Inst{31-27} = 0b01001;
+let accessSize = WordAccess;
+let mayStore = 1;
+let Uses = [GP]; // implicit use of GP
+let BaseOpcode = "S2_storeriabs"; // note: names the "abs" family, not this def
+let isPredicable = 1;
+let isNVStorable = 1;
+let opExtendable = 0; // index of $Ii in the operand list
+let isExtentSigned = 0;
+let opExtentBits = 18;
+let opExtentAlign = 2;
+}
+def S2_storerinew_io : HInst< // Word store of $Nt8.new to $Rs32+#$Ii (base + immediate offset).
+(outs),
+(ins IntRegs:$Rs32, s30_2Imm:$Ii, IntRegs:$Nt8),
+"memw($Rs32+#$Ii) = $Nt8.new",
+ST_tc_st_SLOT0, TypeST>, Enc_8409782, AddrModeRel {
+let Inst{12-11} = 0b10;
+let Inst{24-21} = 0b1101;
+let Inst{31-27} = 0b10100;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 1; // index of $Ii in the operand list
+let isExtentSigned = 1;
+let opExtentBits = 13;
+let opExtentAlign = 2;
+let opNewValue = 2; // index of $Nt8 in the operand list
+}
+def S2_storerinew_pbr : HInst< // Word store of $Nt8.new using the $Rx32++$Mu2:brev form; no addrMode is set in this record.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memw($Rx32++$Mu2:brev) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10067774, AddrModeRel {
+let Inst{7-0} = 0b00000000;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b10101111101;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "S2_storeri_pbr";
+let opNewValue = 3; // index of $Nt8 in (outs, ins)
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerinew_pci : HInst< // Word store of $Nt8.new, post-increment by #$Ii with the :circ($Mu2) modifier.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2, IntRegs:$Nt8),
+"memw($Rx32++#$Ii:circ($Mu2)) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_11326438 {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b10101001101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let Uses = [CS]; // implicit use of CS
+let opNewValue = 4; // index of $Nt8 in (outs, ins)
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerinew_pcr : HInst< // Word store of $Nt8.new, post-increment using the I:circ($Mu2) form.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memw($Rx32++I:circ($Mu2)) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10067774 {
+let Inst{7-0} = 0b00000010;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b10101001101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let Uses = [CS]; // implicit use of CS
+let opNewValue = 3; // index of $Nt8 in (outs, ins)
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerinew_pi : HInst< // Word store of $Nt8.new, post-increment by immediate #$Ii.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
+"memw($Rx32++#$Ii) = $Nt8.new",
+ST_tc_st_pi_SLOT0, TypeST>, Enc_7900405, AddrModeRel {
+let Inst{2-0} = 0b000;
+let Inst{7-7} = 0b0;
+let Inst{13-11} = 0b010;
+let Inst{31-21} = 0b10101011101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "S2_storeri_pi";
+let isPredicable = 1; // NOTE(review): unlike S2_storerbnew_pi / S2_storerhnew_pi, no isNVStorable here - confirm
+let opNewValue = 3; // index of $Nt8 in (outs, ins)
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerinew_pr : HInst< // Word store of $Nt8.new, post-increment by modifier register $Mu2.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
+"memw($Rx32++$Mu2) = $Nt8.new",
+ST_tc_st_SLOT0, TypeST>, Enc_10067774 {
+let Inst{7-0} = 0b00000000;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b10101101101;
+let addrMode = PostInc;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let opNewValue = 3; // index of $Nt8 in (outs, ins)
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_storerinew_zomap : HInst< // Pseudo (TypeMAPPING) for the offset-less syntax memw($Rs32) = $Nt8.new; no Enc_ class or Inst bits.
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Nt8),
+"memw($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 1; // index of $Nt8 in the operand list
+}
+def S2_storerinewgp : HInst< // Word store of $Nt8.new to gp+#$Ii.
+(outs),
+(ins u30_2Imm:$Ii, IntRegs:$Nt8),
+"memw(gp+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_12297800, AddrModeRel {
+let Inst{12-11} = 0b10;
+let Inst{24-21} = 0b0101;
+let Inst{31-27} = 0b01001;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let Uses = [GP]; // implicit use of GP
+let BaseOpcode = "S2_storeriabs"; // note: names the "abs" family, not this def
+let isPredicable = 1;
+let opExtendable = 0; // index of $Ii in the operand list
+let isExtentSigned = 0;
+let opExtentBits = 18;
+let opExtentAlign = 2;
+let opNewValue = 1; // index of $Nt8 in the operand list
+}
+def S2_storew_locked : HInst< // Word memw_locked store of $Rt32 at $Rs32; produces predicate result $Pd4.
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"memw_locked($Rs32,$Pd4) = $Rt32",
+ST_tc_ld_SLOT0, TypeST>, Enc_10157519 { // note: itinerary is the "ld" class although mayStore is set
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10100000101;
+let accessSize = WordAccess;
+let isSoloAX = 1;
+let mayStore = 1;
+let isPredicateLate = 1;
+}
+def S2_svsathb : HInst< // $Rd32 = vsathb($Rs32): 32-bit source and destination register form.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = vsathb($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001100100;
+let hasNewValue = 1;
+let opNewValue = 0; // index of $Rd32 in (outs, ins)
+let Defs = [USR_OVF]; // implicit def of USR_OVF
+}
+def S2_svsathub : HInst< // $Rd32 = vsathub($Rs32): 32-bit source and destination register form.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = vsathub($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10001100100;
+let hasNewValue = 1;
+let opNewValue = 0; // index of $Rd32 in (outs, ins)
+let Defs = [USR_OVF]; // implicit def of USR_OVF
+}
+def S2_tableidxb : HInst< // Encoded ":raw" form of tableidxb; $II is a signed 6-bit immediate operand (s6_0Imm).
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
+"$Rx32 = tableidxb($Rs32,#$Ii,#$II):raw",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_8838398 {
+let Inst{31-22} = 0b1000011100;
+let hasNewValue = 1;
+let opNewValue = 0; // index of $Rx32 in (outs, ins)
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_tableidxb_goodsyntax : HInst< // Pseudo for tableidxb without ":raw"; $II is u5_0Imm; no Enc_ class or Inst bits.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
+"$Rx32 = tableidxb($Rs32,#$Ii,#$II)",
+S_2op_tc_1_SLOT23, TypeS_2op> {
+let hasNewValue = 1;
+let opNewValue = 0; // index of $Rx32 in (outs, ins)
+let isPseudo = 1;
+let isCodeGenOnly = 1; // NOTE(review): the d/h/w goodsyntax variants do not set isCodeGenOnly - confirm the difference is intended
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_tableidxd : HInst< // Encoded ":raw" form of tableidxd; $II is a signed 6-bit immediate operand (s6_0Imm).
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
+"$Rx32 = tableidxd($Rs32,#$Ii,#$II):raw",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_8838398 {
+let Inst{31-22} = 0b1000011111;
+let hasNewValue = 1;
+let opNewValue = 0; // index of $Rx32 in (outs, ins)
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_tableidxd_goodsyntax : HInst< // Pseudo for tableidxd without ":raw"; $II is u5_0Imm; no Enc_ class or Inst bits.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
+"$Rx32 = tableidxd($Rs32,#$Ii,#$II)",
+S_2op_tc_1_SLOT23, TypeS_2op> {
+let hasNewValue = 1;
+let opNewValue = 0; // index of $Rx32 in (outs, ins)
+let isPseudo = 1; // NOTE(review): S2_tableidxb_goodsyntax also sets isCodeGenOnly; it is absent here - confirm
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_tableidxh : HInst< // Encoded ":raw" form of tableidxh; $II is a signed 6-bit immediate operand (s6_0Imm).
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
+"$Rx32 = tableidxh($Rs32,#$Ii,#$II):raw",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_8838398 {
+let Inst{31-22} = 0b1000011101;
+let hasNewValue = 1;
+let opNewValue = 0; // index of $Rx32 in (outs, ins)
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_tableidxh_goodsyntax : HInst< // Pseudo for tableidxh without ":raw"; $II is u5_0Imm; no Enc_ class or Inst bits.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
+"$Rx32 = tableidxh($Rs32,#$Ii,#$II)",
+S_2op_tc_1_SLOT23, TypeS_2op> {
+let hasNewValue = 1;
+let opNewValue = 0; // index of $Rx32 in (outs, ins)
+let isPseudo = 1; // NOTE(review): S2_tableidxb_goodsyntax also sets isCodeGenOnly; it is absent here - confirm
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_tableidxw : HInst< // Encoded ":raw" form of tableidxw; $II is a signed 6-bit immediate operand (s6_0Imm).
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
+"$Rx32 = tableidxw($Rs32,#$Ii,#$II):raw",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_8838398 {
+let Inst{31-22} = 0b1000011110;
+let hasNewValue = 1;
+let opNewValue = 0; // index of $Rx32 in (outs, ins)
+let prefersSlot3 = 1;
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_tableidxw_goodsyntax : HInst< // Pseudo for tableidxw without ":raw"; $II is u5_0Imm; no Enc_ class or Inst bits.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
+"$Rx32 = tableidxw($Rs32,#$Ii,#$II)",
+S_2op_tc_1_SLOT23, TypeS_2op> {
+let hasNewValue = 1;
+let opNewValue = 0; // index of $Rx32 in (outs, ins)
+let isPseudo = 1; // NOTE(review): S2_tableidxb_goodsyntax also sets isCodeGenOnly; it is absent here - confirm
+let Constraints = "$Rx32 = $Rx32in"; // result $Rx32 is tied to input $Rx32in
+}
+def S2_togglebit_i : HInst< // $Rd32 = togglebit($Rs32,#$Ii): bit position given by immediate $Ii (u5_0Imm).
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = togglebit($Rs32,#$Ii)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2771456 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001100110;
+let hasNewValue = 1;
+let opNewValue = 0; // index of $Rd32 in (outs, ins)
+}
+def S2_togglebit_r : HInst< // $Rd32 = togglebit($Rs32,$Rt32): register-operand form of togglebit.
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = togglebit($Rs32,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_14071773 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110100;
+let hasNewValue = 1;
+let opNewValue = 0; // index of $Rd32 in (outs, ins)
+}
+def S2_tstbit_i : HInst< // $Pd4 = tstbit($Rs32,#$Ii): predicate result, bit position given by immediate $Ii (u5_0Imm).
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Pd4 = tstbit($Rs32,#$Ii)",
+S_2op_tc_2early_SLOT23, TypeS_2op>, Enc_2103742 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10000101000;
+}
+def S2_tstbit_r : HInst< // $Pd4 = tstbit($Rs32,$Rt32): predicate result, register-operand form.
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = tstbit($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111000;
+}
+def S2_valignib : HInst< // $Rdd32 = valignb($Rtt32,$Rss32,#$Ii): third operand is immediate $Ii (u3_0Imm).
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32, u3_0Imm:$Ii),
+"$Rdd32 = valignb($Rtt32,$Rss32,#$Ii)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_11971407 {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000000000;
+}
+def S2_valignrb : HInst< // $Rdd32 = valignb($Rtt32,$Rss32,$Pu4): third operand is predicate register $Pu4.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32, PredRegs:$Pu4),
+"$Rdd32 = valignb($Rtt32,$Rss32,$Pu4)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_11552785 {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000010000;
+}
+def S2_vcnegh : HInst< // $Rdd32 = vcnegh($Rss32,$Rt32); shares Inst{31-21} with S2_vcrotate, differing in Inst{7-5}.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vcnegh($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF]; // implicit def of USR_OVF
+}
+def S2_vcrotate : HInst< // $Rdd32 = vcrotate($Rss32,$Rt32); shares Inst{31-21} with S2_vcnegh, differing in Inst{7-5}.
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = vcrotate($Rss32,$Rt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8940892 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000011110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF]; // implicit def of USR_OVF
+}
+def S2_vrcnegh : HInst< // $Rxx32 += vrcnegh($Rss32,$Rt32): accumulating form, $Rxx32 is both input and output.
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rxx32 += vrcnegh($Rss32,$Rt32)",
+S_3op_tc_3x_SLOT23, TypeS_3op>, Enc_7912540 {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b11001011001;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in"; // result $Rxx32 is tied to input $Rxx32in
+}
+def S2_vrndpackwh : HInst< // $Rd32 = vrndwh($Rss32): 64-bit source pair, 32-bit result; no :sat and no USR_OVF def.
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vrndwh($Rss32)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001000100;
+let hasNewValue = 1;
+let opNewValue = 0; // index of $Rd32 in (outs, ins)
+}
+def S2_vrndpackwhs : HInst< // $Rd32 = vrndwh($Rss32):sat: the ":sat" variant of S2_vrndpackwh; also defines USR_OVF.
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vrndwh($Rss32):sat",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10001000100;
+let hasNewValue = 1;
+let opNewValue = 0; // index of $Rd32 in (outs, ins)
+let Defs = [USR_OVF]; // implicit def of USR_OVF
+}
+def S2_vsathb : HInst< // $Rd32 = vsathb($Rss32): 64-bit source pair, 32-bit result.
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vsathb($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10001000000;
+let hasNewValue = 1;
+let opNewValue = 0; // index of $Rd32 in (outs, ins)
+let Defs = [USR_OVF]; // implicit def of USR_OVF
+}
+def S2_vsathb_nopack : HInst< // $Rdd32 = vsathb($Rss32): 64-bit source and 64-bit result (the "nopack" record).
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vsathb($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10000000000;
+let Defs = [USR_OVF]; // implicit def of USR_OVF
+}
+def S2_vsathub : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vsathub($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_vsathub_nopack : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vsathub($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000000000;
+let Defs = [USR_OVF];
+}
+def S2_vsatwh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vsatwh($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10001000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_vsatwh_nopack : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vsatwh($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10000000000;
+let Defs = [USR_OVF];
+}
+def S2_vsatwuh : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vsatwuh($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10001000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Defs = [USR_OVF];
+}
+def S2_vsatwuh_nopack : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32),
+"$Rdd32 = vsatwuh($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_13133231 {
+let Inst{13-5} = 0b000000101;
+let Inst{31-21} = 0b10000000000;
+let Defs = [USR_OVF];
+}
+def S2_vsplatrb : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = vsplatb($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4075554 {
+let Inst{13-5} = 0b000000111;
+let Inst{31-21} = 0b10001100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+}
+def S2_vsplatrh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = vsplath($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4030179 {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10000100010;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+}
+def S2_vspliceib : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, u3_0Imm:$Ii),
+"$Rdd32 = vspliceb($Rss32,$Rtt32,#$Ii)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_16730127 {
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000000100;
+}
+def S2_vsplicerb : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, PredRegs:$Pu4),
+"$Rdd32 = vspliceb($Rss32,$Rtt32,$Pu4)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_5178985 {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000010100;
+}
+def S2_vsxtbh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = vsxtbh($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4030179 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10000100000;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+}
+def S2_vsxthw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = vsxthw($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4030179 {
+let Inst{13-5} = 0b000000100;
+let Inst{31-21} = 0b10000100000;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+}
+def S2_vtrunehb : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vtrunehb($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10001000100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_vtrunewh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vtrunewh($Rss32,$Rtt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001100;
+}
+def S2_vtrunohb : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = vtrunohb($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001000100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S2_vtrunowh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vtrunowh($Rss32,$Rtt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001100;
+}
+def S2_vzxtbh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = vzxtbh($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4030179 {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b10000100000;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+}
+def S2_vzxthw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = vzxthw($Rs32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4030179 {
+let Inst{13-5} = 0b000000110;
+let Inst{31-21} = 0b10000100000;
+let isReMaterializable = 1;
+let isAsCheapAsAMove = 1;
+}
+def S4_addaddi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, s32_0Imm:$Ii),
+"$Rd32 = add($Rs32,add($Ru32,#$Ii))",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_6495334 {
+let Inst{31-23} = 0b110110110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_addi_asl_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = add(#$Ii,asl($Rx32in,#$II))",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_117962 {
+let Inst{2-0} = 0b100;
+let Inst{4-4} = 0b0;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_addi_lsr_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = add(#$Ii,lsr($Rx32in,#$II))",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_117962 {
+let Inst{2-0} = 0b100;
+let Inst{4-4} = 0b1;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_andi_asl_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = and(#$Ii,asl($Rx32in,#$II))",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_117962 {
+let Inst{2-0} = 0b000;
+let Inst{4-4} = 0b0;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_andi_lsr_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = and(#$Ii,lsr($Rx32in,#$II))",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_117962 {
+let Inst{2-0} = 0b000;
+let Inst{4-4} = 0b1;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_clbaddi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s6_0Imm:$Ii),
+"$Rd32 = add(clb($Rs32),#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_5523416 {
+let Inst{7-5} = 0b000;
+let Inst{31-21} = 0b10001100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S4_clbpaddi : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, s6_0Imm:$Ii),
+"$Rd32 = add(clb($Rss32),#$Ii)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_10188026 {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b10001000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S4_clbpnorm : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32),
+"$Rd32 = normamt($Rss32)",
+S_2op_tc_1_SLOT23, TypeS_2op>, Enc_3742184 {
+let Inst{13-5} = 0b000000000;
+let Inst{31-21} = 0b10001000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S4_extract : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii, u5_0Imm:$II),
+"$Rd32 = extract($Rs32,#$Ii,#$II)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_11930928 {
+let Inst{13-13} = 0b0;
+let Inst{31-23} = 0b100011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S4_extract_rp : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"$Rd32 = extract($Rs32,$Rtt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_15472748 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11001001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S4_extractp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii, u6_0Imm:$II),
+"$Rdd32 = extract($Rss32,#$Ii,#$II)",
+S_2op_tc_2_SLOT23, TypeS_2op>, Enc_9894557 {
+let Inst{31-24} = 0b10001010;
+let prefersSlot3 = 1;
+}
+def S4_extractp_rp : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = extract($Rss32,$Rtt32)",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001110;
+let prefersSlot3 = 1;
+}
+def S4_lsli : HInst<
+(outs IntRegs:$Rd32),
+(ins s6_0Imm:$Ii, IntRegs:$Rt32),
+"$Rd32 = lsl(#$Ii,$Rt32)",
+S_3op_tc_1_SLOT23, TypeS_3op>, Enc_518319 {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110100;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def S4_ntstbit_i : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Pd4 = !tstbit($Rs32,#$Ii)",
+S_2op_tc_2early_SLOT23, TypeS_2op>, Enc_2103742 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10000101001;
+}
+def S4_ntstbit_r : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Pd4 = !tstbit($Rs32,$Rt32)",
+S_3op_tc_2early_SLOT23, TypeS_3op>, Enc_10157519 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000111001;
+}
+def S4_or_andi : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rx32 |= and($Rs32,#$Ii)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_6356866 {
+let Inst{31-22} = 0b1101101000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_or_andix : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Ru32, IntRegs:$Rx32in, s32_0Imm:$Ii),
+"$Rx32 = or($Ru32,and($Rx32in,#$Ii))",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_7504828 {
+let Inst{31-22} = 0b1101101001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_or_ori : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rx32 |= or($Rs32,#$Ii)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_6356866 {
+let Inst{31-22} = 0b1101101010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let InputType = "imm";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_ori_asl_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = or(#$Ii,asl($Rx32in,#$II))",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_117962 {
+let Inst{2-0} = 0b010;
+let Inst{4-4} = 0b0;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_ori_lsr_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = or(#$Ii,lsr($Rx32in,#$II))",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_117962 {
+let Inst{2-0} = 0b010;
+let Inst{4-4} = 0b1;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def S4_parity : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = parity($Rs32,$Rt32)",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_14071773 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11010101111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+}
+def S4_pstorerbf_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memb(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111000000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerbf_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerbfnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memb(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111000000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerbfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memb($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_14044877, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000110000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerbfnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110111000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerbfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4.new) memb($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerbnewf_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memb(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b000;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerbnewf_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b00;
+let Inst{31-21} = 0b00110101101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let opNewValue = 4;
+}
+def S4_pstorerbnewfnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memb(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b100;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerbnewfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memb($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_1737833, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b01000110101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+def S4_pstorerbnewfnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b00;
+let Inst{31-21} = 0b00110111101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let opNewValue = 4;
+}
+def S4_pstorerbnewfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if (!$Pv4.new) memb($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S4_pstorerbnewt_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memb(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b000;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerbnewt_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b00;
+let Inst{31-21} = 0b00110100101;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let opNewValue = 4;
+}
+def S4_pstorerbnewtnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memb(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b100;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+def S4_pstorerbnewtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memb($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_1737833, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b01000010101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+def S4_pstorerbnewtnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b00;
+let Inst{31-21} = 0b00110110101;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let opNewValue = 4;
+}
+def S4_pstorerbnewtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if ($Pv4.new) memb($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2;
+}
+def S4_pstorerbt_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memb(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111000000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerbt_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110100000;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerbtnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memb(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111000000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S2_storerbabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerbtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memb($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_14044877, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000010000;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S2_storerb_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerbtnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110110000;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerbtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4.new) memb($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerdf_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4) memd(#$Ii) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_13715847, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111110000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerdabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerdf_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_9920336, AddrModeRel {
+let Inst{31-21} = 0b00110101110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "reg";
+let BaseOpcode = "S2_storerd_rr";
+}
+def S4_pstorerdfnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4.new) memd(#$Ii) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_13715847, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111110000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerdabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerdfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4.new) memd($Rs32+#$Ii) = $Rtt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_11049656, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000110110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "imm";
+let BaseOpcode = "S2_storerd_io";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def S4_pstorerdfnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if (!$Pv4.new) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_9920336, AddrModeRel {
+let Inst{31-21} = 0b00110111110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "reg";
+let BaseOpcode = "S2_storerd_rr";
+}
+def S4_pstorerdfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"if (!$Pv4.new) memd($Rs32) = $Rtt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerdt_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4) memd(#$Ii) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_13715847, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111110000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerdabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerdt_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_9920336, AddrModeRel {
+let Inst{31-21} = 0b00110100110;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "reg";
+let BaseOpcode = "S2_storerd_rr";
+}
+def S4_pstorerdtnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4.new) memd(#$Ii) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_13715847, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111110000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let BaseOpcode = "S2_storerdabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerdtnew_io : HInst< // Predicated doubleword store, BaseImmOffset addressing; syntax executes when $Pv4.new is true.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4.new) memd($Rs32+#$Ii) = $Rtt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_11049656, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000010110;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "imm";
+let BaseOpcode = "S2_storerd_io";
+let isExtendable = 1;
+let opExtendable = 2; // $Ii is operand 2 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 9;
+let opExtentAlign = 3;
+}
+def S4_pstorerdtnew_rr : HInst< // Predicated doubleword store, BaseRegOffset addressing; syntax executes when $Pv4.new is true.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
+"if ($Pv4.new) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_9920336, AddrModeRel {
+let Inst{31-21} = 0b00110110110;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "reg";
+let BaseOpcode = "S2_storerd_rr";
+}
+def S4_pstorerdtnew_zomap : HInst< // Pseudo, codegen-only TypeMAPPING record for the memd($Rs32) syntax that has no offset operand.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"if ($Pv4.new) memd($Rs32) = $Rtt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerff_abs : HInst< // Predicated halfword store of $Rt32.h, Absolute addressing; syntax executes when $Pv4 is false.
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh(#$Ii) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerfabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1; // $Ii is operand 1 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerff_rr : HInst< // Predicated halfword store of $Rt32.h, BaseRegOffset addressing; syntax executes when $Pv4 is false.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110101011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "reg";
+let BaseOpcode = "S4_storerf_rr";
+}
+def S4_pstorerffnew_abs : HInst< // Predicated halfword store of $Rt32.h, Absolute addressing; syntax executes when $Pv4.new is false.
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh(#$Ii) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111011000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerfabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1; // $Ii is operand 1 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerffnew_io : HInst< // Predicated halfword store of $Rt32.h, BaseImmOffset addressing; syntax executes when $Pv4.new is false.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rs32+#$Ii) = $Rt32.h",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_10979813, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000110011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "imm";
+let BaseOpcode = "S2_storerf_io";
+let isExtendable = 1;
+let opExtendable = 2; // $Ii is operand 2 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S4_pstorerffnew_rr : HInst< // Predicated halfword store of $Rt32.h, BaseRegOffset addressing; syntax executes when $Pv4.new is false.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110111011;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "reg";
+let BaseOpcode = "S4_storerf_rr";
+}
+def S4_pstorerffnew_zomap : HInst< // Pseudo, codegen-only TypeMAPPING record for the memh($Rs32) = $Rt32.h syntax that has no offset operand.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rs32) = $Rt32.h",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerft_abs : HInst< // Predicated halfword store of $Rt32.h, Absolute addressing; syntax executes when $Pv4 is true.
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh(#$Ii) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111011000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerfabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1; // $Ii is operand 1 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerft_rr : HInst< // Predicated halfword store of $Rt32.h, BaseRegOffset addressing; syntax executes when $Pv4 is true.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110100011;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "reg";
+let BaseOpcode = "S4_storerf_rr";
+}
+def S4_pstorerftnew_abs : HInst< // Predicated halfword store of $Rt32.h, Absolute addressing; syntax executes when $Pv4.new is true.
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh(#$Ii) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111011000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let BaseOpcode = "S2_storerfabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1; // $Ii is operand 1 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerftnew_io : HInst< // Predicated halfword store of $Rt32.h, BaseImmOffset addressing; syntax executes when $Pv4.new is true.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rs32+#$Ii) = $Rt32.h",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_10979813, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000010011;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "imm";
+let BaseOpcode = "S2_storerf_io";
+let isExtendable = 1;
+let opExtendable = 2; // $Ii is operand 2 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S4_pstorerftnew_rr : HInst< // Predicated halfword store of $Rt32.h, BaseRegOffset addressing; syntax executes when $Pv4.new is true.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110110011;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "reg";
+let BaseOpcode = "S4_storerf_rr";
+}
+def S4_pstorerftnew_zomap : HInst< // Pseudo, codegen-only TypeMAPPING record for the memh($Rs32) = $Rt32.h syntax that has no offset operand.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rs32) = $Rt32.h",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerhf_abs : HInst< // Predicated halfword store of $Rt32, Absolute addressing; syntax executes when $Pv4 is false.
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111010000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1; // $Ii is operand 1 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerhf_rr : HInst< // Predicated halfword store of $Rt32, BaseRegOffset addressing; syntax executes when $Pv4 is false.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110101010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerhfnew_abs : HInst< // Predicated halfword store of $Rt32, Absolute addressing; syntax executes when $Pv4.new is false.
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111010000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1; // $Ii is operand 1 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerhfnew_io : HInst< // Predicated halfword store of $Rt32, BaseImmOffset addressing; syntax executes when $Pv4.new is false.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_10979813, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000110010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2; // $Ii is operand 2 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S4_pstorerhfnew_rr : HInst< // Predicated halfword store of $Rt32, BaseRegOffset addressing; syntax executes when $Pv4.new is false.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110111010;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerhfnew_zomap : HInst< // Pseudo, codegen-only TypeMAPPING record for the memh($Rs32) = $Rt32 syntax that has no offset operand.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4.new) memh($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerhnewf_abs : HInst< // Predicated new-value halfword store of $Nt8.new, Absolute addressing; syntax executes when $Pv4 is false.
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memh(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b001;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1; // $Ii is operand 1 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2; // $Nt8 is operand 2 (0-based) of the ins list
+}
+def S4_pstorerhnewf_rr : HInst< // Predicated new-value halfword store of $Nt8.new, BaseRegOffset addressing; syntax executes when $Pv4 is false.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b01;
+let Inst{31-21} = 0b00110101101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let opNewValue = 4; // $Nt8 is operand 4 (0-based) of the ins list
+}
+def S4_pstorerhnewfnew_abs : HInst< // Predicated new-value halfword store of $Nt8.new, Absolute addressing; syntax executes when $Pv4.new is false.
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memh(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b101;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1; // $Ii is operand 1 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2; // $Nt8 is operand 2 (0-based) of the ins list
+}
+def S4_pstorerhnewfnew_io : HInst< // Predicated new-value halfword store of $Nt8.new, BaseImmOffset addressing; syntax executes when $Pv4.new is false.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memh($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_6154421, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b01000110101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isExtendable = 1;
+let opExtendable = 2; // $Ii is operand 2 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+let opNewValue = 3; // $Nt8 is operand 3 (0-based) of the ins list
+}
+def S4_pstorerhnewfnew_rr : HInst< // Predicated new-value halfword store of $Nt8.new, BaseRegOffset addressing; syntax executes when $Pv4.new is false.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b01;
+let Inst{31-21} = 0b00110111101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let opNewValue = 4; // $Nt8 is operand 4 (0-based) of the ins list
+}
+def S4_pstorerhnewfnew_zomap : HInst< // Pseudo, codegen-only TypeMAPPING record for the memh($Rs32) = $Nt8.new syntax that has no offset operand.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if (!$Pv4.new) memh($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2; // $Nt8 is operand 2 (0-based) of the ins list
+}
+def S4_pstorerhnewt_abs : HInst< // Predicated new-value halfword store of $Nt8.new, Absolute addressing; syntax executes when $Pv4 is true.
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memh(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b001;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1; // $Ii is operand 1 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2; // $Nt8 is operand 2 (0-based) of the ins list
+}
+def S4_pstorerhnewt_rr : HInst< // Predicated new-value halfword store of $Nt8.new, BaseRegOffset addressing; syntax executes when $Pv4 is true.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b01;
+let Inst{31-21} = 0b00110100101;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let opNewValue = 4; // $Nt8 is operand 4 (0-based) of the ins list
+}
+def S4_pstorerhnewtnew_abs : HInst< // Predicated new-value halfword store of $Nt8.new, Absolute addressing; syntax executes when $Pv4.new is true.
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memh(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b101;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1; // $Ii is operand 1 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2; // $Nt8 is operand 2 (0-based) of the ins list
+}
+def S4_pstorerhnewtnew_io : HInst< // Predicated new-value halfword store of $Nt8.new, BaseImmOffset addressing; syntax executes when $Pv4.new is true.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memh($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_6154421, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b01000010101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isExtendable = 1;
+let opExtendable = 2; // $Ii is operand 2 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+let opNewValue = 3; // $Nt8 is operand 3 (0-based) of the ins list
+}
+def S4_pstorerhnewtnew_rr : HInst< // Predicated new-value halfword store of $Nt8.new, BaseRegOffset addressing; syntax executes when $Pv4.new is true.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b01;
+let Inst{31-21} = 0b00110110101;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let opNewValue = 4; // $Nt8 is operand 4 (0-based) of the ins list
+}
+def S4_pstorerhnewtnew_zomap : HInst< // Pseudo, codegen-only TypeMAPPING record for the memh($Rs32) = $Nt8.new syntax that has no offset operand.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if ($Pv4.new) memh($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2; // $Nt8 is operand 2 (0-based) of the ins list
+}
+def S4_pstorerht_abs : HInst< // Predicated halfword store of $Rt32, Absolute addressing; syntax executes when $Pv4 is true.
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111010000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1; // $Ii is operand 1 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerht_rr : HInst< // Predicated halfword store of $Rt32, BaseRegOffset addressing; syntax executes when $Pv4 is true.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110100010;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerhtnew_abs : HInst< // Predicated halfword store of $Rt32, Absolute addressing; syntax executes when $Pv4.new is true.
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111010000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerhabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1; // $Ii is operand 1 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerhtnew_io : HInst< // Predicated halfword store of $Rt32, BaseImmOffset addressing; syntax executes when $Pv4.new is true.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_10979813, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000010010;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2; // $Ii is operand 2 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 7;
+let opExtentAlign = 1;
+}
+def S4_pstorerhtnew_rr : HInst< // Predicated halfword store of $Rt32, BaseRegOffset addressing; syntax executes when $Pv4.new is true.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110110010;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerhtnew_zomap : HInst< // Pseudo, codegen-only TypeMAPPING record for the memh($Rs32) = $Rt32 syntax that has no offset operand.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4.new) memh($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerif_abs : HInst< // Predicated word store of $Rt32, Absolute addressing; syntax executes when $Pv4 is false.
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memw(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111100000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1; // $Ii is operand 1 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerif_rr : HInst< // Predicated word store of $Rt32, BaseRegOffset addressing; syntax executes when $Pv4 is false.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110101100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerifnew_abs : HInst< // Predicated word store of $Rt32, Absolute addressing; syntax executes when $Pv4.new is false.
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memw(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111100000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1; // $Ii is operand 1 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerifnew_io : HInst< // Predicated word store of $Rt32, BaseImmOffset addressing; syntax executes when $Pv4.new is false.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memw($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_8225953, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000110100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2; // $Ii is operand 2 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+def S4_pstorerifnew_rr : HInst< // Predicated word store of $Rt32, BaseRegOffset addressing; syntax executes when $Pv4.new is false.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if (!$Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110111100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let isNVStorable = 1;
+}
+def S4_pstorerifnew_zomap : HInst< // Pseudo, codegen-only TypeMAPPING record for the memw($Rs32) = $Rt32 syntax that has no offset operand.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if (!$Pv4.new) memw($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def S4_pstorerinewf_abs : HInst< // Predicated new-value word store of $Nt8.new, Absolute addressing; syntax executes when $Pv4 is false.
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memw(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b010;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1; // $Ii is operand 1 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2; // $Nt8 is operand 2 (0-based) of the ins list
+}
+def S4_pstorerinewf_rr : HInst< // Predicated new-value word store of $Nt8.new, BaseRegOffset addressing; syntax executes when $Pv4 is false.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b10;
+let Inst{31-21} = 0b00110101101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let opNewValue = 4; // $Nt8 is operand 4 (0-based) of the ins list
+}
+def S4_pstorerinewfnew_abs : HInst< // Predicated new-value word store of $Nt8.new, Absolute addressing; syntax executes when $Pv4.new is false.
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memw(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b1;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b110;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1; // $Ii is operand 1 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2; // $Nt8 is operand 2 (0-based) of the ins list
+}
+def S4_pstorerinewfnew_io : HInst< // Predicated new-value word store of $Nt8.new, BaseImmOffset addressing; syntax executes when $Pv4.new is false.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memw($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_11224149, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b01000110101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isExtendable = 1;
+let opExtendable = 2; // $Ii is operand 2 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+let opNewValue = 3; // $Nt8 is operand 3 (0-based) of the ins list
+}
+def S4_pstorerinewfnew_rr : HInst< // Predicated new-value word store of $Nt8.new, BaseRegOffset addressing; syntax executes when $Pv4.new is false.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if (!$Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b10;
+let Inst{31-21} = 0b00110111101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let opNewValue = 4; // $Nt8 is operand 4 (0-based) of the ins list
+}
+def S4_pstorerinewfnew_zomap : HInst< // Pseudo, codegen-only TypeMAPPING record for the memw($Rs32) = $Nt8.new syntax that has no offset operand.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if (!$Pv4.new) memw($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2; // $Nt8 is operand 2 (0-based) of the ins list
+}
+def S4_pstorerinewt_abs : HInst< // Predicated new-value word store of $Nt8.new, Absolute addressing; syntax executes when $Pv4 is true.
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memw(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b010;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1; // $Ii is operand 1 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2; // $Nt8 is operand 2 (0-based) of the ins list
+}
+def S4_pstorerinewt_rr : HInst< // Predicated new-value word store of $Nt8.new, BaseRegOffset addressing; syntax executes when $Pv4 is true.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b10;
+let Inst{31-21} = 0b00110100101;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let opNewValue = 4; // $Nt8 is operand 4 (0-based) of the ins list
+}
+def S4_pstorerinewtnew_abs : HInst< // Predicated new-value word store of $Nt8.new, Absolute addressing; syntax executes when $Pv4.new is true.
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memw(#$Ii) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_1774350, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-11} = 0b110;
+let Inst{31-18} = 0b10101111101000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1; // $Ii is operand 1 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2; // $Nt8 is operand 2 (0-based) of the ins list
+}
+def S4_pstorerinewtnew_io : HInst< // Predicated new-value word store of $Nt8.new, BaseImmOffset addressing; syntax executes when $Pv4.new is true.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memw($Rs32+#$Ii) = $Nt8.new",
+V2LDST_tc_st_SLOT0, TypeV2LDST>, Enc_11224149, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b01000010101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isExtendable = 1;
+let opExtendable = 2; // $Ii is operand 2 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+let opNewValue = 3; // $Nt8 is operand 3 (0-based) of the ins list
+}
+def S4_pstorerinewtnew_rr : HInst< // Predicated new-value word store of $Nt8.new, BaseRegOffset addressing; syntax executes when $Pv4.new is true.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"if ($Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_11000933, AddrModeRel {
+let Inst{4-3} = 0b10;
+let Inst{31-21} = 0b00110110101;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let opNewValue = 4; // $Nt8 is operand 4 (0-based) of the ins list
+}
+def S4_pstorerinewtnew_zomap : HInst< // Pseudo, codegen-only TypeMAPPING record for the memw($Rs32) = $Nt8.new syntax that has no offset operand.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
+"if ($Pv4.new) memw($Rs32) = $Nt8.new",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let opNewValue = 2; // $Nt8 is operand 2 (0-based) of the ins list
+}
+def S4_pstorerit_abs : HInst< // Predicated word store of $Rt32, Absolute addressing; syntax executes when $Pv4 is true.
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memw(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b0;
+let Inst{31-18} = 0b10101111100000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1; // $Ii is operand 1 (0-based) of the ins list
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def S4_pstorerit_rr : HInst< // Predicated word store of $Rt32, BaseRegOffset addressing; syntax executes when $Pv4 is true.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110100100;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let isNVStorable = 1;
+}
+// Predicated word store to an absolute address using the new-value predicate:
+// runs when $Pv4.new is true. Differs from S4_pstorerit_abs in Inst{13} and
+// isPredicatedNew.
+def S4_pstoreritnew_abs : HInst<
+(outs),
+(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memw(#$Ii) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_16657398, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-18} = 0b10101111100000;
+let isPredicated = 1;
+let addrMode = Absolute;
+let accessSize = WordAccess;
+let isExtended = 1;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeriabs";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Predicated word store, base + immediate offset: runs when $Pv4.new is true.
+// The offset $Ii (operand 2) is an unsigned, extendable immediate.
+def S4_pstoreritnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memw($Rs32+#$Ii) = $Rt32",
+V2LDST_tc_st_SLOT01, TypeV2LDST>, Enc_8225953, AddrModeRel {
+let Inst{2-2} = 0b0;
+let Inst{31-21} = 0b01000010100;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_io";
+let isNVStorable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 2;
+}
+// Predicated word store, base + (index << #Ii) addressing: runs when $Pv4.new
+// is true. Shares Enc_11940513 with S4_pstorerit_rr.
+def S4_pstoreritnew_rr : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"if ($Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11940513, AddrModeRel {
+let Inst{31-21} = 0b00110110100;
+let isPredicated = 1;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let isNVStorable = 1;
+}
+// Mapping pseudo (isPseudo, isCodeGenOnly) for the offset-less spelling
+// `if ($Pv4.new) memw($Rs32) = $Rt32`; it sets no encoding bits of its own.
+def S4_pstoreritnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
+"if ($Pv4.new) memw($Rs32) = $Rt32",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+// `memd_locked` doubleword store: stores $Rtt32 at $Rs32 and produces the
+// predicate $Pd4 as its only output. Flagged isSoloAX and isPredicateLate.
+// NOTE(review): the itinerary is the load-flavoured ST_tc_ld_SLOT0 although this is a
+// store; presumably intentional for the locked form, confirm against the generator.
+def S4_stored_locked : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"memd_locked($Rs32,$Pd4) = $Rtt32",
+ST_tc_ld_SLOT0, TypeST>, Enc_2921694 {
+let Inst{7-2} = 0b000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10100000111;
+let accessSize = DoubleWordAccess;
+let isSoloAX = 1;
+let mayStore = 1;
+let isPredicateLate = 1;
+}
+// Store-immediate byte: writes #$II to memb($Rs32 + #$Ii). Unpredicated and marked
+// isPredicable; the stored value $II (operand 2) is a signed, 8-bit extendable immediate.
+def S4_storeirb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
+"memb($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_11282123, PredNewRel {
+let Inst{31-21} = 0b00111100000;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirb_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+// Mapping pseudo (isPseudo, isCodeGenOnly) for the offset-less spelling
+// `memb($Rs32) = #$II`; it sets no encoding bits of its own.
+def S4_storeirb_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, s8_0Imm:$II),
+"memb($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+// Predicated store-immediate byte: runs when $Pv4 is false (isPredicatedFalse).
+// The stored value $II (operand 3) is a signed, 6-bit extendable immediate.
+def S4_storeirbf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
+"if (!$Pv4) memb($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_5967898, PredNewRel {
+let Inst{31-21} = 0b00111000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Mapping pseudo (isPseudo, isCodeGenOnly) for the offset-less spelling
+// `if (!$Pv4) memb($Rs32) = #$II`; it sets no encoding bits of its own.
+def S4_storeirbf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if (!$Pv4) memb($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+// Predicated store-immediate byte using the new-value predicate: runs when
+// $Pv4.new is false. $II (operand 3) is a signed, 6-bit extendable immediate.
+def S4_storeirbfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
+"if (!$Pv4.new) memb($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_5967898, PredNewRel {
+let Inst{31-21} = 0b00111001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Mapping pseudo (isPseudo, isCodeGenOnly) for the offset-less spelling
+// `if (!$Pv4.new) memb($Rs32) = #$II`; it sets no encoding bits of its own.
+def S4_storeirbfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if (!$Pv4.new) memb($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+// Predicated store-immediate byte: runs when $Pv4 is true.
+// The stored value $II (operand 3) is a signed, 6-bit extendable immediate.
+def S4_storeirbt_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
+"if ($Pv4) memb($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_5967898, PredNewRel {
+let Inst{31-21} = 0b00111000000;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Mapping pseudo (isPseudo, isCodeGenOnly) for the offset-less spelling
+// `if ($Pv4) memb($Rs32) = #$II`; it sets no encoding bits of its own.
+def S4_storeirbt_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if ($Pv4) memb($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+// Predicated store-immediate byte using the new-value predicate: runs when
+// $Pv4.new is true. $II (operand 3) is a signed, 6-bit extendable immediate.
+def S4_storeirbtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
+"if ($Pv4.new) memb($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_5967898, PredNewRel {
+let Inst{31-21} = 0b00111001000;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirb_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Mapping pseudo (isPseudo, isCodeGenOnly) for the offset-less spelling
+// `if ($Pv4.new) memb($Rs32) = #$II`; it sets no encoding bits of its own.
+def S4_storeirbtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if ($Pv4.new) memb($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+// Store-immediate halfword: writes #$II to memh($Rs32 + #$Ii). Unpredicated and marked
+// isPredicable; the stored value $II (operand 2) is a signed, 8-bit extendable immediate.
+def S4_storeirh_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
+"memh($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_10282127, PredNewRel {
+let Inst{31-21} = 0b00111100001;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirh_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+// Mapping pseudo (isPseudo, isCodeGenOnly) for the offset-less spelling
+// `memh($Rs32) = #$II`; it sets no encoding bits of its own.
+def S4_storeirh_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, s8_0Imm:$II),
+"memh($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+// Predicated store-immediate halfword: runs when $Pv4 is false (isPredicatedFalse).
+// The stored value $II (operand 3) is a signed, 6-bit extendable immediate.
+def S4_storeirhf_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
+"if (!$Pv4) memh($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_4967902, PredNewRel {
+let Inst{31-21} = 0b00111000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Mapping pseudo (isPseudo, isCodeGenOnly) for the offset-less spelling
+// `if (!$Pv4) memh($Rs32) = #$II`; it sets no encoding bits of its own.
+def S4_storeirhf_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if (!$Pv4) memh($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+// Predicated store-immediate halfword using the new-value predicate: runs when
+// $Pv4.new is false. $II (operand 3) is a signed, 6-bit extendable immediate.
+def S4_storeirhfnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
+"if (!$Pv4.new) memh($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_4967902, PredNewRel {
+let Inst{31-21} = 0b00111001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Mapping pseudo (isPseudo, isCodeGenOnly) for the offset-less spelling
+// `if (!$Pv4.new) memh($Rs32) = #$II`; it sets no encoding bits of its own.
+def S4_storeirhfnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if (!$Pv4.new) memh($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+// Predicated store-immediate halfword: runs when $Pv4 is true.
+// The stored value $II (operand 3) is a signed, 6-bit extendable immediate.
+def S4_storeirht_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
+"if ($Pv4) memh($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_4967902, PredNewRel {
+let Inst{31-21} = 0b00111000001;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Mapping pseudo (isPseudo, isCodeGenOnly) for the offset-less spelling
+// `if ($Pv4) memh($Rs32) = #$II`; it sets no encoding bits of its own.
+def S4_storeirht_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if ($Pv4) memh($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+// Predicated store-immediate halfword using the new-value predicate: runs when
+// $Pv4.new is true. $II (operand 3) is a signed, 6-bit extendable immediate.
+def S4_storeirhtnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
+"if ($Pv4.new) memh($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_4967902, PredNewRel {
+let Inst{31-21} = 0b00111001001;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S4_storeirh_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Mapping pseudo (isPseudo, isCodeGenOnly) for the offset-less spelling
+// `if ($Pv4.new) memh($Rs32) = #$II`; it sets no encoding bits of its own.
+def S4_storeirhtnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if ($Pv4.new) memh($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+// Store-immediate word: writes #$II to memw($Rs32 + #$Ii). Unpredicated and marked
+// isPredicable; the stored value $II (operand 2) is a signed, 8-bit extendable immediate.
+def S4_storeiri_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
+"memw($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_9282127, PredNewRel {
+let Inst{31-21} = 0b00111100010;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S4_storeiri_io";
+let isPredicable = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+// Mapping pseudo (isPseudo, isCodeGenOnly) for the offset-less spelling
+// `memw($Rs32) = #$II`; it sets no encoding bits of its own.
+def S4_storeiri_zomap : HInst<
+(outs),
+(ins IntRegs:$Rs32, s8_0Imm:$II),
+"memw($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+// Predicated store-immediate word: runs when $Pv4 is false (isPredicatedFalse).
+// The stored value $II (operand 3) is a signed, 6-bit extendable immediate.
+def S4_storeirif_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
+"if (!$Pv4) memw($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_3967902, PredNewRel {
+let Inst{31-21} = 0b00111000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S4_storeiri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Mapping pseudo (isPseudo, isCodeGenOnly) for the offset-less spelling
+// `if (!$Pv4) memw($Rs32) = #$II`; it sets no encoding bits of its own.
+def S4_storeirif_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if (!$Pv4) memw($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+// Predicated store-immediate word using the new-value predicate: runs when
+// $Pv4.new is false. $II (operand 3) is a signed, 6-bit extendable immediate.
+def S4_storeirifnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
+"if (!$Pv4.new) memw($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_3967902, PredNewRel {
+let Inst{31-21} = 0b00111001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S4_storeiri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Mapping pseudo (isPseudo, isCodeGenOnly) for the offset-less spelling
+// `if (!$Pv4.new) memw($Rs32) = #$II`; it sets no encoding bits of its own.
+def S4_storeirifnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if (!$Pv4.new) memw($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+// Predicated store-immediate word: runs when $Pv4 is true.
+// The stored value $II (operand 3) is a signed, 6-bit extendable immediate.
+def S4_storeirit_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
+"if ($Pv4) memw($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_3967902, PredNewRel {
+let Inst{31-21} = 0b00111000010;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S4_storeiri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Mapping pseudo (isPseudo, isCodeGenOnly) for the offset-less spelling
+// `if ($Pv4) memw($Rs32) = #$II`; it sets no encoding bits of its own.
+def S4_storeirit_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if ($Pv4) memw($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+// Predicated store-immediate word using the new-value predicate: runs when
+// $Pv4.new is true. $II (operand 3) is a signed, 6-bit extendable immediate.
+def S4_storeiritnew_io : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
+"if ($Pv4.new) memw($Rs32+#$Ii) = #$II",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_3967902, PredNewRel {
+let Inst{31-21} = 0b00111001010;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let isPredicatedNew = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S4_storeiri_io";
+let isExtendable = 1;
+let opExtendable = 3;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Mapping pseudo (isPseudo, isCodeGenOnly) for the offset-less spelling
+// `if ($Pv4.new) memw($Rs32) = #$II`; it sets no encoding bits of its own.
+def S4_storeiritnew_zomap : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
+"if ($Pv4.new) memw($Rs32) = #$II",
+PSEUDO, TypeMAPPING> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+// Byte store with absolute-set addressing (`memb($Re32=#$II) = $Rt32`); $Re32 is the
+// only output operand. $II (operand 1) is extendable and decoded in "MustExtend".
+def S4_storerb_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Rt32),
+"memb($Re32=#$II) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_11477246, AddrModeRel {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerb_ap";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Byte store, base + (index << #Ii) addressing. Unpredicated; marked isPredicable
+// and isNVStorable.
+def S4_storerb_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_14046916, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111011000;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let isNVStorable = 1;
+let isPredicable = 1;
+}
+// Byte store, (index << #Ii) + long immediate offset addressing (BaseLongOffset).
+// $II (operand 2) is extendable and decoded in "MustExtend".
+def S4_storerb_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
+"memb($Ru32<<#$Ii+#$II) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_14689096, AddrModeRel, ImmRegShl {
+let Inst{7-7} = 0b1;
+let Inst{31-21} = 0b10101101000;
+let addrMode = BaseLongOffset;
+let accessSize = ByteAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "imm";
+let BaseOpcode = "S4_storerb_ur";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// New-value byte store with absolute-set addressing (`memb($Re32=#$II) = $Nt8.new`).
+// opNewValue is assigned exactly once: 2 is the position of $Nt8, counting the
+// $Re32 output as operand 0 and $II as operand 1.
+// NOTE(review): hasNewValue = 1 is kept as generated, so it pairs with opNewValue = 2;
+// confirm that flag is intended on a record that is also a new-value consumer.
+def S4_storerbnew_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Nt8),
+"memb($Re32=#$II) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_14193700, AddrModeRel {
+let Inst{7-6} = 0b10;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b10101011101;
+let hasNewValue = 1;
+let addrMode = AbsoluteSet;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "S2_storerb_ap";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+// New-value byte store, base + (index << #Ii) addressing; stores $Nt8.new
+// (opNewValue = 3). Unpredicated and marked isPredicable.
+def S4_storerbnew_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_5486172, AddrModeRel {
+let Inst{6-3} = 0b0000;
+let Inst{31-21} = 0b00111011101;
+let addrMode = BaseRegOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let InputType = "reg";
+let BaseOpcode = "S4_storerb_rr";
+let isPredicable = 1;
+let opNewValue = 3;
+}
+// New-value byte store, (index << #Ii) + long immediate offset addressing; stores
+// $Nt8.new (opNewValue = 3). $II (operand 2) is extendable, decoded in "MustExtend".
+def S4_storerbnew_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Nt8),
+"memb($Ru32<<#$Ii+#$II) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10076500, AddrModeRel {
+let Inst{7-7} = 0b1;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b10101101101;
+let addrMode = BaseLongOffset;
+let accessSize = ByteAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerb";
+let BaseOpcode = "S4_storerb_ur";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+// Doubleword store with absolute-set addressing (`memd($Re32=#$II) = $Rtt32`); $Re32
+// is the only output operand. $II (operand 1) is extendable, decoded in "MustExtend".
+def S4_storerd_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, DoubleRegs:$Rtt32),
+"memd($Re32=#$II) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_8131399 {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let BaseOpcode = "S4_storerd_ap";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Doubleword store, base + (index << #Ii) addressing. Unpredicated and marked
+// isPredicable.
+def S4_storerd_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
+"memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_9772987, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111011110;
+let addrMode = BaseRegOffset;
+let accessSize = DoubleWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "reg";
+let BaseOpcode = "S2_storerd_rr";
+let isPredicable = 1;
+}
+// Doubleword store, (index << #Ii) + long immediate offset addressing
+// (BaseLongOffset). $II (operand 2) is extendable and decoded in "MustExtend".
+def S4_storerd_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, DoubleRegs:$Rtt32),
+"memd($Ru32<<#$Ii+#$II) = $Rtt32",
+ST_tc_st_SLOT01, TypeST>, Enc_12848507, AddrModeRel, ImmRegShl {
+let Inst{7-7} = 0b1;
+let Inst{31-21} = 0b10101101110;
+let addrMode = BaseLongOffset;
+let accessSize = DoubleWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerd";
+let InputType = "imm";
+let BaseOpcode = "S2_storerd_ur";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Halfword store of `$Rt32.h` with absolute-set addressing (`memh($Re32=#$II)`);
+// $Re32 is the only output operand. $II (operand 1) is extendable ("MustExtend").
+def S4_storerf_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Rt32),
+"memh($Re32=#$II) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_11477246 {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let BaseOpcode = "S4_storerf_ap";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Halfword store of `$Rt32.h`, base + (index << #Ii) addressing. Unpredicated and
+// marked isPredicable.
+def S4_storerf_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_14046916, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111011011;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "reg";
+let BaseOpcode = "S4_storerf_rr";
+let isPredicable = 1;
+}
+// Halfword store of `$Rt32.h`, (index << #Ii) + long immediate offset addressing
+// (BaseLongOffset). $II (operand 2) is extendable and decoded in "MustExtend".
+// BaseOpcode names this record's own _ur form, as the other *_ur stores do, so it
+// does not share a BaseOpcode with the predicable S4_storerf_rr record.
+// NOTE(review): presumably BaseOpcode keys the predication relation tables; confirm
+// against the InstrMapping definitions and the generator before regenerating.
+def S4_storerf_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
+"memh($Ru32<<#$Ii+#$II) = $Rt32.h",
+ST_tc_st_SLOT01, TypeST>, Enc_14689096, AddrModeRel, ImmRegShl {
+let Inst{7-7} = 0b1;
+let Inst{31-21} = 0b10101101011;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerf";
+let InputType = "imm";
+let BaseOpcode = "S4_storerf_ur";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Halfword store with absolute-set addressing (`memh($Re32=#$II) = $Rt32`); $Re32 is
+// the only output operand. $II (operand 1) is extendable and decoded in "MustExtend".
+def S4_storerh_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Rt32),
+"memh($Re32=#$II) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_11477246, AddrModeRel {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storerh_ap";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Halfword store, base + (index << #Ii) addressing. Unpredicated; marked
+// isPredicable and isNVStorable.
+def S4_storerh_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_14046916, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111011010;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let isNVStorable = 1;
+let isPredicable = 1;
+}
+// Halfword store, (index << #Ii) + long immediate offset addressing (BaseLongOffset).
+// $II (operand 2) is extendable and decoded in "MustExtend".
+def S4_storerh_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
+"memh($Ru32<<#$Ii+#$II) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_14689096, AddrModeRel, ImmRegShl {
+let Inst{7-7} = 0b1;
+let Inst{31-21} = 0b10101101010;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "imm";
+let BaseOpcode = "S2_storerh_ur";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// New-value halfword store with absolute-set addressing (`memh($Re32=#$II) = $Nt8.new`).
+// opNewValue is assigned exactly once: 2 is the position of $Nt8, counting the
+// $Re32 output as operand 0 and $II as operand 1.
+// NOTE(review): hasNewValue = 1 is kept as generated, so it pairs with opNewValue = 2;
+// confirm that flag is intended on a record that is also a new-value consumer.
+def S4_storerhnew_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Nt8),
+"memh($Re32=#$II) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_14193700, AddrModeRel {
+let Inst{7-6} = 0b10;
+let Inst{13-11} = 0b001;
+let Inst{31-21} = 0b10101011101;
+let hasNewValue = 1;
+let addrMode = AbsoluteSet;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "S2_storerh_ap";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+// New-value halfword store, base + (index << #Ii) addressing; stores $Nt8.new
+// (opNewValue = 3). Unpredicated and marked isPredicable.
+def S4_storerhnew_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_5486172, AddrModeRel {
+let Inst{6-3} = 0b0001;
+let Inst{31-21} = 0b00111011101;
+let addrMode = BaseRegOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let InputType = "reg";
+let BaseOpcode = "S2_storerh_rr";
+let isPredicable = 1;
+let opNewValue = 3;
+}
+// New-value halfword store, (index << #Ii) + long immediate offset addressing; stores
+// $Nt8.new (opNewValue = 3). $II (operand 2) is extendable, decoded in "MustExtend".
+def S4_storerhnew_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Nt8),
+"memh($Ru32<<#$Ii+#$II) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10076500, AddrModeRel {
+let Inst{7-7} = 0b1;
+let Inst{12-11} = 0b01;
+let Inst{31-21} = 0b10101101101;
+let addrMode = BaseLongOffset;
+let accessSize = HalfWordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storerh";
+let BaseOpcode = "S2_storerh_ur";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+// Word store with absolute-set addressing (`memw($Re32=#$II) = $Rt32`); $Re32 is the
+// only output operand. $II (operand 1) is extendable and decoded in "MustExtend".
+def S4_storeri_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Rt32),
+"memw($Re32=#$II) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_11477246, AddrModeRel {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10101011100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = AbsoluteSet;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let BaseOpcode = "S2_storeri_ap";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// Word store, base + (index << #Ii) addressing. Unpredicated; marked isPredicable
+// and isNVStorable.
+def S4_storeri_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
+"memw($Rs32+$Ru32<<#$Ii) = $Rt32",
+V4LDST_tc_st_SLOT01, TypeST>, Enc_14046916, AddrModeRel, ImmRegShl {
+let Inst{6-5} = 0b00;
+let Inst{31-21} = 0b00111011100;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let isNVStorable = 1;
+let isPredicable = 1;
+}
+// Word store, (index << #Ii) + long immediate offset addressing (BaseLongOffset).
+// $II (operand 2) is extendable and decoded in "MustExtend".
+def S4_storeri_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
+"memw($Ru32<<#$Ii+#$II) = $Rt32",
+ST_tc_st_SLOT01, TypeST>, Enc_14689096, AddrModeRel, ImmRegShl {
+let Inst{7-7} = 0b1;
+let Inst{31-21} = 0b10101101100;
+let addrMode = BaseLongOffset;
+let accessSize = WordAccess;
+let isExtended = 1;
+let mayStore = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "imm";
+let BaseOpcode = "S2_storeri_ur";
+let isNVStorable = 1;
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// New-value word store with absolute-set addressing (`memw($Re32=#$II) = $Nt8.new`).
+// opNewValue is assigned exactly once: 2 is the position of $Nt8, counting the
+// $Re32 output as operand 0 and $II as operand 1.
+// NOTE(review): hasNewValue = 1 is kept as generated, so it pairs with opNewValue = 2;
+// confirm that flag is intended on a record that is also a new-value consumer.
+def S4_storerinew_ap : HInst<
+(outs IntRegs:$Re32),
+(ins u32_0Imm:$II, IntRegs:$Nt8),
+"memw($Re32=#$II) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_14193700, AddrModeRel {
+let Inst{7-6} = 0b10;
+let Inst{13-11} = 0b010;
+let Inst{31-21} = 0b10101011101;
+let hasNewValue = 1;
+let addrMode = AbsoluteSet;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "S2_storeri_ap";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 2;
+}
+// New-value word store, base + (index << #Ii) addressing; stores $Nt8.new
+// (opNewValue = 3). Unpredicated and marked isPredicable.
+def S4_storerinew_rr : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
+"memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
+V4LDST_tc_st_SLOT0, TypeST>, Enc_5486172, AddrModeRel {
+let Inst{6-3} = 0b0010;
+let Inst{31-21} = 0b00111011101;
+let addrMode = BaseRegOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let InputType = "reg";
+let BaseOpcode = "S2_storeri_rr";
+let isPredicable = 1;
+let opNewValue = 3;
+}
+// New-value word store, (index << #Ii) + long immediate offset addressing; stores
+// $Nt8.new (opNewValue = 3). $II (operand 2) is extendable, decoded in "MustExtend".
+def S4_storerinew_ur : HInst<
+(outs),
+(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Nt8),
+"memw($Ru32<<#$Ii+#$II) = $Nt8.new",
+NCJ_tc_3or4stall_SLOT0, TypeST>, Enc_10076500, AddrModeRel {
+let Inst{7-7} = 0b1;
+let Inst{12-11} = 0b10;
+let Inst{31-21} = 0b10101101101;
+let addrMode = BaseLongOffset;
+let accessSize = WordAccess;
+let isNVStore = 1;
+let isExtended = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let CextOpcode = "S2_storeri";
+let BaseOpcode = "S2_storeri_ur";
+let DecoderNamespace = "MustExtend";
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+let opNewValue = 3;
+}
+// `$Rd32 = add($Rs32,sub(#$Ii,$Ru32))`. The immediate $Ii (operand 2) is a signed,
+// 6-bit extendable value; the record sets prefersSlot3.
+def S4_subaddi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Ru32),
+"$Rd32 = add($Rs32,sub(#$Ii,$Ru32))",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_6495334 {
+let Inst{31-23} = 0b110110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+// `$Rx32 = sub(#$Ii,asl($Rx32in,#$II))`: read-modify-write on $Rx32, with the input
+// tied to the output by the Constraints string. $Ii (operand 1) is the unsigned,
+// 8-bit extendable immediate.
+def S4_subi_asl_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = sub(#$Ii,asl($Rx32in,#$II))",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_117962 {
+let Inst{2-0} = 0b110;
+let Inst{4-4} = 0b0;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+// `$Rx32 = sub(#$Ii,lsr($Rx32in,#$II))`: same shape as S4_subi_asl_ri with Inst{4}
+// set; $Rx32in is tied to $Rx32 by the Constraints string. $Ii (operand 1) is the
+// unsigned, 8-bit extendable immediate.
+def S4_subi_lsr_ri : HInst<
+(outs IntRegs:$Rx32),
+(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
+"$Rx32 = sub(#$Ii,lsr($Rx32in,#$II))",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_117962 {
+let Inst{2-0} = 0b110;
+let Inst{4-4} = 0b1;
+let Inst{31-24} = 0b11011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 0;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+let Constraints = "$Rx32 = $Rx32in";
+}
+// `$Rdd32 = vrcrotate($Rss32,$Rt32,#$Ii)` with a 2-bit unsigned immediate; the
+// record sets prefersSlot3.
+def S4_vrcrotate : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rdd32 = vrcrotate($Rss32,$Rt32,#$Ii)",
+S_3op_tc_3x_SLOT23, TypeS_3op>, Enc_114098 {
+let Inst{7-6} = 0b11;
+let Inst{31-21} = 0b11000011110;
+let prefersSlot3 = 1;
+}
+// Accumulating form `$Rxx32 += vrcrotate($Rss32,$Rt32,#$Ii)`; the accumulator input
+// $Rxx32in is tied to the output $Rxx32 by the Constraints string.
+def S4_vrcrotate_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32, u2_0Imm:$Ii),
+"$Rxx32 += vrcrotate($Rss32,$Rt32,#$Ii)",
+S_3op_tc_3x_SLOT23, TypeS_3op>, Enc_13114546 {
+let Inst{7-6} = 0b00;
+let Inst{31-21} = 0b11001011101;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+// `$Rdd32 = vxaddsubh($Rss32,$Rtt32):sat` on DoubleRegs operands; lists USR_OVF
+// in Defs.
+def S4_vxaddsubh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vxaddsubh($Rss32,$Rtt32):sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001010;
+let Defs = [USR_OVF];
+}
+// `vxaddsubh` with the `:rnd:>>1:sat` modifiers; lists USR_OVF in Defs and sets
+// prefersSlot3.
+def S4_vxaddsubhr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vxaddsubh($Rss32,$Rtt32):rnd:>>1:sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+// `$Rdd32 = vxaddsubw($Rss32,$Rtt32):sat` on DoubleRegs operands; lists USR_OVF
+// in Defs.
+def S4_vxaddsubw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vxaddsubw($Rss32,$Rtt32):sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001010;
+let Defs = [USR_OVF];
+}
+// `$Rdd32 = vxsubaddh($Rss32,$Rtt32):sat` on DoubleRegs operands; lists USR_OVF
+// in Defs.
+def S4_vxsubaddh : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vxsubaddh($Rss32,$Rtt32):sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001010;
+let Defs = [USR_OVF];
+}
+// `vxsubaddh` with the `:rnd:>>1:sat` modifiers; lists USR_OVF in Defs and sets
+// prefersSlot3.
+def S4_vxsubaddhr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vxsubaddh($Rss32,$Rtt32):rnd:>>1:sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001110;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+// `$Rdd32 = vxsubaddw($Rss32,$Rtt32):sat` on DoubleRegs operands; lists USR_OVF
+// in Defs.
+def S4_vxsubaddw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vxsubaddw($Rss32,$Rtt32):sat",
+S_3op_tc_2_SLOT23, TypeS_3op>, Enc_8333157 {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000001010;
+let Defs = [USR_OVF];
+}
+def S5_asrhub_rnd_sat // vasrhub with the ":raw" asm suffix; Defs includes USR_OVF.
+  : HInst<(outs IntRegs:$Rd32),
+          (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+          "$Rd32 = vasrhub($Rss32,#$Ii):raw",
+          S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8038806, Requires<[HasV5T]> {
+  let Inst{31-21} = 0b10001000011;
+  let Inst{13-12} = 0b00;
+  let Inst{7-5} = 0b100;
+  let Defs = [USR_OVF];
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let prefersSlot3 = 1;
+}
+def S5_asrhub_rnd_sat_goodsyntax // isPseudo record with the ":rnd:sat" spelling; no Enc_* class.
+  : HInst<(outs IntRegs:$Rd32),
+          (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+          "$Rd32 = vasrhub($Rss32,#$Ii):rnd:sat",
+          S_2op_tc_2_SLOT23, TypeS_2op>, Requires<[HasV5T]> {
+  let isPseudo = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+def S5_asrhub_sat // vasrhub, ":sat" form; Defs includes USR_OVF.
+  : HInst<(outs IntRegs:$Rd32),
+          (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+          "$Rd32 = vasrhub($Rss32,#$Ii):sat",
+          S_2op_tc_2_SLOT23, TypeS_2op>, Enc_8038806, Requires<[HasV5T]> {
+  let Inst{31-21} = 0b10001000011;
+  let Inst{13-12} = 0b00;
+  let Inst{7-5} = 0b101;
+  let Defs = [USR_OVF];
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let prefersSlot3 = 1;
+}
+def S5_popcountp // popcount: DoubleRegs source, IntRegs result.
+  : HInst<(outs IntRegs:$Rd32),
+          (ins DoubleRegs:$Rss32),
+          "$Rd32 = popcount($Rss32)",
+          S_2op_tc_2_SLOT23, TypeS_2op>, Enc_3742184, Requires<[HasV5T]> {
+  let Inst{31-21} = 0b10001000011;
+  let Inst{13-5} = 0b000000011;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let prefersSlot3 = 1;
+}
+def S5_vasrhrnd // vasrh with the ":raw" asm suffix.
+  : HInst<(outs DoubleRegs:$Rdd32),
+          (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+          "$Rdd32 = vasrh($Rss32,#$Ii):raw",
+          S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2082775, Requires<[HasV5T]> {
+  let Inst{31-21} = 0b10000000001;
+  let Inst{13-12} = 0b00;
+  let Inst{7-5} = 0b000;
+  let prefersSlot3 = 1;
+}
+def S5_vasrhrnd_goodsyntax // isPseudo record with the ":rnd" spelling of vasrh; no Enc_* class.
+  : HInst<(outs DoubleRegs:$Rdd32),
+          (ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
+          "$Rdd32 = vasrh($Rss32,#$Ii):rnd",
+          S_2op_tc_1_SLOT23, TypeS_2op>, Requires<[HasV5T]> {
+  let isPseudo = 1;
+}
+def S6_rol_i_p // rol by a u6 immediate on DoubleRegs; requires HasV60T.
+  : HInst<(outs DoubleRegs:$Rdd32),
+          (ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
+          "$Rdd32 = rol($Rss32,#$Ii)",
+          S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4231995, Requires<[HasV60T]> {
+  let Inst{31-21} = 0b10000000000;
+  let Inst{7-5} = 0b011;
+}
+def S6_rol_i_p_acc // "+=" form of rol on DoubleRegs; Rxx32 is tied to Rxx32in.
+  : HInst<(outs DoubleRegs:$Rxx32),
+          (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+          "$Rxx32 += rol($Rss32,#$Ii)",
+          S_2op_tc_1_SLOT23, TypeS_2op>, Enc_8497723, Requires<[HasV60T]> {
+  let Inst{31-21} = 0b10000010000;
+  let Inst{7-5} = 0b111;
+  let Constraints = "$Rxx32 = $Rxx32in";
+  let prefersSlot3 = 1;
+}
+def S6_rol_i_p_and // "&=" form of rol on DoubleRegs; Rxx32 is tied to Rxx32in.
+  : HInst<(outs DoubleRegs:$Rxx32),
+          (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+          "$Rxx32 &= rol($Rss32,#$Ii)",
+          S_2op_tc_1_SLOT23, TypeS_2op>, Enc_8497723, Requires<[HasV60T]> {
+  let Inst{31-21} = 0b10000010010;
+  let Inst{7-5} = 0b011;
+  let Constraints = "$Rxx32 = $Rxx32in";
+  let prefersSlot3 = 1;
+}
+def S6_rol_i_p_nac // "-=" form of rol on DoubleRegs; Rxx32 is tied to Rxx32in.
+  : HInst<(outs DoubleRegs:$Rxx32),
+          (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+          "$Rxx32 -= rol($Rss32,#$Ii)",
+          S_2op_tc_1_SLOT23, TypeS_2op>, Enc_8497723, Requires<[HasV60T]> {
+  let Inst{31-21} = 0b10000010000;
+  let Inst{7-5} = 0b011;
+  let Constraints = "$Rxx32 = $Rxx32in";
+  let prefersSlot3 = 1;
+}
+def S6_rol_i_p_or // "|=" form of rol on DoubleRegs; Rxx32 is tied to Rxx32in.
+  : HInst<(outs DoubleRegs:$Rxx32),
+          (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+          "$Rxx32 |= rol($Rss32,#$Ii)",
+          S_2op_tc_1_SLOT23, TypeS_2op>, Enc_8497723, Requires<[HasV60T]> {
+  let Inst{31-21} = 0b10000010010;
+  let Inst{7-5} = 0b111;
+  let Constraints = "$Rxx32 = $Rxx32in";
+  let prefersSlot3 = 1;
+}
+def S6_rol_i_p_xacc // "^=" form of rol on DoubleRegs; Rxx32 is tied to Rxx32in.
+  : HInst<(outs DoubleRegs:$Rxx32),
+          (ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
+          "$Rxx32 ^= rol($Rss32,#$Ii)",
+          S_2op_tc_1_SLOT23, TypeS_2op>, Enc_8497723, Requires<[HasV60T]> {
+  let Inst{31-21} = 0b10000010100;
+  let Inst{7-5} = 0b011;
+  let Constraints = "$Rxx32 = $Rxx32in";
+  let prefersSlot3 = 1;
+}
+def S6_rol_i_r // rol by a u5 immediate on IntRegs; requires HasV60T.
+  : HInst<(outs IntRegs:$Rd32),
+          (ins IntRegs:$Rs32, u5_0Imm:$Ii),
+          "$Rd32 = rol($Rs32,#$Ii)",
+          S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2771456, Requires<[HasV60T]> {
+  let Inst{31-21} = 0b10001100000;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b011;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+def S6_rol_i_r_acc // "+=" form of rol on IntRegs; Rx32 is tied to Rx32in.
+  : HInst<(outs IntRegs:$Rx32),
+          (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+          "$Rx32 += rol($Rs32,#$Ii)",
+          S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2410156, Requires<[HasV60T]> {
+  let Inst{31-21} = 0b10001110000;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b111;
+  let Constraints = "$Rx32 = $Rx32in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let prefersSlot3 = 1;
+}
+def S6_rol_i_r_and // "&=" form of rol on IntRegs; Rx32 is tied to Rx32in.
+  : HInst<(outs IntRegs:$Rx32),
+          (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+          "$Rx32 &= rol($Rs32,#$Ii)",
+          S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2410156, Requires<[HasV60T]> {
+  let Inst{31-21} = 0b10001110010;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b011;
+  let Constraints = "$Rx32 = $Rx32in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let prefersSlot3 = 1;
+}
+def S6_rol_i_r_nac // "-=" form of rol on IntRegs; Rx32 is tied to Rx32in.
+  : HInst<(outs IntRegs:$Rx32),
+          (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+          "$Rx32 -= rol($Rs32,#$Ii)",
+          S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2410156, Requires<[HasV60T]> {
+  let Inst{31-21} = 0b10001110000;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b011;
+  let Constraints = "$Rx32 = $Rx32in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let prefersSlot3 = 1;
+}
+def S6_rol_i_r_or // "|=" form of rol on IntRegs; Rx32 is tied to Rx32in.
+  : HInst<(outs IntRegs:$Rx32),
+          (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+          "$Rx32 |= rol($Rs32,#$Ii)",
+          S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2410156, Requires<[HasV60T]> {
+  let Inst{31-21} = 0b10001110010;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b111;
+  let Constraints = "$Rx32 = $Rx32in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let prefersSlot3 = 1;
+}
+def S6_rol_i_r_xacc // "^=" form of rol on IntRegs; Rx32 is tied to Rx32in.
+  : HInst<(outs IntRegs:$Rx32),
+          (ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
+          "$Rx32 ^= rol($Rs32,#$Ii)",
+          S_2op_tc_1_SLOT23, TypeS_2op>, Enc_2410156, Requires<[HasV60T]> {
+  let Inst{31-21} = 0b10001110100;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b011;
+  let Constraints = "$Rx32 = $Rx32in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let prefersSlot3 = 1;
+}
+def S6_vsplatrbp // vsplatb from IntRegs into DoubleRegs; requires HasV62T.
+  : HInst<(outs DoubleRegs:$Rdd32),
+          (ins IntRegs:$Rs32),
+          "$Rdd32 = vsplatb($Rs32)",
+          S_2op_tc_1_SLOT23, TypeS_2op>, Enc_4030179, Requires<[HasV62T]> {
+  let Inst{31-21} = 0b10000100010;
+  let Inst{13-5} = 0b000000100;
+}
+def S6_vtrunehb_ppp // vtrunehb on two DoubleRegs sources; requires HasV62T.
+  : HInst<(outs DoubleRegs:$Rdd32),
+          (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+          "$Rdd32 = vtrunehb($Rss32,$Rtt32)",
+          S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8333157, Requires<[HasV62T]> {
+  let Inst{31-21} = 0b11000001100;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b011;
+}
+def S6_vtrunohb_ppp // vtrunohb on two DoubleRegs sources; requires HasV62T.
+  : HInst<(outs DoubleRegs:$Rdd32),
+          (ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+          "$Rdd32 = vtrunohb($Rss32,$Rtt32)",
+          S_3op_tc_1_SLOT23, TypeS_3op>, Enc_8333157, Requires<[HasV62T]> {
+  let Inst{31-21} = 0b11000001100;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b101;
+}
+def SA1_addi // TypeSUBINSN add with an extendable immediate (operand 2); Rx16 tied to Rx16in.
+  : HInst<(outs GeneralSubRegs:$Rx16),
+          (ins IntRegs:$Rx16in, s32_0Imm:$Ii),
+          "$Rx16 = add($Rx16in,#$Ii)",
+          PSEUDO, TypeSUBINSN>, Enc_3974695 {
+  let Inst{12-11} = 0b00;
+  let Constraints = "$Rx16 = $Rx16in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isExtendable = 1;
+  let opExtendable = 2;
+  let isExtentSigned = 1;
+  let opExtentBits = 7;
+  let opExtentAlign = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_addrx // TypeSUBINSN register add; Rx16 tied to Rx16in.
+  : HInst<(outs GeneralSubRegs:$Rx16),
+          (ins IntRegs:$Rx16in, GeneralSubRegs:$Rs16),
+          "$Rx16 = add($Rx16in,$Rs16)",
+          PSEUDO, TypeSUBINSN>, Enc_6135183 {
+  let Inst{12-8} = 0b11000;
+  let Constraints = "$Rx16 = $Rx16in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_addsp // TypeSUBINSN "add(r29,#Ii)"; Uses lists R29.
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins u6_2Imm:$Ii),
+          "$Rd16 = add(r29,#$Ii)",
+          PSEUDO, TypeSUBINSN>, Enc_176263 {
+  let Inst{12-10} = 0b011;
+  let Uses = [R29];
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_and1 // TypeSUBINSN "and(Rs16,#1)".
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins GeneralSubRegs:$Rs16),
+          "$Rd16 = and($Rs16,#1)",
+          PSEUDO, TypeSUBINSN>, Enc_14939491 {
+  let Inst{12-8} = 0b10010;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_clrf // TypeSUBINSN "if (!p0) Rd16 = #0"; Uses lists P0.
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins),
+          "if (!p0) $Rd16 = #0",
+          PSEUDO, TypeSUBINSN>, Enc_1451363 {
+  let Inst{12-4} = 0b110100111;
+  let Uses = [P0];
+  let isPredicated = 1;
+  let isPredicatedFalse = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_clrfnew // TypeSUBINSN "if (!p0.new) Rd16 = #0"; Uses lists P0.
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins),
+          "if (!p0.new) $Rd16 = #0",
+          PSEUDO, TypeSUBINSN>, Enc_1451363 {
+  let Inst{12-4} = 0b110100101;
+  let Uses = [P0];
+  let isPredicated = 1;
+  let isPredicatedFalse = 1;
+  let isPredicatedNew = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_clrt // TypeSUBINSN "if (p0) Rd16 = #0"; Uses lists P0.
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins),
+          "if (p0) $Rd16 = #0",
+          PSEUDO, TypeSUBINSN>, Enc_1451363 {
+  let Inst{12-4} = 0b110100110;
+  let Uses = [P0];
+  let isPredicated = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_clrtnew // TypeSUBINSN "if (p0.new) Rd16 = #0"; Uses lists P0.
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins),
+          "if (p0.new) $Rd16 = #0",
+          PSEUDO, TypeSUBINSN>, Enc_1451363 {
+  let Inst{12-4} = 0b110100100;
+  let Uses = [P0];
+  let isPredicated = 1;
+  let isPredicatedNew = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_cmpeqi // TypeSUBINSN "p0 = cmp.eq(Rs16,#Ii)"; no outs, Defs lists P0.
+  : HInst<(outs),
+          (ins GeneralSubRegs:$Rs16, u2_0Imm:$Ii),
+          "p0 = cmp.eq($Rs16,#$Ii)",
+          PSEUDO, TypeSUBINSN>, Enc_2079016 {
+  let Inst{12-8} = 0b11001;
+  let Inst{3-2} = 0b00;
+  let Defs = [P0];
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_combine0i // TypeSUBINSN "combine(#0,#Ii)" into GeneralDoubleLow8Regs.
+  : HInst<(outs GeneralDoubleLow8Regs:$Rdd8),
+          (ins u2_0Imm:$Ii),
+          "$Rdd8 = combine(#0,#$Ii)",
+          PSEUDO, TypeSUBINSN>, Enc_15946706 {
+  let Inst{12-7} = 0b111000;
+  let Inst{4-3} = 0b00;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_combine1i // TypeSUBINSN "combine(#1,#Ii)" into GeneralDoubleLow8Regs.
+  : HInst<(outs GeneralDoubleLow8Regs:$Rdd8),
+          (ins u2_0Imm:$Ii),
+          "$Rdd8 = combine(#1,#$Ii)",
+          PSEUDO, TypeSUBINSN>, Enc_15946706 {
+  let Inst{12-7} = 0b111000;
+  let Inst{4-3} = 0b01;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_combine2i // TypeSUBINSN "combine(#2,#Ii)" into GeneralDoubleLow8Regs.
+  : HInst<(outs GeneralDoubleLow8Regs:$Rdd8),
+          (ins u2_0Imm:$Ii),
+          "$Rdd8 = combine(#2,#$Ii)",
+          PSEUDO, TypeSUBINSN>, Enc_15946706 {
+  let Inst{12-7} = 0b111000;
+  let Inst{4-3} = 0b10;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_combine3i // TypeSUBINSN "combine(#3,#Ii)" into GeneralDoubleLow8Regs.
+  : HInst<(outs GeneralDoubleLow8Regs:$Rdd8),
+          (ins u2_0Imm:$Ii),
+          "$Rdd8 = combine(#3,#$Ii)",
+          PSEUDO, TypeSUBINSN>, Enc_15946706 {
+  let Inst{12-7} = 0b111000;
+  let Inst{4-3} = 0b11;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_combinerz // TypeSUBINSN "combine(Rs16,#0)"; Inst{3} is 1 here.
+  : HInst<(outs GeneralDoubleLow8Regs:$Rdd8),
+          (ins GeneralSubRegs:$Rs16),
+          "$Rdd8 = combine($Rs16,#0)",
+          PSEUDO, TypeSUBINSN>, Enc_10501894 {
+  let Inst{12-8} = 0b11101;
+  let Inst{3-3} = 0b1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_combinezr // TypeSUBINSN "combine(#0,Rs16)"; Inst{3} is 0 here.
+  : HInst<(outs GeneralDoubleLow8Regs:$Rdd8),
+          (ins GeneralSubRegs:$Rs16),
+          "$Rdd8 = combine(#0,$Rs16)",
+          PSEUDO, TypeSUBINSN>, Enc_10501894 {
+  let Inst{12-8} = 0b11101;
+  let Inst{3-3} = 0b0;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_dec // TypeSUBINSN add whose immediate is the n1Const operand $n1.
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins GeneralSubRegs:$Rs16, n1Const:$n1),
+          "$Rd16 = add($Rs16,#$n1)",
+          PSEUDO, TypeSUBINSN>, Enc_10597934 {
+  let Inst{12-8} = 0b10011;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_inc // TypeSUBINSN "add(Rs16,#1)".
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins GeneralSubRegs:$Rs16),
+          "$Rd16 = add($Rs16,#1)",
+          PSEUDO, TypeSUBINSN>, Enc_14939491 {
+  let Inst{12-8} = 0b10001;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_seti // TypeSUBINSN immediate load; operand 1 is extendable (unsigned, 6 bits).
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins u32_0Imm:$Ii),
+          "$Rd16 = #$Ii",
+          PSEUDO, TypeSUBINSN>, Enc_2176383 {
+  let Inst{12-10} = 0b010;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isExtendable = 1;
+  let opExtendable = 1;
+  let isExtentSigned = 0;
+  let opExtentBits = 6;
+  let opExtentAlign = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_setin1 // TypeSUBINSN immediate load of the n1Const operand $n1.
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins n1Const:$n1),
+          "$Rd16 = #$n1",
+          PSEUDO, TypeSUBINSN>, Enc_13336212 {
+  let Inst{12-4} = 0b110100000;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_sxtb // TypeSUBINSN "sxtb(Rs16)".
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins GeneralSubRegs:$Rs16),
+          "$Rd16 = sxtb($Rs16)",
+          PSEUDO, TypeSUBINSN>, Enc_14939491 {
+  let Inst{12-8} = 0b10101;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_sxth // TypeSUBINSN "sxth(Rs16)".
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins GeneralSubRegs:$Rs16),
+          "$Rd16 = sxth($Rs16)",
+          PSEUDO, TypeSUBINSN>, Enc_14939491 {
+  let Inst{12-8} = 0b10100;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_tfr // TypeSUBINSN register transfer "Rd16 = Rs16".
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins GeneralSubRegs:$Rs16),
+          "$Rd16 = $Rs16",
+          PSEUDO, TypeSUBINSN>, Enc_14939491 {
+  let Inst{12-8} = 0b10000;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_zxtb // TypeSUBINSN whose asm string is spelled "and(Rs16,#255)".
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins GeneralSubRegs:$Rs16),
+          "$Rd16 = and($Rs16,#255)",
+          PSEUDO, TypeSUBINSN>, Enc_14939491 {
+  let Inst{12-8} = 0b10111;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SA1_zxth // TypeSUBINSN "zxth(Rs16)".
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins GeneralSubRegs:$Rs16),
+          "$Rd16 = zxth($Rs16)",
+          PSEUDO, TypeSUBINSN>, Enc_14939491 {
+  let Inst{12-8} = 0b10110;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_A";
+  let AsmVariantName = "NonParsable";
+}
+def SL1_loadri_io // TypeSUBINSN memw load, base+immediate (WordAccess).
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii),
+          "$Rd16 = memw($Rs16+#$Ii)",
+          PSEUDO, TypeSUBINSN>, Enc_13606251 {
+  let Inst{12-12} = 0b0;
+  let mayLoad = 1;
+  let addrMode = BaseImmOffset;
+  let accessSize = WordAccess;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_L1";
+  let AsmVariantName = "NonParsable";
+}
+def SL1_loadrub_io // TypeSUBINSN memub load, base+immediate (ByteAccess).
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii),
+          "$Rd16 = memub($Rs16+#$Ii)",
+          PSEUDO, TypeSUBINSN>, Enc_15606259 {
+  let Inst{12-12} = 0b1;
+  let mayLoad = 1;
+  let addrMode = BaseImmOffset;
+  let accessSize = ByteAccess;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_L1";
+  let AsmVariantName = "NonParsable";
+}
+def SL2_deallocframe // TypeSUBINSN deallocframe; Uses R30, Defs R30/R29/R31.
+  : HInst<(outs),
+          (ins),
+          "deallocframe",
+          PSEUDO, TypeSUBINSN>, Enc_0 {
+  let Inst{12-0} = 0b1111100000000;
+  let Uses = [R30];
+  let Defs = [R30, R29, R31];
+  let mayLoad = 1;
+  let accessSize = DoubleWordAccess;
+  let DecoderNamespace = "SUBINSN_L2";
+  let AsmVariantName = "NonParsable";
+}
+def SL2_jumpr31 // TypeSUBINSN "jumpr r31"; flagged isReturn, Uses R31, Defs PC.
+  : HInst<(outs),
+          (ins),
+          "jumpr r31",
+          PSEUDO, TypeSUBINSN>, Enc_0 {
+  let Inst{12-0} = 0b1111111000000;
+  let Uses = [R31];
+  let Defs = [PC];
+  let isTerminator = 1;
+  let isIndirectBranch = 1;
+  let isReturn = 1;
+  let cofMax1 = 1;
+  let DecoderNamespace = "SUBINSN_L2";
+  let AsmVariantName = "NonParsable";
+}
+def SL2_jumpr31_f // TypeSUBINSN "if (!p0) jumpr r31"; isTaken is bound to Inst{4}.
+  : HInst<(outs),
+          (ins),
+          "if (!p0) jumpr r31",
+          PSEUDO, TypeSUBINSN>, Enc_0 {
+  let Inst{12-0} = 0b1111111000101;
+  let isTaken = Inst{4};
+  let Uses = [P0, R31];
+  let Defs = [PC];
+  let isPredicated = 1;
+  let isPredicatedFalse = 1;
+  let isTerminator = 1;
+  let isIndirectBranch = 1;
+  let isReturn = 1;
+  let cofMax1 = 1;
+  let DecoderNamespace = "SUBINSN_L2";
+  let AsmVariantName = "NonParsable";
+}
+def SL2_jumpr31_fnew // TypeSUBINSN "if (!p0.new) jumpr:nt r31"; isTaken is bound to Inst{4}.
+  : HInst<(outs),
+          (ins),
+          "if (!p0.new) jumpr:nt r31",
+          PSEUDO, TypeSUBINSN>, Enc_0 {
+  let Inst{12-0} = 0b1111111000111;
+  let isTaken = Inst{4};
+  let Uses = [P0, R31];
+  let Defs = [PC];
+  let isPredicated = 1;
+  let isPredicatedFalse = 1;
+  let isPredicatedNew = 1;
+  let isTerminator = 1;
+  let isIndirectBranch = 1;
+  let isReturn = 1;
+  let cofMax1 = 1;
+  let DecoderNamespace = "SUBINSN_L2";
+  let AsmVariantName = "NonParsable";
+}
+def SL2_jumpr31_t // TypeSUBINSN "if (p0) jumpr r31"; isTaken is bound to Inst{4}.
+  : HInst<(outs),
+          (ins),
+          "if (p0) jumpr r31",
+          PSEUDO, TypeSUBINSN>, Enc_0 {
+  let Inst{12-0} = 0b1111111000100;
+  let isTaken = Inst{4};
+  let Uses = [P0, R31];
+  let Defs = [PC];
+  let isPredicated = 1;
+  let isTerminator = 1;
+  let isIndirectBranch = 1;
+  let isReturn = 1;
+  let cofMax1 = 1;
+  let DecoderNamespace = "SUBINSN_L2";
+  let AsmVariantName = "NonParsable";
+}
+def SL2_jumpr31_tnew // TypeSUBINSN "if (p0.new) jumpr:nt r31"; isTaken is bound to Inst{4}.
+  : HInst<(outs),
+          (ins),
+          "if (p0.new) jumpr:nt r31",
+          PSEUDO, TypeSUBINSN>, Enc_0 {
+  let Inst{12-0} = 0b1111111000110;
+  let isTaken = Inst{4};
+  let Uses = [P0, R31];
+  let Defs = [PC];
+  let isPredicated = 1;
+  let isPredicatedNew = 1;
+  let isTerminator = 1;
+  let isIndirectBranch = 1;
+  let isReturn = 1;
+  let cofMax1 = 1;
+  let DecoderNamespace = "SUBINSN_L2";
+  let AsmVariantName = "NonParsable";
+}
+def SL2_loadrb_io // TypeSUBINSN memb load, base+immediate (ByteAccess).
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins GeneralSubRegs:$Rs16, u3_0Imm:$Ii),
+          "$Rd16 = memb($Rs16+#$Ii)",
+          PSEUDO, TypeSUBINSN>, Enc_3135259 {
+  let Inst{12-11} = 0b10;
+  let mayLoad = 1;
+  let addrMode = BaseImmOffset;
+  let accessSize = ByteAccess;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_L2";
+  let AsmVariantName = "NonParsable";
+}
+def SL2_loadrd_sp // TypeSUBINSN memd load from r29+immediate; Uses lists R29.
+  : HInst<(outs GeneralDoubleLow8Regs:$Rdd8),
+          (ins u5_3Imm:$Ii),
+          "$Rdd8 = memd(r29+#$Ii)",
+          PSEUDO, TypeSUBINSN>, Enc_16479122 {
+  let Inst{12-8} = 0b11110;
+  let Uses = [R29];
+  let mayLoad = 1;
+  let addrMode = BaseImmOffset;
+  let accessSize = DoubleWordAccess;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_L2";
+  let AsmVariantName = "NonParsable";
+}
+def SL2_loadrh_io // TypeSUBINSN memh load, base+immediate (HalfWordAccess).
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins GeneralSubRegs:$Rs16, u3_1Imm:$Ii),
+          "$Rd16 = memh($Rs16+#$Ii)",
+          PSEUDO, TypeSUBINSN>, Enc_4135257 {
+  let Inst{12-11} = 0b00;
+  let mayLoad = 1;
+  let addrMode = BaseImmOffset;
+  let accessSize = HalfWordAccess;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_L2";
+  let AsmVariantName = "NonParsable";
+}
+def SL2_loadri_sp // TypeSUBINSN memw load from r29+immediate; Uses lists R29.
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins u5_2Imm:$Ii),
+          "$Rd16 = memw(r29+#$Ii)",
+          PSEUDO, TypeSUBINSN>, Enc_64199 {
+  let Inst{12-9} = 0b1110;
+  let Uses = [R29];
+  let mayLoad = 1;
+  let addrMode = BaseImmOffset;
+  let accessSize = WordAccess;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_L2";
+  let AsmVariantName = "NonParsable";
+}
+def SL2_loadruh_io // TypeSUBINSN memuh load, base+immediate (HalfWordAccess).
+  : HInst<(outs GeneralSubRegs:$Rd16),
+          (ins GeneralSubRegs:$Rs16, u3_1Imm:$Ii),
+          "$Rd16 = memuh($Rs16+#$Ii)",
+          PSEUDO, TypeSUBINSN>, Enc_4135257 {
+  let Inst{12-11} = 0b01;
+  let mayLoad = 1;
+  let addrMode = BaseImmOffset;
+  let accessSize = HalfWordAccess;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "SUBINSN_L2";
+  let AsmVariantName = "NonParsable";
+}
+def SL2_return // TypeSUBINSN dealloc_return; Uses R30, Defs PC/R30/R29/R31.
+  : HInst<(outs),
+          (ins),
+          "dealloc_return",
+          PSEUDO, TypeSUBINSN>, Enc_0 {
+  let Inst{12-0} = 0b1111101000000;
+  let Uses = [R30];
+  let Defs = [PC, R30, R29, R31];
+  let isTerminator = 1;
+  let isIndirectBranch = 1;
+  let isReturn = 1;
+  let cofMax1 = 1;
+  let mayLoad = 1;
+  let accessSize = DoubleWordAccess;
+  let DecoderNamespace = "SUBINSN_L2";
+  let AsmVariantName = "NonParsable";
+}
+def SL2_return_f // TypeSUBINSN "if (!p0) dealloc_return"; isTaken is bound to Inst{4}.
+  : HInst<(outs),
+          (ins),
+          "if (!p0) dealloc_return",
+          PSEUDO, TypeSUBINSN>, Enc_0 {
+  let Inst{12-0} = 0b1111101000101;
+  let isTaken = Inst{4};
+  let Uses = [P0, R30];
+  let Defs = [PC, R30, R29, R31];
+  let isPredicated = 1;
+  let isPredicatedFalse = 1;
+  let isTerminator = 1;
+  let isIndirectBranch = 1;
+  let isReturn = 1;
+  let cofMax1 = 1;
+  let mayLoad = 1;
+  let accessSize = DoubleWordAccess;
+  let DecoderNamespace = "SUBINSN_L2";
+  let AsmVariantName = "NonParsable";
+}
+def SL2_return_fnew // TypeSUBINSN "if (!p0.new) dealloc_return:nt"; isTaken is bound to Inst{4}.
+  : HInst<(outs),
+          (ins),
+          "if (!p0.new) dealloc_return:nt",
+          PSEUDO, TypeSUBINSN>, Enc_0 {
+  let Inst{12-0} = 0b1111101000111;
+  let isTaken = Inst{4};
+  let Uses = [P0, R30];
+  let Defs = [PC, R30, R29, R31];
+  let isPredicated = 1;
+  let isPredicatedFalse = 1;
+  let isPredicatedNew = 1;
+  let isTerminator = 1;
+  let isIndirectBranch = 1;
+  let isReturn = 1;
+  let cofMax1 = 1;
+  let mayLoad = 1;
+  let accessSize = DoubleWordAccess;
+  let DecoderNamespace = "SUBINSN_L2";
+  let AsmVariantName = "NonParsable";
+}
+def SL2_return_t // TypeSUBINSN "if (p0) dealloc_return"; isTaken is bound to Inst{4}.
+  : HInst<(outs),
+          (ins),
+          "if (p0) dealloc_return",
+          PSEUDO, TypeSUBINSN>, Enc_0 {
+  let Inst{12-0} = 0b1111101000100;
+  let isTaken = Inst{4};
+  let Uses = [P0, R30];
+  let Defs = [PC, R30, R29, R31];
+  let isPredicated = 1;
+  let isTerminator = 1;
+  let isIndirectBranch = 1;
+  let isReturn = 1;
+  let cofMax1 = 1;
+  let mayLoad = 1;
+  let accessSize = DoubleWordAccess;
+  let DecoderNamespace = "SUBINSN_L2";
+  let AsmVariantName = "NonParsable";
+}
+def SL2_return_tnew // TypeSUBINSN "if (p0.new) dealloc_return:nt"; isTaken is bound to Inst{4}.
+  : HInst<(outs),
+          (ins),
+          "if (p0.new) dealloc_return:nt",
+          PSEUDO, TypeSUBINSN>, Enc_0 {
+  let Inst{12-0} = 0b1111101000110;
+  let isTaken = Inst{4};
+  let Uses = [P0, R30];
+  let Defs = [PC, R30, R29, R31];
+  let isPredicated = 1;
+  let isPredicatedNew = 1;
+  let isTerminator = 1;
+  let isIndirectBranch = 1;
+  let isReturn = 1;
+  let cofMax1 = 1;
+  let mayLoad = 1;
+  let accessSize = DoubleWordAccess;
+  let DecoderNamespace = "SUBINSN_L2";
+  let AsmVariantName = "NonParsable";
+}
+def SS1_storeb_io // TypeSUBINSN memb store of Rt16, base+immediate (ByteAccess).
+  : HInst<(outs),
+          (ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii, GeneralSubRegs:$Rt16),
+          "memb($Rs16+#$Ii) = $Rt16",
+          PSEUDO, TypeSUBINSN>, Enc_13204995 {
+  let Inst{12-12} = 0b1;
+  let mayStore = 1;
+  let addrMode = BaseImmOffset;
+  let accessSize = ByteAccess;
+  let DecoderNamespace = "SUBINSN_S1";
+  let AsmVariantName = "NonParsable";
+}
+def SS1_storew_io // TypeSUBINSN memw store of Rt16, base+immediate (WordAccess).
+  : HInst<(outs),
+          (ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii, GeneralSubRegs:$Rt16),
+          "memw($Rs16+#$Ii) = $Rt16",
+          PSEUDO, TypeSUBINSN>, Enc_11205051 {
+  let Inst{12-12} = 0b0;
+  let mayStore = 1;
+  let addrMode = BaseImmOffset;
+  let accessSize = WordAccess;
+  let DecoderNamespace = "SUBINSN_S1";
+  let AsmVariantName = "NonParsable";
+}
+def SS2_allocframe // TypeSUBINSN allocframe(#Ii); Uses R30/R29/R31, Defs R30/R29.
+  : HInst<(outs),
+          (ins u5_3Imm:$Ii),
+          "allocframe(#$Ii)",
+          PSEUDO, TypeSUBINSN>, Enc_7884306 {
+  let Inst{12-9} = 0b1110;
+  let Inst{3-0} = 0b0000;
+  let Uses = [R30, R29, R31];
+  let Defs = [R30, R29];
+  let mayStore = 1;
+  let addrMode = BaseImmOffset;
+  let accessSize = DoubleWordAccess;
+  let DecoderNamespace = "SUBINSN_S2";
+  let AsmVariantName = "NonParsable";
+}
+def SS2_storebi0 // TypeSUBINSN memb store of the literal #0, base+immediate.
+  : HInst<(outs),
+          (ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii),
+          "memb($Rs16+#$Ii) = #0",
+          PSEUDO, TypeSUBINSN>, Enc_13536408 {
+  let Inst{12-8} = 0b10010;
+  let mayStore = 1;
+  let addrMode = BaseImmOffset;
+  let accessSize = ByteAccess;
+  let DecoderNamespace = "SUBINSN_S2";
+  let AsmVariantName = "NonParsable";
+}
+def SS2_storebi1 // TypeSUBINSN memb store of the literal #1, base+immediate.
+  : HInst<(outs),
+          (ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii),
+          "memb($Rs16+#$Ii) = #1",
+          PSEUDO, TypeSUBINSN>, Enc_13536408 {
+  let Inst{12-8} = 0b10011;
+  let mayStore = 1;
+  let addrMode = BaseImmOffset;
+  let accessSize = ByteAccess;
+  let DecoderNamespace = "SUBINSN_S2";
+  let AsmVariantName = "NonParsable";
+}
+def SS2_stored_sp // TypeSUBINSN memd store to r29+immediate (s6_3Imm); Uses lists R29.
+  : HInst<(outs),
+          (ins s6_3Imm:$Ii, GeneralDoubleLow8Regs:$Rtt8),
+          "memd(r29+#$Ii) = $Rtt8",
+          PSEUDO, TypeSUBINSN>, Enc_9165078 {
+  let Inst{12-9} = 0b0101;
+  let Uses = [R29];
+  let mayStore = 1;
+  let addrMode = BaseImmOffset;
+  let accessSize = DoubleWordAccess;
+  let DecoderNamespace = "SUBINSN_S2";
+  let AsmVariantName = "NonParsable";
+}
+def SS2_storeh_io // TypeSUBINSN memh store of Rt16, base+immediate (HalfWordAccess).
+  : HInst<(outs),
+          (ins GeneralSubRegs:$Rs16, u3_1Imm:$Ii, GeneralSubRegs:$Rt16),
+          "memh($Rs16+#$Ii) = $Rt16",
+          PSEUDO, TypeSUBINSN>, Enc_1734121 {
+  let Inst{12-11} = 0b00;
+  let mayStore = 1;
+  let addrMode = BaseImmOffset;
+  let accessSize = HalfWordAccess;
+  let DecoderNamespace = "SUBINSN_S2";
+  let AsmVariantName = "NonParsable";
+}
+def SS2_storew_sp // TypeSUBINSN memw store to r29+immediate; Uses lists R29.
+  : HInst<(outs),
+          (ins u5_2Imm:$Ii, GeneralSubRegs:$Rt16),
+          "memw(r29+#$Ii) = $Rt16",
+          PSEUDO, TypeSUBINSN>, Enc_6690615 {
+  let Inst{12-9} = 0b0100;
+  let Uses = [R29];
+  let mayStore = 1;
+  let addrMode = BaseImmOffset;
+  let accessSize = WordAccess;
+  let DecoderNamespace = "SUBINSN_S2";
+  let AsmVariantName = "NonParsable";
+}
+def SS2_storewi0 // TypeSUBINSN memw store of the literal #0, base+immediate.
+  : HInst<(outs),
+          (ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii),
+          "memw($Rs16+#$Ii) = #0",
+          PSEUDO, TypeSUBINSN>, Enc_15536400 {
+  let Inst{12-8} = 0b10000;
+  let mayStore = 1;
+  let addrMode = BaseImmOffset;
+  let accessSize = WordAccess;
+  let DecoderNamespace = "SUBINSN_S2";
+  let AsmVariantName = "NonParsable";
+}
+def SS2_storewi1 // TypeSUBINSN memw store of the literal #1, base+immediate.
+  : HInst<(outs),
+          (ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii),
+          "memw($Rs16+#$Ii) = #1",
+          PSEUDO, TypeSUBINSN>, Enc_15536400 {
+  let Inst{12-8} = 0b10001;
+  let mayStore = 1;
+  let addrMode = BaseImmOffset;
+  let accessSize = WordAccess;
+  let DecoderNamespace = "SUBINSN_S2";
+  let AsmVariantName = "NonParsable";
+}
+def V6_MAP_equb // TypeMAPPING pseudo (isCodeGenOnly): vcmp.eq spelled with .ub operands.
+  : HInst<(outs VecPredRegs:$Qd4),
+          (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+          "$Qd4 = vcmp.eq($Vu32.ub,$Vv32.ub)",
+          PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// V6_MAP_equb with the 128B register classes; isCodeGenOnly is assigned once.
+def V6_MAP_equb_128B
+  : HInst<(outs VecPredRegs128B:$Qd4),
+          (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+          "$Qd4 = vcmp.eq($Vu32.ub,$Vv32.ub)",
+          PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equb_and // TypeMAPPING pseudo: "&=" vcmp.eq on .ub operands; Qx4 tied to Qx4in.
+  : HInst<(outs VecPredRegs:$Qx4),
+          (ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+          "$Qx4 &= vcmp.eq($Vu32.ub,$Vv32.ub)",
+          PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// V6_MAP_equb_and with the 128B register classes; isCodeGenOnly is assigned once.
+def V6_MAP_equb_and_128B
+  : HInst<(outs VecPredRegs128B:$Qx4),
+          (ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+          "$Qx4 &= vcmp.eq($Vu32.ub,$Vv32.ub)",
+          PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equb_ior // TypeMAPPING pseudo: "|=" vcmp.eq on .ub operands; flagged isAccumulator.
+  : HInst<(outs VecPredRegs:$Qx4),
+          (ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+          "$Qx4 |= vcmp.eq($Vu32.ub,$Vv32.ub)",
+          PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// V6_MAP_equb_ior with the 128B register classes; isCodeGenOnly is assigned once.
+def V6_MAP_equb_ior_128B
+  : HInst<(outs VecPredRegs128B:$Qx4),
+          (ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+          "$Qx4 |= vcmp.eq($Vu32.ub,$Vv32.ub)",
+          PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equb_xor // TypeMAPPING pseudo: "^=" vcmp.eq on .ub operands; Qx4 tied to Qx4in.
+  : HInst<(outs VecPredRegs:$Qx4),
+          (ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+          "$Qx4 ^= vcmp.eq($Vu32.ub,$Vv32.ub)",
+          PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// V6_MAP_equb_xor with the 128B register classes; isCodeGenOnly is assigned once.
+def V6_MAP_equb_xor_128B
+  : HInst<(outs VecPredRegs128B:$Qx4),
+          (ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+          "$Qx4 ^= vcmp.eq($Vu32.ub,$Vv32.ub)",
+          PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equh // TypeMAPPING pseudo (isCodeGenOnly): vcmp.eq spelled with .uh operands.
+  : HInst<(outs VecPredRegs:$Qd4),
+          (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+          "$Qd4 = vcmp.eq($Vu32.uh,$Vv32.uh)",
+          PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// V6_MAP_equh with the 128B register classes; isCodeGenOnly is assigned once.
+def V6_MAP_equh_128B
+  : HInst<(outs VecPredRegs128B:$Qd4),
+          (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+          "$Qd4 = vcmp.eq($Vu32.uh,$Vv32.uh)",
+          PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equh_and // TypeMAPPING pseudo: "&=" vcmp.eq on .uh operands; Qx4 tied to Qx4in.
+  : HInst<(outs VecPredRegs:$Qx4),
+          (ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+          "$Qx4 &= vcmp.eq($Vu32.uh,$Vv32.uh)",
+          PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// V6_MAP_equh_and with the 128B register classes; isCodeGenOnly is assigned once.
+def V6_MAP_equh_and_128B
+  : HInst<(outs VecPredRegs128B:$Qx4),
+          (ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+          "$Qx4 &= vcmp.eq($Vu32.uh,$Vv32.uh)",
+          PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equh_ior // TypeMAPPING pseudo: "|=" vcmp.eq on .uh operands; flagged isAccumulator.
+  : HInst<(outs VecPredRegs:$Qx4),
+          (ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+          "$Qx4 |= vcmp.eq($Vu32.uh,$Vv32.uh)",
+          PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// V6_MAP_equh_ior with the 128B register classes; isCodeGenOnly is assigned once.
+def V6_MAP_equh_ior_128B
+  : HInst<(outs VecPredRegs128B:$Qx4),
+          (ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+          "$Qx4 |= vcmp.eq($Vu32.uh,$Vv32.uh)",
+          PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equh_xor // TypeMAPPING pseudo: "^=" vcmp.eq on .uh operands; Qx4 tied to Qx4in.
+  : HInst<(outs VecPredRegs:$Qx4),
+          (ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+          "$Qx4 ^= vcmp.eq($Vu32.uh,$Vv32.uh)",
+          PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// V6_MAP_equh_xor with the 128B register classes; isCodeGenOnly is assigned once.
+def V6_MAP_equh_xor_128B
+  : HInst<(outs VecPredRegs128B:$Qx4),
+          (ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+          "$Qx4 ^= vcmp.eq($Vu32.uh,$Vv32.uh)",
+          PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equw // TypeMAPPING pseudo (isCodeGenOnly): vcmp.eq spelled with .uw operands.
+  : HInst<(outs VecPredRegs:$Qd4),
+          (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+          "$Qd4 = vcmp.eq($Vu32.uw,$Vv32.uw)",
+          PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_MAP_equw_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_MAP_equw_and : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equw_and_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equw_ior : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equw_ior_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equw_xor : HInst<
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_MAP_equw_xor_128B : HInst<
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.uw,$Vv32.uw)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Qx4 = $Qx4in";
+}
+def V6_extractw : HInst<
+(outs IntRegs:$Rd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rs32),
+"$Rd32 = vextract($Vu32,$Rs32)",
+LD_tc_ld_SLOT0, TypeLD>, Enc_16601956, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10010010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isSolo = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_extractw_128B : HInst<
+(outs IntRegs:$Rd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rs32),
+"$Rd32 = vextract($Vu32,$Rs32)",
+LD_tc_ld_SLOT0, TypeLD>, Enc_16601956, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10010010000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isSolo = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_extractw_alt : HInst<
+(outs IntRegs:$Rd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rs32),
+"$Rd32.w = vextract($Vu32,$Rs32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_extractw_alt_128B : HInst<
+(outs IntRegs:$Rd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rs32),
+"$Rd32.w = vextract($Vu32,$Rs32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_hi : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecDblRegs:$Vss32),
+"$Vd32 = hi($Vss32)",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_hi_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecDblRegs128B:$Vss32),
+"$Vd32 = hi($Vss32)",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_ld0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vmem($Rt32)",
+PSEUDO, TypeCVI_VM_LD>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ld0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vmem($Rt32)",
+PSEUDO, TypeCVI_VM_LD>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_ldnt0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vmem($Rt32):nt",
+PSEUDO, TypeCVI_VM_LD>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldnt0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vmem($Rt32):nt",
+PSEUDO, TypeCVI_VM_LD>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_ldu0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vmemu($Rt32)",
+PSEUDO, TypeCVI_VM_LD>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_ldu0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vmemu($Rt32)",
+PSEUDO, TypeCVI_VM_LD>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_lo : HInst<
+(outs VectorRegs:$Vd32),
+(ins VecDblRegs:$Vss32),
+"$Vd32 = lo($Vss32)",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_lo_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecDblRegs128B:$Vss32),
+"$Vd32 = lo($Vss32)",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_lvsplatb : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32.b = vsplat($Rt32)",
+CVI_VX, TypeCVI_VX>, Enc_9768377, Requires<[HasV62T,UseHVX]> {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b00011001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_lvsplatb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32.b = vsplat($Rt32)",
+CVI_VX, TypeCVI_VX>, Enc_9768377, Requires<[HasV62T,UseHVX]> {
+let Inst{13-5} = 0b000000010;
+let Inst{31-21} = 0b00011001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_lvsplath : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32.h = vsplat($Rt32)",
+CVI_VX, TypeCVI_VX>, Enc_9768377, Requires<[HasV62T,UseHVX]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b00011001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_lvsplath_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32.h = vsplat($Rt32)",
+CVI_VX, TypeCVI_VX>, Enc_9768377, Requires<[HasV62T,UseHVX]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b00011001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_lvsplatw : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vsplat($Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_9768377, Requires<[HasV60T,UseHVX]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_lvsplatw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32),
+"$Vd32 = vsplat($Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_9768377, Requires<[HasV60T,UseHVX]> {
+let Inst{13-5} = 0b000000001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_and : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4 = and($Qs4,$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_and_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4 = and($Qs4,$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000000;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_and_n : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4 = and($Qs4,!$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000101;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_and_n_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4 = and($Qs4,!$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000101;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_not : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4),
+"$Qd4 = not($Qs4)",
+CVI_VA, TypeCVI_VA>, Enc_4897205, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000010;
+let Inst{13-10} = 0b0000;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_not_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4),
+"$Qd4 = not($Qs4)",
+CVI_VA, TypeCVI_VA>, Enc_4897205, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000010;
+let Inst{13-10} = 0b0000;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_or : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4 = or($Qs4,$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000001;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_or_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4 = or($Qs4,$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000001;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_or_n : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4 = or($Qs4,!$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000100;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_or_n_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4 = or($Qs4,!$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000100;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_scalar2 : HInst<
+(outs VecPredRegs:$Qd4),
+(ins IntRegs:$Rt32),
+"$Qd4 = vsetq($Rt32)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_12781442, Requires<[HasV60T,UseHVX]> {
+let Inst{13-2} = 0b000000010001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_scalar2_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins IntRegs:$Rt32),
+"$Qd4 = vsetq($Rt32)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_12781442, Requires<[HasV60T,UseHVX]> {
+let Inst{13-2} = 0b000000010001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_scalar2v2 : HInst<
+(outs VecPredRegs:$Qd4),
+(ins IntRegs:$Rt32),
+"$Qd4 = vsetq2($Rt32)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_12781442, Requires<[HasV62T,UseHVX]> {
+let Inst{13-2} = 0b000000010011;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_scalar2v2_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins IntRegs:$Rt32),
+"$Qd4 = vsetq2($Rt32)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_12781442, Requires<[HasV62T,UseHVX]> {
+let Inst{13-2} = 0b000000010011;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_pred_xor : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4 = xor($Qs4,$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000011;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_pred_xor_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4 = xor($Qs4,$Qt4)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000011;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_shuffeqh : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4.b = vshuffe($Qs4.h,$Qt4.h)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV62T,UseHVX]> {
+let Inst{7-2} = 0b000110;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_shuffeqh_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4.b = vshuffe($Qs4.h,$Qt4.h)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV62T,UseHVX]> {
+let Inst{7-2} = 0b000110;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_shuffeqw : HInst<
+(outs VecPredRegs:$Qd4),
+(ins VecPredRegs:$Qs4, VecPredRegs:$Qt4),
+"$Qd4.h = vshuffe($Qs4.w,$Qt4.w)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV62T,UseHVX]> {
+let Inst{7-2} = 0b000111;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_shuffeqw_128B : HInst<
+(outs VecPredRegs128B:$Qd4),
+(ins VecPredRegs128B:$Qs4, VecPredRegs128B:$Qt4),
+"$Qd4.h = vshuffe($Qs4.w,$Qt4.w)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_6091631, Requires<[HasV62T,UseHVX]> {
+let Inst{7-2} = 0b000111;
+let Inst{13-10} = 0b0000;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_st0 : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs:$Vs32),
+"vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_st0_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_stn0 : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs:$Os8),
+"vmem($Rt32) = $Os8.new",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 1;
+}
+def V6_stn0_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs128B:$Os8),
+"vmem($Rt32) = $Os8.new",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 1;
+}
+def V6_stnnt0 : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs:$Os8),
+"vmem($Rt32):nt = $Os8.new",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 1;
+}
+def V6_stnnt0_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs128B:$Os8),
+"vmem($Rt32):nt = $Os8.new",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let opNewValue = 1;
+}
+def V6_stnp0 : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnp0_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_stnpnt0 : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnpnt0_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_stnq0 : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnq0_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_stnqnt0 : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnqnt0_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_stnt0 : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs:$Vs32),
+"vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stnt0_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_stp0 : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if ($Pv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stp0_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if ($Pv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_stpnt0 : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if ($Pv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stpnt0_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if ($Pv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_stq0 : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if ($Qv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stq0_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if ($Qv4) vmem($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_stqnt0 : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if ($Qv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stqnt0_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if ($Qv4) vmem($Rt32):nt = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_stu0 : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs:$Vs32),
+"vmemu($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stu0_128B : HInst<
+(outs),
+(ins IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"vmemu($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_stunp0 : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if (!$Pv4) vmemu($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stunp0_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmemu($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_stup0 : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs:$Vs32),
+"if ($Pv4) vmemu($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_stup0_128B : HInst<
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, VectorRegs128B:$Vs32),
+"if ($Pv4) vmemu($Rt32) = $Vs32",
+PSEUDO, TypeCVI_VM_ST>, Requires<[HasV60T,UseHVX]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32Ub_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32 = vmemu($Rt32+#$Ii)",
+CVI_VM_VP_LDU, TypeCVI_VM_VP_LDU>, Enc_1244745, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32Ub_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32 = vmemu($Rt32+#$Ii)",
+CVI_VM_VP_LDU, TypeCVI_VM_VP_LDU>, Enc_8437395, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32Ub_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32 = vmemu($Rx32++#$Ii)",
+CVI_VM_VP_LDU, TypeCVI_VM_VP_LDU>, Enc_10039393, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32Ub_pi_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32 = vmemu($Rx32++#$Ii)",
+CVI_VM_VP_LDU, TypeCVI_VM_VP_LDU>, Enc_11039423, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32Ub_ppu : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32 = vmemu($Rx32++$Mu2)",
+CVI_VM_VP_LDU, TypeCVI_VM_VP_LDU>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000111;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32Ub_ppu_128B : HInst<
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32 = vmemu($Rx32++$Mu2)",
+CVI_VM_VP_LDU, TypeCVI_VM_VP_LDU>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000111;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32 = vmem($Rt32+#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_1244745, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32 = vmem($Rt32+#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_8437395, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_cur_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.cur = vmem($Rt32+#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_1244745, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_cur_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.cur = vmem($Rt32+#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_8437395, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_cur_npred_ai : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rt32+#$Ii)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_cur_npred_ai_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rt32+#$Ii)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vL32b_cur_npred_pi : HInst<
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++#$Ii)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in";
+}
+def V6_vL32b_cur_npred_pi_128B : HInst< // 128B variant: negated-predicate ".cur" HVX vmem load, post-increment by immediate, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++#$Ii)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101; // fixed encoding bits
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by #$Ii
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string and fixed Inst bits as V6_vL32b_cur_npred_pi
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_cur_npred_ppu : HInst< // Negated-predicate ".cur" HVX vmem load, post-increment by modifier register, Vector64Access.
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++$Mu2)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000101; // fixed encoding bits
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_cur_npred_ppu_128B : HInst< // 128B variant: negated-predicate ".cur" HVX vmem load, post-increment by modifier register, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++$Mu2)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000101; // fixed encoding bits
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string, Enc class and fixed Inst bits as V6_vL32b_cur_npred_ppu
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_cur_pi : HInst< // Unconditional ".cur" HVX vmem load, post-increment by immediate, Vector64Access.
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.cur = vmem($Rx32++#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_10039393, Requires<[HasV60T,UseHVX]> { // NOTE(review): V6_vL32b_cur_ppu uses CVI_VM_CUR_LD/TypeCVI_VM_CUR_LD — confirm CVI_VM_LD is intended for this .cur form
+let Inst{7-5} = 0b001; // fixed encoding bits
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by #$Ii
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_cur_pi_128B : HInst< // 128B variant: unconditional ".cur" HVX vmem load, post-increment by immediate, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.cur = vmem($Rx32++#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_11039423, Requires<[HasV60T,UseHVX]> { // NOTE(review): V6_vL32b_cur_ppu_128B uses CVI_VM_CUR_LD/TypeCVI_VM_CUR_LD — confirm CVI_VM_LD is intended for this .cur form
+let Inst{7-5} = 0b001; // fixed encoding bits
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by #$Ii
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string and fixed Inst bits as V6_vL32b_cur_pi
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_cur_ppu : HInst< // Unconditional ".cur" HVX vmem load, post-increment by modifier register, Vector64Access.
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.cur = vmem($Rx32++$Mu2)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000001; // fixed encoding bits
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_cur_ppu_128B : HInst< // 128B variant: unconditional ".cur" HVX vmem load, post-increment by modifier register, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.cur = vmem($Rx32++$Mu2)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000001; // fixed encoding bits
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string, Enc class and fixed Inst bits as V6_vL32b_cur_ppu
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_cur_pred_ai : HInst< // Predicated ".cur" HVX vmem load, base+immediate form, Vector64Access.
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rt32+#$Ii)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100; // fixed encoding bits
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1; // asm string is guarded by "if ($Pv4)"
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset; // asm string addresses $Rt32+#$Ii
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_cur_pred_ai_128B : HInst< // 128B variant: predicated ".cur" HVX vmem load, base+immediate form, Vector128Access.
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rt32+#$Ii)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100; // fixed encoding bits
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1; // asm string is guarded by "if ($Pv4)"
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset; // asm string addresses $Rt32+#$Ii
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string and fixed Inst bits as V6_vL32b_cur_pred_ai
+}
+def V6_vL32b_cur_pred_pi : HInst< // Predicated ".cur" HVX vmem load, post-increment by immediate, Vector64Access.
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++#$Ii)",
+CVI_VM_CUR_LD, TypeCOPROC_VMEM>, Enc_14560494, Requires<[HasV62T,UseHVX]> { // NOTE(review): V6_vL32b_cur_npred_pi uses TypeCVI_VM_CUR_LD — confirm TypeCOPROC_VMEM is intended
+let Inst{7-5} = 0b100; // fixed encoding bits
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1; // asm string is guarded by "if ($Pv4)"
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by #$Ii
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_cur_pred_pi_128B : HInst< // 128B variant: predicated ".cur" HVX vmem load, post-increment by immediate, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++#$Ii)",
+CVI_VM_CUR_LD, TypeCOPROC_VMEM>, Enc_15560488, Requires<[HasV62T,UseHVX]> { // NOTE(review): V6_vL32b_cur_npred_pi_128B uses TypeCVI_VM_CUR_LD — confirm TypeCOPROC_VMEM is intended
+let Inst{7-5} = 0b100; // fixed encoding bits
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1; // asm string is guarded by "if ($Pv4)"
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by #$Ii
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string and fixed Inst bits as V6_vL32b_cur_pred_pi
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_cur_pred_ppu : HInst< // Predicated ".cur" HVX vmem load, post-increment by modifier register, Vector64Access.
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++$Mu2)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000100; // fixed encoding bits
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1; // asm string is guarded by "if ($Pv4)"
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_cur_pred_ppu_128B : HInst< // 128B variant: predicated ".cur" HVX vmem load, post-increment by modifier register, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++$Mu2)",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000100; // fixed encoding bits
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1; // asm string is guarded by "if ($Pv4)"
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string, Enc class and fixed Inst bits as V6_vL32b_cur_pred_ppu
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_npred_ai : HInst< // Negated-predicate HVX vmem load, base+immediate form, Vector64Access.
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rt32+#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011; // fixed encoding bits
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset; // asm string addresses $Rt32+#$Ii
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_npred_ai_128B : HInst< // 128B variant: negated-predicate HVX vmem load, base+immediate form, Vector128Access.
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rt32+#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011; // fixed encoding bits
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset; // asm string addresses $Rt32+#$Ii
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string and fixed Inst bits as V6_vL32b_npred_ai
+}
+def V6_vL32b_npred_pi : HInst< // Negated-predicate HVX vmem load, post-increment by immediate, Vector64Access.
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rx32++#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011; // fixed encoding bits
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by #$Ii
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_npred_pi_128B : HInst< // 128B variant: negated-predicate HVX vmem load, post-increment by immediate, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rx32++#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011; // fixed encoding bits
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by #$Ii
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string and fixed Inst bits as V6_vL32b_npred_pi
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_npred_ppu : HInst< // Negated-predicate HVX vmem load, post-increment by modifier register, Vector64Access.
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32 = vmem($Rx32++$Mu2)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000011; // fixed encoding bits
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_npred_ppu_128B : HInst< // 128B variant: negated-predicate HVX vmem load, post-increment by modifier register, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32 = vmem($Rx32++$Mu2)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000011; // fixed encoding bits
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string, Enc class and fixed Inst bits as V6_vL32b_npred_ppu
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_ai : HInst< // Unconditional ":nt" HVX vmem load, base+immediate form, Vector64Access.
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32 = vmem($Rt32+#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_1244745, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000; // fixed encoding bits
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset; // asm string addresses $Rt32+#$Ii
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_ai_128B : HInst< // 128B variant: unconditional ":nt" HVX vmem load, base+immediate form, Vector128Access.
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32 = vmem($Rt32+#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_8437395, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000; // fixed encoding bits
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset; // asm string addresses $Rt32+#$Ii
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string and fixed Inst bits as V6_vL32b_nt_ai
+}
+def V6_vL32b_nt_cur_ai : HInst< // Unconditional ".cur" ":nt" HVX vmem load, base+immediate form, Vector64Access.
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.cur = vmem($Rt32+#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_1244745, Requires<[HasV60T,UseHVX]> { // NOTE(review): V6_vL32b_nt_cur_ppu uses CVI_VM_CUR_LD/TypeCVI_VM_CUR_LD — confirm CVI_VM_LD is intended for this .cur form
+let Inst{7-5} = 0b001; // fixed encoding bits
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset; // asm string addresses $Rt32+#$Ii
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_cur_ai_128B : HInst< // 128B variant: unconditional ".cur" ":nt" HVX vmem load, base+immediate form, Vector128Access.
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.cur = vmem($Rt32+#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_8437395, Requires<[HasV60T,UseHVX]> { // NOTE(review): V6_vL32b_nt_cur_ppu_128B uses CVI_VM_CUR_LD/TypeCVI_VM_CUR_LD — confirm CVI_VM_LD is intended for this .cur form
+let Inst{7-5} = 0b001; // fixed encoding bits
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset; // asm string addresses $Rt32+#$Ii
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string and fixed Inst bits as V6_vL32b_nt_cur_ai
+}
+def V6_vL32b_nt_cur_npred_ai : HInst< // Negated-predicate ".cur" ":nt" HVX vmem load, base+immediate form, Vector64Access.
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rt32+#$Ii):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101; // fixed encoding bits
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset; // asm string addresses $Rt32+#$Ii
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_cur_npred_ai_128B : HInst< // 128B variant: negated-predicate ".cur" ":nt" HVX vmem load, base+immediate form, Vector128Access.
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rt32+#$Ii):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101; // fixed encoding bits
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset; // asm string addresses $Rt32+#$Ii
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string and fixed Inst bits as V6_vL32b_nt_cur_npred_ai
+}
+def V6_vL32b_nt_cur_npred_pi : HInst< // Negated-predicate ".cur" ":nt" HVX vmem load, post-increment by immediate, Vector64Access.
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++#$Ii):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101; // fixed encoding bits
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by #$Ii
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_cur_npred_pi_128B : HInst< // 128B variant: negated-predicate ".cur" ":nt" HVX vmem load, post-increment by immediate, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++#$Ii):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101; // fixed encoding bits
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by #$Ii
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string and fixed Inst bits as V6_vL32b_nt_cur_npred_pi
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_cur_npred_ppu : HInst< // Negated-predicate ".cur" ":nt" HVX vmem load, post-increment by modifier register, Vector64Access.
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++$Mu2):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000101; // fixed encoding bits
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_cur_npred_ppu_128B : HInst< // 128B variant: negated-predicate ".cur" ":nt" HVX vmem load, post-increment by modifier register, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.cur = vmem($Rx32++$Mu2):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000101; // fixed encoding bits
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string, Enc class and fixed Inst bits as V6_vL32b_nt_cur_npred_ppu
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_cur_pi : HInst< // Unconditional ".cur" ":nt" HVX vmem load, post-increment by immediate, Vector64Access.
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.cur = vmem($Rx32++#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_10039393, Requires<[HasV60T,UseHVX]> { // NOTE(review): V6_vL32b_nt_cur_ppu uses CVI_VM_CUR_LD/TypeCVI_VM_CUR_LD — confirm CVI_VM_LD is intended for this .cur form
+let Inst{7-5} = 0b001; // fixed encoding bits
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by #$Ii
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_cur_pi_128B : HInst< // 128B variant: unconditional ".cur" ":nt" HVX vmem load, post-increment by immediate, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.cur = vmem($Rx32++#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_11039423, Requires<[HasV60T,UseHVX]> { // NOTE(review): V6_vL32b_nt_cur_ppu_128B uses CVI_VM_CUR_LD/TypeCVI_VM_CUR_LD — confirm CVI_VM_LD is intended for this .cur form
+let Inst{7-5} = 0b001; // fixed encoding bits
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by #$Ii
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string and fixed Inst bits as V6_vL32b_nt_cur_pi
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_cur_ppu : HInst< // Unconditional ".cur" ":nt" HVX vmem load, post-increment by modifier register, Vector64Access.
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.cur = vmem($Rx32++$Mu2):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000001; // fixed encoding bits
+let Inst{31-21} = 0b00101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_cur_ppu_128B : HInst< // 128B variant: unconditional ".cur" ":nt" HVX vmem load, post-increment by modifier register, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.cur = vmem($Rx32++$Mu2):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000001; // fixed encoding bits
+let Inst{31-21} = 0b00101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string, Enc class and fixed Inst bits as V6_vL32b_nt_cur_ppu
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_cur_pred_ai : HInst< // Predicated ".cur" ":nt" HVX vmem load, base+immediate form, Vector64Access.
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rt32+#$Ii):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100; // fixed encoding bits
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1; // asm string is guarded by "if ($Pv4)"
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset; // asm string addresses $Rt32+#$Ii
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_cur_pred_ai_128B : HInst< // 128B variant: predicated ".cur" ":nt" HVX vmem load, base+immediate form, Vector128Access.
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rt32+#$Ii):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100; // fixed encoding bits
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1; // asm string is guarded by "if ($Pv4)"
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset; // asm string addresses $Rt32+#$Ii
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string and fixed Inst bits as V6_vL32b_nt_cur_pred_ai
+}
+def V6_vL32b_nt_cur_pred_pi : HInst< // Predicated ".cur" ":nt" HVX vmem load, post-increment by immediate, Vector64Access.
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++#$Ii):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100; // fixed encoding bits
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1; // asm string is guarded by "if ($Pv4)"
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by #$Ii
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_cur_pred_pi_128B : HInst< // 128B variant: predicated ".cur" ":nt" HVX vmem load, post-increment by immediate, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++#$Ii):nt",
+CVI_VM_CUR_LD, TypeCVI_VM_CUR_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100; // fixed encoding bits
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1; // asm string is guarded by "if ($Pv4)"
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by #$Ii
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string and fixed Inst bits as V6_vL32b_nt_cur_pred_pi
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_cur_pred_ppu : HInst< // Predicated ".cur" ":nt" HVX vmem load, post-increment by modifier register, Vector64Access.
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++$Mu2):nt",
+CVI_VM_CUR_LD, TypeCOPROC_VMEM>, Enc_3158657, Requires<[HasV62T,UseHVX]> { // NOTE(review): V6_vL32b_nt_cur_npred_ppu uses TypeCVI_VM_CUR_LD — confirm TypeCOPROC_VMEM is intended
+let Inst{10-5} = 0b000100; // fixed encoding bits
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1; // asm string is guarded by "if ($Pv4)"
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_cur_pred_ppu_128B : HInst< // 128B variant: predicated ".cur" ":nt" HVX vmem load, post-increment by modifier register, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.cur = vmem($Rx32++$Mu2):nt",
+CVI_VM_CUR_LD, TypeCOPROC_VMEM>, Enc_3158657, Requires<[HasV62T,UseHVX]> { // NOTE(review): V6_vL32b_nt_cur_npred_ppu_128B uses TypeCVI_VM_CUR_LD — confirm TypeCOPROC_VMEM is intended
+let Inst{10-5} = 0b000100; // fixed encoding bits
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1; // asm string is guarded by "if ($Pv4)"
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string, Enc class and fixed Inst bits as V6_vL32b_nt_cur_pred_ppu
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_npred_ai : HInst< // Negated-predicate ":nt" HVX vmem load, base+immediate form, Vector64Access.
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rt32+#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011; // fixed encoding bits
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset; // asm string addresses $Rt32+#$Ii
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_npred_ai_128B : HInst< // 128B variant: negated-predicate ":nt" HVX vmem load, base+immediate form, Vector128Access.
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rt32+#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011; // fixed encoding bits
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset; // asm string addresses $Rt32+#$Ii
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string and fixed Inst bits as V6_vL32b_nt_npred_ai
+}
+def V6_vL32b_nt_npred_pi : HInst< // Negated-predicate ":nt" HVX vmem load, post-increment by immediate, Vector64Access.
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rx32++#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011; // fixed encoding bits
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by #$Ii
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_npred_pi_128B : HInst< // 128B variant: negated-predicate ":nt" HVX vmem load, post-increment by immediate, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32 = vmem($Rx32++#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011; // fixed encoding bits
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by #$Ii
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string and fixed Inst bits as V6_vL32b_nt_npred_pi
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_npred_ppu : HInst< // Negated-predicate ":nt" HVX vmem load, post-increment by modifier register, Vector64Access.
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32 = vmem($Rx32++$Mu2):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000011; // fixed encoding bits
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_npred_ppu_128B : HInst< // 128B variant: negated-predicate ":nt" HVX vmem load, post-increment by modifier register, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32 = vmem($Rx32++$Mu2):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000011; // fixed encoding bits
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated guard "if (!$Pv4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string, Enc class and fixed Inst bits as V6_vL32b_nt_npred_ppu
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_pi : HInst< // Unconditional ":nt" HVX vmem load, post-increment by immediate, Vector64Access.
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32 = vmem($Rx32++#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_10039393, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000; // fixed encoding bits
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by #$Ii
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_pi_128B : HInst< // 128B variant: unconditional ":nt" HVX vmem load, post-increment by immediate, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32 = vmem($Rx32++#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_11039423, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000; // fixed encoding bits
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by #$Ii
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string and fixed Inst bits as V6_vL32b_nt_pi
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_ppu : HInst< // Unconditional ":nt" HVX vmem load, post-increment by modifier register, Vector64Access.
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32 = vmem($Rx32++$Mu2):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000000; // fixed encoding bits
+let Inst{31-21} = 0b00101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_ppu_128B : HInst< // 128B variant: unconditional ":nt" HVX vmem load, post-increment by modifier register, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32 = vmem($Rx32++$Mu2):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000000; // fixed encoding bits
+let Inst{31-21} = 0b00101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string, Enc class and fixed Inst bits as V6_vL32b_nt_ppu
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_pred_ai : HInst< // Predicated ":nt" HVX vmem load, base+immediate form, Vector64Access.
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rt32+#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010; // fixed encoding bits
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1; // asm string is guarded by "if ($Pv4)"
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset; // asm string addresses $Rt32+#$Ii
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_pred_ai_128B : HInst< // 128B variant: predicated ":nt" HVX vmem load, base+immediate form, Vector128Access.
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rt32+#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010; // fixed encoding bits
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1; // asm string is guarded by "if ($Pv4)"
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset; // asm string addresses $Rt32+#$Ii
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string and fixed Inst bits as V6_vL32b_nt_pred_ai
+}
+def V6_vL32b_nt_pred_pi : HInst< // Predicated ":nt" HVX vmem load, post-increment by immediate, Vector64Access.
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rx32++#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010; // fixed encoding bits
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1; // asm string is guarded by "if ($Pv4)"
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by #$Ii
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_pred_pi_128B : HInst< // 128B variant: predicated ":nt" HVX vmem load, post-increment by immediate, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rx32++#$Ii):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010; // fixed encoding bits
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1; // asm string is guarded by "if ($Pv4)"
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by #$Ii
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string and fixed Inst bits as V6_vL32b_nt_pred_pi
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_pred_ppu : HInst< // Predicated ":nt" HVX vmem load, post-increment by modifier register, Vector64Access.
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32 = vmem($Rx32++$Mu2):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000010; // fixed encoding bits
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1; // asm string is guarded by "if ($Pv4)"
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_pred_ppu_128B : HInst< // 128B variant: predicated ":nt" HVX vmem load, post-increment by modifier register, Vector128Access.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32 = vmem($Rx32++$Mu2):nt",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000010; // fixed encoding bits
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1; // asm string is guarded by "if ($Pv4)"
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc; // asm string post-increments $Rx32 by $Mu2
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string, Enc class and fixed Inst bits as V6_vL32b_nt_pred_ppu
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 output to the $Rx32in input
+}
+def V6_vL32b_nt_tmp_ai : HInst< // Unconditional ".tmp" ":nt" HVX vmem load, base+immediate form, Vector64Access.
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rt32+#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_1244745, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010; // fixed encoding bits
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset; // asm string addresses $Rt32+#$Ii
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_tmp_ai_128B : HInst< // 128B variant: unconditional ".tmp" ":nt" HVX vmem load, base+immediate form, Vector128Access.
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rt32+#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_8437395, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010; // fixed encoding bits
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset; // asm string addresses $Rt32+#$Ii
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1; // corresponds to the ":nt" suffix in the asm string
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // same asm string and fixed Inst bits as V6_vL32b_nt_tmp_ai
+}
+def V6_vL32b_nt_tmp_npred_ai : HInst< // Vector load into $Vd32.tmp, :nt form, performed if !$Pv4; address $Rt32 + #$Ii (Vector64Access).
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rt32+#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // negated-predicate form, matching the "if (!$Pv4)" asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_tmp_npred_ai_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_nt_tmp_npred_ai.
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rt32+#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // negated-predicate form, matching the "if (!$Pv4)" asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_nt_tmp_npred_ai -- confirm
+}
+def V6_vL32b_nt_tmp_npred_pi : HInst< // Vector load into $Vd32.tmp, :nt form, performed if !$Pv4; $Rx32 is post-incremented by #$Ii (Vector64Access).
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // negated-predicate form, matching the "if (!$Pv4)" asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_nt_tmp_npred_pi_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_nt_tmp_npred_pi.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // negated-predicate form, matching the "if (!$Pv4)" asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_nt_tmp_npred_pi -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_nt_tmp_npred_ppu : HInst< // Vector load into $Vd32.tmp, :nt form, performed if !$Pv4; $Rx32 is post-incremented by modifier register $Mu2 (Vector64Access).
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++$Mu2):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000111;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // negated-predicate form, matching the "if (!$Pv4)" asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_nt_tmp_npred_ppu_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_nt_tmp_npred_ppu.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++$Mu2):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000111;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // negated-predicate form, matching the "if (!$Pv4)" asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_nt_tmp_npred_ppu -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_nt_tmp_pi : HInst< // Unpredicated vector load into $Vd32.tmp, :nt form; $Rx32 is post-incremented by #$Ii (Vector64Access).
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rx32++#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_10039393, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_nt_tmp_pi_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_nt_tmp_pi.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rx32++#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_11039423, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_nt_tmp_pi -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_nt_tmp_ppu : HInst< // Unpredicated vector load into $Vd32.tmp, :nt form; $Rx32 is post-incremented by modifier register $Mu2 (Vector64Access).
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.tmp = vmem($Rx32++$Mu2):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000010;
+let Inst{31-21} = 0b00101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_nt_tmp_ppu_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_nt_tmp_ppu.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.tmp = vmem($Rx32++$Mu2):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000010;
+let Inst{31-21} = 0b00101011010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_nt_tmp_ppu -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_nt_tmp_pred_ai : HInst< // Vector load into $Vd32.tmp, :nt form, performed if $Pv4; address $Rt32 + #$Ii (Vector64Access).
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rt32+#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_nt_tmp_pred_ai_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_nt_tmp_pred_ai.
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rt32+#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b00101000110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_nt_tmp_pred_ai -- confirm
+}
+def V6_vL32b_nt_tmp_pred_pi : HInst< // Vector load into $Vd32.tmp, :nt form, performed if $Pv4; $Rx32 is post-incremented by #$Ii (Vector64Access).
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_nt_tmp_pred_pi_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_nt_tmp_pred_pi.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++#$Ii):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_nt_tmp_pred_pi -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_nt_tmp_pred_ppu : HInst< // Vector load into $Vd32.tmp, :nt form, performed if $Pv4; $Rx32 is post-incremented by modifier register $Mu2 (Vector64Access).
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++$Mu2):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000110;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_nt_tmp_pred_ppu_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_nt_tmp_pred_ppu.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++$Mu2):nt",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000110;
+let Inst{31-21} = 0b00101011110;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let isNonTemporal = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_nt_tmp_pred_ppu -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_pi : HInst< // Unpredicated vector load into $Vd32; $Rx32 is post-incremented by #$Ii (Vector64Access).
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32 = vmem($Rx32++#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_10039393, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_pi_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_pi.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32 = vmem($Rx32++#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_11039423, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_pi -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_ppu : HInst< // Unpredicated vector load into $Vd32; $Rx32 is post-incremented by modifier register $Mu2 (Vector64Access).
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32 = vmem($Rx32++$Mu2)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_ppu_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_ppu.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32 = vmem($Rx32++$Mu2)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000000;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let isCVLoadable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_ppu -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_pred_ai : HInst< // Vector load into $Vd32, performed if $Pv4; address $Rt32 + #$Ii (Vector64Access).
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rt32+#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_pred_ai_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_pred_ai.
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rt32+#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_pred_ai -- confirm
+}
+def V6_vL32b_pred_pi : HInst< // Vector load into $Vd32, performed if $Pv4; $Rx32 is post-incremented by #$Ii (Vector64Access).
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rx32++#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_pred_pi_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_pred_pi.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32 = vmem($Rx32++#$Ii)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_pred_pi -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_pred_ppu : HInst< // Vector load into $Vd32, performed if $Pv4; $Rx32 is post-incremented by modifier register $Mu2 (Vector64Access).
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32 = vmem($Rx32++$Mu2)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000010;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_pred_ppu_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_pred_ppu.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32 = vmem($Rx32++$Mu2)",
+CVI_VM_LD, TypeCVI_VM_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000010;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_pred_ppu -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_tmp_ai : HInst< // Unpredicated vector load into $Vd32.tmp from address $Rt32 + #$Ii (Vector64Access).
+(outs VectorRegs:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rt32+#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_1244745, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_tmp_ai_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_tmp_ai.
+(outs VectorRegs128B:$Vd32),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rt32+#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_8437395, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_tmp_ai -- confirm
+}
+def V6_vL32b_tmp_npred_ai : HInst< // Vector load into $Vd32.tmp, performed if !$Pv4; address $Rt32 + #$Ii (Vector64Access).
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rt32+#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // negated-predicate form, matching the "if (!$Pv4)" asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_tmp_npred_ai_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_tmp_npred_ai.
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rt32+#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // negated-predicate form, matching the "if (!$Pv4)" asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_tmp_npred_ai -- confirm
+}
+def V6_vL32b_tmp_npred_pi : HInst< // Vector load into $Vd32.tmp, performed if !$Pv4; $Rx32 is post-incremented by #$Ii (Vector64Access).
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // negated-predicate form, matching the "if (!$Pv4)" asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_tmp_npred_pi_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_tmp_npred_pi.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // negated-predicate form, matching the "if (!$Pv4)" asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_tmp_npred_pi -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_tmp_npred_ppu : HInst< // Vector load into $Vd32.tmp, performed if !$Pv4; $Rx32 is post-incremented by modifier register $Mu2 (Vector64Access).
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++$Mu2)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000111;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // negated-predicate form, matching the "if (!$Pv4)" asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_tmp_npred_ppu_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_tmp_npred_ppu.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if (!$Pv4) $Vd32.tmp = vmem($Rx32++$Mu2)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000111;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // negated-predicate form, matching the "if (!$Pv4)" asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_tmp_npred_ppu -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_tmp_pi : HInst< // Unpredicated vector load into $Vd32.tmp; $Rx32 is post-incremented by #$Ii (Vector64Access).
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rx32++#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_10039393, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_tmp_pi_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_tmp_pi.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii),
+"$Vd32.tmp = vmem($Rx32++#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_11039423, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_tmp_pi -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_tmp_ppu : HInst< // Unpredicated vector load into $Vd32.tmp; $Rx32 is post-incremented by modifier register $Mu2 (Vector64Access).
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.tmp = vmem($Rx32++$Mu2)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000010;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_tmp_ppu_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_tmp_ppu.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2),
+"$Vd32.tmp = vmem($Rx32++$Mu2)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_15949334, Requires<[HasV60T,UseHVX]> {
+let Inst{12-5} = 0b00000010;
+let Inst{31-21} = 0b00101011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_tmp_ppu -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_tmp_pred_ai : HInst< // Vector load into $Vd32.tmp, performed if $Pv4; address $Rt32 + #$Ii (Vector64Access).
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rt32+#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_13338314, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vL32b_tmp_pred_ai_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_tmp_pred_ai.
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rt32+#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_738356, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b00101000100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_tmp_pred_ai -- confirm
+}
+def V6_vL32b_tmp_pred_pi : HInst< // Vector load into $Vd32.tmp, performed if $Pv4; $Rx32 is post-incremented by #$Ii (Vector64Access).
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_14560494, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_tmp_pred_pi_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_tmp_pred_pi.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++#$Ii)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_15560488, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_tmp_pred_pi -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_tmp_pred_ppu : HInst< // Vector load into $Vd32.tmp, performed if $Pv4; $Rx32 is post-incremented by modifier register $Mu2 (Vector64Access).
+(outs VectorRegs:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++$Mu2)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000110;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vL32b_tmp_pred_ppu_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vL32b_tmp_pred_ppu.
+(outs VectorRegs128B:$Vd32, IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2),
+"if ($Pv4) $Vd32.tmp = vmem($Rx32++$Mu2)",
+CVI_VM_TMP_LD, TypeCVI_VM_TMP_LD>, Enc_3158657, Requires<[HasV62T,UseHVX]> {
+let Inst{10-5} = 0b000110;
+let Inst{31-21} = 0b00101011100;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isCVLoad = 1;
+let mayLoad = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vL32b_tmp_pred_ppu -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vS32Ub_ai : HInst< // Unpredicated vmemu store of vector $Vs32 to address $Rt32 + #$Ii (Vector64Access).
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"vmemu($Rt32+#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_6923828, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000001;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ai"; // same BaseOpcode string is used by V6_vS32Ub_pred_ai and V6_vS32Ub_npred_ai
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32Ub_ai_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vS32Ub_ai.
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"vmemu($Rt32+#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_5757366, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000001;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ai_128B"; // same BaseOpcode string is used by V6_vS32Ub_pred_ai_128B and V6_vS32Ub_npred_ai_128B
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vS32Ub_ai -- confirm
+}
+def V6_vS32Ub_npred_ai : HInst< // vmemu store of vector $Vs32 to address $Rt32 + #$Ii, performed if !$Pv4 (Vector64Access).
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Pv4) vmemu($Rt32+#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_10075393, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // negated-predicate form, matching the "if (!$Pv4)" asm string
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ai"; // names the unpredicated def V6_vS32Ub_ai
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32Ub_npred_ai_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vS32Ub_npred_ai.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmemu($Rt32+#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_9470751, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // negated-predicate form, matching the "if (!$Pv4)" asm string
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ai_128B"; // names the unpredicated def V6_vS32Ub_ai_128B
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vS32Ub_npred_ai -- confirm
+}
+def V6_vS32Ub_npred_pi : HInst< // vmemu store of vector $Vs32, performed if !$Pv4; $Rx32 is post-incremented by #$Ii (Vector64Access).
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Pv4) vmemu($Rx32++#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_15459921, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // negated-predicate form, matching the "if (!$Pv4)" asm string
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_pi"; // names the unpredicated def V6_vS32Ub_pi
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vS32Ub_npred_pi_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vS32Ub_npred_pi.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmemu($Rx32++#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_14459927, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // negated-predicate form, matching the "if (!$Pv4)" asm string
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_pi_128B"; // names the unpredicated def V6_vS32Ub_pi_128B
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vS32Ub_npred_pi -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vS32Ub_npred_ppu : HInst< // vmemu store of vector $Vs32, performed if !$Pv4; $Rx32 is post-incremented by modifier register $Mu2 (Vector64Access).
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if (!$Pv4) vmemu($Rx32++$Mu2) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000111;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // negated-predicate form, matching the "if (!$Pv4)" asm string
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ppu"; // names the unpredicated def V6_vS32Ub_ppu
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vS32Ub_npred_ppu_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vS32Ub_npred_ppu.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmemu($Rx32++$Mu2) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000111;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // negated-predicate form, matching the "if (!$Pv4)" asm string
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ppu_128B"; // names the unpredicated def V6_vS32Ub_ppu_128B
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vS32Ub_npred_ppu -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vS32Ub_pi : HInst< // Unpredicated vmemu store of vector $Vs32; $Rx32 is post-incremented by #$Ii (Vector64Access).
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"vmemu($Rx32++#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_3296020, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001001;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_pi"; // same BaseOpcode string is used by V6_vS32Ub_pred_pi and V6_vS32Ub_npred_pi
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vS32Ub_pi_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vS32Ub_pi.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"vmemu($Rx32++#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_2296022, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b111;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001001;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_pi_128B"; // same BaseOpcode string is used by V6_vS32Ub_pred_pi_128B and V6_vS32Ub_npred_pi_128B
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vS32Ub_pi -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vS32Ub_ppu : HInst< // Unpredicated vmemu store of vector $Vs32; $Rx32 is post-incremented by modifier register $Mu2 (Vector64Access).
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"vmemu($Rx32++$Mu2) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_11281763, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-5} = 0b00000111;
+let Inst{31-21} = 0b00101011001;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ppu"; // same BaseOpcode string is used by V6_vS32Ub_npred_ppu
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vS32Ub_ppu_128B : HInst< // VectorRegs128B / Vector128Access counterpart of V6_vS32Ub_ppu.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"vmemu($Rx32++$Mu2) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_11281763, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-5} = 0b00000111;
+let Inst{31-21} = 0b00101011001;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ppu_128B"; // same BaseOpcode string is used by V6_vS32Ub_npred_ppu_128B
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // presumably codegen-only because asm string and fixed Inst bits repeat those of V6_vS32Ub_ppu -- confirm
+let Constraints = "$Rx32 = $Rx32in"; // ties the $Rx32 result to the $Rx32in input (the post-incremented base register)
+}
+def V6_vS32Ub_pred_ai : HInst< // vmemu store of vector $Vs32 to address $Rt32 + #$Ii, performed if $Pv4 (Vector64Access).
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Pv4) vmemu($Rt32+#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_10075393, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ai"; // names the unpredicated def V6_vS32Ub_ai
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32Ub_pred_ai_128B : HInst< // HVX vmemu store predicated on $Pv4, base register + immediate offset; VectorRegs128B variant.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Pv4) vmemu($Rt32+#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_9470751, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b110;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ai_128B"; // BaseOpcode string omits the "pred" part of this def's name
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32Ub_pred_ai
+}
+def V6_vS32Ub_pred_pi : HInst< // HVX vmemu store predicated on $Pv4, post-increment by immediate.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if ($Pv4) vmemu($Rx32++#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_15459921, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_pi"; // BaseOpcode string omits the "pred" part of this def's name
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32Ub_pred_pi_128B : HInst< // HVX vmemu store predicated on $Pv4, post-increment by immediate; VectorRegs128B variant.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if ($Pv4) vmemu($Rx32++#$Ii) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_14459927, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_pi_128B"; // BaseOpcode string omits the "pred" part of this def's name
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32Ub_pred_pi
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32Ub_pred_ppu : HInst< // HVX vmemu store predicated on $Pv4, post-increment by modifier register $Mu2.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if ($Pv4) vmemu($Rx32++$Mu2) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000110;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ppu"; // same BaseOpcode string as the unpredicated V6_vS32Ub_ppu
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32Ub_pred_ppu_128B : HInst< // HVX vmemu store predicated on $Pv4, post-increment by modifier register $Mu2; VectorRegs128B variant.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if ($Pv4) vmemu($Rx32++$Mu2) = $Vs32",
+CVI_VM_STU, TypeCVI_VM_STU>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000110;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32Ub_ppu_128B"; // same BaseOpcode string as the unpredicated V6_vS32Ub_ppu_128B
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32Ub_pred_ppu
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_ai : HInst< // HVX vmem store, base register + immediate offset.
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"vmem($Rt32+#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_6923828, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000001;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai"; // the pred/npred/new/nt "_ai" defs in this family reuse this string
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_ai_128B : HInst< // HVX vmem store, base register + immediate offset; VectorRegs128B variant.
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"vmem($Rt32+#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_5757366, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000001;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_ai
+}
+def V6_vS32b_new_ai : HInst< // HVX vmem store of new-value operand $Os8.new, base register + immediate offset.
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Os8),
+"vmem($Rt32+#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_6608821, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000001;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai"; // same BaseOpcode string as the non-.new V6_vS32b_ai
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 2; // $Os8 is operand #2, counting outs before ins
+}
+def V6_vS32b_new_ai_128B : HInst< // HVX vmem store of new-value operand $Os8.new, base register + immediate offset; VectorRegs128B variant.
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Os8),
+"vmem($Rt32+#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_2152247, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000001;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai_128B"; // same BaseOpcode string as the non-.new V6_vS32b_ai_128B
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_new_ai
+let opNewValue = 2; // $Os8 is operand #2, counting outs before ins
+}
+def V6_vS32b_new_npred_ai : HInst< // HVX vmem store of $Os8.new predicated on !$Pv4, base register + immediate offset.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Os8),
+"if (!$Pv4) vmem($Rt32+#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_9372046, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01101;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // asm string tests !$Pv4
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3; // $Os8 is operand #3, counting outs before ins
+}
+def V6_vS32b_new_npred_ai_128B : HInst< // HVX vmem store of $Os8.new predicated on !$Pv4, base register + immediate offset; VectorRegs128B variant.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Os8),
+"if (!$Pv4) vmem($Rt32+#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_13937564, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01101;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // asm string tests !$Pv4
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_new_npred_ai
+let opNewValue = 3; // $Os8 is operand #3, counting outs before ins
+}
+def V6_vS32b_new_npred_pi : HInst< // HVX vmem store of $Os8.new predicated on !$Pv4, post-increment by immediate.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Os8),
+"if (!$Pv4) vmem($Rx32++#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_3735566, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // asm string tests !$Pv4
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4; // $Os8 is operand #4, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_new_npred_pi_128B : HInst< // HVX vmem store of $Os8.new predicated on !$Pv4, post-increment by immediate; VectorRegs128B variant.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Os8),
+"if (!$Pv4) vmem($Rx32++#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_2735552, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // asm string tests !$Pv4
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_new_npred_pi
+let opNewValue = 4; // $Os8 is operand #4, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_new_npred_ppu : HInst< // HVX vmem store of $Os8.new predicated on !$Pv4, post-increment by modifier register $Mu2.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Os8),
+"if (!$Pv4) vmem($Rx32++$Mu2) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_8498433, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001101;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // asm string tests !$Pv4
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4; // $Os8 is operand #4, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_new_npred_ppu_128B : HInst< // HVX vmem store of $Os8.new predicated on !$Pv4, post-increment by modifier register $Mu2; VectorRegs128B variant.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Os8),
+"if (!$Pv4) vmem($Rx32++$Mu2) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_8498433, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001101;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // asm string tests !$Pv4
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_new_npred_ppu
+let opNewValue = 4; // $Os8 is operand #4, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_new_pi : HInst< // HVX vmem store of new-value operand $Os8.new, post-increment by immediate.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Os8),
+"vmem($Rx32++#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_12244921, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001001;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3; // $Os8 is operand #3, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_new_pi_128B : HInst< // HVX vmem store of new-value operand $Os8.new, post-increment by immediate; VectorRegs128B variant.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Os8),
+"vmem($Rx32++#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_11244923, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001001;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_new_pi
+let opNewValue = 3; // $Os8 is operand #3, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_new_ppu : HInst< // HVX vmem store of new-value operand $Os8.new, post-increment by modifier register $Mu2.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Os8),
+"vmem($Rx32++$Mu2) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_1589406, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-3} = 0b0000000100;
+let Inst{31-21} = 0b00101011001;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3; // $Os8 is operand #3, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_new_ppu_128B : HInst< // HVX vmem store of new-value operand $Os8.new, post-increment by modifier register $Mu2; VectorRegs128B variant.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Os8),
+"vmem($Rx32++$Mu2) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_1589406, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-3} = 0b0000000100;
+let Inst{31-21} = 0b00101011001;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_new_ppu
+let opNewValue = 3; // $Os8 is operand #3, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_new_pred_ai : HInst< // HVX vmem store of $Os8.new predicated on $Pv4, base register + immediate offset.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Os8),
+"if ($Pv4) vmem($Rt32+#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_9372046, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01000;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3; // $Os8 is operand #3, counting outs before ins
+}
+def V6_vS32b_new_pred_ai_128B : HInst< // HVX vmem store of $Os8.new predicated on $Pv4, base register + immediate offset; VectorRegs128B variant.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Os8),
+"if ($Pv4) vmem($Rt32+#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_13937564, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01000;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_new_pred_ai
+let opNewValue = 3; // $Os8 is operand #3, counting outs before ins
+}
+def V6_vS32b_new_pred_pi : HInst< // HVX vmem store of $Os8.new predicated on $Pv4, post-increment by immediate.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Os8),
+"if ($Pv4) vmem($Rx32++#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_3735566, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4; // $Os8 is operand #4, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_new_pred_pi_128B : HInst< // HVX vmem store of $Os8.new predicated on $Pv4, post-increment by immediate; VectorRegs128B variant.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Os8),
+"if ($Pv4) vmem($Rx32++#$Ii) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_2735552, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_new_pred_pi
+let opNewValue = 4; // $Os8 is operand #4, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_new_pred_ppu : HInst< // HVX vmem store of $Os8.new predicated on $Pv4, post-increment by modifier register $Mu2.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Os8),
+"if ($Pv4) vmem($Rx32++$Mu2) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_8498433, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001000;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4; // $Os8 is operand #4, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_new_pred_ppu_128B : HInst< // HVX vmem store of $Os8.new predicated on $Pv4, post-increment by modifier register $Mu2; VectorRegs128B variant.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Os8),
+"if ($Pv4) vmem($Rx32++$Mu2) = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_8498433, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001000;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_new_pred_ppu
+let opNewValue = 4; // $Os8 is operand #4, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_npred_ai : HInst< // HVX vmem store predicated on !$Pv4, base register + immediate offset.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rt32+#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_10075393, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // asm string tests !$Pv4
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai"; // same BaseOpcode string as the unpredicated V6_vS32b_ai
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_npred_ai_128B : HInst< // HVX vmem store predicated on !$Pv4, base register + immediate offset; VectorRegs128B variant.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rt32+#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_9470751, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // asm string tests !$Pv4
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ai_128B"; // same BaseOpcode string as the unpredicated V6_vS32b_ai_128B
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_npred_ai
+}
+def V6_vS32b_npred_pi : HInst< // HVX vmem store predicated on !$Pv4, post-increment by immediate.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rx32++#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_15459921, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // asm string tests !$Pv4
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_npred_pi_128B : HInst< // HVX vmem store predicated on !$Pv4, post-increment by immediate; VectorRegs128B variant.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rx32++#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_14459927, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // asm string tests !$Pv4
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_pi_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_npred_pi
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_npred_ppu : HInst< // HVX vmem store predicated on !$Pv4, post-increment by modifier register $Mu2.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if (!$Pv4) vmem($Rx32++$Mu2) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // asm string tests !$Pv4
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_npred_ppu_128B : HInst< // HVX vmem store predicated on !$Pv4, post-increment by modifier register $Mu2; VectorRegs128B variant.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if (!$Pv4) vmem($Rx32++$Mu2) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011101;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // asm string tests !$Pv4
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B";
+let isNVStorable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_npred_ppu
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_nqpred_ai : HInst< // HVX vmem store conditional on !$Qv4 (a VecPredRegs operand), base register + immediate offset.
+(outs),
+(ins VecPredRegs:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rt32+#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_16279406, Requires<[HasV60T,UseHVX]> { // no NewValueRel, isPredicated or BaseOpcode here, unlike the $Pv4 forms
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000100;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_nqpred_ai_128B : HInst< // HVX vmem store conditional on !$Qv4 (a VecPredRegs128B operand), base register + immediate offset; VectorRegs128B variant.
+(outs),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rt32+#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_2703240, Requires<[HasV60T,UseHVX]> { // no NewValueRel, isPredicated or BaseOpcode here, unlike the $Pv4 forms
+let Inst{7-5} = 0b001;
+let Inst{31-21} = 0b00101000100;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_nqpred_ai
+}
+def V6_vS32b_nqpred_pi : HInst< // HVX vmem store conditional on !$Qv4 (a VecPredRegs operand), post-increment by immediate.
+(outs IntRegs:$Rx32),
+(ins VecPredRegs:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rx32++#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_12397062, Requires<[HasV60T,UseHVX]> { // no NewValueRel, isPredicated or BaseOpcode here, unlike the $Pv4 forms
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_nqpred_pi_128B : HInst< // HVX vmem store conditional on !$Qv4 (a VecPredRegs128B operand), post-increment by immediate; VectorRegs128B variant.
+(outs IntRegs:$Rx32),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rx32++#$Ii) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_13397056, Requires<[HasV60T,UseHVX]> { // no NewValueRel, isPredicated or BaseOpcode here, unlike the $Pv4 forms
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001100;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_nqpred_pi
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_nqpred_ppu : HInst< // HVX vmem store conditional on !$Qv4 (a VecPredRegs operand), post-increment by modifier register $Mu2.
+(outs IntRegs:$Rx32),
+(ins VecPredRegs:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+"if (!$Qv4) vmem($Rx32++$Mu2) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_13425035, Requires<[HasV60T,UseHVX]> { // no NewValueRel, isPredicated or BaseOpcode here, unlike the $Pv4 forms
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011100;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_nqpred_ppu_128B : HInst< // HVX vmem store conditional on !$Qv4 (a VecPredRegs128B operand), post-increment by modifier register $Mu2; VectorRegs128B variant.
+(outs IntRegs:$Rx32),
+(ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+"if (!$Qv4) vmem($Rx32++$Mu2) = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_13425035, Requires<[HasV60T,UseHVX]> { // no NewValueRel, isPredicated or BaseOpcode here, unlike the $Pv4 forms
+let Inst{10-5} = 0b000001;
+let Inst{31-21} = 0b00101011100;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_nqpred_ppu
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_nt_ai : HInst< // HVX non-temporal (:nt) vmem store, base register + immediate offset.
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+"vmem($Rt32+#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_6923828, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000011;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_ai"; // note: same BaseOpcode string as the non-:nt V6_vS32b_ai (no "nt" in it)
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vS32b_nt_ai_128B : HInst< // HVX non-temporal (:nt) vmem store, base register + immediate offset; VectorRegs128B variant.
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+"vmem($Rt32+#$Ii):nt = $Vs32",
+CVI_VM_ST, TypeCVI_VM_ST>, Enc_5757366, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-5} = 0b000;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000011;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let mayStore = 1;
+let isNonTemporal = 1;
+let BaseOpcode = "V6_vS32b_ai_128B"; // note: same BaseOpcode string as the non-:nt V6_vS32b_ai_128B (no "nt" in it)
+let isNVStorable = 1;
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_nt_ai
+}
+def V6_vS32b_nt_new_ai : HInst< // HVX non-temporal (:nt) vmem store of new-value operand $Os8.new, base register + immediate offset.
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Os8),
+"vmem($Rt32+#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_6608821, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000011;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai"; // note: no "nt" or "new" in the BaseOpcode string
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 2; // $Os8 is operand #2, counting outs before ins
+}
+def V6_vS32b_nt_new_ai_128B : HInst< // HVX non-temporal (:nt) vmem store of new-value operand $Os8.new, base register + immediate offset; VectorRegs128B variant.
+(outs),
+(ins IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Os8),
+"vmem($Rt32+#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_2152247, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{12-11} = 0b00;
+let Inst{31-21} = 0b00101000011;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai_128B"; // note: no "nt" or "new" in the BaseOpcode string
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_nt_new_ai
+let opNewValue = 2; // $Os8 is operand #2, counting outs before ins
+}
+def V6_vS32b_nt_new_npred_ai : HInst< // HVX non-temporal (:nt) vmem store of $Os8.new predicated on !$Pv4, base register + immediate offset.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Os8),
+"if (!$Pv4) vmem($Rt32+#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_9372046, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01111;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // asm string tests !$Pv4
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai"; // note: no "nt" in the BaseOpcode string
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3; // $Os8 is operand #3, counting outs before ins
+}
+def V6_vS32b_nt_new_npred_ai_128B : HInst< // HVX non-temporal (:nt) vmem store of $Os8.new predicated on !$Pv4, base register + immediate offset; VectorRegs128B variant.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Os8),
+"if (!$Pv4) vmem($Rt32+#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_13937564, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01111;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // asm string tests !$Pv4
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai_128B"; // note: no "nt" in the BaseOpcode string
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_nt_new_npred_ai
+let opNewValue = 3; // $Os8 is operand #3, counting outs before ins
+}
+def V6_vS32b_nt_new_npred_pi : HInst< // HVX non-temporal (:nt) vmem store of $Os8.new predicated on !$Pv4, post-increment by immediate.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Os8),
+"if (!$Pv4) vmem($Rx32++#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_3735566, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // asm string tests !$Pv4
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi"; // note: no "nt" in the BaseOpcode string
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4; // $Os8 is operand #4, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_nt_new_npred_pi_128B : HInst< // HVX non-temporal (:nt) vmem store of $Os8.new predicated on !$Pv4, post-increment by immediate; VectorRegs128B variant.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Os8),
+"if (!$Pv4) vmem($Rx32++#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_2735552, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // asm string tests !$Pv4
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi_128B"; // note: no "nt" in the BaseOpcode string
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_nt_new_npred_pi
+let opNewValue = 4; // $Os8 is operand #4, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_nt_new_npred_ppu : HInst< // HVX non-temporal (:nt) vmem store of $Os8.new predicated on !$Pv4, post-increment by modifier register $Mu2.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Os8),
+"if (!$Pv4) vmem($Rx32++$Mu2):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_8498433, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001111;
+let Inst{31-21} = 0b00101011111;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // asm string tests !$Pv4
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu"; // note: no "nt" in the BaseOpcode string
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4; // $Os8 is operand #4, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_nt_new_npred_ppu_128B : HInst< // HVX non-temporal (:nt) vmem store of $Os8.new predicated on !$Pv4, post-increment by modifier register $Mu2; VectorRegs128B variant.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Os8),
+"if (!$Pv4) vmem($Rx32++$Mu2):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_8498433, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{10-3} = 0b00001111;
+let Inst{31-21} = 0b00101011111;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // asm string tests !$Pv4
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B"; // note: no "nt" in the BaseOpcode string
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_nt_new_npred_ppu
+let opNewValue = 4; // $Os8 is operand #4, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_nt_new_pi : HInst< // HVX non-temporal (:nt) vmem store of new-value operand $Os8.new, post-increment by immediate.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Os8),
+"vmem($Rx32++#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_12244921, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001011;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi"; // note: no "nt" in the BaseOpcode string
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3; // $Os8 is operand #3, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_nt_new_pi_128B : HInst< // HVX non-temporal (:nt) vmem store of new-value operand $Os8.new, post-increment by immediate; VectorRegs128B variant.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Os8),
+"vmem($Rx32++#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_11244923, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b00100;
+let Inst{13-11} = 0b000;
+let Inst{31-21} = 0b00101001011;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi_128B"; // note: no "nt" in the BaseOpcode string
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_nt_new_pi
+let opNewValue = 3; // $Os8 is operand #3, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_nt_new_ppu : HInst< // HVX non-temporal (:nt) vmem store of new-value operand $Os8.new, post-increment by modifier register $Mu2.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Os8),
+"vmem($Rx32++$Mu2):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_1589406, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-3} = 0b0000000100;
+let Inst{31-21} = 0b00101011011;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu"; // note: no "nt" in the BaseOpcode string
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3; // $Os8 is operand #3, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_nt_new_ppu_128B : HInst< // HVX non-temporal (:nt) vmem store of new-value operand $Os8.new, post-increment by modifier register $Mu2; VectorRegs128B variant.
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Os8),
+"vmem($Rx32++$Mu2):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_1589406, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{12-3} = 0b0000000100;
+let Inst{31-21} = 0b00101011011;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ppu_128B"; // note: no "nt" in the BaseOpcode string
+let isPredicable = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_nt_new_ppu
+let opNewValue = 3; // $Os8 is operand #3, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_nt_new_pred_ai : HInst< // HVX non-temporal (:nt) vmem store of $Os8.new predicated on $Pv4, base register + immediate offset.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Os8),
+"if ($Pv4) vmem($Rt32+#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_9372046, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01010;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai"; // note: no "nt" in the BaseOpcode string
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 3; // $Os8 is operand #3, counting outs before ins
+}
+def V6_vS32b_nt_new_pred_ai_128B : HInst< // HVX non-temporal (:nt) vmem store of $Os8.new predicated on $Pv4, base register + immediate offset; VectorRegs128B variant.
+(outs),
+(ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Os8),
+"if ($Pv4) vmem($Rt32+#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_13937564, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01010;
+let Inst{31-21} = 0b00101000111;
+let isPredicated = 1;
+let addrMode = BaseImmOffset;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_ai_128B"; // note: no "nt" in the BaseOpcode string
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_nt_new_pred_ai
+let opNewValue = 3; // $Os8 is operand #3, counting outs before ins
+}
+def V6_vS32b_nt_new_pred_pi : HInst< // HVX non-temporal (:nt) vmem store of $Os8.new predicated on $Pv4, post-increment by immediate.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Os8),
+"if ($Pv4) vmem($Rx32++#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_3735566, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector64Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi"; // note: no "nt" in the BaseOpcode string
+let DecoderNamespace = "EXT_mmvec";
+let opNewValue = 4; // $Os8 is operand #4, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+def V6_vS32b_nt_new_pred_pi_128B : HInst< // HVX non-temporal (:nt) vmem store of $Os8.new predicated on $Pv4, post-increment by immediate; VectorRegs128B variant.
+(outs IntRegs:$Rx32),
+(ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Os8),
+"if ($Pv4) vmem($Rx32++#$Ii):nt = $Os8.new",
+CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_2735552, Requires<[HasV60T,UseHVX]>, NewValueRel {
+let Inst{7-3} = 0b01010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00101001111;
+let isPredicated = 1;
+let addrMode = PostInc;
+let accessSize = Vector128Access;
+let isNVStore = 1;
+let mayStore = 1;
+let isNonTemporal = 1;
+let isNewValue = 1;
+let BaseOpcode = "V6_vS32b_pi_128B"; // note: no "nt" in the BaseOpcode string
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // asm string is identical to V6_vS32b_nt_new_pred_pi
+let opNewValue = 4; // $Os8 is operand #4, counting outs before ins
+let Constraints = "$Rx32 = $Rx32in"; // ties result $Rx32 to input $Rx32in (post-incremented base)
+}
+// Predicated (true-sense) non-temporal new-value store, post-incremented by modifier register $Mu2.
+def V6_vS32b_nt_new_pred_ppu : HInst<
+  (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Os8),
+  "if ($Pv4) vmem($Rx32++$Mu2):nt = $Os8.new",
+  CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_8498433, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101011111;
+  let Inst{10-3} = 0b00001010;
+  let BaseOpcode = "V6_vS32b_ppu";
+  let addrMode = PostInc;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let isPredicated = 1;
+  let isNVStore = 1;
+  let isNewValue = 1;
+  let opNewValue = 4;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B / Vector128Access form of the predicated non-temporal new-value $Mu2 post-increment store (isCodeGenOnly).
+def V6_vS32b_nt_new_pred_ppu_128B : HInst<
+  (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Os8),
+  "if ($Pv4) vmem($Rx32++$Mu2):nt = $Os8.new",
+  CVI_VM_NEW_ST, TypeCVI_VM_NEW_ST>, Enc_8498433, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101011111;
+  let Inst{10-3} = 0b00001010;
+  let BaseOpcode = "V6_vS32b_ppu_128B";
+  let addrMode = PostInc;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let isPredicated = 1;
+  let isNVStore = 1;
+  let isNewValue = 1;
+  let opNewValue = 4;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Predicated-false ("if (!$Pv4)") non-temporal vector store at base + immediate offset; flagged isNVStorable.
+def V6_vS32b_nt_npred_ai : HInst<
+  (outs), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+  "if (!$Pv4) vmem($Rt32+#$Ii):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_10075393, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101000111;
+  let Inst{7-5} = 0b001;
+  let BaseOpcode = "V6_vS32b_ai";
+  let addrMode = BaseImmOffset;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let isPredicated = 1;
+  let isPredicatedFalse = 1;
+  let isNVStorable = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B / Vector128Access form of the predicated-false non-temporal base+offset store (isCodeGenOnly).
+def V6_vS32b_nt_npred_ai_128B : HInst<
+  (outs), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+  "if (!$Pv4) vmem($Rt32+#$Ii):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_9470751, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101000111;
+  let Inst{7-5} = 0b001;
+  let BaseOpcode = "V6_vS32b_ai_128B";
+  let addrMode = BaseImmOffset;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let isPredicated = 1;
+  let isPredicatedFalse = 1;
+  let isNVStorable = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Predicated-false non-temporal vector store with immediate post-increment; $Rx32 is tied to $Rx32in.
+def V6_vS32b_nt_npred_pi : HInst<
+  (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+  "if (!$Pv4) vmem($Rx32++#$Ii):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_15459921, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101001111;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b001;
+  let BaseOpcode = "V6_vS32b_pi";
+  let addrMode = PostInc;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let isPredicated = 1;
+  let isPredicatedFalse = 1;
+  let isNVStorable = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B / Vector128Access form of the predicated-false non-temporal immediate post-increment store (isCodeGenOnly).
+def V6_vS32b_nt_npred_pi_128B : HInst<
+  (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+  "if (!$Pv4) vmem($Rx32++#$Ii):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_14459927, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101001111;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b001;
+  let BaseOpcode = "V6_vS32b_pi_128B";
+  let addrMode = PostInc;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let isPredicated = 1;
+  let isPredicatedFalse = 1;
+  let isNVStorable = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Predicated-false non-temporal vector store, post-incremented by modifier register $Mu2.
+def V6_vS32b_nt_npred_ppu : HInst<
+  (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+  "if (!$Pv4) vmem($Rx32++$Mu2):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101011111;
+  let Inst{10-5} = 0b000001;
+  let BaseOpcode = "V6_vS32b_ppu";
+  let addrMode = PostInc;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let isPredicated = 1;
+  let isPredicatedFalse = 1;
+  let isNVStorable = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B / Vector128Access form of the predicated-false non-temporal $Mu2 post-increment store (isCodeGenOnly).
+def V6_vS32b_nt_npred_ppu_128B : HInst<
+  (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+  "if (!$Pv4) vmem($Rx32++$Mu2):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101011111;
+  let Inst{10-5} = 0b000001;
+  let BaseOpcode = "V6_vS32b_ppu_128B";
+  let addrMode = PostInc;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let isPredicated = 1;
+  let isPredicatedFalse = 1;
+  let isNVStorable = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Non-temporal base+offset vector store gated by "if (!$Qv4)" on a VecPredRegs operand; isPredicated is not set.
+def V6_vS32b_nt_nqpred_ai : HInst<
+  (outs), (ins VecPredRegs:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+  "if (!$Qv4) vmem($Rt32+#$Ii):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_16279406, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00101000110;
+  let Inst{7-5} = 0b001;
+  let addrMode = BaseImmOffset;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VecPredRegs128B / VectorRegs128B form of the "if (!$Qv4)" non-temporal base+offset store (isCodeGenOnly).
+def V6_vS32b_nt_nqpred_ai_128B : HInst<
+  (outs), (ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+  "if (!$Qv4) vmem($Rt32+#$Ii):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_2703240, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00101000110;
+  let Inst{7-5} = 0b001;
+  let addrMode = BaseImmOffset;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Non-temporal vector store gated by "if (!$Qv4)", immediate post-increment; $Rx32 is tied to $Rx32in.
+def V6_vS32b_nt_nqpred_pi : HInst<
+  (outs IntRegs:$Rx32), (ins VecPredRegs:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+  "if (!$Qv4) vmem($Rx32++#$Ii):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_12397062, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00101001110;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b001;
+  let addrMode = PostInc;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VecPredRegs128B / VectorRegs128B form of the "if (!$Qv4)" non-temporal immediate post-increment store (isCodeGenOnly).
+def V6_vS32b_nt_nqpred_pi_128B : HInst<
+  (outs IntRegs:$Rx32), (ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+  "if (!$Qv4) vmem($Rx32++#$Ii):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_13397056, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00101001110;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b001;
+  let addrMode = PostInc;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Non-temporal vector store gated by "if (!$Qv4)", post-incremented by modifier register $Mu2.
+def V6_vS32b_nt_nqpred_ppu : HInst<
+  (outs IntRegs:$Rx32), (ins VecPredRegs:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+  "if (!$Qv4) vmem($Rx32++$Mu2):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_13425035, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00101011110;
+  let Inst{10-5} = 0b000001;
+  let addrMode = PostInc;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VecPredRegs128B / VectorRegs128B form of the "if (!$Qv4)" non-temporal $Mu2 post-increment store (isCodeGenOnly).
+def V6_vS32b_nt_nqpred_ppu_128B : HInst<
+  (outs IntRegs:$Rx32), (ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+  "if (!$Qv4) vmem($Rx32++$Mu2):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_13425035, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00101011110;
+  let Inst{10-5} = 0b000001;
+  let addrMode = PostInc;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Unconditional non-temporal vector store with immediate post-increment; flagged isNVStorable and isPredicable.
+def V6_vS32b_nt_pi : HInst<
+  (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+  "vmem($Rx32++#$Ii):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_3296020, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101001011;
+  let Inst{13-11} = 0b000;
+  let Inst{7-5} = 0b000;
+  let BaseOpcode = "V6_vS32b_pi";
+  let addrMode = PostInc;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let isNVStorable = 1;
+  let isPredicable = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B / Vector128Access form of the unconditional non-temporal immediate post-increment store (isCodeGenOnly).
+def V6_vS32b_nt_pi_128B : HInst<
+  (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+  "vmem($Rx32++#$Ii):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_2296022, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101001011;
+  let Inst{13-11} = 0b000;
+  let Inst{7-5} = 0b000;
+  let BaseOpcode = "V6_vS32b_pi_128B";
+  let addrMode = PostInc;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let isNVStorable = 1;
+  let isPredicable = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Unconditional non-temporal vector store, post-incremented by modifier register $Mu2; isNVStorable and isPredicable.
+def V6_vS32b_nt_ppu : HInst<
+  (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+  "vmem($Rx32++$Mu2):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_11281763, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101011011;
+  let Inst{12-5} = 0b00000000;
+  let BaseOpcode = "V6_vS32b_ppu";
+  let addrMode = PostInc;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let isNVStorable = 1;
+  let isPredicable = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B / Vector128Access form of the unconditional non-temporal $Mu2 post-increment store (isCodeGenOnly).
+def V6_vS32b_nt_ppu_128B : HInst<
+  (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+  "vmem($Rx32++$Mu2):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_11281763, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101011011;
+  let Inst{12-5} = 0b00000000;
+  let BaseOpcode = "V6_vS32b_ppu_128B";
+  let addrMode = PostInc;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let isNVStorable = 1;
+  let isPredicable = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Predicated (true-sense) non-temporal vector store at base + immediate offset; flagged isNVStorable.
+def V6_vS32b_nt_pred_ai : HInst<
+  (outs), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+  "if ($Pv4) vmem($Rt32+#$Ii):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_10075393, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101000111;
+  let Inst{7-5} = 0b000;
+  let BaseOpcode = "V6_vS32b_ai";
+  let addrMode = BaseImmOffset;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let isPredicated = 1;
+  let isNVStorable = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B / Vector128Access form of the predicated non-temporal base+offset store (isCodeGenOnly).
+def V6_vS32b_nt_pred_ai_128B : HInst<
+  (outs), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+  "if ($Pv4) vmem($Rt32+#$Ii):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_9470751, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101000111;
+  let Inst{7-5} = 0b000;
+  let BaseOpcode = "V6_vS32b_ai_128B";
+  let addrMode = BaseImmOffset;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let isPredicated = 1;
+  let isNVStorable = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Predicated (true-sense) non-temporal vector store with immediate post-increment; $Rx32 is tied to $Rx32in.
+def V6_vS32b_nt_pred_pi : HInst<
+  (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+  "if ($Pv4) vmem($Rx32++#$Ii):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_15459921, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101001111;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b000;
+  let BaseOpcode = "V6_vS32b_pi";
+  let addrMode = PostInc;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let isPredicated = 1;
+  let isNVStorable = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B / Vector128Access form of the predicated non-temporal immediate post-increment store (isCodeGenOnly).
+def V6_vS32b_nt_pred_pi_128B : HInst<
+  (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+  "if ($Pv4) vmem($Rx32++#$Ii):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_14459927, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101001111;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b000;
+  let BaseOpcode = "V6_vS32b_pi_128B";
+  let addrMode = PostInc;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let isPredicated = 1;
+  let isNVStorable = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Predicated (true-sense) non-temporal vector store, post-incremented by modifier register $Mu2.
+def V6_vS32b_nt_pred_ppu : HInst<
+  (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+  "if ($Pv4) vmem($Rx32++$Mu2):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101011111;
+  let Inst{10-5} = 0b000000;
+  let BaseOpcode = "V6_vS32b_ppu";
+  let addrMode = PostInc;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let isPredicated = 1;
+  let isNVStorable = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B / Vector128Access form of the predicated non-temporal $Mu2 post-increment store (isCodeGenOnly).
+def V6_vS32b_nt_pred_ppu_128B : HInst<
+  (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+  "if ($Pv4) vmem($Rx32++$Mu2):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101011111;
+  let Inst{10-5} = 0b000000;
+  let BaseOpcode = "V6_vS32b_ppu_128B";
+  let addrMode = PostInc;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let isPredicated = 1;
+  let isNVStorable = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Non-temporal base+offset vector store gated by "if ($Qv4)" on a VecPredRegs operand; isPredicated is not set.
+def V6_vS32b_nt_qpred_ai : HInst<
+  (outs), (ins VecPredRegs:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+  "if ($Qv4) vmem($Rt32+#$Ii):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_16279406, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00101000110;
+  let Inst{7-5} = 0b000;
+  let addrMode = BaseImmOffset;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VecPredRegs128B / VectorRegs128B form of the "if ($Qv4)" non-temporal base+offset store (isCodeGenOnly).
+def V6_vS32b_nt_qpred_ai_128B : HInst<
+  (outs), (ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+  "if ($Qv4) vmem($Rt32+#$Ii):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_2703240, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00101000110;
+  let Inst{7-5} = 0b000;
+  let addrMode = BaseImmOffset;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Non-temporal vector store gated by "if ($Qv4)", immediate post-increment; $Rx32 is tied to $Rx32in.
+def V6_vS32b_nt_qpred_pi : HInst<
+  (outs IntRegs:$Rx32), (ins VecPredRegs:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+  "if ($Qv4) vmem($Rx32++#$Ii):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_12397062, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00101001110;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b000;
+  let addrMode = PostInc;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VecPredRegs128B / VectorRegs128B form of the "if ($Qv4)" non-temporal immediate post-increment store (isCodeGenOnly).
+def V6_vS32b_nt_qpred_pi_128B : HInst<
+  (outs IntRegs:$Rx32), (ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+  "if ($Qv4) vmem($Rx32++#$Ii):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_13397056, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00101001110;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b000;
+  let addrMode = PostInc;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Non-temporal vector store gated by "if ($Qv4)", post-incremented by modifier register $Mu2.
+def V6_vS32b_nt_qpred_ppu : HInst<
+  (outs IntRegs:$Rx32), (ins VecPredRegs:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+  "if ($Qv4) vmem($Rx32++$Mu2):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_13425035, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00101011110;
+  let Inst{10-5} = 0b000000;
+  let addrMode = PostInc;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VecPredRegs128B / VectorRegs128B form of the "if ($Qv4)" non-temporal $Mu2 post-increment store (isCodeGenOnly).
+def V6_vS32b_nt_qpred_ppu_128B : HInst<
+  (outs IntRegs:$Rx32), (ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+  "if ($Qv4) vmem($Rx32++$Mu2):nt = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_13425035, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00101011110;
+  let Inst{10-5} = 0b000000;
+  let addrMode = PostInc;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isNonTemporal = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Unconditional vector store with immediate post-increment; flagged isNVStorable and isPredicable.
+def V6_vS32b_pi : HInst<
+  (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+  "vmem($Rx32++#$Ii) = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_3296020, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101001001;
+  let Inst{13-11} = 0b000;
+  let Inst{7-5} = 0b000;
+  let BaseOpcode = "V6_vS32b_pi";
+  let addrMode = PostInc;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isNVStorable = 1;
+  let isPredicable = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B / Vector128Access form of the unconditional immediate post-increment store (isCodeGenOnly).
+def V6_vS32b_pi_128B : HInst<
+  (outs IntRegs:$Rx32), (ins IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+  "vmem($Rx32++#$Ii) = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_2296022, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101001001;
+  let Inst{13-11} = 0b000;
+  let Inst{7-5} = 0b000;
+  let BaseOpcode = "V6_vS32b_pi_128B";
+  let addrMode = PostInc;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isNVStorable = 1;
+  let isPredicable = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Unconditional vector store, post-incremented by modifier register $Mu2; flagged isNVStorable and isPredicable.
+def V6_vS32b_ppu : HInst<(outs IntRegs:$Rx32),
+  (ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32), "vmem($Rx32++$Mu2) = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_11281763, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101011001;
+  let Inst{12-5} = 0b00000000;
+  let BaseOpcode = "V6_vS32b_ppu"; // same key as V6_vS32b_nt_ppu; NOTE(review): presumably the NewValueRel row key - confirm
+  let addrMode = PostInc;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isNVStorable = 1;
+  let isPredicable = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B / Vector128Access form of the unconditional $Mu2 post-increment store (isCodeGenOnly).
+def V6_vS32b_ppu_128B : HInst<(outs IntRegs:$Rx32),
+  (ins IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32), "vmem($Rx32++$Mu2) = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_11281763, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101011001;
+  let Inst{12-5} = 0b00000000;
+  let BaseOpcode = "V6_vS32b_ppu_128B"; // same key as V6_vS32b_nt_ppu_128B; NOTE(review): presumably the NewValueRel row key - confirm
+  let addrMode = PostInc;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isNVStorable = 1;
+  let isPredicable = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Predicated (true-sense) vector store at base + immediate offset; flagged isNVStorable.
+def V6_vS32b_pred_ai : HInst<
+  (outs), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+  "if ($Pv4) vmem($Rt32+#$Ii) = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_10075393, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101000101;
+  let Inst{7-5} = 0b000;
+  let BaseOpcode = "V6_vS32b_ai";
+  let addrMode = BaseImmOffset;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isPredicated = 1;
+  let isNVStorable = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B / Vector128Access form of the predicated base+offset store (isCodeGenOnly).
+def V6_vS32b_pred_ai_128B : HInst<
+  (outs), (ins PredRegs:$Pv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+  "if ($Pv4) vmem($Rt32+#$Ii) = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_9470751, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101000101;
+  let Inst{7-5} = 0b000;
+  let BaseOpcode = "V6_vS32b_ai_128B";
+  let addrMode = BaseImmOffset;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isPredicated = 1;
+  let isNVStorable = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Predicated (true-sense) store, immediate post-increment; tagged NewValueRel like V6_vS32b_pred_ai and V6_vS32b_nt_pred_pi.
+def V6_vS32b_pred_pi : HInst<
+  (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+  "if ($Pv4) vmem($Rx32++#$Ii) = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_15459921, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101001101;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b000;
+  let BaseOpcode = "V6_vS32b_pi";
+  let addrMode = PostInc;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isPredicated = 1;
+  let isNVStorable = 1; // NOTE(review): presumably only reachable by the new-value mapping once NewValueRel is present - confirm
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B form of the predicated immediate post-increment store; tagged NewValueRel like V6_vS32b_pred_ai_128B.
+def V6_vS32b_pred_pi_128B : HInst<
+  (outs IntRegs:$Rx32), (ins PredRegs:$Pv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+  "if ($Pv4) vmem($Rx32++#$Ii) = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_14459927, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101001101;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b000;
+  let BaseOpcode = "V6_vS32b_pi_128B";
+  let addrMode = PostInc;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isPredicated = 1;
+  let isNVStorable = 1; // NOTE(review): presumably only reachable by the new-value mapping once NewValueRel is present - confirm
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Predicated (true-sense) store, post-incremented by $Mu2; NewValueRel and BaseOpcode match V6_vS32b_nt_pred_ppu.
+def V6_vS32b_pred_ppu : HInst<(outs IntRegs:$Rx32),
+  (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32), "if ($Pv4) vmem($Rx32++$Mu2) = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101011101;
+  let Inst{10-5} = 0b000000;
+  let BaseOpcode = "V6_vS32b_ppu"; // NOTE(review): presumably the NewValueRel row key - confirm
+  let addrMode = PostInc;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let isPredicated = 1;
+  let isNVStorable = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B form of the predicated $Mu2 post-increment store; NewValueRel and BaseOpcode match V6_vS32b_nt_pred_ppu_128B.
+def V6_vS32b_pred_ppu_128B : HInst<(outs IntRegs:$Rx32),
+  (ins PredRegs:$Pv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32), "if ($Pv4) vmem($Rx32++$Mu2) = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_15733946, Requires<[HasV60T,UseHVX]>, NewValueRel {
+  let Inst{31-21} = 0b00101011101;
+  let Inst{10-5} = 0b000000;
+  let BaseOpcode = "V6_vS32b_ppu_128B"; // NOTE(review): presumably the NewValueRel row key - confirm
+  let addrMode = PostInc;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let isPredicated = 1;
+  let isNVStorable = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Base+offset vector store gated by "if ($Qv4)" on a VecPredRegs operand; isPredicated is not set.
+def V6_vS32b_qpred_ai : HInst<
+  (outs), (ins VecPredRegs:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs:$Vs32),
+  "if ($Qv4) vmem($Rt32+#$Ii) = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_16279406, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00101000100;
+  let Inst{7-5} = 0b000;
+  let addrMode = BaseImmOffset;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VecPredRegs128B / VectorRegs128B form of the "if ($Qv4)" base+offset store (isCodeGenOnly).
+def V6_vS32b_qpred_ai_128B : HInst<
+  (outs), (ins VecPredRegs128B:$Qv4, IntRegs:$Rt32, s4_0Imm:$Ii, VectorRegs128B:$Vs32),
+  "if ($Qv4) vmem($Rt32+#$Ii) = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_2703240, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00101000100;
+  let Inst{7-5} = 0b000;
+  let addrMode = BaseImmOffset;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Vector store gated by "if ($Qv4)", immediate post-increment; $Rx32 is tied to $Rx32in.
+def V6_vS32b_qpred_pi : HInst<
+  (outs IntRegs:$Rx32), (ins VecPredRegs:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs:$Vs32),
+  "if ($Qv4) vmem($Rx32++#$Ii) = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_12397062, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00101001100;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b000;
+  let addrMode = PostInc;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VecPredRegs128B / VectorRegs128B form of the "if ($Qv4)" immediate post-increment store (isCodeGenOnly).
+def V6_vS32b_qpred_pi_128B : HInst<
+  (outs IntRegs:$Rx32), (ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, s3_0Imm:$Ii, VectorRegs128B:$Vs32),
+  "if ($Qv4) vmem($Rx32++#$Ii) = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_13397056, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00101001100;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b000;
+  let addrMode = PostInc;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Vector store gated by "if ($Qv4)", post-incremented by modifier register $Mu2.
+def V6_vS32b_qpred_ppu : HInst<
+  (outs IntRegs:$Rx32), (ins VecPredRegs:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs:$Vs32),
+  "if ($Qv4) vmem($Rx32++$Mu2) = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_13425035, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00101011100;
+  let Inst{10-5} = 0b000000;
+  let addrMode = PostInc;
+  let accessSize = Vector64Access;
+  let mayStore = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VecPredRegs128B / VectorRegs128B form of the "if ($Qv4)" $Mu2 post-increment store (isCodeGenOnly).
+def V6_vS32b_qpred_ppu_128B : HInst<
+  (outs IntRegs:$Rx32), (ins VecPredRegs128B:$Qv4, IntRegs:$Rx32in, ModRegs:$Mu2, VectorRegs128B:$Vs32),
+  "if ($Qv4) vmem($Rx32++$Mu2) = $Vs32",
+  CVI_VM_ST, TypeCVI_VM_ST>, Enc_13425035, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00101011100;
+  let Inst{10-5} = 0b000000;
+  let addrMode = PostInc;
+  let accessSize = Vector128Access;
+  let mayStore = 1;
+  let Constraints = "$Rx32 = $Rx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// vabsdiff on .h source lanes writing .uh result lanes (per the asm string); result is operand 0 (opNewValue).
+def V6_vabsdiffh : HInst<
+  (outs VectorRegs:$Vd32), (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32.uh = vabsdiff($Vu32.h,$Vv32.h)",
+  CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100110;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b001;
+  let opNewValue = 0;
+  let hasNewValue = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B form of vabsdiff (.h inputs, .uh result per the asm string); marked isCodeGenOnly.
+def V6_vabsdiffh_128B : HInst<
+  (outs VectorRegs128B:$Vd32), (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32.uh = vabsdiff($Vu32.h,$Vv32.h)",
+  CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100110;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b001;
+  let opNewValue = 0;
+  let hasNewValue = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Mapping pseudo (PSEUDO itinerary, TypeMAPPING) carrying the alternate syntax "$Vd32 = vabsdiffh($Vu32,$Vv32)".
+def V6_vabsdiffh_alt : HInst<
+  (outs VectorRegs:$Vd32), (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32 = vabsdiffh($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let opNewValue = 0;
+  let hasNewValue = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// Mapping pseudo (PSEUDO itinerary, TypeMAPPING) for "$Vd32 = vabsdiffh($Vu32,$Vv32)" on VectorRegs128B operands.
+// isCodeGenOnly is set exactly once; a repeated identical `let` adds nothing to the record.
+def V6_vabsdiffh_alt_128B : HInst<
+  (outs VectorRegs128B:$Vd32), (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32 = vabsdiffh($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let opNewValue = 0;
+  let hasNewValue = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// vabsdiff on .ub source lanes writing .ub result lanes (per the asm string); result is operand 0 (opNewValue).
+def V6_vabsdiffub : HInst<
+  (outs VectorRegs:$Vd32), (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32.ub = vabsdiff($Vu32.ub,$Vv32.ub)",
+  CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100110;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b000;
+  let opNewValue = 0;
+  let hasNewValue = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B form of vabsdiff (.ub inputs, .ub result per the asm string); marked isCodeGenOnly.
+def V6_vabsdiffub_128B : HInst<
+  (outs VectorRegs128B:$Vd32), (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32.ub = vabsdiff($Vu32.ub,$Vv32.ub)",
+  CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100110;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b000;
+  let opNewValue = 0;
+  let hasNewValue = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Mapping pseudo (PSEUDO itinerary, TypeMAPPING) carrying the alternate syntax "$Vd32 = vabsdiffub($Vu32,$Vv32)".
+def V6_vabsdiffub_alt : HInst<
+  (outs VectorRegs:$Vd32), (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32 = vabsdiffub($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let opNewValue = 0;
+  let hasNewValue = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// Mapping pseudo (PSEUDO itinerary, TypeMAPPING) for "$Vd32 = vabsdiffub($Vu32,$Vv32)" on VectorRegs128B operands.
+// isCodeGenOnly is set exactly once; a repeated identical `let` adds nothing to the record.
+def V6_vabsdiffub_alt_128B : HInst<
+  (outs VectorRegs128B:$Vd32), (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32 = vabsdiffub($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let opNewValue = 0;
+  let hasNewValue = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// vabsdiff on .uh source lanes writing .uh result lanes (per the asm string); result is operand 0 (opNewValue).
+def V6_vabsdiffuh : HInst<
+  (outs VectorRegs:$Vd32), (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32.uh = vabsdiff($Vu32.uh,$Vv32.uh)",
+  CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100110;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b010;
+  let opNewValue = 0;
+  let hasNewValue = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B form of vabsdiff (.uh inputs, .uh result per the asm string); marked isCodeGenOnly.
+def V6_vabsdiffuh_128B : HInst<
+  (outs VectorRegs128B:$Vd32), (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32.uh = vabsdiff($Vu32.uh,$Vv32.uh)",
+  CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100110;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b010;
+  let opNewValue = 0;
+  let hasNewValue = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Mapping pseudo (PSEUDO itinerary, TypeMAPPING) carrying the alternate syntax "$Vd32 = vabsdiffuh($Vu32,$Vv32)".
+def V6_vabsdiffuh_alt : HInst<
+  (outs VectorRegs:$Vd32), (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32 = vabsdiffuh($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let opNewValue = 0;
+  let hasNewValue = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// Mapping pseudo (PSEUDO itinerary, TypeMAPPING) for "$Vd32 = vabsdiffuh($Vu32,$Vv32)" on VectorRegs128B operands.
+// isCodeGenOnly is set exactly once; a repeated identical `let` adds nothing to the record.
+def V6_vabsdiffuh_alt_128B : HInst<
+  (outs VectorRegs128B:$Vd32), (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32 = vabsdiffuh($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let opNewValue = 0;
+  let hasNewValue = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// vabsdiff on .w source lanes writing .uw result lanes (per the asm string); result is operand 0 (opNewValue).
+def V6_vabsdiffw : HInst<
+  (outs VectorRegs:$Vd32), (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32.uw = vabsdiff($Vu32.w,$Vv32.w)",
+  CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100110;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b011;
+  let opNewValue = 0;
+  let hasNewValue = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B form of vabsdiff (.w inputs, .uw result per the asm string); marked isCodeGenOnly.
+def V6_vabsdiffw_128B : HInst<
+  (outs VectorRegs128B:$Vd32), (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32.uw = vabsdiff($Vu32.w,$Vv32.w)",
+  CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100110;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b011;
+  let opNewValue = 0;
+  let hasNewValue = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Mapping pseudo (PSEUDO itinerary, TypeMAPPING) carrying the alternate syntax "$Vd32 = vabsdiffw($Vu32,$Vv32)".
+def V6_vabsdiffw_alt : HInst<
+  (outs VectorRegs:$Vd32), (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32 = vabsdiffw($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let opNewValue = 0;
+  let hasNewValue = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// Mapping pseudo (PSEUDO itinerary, TypeMAPPING) for "$Vd32 = vabsdiffw($Vu32,$Vv32)" on VectorRegs128B operands.
+// isCodeGenOnly is set exactly once; a repeated identical `let` adds nothing to the record.
+def V6_vabsdiffw_alt_128B : HInst<
+  (outs VectorRegs128B:$Vd32), (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32 = vabsdiffw($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let opNewValue = 0;
+  let hasNewValue = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// vabs on .h lanes of $Vu32 writing .h lanes of $Vd32 (per the asm string); result is operand 0 (opNewValue).
+def V6_vabsh : HInst<
+  (outs VectorRegs:$Vd32), (ins VectorRegs:$Vu32),
+  "$Vd32.h = vabs($Vu32.h)",
+  CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-16} = 0b0001111000000000;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b000;
+  let opNewValue = 0;
+  let hasNewValue = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B form of vabs on .h lanes (per the asm string); marked isCodeGenOnly.
+def V6_vabsh_128B : HInst<
+  (outs VectorRegs128B:$Vd32), (ins VectorRegs128B:$Vu32),
+  "$Vd32.h = vabs($Vu32.h)",
+  CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-16} = 0b0001111000000000;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b000;
+  let opNewValue = 0;
+  let hasNewValue = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+}
+// Mapping pseudo (PSEUDO itinerary, TypeMAPPING) carrying the alternate syntax "$Vd32 = vabsh($Vu32)".
+def V6_vabsh_alt : HInst<
+  (outs VectorRegs:$Vd32), (ins VectorRegs:$Vu32),
+  "$Vd32 = vabsh($Vu32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let opNewValue = 0;
+  let hasNewValue = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// V6_vabsh_alt_128B: 128B-register form of V6_vabsh_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vabsh_alt_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32),
+    "$Vd32 = vabsh($Vu32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vabsh_sat: encoded record (Enc_900013) gated on HasV60T and UseHVX.
+def V6_vabsh_sat : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32),
+    "$Vd32.h = vabs($Vu32.h):sat",
+    CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-16} = 0b0001111000000000;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b001;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vabsh_sat_128B: V6_vabsh_sat on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vabsh_sat_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32),
+    "$Vd32.h = vabs($Vu32.h):sat",
+    CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-16} = 0b0001111000000000;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b001;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vabsh_sat_alt: PSEUDO/TypeMAPPING record for the alternate asm spelling; no Inst bits.
+def V6_vabsh_sat_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32),
+    "$Vd32 = vabsh($Vu32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vabsh_sat_alt_128B: 128B-register form of V6_vabsh_sat_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vabsh_sat_alt_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32),
+    "$Vd32 = vabsh($Vu32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vabsw: encoded record (Enc_900013) gated on HasV60T and UseHVX.
+def V6_vabsw : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32),
+    "$Vd32.w = vabs($Vu32.w)",
+    CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-16} = 0b0001111000000000;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b010;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vabsw_128B: V6_vabsw on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vabsw_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32),
+    "$Vd32.w = vabs($Vu32.w)",
+    CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-16} = 0b0001111000000000;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b010;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vabsw_alt: PSEUDO/TypeMAPPING record for the alternate asm spelling; no Inst bits.
+def V6_vabsw_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32),
+    "$Vd32 = vabsw($Vu32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vabsw_alt_128B: 128B-register form of V6_vabsw_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vabsw_alt_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32),
+    "$Vd32 = vabsw($Vu32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vabsw_sat: encoded record (Enc_900013) gated on HasV60T and UseHVX.
+def V6_vabsw_sat : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32),
+    "$Vd32.w = vabs($Vu32.w):sat",
+    CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-16} = 0b0001111000000000;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b011;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vabsw_sat_128B: V6_vabsw_sat on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vabsw_sat_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32),
+    "$Vd32.w = vabs($Vu32.w):sat",
+    CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-16} = 0b0001111000000000;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b011;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vabsw_sat_alt: PSEUDO/TypeMAPPING record for the alternate asm spelling; no Inst bits.
+def V6_vabsw_sat_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32),
+    "$Vd32 = vabsw($Vu32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vabsw_sat_alt_128B: 128B-register form of V6_vabsw_sat_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vabsw_sat_alt_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32),
+    "$Vd32 = vabsw($Vu32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddb: encoded record (Enc_6223403) gated on HasV60T and UseHVX.
+def V6_vaddb : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32.b = vadd($Vu32.b,$Vv32.b)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011111101;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b110;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddb_128B: V6_vaddb on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vaddb_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32.b = vadd($Vu32.b,$Vv32.b)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011111101;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b110;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vaddb_alt: PSEUDO/TypeMAPPING record for the alternate asm spelling; no Inst bits.
+def V6_vaddb_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32 = vaddb($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddb_alt_128B: 128B-register form of V6_vaddb_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vaddb_alt_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32 = vaddb($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddb_dv: encoded record (Enc_13211717) on VecDblRegs operands; gated on HasV60T and UseHVX.
+def V6_vaddb_dv : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+    "$Vdd32.b = vadd($Vuu32.b,$Vvv32.b)",
+    CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100011;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b100;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddb_dv_128B: V6_vaddb_dv on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vaddb_dv_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+    "$Vdd32.b = vadd($Vuu32.b,$Vvv32.b)",
+    CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100011;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b100;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vaddb_dv_alt: PSEUDO/TypeMAPPING record for the alternate asm spelling; no Inst bits.
+def V6_vaddb_dv_alt : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+    "$Vdd32 = vaddb($Vuu32,$Vvv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddb_dv_alt_128B: 128B-register form of V6_vaddb_dv_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vaddb_dv_alt_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+    "$Vdd32 = vaddb($Vuu32,$Vvv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddbnq: encoded accumulator record (Enc_12535811); $Vx32 is tied to $Vx32in.
+def V6_vaddbnq : HInst<(outs VectorRegs:$Vx32),
+    (ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+    "if (!$Qv4) $Vx32.b += $Vu32.b",
+    CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-24} = 0b00011110;
+  let Inst{21-16} = 0b000001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b011;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddbnq_128B: V6_vaddbnq on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vaddbnq_128B : HInst<(outs VectorRegs128B:$Vx32),
+    (ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+    "if (!$Qv4) $Vx32.b += $Vu32.b",
+    CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-24} = 0b00011110;
+  let Inst{21-16} = 0b000001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b011;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vaddbnq_alt: PSEUDO/TypeMAPPING accumulator record for the alternate asm spelling; no Inst bits.
+def V6_vaddbnq_alt : HInst<(outs VectorRegs:$Vx32),
+    (ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+    "if (!$Qv4.b) $Vx32.b += $Vu32.b",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddbnq_alt_128B: 128B-register form of V6_vaddbnq_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vaddbnq_alt_128B : HInst<(outs VectorRegs128B:$Vx32),
+    (ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+    "if (!$Qv4.b) $Vx32.b += $Vu32.b",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddbq: encoded accumulator record (Enc_12535811); $Vx32 is tied to $Vx32in.
+def V6_vaddbq : HInst<(outs VectorRegs:$Vx32),
+    (ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+    "if ($Qv4) $Vx32.b += $Vu32.b",
+    CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-24} = 0b00011110;
+  let Inst{21-16} = 0b000001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b000;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddbq_128B: V6_vaddbq on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vaddbq_128B : HInst<(outs VectorRegs128B:$Vx32),
+    (ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+    "if ($Qv4) $Vx32.b += $Vu32.b",
+    CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-24} = 0b00011110;
+  let Inst{21-16} = 0b000001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b000;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vaddbq_alt: PSEUDO/TypeMAPPING accumulator record for the alternate asm spelling; no Inst bits.
+def V6_vaddbq_alt : HInst<(outs VectorRegs:$Vx32),
+    (ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+    "if ($Qv4.b) $Vx32.b += $Vu32.b",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddbq_alt_128B: 128B-register form of V6_vaddbq_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vaddbq_alt_128B : HInst<(outs VectorRegs128B:$Vx32),
+    (ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+    "if ($Qv4.b) $Vx32.b += $Vu32.b",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddbsat: encoded record (Enc_6223403) gated on HasV62T and UseHVX.
+def V6_vaddbsat : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32.b = vadd($Vu32.b,$Vv32.b):sat",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b000;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddbsat_128B: V6_vaddbsat on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vaddbsat_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32.b = vadd($Vu32.b,$Vv32.b):sat",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b000;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vaddbsat_alt: PSEUDO/TypeMAPPING record for the alternate asm spelling; no Inst bits.
+def V6_vaddbsat_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32 = vaddb($Vu32,$Vv32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddbsat_alt_128B: 128B-register form of V6_vaddbsat_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vaddbsat_alt_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32 = vaddb($Vu32,$Vv32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddbsat_dv: encoded record (Enc_13211717) on VecDblRegs operands; gated on HasV62T and UseHVX.
+def V6_vaddbsat_dv : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+    "$Vdd32.b = vadd($Vuu32.b,$Vvv32.b):sat",
+    CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV62T,UseHVX]> {
+  let Inst{31-21} = 0b00011110101;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b000;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddbsat_dv_128B: V6_vaddbsat_dv on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vaddbsat_dv_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+    "$Vdd32.b = vadd($Vuu32.b,$Vvv32.b):sat",
+    CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV62T,UseHVX]> {
+  let Inst{31-21} = 0b00011110101;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b000;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vaddbsat_dv_alt: PSEUDO/TypeMAPPING record for the alternate asm spelling; no Inst bits.
+def V6_vaddbsat_dv_alt : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+    "$Vdd32 = vaddb($Vuu32,$Vvv32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddbsat_dv_alt_128B: 128B-register form of V6_vaddbsat_dv_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vaddbsat_dv_alt_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+    "$Vdd32 = vaddb($Vuu32,$Vvv32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddcarry: encoded record (Enc_13691337) with two outputs; $Qx4 is tied to $Qx4in.
+def V6_vaddcarry : HInst<(outs VectorRegs:$Vd32, VecPredRegs:$Qx4),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32, VecPredRegs:$Qx4in),
+    "$Vd32.w = vadd($Vu32.w,$Vv32.w,$Qx4):carry",
+    CVI_VA, TypeCVI_VA>, Enc_13691337, Requires<[HasV62T,UseHVX]> {
+  let Inst{31-21} = 0b00011100101;
+  let Inst{13-13} = 0b1;
+  let Inst{7-7} = 0b0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Qx4 = $Qx4in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let hasNewValue2 = 1;
+  let opNewValue2 = 1;
+}
+// V6_vaddcarry_128B: V6_vaddcarry on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vaddcarry_128B : HInst<(outs VectorRegs128B:$Vd32, VecPredRegs128B:$Qx4),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, VecPredRegs128B:$Qx4in),
+    "$Vd32.w = vadd($Vu32.w,$Vv32.w,$Qx4):carry",
+    CVI_VA, TypeCVI_VA>, Enc_13691337, Requires<[HasV62T,UseHVX]> {
+  let Inst{31-21} = 0b00011100101;
+  let Inst{13-13} = 0b1;
+  let Inst{7-7} = 0b0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Qx4 = $Qx4in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let hasNewValue2 = 1;
+  let opNewValue2 = 1;
+  let isCodeGenOnly = 1;
+}
+// V6_vaddclbh: encoded record (Enc_6223403) gated on HasV62T and UseHVX.
+def V6_vaddclbh : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32.h = vadd(vclb($Vu32.h),$Vv32.h)",
+    CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b000;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddclbh_128B: V6_vaddclbh on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vaddclbh_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32.h = vadd(vclb($Vu32.h),$Vv32.h)",
+    CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b000;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vaddclbw: encoded record (Enc_6223403) gated on HasV62T and UseHVX.
+def V6_vaddclbw : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32.w = vadd(vclb($Vu32.w),$Vv32.w)",
+    CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b001;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddclbw_128B: V6_vaddclbw on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vaddclbw_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32.w = vadd(vclb($Vu32.w),$Vv32.w)",
+    CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b001;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vaddh: encoded record (Enc_6223403) gated on HasV60T and UseHVX.
+def V6_vaddh : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32.h = vadd($Vu32.h,$Vv32.h)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011111101;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b111;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddh_128B: V6_vaddh on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vaddh_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32.h = vadd($Vu32.h,$Vv32.h)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011111101;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b111;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vaddh_alt: PSEUDO/TypeMAPPING record for the alternate asm spelling; no Inst bits.
+def V6_vaddh_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32 = vaddh($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddh_alt_128B: 128B-register form of V6_vaddh_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vaddh_alt_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32 = vaddh($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddh_dv: encoded record (Enc_13211717) on VecDblRegs operands; gated on HasV60T and UseHVX.
+def V6_vaddh_dv : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+    "$Vdd32.h = vadd($Vuu32.h,$Vvv32.h)",
+    CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100011;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b101;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddh_dv_128B: V6_vaddh_dv on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vaddh_dv_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+    "$Vdd32.h = vadd($Vuu32.h,$Vvv32.h)",
+    CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100011;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b101;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vaddh_dv_alt: PSEUDO/TypeMAPPING record for the alternate asm spelling; no Inst bits.
+def V6_vaddh_dv_alt : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+    "$Vdd32 = vaddh($Vuu32,$Vvv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddh_dv_alt_128B: 128B-register form of V6_vaddh_dv_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vaddh_dv_alt_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+    "$Vdd32 = vaddh($Vuu32,$Vvv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddhnq: encoded accumulator record (Enc_12535811); $Vx32 is tied to $Vx32in.
+def V6_vaddhnq : HInst<(outs VectorRegs:$Vx32),
+    (ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+    "if (!$Qv4) $Vx32.h += $Vu32.h",
+    CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-24} = 0b00011110;
+  let Inst{21-16} = 0b000001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b100;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddhnq_128B: V6_vaddhnq on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vaddhnq_128B : HInst<(outs VectorRegs128B:$Vx32),
+    (ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+    "if (!$Qv4) $Vx32.h += $Vu32.h",
+    CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-24} = 0b00011110;
+  let Inst{21-16} = 0b000001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b100;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vaddhnq_alt: PSEUDO/TypeMAPPING accumulator record for the alternate asm spelling; no Inst bits.
+def V6_vaddhnq_alt : HInst<(outs VectorRegs:$Vx32),
+    (ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+    "if (!$Qv4.h) $Vx32.h += $Vu32.h",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddhnq_alt_128B: 128B-register form of V6_vaddhnq_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vaddhnq_alt_128B : HInst<(outs VectorRegs128B:$Vx32),
+    (ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+    "if (!$Qv4.h) $Vx32.h += $Vu32.h",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddhq: encoded accumulator record (Enc_12535811); $Vx32 is tied to $Vx32in.
+def V6_vaddhq : HInst<(outs VectorRegs:$Vx32),
+    (ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+    "if ($Qv4) $Vx32.h += $Vu32.h",
+    CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-24} = 0b00011110;
+  let Inst{21-16} = 0b000001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b001;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddhq_128B: V6_vaddhq on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vaddhq_128B : HInst<(outs VectorRegs128B:$Vx32),
+    (ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+    "if ($Qv4) $Vx32.h += $Vu32.h",
+    CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-24} = 0b00011110;
+  let Inst{21-16} = 0b000001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b001;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vaddhq_alt: PSEUDO/TypeMAPPING accumulator record for the alternate asm spelling; no Inst bits.
+def V6_vaddhq_alt : HInst<(outs VectorRegs:$Vx32),
+    (ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+    "if ($Qv4.h) $Vx32.h += $Vu32.h",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddhq_alt_128B: 128B-register form of V6_vaddhq_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vaddhq_alt_128B : HInst<(outs VectorRegs128B:$Vx32),
+    (ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+    "if ($Qv4.h) $Vx32.h += $Vu32.h",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddhsat: encoded record (Enc_6223403) gated on HasV60T and UseHVX.
+def V6_vaddhsat : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32.h = vadd($Vu32.h,$Vv32.h):sat",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100010;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b011;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddhsat_128B: V6_vaddhsat on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vaddhsat_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32.h = vadd($Vu32.h,$Vv32.h):sat",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100010;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b011;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vaddhsat_alt: PSEUDO/TypeMAPPING record for the alternate asm spelling; no Inst bits.
+def V6_vaddhsat_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32 = vaddh($Vu32,$Vv32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddhsat_alt_128B: 128B-register form of V6_vaddhsat_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vaddhsat_alt_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32 = vaddh($Vu32,$Vv32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddhsat_dv: encoded record (Enc_13211717) on VecDblRegs operands; gated on HasV60T and UseHVX.
+def V6_vaddhsat_dv : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+    "$Vdd32.h = vadd($Vuu32.h,$Vvv32.h):sat",
+    CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100100;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b001;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddhsat_dv_128B: V6_vaddhsat_dv on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vaddhsat_dv_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+    "$Vdd32.h = vadd($Vuu32.h,$Vvv32.h):sat",
+    CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100100;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b001;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vaddhsat_dv_alt: PSEUDO/TypeMAPPING record for the alternate asm spelling; no Inst bits.
+def V6_vaddhsat_dv_alt : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+    "$Vdd32 = vaddh($Vuu32,$Vvv32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddhsat_dv_alt_128B: 128B-register form of V6_vaddhsat_dv_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vaddhsat_dv_alt_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+    "$Vdd32 = vaddh($Vuu32,$Vvv32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddhw: encoded record (Enc_15290236), VecDblRegs result from two VectorRegs inputs.
+def V6_vaddhw : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vdd32.w = vadd($Vu32.h,$Vv32.h)",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100101;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b100;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddhw_128B: V6_vaddhw on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vaddhw_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vdd32.w = vadd($Vu32.h,$Vv32.h)",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100101;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b100;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vaddhw_acc: encoded accumulator record (Enc_5972412); $Vxx32 is tied to $Vxx32in.
+def V6_vaddhw_acc : HInst<(outs VecDblRegs:$Vxx32),
+    (ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vxx32.w += vadd($Vu32.h,$Vv32.h)",
+    CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV62T,UseHVX]> {
+  let Inst{31-21} = 0b00011100001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b010;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddhw_acc_128B: V6_vaddhw_acc on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vaddhw_acc_128B : HInst<(outs VecDblRegs128B:$Vxx32),
+    (ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vxx32.w += vadd($Vu32.h,$Vv32.h)",
+    CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV62T,UseHVX]> {
+  let Inst{31-21} = 0b00011100001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b010;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vaddhw_acc_alt: PSEUDO/TypeMAPPING accumulator record for the alternate asm spelling; no Inst bits.
+def V6_vaddhw_acc_alt : HInst<(outs VecDblRegs:$Vxx32),
+    (ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vxx32 += vaddh($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddhw_acc_alt_128B: 128B-register form of V6_vaddhw_acc_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vaddhw_acc_alt_128B : HInst<(outs VecDblRegs128B:$Vxx32),
+    (ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vxx32 += vaddh($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddhw_alt: PSEUDO/TypeMAPPING record for the alternate asm spelling; no Inst bits.
+def V6_vaddhw_alt : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vdd32 = vaddh($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddhw_alt_128B: 128B-register form of V6_vaddhw_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vaddhw_alt_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vdd32 = vaddh($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddubh: encoded record (Enc_15290236), VecDblRegs result from two VectorRegs inputs.
+def V6_vaddubh : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vdd32.h = vadd($Vu32.ub,$Vv32.ub)",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100101;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b010;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddubh_128B: V6_vaddubh on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vaddubh_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vdd32.h = vadd($Vu32.ub,$Vv32.ub)",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+  let Inst{31-21} = 0b00011100101;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b010;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vaddubh_acc: encoded accumulator record (Enc_5972412); $Vxx32 is tied to $Vxx32in.
+def V6_vaddubh_acc : HInst<(outs VecDblRegs:$Vxx32),
+    (ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vxx32.h += vadd($Vu32.ub,$Vv32.ub)",
+    CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV62T,UseHVX]> {
+  let Inst{31-21} = 0b00011100010;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b101;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddubh_acc_128B: V6_vaddubh_acc on the 128B register classes with the same Inst bits; isCodeGenOnly.
+def V6_vaddubh_acc_128B : HInst<(outs VecDblRegs128B:$Vxx32),
+    (ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vxx32.h += vadd($Vu32.ub,$Vv32.ub)",
+    CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV62T,UseHVX]> {
+  let Inst{31-21} = 0b00011100010;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b101;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+}
+// V6_vaddubh_acc_alt: PSEUDO/TypeMAPPING accumulator record for the alternate asm spelling; no Inst bits.
+def V6_vaddubh_acc_alt : HInst<(outs VecDblRegs:$Vxx32),
+    (ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vxx32 += vaddub($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddubh_acc_alt_128B: 128B-register form of V6_vaddubh_acc_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vaddubh_acc_alt_128B : HInst<(outs VecDblRegs128B:$Vxx32),
+    (ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vxx32 += vaddub($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddubh_alt: PSEUDO/TypeMAPPING record for the alternate asm spelling; no Inst bits.
+def V6_vaddubh_alt : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vdd32 = vaddub($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// V6_vaddubh_alt_128B: 128B-register form of V6_vaddubh_alt (PSEUDO/TypeMAPPING, no Inst bits).
+// isCodeGenOnly is assigned exactly once in this record.
+def V6_vaddubh_alt_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vdd32 = vaddub($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+def V6_vaddubsat : HInst< // Encoded instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vadd($Vu32.ub,$Vv32.ub):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubsat_128B : HInst< // V6_vaddubsat with the *128B register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vadd($Vu32.ub,$Vv32.ub):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddubsat_alt : HInst< // PSEUDO/TypeMAPPING record for the asm string below; pins no Inst bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaddub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// V6_vaddubsat_alt with the *128B register classes: PSEUDO/TypeMAPPING record that pins no Inst bits.
+def V6_vaddubsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaddub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubsat_dv : HInst< // Encoded instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.ub = vadd($Vuu32.ub,$Vvv32.ub):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddubsat_dv_128B : HInst< // V6_vaddubsat_dv with the *128B register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.ub = vadd($Vuu32.ub,$Vvv32.ub):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddubsat_dv_alt : HInst< // PSEUDO/TypeMAPPING record for the asm string below; pins no Inst bits.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vaddub($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// V6_vaddubsat_dv_alt with the *128B register classes: PSEUDO/TypeMAPPING record that pins no Inst bits.
+def V6_vaddubsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vaddub($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddububb_sat : HInst< // Encoded instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vadd($Vu32.ub,$Vv32.b):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddububb_sat_128B : HInst< // V6_vaddububb_sat with the *128B register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vadd($Vu32.ub,$Vv32.b):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vadduhsat : HInst< // Encoded instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vadd($Vu32.uh,$Vv32.uh):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhsat_128B : HInst< // V6_vadduhsat with the *128B register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vadd($Vu32.uh,$Vv32.uh):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vadduhsat_alt : HInst< // PSEUDO/TypeMAPPING record for the asm string below; pins no Inst bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vadduh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// V6_vadduhsat_alt with the *128B register classes: PSEUDO/TypeMAPPING record that pins no Inst bits.
+def V6_vadduhsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vadduh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhsat_dv : HInst< // Encoded instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.uh = vadd($Vuu32.uh,$Vvv32.uh):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhsat_dv_128B : HInst< // V6_vadduhsat_dv with the *128B register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.uh = vadd($Vuu32.uh,$Vvv32.uh):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vadduhsat_dv_alt : HInst< // PSEUDO/TypeMAPPING record for the asm string below; pins no Inst bits.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vadduh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// V6_vadduhsat_dv_alt with the *128B register classes: PSEUDO/TypeMAPPING record that pins no Inst bits.
+def V6_vadduhsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vadduh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhw : HInst< // Encoded instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.w = vadd($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduhw_128B : HInst< // V6_vadduhw with the *128B register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.w = vadd($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vadduhw_acc : HInst< // Encoded accumulating instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.w += vadd($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // ties output $Vxx32 to input $Vxx32in
+}
+def V6_vadduhw_acc_128B : HInst< // V6_vadduhw_acc with the *128B register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.w += vadd($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in"; // ties output $Vxx32 to input $Vxx32in
+}
+def V6_vadduhw_acc_alt : HInst< // PSEUDO/TypeMAPPING record for the asm string below; pins no Inst bits.
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vadduh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // ties output $Vxx32 to input $Vxx32in
+}
+// V6_vadduhw_acc_alt with the *128B register classes: PSEUDO/TypeMAPPING record that pins no Inst bits.
+def V6_vadduhw_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vadduh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // ties output $Vxx32 to input $Vxx32in
+}
+def V6_vadduhw_alt : HInst< // PSEUDO/TypeMAPPING record for the asm string below; pins no Inst bits.
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vadduh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// V6_vadduhw_alt with the *128B register classes: PSEUDO/TypeMAPPING record that pins no Inst bits.
+def V6_vadduhw_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vadduh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduwsat : HInst< // Encoded instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uw = vadd($Vu32.uw,$Vv32.uw):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduwsat_128B : HInst< // V6_vadduwsat with the *128B register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uw = vadd($Vu32.uw,$Vv32.uw):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vadduwsat_alt : HInst< // PSEUDO/TypeMAPPING record for the asm string below; pins no Inst bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vadduw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// V6_vadduwsat_alt with the *128B register classes: PSEUDO/TypeMAPPING record that pins no Inst bits.
+def V6_vadduwsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vadduw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduwsat_dv : HInst< // Encoded instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.uw = vadd($Vuu32.uw,$Vvv32.uw):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vadduwsat_dv_128B : HInst< // V6_vadduwsat_dv with the *128B register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.uw = vadd($Vuu32.uw,$Vvv32.uw):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vadduwsat_dv_alt : HInst< // PSEUDO/TypeMAPPING record for the asm string below; pins no Inst bits.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vadduw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// V6_vadduwsat_dv_alt with the *128B register classes: PSEUDO/TypeMAPPING record that pins no Inst bits.
+def V6_vadduwsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vadduw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddw : HInst< // Encoded instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vadd($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddw_128B : HInst< // V6_vaddw with the *128B register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vadd($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddw_alt : HInst< // PSEUDO/TypeMAPPING record for the asm string below; pins no Inst bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaddw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// V6_vaddw_alt with the *128B register classes: PSEUDO/TypeMAPPING record that pins no Inst bits.
+def V6_vaddw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaddw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddw_dv : HInst< // Encoded instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.w = vadd($Vuu32.w,$Vvv32.w)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddw_dv_128B : HInst< // V6_vaddw_dv with the *128B register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.w = vadd($Vuu32.w,$Vvv32.w)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddw_dv_alt : HInst< // PSEUDO/TypeMAPPING record for the asm string below; pins no Inst bits.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vaddw($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// V6_vaddw_dv_alt with the *128B register classes: PSEUDO/TypeMAPPING record that pins no Inst bits.
+def V6_vaddw_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vaddw($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddwnq : HInst< // Encoded accumulating instruction predicated on !$Qv4; the Inst{...} lets below pin its constant encoding bits.
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4) $Vx32.w += $Vu32.w",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vaddwnq_128B : HInst< // V6_vaddwnq with the *128B register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4) $Vx32.w += $Vu32.w",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vaddwnq_alt : HInst< // PSEUDO/TypeMAPPING record for the asm string below; pins no Inst bits.
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4.w) $Vx32.w += $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+// V6_vaddwnq_alt with the *128B register classes: PSEUDO/TypeMAPPING record that pins no Inst bits.
+def V6_vaddwnq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4.w) $Vx32.w += $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vaddwq : HInst< // Encoded accumulating instruction predicated on $Qv4; the Inst{...} lets below pin its constant encoding bits.
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4) $Vx32.w += $Vu32.w",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vaddwq_128B : HInst< // V6_vaddwq with the *128B register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4) $Vx32.w += $Vu32.w",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vaddwq_alt : HInst< // PSEUDO/TypeMAPPING record for the asm string below; pins no Inst bits.
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4.w) $Vx32.w += $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+// V6_vaddwq_alt with the *128B register classes: PSEUDO/TypeMAPPING record that pins no Inst bits.
+def V6_vaddwq_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4.w) $Vx32.w += $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vaddwsat : HInst< // Encoded instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vadd($Vu32.w,$Vv32.w):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddwsat_128B : HInst< // V6_vaddwsat with the *128B register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vadd($Vu32.w,$Vv32.w):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddwsat_alt : HInst< // PSEUDO/TypeMAPPING record for the asm string below; pins no Inst bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaddw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// V6_vaddwsat_alt with the *128B register classes: PSEUDO/TypeMAPPING record that pins no Inst bits.
+def V6_vaddwsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaddw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddwsat_dv : HInst< // Encoded instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.w = vadd($Vuu32.w,$Vvv32.w):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaddwsat_dv_128B : HInst< // V6_vaddwsat_dv with the *128B register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.w = vadd($Vuu32.w,$Vvv32.w):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vaddwsat_dv_alt : HInst< // PSEUDO/TypeMAPPING record for the asm string below; pins no Inst bits.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vaddw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// V6_vaddwsat_dv_alt with the *128B register classes: PSEUDO/TypeMAPPING record that pins no Inst bits.
+def V6_vaddwsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vaddw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_valignb : HInst< // Encoded instruction taking a register operand $Rt8 (IntRegsLow8); the Inst{...} lets below pin its constant encoding bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = valign($Vu32,$Vv32,$Rt8)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_valignb_128B : HInst< // V6_valignb with the *128B vector register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = valign($Vu32,$Vv32,$Rt8)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_valignbi : HInst< // Encoded instruction taking an immediate operand $Ii (u3_0Imm); the Inst{...} lets below pin its constant encoding bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, u3_0Imm:$Ii),
+"$Vd32 = valign($Vu32,$Vv32,#$Ii)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_7171569, Requires<[HasV60T,UseHVX]> {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_valignbi_128B : HInst< // V6_valignbi with the *128B vector register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, u3_0Imm:$Ii),
+"$Vd32 = valign($Vu32,$Vv32,#$Ii)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_7171569, Requires<[HasV60T,UseHVX]> {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vand : HInst< // Encoded instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vand($Vu32,$Vv32)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vand_128B : HInst< // V6_vand with the *128B register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vand($Vu32,$Vv32)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vandnqrt : HInst< // Encoded instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vd32 = vand(!$Qu4,$Rt32)",
+CVI_VX, TypeCVI_VX>, Enc_4711514, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-10} = 0b0001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandnqrt_128B : HInst< // V6_vandnqrt with the *128B vector/predicate register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vd32 = vand(!$Qu4,$Rt32)",
+CVI_VX, TypeCVI_VX>, Enc_4711514, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-10} = 0b0001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vandnqrt_acc : HInst< // Encoded accumulating (|=) instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vx32 |= vand(!$Qu4,$Rt32)",
+CVI_VX, TypeCVI_VX>, Enc_4944558, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vandnqrt_acc_128B : HInst< // V6_vandnqrt_acc with the *128B vector/predicate register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vx32 |= vand(!$Qu4,$Rt32)",
+CVI_VX, TypeCVI_VX>, Enc_4944558, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-10} = 0b1001;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vandnqrt_acc_alt : HInst< // PSEUDO/TypeMAPPING record for the asm string below; pins no Inst bits.
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vx32.ub |= vand(!$Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+// V6_vandnqrt_acc_alt with the *128B vector/predicate register classes: PSEUDO/TypeMAPPING record that pins no Inst bits.
+def V6_vandnqrt_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vx32.ub |= vand(!$Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vandnqrt_alt : HInst< // PSEUDO/TypeMAPPING record for the asm string below; pins no Inst bits.
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vd32.ub = vand(!$Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// V6_vandnqrt_alt with the *128B vector/predicate register classes: PSEUDO/TypeMAPPING record that pins no Inst bits.
+def V6_vandnqrt_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vd32.ub = vand(!$Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandqrt : HInst< // Encoded instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vd32 = vand($Qu4,$Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_4711514, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandqrt_128B : HInst< // V6_vandqrt with the *128B vector/predicate register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vd32 = vand($Qu4,$Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_4711514, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-10} = 0b0000;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vandqrt_acc : HInst< // Encoded accumulating (|=) instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vx32 |= vand($Qu4,$Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_4944558, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vandqrt_acc_128B : HInst< // V6_vandqrt_acc with the *128B vector/predicate register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vx32 |= vand($Qu4,$Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_4944558, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-10} = 0b1000;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vandqrt_acc_alt : HInst< // PSEUDO/TypeMAPPING record for the asm string below; pins no Inst bits.
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vx32.ub |= vand($Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+// V6_vandqrt_acc_alt with the *128B vector/predicate register classes: PSEUDO/TypeMAPPING record that pins no Inst bits.
+def V6_vandqrt_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vx32.ub |= vand($Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vandqrt_alt : HInst< // PSEUDO/TypeMAPPING record for the asm string below; pins no Inst bits.
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qu4, IntRegs:$Rt32),
+"$Vd32.ub = vand($Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// V6_vandqrt_alt with the *128B vector/predicate register classes: PSEUDO/TypeMAPPING record that pins no Inst bits.
+def V6_vandqrt_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qu4, IntRegs:$Rt32),
+"$Vd32.ub = vand($Qu4.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandvnqv : HInst< // Encoded instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vu32),
+"$Vd32 = vand(!$Qv4,$Vu32)",
+CVI_VA, TypeCVI_VA>, Enc_1220199, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandvnqv_128B : HInst< // V6_vandvnqv with the *128B register classes: same asm string and Inst bits, plus isCodeGenOnly.
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vu32),
+"$Vd32 = vand(!$Qv4,$Vu32)",
+CVI_VA, TypeCVI_VA>, Enc_1220199, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vandvqv : HInst< // Encoded instruction; the Inst{...} lets below pin its constant encoding bits.
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vu32),
+"$Vd32 = vand($Qv4,$Vu32)",
+CVI_VA, TypeCVI_VA>, Enc_1220199, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandvqv_128B : HInst< // 128B-register-class twin of V6_vandvqv: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vu32),
+"$Vd32 = vand($Qv4,$Vu32)",
+CVI_VA, TypeCVI_VA>, Enc_1220199, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000011;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vandvqv does not assign it.
+}
+def V6_vandvrt : HInst< // Encoded record: vand of a vector and a scalar register, producing a vector predicate.
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Qd4 = vand($Vu32,$Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_11498120, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-2} = 0b010010; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Qd4) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandvrt_128B : HInst< // 128B-register-class twin of V6_vandvrt: same asm string, Enc class and Inst bits.
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Qd4 = vand($Vu32,$Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_11498120, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vandvrt does not assign it.
+}
+def V6_vandvrt_acc : HInst< // Encoded record: accumulating ("|=") form of vand into a vector predicate.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Qx4 |= vand($Vu32,$Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_10612292, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-2} = 0b100000; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Qx4) is the new-value result - TODO confirm in HInst
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // Ties the $Qx4 output to the $Qx4in input, matching the "|=" asm form.
+}
+def V6_vandvrt_acc_128B : HInst< // 128B-register-class twin of V6_vandvrt_acc: same asm string, Enc class and Inst bits.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Qx4 |= vand($Vu32,$Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_10612292, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vandvrt_acc does not assign it.
+let Constraints = "$Qx4 = $Qx4in"; // Ties the $Qx4 output to the $Qx4in input, matching the "|=" asm form.
+}
+def V6_vandvrt_acc_alt : HInst< // Pseudo with V6_vandvrt_acc's operand list; asm adds .ub suffixes to every operand.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Qx4.ub |= vand($Vu32.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Qx4) is the new-value result - TODO confirm in HInst
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // Ties the $Qx4 output to the $Qx4in input, matching the "|=" asm form.
+}
+def V6_vandvrt_acc_alt_128B : HInst< // 128B-register-class twin of V6_vandvrt_acc_alt (same asm string, still a pseudo).
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Qx4.ub |= vand($Vu32.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Repeats the isCodeGenOnly assignment above with the same value (redundant).
+let Constraints = "$Qx4 = $Qx4in"; // Ties the $Qx4 output to the $Qx4in input, matching the "|=" asm form.
+}
+def V6_vandvrt_alt : HInst< // Pseudo with V6_vandvrt's operand list; asm adds .ub suffixes to every operand.
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Qd4.ub = vand($Vu32.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Qd4) is the new-value result - TODO confirm in HInst
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vandvrt_alt_128B : HInst< // 128B-register-class twin of V6_vandvrt_alt (same asm string, still a pseudo).
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Qd4.ub = vand($Vu32.ub,$Rt32.ub)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Repeats the isCodeGenOnly assignment above with the same value (redundant).
+}
+def V6_vaslh : HInst< // Encoded record: vasl on .h elements of a vector, shift amount from a scalar register.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vasl($Vu32.h,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b000; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslh_128B : HInst< // 128B-register-class twin of V6_vaslh: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vasl($Vu32.h,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vaslh does not assign it.
+}
+def V6_vaslh_alt : HInst< // Pseudo with V6_vaslh's operand list; asm uses the "vaslh(...)" spelling without .h suffixes.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vaslh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslh_alt_128B : HInst< // 128B-register-class twin of V6_vaslh_alt (same asm string, still a pseudo).
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vaslh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Repeats the isCodeGenOnly assignment above with the same value (redundant).
+}
+def V6_vaslhv : HInst< // Encoded record: vasl on .h elements with the shift operand taken from a second vector.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vasl($Vu32.h,$Vv32.h)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b101; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslhv_128B : HInst< // 128B-register-class twin of V6_vaslhv: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vasl($Vu32.h,$Vv32.h)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vaslhv does not assign it.
+}
+def V6_vaslhv_alt : HInst< // Pseudo with V6_vaslhv's operand list; asm uses the "vaslh(...)" spelling without .h suffixes.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaslh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslhv_alt_128B : HInst< // 128B-register-class twin of V6_vaslhv_alt (same asm string, still a pseudo).
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaslh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Repeats the isCodeGenOnly assignment above with the same value (redundant).
+}
+def V6_vaslw : HInst< // Encoded record: vasl on .w elements of a vector, shift amount from a scalar register.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vasl($Vu32.w,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b111; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslw_128B : HInst< // 128B-register-class twin of V6_vaslw: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vasl($Vu32.w,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vaslw does not assign it.
+}
+def V6_vaslw_acc : HInst< // Encoded record: accumulating ("+=") form of the .w vasl-by-scalar operation.
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vasl($Vu32.w,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_10058269, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b010; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vx32) is the new-value result - TODO confirm in HInst
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // Ties the $Vx32 output to the $Vx32in input, matching the "+=" asm form.
+}
+def V6_vaslw_acc_128B : HInst< // 128B-register-class twin of V6_vaslw_acc: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vasl($Vu32.w,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vaslw_acc does not assign it.
+let Constraints = "$Vx32 = $Vx32in"; // Ties the $Vx32 output to the $Vx32in input, matching the "+=" asm form.
+}
+def V6_vaslw_acc_alt : HInst< // Pseudo with V6_vaslw_acc's operand list; asm uses the "vaslw(...)" spelling without .w suffixes.
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vaslw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vx32) is the new-value result - TODO confirm in HInst
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // Ties the $Vx32 output to the $Vx32in input, matching the "+=" asm form.
+}
+def V6_vaslw_acc_alt_128B : HInst< // 128B-register-class twin of V6_vaslw_acc_alt (same asm string, still a pseudo).
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vaslw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Repeats the isCodeGenOnly assignment above with the same value (redundant).
+let Constraints = "$Vx32 = $Vx32in"; // Ties the $Vx32 output to the $Vx32in input, matching the "+=" asm form.
+}
+def V6_vaslw_alt : HInst< // Pseudo with V6_vaslw's operand list; asm uses the "vaslw(...)" spelling without .w suffixes.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vaslw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslw_alt_128B : HInst< // 128B-register-class twin of V6_vaslw_alt (same asm string, still a pseudo).
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vaslw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Repeats the isCodeGenOnly assignment above with the same value (redundant).
+}
+def V6_vaslwv : HInst< // Encoded record: vasl on .w elements with the shift operand taken from a second vector.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vasl($Vu32.w,$Vv32.w)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b100; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslwv_128B : HInst< // 128B-register-class twin of V6_vaslwv: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vasl($Vu32.w,$Vv32.w)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vaslwv does not assign it.
+}
+def V6_vaslwv_alt : HInst< // Pseudo with V6_vaslwv's operand list; asm uses the "vaslw(...)" spelling without .w suffixes.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vaslw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vaslwv_alt_128B : HInst< // 128B-register-class twin of V6_vaslwv_alt (same asm string, still a pseudo).
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vaslw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Repeats the isCodeGenOnly assignment above with the same value (redundant).
+}
+def V6_vasrh : HInst< // Encoded record: vasr on .h elements of a vector, shift amount from a scalar register.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vasr($Vu32.h,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b110; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrh_128B : HInst< // 128B-register-class twin of V6_vasrh: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vasr($Vu32.h,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vasrh does not assign it.
+}
+def V6_vasrh_alt : HInst< // Pseudo with V6_vasrh's operand list; asm uses the "vasrh(...)" spelling without .h suffixes.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vasrh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrh_alt_128B : HInst< // 128B-register-class twin of V6_vasrh_alt (same asm string, still a pseudo).
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vasrh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Repeats the isCodeGenOnly assignment above with the same value (redundant).
+}
+def V6_vasrhbrndsat : HInst< // Encoded record: vasr of two .h vectors into a .b result with :rnd:sat modifiers.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vasr($Vu32.h,$Vv32.h,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b000; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhbrndsat_128B : HInst< // 128B-register-class twin of V6_vasrhbrndsat: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vasr($Vu32.h,$Vv32.h,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vasrhbrndsat does not assign it.
+}
+def V6_vasrhbrndsat_alt : HInst< // Pseudo with V6_vasrhbrndsat's operand list; asm uses the "vasrhb(...)" spelling.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrhb($Vu32,$Vv32,$Rt8):rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> { // Requires only HasV60T (no UseHVX), unlike V6_vasrhbrndsat; no Enc_* class or Inst bits.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let isPseudo = 1;
+let isCodeGenOnly = 1; // NOTE(review): no DecoderNamespace is set and no _alt_128B def follows - confirm intended.
+}
+def V6_vasrhbsat : HInst< // Encoded record: vasr of two .h vectors into a .b result with the :sat modifier.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vasr($Vu32.h,$Vv32.h,$Rt8):sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV62T,UseHVX]> { // Gated on HasV62T (not HasV60T) plus UseHVX.
+let Inst{7-5} = 0b000; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhbsat_128B : HInst< // 128B-register-class twin of V6_vasrhbsat: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vasr($Vu32.h,$Vv32.h,$Rt8):sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vasrhbsat does not assign it.
+}
+def V6_vasrhubrndsat : HInst< // Encoded record: vasr of two .h vectors into a .ub result with :rnd:sat modifiers.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.ub = vasr($Vu32.h,$Vv32.h,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b111; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhubrndsat_128B : HInst< // 128B-register-class twin of V6_vasrhubrndsat: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.ub = vasr($Vu32.h,$Vv32.h,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vasrhubrndsat does not assign it.
+}
+def V6_vasrhubrndsat_alt : HInst< // Pseudo with V6_vasrhubrndsat's operand list; asm uses the "vasrhub(...)" spelling.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrhub($Vu32,$Vv32,$Rt8):rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> { // Requires only HasV60T (no UseHVX), unlike V6_vasrhubrndsat; no Enc_* class or Inst bits.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let isPseudo = 1;
+let isCodeGenOnly = 1; // NOTE(review): no DecoderNamespace is set and no _alt_128B def follows - confirm intended.
+}
+def V6_vasrhubsat : HInst< // Encoded record: vasr of two .h vectors into a .ub result with the :sat modifier.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.ub = vasr($Vu32.h,$Vv32.h,$Rt8):sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b110; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhubsat_128B : HInst< // 128B-register-class twin of V6_vasrhubsat: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.ub = vasr($Vu32.h,$Vv32.h,$Rt8):sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vasrhubsat does not assign it.
+}
+def V6_vasrhubsat_alt : HInst< // Pseudo with V6_vasrhubsat's operand list; asm uses the "vasrhub(...)" spelling.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrhub($Vu32,$Vv32,$Rt8):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> { // Requires only HasV60T (no UseHVX), unlike V6_vasrhubsat; no Enc_* class or Inst bits.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let isPseudo = 1;
+let isCodeGenOnly = 1; // NOTE(review): no DecoderNamespace is set and no _alt_128B def follows - confirm intended.
+}
+def V6_vasrhv : HInst< // Encoded record: vasr on .h elements with the shift operand taken from a second vector.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vasr($Vu32.h,$Vv32.h)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b011; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhv_128B : HInst< // 128B-register-class twin of V6_vasrhv: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vasr($Vu32.h,$Vv32.h)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vasrhv does not assign it.
+}
+def V6_vasrhv_alt : HInst< // Pseudo with V6_vasrhv's operand list; asm uses the "vasrh(...)" spelling without .h suffixes.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vasrh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrhv_alt_128B : HInst< // 128B-register-class twin of V6_vasrhv_alt (same asm string, still a pseudo).
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vasrh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Repeats the isCodeGenOnly assignment above with the same value (redundant).
+}
+def V6_vasruwuhrndsat : HInst< // Encoded record: vasr of two .uw vectors into a .uh result with :rnd:sat modifiers.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.uh = vasr($Vu32.uw,$Vv32.uw,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV62T,UseHVX]> { // Gated on HasV62T (not HasV60T) plus UseHVX.
+let Inst{7-5} = 0b001; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasruwuhrndsat_128B : HInst< // 128B-register-class twin of V6_vasruwuhrndsat: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.uh = vasr($Vu32.uw,$Vv32.uw,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vasruwuhrndsat does not assign it.
+}
+def V6_vasrw : HInst< // Encoded record: vasr on .w elements of a vector, shift amount from a scalar register.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vasr($Vu32.w,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b101; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0; // V6_vasrw_acc carries the same Inst{7-5} and Inst{31-21} values but sets this bit to 1.
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrw_128B : HInst< // 128B-register-class twin of V6_vasrw: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vasr($Vu32.w,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vasrw does not assign it.
+}
+def V6_vasrw_acc : HInst< // Encoded record: accumulating ("+=") form of the .w vasr-by-scalar operation.
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vasr($Vu32.w,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_10058269, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b101; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b1; // V6_vasrw carries the same Inst{7-5} and Inst{31-21} values but sets this bit to 0.
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vx32) is the new-value result - TODO confirm in HInst
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // Ties the $Vx32 output to the $Vx32in input, matching the "+=" asm form.
+}
+def V6_vasrw_acc_128B : HInst< // 128B-register-class twin of V6_vasrw_acc: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vasr($Vu32.w,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vasrw_acc does not assign it.
+let Constraints = "$Vx32 = $Vx32in"; // Ties the $Vx32 output to the $Vx32in input, matching the "+=" asm form.
+}
+def V6_vasrw_acc_alt : HInst< // Pseudo with V6_vasrw_acc's operand list; asm uses the "vasrw(...)" spelling without .w suffixes.
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vasrw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vx32) is the new-value result - TODO confirm in HInst
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // Ties the $Vx32 output to the $Vx32in input, matching the "+=" asm form.
+}
+def V6_vasrw_acc_alt_128B : HInst< // 128B-register-class twin of V6_vasrw_acc_alt (same asm string, still a pseudo).
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vasrw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Repeats the isCodeGenOnly assignment above with the same value (redundant).
+let Constraints = "$Vx32 = $Vx32in"; // Ties the $Vx32 output to the $Vx32in input, matching the "+=" asm form.
+}
+def V6_vasrw_alt : HInst< // Pseudo with V6_vasrw's operand list; asm uses the "vasrw(...)" spelling without .w suffixes.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vasrw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrw_alt_128B : HInst< // 128B-register-class twin of V6_vasrw_alt (same asm string, still a pseudo).
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vasrw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Repeats the isCodeGenOnly assignment above with the same value (redundant).
+}
+def V6_vasrwh : HInst< // Encoded record: vasr of two .w vectors into a .h result, with no :rnd/:sat modifier.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8)",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b010; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwh_128B : HInst< // 128B-register-class twin of V6_vasrwh: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8)",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vasrwh does not assign it.
+}
+def V6_vasrwh_alt : HInst< // Pseudo with V6_vasrwh's operand list; asm uses the "vasrwh(...)" spelling.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> { // Requires only HasV60T (no UseHVX), unlike V6_vasrwh; no Enc_* class or Inst bits.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let isPseudo = 1;
+let isCodeGenOnly = 1; // NOTE(review): no DecoderNamespace is set and no _alt_128B def follows - confirm intended.
+}
+def V6_vasrwhrndsat : HInst< // Encoded record: vasr of two .w vectors into a .h result with :rnd:sat modifiers.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b100; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwhrndsat_128B : HInst< // 128B-register-class twin of V6_vasrwhrndsat: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vasrwhrndsat does not assign it.
+}
+def V6_vasrwhrndsat_alt : HInst< // Pseudo with V6_vasrwhrndsat's operand list; asm uses the "vasrwh(...)" spelling.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8):rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> { // Requires only HasV60T (no UseHVX), unlike V6_vasrwhrndsat; no Enc_* class or Inst bits.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let isPseudo = 1;
+let isCodeGenOnly = 1; // NOTE(review): no DecoderNamespace is set and no _alt_128B def follows - confirm intended.
+}
+def V6_vasrwhsat : HInst< // Encoded record: vasr of two .w vectors into a .h result with the :sat modifier.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8):sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b011; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwhsat_128B : HInst< // 128B-register-class twin of V6_vasrwhsat: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.h = vasr($Vu32.w,$Vv32.w,$Rt8):sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vasrwhsat does not assign it.
+}
+def V6_vasrwhsat_alt : HInst< // Pseudo with V6_vasrwhsat's operand list; asm uses the "vasrwh(...)" spelling.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> { // Requires only HasV60T (no UseHVX), unlike V6_vasrwhsat; no Enc_* class or Inst bits.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let isPseudo = 1;
+let isCodeGenOnly = 1; // NOTE(review): no DecoderNamespace is set and no _alt_128B def follows - confirm intended.
+}
+def V6_vasrwuhrndsat : HInst< // Encoded record: vasr of two .w vectors into a .uh result with :rnd:sat modifiers.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.uh = vasr($Vu32.w,$Vv32.w,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV62T,UseHVX]> { // Gated on HasV62T (not HasV60T) plus UseHVX.
+let Inst{7-5} = 0b010; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwuhrndsat_128B : HInst< // 128B-register-class twin of V6_vasrwuhrndsat: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.uh = vasr($Vu32.w,$Vv32.w,$Rt8):rnd:sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vasrwuhrndsat does not assign it.
+}
+def V6_vasrwuhsat : HInst< // Encoded record: vasr of two .w vectors into a .uh result with the :sat modifier.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.uh = vasr($Vu32.w,$Vv32.w,$Rt8):sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b101; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwuhsat_128B : HInst< // 128B-register-class twin of V6_vasrwuhsat: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.uh = vasr($Vu32.w,$Vv32.w,$Rt8):sat",
+CVI_VS, TypeCVI_VS>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vasrwuhsat does not assign it.
+}
+def V6_vasrwuhsat_alt : HInst< // Pseudo with V6_vasrwuhsat's operand list; asm uses the "vasrwuh(...)" spelling.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vasrwuh($Vu32,$Vv32,$Rt8):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T]> { // Requires only HasV60T (no UseHVX), unlike V6_vasrwuhsat; no Enc_* class or Inst bits.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let isPseudo = 1;
+let isCodeGenOnly = 1; // NOTE(review): no DecoderNamespace is set and no _alt_128B def follows - confirm intended.
+}
+def V6_vasrwv : HInst< // Encoded record: vasr on .w elements with the shift operand taken from a second vector.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vasr($Vu32.w,$Vv32.w)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b000; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwv_128B : HInst< // 128B-register-class twin of V6_vasrwv: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vasr($Vu32.w,$Vv32.w)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vasrwv does not assign it.
+}
+def V6_vasrwv_alt : HInst< // Pseudo with V6_vasrwv's operand list; asm uses the "vasrw(...)" spelling without .w suffixes.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vasrw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vasrwv_alt_128B : HInst< // 128B-register-class twin of V6_vasrwv_alt (same asm string, still a pseudo).
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vasrw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Repeats the isCodeGenOnly assignment above with the same value (redundant).
+}
+def V6_vassign : HInst< // Encoded record: plain vector-register assignment ("$Vd32 = $Vu32").
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = $Vu32",
+CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b111; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vassign_128B : HInst< // 128B-register-class twin of V6_vassign: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = $Vu32",
+CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vassign does not assign it.
+}
+def V6_vassignp : HInst< // Assignment between VecDblRegs operands; a pseudo with no Enc_* class or Inst bits.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32),
+"$Vdd32 = $Vuu32",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> { // Keeps the CVI_VA itinerary/type rather than PSEUDO/TypeMAPPING.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vdd32) is the new-value result - TODO confirm in HInst
+let isPseudo = 1; // NOTE(review): isCodeGenOnly is not set here, unlike the *_alt pseudos - confirm intended.
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vassignp_128B : HInst< // 128B-register-class twin of V6_vassignp (same asm string, still a pseudo without encoding).
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32),
+"$Vdd32 = $Vuu32",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> { // Keeps the CVI_VA itinerary/type rather than PSEUDO/TypeMAPPING.
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vassignp does not assign it.
+}
+def V6_vavgh : HInst< // Encoded record: vavg of two vectors on .h elements, without the :rnd modifier.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vavg($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b110; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgh_128B : HInst< // 128B-register-class twin of V6_vavgh: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vavg($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vavgh does not assign it.
+}
+def V6_vavgh_alt : HInst< // Pseudo with V6_vavgh's operand list; asm uses the "vavgh(...)" spelling without .h suffixes.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavgh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgh_alt_128B : HInst< // 128B-register-class twin of V6_vavgh_alt (same asm string, still a pseudo).
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavgh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // No Enc_* class is mixed in and no Inst bits are set here.
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Repeats the isCodeGenOnly assignment above with the same value (redundant).
+}
+def V6_vavghrnd : HInst< // Encoded record: vavg of two vectors on .h elements with the :rnd modifier.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vavg($Vu32.h,$Vv32.h):rnd",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> { // Gated on the HasV60T and UseHVX predicates.
+let Inst{7-5} = 0b101; // Fixed bits of the Inst encoding field (this and the next two lets).
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0; // presumably operand 0 ($Vd32) is the new-value result - TODO confirm in HInst
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavghrnd_128B : HInst< // 128B-register-class twin of V6_vavghrnd: same asm string, Enc class and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vavg($Vu32.h,$Vv32.h):rnd",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Set on this _128B form only; V6_vavghrnd does not assign it.
+}
+def V6_vavghrnd_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavgh($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavghrnd_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavgh($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vavg($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vavg($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavgub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavgub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgubrnd : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vavg($Vu32.ub,$Vv32.ub):rnd",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgubrnd_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vavg($Vu32.ub,$Vv32.ub):rnd",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgubrnd_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavgub($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgubrnd_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavgub($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavguh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vavg($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavguh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vavg($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavguh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavguh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavguh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavguh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavguhrnd : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vavg($Vu32.uh,$Vv32.uh):rnd",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavguhrnd_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vavg($Vu32.uh,$Vv32.uh):rnd",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavguhrnd_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavguh($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavguhrnd_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavguh($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgw : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vavg($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgw_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vavg($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgw_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavgw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavgw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgwrnd : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vavg($Vu32.w,$Vv32.w):rnd",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgwrnd_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vavg($Vu32.w,$Vv32.w):rnd",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vavgwrnd_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vavgw($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vavgwrnd_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vavgw($Vu32,$Vv32):rnd",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vccombine : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins PredRegs:$Ps4, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"if ($Ps4) $Vdd32 = vcombine($Vu32,$Vv32)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_16145290, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011010011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vccombine_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins PredRegs:$Ps4, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"if ($Ps4) $Vdd32 = vcombine($Vu32,$Vv32)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_16145290, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011010011;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vcl0h : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.uh = vcl0($Vu32.uh)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcl0h_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.uh = vcl0($Vu32.uh)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vcl0h_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vcl0h($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcl0h_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vcl0h($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vcl0w : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.uw = vcl0($Vu32.uw)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcl0w_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.uw = vcl0($Vu32.uw)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vcl0w_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vcl0w($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcl0w_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vcl0w($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vcmov : HInst<
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Ps4, VectorRegs:$Vu32),
+"if ($Ps4) $Vd32 = $Vu32",
+CVI_VA, TypeCVI_VA>, Enc_12023037, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001101000000000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcmov_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Ps4, VectorRegs128B:$Vu32),
+"if ($Ps4) $Vd32 = $Vu32",
+CVI_VA, TypeCVI_VA>, Enc_12023037, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001101000000000;
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vcombine : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vcombine($Vu32,$Vv32)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isRegSequence = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vcombine_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vcombine($Vu32,$Vv32)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isRegSequence = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vd0 : HInst<
+(outs VectorRegs:$Vd32),
+(ins),
+"$Vd32 = #0",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vd0_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins),
+"$Vd32 = #0",
+CVI_VA, TypeCVI_VA>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdeal : HInst<
+(outs VectorRegs:$Vy32, VectorRegs:$Vx32),
+(ins VectorRegs:$Vy32in, VectorRegs:$Vx32in, IntRegs:$Rt32),
+"vdeal($Vy32,$Vx32,$Rt32)",
+CVI_VP_VS_LONG_EARLY, TypeCVI_VP_VS>, Enc_11422009, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in";
+}
+def V6_vdeal_128B : HInst<
+(outs VectorRegs128B:$Vy32, VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vy32in, VectorRegs128B:$Vx32in, IntRegs:$Rt32),
+"vdeal($Vy32,$Vx32,$Rt32)",
+CVI_VP_VS_LONG_EARLY, TypeCVI_VP_VS>, Enc_11422009, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in";
+}
+def V6_vdealb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.b = vdeal($Vu32.b)",
+CVI_VP, TypeCVI_VP>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealb4w : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vdeale($Vu32.b,$Vv32.b)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealb4w_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vdeale($Vu32.b,$Vv32.b)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdealb4w_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vdealb4w($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealb4w_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vdealb4w($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdealb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.b = vdeal($Vu32.b)",
+CVI_VP, TypeCVI_VP>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdealb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vdealb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vdealb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdealh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.h = vdeal($Vu32.h)",
+CVI_VP, TypeCVI_VP>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.h = vdeal($Vu32.h)",
+CVI_VP, TypeCVI_VP>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdealh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vdealh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vdealh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdealvdd : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vdd32 = vdeal($Vu32,$Vv32,$Rt8)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_14767681, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdealvdd_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vdd32 = vdeal($Vu32,$Vv32,$Rt8)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_14767681, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdelta : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vdelta($Vu32,$Vv32)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdelta_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vdelta($Vu32,$Vv32)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpybus : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vdmpy($Vu32.ub,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpybus_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vdmpy($Vu32.ub,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpybus_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.h += vdmpy($Vu32.ub,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpybus_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.h += vdmpy($Vu32.ub,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpybus_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpybus_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpybus_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpybus_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpybus_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vdmpy($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpybus_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vdmpy($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpybus_dv_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vdmpy($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpybus_dv_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vdmpy($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpybus_dv_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vdmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpybus_dv_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vdmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vdmpybus_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vdmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpybus_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vdmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpyhb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vu32.h,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vdmpy($Vu32.h,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpyhb_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vu32.h,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhb_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vdmpy($Vu32.h,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhb_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyhb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhb_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyhb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vdmpyhb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyhb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vdmpyhb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyhb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vdmpyhb_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.w = vdmpy($Vuu32.h,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+// Encoded record on the 128B register classes, marked isCodeGenOnly; fixes Inst{31-21}, Inst{13-13}, Inst{7-5}.
+def V6_vdmpyhb_dv_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+    "$Vdd32.w = vdmpy($Vuu32.h,$Rt32.b)",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b100;
+}
+// Encoded accumulating record ("+="): $Vxx32 is tied to $Vxx32in via Constraints.
+def V6_vdmpyhb_dv_acc : HInst<(outs VecDblRegs:$Vxx32),
+    (ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vxx32.w += vdmpy($Vuu32.h,$Rt32.b)",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b100;
+}
+// Encoded accumulating record ("+=") on the 128B register classes, marked isCodeGenOnly; $Vxx32 is tied to $Vxx32in.
+def V6_vdmpyhb_dv_acc_128B : HInst<(outs VecDblRegs128B:$Vxx32),
+    (ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+    "$Vxx32.w += vdmpy($Vuu32.h,$Rt32.b)",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b100;
+}
+// Pseudo (isPseudo, TypeMAPPING) accumulating record; $Vxx32 is tied to $Vxx32in and no Inst bits are set.
+def V6_vdmpyhb_dv_acc_alt : HInst<(outs VecDblRegs:$Vxx32),
+    (ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vxx32 += vdmpyhb($Vuu32,$Rt32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let isAccumulator = 1;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// Pseudo (isPseudo, TypeMAPPING) accumulating record for the 128B register classes; $Vxx32 is tied to $Vxx32in.
+def V6_vdmpyhb_dv_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vdmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+// Pseudo (isPseudo, TypeMAPPING) record; it sets no Inst bits and is isCodeGenOnly.
+def V6_vdmpyhb_dv_alt : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vdd32 = vdmpyhb($Vuu32,$Rt32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// Pseudo (isPseudo, TypeMAPPING) record for the 128B register classes; it sets no Inst bits.
+def V6_vdmpyhb_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vdmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// Encoded record: fixes Inst{31-21}, Inst{13-13} and Inst{7-5}; requires HasV60T and UseHVX.
+def V6_vdmpyhisat : HInst<(outs VectorRegs:$Vd32),
+    (ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vd32.w = vdmpy($Vuu32.h,$Rt32.h):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_36641, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b011;
+}
+// Encoded record on the 128B register classes, marked isCodeGenOnly; fixes Inst{31-21}, Inst{13-13}, Inst{7-5}.
+def V6_vdmpyhisat_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+    "$Vd32.w = vdmpy($Vuu32.h,$Rt32.h):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_36641, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b011;
+}
+// Encoded accumulating record ("+="): $Vx32 is tied to $Vx32in via Constraints.
+def V6_vdmpyhisat_acc : HInst<(outs VectorRegs:$Vx32),
+    (ins VectorRegs:$Vx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vx32.w += vdmpy($Vuu32.h,$Rt32.h):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_5890213, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b010;
+}
+// Encoded accumulating record ("+=") on the 128B register classes, marked isCodeGenOnly; $Vx32 is tied to $Vx32in.
+def V6_vdmpyhisat_acc_128B : HInst<(outs VectorRegs128B:$Vx32),
+    (ins VectorRegs128B:$Vx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+    "$Vx32.w += vdmpy($Vuu32.h,$Rt32.h):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_5890213, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b010;
+}
+// Pseudo (isPseudo, TypeMAPPING) accumulating record; $Vx32 is tied to $Vx32in and no Inst bits are set.
+def V6_vdmpyhisat_acc_alt : HInst<(outs VectorRegs:$Vx32),
+    (ins VectorRegs:$Vx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vx32 += vdmpyh($Vuu32,$Rt32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// Pseudo (isPseudo, TypeMAPPING) accumulating record for the 128B register classes; $Vx32 is tied to $Vx32in.
+def V6_vdmpyhisat_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyh($Vuu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+// Pseudo (isPseudo, TypeMAPPING) record; it sets no Inst bits and is isCodeGenOnly.
+def V6_vdmpyhisat_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vd32 = vdmpyh($Vuu32,$Rt32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// Pseudo (isPseudo, TypeMAPPING) record for the 128B register classes; it sets no Inst bits.
+def V6_vdmpyhisat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyh($Vuu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// Encoded record: fixes Inst{31-21}, Inst{13-13} and Inst{7-5}; requires HasV60T and UseHVX.
+def V6_vdmpyhsat : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, IntRegs:$Rt32),
+    "$Vd32.w = vdmpy($Vu32.h,$Rt32.h):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b010;
+}
+// Encoded record on the 128B register classes, marked isCodeGenOnly; fixes Inst{31-21}, Inst{13-13}, Inst{7-5}.
+def V6_vdmpyhsat_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+    "$Vd32.w = vdmpy($Vu32.h,$Rt32.h):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b010;
+}
+// Encoded accumulating record ("+="): $Vx32 is tied to $Vx32in via Constraints.
+def V6_vdmpyhsat_acc : HInst<(outs VectorRegs:$Vx32),
+    (ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+    "$Vx32.w += vdmpy($Vu32.h,$Rt32.h):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b011;
+}
+// Encoded accumulating record ("+=") on the 128B register classes, marked isCodeGenOnly; $Vx32 is tied to $Vx32in.
+def V6_vdmpyhsat_acc_128B : HInst<(outs VectorRegs128B:$Vx32),
+    (ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+    "$Vx32.w += vdmpy($Vu32.h,$Rt32.h):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b011;
+}
+// Pseudo (isPseudo, TypeMAPPING) accumulating record; $Vx32 is tied to $Vx32in and no Inst bits are set.
+def V6_vdmpyhsat_acc_alt : HInst<(outs VectorRegs:$Vx32),
+    (ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+    "$Vx32 += vdmpyh($Vu32,$Rt32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// Pseudo (isPseudo, TypeMAPPING) accumulating record for the 128B register classes; $Vx32 is tied to $Vx32in.
+def V6_vdmpyhsat_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyh($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+// Pseudo (isPseudo, TypeMAPPING) record; it sets no Inst bits and is isCodeGenOnly.
+def V6_vdmpyhsat_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, IntRegs:$Rt32),
+    "$Vd32 = vdmpyh($Vu32,$Rt32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// Pseudo (isPseudo, TypeMAPPING) record for the 128B register classes; it sets no Inst bits.
+def V6_vdmpyhsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyh($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// Encoded record: fixes Inst{31-21}, Inst{13-13} and Inst{7-5}; requires HasV60T and UseHVX.
+def V6_vdmpyhsuisat : HInst<(outs VectorRegs:$Vd32),
+    (ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vd32.w = vdmpy($Vuu32.h,$Rt32.uh,#1):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_36641, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b001;
+}
+// Encoded record on the 128B register classes, marked isCodeGenOnly; fixes Inst{31-21}, Inst{13-13}, Inst{7-5}.
+def V6_vdmpyhsuisat_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+    "$Vd32.w = vdmpy($Vuu32.h,$Rt32.uh,#1):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_36641, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b001;
+}
+// Encoded accumulating record ("+="): $Vx32 is tied to $Vx32in via Constraints.
+def V6_vdmpyhsuisat_acc : HInst<(outs VectorRegs:$Vx32),
+    (ins VectorRegs:$Vx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vx32.w += vdmpy($Vuu32.h,$Rt32.uh,#1):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_5890213, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b001;
+}
+// Encoded accumulating record ("+=") on the 128B register classes, marked isCodeGenOnly; $Vx32 is tied to $Vx32in.
+def V6_vdmpyhsuisat_acc_128B : HInst<(outs VectorRegs128B:$Vx32),
+    (ins VectorRegs128B:$Vx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+    "$Vx32.w += vdmpy($Vuu32.h,$Rt32.uh,#1):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_5890213, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b001;
+}
+// Pseudo (isPseudo, TypeMAPPING) accumulating record; $Vx32 is tied to $Vx32in and no Inst bits are set.
+def V6_vdmpyhsuisat_acc_alt : HInst<(outs VectorRegs:$Vx32),
+    (ins VectorRegs:$Vx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vx32 += vdmpyhsu($Vuu32,$Rt32,#1):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// Pseudo (isPseudo, TypeMAPPING) accumulating record for the 128B register classes; $Vx32 is tied to $Vx32in.
+def V6_vdmpyhsuisat_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyhsu($Vuu32,$Rt32,#1):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+// Pseudo (isPseudo, TypeMAPPING) record; it sets no Inst bits and is isCodeGenOnly.
+def V6_vdmpyhsuisat_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vd32 = vdmpyhsu($Vuu32,$Rt32,#1):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// Pseudo (isPseudo, TypeMAPPING) record for the 128B register classes; it sets no Inst bits.
+def V6_vdmpyhsuisat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyhsu($Vuu32,$Rt32,#1):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// Encoded record: fixes Inst{31-21}, Inst{13-13} and Inst{7-5}; requires HasV60T and UseHVX.
+def V6_vdmpyhsusat : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, IntRegs:$Rt32),
+    "$Vd32.w = vdmpy($Vu32.h,$Rt32.uh):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b000;
+}
+// Encoded record on the 128B register classes, marked isCodeGenOnly; fixes Inst{31-21}, Inst{13-13}, Inst{7-5}.
+def V6_vdmpyhsusat_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+    "$Vd32.w = vdmpy($Vu32.h,$Rt32.uh):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b000;
+}
+// Encoded accumulating record ("+="): $Vx32 is tied to $Vx32in via Constraints.
+def V6_vdmpyhsusat_acc : HInst<(outs VectorRegs:$Vx32),
+    (ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+    "$Vx32.w += vdmpy($Vu32.h,$Rt32.uh):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b000;
+}
+// Encoded accumulating record ("+=") on the 128B register classes, marked isCodeGenOnly; $Vx32 is tied to $Vx32in.
+def V6_vdmpyhsusat_acc_128B : HInst<(outs VectorRegs128B:$Vx32),
+    (ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+    "$Vx32.w += vdmpy($Vu32.h,$Rt32.uh):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b000;
+}
+// Pseudo (isPseudo, TypeMAPPING) accumulating record; $Vx32 is tied to $Vx32in and no Inst bits are set.
+def V6_vdmpyhsusat_acc_alt : HInst<(outs VectorRegs:$Vx32),
+    (ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+    "$Vx32 += vdmpyhsu($Vu32,$Rt32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// Pseudo (isPseudo, TypeMAPPING) accumulating record for the 128B register classes; $Vx32 is tied to $Vx32in.
+def V6_vdmpyhsusat_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vdmpyhsu($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+// Pseudo (isPseudo, TypeMAPPING) record; it sets no Inst bits and is isCodeGenOnly.
+def V6_vdmpyhsusat_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, IntRegs:$Rt32),
+    "$Vd32 = vdmpyhsu($Vu32,$Rt32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// Pseudo (isPseudo, TypeMAPPING) record for the 128B register classes; it sets no Inst bits.
+def V6_vdmpyhsusat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vdmpyhsu($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// Encoded record: fixes Inst{31-21}, Inst{13-13} and Inst{7-5}; requires HasV60T and UseHVX.
+def V6_vdmpyhvsat : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32.w = vdmpy($Vu32.h,$Vv32.h):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100000;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b011;
+}
+// Encoded record on the 128B register classes, marked isCodeGenOnly; fixes Inst{31-21}, Inst{13-13}, Inst{7-5}.
+def V6_vdmpyhvsat_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32.w = vdmpy($Vu32.h,$Vv32.h):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100000;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b011;
+}
+// Encoded accumulating record ("+="): $Vx32 is tied to $Vx32in via Constraints.
+def V6_vdmpyhvsat_acc : HInst<(outs VectorRegs:$Vx32),
+    (ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vx32.w += vdmpy($Vu32.h,$Vv32.h):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100000;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b011;
+}
+// Encoded accumulating record ("+=") on the 128B register classes, marked isCodeGenOnly; $Vx32 is tied to $Vx32in.
+def V6_vdmpyhvsat_acc_128B : HInst<(outs VectorRegs128B:$Vx32),
+    (ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vx32.w += vdmpy($Vu32.h,$Vv32.h):sat",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100000;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b011;
+}
+// Pseudo (isPseudo, TypeMAPPING) accumulating record; $Vx32 is tied to $Vx32in and no Inst bits are set.
+def V6_vdmpyhvsat_acc_alt : HInst<(outs VectorRegs:$Vx32),
+    (ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vx32 += vdmpyh($Vu32,$Vv32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vx32 = $Vx32in";
+  let isAccumulator = 1;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// Pseudo (isPseudo, TypeMAPPING) accumulating record for the 128B register classes; $Vx32 is tied to $Vx32in.
+def V6_vdmpyhvsat_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vdmpyh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+// Pseudo (isPseudo, TypeMAPPING) record; it sets no Inst bits and is isCodeGenOnly.
+def V6_vdmpyhvsat_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32 = vdmpyh($Vu32,$Vv32):sat",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// Pseudo (isPseudo, TypeMAPPING) record for the 128B register classes; it sets no Inst bits.
+def V6_vdmpyhvsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vdmpyh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// Encoded record: fixes Inst{31-21}, Inst{13-13} and Inst{7-5}; requires HasV60T and UseHVX.
+def V6_vdsaduh : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vdd32.uw = vdsad($Vuu32.uh,$Rt32.uh)",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001000;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b101;
+}
+// Encoded record on the 128B register classes, marked isCodeGenOnly; fixes Inst{31-21}, Inst{13-13}, Inst{7-5}.
+def V6_vdsaduh_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+    "$Vdd32.uw = vdsad($Vuu32.uh,$Rt32.uh)",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001000;
+  let Inst{13-13} = 0b0;
+  let Inst{7-5} = 0b101;
+}
+// Encoded accumulating record ("+="): $Vxx32 is tied to $Vxx32in via Constraints.
+def V6_vdsaduh_acc : HInst<(outs VecDblRegs:$Vxx32),
+    (ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vxx32.uw += vdsad($Vuu32.uh,$Rt32.uh)",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001011;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b000;
+}
+// Encoded accumulating record ("+=") on the 128B register classes, marked isCodeGenOnly; $Vxx32 is tied to $Vxx32in.
+def V6_vdsaduh_acc_128B : HInst<(outs VecDblRegs128B:$Vxx32),
+    (ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+    "$Vxx32.uw += vdsad($Vuu32.uh,$Rt32.uh)",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011001011;
+  let Inst{13-13} = 0b1;
+  let Inst{7-5} = 0b000;
+}
+// Pseudo (isPseudo, TypeMAPPING) accumulating record; $Vxx32 is tied to $Vxx32in and no Inst bits are set.
+def V6_vdsaduh_acc_alt : HInst<(outs VecDblRegs:$Vxx32),
+    (ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vxx32 += vdsaduh($Vuu32,$Rt32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let isAccumulator = 1;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// Pseudo (isPseudo, TypeMAPPING) accumulating record for the 128B register classes; $Vxx32 is tied to $Vxx32in.
+def V6_vdsaduh_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vdsaduh($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+// Pseudo (isPseudo, TypeMAPPING) record; it sets no Inst bits and is isCodeGenOnly.
+def V6_vdsaduh_alt : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vdd32 = vdsaduh($Vuu32,$Rt32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+}
+// Pseudo (isPseudo, TypeMAPPING) record for the 128B register classes; it sets no Inst bits.
+def V6_vdsaduh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vdsaduh($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// Encoded vector-compare record writing predicate $Qd4; fixes Inst{31-21}, Inst{13-13} and Inst{7-2}.
+def V6_veqb : HInst<(outs VecPredRegs:$Qd4),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Qd4 = vcmp.eq($Vu32.b,$Vv32.b)",
+    CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011111100;
+  let Inst{13-13} = 0b0;
+  let Inst{7-2} = 0b000000;
+}
+// Encoded compare record on the 128B register classes, marked isCodeGenOnly; writes predicate $Qd4.
+def V6_veqb_128B : HInst<(outs VecPredRegs128B:$Qd4),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Qd4 = vcmp.eq($Vu32.b,$Vv32.b)",
+    CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011111100;
+  let Inst{13-13} = 0b0;
+  let Inst{7-2} = 0b000000;
+}
+// Encoded "&=" compare record: $Qx4 is tied to $Qx4in via Constraints.
+def V6_veqb_and : HInst<(outs VecPredRegs:$Qx4),
+    (ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Qx4 &= vcmp.eq($Vu32.b,$Vv32.b)",
+    CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100100;
+  let Inst{13-13} = 0b1;
+  let Inst{7-2} = 0b000000;
+}
+// Encoded "&=" compare record on the 128B register classes, marked isCodeGenOnly; $Qx4 is tied to $Qx4in.
+def V6_veqb_and_128B : HInst<(outs VecPredRegs128B:$Qx4),
+    (ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Qx4 &= vcmp.eq($Vu32.b,$Vv32.b)",
+    CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100100;
+  let Inst{13-13} = 0b1;
+  let Inst{7-2} = 0b000000;
+}
+// Encoded "|=" compare record (isAccumulator): $Qx4 is tied to $Qx4in via Constraints.
+def V6_veqb_or : HInst<(outs VecPredRegs:$Qx4),
+    (ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Qx4 |= vcmp.eq($Vu32.b,$Vv32.b)",
+    CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100100;
+  let Inst{13-13} = 0b1;
+  let Inst{7-2} = 0b010000;
+}
+// Encoded "|=" compare record (isAccumulator) on the 128B register classes, marked isCodeGenOnly; $Qx4 is tied to $Qx4in.
+def V6_veqb_or_128B : HInst<(outs VecPredRegs128B:$Qx4),
+    (ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Qx4 |= vcmp.eq($Vu32.b,$Vv32.b)",
+    CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100100;
+  let Inst{13-13} = 0b1;
+  let Inst{7-2} = 0b010000;
+}
+// Encoded "^=" compare record: $Qx4 is tied to $Qx4in via Constraints.
+def V6_veqb_xor : HInst<(outs VecPredRegs:$Qx4),
+    (ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Qx4 ^= vcmp.eq($Vu32.b,$Vv32.b)",
+    CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100100;
+  let Inst{13-13} = 0b1;
+  let Inst{7-2} = 0b100000;
+}
+// Encoded "^=" compare record on the 128B register classes, marked isCodeGenOnly; $Qx4 is tied to $Qx4in.
+def V6_veqb_xor_128B : HInst<(outs VecPredRegs128B:$Qx4),
+    (ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Qx4 ^= vcmp.eq($Vu32.b,$Vv32.b)",
+    CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100100;
+  let Inst{13-13} = 0b1;
+  let Inst{7-2} = 0b100000;
+}
+// Encoded vector-compare record writing predicate $Qd4; fixes Inst{31-21}, Inst{13-13} and Inst{7-2}.
+def V6_veqh : HInst<(outs VecPredRegs:$Qd4),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Qd4 = vcmp.eq($Vu32.h,$Vv32.h)",
+    CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011111100;
+  let Inst{13-13} = 0b0;
+  let Inst{7-2} = 0b000001;
+}
+// Encoded compare record on the 128B register classes, marked isCodeGenOnly; writes predicate $Qd4.
+def V6_veqh_128B : HInst<(outs VecPredRegs128B:$Qd4),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Qd4 = vcmp.eq($Vu32.h,$Vv32.h)",
+    CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011111100;
+  let Inst{13-13} = 0b0;
+  let Inst{7-2} = 0b000001;
+}
+// Encoded "&=" compare record: $Qx4 is tied to $Qx4in via Constraints.
+def V6_veqh_and : HInst<(outs VecPredRegs:$Qx4),
+    (ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Qx4 &= vcmp.eq($Vu32.h,$Vv32.h)",
+    CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100100;
+  let Inst{13-13} = 0b1;
+  let Inst{7-2} = 0b000001;
+}
+// Encoded "&=" compare record on the 128B register classes, marked isCodeGenOnly; $Qx4 is tied to $Qx4in.
+def V6_veqh_and_128B : HInst<(outs VecPredRegs128B:$Qx4),
+    (ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Qx4 &= vcmp.eq($Vu32.h,$Vv32.h)",
+    CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100100;
+  let Inst{13-13} = 0b1;
+  let Inst{7-2} = 0b000001;
+}
+// Encoded "|=" compare record (isAccumulator): $Qx4 is tied to $Qx4in via Constraints.
+def V6_veqh_or : HInst<(outs VecPredRegs:$Qx4),
+    (ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Qx4 |= vcmp.eq($Vu32.h,$Vv32.h)",
+    CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100100;
+  let Inst{13-13} = 0b1;
+  let Inst{7-2} = 0b010001;
+}
+// Encoded "|=" compare record (isAccumulator) on the 128B register classes, marked isCodeGenOnly; $Qx4 is tied to $Qx4in.
+def V6_veqh_or_128B : HInst<(outs VecPredRegs128B:$Qx4),
+    (ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Qx4 |= vcmp.eq($Vu32.h,$Vv32.h)",
+    CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100100;
+  let Inst{13-13} = 0b1;
+  let Inst{7-2} = 0b010001;
+}
+// Encoded "^=" compare record: $Qx4 is tied to $Qx4in via Constraints.
+def V6_veqh_xor : HInst<(outs VecPredRegs:$Qx4),
+    (ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Qx4 ^= vcmp.eq($Vu32.h,$Vv32.h)",
+    CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100100;
+  let Inst{13-13} = 0b1;
+  let Inst{7-2} = 0b100001;
+}
+// Encoded "^=" compare record on the 128B register classes, marked isCodeGenOnly; $Qx4 is tied to $Qx4in.
+def V6_veqh_xor_128B : HInst<(outs VecPredRegs128B:$Qx4),
+    (ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Qx4 ^= vcmp.eq($Vu32.h,$Vv32.h)",
+    CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100100;
+  let Inst{13-13} = 0b1;
+  let Inst{7-2} = 0b100001;
+}
+// Encoded vector-compare record writing predicate $Qd4; fixes Inst{31-21}, Inst{13-13} and Inst{7-2}.
+def V6_veqw : HInst<(outs VecPredRegs:$Qd4),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Qd4 = vcmp.eq($Vu32.w,$Vv32.w)",
+    CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011111100;
+  let Inst{13-13} = 0b0;
+  let Inst{7-2} = 0b000010;
+}
+// Encoded compare record on the 128B register classes, marked isCodeGenOnly; writes predicate $Qd4.
+def V6_veqw_128B : HInst<(outs VecPredRegs128B:$Qd4),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Qd4 = vcmp.eq($Vu32.w,$Vv32.w)",
+    CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011111100;
+  let Inst{13-13} = 0b0;
+  let Inst{7-2} = 0b000010;
+}
+// Encoded "&=" compare record: $Qx4 is tied to $Qx4in via Constraints.
+def V6_veqw_and : HInst<(outs VecPredRegs:$Qx4),
+    (ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Qx4 &= vcmp.eq($Vu32.w,$Vv32.w)",
+    CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100100;
+  let Inst{13-13} = 0b1;
+  let Inst{7-2} = 0b000010;
+}
+// Encoded "&=" compare record on the 128B register classes, marked isCodeGenOnly; $Qx4 is tied to $Qx4in.
+def V6_veqw_and_128B : HInst<(outs VecPredRegs128B:$Qx4),
+    (ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Qx4 &= vcmp.eq($Vu32.w,$Vv32.w)",
+    CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100100;
+  let Inst{13-13} = 0b1;
+  let Inst{7-2} = 0b000010;
+}
+// Encoded "|=" compare record (isAccumulator): $Qx4 is tied to $Qx4in via Constraints.
+def V6_veqw_or : HInst<(outs VecPredRegs:$Qx4),
+    (ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Qx4 |= vcmp.eq($Vu32.w,$Vv32.w)",
+    CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100100;
+  let Inst{13-13} = 0b1;
+  let Inst{7-2} = 0b010010;
+}
+// Encoded "|=" compare record (isAccumulator) on the 128B register classes, marked isCodeGenOnly; $Qx4 is tied to $Qx4in.
+def V6_veqw_or_128B : HInst<(outs VecPredRegs128B:$Qx4),
+    (ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Qx4 |= vcmp.eq($Vu32.w,$Vv32.w)",
+    CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100100;
+  let Inst{13-13} = 0b1;
+  let Inst{7-2} = 0b010010;
+}
+// Encoded "^=" compare record: $Qx4 is tied to $Qx4in via Constraints.
+def V6_veqw_xor : HInst<(outs VecPredRegs:$Qx4),
+    (ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Qx4 ^= vcmp.eq($Vu32.w,$Vv32.w)",
+    CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+  let Constraints = "$Qx4 = $Qx4in";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let DecoderNamespace = "EXT_mmvec";
+  let Inst{31-21} = 0b00011100100;
+  let Inst{13-13} = 0b1;
+  let Inst{7-2} = 0b100010;
+}
+def V6_veqw_xor_128B : HInst< // Twin of V6_veqw_xor (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.eq($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_veqw_xor does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtb : HInst< // "=" form of vcmp.gt(.b): result goes to $Qd4, no tied input operand.
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b0; // the "&=", "|=" and "^=" records of this compare set this bit to 1
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vgtb_128B : HInst< // Twin of V6_vgtb (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtb does not set; TODO confirm it hides this twin from the asm matcher
+}
+def V6_vgtb_and : HInst< // "&=" form of vcmp.gt(.b): $Qx4 is both source and destination.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtb_and_128B : HInst< // Twin of V6_vgtb_and (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtb_and does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtb_or : HInst< // "|=" form of vcmp.gt(.b): $Qx4 is both source and destination.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1; // the "&=" and "^=" vcmp.gt(.b) records do not set this
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtb_or_128B : HInst< // Twin of V6_vgtb_or (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1; // the "&=" and "^=" vcmp.gt(.b) records do not set this
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtb_or does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtb_xor : HInst< // "^=" form of vcmp.gt(.b): $Qx4 is both source and destination.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtb_xor_128B : HInst< // Twin of V6_vgtb_xor (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.b,$Vv32.b)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtb_xor does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgth : HInst< // "=" form of vcmp.gt(.h): result goes to $Qd4, no tied input operand.
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000101;
+let Inst{13-13} = 0b0; // the "&=", "|=" and "^=" records of this compare set this bit to 1
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vgth_128B : HInst< // Twin of V6_vgth (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgth does not set; TODO confirm it hides this twin from the asm matcher
+}
+def V6_vgth_and : HInst< // "&=" form of vcmp.gt(.h): $Qx4 is both source and destination.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgth_and_128B : HInst< // Twin of V6_vgth_and (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgth_and does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgth_or : HInst< // "|=" form of vcmp.gt(.h): $Qx4 is both source and destination.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1; // the "&=" and "^=" vcmp.gt(.h) records do not set this
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgth_or_128B : HInst< // Twin of V6_vgth_or (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1; // the "&=" and "^=" vcmp.gt(.h) records do not set this
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgth_or does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgth_xor : HInst< // "^=" form of vcmp.gt(.h): $Qx4 is both source and destination.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgth_xor_128B : HInst< // Twin of V6_vgth_xor (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgth_xor does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtub : HInst< // "=" form of vcmp.gt(.ub): result goes to $Qd4, no tied input operand.
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001000;
+let Inst{13-13} = 0b0; // the "&=", "|=" and "^=" records of this compare set this bit to 1
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vgtub_128B : HInst< // Twin of V6_vgtub (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtub does not set; TODO confirm it hides this twin from the asm matcher
+}
+def V6_vgtub_and : HInst< // "&=" form of vcmp.gt(.ub): $Qx4 is both source and destination.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtub_and_128B : HInst< // Twin of V6_vgtub_and (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtub_and does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtub_or : HInst< // "|=" form of vcmp.gt(.ub): $Qx4 is both source and destination.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b011000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1; // the "&=" and "^=" vcmp.gt(.ub) records do not set this
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtub_or_128B : HInst< // Twin of V6_vgtub_or (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b011000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1; // the "&=" and "^=" vcmp.gt(.ub) records do not set this
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtub_or does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtub_xor : HInst< // "^=" form of vcmp.gt(.ub): $Qx4 is both source and destination.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b101000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtub_xor_128B : HInst< // Twin of V6_vgtub_xor (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b101000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtub_xor does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtuh : HInst< // "=" form of vcmp.gt(.uh): result goes to $Qd4, no tied input operand.
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001001;
+let Inst{13-13} = 0b0; // the "&=", "|=" and "^=" records of this compare set this bit to 1
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vgtuh_128B : HInst< // Twin of V6_vgtuh (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtuh does not set; TODO confirm it hides this twin from the asm matcher
+}
+def V6_vgtuh_and : HInst< // "&=" form of vcmp.gt(.uh): $Qx4 is both source and destination.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtuh_and_128B : HInst< // Twin of V6_vgtuh_and (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtuh_and does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtuh_or : HInst< // "|=" form of vcmp.gt(.uh): $Qx4 is both source and destination.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b011001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1; // the "&=" and "^=" vcmp.gt(.uh) records do not set this
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtuh_or_128B : HInst< // Twin of V6_vgtuh_or (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b011001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1; // the "&=" and "^=" vcmp.gt(.uh) records do not set this
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtuh_or does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtuh_xor : HInst< // "^=" form of vcmp.gt(.uh): $Qx4 is both source and destination.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b101001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtuh_xor_128B : HInst< // Twin of V6_vgtuh_xor (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.uh,$Vv32.uh)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b101001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtuh_xor does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtuw : HInst< // "=" form of vcmp.gt(.uw): result goes to $Qd4, no tied input operand.
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.uw,$Vv32.uw)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001010;
+let Inst{13-13} = 0b0; // the "&=", "|=" and "^=" records of this compare set this bit to 1
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vgtuw_128B : HInst< // Twin of V6_vgtuw (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.uw,$Vv32.uw)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtuw does not set; TODO confirm it hides this twin from the asm matcher
+}
+def V6_vgtuw_and : HInst< // "&=" form of vcmp.gt(.uw): $Qx4 is both source and destination.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.uw,$Vv32.uw)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtuw_and_128B : HInst< // Twin of V6_vgtuw_and (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.uw,$Vv32.uw)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b001010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtuw_and does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtuw_or : HInst< // "|=" form of vcmp.gt(.uw): $Qx4 is both source and destination.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.uw,$Vv32.uw)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b011010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1; // the "&=" and "^=" vcmp.gt(.uw) records do not set this
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtuw_or_128B : HInst< // Twin of V6_vgtuw_or (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.uw,$Vv32.uw)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b011010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1; // the "&=" and "^=" vcmp.gt(.uw) records do not set this
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtuw_or does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtuw_xor : HInst< // "^=" form of vcmp.gt(.uw): $Qx4 is both source and destination.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.uw,$Vv32.uw)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b101010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtuw_xor_128B : HInst< // Twin of V6_vgtuw_xor (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.uw,$Vv32.uw)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b101010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtuw_xor does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtw : HInst< // "=" form of vcmp.gt(.w): result goes to $Qd4, no tied input operand.
+(outs VecPredRegs:$Qd4),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000110;
+let Inst{13-13} = 0b0; // the "&=", "|=" and "^=" records of this compare set this bit to 1
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vgtw_128B : HInst< // Twin of V6_vgtw (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qd4),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qd4 = vcmp.gt($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_13983714, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtw does not set; TODO confirm it hides this twin from the asm matcher
+}
+def V6_vgtw_and : HInst< // "&=" form of vcmp.gt(.w): $Qx4 is both source and destination.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtw_and_128B : HInst< // Twin of V6_vgtw_and (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 &= vcmp.gt($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b000110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtw_and does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtw_or : HInst< // "|=" form of vcmp.gt(.w): $Qx4 is both source and destination.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1; // the "&=" and "^=" vcmp.gt(.w) records do not set this
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtw_or_128B : HInst< // Twin of V6_vgtw_or (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 |= vcmp.gt($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b010110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1; // the "&=" and "^=" vcmp.gt(.w) records do not set this
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtw_or does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtw_xor : HInst< // "^=" form of vcmp.gt(.w): $Qx4 is both source and destination.
+(outs VecPredRegs:$Qx4),
+(ins VecPredRegs:$Qx4in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vgtw_xor_128B : HInst< // Twin of V6_vgtw_xor (same asm string and Inst bits) on the *128B register classes.
+(outs VecPredRegs128B:$Qx4),
+(ins VecPredRegs128B:$Qx4in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Qx4 ^= vcmp.gt($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_7470998, Requires<[HasV60T,UseHVX]> {
+let Inst{7-2} = 0b100110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vgtw_xor does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Qx4 = $Qx4in"; // ties the output predicate to the $Qx4in input
+}
+def V6_vhist : HInst< // Operand-less "vhist": both the outs and ins lists are empty.
+(outs),
+(ins),
+"vhist",
+CVI_HIST, TypeCVI_HIST>, Enc_0, Requires<[HasV60T,UseHVX]> {
+let Inst{13-0} = 0b10000010000000;
+let Inst{31-16} = 0b0001111000000000; // together with Inst{13-0}, fixes every bit except 15-14 in this record
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vhist_128B : HInst< // Same operand lists, asm string and Inst bits as V6_vhist; differs only by the flag below.
+(outs),
+(ins),
+"vhist",
+CVI_HIST, TypeCVI_HIST>, Enc_0, Requires<[HasV60T,UseHVX]> {
+let Inst{13-0} = 0b10000010000000;
+let Inst{31-16} = 0b0001111000000000; // together with Inst{13-0}, fixes every bit except 15-14 in this record
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vhist does not set; TODO confirm it hides this twin from the asm matcher
+}
+def V6_vhistq : HInst< // "vhist($Qv4)": takes one predicate input and declares no outputs.
+(outs),
+(ins VecPredRegs:$Qv4),
+"vhist($Qv4)",
+CVI_HIST, TypeCVI_HIST>, Enc_4109168, Requires<[HasV60T,UseHVX]> {
+let Inst{13-0} = 0b10000010000000;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110; // bits 23-22 and 15-14 are not assigned in this record
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vhistq_128B : HInst< // Twin of V6_vhistq (same asm string and Inst bits) taking $Qv4 from VecPredRegs128B.
+(outs),
+(ins VecPredRegs128B:$Qv4),
+"vhist($Qv4)",
+CVI_HIST, TypeCVI_HIST>, Enc_4109168, Requires<[HasV60T,UseHVX]> {
+let Inst{13-0} = 0b10000010000000;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110; // bits 23-22 and 15-14 are not assigned in this record
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vhistq does not set; TODO confirm it hides this twin from the asm matcher
+}
+def V6_vinsertwr : HInst< // "$Vx32.w = vinsert($Rt32)": $Vx32 is both source and destination (see Constraints).
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, IntRegs:$Rt32),
+"$Vx32.w = vinsert($Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_313333, Requires<[HasV60T,UseHVX]> {
+let Inst{13-5} = 0b100000001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties the output vector to the $Vx32in input
+}
+def V6_vinsertwr_128B : HInst< // Twin of V6_vinsertwr (same asm string and Inst bits) with VectorRegs128B for $Vx32.
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, IntRegs:$Rt32),
+"$Vx32.w = vinsert($Rt32)",
+CVI_VX_LATE, TypeCVI_VX>, Enc_313333, Requires<[HasV60T,UseHVX]> {
+let Inst{13-5} = 0b100000001;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vinsertwr does not set; TODO confirm it hides this twin from the asm matcher
+let Constraints = "$Vx32 = $Vx32in"; // ties the output vector to the $Vx32in input
+}
+def V6_vlalignb : HInst< // Register form of vlalign: the third source $Rt8 is drawn from IntRegsLow8.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vlalign($Vu32,$Vv32,$Rt8)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlalignb_128B : HInst< // Twin of V6_vlalignb (same asm string and Inst bits) on VectorRegs128B; $Rt8 stays IntRegsLow8.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32 = vlalign($Vu32,$Vv32,$Rt8)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vlalignb does not set; TODO confirm it hides this twin from the asm matcher
+}
+def V6_vlalignbi : HInst< // Immediate form of vlalign: the third source is the u3_0Imm operand $Ii.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, u3_0Imm:$Ii),
+"$Vd32 = vlalign($Vu32,$Vv32,#$Ii)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_7171569, Requires<[HasV60T,UseHVX]> {
+let Inst{13-13} = 0b1; // the register form V6_vlalignb sets this bit to 0
+let Inst{31-21} = 0b00011110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlalignbi_128B : HInst< // Twin of V6_vlalignbi (same asm string and Inst bits) on the VectorRegs128B class.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, u3_0Imm:$Ii),
+"$Vd32 = vlalign($Vu32,$Vv32,#$Ii)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_7171569, Requires<[HasV60T,UseHVX]> {
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vlalignbi does not set; TODO confirm it hides this twin from the asm matcher
+}
+def V6_vlsrb : HInst< // vlsr on .ub operands by IntRegs $Rt32; requires HasV62T, unlike the .uh/.uw records (HasV60T).
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.ub = vlsr($Vu32.ub,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011; // V6_vlsrh uses 0b010 and V6_vlsrw 0b001 with the same Inst{31-21}
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrb_128B : HInst< // Twin of V6_vlsrb (same asm string, Inst bits and HasV62T requirement) on VectorRegs128B.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.ub = vlsr($Vu32.ub,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vlsrb does not set; TODO confirm it hides this twin from the asm matcher
+}
+def V6_vlsrh : HInst< // vlsr on .uh operands by the IntRegs operand $Rt32.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.uh = vlsr($Vu32.uh,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010; // V6_vlsrb uses 0b011 and V6_vlsrw 0b001 with the same Inst{31-21}
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrh_128B : HInst< // Twin of V6_vlsrh (same asm string and Inst bits) on the VectorRegs128B class.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.uh = vlsr($Vu32.uh,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vlsrh does not set; TODO confirm it hides this twin from the asm matcher
+}
+def V6_vlsrh_alt : HInst< // Alternate spelling "vlsrh($Vu32,$Rt32)": PSEUDO/TypeMAPPING record with no Enc_ class and no Inst bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vlsrh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // presumably resolved to V6_vlsrh elsewhere; TODO confirm against the mapping definitions
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrh_alt_128B : HInst< // Twin of V6_vlsrh_alt on VectorRegs128B: PSEUDO/TypeMAPPING record with no Enc_ class and no Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vlsrh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // presumably resolved to V6_vlsrh_128B elsewhere; TODO confirm against the mapping definitions
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up with the same value (redundant)
+}
+def V6_vlsrhv : HInst< // Vector/vector vlsr on .h operands: the second source is $Vv32 rather than an IntRegs operand.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vlsr($Vu32.h,$Vv32.h)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010; // V6_vlsrwv uses 0b001 with the same Inst{31-21}
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrhv_128B : HInst< // Twin of V6_vlsrhv (same asm string and Inst bits) on the VectorRegs128B class.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vlsr($Vu32.h,$Vv32.h)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vlsrhv does not set; TODO confirm it hides this twin from the asm matcher
+}
+def V6_vlsrhv_alt : HInst< // Alternate spelling "vlsrh($Vu32,$Vv32)": PSEUDO/TypeMAPPING record with no Enc_ class and no Inst bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vlsrh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // presumably resolved to V6_vlsrhv elsewhere; TODO confirm against the mapping definitions
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrhv_alt_128B : HInst< // Twin of V6_vlsrhv_alt on VectorRegs128B: PSEUDO/TypeMAPPING record with no Enc_ class and no Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vlsrh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // presumably resolved to V6_vlsrhv_128B elsewhere; TODO confirm against the mapping definitions
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up with the same value (redundant)
+}
+def V6_vlsrw : HInst< // vlsr on .uw operands by the IntRegs operand $Rt32.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.uw = vlsr($Vu32.uw,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001; // V6_vlsrh uses 0b010 and V6_vlsrb 0b011 with the same Inst{31-21}
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrw_128B : HInst< // Twin of V6_vlsrw (same asm string and Inst bits) on the VectorRegs128B class.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.uw = vlsr($Vu32.uw,$Rt32)",
+CVI_VS, TypeCVI_VS>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vlsrw does not set; TODO confirm it hides this twin from the asm matcher
+}
+def V6_vlsrw_alt : HInst< // Alternate spelling "vlsrw($Vu32,$Rt32)": PSEUDO/TypeMAPPING record with no Enc_ class and no Inst bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vlsrw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // presumably resolved to V6_vlsrw elsewhere; TODO confirm against the mapping definitions
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrw_alt_128B : HInst< // Twin of V6_vlsrw_alt on VectorRegs128B: PSEUDO/TypeMAPPING record with no Enc_ class and no Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vlsrw($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // presumably resolved to V6_vlsrw_128B elsewhere; TODO confirm against the mapping definitions
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up with the same value (redundant)
+}
+def V6_vlsrwv : HInst< // Vector/vector vlsr on .w operands: the second source is $Vv32 rather than an IntRegs operand.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vlsr($Vu32.w,$Vv32.w)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001; // V6_vlsrhv uses 0b010 with the same Inst{31-21}
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrwv_128B : HInst< // Twin of V6_vlsrwv (same asm string and Inst bits) on the VectorRegs128B class.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vlsr($Vu32.w,$Vv32.w)",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vlsrwv does not set; TODO confirm it hides this twin from the asm matcher
+}
+def V6_vlsrwv_alt : HInst< // Alternate spelling "vlsrw($Vu32,$Vv32)": PSEUDO/TypeMAPPING record with no Enc_ class and no Inst bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vlsrw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // presumably resolved to V6_vlsrwv elsewhere; TODO confirm against the mapping definitions
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlsrwv_alt_128B : HInst< // Twin of V6_vlsrwv_alt on VectorRegs128B: PSEUDO/TypeMAPPING record with no Enc_ class and no Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vlsrw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // presumably resolved to V6_vlsrwv_128B elsewhere; TODO confirm against the mapping definitions
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up with the same value (redundant)
+}
+def V6_vlutvvb : HInst< // "=" form of vlut32 on .b operands; the third source $Rt8 is drawn from IntRegsLow8.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vlut32($Vu32.b,$Vv32.b,$Rt8)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1; // V6_vlalignb shares Enc_11083408, Inst{7-5} and Inst{31-24} but sets this bit to 0
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vlutvvb_128B : HInst< // Twin of V6_vlutvvb (same asm string and Inst bits) on VectorRegs128B; $Rt8 stays IntRegsLow8.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vlut32($Vu32.b,$Vv32.b,$Rt8)",
+CVI_VP_LONG, TypeCVI_VP>, Enc_11083408, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-24} = 0b00011011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // the one field V6_vlutvvb does not set; TODO confirm it hides this twin from the asm matcher
+}
+def V6_vlutvvb_nm : HInst< // ":nomatch" form of vlut32 on .b operands; unlike V6_vlutvvb it requires HasV62T.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+"$Vd32.b = vlut32($Vu32.b,$Vv32.b,$Rt8):nomatch",
+CVI_VP_LONG, TypeCVI_VP>, Enc_11083408, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-24} = 0b00011000; // V6_vlutvvb uses 0b00011011 here
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+// VectorRegs128B-class record for the vlut32 ":nomatch" asm string; marked isCodeGenOnly.
+def V6_vlutvvb_nm_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+    "$Vd32.b = vlut32($Vu32.b,$Vv32.b,$Rt8):nomatch",
+    CVI_VP_LONG, TypeCVI_VP>, Enc_11083408, Requires<[HasV62T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-24} = 0b00011000;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b011;
+}
+// "|=" form of vlut32 ($Rt8); Constraints ties $Vx32 to the $Vx32in input.
+def V6_vlutvvb_oracc : HInst<(outs VectorRegs:$Vx32),
+    (ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+    "$Vx32.b |= vlut32($Vu32.b,$Vv32.b,$Rt8)",
+    CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_8877260, Requires<[HasV60T, UseHVX]> {
+  let Constraints = "$Vx32 = $Vx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-24} = 0b00011011;
+  let Inst{13} = 0b1;
+  let Inst{7-5} = 0b101;
+}
+// VectorRegs128B-class "|=" vlut32 ($Rt8) record; tied $Vx32/$Vx32in, isCodeGenOnly.
+def V6_vlutvvb_oracc_128B : HInst<(outs VectorRegs128B:$Vx32),
+    (ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+    "$Vx32.b |= vlut32($Vu32.b,$Vv32.b,$Rt8)",
+    CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_8877260, Requires<[HasV60T, UseHVX]> {
+  let Constraints = "$Vx32 = $Vx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-24} = 0b00011011;
+  let Inst{13} = 0b1;
+  let Inst{7-5} = 0b101;
+}
+// "|=" form of vlut32 taking a u3_0Imm $Ii; Constraints ties $Vx32 to $Vx32in.
+def V6_vlutvvb_oracci : HInst<(outs VectorRegs:$Vx32),
+    (ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32, u3_0Imm:$Ii),
+    "$Vx32.b |= vlut32($Vu32.b,$Vv32.b,#$Ii)",
+    CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_8280533, Requires<[HasV62T, UseHVX]> {
+  let Constraints = "$Vx32 = $Vx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011100110;
+  let Inst{13} = 0b1;
+}
+// VectorRegs128B-class "|=" vlut32 (#$Ii) record; tied $Vx32/$Vx32in, isCodeGenOnly.
+def V6_vlutvvb_oracci_128B : HInst<(outs VectorRegs128B:$Vx32),
+    (ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, u3_0Imm:$Ii),
+    "$Vx32.b |= vlut32($Vu32.b,$Vv32.b,#$Ii)",
+    CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_8280533, Requires<[HasV62T, UseHVX]> {
+  let Constraints = "$Vx32 = $Vx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011100110;
+  let Inst{13} = 0b1;
+}
+// vlut32 (.b result) taking a u3_0Imm $Ii; needs HasV62T and UseHVX.
+def V6_vlutvvbi : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32, u3_0Imm:$Ii),
+    "$Vd32.b = vlut32($Vu32.b,$Vv32.b,#$Ii)",
+    CVI_VP_LONG, TypeCVI_VP>, Enc_7171569, Requires<[HasV62T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011110001;
+  let Inst{13} = 0b0;
+}
+// VectorRegs128B-class record for the vlut32 (#$Ii) asm string; marked isCodeGenOnly.
+def V6_vlutvvbi_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, u3_0Imm:$Ii),
+    "$Vd32.b = vlut32($Vu32.b,$Vv32.b,#$Ii)",
+    CVI_VP_LONG, TypeCVI_VP>, Enc_7171569, Requires<[HasV62T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011110001;
+  let Inst{13} = 0b0;
+}
+// vlut16 writing a VecDblRegs result, with an IntRegsLow8 $Rt8 operand; HasV60T + UseHVX.
+def V6_vlutvwh : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+    "$Vdd32.h = vlut16($Vu32.b,$Vv32.h,$Rt8)",
+    CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_14767681, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-24} = 0b00011011;
+  let Inst{13} = 0b1;
+  let Inst{7-5} = 0b110;
+}
+// 128B-register-class record for the vlut16 ($Rt8) asm string; marked isCodeGenOnly.
+def V6_vlutvwh_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+    "$Vdd32.h = vlut16($Vu32.b,$Vv32.h,$Rt8)",
+    CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_14767681, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-24} = 0b00011011;
+  let Inst{13} = 0b1;
+  let Inst{7-5} = 0b110;
+}
+// vlut16 with the ":nomatch" asm suffix; needs HasV62T and UseHVX.
+def V6_vlutvwh_nm : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+    "$Vdd32.h = vlut16($Vu32.b,$Vv32.h,$Rt8):nomatch",
+    CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_14767681, Requires<[HasV62T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-24} = 0b00011000;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b100;
+}
+// 128B-register-class record for the vlut16 ":nomatch" asm string; marked isCodeGenOnly.
+def V6_vlutvwh_nm_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+    "$Vdd32.h = vlut16($Vu32.b,$Vv32.h,$Rt8):nomatch",
+    CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_14767681, Requires<[HasV62T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-24} = 0b00011000;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b100;
+}
+// "|=" form of vlut16 ($Rt8); Constraints ties $Vxx32 to the $Vxx32in input.
+def V6_vlutvwh_oracc : HInst<(outs VecDblRegs:$Vxx32),
+    (ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+    "$Vxx32.h |= vlut16($Vu32.b,$Vv32.h,$Rt8)",
+    CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_16213761, Requires<[HasV60T, UseHVX]> {
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-24} = 0b00011011;
+  let Inst{13} = 0b1;
+  let Inst{7-5} = 0b111;
+}
+// 128B-register-class "|=" vlut16 ($Rt8) record; tied $Vxx32/$Vxx32in, isCodeGenOnly.
+def V6_vlutvwh_oracc_128B : HInst<(outs VecDblRegs128B:$Vxx32),
+    (ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+    "$Vxx32.h |= vlut16($Vu32.b,$Vv32.h,$Rt8)",
+    CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_16213761, Requires<[HasV60T, UseHVX]> {
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-24} = 0b00011011;
+  let Inst{13} = 0b1;
+  let Inst{7-5} = 0b111;
+}
+// "|=" form of vlut16 taking a u3_0Imm $Ii; Constraints ties $Vxx32 to $Vxx32in.
+def V6_vlutvwh_oracci : HInst<(outs VecDblRegs:$Vxx32),
+    (ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32, u3_0Imm:$Ii),
+    "$Vxx32.h |= vlut16($Vu32.b,$Vv32.h,#$Ii)",
+    CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_3457570, Requires<[HasV62T, UseHVX]> {
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011100111;
+  let Inst{13} = 0b1;
+}
+// 128B-register-class "|=" vlut16 (#$Ii) record; tied $Vxx32/$Vxx32in, isCodeGenOnly.
+def V6_vlutvwh_oracci_128B : HInst<(outs VecDblRegs128B:$Vxx32),
+    (ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, u3_0Imm:$Ii),
+    "$Vxx32.h |= vlut16($Vu32.b,$Vv32.h,#$Ii)",
+    CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_3457570, Requires<[HasV62T, UseHVX]> {
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011100111;
+  let Inst{13} = 0b1;
+}
+// vlut16 writing a VecDblRegs result, taking a u3_0Imm $Ii; HasV62T + UseHVX.
+def V6_vlutvwhi : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32, u3_0Imm:$Ii),
+    "$Vdd32.h = vlut16($Vu32.b,$Vv32.h,#$Ii)",
+    CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_13261538, Requires<[HasV62T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011110011;
+  let Inst{13} = 0b0;
+}
+// 128B-register-class record for the vlut16 (#$Ii) asm string; marked isCodeGenOnly.
+def V6_vlutvwhi_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, u3_0Imm:$Ii),
+    "$Vdd32.h = vlut16($Vu32.b,$Vv32.h,#$Ii)",
+    CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_13261538, Requires<[HasV62T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011110011;
+  let Inst{13} = 0b0;
+}
+// vmax on ".b" operands (VectorRegs); needs HasV62T and UseHVX.
+def V6_vmaxb : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32.b = vmax($Vu32.b,$Vv32.b)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111001;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b101;
+}
+// VectorRegs128B-class record for the ".b" vmax asm string; marked isCodeGenOnly.
+def V6_vmaxb_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32.b = vmax($Vu32.b,$Vv32.b)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111001;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b101;
+}
+// PSEUDO/TypeMAPPING record for the "vmaxb" spelling; assigns no Inst bits.
+def V6_vmaxb_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32 = vmaxb($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV62T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// VectorRegs128B-class PSEUDO/TypeMAPPING record for the "vmaxb" spelling.
+def V6_vmaxb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmaxb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// vmax on ".h" operands (VectorRegs); needs HasV60T and UseHVX.
+def V6_vmaxh : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32.h = vmax($Vu32.h,$Vv32.h)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b111;
+}
+// VectorRegs128B-class record for the ".h" vmax asm string; marked isCodeGenOnly.
+def V6_vmaxh_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32.h = vmax($Vu32.h,$Vv32.h)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b111;
+}
+// PSEUDO/TypeMAPPING record for the "vmaxh" spelling; assigns no Inst bits.
+def V6_vmaxh_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32 = vmaxh($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// VectorRegs128B-class PSEUDO/TypeMAPPING record for the "vmaxh" spelling.
+def V6_vmaxh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmaxh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// vmax on ".ub" operands (VectorRegs); needs HasV60T and UseHVX.
+def V6_vmaxub : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32.ub = vmax($Vu32.ub,$Vv32.ub)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b101;
+}
+// VectorRegs128B-class record for the ".ub" vmax asm string; marked isCodeGenOnly.
+def V6_vmaxub_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32.ub = vmax($Vu32.ub,$Vv32.ub)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b101;
+}
+// PSEUDO/TypeMAPPING record for the "vmaxub" spelling; assigns no Inst bits.
+def V6_vmaxub_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32 = vmaxub($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// VectorRegs128B-class PSEUDO/TypeMAPPING record for the "vmaxub" spelling.
+def V6_vmaxub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmaxub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// vmax on ".uh" operands (VectorRegs); needs HasV60T and UseHVX.
+def V6_vmaxuh : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32.uh = vmax($Vu32.uh,$Vv32.uh)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b110;
+}
+// VectorRegs128B-class record for the ".uh" vmax asm string; marked isCodeGenOnly.
+def V6_vmaxuh_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32.uh = vmax($Vu32.uh,$Vv32.uh)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b110;
+}
+// PSEUDO/TypeMAPPING record for the "vmaxuh" spelling; assigns no Inst bits.
+def V6_vmaxuh_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32 = vmaxuh($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// VectorRegs128B-class PSEUDO/TypeMAPPING record for the "vmaxuh" spelling.
+def V6_vmaxuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmaxuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// vmax on ".w" operands (VectorRegs); needs HasV60T and UseHVX.
+def V6_vmaxw : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32.w = vmax($Vu32.w,$Vv32.w)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111001;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b000;
+}
+// VectorRegs128B-class record for the ".w" vmax asm string; marked isCodeGenOnly.
+def V6_vmaxw_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32.w = vmax($Vu32.w,$Vv32.w)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111001;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b000;
+}
+// PSEUDO/TypeMAPPING record for the "vmaxw" spelling; assigns no Inst bits.
+def V6_vmaxw_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32 = vmaxw($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// VectorRegs128B-class PSEUDO/TypeMAPPING record for the "vmaxw" spelling.
+def V6_vmaxw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmaxw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// vmin on ".b" operands (VectorRegs); needs HasV62T and UseHVX.
+def V6_vminb : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32.b = vmin($Vu32.b,$Vv32.b)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111001;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b100;
+}
+// VectorRegs128B-class record for the ".b" vmin asm string; marked isCodeGenOnly.
+def V6_vminb_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32.b = vmin($Vu32.b,$Vv32.b)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111001;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b100;
+}
+// PSEUDO/TypeMAPPING record for the "vminb" spelling; assigns no Inst bits.
+def V6_vminb_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32 = vminb($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV62T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// VectorRegs128B-class PSEUDO/TypeMAPPING record for the "vminb" spelling.
+def V6_vminb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vminb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// vmin on ".h" operands (VectorRegs); needs HasV60T and UseHVX.
+def V6_vminh : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32.h = vmin($Vu32.h,$Vv32.h)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b011;
+}
+// VectorRegs128B-class record for the ".h" vmin asm string; marked isCodeGenOnly.
+def V6_vminh_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32.h = vmin($Vu32.h,$Vv32.h)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b011;
+}
+// PSEUDO/TypeMAPPING record for the "vminh" spelling; assigns no Inst bits.
+def V6_vminh_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32 = vminh($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// VectorRegs128B-class PSEUDO/TypeMAPPING record for the "vminh" spelling.
+def V6_vminh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vminh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// vmin on ".ub" operands (VectorRegs); needs HasV60T and UseHVX.
+def V6_vminub : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32.ub = vmin($Vu32.ub,$Vv32.ub)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b001;
+}
+// VectorRegs128B-class record for the ".ub" vmin asm string; marked isCodeGenOnly.
+def V6_vminub_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32.ub = vmin($Vu32.ub,$Vv32.ub)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b001;
+}
+// PSEUDO/TypeMAPPING record for the "vminub" spelling; assigns no Inst bits.
+def V6_vminub_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32 = vminub($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// VectorRegs128B-class PSEUDO/TypeMAPPING record for the "vminub" spelling.
+def V6_vminub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vminub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// vmin on ".uh" operands (VectorRegs); needs HasV60T and UseHVX.
+def V6_vminuh : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32.uh = vmin($Vu32.uh,$Vv32.uh)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b010;
+}
+// VectorRegs128B-class record for the ".uh" vmin asm string; marked isCodeGenOnly.
+def V6_vminuh_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32.uh = vmin($Vu32.uh,$Vv32.uh)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b010;
+}
+// PSEUDO/TypeMAPPING record for the "vminuh" spelling; assigns no Inst bits.
+def V6_vminuh_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32 = vminuh($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// VectorRegs128B-class PSEUDO/TypeMAPPING record for the "vminuh" spelling.
+def V6_vminuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vminuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// vmin on ".w" operands (VectorRegs); needs HasV60T and UseHVX.
+def V6_vminw : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32.w = vmin($Vu32.w,$Vv32.w)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b100;
+}
+// VectorRegs128B-class record for the ".w" vmin asm string; marked isCodeGenOnly.
+def V6_vminw_128B : HInst<(outs VectorRegs128B:$Vd32),
+    (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+    "$Vd32.w = vmin($Vu32.w,$Vv32.w)",
+    CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111000;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b100;
+}
+// PSEUDO/TypeMAPPING record for the "vminw" spelling; assigns no Inst bits.
+def V6_vminw_alt : HInst<(outs VectorRegs:$Vd32),
+    (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+    "$Vd32 = vminw($Vu32,$Vv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// VectorRegs128B-class PSEUDO/TypeMAPPING record for the "vminw" spelling.
+def V6_vminw_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vminw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// vmpa of a VecDblRegs ".ub" source with IntRegs $Rt32 ".b"; HasV60T + UseHVX.
+def V6_vmpabus : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vdd32.h = vmpa($Vuu32.ub,$Rt32.b)",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b110;
+}
+// VecDblRegs128B-class record for the vmpa (.ub, $Rt32.b) asm string; marked isCodeGenOnly.
+def V6_vmpabus_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+    "$Vdd32.h = vmpa($Vuu32.ub,$Rt32.b)",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b110;
+}
+// "+=" form of vmpa (.ub, $Rt32.b); Constraints ties $Vxx32 to the $Vxx32in input.
+def V6_vmpabus_acc : HInst<(outs VecDblRegs:$Vxx32),
+    (ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vxx32.h += vmpa($Vuu32.ub,$Rt32.b)",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T, UseHVX]> {
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13} = 0b1;
+  let Inst{7-5} = 0b110;
+}
+// VecDblRegs128B-class "+=" vmpa (.ub, $Rt32.b) record; tied $Vxx32/$Vxx32in, isCodeGenOnly.
+def V6_vmpabus_acc_128B : HInst<(outs VecDblRegs128B:$Vxx32),
+    (ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+    "$Vxx32.h += vmpa($Vuu32.ub,$Rt32.b)",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T, UseHVX]> {
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13} = 0b1;
+  let Inst{7-5} = 0b110;
+}
+// PSEUDO/TypeMAPPING record for the "+= vmpabus" spelling; tied operands, no Inst bits.
+def V6_vmpabus_acc_alt : HInst<(outs VecDblRegs:$Vxx32),
+    (ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vxx32 += vmpabus($Vuu32,$Rt32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T, UseHVX]> {
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// VecDblRegs128B-class PSEUDO/TypeMAPPING record for the "+= vmpabus" spelling.
+def V6_vmpabus_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vmpabus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+// PSEUDO/TypeMAPPING record for the "vmpabus" ($Rt32) spelling; assigns no Inst bits.
+def V6_vmpabus_alt : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vdd32 = vmpabus($Vuu32,$Rt32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// VecDblRegs128B-class PSEUDO/TypeMAPPING record for the "vmpabus" ($Rt32) spelling.
+def V6_vmpabus_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vmpabus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// vmpa of two VecDblRegs sources (.ub and .b); needs HasV60T and UseHVX.
+def V6_vmpabusv : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+    "$Vdd32.h = vmpa($Vuu32.ub,$Vvv32.b)",
+    CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13211717, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011100001;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b011;
+}
+// VecDblRegs128B-class record for the vmpa (.ub, $Vvv32.b) asm string; marked isCodeGenOnly.
+def V6_vmpabusv_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+    "$Vdd32.h = vmpa($Vuu32.ub,$Vvv32.b)",
+    CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13211717, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011100001;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b011;
+}
+// PSEUDO/TypeMAPPING record for the "vmpabus" ($Vvv32) spelling; assigns no Inst bits.
+def V6_vmpabusv_alt : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+    "$Vdd32 = vmpabus($Vuu32,$Vvv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// VecDblRegs128B-class PSEUDO/TypeMAPPING record for the "vmpabus" ($Vvv32) spelling.
+def V6_vmpabusv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vmpabus($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// vmpa of two VecDblRegs ".ub" sources; needs HasV60T and UseHVX.
+def V6_vmpabuuv : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+    "$Vdd32.h = vmpa($Vuu32.ub,$Vvv32.ub)",
+    CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13211717, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011100111;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b111;
+}
+// VecDblRegs128B-class record for the vmpa (.ub, .ub) asm string; marked isCodeGenOnly.
+def V6_vmpabuuv_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+    "$Vdd32.h = vmpa($Vuu32.ub,$Vvv32.ub)",
+    CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13211717, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011100111;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b111;
+}
+// PSEUDO/TypeMAPPING record for the "vmpabuu" spelling; assigns no Inst bits.
+def V6_vmpabuuv_alt : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+    "$Vdd32 = vmpabuu($Vuu32,$Vvv32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// VecDblRegs128B-class PSEUDO/TypeMAPPING record for the "vmpabuu" spelling.
+def V6_vmpabuuv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vmpabuu($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// vmpa of a VecDblRegs ".h" source with IntRegs $Rt32 ".b"; HasV60T + UseHVX.
+def V6_vmpahb : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vdd32.w = vmpa($Vuu32.h,$Rt32.b)",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b111;
+}
+// VecDblRegs128B-class record for the vmpa (.h, $Rt32.b) asm string; marked isCodeGenOnly.
+def V6_vmpahb_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+    "$Vdd32.w = vmpa($Vuu32.h,$Rt32.b)",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b111;
+}
+// "+=" form of vmpa (.h, $Rt32.b); Constraints ties $Vxx32 to the $Vxx32in input.
+def V6_vmpahb_acc : HInst<(outs VecDblRegs:$Vxx32),
+    (ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vxx32.w += vmpa($Vuu32.h,$Rt32.b)",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T, UseHVX]> {
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13} = 0b1;
+  let Inst{7-5} = 0b111;
+}
+// VecDblRegs128B-class "+=" vmpa (.h, $Rt32.b) record; tied $Vxx32/$Vxx32in, isCodeGenOnly.
+def V6_vmpahb_acc_128B : HInst<(outs VecDblRegs128B:$Vxx32),
+    (ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+    "$Vxx32.w += vmpa($Vuu32.h,$Rt32.b)",
+    CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T, UseHVX]> {
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011001001;
+  let Inst{13} = 0b1;
+  let Inst{7-5} = 0b111;
+}
+// PSEUDO/TypeMAPPING record for the "+= vmpahb" spelling; tied operands, no Inst bits.
+def V6_vmpahb_acc_alt : HInst<(outs VecDblRegs:$Vxx32),
+    (ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vxx32 += vmpahb($Vuu32,$Rt32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T, UseHVX]> {
+  let Constraints = "$Vxx32 = $Vxx32in";
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// VecDblRegs128B-class PSEUDO/TypeMAPPING record for the "+= vmpahb" spelling.
+def V6_vmpahb_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vmpahb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+// PSEUDO/TypeMAPPING record for the "vmpahb" spelling; assigns no Inst bits.
+def V6_vmpahb_alt : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vdd32 = vmpahb($Vuu32,$Rt32)",
+    PSEUDO, TypeMAPPING>, Requires<[HasV60T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// VecDblRegs128B-class PSEUDO/TypeMAPPING record for the "vmpahb" spelling.
+def V6_vmpahb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vmpahb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+// vmpa of a VecDblRegs ".uh" source with IntRegs $Rt32 ".b"; HasV62T + UseHVX.
+def V6_vmpauhb : HInst<(outs VecDblRegs:$Vdd32),
+    (ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+    "$Vdd32.w = vmpa($Vuu32.uh,$Rt32.b)",
+    CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV62T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011001100;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b101;
+}
+// VecDblRegs128B-class record for the vmpa (.uh, $Rt32.b) asm string; marked isCodeGenOnly.
+def V6_vmpauhb_128B : HInst<(outs VecDblRegs128B:$Vdd32),
+    (ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+    "$Vdd32.w = vmpa($Vuu32.uh,$Rt32.b)",
+    CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV62T, UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011001100;
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b101;
+}
+def V6_vmpauhb_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vmpa($Vuu32.uh,$Rt32.b)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpauhb_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vmpa($Vuu32.uh,$Rt32.b)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpauhb_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vmpauhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpauhb_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vmpauhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpauhb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vmpauhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpauhb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vmpauhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpybus : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32.h = vmpy($Vu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_11471622, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybus_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32.h = vmpy($Vu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_11471622, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpybus_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32.h += vmpy($Vu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2153798, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybus_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32.h += vmpy($Vu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2153798, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybus_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybus_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybus_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybus_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpybusv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.h = vmpy($Vu32.ub,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybusv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.h = vmpy($Vu32.ub,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpybusv_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.h += vmpy($Vu32.ub,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybusv_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.h += vmpy($Vu32.ub,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybusv_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybusv_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybusv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybusv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpybv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.h = vmpy($Vu32.b,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.h = vmpy($Vu32.b,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpybv_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.h += vmpy($Vu32.b,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybv_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.h += vmpy($Vu32.b,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybv_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybv_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpybv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpybv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyewuh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmpye($Vu32.w,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyewuh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmpye($Vu32.w,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyewuh_64 : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpye($Vu32.w,$Vv32.uh)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyewuh_64_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpye($Vu32.w,$Vv32.uh)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyewuh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyewuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyewuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyewuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32.w = vmpy($Vu32.h,$Rt32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_11471622, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32.w = vmpy($Vu32.h,$Rt32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_11471622, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpyh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpyh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhsat_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32.w += vmpy($Vu32.h,$Rt32.h):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2153798, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhsat_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32.w += vmpy($Vu32.h,$Rt32.h):sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2153798, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhsat_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpyh($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhsat_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpyh($Vu32,$Rt32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhsrs : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:rnd:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhsrs_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:rnd:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhsrs_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyh($Vu32,$Rt32):<<1:rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhsrs_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyh($Vu32,$Rt32):<<1:rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhss : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhss_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vmpy($Vu32.h,$Rt32.h):<<1:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhss_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyh($Vu32,$Rt32):<<1:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhss_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyh($Vu32,$Rt32):<<1:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhus : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.w = vmpy($Vu32.h,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhus_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.w = vmpy($Vu32.h,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhus_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.w += vmpy($Vu32.h,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhus_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.w += vmpy($Vu32.h,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhus_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpyhus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhus_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpyhus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhus_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpyhus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhus_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpyhus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.w = vmpy($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.w = vmpy($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhv_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.w += vmpy($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhv_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.w += vmpy($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhv_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpyh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhv_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpyh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+}
+def V6_vmpyhv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpyh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpyh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhvsrs : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vmpy($Vu32.h,$Vv32.h):<<1:rnd:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhvsrs_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vmpy($Vu32.h,$Vv32.h):<<1:rnd:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyhvsrs_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyh($Vu32,$Vv32):<<1:rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyhvsrs_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyh($Vu32,$Vv32):<<1:rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyieoh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmpyieo($Vu32.h,$Vv32.h)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyieoh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmpyieo($Vu32.h,$Vv32.h)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyiewh_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vmpyie($Vu32.w,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewh_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vmpyie($Vu32.w,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewh_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vmpyiewh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewh_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vmpyiewh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in";
+}
+def V6_vmpyiewuh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmpyie($Vu32.w,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiewuh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmpyie($Vu32.w,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyiewuh_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vmpyie($Vu32.w,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100001; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b101;
+}
+def V6_vmpyiewuh_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vmpyie($Vu32.w,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100001; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b101;
+}
+def V6_vmpyiewuh_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vmpyiewuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// Mapping pseudo using the 128B register classes; no Inst{} bits are set here.
+def V6_vmpyiewuh_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vmpyiewuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+}
+def V6_vmpyiewuh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyiewuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// Mapping pseudo using the 128B register classes; no Inst{} bits are set here.
+def V6_vmpyiewuh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyiewuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyih : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vmpyi($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100001; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b100;
+}
+def V6_vmpyih_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vmpyi($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100001; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b100;
+}
+def V6_vmpyih_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.h += vmpyi($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100001; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b100;
+}
+def V6_vmpyih_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.h += vmpyi($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100001; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b100;
+}
+def V6_vmpyih_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vmpyih($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// Mapping pseudo using the 128B register classes; no Inst{} bits are set here.
+def V6_vmpyih_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vmpyih($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+}
+def V6_vmpyih_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyih($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// Mapping pseudo using the 128B register classes; no Inst{} bits are set here.
+def V6_vmpyih_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyih($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyihb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vmpyi($Vu32.h,$Rt32.b)",
+CVI_VX_LONG, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001011; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+}
+def V6_vmpyihb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.h = vmpyi($Vu32.h,$Rt32.b)",
+CVI_VX_LONG, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001011; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+}
+def V6_vmpyihb_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.h += vmpyi($Vu32.h,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001011; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b001;
+}
+def V6_vmpyihb_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.h += vmpyi($Vu32.h,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001011; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b001;
+}
+def V6_vmpyihb_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyihb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// Mapping pseudo using the 128B register classes; no Inst{} bits are set here.
+def V6_vmpyihb_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyihb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+}
+def V6_vmpyihb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyihb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// Mapping pseudo using the 128B register classes; no Inst{} bits are set here.
+def V6_vmpyihb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyihb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiowh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmpyio($Vu32.w,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011111110; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b001;
+}
+def V6_vmpyiowh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmpyio($Vu32.w,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011111110; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b001;
+}
+def V6_vmpyiowh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyiowh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// Mapping pseudo using the 128B register classes; no Inst{} bits are set here.
+def V6_vmpyiowh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyiowh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiwb : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vmpyi($Vu32.w,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001101; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+}
+def V6_vmpyiwb_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vmpyi($Vu32.w,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001101; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+}
+def V6_vmpyiwb_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vmpyi($Vu32.w,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001010; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b010;
+}
+def V6_vmpyiwb_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vmpyi($Vu32.w,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001010; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b010;
+}
+def V6_vmpyiwb_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyiwb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// Mapping pseudo using the 128B register classes; no Inst{} bits are set here.
+def V6_vmpyiwb_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyiwb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+}
+def V6_vmpyiwb_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyiwb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// Mapping pseudo using the 128B register classes; no Inst{} bits are set here.
+def V6_vmpyiwb_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyiwb($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiwh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vmpyi($Vu32.w,$Rt32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001100; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b111;
+}
+def V6_vmpyiwh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vmpyi($Vu32.w,$Rt32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001100; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b111;
+}
+def V6_vmpyiwh_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vmpyi($Vu32.w,$Rt32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001010; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b011;
+}
+def V6_vmpyiwh_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vmpyi($Vu32.w,$Rt32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001010; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b011;
+}
+def V6_vmpyiwh_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyiwh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// Mapping pseudo using the 128B register classes; no Inst{} bits are set here.
+def V6_vmpyiwh_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyiwh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+}
+def V6_vmpyiwh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyiwh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// Mapping pseudo using the 128B register classes; no Inst{} bits are set here.
+def V6_vmpyiwh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyiwh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyiwub : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vmpyi($Vu32.w,$Rt32.ub)",
+CVI_VX_LONG, TypeCVI_VX>, Enc_16214129, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001100; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b110;
+}
+def V6_vmpyiwub_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vmpyi($Vu32.w,$Rt32.ub)",
+CVI_VX_LONG, TypeCVI_VX>, Enc_16214129, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001100; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b110;
+}
+def V6_vmpyiwub_acc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vmpyi($Vu32.w,$Rt32.ub)",
+CVI_VX_LONG, TypeCVI_VX>, Enc_10058269, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001100; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b001;
+}
+def V6_vmpyiwub_acc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vmpyi($Vu32.w,$Rt32.ub)",
+CVI_VX_LONG, TypeCVI_VX>, Enc_10058269, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001100; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b001;
+}
+def V6_vmpyiwub_acc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyiwub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// Mapping pseudo using the 128B register classes; no Inst{} bits are set here.
+def V6_vmpyiwub_acc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vmpyiwub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+}
+def V6_vmpyiwub_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyiwub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// Mapping pseudo using the 128B register classes; no Inst{} bits are set here.
+def V6_vmpyiwub_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vmpyiwub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyowh : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmpyo($Vu32.w,$Vv32.h):<<1:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011111111; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b111;
+}
+def V6_vmpyowh_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmpyo($Vu32.w,$Vv32.h):<<1:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011111111; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b111;
+}
+def V6_vmpyowh_64_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpyo($Vu32.w,$Vv32.h)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100001; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b011;
+}
+def V6_vmpyowh_64_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpyo($Vu32.w,$Vv32.h)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100001; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b011;
+}
+def V6_vmpyowh_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyowh($Vu32,$Vv32):<<1:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// Mapping pseudo using the 128B register classes; no Inst{} bits are set here.
+def V6_vmpyowh_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyowh($Vu32,$Vv32):<<1:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyowh_rnd : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vmpyo($Vu32.w,$Vv32.h):<<1:rnd:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011111010; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+}
+def V6_vmpyowh_rnd_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vmpyo($Vu32.w,$Vv32.h):<<1:rnd:sat",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011111010; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+}
+def V6_vmpyowh_rnd_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmpyowh($Vu32,$Vv32):<<1:rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// Mapping pseudo using the 128B register classes; no Inst{} bits are set here.
+def V6_vmpyowh_rnd_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmpyowh($Vu32,$Vv32):<<1:rnd:sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyowh_rnd_sacc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vmpyo($Vu32.w,$Vv32.h):<<1:rnd:sat:shift",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100001; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b111;
+}
+def V6_vmpyowh_rnd_sacc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vmpyo($Vu32.w,$Vv32.h):<<1:rnd:sat:shift",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100001; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b111;
+}
+def V6_vmpyowh_rnd_sacc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vmpyowh($Vu32,$Vv32):<<1:rnd:sat:shift",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1; // NOTE(review): isCodeGenOnly is not set here, unlike other *_alt pseudos - confirm intentional
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def V6_vmpyowh_rnd_sacc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vmpyowh($Vu32,$Vv32):<<1:rnd:sat:shift",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def V6_vmpyowh_sacc : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vmpyo($Vu32.w,$Vv32.h):<<1:sat:shift",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100001; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b110;
+}
+def V6_vmpyowh_sacc_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vmpyo($Vu32.w,$Vv32.h):<<1:sat:shift",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100001; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b110;
+}
+def V6_vmpyowh_sacc_alt : HInst<
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vmpyowh($Vu32,$Vv32):<<1:sat:shift",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1; // NOTE(review): isCodeGenOnly is not set here, unlike other *_alt pseudos - confirm intentional
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def V6_vmpyowh_sacc_alt_128B : HInst<
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vmpyowh($Vu32,$Vv32):<<1:sat:shift",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let Constraints = "$Vx32 = $Vx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def V6_vmpyub : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32.uh = vmpy($Vu32.ub,$Rt32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_11471622, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001110; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+}
+def V6_vmpyub_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32.uh = vmpy($Vu32.ub,$Rt32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_11471622, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001110; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+}
+def V6_vmpyub_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32.uh += vmpy($Vu32.ub,$Rt32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2153798, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001100; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b000;
+}
+def V6_vmpyub_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32.uh += vmpy($Vu32.ub,$Rt32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2153798, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001100; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b000;
+}
+def V6_vmpyub_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// Mapping pseudo using the 128B register classes; no Inst{} bits are set here.
+def V6_vmpyub_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // result is tied to the accumulator input
+}
+def V6_vmpyub_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> { // mapping pseudo: no Inst{} bits set here
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// Mapping pseudo using the 128B register classes; no Inst{} bits are set here.
+def V6_vmpyub_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyubv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.uh = vmpy($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100000; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b101;
+}
+def V6_vmpyubv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.uh = vmpy($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100000; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b101;
+}
+def V6_vmpyubv_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.uh += vmpy($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100000; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b101;
+}
+def V6_vmpyubv_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.uh += vmpy($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in"; // result is tied to the accumulator input
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100000; // fixed encoding bits, highest field first
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b101;
+}
+def V6_vmpyubv_acc_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vmpyubv_acc - confirm
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // output tied to the $Vxx32in input
+}
+def V6_vmpyubv_acc_alt_128B : HInst< // 128B-register twin of V6_vmpyubv_acc_alt
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+let Constraints = "$Vxx32 = $Vxx32in"; // output tied to the $Vxx32in input
+}
+def V6_vmpyubv_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vmpyubv - confirm
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyubv_alt_128B : HInst< // 128B-register twin of V6_vmpyubv_alt
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+}
+def V6_vmpyuh : HInst< // encoded instruction; fixed bits set via Inst{} below
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32.uw = vmpy($Vu32.uh,$Rt32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_11471622, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyuh_128B : HInst< // 128B-register twin of V6_vmpyuh: same syntax and Inst bits
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32.uw = vmpy($Vu32.uh,$Rt32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_11471622, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyuh_acc : HInst< // accumulating ("+=") encoded form
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32.uw += vmpy($Vu32.uh,$Rt32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2153798, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // output tied to the $Vxx32in input
+}
+def V6_vmpyuh_acc_128B : HInst< // 128B-register twin of V6_vmpyuh_acc: same syntax and Inst bits
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32.uw += vmpy($Vu32.uh,$Rt32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2153798, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in"; // output tied to the $Vxx32in input
+}
+def V6_vmpyuh_acc_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vmpyuh_acc - confirm
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpyuh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // output tied to the $Vxx32in input
+}
+def V6_vmpyuh_acc_alt_128B : HInst< // 128B-register twin of V6_vmpyuh_acc_alt
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vxx32 += vmpyuh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+let Constraints = "$Vxx32 = $Vxx32in"; // output tied to the $Vxx32in input
+}
+def V6_vmpyuh_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vmpyuh - confirm
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpyuh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyuh_alt_128B : HInst< // 128B-register twin of V6_vmpyuh_alt
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vdd32 = vmpyuh($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+}
+def V6_vmpyuhv : HInst< // encoded instruction; fixed bits set via Inst{} below
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.uw = vmpy($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyuhv_128B : HInst< // 128B-register twin of V6_vmpyuhv: same syntax and Inst bits
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.uw = vmpy($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vmpyuhv_acc : HInst< // accumulating ("+=") encoded form
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32.uw += vmpy($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // output tied to the $Vxx32in input
+}
+def V6_vmpyuhv_acc_128B : HInst< // 128B-register twin of V6_vmpyuhv_acc: same syntax and Inst bits
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32.uw += vmpy($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5972412, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in"; // output tied to the $Vxx32in input
+}
+def V6_vmpyuhv_acc_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vmpyuhv_acc - confirm
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vxx32 += vmpyuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // output tied to the $Vxx32in input
+}
+def V6_vmpyuhv_acc_alt_128B : HInst< // 128B-register twin of V6_vmpyuhv_acc_alt
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vxx32 += vmpyuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+let Constraints = "$Vxx32 = $Vxx32in"; // output tied to the $Vxx32in input
+}
+def V6_vmpyuhv_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vmpyuhv - confirm
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vmpyuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmpyuhv_alt_128B : HInst< // 128B-register twin of V6_vmpyuhv_alt
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vmpyuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+}
+def V6_vmux : HInst< // encoded instruction; takes a vector predicate $Qt4 plus two vector inputs
+(outs VectorRegs:$Vd32),
+(ins VecPredRegs:$Qt4, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vmux($Qt4,$Vu32,$Vv32)",
+CVI_VA, TypeCVI_VA>, Enc_1572239, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vmux_128B : HInst< // 128B-register twin of V6_vmux: same syntax and Inst bits
+(outs VectorRegs128B:$Vd32),
+(ins VecPredRegs128B:$Qt4, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vmux($Qt4,$Vu32,$Vv32)",
+CVI_VA, TypeCVI_VA>, Enc_1572239, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011110111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vnavgh : HInst< // encoded instruction; fixed bits set via Inst{} below
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vnavg($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgh_128B : HInst< // 128B-register twin of V6_vnavgh: same syntax and Inst bits
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vnavg($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vnavgh_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vnavgh - confirm
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vnavgh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgh_alt_128B : HInst< // 128B-register twin of V6_vnavgh_alt
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vnavgh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+}
+def V6_vnavgub : HInst< // encoded instruction; fixed bits set via Inst{} below
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vnavg($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgub_128B : HInst< // 128B-register twin of V6_vnavgub: same syntax and Inst bits
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vnavg($Vu32.ub,$Vv32.ub)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vnavgub_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vnavgub - confirm
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vnavgub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgub_alt_128B : HInst< // 128B-register twin of V6_vnavgub_alt
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vnavgub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+}
+def V6_vnavgw : HInst< // encoded instruction; fixed bits set via Inst{} below
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vnavg($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgw_128B : HInst< // 128B-register twin of V6_vnavgw: same syntax and Inst bits
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vnavg($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vnavgw_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vnavgw - confirm
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vnavgw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnavgw_alt_128B : HInst< // 128B-register twin of V6_vnavgw_alt
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vnavgw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+}
+def V6_vnccombine : HInst< // encoded instruction, predicated on scalar predicate $Ps4
+(outs VecDblRegs:$Vdd32),
+(ins PredRegs:$Ps4, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"if (!$Ps4) $Vdd32 = vcombine($Vu32,$Vv32)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_16145290, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011010010;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated "if (!$Ps4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnccombine_128B : HInst< // 128B-register twin of V6_vnccombine: same syntax and Inst bits
+(outs VecDblRegs128B:$Vdd32),
+(ins PredRegs:$Ps4, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"if (!$Ps4) $Vdd32 = vcombine($Vu32,$Vv32)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_16145290, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011010010;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated "if (!$Ps4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vncmov : HInst< // encoded instruction, predicated on scalar predicate $Ps4
+(outs VectorRegs:$Vd32),
+(ins PredRegs:$Ps4, VectorRegs:$Vu32),
+"if (!$Ps4) $Vd32 = $Vu32",
+CVI_VA, TypeCVI_VA>, Enc_12023037, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001101000100000;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated "if (!$Ps4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vncmov_128B : HInst< // 128B-register twin of V6_vncmov: same syntax and Inst bits
+(outs VectorRegs128B:$Vd32),
+(ins PredRegs:$Ps4, VectorRegs128B:$Vu32),
+"if (!$Ps4) $Vd32 = $Vu32",
+CVI_VA, TypeCVI_VA>, Enc_12023037, Requires<[HasV60T,UseHVX]> {
+let Inst{7-7} = 0b0;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001101000100000;
+let isPredicated = 1;
+let isPredicatedFalse = 1; // matches the negated "if (!$Ps4)" in the asm string
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vnormamth : HInst< // encoded single-input instruction; fixed bits set via Inst{} below
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.h = vnormamt($Vu32.h)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnormamth_128B : HInst< // 128B-register twin of V6_vnormamth: same syntax and Inst bits
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.h = vnormamt($Vu32.h)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vnormamth_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vnormamth - confirm
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vnormamth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnormamth_alt_128B : HInst< // 128B-register twin of V6_vnormamth_alt
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vnormamth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+}
+def V6_vnormamtw : HInst< // encoded single-input instruction; fixed bits set via Inst{} below
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.w = vnormamt($Vu32.w)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnormamtw_128B : HInst< // 128B-register twin of V6_vnormamtw: same syntax and Inst bits
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.w = vnormamt($Vu32.w)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vnormamtw_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vnormamtw - confirm
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vnormamtw($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnormamtw_alt_128B : HInst< // 128B-register twin of V6_vnormamtw_alt
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vnormamtw($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+}
+def V6_vnot : HInst< // encoded single-input instruction; fixed bits set via Inst{} below
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vnot($Vu32)",
+CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vnot_128B : HInst< // 128B-register twin of V6_vnot: same syntax and Inst bits
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vnot($Vu32)",
+CVI_VA, TypeCVI_VA>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vor : HInst< // encoded instruction; fixed bits set via Inst{} below
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vor($Vu32,$Vv32)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vor_128B : HInst< // 128B-register twin of V6_vor: same syntax and Inst bits
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vor($Vu32,$Vv32)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackeb : HInst< // encoded instruction; fixed bits set via Inst{} below
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vpacke($Vu32.h,$Vv32.h)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackeb_128B : HInst< // 128B-register twin of V6_vpackeb: same syntax and Inst bits
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vpacke($Vu32.h,$Vv32.h)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackeb_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vpackeb - confirm
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackeb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackeb_alt_128B : HInst< // 128B-register twin of V6_vpackeb_alt
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackeb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+}
+def V6_vpackeh : HInst< // encoded instruction; fixed bits set via Inst{} below
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vpacke($Vu32.w,$Vv32.w)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackeh_128B : HInst< // 128B-register twin of V6_vpackeh: same syntax and Inst bits
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vpacke($Vu32.w,$Vv32.w)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackeh_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vpackeh - confirm
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackeh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackeh_alt_128B : HInst< // 128B-register twin of V6_vpackeh_alt
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackeh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+}
+def V6_vpackhb_sat : HInst< // encoded instruction with ":sat" syntax; fixed bits set via Inst{} below
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vpack($Vu32.h,$Vv32.h):sat",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackhb_sat_128B : HInst< // 128B-register twin of V6_vpackhb_sat: same syntax and Inst bits
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vpack($Vu32.h,$Vv32.h):sat",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackhb_sat_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vpackhb_sat - confirm
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackhb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackhb_sat_alt_128B : HInst< // 128B-register twin of V6_vpackhb_sat_alt
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackhb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+}
+def V6_vpackhub_sat : HInst< // encoded instruction with ":sat" syntax; fixed bits set via Inst{} below
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vpack($Vu32.h,$Vv32.h):sat",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackhub_sat_128B : HInst< // 128B-register twin of V6_vpackhub_sat: same syntax and Inst bits
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vpack($Vu32.h,$Vv32.h):sat",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackhub_sat_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vpackhub_sat - confirm
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackhub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackhub_sat_alt_128B : HInst< // 128B-register twin of V6_vpackhub_sat_alt
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackhub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+}
+def V6_vpackob : HInst< // encoded instruction; fixed bits set via Inst{} below
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vpacko($Vu32.h,$Vv32.h)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackob_128B : HInst< // 128B-register twin of V6_vpackob: same syntax and Inst bits
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vpacko($Vu32.h,$Vv32.h)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackob_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vpackob - confirm
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackob($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackob_alt_128B : HInst< // 128B-register twin of V6_vpackob_alt
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackob($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+}
+def V6_vpackoh : HInst< // encoded instruction; fixed bits set via Inst{} below
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vpacko($Vu32.w,$Vv32.w)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackoh_128B : HInst< // 128B-register twin of V6_vpackoh: same syntax and Inst bits
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vpacko($Vu32.w,$Vv32.w)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackoh_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vpackoh - confirm
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackoh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackoh_alt_128B : HInst< // 128B-register twin of V6_vpackoh_alt
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackoh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+}
+def V6_vpackwh_sat : HInst< // encoded instruction with ":sat" syntax; fixed bits set via Inst{} below
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vpack($Vu32.w,$Vv32.w):sat",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackwh_sat_128B : HInst< // 128B-register twin of V6_vpackwh_sat: same syntax and Inst bits
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vpack($Vu32.w,$Vv32.w):sat",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackwh_sat_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vpackwh_sat - confirm
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackwh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackwh_sat_alt_128B : HInst< // 128B-register twin of V6_vpackwh_sat_alt
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackwh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+}
+def V6_vpackwuh_sat : HInst< // encoded instruction with ":sat" syntax; fixed bits set via Inst{} below
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vpack($Vu32.w,$Vv32.w):sat",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackwuh_sat_128B : HInst< // 128B-register twin of V6_vpackwuh_sat: same syntax and Inst bits
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vpack($Vu32.w,$Vv32.w):sat",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpackwuh_sat_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vpackwuh_sat - confirm
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vpackwuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpackwuh_sat_alt_128B : HInst< // 128B-register twin of V6_vpackwuh_sat_alt
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vpackwuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+}
+def V6_vpopcounth : HInst< // encoded single-input instruction; fixed bits set via Inst{} below
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32.h = vpopcount($Vu32.h)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpopcounth_128B : HInst< // 128B-register twin of V6_vpopcounth: same syntax and Inst bits
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32.h = vpopcount($Vu32.h)",
+CVI_VS, TypeCVI_VS>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0001111000000010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+}
+def V6_vpopcounth_alt : HInst< // PSEUDO/TypeMAPPING form, no Inst bits; presumably an alias of V6_vpopcounth - confirm
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32),
+"$Vd32 = vpopcounth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vpopcounth_alt_128B : HInst< // 128B-register twin of V6_vpopcounth_alt
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32),
+"$Vd32 = vpopcounth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): redundant, already set two lines up
+}
+def V6_vrdelta : HInst< // encoded instruction; fixed bits set via Inst{} below
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vrdelta($Vu32,$Vv32)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrdelta_128B : HInst< // Same asm string and Inst bits as V6_vrdelta, on the 128B register classes.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vrdelta($Vu32,$Vv32)",
+CVI_VP, TypeCVI_VP>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vrdelta carries the identical asm text without it.
+}
+def V6_vrmpybus : HInst< // HVX instruction with a real encoding: fixed Inst bits are assigned in the body.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vrmpy($Vu32.ub,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000; // Fixed high opcode bits; operand fields presumably come from Enc_16214129 (defined elsewhere).
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybus_128B : HInst< // Same asm string and Inst bits as V6_vrmpybus, on the 128B register classes.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.w = vrmpy($Vu32.ub,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vrmpybus carries the identical asm text without it.
+}
+def V6_vrmpybus_acc : HInst< // Accumulating form ("+=" in the asm string) with a real encoding.
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vrmpy($Vu32.ub,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpybus_acc_128B : HInst< // Same asm string and Inst bits as V6_vrmpybus_acc, on the 128B register classes.
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.w += vrmpy($Vu32.ub,$Rt32.b)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vrmpybus_acc carries the identical asm text without it.
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpybus_acc_alt : HInst< // Alternate asm spelling of the accumulating form; pseudo def with no Inst bits assigned.
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vrmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1; // Presumably mapped onto V6_vrmpybus_acc elsewhere -- the mapping is not visible here.
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpybus_acc_alt_128B : HInst< // 128B-register-class twin of V6_vrmpybus_acc_alt: same asm string, no Inst bits.
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vrmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment made two lines up; redundant.
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpybus_alt : HInst< // Alternate asm spelling; PSEUDO/TypeMAPPING def with no Inst bits assigned.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vrmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // Presumably mapped onto V6_vrmpybus elsewhere -- the mapping is not visible here.
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybus_alt_128B : HInst< // 128B-register-class twin of V6_vrmpybus_alt: same asm string, no Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vrmpybus($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment made two lines up; redundant.
+}
+def V6_vrmpybusi : HInst< // HVX instruction with a real encoding; takes a u1_0Imm immediate and VecDblRegs operands.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32.w = vrmpy($Vuu32.ub,$Rt32.b,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_14172170, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b10; // Only bits 7..6 are fixed here (other defs in this file fix 7..5).
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010; // Fixed high opcode bits; operand fields presumably come from Enc_14172170 (defined elsewhere).
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybusi_128B : HInst< // Same asm string and Inst bits as V6_vrmpybusi, on the 128B register classes.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32.w = vrmpy($Vuu32.ub,$Rt32.b,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_14172170, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vrmpybusi carries the identical asm text without it.
+}
+def V6_vrmpybusi_acc : HInst< // Accumulating form ("+=" in the asm string) with a real encoding.
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32.w += vrmpy($Vuu32.ub,$Rt32.b,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13189194, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // Ties the result $Vxx32 to the $Vxx32in input operand.
+}
+def V6_vrmpybusi_acc_128B : HInst< // Same asm string and Inst bits as V6_vrmpybusi_acc, on the 128B register classes.
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32.w += vrmpy($Vuu32.ub,$Rt32.b,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13189194, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b10;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vrmpybusi_acc carries the identical asm text without it.
+let Constraints = "$Vxx32 = $Vxx32in"; // Ties the result $Vxx32 to the $Vxx32in input operand.
+}
+def V6_vrmpybusi_acc_alt : HInst< // Alternate asm spelling of the accumulating form; pseudo def with no Inst bits assigned.
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32 += vrmpybus($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1; // Presumably mapped onto V6_vrmpybusi_acc elsewhere -- the mapping is not visible here.
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // Ties the result $Vxx32 to the $Vxx32in input operand.
+}
+def V6_vrmpybusi_acc_alt_128B : HInst< // 128B-register-class twin of V6_vrmpybusi_acc_alt: same asm string, no Inst bits.
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32 += vrmpybus($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment made two lines up; redundant.
+let Constraints = "$Vxx32 = $Vxx32in"; // Ties the result $Vxx32 to the $Vxx32in input operand.
+}
+def V6_vrmpybusi_alt : HInst< // Alternate asm spelling; PSEUDO/TypeMAPPING def with no Inst bits assigned.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32 = vrmpybus($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // Presumably mapped onto V6_vrmpybusi elsewhere -- the mapping is not visible here.
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybusi_alt_128B : HInst< // 128B-register-class twin of V6_vrmpybusi_alt: same asm string, no Inst bits.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32 = vrmpybus($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment made two lines up; redundant.
+}
+def V6_vrmpybusv : HInst< // HVX instruction with a real encoding: fixed Inst bits are assigned in the body.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vrmpy($Vu32.ub,$Vv32.b)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000; // Fixed high opcode bits; operand fields presumably come from Enc_6223403 (defined elsewhere).
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybusv_128B : HInst< // Same asm string and Inst bits as V6_vrmpybusv, on the 128B register classes.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vrmpy($Vu32.ub,$Vv32.b)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vrmpybusv carries the identical asm text without it.
+}
+def V6_vrmpybusv_acc : HInst< // Accumulating form ("+=" in the asm string) with a real encoding.
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vrmpy($Vu32.ub,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpybusv_acc_128B : HInst< // Same asm string and Inst bits as V6_vrmpybusv_acc, on the 128B register classes.
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vrmpy($Vu32.ub,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vrmpybusv_acc carries the identical asm text without it.
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpybusv_acc_alt : HInst< // Alternate asm spelling of the accumulating form; pseudo def with no Inst bits assigned.
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vrmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1; // Presumably mapped onto V6_vrmpybusv_acc elsewhere -- the mapping is not visible here.
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpybusv_acc_alt_128B : HInst< // 128B-register-class twin of V6_vrmpybusv_acc_alt: same asm string, no Inst bits.
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vrmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment made two lines up; redundant.
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpybusv_alt : HInst< // Alternate asm spelling; PSEUDO/TypeMAPPING def with no Inst bits assigned.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vrmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // Presumably mapped onto V6_vrmpybusv elsewhere -- the mapping is not visible here.
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybusv_alt_128B : HInst< // 128B-register-class twin of V6_vrmpybusv_alt: same asm string, no Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vrmpybus($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment made two lines up; redundant.
+}
+def V6_vrmpybv : HInst< // HVX instruction with a real encoding: fixed Inst bits are assigned in the body.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vrmpy($Vu32.b,$Vv32.b)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000; // Fixed high opcode bits; operand fields presumably come from Enc_6223403 (defined elsewhere).
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybv_128B : HInst< // Same asm string and Inst bits as V6_vrmpybv, on the 128B register classes.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vrmpy($Vu32.b,$Vv32.b)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vrmpybv carries the identical asm text without it.
+}
+def V6_vrmpybv_acc : HInst< // Accumulating form ("+=" in the asm string) with a real encoding.
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.w += vrmpy($Vu32.b,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpybv_acc_128B : HInst< // Same asm string and Inst bits as V6_vrmpybv_acc, on the 128B register classes.
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.w += vrmpy($Vu32.b,$Vv32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vrmpybv_acc carries the identical asm text without it.
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpybv_acc_alt : HInst< // Alternate asm spelling of the accumulating form; pseudo def with no Inst bits assigned.
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vrmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1; // Presumably mapped onto V6_vrmpybv_acc elsewhere -- the mapping is not visible here.
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpybv_acc_alt_128B : HInst< // 128B-register-class twin of V6_vrmpybv_acc_alt: same asm string, no Inst bits.
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vrmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment made two lines up; redundant.
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpybv_alt : HInst< // Alternate asm spelling; PSEUDO/TypeMAPPING def with no Inst bits assigned.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vrmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // Presumably mapped onto V6_vrmpybv elsewhere -- the mapping is not visible here.
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpybv_alt_128B : HInst< // 128B-register-class twin of V6_vrmpybv_alt: same asm string, no Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vrmpyb($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment made two lines up; redundant.
+}
+def V6_vrmpyub : HInst< // HVX instruction with a real encoding: fixed Inst bits are assigned in the body.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32.uw = vrmpy($Vu32.ub,$Rt32.ub)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000; // Fixed high opcode bits; operand fields presumably come from Enc_16214129 (defined elsewhere).
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyub_128B : HInst< // Same asm string and Inst bits as V6_vrmpyub, on the 128B register classes.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32.uw = vrmpy($Vu32.ub,$Rt32.ub)",
+CVI_VX, TypeCVI_VX>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vrmpyub carries the identical asm text without it.
+}
+def V6_vrmpyub_acc : HInst< // Accumulating form ("+=" in the asm string) with a real encoding.
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32.uw += vrmpy($Vu32.ub,$Rt32.ub)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpyub_acc_128B : HInst< // Same asm string and Inst bits as V6_vrmpyub_acc, on the 128B register classes.
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32.uw += vrmpy($Vu32.ub,$Rt32.ub)",
+CVI_VX, TypeCVI_VX>, Enc_10058269, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vrmpyub_acc carries the identical asm text without it.
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpyub_acc_alt : HInst< // Alternate asm spelling of the accumulating form; pseudo def with no Inst bits assigned.
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vrmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1; // Presumably mapped onto V6_vrmpyub_acc elsewhere -- the mapping is not visible here.
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpyub_acc_alt_128B : HInst< // 128B-register-class twin of V6_vrmpyub_acc_alt: same asm string, no Inst bits.
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vx32 += vrmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment made two lines up; redundant.
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpyub_alt : HInst< // Alternate asm spelling; PSEUDO/TypeMAPPING def with no Inst bits assigned.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vrmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // Presumably mapped onto V6_vrmpyub elsewhere -- the mapping is not visible here.
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyub_alt_128B : HInst< // 128B-register-class twin of V6_vrmpyub_alt: same asm string, no Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vrmpyub($Vu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment made two lines up; redundant.
+}
+def V6_vrmpyubi : HInst< // HVX instruction with a real encoding; takes a u1_0Imm immediate and VecDblRegs operands.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32.uw = vrmpy($Vuu32.ub,$Rt32.ub,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_14172170, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11; // Only bits 7..6 are fixed here (other defs in this file fix 7..5).
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101; // Fixed high opcode bits; operand fields presumably come from Enc_14172170 (defined elsewhere).
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyubi_128B : HInst< // Same asm string and Inst bits as V6_vrmpyubi, on the 128B register classes.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32.uw = vrmpy($Vuu32.ub,$Rt32.ub,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_14172170, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vrmpyubi carries the identical asm text without it.
+}
+def V6_vrmpyubi_acc : HInst< // Accumulating form ("+=" in the asm string) with a real encoding.
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32.uw += vrmpy($Vuu32.ub,$Rt32.ub,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13189194, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011; // Note: these high bits differ from V6_vrmpyubi (0b00011001101).
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // Ties the result $Vxx32 to the $Vxx32in input operand.
+}
+def V6_vrmpyubi_acc_128B : HInst< // Same asm string and Inst bits as V6_vrmpyubi_acc, on the 128B register classes.
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32.uw += vrmpy($Vuu32.ub,$Rt32.ub,#$Ii)",
+CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13189194, Requires<[HasV60T,UseHVX]> {
+let Inst{7-6} = 0b11;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vrmpyubi_acc carries the identical asm text without it.
+let Constraints = "$Vxx32 = $Vxx32in"; // Ties the result $Vxx32 to the $Vxx32in input operand.
+}
+def V6_vrmpyubi_acc_alt : HInst< // Alternate asm spelling of the accumulating form; pseudo def with no Inst bits assigned.
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32 += vrmpyub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1; // Presumably mapped onto V6_vrmpyubi_acc elsewhere -- the mapping is not visible here.
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // Ties the result $Vxx32 to the $Vxx32in input operand.
+}
+def V6_vrmpyubi_acc_alt_128B : HInst< // 128B-register-class twin of V6_vrmpyubi_acc_alt: same asm string, no Inst bits.
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vxx32 += vrmpyub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment made two lines up; redundant.
+let Constraints = "$Vxx32 = $Vxx32in"; // Ties the result $Vxx32 to the $Vxx32in input operand.
+}
+def V6_vrmpyubi_alt : HInst< // Alternate asm spelling; PSEUDO/TypeMAPPING def with no Inst bits assigned.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32 = vrmpyub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // Presumably mapped onto V6_vrmpyubi elsewhere -- the mapping is not visible here.
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyubi_alt_128B : HInst< // 128B-register-class twin of V6_vrmpyubi_alt: same asm string, no Inst bits.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+"$Vdd32 = vrmpyub($Vuu32,$Rt32,#$Ii)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment made two lines up; redundant.
+}
+def V6_vrmpyubv : HInst< // HVX instruction with a real encoding: fixed Inst bits are assigned in the body.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uw = vrmpy($Vu32.ub,$Vv32.ub)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000; // Fixed high opcode bits; operand fields presumably come from Enc_6223403 (defined elsewhere).
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyubv_128B : HInst< // Same asm string and Inst bits as V6_vrmpyubv, on the 128B register classes.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uw = vrmpy($Vu32.ub,$Vv32.ub)",
+CVI_VX, TypeCVI_VX>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vrmpyubv carries the identical asm text without it.
+}
+def V6_vrmpyubv_acc : HInst< // Accumulating form ("+=" in the asm string) with a real encoding.
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32.uw += vrmpy($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpyubv_acc_128B : HInst< // Same asm string and Inst bits as V6_vrmpyubv_acc, on the 128B register classes.
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32.uw += vrmpy($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_2328527, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vrmpyubv_acc carries the identical asm text without it.
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpyubv_acc_alt : HInst< // Alternate asm spelling of the accumulating form; pseudo def with no Inst bits assigned.
+(outs VectorRegs:$Vx32),
+(ins VectorRegs:$Vx32in, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vx32 += vrmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1; // Presumably mapped onto V6_vrmpyubv_acc elsewhere -- the mapping is not visible here.
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpyubv_acc_alt_128B : HInst< // 128B-register-class twin of V6_vrmpyubv_acc_alt: same asm string, no Inst bits.
+(outs VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vx32 += vrmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment made two lines up; redundant.
+let Constraints = "$Vx32 = $Vx32in"; // Ties the result $Vx32 to the $Vx32in input operand.
+}
+def V6_vrmpyubv_alt : HInst< // Alternate asm spelling; PSEUDO/TypeMAPPING def with no Inst bits assigned.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vrmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // Presumably mapped onto V6_vrmpyubv elsewhere -- the mapping is not visible here.
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrmpyubv_alt_128B : HInst< // 128B-register-class twin of V6_vrmpyubv_alt: same asm string, no Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vrmpyub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment made two lines up; redundant.
+}
+def V6_vror : HInst< // HVX instruction with a real encoding: fixed Inst bits are assigned in the body.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vror($Vu32,$Rt32)",
+CVI_VP, TypeCVI_VP>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011; // Fixed high opcode bits; operand fields presumably come from Enc_16214129 (defined elsewhere).
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vror_128B : HInst< // Same asm string and Inst bits as V6_vror, on the 128B register classes.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, IntRegs:$Rt32),
+"$Vd32 = vror($Vu32,$Rt32)",
+CVI_VP, TypeCVI_VP>, Enc_16214129, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vror carries the identical asm text without it.
+}
+def V6_vroundhb : HInst< // HVX instruction with a real encoding: fixed Inst bits are assigned in the body.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.b = vround($Vu32.h,$Vv32.h):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011; // Fixed high opcode bits; operand fields presumably come from Enc_6223403 (defined elsewhere).
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundhb_128B : HInst< // Same asm string and Inst bits as V6_vroundhb, on the 128B register classes.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.b = vround($Vu32.h,$Vv32.h):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vroundhb carries the identical asm text without it.
+}
+def V6_vroundhb_alt : HInst< // Alternate asm spelling; PSEUDO/TypeMAPPING def with no Inst bits assigned.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vroundhb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // Presumably mapped onto V6_vroundhb elsewhere -- the mapping is not visible here.
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundhb_alt_128B : HInst< // 128B-register-class twin of V6_vroundhb_alt: same asm string, no Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vroundhb($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment made two lines up; redundant.
+}
+def V6_vroundhub : HInst< // HVX instruction with a real encoding: fixed Inst bits are assigned in the body.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vround($Vu32.h,$Vv32.h):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011; // Fixed high opcode bits; operand fields presumably come from Enc_6223403 (defined elsewhere).
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundhub_128B : HInst< // Same asm string and Inst bits as V6_vroundhub, on the 128B register classes.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vround($Vu32.h,$Vv32.h):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vroundhub carries the identical asm text without it.
+}
+def V6_vroundhub_alt : HInst< // Alternate asm spelling; PSEUDO/TypeMAPPING def with no Inst bits assigned.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vroundhub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // Presumably mapped onto V6_vroundhub elsewhere -- the mapping is not visible here.
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundhub_alt_128B : HInst< // 128B-register-class twin of V6_vroundhub_alt: same asm string, no Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vroundhub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment made two lines up; redundant.
+}
+def V6_vrounduhub : HInst< // HVX instruction with a real encoding: fixed Inst bits are assigned in the body.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vround($Vu32.uh,$Vv32.uh):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV62T,UseHVX]> { // Gated on HasV62T, unlike the HasV60T vround defs nearby.
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111; // Fixed high opcode bits; operand fields presumably come from Enc_6223403 (defined elsewhere).
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrounduhub_128B : HInst< // Same asm string and Inst bits as V6_vrounduhub, on the 128B register classes.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vround($Vu32.uh,$Vv32.uh):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV62T,UseHVX]> { // Gated on HasV62T.
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vrounduhub carries the identical asm text without it.
+}
+def V6_vrounduhub_alt : HInst< // Alternate asm spelling; PSEUDO/TypeMAPPING def with no Inst bits assigned.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vrounduhub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> { // Gated on HasV62T.
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // Presumably mapped onto V6_vrounduhub elsewhere -- the mapping is not visible here.
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrounduhub_alt_128B : HInst< // 128B-register-class twin of V6_vrounduhub_alt: same asm string, no Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vrounduhub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> { // Gated on HasV62T.
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment made two lines up; redundant.
+}
+def V6_vrounduwuh : HInst< // HVX instruction with a real encoding: fixed Inst bits are assigned in the body.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vround($Vu32.uw,$Vv32.uw):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV62T,UseHVX]> { // Gated on HasV62T, unlike the HasV60T vround defs nearby.
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111; // Fixed high opcode bits; operand fields presumably come from Enc_6223403 (defined elsewhere).
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrounduwuh_128B : HInst< // Same asm string and Inst bits as V6_vrounduwuh, on the 128B register classes.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vround($Vu32.uw,$Vv32.uw):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV62T,UseHVX]> { // Gated on HasV62T.
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vrounduwuh carries the identical asm text without it.
+}
+def V6_vrounduwuh_alt : HInst< // Alternate asm spelling; PSEUDO/TypeMAPPING def with no Inst bits assigned.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vrounduwuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> { // Gated on HasV62T.
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // Presumably mapped onto V6_vrounduwuh elsewhere -- the mapping is not visible here.
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrounduwuh_alt_128B : HInst< // 128B-register-class twin of V6_vrounduwuh_alt: same asm string, no Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vrounduwuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> { // Gated on HasV62T.
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment made two lines up; redundant.
+}
+def V6_vroundwh : HInst< // HVX instruction with a real encoding: fixed Inst bits are assigned in the body.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vround($Vu32.w,$Vv32.w):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011; // Fixed high opcode bits; operand fields presumably come from Enc_6223403 (defined elsewhere).
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundwh_128B : HInst< // Same asm string and Inst bits as V6_vroundwh, on the 128B register classes.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vround($Vu32.w,$Vv32.w):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vroundwh carries the identical asm text without it.
+}
+def V6_vroundwh_alt : HInst< // Alternate asm spelling; PSEUDO/TypeMAPPING def with no Inst bits assigned.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vroundwh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1; // Presumably mapped onto V6_vroundwh elsewhere -- the mapping is not visible here.
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundwh_alt_128B : HInst< // 128B-register-class twin of V6_vroundwh_alt: same asm string, no Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vroundwh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment made two lines up; redundant.
+}
+def V6_vroundwuh : HInst< // HVX instruction with a real encoding: fixed Inst bits are assigned in the body.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vround($Vu32.w,$Vv32.w):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011; // Fixed high opcode bits; operand fields presumably come from Enc_6223403 (defined elsewhere).
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vroundwuh_128B : HInst< // Same asm string and Inst bits as V6_vroundwuh, on the 128B register classes.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vround($Vu32.w,$Vv32.w):sat",
+CVI_VS, TypeCVI_VS>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // Codegen-only flag; V6_vroundwuh carries the identical asm text without it.
+}
+def V6_vroundwuh_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32 = vroundwuh($Vu32,$Vv32):sat",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vroundwuh_alt_128B : HInst<
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32 = vroundwuh($Vu32,$Vv32):sat",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vrsadubi : HInst< // encoded form
+  (outs VecDblRegs:$Vdd32),
+  (ins VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+  "$Vdd32.uw = vrsad($Vuu32.ub,$Rt32.ub,#$Ii)",
+  CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_14172170, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011001010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-6} = 0b11;
+}
+def V6_vrsadubi_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VecDblRegs128B:$Vdd32),
+  (ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+  "$Vdd32.uw = vrsad($Vuu32.ub,$Rt32.ub,#$Ii)",
+  CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_14172170, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011001010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-6} = 0b11;
+}
+def V6_vrsadubi_acc : HInst< // encoded form, accumulating (+=)
+  (outs VecDblRegs:$Vxx32),
+  (ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+  "$Vxx32.uw += vrsad($Vuu32.ub,$Rt32.ub,#$Ii)",
+  CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13189194, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vxx32 = $Vxx32in"; // ties $Vxx32 to $Vxx32in
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011001010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b1;
+  let Inst{7-6} = 0b11;
+}
+def V6_vrsadubi_acc_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VecDblRegs128B:$Vxx32),
+  (ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+  "$Vxx32.uw += vrsad($Vuu32.ub,$Rt32.ub,#$Ii)",
+  CVI_VX_DV_LONG, TypeCVI_VX_DV>, Enc_13189194, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vxx32 = $Vxx32in"; // ties $Vxx32 to $Vxx32in
+  let isCodeGenOnly = 1;
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011001010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b1;
+  let Inst{7-6} = 0b11;
+}
+def V6_vrsadubi_acc_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VecDblRegs:$Vxx32),
+  (ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+  "$Vxx32 += vrsadub($Vuu32,$Rt32,#$Ii)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vxx32 = $Vxx32in"; // ties $Vxx32 to $Vxx32in
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let isAccumulator = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vrsadubi_acc_alt_128B : HInst<
+  (outs VecDblRegs128B:$Vxx32),
+  (ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+  "$Vxx32 += vrsadub($Vuu32,$Rt32,#$Ii)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isAccumulator = 1;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vxx32 = $Vxx32in"; // ties $Vxx32 to $Vxx32in
+}
+def V6_vrsadubi_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VecDblRegs:$Vdd32),
+  (ins VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+  "$Vdd32 = vrsadub($Vuu32,$Rt32,#$Ii)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vrsadubi_alt_128B : HInst<
+  (outs VecDblRegs128B:$Vdd32),
+  (ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii),
+  "$Vdd32 = vrsadub($Vuu32,$Rt32,#$Ii)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsathub : HInst< // encoded form
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32.ub = vsat($Vu32.h,$Vv32.h)",
+  CVI_VINLANESAT, TypeCVI_VINLANESAT>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111011; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b010;
+}
+def V6_vsathub_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32.ub = vsat($Vu32.h,$Vv32.h)",
+  CVI_VINLANESAT, TypeCVI_VINLANESAT>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111011; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b010;
+}
+def V6_vsathub_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32 = vsathub($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vsathub_alt_128B : HInst<
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32 = vsathub($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsatuwuh : HInst< // encoded form
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32.uh = vsat($Vu32.uw,$Vv32.uw)",
+  CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111001; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b110;
+}
+def V6_vsatuwuh_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32.uh = vsat($Vu32.uw,$Vv32.uw)",
+  CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111001; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b110;
+}
+def V6_vsatuwuh_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32 = vsatuwuh($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vsatuwuh_alt_128B : HInst<
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32 = vsatuwuh($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsatwh : HInst< // encoded form
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32.h = vsat($Vu32.w,$Vv32.w)",
+  CVI_VINLANESAT, TypeCVI_VINLANESAT>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111011; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b011;
+}
+def V6_vsatwh_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32.h = vsat($Vu32.w,$Vv32.w)",
+  CVI_VINLANESAT, TypeCVI_VINLANESAT>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111011; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b011;
+}
+def V6_vsatwh_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32 = vsatwh($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vsatwh_alt_128B : HInst<
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32 = vsatwh($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsb : HInst< // encoded form
+  (outs VecDblRegs:$Vdd32),
+  (ins VectorRegs:$Vu32),
+  "$Vdd32.h = vsxt($Vu32.b)",
+  CVI_VA_DV, TypeCVI_VA_DV>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-16} = 0b0001111000000010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b011;
+}
+def V6_vsb_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VecDblRegs128B:$Vdd32),
+  (ins VectorRegs128B:$Vu32),
+  "$Vdd32.h = vsxt($Vu32.b)",
+  CVI_VA_DV, TypeCVI_VA_DV>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-16} = 0b0001111000000010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b011;
+}
+def V6_vsb_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VecDblRegs:$Vdd32),
+  (ins VectorRegs:$Vu32),
+  "$Vdd32 = vsxtb($Vu32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vsb_alt_128B : HInst<
+  (outs VecDblRegs128B:$Vdd32),
+  (ins VectorRegs128B:$Vu32),
+  "$Vdd32 = vsxtb($Vu32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsh : HInst< // encoded form
+  (outs VecDblRegs:$Vdd32),
+  (ins VectorRegs:$Vu32),
+  "$Vdd32.w = vsxt($Vu32.h)",
+  CVI_VA_DV, TypeCVI_VA_DV>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-16} = 0b0001111000000010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b100;
+}
+def V6_vsh_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VecDblRegs128B:$Vdd32),
+  (ins VectorRegs128B:$Vu32),
+  "$Vdd32.w = vsxt($Vu32.h)",
+  CVI_VA_DV, TypeCVI_VA_DV>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-16} = 0b0001111000000010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b100;
+}
+def V6_vsh_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VecDblRegs:$Vdd32),
+  (ins VectorRegs:$Vu32),
+  "$Vdd32 = vsxth($Vu32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vsh_alt_128B : HInst<
+  (outs VecDblRegs128B:$Vdd32),
+  (ins VectorRegs128B:$Vu32),
+  "$Vdd32 = vsxth($Vu32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufeh : HInst< // encoded form
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32.h = vshuffe($Vu32.h,$Vv32.h)",
+  CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b011;
+}
+def V6_vshufeh_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32.h = vshuffe($Vu32.h,$Vv32.h)",
+  CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b011;
+}
+def V6_vshufeh_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32 = vshuffeh($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vshufeh_alt_128B : HInst<
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32 = vshuffeh($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuff : HInst< // encoded form, two tied in/out vector operands
+  (outs VectorRegs:$Vy32, VectorRegs:$Vx32),
+  (ins VectorRegs:$Vy32in, VectorRegs:$Vx32in, IntRegs:$Rt32),
+  "vshuff($Vy32,$Vx32,$Rt32)",
+  CVI_VP_VS_LONG_EARLY, TypeCVI_VP_VS>, Enc_11422009, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in"; // ties each output to its *in operand
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let hasNewValue2 = 1;
+  let opNewValue2 = 1;
+  let Inst{31-21} = 0b00011001111; // constant encoding bits, listed high to low
+  let Inst{13} = 0b1;
+  let Inst{7-5} = 0b001;
+}
+def V6_vshuff_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VectorRegs128B:$Vy32, VectorRegs128B:$Vx32),
+  (ins VectorRegs128B:$Vy32in, VectorRegs128B:$Vx32in, IntRegs:$Rt32),
+  "vshuff($Vy32,$Vx32,$Rt32)",
+  CVI_VP_VS_LONG_EARLY, TypeCVI_VP_VS>, Enc_11422009, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in"; // ties each output to its *in operand
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let hasNewValue2 = 1;
+  let opNewValue2 = 1;
+  let Inst{31-21} = 0b00011001111; // constant encoding bits, listed high to low
+  let Inst{13} = 0b1;
+  let Inst{7-5} = 0b001;
+}
+def V6_vshuffb : HInst< // encoded form
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32),
+  "$Vd32.b = vshuff($Vu32.b)",
+  CVI_VP, TypeCVI_VP>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-16} = 0b0001111000000010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b000;
+}
+def V6_vshuffb_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32),
+  "$Vd32.b = vshuff($Vu32.b)",
+  CVI_VP, TypeCVI_VP>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-16} = 0b0001111000000010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b000;
+}
+def V6_vshuffb_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32),
+  "$Vd32 = vshuffb($Vu32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vshuffb_alt_128B : HInst<
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32),
+  "$Vd32 = vshuffb($Vu32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffeb : HInst< // encoded form
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32.b = vshuffe($Vu32.b,$Vv32.b)",
+  CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b001;
+}
+def V6_vshuffeb_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32.b = vshuffe($Vu32.b,$Vv32.b)",
+  CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b001;
+}
+def V6_vshuffeb_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32 = vshuffeb($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vshuffeb_alt_128B : HInst<
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32 = vshuffeb($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffh : HInst< // encoded form
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32),
+  "$Vd32.h = vshuff($Vu32.h)",
+  CVI_VP, TypeCVI_VP>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-16} = 0b0001111000000001; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b111;
+}
+def V6_vshuffh_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32),
+  "$Vd32.h = vshuff($Vu32.h)",
+  CVI_VP, TypeCVI_VP>, Enc_900013, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-16} = 0b0001111000000001; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b111;
+}
+def V6_vshuffh_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32),
+  "$Vd32 = vshuffh($Vu32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vshuffh_alt_128B : HInst<
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32),
+  "$Vd32 = vshuffh($Vu32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffob : HInst< // encoded form
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32.b = vshuffo($Vu32.b,$Vv32.b)",
+  CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b010;
+}
+def V6_vshuffob_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32.b = vshuffo($Vu32.b,$Vv32.b)",
+  CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b010;
+}
+def V6_vshuffob_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32 = vshuffob($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vshuffob_alt_128B : HInst<
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32 = vshuffob($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshuffvdd : HInst< // encoded form
+  (outs VecDblRegs:$Vdd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8),
+  "$Vdd32 = vshuff($Vu32,$Vv32,$Rt8)",
+  CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_14767681, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-24} = 0b00011011; // constant encoding bits, listed high to low
+  let Inst{13} = 0b1;
+  let Inst{7-5} = 0b011;
+}
+def V6_vshuffvdd_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VecDblRegs128B:$Vdd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, IntRegsLow8:$Rt8),
+  "$Vdd32 = vshuff($Vu32,$Vv32,$Rt8)",
+  CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_14767681, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-24} = 0b00011011; // constant encoding bits, listed high to low
+  let Inst{13} = 0b1;
+  let Inst{7-5} = 0b011;
+}
+def V6_vshufoeb : HInst< // encoded form
+  (outs VecDblRegs:$Vdd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vdd32.b = vshuffoe($Vu32.b,$Vv32.b)",
+  CVI_VA_DV, TypeCVI_VA_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b110;
+}
+def V6_vshufoeb_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VecDblRegs128B:$Vdd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vdd32.b = vshuffoe($Vu32.b,$Vv32.b)",
+  CVI_VA_DV, TypeCVI_VA_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b110;
+}
+def V6_vshufoeb_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VecDblRegs:$Vdd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vdd32 = vshuffoeb($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vshufoeb_alt_128B : HInst<
+  (outs VecDblRegs128B:$Vdd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vdd32 = vshuffoeb($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufoeh : HInst< // encoded form
+  (outs VecDblRegs:$Vdd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vdd32.h = vshuffoe($Vu32.h,$Vv32.h)",
+  CVI_VA_DV, TypeCVI_VA_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b101;
+}
+def V6_vshufoeh_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VecDblRegs128B:$Vdd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vdd32.h = vshuffoe($Vu32.h,$Vv32.h)",
+  CVI_VA_DV, TypeCVI_VA_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b101;
+}
+def V6_vshufoeh_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VecDblRegs:$Vdd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vdd32 = vshuffoeh($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vshufoeh_alt_128B : HInst<
+  (outs VecDblRegs128B:$Vdd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vdd32 = vshuffoeh($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vshufoh : HInst< // encoded form
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32.h = vshuffo($Vu32.h,$Vv32.h)",
+  CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b100;
+}
+def V6_vshufoh_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32.h = vshuffo($Vu32.h,$Vv32.h)",
+  CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b100;
+}
+def V6_vshufoh_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32 = vshuffoh($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vshufoh_alt_128B : HInst<
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32 = vshuffoh($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubb : HInst< // encoded form
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32.b = vsub($Vu32.b,$Vv32.b)",
+  CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011100010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b101;
+}
+def V6_vsubb_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32.b = vsub($Vu32.b,$Vv32.b)",
+  CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011100010; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b101;
+}
+def V6_vsubb_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32 = vsubb($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vsubb_alt_128B : HInst<
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32 = vsubb($Vu32,$Vv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubb_dv : HInst< // encoded form
+  (outs VecDblRegs:$Vdd32),
+  (ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+  "$Vdd32.b = vsub($Vuu32.b,$Vvv32.b)",
+  CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011100100; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b011;
+}
+def V6_vsubb_dv_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VecDblRegs128B:$Vdd32),
+  (ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+  "$Vdd32.b = vsub($Vuu32.b,$Vvv32.b)",
+  CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011100100; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b011;
+}
+def V6_vsubb_dv_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VecDblRegs:$Vdd32),
+  (ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+  "$Vdd32 = vsubb($Vuu32,$Vvv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vsubb_dv_alt_128B : HInst<
+  (outs VecDblRegs128B:$Vdd32),
+  (ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+  "$Vdd32 = vsubb($Vuu32,$Vvv32)",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubbnq : HInst< // encoded form, guarded by !$Qv4
+  (outs VectorRegs:$Vx32),
+  (ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+  "if (!$Qv4) $Vx32.b -= $Vu32.b",
+  CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in"; // ties $Vx32 to $Vx32in
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-24} = 0b00011110; // constant encoding bits, listed high to low
+  let Inst{21-16} = 0b000010;
+  let Inst{13} = 0b1;
+  let Inst{7-5} = 0b001;
+}
+def V6_vsubbnq_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VectorRegs128B:$Vx32),
+  (ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+  "if (!$Qv4) $Vx32.b -= $Vu32.b",
+  CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in"; // ties $Vx32 to $Vx32in
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-24} = 0b00011110; // constant encoding bits, listed high to low
+  let Inst{21-16} = 0b000010;
+  let Inst{13} = 0b1;
+  let Inst{7-5} = 0b001;
+}
+def V6_vsubbnq_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VectorRegs:$Vx32),
+  (ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+  "if (!$Qv4.b) $Vx32.b -= $Vu32.b",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in"; // ties $Vx32 to $Vx32in
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vsubbnq_alt_128B : HInst<
+  (outs VectorRegs128B:$Vx32),
+  (ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+  "if (!$Qv4.b) $Vx32.b -= $Vu32.b",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in"; // ties $Vx32 to $Vx32in
+}
+def V6_vsubbq : HInst< // encoded form, guarded by $Qv4
+  (outs VectorRegs:$Vx32),
+  (ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+  "if ($Qv4) $Vx32.b -= $Vu32.b",
+  CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in"; // ties $Vx32 to $Vx32in
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-24} = 0b00011110; // constant encoding bits, listed high to low
+  let Inst{21-16} = 0b000001;
+  let Inst{13} = 0b1;
+  let Inst{7-5} = 0b110;
+}
+def V6_vsubbq_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VectorRegs128B:$Vx32),
+  (ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+  "if ($Qv4) $Vx32.b -= $Vu32.b",
+  CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in"; // ties $Vx32 to $Vx32in
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-24} = 0b00011110; // constant encoding bits, listed high to low
+  let Inst{21-16} = 0b000001;
+  let Inst{13} = 0b1;
+  let Inst{7-5} = 0b110;
+}
+def V6_vsubbq_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VectorRegs:$Vx32),
+  (ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+  "if ($Qv4.b) $Vx32.b -= $Vu32.b",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in"; // ties $Vx32 to $Vx32in
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vsubbq_alt_128B : HInst<
+  (outs VectorRegs128B:$Vx32),
+  (ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+  "if ($Qv4.b) $Vx32.b -= $Vu32.b",
+  PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+  let Constraints = "$Vx32 = $Vx32in"; // ties $Vx32 to $Vx32in
+}
+def V6_vsubbsat : HInst< // encoded form
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32.b = vsub($Vu32.b,$Vv32.b):sat",
+  CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111001; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b010;
+}
+def V6_vsubbsat_128B : HInst< // 128B register-class variant, codegen-only
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32.b = vsub($Vu32.b,$Vv32.b):sat",
+  CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let Inst{31-21} = 0b00011111001; // constant encoding bits, listed high to low
+  let Inst{13} = 0b0;
+  let Inst{7-5} = 0b010;
+}
+def V6_vsubbsat_alt : HInst< // PSEUDO mapping, alternate asm syntax
+  (outs VectorRegs:$Vd32),
+  (ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+  "$Vd32 = vsubb($Vu32,$Vv32):sat",
+  PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+  let DecoderNamespace = "EXT_mmvec";
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let hasNewValue = 1;
+  let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING alternate-syntax form with 128B register classes; codegen-only.
+def V6_vsubbsat_alt_128B : HInst<
+  (outs VectorRegs128B:$Vd32),
+  (ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+  "$Vd32 = vsubb($Vu32,$Vv32):sat",
+  PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+  let hasNewValue = 1;
+  let opNewValue = 0;
+  let isPseudo = 1;
+  let isCodeGenOnly = 1;
+  let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubbsat_dv : HInst< // Encoded HVX instruction: vsub on .b operands with :sat, over VecDblRegs operands.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.b = vsub($Vuu32.b,$Vvv32.b):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubbsat_dv_128B : HInst< // V6_vsubbsat_dv with the *128B register classes; same asm string and Inst bits.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.b = vsub($Vuu32.b,$Vvv32.b):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsubbsat_dv
+}
+def V6_vsubbsat_dv_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1) for the alternate spelling vsubb(...):sat on VecDblRegs; assigns no Inst bits.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubb($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubbsat_dv_alt_128B : HInst< // V6_vsubbsat_dv_alt with the *128B register classes; same asm string.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubb($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+}
+def V6_vsubcarry : HInst< // Encoded HVX instruction: vsub on .w operands with a predicate operand and the :carry suffix.
+(outs VectorRegs:$Vd32, VecPredRegs:$Qx4), // two outputs: vector result and predicate
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32, VecPredRegs:$Qx4in),
+"$Vd32.w = vsub($Vu32.w,$Vv32.w,$Qx4):carry",
+CVI_VA, TypeCVI_VA>, Enc_13691337, Requires<[HasV62T,UseHVX]> {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Qx4 = $Qx4in"; // ties output $Qx4 to input $Qx4in
+}
+def V6_vsubcarry_128B : HInst< // V6_vsubcarry with the *128B register classes; same asm string and Inst bits.
+(outs VectorRegs128B:$Vd32, VecPredRegs128B:$Qx4), // two outputs: vector result and predicate
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32, VecPredRegs128B:$Qx4in),
+"$Vd32.w = vsub($Vu32.w,$Vv32.w,$Qx4):carry",
+CVI_VA, TypeCVI_VA>, Enc_13691337, Requires<[HasV62T,UseHVX]> {
+let Inst{7-7} = 0b1;
+let Inst{13-13} = 0b1;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsubcarry
+let Constraints = "$Qx4 = $Qx4in"; // ties output $Qx4 to input $Qx4in
+}
+def V6_vsubh : HInst< // Encoded HVX instruction: vsub on .h operands (see asm string).
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vsub($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubh_128B : HInst< // V6_vsubh with the *128B register classes; same asm string and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vsub($Vu32.h,$Vv32.h)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsubh
+}
+def V6_vsubh_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1) for the alternate spelling vsubh(...); assigns no Inst bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubh_alt_128B : HInst< // V6_vsubh_alt with the *128B register classes; same asm string.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+}
+def V6_vsubh_dv : HInst< // Encoded HVX instruction: vsub on .h operands, over VecDblRegs operands.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.h = vsub($Vuu32.h,$Vvv32.h)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubh_dv_128B : HInst< // V6_vsubh_dv with the *128B register classes; same asm string and Inst bits.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.h = vsub($Vuu32.h,$Vvv32.h)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsubh_dv
+}
+def V6_vsubh_dv_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1) for the alternate spelling vsubh(...) on VecDblRegs; assigns no Inst bits.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubh($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubh_dv_alt_128B : HInst< // V6_vsubh_dv_alt with the *128B register classes; same asm string.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubh($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+}
+def V6_vsubhnq : HInst< // Encoded HVX instruction: .h subtract-assign guarded by the negated predicate $Qv4 (see asm string).
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4) $Vx32.h -= $Vu32.h",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vsubhnq_128B : HInst< // V6_vsubhnq with the *128B register classes; same asm string and Inst bits.
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4) $Vx32.h -= $Vu32.h",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsubhnq
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vsubhnq_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1): V6_vsubhnq syntax with a .h suffix on the predicate; assigns no Inst bits.
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4.h) $Vx32.h -= $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vsubhnq_alt_128B : HInst< // V6_vsubhnq_alt with the *128B register classes; same asm string.
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4.h) $Vx32.h -= $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vsubhq : HInst< // Encoded HVX instruction: .h subtract-assign guarded by predicate $Qv4 (see asm string).
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4) $Vx32.h -= $Vu32.h",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vsubhq_128B : HInst< // V6_vsubhq with the *128B register classes; same asm string and Inst bits.
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4) $Vx32.h -= $Vu32.h",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000001;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsubhq
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vsubhq_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1): V6_vsubhq syntax with a .h suffix on the predicate; assigns no Inst bits.
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4.h) $Vx32.h -= $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vsubhq_alt_128B : HInst< // V6_vsubhq_alt with the *128B register classes; same asm string.
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4.h) $Vx32.h -= $Vu32.h",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vsubhsat : HInst< // Encoded HVX instruction: vsub on .h operands with the :sat suffix (see asm string).
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.h = vsub($Vu32.h,$Vv32.h):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhsat_128B : HInst< // V6_vsubhsat with the *128B register classes; same asm string and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.h = vsub($Vu32.h,$Vv32.h):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsubhsat
+}
+def V6_vsubhsat_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1) for the alternate spelling vsubh(...):sat; assigns no Inst bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhsat_alt_128B : HInst< // V6_vsubhsat_alt with the *128B register classes; same asm string.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+}
+def V6_vsubhsat_dv : HInst< // Encoded HVX instruction: vsub on .h operands with :sat, over VecDblRegs operands.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.h = vsub($Vuu32.h,$Vvv32.h):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhsat_dv_128B : HInst< // V6_vsubhsat_dv with the *128B register classes; same asm string and Inst bits.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.h = vsub($Vuu32.h,$Vvv32.h):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsubhsat_dv
+}
+def V6_vsubhsat_dv_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1) for the alternate spelling vsubh(...):sat on VecDblRegs; assigns no Inst bits.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhsat_dv_alt_128B : HInst< // V6_vsubhsat_dv_alt with the *128B register classes; same asm string.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+}
+def V6_vsubhw : HInst< // Encoded HVX instruction: vsub of .h inputs (VectorRegs) producing a .w result in VecDblRegs.
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.w = vsub($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhw_128B : HInst< // V6_vsubhw with the *128B register classes; same asm string and Inst bits.
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.w = vsub($Vu32.h,$Vv32.h)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsubhw
+}
+def V6_vsubhw_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1) for the alternate spelling vsubh(...) with a VecDblRegs result; assigns no Inst bits.
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vsubh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubhw_alt_128B : HInst< // V6_vsubhw_alt with the *128B register classes; same asm string.
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vsubh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+}
+def V6_vsububh : HInst< // Encoded HVX instruction: vsub of .ub inputs (VectorRegs) producing a .h result in VecDblRegs.
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.h = vsub($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububh_128B : HInst< // V6_vsububh with the *128B register classes; same asm string and Inst bits.
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.h = vsub($Vu32.ub,$Vv32.ub)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsububh
+}
+def V6_vsububh_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1) for the alternate spelling vsubub(...) with a VecDblRegs result; assigns no Inst bits.
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vsubub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububh_alt_128B : HInst< // V6_vsububh_alt with the *128B register classes; same asm string.
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vsubub($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+}
+def V6_vsububsat : HInst< // Encoded HVX instruction: vsub on .ub operands with the :sat suffix (see asm string).
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vsub($Vu32.ub,$Vv32.ub):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububsat_128B : HInst< // V6_vsububsat with the *128B register classes; same asm string and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vsub($Vu32.ub,$Vv32.ub):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsububsat
+}
+def V6_vsububsat_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1) for the alternate spelling vsubub(...):sat; assigns no Inst bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububsat_alt_128B : HInst< // V6_vsububsat_alt with the *128B register classes; same asm string.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubub($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+}
+def V6_vsububsat_dv : HInst< // Encoded HVX instruction: vsub on .ub operands with :sat, over VecDblRegs operands.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.ub = vsub($Vuu32.ub,$Vvv32.ub):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububsat_dv_128B : HInst< // V6_vsububsat_dv with the *128B register classes; same asm string and Inst bits.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.ub = vsub($Vuu32.ub,$Vvv32.ub):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsububsat_dv
+}
+def V6_vsububsat_dv_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1) for the alternate spelling vsubub(...):sat on VecDblRegs; assigns no Inst bits.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubub($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsububsat_dv_alt_128B : HInst< // V6_vsububsat_dv_alt with the *128B register classes; same asm string.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubub($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+}
+def V6_vsubububb_sat : HInst< // Encoded HVX instruction: vsub of a .ub and a .b operand into a .ub result, with the :sat suffix.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.ub = vsub($Vu32.ub,$Vv32.b):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubububb_sat_128B : HInst< // V6_vsubububb_sat with the *128B register classes; same asm string and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.ub = vsub($Vu32.ub,$Vv32.b):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsubububb_sat
+}
+def V6_vsubuhsat : HInst< // Encoded HVX instruction: vsub on .uh operands with the :sat suffix (see asm string).
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uh = vsub($Vu32.uh,$Vv32.uh):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhsat_128B : HInst< // V6_vsubuhsat with the *128B register classes; same asm string and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uh = vsub($Vu32.uh,$Vv32.uh):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b001;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsubuhsat
+}
+def V6_vsubuhsat_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1) for the alternate spelling vsubuh(...):sat; assigns no Inst bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhsat_alt_128B : HInst< // V6_vsubuhsat_alt with the *128B register classes; same asm string.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubuh($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+}
+def V6_vsubuhsat_dv : HInst< // Encoded HVX instruction: vsub on .uh operands with :sat, over VecDblRegs operands.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.uh = vsub($Vuu32.uh,$Vvv32.uh):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhsat_dv_128B : HInst< // V6_vsubuhsat_dv with the *128B register classes; same asm string and Inst bits.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.uh = vsub($Vuu32.uh,$Vvv32.uh):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsubuhsat_dv
+}
+def V6_vsubuhsat_dv_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1) for the alternate spelling vsubuh(...):sat on VecDblRegs; assigns no Inst bits.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubuh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhsat_dv_alt_128B : HInst< // V6_vsubuhsat_dv_alt with the *128B register classes; same asm string.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubuh($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+}
+def V6_vsubuhw : HInst< // Encoded HVX instruction: vsub of .uh inputs (VectorRegs) producing a .w result in VecDblRegs.
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32.w = vsub($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhw_128B : HInst< // V6_vsubuhw with the *128B register classes; same asm string and Inst bits.
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32.w = vsub($Vu32.uh,$Vv32.uh)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_15290236, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsubuhw
+}
+def V6_vsubuhw_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1) for the alternate spelling vsubuh(...) with a VecDblRegs result; assigns no Inst bits.
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vsubuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuhw_alt_128B : HInst< // V6_vsubuhw_alt with the *128B register classes; same asm string.
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vsubuh($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+}
+def V6_vsubuwsat : HInst< // Encoded HVX instruction: vsub on .uw operands with the :sat suffix (see asm string).
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.uw = vsub($Vu32.uw,$Vv32.uw):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuwsat_128B : HInst< // V6_vsubuwsat with the *128B register classes; same asm string and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.uw = vsub($Vu32.uw,$Vv32.uw):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011111110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsubuwsat
+}
+def V6_vsubuwsat_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1) for the alternate spelling vsubuw(...):sat; assigns no Inst bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubuw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuwsat_alt_128B : HInst< // V6_vsubuwsat_alt with the *128B register classes; same asm string.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubuw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+}
+def V6_vsubuwsat_dv : HInst< // Encoded HVX instruction: vsub on .uw operands with :sat, over VecDblRegs operands.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.uw = vsub($Vuu32.uw,$Vvv32.uw):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuwsat_dv_128B : HInst< // V6_vsubuwsat_dv with the *128B register classes; same asm string and Inst bits.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.uw = vsub($Vuu32.uw,$Vvv32.uw):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV62T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011110101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsubuwsat_dv
+}
+def V6_vsubuwsat_dv_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1) for the alternate spelling vsubuw(...):sat on VecDblRegs; assigns no Inst bits.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubuw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubuwsat_dv_alt_128B : HInst< // V6_vsubuwsat_dv_alt with the *128B register classes; same asm string.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubuw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV62T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+}
+def V6_vsubw : HInst< // Encoded HVX instruction: vsub on .w operands (see asm string).
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vsub($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubw_128B : HInst< // V6_vsubw with the *128B register classes; same asm string and Inst bits.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vsub($Vu32.w,$Vv32.w)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b111;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsubw
+}
+def V6_vsubw_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1) for the alternate spelling vsubw(...); assigns no Inst bits.
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubw_alt_128B : HInst< // V6_vsubw_alt with the *128B register classes; same asm string.
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubw($Vu32,$Vv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+}
+def V6_vsubw_dv : HInst< // Encoded HVX instruction: vsub on .w operands, over VecDblRegs operands.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.w = vsub($Vuu32.w,$Vvv32.w)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubw_dv_128B : HInst< // V6_vsubw_dv with the *128B register classes; same asm string and Inst bits.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.w = vsub($Vuu32.w,$Vvv32.w)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b00011100100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsubw_dv
+}
+def V6_vsubw_dv_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1) for the alternate spelling vsubw(...) on VecDblRegs; assigns no Inst bits.
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubw($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubw_dv_alt_128B : HInst< // V6_vsubw_dv_alt with the *128B register classes; same asm string.
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubw($Vuu32,$Vvv32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+}
+def V6_vsubwnq : HInst< // Encoded HVX instruction: .w subtract-assign guarded by the negated predicate $Qv4 (see asm string).
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4) $Vx32.w -= $Vu32.w",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vsubwnq_128B : HInst< // V6_vsubwnq with the *128B register classes; same asm string and Inst bits.
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4) $Vx32.w -= $Vu32.w",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsubwnq
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vsubwnq_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1): V6_vsubwnq syntax with a .w suffix on the predicate; assigns no Inst bits.
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if (!$Qv4.w) $Vx32.w -= $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vsubwnq_alt_128B : HInst< // V6_vsubwnq_alt with the *128B register classes; same asm string.
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if (!$Qv4.w) $Vx32.w -= $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vsubwq : HInst< // Encoded HVX instruction: .w subtract-assign guarded by predicate $Qv4 (see asm string).
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4) $Vx32.w -= $Vu32.w",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vsubwq_128B : HInst< // V6_vsubwq with the *128B register classes; same asm string and Inst bits.
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4) $Vx32.w -= $Vu32.w",
+CVI_VA, TypeCVI_VA>, Enc_12535811, Requires<[HasV60T,UseHVX]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b1;
+let Inst{21-16} = 0b000010;
+let Inst{31-24} = 0b00011110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // not set on V6_vsubwq
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vsubwq_alt : HInst< // Pseudo (PSEUDO/TypeMAPPING, isPseudo = 1): V6_vsubwq syntax with a .w suffix on the predicate; assigns no Inst bits.
+(outs VectorRegs:$Vx32),
+(ins VecPredRegs:$Qv4, VectorRegs:$Vx32in, VectorRegs:$Vu32),
+"if ($Qv4.w) $Vx32.w -= $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vsubwq_alt_128B : HInst< // V6_vsubwq_alt with the *128B register classes; same asm string.
+(outs VectorRegs128B:$Vx32),
+(ins VecPredRegs128B:$Qv4, VectorRegs128B:$Vx32in, VectorRegs128B:$Vu32),
+"if ($Qv4.w) $Vx32.w -= $Vu32.w",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1; // NOTE(review): repeats the assignment two lines up; redundant
+let Constraints = "$Vx32 = $Vx32in"; // ties output $Vx32 to input $Vx32in
+}
+def V6_vsubwsat : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32.w = vsub($Vu32.w,$Vv32.w):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100011; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b011;
+}
+def V6_vsubwsat_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32.w = vsub($Vu32.w,$Vv32.w):sat",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100011; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b011;
+}
+def V6_vsubwsat_alt : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vsubw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let isPseudo = 1; // mapping record: no Inst bits assigned here
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING record over the 128B register classes; each field is assigned once.
+def V6_vsubwsat_alt_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vsubw($Vu32,$Vv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vsubwsat_dv : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32.w = vsub($Vuu32.w,$Vvv32.w):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100101; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b001;
+}
+def V6_vsubwsat_dv_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32.w = vsub($Vuu32.w,$Vvv32.w):sat",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_13211717, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100101; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b001;
+}
+def V6_vsubwsat_dv_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, VecDblRegs:$Vvv32),
+"$Vdd32 = vsubw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let isPseudo = 1; // mapping record: no Inst bits assigned here
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING record over the 128B register classes; each field is assigned once.
+def V6_vsubwsat_dv_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, VecDblRegs128B:$Vvv32),
+"$Vdd32 = vsubw($Vuu32,$Vvv32):sat",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vswap : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecPredRegs:$Qt4, VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vdd32 = vswap($Qt4,$Vu32,$Vv32)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_11424254, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011110101; // fixed encoding bits, high to low
+let Inst{13-13} = 0b1;
+let Inst{7-7} = 0b0;
+}
+def V6_vswap_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecPredRegs128B:$Qt4, VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vdd32 = vswap($Qt4,$Vu32,$Vv32)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_11424254, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011110101; // fixed encoding bits, high to low
+let Inst{13-13} = 0b1;
+let Inst{7-7} = 0b0;
+}
+def V6_vtmpyb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vtmpy($Vuu32.b,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001000; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+}
+def V6_vtmpyb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vtmpy($Vuu32.b,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001000; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+}
+def V6_vtmpyb_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vtmpy($Vuu32.b,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // output operand is tied to $Vxx32in
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001000; // fixed encoding bits, high to low
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b000;
+}
+def V6_vtmpyb_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vtmpy($Vuu32.b,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in"; // output operand is tied to $Vxx32in
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001000; // fixed encoding bits, high to low
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b000;
+}
+def V6_vtmpyb_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vtmpyb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let isPseudo = 1; // mapping record: no Inst bits assigned here
+let isAccumulator = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING record over the 128B register classes; each field is assigned once.
+def V6_vtmpyb_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vtmpyb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // output operand is tied to $Vxx32in
+}
+def V6_vtmpyb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vtmpyb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let isPseudo = 1; // mapping record: no Inst bits assigned here
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING record over the 128B register classes; each field is assigned once.
+def V6_vtmpyb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vtmpyb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtmpybus : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vtmpy($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001000; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b001;
+}
+def V6_vtmpybus_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.h = vtmpy($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001000; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b001;
+}
+def V6_vtmpybus_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vtmpy($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // output operand is tied to $Vxx32in
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001000; // fixed encoding bits, high to low
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b001;
+}
+def V6_vtmpybus_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.h += vtmpy($Vuu32.ub,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in"; // output operand is tied to $Vxx32in
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001000; // fixed encoding bits, high to low
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b001;
+}
+def V6_vtmpybus_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vtmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let isPseudo = 1; // mapping record: no Inst bits assigned here
+let isAccumulator = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING record over the 128B register classes; each field is assigned once.
+def V6_vtmpybus_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vtmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // output operand is tied to $Vxx32in
+}
+def V6_vtmpybus_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vtmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let isPseudo = 1; // mapping record: no Inst bits assigned here
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING record over the 128B register classes; each field is assigned once.
+def V6_vtmpybus_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vtmpybus($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtmpyhb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.w = vtmpy($Vuu32.h,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001101; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b100;
+}
+def V6_vtmpyhb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32.w = vtmpy($Vuu32.h,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_5023792, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001101; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b100;
+}
+def V6_vtmpyhb_acc : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vtmpy($Vuu32.h,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // output operand is tied to $Vxx32in
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001000; // fixed encoding bits, high to low
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b010;
+}
+def V6_vtmpyhb_acc_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32.w += vtmpy($Vuu32.h,$Rt32.b)",
+CVI_VX_DV, TypeCVI_VX_DV>, Enc_4327792, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in"; // output operand is tied to $Vxx32in
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011001000; // fixed encoding bits, high to low
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b010;
+}
+def V6_vtmpyhb_acc_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vtmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let isPseudo = 1; // mapping record: no Inst bits assigned here
+let isAccumulator = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING record over the 128B register classes; each field is assigned once.
+def V6_vtmpyhb_acc_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vxx32 += vtmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // output operand is tied to $Vxx32in
+}
+def V6_vtmpyhb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VecDblRegs:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vtmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let isPseudo = 1; // mapping record: no Inst bits assigned here
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING record over the 128B register classes; each field is assigned once.
+def V6_vtmpyhb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VecDblRegs128B:$Vuu32, IntRegs:$Rt32),
+"$Vdd32 = vtmpyhb($Vuu32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vtran2x2_map : HInst<
+(outs VectorRegs:$Vy32, VectorRegs:$Vx32),
+(ins VectorRegs:$Vy32in, VectorRegs:$Vx32in, IntRegs:$Rt32),
+"vtrans2x2($Vy32,$Vx32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let isPseudo = 1; // mapping record: no Inst bits assigned here
+let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in"; // both outputs are tied to inputs
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+}
+// PSEUDO/TypeMAPPING record over the 128B register classes; each field is assigned once.
+def V6_vtran2x2_map_128B : HInst<
+(outs VectorRegs128B:$Vy32, VectorRegs128B:$Vx32),
+(ins VectorRegs128B:$Vy32in, VectorRegs128B:$Vx32in, IntRegs:$Rt32),
+"vtrans2x2($Vy32,$Vx32,$Rt32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let hasNewValue2 = 1;
+let opNewValue2 = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in"; // both outputs are tied to inputs
+}
+def V6_vunpackb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.h = vunpack($Vu32.b)",
+CVI_VP_VS, TypeCVI_VP_VS>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-16} = 0b0001111000000001; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b010;
+}
+def V6_vunpackb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.h = vunpack($Vu32.b)",
+CVI_VP_VS, TypeCVI_VP_VS>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-16} = 0b0001111000000001; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b010;
+}
+def V6_vunpackb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vunpackb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let isPseudo = 1; // mapping record: no Inst bits assigned here
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING record over the 128B register classes; each field is assigned once.
+def V6_vunpackb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vunpackb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.w = vunpack($Vu32.h)",
+CVI_VP_VS, TypeCVI_VP_VS>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-16} = 0b0001111000000001; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b011;
+}
+def V6_vunpackh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.w = vunpack($Vu32.h)",
+CVI_VP_VS, TypeCVI_VP_VS>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-16} = 0b0001111000000001; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b011;
+}
+def V6_vunpackh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vunpackh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let isPseudo = 1; // mapping record: no Inst bits assigned here
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING record over the 128B register classes; each field is assigned once.
+def V6_vunpackh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vunpackh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackob : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32),
+"$Vxx32.h |= vunpacko($Vu32.b)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_12669374, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // output operand is tied to $Vxx32in
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-16} = 0b0001111000000000; // fixed encoding bits, high to low
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b000;
+}
+def V6_vunpackob_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32),
+"$Vxx32.h |= vunpacko($Vu32.b)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_12669374, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in"; // output operand is tied to $Vxx32in
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-16} = 0b0001111000000000; // fixed encoding bits, high to low
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b000;
+}
+def V6_vunpackob_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32),
+"$Vxx32 |= vunpackob($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isPseudo = 1; // NOTE(review): isCodeGenOnly is not set in this record - confirm intentional
+let isAccumulator = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def V6_vunpackob_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32),
+"$Vxx32 |= vunpackob($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let isPseudo = 1; // mapping record: no Inst bits assigned here
+let isAccumulator = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def V6_vunpackoh : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32),
+"$Vxx32.w |= vunpacko($Vu32.h)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_12669374, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // output operand is tied to $Vxx32in
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-16} = 0b0001111000000000; // fixed encoding bits, high to low
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b001;
+}
+def V6_vunpackoh_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32),
+"$Vxx32.w |= vunpacko($Vu32.h)",
+CVI_VP_VS_LONG, TypeCVI_VP_VS>, Enc_12669374, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Constraints = "$Vxx32 = $Vxx32in"; // output operand is tied to $Vxx32in
+let isAccumulator = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-16} = 0b0001111000000000; // fixed encoding bits, high to low
+let Inst{13-13} = 0b1;
+let Inst{7-5} = 0b001;
+}
+def V6_vunpackoh_alt : HInst<
+(outs VecDblRegs:$Vxx32),
+(ins VecDblRegs:$Vxx32in, VectorRegs:$Vu32),
+"$Vxx32 |= vunpackoh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let isPseudo = 1; // mapping record: no Inst bits assigned here
+let isAccumulator = 1;
+let Constraints = "$Vxx32 = $Vxx32in";
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING record over the 128B register classes; each field is assigned once.
+def V6_vunpackoh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vxx32),
+(ins VecDblRegs128B:$Vxx32in, VectorRegs128B:$Vu32),
+"$Vxx32 |= vunpackoh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isAccumulator = 1;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+let Constraints = "$Vxx32 = $Vxx32in"; // output operand is tied to $Vxx32in
+}
+def V6_vunpackub : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.uh = vunpack($Vu32.ub)",
+CVI_VP_VS, TypeCVI_VP_VS>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-16} = 0b0001111000000001; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+}
+def V6_vunpackub_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.uh = vunpack($Vu32.ub)",
+CVI_VP_VS, TypeCVI_VP_VS>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-16} = 0b0001111000000001; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+}
+def V6_vunpackub_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vunpackub($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let isPseudo = 1; // mapping record: no Inst bits assigned here
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING record over the 128B register classes; each field is assigned once.
+def V6_vunpackub_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vunpackub($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vunpackuh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.uw = vunpack($Vu32.uh)",
+CVI_VP_VS, TypeCVI_VP_VS>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-16} = 0b0001111000000001; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b001;
+}
+def V6_vunpackuh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.uw = vunpack($Vu32.uh)",
+CVI_VP_VS, TypeCVI_VP_VS>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-16} = 0b0001111000000001; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b001;
+}
+def V6_vunpackuh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vunpackuh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let isPseudo = 1; // mapping record: no Inst bits assigned here
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING record over the 128B register classes; each field is assigned once.
+def V6_vunpackuh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vunpackuh($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vwhist128 : HInst<
+(outs),
+(ins),
+"vwhist128",
+CVI_HIST, TypeCVI_HIST>, Enc_0, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Inst{31-16} = 0b0001111000000000; // fixed encoding bits, high to low
+let Inst{13-0} = 0b10010010000000;
+}
+def V6_vwhist128_128B : HInst<
+(outs),
+(ins),
+"vwhist128",
+CVI_HIST, TypeCVI_HIST>, Enc_0, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Inst{31-16} = 0b0001111000000000; // fixed encoding bits, high to low
+let Inst{13-0} = 0b10010010000000;
+}
+def V6_vwhist128m : HInst<
+(outs),
+(ins u1_0Imm:$Ii),
+"vwhist128(#$Ii)",
+CVI_HIST, TypeCVI_HIST>, Enc_1291652, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Inst{31-16} = 0b0001111000000000; // fixed encoding bits, high to low
+let Inst{13-9} = 0b10011;
+let Inst{7-0} = 0b10000000;
+}
+def V6_vwhist128m_128B : HInst<
+(outs),
+(ins u1_0Imm:$Ii),
+"vwhist128(#$Ii)",
+CVI_HIST, TypeCVI_HIST>, Enc_1291652, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Inst{31-16} = 0b0001111000000000; // fixed encoding bits, high to low
+let Inst{13-9} = 0b10011;
+let Inst{7-0} = 0b10000000;
+}
+def V6_vwhist128q : HInst<
+(outs),
+(ins VecPredRegs:$Qv4),
+"vwhist128($Qv4)",
+CVI_HIST, TypeCVI_HIST>, Enc_4109168, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Inst{31-24} = 0b00011110; // fixed encoding bits, high to low
+let Inst{21-16} = 0b000010;
+let Inst{13-0} = 0b10010010000000;
+}
+def V6_vwhist128q_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4),
+"vwhist128($Qv4)",
+CVI_HIST, TypeCVI_HIST>, Enc_4109168, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Inst{31-24} = 0b00011110; // fixed encoding bits, high to low
+let Inst{21-16} = 0b000010;
+let Inst{13-0} = 0b10010010000000;
+}
+def V6_vwhist128qm : HInst<
+(outs),
+(ins VecPredRegs:$Qv4, u1_0Imm:$Ii),
+"vwhist128($Qv4,#$Ii)",
+CVI_HIST, TypeCVI_HIST>, Enc_7978128, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Inst{31-24} = 0b00011110; // fixed encoding bits, high to low
+let Inst{21-16} = 0b000010;
+let Inst{13-9} = 0b10011;
+let Inst{7-0} = 0b10000000;
+}
+def V6_vwhist128qm_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4, u1_0Imm:$Ii),
+"vwhist128($Qv4,#$Ii)",
+CVI_HIST, TypeCVI_HIST>, Enc_7978128, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Inst{31-24} = 0b00011110; // fixed encoding bits, high to low
+let Inst{21-16} = 0b000010;
+let Inst{13-9} = 0b10011;
+let Inst{7-0} = 0b10000000;
+}
+def V6_vwhist256 : HInst<
+(outs),
+(ins),
+"vwhist256",
+CVI_HIST, TypeCVI_HIST>, Enc_0, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Inst{31-16} = 0b0001111000000000; // fixed encoding bits, high to low
+let Inst{13-0} = 0b10001010000000;
+}
+def V6_vwhist256_128B : HInst<
+(outs),
+(ins),
+"vwhist256",
+CVI_HIST, TypeCVI_HIST>, Enc_0, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Inst{31-16} = 0b0001111000000000; // fixed encoding bits, high to low
+let Inst{13-0} = 0b10001010000000;
+}
+def V6_vwhist256_sat : HInst<
+(outs),
+(ins),
+"vwhist256:sat",
+CVI_HIST, TypeCVI_HIST>, Enc_0, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Inst{31-16} = 0b0001111000000000; // fixed encoding bits, high to low
+let Inst{13-0} = 0b10001110000000;
+}
+def V6_vwhist256_sat_128B : HInst<
+(outs),
+(ins),
+"vwhist256:sat",
+CVI_HIST, TypeCVI_HIST>, Enc_0, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Inst{31-16} = 0b0001111000000000; // fixed encoding bits, high to low
+let Inst{13-0} = 0b10001110000000;
+}
+def V6_vwhist256q : HInst<
+(outs),
+(ins VecPredRegs:$Qv4),
+"vwhist256($Qv4)",
+CVI_HIST, TypeCVI_HIST>, Enc_4109168, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Inst{31-24} = 0b00011110; // fixed encoding bits, high to low
+let Inst{21-16} = 0b000010;
+let Inst{13-0} = 0b10001010000000;
+}
+def V6_vwhist256q_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4),
+"vwhist256($Qv4)",
+CVI_HIST, TypeCVI_HIST>, Enc_4109168, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Inst{31-24} = 0b00011110; // fixed encoding bits, high to low
+let Inst{21-16} = 0b000010;
+let Inst{13-0} = 0b10001010000000;
+}
+def V6_vwhist256q_sat : HInst<
+(outs),
+(ins VecPredRegs:$Qv4),
+"vwhist256($Qv4):sat",
+CVI_HIST, TypeCVI_HIST>, Enc_4109168, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let Inst{31-24} = 0b00011110; // fixed encoding bits, high to low
+let Inst{21-16} = 0b000010;
+let Inst{13-0} = 0b10001110000000;
+}
+def V6_vwhist256q_sat_128B : HInst<
+(outs),
+(ins VecPredRegs128B:$Qv4),
+"vwhist256($Qv4):sat",
+CVI_HIST, TypeCVI_HIST>, Enc_4109168, Requires<[HasV62T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let Inst{31-24} = 0b00011110; // fixed encoding bits, high to low
+let Inst{21-16} = 0b000010;
+let Inst{13-0} = 0b10001110000000;
+}
+def V6_vxor : HInst<
+(outs VectorRegs:$Vd32),
+(ins VectorRegs:$Vu32, VectorRegs:$Vv32),
+"$Vd32 = vxor($Vu32,$Vv32)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100001; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b111;
+}
+def V6_vxor_128B : HInst<
+(outs VectorRegs128B:$Vd32),
+(ins VectorRegs128B:$Vu32, VectorRegs128B:$Vv32),
+"$Vd32 = vxor($Vu32,$Vv32)",
+CVI_VA, TypeCVI_VA>, Enc_6223403, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b00011100001; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b111;
+}
+def V6_vzb : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.uh = vzxt($Vu32.ub)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-16} = 0b0001111000000010; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b001;
+}
+def V6_vzb_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.uh = vzxt($Vu32.ub)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-16} = 0b0001111000000010; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b001;
+}
+def V6_vzb_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vzxtb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let isPseudo = 1; // mapping record: no Inst bits assigned here
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING record over the 128B register classes; each field is assigned once.
+def V6_vzb_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vzxtb($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def V6_vzh : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32.uw = vzxt($Vu32.uh)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-16} = 0b0001111000000010; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b010;
+}
+def V6_vzh_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32.uw = vzxt($Vu32.uh)",
+CVI_VA_DV, TypeCVI_VA_DV>, Enc_14631806, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-16} = 0b0001111000000010; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b010;
+}
+def V6_vzh_alt : HInst<
+(outs VecDblRegs:$Vdd32),
+(ins VectorRegs:$Vu32),
+"$Vdd32 = vzxth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let DecoderNamespace = "EXT_mmvec";
+let isCodeGenOnly = 1;
+let isPseudo = 1; // mapping record: no Inst bits assigned here
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+// PSEUDO/TypeMAPPING record over the 128B register classes; each field is assigned once.
+def V6_vzh_alt_128B : HInst<
+(outs VecDblRegs128B:$Vdd32),
+(ins VectorRegs128B:$Vu32),
+"$Vdd32 = vzxth($Vu32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV60T,UseHVX]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let DecoderNamespace = "EXT_mmvec";
+}
+def Y2_barrier : HInst<
+(outs),
+(ins),
+"barrier",
+ST_tc_3stall_SLOT0, TypeST>, Enc_0 {
+let hasSideEffects = 1;
+let isSoloAX = 1;
+let Inst{31-16} = 0b1010100000000000; // fixed encoding bits, high to low
+let Inst{13-0} = 0b00000000000000;
+}
+def Y2_break : HInst<
+(outs),
+(ins),
+"brkpt",
+CR_tc_3x_SLOT3, TypeCR>, Enc_0 {
+let isSolo = 1;
+let Inst{31-16} = 0b0110110000100000; // fixed encoding bits, high to low
+let Inst{13-0} = 0b00000000000000;
+}
+def Y2_dccleana : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"dccleana($Rs32)",
+ST_tc_ld_SLOT0, TypeST>, Enc_11704059 {
+let isSoloAin1 = 1;
+let Inst{31-21} = 0b10100000000; // fixed encoding bits, high to low
+let Inst{13-0} = 0b00000000000000;
+}
+def Y2_dccleaninva : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"dccleaninva($Rs32)",
+ST_tc_ld_SLOT0, TypeST>, Enc_11704059 {
+let isSoloAin1 = 1;
+let Inst{31-21} = 0b10100000010; // fixed encoding bits, high to low
+let Inst{13-0} = 0b00000000000000;
+}
+def Y2_dcfetch : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"dcfetch($Rs32)",
+PSEUDO, TypeMAPPING> {
+let isCodeGenOnly = 1;
+let isPseudo = 1; // mapping record: no Inst bits assigned here
+let hasSideEffects = 1;
+}
+def Y2_dcfetchbo : HInst<
+(outs),
+(ins IntRegs:$Rs32, u11_3Imm:$Ii),
+"dcfetch($Rs32+#$Ii)",
+LD_tc_ld_SLOT0, TypeLD>, Enc_4983213 {
+let hasSideEffects = 1;
+let addrMode = BaseImmOffset;
+let Inst{31-21} = 0b10010100000; // fixed encoding bits, high to low
+let Inst{13-11} = 0b000;
+}
+def Y2_dcinva : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"dcinva($Rs32)",
+ST_tc_ld_SLOT0, TypeST>, Enc_11704059 {
+let isSoloAin1 = 1;
+let Inst{31-21} = 0b10100000001; // fixed encoding bits, high to low
+let Inst{13-0} = 0b00000000000000;
+}
+def Y2_dczeroa : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"dczeroa($Rs32)",
+ST_tc_ld_SLOT0, TypeST>, Enc_11704059 {
+let isSoloAin1 = 1;
+let mayStore = 1;
+let Inst{31-21} = 0b10100000110; // fixed encoding bits, high to low
+let Inst{13-0} = 0b00000000000000;
+}
+def Y2_icinva : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"icinva($Rs32)",
+J_tc_2early_SLOT2, TypeJ>, Enc_11704059 {
+let isSolo = 1;
+let Inst{31-21} = 0b01010110110; // fixed encoding bits, high to low
+let Inst{13-0} = 0b00000000000000;
+}
+def Y2_isync : HInst<
+(outs),
+(ins),
+"isync",
+J_tc_2early_SLOT2, TypeJ>, Enc_0 {
+let isSolo = 1;
+let Inst{31-16} = 0b0101011111000000; // fixed encoding bits, high to low
+let Inst{13-0} = 0b00000000000010;
+}
+def Y2_syncht : HInst<
+(outs),
+(ins),
+"syncht",
+ST_tc_ld_SLOT0, TypeST>, Enc_0 {
+let isSolo = 1;
+let Inst{31-16} = 0b1010100001000000; // fixed encoding bits, high to low
+let Inst{13-0} = 0b00000000000000;
+}
+def Y4_l2fetch : HInst<
+(outs),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"l2fetch($Rs32,$Rt32)",
+ST_tc_3stall_SLOT0, TypeST>, Enc_14620934 {
+let hasSideEffects = 1;
+let mayStore = 1;
+let isSoloAX = 1;
+let Inst{31-21} = 0b10100110000; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-0} = 0b00000000;
+}
+def Y4_trace : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"trace($Rs32)",
+CR_tc_2early_SLOT3, TypeCR>, Enc_11704059 {
+let isSoloAX = 1;
+let Inst{31-21} = 0b01100010010; // fixed encoding bits, high to low
+let Inst{13-0} = 0b00000000000000;
+}
+def Y5_l2fetch : HInst<
+(outs),
+(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
+"l2fetch($Rs32,$Rtt32)",
+ST_tc_3stall_SLOT0, TypeST>, Enc_8943121, Requires<[HasV5T]> {
+let hasSideEffects = 1;
+let mayStore = 1;
+let isSoloAX = 1;
+let Inst{31-21} = 0b10100110100; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-0} = 0b00000000;
+}
+def dep_A2_addsat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = add($Rs32,$Rt32):sat:deprecated",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_14071773 {
+let Defs = [USR_OVF];
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b11010101100; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+}
+def dep_A2_subsat : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rt32, IntRegs:$Rs32),
+"$Rd32 = sub($Rt32,$Rs32):sat:deprecated",
+ALU64_tc_2_SLOT23, TypeALU64>, Enc_8605375 {
+let Defs = [USR_OVF];
+let hasNewValue = 1;
+let opNewValue = 0;
+let Inst{31-21} = 0b11010101100; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b100;
+}
+def dep_S2_packhl : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rdd32 = packhl($Rs32,$Rt32):deprecated",
+ALU64_tc_1_SLOT23, TypeALU64>, Enc_1997594 {
+let Inst{31-21} = 0b11010100000; // fixed encoding bits, high to low
+let Inst{13-13} = 0b0;
+let Inst{7-5} = 0b000;
+}
diff --git a/lib/Target/Hexagon/HexagonDepMappings.td b/lib/Target/Hexagon/HexagonDepMappings.td
new file mode 100644
index 0000000000000000000000000000000000000000..77a56a9adf10e0ec2f143c56087ce3a72822d933
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepMappings.td
@@ -0,0 +1,654 @@
+//===--- HexagonDepMappings.td --------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def A2_negAlias : InstAlias<"$Rd32=neg($Rs32)", (A2_subri IntRegs:$Rd32, 0, IntRegs:$Rs32)>; // neg(Rs) is emitted as A2_subri with immediate 0
+def A2_notAlias : InstAlias<"$Rd32=not($Rs32)", (A2_subri IntRegs:$Rd32, -1, IntRegs:$Rs32)>; // not(Rs) is emitted as A2_subri with immediate -1
+def A2_tfrfAlias : InstAlias<"if (!$Pu4) $Rd32=$Rs32", (A2_paddif IntRegs:$Rd32, PredRegs:$Pu4, IntRegs:$Rs32, 0)>; // predicated register copy is emitted as A2_paddif with a trailing literal 0
+def A2_tfrfnewAlias : InstAlias<"if (!$Pu4.new) $Rd32=$Rs32", (A2_paddifnew IntRegs:$Rd32, PredRegs:$Pu4, IntRegs:$Rs32, 0)>; // .new-predicate form of the alias above
+def A2_tfrtAlias : InstAlias<"if ($Pu4) $Rd32=$Rs32", (A2_paddit IntRegs:$Rd32, PredRegs:$Pu4, IntRegs:$Rs32, 0)>; // true-predicate counterpart, emitted as A2_paddit with a trailing literal 0
+def A2_tfrtnewAlias : InstAlias<"if ($Pu4.new) $Rd32=$Rs32", (A2_padditnew IntRegs:$Rd32, PredRegs:$Pu4, IntRegs:$Rs32, 0)>; // .new-predicate form of the alias above
+def A2_vaddb_mapAlias : InstAlias<"$Rdd32=vaddb($Rss32,$Rtt32)", (A2_vaddub DoubleRegs:$Rdd32, DoubleRegs:$Rss32, DoubleRegs:$Rtt32)>; // vaddb spelling is emitted as A2_vaddub with the same operands
+def A2_vsubb_mapAlias : InstAlias<"$Rdd32=vsubb($Rss32,$Rtt32)", (A2_vsubub DoubleRegs:$Rdd32, DoubleRegs:$Rss32, DoubleRegs:$Rtt32)>; // vsubb spelling is emitted as A2_vsubub with the same operands
+def A2_zxtbAlias : InstAlias<"$Rd32=zxtb($Rs32)", (A2_andir IntRegs:$Rd32, IntRegs:$Rs32, 255)>; // zxtb(Rs) is emitted as A2_andir with immediate 255
+def C2_cmpltAlias : InstAlias<"$Pd4=cmp.lt($Rs32,$Rt32)", (C2_cmpgt PredRegs:$Pd4, IntRegs:$Rt32, IntRegs:$Rs32)>; // cmp.lt(Rs,Rt) is emitted as C2_cmpgt with the two sources swapped
+def C2_cmpltuAlias : InstAlias<"$Pd4=cmp.ltu($Rs32,$Rt32)", (C2_cmpgtu PredRegs:$Pd4, IntRegs:$Rt32, IntRegs:$Rs32)>; // cmp.ltu(Rs,Rt) is emitted as C2_cmpgtu with the two sources swapped
+def C2_pxfer_mapAlias : InstAlias<"$Pd4=$Ps4", (C2_or PredRegs:$Pd4, PredRegs:$Ps4, PredRegs:$Ps4)>; // predicate copy is emitted as C2_or of $Ps4 with itself
+def J2_jumpf_nopred_mapAlias : InstAlias<"if (!$Pu4) jump $Ii", (J2_jumpf PredRegs:$Pu4, b30_2Imm:$Ii)>; // operands pass straight through to J2_jumpf; only the accepted spelling is added
+def J2_jumprf_nopred_mapAlias : InstAlias<"if (!$Pu4) jumpr $Rs32", (J2_jumprf PredRegs:$Pu4, IntRegs:$Rs32)>; // operands pass straight through to J2_jumprf
+def J2_jumprt_nopred_mapAlias : InstAlias<"if ($Pu4) jumpr $Rs32", (J2_jumprt PredRegs:$Pu4, IntRegs:$Rs32)>; // operands pass straight through to J2_jumprt
+def J2_jumpt_nopred_mapAlias : InstAlias<"if ($Pu4) jump $Ii", (J2_jumpt PredRegs:$Pu4, b30_2Imm:$Ii)>; // operands pass straight through to J2_jumpt
+def L2_loadalignb_zomapAlias : InstAlias<"$Ryy32=memb_fifo($Rs32)", (L2_loadalignb_io DoubleRegs:$Ryy32, IntRegs:$Rs32, 0)>; // L2 "_zomap" aliases: the asm string names only the base register and a literal 0 fills the last operand of the _io instruction (presumably its offset - confirm)
+def L2_loadalignh_zomapAlias : InstAlias<"$Ryy32=memh_fifo($Rs32)", (L2_loadalignh_io DoubleRegs:$Ryy32, IntRegs:$Rs32, 0)>;
+def L2_loadbsw2_zomapAlias : InstAlias<"$Rd32=membh($Rs32)", (L2_loadbsw2_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_loadbsw4_zomapAlias : InstAlias<"$Rdd32=membh($Rs32)", (L2_loadbsw4_io DoubleRegs:$Rdd32, IntRegs:$Rs32, 0)>; // same membh spelling as above; the destination register class (pair) selects this alias
+def L2_loadbzw2_zomapAlias : InstAlias<"$Rd32=memubh($Rs32)", (L2_loadbzw2_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_loadbzw4_zomapAlias : InstAlias<"$Rdd32=memubh($Rs32)", (L2_loadbzw4_io DoubleRegs:$Rdd32, IntRegs:$Rs32, 0)>;
+def L2_loadrb_zomapAlias : InstAlias<"$Rd32=memb($Rs32)", (L2_loadrb_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_loadrd_zomapAlias : InstAlias<"$Rdd32=memd($Rs32)", (L2_loadrd_io DoubleRegs:$Rdd32, IntRegs:$Rs32, 0)>;
+def L2_loadrh_zomapAlias : InstAlias<"$Rd32=memh($Rs32)", (L2_loadrh_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_loadri_zomapAlias : InstAlias<"$Rd32=memw($Rs32)", (L2_loadri_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_loadrub_zomapAlias : InstAlias<"$Rd32=memub($Rs32)", (L2_loadrub_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_loadruh_zomapAlias : InstAlias<"$Rd32=memuh($Rs32)", (L2_loadruh_io IntRegs:$Rd32, IntRegs:$Rs32, 0)>;
+def L2_ploadrbf_zomapAlias : InstAlias<"if (!$Pt4) $Rd32=memb($Rs32)", (L2_ploadrbf_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>; // predicated loads follow: four aliases per access size (false, false.new, true, true.new)
+def L2_ploadrbfnew_zomapAlias : InstAlias<"if (!$Pt4.new) $Rd32=memb($Rs32)", (L2_ploadrbfnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrbt_zomapAlias : InstAlias<"if ($Pt4) $Rd32=memb($Rs32)", (L2_ploadrbt_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrbtnew_zomapAlias : InstAlias<"if ($Pt4.new) $Rd32=memb($Rs32)", (L2_ploadrbtnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrdf_zomapAlias : InstAlias<"if (!$Pt4) $Rdd32=memd($Rs32)", (L2_ploadrdf_io DoubleRegs:$Rdd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrdfnew_zomapAlias : InstAlias<"if (!$Pt4.new) $Rdd32=memd($Rs32)", (L2_ploadrdfnew_io DoubleRegs:$Rdd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrdt_zomapAlias : InstAlias<"if ($Pt4) $Rdd32=memd($Rs32)", (L2_ploadrdt_io DoubleRegs:$Rdd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrdtnew_zomapAlias : InstAlias<"if ($Pt4.new) $Rdd32=memd($Rs32)", (L2_ploadrdtnew_io DoubleRegs:$Rdd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrhf_zomapAlias : InstAlias<"if (!$Pt4) $Rd32=memh($Rs32)", (L2_ploadrhf_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrhfnew_zomapAlias : InstAlias<"if (!$Pt4.new) $Rd32=memh($Rs32)", (L2_ploadrhfnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrht_zomapAlias : InstAlias<"if ($Pt4) $Rd32=memh($Rs32)", (L2_ploadrht_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrhtnew_zomapAlias : InstAlias<"if ($Pt4.new) $Rd32=memh($Rs32)", (L2_ploadrhtnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrif_zomapAlias : InstAlias<"if (!$Pt4) $Rd32=memw($Rs32)", (L2_ploadrif_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrifnew_zomapAlias : InstAlias<"if (!$Pt4.new) $Rd32=memw($Rs32)", (L2_ploadrifnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrit_zomapAlias : InstAlias<"if ($Pt4) $Rd32=memw($Rs32)", (L2_ploadrit_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadritnew_zomapAlias : InstAlias<"if ($Pt4.new) $Rd32=memw($Rs32)", (L2_ploadritnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrubf_zomapAlias : InstAlias<"if (!$Pt4) $Rd32=memub($Rs32)", (L2_ploadrubf_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrubfnew_zomapAlias : InstAlias<"if (!$Pt4.new) $Rd32=memub($Rs32)", (L2_ploadrubfnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrubt_zomapAlias : InstAlias<"if ($Pt4) $Rd32=memub($Rs32)", (L2_ploadrubt_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadrubtnew_zomapAlias : InstAlias<"if ($Pt4.new) $Rd32=memub($Rs32)", (L2_ploadrubtnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadruhf_zomapAlias : InstAlias<"if (!$Pt4) $Rd32=memuh($Rs32)", (L2_ploadruhf_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadruhfnew_zomapAlias : InstAlias<"if (!$Pt4.new) $Rd32=memuh($Rs32)", (L2_ploadruhfnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadruht_zomapAlias : InstAlias<"if ($Pt4) $Rd32=memuh($Rs32)", (L2_ploadruht_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L2_ploadruhtnew_zomapAlias : InstAlias<"if ($Pt4.new) $Rd32=memuh($Rs32)", (L2_ploadruhtnew_io IntRegs:$Rd32, PredRegs:$Pt4, IntRegs:$Rs32, 0)>;
+def L4_add_memopb_zomapAlias : InstAlias<"memb($Rs32)+=$Rt32", (L4_add_memopb_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>; // L4 memory-op "_zomap" aliases: literal 0 fills the operand between the base register and the value (presumably the offset - confirm)
+def L4_add_memoph_zomapAlias : InstAlias<"memh($Rs32)+=$Rt32", (L4_add_memoph_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_add_memopw_zomapAlias : InstAlias<"memw($Rs32)+=$Rt32", (L4_add_memopw_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_and_memopb_zomapAlias : InstAlias<"memb($Rs32)&=$Rt32", (L4_and_memopb_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_and_memoph_zomapAlias : InstAlias<"memh($Rs32)&=$Rt32", (L4_and_memoph_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_and_memopw_zomapAlias : InstAlias<"memw($Rs32)&=$Rt32", (L4_and_memopw_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_iadd_memopb_zomapAlias : InstAlias<"memb($Rs32)+=#$II", (L4_iadd_memopb_io IntRegs:$Rs32, 0, u5_0Imm:$II)>; // immediate forms take a u5_0Imm operand instead of a register
+def L4_iadd_memoph_zomapAlias : InstAlias<"memh($Rs32)+=#$II", (L4_iadd_memoph_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_iadd_memopw_zomapAlias : InstAlias<"memw($Rs32)+=#$II", (L4_iadd_memopw_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_iand_memopb_zomapAlias : InstAlias<"memb($Rs32)=clrbit(#$II)", (L4_iand_memopb_io IntRegs:$Rs32, 0, u5_0Imm:$II)>; // clrbit spelling is emitted as the L4_iand_* instruction
+def L4_iand_memoph_zomapAlias : InstAlias<"memh($Rs32)=clrbit(#$II)", (L4_iand_memoph_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_iand_memopw_zomapAlias : InstAlias<"memw($Rs32)=clrbit(#$II)", (L4_iand_memopw_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_ior_memopb_zomapAlias : InstAlias<"memb($Rs32)=setbit(#$II)", (L4_ior_memopb_io IntRegs:$Rs32, 0, u5_0Imm:$II)>; // setbit spelling is emitted as the L4_ior_* instruction
+def L4_ior_memoph_zomapAlias : InstAlias<"memh($Rs32)=setbit(#$II)", (L4_ior_memoph_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_ior_memopw_zomapAlias : InstAlias<"memw($Rs32)=setbit(#$II)", (L4_ior_memopw_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_isub_memopb_zomapAlias : InstAlias<"memb($Rs32)-=#$II", (L4_isub_memopb_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_isub_memoph_zomapAlias : InstAlias<"memh($Rs32)-=#$II", (L4_isub_memoph_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_isub_memopw_zomapAlias : InstAlias<"memw($Rs32)-=#$II", (L4_isub_memopw_io IntRegs:$Rs32, 0, u5_0Imm:$II)>;
+def L4_or_memopb_zomapAlias : InstAlias<"memb($Rs32)|=$Rt32", (L4_or_memopb_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_or_memoph_zomapAlias : InstAlias<"memh($Rs32)|=$Rt32", (L4_or_memoph_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_or_memopw_zomapAlias : InstAlias<"memw($Rs32)|=$Rt32", (L4_or_memopw_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_sub_memopb_zomapAlias : InstAlias<"memb($Rs32)-=$Rt32", (L4_sub_memopb_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_sub_memoph_zomapAlias : InstAlias<"memh($Rs32)-=$Rt32", (L4_sub_memoph_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def L4_sub_memopw_zomapAlias : InstAlias<"memw($Rs32)-=$Rt32", (L4_sub_memopw_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def M2_mpyuiAlias : InstAlias<"$Rd32=mpyui($Rs32,$Rt32)", (M2_mpyi IntRegs:$Rd32, IntRegs:$Rs32, IntRegs:$Rt32)>; // mpyui spelling is emitted as M2_mpyi with the same operands
+def S2_pstorerbf_zomapAlias : InstAlias<"if (!$Pv4) memb($Rs32)=$Rt32", (S2_pstorerbf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>; // S2 store "_zomap" aliases: literal 0 fills the operand between the base register and the stored value (presumably the offset - confirm)
+def S2_pstorerbnewf_zomapAlias : InstAlias<"if (!$Pv4) memb($Rs32)=$Nt8.new", (S2_pstorerbnewf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>; // stored value is written with a .new suffix and named $Nt8
+def S2_pstorerbnewt_zomapAlias : InstAlias<"if ($Pv4) memb($Rs32)=$Nt8.new", (S2_pstorerbnewt_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_pstorerbt_zomapAlias : InstAlias<"if ($Pv4) memb($Rs32)=$Rt32", (S2_pstorerbt_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_pstorerdf_zomapAlias : InstAlias<"if (!$Pv4) memd($Rs32)=$Rtt32", (S2_pstorerdf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, DoubleRegs:$Rtt32)>;
+def S2_pstorerdt_zomapAlias : InstAlias<"if ($Pv4) memd($Rs32)=$Rtt32", (S2_pstorerdt_io PredRegs:$Pv4, IntRegs:$Rs32, 0, DoubleRegs:$Rtt32)>;
+def S2_pstorerff_zomapAlias : InstAlias<"if (!$Pv4) memh($Rs32)=$Rt32.h", (S2_pstorerff_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>; // memh store whose source is spelled $Rt32.h
+def S2_pstorerft_zomapAlias : InstAlias<"if ($Pv4) memh($Rs32)=$Rt32.h", (S2_pstorerft_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_pstorerhf_zomapAlias : InstAlias<"if (!$Pv4) memh($Rs32)=$Rt32", (S2_pstorerhf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_pstorerhnewf_zomapAlias : InstAlias<"if (!$Pv4) memh($Rs32)=$Nt8.new", (S2_pstorerhnewf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_pstorerhnewt_zomapAlias : InstAlias<"if ($Pv4) memh($Rs32)=$Nt8.new", (S2_pstorerhnewt_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_pstorerht_zomapAlias : InstAlias<"if ($Pv4) memh($Rs32)=$Rt32", (S2_pstorerht_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_pstorerif_zomapAlias : InstAlias<"if (!$Pv4) memw($Rs32)=$Rt32", (S2_pstorerif_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_pstorerinewf_zomapAlias : InstAlias<"if (!$Pv4) memw($Rs32)=$Nt8.new", (S2_pstorerinewf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_pstorerinewt_zomapAlias : InstAlias<"if ($Pv4) memw($Rs32)=$Nt8.new", (S2_pstorerinewt_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_pstorerit_zomapAlias : InstAlias<"if ($Pv4) memw($Rs32)=$Rt32", (S2_pstorerit_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_storerb_zomapAlias : InstAlias<"memb($Rs32)=$Rt32", (S2_storerb_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>; // unpredicated stores follow
+def S2_storerbnew_zomapAlias : InstAlias<"memb($Rs32)=$Nt8.new", (S2_storerbnew_io IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_storerd_zomapAlias : InstAlias<"memd($Rs32)=$Rtt32", (S2_storerd_io IntRegs:$Rs32, 0, DoubleRegs:$Rtt32)>;
+def S2_storerf_zomapAlias : InstAlias<"memh($Rs32)=$Rt32.h", (S2_storerf_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_storerh_zomapAlias : InstAlias<"memh($Rs32)=$Rt32", (S2_storerh_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_storerhnew_zomapAlias : InstAlias<"memh($Rs32)=$Nt8.new", (S2_storerhnew_io IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_storeri_zomapAlias : InstAlias<"memw($Rs32)=$Rt32", (S2_storeri_io IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S2_storerinew_zomapAlias : InstAlias<"memw($Rs32)=$Nt8.new", (S2_storerinew_io IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S2_tableidxb_goodsyntaxAlias : InstAlias<"$Rx32=tableidxb($Rs32,#$Ii,#$II)", (S2_tableidxb IntRegs:$Rx32, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II)>; // not a zero-offset alias: all four operands pass straight through to S2_tableidxb
+def S4_pstorerbfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memb($Rs32)=$Rt32", (S4_pstorerbfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>; // S4 store "_zomap" aliases predicated on $Pv4.new: literal 0 fills the operand after the base register (presumably the offset - confirm)
+def S4_pstorerbnewfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memb($Rs32)=$Nt8.new", (S4_pstorerbnewfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S4_pstorerbnewtnew_zomapAlias : InstAlias<"if ($Pv4.new) memb($Rs32)=$Nt8.new", (S4_pstorerbnewtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S4_pstorerbtnew_zomapAlias : InstAlias<"if ($Pv4.new) memb($Rs32)=$Rt32", (S4_pstorerbtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerdfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memd($Rs32)=$Rtt32", (S4_pstorerdfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, DoubleRegs:$Rtt32)>;
+def S4_pstorerdtnew_zomapAlias : InstAlias<"if ($Pv4.new) memd($Rs32)=$Rtt32", (S4_pstorerdtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, DoubleRegs:$Rtt32)>;
+def S4_pstorerffnew_zomapAlias : InstAlias<"if (!$Pv4.new) memh($Rs32)=$Rt32.h", (S4_pstorerffnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerftnew_zomapAlias : InstAlias<"if ($Pv4.new) memh($Rs32)=$Rt32.h", (S4_pstorerftnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerhfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memh($Rs32)=$Rt32", (S4_pstorerhfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerhnewfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memh($Rs32)=$Nt8.new", (S4_pstorerhnewfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S4_pstorerhnewtnew_zomapAlias : InstAlias<"if ($Pv4.new) memh($Rs32)=$Nt8.new", (S4_pstorerhnewtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S4_pstorerhtnew_zomapAlias : InstAlias<"if ($Pv4.new) memh($Rs32)=$Rt32", (S4_pstorerhtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerifnew_zomapAlias : InstAlias<"if (!$Pv4.new) memw($Rs32)=$Rt32", (S4_pstorerifnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_pstorerinewfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memw($Rs32)=$Nt8.new", (S4_pstorerinewfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S4_pstorerinewtnew_zomapAlias : InstAlias<"if ($Pv4.new) memw($Rs32)=$Nt8.new", (S4_pstorerinewtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
+def S4_pstoreritnew_zomapAlias : InstAlias<"if ($Pv4.new) memw($Rs32)=$Rt32", (S4_pstoreritnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
+def S4_storeirb_zomapAlias : InstAlias<"memb($Rs32)=#$II", (S4_storeirb_io IntRegs:$Rs32, 0, s32_0Imm:$II)>; // store-immediate aliases follow: the stored value is an s32_0Imm operand
+def S4_storeirbf_zomapAlias : InstAlias<"if (!$Pv4) memb($Rs32)=#$II", (S4_storeirbf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirbfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memb($Rs32)=#$II", (S4_storeirbfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirbt_zomapAlias : InstAlias<"if ($Pv4) memb($Rs32)=#$II", (S4_storeirbt_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirbtnew_zomapAlias : InstAlias<"if ($Pv4.new) memb($Rs32)=#$II", (S4_storeirbtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirh_zomapAlias : InstAlias<"memh($Rs32)=#$II", (S4_storeirh_io IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirhf_zomapAlias : InstAlias<"if (!$Pv4) memh($Rs32)=#$II", (S4_storeirhf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirhfnew_zomapAlias : InstAlias<"if (!$Pv4.new) memh($Rs32)=#$II", (S4_storeirhfnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirht_zomapAlias : InstAlias<"if ($Pv4) memh($Rs32)=#$II", (S4_storeirht_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirhtnew_zomapAlias : InstAlias<"if ($Pv4.new) memh($Rs32)=#$II", (S4_storeirhtnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeiri_zomapAlias : InstAlias<"memw($Rs32)=#$II", (S4_storeiri_io IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirif_zomapAlias : InstAlias<"if (!$Pv4) memw($Rs32)=#$II", (S4_storeirif_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirifnew_zomapAlias : InstAlias<"if (!$Pv4.new) memw($Rs32)=#$II", (S4_storeirifnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeirit_zomapAlias : InstAlias<"if ($Pv4) memw($Rs32)=#$II", (S4_storeirit_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def S4_storeiritnew_zomapAlias : InstAlias<"if ($Pv4.new) memw($Rs32)=#$II", (S4_storeiritnew_io PredRegs:$Pv4, IntRegs:$Rs32, 0, s32_0Imm:$II)>;
+def V6_MAP_equbAlias : InstAlias<"$Qd4=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb VecPredRegs:$Qd4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>; // V6_MAP_equ* aliases: vcmp.eq written with .ub/.uh/.uw suffixes is emitted as V6_veqb/V6_veqh/V6_veqw; all require UseHVX
+def V6_MAP_equb_128BAlias : InstAlias<"$Qd4=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb VecPredRegs:$Qd4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>; // NOTE(review): each _128B def in this run repeats the preceding def's asm string and result; only the def name differs
+def V6_MAP_equb_andAlias : InstAlias<"$Qx4&=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb_and VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equb_and_128BAlias : InstAlias<"$Qx4&=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb_and VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equb_iorAlias : InstAlias<"$Qx4|=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb_or VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>; // alias names say "ior" while the target instruction is named "_or"
+def V6_MAP_equb_ior_128BAlias : InstAlias<"$Qx4|=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb_or VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equb_xorAlias : InstAlias<"$Qx4^=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb_xor VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equb_xor_128BAlias : InstAlias<"$Qx4^=vcmp.eq($Vu32.ub,$Vv32.ub)", (V6_veqb_xor VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equhAlias : InstAlias<"$Qd4=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh VecPredRegs:$Qd4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_128BAlias : InstAlias<"$Qd4=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh VecPredRegs:$Qd4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_andAlias : InstAlias<"$Qx4&=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh_and VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_and_128BAlias : InstAlias<"$Qx4&=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh_and VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_iorAlias : InstAlias<"$Qx4|=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh_or VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_ior_128BAlias : InstAlias<"$Qx4|=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh_or VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_xorAlias : InstAlias<"$Qx4^=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh_xor VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equh_xor_128BAlias : InstAlias<"$Qx4^=vcmp.eq($Vu32.uh,$Vv32.uh)", (V6_veqh_xor VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equwAlias : InstAlias<"$Qd4=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw VecPredRegs:$Qd4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_128BAlias : InstAlias<"$Qd4=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw VecPredRegs:$Qd4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_andAlias : InstAlias<"$Qx4&=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw_and VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_and_128BAlias : InstAlias<"$Qx4&=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw_and VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_iorAlias : InstAlias<"$Qx4|=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw_or VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_ior_128BAlias : InstAlias<"$Qx4|=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw_or VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_xorAlias : InstAlias<"$Qx4^=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw_xor VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_MAP_equw_xor_128BAlias : InstAlias<"$Qx4^=vcmp.eq($Vu32.uw,$Vv32.uw)", (V6_veqw_xor VecPredRegs:$Qx4, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_extractw_altAlias : InstAlias<"$Rd32.w=vextract($Vu32,$Rs32)", (V6_extractw IntRegs:$Rd32, VectorRegs:$Vu32, IntRegs:$Rs32)>, Requires<[UseHVX]>; // alternate vextract spelling; operands pass straight through to V6_extractw
+def V6_extractw_alt_128BAlias : InstAlias<"$Rd32.w=vextract($Vu32,$Rs32)", (V6_extractw IntRegs:$Rd32, VectorRegs:$Vu32, IntRegs:$Rs32)>, Requires<[UseHVX]>; // NOTE(review): each _128B def in this run repeats the preceding def's asm string and result; only the def name differs
+def V6_ld0Alias : InstAlias<"$Vd32=vmem($Rt32)", (V6_vL32b_ai VectorRegs:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>; // vector loads: vmem/vmemu written with only a base register get a literal 0 as the last operand of the _ai instruction (presumably the offset - confirm)
+def V6_ld0_128BAlias : InstAlias<"$Vd32=vmem($Rt32)", (V6_vL32b_ai VectorRegs:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>;
+def V6_ldnt0Alias : InstAlias<"$Vd32=vmem($Rt32):nt", (V6_vL32b_nt_ai VectorRegs:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>; // ":nt" spellings select the *_nt_* instructions
+def V6_ldnt0_128BAlias : InstAlias<"$Vd32=vmem($Rt32):nt", (V6_vL32b_nt_ai VectorRegs:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>;
+def V6_ldu0Alias : InstAlias<"$Vd32=vmemu($Rt32)", (V6_vL32Ub_ai VectorRegs:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>; // vmemu spellings select the vL32Ub/vS32Ub instructions
+def V6_ldu0_128BAlias : InstAlias<"$Vd32=vmemu($Rt32)", (V6_vL32Ub_ai VectorRegs:$Vd32, IntRegs:$Rt32, 0)>, Requires<[UseHVX]>;
+def V6_st0Alias : InstAlias<"vmem($Rt32)=$Vs32", (V6_vS32b_ai IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>; // vector stores: literal 0 sits between the base register and the stored vector
+def V6_st0_128BAlias : InstAlias<"vmem($Rt32)=$Vs32", (V6_vS32b_ai IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stn0Alias : InstAlias<"vmem($Rt32)=$Os8.new", (V6_vS32b_new_ai IntRegs:$Rt32, 0, VectorRegs:$Os8)>, Requires<[UseHVX]>; // stored vector is written with a .new suffix and named $Os8
+def V6_stn0_128BAlias : InstAlias<"vmem($Rt32)=$Os8.new", (V6_vS32b_new_ai IntRegs:$Rt32, 0, VectorRegs:$Os8)>, Requires<[UseHVX]>;
+def V6_stnnt0Alias : InstAlias<"vmem($Rt32):nt=$Os8.new", (V6_vS32b_nt_new_ai IntRegs:$Rt32, 0, VectorRegs:$Os8)>, Requires<[UseHVX]>;
+def V6_stnnt0_128BAlias : InstAlias<"vmem($Rt32):nt=$Os8.new", (V6_vS32b_nt_new_ai IntRegs:$Rt32, 0, VectorRegs:$Os8)>, Requires<[UseHVX]>;
+def V6_stnp0Alias : InstAlias<"if (!$Pv4) vmem($Rt32)=$Vs32", (V6_vS32b_npred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>; // "if (!$Pv4)" spellings select the *_npred_* instructions (scalar predicate register)
+def V6_stnp0_128BAlias : InstAlias<"if (!$Pv4) vmem($Rt32)=$Vs32", (V6_vS32b_npred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnpnt0Alias : InstAlias<"if (!$Pv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_npred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnpnt0_128BAlias : InstAlias<"if (!$Pv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_npred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnq0Alias : InstAlias<"if (!$Qv4) vmem($Rt32)=$Vs32", (V6_vS32b_nqpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>; // "if (!$Qv4)" spellings select the *_nqpred_* instructions (vector predicate register)
+def V6_stnq0_128BAlias : InstAlias<"if (!$Qv4) vmem($Rt32)=$Vs32", (V6_vS32b_nqpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnqnt0Alias : InstAlias<"if (!$Qv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_nqpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnqnt0_128BAlias : InstAlias<"if (!$Qv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_nqpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnt0Alias : InstAlias<"vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_ai IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stnt0_128BAlias : InstAlias<"vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_ai IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stp0Alias : InstAlias<"if ($Pv4) vmem($Rt32)=$Vs32", (V6_vS32b_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stp0_128BAlias : InstAlias<"if ($Pv4) vmem($Rt32)=$Vs32", (V6_vS32b_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stpnt0Alias : InstAlias<"if ($Pv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stpnt0_128BAlias : InstAlias<"if ($Pv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stq0Alias : InstAlias<"if ($Qv4) vmem($Rt32)=$Vs32", (V6_vS32b_qpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stq0_128BAlias : InstAlias<"if ($Qv4) vmem($Rt32)=$Vs32", (V6_vS32b_qpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stqnt0Alias : InstAlias<"if ($Qv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_qpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stqnt0_128BAlias : InstAlias<"if ($Qv4) vmem($Rt32):nt=$Vs32", (V6_vS32b_nt_qpred_ai VecPredRegs:$Qv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stu0Alias : InstAlias<"vmemu($Rt32)=$Vs32", (V6_vS32Ub_ai IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stu0_128BAlias : InstAlias<"vmemu($Rt32)=$Vs32", (V6_vS32Ub_ai IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stunp0Alias : InstAlias<"if (!$Pv4) vmemu($Rt32)=$Vs32", (V6_vS32Ub_npred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stunp0_128BAlias : InstAlias<"if (!$Pv4) vmemu($Rt32)=$Vs32", (V6_vS32Ub_npred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stup0Alias : InstAlias<"if ($Pv4) vmemu($Rt32)=$Vs32", (V6_vS32Ub_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_stup0_128BAlias : InstAlias<"if ($Pv4) vmemu($Rt32)=$Vs32", (V6_vS32Ub_pred_ai PredRegs:$Pv4, IntRegs:$Rt32, 0, VectorRegs:$Vs32)>, Requires<[UseHVX]>;
+def V6_vabsdiffh_altAlias : InstAlias<"$Vd32=vabsdiffh($Vu32,$Vv32)", (V6_vabsdiffh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>; // "_alt" aliases: the element type is part of the mnemonic (vabsdiffh) and operands pass straight through to the same-named instruction
+def V6_vabsdiffh_alt_128BAlias : InstAlias<"$Vd32=vabsdiffh($Vu32,$Vv32)", (V6_vabsdiffh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>; // NOTE(review): each _128B def in this run repeats the preceding def's asm string and result; only the def name differs
+def V6_vabsdiffub_altAlias : InstAlias<"$Vd32=vabsdiffub($Vu32,$Vv32)", (V6_vabsdiffub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsdiffub_alt_128BAlias : InstAlias<"$Vd32=vabsdiffub($Vu32,$Vv32)", (V6_vabsdiffub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsdiffuh_altAlias : InstAlias<"$Vd32=vabsdiffuh($Vu32,$Vv32)", (V6_vabsdiffuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsdiffuh_alt_128BAlias : InstAlias<"$Vd32=vabsdiffuh($Vu32,$Vv32)", (V6_vabsdiffuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsdiffw_altAlias : InstAlias<"$Vd32=vabsdiffw($Vu32,$Vv32)", (V6_vabsdiffw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsdiffw_alt_128BAlias : InstAlias<"$Vd32=vabsdiffw($Vu32,$Vv32)", (V6_vabsdiffw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vabsh_altAlias : InstAlias<"$Vd32=vabsh($Vu32)", (V6_vabsh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsh_alt_128BAlias : InstAlias<"$Vd32=vabsh($Vu32)", (V6_vabsh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsh_sat_altAlias : InstAlias<"$Vd32=vabsh($Vu32):sat", (V6_vabsh_sat VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsh_sat_alt_128BAlias : InstAlias<"$Vd32=vabsh($Vu32):sat", (V6_vabsh_sat VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsuh_altAlias : InstAlias<"$Vd32.uh=vabs($Vu32.h)", (V6_vabsh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>; // spelling with a .uh destination is emitted as V6_vabsh, the same target as V6_vabsh_altAlias
+def V6_vabsuh_alt_128BAlias : InstAlias<"$Vd32.uh=vabs($Vu32.h)", (V6_vabsh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsuw_altAlias : InstAlias<"$Vd32.uw=vabs($Vu32.w)", (V6_vabsw VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>; // spelling with a .uw destination is emitted as V6_vabsw, the same target as V6_vabsw_altAlias
+def V6_vabsuw_alt_128BAlias : InstAlias<"$Vd32.uw=vabs($Vu32.w)", (V6_vabsw VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsw_altAlias : InstAlias<"$Vd32=vabsw($Vu32)", (V6_vabsw VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsw_alt_128BAlias : InstAlias<"$Vd32=vabsw($Vu32)", (V6_vabsw VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsw_sat_altAlias : InstAlias<"$Vd32=vabsw($Vu32):sat", (V6_vabsw_sat VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vabsw_sat_alt_128BAlias : InstAlias<"$Vd32=vabsw($Vu32):sat", (V6_vabsw_sat VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddb_altAlias : InstAlias<"$Vd32=vaddb($Vu32,$Vv32)", (V6_vaddb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>; // vadd* aliases: every mapping is listed twice, as <name>_altAlias and <name>_alt_128BAlias, with identical syntax string and target.
+def V6_vaddb_alt_128BAlias : InstAlias<"$Vd32=vaddb($Vu32,$Vv32)", (V6_vaddb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddb_dv_altAlias : InstAlias<"$Vdd32=vaddb($Vuu32,$Vvv32)", (V6_vaddb_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>; // _dv variants: all three operands are VecDblRegs.
+def V6_vaddb_dv_alt_128BAlias : InstAlias<"$Vdd32=vaddb($Vuu32,$Vvv32)", (V6_vaddb_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddbnq_altAlias : InstAlias<"if (!$Qv4.b) $Vx32.b+=$Vu32.b", (V6_vaddbnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>; // Predicated forms: target operands are ordered ($Vx32, $Qv4, $Vu32) although the syntax names $Qv4 first.
+def V6_vaddbnq_alt_128BAlias : InstAlias<"if (!$Qv4.b) $Vx32.b+=$Vu32.b", (V6_vaddbnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddbq_altAlias : InstAlias<"if ($Qv4.b) $Vx32.b+=$Vu32.b", (V6_vaddbq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddbq_alt_128BAlias : InstAlias<"if ($Qv4.b) $Vx32.b+=$Vu32.b", (V6_vaddbq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddh_altAlias : InstAlias<"$Vd32=vaddh($Vu32,$Vv32)", (V6_vaddh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddh_alt_128BAlias : InstAlias<"$Vd32=vaddh($Vu32,$Vv32)", (V6_vaddh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddh_dv_altAlias : InstAlias<"$Vdd32=vaddh($Vuu32,$Vvv32)", (V6_vaddh_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddh_dv_alt_128BAlias : InstAlias<"$Vdd32=vaddh($Vuu32,$Vvv32)", (V6_vaddh_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddhnq_altAlias : InstAlias<"if (!$Qv4.h) $Vx32.h+=$Vu32.h", (V6_vaddhnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddhnq_alt_128BAlias : InstAlias<"if (!$Qv4.h) $Vx32.h+=$Vu32.h", (V6_vaddhnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddhq_altAlias : InstAlias<"if ($Qv4.h) $Vx32.h+=$Vu32.h", (V6_vaddhq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddhq_alt_128BAlias : InstAlias<"if ($Qv4.h) $Vx32.h+=$Vu32.h", (V6_vaddhq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddhsat_altAlias : InstAlias<"$Vd32=vaddh($Vu32,$Vv32):sat", (V6_vaddhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddhsat_alt_128BAlias : InstAlias<"$Vd32=vaddh($Vu32,$Vv32):sat", (V6_vaddhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddhsat_dv_altAlias : InstAlias<"$Vdd32=vaddh($Vuu32,$Vvv32):sat", (V6_vaddhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddhsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vaddh($Vuu32,$Vvv32):sat", (V6_vaddhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddhw_altAlias : InstAlias<"$Vdd32=vaddh($Vu32,$Vv32)", (V6_vaddhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>; // Here the destination is a VecDblRegs pair while both sources are single VectorRegs; same "vaddh" mnemonic as V6_vaddh, distinguished by operand classes.
+def V6_vaddhw_alt_128BAlias : InstAlias<"$Vdd32=vaddh($Vu32,$Vv32)", (V6_vaddhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddubh_altAlias : InstAlias<"$Vdd32=vaddub($Vu32,$Vv32)", (V6_vaddubh VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddubh_alt_128BAlias : InstAlias<"$Vdd32=vaddub($Vu32,$Vv32)", (V6_vaddubh VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddubsat_altAlias : InstAlias<"$Vd32=vaddub($Vu32,$Vv32):sat", (V6_vaddubsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddubsat_alt_128BAlias : InstAlias<"$Vd32=vaddub($Vu32,$Vv32):sat", (V6_vaddubsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddubsat_dv_altAlias : InstAlias<"$Vdd32=vaddub($Vuu32,$Vvv32):sat", (V6_vaddubsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddubsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vaddub($Vuu32,$Vvv32):sat", (V6_vaddubsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vadduhsat_altAlias : InstAlias<"$Vd32=vadduh($Vu32,$Vv32):sat", (V6_vadduhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vadduhsat_alt_128BAlias : InstAlias<"$Vd32=vadduh($Vu32,$Vv32):sat", (V6_vadduhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vadduhsat_dv_altAlias : InstAlias<"$Vdd32=vadduh($Vuu32,$Vvv32):sat", (V6_vadduhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vadduhsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vadduh($Vuu32,$Vvv32):sat", (V6_vadduhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vadduhw_altAlias : InstAlias<"$Vdd32=vadduh($Vu32,$Vv32)", (V6_vadduhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vadduhw_alt_128BAlias : InstAlias<"$Vdd32=vadduh($Vu32,$Vv32)", (V6_vadduhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddw_altAlias : InstAlias<"$Vd32=vaddw($Vu32,$Vv32)", (V6_vaddw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddw_alt_128BAlias : InstAlias<"$Vd32=vaddw($Vu32,$Vv32)", (V6_vaddw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddw_dv_altAlias : InstAlias<"$Vdd32=vaddw($Vuu32,$Vvv32)", (V6_vaddw_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddw_dv_alt_128BAlias : InstAlias<"$Vdd32=vaddw($Vuu32,$Vvv32)", (V6_vaddw_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddwnq_altAlias : InstAlias<"if (!$Qv4.w) $Vx32.w+=$Vu32.w", (V6_vaddwnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddwnq_alt_128BAlias : InstAlias<"if (!$Qv4.w) $Vx32.w+=$Vu32.w", (V6_vaddwnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddwq_altAlias : InstAlias<"if ($Qv4.w) $Vx32.w+=$Vu32.w", (V6_vaddwq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddwq_alt_128BAlias : InstAlias<"if ($Qv4.w) $Vx32.w+=$Vu32.w", (V6_vaddwq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vaddwsat_altAlias : InstAlias<"$Vd32=vaddw($Vu32,$Vv32):sat", (V6_vaddwsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddwsat_alt_128BAlias : InstAlias<"$Vd32=vaddw($Vu32,$Vv32):sat", (V6_vaddwsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaddwsat_dv_altAlias : InstAlias<"$Vdd32=vaddw($Vuu32,$Vvv32):sat", (V6_vaddwsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vaddwsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vaddw($Vuu32,$Vvv32):sat", (V6_vaddwsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vandqrt_acc_altAlias : InstAlias<"$Vx32.ub|=vand($Qu4.ub,$Rt32.ub)", (V6_vandqrt_acc VectorRegs:$Vx32, VecPredRegs:$Qu4, IntRegs:$Rt32)>, Requires<[UseHVX]>; // vand aliases: every operand in the syntax carries a .ub suffix; the _acc forms use "|=" and name the destination $Vx32 / $Qx4.
+def V6_vandqrt_acc_alt_128BAlias : InstAlias<"$Vx32.ub|=vand($Qu4.ub,$Rt32.ub)", (V6_vandqrt_acc VectorRegs:$Vx32, VecPredRegs:$Qu4, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vandqrt_altAlias : InstAlias<"$Vd32.ub=vand($Qu4.ub,$Rt32.ub)", (V6_vandqrt VectorRegs:$Vd32, VecPredRegs:$Qu4, IntRegs:$Rt32)>, Requires<[UseHVX]>; // vandqrt: predicate register and scalar in, vector register out.
+def V6_vandqrt_alt_128BAlias : InstAlias<"$Vd32.ub=vand($Qu4.ub,$Rt32.ub)", (V6_vandqrt VectorRegs:$Vd32, VecPredRegs:$Qu4, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vandvrt_acc_altAlias : InstAlias<"$Qx4.ub|=vand($Vu32.ub,$Rt32.ub)", (V6_vandvrt_acc VecPredRegs:$Qx4, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vandvrt_acc_alt_128BAlias : InstAlias<"$Qx4.ub|=vand($Vu32.ub,$Rt32.ub)", (V6_vandvrt_acc VecPredRegs:$Qx4, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vandvrt_altAlias : InstAlias<"$Qd4.ub=vand($Vu32.ub,$Rt32.ub)", (V6_vandvrt VecPredRegs:$Qd4, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>; // vandvrt: vector register and scalar in, predicate register out.
+def V6_vandvrt_alt_128BAlias : InstAlias<"$Qd4.ub=vand($Vu32.ub,$Rt32.ub)", (V6_vandvrt VecPredRegs:$Qd4, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vaslh_altAlias : InstAlias<"$Vd32=vaslh($Vu32,$Rt32)", (V6_vaslh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>; // vasl* aliases: V6_vaslh / V6_vaslw(_acc) take IntRegs:$Rt32 as second source; V6_vaslhv / V6_vaslwv take VectorRegs:$Vv32 under the same mnemonic.
+def V6_vaslh_alt_128BAlias : InstAlias<"$Vd32=vaslh($Vu32,$Rt32)", (V6_vaslh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vaslhv_altAlias : InstAlias<"$Vd32=vaslh($Vu32,$Vv32)", (V6_vaslhv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaslhv_alt_128BAlias : InstAlias<"$Vd32=vaslh($Vu32,$Vv32)", (V6_vaslhv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaslw_acc_altAlias : InstAlias<"$Vx32+=vaslw($Vu32,$Rt32)", (V6_vaslw_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>; // Accumulating form: "+=" with destination named $Vx32.
+def V6_vaslw_acc_alt_128BAlias : InstAlias<"$Vx32+=vaslw($Vu32,$Rt32)", (V6_vaslw_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vaslw_altAlias : InstAlias<"$Vd32=vaslw($Vu32,$Rt32)", (V6_vaslw VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vaslw_alt_128BAlias : InstAlias<"$Vd32=vaslw($Vu32,$Rt32)", (V6_vaslw VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vaslwv_altAlias : InstAlias<"$Vd32=vaslw($Vu32,$Vv32)", (V6_vaslwv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vaslwv_alt_128BAlias : InstAlias<"$Vd32=vaslw($Vu32,$Vv32)", (V6_vaslwv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vasrh_altAlias : InstAlias<"$Vd32=vasrh($Vu32,$Rt32)", (V6_vasrh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>; // vasr* aliases; the two-source forms take IntRegs:$Rt32 or VectorRegs:$Vv32, the three-source forms take IntRegsLow8:$Rt8.
+def V6_vasrh_alt_128BAlias : InstAlias<"$Vd32=vasrh($Vu32,$Rt32)", (V6_vasrh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vasrhbrndsat_altAlias : InstAlias<"$Vd32=vasrhb($Vu32,$Vv32,$Rt8):rnd:sat", (V6_vasrhbrndsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>; // NOTE(review): this and the other $Rt8 vasr aliases in this group have no Requires<[UseHVX]> and no _128B twin, unlike their neighbours - confirm intended.
+def V6_vasrhubrndsat_altAlias : InstAlias<"$Vd32=vasrhub($Vu32,$Vv32,$Rt8):rnd:sat", (V6_vasrhubrndsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>;
+def V6_vasrhubsat_altAlias : InstAlias<"$Vd32=vasrhub($Vu32,$Vv32,$Rt8):sat", (V6_vasrhubsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>;
+def V6_vasrhv_altAlias : InstAlias<"$Vd32=vasrh($Vu32,$Vv32)", (V6_vasrhv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vasrhv_alt_128BAlias : InstAlias<"$Vd32=vasrh($Vu32,$Vv32)", (V6_vasrhv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vasrw_acc_altAlias : InstAlias<"$Vx32+=vasrw($Vu32,$Rt32)", (V6_vasrw_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vasrw_acc_alt_128BAlias : InstAlias<"$Vx32+=vasrw($Vu32,$Rt32)", (V6_vasrw_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vasrw_altAlias : InstAlias<"$Vd32=vasrw($Vu32,$Rt32)", (V6_vasrw VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vasrw_alt_128BAlias : InstAlias<"$Vd32=vasrw($Vu32,$Rt32)", (V6_vasrw VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vasrwh_altAlias : InstAlias<"$Vd32=vasrwh($Vu32,$Vv32,$Rt8)", (V6_vasrwhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>; // NOTE(review): syntax has no ":sat" suffix yet the target is V6_vasrwhsat, the same instruction V6_vasrwhsat_altAlias maps to - confirm against the ISA mapping.
+def V6_vasrwhrndsat_altAlias : InstAlias<"$Vd32=vasrwh($Vu32,$Vv32,$Rt8):rnd:sat", (V6_vasrwhrndsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>;
+def V6_vasrwhsat_altAlias : InstAlias<"$Vd32=vasrwh($Vu32,$Vv32,$Rt8):sat", (V6_vasrwhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>;
+def V6_vasrwuhsat_altAlias : InstAlias<"$Vd32=vasrwuh($Vu32,$Vv32,$Rt8):sat", (V6_vasrwuhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32, IntRegsLow8:$Rt8)>;
+def V6_vasrwv_altAlias : InstAlias<"$Vd32=vasrw($Vu32,$Vv32)", (V6_vasrwv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vasrwv_alt_128BAlias : InstAlias<"$Vd32=vasrw($Vu32,$Vv32)", (V6_vasrwv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgh_altAlias : InstAlias<"$Vd32=vavgh($Vu32,$Vv32)", (V6_vavgh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>; // vavg* aliases: all take two VectorRegs sources; syntax with a ":rnd" suffix maps to the ...rnd instruction (V6_vavghrnd, V6_vavgubrnd, V6_vavguhrnd, V6_vavgwrnd).
+def V6_vavgh_alt_128BAlias : InstAlias<"$Vd32=vavgh($Vu32,$Vv32)", (V6_vavgh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavghrnd_altAlias : InstAlias<"$Vd32=vavgh($Vu32,$Vv32):rnd", (V6_vavghrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavghrnd_alt_128BAlias : InstAlias<"$Vd32=vavgh($Vu32,$Vv32):rnd", (V6_vavghrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgub_altAlias : InstAlias<"$Vd32=vavgub($Vu32,$Vv32)", (V6_vavgub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgub_alt_128BAlias : InstAlias<"$Vd32=vavgub($Vu32,$Vv32)", (V6_vavgub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgubrnd_altAlias : InstAlias<"$Vd32=vavgub($Vu32,$Vv32):rnd", (V6_vavgubrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgubrnd_alt_128BAlias : InstAlias<"$Vd32=vavgub($Vu32,$Vv32):rnd", (V6_vavgubrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavguh_altAlias : InstAlias<"$Vd32=vavguh($Vu32,$Vv32)", (V6_vavguh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavguh_alt_128BAlias : InstAlias<"$Vd32=vavguh($Vu32,$Vv32)", (V6_vavguh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavguhrnd_altAlias : InstAlias<"$Vd32=vavguh($Vu32,$Vv32):rnd", (V6_vavguhrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavguhrnd_alt_128BAlias : InstAlias<"$Vd32=vavguh($Vu32,$Vv32):rnd", (V6_vavguhrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgw_altAlias : InstAlias<"$Vd32=vavgw($Vu32,$Vv32)", (V6_vavgw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgw_alt_128BAlias : InstAlias<"$Vd32=vavgw($Vu32,$Vv32)", (V6_vavgw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgwrnd_altAlias : InstAlias<"$Vd32=vavgw($Vu32,$Vv32):rnd", (V6_vavgwrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vavgwrnd_alt_128BAlias : InstAlias<"$Vd32=vavgw($Vu32,$Vv32):rnd", (V6_vavgwrnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vcl0h_altAlias : InstAlias<"$Vd32=vcl0h($Vu32)", (V6_vcl0h VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>; // vcl0h / vcl0w aliases: one VectorRegs source, one VectorRegs destination; each listed as plain and _128B with identical content.
+def V6_vcl0h_alt_128BAlias : InstAlias<"$Vd32=vcl0h($Vu32)", (V6_vcl0h VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vcl0w_altAlias : InstAlias<"$Vd32=vcl0w($Vu32)", (V6_vcl0w VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vcl0w_alt_128BAlias : InstAlias<"$Vd32=vcl0w($Vu32)", (V6_vcl0w VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vd0Alias : InstAlias<"$Vd32=#0", (V6_vxor VectorRegs:$Vd32, VectorRegs:$Vd32, VectorRegs:$Vd32)>, Requires<[UseHVX]>; // "$Vd32=#0" is expressed as V6_vxor with $Vd32 used as the destination and as both sources.
+def V6_vd0_128BAlias : InstAlias<"$Vd32=#0", (V6_vxor VectorRegs:$Vd32, VectorRegs:$Vd32, VectorRegs:$Vd32)>, Requires<[UseHVX]>;
+def V6_vdd0Alias : InstAlias<"$Vdd32=#0", (V6_vsubw_dv VecDblRegs:$Vdd32, W15, W15)>, Requires<[UseHVX]>; // "$Vdd32=#0" is expressed as V6_vsubw_dv with the fixed register W15 as both sources (not $Vdd32 itself, unlike the single-vector form above).
+def V6_vdd0_128BAlias : InstAlias<"$Vdd32=#0", (V6_vsubw_dv VecDblRegs:$Vdd32, W15, W15)>, Requires<[UseHVX]>;
+def V6_vdealb4w_altAlias : InstAlias<"$Vd32=vdealb4w($Vu32,$Vv32)", (V6_vdealb4w VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>; // vdeal* aliases: vdealb4w takes two vector sources; vdealb and vdealh below take one.
+def V6_vdealb4w_alt_128BAlias : InstAlias<"$Vd32=vdealb4w($Vu32,$Vv32)", (V6_vdealb4w VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vdealb_altAlias : InstAlias<"$Vd32=vdealb($Vu32)", (V6_vdealb VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vdealb_alt_128BAlias : InstAlias<"$Vd32=vdealb($Vu32)", (V6_vdealb VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vdealh_altAlias : InstAlias<"$Vd32=vdealh($Vu32)", (V6_vdealh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vdealh_alt_128BAlias : InstAlias<"$Vd32=vdealh($Vu32)", (V6_vdealh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_acc_altAlias : InstAlias<"$Vx32+=vdmpybus($Vu32,$Rt32)", (V6_vdmpybus_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>; // vdmpy* aliases: _acc variants use "+=" and name the destination $Vx32 / $Vxx32; non-acc variants use "=" and $Vd32 / $Vdd32.
+def V6_vdmpybus_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpybus($Vu32,$Rt32)", (V6_vdmpybus_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_altAlias : InstAlias<"$Vd32=vdmpybus($Vu32,$Rt32)", (V6_vdmpybus VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_alt_128BAlias : InstAlias<"$Vd32=vdmpybus($Vu32,$Rt32)", (V6_vdmpybus VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_dv_acc_altAlias : InstAlias<"$Vxx32+=vdmpybus($Vuu32,$Rt32)", (V6_vdmpybus_dv_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>; // _dv variants: vector operands are VecDblRegs; the scalar stays IntRegs:$Rt32.
+def V6_vdmpybus_dv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vdmpybus($Vuu32,$Rt32)", (V6_vdmpybus_dv_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_dv_altAlias : InstAlias<"$Vdd32=vdmpybus($Vuu32,$Rt32)", (V6_vdmpybus_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpybus_dv_alt_128BAlias : InstAlias<"$Vdd32=vdmpybus($Vuu32,$Rt32)", (V6_vdmpybus_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_acc_altAlias : InstAlias<"$Vx32+=vdmpyhb($Vu32,$Rt32)", (V6_vdmpyhb_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpyhb($Vu32,$Rt32)", (V6_vdmpyhb_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_altAlias : InstAlias<"$Vd32=vdmpyhb($Vu32,$Rt32)", (V6_vdmpyhb VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_alt_128BAlias : InstAlias<"$Vd32=vdmpyhb($Vu32,$Rt32)", (V6_vdmpyhb VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_dv_acc_altAlias : InstAlias<"$Vxx32+=vdmpyhb($Vuu32,$Rt32)", (V6_vdmpyhb_dv_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_dv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vdmpyhb($Vuu32,$Rt32)", (V6_vdmpyhb_dv_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_dv_altAlias : InstAlias<"$Vdd32=vdmpyhb($Vuu32,$Rt32)", (V6_vdmpyhb_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhb_dv_alt_128BAlias : InstAlias<"$Vdd32=vdmpyhb($Vuu32,$Rt32)", (V6_vdmpyhb_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhisat_acc_altAlias : InstAlias<"$Vx32+=vdmpyh($Vuu32,$Rt32):sat", (V6_vdmpyhisat_acc VectorRegs:$Vx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>; // vdmpyhisat forms read a VecDblRegs pair ($Vuu32) but write a single VectorRegs; same "vdmpyh ... :sat" text as V6_vdmpyhsat, distinguished by the source register class.
+def V6_vdmpyhisat_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpyh($Vuu32,$Rt32):sat", (V6_vdmpyhisat_acc VectorRegs:$Vx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhisat_altAlias : InstAlias<"$Vd32=vdmpyh($Vuu32,$Rt32):sat", (V6_vdmpyhisat VectorRegs:$Vd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhisat_alt_128BAlias : InstAlias<"$Vd32=vdmpyh($Vuu32,$Rt32):sat", (V6_vdmpyhisat VectorRegs:$Vd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsat_acc_altAlias : InstAlias<"$Vx32+=vdmpyh($Vu32,$Rt32):sat", (V6_vdmpyhsat_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsat_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpyh($Vu32,$Rt32):sat", (V6_vdmpyhsat_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsat_altAlias : InstAlias<"$Vd32=vdmpyh($Vu32,$Rt32):sat", (V6_vdmpyhsat VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsat_alt_128BAlias : InstAlias<"$Vd32=vdmpyh($Vu32,$Rt32):sat", (V6_vdmpyhsat VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsuisat_acc_altAlias : InstAlias<"$Vx32+=vdmpyhsu($Vuu32,$Rt32,#1):sat", (V6_vdmpyhsuisat_acc VectorRegs:$Vx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>; // The ",#1" is literal text of the alias syntax; the target takes only ($Vx32, $Vuu32, $Rt32).
+def V6_vdmpyhsuisat_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpyhsu($Vuu32,$Rt32,#1):sat", (V6_vdmpyhsuisat_acc VectorRegs:$Vx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsuisat_altAlias : InstAlias<"$Vd32=vdmpyhsu($Vuu32,$Rt32,#1):sat", (V6_vdmpyhsuisat VectorRegs:$Vd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsuisat_alt_128BAlias : InstAlias<"$Vd32=vdmpyhsu($Vuu32,$Rt32,#1):sat", (V6_vdmpyhsuisat VectorRegs:$Vd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsusat_acc_altAlias : InstAlias<"$Vx32+=vdmpyhsu($Vu32,$Rt32):sat", (V6_vdmpyhsusat_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsusat_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpyhsu($Vu32,$Rt32):sat", (V6_vdmpyhsusat_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsusat_altAlias : InstAlias<"$Vd32=vdmpyhsu($Vu32,$Rt32):sat", (V6_vdmpyhsusat VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhsusat_alt_128BAlias : InstAlias<"$Vd32=vdmpyhsu($Vu32,$Rt32):sat", (V6_vdmpyhsusat VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdmpyhvsat_acc_altAlias : InstAlias<"$Vx32+=vdmpyh($Vu32,$Vv32):sat", (V6_vdmpyhvsat_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>; // vdmpyhvsat forms: second source is VectorRegs:$Vv32 instead of a scalar.
+def V6_vdmpyhvsat_acc_alt_128BAlias : InstAlias<"$Vx32+=vdmpyh($Vu32,$Vv32):sat", (V6_vdmpyhvsat_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vdmpyhvsat_altAlias : InstAlias<"$Vd32=vdmpyh($Vu32,$Vv32):sat", (V6_vdmpyhvsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vdmpyhvsat_alt_128BAlias : InstAlias<"$Vd32=vdmpyh($Vu32,$Vv32):sat", (V6_vdmpyhvsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vdsaduh_acc_altAlias : InstAlias<"$Vxx32+=vdsaduh($Vuu32,$Rt32)", (V6_vdsaduh_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>; // vdsaduh aliases: VecDblRegs destination and source plus IntRegs:$Rt32; "+=" form maps to V6_vdsaduh_acc, "=" form to V6_vdsaduh.
+def V6_vdsaduh_acc_alt_128BAlias : InstAlias<"$Vxx32+=vdsaduh($Vuu32,$Rt32)", (V6_vdsaduh_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdsaduh_altAlias : InstAlias<"$Vdd32=vdsaduh($Vuu32,$Rt32)", (V6_vdsaduh VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vdsaduh_alt_128BAlias : InstAlias<"$Vdd32=vdsaduh($Vuu32,$Rt32)", (V6_vdsaduh VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vlsrh_altAlias : InstAlias<"$Vd32=vlsrh($Vu32,$Rt32)", (V6_vlsrh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>; // vlsr* aliases: V6_vlsrh / V6_vlsrw take IntRegs:$Rt32 as second source; V6_vlsrhv / V6_vlsrwv take VectorRegs:$Vv32 under the same mnemonic.
+def V6_vlsrh_alt_128BAlias : InstAlias<"$Vd32=vlsrh($Vu32,$Rt32)", (V6_vlsrh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vlsrhv_altAlias : InstAlias<"$Vd32=vlsrh($Vu32,$Vv32)", (V6_vlsrhv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vlsrhv_alt_128BAlias : InstAlias<"$Vd32=vlsrh($Vu32,$Vv32)", (V6_vlsrhv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vlsrw_altAlias : InstAlias<"$Vd32=vlsrw($Vu32,$Rt32)", (V6_vlsrw VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vlsrw_alt_128BAlias : InstAlias<"$Vd32=vlsrw($Vu32,$Rt32)", (V6_vlsrw VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vlsrwv_altAlias : InstAlias<"$Vd32=vlsrw($Vu32,$Vv32)", (V6_vlsrwv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vlsrwv_alt_128BAlias : InstAlias<"$Vd32=vlsrw($Vu32,$Vv32)", (V6_vlsrwv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxh_altAlias : InstAlias<"$Vd32=vmaxh($Vu32,$Vv32)", (V6_vmaxh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>; // vmax* / vmin* aliases: h, ub, uh and w variants, each with two VectorRegs sources and one VectorRegs destination, listed as plain and _128B with identical content.
+def V6_vmaxh_alt_128BAlias : InstAlias<"$Vd32=vmaxh($Vu32,$Vv32)", (V6_vmaxh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxub_altAlias : InstAlias<"$Vd32=vmaxub($Vu32,$Vv32)", (V6_vmaxub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxub_alt_128BAlias : InstAlias<"$Vd32=vmaxub($Vu32,$Vv32)", (V6_vmaxub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxuh_altAlias : InstAlias<"$Vd32=vmaxuh($Vu32,$Vv32)", (V6_vmaxuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxuh_alt_128BAlias : InstAlias<"$Vd32=vmaxuh($Vu32,$Vv32)", (V6_vmaxuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxw_altAlias : InstAlias<"$Vd32=vmaxw($Vu32,$Vv32)", (V6_vmaxw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmaxw_alt_128BAlias : InstAlias<"$Vd32=vmaxw($Vu32,$Vv32)", (V6_vmaxw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminh_altAlias : InstAlias<"$Vd32=vminh($Vu32,$Vv32)", (V6_vminh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminh_alt_128BAlias : InstAlias<"$Vd32=vminh($Vu32,$Vv32)", (V6_vminh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminub_altAlias : InstAlias<"$Vd32=vminub($Vu32,$Vv32)", (V6_vminub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminub_alt_128BAlias : InstAlias<"$Vd32=vminub($Vu32,$Vv32)", (V6_vminub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminuh_altAlias : InstAlias<"$Vd32=vminuh($Vu32,$Vv32)", (V6_vminuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminuh_alt_128BAlias : InstAlias<"$Vd32=vminuh($Vu32,$Vv32)", (V6_vminuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminw_altAlias : InstAlias<"$Vd32=vminw($Vu32,$Vv32)", (V6_vminw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vminw_alt_128BAlias : InstAlias<"$Vd32=vminw($Vu32,$Vv32)", (V6_vminw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpabus_acc_altAlias : InstAlias<"$Vxx32+=vmpabus($Vuu32,$Rt32)", (V6_vmpabus_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>; // vmpa* aliases: every vector operand is VecDblRegs; the second source is IntRegs:$Rt32, or VecDblRegs:$Vvv32 for the ...v targets (V6_vmpabusv, V6_vmpabuuv).
+def V6_vmpabus_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpabus($Vuu32,$Rt32)", (V6_vmpabus_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpabus_altAlias : InstAlias<"$Vdd32=vmpabus($Vuu32,$Rt32)", (V6_vmpabus VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpabus_alt_128BAlias : InstAlias<"$Vdd32=vmpabus($Vuu32,$Rt32)", (V6_vmpabus VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpabusv_altAlias : InstAlias<"$Vdd32=vmpabus($Vuu32,$Vvv32)", (V6_vmpabusv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vmpabusv_alt_128BAlias : InstAlias<"$Vdd32=vmpabus($Vuu32,$Vvv32)", (V6_vmpabusv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vmpabuuv_altAlias : InstAlias<"$Vdd32=vmpabuu($Vuu32,$Vvv32)", (V6_vmpabuuv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vmpabuuv_alt_128BAlias : InstAlias<"$Vdd32=vmpabuu($Vuu32,$Vvv32)", (V6_vmpabuuv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vmpahb_acc_altAlias : InstAlias<"$Vxx32+=vmpahb($Vuu32,$Rt32)", (V6_vmpahb_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpahb_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpahb($Vuu32,$Rt32)", (V6_vmpahb_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpahb_altAlias : InstAlias<"$Vdd32=vmpahb($Vuu32,$Rt32)", (V6_vmpahb VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpahb_alt_128BAlias : InstAlias<"$Vdd32=vmpahb($Vuu32,$Rt32)", (V6_vmpahb VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpybus_acc_altAlias : InstAlias<"$Vxx32+=vmpybus($Vu32,$Rt32)", (V6_vmpybus_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>; // vmpy* aliases: targets with a VecDblRegs destination ($Vdd32 / $Vxx32) take single-vector sources; "+=" syntax maps to the _acc instruction.
+def V6_vmpybus_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpybus($Vu32,$Rt32)", (V6_vmpybus_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpybus_altAlias : InstAlias<"$Vdd32=vmpybus($Vu32,$Rt32)", (V6_vmpybus VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpybus_alt_128BAlias : InstAlias<"$Vdd32=vmpybus($Vu32,$Rt32)", (V6_vmpybus VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpybusv_acc_altAlias : InstAlias<"$Vxx32+=vmpybus($Vu32,$Vv32)", (V6_vmpybusv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybusv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpybus($Vu32,$Vv32)", (V6_vmpybusv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybusv_altAlias : InstAlias<"$Vdd32=vmpybus($Vu32,$Vv32)", (V6_vmpybusv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybusv_alt_128BAlias : InstAlias<"$Vdd32=vmpybus($Vu32,$Vv32)", (V6_vmpybusv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybv_acc_altAlias : InstAlias<"$Vxx32+=vmpyb($Vu32,$Vv32)", (V6_vmpybv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyb($Vu32,$Vv32)", (V6_vmpybv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybv_altAlias : InstAlias<"$Vdd32=vmpyb($Vu32,$Vv32)", (V6_vmpybv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpybv_alt_128BAlias : InstAlias<"$Vdd32=vmpyb($Vu32,$Vv32)", (V6_vmpybv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyewuh_altAlias : InstAlias<"$Vd32=vmpyewuh($Vu32,$Vv32)", (V6_vmpyewuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyewuh_alt_128BAlias : InstAlias<"$Vd32=vmpyewuh($Vu32,$Vv32)", (V6_vmpyewuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyh_altAlias : InstAlias<"$Vdd32=vmpyh($Vu32,$Rt32)", (V6_vmpyh VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>; // The "vmpyh" mnemonic is shared by several targets below; they differ by destination class, second-source class and suffix text.
+def V6_vmpyh_alt_128BAlias : InstAlias<"$Vdd32=vmpyh($Vu32,$Rt32)", (V6_vmpyh VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhsat_acc_altAlias : InstAlias<"$Vxx32+=vmpyh($Vu32,$Rt32):sat", (V6_vmpyhsat_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhsat_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyh($Vu32,$Rt32):sat", (V6_vmpyhsat_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhsrs_altAlias : InstAlias<"$Vd32=vmpyh($Vu32,$Rt32):<<1:rnd:sat", (V6_vmpyhsrs VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>; // The ":<<1:rnd:sat" and ":<<1:sat" suffixes are literal parts of the alias syntax; they select V6_vmpyhsrs / V6_vmpyhss and add no operand.
+def V6_vmpyhsrs_alt_128BAlias : InstAlias<"$Vd32=vmpyh($Vu32,$Rt32):<<1:rnd:sat", (V6_vmpyhsrs VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhss_altAlias : InstAlias<"$Vd32=vmpyh($Vu32,$Rt32):<<1:sat", (V6_vmpyhss VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhss_alt_128BAlias : InstAlias<"$Vd32=vmpyh($Vu32,$Rt32):<<1:sat", (V6_vmpyhss VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyhus_acc_altAlias : InstAlias<"$Vxx32+=vmpyhus($Vu32,$Vv32)", (V6_vmpyhus_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhus_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyhus($Vu32,$Vv32)", (V6_vmpyhus_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhus_altAlias : InstAlias<"$Vdd32=vmpyhus($Vu32,$Vv32)", (V6_vmpyhus VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhus_alt_128BAlias : InstAlias<"$Vdd32=vmpyhus($Vu32,$Vv32)", (V6_vmpyhus VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhv_acc_altAlias : InstAlias<"$Vxx32+=vmpyh($Vu32,$Vv32)", (V6_vmpyhv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyh($Vu32,$Vv32)", (V6_vmpyhv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhv_altAlias : InstAlias<"$Vdd32=vmpyh($Vu32,$Vv32)", (V6_vmpyhv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhv_alt_128BAlias : InstAlias<"$Vdd32=vmpyh($Vu32,$Vv32)", (V6_vmpyhv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhvsrs_altAlias : InstAlias<"$Vd32=vmpyh($Vu32,$Vv32):<<1:rnd:sat", (V6_vmpyhvsrs VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyhvsrs_alt_128BAlias : InstAlias<"$Vd32=vmpyh($Vu32,$Vv32):<<1:rnd:sat", (V6_vmpyhvsrs VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiewh_acc_altAlias : InstAlias<"$Vx32+=vmpyiewh($Vu32,$Vv32)", (V6_vmpyiewh_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>; // vmpyi* aliases: destination and both sources are single VectorRegs.
+def V6_vmpyiewh_acc_alt_128BAlias : InstAlias<"$Vx32+=vmpyiewh($Vu32,$Vv32)", (V6_vmpyiewh_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiewuh_acc_altAlias : InstAlias<"$Vx32+=vmpyiewuh($Vu32,$Vv32)", (V6_vmpyiewuh_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiewuh_acc_alt_128BAlias : InstAlias<"$Vx32+=vmpyiewuh($Vu32,$Vv32)", (V6_vmpyiewuh_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiewuh_altAlias : InstAlias<"$Vd32=vmpyiewuh($Vu32,$Vv32)", (V6_vmpyiewuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiewuh_alt_128BAlias : InstAlias<"$Vd32=vmpyiewuh($Vu32,$Vv32)", (V6_vmpyiewuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyih_acc_altAlias : InstAlias<"$Vx32+=vmpyih($Vu32,$Vv32)", (V6_vmpyih_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyih_acc_alt_128BAlias : InstAlias<"$Vx32+=vmpyih($Vu32,$Vv32)", (V6_vmpyih_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyih_altAlias : InstAlias<"$Vd32=vmpyih($Vu32,$Vv32)", (V6_vmpyih VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyih_alt_128BAlias : InstAlias<"$Vd32=vmpyih($Vu32,$Vv32)", (V6_vmpyih VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyihb_acc_altAlias : InstAlias<"$Vx32+=vmpyihb($Vu32,$Rt32)", (V6_vmpyihb_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyihb_acc_alt_128BAlias : InstAlias<"$Vx32+=vmpyihb($Vu32,$Rt32)", (V6_vmpyihb_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyihb_altAlias : InstAlias<"$Vd32=vmpyihb($Vu32,$Rt32)", (V6_vmpyihb VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyihb_alt_128BAlias : InstAlias<"$Vd32=vmpyihb($Vu32,$Rt32)", (V6_vmpyihb VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiowh_altAlias : InstAlias<"$Vd32=vmpyiowh($Vu32,$Vv32)", (V6_vmpyiowh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiowh_alt_128BAlias : InstAlias<"$Vd32=vmpyiowh($Vu32,$Vv32)", (V6_vmpyiowh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyiwb_acc_altAlias : InstAlias<"$Vx32+=vmpyiwb($Vu32,$Rt32)", (V6_vmpyiwb_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwb_acc_alt_128BAlias : InstAlias<"$Vx32+=vmpyiwb($Vu32,$Rt32)", (V6_vmpyiwb_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwb_altAlias : InstAlias<"$Vd32=vmpyiwb($Vu32,$Rt32)", (V6_vmpyiwb VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwb_alt_128BAlias : InstAlias<"$Vd32=vmpyiwb($Vu32,$Rt32)", (V6_vmpyiwb VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwh_acc_altAlias : InstAlias<"$Vx32+=vmpyiwh($Vu32,$Rt32)", (V6_vmpyiwh_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwh_acc_alt_128BAlias : InstAlias<"$Vx32+=vmpyiwh($Vu32,$Rt32)", (V6_vmpyiwh_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwh_altAlias : InstAlias<"$Vd32=vmpyiwh($Vu32,$Rt32)", (V6_vmpyiwh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyiwh_alt_128BAlias : InstAlias<"$Vd32=vmpyiwh($Vu32,$Rt32)", (V6_vmpyiwh VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyowh_altAlias : InstAlias<"$Vd32=vmpyowh($Vu32,$Vv32):<<1:sat", (V6_vmpyowh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyowh_alt_128BAlias : InstAlias<"$Vd32=vmpyowh($Vu32,$Vv32):<<1:sat", (V6_vmpyowh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyowh_rnd_altAlias : InstAlias<"$Vd32=vmpyowh($Vu32,$Vv32):<<1:rnd:sat", (V6_vmpyowh_rnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyowh_rnd_alt_128BAlias : InstAlias<"$Vd32=vmpyowh($Vu32,$Vv32):<<1:rnd:sat", (V6_vmpyowh_rnd VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyub_acc_altAlias : InstAlias<"$Vxx32+=vmpyub($Vu32,$Rt32)", (V6_vmpyub_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyub_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyub($Vu32,$Rt32)", (V6_vmpyub_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyub_altAlias : InstAlias<"$Vdd32=vmpyub($Vu32,$Rt32)", (V6_vmpyub VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyub_alt_128BAlias : InstAlias<"$Vdd32=vmpyub($Vu32,$Rt32)", (V6_vmpyub VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyubv_acc_altAlias : InstAlias<"$Vxx32+=vmpyub($Vu32,$Vv32)", (V6_vmpyubv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyubv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyub($Vu32,$Vv32)", (V6_vmpyubv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyubv_altAlias : InstAlias<"$Vdd32=vmpyub($Vu32,$Vv32)", (V6_vmpyubv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyubv_alt_128BAlias : InstAlias<"$Vdd32=vmpyub($Vu32,$Vv32)", (V6_vmpyubv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyuh_acc_altAlias : InstAlias<"$Vxx32+=vmpyuh($Vu32,$Rt32)", (V6_vmpyuh_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyuh_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyuh($Vu32,$Rt32)", (V6_vmpyuh_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyuh_altAlias : InstAlias<"$Vdd32=vmpyuh($Vu32,$Rt32)", (V6_vmpyuh VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyuh_alt_128BAlias : InstAlias<"$Vdd32=vmpyuh($Vu32,$Rt32)", (V6_vmpyuh VecDblRegs:$Vdd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vmpyuhv_acc_altAlias : InstAlias<"$Vxx32+=vmpyuh($Vu32,$Vv32)", (V6_vmpyuhv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyuhv_acc_alt_128BAlias : InstAlias<"$Vxx32+=vmpyuh($Vu32,$Vv32)", (V6_vmpyuhv_acc VecDblRegs:$Vxx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyuhv_altAlias : InstAlias<"$Vdd32=vmpyuh($Vu32,$Vv32)", (V6_vmpyuhv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vmpyuhv_alt_128BAlias : InstAlias<"$Vdd32=vmpyuh($Vu32,$Vv32)", (V6_vmpyuhv VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnavgh_altAlias : InstAlias<"$Vd32=vnavgh($Vu32,$Vv32)", (V6_vnavgh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnavgh_alt_128BAlias : InstAlias<"$Vd32=vnavgh($Vu32,$Vv32)", (V6_vnavgh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnavgub_altAlias : InstAlias<"$Vd32=vnavgub($Vu32,$Vv32)", (V6_vnavgub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnavgub_alt_128BAlias : InstAlias<"$Vd32=vnavgub($Vu32,$Vv32)", (V6_vnavgub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnavgw_altAlias : InstAlias<"$Vd32=vnavgw($Vu32,$Vv32)", (V6_vnavgw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnavgw_alt_128BAlias : InstAlias<"$Vd32=vnavgw($Vu32,$Vv32)", (V6_vnavgw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vnormamth_altAlias : InstAlias<"$Vd32=vnormamth($Vu32)", (V6_vnormamth VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vnormamth_alt_128BAlias : InstAlias<"$Vd32=vnormamth($Vu32)", (V6_vnormamth VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vnormamtw_altAlias : InstAlias<"$Vd32=vnormamtw($Vu32)", (V6_vnormamtw VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vnormamtw_alt_128BAlias : InstAlias<"$Vd32=vnormamtw($Vu32)", (V6_vnormamtw VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vpackeb_altAlias : InstAlias<"$Vd32=vpackeb($Vu32,$Vv32)", (V6_vpackeb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackeb_alt_128BAlias : InstAlias<"$Vd32=vpackeb($Vu32,$Vv32)", (V6_vpackeb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackeh_altAlias : InstAlias<"$Vd32=vpackeh($Vu32,$Vv32)", (V6_vpackeh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackeh_alt_128BAlias : InstAlias<"$Vd32=vpackeh($Vu32,$Vv32)", (V6_vpackeh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackhb_sat_altAlias : InstAlias<"$Vd32=vpackhb($Vu32,$Vv32):sat", (V6_vpackhb_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackhb_sat_alt_128BAlias : InstAlias<"$Vd32=vpackhb($Vu32,$Vv32):sat", (V6_vpackhb_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackhub_sat_altAlias : InstAlias<"$Vd32=vpackhub($Vu32,$Vv32):sat", (V6_vpackhub_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackhub_sat_alt_128BAlias : InstAlias<"$Vd32=vpackhub($Vu32,$Vv32):sat", (V6_vpackhub_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackob_altAlias : InstAlias<"$Vd32=vpackob($Vu32,$Vv32)", (V6_vpackob VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackob_alt_128BAlias : InstAlias<"$Vd32=vpackob($Vu32,$Vv32)", (V6_vpackob VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackoh_altAlias : InstAlias<"$Vd32=vpackoh($Vu32,$Vv32)", (V6_vpackoh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackoh_alt_128BAlias : InstAlias<"$Vd32=vpackoh($Vu32,$Vv32)", (V6_vpackoh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackwh_sat_altAlias : InstAlias<"$Vd32=vpackwh($Vu32,$Vv32):sat", (V6_vpackwh_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackwh_sat_alt_128BAlias : InstAlias<"$Vd32=vpackwh($Vu32,$Vv32):sat", (V6_vpackwh_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackwuh_sat_altAlias : InstAlias<"$Vd32=vpackwuh($Vu32,$Vv32):sat", (V6_vpackwuh_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpackwuh_sat_alt_128BAlias : InstAlias<"$Vd32=vpackwuh($Vu32,$Vv32):sat", (V6_vpackwuh_sat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vpopcounth_altAlias : InstAlias<"$Vd32=vpopcounth($Vu32)", (V6_vpopcounth VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vpopcounth_alt_128BAlias : InstAlias<"$Vd32=vpopcounth($Vu32)", (V6_vpopcounth VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vrmpybus_acc_altAlias : InstAlias<"$Vx32+=vrmpybus($Vu32,$Rt32)", (V6_vrmpybus_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpybus_acc_alt_128BAlias : InstAlias<"$Vx32+=vrmpybus($Vu32,$Rt32)", (V6_vrmpybus_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpybus_altAlias : InstAlias<"$Vd32=vrmpybus($Vu32,$Rt32)", (V6_vrmpybus VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpybus_alt_128BAlias : InstAlias<"$Vd32=vrmpybus($Vu32,$Rt32)", (V6_vrmpybus VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpybusi_acc_altAlias : InstAlias<"$Vxx32+=vrmpybus($Vuu32,$Rt32,#$Ii)", (V6_vrmpybusi_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpybusi_acc_alt_128BAlias : InstAlias<"$Vxx32+=vrmpybus($Vuu32,$Rt32,#$Ii)", (V6_vrmpybusi_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpybusi_altAlias : InstAlias<"$Vdd32=vrmpybus($Vuu32,$Rt32,#$Ii)", (V6_vrmpybusi VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpybusi_alt_128BAlias : InstAlias<"$Vdd32=vrmpybus($Vuu32,$Rt32,#$Ii)", (V6_vrmpybusi VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpybusv_acc_altAlias : InstAlias<"$Vx32+=vrmpybus($Vu32,$Vv32)", (V6_vrmpybusv_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybusv_acc_alt_128BAlias : InstAlias<"$Vx32+=vrmpybus($Vu32,$Vv32)", (V6_vrmpybusv_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybusv_altAlias : InstAlias<"$Vd32=vrmpybus($Vu32,$Vv32)", (V6_vrmpybusv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybusv_alt_128BAlias : InstAlias<"$Vd32=vrmpybus($Vu32,$Vv32)", (V6_vrmpybusv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybv_acc_altAlias : InstAlias<"$Vx32+=vrmpyb($Vu32,$Vv32)", (V6_vrmpybv_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybv_acc_alt_128BAlias : InstAlias<"$Vx32+=vrmpyb($Vu32,$Vv32)", (V6_vrmpybv_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybv_altAlias : InstAlias<"$Vd32=vrmpyb($Vu32,$Vv32)", (V6_vrmpybv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpybv_alt_128BAlias : InstAlias<"$Vd32=vrmpyb($Vu32,$Vv32)", (V6_vrmpybv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpyub_acc_altAlias : InstAlias<"$Vx32+=vrmpyub($Vu32,$Rt32)", (V6_vrmpyub_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpyub_acc_alt_128BAlias : InstAlias<"$Vx32+=vrmpyub($Vu32,$Rt32)", (V6_vrmpyub_acc VectorRegs:$Vx32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpyub_altAlias : InstAlias<"$Vd32=vrmpyub($Vu32,$Rt32)", (V6_vrmpyub VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpyub_alt_128BAlias : InstAlias<"$Vd32=vrmpyub($Vu32,$Rt32)", (V6_vrmpyub VectorRegs:$Vd32, VectorRegs:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vrmpyubi_acc_altAlias : InstAlias<"$Vxx32+=vrmpyub($Vuu32,$Rt32,#$Ii)", (V6_vrmpyubi_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpyubi_acc_alt_128BAlias : InstAlias<"$Vxx32+=vrmpyub($Vuu32,$Rt32,#$Ii)", (V6_vrmpyubi_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpyubi_altAlias : InstAlias<"$Vdd32=vrmpyub($Vuu32,$Rt32,#$Ii)", (V6_vrmpyubi VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpyubi_alt_128BAlias : InstAlias<"$Vdd32=vrmpyub($Vuu32,$Rt32,#$Ii)", (V6_vrmpyubi VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrmpyubv_acc_altAlias : InstAlias<"$Vx32+=vrmpyub($Vu32,$Vv32)", (V6_vrmpyubv_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpyubv_acc_alt_128BAlias : InstAlias<"$Vx32+=vrmpyub($Vu32,$Vv32)", (V6_vrmpyubv_acc VectorRegs:$Vx32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpyubv_altAlias : InstAlias<"$Vd32=vrmpyub($Vu32,$Vv32)", (V6_vrmpyubv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrmpyubv_alt_128BAlias : InstAlias<"$Vd32=vrmpyub($Vu32,$Vv32)", (V6_vrmpyubv VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundhb_altAlias : InstAlias<"$Vd32=vroundhb($Vu32,$Vv32):sat", (V6_vroundhb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundhb_alt_128BAlias : InstAlias<"$Vd32=vroundhb($Vu32,$Vv32):sat", (V6_vroundhb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundhub_altAlias : InstAlias<"$Vd32=vroundhub($Vu32,$Vv32):sat", (V6_vroundhub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundhub_alt_128BAlias : InstAlias<"$Vd32=vroundhub($Vu32,$Vv32):sat", (V6_vroundhub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundwh_altAlias : InstAlias<"$Vd32=vroundwh($Vu32,$Vv32):sat", (V6_vroundwh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundwh_alt_128BAlias : InstAlias<"$Vd32=vroundwh($Vu32,$Vv32):sat", (V6_vroundwh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundwuh_altAlias : InstAlias<"$Vd32=vroundwuh($Vu32,$Vv32):sat", (V6_vroundwuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vroundwuh_alt_128BAlias : InstAlias<"$Vd32=vroundwuh($Vu32,$Vv32):sat", (V6_vroundwuh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vrsadubi_acc_altAlias : InstAlias<"$Vxx32+=vrsadub($Vuu32,$Rt32,#$Ii)", (V6_vrsadubi_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrsadubi_acc_alt_128BAlias : InstAlias<"$Vxx32+=vrsadub($Vuu32,$Rt32,#$Ii)", (V6_vrsadubi_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrsadubi_altAlias : InstAlias<"$Vdd32=vrsadub($Vuu32,$Rt32,#$Ii)", (V6_vrsadubi VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vrsadubi_alt_128BAlias : InstAlias<"$Vdd32=vrsadub($Vuu32,$Rt32,#$Ii)", (V6_vrsadubi VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32, u1_0Imm:$Ii)>, Requires<[UseHVX]>;
+def V6_vsathub_altAlias : InstAlias<"$Vd32=vsathub($Vu32,$Vv32)", (V6_vsathub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsathub_alt_128BAlias : InstAlias<"$Vd32=vsathub($Vu32,$Vv32)", (V6_vsathub VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsatwh_altAlias : InstAlias<"$Vd32=vsatwh($Vu32,$Vv32)", (V6_vsatwh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsatwh_alt_128BAlias : InstAlias<"$Vd32=vsatwh($Vu32,$Vv32)", (V6_vsatwh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsb_altAlias : InstAlias<"$Vdd32=vsxtb($Vu32)", (V6_vsb VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsb_alt_128BAlias : InstAlias<"$Vdd32=vsxtb($Vu32)", (V6_vsb VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsh_altAlias : InstAlias<"$Vdd32=vsxth($Vu32)", (V6_vsh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsh_alt_128BAlias : InstAlias<"$Vdd32=vsxth($Vu32)", (V6_vsh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vshufeh_altAlias : InstAlias<"$Vd32=vshuffeh($Vu32,$Vv32)", (V6_vshufeh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufeh_alt_128BAlias : InstAlias<"$Vd32=vshuffeh($Vu32,$Vv32)", (V6_vshufeh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshuffb_altAlias : InstAlias<"$Vd32=vshuffb($Vu32)", (V6_vshuffb VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vshuffb_alt_128BAlias : InstAlias<"$Vd32=vshuffb($Vu32)", (V6_vshuffb VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vshuffeb_altAlias : InstAlias<"$Vd32=vshuffeb($Vu32,$Vv32)", (V6_vshuffeb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshuffeb_alt_128BAlias : InstAlias<"$Vd32=vshuffeb($Vu32,$Vv32)", (V6_vshuffeb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshuffh_altAlias : InstAlias<"$Vd32=vshuffh($Vu32)", (V6_vshuffh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vshuffh_alt_128BAlias : InstAlias<"$Vd32=vshuffh($Vu32)", (V6_vshuffh VectorRegs:$Vd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vshuffob_altAlias : InstAlias<"$Vd32=vshuffob($Vu32,$Vv32)", (V6_vshuffob VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshuffob_alt_128BAlias : InstAlias<"$Vd32=vshuffob($Vu32,$Vv32)", (V6_vshuffob VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufoeb_altAlias : InstAlias<"$Vdd32=vshuffoeb($Vu32,$Vv32)", (V6_vshufoeb VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufoeb_alt_128BAlias : InstAlias<"$Vdd32=vshuffoeb($Vu32,$Vv32)", (V6_vshufoeb VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufoeh_altAlias : InstAlias<"$Vdd32=vshuffoeh($Vu32,$Vv32)", (V6_vshufoeh VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufoeh_alt_128BAlias : InstAlias<"$Vdd32=vshuffoeh($Vu32,$Vv32)", (V6_vshufoeh VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufoh_altAlias : InstAlias<"$Vd32=vshuffoh($Vu32,$Vv32)", (V6_vshufoh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vshufoh_alt_128BAlias : InstAlias<"$Vd32=vshuffoh($Vu32,$Vv32)", (V6_vshufoh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubb_altAlias : InstAlias<"$Vd32=vsubb($Vu32,$Vv32)", (V6_vsubb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubb_alt_128BAlias : InstAlias<"$Vd32=vsubb($Vu32,$Vv32)", (V6_vsubb VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubb_dv_altAlias : InstAlias<"$Vdd32=vsubb($Vuu32,$Vvv32)", (V6_vsubb_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubb_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubb($Vuu32,$Vvv32)", (V6_vsubb_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubbnq_altAlias : InstAlias<"if (!$Qv4.b) $Vx32.b-=$Vu32.b", (V6_vsubbnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubbnq_alt_128BAlias : InstAlias<"if (!$Qv4.b) $Vx32.b-=$Vu32.b", (V6_vsubbnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubbq_altAlias : InstAlias<"if ($Qv4.b) $Vx32.b-=$Vu32.b", (V6_vsubbq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubbq_alt_128BAlias : InstAlias<"if ($Qv4.b) $Vx32.b-=$Vu32.b", (V6_vsubbq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubh_altAlias : InstAlias<"$Vd32=vsubh($Vu32,$Vv32)", (V6_vsubh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubh_alt_128BAlias : InstAlias<"$Vd32=vsubh($Vu32,$Vv32)", (V6_vsubh VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubh_dv_altAlias : InstAlias<"$Vdd32=vsubh($Vuu32,$Vvv32)", (V6_vsubh_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubh_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubh($Vuu32,$Vvv32)", (V6_vsubh_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubhnq_altAlias : InstAlias<"if (!$Qv4.h) $Vx32.h-=$Vu32.h", (V6_vsubhnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubhnq_alt_128BAlias : InstAlias<"if (!$Qv4.h) $Vx32.h-=$Vu32.h", (V6_vsubhnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubhq_altAlias : InstAlias<"if ($Qv4.h) $Vx32.h-=$Vu32.h", (V6_vsubhq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubhq_alt_128BAlias : InstAlias<"if ($Qv4.h) $Vx32.h-=$Vu32.h", (V6_vsubhq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubhsat_altAlias : InstAlias<"$Vd32=vsubh($Vu32,$Vv32):sat", (V6_vsubhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubhsat_alt_128BAlias : InstAlias<"$Vd32=vsubh($Vu32,$Vv32):sat", (V6_vsubhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubhsat_dv_altAlias : InstAlias<"$Vdd32=vsubh($Vuu32,$Vvv32):sat", (V6_vsubhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubhsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubh($Vuu32,$Vvv32):sat", (V6_vsubhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubhw_altAlias : InstAlias<"$Vdd32=vsubh($Vu32,$Vv32)", (V6_vsubhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubhw_alt_128BAlias : InstAlias<"$Vdd32=vsubh($Vu32,$Vv32)", (V6_vsubhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsububh_altAlias : InstAlias<"$Vdd32=vsubub($Vu32,$Vv32)", (V6_vsububh VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsububh_alt_128BAlias : InstAlias<"$Vdd32=vsubub($Vu32,$Vv32)", (V6_vsububh VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsububsat_altAlias : InstAlias<"$Vd32=vsubub($Vu32,$Vv32):sat", (V6_vsububsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsububsat_alt_128BAlias : InstAlias<"$Vd32=vsubub($Vu32,$Vv32):sat", (V6_vsububsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsububsat_dv_altAlias : InstAlias<"$Vdd32=vsubub($Vuu32,$Vvv32):sat", (V6_vsububsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsububsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubub($Vuu32,$Vvv32):sat", (V6_vsububsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubuhsat_altAlias : InstAlias<"$Vd32=vsubuh($Vu32,$Vv32):sat", (V6_vsubuhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubuhsat_alt_128BAlias : InstAlias<"$Vd32=vsubuh($Vu32,$Vv32):sat", (V6_vsubuhsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubuhsat_dv_altAlias : InstAlias<"$Vdd32=vsubuh($Vuu32,$Vvv32):sat", (V6_vsubuhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubuhsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubuh($Vuu32,$Vvv32):sat", (V6_vsubuhsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubuhw_altAlias : InstAlias<"$Vdd32=vsubuh($Vu32,$Vv32)", (V6_vsubuhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubuhw_alt_128BAlias : InstAlias<"$Vdd32=vsubuh($Vu32,$Vv32)", (V6_vsubuhw VecDblRegs:$Vdd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubw_altAlias : InstAlias<"$Vd32=vsubw($Vu32,$Vv32)", (V6_vsubw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubw_alt_128BAlias : InstAlias<"$Vd32=vsubw($Vu32,$Vv32)", (V6_vsubw VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubw_dv_altAlias : InstAlias<"$Vdd32=vsubw($Vuu32,$Vvv32)", (V6_vsubw_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubw_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubw($Vuu32,$Vvv32)", (V6_vsubw_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubwnq_altAlias : InstAlias<"if (!$Qv4.w) $Vx32.w-=$Vu32.w", (V6_vsubwnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubwnq_alt_128BAlias : InstAlias<"if (!$Qv4.w) $Vx32.w-=$Vu32.w", (V6_vsubwnq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubwq_altAlias : InstAlias<"if ($Qv4.w) $Vx32.w-=$Vu32.w", (V6_vsubwq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubwq_alt_128BAlias : InstAlias<"if ($Qv4.w) $Vx32.w-=$Vu32.w", (V6_vsubwq VectorRegs:$Vx32, VecPredRegs:$Qv4, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vsubwsat_altAlias : InstAlias<"$Vd32=vsubw($Vu32,$Vv32):sat", (V6_vsubwsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubwsat_alt_128BAlias : InstAlias<"$Vd32=vsubw($Vu32,$Vv32):sat", (V6_vsubwsat VectorRegs:$Vd32, VectorRegs:$Vu32, VectorRegs:$Vv32)>, Requires<[UseHVX]>;
+def V6_vsubwsat_dv_altAlias : InstAlias<"$Vdd32=vsubw($Vuu32,$Vvv32):sat", (V6_vsubwsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vsubwsat_dv_alt_128BAlias : InstAlias<"$Vdd32=vsubw($Vuu32,$Vvv32):sat", (V6_vsubwsat_dv VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, VecDblRegs:$Vvv32)>, Requires<[UseHVX]>;
+def V6_vtmpyb_acc_altAlias : InstAlias<"$Vxx32+=vtmpyb($Vuu32,$Rt32)", (V6_vtmpyb_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyb_acc_alt_128BAlias : InstAlias<"$Vxx32+=vtmpyb($Vuu32,$Rt32)", (V6_vtmpyb_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyb_altAlias : InstAlias<"$Vdd32=vtmpyb($Vuu32,$Rt32)", (V6_vtmpyb VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyb_alt_128BAlias : InstAlias<"$Vdd32=vtmpyb($Vuu32,$Rt32)", (V6_vtmpyb VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpybus_acc_altAlias : InstAlias<"$Vxx32+=vtmpybus($Vuu32,$Rt32)", (V6_vtmpybus_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpybus_acc_alt_128BAlias : InstAlias<"$Vxx32+=vtmpybus($Vuu32,$Rt32)", (V6_vtmpybus_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpybus_altAlias : InstAlias<"$Vdd32=vtmpybus($Vuu32,$Rt32)", (V6_vtmpybus VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpybus_alt_128BAlias : InstAlias<"$Vdd32=vtmpybus($Vuu32,$Rt32)", (V6_vtmpybus VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyhb_acc_altAlias : InstAlias<"$Vxx32+=vtmpyhb($Vuu32,$Rt32)", (V6_vtmpyhb_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyhb_acc_alt_128BAlias : InstAlias<"$Vxx32+=vtmpyhb($Vuu32,$Rt32)", (V6_vtmpyhb_acc VecDblRegs:$Vxx32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyhb_altAlias : InstAlias<"$Vdd32=vtmpyhb($Vuu32,$Rt32)", (V6_vtmpyhb VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtmpyhb_alt_128BAlias : InstAlias<"$Vdd32=vtmpyhb($Vuu32,$Rt32)", (V6_vtmpyhb VecDblRegs:$Vdd32, VecDblRegs:$Vuu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtran2x2_mapAlias : InstAlias<"vtrans2x2($Vy32,$Vx32,$Rt32)", (V6_vshuff VectorRegs:$Vy32, VectorRegs:$Vx32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vtran2x2_map_128BAlias : InstAlias<"vtrans2x2($Vy32,$Vx32,$Rt32)", (V6_vshuff VectorRegs:$Vy32, VectorRegs:$Vx32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
+def V6_vunpackb_altAlias : InstAlias<"$Vdd32=vunpackb($Vu32)", (V6_vunpackb VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackb_alt_128BAlias : InstAlias<"$Vdd32=vunpackb($Vu32)", (V6_vunpackb VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackh_altAlias : InstAlias<"$Vdd32=vunpackh($Vu32)", (V6_vunpackh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackh_alt_128BAlias : InstAlias<"$Vdd32=vunpackh($Vu32)", (V6_vunpackh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackoh_altAlias : InstAlias<"$Vxx32|=vunpackoh($Vu32)", (V6_vunpackoh VecDblRegs:$Vxx32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackoh_alt_128BAlias : InstAlias<"$Vxx32|=vunpackoh($Vu32)", (V6_vunpackoh VecDblRegs:$Vxx32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackub_altAlias : InstAlias<"$Vdd32=vunpackub($Vu32)", (V6_vunpackub VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackub_alt_128BAlias : InstAlias<"$Vdd32=vunpackub($Vu32)", (V6_vunpackub VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackuh_altAlias : InstAlias<"$Vdd32=vunpackuh($Vu32)", (V6_vunpackuh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vunpackuh_alt_128BAlias : InstAlias<"$Vdd32=vunpackuh($Vu32)", (V6_vunpackuh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vzb_altAlias : InstAlias<"$Vdd32=vzxtb($Vu32)", (V6_vzb VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vzb_alt_128BAlias : InstAlias<"$Vdd32=vzxtb($Vu32)", (V6_vzb VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vzh_altAlias : InstAlias<"$Vdd32=vzxth($Vu32)", (V6_vzh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def V6_vzh_alt_128BAlias : InstAlias<"$Vdd32=vzxth($Vu32)", (V6_vzh VecDblRegs:$Vdd32, VectorRegs:$Vu32)>, Requires<[UseHVX]>;
+def Y2_dcfetchAlias : InstAlias<"dcfetch($Rs32)", (Y2_dcfetchbo IntRegs:$Rs32, 0)>;
diff --git a/lib/Target/Hexagon/HexagonDepOperands.td b/lib/Target/Hexagon/HexagonDepOperands.td
new file mode 100644
index 0000000000000000000000000000000000000000..0e83b267873224cd4132ab917c27d909923e8b8e
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonDepOperands.td
@@ -0,0 +1,132 @@
+//===--- HexagonDepOperands.td --------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+def s3_0ImmOperand : AsmOperandClass { let Name = "s3_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s3_0Imm : Operand<i32> { let ParserMatchClass = s3_0ImmOperand; let DecoderMethod = "s3_0ImmDecoder"; }
+def s3_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<3, 0>(N->getSExtValue());}]>;
+def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand; let DecoderMethod = "s4_0ImmDecoder"; }
+def s4_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 0>(N->getSExtValue());}]>;
+def s29_3ImmOperand : AsmOperandClass { let Name = "s29_3Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s29_3Imm : Operand<i32> { let ParserMatchClass = s29_3ImmOperand; let DecoderMethod = "s29_3ImmDecoder"; }
+def s29_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 3>(N->getSExtValue());}]>;
+def s10_6ImmOperand : AsmOperandClass { let Name = "s10_6Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s10_6Imm : Operand<i32> { let ParserMatchClass = s10_6ImmOperand; let DecoderMethod = "s10_6ImmDecoder"; }
+def s10_6ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<10, 6>(N->getSExtValue());}]>;
+def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; let RenderMethod = "addImmOperands"; }
+def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u6_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 0>(N->getSExtValue());}]>;
+def a30_2ImmOperand : AsmOperandClass { let Name = "a30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def a30_2Imm : Operand<i32> { let ParserMatchClass = a30_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
+def a30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
+def u29_3ImmOperand : AsmOperandClass { let Name = "u29_3Imm"; let RenderMethod = "addImmOperands"; }
+def u29_3Imm : Operand<i32> { let ParserMatchClass = u29_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u29_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 3>(N->getSExtValue());}]>;
+def s8_0ImmOperand : AsmOperandClass { let Name = "s8_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s8_0Imm : Operand<i32> { let ParserMatchClass = s8_0ImmOperand; let DecoderMethod = "s8_0ImmDecoder"; }
+def s8_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<8, 0>(N->getSExtValue());}]>;
+def u32_0ImmOperand : AsmOperandClass { let Name = "u32_0Imm"; let RenderMethod = "addImmOperands"; }
+def u32_0Imm : Operand<i32> { let ParserMatchClass = u32_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u32_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 0>(N->getSExtValue());}]>;
+def u4_2ImmOperand : AsmOperandClass { let Name = "u4_2Imm"; let RenderMethod = "addImmOperands"; }
+def u4_2Imm : Operand<i32> { let ParserMatchClass = u4_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u4_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<4, 2>(N->getSExtValue());}]>;
+def u3_0ImmOperand : AsmOperandClass { let Name = "u3_0Imm"; let RenderMethod = "addImmOperands"; }
+def u3_0Imm : Operand<i32> { let ParserMatchClass = u3_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u3_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<3, 0>(N->getSExtValue());}]>;
+def b15_2ImmOperand : AsmOperandClass { let Name = "b15_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def b15_2Imm : Operand<OtherVT> { let ParserMatchClass = b15_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
+def b15_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<15, 2>(N->getSExtValue());}]>;
+def u11_3ImmOperand : AsmOperandClass { let Name = "u11_3Imm"; let RenderMethod = "addImmOperands"; }
+def u11_3Imm : Operand<i32> { let ParserMatchClass = u11_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u11_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<11, 3>(N->getSExtValue());}]>;
+def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_3Imm : Operand<i32> { let ParserMatchClass = s4_3ImmOperand; let DecoderMethod = "s4_3ImmDecoder"; }
+def s4_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 3>(N->getSExtValue());}]>;
+def m32_0ImmOperand : AsmOperandClass { let Name = "m32_0Imm"; let RenderMethod = "addImmOperands"; }
+def m32_0Imm : Operand<i32> { let ParserMatchClass = m32_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def m32_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 0>(N->getSExtValue());}]>;
+def u3_1ImmOperand : AsmOperandClass { let Name = "u3_1Imm"; let RenderMethod = "addImmOperands"; }
+def u3_1Imm : Operand<i32> { let ParserMatchClass = u3_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u3_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<3, 1>(N->getSExtValue());}]>;
+def u1_0ImmOperand : AsmOperandClass { let Name = "u1_0Imm"; let RenderMethod = "addImmOperands"; }
+def u1_0Imm : Operand<i32> { let ParserMatchClass = u1_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u1_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<1, 0>(N->getSExtValue());}]>;
+def s31_1ImmOperand : AsmOperandClass { let Name = "s31_1Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s31_1Imm : Operand<i32> { let ParserMatchClass = s31_1ImmOperand; let DecoderMethod = "s31_1ImmDecoder"; }
+def s31_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 1>(N->getSExtValue());}]>;
+def s30_2ImmOperand : AsmOperandClass { let Name = "s30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s30_2Imm : Operand<i32> { let ParserMatchClass = s30_2ImmOperand; let DecoderMethod = "s30_2ImmDecoder"; }
+def s30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
+def u4_0ImmOperand : AsmOperandClass { let Name = "u4_0Imm"; let RenderMethod = "addImmOperands"; }
+def u4_0Imm : Operand<i32> { let ParserMatchClass = u4_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u4_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<4, 0>(N->getSExtValue());}]>;
+def s6_0ImmOperand : AsmOperandClass { let Name = "s6_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s6_0Imm : Operand<i32> { let ParserMatchClass = s6_0ImmOperand; let DecoderMethod = "s6_0ImmDecoder"; }
+def s6_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<6, 0>(N->getSExtValue());}]>;
+def u5_3ImmOperand : AsmOperandClass { let Name = "u5_3Imm"; let RenderMethod = "addImmOperands"; }
+def u5_3Imm : Operand<i32> { let ParserMatchClass = u5_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u5_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<5, 3>(N->getSExtValue());}]>;
+def s32_0ImmOperand : AsmOperandClass { let Name = "s32_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s32_0Imm : Operand<i32> { let ParserMatchClass = s32_0ImmOperand; let DecoderMethod = "s32_0ImmDecoder"; }
+def s32_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 0>(N->getSExtValue());}]>;
+def s6_3ImmOperand : AsmOperandClass { let Name = "s6_3Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s6_3Imm : Operand<i32> { let ParserMatchClass = s6_3ImmOperand; let DecoderMethod = "s6_3ImmDecoder"; }
+def s6_3ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<6, 3>(N->getSExtValue());}]>;
+def u10_0ImmOperand : AsmOperandClass { let Name = "u10_0Imm"; let RenderMethod = "addImmOperands"; }
+def u10_0Imm : Operand<i32> { let ParserMatchClass = u10_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u10_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<10, 0>(N->getSExtValue());}]>;
+def u31_1ImmOperand : AsmOperandClass { let Name = "u31_1Imm"; let RenderMethod = "addImmOperands"; }
+def u31_1Imm : Operand<i32> { let ParserMatchClass = u31_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u31_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 1>(N->getSExtValue());}]>;
+def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_1Imm : Operand<i32> { let ParserMatchClass = s4_1ImmOperand; let DecoderMethod = "s4_1ImmDecoder"; }
+def s4_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 1>(N->getSExtValue());}]>;
+def u16_0ImmOperand : AsmOperandClass { let Name = "u16_0Imm"; let RenderMethod = "addImmOperands"; }
+def u16_0Imm : Operand<i32> { let ParserMatchClass = u16_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u16_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<16, 0>(N->getSExtValue());}]>;
+def u6_1ImmOperand : AsmOperandClass { let Name = "u6_1Imm"; let RenderMethod = "addImmOperands"; }
+def u6_1Imm : Operand<i32> { let ParserMatchClass = u6_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u6_1ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 1>(N->getSExtValue());}]>;
+def u5_2ImmOperand : AsmOperandClass { let Name = "u5_2Imm"; let RenderMethod = "addImmOperands"; }
+def u5_2Imm : Operand<i32> { let ParserMatchClass = u5_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u5_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<5, 2>(N->getSExtValue());}]>;
+def u26_6ImmOperand : AsmOperandClass { let Name = "u26_6Imm"; let RenderMethod = "addImmOperands"; }
+def u26_6Imm : Operand<i32> { let ParserMatchClass = u26_6ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u26_6ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<26, 6>(N->getSExtValue());}]>;
+def u6_2ImmOperand : AsmOperandClass { let Name = "u6_2Imm"; let RenderMethod = "addImmOperands"; }
+def u6_2Imm : Operand<i32> { let ParserMatchClass = u6_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u6_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<6, 2>(N->getSExtValue());}]>;
+def u7_0ImmOperand : AsmOperandClass { let Name = "u7_0Imm"; let RenderMethod = "addImmOperands"; }
+def u7_0Imm : Operand<i32> { let ParserMatchClass = u7_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u7_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<7, 0>(N->getSExtValue());}]>;
+def b13_2ImmOperand : AsmOperandClass { let Name = "b13_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def b13_2Imm : Operand<OtherVT> { let ParserMatchClass = b13_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
+def b13_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<13, 2>(N->getSExtValue());}]>;
+def u5_0ImmOperand : AsmOperandClass { let Name = "u5_0Imm"; let RenderMethod = "addImmOperands"; }
+def u5_0Imm : Operand<i32> { let ParserMatchClass = u5_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u5_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<5, 0>(N->getSExtValue());}]>;
+def u2_0ImmOperand : AsmOperandClass { let Name = "u2_0Imm"; let RenderMethod = "addImmOperands"; }
+def u2_0Imm : Operand<i32> { let ParserMatchClass = u2_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u2_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<2, 0>(N->getSExtValue());}]>;
+def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_2Imm : Operand<i32> { let ParserMatchClass = s4_2ImmOperand; let DecoderMethod = "s4_2ImmDecoder"; }
+def s4_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<4, 2>(N->getSExtValue());}]>;
+def b30_2ImmOperand : AsmOperandClass { let Name = "b30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def b30_2Imm : Operand<OtherVT> { let ParserMatchClass = b30_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
+def b30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
+def u8_0ImmOperand : AsmOperandClass { let Name = "u8_0Imm"; let RenderMethod = "addImmOperands"; }
+def u8_0Imm : Operand<i32> { let ParserMatchClass = u8_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u8_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<8, 0>(N->getSExtValue());}]>;
+def u30_2ImmOperand : AsmOperandClass { let Name = "u30_2Imm"; let RenderMethod = "addImmOperands"; }
+def u30_2Imm : Operand<i32> { let ParserMatchClass = u30_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+def u30_2ImmPred : PatLeaf<(i32 imm), [{ return isShiftedUInt<32, 2>(N->getSExtValue());}]>;
+def s10_0ImmOperand : AsmOperandClass { let Name = "s10_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s10_0Imm : Operand<i32> { let ParserMatchClass = s10_0ImmOperand; let DecoderMethod = "s10_0ImmDecoder"; }
+def s10_0ImmPred : PatLeaf<(i32 imm), [{ return isShiftedInt<10, 0>(N->getSExtValue());}]>;
diff --git a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
index d7c726bb36c5092feba8f2af9eb4e7fc93655fe9..67af947e089dd6db718ce2a1da577315d861d099 100644
--- a/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
+++ b/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
@@ -105,6 +105,8 @@ namespace {
     cl::init(false), cl::desc("Enable branch probability info"));
   cl::opt<unsigned> SizeLimit("eif-limit", cl::init(6), cl::Hidden,
     cl::desc("Size limit in Hexagon early if-conversion"));
+  cl::opt<bool> SkipExitBranches("eif-no-loop-exit", cl::init(false),
+    cl::Hidden, cl::desc("Do not convert branches that may exit the loop"));
 
   struct PrintMB {
     PrintMB(const MachineBasicBlock *B) : MB(B) {}
@@ -142,8 +144,8 @@ namespace {
   raw_ostream &operator<<(raw_ostream &OS, const PrintFP &P) {
     OS << "{ SplitB:" << PrintMB(P.FP.SplitB)
        << ", PredR:" << PrintReg(P.FP.PredR, &P.TRI)
-       << ", TrueB:" << PrintMB(P.FP.TrueB) << ", FalseB:"
-       << PrintMB(P.FP.FalseB)
+       << ", TrueB:" << PrintMB(P.FP.TrueB)
+       << ", FalseB:" << PrintMB(P.FP.FalseB)
        << ", JoinB:" << PrintMB(P.FP.JoinB) << " }";
     return OS;
   }
@@ -187,7 +189,8 @@ namespace {
     bool usesUndefVReg(const MachineInstr *MI) const;
     bool isValid(const FlowPattern &FP) const;
     unsigned countPredicateDefs(const MachineBasicBlock *B) const;
-    unsigned computePhiCost(MachineBasicBlock *B) const;
+    unsigned computePhiCost(const MachineBasicBlock *B,
+          const FlowPattern &FP) const;
     bool isProfitable(const FlowPattern &FP) const;
     bool isPredicableStore(const MachineInstr *MI) const;
     bool isSafeToSpeculate(const MachineInstr *MI) const;
@@ -199,6 +202,9 @@ namespace {
           MachineBasicBlock::iterator At, MachineBasicBlock *FromB,
           unsigned PredR, bool IfTrue);
 
+    unsigned buildMux(MachineBasicBlock *B, MachineBasicBlock::iterator At,
+          const TargetRegisterClass *DRC, unsigned PredR, unsigned TR,
+          unsigned TSR, unsigned FR, unsigned FSR);
     void updatePhiNodes(MachineBasicBlock *WhereB, const FlowPattern &FP);
     void convert(const FlowPattern &FP);
 
@@ -230,7 +236,7 @@ bool HexagonEarlyIfConversion::isPreheader(const MachineBasicBlock *B) const {
     return false;
   MachineBasicBlock *SB = *B->succ_begin();
   MachineLoop *L = MLI->getLoopFor(SB);
-  return L && SB == L->getHeader();
+  return L && SB == L->getHeader() && MDT->dominates(B, SB);
 }
 
 bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
@@ -264,9 +270,6 @@ bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
     // mark as diamond with both sides equal?
     return false;
   }
-  // Loop could be null for both.
-  if (MLI->getLoopFor(T1B) != L || MLI->getLoopFor(T2B) != L)
-    return false;
 
   // Record the true/false blocks in such a way that "true" means "if (PredR)",
   // and "false" means "if (!PredR)".
@@ -289,8 +292,14 @@ bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
   // it has a single successor. In fact, the block has to end either with
   // an unconditional branch (which can be predicated), or with a fall-
   // through.
-  bool TOk = (TNP == 1) && (TNS == 1);
-  bool FOk = (FNP == 1) && (FNS == 1);
+  // Also, skip blocks that do not belong to the same loop.
+  bool TOk = (TNP == 1 && TNS == 1 && MLI->getLoopFor(TB) == L);
+  bool FOk = (FNP == 1 && FNS == 1 && MLI->getLoopFor(FB) == L);
+
+  // If requested (via an option), do not consider branches where the
+  // true and false targets do not belong to the same loop.
+  if (SkipExitBranches && MLI->getLoopFor(TB) != MLI->getLoopFor(FB))
+    return false;
 
   // If neither is predicable, there is nothing interesting.
   if (!TOk && !FOk)
@@ -307,17 +316,15 @@ bool HexagonEarlyIfConversion::matchFlowPattern(MachineBasicBlock *B,
       // Diamond: "if (P) then TB; else FB;".
     } else {
       // TOk && !FOk
-      if (TSB == FB) {
+      if (TSB == FB)
         JB = FB;
-        FB = nullptr;
-      }
+      FB = nullptr;
     }
   } else {
     // !TOk && FOk  (at least one must be true by now).
-    if (FSB == TB) {
+    if (FSB == TB)
       JB = TB;
-      TB = nullptr;
-    }
+    TB = nullptr;
   }
   // Don't try to predicate loop preheaders.
   if ((TB && isPreheader(TB)) || (FB && isPreheader(FB))) {
@@ -383,8 +390,14 @@ bool HexagonEarlyIfConversion::isValidCandidate(const MachineBasicBlock *B)
       unsigned R = MO.getReg();
       if (!TargetRegisterInfo::isVirtualRegister(R))
         continue;
-      if (MRI->getRegClass(R) != &Hexagon::PredRegsRegClass)
-        continue;
+      switch (MRI->getRegClass(R)->getID()) {
+        case Hexagon::PredRegsRegClassID:
+        case Hexagon::VecPredRegsRegClassID:
+        case Hexagon::VecPredRegs128BRegClassID:
+          break;
+        default:
+          continue;
+      }
       for (auto U = MRI->use_begin(R); U != MRI->use_end(); ++U)
         if (U->getParent()->isPHI())
           return false;
@@ -442,24 +455,39 @@ bool HexagonEarlyIfConversion::isValid(const FlowPattern &FP) const {
   return true;
 }
 
-unsigned HexagonEarlyIfConversion::computePhiCost(MachineBasicBlock *B) const {
-  assert(B->pred_size() <= 2);
+unsigned HexagonEarlyIfConversion::computePhiCost(const MachineBasicBlock *B,
+      const FlowPattern &FP) const {
   if (B->pred_size() < 2)
     return 0;
 
   unsigned Cost = 0;
-  MachineBasicBlock::const_iterator I, E = B->getFirstNonPHI();
-  for (I = B->begin(); I != E; ++I) {
-    const MachineOperand &RO1 = I->getOperand(1);
-    const MachineOperand &RO3 = I->getOperand(3);
-    assert(RO1.isReg() && RO3.isReg());
+  for (const MachineInstr &MI : *B) {
+    if (!MI.isPHI())
+      break;
+    // If both incoming blocks are one of the TrueB/FalseB/SplitB, then
+    // a MUX may be needed. Otherwise the PHI will need to be updated at
+    // no extra cost.
+    // Find the interesting PHI operands for further checks.
+    SmallVector<unsigned,2> Inc;
+    for (unsigned i = 1, e = MI.getNumOperands(); i != e; i += 2) {
+      const MachineBasicBlock *BB = MI.getOperand(i+1).getMBB();
+      if (BB == FP.SplitB || BB == FP.TrueB || BB == FP.FalseB)
+        Inc.push_back(i);
+    }
+    assert(Inc.size() <= 2);
+    if (Inc.size() < 2)
+      continue;
+
+    const MachineOperand &RA = MI.getOperand(1);
+    const MachineOperand &RB = MI.getOperand(3);
+    assert(RA.isReg() && RB.isReg());
     // Must have a MUX if the phi uses a subregister.
-    if (RO1.getSubReg() != 0 || RO3.getSubReg() != 0) {
+    if (RA.getSubReg() != 0 || RB.getSubReg() != 0) {
       Cost++;
       continue;
     }
-    MachineInstr *Def1 = MRI->getVRegDef(RO1.getReg());
-    MachineInstr *Def3 = MRI->getVRegDef(RO3.getReg());
+    const MachineInstr *Def1 = MRI->getVRegDef(RA.getReg());
+    const MachineInstr *Def3 = MRI->getVRegDef(RB.getReg());
     if (!HII->isPredicable(*Def1) || !HII->isPredicable(*Def3))
       Cost++;
   }
@@ -485,7 +513,6 @@ unsigned HexagonEarlyIfConversion::countPredicateDefs(
 
 bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
   if (FP.TrueB && FP.FalseB) {
-
     // Do not IfCovert if the branch is one sided.
     if (MBPI) {
       BranchProbability Prob(9, 10);
@@ -510,18 +537,16 @@ bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
   // the code size. If the predicated blocks are smaller than a packet size,
   // approximate the spare room in the packet that could be filled with the
   // predicated/speculated instructions.
-  unsigned TS = 0, FS = 0, Spare = 0;
-  if (FP.TrueB) {
-    TS = std::distance(FP.TrueB->begin(), FP.TrueB->getFirstTerminator());
-    if (TS < HEXAGON_PACKET_SIZE)
-      Spare += HEXAGON_PACKET_SIZE-TS;
-  }
-  if (FP.FalseB) {
-    FS = std::distance(FP.FalseB->begin(), FP.FalseB->getFirstTerminator());
-    if (FS < HEXAGON_PACKET_SIZE)
-      Spare += HEXAGON_PACKET_SIZE-TS;
-  }
-  unsigned TotalIn = TS+FS;
+  auto TotalCount = [] (const MachineBasicBlock *B, unsigned &Spare) {
+    if (!B)
+      return 0u;
+    unsigned T = std::distance(B->begin(), B->getFirstTerminator());
+    if (T < HEXAGON_PACKET_SIZE)
+      Spare += HEXAGON_PACKET_SIZE-T;
+    return T;
+  };
+  unsigned Spare = 0;
+  unsigned TotalIn = TotalCount(FP.TrueB, Spare) + TotalCount(FP.FalseB, Spare);
   DEBUG(dbgs() << "Total number of instructions to be predicated/speculated: "
                << TotalIn << ", spare room: " << Spare << "\n");
   if (TotalIn >= SizeLimit+Spare)
@@ -536,17 +561,17 @@ bool HexagonEarlyIfConversion::isProfitable(const FlowPattern &FP) const {
   unsigned TotalPh = 0;
   unsigned PredDefs = countPredicateDefs(FP.SplitB);
   if (FP.JoinB) {
-    TotalPh = computePhiCost(FP.JoinB);
+    TotalPh = computePhiCost(FP.JoinB, FP);
     PredDefs += countPredicateDefs(FP.JoinB);
   } else {
     if (FP.TrueB && FP.TrueB->succ_size() > 0) {
       MachineBasicBlock *SB = *FP.TrueB->succ_begin();
-      TotalPh += computePhiCost(SB);
+      TotalPh += computePhiCost(SB, FP);
       PredDefs += countPredicateDefs(SB);
     }
     if (FP.FalseB && FP.FalseB->succ_size() > 0) {
       MachineBasicBlock *SB = *FP.FalseB->succ_begin();
-      TotalPh += computePhiCost(SB);
+      TotalPh += computePhiCost(SB, FP);
       PredDefs += countPredicateDefs(SB);
     }
   }
@@ -733,6 +758,43 @@ void HexagonEarlyIfConversion::predicateBlockNB(MachineBasicBlock *ToB,
   }
 }
 
+unsigned HexagonEarlyIfConversion::buildMux(MachineBasicBlock *B,
+      MachineBasicBlock::iterator At, const TargetRegisterClass *DRC,
+      unsigned PredR, unsigned TR, unsigned TSR, unsigned FR, unsigned FSR) {
+  unsigned Opc = 0;
+  switch (DRC->getID()) {
+    case Hexagon::IntRegsRegClassID:
+      Opc = Hexagon::C2_mux;
+      break;
+    case Hexagon::DoubleRegsRegClassID:
+      Opc = Hexagon::PS_pselect;
+      break;
+    case Hexagon::VectorRegsRegClassID:
+      Opc = Hexagon::PS_vselect;
+      break;
+    case Hexagon::VecDblRegsRegClassID:
+      Opc = Hexagon::PS_wselect;
+      break;
+    case Hexagon::VectorRegs128BRegClassID:
+      Opc = Hexagon::PS_vselect_128B;
+      break;
+    case Hexagon::VecDblRegs128BRegClassID:
+      Opc = Hexagon::PS_wselect_128B;
+      break;
+    default:
+      llvm_unreachable("unexpected register type");
+  }
+  const MCInstrDesc &D = HII->get(Opc);
+
+  DebugLoc DL = B->findBranchDebugLoc();
+  unsigned MuxR = MRI->createVirtualRegister(DRC);
+  BuildMI(*B, At, DL, D, MuxR)
+    .addReg(PredR)
+    .addReg(TR, 0, TSR)
+    .addReg(FR, 0, FSR);
+  return MuxR;
+}
+
 void HexagonEarlyIfConversion::updatePhiNodes(MachineBasicBlock *WhereB,
       const FlowPattern &FP) {
   // Visit all PHI nodes in the WhereB block and generate MUX instructions
@@ -759,40 +821,25 @@ void HexagonEarlyIfConversion::updatePhiNodes(MachineBasicBlock *WhereB,
       TR = SR, TSR = SSR;
     else if (FR == 0)
       FR = SR, FSR = SSR;
-    assert(TR && FR);
-
-    using namespace Hexagon;
-
-    unsigned DR = PN->getOperand(0).getReg();
-    const TargetRegisterClass *RC = MRI->getRegClass(DR);
-    unsigned Opc = 0;
-    if (RC == &IntRegsRegClass)
-      Opc = C2_mux;
-    else if (RC == &DoubleRegsRegClass)
-      Opc = PS_pselect;
-    else if (RC == &VectorRegsRegClass)
-      Opc = PS_vselect;
-    else if (RC == &VecDblRegsRegClass)
-      Opc = PS_wselect;
-    else if (RC == &VectorRegs128BRegClass)
-      Opc = PS_vselect_128B;
-    else if (RC == &VecDblRegs128BRegClass)
-      Opc = PS_wselect_128B;
-    else
-      llvm_unreachable("unexpected register type");
-    const MCInstrDesc &D = HII->get(Opc);
-
-    MachineBasicBlock::iterator MuxAt = FP.SplitB->getFirstTerminator();
-    DebugLoc DL;
-    if (MuxAt != FP.SplitB->end())
-      DL = MuxAt->getDebugLoc();
-    unsigned MuxR = MRI->createVirtualRegister(RC);
-    BuildMI(*FP.SplitB, MuxAt, DL, D, MuxR)
-      .addReg(FP.PredR)
-      .addReg(TR, 0, TSR)
-      .addReg(FR, 0, FSR);
-
-    PN->addOperand(MachineOperand::CreateReg(MuxR, false));
+
+    assert(TR || FR);
+    unsigned MuxR = 0, MuxSR = 0;
+
+    if (TR && FR) {
+      unsigned DR = PN->getOperand(0).getReg();
+      const TargetRegisterClass *RC = MRI->getRegClass(DR);
+      MuxR = buildMux(FP.SplitB, FP.SplitB->getFirstTerminator(), RC,
+                      FP.PredR, TR, TSR, FR, FSR);
+    } else if (TR) {
+      MuxR = TR;
+      MuxSR = TSR;
+    } else {
+      MuxR = FR;
+      MuxSR = FSR;
+    }
+
+    PN->addOperand(MachineOperand::CreateReg(MuxR, false, false, false, false,
+                                             false, false, MuxSR));
     PN->addOperand(MachineOperand::CreateMBB(FP.SplitB));
   }
 }
diff --git a/lib/Target/Hexagon/HexagonExpandCondsets.cpp b/lib/Target/Hexagon/HexagonExpandCondsets.cpp
index 624a713a80d9ae986af7f1ae0ed95a049731fc07..d8ba5dcd35ad06f6df7155557f8f8143113ec128 100644
--- a/lib/Target/Hexagon/HexagonExpandCondsets.cpp
+++ b/lib/Target/Hexagon/HexagonExpandCondsets.cpp
@@ -362,14 +362,16 @@ void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM,
   if (Range.empty())
     return;
 
-  auto IsRegDef = [this,Reg,LM] (MachineOperand &Op) -> bool {
+  // Return two booleans: { def-modifies-reg, def-covers-reg }.
+  auto IsRegDef = [this,Reg,LM] (MachineOperand &Op) -> std::pair<bool,bool> {
     if (!Op.isReg() || !Op.isDef())
-      return false;
+      return { false, false };
     unsigned DR = Op.getReg(), DSR = Op.getSubReg();
     if (!TargetRegisterInfo::isVirtualRegister(DR) || DR != Reg)
-      return false;
+      return { false, false };
     LaneBitmask SLM = getLaneMask(DR, DSR);
-    return (SLM & LM).any();
+    LaneBitmask A = SLM & LM;
+    return { A.any(), A == SLM };
   };
 
   // The splitting step will create pairs of predicated definitions without
@@ -453,20 +455,27 @@ void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM,
   // Remove <dead> flags from all defs that are not dead after live range
   // extension, and collect all def operands. They will be used to generate
   // the necessary implicit uses.
+  // At the same time, add <dead> flag to all defs that are actually dead.
+  // This can happen, for example, when a mux with identical inputs is
+  // replaced with a COPY: the use of the predicate register disappears and
+  // the def can become dead.
   std::set<RegisterRef> DefRegs;
   for (auto &Seg : Range) {
     if (!Seg.start.isRegister())
       continue;
     MachineInstr *DefI = LIS->getInstructionFromIndex(Seg.start);
     for (auto &Op : DefI->operands()) {
-      if (Seg.start.isDead() || !IsRegDef(Op))
-        continue;
-      DefRegs.insert(Op);
-      Op.setIsDead(false);
+      auto P = IsRegDef(Op);
+      if (P.second && Seg.end.isDead()) {
+        Op.setIsDead(true);
+      } else if (P.first) {
+        DefRegs.insert(Op);
+        Op.setIsDead(false);
+      }
     }
   }
 
-  // Finally, add implicit uses to each predicated def that is reached
+  // Now, add implicit uses to each predicated def that is reached
   // by other defs.
   for (auto &Seg : Range) {
     if (!Seg.start.isRegister() || !Range.liveAt(Seg.start.getPrevSlot()))
@@ -486,6 +495,7 @@ void HexagonExpandCondsets::updateDeadsInRange(unsigned Reg, LaneBitmask LM,
     for (RegisterRef R : ImpUses)
       MachineInstrBuilder(MF, DefI).addReg(R.Reg, RegState::Implicit, R.Sub);
   }
+
 }
 
 void HexagonExpandCondsets::updateDeadFlags(unsigned Reg) {
@@ -622,6 +632,12 @@ bool HexagonExpandCondsets::split(MachineInstr &MI,
   bool ReadUndef = MD.isUndef();
   MachineBasicBlock::iterator At = MI;
 
+  auto updateRegs = [&UpdRegs] (const MachineInstr &MI) -> void {
+    for (auto &Op : MI.operands())
+      if (Op.isReg())
+        UpdRegs.insert(Op.getReg());
+  };
+
   // If this is a mux of the same register, just replace it with COPY.
   // Ideally, this would happen earlier, so that register coalescing would
   // see it.
@@ -630,6 +646,8 @@ bool HexagonExpandCondsets::split(MachineInstr &MI,
   if (ST.isReg() && SF.isReg()) {
     RegisterRef RT(ST);
     if (RT == RegisterRef(SF)) {
+      // Record the registers to update first, before MI is turned into a COPY.
+      updateRegs(MI);
       MI.setDesc(HII->get(TargetOpcode::COPY));
       unsigned S = getRegState(ST);
       while (MI.getNumOperands() > 1)
@@ -651,9 +669,7 @@ bool HexagonExpandCondsets::split(MachineInstr &MI,
   LIS->InsertMachineInstrInMaps(*TfrF);
 
   // Will need to recalculate live intervals for all registers in MI.
-  for (auto &Op : MI.operands())
-    if (Op.isReg())
-      UpdRegs.insert(Op.getReg());
+  updateRegs(MI);
 
   removeInstr(MI);
   return true;
diff --git a/lib/Target/Hexagon/HexagonFrameLowering.cpp b/lib/Target/Hexagon/HexagonFrameLowering.cpp
index 55aee261a6d65c276839b8cb3a233b1419273af5..0e2380f4316a9c24994230285eb4c2915072c694 100644
--- a/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -301,16 +301,30 @@ static bool needsStackFrame(const MachineBasicBlock &MBB, const BitVector &CSR,
         // the frame creation/destruction instructions.
         if (MO.isFI())
           return true;
-        if (!MO.isReg())
-          continue;
-        unsigned R = MO.getReg();
-        // Virtual registers will need scavenging, which then may require
-        // a stack slot.
-        if (TargetRegisterInfo::isVirtualRegister(R))
-          return true;
-        for (MCSubRegIterator S(R, &HRI, true); S.isValid(); ++S)
-          if (CSR[*S])
+        if (MO.isReg()) {
+          unsigned R = MO.getReg();
+          // Virtual registers will need scavenging, which then may require
+          // a stack slot.
+          if (TargetRegisterInfo::isVirtualRegister(R))
             return true;
+          for (MCSubRegIterator S(R, &HRI, true); S.isValid(); ++S)
+            if (CSR[*S])
+              return true;
+          continue;
+        }
+        if (MO.isRegMask()) {
+          // A regmask would normally have all callee-saved registers marked
+          // as preserved, so this check would not be needed, but in case of
+          // ever having other regmasks (for other calling conventions),
+          // make sure they would be processed correctly.
+          const uint32_t *BM = MO.getRegMask();
+          for (int x = CSR.find_first(); x >= 0; x = CSR.find_next(x)) {
+            unsigned R = x;
+            // If this regmask does not preserve a CSR, a frame will be needed.
+            if (!(BM[R/32] & (1u << (R%32))))
+              return true;
+          }
+        }
       }
     }
     return false;
@@ -1651,7 +1665,7 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B,
     // Dead defs are recorded in Clobbers, but are not automatically removed
     // from the live set.
     for (auto &C : Clobbers)
-      if (C.second->isDead())
+      if (C.second->isReg() && C.second->isDead())
         LPR.removeReg(C.first);
   }
 
diff --git a/lib/Target/Hexagon/HexagonGenExtract.cpp b/lib/Target/Hexagon/HexagonGenExtract.cpp
index bb5e379ce01442c64795488c83282e9666c0d6ac..c99ad5130aef5ae03f643bdea3d131b94ed861da 100644
--- a/lib/Target/Hexagon/HexagonGenExtract.cpp
+++ b/lib/Target/Hexagon/HexagonGenExtract.cpp
@@ -197,13 +197,13 @@ bool HexagonGenExtract::convert(Instruction *In) {
     // It is still ok to generate extract, but only if the mask eliminates
     // those bits (i.e. M does not have any bits set beyond U).
     APInt C = APInt::getHighBitsSet(BW, BW-U);
-    if (M.intersects(C) || !APIntOps::isMask(W, M))
+    if (M.intersects(C) || !M.isMask(W))
       return false;
   } else {
     // Check if M starts with a contiguous sequence of W times 1 bits. Get
     // the low U bits of M (which eliminates the 0 bits shifted in on the
     // left), and check if the result is APInt's "mask":
-    if (!APIntOps::isMask(W, M.getLoBits(U)))
+    if (!M.getLoBits(U).isMask(W))
       return false;
   }
 
@@ -221,11 +221,8 @@ bool HexagonGenExtract::convert(Instruction *In) {
 
 bool HexagonGenExtract::visitBlock(BasicBlock *B) {
   // Depth-first, bottom-up traversal.
-  DomTreeNode *DTN = DT->getNode(B);
-  typedef GraphTraits<DomTreeNode*> GTN;
-  typedef GTN::ChildIteratorType Iter;
-  for (Iter I = GTN::child_begin(DTN), E = GTN::child_end(DTN); I != E; ++I)
-    visitBlock((*I)->getBlock());
+  for (auto *DTN : children<DomTreeNode*>(DT->getNode(B)))
+    visitBlock(DTN->getBlock());
 
   // Allow limiting the number of generated extracts for debugging purposes.
   bool HasCutoff = ExtractCutoff.getPosition();
diff --git a/lib/Target/Hexagon/HexagonGenInsert.cpp b/lib/Target/Hexagon/HexagonGenInsert.cpp
index 5a8e392d1275f6dd40060b31e7b07200e04d408b..54d99d399f88571b81939475d13db3d7c47bd196 100644
--- a/lib/Target/Hexagon/HexagonGenInsert.cpp
+++ b/lib/Target/Hexagon/HexagonGenInsert.cpp
@@ -947,11 +947,8 @@ void HexagonGenInsert::collectInBlock(MachineBasicBlock *B,
     BlockDefs.insert(InsDefs);
   }
 
-  MachineDomTreeNode *N = MDT->getNode(B);
-  typedef GraphTraits<MachineDomTreeNode*> GTN;
-  typedef GTN::ChildIteratorType ChildIter;
-  for (ChildIter I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I) {
-    MachineBasicBlock *SB = (*I)->getBlock();
+  for (auto *DTN : children<MachineDomTreeNode*>(MDT->getNode(B))) {
+    MachineBasicBlock *SB = DTN->getBlock();
     collectInBlock(SB, AVs);
   }
 
@@ -1422,9 +1419,9 @@ bool HexagonGenInsert::generateInserts() {
 
 bool HexagonGenInsert::removeDeadCode(MachineDomTreeNode *N) {
   bool Changed = false;
-  typedef GraphTraits<MachineDomTreeNode*> GTN;
-  for (auto I = GTN::child_begin(N), E = GTN::child_end(N); I != E; ++I)
-    Changed |= removeDeadCode(*I);
+
+  for (auto *DTN : children<MachineDomTreeNode*>(N))
+    Changed |= removeDeadCode(DTN);
 
   MachineBasicBlock *B = N->getBlock();
   std::vector<MachineInstr*> Instrs;
diff --git a/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index e477dcc0f64a8019c5cb26477d664daa5e022722..86a8089401c29bde8c834596bf2aeb7da8d68ea5 100644
--- a/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -100,6 +100,7 @@ namespace {
     MachineRegisterInfo        *MRI;
     MachineDominatorTree       *MDT;
     const HexagonInstrInfo     *TII;
+    const HexagonRegisterInfo  *TRI;
 #ifndef NDEBUG
     static int Counter;
 #endif
@@ -381,7 +382,9 @@ bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) {
   MLI = &getAnalysis<MachineLoopInfo>();
   MRI = &MF.getRegInfo();
   MDT = &getAnalysis<MachineDominatorTree>();
-  TII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
+  const HexagonSubtarget &HST = MF.getSubtarget<HexagonSubtarget>();
+  TII = HST.getInstrInfo();
+  TRI = HST.getRegisterInfo();
 
   for (auto &L : *MLI)
     if (!L->getParentLoop()) {
@@ -960,24 +963,21 @@ CountValue *HexagonHardwareLoops::computeCount(MachineLoop *Loop,
 /// \brief Return true if the operation is invalid within hardware loop.
 bool HexagonHardwareLoops::isInvalidLoopOperation(const MachineInstr *MI,
                                                   bool IsInnerHWLoop) const {
-
   // Call is not allowed because the callee may use a hardware loop except for
   // the case when the call never returns.
   if (MI->getDesc().isCall())
     return !TII->doesNotReturn(*MI);
 
   // Check if the instruction defines a hardware loop register.
-  for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
-    const MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg() || !MO.isDef())
-      continue;
-    unsigned R = MO.getReg();
-    if (IsInnerHWLoop && (R == Hexagon::LC0 || R == Hexagon::SA0 ||
-                          R == Hexagon::LC1 || R == Hexagon::SA1))
-      return true;
-    if (!IsInnerHWLoop && (R == Hexagon::LC1 || R == Hexagon::SA1))
+  using namespace Hexagon;
+  static const unsigned Regs01[] = { LC0, SA0, LC1, SA1 };
+  static const unsigned Regs1[]  = { LC1, SA1 };
+  auto CheckRegs = IsInnerHWLoop ? makeArrayRef(Regs01, array_lengthof(Regs01))
+                                 : makeArrayRef(Regs1, array_lengthof(Regs1));
+  for (unsigned R : CheckRegs)
+    if (MI->modifiesRegister(R, TRI))
       return true;
-  }
+
   return false;
 }
 
@@ -1511,7 +1511,7 @@ bool HexagonHardwareLoops::checkForImmediate(const MachineOperand &MO,
       int64_t V1, V2;
       if (!checkForImmediate(S1, V1) || !checkForImmediate(S2, V2))
         return false;
-      TV = V2 | (V1 << 32);
+      TV = V2 | (static_cast<uint64_t>(V1) << 32);
       break;
     }
     case TargetOpcode::REG_SEQUENCE: {
diff --git a/lib/Target/Hexagon/HexagonIICHVX.td b/lib/Target/Hexagon/HexagonIICHVX.td
new file mode 100644
index 0000000000000000000000000000000000000000..4081a225832b0b94da36205086062b99ebb79e44
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonIICHVX.td
@@ -0,0 +1,102 @@
+//===--- HexagonIICHVX.td -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+//
+// Though all these itinerary classes exist for V60 onwards, they are being
+// listed here as 'HVXV62Itin' because itinerary class description prior to V62
+// doesn't include operand cycle info. In future, I plan to merge them
+// together and call it 'HVXItin'.
+//
+class HVXV62Itin {
+  list<InstrItinData> HVXV62Itin_list = [
+    InstrItinData<COPROC_VMEM_vtc_long_SLOT01,
+                                   [InstrStage<1, [SLOT0, SLOT1]>],
+                                   [3, 1, 1, 1]>,
+    InstrItinData<COPROC_VX_vtc_long_SLOT23,
+                                   [InstrStage<1, [SLOT2, SLOT3]>],
+                                   [3, 1, 1, 1]>,
+    InstrItinData<COPROC_VX_vtc_SLOT23,
+                                   [InstrStage<1, [SLOT2, SLOT3]>],
+                                   [3, 1, 1, 1]>,
+    InstrItinData<CVI_VA,          [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_XLANE,CVI_SHIFT,
+                                                   CVI_MPY0, CVI_MPY1]>],
+                                   [1, 1, 1, 1]>,
+    InstrItinData<CVI_VA_DV,       [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_XLSHF, CVI_MPY01]>],
+                                    [1, 1, 1, 1]>,
+    InstrItinData<CVI_VX_LONG,     [InstrStage<1, [SLOT2, SLOT3], 0>,
+                                    InstrStage<1, [CVI_MPY0, CVI_MPY1]>],
+                                   [1, 1, 1, 1]>,
+    InstrItinData<CVI_VX_LATE,     [InstrStage<1, [SLOT2, SLOT3], 0>,
+                                    InstrStage<1, [CVI_MPY0, CVI_MPY1]>],
+                                   [1, 1, 1, 1]>,
+    InstrItinData<CVI_VX,          [InstrStage<1, [SLOT2, SLOT3], 0>,
+                                    InstrStage<1, [CVI_MPY0, CVI_MPY1]>],
+                                   [1, 1, 1, 1]>,
+    InstrItinData<CVI_VX_DV_LONG,  [InstrStage<1, [SLOT2, SLOT3], 0>,
+                                    InstrStage<1, [CVI_MPY01]>], [1, 1, 1, 1]>,
+    InstrItinData<CVI_VX_DV,       [InstrStage<1, [SLOT2, SLOT3], 0>,
+                                    InstrStage<1, [CVI_MPY01]>], [1, 1, 1, 1]>,
+    InstrItinData<CVI_VX_DV_SLOT2, [InstrStage<1, [SLOT2], 0>,
+                                    InstrStage<1, [CVI_MPY01]>], [1, 1, 1, 1]>,
+    InstrItinData<CVI_VX_DV_SLOT2_LONG_EARLY,
+                                   [InstrStage<1, [SLOT2], 0>,
+                                    InstrStage<1, [CVI_MPY01]>], [1, 1, 1, 1]>,
+    InstrItinData<CVI_VP,          [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_XLANE]>], [1, 1, 1, 1]>,
+    InstrItinData<CVI_VP_LONG,     [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_XLANE]>], [1, 1, 1, 1]>,
+    InstrItinData<CVI_VP_VS_EARLY, [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_XLSHF]>], [1, 1, 1, 1]>,
+    InstrItinData<CVI_VP_VS_LONG,  [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_XLSHF]>], [1, 1, 1, 1]>,
+    InstrItinData<CVI_VP_VS,       [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_XLSHF]>], [1, 1, 1, 1]>,
+    InstrItinData<CVI_VP_VS_LONG_EARLY,
+                                   [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_XLSHF]>], [1, 1, 1, 1]>,
+    InstrItinData<CVI_VP_DV,       [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_XLSHF]>], [1, 1, 1, 1]>,
+    InstrItinData<CVI_VS,          [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_SHIFT]>], [1, 1, 1, 1]>,
+    InstrItinData<CVI_VINLANESAT,  [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_XLANE, CVI_SHIFT,
+                                                   CVI_MPY0, CVI_MPY1]>],
+                                   [1, 1, 1, 1]>,
+    InstrItinData<CVI_VM_LD,       [InstrStage<1, [SLOT0, SLOT1], 0>,
+                                    InstrStage<1, [CVI_LD], 0>,
+                                    InstrStage<1, [CVI_XLANE, CVI_SHIFT,
+                                                   CVI_MPY0, CVI_MPY1]>],
+                                   [1, 1, 1, 1]>,
+    InstrItinData<CVI_VM_TMP_LD,   [InstrStage<1,[SLOT0, SLOT1], 0>,
+                                    InstrStage<1, [CVI_LD]>],[1, 1, 1, 1, 10]>,
+    InstrItinData<CVI_VM_CUR_LD,   [InstrStage<1,[SLOT0, SLOT1], 0>,
+                                    InstrStage<1, [CVI_LD], 0>,
+                                    InstrStage<1, [CVI_XLANE, CVI_SHIFT,
+                                                   CVI_MPY0, CVI_MPY1]>],
+                                   [1, 1, 1, 1]>,
+    InstrItinData<CVI_VM_VP_LDU,   [InstrStage<1,[SLOT0], 0>,
+                                    InstrStage<1, [SLOT1], 0>,
+                                    InstrStage<1, [CVI_LD], 0>,
+                                    InstrStage<1, [CVI_XLANE]>], [1, 1, 1, 1]>,
+    InstrItinData<CVI_VM_ST,       [InstrStage<1, [SLOT0], 0>,
+                                    InstrStage<1, [CVI_ST], 0>,
+                                    InstrStage<1, [CVI_XLANE, CVI_SHIFT,
+                                                   CVI_MPY0, CVI_MPY1]>],
+                                   [1, 1, 1, 1]>,
+    InstrItinData<CVI_VM_NEW_ST,   [InstrStage<1,[SLOT0], 0>,
+                                    InstrStage<1, [CVI_ST]>], [1, 1, 1, 1]>,
+    InstrItinData<CVI_VM_STU,      [InstrStage<1, [SLOT0], 0>,
+                                    InstrStage<1, [SLOT1], 0>,
+                                    InstrStage<1, [CVI_ST], 0>,
+                                    InstrStage<1, [CVI_XLANE]>], [1, 1, 1, 1]>,
+    InstrItinData<CVI_HIST,        [InstrStage<1, [SLOT0,SLOT1,SLOT2,SLOT3], 0>,
+                                    InstrStage<1, [CVI_ALL]>], [1, 1, 1, 1]>];
+}
diff --git a/lib/Target/Hexagon/HexagonIICScalar.td b/lib/Target/Hexagon/HexagonIICScalar.td
new file mode 100644
index 0000000000000000000000000000000000000000..e69cfbdad68888d41b7509f67cb4b7894269ee90
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonIICScalar.td
@@ -0,0 +1,164 @@
+//===--- HexagonIICScalar.td ----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// These itinerary class descriptions are based on the instruction timing
+// classes as per V62. Currently, they are just extracted from
+// HexagonScheduleV62.td but will soon be auto-generated by HexagonGen.py.
+
+class ScalarItin {
+  list<InstrItinData> ScalarItin_list = [
+    InstrItinData<ALU32_2op_tc_1_SLOT0123     ,
+                  [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
+    InstrItinData<ALU32_2op_tc_2early_SLOT0123,
+                  [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1]>,
+    InstrItinData<ALU32_3op_tc_1_SLOT0123     ,
+                  [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
+    InstrItinData<ALU32_3op_tc_2_SLOT0123     ,
+                  [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1]>,
+    InstrItinData<ALU32_3op_tc_2early_SLOT0123,
+                  [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1]>,
+    InstrItinData<ALU32_ADDI_tc_1_SLOT0123    ,
+                  [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
+
+    // ALU64
+    InstrItinData<ALU64_tc_1_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [1, 1, 1]>,
+    InstrItinData<ALU64_tc_2_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [2, 1, 1]>,
+    InstrItinData<ALU64_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [2, 1, 1]>,
+    InstrItinData<ALU64_tc_3x_SLOT23    , [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [3, 1, 1]>,
+
+    // CR -> System
+    InstrItinData<CR_tc_2_SLOT3      , [InstrStage<1, [SLOT3]>], [2, 1, 1]>,
+    InstrItinData<CR_tc_2early_SLOT3 , [InstrStage<1, [SLOT3]>], [2, 1, 1]>,
+    InstrItinData<CR_tc_3x_SLOT3     , [InstrStage<1, [SLOT3]>], [3, 1, 1]>,
+
+    // Jump (conditional/unconditional/return etc)
+    InstrItinData<CR_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+                                       [2, 1, 1, 1]>,
+    InstrItinData<CR_tc_3x_SLOT23    , [InstrStage<1, [SLOT2, SLOT3]>],
+                                       [3, 1, 1, 1]>,
+    InstrItinData<CJ_tc_1_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                       [1, 1, 1, 1]>,
+    InstrItinData<CJ_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+                                       [2, 1, 1, 1]>,
+    InstrItinData<J_tc_2early_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+                                       [2, 1, 1, 1]>,
+    InstrItinData<J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT,
+        [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1, 1]>,
+
+    // JR
+    InstrItinData<J_tc_2early_SLOT2  , [InstrStage<1, [SLOT2]>], [2, 1, 1]>,
+    InstrItinData<J_tc_3stall_SLOT2  , [InstrStage<1, [SLOT2]>], [3, 1, 1]>,
+
+    // Extender
+    InstrItinData<EXTENDER_tc_1_SLOT0123, [InstrStage<1,
+                          [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 1, 1, 1]>,
+
+    // Load
+    InstrItinData<LD_tc_ld_SLOT01      , [InstrStage<1, [SLOT0, SLOT1]>],
+                                         [3, 1]>,
+    InstrItinData<LD_tc_ld_pi_SLOT01   , [InstrStage<1, [SLOT0, SLOT1]>],
+                                         [3, 1]>,
+    InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<1, [SLOT0]>], [4, 1]>,
+    InstrItinData<LD_tc_ld_SLOT0       , [InstrStage<1, [SLOT0]>], [3, 1]>,
+
+    // M
+    InstrItinData<M_tc_1_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                      [1, 1, 1]>,
+    InstrItinData<M_tc_2_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                      [2, 1, 1]>,
+    InstrItinData<M_tc_2_acc_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+                                      [2, 1, 1]>,
+    InstrItinData<M_tc_3_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                      [3, 1, 1]>,
+    InstrItinData<M_tc_3x_SLOT23    , [InstrStage<1, [SLOT2, SLOT3]>],
+                                      [3, 1, 1]>,
+    InstrItinData<M_tc_3x_acc_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+                                      [3, 1, 1, 1]>,
+    InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+                                      [4, 1, 1]>,
+    InstrItinData<M_tc_3or4x_acc_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+                                      [4, 1, 1]>,
+    InstrItinData<M_tc_3stall_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+                                      [3, 1, 1]>,
+
+    // Store
+    InstrItinData<ST_tc_st_SLOT01   , [InstrStage<1, [SLOT0, SLOT1]>],
+                                      [1, 1, 1]>,
+    InstrItinData<ST_tc_st_pi_SLOT01, [InstrStage<1, [SLOT0, SLOT1]>],
+                                      [1, 1, 1]>,
+    InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<1, [SLOT0]>], [3, 1, 1]>,
+    InstrItinData<ST_tc_ld_SLOT0    , [InstrStage<1, [SLOT0]>], [3, 1, 1]>,
+    InstrItinData<ST_tc_st_SLOT0    , [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
+    InstrItinData<ST_tc_st_pi_SLOT0 , [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
+
+    // S
+    InstrItinData<S_2op_tc_1_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [1, 1, 1]>,
+    InstrItinData<S_2op_tc_2_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [2, 1, 1]>,
+    InstrItinData<S_2op_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [2, 1, 1]>,
+    // The S_2op_tc_3x_SLOT23 slots are 4 cycles on v60.
+    InstrItinData<S_2op_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [4, 1, 1]>,
+    InstrItinData<S_3op_tc_1_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [1, 1, 1]>,
+    InstrItinData<S_3op_tc_2_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [2, 1, 1]>,
+    InstrItinData<S_3op_tc_2early_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [2, 1, 1]>,
+    InstrItinData<S_3op_tc_3_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [3, 1, 1]>,
+    InstrItinData<S_3op_tc_3stall_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [3, 1, 1]>,
+    InstrItinData<S_3op_tc_3x_SLOT23    , [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [3, 1, 1]>,
+
+    // New Value Compare Jump
+    InstrItinData<NCJ_tc_3or4stall_SLOT0, [InstrStage<1, [SLOT0]>],
+                                          [3, 1, 1, 1]>,
+
+    // Mem ops
+    InstrItinData<V2LDST_tc_st_SLOT0  , [InstrStage<1, [SLOT0]>],
+                                        [1, 1, 1, 1]>,
+    InstrItinData<V2LDST_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+                                        [2, 1, 1, 1]>,
+    InstrItinData<V2LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+                                        [1, 1, 1, 1]>,
+    InstrItinData<V4LDST_tc_st_SLOT0  , [InstrStage<1, [SLOT0]>],
+                                        [1, 1, 1, 1]>,
+    InstrItinData<V4LDST_tc_ld_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+                                        [3, 1, 1, 1]>,
+    InstrItinData<V4LDST_tc_st_SLOT01 , [InstrStage<1, [SLOT0, SLOT1]>],
+                                        [1, 1, 1, 1]>,
+
+    // Endloop
+    InstrItinData<J_tc_2early_SLOT0123, [InstrStage<1, [SLOT_ENDLOOP]>],
+                                        [2]>,
+    InstrItinData<MAPPING_tc_1_SLOT0123      ,
+                         [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
+                         [1, 1, 1, 1]>,
+
+    // Duplex and Compound
+    InstrItinData<DUPLEX     , [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
+    InstrItinData<COMPOUND_CJ_ARCHDEPSLOT,
+        [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 1]>,
+    InstrItinData<COMPOUND , [InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 1]>,
+    // Misc
+    InstrItinData<PREFIX , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
+                           [1, 1, 1]>,
+    InstrItinData<PSEUDO , [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>],
+                           [1, 1, 1]>,
+    InstrItinData<PSEUDOM    , [InstrStage<1, [SLOT2, SLOT3], 0>,
+                                InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 1]>];
+}
diff --git a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index f6012d29d422edcecb4fbd64b75a9de6cc4347e6..8e10c521a77d38154b729b9d29bcc158a9c52623 100644
--- a/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -123,6 +123,12 @@ private:
   bool isAlignedMemNode(const MemSDNode *N) const;
   bool isPositiveHalfWord(const SDNode *N) const;
 
+  // DAG preprocessing functions.
+  void ppSimplifyOrSelect0(std::vector<SDNode*> &&Nodes);
+  void ppAddrReorderAddShl(std::vector<SDNode*> &&Nodes);
+  void ppAddrRewriteAndSrl(std::vector<SDNode*> &&Nodes);
+  void ppHoistZextI1(std::vector<SDNode*> &&Nodes);
+
   SmallDenseMap<SDNode *,int> RootWeights;
   SmallDenseMap<SDNode *,int> RootHeights;
   SmallDenseMap<const Value *,int> GAUsesInFunction;
@@ -932,55 +938,21 @@ void HexagonDAGToDAGISel::SelectBitcast(SDNode *N) {
 
 
 void HexagonDAGToDAGISel::Select(SDNode *N) {
-  if (N->isMachineOpcode()) {
-    N->setNodeId(-1);
-    return;   // Already selected.
-  }
+  if (N->isMachineOpcode())
+    return N->setNodeId(-1);  // Already selected.
 
   switch (N->getOpcode()) {
-  case ISD::Constant:
-    SelectConstant(N);
-    return;
-
-  case ISD::ConstantFP:
-    SelectConstantFP(N);
-    return;
-
-  case ISD::FrameIndex:
-    SelectFrameIndex(N);
-    return;
-
-  case ISD::BITCAST:
-    SelectBitcast(N);
-    return;
-
-  case ISD::SHL:
-    SelectSHL(N);
-    return;
-
-  case ISD::LOAD:
-    SelectLoad(N);
-    return;
-
-  case ISD::STORE:
-    SelectStore(N);
-    return;
-
-  case ISD::MUL:
-    SelectMul(N);
-    return;
-
-  case ISD::ZERO_EXTEND:
-    SelectZeroExtend(N);
-    return;
-
-  case ISD::INTRINSIC_W_CHAIN:
-    SelectIntrinsicWChain(N);
-    return;
-
-  case ISD::INTRINSIC_WO_CHAIN:
-    SelectIntrinsicWOChain(N);
-    return;
+  case ISD::Constant:             return SelectConstant(N);
+  case ISD::ConstantFP:           return SelectConstantFP(N);
+  case ISD::FrameIndex:           return SelectFrameIndex(N);
+  case ISD::BITCAST:              return SelectBitcast(N);
+  case ISD::SHL:                  return SelectSHL(N);
+  case ISD::LOAD:                 return SelectLoad(N);
+  case ISD::STORE:                return SelectStore(N);
+  case ISD::MUL:                  return SelectMul(N);
+  case ISD::ZERO_EXTEND:          return SelectZeroExtend(N);
+  case ISD::INTRINSIC_W_CHAIN:    return SelectIntrinsicWChain(N);
+  case ISD::INTRINSIC_WO_CHAIN:   return SelectIntrinsicWOChain(N);
   }
 
   SelectCode(N);
@@ -1010,15 +982,52 @@ SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
 }
 
 
-void HexagonDAGToDAGISel::PreprocessISelDAG() {
+static bool isMemOPCandidate(SDNode *I, SDNode *U) {
+  // I is an operand of U. Check if U is an arithmetic (binary) operation
+  // usable in a memop, where the other operand is a loaded value, and the
+  // result of U is stored in the same location.
+
+  if (!U->hasOneUse())
+    return false;
+  unsigned Opc = U->getOpcode();
+  switch (Opc) {
+    case ISD::ADD:
+    case ISD::SUB:
+    case ISD::AND:
+    case ISD::OR:
+      break;
+    default:
+      return false;
+  }
+
+  SDValue S0 = U->getOperand(0);
+  SDValue S1 = U->getOperand(1);
+  SDValue SY = (S0.getNode() == I) ? S1 : S0;
+
+  SDNode *UUse = *U->use_begin();
+  if (UUse->getNumValues() != 1)
+    return false;
+
+  // Check if one of the inputs to U is a load instruction and the output
+  // is used by a store instruction. If so and they also have the same
+  // base pointer, then don't preprocess this node sequence as it
+  // can be matched to a memop.
+  SDNode *SYNode = SY.getNode();
+  if (UUse->getOpcode() == ISD::STORE && SYNode->getOpcode() == ISD::LOAD) {
+    SDValue LDBasePtr = cast<MemSDNode>(SYNode)->getBasePtr();
+    SDValue STBasePtr = cast<MemSDNode>(UUse)->getBasePtr();
+    if (LDBasePtr == STBasePtr)
+      return true;
+  }
+  return false;
+}
+
+
+// Transform: (or (select c x 0) z)  ->  (select c (or x z) z)
+//            (or (select c 0 y) z)  ->  (select c z (or y z))
+void HexagonDAGToDAGISel::ppSimplifyOrSelect0(std::vector<SDNode*> &&Nodes) {
   SelectionDAG &DAG = *CurDAG;
-  std::vector<SDNode*> Nodes;
-  for (SDNode &Node : DAG.allnodes())
-    Nodes.push_back(&Node);
 
-  // Simplify: (or (select c x 0) z)  ->  (select c (or x z) z)
-  //           (or (select c 0 y) z)  ->  (select c z (or y z))
-  // This may not be the right thing for all targets, so do it here.
   for (auto I : Nodes) {
     if (I->getOpcode() != ISD::OR)
       continue;
@@ -1056,18 +1065,22 @@ void HexagonDAGToDAGISel::PreprocessISelDAG() {
       }
     }
   }
+}
+
+// Transform: (store ch val (add x (add (shl y c) e)))
+//        to: (store ch val (add x (shl (add y d) c))),
+// where e = (shl d c) for some integer d.
+// The purpose of this is to enable generation of loads/stores with
+// shifted addressing mode, i.e. mem(x+y<<#c). For that, the shift
+// value c must be 0, 1 or 2.
+void HexagonDAGToDAGISel::ppAddrReorderAddShl(std::vector<SDNode*> &&Nodes) {
+  SelectionDAG &DAG = *CurDAG;
 
-  // Transform: (store ch addr (add x (add (shl y c) e)))
-  //        to: (store ch addr (add x (shl (add y d) c))),
-  // where e = (shl d c) for some integer d.
-  // The purpose of this is to enable generation of loads/stores with
-  // shifted addressing mode, i.e. mem(x+y<<#c). For that, the shift
-  // value c must be 0, 1 or 2.
   for (auto I : Nodes) {
     if (I->getOpcode() != ISD::STORE)
       continue;
 
-    // I matched: (store ch addr Off)
+    // I matched: (store ch val Off)
     SDValue Off = I->getOperand(2);
     // Off needs to match: (add x (add (shl y c) (shl d c))))
     if (Off.getOpcode() != ISD::ADD)
@@ -1109,15 +1122,192 @@ void HexagonDAGToDAGISel::PreprocessISelDAG() {
     SDValue NewShl = DAG.getNode(ISD::SHL, DL, VT, NewAdd, C);
     ReplaceNode(T0.getNode(), NewShl.getNode());
   }
+}
+
+// Transform: (load ch (add x (and (srl y c) Mask)))
+//        to: (load ch (add x (shl (srl y d) d-c)))
+// where
+// Mask = 00..0 111..1 0.0
+//          |     |     +-- d-c 0s, and d-c is 0, 1 or 2.
+//          |     +-------- 1s
+//          +-------------- at most c 0s
+// Motivating example:
+// DAG combiner optimizes (add x (shl (srl y 5) 2))
+//                     to (add x (and (srl y 3) 1FFFFFFC))
+// which results in a constant-extended and(##...,lsr). This transformation
+// undoes this simplification for cases where the shl can be folded into
+// an addressing mode.
+void HexagonDAGToDAGISel::ppAddrRewriteAndSrl(std::vector<SDNode*> &&Nodes) {
+  SelectionDAG &DAG = *CurDAG;
+
+  for (SDNode *N : Nodes) {
+    unsigned Opc = N->getOpcode();
+    if (Opc != ISD::LOAD && Opc != ISD::STORE)
+      continue;
+    SDValue Addr = Opc == ISD::LOAD ? N->getOperand(1) : N->getOperand(2);
+    // Addr must match: (add x T0)
+    if (Addr.getOpcode() != ISD::ADD)
+      continue;
+    SDValue T0 = Addr.getOperand(1);
+    // T0 must match: (and T1 Mask)
+    if (T0.getOpcode() != ISD::AND)
+      continue;
+
+    // We have an AND.
+    //
+    // Check the first operand. It must be: (srl y c).
+    SDValue S = T0.getOperand(0);
+    if (S.getOpcode() != ISD::SRL)
+      continue;
+    ConstantSDNode *SN = dyn_cast<ConstantSDNode>(S.getOperand(1).getNode());
+    if (SN == nullptr)
+      continue;
+    if (SN->getAPIntValue().getBitWidth() != 32)
+      continue;
+    uint32_t CV = SN->getZExtValue();
+
+    // Check the second operand: the supposed mask.
+    ConstantSDNode *MN = dyn_cast<ConstantSDNode>(T0.getOperand(1).getNode());
+    if (MN == nullptr)
+      continue;
+    if (MN->getAPIntValue().getBitWidth() != 32)
+      continue;
+    uint32_t Mask = MN->getZExtValue();
+    // Examine the mask.
+    uint32_t TZ = countTrailingZeros(Mask);
+    uint32_t M1 = countTrailingOnes(Mask >> TZ);
+    uint32_t LZ = countLeadingZeros(Mask);
+    // Trailing zeros + middle ones + leading zeros must equal the width.
+    if (TZ + M1 + LZ != 32)
+      continue;
+    // The number of trailing zeros will be encoded in the addressing mode.
+    if (TZ > 2)
+      continue;
+    // The number of leading zeros must be at most c.
+    if (LZ > CV)
+      continue;
+
+    // All looks good.
+    SDValue Y = S.getOperand(0);
+    EVT VT = Addr.getValueType();
+    SDLoc dl(S);
+    // TZ = D-C, so D = TZ+C.
+    SDValue D = DAG.getConstant(TZ+CV, dl, VT);
+    SDValue DC = DAG.getConstant(TZ, dl, VT);
+    SDValue NewSrl = DAG.getNode(ISD::SRL, dl, VT, Y, D);
+    SDValue NewShl = DAG.getNode(ISD::SHL, dl, VT, NewSrl, DC);
+    ReplaceNode(T0.getNode(), NewShl.getNode());
+  }
+}
+
+// Transform: (op ... (zext i1 c) ...) -> (select c (op ... 1 ...)
+//                                                  (op ... 0 ...))
+void HexagonDAGToDAGISel::ppHoistZextI1(std::vector<SDNode*> &&Nodes) {
+  SelectionDAG &DAG = *CurDAG;
+
+  for (SDNode *N : Nodes) {
+    unsigned Opc = N->getOpcode();
+    if (Opc != ISD::ZERO_EXTEND)
+      continue;
+    SDValue OpI1 = N->getOperand(0);
+    EVT OpVT = OpI1.getValueType();
+    if (!OpVT.isSimple() || OpVT.getSimpleVT() != MVT::i1)
+      continue;
+    for (auto I = N->use_begin(), E = N->use_end(); I != E; ++I) {
+      SDNode *U = *I;
+      if (U->getNumValues() != 1)
+        continue;
+      EVT UVT = U->getValueType(0);
+      if (!UVT.isSimple() || !UVT.isInteger() || UVT.getSimpleVT() == MVT::i1)
+        continue;
+      if (isMemOPCandidate(N, U))
+        continue;
+
+      // Potentially simplifiable operation.
+      unsigned I1N = I.getOperandNo();
+      SmallVector<SDValue,2> Ops(U->getNumOperands());
+      for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i)
+        Ops[i] = U->getOperand(i);
+      EVT BVT = Ops[I1N].getValueType();
+
+      SDLoc dl(U);
+      SDValue C0 = DAG.getConstant(0, dl, BVT);
+      SDValue C1 = DAG.getConstant(1, dl, BVT);
+      SDValue If0, If1;
+
+      if (isa<MachineSDNode>(U)) {
+        unsigned UseOpc = U->getMachineOpcode();
+        Ops[I1N] = C0;
+        If0 = SDValue(DAG.getMachineNode(UseOpc, dl, UVT, Ops), 0);
+        Ops[I1N] = C1;
+        If1 = SDValue(DAG.getMachineNode(UseOpc, dl, UVT, Ops), 0);
+      } else {
+        unsigned UseOpc = U->getOpcode();
+        Ops[I1N] = C0;
+        If0 = DAG.getNode(UseOpc, dl, UVT, Ops);
+        Ops[I1N] = C1;
+        If1 = DAG.getNode(UseOpc, dl, UVT, Ops);
+      }
+      SDValue Sel = DAG.getNode(ISD::SELECT, dl, UVT, OpI1, If1, If0);
+      DAG.ReplaceAllUsesWith(U, Sel.getNode());
+    }
+  }
+}
+
+void HexagonDAGToDAGISel::PreprocessISelDAG() {
+  // Repack all nodes before calling each preprocessing function,
+  // because each of them can modify the set of nodes.
+  auto getNodes = [this] () -> std::vector<SDNode*> {
+    std::vector<SDNode*> T;
+    T.reserve(CurDAG->allnodes_size());
+    for (SDNode &N : CurDAG->allnodes())
+      T.push_back(&N);
+    return T;
+  };
+
+  // Transform: (or (select c x 0) z)  ->  (select c (or x z) z)
+  //            (or (select c 0 y) z)  ->  (select c z (or y z))
+  ppSimplifyOrSelect0(getNodes());
+
+  // Transform: (store ch val (add x (add (shl y c) e)))
+  //        to: (store ch val (add x (shl (add y d) c))),
+  // where e = (shl d c) for some integer d.
+  // The purpose of this is to enable generation of loads/stores with
+  // shifted addressing mode, i.e. mem(x+y<<#c). For that, the shift
+  // value c must be 0, 1 or 2.
+  ppAddrReorderAddShl(getNodes());
+
+  // Transform: (load ch (add x (and (srl y c) Mask)))
+  //        to: (load ch (add x (shl (srl y d) d-c)))
+  // where
+  // Mask = 00..0 111..1 0.0
+  //          |     |     +-- d-c 0s, and d-c is 0, 1 or 2.
+  //          |     +-------- 1s
+  //          +-------------- at most c 0s
+  // Motivating example:
+  // DAG combiner optimizes (add x (shl (srl y 5) 2))
+  //                     to (add x (and (srl y 3) 1FFFFFFC))
+  // which results in a constant-extended and(##...,lsr). This transformation
+  // undoes this simplification for cases where the shl can be folded into
+  // an addressing mode.
+  ppAddrRewriteAndSrl(getNodes());
+
+  // Transform: (op ... (zext i1 c) ...) -> (select c (op ... 1 ...)
+  //                                                  (op ... 0 ...))
+  ppHoistZextI1(getNodes());
+
+  DEBUG_WITH_TYPE("isel", {
+    dbgs() << "Preprocessed (Hexagon) selection DAG:";
+    CurDAG->dump();
+  });
 
   if (EnableAddressRebalancing) {
     rebalanceAddressTrees();
 
-    DEBUG(
-      dbgs() << "************* SelectionDAG after preprocessing: ***********\n";
+    DEBUG_WITH_TYPE("isel", {
+      dbgs() << "Address tree balanced selection DAG:";
       CurDAG->dump();
-      dbgs() << "************* End SelectionDAG after preprocessing ********\n";
-    );
+    });
   }
 }
 
diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp
index e87e1e6a7e0fefcfbc75560a3ac7bcdf19de960f..0a5e9aed4f1315f90ec416aceeb1e09d57980a1e 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -256,7 +256,9 @@ static bool CC_Hexagon (unsigned ValNo, MVT ValVT, MVT LocVT,
     return false;
   }
 
-  if (LocVT == MVT::i1 || LocVT == MVT::i8 || LocVT == MVT::i16) {
+  if (LocVT == MVT::i1) {
+    LocVT = MVT::i32;
+  } else if (LocVT == MVT::i8 || LocVT == MVT::i16) {
     LocVT = MVT::i32;
     ValVT = MVT::i32;
     if (ArgFlags.isSExt())
@@ -644,11 +646,11 @@ bool HexagonTargetLowering::mayBeEmittedAsTailCall(CallInst *CI) const {
 
 /// LowerCallResult - Lower the result values of an ISD::CALL into the
 /// appropriate copies out of appropriate physical registers.  This assumes that
-/// Chain/InFlag are the input chain/flag to use, and that TheCall is the call
+/// Chain/Glue are the input chain/glue to use, and that TheCall is the call
 /// being lowered. Returns a SDNode with the same number of values as the
 /// ISD::CALL.
 SDValue HexagonTargetLowering::LowerCallResult(
-    SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
+    SDValue Chain, SDValue Glue, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
     SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
     const SmallVectorImpl<SDValue> &OutVals, SDValue Callee) const {
@@ -671,21 +673,24 @@ SDValue HexagonTargetLowering::LowerCallResult(
       // predicate register as the call result.
       auto &MRI = DAG.getMachineFunction().getRegInfo();
       SDValue FR0 = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
-                                       MVT::i32, InFlag);
+                                       MVT::i32, Glue);
       // FR0 = (Value, Chain, Glue)
       unsigned PredR = MRI.createVirtualRegister(&Hexagon::PredRegsRegClass);
       SDValue TPR = DAG.getCopyToReg(FR0.getValue(1), dl, PredR,
                                      FR0.getValue(0), FR0.getValue(2));
       // TPR = (Chain, Glue)
-      RetVal = DAG.getCopyFromReg(TPR.getValue(0), dl, PredR, MVT::i1,
-                                  TPR.getValue(1));
+      // Don't glue this CopyFromReg, because it copies from a virtual
+      // register. If it is glued to the call, InstrEmitter will add it
+      // as an implicit def to the call (EmitMachineNode).
+      RetVal = DAG.getCopyFromReg(TPR.getValue(0), dl, PredR, MVT::i1);
+      Glue = TPR.getValue(1);
     } else {
       RetVal = DAG.getCopyFromReg(Chain, dl, RVLocs[i].getLocReg(),
-                                  RVLocs[i].getValVT(), InFlag);
+                                  RVLocs[i].getValVT(), Glue);
+      Glue = RetVal.getValue(2);
     }
     InVals.push_back(RetVal.getValue(0));
     Chain = RetVal.getValue(1);
-    InFlag = RetVal.getValue(2);
   }
 
   return Chain;
@@ -840,16 +845,17 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   if (!MemOpChains.empty())
     Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
 
+  SDValue Glue;
   if (!IsTailCall) {
     SDValue C = DAG.getConstant(NumBytes, dl, PtrVT, true);
     Chain = DAG.getCALLSEQ_START(Chain, C, dl);
+    Glue = Chain.getValue(1);
   }
 
   // Build a sequence of copy-to-reg nodes chained together with token
   // chain and flag operands which copy the outgoing args into registers.
   // The Glue is necessary since all emitted instructions must be
   // stuck together.
-  SDValue Glue;
   if (!IsTailCall) {
     for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
       Chain = DAG.getCopyToReg(Chain, dl, RegsToPass[i].first,
@@ -902,6 +908,10 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                                   RegsToPass[i].second.getValueType()));
   }
 
+  const uint32_t *Mask = HRI.getCallPreservedMask(MF, CallConv);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  Ops.push_back(DAG.getRegisterMask(Mask));
+
   if (Glue.getNode())
     Ops.push_back(Glue);
 
@@ -1054,6 +1064,18 @@ SDValue HexagonTargetLowering::LowerPREFETCH(SDValue Op,
   return DAG.getNode(HexagonISD::DCFETCH, DL, MVT::Other, Chain, Addr, Zero);
 }
 
+// Custom-handle ISD::READCYCLECOUNTER: the generic SDNode is marked as having
+// side-effects but the Hexagon register read has none, so TableGen rejects a
+// direct pattern to A4_tfrcpp. The replacement node must keep the i64 result
+// type for which the Custom action is registered, plus the chain.
+SDValue HexagonTargetLowering::LowerREADCYCLECOUNTER(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  SDValue Chain = Op.getOperand(0);
+  SDLoc dl(Op);
+  SDVTList VTs = DAG.getVTList(MVT::i64, MVT::Other);
+  return DAG.getNode(HexagonISD::READCYCLE, dl, VTs, Chain);
+}
+
 SDValue HexagonTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
       SelectionDAG &DAG) const {
   SDValue Chain = Op.getOperand(0);
@@ -1140,10 +1162,25 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
       EVT RegVT = VA.getLocVT();
       if (RegVT == MVT::i8 || RegVT == MVT::i16 ||
           RegVT == MVT::i32 || RegVT == MVT::f32) {
         unsigned VReg =
           RegInfo.createVirtualRegister(&Hexagon::IntRegsRegClass);
         RegInfo.addLiveIn(VA.getLocReg(), VReg);
-        InVals.push_back(DAG.getCopyFromReg(Chain, dl, VReg, RegVT));
+        SDValue Copy = DAG.getCopyFromReg(Chain, dl, VReg, RegVT);
+        // Treat values of type MVT::i1 specially: they are passed in
+        // registers of type i32, but they need to remain as values of
+        // type i1 for consistency of the argument lowering.
+        if (VA.getValVT() == MVT::i1) {
+          // Generate a copy into a predicate register and use the value
+          // of the register as the "InVal".
+          unsigned PReg =
+            RegInfo.createVirtualRegister(&Hexagon::PredRegsRegClass);
+          SDNode *T = DAG.getMachineNode(Hexagon::C2_tfrrp, dl, MVT::i1,
+                                         Copy.getValue(0));
+          Copy = DAG.getCopyToReg(Copy.getValue(1), dl, PReg, SDValue(T, 0));
+          Copy = DAG.getCopyFromReg(Copy, dl, PReg, MVT::i1);
+        }
+        InVals.push_back(Copy);
+        Chain = Copy.getValue(1);
       } else if (RegVT == MVT::i64 || RegVT == MVT::f64) {
         unsigned VReg =
           RegInfo.createVirtualRegister(&Hexagon::DoubleRegsRegClass);
@@ -1272,17 +1309,6 @@ static bool isSExtFree(SDValue N) {
   return false;
 }
 
-SDValue HexagonTargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
-  SDLoc dl(Op);
-  SDValue InpVal = Op.getOperand(0);
-  if (isa<ConstantSDNode>(InpVal)) {
-    uint64_t V = cast<ConstantSDNode>(InpVal)->getZExtValue();
-    return DAG.getTargetConstant(countPopulation(V), dl, MVT::i64);
-  }
-  SDValue PopOut = DAG.getNode(HexagonISD::POPCOUNT, dl, MVT::i32, InpVal);
-  return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i64, PopOut);
-}
-
 SDValue HexagonTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
 
@@ -1571,9 +1597,10 @@ HexagonTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG)
 
 SDValue
 HexagonTargetLowering::GetDynamicTLSAddr(SelectionDAG &DAG, SDValue Chain,
-      GlobalAddressSDNode *GA, SDValue *InFlag, EVT PtrVT, unsigned ReturnReg,
+      GlobalAddressSDNode *GA, SDValue Glue, EVT PtrVT, unsigned ReturnReg,
       unsigned char OperandFlags) const {
-  MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+  MachineFunction &MF = DAG.getMachineFunction();
+  MachineFrameInfo &MFI = MF.getFrameInfo();
   SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
   SDLoc dl(GA);
   SDValue TGA = DAG.getTargetGlobalAddress(GA->getGlobal(), dl,
@@ -1585,23 +1612,21 @@ HexagonTargetLowering::GetDynamicTLSAddr(SelectionDAG &DAG, SDValue Chain,
   // 2. Callee which in this case is the Global address value.
   // 3. Registers live into the call.In this case its R0, as we
   //    have just one argument to be passed.
-  // 4. InFlag if there is any.
+  // 4. Glue.
   // Note: The order is important.
 
-  if (InFlag) {
-    SDValue Ops[] = { Chain, TGA,
-                      DAG.getRegister(Hexagon::R0, PtrVT), *InFlag };
-    Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, Ops);
-  } else {
-    SDValue Ops[]  = { Chain, TGA, DAG.getRegister(Hexagon::R0, PtrVT)};
-    Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, Ops);
-  }
+  const auto &HRI = *Subtarget.getRegisterInfo();
+  const uint32_t *Mask = HRI.getCallPreservedMask(MF, CallingConv::C);
+  assert(Mask && "Missing call preserved mask for calling convention");
+  SDValue Ops[] = { Chain, TGA, DAG.getRegister(Hexagon::R0, PtrVT),
+                    DAG.getRegisterMask(Mask), Glue };
+  Chain = DAG.getNode(HexagonISD::CALL, dl, NodeTys, Ops);
 
   // Inform MFI that function has calls.
   MFI.setAdjustsStack(true);
 
-  SDValue Flag = Chain.getValue(1);
-  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Flag);
+  Glue = Chain.getValue(1);
+  return DAG.getCopyFromReg(Chain, dl, ReturnReg, PtrVT, Glue);
 }
 
 //
@@ -1694,7 +1719,7 @@ HexagonTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
   Chain = DAG.getCopyToReg(DAG.getEntryNode(), dl, Hexagon::R0, Chain, InFlag);
   InFlag = Chain.getValue(1);
 
-  return GetDynamicTLSAddr(DAG, Chain, GA, &InFlag, PtrVT,
+  return GetDynamicTLSAddr(DAG, Chain, GA, InFlag, PtrVT,
                            Hexagon::R0, HexagonII::MO_GDPLT);
 }
 
@@ -1821,6 +1846,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand);
   setOperationAction(ISD::INLINEASM, MVT::Other, Custom);
   setOperationAction(ISD::PREFETCH, MVT::Other, Custom);
+  setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Custom);
   setOperationAction(ISD::INTRINSIC_VOID, MVT::Other, Custom);
   setOperationAction(ISD::EH_RETURN, MVT::Other, Custom);
   setOperationAction(ISD::GLOBAL_OFFSET_TABLE, MVT::i32, Custom);
@@ -1891,7 +1917,12 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::CTPOP, MVT::i8,  Promote);
   setOperationAction(ISD::CTPOP, MVT::i16, Promote);
   setOperationAction(ISD::CTPOP, MVT::i32, Promote);
-  setOperationAction(ISD::CTPOP, MVT::i64, Custom);
+  setOperationAction(ISD::CTPOP, MVT::i64, Legal);
+
+  setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+  setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
+  setOperationAction(ISD::BSWAP, MVT::i32, Legal);
+  setOperationAction(ISD::BSWAP, MVT::i64, Legal);
 
   // We custom lower i64 to i64 mul, so that it is not considered as a legal
   // operation. There is a pattern that will match i64 mul and transform it
@@ -1901,7 +1932,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
   for (unsigned IntExpOp :
        { ISD::SDIV,      ISD::UDIV,      ISD::SREM,      ISD::UREM,
          ISD::SDIVREM,   ISD::UDIVREM,   ISD::ROTL,      ISD::ROTR,
-         ISD::BSWAP,     ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS,
+         ISD::SHL_PARTS, ISD::SRA_PARTS, ISD::SRL_PARTS,
          ISD::SMUL_LOHI, ISD::UMUL_LOHI }) {
     setOperationAction(IntExpOp, MVT::i32, Expand);
     setOperationAction(IntExpOp, MVT::i64, Expand);
@@ -2268,7 +2299,6 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case HexagonISD::INSERTRP:      return "HexagonISD::INSERTRP";
   case HexagonISD::JT:            return "HexagonISD::JT";
   case HexagonISD::PACKHL:        return "HexagonISD::PACKHL";
-  case HexagonISD::POPCOUNT:      return "HexagonISD::POPCOUNT";
   case HexagonISD::RET_FLAG:      return "HexagonISD::RET_FLAG";
   case HexagonISD::SHUFFEB:       return "HexagonISD::SHUFFEB";
   case HexagonISD::SHUFFEH:       return "HexagonISD::SHUFFEH";
@@ -2296,6 +2326,7 @@ const char* HexagonTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case HexagonISD::VSRLW:         return "HexagonISD::VSRLW";
   case HexagonISD::VSXTBH:        return "HexagonISD::VSXTBH";
   case HexagonISD::VSXTBW:        return "HexagonISD::VSXTBW";
+  case HexagonISD::READCYCLE:     return "HexagonISD::READCYCLE";
   case HexagonISD::OP_END:        break;
   }
   return nullptr;
@@ -2968,11 +2999,11 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     case ISD::DYNAMIC_STACKALLOC:   return LowerDYNAMIC_STACKALLOC(Op, DAG);
     case ISD::SETCC:                return LowerSETCC(Op, DAG);
     case ISD::VSELECT:              return LowerVSELECT(Op, DAG);
-    case ISD::CTPOP:                return LowerCTPOP(Op, DAG);
     case ISD::INTRINSIC_WO_CHAIN:   return LowerINTRINSIC_WO_CHAIN(Op, DAG);
     case ISD::INTRINSIC_VOID:       return LowerINTRINSIC_VOID(Op, DAG);
     case ISD::INLINEASM:            return LowerINLINEASM(Op, DAG);
     case ISD::PREFETCH:             return LowerPREFETCH(Op, DAG);
+    case ISD::READCYCLECOUNTER:     return LowerREADCYCLECOUNTER(Op, DAG);
   }
 }
 
@@ -3026,37 +3057,25 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(
         return std::make_pair(0U, &Hexagon::DoubleRegsRegClass);
       }
     case 'q': // q0-q3
-      switch (VT.SimpleTy) {
+      switch (VT.getSizeInBits()) {
       default:
-        llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type");
-      case MVT::v1024i1:
-      case MVT::v512i1:
-      case MVT::v32i16:
-      case MVT::v16i32:
-      case MVT::v64i8:
-      case MVT::v8i64:
+        llvm_unreachable("getRegForInlineAsmConstraint Unhandled vector size");
+      case 512:
         return std::make_pair(0U, &Hexagon::VecPredRegsRegClass);
+      case 1024:
+        return std::make_pair(0U, &Hexagon::VecPredRegs128BRegClass);
       }
     case 'v': // V0-V31
-      switch (VT.SimpleTy) {
+      switch (VT.getSizeInBits()) {
       default:
-        llvm_unreachable("getRegForInlineAsmConstraint Unhandled data type");
-      case MVT::v16i32:
-      case MVT::v32i16:
-      case MVT::v64i8:
-      case MVT::v8i64:
+        llvm_unreachable("getRegForInlineAsmConstraint Unhandled vector size");
+      case 512:
         return std::make_pair(0U, &Hexagon::VectorRegsRegClass);
-      case MVT::v32i32:
-      case MVT::v64i16:
-      case MVT::v16i64:
-      case MVT::v128i8:
+      case 1024:
         if (Subtarget.hasV60TOps() && UseHVX && UseHVXDbl)
           return std::make_pair(0U, &Hexagon::VectorRegs128BRegClass);
         return std::make_pair(0U, &Hexagon::VecDblRegsRegClass);
-      case MVT::v256i8:
-      case MVT::v128i16:
-      case MVT::v64i32:
-      case MVT::v32i64:
+      case 2048:
         return std::make_pair(0U, &Hexagon::VecDblRegs128BRegClass);
       }
 
diff --git a/lib/Target/Hexagon/HexagonISelLowering.h b/lib/Target/Hexagon/HexagonISelLowering.h
index a8ed29e585d417121e7e0ce371d9cdcb14556ffe..aa0f00cd5bbf484fff609d9003558625d5226ce0 100644
--- a/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/lib/Target/Hexagon/HexagonISelLowering.h
@@ -50,7 +50,6 @@ namespace HexagonISD {
       JT,          // Jump table.
       CP,          // Constant pool.
 
-      POPCOUNT,
       COMBINE,
       PACKHL,
       VSPLATB,
@@ -86,6 +85,7 @@ namespace HexagonISD {
       TC_RETURN,
       EH_RETURN,
       DCFETCH,
+      READCYCLE,
 
       OP_END
     };
@@ -146,6 +146,7 @@ namespace HexagonISD {
     SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) const;
+    SDValue LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEH_LABEL(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const;
     SDValue
@@ -163,7 +164,7 @@ namespace HexagonISD {
     SDValue LowerToTLSLocalExecModel(GlobalAddressSDNode *GA,
         SelectionDAG &DAG) const;
     SDValue GetDynamicTLSAddr(SelectionDAG &DAG, SDValue Chain,
-        GlobalAddressSDNode *GA, SDValue *InFlag, EVT PtrVT,
+        GlobalAddressSDNode *GA, SDValue Glue, EVT PtrVT,
         unsigned ReturnReg, unsigned char OperandFlags) const;
     SDValue LowerGLOBAL_OFFSET_TABLE(SDValue Op, SelectionDAG &DAG) const;
 
@@ -179,7 +180,6 @@ namespace HexagonISD {
 
     SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerCTPOP(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerFRAMEADDR(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerATOMIC_FENCE(SDValue Op, SelectionDAG& DAG) const;
     SDValue LowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/Hexagon/HexagonInstrAlias.td b/lib/Target/Hexagon/HexagonInstrAlias.td
deleted file mode 100644
index 7283d94ee759e931596b4f636fa486af5c8c97d6..0000000000000000000000000000000000000000
--- a/lib/Target/Hexagon/HexagonInstrAlias.td
+++ /dev/null
@@ -1,652 +0,0 @@
-//==- HexagonInstrAlias.td - Hexagon Instruction Aliases ---*- tablegen -*--==//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//                     Hexagon Instruction Mappings
-//===----------------------------------------------------------------------===//
-
-
-def : InstAlias<"memb({GP}+#$addr) = $Nt.new",
-                (S2_storerbnewgp u16_0Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memh({GP}+#$addr) = $Nt.new",
-                (S2_storerhnewgp u16_1Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memw({GP}+#$addr) = $Nt.new",
-                (S2_storerinewgp u16_2Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memb({GP}+#$addr) = $Nt",
-                (S2_storerbgp u16_0Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memh({GP}+#$addr) = $Nt",
-                (S2_storerhgp u16_1Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memh({GP}+#$addr) = $Nt.h",
-                (S2_storerfgp u16_1Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memw({GP}+#$addr) = $Nt",
-                (S2_storerigp u16_2Imm:$addr, IntRegs:$Nt)>;
-def : InstAlias<"memd({GP}+#$addr) = $Nt",
-                (S2_storerdgp u16_3Imm:$addr, DoubleRegs:$Nt)>;
-
-def : InstAlias<"$Nt = memb({GP}+#$addr)",
-                (L2_loadrbgp IntRegs:$Nt, u16_0Imm:$addr)>;
-def : InstAlias<"$Nt = memub({GP}+#$addr)",
-                (L2_loadrubgp IntRegs:$Nt, u16_0Imm:$addr)>;
-def : InstAlias<"$Nt = memh({GP}+#$addr)",
-                (L2_loadrhgp IntRegs:$Nt, u16_1Imm:$addr)>;
-def : InstAlias<"$Nt = memuh({GP}+#$addr)",
-                (L2_loadruhgp IntRegs:$Nt, u16_1Imm:$addr)>;
-def : InstAlias<"$Nt = memw({GP}+#$addr)",
-                (L2_loadrigp IntRegs:$Nt, u16_2Imm:$addr)>;
-def : InstAlias<"$Nt = memd({GP}+#$addr)",
-                (L2_loadrdgp DoubleRegs:$Nt, u16_3Imm:$addr)>;
-
-// Alias of: memXX($Rs+#XX) = $Rt to memXX($Rs) = $Rt
-def : InstAlias<"memb($Rs) = $Rt",
-      (S2_storerb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memh($Rs) = $Rt",
-      (S2_storerh_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memh($Rs) = $Rt.h",
-      (S2_storerf_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memw($Rs) = $Rt",
-      (S2_storeri_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memb($Rs) = $Rt.new",
-      (S2_storerbnew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memh($Rs) = $Rt.new",
-      (S2_storerhnew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memw($Rs) = $Rt.new",
-      (S2_storerinew_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"memb($Rs) = #$S8",
-      (S4_storeirb_io IntRegs:$Rs, 0, s8_0Ext:$S8), 0>;
-
-def : InstAlias<"memh($Rs) = #$S8",
-      (S4_storeirh_io IntRegs:$Rs, 0, s8_0Ext:$S8), 0>;
-
-def : InstAlias<"memw($Rs) = #$S8",
-      (S4_storeiri_io IntRegs:$Rs, 0, s8_0Ext:$S8), 0>;
-
-def : InstAlias<"memd($Rs) = $Rtt",
-      (S2_storerd_io IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
-
-def : InstAlias<"memb($Rs) = setbit(#$U5)",
-      (L4_ior_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
-
-def : InstAlias<"memh($Rs) = setbit(#$U5)",
-      (L4_ior_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
-
-def : InstAlias<"memw($Rs) = setbit(#$U5)",
-      (L4_ior_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
-
-def : InstAlias<"memb($Rs) = clrbit(#$U5)",
-      (L4_iand_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
-
-def : InstAlias<"memh($Rs) = clrbit(#$U5)",
-      (L4_iand_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
-
-def : InstAlias<"memw($Rs) = clrbit(#$U5)",
-      (L4_iand_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>;
-
-// Alias of: $Rd = memXX($Rs+#XX) to $Rd = memXX($Rs)
-def : InstAlias<"$Rd = memb($Rs)",
-      (L2_loadrb_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rd = memub($Rs)",
-      (L2_loadrub_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rd = memh($Rs)",
-      (L2_loadrh_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rd = memuh($Rs)",
-      (L2_loadruh_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rd = memw($Rs)",
-      (L2_loadri_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rdd = memd($Rs)",
-      (L2_loadrd_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rd = memubh($Rs)",
-      (L2_loadbzw2_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rdd = memubh($Rs)",
-      (L2_loadbzw4_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rd = membh($Rs)",
-      (L2_loadbsw2_io IntRegs:$Rd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rdd = membh($Rs)",
-      (L2_loadbsw4_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rdd = memb_fifo($Rs)",
-      (L2_loadalignb_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"$Rdd = memh_fifo($Rs)",
-      (L2_loadalignh_io DoubleRegs:$Rdd, IntRegs:$Rs, 0), 0>;
-
-// Alias of: if ($Pt) $Rd = memXX($Rs + #$u6_X)
-//       to: if ($Pt) $Rd = memXX($Rs)
-def : InstAlias<"if ($Pt) $Rd = memb($Rs)",
-      (L2_ploadrbt_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt) $Rd = memub($Rs)",
-      (L2_ploadrubt_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt) $Rd = memh($Rs)",
-      (L2_ploadrht_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt) $Rd = memuh($Rs)",
-      (L2_ploadruht_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt) $Rd = memw($Rs)",
-      (L2_ploadrit_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt) $Rdd = memd($Rs)",
-      (L2_ploadrdt_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-// Alias of: if ($Pt) memXX($Rs + #$u6_X) = $Rt
-//       to: if ($Pt) memXX($Rs) = $Rt
-def : InstAlias<"if ($Pt) memb($Rs) = $Rt",
-      (S2_pstorerbt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memh($Rs) = $Rt",
-      (S2_pstorerht_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memh($Rs) = $Rt.h",
-      (S2_pstorerft_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memw($Rs) = $Rt",
-      (S2_pstorerit_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memd($Rs) = $Rtt",
-      (S2_pstorerdt_io PredRegs:$Pt, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
-
-def : InstAlias<"if ($Pt) memb($Rs) = $Rt.new",
-      (S2_pstorerbnewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memh($Rs) = $Rt.new",
-      (S2_pstorerhnewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memw($Rs) = $Rt.new",
-      (S2_pstorerinewt_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt.new) memb($Rs) = $Rt.new",
-      (S4_pstorerbnewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt.new) memh($Rs) = $Rt.new",
-      (S4_pstorerhnewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt.new) memw($Rs) = $Rt.new",
-      (S4_pstorerinewtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-
-// Alias of: if (!$Pt) $Rd = memXX($Rs + #$u6_X)
-//       to: if (!$Pt) $Rd = memXX($Rs)
-def : InstAlias<"if (!$Pt) $Rd = memb($Rs)",
-      (L2_ploadrbf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt) $Rd = memub($Rs)",
-      (L2_ploadrubf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt) $Rd = memh($Rs)",
-      (L2_ploadrhf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt) $Rd = memuh($Rs)",
-      (L2_ploadruhf_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt) $Rd = memw($Rs)",
-      (L2_ploadrif_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt) $Rdd = memd($Rs)",
-      (L2_ploadrdf_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-// Alias of: if (!$Pt) memXX($Rs + #$u6_X) = $Rt
-//       to: if (!$Pt) memXX($Rs) = $Rt
-def : InstAlias<"if (!$Pt) memb($Rs) = $Rt",
-      (S2_pstorerbf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt) memh($Rs) = $Rt",
-      (S2_pstorerhf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt) memh($Rs) = $Rt.h",
-      (S2_pstorerff_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt) memw($Rs) = $Rt",
-      (S2_pstorerif_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt) memd($Rs) = $Rtt",
-      (S2_pstorerdf_io PredRegs:$Pt, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
-
-def : InstAlias<"if (!$Pt) memb($Rs) = $Rt.new",
-      (S2_pstorerbnewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt) memh($Rs) = $Rt.new",
-      (S2_pstorerhnewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt) memw($Rs) = $Rt.new",
-      (S2_pstorerinewf_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt.new) memb($Rs) = $Rt.new",
-      (S4_pstorerbnewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt.new) memh($Rs) = $Rt.new",
-      (S4_pstorerhnewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pt.new) memw($Rs) = $Rt.new",
-      (S4_pstorerinewfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pt) memb($Rs) = #$S6",
-      (S4_storeirbt_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if ($Pt) memh($Rs) = #$S6",
-      (S4_storeirht_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if ($Pt) memw($Rs) = #$S6",
-      (S4_storeirit_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if ($Pt.new) memb($Rs) = #$S6",
-      (S4_storeirbtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if ($Pt.new) memh($Rs) = #$S6",
-      (S4_storeirhtnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if ($Pt.new) memw($Rs) = #$S6",
-      (S4_storeiritnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if (!$Pt) memb($Rs) = #$S6",
-      (S4_storeirbf_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if (!$Pt) memh($Rs) = #$S6",
-      (S4_storeirhf_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if (!$Pt) memw($Rs) = #$S6",
-      (S4_storeirif_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if (!$Pt.new) memb($Rs) = #$S6",
-      (S4_storeirbfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if (!$Pt.new) memh($Rs) = #$S6",
-      (S4_storeirhfnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-def : InstAlias<"if (!$Pt.new) memw($Rs) = #$S6",
-      (S4_storeirifnew_io PredRegs:$Pt, IntRegs:$Rs, 0, s6_0Ext:$S6), 0>;
-
-// Alias of: memXX($Rs + $u6_X) |= $Rt, also &=, +=, -=
-//       to: memXX($Rs) |= $Rt
-def : InstAlias<"memb($Rs) &= $Rt",
-      (L4_and_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
-      Requires<[UseMEMOP]>;
-
-def : InstAlias<"memb($Rs) |= $Rt",
-      (L4_or_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
-      Requires<[UseMEMOP]>;
-
-def : InstAlias<"memb($Rs) += $Rt",
-      (L4_add_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
-      Requires<[UseMEMOP]>;
-
-def : InstAlias<"memb($Rs) -= $Rt",
-      (L4_sub_memopb_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
-      Requires<[UseMEMOP]>;
-
-def : InstAlias<"memb($Rs) += #$U5",
-      (L4_iadd_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
-      Requires<[UseMEMOP]>;
-
-def : InstAlias<"memb($Rs) -= #$U5",
-      (L4_isub_memopb_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
-      Requires<[UseMEMOP]>;
-
-def : InstAlias<"memh($Rs) &= $Rt",
-      (L4_and_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
-      Requires<[UseMEMOP]>;
-
-def : InstAlias<"memh($Rs) |= $Rt",
-      (L4_or_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
-      Requires<[UseMEMOP]>;
-
-def : InstAlias<"memh($Rs) += $Rt",
-      (L4_add_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
-      Requires<[UseMEMOP]>;
-
-def : InstAlias<"memh($Rs) -= $Rt",
-      (L4_sub_memoph_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
-      Requires<[UseMEMOP]>;
-
-def : InstAlias<"memh($Rs) += #$U5",
-      (L4_iadd_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
-      Requires<[UseMEMOP]>;
-
-def : InstAlias<"memh($Rs) -= #$U5",
-      (L4_isub_memoph_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
-      Requires<[UseMEMOP]>;
-
-def : InstAlias<"memw($Rs) &= $Rt",
-      (L4_and_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
-      Requires<[UseMEMOP]>;
-
-def : InstAlias<"memw($Rs) |= $Rt",
-      (L4_or_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
-      Requires<[UseMEMOP]>;
-
-def : InstAlias<"memw($Rs) += $Rt",
-      (L4_add_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
-      Requires<[UseMEMOP]>;
-
-def : InstAlias<"memw($Rs) -= $Rt",
-      (L4_sub_memopw_io IntRegs:$Rs, 0, IntRegs:$Rt), 0>,
-      Requires<[UseMEMOP]>;
-
-def : InstAlias<"memw($Rs) += #$U5",
-      (L4_iadd_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
-      Requires<[UseMEMOP]>;
-
-def : InstAlias<"memw($Rs) -= #$U5",
-      (L4_isub_memopw_io IntRegs:$Rs, 0, u5_0Imm:$U5), 0>,
-      Requires<[UseMEMOP]>;
-
-//
-// Alias of: if ($Pv.new) memX($Rs) = $Rt
-//       to: if (p3.new) memX(r17 + #0) = $Rt
-def : InstAlias<"if ($Pv.new) memb($Rs) = $Rt",
-      (S4_pstorerbtnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pv.new) memh($Rs) = $Rt",
-      (S4_pstorerhtnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pv.new) memh($Rs) = $Rt.h",
-      (S4_pstorerftnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pv.new) memw($Rs) = $Rt",
-      (S4_pstoreritnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if ($Pv.new) memd($Rs) = $Rtt",
-      (S4_pstorerdtnew_io
-       PredRegs:$Pv, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
-
-def : InstAlias<"if (!$Pv.new) memb($Rs) = $Rt",
-      (S4_pstorerbfnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pv.new) memh($Rs) = $Rt",
-      (S4_pstorerhfnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pv.new) memh($Rs) = $Rt.h",
-      (S4_pstorerffnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pv.new) memw($Rs) = $Rt",
-      (S4_pstorerifnew_io PredRegs:$Pv, IntRegs:$Rs, 0, IntRegs:$Rt), 0>;
-
-def : InstAlias<"if (!$Pv.new) memd($Rs) = $Rtt",
-      (S4_pstorerdfnew_io
-       PredRegs:$Pv, IntRegs:$Rs, 0, DoubleRegs:$Rtt), 0>;
-
-//
-// Alias of: if ($Pt.new) $Rd = memub($Rs) -- And if (!$Pt.new) ...
-//       to: if ($Pt.new) $Rd = memub($Rs + #$u6_0)
-def : InstAlias<"if ($Pt.new) $Rd = memub($Rs)",
-      (L2_ploadrubtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt.new) $Rd = memb($Rs)",
-      (L2_ploadrbtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt.new) $Rd = memh($Rs)",
-      (L2_ploadrhtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt.new) $Rd = memuh($Rs)",
-      (L2_ploadruhtnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt.new) $Rd = memw($Rs)",
-      (L2_ploadritnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if ($Pt.new) $Rdd = memd($Rs)",
-      (L2_ploadrdtnew_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt.new) $Rd = memub($Rs)",
-      (L2_ploadrubfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt.new) $Rd = memb($Rs)",
-      (L2_ploadrbfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt.new) $Rd = memh($Rs)",
-      (L2_ploadrhfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt.new) $Rd = memuh($Rs)",
-      (L2_ploadruhfnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt.new) $Rd = memw($Rs)",
-      (L2_ploadrifnew_io IntRegs:$Rd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"if (!$Pt.new) $Rdd = memd($Rs)",
-      (L2_ploadrdfnew_io DoubleRegs:$Rdd, PredRegs:$Pt, IntRegs:$Rs, 0), 0>;
-
-def : InstAlias<"dcfetch($Rs)",
-      (Y2_dcfetchbo IntRegs:$Rs, 0), 0>;
-
-// Alias of some insn mappings, others must be handled by the parser
-def : InstAlias<"$Pd=cmp.lt($Rs, $Rt)",
-      (C2_cmpgt PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
-def : InstAlias<"$Pd=cmp.ltu($Rs, $Rt)",
-      (C2_cmpgtu PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
-
-// Rd=neg(Rs) is aliased to Rd=sub(#0,Rs)
-def : InstAlias<"$Rd = neg($Rs)",
-      (A2_subri IntRegs:$Rd, 0, IntRegs:$Rs), 0>;
-
-def : InstAlias<"m0 = $Rs", (A2_tfrrcr C6, IntRegs:$Rs)>;
-def : InstAlias<"$Rd = m0", (A2_tfrcrr IntRegs:$Rd, C6)>;
-def : InstAlias<"m1 = $Rs", (A2_tfrrcr C7, IntRegs:$Rs)>;
-def : InstAlias<"$Rd = m1", (A2_tfrcrr IntRegs:$Rd, C7)>;
-
-def : InstAlias<"$Pd = $Ps",
-      (C2_or PredRegs:$Pd, PredRegs:$Ps, PredRegs:$Ps), 0>;
-
-def : InstAlias<"$Rdd = vaddb($Rss, $Rtt)",
-      (A2_vaddub DoubleRegs:$Rdd, DoubleRegs:$Rss, DoubleRegs:$Rtt), 1>;
-
-def : InstAlias<"$Rdd = vsubb($Rss,$Rtt)",
-      (A2_vsubub DoubleRegs:$Rdd, DoubleRegs:$Rss, DoubleRegs:$Rtt), 0>;
-
-def : InstAlias<"$Rd = mpyui($Rs,$Rt)",
-      (M2_mpyi IntRegs:$Rd, IntRegs:$Rs, IntRegs:$Rt), 0>;
-
-// Assembler mapped insns: cmp.lt(a,b) -> cmp.gt(b,a)
-def : InstAlias<"$Pd=cmp.lt($Rs, $Rt)",
-      (C2_cmpgt PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
-def : InstAlias<"$Pd=cmp.ltu($Rs, $Rt)",
-      (C2_cmpgtu PredRegs:$Pd, IntRegs:$Rt, IntRegs:$Rs), 0>;
-
-// maps if (!Pu) jumpr Rs -> if (!Pu) jumpr:nt Rs
-def : InstAlias<"if (!$Pu) jumpr $Rs",
-      (J2_jumprf PredRegs:$Pu, IntRegs:$Rs)>,
-      Requires<[HasV60T]>;
-
-// maps if (Pu) jumpr Rs -> if (Pu) jumpr:nt Rs
-def : InstAlias<"if ($Pu) jumpr $Rs",
-      (J2_jumprt PredRegs:$Pu, IntRegs:$Rs)>,
-      Requires<[HasV60T]>;
-
-// maps if (!Pu) jump $r15_2 -> if (!Pu) jump:nt $r15_2
-def : InstAlias<"if (!$Pu) jump $r15_2",
-      (J2_jumpf PredRegs:$Pu, brtarget:$r15_2)>,
-      Requires<[HasV60T]>;
-
-// maps if (Pu) jump $r15_2 -> if (Pu) jump:nt $r15_2
-def : InstAlias<"if ($Pu) jump $r15_2",
-     (J2_jumpt PredRegs:$Pu, brtarget:$r15_2)>,
-     Requires<[HasV60T]>;
-
-def : InstAlias<"if ($src) jump $r15_2",
-      (J2_jumpt PredRegs:$src, brtarget:$r15_2), 0>;
-
-def : InstAlias<"if (!$src) jump $r15_2",
-      (J2_jumpf PredRegs:$src, brtarget:$r15_2), 0>;
-
-def : InstAlias<"if ($src1) jumpr $src2",
-      (J2_jumprt PredRegs:$src1, IntRegs:$src2), 0>;
-
-def : InstAlias<"if (!$src1) jumpr $src2",
-      (J2_jumprf PredRegs:$src1, IntRegs:$src2), 0>;
-
-// maps Vdd = Vss to Vdd = V6_vassignp(Vss)
-def : InstAlias<"$Vdd = $Vss",
-      (V6_vassignp VecDblRegs:$Vdd, VecDblRegs:$Vss)>,
-      Requires<[HasV60T]>;
-
-// maps Vd = #0 to Vd = vxor(Vd, Vd)
-def : InstAlias<"$Vd = #0",
-      (V6_vxor VectorRegs:$Vd, VectorRegs:$Vd, VectorRegs:$Vd)>,
-      Requires<[HasV60T]>;
-
-// maps Vdd  = #0 to Vdd = vsub(Vdd, Vdd)
-def : InstAlias<"$Vdd = #0",
-      (V6_vsubw_dv VecDblRegs:$Vdd, VecDblRegs:$Vdd, VecDblRegs:$Vdd)>,
-      Requires<[HasV60T]>;
-
-// maps   "$Qd = vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd = vcmp.eq($Vu.h, $Vv.h)"
-def : InstAlias<"$Qd = vcmp.eq($Vu.uh, $Vv.uh)",
-      (V6_veqh VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
-      Requires<[HasV60T]>;
-
-// maps   "$Qd &= vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd &= vcmp.eq($Vu.h, $Vv.h)"
-def : InstAlias<"$Qd &= vcmp.eq($Vu.uh, $Vv.uh)",
-      (V6_veqh_and VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
-      Requires<[HasV60T]>;
-
-// maps   "$Qd |= vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd |= vcmp.eq($Vu.h, $Vv.h)"
-def : InstAlias<"$Qd |= vcmp.eq($Vu.uh, $Vv.uh)",
-      (V6_veqh_or VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
-      Requires<[HasV60T]>;
-
-// maps   "$Qd ^= vcmp.eq($Vu.uh, $Vv.uh)" -> "$Qd ^= vcmp.eq($Vu.h, $Vv.h)"
-def : InstAlias<"$Qd ^= vcmp.eq($Vu.uh, $Vv.uh)",
-      (V6_veqh_xor VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
-      Requires<[HasV60T]>;
-
-// maps   "$Qd = vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd = vcmp.eq($Vu.w, $Vv.w)"
-def : InstAlias<"$Qd = vcmp.eq($Vu.uw, $Vv.uw)",
-      (V6_veqw VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
-      Requires<[HasV60T]>;
-
-// maps   "$Qd &= vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd &= vcmp.eq($Vu.w, $Vv.w)"
-def : InstAlias<"$Qd &= vcmp.eq($Vu.uw, $Vv.uw)",
-      (V6_veqw_and VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
-      Requires<[HasV60T]>;
-
-// maps   "$Qd |= vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd |= vcmp.eq($Vu.w, $Vv.w)"
-def : InstAlias<"$Qd |= vcmp.eq($Vu.uw, $Vv.uw)",
-      (V6_veqh_or VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
-      Requires<[HasV60T]>;
-
-// maps   "$Qd ^= vcmp.eq($Vu.uw, $Vv.uw)" -> "$Qd ^= vcmp.eq($Vu.w, $Vv.w)"
-def : InstAlias<"$Qd ^= vcmp.eq($Vu.uw, $Vv.uw)",
-      (V6_veqw_xor VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
-      Requires<[HasV60T]>;
-
-// maps   "$Qd = vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd = vcmp.eq($Vu.b, $Vv.b)"
-def : InstAlias<"$Qd = vcmp.eq($Vu.ub, $Vv.ub)",
-      (V6_veqb VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
-      Requires<[HasV60T]>;
-
-// maps   "$Qd &= vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd &= vcmp.eq($Vu.b, $Vv.b)"
-def : InstAlias<"$Qd &= vcmp.eq($Vu.ub, $Vv.ub)",
-      (V6_veqb_and VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
-      Requires<[HasV60T]>;
-
-// maps   "$Qd |= vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd |= vcmp.eq($Vu.b, $Vv.b)"
-def : InstAlias<"$Qd |= vcmp.eq($Vu.ub, $Vv.ub)",
-      (V6_veqb_or VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
-      Requires<[HasV60T]>;
-
-// maps   "$Qd ^= vcmp.eq($Vu.ub, $Vv.ub)" -> "$Qd ^= vcmp.eq($Vu.b, $Vv.b)"
-def : InstAlias<"$Qd ^= vcmp.eq($Vu.ub, $Vv.ub)",
-      (V6_veqb_xor VecPredRegs:$Qd, VectorRegs:$Vu, VectorRegs:$Vv)>,
-      Requires<[HasV60T]>;
-
-// maps   "$Rd.w = vextract($Vu, $Rs)" -> "$Rd = vextract($Vu, $Rs)"
-def : InstAlias<"$Rd.w = vextract($Vu, $Rs)",
-      (V6_extractw IntRegs:$Rd, VectorRegs:$Vu, IntRegs:$Rs)>,
-      Requires<[HasV60T]>;
-
-// Mapping from vtrans2x2(Vy32,Vx32,Rt32) to vshuff(Vy32,Vx32,Rt32)
-def : InstAlias<"vtrans2x2($Vy, $Vx, $Rt)",
-      (V6_vshuff VectorRegs:$Vy, VectorRegs:$Vx, IntRegs:$Rt)>,
-      Requires<[HasV60T]>;
-
-def : InstAlias<"$Vt=vmem($Rs)",
-      (V6_vL32b_ai VectorRegs:$Vt, IntRegs:$Rs, 0)>,
-      Requires<[HasV60T]>;
-
-def : InstAlias<"$Vt=vmem($Rs):nt",
-      (V6_vL32b_nt_ai VectorRegs:$Vt, IntRegs:$Rs, 0)>,
-      Requires<[HasV60T]>;
-
-def : InstAlias<"vmem($Rs)=$Vt",
-      (V6_vS32b_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
-      Requires<[HasV60T]>;
-
-def : InstAlias<"vmem($Rs):nt=$Vt",
-      (V6_vS32b_nt_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
-      Requires<[HasV60T]>;
-
-def : InstAlias<"vmem($Rs)=$Vt.new",
-      (V6_vS32b_new_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
-      Requires<[HasV60T]>;
-
-def : InstAlias<"vmem($Rs):nt=$Vt.new",
-      (V6_vS32b_nt_new_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
-      Requires<[HasV60T]>;
-
-def : InstAlias<"if ($Qv) vmem($Rs)=$Vt",
-      (V6_vS32b_qpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
-      Requires<[HasV60T]>;
-
-def : InstAlias<"if (!$Qv) vmem($Rs)=$Vt",
-      (V6_vS32b_nqpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
-      Requires<[HasV60T]>;
-
-def : InstAlias<"if ($Qv) vmem($Rs):nt=$Vt",
-      (V6_vS32b_nt_qpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
-      Requires<[HasV60T]>;
-
-def : InstAlias<"if (!$Qv) vmem($Rs):nt=$Vt",
-      (V6_vS32b_nt_nqpred_ai VecPredRegs:$Qv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
-      Requires<[HasV60T]>;
-
-def : InstAlias<"if ($Pv) vmem($Rs)=$Vt",
-      (V6_vS32b_pred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
-      Requires<[HasV60T]>;
-
-def : InstAlias<"if (!$Pv) vmem($Rs)=$Vt",
-      (V6_vS32b_npred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
-      Requires<[HasV60T]>;
-
-def : InstAlias<"if ($Pv) vmem($Rs):nt=$Vt",
-      (V6_vS32b_nt_pred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
-      Requires<[HasV60T]>;
-
-def : InstAlias<"if (!$Pv) vmem($Rs):nt=$Vt",
-      (V6_vS32b_nt_npred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
-      Requires<[HasV60T]>;
-
-def : InstAlias<"$Vt=vmemu($Rs)",
-      (V6_vL32Ub_ai VectorRegs:$Vt, IntRegs:$Rs, 0)>,
-      Requires<[HasV60T]>;
-
-def : InstAlias<"vmemu($Rs)=$Vt",
-      (V6_vS32Ub_ai IntRegs:$Rs, 0, VectorRegs:$Vt)>,
-      Requires<[HasV60T]>;
-
-def : InstAlias<"if ($Pv) vmemu($Rs)=$Vt",
-      (V6_vS32Ub_pred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
-      Requires<[HasV60T]>;
-
-def : InstAlias<"if (!$Pv) vmemu($Rs)=$Vt",
-      (V6_vS32Ub_npred_ai PredRegs:$Pv, IntRegs:$Rs, 0, VectorRegs:$Vt)>,
-      Requires<[HasV60T]>;
-
-
diff --git a/lib/Target/Hexagon/HexagonInstrEnc.td b/lib/Target/Hexagon/HexagonInstrEnc.td
deleted file mode 100644
index 280832fd167f452faa719233f2d1dffecceaae0b..0000000000000000000000000000000000000000
--- a/lib/Target/Hexagon/HexagonInstrEnc.td
+++ /dev/null
@@ -1,1019 +0,0 @@
-class Enc_COPROC_VX_3op_v<bits<15> opc> : OpcodeHexagon {
-  bits<5> dst;
-  bits<5> src1;
-  bits<5> src2;
-
-  let Inst{31-16} = { opc{14-4}, src2};
-  let Inst{13-0} = { opc{3}, src1, opc{2-0}, dst};
-}
-
-class V6_vtmpyb_enc : Enc_COPROC_VX_3op_v<0b000110010000000>;
-class V6_vtmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000001>;
-class V6_vdmpyhb_enc : Enc_COPROC_VX_3op_v<0b000110010000010>;
-class V6_vrmpyub_enc : Enc_COPROC_VX_3op_v<0b000110010000011>;
-class V6_vrmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000100>;
-class V6_vdsaduh_enc : Enc_COPROC_VX_3op_v<0b000110010000101>;
-class V6_vdmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010000110>;
-class V6_vdmpybus_dv_enc : Enc_COPROC_VX_3op_v<0b000110010000111>;
-class V6_vtmpyb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001000>;
-class V6_vtmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001001>;
-class V6_vtmpyhb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001010>;
-class V6_vdmpyhb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001011>;
-class V6_vrmpyub_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001100>;
-class V6_vrmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001101>;
-class V6_vdmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001110>;
-class V6_vdmpybus_dv_acc_enc : Enc_COPROC_VX_3op_v<0b000110010001111>;
-class V6_vdmpyhsusat_enc : Enc_COPROC_VX_3op_v<0b000110010010000>;
-class V6_vdmpyhsuisat_enc : Enc_COPROC_VX_3op_v<0b000110010010001>;
-class V6_vdmpyhsat_enc : Enc_COPROC_VX_3op_v<0b000110010010010>;
-class V6_vdmpyhisat_enc : Enc_COPROC_VX_3op_v<0b000110010010011>;
-class V6_vdmpyhb_dv_enc : Enc_COPROC_VX_3op_v<0b000110010010100>;
-class V6_vmpybus_enc : Enc_COPROC_VX_3op_v<0b000110010010101>;
-class V6_vmpabus_enc : Enc_COPROC_VX_3op_v<0b000110010010110>;
-class V6_vmpahb_enc : Enc_COPROC_VX_3op_v<0b000110010010111>;
-class V6_vdmpyhsusat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011000>;
-class V6_vdmpyhsuisat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011001>;
-class V6_vdmpyhisat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011010>;
-class V6_vdmpyhsat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011011>;
-class V6_vdmpyhb_dv_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011100>;
-class V6_vmpybus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011101>;
-class V6_vmpabus_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011110>;
-class V6_vmpahb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010011111>;
-class V6_vmpyh_enc : Enc_COPROC_VX_3op_v<0b000110010100000>;
-class V6_vmpyhss_enc : Enc_COPROC_VX_3op_v<0b000110010100001>;
-class V6_vmpyhsrs_enc : Enc_COPROC_VX_3op_v<0b000110010100010>;
-class V6_vmpyuh_enc : Enc_COPROC_VX_3op_v<0b000110010100011>;
-class V6_vmpyhsat_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101000>;
-class V6_vmpyuh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101001>;
-class V6_vmpyiwb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101010>;
-class V6_vmpyiwh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010101011>;
-class V6_vmpyihb_enc : Enc_COPROC_VX_3op_v<0b000110010110000>;
-class V6_vror_enc : Enc_COPROC_VX_3op_v<0b000110010110001>;
-class V6_vasrw_enc : Enc_COPROC_VX_3op_v<0b000110010110101>;
-class V6_vasrh_enc : Enc_COPROC_VX_3op_v<0b000110010110110>;
-class V6_vaslw_enc : Enc_COPROC_VX_3op_v<0b000110010110111>;
-class V6_vdsaduh_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111000>;
-class V6_vmpyihb_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111001>;
-class V6_vaslw_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111010>;
-class V6_vasrw_acc_enc : Enc_COPROC_VX_3op_v<0b000110010111101>;
-class V6_vaslh_enc : Enc_COPROC_VX_3op_v<0b000110011000000>;
-class V6_vlsrw_enc : Enc_COPROC_VX_3op_v<0b000110011000001>;
-class V6_vlsrh_enc : Enc_COPROC_VX_3op_v<0b000110011000010>;
-class V6_vmpyiwh_enc : Enc_COPROC_VX_3op_v<0b000110011000111>;
-class V6_vmpyub_acc_enc : Enc_COPROC_VX_3op_v<0b000110011001000>;
-class V6_vmpyiwb_enc : Enc_COPROC_VX_3op_v<0b000110011010000>;
-class V6_vtmpyhb_enc : Enc_COPROC_VX_3op_v<0b000110011010100>;
-class V6_vmpyub_enc : Enc_COPROC_VX_3op_v<0b000110011100000>;
-class V6_vrmpyubv_enc : Enc_COPROC_VX_3op_v<0b000111000000000>;
-class V6_vrmpybv_enc : Enc_COPROC_VX_3op_v<0b000111000000001>;
-class V6_vrmpybusv_enc : Enc_COPROC_VX_3op_v<0b000111000000010>;
-class V6_vdmpyhvsat_enc : Enc_COPROC_VX_3op_v<0b000111000000011>;
-class V6_vmpybv_enc : Enc_COPROC_VX_3op_v<0b000111000000100>;
-class V6_vmpyubv_enc : Enc_COPROC_VX_3op_v<0b000111000000101>;
-class V6_vmpybusv_enc : Enc_COPROC_VX_3op_v<0b000111000000110>;
-class V6_vmpyhv_enc : Enc_COPROC_VX_3op_v<0b000111000000111>;
-class V6_vrmpyubv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001000>;
-class V6_vrmpybv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001001>;
-class V6_vrmpybusv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001010>;
-class V6_vdmpyhvsat_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001011>;
-class V6_vmpybv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001100>;
-class V6_vmpyubv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001101>;
-class V6_vmpybusv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001110>;
-class V6_vmpyhv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000001111>;
-class V6_vmpyuhv_enc : Enc_COPROC_VX_3op_v<0b000111000010000>;
-class V6_vmpyhvsrs_enc : Enc_COPROC_VX_3op_v<0b000111000010001>;
-class V6_vmpyhus_enc : Enc_COPROC_VX_3op_v<0b000111000010010>;
-class V6_vmpabusv_enc : Enc_COPROC_VX_3op_v<0b000111000010011>;
-class V6_vmpyih_enc : Enc_COPROC_VX_3op_v<0b000111000010100>;
-class V6_vand_enc : Enc_COPROC_VX_3op_v<0b000111000010101>;
-class V6_vor_enc : Enc_COPROC_VX_3op_v<0b000111000010110>;
-class V6_vxor_enc : Enc_COPROC_VX_3op_v<0b000111000010111>;
-class V6_vmpyuhv_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011000>;
-class V6_vmpyhus_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011001>;
-class V6_vmpyih_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011100>;
-class V6_vmpyiewuh_acc_enc : Enc_COPROC_VX_3op_v<0b000111000011101>;
-class V6_vmpyowh_sacc_enc : Enc_COPROC_VX_3op_v<0b000111000011110>;
-class V6_vmpyowh_rnd_sacc_enc : Enc_COPROC_VX_3op_v<0b000111000011111>;
-class V6_vaddw_enc : Enc_COPROC_VX_3op_v<0b000111000100000>;
-class V6_vaddubsat_enc : Enc_COPROC_VX_3op_v<0b000111000100001>;
-class V6_vadduhsat_enc : Enc_COPROC_VX_3op_v<0b000111000100010>;
-class V6_vaddhsat_enc : Enc_COPROC_VX_3op_v<0b000111000100011>;
-class V6_vaddwsat_enc : Enc_COPROC_VX_3op_v<0b000111000100100>;
-class V6_vsubb_enc : Enc_COPROC_VX_3op_v<0b000111000100101>;
-class V6_vsubh_enc : Enc_COPROC_VX_3op_v<0b000111000100110>;
-class V6_vsubw_enc : Enc_COPROC_VX_3op_v<0b000111000100111>;
-class V6_vmpyiewh_acc_enc : Enc_COPROC_VX_3op_v<0b000111000101000>;
-class V6_vsububsat_enc : Enc_COPROC_VX_3op_v<0b000111000110000>;
-class V6_vsubuhsat_enc : Enc_COPROC_VX_3op_v<0b000111000110001>;
-class V6_vsubhsat_enc : Enc_COPROC_VX_3op_v<0b000111000110010>;
-class V6_vsubwsat_enc : Enc_COPROC_VX_3op_v<0b000111000110011>;
-class V6_vaddb_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110100>;
-class V6_vaddh_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110101>;
-class V6_vaddw_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110110>;
-class V6_vaddubsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111000110111>;
-class V6_vadduhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000000>;
-class V6_vaddhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000001>;
-class V6_vaddwsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000010>;
-class V6_vsubb_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000011>;
-class V6_vsubh_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000100>;
-class V6_vsubw_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000101>;
-class V6_vsububsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000110>;
-class V6_vsubuhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001000111>;
-class V6_vsubhsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001010000>;
-class V6_vsubwsat_dv_enc : Enc_COPROC_VX_3op_v<0b000111001010001>;
-class V6_vaddubh_enc : Enc_COPROC_VX_3op_v<0b000111001010010>;
-class V6_vadduhw_enc : Enc_COPROC_VX_3op_v<0b000111001010011>;
-class V6_vaddhw_enc : Enc_COPROC_VX_3op_v<0b000111001010100>;
-class V6_vsububh_enc : Enc_COPROC_VX_3op_v<0b000111001010101>;
-class V6_vsubuhw_enc : Enc_COPROC_VX_3op_v<0b000111001010110>;
-class V6_vsubhw_enc : Enc_COPROC_VX_3op_v<0b000111001010111>;
-class V6_vabsdiffub_enc : Enc_COPROC_VX_3op_v<0b000111001100000>;
-class V6_vabsdiffh_enc : Enc_COPROC_VX_3op_v<0b000111001100001>;
-class V6_vabsdiffuh_enc : Enc_COPROC_VX_3op_v<0b000111001100010>;
-class V6_vabsdiffw_enc : Enc_COPROC_VX_3op_v<0b000111001100011>;
-class V6_vavgub_enc : Enc_COPROC_VX_3op_v<0b000111001100100>;
-class V6_vavguh_enc : Enc_COPROC_VX_3op_v<0b000111001100101>;
-class V6_vavgh_enc : Enc_COPROC_VX_3op_v<0b000111001100110>;
-class V6_vavgw_enc : Enc_COPROC_VX_3op_v<0b000111001100111>;
-class V6_vnavgub_enc : Enc_COPROC_VX_3op_v<0b000111001110000>;
-class V6_vnavgh_enc : Enc_COPROC_VX_3op_v<0b000111001110001>;
-class V6_vnavgw_enc : Enc_COPROC_VX_3op_v<0b000111001110010>;
-class V6_vavgubrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110011>;
-class V6_vavguhrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110100>;
-class V6_vavghrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110101>;
-class V6_vavgwrnd_enc : Enc_COPROC_VX_3op_v<0b000111001110110>;
-class V6_vmpabuuv_enc : Enc_COPROC_VX_3op_v<0b000111001110111>;
-class V6_vminub_enc : Enc_COPROC_VX_3op_v<0b000111110000001>;
-class V6_vminuh_enc : Enc_COPROC_VX_3op_v<0b000111110000010>;
-class V6_vminh_enc : Enc_COPROC_VX_3op_v<0b000111110000011>;
-class V6_vminw_enc : Enc_COPROC_VX_3op_v<0b000111110000100>;
-class V6_vmaxub_enc : Enc_COPROC_VX_3op_v<0b000111110000101>;
-class V6_vmaxuh_enc : Enc_COPROC_VX_3op_v<0b000111110000110>;
-class V6_vmaxh_enc : Enc_COPROC_VX_3op_v<0b000111110000111>;
-class V6_vmaxw_enc : Enc_COPROC_VX_3op_v<0b000111110010000>;
-class V6_vdelta_enc : Enc_COPROC_VX_3op_v<0b000111110010001>;
-class V6_vrdelta_enc : Enc_COPROC_VX_3op_v<0b000111110010011>;
-class V6_vdealb4w_enc : Enc_COPROC_VX_3op_v<0b000111110010111>;
-class V6_vmpyowh_rnd_enc : Enc_COPROC_VX_3op_v<0b000111110100000>;
-class V6_vshuffeb_enc : Enc_COPROC_VX_3op_v<0b000111110100001>;
-class V6_vshuffob_enc : Enc_COPROC_VX_3op_v<0b000111110100010>;
-class V6_vshufeh_enc : Enc_COPROC_VX_3op_v<0b000111110100011>;
-class V6_vshufoh_enc : Enc_COPROC_VX_3op_v<0b000111110100100>;
-class V6_vshufoeh_enc : Enc_COPROC_VX_3op_v<0b000111110100101>;
-class V6_vshufoeb_enc : Enc_COPROC_VX_3op_v<0b000111110100110>;
-class V6_vcombine_enc : Enc_COPROC_VX_3op_v<0b000111110100111>;
-class V6_vmpyieoh_enc : Enc_COPROC_VX_3op_v<0b000111110110000>;
-class V6_vsathub_enc : Enc_COPROC_VX_3op_v<0b000111110110010>;
-class V6_vsatwh_enc : Enc_COPROC_VX_3op_v<0b000111110110011>;
-class V6_vroundwh_enc : Enc_COPROC_VX_3op_v<0b000111110110100>;
-class V6_vroundwuh_enc : Enc_COPROC_VX_3op_v<0b000111110110101>;
-class V6_vroundhb_enc : Enc_COPROC_VX_3op_v<0b000111110110110>;
-class V6_vroundhub_enc : Enc_COPROC_VX_3op_v<0b000111110110111>;
-class V6_vasrwv_enc : Enc_COPROC_VX_3op_v<0b000111111010000>;
-class V6_vlsrwv_enc : Enc_COPROC_VX_3op_v<0b000111111010001>;
-class V6_vlsrhv_enc : Enc_COPROC_VX_3op_v<0b000111111010010>;
-class V6_vasrhv_enc : Enc_COPROC_VX_3op_v<0b000111111010011>;
-class V6_vaslwv_enc : Enc_COPROC_VX_3op_v<0b000111111010100>;
-class V6_vaslhv_enc : Enc_COPROC_VX_3op_v<0b000111111010101>;
-class V6_vaddb_enc : Enc_COPROC_VX_3op_v<0b000111111010110>;
-class V6_vaddh_enc : Enc_COPROC_VX_3op_v<0b000111111010111>;
-class V6_vmpyiewuh_enc : Enc_COPROC_VX_3op_v<0b000111111100000>;
-class V6_vmpyiowh_enc : Enc_COPROC_VX_3op_v<0b000111111100001>;
-class V6_vpackeb_enc : Enc_COPROC_VX_3op_v<0b000111111100010>;
-class V6_vpackeh_enc : Enc_COPROC_VX_3op_v<0b000111111100011>;
-class V6_vpackhub_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100101>;
-class V6_vpackhb_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100110>;
-class V6_vpackwuh_sat_enc : Enc_COPROC_VX_3op_v<0b000111111100111>;
-class V6_vpackwh_sat_enc : Enc_COPROC_VX_3op_v<0b000111111110000>;
-class V6_vpackob_enc : Enc_COPROC_VX_3op_v<0b000111111110001>;
-class V6_vpackoh_enc : Enc_COPROC_VX_3op_v<0b000111111110010>;
-class V6_vmpyewuh_enc : Enc_COPROC_VX_3op_v<0b000111111110101>;
-class V6_vmpyowh_enc : Enc_COPROC_VX_3op_v<0b000111111110111>;
-class V6_extractw_enc : Enc_COPROC_VX_3op_v<0b100100100000001>;
-class M6_vabsdiffub_enc : Enc_COPROC_VX_3op_v<0b111010001010000>;
-class M6_vabsdiffb_enc : Enc_COPROC_VX_3op_v<0b111010001110000>;
-
-class Enc_COPROC_VX_cmp<bits<13> opc> : OpcodeHexagon {
-  bits<2> dst;
-  bits<5> src1;
-  bits<5> src2;
-
-  let Inst{31-16} = { 0b00011, opc{12-7}, src2{4-0} };
-  let Inst{13-0} = { opc{6}, src1{4-0}, opc{5-0}, dst{1-0} };
-}
-
-class V6_vandvrt_acc_enc : Enc_COPROC_VX_cmp<0b0010111100000>;
-class V6_vandvrt_enc : Enc_COPROC_VX_cmp<0b0011010010010>;
-class V6_veqb_and_enc : Enc_COPROC_VX_cmp<0b1001001000000>;
-class V6_veqh_and_enc : Enc_COPROC_VX_cmp<0b1001001000001>;
-class V6_veqw_and_enc : Enc_COPROC_VX_cmp<0b1001001000010>;
-class V6_vgtb_and_enc : Enc_COPROC_VX_cmp<0b1001001000100>;
-class V6_vgth_and_enc : Enc_COPROC_VX_cmp<0b1001001000101>;
-class V6_vgtw_and_enc : Enc_COPROC_VX_cmp<0b1001001000110>;
-class V6_vgtub_and_enc : Enc_COPROC_VX_cmp<0b1001001001000>;
-class V6_vgtuh_and_enc : Enc_COPROC_VX_cmp<0b1001001001001>;
-class V6_vgtuw_and_enc : Enc_COPROC_VX_cmp<0b1001001001010>;
-class V6_veqb_or_enc : Enc_COPROC_VX_cmp<0b1001001010000>;
-class V6_veqh_or_enc : Enc_COPROC_VX_cmp<0b1001001010001>;
-class V6_veqw_or_enc : Enc_COPROC_VX_cmp<0b1001001010010>;
-class V6_vgtb_or_enc : Enc_COPROC_VX_cmp<0b1001001010100>;
-class V6_vgth_or_enc : Enc_COPROC_VX_cmp<0b1001001010101>;
-class V6_vgtw_or_enc : Enc_COPROC_VX_cmp<0b1001001010110>;
-class V6_vgtub_or_enc : Enc_COPROC_VX_cmp<0b1001001011000>;
-class V6_vgtuh_or_enc : Enc_COPROC_VX_cmp<0b1001001011001>;
-class V6_vgtuw_or_enc : Enc_COPROC_VX_cmp<0b1001001011010>;
-class V6_veqb_xor_enc : Enc_COPROC_VX_cmp<0b1001001100000>;
-class V6_veqh_xor_enc : Enc_COPROC_VX_cmp<0b1001001100001>;
-class V6_veqw_xor_enc : Enc_COPROC_VX_cmp<0b1001001100010>;
-class V6_vgtb_xor_enc : Enc_COPROC_VX_cmp<0b1001001100100>;
-class V6_vgth_xor_enc : Enc_COPROC_VX_cmp<0b1001001100101>;
-class V6_vgtw_xor_enc : Enc_COPROC_VX_cmp<0b1001001100110>;
-class V6_vgtub_xor_enc : Enc_COPROC_VX_cmp<0b1001001101000>;
-class V6_vgtuh_xor_enc : Enc_COPROC_VX_cmp<0b1001001101001>;
-class V6_vgtuw_xor_enc : Enc_COPROC_VX_cmp<0b1001001101010>;
-class V6_veqb_enc : Enc_COPROC_VX_cmp<0b1111000000000>;
-class V6_veqh_enc : Enc_COPROC_VX_cmp<0b1111000000001>;
-class V6_veqw_enc : Enc_COPROC_VX_cmp<0b1111000000010>;
-class V6_vgtb_enc : Enc_COPROC_VX_cmp<0b1111000000100>;
-class V6_vgth_enc : Enc_COPROC_VX_cmp<0b1111000000101>;
-class V6_vgtw_enc : Enc_COPROC_VX_cmp<0b1111000000110>;
-class V6_vgtub_enc : Enc_COPROC_VX_cmp<0b1111000001000>;
-class V6_vgtuh_enc : Enc_COPROC_VX_cmp<0b1111000001001>;
-class V6_vgtuw_enc : Enc_COPROC_VX_cmp<0b1111000001010>;
-
-class Enc_COPROC_VX_p2op<bits<5> opc> : OpcodeHexagon {
-  bits<2> src1;
-  bits<5> dst;
-  bits<5> src2;
-
-  let Inst{31-16} = { 0b00011110, src1{1-0}, 0b0000, opc{4-3} };
-  let Inst{13-0} = { 1, src2{4-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_vaddbq_enc : Enc_COPROC_VX_p2op<0b01000>;
-class V6_vaddhq_enc : Enc_COPROC_VX_p2op<0b01001>;
-class V6_vaddwq_enc : Enc_COPROC_VX_p2op<0b01010>;
-class V6_vaddbnq_enc : Enc_COPROC_VX_p2op<0b01011>;
-class V6_vaddhnq_enc : Enc_COPROC_VX_p2op<0b01100>;
-class V6_vaddwnq_enc : Enc_COPROC_VX_p2op<0b01101>;
-class V6_vsubbq_enc : Enc_COPROC_VX_p2op<0b01110>;
-class V6_vsubhq_enc : Enc_COPROC_VX_p2op<0b01111>;
-class V6_vsubwq_enc : Enc_COPROC_VX_p2op<0b10000>;
-class V6_vsubbnq_enc : Enc_COPROC_VX_p2op<0b10001>;
-class V6_vsubhnq_enc : Enc_COPROC_VX_p2op<0b10010>;
-class V6_vsubwnq_enc : Enc_COPROC_VX_p2op<0b10011>;
-
-class Enc_COPROC_VX_2op<bits<6> opc> : OpcodeHexagon {
-  bits<5> dst;
-  bits<5> src1;
-
-  let Inst{31-16} = { 0b00011110000000, opc{5-4} };
-  let Inst{13-0} = { opc{3}, src1{4-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_vabsh_enc : Enc_COPROC_VX_2op<0b000000>;
-class V6_vabsh_sat_enc : Enc_COPROC_VX_2op<0b000001>;
-class V6_vabsw_enc : Enc_COPROC_VX_2op<0b000010>;
-class V6_vabsw_sat_enc : Enc_COPROC_VX_2op<0b000011>;
-class V6_vnot_enc : Enc_COPROC_VX_2op<0b000100>;
-class V6_vdealh_enc : Enc_COPROC_VX_2op<0b000110>;
-class V6_vdealb_enc : Enc_COPROC_VX_2op<0b000111>;
-class V6_vunpackob_enc : Enc_COPROC_VX_2op<0b001000>;
-class V6_vunpackoh_enc : Enc_COPROC_VX_2op<0b001001>;
-class V6_vunpackub_enc : Enc_COPROC_VX_2op<0b010000>;
-class V6_vunpackuh_enc : Enc_COPROC_VX_2op<0b010001>;
-class V6_vunpackb_enc : Enc_COPROC_VX_2op<0b010010>;
-class V6_vunpackh_enc : Enc_COPROC_VX_2op<0b010011>;
-class V6_vshuffh_enc : Enc_COPROC_VX_2op<0b010111>;
-class V6_vshuffb_enc : Enc_COPROC_VX_2op<0b100000>;
-class V6_vzb_enc : Enc_COPROC_VX_2op<0b100001>;
-class V6_vzh_enc : Enc_COPROC_VX_2op<0b100010>;
-class V6_vsb_enc : Enc_COPROC_VX_2op<0b100011>;
-class V6_vsh_enc : Enc_COPROC_VX_2op<0b100100>;
-class V6_vcl0w_enc : Enc_COPROC_VX_2op<0b100101>;
-class V6_vpopcounth_enc : Enc_COPROC_VX_2op<0b100110>;
-class V6_vcl0h_enc : Enc_COPROC_VX_2op<0b100111>;
-class V6_vnormamtw_enc : Enc_COPROC_VX_2op<0b110100>;
-class V6_vnormamth_enc : Enc_COPROC_VX_2op<0b110101>;
-class V6_vassign_enc : Enc_COPROC_VX_2op<0b111111>;
-
-class Enc_COPROC_VMEM_vL32_b_ai<bits<4> opc> : OpcodeHexagon {
-  bits<5> dst;
-  bits<5> src1;
-  bits<10> src2;
-  bits<4> src2_vector;
-
-  let src2_vector = src2{9-6};
-  let Inst{31-16} = { 0b001010000, opc{3}, 0, src1{4-0} };
-  let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_vL32b_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0000>;
-class V6_vL32b_cur_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0001>;
-class V6_vL32b_tmp_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0010>;
-class V6_vL32Ub_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b0111>;
-class V6_vL32b_nt_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1000>;
-class V6_vL32b_nt_cur_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1001>;
-class V6_vL32b_nt_tmp_ai_enc : Enc_COPROC_VMEM_vL32_b_ai<0b1010>;
-
-class Enc_COPROC_VMEM_vL32_b_ai_128B<bits<4> opc> : OpcodeHexagon {
-  bits<5> dst;
-  bits<5> src1;
-  bits<11> src2;
-  bits<4> src2_vector;
-
-  let src2_vector = src2{10-7};
-  let Inst{31-16} = { 0b001010000, opc{3}, 0, src1{4-0} };
-  let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_vL32b_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0000>;
-class V6_vL32b_cur_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0001>;
-class V6_vL32b_tmp_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0010>;
-class V6_vL32Ub_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b0111>;
-class V6_vL32b_nt_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1000>;
-class V6_vL32b_nt_cur_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1001>;
-class V6_vL32b_nt_tmp_ai_128B_enc : Enc_COPROC_VMEM_vL32_b_ai_128B<0b1010>;
-
-class Enc_COPROC_VMEM_vS32_b_ai_64B<bits<4> opc> : OpcodeHexagon {
-  bits<5> src1;
-  bits<10> src2;
-  bits<4> src2_vector;
-  bits<5> src3;
-
-  let src2_vector = src2{9-6};
-  let Inst{31-16} = { 0b001010000, opc{3}, 1, src1{4-0} };
-  let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, src3{4-0} };
-}
-
-class Enc_COPROC_VMEM_vS32_b_ai_128B<bits<4> opc> : OpcodeHexagon {
-  bits<5> src1;
-  bits<11> src2;
-  bits<4> src2_vector;
-  bits<5> src3;
-
-  let src2_vector = src2{10-7};
-  let Inst{31-16} = { 0b001010000, opc{3}, 1, src1{4-0} };
-  let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, opc{2-0}, src3{4-0} };
-}
-
-class V6_vS32b_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b0000>;
-class V6_vS32Ub_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b0111>;
-class V6_vS32b_nt_ai_enc : Enc_COPROC_VMEM_vS32_b_ai_64B<0b1000>;
-
-class V6_vS32b_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b0000>;
-class V6_vS32Ub_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b0111>;
-class V6_vS32b_nt_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_ai_128B<0b1000>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<bits<1> opc> : OpcodeHexagon {
-  bits<5> src1;
-  bits<10> src2;
-  bits<4> src2_vector;
-  bits<3> src3;
-
-  let src2_vector = src2{9-6};
-  let Inst{31-16} = { 0b001010000, opc{0}, 1, src1{4-0} };
-  let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, 0b00100, src3{2-0} };
-}
-
-class V6_vS32b_new_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<0>;
-class V6_vS32b_nt_new_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_64B<1>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<bits<1> opc> : OpcodeHexagon {
-  bits<5> src1;
-  bits<11> src2;
-  bits<4> src2_vector;
-  bits<3> src3;
-
-  let src2_vector = src2{10-7};
-  let Inst{31-16} = { 0b001010000, opc{0}, 1, src1{4-0} };
-  let Inst{13-0} = { src2_vector{3}, 0b00, src2_vector{2-0}, 0b00100, src3{2-0} };
-}
-
-class V6_vS32b_new_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<0>;
-class V6_vS32b_nt_new_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_ai_128B<1>;
-
-class Enc_COPROC_VMEM_vS32_b_pred_ai<bits<5> opc> : OpcodeHexagon {
-  bits<2> src1;
-  bits<5> src2;
-  bits<10> src3;
-  bits<4> src3_vector;
-  bits<5> src4;
-
-  let src3_vector = src3{9-6};
-  let Inst{31-16} = { 0b001010001, opc{4-3}, src2{4-0} };
-  let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
-}
-
-class Enc_COPROC_VMEM_vS32_b_pred_ai_128B<bits<5> opc> : OpcodeHexagon {
-  bits<2> src1;
-  bits<5> src2;
-  bits<11> src3;
-  bits<4> src3_vector;
-  bits<5> src4;
-
-  let src3_vector = src3{10-7};
-  let Inst{31-16} = { 0b001010001, opc{4-3}, src2{4-0} };
-  let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
-}
-
-class V6_vS32b_qpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b00000>;
-class V6_vS32b_nqpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b00001>;
-class V6_vS32b_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01000>;
-class V6_vS32b_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01001>;
-class V6_vS32Ub_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01110>;
-class V6_vS32Ub_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b01111>;
-class V6_vS32b_nt_qpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b10000>;
-class V6_vS32b_nt_nqpred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b10001>;
-class V6_vS32b_nt_pred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b11000>;
-class V6_vS32b_nt_npred_ai_enc : Enc_COPROC_VMEM_vS32_b_pred_ai<0b11001>;
-
-class V6_vS32b_qpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b00000>;
-class V6_vS32b_nqpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b00001>;
-class V6_vS32b_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01000>;
-class V6_vS32b_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01001>;
-class V6_vS32Ub_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01110>;
-class V6_vS32Ub_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b01111>;
-class V6_vS32b_nt_qpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b10000>;
-class V6_vS32b_nt_nqpred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b10001>;
-class V6_vS32b_nt_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b11000>;
-class V6_vS32b_nt_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_ai_128B<0b11001>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<bits<4> opc> : OpcodeHexagon {
-  bits<2> src1;
-  bits<5> src2;
-  bits<10> src3;
-  bits<4> src3_vector;
-  bits<3> src4;
-
-  let src3_vector = src3{9-6};
-  let Inst{31-16} = { 0b001010001, opc{3}, 1, src2{4-0} };
-  let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
-}
-
-class V6_vS32b_new_pred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b0000>;
-class V6_vS32b_new_npred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b0101>;
-class V6_vS32b_nt_new_pred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b1010>;
-class V6_vS32b_nt_new_npred_ai_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai<0b1111>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<bits<4> opc> : OpcodeHexagon {
-  bits<2> src1;
-  bits<5> src2;
-  bits<11> src3;
-  bits<4> src3_vector;
-  bits<3> src4;
-
-  let src3_vector = src3{10-7};
-  let Inst{31-16} = { 0b001010001, opc{3}, 1, src2{4-0} };
-  let Inst{13-0} = { src3_vector{3}, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
-}
-
-class V6_vS32b_new_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b0000>;
-class V6_vS32b_new_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b0101>;
-class V6_vS32b_nt_new_pred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b1010>;
-class V6_vS32b_nt_new_npred_ai_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ai_128B<0b1111>;
-
-// TODO: Change script to generate dst, src1, src2 instead of
-// dst, dst2, src1.
-class Enc_COPROC_VMEM_vL32_b_pi<bits<4> opc> : OpcodeHexagon {
-  bits<5> dst;
-  bits<5> src1;
-  bits<9> src2;
-  bits<3> src2_vector;
-
-  let src2_vector = src2{8-6};
-  let Inst{31-16} = { 0b001010010, opc{3}, 0, src1{4-0} };
-  let Inst{13-0} = { 0b000, src2_vector{2-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_vL32b_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0000>;
-class V6_vL32b_cur_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0001>;
-class V6_vL32b_tmp_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0010>;
-class V6_vL32Ub_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b0111>;
-class V6_vL32b_nt_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1000>;
-class V6_vL32b_nt_cur_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1001>;
-class V6_vL32b_nt_tmp_pi_enc : Enc_COPROC_VMEM_vL32_b_pi<0b1010>;
-
-class Enc_COPROC_VMEM_vL32_b_pi_128B<bits<4> opc> : OpcodeHexagon {
-  bits<5> dst;
-  bits<5> src1;
-  bits<10> src2;
-  bits<3> src2_vector;
-
-  let src2_vector = src2{9-7};
-  let Inst{31-16} = { 0b001010010, opc{3}, 0, src1{4-0} };
-  let Inst{13-0} = { 0b000, src2_vector{2-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_vL32b_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0000>;
-class V6_vL32b_cur_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0001>;
-class V6_vL32b_tmp_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0010>;
-class V6_vL32Ub_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b0111>;
-class V6_vL32b_nt_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1000>;
-class V6_vL32b_nt_cur_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1001>;
-class V6_vL32b_nt_tmp_pi_128B_enc : Enc_COPROC_VMEM_vL32_b_pi_128B<0b1010>;
-
-
-// TODO: Change script to generate src1, src2 and src3 instead of
-// dst, src1, src2.
-class Enc_COPROC_VMEM_vS32_b_pi<bits<4> opc> : OpcodeHexagon {
-  bits<5> src1;
-  bits<9> src2;
-  bits<3> src2_vector;
-  bits<5> src3;
-
-  let src2_vector = src2{8-6};
-  let Inst{31-16} = { 0b001010010, opc{3}, 1, src1{4-0} };
-  let Inst{10-0} = {src2_vector{2-0}, opc{2-0}, src3{4-0} };
-}
-
-class V6_vS32b_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b0000>;
-class V6_vS32Ub_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b0111>;
-class V6_vS32b_nt_pi_enc : Enc_COPROC_VMEM_vS32_b_pi<0b1000>;
-
-class Enc_COPROC_VMEM_vS32_b_pi_128B<bits<4> opc> : OpcodeHexagon {
-  bits<5> src1;
-  bits<10> src2;
-  bits<3> src2_vector;
-  bits<5> src3;
-
-  let src2_vector = src2{9-7};
-  let Inst{31-16} = { 0b001010010, opc{3}, 1, src1{4-0} };
-  let Inst{10-0} = {src2_vector{2-0}, opc{2-0}, src3{4-0} };
-}
-
-class V6_vS32b_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b0000>;
-class V6_vS32Ub_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b0111>;
-class V6_vS32b_nt_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pi_128B<0b1000>;
-
-// TODO: Change script to generate src1, src2 and src3 instead of
-// dst, src1, src2.
-class Enc_COPROC_VMEM_vS32b_n_ew_pi<bits<1> opc> : OpcodeHexagon {
-  bits<5> src1;
-  bits<9> src2;
-  bits<3> src2_vector;
-  bits<3> src3;
-
-  let src2_vector = src2{8-6};
-  let Inst{31-16} = { 0b001010010, opc{0}, 1, src1{4-0} };
-  let Inst{13-0} = { 0b000, src2_vector{2-0}, 0b00100, src3{2-0} };
-}
-
-class V6_vS32b_new_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi<0>;
-class V6_vS32b_nt_new_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi<1>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<bits<1> opc> : OpcodeHexagon {
-  bits<5> src1;
-  bits<10> src2;
-  bits<3> src2_vector;
-  bits<3> src3;
-
-  let src2_vector = src2{9-7};
-  let Inst{31-16} = { 0b001010010, opc{0}, 1, src1{4-0} };
-  let Inst{13-0} = { 0b000, src2_vector{2-0}, 0b00100, src3{2-0} };
-}
-
-class V6_vS32b_new_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<0>;
-class V6_vS32b_nt_new_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pi_128B<1>;
-
-// TODO: Change script to generate src1, src2,src3 and src4 instead of
-// dst, src1, src2, src3.
-class Enc_COPROC_VMEM_vS32_b_pred_pi<bits<5> opc> : OpcodeHexagon {
-  bits<2> src1;
-  bits<5> src2;
-  bits<9> src3;
-  bits<3> src3_vector;
-  bits<5> src4;
-
-  let src3_vector = src3{8-6};
-  let Inst{31-16} = { 0b001010011, opc{4-3}, src2{4-0} };
-  let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
-}
-
-class V6_vS32b_qpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b00000>;
-class V6_vS32b_nqpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b00001>;
-class V6_vS32b_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01000>;
-class V6_vS32b_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01001>;
-class V6_vS32Ub_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01110>;
-class V6_vS32Ub_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b01111>;
-class V6_vS32b_nt_qpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b10000>;
-class V6_vS32b_nt_nqpred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b10001>;
-class V6_vS32b_nt_pred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b11000>;
-class V6_vS32b_nt_npred_pi_enc : Enc_COPROC_VMEM_vS32_b_pred_pi<0b11001>;
-
-// TODO: Change script to generate src1, src2,src3 and src4 instead of
-// dst, src1, src2, src3.
-class Enc_COPROC_VMEM_vS32_b_pred_pi_128B<bits<5> opc> : OpcodeHexagon {
-  bits<2> src1;
-  bits<5> src2;
-  bits<10> src3;
-  bits<3> src3_vector;
-  bits<5> src4;
-
-  let src3_vector = src3{9-7};
-  let Inst{31-16} = { 0b001010011, opc{4-3}, src2{4-0} };
-  let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, opc{2-0}, src4{4-0} };
-}
-
-class V6_vS32b_qpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b00000>;
-class V6_vS32b_nqpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b00001>;
-class V6_vS32b_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01000>;
-class V6_vS32b_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01001>;
-class V6_vS32Ub_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01110>;
-class V6_vS32Ub_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b01111>;
-class V6_vS32b_nt_qpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b10000>;
-class V6_vS32b_nt_nqpred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b10001>;
-class V6_vS32b_nt_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b11000>;
-class V6_vS32b_nt_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32_b_pred_pi_128B<0b11001>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<bits<4> opc> : OpcodeHexagon {
-  bits<2> src1;
-  bits<5> src2;
-  bits<9> src3;
-  bits<3> src3_vector;
-  bits<3> src4;
-
-  let src3_vector = src3{8-6};
-  let Inst{31-16} = { 0b001010011, opc{3}, 1, src2{4-0} };
-  let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
-}
-
-class V6_vS32b_new_pred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b0000>;
-class V6_vS32b_new_npred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b0101>;
-class V6_vS32b_nt_new_pred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b1010>;
-class V6_vS32b_nt_new_npred_pi_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi<0b1111>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<bits<4> opc> : OpcodeHexagon {
-  bits<2> src1;
-  bits<5> src2;
-  bits<10> src3;
-  bits<3> src3_vector;
-  bits<3> src4;
-
-  let src3_vector = src3{9-7};
-  let Inst{31-16} = { 0b001010011, opc{3}, 1, src2{4-0} };
-  let Inst{13-0} = { 0, src1{1-0}, src3_vector{2-0}, 0b01, opc{2-0}, src4{2-0} };
-}
-
-class V6_vS32b_new_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b0000>;
-class V6_vS32b_new_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b0101>;
-class V6_vS32b_nt_new_pred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b1010>;
-class V6_vS32b_nt_new_npred_pi_128B_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_pi_128B<0b1111>;
-
-class Enc_LD_load_m<bits<13> opc> : OpcodeHexagon {
-  bits<5> dst;
-  bits<5> src1;
-  bits<1> src2;
-
-  let Inst{31-16} = { opc{12}, 0, opc{11-10}, 1, opc{9-4}, src1{4-0} };
-  let Inst{13-0} = { src2{0}, 0b000, opc{3}, 0, opc{2-0}, dst{4-0} };
-}
-
-class V6_vL32b_ppu_enc : Enc_LD_load_m<0b0100110000000>;
-class V6_vL32b_cur_ppu_enc : Enc_LD_load_m<0b0100110000001>;
-class V6_vL32b_tmp_ppu_enc : Enc_LD_load_m<0b0100110000010>;
-class V6_vL32Ub_ppu_enc : Enc_LD_load_m<0b0100110000111>;
-class V6_vL32b_nt_ppu_enc : Enc_LD_load_m<0b0100110100000>;
-class V6_vL32b_nt_cur_ppu_enc : Enc_LD_load_m<0b0100110100001>;
-class V6_vL32b_nt_tmp_ppu_enc : Enc_LD_load_m<0b0100110100010>;
-
-class Enc_COPROC_VMEM_vS32_b_ppu<bits<4> opc> : OpcodeHexagon {
-  bits<5> src1;
-  bits<1> src2;
-  bits<5> src3;
-
-  let Inst{31-16} = { 0b001010110, opc{3}, 1, src1{4-0} };
-  let Inst{13-0} = { src2{0}, 0b00000, opc{2-0}, src3{4-0} };
-}
-
-class V6_vS32b_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b0000>;
-class V6_vS32Ub_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b0111>;
-class V6_vS32b_nt_ppu_enc : Enc_COPROC_VMEM_vS32_b_ppu<0b1000>;
-
-class Enc_COPROC_VMEM_vS32b_new_ppu<bits<1> opc> : OpcodeHexagon {
-  bits<5> src1;
-  bits<1> src2;
-  bits<3> src3;
-
-  let Inst{31-16} = { 0b001010110, opc{0}, 1, src1{4-0} };
-  let Inst{13-0} = { src2{0}, 0b0000000100, src3{2-0} };
-}
-
-class V6_vS32b_new_ppu_enc : Enc_COPROC_VMEM_vS32b_new_ppu<0>;
-class V6_vS32b_nt_new_ppu_enc : Enc_COPROC_VMEM_vS32b_new_ppu<1>;
-
-class Enc_COPROC_VMEM_vS32_b_pred_ppu<bits<5> opc> : OpcodeHexagon {
-  bits<2> src1;
-  bits<5> src2;
-  bits<1> src3;
-  bits<5> src4;
-
-  let Inst{31-16} = { 0b001010111, opc{4-3}, src2{4-0} };
-  let Inst{13-0} = { src3{0}, src1{1-0}, 0b000, opc{2-0}, src4{4-0} };
-}
-
-class V6_vS32b_qpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b00000>;
-class V6_vS32b_nqpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b00001>;
-class V6_vS32b_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01000>;
-class V6_vS32b_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01001>;
-class V6_vS32Ub_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01110>;
-class V6_vS32Ub_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b01111>;
-class V6_vS32b_nt_qpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b10000>;
-class V6_vS32b_nt_nqpred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b10001>;
-class V6_vS32b_nt_pred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b11000>;
-class V6_vS32b_nt_npred_ppu_enc : Enc_COPROC_VMEM_vS32_b_pred_ppu<0b11001>;
-
-class Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<bits<4> opc> : OpcodeHexagon {
-  bits<2> src1;
-  bits<5> src2;
-  bits<1> src3;
-  bits<3> src4;
-
-  let Inst{31-16} = { 0b001010111, opc{3}, 1, src2{4-0} };
-  let Inst{13-0} = { src3{0}, src1{1-0}, 0b00001, opc{2-0}, src4{2-0} };
-}
-
-class V6_vS32b_new_pred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b0000>;
-class V6_vS32b_new_npred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b0101>;
-class V6_vS32b_nt_new_pred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b1010>;
-class V6_vS32b_nt_new_npred_ppu_enc : Enc_COPROC_VMEM_vS32b_n_ew_pred_ppu<0b1111>;
-
-
-class Enc_COPROC_VX_4op_i<bits<5> opc> : OpcodeHexagon {
-  bits<5> dst;
-  bits<5> src1;
-  bits<5> src2;
-  bits<1> src3;
-
-  let Inst{31-16} = { 0b00011001, opc{4-2}, src2{4-0} };
-  let Inst{13-0} = { opc{1}, src1{4-0}, 1, opc{0}, src3{0}, dst{4-0} };
-}
-
-class V6_vrmpybusi_enc : Enc_COPROC_VX_4op_i<0b01000>;
-class V6_vrsadubi_enc : Enc_COPROC_VX_4op_i<0b01001>;
-class V6_vrmpybusi_acc_enc : Enc_COPROC_VX_4op_i<0b01010>;
-class V6_vrsadubi_acc_enc : Enc_COPROC_VX_4op_i<0b01011>;
-class V6_vrmpyubi_acc_enc : Enc_COPROC_VX_4op_i<0b01111>;
-class V6_vrmpyubi_enc : Enc_COPROC_VX_4op_i<0b10101>;
-
-class Enc_COPROC_VX_vandqrt<bits<5> opc> : OpcodeHexagon {
-  bits<5> dst;
-  bits<2> src1;
-  bits<5> src2;
-
-  let Inst{31-16} = { 0b00011001, opc{4-3}, 1, src2{4-0} };
-  let Inst{13-0} = { opc{2}, 0b000, src1{1-0}, opc{1-0}, 1, dst{4-0} };
-}
-
-class V6_vandqrt_acc_enc : Enc_COPROC_VX_vandqrt<0b01101>;
-class V6_vandqrt_enc : Enc_COPROC_VX_vandqrt<0b10010>;
-
-class Enc_COPROC_VX_cards<bits<2> opc> : OpcodeHexagon {
-  bits<5> src1;
-  bits<5> src2;
-  bits<5> src3;
-
-  let Inst{31-16} = { 0b00011001111, src3{4-0} };
-  let Inst{13-0} = { 1, src1{4-0}, 0, opc{1-0}, src2{4-0} };
-}
-
-class V6_vshuff_enc : Enc_COPROC_VX_cards<0b01>;
-class V6_vdeal_enc : Enc_COPROC_VX_cards<0b10>;
-
-
-class Enc_COPROC_VX_v_cmov<bits<1> opc> : OpcodeHexagon {
-  bits<2> src1;
-  bits<5> dst;
-  bits<5> src2;
-
-  let Inst{31-16} = { 0b0001101000, opc{0}, 0b00000 };
-  let Inst{13-0} = { 0, src2{4-0}, 0, src1{1-0}, dst{4-0} };
-}
-
-class V6_vcmov_enc : Enc_COPROC_VX_v_cmov<0>;
-class V6_vncmov_enc : Enc_COPROC_VX_v_cmov<1>;
-
-class Enc_X_p3op<bits<8> opc> : OpcodeHexagon {
-  bits<2> src1;
-  bits<5> dst;
-  bits<5> src2;
-  bits<5> src3;
-
-  let Inst{31-16} = { opc{7-5}, 0b1101, opc{4}, 0, opc{3-2}, src3{4-0} };
-  let Inst{13-0} = { opc{1}, src2{4-0}, opc{0}, src1{1-0}, dst{4-0} };
-}
-
-class V6_vnccombine_enc : Enc_X_p3op<0b00001000>;
-class V6_vccombine_enc : Enc_X_p3op<0b00001100>;
-
-class Enc_COPROC_VX_4op_r<bits<4> opc> : OpcodeHexagon {
-  bits<5> dst;
-  bits<5> src1;
-  bits<5> src2;
-  bits<3> src3;
-
-  let Inst{31-16} = { 0b00011011, src2{4-0}, src3{2-0} };
-  let Inst{13-0} = { opc{3}, src1{4-0}, opc{2-0}, dst{4-0} };
-}
-
-class V6_valignb_enc : Enc_COPROC_VX_4op_r<0b0000>;
-class V6_vlalignb_enc : Enc_COPROC_VX_4op_r<0b0001>;
-class V6_vasrwh_enc : Enc_COPROC_VX_4op_r<0b0010>;
-class V6_vasrwhsat_enc : Enc_COPROC_VX_4op_r<0b0011>;
-class V6_vasrwhrndsat_enc : Enc_COPROC_VX_4op_r<0b0100>;
-class V6_vasrwuhsat_enc : Enc_COPROC_VX_4op_r<0b0101>;
-class V6_vasrhubsat_enc : Enc_COPROC_VX_4op_r<0b0110>;
-class V6_vasrhubrndsat_enc : Enc_COPROC_VX_4op_r<0b0111>;
-class V6_vasrhbrndsat_enc : Enc_COPROC_VX_4op_r<0b1000>;
-class V6_vlutvvb_enc : Enc_COPROC_VX_4op_r<0b1001>;
-class V6_vshuffvdd_enc : Enc_COPROC_VX_4op_r<0b1011>;
-class V6_vdealvdd_enc : Enc_COPROC_VX_4op_r<0b1100>;
-class V6_vlutvvb_oracc_enc : Enc_COPROC_VX_4op_r<0b1101>;
-class V6_vlutvwh_enc : Enc_COPROC_VX_4op_r<0b1110>;
-class V6_vlutvwh_oracc_enc : Enc_COPROC_VX_4op_r<0b1111>;
-
-class Enc_S_3op_valign_i<bits<9> opc> : OpcodeHexagon {
-  bits<5> dst;
-  bits<5> src1;
-  bits<5> src2;
-  bits<3> src3;
-
-  let Inst{31-16} = { opc{8-7}, 0, opc{6-3}, 0b00, opc{2-1}, src2{4-0} };
-  let Inst{13-0} = { opc{0}, src1{4-0}, src3{2-0}, dst{4-0} };
-}
-
-class V6_vlutb_enc : Enc_S_3op_valign_i<0b001100000>;
-class V6_vlutb_dv_enc : Enc_S_3op_valign_i<0b001100010>;
-class V6_vlutb_acc_enc : Enc_S_3op_valign_i<0b001100100>;
-class V6_vlutb_dv_acc_enc : Enc_S_3op_valign_i<0b001100110>;
-class V6_valignbi_enc : Enc_S_3op_valign_i<0b001111011>;
-class V6_vlalignbi_enc : Enc_S_3op_valign_i<0b001111111>;
-class S2_valignib_enc : Enc_S_3op_valign_i<0b110000000>;
-class S2_addasl_rrri_enc : Enc_S_3op_valign_i<0b110010000>;
-
-class Enc_COPROC_VX_3op_q<bits<3> opc> : OpcodeHexagon {
-  bits<2> dst;
-  bits<2> src1;
-  bits<2> src2;
-
-  let Inst{31-16} = { 0b00011110, src2{1-0}, 0b000011 };
-  let Inst{13-0} = { 0b0000, src1{1-0}, 0b000, opc{2-0}, dst{1-0} };
-}
-
-class V6_pred_and_enc : Enc_COPROC_VX_3op_q<0b000>;
-class V6_pred_or_enc : Enc_COPROC_VX_3op_q<0b001>;
-class V6_pred_xor_enc : Enc_COPROC_VX_3op_q<0b011>;
-class V6_pred_or_n_enc : Enc_COPROC_VX_3op_q<0b100>;
-class V6_pred_and_n_enc : Enc_COPROC_VX_3op_q<0b101>;
-
-class V6_pred_not_enc : OpcodeHexagon {
-  bits<2> dst;
-  bits<2> src1;
-
-  let Inst{31-16} = { 0b0001111000000011 };
-  let Inst{13-0} = { 0b0000, src1{1-0}, 0b000010, dst{1-0} };
-}
-
-class Enc_COPROC_VX_4op_q<bits<1> opc> : OpcodeHexagon {
-  bits<5> dst;
-  bits<2> src1;
-  bits<5> src2;
-  bits<5> src3;
-
-  let Inst{31-16} = { 0b000111101, opc{0}, 1, src3{4-0} };
-  let Inst{13-0} = { 1, src2{4-0}, 0, src1{1-0}, dst{4-0} };
-}
-
-class V6_vswap_enc : Enc_COPROC_VX_4op_q<0>;
-class V6_vmux_enc : Enc_COPROC_VX_4op_q<1>;
-
-class Enc_X_2op<bits<16> opc> : OpcodeHexagon {
-  bits<5> dst;
-  bits<5> src1;
-
-  let Inst{31-16} = { opc{15-5}, src1{4-0} };
-  let Inst{13-0} = { opc{4-3}, 0b0000, opc{2-0}, dst{4-0} };
-}
-
-class V6_lvsplatw_enc : Enc_X_2op<0b0001100110100001>;
-class V6_vinsertwr_enc : Enc_X_2op<0b0001100110110001>;
-class S6_vsplatrbp_enc : Enc_X_2op<0b1000010001000100>;
-
-
-class Enc_CR_2op_r<bits<12> opc> : OpcodeHexagon {
-  bits<2> dst;
-  bits<5> src1;
-
-  let Inst{31-16} = { opc{11}, 0, opc{10-7}, 0, opc{6-3}, src1{4-0} };
-  let Inst{13-0} = { opc{2}, 0b000000, opc{1}, 0b000, opc{0}, dst{1-0} };
-}
-
-class V6_pred_scalar2_enc : Enc_CR_2op_r<0b001101101011>;
-class Y5_l2locka_enc : Enc_CR_2op_r<0b110000111100>;
-
-class Enc_S_3op_i6<bits<9> opc> : OpcodeHexagon {
-  bits<5> dst;
-  bits<5> src1;
-  bits<6> src2;
-
-  let Inst{31-16} = { 0b1000, opc{8-6}, 0, opc{5-3}, src1{4-0} };
-  let Inst{13-0} = { src2{5-0}, opc{2-0}, dst{4-0} };
-}
-
-class S6_rol_i_p_enc : Enc_S_3op_i6<0b000000011>;
-class S6_rol_i_p_nac_enc : Enc_S_3op_i6<0b001000011>;
-class S6_rol_i_p_acc_enc : Enc_S_3op_i6<0b001000111>;
-class S6_rol_i_p_and_enc : Enc_S_3op_i6<0b001010011>;
-class S6_rol_i_p_or_enc : Enc_S_3op_i6<0b001010111>;
-class S6_rol_i_p_xacc_enc : Enc_S_3op_i6<0b001100011>;
-
-class Enc_X_3op_r<bits<15> opc> : OpcodeHexagon {
-  bits<5> dst;
-  bits<5> src1;
-  bits<5> src2;
-
-  let Inst{31-16} = { opc{14-4}, src1{4-0} };
-  let Inst{13-0} = { opc{3}, src2{4-0}, opc{2-0}, dst{4-0} };
-}
-
-class S6_rol_i_r_enc : Enc_X_3op_r<0b100011000000011>;
-class S6_rol_i_r_nac_enc : Enc_X_3op_r<0b100011100000011>;
-class S6_rol_i_r_acc_enc : Enc_X_3op_r<0b100011100000111>;
-class S6_rol_i_r_and_enc : Enc_X_3op_r<0b100011100100011>;
-class S6_rol_i_r_or_enc : Enc_X_3op_r<0b100011100100111>;
-class S6_rol_i_r_xacc_enc : Enc_X_3op_r<0b100011101000011>;
-class S6_vtrunehb_ppp_enc : Enc_X_3op_r<0b110000011000011>;
-class S6_vtrunohb_ppp_enc : Enc_X_3op_r<0b110000011000101>;
-
-class Enc_no_operands<bits<25> opc> : OpcodeHexagon {
-
-  let Inst{31-16} = { opc{24-10}, 0 };
-  let Inst{13-0} = { opc{9-7}, 0b000, opc{6-0}, 0 };
-}
-
-class Y5_l2gunlock_enc : Enc_no_operands<0b1010100000100000010000000>;
-class Y5_l2gclean_enc : Enc_no_operands<0b1010100000100000100000000>;
-class Y5_l2gcleaninv_enc : Enc_no_operands<0b1010100000100000110000000>;
-class V6_vhist_enc : Enc_no_operands<0b0001111000000001001000000>;
-
-class Enc_J_jumpr<bits<13> opc> : OpcodeHexagon {
-  bits<5> src1;
-
-  let Inst{31-16} = { opc{12-6}, 0, opc{5-3}, src1{4-0} };
-  let Inst{13-0} = { 0b00, opc{2}, 0b0000, opc{1-0}, 0b00000 };
-}
-
-class Y5_l2unlocka_enc : Enc_J_jumpr<0b1010011011000>;
-class Y2_l2cleaninvidx_enc : Enc_J_jumpr<0b1010100011000>;
-
-class Enc_ST_l2gclean_pa<bits<2> opc> : OpcodeHexagon {
-  bits<5> src1;
-
-  let Inst{31-16} = { 0b101001101, opc{1-0}, 0b00000 };
-  let Inst{13-0} = { 0, src1{4-0}, 0b00000000 };
-}
-
-class Y6_l2gcleanpa_enc : Enc_ST_l2gclean_pa<0b01>;
-class Y6_l2gcleaninvpa_enc : Enc_ST_l2gclean_pa<0b10>;
-
-class A5_ACS_enc : OpcodeHexagon {
-  bits<5> dst1;
-  bits<2> dst2;
-  bits<5> src1;
-  bits<5> src2;
-
-  let Inst{31-16} = { 0b11101010101, src1{4-0} };
-  let Inst{13-0} = { 0, src2{4-0}, 0, dst2{1-0}, dst1{4-0} };
-}
-
-class Enc_X_4op_r<bits<8> opc> : OpcodeHexagon {
-  bits<5> dst;
-  bits<5> src1;
-  bits<5> src2;
-  bits<2> src3;
-
-  let Inst{31-16} = { 0b11, opc{7}, 0, opc{6-5}, 1, opc{4-1}, src1{4-0} };
-  let Inst{13-0} = { 0, src2{4-0}, opc{0}, src3{1-0}, dst{4-0} };
-}
-
-class S2_vsplicerb_enc : Enc_X_4op_r<0b00001000>;
-class S2_cabacencbin_enc : Enc_X_4op_r<0b00001010>;
-class F2_sffma_sc_enc : Enc_X_4op_r<0b11110111>;
-
-class V6_vhistq_enc : OpcodeHexagon {
-  bits<2> src1;
-
-  let Inst{31-16} = { 0b00011110, src1{1-0}, 0b000010 };
-  let Inst{13-0} = { 0b10000010000000 };
-}
-
-// TODO: Change script to generate dst1 instead of dst.
-class A6_vminub_RdP_enc : OpcodeHexagon {
-  bits<5> dst1;
-  bits<2> dst2;
-  bits<5> src1;
-  bits<5> src2;
-
-  let Inst{31-16} = { 0b11101010111, src2{4-0} };
-  let Inst{13-0} = { 0, src1{4-0}, 0, dst2{1-0}, dst1{4-0} };
-}
diff --git a/lib/Target/Hexagon/HexagonInstrFormats.td b/lib/Target/Hexagon/HexagonInstrFormats.td
index fa3cccbd08792e094a6dd24387507798767488d0..39c2a6e4f5a54c94d1568f0c21bfe67d404779b7 100644
--- a/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -7,26 +7,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-//===----------------------------------------------------------------------===//
-//                         Hexagon Instruction Flags +
-//
-//                    *** Must match HexagonBaseInfo.h ***
-//===----------------------------------------------------------------------===//
-
-class IType<bits<5> t> {
-  bits<5> Value = t;
-}
-def TypePSEUDO : IType<0>;
-def TypeALU32  : IType<1>;
-def TypeCR     : IType<2>;
-def TypeJR     : IType<3>;
-def TypeJ      : IType<4>;
-def TypeLD     : IType<5>;
-def TypeST     : IType<6>;
-def TypeSYSTEM : IType<7>;
-def TypeXTYPE  : IType<8>;
-def TypeENDLOOP: IType<31>;
-
 // Maintain list of valid subtargets for each instruction.
 class SubTarget<bits<6> value> {
   bits<6> Value = value;
@@ -54,6 +34,7 @@ class MemAccessSize<bits<4> value> {
   bits<4> Value = value;
 }
 
+// MemAccessSize is represented as 1+log2(N) where N is size in bytes.
 def NoMemAccess      : MemAccessSize<0>;// Not a memory access instruction.
 def ByteAccess       : MemAccessSize<1>;// Byte access instruction (memb).
 def HalfWordAccess   : MemAccessSize<2>;// Half word access instruction (memh).
@@ -70,10 +51,9 @@ def Vector128Access  : MemAccessSize<8>;// Vector access instruction (memv)
 class OpcodeHexagon {
   field bits<32> Inst = ?; // Default to an invalid insn.
   bits<4> IClass = 0; // ICLASS
+  bits<1> zero = 0;
 
   let Inst{31-28} = IClass;
-
-  bits<1> zero = 0;
 }
 
 class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
@@ -99,85 +79,88 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
 
   // Instruction type according to the ISA.
   IType Type = type;
-  let TSFlags{4-0} = Type.Value;
+  let TSFlags{5-0} = Type.Value;
 
   // Solo instructions, i.e., those that cannot be in a packet with others.
   bits<1> isSolo = 0;
-  let TSFlags{5} = isSolo;
+  let TSFlags{6} = isSolo;
   // Packed only with A or X-type instructions.
   bits<1> isSoloAX = 0;
-  let TSFlags{6} = isSoloAX;
+  let TSFlags{7} = isSoloAX;
   // Only A-type instruction in first slot or nothing.
   bits<1> isSoloAin1 = 0;
-  let TSFlags{7} = isSoloAin1;
+  let TSFlags{8} = isSoloAin1;
 
   // Predicated instructions.
   bits<1> isPredicated = 0;
-  let TSFlags{8} = isPredicated;
+  let TSFlags{9} = isPredicated;
   bits<1> isPredicatedFalse = 0;
-  let TSFlags{9} = isPredicatedFalse;
+  let TSFlags{10} = isPredicatedFalse;
   bits<1> isPredicatedNew = 0;
-  let TSFlags{10} = isPredicatedNew;
+  let TSFlags{11} = isPredicatedNew;
   bits<1> isPredicateLate = 0;
-  let TSFlags{11} = isPredicateLate; // Late predicate producer insn.
+  let TSFlags{12} = isPredicateLate; // Late predicate producer insn.
 
   // New-value insn helper fields.
   bits<1> isNewValue = 0;
-  let TSFlags{12} = isNewValue; // New-value consumer insn.
+  let TSFlags{13} = isNewValue; // New-value consumer insn.
   bits<1> hasNewValue = 0;
-  let TSFlags{13} = hasNewValue; // New-value producer insn.
+  let TSFlags{14} = hasNewValue; // New-value producer insn.
   bits<3> opNewValue = 0;
-  let TSFlags{16-14} = opNewValue; // New-value produced operand.
+  let TSFlags{17-15} = opNewValue; // New-value produced operand.
   bits<1> isNVStorable = 0;
-  let TSFlags{17} = isNVStorable; // Store that can become new-value store.
+  let TSFlags{18} = isNVStorable; // Store that can become new-value store.
   bits<1> isNVStore = 0;
-  let TSFlags{18} = isNVStore; // New-value store insn.
+  let TSFlags{19} = isNVStore; // New-value store insn.
   bits<1> isCVLoadable = 0;
-  let TSFlags{19} = isCVLoadable; // Load that can become cur-value load.
+  let TSFlags{20} = isCVLoadable; // Load that can become cur-value load.
   bits<1> isCVLoad = 0;
-  let TSFlags{20} = isCVLoad; // Cur-value load insn.
+  let TSFlags{21} = isCVLoad; // Cur-value load insn.
 
   // Immediate extender helper fields.
   bits<1> isExtendable = 0;
-  let TSFlags{21} = isExtendable; // Insn may be extended.
+  let TSFlags{22} = isExtendable; // Insn may be extended.
   bits<1> isExtended = 0;
-  let TSFlags{22} = isExtended; // Insn must be extended.
+  let TSFlags{23} = isExtended; // Insn must be extended.
   bits<3> opExtendable = 0;
-  let TSFlags{25-23} = opExtendable; // Which operand may be extended.
+  let TSFlags{26-24} = opExtendable; // Which operand may be extended.
   bits<1> isExtentSigned = 0;
-  let TSFlags{26} = isExtentSigned; // Signed or unsigned range.
+  let TSFlags{27} = isExtentSigned; // Signed or unsigned range.
   bits<5> opExtentBits = 0;
-  let TSFlags{31-27} = opExtentBits; //Number of bits of range before extending.
+  let TSFlags{32-28} = opExtentBits; //Number of bits of range before extending.
   bits<2> opExtentAlign = 0;
-  let TSFlags{33-32} = opExtentAlign; // Alignment exponent before extending.
+  let TSFlags{34-33} = opExtentAlign; // Alignment exponent before extending.
 
   // If an instruction is valid on a subtarget, set the corresponding
   // bit from validSubTargets.
   // By default, instruction is valid on all subtargets.
   SubTarget validSubTargets = HasAnySubT;
-  let TSFlags{39-34} = validSubTargets.Value;
+  let TSFlags{40-35} = validSubTargets.Value;
 
   // Addressing mode for load/store instructions.
   AddrModeType addrMode = NoAddrMode;
-  let TSFlags{42-40} = addrMode.Value;
+  let TSFlags{43-41} = addrMode.Value;
 
   // Memory access size for mem access instructions (load/store)
   MemAccessSize accessSize = NoMemAccess;
-  let TSFlags{46-43} = accessSize.Value;
+  let TSFlags{47-44} = accessSize.Value;
 
   bits<1> isTaken = 0;
-  let TSFlags {47} = isTaken; // Branch prediction.
+  let TSFlags {48} = isTaken; // Branch prediction.
 
   bits<1> isFP = 0;
-  let TSFlags {48} = isFP; // Floating-point.
+  let TSFlags {49} = isFP; // Floating-point.
 
   bits<1> hasNewValue2 = 0;
-  let TSFlags{50} = hasNewValue2; // Second New-value producer insn.
+  let TSFlags{51} = hasNewValue2; // Second New-value producer insn.
   bits<3> opNewValue2 = 0;
-  let TSFlags{53-51} = opNewValue2; // Second New-value produced operand.
+  let TSFlags{54-52} = opNewValue2; // Second New-value produced operand.
 
   bits<1> isAccumulator = 0;
-  let TSFlags{54} = isAccumulator;
+  let TSFlags{55} = isAccumulator;
+
+  bits<1> prefersSlot3 = 0;
+  let TSFlags{56} = prefersSlot3; // Complex XU
 
   bit cofMax1 = 0;
   let TSFlags{60} = cofMax1;
@@ -200,9 +183,13 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
   let NValueST = !if(isNVStore, "true", "false");
   let isNT = !if(isNonTemporal, "true", "false");
 
+  let hasSideEffects = 0;
   // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
 }
 
+class HInst<dag outs, dag ins, string asmstr, InstrItinClass itin, IType type> :
+      InstHexagon<outs, ins, asmstr, [], "", itin, type>;
+
 //===----------------------------------------------------------------------===//
 //                         Instruction Classes Definitions +
 //===----------------------------------------------------------------------===//
@@ -214,14 +201,13 @@ class LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
              string cstr = "", InstrItinClass itin = LD_tc_ld_SLOT01>
   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>, OpcodeHexagon;
 
-let mayLoad = 1 in
-class LDInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-              string cstr = "">
-  : LDInst<outs, ins, asmstr, pattern, cstr>;
+class PseudoLDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "", InstrItinClass itin = LD_tc_ld_SLOT01>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeLD>, OpcodeHexagon;
 
 class CONSTLDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
                   string cstr = "">
-  : LDInst<outs, ins, asmstr, pattern, cstr>;
+  : PseudoLDInst<outs, ins, asmstr, pattern, cstr>;
 
 // LD Instruction Class in V2/V3/V4.
 // Definition of the instruction class NOT CHANGED.
@@ -247,6 +233,11 @@ class STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
              string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01>
   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>, OpcodeHexagon;
 
+let mayStore = 1 in
+class STInst_NoOpcode<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+             string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeST>;
+
 class STInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
               string cstr = "">
   : STInst<outs, ins, asmstr, pattern, cstr>;
@@ -269,28 +260,24 @@ class STInstPost<dag outs, dag ins, string asmstr, list<dag> pattern = [],
                  string cstr = "", InstrItinClass itin = ST_tc_st_SLOT01>
   : STInst<outs, ins, asmstr, pattern, cstr, itin>;
 
-// SYSTEM Instruction Class in V4 can take SLOT0 only
-// In V2/V3 we used ST for this but in v4 ST can take SLOT0 or SLOT1.
-class SYSInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-              string cstr = "",  InstrItinClass itin = ST_tc_3stall_SLOT0>
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeSYSTEM>,
-    OpcodeHexagon;
-
-// ALU32 Instruction Class in V2/V3/V4.
-// Definition of the instruction class NOT CHANGED.
-class ALU32Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-                string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
- : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeALU32>, OpcodeHexagon;
-
 // ALU64 Instruction Class in V2/V3.
 // XTYPE Instruction Class in V4.
 // Definition of the instruction class NOT CHANGED.
 // Name of the Instruction Class changed from ALU64 to XTYPE from V2/V3 to V4.
 class ALU64Inst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
                 string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23>
-   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>,
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeALU64>,
      OpcodeHexagon;
 
+// ALU64 Instruction Class in V2/V3.
+// XTYPE Instruction Class in V4.
+// Same as ALU64Inst above but doesn't derive from OpcodeHexagon.
+// Name of the Instruction Class changed from ALU64 to XTYPE from V2/V3 to V4.
+class ALU64Inst_NoOpcode<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23>
+   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeALU64>;
+
+
 class ALU64_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
                 string cstr = "", InstrItinClass itin = ALU64_tc_2_SLOT23>
   : ALU64Inst<outs, ins, asmstr, pattern, cstr, itin>;
@@ -302,13 +289,13 @@ class ALU64_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
 // Name of the Instruction Class changed from M to XTYPE from V2/V3 to V4.
 class MInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
             string cstr = "", InstrItinClass itin = M_tc_3x_SLOT23>
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>,
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeM>,
     OpcodeHexagon;
 
 // Same as above but doesn't derive from OpcodeHexagon
 class MInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
             string cstr = "", InstrItinClass itin = M_tc_3x_SLOT23>
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>;
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeM>;
 
 // M Instruction Class in V2/V3.
 // XTYPE Instruction Class in V4.
@@ -324,12 +311,16 @@ class MInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
 // Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4.
 class SInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
             string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23>
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>,
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeS_2op>,
     OpcodeHexagon;
 
+class SInst_NoOpcode<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+            string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeS_2op>;
+
 class SInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
             string cstr = "", InstrItinClass itin = S_2op_tc_1_SLOT23>
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeXTYPE>;
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeS_2op>;
 
 // S Instruction Class in V2/V3.
 // XTYPE Instruction Class in V4.
@@ -337,7 +328,9 @@ class SInst2<dag outs, dag ins, string asmstr, list<dag> pattern = [],
 // Name of the Instruction Class changed from S to XTYPE from V2/V3 to V4.
 class SInst_acc<dag outs, dag ins, string asmstr, list<dag> pattern = [],
                 string cstr = "", InstrItinClass itin = S_3op_tc_1_SLOT23>
-  : SInst<outs, ins, asmstr, pattern, cstr, itin>;
+  : SInst<outs, ins, asmstr, pattern, cstr, itin> {
+  let Type = TypeS_3op;
+}
 
 // J Instruction Class in V2/V3/V4.
 // Definition of the instruction class NOT CHANGED.
@@ -349,12 +342,6 @@ class JInst_CJUMP_UCJUMP<dag outs, dag ins, string asmstr, list<dag> pattern = [
             string cstr = "", InstrItinClass itin = J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT>
   : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJ>, OpcodeHexagon;
 
-// JR Instruction Class in V2/V3/V4.
-// Definition of the instruction class NOT CHANGED.
-class JRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-             string cstr = "", InstrItinClass itin = J_tc_2early_SLOT2>
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeJR>, OpcodeHexagon;
-
 // CR Instruction Class in V2/V3/V4.
 // Definition of the instruction class NOT CHANGED.
 class CRInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
@@ -383,26 +370,6 @@ class PseudoM<dag outs, dag ins, string asmstr, list<dag> pattern = [],
 //                         Instruction Classes Definitions -
 //===----------------------------------------------------------------------===//
 
-
-//
-// ALU32 patterns
-//.
-class ALU32_rr<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-               string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
-   : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
-
-class ALU32_ir<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-               string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
-   : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
-
-class ALU32_ri<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-               string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
-   : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
-
-class ALU32_ii<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-               string cstr = "", InstrItinClass itin = ALU32_2op_tc_1_SLOT0123>
-   : ALU32Inst<outs, ins, asmstr, pattern, cstr, itin>;
-
 //
 // ALU64 patterns.
 //
diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV4.td b/lib/Target/Hexagon/HexagonInstrFormatsV4.td
index 493d04703da91516448eb34783ab881b32327441..1fdf930c62fdb4502ec01cdaaf2d32abdf248823 100644
--- a/lib/Target/Hexagon/HexagonInstrFormatsV4.td
+++ b/lib/Target/Hexagon/HexagonInstrFormatsV4.td
@@ -11,18 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-//----------------------------------------------------------------------------//
-//                         Hexagon Instruction Flags
-//
-//                        *** Must match BaseInfo.h ***
-//----------------------------------------------------------------------------//
-
-def TypeV4LDST    : IType<9>;
-def TypeNV       : IType<10>;
-def TypeDUPLEX   : IType<11>;
-def TypeCOMPOUND : IType<12>;
-def TypePREFIX   : IType<30>;
-
 //                      Duplex Instruction Class Declaration
 //===----------------------------------------------------------------------===//
 
@@ -61,7 +49,7 @@ class InstDuplex<bits<4> iClass, list<dag> pattern = [],
 
   // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
 
-  let TSFlags{4-0} = Type.Value;
+  let TSFlags{5-0} = Type.Value;
 
   // Predicated instructions.
   bits<1> isPredicated = 0;
@@ -107,7 +95,7 @@ class InstDuplex<bits<4> iClass, list<dag> pattern = [],
 //
 class NVInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
              string cstr = "", InstrItinClass itin = NCJ_tc_3or4stall_SLOT0>
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeNV>, OpcodeHexagon;
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, TypeNCJ>, OpcodeHexagon;
 
 class NVInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
                 string cstr = "", InstrItinClass itin = NCJ_tc_3or4stall_SLOT0>
@@ -141,7 +129,7 @@ class MEMInst_V4<dag outs, dag ins, string asmstr, list<dag> pattern = [],
 
 class EXTENDERInst<dag outs, dag ins, string asmstr, list<dag> pattern = []>
   : InstHexagon<outs, ins, asmstr, pattern, "", EXTENDER_tc_1_SLOT0123,
-                TypePREFIX>, OpcodeHexagon;
+                TypeEXTENDER>, OpcodeHexagon;
 
 class SUBInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
               string cstr = "">
@@ -150,11 +138,11 @@ class SUBInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
 
 class CJInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
               string cstr = "">
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>,
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND_CJ_ARCHDEPSLOT, TypeCJ>,
     OpcodeHexagon;
 
 class CJInst_JMPSET<dag outs, dag ins, string asmstr, list<dag> pattern = [],
               string cstr = "">
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND, TypeCOMPOUND>,
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, COMPOUND, TypeCJ>,
     OpcodeHexagon;
 
diff --git a/lib/Target/Hexagon/HexagonInstrFormatsV60.td b/lib/Target/Hexagon/HexagonInstrFormatsV60.td
index b9f4373a0b79acb5f895badbcfa652cdbb89ad59..c8a7faea5ed5130b3888666ec2b93725ac08c455 100644
--- a/lib/Target/Hexagon/HexagonInstrFormatsV60.td
+++ b/lib/Target/Hexagon/HexagonInstrFormatsV60.td
@@ -11,28 +11,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-//----------------------------------------------------------------------------//
-//                         Hexagon Instruction Flags +
-//
-//                        *** Must match BaseInfo.h ***
-//----------------------------------------------------------------------------//
-
-def TypeCVI_VA         : IType<13>;
-def TypeCVI_VA_DV      : IType<14>;
-def TypeCVI_VX         : IType<15>;
-def TypeCVI_VX_DV      : IType<16>;
-def TypeCVI_VP         : IType<17>;
-def TypeCVI_VP_VS      : IType<18>;
-def TypeCVI_VS         : IType<19>;
-def TypeCVI_VINLANESAT : IType<20>;
-def TypeCVI_VM_LD      : IType<21>;
-def TypeCVI_VM_TMP_LD  : IType<22>;
-def TypeCVI_VM_CUR_LD  : IType<23>;
-def TypeCVI_VM_VP_LDU  : IType<24>;
-def TypeCVI_VM_ST      : IType<25>;
-def TypeCVI_VM_NEW_ST  : IType<26>;
-def TypeCVI_VM_STU     : IType<27>;
-def TypeCVI_HIST       : IType<28>;
 //----------------------------------------------------------------------------//
 //                         Instruction Classes Definitions +
 //----------------------------------------------------------------------------//
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.cpp b/lib/Target/Hexagon/HexagonInstrInfo.cpp
index c2a8b569cded02e691040e7984f6cda745513120..b265a883da5c47589dd4012736c07ef30753a8f4 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -152,10 +152,11 @@ static unsigned nonDbgMICount(MachineBasicBlock::const_instr_iterator MIB,
 /// On Hexagon, we have two instructions used to set-up the hardware loop
 /// (LOOP0, LOOP1) with corresponding endloop (ENDLOOP0, ENDLOOP1) instructions
 /// to indicate the end of a loop.
-static MachineInstr *findLoopInstr(MachineBasicBlock *BB, int EndLoopOp,
+static MachineInstr *findLoopInstr(MachineBasicBlock *BB, unsigned EndLoopOp,
+      MachineBasicBlock *TargetBB,
       SmallPtrSet<MachineBasicBlock *, 8> &Visited) {
-  int LOOPi;
-  int LOOPr;
+  unsigned LOOPi;
+  unsigned LOOPr;
   if (EndLoopOp == Hexagon::ENDLOOP0) {
     LOOPi = Hexagon::J2_loop0i;
     LOOPr = Hexagon::J2_loop0r;
@@ -165,26 +166,24 @@ static MachineInstr *findLoopInstr(MachineBasicBlock *BB, int EndLoopOp,
   }
 
   // The loop set-up instruction will be in a predecessor block
-  for (MachineBasicBlock::pred_iterator PB = BB->pred_begin(),
-         PE = BB->pred_end(); PB != PE; ++PB) {
+  for (MachineBasicBlock *PB : BB->predecessors()) {
     // If this has been visited, already skip it.
-    if (!Visited.insert(*PB).second)
+    if (!Visited.insert(PB).second)
       continue;
-    if (*PB == BB)
+    if (PB == BB)
       continue;
-    for (MachineBasicBlock::reverse_instr_iterator I = (*PB)->instr_rbegin(),
-           E = (*PB)->instr_rend(); I != E; ++I) {
-      int Opc = I->getOpcode();
+    for (auto I = PB->instr_rbegin(), E = PB->instr_rend(); I != E; ++I) {
+      unsigned Opc = I->getOpcode();
       if (Opc == LOOPi || Opc == LOOPr)
         return &*I;
-      // We've reached a different loop, which means the loop0 has been removed.
-      if (Opc == EndLoopOp)
+      // We've reached a different loop, which means the loop0/loop1 has been
+      // removed.
+      if (Opc == EndLoopOp && I->getOperand(0).getMBB() != TargetBB)
         return nullptr;
     }
     // Check the predecessors for the LOOP instruction.
-    MachineInstr *loop = findLoopInstr(*PB, EndLoopOp, Visited);
-    if (loop)
-      return loop;
+    if (MachineInstr *Loop = findLoopInstr(PB, EndLoopOp, TargetBB, Visited))
+      return Loop;
   }
   return nullptr;
 }
@@ -597,7 +596,8 @@ unsigned HexagonInstrInfo::insertBranch(MachineBasicBlock &MBB,
       // Since we're adding an ENDLOOP, there better be a LOOP instruction.
       // Check for it, and change the BB target if needed.
       SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
-      MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs);
+      MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, Cond[1].getMBB(),
+                                         VisitedBBs);
       assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP");
       Loop->getOperand(0).setMBB(TBB);
       // Add the ENDLOOP after the finding the LOOP0.
@@ -637,7 +637,8 @@ unsigned HexagonInstrInfo::insertBranch(MachineBasicBlock &MBB,
     // Since we're adding an ENDLOOP, there better be a LOOP instruction.
     // Check for it, and change the BB target if needed.
     SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
-    MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, VisitedBBs);
+    MachineInstr *Loop = findLoopInstr(TBB, EndLoopOp, Cond[1].getMBB(),
+                                       VisitedBBs);
     assert(Loop != 0 && "Inserting an ENDLOOP without a LOOP");
     Loop->getOperand(0).setMBB(TBB);
     // Add the ENDLOOP after the finding the LOOP0.
@@ -687,7 +688,8 @@ unsigned HexagonInstrInfo::reduceLoopCount(MachineBasicBlock &MBB,
   MachineFunction *MF = MBB.getParent();
   DebugLoc DL = Cmp.getDebugLoc();
   SmallPtrSet<MachineBasicBlock *, 8> VisitedBBs;
-  MachineInstr *Loop = findLoopInstr(&MBB, Cmp.getOpcode(), VisitedBBs);
+  MachineInstr *Loop = findLoopInstr(&MBB, Cmp.getOpcode(),
+                                     Cmp.getOperand(0).getMBB(), VisitedBBs);
   if (!Loop)
     return 0;
   // If the loop trip count is a compile-time value, then just change the
@@ -1411,18 +1413,28 @@ bool HexagonInstrInfo::DefinesPredicate(
   auto &HRI = getRegisterInfo();
   for (unsigned oper = 0; oper < MI.getNumOperands(); ++oper) {
     MachineOperand MO = MI.getOperand(oper);
-    if (MO.isReg() && MO.isDef()) {
+    if (MO.isReg()) {
+      if (!MO.isDef())
+        continue;
       const TargetRegisterClass* RC = HRI.getMinimalPhysRegClass(MO.getReg());
       if (RC == &Hexagon::PredRegsRegClass) {
         Pred.push_back(MO);
         return true;
       }
+      continue;
+    } else if (MO.isRegMask()) {
+      for (unsigned PR : Hexagon::PredRegsRegClass) {
+        if (!MI.modifiesRegister(PR, &HRI))
+          continue;
+        Pred.push_back(MO);
+        return true;
+      }
     }
   }
   return false;
 }
 
-bool HexagonInstrInfo::isPredicable(MachineInstr &MI) const {
+bool HexagonInstrInfo::isPredicable(const MachineInstr &MI) const {
   return MI.getDesc().isPredicable();
 }
 
@@ -1713,7 +1725,7 @@ bool HexagonInstrInfo::isComplex(const MachineInstr &MI) const {
 
 // Return true if the instruction is a compund branch instruction.
 bool HexagonInstrInfo::isCompoundBranchInstr(const MachineInstr &MI) const {
-  return (getType(MI) == HexagonII::TypeCOMPOUND && MI.isBranch());
+  return getType(MI) == HexagonII::TypeCJ && MI.isBranch();
 }
 
 bool HexagonInstrInfo::isCondInst(const MachineInstr &MI) const {
@@ -3007,10 +3019,12 @@ bool HexagonInstrInfo::producesStall(const MachineInstr &MI,
 
 bool HexagonInstrInfo::predCanBeUsedAsDotNew(const MachineInstr &MI,
       unsigned PredReg) const {
-  for (unsigned opNum = 0; opNum < MI.getNumOperands(); opNum++) {
-    const MachineOperand &MO = MI.getOperand(opNum);
+  for (const MachineOperand &MO : MI.operands()) {
+    // Predicate register must be explicitly defined.
+    if (MO.isRegMask() && MO.clobbersPhysReg(PredReg))
+      return false;
     if (MO.isReg() && MO.isDef() && MO.isImplicit() && (MO.getReg() == PredReg))
-      return false; // Predicate register must be explicitly defined.
+      return false;
   }
 
   // Hexagon Programmer's Reference says that decbin, memw_locked, and
@@ -3413,7 +3427,9 @@ int HexagonInstrInfo::getDotNewOp(const MachineInstr &MI) const {
     return NVOpcode;
 
   switch (MI.getOpcode()) {
-  default: llvm_unreachable("Unknown .new type");
+  default:
+    llvm::report_fatal_error(std::string("Unknown .new type: ") +
+      std::to_string(MI.getOpcode()).c_str());
   case Hexagon::S4_storerb_ur:
     return Hexagon::S4_storerbnew_ur;
 
@@ -3454,20 +3470,75 @@ int HexagonInstrInfo::getDotNewOp(const MachineInstr &MI) const {
 int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI,
       const MachineBranchProbabilityInfo *MBPI) const {
   // We assume that block can have at most two successors.
-  bool taken = false;
   const MachineBasicBlock *Src = MI.getParent();
   const MachineOperand &BrTarget = MI.getOperand(1);
-  const MachineBasicBlock *Dst = BrTarget.getMBB();
+  bool Taken = false;
+  const BranchProbability OneHalf(1, 2);
+
+  if (BrTarget.isMBB()) {
+    const MachineBasicBlock *Dst = BrTarget.getMBB();
+    Taken = MBPI->getEdgeProbability(Src, Dst) >= OneHalf;
+  } else {
+    // The branch target is not a basic block (most likely a function).
+    // Since BPI only gives probabilities for targets that are basic blocks,
+    // try to identify another target of this branch (potentially a
+    // fall-through) and check the probability of that target.
+    //
+    // The only handled branch combinations are:
+    // - one conditional branch,
+    // - one conditional branch followed by one unconditional branch.
+    // Otherwise, assume not-taken.
+    assert(MI.isConditionalBranch());
+    const MachineBasicBlock &B = *MI.getParent();
+    bool SawCond = false, Bad = false;
+    for (const MachineInstr &I : B) {
+      if (!I.isBranch())
+        continue;
+      if (I.isConditionalBranch()) {
+        SawCond = true;
+        if (&I != &MI) {
+          Bad = true;
+          break;
+        }
+      }
+      if (I.isUnconditionalBranch() && !SawCond) {
+        Bad = true;
+        break;
+      }
+    }
+    if (!Bad) {
+      MachineBasicBlock::const_instr_iterator It(MI);
+      MachineBasicBlock::const_instr_iterator NextIt = std::next(It);
+      if (NextIt == B.instr_end()) {
+        // If this branch is the last, look for the fall-through block.
+        for (const MachineBasicBlock *SB : B.successors()) {
+          if (!B.isLayoutSuccessor(SB))
+            continue;
+          Taken = MBPI->getEdgeProbability(Src, SB) < OneHalf;
+          break;
+        }
+      } else {
+        assert(NextIt->isUnconditionalBranch());
+        // Find the first MBB operand and assume it's the target.
+        const MachineBasicBlock *BT = nullptr;
+        for (const MachineOperand &Op : NextIt->operands()) {
+          if (!Op.isMBB())
+            continue;
+          BT = Op.getMBB();
+          break;
+        }
+        Taken = BT && MBPI->getEdgeProbability(Src, BT) < OneHalf;
+      }
+    } // if (!Bad)
+  }
 
-  const BranchProbability Prediction = MBPI->getEdgeProbability(Src, Dst);
-  if (Prediction >= BranchProbability(1,2))
-    taken = true;
+  // The Taken flag should be set to something reasonable by this point.
 
   switch (MI.getOpcode()) {
   case Hexagon::J2_jumpt:
-    return taken ? Hexagon::J2_jumptnewpt : Hexagon::J2_jumptnew;
+    return Taken ? Hexagon::J2_jumptnewpt : Hexagon::J2_jumptnew;
   case Hexagon::J2_jumpf:
-    return taken ? Hexagon::J2_jumpfnewpt : Hexagon::J2_jumpfnew;
+    return Taken ? Hexagon::J2_jumpfnewpt : Hexagon::J2_jumpfnew;
 
   default:
     llvm_unreachable("Unexpected jump instruction.");
@@ -3477,26 +3548,46 @@ int HexagonInstrInfo::getDotNewPredJumpOp(const MachineInstr &MI,
 // Return .new predicate version for an instruction.
 int HexagonInstrInfo::getDotNewPredOp(const MachineInstr &MI,
       const MachineBranchProbabilityInfo *MBPI) const {
-  int NewOpcode = Hexagon::getPredNewOpcode(MI.getOpcode());
-  if (NewOpcode >= 0) // Valid predicate new instruction
-    return NewOpcode;
-
   switch (MI.getOpcode()) {
   // Condtional Jumps
   case Hexagon::J2_jumpt:
   case Hexagon::J2_jumpf:
     return getDotNewPredJumpOp(MI, MBPI);
-
-  default:
-    assert(0 && "Unknown .new type");
   }
-  return 0;
+
+  int NewOpcode = Hexagon::getPredNewOpcode(MI.getOpcode());
+  if (NewOpcode >= 0)
+    return NewOpcode;
+
+  dbgs() << "Cannot convert to .new: " << getName(MI.getOpcode()) << '\n';
+  llvm_unreachable(nullptr);
 }
 
-int HexagonInstrInfo::getDotOldOp(const int opc) const {
-  int NewOp = opc;
+int HexagonInstrInfo::getDotOldOp(const MachineInstr &MI) const {
+  int NewOp = MI.getOpcode();
   if (isPredicated(NewOp) && isPredicatedNew(NewOp)) { // Get predicate old form
     NewOp = Hexagon::getPredOldOpcode(NewOp);
+    const MachineFunction &MF = *MI.getParent()->getParent();
+    const HexagonSubtarget &HST = MF.getSubtarget<HexagonSubtarget>();
+    // All Hexagon architectures have prediction bits on dot-new branches,
+    // but only Hexagon V60+ has prediction bits on dot-old ones. Make sure
+    // to pick the right opcode when converting back to dot-old.
+    if (!HST.getFeatureBits()[Hexagon::ArchV60]) {
+      switch (NewOp) {
+      case Hexagon::J2_jumptpt:
+        NewOp = Hexagon::J2_jumpt;
+        break;
+      case Hexagon::J2_jumpfpt:
+        NewOp = Hexagon::J2_jumpf;
+        break;
+      case Hexagon::J2_jumprtpt:
+        NewOp = Hexagon::J2_jumprt;
+        break;
+      case Hexagon::J2_jumprfpt:
+        NewOp = Hexagon::J2_jumprf;
+        break;
+      }
+    }
     assert(NewOp >= 0 &&
            "Couldn't change predicate new instruction to its old form.");
   }
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.h b/lib/Target/Hexagon/HexagonInstrInfo.h
index 2358d4b7e4c0ee1747b7e3394f6fdf42f6fef1d0..b268c7a28171e2fedd4d314b89af8e93360d2bd4 100644
--- a/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -235,7 +235,7 @@ public:
   /// Return true if the specified instruction can be predicated.
   /// By default, this returns true for every instruction with a
   /// PredicateOperand.
-  bool isPredicable(MachineInstr &MI) const override;
+  bool isPredicable(const MachineInstr &MI) const override;
 
   /// Test if the given instruction should be considered a scheduling boundary.
   /// This primarily includes labels and terminators.
@@ -404,7 +404,7 @@ public:
                           const MachineBranchProbabilityInfo *MBPI) const;
   int getDotNewPredOp(const MachineInstr &MI,
                       const MachineBranchProbabilityInfo *MBPI) const;
-  int getDotOldOp(const int opc) const;
+  int getDotOldOp(const MachineInstr &MI) const;
   HexagonII::SubInstructionGroup getDuplexCandidateGroup(const MachineInstr &MI)
                                                          const;
   short getEquivalentHWInstr(const MachineInstr &MI) const;
diff --git a/lib/Target/Hexagon/HexagonInstrInfo.td b/lib/Target/Hexagon/HexagonInstrInfo.td
deleted file mode 100644
index c5719ad5b6d839790537de0d400b99dfbd65f751..0000000000000000000000000000000000000000
--- a/lib/Target/Hexagon/HexagonInstrInfo.td
+++ /dev/null
@@ -1,4799 +0,0 @@
-//==- HexagonInstrInfo.td - Target Description for Hexagon -*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-include "HexagonInstrFormats.td"
-include "HexagonOperands.td"
-include "HexagonInstrEnc.td"
-
-//===----------------------------------------------------------------------===//
-// Compare
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, isCompare = 1, InputType = "imm", isExtendable = 1,
-    opExtendable = 2 in
-class T_CMP <string mnemonic, bits<2> MajOp, bit isNot, Operand ImmOp>
-  : ALU32Inst <(outs PredRegs:$dst),
-               (ins IntRegs:$src1, ImmOp:$src2),
-  "$dst = "#!if(isNot, "!","")#mnemonic#"($src1, #$src2)",
-  [], "",ALU32_2op_tc_2early_SLOT0123 >, ImmRegRel {
-    bits<2> dst;
-    bits<5> src1;
-    bits<10> src2;
-    let CextOpcode = mnemonic;
-    let opExtentBits  = !if(!eq(mnemonic, "cmp.gtu"), 9, 10);
-    let isExtentSigned = !if(!eq(mnemonic, "cmp.gtu"), 0, 1);
-
-    let IClass = 0b0111;
-
-    let Inst{27-24} = 0b0101;
-    let Inst{23-22} = MajOp;
-    let Inst{21}    = !if(!eq(mnemonic, "cmp.gtu"), 0, src2{9});
-    let Inst{20-16} = src1;
-    let Inst{13-5}  = src2{8-0};
-    let Inst{4}     = isNot;
-    let Inst{3-2}   = 0b00;
-    let Inst{1-0}   = dst;
-  }
-
-def C2_cmpeqi   : T_CMP <"cmp.eq",  0b00, 0, s10_0Ext>;
-def C2_cmpgti   : T_CMP <"cmp.gt",  0b01, 0, s10_0Ext>;
-def C2_cmpgtui  : T_CMP <"cmp.gtu", 0b10, 0, u9_0Ext>;
-
-//===----------------------------------------------------------------------===//
-// ALU32/ALU +
-//===----------------------------------------------------------------------===//
-// Add.
-
-let hasSideEffects = 0, hasNewValue = 1, InputType = "reg" in
-class T_ALU32_3op<string mnemonic, bits<3> MajOp, bits<3> MinOp, bit OpsRev,
-                  bit IsComm>
-  : ALU32_rr<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
-             "$Rd = "#mnemonic#"($Rs, $Rt)",
-             [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel, PredRel {
-  let isCommutable = IsComm;
-  let BaseOpcode = mnemonic#_rr;
-  let CextOpcode = mnemonic;
-
-  bits<5> Rs;
-  bits<5> Rt;
-  bits<5> Rd;
-
-  let IClass = 0b1111;
-  let Inst{27} = 0b0;
-  let Inst{26-24} = MajOp;
-  let Inst{23-21} = MinOp;
-  let Inst{20-16} = !if(OpsRev,Rt,Rs);
-  let Inst{12-8} = !if(OpsRev,Rs,Rt);
-  let Inst{4-0} = Rd;
-}
-
-let hasSideEffects = 0, hasNewValue = 1 in
-class T_ALU32_3op_pred<string mnemonic, bits<3> MajOp, bits<3> MinOp,
-                       bit OpsRev, bit PredNot, bit PredNew>
-  : ALU32_rr<(outs IntRegs:$Rd), (ins PredRegs:$Pu, IntRegs:$Rs, IntRegs:$Rt),
-             "if ("#!if(PredNot,"!","")#"$Pu"#!if(PredNew,".new","")#") "#
-             "$Rd = "#mnemonic#"($Rs, $Rt)",
-             [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel, PredNewRel {
-  let isPredicated = 1;
-  let isPredicatedFalse = PredNot;
-  let isPredicatedNew = PredNew;
-  let BaseOpcode = mnemonic#_rr;
-  let CextOpcode = mnemonic;
-
-  bits<2> Pu;
-  bits<5> Rs;
-  bits<5> Rt;
-  bits<5> Rd;
-
-  let IClass = 0b1111;
-  let Inst{27} = 0b1;
-  let Inst{26-24} = MajOp;
-  let Inst{23-21} = MinOp;
-  let Inst{20-16} = !if(OpsRev,Rt,Rs);
-  let Inst{13} = PredNew;
-  let Inst{12-8} = !if(OpsRev,Rs,Rt);
-  let Inst{7} = PredNot;
-  let Inst{6-5} = Pu;
-  let Inst{4-0} = Rd;
-}
-
-class T_ALU32_combineh<string Op1, string Op2, bits<3> MajOp, bits<3> MinOp,
-                      bit OpsRev>
-  : T_ALU32_3op<"", MajOp, MinOp, OpsRev, 0> {
-  let AsmString = "$Rd = combine($Rs"#Op1#", $Rt"#Op2#")";
-}
-
-def A2_combine_hh : T_ALU32_combineh<".h", ".h", 0b011, 0b100, 1>;
-def A2_combine_hl : T_ALU32_combineh<".h", ".l", 0b011, 0b101, 1>;
-def A2_combine_lh : T_ALU32_combineh<".l", ".h", 0b011, 0b110, 1>;
-def A2_combine_ll : T_ALU32_combineh<".l", ".l", 0b011, 0b111, 1>;
-
-class T_ALU32_3op_sfx<string mnemonic, string suffix, bits<3> MajOp,
-                      bits<3> MinOp, bit OpsRev, bit IsComm>
-  : T_ALU32_3op<"", MajOp, MinOp, OpsRev, IsComm> {
-  let AsmString = "$Rd = "#mnemonic#"($Rs, $Rt)"#suffix;
-}
-
-def A2_svaddh   : T_ALU32_3op<"vaddh",   0b110, 0b000, 0, 1>;
-def A2_svsubh   : T_ALU32_3op<"vsubh",   0b110, 0b100, 1, 0>;
-
-let Defs = [USR_OVF], Itinerary = ALU32_3op_tc_2_SLOT0123 in {
-  def A2_svaddhs  : T_ALU32_3op_sfx<"vaddh",  ":sat", 0b110, 0b001, 0, 1>;
-  def A2_addsat   : T_ALU32_3op_sfx<"add",    ":sat", 0b110, 0b010, 0, 1>;
-  def A2_svadduhs : T_ALU32_3op_sfx<"vadduh", ":sat", 0b110, 0b011, 0, 1>;
-  def A2_svsubhs  : T_ALU32_3op_sfx<"vsubh",  ":sat", 0b110, 0b101, 1, 0>;
-  def A2_subsat   : T_ALU32_3op_sfx<"sub",    ":sat", 0b110, 0b110, 1, 0>;
-  def A2_svsubuhs : T_ALU32_3op_sfx<"vsubuh", ":sat", 0b110, 0b111, 1, 0>;
-}
-
-let Itinerary = ALU32_3op_tc_2_SLOT0123 in
-def A2_svavghs  : T_ALU32_3op_sfx<"vavgh",  ":rnd", 0b111, 0b001, 0, 1>;
-
-def A2_svavgh   : T_ALU32_3op<"vavgh",   0b111, 0b000, 0, 1>;
-def A2_svnavgh  : T_ALU32_3op<"vnavgh",  0b111, 0b011, 1, 0>;
-
-multiclass T_ALU32_3op_p<string mnemonic, bits<3> MajOp, bits<3> MinOp,
-                         bit OpsRev> {
-  def t    : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 0, 0>;
-  def f    : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 1, 0>;
-  def tnew : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 0, 1>;
-  def fnew : T_ALU32_3op_pred<mnemonic, MajOp, MinOp, OpsRev, 1, 1>;
-}
-
-multiclass T_ALU32_3op_A2<string mnemonic, bits<3> MajOp, bits<3> MinOp,
-                          bit OpsRev, bit IsComm> {
-  let isPredicable = 1 in
-  def  A2_#NAME  : T_ALU32_3op  <mnemonic, MajOp, MinOp, OpsRev, IsComm>;
-  defm A2_p#NAME : T_ALU32_3op_p<mnemonic, MajOp, MinOp, OpsRev>;
-}
-
-defm add : T_ALU32_3op_A2<"add", 0b011, 0b000, 0, 1>;
-defm and : T_ALU32_3op_A2<"and", 0b001, 0b000, 0, 1>;
-defm or  : T_ALU32_3op_A2<"or",  0b001, 0b001, 0, 1>;
-defm sub : T_ALU32_3op_A2<"sub", 0b011, 0b001, 1, 0>;
-defm xor : T_ALU32_3op_A2<"xor", 0b001, 0b011, 0, 1>;
-
-// A few special cases producing register pairs:
-let OutOperandList = (outs DoubleRegs:$Rd), hasNewValue = 0 in {
-  def S2_packhl    : T_ALU32_3op  <"packhl",  0b101, 0b100, 0, 0>;
-
-  let isPredicable = 1 in
-    def A2_combinew  : T_ALU32_3op  <"combine", 0b101, 0b000, 0, 0>;
-
-  // Conditional combinew uses "newt/f" instead of "t/fnew".
-  def C2_ccombinewt    : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 0, 0>;
-  def C2_ccombinewf    : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 1, 0>;
-  def C2_ccombinewnewt : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 0, 1>;
-  def C2_ccombinewnewf : T_ALU32_3op_pred<"combine", 0b101, 0b000, 0, 1, 1>;
-}
-
-let hasSideEffects = 0, hasNewValue = 1, isCompare = 1, InputType = "reg"  in
-class T_ALU32_3op_cmp<string mnemonic, bits<2> MinOp, bit IsNeg, bit IsComm>
-  : ALU32_rr<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
-             "$Pd = "#mnemonic#"($Rs, $Rt)",
-             [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel {
-  let CextOpcode = mnemonic;
-  let isCommutable = IsComm;
-  bits<5> Rs;
-  bits<5> Rt;
-  bits<2> Pd;
-
-  let IClass = 0b1111;
-  let Inst{27-24} = 0b0010;
-  let Inst{22-21} = MinOp;
-  let Inst{20-16} = Rs;
-  let Inst{12-8} = Rt;
-  let Inst{4} = IsNeg;
-  let Inst{3-2} = 0b00;
-  let Inst{1-0} = Pd;
-}
-
-let Itinerary = ALU32_3op_tc_2early_SLOT0123 in {
-  def C2_cmpeq   : T_ALU32_3op_cmp< "cmp.eq",  0b00, 0, 1>;
-  def C2_cmpgt   : T_ALU32_3op_cmp< "cmp.gt",  0b10, 0, 0>;
-  def C2_cmpgtu  : T_ALU32_3op_cmp< "cmp.gtu", 0b11, 0, 0>;
-}
-
-let CextOpcode = "MUX", InputType = "reg", hasNewValue = 1 in
-def C2_mux: ALU32_rr<(outs IntRegs:$Rd),
-                     (ins PredRegs:$Pu, IntRegs:$Rs, IntRegs:$Rt),
-      "$Rd = mux($Pu, $Rs, $Rt)", [], "", ALU32_3op_tc_1_SLOT0123>, ImmRegRel {
-  bits<5> Rd;
-  bits<2> Pu;
-  bits<5> Rs;
-  bits<5> Rt;
-
-  let CextOpcode = "mux";
-  let InputType = "reg";
-  let hasSideEffects = 0;
-  let IClass = 0b1111;
-
-  let Inst{27-24} = 0b0100;
-  let Inst{20-16} = Rs;
-  let Inst{12-8} = Rt;
-  let Inst{6-5} = Pu;
-  let Inst{4-0} = Rd;
-}
-
-// Combines the two immediates into a double register.
-// Increase complexity to make it greater than any complexity of a combine
-// that involves a register.
-
-let isReMaterializable = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
-    isExtentSigned = 1, isExtendable = 1, opExtentBits = 8, opExtendable = 1,
-    AddedComplexity = 75 in
-def A2_combineii: ALU32Inst <(outs DoubleRegs:$Rdd), (ins s8_0Ext:$s8, s8_0Imm:$S8),
-  "$Rdd = combine(#$s8, #$S8)",
-  []> {
-    bits<5> Rdd;
-    bits<8> s8;
-    bits<8> S8;
-
-    let IClass = 0b0111;
-    let Inst{27-23} = 0b11000;
-    let Inst{22-16} = S8{7-1};
-    let Inst{13}    = S8{0};
-    let Inst{12-5}  = s8;
-    let Inst{4-0}   = Rdd;
-  }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated ADD of a reg and an Immediate value.
-//===----------------------------------------------------------------------===//
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_Addri_Pred <bit PredNot, bit PredNew>
-  : ALU32_ri <(outs IntRegs:$Rd),
-              (ins PredRegs:$Pu, IntRegs:$Rs, s8_0Ext:$s8),
-  !if(PredNot, "if (!$Pu", "if ($Pu")#!if(PredNew,".new) $Rd = ",
-  ") $Rd = ")#"add($Rs, #$s8)"> {
-    bits<5> Rd;
-    bits<2> Pu;
-    bits<5> Rs;
-    bits<8> s8;
-
-    let isPredicatedNew = PredNew;
-    let IClass = 0b0111;
-
-    let Inst{27-24} = 0b0100;
-    let Inst{23}    = PredNot;
-    let Inst{22-21} = Pu;
-    let Inst{20-16} = Rs;
-    let Inst{13}    = PredNew;
-    let Inst{12-5}  = s8;
-    let Inst{4-0}   = Rd;
-  }
-
-//===----------------------------------------------------------------------===//
-// A2_addi: Add a signed immediate to a register.
-//===----------------------------------------------------------------------===//
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_Addri <Operand immOp>
-  : ALU32_ri <(outs IntRegs:$Rd),
-              (ins IntRegs:$Rs, immOp:$s16),
-  "$Rd = add($Rs, #$s16)", [], "", ALU32_ADDI_tc_1_SLOT0123> {
-    bits<5> Rd;
-    bits<5> Rs;
-    bits<16> s16;
-
-    let IClass = 0b1011;
-
-    let Inst{27-21} = s16{15-9};
-    let Inst{20-16} = Rs;
-    let Inst{13-5}  = s16{8-0};
-    let Inst{4-0}   = Rd;
-  }
-
-//===----------------------------------------------------------------------===//
-// Multiclass for ADD of a register and an immediate value.
-//===----------------------------------------------------------------------===//
-multiclass Addri_Pred<string mnemonic, bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    def NAME     : T_Addri_Pred<PredNot, 0>;
-    // Predicate new
-    def NAME#new : T_Addri_Pred<PredNot, 1>;
-  }
-}
-
-let isExtendable = 1, isExtentSigned = 1, InputType = "imm" in
-multiclass Addri_base<string mnemonic, SDNode OpNode> {
-  let CextOpcode = mnemonic, BaseOpcode = mnemonic#_ri in {
-    let opExtendable = 2, opExtentBits = 16, isPredicable = 1, isAdd = 1 in
-    def A2_#NAME : T_Addri<s16_0Ext>;
-
-    let opExtendable = 3, opExtentBits = 8, isPredicated = 1 in {
-      defm A2_p#NAME#t : Addri_Pred<mnemonic, 0>;
-      defm A2_p#NAME#f : Addri_Pred<mnemonic, 1>;
-    }
-  }
-}
-
-defm addi : Addri_base<"add", add>, ImmRegRel, PredNewRel;
-
-let hasNewValue = 1, hasSideEffects = 0, isPseudo = 1 in
-def A2_iconst
-  : ALU32_ri <(outs IntRegs:$Rd),
-              (ins s23_2Imm:$s23_2),
-  "$Rd = iconst(#$s23_2)"> {}
-
-//===----------------------------------------------------------------------===//
-// Template class used for the following ALU32 instructions.
-// Rd=and(Rs,#s10)
-// Rd=or(Rs,#s10)
-//===----------------------------------------------------------------------===//
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 10,
-InputType = "imm", hasNewValue = 1 in
-class T_ALU32ri_logical <string mnemonic, SDNode OpNode, bits<2> MinOp>
-  : ALU32_ri <(outs IntRegs:$Rd),
-              (ins IntRegs:$Rs, s10_0Ext:$s10),
-  "$Rd = "#mnemonic#"($Rs, #$s10)" ,
-  []> {
-    bits<5> Rd;
-    bits<5> Rs;
-    bits<10> s10;
-    let CextOpcode = mnemonic;
-
-    let IClass = 0b0111;
-
-    let Inst{27-24} = 0b0110;
-    let Inst{23-22} = MinOp;
-    let Inst{21}    = s10{9};
-    let Inst{20-16} = Rs;
-    let Inst{13-5}  = s10{8-0};
-    let Inst{4-0}   = Rd;
-  }
-
-def A2_orir  : T_ALU32ri_logical<"or", or, 0b10>, ImmRegRel;
-def A2_andir : T_ALU32ri_logical<"and", and, 0b00>, ImmRegRel;
-
-// Subtract register from immediate
-// Rd32=sub(#s10,Rs32)
-let isExtendable = 1, CextOpcode = "sub", opExtendable = 1, isExtentSigned = 1,
-    opExtentBits = 10, InputType = "imm", hasNewValue = 1, hasSideEffects = 0 in
-def A2_subri: ALU32_ri <(outs IntRegs:$Rd), (ins s10_0Ext:$s10, IntRegs:$Rs),
-  "$Rd = sub(#$s10, $Rs)", []>, ImmRegRel {
-    bits<5> Rd;
-    bits<10> s10;
-    bits<5> Rs;
-
-    let IClass = 0b0111;
-
-    let Inst{27-22} = 0b011001;
-    let Inst{21}    = s10{9};
-    let Inst{20-16} = Rs;
-    let Inst{13-5}  = s10{8-0};
-    let Inst{4-0}   = Rd;
-  }
-
-// Nop.
-let hasSideEffects = 0 in
-def A2_nop: ALU32Inst <(outs), (ins), "nop" > {
-  let IClass = 0b0111;
-  let Inst{27-24} = 0b1111;
-}
-
-let hasSideEffects = 0, hasNewValue = 1 in
-class T_tfr16<bit isHi>
-  : ALU32Inst <(outs IntRegs:$Rx), (ins IntRegs:$src1, u16_0Imm:$u16),
-  "$Rx"#!if(isHi, ".h", ".l")#" = #$u16",
-  [], "$src1 = $Rx" > {
-    bits<5> Rx;
-    bits<16> u16;
-
-    let IClass = 0b0111;
-    let Inst{27-26} = 0b00;
-    let Inst{25-24} = !if(isHi, 0b10, 0b01);
-    let Inst{23-22} = u16{15-14};
-    let Inst{21}    = 0b1;
-    let Inst{20-16} = Rx;
-    let Inst{13-0}  = u16{13-0};
-  }
-
-def A2_tfril: T_tfr16<0>;
-def A2_tfrih: T_tfr16<1>;
-
-// Conditional transfer is an alias to conditional "Rd = add(Rs, #0)".
-let isPredicated = 1, hasNewValue = 1, opNewValue = 0 in
-class T_tfr_pred<bit isPredNot, bit isPredNew>
-  : ALU32Inst<(outs IntRegs:$dst),
-              (ins PredRegs:$src1, IntRegs:$src2),
-              "if ("#!if(isPredNot, "!", "")#
-              "$src1"#!if(isPredNew, ".new", "")#
-              ") $dst = $src2"> {
-    bits<5> dst;
-    bits<2> src1;
-    bits<5> src2;
-
-    let isPredicatedFalse = isPredNot;
-    let isPredicatedNew = isPredNew;
-    let IClass = 0b0111;
-
-    let Inst{27-24} = 0b0100;
-    let Inst{23} = isPredNot;
-    let Inst{13} = isPredNew;
-    let Inst{12-5} = 0;
-    let Inst{4-0} = dst;
-    let Inst{22-21} = src1;
-    let Inst{20-16} = src2;
-  }
-
-let isPredicable = 1 in
-class T_tfr : ALU32Inst<(outs IntRegs:$dst), (ins IntRegs:$src),
-              "$dst = $src"> {
-    bits<5> dst;
-    bits<5> src;
-
-    let IClass = 0b0111;
-
-    let Inst{27-21} = 0b0000011;
-    let Inst{20-16} = src;
-    let Inst{13}    = 0b0;
-    let Inst{4-0}   = dst;
-  }
-
-let InputType = "reg", hasNewValue = 1, hasSideEffects = 0 in
-multiclass tfr_base<string CextOp> {
-  let CextOpcode = CextOp, BaseOpcode = CextOp in {
-    def NAME : T_tfr;
-
-    // Predicate
-    def t : T_tfr_pred<0, 0>;
-    def f : T_tfr_pred<1, 0>;
-    // Predicate new
-    def tnew : T_tfr_pred<0, 1>;
-    def fnew : T_tfr_pred<1, 1>;
-  }
-}
-
-// Assembler mapped to C2_ccombinew[t|f|newt|newf].
-// Please don't add bits to this instruction as it'll be converted into
-// 'combine' before object code emission.
-let isPredicated = 1 in
-class T_tfrp_pred<bit PredNot, bit PredNew>
-  : ALU32_rr <(outs DoubleRegs:$dst),
-              (ins PredRegs:$src1, DoubleRegs:$src2),
-  "if ("#!if(PredNot, "!", "")#"$src1"
-        #!if(PredNew, ".new", "")#") $dst = $src2" > {
-    let isPredicatedFalse = PredNot;
-    let isPredicatedNew = PredNew;
-  }
-
-// Assembler mapped to A2_combinew.
-// Please don't add bits to this instruction as it'll be converted into
-// 'combine' before object code emission.
-class T_tfrp : ALU32Inst <(outs DoubleRegs:$dst),
-               (ins DoubleRegs:$src),
-    "$dst = $src">;
-
-let hasSideEffects = 0 in
-multiclass TFR64_base<string BaseName> {
-  let BaseOpcode = BaseName in {
-    let isPredicable = 1 in
-    def NAME : T_tfrp;
-    // Predicate
-    def t : T_tfrp_pred <0, 0>;
-    def f : T_tfrp_pred <1, 0>;
-    // Predicate new
-    def tnew : T_tfrp_pred <0, 1>;
-    def fnew : T_tfrp_pred <1, 1>;
-  }
-}
-
-let InputType = "imm", isExtendable = 1, isExtentSigned = 1, opExtentBits = 12,
-    isMoveImm = 1, opExtendable = 2, BaseOpcode = "TFRI", CextOpcode = "TFR",
-    hasSideEffects = 0, isPredicated = 1, hasNewValue = 1 in
-class T_TFRI_Pred<bit PredNot, bit PredNew>
-  : ALU32_ri<(outs IntRegs:$Rd), (ins PredRegs:$Pu, s12_0Ext:$s12),
-    "if ("#!if(PredNot,"!","")#"$Pu"#!if(PredNew,".new","")#") $Rd = #$s12",
-    [], "", ALU32_2op_tc_1_SLOT0123>, ImmRegRel, PredNewRel {
-  let isPredicatedFalse = PredNot;
-  let isPredicatedNew = PredNew;
-
-  bits<5> Rd;
-  bits<2> Pu;
-  bits<12> s12;
-
-  let IClass = 0b0111;
-  let Inst{27-24} = 0b1110;
-  let Inst{23} = PredNot;
-  let Inst{22-21} = Pu;
-  let Inst{20} = 0b0;
-  let Inst{19-16,12-5} = s12;
-  let Inst{13} = PredNew;
-  let Inst{4-0} = Rd;
-}
-
-def C2_cmoveit    : T_TFRI_Pred<0, 0>;
-def C2_cmoveif    : T_TFRI_Pred<1, 0>;
-def C2_cmovenewit : T_TFRI_Pred<0, 1>;
-def C2_cmovenewif : T_TFRI_Pred<1, 1>;
-
-let InputType = "imm", isExtendable = 1, isExtentSigned = 1,
-    CextOpcode = "TFR", BaseOpcode = "TFRI", hasNewValue = 1, opNewValue = 0,
-    isAsCheapAsAMove = 1 , opExtendable = 1, opExtentBits = 16, isMoveImm = 1,
-    isPredicated = 0, isPredicable = 1, isReMaterializable = 1 in
-def A2_tfrsi : ALU32Inst<(outs IntRegs:$Rd), (ins s16_0Ext:$s16), "$Rd = #$s16",
-    [], "", ALU32_2op_tc_1_SLOT0123>,
-    ImmRegRel, PredRel {
-  bits<5> Rd;
-  bits<16> s16;
-
-  let IClass = 0b0111;
-  let Inst{27-24} = 0b1000;
-  let Inst{23-22,20-16,13-5} = s16;
-  let Inst{4-0} = Rd;
-}
-
-defm A2_tfr  : tfr_base<"TFR">, ImmRegRel, PredNewRel;
-let isAsmParserOnly = 1 in
-defm A2_tfrp : TFR64_base<"TFR64">, PredNewRel;
-
-// Assembler mapped
-let isReMaterializable = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
-    isAsmParserOnly = 1 in
-def A2_tfrpi : ALU64_rr<(outs DoubleRegs:$dst), (ins s8_0Imm64:$src1),
-                      "$dst = #$src1",
-                      []>;
-
-// TODO: see if this instruction can be deleted..
-let isExtendable = 1, opExtendable = 1, opExtentBits = 6,
-    isAsmParserOnly = 1 in {
-def TFRI64_V4 : ALU64_rr<(outs DoubleRegs:$dst), (ins u64_0Imm:$src1),
-                         "$dst = #$src1">;
-def TFRI64_V2_ext : ALU64_rr<(outs DoubleRegs:$dst),
-                             (ins s8_0Ext:$src1, s8_0Imm:$src2),
-                             "$dst = combine(##$src1, #$src2)">;
-}
-
-//===----------------------------------------------------------------------===//
-// ALU32/ALU -
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// ALU32/PERM +
-//===----------------------------------------------------------------------===//
-// Scalar mux register immediate.
-let hasSideEffects = 0, isExtentSigned = 1, CextOpcode = "MUX",
-    InputType = "imm", hasNewValue = 1, isExtendable = 1, opExtentBits = 8 in
-class T_MUX1 <bit MajOp, dag ins, string AsmStr>
-      : ALU32Inst <(outs IntRegs:$Rd), ins, AsmStr>, ImmRegRel {
-  bits<5> Rd;
-  bits<2> Pu;
-  bits<8> s8;
-  bits<5> Rs;
-
-  let IClass = 0b0111;
-  let Inst{27-24} = 0b0011;
-  let Inst{23} = MajOp;
-  let Inst{22-21} = Pu;
-  let Inst{20-16} = Rs;
-  let Inst{13}    = 0b0;
-  let Inst{12-5}  = s8;
-  let Inst{4-0}   = Rd;
-}
-
-let opExtendable = 2 in
-def C2_muxri : T_MUX1<0b1, (ins PredRegs:$Pu, s8_0Ext:$s8, IntRegs:$Rs),
-                           "$Rd = mux($Pu, #$s8, $Rs)">;
-
-let opExtendable = 3 in
-def C2_muxir : T_MUX1<0b0, (ins PredRegs:$Pu, IntRegs:$Rs, s8_0Ext:$s8),
-                           "$Rd = mux($Pu, $Rs, #$s8)">;
-
-// C2_muxii: Scalar mux immediates.
-let isExtentSigned = 1, hasNewValue = 1, isExtendable = 1,
-    opExtentBits = 8, opExtendable = 2 in
-def C2_muxii: ALU32Inst <(outs IntRegs:$Rd),
-                         (ins PredRegs:$Pu, s8_0Ext:$s8, s8_0Imm:$S8),
-  "$Rd = mux($Pu, #$s8, #$S8)" ,
-  []> {
-    bits<5> Rd;
-    bits<2> Pu;
-    bits<8> s8;
-    bits<8> S8;
-
-    let IClass = 0b0111;
-
-    let Inst{27-25} = 0b101;
-    let Inst{24-23} = Pu;
-    let Inst{22-16} = S8{7-1};
-    let Inst{13}    = S8{0};
-    let Inst{12-5}  = s8;
-    let Inst{4-0}   = Rd;
-  }
-
-let isCodeGenOnly = 1, isPseudo = 1 in
-def PS_pselect : ALU64_rr<(outs DoubleRegs:$Rd),
-      (ins PredRegs:$Pu, DoubleRegs:$Rs, DoubleRegs:$Rt),
-      ".error \"should not emit\" ", []>;
-
-
-//===----------------------------------------------------------------------===//
-// template class for non-predicated alu32_2op instructions
-// - aslh, asrh, sxtb, sxth, zxth
-//===----------------------------------------------------------------------===//
-let hasNewValue = 1, opNewValue = 0 in
-class T_ALU32_2op <string mnemonic, bits<3> minOp> :
-  ALU32Inst <(outs IntRegs:$Rd), (ins IntRegs:$Rs),
-             "$Rd = "#mnemonic#"($Rs)", [] > {
-  bits<5> Rd;
-  bits<5> Rs;
-
-  let IClass = 0b0111;
-
-  let Inst{27-24} = 0b0000;
-  let Inst{23-21} = minOp;
-  let Inst{13} = 0b0;
-  let Inst{4-0} = Rd;
-  let Inst{20-16} = Rs;
-}
-
-//===----------------------------------------------------------------------===//
-// template class for predicated alu32_2op instructions
-// - aslh, asrh, sxtb, sxth, zxtb, zxth
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-class T_ALU32_2op_Pred <string mnemonic, bits<3> minOp, bit isPredNot,
-                        bit isPredNew > :
-  ALU32Inst <(outs IntRegs:$Rd), (ins PredRegs:$Pu, IntRegs:$Rs),
-             !if(isPredNot, "if (!$Pu", "if ($Pu")
-             #!if(isPredNew, ".new) ",") ")#"$Rd = "#mnemonic#"($Rs)"> {
-  bits<5> Rd;
-  bits<2> Pu;
-  bits<5> Rs;
-
-  let IClass = 0b0111;
-
-  let Inst{27-24} = 0b0000;
-  let Inst{23-21} = minOp;
-  let Inst{13} = 0b1;
-  let Inst{11} = isPredNot;
-  let Inst{10} = isPredNew;
-  let Inst{4-0} = Rd;
-  let Inst{9-8} = Pu;
-  let Inst{20-16} = Rs;
-}
-
-multiclass ALU32_2op_Pred<string mnemonic, bits<3> minOp, bit PredNot> {
-  let isPredicatedFalse = PredNot in {
-    def NAME : T_ALU32_2op_Pred<mnemonic, minOp, PredNot, 0>;
-
-    // Predicate new
-    let isPredicatedNew = 1 in
-    def NAME#new : T_ALU32_2op_Pred<mnemonic, minOp, PredNot, 1>;
-  }
-}
-
-multiclass ALU32_2op_base<string mnemonic, bits<3> minOp> {
-  let BaseOpcode = mnemonic in {
-    let isPredicable = 1, hasSideEffects = 0 in
-    def A2_#NAME : T_ALU32_2op<mnemonic, minOp>;
-
-    let isPredicated = 1, hasSideEffects = 0 in {
-      defm A4_p#NAME#t : ALU32_2op_Pred<mnemonic, minOp, 0>;
-      defm A4_p#NAME#f : ALU32_2op_Pred<mnemonic, minOp, 1>;
-    }
-  }
-}
-
-defm aslh : ALU32_2op_base<"aslh", 0b000>, PredNewRel;
-defm asrh : ALU32_2op_base<"asrh", 0b001>, PredNewRel;
-defm sxtb : ALU32_2op_base<"sxtb", 0b101>, PredNewRel;
-defm sxth : ALU32_2op_base<"sxth", 0b111>, PredNewRel;
-defm zxth : ALU32_2op_base<"zxth", 0b110>, PredNewRel;
-
-// Rd=zxtb(Rs): assembler mapped to Rd=and(Rs,#255).
-// Compiler would want to generate 'zxtb' instead of 'and' because 'zxtb' has
-// predicated forms while 'and' doesn't. Since integrated assembler can't
-// handle 'mapped' instructions, we need to encode 'zxtb' same as 'and' where
-// immediate operand is set to '255'.
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_ZXTB: ALU32Inst < (outs IntRegs:$Rd), (ins IntRegs:$Rs),
-  "$Rd = zxtb($Rs)", [] > { // Rd = and(Rs,255)
-    bits<5> Rd;
-    bits<5> Rs;
-    bits<10> s10 = 255;
-
-    let IClass = 0b0111;
-
-    let Inst{27-22} = 0b011000;
-    let Inst{4-0} = Rd;
-    let Inst{20-16} = Rs;
-    let Inst{21} = s10{9};
-    let Inst{13-5} = s10{8-0};
-}
-
-//Rd=zxtb(Rs): assembler mapped to "Rd=and(Rs,#255)
-multiclass ZXTB_base <string mnemonic, bits<3> minOp> {
-  let BaseOpcode = mnemonic in {
-    let isPredicable = 1, hasSideEffects = 0 in
-    def A2_#NAME : T_ZXTB;
-
-    let isPredicated = 1, hasSideEffects = 0 in {
-      defm A4_p#NAME#t : ALU32_2op_Pred<mnemonic, minOp, 0>;
-      defm A4_p#NAME#f : ALU32_2op_Pred<mnemonic, minOp, 1>;
-    }
-  }
-}
-
-defm zxtb : ZXTB_base<"zxtb",0b100>, PredNewRel;
-
-//===----------------------------------------------------------------------===//
-// Template class for vector add and avg
-//===----------------------------------------------------------------------===//
-
-class T_VectALU_64 <string opc, bits<3> majOp, bits<3> minOp,
-                   bit isSat, bit isRnd, bit isCrnd, bit SwapOps >
-  : ALU64_rr < (outs DoubleRegs:$Rdd),
-                (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
-  "$Rdd = "#opc#"($Rss, $Rtt)"#!if(isRnd, ":rnd", "")
-                             #!if(isCrnd,":crnd","")
-                             #!if(isSat, ":sat", ""),
-  [], "", ALU64_tc_2_SLOT23 > {
-    bits<5> Rdd;
-    bits<5> Rss;
-    bits<5> Rtt;
-
-    let IClass = 0b1101;
-
-    let Inst{27-24} = 0b0011;
-    let Inst{23-21} = majOp;
-    let Inst{20-16} = !if (SwapOps, Rtt, Rss);
-    let Inst{12-8} = !if (SwapOps, Rss, Rtt);
-    let Inst{7-5} = minOp;
-    let Inst{4-0} = Rdd;
-  }
-
-// ALU64 - Vector add
-// Rdd=vadd[u][bhw](Rss,Rtt)
-let Itinerary = ALU64_tc_1_SLOT23 in {
-  def A2_vaddub  : T_VectALU_64 < "vaddub", 0b000, 0b000, 0, 0, 0, 0>;
-  def A2_vaddh   : T_VectALU_64 < "vaddh",  0b000, 0b010, 0, 0, 0, 0>;
-  def A2_vaddw   : T_VectALU_64 < "vaddw",  0b000, 0b101, 0, 0, 0, 0>;
-}
-
-// Rdd=vadd[u][bhw](Rss,Rtt):sat
-let Defs = [USR_OVF] in {
-  def A2_vaddubs : T_VectALU_64 < "vaddub", 0b000, 0b001, 1, 0, 0, 0>;
-  def A2_vaddhs  : T_VectALU_64 < "vaddh",  0b000, 0b011, 1, 0, 0, 0>;
-  def A2_vadduhs : T_VectALU_64 < "vadduh", 0b000, 0b100, 1, 0, 0, 0>;
-  def A2_vaddws  : T_VectALU_64 < "vaddw",  0b000, 0b110, 1, 0, 0, 0>;
-}
-
-// ALU64 - Vector average
-// Rdd=vavg[u][bhw](Rss,Rtt)
-let Itinerary = ALU64_tc_1_SLOT23 in {
-  def A2_vavgub : T_VectALU_64 < "vavgub", 0b010, 0b000, 0, 0, 0, 0>;
-  def A2_vavgh  : T_VectALU_64 < "vavgh",  0b010, 0b010, 0, 0, 0, 0>;
-  def A2_vavguh : T_VectALU_64 < "vavguh", 0b010, 0b101, 0, 0, 0, 0>;
-  def A2_vavgw  : T_VectALU_64 < "vavgw",  0b011, 0b000, 0, 0, 0, 0>;
-  def A2_vavguw : T_VectALU_64 < "vavguw", 0b011, 0b011, 0, 0, 0, 0>;
-}
-
-// Rdd=vavg[u][bhw](Rss,Rtt)[:rnd|:crnd]
-def A2_vavgubr : T_VectALU_64 < "vavgub", 0b010, 0b001, 0, 1, 0, 0>;
-def A2_vavghr  : T_VectALU_64 < "vavgh",  0b010, 0b011, 0, 1, 0, 0>;
-def A2_vavghcr : T_VectALU_64 < "vavgh",  0b010, 0b100, 0, 0, 1, 0>;
-def A2_vavguhr : T_VectALU_64 < "vavguh", 0b010, 0b110, 0, 1, 0, 0>;
-
-def A2_vavgwr  : T_VectALU_64 < "vavgw",  0b011, 0b001, 0, 1, 0, 0>;
-def A2_vavgwcr : T_VectALU_64 < "vavgw",  0b011, 0b010, 0, 0, 1, 0>;
-def A2_vavguwr : T_VectALU_64 < "vavguw", 0b011, 0b100, 0, 1, 0, 0>;
-
-// Rdd=vnavg[bh](Rss,Rtt)
-let Itinerary = ALU64_tc_1_SLOT23 in {
-  def A2_vnavgh   : T_VectALU_64 < "vnavgh", 0b100, 0b000, 0, 0, 0, 1>;
-  def A2_vnavgw   : T_VectALU_64 < "vnavgw", 0b100, 0b011, 0, 0, 0, 1>;
-}
-
-// Rdd=vnavg[bh](Rss,Rtt)[:rnd|:crnd]:sat
-let Defs = [USR_OVF] in {
-  def A2_vnavghr  : T_VectALU_64 < "vnavgh", 0b100, 0b001, 1, 1, 0, 1>;
-  def A2_vnavghcr : T_VectALU_64 < "vnavgh", 0b100, 0b010, 1, 0, 1, 1>;
-  def A2_vnavgwr  : T_VectALU_64 < "vnavgw", 0b100, 0b100, 1, 1, 0, 1>;
-  def A2_vnavgwcr : T_VectALU_64 < "vnavgw", 0b100, 0b110, 1, 0, 1, 1>;
-}
-
-// Rdd=vsub[u][bh](Rss,Rtt)
-let Itinerary = ALU64_tc_1_SLOT23 in {
-  def A2_vsubub  : T_VectALU_64 < "vsubub", 0b001, 0b000, 0, 0, 0, 1>;
-  def A2_vsubh   : T_VectALU_64 < "vsubh",  0b001, 0b010, 0, 0, 0, 1>;
-  def A2_vsubw   : T_VectALU_64 < "vsubw",  0b001, 0b101, 0, 0, 0, 1>;
-}
-
-// Rdd=vsub[u][bh](Rss,Rtt):sat
-let Defs = [USR_OVF] in {
-  def A2_vsububs : T_VectALU_64 < "vsubub", 0b001, 0b001, 1, 0, 0, 1>;
-  def A2_vsubhs  : T_VectALU_64 < "vsubh",  0b001, 0b011, 1, 0, 0, 1>;
-  def A2_vsubuhs : T_VectALU_64 < "vsubuh", 0b001, 0b100, 1, 0, 0, 1>;
-  def A2_vsubws  : T_VectALU_64 < "vsubw",  0b001, 0b110, 1, 0, 0, 1>;
-}
-
-// Rdd=vmax[u][bhw](Rss,Rtt)
-def A2_vmaxb  : T_VectALU_64 < "vmaxb",  0b110, 0b110, 0, 0, 0, 1>;
-def A2_vmaxub : T_VectALU_64 < "vmaxub", 0b110, 0b000, 0, 0, 0, 1>;
-def A2_vmaxh  : T_VectALU_64 < "vmaxh",  0b110, 0b001, 0, 0, 0, 1>;
-def A2_vmaxuh : T_VectALU_64 < "vmaxuh", 0b110, 0b010, 0, 0, 0, 1>;
-def A2_vmaxw  : T_VectALU_64 < "vmaxw",  0b110, 0b011, 0, 0, 0, 1>;
-def A2_vmaxuw : T_VectALU_64 < "vmaxuw", 0b101, 0b101, 0, 0, 0, 1>;
-
-// Rdd=vmin[u][bhw](Rss,Rtt)
-def A2_vminb  : T_VectALU_64 < "vminb",  0b110, 0b111, 0, 0, 0, 1>;
-def A2_vminub : T_VectALU_64 < "vminub", 0b101, 0b000, 0, 0, 0, 1>;
-def A2_vminh  : T_VectALU_64 < "vminh",  0b101, 0b001, 0, 0, 0, 1>;
-def A2_vminuh : T_VectALU_64 < "vminuh", 0b101, 0b010, 0, 0, 0, 1>;
-def A2_vminw  : T_VectALU_64 < "vminw",  0b101, 0b011, 0, 0, 0, 1>;
-def A2_vminuw : T_VectALU_64 < "vminuw", 0b101, 0b100, 0, 0, 0, 1>;
-
-//===----------------------------------------------------------------------===//
-// Template class for vector compare
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0 in
-class T_vcmp <string Str, bits<4> minOp>
-  : ALU64_rr <(outs PredRegs:$Pd),
-              (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
-  "$Pd = "#Str#"($Rss, $Rtt)", [],
-  "", ALU64_tc_2early_SLOT23> {
-    bits<2> Pd;
-    bits<5> Rss;
-    bits<5> Rtt;
-
-    let IClass = 0b1101;
-
-    let Inst{27-23} = 0b00100;
-    let Inst{13} = minOp{3};
-    let Inst{7-5} = minOp{2-0};
-    let Inst{1-0} = Pd;
-    let Inst{20-16} = Rss;
-    let Inst{12-8} = Rtt;
-  }
-
-// Vector compare bytes
-def A2_vcmpbeq  : T_vcmp <"vcmpb.eq",  0b0110>;
-def A2_vcmpbgtu : T_vcmp <"vcmpb.gtu", 0b0111>;
-
-// Vector compare halfwords
-def A2_vcmpheq  : T_vcmp <"vcmph.eq",  0b0011>;
-def A2_vcmphgt  : T_vcmp <"vcmph.gt",  0b0100>;
-def A2_vcmphgtu : T_vcmp <"vcmph.gtu", 0b0101>;
-
-// Vector compare words
-def A2_vcmpweq  : T_vcmp <"vcmpw.eq",  0b0000>;
-def A2_vcmpwgt  : T_vcmp <"vcmpw.gt",  0b0001>;
-def A2_vcmpwgtu : T_vcmp <"vcmpw.gtu", 0b0010>;
-
-//===----------------------------------------------------------------------===//
-// ALU32/PERM -
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// ALU32/PRED +
-//===----------------------------------------------------------------------===//
-// No bits needed.  If cmp.ge is found the assembler parser will
-// transform it to cmp.gt subtracting 1 from the immediate.
-let isPseudo = 1 in {
-def C2_cmpgei: ALU32Inst <
-  (outs PredRegs:$Pd), (ins IntRegs:$Rs, s8_0Ext:$s8),
-  "$Pd = cmp.ge($Rs, #$s8)">;
-def C2_cmpgeui: ALU32Inst <
-  (outs PredRegs:$Pd), (ins IntRegs:$Rs, u8_0Ext:$s8),
-  "$Pd = cmp.geu($Rs, #$s8)">;
-}
-
-
-//===----------------------------------------------------------------------===//
-// ALU32/PRED -
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// ALU64/ALU +
-//===----------------------------------------------------------------------===//
-// Add.
-//===----------------------------------------------------------------------===//
-// Template Class
-// Add/Subtract halfword
-// Rd=add(Rt.L,Rs.[HL])[:sat]
-// Rd=sub(Rt.L,Rs.[HL])[:sat]
-// Rd=add(Rt.[LH],Rs.[HL])[:sat][:<16]
-// Rd=sub(Rt.[LH],Rs.[HL])[:sat][:<16]
-//===----------------------------------------------------------------------===//
-
-let  hasNewValue = 1, opNewValue = 0 in
-class T_XTYPE_ADD_SUB <bits<2> LHbits, bit isSat, bit hasShift, bit isSub>
-  : ALU64Inst <(outs IntRegs:$Rd), (ins IntRegs:$Rt, IntRegs:$Rs),
-  "$Rd = "#!if(isSub,"sub","add")#"($Rt."
-          #!if(hasShift, !if(LHbits{1},"h","l"),"l") #", $Rs."
-          #!if(hasShift, !if(LHbits{0},"h)","l)"), !if(LHbits{1},"h)","l)"))
-          #!if(isSat,":sat","")
-          #!if(hasShift,":<<16",""), [], "", ALU64_tc_1_SLOT23> {
-    bits<5> Rd;
-    bits<5> Rt;
-    bits<5> Rs;
-    let IClass = 0b1101;
-
-    let Inst{27-23} = 0b01010;
-    let Inst{22} = hasShift;
-    let Inst{21} = isSub;
-    let Inst{7} = isSat;
-    let Inst{6-5} = LHbits;
-    let Inst{4-0} = Rd;
-    let Inst{12-8} = Rt;
-    let Inst{20-16} = Rs;
-  }
-
-//Rd=sub(Rt.L,Rs.[LH])
-def A2_subh_l16_ll : T_XTYPE_ADD_SUB <0b00, 0, 0, 1>;
-def A2_subh_l16_hl : T_XTYPE_ADD_SUB <0b10, 0, 0, 1>;
-
-//Rd=add(Rt.L,Rs.[LH])
-def A2_addh_l16_ll : T_XTYPE_ADD_SUB <0b00, 0, 0, 0>;
-def A2_addh_l16_hl : T_XTYPE_ADD_SUB <0b10, 0, 0, 0>;
-
-let Itinerary = ALU64_tc_2_SLOT23, Defs = [USR_OVF] in {
-  //Rd=sub(Rt.L,Rs.[LH]):sat
-  def A2_subh_l16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 0, 1>;
-  def A2_subh_l16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 0, 1>;
-
-  //Rd=add(Rt.L,Rs.[LH]):sat
-  def A2_addh_l16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 0, 0>;
-  def A2_addh_l16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 0, 0>;
-}
-
-//Rd=sub(Rt.[LH],Rs.[LH]):<<16
-def A2_subh_h16_ll : T_XTYPE_ADD_SUB <0b00, 0, 1, 1>;
-def A2_subh_h16_lh : T_XTYPE_ADD_SUB <0b01, 0, 1, 1>;
-def A2_subh_h16_hl : T_XTYPE_ADD_SUB <0b10, 0, 1, 1>;
-def A2_subh_h16_hh : T_XTYPE_ADD_SUB <0b11, 0, 1, 1>;
-
-//Rd=add(Rt.[LH],Rs.[LH]):<<16
-def A2_addh_h16_ll : T_XTYPE_ADD_SUB <0b00, 0, 1, 0>;
-def A2_addh_h16_lh : T_XTYPE_ADD_SUB <0b01, 0, 1, 0>;
-def A2_addh_h16_hl : T_XTYPE_ADD_SUB <0b10, 0, 1, 0>;
-def A2_addh_h16_hh : T_XTYPE_ADD_SUB <0b11, 0, 1, 0>;
-
-let Itinerary = ALU64_tc_2_SLOT23, Defs = [USR_OVF] in {
-  //Rd=sub(Rt.[LH],Rs.[LH]):sat:<<16
-  def A2_subh_h16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 1, 1>;
-  def A2_subh_h16_sat_lh : T_XTYPE_ADD_SUB <0b01, 1, 1, 1>;
-  def A2_subh_h16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 1, 1>;
-  def A2_subh_h16_sat_hh : T_XTYPE_ADD_SUB <0b11, 1, 1, 1>;
-
-  //Rd=add(Rt.[LH],Rs.[LH]):sat:<<16
-  def A2_addh_h16_sat_ll : T_XTYPE_ADD_SUB <0b00, 1, 1, 0>;
-  def A2_addh_h16_sat_lh : T_XTYPE_ADD_SUB <0b01, 1, 1, 0>;
-  def A2_addh_h16_sat_hl : T_XTYPE_ADD_SUB <0b10, 1, 1, 0>;
-  def A2_addh_h16_sat_hh : T_XTYPE_ADD_SUB <0b11, 1, 1, 0>;
-}
-
-let hasSideEffects = 0, hasNewValue = 1 in
-def S2_parityp: ALU64Inst<(outs IntRegs:$Rd),
-      (ins DoubleRegs:$Rs, DoubleRegs:$Rt),
-      "$Rd = parity($Rs, $Rt)", [], "", ALU64_tc_2_SLOT23> {
-  bits<5> Rd;
-  bits<5> Rs;
-  bits<5> Rt;
-
-  let IClass = 0b1101;
-  let Inst{27-24} = 0b0000;
-  let Inst{20-16} = Rs;
-  let Inst{12-8} = Rt;
-  let Inst{4-0} = Rd;
-}
-
-let hasNewValue = 1, opNewValue = 0, hasSideEffects = 0 in
-class T_XTYPE_MIN_MAX < bit isMax, bit isUnsigned >
-  : ALU64Inst < (outs IntRegs:$Rd), (ins IntRegs:$Rt, IntRegs:$Rs),
-  "$Rd = "#!if(isMax,"max","min")#!if(isUnsigned,"u","")
-          #"($Rt, $Rs)", [], "", ALU64_tc_2_SLOT23> {
-    bits<5> Rd;
-    bits<5> Rt;
-    bits<5> Rs;
-
-    let IClass = 0b1101;
-
-    let Inst{27-23} = 0b01011;
-    let Inst{22-21} = !if(isMax, 0b10, 0b01);
-    let Inst{7} = isUnsigned;
-    let Inst{4-0} = Rd;
-    let Inst{12-8} = !if(isMax, Rs, Rt);
-    let Inst{20-16} = !if(isMax, Rt, Rs);
-  }
-
-def A2_min  : T_XTYPE_MIN_MAX < 0, 0 >;
-def A2_minu : T_XTYPE_MIN_MAX < 0, 1 >;
-def A2_max  : T_XTYPE_MIN_MAX < 1, 0 >;
-def A2_maxu : T_XTYPE_MIN_MAX < 1, 1 >;
-
-class T_cmp64_rr<string mnemonic, bits<3> MinOp, bit IsComm>
-  : ALU64_rr<(outs PredRegs:$Pd), (ins DoubleRegs:$Rs, DoubleRegs:$Rt),
-             "$Pd = "#mnemonic#"($Rs, $Rt)", [], "", ALU64_tc_2early_SLOT23> {
-  let isCompare = 1;
-  let isCommutable = IsComm;
-  let hasSideEffects = 0;
-
-  bits<2> Pd;
-  bits<5> Rs;
-  bits<5> Rt;
-
-  let IClass = 0b1101;
-  let Inst{27-21} = 0b0010100;
-  let Inst{20-16} = Rs;
-  let Inst{12-8} = Rt;
-  let Inst{7-5} = MinOp;
-  let Inst{1-0} = Pd;
-}
-
-def C2_cmpeqp  : T_cmp64_rr<"cmp.eq",  0b000, 1>;
-def C2_cmpgtp  : T_cmp64_rr<"cmp.gt",  0b010, 0>;
-def C2_cmpgtup : T_cmp64_rr<"cmp.gtu", 0b100, 0>;
-
-def C2_vmux : ALU64_rr<(outs DoubleRegs:$Rd),
-      (ins PredRegs:$Pu, DoubleRegs:$Rs, DoubleRegs:$Rt),
-      "$Rd = vmux($Pu, $Rs, $Rt)", [], "", ALU64_tc_1_SLOT23> {
-  let hasSideEffects = 0;
-
-  bits<5> Rd;
-  bits<2> Pu;
-  bits<5> Rs;
-  bits<5> Rt;
-
-  let IClass = 0b1101;
-  let Inst{27-24} = 0b0001;
-  let Inst{20-16} = Rs;
-  let Inst{12-8} = Rt;
-  let Inst{6-5} = Pu;
-  let Inst{4-0} = Rd;
-}
-
-class T_ALU64_rr<string mnemonic, string suffix, bits<4> RegType,
-                 bits<3> MajOp, bits<3> MinOp, bit OpsRev, bit IsComm,
-                 string Op2Pfx>
-  : ALU64_rr<(outs DoubleRegs:$Rd), (ins DoubleRegs:$Rs, DoubleRegs:$Rt),
-             "$Rd = " #mnemonic# "($Rs, " #Op2Pfx# "$Rt)" #suffix, [],
-             "", ALU64_tc_1_SLOT23> {
-  let hasSideEffects = 0;
-  let isCommutable = IsComm;
-
-  bits<5> Rs;
-  bits<5> Rt;
-  bits<5> Rd;
-
-  let IClass = 0b1101;
-  let Inst{27-24} = RegType;
-  let Inst{23-21} = MajOp;
-  let Inst{20-16} = !if (OpsRev,Rt,Rs);
-  let Inst{12-8} = !if (OpsRev,Rs,Rt);
-  let Inst{7-5} = MinOp;
-  let Inst{4-0} = Rd;
-}
-
-class T_ALU64_arith<string mnemonic, bits<3> MajOp, bits<3> MinOp, bit IsSat,
-                    bit OpsRev, bit IsComm>
-  : T_ALU64_rr<mnemonic, !if(IsSat,":sat",""), 0b0011, MajOp, MinOp, OpsRev,
-               IsComm, "">;
-
-let isAdd = 1 in
-def A2_addp : T_ALU64_arith<"add", 0b000, 0b111, 0, 0, 1>;
-def A2_subp : T_ALU64_arith<"sub", 0b001, 0b111, 0, 1, 0>;
-
-class T_ALU64_logical<string mnemonic, bits<3> MinOp, bit OpsRev, bit IsComm,
-                      bit IsNeg>
-  : T_ALU64_rr<mnemonic, "", 0b0011, 0b111, MinOp, OpsRev, IsComm,
-               !if(IsNeg,"~","")>;
-
-def A2_andp : T_ALU64_logical<"and", 0b000, 0, 1, 0>;
-def A2_orp  : T_ALU64_logical<"or",  0b010, 0, 1, 0>;
-def A2_xorp : T_ALU64_logical<"xor", 0b100, 0, 1, 0>;
-
-//===----------------------------------------------------------------------===//
-// ALU64/ALU -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ALU64/BIT +
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-// ALU64/BIT -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ALU64/PERM +
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-// ALU64/PERM -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// CR +
-//===----------------------------------------------------------------------===//
-// Logical reductions on predicates.
-
-// Looping instructions.
-
-// Pipelined looping instructions.
-
-// Logical operations on predicates.
-let hasSideEffects = 0 in
-class T_LOGICAL_1OP<string MnOp, bits<2> OpBits>
-    : CRInst<(outs PredRegs:$Pd), (ins PredRegs:$Ps),
-             "$Pd = " # MnOp # "($Ps)", [], "", CR_tc_2early_SLOT23> {
-  bits<2> Pd;
-  bits<2> Ps;
-
-  let IClass = 0b0110;
-  let Inst{27-23} = 0b10111;
-  let Inst{22-21} = OpBits;
-  let Inst{20} = 0b0;
-  let Inst{17-16} = Ps;
-  let Inst{13} = 0b0;
-  let Inst{1-0} = Pd;
-}
-
-def C2_any8 : T_LOGICAL_1OP<"any8", 0b00>;
-def C2_all8 : T_LOGICAL_1OP<"all8", 0b01>;
-def C2_not  : T_LOGICAL_1OP<"not",  0b10>;
-
-let hasSideEffects = 0 in
-class T_LOGICAL_2OP<string MnOp, bits<3> OpBits, bit IsNeg, bit Rev>
-    : CRInst<(outs PredRegs:$Pd), (ins PredRegs:$Ps, PredRegs:$Pt),
-             "$Pd = " # MnOp # "($Ps, " # !if (IsNeg,"!","") # "$Pt)",
-             [], "", CR_tc_2early_SLOT23> {
-  bits<2> Pd;
-  bits<2> Ps;
-  bits<2> Pt;
-
-  let IClass = 0b0110;
-  let Inst{27-24} = 0b1011;
-  let Inst{23-21} = OpBits;
-  let Inst{20} = 0b0;
-  let Inst{17-16} = !if(Rev,Pt,Ps);  // Rs and Rt are reversed for some
-  let Inst{13} = 0b0;                // instructions.
-  let Inst{9-8} = !if(Rev,Ps,Pt);
-  let Inst{1-0} = Pd;
-}
-
-def C2_and  : T_LOGICAL_2OP<"and", 0b000, 0, 1>;
-def C2_or   : T_LOGICAL_2OP<"or",  0b001, 0, 1>;
-def C2_xor  : T_LOGICAL_2OP<"xor", 0b010, 0, 0>;
-def C2_andn : T_LOGICAL_2OP<"and", 0b011, 1, 1>;
-def C2_orn  : T_LOGICAL_2OP<"or",  0b111, 1, 1>;
-
-let hasSideEffects = 0, hasNewValue = 1 in
-def C2_vitpack : SInst<(outs IntRegs:$Rd), (ins PredRegs:$Ps, PredRegs:$Pt),
-      "$Rd = vitpack($Ps, $Pt)", [], "", S_2op_tc_1_SLOT23> {
-  bits<5> Rd;
-  bits<2> Ps;
-  bits<2> Pt;
-
-  let IClass = 0b1000;
-  let Inst{27-24} = 0b1001;
-  let Inst{22-21} = 0b00;
-  let Inst{17-16} = Ps;
-  let Inst{9-8} = Pt;
-  let Inst{4-0} = Rd;
-}
-
-let hasSideEffects = 0 in
-def C2_mask : SInst<(outs DoubleRegs:$Rd), (ins PredRegs:$Pt),
-      "$Rd = mask($Pt)", [], "", S_2op_tc_1_SLOT23> {
-  bits<5> Rd;
-  bits<2> Pt;
-
-  let IClass = 0b1000;
-  let Inst{27-24} = 0b0110;
-  let Inst{9-8} = Pt;
-  let Inst{4-0} = Rd;
-}
-
-// User control register transfer.
-//===----------------------------------------------------------------------===//
-// CR -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// JR +
-//===----------------------------------------------------------------------===//
-
-class CondStr<string CReg, bit True, bit New> {
-  string S = "if (" # !if(True,"","!") # CReg # !if(New,".new","") # ") ";
-}
-class JumpOpcStr<string Mnemonic, bit New, bit Taken> {
-  string S = Mnemonic # !if(Taken, ":t", ":nt");
-}
-
-let isBranch = 1, isBarrier = 1, Defs = [PC], hasSideEffects = 0,
-    isPredicable = 1,
-    isExtendable = 1, opExtendable = 0, isExtentSigned = 1,
-    opExtentBits = 24, opExtentAlign = 2, InputType = "imm" in
-class T_JMP<string ExtStr>
-  : JInst_CJUMP_UCJUMP<(outs), (ins brtarget:$dst),
-      "jump " # ExtStr # "$dst",
-      [], "", J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT> {
-    bits<24> dst;
-    let IClass = 0b0101;
-
-    let Inst{27-25} = 0b100;
-    let Inst{24-16} = dst{23-15};
-    let Inst{13-1} = dst{14-2};
-}
-
-let isBranch = 1, Defs = [PC], hasSideEffects = 0, isPredicated = 1,
-    isExtendable = 1, opExtendable = 1, isExtentSigned = 1,
-    opExtentBits = 17, opExtentAlign = 2, InputType = "imm" in
-class T_JMP_c<bit PredNot, bit isPredNew, bit isTak, string ExtStr>
-  : JInst_CJUMP_UCJUMP<(outs), (ins PredRegs:$src, brtarget:$dst),
-      CondStr<"$src", !if(PredNot,0,1), isPredNew>.S #
-        JumpOpcStr<"jump", isPredNew, isTak>.S # " " #
-        ExtStr # "$dst",
-      [], "", J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT>, ImmRegRel {
-    let isTaken = isTak;
-    let isPredicatedFalse = PredNot;
-    let isPredicatedNew = isPredNew;
-    bits<2> src;
-    bits<17> dst;
-
-    let IClass = 0b0101;
-
-    let Inst{27-24} = 0b1100;
-    let Inst{21} = PredNot;
-    let Inst{12} = isTak;
-    let Inst{11} = isPredNew;
-    let Inst{9-8} = src;
-    let Inst{23-22} = dst{16-15};
-    let Inst{20-16} = dst{14-10};
-    let Inst{13} = dst{9};
-    let Inst{7-1} = dst{8-2};
-  }
-
-multiclass JMP_Pred<bit PredNot, string ExtStr> {
-  def NAME       : T_JMP_c<PredNot, 0, 0, ExtStr>; // not taken
-  // Predicate new
-  def NAME#newpt : T_JMP_c<PredNot, 1, 1, ExtStr>; // taken
-  def NAME#new   : T_JMP_c<PredNot, 1, 0, ExtStr>; // not taken
-}
-
-multiclass JMP_base<string BaseOp, string ExtStr> {
-  let BaseOpcode = BaseOp in {
-    def NAME : T_JMP<ExtStr>;
-    defm t : JMP_Pred<0, ExtStr>;
-    defm f : JMP_Pred<1, ExtStr>;
-  }
-}
-
-// Jumps to address stored in a register, JUMPR_MISC
-// if ([[!]P[.new]]) jumpr[:t/nt] Rs
-let isBranch = 1, isIndirectBranch = 1, isBarrier = 1, Defs = [PC],
-    isPredicable = 1, hasSideEffects = 0, InputType = "reg" in
-class T_JMPr
-  : JRInst<(outs), (ins IntRegs:$dst),
-      "jumpr $dst", [], "", J_tc_2early_SLOT2> {
-    bits<5> dst;
-
-    let IClass = 0b0101;
-    let Inst{27-21} = 0b0010100;
-    let Inst{20-16} = dst;
-}
-
-let isBranch = 1, isIndirectBranch = 1, Defs = [PC], isPredicated = 1,
-    hasSideEffects = 0, InputType = "reg" in
-class T_JMPr_c <bit PredNot, bit isPredNew, bit isTak>
-  : JRInst <(outs), (ins PredRegs:$src, IntRegs:$dst),
-      CondStr<"$src", !if(PredNot,0,1), isPredNew>.S #
-        JumpOpcStr<"jumpr", isPredNew, isTak>.S # " $dst", [],
-      "", J_tc_2early_SLOT2> {
-
-    let isTaken = isTak;
-    let isPredicatedFalse = PredNot;
-    let isPredicatedNew = isPredNew;
-    bits<2> src;
-    bits<5> dst;
-
-    let IClass = 0b0101;
-
-    let Inst{27-22} = 0b001101;
-    let Inst{21} = PredNot;
-    let Inst{20-16} = dst;
-    let Inst{12} = isTak;
-    let Inst{11} = isPredNew;
-    let Inst{9-8} = src;
-}
-
-multiclass JMPR_Pred<bit PredNot> {
-  def NAME        : T_JMPr_c<PredNot, 0, 0>; // not taken
-  // Predicate new
-  def NAME#newpt  : T_JMPr_c<PredNot, 1, 1>; // taken
-  def NAME#new    : T_JMPr_c<PredNot, 1, 0>; // not taken
-}
-
-multiclass JMPR_base<string BaseOp> {
-  let BaseOpcode = BaseOp in {
-    def NAME : T_JMPr;
-    defm t : JMPR_Pred<0>;
-    defm f : JMPR_Pred<1>;
-  }
-}
-
-let isCall = 1, hasSideEffects = 1 in
-class JUMPR_MISC_CALLR<bit isPred, bit isPredNot,
-               dag InputDag = (ins IntRegs:$Rs)>
-  : JRInst<(outs), InputDag,
-      !if(isPred, !if(isPredNot, "if (!$Pu) callr $Rs",
-                                 "if ($Pu) callr $Rs"),
-                                 "callr $Rs"),
-      [], "", J_tc_2early_SLOT2> {
-    bits<5> Rs;
-    bits<2> Pu;
-    let isPredicated = isPred;
-    let isPredicatedFalse = isPredNot;
-
-    let IClass = 0b0101;
-    let Inst{27-25} = 0b000;
-    let Inst{24-23} = !if (isPred, 0b10, 0b01);
-    let Inst{22} = 0;
-    let Inst{21} = isPredNot;
-    let Inst{9-8} = !if (isPred, Pu, 0b00);
-    let Inst{20-16} = Rs;
-
-  }
-
-let Defs = VolatileV3.Regs in {
-  def J2_callrt : JUMPR_MISC_CALLR<1, 0, (ins PredRegs:$Pu, IntRegs:$Rs)>;
-  def J2_callrf : JUMPR_MISC_CALLR<1, 1, (ins PredRegs:$Pu, IntRegs:$Rs)>;
-}
-
-let isTerminator = 1, hasSideEffects = 0 in {
-  defm J2_jump : JMP_base<"JMP", "">, PredNewRel;
-
-  defm J2_jumpr : JMPR_base<"JMPr">, PredNewRel;
-
-  let isReturn = 1, isPseudo = 1, isCodeGenOnly = 1 in
-  defm PS_jmpret : JMPR_base<"JMPret">, PredNewRel;
-}
-
-let validSubTargets  = HasV60SubT in
-multiclass JMPpt_base<string BaseOp> {
-  let BaseOpcode = BaseOp in {
-    def tpt : T_JMP_c <0, 0, 1, "">; // Predicate true - taken
-    def fpt : T_JMP_c <1, 0, 1, "">; // Predicate false - taken
-  }
-}
-
-let validSubTargets  = HasV60SubT in
-multiclass JMPRpt_base<string BaseOp> {
-  let BaseOpcode = BaseOp in {
-    def tpt : T_JMPr_c<0, 0, 1>; // predicate true - taken
-    def fpt : T_JMPr_c<1, 0, 1>; // predicate false - taken
-  }
-}
-
-defm J2_jumpr : JMPRpt_base<"JMPr">;
-defm J2_jump  : JMPpt_base<"JMP">;
-
-// A return through builtin_eh_return.
-let isReturn = 1, isTerminator = 1, isBarrier = 1, hasSideEffects = 0,
-    isCodeGenOnly = 1, Defs = [PC], Uses = [R28], isPredicable = 0 in
-def EH_RETURN_JMPR : T_JMPr;
-
-//===----------------------------------------------------------------------===//
-// JR -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// LD +
-//===----------------------------------------------------------------------===//
-
-// Load - Base with Immediate offset addressing mode
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, AddedComplexity = 20 in
-class T_load_io <string mnemonic, RegisterClass RC, bits<4> MajOp,
-                 Operand ImmOp>
-  : LDInst<(outs RC:$dst), (ins IntRegs:$src1, ImmOp:$offset),
-  "$dst = "#mnemonic#"($src1 + #$offset)", []>, AddrModeRel {
-    bits<4> name;
-    bits<5> dst;
-    bits<5> src1;
-    bits<14> offset;
-    bits<11> offsetBits;
-
-    string ImmOpStr = !cast<string>(ImmOp);
-    let offsetBits = !if (!eq(ImmOpStr, "s11_3Ext"), offset{13-3},
-                     !if (!eq(ImmOpStr, "s11_2Ext"), offset{12-2},
-                     !if (!eq(ImmOpStr, "s11_1Ext"), offset{11-1},
-                                      /* s11_0Ext */ offset{10-0})));
-    let opExtentBits = !if (!eq(ImmOpStr, "s11_3Ext"), 14,
-                       !if (!eq(ImmOpStr, "s11_2Ext"), 13,
-                       !if (!eq(ImmOpStr, "s11_1Ext"), 12,
-                                        /* s11_0Ext */ 11)));
-    let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
-
-    let IClass = 0b1001;
-
-    let Inst{27}    = 0b0;
-    let Inst{26-25} = offsetBits{10-9};
-    let Inst{24-21} = MajOp;
-    let Inst{20-16} = src1;
-    let Inst{13-5}  = offsetBits{8-0};
-    let Inst{4-0}   = dst;
-  }
-
-let opExtendable = 3, isExtentSigned = 0, isPredicated = 1 in
-class T_pload_io <string mnemonic, RegisterClass RC, bits<4>MajOp,
-                  Operand ImmOp, bit isNot, bit isPredNew>
-  : LDInst<(outs RC:$dst),
-           (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset),
-  "if ("#!if(isNot, "!$src1", "$src1")
-       #!if(isPredNew, ".new", "")
-       #") $dst = "#mnemonic#"($src2 + #$offset)",
-  [],"", V2LDST_tc_ld_SLOT01> , AddrModeRel {
-    bits<5> dst;
-    bits<2> src1;
-    bits<5> src2;
-    bits<9> offset;
-    bits<6> offsetBits;
-    string ImmOpStr = !cast<string>(ImmOp);
-
-    let offsetBits = !if (!eq(ImmOpStr, "u6_3Ext"), offset{8-3},
-                     !if (!eq(ImmOpStr, "u6_2Ext"), offset{7-2},
-                     !if (!eq(ImmOpStr, "u6_1Ext"), offset{6-1},
-                                      /* u6_0Ext */ offset{5-0})));
-    let opExtentBits = !if (!eq(ImmOpStr, "u6_3Ext"), 9,
-                       !if (!eq(ImmOpStr, "u6_2Ext"), 8,
-                       !if (!eq(ImmOpStr, "u6_1Ext"), 7,
-                                        /* u6_0Ext */ 6)));
-    let hasNewValue = !if (!eq(ImmOpStr, "u6_3Ext"), 0, 1);
-    let isPredicatedNew = isPredNew;
-    let isPredicatedFalse = isNot;
-
-    let IClass = 0b0100;
-
-    let Inst{27}    = 0b0;
-    let Inst{27}    = 0b0;
-    let Inst{26}    = isNot;
-    let Inst{25}    = isPredNew;
-    let Inst{24-21} = MajOp;
-    let Inst{20-16} = src2;
-    let Inst{13}    = 0b0;
-    let Inst{12-11} = src1;
-    let Inst{10-5}  = offsetBits;
-    let Inst{4-0}   = dst;
-  }
-
-let isExtendable = 1, hasSideEffects = 0, addrMode = BaseImmOffset in
-multiclass LD_Idxd<string mnemonic, string CextOp, RegisterClass RC,
-                   Operand ImmOp, Operand predImmOp, bits<4>MajOp> {
-  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
-    let isPredicable = 1 in
-    def L2_#NAME#_io : T_load_io <mnemonic, RC, MajOp, ImmOp>;
-
-    // Predicated
-    def L2_p#NAME#t_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 0, 0>;
-    def L2_p#NAME#f_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 1, 0>;
-
-    // Predicated new
-    def L2_p#NAME#tnew_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 0, 1>;
-    def L2_p#NAME#fnew_io : T_pload_io <mnemonic, RC, MajOp, predImmOp, 1, 1>;
-  }
-}
-
-let accessSize = ByteAccess in {
-  defm loadrb:  LD_Idxd <"memb", "LDrib", IntRegs, s11_0Ext, u6_0Ext, 0b1000>;
-  defm loadrub: LD_Idxd <"memub", "LDriub", IntRegs, s11_0Ext, u6_0Ext, 0b1001>;
-}
-
-let accessSize = HalfWordAccess, opExtentAlign = 1 in {
-  defm loadrh:  LD_Idxd <"memh", "LDrih", IntRegs, s11_1Ext, u6_1Ext, 0b1010>;
-  defm loadruh: LD_Idxd <"memuh", "LDriuh", IntRegs, s11_1Ext, u6_1Ext, 0b1011>;
-}
-
-let accessSize = WordAccess, opExtentAlign = 2 in
-defm loadri: LD_Idxd <"memw", "LDriw", IntRegs, s11_2Ext, u6_2Ext, 0b1100>;
-
-let accessSize = DoubleWordAccess, opExtentAlign = 3 in
-defm loadrd: LD_Idxd <"memd", "LDrid", DoubleRegs, s11_3Ext, u6_3Ext, 0b1110>;
-
-let accessSize = HalfWordAccess, opExtentAlign = 1 in {
-  def L2_loadbsw2_io:   T_load_io<"membh",  IntRegs, 0b0001, s11_1Ext>;
-  def L2_loadbzw2_io:   T_load_io<"memubh", IntRegs, 0b0011, s11_1Ext>;
-}
-
-let accessSize = WordAccess, opExtentAlign = 2 in {
-  def L2_loadbzw4_io: T_load_io<"memubh", DoubleRegs, 0b0101, s11_2Ext>;
-  def L2_loadbsw4_io: T_load_io<"membh",  DoubleRegs, 0b0111, s11_2Ext>;
-}
-
-let addrMode = BaseImmOffset, isExtendable = 1, hasSideEffects = 0,
-    opExtendable = 3, isExtentSigned = 1  in
-class T_loadalign_io <string str, bits<4> MajOp, Operand ImmOp>
-  : LDInst<(outs DoubleRegs:$dst),
-           (ins DoubleRegs:$src1, IntRegs:$src2, ImmOp:$offset),
-  "$dst = "#str#"($src2 + #$offset)", [],
-  "$src1 = $dst">, AddrModeRel {
-    bits<4> name;
-    bits<5> dst;
-    bits<5> src2;
-    bits<12> offset;
-    bits<11> offsetBits;
-
-    let offsetBits = !if (!eq(!cast<string>(ImmOp), "s11_1Ext"), offset{11-1},
-                                                  /* s11_0Ext */ offset{10-0});
-    let IClass = 0b1001;
-
-    let Inst{27}    = 0b0;
-    let Inst{26-25} = offsetBits{10-9};
-    let Inst{24-21} = MajOp;
-    let Inst{20-16} = src2;
-    let Inst{13-5}  = offsetBits{8-0};
-    let Inst{4-0}   = dst;
-  }
-
-let accessSize = HalfWordAccess, opExtentBits = 12, opExtentAlign = 1 in
-def L2_loadalignh_io: T_loadalign_io <"memh_fifo", 0b0010, s11_1Ext>;
-
-let accessSize = ByteAccess, opExtentBits = 11 in
-def L2_loadalignb_io: T_loadalign_io <"memb_fifo", 0b0100, s11_0Ext>;
-
-//===----------------------------------------------------------------------===//
-// Post increment load
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// Template class for non-predicated post increment loads with immediate offset.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, addrMode = PostInc in
-class T_load_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
-                     bits<4> MajOp >
-  : LDInstPI <(outs RC:$dst, IntRegs:$dst2),
-  (ins IntRegs:$src1, ImmOp:$offset),
-  "$dst = "#mnemonic#"($src1++#$offset)" ,
-  [],
-  "$src1 = $dst2" > ,
-  PredNewRel {
-    bits<5> dst;
-    bits<5> src1;
-    bits<7> offset;
-    bits<4> offsetBits;
-
-    string ImmOpStr = !cast<string>(ImmOp);
-    let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
-                     !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
-                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
-                                      /* s4_0Imm */ offset{3-0})));
-    let hasNewValue = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1);
-
-    let IClass = 0b1001;
-
-    let Inst{27-25} = 0b101;
-    let Inst{24-21} = MajOp;
-    let Inst{20-16} = src1;
-    let Inst{13-12} = 0b00;
-    let Inst{8-5} = offsetBits;
-    let Inst{4-0}   = dst;
-  }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated post increment loads with immediate offset.
-//===----------------------------------------------------------------------===//
-let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc in
-class T_pload_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
-                          bits<4> MajOp, bit isPredNot, bit isPredNew >
-  : LDInst <(outs RC:$dst, IntRegs:$dst2),
-            (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset),
-  !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-  ") ")#"$dst = "#mnemonic#"($src2++#$offset)",
-  [] ,
-  "$src2 = $dst2" > ,
-  PredNewRel {
-    bits<5> dst;
-    bits<2> src1;
-    bits<5> src2;
-    bits<7> offset;
-    bits<4> offsetBits;
-
-    let isPredicatedNew = isPredNew;
-    let isPredicatedFalse = isPredNot;
-
-    string ImmOpStr = !cast<string>(ImmOp);
-    let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
-                     !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
-                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
-                                      /* s4_0Imm */ offset{3-0})));
-    let hasNewValue = !if (!eq(ImmOpStr, "s4_3Imm"), 0, 1);
-
-    let IClass = 0b1001;
-
-    let Inst{27-25} = 0b101;
-    let Inst{24-21} = MajOp;
-    let Inst{20-16} = src2;
-    let Inst{13} = 0b1;
-    let Inst{12} = isPredNew;
-    let Inst{11} = isPredNot;
-    let Inst{10-9} = src1;
-    let Inst{8-5}  = offsetBits;
-    let Inst{4-0}  = dst;
-  }
-
-//===----------------------------------------------------------------------===//
-// Multiclass for post increment loads with immediate offset.
-//===----------------------------------------------------------------------===//
-
-multiclass LD_PostInc <string mnemonic, string BaseOp, RegisterClass RC,
-                       Operand ImmOp, bits<4> MajOp> {
-  let BaseOpcode = "POST_"#BaseOp in {
-    let isPredicable = 1 in
-    def L2_#NAME#_pi : T_load_pi < mnemonic, RC, ImmOp, MajOp>;
-
-    // Predicated
-    def L2_p#NAME#t_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 0, 0>;
-    def L2_p#NAME#f_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 1, 0>;
-
-    // Predicated new
-    def L2_p#NAME#tnew_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 0, 1>;
-    def L2_p#NAME#fnew_pi : T_pload_pi < mnemonic, RC, ImmOp, MajOp, 1, 1>;
-  }
-}
-
-// post increment byte loads with immediate offset
-let accessSize = ByteAccess in {
-  defm loadrb  : LD_PostInc <"memb",  "LDrib", IntRegs, s4_0Imm, 0b1000>;
-  defm loadrub : LD_PostInc <"memub", "LDriub", IntRegs, s4_0Imm, 0b1001>;
-}
-
-// post increment halfword loads with immediate offset
-let accessSize = HalfWordAccess, opExtentAlign = 1 in {
-  defm loadrh  : LD_PostInc <"memh",  "LDrih", IntRegs, s4_1Imm, 0b1010>;
-  defm loadruh : LD_PostInc <"memuh", "LDriuh", IntRegs, s4_1Imm, 0b1011>;
-}
-
-// post increment word loads with immediate offset
-let accessSize = WordAccess, opExtentAlign = 2 in
-defm loadri : LD_PostInc <"memw", "LDriw", IntRegs, s4_2Imm, 0b1100>;
-
-// post increment doubleword loads with immediate offset
-let accessSize = DoubleWordAccess, opExtentAlign = 3 in
-defm loadrd : LD_PostInc <"memd", "LDrid", DoubleRegs, s4_3Imm, 0b1110>;
-
-// Rd=memb[u]h(Rx++#s4:1)
-// Rdd=memb[u]h(Rx++#s4:2)
-let accessSize = HalfWordAccess, opExtentAlign = 1 in {
-  def L2_loadbsw2_pi   : T_load_pi <"membh", IntRegs, s4_1Imm, 0b0001>;
-  def L2_loadbzw2_pi   : T_load_pi <"memubh", IntRegs, s4_1Imm, 0b0011>;
-}
-let accessSize = WordAccess, opExtentAlign = 2, hasNewValue = 0 in {
-  def L2_loadbsw4_pi   : T_load_pi <"membh", DoubleRegs, s4_2Imm, 0b0111>;
-  def L2_loadbzw4_pi   : T_load_pi <"memubh", DoubleRegs, s4_2Imm, 0b0101>;
-}
-
-//===----------------------------------------------------------------------===//
-// Template class for post increment fifo loads with immediate offset.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, addrMode = PostInc in
-class T_loadalign_pi <string mnemonic, Operand ImmOp, bits<4> MajOp >
-  : LDInstPI <(outs DoubleRegs:$dst, IntRegs:$dst2),
-  (ins DoubleRegs:$src1, IntRegs:$src2, ImmOp:$offset),
-  "$dst = "#mnemonic#"($src2++#$offset)" ,
-  [], "$src2 = $dst2, $src1 = $dst" > ,
-  PredNewRel {
-    bits<5> dst;
-    bits<5> src2;
-    bits<5> offset;
-    bits<4> offsetBits;
-
-    let offsetBits = !if (!eq(!cast<string>(ImmOp), "s4_1Imm"), offset{4-1},
-                                                  /* s4_0Imm */ offset{3-0});
-    let IClass = 0b1001;
-
-    let Inst{27-25} = 0b101;
-    let Inst{24-21} = MajOp;
-    let Inst{20-16} = src2;
-    let Inst{13-12} = 0b00;
-    let Inst{8-5} = offsetBits;
-    let Inst{4-0}   = dst;
-  }
-
-// Ryy=memh_fifo(Rx++#s4:1)
-// Ryy=memb_fifo(Rx++#s4:0)
-let accessSize = ByteAccess in
-def L2_loadalignb_pi : T_loadalign_pi <"memb_fifo", s4_0Imm, 0b0100>;
-
-let accessSize = HalfWordAccess, opExtentAlign = 1 in
-def L2_loadalignh_pi : T_loadalign_pi <"memh_fifo", s4_1Imm, 0b0010>;
-
-//===----------------------------------------------------------------------===//
-// Template class for post increment loads with register offset.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, addrMode = PostInc in
-class T_load_pr <string mnemonic, RegisterClass RC, bits<4> MajOp,
-                       MemAccessSize AccessSz>
-  : LDInstPI <(outs RC:$dst, IntRegs:$_dst_),
-              (ins IntRegs:$src1, ModRegs:$src2),
-  "$dst = "#mnemonic#"($src1++$src2)" ,
-  [], "$src1 = $_dst_" > {
-    bits<5> dst;
-    bits<5> src1;
-    bits<1> src2;
-
-    let accessSize = AccessSz;
-    let IClass = 0b1001;
-
-    let Inst{27-25} = 0b110;
-    let Inst{24-21} = MajOp;
-    let Inst{20-16} = src1;
-    let Inst{13}    = src2;
-    let Inst{12}    = 0b0;
-    let Inst{7}     = 0b0;
-    let Inst{4-0}   = dst;
-  }
-
-let hasNewValue = 1 in {
-  def L2_loadrb_pr  : T_load_pr <"memb",  IntRegs, 0b1000, ByteAccess>;
-  def L2_loadrub_pr : T_load_pr <"memub", IntRegs, 0b1001, ByteAccess>;
-  def L2_loadrh_pr  : T_load_pr <"memh",  IntRegs, 0b1010, HalfWordAccess>;
-  def L2_loadruh_pr : T_load_pr <"memuh", IntRegs, 0b1011, HalfWordAccess>;
-  def L2_loadri_pr  : T_load_pr <"memw",  IntRegs, 0b1100, WordAccess>;
-
-  def L2_loadbzw2_pr : T_load_pr <"memubh", IntRegs, 0b0011, HalfWordAccess>;
-}
-
-def L2_loadrd_pr   : T_load_pr <"memd", DoubleRegs, 0b1110, DoubleWordAccess>;
-def L2_loadbzw4_pr : T_load_pr <"memubh", DoubleRegs, 0b0101, WordAccess>;
-
-// Load predicate.
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
-    isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
-def LDriw_pred : LDInst<(outs PredRegs:$dst),
-                        (ins IntRegs:$addr, s11_2Ext:$off),
-                        ".error \"should not emit\"", []>;
-// Load modifier.
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
-    isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
-def LDriw_mod : LDInst<(outs ModRegs:$dst),
-                        (ins IntRegs:$addr, s11_2Ext:$off),
-                        ".error \"should not emit\"", []>;
-
-let Defs = [R29, R30, R31], Uses = [R30], hasSideEffects = 0 in
-  def L2_deallocframe : LDInst<(outs), (ins),
-                     "deallocframe",
-                     []> {
-    let IClass = 0b1001;
-
-    let Inst{27-16} = 0b000000011110;
-    let Inst{13} = 0b0;
-    let Inst{4-0} = 0b11110;
-}
-
-// Load / Post increment circular addressing mode.
-let Uses = [CS], hasSideEffects = 0, addrMode = PostInc in
-class T_load_pcr<string mnemonic, RegisterClass RC, bits<4> MajOp>
-  : LDInst <(outs RC:$dst, IntRegs:$_dst_),
-            (ins IntRegs:$Rz, ModRegs:$Mu),
-  "$dst = "#mnemonic#"($Rz ++ I:circ($Mu))", [],
-  "$Rz = $_dst_" > {
-    bits<5> dst;
-    bits<5> Rz;
-    bit Mu;
-
-    let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
-    let IClass = 0b1001;
-
-    let Inst{27-25} = 0b100;
-    let Inst{24-21} = MajOp;
-    let Inst{20-16} = Rz;
-    let Inst{13} = Mu;
-    let Inst{12} = 0b0;
-    let Inst{9} = 0b1;
-    let Inst{7} = 0b0;
-    let Inst{4-0} = dst;
- }
-
-let accessSize = ByteAccess in {
-  def L2_loadrb_pcr  : T_load_pcr <"memb",  IntRegs, 0b1000>;
-  def L2_loadrub_pcr : T_load_pcr <"memub", IntRegs, 0b1001>;
-}
-
-let accessSize = HalfWordAccess in {
-  def L2_loadrh_pcr   : T_load_pcr <"memh",   IntRegs, 0b1010>;
-  def L2_loadruh_pcr  : T_load_pcr <"memuh",  IntRegs, 0b1011>;
-  def L2_loadbsw2_pcr : T_load_pcr <"membh",  IntRegs, 0b0001>;
-  def L2_loadbzw2_pcr : T_load_pcr <"memubh", IntRegs, 0b0011>;
-}
-
-let accessSize = WordAccess in {
-  def  L2_loadri_pcr  : T_load_pcr <"memw", IntRegs, 0b1100>;
-  let hasNewValue = 0 in {
-    def L2_loadbzw4_pcr : T_load_pcr <"memubh", DoubleRegs, 0b0101>;
-    def L2_loadbsw4_pcr : T_load_pcr <"membh",  DoubleRegs, 0b0111>;
-  }
-}
-
-let accessSize = DoubleWordAccess in
-def L2_loadrd_pcr  : T_load_pcr <"memd", DoubleRegs, 0b1110>;
-
-// Load / Post increment circular addressing mode.
-let Uses = [CS], hasSideEffects = 0, addrMode = PostInc in
-class T_loadalign_pcr<string mnemonic, bits<4> MajOp, MemAccessSize AccessSz >
-  : LDInst <(outs DoubleRegs:$dst, IntRegs:$_dst_),
-            (ins DoubleRegs:$_src_, IntRegs:$Rz, ModRegs:$Mu),
-  "$dst = "#mnemonic#"($Rz ++ I:circ($Mu))", [],
-  "$Rz = $_dst_, $dst = $_src_" > {
-    bits<5> dst;
-    bits<5> Rz;
-    bit Mu;
-
-    let accessSize = AccessSz;
-    let IClass = 0b1001;
-
-    let Inst{27-25} = 0b100;
-    let Inst{24-21} = MajOp;
-    let Inst{20-16} = Rz;
-    let Inst{13}    = Mu;
-    let Inst{12}    = 0b0;
-    let Inst{9}     = 0b1;
-    let Inst{7}     = 0b0;
-    let Inst{4-0}   = dst;
- }
-
-def L2_loadalignb_pcr : T_loadalign_pcr <"memb_fifo", 0b0100, ByteAccess>;
-def L2_loadalignh_pcr : T_loadalign_pcr <"memh_fifo", 0b0010, HalfWordAccess>;
-
-//===----------------------------------------------------------------------===//
-// Circular loads with immediate offset.
-//===----------------------------------------------------------------------===//
-let Uses = [CS], mayLoad = 1, hasSideEffects = 0, addrMode = PostInc in
-class T_load_pci <string mnemonic, RegisterClass RC,
-                  Operand ImmOp, bits<4> MajOp>
-  : LDInstPI<(outs RC:$dst, IntRegs:$_dst_),
-             (ins IntRegs:$Rz, ImmOp:$offset, ModRegs:$Mu),
-  "$dst = "#mnemonic#"($Rz ++ #$offset:circ($Mu))", [],
-  "$Rz = $_dst_"> {
-    bits<5> dst;
-    bits<5> Rz;
-    bits<1> Mu;
-    bits<7> offset;
-    bits<4> offsetBits;
-
-    string ImmOpStr = !cast<string>(ImmOp);
-    let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
-    let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
-                     !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
-                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
-                                      /* s4_0Imm */ offset{3-0})));
-    let IClass      = 0b1001;
-    let Inst{27-25} = 0b100;
-    let Inst{24-21} = MajOp;
-    let Inst{20-16} = Rz;
-    let Inst{13}    = Mu;
-    let Inst{12}    = 0b0;
-    let Inst{9}     = 0b0;
-    let Inst{8-5}   = offsetBits;
-    let Inst{4-0}   = dst;
-  }
-
-// Byte variants of circ load
-let accessSize = ByteAccess in {
-  def L2_loadrb_pci  : T_load_pci <"memb",  IntRegs, s4_0Imm, 0b1000>;
-  def L2_loadrub_pci : T_load_pci <"memub", IntRegs, s4_0Imm, 0b1001>;
-}
-
-// Half word variants of circ load
-let accessSize = HalfWordAccess in {
-  def L2_loadrh_pci   : T_load_pci <"memh",   IntRegs, s4_1Imm, 0b1010>;
-  def L2_loadruh_pci  : T_load_pci <"memuh",  IntRegs, s4_1Imm, 0b1011>;
-  def L2_loadbzw2_pci : T_load_pci <"memubh", IntRegs, s4_1Imm, 0b0011>;
-  def L2_loadbsw2_pci : T_load_pci <"membh",  IntRegs, s4_1Imm, 0b0001>;
-}
-
-// Word variants of circ load
-let accessSize = WordAccess in
-def L2_loadri_pci   : T_load_pci <"memw",   IntRegs,    s4_2Imm, 0b1100>;
-
-let accessSize = WordAccess, hasNewValue = 0 in {
-  def L2_loadbzw4_pci : T_load_pci <"memubh", DoubleRegs, s4_2Imm, 0b0101>;
-  def L2_loadbsw4_pci : T_load_pci <"membh",  DoubleRegs, s4_2Imm, 0b0111>;
-}
-
-let accessSize = DoubleWordAccess, hasNewValue = 0 in
-def L2_loadrd_pci : T_load_pci <"memd", DoubleRegs, s4_3Imm, 0b1110>;
-
-
-// TODO: memb_fifo and memh_fifo must take destination register as input.
-// One-off circ loads - not enough in common to break into a class.
-let accessSize = ByteAccess in
-def L2_loadalignb_pci : T_load_pci <"memb_fifo", DoubleRegs, s4_0Imm, 0b0100>;
-
-let accessSize = HalfWordAccess, opExtentAlign = 1 in
-def L2_loadalignh_pci : T_load_pci <"memh_fifo", DoubleRegs, s4_1Imm, 0b0010>;
-
-// L[24]_load[wd]_locked: Load word/double with lock.
-let isSoloAX = 1 in
-class T_load_locked <string mnemonic, RegisterClass RC>
-  : LD0Inst <(outs RC:$dst),
-             (ins IntRegs:$src),
-    "$dst = "#mnemonic#"($src)"> {
-    bits<5> dst;
-    bits<5> src;
-    let IClass = 0b1001;
-    let Inst{27-21} = 0b0010000;
-    let Inst{20-16} = src;
-    let Inst{13-12} = !if (!eq(mnemonic, "memd_locked"), 0b01, 0b00);
-    let Inst{5}   = 0;
-    let Inst{4-0} = dst;
-}
-let hasNewValue = 1, accessSize = WordAccess, opNewValue = 0 in
-  def L2_loadw_locked : T_load_locked <"memw_locked", IntRegs>;
-let accessSize = DoubleWordAccess in
-  def L4_loadd_locked : T_load_locked <"memd_locked", DoubleRegs>;
-
-// S[24]_store[wd]_locked: Store word/double conditionally.
-let isSoloAX = 1, isPredicateLate = 1 in
-class T_store_locked <string mnemonic, RegisterClass RC>
-  : ST0Inst <(outs PredRegs:$Pd), (ins IntRegs:$Rs, RC:$Rt),
-    mnemonic#"($Rs, $Pd) = $Rt"> {
-    bits<2> Pd;
-    bits<5> Rs;
-    bits<5> Rt;
-
-    let IClass = 0b1010;
-    let Inst{27-23} = 0b00001;
-    let Inst{22} = !if (!eq(mnemonic, "memw_locked"), 0b0, 0b1);
-    let Inst{21} = 0b1;
-    let Inst{20-16} = Rs;
-    let Inst{12-8} = Rt;
-    let Inst{1-0} = Pd;
-}
-
-let accessSize = WordAccess in
-def S2_storew_locked : T_store_locked <"memw_locked", IntRegs>;
-
-let accessSize = DoubleWordAccess in
-def S4_stored_locked : T_store_locked <"memd_locked", DoubleRegs>;
-
-//===----------------------------------------------------------------------===//
-// Bit-reversed loads with auto-increment register
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, addrMode = PostInc in
-class T_load_pbr<string mnemonic, RegisterClass RC,
-                            MemAccessSize addrSize, bits<4> majOp>
-  : LDInst
-    <(outs RC:$dst, IntRegs:$_dst_),
-     (ins IntRegs:$Rz, ModRegs:$Mu),
-     "$dst = "#mnemonic#"($Rz ++ $Mu:brev)" ,
-      [] , "$Rz = $_dst_" > {
-
-      let accessSize = addrSize;
-
-      bits<5> dst;
-      bits<5> Rz;
-      bits<1> Mu;
-
-      let IClass = 0b1001;
-
-      let Inst{27-25} = 0b111;
-      let Inst{24-21} = majOp;
-      let Inst{20-16} = Rz;
-      let Inst{13} = Mu;
-      let Inst{12} = 0b0;
-      let Inst{7} = 0b0;
-      let Inst{4-0} = dst;
-  }
-
-let hasNewValue =1, opNewValue = 0 in {
-  def L2_loadrb_pbr   : T_load_pbr <"memb",  IntRegs, ByteAccess, 0b1000>;
-  def L2_loadrub_pbr  : T_load_pbr <"memub", IntRegs, ByteAccess, 0b1001>;
-  def L2_loadrh_pbr   : T_load_pbr <"memh",  IntRegs, HalfWordAccess, 0b1010>;
-  def L2_loadruh_pbr  : T_load_pbr <"memuh", IntRegs, HalfWordAccess, 0b1011>;
-  def L2_loadbsw2_pbr : T_load_pbr <"membh", IntRegs, HalfWordAccess, 0b0001>;
-  def L2_loadbzw2_pbr : T_load_pbr <"memubh", IntRegs, HalfWordAccess, 0b0011>;
-  def L2_loadri_pbr : T_load_pbr <"memw", IntRegs, WordAccess, 0b1100>;
-}
-
-def L2_loadbzw4_pbr : T_load_pbr <"memubh", DoubleRegs, WordAccess, 0b0101>;
-def L2_loadbsw4_pbr : T_load_pbr <"membh",  DoubleRegs, WordAccess, 0b0111>;
-def L2_loadrd_pbr : T_load_pbr <"memd", DoubleRegs, DoubleWordAccess, 0b1110>;
-
-def L2_loadalignb_pbr :T_load_pbr <"memb_fifo", DoubleRegs, ByteAccess, 0b0100>;
-def L2_loadalignh_pbr :T_load_pbr <"memh_fifo", DoubleRegs,
-                                   HalfWordAccess, 0b0010>;
-
-//===----------------------------------------------------------------------===//
-// LD -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MTYPE/ALU +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// MTYPE/ALU -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MTYPE/COMPLEX +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// MTYPE/COMPLEX -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MTYPE/MPYH +
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Template Class
-// MPYS / Multipy signed/unsigned halfwords
-//Rd=mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:rnd][:sat]
-//===----------------------------------------------------------------------===//
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_M2_mpy < bits<2> LHbits, bit isSat, bit isRnd,
-                 bit hasShift, bit isUnsigned>
-  : MInst < (outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
-  "$Rd = "#!if(isUnsigned,"mpyu","mpy")#"($Rs."#!if(LHbits{1},"h","l")
-                                       #", $Rt."#!if(LHbits{0},"h)","l)")
-                                       #!if(hasShift,":<<1","")
-                                       #!if(isRnd,":rnd","")
-                                       #!if(isSat,":sat",""),
-  [], "", M_tc_3x_SLOT23 > {
-    bits<5> Rd;
-    bits<5> Rs;
-    bits<5> Rt;
-
-    let IClass = 0b1110;
-
-    let Inst{27-24} = 0b1100;
-    let Inst{23} = hasShift;
-    let Inst{22} = isUnsigned;
-    let Inst{21} = isRnd;
-    let Inst{7} = isSat;
-    let Inst{6-5} = LHbits;
-    let Inst{4-0} = Rd;
-    let Inst{20-16} = Rs;
-    let Inst{12-8} = Rt;
-  }
-
-//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1]
-def M2_mpy_ll_s1: T_M2_mpy<0b00, 0, 0, 1, 0>;
-def M2_mpy_ll_s0: T_M2_mpy<0b00, 0, 0, 0, 0>;
-def M2_mpy_lh_s1: T_M2_mpy<0b01, 0, 0, 1, 0>;
-def M2_mpy_lh_s0: T_M2_mpy<0b01, 0, 0, 0, 0>;
-def M2_mpy_hl_s1: T_M2_mpy<0b10, 0, 0, 1, 0>;
-def M2_mpy_hl_s0: T_M2_mpy<0b10, 0, 0, 0, 0>;
-def M2_mpy_hh_s1: T_M2_mpy<0b11, 0, 0, 1, 0>;
-def M2_mpy_hh_s0: T_M2_mpy<0b11, 0, 0, 0, 0>;
-
-//Rd=mpyu(Rs.[H|L],Rt.[H|L])[:<<1]
-def M2_mpyu_ll_s1: T_M2_mpy<0b00, 0, 0, 1, 1>;
-def M2_mpyu_ll_s0: T_M2_mpy<0b00, 0, 0, 0, 1>;
-def M2_mpyu_lh_s1: T_M2_mpy<0b01, 0, 0, 1, 1>;
-def M2_mpyu_lh_s0: T_M2_mpy<0b01, 0, 0, 0, 1>;
-def M2_mpyu_hl_s1: T_M2_mpy<0b10, 0, 0, 1, 1>;
-def M2_mpyu_hl_s0: T_M2_mpy<0b10, 0, 0, 0, 1>;
-def M2_mpyu_hh_s1: T_M2_mpy<0b11, 0, 0, 1, 1>;
-def M2_mpyu_hh_s0: T_M2_mpy<0b11, 0, 0, 0, 1>;
-
-//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1]:rnd
-def M2_mpy_rnd_ll_s1: T_M2_mpy <0b00, 0, 1, 1, 0>;
-def M2_mpy_rnd_ll_s0: T_M2_mpy <0b00, 0, 1, 0, 0>;
-def M2_mpy_rnd_lh_s1: T_M2_mpy <0b01, 0, 1, 1, 0>;
-def M2_mpy_rnd_lh_s0: T_M2_mpy <0b01, 0, 1, 0, 0>;
-def M2_mpy_rnd_hl_s1: T_M2_mpy <0b10, 0, 1, 1, 0>;
-def M2_mpy_rnd_hl_s0: T_M2_mpy <0b10, 0, 1, 0, 0>;
-def M2_mpy_rnd_hh_s1: T_M2_mpy <0b11, 0, 1, 1, 0>;
-def M2_mpy_rnd_hh_s0: T_M2_mpy <0b11, 0, 1, 0, 0>;
-
-//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1][:sat]
-//Rd=mpy(Rs.[H|L],Rt.[H|L])[:<<1][:rnd][:sat]
-let Defs = [USR_OVF] in {
-  def M2_mpy_sat_ll_s1: T_M2_mpy <0b00, 1, 0, 1, 0>;
-  def M2_mpy_sat_ll_s0: T_M2_mpy <0b00, 1, 0, 0, 0>;
-  def M2_mpy_sat_lh_s1: T_M2_mpy <0b01, 1, 0, 1, 0>;
-  def M2_mpy_sat_lh_s0: T_M2_mpy <0b01, 1, 0, 0, 0>;
-  def M2_mpy_sat_hl_s1: T_M2_mpy <0b10, 1, 0, 1, 0>;
-  def M2_mpy_sat_hl_s0: T_M2_mpy <0b10, 1, 0, 0, 0>;
-  def M2_mpy_sat_hh_s1: T_M2_mpy <0b11, 1, 0, 1, 0>;
-  def M2_mpy_sat_hh_s0: T_M2_mpy <0b11, 1, 0, 0, 0>;
-
-  def M2_mpy_sat_rnd_ll_s1: T_M2_mpy <0b00, 1, 1, 1, 0>;
-  def M2_mpy_sat_rnd_ll_s0: T_M2_mpy <0b00, 1, 1, 0, 0>;
-  def M2_mpy_sat_rnd_lh_s1: T_M2_mpy <0b01, 1, 1, 1, 0>;
-  def M2_mpy_sat_rnd_lh_s0: T_M2_mpy <0b01, 1, 1, 0, 0>;
-  def M2_mpy_sat_rnd_hl_s1: T_M2_mpy <0b10, 1, 1, 1, 0>;
-  def M2_mpy_sat_rnd_hl_s0: T_M2_mpy <0b10, 1, 1, 0, 0>;
-  def M2_mpy_sat_rnd_hh_s1: T_M2_mpy <0b11, 1, 1, 1, 0>;
-  def M2_mpy_sat_rnd_hh_s0: T_M2_mpy <0b11, 1, 1, 0, 0>;
-}
-
-//===----------------------------------------------------------------------===//
-// Template Class
-// MPYS / Multipy signed/unsigned halfwords and add/subtract the
-// result from the accumulator.
-//Rx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat]
-//===----------------------------------------------------------------------===//
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_M2_mpy_acc < bits<2> LHbits, bit isSat, bit isNac,
-                 bit hasShift, bit isUnsigned >
-  : MInst_acc<(outs IntRegs:$Rx), (ins IntRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
-  "$Rx "#!if(isNac,"-= ","+= ")#!if(isUnsigned,"mpyu","mpy")
-                              #"($Rs."#!if(LHbits{1},"h","l")
-                              #", $Rt."#!if(LHbits{0},"h)","l)")
-                              #!if(hasShift,":<<1","")
-                              #!if(isSat,":sat",""),
-  [], "$dst2 = $Rx", M_tc_3x_SLOT23 > {
-    bits<5> Rx;
-    bits<5> Rs;
-    bits<5> Rt;
-
-    let IClass = 0b1110;
-    let Inst{27-24} = 0b1110;
-    let Inst{23} = hasShift;
-    let Inst{22} = isUnsigned;
-    let Inst{21} = isNac;
-    let Inst{7} = isSat;
-    let Inst{6-5} = LHbits;
-    let Inst{4-0} = Rx;
-    let Inst{20-16} = Rs;
-    let Inst{12-8} = Rt;
-  }
-
-//Rx += mpy(Rs.[H|L],Rt.[H|L])[:<<1]
-def M2_mpy_acc_ll_s1: T_M2_mpy_acc <0b00, 0, 0, 1, 0>;
-def M2_mpy_acc_ll_s0: T_M2_mpy_acc <0b00, 0, 0, 0, 0>;
-def M2_mpy_acc_lh_s1: T_M2_mpy_acc <0b01, 0, 0, 1, 0>;
-def M2_mpy_acc_lh_s0: T_M2_mpy_acc <0b01, 0, 0, 0, 0>;
-def M2_mpy_acc_hl_s1: T_M2_mpy_acc <0b10, 0, 0, 1, 0>;
-def M2_mpy_acc_hl_s0: T_M2_mpy_acc <0b10, 0, 0, 0, 0>;
-def M2_mpy_acc_hh_s1: T_M2_mpy_acc <0b11, 0, 0, 1, 0>;
-def M2_mpy_acc_hh_s0: T_M2_mpy_acc <0b11, 0, 0, 0, 0>;
-
-//Rx += mpyu(Rs.[H|L],Rt.[H|L])[:<<1]
-def M2_mpyu_acc_ll_s1: T_M2_mpy_acc <0b00, 0, 0, 1, 1>;
-def M2_mpyu_acc_ll_s0: T_M2_mpy_acc <0b00, 0, 0, 0, 1>;
-def M2_mpyu_acc_lh_s1: T_M2_mpy_acc <0b01, 0, 0, 1, 1>;
-def M2_mpyu_acc_lh_s0: T_M2_mpy_acc <0b01, 0, 0, 0, 1>;
-def M2_mpyu_acc_hl_s1: T_M2_mpy_acc <0b10, 0, 0, 1, 1>;
-def M2_mpyu_acc_hl_s0: T_M2_mpy_acc <0b10, 0, 0, 0, 1>;
-def M2_mpyu_acc_hh_s1: T_M2_mpy_acc <0b11, 0, 0, 1, 1>;
-def M2_mpyu_acc_hh_s0: T_M2_mpy_acc <0b11, 0, 0, 0, 1>;
-
-//Rx -= mpy(Rs.[H|L],Rt.[H|L])[:<<1]
-def M2_mpy_nac_ll_s1: T_M2_mpy_acc <0b00, 0, 1, 1, 0>;
-def M2_mpy_nac_ll_s0: T_M2_mpy_acc <0b00, 0, 1, 0, 0>;
-def M2_mpy_nac_lh_s1: T_M2_mpy_acc <0b01, 0, 1, 1, 0>;
-def M2_mpy_nac_lh_s0: T_M2_mpy_acc <0b01, 0, 1, 0, 0>;
-def M2_mpy_nac_hl_s1: T_M2_mpy_acc <0b10, 0, 1, 1, 0>;
-def M2_mpy_nac_hl_s0: T_M2_mpy_acc <0b10, 0, 1, 0, 0>;
-def M2_mpy_nac_hh_s1: T_M2_mpy_acc <0b11, 0, 1, 1, 0>;
-def M2_mpy_nac_hh_s0: T_M2_mpy_acc <0b11, 0, 1, 0, 0>;
-
-//Rx -= mpyu(Rs.[H|L],Rt.[H|L])[:<<1]
-def M2_mpyu_nac_ll_s1: T_M2_mpy_acc <0b00, 0, 1, 1, 1>;
-def M2_mpyu_nac_ll_s0: T_M2_mpy_acc <0b00, 0, 1, 0, 1>;
-def M2_mpyu_nac_lh_s1: T_M2_mpy_acc <0b01, 0, 1, 1, 1>;
-def M2_mpyu_nac_lh_s0: T_M2_mpy_acc <0b01, 0, 1, 0, 1>;
-def M2_mpyu_nac_hl_s1: T_M2_mpy_acc <0b10, 0, 1, 1, 1>;
-def M2_mpyu_nac_hl_s0: T_M2_mpy_acc <0b10, 0, 1, 0, 1>;
-def M2_mpyu_nac_hh_s1: T_M2_mpy_acc <0b11, 0, 1, 1, 1>;
-def M2_mpyu_nac_hh_s0: T_M2_mpy_acc <0b11, 0, 1, 0, 1>;
-
-//Rx += mpy(Rs.[H|L],Rt.[H|L])[:<<1]:sat
-def M2_mpy_acc_sat_ll_s1: T_M2_mpy_acc <0b00, 1, 0, 1, 0>;
-def M2_mpy_acc_sat_ll_s0: T_M2_mpy_acc <0b00, 1, 0, 0, 0>;
-def M2_mpy_acc_sat_lh_s1: T_M2_mpy_acc <0b01, 1, 0, 1, 0>;
-def M2_mpy_acc_sat_lh_s0: T_M2_mpy_acc <0b01, 1, 0, 0, 0>;
-def M2_mpy_acc_sat_hl_s1: T_M2_mpy_acc <0b10, 1, 0, 1, 0>;
-def M2_mpy_acc_sat_hl_s0: T_M2_mpy_acc <0b10, 1, 0, 0, 0>;
-def M2_mpy_acc_sat_hh_s1: T_M2_mpy_acc <0b11, 1, 0, 1, 0>;
-def M2_mpy_acc_sat_hh_s0: T_M2_mpy_acc <0b11, 1, 0, 0, 0>;
-
-//Rx -= mpy(Rs.[H|L],Rt.[H|L])[:<<1]:sat
-def M2_mpy_nac_sat_ll_s1: T_M2_mpy_acc <0b00, 1, 1, 1, 0>;
-def M2_mpy_nac_sat_ll_s0: T_M2_mpy_acc <0b00, 1, 1, 0, 0>;
-def M2_mpy_nac_sat_lh_s1: T_M2_mpy_acc <0b01, 1, 1, 1, 0>;
-def M2_mpy_nac_sat_lh_s0: T_M2_mpy_acc <0b01, 1, 1, 0, 0>;
-def M2_mpy_nac_sat_hl_s1: T_M2_mpy_acc <0b10, 1, 1, 1, 0>;
-def M2_mpy_nac_sat_hl_s0: T_M2_mpy_acc <0b10, 1, 1, 0, 0>;
-def M2_mpy_nac_sat_hh_s1: T_M2_mpy_acc <0b11, 1, 1, 1, 0>;
-def M2_mpy_nac_sat_hh_s0: T_M2_mpy_acc <0b11, 1, 1, 0, 0>;
-
-//===----------------------------------------------------------------------===//
-// Template Class
-// MPYS / Multipy signed/unsigned halfwords and add/subtract the
-// result from the 64-bit destination register.
-//Rxx [-+]= mpy[u](Rs.[H|L],Rt.[H|L])[:<<1][:sat]
-//===----------------------------------------------------------------------===//
-
-class T_M2_mpyd_acc < bits<2> LHbits, bit isNac, bit hasShift, bit isUnsigned>
-  : MInst_acc<(outs DoubleRegs:$Rxx),
-              (ins DoubleRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
-  "$Rxx "#!if(isNac,"-= ","+= ")#!if(isUnsigned,"mpyu","mpy")
-                                #"($Rs."#!if(LHbits{1},"h","l")
-                                #", $Rt."#!if(LHbits{0},"h)","l)")
-                                #!if(hasShift,":<<1",""),
-  [], "$dst2 = $Rxx", M_tc_3x_SLOT23 > {
-    bits<5> Rxx;
-    bits<5> Rs;
-    bits<5> Rt;
-
-    let IClass = 0b1110;
-
-    let Inst{27-24} = 0b0110;
-    let Inst{23} = hasShift;
-    let Inst{22} = isUnsigned;
-    let Inst{21} = isNac;
-    let Inst{7} = 0;
-    let Inst{6-5} = LHbits;
-    let Inst{4-0} = Rxx;
-    let Inst{20-16} = Rs;
-    let Inst{12-8} = Rt;
-  }
-
-def M2_mpyd_acc_hh_s0: T_M2_mpyd_acc <0b11, 0, 0, 0>;
-def M2_mpyd_acc_hl_s0: T_M2_mpyd_acc <0b10, 0, 0, 0>;
-def M2_mpyd_acc_lh_s0: T_M2_mpyd_acc <0b01, 0, 0, 0>;
-def M2_mpyd_acc_ll_s0: T_M2_mpyd_acc <0b00, 0, 0, 0>;
-
-def M2_mpyd_acc_hh_s1: T_M2_mpyd_acc <0b11, 0, 1, 0>;
-def M2_mpyd_acc_hl_s1: T_M2_mpyd_acc <0b10, 0, 1, 0>;
-def M2_mpyd_acc_lh_s1: T_M2_mpyd_acc <0b01, 0, 1, 0>;
-def M2_mpyd_acc_ll_s1: T_M2_mpyd_acc <0b00, 0, 1, 0>;
-
-def M2_mpyd_nac_hh_s0: T_M2_mpyd_acc <0b11, 1, 0, 0>;
-def M2_mpyd_nac_hl_s0: T_M2_mpyd_acc <0b10, 1, 0, 0>;
-def M2_mpyd_nac_lh_s0: T_M2_mpyd_acc <0b01, 1, 0, 0>;
-def M2_mpyd_nac_ll_s0: T_M2_mpyd_acc <0b00, 1, 0, 0>;
-
-def M2_mpyd_nac_hh_s1: T_M2_mpyd_acc <0b11, 1, 1, 0>;
-def M2_mpyd_nac_hl_s1: T_M2_mpyd_acc <0b10, 1, 1, 0>;
-def M2_mpyd_nac_lh_s1: T_M2_mpyd_acc <0b01, 1, 1, 0>;
-def M2_mpyd_nac_ll_s1: T_M2_mpyd_acc <0b00, 1, 1, 0>;
-
-def M2_mpyud_acc_hh_s0: T_M2_mpyd_acc <0b11, 0, 0, 1>;
-def M2_mpyud_acc_hl_s0: T_M2_mpyd_acc <0b10, 0, 0, 1>;
-def M2_mpyud_acc_lh_s0: T_M2_mpyd_acc <0b01, 0, 0, 1>;
-def M2_mpyud_acc_ll_s0: T_M2_mpyd_acc <0b00, 0, 0, 1>;
-
-def M2_mpyud_acc_hh_s1: T_M2_mpyd_acc <0b11, 0, 1, 1>;
-def M2_mpyud_acc_hl_s1: T_M2_mpyd_acc <0b10, 0, 1, 1>;
-def M2_mpyud_acc_lh_s1: T_M2_mpyd_acc <0b01, 0, 1, 1>;
-def M2_mpyud_acc_ll_s1: T_M2_mpyd_acc <0b00, 0, 1, 1>;
-
-def M2_mpyud_nac_hh_s0: T_M2_mpyd_acc <0b11, 1, 0, 1>;
-def M2_mpyud_nac_hl_s0: T_M2_mpyd_acc <0b10, 1, 0, 1>;
-def M2_mpyud_nac_lh_s0: T_M2_mpyd_acc <0b01, 1, 0, 1>;
-def M2_mpyud_nac_ll_s0: T_M2_mpyd_acc <0b00, 1, 0, 1>;
-
-def M2_mpyud_nac_hh_s1: T_M2_mpyd_acc <0b11, 1, 1, 1>;
-def M2_mpyud_nac_hl_s1: T_M2_mpyd_acc <0b10, 1, 1, 1>;
-def M2_mpyud_nac_lh_s1: T_M2_mpyd_acc <0b01, 1, 1, 1>;
-def M2_mpyud_nac_ll_s1: T_M2_mpyd_acc <0b00, 1, 1, 1>;
-
-//===----------------------------------------------------------------------===//
-// Template Class -- Vector Multipy
-// Used for complex multiply real or imaginary, dual multiply and even halfwords
-//===----------------------------------------------------------------------===//
-class T_M2_vmpy < string opc, bits<3> MajOp, bits<3> MinOp, bit hasShift,
-                  bit isRnd, bit isSat >
-  : MInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
-  "$Rdd = "#opc#"($Rss, $Rtt)"#!if(hasShift,":<<1","")
-                              #!if(isRnd,":rnd","")
-                              #!if(isSat,":sat",""),
-  [] > {
-    bits<5> Rdd;
-    bits<5> Rss;
-    bits<5> Rtt;
-
-    let IClass = 0b1110;
-
-    let Inst{27-24} = 0b1000;
-    let Inst{23-21} = MajOp;
-    let Inst{7-5} = MinOp;
-    let Inst{4-0} = Rdd;
-    let Inst{20-16} = Rss;
-    let Inst{12-8} = Rtt;
-  }
-
-// Vector complex multiply imaginary: Rdd=vcmpyi(Rss,Rtt)[:<<1]:sat
-let Defs = [USR_OVF] in {
-def M2_vcmpy_s1_sat_i: T_M2_vmpy <"vcmpyi", 0b110, 0b110, 1, 0, 1>;
-def M2_vcmpy_s0_sat_i: T_M2_vmpy <"vcmpyi", 0b010, 0b110, 0, 0, 1>;
-
-// Vector complex multiply real: Rdd=vcmpyr(Rss,Rtt)[:<<1]:sat
-def M2_vcmpy_s1_sat_r: T_M2_vmpy <"vcmpyr", 0b101, 0b110, 1, 0, 1>;
-def M2_vcmpy_s0_sat_r: T_M2_vmpy <"vcmpyr", 0b001, 0b110, 0, 0, 1>;
-
-// Vector dual multiply: Rdd=vdmpy(Rss,Rtt)[:<<1]:sat
-def M2_vdmpys_s1: T_M2_vmpy <"vdmpy", 0b100, 0b100, 1, 0, 1>;
-def M2_vdmpys_s0: T_M2_vmpy <"vdmpy", 0b000, 0b100, 0, 0, 1>;
-
-// Vector multiply even halfwords: Rdd=vmpyeh(Rss,Rtt)[:<<1]:sat
-def M2_vmpy2es_s1: T_M2_vmpy <"vmpyeh", 0b100, 0b110, 1, 0, 1>;
-def M2_vmpy2es_s0: T_M2_vmpy <"vmpyeh", 0b000, 0b110, 0, 0, 1>;
-
-//Rdd=vmpywoh(Rss,Rtt)[:<<1][:rnd]:sat
-def M2_mmpyh_s0:  T_M2_vmpy <"vmpywoh", 0b000, 0b111, 0, 0, 1>;
-def M2_mmpyh_s1:  T_M2_vmpy <"vmpywoh", 0b100, 0b111, 1, 0, 1>;
-def M2_mmpyh_rs0: T_M2_vmpy <"vmpywoh", 0b001, 0b111, 0, 1, 1>;
-def M2_mmpyh_rs1: T_M2_vmpy <"vmpywoh", 0b101, 0b111, 1, 1, 1>;
-
-//Rdd=vmpyweh(Rss,Rtt)[:<<1][:rnd]:sat
-def M2_mmpyl_s0:  T_M2_vmpy <"vmpyweh", 0b000, 0b101, 0, 0, 1>;
-def M2_mmpyl_s1:  T_M2_vmpy <"vmpyweh", 0b100, 0b101, 1, 0, 1>;
-def M2_mmpyl_rs0: T_M2_vmpy <"vmpyweh", 0b001, 0b101, 0, 1, 1>;
-def M2_mmpyl_rs1: T_M2_vmpy <"vmpyweh", 0b101, 0b101, 1, 1, 1>;
-
-//Rdd=vmpywouh(Rss,Rtt)[:<<1][:rnd]:sat
-def M2_mmpyuh_s0:  T_M2_vmpy <"vmpywouh", 0b010, 0b111, 0, 0, 1>;
-def M2_mmpyuh_s1:  T_M2_vmpy <"vmpywouh", 0b110, 0b111, 1, 0, 1>;
-def M2_mmpyuh_rs0: T_M2_vmpy <"vmpywouh", 0b011, 0b111, 0, 1, 1>;
-def M2_mmpyuh_rs1: T_M2_vmpy <"vmpywouh", 0b111, 0b111, 1, 1, 1>;
-
-//Rdd=vmpyweuh(Rss,Rtt)[:<<1][:rnd]:sat
-def M2_mmpyul_s0:  T_M2_vmpy <"vmpyweuh", 0b010, 0b101, 0, 0, 1>;
-def M2_mmpyul_s1:  T_M2_vmpy <"vmpyweuh", 0b110, 0b101, 1, 0, 1>;
-def M2_mmpyul_rs0: T_M2_vmpy <"vmpyweuh", 0b011, 0b101, 0, 1, 1>;
-def M2_mmpyul_rs1: T_M2_vmpy <"vmpyweuh", 0b111, 0b101, 1, 1, 1>;
-}
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_MType_mpy <string mnemonic, bits<4> RegTyBits, RegisterClass RC,
-                   bits<3> MajOp, bits<3> MinOp, bit isSat = 0, bit isRnd = 0,
-                   string op2Suffix = "", bit isRaw = 0, bit isHi = 0 >
-  : MInst <(outs IntRegs:$dst), (ins RC:$src1, RC:$src2),
-  "$dst = "#mnemonic
-           #"($src1, $src2"#op2Suffix#")"
-           #!if(MajOp{2}, ":<<1", "")
-           #!if(isRnd, ":rnd", "")
-           #!if(isSat, ":sat", "")
-           #!if(isRaw, !if(isHi, ":raw:hi", ":raw:lo"), ""), [] > {
-    bits<5> dst;
-    bits<5> src1;
-    bits<5> src2;
-
-    let IClass = 0b1110;
-
-    let Inst{27-24} = RegTyBits;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = src1;
-    let Inst{13}    = 0b0;
-    let Inst{12-8}  = src2;
-    let Inst{7-5}   = MinOp;
-    let Inst{4-0}   = dst;
-  }
-
-class T_MType_vrcmpy <string mnemonic, bits<3> MajOp, bits<3> MinOp, bit isHi>
-  : T_MType_mpy <mnemonic, 0b1001, DoubleRegs, MajOp, MinOp, 1, 1, "", 1, isHi>;
-
-class T_MType_dd  <string mnemonic, bits<3> MajOp, bits<3> MinOp,
-                   bit isSat = 0, bit isRnd = 0 >
-  : T_MType_mpy <mnemonic, 0b1001, DoubleRegs, MajOp, MinOp, isSat, isRnd>;
-
-class T_MType_rr1  <string mnemonic, bits<3> MajOp, bits<3> MinOp,
-                    bit isSat = 0, bit isRnd = 0 >
-  : T_MType_mpy<mnemonic, 0b1101, IntRegs, MajOp, MinOp, isSat, isRnd>;
-
-class T_MType_rr2 <string mnemonic, bits<3> MajOp, bits<3> MinOp,
-                   bit isSat = 0, bit isRnd = 0, string op2str = "" >
-  : T_MType_mpy<mnemonic, 0b1101, IntRegs, MajOp, MinOp, isSat, isRnd, op2str>;
-
-def M2_vradduh    : T_MType_dd <"vradduh", 0b000, 0b001, 0, 0>;
-def M2_vdmpyrs_s0 : T_MType_dd <"vdmpy",   0b000, 0b000, 1, 1>;
-def M2_vdmpyrs_s1 : T_MType_dd <"vdmpy",   0b100, 0b000, 1, 1>;
-
-let CextOpcode = "mpyi", InputType = "reg" in
-def M2_mpyi    : T_MType_rr1 <"mpyi", 0b000, 0b000>, ImmRegRel;
-
-def M2_mpy_up  : T_MType_rr1 <"mpy",  0b000, 0b001>;
-def M2_mpyu_up : T_MType_rr1 <"mpyu", 0b010, 0b001>;
-
-def M2_dpmpyss_rnd_s0 : T_MType_rr1 <"mpy", 0b001, 0b001, 0, 1>;
-
-def M2_vmpy2s_s0pack : T_MType_rr1 <"vmpyh", 0b001, 0b111, 1, 1>;
-def M2_vmpy2s_s1pack : T_MType_rr1 <"vmpyh", 0b101, 0b111, 1, 1>;
-
-def M2_hmmpyh_rs1 : T_MType_rr2 <"mpy", 0b101, 0b100, 1, 1, ".h">;
-def M2_hmmpyl_rs1 : T_MType_rr2 <"mpy", 0b111, 0b100, 1, 1, ".l">;
-
-def M2_cmpyrs_s0  : T_MType_rr2 <"cmpy", 0b001, 0b110, 1, 1>;
-def M2_cmpyrs_s1  : T_MType_rr2 <"cmpy", 0b101, 0b110, 1, 1>;
-def M2_cmpyrsc_s0 : T_MType_rr2 <"cmpy", 0b011, 0b110, 1, 1, "*">;
-def M2_cmpyrsc_s1 : T_MType_rr2 <"cmpy", 0b111, 0b110, 1, 1, "*">;
-
-// V4 Instructions
-def M2_vraddh : T_MType_dd <"vraddh", 0b001, 0b111, 0>;
-def M2_mpysu_up : T_MType_rr1 <"mpysu", 0b011, 0b001, 0>;
-def M2_mpy_up_s1 : T_MType_rr1 <"mpy", 0b101, 0b010, 0>;
-def M2_mpy_up_s1_sat : T_MType_rr1 <"mpy", 0b111, 0b000, 1>;
-
-def M2_hmmpyh_s1 : T_MType_rr2 <"mpy", 0b101, 0b000, 1, 0, ".h">;
-def M2_hmmpyl_s1 : T_MType_rr2 <"mpy", 0b101, 0b001, 1, 0, ".l">;
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_MType_mpy_ri <bit isNeg, Operand ImmOp, list<dag> pattern>
-  : MInst < (outs IntRegs:$Rd), (ins IntRegs:$Rs, ImmOp:$u8),
-  "$Rd ="#!if(isNeg, "- ", "+ ")#"mpyi($Rs, #$u8)" ,
-   pattern, "", M_tc_3x_SLOT23> {
-    bits<5> Rd;
-    bits<5> Rs;
-    bits<8> u8;
-
-    let IClass = 0b1110;
-
-    let Inst{27-24} = 0b0000;
-    let Inst{23} = isNeg;
-    let Inst{13} = 0b0;
-    let Inst{4-0} = Rd;
-    let Inst{20-16} = Rs;
-    let Inst{12-5} = u8;
-  }
-
-let isExtendable = 1, opExtentBits = 8, opExtendable = 2 in
-def M2_mpysip : T_MType_mpy_ri <0, u8_0Ext, []>;
-
-def M2_mpysin :  T_MType_mpy_ri <1, u8_0Imm, []>;
-
-// Assember mapped to M2_mpyi
-let isAsmParserOnly = 1 in
-def M2_mpyui : MInst<(outs IntRegs:$dst),
-                     (ins IntRegs:$src1, IntRegs:$src2),
-  "$dst = mpyui($src1, $src2)">;
-
-// Rd=mpyi(Rs,#m9)
-// s9 is NOT the same as m9 - but it works.. so far.
-// Assembler maps to either Rd=+mpyi(Rs,#u8) or Rd=-mpyi(Rs,#u8)
-// depending on the value of m9. See Arch Spec.
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 9,
-    CextOpcode = "mpyi", InputType = "imm", hasNewValue = 1,
-    isAsmParserOnly = 1 in
-def M2_mpysmi : MInst<(outs IntRegs:$dst), (ins IntRegs:$src1, s9_0Ext:$src2),
-    "$dst = mpyi($src1, #$src2)", []>, ImmRegRel;
-
-let hasNewValue = 1, isExtendable = 1,  opExtentBits = 8, opExtendable = 3,
-    InputType = "imm" in
-class T_MType_acc_ri <string mnemonic, bits<3> MajOp, Operand ImmOp,
-                      list<dag> pattern = []>
- : MInst < (outs IntRegs:$dst), (ins IntRegs:$src1, IntRegs:$src2, ImmOp:$src3),
-  "$dst "#mnemonic#"($src2, #$src3)",
-  pattern, "$src1 = $dst", M_tc_2_SLOT23> {
-    bits<5> dst;
-    bits<5> src2;
-    bits<8> src3;
-
-    let IClass = 0b1110;
-
-    let Inst{27-26} = 0b00;
-    let Inst{25-23} = MajOp;
-    let Inst{20-16} = src2;
-    let Inst{13} = 0b0;
-    let Inst{12-5} = src3;
-    let Inst{4-0} = dst;
-  }
-
-let InputType = "reg", hasNewValue = 1 in
-class T_MType_acc_rr <string mnemonic, bits<3> MajOp, bits<3> MinOp,
-                      bit isSwap = 0, list<dag> pattern = [], bit hasNot = 0,
-                      bit isSat = 0, bit isShift = 0>
-  : MInst < (outs IntRegs:$dst),
-            (ins IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
-  "$dst "#mnemonic#"($src2, "#!if(hasNot, "~$src3)","$src3)")
-                          #!if(isShift, ":<<1", "")
-                          #!if(isSat, ":sat", ""),
-  pattern, "$src1 = $dst", M_tc_2_SLOT23 > {
-    bits<5> dst;
-    bits<5> src2;
-    bits<5> src3;
-
-    let IClass = 0b1110;
-
-    let Inst{27-24} = 0b1111;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = !if(isSwap, src3, src2);
-    let Inst{13} = 0b0;
-    let Inst{12-8} = !if(isSwap, src2, src3);
-    let Inst{7-5} = MinOp;
-    let Inst{4-0} = dst;
-  }
-
-let CextOpcode = "MPYI_acc", Itinerary = M_tc_3x_SLOT23 in {
-  def M2_macsip : T_MType_acc_ri <"+= mpyi", 0b010, u8_0Ext, []>, ImmRegRel;
-
-  def M2_maci   : T_MType_acc_rr <"+= mpyi", 0b000, 0b000, 0, []>, ImmRegRel;
-}
-
-let CextOpcode = "ADD_acc" in {
-  let isExtentSigned = 1 in
-  def M2_accii : T_MType_acc_ri <"+= add", 0b100, s8_0Ext, []>, ImmRegRel;
-
-  def M2_acci  : T_MType_acc_rr <"+= add",  0b000, 0b001, 0, []>, ImmRegRel;
-}
-
-let CextOpcode = "SUB_acc" in {
-  let isExtentSigned = 1 in
-  def M2_naccii : T_MType_acc_ri <"-= add", 0b101, s8_0Ext>, ImmRegRel;
-
-  def M2_nacci  : T_MType_acc_rr <"-= add",  0b100, 0b001, 0>, ImmRegRel;
-}
-
-let Itinerary = M_tc_3x_SLOT23 in
-def M2_macsin : T_MType_acc_ri <"-= mpyi", 0b011, u8_0Ext>;
-
-def M2_xor_xacc : T_MType_acc_rr < "^= xor", 0b100, 0b011, 0>;
-def M2_subacc : T_MType_acc_rr <"+= sub",  0b000, 0b011, 1>;
-
-//===----------------------------------------------------------------------===//
-// Template Class -- XType Vector Instructions
-//===----------------------------------------------------------------------===//
-class T_XTYPE_Vect < string opc, bits<3> MajOp, bits<3> MinOp, bit isConj >
-  : MInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
-  "$Rdd = "#opc#"($Rss, $Rtt"#!if(isConj,"*)",")"),
-  [] > {
-    bits<5> Rdd;
-    bits<5> Rss;
-    bits<5> Rtt;
-
-    let IClass = 0b1110;
-
-    let Inst{27-24} = 0b1000;
-    let Inst{23-21} = MajOp;
-    let Inst{7-5} = MinOp;
-    let Inst{4-0} = Rdd;
-    let Inst{20-16} = Rss;
-    let Inst{12-8} = Rtt;
-  }
-
-class T_XTYPE_Vect_acc < string opc, bits<3> MajOp, bits<3> MinOp, bit isConj >
-  : MInst <(outs DoubleRegs:$Rdd),
-           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
-  "$Rdd += "#opc#"($Rss, $Rtt"#!if(isConj,"*)",")"),
-  [], "$dst2 = $Rdd",M_tc_3x_SLOT23 > {
-    bits<5> Rdd;
-    bits<5> Rss;
-    bits<5> Rtt;
-
-    let IClass = 0b1110;
-
-    let Inst{27-24} = 0b1010;
-    let Inst{23-21} = MajOp;
-    let Inst{7-5} = MinOp;
-    let Inst{4-0} = Rdd;
-    let Inst{20-16} = Rss;
-    let Inst{12-8} = Rtt;
-  }
-
-class T_XTYPE_Vect_diff < bits<3> MajOp, string opc >
-  : MInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rtt, DoubleRegs:$Rss),
-  "$Rdd = "#opc#"($Rtt, $Rss)",
-  [], "",M_tc_2_SLOT23 > {
-    bits<5> Rdd;
-    bits<5> Rss;
-    bits<5> Rtt;
-
-    let IClass = 0b1110;
-
-    let Inst{27-24} = 0b1000;
-    let Inst{23-21} = MajOp;
-    let Inst{7-5} = 0b000;
-    let Inst{4-0} = Rdd;
-    let Inst{20-16} = Rss;
-    let Inst{12-8} = Rtt;
-  }
-
-// Vector reduce add unsigned bytes: Rdd32=vrmpybu(Rss32,Rtt32)
-def A2_vraddub: T_XTYPE_Vect <"vraddub", 0b010, 0b001, 0>;
-def A2_vraddub_acc: T_XTYPE_Vect_acc <"vraddub", 0b010, 0b001, 0>;
-
-// Vector sum of absolute differences unsigned bytes: Rdd=vrsadub(Rss,Rtt)
-def A2_vrsadub: T_XTYPE_Vect <"vrsadub", 0b010, 0b010, 0>;
-def A2_vrsadub_acc: T_XTYPE_Vect_acc <"vrsadub", 0b010, 0b010, 0>;
-
-// Vector absolute difference: Rdd=vabsdiffh(Rtt,Rss)
-def M2_vabsdiffh: T_XTYPE_Vect_diff<0b011, "vabsdiffh">;
-
-// Vector absolute difference words: Rdd=vabsdiffw(Rtt,Rss)
-def M2_vabsdiffw: T_XTYPE_Vect_diff<0b001, "vabsdiffw">;
-
-// Vector reduce complex multiply real or imaginary:
-// Rdd[+]=vrcmpy[ir](Rss,Rtt[*])
-def M2_vrcmpyi_s0:  T_XTYPE_Vect <"vrcmpyi", 0b000, 0b000, 0>;
-def M2_vrcmpyi_s0c: T_XTYPE_Vect <"vrcmpyi", 0b010, 0b000, 1>;
-def M2_vrcmaci_s0:  T_XTYPE_Vect_acc <"vrcmpyi", 0b000, 0b000, 0>;
-def M2_vrcmaci_s0c: T_XTYPE_Vect_acc <"vrcmpyi", 0b010, 0b000, 1>;
-
-def M2_vrcmpyr_s0:  T_XTYPE_Vect <"vrcmpyr", 0b000, 0b001, 0>;
-def M2_vrcmpyr_s0c: T_XTYPE_Vect <"vrcmpyr", 0b011, 0b001, 1>;
-def M2_vrcmacr_s0:  T_XTYPE_Vect_acc <"vrcmpyr", 0b000, 0b001, 0>;
-def M2_vrcmacr_s0c: T_XTYPE_Vect_acc <"vrcmpyr", 0b011, 0b001, 1>;
-
-// Vector reduce halfwords:
-// Rdd[+]=vrmpyh(Rss,Rtt)
-def M2_vrmpy_s0: T_XTYPE_Vect <"vrmpyh", 0b000, 0b010, 0>;
-def M2_vrmac_s0: T_XTYPE_Vect_acc <"vrmpyh", 0b000, 0b010, 0>;
-
-//===----------------------------------------------------------------------===//
-// Template Class -- Vector Multipy with accumulation.
-// Used for complex multiply real or imaginary, dual multiply and even halfwords
-//===----------------------------------------------------------------------===//
-let Defs = [USR_OVF] in
-class T_M2_vmpy_acc_sat < string opc, bits<3> MajOp, bits<3> MinOp,
-                          bit hasShift, bit isRnd >
-  : MInst <(outs DoubleRegs:$Rxx),
-           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
-  "$Rxx += "#opc#"($Rss, $Rtt)"#!if(hasShift,":<<1","")
-                               #!if(isRnd,":rnd","")#":sat",
-  [], "$dst2 = $Rxx",M_tc_3x_SLOT23 > {
-    bits<5> Rxx;
-    bits<5> Rss;
-    bits<5> Rtt;
-
-    let IClass = 0b1110;
-
-    let Inst{27-24} = 0b1010;
-    let Inst{23-21} = MajOp;
-    let Inst{7-5} = MinOp;
-    let Inst{4-0} = Rxx;
-    let Inst{20-16} = Rss;
-    let Inst{12-8} = Rtt;
-  }
-
-class T_M2_vmpy_acc < string opc, bits<3> MajOp, bits<3> MinOp,
-                      bit hasShift, bit isRnd >
-  : MInst <(outs DoubleRegs:$Rxx),
-           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
-  "$Rxx += "#opc#"($Rss, $Rtt)"#!if(hasShift,":<<1","")
-                               #!if(isRnd,":rnd",""),
-  [], "$dst2 = $Rxx",M_tc_3x_SLOT23 > {
-    bits<5> Rxx;
-    bits<5> Rss;
-    bits<5> Rtt;
-
-    let IClass = 0b1110;
-
-    let Inst{27-24} = 0b1010;
-    let Inst{23-21} = MajOp;
-    let Inst{7-5} = MinOp;
-    let Inst{4-0} = Rxx;
-    let Inst{20-16} = Rss;
-    let Inst{12-8} = Rtt;
-  }
-
-// Vector multiply word by signed half with accumulation
-// Rxx+=vmpyw[eo]h(Rss,Rtt)[:<<1][:rnd]:sat
-def M2_mmacls_s1:  T_M2_vmpy_acc_sat <"vmpyweh", 0b100, 0b101, 1, 0>;
-def M2_mmacls_s0:  T_M2_vmpy_acc_sat <"vmpyweh", 0b000, 0b101, 0, 0>;
-def M2_mmacls_rs1: T_M2_vmpy_acc_sat <"vmpyweh", 0b101, 0b101, 1, 1>;
-def M2_mmacls_rs0: T_M2_vmpy_acc_sat <"vmpyweh", 0b001, 0b101, 0, 1>;
-
-def M2_mmachs_s1:  T_M2_vmpy_acc_sat <"vmpywoh", 0b100, 0b111, 1, 0>;
-def M2_mmachs_s0:  T_M2_vmpy_acc_sat <"vmpywoh", 0b000, 0b111, 0, 0>;
-def M2_mmachs_rs1: T_M2_vmpy_acc_sat <"vmpywoh", 0b101, 0b111, 1, 1>;
-def M2_mmachs_rs0: T_M2_vmpy_acc_sat <"vmpywoh", 0b001, 0b111, 0, 1>;
-
-// Vector multiply word by unsigned half with accumulation
-// Rxx+=vmpyw[eo]uh(Rss,Rtt)[:<<1][:rnd]:sat
-def M2_mmaculs_s1:  T_M2_vmpy_acc_sat <"vmpyweuh", 0b110, 0b101, 1, 0>;
-def M2_mmaculs_s0:  T_M2_vmpy_acc_sat <"vmpyweuh", 0b010, 0b101, 0, 0>;
-def M2_mmaculs_rs1: T_M2_vmpy_acc_sat <"vmpyweuh", 0b111, 0b101, 1, 1>;
-def M2_mmaculs_rs0: T_M2_vmpy_acc_sat <"vmpyweuh", 0b011, 0b101, 0, 1>;
-
-def M2_mmacuhs_s1:  T_M2_vmpy_acc_sat <"vmpywouh", 0b110, 0b111, 1, 0>;
-def M2_mmacuhs_s0:  T_M2_vmpy_acc_sat <"vmpywouh", 0b010, 0b111, 0, 0>;
-def M2_mmacuhs_rs1: T_M2_vmpy_acc_sat <"vmpywouh", 0b111, 0b111, 1, 1>;
-def M2_mmacuhs_rs0: T_M2_vmpy_acc_sat <"vmpywouh", 0b011, 0b111, 0, 1>;
-
-// Vector multiply even halfwords with accumulation
-// Rxx+=vmpyeh(Rss,Rtt)[:<<1][:sat]
-def M2_vmac2es:    T_M2_vmpy_acc     <"vmpyeh", 0b001, 0b010, 0, 0>;
-def M2_vmac2es_s1: T_M2_vmpy_acc_sat <"vmpyeh", 0b100, 0b110, 1, 0>;
-def M2_vmac2es_s0: T_M2_vmpy_acc_sat <"vmpyeh", 0b000, 0b110, 0, 0>;
-
-// Vector dual multiply with accumulation
-// Rxx+=vdmpy(Rss,Rtt)[:sat]
-def M2_vdmacs_s1: T_M2_vmpy_acc_sat <"vdmpy", 0b100, 0b100, 1, 0>;
-def M2_vdmacs_s0: T_M2_vmpy_acc_sat <"vdmpy", 0b000, 0b100, 0, 0>;
-
-// Vector complex multiply real or imaginary with accumulation
-// Rxx+=vcmpy[ir](Rss,Rtt):sat
-def M2_vcmac_s0_sat_r: T_M2_vmpy_acc_sat <"vcmpyr", 0b001, 0b100, 0, 0>;
-def M2_vcmac_s0_sat_i: T_M2_vmpy_acc_sat <"vcmpyi", 0b010, 0b100, 0, 0>;
-
-//===----------------------------------------------------------------------===//
-// Template Class -- Multiply signed/unsigned halfwords with and without
-// saturation and rounding
-//===----------------------------------------------------------------------===//
-class T_M2_mpyd < bits<2> LHbits, bit isRnd, bit hasShift, bit isUnsigned >
-  : MInst < (outs DoubleRegs:$Rdd), (ins IntRegs:$Rs, IntRegs:$Rt),
-  "$Rdd = "#!if(isUnsigned,"mpyu","mpy")#"($Rs."#!if(LHbits{1},"h","l")
-                                       #", $Rt."#!if(LHbits{0},"h)","l)")
-                                       #!if(hasShift,":<<1","")
-                                       #!if(isRnd,":rnd",""),
-  [] > {
-    bits<5> Rdd;
-    bits<5> Rs;
-    bits<5> Rt;
-
-    let IClass = 0b1110;
-
-    let Inst{27-24} = 0b0100;
-    let Inst{23} = hasShift;
-    let Inst{22} = isUnsigned;
-    let Inst{21} = isRnd;
-    let Inst{6-5} = LHbits;
-    let Inst{4-0} = Rdd;
-    let Inst{20-16} = Rs;
-    let Inst{12-8} = Rt;
-}
-
-def M2_mpyd_hh_s0: T_M2_mpyd<0b11, 0, 0, 0>;
-def M2_mpyd_hl_s0: T_M2_mpyd<0b10, 0, 0, 0>;
-def M2_mpyd_lh_s0: T_M2_mpyd<0b01, 0, 0, 0>;
-def M2_mpyd_ll_s0: T_M2_mpyd<0b00, 0, 0, 0>;
-
-def M2_mpyd_hh_s1: T_M2_mpyd<0b11, 0, 1, 0>;
-def M2_mpyd_hl_s1: T_M2_mpyd<0b10, 0, 1, 0>;
-def M2_mpyd_lh_s1: T_M2_mpyd<0b01, 0, 1, 0>;
-def M2_mpyd_ll_s1: T_M2_mpyd<0b00, 0, 1, 0>;
-
-def M2_mpyd_rnd_hh_s0: T_M2_mpyd<0b11, 1, 0, 0>;
-def M2_mpyd_rnd_hl_s0: T_M2_mpyd<0b10, 1, 0, 0>;
-def M2_mpyd_rnd_lh_s0: T_M2_mpyd<0b01, 1, 0, 0>;
-def M2_mpyd_rnd_ll_s0: T_M2_mpyd<0b00, 1, 0, 0>;
-
-def M2_mpyd_rnd_hh_s1: T_M2_mpyd<0b11, 1, 1, 0>;
-def M2_mpyd_rnd_hl_s1: T_M2_mpyd<0b10, 1, 1, 0>;
-def M2_mpyd_rnd_lh_s1: T_M2_mpyd<0b01, 1, 1, 0>;
-def M2_mpyd_rnd_ll_s1: T_M2_mpyd<0b00, 1, 1, 0>;
-
-//Rdd=mpyu(Rs.[HL],Rt.[HL])[:<<1]
-def M2_mpyud_hh_s0: T_M2_mpyd<0b11, 0, 0, 1>;
-def M2_mpyud_hl_s0: T_M2_mpyd<0b10, 0, 0, 1>;
-def M2_mpyud_lh_s0: T_M2_mpyd<0b01, 0, 0, 1>;
-def M2_mpyud_ll_s0: T_M2_mpyd<0b00, 0, 0, 1>;
-
-def M2_mpyud_hh_s1: T_M2_mpyd<0b11, 0, 1, 1>;
-def M2_mpyud_hl_s1: T_M2_mpyd<0b10, 0, 1, 1>;
-def M2_mpyud_lh_s1: T_M2_mpyd<0b01, 0, 1, 1>;
-def M2_mpyud_ll_s1: T_M2_mpyd<0b00, 0, 1, 1>;
-
-//===----------------------------------------------------------------------===//
-// Template Class for xtype mpy:
-// Vector multiply
-// Complex multiply
-// multiply 32X32 and use full result
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0 in
-class T_XTYPE_mpy64 <string mnemonic, bits<3> MajOp, bits<3> MinOp,
-                     bit isSat, bit hasShift, bit isConj>
-   : MInst <(outs DoubleRegs:$Rdd),
-            (ins IntRegs:$Rs, IntRegs:$Rt),
-  "$Rdd = "#mnemonic#"($Rs, $Rt"#!if(isConj,"*)",")")
-                                #!if(hasShift,":<<1","")
-                                #!if(isSat,":sat",""),
-  [] > {
-    bits<5> Rdd;
-    bits<5> Rs;
-    bits<5> Rt;
-
-    let IClass = 0b1110;
-
-    let Inst{27-24} = 0b0101;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = Rs;
-    let Inst{12-8} = Rt;
-    let Inst{7-5} = MinOp;
-    let Inst{4-0} = Rdd;
-  }
-
-//===----------------------------------------------------------------------===//
-// Template Class for xtype mpy with accumulation into 64-bit:
-// Vector multiply
-// Complex multiply
-// multiply 32X32 and use full result
-//===----------------------------------------------------------------------===//
-class T_XTYPE_mpy64_acc <string op1, string op2, bits<3> MajOp, bits<3> MinOp,
-                         bit isSat, bit hasShift, bit isConj>
-  : MInst <(outs DoubleRegs:$Rxx),
-           (ins DoubleRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
-  "$Rxx "#op2#"= "#op1#"($Rs, $Rt"#!if(isConj,"*)",")")
-                                   #!if(hasShift,":<<1","")
-                                   #!if(isSat,":sat",""),
-
-  [] , "$dst2 = $Rxx" > {
-    bits<5> Rxx;
-    bits<5> Rs;
-    bits<5> Rt;
-
-    let IClass = 0b1110;
-
-    let Inst{27-24} = 0b0111;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = Rs;
-    let Inst{12-8} = Rt;
-    let Inst{7-5} = MinOp;
-    let Inst{4-0} = Rxx;
-  }
-
-// MPY - Multiply and use full result
-// Rdd = mpy[u](Rs,Rt)
-def M2_dpmpyss_s0 : T_XTYPE_mpy64 < "mpy", 0b000, 0b000, 0, 0, 0>;
-def M2_dpmpyuu_s0 : T_XTYPE_mpy64 < "mpyu", 0b010, 0b000, 0, 0, 0>;
-
-// Rxx[+-]= mpy[u](Rs,Rt)
-def M2_dpmpyss_acc_s0 : T_XTYPE_mpy64_acc < "mpy",  "+", 0b000, 0b000, 0, 0, 0>;
-def M2_dpmpyss_nac_s0 : T_XTYPE_mpy64_acc < "mpy",  "-", 0b001, 0b000, 0, 0, 0>;
-def M2_dpmpyuu_acc_s0 : T_XTYPE_mpy64_acc < "mpyu", "+", 0b010, 0b000, 0, 0, 0>;
-def M2_dpmpyuu_nac_s0 : T_XTYPE_mpy64_acc < "mpyu", "-", 0b011, 0b000, 0, 0, 0>;
-
-// Complex multiply real or imaginary
-// Rxx=cmpy[ir](Rs,Rt)
-def M2_cmpyi_s0 : T_XTYPE_mpy64 < "cmpyi", 0b000, 0b001, 0, 0, 0>;
-def M2_cmpyr_s0 : T_XTYPE_mpy64 < "cmpyr", 0b000, 0b010, 0, 0, 0>;
-
-// Rxx+=cmpy[ir](Rs,Rt)
-def M2_cmaci_s0 : T_XTYPE_mpy64_acc < "cmpyi", "+", 0b000, 0b001, 0, 0, 0>;
-def M2_cmacr_s0 : T_XTYPE_mpy64_acc < "cmpyr", "+", 0b000, 0b010, 0, 0, 0>;
-
-// Complex multiply
-// Rdd=cmpy(Rs,Rt)[:<<]:sat
-def M2_cmpys_s0 : T_XTYPE_mpy64 < "cmpy", 0b000, 0b110, 1, 0, 0>;
-def M2_cmpys_s1 : T_XTYPE_mpy64 < "cmpy", 0b100, 0b110, 1, 1, 0>;
-
-// Rdd=cmpy(Rs,Rt*)[:<<]:sat
-def M2_cmpysc_s0 : T_XTYPE_mpy64 < "cmpy", 0b010, 0b110, 1, 0, 1>;
-def M2_cmpysc_s1 : T_XTYPE_mpy64 < "cmpy", 0b110, 0b110, 1, 1, 1>;
-
-// Rxx[-+]=cmpy(Rs,Rt)[:<<1]:sat
-def M2_cmacs_s0  : T_XTYPE_mpy64_acc < "cmpy", "+", 0b000, 0b110, 1, 0, 0>;
-def M2_cnacs_s0  : T_XTYPE_mpy64_acc < "cmpy", "-", 0b000, 0b111, 1, 0, 0>;
-def M2_cmacs_s1  : T_XTYPE_mpy64_acc < "cmpy", "+", 0b100, 0b110, 1, 1, 0>;
-def M2_cnacs_s1  : T_XTYPE_mpy64_acc < "cmpy", "-", 0b100, 0b111, 1, 1, 0>;
-
-// Rxx[-+]=cmpy(Rs,Rt*)[:<<1]:sat
-def M2_cmacsc_s0 : T_XTYPE_mpy64_acc < "cmpy", "+", 0b010, 0b110, 1, 0, 1>;
-def M2_cnacsc_s0 : T_XTYPE_mpy64_acc < "cmpy", "-", 0b010, 0b111, 1, 0, 1>;
-def M2_cmacsc_s1 : T_XTYPE_mpy64_acc < "cmpy", "+", 0b110, 0b110, 1, 1, 1>;
-def M2_cnacsc_s1 : T_XTYPE_mpy64_acc < "cmpy", "-", 0b110, 0b111, 1, 1, 1>;
-
-// Vector multiply halfwords
-// Rdd=vmpyh(Rs,Rt)[:<<]:sat
-//let Defs = [USR_OVF] in {
-  def M2_vmpy2s_s1 : T_XTYPE_mpy64 < "vmpyh", 0b100, 0b101, 1, 1, 0>;
-  def M2_vmpy2s_s0 : T_XTYPE_mpy64 < "vmpyh", 0b000, 0b101, 1, 0, 0>;
-//}
-
-// Rxx+=vmpyh(Rs,Rt)[:<<1][:sat]
-def M2_vmac2     : T_XTYPE_mpy64_acc < "vmpyh", "+", 0b001, 0b001, 0, 0, 0>;
-def M2_vmac2s_s1 : T_XTYPE_mpy64_acc < "vmpyh", "+", 0b100, 0b101, 1, 1, 0>;
-def M2_vmac2s_s0 : T_XTYPE_mpy64_acc < "vmpyh", "+", 0b000, 0b101, 1, 0, 0>;
-
-//===----------------------------------------------------------------------===//
-// MTYPE/MPYH -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MTYPE/MPYS +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// MTYPE/MPYS -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MTYPE/VB +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// MTYPE/VB -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MTYPE/VH  +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// MTYPE/VH  -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ST +
-//===----------------------------------------------------------------------===//
-///
-// Store doubleword.
-//===----------------------------------------------------------------------===//
-// Template class for non-predicated post increment stores with immediate offset
-//===----------------------------------------------------------------------===//
-let isPredicable = 1, hasSideEffects = 0, addrMode = PostInc in
-class T_store_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
-                 bits<4> MajOp, bit isHalf >
-  : STInst <(outs IntRegs:$_dst_),
-            (ins IntRegs:$src1, ImmOp:$offset, RC:$src2),
-  mnemonic#"($src1++#$offset) = $src2"#!if(isHalf, ".h", ""),
-  [], "$src1 = $_dst_" >,
-  AddrModeRel {
-    bits<5> src1;
-    bits<5> src2;
-    bits<7> offset;
-    bits<4> offsetBits;
-
-    string ImmOpStr = !cast<string>(ImmOp);
-    let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
-                     !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
-                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
-                                      /* s4_0Imm */ offset{3-0})));
-    // Store upper-half and store doubleword cannot be NV.
-    let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, !if(isHalf,0,1));
-
-    let IClass = 0b1010;
-
-    let Inst{27-25} = 0b101;
-    let Inst{24-21} = MajOp;
-    let Inst{20-16} = src1;
-    let Inst{13}    = 0b0;
-    let Inst{12-8}  = src2;
-    let Inst{7}     = 0b0;
-    let Inst{6-3}   = offsetBits;
-    let Inst{1}     = 0b0;
-  }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated post increment stores with immediate offset
-//===----------------------------------------------------------------------===//
-let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc in
-class T_pstore_pi <string mnemonic, RegisterClass RC, Operand ImmOp,
-                   bits<4> MajOp, bit isHalf, bit isPredNot, bit isPredNew>
-  : STInst <(outs IntRegs:$_dst_),
-            (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$offset, RC:$src3),
-  !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-  ") ")#mnemonic#"($src2++#$offset) = $src3"#!if(isHalf, ".h", ""),
-  [], "$src2 = $_dst_" >,
-  AddrModeRel {
-    bits<2> src1;
-    bits<5> src2;
-    bits<7> offset;
-    bits<5> src3;
-    bits<4> offsetBits;
-
-    string ImmOpStr = !cast<string>(ImmOp);
-    let offsetBits = !if (!eq(ImmOpStr, "s4_3Imm"), offset{6-3},
-                     !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
-                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
-                                      /* s4_0Imm */ offset{3-0})));
-
-    // Store upper-half and store doubleword cannot be NV.
-    let isNVStorable = !if (!eq(ImmOpStr, "s4_3Imm"), 0, !if(isHalf,0,1));
-    let isPredicatedNew = isPredNew;
-    let isPredicatedFalse = isPredNot;
-
-    let IClass = 0b1010;
-
-    let Inst{27-25} = 0b101;
-    let Inst{24-21} = MajOp;
-    let Inst{20-16} = src2;
-    let Inst{13} = 0b1;
-    let Inst{12-8} = src3;
-    let Inst{7} = isPredNew;
-    let Inst{6-3} = offsetBits;
-    let Inst{2} = isPredNot;
-    let Inst{1-0} = src1;
-  }
-
-multiclass ST_PostInc<string mnemonic, string BaseOp, RegisterClass RC,
-                      Operand ImmOp, bits<4> MajOp, bit isHalf = 0 > {
-
-  let BaseOpcode = "POST_"#BaseOp in {
-    def S2_#NAME#_pi : T_store_pi <mnemonic, RC, ImmOp, MajOp, isHalf>;
-
-    // Predicated
-    def S2_p#NAME#t_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp, isHalf, 0, 0>;
-    def S2_p#NAME#f_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp, isHalf, 1, 0>;
-
-    // Predicated new
-    def S2_p#NAME#tnew_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp,
-                                          isHalf, 0, 1>;
-    def S2_p#NAME#fnew_pi : T_pstore_pi <mnemonic, RC, ImmOp, MajOp,
-                                          isHalf, 1, 1>;
-  }
-}
-
-let accessSize = ByteAccess in
-defm storerb: ST_PostInc <"memb", "STrib", IntRegs, s4_0Imm, 0b1000>;
-
-let accessSize = HalfWordAccess in
-defm storerh: ST_PostInc <"memh", "STrih", IntRegs, s4_1Imm, 0b1010>;
-
-let accessSize = WordAccess in
-defm storeri: ST_PostInc <"memw", "STriw", IntRegs, s4_2Imm, 0b1100>;
-
-let accessSize = DoubleWordAccess in
-defm storerd: ST_PostInc <"memd", "STrid", DoubleRegs, s4_3Imm, 0b1110>;
-
-let accessSize = HalfWordAccess, isNVStorable = 0 in
-defm storerf: ST_PostInc <"memh", "STrih_H", IntRegs, s4_1Imm, 0b1011, 1>;
-
-//===----------------------------------------------------------------------===//
-// Template class for post increment stores with register offset.
-//===----------------------------------------------------------------------===//
-class T_store_pr <string mnemonic, RegisterClass RC, bits<3> MajOp,
-                     MemAccessSize AccessSz, bit isHalf = 0>
-  : STInst <(outs IntRegs:$_dst_),
-            (ins IntRegs:$src1, ModRegs:$src2, RC:$src3),
-  mnemonic#"($src1++$src2) = $src3"#!if(isHalf, ".h", ""),
-  [], "$src1 = $_dst_" > {
-    bits<5> src1;
-    bits<1> src2;
-    bits<5> src3;
-    let accessSize = AccessSz;
-
-    // Store upper-half and store doubleword cannot be NV.
-    let isNVStorable = !if(!eq(mnemonic,"memd"), 0, !if(isHalf,0,1));
-
-    let IClass = 0b1010;
-
-    let Inst{27-24} = 0b1101;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = src1;
-    let Inst{13} = src2;
-    let Inst{12-8} = src3;
-    let Inst{7} = 0b0;
-  }
-
-def S2_storerb_pr : T_store_pr<"memb", IntRegs, 0b000, ByteAccess>;
-def S2_storerh_pr : T_store_pr<"memh", IntRegs, 0b010, HalfWordAccess>;
-def S2_storeri_pr : T_store_pr<"memw", IntRegs, 0b100, WordAccess>;
-def S2_storerd_pr : T_store_pr<"memd", DoubleRegs, 0b110, DoubleWordAccess>;
-def S2_storerf_pr : T_store_pr<"memh", IntRegs, 0b011, HalfWordAccess, 1>;
-
-let opExtendable = 1, isExtentSigned = 1, isPredicable = 1 in
-class T_store_io <string mnemonic, RegisterClass RC, Operand ImmOp,
-                  bits<3> MajOp, bit isH = 0>
-  : STInst <(outs),
-            (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
-  mnemonic#"($src1+#$src2) = $src3"#!if(isH,".h","")>,
-  AddrModeRel, ImmRegRel {
-    bits<5> src1;
-    bits<14> src2; // Actual address offset
-    bits<5> src3;
-    bits<11> offsetBits; // Represents offset encoding
-
-    string ImmOpStr = !cast<string>(ImmOp);
-
-    let opExtentBits = !if (!eq(ImmOpStr, "s11_3Ext"), 14,
-                       !if (!eq(ImmOpStr, "s11_2Ext"), 13,
-                       !if (!eq(ImmOpStr, "s11_1Ext"), 12,
-                                        /* s11_0Ext */ 11)));
-    let offsetBits = !if (!eq(ImmOpStr, "s11_3Ext"), src2{13-3},
-                     !if (!eq(ImmOpStr, "s11_2Ext"), src2{12-2},
-                     !if (!eq(ImmOpStr, "s11_1Ext"), src2{11-1},
-                                      /* s11_0Ext */ src2{10-0})));
-    // Store upper-half and store doubleword cannot be NV.
-    let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
-    let IClass = 0b1010;
-
-    let Inst{27} = 0b0;
-    let Inst{26-25} = offsetBits{10-9};
-    let Inst{24} = 0b1;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = src1;
-    let Inst{13} = offsetBits{8};
-    let Inst{12-8} = src3;
-    let Inst{7-0} = offsetBits{7-0};
-  }
-
-let opExtendable = 2, isPredicated = 1 in
-class T_pstore_io <string mnemonic, RegisterClass RC, Operand ImmOp,
-                   bits<3>MajOp, bit PredNot, bit isPredNew, bit isH = 0>
-  : STInst <(outs),
-            (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
-  !if(PredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-  ") ")#mnemonic#"($src2+#$src3) = $src4"#!if(isH,".h",""),
-  [],"",V2LDST_tc_st_SLOT01 >,
-   AddrModeRel, ImmRegRel {
-    bits<2> src1;
-    bits<5> src2;
-    bits<9> src3; // Actual address offset
-    bits<5> src4;
-    bits<6> offsetBits; // Represents offset encoding
-
-    let isPredicatedNew = isPredNew;
-    let isPredicatedFalse = PredNot;
-
-    string ImmOpStr = !cast<string>(ImmOp);
-    let opExtentBits = !if (!eq(ImmOpStr, "u6_3Ext"), 9,
-                       !if (!eq(ImmOpStr, "u6_2Ext"), 8,
-                       !if (!eq(ImmOpStr, "u6_1Ext"), 7,
-                                        /* u6_0Ext */ 6)));
-    let offsetBits = !if (!eq(ImmOpStr, "u6_3Ext"), src3{8-3},
-                     !if (!eq(ImmOpStr, "u6_2Ext"), src3{7-2},
-                     !if (!eq(ImmOpStr, "u6_1Ext"), src3{6-1},
-                                      /* u6_0Ext */ src3{5-0})));
-    // Store upper-half and store doubleword cannot be NV.
-    let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
-
-    let IClass = 0b0100;
-
-    let Inst{27} = 0b0;
-    let Inst{26} = PredNot;
-    let Inst{25} = isPredNew;
-    let Inst{24} = 0b0;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = src2;
-    let Inst{13} = offsetBits{5};
-    let Inst{12-8} = src4;
-    let Inst{7-3} = offsetBits{4-0};
-    let Inst{1-0} = src1;
-  }
-
-let isExtendable = 1, hasSideEffects = 0 in
-multiclass ST_Idxd<string mnemonic, string CextOp, RegisterClass RC,
-                 Operand ImmOp, Operand predImmOp, bits<3> MajOp, bit isH = 0> {
-  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
-    def S2_#NAME#_io : T_store_io <mnemonic, RC, ImmOp, MajOp, isH>;
-
-    // Predicated
-    def S2_p#NAME#t_io : T_pstore_io<mnemonic, RC, predImmOp, MajOp, 0, 0, isH>;
-    def S2_p#NAME#f_io : T_pstore_io<mnemonic, RC, predImmOp, MajOp, 1, 0, isH>;
-
-    // Predicated new
-    def S4_p#NAME#tnew_io : T_pstore_io <mnemonic, RC, predImmOp,
-                                         MajOp, 0, 1, isH>;
-    def S4_p#NAME#fnew_io : T_pstore_io <mnemonic, RC, predImmOp,
-                                         MajOp, 1, 1, isH>;
-  }
-}
-
-let addrMode = BaseImmOffset, InputType = "imm" in {
-  let accessSize = ByteAccess in
-    defm storerb: ST_Idxd < "memb", "STrib", IntRegs, s11_0Ext, u6_0Ext, 0b000>;
-
-  let accessSize = HalfWordAccess, opExtentAlign = 1 in
-    defm storerh: ST_Idxd < "memh", "STrih", IntRegs, s11_1Ext, u6_1Ext, 0b010>;
-
-  let accessSize = WordAccess, opExtentAlign = 2 in
-    defm storeri: ST_Idxd < "memw", "STriw", IntRegs, s11_2Ext, u6_2Ext, 0b100>;
-
-  let accessSize = DoubleWordAccess, isNVStorable = 0, opExtentAlign = 3 in
-    defm storerd: ST_Idxd < "memd", "STrid", DoubleRegs, s11_3Ext,
-                            u6_3Ext, 0b110>;
-
-  let accessSize = HalfWordAccess, opExtentAlign = 1 in
-    defm storerf: ST_Idxd < "memh", "STrif", IntRegs, s11_1Ext,
-                            u6_1Ext, 0b011, 1>;
-}
-
-// Store predicate.
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
-    isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
-def STriw_pred : STInst<(outs),
-      (ins IntRegs:$addr, s11_2Ext:$off, PredRegs:$src1),
-      ".error \"should not emit\"", []>;
-// Store modifier.
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
-    isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
-def STriw_mod : STInst<(outs),
-      (ins IntRegs:$addr, s11_2Ext:$off, ModRegs:$src1),
-      ".error \"should not emit\"", []>;
-
-// S2_allocframe: Allocate stack frame.
-let Defs = [R29, R30], Uses = [R29, R31, R30],
-    hasSideEffects = 0, accessSize = DoubleWordAccess in
-def S2_allocframe: ST0Inst <
-  (outs), (ins u11_3Imm:$u11_3),
-  "allocframe(#$u11_3)" > {
-    bits<14> u11_3;
-
-    let IClass = 0b1010;
-    let Inst{27-16} = 0b000010011101;
-    let Inst{13-11} = 0b000;
-    let Inst{10-0} = u11_3{13-3};
-  }
-
-// S2_storer[bhwdf]_pci: Store byte/half/word/double.
-// S2_storer[bhwdf]_pci -> S2_storerbnew_pci
-let Uses = [CS], addrMode = PostInc in
-class T_store_pci <string mnemonic, RegisterClass RC,
-                         Operand Imm, bits<4>MajOp,
-                         MemAccessSize AlignSize, string RegSrc = "Rt">
-  : STInst <(outs IntRegs:$_dst_),
-  (ins IntRegs:$Rz, Imm:$offset, ModRegs:$Mu, RC:$Rt),
-  #mnemonic#"($Rz ++ #$offset:circ($Mu)) = $"#RegSrc#"",
-  [] ,
-  "$Rz = $_dst_" > {
-    bits<5> Rz;
-    bits<7> offset;
-    bits<1> Mu;
-    bits<5> Rt;
-    let accessSize = AlignSize;
-    let isNVStorable = !if(!eq(mnemonic,"memd"), 0,
-                       !if(!eq(RegSrc,"Rt.h"), 0, 1));
-
-    let IClass = 0b1010;
-    let Inst{27-25} = 0b100;
-    let Inst{24-21} = MajOp;
-    let Inst{20-16} = Rz;
-    let Inst{13} = Mu;
-    let Inst{12-8} = Rt;
-    let Inst{7} = 0b0;
-    let Inst{6-3} =
-      !if (!eq(!cast<string>(AlignSize), "DoubleWordAccess"), offset{6-3},
-      !if (!eq(!cast<string>(AlignSize), "WordAccess"),       offset{5-2},
-      !if (!eq(!cast<string>(AlignSize), "HalfWordAccess"),   offset{4-1},
-                                       /* ByteAccess */       offset{3-0})));
-    let Inst{1} = 0b0;
-  }
-
-def S2_storerb_pci : T_store_pci<"memb", IntRegs, s4_0Imm, 0b1000,
-                                 ByteAccess>;
-def S2_storerh_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1010,
-                                 HalfWordAccess>;
-def S2_storerf_pci : T_store_pci<"memh", IntRegs, s4_1Imm, 0b1011,
-                                 HalfWordAccess, "Rt.h">;
-def S2_storeri_pci : T_store_pci<"memw", IntRegs, s4_2Imm, 0b1100,
-                                 WordAccess>;
-def S2_storerd_pci : T_store_pci<"memd", DoubleRegs, s4_3Imm, 0b1110,
-                                 DoubleWordAccess>;
-
-let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 4,
-    addrMode = PostInc in
-class T_storenew_pci <string mnemonic, Operand Imm,
-                             bits<2>MajOp, MemAccessSize AlignSize>
-  : NVInst < (outs IntRegs:$_dst_),
-  (ins IntRegs:$Rz, Imm:$offset, ModRegs:$Mu, IntRegs:$Nt),
-  #mnemonic#"($Rz ++ #$offset:circ($Mu)) = $Nt.new",
-  [],
-  "$Rz = $_dst_"> {
-    bits<5> Rz;
-    bits<6> offset;
-    bits<1> Mu;
-    bits<3> Nt;
-
-    let accessSize = AlignSize;
-
-    let IClass = 0b1010;
-    let Inst{27-21} = 0b1001101;
-    let Inst{20-16} = Rz;
-    let Inst{13} = Mu;
-    let Inst{12-11} = MajOp;
-    let Inst{10-8} = Nt;
-    let Inst{7} = 0b0;
-    let Inst{6-3} =
-      !if (!eq(!cast<string>(AlignSize), "WordAccess"),     offset{5-2},
-      !if (!eq(!cast<string>(AlignSize), "HalfWordAccess"), offset{4-1},
-                                       /* ByteAccess */     offset{3-0}));
-    let Inst{1} = 0b0;
-  }
-
-def S2_storerbnew_pci : T_storenew_pci <"memb", s4_0Imm, 0b00, ByteAccess>;
-def S2_storerhnew_pci : T_storenew_pci <"memh", s4_1Imm, 0b01, HalfWordAccess>;
-def S2_storerinew_pci : T_storenew_pci <"memw", s4_2Imm, 0b10, WordAccess>;
-
-//===----------------------------------------------------------------------===//
-// Circular stores with auto-increment register
-//===----------------------------------------------------------------------===//
-let Uses = [CS], addrMode = PostInc in
-class T_store_pcr <string mnemonic, RegisterClass RC, bits<4>MajOp,
-                               MemAccessSize AlignSize, string RegSrc = "Rt">
-  : STInst <(outs IntRegs:$_dst_),
-  (ins IntRegs:$Rz, ModRegs:$Mu, RC:$Rt),
-  #mnemonic#"($Rz ++ I:circ($Mu)) = $"#RegSrc#"",
-  [],
-  "$Rz = $_dst_" > {
-    bits<5> Rz;
-    bits<1> Mu;
-    bits<5> Rt;
-
-    let accessSize = AlignSize;
-    let isNVStorable = !if(!eq(mnemonic,"memd"), 0,
-                       !if(!eq(RegSrc,"Rt.h"), 0, 1));
-
-    let IClass = 0b1010;
-    let Inst{27-25} = 0b100;
-    let Inst{24-21} = MajOp;
-    let Inst{20-16} = Rz;
-    let Inst{13} = Mu;
-    let Inst{12-8} = Rt;
-    let Inst{7} = 0b0;
-    let Inst{1} = 0b1;
-  }
-
-def S2_storerb_pcr : T_store_pcr<"memb", IntRegs, 0b1000, ByteAccess>;
-def S2_storerh_pcr : T_store_pcr<"memh", IntRegs, 0b1010, HalfWordAccess>;
-def S2_storeri_pcr : T_store_pcr<"memw", IntRegs, 0b1100, WordAccess>;
-def S2_storerd_pcr : T_store_pcr<"memd", DoubleRegs, 0b1110, DoubleWordAccess>;
-def S2_storerf_pcr : T_store_pcr<"memh", IntRegs, 0b1011,
-                                 HalfWordAccess, "Rt.h">;
-
-//===----------------------------------------------------------------------===//
-// Circular .new stores with auto-increment register
-//===----------------------------------------------------------------------===//
-let Uses = [CS], isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3,
-    addrMode = PostInc in
-class T_storenew_pcr <string mnemonic, bits<2>MajOp,
-                                   MemAccessSize AlignSize>
-  : NVInst <(outs IntRegs:$_dst_),
-  (ins IntRegs:$Rz, ModRegs:$Mu, IntRegs:$Nt),
-  #mnemonic#"($Rz ++ I:circ($Mu)) = $Nt.new" ,
-  [] ,
-  "$Rz = $_dst_"> {
-    bits<5> Rz;
-    bits<1> Mu;
-    bits<3> Nt;
-
-    let accessSize = AlignSize;
-
-    let IClass = 0b1010;
-    let Inst{27-21} = 0b1001101;
-    let Inst{20-16} = Rz;
-    let Inst{13} = Mu;
-    let Inst{12-11} = MajOp;
-    let Inst{10-8} = Nt;
-    let Inst{7} = 0b0;
-    let Inst{1} = 0b1;
-  }
-
-def S2_storerbnew_pcr : T_storenew_pcr <"memb", 0b00, ByteAccess>;
-def S2_storerhnew_pcr : T_storenew_pcr <"memh", 0b01, HalfWordAccess>;
-def S2_storerinew_pcr : T_storenew_pcr <"memw", 0b10, WordAccess>;
-
-//===----------------------------------------------------------------------===//
-// Bit-reversed stores with auto-increment register
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, addrMode = PostInc in
-class T_store_pbr<string mnemonic, RegisterClass RC,
-                            MemAccessSize addrSize, bits<3> majOp,
-                            bit isHalf = 0>
-  : STInst
-    <(outs IntRegs:$_dst_),
-     (ins IntRegs:$Rz, ModRegs:$Mu, RC:$src),
-     #mnemonic#"($Rz ++ $Mu:brev) = $src"#!if (!eq(isHalf, 1), ".h", ""),
-     [], "$Rz = $_dst_" > {
-
-      let accessSize = addrSize;
-
-      bits<5> Rz;
-      bits<1> Mu;
-      bits<5> src;
-
-      let IClass = 0b1010;
-
-      let Inst{27-24} = 0b1111;
-      let Inst{23-21} = majOp;
-      let Inst{7} = 0b0;
-      let Inst{20-16} = Rz;
-      let Inst{13} = Mu;
-      let Inst{12-8} = src;
-    }
-
-let isNVStorable = 1 in {
-  let BaseOpcode = "S2_storerb_pbr" in
-  def S2_storerb_pbr : T_store_pbr<"memb", IntRegs, ByteAccess,
-                                             0b000>, NewValueRel;
-  let BaseOpcode = "S2_storerh_pbr" in
-  def S2_storerh_pbr : T_store_pbr<"memh", IntRegs, HalfWordAccess,
-                                             0b010>, NewValueRel;
-  let BaseOpcode = "S2_storeri_pbr" in
-  def S2_storeri_pbr : T_store_pbr<"memw", IntRegs, WordAccess,
-                                             0b100>, NewValueRel;
-}
-
-def S2_storerf_pbr : T_store_pbr<"memh", IntRegs, HalfWordAccess, 0b011, 1>;
-def S2_storerd_pbr : T_store_pbr<"memd", DoubleRegs, DoubleWordAccess, 0b110>;
-
-//===----------------------------------------------------------------------===//
-// Bit-reversed .new stores with auto-increment register
-//===----------------------------------------------------------------------===//
-let isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3,
-    hasSideEffects = 0, addrMode = PostInc in
-class T_storenew_pbr<string mnemonic, MemAccessSize addrSize, bits<2> majOp>
-  : NVInst <(outs IntRegs:$_dst_),
-            (ins IntRegs:$Rz, ModRegs:$Mu, IntRegs:$Nt),
-     #mnemonic#"($Rz ++ $Mu:brev) = $Nt.new", [],
-     "$Rz = $_dst_">, NewValueRel {
-    let accessSize = addrSize;
-    bits<5> Rz;
-    bits<1> Mu;
-    bits<3> Nt;
-
-    let IClass = 0b1010;
-
-    let Inst{27-21} = 0b1111101;
-    let Inst{12-11} = majOp;
-    let Inst{7} = 0b0;
-    let Inst{20-16} = Rz;
-    let Inst{13} = Mu;
-    let Inst{10-8} = Nt;
-  }
-
-let BaseOpcode = "S2_storerb_pbr" in
-def S2_storerbnew_pbr : T_storenew_pbr<"memb", ByteAccess, 0b00>;
-
-let BaseOpcode = "S2_storerh_pbr" in
-def S2_storerhnew_pbr : T_storenew_pbr<"memh", HalfWordAccess, 0b01>;
-
-let BaseOpcode = "S2_storeri_pbr" in
-def S2_storerinew_pbr : T_storenew_pbr<"memw", WordAccess, 0b10>;
-
-//===----------------------------------------------------------------------===//
-// ST -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Template class for S_2op instructions.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0 in
-class T_S2op_1 <string mnemonic, bits<4> RegTyBits, RegisterClass RCOut,
-                RegisterClass RCIn, bits<2> MajOp, bits<3> MinOp, bit isSat>
-  : SInst <(outs RCOut:$dst), (ins RCIn:$src),
-  "$dst = "#mnemonic#"($src)"#!if(isSat, ":sat", ""),
-  [], "", S_2op_tc_1_SLOT23 > {
-    bits<5> dst;
-    bits<5> src;
-
-    let IClass = 0b1000;
-
-    let Inst{27-24} = RegTyBits;
-    let Inst{23-22} = MajOp;
-    let Inst{21} = 0b0;
-    let Inst{20-16} = src;
-    let Inst{7-5} = MinOp;
-    let Inst{4-0} = dst;
-  }
-
-class T_S2op_1_di <string mnemonic, bits<2> MajOp, bits<3> MinOp>
-  : T_S2op_1 <mnemonic, 0b0100, DoubleRegs, IntRegs, MajOp, MinOp, 0>;
-
-let hasNewValue = 1 in
-class T_S2op_1_id <string mnemonic, bits<2> MajOp, bits<3> MinOp, bit isSat = 0>
-  : T_S2op_1 <mnemonic, 0b1000, IntRegs, DoubleRegs, MajOp, MinOp, isSat>;
-
-let hasNewValue = 1 in
-class T_S2op_1_ii <string mnemonic, bits<2> MajOp, bits<3> MinOp, bit isSat = 0>
-  : T_S2op_1 <mnemonic, 0b1100, IntRegs, IntRegs, MajOp, MinOp, isSat>;
-
-// Vector sign/zero extend
-let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
-  def S2_vsxtbh : T_S2op_1_di <"vsxtbh", 0b00, 0b000>;
-  def S2_vsxthw : T_S2op_1_di <"vsxthw", 0b00, 0b100>;
-  def S2_vzxtbh : T_S2op_1_di <"vzxtbh", 0b00, 0b010>;
-  def S2_vzxthw : T_S2op_1_di <"vzxthw", 0b00, 0b110>;
-}
-
-// Vector splat bytes/halfwords
-let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
-  def S2_vsplatrb : T_S2op_1_ii <"vsplatb", 0b01, 0b111>;
-  def S2_vsplatrh : T_S2op_1_di <"vsplath", 0b01, 0b010>;
-}
-
-// Sign extend word to doubleword
-def A2_sxtw   : T_S2op_1_di <"sxtw", 0b01, 0b000>;
-
-// Vector saturate and pack
-let Defs = [USR_OVF] in {
-  def S2_svsathb  : T_S2op_1_ii <"vsathb", 0b10, 0b000>;
-  def S2_svsathub : T_S2op_1_ii <"vsathub", 0b10, 0b010>;
-  def S2_vsathb   : T_S2op_1_id <"vsathb", 0b00, 0b110>;
-  def S2_vsathub  : T_S2op_1_id <"vsathub", 0b00, 0b000>;
-  def S2_vsatwh   : T_S2op_1_id <"vsatwh", 0b00, 0b010>;
-  def S2_vsatwuh  : T_S2op_1_id <"vsatwuh", 0b00, 0b100>;
-}
-
-// Vector truncate
-def S2_vtrunohb : T_S2op_1_id <"vtrunohb", 0b10, 0b000>;
-def S2_vtrunehb : T_S2op_1_id <"vtrunehb", 0b10, 0b010>;
-
-// Swizzle the bytes of a word
-def A2_swiz : T_S2op_1_ii <"swiz", 0b10, 0b111>;
-
-// Saturate
-let Defs = [USR_OVF] in {
-  def A2_sat   : T_S2op_1_id <"sat", 0b11, 0b000>;
-  def A2_satb  : T_S2op_1_ii <"satb", 0b11, 0b111>;
-  def A2_satub : T_S2op_1_ii <"satub", 0b11, 0b110>;
-  def A2_sath  : T_S2op_1_ii <"sath", 0b11, 0b100>;
-  def A2_satuh : T_S2op_1_ii <"satuh", 0b11, 0b101>;
-  def A2_roundsat : T_S2op_1_id <"round", 0b11, 0b001, 0b1>;
-}
-
-let Itinerary = S_2op_tc_2_SLOT23 in {
-  // Vector round and pack
-  def S2_vrndpackwh   : T_S2op_1_id <"vrndwh", 0b10, 0b100>;
-
-  let Defs = [USR_OVF] in
-  def S2_vrndpackwhs  : T_S2op_1_id <"vrndwh", 0b10, 0b110, 1>;
-
-  // Bit reverse
-  def S2_brev : T_S2op_1_ii <"brev", 0b01, 0b110>;
-
-  // Absolute value word
-  def A2_abs    : T_S2op_1_ii <"abs", 0b10, 0b100>;
-
-  let Defs = [USR_OVF] in
-  def A2_abssat : T_S2op_1_ii <"abs", 0b10, 0b101, 1>;
-
-  // Negate with saturation
-  let Defs = [USR_OVF] in
-  def A2_negsat : T_S2op_1_ii <"neg", 0b10, 0b110, 1>;
-}
-
-class T_S2op_2 <string mnemonic, bits<4> RegTyBits, RegisterClass RCOut,
-                RegisterClass RCIn, bits<3> MajOp, bits<3> MinOp,
-                bit isSat, bit isRnd, list<dag> pattern = []>
-  : SInst <(outs RCOut:$dst),
-  (ins RCIn:$src, u5_0Imm:$u5),
-  "$dst = "#mnemonic#"($src, #$u5)"#!if(isSat, ":sat", "")
-                                   #!if(isRnd, ":rnd", ""),
-  pattern, "", S_2op_tc_2_SLOT23> {
-    bits<5> dst;
-    bits<5> src;
-    bits<5> u5;
-
-    let IClass = 0b1000;
-
-    let Inst{27-24} = RegTyBits;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = src;
-    let Inst{13} = 0b0;
-    let Inst{12-8} = u5;
-    let Inst{7-5} = MinOp;
-    let Inst{4-0} = dst;
-  }
-
-class T_S2op_2_di <string mnemonic, bits<3> MajOp, bits<3> MinOp>
-  : T_S2op_2 <mnemonic, 0b1000, DoubleRegs, IntRegs, MajOp, MinOp, 0, 0>;
-
-let hasNewValue = 1 in
-class T_S2op_2_id <string mnemonic, bits<3> MajOp, bits<3> MinOp>
-  : T_S2op_2 <mnemonic, 0b1000, IntRegs, DoubleRegs, MajOp, MinOp, 0, 0>;
-
-let hasNewValue = 1 in
-class T_S2op_2_ii <string mnemonic, bits<3> MajOp, bits<3> MinOp,
-                   bit isSat = 0, bit isRnd = 0, list<dag> pattern = []>
-  : T_S2op_2 <mnemonic, 0b1100, IntRegs, IntRegs, MajOp, MinOp,
-              isSat, isRnd, pattern>;
-
-class T_S2op_shift <string mnemonic, bits<3> MajOp, bits<3> MinOp, SDNode OpNd>
-  : T_S2op_2_ii <mnemonic, MajOp, MinOp, 0, 0, []>;
-
-// Vector arithmetic shift right by immediate with truncate and pack
-def S2_asr_i_svw_trun : T_S2op_2_id <"vasrw", 0b110, 0b010>;
-
-// Arithmetic/logical shift right/left by immediate
-let Itinerary = S_2op_tc_1_SLOT23 in {
-  def S2_asr_i_r : T_S2op_shift <"asr", 0b000, 0b000, sra>;
-  def S2_lsr_i_r : T_S2op_shift <"lsr", 0b000, 0b001, srl>;
-  def S2_asl_i_r : T_S2op_shift <"asl", 0b000, 0b010, shl>;
-}
-
-// Shift left by immediate with saturation
-let Defs = [USR_OVF] in
-def S2_asl_i_r_sat : T_S2op_2_ii <"asl", 0b010, 0b010, 1>;
-
-// Shift right with round
-def S2_asr_i_r_rnd : T_S2op_2_ii <"asr", 0b010, 0b000, 0, 1>;
-
-let isAsmParserOnly = 1 in
-def S2_asr_i_r_rnd_goodsyntax
-  : SInst <(outs IntRegs:$dst), (ins  IntRegs:$src, u5_0Imm:$u5),
-  "$dst = asrrnd($src, #$u5)",
-  [], "", S_2op_tc_1_SLOT23>;
-
-let isAsmParserOnly = 1 in
-def A2_not: ALU32_rr<(outs IntRegs:$dst),(ins IntRegs:$src),
-  "$dst = not($src)">;
-
-class T_S2op_3<string opc, bits<2>MajOp, bits<3>minOp, bits<1> sat = 0>
-  : SInst<(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss),
-           "$Rdd = "#opc#"($Rss)"#!if(!eq(sat, 1),":sat","")> {
-  bits<5> Rss;
-  bits<5> Rdd;
-  let IClass = 0b1000;
-  let Inst{27-24} = 0;
-  let Inst{23-22} = MajOp;
-  let Inst{20-16} = Rss;
-  let Inst{7-5} = minOp;
-  let Inst{4-0} = Rdd;
-}
-
-def A2_absp : T_S2op_3 <"abs", 0b10, 0b110>;
-def A2_negp : T_S2op_3 <"neg", 0b10, 0b101>;
-def A2_notp : T_S2op_3 <"not", 0b10, 0b100>;
-
-// Innterleave/deinterleave
-def S2_interleave   : T_S2op_3 <"interleave",   0b11, 0b101>;
-def S2_deinterleave : T_S2op_3 <"deinterleave", 0b11, 0b100>;
-
-// Vector Complex conjugate
-def A2_vconj : T_S2op_3 <"vconj", 0b10, 0b111, 1>;
-
-// Vector saturate without pack
-def S2_vsathb_nopack  : T_S2op_3 <"vsathb",  0b00, 0b111>;
-def S2_vsathub_nopack : T_S2op_3 <"vsathub", 0b00, 0b100>;
-def S2_vsatwh_nopack  : T_S2op_3 <"vsatwh",  0b00, 0b110>;
-def S2_vsatwuh_nopack : T_S2op_3 <"vsatwuh", 0b00, 0b101>;
-
-// Vector absolute value halfwords with and without saturation
-// Rdd64=vabsh(Rss64)[:sat]
-def A2_vabsh    : T_S2op_3 <"vabsh", 0b01, 0b100>;
-def A2_vabshsat : T_S2op_3 <"vabsh", 0b01, 0b101, 1>;
-
-// Vector absolute value words with and without saturation
-def A2_vabsw    : T_S2op_3 <"vabsw", 0b01, 0b110>;
-def A2_vabswsat : T_S2op_3 <"vabsw", 0b01, 0b111, 1>;
-
-//===----------------------------------------------------------------------===//
-// STYPE/BIT +
-//===----------------------------------------------------------------------===//
-// Bit count
-
-let hasSideEffects = 0, hasNewValue = 1 in
-class T_COUNT_LEADING<string MnOp, bits<3> MajOp, bits<3> MinOp, bit Is32,
-                dag Out, dag Inp>
-    : SInst<Out, Inp, "$Rd = "#MnOp#"($Rs)", [], "", S_2op_tc_1_SLOT23> {
-  bits<5> Rs;
-  bits<5> Rd;
-  let IClass = 0b1000;
-  let Inst{27} = 0b1;
-  let Inst{26} = Is32;
-  let Inst{25-24} = 0b00;
-  let Inst{23-21} = MajOp;
-  let Inst{20-16} = Rs;
-  let Inst{7-5} = MinOp;
-  let Inst{4-0} = Rd;
-}
-
-class T_COUNT_LEADING_32<string MnOp, bits<3> MajOp, bits<3> MinOp>
-    : T_COUNT_LEADING<MnOp, MajOp, MinOp, 0b1,
-                      (outs IntRegs:$Rd), (ins IntRegs:$Rs)>;
-
-class T_COUNT_LEADING_64<string MnOp, bits<3> MajOp, bits<3> MinOp>
-    : T_COUNT_LEADING<MnOp, MajOp, MinOp, 0b0,
-                      (outs IntRegs:$Rd), (ins DoubleRegs:$Rs)>;
-
-def S2_cl0     : T_COUNT_LEADING_32<"cl0",     0b000, 0b101>;
-def S2_cl1     : T_COUNT_LEADING_32<"cl1",     0b000, 0b110>;
-def S2_ct0     : T_COUNT_LEADING_32<"ct0",     0b010, 0b100>;
-def S2_ct1     : T_COUNT_LEADING_32<"ct1",     0b010, 0b101>;
-def S2_cl0p    : T_COUNT_LEADING_64<"cl0",     0b010, 0b010>;
-def S2_cl1p    : T_COUNT_LEADING_64<"cl1",     0b010, 0b100>;
-def S2_clb     : T_COUNT_LEADING_32<"clb",     0b000, 0b100>;
-def S2_clbp    : T_COUNT_LEADING_64<"clb",     0b010, 0b000>;
-def S2_clbnorm : T_COUNT_LEADING_32<"normamt", 0b000, 0b111>;
-
-// The 64-bit counts leading/trailing are defined in HexagonInstrInfoV4.td.
-
-// Bit set/clear/toggle
-
-let hasSideEffects = 0, hasNewValue = 1 in
-class T_SCT_BIT_IMM<string MnOp, bits<3> MinOp>
-    : SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, u5_0Imm:$u5),
-            "$Rd = "#MnOp#"($Rs, #$u5)", [], "", S_2op_tc_1_SLOT23> {
-  bits<5> Rd;
-  bits<5> Rs;
-  bits<5> u5;
-  let IClass = 0b1000;
-  let Inst{27-21} = 0b1100110;
-  let Inst{20-16} = Rs;
-  let Inst{13} = 0b0;
-  let Inst{12-8} = u5;
-  let Inst{7-5} = MinOp;
-  let Inst{4-0} = Rd;
-}
-
-let hasSideEffects = 0, hasNewValue = 1 in
-class T_SCT_BIT_REG<string MnOp, bits<2> MinOp>
-    : SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
-            "$Rd = "#MnOp#"($Rs, $Rt)", [], "", S_3op_tc_1_SLOT23> {
-  bits<5> Rd;
-  bits<5> Rs;
-  bits<5> Rt;
-  let IClass = 0b1100;
-  let Inst{27-22} = 0b011010;
-  let Inst{20-16} = Rs;
-  let Inst{12-8} = Rt;
-  let Inst{7-6} = MinOp;
-  let Inst{4-0} = Rd;
-}
-
-def S2_clrbit_i    : T_SCT_BIT_IMM<"clrbit",    0b001>;
-def S2_setbit_i    : T_SCT_BIT_IMM<"setbit",    0b000>;
-def S2_togglebit_i : T_SCT_BIT_IMM<"togglebit", 0b010>;
-def S2_clrbit_r    : T_SCT_BIT_REG<"clrbit",    0b01>;
-def S2_setbit_r    : T_SCT_BIT_REG<"setbit",    0b00>;
-def S2_togglebit_r : T_SCT_BIT_REG<"togglebit", 0b10>;
-
-// Bit test
-
-let hasSideEffects = 0 in
-class T_TEST_BIT_IMM<string MnOp, bits<3> MajOp>
-    : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, u5_0Imm:$u5),
-            "$Pd = "#MnOp#"($Rs, #$u5)",
-            [], "", S_2op_tc_2early_SLOT23> {
-  bits<2> Pd;
-  bits<5> Rs;
-  bits<5> u5;
-  let IClass = 0b1000;
-  let Inst{27-24} = 0b0101;
-  let Inst{23-21} = MajOp;
-  let Inst{20-16} = Rs;
-  let Inst{13} = 0;
-  let Inst{12-8} = u5;
-  let Inst{1-0} = Pd;
-}
-
-let hasSideEffects = 0 in
-class T_TEST_BIT_REG<string MnOp, bit IsNeg>
-    : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
-            "$Pd = "#MnOp#"($Rs, $Rt)",
-            [], "", S_3op_tc_2early_SLOT23> {
-  bits<2> Pd;
-  bits<5> Rs;
-  bits<5> Rt;
-  let IClass = 0b1100;
-  let Inst{27-22} = 0b011100;
-  let Inst{21} = IsNeg;
-  let Inst{20-16} = Rs;
-  let Inst{12-8} = Rt;
-  let Inst{1-0} = Pd;
-}
-
-def S2_tstbit_i : T_TEST_BIT_IMM<"tstbit", 0b000>;
-def S2_tstbit_r : T_TEST_BIT_REG<"tstbit", 0>;
-
-let hasSideEffects = 0 in
-class T_TEST_BITS_IMM<string MnOp, bits<2> MajOp, bit IsNeg>
-    : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, u6_0Imm:$u6),
-            "$Pd = "#MnOp#"($Rs, #$u6)",
-            [], "", S_2op_tc_2early_SLOT23> {
-  bits<2> Pd;
-  bits<5> Rs;
-  bits<6> u6;
-  let IClass = 0b1000;
-  let Inst{27-24} = 0b0101;
-  let Inst{23-22} = MajOp;
-  let Inst{21} = IsNeg;
-  let Inst{20-16} = Rs;
-  let Inst{13-8} = u6;
-  let Inst{1-0} = Pd;
-}
-
-let hasSideEffects = 0 in
-class T_TEST_BITS_REG<string MnOp, bits<2> MajOp, bit IsNeg>
-    : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
-            "$Pd = "#MnOp#"($Rs, $Rt)",
-            [], "", S_3op_tc_2early_SLOT23> {
-  bits<2> Pd;
-  bits<5> Rs;
-  bits<5> Rt;
-  let IClass = 0b1100;
-  let Inst{27-24} = 0b0111;
-  let Inst{23-22} = MajOp;
-  let Inst{21} = IsNeg;
-  let Inst{20-16} = Rs;
-  let Inst{12-8} = Rt;
-  let Inst{1-0} = Pd;
-}
-
-def C2_bitsclri : T_TEST_BITS_IMM<"bitsclr", 0b10, 0>;
-def C2_bitsclr  : T_TEST_BITS_REG<"bitsclr", 0b10, 0>;
-def C2_bitsset  : T_TEST_BITS_REG<"bitsset", 0b01, 0>;
-
-//===----------------------------------------------------------------------===//
-// STYPE/BIT -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// STYPE/COMPLEX +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// STYPE/COMPLEX -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// XTYPE/PERM +
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// XTYPE/PERM -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// STYPE/PRED +
-//===----------------------------------------------------------------------===//
-
-// Predicate transfer.
-let hasSideEffects = 0, hasNewValue = 1 in
-def C2_tfrpr : SInst<(outs IntRegs:$Rd), (ins PredRegs:$Ps),
-      "$Rd = $Ps", [], "", S_2op_tc_1_SLOT23> {
-  bits<5> Rd;
-  bits<2> Ps;
-
-  let IClass = 0b1000;
-  let Inst{27-24} = 0b1001;
-  let Inst{22} = 0b1;
-  let Inst{17-16} = Ps;
-  let Inst{4-0} = Rd;
-}
-
-// Transfer general register to predicate.
-let hasSideEffects = 0 in
-def C2_tfrrp: SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs),
-      "$Pd = $Rs", [], "", S_2op_tc_2early_SLOT23> {
-  bits<2> Pd;
-  bits<5> Rs;
-
-  let IClass = 0b1000;
-  let Inst{27-21} = 0b0101010;
-  let Inst{20-16} = Rs;
-  let Inst{1-0} = Pd;
-}
-
-let hasSideEffects = 0, isCodeGenOnly = 1 in
-def C2_pxfer_map: SInst<(outs PredRegs:$dst), (ins PredRegs:$src),
-     "$dst = $src">;
-
-//===----------------------------------------------------------------------===//
-// STYPE/PRED -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// STYPE/SHIFT +
-//===----------------------------------------------------------------------===//
-class S_2OpInstImm<string Mnemonic, bits<3>MajOp, bits<3>MinOp,
-                   Operand Imm, list<dag> pattern = [], bit isRnd = 0>
-  : SInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, Imm:$src2),
-           "$dst = "#Mnemonic#"($src1, #$src2)"#!if(isRnd, ":rnd", ""),
-           pattern> {
-  bits<5> src1;
-  bits<5> dst;
-  let IClass = 0b1000;
-  let Inst{27-24} = 0;
-  let Inst{23-21} = MajOp;
-  let Inst{20-16} = src1;
-  let Inst{7-5} = MinOp;
-  let Inst{4-0} = dst;
-}
-
-class S_2OpInstImmI6<string Mnemonic, SDNode OpNode, bits<3>MinOp>
-  : S_2OpInstImm<Mnemonic, 0b000, MinOp, u6_0Imm, []> {
-  bits<6> src2;
-  let Inst{13-8} = src2;
-}
-
-// Shift by immediate.
-def S2_asr_i_p : S_2OpInstImmI6<"asr", sra, 0b000>;
-def S2_asl_i_p : S_2OpInstImmI6<"asl", shl, 0b010>;
-def S2_lsr_i_p : S_2OpInstImmI6<"lsr", srl, 0b001>;
-
-// Shift left by small amount and add.
-let AddedComplexity = 100, hasNewValue = 1, hasSideEffects = 0 in
-def S2_addasl_rrri: SInst <(outs IntRegs:$Rd),
-                           (ins IntRegs:$Rt, IntRegs:$Rs, u3_0Imm:$u3),
-  "$Rd = addasl($Rt, $Rs, #$u3)" , [],
-  "", S_3op_tc_2_SLOT23> {
-    bits<5> Rd;
-    bits<5> Rt;
-    bits<5> Rs;
-    bits<3> u3;
-
-    let IClass = 0b1100;
-
-    let Inst{27-21} = 0b0100000;
-    let Inst{20-16} = Rs;
-    let Inst{13}    = 0b0;
-    let Inst{12-8}  = Rt;
-    let Inst{7-5}   = u3;
-    let Inst{4-0}   = Rd;
-  }
-
-//===----------------------------------------------------------------------===//
-// STYPE/SHIFT -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// STYPE/VH +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// STYPE/VH -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// STYPE/VW +
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// STYPE/VW -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// SYSTEM/SUPER +
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// SYSTEM/USER +
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 1, isSoloAX = 1 in
-def Y2_barrier : SYSInst<(outs), (ins), "barrier", [],"",ST_tc_st_SLOT0> {
-  let Inst{31-28} = 0b1010;
-  let Inst{27-21} = 0b1000000;
-}
-
-//===----------------------------------------------------------------------===//
-// SYSTEM/SUPER -
-//===----------------------------------------------------------------------===//
-
-// Generate frameindex addresses. The main reason for the offset operand is
-// that every instruction that is allowed to have frame index as an operand
-// will then have that operand followed by an immediate operand (the offset).
-// This simplifies the frame-index elimination code.
-//
-let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1,
-    isPseudo = 1, isCodeGenOnly = 1, hasSideEffects = 0 in {
-  def PS_fi  : ALU32_ri<(outs IntRegs:$Rd),
-                        (ins IntRegs:$fi, s32_0Imm:$off), "">;
-  def PS_fia : ALU32_ri<(outs IntRegs:$Rd),
-                        (ins IntRegs:$Rs, IntRegs:$fi, s32_0Imm:$off), "">;
-}
-
-//===----------------------------------------------------------------------===//
-// CRUSER - Type.
-//===----------------------------------------------------------------------===//
-// HW loop
-let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
-    opExtendable = 0, hasSideEffects = 0 in
-class LOOP_iBase<string mnemonic, Operand brOp, bit mustExtend = 0>
-         : CRInst<(outs), (ins brOp:$offset, u10_0Imm:$src2),
-           #mnemonic#"($offset, #$src2)",
-           [], "" , CR_tc_3x_SLOT3> {
-    bits<9> offset;
-    bits<10> src2;
-
-    let IClass = 0b0110;
-
-    let Inst{27-22} = 0b100100;
-    let Inst{21} = !if (!eq(mnemonic, "loop0"), 0b0, 0b1);
-    let Inst{20-16} = src2{9-5};
-    let Inst{12-8} = offset{8-4};
-    let Inst{7-5} = src2{4-2};
-    let Inst{4-3} = offset{3-2};
-    let Inst{1-0} = src2{1-0};
-}
-
-let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
-    opExtendable = 0, hasSideEffects = 0 in
-class LOOP_rBase<string mnemonic, Operand brOp, bit mustExtend = 0>
-         : CRInst<(outs), (ins brOp:$offset, IntRegs:$src2),
-           #mnemonic#"($offset, $src2)",
-           [], "" ,CR_tc_3x_SLOT3> {
-    bits<9> offset;
-    bits<5> src2;
-
-    let IClass = 0b0110;
-
-    let Inst{27-22} = 0b000000;
-    let Inst{21} = !if (!eq(mnemonic, "loop0"), 0b0, 0b1);
-    let Inst{20-16} = src2;
-    let Inst{12-8} = offset{8-4};
-    let Inst{4-3} = offset{3-2};
-  }
-
-multiclass LOOP_ri<string mnemonic> {
-  def i : LOOP_iBase<mnemonic, brtarget>;
-  def r : LOOP_rBase<mnemonic, brtarget>;
-
-  let isCodeGenOnly = 1, isExtended = 1, opExtendable = 0 in {
-    def iext: LOOP_iBase<mnemonic, brtargetExt, 1>;
-    def rext: LOOP_rBase<mnemonic, brtargetExt, 1>;
-  }
-}
-
-
-let Defs = [SA0, LC0, USR] in
-defm J2_loop0 : LOOP_ri<"loop0">;
-
-// Interestingly only loop0's appear to set usr.lpcfg
-let Defs = [SA1, LC1] in
-defm J2_loop1 : LOOP_ri<"loop1">;
-
-let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
-    Defs = [PC, LC0], Uses = [SA0, LC0] in {
-def ENDLOOP0 : Endloop<(outs), (ins brtarget:$offset),
-                       ":endloop0",
-                       []>;
-}
-
-let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
-    Defs = [PC, LC1], Uses = [SA1, LC1] in {
-def ENDLOOP1 : Endloop<(outs), (ins brtarget:$offset),
-                       ":endloop1",
-                       []>;
-}
-
-// Pipelined loop instructions, sp[123]loop0
-let Defs = [LC0, SA0, P3, USR], hasSideEffects = 0,
-    isExtentSigned = 1, isExtendable = 1, opExtentBits = 9, opExtentAlign = 2,
-    opExtendable = 0, isPredicateLate = 1 in
-class SPLOOP_iBase<string SP, bits<2> op>
-  : CRInst <(outs), (ins brtarget:$r7_2, u10_0Imm:$U10),
-  "p3 = sp"#SP#"loop0($r7_2, #$U10)" > {
-    bits<9> r7_2;
-    bits<10> U10;
-
-    let IClass = 0b0110;
-
-    let Inst{22-21} = op;
-    let Inst{27-23} = 0b10011;
-    let Inst{20-16} = U10{9-5};
-    let Inst{12-8} = r7_2{8-4};
-    let Inst{7-5} = U10{4-2};
-    let Inst{4-3} = r7_2{3-2};
-    let Inst{1-0} = U10{1-0};
-  }
-
-let Defs = [LC0, SA0, P3, USR], hasSideEffects = 0,
-    isExtentSigned = 1, isExtendable = 1, opExtentBits = 9, opExtentAlign = 2,
-    opExtendable = 0, isPredicateLate = 1 in
-class SPLOOP_rBase<string SP, bits<2> op>
-  : CRInst <(outs), (ins brtarget:$r7_2, IntRegs:$Rs),
-  "p3 = sp"#SP#"loop0($r7_2, $Rs)" > {
-    bits<9> r7_2;
-    bits<5> Rs;
-
-    let IClass = 0b0110;
-
-    let Inst{22-21} = op;
-    let Inst{27-23} = 0b00001;
-    let Inst{20-16} = Rs;
-    let Inst{12-8} = r7_2{8-4};
-    let Inst{4-3} = r7_2{3-2};
-  }
-
-multiclass SPLOOP_ri<string mnemonic, bits<2> op> {
-  def i : SPLOOP_iBase<mnemonic, op>;
-  def r : SPLOOP_rBase<mnemonic, op>;
-}
-
-defm J2_ploop1s : SPLOOP_ri<"1", 0b01>;
-defm J2_ploop2s : SPLOOP_ri<"2", 0b10>;
-defm J2_ploop3s : SPLOOP_ri<"3", 0b11>;
-
-// if (Rs[!>=<]=#0) jump:[t/nt]
-let Defs = [PC], isPredicated = 1, isBranch = 1, hasSideEffects = 0,
-    hasSideEffects = 0 in
-class J2_jump_0_Base<string compare, bit isTak, bits<2> op>
-  : CRInst <(outs), (ins IntRegs:$Rs, brtarget:$r13_2),
-  "if ($Rs"#compare#"#0) jump"#!if(isTak, ":t", ":nt")#" $r13_2" > {
-    bits<5> Rs;
-    bits<15> r13_2;
-
-    let IClass = 0b0110;
-
-    let Inst{27-24} = 0b0001;
-    let Inst{23-22} = op;
-    let Inst{12} = isTak;
-    let Inst{21} = r13_2{14};
-    let Inst{20-16} = Rs;
-    let Inst{11-1} = r13_2{12-2};
-    let Inst{13} = r13_2{13};
-  }
-
-multiclass J2_jump_compare_0<string compare, bits<2> op> {
-  def NAME    : J2_jump_0_Base<compare, 0, op>;
-  def NAME#pt : J2_jump_0_Base<compare, 1, op>;
-}
-
-defm J2_jumprz    : J2_jump_compare_0<"!=", 0b00>;
-defm J2_jumprgtez : J2_jump_compare_0<">=", 0b01>;
-defm J2_jumprnz   : J2_jump_compare_0<"==", 0b10>;
-defm J2_jumprltez : J2_jump_compare_0<"<=", 0b11>;
-
-// Transfer to/from Control/GPR Guest/GPR
-let hasSideEffects = 0 in
-class TFR_CR_RS_base<RegisterClass CTRC, RegisterClass RC, bit isDouble>
-  : CRInst <(outs CTRC:$dst), (ins RC:$src),
-  "$dst = $src", [], "", CR_tc_3x_SLOT3> {
-    bits<5> dst;
-    bits<5> src;
-
-    let IClass = 0b0110;
-
-    let Inst{27-25} = 0b001;
-    let Inst{24} = isDouble;
-    let Inst{23-21} = 0b001;
-    let Inst{20-16} = src;
-    let Inst{4-0} = dst;
-  }
-
-def A2_tfrrcr : TFR_CR_RS_base<CtrRegs, IntRegs, 0b0>;
-def A4_tfrpcp : TFR_CR_RS_base<CtrRegs64, DoubleRegs, 0b1>;
-def : InstAlias<"m0 = $Rs", (A2_tfrrcr C6, IntRegs:$Rs)>;
-def : InstAlias<"m1 = $Rs", (A2_tfrrcr C7, IntRegs:$Rs)>;
-
-let hasSideEffects = 0 in
-class TFR_RD_CR_base<RegisterClass RC, RegisterClass CTRC, bit isSingle>
-  : CRInst <(outs RC:$dst), (ins CTRC:$src),
-  "$dst = $src", [], "", CR_tc_3x_SLOT3> {
-    bits<5> dst;
-    bits<5> src;
-
-    let IClass = 0b0110;
-
-    let Inst{27-26} = 0b10;
-    let Inst{25} = isSingle;
-    let Inst{24-21} = 0b0000;
-    let Inst{20-16} = src;
-    let Inst{4-0} = dst;
-  }
-
-let hasNewValue = 1, opNewValue = 0 in
-def A2_tfrcrr : TFR_RD_CR_base<IntRegs, CtrRegs, 1>;
-def A4_tfrcpp : TFR_RD_CR_base<DoubleRegs, CtrRegs64, 0>;
-def : InstAlias<"$Rd = m0", (A2_tfrcrr IntRegs:$Rd, C6)>;
-def : InstAlias<"$Rd = m1", (A2_tfrcrr IntRegs:$Rd, C7)>;
-
-// Y4_trace: Send value to etm trace.
-let isSoloAX = 1, hasSideEffects = 0 in
-def Y4_trace: CRInst <(outs), (ins IntRegs:$Rs),
-  "trace($Rs)"> {
-    bits<5> Rs;
-
-    let IClass = 0b0110;
-    let Inst{27-21} = 0b0010010;
-    let Inst{20-16} = Rs;
-  }
-
-// HI/LO Instructions
-let isReMaterializable = 1, isMoveImm = 1, hasSideEffects = 0,
-    hasNewValue = 1, opNewValue = 0 in
-class REG_IMMED<string RegHalf, bit Rs, bits<3> MajOp, bit MinOp>
-  : ALU32_ri<(outs IntRegs:$dst),
-              (ins u16_0Imm:$imm_value),
-              "$dst"#RegHalf#" = $imm_value", []> {
-    bits<5> dst;
-    bits<32> imm_value;
-    let IClass = 0b0111;
-
-    let Inst{27} = Rs;
-    let Inst{26-24} = MajOp;
-    let Inst{21} = MinOp;
-    let Inst{20-16} = dst;
-    let Inst{23-22} = imm_value{15-14};
-    let Inst{13-0} = imm_value{13-0};
-}
-
-let isAsmParserOnly = 1 in {
-  def LO : REG_IMMED<".l", 0b0, 0b001, 0b1>;
-  def HI : REG_IMMED<".h", 0b0, 0b010, 0b1>;
-}
-
-let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in {
-  def CONST32 : CONSTLDInst<(outs IntRegs:$Rd), (ins i32imm:$v),
-                "$Rd = CONST32(#$v)", []>;
-  def CONST64 : CONSTLDInst<(outs DoubleRegs:$Rd), (ins i64imm:$v),
-                "$Rd = CONST64(#$v)", []>;
-}
-
-let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
-    isCodeGenOnly = 1 in
-def PS_true : SInst<(outs PredRegs:$dst), (ins), "", []>;
-
-let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
-    isCodeGenOnly = 1 in
-def PS_false : SInst<(outs PredRegs:$dst), (ins), "", []>;
-
-let Defs = [R29, R30], Uses = [R31, R30, R29], isPseudo = 1 in
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
-                              ".error \"should not emit\" ", []>;
-
-let Defs = [R29, R30, R31], Uses = [R29], isPseudo = 1 in
-def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
-                             ".error \"should not emit\" ", []>;
-
-// Call subroutine indirectly.
-let Defs = VolatileV3.Regs in
-def J2_callr : JUMPR_MISC_CALLR<0, 1>;
-
-// Indirect tail-call.
-let isPseudo = 1, isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0,
-    isTerminator = 1, isCodeGenOnly = 1 in
-def PS_tailcall_r : T_JMPr;
-
-// Direct tail-calls.
-let isPseudo = 1, isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0,
-    isTerminator = 1, isCodeGenOnly = 1 in
-def PS_tailcall_i : JInst<(outs), (ins calltarget:$dst), "", []>;
-
-// The reason for the custom inserter is to record all ALLOCA instructions
-// in MachineFunctionInfo.
-let Defs = [R29], isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 1 in
-def PS_alloca: ALU32Inst<(outs IntRegs:$Rd),
-      (ins IntRegs:$Rs, u32_0Imm:$A), "", []>;
-
-let isCodeGenOnly = 1, isPseudo = 1, Uses = [R30], hasSideEffects = 0 in
-def PS_aligna : ALU32Inst<(outs IntRegs:$Rd), (ins u32_0Imm:$A), "", []>;
-
-// XTYPE/SHIFT
-//
-//===----------------------------------------------------------------------===//
-// Template Class
-// Shift by immediate/register and accumulate/logical
-//===----------------------------------------------------------------------===//
-
-// Rx[+-&|]=asr(Rs,#u5)
-// Rx[+-&|^]=lsr(Rs,#u5)
-// Rx[+-&|^]=asl(Rs,#u5)
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_shift_imm_acc_r <string opc1, string opc2, SDNode OpNode1,
-                         SDNode OpNode2, bits<3> majOp, bits<2> minOp>
-  : SInst_acc<(outs IntRegs:$Rx),
-              (ins IntRegs:$src1, IntRegs:$Rs, u5_0Imm:$u5),
-  "$Rx "#opc2#opc1#"($Rs, #$u5)", [],
-  "$src1 = $Rx", S_2op_tc_2_SLOT23> {
-    bits<5> Rx;
-    bits<5> Rs;
-    bits<5> u5;
-
-    let IClass = 0b1000;
-
-    let Inst{27-24} = 0b1110;
-    let Inst{23-22} = majOp{2-1};
-    let Inst{13} = 0b0;
-    let Inst{7} = majOp{0};
-    let Inst{6-5} = minOp;
-    let Inst{4-0} = Rx;
-    let Inst{20-16} = Rs;
-    let Inst{12-8} = u5;
-  }
-
-// Rx[+-&|]=asr(Rs,Rt)
-// Rx[+-&|^]=lsr(Rs,Rt)
-// Rx[+-&|^]=asl(Rs,Rt)
-
-let hasNewValue = 1, opNewValue = 0 in
-class T_shift_reg_acc_r <string opc1, string opc2, SDNode OpNode1,
-                         SDNode OpNode2, bits<2> majOp, bits<2> minOp>
-  : SInst_acc<(outs IntRegs:$Rx),
-              (ins IntRegs:$src1, IntRegs:$Rs, IntRegs:$Rt),
-  "$Rx "#opc2#opc1#"($Rs, $Rt)", [],
-  "$src1 = $Rx", S_3op_tc_2_SLOT23 > {
-    bits<5> Rx;
-    bits<5> Rs;
-    bits<5> Rt;
-
-    let IClass = 0b1100;
-
-    let Inst{27-24} = 0b1100;
-    let Inst{23-22} = majOp;
-    let Inst{7-6} = minOp;
-    let Inst{4-0} = Rx;
-    let Inst{20-16} = Rs;
-    let Inst{12-8} = Rt;
-  }
-
-// Rxx[+-&|]=asr(Rss,#u6)
-// Rxx[+-&|^]=lsr(Rss,#u6)
-// Rxx[+-&|^]=asl(Rss,#u6)
-
-class T_shift_imm_acc_p <string opc1, string opc2, SDNode OpNode1,
-                         SDNode OpNode2, bits<3> majOp, bits<2> minOp>
-  : SInst_acc<(outs DoubleRegs:$Rxx),
-              (ins DoubleRegs:$src1, DoubleRegs:$Rss, u6_0Imm:$u6),
-  "$Rxx "#opc2#opc1#"($Rss, #$u6)", [],
-  "$src1 = $Rxx", S_2op_tc_2_SLOT23> {
-    bits<5> Rxx;
-    bits<5> Rss;
-    bits<6> u6;
-
-    let IClass = 0b1000;
-
-    let Inst{27-24} = 0b0010;
-    let Inst{23-22} = majOp{2-1};
-    let Inst{7} = majOp{0};
-    let Inst{6-5} = minOp;
-    let Inst{4-0} = Rxx;
-    let Inst{20-16} = Rss;
-    let Inst{13-8} = u6;
-  }
-
-
-// Rxx[+-&|]=asr(Rss,Rt)
-// Rxx[+-&|^]=lsr(Rss,Rt)
-// Rxx[+-&|^]=asl(Rss,Rt)
-// Rxx[+-&|^]=lsl(Rss,Rt)
-
-class T_shift_reg_acc_p <string opc1, string opc2, SDNode OpNode1,
-                         SDNode OpNode2, bits<3> majOp, bits<2> minOp>
-  : SInst_acc<(outs DoubleRegs:$Rxx),
-              (ins DoubleRegs:$src1, DoubleRegs:$Rss, IntRegs:$Rt),
-  "$Rxx "#opc2#opc1#"($Rss, $Rt)", [],
-  "$src1 = $Rxx", S_3op_tc_2_SLOT23> {
-    bits<5> Rxx;
-    bits<5> Rss;
-    bits<5> Rt;
-
-    let IClass = 0b1100;
-
-    let Inst{27-24} = 0b1011;
-    let Inst{23-21} = majOp;
-    let Inst{20-16} = Rss;
-    let Inst{12-8} = Rt;
-    let Inst{7-6} = minOp;
-    let Inst{4-0} = Rxx;
-  }
-
-//===----------------------------------------------------------------------===//
-// Multi-class for the shift instructions with logical/arithmetic operators.
-//===----------------------------------------------------------------------===//
-
-multiclass xtype_imm_base<string OpcStr1, string OpcStr2, SDNode OpNode1,
-                         SDNode OpNode2, bits<3> majOp, bits<2> minOp > {
-  def _i_r#NAME : T_shift_imm_acc_r< OpcStr1, OpcStr2, OpNode1,
-                                     OpNode2, majOp, minOp >;
-  def _i_p#NAME : T_shift_imm_acc_p< OpcStr1, OpcStr2, OpNode1,
-                                     OpNode2, majOp, minOp >;
-}
-
-multiclass xtype_imm_acc<string opc1, SDNode OpNode, bits<2>minOp> {
-  let AddedComplexity = 100 in
-  defm _acc  : xtype_imm_base< opc1, "+= ", OpNode, add, 0b001, minOp>;
-
-  defm _nac  : xtype_imm_base< opc1, "-= ", OpNode, sub, 0b000, minOp>;
-  defm _and  : xtype_imm_base< opc1, "&= ", OpNode, and, 0b010, minOp>;
-  defm _or   : xtype_imm_base< opc1, "|= ", OpNode,  or, 0b011, minOp>;
-}
-
-multiclass xtype_xor_imm_acc<string opc1, SDNode OpNode, bits<2>minOp> {
-let AddedComplexity = 100 in
-  defm _xacc  : xtype_imm_base< opc1, "^= ", OpNode, xor, 0b100, minOp>;
-}
-
-defm S2_asr : xtype_imm_acc<"asr", sra, 0b00>;
-
-defm S2_lsr : xtype_imm_acc<"lsr", srl, 0b01>,
-              xtype_xor_imm_acc<"lsr", srl, 0b01>;
-
-defm S2_asl : xtype_imm_acc<"asl", shl, 0b10>,
-              xtype_xor_imm_acc<"asl", shl, 0b10>;
-
-multiclass xtype_reg_acc_r<string opc1, SDNode OpNode, bits<2>minOp> {
-  let AddedComplexity = 100 in
-  def _acc : T_shift_reg_acc_r <opc1, "+= ", OpNode, add, 0b11, minOp>;
-
-  def _nac : T_shift_reg_acc_r <opc1, "-= ", OpNode, sub, 0b10, minOp>;
-  def _and : T_shift_reg_acc_r <opc1, "&= ", OpNode, and, 0b01, minOp>;
-  def _or  : T_shift_reg_acc_r <opc1, "|= ", OpNode,  or, 0b00, minOp>;
-}
-
-multiclass xtype_reg_acc_p<string opc1, SDNode OpNode, bits<2>minOp> {
-  let AddedComplexity = 100 in
-  def _acc : T_shift_reg_acc_p <opc1, "+= ", OpNode, add, 0b110, minOp>;
-
-  def _nac : T_shift_reg_acc_p <opc1, "-= ", OpNode, sub, 0b100, minOp>;
-  def _and : T_shift_reg_acc_p <opc1, "&= ", OpNode, and, 0b010, minOp>;
-  def _or  : T_shift_reg_acc_p <opc1, "|= ", OpNode,  or, 0b000, minOp>;
-  def _xor : T_shift_reg_acc_p <opc1, "^= ", OpNode, xor, 0b011, minOp>;
-}
-
-multiclass xtype_reg_acc<string OpcStr, SDNode OpNode, bits<2> minOp > {
-  defm _r_r : xtype_reg_acc_r <OpcStr, OpNode, minOp>;
-  defm _r_p : xtype_reg_acc_p <OpcStr, OpNode, minOp>;
-}
-
-defm S2_asl : xtype_reg_acc<"asl", shl, 0b10>;
-defm S2_asr : xtype_reg_acc<"asr", sra, 0b00>;
-defm S2_lsr : xtype_reg_acc<"lsr", srl, 0b01>;
-defm S2_lsl : xtype_reg_acc<"lsl", shl, 0b11>;
-
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0 in
-class T_S3op_1 <string mnemonic, RegisterClass RC, bits<2> MajOp, bits<3> MinOp,
-                bit SwapOps, bit isSat = 0, bit isRnd = 0, bit hasShift = 0>
-  : SInst <(outs RC:$dst),
-           (ins DoubleRegs:$src1, DoubleRegs:$src2),
-  "$dst = "#mnemonic#"($src1, $src2)"#!if(isRnd, ":rnd", "")
-                                     #!if(hasShift,":>>1","")
-                                     #!if(isSat, ":sat", ""),
-  [], "", S_3op_tc_2_SLOT23 > {
-    bits<5> dst;
-    bits<5> src1;
-    bits<5> src2;
-
-    let IClass = 0b1100;
-
-    let Inst{27-24} = 0b0001;
-    let Inst{23-22} = MajOp;
-    let Inst{20-16} = !if (SwapOps, src2, src1);
-    let Inst{12-8}  = !if (SwapOps, src1, src2);
-    let Inst{7-5}   = MinOp;
-    let Inst{4-0}   = dst;
-  }
-
-class T_S3op_64 <string mnemonic, bits<2> MajOp, bits<3> MinOp, bit SwapOps,
-                 bit isSat = 0, bit isRnd = 0, bit hasShift = 0 >
-  : T_S3op_1 <mnemonic, DoubleRegs, MajOp, MinOp, SwapOps,
-              isSat, isRnd, hasShift>;
-
-let Itinerary = S_3op_tc_1_SLOT23 in {
-  def S2_shuffeb : T_S3op_64 < "shuffeb", 0b00, 0b010, 0>;
-  def S2_shuffeh : T_S3op_64 < "shuffeh", 0b00, 0b110, 0>;
-  def S2_shuffob : T_S3op_64 < "shuffob", 0b00, 0b100, 1>;
-  def S2_shuffoh : T_S3op_64 < "shuffoh", 0b10, 0b000, 1>;
-
-  def S2_vtrunewh : T_S3op_64 < "vtrunewh", 0b10, 0b010, 0>;
-  def S2_vtrunowh : T_S3op_64 < "vtrunowh", 0b10, 0b100, 0>;
-}
-
-def S2_lfsp : T_S3op_64 < "lfs", 0b10, 0b110, 0>;
-
-let hasSideEffects = 0 in
-class T_S3op_2 <string mnemonic, bits<3> MajOp, bit SwapOps>
-  : SInst < (outs DoubleRegs:$Rdd),
-            (ins DoubleRegs:$Rss, DoubleRegs:$Rtt, PredRegs:$Pu),
-  "$Rdd = "#mnemonic#"($Rss, $Rtt, $Pu)",
-  [], "", S_3op_tc_1_SLOT23 > {
-    bits<5> Rdd;
-    bits<5> Rss;
-    bits<5> Rtt;
-    bits<2> Pu;
-
-    let IClass = 0b1100;
-
-    let Inst{27-24} = 0b0010;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = !if (SwapOps, Rtt, Rss);
-    let Inst{12-8} = !if (SwapOps, Rss, Rtt);
-    let Inst{6-5} = Pu;
-    let Inst{4-0} = Rdd;
-  }
-
-def S2_valignrb  : T_S3op_2 < "valignb",  0b000, 1>;
-def S2_vsplicerb : T_S3op_2 < "vspliceb", 0b100, 0>;
-
-//===----------------------------------------------------------------------===//
-// Template class used by vector shift, vector rotate, vector neg,
-// 32-bit shift, 64-bit shifts, etc.
-//===----------------------------------------------------------------------===//
-
-let hasSideEffects = 0 in
-class T_S3op_3 <string mnemonic, RegisterClass RC, bits<2> MajOp,
-                 bits<2> MinOp, bit isSat = 0, list<dag> pattern = [] >
-  : SInst <(outs RC:$dst),
-           (ins RC:$src1, IntRegs:$src2),
-  "$dst = "#mnemonic#"($src1, $src2)"#!if(isSat, ":sat", ""),
-  pattern, "", S_3op_tc_1_SLOT23> {
-    bits<5> dst;
-    bits<5> src1;
-    bits<5> src2;
-
-    let IClass = 0b1100;
-
-    let Inst{27-24} = !if(!eq(!cast<string>(RC), "IntRegs"), 0b0110, 0b0011);
-    let Inst{23-22} = MajOp;
-    let Inst{20-16} = src1;
-    let Inst{12-8} = src2;
-    let Inst{7-6} = MinOp;
-    let Inst{4-0} = dst;
-  }
-
-let hasNewValue = 1 in
-class T_S3op_shift32 <string mnemonic, SDNode OpNode, bits<2> MinOp>
-  : T_S3op_3 <mnemonic, IntRegs, 0b01, MinOp, 0, []>;
-
-let hasNewValue = 1, Itinerary = S_3op_tc_2_SLOT23 in
-class T_S3op_shift32_Sat <string mnemonic, bits<2> MinOp>
-  : T_S3op_3 <mnemonic, IntRegs, 0b00, MinOp, 1, []>;
-
-
-class T_S3op_shift64 <string mnemonic, SDNode OpNode, bits<2> MinOp>
-  : T_S3op_3 <mnemonic, DoubleRegs, 0b10, MinOp, 0, []>;
-
-
-class T_S3op_shiftVect <string mnemonic, bits<2> MajOp, bits<2> MinOp>
-  : T_S3op_3 <mnemonic, DoubleRegs, MajOp, MinOp, 0, []>;
-
-
-// Shift by register
-// Rdd=[asr|lsr|asl|lsl](Rss,Rt)
-
-def S2_asr_r_p : T_S3op_shift64 < "asr", sra, 0b00>;
-def S2_lsr_r_p : T_S3op_shift64 < "lsr", srl, 0b01>;
-def S2_asl_r_p : T_S3op_shift64 < "asl", shl, 0b10>;
-def S2_lsl_r_p : T_S3op_shift64 < "lsl", shl, 0b11>;
-
-// Rd=[asr|lsr|asl|lsl](Rs,Rt)
-
-def S2_asr_r_r : T_S3op_shift32<"asr", sra, 0b00>;
-def S2_lsr_r_r : T_S3op_shift32<"lsr", srl, 0b01>;
-def S2_asl_r_r : T_S3op_shift32<"asl", shl, 0b10>;
-def S2_lsl_r_r : T_S3op_shift32<"lsl", shl, 0b11>;
-
-// Shift by register with saturation
-// Rd=asr(Rs,Rt):sat
-// Rd=asl(Rs,Rt):sat
-
-let Defs = [USR_OVF] in {
-  def S2_asr_r_r_sat : T_S3op_shift32_Sat<"asr", 0b00>;
-  def S2_asl_r_r_sat : T_S3op_shift32_Sat<"asl", 0b10>;
-}
-
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_S3op_8 <string opc, bits<3> MinOp, bit isSat, bit isRnd, bit hasShift, bit hasSplat = 0>
-  : SInst < (outs IntRegs:$Rd),
-            (ins DoubleRegs:$Rss, IntRegs:$Rt),
-  "$Rd = "#opc#"($Rss, $Rt"#!if(hasSplat, "*", "")#")"
-                           #!if(hasShift, ":<<1", "")
-                           #!if(isRnd, ":rnd", "")
-                           #!if(isSat, ":sat", ""),
-  [], "", S_3op_tc_1_SLOT23 > {
-    bits<5> Rd;
-    bits<5> Rss;
-    bits<5> Rt;
-
-    let IClass = 0b1100;
-
-    let Inst{27-24} = 0b0101;
-    let Inst{20-16} = Rss;
-    let Inst{12-8}  = Rt;
-    let Inst{7-5}   = MinOp;
-    let Inst{4-0}   = Rd;
-  }
-
-def S2_asr_r_svw_trun : T_S3op_8<"vasrw", 0b010, 0, 0, 0>;
-
-let Defs = [USR_OVF], Itinerary = S_3op_tc_2_SLOT23 in
-def S2_vcrotate : T_S3op_shiftVect < "vcrotate", 0b11, 0b00>;
-
-let hasSideEffects = 0 in
-class T_S3op_7 <string mnemonic, bit MajOp >
-  : SInst <(outs DoubleRegs:$Rdd),
-           (ins DoubleRegs:$Rss, DoubleRegs:$Rtt, u3_0Imm:$u3),
-  "$Rdd = "#mnemonic#"($Rss, $Rtt, #$u3)" ,
-  [], "", S_3op_tc_1_SLOT23 > {
-    bits<5> Rdd;
-    bits<5> Rss;
-    bits<5> Rtt;
-    bits<3> u3;
-
-    let IClass = 0b1100;
-
-    let Inst{27-24} = 0b0000;
-    let Inst{23}    = MajOp;
-    let Inst{20-16} = !if(MajOp, Rss, Rtt);
-    let Inst{12-8}  =  !if(MajOp, Rtt, Rss);
-    let Inst{7-5}   = u3;
-    let Inst{4-0}   = Rdd;
-  }
-
-def S2_valignib  : T_S3op_7 < "valignb", 0>;
-def S2_vspliceib : T_S3op_7 < "vspliceb", 1>;
-
-//===----------------------------------------------------------------------===//
-// Template class for 'insert bitfield' instructions
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0 in
-class T_S3op_insert <string mnemonic, RegisterClass RC>
-  : SInst <(outs RC:$dst),
-           (ins RC:$src1, RC:$src2, DoubleRegs:$src3),
-  "$dst = "#mnemonic#"($src2, $src3)" ,
-  [], "$src1 = $dst", S_3op_tc_1_SLOT23 > {
-    bits<5> dst;
-    bits<5> src2;
-    bits<5> src3;
-
-    let IClass = 0b1100;
-
-    let Inst{27-26} = 0b10;
-    let Inst{25-24} = !if(!eq(!cast<string>(RC), "IntRegs"), 0b00, 0b10);
-    let Inst{23}    = 0b0;
-    let Inst{20-16} = src2;
-    let Inst{12-8}  = src3;
-    let Inst{4-0}   = dst;
-  }
-
-let hasSideEffects = 0 in
-class T_S2op_insert <bits<4> RegTyBits, RegisterClass RC, Operand ImmOp>
-  : SInst <(outs RC:$dst), (ins RC:$dst2, RC:$src1, ImmOp:$src2, ImmOp:$src3),
-  "$dst = insert($src1, #$src2, #$src3)",
-  [], "$dst2 = $dst", S_2op_tc_2_SLOT23> {
-    bits<5> dst;
-    bits<5> src1;
-    bits<6> src2;
-    bits<6> src3;
-    bit bit23;
-    bit bit13;
-    string ImmOpStr = !cast<string>(ImmOp);
-
-    let bit23 = !if (!eq(ImmOpStr, "u6_0Imm"), src3{5}, 0);
-    let bit13 = !if (!eq(ImmOpStr, "u6_0Imm"), src2{5}, 0);
-
-    let IClass = 0b1000;
-
-    let Inst{27-24} = RegTyBits;
-    let Inst{23}    = bit23;
-    let Inst{22-21} = src3{4-3};
-    let Inst{20-16} = src1;
-    let Inst{13}    = bit13;
-    let Inst{12-8}  = src2{4-0};
-    let Inst{7-5}   = src3{2-0};
-    let Inst{4-0}   = dst;
-  }
-
-// Rx=insert(Rs,Rtt)
-// Rx=insert(Rs,#u5,#U5)
-let hasNewValue = 1 in {
-  def S2_insert_rp : T_S3op_insert <"insert", IntRegs>;
-  def S2_insert    : T_S2op_insert <0b1111, IntRegs, u5_0Imm>;
-}
-
-// Rxx=insert(Rss,Rtt)
-// Rxx=insert(Rss,#u6,#U6)
-def S2_insertp_rp : T_S3op_insert<"insert", DoubleRegs>;
-def S2_insertp    : T_S2op_insert <0b0011, DoubleRegs, u6_0Imm>;
-
-
-//===----------------------------------------------------------------------===//
-// Template class for 'extract bitfield' instructions
-//===----------------------------------------------------------------------===//
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_S3op_extract <string mnemonic, bits<2> MinOp>
-  : SInst <(outs IntRegs:$Rd), (ins IntRegs:$Rs, DoubleRegs:$Rtt),
-  "$Rd = "#mnemonic#"($Rs, $Rtt)",
-  [], "", S_3op_tc_2_SLOT23 > {
-    bits<5> Rd;
-    bits<5> Rs;
-    bits<5> Rtt;
-
-    let IClass = 0b1100;
-
-    let Inst{27-22} = 0b100100;
-    let Inst{20-16} = Rs;
-    let Inst{12-8}  = Rtt;
-    let Inst{7-6}   = MinOp;
-    let Inst{4-0}   = Rd;
-  }
-
-let hasSideEffects = 0 in
-class T_S2op_extract <string mnemonic, bits<4> RegTyBits,
-                      RegisterClass RC, Operand ImmOp>
-  : SInst <(outs RC:$dst), (ins RC:$src1, ImmOp:$src2, ImmOp:$src3),
-  "$dst = "#mnemonic#"($src1, #$src2, #$src3)",
-  [], "", S_2op_tc_2_SLOT23> {
-    bits<5> dst;
-    bits<5> src1;
-    bits<6> src2;
-    bits<6> src3;
-    bit bit23;
-    bit bit13;
-    string ImmOpStr = !cast<string>(ImmOp);
-
-    let bit23 = !if (!eq(ImmOpStr, "u6_0Imm"), src3{5},
-                !if (!eq(mnemonic, "extractu"), 0, 1));
-
-    let bit13 = !if (!eq(ImmOpStr, "u6_0Imm"), src2{5}, 0);
-
-    let IClass = 0b1000;
-
-    let Inst{27-24} = RegTyBits;
-    let Inst{23}    = bit23;
-    let Inst{22-21} = src3{4-3};
-    let Inst{20-16} = src1;
-    let Inst{13}    = bit13;
-    let Inst{12-8}  = src2{4-0};
-    let Inst{7-5}   = src3{2-0};
-    let Inst{4-0}   = dst;
-  }
-
-// Extract bitfield
-
-// Rdd=extractu(Rss,Rtt)
-// Rdd=extractu(Rss,#u6,#U6)
-def S2_extractup_rp : T_S3op_64 < "extractu", 0b00, 0b000, 0>;
-def S2_extractup    : T_S2op_extract <"extractu", 0b0001, DoubleRegs, u6_0Imm>;
-
-// Rd=extractu(Rs,Rtt)
-// Rd=extractu(Rs,#u5,#U5)
-let hasNewValue = 1 in {
-  def S2_extractu_rp : T_S3op_extract<"extractu", 0b00>;
-  def S2_extractu    : T_S2op_extract <"extractu", 0b1101, IntRegs, u5_0Imm>;
-}
-
-//===----------------------------------------------------------------------===//
-// :raw for of tableindx[bdhw] insns
-//===----------------------------------------------------------------------===//
-
-let hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-class tableidxRaw<string OpStr, bits<2>MinOp>
-  : SInst <(outs IntRegs:$Rx),
-           (ins IntRegs:$_dst_, IntRegs:$Rs, u4_0Imm:$u4, s6_0Imm:$S6),
-           "$Rx = "#OpStr#"($Rs, #$u4, #$S6):raw",
-    [], "$Rx = $_dst_" > {
-    bits<5> Rx;
-    bits<5> Rs;
-    bits<4> u4;
-    bits<6> S6;
-
-    let IClass = 0b1000;
-
-    let Inst{27-24} = 0b0111;
-    let Inst{23-22} = MinOp;
-    let Inst{21}    = u4{3};
-    let Inst{20-16} = Rs;
-    let Inst{13-8}  = S6;
-    let Inst{7-5}   = u4{2-0};
-    let Inst{4-0}   = Rx;
-  }
-
-def S2_tableidxb : tableidxRaw<"tableidxb", 0b00>;
-def S2_tableidxh : tableidxRaw<"tableidxh", 0b01>;
-def S2_tableidxw : tableidxRaw<"tableidxw", 0b10>;
-def S2_tableidxd : tableidxRaw<"tableidxd", 0b11>;
-
-//===----------------------------------------------------------------------===//
-// Template class for 'table index' instructions which are assembler mapped
-// to their :raw format.
-//===----------------------------------------------------------------------===//
-let isPseudo = 1 in
-class tableidx_goodsyntax <string mnemonic>
-  : SInst <(outs IntRegs:$Rx),
-           (ins IntRegs:$_dst_, IntRegs:$Rs, u4_0Imm:$u4, u5_0Imm:$u5),
-           "$Rx = "#mnemonic#"($Rs, #$u4, #$u5)",
-           [], "$Rx = $_dst_" >;
-
-def S2_tableidxb_goodsyntax : tableidx_goodsyntax<"tableidxb">;
-def S2_tableidxh_goodsyntax : tableidx_goodsyntax<"tableidxh">;
-def S2_tableidxw_goodsyntax : tableidx_goodsyntax<"tableidxw">;
-def S2_tableidxd_goodsyntax : tableidx_goodsyntax<"tableidxd">;
-
-//===----------------------------------------------------------------------===//
-// V3 Instructions +
-//===----------------------------------------------------------------------===//
-
-include "HexagonInstrInfoV3.td"
-
-//===----------------------------------------------------------------------===//
-// V3 Instructions -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// V4 Instructions +
-//===----------------------------------------------------------------------===//
-
-include "HexagonInstrInfoV4.td"
-
-//===----------------------------------------------------------------------===//
-// V4 Instructions -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// V5 Instructions +
-//===----------------------------------------------------------------------===//
-
-include "HexagonInstrInfoV5.td"
-
-//===----------------------------------------------------------------------===//
-// V5 Instructions -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// V60 Instructions +
-//===----------------------------------------------------------------------===//
-
-include "HexagonInstrInfoV60.td"
-
-//===----------------------------------------------------------------------===//
-// V60 Instructions -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ALU32/64/Vector +
-//===----------------------------------------------------------------------===///
-
-include "HexagonInstrInfoVector.td"
-
-include "HexagonInstrAlias.td"
-include "HexagonSystemInst.td"
-
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV3.td b/lib/Target/Hexagon/HexagonInstrInfoV3.td
deleted file mode 100644
index 225f944050763f7def9ef5d945a1ed199d1e90c8..0000000000000000000000000000000000000000
--- a/lib/Target/Hexagon/HexagonInstrInfoV3.td
+++ /dev/null
@@ -1,215 +0,0 @@
-//=- HexagonInstrInfoV3.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon V3 instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// J +
-//===----------------------------------------------------------------------===//
-// Call subroutine.
-let isCall = 1, hasSideEffects = 1, isPredicable = 1,
-    isExtended = 0, isExtendable = 1, opExtendable = 0,
-    isExtentSigned = 1, opExtentBits = 24, opExtentAlign = 2 in
-class T_Call<bit CSR, string ExtStr>
-  : JInst<(outs), (ins calltarget:$dst),
-      "call " # ExtStr # "$dst", [], "", J_tc_2early_SLOT23> {
-  let BaseOpcode = "call";
-  bits<24> dst;
-
-  let Defs = !if (CSR, VolatileV3.Regs, []);
-  let IClass = 0b0101;
-  let Inst{27-25} = 0b101;
-  let Inst{24-16,13-1} = dst{23-2};
-  let Inst{0} = 0b0;
-}
-
-let isCall = 1, hasSideEffects = 1, isPredicated = 1,
-    isExtended = 0, isExtendable = 1, opExtendable = 1,
-    isExtentSigned = 1, opExtentBits = 17, opExtentAlign = 2 in
-class T_CallPred<bit CSR, bit IfTrue, string ExtStr>
-  : JInst<(outs), (ins PredRegs:$Pu, calltarget:$dst),
-      CondStr<"$Pu", IfTrue, 0>.S # "call " # ExtStr # "$dst",
-      [], "", J_tc_2early_SLOT23> {
-  let BaseOpcode = "call";
-  let isPredicatedFalse = !if(IfTrue,0,1);
-  bits<2> Pu;
-  bits<17> dst;
-
-  let Defs = !if (CSR, VolatileV3.Regs, []);
-  let IClass = 0b0101;
-  let Inst{27-24} = 0b1101;
-  let Inst{23-22,20-16,13,7-1} = dst{16-2};
-  let Inst{21} = !if(IfTrue,0,1);
-  let Inst{11} = 0b0;
-  let Inst{9-8} = Pu;
-}
-
-multiclass T_Calls<bit CSR, string ExtStr> {
-  def NAME : T_Call<CSR, ExtStr>;
-  def t    : T_CallPred<CSR, 1, ExtStr>;
-  def f    : T_CallPred<CSR, 0, ExtStr>;
-}
-
-defm J2_call: T_Calls<1, "">, PredRel;
-
-let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1,
-    Defs = VolatileV3.Regs in
-def PS_call_nr : T_Call<1, "">, PredRel;
-
-let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1,
-    Defs = [PC, R31, R6, R7, P0] in
-def PS_call_stk :  T_Call<0, "">, PredRel;
-
-//===----------------------------------------------------------------------===//
-// J -
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// JR +
-//===----------------------------------------------------------------------===//
-// Call subroutine from register.
-
-let isCodeGenOnly = 1, Defs = VolatileV3.Regs in {
-  def PS_callr_nr : JUMPR_MISC_CALLR<0, 1>; // Call, no return.
-}
-
-//===----------------------------------------------------------------------===//
-// JR -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ALU64/ALU +
-//===----------------------------------------------------------------------===//
-
-let Defs = [USR_OVF], Itinerary = ALU64_tc_2_SLOT23 in
-def A2_addpsat : T_ALU64_arith<"add", 0b011, 0b101, 1, 0, 1>;
-
-class T_ALU64_addsp_hl<string suffix, bits<3> MinOp>
-  : T_ALU64_rr<"add", suffix, 0b0011, 0b011, MinOp, 0, 0, "">;
-
-def A2_addspl : T_ALU64_addsp_hl<":raw:lo", 0b110>;
-def A2_addsph : T_ALU64_addsp_hl<":raw:hi", 0b111>;
-
-let hasSideEffects = 0, isAsmParserOnly = 1 in
-def A2_addsp : ALU64_rr<(outs DoubleRegs:$Rd),
-  (ins IntRegs:$Rs, DoubleRegs:$Rt), "$Rd = add($Rs, $Rt)", [],
-  "", ALU64_tc_1_SLOT23>;
-
-
-let hasSideEffects = 0 in
-class T_XTYPE_MIN_MAX_P<bit isMax, bit isUnsigned>
-  : ALU64Inst<(outs DoubleRegs:$Rd), (ins DoubleRegs:$Rt, DoubleRegs:$Rs),
-  "$Rd = "#!if(isMax,"max","min")#!if(isUnsigned,"u","")
-          #"($Rt, $Rs)", [], "", ALU64_tc_2_SLOT23> {
-  bits<5> Rd;
-  bits<5> Rs;
-  bits<5> Rt;
-
-  let IClass = 0b1101;
-
-  let Inst{27-23} = 0b00111;
-  let Inst{22-21} = !if(isMax, 0b10, 0b01);
-  let Inst{20-16} = !if(isMax, Rt, Rs);
-  let Inst{12-8} = !if(isMax, Rs, Rt);
-  let Inst{7} = 0b1;
-  let Inst{6} = !if(isMax, 0b0, 0b1);
-  let Inst{5} = isUnsigned;
-  let Inst{4-0} = Rd;
-}
-
-def A2_minp  : T_XTYPE_MIN_MAX_P<0, 0>;
-def A2_minup : T_XTYPE_MIN_MAX_P<0, 1>;
-def A2_maxp  : T_XTYPE_MIN_MAX_P<1, 0>;
-def A2_maxup : T_XTYPE_MIN_MAX_P<1, 1>;
-
-//===----------------------------------------------------------------------===//
-// ALU64/ALU -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// :raw form of vrcmpys:hi/lo insns
-//===----------------------------------------------------------------------===//
-// Vector reduce complex multiply by scalar.
-let Defs = [USR_OVF], hasSideEffects = 0 in
-class T_vrcmpRaw<string HiLo, bits<3>MajOp>:
-  MInst<(outs DoubleRegs:$Rdd),
-         (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
-         "$Rdd = vrcmpys($Rss, $Rtt):<<1:sat:raw:"#HiLo, []> {
-    bits<5> Rdd;
-    bits<5> Rss;
-    bits<5> Rtt;
-
-    let IClass = 0b1110;
-
-    let Inst{27-24} = 0b1000;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = Rss;
-    let Inst{12-8}  = Rtt;
-    let Inst{7-5}   = 0b100;
-    let Inst{4-0}   = Rdd;
-}
-
-def M2_vrcmpys_s1_h: T_vrcmpRaw<"hi", 0b101>;
-def M2_vrcmpys_s1_l: T_vrcmpRaw<"lo", 0b111>;
-
-// Assembler mapped to M2_vrcmpys_s1_h or M2_vrcmpys_s1_l
-let hasSideEffects = 0, isAsmParserOnly = 1 in
-def M2_vrcmpys_s1
- : MInst<(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, IntRegs:$Rt),
- "$Rdd=vrcmpys($Rss,$Rt):<<1:sat">;
-
-// Vector reduce complex multiply by scalar with accumulation.
-let Defs = [USR_OVF], hasSideEffects = 0 in
-class T_vrcmpys_acc<string HiLo, bits<3>MajOp>:
-  MInst <(outs DoubleRegs:$Rxx),
-         (ins DoubleRegs:$_src_, DoubleRegs:$Rss, DoubleRegs:$Rtt),
-  "$Rxx += vrcmpys($Rss, $Rtt):<<1:sat:raw:"#HiLo, [],
-  "$Rxx = $_src_"> {
-    bits<5> Rxx;
-    bits<5> Rss;
-    bits<5> Rtt;
-
-    let IClass = 0b1110;
-
-    let Inst{27-24} = 0b1010;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = Rss;
-    let Inst{12-8}  = Rtt;
-    let Inst{7-5}   = 0b100;
-    let Inst{4-0}   = Rxx;
-  }
-
-def M2_vrcmpys_acc_s1_h: T_vrcmpys_acc<"hi", 0b101>;
-def M2_vrcmpys_acc_s1_l: T_vrcmpys_acc<"lo", 0b111>;
-
-// Assembler mapped to M2_vrcmpys_acc_s1_h or M2_vrcmpys_acc_s1_l
-
-let isAsmParserOnly = 1 in
-def M2_vrcmpys_acc_s1
-  : MInst <(outs DoubleRegs:$dst),
-           (ins DoubleRegs:$dst2, DoubleRegs:$src1, IntRegs:$src2),
-           "$dst += vrcmpys($src1, $src2):<<1:sat", [],
-           "$dst2 = $dst">;
-
-def M2_vrcmpys_s1rp_h : T_MType_vrcmpy <"vrcmpys", 0b101, 0b110, 1>;
-def M2_vrcmpys_s1rp_l : T_MType_vrcmpy <"vrcmpys", 0b101, 0b111, 0>;
-
-// Assembler mapped to M2_vrcmpys_s1rp_h or M2_vrcmpys_s1rp_l
-let isAsmParserOnly = 1 in
-def M2_vrcmpys_s1rp
-  : MInst <(outs IntRegs:$Rd), (ins DoubleRegs:$Rss, IntRegs:$Rt),
-  "$Rd=vrcmpys($Rss,$Rt):<<1:rnd:sat">;
-
-
-// S2_cabacdecbin: Cabac decode bin.
-let Defs = [P0], isPredicateLate = 1, Itinerary = S_3op_tc_1_SLOT23 in
-def S2_cabacdecbin : T_S3op_64 < "decbin", 0b11, 0b110, 0>;
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV4.td b/lib/Target/Hexagon/HexagonInstrInfoV4.td
deleted file mode 100644
index 18943a082d28a2445416ea17f284a643269b4a38..0000000000000000000000000000000000000000
--- a/lib/Target/Hexagon/HexagonInstrInfoV4.td
+++ /dev/null
@@ -1,3301 +0,0 @@
-//=- HexagonInstrInfoV4.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon V4 instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-def DuplexIClass0:  InstDuplex < 0 >;
-def DuplexIClass1:  InstDuplex < 1 >;
-def DuplexIClass2:  InstDuplex < 2 >;
-let isExtendable = 1 in {
-  def DuplexIClass3:  InstDuplex < 3 >;
-  def DuplexIClass4:  InstDuplex < 4 >;
-  def DuplexIClass5:  InstDuplex < 5 >;
-  def DuplexIClass6:  InstDuplex < 6 >;
-  def DuplexIClass7:  InstDuplex < 7 >;
-}
-def DuplexIClass8:  InstDuplex < 8 >;
-def DuplexIClass9:  InstDuplex < 9 >;
-def DuplexIClassA:  InstDuplex < 0xA >;
-def DuplexIClassB:  InstDuplex < 0xB >;
-def DuplexIClassC:  InstDuplex < 0xC >;
-def DuplexIClassD:  InstDuplex < 0xD >;
-def DuplexIClassE:  InstDuplex < 0xE >;
-def DuplexIClassF:  InstDuplex < 0xF >;
-
-let hasSideEffects = 0 in
-class T_Immext<Operand ImmType>
-  : EXTENDERInst<(outs), (ins ImmType:$imm),
-                 "immext(#$imm)", []> {
-    bits<32> imm;
-    let IClass = 0b0000;
-
-    let Inst{27-16} = imm{31-20};
-    let Inst{13-0} = imm{19-6};
-  }
-
-def A4_ext : T_Immext<u26_6Imm>;
-let isCodeGenOnly = 1 in {
-  let isBranch = 1 in
-    def A4_ext_b : T_Immext<brtarget>;
-  let isCall = 1 in
-    def A4_ext_c : T_Immext<calltarget>;
-  def A4_ext_g : T_Immext<globaladdress>;
-}
-
-// Hexagon V4 Architecture spec defines 8 instruction classes:
-// LD ST ALU32 XTYPE J JR MEMOP NV CR SYSTEM(system is not implemented in the
-// compiler)
-
-// LD Instructions:
-// ========================================
-// Loads (8/16/32/64 bit)
-// Deallocframe
-
-// ST Instructions:
-// ========================================
-// Stores (8/16/32/64 bit)
-// Allocframe
-
-// ALU32 Instructions:
-// ========================================
-// Arithmetic / Logical (32 bit)
-// Vector Halfword
-
-// XTYPE Instructions (32/64 bit):
-// ========================================
-// Arithmetic, Logical, Bit Manipulation
-// Multiply (Integer, Fractional, Complex)
-// Permute / Vector Permute Operations
-// Predicate Operations
-// Shift / Shift with Add/Sub/Logical
-// Vector Byte ALU
-// Vector Halfword (ALU, Shift, Multiply)
-// Vector Word (ALU, Shift)
-
-// J Instructions:
-// ========================================
-// Jump/Call PC-relative
-
-// JR Instructions:
-// ========================================
-// Jump/Call Register
-
-// MEMOP Instructions:
-// ========================================
-// Operation on memory (8/16/32 bit)
-
-// NV Instructions:
-// ========================================
-// New-value Jumps
-// New-value Stores
-
-// CR Instructions:
-// ========================================
-// Control-Register Transfers
-// Hardware Loop Setup
-// Predicate Logicals & Reductions
-
-// SYSTEM Instructions (not implemented in the compiler):
-// ========================================
-// Prefetch
-// Cache Maintenance
-// Bus Operations
-
-
-//===----------------------------------------------------------------------===//
-// ALU32 +
-//===----------------------------------------------------------------------===//
-
-class T_ALU32_3op_not<string mnemonic, bits<3> MajOp, bits<3> MinOp,
-                      bit OpsRev>
-  : T_ALU32_3op<mnemonic, MajOp, MinOp, OpsRev, 0> {
-  let AsmString = "$Rd = "#mnemonic#"($Rs, ~$Rt)";
-}
-
-let BaseOpcode = "andn_rr", CextOpcode = "andn" in
-def A4_andn    : T_ALU32_3op_not<"and", 0b001, 0b100, 1>;
-let BaseOpcode = "orn_rr", CextOpcode = "orn" in
-def A4_orn     : T_ALU32_3op_not<"or",  0b001, 0b101, 1>;
-
-let CextOpcode = "rcmp.eq" in
-def A4_rcmpeq  : T_ALU32_3op<"cmp.eq",  0b011, 0b010, 0, 1>;
-let CextOpcode = "!rcmp.eq" in
-def A4_rcmpneq : T_ALU32_3op<"!cmp.eq", 0b011, 0b011, 0, 1>;
-
-def C4_cmpneq  : T_ALU32_3op_cmp<"!cmp.eq",  0b00, 1, 1>;
-def C4_cmplte  : T_ALU32_3op_cmp<"!cmp.gt",  0b10, 1, 0>;
-def C4_cmplteu : T_ALU32_3op_cmp<"!cmp.gtu", 0b11, 1, 0>;
-
-class T_CMP_rrbh<string mnemonic, bits<3> MinOp, bit IsComm>
-  : SInst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, IntRegs:$Rt),
-    "$Pd = "#mnemonic#"($Rs, $Rt)", [], "", S_3op_tc_2early_SLOT23>,
-    ImmRegRel {
-  let InputType = "reg";
-  let CextOpcode = mnemonic;
-  let isCompare = 1;
-  let isCommutable = IsComm;
-  let hasSideEffects = 0;
-
-  bits<2> Pd;
-  bits<5> Rs;
-  bits<5> Rt;
-
-  let IClass = 0b1100;
-  let Inst{27-21} = 0b0111110;
-  let Inst{20-16} = Rs;
-  let Inst{12-8} = Rt;
-  let Inst{7-5} = MinOp;
-  let Inst{1-0} = Pd;
-}
-
-def A4_cmpbeq  : T_CMP_rrbh<"cmpb.eq",  0b110, 1>;
-def A4_cmpbgt  : T_CMP_rrbh<"cmpb.gt",  0b010, 0>;
-def A4_cmpbgtu : T_CMP_rrbh<"cmpb.gtu", 0b111, 0>;
-def A4_cmpheq  : T_CMP_rrbh<"cmph.eq",  0b011, 1>;
-def A4_cmphgt  : T_CMP_rrbh<"cmph.gt",  0b100, 0>;
-def A4_cmphgtu : T_CMP_rrbh<"cmph.gtu", 0b101, 0>;
-
-class T_CMP_ribh<string mnemonic, bits<2> MajOp, bit IsHalf, bit IsComm,
-                 Operand ImmType, bit IsImmExt, bit IsImmSigned, int ImmBits>
-  : ALU64Inst<(outs PredRegs:$Pd), (ins IntRegs:$Rs, ImmType:$Imm),
-    "$Pd = "#mnemonic#"($Rs, #$Imm)", [], "", ALU64_tc_2early_SLOT23>,
-    ImmRegRel {
-  let InputType = "imm";
-  let CextOpcode = mnemonic;
-  let isCompare = 1;
-  let isCommutable = IsComm;
-  let hasSideEffects = 0;
-  let isExtendable = IsImmExt;
-  let opExtendable = !if (IsImmExt, 2, 0);
-  let isExtentSigned = IsImmSigned;
-  let opExtentBits = ImmBits;
-
-  bits<2> Pd;
-  bits<5> Rs;
-  bits<8> Imm;
-
-  let IClass = 0b1101;
-  let Inst{27-24} = 0b1101;
-  let Inst{22-21} = MajOp;
-  let Inst{20-16} = Rs;
-  let Inst{12-5} = Imm;
-  let Inst{4} = 0b0;
-  let Inst{3} = IsHalf;
-  let Inst{1-0} = Pd;
-}
-
-def A4_cmpbeqi  : T_CMP_ribh<"cmpb.eq",  0b00, 0, 1, u8_0Imm, 0, 0, 8>;
-def A4_cmpbgti  : T_CMP_ribh<"cmpb.gt",  0b01, 0, 0, s8_0Imm, 0, 1, 8>;
-def A4_cmpbgtui : T_CMP_ribh<"cmpb.gtu", 0b10, 0, 0, u7_0Ext, 1, 0, 7>;
-def A4_cmpheqi  : T_CMP_ribh<"cmph.eq",  0b00, 1, 1, s8_0Ext, 1, 1, 8>;
-def A4_cmphgti  : T_CMP_ribh<"cmph.gt",  0b01, 1, 0, s8_0Ext, 1, 1, 8>;
-def A4_cmphgtui : T_CMP_ribh<"cmph.gtu", 0b10, 1, 0, u7_0Ext, 1, 0, 7>;
-
-class T_RCMP_EQ_ri<string mnemonic, bit IsNeg>
-  : ALU32_ri<(outs IntRegs:$Rd), (ins IntRegs:$Rs, s8_0Ext:$s8),
-    "$Rd = "#mnemonic#"($Rs, #$s8)", [], "", ALU32_2op_tc_1_SLOT0123>,
-    ImmRegRel {
-  let InputType = "imm";
-  let CextOpcode = !if (IsNeg, "!rcmp.eq", "rcmp.eq");
-  let isExtendable = 1;
-  let opExtendable = 2;
-  let isExtentSigned = 1;
-  let opExtentBits = 8;
-  let hasNewValue = 1;
-
-  bits<5> Rd;
-  bits<5> Rs;
-  bits<8> s8;
-
-  let IClass = 0b0111;
-  let Inst{27-24} = 0b0011;
-  let Inst{22} = 0b1;
-  let Inst{21} = IsNeg;
-  let Inst{20-16} = Rs;
-  let Inst{13} = 0b1;
-  let Inst{12-5} = s8;
-  let Inst{4-0} = Rd;
-}
-
-def A4_rcmpeqi  : T_RCMP_EQ_ri<"cmp.eq",  0>;
-def A4_rcmpneqi : T_RCMP_EQ_ri<"!cmp.eq", 1>;
-
-//===----------------------------------------------------------------------===//
-// ALU32 -
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// ALU32/PERM +
-//===----------------------------------------------------------------------===//
-
-// Combine a word and an immediate into a register pair.
-let hasSideEffects = 0, isExtentSigned = 1, isExtendable = 1,
-    opExtentBits = 8 in
-class T_Combine1 <bits<2> MajOp, dag ins, string AsmStr>
-  : ALU32Inst <(outs DoubleRegs:$Rdd), ins, AsmStr> {
-    bits<5> Rdd;
-    bits<5> Rs;
-    bits<8> s8;
-
-    let IClass      = 0b0111;
-    let Inst{27-24} = 0b0011;
-    let Inst{22-21} = MajOp;
-    let Inst{20-16} = Rs;
-    let Inst{13}    = 0b1;
-    let Inst{12-5}  = s8;
-    let Inst{4-0}   = Rdd;
-  }
-
-let opExtendable = 2 in
-def A4_combineri : T_Combine1<0b00, (ins IntRegs:$Rs, s8_0Ext:$s8),
-                                    "$Rdd = combine($Rs, #$s8)">;
-
-let opExtendable = 1 in
-def A4_combineir : T_Combine1<0b01, (ins s8_0Ext:$s8, IntRegs:$Rs),
-                                    "$Rdd = combine(#$s8, $Rs)">;
-
-// A4_combineii: Set two small immediates.
-let hasSideEffects = 0, isExtendable = 1, opExtentBits = 6, opExtendable = 2 in
-def A4_combineii: ALU32Inst<(outs DoubleRegs:$Rdd), (ins s8_0Imm:$s8, u6_0Ext:$U6),
-  "$Rdd = combine(#$s8, #$U6)"> {
-    bits<5> Rdd;
-    bits<8> s8;
-    bits<6> U6;
-
-    let IClass = 0b0111;
-    let Inst{27-23} = 0b11001;
-    let Inst{20-16} = U6{5-1};
-    let Inst{13}    = U6{0};
-    let Inst{12-5}  = s8;
-    let Inst{4-0}   = Rdd;
-  }
-
-//===----------------------------------------------------------------------===//
-// ALU32/PERM -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// LD +
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Template class for load instructions with Absolute set addressing mode.
-//===----------------------------------------------------------------------===//
-let isExtended = 1, opExtendable = 2, opExtentBits = 6, addrMode = AbsoluteSet,
-    hasSideEffects = 0 in
-class T_LD_abs_set<string mnemonic, RegisterClass RC, bits<4>MajOp>:
-            LDInst<(outs RC:$dst1, IntRegs:$dst2),
-            (ins u6_0Ext:$addr),
-            "$dst1 = "#mnemonic#"($dst2 = #$addr)",
-            []> {
-  bits<7> name;
-  bits<5> dst1;
-  bits<5> dst2;
-  bits<6> addr;
-
-  let IClass = 0b1001;
-  let Inst{27-25} = 0b101;
-  let Inst{24-21} = MajOp;
-  let Inst{13-12} = 0b01;
-  let Inst{4-0}   = dst1;
-  let Inst{20-16} = dst2;
-  let Inst{11-8}  = addr{5-2};
-  let Inst{6-5}   = addr{1-0};
-}
-
-let accessSize = ByteAccess, hasNewValue = 1 in {
-  def L4_loadrb_ap   : T_LD_abs_set <"memb",   IntRegs, 0b1000>;
-  def L4_loadrub_ap  : T_LD_abs_set <"memub",  IntRegs, 0b1001>;
-}
-
-let accessSize = HalfWordAccess, hasNewValue = 1 in {
-  def L4_loadrh_ap  : T_LD_abs_set <"memh",  IntRegs, 0b1010>;
-  def L4_loadruh_ap : T_LD_abs_set <"memuh", IntRegs, 0b1011>;
-  def L4_loadbsw2_ap : T_LD_abs_set <"membh",  IntRegs, 0b0001>;
-  def L4_loadbzw2_ap : T_LD_abs_set <"memubh", IntRegs, 0b0011>;
-}
-
-let accessSize = WordAccess, hasNewValue = 1 in
-  def L4_loadri_ap : T_LD_abs_set <"memw", IntRegs, 0b1100>;
-
-let accessSize = WordAccess in {
-  def L4_loadbzw4_ap : T_LD_abs_set <"memubh", DoubleRegs, 0b0101>;
-  def L4_loadbsw4_ap : T_LD_abs_set <"membh",  DoubleRegs, 0b0111>;
-}
-
-let accessSize = DoubleWordAccess in
-def L4_loadrd_ap : T_LD_abs_set <"memd", DoubleRegs, 0b1110>;
-
-let accessSize = ByteAccess in
-  def L4_loadalignb_ap : T_LD_abs_set <"memb_fifo", DoubleRegs, 0b0100>;
-
-let accessSize = HalfWordAccess in
-def L4_loadalignh_ap : T_LD_abs_set <"memh_fifo", DoubleRegs, 0b0010>;
-
-// Load - Indirect with long offset
-let InputType = "imm", addrMode = BaseLongOffset, isExtended = 1,
-opExtentBits = 6, opExtendable = 3 in
-class T_LoadAbsReg <string mnemonic, string CextOp, RegisterClass RC,
-                    bits<4> MajOp>
-  : LDInst <(outs RC:$dst), (ins IntRegs:$src1, u2_0Imm:$src2, u6_0Ext:$src3),
-  "$dst = "#mnemonic#"($src1<<#$src2 + #$src3)",
-  [] >, ImmRegShl {
-    bits<5> dst;
-    bits<5> src1;
-    bits<2> src2;
-    bits<6> src3;
-    let CextOpcode = CextOp;
-    let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
-
-    let IClass = 0b1001;
-    let Inst{27-25} = 0b110;
-    let Inst{24-21} = MajOp;
-    let Inst{20-16} = src1;
-    let Inst{13}    = src2{1};
-    let Inst{12}    = 0b1;
-    let Inst{11-8}  = src3{5-2};
-    let Inst{7}     = src2{0};
-    let Inst{6-5}   = src3{1-0};
-    let Inst{4-0}   = dst;
-  }
-
-let accessSize = ByteAccess in {
-  def L4_loadrb_ur  : T_LoadAbsReg<"memb",  "LDrib", IntRegs, 0b1000>;
-  def L4_loadrub_ur : T_LoadAbsReg<"memub", "LDriub", IntRegs, 0b1001>;
-  def L4_loadalignb_ur : T_LoadAbsReg<"memb_fifo", "LDrib_fifo",
-                                      DoubleRegs, 0b0100>;
-}
-
-let accessSize = HalfWordAccess in {
-  def L4_loadrh_ur   : T_LoadAbsReg<"memh",   "LDrih",    IntRegs, 0b1010>;
-  def L4_loadruh_ur  : T_LoadAbsReg<"memuh",  "LDriuh",   IntRegs, 0b1011>;
-  def L4_loadbsw2_ur : T_LoadAbsReg<"membh",  "LDribh2",  IntRegs, 0b0001>;
-  def L4_loadbzw2_ur : T_LoadAbsReg<"memubh", "LDriubh2", IntRegs, 0b0011>;
-  def L4_loadalignh_ur : T_LoadAbsReg<"memh_fifo", "LDrih_fifo",
-                                      DoubleRegs, 0b0010>;
-}
-
-let accessSize = WordAccess in {
-  def L4_loadri_ur   : T_LoadAbsReg<"memw", "LDriw", IntRegs, 0b1100>;
-  def L4_loadbsw4_ur : T_LoadAbsReg<"membh", "LDribh4", DoubleRegs, 0b0111>;
-  def L4_loadbzw4_ur : T_LoadAbsReg<"memubh", "LDriubh4", DoubleRegs, 0b0101>;
-}
-
-let accessSize = DoubleWordAccess in
-def L4_loadrd_ur  : T_LoadAbsReg<"memd", "LDrid", DoubleRegs, 0b1110>;
-
-
-//===----------------------------------------------------------------------===//
-// Template classes for the non-predicated load instructions with
-// base + register offset addressing mode
-//===----------------------------------------------------------------------===//
-class T_load_rr <string mnemonic, RegisterClass RC, bits<3> MajOp>:
-   LDInst<(outs RC:$dst), (ins IntRegs:$src1, IntRegs:$src2, u2_0Imm:$u2),
-  "$dst = "#mnemonic#"($src1 + $src2<<#$u2)",
-  [], "", V4LDST_tc_ld_SLOT01>, ImmRegShl, AddrModeRel {
-    bits<5> dst;
-    bits<5> src1;
-    bits<5> src2;
-    bits<2> u2;
-
-    let IClass = 0b0011;
-
-    let Inst{27-24} = 0b1010;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = src1;
-    let Inst{12-8}  = src2;
-    let Inst{13}    = u2{1};
-    let Inst{7}     = u2{0};
-    let Inst{4-0}   = dst;
-  }
-
-//===----------------------------------------------------------------------===//
-// Template classes for the predicated load instructions with
-// base + register offset addressing mode
-//===----------------------------------------------------------------------===//
-let isPredicated =  1 in
-class T_pload_rr <string mnemonic, RegisterClass RC, bits<3> MajOp,
-                  bit isNot, bit isPredNew>:
-   LDInst <(outs RC:$dst),
-           (ins PredRegs:$src1, IntRegs:$src2, IntRegs:$src3, u2_0Imm:$u2),
-  !if(isNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-  ") ")#"$dst = "#mnemonic#"($src2+$src3<<#$u2)",
-  [], "", V4LDST_tc_ld_SLOT01>, AddrModeRel {
-    bits<5> dst;
-    bits<2> src1;
-    bits<5> src2;
-    bits<5> src3;
-    bits<2> u2;
-
-    let isPredicatedFalse = isNot;
-    let isPredicatedNew = isPredNew;
-
-    let IClass = 0b0011;
-
-    let Inst{27-26} = 0b00;
-    let Inst{25}    = isPredNew;
-    let Inst{24}    = isNot;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = src2;
-    let Inst{12-8}  = src3;
-    let Inst{13}    = u2{1};
-    let Inst{7}     = u2{0};
-    let Inst{6-5}   = src1;
-    let Inst{4-0}   = dst;
-  }
-
-//===----------------------------------------------------------------------===//
-// multiclass for load instructions with base + register offset
-// addressing mode
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, addrMode = BaseRegOffset in
-multiclass ld_idxd_shl <string mnemonic, string CextOp, RegisterClass RC,
-                        bits<3> MajOp > {
-  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl,
-      InputType = "reg" in {
-    let isPredicable = 1 in
-    def L4_#NAME#_rr : T_load_rr <mnemonic, RC, MajOp>;
-
-    // Predicated
-    def L4_p#NAME#t_rr : T_pload_rr <mnemonic, RC, MajOp, 0, 0>;
-    def L4_p#NAME#f_rr : T_pload_rr <mnemonic, RC, MajOp, 1, 0>;
-
-    // Predicated new
-    def L4_p#NAME#tnew_rr : T_pload_rr <mnemonic, RC, MajOp, 0, 1>;
-    def L4_p#NAME#fnew_rr : T_pload_rr <mnemonic, RC, MajOp, 1, 1>;
-  }
-}
-
-let hasNewValue = 1, accessSize = ByteAccess in {
-  defm loadrb  : ld_idxd_shl<"memb", "LDrib", IntRegs, 0b000>;
-  defm loadrub : ld_idxd_shl<"memub", "LDriub", IntRegs, 0b001>;
-}
-
-let hasNewValue = 1, accessSize = HalfWordAccess in {
-  defm loadrh  : ld_idxd_shl<"memh", "LDrih", IntRegs, 0b010>;
-  defm loadruh : ld_idxd_shl<"memuh", "LDriuh", IntRegs, 0b011>;
-}
-
-let hasNewValue = 1, accessSize = WordAccess in
-defm loadri : ld_idxd_shl<"memw", "LDriw", IntRegs, 0b100>;
-
-let accessSize = DoubleWordAccess in
-defm loadrd  : ld_idxd_shl<"memd", "LDrid", DoubleRegs, 0b110>;
-
-//===----------------------------------------------------------------------===//
-// LD -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ST +
-//===----------------------------------------------------------------------===//
-///
-//===----------------------------------------------------------------------===//
-// Template class for store instructions with Absolute set addressing mode.
-//===----------------------------------------------------------------------===//
-let isExtended = 1, opExtendable = 1, opExtentBits = 6,
-    addrMode = AbsoluteSet in
-class T_ST_absset <string mnemonic, string BaseOp, RegisterClass RC,
-                   bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0>
-  : STInst<(outs IntRegs:$dst),
-           (ins u6_0Ext:$addr, RC:$src),
-    mnemonic#"($dst = #$addr) = $src"#!if(isHalf, ".h","")>, NewValueRel {
-    bits<5> dst;
-    bits<6> addr;
-    bits<5> src;
-    let accessSize = AccessSz;
-    let BaseOpcode = BaseOp#"_AbsSet";
-
-    // Store upper-half and store doubleword cannot be NV.
-    let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
-
-    let IClass = 0b1010;
-
-    let Inst{27-24} = 0b1011;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = dst;
-    let Inst{13}    = 0b0;
-    let Inst{12-8}  = src;
-    let Inst{7}     = 0b1;
-    let Inst{5-0}   = addr;
-  }
-
-def S4_storerb_ap : T_ST_absset <"memb", "STrib", IntRegs, 0b000, ByteAccess>;
-def S4_storerh_ap : T_ST_absset <"memh", "STrih", IntRegs, 0b010,
-                                 HalfWordAccess>;
-def S4_storeri_ap : T_ST_absset <"memw", "STriw", IntRegs, 0b100, WordAccess>;
-
-let isNVStorable = 0 in {
-  def S4_storerf_ap : T_ST_absset <"memh", "STrif", IntRegs,
-                                   0b011, HalfWordAccess, 1>;
-  def S4_storerd_ap : T_ST_absset <"memd", "STrid", DoubleRegs,
-                                   0b110, DoubleWordAccess>;
-}
-
-let opExtendable = 1, isNewValue = 1, isNVStore = 1, opNewValue = 2,
-isExtended = 1, opExtentBits= 6 in
-class T_ST_absset_nv <string mnemonic, string BaseOp, bits<2> MajOp,
-                      MemAccessSize AccessSz >
-  : NVInst <(outs IntRegs:$dst),
-            (ins u6_0Ext:$addr, IntRegs:$src),
-    mnemonic#"($dst = #$addr) = $src.new">, NewValueRel {
-    bits<5> dst;
-    bits<6> addr;
-    bits<3> src;
-    let accessSize = AccessSz;
-    let BaseOpcode = BaseOp#"_AbsSet";
-
-    let IClass = 0b1010;
-
-    let Inst{27-21} = 0b1011101;
-    let Inst{20-16} = dst;
-    let Inst{13-11} = 0b000;
-    let Inst{12-11} = MajOp;
-    let Inst{10-8}  = src;
-    let Inst{7}     = 0b1;
-    let Inst{5-0}   = addr;
-  }
-
-let mayStore = 1, addrMode = AbsoluteSet in {
-  def S4_storerbnew_ap : T_ST_absset_nv <"memb", "STrib", 0b00, ByteAccess>;
-  def S4_storerhnew_ap : T_ST_absset_nv <"memh", "STrih", 0b01, HalfWordAccess>;
-  def S4_storerinew_ap : T_ST_absset_nv <"memw", "STriw", 0b10, WordAccess>;
-}
-
-let isExtended = 1, opExtendable = 2, opExtentBits = 6, InputType = "imm",
-    addrMode = BaseLongOffset, AddedComplexity = 40 in
-class T_StoreAbsReg <string mnemonic, string CextOp, RegisterClass RC,
-                     bits<3> MajOp, MemAccessSize AccessSz, bit isHalf = 0>
-  : STInst<(outs),
-           (ins IntRegs:$src1, u2_0Imm:$src2, u6_0Ext:$src3, RC:$src4),
-   mnemonic#"($src1<<#$src2 + #$src3) = $src4"#!if(isHalf, ".h",""),
-   []>, ImmRegShl, NewValueRel {
-
-    bits<5> src1;
-    bits<2> src2;
-    bits<6> src3;
-    bits<5> src4;
-
-    let accessSize = AccessSz;
-    let CextOpcode = CextOp;
-    let BaseOpcode = CextOp#"_shl";
-
-    // Store upper-half and store doubleword cannot be NV.
-    let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
-
-    let IClass = 0b1010;
-
-    let Inst{27-24} =0b1101;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = src1;
-    let Inst{13}    = src2{1};
-    let Inst{12-8}  = src4;
-    let Inst{7}     = 0b1;
-    let Inst{6}     = src2{0};
-    let Inst{5-0}   = src3;
-}
-
-def S4_storerb_ur : T_StoreAbsReg <"memb", "STrib", IntRegs, 0b000, ByteAccess>;
-def S4_storerh_ur : T_StoreAbsReg <"memh", "STrih", IntRegs, 0b010,
-                                   HalfWordAccess>;
-def S4_storerf_ur : T_StoreAbsReg <"memh", "STrif", IntRegs, 0b011,
-                                   HalfWordAccess, 1>;
-def S4_storeri_ur : T_StoreAbsReg <"memw", "STriw", IntRegs, 0b100, WordAccess>;
-def S4_storerd_ur : T_StoreAbsReg <"memd", "STrid", DoubleRegs, 0b110,
-                                   DoubleWordAccess>;
-
-let mayStore = 1, isNVStore = 1, isExtended = 1, addrMode = BaseLongOffset,
-    opExtentBits = 6, isNewValue = 1, opNewValue = 3, opExtendable = 2 in
-class T_StoreAbsRegNV <string mnemonic, string CextOp, bits<2> MajOp,
-                       MemAccessSize AccessSz>
-  : NVInst <(outs ),
-            (ins IntRegs:$src1, u2_0Imm:$src2, u6_0Ext:$src3, IntRegs:$src4),
-  mnemonic#"($src1<<#$src2 + #$src3) = $src4.new">, NewValueRel {
-    bits<5> src1;
-    bits<2> src2;
-    bits<6> src3;
-    bits<3> src4;
-
-    let CextOpcode  = CextOp;
-    let BaseOpcode  = CextOp#"_shl";
-    let IClass      = 0b1010;
-
-    let Inst{27-21} = 0b1101101;
-    let Inst{12-11} = 0b00;
-    let Inst{7}     = 0b1;
-    let Inst{20-16} = src1;
-    let Inst{13}    = src2{1};
-    let Inst{12-11} = MajOp;
-    let Inst{10-8}  = src4;
-    let Inst{6}     = src2{0};
-    let Inst{5-0}   = src3;
-  }
-
-def S4_storerbnew_ur : T_StoreAbsRegNV <"memb", "STrib", 0b00, ByteAccess>;
-def S4_storerhnew_ur : T_StoreAbsRegNV <"memh", "STrih", 0b01, HalfWordAccess>;
-def S4_storerinew_ur : T_StoreAbsRegNV <"memw", "STriw", 0b10, WordAccess>;
-
-//===----------------------------------------------------------------------===//
-// Template classes for the non-predicated store instructions with
-// base + register offset addressing mode
-//===----------------------------------------------------------------------===//
-let isPredicable = 1 in
-class T_store_rr <string mnemonic, RegisterClass RC, bits<3> MajOp, bit isH>
-  : STInst < (outs ), (ins IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, RC:$Rt),
-  mnemonic#"($Rs + $Ru<<#$u2) = $Rt"#!if(isH, ".h",""),
-  [],"",V4LDST_tc_st_SLOT01>, ImmRegShl, AddrModeRel {
-
-    bits<5> Rs;
-    bits<5> Ru;
-    bits<2> u2;
-    bits<5> Rt;
-
-    // Store upper-half and store doubleword cannot be NV.
-    let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
-
-    let IClass = 0b0011;
-
-    let Inst{27-24} = 0b1011;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = Rs;
-    let Inst{12-8}  = Ru;
-    let Inst{13}    = u2{1};
-    let Inst{7}     = u2{0};
-    let Inst{4-0}   = Rt;
-  }
-
-//===----------------------------------------------------------------------===//
-// Template classes for the predicated store instructions with
-// base + register offset addressing mode
-//===----------------------------------------------------------------------===//
-let isPredicated = 1 in
-class T_pstore_rr <string mnemonic, RegisterClass RC, bits<3> MajOp,
-                   bit isNot, bit isPredNew, bit isH>
-  : STInst <(outs),
-            (ins PredRegs:$Pv, IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, RC:$Rt),
-
-  !if(isNot, "if (!$Pv", "if ($Pv")#!if(isPredNew, ".new) ",
-  ") ")#mnemonic#"($Rs+$Ru<<#$u2) = $Rt"#!if(isH, ".h",""),
-  [], "", V4LDST_tc_st_SLOT01> , AddrModeRel{
-    bits<2> Pv;
-    bits<5> Rs;
-    bits<5> Ru;
-    bits<2> u2;
-    bits<5> Rt;
-
-    let isPredicatedFalse = isNot;
-    let isPredicatedNew = isPredNew;
-    // Store upper-half and store doubleword cannot be NV.
-    let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isH,0,1));
-
-    let IClass = 0b0011;
-
-    let Inst{27-26} = 0b01;
-    let Inst{25}    = isPredNew;
-    let Inst{24}    = isNot;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = Rs;
-    let Inst{12-8}  = Ru;
-    let Inst{13}    = u2{1};
-    let Inst{7}     = u2{0};
-    let Inst{6-5}   = Pv;
-    let Inst{4-0}   = Rt;
-  }
-
-//===----------------------------------------------------------------------===//
-// Template classes for the new-value store instructions with
-// base + register offset addressing mode
-//===----------------------------------------------------------------------===//
-let isPredicable = 1, isNewValue = 1, opNewValue = 3 in
-class T_store_new_rr <string mnemonic, bits<2> MajOp> :
-  NVInst < (outs ), (ins IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, IntRegs:$Nt),
-  mnemonic#"($Rs + $Ru<<#$u2) = $Nt.new",
-  [],"",V4LDST_tc_st_SLOT0>, ImmRegShl, AddrModeRel {
-
-    bits<5> Rs;
-    bits<5> Ru;
-    bits<2> u2;
-    bits<3> Nt;
-
-    let IClass = 0b0011;
-
-    let Inst{27-21} = 0b1011101;
-    let Inst{20-16} = Rs;
-    let Inst{12-8}  = Ru;
-    let Inst{13}    = u2{1};
-    let Inst{7}     = u2{0};
-    let Inst{4-3}   = MajOp;
-    let Inst{2-0}   = Nt;
-  }
-
-//===----------------------------------------------------------------------===//
-// Template classes for the predicated new-value store instructions with
-// base + register offset addressing mode
-//===----------------------------------------------------------------------===//
-let isPredicated = 1, isNewValue = 1, opNewValue = 4 in
-class T_pstore_new_rr <string mnemonic, bits<2> MajOp, bit isNot, bit isPredNew>
-  : NVInst<(outs),
-           (ins PredRegs:$Pv, IntRegs:$Rs, IntRegs:$Ru, u2_0Imm:$u2, IntRegs:$Nt),
-   !if(isNot, "if (!$Pv", "if ($Pv")#!if(isPredNew, ".new) ",
-   ") ")#mnemonic#"($Rs+$Ru<<#$u2) = $Nt.new",
-   [], "", V4LDST_tc_st_SLOT0>, AddrModeRel {
-    bits<2> Pv;
-    bits<5> Rs;
-    bits<5> Ru;
-    bits<2> u2;
-    bits<3> Nt;
-
-    let isPredicatedFalse = isNot;
-    let isPredicatedNew = isPredNew;
-
-    let IClass = 0b0011;
-    let Inst{27-26} = 0b01;
-    let Inst{25}    = isPredNew;
-    let Inst{24}    = isNot;
-    let Inst{23-21} = 0b101;
-    let Inst{20-16} = Rs;
-    let Inst{12-8}  = Ru;
-    let Inst{13}    = u2{1};
-    let Inst{7}     = u2{0};
-    let Inst{6-5}   = Pv;
-    let Inst{4-3}   = MajOp;
-    let Inst{2-0}   = Nt;
-  }
-
-//===----------------------------------------------------------------------===//
-// multiclass for store instructions with base + register offset addressing
-// mode
-//===----------------------------------------------------------------------===//
-let isNVStorable = 1 in
-multiclass ST_Idxd_shl<string mnemonic, string CextOp, RegisterClass RC,
-                       bits<3> MajOp, bit isH = 0> {
-  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
-    def S4_#NAME#_rr : T_store_rr <mnemonic, RC, MajOp, isH>;
-
-    // Predicated
-    def S4_p#NAME#t_rr : T_pstore_rr <mnemonic, RC, MajOp, 0, 0, isH>;
-    def S4_p#NAME#f_rr : T_pstore_rr <mnemonic, RC, MajOp, 1, 0, isH>;
-
-    // Predicated new
-    def S4_p#NAME#tnew_rr : T_pstore_rr <mnemonic, RC, MajOp, 0, 1, isH>;
-    def S4_p#NAME#fnew_rr : T_pstore_rr <mnemonic, RC, MajOp, 1, 1, isH>;
-  }
-}
-
-//===----------------------------------------------------------------------===//
-// multiclass for new-value store instructions with base + register offset
-// addressing mode.
-//===----------------------------------------------------------------------===//
-let mayStore = 1, isNVStore = 1 in
-multiclass ST_Idxd_shl_nv <string mnemonic, string CextOp, RegisterClass RC,
-                           bits<2> MajOp> {
-  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed_shl in {
-    def S4_#NAME#new_rr : T_store_new_rr<mnemonic, MajOp>;
-
-    // Predicated
-    def S4_p#NAME#newt_rr : T_pstore_new_rr <mnemonic, MajOp, 0, 0>;
-    def S4_p#NAME#newf_rr : T_pstore_new_rr <mnemonic, MajOp, 1, 0>;
-
-    // Predicated new
-    def S4_p#NAME#newtnew_rr : T_pstore_new_rr <mnemonic, MajOp, 0, 1>;
-    def S4_p#NAME#newfnew_rr : T_pstore_new_rr <mnemonic, MajOp, 1, 1>;
-  }
-}
-
-let addrMode = BaseRegOffset, InputType = "reg", hasSideEffects = 0 in {
-  let accessSize = ByteAccess in
-  defm storerb: ST_Idxd_shl<"memb", "STrib", IntRegs, 0b000>,
-                ST_Idxd_shl_nv<"memb", "STrib", IntRegs, 0b00>;
-
-  let accessSize = HalfWordAccess in
-  defm storerh: ST_Idxd_shl<"memh", "STrih", IntRegs, 0b010>,
-                ST_Idxd_shl_nv<"memh", "STrih", IntRegs, 0b01>;
-
-  let accessSize = WordAccess in
-  defm storeri: ST_Idxd_shl<"memw", "STriw", IntRegs, 0b100>,
-                ST_Idxd_shl_nv<"memw", "STriw", IntRegs, 0b10>;
-
-  let isNVStorable = 0, accessSize = DoubleWordAccess in
-  defm storerd: ST_Idxd_shl<"memd", "STrid", DoubleRegs, 0b110>;
-
-  let isNVStorable = 0, accessSize = HalfWordAccess in
-  defm storerf: ST_Idxd_shl<"memh", "STrif", IntRegs, 0b011, 1>;
-}
-
-//===----------------------------------------------------------------------===//
-// Template class
-//===----------------------------------------------------------------------===//
-let isPredicable = 1, isExtendable = 1, isExtentSigned = 1, opExtentBits = 8,
-    opExtendable = 2 in
-class T_StoreImm <string mnemonic, Operand OffsetOp, bits<2> MajOp >
-  : STInst <(outs ), (ins IntRegs:$Rs, OffsetOp:$offset, s8_0Ext:$S8),
-  mnemonic#"($Rs+#$offset)=#$S8",
-  [], "", V4LDST_tc_st_SLOT01>,
-  ImmRegRel, PredNewRel {
-    bits<5> Rs;
-    bits<8> S8;
-    bits<8> offset;
-    bits<6> offsetBits;
-
-    string OffsetOpStr = !cast<string>(OffsetOp);
-    let offsetBits = !if (!eq(OffsetOpStr, "u6_2Imm"), offset{7-2},
-                     !if (!eq(OffsetOpStr, "u6_1Imm"), offset{6-1},
-                                         /* u6_0Imm */ offset{5-0}));
-
-    let IClass = 0b0011;
-
-    let Inst{27-25} = 0b110;
-    let Inst{22-21} = MajOp;
-    let Inst{20-16} = Rs;
-    let Inst{12-7}  = offsetBits;
-    let Inst{13}    = S8{7};
-    let Inst{6-0}   = S8{6-0};
-  }
-
-let isPredicated = 1, isExtendable = 1, isExtentSigned = 1, opExtentBits = 6,
-    opExtendable = 3 in
-class T_StoreImm_pred <string mnemonic, Operand OffsetOp, bits<2> MajOp,
-                       bit isPredNot, bit isPredNew >
-  : STInst <(outs ),
-            (ins PredRegs:$Pv, IntRegs:$Rs, OffsetOp:$offset, s6_0Ext:$S6),
-  !if(isPredNot, "if (!$Pv", "if ($Pv")#!if(isPredNew, ".new) ",
-  ") ")#mnemonic#"($Rs+#$offset)=#$S6",
-  [], "", V4LDST_tc_st_SLOT01>,
-  ImmRegRel, PredNewRel {
-    bits<2> Pv;
-    bits<5> Rs;
-    bits<6> S6;
-    bits<8> offset;
-    bits<6> offsetBits;
-
-    string OffsetOpStr = !cast<string>(OffsetOp);
-    let offsetBits = !if (!eq(OffsetOpStr, "u6_2Imm"), offset{7-2},
-                     !if (!eq(OffsetOpStr, "u6_1Imm"), offset{6-1},
-                                         /* u6_0Imm */ offset{5-0}));
-    let isPredicatedNew = isPredNew;
-    let isPredicatedFalse = isPredNot;
-
-    let IClass = 0b0011;
-
-    let Inst{27-25} = 0b100;
-    let Inst{24}    = isPredNew;
-    let Inst{23}    = isPredNot;
-    let Inst{22-21} = MajOp;
-    let Inst{20-16} = Rs;
-    let Inst{13}    = S6{5};
-    let Inst{12-7}  = offsetBits;
-    let Inst{6-5}   = Pv;
-    let Inst{4-0}   = S6{4-0};
-  }
-
-
-//===----------------------------------------------------------------------===//
-// multiclass for store instructions with base + immediate offset
-// addressing mode and immediate stored value.
-// mem[bhw](Rx++#s4:3)=#s8
-// if ([!]Pv[.new]) mem[bhw](Rx++#s4:3)=#s6
-//===----------------------------------------------------------------------===//
-
-multiclass ST_Imm_Pred <string mnemonic, Operand OffsetOp, bits<2> MajOp,
-                        bit PredNot> {
-  def _io    : T_StoreImm_pred <mnemonic, OffsetOp, MajOp, PredNot, 0>;
-  // Predicate new
-  def new_io : T_StoreImm_pred <mnemonic, OffsetOp, MajOp, PredNot, 1>;
-}
-
-multiclass ST_Imm <string mnemonic, string CextOp, Operand OffsetOp,
-                   bits<2> MajOp> {
-  let CextOpcode = CextOp, BaseOpcode = CextOp#_imm in {
-    def _io : T_StoreImm <mnemonic, OffsetOp, MajOp>;
-
-    defm t : ST_Imm_Pred <mnemonic, OffsetOp, MajOp, 0>;
-    defm f : ST_Imm_Pred <mnemonic, OffsetOp, MajOp, 1>;
-  }
-}
-
-let hasSideEffects = 0, addrMode = BaseImmOffset,
-    InputType = "imm" in {
-  let accessSize = ByteAccess in
-  defm S4_storeirb : ST_Imm<"memb", "STrib", u6_0Imm, 0b00>;
-
-  let accessSize = HalfWordAccess in
-  defm S4_storeirh : ST_Imm<"memh", "STrih", u6_1Imm, 0b01>;
-
-  let accessSize = WordAccess in
-  defm S4_storeiri : ST_Imm<"memw", "STriw", u6_2Imm, 0b10>;
-}
-
-//===----------------------------------------------------------------------===
-// ST -
-//===----------------------------------------------------------------------===
-
-
-//===----------------------------------------------------------------------===//
-// NV/ST +
-//===----------------------------------------------------------------------===//
-
-let opNewValue = 2, opExtendable = 1, isExtentSigned = 1, isPredicable = 1 in
-class T_store_io_nv <string mnemonic, RegisterClass RC,
-                    Operand ImmOp, bits<2>MajOp>
-  : NVInst_V4 <(outs),
-               (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
-  mnemonic#"($src1+#$src2) = $src3.new",
-  [],"",ST_tc_st_SLOT0> {
-    bits<5> src1;
-    bits<13> src2; // Actual address offset
-    bits<3> src3;
-    bits<11> offsetBits; // Represents offset encoding
-
-    let opExtentBits = !if (!eq(mnemonic, "memb"), 11,
-                       !if (!eq(mnemonic, "memh"), 12,
-                       !if (!eq(mnemonic, "memw"), 13, 0)));
-
-    let opExtentAlign = !if (!eq(mnemonic, "memb"), 0,
-                        !if (!eq(mnemonic, "memh"), 1,
-                        !if (!eq(mnemonic, "memw"), 2, 0)));
-
-    let offsetBits = !if (!eq(mnemonic, "memb"),  src2{10-0},
-                     !if (!eq(mnemonic, "memh"),  src2{11-1},
-                     !if (!eq(mnemonic, "memw"),  src2{12-2}, 0)));
-
-    let IClass = 0b1010;
-
-    let Inst{27} = 0b0;
-    let Inst{26-25} = offsetBits{10-9};
-    let Inst{24-21} = 0b1101;
-    let Inst{20-16} = src1;
-    let Inst{13} = offsetBits{8};
-    let Inst{12-11} = MajOp;
-    let Inst{10-8} = src3;
-    let Inst{7-0} = offsetBits{7-0};
-  }
-
-let opExtendable = 2, opNewValue = 3, isPredicated = 1 in
-class T_pstore_io_nv <string mnemonic, RegisterClass RC, Operand predImmOp,
-                         bits<2>MajOp, bit PredNot, bit isPredNew>
-  : NVInst_V4 <(outs),
-               (ins PredRegs:$src1, IntRegs:$src2, predImmOp:$src3, RC:$src4),
-  !if(PredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-  ") ")#mnemonic#"($src2+#$src3) = $src4.new",
-  [],"",V2LDST_tc_st_SLOT0> {
-    bits<2> src1;
-    bits<5> src2;
-    bits<9> src3;
-    bits<3> src4;
-    bits<6> offsetBits; // Represents offset encoding
-
-    let isPredicatedNew = isPredNew;
-    let isPredicatedFalse = PredNot;
-    let opExtentBits = !if (!eq(mnemonic, "memb"), 6,
-                       !if (!eq(mnemonic, "memh"), 7,
-                       !if (!eq(mnemonic, "memw"), 8, 0)));
-
-    let opExtentAlign = !if (!eq(mnemonic, "memb"), 0,
-                        !if (!eq(mnemonic, "memh"), 1,
-                        !if (!eq(mnemonic, "memw"), 2, 0)));
-
-    let offsetBits = !if (!eq(mnemonic, "memb"), src3{5-0},
-                     !if (!eq(mnemonic, "memh"), src3{6-1},
-                     !if (!eq(mnemonic, "memw"), src3{7-2}, 0)));
-
-    let IClass = 0b0100;
-
-    let Inst{27}    = 0b0;
-    let Inst{26}    = PredNot;
-    let Inst{25}    = isPredNew;
-    let Inst{24-21} = 0b0101;
-    let Inst{20-16} = src2;
-    let Inst{13}    = offsetBits{5};
-    let Inst{12-11} = MajOp;
-    let Inst{10-8}  = src4;
-    let Inst{7-3}   = offsetBits{4-0};
-    let Inst{2}     = 0b0;
-    let Inst{1-0}   = src1;
-  }
-
-// multiclass for new-value store instructions with base + immediate offset.
-//
-let mayStore = 1, isNVStore = 1, isNewValue = 1, hasSideEffects = 0,
-    isExtendable = 1 in
-multiclass ST_Idxd_nv<string mnemonic, string CextOp, RegisterClass RC,
-                   Operand ImmOp, Operand predImmOp, bits<2> MajOp> {
-
-  let CextOpcode = CextOp, BaseOpcode = CextOp#_indexed in {
-    def S2_#NAME#new_io : T_store_io_nv <mnemonic, RC, ImmOp, MajOp>;
-    // Predicated
-    def S2_p#NAME#newt_io :T_pstore_io_nv <mnemonic, RC, predImmOp, MajOp, 0, 0>;
-    def S2_p#NAME#newf_io :T_pstore_io_nv <mnemonic, RC, predImmOp, MajOp, 1, 0>;
-    // Predicated new
-    def S4_p#NAME#newtnew_io :T_pstore_io_nv <mnemonic, RC, predImmOp,
-                                              MajOp, 0, 1>;
-    def S4_p#NAME#newfnew_io :T_pstore_io_nv <mnemonic, RC, predImmOp,
-                                              MajOp, 1, 1>;
-  }
-}
-
-let addrMode = BaseImmOffset, InputType = "imm" in {
-  let accessSize = ByteAccess in
-  defm storerb: ST_Idxd_nv<"memb", "STrib", IntRegs, s11_0Ext,
-                           u6_0Ext, 0b00>, AddrModeRel;
-
-  let accessSize = HalfWordAccess, opExtentAlign = 1 in
-  defm storerh: ST_Idxd_nv<"memh", "STrih", IntRegs, s11_1Ext,
-                           u6_1Ext, 0b01>, AddrModeRel;
-
-  let accessSize = WordAccess, opExtentAlign = 2 in
-  defm storeri: ST_Idxd_nv<"memw", "STriw", IntRegs, s11_2Ext,
-                           u6_2Ext, 0b10>, AddrModeRel;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment loads with register offset.
-//===----------------------------------------------------------------------===//
-
-let hasNewValue = 1 in
-def L2_loadbsw2_pr : T_load_pr <"membh", IntRegs, 0b0001, HalfWordAccess>;
-
-def L2_loadbsw4_pr : T_load_pr <"membh", DoubleRegs, 0b0111, WordAccess>;
-
-let hasSideEffects = 0, addrMode = PostInc in
-class T_loadalign_pr <string mnemonic, bits<4> MajOp, MemAccessSize AccessSz>
-  : LDInstPI <(outs DoubleRegs:$dst, IntRegs:$_dst_),
-              (ins DoubleRegs:$src1, IntRegs:$src2, ModRegs:$src3),
-  "$dst = "#mnemonic#"($src2++$src3)", [],
-  "$src1 = $dst, $src2 = $_dst_"> {
-    bits<5> dst;
-    bits<5> src2;
-    bits<1> src3;
-
-    let accessSize = AccessSz;
-    let IClass = 0b1001;
-
-    let Inst{27-25} = 0b110;
-    let Inst{24-21} = MajOp;
-    let Inst{20-16} = src2;
-    let Inst{13}    = src3;
-    let Inst{12}    = 0b0;
-    let Inst{7}     = 0b0;
-    let Inst{4-0}   = dst;
-  }
-
-def L2_loadalignb_pr : T_loadalign_pr <"memb_fifo", 0b0100, ByteAccess>;
-def L2_loadalignh_pr : T_loadalign_pr <"memh_fifo", 0b0010, HalfWordAccess>;
-
-//===----------------------------------------------------------------------===//
-// Template class for non-predicated post increment .new stores
-// mem[bhwd](Rx++#s4:[0123])=Nt.new
-//===----------------------------------------------------------------------===//
-let isPredicable = 1, hasSideEffects = 0, addrMode = PostInc, isNVStore = 1,
-    isNewValue = 1, opNewValue = 3 in
-class T_StorePI_nv <string mnemonic, Operand ImmOp, bits<2> MajOp >
-  : NVInstPI_V4 <(outs IntRegs:$_dst_),
-                 (ins IntRegs:$src1, ImmOp:$offset, IntRegs:$src2),
-  mnemonic#"($src1++#$offset) = $src2.new",
-  [], "$src1 = $_dst_">,
-  AddrModeRel {
-    bits<5> src1;
-    bits<3> src2;
-    bits<7> offset;
-    bits<4> offsetBits;
-
-    string ImmOpStr = !cast<string>(ImmOp);
-    let offsetBits = !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
-                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
-                                      /* s4_0Imm */ offset{3-0}));
-    let IClass = 0b1010;
-
-    let Inst{27-21} = 0b1011101;
-    let Inst{20-16} = src1;
-    let Inst{13} = 0b0;
-    let Inst{12-11} = MajOp;
-    let Inst{10-8} = src2;
-    let Inst{7} = 0b0;
-    let Inst{6-3} = offsetBits;
-    let Inst{1} = 0b0;
-  }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated post increment .new stores
-// if([!]Pv[.new]) mem[bhwd](Rx++#s4:[0123])=Nt.new
-//===----------------------------------------------------------------------===//
-let isPredicated = 1, hasSideEffects = 0, addrMode = PostInc, isNVStore = 1,
-    isNewValue = 1, opNewValue = 4 in
-class T_StorePI_nv_pred <string mnemonic, Operand ImmOp,
-                         bits<2> MajOp, bit isPredNot, bit isPredNew >
-  : NVInstPI_V4 <(outs IntRegs:$_dst_),
-                 (ins PredRegs:$src1, IntRegs:$src2,
-                      ImmOp:$offset, IntRegs:$src3),
-  !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-  ") ")#mnemonic#"($src2++#$offset) = $src3.new",
-  [], "$src2 = $_dst_">,
-  AddrModeRel {
-    bits<2> src1;
-    bits<5> src2;
-    bits<3> src3;
-    bits<7> offset;
-    bits<4> offsetBits;
-
-    string ImmOpStr = !cast<string>(ImmOp);
-    let offsetBits = !if (!eq(ImmOpStr, "s4_2Imm"), offset{5-2},
-                     !if (!eq(ImmOpStr, "s4_1Imm"), offset{4-1},
-                                      /* s4_0Imm */ offset{3-0}));
-    let isPredicatedNew = isPredNew;
-    let isPredicatedFalse = isPredNot;
-
-    let IClass = 0b1010;
-
-    let Inst{27-21} = 0b1011101;
-    let Inst{20-16} = src2;
-    let Inst{13} = 0b1;
-    let Inst{12-11} = MajOp;
-    let Inst{10-8} = src3;
-    let Inst{7} = isPredNew;
-    let Inst{6-3} = offsetBits;
-    let Inst{2} = isPredNot;
-    let Inst{1-0} = src1;
-  }
-
-multiclass ST_PostInc_Pred_nv<string mnemonic, Operand ImmOp,
-                              bits<2> MajOp, bit PredNot> {
-  def _pi : T_StorePI_nv_pred <mnemonic, ImmOp, MajOp, PredNot, 0>;
-
-  // Predicate new
-  def new_pi : T_StorePI_nv_pred <mnemonic, ImmOp, MajOp, PredNot, 1>;
-}
-
-multiclass ST_PostInc_nv<string mnemonic, string BaseOp, Operand ImmOp,
-                         bits<2> MajOp> {
-  let BaseOpcode = "POST_"#BaseOp in {
-    def S2_#NAME#_pi : T_StorePI_nv <mnemonic, ImmOp, MajOp>;
-
-    // Predicated
-    defm S2_p#NAME#t : ST_PostInc_Pred_nv <mnemonic, ImmOp, MajOp, 0>;
-    defm S2_p#NAME#f : ST_PostInc_Pred_nv <mnemonic, ImmOp, MajOp, 1>;
-  }
-}
-
-let accessSize = ByteAccess in
-defm storerbnew: ST_PostInc_nv <"memb", "STrib", s4_0Imm, 0b00>;
-
-let accessSize = HalfWordAccess in
-defm storerhnew: ST_PostInc_nv <"memh", "STrih", s4_1Imm, 0b01>;
-
-let accessSize = WordAccess in
-defm storerinew: ST_PostInc_nv <"memw", "STriw", s4_2Imm, 0b10>;
-
-//===----------------------------------------------------------------------===//
-// Template class for post increment .new stores with register offset
-//===----------------------------------------------------------------------===//
-let isNewValue = 1, mayStore = 1, isNVStore = 1, opNewValue = 3 in
-class T_StorePI_RegNV <string mnemonic, bits<2> MajOp, MemAccessSize AccessSz>
-  : NVInstPI_V4 <(outs IntRegs:$_dst_),
-                 (ins IntRegs:$src1, ModRegs:$src2, IntRegs:$src3),
-  #mnemonic#"($src1++$src2) = $src3.new",
-  [], "$src1 = $_dst_"> {
-    bits<5> src1;
-    bits<1> src2;
-    bits<3> src3;
-    let accessSize = AccessSz;
-
-    let IClass = 0b1010;
-
-    let Inst{27-21} = 0b1101101;
-    let Inst{20-16} = src1;
-    let Inst{13}    = src2;
-    let Inst{12-11} = MajOp;
-    let Inst{10-8}  = src3;
-    let Inst{7}     = 0b0;
-  }
-
-def S2_storerbnew_pr : T_StorePI_RegNV<"memb", 0b00, ByteAccess>;
-def S2_storerhnew_pr : T_StorePI_RegNV<"memh", 0b01, HalfWordAccess>;
-def S2_storerinew_pr : T_StorePI_RegNV<"memw", 0b10, WordAccess>;
-
-// memb(Rx++#s4:0:circ(Mu))=Nt.new
-// memb(Rx++I:circ(Mu))=Nt.new
-// memb(Rx++Mu:brev)=Nt.new
-// memh(Rx++#s4:1:circ(Mu))=Nt.new
-// memh(Rx++I:circ(Mu))=Nt.new
-// memh(Rx++Mu)=Nt.new
-// memh(Rx++Mu:brev)=Nt.new
-
-// memw(Rx++#s4:2:circ(Mu))=Nt.new
-// memw(Rx++I:circ(Mu))=Nt.new
-// memw(Rx++Mu)=Nt.new
-// memw(Rx++Mu:brev)=Nt.new
-
-//===----------------------------------------------------------------------===//
-// NV/ST -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// NV/J +
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// multiclass/template class for the new-value compare jumps with the register
-// operands.
-//===----------------------------------------------------------------------===//
-
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11,
-    opExtentAlign = 2 in
-class NVJrr_template<string mnemonic, bits<3> majOp, bit NvOpNum,
-                      bit isNegCond, bit isTak>
-  : NVInst_V4<(outs),
-    (ins IntRegs:$src1, IntRegs:$src2, brtarget:$offset),
-    "if ("#!if(isNegCond, "!","")#mnemonic#
-    "($src1"#!if(!eq(NvOpNum, 0),".new, ",", ")#
-    "$src2"#!if(!eq(NvOpNum, 1),".new))","))")#" jump:"
-    #!if(isTak, "t","nt")#" $offset", []> {
-
-      bits<5> src1;
-      bits<5> src2;
-      bits<3> Ns;    // New-Value Operand
-      bits<5> RegOp; // Non-New-Value Operand
-      bits<11> offset;
-
-      let isTaken = isTak;
-      let isPredicatedFalse = isNegCond;
-      let opNewValue{0} = NvOpNum;
-
-      let Ns = !if(!eq(NvOpNum, 0), src1{2-0}, src2{2-0});
-      let RegOp = !if(!eq(NvOpNum, 0), src2, src1);
-
-      let IClass = 0b0010;
-      let Inst{27-26} = 0b00;
-      let Inst{25-23} = majOp;
-      let Inst{22} = isNegCond;
-      let Inst{18-16} = Ns;
-      let Inst{13} = isTak;
-      let Inst{12-8} = RegOp;
-      let Inst{21-20} = offset{10-9};
-      let Inst{7-1} = offset{8-2};
-}
-
-
-multiclass NVJrr_cond<string mnemonic, bits<3> majOp, bit NvOpNum,
-                       bit isNegCond> {
-  // Branch not taken:
-  def _nt: NVJrr_template<mnemonic, majOp, NvOpNum, isNegCond, 0>;
-  // Branch taken:
-  def _t : NVJrr_template<mnemonic, majOp, NvOpNum, isNegCond, 1>;
-}
-
-// NvOpNum = 0 -> First Operand is a new-value Register
-// NvOpNum = 1 -> Second Operand is a new-value Register
-
-multiclass NVJrr_base<string mnemonic, string BaseOp, bits<3> majOp,
-                       bit NvOpNum> {
-  let BaseOpcode = BaseOp#_NVJ in {
-    defm _t_jumpnv : NVJrr_cond<mnemonic, majOp, NvOpNum, 0>; // True cond
-    defm _f_jumpnv : NVJrr_cond<mnemonic, majOp, NvOpNum, 1>; // False cond
-  }
-}
-
-// if ([!]cmp.eq(Ns.new,Rt)) jump:[n]t #r9:2
-// if ([!]cmp.gt(Ns.new,Rt)) jump:[n]t #r9:2
-// if ([!]cmp.gtu(Ns.new,Rt)) jump:[n]t #r9:2
-// if ([!]cmp.gt(Rt,Ns.new)) jump:[n]t #r9:2
-// if ([!]cmp.gtu(Rt,Ns.new)) jump:[n]t #r9:2
-
-let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
-    Defs = [PC], hasSideEffects = 0 in {
-  defm J4_cmpeq  : NVJrr_base<"cmp.eq",  "CMPEQ",  0b000, 0>, PredRel;
-  defm J4_cmpgt  : NVJrr_base<"cmp.gt",  "CMPGT",  0b001, 0>, PredRel;
-  defm J4_cmpgtu : NVJrr_base<"cmp.gtu", "CMPGTU", 0b010, 0>, PredRel;
-  defm J4_cmplt  : NVJrr_base<"cmp.gt",  "CMPLT",  0b011, 1>, PredRel;
-  defm J4_cmpltu : NVJrr_base<"cmp.gtu", "CMPLTU", 0b100, 1>, PredRel;
-}
-
-//===----------------------------------------------------------------------===//
-// multiclass/template class for the new-value compare jumps instruction
-// with a register and an unsigned immediate (U5) operand.
-//===----------------------------------------------------------------------===//
-
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 11,
-    opExtentAlign = 2 in
-class NVJri_template<string mnemonic, bits<3> majOp, bit isNegCond,
-                         bit isTak>
-  : NVInst_V4<(outs),
-    (ins IntRegs:$src1, u5_0Imm:$src2, brtarget:$offset),
-    "if ("#!if(isNegCond, "!","")#mnemonic#"($src1.new, #$src2)) jump:"
-    #!if(isTak, "t","nt")#" $offset", []> {
-
-      let isTaken = isTak;
-      let isPredicatedFalse = isNegCond;
-      let isTaken = isTak;
-
-      bits<3> src1;
-      bits<5> src2;
-      bits<11> offset;
-
-      let IClass = 0b0010;
-      let Inst{26} = 0b1;
-      let Inst{25-23} = majOp;
-      let Inst{22} = isNegCond;
-      let Inst{18-16} = src1;
-      let Inst{13} = isTak;
-      let Inst{12-8} = src2;
-      let Inst{21-20} = offset{10-9};
-      let Inst{7-1} = offset{8-2};
-}
-
-multiclass NVJri_cond<string mnemonic, bits<3> majOp, bit isNegCond> {
-  // Branch not taken:
-  def _nt: NVJri_template<mnemonic, majOp, isNegCond, 0>;
-  // Branch taken:
-  def _t : NVJri_template<mnemonic, majOp, isNegCond, 1>;
-}
-
-multiclass NVJri_base<string mnemonic, string BaseOp, bits<3> majOp> {
-  let BaseOpcode = BaseOp#_NVJri in {
-    defm _t_jumpnv : NVJri_cond<mnemonic, majOp, 0>; // True Cond
-    defm _f_jumpnv : NVJri_cond<mnemonic, majOp, 1>; // False cond
-  }
-}
-
-// if ([!]cmp.eq(Ns.new,#U5)) jump:[n]t #r9:2
-// if ([!]cmp.gt(Ns.new,#U5)) jump:[n]t #r9:2
-// if ([!]cmp.gtu(Ns.new,#U5)) jump:[n]t #r9:2
-
-let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator = 1,
-    Defs = [PC], hasSideEffects = 0 in {
-  defm J4_cmpeqi  : NVJri_base<"cmp.eq", "CMPEQ", 0b000>, PredRel;
-  defm J4_cmpgti  : NVJri_base<"cmp.gt", "CMPGT", 0b001>, PredRel;
-  defm J4_cmpgtui : NVJri_base<"cmp.gtu", "CMPGTU", 0b010>, PredRel;
-}
-
-//===----------------------------------------------------------------------===//
-// multiclass/template class for the new-value compare jumps instruction
-// with a register and an hardcoded 0/-1 immediate value.
-//===----------------------------------------------------------------------===//
-
-let isExtendable = 1, isExtentSigned = 1, opExtentBits = 11,
-    opExtentAlign = 2 in
-class NVJ_ConstImm_template<string mnemonic, bits<3> majOp, string ImmVal,
-                            bit isNegCond, bit isTak>
-  : NVInst_V4<(outs),
-    !if(!eq(ImmVal, "{-1}"),
-        (ins IntRegs:$src1, n1Const:$n1, brtarget:$offset),
-        (ins IntRegs:$src1, brtarget:$offset)),
-    "if ("#!if(isNegCond, "!","")#mnemonic
-    #"($src1.new, #" # !if(!eq(ImmVal, "{-1}"), "$n1", ImmVal) # ")) jump:"
-    #!if(isTak, "t","nt")#" $offset", []> {
-
-      let isTaken = isTak;
-      let isPredicatedFalse = isNegCond;
-      let isTaken = isTak;
-      let opExtendable = !if(!eq(ImmVal, "{-1}"), 2, 1);
-
-      bits<3> src1;
-      bits<11> offset;
-      let IClass = 0b0010;
-      let Inst{26} = 0b1;
-      let Inst{25-23} = majOp;
-      let Inst{22} = isNegCond;
-      let Inst{18-16} = src1;
-      let Inst{13} = isTak;
-      let Inst{21-20} = offset{10-9};
-      let Inst{7-1} = offset{8-2};
-}
-
-multiclass NVJ_ConstImm_cond<string mnemonic, bits<3> majOp, string ImmVal,
-                             bit isNegCond> {
-  // Branch not taken:
-  def _nt: NVJ_ConstImm_template<mnemonic, majOp, ImmVal, isNegCond, 0>;
-  // Branch taken:
-  def _t : NVJ_ConstImm_template<mnemonic, majOp, ImmVal, isNegCond, 1>;
-}
-
-multiclass NVJ_ConstImm_base<string mnemonic, string BaseOp, bits<3> majOp,
-                             string ImmVal> {
-  let BaseOpcode = BaseOp#_NVJ_ConstImm in {
-    defm _t_jumpnv : NVJ_ConstImm_cond<mnemonic, majOp, ImmVal, 0>; // True
-    defm _f_jumpnv : NVJ_ConstImm_cond<mnemonic, majOp, ImmVal, 1>; // False
-  }
-}
-
-// if ([!]tstbit(Ns.new,#0)) jump:[n]t #r9:2
-// if ([!]cmp.eq(Ns.new,#-1)) jump:[n]t #r9:2
-// if ([!]cmp.gt(Ns.new,#-1)) jump:[n]t #r9:2
-
-let isPredicated = 1, isBranch = 1, isNewValue = 1, isTerminator=1,
-    Defs = [PC], hasSideEffects = 0 in {
-  defm J4_tstbit0 : NVJ_ConstImm_base<"tstbit", "TSTBIT", 0b011, "0">, PredRel;
-  defm J4_cmpeqn1 : NVJ_ConstImm_base<"cmp.eq", "CMPEQ",  0b100, "{-1}">, PredRel;
-  defm J4_cmpgtn1 : NVJ_ConstImm_base<"cmp.gt", "CMPGT",  0b101, "{-1}">, PredRel;
-}
-
-// J4_hintjumpr: Hint indirect conditional jump.
-let isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
-def J4_hintjumpr: JRInst <
-  (outs),
-  (ins IntRegs:$Rs),
-  "hintjr($Rs)"> {
-    bits<5> Rs;
-    let IClass = 0b0101;
-    let Inst{27-21} = 0b0010101;
-    let Inst{20-16} = Rs;
-  }
-
-//===----------------------------------------------------------------------===//
-// NV/J -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// CR +
-//===----------------------------------------------------------------------===//
-
-// PC-relative add
-let hasNewValue = 1, isExtendable = 1, opExtendable = 1,
-    isExtentSigned = 0, opExtentBits = 6, hasSideEffects = 0, Uses = [PC] in
-def C4_addipc : CRInst <(outs IntRegs:$Rd), (ins u6_0Ext:$u6),
-  "$Rd = add(pc, #$u6)", [], "", CR_tc_2_SLOT3 > {
-    bits<5> Rd;
-    bits<6> u6;
-
-    let IClass = 0b0110;
-    let Inst{27-16} = 0b101001001001;
-    let Inst{12-7} = u6;
-    let Inst{4-0} = Rd;
-  }
-
-
-
-let hasSideEffects = 0 in
-class T_LOGICAL_3OP<string MnOp1, string MnOp2, bits<2> OpBits, bit IsNeg>
-    : CRInst<(outs PredRegs:$Pd),
-             (ins PredRegs:$Ps, PredRegs:$Pt, PredRegs:$Pu),
-             "$Pd = " # MnOp1 # "($Ps, " # MnOp2 # "($Pt, " #
-                   !if (IsNeg,"!","") # "$Pu))",
-             [], "", CR_tc_2early_SLOT23> {
-  bits<2> Pd;
-  bits<2> Ps;
-  bits<2> Pt;
-  bits<2> Pu;
-
-  let IClass = 0b0110;
-  let Inst{27-24} = 0b1011;
-  let Inst{23} = IsNeg;
-  let Inst{22-21} = OpBits;
-  let Inst{20} = 0b1;
-  let Inst{17-16} = Ps;
-  let Inst{13} = 0b0;
-  let Inst{9-8} = Pt;
-  let Inst{7-6} = Pu;
-  let Inst{1-0} = Pd;
-}
-
-def C4_and_and  : T_LOGICAL_3OP<"and", "and", 0b00, 0>;
-def C4_and_or   : T_LOGICAL_3OP<"and", "or",  0b01, 0>;
-def C4_or_and   : T_LOGICAL_3OP<"or",  "and", 0b10, 0>;
-def C4_or_or    : T_LOGICAL_3OP<"or",  "or",  0b11, 0>;
-def C4_and_andn : T_LOGICAL_3OP<"and", "and", 0b00, 1>;
-def C4_and_orn  : T_LOGICAL_3OP<"and", "or",  0b01, 1>;
-def C4_or_andn  : T_LOGICAL_3OP<"or",  "and", 0b10, 1>;
-def C4_or_orn   : T_LOGICAL_3OP<"or",  "or",  0b11, 1>;
-
-//===----------------------------------------------------------------------===//
-// CR -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// XTYPE/ALU +
-//===----------------------------------------------------------------------===//
-
-// Logical with-not instructions.
-def A4_andnp : T_ALU64_logical<"and", 0b001, 1, 0, 1>;
-def A4_ornp  : T_ALU64_logical<"or",  0b011, 1, 0, 1>;
-
-let hasNewValue = 1, hasSideEffects = 0 in
-def S4_parity: ALU64Inst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
-      "$Rd = parity($Rs, $Rt)", [], "", ALU64_tc_2_SLOT23> {
-  bits<5> Rd;
-  bits<5> Rs;
-  bits<5> Rt;
-
-  let IClass = 0b1101;
-  let Inst{27-21} = 0b0101111;
-  let Inst{20-16} = Rs;
-  let Inst{12-8} = Rt;
-  let Inst{4-0} = Rd;
-}
-
-//  Add and accumulate.
-//  Rd=add(Rs,add(Ru,#s6))
-let isExtentSigned = 1, hasNewValue = 1, isExtendable = 1, opExtentBits = 6,
-    opExtendable = 3 in
-def S4_addaddi : ALU64Inst <(outs IntRegs:$Rd),
-                            (ins IntRegs:$Rs, IntRegs:$Ru, s6_0Ext:$s6),
-  "$Rd = add($Rs, add($Ru, #$s6))" , [],
-  "", ALU64_tc_2_SLOT23> {
-    bits<5> Rd;
-    bits<5> Rs;
-    bits<5> Ru;
-    bits<6> s6;
-
-    let IClass = 0b1101;
-
-    let Inst{27-23} = 0b10110;
-    let Inst{22-21} = s6{5-4};
-    let Inst{20-16} = Rs;
-    let Inst{13}    = s6{3};
-    let Inst{12-8}  = Rd;
-    let Inst{7-5}   = s6{2-0};
-    let Inst{4-0}   = Ru;
-  }
-
-let isExtentSigned = 1, hasSideEffects = 0, hasNewValue = 1, isExtendable = 1,
-    opExtentBits = 6, opExtendable = 2 in
-def S4_subaddi: ALU64Inst <(outs IntRegs:$Rd),
-                           (ins IntRegs:$Rs, s6_0Ext:$s6, IntRegs:$Ru),
-  "$Rd = add($Rs, sub(#$s6, $Ru))",
-  [], "", ALU64_tc_2_SLOT23> {
-    bits<5> Rd;
-    bits<5> Rs;
-    bits<6> s6;
-    bits<5> Ru;
-
-    let IClass = 0b1101;
-
-    let Inst{27-23} = 0b10111;
-    let Inst{22-21} = s6{5-4};
-    let Inst{20-16} = Rs;
-    let Inst{13}    = s6{3};
-    let Inst{12-8}  = Rd;
-    let Inst{7-5}   = s6{2-0};
-    let Inst{4-0}   = Ru;
-  }
-
-def S4_extractp_rp : T_S3op_64 < "extract",  0b11, 0b100, 0>;
-def S4_extractp    : T_S2op_extract <"extract",  0b1010, DoubleRegs, u6_0Imm>;
-
-let hasNewValue = 1 in {
-  def S4_extract_rp : T_S3op_extract<"extract",  0b01>;
-  def S4_extract    : T_S2op_extract <"extract",  0b1101, IntRegs, u5_0Imm>;
-}
-
-// Complex add/sub halfwords/words
-let Defs = [USR_OVF] in {
-  def S4_vxaddsubh : T_S3op_64 < "vxaddsubh", 0b01, 0b100, 0, 1>;
-  def S4_vxaddsubw : T_S3op_64 < "vxaddsubw", 0b01, 0b000, 0, 1>;
-  def S4_vxsubaddh : T_S3op_64 < "vxsubaddh", 0b01, 0b110, 0, 1>;
-  def S4_vxsubaddw : T_S3op_64 < "vxsubaddw", 0b01, 0b010, 0, 1>;
-}
-
-let Defs = [USR_OVF] in {
-  def S4_vxaddsubhr : T_S3op_64 < "vxaddsubh", 0b11, 0b000, 0, 1, 1, 1>;
-  def S4_vxsubaddhr : T_S3op_64 < "vxsubaddh", 0b11, 0b010, 0, 1, 1, 1>;
-}
-
-let Itinerary = M_tc_3x_SLOT23, Defs = [USR_OVF] in {
-  def M4_mac_up_s1_sat: T_MType_acc_rr<"+= mpy", 0b011, 0b000, 0, [], 0, 1, 1>;
-  def M4_nac_up_s1_sat: T_MType_acc_rr<"-= mpy", 0b011, 0b001, 0, [], 0, 1, 1>;
-}
-
-// Logical xor with xor accumulation.
-// Rxx^=xor(Rss,Rtt)
-let hasSideEffects = 0 in
-def M4_xor_xacc
-  : SInst <(outs DoubleRegs:$Rxx),
-           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, DoubleRegs:$Rtt),
-  "$Rxx ^= xor($Rss, $Rtt)", [],
-  "$dst2 = $Rxx", S_3op_tc_1_SLOT23> {
-    bits<5> Rxx;
-    bits<5> Rss;
-    bits<5> Rtt;
-
-    let IClass = 0b1100;
-
-    let Inst{27-22} = 0b101010;
-    let Inst{20-16} = Rss;
-    let Inst{12-8}  = Rtt;
-    let Inst{7-5}   = 0b000;
-    let Inst{4-0}   = Rxx;
-  }
-
-// Rotate and reduce bytes
-// Rdd=vrcrotate(Rss,Rt,#u2)
-let hasSideEffects = 0 in
-def S4_vrcrotate
-  : SInst <(outs DoubleRegs:$Rdd),
-           (ins DoubleRegs:$Rss, IntRegs:$Rt, u2_0Imm:$u2),
-  "$Rdd = vrcrotate($Rss, $Rt, #$u2)",
-  [], "", S_3op_tc_3x_SLOT23> {
-    bits<5> Rdd;
-    bits<5> Rss;
-    bits<5> Rt;
-    bits<2> u2;
-
-    let IClass = 0b1100;
-
-    let Inst{27-22} = 0b001111;
-    let Inst{20-16} = Rss;
-    let Inst{13}    = u2{1};
-    let Inst{12-8}  = Rt;
-    let Inst{7-6}   = 0b11;
-    let Inst{5}     = u2{0};
-    let Inst{4-0}   = Rdd;
-  }
-
-// Rotate and reduce bytes with accumulation
-// Rxx+=vrcrotate(Rss,Rt,#u2)
-let hasSideEffects = 0 in
-def S4_vrcrotate_acc
-  : SInst <(outs DoubleRegs:$Rxx),
-           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, IntRegs:$Rt, u2_0Imm:$u2),
-  "$Rxx += vrcrotate($Rss, $Rt, #$u2)", [],
-  "$dst2 = $Rxx", S_3op_tc_3x_SLOT23> {
-    bits<5> Rxx;
-    bits<5> Rss;
-    bits<5> Rt;
-    bits<2> u2;
-
-    let IClass = 0b1100;
-
-    let Inst{27-21} = 0b1011101;
-    let Inst{20-16} = Rss;
-    let Inst{13}    = u2{1};
-    let Inst{12-8}  = Rt;
-    let Inst{5}     = u2{0};
-    let Inst{4-0}   = Rxx;
-  }
-
-// Vector reduce conditional negate halfwords
-let hasSideEffects = 0 in
-def S2_vrcnegh
-  : SInst <(outs DoubleRegs:$Rxx),
-           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, IntRegs:$Rt),
-  "$Rxx += vrcnegh($Rss, $Rt)", [],
-  "$dst2 = $Rxx", S_3op_tc_3x_SLOT23> {
-    bits<5> Rxx;
-    bits<5> Rss;
-    bits<5> Rt;
-
-    let IClass = 0b1100;
-
-    let Inst{27-21} = 0b1011001;
-    let Inst{20-16} = Rss;
-    let Inst{13}    = 0b1;
-    let Inst{12-8}  = Rt;
-    let Inst{7-5}   = 0b111;
-    let Inst{4-0}   = Rxx;
-  }
-
-// Split bitfield
-def A4_bitspliti : T_S2op_2_di <"bitsplit", 0b110, 0b100>;
-
-// Arithmetic/Convergent round
-def A4_cround_ri : T_S2op_2_ii <"cround", 0b111, 0b000>;
-
-def A4_round_ri  : T_S2op_2_ii <"round", 0b111, 0b100>;
-
-let Defs = [USR_OVF] in
-def A4_round_ri_sat : T_S2op_2_ii <"round", 0b111, 0b110, 1>;
-
-// Logical-logical words.
-// Compound or-and -- Rx=or(Ru,and(Rx,#s10))
-let isExtentSigned = 1, hasNewValue = 1, isExtendable = 1, opExtentBits = 10,
-    opExtendable = 3 in
-def S4_or_andix:
-  ALU64Inst<(outs IntRegs:$Rx),
-            (ins IntRegs:$Ru, IntRegs:$_src_, s10_0Ext:$s10),
-  "$Rx = or($Ru, and($_src_, #$s10))" , [] ,
-  "$_src_ = $Rx", ALU64_tc_2_SLOT23> {
-    bits<5> Rx;
-    bits<5> Ru;
-    bits<10> s10;
-
-    let IClass = 0b1101;
-
-    let Inst{27-22} = 0b101001;
-    let Inst{20-16} = Rx;
-    let Inst{21}    = s10{9};
-    let Inst{13-5}  = s10{8-0};
-    let Inst{4-0}   = Ru;
-  }
-
-// Miscellaneous ALU64 instructions.
-//
-let hasNewValue = 1, hasSideEffects = 0 in
-def A4_modwrapu: ALU64Inst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, IntRegs:$Rt),
-      "$Rd = modwrap($Rs, $Rt)", [], "", ALU64_tc_2_SLOT23> {
-  bits<5> Rd;
-  bits<5> Rs;
-  bits<5> Rt;
-
-  let IClass = 0b1101;
-  let Inst{27-21} = 0b0011111;
-  let Inst{20-16} = Rs;
-  let Inst{12-8} = Rt;
-  let Inst{7-5} = 0b111;
-  let Inst{4-0} = Rd;
-}
-
-let hasSideEffects = 0 in
-def A4_bitsplit: ALU64Inst<(outs DoubleRegs:$Rd),
-      (ins IntRegs:$Rs, IntRegs:$Rt),
-      "$Rd = bitsplit($Rs, $Rt)", [], "", ALU64_tc_1_SLOT23> {
-  bits<5> Rd;
-  bits<5> Rs;
-  bits<5> Rt;
-
-  let IClass = 0b1101;
-  let Inst{27-24} = 0b0100;
-  let Inst{21} = 0b1;
-  let Inst{20-16} = Rs;
-  let Inst{12-8} = Rt;
-  let Inst{4-0} = Rd;
-}
-
-let hasSideEffects = 0 in
-def dep_S2_packhl: ALU64Inst<(outs DoubleRegs:$Rd),
-      (ins IntRegs:$Rs, IntRegs:$Rt),
-      "$Rd = packhl($Rs, $Rt):deprecated", [], "", ALU64_tc_1_SLOT23> {
-  bits<5> Rd;
-  bits<5> Rs;
-  bits<5> Rt;
-
-  let IClass = 0b1101;
-  let Inst{27-24} = 0b0100;
-  let Inst{21} = 0b0;
-  let Inst{20-16} = Rs;
-  let Inst{12-8} = Rt;
-  let Inst{4-0} = Rd;
-}
-
-let hasNewValue = 1, hasSideEffects = 0 in
-def dep_A2_addsat: ALU64Inst<(outs IntRegs:$Rd),
-      (ins IntRegs:$Rs, IntRegs:$Rt),
-      "$Rd = add($Rs, $Rt):sat:deprecated", [], "", ALU64_tc_2_SLOT23> {
-  bits<5> Rd;
-  bits<5> Rs;
-  bits<5> Rt;
-
-  let IClass = 0b1101;
-  let Inst{27-21} = 0b0101100;
-  let Inst{20-16} = Rs;
-  let Inst{12-8} = Rt;
-  let Inst{7} = 0b0;
-  let Inst{4-0} = Rd;
-}
-
-let hasNewValue = 1, hasSideEffects = 0 in
-def dep_A2_subsat: ALU64Inst<(outs IntRegs:$Rd),
-      (ins IntRegs:$Rs, IntRegs:$Rt),
-      "$Rd = sub($Rs, $Rt):sat:deprecated", [], "", ALU64_tc_2_SLOT23> {
-  bits<5> Rd;
-  bits<5> Rs;
-  bits<5> Rt;
-
-  let IClass = 0b1101;
-  let Inst{27-21} = 0b0101100;
-  let Inst{20-16} = Rt;
-  let Inst{12-8} = Rs;
-  let Inst{7} = 0b1;
-  let Inst{4-0} = Rd;
-}
-
-// Rx[&|]=xor(Rs,Rt)
-def M4_or_xor   : T_MType_acc_rr < "|= xor", 0b110, 0b001, 0>;
-def M4_and_xor  : T_MType_acc_rr < "&= xor", 0b010, 0b010, 0>;
-
-// Rx[&|^]=or(Rs,Rt)
-def M4_xor_or   : T_MType_acc_rr < "^= or",  0b110, 0b011, 0>;
-
-let CextOpcode = "ORr_ORr" in
-def M4_or_or    : T_MType_acc_rr < "|= or",  0b110, 0b000, 0>;
-def M4_and_or   : T_MType_acc_rr < "&= or",  0b010, 0b001, 0>;
-
-// Rx[&|^]=and(Rs,Rt)
-def M4_xor_and  : T_MType_acc_rr < "^= and", 0b110, 0b010, 0>;
-
-let CextOpcode = "ORr_ANDr" in
-def M4_or_and   : T_MType_acc_rr < "|= and", 0b010, 0b011, 0>;
-def M4_and_and  : T_MType_acc_rr < "&= and", 0b010, 0b000, 0>;
-
-// Rx[&|^]=and(Rs,~Rt)
-def M4_xor_andn : T_MType_acc_rr < "^= and", 0b001, 0b010, 0, [], 1>;
-def M4_or_andn  : T_MType_acc_rr < "|= and", 0b001, 0b000, 0, [], 1>;
-def M4_and_andn : T_MType_acc_rr < "&= and", 0b001, 0b001, 0, [], 1>;
-
-// Compound or-or and or-and
-let isExtentSigned = 1, InputType = "imm", hasNewValue = 1, isExtendable = 1,
-    opExtentBits = 10, opExtendable = 3 in
-class T_CompOR <string mnemonic, bits<2> MajOp, SDNode OpNode>
-  : MInst_acc <(outs IntRegs:$Rx),
-               (ins IntRegs:$src1, IntRegs:$Rs, s10_0Ext:$s10),
-  "$Rx |= "#mnemonic#"($Rs, #$s10)", [],
-  "$src1 = $Rx", ALU64_tc_2_SLOT23>, ImmRegRel {
-    bits<5> Rx;
-    bits<5> Rs;
-    bits<10> s10;
-
-    let IClass = 0b1101;
-
-    let Inst{27-24} = 0b1010;
-    let Inst{23-22} = MajOp;
-    let Inst{20-16} = Rs;
-    let Inst{21}    = s10{9};
-    let Inst{13-5}  = s10{8-0};
-    let Inst{4-0}   = Rx;
-  }
-
-let CextOpcode = "ORr_ANDr" in
-def S4_or_andi : T_CompOR <"and", 0b00, and>;
-
-let CextOpcode = "ORr_ORr" in
-def S4_or_ori : T_CompOR <"or", 0b10, or>;
-
-//    Modulo wrap
-//        Rd=modwrap(Rs,Rt)
-//    Round
-//        Rd=cround(Rs,#u5)
-//        Rd=cround(Rs,Rt)
-//        Rd=round(Rs,#u5)[:sat]
-//        Rd=round(Rs,Rt)[:sat]
-//    Vector reduce add unsigned halfwords
-//        Rd=vraddh(Rss,Rtt)
-//    Vector add bytes
-//        Rdd=vaddb(Rss,Rtt)
-//    Vector conditional negate
-//        Rdd=vcnegh(Rss,Rt)
-//        Rxx+=vrcnegh(Rss,Rt)
-//    Vector maximum bytes
-//        Rdd=vmaxb(Rtt,Rss)
-//    Vector reduce maximum halfwords
-//        Rxx=vrmaxh(Rss,Ru)
-//        Rxx=vrmaxuh(Rss,Ru)
-//    Vector reduce maximum words
-//        Rxx=vrmaxuw(Rss,Ru)
-//        Rxx=vrmaxw(Rss,Ru)
-//    Vector minimum bytes
-//        Rdd=vminb(Rtt,Rss)
-//    Vector reduce minimum halfwords
-//        Rxx=vrminh(Rss,Ru)
-//        Rxx=vrminuh(Rss,Ru)
-//    Vector reduce minimum words
-//        Rxx=vrminuw(Rss,Ru)
-//        Rxx=vrminw(Rss,Ru)
-//    Vector subtract bytes
-//        Rdd=vsubb(Rss,Rtt)
-
-//===----------------------------------------------------------------------===//
-// XTYPE/ALU -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// XTYPE/BIT +
-//===----------------------------------------------------------------------===//
-
-// Bit reverse
-def S2_brevp : T_S2op_3 <"brev", 0b11, 0b110>;
-
-// Bit count
-def S2_ct0p : T_COUNT_LEADING_64<"ct0", 0b111, 0b010>;
-def S2_ct1p : T_COUNT_LEADING_64<"ct1", 0b111, 0b100>;
-def S4_clbpnorm : T_COUNT_LEADING_64<"normamt", 0b011, 0b000>;
-
-let hasSideEffects = 0, hasNewValue = 1 in
-def S4_clbaddi : SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs, s6_0Imm:$s6),
-    "$Rd = add(clb($Rs), #$s6)", [], "", S_2op_tc_2_SLOT23> {
-  bits<5> Rs;
-  bits<5> Rd;
-  bits<6> s6;
-  let IClass = 0b1000;
-  let Inst{27-24} = 0b1100;
-  let Inst{23-21} = 0b001;
-  let Inst{20-16} = Rs;
-  let Inst{13-8} = s6;
-  let Inst{7-5} = 0b000;
-  let Inst{4-0} = Rd;
-}
-
-let hasSideEffects = 0, hasNewValue = 1 in
-def S4_clbpaddi : SInst<(outs IntRegs:$Rd), (ins DoubleRegs:$Rs, s6_0Imm:$s6),
-    "$Rd = add(clb($Rs), #$s6)", [], "", S_2op_tc_2_SLOT23> {
-  bits<5> Rs;
-  bits<5> Rd;
-  bits<6> s6;
-  let IClass = 0b1000;
-  let Inst{27-24} = 0b1000;
-  let Inst{23-21} = 0b011;
-  let Inst{20-16} = Rs;
-  let Inst{13-8} = s6;
-  let Inst{7-5} = 0b010;
-  let Inst{4-0} = Rd;
-}
-
-
-// Bit test/set/clear
-def S4_ntstbit_i : T_TEST_BIT_IMM<"!tstbit", 0b001>;
-def S4_ntstbit_r : T_TEST_BIT_REG<"!tstbit", 1>;
-
-def C4_nbitsset  : T_TEST_BITS_REG<"!bitsset", 0b01, 1>;
-def C4_nbitsclr  : T_TEST_BITS_REG<"!bitsclr", 0b10, 1>;
-def C4_nbitsclri : T_TEST_BITS_IMM<"!bitsclr", 0b10, 1>;
-
-//===----------------------------------------------------------------------===//
-// XTYPE/BIT -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// XTYPE/MPY +
-//===----------------------------------------------------------------------===//
-
-// Rd=add(#u6,mpyi(Rs,#U6)) -- Multiply by immed and add immed.
-
-let hasNewValue = 1, isExtendable = 1, opExtentBits = 6, opExtendable = 1 in
-def M4_mpyri_addi : MInst<(outs IntRegs:$Rd),
-  (ins u6_0Ext:$u6, IntRegs:$Rs, u6_0Imm:$U6),
-  "$Rd = add(#$u6, mpyi($Rs, #$U6))" , [],"",ALU64_tc_3x_SLOT23> {
-    bits<5> Rd;
-    bits<6> u6;
-    bits<5> Rs;
-    bits<6> U6;
-
-    let IClass = 0b1101;
-
-    let Inst{27-24} = 0b1000;
-    let Inst{23}    = U6{5};
-    let Inst{22-21} = u6{5-4};
-    let Inst{20-16} = Rs;
-    let Inst{13}    = u6{3};
-    let Inst{12-8}  = Rd;
-    let Inst{7-5}   = u6{2-0};
-    let Inst{4-0}   = U6{4-0};
-  }
-
-// Rd=add(#u6,mpyi(Rs,Rt))
-let CextOpcode = "ADD_MPY", InputType = "imm", hasNewValue = 1,
-    isExtendable = 1, opExtentBits = 6, opExtendable = 1 in
-def M4_mpyrr_addi : MInst <(outs IntRegs:$Rd),
-  (ins u6_0Ext:$u6, IntRegs:$Rs, IntRegs:$Rt),
-  "$Rd = add(#$u6, mpyi($Rs, $Rt))" , [], "", ALU64_tc_3x_SLOT23>, ImmRegRel {
-    bits<5> Rd;
-    bits<6> u6;
-    bits<5> Rs;
-    bits<5> Rt;
-
-    let IClass = 0b1101;
-
-    let Inst{27-23} = 0b01110;
-    let Inst{22-21} = u6{5-4};
-    let Inst{20-16} = Rs;
-    let Inst{13}    = u6{3};
-    let Inst{12-8}  = Rt;
-    let Inst{7-5}   = u6{2-0};
-    let Inst{4-0}   = Rd;
-  }
-
-let hasNewValue = 1 in
-class T_AddMpy <bit MajOp, PatLeaf ImmPred, dag ins>
-  : ALU64Inst <(outs IntRegs:$dst), ins,
-  "$dst = add($src1, mpyi("#!if(MajOp,"$src3, #$src2))",
-                                      "#$src2, $src3))"), [],
-  "", ALU64_tc_3x_SLOT23> {
-    bits<5> dst;
-    bits<5> src1;
-    bits<8> src2;
-    bits<5> src3;
-
-    let IClass = 0b1101;
-
-    bits<6> ImmValue = !if(MajOp, src2{5-0}, src2{7-2});
-
-    let Inst{27-24} = 0b1111;
-    let Inst{23}    = MajOp;
-    let Inst{22-21} = ImmValue{5-4};
-    let Inst{20-16} = src3;
-    let Inst{13}    = ImmValue{3};
-    let Inst{12-8}  = dst;
-    let Inst{7-5}   = ImmValue{2-0};
-    let Inst{4-0}   = src1;
-  }
-
-def M4_mpyri_addr_u2 : T_AddMpy<0b0, u6_2ImmPred,
-                       (ins IntRegs:$src1, u6_2Imm:$src2, IntRegs:$src3)>;
-
-let isExtendable = 1, opExtentBits = 6, opExtendable = 3,
-    CextOpcode = "ADD_MPY", InputType = "imm" in
-def M4_mpyri_addr : T_AddMpy<0b1, u32_0ImmPred,
-                    (ins IntRegs:$src1, IntRegs:$src3, u6_0Ext:$src2)>, ImmRegRel;
-
-// Rx=add(Ru,mpyi(Rx,Rs))
-let CextOpcode = "ADD_MPY", InputType = "reg", hasNewValue = 1 in
-def M4_mpyrr_addr: MInst_acc <(outs IntRegs:$Rx),
-                              (ins IntRegs:$Ru, IntRegs:$_src_, IntRegs:$Rs),
-  "$Rx = add($Ru, mpyi($_src_, $Rs))", [],
-  "$_src_ = $Rx", M_tc_3x_SLOT23>, ImmRegRel {
-    bits<5> Rx;
-    bits<5> Ru;
-    bits<5> Rs;
-
-    let IClass = 0b1110;
-
-    let Inst{27-21} = 0b0011000;
-    let Inst{12-8} = Rx;
-    let Inst{4-0} = Ru;
-    let Inst{20-16} = Rs;
-  }
-
-
-// Vector reduce multiply word by signed half (32x16)
-//Rdd=vrmpyweh(Rss,Rtt)[:<<1]
-def M4_vrmpyeh_s0 : T_M2_vmpy<"vrmpyweh", 0b010, 0b100, 0, 0, 0>;
-def M4_vrmpyeh_s1 : T_M2_vmpy<"vrmpyweh", 0b110, 0b100, 1, 0, 0>;
-
-//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
-def M4_vrmpyoh_s0 : T_M2_vmpy<"vrmpywoh", 0b001, 0b010, 0, 0, 0>;
-def M4_vrmpyoh_s1 : T_M2_vmpy<"vrmpywoh", 0b101, 0b010, 1, 0, 0>;
-
-//Rdd+=vrmpyweh(Rss,Rtt)[:<<1]
-def M4_vrmpyeh_acc_s0: T_M2_vmpy_acc<"vrmpyweh", 0b001, 0b110, 0, 0>;
-def M4_vrmpyeh_acc_s1: T_M2_vmpy_acc<"vrmpyweh", 0b101, 0b110, 1, 0>;
-
-//Rdd=vrmpywoh(Rss,Rtt)[:<<1]
-def M4_vrmpyoh_acc_s0: T_M2_vmpy_acc<"vrmpywoh", 0b011, 0b110, 0, 0>;
-def M4_vrmpyoh_acc_s1: T_M2_vmpy_acc<"vrmpywoh", 0b111, 0b110, 1, 0>;
-
-// Vector multiply halfwords, signed by unsigned
-// Rdd=vmpyhsu(Rs,Rt)[:<<]:sat
-def M2_vmpy2su_s0 : T_XTYPE_mpy64 < "vmpyhsu", 0b000, 0b111, 1, 0, 0>;
-def M2_vmpy2su_s1 : T_XTYPE_mpy64 < "vmpyhsu", 0b100, 0b111, 1, 1, 0>;
-
-// Rxx+=vmpyhsu(Rs,Rt)[:<<1]:sat
-def M2_vmac2su_s0 : T_XTYPE_mpy64_acc < "vmpyhsu", "+", 0b011, 0b101, 1, 0, 0>;
-def M2_vmac2su_s1 : T_XTYPE_mpy64_acc < "vmpyhsu", "+", 0b111, 0b101, 1, 1, 0>;
-
-// Vector polynomial multiply halfwords
-// Rdd=vpmpyh(Rs,Rt)
-def M4_vpmpyh : T_XTYPE_mpy64 < "vpmpyh", 0b110, 0b111, 0, 0, 0>;
-
-// Rxx^=vpmpyh(Rs,Rt)
-def M4_vpmpyh_acc : T_XTYPE_mpy64_acc < "vpmpyh", "^", 0b101, 0b111, 0, 0, 0>;
-
-// Polynomial multiply words
-// Rdd=pmpyw(Rs,Rt)
-def M4_pmpyw : T_XTYPE_mpy64 < "pmpyw", 0b010, 0b111, 0, 0, 0>;
-
-// Rxx^=pmpyw(Rs,Rt)
-def M4_pmpyw_acc  : T_XTYPE_mpy64_acc < "pmpyw", "^", 0b001, 0b111, 0, 0, 0>;
-
-//===----------------------------------------------------------------------===//
-// XTYPE/MPY -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// ALU64/Vector compare
-//===----------------------------------------------------------------------===//
-//===----------------------------------------------------------------------===//
-// Template class for vector compare
-//===----------------------------------------------------------------------===//
-
-let hasSideEffects = 0 in
-class T_vcmpImm <string Str, bits<2> cmpOp, bits<2> minOp, Operand ImmOprnd>
-  : ALU64_rr <(outs PredRegs:$Pd),
-              (ins DoubleRegs:$Rss, ImmOprnd:$Imm),
-  "$Pd = "#Str#"($Rss, #$Imm)",
-  [], "", ALU64_tc_2early_SLOT23> {
-    bits<2> Pd;
-    bits<5> Rss;
-    bits<32> Imm;
-    bits<8> ImmBits;
-    let ImmBits{6-0} = Imm{6-0};
-    let ImmBits{7} = !if (!eq(cmpOp,0b10), 0b0, Imm{7}); // 0 for vcmp[bhw].gtu
-
-    let IClass = 0b1101;
-
-    let Inst{27-24} = 0b1100;
-    let Inst{22-21} = cmpOp;
-    let Inst{20-16} = Rss;
-    let Inst{12-5} = ImmBits;
-    let Inst{4-3} = minOp;
-    let Inst{1-0} = Pd;
-  }
-
-// Vector compare bytes
-def A4_vcmpbgt   : T_vcmp <"vcmpb.gt", 0b1010>;
-
-let AsmString = "$Pd = any8(vcmpb.eq($Rss, $Rtt))" in
-def A4_vcmpbeq_any : T_vcmp <"any8(vcmpb.gt", 0b1000>;
-
-def A4_vcmpbeqi  : T_vcmpImm <"vcmpb.eq",  0b00, 0b00, u8_0Imm>;
-def A4_vcmpbgti  : T_vcmpImm <"vcmpb.gt",  0b01, 0b00, s8_0Imm>;
-def A4_vcmpbgtui : T_vcmpImm <"vcmpb.gtu", 0b10, 0b00, u7_0Imm>;
-
-// Vector compare halfwords
-def A4_vcmpheqi  : T_vcmpImm <"vcmph.eq",  0b00, 0b01, s8_0Imm>;
-def A4_vcmphgti  : T_vcmpImm <"vcmph.gt",  0b01, 0b01, s8_0Imm>;
-def A4_vcmphgtui : T_vcmpImm <"vcmph.gtu", 0b10, 0b01, u7_0Imm>;
-
-// Vector compare words
-def A4_vcmpweqi  : T_vcmpImm <"vcmpw.eq",  0b00, 0b10, s8_0Imm>;
-def A4_vcmpwgti  : T_vcmpImm <"vcmpw.gt",  0b01, 0b10, s8_0Imm>;
-def A4_vcmpwgtui : T_vcmpImm <"vcmpw.gtu", 0b10, 0b10, u7_0Imm>;
-
-//===----------------------------------------------------------------------===//
-// XTYPE/SHIFT +
-//===----------------------------------------------------------------------===//
-// Shift by immediate and accumulate/logical.
-// Rx=add(#u8,asl(Rx,#U5))  Rx=add(#u8,lsr(Rx,#U5))
-// Rx=sub(#u8,asl(Rx,#U5))  Rx=sub(#u8,lsr(Rx,#U5))
-// Rx=and(#u8,asl(Rx,#U5))  Rx=and(#u8,lsr(Rx,#U5))
-// Rx=or(#u8,asl(Rx,#U5))   Rx=or(#u8,lsr(Rx,#U5))
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 0, opExtentBits = 8,
-    hasNewValue = 1, opNewValue = 0 in
-class T_S4_ShiftOperate<string MnOp, string MnSh, bit asl_lsr,
-                        bits<2> MajOp, InstrItinClass Itin>
-  : MInst_acc<(outs IntRegs:$Rd), (ins u8_0Ext:$u8, IntRegs:$Rx, u5_0Imm:$U5),
-      "$Rd = "#MnOp#"(#$u8, "#MnSh#"($Rx, #$U5))",
-      [], "$Rd = $Rx", Itin> {
-
-  bits<5> Rd;
-  bits<8> u8;
-  bits<5> Rx;
-  bits<5> U5;
-
-  let IClass = 0b1101;
-  let Inst{27-24} = 0b1110;
-  let Inst{23-21} = u8{7-5};
-  let Inst{20-16} = Rd;
-  let Inst{13} = u8{4};
-  let Inst{12-8} = U5;
-  let Inst{7-5} = u8{3-1};
-  let Inst{4} = asl_lsr;
-  let Inst{3} = u8{0};
-  let Inst{2-1} = MajOp;
-}
-
-multiclass T_ShiftOperate<string mnemonic, bits<2> MajOp, InstrItinClass Itin> {
-  def _asl_ri : T_S4_ShiftOperate<mnemonic, "asl", 0, MajOp, Itin>;
-  def _lsr_ri : T_S4_ShiftOperate<mnemonic, "lsr", 1, MajOp, Itin>;
-}
-
-defm S4_addi : T_ShiftOperate<"add", 0b10, ALU64_tc_2_SLOT23>;
-defm S4_andi : T_ShiftOperate<"and", 0b00, ALU64_tc_2_SLOT23>;
-defm S4_ori  : T_ShiftOperate<"or",  0b01, ALU64_tc_1_SLOT23>;
-defm S4_subi : T_ShiftOperate<"sub", 0b11, ALU64_tc_1_SLOT23>;
-
-// Vector conditional negate
-// Rdd=vcnegh(Rss,Rt)
-let Defs = [USR_OVF], Itinerary = S_3op_tc_2_SLOT23 in
-def S2_vcnegh   : T_S3op_shiftVect < "vcnegh",   0b11, 0b01>;
-
-// Rd=[cround|round](Rs,Rt)
-let hasNewValue = 1, Itinerary = S_3op_tc_2_SLOT23 in {
-  def A4_cround_rr    : T_S3op_3 < "cround", IntRegs, 0b11, 0b00>;
-  def A4_round_rr     : T_S3op_3 < "round", IntRegs, 0b11, 0b10>;
-}
-
-// Rd=round(Rs,Rt):sat
-let hasNewValue = 1, Defs = [USR_OVF], Itinerary = S_3op_tc_2_SLOT23 in
-def A4_round_rr_sat : T_S3op_3 < "round", IntRegs, 0b11, 0b11, 1>;
-
-// Rd=[cmpyiwh|cmpyrwh](Rss,Rt):<<1:rnd:sat
-let Defs = [USR_OVF], Itinerary = S_3op_tc_3x_SLOT23 in {
-  def M4_cmpyi_wh     : T_S3op_8<"cmpyiwh", 0b100, 1, 1, 1>;
-  def M4_cmpyr_wh     : T_S3op_8<"cmpyrwh", 0b110, 1, 1, 1>;
-}
-
-// Rdd=[add|sub](Rss,Rtt,Px):carry
-let isPredicateLate = 1, hasSideEffects = 0 in
-class T_S3op_carry <string mnemonic, bits<3> MajOp>
-  : SInst < (outs DoubleRegs:$Rdd, PredRegs:$Px),
-            (ins DoubleRegs:$Rss, DoubleRegs:$Rtt, PredRegs:$Pu),
-  "$Rdd = "#mnemonic#"($Rss, $Rtt, $Pu):carry",
-  [], "$Px = $Pu", S_3op_tc_1_SLOT23 > {
-    bits<5> Rdd;
-    bits<5> Rss;
-    bits<5> Rtt;
-    bits<2> Pu;
-
-    let IClass = 0b1100;
-
-    let Inst{27-24} = 0b0010;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = Rss;
-    let Inst{12-8}  = Rtt;
-    let Inst{6-5}   = Pu;
-    let Inst{4-0}   = Rdd;
-  }
-
-def A4_addp_c : T_S3op_carry < "add", 0b110 >;
-def A4_subp_c : T_S3op_carry < "sub", 0b111 >;
-
-let Itinerary = S_3op_tc_3_SLOT23, hasSideEffects = 0 in
-class T_S3op_6 <string mnemonic, bits<3> MinOp, bit isUnsigned>
-  : SInst <(outs DoubleRegs:$Rxx),
-           (ins DoubleRegs:$dst2, DoubleRegs:$Rss, IntRegs:$Ru),
-  "$Rxx = "#mnemonic#"($Rss, $Ru)" ,
-  [] , "$dst2 = $Rxx"> {
-    bits<5> Rxx;
-    bits<5> Rss;
-    bits<5> Ru;
-
-    let IClass = 0b1100;
-
-    let Inst{27-21} = 0b1011001;
-    let Inst{20-16} = Rss;
-    let Inst{13}    = isUnsigned;
-    let Inst{12-8}  = Rxx;
-    let Inst{7-5}   = MinOp;
-    let Inst{4-0}   = Ru;
-  }
-
-// Vector reduce maximum halfwords
-// Rxx=vrmax[u]h(Rss,Ru)
-def A4_vrmaxh  : T_S3op_6 < "vrmaxh",  0b001, 0>;
-def A4_vrmaxuh : T_S3op_6 < "vrmaxuh", 0b001, 1>;
-
-// Vector reduce maximum words
-// Rxx=vrmax[u]w(Rss,Ru)
-def A4_vrmaxw  : T_S3op_6 < "vrmaxw",  0b010, 0>;
-def A4_vrmaxuw : T_S3op_6 < "vrmaxuw", 0b010, 1>;
-
-// Vector reduce minimum halfwords
-// Rxx=vrmin[u]h(Rss,Ru)
-def A4_vrminh  : T_S3op_6 < "vrminh",  0b101, 0>;
-def A4_vrminuh : T_S3op_6 < "vrminuh", 0b101, 1>;
-
-// Vector reduce minimum words
-// Rxx=vrmin[u]w(Rss,Ru)
-def A4_vrminw  : T_S3op_6 < "vrminw",  0b110, 0>;
-def A4_vrminuw : T_S3op_6 < "vrminuw", 0b110, 1>;
-
-// Shift an immediate left by register amount.
-let hasNewValue = 1, hasSideEffects = 0 in
-def S4_lsli: SInst <(outs IntRegs:$Rd), (ins s6_0Imm:$s6, IntRegs:$Rt),
-  "$Rd = lsl(#$s6, $Rt)" , [], "", S_3op_tc_1_SLOT23> {
-    bits<5> Rd;
-    bits<6> s6;
-    bits<5> Rt;
-
-    let IClass = 0b1100;
-
-    let Inst{27-22} = 0b011010;
-    let Inst{20-16} = s6{5-1};
-    let Inst{12-8}  = Rt;
-    let Inst{7-6}   = 0b11;
-    let Inst{4-0}   = Rd;
-    let Inst{5}     = s6{0};
-  }
-
-//===----------------------------------------------------------------------===//
-// XTYPE/SHIFT -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// MEMOP
-//===----------------------------------------------------------------------===//
-
-
-//===----------------------------------------------------------------------===//
-// Template class for MemOp instructions with the register value.
-//===----------------------------------------------------------------------===//
-class MemOp_rr_base <string opc, bits<2> opcBits, Operand ImmOp,
-                     string memOp, bits<2> memOpBits> :
-      MEMInst_V4<(outs),
-                 (ins IntRegs:$base, ImmOp:$offset, IntRegs:$delta),
-                 opc#"($base+#$offset)"#memOp#"$delta",
-                 []>,
-                 Requires<[UseMEMOP]> {
-
-    bits<5> base;
-    bits<5> delta;
-    bits<32> offset;
-    bits<6> offsetBits; // memb - u6:0 , memh - u6:1, memw - u6:2
-
-    let offsetBits = !if (!eq(opcBits, 0b00), offset{5-0},
-                     !if (!eq(opcBits, 0b01), offset{6-1},
-                     !if (!eq(opcBits, 0b10), offset{7-2},0)));
-
-    let opExtentAlign = opcBits;
-    let IClass = 0b0011;
-    let Inst{27-24} = 0b1110;
-    let Inst{22-21} = opcBits;
-    let Inst{20-16} = base;
-    let Inst{13} = 0b0;
-    let Inst{12-7} = offsetBits;
-    let Inst{6-5} = memOpBits;
-    let Inst{4-0} = delta;
-}
-
-//===----------------------------------------------------------------------===//
-// Template class for MemOp instructions with the immediate value.
-//===----------------------------------------------------------------------===//
-class MemOp_ri_base <string opc, bits<2> opcBits, Operand ImmOp,
-                     string memOp, bits<2> memOpBits> :
-      MEMInst_V4 <(outs),
-                  (ins IntRegs:$base, ImmOp:$offset, u5_0Imm:$delta),
-                  opc#"($base+#$offset)"#memOp#"#$delta"
-                  #!if(memOpBits{1},")", ""), // clrbit, setbit - include ')'
-                  []>,
-                  Requires<[UseMEMOP]> {
-
-    bits<5> base;
-    bits<5> delta;
-    bits<32> offset;
-    bits<6> offsetBits; // memb - u6:0 , memh - u6:1, memw - u6:2
-
-    let offsetBits = !if (!eq(opcBits, 0b00), offset{5-0},
-                     !if (!eq(opcBits, 0b01), offset{6-1},
-                     !if (!eq(opcBits, 0b10), offset{7-2},0)));
-
-    let opExtentAlign = opcBits;
-    let IClass = 0b0011;
-    let Inst{27-24} = 0b1111;
-    let Inst{22-21} = opcBits;
-    let Inst{20-16} = base;
-    let Inst{13} = 0b0;
-    let Inst{12-7} = offsetBits;
-    let Inst{6-5} = memOpBits;
-    let Inst{4-0} = delta;
-}
-
-// multiclass to define MemOp instructions with register operand.
-multiclass MemOp_rr<string opc, bits<2> opcBits, Operand ImmOp> {
-  def L4_add#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " += ", 0b00>; // add
-  def L4_sub#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " -= ", 0b01>; // sub
-  def L4_and#NAME : MemOp_rr_base <opc, opcBits, ImmOp, " &= ", 0b10>; // and
-  def L4_or#NAME  : MemOp_rr_base <opc, opcBits, ImmOp, " |= ", 0b11>; // or
-}
-
-// multiclass to define MemOp instructions with immediate Operand.
-multiclass MemOp_ri<string opc, bits<2> opcBits, Operand ImmOp> {
-  def L4_iadd#NAME : MemOp_ri_base <opc, opcBits, ImmOp, " += ", 0b00 >;
-  def L4_isub#NAME : MemOp_ri_base <opc, opcBits, ImmOp, " -= ", 0b01 >;
-  def L4_iand#NAME : MemOp_ri_base<opc, opcBits, ImmOp, " = clrbit(", 0b10>;
-  def L4_ior#NAME : MemOp_ri_base<opc, opcBits, ImmOp, " = setbit(", 0b11>;
-}
-
-multiclass MemOp_base <string opc, bits<2> opcBits, Operand ImmOp> {
-  defm _#NAME : MemOp_rr <opc, opcBits, ImmOp>;
-  defm _#NAME : MemOp_ri <opc, opcBits, ImmOp>;
-}
-
-// Define MemOp instructions.
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 0 in {
-  let opExtentBits = 6, accessSize = ByteAccess in
-  defm memopb_io : MemOp_base <"memb", 0b00, u6_0Ext>;
-
-  let opExtentBits = 7, accessSize = HalfWordAccess in
-  defm memoph_io : MemOp_base <"memh", 0b01, u6_1Ext>;
-
-  let opExtentBits = 8, accessSize = WordAccess in
-  defm memopw_io : MemOp_base <"memw", 0b10, u6_2Ext>;
-}
-
-
-//===----------------------------------------------------------------------===//
-// XTYPE/PRED +
-//===----------------------------------------------------------------------===//
-
-// Hexagon V4 only supports these flavors of byte/half compare instructions:
-// EQ/GT/GTU. Other flavors like GE/GEU/LT/LTU/LE/LEU are not supported by
-// hardware. However, compiler can still implement these patterns through
-// appropriate patterns combinations based on current implemented patterns.
-// The implemented patterns are: EQ/GT/GTU.
-// Missing patterns are: GE/GEU/LT/LTU/LE/LEU.
-
-// Following instruction is not being extended as it results into the
-// incorrect code for negative numbers.
-// Pd=cmpb.eq(Rs,#u8)
-
-// p=!cmp.eq(r1,#s10)
-def C4_cmpneqi  : T_CMP <"cmp.eq",  0b00, 1, s10_0Ext>;
-def C4_cmpltei  : T_CMP <"cmp.gt",  0b01, 1, s10_0Ext>;
-def C4_cmplteui : T_CMP <"cmp.gtu", 0b10, 1, u9_0Ext>;
-
-//===----------------------------------------------------------------------===//
-// XTYPE/PRED -
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// Multiclass for DeallocReturn
-//===----------------------------------------------------------------------===//
-class L4_RETURN<string mnemonic, bit isNot, bit isPredNew, bit isTak>
-  : LD0Inst<(outs), (ins PredRegs:$src),
-  !if(isNot, "if (!$src", "if ($src")#
-  !if(isPredNew, ".new) ", ") ")#mnemonic#
-  !if(isPredNew, #!if(isTak,":t", ":nt"),""),
-  [], "", LD_tc_3or4stall_SLOT0> {
-
-    bits<2> src;
-    let BaseOpcode = "L4_RETURN";
-    let isPredicatedFalse = isNot;
-    let isPredicatedNew = isPredNew;
-    let isTaken = isTak;
-    let IClass = 0b1001;
-
-    let Inst{27-16} = 0b011000011110;
-
-    let Inst{13} = isNot;
-    let Inst{12} = isTak;
-    let Inst{11} = isPredNew;
-    let Inst{10} = 0b0;
-    let Inst{9-8} = src;
-    let Inst{4-0} = 0b11110;
-  }
-
-// Produce all predicated forms, p, !p, p.new, !p.new, :t, :nt
-multiclass L4_RETURN_PRED<string mnemonic, bit PredNot> {
-  let isPredicated = 1 in {
-    def _#NAME# : L4_RETURN <mnemonic, PredNot, 0, 1>;
-    def _#NAME#new_pnt : L4_RETURN <mnemonic, PredNot, 1, 0>;
-    def _#NAME#new_pt : L4_RETURN <mnemonic, PredNot, 1, 1>;
-  }
-}
-
-multiclass LD_MISC_L4_RETURN<string mnemonic> {
-  let isBarrier = 1, isPredicable = 1 in
-    def NAME : LD0Inst <(outs), (ins), mnemonic, [], "",
-                        LD_tc_3or4stall_SLOT0> {
-      let BaseOpcode = "L4_RETURN";
-      let IClass = 0b1001;
-      let Inst{27-16} = 0b011000011110;
-      let Inst{13-10} = 0b0000;
-      let Inst{4-0} = 0b11110;
-    }
-  defm t : L4_RETURN_PRED<mnemonic, 0 >;
-  defm f : L4_RETURN_PRED<mnemonic, 1 >;
-}
-
-let isReturn = 1, isTerminator = 1,
-    Defs = [R29, R30, R31, PC], Uses = [R30], hasSideEffects = 0 in
-defm L4_return: LD_MISC_L4_RETURN <"dealloc_return">, PredNewRel;
-
-// Restore registers and dealloc return function call.
-let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
-    Defs = [R29, R30, R31, PC], isPredicable = 0, isAsmParserOnly = 1 in {
-  def RESTORE_DEALLOC_RET_JMP_V4 : T_JMP<"">;
-
-  let isExtended = 1, opExtendable = 0 in
-  def RESTORE_DEALLOC_RET_JMP_V4_EXT : T_JMP<"">;
-
-  let Defs = [R14, R15, R28, R29, R30, R31, PC] in {
-    def RESTORE_DEALLOC_RET_JMP_V4_PIC : T_JMP<"">;
-
-    let isExtended = 1, opExtendable = 0 in
-    def RESTORE_DEALLOC_RET_JMP_V4_EXT_PIC : T_JMP<"">;
-  }
-}
-
-// Restore registers and dealloc frame before a tail call.
-let isCall = 1, Defs = [R29, R30, R31, PC], isAsmParserOnly = 1 in {
-  def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : T_Call<0, "">, PredRel;
-
-  let isExtended = 1, opExtendable = 0 in
-  def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT : T_Call<0, "">, PredRel;
-
-  let Defs = [R14, R15, R28, R29, R30, R31, PC] in {
-    def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC : T_Call<0, "">, PredRel;
-
-    let isExtended = 1, opExtendable = 0 in
-    def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT_PIC : T_Call<0, "">, PredRel;
-  }
-}
-
-// Save registers function call.
-let isCall = 1, Uses = [R29, R31], isAsmParserOnly = 1 in {
-  def SAVE_REGISTERS_CALL_V4 : T_Call<0, "">, PredRel;
-
-  let isExtended = 1, opExtendable = 0 in
-  def SAVE_REGISTERS_CALL_V4_EXT : T_Call<0, "">, PredRel;
-
-  let Defs = [P0] in
-  def SAVE_REGISTERS_CALL_V4STK : T_Call<0, "">, PredRel;
-
-  let Defs = [P0], isExtended = 1, opExtendable = 0 in
-  def SAVE_REGISTERS_CALL_V4STK_EXT : T_Call<0, "">, PredRel;
-
-  let Defs = [R14, R15, R28] in
-  def SAVE_REGISTERS_CALL_V4_PIC : T_Call<0, "">, PredRel;
-
-  let Defs = [R14, R15, R28], isExtended = 1, opExtendable = 0 in
-  def SAVE_REGISTERS_CALL_V4_EXT_PIC : T_Call<0, "">, PredRel;
-
-  let Defs = [R14, R15, R28, P0] in
-  def SAVE_REGISTERS_CALL_V4STK_PIC : T_Call<0, "">, PredRel;
-
-  let Defs = [R14, R15, R28, P0], isExtended = 1, opExtendable = 0 in
-  def SAVE_REGISTERS_CALL_V4STK_EXT_PIC : T_Call<0, "">, PredRel;
-}
-
-//===----------------------------------------------------------------------===//
-// Template class for non predicated store instructions with
-// GP-Relative or absolute addressing.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, isPredicable = 1 in
-class T_StoreAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp,
-                    bits<2>MajOp, bit isAbs, bit isHalf>
-  : STInst<(outs), (ins ImmOp:$addr, RC:$src),
-  mnemonic # "(#$addr) = $src"#!if(isHalf, ".h",""),
-  [], "", V2LDST_tc_st_SLOT01> {
-    bits<19> addr;
-    bits<5> src;
-    bits<16> offsetBits;
-
-    string ImmOpStr = !cast<string>(ImmOp);
-    let offsetBits = !if (!eq(ImmOpStr, "u16_3Imm"), addr{18-3},
-                     !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
-                     !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
-                                      /* u16_0Imm */ addr{15-0})));
-    // Store upper-half and store doubleword cannot be NV.
-    let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
-    let Uses = !if (isAbs, [], [GP]);
-
-    let IClass = 0b0100;
-    let Inst{27} = 1;
-    let Inst{26-25} = offsetBits{15-14};
-    let Inst{24}    = 0b0;
-    let Inst{23-22} = MajOp;
-    let Inst{21}    = isHalf;
-    let Inst{20-16} = offsetBits{13-9};
-    let Inst{13}    = offsetBits{8};
-    let Inst{12-8}  = src;
-    let Inst{7-0}   = offsetBits{7-0};
-  }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated store instructions with
-// GP-Relative or absolute addressing.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, isPredicated = 1, opExtentBits = 6, opExtendable = 1 in
-class T_StoreAbs_Pred <string mnemonic, RegisterClass RC, bits<2> MajOp,
-                       bit isHalf, bit isNot, bit isNew>
-  : STInst<(outs), (ins PredRegs:$src1, u32_0MustExt:$absaddr, RC: $src2),
-  !if(isNot, "if (!$src1", "if ($src1")#!if(isNew, ".new) ",
-  ") ")#mnemonic#"(#$absaddr) = $src2"#!if(isHalf, ".h",""),
-  [], "", ST_tc_st_SLOT01>, AddrModeRel {
-    bits<2> src1;
-    bits<6> absaddr;
-    bits<5> src2;
-
-    let isPredicatedNew = isNew;
-    let isPredicatedFalse = isNot;
-    // Store upper-half and store doubleword cannot be NV.
-    let isNVStorable = !if (!eq(mnemonic, "memd"), 0, !if(isHalf,0,1));
-
-    let IClass = 0b1010;
-
-    let Inst{27-24} = 0b1111;
-    let Inst{23-22} = MajOp;
-    let Inst{21}    = isHalf;
-    let Inst{17-16} = absaddr{5-4};
-    let Inst{13}    = isNew;
-    let Inst{12-8}  = src2;
-    let Inst{7}     = 0b1;
-    let Inst{6-3}   = absaddr{3-0};
-    let Inst{2}     = isNot;
-    let Inst{1-0}   = src1;
-  }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated store instructions with absolute addressing.
-//===----------------------------------------------------------------------===//
-class T_StoreAbs <string mnemonic, RegisterClass RC, Operand ImmOp,
-                 bits<2> MajOp, bit isHalf>
-  : T_StoreAbsGP <mnemonic, RC, u32_0MustExt, MajOp, 1, isHalf>,
-                  AddrModeRel {
-  string ImmOpStr = !cast<string>(ImmOp);
-  let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
-                     !if (!eq(ImmOpStr, "u16_2Imm"), 18,
-                     !if (!eq(ImmOpStr, "u16_1Imm"), 17,
-                                      /* u16_0Imm */ 16)));
-
-  let opExtentAlign = !if (!eq(ImmOpStr, "u16_3Imm"), 3,
-                      !if (!eq(ImmOpStr, "u16_2Imm"), 2,
-                      !if (!eq(ImmOpStr, "u16_1Imm"), 1,
-                                       /* u16_0Imm */ 0)));
-}
-
-//===----------------------------------------------------------------------===//
-// Multiclass for store instructions with absolute addressing.
-//===----------------------------------------------------------------------===//
-let addrMode = Absolute, isExtended = 1 in
-multiclass ST_Abs<string mnemonic, string CextOp, RegisterClass RC,
-                  Operand ImmOp, bits<2> MajOp, bit isHalf = 0> {
-  let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
-    let opExtendable = 0, isPredicable = 1 in
-    def PS_#NAME#abs : T_StoreAbs <mnemonic, RC, ImmOp, MajOp, isHalf>;
-
-    // Predicated
-    def S4_p#NAME#t_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 0, 0>;
-    def S4_p#NAME#f_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 1, 0>;
-
-    // .new Predicated
-    def S4_p#NAME#tnew_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 0, 1>;
-    def S4_p#NAME#fnew_abs : T_StoreAbs_Pred<mnemonic, RC, MajOp, isHalf, 1, 1>;
-  }
-}
-
-//===----------------------------------------------------------------------===//
-// Template class for non predicated new-value store instructions with
-// GP-Relative or absolute addressing.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, isPredicable = 1, mayStore = 1, isNVStore = 1,
-    isNewValue = 1, opNewValue = 1 in
-class T_StoreAbsGP_NV <string mnemonic, Operand ImmOp, bits<2>MajOp>
-  : NVInst_V4<(outs), (ins ImmOp:$addr, IntRegs:$src),
-  mnemonic #"(#$addr) = $src.new",
-  [], "", V2LDST_tc_st_SLOT0> {
-    bits<19> addr;
-    bits<3> src;
-    bits<16> offsetBits;
-
-    string ImmOpStr = !cast<string>(ImmOp);
-    let offsetBits = !if (!eq(ImmOpStr, "u16_3Imm"), addr{18-3},
-                     !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
-                     !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
-                                      /* u16_0Imm */ addr{15-0})));
-    let IClass = 0b0100;
-
-    let Inst{27} = 1;
-    let Inst{26-25} = offsetBits{15-14};
-    let Inst{24-21} = 0b0101;
-    let Inst{20-16} = offsetBits{13-9};
-    let Inst{13}    = offsetBits{8};
-    let Inst{12-11} = MajOp;
-    let Inst{10-8}  = src;
-    let Inst{7-0}   = offsetBits{7-0};
-  }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated new-value store instructions with
-// absolute addressing.
-//===----------------------------------------------------------------------===//
-let hasSideEffects = 0, isPredicated = 1, mayStore = 1, isNVStore = 1,
-    isNewValue = 1, opNewValue = 2, opExtentBits = 6, opExtendable = 1 in
-class T_StoreAbs_NV_Pred <string mnemonic, bits<2> MajOp, bit isNot, bit isNew>
-  : NVInst_V4<(outs), (ins PredRegs:$src1, u32_0MustExt:$absaddr, IntRegs:$src2),
-  !if(isNot, "if (!$src1", "if ($src1")#!if(isNew, ".new) ",
-  ") ")#mnemonic#"(#$absaddr) = $src2.new",
-  [], "", ST_tc_st_SLOT0>, AddrModeRel {
-    bits<2> src1;
-    bits<6> absaddr;
-    bits<3> src2;
-
-    let isPredicatedNew = isNew;
-    let isPredicatedFalse = isNot;
-
-    let IClass = 0b1010;
-
-    let Inst{27-24} = 0b1111;
-    let Inst{23-21} = 0b101;
-    let Inst{17-16} = absaddr{5-4};
-    let Inst{13}    = isNew;
-    let Inst{12-11} = MajOp;
-    let Inst{10-8}  = src2;
-    let Inst{7}     = 0b1;
-    let Inst{6-3}   = absaddr{3-0};
-    let Inst{2}     = isNot;
-    let Inst{1-0}   = src1;
-}
-
-//===----------------------------------------------------------------------===//
-// Template class for non-predicated new-value store instructions with
-// absolute addressing.
-//===----------------------------------------------------------------------===//
-class T_StoreAbs_NV <string mnemonic, Operand ImmOp, bits<2> MajOp>
-  : T_StoreAbsGP_NV <mnemonic, u32_0MustExt, MajOp>, AddrModeRel {
-
-  string ImmOpStr = !cast<string>(ImmOp);
-  let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
-                     !if (!eq(ImmOpStr, "u16_2Imm"), 18,
-                     !if (!eq(ImmOpStr, "u16_1Imm"), 17,
-                                      /* u16_0Imm */ 16)));
-
-  let opExtentAlign = !if (!eq(ImmOpStr, "u16_3Imm"), 3,
-                      !if (!eq(ImmOpStr, "u16_2Imm"), 2,
-                      !if (!eq(ImmOpStr, "u16_1Imm"), 1,
-                                       /* u16_0Imm */ 0)));
-}
-
-//===----------------------------------------------------------------------===//
-// Multiclass for new-value store instructions with absolute addressing.
-//===----------------------------------------------------------------------===//
-let addrMode = Absolute, isExtended = 1  in
-multiclass ST_Abs_NV <string mnemonic, string CextOp, Operand ImmOp,
-                   bits<2> MajOp> {
-  let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
-    let opExtendable = 0, isPredicable = 1 in
-    def PS_#NAME#newabs : T_StoreAbs_NV <mnemonic, ImmOp, MajOp>;
-
-    // Predicated
-    def S4_p#NAME#newt_abs  : T_StoreAbs_NV_Pred <mnemonic, MajOp, 0, 0>;
-    def S4_p#NAME#newf_abs  : T_StoreAbs_NV_Pred <mnemonic, MajOp, 1, 0>;
-
-    // .new Predicated
-    def S4_p#NAME#newtnew_abs : T_StoreAbs_NV_Pred <mnemonic, MajOp, 0, 1>;
-    def S4_p#NAME#newfnew_abs : T_StoreAbs_NV_Pred <mnemonic, MajOp, 1, 1>;
-  }
-}
-
-//===----------------------------------------------------------------------===//
-// Stores with absolute addressing
-//===----------------------------------------------------------------------===//
-let accessSize = ByteAccess in
-defm storerb : ST_Abs    <"memb", "STrib", IntRegs, u16_0Imm, 0b00>,
-               ST_Abs_NV <"memb", "STrib", u16_0Imm, 0b00>;
-
-let accessSize = HalfWordAccess in
-defm storerh : ST_Abs    <"memh", "STrih", IntRegs, u16_1Imm, 0b01>,
-               ST_Abs_NV <"memh", "STrih", u16_1Imm, 0b01>;
-
-let accessSize = WordAccess in
-defm storeri : ST_Abs    <"memw", "STriw", IntRegs, u16_2Imm, 0b10>,
-               ST_Abs_NV <"memw", "STriw", u16_2Imm, 0b10>;
-
-let isNVStorable = 0, accessSize = DoubleWordAccess in
-defm storerd : ST_Abs <"memd", "STrid", DoubleRegs, u16_3Imm, 0b11>;
-
-let isNVStorable = 0, accessSize = HalfWordAccess in
-defm storerf : ST_Abs <"memh", "STrif", IntRegs, u16_1Imm, 0b01, 1>;
-
-//===----------------------------------------------------------------------===//
-// GP-relative stores.
-// mem[bhwd](#global)=Rt
-// Once predicated, these instructions map to absolute addressing mode.
-// if ([!]Pv[.new]) mem[bhwd](##global)=Rt
-//===----------------------------------------------------------------------===//
-
-let Uses = [GP], isAsmParserOnly = 1 in
-class T_StoreGP <string mnemonic, string BaseOp, RegisterClass RC,
-                 Operand ImmOp, bits<2> MajOp, bit isHalf = 0>
-  : T_StoreAbsGP <mnemonic, RC, ImmOp, MajOp, 0, isHalf> {
-    // Set BaseOpcode same as absolute addressing instructions so that
-    // non-predicated GP-Rel instructions can have relate with predicated
-    // Absolute instruction.
-    let BaseOpcode = BaseOp#_abs;
-  }
-
-let Uses = [GP], isAsmParserOnly = 1 in
-multiclass ST_GP <string mnemonic, string BaseOp, Operand ImmOp,
-                  bits<2> MajOp, bit isHalf = 0> {
-  // Set BaseOpcode same as absolute addressing instructions so that
-  // non-predicated GP-Rel instructions can have relate with predicated
-  // Absolute instruction.
-  let BaseOpcode = BaseOp#_abs in {
-    def NAME#gp : T_StoreAbsGP <mnemonic, IntRegs, ImmOp, MajOp,
-                                0, isHalf>;
-    // New-value store
-    def NAME#newgp : T_StoreAbsGP_NV <mnemonic, ImmOp, MajOp> ;
-  }
-}
-
-let accessSize = ByteAccess in
-defm S2_storerb : ST_GP<"memb", "STrib", u16_0Imm, 0b00>, NewValueRel;
-
-let accessSize = HalfWordAccess in
-defm S2_storerh : ST_GP<"memh", "STrih", u16_1Imm, 0b01>, NewValueRel;
-
-let accessSize = WordAccess in
-defm S2_storeri : ST_GP<"memw", "STriw", u16_2Imm, 0b10>, NewValueRel;
-
-let isNVStorable = 0, accessSize = DoubleWordAccess in
-def S2_storerdgp : T_StoreGP <"memd", "STrid", DoubleRegs,
-                              u16_3Imm, 0b11>, PredNewRel;
-
-let isNVStorable = 0, accessSize = HalfWordAccess in
-def S2_storerfgp : T_StoreGP <"memh", "STrif", IntRegs,
-                              u16_1Imm, 0b01, 1>, PredNewRel;
-
-//===----------------------------------------------------------------------===//
-// Template class for non predicated load instructions with
-// absolute addressing mode.
-//===----------------------------------------------------------------------===//
-let isPredicable = 1, hasSideEffects = 0 in
-class T_LoadAbsGP <string mnemonic, RegisterClass RC, Operand ImmOp,
-                   bits<3> MajOp>
-  : LDInst <(outs RC:$dst), (ins ImmOp:$addr),
-  "$dst = "#mnemonic# "(#$addr)",
-  [], "", V2LDST_tc_ld_SLOT01> {
-    bits<5> dst;
-    bits<19> addr;
-    bits<16> offsetBits;
-
-    string ImmOpStr = !cast<string>(ImmOp);
-    let offsetBits = !if (!eq(ImmOpStr, "u16_3Imm"), addr{18-3},
-                     !if (!eq(ImmOpStr, "u16_2Imm"), addr{17-2},
-                     !if (!eq(ImmOpStr, "u16_1Imm"), addr{16-1},
-                                      /* u16_0Imm */ addr{15-0})));
-
-    let IClass = 0b0100;
-
-    let Inst{27}    = 0b1;
-    let Inst{26-25} = offsetBits{15-14};
-    let Inst{24}    = 0b1;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = offsetBits{13-9};
-    let Inst{13-5}  = offsetBits{8-0};
-    let Inst{4-0}   = dst;
-  }
-
-class T_LoadAbs <string mnemonic, RegisterClass RC, Operand ImmOp,
-                 bits<3> MajOp>
-  : T_LoadAbsGP <mnemonic, RC, u32_0MustExt, MajOp>, AddrModeRel {
-
-    string ImmOpStr = !cast<string>(ImmOp);
-    let opExtentBits = !if (!eq(ImmOpStr, "u16_3Imm"), 19,
-                       !if (!eq(ImmOpStr, "u16_2Imm"), 18,
-                       !if (!eq(ImmOpStr, "u16_1Imm"), 17,
-                                        /* u16_0Imm */ 16)));
-
-    let opExtentAlign = !if (!eq(ImmOpStr, "u16_3Imm"), 3,
-                        !if (!eq(ImmOpStr, "u16_2Imm"), 2,
-                        !if (!eq(ImmOpStr, "u16_1Imm"), 1,
-                                        /* u16_0Imm */ 0)));
-  }
-
-//===----------------------------------------------------------------------===//
-// Template class for predicated load instructions with
-// absolute addressing mode.
-//===----------------------------------------------------------------------===//
-let isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opExtentBits = 6,
-    opExtendable = 2 in
-class T_LoadAbs_Pred <string mnemonic, RegisterClass RC, bits<3> MajOp,
-                      bit isPredNot, bit isPredNew>
-  : LDInst <(outs RC:$dst), (ins PredRegs:$src1, u32_0MustExt:$absaddr),
-  !if(isPredNot, "if (!$src1", "if ($src1")#!if(isPredNew, ".new) ",
-  ") ")#"$dst = "#mnemonic#"(#$absaddr)">, AddrModeRel {
-    bits<5> dst;
-    bits<2> src1;
-    bits<6> absaddr;
-
-    let isPredicatedNew = isPredNew;
-    let isPredicatedFalse = isPredNot;
-    let hasNewValue = !if (!eq(!cast<string>(RC), "DoubleRegs"), 0, 1);
-
-    let IClass = 0b1001;
-
-    let Inst{27-24} = 0b1111;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = absaddr{5-1};
-    let Inst{13} = 0b1;
-    let Inst{12} = isPredNew;
-    let Inst{11} = isPredNot;
-    let Inst{10-9} = src1;
-    let Inst{8} = absaddr{0};
-    let Inst{7} = 0b1;
-    let Inst{4-0} = dst;
-  }
-
-//===----------------------------------------------------------------------===//
-// Multiclass for the load instructions with absolute addressing mode.
-//===----------------------------------------------------------------------===//
-multiclass LD_Abs_Pred<string mnemonic, RegisterClass RC, bits<3> MajOp,
-                       bit PredNot> {
-  def _abs : T_LoadAbs_Pred <mnemonic, RC, MajOp, PredNot, 0>;
-  // Predicate new
-  def new_abs : T_LoadAbs_Pred <mnemonic, RC, MajOp, PredNot, 1>;
-}
-
-let addrMode = Absolute, isExtended = 1 in
-multiclass LD_Abs<string mnemonic, string CextOp, RegisterClass RC,
-                  Operand ImmOp, bits<3> MajOp> {
-  let CextOpcode = CextOp, BaseOpcode = CextOp#_abs in {
-    let opExtendable = 1, isPredicable = 1 in
-    def PS_#NAME#abs: T_LoadAbs <mnemonic, RC, ImmOp, MajOp>;
-
-    // Predicated
-    defm L4_p#NAME#t : LD_Abs_Pred<mnemonic, RC, MajOp, 0>;
-    defm L4_p#NAME#f : LD_Abs_Pred<mnemonic, RC, MajOp, 1>;
-  }
-}
-
-let accessSize = ByteAccess, hasNewValue = 1 in {
-  defm loadrb  : LD_Abs<"memb",  "LDrib",  IntRegs, u16_0Imm, 0b000>;
-  defm loadrub : LD_Abs<"memub", "LDriub", IntRegs, u16_0Imm, 0b001>;
-}
-
-let accessSize = HalfWordAccess, hasNewValue = 1 in {
-  defm loadrh  : LD_Abs<"memh",  "LDrih",  IntRegs, u16_1Imm, 0b010>;
-  defm loadruh : LD_Abs<"memuh", "LDriuh", IntRegs, u16_1Imm, 0b011>;
-}
-
-let accessSize = WordAccess, hasNewValue = 1 in
-defm loadri  : LD_Abs<"memw",  "LDriw",  IntRegs, u16_2Imm, 0b100>;
-
-let accessSize = DoubleWordAccess in
-defm loadrd  : LD_Abs<"memd",  "LDrid", DoubleRegs, u16_3Imm, 0b110>;
-
-//===----------------------------------------------------------------------===//
-// multiclass for load instructions with GP-relative addressing mode.
-// Rx=mem[bhwd](##global)
-// Once predicated, these instructions map to absolute addressing mode.
-// if ([!]Pv[.new]) Rx=mem[bhwd](##global)
-//===----------------------------------------------------------------------===//
-
-let isAsmParserOnly = 1, Uses = [GP] in
-class T_LoadGP <string mnemonic, string BaseOp, RegisterClass RC, Operand ImmOp,
-                bits<3> MajOp>
-  : T_LoadAbsGP <mnemonic, RC, ImmOp, MajOp>, PredNewRel {
-    let BaseOpcode = BaseOp#_abs;
-  }
-
-let accessSize = ByteAccess, hasNewValue = 1 in {
-  def L2_loadrbgp  : T_LoadGP<"memb",  "LDrib",  IntRegs, u16_0Imm, 0b000>;
-  def L2_loadrubgp : T_LoadGP<"memub", "LDriub", IntRegs, u16_0Imm, 0b001>;
-}
-
-let accessSize = HalfWordAccess, hasNewValue = 1 in {
-  def L2_loadrhgp  : T_LoadGP<"memh",  "LDrih",  IntRegs, u16_1Imm, 0b010>;
-  def L2_loadruhgp : T_LoadGP<"memuh", "LDriuh", IntRegs, u16_1Imm, 0b011>;
-}
-
-let accessSize = WordAccess, hasNewValue = 1 in
-def L2_loadrigp  : T_LoadGP<"memw",  "LDriw",  IntRegs, u16_2Imm, 0b100>;
-
-let accessSize = DoubleWordAccess in
-def L2_loadrdgp  : T_LoadGP<"memd", "LDrid", DoubleRegs, u16_3Imm, 0b110>;
-
-//===----------------------------------------------------------------------===//
-// :raw for of boundscheck:hi:lo insns
-//===----------------------------------------------------------------------===//
-
-// A4_boundscheck_lo: Detect if a register is within bounds.
-let hasSideEffects = 0 in
-def A4_boundscheck_lo: ALU64Inst <
-  (outs PredRegs:$Pd),
-  (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
-  "$Pd = boundscheck($Rss, $Rtt):raw:lo"> {
-    bits<2> Pd;
-    bits<5> Rss;
-    bits<5> Rtt;
-
-    let IClass = 0b1101;
-
-    let Inst{27-23} = 0b00100;
-    let Inst{13} = 0b1;
-    let Inst{7-5} = 0b100;
-    let Inst{1-0} = Pd;
-    let Inst{20-16} = Rss;
-    let Inst{12-8} = Rtt;
-  }
-
-// A4_boundscheck_hi: Detect if a register is within bounds.
-let hasSideEffects = 0 in
-def A4_boundscheck_hi: ALU64Inst <
-  (outs PredRegs:$Pd),
-  (ins DoubleRegs:$Rss, DoubleRegs:$Rtt),
-  "$Pd = boundscheck($Rss, $Rtt):raw:hi"> {
-    bits<2> Pd;
-    bits<5> Rss;
-    bits<5> Rtt;
-
-    let IClass = 0b1101;
-
-    let Inst{27-23} = 0b00100;
-    let Inst{13} = 0b1;
-    let Inst{7-5} = 0b101;
-    let Inst{1-0} = Pd;
-    let Inst{20-16} = Rss;
-    let Inst{12-8} = Rtt;
-  }
-
-let hasSideEffects = 0, isAsmParserOnly = 1 in
-def A4_boundscheck : MInst <
-  (outs PredRegs:$Pd), (ins IntRegs:$Rs, DoubleRegs:$Rtt),
-  "$Pd=boundscheck($Rs,$Rtt)">;
-
-// A4_tlbmatch: Detect if a VA/ASID matches a TLB entry.
-let isPredicateLate = 1, hasSideEffects = 0 in
-def A4_tlbmatch : ALU64Inst<(outs PredRegs:$Pd),
-  (ins DoubleRegs:$Rs, IntRegs:$Rt),
-  "$Pd = tlbmatch($Rs, $Rt)",
-  [], "", ALU64_tc_2early_SLOT23> {
-    bits<2> Pd;
-    bits<5> Rs;
-    bits<5> Rt;
-
-    let IClass = 0b1101;
-    let Inst{27-23} = 0b00100;
-    let Inst{20-16} = Rs;
-    let Inst{13} = 0b1;
-    let Inst{12-8} = Rt;
-    let Inst{7-5} = 0b011;
-    let Inst{1-0} = Pd;
-  }
-
-// Use LD0Inst for dcfetch, but set "mayLoad" to 0 because this doesn't
-// really do a load.
-let hasSideEffects = 1, mayLoad = 0 in
-def Y2_dcfetchbo : LD0Inst<(outs), (ins IntRegs:$Rs, u11_3Imm:$u11_3),
-      "dcfetch($Rs + #$u11_3)",
-      [], "", LD_tc_ld_SLOT0> {
-  bits<5> Rs;
-  bits<14> u11_3;
-
-  let IClass = 0b1001;
-  let Inst{27-21} = 0b0100000;
-  let Inst{20-16} = Rs;
-  let Inst{13} = 0b0;
-  let Inst{10-0} = u11_3{13-3};
-}
-
-
-//===----------------------------------------------------------------------===//
-// Compound instructions
-//===----------------------------------------------------------------------===//
-
-let isBranch = 1, hasSideEffects = 0, isExtentSigned = 1,
-    isPredicated = 1, isPredicatedNew = 1, isExtendable = 1,
-    opExtentBits = 11, opExtentAlign = 2, opExtendable = 1,
-    isTerminator = 1 in
-class CJInst_tstbit_R0<string px, bit np, string tnt>
-  : InstHexagon<(outs), (ins IntRegs:$Rs, brtarget:$r9_2),
-  ""#px#" = tstbit($Rs, #0); if ("
-    #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
-  [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
-  bits<4> Rs;
-  bits<11> r9_2;
-
-  // np: !p[01]
-  let isPredicatedFalse = np;
-  // tnt: Taken/Not Taken
-  let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
-  let isTaken   = !if (!eq(tnt, "t"), 1, 0);
-
-  let IClass = 0b0001;
-  let Inst{27-26} = 0b00;
-  let Inst{25} = !if (!eq(px, "!p1"), 1,
-                 !if (!eq(px,  "p1"), 1, 0));
-  let Inst{24-23} = 0b11;
-  let Inst{22} = np;
-  let Inst{21-20} = r9_2{10-9};
-  let Inst{19-16} = Rs;
-  let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
-  let Inst{9-8} = 0b11;
-  let Inst{7-1} = r9_2{8-2};
-}
-
-let Defs = [PC, P0], Uses = [P0] in {
-  def J4_tstbit0_tp0_jump_nt : CJInst_tstbit_R0<"p0", 0, "nt">;
-  def J4_tstbit0_tp0_jump_t : CJInst_tstbit_R0<"p0", 0, "t">;
-  def J4_tstbit0_fp0_jump_nt : CJInst_tstbit_R0<"p0", 1, "nt">;
-  def J4_tstbit0_fp0_jump_t : CJInst_tstbit_R0<"p0", 1, "t">;
-}
-
-let Defs = [PC, P1], Uses = [P1] in {
-  def J4_tstbit0_tp1_jump_nt : CJInst_tstbit_R0<"p1", 0, "nt">;
-  def J4_tstbit0_tp1_jump_t : CJInst_tstbit_R0<"p1", 0, "t">;
-  def J4_tstbit0_fp1_jump_nt : CJInst_tstbit_R0<"p1", 1, "nt">;
-  def J4_tstbit0_fp1_jump_t : CJInst_tstbit_R0<"p1", 1, "t">;
-}
-
-
-let isBranch = 1, hasSideEffects = 0,
-    isExtentSigned = 1, isPredicated = 1, isPredicatedNew = 1,
-    isExtendable = 1, opExtentBits = 11, opExtentAlign = 2,
-    opExtendable = 2, isTerminator = 1 in
-class CJInst_RR<string px, string op, bit np, string tnt>
-  : InstHexagon<(outs), (ins IntRegs:$Rs, IntRegs:$Rt, brtarget:$r9_2),
-  ""#px#" = cmp."#op#"($Rs, $Rt); if ("
-   #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
-  [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
-  bits<4> Rs;
-  bits<4> Rt;
-  bits<11> r9_2;
-
-  // np: !p[01]
-  let isPredicatedFalse = np;
-  // tnt: Taken/Not Taken
-  let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
-  let isTaken   = !if (!eq(tnt, "t"), 1, 0);
-
-  let IClass = 0b0001;
-  let Inst{27-23} = !if (!eq(op, "eq"),  0b01000,
-                    !if (!eq(op, "gt"),  0b01001,
-                    !if (!eq(op, "gtu"), 0b01010, 0)));
-  let Inst{22} = np;
-  let Inst{21-20} = r9_2{10-9};
-  let Inst{19-16} = Rs;
-  let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
-  // px: Predicate reg 0/1
-  let Inst{12} = !if (!eq(px, "!p1"), 1,
-                 !if (!eq(px,  "p1"), 1, 0));
-  let Inst{11-8} = Rt;
-  let Inst{7-1} = r9_2{8-2};
-}
-
-// P[10] taken/not taken.
-multiclass T_tnt_CJInst_RR<string op, bit np> {
-  let Defs = [PC, P0], Uses = [P0] in {
-    def NAME#p0_jump_nt : CJInst_RR<"p0", op, np, "nt">;
-    def NAME#p0_jump_t : CJInst_RR<"p0", op, np, "t">;
-  }
-  let Defs = [PC, P1], Uses = [P1] in {
-    def NAME#p1_jump_nt : CJInst_RR<"p1", op, np, "nt">;
-    def NAME#p1_jump_t : CJInst_RR<"p1", op, np, "t">;
-  }
-}
-// Predicate / !Predicate
-multiclass T_pnp_CJInst_RR<string op>{
-  defm J4_cmp#NAME#_t : T_tnt_CJInst_RR<op, 0>;
-  defm J4_cmp#NAME#_f : T_tnt_CJInst_RR<op, 1>;
-}
-// TypeCJ Instructions compare RR and jump
-defm eq : T_pnp_CJInst_RR<"eq">;
-defm gt : T_pnp_CJInst_RR<"gt">;
-defm gtu : T_pnp_CJInst_RR<"gtu">;
-
-let isBranch = 1, hasSideEffects = 0, isExtentSigned = 1,
-    isPredicated = 1, isPredicatedNew = 1, isExtendable = 1, opExtentBits = 11,
-    opExtentAlign = 2, opExtendable = 2, isTerminator = 1 in
-class CJInst_RU5<string px, string op, bit np, string tnt>
-  : InstHexagon<(outs), (ins IntRegs:$Rs, u5_0Imm:$U5, brtarget:$r9_2),
-  ""#px#" = cmp."#op#"($Rs, #$U5); if ("
-    #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
-  [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
-  bits<4> Rs;
-  bits<5> U5;
-  bits<11> r9_2;
-
-  // np: !p[01]
-  let isPredicatedFalse = np;
-  // tnt: Taken/Not Taken
-  let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
-  let isTaken   = !if (!eq(tnt, "t"), 1, 0);
-
-  let IClass = 0b0001;
-  let Inst{27-26} = 0b00;
-  // px: Predicate reg 0/1
-  let Inst{25} = !if (!eq(px, "!p1"), 1,
-                 !if (!eq(px,  "p1"), 1, 0));
-  let Inst{24-23} = !if (!eq(op, "eq"),  0b00,
-                    !if (!eq(op, "gt"),  0b01,
-                    !if (!eq(op, "gtu"), 0b10, 0)));
-  let Inst{22} = np;
-  let Inst{21-20} = r9_2{10-9};
-  let Inst{19-16} = Rs;
-  let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
-  let Inst{12-8} = U5;
-  let Inst{7-1} = r9_2{8-2};
-}
-// P[10] taken/not taken.
-multiclass T_tnt_CJInst_RU5<string op, bit np> {
-  let Defs = [PC, P0], Uses = [P0] in {
-    def NAME#p0_jump_nt : CJInst_RU5<"p0", op, np, "nt">;
-    def NAME#p0_jump_t : CJInst_RU5<"p0", op, np, "t">;
-  }
-  let Defs = [PC, P1], Uses = [P1] in {
-    def NAME#p1_jump_nt : CJInst_RU5<"p1", op, np, "nt">;
-    def NAME#p1_jump_t : CJInst_RU5<"p1", op, np, "t">;
-  }
-}
-// Predicate / !Predicate
-multiclass T_pnp_CJInst_RU5<string op>{
-  defm J4_cmp#NAME#i_t : T_tnt_CJInst_RU5<op, 0>;
-  defm J4_cmp#NAME#i_f : T_tnt_CJInst_RU5<op, 1>;
-}
-// TypeCJ Instructions compare RI and jump
-defm eq : T_pnp_CJInst_RU5<"eq">;
-defm gt : T_pnp_CJInst_RU5<"gt">;
-defm gtu : T_pnp_CJInst_RU5<"gtu">;
-
-let isBranch = 1, hasSideEffects = 0, isExtentSigned = 1,
-    isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1,
-    isExtendable = 1, opExtentBits = 11, opExtentAlign = 2, opExtendable = 2,
-    isTerminator = 1 in
-class CJInst_Rn1<string px, string op, bit np, string tnt>
-  : InstHexagon<(outs), (ins IntRegs:$Rs, n1Const:$n1, brtarget:$r9_2),
-  ""#px#" = cmp."#op#"($Rs,#$n1); if ("
-  #!if(np, "!","")#""#px#".new) jump:"#tnt#" $r9_2",
-  [], "", COMPOUND_CJ_ARCHDEPSLOT, TypeCOMPOUND>, OpcodeHexagon {
-  bits<4> Rs;
-  bits<11> r9_2;
-
-  // np: !p[01]
-  let isPredicatedFalse = np;
-  // tnt: Taken/Not Taken
-  let isBrTaken = !if (!eq(tnt, "t"), "true", "false");
-  let isTaken   = !if (!eq(tnt, "t"), 1, 0);
-
-  let IClass = 0b0001;
-  let Inst{27-26} = 0b00;
-  let Inst{25} = !if (!eq(px, "!p1"), 1,
-                 !if (!eq(px,  "p1"), 1, 0));
-
-  let Inst{24-23} = 0b11;
-  let Inst{22} = np;
-  let Inst{21-20} = r9_2{10-9};
-  let Inst{19-16} = Rs;
-  let Inst{13} = !if (!eq(tnt, "t"), 1, 0);
-  let Inst{9-8} = !if (!eq(op, "eq"),  0b00,
-                  !if (!eq(op, "gt"),  0b01, 0));
-  let Inst{7-1} = r9_2{8-2};
-}
-
-// P[10] taken/not taken.
-multiclass T_tnt_CJInst_Rn1<string op, bit np> {
-  let Defs = [PC, P0], Uses = [P0] in {
-    def NAME#p0_jump_nt : CJInst_Rn1<"p0", op, np, "nt">;
-    def NAME#p0_jump_t : CJInst_Rn1<"p0", op, np, "t">;
-  }
-  let Defs = [PC, P1], Uses = [P1] in {
-    def NAME#p1_jump_nt : CJInst_Rn1<"p1", op, np, "nt">;
-    def NAME#p1_jump_t : CJInst_Rn1<"p1", op, np, "t">;
-  }
-}
-// Predicate / !Predicate
-multiclass T_pnp_CJInst_Rn1<string op>{
-  defm J4_cmp#NAME#n1_t : T_tnt_CJInst_Rn1<op, 0>;
-  defm J4_cmp#NAME#n1_f : T_tnt_CJInst_Rn1<op, 1>;
-}
-// TypeCJ Instructions compare -1 and jump
-defm eq : T_pnp_CJInst_Rn1<"eq">;
-defm gt : T_pnp_CJInst_Rn1<"gt">;
-
-// J4_jumpseti: Direct unconditional jump and set register to immediate.
-let Defs = [PC], isBranch = 1, hasSideEffects = 0, hasNewValue = 1,
-    isExtentSigned = 1, opNewValue = 0, isExtendable = 1, opExtentBits = 11,
-    opExtentAlign = 2, opExtendable = 2 in
-def J4_jumpseti: CJInst_JMPSET <
-  (outs IntRegs:$Rd),
-  (ins u6_0Imm:$U6, brtarget:$r9_2),
-  "$Rd = #$U6 ; jump $r9_2"> {
-    bits<4> Rd;
-    bits<6> U6;
-    bits<11> r9_2;
-
-    let IClass = 0b0001;
-    let Inst{27-24} = 0b0110;
-    let Inst{21-20} = r9_2{10-9};
-    let Inst{19-16} = Rd;
-    let Inst{13-8} = U6;
-    let Inst{7-1} = r9_2{8-2};
-  }
-
-// J4_jumpsetr: Direct unconditional jump and transfer register.
-let Defs = [PC], isBranch = 1, hasSideEffects = 0, hasNewValue = 1,
-    isExtentSigned = 1, opNewValue = 0, isExtendable = 1, opExtentBits = 11,
-    opExtentAlign = 2, opExtendable = 2 in
-def J4_jumpsetr: CJInst_JMPSET <
-  (outs IntRegs:$Rd),
-  (ins IntRegs:$Rs, brtarget:$r9_2),
-  "$Rd = $Rs ; jump $r9_2"> {
-    bits<4> Rd;
-    bits<4> Rs;
-    bits<11> r9_2;
-
-    let IClass = 0b0001;
-    let Inst{27-24} = 0b0111;
-    let Inst{21-20} = r9_2{10-9};
-    let Inst{11-8} = Rd;
-    let Inst{19-16} = Rs;
-    let Inst{7-1} = r9_2{8-2};
-  }
-
-// Duplex instructions
-//===----------------------------------------------------------------------===//
-include "HexagonIsetDx.td"
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV5.td b/lib/Target/Hexagon/HexagonInstrInfoV5.td
deleted file mode 100644
index cd19b6916f21afec560cc950975d175ef0247e6c..0000000000000000000000000000000000000000
--- a/lib/Target/Hexagon/HexagonInstrInfoV5.td
+++ /dev/null
@@ -1,497 +0,0 @@
-//=- HexagonInstrInfoV5.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon V5 instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-// XTYPE/MPY
-//===----------------------------------------------------------------------===//
-
-  //Rdd[+]=vrmpybsu(Rss,Rtt)
-let Predicates = [HasV5T] in {
-  def M5_vrmpybsu: T_XTYPE_Vect<"vrmpybsu", 0b110, 0b001, 0>;
-  def M5_vrmacbsu: T_XTYPE_Vect_acc<"vrmpybsu", 0b110, 0b001, 0>;
-
-  //Rdd[+]=vrmpybu(Rss,Rtt)
-  def M5_vrmpybuu: T_XTYPE_Vect<"vrmpybu", 0b100, 0b001, 0>;
-  def M5_vrmacbuu: T_XTYPE_Vect_acc<"vrmpybu", 0b100, 0b001, 0>;
-
-  def M5_vdmpybsu: T_M2_vmpy<"vdmpybsu", 0b101, 0b001, 0, 0, 1>;
-  def M5_vdmacbsu: T_M2_vmpy_acc_sat <"vdmpybsu", 0b001, 0b001, 0, 0>;
-}
-
-// Vector multiply bytes
-// Rdd=vmpyb[s]u(Rs,Rt)
-let Predicates = [HasV5T] in {
-  def M5_vmpybsu: T_XTYPE_mpy64 <"vmpybsu", 0b010, 0b001, 0, 0, 0>;
-  def M5_vmpybuu: T_XTYPE_mpy64 <"vmpybu",  0b100, 0b001, 0, 0, 0>;
-
-  // Rxx+=vmpyb[s]u(Rs,Rt)
-  def M5_vmacbsu: T_XTYPE_mpy64_acc <"vmpybsu", "+", 0b110, 0b001, 0, 0, 0>;
-  def M5_vmacbuu: T_XTYPE_mpy64_acc <"vmpybu", "+", 0b100, 0b001, 0, 0, 0>;
-
-  // Rd=vaddhub(Rss,Rtt):sat
-  let hasNewValue = 1, opNewValue = 0 in
-    def A5_vaddhubs: T_S3op_1 <"vaddhub", IntRegs, 0b01, 0b001, 0, 1>;
-}
-
-def S2_asr_i_p_rnd : S_2OpInstImm<"asr", 0b110, 0b111, u6_0Imm, [], 1>,
-      Requires<[HasV5T]> {
-  bits<6> src2;
-  let Inst{13-8} = src2;
-}
-
-let isAsmParserOnly = 1 in
-def S2_asr_i_p_rnd_goodsyntax
-  : MInst<(outs DoubleRegs:$dst), (ins DoubleRegs:$src1, u6_0Imm:$src2),
-    "$dst = asrrnd($src1, #$src2)">;
-
-def C4_fastcorner9 : T_LOGICAL_2OP<"fastcorner9", 0b000, 0, 0>,
-  Requires<[HasV5T]> {
-  let Inst{13,7,4} = 0b111;
-}
-
-def C4_fastcorner9_not : T_LOGICAL_2OP<"!fastcorner9", 0b000, 0, 0>,
-  Requires<[HasV5T]> {
-  let Inst{20,13,7,4} = 0b1111;
-}
-
-let hasNewValue = 1, validSubTargets = HasV5SubT in
-def S5_popcountp : ALU64_rr<(outs IntRegs:$Rd), (ins DoubleRegs:$Rss),
-  "$Rd = popcount($Rss)", [], "", S_2op_tc_2_SLOT23>,
-  Requires<[HasV5T]> {
-    bits<5> Rd;
-    bits<5> Rss;
-
-    let IClass = 0b1000;
-
-    let Inst{27-21} = 0b1000011;
-    let Inst{7-5} = 0b011;
-    let Inst{4-0} = Rd;
-    let Inst{20-16} = Rss;
-  }
-
-let isFP = 1, hasNewValue = 1, opNewValue = 0 in
-class T_MInstFloat <string mnemonic, bits<3> MajOp, bits<3> MinOp>
-  : MInst<(outs IntRegs:$Rd),
-          (ins IntRegs:$Rs, IntRegs:$Rt),
-  "$Rd = "#mnemonic#"($Rs, $Rt)", [],
-  "" , M_tc_3or4x_SLOT23 > ,
-  Requires<[HasV5T]> {
-    bits<5> Rd;
-    bits<5> Rs;
-    bits<5> Rt;
-
-    let IClass = 0b1110;
-
-    let Inst{27-24} = 0b1011;
-    let Inst{23-21} = MajOp;
-    let Inst{20-16} = Rs;
-    let Inst{13} = 0b0;
-    let Inst{12-8} = Rt;
-    let Inst{7-5} = MinOp;
-    let Inst{4-0} = Rd;
-  }
-
-let isCommutable = 1 in {
-  def F2_sfadd : T_MInstFloat < "sfadd", 0b000, 0b000>;
-  def F2_sfmpy : T_MInstFloat < "sfmpy", 0b010, 0b000>;
-}
-
-def F2_sfsub : T_MInstFloat < "sfsub", 0b000, 0b001>;
-
-let Itinerary = M_tc_3x_SLOT23 in {
-  def F2_sfmax : T_MInstFloat < "sfmax", 0b100, 0b000>;
-  def F2_sfmin : T_MInstFloat < "sfmin", 0b100, 0b001>;
-}
-
-let Itinerary = M_tc_3or4x_SLOT23 in {
-def F2_sffixupn : T_MInstFloat < "sffixupn", 0b110, 0b000>;
-def F2_sffixupd : T_MInstFloat < "sffixupd", 0b110, 0b001>;
-}
-
-// F2_sfrecipa: Reciprocal approximation for division.
-let Uses = [USR], isPredicateLate = 1, isFP = 1,
-    hasSideEffects = 0, hasNewValue = 1, Itinerary = M_tc_3or4x_SLOT23 in
-def F2_sfrecipa: MInst <
-  (outs IntRegs:$Rd, PredRegs:$Pe),
-  (ins IntRegs:$Rs, IntRegs:$Rt),
-  "$Rd, $Pe = sfrecipa($Rs, $Rt)">,
-  Requires<[HasV5T]> {
-    bits<5> Rd;
-    bits<2> Pe;
-    bits<5> Rs;
-    bits<5> Rt;
-
-    let IClass = 0b1110;
-    let Inst{27-21} = 0b1011111;
-    let Inst{20-16} = Rs;
-    let Inst{13}    = 0b0;
-    let Inst{12-8}  = Rt;
-    let Inst{7}     = 0b1;
-    let Inst{6-5}   = Pe;
-    let Inst{4-0}   = Rd;
-  }
-
-// F2_dfcmpeq: Floating point compare for equal.
-let Uses = [USR], isCompare = 1, isFP = 1 in
-class T_fcmp <string mnemonic, RegisterClass RC, bits<3> MinOp,
-              list<dag> pattern = [] >
-  : ALU64Inst <(outs PredRegs:$dst), (ins RC:$src1, RC:$src2),
-  "$dst = "#mnemonic#"($src1, $src2)", pattern,
-  "" , ALU64_tc_2early_SLOT23 > ,
-  Requires<[HasV5T]> {
-    bits<2> dst;
-    bits<5> src1;
-    bits<5> src2;
-
-    let IClass = 0b1101;
-
-    let Inst{27-21} = 0b0010111;
-    let Inst{20-16} = src1;
-    let Inst{12-8}  = src2;
-    let Inst{7-5}   = MinOp;
-    let Inst{1-0}   = dst;
-  }
-
-class T_fcmp64 <string mnemonic, PatFrag OpNode, bits<3> MinOp>
-  : T_fcmp <mnemonic, DoubleRegs, MinOp, []> {
-  let IClass = 0b1101;
-  let Inst{27-21} = 0b0010111;
-}
-
-class T_fcmp32 <string mnemonic, PatFrag OpNode, bits<3> MinOp>
-  : T_fcmp <mnemonic, IntRegs, MinOp, []> {
-  let IClass = 0b1100;
-  let Inst{27-21} = 0b0111111;
-}
-
-def F2_dfcmpeq : T_fcmp64<"dfcmp.eq", setoeq, 0b000>;
-def F2_dfcmpgt : T_fcmp64<"dfcmp.gt", setogt, 0b001>;
-def F2_dfcmpge : T_fcmp64<"dfcmp.ge", setoge, 0b010>;
-def F2_dfcmpuo : T_fcmp64<"dfcmp.uo", setuo,  0b011>;
-
-def F2_sfcmpge : T_fcmp32<"sfcmp.ge", setoge, 0b000>;
-def F2_sfcmpuo : T_fcmp32<"sfcmp.uo", setuo,  0b001>;
-def F2_sfcmpeq : T_fcmp32<"sfcmp.eq", setoeq, 0b011>;
-def F2_sfcmpgt : T_fcmp32<"sfcmp.gt", setogt, 0b100>;
-
-// F2 convert template classes:
-let Uses = [USR], isFP = 1 in
-class F2_RDD_RSS_CONVERT<string mnemonic, bits<3> MinOp,
-                         string chop ="">
-  : SInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss),
-   "$Rdd = "#mnemonic#"($Rss)"#chop, [], "",
-   S_2op_tc_3or4x_SLOT23> {
-     bits<5> Rdd;
-     bits<5> Rss;
-
-     let IClass = 0b1000;
-
-     let Inst{27-21} = 0b0000111;
-     let Inst{20-16} = Rss;
-     let Inst{7-5} = MinOp;
-     let Inst{4-0} = Rdd;
-  }
-
-let Uses = [USR], isFP = 1 in
-class F2_RDD_RS_CONVERT<string mnemonic, bits<3> MinOp,
-                        string chop ="">
-  : SInst <(outs DoubleRegs:$Rdd), (ins IntRegs:$Rs),
-   "$Rdd = "#mnemonic#"($Rs)"#chop, [], "",
-   S_2op_tc_3or4x_SLOT23> {
-     bits<5> Rdd;
-     bits<5> Rs;
-
-     let IClass = 0b1000;
-
-     let Inst{27-21} = 0b0100100;
-     let Inst{20-16} = Rs;
-     let Inst{7-5} = MinOp;
-     let Inst{4-0} = Rdd;
-  }
-
-let Uses = [USR], isFP = 1, hasNewValue = 1 in
-class F2_RD_RSS_CONVERT<string mnemonic, bits<3> MinOp,
-                        string chop ="">
-  : SInst <(outs IntRegs:$Rd), (ins DoubleRegs:$Rss),
-   "$Rd = "#mnemonic#"($Rss)"#chop, [], "",
-   S_2op_tc_3or4x_SLOT23> {
-     bits<5> Rd;
-     bits<5> Rss;
-
-     let IClass = 0b1000;
-
-     let Inst{27-24} = 0b1000;
-     let Inst{23-21} = MinOp;
-     let Inst{20-16} = Rss;
-     let Inst{7-5} = 0b001;
-     let Inst{4-0} = Rd;
-  }
-
-let Uses = [USR], isFP = 1, hasNewValue = 1 in
-class F2_RD_RS_CONVERT<string mnemonic, bits<3> MajOp, bits<3> MinOp,
-                        string chop ="">
-  : SInst <(outs IntRegs:$Rd), (ins IntRegs:$Rs),
-   "$Rd = "#mnemonic#"($Rs)"#chop, [], "",
-   S_2op_tc_3or4x_SLOT23> {
-     bits<5> Rd;
-     bits<5> Rs;
-
-     let IClass = 0b1000;
-
-     let Inst{27-24} = 0b1011;
-     let Inst{23-21} = MajOp;
-     let Inst{20-16} = Rs;
-     let Inst{7-5} = MinOp;
-     let Inst{4-0} = Rd;
-  }
-
-// Convert single precision to double precision and vice-versa.
-def F2_conv_sf2df : F2_RDD_RS_CONVERT <"convert_sf2df", 0b000>;
-def F2_conv_df2sf : F2_RD_RSS_CONVERT <"convert_df2sf", 0b000>;
-
-// Convert Integer to Floating Point.
-def F2_conv_d2sf : F2_RD_RSS_CONVERT <"convert_d2sf", 0b010>;
-def F2_conv_ud2sf : F2_RD_RSS_CONVERT <"convert_ud2sf", 0b001>;
-def F2_conv_uw2sf : F2_RD_RS_CONVERT <"convert_uw2sf", 0b001, 0b000>;
-def F2_conv_w2sf : F2_RD_RS_CONVERT <"convert_w2sf", 0b010, 0b000>;
-def F2_conv_d2df : F2_RDD_RSS_CONVERT <"convert_d2df", 0b011>;
-def F2_conv_ud2df : F2_RDD_RSS_CONVERT <"convert_ud2df", 0b010>;
-def F2_conv_uw2df : F2_RDD_RS_CONVERT <"convert_uw2df", 0b001>;
-def F2_conv_w2df : F2_RDD_RS_CONVERT <"convert_w2df", 0b010>;
-
-// Convert Floating Point to Integer.
-def F2_conv_df2uw_chop : F2_RD_RSS_CONVERT <"convert_df2uw", 0b101, ":chop">;
-def F2_conv_df2w_chop : F2_RD_RSS_CONVERT <"convert_df2w", 0b111, ":chop">;
-def F2_conv_sf2uw_chop : F2_RD_RS_CONVERT <"convert_sf2uw", 0b011, 0b001,
-                                           ":chop">;
-def F2_conv_sf2w_chop : F2_RD_RS_CONVERT <"convert_sf2w", 0b100, 0b001,
-                                          ":chop">;
-def F2_conv_df2d_chop : F2_RDD_RSS_CONVERT <"convert_df2d", 0b110, ":chop">;
-def F2_conv_df2ud_chop : F2_RDD_RSS_CONVERT <"convert_df2ud", 0b111, ":chop">;
-def F2_conv_sf2d_chop : F2_RDD_RS_CONVERT <"convert_sf2d", 0b110, ":chop">;
-def F2_conv_sf2ud_chop : F2_RDD_RS_CONVERT <"convert_sf2ud", 0b101, ":chop">;
-
-// Convert Floating Point to Integer: non-chopped.
-let AddedComplexity = 20, Predicates = [HasV5T] in {
-  def F2_conv_df2d : F2_RDD_RSS_CONVERT <"convert_df2d", 0b000>;
-  def F2_conv_df2ud : F2_RDD_RSS_CONVERT <"convert_df2ud", 0b001>;
-  def F2_conv_sf2ud : F2_RDD_RS_CONVERT <"convert_sf2ud", 0b011>;
-  def F2_conv_sf2d : F2_RDD_RS_CONVERT <"convert_sf2d", 0b100>;
-  def F2_conv_df2uw : F2_RD_RSS_CONVERT <"convert_df2uw", 0b011>;
-  def F2_conv_df2w : F2_RD_RSS_CONVERT <"convert_df2w", 0b100>;
-  def F2_conv_sf2uw : F2_RD_RS_CONVERT <"convert_sf2uw", 0b011, 0b000>;
-  def F2_conv_sf2w : F2_RD_RS_CONVERT <"convert_sf2w", 0b100, 0b000>;
-}
-
-// Fix up radicand.
-let Uses = [USR], isFP = 1, hasNewValue = 1 in
-def F2_sffixupr: SInst<(outs IntRegs:$Rd), (ins IntRegs:$Rs),
-  "$Rd = sffixupr($Rs)",
-  [], "" , S_2op_tc_3or4x_SLOT23>, Requires<[HasV5T]> {
-    bits<5> Rd;
-    bits<5> Rs;
-
-    let IClass = 0b1000;
-
-    let Inst{27-21} = 0b1011101;
-    let Inst{20-16} = Rs;
-    let Inst{7-5}   = 0b000;
-    let Inst{4-0}   = Rd;
-  }
-
-// F2_sffma: Floating-point fused multiply add.
-let Uses = [USR], isFP = 1, hasNewValue = 1 in
-class T_sfmpy_acc <bit isSub, bit isLib>
-  : MInst<(outs IntRegs:$Rx),
-          (ins IntRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt),
-  "$Rx "#!if(isSub, "-=","+=")#" sfmpy($Rs, $Rt)"#!if(isLib, ":lib",""),
-  [], "$dst2 = $Rx" , M_tc_3or4x_SLOT23 > ,
-  Requires<[HasV5T]> {
-    bits<5> Rx;
-    bits<5> Rs;
-    bits<5> Rt;
-
-    let IClass = 0b1110;
-
-    let Inst{27-21} = 0b1111000;
-    let Inst{20-16} = Rs;
-    let Inst{13}    = 0b0;
-    let Inst{12-8}  = Rt;
-    let Inst{7}     = 0b1;
-    let Inst{6}     = isLib;
-    let Inst{5}     = isSub;
-    let Inst{4-0}   = Rx;
-  }
-
-def F2_sffma: T_sfmpy_acc <0, 0>;
-def F2_sffms: T_sfmpy_acc <1, 0>;
-def F2_sffma_lib: T_sfmpy_acc <0, 1>;
-def F2_sffms_lib: T_sfmpy_acc <1, 1>;
-
-// Floating-point fused multiply add w/ additional scaling (2**pu).
-let Uses = [USR], isFP = 1, hasNewValue = 1 in
-def F2_sffma_sc: MInst <
-  (outs IntRegs:$Rx),
-  (ins IntRegs:$dst2, IntRegs:$Rs, IntRegs:$Rt, PredRegs:$Pu),
-  "$Rx += sfmpy($Rs, $Rt, $Pu):scale" ,
-  [], "$dst2 = $Rx" , M_tc_3or4x_SLOT23 > ,
-  Requires<[HasV5T]> {
-    bits<5> Rx;
-    bits<5> Rs;
-    bits<5> Rt;
-    bits<2> Pu;
-
-    let IClass = 0b1110;
-
-    let Inst{27-21} = 0b1111011;
-    let Inst{20-16} = Rs;
-    let Inst{13}    = 0b0;
-    let Inst{12-8}  = Rt;
-    let Inst{7}     = 0b1;
-    let Inst{6-5}   = Pu;
-    let Inst{4-0}   = Rx;
-  }
-
-//===----------------------------------------------------------------------===//
-// :natural forms of vasrh and vasrhub insns
-//===----------------------------------------------------------------------===//
-// S5_asrhub_rnd_sat: Vector arithmetic shift right by immediate with round,
-// saturate, and pack.
-let Defs = [USR_OVF], hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-class T_ASRHUB<bit isSat>
-  : SInst <(outs IntRegs:$Rd),
-  (ins DoubleRegs:$Rss, u4_0Imm:$u4),
-  "$Rd = vasrhub($Rss, #$u4):"#!if(isSat, "sat", "raw"),
-  [], "", S_2op_tc_2_SLOT23>,
-  Requires<[HasV5T]> {
-    bits<5> Rd;
-    bits<5> Rss;
-    bits<4> u4;
-
-    let IClass = 0b1000;
-
-    let Inst{27-21} = 0b1000011;
-    let Inst{20-16} = Rss;
-    let Inst{13-12} = 0b00;
-    let Inst{11-8} = u4;
-    let Inst{7-6} = 0b10;
-    let Inst{5} = isSat;
-    let Inst{4-0} = Rd;
-  }
-
-def S5_asrhub_rnd_sat : T_ASRHUB <0>;
-def S5_asrhub_sat : T_ASRHUB <1>;
-
-let isAsmParserOnly = 1 in
-def S5_asrhub_rnd_sat_goodsyntax
-  : SInst <(outs IntRegs:$Rd), (ins DoubleRegs:$Rss, u4_0Imm:$u4),
-  "$Rd = vasrhub($Rss, #$u4):rnd:sat">, Requires<[HasV5T]>;
-
-// S5_vasrhrnd: Vector arithmetic shift right by immediate with round.
-let hasSideEffects = 0 in
-def S5_vasrhrnd : SInst <(outs DoubleRegs:$Rdd),
-                         (ins DoubleRegs:$Rss, u4_0Imm:$u4),
-  "$Rdd = vasrh($Rss, #$u4):raw">,
-  Requires<[HasV5T]> {
-    bits<5> Rdd;
-    bits<5> Rss;
-    bits<4> u4;
-
-    let IClass = 0b1000;
-
-    let Inst{27-21} = 0b0000001;
-    let Inst{20-16} = Rss;
-    let Inst{13-12} = 0b00;
-    let Inst{11-8}  = u4;
-    let Inst{7-5}   = 0b000;
-    let Inst{4-0}   = Rdd;
-  }
-
-let isAsmParserOnly = 1 in
-def S5_vasrhrnd_goodsyntax
-  : SInst <(outs DoubleRegs:$Rdd), (ins DoubleRegs:$Rss, u4_0Imm:$u4),
-  "$Rdd = vasrh($Rss,#$u4):rnd">, Requires<[HasV5T]>;
-
-// Floating point reciprocal square root approximation
-let Uses = [USR], isPredicateLate = 1, isFP = 1,
-    hasSideEffects = 0, hasNewValue = 1, opNewValue = 0,
-    validSubTargets = HasV5SubT in
-def F2_sfinvsqrta: SInst <
-  (outs IntRegs:$Rd, PredRegs:$Pe),
-  (ins IntRegs:$Rs),
-  "$Rd, $Pe = sfinvsqrta($Rs)" > ,
-  Requires<[HasV5T]> {
-    bits<5> Rd;
-    bits<2> Pe;
-    bits<5> Rs;
-
-    let IClass = 0b1000;
-
-    let Inst{27-21} = 0b1011111;
-    let Inst{20-16} = Rs;
-    let Inst{7} = 0b0;
-    let Inst{6-5} = Pe;
-    let Inst{4-0} = Rd;
-  }
-
-// Complex multiply 32x16
-let Defs = [USR_OVF], Itinerary = S_3op_tc_3x_SLOT23 in {
-  def M4_cmpyi_whc : T_S3op_8<"cmpyiwh", 0b101, 1, 1, 1, 1>;
-  def M4_cmpyr_whc : T_S3op_8<"cmpyrwh", 0b111, 1, 1, 1, 1>;
-}
-
-// Classify floating-point value
-let Uses = [USR], isFP = 1 in
-def F2_sfclass : T_TEST_BIT_IMM<"sfclass", 0b111>, Requires<[HasV5T]>;
-
-let Uses = [USR], isFP = 1 in
-def F2_dfclass: ALU64Inst<(outs PredRegs:$Pd), (ins DoubleRegs:$Rss, u5_0Imm:$u5),
-  "$Pd = dfclass($Rss, #$u5)",
-  [], "" , ALU64_tc_2early_SLOT23 > , Requires<[HasV5T]> {
-    bits<2> Pd;
-    bits<5> Rss;
-    bits<5> u5;
-
-    let IClass = 0b1101;
-    let Inst{27-21} = 0b1100100;
-    let Inst{20-16} = Rss;
-    let Inst{12-10} = 0b000;
-    let Inst{9-5}   = u5;
-    let Inst{4-3}   = 0b10;
-    let Inst{1-0}   = Pd;
-  }
-
-// Instructions to create floating point constant
-class T_fimm <string mnemonic, RegisterClass RC, bits<4> RegType, bit isNeg>
-  : ALU64Inst<(outs RC:$dst), (ins u10_0Imm:$src),
-  "$dst = "#mnemonic#"(#$src)"#!if(isNeg, ":neg", ":pos"),
-  [], "", ALU64_tc_2_SLOT23>, Requires<[HasV5T]> {
-    bits<5> dst;
-    bits<10> src;
-
-    let IClass = 0b1101;
-    let Inst{27-24} = RegType;
-    let Inst{23}    = 0b0;
-    let Inst{22}    = isNeg;
-    let Inst{21}    = src{9};
-    let Inst{13-5}  = src{8-0};
-    let Inst{4-0}   = dst;
-  }
-
-let hasNewValue = 1, opNewValue = 0 in {
-  def F2_sfimm_p : T_fimm <"sfmake", IntRegs, 0b0110, 0>;
-  def F2_sfimm_n : T_fimm <"sfmake", IntRegs, 0b0110, 1>;
-}
-
-def F2_dfimm_p : T_fimm <"dfmake", DoubleRegs, 0b1001, 0>;
-def F2_dfimm_n : T_fimm <"dfmake", DoubleRegs, 0b1001, 1>;
diff --git a/lib/Target/Hexagon/HexagonInstrInfoV60.td b/lib/Target/Hexagon/HexagonInstrInfoV60.td
deleted file mode 100644
index c50141b18ead311d085e1ce48618a570f6ad5836..0000000000000000000000000000000000000000
--- a/lib/Target/Hexagon/HexagonInstrInfoV60.td
+++ /dev/null
@@ -1,2068 +0,0 @@
-//=- HexagonInstrInfoV60.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon V60 instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-// Vector load
-let Predicates = [HasV60T, UseHVX] in
-let mayLoad = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in
-  class V6_LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-                  string cstr = "", InstrItinClass itin = CVI_VM_LD,
-                  IType type = TypeCVI_VM_LD>
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
-
-// Vector store
-let Predicates = [HasV60T, UseHVX] in
-let mayStore = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in
-class V6_STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-                string cstr = "", InstrItinClass itin = CVI_VM_ST,
-                IType type = TypeCVI_VM_ST>
-: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
-
-//===----------------------------------------------------------------------===//
-// Vector loads with base + immediate offset
-//===----------------------------------------------------------------------===//
-let addrMode = BaseImmOffset, accessSize = Vector64Access in
-class T_vload_ai<string asmStr>
-  : V6_LDInst <(outs VectorRegs:$dst), (ins IntRegs:$src1, s4_6Imm:$src2),
-                asmStr>;
-
-let isCodeGenOnly = 1, addrMode = BaseImmOffset, accessSize = Vector128Access in
-class T_vload_ai_128B<string asmStr>
-  : V6_LDInst <(outs VectorRegs128B:$dst), (ins IntRegs:$src1, s4_7Imm:$src2),
-                asmStr>;
-
-let isCVLoadable = 1, hasNewValue = 1 in {
-  def V6_vL32b_ai         : T_vload_ai <"$dst = vmem($src1+#$src2)">,
-                            V6_vL32b_ai_enc;
-  def V6_vL32b_nt_ai      : T_vload_ai <"$dst = vmem($src1+#$src2):nt">,
-                            V6_vL32b_nt_ai_enc;
-  // 128B
-  def V6_vL32b_ai_128B    : T_vload_ai_128B <"$dst = vmem($src1+#$src2)">,
-                            V6_vL32b_ai_128B_enc;
-  def V6_vL32b_nt_ai_128B : T_vload_ai_128B <"$dst = vmem($src1+#$src2):nt">,
-                            V6_vL32b_nt_ai_128B_enc;
-}
-
-let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU, hasNewValue = 1 in {
-  def V6_vL32Ub_ai      : T_vload_ai <"$dst = vmemu($src1+#$src2)">,
-                          V6_vL32Ub_ai_enc;
-  def V6_vL32Ub_ai_128B : T_vload_ai_128B <"$dst = vmemu($src1+#$src2)">,
-                          V6_vL32Ub_ai_128B_enc;
-}
-
-let Itinerary = CVI_VM_LD, Type = TypeCVI_VM_LD, isCVLoad = 1,
-    hasNewValue = 1 in {
-  def V6_vL32b_cur_ai    : T_vload_ai <"$dst.cur = vmem($src1+#$src2)">,
-                           V6_vL32b_cur_ai_enc;
-  def V6_vL32b_nt_cur_ai : T_vload_ai <"$dst.cur = vmem($src1+#$src2):nt">,
-                           V6_vL32b_nt_cur_ai_enc;
-  // 128B
-  def V6_vL32b_cur_ai_128B    : T_vload_ai_128B
-                                <"$dst.cur = vmem($src1+#$src2)">,
-                                V6_vL32b_cur_ai_128B_enc;
-  def V6_vL32b_nt_cur_ai_128B : T_vload_ai_128B
-                                <"$dst.cur = vmem($src1+#$src2):nt">,
-                                V6_vL32b_nt_cur_ai_128B_enc;
-}
-
-
-let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD, hasNewValue = 1 in {
-  def V6_vL32b_tmp_ai    : T_vload_ai <"$dst.tmp = vmem($src1+#$src2)">,
-                           V6_vL32b_tmp_ai_enc;
-  def V6_vL32b_nt_tmp_ai : T_vload_ai <"$dst.tmp = vmem($src1+#$src2):nt">,
-                           V6_vL32b_nt_tmp_ai_enc;
-  // 128B
-  def V6_vL32b_tmp_ai_128B    : T_vload_ai_128B
-                                <"$dst.tmp = vmem($src1+#$src2)">,
-                                V6_vL32b_tmp_ai_128B_enc;
-  def V6_vL32b_nt_tmp_ai_128B : T_vload_ai_128B
-                                <"$dst.tmp = vmem($src1+#$src2)">,
-                                V6_vL32b_nt_tmp_ai_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Vector stores with base + immediate offset - unconditional
-//===----------------------------------------------------------------------===//
-let addrMode = BaseImmOffset, accessSize = Vector64Access, isPredicable = 1 in
-class T_vstore_ai <string mnemonic, string baseOp, Operand ImmOp,
-                   RegisterClass RC, bit isNT>
-  : V6_STInst <(outs), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
-    mnemonic#"($src1+#$src2)"#!if(isNT, ":nt", "")#" = $src3">, NewValueRel {
-  let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_ai_64B <string mnemonic, string baseOp, bit isNT = 0>
-  : T_vstore_ai <mnemonic, baseOp, s4_6Imm, VectorRegs, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_ai_128B <string mnemonic, string baseOp, bit isNT = 0>
-  : T_vstore_ai <mnemonic, baseOp#"128B", s4_7Imm, VectorRegs128B, isNT>;
-
-let isNVStorable = 1 in {
-  def V6_vS32b_ai         : T_vstore_ai_64B <"vmem", "vS32b_ai">,
-                            V6_vS32b_ai_enc;
-  def V6_vS32b_ai_128B    : T_vstore_ai_128B <"vmem", "vS32b_ai">,
-                            V6_vS32b_ai_128B_enc;
-}
-
-let isNVStorable = 1, isNonTemporal = 1 in {
-  def V6_vS32b_nt_ai      : T_vstore_ai_64B <"vmem", "vS32b_ai", 1>,
-                            V6_vS32b_nt_ai_enc;
-  def V6_vS32b_nt_ai_128B : T_vstore_ai_128B <"vmem", "vS32b_ai", 1>,
-                            V6_vS32b_nt_ai_128B_enc;
-}
-
-let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
-  def V6_vS32Ub_ai      : T_vstore_ai_64B <"vmemu", "vS32Ub_ai">,
-                          V6_vS32Ub_ai_enc;
-  def V6_vS32Ub_ai_128B : T_vstore_ai_128B <"vmemu", "vS32Ub_ai">,
-                          V6_vS32Ub_ai_128B_enc;
-}
-//===----------------------------------------------------------------------===//
-// Vector stores with base + immediate offset - unconditional new
-//===----------------------------------------------------------------------===//
-let addrMode = BaseImmOffset, isNewValue = 1, opNewValue = 2, isNVStore = 1,
-    isPredicable = 1, Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST in
-class T_vstore_new_ai <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT>
-  : V6_STInst <(outs ), (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
-    "vmem($src1+#$src2)"#!if(isNT, ":nt", "")#" = $src3.new">, NewValueRel {
-  let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_new_ai_64B <string baseOp, bit isNT = 0>
-  : T_vstore_new_ai <baseOp, s4_6Imm, VectorRegs, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_new_ai_128B <string baseOp, bit isNT = 0>
-  : T_vstore_new_ai <baseOp#"128B", s4_7Imm, VectorRegs128B, isNT>;
-
-def V6_vS32b_new_ai      : T_vstore_new_ai_64B <"vS32b_ai">, V6_vS32b_new_ai_enc;
-def V6_vS32b_new_ai_128B : T_vstore_new_ai_128B <"vS32b_ai">,
-                           V6_vS32b_new_ai_128B_enc;
-
-let isNonTemporal = 1 in {
-  def V6_vS32b_nt_new_ai      : T_vstore_new_ai_64B<"vS32b_ai", 1>,
-                                V6_vS32b_nt_new_ai_enc;
-  def V6_vS32b_nt_new_ai_128B : T_vstore_new_ai_128B<"vS32b_ai", 1>,
-                                V6_vS32b_nt_new_ai_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Vector stores with base + immediate offset - conditional
-//===----------------------------------------------------------------------===//
-let addrMode = BaseImmOffset, isPredicated = 1 in
-class T_vstore_pred_ai <string mnemonic, string baseOp, Operand ImmOp,
-                        RegisterClass RC, bit isPredNot = 0, bit isNT = 0>
-  : V6_STInst <(outs),
-               (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
-    "if ("#!if(isPredNot, "!", "")#"$src1) "
-     #mnemonic#"($src2+#$src3)"#!if(isNT, ":nt", "")#" = $src4">, NewValueRel {
-  let isPredicatedFalse = isPredNot;
-  let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_pred_ai_64B <string mnemonic, string baseOp,
-                            bit isPredNot = 0, bit isNT = 0>
-  : T_vstore_pred_ai <mnemonic, baseOp, s4_6Imm, VectorRegs, isPredNot, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_pred_ai_128B <string mnemonic, string baseOp,
-                             bit isPredNot = 0, bit isNT = 0>
-  : T_vstore_pred_ai <mnemonic, baseOp#"128B", s4_7Imm, VectorRegs128B,
-                      isPredNot, isNT>;
-
-let isNVStorable = 1 in {
-  def V6_vS32b_pred_ai     : T_vstore_pred_ai_64B <"vmem", "vS32b_ai">,
-                             V6_vS32b_pred_ai_enc;
-  def V6_vS32b_npred_ai    : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 1>,
-                             V6_vS32b_npred_ai_enc;
-  // 128B
-  def V6_vS32b_pred_ai_128B    : T_vstore_pred_ai_128B <"vmem", "vS32b_ai">,
-                                 V6_vS32b_pred_ai_128B_enc;
-  def V6_vS32b_npred_ai_128B   : T_vstore_pred_ai_128B <"vmem", "vS32b_ai", 1>,
-                                 V6_vS32b_npred_ai_128B_enc;
-}
-
-
-let isNVStorable = 1, isNonTemporal = 1 in {
-  def V6_vS32b_nt_pred_ai  : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 0, 1>,
-                             V6_vS32b_nt_pred_ai_enc;
-  def V6_vS32b_nt_npred_ai : T_vstore_pred_ai_64B <"vmem", "vS32b_ai", 1, 1>,
-                             V6_vS32b_nt_npred_ai_enc;
-  // 128B
-  def V6_vS32b_nt_pred_ai_128B  : T_vstore_pred_ai_128B
-                                  <"vmem", "vS32b_ai", 0, 1>,
-                                  V6_vS32b_nt_pred_ai_128B_enc;
-  def V6_vS32b_nt_npred_ai_128B : T_vstore_pred_ai_128B
-                                  <"vmem", "vS32b_ai", 1, 1>,
-                                  V6_vS32b_nt_npred_ai_128B_enc;
-}
-
-let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
-  def V6_vS32Ub_pred_ai  : T_vstore_pred_ai_64B <"vmemu", "vS32Ub_ai">,
-                           V6_vS32Ub_pred_ai_enc;
-  def V6_vS32Ub_npred_ai : T_vstore_pred_ai_64B <"vmemu", "vS32Ub_ai", 1>,
-                           V6_vS32Ub_npred_ai_enc;
-  // 128B
-  def V6_vS32Ub_pred_ai_128B  :T_vstore_pred_ai_128B <"vmemu", "vS32Ub_ai">,
-                               V6_vS32Ub_pred_ai_128B_enc;
-  def V6_vS32Ub_npred_ai_128B :T_vstore_pred_ai_128B <"vmemu", "vS32Ub_ai", 1>,
-                               V6_vS32Ub_npred_ai_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Vector stores with base + immediate offset - byte-enabled aligned
-//===----------------------------------------------------------------------===//
-let addrMode = BaseImmOffset in
-class T_vstore_qpred_ai <Operand ImmOp, RegisterClass RC,
-                         bit isPredNot = 0, bit isNT = 0>
-  : V6_STInst <(outs),
-               (ins VecPredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
-    "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2+#$src3)"
-          #!if(isNT, ":nt", "")#" = $src4"> {
-  let isPredicatedFalse = isPredNot;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_qpred_ai_64B <bit isPredNot = 0, bit isNT = 0>
-  : T_vstore_qpred_ai <s4_6Imm, VectorRegs, isPredNot, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_qpred_ai_128B <bit isPredNot = 0, bit isNT = 0>
-  : T_vstore_qpred_ai <s4_7Imm, VectorRegs128B, isPredNot, isNT>;
-
-def V6_vS32b_qpred_ai  : T_vstore_qpred_ai_64B, V6_vS32b_qpred_ai_enc;
-def V6_vS32b_nqpred_ai : T_vstore_qpred_ai_64B <1>,
-                         V6_vS32b_nqpred_ai_enc;
-def V6_vS32b_nt_qpred_ai  : T_vstore_qpred_ai_64B <0, 1>,
-                            V6_vS32b_nt_qpred_ai_enc;
-def V6_vS32b_nt_nqpred_ai : T_vstore_qpred_ai_64B <1, 1>,
-                            V6_vS32b_nt_nqpred_ai_enc;
-// 128B
-def V6_vS32b_qpred_ai_128B  : T_vstore_qpred_ai_128B, V6_vS32b_qpred_ai_128B_enc;
-def V6_vS32b_nqpred_ai_128B : T_vstore_qpred_ai_128B<1>,
-                              V6_vS32b_nqpred_ai_128B_enc;
-def V6_vS32b_nt_qpred_ai_128B  : T_vstore_qpred_ai_128B<0, 1>,
-                                 V6_vS32b_nt_qpred_ai_128B_enc;
-def V6_vS32b_nt_nqpred_ai_128B : T_vstore_qpred_ai_128B<1, 1>,
-                                 V6_vS32b_nt_nqpred_ai_128B_enc;
-
-
-//===----------------------------------------------------------------------===//
-// Vector stores with base + immediate offset - conditional new
-//===----------------------------------------------------------------------===//
-let addrMode = BaseImmOffset, isPredicated = 1, isNewValue = 1, opNewValue = 3,
-    isNVStore = 1, Type = TypeCVI_VM_NEW_ST, Itinerary = CVI_VM_NEW_ST in
-class T_vstore_new_pred_ai <string baseOp, Operand ImmOp, RegisterClass RC,
-                            bit isPredNot, bit isNT>
-  : V6_STInst <(outs),
-               (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
-    "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2+#$src3)"
-         #!if(isNT, ":nt", "")#" = $src4.new">, NewValueRel {
-  let isPredicatedFalse = isPredNot;
-  let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_new_pred_ai_64B <string baseOp, bit isPredNot = 0, bit isNT = 0>
-  : T_vstore_new_pred_ai <baseOp, s4_6Imm, VectorRegs, isPredNot, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_new_pred_ai_128B <string baseOp, bit isPredNot = 0, bit isNT = 0>
-  : T_vstore_new_pred_ai <baseOp#"128B", s4_7Imm, VectorRegs128B,
-                          isPredNot, isNT>;
-
-
-def V6_vS32b_new_pred_ai     : T_vstore_new_pred_ai_64B <"vS32b_ai">,
-                               V6_vS32b_new_pred_ai_enc;
-def V6_vS32b_new_npred_ai    : T_vstore_new_pred_ai_64B <"vS32b_ai", 1>,
-                               V6_vS32b_new_npred_ai_enc;
-// 128B
-def V6_vS32b_new_pred_ai_128B     : T_vstore_new_pred_ai_128B <"vS32b_ai">,
-                                    V6_vS32b_new_pred_ai_128B_enc;
-def V6_vS32b_new_npred_ai_128B    : T_vstore_new_pred_ai_128B <"vS32b_ai", 1>,
-                                    V6_vS32b_new_npred_ai_128B_enc;
-let isNonTemporal = 1 in {
-  def V6_vS32b_nt_new_pred_ai  : T_vstore_new_pred_ai_64B <"vS32b_ai", 0, 1>,
-                                 V6_vS32b_nt_new_pred_ai_enc;
-  def V6_vS32b_nt_new_npred_ai : T_vstore_new_pred_ai_64B <"vS32b_ai", 1, 1>,
-                                 V6_vS32b_nt_new_npred_ai_enc;
-  // 128B
-  def V6_vS32b_nt_new_pred_ai_128B  : T_vstore_new_pred_ai_128B
-                                      <"vS32b_ai", 0, 1>,
-                                      V6_vS32b_nt_new_pred_ai_128B_enc;
-  def V6_vS32b_nt_new_npred_ai_128B : T_vstore_new_pred_ai_128B
-                                      <"vS32b_ai", 1, 1>,
-                                      V6_vS32b_nt_new_npred_ai_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment vector loads with immediate offset.
-//===----------------------------------------------------------------------===//
-let addrMode = PostInc, hasNewValue = 1 in
-class T_vload_pi<string asmStr, Operand ImmOp, RegisterClass RC>
-  : V6_LDInst <(outs RC:$dst, IntRegs:$_dst_),
-               (ins IntRegs:$src1, ImmOp:$src2), asmStr, [],
-    "$src1 = $_dst_">;
-
-let accessSize = Vector64Access in
-class T_vload_pi_64B <string asmStr>
-  : T_vload_pi <asmStr, s3_6Imm, VectorRegs>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vload_pi_128B <string asmStr>
-  : T_vload_pi <asmStr, s3_7Imm, VectorRegs128B>;
-
-let isCVLoadable = 1 in {
-  def V6_vL32b_pi    : T_vload_pi_64B <"$dst = vmem($src1++#$src2)">,
-                       V6_vL32b_pi_enc;
-  def V6_vL32b_nt_pi : T_vload_pi_64B <"$dst = vmem($src1++#$src2):nt">,
-                       V6_vL32b_nt_pi_enc;
-  // 128B
-  def V6_vL32b_pi_128B    : T_vload_pi_128B <"$dst = vmem($src1++#$src2)">,
-                            V6_vL32b_pi_128B_enc;
-  def V6_vL32b_nt_pi_128B : T_vload_pi_128B <"$dst = vmem($src1++#$src2):nt">,
-                            V6_vL32b_nt_pi_128B_enc;
-}
-
-let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU in {
-  def V6_vL32Ub_pi : T_vload_pi_64B <"$dst = vmemu($src1++#$src2)">,
-                     V6_vL32Ub_pi_enc;
-  // 128B
-  def V6_vL32Ub_pi_128B : T_vload_pi_128B <"$dst = vmemu($src1++#$src2)">,
-                          V6_vL32Ub_pi_128B_enc;
-}
-
-let isCVLoad = 1, Itinerary = CVI_VM_LD, Type = TypeCVI_VM_LD in {
-  def V6_vL32b_cur_pi    : T_vload_pi_64B <"$dst.cur = vmem($src1++#$src2)">,
-                           V6_vL32b_cur_pi_enc;
-  def V6_vL32b_nt_cur_pi : T_vload_pi_64B <"$dst.cur = vmem($src1++#$src2):nt">,
-                           V6_vL32b_nt_cur_pi_enc;
-  // 128B
-  def V6_vL32b_cur_pi_128B    : T_vload_pi_128B
-                                <"$dst.cur = vmem($src1++#$src2)">,
-                                V6_vL32b_cur_pi_128B_enc;
-  def V6_vL32b_nt_cur_pi_128B : T_vload_pi_128B
-                                <"$dst.cur = vmem($src1++#$src2):nt">,
-                                V6_vL32b_nt_cur_pi_128B_enc;
-}
-
-let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in {
-  def V6_vL32b_tmp_pi    : T_vload_pi_64B <"$dst.tmp = vmem($src1++#$src2)">,
-                           V6_vL32b_tmp_pi_enc;
-  def V6_vL32b_nt_tmp_pi : T_vload_pi_64B <"$dst.tmp = vmem($src1++#$src2):nt">,
-                           V6_vL32b_nt_tmp_pi_enc;
-  //128B
-  def V6_vL32b_tmp_pi_128B    : T_vload_pi_128B
-                                <"$dst.tmp = vmem($src1++#$src2)">,
-                                V6_vL32b_tmp_pi_128B_enc;
-  def V6_vL32b_nt_tmp_pi_128B : T_vload_pi_128B
-                                <"$dst.tmp = vmem($src1++#$src2):nt">,
-                                V6_vL32b_nt_tmp_pi_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment vector stores with immediate offset.
-//===----------------------------------------------------------------------===//
-let addrMode = PostInc, isPredicable = 1 in
-class T_vstore_pi <string mnemonic, string baseOp, Operand ImmOp,
-                   RegisterClass RC, bit isNT>
-  : V6_STInst <(outs IntRegs:$_dst_),
-               (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
-    mnemonic#"($src1++#$src2)"#!if(isNT, ":nt", "")#" = $src3", [],
-    "$src1 = $_dst_">, NewValueRel {
-  let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_pi_64B <string mnemonic, string baseOp, bit isNT = 0>
-  : T_vstore_pi <mnemonic, baseOp, s3_6Imm, VectorRegs, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_pi_128B <string mnemonic, string baseOp, bit isNT = 0>
-  : T_vstore_pi <mnemonic, baseOp#"128B", s3_7Imm, VectorRegs128B, isNT>;
-
-let isNVStorable = 1 in {
-  def V6_vS32b_pi      : T_vstore_pi_64B <"vmem", "vS32b_pi">, V6_vS32b_pi_enc;
-  def V6_vS32b_pi_128B : T_vstore_pi_128B <"vmem", "vS32b_pi">,
-                         V6_vS32b_pi_128B_enc;
-}
-
-let isNVStorable = 1 , isNonTemporal = 1  in {
-  def V6_vS32b_nt_pi      : T_vstore_pi_64B <"vmem", "vS32b_pi", 1>,
-                            V6_vS32b_nt_pi_enc;
-  def V6_vS32b_nt_pi_128B : T_vstore_pi_128B <"vmem", "vS32b_pi", 1>,
-                            V6_vS32b_nt_pi_128B_enc;
-}
-
-
-let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
-  def V6_vS32Ub_pi      : T_vstore_pi_64B <"vmemu", "vS32Ub_pi">,
-                          V6_vS32Ub_pi_enc;
-  def V6_vS32Ub_pi_128B : T_vstore_pi_128B <"vmemu", "vS32Ub_pi">,
-                          V6_vS32Ub_pi_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment unconditional .new vector stores with immediate offset.
-//===----------------------------------------------------------------------===//
-let addrMode = PostInc, isNVStore = 1 in
-let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isNewValue = 1,
-    isPredicable = 1, opNewValue = 3, isNVStore = 1 in
-class T_vstore_new_pi <string baseOp, Operand ImmOp, RegisterClass RC, bit isNT>
-  : V6_STInst <(outs IntRegs:$_dst_),
-               (ins IntRegs:$src1, ImmOp:$src2, RC:$src3),
-    "vmem($src1++#$src2)"#!if(isNT, ":nt", "")#" = $src3.new", [],
-    "$src1 = $_dst_">, NewValueRel {
-  let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_new_pi_64B <string baseOp, bit isNT = 0>
-  : T_vstore_new_pi <baseOp, s3_6Imm, VectorRegs, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_new_pi_128B <string baseOp, bit isNT = 0>
-  : T_vstore_new_pi <baseOp#"128B", s3_7Imm, VectorRegs128B, isNT>;
-
-
-def V6_vS32b_new_pi      : T_vstore_new_pi_64B <"vS32b_pi">,
-                           V6_vS32b_new_pi_enc;
-def V6_vS32b_new_pi_128B : T_vstore_new_pi_128B <"vS32b_pi">,
-                           V6_vS32b_new_pi_128B_enc;
-
-let isNonTemporal = 1 in {
-  def V6_vS32b_nt_new_pi      : T_vstore_new_pi_64B <"vS32b_pi", 1>,
-                                V6_vS32b_nt_new_pi_enc;
-  def V6_vS32b_nt_new_pi_128B : T_vstore_new_pi_128B <"vS32b_pi", 1>,
-                                V6_vS32b_nt_new_pi_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment conditional vector stores with immediate offset
-//===----------------------------------------------------------------------===//
-let isPredicated = 1, addrMode = PostInc in
-class T_vstore_pred_pi <string mnemonic, string baseOp, Operand ImmOp,
-                        RegisterClass RC, bit isPredNot, bit isNT>
-  : V6_STInst<(outs IntRegs:$_dst_),
-             (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
-    "if ("#!if(isPredNot, "!", "")#"$src1) "#mnemonic#"($src2++#$src3)"
-          #!if(isNT, ":nt", "")#" = $src4", [],
-    "$src2 = $_dst_">, NewValueRel {
-  let isPredicatedFalse = isPredNot;
-  let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_pred_pi_64B <string mnemonic, string baseOp,
-                            bit isPredNot = 0, bit isNT = 0>
-  : T_vstore_pred_pi <mnemonic, baseOp, s3_6Imm, VectorRegs, isPredNot, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_pred_pi_128B <string mnemonic, string baseOp,
-                             bit isPredNot = 0, bit isNT = 0>
-  : T_vstore_pred_pi <mnemonic, baseOp#"128B", s3_7Imm, VectorRegs128B,
-                      isPredNot, isNT>;
-
-let isNVStorable = 1 in {
-  def V6_vS32b_pred_pi     : T_vstore_pred_pi_64B <"vmem", "vS32b_pi">,
-                             V6_vS32b_pred_pi_enc;
-  def V6_vS32b_npred_pi    : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 1>,
-                             V6_vS32b_npred_pi_enc;
-  // 128B
-  def V6_vS32b_pred_pi_128B  : T_vstore_pred_pi_128B <"vmem", "vS32b_pi">,
-                               V6_vS32b_pred_pi_128B_enc;
-  def V6_vS32b_npred_pi_128B : T_vstore_pred_pi_128B <"vmem", "vS32b_pi", 1>,
-                               V6_vS32b_npred_pi_128B_enc;
-}
-let isNVStorable = 1, isNonTemporal = 1 in {
-  def V6_vS32b_nt_pred_pi  : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 0, 1>,
-                             V6_vS32b_nt_pred_pi_enc;
-  def V6_vS32b_nt_npred_pi : T_vstore_pred_pi_64B <"vmem", "vS32b_pi", 1, 1>,
-                             V6_vS32b_nt_npred_pi_enc;
-  // 128B
-  def V6_vS32b_nt_pred_pi_128B  : T_vstore_pred_pi_128B
-                                  <"vmem", "vS32b_pi", 0, 1>,
-                                  V6_vS32b_nt_pred_pi_128B_enc;
-  def V6_vS32b_nt_npred_pi_128B : T_vstore_pred_pi_128B
-                                  <"vmem", "vS32b_pi", 1, 1>,
-                                  V6_vS32b_nt_npred_pi_128B_enc;
-}
-
-let Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in {
-  def V6_vS32Ub_pred_pi  : T_vstore_pred_pi_64B <"vmemu", "vS32Ub_pi">,
-                           V6_vS32Ub_pred_pi_enc;
-  def V6_vS32Ub_npred_pi : T_vstore_pred_pi_64B <"vmemu", "vS32Ub_pi", 1>,
-                           V6_vS32Ub_npred_pi_enc;
-  // 128B
-  def V6_vS32Ub_pred_pi_128B  : T_vstore_pred_pi_128B <"vmemu", "vS32Ub_pi">,
-                                V6_vS32Ub_pred_pi_128B_enc;
-  def V6_vS32Ub_npred_pi_128B : T_vstore_pred_pi_128B <"vmemu", "vS32Ub_pi", 1>,
-                                V6_vS32Ub_npred_pi_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment vector stores with immediate offset - byte-enabled aligned
-//===----------------------------------------------------------------------===//
-let addrMode = PostInc in
-class T_vstore_qpred_pi <Operand ImmOp, RegisterClass RC, bit isPredNot = 0,
-                         bit isNT = 0>
-  : V6_STInst <(outs IntRegs:$_dst_),
-               (ins VecPredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
-    "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2++#$src3)"
-          #!if(isNT, ":nt", "")#" = $src4", [],
-    "$src2 = $_dst_">;
-
-let accessSize = Vector64Access in
-class T_vstore_qpred_pi_64B <bit isPredNot = 0, bit isNT = 0>
-  : T_vstore_qpred_pi <s3_6Imm, VectorRegs, isPredNot, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_qpred_pi_128B <bit isPredNot = 0, bit isNT = 0>
-  : T_vstore_qpred_pi <s3_7Imm, VectorRegs128B, isPredNot, isNT>;
-
-def V6_vS32b_qpred_pi  : T_vstore_qpred_pi_64B, V6_vS32b_qpred_pi_enc;
-def V6_vS32b_nqpred_pi : T_vstore_qpred_pi_64B <1>, V6_vS32b_nqpred_pi_enc;
-// 128B
-def V6_vS32b_qpred_pi_128B  : T_vstore_qpred_pi_128B,
-                              V6_vS32b_qpred_pi_128B_enc;
-def V6_vS32b_nqpred_pi_128B : T_vstore_qpred_pi_128B<1>,
-                              V6_vS32b_nqpred_pi_128B_enc;
-
-let isNonTemporal = 1 in {
-  def V6_vS32b_nt_qpred_pi  : T_vstore_qpred_pi_64B <0, 1>,
-                              V6_vS32b_nt_qpred_pi_enc;
-  def V6_vS32b_nt_nqpred_pi : T_vstore_qpred_pi_64B <1, 1>,
-                              V6_vS32b_nt_nqpred_pi_enc;
-  // 128B
-  def V6_vS32b_nt_qpred_pi_128B  : T_vstore_qpred_pi_128B<0, 1>,
-                                   V6_vS32b_nt_qpred_pi_128B_enc;
-  def V6_vS32b_nt_nqpred_pi_128B : T_vstore_qpred_pi_128B<1, 1>,
-                                   V6_vS32b_nt_nqpred_pi_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment conditional .new vector stores with immediate offset
-//===----------------------------------------------------------------------===//
-let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isPredicated = 1,
-    isNewValue = 1, opNewValue = 4, addrMode = PostInc, isNVStore = 1 in
-class T_vstore_new_pred_pi <string baseOp, Operand ImmOp, RegisterClass RC,
-                            bit isPredNot, bit isNT>
-  : V6_STInst <(outs IntRegs:$_dst_),
-               (ins PredRegs:$src1, IntRegs:$src2, ImmOp:$src3, RC:$src4),
-    "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2++#$src3)"
-         #!if(isNT, ":nt", "")#" = $src4.new", [],
-    "$src2 = $_dst_"> , NewValueRel {
-  let isPredicatedFalse = isPredNot;
-  let BaseOpcode = baseOp;
-}
-
-let accessSize = Vector64Access in
-class T_vstore_new_pred_pi_64B <string baseOp, bit isPredNot = 0, bit isNT = 0>
-  : T_vstore_new_pred_pi <baseOp, s3_6Imm, VectorRegs, isPredNot, isNT>;
-
-let isCodeGenOnly = 1, accessSize = Vector128Access in
-class T_vstore_new_pred_pi_128B <string baseOp, bit isPredNot = 0, bit isNT = 0>
-  : T_vstore_new_pred_pi <baseOp#"128B", s3_7Imm, VectorRegs128B,
-                          isPredNot, isNT>;
-
-def V6_vS32b_new_pred_pi     : T_vstore_new_pred_pi_64B <"vS32b_pi">,
-                               V6_vS32b_new_pred_pi_enc;
-def V6_vS32b_new_npred_pi    : T_vstore_new_pred_pi_64B <"vS32b_pi", 1>,
-                               V6_vS32b_new_npred_pi_enc;
-// 128B
-def V6_vS32b_new_pred_pi_128B    : T_vstore_new_pred_pi_128B <"vS32b_pi">,
-                                   V6_vS32b_new_pred_pi_128B_enc;
-def V6_vS32b_new_npred_pi_128B   : T_vstore_new_pred_pi_128B <"vS32b_pi", 1>,
-                                   V6_vS32b_new_npred_pi_128B_enc;
-let isNonTemporal = 1 in {
-  def V6_vS32b_nt_new_pred_pi  : T_vstore_new_pred_pi_64B <"vS32b_pi", 0, 1>,
-                                 V6_vS32b_nt_new_pred_pi_enc;
-  def V6_vS32b_nt_new_npred_pi : T_vstore_new_pred_pi_64B <"vS32b_pi", 1, 1>,
-                                 V6_vS32b_nt_new_npred_pi_enc;
-  // 128B
-  def V6_vS32b_nt_new_pred_pi_128B : T_vstore_new_pred_pi_128B
-                                     <"vS32b_pi", 0, 1>,
-                                     V6_vS32b_nt_new_pred_pi_128B_enc;
-  def V6_vS32b_nt_new_npred_pi_128B : T_vstore_new_pred_pi_128B
-                                      <"vS32b_pi", 1, 1>,
-                                      V6_vS32b_nt_new_npred_pi_128B_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment vector loads with register offset
-//===----------------------------------------------------------------------===//
-let hasNewValue = 1 in
-class T_vload_ppu<string asmStr>
-  : V6_LDInst <(outs VectorRegs:$dst, IntRegs:$_dst_),
-               (ins IntRegs:$src1, ModRegs:$src2), asmStr, [],
-    "$src1 = $_dst_">, NewValueRel;
-
-let isCVLoadable = 1 in {
-  def V6_vL32b_ppu    : T_vload_ppu <"$dst = vmem($src1++$src2)">,
-                        V6_vL32b_ppu_enc;
-  def V6_vL32b_nt_ppu : T_vload_ppu <"$dst = vmem($src1++$src2):nt">,
-                        V6_vL32b_nt_ppu_enc;
-}
-
-let Itinerary = CVI_VM_VP_LDU, Type = TypeCVI_VM_VP_LDU in
-def V6_vL32Ub_ppu : T_vload_ppu <"$dst = vmemu($src1++$src2)">,
-                     V6_vL32Ub_ppu_enc;
-
-let isCVLoad = 1, Itinerary = CVI_VM_CUR_LD, Type = TypeCVI_VM_CUR_LD in {
-  def V6_vL32b_cur_ppu    : T_vload_ppu <"$dst.cur = vmem($src1++$src2)">,
-                             V6_vL32b_cur_ppu_enc;
-  def V6_vL32b_nt_cur_ppu : T_vload_ppu <"$dst.cur = vmem($src1++$src2):nt">,
-                             V6_vL32b_nt_cur_ppu_enc;
-}
-
-let Itinerary = CVI_VM_TMP_LD, Type = TypeCVI_VM_TMP_LD in {
-  def V6_vL32b_tmp_ppu    : T_vload_ppu <"$dst.tmp = vmem($src1++$src2)">,
-                             V6_vL32b_tmp_ppu_enc;
-  def V6_vL32b_nt_tmp_ppu : T_vload_ppu <"$dst.tmp = vmem($src1++$src2):nt">,
-                             V6_vL32b_nt_tmp_ppu_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment vector stores with register offset
-//===----------------------------------------------------------------------===//
-let isPredicable = 1 in
-class T_vstore_ppu <string mnemonic, bit isNT = 0>
-  : V6_STInst <(outs IntRegs:$_dst_),
-               (ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3),
-    mnemonic#"($src1++$src2)"#!if(isNT, ":nt", "")#" = $src3", [],
-    "$src1 = $_dst_">, NewValueRel;
-
-let isNVStorable = 1, BaseOpcode = "vS32b_ppu" in {
-  def V6_vS32b_ppu    : T_vstore_ppu <"vmem">,
-                        V6_vS32b_ppu_enc;
-  let isNonTemporal = 1, BaseOpcode = "vS32b_ppu" in
-  def V6_vS32b_nt_ppu : T_vstore_ppu <"vmem", 1>,
-                        V6_vS32b_nt_ppu_enc;
-}
-
-let BaseOpcode = "vS32Ub_ppu", Itinerary = CVI_VM_STU, Type = TypeCVI_VM_STU in
-def V6_vS32Ub_ppu   : T_vstore_ppu <"vmemu">, V6_vS32Ub_ppu_enc;
-
-//===----------------------------------------------------------------------===//
-// Post increment .new vector stores with register offset
-//===----------------------------------------------------------------------===//
-let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isNewValue = 1,
-    isPredicable = 1, opNewValue = 3, isNVStore = 1 in
-class T_vstore_new_ppu <bit isNT = 0>
-  : V6_STInst <(outs IntRegs:$_dst_),
-               (ins IntRegs:$src1, ModRegs:$src2, VectorRegs:$src3),
-    "vmem($src1++$src2)"#!if(isNT, ":nt", "")#" = $src3.new", [],
-    "$src1 = $_dst_">, NewValueRel;
-
-let BaseOpcode = "vS32b_ppu" in
-def V6_vS32b_new_ppu    : T_vstore_new_ppu, V6_vS32b_new_ppu_enc;
-
-let BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in
-def V6_vS32b_nt_new_ppu : T_vstore_new_ppu<1>, V6_vS32b_nt_new_ppu_enc;
-
-//===----------------------------------------------------------------------===//
-// Post increment conditional .new vector stores with register offset
-//===----------------------------------------------------------------------===//
-let isPredicated = 1 in
-class T_vstore_pred_ppu <string mnemonic, bit isPredNot = 0, bit isNT = 0>
-  : V6_STInst<(outs IntRegs:$_dst_),
-           (ins PredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4),
-    "if ("#!if(isPredNot, "!", "")#"$src1) "#mnemonic#"($src2++$src3)"
-          #!if(isNT, ":nt", "")#" = $src4", [],
-    "$src2 = $_dst_">, NewValueRel {
-  let isPredicatedFalse = isPredNot;
-}
-
-let isNVStorable = 1, BaseOpcode = "vS32b_ppu" in {
-  def V6_vS32b_pred_ppu : T_vstore_pred_ppu<"vmem">, V6_vS32b_pred_ppu_enc;
-  def V6_vS32b_npred_ppu: T_vstore_pred_ppu<"vmem", 1>, V6_vS32b_npred_ppu_enc;
-}
-
-let isNVStorable = 1, BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in {
-  def V6_vS32b_nt_pred_ppu  : T_vstore_pred_ppu <"vmem", 0, 1>,
-                              V6_vS32b_nt_pred_ppu_enc;
-  def V6_vS32b_nt_npred_ppu : T_vstore_pred_ppu <"vmem", 1, 1>,
-                              V6_vS32b_nt_npred_ppu_enc;
-}
-
-let BaseOpcode = "vS32Ub_ppu", Itinerary = CVI_VM_STU,
-    Type = TypeCVI_VM_STU in {
-  def V6_vS32Ub_pred_ppu  : T_vstore_pred_ppu <"vmemu">,
-                            V6_vS32Ub_pred_ppu_enc;
-  def V6_vS32Ub_npred_ppu : T_vstore_pred_ppu <"vmemu", 1>,
-                            V6_vS32Ub_npred_ppu_enc;
-}
-
-//===----------------------------------------------------------------------===//
-// Post increment vector stores with register offset - byte-enabled aligned
-//===----------------------------------------------------------------------===//
-class T_vstore_qpred_ppu <bit isPredNot = 0, bit isNT = 0>
-  : V6_STInst <(outs IntRegs:$_dst_),
-        (ins VecPredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4),
-    "if ("#!if(isPredNot, "!", "")#"$src1) vmem($src2++$src3)"
-          #!if(isNT, ":nt", "")#" = $src4", [],
-    "$src2 = $_dst_">, NewValueRel;
-
-def V6_vS32b_qpred_ppu  : T_vstore_qpred_ppu, V6_vS32b_qpred_ppu_enc;
-def V6_vS32b_nqpred_ppu : T_vstore_qpred_ppu<1>, V6_vS32b_nqpred_ppu_enc;
-def V6_vS32b_nt_qpred_ppu  : T_vstore_qpred_ppu<0, 1>,
-                             V6_vS32b_nt_qpred_ppu_enc;
-def V6_vS32b_nt_nqpred_ppu : T_vstore_qpred_ppu<1, 1>,
-                             V6_vS32b_nt_nqpred_ppu_enc;
-
-//===----------------------------------------------------------------------===//
-// Post increment conditional .new vector stores with register offset
-//===----------------------------------------------------------------------===//
-let Itinerary = CVI_VM_NEW_ST, Type = TypeCVI_VM_NEW_ST, isPredicated = 1,
-    isNewValue = 1, opNewValue = 4, isNVStore = 1 in
-class T_vstore_new_pred_ppu <bit isPredNot = 0, bit isNT = 0>
-  : V6_STInst <(outs IntRegs:$_dst_),
-           (ins PredRegs:$src1, IntRegs:$src2, ModRegs:$src3, VectorRegs:$src4),
-    "if("#!if(isPredNot, "!", "")#"$src1) vmem($src2++$src3)"
-         #!if(isNT, ":nt", "")#" = $src4.new", [],
-    "$src2 = $_dst_">, NewValueRel {
-  let isPredicatedFalse = isPredNot;
-}
-
-let BaseOpcode = "vS32b_ppu" in {
-  def V6_vS32b_new_pred_ppu  : T_vstore_new_pred_ppu,
-                               V6_vS32b_new_pred_ppu_enc;
-  def V6_vS32b_new_npred_ppu : T_vstore_new_pred_ppu<1>,
-                               V6_vS32b_new_npred_ppu_enc;
-}
-
-let BaseOpcode = "vS32b_ppu", isNonTemporal = 1 in {
-def V6_vS32b_nt_new_pred_ppu :  T_vstore_new_pred_ppu<0, 1>,
-                                V6_vS32b_nt_new_pred_ppu_enc;
-def V6_vS32b_nt_new_npred_ppu : T_vstore_new_pred_ppu<1, 1>,
-                                V6_vS32b_nt_new_npred_ppu_enc;
-}
-
-
-// Vector load/store pseudos
-
-let isPseudo = 1, isCodeGenOnly = 1, validSubTargets = HasV60SubT in
-class STrivv_template<RegisterClass RC>
-  : V6_STInst<(outs), (ins IntRegs:$addr, s32_0Imm:$off, RC:$src), "", []>;
-
-def PS_vstorerw_ai: STrivv_template<VecDblRegs>,
-      Requires<[HasV60T,UseHVXSgl]>;
-def PS_vstorerwu_ai: STrivv_template<VecDblRegs>,
-      Requires<[HasV60T,UseHVXSgl]>;
-def PS_vstorerw_ai_128B: STrivv_template<VecDblRegs128B>,
-      Requires<[HasV60T,UseHVXDbl]>;
-def PS_vstorerwu_ai_128B: STrivv_template<VecDblRegs128B>,
-      Requires<[HasV60T,UseHVXDbl]>;
-
-
-let isPseudo = 1, isCodeGenOnly = 1, validSubTargets = HasV60SubT in
-class LDrivv_template<RegisterClass RC>
-  : V6_LDInst<(outs RC:$dst), (ins IntRegs:$addr, s32_0Imm:$off), "", []>;
-
-def PS_vloadrw_ai: LDrivv_template<VecDblRegs>,
-      Requires<[HasV60T,UseHVXSgl]>;
-def PS_vloadrwu_ai: LDrivv_template<VecDblRegs>,
-      Requires<[HasV60T,UseHVXSgl]>;
-def PS_vloadrw_ai_128B: LDrivv_template<VecDblRegs128B>,
-      Requires<[HasV60T,UseHVXDbl]>;
-def PS_vloadrwu_ai_128B: LDrivv_template<VecDblRegs128B>,
-      Requires<[HasV60T,UseHVXDbl]>;
-
-// Store vector predicate pseudo.
-let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
-    isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
-  def PS_vstorerq_ai : STInst<(outs),
-              (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs:$src1),
-              ".error \"should not emit\"", []>,
-              Requires<[HasV60T,UseHVXSgl]>;
-  def PS_vstorerq_ai_128B : STInst<(outs),
-              (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs128B:$src1),
-              ".error \"should not emit\"", []>,
-              Requires<[HasV60T,UseHVXDbl]>;
-}
-
-// Load vector predicate pseudo.
-let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
-    opExtentAlign = 2, isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in {
-  def PS_vloadrq_ai : LDInst<(outs VecPredRegs:$dst),
-              (ins IntRegs:$base, s32_0Imm:$offset),
-              ".error \"should not emit\"", []>,
-              Requires<[HasV60T,UseHVXSgl]>;
-  def PS_vloadrq_ai_128B : LDInst<(outs VecPredRegs128B:$dst),
-              (ins IntRegs:$base, s32_0Imm:$offset),
-              ".error \"should not emit\"", []>,
-              Requires<[HasV60T,UseHVXDbl]>;
-}
-
-class VSELInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
-              string cstr = "", InstrItinClass itin = CVI_VA_DV,
-              IType type = TypeCVI_VA_DV>
-  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
-
-let isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in {
-  def PS_vselect: VSELInst<(outs VectorRegs:$dst),
-        (ins PredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3), "", []>,
-        Requires<[HasV60T,UseHVXSgl]>;
-  def PS_vselect_128B: VSELInst<(outs VectorRegs128B:$dst),
-        (ins PredRegs:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3),
-        "", []>, Requires<[HasV60T,UseHVXDbl]>;
-  def PS_wselect: VSELInst<(outs VecDblRegs:$dst),
-        (ins PredRegs:$src1, VecDblRegs:$src2, VecDblRegs:$src3), "", []>,
-        Requires<[HasV60T,UseHVXSgl]>;
-  def PS_wselect_128B: VSELInst<(outs VecDblRegs128B:$dst),
-        (ins PredRegs:$src1, VecDblRegs128B:$src2, VecDblRegs128B:$src3),
-        "", []>, Requires<[HasV60T,UseHVXDbl]>;
-}
-
-let hasNewValue = 1 in
-class T_vmpy <string asmString, RegisterClass RCout, RegisterClass RCin>
-  : CVI_VX_DV_Resource1<(outs RCout:$dst), (ins RCin:$src1, IntRegs:$src2),
-    asmString >;
-
-multiclass T_vmpy <string asmString, RegisterClass RCout,
-                        RegisterClass RCin> {
-  def NAME : T_vmpy <asmString, RCout, RCin>;
-  let isCodeGenOnly = 1 in
-  def NAME#_128B : T_vmpy <asmString, !cast<RegisterClass>(RCout#"128B"),
-                                      !cast<RegisterClass>(RCin#"128B")>;
-}
-
-multiclass T_vmpy_VV <string asmString>:
-  T_vmpy <asmString, VectorRegs, VectorRegs>;
-
-multiclass T_vmpy_WW <string asmString>:
-  T_vmpy <asmString, VecDblRegs, VecDblRegs>;
-
-multiclass T_vmpy_VW <string asmString>:
-  T_vmpy <asmString, VectorRegs, VecDblRegs>;
-
-multiclass T_vmpy_WV <string asmString>:
-  T_vmpy <asmString, VecDblRegs, VectorRegs>;
-
-defm V6_vtmpyb   :T_vmpy_WW<"$dst.h = vtmpy($src1.b,$src2.b)">, V6_vtmpyb_enc;
-defm V6_vtmpybus :T_vmpy_WW<"$dst.h = vtmpy($src1.ub,$src2.b)">, V6_vtmpybus_enc;
-defm V6_vdsaduh  :T_vmpy_WW<"$dst.uw = vdsad($src1.uh,$src2.uh)">, V6_vdsaduh_enc;
-defm V6_vmpybus  :T_vmpy_WV<"$dst.h = vmpy($src1.ub,$src2.b)">, V6_vmpybus_enc;
-defm V6_vmpabus  :T_vmpy_WW<"$dst.h = vmpa($src1.ub,$src2.b)">, V6_vmpabus_enc;
-defm V6_vmpahb   :T_vmpy_WW<"$dst.w = vmpa($src1.h,$src2.b)">, V6_vmpahb_enc;
-defm V6_vmpyh    :T_vmpy_WV<"$dst.w = vmpy($src1.h,$src2.h)">, V6_vmpyh_enc;
-defm V6_vmpyuh   :T_vmpy_WV<"$dst.uw = vmpy($src1.uh,$src2.uh)">, V6_vmpyuh_enc;
-defm V6_vmpyiwh  :T_vmpy_VV<"$dst.w = vmpyi($src1.w,$src2.h)">, V6_vmpyiwh_enc;
-defm V6_vtmpyhb  :T_vmpy_WW<"$dst.w = vtmpy($src1.h,$src2.b)">, V6_vtmpyhb_enc;
-defm V6_vmpyub   :T_vmpy_WV<"$dst.uh = vmpy($src1.ub,$src2.ub)">, V6_vmpyub_enc;
-
-let Itinerary = CVI_VX_LONG, Type = TypeCVI_VX in
-defm V6_vmpyihb  :T_vmpy_VV<"$dst.h = vmpyi($src1.h,$src2.b)">, V6_vmpyihb_enc;
-
-defm V6_vdmpybus_dv :
-     T_vmpy_WW <"$dst.h = vdmpy($src1.ub,$src2.b)">, V6_vdmpybus_dv_enc;
-defm V6_vdmpyhsusat :
-     T_vmpy_VV <"$dst.w = vdmpy($src1.h,$src2.uh):sat">, V6_vdmpyhsusat_enc;
-defm V6_vdmpyhsuisat :
-     T_vmpy_VW <"$dst.w = vdmpy($src1.h,$src2.uh,#1):sat">, V6_vdmpyhsuisat_enc;
-defm V6_vdmpyhsat :
-     T_vmpy_VV <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhsat_enc;
-defm V6_vdmpyhisat :
-     T_vmpy_VW <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhisat_enc;
-defm V6_vdmpyhb_dv :
-     T_vmpy_WW <"$dst.w = vdmpy($src1.h,$src2.b)">, V6_vdmpyhb_dv_enc;
-defm V6_vmpyhss :
-     T_vmpy_VV <"$dst.h = vmpy($src1.h,$src2.h):<<1:sat">, V6_vmpyhss_enc;
-defm V6_vmpyhsrs :
-     T_vmpy_VV <"$dst.h = vmpy($src1.h,$src2.h):<<1:rnd:sat">, V6_vmpyhsrs_enc;
-
-let Itinerary = CVI_VP, Type = TypeCVI_VP in
-defm V6_vror : T_vmpy_VV <"$dst = vror($src1,$src2)">, V6_vror_enc;
-
-let Itinerary = CVI_VX, Type = TypeCVI_VX in {
-defm V6_vdmpyhb  : T_vmpy_VV<"$dst.w = vdmpy($src1.h,$src2.b)">, V6_vdmpyhb_enc;
-defm V6_vrmpybus : T_vmpy_VV<"$dst.w = vrmpy($src1.ub,$src2.b)">, V6_vrmpybus_enc;
-defm V6_vdmpybus : T_vmpy_VV<"$dst.h = vdmpy($src1.ub,$src2.b)">, V6_vdmpybus_enc;
-defm V6_vmpyiwb  : T_vmpy_VV<"$dst.w = vmpyi($src1.w,$src2.b)">, V6_vmpyiwb_enc;
-defm V6_vrmpyub : T_vmpy_VV<"$dst.uw = vrmpy($src1.ub,$src2.ub)">, V6_vrmpyub_enc;
-}
-
-let Itinerary = CVI_VS, Type = TypeCVI_VS in {
-defm V6_vasrw  : T_vmpy_VV <"$dst.w = vasr($src1.w,$src2)">, V6_vasrw_enc;
-defm V6_vasrh  : T_vmpy_VV <"$dst.h = vasr($src1.h,$src2)">, V6_vasrh_enc;
-defm V6_vaslw  : T_vmpy_VV <"$dst.w = vasl($src1.w,$src2)">, V6_vaslw_enc;
-defm V6_vaslh  : T_vmpy_VV <"$dst.h = vasl($src1.h,$src2)">, V6_vaslh_enc;
-defm V6_vlsrw  : T_vmpy_VV <"$dst.uw = vlsr($src1.uw,$src2)">, V6_vlsrw_enc;
-defm V6_vlsrh  : T_vmpy_VV <"$dst.uh = vlsr($src1.uh,$src2)">, V6_vlsrh_enc;
-}
-
-let hasNewValue = 1 in
-class T_HVX_alu <string asmString, InstrItinClass itin,
-                 RegisterClass RCout, RegisterClass RCin>
-  : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1, RCin:$src2),
-    asmString >{
-  let Itinerary = itin;
-  let Type = !cast<IType>("Type"#itin);
-}
-
-multiclass T_HVX_alu <string asmString, RegisterClass RCout,
-           RegisterClass RCin, InstrItinClass itin> {
-  def NAME : T_HVX_alu <asmString, itin, RCout, RCin>;
-  let isCodeGenOnly = 1 in
-  def NAME#_128B : T_HVX_alu <asmString, itin,
-                              !cast<RegisterClass>(RCout#"128B"),
-                              !cast<RegisterClass>(RCin#"128B")>;
-}
-
-multiclass T_HVX_alu_VV <string asmString>:
-  T_HVX_alu <asmString, VectorRegs, VectorRegs, CVI_VA>;
-
-multiclass T_HVX_alu_WW <string asmString>:
-  T_HVX_alu <asmString, VecDblRegs, VecDblRegs, CVI_VA_DV>;
-
-multiclass T_HVX_alu_WV <string asmString>:
-  T_HVX_alu <asmString, VecDblRegs, VectorRegs, CVI_VX_DV>;
-
-
-let Itinerary  =  CVI_VX, Type  =  TypeCVI_VX in {
-defm V6_vrmpyubv :
-     T_HVX_alu_VV <"$dst.uw = vrmpy($src1.ub,$src2.ub)">, V6_vrmpyubv_enc;
-defm V6_vrmpybv :
-     T_HVX_alu_VV <"$dst.w = vrmpy($src1.b,$src2.b)">, V6_vrmpybv_enc;
-defm V6_vrmpybusv :
-     T_HVX_alu_VV <"$dst.w = vrmpy($src1.ub,$src2.b)">, V6_vrmpybusv_enc;
-defm V6_vabsdiffub :
-     T_HVX_alu_VV <"$dst.ub = vabsdiff($src1.ub,$src2.ub)">, V6_vabsdiffub_enc;
-defm V6_vabsdiffh :
-     T_HVX_alu_VV <"$dst.uh = vabsdiff($src1.h,$src2.h)">, V6_vabsdiffh_enc;
-defm V6_vabsdiffuh :
-     T_HVX_alu_VV <"$dst.uh = vabsdiff($src1.uh,$src2.uh)">, V6_vabsdiffuh_enc;
-defm V6_vabsdiffw :
-     T_HVX_alu_VV <"$dst.uw = vabsdiff($src1.w,$src2.w)">, V6_vabsdiffw_enc;
-}
-
-let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in {
-defm V6_vdmpyhvsat :
-     T_HVX_alu_VV <"$dst.w = vdmpy($src1.h,$src2.h):sat">, V6_vdmpyhvsat_enc;
-defm V6_vmpyhvsrs :
-     T_HVX_alu_VV<"$dst.h = vmpy($src1.h,$src2.h):<<1:rnd:sat">, V6_vmpyhvsrs_enc;
-defm V6_vmpyih :
-     T_HVX_alu_VV <"$dst.h = vmpyi($src1.h,$src2.h)">, V6_vmpyih_enc;
-}
-
-defm V6_vand :
-     T_HVX_alu_VV <"$dst = vand($src1,$src2)">, V6_vand_enc;
-defm V6_vor :
-     T_HVX_alu_VV <"$dst = vor($src1,$src2)">, V6_vor_enc;
-defm V6_vxor :
-     T_HVX_alu_VV <"$dst = vxor($src1,$src2)">, V6_vxor_enc;
-defm V6_vaddw :
-     T_HVX_alu_VV <"$dst.w = vadd($src1.w,$src2.w)">, V6_vaddw_enc;
-defm V6_vaddubsat :
-     T_HVX_alu_VV <"$dst.ub = vadd($src1.ub,$src2.ub):sat">, V6_vaddubsat_enc;
-defm V6_vadduhsat :
-     T_HVX_alu_VV <"$dst.uh = vadd($src1.uh,$src2.uh):sat">, V6_vadduhsat_enc;
-defm V6_vaddhsat :
-     T_HVX_alu_VV <"$dst.h = vadd($src1.h,$src2.h):sat">, V6_vaddhsat_enc;
-defm V6_vaddwsat :
-     T_HVX_alu_VV <"$dst.w = vadd($src1.w,$src2.w):sat">, V6_vaddwsat_enc;
-defm V6_vsubb :
-     T_HVX_alu_VV <"$dst.b = vsub($src1.b,$src2.b)">, V6_vsubb_enc;
-defm V6_vsubh :
-     T_HVX_alu_VV <"$dst.h = vsub($src1.h,$src2.h)">, V6_vsubh_enc;
-defm V6_vsubw :
-     T_HVX_alu_VV <"$dst.w = vsub($src1.w,$src2.w)">, V6_vsubw_enc;
-defm V6_vsububsat :
-     T_HVX_alu_VV <"$dst.ub = vsub($src1.ub,$src2.ub):sat">, V6_vsububsat_enc;
-defm V6_vsubuhsat :
-     T_HVX_alu_VV <"$dst.uh = vsub($src1.uh,$src2.uh):sat">, V6_vsubuhsat_enc;
-defm V6_vsubhsat :
-     T_HVX_alu_VV <"$dst.h = vsub($src1.h,$src2.h):sat">, V6_vsubhsat_enc;
-defm V6_vsubwsat :
-     T_HVX_alu_VV <"$dst.w = vsub($src1.w,$src2.w):sat">, V6_vsubwsat_enc;
-defm V6_vavgub :
-     T_HVX_alu_VV <"$dst.ub = vavg($src1.ub,$src2.ub)">, V6_vavgub_enc;
-defm V6_vavguh :
-     T_HVX_alu_VV <"$dst.uh = vavg($src1.uh,$src2.uh)">, V6_vavguh_enc;
-defm V6_vavgh :
-     T_HVX_alu_VV <"$dst.h = vavg($src1.h,$src2.h)">, V6_vavgh_enc;
-defm V6_vavgw :
-     T_HVX_alu_VV <"$dst.w = vavg($src1.w,$src2.w)">, V6_vavgw_enc;
-defm V6_vnavgub :
-     T_HVX_alu_VV <"$dst.b = vnavg($src1.ub,$src2.ub)">, V6_vnavgub_enc;
-defm V6_vnavgh :
-     T_HVX_alu_VV <"$dst.h = vnavg($src1.h,$src2.h)">, V6_vnavgh_enc;
-defm V6_vnavgw :
-     T_HVX_alu_VV <"$dst.w = vnavg($src1.w,$src2.w)">, V6_vnavgw_enc;
-defm V6_vavgubrnd :
-     T_HVX_alu_VV <"$dst.ub = vavg($src1.ub,$src2.ub):rnd">, V6_vavgubrnd_enc;
-defm V6_vavguhrnd :
-     T_HVX_alu_VV <"$dst.uh = vavg($src1.uh,$src2.uh):rnd">, V6_vavguhrnd_enc;
-defm V6_vavghrnd :
-     T_HVX_alu_VV <"$dst.h = vavg($src1.h,$src2.h):rnd">, V6_vavghrnd_enc;
-defm V6_vavgwrnd :
-     T_HVX_alu_VV <"$dst.w = vavg($src1.w,$src2.w):rnd">, V6_vavgwrnd_enc;
-
-defm V6_vmpybv :
-     T_HVX_alu_WV <"$dst.h = vmpy($src1.b,$src2.b)">, V6_vmpybv_enc;
-defm V6_vmpyubv :
-     T_HVX_alu_WV <"$dst.uh = vmpy($src1.ub,$src2.ub)">, V6_vmpyubv_enc;
-defm V6_vmpybusv :
-     T_HVX_alu_WV <"$dst.h = vmpy($src1.ub,$src2.b)">, V6_vmpybusv_enc;
-defm V6_vmpyhv :
-     T_HVX_alu_WV <"$dst.w = vmpy($src1.h,$src2.h)">, V6_vmpyhv_enc;
-defm V6_vmpyuhv :
-     T_HVX_alu_WV <"$dst.uw = vmpy($src1.uh,$src2.uh)">, V6_vmpyuhv_enc;
-defm V6_vmpyhus :
-     T_HVX_alu_WV <"$dst.w = vmpy($src1.h,$src2.uh)">, V6_vmpyhus_enc;
-defm V6_vaddubh :
-     T_HVX_alu_WV <"$dst.h = vadd($src1.ub,$src2.ub)">, V6_vaddubh_enc;
-defm V6_vadduhw :
-     T_HVX_alu_WV <"$dst.w = vadd($src1.uh,$src2.uh)">, V6_vadduhw_enc;
-defm V6_vaddhw :
-     T_HVX_alu_WV <"$dst.w = vadd($src1.h,$src2.h)">, V6_vaddhw_enc;
-defm V6_vsububh :
-     T_HVX_alu_WV <"$dst.h = vsub($src1.ub,$src2.ub)">, V6_vsububh_enc;
-defm V6_vsubuhw :
-     T_HVX_alu_WV <"$dst.w = vsub($src1.uh,$src2.uh)">, V6_vsubuhw_enc;
-defm V6_vsubhw :
-     T_HVX_alu_WV <"$dst.w = vsub($src1.h,$src2.h)">, V6_vsubhw_enc;
-
-defm V6_vaddb_dv :
-     T_HVX_alu_WW <"$dst.b = vadd($src1.b,$src2.b)">, V6_vaddb_dv_enc;
-defm V6_vaddh_dv :
-     T_HVX_alu_WW <"$dst.h = vadd($src1.h,$src2.h)">, V6_vaddh_dv_enc;
-defm V6_vaddw_dv :
-     T_HVX_alu_WW <"$dst.w = vadd($src1.w,$src2.w)">, V6_vaddw_dv_enc;
-defm V6_vaddubsat_dv :
-     T_HVX_alu_WW <"$dst.ub = vadd($src1.ub,$src2.ub):sat">, V6_vaddubsat_dv_enc;
-defm V6_vadduhsat_dv :
-     T_HVX_alu_WW <"$dst.uh = vadd($src1.uh,$src2.uh):sat">, V6_vadduhsat_dv_enc;
-defm V6_vaddhsat_dv :
-     T_HVX_alu_WW <"$dst.h = vadd($src1.h,$src2.h):sat">, V6_vaddhsat_dv_enc;
-defm V6_vaddwsat_dv :
-     T_HVX_alu_WW <"$dst.w = vadd($src1.w,$src2.w):sat">, V6_vaddwsat_dv_enc;
-defm V6_vsubb_dv :
-     T_HVX_alu_WW <"$dst.b = vsub($src1.b,$src2.b)">, V6_vsubb_dv_enc;
-defm V6_vsubh_dv :
-     T_HVX_alu_WW <"$dst.h = vsub($src1.h,$src2.h)">, V6_vsubh_dv_enc;
-defm V6_vsubw_dv :
-     T_HVX_alu_WW <"$dst.w = vsub($src1.w,$src2.w)">, V6_vsubw_dv_enc;
-defm V6_vsububsat_dv :
-     T_HVX_alu_WW <"$dst.ub = vsub($src1.ub,$src2.ub):sat">, V6_vsububsat_dv_enc;
-defm V6_vsubuhsat_dv :
-     T_HVX_alu_WW <"$dst.uh = vsub($src1.uh,$src2.uh):sat">, V6_vsubuhsat_dv_enc;
-defm V6_vsubhsat_dv :
-     T_HVX_alu_WW <"$dst.h = vsub($src1.h,$src2.h):sat">, V6_vsubhsat_dv_enc;
-defm V6_vsubwsat_dv :
-     T_HVX_alu_WW <"$dst.w = vsub($src1.w,$src2.w):sat">, V6_vsubwsat_dv_enc;
-
-let Itinerary = CVI_VX_DV_LONG, Type = TypeCVI_VX_DV in {
-defm V6_vmpabusv :
-     T_HVX_alu_WW <"$dst.h = vmpa($src1.ub,$src2.b)">, V6_vmpabusv_enc;
-defm V6_vmpabuuv :
-     T_HVX_alu_WW <"$dst.h = vmpa($src1.ub,$src2.ub)">, V6_vmpabuuv_enc;
-}
-
-let isAccumulator = 1, hasNewValue = 1 in
-class T_HVX_vmpyacc <string asmString, InstrItinClass itin, RegisterClass RCout,
-                     RegisterClass RCin1, RegisterClass RCin2>
-  : CVI_VA_Resource1 <(outs RCout:$dst),
-                      (ins RCout:$_src_, RCin1:$src1, RCin2:$src2), asmString,
-                      [], "$dst = $_src_" > {
-  let Itinerary = itin;
-  let Type = !cast<IType>("Type"#itin);
-}
-
-multiclass T_HVX_vmpyacc_both <string asmString, RegisterClass RCout,
-           RegisterClass RCin1, RegisterClass RCin2, InstrItinClass itin > {
-  def NAME : T_HVX_vmpyacc <asmString, itin, RCout, RCin1, RCin2>;
-  let isCodeGenOnly = 1 in
-  def NAME#_128B : T_HVX_vmpyacc <asmString, itin,
-                   !cast<RegisterClass>(RCout#"128B"),
-                   !cast<RegisterClass>(RCin1#"128B"),
-                   !cast<RegisterClass>(RCin2#
-                   !if(!eq (!cast<string>(RCin2), "IntRegs"), "", "128B"))>;
-}
-
-multiclass T_HVX_vmpyacc_VVR <string asmString>:
-  T_HVX_vmpyacc_both <asmString, VectorRegs, VectorRegs, IntRegs, CVI_VX>;
-
-multiclass T_HVX_vmpyacc_VWR <string asmString>:
-  T_HVX_vmpyacc_both <asmString, VectorRegs, VecDblRegs, IntRegs, CVI_VX_DV>;
-
-multiclass T_HVX_vmpyacc_WVR <string asmString>:
-  T_HVX_vmpyacc_both <asmString, VecDblRegs, VectorRegs, IntRegs, CVI_VX_DV>;
-
-multiclass T_HVX_vmpyacc_WWR <string asmString>:
-  T_HVX_vmpyacc_both <asmString, VecDblRegs, VecDblRegs, IntRegs, CVI_VX_DV>;
-
-multiclass T_HVX_vmpyacc_VVV <string asmString>:
-  T_HVX_vmpyacc_both <asmString, VectorRegs, VectorRegs, VectorRegs, CVI_VX_DV>;
-
-multiclass T_HVX_vmpyacc_WVV <string asmString>:
-  T_HVX_vmpyacc_both <asmString, VecDblRegs, VectorRegs, VectorRegs, CVI_VX_DV>;
-
-
-defm V6_vtmpyb_acc :
-     T_HVX_vmpyacc_WWR <"$dst.h += vtmpy($src1.b,$src2.b)">,
-     V6_vtmpyb_acc_enc;
-defm V6_vtmpybus_acc :
-     T_HVX_vmpyacc_WWR <"$dst.h += vtmpy($src1.ub,$src2.b)">,
-     V6_vtmpybus_acc_enc;
-defm V6_vtmpyhb_acc :
-     T_HVX_vmpyacc_WWR <"$dst.w += vtmpy($src1.h,$src2.b)">,
-     V6_vtmpyhb_acc_enc;
-defm V6_vdmpyhb_acc :
-     T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.b)">,
-     V6_vdmpyhb_acc_enc;
-defm V6_vrmpyub_acc :
-     T_HVX_vmpyacc_VVR <"$dst.uw += vrmpy($src1.ub,$src2.ub)">,
-     V6_vrmpyub_acc_enc;
-defm V6_vrmpybus_acc :
-     T_HVX_vmpyacc_VVR <"$dst.w += vrmpy($src1.ub,$src2.b)">,
-     V6_vrmpybus_acc_enc;
-defm V6_vdmpybus_acc :
-     T_HVX_vmpyacc_VVR <"$dst.h += vdmpy($src1.ub,$src2.b)">,
-     V6_vdmpybus_acc_enc;
-defm V6_vdmpybus_dv_acc :
-     T_HVX_vmpyacc_WWR <"$dst.h += vdmpy($src1.ub,$src2.b)">,
-     V6_vdmpybus_dv_acc_enc;
-defm V6_vdmpyhsuisat_acc :
-     T_HVX_vmpyacc_VWR <"$dst.w += vdmpy($src1.h,$src2.uh,#1):sat">,
-     V6_vdmpyhsuisat_acc_enc;
-defm V6_vdmpyhisat_acc :
-     T_HVX_vmpyacc_VWR <"$dst.w += vdmpy($src1.h,$src2.h):sat">,
-     V6_vdmpyhisat_acc_enc;
-defm V6_vdmpyhb_dv_acc :
-     T_HVX_vmpyacc_WWR <"$dst.w += vdmpy($src1.h,$src2.b)">,
-     V6_vdmpyhb_dv_acc_enc;
-defm V6_vmpybus_acc :
-     T_HVX_vmpyacc_WVR <"$dst.h += vmpy($src1.ub,$src2.b)">,
-     V6_vmpybus_acc_enc;
-defm V6_vmpabus_acc :
-     T_HVX_vmpyacc_WWR <"$dst.h += vmpa($src1.ub,$src2.b)">,
-     V6_vmpabus_acc_enc;
-defm V6_vmpahb_acc :
-     T_HVX_vmpyacc_WWR <"$dst.w += vmpa($src1.h,$src2.b)">,
-     V6_vmpahb_acc_enc;
-defm V6_vmpyhsat_acc :
-     T_HVX_vmpyacc_WVR <"$dst.w += vmpy($src1.h,$src2.h):sat">,
-     V6_vmpyhsat_acc_enc;
-defm V6_vmpyuh_acc :
-     T_HVX_vmpyacc_WVR <"$dst.uw += vmpy($src1.uh,$src2.uh)">,
-     V6_vmpyuh_acc_enc;
-defm V6_vmpyiwb_acc :
-     T_HVX_vmpyacc_VVR <"$dst.w += vmpyi($src1.w,$src2.b)">,
-     V6_vmpyiwb_acc_enc;
-defm V6_vdsaduh_acc :
-     T_HVX_vmpyacc_WWR <"$dst.uw += vdsad($src1.uh,$src2.uh)">,
-     V6_vdsaduh_acc_enc;
-defm V6_vmpyihb_acc :
-     T_HVX_vmpyacc_VVR <"$dst.h += vmpyi($src1.h,$src2.b)">,
-     V6_vmpyihb_acc_enc;
-defm V6_vmpyub_acc :
-     T_HVX_vmpyacc_WVR <"$dst.uh += vmpy($src1.ub,$src2.ub)">,
-     V6_vmpyub_acc_enc;
-
-let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in {
-defm V6_vdmpyhsusat_acc :
-     T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.uh):sat">,
-     V6_vdmpyhsusat_acc_enc;
-defm V6_vdmpyhsat_acc :
-     T_HVX_vmpyacc_VVR <"$dst.w += vdmpy($src1.h,$src2.h):sat">,
-     V6_vdmpyhsat_acc_enc;
-defm V6_vmpyiwh_acc : T_HVX_vmpyacc_VVR
-     <"$dst.w += vmpyi($src1.w,$src2.h)">, V6_vmpyiwh_acc_enc;
-}
-
-let Itinerary = CVI_VS, Type = TypeCVI_VS in {
-defm V6_vaslw_acc :
-     T_HVX_vmpyacc_VVR <"$dst.w += vasl($src1.w,$src2)">, V6_vaslw_acc_enc;
-defm V6_vasrw_acc :
-     T_HVX_vmpyacc_VVR <"$dst.w += vasr($src1.w,$src2)">, V6_vasrw_acc_enc;
-}
-
-defm V6_vdmpyhvsat_acc :
-     T_HVX_vmpyacc_VVV <"$dst.w += vdmpy($src1.h,$src2.h):sat">,
-     V6_vdmpyhvsat_acc_enc;
-defm V6_vmpybusv_acc :
-     T_HVX_vmpyacc_WVV <"$dst.h += vmpy($src1.ub,$src2.b)">,
-     V6_vmpybusv_acc_enc;
-defm V6_vmpybv_acc :
-     T_HVX_vmpyacc_WVV <"$dst.h += vmpy($src1.b,$src2.b)">, V6_vmpybv_acc_enc;
-defm V6_vmpyhus_acc :
-     T_HVX_vmpyacc_WVV <"$dst.w += vmpy($src1.h,$src2.uh)">, V6_vmpyhus_acc_enc;
-defm V6_vmpyhv_acc :
-     T_HVX_vmpyacc_WVV <"$dst.w += vmpy($src1.h,$src2.h)">, V6_vmpyhv_acc_enc;
-defm V6_vmpyiewh_acc :
-     T_HVX_vmpyacc_VVV <"$dst.w += vmpyie($src1.w,$src2.h)">,
-     V6_vmpyiewh_acc_enc;
-defm V6_vmpyiewuh_acc :
-     T_HVX_vmpyacc_VVV <"$dst.w += vmpyie($src1.w,$src2.uh)">,
-     V6_vmpyiewuh_acc_enc;
-defm V6_vmpyih_acc :
-     T_HVX_vmpyacc_VVV <"$dst.h += vmpyi($src1.h,$src2.h)">, V6_vmpyih_acc_enc;
-defm V6_vmpyowh_rnd_sacc :
-     T_HVX_vmpyacc_VVV <"$dst.w += vmpyo($src1.w,$src2.h):<<1:rnd:sat:shift">,
-     V6_vmpyowh_rnd_sacc_enc;
-defm V6_vmpyowh_sacc :
-     T_HVX_vmpyacc_VVV <"$dst.w += vmpyo($src1.w,$src2.h):<<1:sat:shift">,
-     V6_vmpyowh_sacc_enc;
-defm V6_vmpyubv_acc :
-     T_HVX_vmpyacc_WVV <"$dst.uh += vmpy($src1.ub,$src2.ub)">,
-     V6_vmpyubv_acc_enc;
-defm V6_vmpyuhv_acc :
-     T_HVX_vmpyacc_WVV <"$dst.uw += vmpy($src1.uh,$src2.uh)">,
-     V6_vmpyuhv_acc_enc;
-defm V6_vrmpybusv_acc :
-     T_HVX_vmpyacc_VVV <"$dst.w += vrmpy($src1.ub,$src2.b)">,
-     V6_vrmpybusv_acc_enc;
-defm V6_vrmpybv_acc :
-     T_HVX_vmpyacc_VVV <"$dst.w += vrmpy($src1.b,$src2.b)">, V6_vrmpybv_acc_enc;
-defm V6_vrmpyubv_acc :
-     T_HVX_vmpyacc_VVV <"$dst.uw += vrmpy($src1.ub,$src2.ub)">,
-     V6_vrmpyubv_acc_enc;
-
-
-class T_HVX_vcmp <string asmString, RegisterClass RCout, RegisterClass RCin>
-  : CVI_VA_Resource1 <(outs RCout:$dst),
-                      (ins RCout:$_src_, RCin:$src1, RCin:$src2), asmString,
-                      [], "$dst = $_src_" > {
-  let Itinerary = CVI_VA;
-  let Type = TypeCVI_VA;
-}
-
-multiclass T_HVX_vcmp <string asmString> {
-  def NAME : T_HVX_vcmp <asmString, VecPredRegs, VectorRegs>;
-  let isCodeGenOnly = 1 in
-  def NAME#_128B : T_HVX_vcmp <asmString, VecPredRegs128B, VectorRegs128B>;
-}
-
-defm V6_veqb_and :
-     T_HVX_vcmp <"$dst &= vcmp.eq($src1.b,$src2.b)">, V6_veqb_and_enc;
-defm V6_veqh_and :
-     T_HVX_vcmp <"$dst &= vcmp.eq($src1.h,$src2.h)">, V6_veqh_and_enc;
-defm V6_veqw_and :
-     T_HVX_vcmp <"$dst &= vcmp.eq($src1.w,$src2.w)">, V6_veqw_and_enc;
-defm V6_vgtb_and :
-     T_HVX_vcmp <"$dst &= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_and_enc;
-defm V6_vgth_and :
-     T_HVX_vcmp <"$dst &= vcmp.gt($src1.h,$src2.h)">, V6_vgth_and_enc;
-defm V6_vgtw_and :
-     T_HVX_vcmp <"$dst &= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_and_enc;
-defm V6_vgtub_and :
-     T_HVX_vcmp <"$dst &= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_and_enc;
-defm V6_vgtuh_and :
-     T_HVX_vcmp <"$dst &= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_and_enc;
-defm V6_vgtuw_and :
-     T_HVX_vcmp <"$dst &= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_and_enc;
-defm V6_veqb_or :
-     T_HVX_vcmp <"$dst |= vcmp.eq($src1.b,$src2.b)">, V6_veqb_or_enc;
-defm V6_veqh_or :
-     T_HVX_vcmp <"$dst |= vcmp.eq($src1.h,$src2.h)">, V6_veqh_or_enc;
-defm V6_veqw_or :
-     T_HVX_vcmp <"$dst |= vcmp.eq($src1.w,$src2.w)">, V6_veqw_or_enc;
-defm V6_vgtb_or :
-     T_HVX_vcmp <"$dst |= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_or_enc;
-defm V6_vgth_or :
-     T_HVX_vcmp <"$dst |= vcmp.gt($src1.h,$src2.h)">, V6_vgth_or_enc;
-defm V6_vgtw_or :
-     T_HVX_vcmp <"$dst |= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_or_enc;
-defm V6_vgtub_or :
-     T_HVX_vcmp <"$dst |= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_or_enc;
-defm V6_vgtuh_or :
-     T_HVX_vcmp <"$dst |= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_or_enc;
-defm V6_vgtuw_or :
-     T_HVX_vcmp <"$dst |= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_or_enc;
-defm V6_veqb_xor :
-     T_HVX_vcmp <"$dst ^= vcmp.eq($src1.b,$src2.b)">, V6_veqb_xor_enc;
-defm V6_veqh_xor :
-     T_HVX_vcmp <"$dst ^= vcmp.eq($src1.h,$src2.h)">, V6_veqh_xor_enc;
-defm V6_veqw_xor :
-     T_HVX_vcmp <"$dst ^= vcmp.eq($src1.w,$src2.w)">, V6_veqw_xor_enc;
-defm V6_vgtb_xor :
-     T_HVX_vcmp <"$dst ^= vcmp.gt($src1.b,$src2.b)">, V6_vgtb_xor_enc;
-defm V6_vgth_xor :
-     T_HVX_vcmp <"$dst ^= vcmp.gt($src1.h,$src2.h)">, V6_vgth_xor_enc;
-defm V6_vgtw_xor :
-     T_HVX_vcmp <"$dst ^= vcmp.gt($src1.w,$src2.w)">, V6_vgtw_xor_enc;
-defm V6_vgtub_xor :
-     T_HVX_vcmp <"$dst ^= vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_xor_enc;
-defm V6_vgtuh_xor :
-     T_HVX_vcmp <"$dst ^= vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_xor_enc;
-defm V6_vgtuw_xor :
-     T_HVX_vcmp <"$dst ^= vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_xor_enc;
-
-defm V6_vminub :
-     T_HVX_alu_VV <"$dst.ub = vmin($src1.ub,$src2.ub)">, V6_vminub_enc;
-defm V6_vminuh :
-     T_HVX_alu_VV <"$dst.uh = vmin($src1.uh,$src2.uh)">, V6_vminuh_enc;
-defm V6_vminh :
-     T_HVX_alu_VV <"$dst.h = vmin($src1.h,$src2.h)">, V6_vminh_enc;
-defm V6_vminw :
-     T_HVX_alu_VV <"$dst.w = vmin($src1.w,$src2.w)">, V6_vminw_enc;
-defm V6_vmaxub :
-     T_HVX_alu_VV <"$dst.ub = vmax($src1.ub,$src2.ub)">, V6_vmaxub_enc;
-defm V6_vmaxuh :
-     T_HVX_alu_VV <"$dst.uh = vmax($src1.uh,$src2.uh)">, V6_vmaxuh_enc;
-defm V6_vmaxh :
-     T_HVX_alu_VV <"$dst.h = vmax($src1.h,$src2.h)">, V6_vmaxh_enc;
-defm V6_vmaxw :
-     T_HVX_alu_VV <"$dst.w = vmax($src1.w,$src2.w)">, V6_vmaxw_enc;
-defm V6_vshuffeb :
-     T_HVX_alu_VV <"$dst.b = vshuffe($src1.b,$src2.b)">, V6_vshuffeb_enc;
-defm V6_vshuffob :
-     T_HVX_alu_VV <"$dst.b = vshuffo($src1.b,$src2.b)">, V6_vshuffob_enc;
-defm V6_vshufeh :
-     T_HVX_alu_VV <"$dst.h = vshuffe($src1.h,$src2.h)">, V6_vshufeh_enc;
-defm V6_vshufoh :
-     T_HVX_alu_VV <"$dst.h = vshuffo($src1.h,$src2.h)">, V6_vshufoh_enc;
-
-let Itinerary = CVI_VX_DV, Type = TypeCVI_VX_DV in {
-defm V6_vmpyowh_rnd :
-     T_HVX_alu_VV <"$dst.w = vmpyo($src1.w,$src2.h):<<1:rnd:sat">,
-     V6_vmpyowh_rnd_enc;
-defm V6_vmpyiewuh :
-     T_HVX_alu_VV <"$dst.w = vmpyie($src1.w,$src2.uh)">, V6_vmpyiewuh_enc;
-defm V6_vmpyewuh :
-     T_HVX_alu_VV <"$dst.w = vmpye($src1.w,$src2.uh)">, V6_vmpyewuh_enc;
-defm V6_vmpyowh :
-     T_HVX_alu_VV <"$dst.w = vmpyo($src1.w,$src2.h):<<1:sat">, V6_vmpyowh_enc;
-defm V6_vmpyiowh :
-     T_HVX_alu_VV <"$dst.w = vmpyio($src1.w,$src2.h)">, V6_vmpyiowh_enc;
-}
-let Itinerary = CVI_VX, Type = TypeCVI_VX in
-defm V6_vmpyieoh :
-     T_HVX_alu_VV <"$dst.w = vmpyieo($src1.h,$src2.h)">, V6_vmpyieoh_enc;
-
-let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in {
-defm V6_vshufoeh :
-     T_HVX_alu_WV <"$dst.h = vshuffoe($src1.h,$src2.h)">, V6_vshufoeh_enc;
-defm V6_vshufoeb :
-     T_HVX_alu_WV <"$dst.b = vshuffoe($src1.b,$src2.b)">, V6_vshufoeb_enc;
-}
-
-let isRegSequence = 1, Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in
-defm V6_vcombine :
-     T_HVX_alu_WV <"$dst = vcombine($src1,$src2)">, V6_vcombine_enc;
-
-let Itinerary = CVI_VINLANESAT, Type = TypeCVI_VINLANESAT in {
-defm V6_vsathub :
-     T_HVX_alu_VV <"$dst.ub = vsat($src1.h,$src2.h)">, V6_vsathub_enc;
-defm V6_vsatwh :
-     T_HVX_alu_VV <"$dst.h = vsat($src1.w,$src2.w)">, V6_vsatwh_enc;
-}
-
-let Itinerary = CVI_VS, Type = TypeCVI_VS in {
-defm V6_vroundwh :
-     T_HVX_alu_VV <"$dst.h = vround($src1.w,$src2.w):sat">, V6_vroundwh_enc;
-defm V6_vroundwuh :
-     T_HVX_alu_VV <"$dst.uh = vround($src1.w,$src2.w):sat">, V6_vroundwuh_enc;
-defm V6_vroundhb :
-     T_HVX_alu_VV <"$dst.b = vround($src1.h,$src2.h):sat">, V6_vroundhb_enc;
-defm V6_vroundhub :
-     T_HVX_alu_VV <"$dst.ub = vround($src1.h,$src2.h):sat">, V6_vroundhub_enc;
-defm V6_vasrwv :
-     T_HVX_alu_VV <"$dst.w = vasr($src1.w,$src2.w)">, V6_vasrwv_enc;
-defm V6_vlsrwv :
-     T_HVX_alu_VV <"$dst.w = vlsr($src1.w,$src2.w)">, V6_vlsrwv_enc;
-defm V6_vlsrhv :
-     T_HVX_alu_VV <"$dst.h = vlsr($src1.h,$src2.h)">, V6_vlsrhv_enc;
-defm V6_vasrhv :
-     T_HVX_alu_VV <"$dst.h = vasr($src1.h,$src2.h)">, V6_vasrhv_enc;
-defm V6_vaslwv :
-     T_HVX_alu_VV <"$dst.w = vasl($src1.w,$src2.w)">, V6_vaslwv_enc;
-defm V6_vaslhv :
-     T_HVX_alu_VV <"$dst.h = vasl($src1.h,$src2.h)">, V6_vaslhv_enc;
-}
-
-defm V6_vaddb :
-     T_HVX_alu_VV <"$dst.b = vadd($src1.b,$src2.b)">, V6_vaddb_enc;
-defm V6_vaddh :
-     T_HVX_alu_VV <"$dst.h = vadd($src1.h,$src2.h)">, V6_vaddh_enc;
-
-let Itinerary = CVI_VP, Type = TypeCVI_VP in {
-defm V6_vdelta :
-     T_HVX_alu_VV <"$dst = vdelta($src1,$src2)">, V6_vdelta_enc;
-defm V6_vrdelta :
-     T_HVX_alu_VV <"$dst = vrdelta($src1,$src2)">, V6_vrdelta_enc;
-defm V6_vdealb4w :
-     T_HVX_alu_VV <"$dst.b = vdeale($src1.b,$src2.b)">, V6_vdealb4w_enc;
-defm V6_vpackeb :
-     T_HVX_alu_VV <"$dst.b = vpacke($src1.h,$src2.h)">, V6_vpackeb_enc;
-defm V6_vpackeh :
-     T_HVX_alu_VV <"$dst.h = vpacke($src1.w,$src2.w)">, V6_vpackeh_enc;
-defm V6_vpackhub_sat :
-     T_HVX_alu_VV <"$dst.ub = vpack($src1.h,$src2.h):sat">, V6_vpackhub_sat_enc;
-defm V6_vpackhb_sat :
-     T_HVX_alu_VV <"$dst.b = vpack($src1.h,$src2.h):sat">, V6_vpackhb_sat_enc;
-defm V6_vpackwuh_sat :
-     T_HVX_alu_VV <"$dst.uh = vpack($src1.w,$src2.w):sat">, V6_vpackwuh_sat_enc;
-defm V6_vpackwh_sat :
-     T_HVX_alu_VV <"$dst.h = vpack($src1.w,$src2.w):sat">, V6_vpackwh_sat_enc;
-defm V6_vpackob :
-     T_HVX_alu_VV <"$dst.b = vpacko($src1.h,$src2.h)">, V6_vpackob_enc;
-defm V6_vpackoh :
-     T_HVX_alu_VV <"$dst.h = vpacko($src1.w,$src2.w)">, V6_vpackoh_enc;
-}
-
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_HVX_condALU <string asmString, RegisterClass RC1, RegisterClass RC2>
-  : CVI_VA_Resource1 <(outs RC2:$dst),
-                      (ins RC1:$src1, RC2:$_src_, RC2:$src2), asmString,
-                      [], "$dst = $_src_" > {
-  let Itinerary = CVI_VA;
-  let Type = TypeCVI_VA;
-}
-
-multiclass T_HVX_condALU <string asmString> {
-  def NAME : T_HVX_condALU <asmString, VecPredRegs, VectorRegs>;
-  let isCodeGenOnly = 1 in
-  def NAME#_128B : T_HVX_condALU <asmString, VecPredRegs128B, VectorRegs128B>;
-}
-
-defm V6_vaddbq  : T_HVX_condALU <"if ($src1) $dst.b += $src2.b">,
-                  V6_vaddbq_enc;
-defm V6_vaddhq  : T_HVX_condALU <"if ($src1) $dst.h += $src2.h">,
-                  V6_vaddhq_enc;
-defm V6_vaddwq  : T_HVX_condALU <"if ($src1) $dst.w += $src2.w">,
-                  V6_vaddwq_enc;
-defm V6_vsubbq  : T_HVX_condALU <"if ($src1) $dst.b -= $src2.b">,
-                  V6_vsubbq_enc;
-defm V6_vsubhq  : T_HVX_condALU <"if ($src1) $dst.h -= $src2.h">,
-                  V6_vsubhq_enc;
-defm V6_vsubwq  : T_HVX_condALU <"if ($src1) $dst.w -= $src2.w">,
-                  V6_vsubwq_enc;
-defm V6_vaddbnq : T_HVX_condALU <"if (!$src1) $dst.b += $src2.b">,
-                  V6_vaddbnq_enc;
-defm V6_vaddhnq : T_HVX_condALU <"if (!$src1) $dst.h += $src2.h">,
-                  V6_vaddhnq_enc;
-defm V6_vaddwnq : T_HVX_condALU <"if (!$src1) $dst.w += $src2.w">,
-                  V6_vaddwnq_enc;
-defm V6_vsubbnq : T_HVX_condALU <"if (!$src1) $dst.b -= $src2.b">,
-                  V6_vsubbnq_enc;
-defm V6_vsubhnq : T_HVX_condALU <"if (!$src1) $dst.h -= $src2.h">,
-                  V6_vsubhnq_enc;
-defm V6_vsubwnq : T_HVX_condALU <"if (!$src1) $dst.w -= $src2.w">,
-                  V6_vsubwnq_enc;
-
-let hasNewValue = 1 in
-class T_HVX_alu_2op <string asmString, InstrItinClass itin,
-                 RegisterClass RCout, RegisterClass RCin>
-  : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1),
-    asmString >{
-  let Itinerary = itin;
-  let Type = !cast<IType>("Type"#itin);
-}
-
-multiclass T_HVX_alu_2op <string asmString, RegisterClass RCout,
-           RegisterClass RCin, InstrItinClass itin> {
-  def NAME : T_HVX_alu_2op <asmString, itin, RCout, RCin>;
-  let isCodeGenOnly = 1 in
-  def NAME#_128B : T_HVX_alu_2op <asmString, itin,
-                              !cast<RegisterClass>(RCout#"128B"),
-                              !cast<RegisterClass>(RCin#"128B")>;
-}
-
-let hasNewValue = 1 in
-multiclass T_HVX_alu_2op_VV <string asmString>:
-  T_HVX_alu_2op <asmString, VectorRegs, VectorRegs, CVI_VA>;
-
-multiclass T_HVX_alu_2op_WV <string asmString>:
-  T_HVX_alu_2op <asmString, VecDblRegs, VectorRegs, CVI_VA_DV>;
-
-
-defm V6_vabsh     : T_HVX_alu_2op_VV <"$dst.h = vabs($src1.h)">,
-                    V6_vabsh_enc;
-defm V6_vabsw     : T_HVX_alu_2op_VV <"$dst.w = vabs($src1.w)">,
-                    V6_vabsw_enc;
-defm V6_vabsh_sat : T_HVX_alu_2op_VV <"$dst.h = vabs($src1.h):sat">,
-                    V6_vabsh_sat_enc;
-defm V6_vabsw_sat : T_HVX_alu_2op_VV <"$dst.w = vabs($src1.w):sat">,
-                    V6_vabsw_sat_enc;
-defm V6_vnot      : T_HVX_alu_2op_VV <"$dst = vnot($src1)">,
-                    V6_vnot_enc;
-defm V6_vassign   : T_HVX_alu_2op_VV <"$dst = $src1">,
-                    V6_vassign_enc;
-
-defm V6_vzb       : T_HVX_alu_2op_WV <"$dst.uh = vzxt($src1.ub)">,
-                    V6_vzb_enc;
-defm V6_vzh       : T_HVX_alu_2op_WV <"$dst.uw = vzxt($src1.uh)">,
-                    V6_vzh_enc;
-defm V6_vsb       : T_HVX_alu_2op_WV <"$dst.h = vsxt($src1.b)">,
-                    V6_vsb_enc;
-defm V6_vsh       : T_HVX_alu_2op_WV <"$dst.w = vsxt($src1.h)">,
-                    V6_vsh_enc;
-
-let Itinerary = CVI_VP, Type = TypeCVI_VP in {
-defm V6_vdealh    : T_HVX_alu_2op_VV <"$dst.h = vdeal($src1.h)">,
-                    V6_vdealh_enc;
-defm V6_vdealb    : T_HVX_alu_2op_VV <"$dst.b = vdeal($src1.b)">,
-                    V6_vdealb_enc;
-defm V6_vshuffh   : T_HVX_alu_2op_VV <"$dst.h = vshuff($src1.h)">,
-                    V6_vshuffh_enc;
-defm V6_vshuffb   : T_HVX_alu_2op_VV <"$dst.b = vshuff($src1.b)">,
-                    V6_vshuffb_enc;
-}
-
-let Itinerary = CVI_VP_VS, Type = TypeCVI_VP_VS in {
-defm V6_vunpackub : T_HVX_alu_2op_WV <"$dst.uh = vunpack($src1.ub)">,
-                    V6_vunpackub_enc;
-defm V6_vunpackuh : T_HVX_alu_2op_WV <"$dst.uw = vunpack($src1.uh)">,
-                    V6_vunpackuh_enc;
-defm V6_vunpackb  : T_HVX_alu_2op_WV <"$dst.h = vunpack($src1.b)">,
-                    V6_vunpackb_enc;
-defm V6_vunpackh  : T_HVX_alu_2op_WV <"$dst.w = vunpack($src1.h)">,
-                    V6_vunpackh_enc;
-}
-
-let Itinerary = CVI_VS, Type = TypeCVI_VS in {
-defm V6_vcl0w     : T_HVX_alu_2op_VV <"$dst.uw = vcl0($src1.uw)">,
-                    V6_vcl0w_enc;
-defm V6_vcl0h     : T_HVX_alu_2op_VV <"$dst.uh = vcl0($src1.uh)">,
-                    V6_vcl0h_enc;
-defm V6_vnormamtw : T_HVX_alu_2op_VV <"$dst.w = vnormamt($src1.w)">,
-                    V6_vnormamtw_enc;
-defm V6_vnormamth : T_HVX_alu_2op_VV <"$dst.h = vnormamt($src1.h)">,
-                    V6_vnormamth_enc;
-defm V6_vpopcounth : T_HVX_alu_2op_VV <"$dst.h = vpopcount($src1.h)">,
-                     V6_vpopcounth_enc;
-}
-
-let isAccumulator = 1, hasNewValue = 1, Itinerary = CVI_VX_DV_LONG,
-    Type = TypeCVI_VX_DV in
-class T_HVX_vmpyacc2 <string asmString, RegisterClass RC>
-  : CVI_VA_Resource1 <(outs RC:$dst),
-                      (ins RC:$_src_, RC:$src1, IntRegs:$src2, u1_0Imm:$src3),
-    asmString, [], "$dst = $_src_" > ;
-
-
-multiclass T_HVX_vmpyacc2 <string asmString> {
-  def NAME : T_HVX_vmpyacc2 <asmString, VecDblRegs>;
-
-  let isCodeGenOnly = 1 in
-  def NAME#_128B : T_HVX_vmpyacc2 <asmString, VecDblRegs128B>;
-}
-
-defm V6_vrmpybusi_acc :
-     T_HVX_vmpyacc2<"$dst.w += vrmpy($src1.ub,$src2.b,#$src3)">,
-     V6_vrmpybusi_acc_enc;
-defm V6_vrsadubi_acc :
-     T_HVX_vmpyacc2<"$dst.uw += vrsad($src1.ub,$src2.ub,#$src3)">,
-     V6_vrsadubi_acc_enc;
-defm V6_vrmpyubi_acc :
-     T_HVX_vmpyacc2<"$dst.uw += vrmpy($src1.ub,$src2.ub,#$src3)">,
-     V6_vrmpyubi_acc_enc;
-
-
-let Itinerary = CVI_VX_DV_LONG, Type = TypeCVI_VX_DV, hasNewValue = 1 in
-class T_HVX_vmpy2 <string asmString, RegisterClass RC>
-  : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, IntRegs:$src2, u1_0Imm:$src3),
-    asmString>;
-
-
-multiclass T_HVX_vmpy2 <string asmString> {
-  def NAME : T_HVX_vmpy2 <asmString, VecDblRegs>;
-
-  let isCodeGenOnly = 1 in
-  def NAME#_128B : T_HVX_vmpy2 <asmString, VecDblRegs128B>;
-}
-
-defm V6_vrmpybusi :
-     T_HVX_vmpy2 <"$dst.w = vrmpy($src1.ub,$src2.b,#$src3)">, V6_vrmpybusi_enc;
-defm V6_vrsadubi :
-     T_HVX_vmpy2 <"$dst.uw = vrsad($src1.ub,$src2.ub,#$src3)">, V6_vrsadubi_enc;
-defm V6_vrmpyubi :
-     T_HVX_vmpy2 <"$dst.uw = vrmpy($src1.ub,$src2.ub,#$src3)">, V6_vrmpyubi_enc;
-
-
-let Itinerary = CVI_VP_VS_LONG_EARLY, Type = TypeCVI_VP_VS,
-    hasSideEffects = 0, hasNewValue2 = 1, opNewValue2 = 1 in
-class T_HVX_perm <string asmString, RegisterClass RC>
-  : CVI_VA_Resource1 <(outs RC:$_dst1_, RC:$_dst2_),
-                      (ins RC:$src1, RC:$src2, IntRegs:$src3),
-    asmString, [], "$_dst1_ = $src1, $_dst2_ = $src2" >;
-
-multiclass T_HVX_perm <string asmString> {
-  def NAME : T_HVX_perm <asmString, VectorRegs>;
-
-  let isCodeGenOnly = 1 in
-  def NAME#_128B : T_HVX_perm <asmString, VectorRegs128B>;
-}
-
-let hasNewValue = 1, opNewValue = 0, hasNewValue2 = 1, opNewValue2 = 1 in {
-  defm V6_vshuff : T_HVX_perm <"vshuff($src1,$src2,$src3)">, V6_vshuff_enc;
-  defm V6_vdeal : T_HVX_perm <"vdeal($src1,$src2,$src3)">, V6_vdeal_enc;
-}
-
-// Conditional vector move.
-let isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-class T_HVX_cmov <bit isPredNot, RegisterClass RC>
-  : CVI_VA_Resource1 <(outs RC:$dst), (ins PredRegs:$src1, RC:$src2),
-    "if ("#!if(isPredNot, "!", "")#"$src1) $dst = $src2"> {
-  let isPredicatedFalse = isPredNot;
-}
-
-multiclass T_HVX_cmov <bit isPredNot = 0> {
-  def NAME : T_HVX_cmov <isPredNot, VectorRegs>;
-
-  let isCodeGenOnly = 1 in
-  def NAME#_128B : T_HVX_cmov <isPredNot, VectorRegs128B>;
-}
-
-defm V6_vcmov : T_HVX_cmov, V6_vcmov_enc;
-defm V6_vncmov : T_HVX_cmov<1>, V6_vncmov_enc;
-
-// Conditional vector combine.
-let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV, isPredicated = 1,
-    hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-class T_HVX_ccombine <bit isPredNot, RegisterClass RCout, RegisterClass RCin>
-  : CVI_VA_Resource1 < (outs RCout:$dst),
-    (ins PredRegs:$src1, RCin:$src2, RCin:$src3),
-    "if ("#!if(isPredNot, "!", "")#"$src1) $dst = vcombine($src2,$src3)"> {
-  let isPredicatedFalse = isPredNot;
-}
-
-multiclass T_HVX_ccombine <bit isPredNot = 0> {
-  def NAME : T_HVX_ccombine <isPredNot, VecDblRegs, VectorRegs>;
-
-  let isCodeGenOnly = 1 in
-  def NAME#_128B : T_HVX_ccombine <isPredNot, VecDblRegs128B, VectorRegs128B>;
-}
-
-defm V6_vccombine : T_HVX_ccombine, V6_vccombine_enc;
-defm V6_vnccombine : T_HVX_ccombine<1>, V6_vnccombine_enc;
-
-let hasNewValue = 1 in
-class T_HVX_shift <string asmString, RegisterClass RCout, RegisterClass RCin>
-  : CVI_VX_DV_Resource1<(outs RCout:$dst),
-    (ins RCin:$src1, RCin:$src2, IntRegsLow8:$src3),
-    asmString >;
-
-multiclass T_HVX_shift <string asmString, RegisterClass RCout,
-                        RegisterClass RCin> {
-  def NAME : T_HVX_shift <asmString, RCout, RCin>;
-  let isCodeGenOnly = 1 in
-  def NAME#_128B : T_HVX_shift <asmString, !cast<RegisterClass>(RCout#"128B"),
-                                           !cast<RegisterClass>(RCin#"128B")>;
-}
-
-multiclass T_HVX_shift_VV <string asmString>:
-  T_HVX_shift <asmString, VectorRegs, VectorRegs>;
-
-multiclass T_HVX_shift_WV <string asmString>:
-  T_HVX_shift <asmString, VecDblRegs, VectorRegs>;
-
-let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP in {
-defm V6_valignb :
-     T_HVX_shift_VV <"$dst = valign($src1,$src2,$src3)">, V6_valignb_enc;
-defm V6_vlalignb :
-     T_HVX_shift_VV <"$dst = vlalign($src1,$src2,$src3)">, V6_vlalignb_enc;
-}
-
-let Itinerary = CVI_VS, Type = TypeCVI_VS in {
-defm V6_vasrwh :
-     T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3)">, V6_vasrwh_enc;
-defm V6_vasrwhsat :
-     T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3):sat">,
-     V6_vasrwhsat_enc;
-defm V6_vasrwhrndsat :
-     T_HVX_shift_VV <"$dst.h = vasr($src1.w,$src2.w,$src3):rnd:sat">,
-     V6_vasrwhrndsat_enc;
-defm V6_vasrwuhsat :
-     T_HVX_shift_VV <"$dst.uh = vasr($src1.w,$src2.w,$src3):sat">,
-     V6_vasrwuhsat_enc;
-defm V6_vasrhubsat :
-     T_HVX_shift_VV <"$dst.ub = vasr($src1.h,$src2.h,$src3):sat">,
-     V6_vasrhubsat_enc;
-defm V6_vasrhubrndsat :
-     T_HVX_shift_VV <"$dst.ub = vasr($src1.h,$src2.h,$src3):rnd:sat">,
-     V6_vasrhubrndsat_enc;
-defm V6_vasrhbrndsat :
-     T_HVX_shift_VV <"$dst.b = vasr($src1.h,$src2.h,$src3):rnd:sat">,
-     V6_vasrhbrndsat_enc;
-}
-
-// Assembler mapped -- alias?
-//defm V6_vtran2x2vdd : T_HVX_shift_VV <"">, V6_vtran2x2vdd_enc;
-let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in {
-defm V6_vshuffvdd :
-     T_HVX_shift_WV <"$dst = vshuff($src1,$src2,$src3)">, V6_vshuffvdd_enc;
-defm V6_vdealvdd :
-     T_HVX_shift_WV <"$dst = vdeal($src1,$src2,$src3)">, V6_vdealvdd_enc;
-}
-
-let hasNewValue = 1, Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS in
-class T_HVX_unpack <string asmString, RegisterClass RCout, RegisterClass RCin>
-  : CVI_VX_DV_Resource1<(outs RCout:$dst), (ins RCout:$_src_, RCin:$src1),
-    asmString, [], "$dst = $_src_">;
-
-multiclass T_HVX_unpack <string asmString> {
-  def NAME : T_HVX_unpack <asmString, VecDblRegs, VectorRegs>;
-  let isCodeGenOnly = 1 in
-  def NAME#_128B : T_HVX_unpack <asmString, VecDblRegs128B, VectorRegs128B>;
-}
-
-defm V6_vunpackob : T_HVX_unpack <"$dst.h |= vunpacko($src1.b)">, V6_vunpackob_enc;
-defm V6_vunpackoh : T_HVX_unpack <"$dst.w |= vunpacko($src1.h)">, V6_vunpackoh_enc;
-
-let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP, hasNewValue = 1,
-    hasSideEffects = 0 in
-class T_HVX_valign <string asmString, RegisterClass RC>
-  : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, RC:$src2, u3_0Imm:$src3),
-    asmString>;
-
-multiclass T_HVX_valign <string asmString> {
-  def NAME : T_HVX_valign <asmString, VectorRegs>;
-
-  let isCodeGenOnly = 1 in
-  def NAME#_128B : T_HVX_valign <asmString, VectorRegs128B>;
-}
-
-defm V6_valignbi :
-     T_HVX_valign <"$dst = valign($src1,$src2,#$src3)">, V6_valignbi_enc;
-defm V6_vlalignbi :
-     T_HVX_valign <"$dst = vlalign($src1,$src2,#$src3)">, V6_vlalignbi_enc;
-
-let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV in
-class T_HVX_predAlu <string asmString, RegisterClass RC>
-  : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1, RC:$src2),
-    asmString>;
-
-multiclass T_HVX_predAlu <string asmString> {
-  def NAME : T_HVX_predAlu <asmString, VecPredRegs>;
-
-  let isCodeGenOnly = 1 in
-  def NAME#_128B : T_HVX_predAlu <asmString, VecPredRegs128B>;
-}
-
-defm V6_pred_and  : T_HVX_predAlu <"$dst = and($src1,$src2)">, V6_pred_and_enc;
-defm V6_pred_or   : T_HVX_predAlu <"$dst = or($src1,$src2)">, V6_pred_or_enc;
-defm V6_pred_xor  : T_HVX_predAlu <"$dst = xor($src1,$src2)">, V6_pred_xor_enc;
-defm V6_pred_or_n : T_HVX_predAlu <"$dst = or($src1,!$src2)">, V6_pred_or_n_enc;
-defm V6_pred_and_n :
-     T_HVX_predAlu <"$dst = and($src1,!$src2)">, V6_pred_and_n_enc;
-
-let Itinerary = CVI_VA, Type = TypeCVI_VA in
-class T_HVX_prednot <RegisterClass RC>
-  : CVI_VA_Resource1<(outs RC:$dst), (ins RC:$src1),
-    "$dst = not($src1)">, V6_pred_not_enc;
-
-def V6_pred_not : T_HVX_prednot <VecPredRegs>;
-let isCodeGenOnly =  1 in
-def V6_pred_not_128B : T_HVX_prednot <VecPredRegs128B>;
-
-let Itinerary = CVI_VA, Type = TypeCVI_VA in
-class T_HVX_vcmp2 <string asmString, RegisterClass RCout, RegisterClass RCin>
-  : CVI_VA_Resource1 <(outs RCout:$dst), (ins RCin:$src1, RCin:$src2),
-    asmString >;
-
-multiclass T_HVX_vcmp2 <string asmString> {
-  def NAME : T_HVX_vcmp2 <asmString, VecPredRegs, VectorRegs>;
-  let isCodeGenOnly = 1 in
-  def NAME#_128B : T_HVX_vcmp2 <asmString, VecPredRegs128B, VectorRegs128B>;
-}
-
-defm V6_veqb : T_HVX_vcmp2  <"$dst = vcmp.eq($src1.b,$src2.b)">, V6_veqb_enc;
-defm V6_veqh : T_HVX_vcmp2  <"$dst = vcmp.eq($src1.h,$src2.h)">, V6_veqh_enc;
-defm V6_veqw : T_HVX_vcmp2  <"$dst = vcmp.eq($src1.w,$src2.w)">, V6_veqw_enc;
-defm V6_vgtb : T_HVX_vcmp2  <"$dst = vcmp.gt($src1.b,$src2.b)">, V6_vgtb_enc;
-defm V6_vgth : T_HVX_vcmp2  <"$dst = vcmp.gt($src1.h,$src2.h)">, V6_vgth_enc;
-defm V6_vgtw : T_HVX_vcmp2  <"$dst = vcmp.gt($src1.w,$src2.w)">, V6_vgtw_enc;
-defm V6_vgtub : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.ub,$src2.ub)">, V6_vgtub_enc;
-defm V6_vgtuh : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.uh,$src2.uh)">, V6_vgtuh_enc;
-defm V6_vgtuw : T_HVX_vcmp2 <"$dst = vcmp.gt($src1.uw,$src2.uw)">, V6_vgtuw_enc;
-
-let isAccumulator = 1, hasNewValue = 1, hasSideEffects = 0 in
-class T_V6_vandqrt_acc <RegisterClass RCout, RegisterClass RCin>
-  : CVI_VX_Resource_late<(outs RCout:$dst),
-    (ins RCout:$_src_, RCin:$src1, IntRegs:$src2),
-    "$dst |= vand($src1,$src2)", [], "$dst = $_src_">, V6_vandqrt_acc_enc;
-
-def V6_vandqrt_acc : T_V6_vandqrt_acc <VectorRegs, VecPredRegs>;
-let isCodeGenOnly = 1 in
-def V6_vandqrt_acc_128B : T_V6_vandqrt_acc <VectorRegs128B, VecPredRegs128B>;
-
-let isAccumulator = 1 in
-class T_V6_vandvrt_acc <RegisterClass RCout, RegisterClass RCin>
-  : CVI_VX_Resource_late<(outs RCout:$dst),
-    (ins RCout:$_src_, RCin:$src1, IntRegs:$src2),
-    "$dst |= vand($src1,$src2)", [], "$dst = $_src_">, V6_vandvrt_acc_enc;
-
-def V6_vandvrt_acc : T_V6_vandvrt_acc <VecPredRegs, VectorRegs>;
-let isCodeGenOnly = 1 in
-def V6_vandvrt_acc_128B : T_V6_vandvrt_acc <VecPredRegs128B, VectorRegs128B>;
-
-let hasNewValue =  1, hasSideEffects = 0 in
-class T_V6_vandqrt <RegisterClass RCout, RegisterClass RCin>
-  : CVI_VX_Resource_late<(outs RCout:$dst),
-    (ins RCin:$src1, IntRegs:$src2),
-    "$dst = vand($src1,$src2)" >, V6_vandqrt_enc;
-
-def V6_vandqrt : T_V6_vandqrt <VectorRegs, VecPredRegs>;
-let isCodeGenOnly = 1 in
-def V6_vandqrt_128B : T_V6_vandqrt <VectorRegs128B, VecPredRegs128B>;
-
-let hasNewValue = 1, hasSideEffects = 0 in
-class T_V6_lvsplatw <RegisterClass RC>
-  : CVI_VX_Resource_late<(outs RC:$dst), (ins IntRegs:$src1),
-    "$dst = vsplat($src1)" >, V6_lvsplatw_enc;
-
-def V6_lvsplatw : T_V6_lvsplatw <VectorRegs>;
-let isCodeGenOnly = 1 in
-def V6_lvsplatw_128B : T_V6_lvsplatw <VectorRegs128B>;
-
-
-let hasNewValue = 1 in
-class T_V6_vinsertwr <RegisterClass RC>
-  : CVI_VX_Resource_late<(outs RC:$dst), (ins RC:$_src_, IntRegs:$src1),
-    "$dst.w = vinsert($src1)", [], "$dst = $_src_">,
-    V6_vinsertwr_enc;
-
-def V6_vinsertwr : T_V6_vinsertwr <VectorRegs>;
-let isCodeGenOnly = 1 in
-def V6_vinsertwr_128B : T_V6_vinsertwr <VectorRegs128B>;
-
-
-let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP in
-class T_V6_pred_scalar2 <RegisterClass RC>
-  : CVI_VA_Resource1<(outs RC:$dst), (ins IntRegs:$src1),
-    "$dst = vsetq($src1)">, V6_pred_scalar2_enc;
-
-def V6_pred_scalar2 : T_V6_pred_scalar2 <VecPredRegs>;
-let isCodeGenOnly = 1 in
-def V6_pred_scalar2_128B : T_V6_pred_scalar2 <VecPredRegs128B>;
-
-class T_V6_vandvrt <RegisterClass RCout, RegisterClass RCin>
-  : CVI_VX_Resource_late<(outs RCout:$dst), (ins RCin:$src1, IntRegs:$src2),
-    "$dst = vand($src1,$src2)">, V6_vandvrt_enc;
-
-def V6_vandvrt : T_V6_vandvrt <VecPredRegs, VectorRegs>;
-let isCodeGenOnly = 1 in
-def V6_vandvrt_128B : T_V6_vandvrt <VecPredRegs128B, VectorRegs128B>;
-
-let validSubTargets = HasV60SubT in
-class T_HVX_rol <string asmString, RegisterClass RC, Operand ImmOp >
-  : SInst2 <(outs RC:$dst), (ins  RC:$src1, ImmOp:$src2), asmString>;
-
-class T_HVX_rol_R <string asmString>
-  : T_HVX_rol <asmString, IntRegs, u5_0Imm>;
-class T_HVX_rol_P <string asmString>
-  : T_HVX_rol <asmString, DoubleRegs, u6_0Imm>;
-
-def S6_rol_i_p : T_HVX_rol_P <"$dst = rol($src1,#$src2)">, S6_rol_i_p_enc;
-let hasNewValue = 1, opNewValue = 0 in
-def S6_rol_i_r : T_HVX_rol_R <"$dst = rol($src1,#$src2)">, S6_rol_i_r_enc;
-
-let validSubTargets = HasV60SubT in
-class T_HVX_rol_acc <string asmString, RegisterClass RC, Operand ImmOp>
-  : SInst2 <(outs RC:$dst), (ins RC:$_src_, RC:$src1, ImmOp:$src2),
-    asmString, [], "$dst = $_src_" >;
-
-class T_HVX_rol_acc_P <string asmString>
-  : T_HVX_rol_acc <asmString, DoubleRegs, u6_0Imm>;
-
-class T_HVX_rol_acc_R <string asmString>
-  : T_HVX_rol_acc <asmString, IntRegs, u5_0Imm>;
-
-def S6_rol_i_p_nac :
-    T_HVX_rol_acc_P <"$dst -= rol($src1,#$src2)">, S6_rol_i_p_nac_enc;
-def S6_rol_i_p_acc :
-    T_HVX_rol_acc_P <"$dst += rol($src1,#$src2)">, S6_rol_i_p_acc_enc;
-def S6_rol_i_p_and :
-    T_HVX_rol_acc_P <"$dst &= rol($src1,#$src2)">, S6_rol_i_p_and_enc;
-def S6_rol_i_p_or  :
-    T_HVX_rol_acc_P <"$dst |= rol($src1,#$src2)">, S6_rol_i_p_or_enc;
-def S6_rol_i_p_xacc :
-    T_HVX_rol_acc_P<"$dst ^= rol($src1,#$src2)">, S6_rol_i_p_xacc_enc;
-
-let hasNewValue = 1, opNewValue = 0 in {
-def S6_rol_i_r_nac :
-    T_HVX_rol_acc_R <"$dst -= rol($src1,#$src2)">, S6_rol_i_r_nac_enc;
-def S6_rol_i_r_acc :
-    T_HVX_rol_acc_R <"$dst += rol($src1,#$src2)">, S6_rol_i_r_acc_enc;
-def S6_rol_i_r_and :
-    T_HVX_rol_acc_R <"$dst &= rol($src1,#$src2)">, S6_rol_i_r_and_enc;
-def S6_rol_i_r_or :
-    T_HVX_rol_acc_R <"$dst |= rol($src1,#$src2)">, S6_rol_i_r_or_enc;
-def S6_rol_i_r_xacc :
-    T_HVX_rol_acc_R <"$dst ^= rol($src1,#$src2)">, S6_rol_i_r_xacc_enc;
-}
-
-let isSolo = 1, Itinerary = LD_tc_ld_SLOT0, Type = TypeLD in
-class T_V6_extractw <RegisterClass RC>
-  : LD1Inst <(outs IntRegs:$dst), (ins RC:$src1, IntRegs:$src2),
-    "$dst = vextract($src1,$src2)">, V6_extractw_enc;
-
-def V6_extractw : T_V6_extractw <VectorRegs>;
-let isCodeGenOnly = 1 in
-def V6_extractw_128B : T_V6_extractw <VectorRegs128B>;
-
-let Itinerary = ST_tc_st_SLOT0, validSubTargets = HasV55SubT  in
-class T_sys0op <string asmString>
-  : ST1Inst <(outs), (ins), asmString>;
-
-let isSolo = 1, validSubTargets = HasV55SubT in {
-def Y5_l2gunlock   : T_sys0op <"l2gunlock">, Y5_l2gunlock_enc;
-def Y5_l2gclean    : T_sys0op <"l2gclean">, Y5_l2gclean_enc;
-def Y5_l2gcleaninv : T_sys0op <"l2gcleaninv">, Y5_l2gcleaninv_enc;
-}
-
-class T_sys1op <string asmString, RegisterClass RC>
-  : ST1Inst <(outs), (ins RC:$src1), asmString>;
-
-class T_sys1op_R <string asmString> : T_sys1op <asmString, IntRegs>;
-class T_sys1op_P <string asmString> : T_sys1op <asmString, DoubleRegs>;
-
-let isSoloAX = 1, validSubTargets = HasV55SubT in
-def Y5_l2unlocka     : T_sys1op_R <"l2unlocka($src1)">, Y5_l2unlocka_enc;
-
-let isSolo = 1, validSubTargets = HasV60SubT in {
-def Y6_l2gcleanpa    : T_sys1op_P <"l2gclean($src1)">, Y6_l2gcleanpa_enc;
-def Y6_l2gcleaninvpa : T_sys1op_P <"l2gcleaninv($src1)">, Y6_l2gcleaninvpa_enc;
-}
-
-let Itinerary = ST_tc_3stall_SLOT0, isPredicateLate = 1, isSoloAX = 1,
-    validSubTargets = HasV55SubT in
-def Y5_l2locka : ST1Inst <(outs PredRegs:$dst), (ins IntRegs:$src1),
-  "$dst = l2locka($src1)">, Y5_l2locka_enc;
-
-// not defined on etc side. why?
-// defm S2_cabacencbin : _VV <"Rdd=encbin(Rss,$src2,Pu)">, S2_cabacencbin_enc;
-
-let Defs = [USR_OVF], Itinerary = M_tc_3stall_SLOT23, isPredicateLate = 1,
-    hasSideEffects = 0,
-validSubTargets = HasV55SubT in
-def A5_ACS : MInst2 <(outs DoubleRegs:$dst1, PredRegs:$dst2),
-  (ins DoubleRegs:$_src_, DoubleRegs:$src1, DoubleRegs:$src2),
-  "$dst1,$dst2 = vacsh($src1,$src2)", [],
-  "$dst1 = $_src_" >, Requires<[HasV55T]>, A5_ACS_enc;
-
-let Itinerary = CVI_VA_DV, Type = TypeCVI_VA_DV, hasNewValue = 1,
-    hasSideEffects = 0 in
-class T_HVX_alu2 <string asmString, RegisterClass RCout, RegisterClass RCin1,
-                  RegisterClass RCin2>
-  : CVI_VA_Resource1<(outs RCout:$dst),
-    (ins RCin1:$src1, RCin2:$src2, RCin2:$src3), asmString>;
-
-multiclass T_HVX_alu2 <string asmString, RegisterClass RC > {
-  def NAME : T_HVX_alu2 <asmString, RC, VecPredRegs, VectorRegs>;
-  let isCodeGenOnly = 1 in
-  def NAME#_128B : T_HVX_alu2 <asmString, !cast<RegisterClass>(RC#"128B"),
-                               VecPredRegs128B, VectorRegs128B>;
-}
-
-multiclass T_HVX_alu2_V <string asmString> :
-  T_HVX_alu2 <asmString, VectorRegs>;
-
-multiclass T_HVX_alu2_W <string asmString> :
-  T_HVX_alu2 <asmString, VecDblRegs>;
-
-defm V6_vswap : T_HVX_alu2_W <"$dst = vswap($src1,$src2,$src3)">, V6_vswap_enc;
-
-let Itinerary = CVI_VA, Type = TypeCVI_VA, hasNewValue = 1,
-    hasSideEffects = 0 in
-defm V6_vmux  : T_HVX_alu2_V <"$dst = vmux($src1,$src2,$src3)">, V6_vmux_enc;
-
-class T_HVX_vlutb <string asmString, RegisterClass RCout, RegisterClass RCin>
-  : CVI_VA_Resource1<(outs RCout:$dst),
-    (ins RCin:$src1, RCin:$src2, IntRegsLow8:$src3), asmString>;
-
-multiclass T_HVX_vlutb <string asmString, RegisterClass RCout,
-                        RegisterClass RCin> {
-  def NAME : T_HVX_vlutb <asmString, RCout, RCin>;
-  let isCodeGenOnly = 1 in
-  def NAME#_128B : T_HVX_vlutb <asmString, !cast<RegisterClass>(RCout#"128B"),
-                                           !cast<RegisterClass>(RCin#"128B")>;
-}
-
-multiclass T_HVX_vlutb_V <string asmString> :
-  T_HVX_vlutb <asmString, VectorRegs, VectorRegs>;
-
-multiclass T_HVX_vlutb_W <string asmString> :
-  T_HVX_vlutb <asmString, VecDblRegs, VectorRegs>;
-
-let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS, isAccumulator = 1 in
-class T_HVX_vlutb_acc <string asmString, RegisterClass RCout,
-                       RegisterClass RCin>
-  : CVI_VA_Resource1<(outs RCout:$dst),
-    (ins RCout:$_src_, RCin:$src1, RCin:$src2, IntRegsLow8:$src3),
-    asmString, [], "$dst = $_src_">;
-
-multiclass T_HVX_vlutb_acc <string asmString, RegisterClass RCout,
-                            RegisterClass RCin> {
-  def NAME : T_HVX_vlutb_acc <asmString, RCout, RCin>;
-  let isCodeGenOnly = 1 in
-  def NAME#_128B : T_HVX_vlutb_acc<asmString,
-                                   !cast<RegisterClass>(RCout#"128B"),
-                                   !cast<RegisterClass>(RCin#"128B")>;
-}
-
-multiclass T_HVX_vlutb_acc_V <string asmString> :
-  T_HVX_vlutb_acc <asmString, VectorRegs, VectorRegs>;
-
-multiclass T_HVX_vlutb_acc_W <string asmString> :
-  T_HVX_vlutb_acc <asmString, VecDblRegs, VectorRegs>;
-
-
-let Itinerary = CVI_VP_LONG, Type = TypeCVI_VP, hasNewValue = 1 in
-defm V6_vlutvvb:
-     T_HVX_vlutb_V <"$dst.b = vlut32($src1.b,$src2.b,$src3)">, V6_vlutvvb_enc;
-
-let Itinerary = CVI_VP_VS_LONG, Type = TypeCVI_VP_VS, hasNewValue = 1 in
-defm V6_vlutvwh:
-     T_HVX_vlutb_W <"$dst.h = vlut16($src1.b,$src2.h,$src3)">, V6_vlutvwh_enc;
-
-let hasNewValue = 1 in {
-  defm V6_vlutvvb_oracc:
-       T_HVX_vlutb_acc_V <"$dst.b |= vlut32($src1.b,$src2.b,$src3)">,
-       V6_vlutvvb_oracc_enc;
-  defm V6_vlutvwh_oracc:
-       T_HVX_vlutb_acc_W <"$dst.h |= vlut16($src1.b,$src2.h,$src3)">,
-       V6_vlutvwh_oracc_enc;
-}
-
-// It's a fake instruction and should not be defined?
-def S2_cabacencbin
-  : SInst2<(outs DoubleRegs:$dst),
-          (ins DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
-    "$dst = encbin($src1,$src2,$src3)">, S2_cabacencbin_enc;
-
-// Vhist instructions
-def V6_vhistq
-  : CVI_HIST_Resource1 <(outs), (ins VecPredRegs:$src1),
-    "vhist($src1)">, V6_vhistq_enc;
-
-def V6_vhist
-  : CVI_HIST_Resource1 <(outs), (ins),
-    "vhist" >, V6_vhist_enc;
-
-
-let isPseudo = 1, isCodeGenOnly = 1, hasSideEffects = 0 in {
-  def V6_vd0: CVI_VA_Resource<(outs VectorRegs:$dst), (ins), "$dst = #0", []>;
-  def V6_vd0_128B: CVI_VA_Resource<(outs VectorRegs128B:$dst), (ins),
-      "$dst = #0", []>;
-
-  def V6_vassignp: CVI_VA_Resource<(outs VecDblRegs:$dst),
-      (ins VecDblRegs:$src), "", []>;
-  def V6_vassignp_128B : CVI_VA_Resource<(outs VecDblRegs128B:$dst),
-      (ins VecDblRegs128B:$src), "", []>;
-
-  def V6_lo: CVI_VA_Resource<(outs VectorRegs:$dst), (ins VecDblRegs:$src1),
-      "", []>;
-  def V6_lo_128B: CVI_VA_Resource<(outs VectorRegs128B:$dst),
-      (ins VecDblRegs128B:$src1), "", []>;
-
-  def V6_hi: CVI_VA_Resource<(outs VectorRegs:$dst), (ins VecDblRegs:$src1),
-      "", []>;
-  def V6_hi_128B: CVI_VA_Resource<(outs VectorRegs128B:$dst),
-      (ins VecDblRegs128B:$src1), "", []>;
-}
diff --git a/lib/Target/Hexagon/HexagonInstrInfoVector.td b/lib/Target/Hexagon/HexagonInstrInfoVector.td
deleted file mode 100644
index e3520bd6e5157be6620a338c4aeacb15deef4766..0000000000000000000000000000000000000000
--- a/lib/Target/Hexagon/HexagonInstrInfoVector.td
+++ /dev/null
@@ -1,69 +0,0 @@
-//===- HexagonInstrInfoVector.td - Hexagon Vector Patterns -*- tablegen -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon Vector instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-// Vector shift support. Vector shifting in Hexagon is rather different
-// from internal representation of LLVM.
-// LLVM assumes all shifts (in vector case) will have the form
-// <VT> = SHL/SRA/SRL <VT> by <VT>
-// while Hexagon has the following format:
-// <VT> = SHL/SRA/SRL <VT> by <IT/i32>
-// As a result, special care is needed to guarantee correctness and
-// performance.
-class vshift_v4i16<SDNode Op, string Str, bits<3>MajOp, bits<3>MinOp>
-  : S_2OpInstImm<Str, MajOp, MinOp, u4_0Imm, []> {
-  bits<4> src2;
-  let Inst{11-8} = src2;
-}
-
-class vshift_v2i32<SDNode Op, string Str, bits<3>MajOp, bits<3>MinOp>
-  : S_2OpInstImm<Str, MajOp, MinOp, u5_0Imm, []> {
-  bits<5> src2;
-  let Inst{12-8} = src2;
-}
-
-def S2_asr_i_vw : vshift_v2i32<sra, "vasrw", 0b010, 0b000>;
-def S2_lsr_i_vw : vshift_v2i32<srl, "vlsrw", 0b010, 0b001>;
-def S2_asl_i_vw : vshift_v2i32<shl, "vaslw", 0b010, 0b010>;
-
-def S2_asr_i_vh : vshift_v4i16<sra, "vasrh", 0b100, 0b000>;
-def S2_lsr_i_vh : vshift_v4i16<srl, "vlsrh", 0b100, 0b001>;
-def S2_asl_i_vh : vshift_v4i16<shl, "vaslh", 0b100, 0b010>;
-
-// Vector shift words by register
-def S2_asr_r_vw : T_S3op_shiftVect < "vasrw", 0b00, 0b00>;
-def S2_lsr_r_vw : T_S3op_shiftVect < "vlsrw", 0b00, 0b01>;
-def S2_asl_r_vw : T_S3op_shiftVect < "vaslw", 0b00, 0b10>;
-def S2_lsl_r_vw : T_S3op_shiftVect < "vlslw", 0b00, 0b11>;
-
-// Vector shift halfwords by register
-def S2_asr_r_vh : T_S3op_shiftVect < "vasrh", 0b01, 0b00>;
-def S2_lsr_r_vh : T_S3op_shiftVect < "vlsrh", 0b01, 0b01>;
-def S2_asl_r_vh : T_S3op_shiftVect < "vaslh", 0b01, 0b10>;
-def S2_lsl_r_vh : T_S3op_shiftVect < "vlslh", 0b01, 0b11>;
-
-
-// Hexagon doesn't have a vector multiply with C semantics.
-// Instead, generate a pseudo instruction that gets expaneded into two
-// scalar MPYI instructions.
-// This is expanded by ExpandPostRAPseudos.
-let isPseudo = 1 in
-def PS_vmulw : PseudoM<(outs DoubleRegs:$Rd),
-      (ins DoubleRegs:$Rs, DoubleRegs:$Rt), "", []>;
-
-let isPseudo = 1 in
-def PS_vmulw_acc : PseudoM<(outs DoubleRegs:$Rd),
-      (ins DoubleRegs:$Rx, DoubleRegs:$Rs, DoubleRegs:$Rt), "", [],
-      "$Rd = $Rx">;
-
-
-
diff --git a/lib/Target/Hexagon/HexagonIntrinsics.td b/lib/Target/Hexagon/HexagonIntrinsics.td
index d4f303bf6ff016cb2e7cd4fca8e9571aa5b21782..c611857ec26af9b4b1d4d5f12be5349aafb58827 100644
--- a/lib/Target/Hexagon/HexagonIntrinsics.td
+++ b/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -1347,6 +1347,25 @@ def: T_stc_pat<S2_storeri_pci, int_hexagon_circ_stw,   s4_2ImmPred, I32>;
 def: T_stc_pat<S2_storerd_pci, int_hexagon_circ_std,   s4_3ImmPred, I64>;
 def: T_stc_pat<S2_storerf_pci, int_hexagon_circ_sthhi, s4_1ImmPred, I32>;
 
+multiclass MaskedStore <InstHexagon MI, Intrinsic IntID> { // Two anonymous Pats: IntID -> MI, plus the "_128B" pair.
+  def : Pat<(IntID VecPredRegs:$src1, IntRegs:$src2, VectorRegs:$src3),
+            (MI VecPredRegs:$src1, IntRegs:$src2, #0, VectorRegs:$src3)>, // #0 is inserted before $src3; presumably a zero offset -- TODO confirm
+        Requires<[UseHVXSgl]>; // pattern guarded by the UseHVXSgl predicate
+
+  def : Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, // intrinsic looked up by name: IntID + "_128B"
+                                             IntRegs:$src2,
+                                             VectorRegs128B:$src3),
+            (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, // instruction looked up by name: MI + "_128B"
+                                            IntRegs:$src2, #0,
+                                            VectorRegs128B:$src3)>,
+        Requires<[UseHVXDbl]>; // pattern guarded by the UseHVXDbl predicate
+}
+
+defm : MaskedStore <V6_vS32b_qpred_ai, int_hexagon_V6_vmaskedstoreq>; // each defm yields both patterns above
+defm : MaskedStore <V6_vS32b_nqpred_ai, int_hexagon_V6_vmaskedstorenq>;
+defm : MaskedStore <V6_vS32b_nt_qpred_ai, int_hexagon_V6_vmaskedstorentq>;
+defm : MaskedStore <V6_vS32b_nt_nqpred_ai, int_hexagon_V6_vmaskedstorentnq>;
+
 include "HexagonIntrinsicsV3.td"
 include "HexagonIntrinsicsV4.td"
 include "HexagonIntrinsicsV5.td"
diff --git a/lib/Target/Hexagon/HexagonIntrinsicsV60.td b/lib/Target/Hexagon/HexagonIntrinsicsV60.td
index a45e1c9d7be4311c86eb8ea4fbd63f011d4af901..f438b3e0368fd7baccadff8c3d09a4b20a537d87 100644
--- a/lib/Target/Hexagon/HexagonIntrinsicsV60.td
+++ b/lib/Target/Hexagon/HexagonIntrinsicsV60.td
@@ -790,7 +790,7 @@ def : T_RRI_pat <S6_rol_i_r_xacc, int_hexagon_S6_rol_i_r_xacc>;
 defm : T_VR_pat <V6_extractw, int_hexagon_V6_extractw>;
 defm : T_VR_pat <V6_vinsertwr, int_hexagon_V6_vinsertwr>;
 
-def : T_PPQ_pat <S2_cabacencbin, int_hexagon_S2_cabacencbin>;
+//def : T_PPQ_pat <S2_cabacencbin, int_hexagon_S2_cabacencbin>;
 
 def: Pat<(v64i16 (trunc v64i32:$Vdd)),
          (v64i16 (V6_vpackwh_sat_128B
diff --git a/lib/Target/Hexagon/HexagonIsetDx.td b/lib/Target/Hexagon/HexagonIsetDx.td
deleted file mode 100644
index ebedf2cbaf172a8a96db856d5ba7b2cb4708514c..0000000000000000000000000000000000000000
--- a/lib/Target/Hexagon/HexagonIsetDx.td
+++ /dev/null
@@ -1,728 +0,0 @@
-//=- HexagonIsetDx.td - Target Desc. for Hexagon Target -*- tablegen -*-=//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon duplex instructions.
-//
-//===----------------------------------------------------------------------===//
-
-// SA1_combine1i: Combines.
-let isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_combine1i: SUBInst <
-  (outs DoubleRegs:$Rdd),
-  (ins u2_0Imm:$u2),
-  "$Rdd = combine(#1, #$u2)"> {
-    bits<3> Rdd;
-    bits<2> u2;
-
-    let Inst{12-10} = 0b111;
-    let Inst{8} = 0b0;
-    let Inst{4-3} = 0b01;
-    let Inst{2-0} = Rdd;
-    let Inst{6-5} = u2;
-  }
-
-// SL2_jumpr31_f: Indirect conditional jump if false.
-// SL2_jumpr31_f -> SL2_jumpr31_fnew
-let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
-def SL2_jumpr31_f: SUBInst <
-  (outs ),
-  (ins ),
-  "if (!p0) jumpr r31"> {
-    let Inst{12-6} = 0b1111111;
-    let Inst{2-0} = 0b101;
-  }
-
-// SL2_deallocframe: Deallocate stack frame.
-let Defs = [R31, R29, R30], Uses = [R30], isCodeGenOnly = 1, mayLoad = 1, accessSize = DoubleWordAccess in
-def SL2_deallocframe: SUBInst <
-  (outs ),
-  (ins ),
-  "deallocframe"> {
-    let Inst{12-6} = 0b1111100;
-    let Inst{2} = 0b0;
-  }
-
-// SL2_return_f: Deallocate stack frame and return.
-// SL2_return_f -> SL2_return_fnew
-let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
-def SL2_return_f: SUBInst <
-  (outs ),
-  (ins ),
-  "if (!p0) dealloc_return"> {
-    let Inst{12-6} = 0b1111101;
-    let Inst{2-0} = 0b101;
-  }
-
-// SA1_combine3i: Combines.
-let isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_combine3i: SUBInst <
-  (outs DoubleRegs:$Rdd),
-  (ins u2_0Imm:$u2),
-  "$Rdd = combine(#3, #$u2)"> {
-    bits<3> Rdd;
-    bits<2> u2;
-
-    let Inst{12-10} = 0b111;
-    let Inst{8} = 0b0;
-    let Inst{4-3} = 0b11;
-    let Inst{2-0} = Rdd;
-    let Inst{6-5} = u2;
-  }
-
-// SS2_storebi0: Store byte.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = ByteAccess in
-def SS2_storebi0: SUBInst <
-  (outs ),
-  (ins IntRegs:$Rs, u4_0Imm:$u4_0),
-  "memb($Rs + #$u4_0)=#0"> {
-    bits<4> Rs;
-    bits<4> u4_0;
-
-    let Inst{12-8} = 0b10010;
-    let Inst{7-4} = Rs;
-    let Inst{3-0} = u4_0;
-  }
-
-// SA1_clrtnew: Clear if true.
-let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedNew = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_clrtnew: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins PredRegs:$Pu),
-  "if ($Pu.new) $Rd = #0"> {
-    bits<4> Rd;
-
-    let Inst{12-9} = 0b1101;
-    let Inst{6-4} = 0b100;
-    let Inst{3-0} = Rd;
-  }
-
-// SL2_loadruh_io: Load half.
-let isCodeGenOnly = 1, mayLoad = 1, accessSize = HalfWordAccess, hasNewValue = 1, opNewValue = 0 in
-def SL2_loadruh_io: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins IntRegs:$Rs, u3_1Imm:$u3_1),
-  "$Rd = memuh($Rs + #$u3_1)"> {
-    bits<4> Rd;
-    bits<4> Rs;
-    bits<4> u3_1;
-
-    let Inst{12-11} = 0b01;
-    let Inst{3-0} = Rd;
-    let Inst{7-4} = Rs;
-    let Inst{10-8} = u3_1{3-1};
-  }
-
-// SL2_jumpr31_tnew: Indirect conditional jump if true.
-let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isPredicatedNew = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
-def SL2_jumpr31_tnew: SUBInst <
-  (outs ),
-  (ins ),
-  "if (p0.new) jumpr:nt r31"> {
-    let Inst{12-6} = 0b1111111;
-    let Inst{2-0} = 0b110;
-  }
-
-// SA1_addi: Add.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0, isExtendable = 1, isExtentSigned = 1, opExtentBits = 7, opExtendable = 2 in
-def SA1_addi: SUBInst <
-  (outs IntRegs:$Rx),
-  (ins IntRegs:$_src_, s7_0Ext:$s7),
-  "$Rx = add($_src_, #$s7)" ,
-  [] ,
-  "$_src_ = $Rx"> {
-    bits<4> Rx;
-    bits<7> s7;
-
-    let Inst{12-11} = 0b00;
-    let Inst{3-0} = Rx;
-    let Inst{10-4} = s7;
-  }
-
-// SL1_loadrub_io: Load byte.
-let isCodeGenOnly = 1, mayLoad = 1, accessSize = ByteAccess, hasNewValue = 1, opNewValue = 0 in
-def SL1_loadrub_io: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins IntRegs:$Rs, u4_0Imm:$u4_0),
-  "$Rd = memub($Rs + #$u4_0)"> {
-    bits<4> Rd;
-    bits<4> Rs;
-    bits<4> u4_0;
-
-    let Inst{12} = 0b1;
-    let Inst{3-0} = Rd;
-    let Inst{7-4} = Rs;
-    let Inst{11-8} = u4_0;
-  }
-
-// SL1_loadri_io: Load word.
-let isCodeGenOnly = 1, mayLoad = 1, accessSize = WordAccess, hasNewValue = 1, opNewValue = 0 in
-def SL1_loadri_io: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins IntRegs:$Rs, u4_2Imm:$u4_2),
-  "$Rd = memw($Rs + #$u4_2)"> {
-    bits<4> Rd;
-    bits<4> Rs;
-    bits<6> u4_2;
-
-    let Inst{12} = 0b0;
-    let Inst{3-0} = Rd;
-    let Inst{7-4} = Rs;
-    let Inst{11-8} = u4_2{5-2};
-  }
-
-// SA1_cmpeqi: Compareimmed.
-let Defs = [P0], isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_cmpeqi: SUBInst <
-  (outs ),
-  (ins IntRegs:$Rs, u2_0Imm:$u2),
-  "p0 = cmp.eq($Rs, #$u2)"> {
-    bits<4> Rs;
-    bits<2> u2;
-
-    let Inst{12-8} = 0b11001;
-    let Inst{7-4} = Rs;
-    let Inst{1-0} = u2;
-  }
-
-// SA1_combinerz: Combines.
-let isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_combinerz: SUBInst <
-  (outs DoubleRegs:$Rdd),
-  (ins IntRegs:$Rs),
-  "$Rdd = combine($Rs, #0)"> {
-    bits<3> Rdd;
-    bits<4> Rs;
-
-    let Inst{12-10} = 0b111;
-    let Inst{8} = 0b1;
-    let Inst{3} = 0b1;
-    let Inst{2-0} = Rdd;
-    let Inst{7-4} = Rs;
-  }
-
-// SL2_return_t: Deallocate stack frame and return.
-// SL2_return_t -> SL2_return_tnew
-let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
-def SL2_return_t: SUBInst <
-  (outs ),
-  (ins ),
-  "if (p0) dealloc_return"> {
-    let Inst{12-6} = 0b1111101;
-    let Inst{2-0} = 0b100;
-  }
-
-// SS2_allocframe: Allocate stack frame.
-let Defs = [R29, R30], Uses = [R30, R31, R29], isCodeGenOnly = 1, mayStore = 1, accessSize = DoubleWordAccess in
-def SS2_allocframe: SUBInst <
-  (outs ),
-  (ins u5_3Imm:$u5_3),
-  "allocframe(#$u5_3)"> {
-    bits<8> u5_3;
-
-    let Inst{12-9} = 0b1110;
-    let Inst{8-4} = u5_3{7-3};
-  }
-
-// SS2_storeh_io: Store half.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = HalfWordAccess in
-def SS2_storeh_io: SUBInst <
-  (outs ),
-  (ins IntRegs:$Rs, u3_1Imm:$u3_1, IntRegs:$Rt),
-  "memh($Rs + #$u3_1) = $Rt"> {
-    bits<4> Rs;
-    bits<4> u3_1;
-    bits<4> Rt;
-
-    let Inst{12-11} = 0b00;
-    let Inst{7-4} = Rs;
-    let Inst{10-8} = u3_1{3-1};
-    let Inst{3-0} = Rt;
-  }
-
-// SS2_storewi0: Store word.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
-def SS2_storewi0: SUBInst <
-  (outs ),
-  (ins IntRegs:$Rs, u4_2Imm:$u4_2),
-  "memw($Rs + #$u4_2)=#0"> {
-    bits<4> Rs;
-    bits<6> u4_2;
-
-    let Inst{12-8} = 0b10000;
-    let Inst{7-4} = Rs;
-    let Inst{3-0} = u4_2{5-2};
-  }
-
-// SS2_storewi1: Store word.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
-def SS2_storewi1: SUBInst <
-  (outs ),
-  (ins IntRegs:$Rs, u4_2Imm:$u4_2),
-  "memw($Rs + #$u4_2)=#1"> {
-    bits<4> Rs;
-    bits<6> u4_2;
-
-    let Inst{12-8} = 0b10001;
-    let Inst{7-4} = Rs;
-    let Inst{3-0} = u4_2{5-2};
-  }
-
-// SL2_jumpr31: Indirect conditional jump if true.
-let Defs = [PC], Uses = [R31], isCodeGenOnly = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
-def SL2_jumpr31: SUBInst <
-  (outs ),
-  (ins ),
-  "jumpr r31"> {
-    let Inst{12-6} = 0b1111111;
-    let Inst{2} = 0b0;
-  }
-
-// SA1_combinezr: Combines.
-let isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_combinezr: SUBInst <
-  (outs DoubleRegs:$Rdd),
-  (ins IntRegs:$Rs),
-  "$Rdd = combine(#0, $Rs)"> {
-    bits<3> Rdd;
-    bits<4> Rs;
-
-    let Inst{12-10} = 0b111;
-    let Inst{8} = 0b1;
-    let Inst{3} = 0b0;
-    let Inst{2-0} = Rdd;
-    let Inst{7-4} = Rs;
-  }
-
-// SL2_loadrh_io: Load half.
-let isCodeGenOnly = 1, mayLoad = 1, accessSize = HalfWordAccess, hasNewValue = 1, opNewValue = 0 in
-def SL2_loadrh_io: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins IntRegs:$Rs, u3_1Imm:$u3_1),
-  "$Rd = memh($Rs + #$u3_1)"> {
-    bits<4> Rd;
-    bits<4> Rs;
-    bits<4> u3_1;
-
-    let Inst{12-11} = 0b00;
-    let Inst{3-0} = Rd;
-    let Inst{7-4} = Rs;
-    let Inst{10-8} = u3_1{3-1};
-  }
-
-// SA1_addrx: Add.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_addrx: SUBInst <
-  (outs IntRegs:$Rx),
-  (ins IntRegs:$_src_, IntRegs:$Rs),
-  "$Rx = add($_src_, $Rs)" ,
-  [] ,
-  "$_src_ = $Rx"> {
-    bits<4> Rx;
-    bits<4> Rs;
-
-    let Inst{12-8} = 0b11000;
-    let Inst{3-0} = Rx;
-    let Inst{7-4} = Rs;
-  }
-
-// SA1_setin1: Set to -1.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_setin1: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins ),
-  "$Rd = #{-1}"> {
-    bits<4> Rd;
-
-    let Inst{12-9} = 0b1101;
-    let Inst{6} = 0b0;
-    let Inst{3-0} = Rd;
-  }
-
-// SA1_sxth: Sxth.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_sxth: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins IntRegs:$Rs),
-  "$Rd = sxth($Rs)"> {
-    bits<4> Rd;
-    bits<4> Rs;
-
-    let Inst{12-8} = 0b10100;
-    let Inst{3-0} = Rd;
-    let Inst{7-4} = Rs;
-  }
-
-// SA1_combine0i: Combines.
-let isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_combine0i: SUBInst <
-  (outs DoubleRegs:$Rdd),
-  (ins u2_0Imm:$u2),
-  "$Rdd = combine(#0, #$u2)"> {
-    bits<3> Rdd;
-    bits<2> u2;
-
-    let Inst{12-10} = 0b111;
-    let Inst{8} = 0b0;
-    let Inst{4-3} = 0b00;
-    let Inst{2-0} = Rdd;
-    let Inst{6-5} = u2;
-  }
-
-// SA1_combine2i: Combines.
-let isCodeGenOnly = 1, hasSideEffects = 0 in
-def SA1_combine2i: SUBInst <
-  (outs DoubleRegs:$Rdd),
-  (ins u2_0Imm:$u2),
-  "$Rdd = combine(#2, #$u2)"> {
-    bits<3> Rdd;
-    bits<2> u2;
-
-    let Inst{12-10} = 0b111;
-    let Inst{8} = 0b0;
-    let Inst{4-3} = 0b10;
-    let Inst{2-0} = Rdd;
-    let Inst{6-5} = u2;
-  }
-
-// SA1_sxtb: Sxtb.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_sxtb: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins IntRegs:$Rs),
-  "$Rd = sxtb($Rs)"> {
-    bits<4> Rd;
-    bits<4> Rs;
-
-    let Inst{12-8} = 0b10101;
-    let Inst{3-0} = Rd;
-    let Inst{7-4} = Rs;
-  }
-
-// SA1_clrf: Clear if false.
-// SA1_clrf -> SA1_clrfnew
-let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_clrf: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins PredRegs:$Pu),
-  "if (!$Pu) $Rd = #0"> {
-    bits<4> Rd;
-
-    let Inst{12-9} = 0b1101;
-    let Inst{6-4} = 0b111;
-    let Inst{3-0} = Rd;
-  }
-
-// SL2_loadrb_io: Load byte.
-let isCodeGenOnly = 1, mayLoad = 1, accessSize = ByteAccess, hasNewValue = 1, opNewValue = 0 in
-def SL2_loadrb_io: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins IntRegs:$Rs, u3_0Imm:$u3_0),
-  "$Rd = memb($Rs + #$u3_0)"> {
-    bits<4> Rd;
-    bits<4> Rs;
-    bits<3> u3_0;
-
-    let Inst{12-11} = 0b10;
-    let Inst{3-0} = Rd;
-    let Inst{7-4} = Rs;
-    let Inst{10-8} = u3_0;
-  }
-
-// SA1_tfr: Tfr.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_tfr: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins IntRegs:$Rs),
-  "$Rd = $Rs"> {
-    bits<4> Rd;
-    bits<4> Rs;
-
-    let Inst{12-8} = 0b10000;
-    let Inst{3-0} = Rd;
-    let Inst{7-4} = Rs;
-  }
-
-// SL2_loadrd_sp: Load dword.
-let Uses = [R29], isCodeGenOnly = 1, mayLoad = 1, accessSize = DoubleWordAccess in
-def SL2_loadrd_sp: SUBInst <
-  (outs DoubleRegs:$Rdd),
-  (ins u5_3Imm:$u5_3),
-  "$Rdd = memd(r29 + #$u5_3)"> {
-    bits<3> Rdd;
-    bits<8> u5_3;
-
-    let Inst{12-8} = 0b11110;
-    let Inst{2-0} = Rdd;
-    let Inst{7-3} = u5_3{7-3};
-  }
-
-// SA1_and1: And #1.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_and1: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins IntRegs:$Rs),
-  "$Rd = and($Rs, #1)"> {
-    bits<4> Rd;
-    bits<4> Rs;
-
-    let Inst{12-8} = 0b10010;
-    let Inst{3-0} = Rd;
-    let Inst{7-4} = Rs;
-  }
-
-// SS2_storebi1: Store byte.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = ByteAccess in
-def SS2_storebi1: SUBInst <
-  (outs ),
-  (ins IntRegs:$Rs, u4_0Imm:$u4_0),
-  "memb($Rs + #$u4_0)=#1"> {
-    bits<4> Rs;
-    bits<4> u4_0;
-
-    let Inst{12-8} = 0b10011;
-    let Inst{7-4} = Rs;
-    let Inst{3-0} = u4_0;
-  }
-
-// SA1_inc: Inc.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_inc: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins IntRegs:$Rs),
-  "$Rd = add($Rs, #1)"> {
-    bits<4> Rd;
-    bits<4> Rs;
-
-    let Inst{12-8} = 0b10001;
-    let Inst{3-0} = Rd;
-    let Inst{7-4} = Rs;
-  }
-
-// SS2_stored_sp: Store dword.
-let Uses = [R29], isCodeGenOnly = 1, mayStore = 1, accessSize = DoubleWordAccess in
-def SS2_stored_sp: SUBInst <
-  (outs ),
-  (ins s6_3Imm:$s6_3, DoubleRegs:$Rtt),
-  "memd(r29 + #$s6_3) = $Rtt"> {
-    bits<9> s6_3;
-    bits<3> Rtt;
-
-    let Inst{12-9} = 0b0101;
-    let Inst{8-3} = s6_3{8-3};
-    let Inst{2-0} = Rtt;
-  }
-
-// SS2_storew_sp: Store word.
-let Uses = [R29], isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
-def SS2_storew_sp: SUBInst <
-  (outs ),
-  (ins u5_2Imm:$u5_2, IntRegs:$Rt),
-  "memw(r29 + #$u5_2) = $Rt"> {
-    bits<7> u5_2;
-    bits<4> Rt;
-
-    let Inst{12-9} = 0b0100;
-    let Inst{8-4} = u5_2{6-2};
-    let Inst{3-0} = Rt;
-  }
-
-// SL2_jumpr31_fnew: Indirect conditional jump if false.
-let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
-def SL2_jumpr31_fnew: SUBInst <
-  (outs ),
-  (ins ),
-  "if (!p0.new) jumpr:nt r31"> {
-    let Inst{12-6} = 0b1111111;
-    let Inst{2-0} = 0b111;
-  }
-
-// SA1_clrt: Clear if true.
-// SA1_clrt -> SA1_clrtnew
-let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_clrt: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins PredRegs:$Pu),
-  "if ($Pu) $Rd = #0"> {
-    bits<4> Rd;
-
-    let Inst{12-9} = 0b1101;
-    let Inst{6-4} = 0b110;
-    let Inst{3-0} = Rd;
-  }
-
-// SL2_return: Deallocate stack frame and return.
-let Defs = [PC, R31, R29, R30], Uses = [R30], isCodeGenOnly = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
-def SL2_return: SUBInst <
-  (outs ),
-  (ins ),
-  "dealloc_return"> {
-    let Inst{12-6} = 0b1111101;
-    let Inst{2} = 0b0;
-  }
-
-// SA1_dec: Dec.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_dec: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins IntRegs:$Rs),
-  "$Rd = add($Rs,#{-1})"> {
-    bits<4> Rd;
-    bits<4> Rs;
-
-    let Inst{12-8} = 0b10011;
-    let Inst{3-0} = Rd;
-    let Inst{7-4} = Rs;
-  }
-
-// SA1_seti: Set immed.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0, isExtendable = 1, isExtentSigned = 0, opExtentBits = 6, opExtendable = 1 in
-def SA1_seti: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins u6_0Ext:$u6),
-  "$Rd = #$u6"> {
-    bits<4> Rd;
-    bits<6> u6;
-
-    let Inst{12-10} = 0b010;
-    let Inst{3-0} = Rd;
-    let Inst{9-4} = u6;
-  }
-
-// SL2_jumpr31_t: Indirect conditional jump if true.
-// SL2_jumpr31_t -> SL2_jumpr31_tnew
-let Defs = [PC], Uses = [P0, R31], isCodeGenOnly = 1, isPredicated = 1, isBranch = 1, isIndirectBranch = 1, hasSideEffects = 0 in
-def SL2_jumpr31_t: SUBInst <
-  (outs ),
-  (ins ),
-  "if (p0) jumpr r31"> {
-    let Inst{12-6} = 0b1111111;
-    let Inst{2-0} = 0b100;
-  }
-
-// SA1_clrfnew: Clear if false.
-let Uses = [P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_clrfnew: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins PredRegs:$Pu),
-  "if (!$Pu.new) $Rd = #0"> {
-    bits<4> Rd;
-
-    let Inst{12-9} = 0b1101;
-    let Inst{6-4} = 0b101;
-    let Inst{3-0} = Rd;
-  }
-
-// SS1_storew_io: Store word.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = WordAccess in
-def SS1_storew_io: SUBInst <
-  (outs ),
-  (ins IntRegs:$Rs, u4_2Imm:$u4_2, IntRegs:$Rt),
-  "memw($Rs + #$u4_2) = $Rt"> {
-    bits<4> Rs;
-    bits<6> u4_2;
-    bits<4> Rt;
-
-    let Inst{12} = 0b0;
-    let Inst{7-4} = Rs;
-    let Inst{11-8} = u4_2{5-2};
-    let Inst{3-0} = Rt;
-  }
-
-// SA1_zxtb: Zxtb.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_zxtb: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins IntRegs:$Rs),
-  "$Rd = and($Rs, #255)"> {
-    bits<4> Rd;
-    bits<4> Rs;
-
-    let Inst{12-8} = 0b10111;
-    let Inst{3-0} = Rd;
-    let Inst{7-4} = Rs;
-  }
-
-// SA1_addsp: Add.
-let Uses = [R29], isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_addsp: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins u6_2Imm:$u6_2),
-  "$Rd = add(r29, #$u6_2)"> {
-    bits<4> Rd;
-    bits<8> u6_2;
-
-    let Inst{12-10} = 0b011;
-    let Inst{3-0} = Rd;
-    let Inst{9-4} = u6_2{7-2};
-  }
-
-// SL2_loadri_sp: Load word.
-let Uses = [R29], isCodeGenOnly = 1, mayLoad = 1, accessSize = WordAccess, hasNewValue = 1, opNewValue = 0 in
-def SL2_loadri_sp: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins u5_2Imm:$u5_2),
-  "$Rd = memw(r29 + #$u5_2)"> {
-    bits<4> Rd;
-    bits<7> u5_2;
-
-    let Inst{12-9} = 0b1110;
-    let Inst{3-0} = Rd;
-    let Inst{8-4} = u5_2{6-2};
-  }
-
-// SS1_storeb_io: Store byte.
-let isCodeGenOnly = 1, mayStore = 1, accessSize = ByteAccess in
-def SS1_storeb_io: SUBInst <
-  (outs ),
-  (ins IntRegs:$Rs, u4_0Imm:$u4_0, IntRegs:$Rt),
-  "memb($Rs + #$u4_0) = $Rt"> {
-    bits<4> Rs;
-    bits<4> u4_0;
-    bits<4> Rt;
-
-    let Inst{12} = 0b1;
-    let Inst{7-4} = Rs;
-    let Inst{11-8} = u4_0;
-    let Inst{3-0} = Rt;
-  }
-
-// SL2_return_tnew: Deallocate stack frame and return.
-let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedNew = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
-def SL2_return_tnew: SUBInst <
-  (outs ),
-  (ins ),
-  "if (p0.new) dealloc_return:nt"> {
-    let Inst{12-6} = 0b1111101;
-    let Inst{2-0} = 0b110;
-  }
-
-// SL2_return_fnew: Deallocate stack frame and return.
-let Defs = [PC, R31, R29, R30], Uses = [R30, P0], isCodeGenOnly = 1, isPredicated = 1, isPredicatedFalse = 1, isPredicatedNew = 1, mayLoad = 1, accessSize = DoubleWordAccess, isBranch = 1, isIndirectBranch = 1 in
-def SL2_return_fnew: SUBInst <
-  (outs ),
-  (ins ),
-  "if (!p0.new) dealloc_return:nt"> {
-    let Inst{12-6} = 0b1111101;
-    let Inst{2-0} = 0b111;
-  }
-
-// SA1_zxth: Zxth.
-let isCodeGenOnly = 1, hasSideEffects = 0, hasNewValue = 1, opNewValue = 0 in
-def SA1_zxth: SUBInst <
-  (outs IntRegs:$Rd),
-  (ins IntRegs:$Rs),
-  "$Rd = zxth($Rs)"> {
-    bits<4> Rd;
-    bits<4> Rs;
-
-    let Inst{12-8} = 0b10110;
-    let Inst{3-0} = Rd;
-    let Inst{7-4} = Rs;
-  }
-
diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index 48f74073147e2a16c076ee63aee7e0013665ea6d..b5948475e1f76449dfd174cb95d6eb4f621c4542 100644
--- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -129,6 +129,382 @@ INITIALIZE_PASS_END(HexagonLoopIdiomRecognize, "hexagon-loop-idiom",
     "Recognize Hexagon-specific loop idioms", false, false)
 
 
+namespace {
+  struct Simplifier {
+    typedef std::function<Value* (Instruction*, LLVMContext&)> Rule;
+
+    void addRule(const Rule &R) { Rules.push_back(R); }
+
+  private:
+    struct WorkListType {
+      WorkListType() = default;
+
+      void push_back(Value* V) {
+        // Do not push back duplicates.
+        if (!S.count(V)) { Q.push_back(V); S.insert(V); }
+      }
+      Value *pop_front_val() {
+        Value *V = Q.front(); Q.pop_front(); S.erase(V);
+        return V;
+      }
+      bool empty() const { return Q.empty(); }
+
+    private:
+      std::deque<Value*> Q;
+      std::set<Value*> S;
+    };
+
+    typedef std::set<Value*> ValueSetType;
+    std::vector<Rule> Rules;
+
+  public:
+    struct Context {
+      typedef DenseMap<Value*,Value*> ValueMapType;
+
+      Value *Root;
+      ValueSetType Used;    // The set of all cloned values used by Root.
+      ValueSetType Clones;  // The set of all cloned values.
+      LLVMContext &Ctx;
+
+      Context(Instruction *Exp)
+        : Ctx(Exp->getParent()->getParent()->getContext()) {
+        initialize(Exp);
+      }
+      ~Context() { cleanup(); }
+      void print(raw_ostream &OS, const Value *V) const;
+
+      Value *materialize(BasicBlock *B, BasicBlock::iterator At);
+
+    private:
+      void initialize(Instruction *Exp);
+      void cleanup();
+
+      template <typename FuncT> void traverse(Value *V, FuncT F);
+      void record(Value *V);
+      void use(Value *V);
+      void unuse(Value *V);
+
+      bool equal(const Instruction *I, const Instruction *J) const;
+      Value *find(Value *Tree, Value *Sub) const;
+      Value *subst(Value *Tree, Value *OldV, Value *NewV);
+      void replace(Value *OldV, Value *NewV);
+      void link(Instruction *I, BasicBlock *B, BasicBlock::iterator At);
+
+      friend struct Simplifier;
+    };
+
+    Value *simplify(Context &C);
+  };
+
+  struct PE {
+    PE(const Simplifier::Context &c, Value *v = nullptr) : C(c), V(v) {}
+    const Simplifier::Context &C;
+    const Value *V;
+  };
+
+  raw_ostream &operator<< (raw_ostream &OS, const PE &P) LLVM_ATTRIBUTE_USED;
+  raw_ostream &operator<< (raw_ostream &OS, const PE &P) {
+    P.C.print(OS, P.V ? P.V : P.C.Root);
+    return OS;
+  }
+}
+
+
+template <typename FuncT>
+void Simplifier::Context::traverse(Value *V, FuncT F) {
+  WorkListType Q;
+  Q.push_back(V);
+
+  while (!Q.empty()) {
+    Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
+    if (!U || U->getParent())
+      continue;
+    if (!F(U))
+      continue;
+    for (Value *Op : U->operands())
+      Q.push_back(Op);
+  }
+}
+
+
+void Simplifier::Context::print(raw_ostream &OS, const Value *V) const {
+  const auto *U = dyn_cast<const Instruction>(V);
+  if (!U) {
+    OS << V << '(' << *V << ')';
+    return;
+  }
+
+  if (U->getParent()) {
+    OS << U << '(';
+    U->printAsOperand(OS, true);
+    OS << ')';
+    return;
+  }
+
+  unsigned N = U->getNumOperands();
+  if (N != 0)
+    OS << U << '(';
+  OS << U->getOpcodeName();
+  for (const Value *Op : U->operands()) {
+    OS << ' ';
+    print(OS, Op);
+  }
+  if (N != 0)
+    OS << ')';
+}
+
+
+void Simplifier::Context::initialize(Instruction *Exp) {
+  // Perform a deep clone of the expression and set Root to the root of
+  // the clone. M maps each original instruction to its clone; PHI nodes
+  // and instructions outside of Exp's block are not cloned.
+  ValueMapType M;
+  BasicBlock *Block = Exp->getParent();
+  WorkListType Q;
+  Q.push_back(Exp);
+
+  while (!Q.empty()) {
+    Value *V = Q.pop_front_val();
+    if (M.find(V) != M.end())  // Already cloned.
+      continue;
+    if (Instruction *U = dyn_cast<Instruction>(V)) {
+      if (isa<PHINode>(U) || U->getParent() != Block)
+        continue;
+      for (Value *Op : U->operands())
+        Q.push_back(Op);
+      M.insert({U, U->clone()});
+    }
+  }
+
+  for (std::pair<Value*,Value*> P : M) {  // Make clones refer to clones.
+    Instruction *U = cast<Instruction>(P.second);
+    for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
+      auto F = M.find(U->getOperand(i));
+      if (F != M.end())
+        U->setOperand(i, F->second);
+    }
+  }
+
+  auto R = M.find(Exp);
+  assert(R != M.end());
+  Root = R->second;
+
+  record(Root);  // Remember the clones, so that cleanup() can delete them.
+  use(Root);     // Mark every clone reachable from Root as used.
+}
+
+
+void Simplifier::Context::record(Value *V) {
+  auto Record = [this](Instruction *U) -> bool {
+    Clones.insert(U);
+    return true;
+  };
+  traverse(V, Record);
+}
+
+
+void Simplifier::Context::use(Value *V) {
+  auto Use = [this](Instruction *U) -> bool {
+    Used.insert(U);
+    return true;
+  };
+  traverse(V, Use);
+}
+
+
+void Simplifier::Context::unuse(Value *V) {
+  if (!isa<Instruction>(V) || cast<Instruction>(V)->getParent() != nullptr)
+    return;
+
+  auto Unuse = [this](Instruction *U) -> bool {
+    if (!U->use_empty())
+      return false;
+    Used.erase(U);
+    return true;
+  };
+  traverse(V, Unuse);
+}
+
+
+Value *Simplifier::Context::subst(Value *Tree, Value *OldV, Value *NewV) {
+  if (Tree == OldV)
+    return NewV;
+  if (OldV == NewV)
+    return Tree;
+
+  WorkListType Q;
+  Q.push_back(Tree);
+  while (!Q.empty()) {
+    Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
+    // If U is not an instruction, or it's not a clone, skip it.
+    if (!U || U->getParent())
+      continue;
+    for (unsigned i = 0, n = U->getNumOperands(); i != n; ++i) {
+      Value *Op = U->getOperand(i);
+      if (Op == OldV) {
+        U->setOperand(i, NewV);
+        unuse(OldV);
+      } else {
+        Q.push_back(Op);
+      }
+    }
+  }
+  return Tree;
+}
+
+
+void Simplifier::Context::replace(Value *OldV, Value *NewV) {
+  if (Root == OldV) {
+    Root = NewV;
+    use(Root);
+    return;
+  }
+
+  // NewV may be a complex tree that has just been created by one of the
+  // transformation rules. We need to make sure that it is commoned with
+  // the existing Root to the maximum extent possible.
+  // Identify all subtrees of NewV (including NewV itself) that have
+  // equivalent counterparts in Root, and replace those subtrees with
+  // these counterparts.
+  WorkListType Q;
+  Q.push_back(NewV);
+  while (!Q.empty()) {
+    Value *V = Q.pop_front_val();
+    Instruction *U = dyn_cast<Instruction>(V);
+    if (!U || U->getParent())
+      continue;
+    if (Value *DupV = find(Root, V)) {
+      if (DupV != V)
+        NewV = subst(NewV, V, DupV);
+    } else {
+      for (Value *Op : U->operands())
+        Q.push_back(Op);
+    }
+  }
+
+  // Now, simply replace OldV with NewV in Root.
+  Root = subst(Root, OldV, NewV);
+  use(Root);
+}
+
+
+void Simplifier::Context::cleanup() {
+  for (Value *V : Clones) {
+    Instruction *U = cast<Instruction>(V);
+    if (!U->getParent())
+      U->dropAllReferences();
+  }
+
+  for (Value *V : Clones) {
+    Instruction *U = cast<Instruction>(V);
+    if (!U->getParent())
+      delete U;
+  }
+}
+
+
+bool Simplifier::Context::equal(const Instruction *I,
+                                const Instruction *J) const {
+  if (I == J)
+    return true;
+  if (!I->isSameOperationAs(J))
+    return false;
+  if (isa<PHINode>(I))
+    return I->isIdenticalTo(J);
+
+  for (unsigned i = 0, n = I->getNumOperands(); i != n; ++i) {
+    Value *OpI = I->getOperand(i), *OpJ = J->getOperand(i);
+    if (OpI == OpJ)
+      continue;
+    auto *InI = dyn_cast<const Instruction>(OpI);
+    auto *InJ = dyn_cast<const Instruction>(OpJ);
+    if (InI && InJ) {
+      if (!equal(InI, InJ))
+        return false;
+    } else if (InI != InJ || !InI)
+      return false;
+  }
+  return true;
+}
+
+
+Value *Simplifier::Context::find(Value *Tree, Value *Sub) const {
+  Instruction *SubI = dyn_cast<Instruction>(Sub);
+  WorkListType Q;
+  Q.push_back(Tree);
+
+  while (!Q.empty()) {
+    Value *V = Q.pop_front_val();
+    if (V == Sub)
+      return V;
+    Instruction *U = dyn_cast<Instruction>(V);
+    if (!U || U->getParent())
+      continue;
+    if (SubI && equal(SubI, U))
+      return U;
+    assert(!isa<PHINode>(U));
+    for (Value *Op : U->operands())
+      Q.push_back(Op);
+  }
+  return nullptr;
+}
+
+
+void Simplifier::Context::link(Instruction *I, BasicBlock *B,
+      BasicBlock::iterator At) {
+  if (I->getParent())
+    return;
+
+  for (Value *Op : I->operands()) {
+    if (Instruction *OpI = dyn_cast<Instruction>(Op))
+      link(OpI, B, At);
+  }
+
+  B->getInstList().insert(At, I);
+}
+
+
+Value *Simplifier::Context::materialize(BasicBlock *B,
+      BasicBlock::iterator At) {
+  if (Instruction *RootI = dyn_cast<Instruction>(Root))
+    link(RootI, B, At);
+  return Root;
+}
+
+
+Value *Simplifier::simplify(Context &C) {
+  WorkListType Q;
+  Q.push_back(C.Root);
+  unsigned Count = 0;
+  const unsigned Limit = 100000;
+
+  while (!Q.empty()) {
+    if (Count++ >= Limit)
+      break;
+    Instruction *U = dyn_cast<Instruction>(Q.pop_front_val());
+    if (!U || U->getParent() || !C.Used.count(U))
+      continue;
+    bool Changed = false;
+    for (Rule &R : Rules) {
+      Value *W = R(U, C.Ctx);
+      if (!W)
+        continue;
+      Changed = true;
+      C.record(W);
+      C.replace(U, W);
+      Q.push_back(C.Root);
+      break;
+    }
+    if (!Changed) {
+      for (Value *Op : U->operands())
+        Q.push_back(Op);
+    }
+  }
+  assert(Count < Limit && "Infinite loop in HLIR/simplify?");
+  return C.Root;
+}
+
+
 //===----------------------------------------------------------------------===//
 //
 //          Implementation of PolynomialMultiplyRecognize
@@ -147,6 +523,14 @@ namespace {
   private:
     typedef SetVector<Value*> ValueSeq;
 
+    IntegerType *getPmpyType() const {
+      LLVMContext &Ctx = CurLoop->getHeader()->getParent()->getContext();
+      return IntegerType::get(Ctx, 32);
+    }
+    bool isPromotableTo(Value *V, IntegerType *Ty);
+    void promoteTo(Instruction *In, IntegerType *DestTy, BasicBlock *LoopB);
+    bool promoteTypes(BasicBlock *LoopB, BasicBlock *ExitB);
+
     Value *getCountIV(BasicBlock *BB);
     bool findCycle(Value *Out, Value *In, ValueSeq &Cycle);
     void classifyCycle(Instruction *DivI, ValueSeq &Cycle, ValueSeq &Early,
@@ -176,6 +560,9 @@ namespace {
     unsigned getInverseMxN(unsigned QP);
     Value *generate(BasicBlock::iterator At, ParsedValues &PV);
 
+    void setupSimplifier();
+
+    Simplifier Simp;
     Loop *CurLoop;
     const DataLayout &DL;
     const DominatorTree &DT;
@@ -425,7 +812,6 @@ bool PolynomialMultiplyRecognize::scanSelect(SelectInst *SelI,
       BasicBlock *LoopB, BasicBlock *PrehB, Value *CIV, ParsedValues &PV,
       bool PreScan) {
   using namespace PatternMatch;
-
   // The basic pattern for R = P.Q is:
   // for i = 0..31
   //   R = phi (0, R')
@@ -529,6 +915,150 @@ bool PolynomialMultiplyRecognize::scanSelect(SelectInst *SelI,
 }
 
 
+bool PolynomialMultiplyRecognize::isPromotableTo(Value *Val,
+      IntegerType *DestTy) {
+  IntegerType *T = dyn_cast<IntegerType>(Val->getType());
+  if (!T || T->getBitWidth() > DestTy->getBitWidth())
+    return false;
+  if (T->getBitWidth() == DestTy->getBitWidth())
+    return true;
+  // Non-instructions are promotable. The reason why an instruction may not
+  // be promotable is that it may produce a different result if its operands
+  // and the result are promoted, for example, it may produce more non-zero
+  // bits. While it would still be possible to represent the proper result
+  // in a wider type, it may require adding additional instructions (which
+  // we don't want to do).
+  Instruction *In = dyn_cast<Instruction>(Val);
+  if (!In)
+    return true;
+  // The bitwidth of the source type is smaller than the destination.
+  // Check if the individual operation can be promoted.
+  switch (In->getOpcode()) {
+    case Instruction::PHI:
+    case Instruction::ZExt:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+    case Instruction::LShr: // Shift right is ok.
+    case Instruction::Select:
+      return true;
+    case Instruction::ICmp:
+      if (CmpInst *CI = cast<CmpInst>(In))
+        return CI->isEquality() || CI->isUnsigned();
+      llvm_unreachable("Cast failed unexpectedly");
+    case Instruction::Add:
+      return In->hasNoSignedWrap() && In->hasNoUnsignedWrap();
+  }
+  return false;
+}
+
+
+void PolynomialMultiplyRecognize::promoteTo(Instruction *In,
+      IntegerType *DestTy, BasicBlock *LoopB) {
+  // Widen the type of In to DestTy in place. Leave boolean values alone.
+  if (!In->getType()->isIntegerTy(1))
+    In->mutateType(DestTy);
+  unsigned DestBW = DestTy->getBitWidth();
+
+  if (PHINode *P = dyn_cast<PHINode>(In)) {
+    for (unsigned i = 0, N = P->getNumIncomingValues(); i != N; ++i) {
+      BasicBlock *InB = P->getIncomingBlock(i);
+      if (InB == LoopB)
+        continue;
+      Value *InV = P->getIncomingValue(i);
+      IntegerType *Ty = cast<IntegerType>(InV->getType());
+      // Values coming into an unpromoted (i1) PHI already have its type.
+      if (Ty != P->getType()) {
+        // The PHI was promoted: zero-extend the incoming value.
+        assert(Ty->getBitWidth() < DestBW);
+        InV = IRBuilder<>(InB->getTerminator()).CreateZExt(InV, DestTy);
+        P->setIncomingValue(i, InV);
+      }
+    }
+  } else if (ZExtInst *Z = dyn_cast<ZExtInst>(In)) {
+    // A zext whose operand already has the promoted type is redundant. Any
+    // other zext (e.g. of an i1) is still needed by its users, so keep it.
+    Value *Op = Z->getOperand(0);
+    if (Op->getType() == Z->getType()) {
+      Z->replaceAllUsesWith(Op);
+      Z->eraseFromParent();
+    }
+    return;
+  }
+
+  // Promote immediates, except for i1 ones since booleans are not promoted.
+  for (unsigned i = 0, n = In->getNumOperands(); i != n; ++i) {
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(In->getOperand(i)))
+      if (CI->getBitWidth() != 1 && CI->getBitWidth() < DestBW)
+        In->setOperand(i, ConstantInt::get(DestTy, CI->getZExtValue()));
+  }
+}
+
+
+bool PolynomialMultiplyRecognize::promoteTypes(BasicBlock *LoopB,
+      BasicBlock *ExitB) {
+  assert(LoopB);
+  // Skip loops where the exit block has more than one predecessor. The values
+  // coming from the loop block will be promoted to another type, and so the
+  // values coming into the exit block from other predecessors would also have
+  // to be promoted.
+  if (!ExitB || (ExitB->getSinglePredecessor() != LoopB))
+    return false;
+  IntegerType *DestTy = getPmpyType();
+  // Check if the exit values have types that are no wider than the type
+  // that we want to promote to.
+  unsigned DestBW = DestTy->getBitWidth();
+  for (Instruction &In : *ExitB) {
+    PHINode *P = dyn_cast<PHINode>(&In);
+    if (!P)
+      break;
+    if (P->getNumIncomingValues() != 1)
+      return false;
+    assert(P->getIncomingBlock(0) == LoopB);
+    IntegerType *T = dyn_cast<IntegerType>(P->getType());
+    if (!T || T->getBitWidth() > DestBW)
+      return false;
+  }
+
+  // Check all instructions in the loop.
+  for (Instruction &In : *LoopB)
+    if (!In.isTerminator() && !isPromotableTo(&In, DestTy))
+      return false;
+
+  // Perform the promotion.
+  std::vector<Instruction*> LoopIns;
+  std::transform(LoopB->begin(), LoopB->end(), std::back_inserter(LoopIns),
+                 [](Instruction &In) { return &In; });
+  for (Instruction *In : LoopIns)
+    promoteTo(In, DestTy, LoopB);
+
+  // Fix up the PHI nodes in the exit block.
+  Instruction *EndI = ExitB->getFirstNonPHI();
+  BasicBlock::iterator End = EndI ? EndI->getIterator() : ExitB->end();
+  for (auto I = ExitB->begin(); I != End; ++I) {
+    PHINode *P = dyn_cast<PHINode>(I);
+    if (!P)
+      break;
+    Type *Ty0 = P->getIncomingValue(0)->getType();
+    Type *PTy = P->getType();
+    if (PTy != Ty0) {
+      assert(Ty0 == DestTy);
+      // In order to create the trunc, P must have the promoted type.
+      P->mutateType(Ty0);
+      Value *T = IRBuilder<>(ExitB, End).CreateTrunc(P, PTy);
+      // In order for the RAUW to work, the types of P and T must match.
+      P->mutateType(PTy);
+      P->replaceAllUsesWith(T);
+      // Final update of the P's type.
+      P->mutateType(Ty0);
+      cast<Instruction>(T)->setOperand(0, P);
+    }
+  }
+
+  return true;
+}
+
+
 bool PolynomialMultiplyRecognize::findCycle(Value *Out, Value *In,
       ValueSeq &Cycle) {
   // Out = ..., In, ...
@@ -699,6 +1229,7 @@ bool PolynomialMultiplyRecognize::keepsHighBitsZero(Value *V,
       case Instruction::Select:
       case Instruction::ICmp:
       case Instruction::PHI:
+      case Instruction::ZExt:
         return true;
     }
   }
@@ -985,13 +1516,170 @@ Value *PolynomialMultiplyRecognize::generate(BasicBlock::iterator At,
 }
 
 
+void PolynomialMultiplyRecognize::setupSimplifier() {
+  // Add the rewrite rules to Simp; simplify() applies the first that matches.
+  Simp.addRule(
+    // Sink zext past bitwise operations.
+    [](Instruction *I, LLVMContext &Ctx) -> Value* {
+      if (I->getOpcode() != Instruction::ZExt)
+        return nullptr;
+      Instruction *T = dyn_cast<Instruction>(I->getOperand(0));
+      if (!T)
+        return nullptr;
+      switch (T->getOpcode()) {
+        case Instruction::And:
+        case Instruction::Or:
+        case Instruction::Xor:
+          break;
+        default:
+          return nullptr;
+      }
+      IRBuilder<> B(Ctx);
+      return B.CreateBinOp(cast<BinaryOperator>(T)->getOpcode(),
+                           B.CreateZExt(T->getOperand(0), I->getType()),
+                           B.CreateZExt(T->getOperand(1), I->getType()));
+    });
+  Simp.addRule(
+    // (xor (and x a) (and y a)) -> (and (xor x y) a)
+    [](Instruction *I, LLVMContext &Ctx) -> Value* {
+      if (I->getOpcode() != Instruction::Xor)
+        return nullptr;
+      Instruction *And0 = dyn_cast<Instruction>(I->getOperand(0));
+      Instruction *And1 = dyn_cast<Instruction>(I->getOperand(1));
+      if (!And0 || !And1)
+        return nullptr;
+      if (And0->getOpcode() != Instruction::And ||
+          And1->getOpcode() != Instruction::And)
+        return nullptr;
+      if (And0->getOperand(1) != And1->getOperand(1))
+        return nullptr;
+      IRBuilder<> B(Ctx);
+      return B.CreateAnd(B.CreateXor(And0->getOperand(0), And1->getOperand(0)),
+                         And0->getOperand(1));
+    });
+  Simp.addRule(
+    // (Op (select c x y) z) -> (select c (Op x z) (Op y z))
+    // (Op x (select c y z)) -> (select c (Op x y) (Op x z))
+    [](Instruction *I, LLVMContext &Ctx) -> Value* {
+      BinaryOperator *BO = dyn_cast<BinaryOperator>(I);
+      if (!BO)
+        return nullptr;
+      Instruction::BinaryOps Op = BO->getOpcode();
+      if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(0))) {
+        IRBuilder<> B(Ctx);
+        Value *X = Sel->getTrueValue(), *Y = Sel->getFalseValue();
+        Value *Z = BO->getOperand(1);
+        return B.CreateSelect(Sel->getCondition(),
+                              B.CreateBinOp(Op, X, Z),
+                              B.CreateBinOp(Op, Y, Z));
+      }
+      if (SelectInst *Sel = dyn_cast<SelectInst>(BO->getOperand(1))) {
+        IRBuilder<> B(Ctx);
+        Value *X = BO->getOperand(0);
+        Value *Y = Sel->getTrueValue(), *Z = Sel->getFalseValue();
+        return B.CreateSelect(Sel->getCondition(),
+                              B.CreateBinOp(Op, X, Y),
+                              B.CreateBinOp(Op, X, Z));
+      }
+      return nullptr;
+    });
+  Simp.addRule(
+    // (select c (select c x y) z) -> (select c x z)
+    // (select c x (select c y z)) -> (select c x z)
+    [](Instruction *I, LLVMContext &Ctx) -> Value* {
+      SelectInst *Sel = dyn_cast<SelectInst>(I);
+      if (!Sel)
+        return nullptr;
+      IRBuilder<> B(Ctx);
+      Value *C = Sel->getCondition();
+      if (SelectInst *Sel0 = dyn_cast<SelectInst>(Sel->getTrueValue())) {
+        if (Sel0->getCondition() == C)
+          return B.CreateSelect(C, Sel0->getTrueValue(), Sel->getFalseValue());
+      }
+      if (SelectInst *Sel1 = dyn_cast<SelectInst>(Sel->getFalseValue())) {
+        if (Sel1->getCondition() == C)
+          return B.CreateSelect(C, Sel->getTrueValue(), Sel1->getFalseValue());
+      }
+      return nullptr;
+    });
+  Simp.addRule(
+    // (or (lshr x 1) 0x800.0) -> (xor (lshr x 1) 0x800.0)
+    [](Instruction *I, LLVMContext &Ctx) -> Value* {
+      if (I->getOpcode() != Instruction::Or)
+        return nullptr;
+      Instruction *LShr = dyn_cast<Instruction>(I->getOperand(0));
+      if (!LShr || LShr->getOpcode() != Instruction::LShr)
+        return nullptr;
+      ConstantInt *One = dyn_cast<ConstantInt>(LShr->getOperand(1));
+      if (!One || One->getZExtValue() != 1)
+        return nullptr;
+      ConstantInt *Msb = dyn_cast<ConstantInt>(I->getOperand(1));
+      if (!Msb || Msb->getZExtValue() != Msb->getType()->getSignBit())
+        return nullptr;
+      return IRBuilder<>(Ctx).CreateXor(LShr, Msb);
+    });
+  Simp.addRule(
+    // (lshr (BitOp x y) c) -> (BitOp (lshr x c) (lshr y c))
+    [](Instruction *I, LLVMContext &Ctx) -> Value* {
+      if (I->getOpcode() != Instruction::LShr)
+        return nullptr;
+      BinaryOperator *BitOp = dyn_cast<BinaryOperator>(I->getOperand(0));
+      if (!BitOp)
+        return nullptr;
+      switch (BitOp->getOpcode()) {
+        case Instruction::And:
+        case Instruction::Or:
+        case Instruction::Xor:
+          break;
+        default:
+          return nullptr;
+      }
+      IRBuilder<> B(Ctx);
+      Value *S = I->getOperand(1);
+      return B.CreateBinOp(BitOp->getOpcode(),
+                B.CreateLShr(BitOp->getOperand(0), S),
+                B.CreateLShr(BitOp->getOperand(1), S));
+    });
+  Simp.addRule(
+    // (BitOp (BitOp x a) b) -> (BitOp x (BitOp a b))
+    // Both operations must be the same: each of and/or/xor is associative,
+    // but mixing them changes the value, e.g. (and (or x 0) 0) is 0, while
+    // (or x (and 0 0)) is x.
+    [](Instruction *I, LLVMContext &Ctx) -> Value* {
+      auto IsBitOp = [](unsigned Op) -> bool {
+        return Op == Instruction::And || Op == Instruction::Or ||
+               Op == Instruction::Xor;
+      };
+      BinaryOperator *BitOp1 = dyn_cast<BinaryOperator>(I);
+      if (!BitOp1 || !IsBitOp(BitOp1->getOpcode()))
+        return nullptr;
+      BinaryOperator *BitOp2 = dyn_cast<BinaryOperator>(BitOp1->getOperand(0));
+      if (!BitOp2 || BitOp2->getOpcode() != BitOp1->getOpcode())
+        return nullptr;
+      // Only reassociate when both a and b are constants.
+      ConstantInt *CA = dyn_cast<ConstantInt>(BitOp2->getOperand(1));
+      ConstantInt *CB = dyn_cast<ConstantInt>(BitOp1->getOperand(1));
+      if (!CA || !CB)
+        return nullptr;
+      IRBuilder<> B(Ctx);
+      Value *X = BitOp2->getOperand(0);
+      return B.CreateBinOp(BitOp2->getOpcode(), X,
+                B.CreateBinOp(BitOp1->getOpcode(), CA, CB));
+    });
+}
+
+
 bool PolynomialMultiplyRecognize::recognize() {
+  DEBUG(dbgs() << "Starting PolynomialMultiplyRecognize on loop\n"
+               << *CurLoop << '\n');
   // Restrictions:
   // - The loop must consist of a single block.
   // - The iteration count must be known at compile-time.
   // - The loop must have an induction variable starting from 0, and
   //   incremented in each iteration of the loop.
   BasicBlock *LoopB = CurLoop->getHeader();
+  DEBUG(dbgs() << "Loop header:\n" << *LoopB);
+
   if (LoopB != CurLoop->getLoopLatch())
     return false;
   BasicBlock *ExitB = CurLoop->getExitBlock();
@@ -1011,30 +1699,65 @@ bool PolynomialMultiplyRecognize::recognize() {
   Value *CIV = getCountIV(LoopB);
   ParsedValues PV;
   PV.IterCount = IterCount;
+  DEBUG(dbgs() << "Loop IV: " << *CIV << "\nIterCount: " << IterCount << '\n');
+
+  setupSimplifier();
+
+  // Perform a preliminary scan of select instructions to see if any of them
+  // looks like a generator of the polynomial multiply steps. Assume that a
+  // loop can only contain a single transformable operation, so stop the
+  // traversal after the first reasonable candidate was found.
+  // XXX: Currently this approach can modify the loop before being 100% sure
+  // that the transformation can be carried out.
+  bool FoundPreScan = false;
+  for (Instruction &In : *LoopB) {
+    SelectInst *SI = dyn_cast<SelectInst>(&In);
+    if (!SI)
+      continue;
 
-  // Test function to see if a given select instruction is a part of the
-  // pmpy pattern. The argument PreScan set to "true" indicates that only
-  // a preliminary scan is needed, "false" indicated an exact match.
-  auto CouldBePmpy = [this, LoopB, EntryB, CIV, &PV] (bool PreScan)
-      -> std::function<bool (Instruction &I)> {
-    return [this, LoopB, EntryB, CIV, &PV, PreScan] (Instruction &I) -> bool {
-      if (auto *SelI = dyn_cast<SelectInst>(&I))
-        return scanSelect(SelI, LoopB, EntryB, CIV, PV, PreScan);
-      return false;
-    };
-  };
-  auto PreF = std::find_if(LoopB->begin(), LoopB->end(), CouldBePmpy(true));
-  if (PreF == LoopB->end())
+    Simplifier::Context C(SI);
+    Value *T = Simp.simplify(C);
+    SelectInst *SelI = (T && isa<SelectInst>(T)) ? cast<SelectInst>(T) : SI;
+    DEBUG(dbgs() << "scanSelect(pre-scan): " << PE(C, SelI) << '\n');
+    if (scanSelect(SelI, LoopB, EntryB, CIV, PV, true)) {
+      FoundPreScan = true;
+      if (SelI != SI) {
+        Value *NewSel = C.materialize(LoopB, SI->getIterator());
+        SI->replaceAllUsesWith(NewSel);
+        RecursivelyDeleteTriviallyDeadInstructions(SI, &TLI);
+      }
+      break;
+    }
+  }
+
+  if (!FoundPreScan) {
+    DEBUG(dbgs() << "Have not found candidates for pmpy\n");
     return false;
+  }
 
   if (!PV.Left) {
+    // The right shift version actually only returns the higher bits of
+    // the result (each iteration discards the LSB). If we want to convert it
+    // to a left-shifting loop, the working data type must be at least as
+    // wide as the target's pmpy instruction.
+    if (!promoteTypes(LoopB, ExitB))
+      return false;
     convertShiftsToLeft(LoopB, ExitB, IterCount);
     cleanupLoopBody(LoopB);
   }
 
-  auto PostF = std::find_if(LoopB->begin(), LoopB->end(), CouldBePmpy(false));
-  if (PostF == LoopB->end())
-    return false;
+  // Scan the loop again, find the generating select instruction.
+  bool FoundScan = false;
+  for (Instruction &In : *LoopB) {
+    SelectInst *SelI = dyn_cast<SelectInst>(&In);
+    if (!SelI)
+      continue;
+    DEBUG(dbgs() << "scanSelect: " << *SelI << '\n');
+    FoundScan = scanSelect(SelI, LoopB, EntryB, CIV, PV, false);
+    if (FoundScan)
+      break;
+  }
+  assert(FoundScan);
 
   DEBUG({
     StringRef PP = (PV.M ? "(P+M)" : "P");
@@ -1175,9 +1898,9 @@ void HexagonLoopIdiomRecognize::collectStores(Loop *CurLoop, BasicBlock *BB,
 
 bool HexagonLoopIdiomRecognize::processCopyingStore(Loop *CurLoop,
       StoreInst *SI, const SCEV *BECount) {
-  assert(SI->isSimple() || (SI->isVolatile() && HexagonVolatileMemcpy) &&
-             "Expected only non-volatile stores, or Hexagon-specific memcpy"
-             "to volatile destination.");
+  assert((SI->isSimple() || (SI->isVolatile() && HexagonVolatileMemcpy)) &&
+         "Expected only non-volatile stores, or Hexagon-specific memcpy "
+         "to volatile destination.");
 
   Value *StorePtr = SI->getPointerOperand();
   auto *StoreEv = cast<SCEVAddRecExpr>(SE->getSCEV(StorePtr));
@@ -1423,8 +2146,7 @@ CleanupAndExit:
       Type *VoidTy = Type::getVoidTy(Ctx);
       Module *M = Func->getParent();
       Constant *CF = M->getOrInsertFunction(HexagonVolatileMemcpyName, VoidTy,
-                                            Int32PtrTy, Int32PtrTy, Int32Ty,
-                                            nullptr);
+                                            Int32PtrTy, Int32PtrTy, Int32Ty);
       Function *Fn = cast<Function>(CF);
       Fn->setLinkage(Function::ExternalLinkage);
 
diff --git a/lib/Target/Hexagon/HexagonMCInstLower.cpp b/lib/Target/Hexagon/HexagonMCInstLower.cpp
index 9d8c29463bf34e674ac221f8c218cc25f54ee6e4..7189b5a52c4242cc15a46fea0167e80c60bf669f 100644
--- a/lib/Target/Hexagon/HexagonMCInstLower.cpp
+++ b/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -111,9 +111,12 @@ void llvm::HexagonLowerToMC(const MCInstrInfo &MCII, const MachineInstr *MI,
     default:
       MI->print(errs());
       llvm_unreachable("unknown operand type");
+    case MachineOperand::MO_RegisterMask:
+      continue;
     case MachineOperand::MO_Register:
       // Ignore all implicit register operands.
-      if (MO.isImplicit()) continue;
+      if (MO.isImplicit())
+        continue;
       MCO = MCOperand::createReg(MO.getReg());
       break;
     case MachineOperand::MO_FPImmediate: {
diff --git a/lib/Target/Hexagon/HexagonMachineScheduler.cpp b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
index 9ff9d93ea0c3486ae714e4d0cb81237af33bb45b..20dc9b0da1dba84b9e403e0dd2e03dd62982108f 100644
--- a/lib/Target/Hexagon/HexagonMachineScheduler.cpp
+++ b/lib/Target/Hexagon/HexagonMachineScheduler.cpp
@@ -74,7 +74,9 @@ bool HexagonCallMutation::shouldTFRICallBind(const HexagonInstrInfo &HII,
     return false;
 
   // TypeXTYPE are 64 bit operations.
-  if (HII.getType(*Inst2.getInstr()) == HexagonII::TypeXTYPE)
+  unsigned Type = HII.getType(*Inst2.getInstr());
+  if (Type == HexagonII::TypeS_2op || Type == HexagonII::TypeS_3op ||
+    Type == HexagonII::TypeALU64 || Type == HexagonII::TypeM)
     return true;
   return false;
 }
diff --git a/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td b/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td
new file mode 100644
index 0000000000000000000000000000000000000000..0b4ac14c7a4757506869636b2b651ee70cefc0b5
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonMapAsm2IntrinV62.gen.td
@@ -0,0 +1,204 @@
+//===--- HexagonMapAsm2IntrinV62.gen.td -----------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// IntID -> MI for (VectorRegs, IntRegs); also the "_128B" (UseHVXDbl) pair.
+multiclass T_VR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VectorRegs:$src1, IntRegs:$src2),
+           (MI VectorRegs:$src1, IntRegs:$src2)>,
+       Requires<[UseHVXSgl]>;
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, IntRegs:$src2),
+           (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, IntRegs:$src2)>,
+       Requires<[UseHVXDbl]>;
+}
+// IntID -> MI for (VectorRegs, VectorRegs, IntRegsLow8); plus "_128B" pair.
+multiclass T_VVL_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, IntRegsLow8:$src3),
+           (MI VectorRegs:$src1, VectorRegs:$src2, IntRegsLow8:$src3)>,
+       Requires<[UseHVXSgl]>;
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, IntRegsLow8:$src3),
+           (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, IntRegsLow8:$src3)>,
+       Requires<[UseHVXDbl]>;
+}
+// IntID -> MI for (VectorRegs, VectorRegs); plus the "_128B" (UseHVXDbl) pair.
+multiclass T_VV_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2),
+           (MI VectorRegs:$src1, VectorRegs:$src2)>,
+       Requires<[UseHVXSgl]>;
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2),
+           (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2)>,
+       Requires<[UseHVXDbl]>;
+}
+// IntID -> MI for (VecDblRegs, VecDblRegs); plus the "_128B" (UseHVXDbl) pair.
+multiclass T_WW_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2),
+           (MI VecDblRegs:$src1, VecDblRegs:$src2)>,
+       Requires<[UseHVXSgl]>;
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, VecDblRegs128B:$src2),
+           (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, VecDblRegs128B:$src2)>,
+       Requires<[UseHVXDbl]>;
+}
+// IntID -> MI for (VecDblRegs, VectorRegs, VectorRegs); plus "_128B" pair.
+multiclass T_WVV_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3),
+           (MI VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3)>,
+       Requires<[UseHVXSgl]>;
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3),
+           (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3)>,
+       Requires<[UseHVXDbl]>;
+}
+// IntID -> MI for (VecDblRegs, IntRegs); plus the "_128B" (UseHVXDbl) pair.
+multiclass T_WR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecDblRegs:$src1, IntRegs:$src2),
+           (MI VecDblRegs:$src1, IntRegs:$src2)>,
+       Requires<[UseHVXSgl]>;
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, IntRegs:$src2),
+           (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, IntRegs:$src2)>,
+       Requires<[UseHVXDbl]>;
+}
+// IntID -> MI for (VecDblRegs, VecDblRegs, IntRegs); plus the "_128B" pair.
+multiclass T_WWR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3),
+           (MI VecDblRegs:$src1, VecDblRegs:$src2, IntRegs:$src3)>,
+       Requires<[UseHVXSgl]>;
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, VecDblRegs128B:$src2, IntRegs:$src3),
+           (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, VecDblRegs128B:$src2, IntRegs:$src3)>,
+       Requires<[UseHVXDbl]>;
+}
+// IntID -> MI for (VectorRegs, VectorRegs, IntRegs); plus the "_128B" pair.
+multiclass T_VVR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, IntRegs:$src3),
+           (MI VectorRegs:$src1, VectorRegs:$src2, IntRegs:$src3)>,
+       Requires<[UseHVXSgl]>;
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, IntRegs:$src3),
+           (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, IntRegs:$src3)>,
+       Requires<[UseHVXDbl]>;
+}
+// IntID -> MI for (VecPredRegs, IntRegs); plus the "_128B" (UseHVXDbl) pair.
+multiclass T_ZR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecPredRegs:$src1, IntRegs:$src2),
+           (MI VecPredRegs:$src1, IntRegs:$src2)>,
+       Requires<[UseHVXSgl]>;
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, IntRegs:$src2),
+           (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, IntRegs:$src2)>,
+       Requires<[UseHVXDbl]>;
+}
+// IntID -> MI for (VectorRegs, VecPredRegs, IntRegs); plus the "_128B" pair.
+multiclass T_VZR_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VectorRegs:$src1, VecPredRegs:$src2, IntRegs:$src3),
+           (MI VectorRegs:$src1, VecPredRegs:$src2, IntRegs:$src3)>,
+       Requires<[UseHVXSgl]>;
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, VecPredRegs128B:$src2, IntRegs:$src3),
+           (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, VecPredRegs128B:$src2, IntRegs:$src3)>,
+       Requires<[UseHVXDbl]>;
+}
+// IntID -> MI for (VecPredRegs, VectorRegs); plus the "_128B" (UseHVXDbl) pair.
+multiclass T_ZV_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecPredRegs:$src1, VectorRegs:$src2),
+           (MI VecPredRegs:$src1, VectorRegs:$src2)>,
+       Requires<[UseHVXSgl]>;
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, VectorRegs128B:$src2),
+           (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, VectorRegs128B:$src2)>,
+       Requires<[UseHVXDbl]>;
+}
+// IntID -> MI for a single IntRegs operand; plus the "_128B" (UseHVXDbl) pair.
+multiclass T_R_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID IntRegs:$src1),
+           (MI IntRegs:$src1)>,
+       Requires<[UseHVXSgl]>;
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") IntRegs:$src1),
+           (!cast<InstHexagon>(MI#"_128B") IntRegs:$src1)>,
+       Requires<[UseHVXDbl]>;
+}
+// IntID -> MI for (VecPredRegs, VecPredRegs); plus the "_128B" pair.
+multiclass T_ZZ_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecPredRegs:$src1, VecPredRegs:$src2),
+           (MI VecPredRegs:$src1, VecPredRegs:$src2)>,
+       Requires<[UseHVXSgl]>;
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecPredRegs128B:$src1, VecPredRegs128B:$src2),
+           (!cast<InstHexagon>(MI#"_128B") VecPredRegs128B:$src1, VecPredRegs128B:$src2)>,
+       Requires<[UseHVXDbl]>;
+}
+// IntID -> MI for (VectorRegs, VectorRegs, imm); plus the "_128B" pair.
+multiclass T_VVI_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, imm:$src3),
+           (MI VectorRegs:$src1, VectorRegs:$src2, imm:$src3)>,
+       Requires<[UseHVXSgl]>;
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, imm:$src3),
+           (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, imm:$src3)>,
+       Requires<[UseHVXDbl]>;
+}
+// IntID -> MI for (VectorRegs x3, imm); plus the "_128B" (UseHVXDbl) pair.
+multiclass T_VVVI_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, imm:$src4),
+           (MI VectorRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, imm:$src4)>,
+       Requires<[UseHVXSgl]>;
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3, imm:$src4),
+           (!cast<InstHexagon>(MI#"_128B") VectorRegs128B:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3, imm:$src4)>,
+       Requires<[UseHVXDbl]>;
+}
+// IntID -> MI for (VecDblRegs, VectorRegs, VectorRegs, imm); plus "_128B" pair.
+multiclass T_WVVI_HVX_gen_pat <InstHexagon MI, Intrinsic IntID> {
+  def: Pat<(IntID VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, imm:$src4),
+           (MI VecDblRegs:$src1, VectorRegs:$src2, VectorRegs:$src3, imm:$src4)>,
+       Requires<[UseHVXSgl]>;
+  def: Pat<(!cast<Intrinsic>(IntID#"_128B") VecDblRegs128B:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3, imm:$src4),
+           (!cast<InstHexagon>(MI#"_128B") VecDblRegs128B:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3, imm:$src4)>,
+       Requires<[UseHVXDbl]>;
+}
+// Mappings via T_R_pat/T_PP_pat (classes defined outside this file).
+def : T_R_pat <S6_vsplatrbp, int_hexagon_S6_vsplatrbp>;
+def : T_PP_pat <M6_vabsdiffb, int_hexagon_M6_vabsdiffb>;
+def : T_PP_pat <M6_vabsdiffub, int_hexagon_M6_vabsdiffub>;
+def : T_PP_pat <S6_vtrunehb_ppp, int_hexagon_S6_vtrunehb_ppp>;
+def : T_PP_pat <S6_vtrunohb_ppp, int_hexagon_S6_vtrunohb_ppp>;
+// Instantiate the multiclasses above: one instruction/intrinsic pair each.
+defm : T_VR_HVX_gen_pat <V6_vlsrb, int_hexagon_V6_vlsrb>;
+defm : T_VR_HVX_gen_pat <V6_vmpyiwub, int_hexagon_V6_vmpyiwub>;
+defm : T_VVL_HVX_gen_pat <V6_vasrwuhrndsat, int_hexagon_V6_vasrwuhrndsat>;
+defm : T_VVL_HVX_gen_pat <V6_vasruwuhrndsat, int_hexagon_V6_vasruwuhrndsat>;
+defm : T_VVL_HVX_gen_pat <V6_vasrhbsat, int_hexagon_V6_vasrhbsat>;
+defm : T_VVL_HVX_gen_pat <V6_vlutvvb_nm, int_hexagon_V6_vlutvvb_nm>;
+defm : T_VVL_HVX_gen_pat <V6_vlutvwh_nm, int_hexagon_V6_vlutvwh_nm>;
+defm : T_VV_HVX_gen_pat <V6_vrounduwuh, int_hexagon_V6_vrounduwuh>;
+defm : T_VV_HVX_gen_pat <V6_vrounduhub, int_hexagon_V6_vrounduhub>;
+defm : T_VV_HVX_gen_pat <V6_vadduwsat, int_hexagon_V6_vadduwsat>;
+defm : T_VV_HVX_gen_pat <V6_vsubuwsat, int_hexagon_V6_vsubuwsat>;
+defm : T_VV_HVX_gen_pat <V6_vaddbsat, int_hexagon_V6_vaddbsat>;
+defm : T_VV_HVX_gen_pat <V6_vsubbsat, int_hexagon_V6_vsubbsat>;
+defm : T_VV_HVX_gen_pat <V6_vaddububb_sat, int_hexagon_V6_vaddububb_sat>;
+defm : T_VV_HVX_gen_pat <V6_vsubububb_sat, int_hexagon_V6_vsubububb_sat>;
+defm : T_VV_HVX_gen_pat <V6_vmpyewuh_64, int_hexagon_V6_vmpyewuh_64>;
+defm : T_VV_HVX_gen_pat <V6_vmaxb, int_hexagon_V6_vmaxb>;
+defm : T_VV_HVX_gen_pat <V6_vminb, int_hexagon_V6_vminb>;
+defm : T_VV_HVX_gen_pat <V6_vsatuwuh, int_hexagon_V6_vsatuwuh>;
+defm : T_VV_HVX_gen_pat <V6_vaddclbw, int_hexagon_V6_vaddclbw>;
+defm : T_VV_HVX_gen_pat <V6_vaddclbh, int_hexagon_V6_vaddclbh>;
+defm : T_WW_HVX_gen_pat <V6_vadduwsat_dv, int_hexagon_V6_vadduwsat_dv>;
+defm : T_WW_HVX_gen_pat <V6_vsubuwsat_dv, int_hexagon_V6_vsubuwsat_dv>;
+defm : T_WW_HVX_gen_pat <V6_vaddbsat_dv, int_hexagon_V6_vaddbsat_dv>;
+defm : T_WW_HVX_gen_pat <V6_vsubbsat_dv, int_hexagon_V6_vsubbsat_dv>;
+defm : T_WVV_HVX_gen_pat <V6_vaddhw_acc, int_hexagon_V6_vaddhw_acc>;
+defm : T_WVV_HVX_gen_pat <V6_vadduhw_acc, int_hexagon_V6_vadduhw_acc>;
+defm : T_WVV_HVX_gen_pat <V6_vaddubh_acc, int_hexagon_V6_vaddubh_acc>;
+defm : T_WVV_HVX_gen_pat <V6_vmpyowh_64_acc, int_hexagon_V6_vmpyowh_64_acc>;
+defm : T_WR_HVX_gen_pat <V6_vmpauhb, int_hexagon_V6_vmpauhb>;
+defm : T_WWR_HVX_gen_pat <V6_vmpauhb_acc, int_hexagon_V6_vmpauhb_acc>;
+defm : T_VVR_HVX_gen_pat <V6_vmpyiwub_acc, int_hexagon_V6_vmpyiwub_acc>;
+defm : T_ZR_HVX_gen_pat <V6_vandnqrt, int_hexagon_V6_vandnqrt>;
+defm : T_VZR_HVX_gen_pat <V6_vandnqrt_acc, int_hexagon_V6_vandnqrt_acc>;
+defm : T_ZV_HVX_gen_pat <V6_vandvqv, int_hexagon_V6_vandvqv>;
+defm : T_ZV_HVX_gen_pat <V6_vandvnqv, int_hexagon_V6_vandvnqv>;
+defm : T_R_HVX_gen_pat <V6_pred_scalar2v2, int_hexagon_V6_pred_scalar2v2>;
+defm : T_R_HVX_gen_pat <V6_lvsplath, int_hexagon_V6_lvsplath>;
+defm : T_R_HVX_gen_pat <V6_lvsplatb, int_hexagon_V6_lvsplatb>;
+defm : T_ZZ_HVX_gen_pat <V6_shuffeqw, int_hexagon_V6_shuffeqw>;
+defm : T_ZZ_HVX_gen_pat <V6_shuffeqh, int_hexagon_V6_shuffeqh>;
+defm : T_VVI_HVX_gen_pat <V6_vlutvvbi, int_hexagon_V6_vlutvvbi>;
+defm : T_VVI_HVX_gen_pat <V6_vlutvwhi, int_hexagon_V6_vlutvwhi>;
+defm : T_VVVI_HVX_gen_pat <V6_vlutvvb_oracci, int_hexagon_V6_vlutvvb_oracci>;
+defm : T_WVVI_HVX_gen_pat <V6_vlutvwh_oracci, int_hexagon_V6_vlutvwh_oracci>;
diff --git a/lib/Target/Hexagon/HexagonNewValueJump.cpp b/lib/Target/Hexagon/HexagonNewValueJump.cpp
index 72d8011277e68a83cc7835f1048132f951f5c3f7..d73fc7c73185d807a04783c2db623845abf99f70 100644
--- a/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -130,6 +130,8 @@ static bool canBeFeederToNewValueJump(const HexagonInstrInfo *QII,
   if (II->getOpcode() == TargetOpcode::KILL)
     return false;
 
+  if (II->isImplicitDef())
+    return false;
 
   // Make sure there there is no 'def' or 'use' of any of the uses of
   // feeder insn between it's definition, this MI and jump, jmpInst
diff --git a/lib/Target/Hexagon/HexagonOperands.td b/lib/Target/Hexagon/HexagonOperands.td
index 9833105715637626f6b36e5d3d426e65af7d17db..f87a1b8e424dca2da8b0fca689907fb6d3b2b98f 100644
--- a/lib/Target/Hexagon/HexagonOperands.td
+++ b/lib/Target/Hexagon/HexagonOperands.td
@@ -1,298 +1,33 @@
-//===- HexagonImmediates.td - Hexagon immediate processing -*- tablegen -*-===//
+//===--- HexagonOperands.td -----------------------------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
-// This file is distributed under the University of Illnois Open Source
+// This file is distributed under the University of Illinois Open Source
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
 
-def s32_0ImmOperand : AsmOperandClass { let Name = "s32_0Imm"; }
-def s23_2ImmOperand : AsmOperandClass { let Name = "s23_2Imm"; }
-def s8_0ImmOperand : AsmOperandClass { let Name = "s8_0Imm"; }
-def s8_0Imm64Operand : AsmOperandClass { let Name = "s8_0Imm64"; }
-def s6_0ImmOperand : AsmOperandClass { let Name = "s6_0Imm"; }
-def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; }
-def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; }
-def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; }
-def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; }
-def s4_6ImmOperand : AsmOperandClass { let Name = "s4_6Imm"; }
-def s3_6ImmOperand : AsmOperandClass { let Name = "s3_6Imm"; }
-def u64_0ImmOperand : AsmOperandClass { let Name = "u64_0Imm"; }
-def u32_0ImmOperand : AsmOperandClass { let Name = "u32_0Imm"; }
-def u26_6ImmOperand : AsmOperandClass { let Name = "u26_6Imm"; }
-def u16_0ImmOperand : AsmOperandClass { let Name = "u16_0Imm"; }
-def u16_1ImmOperand : AsmOperandClass { let Name = "u16_1Imm"; }
-def u16_2ImmOperand : AsmOperandClass { let Name = "u16_2Imm"; }
-def u16_3ImmOperand : AsmOperandClass { let Name = "u16_3Imm"; }
-def u11_3ImmOperand : AsmOperandClass { let Name = "u11_3Imm"; }
-def u10_0ImmOperand : AsmOperandClass { let Name = "u10_0Imm"; }
-def u9_0ImmOperand : AsmOperandClass { let Name = "u9_0Imm"; }
-def u8_0ImmOperand : AsmOperandClass { let Name = "u8_0Imm"; }
-def u7_0ImmOperand : AsmOperandClass { let Name = "u7_0Imm"; }
-def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; }
-def u6_1ImmOperand : AsmOperandClass { let Name = "u6_1Imm"; }
-def u6_2ImmOperand : AsmOperandClass { let Name = "u6_2Imm"; }
-def u6_3ImmOperand : AsmOperandClass { let Name = "u6_3Imm"; }
-def u5_0ImmOperand : AsmOperandClass { let Name = "u5_0Imm"; }
-def u4_0ImmOperand : AsmOperandClass { let Name = "u4_0Imm"; }
-def u3_0ImmOperand : AsmOperandClass { let Name = "u3_0Imm"; }
-def u2_0ImmOperand : AsmOperandClass { let Name = "u2_0Imm"; }
-def u1_0ImmOperand : AsmOperandClass { let Name = "u1_0Imm"; }
-def n8_0ImmOperand : AsmOperandClass { let Name = "n8_0Imm"; }
-// Immediate operands.
-
-let OperandType = "OPERAND_IMMEDIATE",
-    DecoderMethod = "unsignedImmDecoder" in {
-  def s32_0Imm : Operand<i32> { let ParserMatchClass = s32_0ImmOperand;
-                                let DecoderMethod = "s32_0ImmDecoder"; }
-  def s23_2Imm : Operand<i32> { let ParserMatchClass = s23_2ImmOperand; }
-  def s8_0Imm : Operand<i32> { let ParserMatchClass = s8_0ImmOperand;
-                               let DecoderMethod = "s8_0ImmDecoder"; }
-  def s8_0Imm64 : Operand<i64>  { let ParserMatchClass = s8_0Imm64Operand;
-                                  let DecoderMethod = "s8_0ImmDecoder"; }
-  def s6_0Imm : Operand<i32> { let ParserMatchClass = s6_0ImmOperand;
-                             let DecoderMethod = "s6_0ImmDecoder"; }
-  def s6_3Imm : Operand<i32>;
-  def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand;
-                               let DecoderMethod = "s4_0ImmDecoder"; }
-  def s4_1Imm : Operand<i32> { let ParserMatchClass = s4_1ImmOperand;
-                               let DecoderMethod = "s4_1ImmDecoder"; }
-  def s4_2Imm : Operand<i32> { let ParserMatchClass = s4_2ImmOperand;
-                               let DecoderMethod = "s4_2ImmDecoder"; }
-  def s4_3Imm : Operand<i32> { let ParserMatchClass = s4_3ImmOperand;
-                               let DecoderMethod = "s4_3ImmDecoder"; }
-  def u64_0Imm : Operand<i64> { let ParserMatchClass = u64_0ImmOperand; }
-  def u32_0Imm : Operand<i32> { let ParserMatchClass = u32_0ImmOperand; }
-  def u26_6Imm : Operand<i32> { let ParserMatchClass = u26_6ImmOperand; }
-  def u16_0Imm : Operand<i32> { let ParserMatchClass = u16_0ImmOperand; }
-  def u16_1Imm : Operand<i32> { let ParserMatchClass = u16_1ImmOperand; }
-  def u16_2Imm : Operand<i32> { let ParserMatchClass = u16_2ImmOperand; }
-  def u16_3Imm : Operand<i32> { let ParserMatchClass = u16_3ImmOperand; }
-  def u11_3Imm : Operand<i32> { let ParserMatchClass = u11_3ImmOperand; }
-  def u10_0Imm : Operand<i32> { let ParserMatchClass = u10_0ImmOperand; }
-  def u9_0Imm : Operand<i32> { let ParserMatchClass = u9_0ImmOperand; }
-  def u8_0Imm : Operand<i32> { let ParserMatchClass = u8_0ImmOperand; }
-  def u7_0Imm : Operand<i32> { let ParserMatchClass = u7_0ImmOperand; }
-  def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; }
-  def u6_1Imm : Operand<i32> { let ParserMatchClass = u6_1ImmOperand; }
-  def u6_2Imm : Operand<i32> { let ParserMatchClass = u6_2ImmOperand; }
-  def u6_3Imm : Operand<i32> { let ParserMatchClass = u6_3ImmOperand; }
-  def u5_0Imm : Operand<i32> { let ParserMatchClass = u5_0ImmOperand; }
-  def u5_1Imm : Operand<i32>;
-  def u5_2Imm : Operand<i32>;
-  def u5_3Imm : Operand<i32>;
-  def u4_0Imm : Operand<i32> { let ParserMatchClass = u4_0ImmOperand; }
-  def u4_1Imm : Operand<i32>;
-  def u4_2Imm : Operand<i32>;
-  def u4_3Imm : Operand<i32>;
-  def u3_0Imm : Operand<i32> { let ParserMatchClass = u3_0ImmOperand; }
-  def u3_1Imm : Operand<i32>;
-  def u3_2Imm : Operand<i32>;
-  def u3_3Imm : Operand<i32>;
-  def u2_0Imm : Operand<i32> { let ParserMatchClass = u2_0ImmOperand; }
-  def u1_0Imm : Operand<i32> { let ParserMatchClass = u1_0ImmOperand; }
-  def n8_0Imm : Operand<i32> { let ParserMatchClass = n8_0ImmOperand; }
-}
-
-let OperandType = "OPERAND_IMMEDIATE" in {
-  def s4_6Imm : Operand<i32> { let ParserMatchClass = s4_6ImmOperand;
-                               let PrintMethod = "prints4_6ImmOperand";
-                               let DecoderMethod = "s4_6ImmDecoder";}
-  def s4_7Imm : Operand<i32> { let PrintMethod = "prints4_7ImmOperand";
-                               let DecoderMethod = "s4_6ImmDecoder";}
-  def s3_6Imm : Operand<i32> { let ParserMatchClass = s3_6ImmOperand;
-                               let PrintMethod = "prints3_6ImmOperand";
-                               let DecoderMethod = "s3_6ImmDecoder";}
-  def s3_7Imm : Operand<i32> { let PrintMethod = "prints3_7ImmOperand";
-                               let DecoderMethod = "s3_6ImmDecoder";}
-}
-def n1ConstOperand : AsmOperandClass { let Name = "n1Const"; }
-def n1Const : Operand<i32> { let ParserMatchClass = n1ConstOperand; }
-
-//
-// Immediate predicates
-//
-def s32_0ImmPred  : PatLeaf<(i32 imm), [{
+def f32ImmOperand : AsmOperandClass { let Name = "f32Imm"; }
+def f32Imm : Operand<f32> { let ParserMatchClass = f32ImmOperand; }
+def f64ImmOperand : AsmOperandClass { let Name = "f64Imm"; }
+def f64Imm : Operand<f64> { let ParserMatchClass = f64ImmOperand; }
+def s8_0Imm64Pred  : PatLeaf<(i64 imm), [{ return isInt<8>(N->getSExtValue()); }]>;
+def s9_0ImmOperand : AsmOperandClass { let Name = "s9_0Imm"; }
+def s9_0Imm : Operand<i32> { let ParserMatchClass = s9_0ImmOperand; }
+def s23_2ImmOperand : AsmOperandClass { let Name = "s23_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s23_2Imm : Operand<i32> { let ParserMatchClass = s23_2ImmOperand; }
+def r32_0ImmPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
   return isInt<32>(v);
 }]>;
-
-def s31_1ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isShiftedInt<31,1>(v);
-}]>;
-
-def s30_2ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isShiftedInt<30,2>(v);
-}]>;
-
-def s29_3ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isShiftedInt<29,3>(v);
-}]>;
-
-def s10_0ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isInt<10>(v);
-}]>;
-
-def s8_0ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isInt<8>(v);
-}]>;
-
-def s8_0Imm64Pred  : PatLeaf<(i64 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isInt<8>(v);
-}]>;
-
-def s6_0ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isInt<6>(v);
-}]>;
-
-def s4_0ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isInt<4>(v);
-}]>;
-
-def s4_1ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isShiftedInt<4,1>(v);
-}]>;
-
-def s4_2ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isShiftedInt<4,2>(v);
-}]>;
-
-def s4_3ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isShiftedInt<4,3>(v);
-}]>;
-
-def u32_0ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isUInt<32>(v);
-}]>;
-
-def u16_0ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isUInt<16>(v);
-}]>;
-
-def u11_3ImmPred : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isShiftedUInt<11,3>(v);
-}]>;
-
 def u9_0ImmPred  : PatLeaf<(i32 imm), [{
   int64_t v = (int64_t)N->getSExtValue();
   return isUInt<9>(v);
 }]>;
-
-def u8_0ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isUInt<8>(v);
-}]>;
-
-def u6_0ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isUInt<6>(v);
-}]>;
-
-def u6_1ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isShiftedUInt<6,1>(v);
-}]>;
-
-def u6_2ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isShiftedUInt<6,2>(v);
-}]>;
-
-def u5_0ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isUInt<5>(v);
-}]>;
-
-def u4_0ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isUInt<4>(v);
-}]>;
-
-def u3_0ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isUInt<3>(v);
-}]>;
-
-def u2_0ImmPred  : PatLeaf<(i32 imm), [{
-  int64_t v = (int64_t)N->getSExtValue();
-  return isUInt<2>(v);
-}]>;
-
-// Extendable immediate operands.
-def f32ExtOperand : AsmOperandClass { let Name = "f32Ext"; }
-def s16_0ExtOperand : AsmOperandClass { let Name = "s16_0Ext"; }
-def s12_0ExtOperand : AsmOperandClass { let Name = "s12_0Ext"; }
-def s10_0ExtOperand : AsmOperandClass { let Name = "s10_0Ext"; }
-def s9_0ExtOperand : AsmOperandClass { let Name = "s9_0Ext"; }
-def s8_0ExtOperand : AsmOperandClass { let Name = "s8_0Ext"; }
-def s7_0ExtOperand : AsmOperandClass { let Name = "s7_0Ext"; }
-def s6_0ExtOperand : AsmOperandClass { let Name = "s6_0Ext"; }
-def s11_0ExtOperand : AsmOperandClass { let Name = "s11_0Ext"; }
-def s11_1ExtOperand : AsmOperandClass { let Name = "s11_1Ext"; }
-def s11_2ExtOperand : AsmOperandClass { let Name = "s11_2Ext"; }
-def s11_3ExtOperand : AsmOperandClass { let Name = "s11_3Ext"; }
-def u6_0ExtOperand : AsmOperandClass { let Name = "u6_0Ext"; }
-def u7_0ExtOperand : AsmOperandClass { let Name = "u7_0Ext"; }
-def u8_0ExtOperand : AsmOperandClass { let Name = "u8_0Ext"; }
-def u9_0ExtOperand : AsmOperandClass { let Name = "u9_0Ext"; }
-def u10_0ExtOperand : AsmOperandClass { let Name = "u10_0Ext"; }
-def u6_1ExtOperand : AsmOperandClass { let Name = "u6_1Ext"; }
-def u6_2ExtOperand : AsmOperandClass { let Name = "u6_2Ext"; }
-def u6_3ExtOperand : AsmOperandClass { let Name = "u6_3Ext"; }
-def u32_0MustExtOperand : AsmOperandClass { let Name = "u32_0MustExt"; }
-
-
-
-let OperandType = "OPERAND_IMMEDIATE", PrintMethod = "printExtOperand",
-    DecoderMethod = "unsignedImmDecoder" in {
-  def f32Ext : Operand<f32> { let ParserMatchClass = f32ExtOperand; }
-  def s16_0Ext : Operand<i32> { let ParserMatchClass = s16_0ExtOperand;
-                                let DecoderMethod = "s16_0ImmDecoder"; }
-  def s12_0Ext : Operand<i32> { let ParserMatchClass = s12_0ExtOperand;
-                                let DecoderMethod = "s12_0ImmDecoder"; }
-  def s11_0Ext : Operand<i32> { let ParserMatchClass = s11_0ExtOperand;
-                                let DecoderMethod = "s11_0ImmDecoder"; }
-  def s11_1Ext : Operand<i32> { let ParserMatchClass = s11_1ExtOperand;
-                                let DecoderMethod = "s11_1ImmDecoder"; }
-  def s11_2Ext : Operand<i32> { let ParserMatchClass = s11_2ExtOperand;
-                                let DecoderMethod = "s11_2ImmDecoder"; }
-  def s11_3Ext : Operand<i32> { let ParserMatchClass = s11_3ExtOperand;
-                                let DecoderMethod = "s11_3ImmDecoder"; }
-  def s10_0Ext : Operand<i32> { let ParserMatchClass = s10_0ExtOperand;
-                                let DecoderMethod = "s10_0ImmDecoder"; }
-  def s9_0Ext : Operand<i32> { let ParserMatchClass = s9_0ExtOperand;
-                               let DecoderMethod = "s9_0ImmDecoder"; }
-  def s8_0Ext : Operand<i32> { let ParserMatchClass = s8_0ExtOperand;
-                               let DecoderMethod = "s8_0ImmDecoder"; }
-  def s7_0Ext : Operand<i32> { let ParserMatchClass = s7_0ExtOperand; }
-  def s6_0Ext : Operand<i32> { let ParserMatchClass = s6_0ExtOperand;
-                               let DecoderMethod = "s6_0ImmDecoder"; }
-  def u7_0Ext : Operand<i32> { let ParserMatchClass = u7_0ExtOperand; }
-  def u8_0Ext : Operand<i32> { let ParserMatchClass = u8_0ExtOperand; }
-  def u9_0Ext : Operand<i32> { let ParserMatchClass = u9_0ExtOperand; }
-  def u10_0Ext : Operand<i32> { let ParserMatchClass = u10_0ExtOperand; }
-  def u6_0Ext : Operand<i32> { let ParserMatchClass = u6_0ExtOperand; }
-  def u6_1Ext : Operand<i32> { let ParserMatchClass = u6_1ExtOperand; }
-  def u6_2Ext : Operand<i32> { let ParserMatchClass = u6_2ExtOperand; }
-  def u6_3Ext : Operand<i32> { let ParserMatchClass = u6_3ExtOperand; }
-  def u32_0MustExt : Operand<i32> { let ParserMatchClass = u32_0MustExtOperand; }
-}
-
+def u64_0ImmOperand : AsmOperandClass { let Name = "u64_0Imm"; let RenderMethod = "addImmOperands"; }
+def u64_0Imm : Operand<i64> { let ParserMatchClass = u64_0ImmOperand; }
+def n1ConstOperand : AsmOperandClass { let Name = "n1Const"; }
+def n1Const : Operand<i32> { let ParserMatchClass = n1ConstOperand; }
 
 // This complex pattern exists only to create a machine instruction operand
 // of type "frame index". There doesn't seem to be a way to do that directly
@@ -305,28 +40,6 @@ def AddrFI : ComplexPattern<i32, 1, "SelectAddrFI", [frameindex], []>;
 def AddrGA : ComplexPattern<i32, 1, "SelectAddrGA", [], []>;
 def AddrGP : ComplexPattern<i32, 1, "SelectAddrGP", [], []>;
 
-// Address operands.
-
-let PrintMethod = "printGlobalOperand" in {
-  def globaladdress : Operand<i32>;
-  def globaladdressExt : Operand<i32>;
-}
-
-let PrintMethod = "printJumpTable" in
-def jumptablebase : Operand<i32>;
-
-def brtarget : Operand<OtherVT> {
-  let DecoderMethod = "brtargetDecoder";
-  let PrintMethod = "printBrtarget";
-}
-def brtargetExt : Operand<OtherVT> {
-  let DecoderMethod = "brtargetDecoder";
-  let PrintMethod = "printBrtarget";
-}
-def calltarget : Operand<i32> {
-  let DecoderMethod = "brtargetDecoder";
-  let PrintMethod = "printBrtarget";
-}
 
 def bblabel : Operand<i32>;
 def bbl     : SDNode<"ISD::BasicBlock", SDTPtrLeaf, [], "BasicBlockSDNode">;
diff --git a/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/lib/Target/Hexagon/HexagonOptAddrMode.cpp
index f99a0645507c63d642bfb5477149e690b75a41d6..be50288849ca7fbc6f1d2b243750aa1d60d165ae 100644
--- a/lib/Target/Hexagon/HexagonOptAddrMode.cpp
+++ b/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -208,7 +208,16 @@ bool HexagonOptAddrMode::allValidCandidates(NodeAddr<StmtNode *> SA,
     NodeAddr<UseNode *> UN = *I;
     RegisterRef UR = UN.Addr->getRegRef(*DFG);
     NodeSet Visited, Defs;
-    const auto &ReachingDefs = LV->getAllReachingDefsRec(UR, UN, Visited, Defs);
+    const auto &P = LV->getAllReachingDefsRec(UR, UN, Visited, Defs);
+    if (!P.second) {
+      DEBUG({
+        dbgs() << "*** Unable to collect all reaching defs for use ***\n"
+               << PrintNode<UseNode*>(UN, *DFG) << '\n'
+               << "The program's complexity may exceed the limits.\n";
+      });
+      return false;
+    }
+    const auto &ReachingDefs = P.first;
     if (ReachingDefs.size() > 1) {
       DEBUG({
         dbgs() << "*** Multiple Reaching Defs found!!! ***\n";
@@ -230,7 +239,7 @@ void HexagonOptAddrMode::getAllRealUses(NodeAddr<StmtNode *> SA,
   for (NodeAddr<DefNode *> DA : SA.Addr->members_if(DFG->IsDef, *DFG)) {
     DEBUG(dbgs() << "\t\t[DefNode]: " << Print<NodeAddr<DefNode *>>(DA, *DFG)
                  << "\n");
-    RegisterRef DR = DFG->normalizeRef(DA.Addr->getRegRef(*DFG));
+    RegisterRef DR = DFG->getPRI().normalize(DA.Addr->getRegRef(*DFG));
 
     auto UseSet = LV->getAllReachedUses(DR, DA);
 
@@ -617,7 +626,7 @@ bool HexagonOptAddrMode::constructDefMap(MachineBasicBlock *B) {
 
   for (NodeAddr<InstrNode *> IA : BA.Addr->members(*DFG)) {
     updateMap(IA);
-    DFG->pushDefs(IA, DefM);
+    DFG->pushAllDefs(IA, DefM);
   }
 
   MachineDomTreeNode *N = MDT->getNode(B);
diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td
index ad81287007e61e9415df9e8bdaa427a6415c57a9..b8c3bf0745cee53214d54d81754f0e2fa86e6d49 100644
--- a/lib/Target/Hexagon/HexagonPatterns.td
+++ b/lib/Target/Hexagon/HexagonPatterns.td
@@ -17,6 +17,16 @@ def HiReg: OutPatFrag<(ops node:$Rs), (EXTRACT_SUBREG (i64 $Rs), isub_hi)>;
 def IsOrAdd: PatFrag<(ops node:$Addr, node:$off),
     (or node:$Addr, node:$off), [{ return isOrEquivalentToAdd(N); }]>;
 
+def Iss4_6 : PatLeaf<(i32 imm), [{
+  int32_t V = N->getSExtValue();
+  return isShiftedInt<4,6>(V);
+}]>;
+
+def Iss4_7 : PatLeaf<(i32 imm), [{
+  int32_t V = N->getSExtValue();
+  return isShiftedInt<4,7>(V);
+}]>;
+
 def IsPow2_32 : PatLeaf<(i32 imm), [{
   uint32_t V = N->getZExtValue();
   return isPowerOf2_32(V);
@@ -89,6 +99,11 @@ def LogN2_64 : SDNodeXForm<imm, [{
   return CurDAG->getTargetConstant(Log2_64(NV), SDLoc(N), MVT::i32);
 }]>;
 
+def ToZext64: OutPatFrag<(ops node:$Rs),
+  (i64 (A4_combineir 0, (i32 $Rs)))>;
+def ToSext64: OutPatFrag<(ops node:$Rs),
+  (i64 (A2_sxtw (i32 $Rs)))>;
+
 
 class T_CMP_pat <InstHexagon MI, PatFrag OpNode, PatLeaf ImmPred>
   : Pat<(i1 (OpNode I32:$src1, ImmPred:$src2)),
@@ -153,8 +168,12 @@ def: Pat<(sub s32_0ImmPred:$s10, IntRegs:$Rs),
 def: Pat<(not I32:$src1),
          (A2_subri -1, IntRegs:$src1)>;
 
+def TruncI64ToI32: SDNodeXForm<imm, [{
+  return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i32);
+}]>;
+
 def: Pat<(s32_0ImmPred:$s16), (A2_tfrsi imm:$s16)>;
-def: Pat<(s8_0Imm64Pred:$s8), (A2_tfrpi imm:$s8)>;
+def: Pat<(s8_0Imm64Pred:$s8), (A2_tfrpi (TruncI64ToI32 $s8))>;
 
 def : Pat<(select I1:$Pu, s32_0ImmPred:$s8, I32:$Rs),
           (C2_muxri I1:$Pu, imm:$s8, I32:$Rs)>;
@@ -274,7 +293,7 @@ def retflag : SDNode<"HexagonISD::RET_FLAG", SDTNone,
                      [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
 def eh_return: SDNode<"HexagonISD::EH_RETURN", SDTNone, [SDNPHasChain]>;
 
-def: Pat<(br bb:$dst),                  (J2_jump brtarget:$dst)>;
+def: Pat<(br bb:$dst),                  (J2_jump b30_2Imm:$dst)>;
 def: Pat<(brcond I1:$src1, bb:$block),  (J2_jumpt PredRegs:$src1, bb:$block)>;
 def: Pat<(brind I32:$dst),              (J2_jumpr IntRegs:$dst)>;
 
@@ -695,8 +714,8 @@ def HexagonCONST32    : SDNode<"HexagonISD::CONST32",    SDTHexagonCONST32>;
 def HexagonCONST32_GP : SDNode<"HexagonISD::CONST32_GP", SDTHexagonCONST32>;
 
 // Map TLS addressses to A2_tfrsi.
-def: Pat<(HexagonCONST32 tglobaltlsaddr:$addr), (A2_tfrsi s16_0Ext:$addr)>;
-def: Pat<(HexagonCONST32 bbl:$label),           (A2_tfrsi s16_0Ext:$label)>;
+def: Pat<(HexagonCONST32 tglobaltlsaddr:$addr), (A2_tfrsi s32_0Imm:$addr)>;
+def: Pat<(HexagonCONST32 bbl:$label),           (A2_tfrsi s32_0Imm:$label)>;
 
 def: Pat<(i64 imm:$v), (CONST64 imm:$v)>;
 def: Pat<(i1 0), (PS_false)>;
@@ -898,26 +917,35 @@ def: Pat<(i1 (setule I64:$src1, I64:$src2)),
          (C2_not (C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2))>;
 
 // Sign extends.
-// i1 -> i32
-def: Pat<(i32 (sext I1:$src1)),
-         (C2_muxii PredRegs:$src1, -1, 0)>;
+// sext i1->i32: select -1 when the predicate is set, 0 otherwise.
+def: Pat<(i32 (sext I1:$Pu)),
+         (C2_muxii PredRegs:$Pu, -1, 0)>;
 
-// i1 -> i64
-def: Pat<(i64 (sext I1:$src1)),
-         (A2_combinew (A2_tfrsi -1), (C2_muxii PredRegs:$src1, -1, 0))>;
+// sext i1->i64
+def: Pat<(i64 (sext I1:$Pu)),
+         (A2_combinew (C2_muxii PredRegs:$Pu, -1, 0),
+                      (C2_muxii PredRegs:$Pu, -1, 0))>;
 
 // Zero extends.
-// i1 -> i32
-def: Pat<(i32 (zext I1:$src1)),
-         (C2_muxii PredRegs:$src1, 1, 0)>;
+// zext i1->i32
+def: Pat<(i32 (zext I1:$Pu)),
+         (C2_muxii PredRegs:$Pu, 1, 0)>;
+
+// zext i1->i64
+def: Pat<(i64 (zext I1:$Pu)),
+         (ToZext64 (C2_muxii PredRegs:$Pu, 1, 0))>;
+
+// zext i32->i64
+def: Pat<(Zext64 I32:$Rs),
+         (ToZext64 IntRegs:$Rs)>;
 
 // Map from Rs = Pd to Pd = mux(Pd, #1, #0)
-def: Pat<(i32 (anyext I1:$src1)),
-         (C2_muxii PredRegs:$src1, 1, 0)>;
+def: Pat<(i32 (anyext I1:$Pu)),
+         (C2_muxii PredRegs:$Pu, 1, 0)>;
 
-// Map from Rss = Pd to Rdd = sxtw (mux(Pd, #1, #0))
-def: Pat<(i64 (anyext I1:$src1)),
-         (A2_sxtw (C2_muxii PredRegs:$src1, 1, 0))>;
+// Map from Rss = Pd to Rdd = combine(#0, (mux(Pd, #1, #0)))
+def: Pat<(i64 (anyext I1:$Pu)),
+         (ToZext64 (C2_muxii PredRegs:$Pu, 1, 0))>;
 
 // Clear the sign bit in a 64-bit register.
 def ClearSign : OutPatFrag<(ops node:$Rss),
@@ -1244,11 +1272,6 @@ def: Pat<(HexagonCOMBINE s32_0ImmPred:$s8, s8_0ImmPred:$S8),
 }
 
 
-def ToZext64: OutPatFrag<(ops node:$Rs),
-  (i64 (A4_combineir 0, (i32 $Rs)))>;
-def ToSext64: OutPatFrag<(ops node:$Rs),
-  (i64 (A2_sxtw (i32 $Rs)))>;
-
 // Patterns to generate indexed loads with different forms of the address:
 // - frameindex,
 // - base + offset,
@@ -1349,14 +1372,6 @@ let AddedComplexity = 20 in {
   def: Loadxs_simple_pat<load,        i64, L4_loadrd_rr>;
 }
 
-// zext i1->i64
-def: Pat<(i64 (zext I1:$src1)),
-         (ToZext64 (C2_muxii PredRegs:$src1, 1, 0))>;
-
-// zext i32->i64
-def: Pat<(Zext64 I32:$src1),
-         (ToZext64 IntRegs:$src1)>;
-
 let AddedComplexity = 40 in
 multiclass T_StoreAbsReg_Pats <InstHexagon MI, RegisterClass RC, ValueType VT,
                            PatFrag stOp> {
@@ -1587,6 +1602,15 @@ def: Pat<(i64 (cttz I64:$Rss)), (ToZext64 (S2_ct0p I64:$Rss))>;
 def: Pat<(i64 (ctlz (not I64:$Rss))), (ToZext64 (S2_cl1p I64:$Rss))>;
 def: Pat<(i64 (cttz (not I64:$Rss))), (ToZext64 (S2_ct1p I64:$Rss))>;
 
+def: Pat<(i64 (ctpop I64:$Rss)), (ToZext64 (S5_popcountp I64:$Rss))>;
+def: Pat<(i32 (ctpop I32:$Rs)), (S5_popcountp (A4_combineir 0, I32:$Rs))>;
+
+def: Pat<(bitreverse I32:$Rs), (S2_brev I32:$Rs)>;
+def: Pat<(bitreverse I64:$Rss), (S2_brevp I64:$Rss)>;
+
+def: Pat<(bswap I32:$Rs), (A2_swiz I32:$Rs)>;
+def: Pat<(bswap I64:$Rss), (A2_combinew (A2_swiz (LoReg $Rss)),
+                                        (A2_swiz (HiReg $Rss)))>;
 
 let AddedComplexity = 20 in {   // Complexity greater than cmp reg-imm.
   def: Pat<(i1 (seteq (and (shl 1, u5_0ImmPred:$u5), I32:$Rs), 0)),
@@ -2235,12 +2259,6 @@ def ftoi : SDNodeXForm<fpimm, [{
 def: Pat<(sra (i64 (add (sra I64:$src1, u6_0ImmPred:$src2), 1)), (i32 1)),
          (S2_asr_i_p_rnd I64:$src1, imm:$src2)>;
 
-def SDTHexagonI32I64: SDTypeProfile<1, 1, [SDTCisVT<0, i32>,
-                                           SDTCisVT<1, i64>]>;
-def HexagonPOPCOUNT: SDNode<"HexagonISD::POPCOUNT", SDTHexagonI32I64>;
-
-def: Pat<(HexagonPOPCOUNT I64:$Rss), (S5_popcountp I64:$Rss)>;
-
 let AddedComplexity = 20 in {
   defm: Loadx_pat<load, f32, s30_2ImmPred, L2_loadri_io>;
   defm: Loadx_pat<load, f64, s29_3ImmPred, L2_loadrd_io>;
@@ -2718,17 +2736,6 @@ def unalignedstore : PatFrag<(ops node:$val, node:$addr), (store $val, $addr), [
 }]>;
 
 
-def s4_6ImmPred: PatLeaf<(i32 imm), [{
-  int64_t V = N->getSExtValue();
-  return isShiftedInt<4,6>(V);
-}]>;
-
-def s4_7ImmPred: PatLeaf<(i32 imm), [{
-  int64_t V = N->getSExtValue();
-  return isShiftedInt<4,7>(V);
-}]>;
-
-
 multiclass vS32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
   // Aligned stores
   def : Pat<(alignedstore (VTSgl VectorRegs:$src1), IntRegs:$addr),
@@ -2749,25 +2756,25 @@ multiclass vS32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
   // Fold Add R+OFF into vector store.
   let AddedComplexity = 10 in {
     def : Pat<(alignedstore (VTSgl VectorRegs:$src1),
-                     (add IntRegs:$src2, s4_6ImmPred:$offset)),
-              (V6_vS32b_ai IntRegs:$src2, s4_6ImmPred:$offset,
+                     (add IntRegs:$src2, Iss4_6:$offset)),
+              (V6_vS32b_ai IntRegs:$src2, Iss4_6:$offset,
                            (VTSgl VectorRegs:$src1))>,
               Requires<[UseHVXSgl]>;
     def : Pat<(unalignedstore (VTSgl VectorRegs:$src1),
-                     (add IntRegs:$src2, s4_6ImmPred:$offset)),
-              (V6_vS32Ub_ai IntRegs:$src2, s4_6ImmPred:$offset,
+                     (add IntRegs:$src2, Iss4_6:$offset)),
+              (V6_vS32Ub_ai IntRegs:$src2, Iss4_6:$offset,
                            (VTSgl VectorRegs:$src1))>,
               Requires<[UseHVXSgl]>;
 
     // Fold Add R+OFF into vector store 128B.
     def : Pat<(alignedstore (VTDbl VectorRegs128B:$src1),
-                     (add IntRegs:$src2, s4_7ImmPred:$offset)),
-              (V6_vS32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset,
+                     (add IntRegs:$src2, Iss4_7:$offset)),
+              (V6_vS32b_ai_128B IntRegs:$src2, Iss4_7:$offset,
                                 (VTDbl VectorRegs128B:$src1))>,
               Requires<[UseHVXDbl]>;
     def : Pat<(unalignedstore (VTDbl VectorRegs128B:$src1),
-                     (add IntRegs:$src2, s4_7ImmPred:$offset)),
-              (V6_vS32Ub_ai_128B IntRegs:$src2, s4_7ImmPred:$offset,
+                     (add IntRegs:$src2, Iss4_7:$offset)),
+              (V6_vS32Ub_ai_128B IntRegs:$src2, Iss4_7:$offset,
                                 (VTDbl VectorRegs128B:$src1))>,
               Requires<[UseHVXDbl]>;
   }
@@ -2798,18 +2805,18 @@ multiclass vL32b_ai_pats <ValueType VTSgl, ValueType VTDbl> {
 
   // Fold Add R+OFF into vector load.
   let AddedComplexity = 10 in {
-    def : Pat<(VTDbl (alignedload (add IntRegs:$src2, s4_7ImmPred:$offset))),
-              (V6_vL32b_ai_128B IntRegs:$src2, s4_7ImmPred:$offset)>,
+    def : Pat<(VTDbl (alignedload (add IntRegs:$src2, Iss4_7:$offset))),
+              (V6_vL32b_ai_128B IntRegs:$src2, Iss4_7:$offset)>,
                Requires<[UseHVXDbl]>;
-    def : Pat<(VTDbl (unalignedload (add IntRegs:$src2, s4_7ImmPred:$offset))),
-              (V6_vL32Ub_ai_128B IntRegs:$src2, s4_7ImmPred:$offset)>,
+    def : Pat<(VTDbl (unalignedload (add IntRegs:$src2, Iss4_7:$offset))),
+              (V6_vL32Ub_ai_128B IntRegs:$src2, Iss4_7:$offset)>,
                Requires<[UseHVXDbl]>;
 
-    def : Pat<(VTSgl (alignedload (add IntRegs:$src2, s4_6ImmPred:$offset))),
-              (V6_vL32b_ai IntRegs:$src2, s4_6ImmPred:$offset)>,
+    def : Pat<(VTSgl (alignedload (add IntRegs:$src2, Iss4_6:$offset))),
+              (V6_vL32b_ai IntRegs:$src2, Iss4_6:$offset)>,
               Requires<[UseHVXSgl]>;
-    def : Pat<(VTSgl (unalignedload (add IntRegs:$src2, s4_6ImmPred:$offset))),
-              (V6_vL32Ub_ai IntRegs:$src2, s4_6ImmPred:$offset)>,
+    def : Pat<(VTSgl (unalignedload (add IntRegs:$src2, Iss4_6:$offset))),
+              (V6_vL32Ub_ai IntRegs:$src2, Iss4_6:$offset)>,
               Requires<[UseHVXSgl]>;
   }
 }
@@ -3253,8 +3260,8 @@ def vmpyh: OutPatFrag<(ops node:$Rs, node:$Rt),
                       (M2_vmpy2s_s0 (i32 $Rs), (i32 $Rt))>;
 
 def: Pat<(v2i16 (mul V2I16:$Rs, V2I16:$Rt)),
-         (LoReg (S2_vtrunewh (v2i32 (A2_combineii 0, 0)),
-                             (v2i32 (vmpyh V2I16:$Rs, V2I16:$Rt))))>;
+         (LoReg (S2_vtrunewh (A2_combineii 0, 0),
+                             (vmpyh V2I16:$Rs, V2I16:$Rt)))>;
 
 // Multiplies two v4i16 vectors.
 def: Pat<(v4i16 (mul V4I16:$Rs, V4I16:$Rt)),
@@ -3345,3 +3352,11 @@ def: Pat<(v2i32 (zextloadv2i8 I32:$Rs)),
 def: Pat<(v2i32 (sextloadv2i8 I32:$Rs)),
          (S2_vsxthw (LoReg (v4i16 (S2_vsxtbh (L2_loadrh_io I32:$Rs, 0)))))>;
 
+
+// Read cycle counter.
+//
+def SDTInt64Leaf: SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>;
+def HexagonREADCYCLE: SDNode<"HexagonISD::READCYCLE", SDTInt64Leaf,
+  [SDNPHasChain]>;
+
+def: Pat<(HexagonREADCYCLE), (A4_tfrcpp UPCYCLE)>;
diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td
new file mode 100644
index 0000000000000000000000000000000000000000..5a720e794562d6a05e5eed5bad66bdff248f8ec6
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonPseudo.td
@@ -0,0 +1,537 @@
+//===--- HexagonPseudo.td -------------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+let PrintMethod = "printGlobalOperand" in {
+  def globaladdress : Operand<i32>;
+  def globaladdressExt : Operand<i32>;
+}
+
+let isPseudo = 1 in {
+let isCodeGenOnly = 0 in
+def A2_iconst : Pseudo<(outs IntRegs:$Rd32), (ins s23_2Imm:$Ii), "${Rd32}=iconst(#${Ii})">;
+def DUPLEX_Pseudo : InstHexagon<(outs), (ins s32_0Imm:$offset), "DUPLEX", [], "", DUPLEX, TypePSEUDO>;
+}
+
+let isExtendable = 1, opExtendable = 1, opExtentBits = 6,
+    isAsmParserOnly = 1 in
+def TFRI64_V2_ext : ALU64_rr<(outs DoubleRegs:$dst),
+                             (ins s32_0Imm:$src1, s8_0Imm:$src2),
+                             "$dst=combine(#$src1,#$src2)">;
+
+// HI/LO Instructions
+let isReMaterializable = 1, isMoveImm = 1, hasSideEffects = 0,
+    hasNewValue = 1, opNewValue = 0 in
+class REG_IMMED<string RegHalf, bit Rs, bits<3> MajOp, bit MinOp>
+  : InstHexagon<(outs IntRegs:$dst),
+              (ins u16_0Imm:$imm_value),
+              "$dst"#RegHalf#"=#$imm_value", [], "", ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>, OpcodeHexagon {
+    bits<5> dst;
+    bits<32> imm_value;
+
+    let Inst{27} = Rs;
+    let Inst{26-24} = MajOp;
+    let Inst{21} = MinOp;
+    let Inst{20-16} = dst;
+    let Inst{23-22} = imm_value{15-14};
+    let Inst{13-0} = imm_value{13-0};
+}
+
+let isAsmParserOnly = 1 in {
+  def LO : REG_IMMED<".l", 0b0, 0b001, 0b1>;
+  def HI : REG_IMMED<".h", 0b0, 0b010, 0b1>;
+}
+
+let isReMaterializable = 1, isMoveImm = 1, isAsmParserOnly = 1 in {
+  def CONST32 : CONSTLDInst<(outs IntRegs:$Rd), (ins i32imm:$v),
+                "$Rd = CONST32(#$v)", []>;
+  def CONST64 : CONSTLDInst<(outs DoubleRegs:$Rd), (ins i64imm:$v),
+                "$Rd = CONST64(#$v)", []>;
+}
+
+let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
+    isCodeGenOnly = 1 in
+def PS_true : SInst<(outs PredRegs:$dst), (ins), "", []>;
+
+let hasSideEffects = 0, isReMaterializable = 1, isPseudo = 1,
+    isCodeGenOnly = 1 in
+def PS_false : SInst<(outs PredRegs:$dst), (ins), "", []>;
+
+let Defs = [R29, R30], Uses = [R31, R30, R29], isPseudo = 1 in
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt),
+                              ".error \"should not emit\" ", []>;
+
+let Defs = [R29, R30, R31], Uses = [R29], isPseudo = 1 in
+def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2),
+                             ".error \"should not emit\" ", []>;
+
+
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
+    Defs = [PC, LC0], Uses = [SA0, LC0] in {
+def ENDLOOP0 : Endloop<(outs), (ins b30_2Imm:$offset),
+                       ":endloop0",
+                       []>;
+}
+
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0,
+    Defs = [PC, LC1], Uses = [SA1, LC1] in {
+def ENDLOOP1 : Endloop<(outs), (ins b30_2Imm:$offset),
+                       ":endloop1",
+                       []>;
+}
+
+let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
+    opExtendable = 0, hasSideEffects = 0 in
+class LOOP_iBase<string mnemonic, Operand brOp, bit mustExtend = 0>
+         : CRInst<(outs), (ins brOp:$offset, u10_0Imm:$src2),
+           #mnemonic#"($offset,#$src2)",
+           [], "" , CR_tc_3x_SLOT3> {
+    bits<9> offset;
+    bits<10> src2;
+
+    let IClass = 0b0110;
+
+    let Inst{27-22} = 0b100100;
+    let Inst{21} = !if (!eq(mnemonic, "loop0"), 0b0, 0b1);
+    let Inst{20-16} = src2{9-5};
+    let Inst{12-8} = offset{8-4};
+    let Inst{7-5} = src2{4-2};
+    let Inst{4-3} = offset{3-2};
+    let Inst{1-0} = src2{1-0};
+}
+
+let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
+    opExtendable = 0, hasSideEffects = 0 in
+class LOOP_rBase<string mnemonic, Operand brOp, bit mustExtend = 0>
+         : CRInst<(outs), (ins brOp:$offset, IntRegs:$src2),
+           #mnemonic#"($offset,$src2)",
+           [], "" ,CR_tc_3x_SLOT3> {
+    bits<9> offset;
+    bits<5> src2;
+
+    let IClass = 0b0110;
+
+    let Inst{27-22} = 0b000000;
+    let Inst{21} = !if (!eq(mnemonic, "loop0"), 0b0, 0b1);
+    let Inst{20-16} = src2;
+    let Inst{12-8} = offset{8-4};
+    let Inst{4-3} = offset{3-2};
+  }
+
+multiclass LOOP_ri<string mnemonic> {
+  let isCodeGenOnly = 1, isExtended = 1, opExtendable = 0 in {
+    def iext: LOOP_iBase<mnemonic, b30_2Imm, 1>;
+    def rext: LOOP_rBase<mnemonic, b30_2Imm, 1>;
+  }
+}
+
+
+let Defs = [SA0, LC0, USR] in
+defm J2_loop0 : LOOP_ri<"loop0">;
+
+// Interestingly, only the loop0 forms appear to set usr.lpcfg.
+let Defs = [SA1, LC1] in
+defm J2_loop1 : LOOP_ri<"loop1">;
+
+let isCall = 1, hasSideEffects = 1, isPredicable = 0,
+    isExtended = 0, isExtendable = 1, opExtendable = 0,
+    isExtentSigned = 1, opExtentBits = 24, opExtentAlign = 2 in
+class T_Call<string ExtStr>
+  : JInst<(outs), (ins a30_2Imm:$dst),
+      "call " # ExtStr # "$dst", [], "", J_tc_2early_SLOT23> {
+  let BaseOpcode = "call";
+  bits<24> dst;
+
+  let IClass = 0b0101;
+  let Inst{27-25} = 0b101;
+  let Inst{24-16,13-1} = dst{23-2};
+  let Inst{0} = 0b0;
+}
+
+let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1, Defs = [R16],
+    isPredicable = 0 in
+def CALLProfile :  T_Call<"">;
+
+let isCodeGenOnly = 1, isCall = 1, hasSideEffects = 1,
+    Defs = [PC, R31, R6, R7, P0] in
+def PS_call_stk : T_Call<"">;
+
+let isCall = 1, hasSideEffects = 1, cofMax1 = 1 in
+class JUMPR_MISC_CALLR<bit isPred, bit isPredNot,
+               dag InputDag = (ins IntRegs:$Rs)>
+  : JInst<(outs), InputDag,
+      !if(isPred, !if(isPredNot, "if (!$Pu) callr $Rs",
+                                 "if ($Pu) callr $Rs"),
+                                 "callr $Rs"),
+      [], "", J_tc_2early_SLOT2> {
+    bits<5> Rs;
+    bits<2> Pu;
+    let isPredicated = isPred;
+    let isPredicatedFalse = isPredNot;
+
+    let IClass = 0b0101;
+    let Inst{27-25} = 0b000;
+    let Inst{24-23} = !if (isPred, 0b10, 0b01);
+    let Inst{22} = 0;
+    let Inst{21} = isPredNot;
+    let Inst{9-8} = !if (isPred, Pu, 0b00);
+    let Inst{20-16} = Rs;
+
+  }
+
+let isCodeGenOnly = 1 in {
+  def PS_callr_nr : JUMPR_MISC_CALLR<0, 1>; // Call, no return.
+}
+
+let isCall = 1, hasSideEffects = 1,
+    isExtended = 0, isExtendable = 1, opExtendable = 0, isCodeGenOnly = 1,
+    BaseOpcode = "PS_call_nr", isExtentSigned = 1, opExtentAlign = 2,
+    Itinerary = J_tc_2early_SLOT23 in
+class Call_nr<bits<5> nbits, bit isPred, bit isFalse, dag iops>
+  : Pseudo<(outs), iops, "">, PredRel {
+    bits<2> Pu;
+    bits<17> dst;
+    let opExtentBits = nbits;
+    let isPredicable = 0;  // !if(isPred, 0, 1);
+    let isPredicated = 0;  // isPred;
+    let isPredicatedFalse = isFalse;
+}
+
+def PS_call_nr : Call_nr<24, 0, 0, (ins s32_0Imm:$Ii)>;
+//def PS_call_nrt: Call_nr<17, 1, 0, (ins PredRegs:$Pu, s32_0Imm:$dst)>;
+//def PS_call_nrf: Call_nr<17, 1, 1, (ins PredRegs:$Pu, s32_0Imm:$dst)>;
+
+let isBranch = 1, isIndirectBranch = 1, isBarrier = 1, Defs = [PC],
+    isPredicable = 1, hasSideEffects = 0, InputType = "reg",
+    cofMax1 = 1 in
+class T_JMPr
+  :  InstHexagon<(outs), (ins IntRegs:$dst), "jumpr $dst", [],
+                 "", J_tc_2early_SLOT2, TypeJ>, OpcodeHexagon {
+    bits<5> dst;
+
+    let IClass = 0b0101;
+    let Inst{27-21} = 0b0010100;
+    let Inst{20-16} = dst;
+}
+
+// A return through builtin_eh_return.
+let isReturn = 1, isTerminator = 1, isBarrier = 1, hasSideEffects = 0,
+    isCodeGenOnly = 1, Defs = [PC], Uses = [R28], isPredicable = 0 in
+def EH_RETURN_JMPR : T_JMPr;
+
+// Indirect tail-call.
+let isPseudo = 1, isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0,
+    isTerminator = 1, isCodeGenOnly = 1 in
+def PS_tailcall_r : T_JMPr;
+
+//
+// Direct tail-calls.
+let isPseudo = 1, isCall = 1, isReturn = 1, isBarrier = 1, isPredicable = 0,
+    isTerminator = 1, isCodeGenOnly = 1 in
+def PS_tailcall_i : Pseudo<(outs), (ins a30_2Imm:$dst), "", []>;
+
+let isCodeGenOnly = 1, isPseudo = 1, Uses = [R30], hasSideEffects = 0 in
+def PS_aligna : Pseudo<(outs IntRegs:$Rd), (ins u32_0Imm:$A), "", []>;
+
+// Generate frameindex addresses. The main reason for the offset operand is
+// that every instruction that is allowed to have frame index as an operand
+// will then have that operand followed by an immediate operand (the offset).
+// This simplifies the frame-index elimination code.
+//
+let isMoveImm = 1, isAsCheapAsAMove = 1, isReMaterializable = 1,
+    isPseudo = 1, isCodeGenOnly = 1, hasSideEffects = 0 in {
+  def PS_fi  : Pseudo<(outs IntRegs:$Rd),
+                         (ins IntRegs:$fi, s32_0Imm:$off), "">;
+  def PS_fia : Pseudo<(outs IntRegs:$Rd),
+                         (ins IntRegs:$Rs, IntRegs:$fi, s32_0Imm:$off), "">;
+}
+
+class CondStr<string CReg, bit True, bit New> {
+  string S = "if (" # !if(True,"","!") # CReg # !if(New,".new","") # ") ";
+}
+class JumpOpcStr<string Mnemonic, bit New, bit Taken> {
+  string S = Mnemonic # !if(Taken, ":t", ":nt");
+}
+let isBranch = 1, isIndirectBranch = 1, Defs = [PC], isPredicated = 1,
+    hasSideEffects = 0, InputType = "reg", cofMax1 = 1 in
+class T_JMPr_c <bit PredNot, bit isPredNew, bit isTak>
+  :  InstHexagon<(outs), (ins PredRegs:$src, IntRegs:$dst),
+                 CondStr<"$src", !if(PredNot,0,1), isPredNew>.S #
+                 JumpOpcStr<"jumpr", isPredNew, isTak>.S # " $dst",
+                 [], "", J_tc_2early_SLOT2, TypeJ>, OpcodeHexagon {
+
+    let isTaken = isTak;
+    let isPredicatedFalse = PredNot;
+    let isPredicatedNew = isPredNew;
+    bits<2> src;
+    bits<5> dst;
+
+    let IClass = 0b0101;
+
+    let Inst{27-22} = 0b001101;
+    let Inst{21} = PredNot;
+    let Inst{20-16} = dst;
+    let Inst{12} = isTak;
+    let Inst{11} = isPredNew;
+    let Inst{9-8} = src;
+}
+multiclass JMPR_Pred<bit PredNot> {
+  def NAME        : T_JMPr_c<PredNot, 0, 0>; // not taken
+  // Predicate new
+  def NAME#newpt  : T_JMPr_c<PredNot, 1, 1>; // taken
+  def NAME#new    : T_JMPr_c<PredNot, 1, 0>; // not taken
+}
+multiclass JMPR_base<string BaseOp> {
+  let BaseOpcode = BaseOp in {
+    def NAME : T_JMPr;
+    defm t : JMPR_Pred<0>;
+    defm f : JMPR_Pred<1>;
+  }
+}
+let isTerminator = 1, hasSideEffects = 0, isReturn = 1, isCodeGenOnly = 1, isBarrier = 1 in
+defm PS_jmpret : JMPR_base<"JMPret">, PredNewRel;
+
+//defm V6_vtran2x2_map : HexagonMapping<(outs VectorRegs:$Vy32, VectorRegs:$Vx32), (ins VectorRegs:$Vx32in, IntRegs:$Rt32), "vtrans2x2(${Vy32},${Vx32},${Rt32})", (V6_vshuff VectorRegs:$Vy32, VectorRegs:$Vx32, VectorRegs:$Vx32in, IntRegs:$Rt32)>;
+
+// The reason for the custom inserter is to record all ALLOCA instructions
+// in MachineFunctionInfo.
+let Defs = [R29], isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 1 in
+def PS_alloca: InstHexagon<(outs IntRegs:$Rd),
+      (ins IntRegs:$Rs, u32_0Imm:$A), "",
+      [], "", ALU32_2op_tc_1_SLOT0123, TypeALU32_2op>;
+
+// Load predicate.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+    isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def LDriw_pred : LDInst<(outs PredRegs:$dst),
+                        (ins IntRegs:$addr, s32_0Imm:$off),
+                        ".error \"should not emit\"", []>;
+
+// Load modifier.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+    isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def LDriw_mod : LDInst<(outs ModRegs:$dst),
+                        (ins IntRegs:$addr, s32_0Imm:$off),
+                        ".error \"should not emit\"", []>;
+
+// Vector load
+let Predicates = [HasV60T, UseHVX] in
+let mayLoad = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in
+  class V6_LDInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                  string cstr = "", InstrItinClass itin = CVI_VM_LD,
+                  IType type = TypeCVI_VM_LD>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
+
+// Vector store
+let Predicates = [HasV60T, UseHVX] in
+let mayStore = 1, validSubTargets = HasV60SubT, hasSideEffects = 0 in
+class V6_STInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+                string cstr = "", InstrItinClass itin = CVI_VM_ST,
+                IType type = TypeCVI_VM_ST>
+: InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
+
+let isCodeGenOnly = 1, isPseudo = 1 in
+def PS_pselect : ALU64_rr<(outs DoubleRegs:$Rd),
+      (ins PredRegs:$Pu, DoubleRegs:$Rs, DoubleRegs:$Rt),
+      ".error \"should not emit\" ", []>;
+
+let isBranch = 1, isBarrier = 1, Defs = [PC], hasSideEffects = 0,
+    isPredicable = 1,
+    isExtendable = 1, opExtendable = 0, isExtentSigned = 1,
+    opExtentBits = 24, opExtentAlign = 2, InputType = "imm" in
+class T_JMP<string ExtStr>
+  : JInst_CJUMP_UCJUMP<(outs), (ins b30_2Imm:$dst),
+      "jump " # ExtStr # "$dst",
+      [], "", J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT> {
+    bits<24> dst;
+    let IClass = 0b0101;
+
+    let Inst{27-25} = 0b100;
+    let Inst{24-16} = dst{23-15};
+    let Inst{13-1} = dst{14-2};
+}
+
+// Restore registers and dealloc return function call.
+let isCall = 1, isBarrier = 1, isReturn = 1, isTerminator = 1,
+    Defs = [R29, R30, R31, PC], isPredicable = 0, isAsmParserOnly = 1 in {
+  def RESTORE_DEALLOC_RET_JMP_V4 : T_JMP<"">;
+
+  let isExtended = 1, opExtendable = 0 in
+  def RESTORE_DEALLOC_RET_JMP_V4_EXT : T_JMP<"">;
+
+  let Defs = [R14, R15, R28, R29, R30, R31, PC] in {
+    def RESTORE_DEALLOC_RET_JMP_V4_PIC : T_JMP<"">;
+
+    let isExtended = 1, opExtendable = 0 in
+    def RESTORE_DEALLOC_RET_JMP_V4_EXT_PIC : T_JMP<"">;
+  }
+}
+
+// Restore registers and dealloc frame before a tail call.
+let isCall = 1, Defs = [R29, R30, R31, PC], isAsmParserOnly = 1 in {
+  def RESTORE_DEALLOC_BEFORE_TAILCALL_V4 : T_Call<"">, PredRel;
+
+  let isExtended = 1, opExtendable = 0 in
+  def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT : T_Call<"">, PredRel;
+
+  let Defs = [R14, R15, R28, R29, R30, R31, PC] in {
+    def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC : T_Call<"">, PredRel;
+
+    let isExtended = 1, opExtendable = 0 in
+    def RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT_PIC : T_Call<"">, PredRel;
+  }
+}
+
+// Save registers function call.
+let isCall = 1, Uses = [R29, R31], isAsmParserOnly = 1 in {
+  def SAVE_REGISTERS_CALL_V4 : T_Call<"">, PredRel;
+
+  let isExtended = 1, opExtendable = 0 in
+  def SAVE_REGISTERS_CALL_V4_EXT : T_Call<"">, PredRel;
+
+  let Defs = [P0] in
+  def SAVE_REGISTERS_CALL_V4STK : T_Call<"">, PredRel;
+
+  let Defs = [P0], isExtended = 1, opExtendable = 0 in
+  def SAVE_REGISTERS_CALL_V4STK_EXT : T_Call<"">, PredRel;
+
+  let Defs = [R14, R15, R28] in
+  def SAVE_REGISTERS_CALL_V4_PIC : T_Call<"">, PredRel;
+
+  let Defs = [R14, R15, R28], isExtended = 1, opExtendable = 0 in
+  def SAVE_REGISTERS_CALL_V4_EXT_PIC : T_Call<"">, PredRel;
+
+  let Defs = [R14, R15, R28, P0] in
+  def SAVE_REGISTERS_CALL_V4STK_PIC : T_Call<"">, PredRel;
+
+  let Defs = [R14, R15, R28, P0], isExtended = 1, opExtendable = 0 in
+  def SAVE_REGISTERS_CALL_V4STK_EXT_PIC : T_Call<"">, PredRel;
+}
+
+// Vector load/store pseudos
+
+let isPseudo = 1, isCodeGenOnly = 1, validSubTargets = HasV60SubT in
+class STrivv_template<RegisterClass RC>
+  : V6_STInst<(outs), (ins IntRegs:$addr, s32_0Imm:$off, RC:$src), "", []>;
+
+def PS_vstorerw_ai: STrivv_template<VecDblRegs>,
+      Requires<[HasV60T,UseHVXSgl]>;
+def PS_vstorerwu_ai: STrivv_template<VecDblRegs>,
+      Requires<[HasV60T,UseHVXSgl]>;
+def PS_vstorerw_ai_128B: STrivv_template<VecDblRegs128B>,
+      Requires<[HasV60T,UseHVXDbl]>;
+def PS_vstorerwu_ai_128B: STrivv_template<VecDblRegs128B>,
+      Requires<[HasV60T,UseHVXDbl]>;
+
+
+let isPseudo = 1, isCodeGenOnly = 1, validSubTargets = HasV60SubT in
+class LDrivv_template<RegisterClass RC>
+  : V6_LDInst<(outs RC:$dst), (ins IntRegs:$addr, s32_0Imm:$off), "", []>;
+
+def PS_vloadrw_ai: LDrivv_template<VecDblRegs>,
+      Requires<[HasV60T,UseHVXSgl]>;
+def PS_vloadrwu_ai: LDrivv_template<VecDblRegs>,
+      Requires<[HasV60T,UseHVXSgl]>;
+def PS_vloadrw_ai_128B: LDrivv_template<VecDblRegs128B>,
+      Requires<[HasV60T,UseHVXDbl]>;
+def PS_vloadrwu_ai_128B: LDrivv_template<VecDblRegs128B>,
+      Requires<[HasV60T,UseHVXDbl]>;
+
+// Vector predicate spill pseudos: unsuffixed forms use VecPredRegs/UseHVXSgl,
+// _128B forms use VecPredRegs128B/UseHVXDbl. Stores take the predicate last.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
+    isCodeGenOnly = 1, isPseudo = 1, mayStore = 1, hasSideEffects = 0 in {
+  def PS_vstorerq_ai : STInst<(outs),
+        (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs:$src1),
+        ".error \"should not emit\" ", []>, Requires<[HasV60T,UseHVXSgl]>;
+  def PS_vstorerq_ai_128B : STInst<(outs),
+        (ins IntRegs:$base, s32_0Imm:$offset, VecPredRegs128B:$src1),
+        ".error \"should not emit\" ", []>, Requires<[HasV60T,UseHVXDbl]>;
+}
+
+// Reloads define the predicate (same operand shape as LDriw_pred), so the
+// extendable offset is operand 2.
+let isExtendable = 1, opExtendable = 2, isExtentSigned = 1, opExtentBits = 13,
+    isCodeGenOnly = 1, isPseudo = 1, mayLoad = 1, hasSideEffects = 0 in {
+  def PS_vloadrq_ai : LDInst<(outs VecPredRegs:$dst),
+        (ins IntRegs:$base, s32_0Imm:$offset),
+        ".error \"should not emit\" ", []>, Requires<[HasV60T,UseHVXSgl]>;
+  def PS_vloadrq_ai_128B : LDInst<(outs VecPredRegs128B:$dst),
+        (ins IntRegs:$base, s32_0Imm:$offset),
+        ".error \"should not emit\" ", []>, Requires<[HasV60T,UseHVXDbl]>;
+}
+
+class VSELInst<dag outs, dag ins, string asmstr, list<dag> pattern = [],
+              string cstr = "", InstrItinClass itin = CVI_VA_DV,
+              IType type = TypeCVI_VA_DV>
+  : InstHexagon<outs, ins, asmstr, pattern, cstr, itin, type>;
+
+let isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in {
+  def PS_vselect: VSELInst<(outs VectorRegs:$dst),
+        (ins PredRegs:$src1, VectorRegs:$src2, VectorRegs:$src3), "", []>,
+        Requires<[HasV60T,UseHVXSgl]>;
+  def PS_vselect_128B: VSELInst<(outs VectorRegs128B:$dst),
+        (ins PredRegs:$src1, VectorRegs128B:$src2, VectorRegs128B:$src3),
+        "", []>, Requires<[HasV60T,UseHVXDbl]>;
+  def PS_wselect: VSELInst<(outs VecDblRegs:$dst),
+        (ins PredRegs:$src1, VecDblRegs:$src2, VecDblRegs:$src3), "", []>,
+        Requires<[HasV60T,UseHVXSgl]>;
+  def PS_wselect_128B: VSELInst<(outs VecDblRegs128B:$dst),
+        (ins PredRegs:$src1, VecDblRegs128B:$src2, VecDblRegs128B:$src3),
+        "", []>, Requires<[HasV60T,UseHVXDbl]>;
+}
+
+// Store predicate.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
+    isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def STriw_pred : STInst<(outs),
+      (ins IntRegs:$addr, s32_0Imm:$off, PredRegs:$src1),
+      ".error \"should not emit\"", []>;
+// Store modifier.
+let isExtendable = 1, opExtendable = 1, isExtentSigned = 1, opExtentBits = 13,
+    isCodeGenOnly = 1, isPseudo = 1, hasSideEffects = 0 in
+def STriw_mod : STInst<(outs),
+      (ins IntRegs:$addr, s32_0Imm:$off, ModRegs:$src1),
+      ".error \"should not emit\"", []>;
+
+let isExtendable = 1, opExtendable = 1, opExtentBits = 6,
+    isAsmParserOnly = 1 in
+def TFRI64_V4 : ALU64_rr<(outs DoubleRegs:$dst), (ins u64_0Imm:$src1),
+                         "$dst = #$src1">;
+
+// Hexagon doesn't have a vector multiply with C semantics.
+// Instead, generate a pseudo instruction that gets expanded into two
+// scalar MPYI instructions.
+// This is expanded by ExpandPostRAPseudos.
+let isPseudo = 1 in
+def PS_vmulw : PseudoM<(outs DoubleRegs:$Rd),
+      (ins DoubleRegs:$Rs, DoubleRegs:$Rt), "", []>;
+
+let isPseudo = 1 in
+def PS_vmulw_acc : PseudoM<(outs DoubleRegs:$Rd),
+      (ins DoubleRegs:$Rx, DoubleRegs:$Rs, DoubleRegs:$Rt), "", [],
+      "$Rd = $Rx">;
+
+def DuplexIClass0:  InstDuplex < 0 >;
+def DuplexIClass1:  InstDuplex < 1 >;
+def DuplexIClass2:  InstDuplex < 2 >;
+let isExtendable = 1 in {
+  def DuplexIClass3:  InstDuplex < 3 >;
+  def DuplexIClass4:  InstDuplex < 4 >;
+  def DuplexIClass5:  InstDuplex < 5 >;
+  def DuplexIClass6:  InstDuplex < 6 >;
+  def DuplexIClass7:  InstDuplex < 7 >;
+}
+def DuplexIClass8:  InstDuplex < 8 >;
+def DuplexIClass9:  InstDuplex < 9 >;
+def DuplexIClassA:  InstDuplex < 0xA >;
+def DuplexIClassB:  InstDuplex < 0xB >;
+def DuplexIClassC:  InstDuplex < 0xC >;
+def DuplexIClassD:  InstDuplex < 0xD >;
+def DuplexIClassE:  InstDuplex < 0xE >;
+def DuplexIClassF:  InstDuplex < 0xF >;
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index a09ae25384ef2013ec988665d0843a46d0b2a20b..2a1bb63af78924bb81d2321ab4d533671cf8d1c1 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -36,6 +36,9 @@
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetOptions.h"
 
+#define GET_REGINFO_TARGET_DESC
+#include "HexagonGenRegisterInfo.inc"
+
 using namespace llvm;
 
 HexagonRegisterInfo::HexagonRegisterInfo()
@@ -125,6 +128,7 @@ HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   case HexagonSubtarget::V5:
   case HexagonSubtarget::V55:
   case HexagonSubtarget::V60:
+  case HexagonSubtarget::V62:
     return HasEHReturn ? CalleeSavedRegsV3EHReturn : CalleeSavedRegsV3;
   }
 
@@ -133,23 +137,38 @@ HexagonRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
 }
 
 
+const uint32_t *HexagonRegisterInfo::getCallPreservedMask(
+      const MachineFunction &MF, CallingConv::ID) const {
+  return HexagonCSR_RegMask;
+}
+
+
 BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF)
   const {
   BitVector Reserved(getNumRegs());
   Reserved.set(Hexagon::R29);
   Reserved.set(Hexagon::R30);
   Reserved.set(Hexagon::R31);
-  Reserved.set(Hexagon::SA0); // C0
-  Reserved.set(Hexagon::LC0); // C1
-  Reserved.set(Hexagon::SA1); // C2
-  Reserved.set(Hexagon::LC1); // C3
-  Reserved.set(Hexagon::USR); // C8
-  Reserved.set(Hexagon::PC);  // C9
-  Reserved.set(Hexagon::UGP); // C10
-  Reserved.set(Hexagon::GP);  // C11
-  Reserved.set(Hexagon::CS0); // C12
-  Reserved.set(Hexagon::CS1); // C13
-
+  // Control registers.
+  Reserved.set(Hexagon::SA0);         // C0
+  Reserved.set(Hexagon::LC0);         // C1
+  Reserved.set(Hexagon::SA1);         // C2
+  Reserved.set(Hexagon::LC1);         // C3
+  Reserved.set(Hexagon::P3_0);        // C4
+  Reserved.set(Hexagon::USR);         // C8
+  Reserved.set(Hexagon::PC);          // C9
+  Reserved.set(Hexagon::UGP);         // C10
+  Reserved.set(Hexagon::GP);          // C11
+  Reserved.set(Hexagon::CS0);         // C12
+  Reserved.set(Hexagon::CS1);         // C13
+  Reserved.set(Hexagon::UPCYCLELO);   // C14
+  Reserved.set(Hexagon::UPCYCLEHI);   // C15
+  Reserved.set(Hexagon::FRAMELIMIT);  // C16
+  Reserved.set(Hexagon::FRAMEKEY);    // C17
+  Reserved.set(Hexagon::PKTCOUNTLO);  // C18
+  Reserved.set(Hexagon::PKTCOUNTHI);  // C19
+  Reserved.set(Hexagon::UTIMERLO);    // C30
+  Reserved.set(Hexagon::UTIMERHI);    // C31
   // Out of the control registers, only C8 is explicitly defined in
   // HexagonRegisterInfo.td. If others are defined, make sure to add
   // them here as well.
@@ -274,6 +293,3 @@ unsigned HexagonRegisterInfo::getFirstCallerSavedNonParamReg() const {
   return Hexagon::R6;
 }
 
-
-#define GET_REGINFO_TARGET_DESC
-#include "HexagonGenRegisterInfo.inc"
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.h b/lib/Target/Hexagon/HexagonRegisterInfo.h
index 1fb295b5bd8c8704b039b1b25e58d30b5b3f35a9..8a3f175b84881c7f25479566f7f04fffe85aff8a 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -35,7 +35,8 @@ public:
   /// Code Generation virtual methods...
   const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF)
         const override;
-
+  const uint32_t *getCallPreservedMask(const MachineFunction &MF,
+        CallingConv::ID) const override;
 
   BitVector getReservedRegs(const MachineFunction &MF) const override;
 
diff --git a/lib/Target/Hexagon/HexagonRegisterInfo.td b/lib/Target/Hexagon/HexagonRegisterInfo.td
index 582ab7289f3e9f4ae6c364f431fda191265b572e..93ab2f73120716a433c4a9886cfdd71d9ba9a141 100644
--- a/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -140,43 +140,54 @@ let Namespace = "Hexagon" in {
   }
 
   // Control registers.
-  def SA0  : Rc<0,  "sa0",       ["c0"]>,   DwarfRegNum<[67]>;
-  def LC0  : Rc<1,  "lc0",       ["c1"]>,   DwarfRegNum<[68]>;
-  def SA1  : Rc<2,  "sa1",       ["c2"]>,   DwarfRegNum<[69]>;
-  def LC1  : Rc<3,  "lc1",       ["c3"]>,   DwarfRegNum<[70]>;
-  def P3_0 : Rc<4,  "p3:0",      ["c4"], [P0, P1, P2, P3]>,
-                                            DwarfRegNum<[71]>;
+  def SA0:        Rc<0,  "sa0",        ["c0"]>,    DwarfRegNum<[67]>;
+  def LC0:        Rc<1,  "lc0",        ["c1"]>,    DwarfRegNum<[68]>;
+  def SA1:        Rc<2,  "sa1",        ["c2"]>,    DwarfRegNum<[69]>;
+  def LC1:        Rc<3,  "lc1",        ["c3"]>,    DwarfRegNum<[70]>;
+  def P3_0:       Rc<4,  "p3:0",       ["c4"], [P0, P1, P2, P3]>,
+                                                   DwarfRegNum<[71]>;
   // When defining more Cn registers, make sure to explicitly mark them
   // as reserved in HexagonRegisterInfo.cpp.
-  def C5   : Rc<5,  "c5",        ["c5"]>,   DwarfRegNum<[72]>; // future use
-  def C6   : Rc<6,  "c6",        [], [M0]>, DwarfRegNum<[73]>;
-  def C7   : Rc<7,  "c7",        [], [M1]>, DwarfRegNum<[74]>;
+  def C5:         Rc<5,  "c5",         ["c5"]>,    DwarfRegNum<[72]>;
+  def C6:         Rc<6,  "c6",         [], [M0]>,  DwarfRegNum<[73]>;
+  def C7:         Rc<7,  "c7",         [], [M1]>,  DwarfRegNum<[74]>;
   // Define C8 separately and make it aliased with USR.
   // The problem is that USR has subregisters (e.g. overflow). If USR was
   // specified as a subregister of C9_8, it would imply that subreg_overflow
   // and isub_lo can be composed, which leads to all kinds of issues
   // with lane masks.
-  def C8   : Rc<8,  "c8",       [], [USR]>, DwarfRegNum<[75]>;
-  def PC   : Rc<9,  "pc">,                  DwarfRegNum<[76]>;
-  def UGP  : Rc<10, "ugp",       ["c10"]>,  DwarfRegNum<[77]>;
-  def GP   : Rc<11, "gp",        ["c11"]>,  DwarfRegNum<[78]>;
-  def CS0  : Rc<12, "cs0",       ["c12"]>,  DwarfRegNum<[79]>;
-  def CS1  : Rc<13, "cs1",       ["c13"]>,  DwarfRegNum<[80]>;
-  def UPCL : Rc<14, "upcyclelo", ["c14"]>,  DwarfRegNum<[81]>;
-  def UPCH : Rc<15, "upcyclehi", ["c15"]>,  DwarfRegNum<[82]>;
+  def C8:         Rc<8,  "c8",         [], [USR]>, DwarfRegNum<[75]>;
+  def PC:         Rc<9,  "pc">,                    DwarfRegNum<[76]>;
+  def UGP:        Rc<10, "ugp",        ["c10"]>,   DwarfRegNum<[77]>;
+  def GP:         Rc<11, "gp",         ["c11"]>,   DwarfRegNum<[78]>;
+  def CS0:        Rc<12, "cs0",        ["c12"]>,   DwarfRegNum<[79]>;
+  def CS1:        Rc<13, "cs1",        ["c13"]>,   DwarfRegNum<[80]>;
+  def UPCYCLELO:  Rc<14, "upcyclelo",  ["c14"]>,   DwarfRegNum<[81]>;
+  def UPCYCLEHI:  Rc<15, "upcyclehi",  ["c15"]>,   DwarfRegNum<[82]>;
+  def FRAMELIMIT: Rc<16, "framelimit", ["c16"]>,   DwarfRegNum<[83]>;
+  def FRAMEKEY:   Rc<17, "framekey",   ["c17"]>,   DwarfRegNum<[84]>;
+  def PKTCOUNTLO: Rc<18, "pktcountlo", ["c18"]>,   DwarfRegNum<[85]>;
+  def PKTCOUNTHI: Rc<19, "pktcounthi", ["c19"]>,   DwarfRegNum<[86]>;
+  def UTIMERLO:   Rc<30, "utimerlo",   ["c30"]>,   DwarfRegNum<[97]>;
+  def UTIMERHI:   Rc<31, "utimerhi",   ["c31"]>,   DwarfRegNum<[98]>;
 }
 
   // Control registers pairs.
   let SubRegIndices = [isub_lo, isub_hi], CoveredBySubRegs = 1 in {
-    def C1_0   : Rcc<0,   "c1:0",  [SA0, LC0], ["lc0:sa0"]>, DwarfRegNum<[67]>;
-    def C3_2   : Rcc<2,   "c3:2",  [SA1, LC1], ["lc1:sa1"]>, DwarfRegNum<[69]>;
-    def C5_4   : Rcc<4,   "c5:4",  [P3_0, C5]>,              DwarfRegNum<[71]>;
-    def C7_6   : Rcc<6,   "c7:6",  [C6, C7],   ["m1:0"]>,    DwarfRegNum<[72]>;
+    def C1_0:     Rcc<0,  "c1:0",   [SA0, LC0], ["lc0:sa0"]>, DwarfRegNum<[67]>;
+    def C3_2:     Rcc<2,  "c3:2",   [SA1, LC1], ["lc1:sa1"]>, DwarfRegNum<[69]>;
+    def C5_4:     Rcc<4,  "c5:4",   [P3_0, C5]>,              DwarfRegNum<[71]>;
+    def C7_6:     Rcc<6,  "c7:6",   [C6, C7],   ["m1:0"]>,    DwarfRegNum<[72]>;
     // Use C8 instead of USR as a subregister of C9_8.
-    def C9_8   : Rcc<8,   "c9:8",  [C8, PC]>,                DwarfRegNum<[74]>;
-    def C11_10 : Rcc<10, "c11:10", [UGP, GP]>,               DwarfRegNum<[76]>;
-    def CS     : Rcc<12, "c13:12", [CS0, CS1], ["cs1:0"]>,   DwarfRegNum<[78]>;
-    def UPC    : Rcc<14, "c15:14", [UPCL, UPCH]>,            DwarfRegNum<[80]>;
+    def C9_8:     Rcc<8,  "c9:8",   [C8, PC]>,                DwarfRegNum<[74]>;
+    def C11_10:   Rcc<10, "c11:10", [UGP, GP]>,               DwarfRegNum<[76]>;
+    def CS:       Rcc<12, "c13:12", [CS0, CS1], ["cs1:0"]>,   DwarfRegNum<[78]>;
+    def UPCYCLE:  Rcc<14, "c15:14", [UPCYCLELO, UPCYCLEHI]>,  DwarfRegNum<[80]>;
+    def C17_16:   Rcc<16, "c17:16", [FRAMELIMIT, FRAMEKEY]>,  DwarfRegNum<[83]>;
+    def PKTCOUNT: Rcc<18, "c19:18", [PKTCOUNTLO, PKTCOUNTHI], ["pktcount"]>,
+                                                              DwarfRegNum<[85]>;
+    def UTIMER:   Rcc<30, "c31:30", [UTIMERLO, UTIMERHI], ["utimer"]>,
+                                                              DwarfRegNum<[97]>;
   }
 
   foreach i = 0-31 in {
@@ -221,6 +232,10 @@ def IntRegs : RegisterClass<"Hexagon", [i32, f32, v4i8, v2i16], 32,
 }
 
 // Registers are listed in reverse order for allocation preference reasons.
+def GeneralSubRegs : RegisterClass<"Hexagon", [i32], 32,
+                                   (add R23, R22, R21, R20, R19, R18, R17,
+                                        R16, R7, R6, R5, R4, R3, R2, R1, R0)>;
+
 def IntRegsLow8 : RegisterClass<"Hexagon", [i32], 32,
                                 (add R7, R6, R5, R4, R3, R2, R1, R0)> ;
 
@@ -228,6 +243,10 @@ def DoubleRegs : RegisterClass<"Hexagon", [i64, f64, v8i8, v4i16, v2i32], 64,
                                (add (sequence "D%u", 0, 4),
                                     (sequence "D%u", 6, 13), D5, D14, D15)>;
 
+def GeneralDoubleLow8Regs : RegisterClass<"Hexagon", [i64], 64,
+                                          (add D11, D10, D9, D8, D3, D2, D1,
+                                               D0)>;
+
 def VectorRegs : RegisterClass<"Hexagon", [v64i8, v32i16, v16i32, v8i64], 512,
                                (add (sequence "V%u", 0, 31))>;
 
@@ -261,28 +280,28 @@ def ModRegs : RegisterClass<"Hexagon", [i32], 32, (add M0, M1)>;
 
 let Size = 32, isAllocatable = 0 in
 def CtrRegs : RegisterClass<"Hexagon", [i32], 32,
-                            (add LC0, SA0, LC1, SA1,
-                                 P3_0, C5,
-                                 M0, M1, C6, C7, C8, CS0, CS1, UPCL, UPCH,
-                                 USR, UGP, GP, PC)>;
+  (add LC0, SA0, LC1, SA1, P3_0, C5, C6, C7,
+       C8, PC, UGP, GP, CS0, CS1, UPCYCLELO, UPCYCLEHI,
+       FRAMELIMIT, FRAMEKEY, PKTCOUNTLO, PKTCOUNTHI, UTIMERLO, UTIMERHI,
+       M0, M1, USR)>;
 
 let isAllocatable = 0 in
 def UsrBits : RegisterClass<"Hexagon", [i1], 0, (add USR_OVF)>;
 
 let Size = 64, isAllocatable = 0 in
 def CtrRegs64 : RegisterClass<"Hexagon", [i64], 64,
-                              (add C1_0, C3_2, C7_6, C9_8, C11_10, CS, UPC)>;
-
-def VolatileV3 {
-  list<Register> Regs = [D0, D1, D2, D3, D4, D5, D6, D7,
-                         R28, R31,
-                         P0, P1, P2, P3,
-                         M0, M1,
-                         LC0, LC1, SA0, SA1, USR, USR_OVF, CS0, CS1,
-                         V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11,
-                         V12, V13, V14, V15, V16, V17, V18, V19, V20, V21,
-                         V22, V23, V24, V25, V26, V27, V28, V29, V30, V31,
-                         W0, W1, W2, W3, W4, W5, W6, W7, W8, W9, W10, W11,
-                         W12, W13, W14, W15,
-                         Q0, Q1, Q2, Q3];
-}
+  (add C1_0, C3_2, C5_4, C7_6, C9_8, C11_10, CS, UPCYCLE, C17_16,
+       PKTCOUNT, UTIMER)>;
+
+// These registers are new for v62 and onward.
+// The function RegisterMatchesArch() uses this list for validation.
+let isAllocatable = 0 in
+def V62Regs : RegisterClass<"Hexagon", [i32], 32,
+                            (add FRAMELIMIT, FRAMEKEY,   C17_16,
+                                 PKTCOUNTLO, PKTCOUNTHI, PKTCOUNT,
+                                 UTIMERLO,   UTIMERHI,   UTIMER)>;
+
+
+def HexagonCSR
+  : CalleeSavedRegs<(add R16, R17, R18, R19, R20, R21, R22, R23,
+                         R24, R25, R26, R27)>;
diff --git a/lib/Target/Hexagon/HexagonSchedule.td b/lib/Target/Hexagon/HexagonSchedule.td
index 6e4987b7e4e3f143fee5e4b7522e7ecebf5aa565..9b5fbea04d18b3c6687a064a7515a7a8ad1a5406 100644
--- a/lib/Target/Hexagon/HexagonSchedule.td
+++ b/lib/Target/Hexagon/HexagonSchedule.td
@@ -21,4 +21,12 @@ include "HexagonScheduleV55.td"
 //===----------------------------------------------------------------------===//
 
 include "HexagonScheduleV60.td"
+include "HexagonIICScalar.td"
+include "HexagonIICHVX.td"
+
+//===----------------------------------------------------------------------===//
+// V62 Machine Info +
+//===----------------------------------------------------------------------===//
+
+include "HexagonScheduleV62.td"
 
diff --git a/lib/Target/Hexagon/HexagonScheduleV4.td b/lib/Target/Hexagon/HexagonScheduleV4.td
index 7416baab392c3c95d3925304f54f02dcf7f02d41..880cc0a02b6a570fbe238a3092c2072c87f0aa93 100644
--- a/lib/Target/Hexagon/HexagonScheduleV4.td
+++ b/lib/Target/Hexagon/HexagonScheduleV4.td
@@ -61,15 +61,21 @@ def J_tc_2early_SLOT23       : InstrItinClass;
 def J_tc_2early_CJUMP_UCJUMP_ARCHDEPSLOT       : InstrItinClass;
 def J_tc_2early_SLOT2        : InstrItinClass;
 def LD_tc_ld_SLOT01          : InstrItinClass;
+def LD_tc_ld_pi_SLOT01          : InstrItinClass;
 def LD_tc_ld_SLOT0           : InstrItinClass;
 def LD_tc_3or4stall_SLOT0    : InstrItinClass;
 def M_tc_2_SLOT23            : InstrItinClass;
+def M_tc_2_acc_SLOT23        : InstrItinClass;
 def M_tc_3_SLOT23            : InstrItinClass;
 def M_tc_1_SLOT23            : InstrItinClass;
 def M_tc_3x_SLOT23           : InstrItinClass;
+def M_tc_3x_acc_SLOT23       : InstrItinClass;
 def M_tc_3or4x_SLOT23        : InstrItinClass;
+def M_tc_3or4x_acc_SLOT23    : InstrItinClass;
 def ST_tc_st_SLOT01          : InstrItinClass;
+def ST_tc_st_pi_SLOT01       : InstrItinClass;
 def ST_tc_st_SLOT0           : InstrItinClass;
+def ST_tc_st_pi_SLOT0        : InstrItinClass;
 def ST_tc_ld_SLOT0           : InstrItinClass;
 def ST_tc_3stall_SLOT0       : InstrItinClass;
 def S_2op_tc_1_SLOT23        : InstrItinClass;
@@ -131,21 +137,27 @@ def HexagonItinerariesV4 :
 
         //Load
         InstrItinData<LD_tc_ld_SLOT01        , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<LD_tc_ld_pi_SLOT01     , [InstrStage<1, [SLOT0, SLOT1]>]>,
         InstrItinData<LD_tc_ld_SLOT0         , [InstrStage<1, [SLOT0]>]>,
         InstrItinData<LD_tc_3or4stall_SLOT0  , [InstrStage<1, [SLOT0]>]>,
 
         // M
         InstrItinData<M_tc_1_SLOT23          , [InstrStage<1, [SLOT2, SLOT3]>]>,
         InstrItinData<M_tc_2_SLOT23          , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_2_acc_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
         InstrItinData<M_tc_3_SLOT23          , [InstrStage<1, [SLOT2, SLOT3]>]>,
         InstrItinData<M_tc_3x_SLOT23         , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_3x_acc_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>]>,
         InstrItinData<M_tc_3or4x_SLOT23      , [InstrStage<1, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_3or4x_acc_SLOT23  , [InstrStage<1, [SLOT2, SLOT3]>]>,
 
         // Store
         // ST
         InstrItinData<ST_tc_st_SLOT01        , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<ST_tc_st_pi_SLOT01     , [InstrStage<1, [SLOT0, SLOT1]>]>,
         // ST0
         InstrItinData<ST_tc_st_SLOT0         , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<ST_tc_st_pi_SLOT0      , [InstrStage<1, [SLOT0]>]>,
         InstrItinData<ST_tc_ld_SLOT0         , [InstrStage<1, [SLOT0]>]>,
 
         // S
diff --git a/lib/Target/Hexagon/HexagonScheduleV55.td b/lib/Target/Hexagon/HexagonScheduleV55.td
index b2a75f7200d707f97898828b0443870783f056eb..06cbcb16abb7ba87b9cb6b07a31fc01ec5d5dd1c 100644
--- a/lib/Target/Hexagon/HexagonScheduleV55.td
+++ b/lib/Target/Hexagon/HexagonScheduleV55.td
@@ -88,6 +88,8 @@ def HexagonItinerariesV55 :
         // Load
         InstrItinData<LD_tc_ld_SLOT01      , [InstrStage<1, [SLOT0, SLOT1]>],
                                              [2, 1]>,
+        InstrItinData<LD_tc_ld_pi_SLOT01   , [InstrStage<1, [SLOT0, SLOT1]>],
+                                             [2, 1]>,
         InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<1, [SLOT0]>], [2, 1]>,
         InstrItinData<LD_tc_ld_SLOT0       , [InstrStage<1, [SLOT0]>], [2, 1]>,
 
@@ -96,21 +98,30 @@ def HexagonItinerariesV55 :
                                           [1, 1, 1]>,
         InstrItinData<M_tc_2_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
                                           [2, 1, 1]>,
+        InstrItinData<M_tc_2_acc_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [2, 1, 1]>,
         InstrItinData<M_tc_3_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
                                           [1, 1, 1]>,
         InstrItinData<M_tc_3x_SLOT23    , [InstrStage<1, [SLOT2, SLOT3]>],
                                           [3, 1, 1]>,
+        InstrItinData<M_tc_3x_acc_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [3, 1, 1, 1]>,
         InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
                                           [3, 1, 1]>,
+        InstrItinData<M_tc_3or4x_acc_SLOT23 , [InstrStage<1, [SLOT2, SLOT3]>],
+                                          [3, 1, 1]>,
         InstrItinData<M_tc_3stall_SLOT23, [InstrStage<1, [SLOT2, SLOT3]>],
                                           [3, 1, 1]>,
 
         // Store
         InstrItinData<ST_tc_st_SLOT01   , [InstrStage<1, [SLOT0, SLOT1]>],
                                           [1, 1, 1]>,
+        InstrItinData<ST_tc_st_pi_SLOT01, [InstrStage<1, [SLOT0, SLOT1]>],
+                                          [1, 1, 1]>,
         InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<1, [SLOT0]>], [2, 1, 1]>,
         InstrItinData<ST_tc_ld_SLOT0    , [InstrStage<1, [SLOT0]>], [2, 1, 1]>,
         InstrItinData<ST_tc_st_SLOT0    , [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
+        InstrItinData<ST_tc_st_pi_SLOT0 , [InstrStage<1, [SLOT0]>], [1, 1, 1]>,
 
         // S
         InstrItinData<S_2op_tc_1_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>],
diff --git a/lib/Target/Hexagon/HexagonScheduleV60.td b/lib/Target/Hexagon/HexagonScheduleV60.td
index dc2ce43b0579e68a00bb605d881ef34712eea2f0..63784710f52b684a6b83c7b160aaabc34401c3a2 100644
--- a/lib/Target/Hexagon/HexagonScheduleV60.td
+++ b/lib/Target/Hexagon/HexagonScheduleV60.td
@@ -19,6 +19,8 @@ def CVI_LD     : FuncUnit;
 def CVI_XLSHF  : FuncUnit;
 def CVI_MPY01  : FuncUnit;
 def CVI_ALL    : FuncUnit;
+def CVI_XLMPY0 : FuncUnit;
+def CVI_SHFMPY1: FuncUnit;
 
 // Combined functional unit data.
 def HexagonComboFuncsV60 :
@@ -26,7 +28,9 @@ def HexagonComboFuncsV60 :
       ComboFuncData<CVI_XLSHF    , [CVI_XLANE, CVI_SHIFT]>,
       ComboFuncData<CVI_MPY01    , [CVI_MPY0, CVI_MPY1]>,
       ComboFuncData<CVI_ALL      , [CVI_ST, CVI_XLANE, CVI_SHIFT,
-                                    CVI_MPY0, CVI_MPY1, CVI_LD]>
+                                    CVI_MPY0, CVI_MPY1, CVI_LD]>,
+      ComboFuncData<CVI_XLMPY0   , [CVI_XLANE, CVI_MPY0]>,
+      ComboFuncData<CVI_SHFMPY1  , [CVI_SHIFT, CVI_MPY1]>
     ]>;
 
 // Note: When adding additional vector scheduling classes, add the
@@ -39,6 +43,7 @@ def CVI_VX           : InstrItinClass;
 def CVI_VX_DV_LONG   : InstrItinClass;
 def CVI_VX_DV        : InstrItinClass;
 def CVI_VX_DV_SLOT2  : InstrItinClass;
+def CVI_VX_DV_SLOT2_LONG_EARLY : InstrItinClass;
 def CVI_VP           : InstrItinClass;
 def CVI_VP_LONG      : InstrItinClass;
 def CVI_VP_VS_EARLY  : InstrItinClass;
@@ -150,22 +155,28 @@ def HexagonItinerariesV60 :
 
         // Load
         InstrItinData<LD_tc_ld_SLOT01      , [InstrStage<3, [SLOT0, SLOT1]>]>,
+        InstrItinData<LD_tc_ld_pi_SLOT01   , [InstrStage<3, [SLOT0, SLOT1]>]>,
         InstrItinData<LD_tc_3or4stall_SLOT0, [InstrStage<4, [SLOT0]>]>,
         InstrItinData<LD_tc_ld_SLOT0       , [InstrStage<3, [SLOT0]>]>,
 
         // M
         InstrItinData<M_tc_1_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>]>,
         InstrItinData<M_tc_2_SLOT23     , [InstrStage<2, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_2_acc_SLOT23 , [InstrStage<2, [SLOT2, SLOT3]>]>,
         InstrItinData<M_tc_3_SLOT23     , [InstrStage<3, [SLOT2, SLOT3]>]>,
         InstrItinData<M_tc_3x_SLOT23    , [InstrStage<3, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_3x_acc_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>,
         InstrItinData<M_tc_3or4x_SLOT23 , [InstrStage<4, [SLOT2, SLOT3]>]>,
+        InstrItinData<M_tc_3or4x_acc_SLOT23 , [InstrStage<4, [SLOT2, SLOT3]>]>,
         InstrItinData<M_tc_3stall_SLOT23, [InstrStage<3, [SLOT2, SLOT3]>]>,
 
         // Store
         InstrItinData<ST_tc_st_SLOT01   , [InstrStage<1, [SLOT0, SLOT1]>]>,
+        InstrItinData<ST_tc_st_pi_SLOT01, [InstrStage<1, [SLOT0, SLOT1]>]>,
         InstrItinData<ST_tc_3stall_SLOT0, [InstrStage<3, [SLOT0]>]>,
         InstrItinData<ST_tc_ld_SLOT0    , [InstrStage<3, [SLOT0]>]>,
         InstrItinData<ST_tc_st_SLOT0    , [InstrStage<1, [SLOT0]>]>,
+        InstrItinData<ST_tc_st_pi_SLOT0 , [InstrStage<1, [SLOT0]>]>,
 
         // S
         InstrItinData<S_2op_tc_1_SLOT23     , [InstrStage<1, [SLOT2, SLOT3]>]>,
diff --git a/lib/Target/Hexagon/HexagonScheduleV62.td b/lib/Target/Hexagon/HexagonScheduleV62.td
new file mode 100644
index 0000000000000000000000000000000000000000..0758788a600be4da43298b0972b2b4187c00a635
--- /dev/null
+++ b/lib/Target/Hexagon/HexagonScheduleV62.td
@@ -0,0 +1,129 @@
+//=-HexagonScheduleV62.td - HexagonV62 Scheduling Definitions *- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+// V62 follows the same schedule as V60 with the following exceptions:
+// The following instructions are permissible on any slot on V62:
+// V4_J4_cmpeq_fp0_jump_nt
+// V4_J4_cmpeq_fp0_jump_t
+// V4_J4_cmpeq_fp1_jump_nt
+// V4_J4_cmpeq_fp1_jump_t
+// V4_J4_cmpeq_tp0_jump_nt
+// V4_J4_cmpeq_tp0_jump_t
+// V4_J4_cmpeq_tp1_jump_nt
+// V4_J4_cmpeq_tp1_jump_t
+// V4_J4_cmpeqi_fp0_jump_nt
+// V4_J4_cmpeqi_fp0_jump_t
+// V4_J4_cmpeqi_fp1_jump_nt
+// V4_J4_cmpeqi_fp1_jump_t
+// V4_J4_cmpeqi_tp0_jump_nt
+// V4_J4_cmpeqi_tp0_jump_t
+// V4_J4_cmpeqi_tp1_jump_nt
+// V4_J4_cmpeqi_tp1_jump_t
+// V4_J4_cmpeqn1_fp0_jump_nt
+// V4_J4_cmpeqn1_fp0_jump_t
+// V4_J4_cmpeqn1_fp1_jump_nt
+// V4_J4_cmpeqn1_fp1_jump_t
+// V4_J4_cmpeqn1_tp0_jump_nt
+// V4_J4_cmpeqn1_tp0_jump_t
+// V4_J4_cmpeqn1_tp1_jump_nt
+// V4_J4_cmpeqn1_tp1_jump_t
+// V4_J4_cmpgt_fp0_jump_nt
+// V4_J4_cmpgt_fp0_jump_t
+// V4_J4_cmpgt_fp1_jump_nt
+// V4_J4_cmpgt_fp1_jump_t
+// V4_J4_cmpgt_tp0_jump_nt
+// V4_J4_cmpgt_tp0_jump_t
+// V4_J4_cmpgt_tp1_jump_nt
+// V4_J4_cmpgt_tp1_jump_t
+// V4_J4_cmpgti_fp0_jump_nt
+// V4_J4_cmpgti_fp0_jump_t
+// V4_J4_cmpgti_fp1_jump_nt
+// V4_J4_cmpgti_fp1_jump_t
+// V4_J4_cmpgti_tp0_jump_nt
+// V4_J4_cmpgti_tp0_jump_t
+// V4_J4_cmpgti_tp1_jump_nt
+// V4_J4_cmpgti_tp1_jump_t
+// V4_J4_cmpgtn1_fp0_jump_nt
+// V4_J4_cmpgtn1_fp0_jump_t
+// V4_J4_cmpgtn1_fp1_jump_nt
+// V4_J4_cmpgtn1_fp1_jump_t
+// V4_J4_cmpgtn1_tp0_jump_nt
+// V4_J4_cmpgtn1_tp0_jump_t
+// V4_J4_cmpgtn1_tp1_jump_nt
+// V4_J4_cmpgtn1_tp1_jump_t
+// V4_J4_cmpgtu_fp0_jump_nt
+// V4_J4_cmpgtu_fp0_jump_t
+// V4_J4_cmpgtu_fp1_jump_nt
+// V4_J4_cmpgtu_fp1_jump_t
+// V4_J4_cmpgtu_tp0_jump_nt
+// V4_J4_cmpgtu_tp0_jump_t
+// V4_J4_cmpgtu_tp1_jump_nt
+// V4_J4_cmpgtu_tp1_jump_t
+// V4_J4_cmpgtui_fp0_jump_nt
+// V4_J4_cmpgtui_fp0_jump_t
+// V4_J4_cmpgtui_fp1_jump_nt
+// V4_J4_cmpgtui_fp1_jump_t
+// V4_J4_cmpgtui_tp0_jump_nt
+// V4_J4_cmpgtui_tp0_jump_t
+// V4_J4_cmpgtui_tp1_jump_nt
+// V4_J4_cmpgtui_tp1_jump_t
+// V4_J4_tstbit0_fp0_jump_nt
+// V4_J4_tstbit0_fp0_jump_t
+// V4_J4_tstbit0_fp1_jump_nt
+// V4_J4_tstbit0_fp1_jump_t
+// V4_J4_tstbit0_tp0_jump_nt
+// V4_J4_tstbit0_tp0_jump_t
+// V4_J4_tstbit0_tp1_jump_nt
+// V4_J4_tstbit0_tp1_jump_t
+// JMP
+// JMPEXT
+// JMPEXT_f
+// JMPEXT_fnew_nt
+// JMPEXT_fnew_t
+// JMPEXT_t
+// JMPEXT_tnew_nt
+// JMPEXT_tnew_t
+// JMPNOTEXT
+// JMPNOTEXT_f
+// JMPNOTEXT_fnew_nt
+// JMPNOTEXT_fnew_t
+// JMPNOTEXT_t
+// JMPNOTEXT_tnew_nt
+// JMPNOTEXT_tnew_t
+// JMP_f
+// JMP_fnew_nt
+// JMP_fnew_t
+// JMP_t
+// JMP_tnew_nt
+// JMP_tnew_t
+// RESTORE_DEALLOC_RET_JMP_V4
+// RESTORE_DEALLOC_RET_JMP_V4_EXT
+
+def HexagonV62ItinList : ScalarItin, HVXV62Itin {
+  list<InstrItinData> ItinList =
+    !listconcat(ScalarItin_list, HVXV62Itin_list);
+}
+
+def HexagonItinerariesV62 :
+      ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP,
+                            CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1,
+                            CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL],
+                           [], HexagonV62ItinList.ItinList>;
+
+def HexagonModelV62 : SchedMachineModel {
+  // Max issue per cycle == bundle width.
+  let IssueWidth = 4;
+  let Itineraries = HexagonItinerariesV62;
+  let LoadLatency = 1;
+  let CompleteModel = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Hexagon V62 Resource Definitions -
+//===----------------------------------------------------------------------===//
diff --git a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
index 10730536080e57bc06f237d3f692aec3d3f4fd01..002e87fb32ce5a7e6cc4e91bb2579c3a058a1e34 100644
--- a/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
+++ b/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
@@ -51,11 +51,12 @@ SDValue HexagonSelectionDAGInfo::EmitTargetCodeForMemcpy(
   TargetLowering::CallLoweringInfo CLI(DAG);
   CLI.setDebugLoc(dl)
       .setChain(Chain)
-      .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
-                 Type::getVoidTy(*DAG.getContext()),
-                 DAG.getTargetExternalSymbol(SpecialMemcpyName,
-                      TLI.getPointerTy(DAG.getDataLayout()), Flags),
-                 std::move(Args))
+      .setLibCallee(
+          TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+          Type::getVoidTy(*DAG.getContext()),
+          DAG.getTargetExternalSymbol(
+              SpecialMemcpyName, TLI.getPointerTy(DAG.getDataLayout()), Flags),
+          std::move(Args))
       .setDiscardResult();
 
   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
diff --git a/lib/Target/Hexagon/HexagonSplitDouble.cpp b/lib/Target/Hexagon/HexagonSplitDouble.cpp
index 176d3f75e11d074b0281ce1f0711be021419e415..471e32221b2925fc9caf4f76f74d04ca762ef782 100644
--- a/lib/Target/Hexagon/HexagonSplitDouble.cpp
+++ b/lib/Target/Hexagon/HexagonSplitDouble.cpp
@@ -393,7 +393,7 @@ int32_t HexagonSplitDoubleRegs::profit(const MachineInstr *MI) const {
 
 bool HexagonSplitDoubleRegs::isProfitable(const USet &Part, LoopRegMap &IRM)
       const {
-  unsigned FixedNum = 0, SplitNum = 0, LoopPhiNum = 0;
+  unsigned FixedNum = 0, LoopPhiNum = 0;
   int32_t TotalP = 0;
 
   for (unsigned DR : Part) {
@@ -430,7 +430,6 @@ bool HexagonSplitDoubleRegs::isProfitable(const USet &Part, LoopRegMap &IRM)
           LoopPhiNum++;
       }
       // Splittable instruction.
-      SplitNum++;
       int32_t P = profit(UseI);
       if (P == std::numeric_limits<int>::min())
         return false;
diff --git a/lib/Target/Hexagon/HexagonSubtarget.cpp b/lib/Target/Hexagon/HexagonSubtarget.cpp
index 8c23a2465dd6328c542eaf26696675e44866b58c..033b93fc910aac07e94a94983e96a9d8bd017e8d 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -88,6 +88,7 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
     { "hexagonv5", V5 },
     { "hexagonv55", V55 },
     { "hexagonv60", V60 },
+    { "hexagonv62", V62 },
   };
 
   auto foundIt = CpuTable.find(CPUString);
diff --git a/lib/Target/Hexagon/HexagonSubtarget.h b/lib/Target/Hexagon/HexagonSubtarget.h
index f2b9cdaad1ae1f6b07134c0b4dcfe5b19ccb033c..6a3e7f13be4c4a8d3d2bdd52d3bcabd5d9f168d9 100644
--- a/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/lib/Target/Hexagon/HexagonSubtarget.h
@@ -38,9 +38,7 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo {
   bool ModeIEEERndNear;
 
 public:
-  enum HexagonArchEnum {
-    V4, V5, V55, V60
-  };
+#include "HexagonDepArch.h"
 
   HexagonArchEnum HexagonArchVersion;
   /// True if the target should use Back-Skip-Back scheduling. This is the
@@ -98,6 +96,9 @@ public:
   bool hasV55TOpsOnly() const { return getHexagonArchVersion() == V55; }
   bool hasV60TOps() const { return getHexagonArchVersion() >= V60; }
   bool hasV60TOpsOnly() const { return getHexagonArchVersion() == V60; }
+  bool hasV62TOps() const { return getHexagonArchVersion() >= V62; }
+  bool hasV62TOpsOnly() const { return getHexagonArchVersion() == V62; }
+
   bool modeIEEERndNear() const { return ModeIEEERndNear; }
   bool useHVXOps() const { return UseHVXOps; }
   bool useHVXDblOps() const { return UseHVXOps && UseHVXDblOps; }
diff --git a/lib/Target/Hexagon/HexagonSystemInst.td b/lib/Target/Hexagon/HexagonSystemInst.td
deleted file mode 100644
index 629a98749ee9f8ed2182795f4c2d25eda323a02c..0000000000000000000000000000000000000000
--- a/lib/Target/Hexagon/HexagonSystemInst.td
+++ /dev/null
@@ -1,134 +0,0 @@
-//==- HexagonSystemInst.td - System Instructions for Hexagon -*- tablegen -*-==//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon instructions in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-//                     Cache manipulation instructions.
-//===----------------------------------------------------------------------===//
-let mayStore = 1 in
-class ST_MISC_CACHEOP<dag outs, dag ins,
-              string asmstr, list<dag> pattern = [],
-              bits<3> amode, bits<3> type, bits<1> un>
-  : ST0Inst<outs, ins, asmstr, pattern, "", ST_tc_ld_SLOT0> {
-
-    bits<5> Rs;
-    bits<5> Rt;
-    bits<5> Rd;
-    let Inst{31-28} = 0b1010;
-    let Inst{27-25} = amode;
-    let Inst{24-22} = type;
-    let Inst{21}    = un;
-    let Inst{20-16} = Rs;
-    let Inst{12-8}  = Rt;
-    let Inst{4-0}   = Rd;
-}
-
-let mayStore = 1 in
-class ST_MISC_CACHEOP_SYS<dag outs, dag ins,
-              string asmstr, list<dag> pattern = [],
-              bits<3> amode, bits<3> type, bits<1> un>
-  : SYSInst<outs, ins, asmstr, pattern, ""> {
-
-    bits<5> Rs;
-    bits<5> Rt;
-    bits<5> Rd;
-    let Inst{31-28} = 0b1010;
-    let Inst{27-25} = amode;
-    let Inst{24-22} = type;
-    let Inst{21}    = un;
-    let Inst{20-16} = Rs;
-    let Inst{12-8}  = Rt;
-    let Inst{4-0}   = Rd;
-}
-
-
-let isSolo = 1, Rs = 0, Rt = 0, Rd = 0 in {
-def Y2_syncht: ST_MISC_CACHEOP <(outs), (ins),
-    "syncht" , [], 0b100, 0b001, 0b0>;
-}
-
-let Rt = 0, Rd = 0 in {
-let isSoloAin1 = 1 in {
-  def Y2_dccleana: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs),
-      "dccleana($Rs)", [], 0b000, 0b000, 0b0>;
-  def Y2_dcinva: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs),
-      "dcinva($Rs)", [], 0b000, 0b000, 0b1>;
-  def Y2_dccleaninva: ST_MISC_CACHEOP <(outs), (ins IntRegs:$Rs),
-      "dccleaninva($Rs)", [], 0b000, 0b001, 0b0>;
-  }
-}
-
-let isSoloAX = 1, hasSideEffects = 1, Rd = 0 in {
-  def Y4_l2fetch: ST_MISC_CACHEOP_SYS<(outs), (ins IntRegs:$Rs, IntRegs:$Rt),
-      "l2fetch($Rs, $Rt)", [], 0b011, 0b000, 0b0>;
-  def Y5_l2fetch: ST_MISC_CACHEOP_SYS<(outs), (ins IntRegs:$Rs, DoubleRegs:$Rt),
-      "l2fetch($Rs, $Rt)", [], 0b011, 0b010, 0b0>;
-}
-
-let hasSideEffects = 0, isSolo = 1 in
-class Y2_INVALIDATE_CACHE<string mnemonic, bit MajOp>
-  : JRInst <
-  (outs), (ins IntRegs:$Rs),
-  #mnemonic#"($Rs)" > {
-    bits<5> Rs;
-
-    let IClass = 0b0101;
-    let Inst{27-21} = 0b0110110;
-    let Inst{20-16} = Rs;
-    let Inst{13-12} = 0b00;
-    let Inst{11} = MajOp;
-  }
-// Instruction cache invalidate
-def Y2_icinva : Y2_INVALIDATE_CACHE<"icinva", 0b0>;
-
-// Zero an aligned 32-byte cacheline.
-let isSoloAin1 = 1 in
-def Y2_dczeroa: ST0Inst <(outs), (ins IntRegs:$Rs),
-  "dczeroa($Rs)"> {
-    bits<5> Rs;
-    let IClass = 0b1010;
-    let Inst{27-21} = 0b0000110;
-    let Inst{13} = 0b0;
-    let Inst{20-16} = Rs;
-  }
-
-// Memory synchronization.
-let hasSideEffects = 0, isSolo = 1 in
-def Y2_isync: JRInst <(outs), (ins),
-  "isync"> {
-    let IClass = 0b0101;
-    let Inst{27-16} = 0b011111000000;
-    let Inst{13} = 0b0;
-    let Inst{9-0} = 0b0000000010;
-  }
-
-//===----------------------------------------------------------------------===//
-//                     System/User instructions.
-//===----------------------------------------------------------------------===//
-// traps and pause
-let hasSideEffects = 0, isSolo = 1 in
-class J2_MISC_TRAP_PAUSE<string mnemonic, bits<2> MajOp>
-  : JRInst
-  <(outs), (ins u8_0Imm:$u8),
-   #mnemonic#"(#$u8)"> {
-    bits<8> u8;
-
-    let IClass = 0b0101;
-    let Inst{27-24} = 0b0100;
-    let Inst{23-22} = MajOp;
-    let Inst{12-8} = u8{7-3};
-    let Inst{4-2} = u8{2-0};
-  }
-def J2_trap0 : J2_MISC_TRAP_PAUSE<"trap0", 0b00>;
-def J2_trap1 : J2_MISC_TRAP_PAUSE<"trap1", 0b10>;
-def J2_pause : J2_MISC_TRAP_PAUSE<"pause", 0b01>;
-
diff --git a/lib/Target/Hexagon/HexagonTargetMachine.cpp b/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 8b29461554b376363fec7fe623115d5fa0d05391..06fc9195fa677e70929cc9c981966c7e03499e6e 100644
--- a/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -176,11 +176,11 @@ HexagonTargetMachine::HexagonTargetMachine(const Target &T, const Triple &TT,
 
 const HexagonSubtarget *
 HexagonTargetMachine::getSubtargetImpl(const Function &F) const {
-  AttributeSet FnAttrs = F.getAttributes();
+  AttributeList FnAttrs = F.getAttributes();
   Attribute CPUAttr =
-      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-cpu");
+      FnAttrs.getAttribute(AttributeList::FunctionIndex, "target-cpu");
   Attribute FSAttr =
-      FnAttrs.getAttribute(AttributeSet::FunctionIndex, "target-features");
+      FnAttrs.getAttribute(AttributeList::FunctionIndex, "target-features");
 
   std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
                         ? CPUAttr.getValueAsString().str()
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index 7b1247d815a5a4aed771375906c443d620510410..3a789a5f7e0b22b85ad02273795003ffeb19b665 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -440,7 +440,7 @@ bool HexagonPacketizerList::promoteToDotNew(MachineInstr &MI,
 }
 
 bool HexagonPacketizerList::demoteToDotOld(MachineInstr &MI) {
-  int NewOpcode = HII->getDotOldOp(MI.getOpcode());
+  int NewOpcode = HII->getDotOldOp(MI);
   MI.setDesc(HII->get(NewOpcode));
   return true;
 }
@@ -720,6 +720,8 @@ bool HexagonPacketizerList::canPromoteToNewValueStore(const MachineInstr &MI,
   // %R9<def> = ZXTH %R12, %D6<imp-use>, %R12<imp-def>
   // S2_storerh_io %R8, 2, %R12<kill>; mem:ST2[%scevgep343]
   for (auto &MO : PacketMI.operands()) {
+    if (MO.isRegMask() && MO.clobbersPhysReg(DepReg))
+      return false;
     if (!MO.isReg() || !MO.isDef() || !MO.isImplicit())
       continue;
     unsigned R = MO.getReg();
@@ -759,9 +761,12 @@ bool HexagonPacketizerList::canPromoteToNewValue(const MachineInstr &MI,
 }
 
 static bool isImplicitDependency(const MachineInstr &I, unsigned DepReg) {
-  for (auto &MO : I.operands())
+  for (auto &MO : I.operands()) {
+    if (MO.isRegMask() && MO.clobbersPhysReg(DepReg))
+      return true;
     if (MO.isReg() && MO.isDef() && (MO.getReg() == DepReg) && MO.isImplicit())
       return true;
+  }
   return false;
 }
 
@@ -1046,7 +1051,9 @@ static bool cannotCoexistAsymm(const MachineInstr &MI, const MachineInstr &MJ,
     // XTYPE instructions.  Since there is no convenient way of identifying fp
     // XTYPE instructions, only allow grouping with ALU32 for now.
     unsigned TJ = HII.getType(MJ);
-    if (TJ != HexagonII::TypeALU32)
+    if (TJ != HexagonII::TypeALU32_2op &&
+        TJ != HexagonII::TypeALU32_3op &&
+        TJ != HexagonII::TypeALU32_ADDI)
       return true;
     break;
   }
@@ -1171,6 +1178,36 @@ bool HexagonPacketizerList::hasControlDependence(const MachineInstr &I,
          (J.isBranch() || J.isCall() || J.isBarrier());
 }
 
+bool HexagonPacketizerList::hasRegMaskDependence(const MachineInstr &I,
+                                                 const MachineInstr &J) {
+  // Adding I to a packet that has J. Returns true if a regmask forbids it.
+
+  // Regmasks are not reflected in the scheduling dependency graph, so
+  // we need to check them manually. This code assumes that regmasks only
+  // occur on calls, and the problematic case is when we add an instruction
+  // defining a register R to a packet that has a call that clobbers R via
+  // a regmask. Those cannot be packetized together, because the call will
+  // be executed last. That's also a reason why it is ok to add a call
+  // clobbering R to a packet that defines R.
+
+  // Look for regmasks in J and test each one against every operand of I.
+  for (const MachineOperand &OpJ : J.operands()) {
+    if (!OpJ.isRegMask())
+      continue;
+    assert((J.isCall() || HII->isTailCall(J)) && "Regmask on a non-call");
+    for (const MachineOperand &OpI : I.operands()) {
+      if (OpI.isReg()) { // Any register operand of I counts, use or def.
+        if (OpJ.clobbersPhysReg(OpI.getReg()))
+          return true;
+      } else if (OpI.isRegMask()) {
+        // Both are regmasks. Conservatively assume that they intersect.
+        return true;
+      }
+    }
+  }
+  return false;
+}
+
 bool HexagonPacketizerList::hasV4SpecificDependence(const MachineInstr &I,
                                                     const MachineInstr &J) {
   bool SysI = isSystemInstr(I), SysJ = isSystemInstr(J);
@@ -1217,6 +1254,14 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
   if (Dependence)
     return false;
 
+  // Regmasks are not accounted for in the scheduling graph, so we need
+  // to explicitly check for dependencies caused by them. They should only
+  // appear on calls, so it's not too pessimistic to reject all regmask
+  // dependencies.
+  Dependence = hasRegMaskDependence(I, J);
+  if (Dependence)
+    return false;
+
   // V4 allows dual stores. It does not allow second store, if the first
   // store is not in SLOT0. New value store, new value jump, dealloc_return
   // and memop always take SLOT0. Arch spec 3.4.4.2.
@@ -1465,13 +1510,19 @@ bool HexagonPacketizerList::isLegalToPacketizeTogether(SUnit *SUI, SUnit *SUJ) {
     //   R0 = ...                   ; SUI
     // Those cannot be packetized together, since the call will observe
     // the effect of the assignment to R0.
-    if (DepType == SDep::Anti && J.isCall()) {
+    if ((DepType == SDep::Anti || DepType == SDep::Output) && J.isCall()) {
       // Check if I defines any volatile register. We should also check
       // registers that the call may read, but these happen to be a
       // subset of the volatile register set.
-      for (const MCPhysReg *P = J.getDesc().ImplicitDefs; P && *P; ++P) {
-        if (!I.modifiesRegister(*P, HRI))
+      for (const MachineOperand &Op : I.operands()) {
+        if (Op.isReg() && Op.isDef()) {
+          unsigned R = Op.getReg();
+          if (!J.readsRegister(R, HRI) && !J.modifiesRegister(R, HRI))
+            continue;
+        } else if (!Op.isRegMask()) {
+          // If I has a regmask assume dependency.
           continue;
+        }
         FoundSequentialDependence = true;
         break;
       }
diff --git a/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/lib/Target/Hexagon/HexagonVLIWPacketizer.h
index 6d708722979c17a0bd66c7ee33c2c364e8e2da80..3f28dc5b79cec78582211f86103e513837de4577 100644
--- a/lib/Target/Hexagon/HexagonVLIWPacketizer.h
+++ b/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -112,6 +112,7 @@ protected:
   void reserveResourcesForConstExt();
   bool hasDeadDependence(const MachineInstr &I, const MachineInstr &J);
   bool hasControlDependence(const MachineInstr &I, const MachineInstr &J);
+  bool hasRegMaskDependence(const MachineInstr &I, const MachineInstr &J);
   bool hasV4SpecificDependence(const MachineInstr &I, const MachineInstr &J);
   bool producesStall(const MachineInstr &MI);
 };
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index c140bd1d7ee273b8bb80ec94ee37b4de839b2fca..337af294eb861cb0e710355c9aa0844a2f418830 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -9,10 +9,10 @@
 
 #include "Hexagon.h"
 #include "HexagonFixupKinds.h"
-#include "HexagonMCTargetDesc.h"
 #include "MCTargetDesc/HexagonBaseInfo.h"
 #include "MCTargetDesc/HexagonMCChecker.h"
 #include "MCTargetDesc/HexagonMCCodeEmitter.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
 #include "MCTargetDesc/HexagonMCInstrInfo.h"
 #include "MCTargetDesc/HexagonMCShuffler.h"
 #include "llvm/MC/MCAsmBackend.h"
@@ -59,9 +59,10 @@ class HexagonAsmBackend : public MCAsmBackend {
     RF.getFixups() = Fixups;
   }
 public:
-  HexagonAsmBackend(const Target &T, uint8_t OSABI, StringRef CPU) :
-    OSABI(OSABI), MCII (T.createMCInstrInfo()), RelaxTarget(new MCInst *),
-    Extender(nullptr) {}
+  HexagonAsmBackend(const Target &T, const Triple &TT, uint8_t OSABI,
+      StringRef CPU) :
+      OSABI(OSABI), CPU(CPU), MCII(T.createMCInstrInfo()),
+      RelaxTarget(new MCInst *), Extender(nullptr) {}
 
   MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override {
     return createHexagonELFObjectWriter(OS, OSABI, CPU);
@@ -88,101 +89,101 @@ public:
       // This table *must* be in same the order of fixup_* kinds in
       // HexagonFixupKinds.h.
       //
-      // namei                          offset  bits  flags
-      { "fixup_Hexagon_B22_PCREL",        0,    32,   MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_Hexagon_B15_PCREL",        0,    32,   MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_Hexagon_B7_PCREL",         0,    32,   MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_Hexagon_LO16",             0,    32,   0 },
-      { "fixup_Hexagon_HI16",             0,    32,   0 },
-      { "fixup_Hexagon_32",               0,    32,   0 },
-      { "fixup_Hexagon_16",               0,    32,   0 },
-      { "fixup_Hexagon_8",                0,    32,   0 },
-      { "fixup_Hexagon_GPREL16_0",        0,    32,   0 },
-      { "fixup_Hexagon_GPREL16_1",        0,    32,   0 },
-      { "fixup_Hexagon_GPREL16_2",        0,    32,   0 },
-      { "fixup_Hexagon_GPREL16_3",        0,    32,   0 },
-      { "fixup_Hexagon_HL16",             0,    32,   0 },
-      { "fixup_Hexagon_B13_PCREL",        0,    32,   MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_Hexagon_B9_PCREL",         0,    32,   MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_Hexagon_B32_PCREL_X",      0,    32,   MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_Hexagon_32_6_X",           0,    32,   0 },
-      { "fixup_Hexagon_B22_PCREL_X",      0,    32,   MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_Hexagon_B15_PCREL_X",      0,    32,   MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_Hexagon_B13_PCREL_X",      0,    32,   MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_Hexagon_B9_PCREL_X",       0,    32,   MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_Hexagon_B7_PCREL_X",       0,    32,   MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_Hexagon_16_X",             0,    32,   0 },
-      { "fixup_Hexagon_12_X",             0,    32,   0 },
-      { "fixup_Hexagon_11_X",             0,    32,   0 },
-      { "fixup_Hexagon_10_X",             0,    32,   0 },
-      { "fixup_Hexagon_9_X",              0,    32,   0 },
-      { "fixup_Hexagon_8_X",              0,    32,   0 },
-      { "fixup_Hexagon_7_X",              0,    32,   0 },
-      { "fixup_Hexagon_6_X",              0,    32,   0 },
-      { "fixup_Hexagon_32_PCREL",         0,    32,   MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_Hexagon_COPY",             0,    32,   0 },
-      { "fixup_Hexagon_GLOB_DAT",         0,    32,   0 },
-      { "fixup_Hexagon_JMP_SLOT",         0,    32,   0 },
-      { "fixup_Hexagon_RELATIVE",         0,    32,   0 },
-      { "fixup_Hexagon_PLT_B22_PCREL",    0,    32,   MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_Hexagon_GOTREL_LO16",      0,    32,   0 },
-      { "fixup_Hexagon_GOTREL_HI16",      0,    32,   0 },
-      { "fixup_Hexagon_GOTREL_32",        0,    32,   0 },
-      { "fixup_Hexagon_GOT_LO16",         0,    32,   0 },
-      { "fixup_Hexagon_GOT_HI16",         0,    32,   0 },
-      { "fixup_Hexagon_GOT_32",           0,    32,   0 },
-      { "fixup_Hexagon_GOT_16",           0,    32,   0 },
-      { "fixup_Hexagon_DTPMOD_32",        0,    32,   0 },
-      { "fixup_Hexagon_DTPREL_LO16",      0,    32,   0 },
-      { "fixup_Hexagon_DTPREL_HI16",      0,    32,   0 },
-      { "fixup_Hexagon_DTPREL_32",        0,    32,   0 },
-      { "fixup_Hexagon_DTPREL_16",        0,    32,   0 },
-      { "fixup_Hexagon_GD_PLT_B22_PCREL", 0,    32,   MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_Hexagon_LD_PLT_B22_PCREL", 0,    32,   MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_Hexagon_GD_GOT_LO16",      0,    32,   0 },
-      { "fixup_Hexagon_GD_GOT_HI16",      0,    32,   0 },
-      { "fixup_Hexagon_GD_GOT_32",        0,    32,   0 },
-      { "fixup_Hexagon_GD_GOT_16",        0,    32,   0 },
-      { "fixup_Hexagon_LD_GOT_LO16",      0,    32,   0 },
-      { "fixup_Hexagon_LD_GOT_HI16",      0,    32,   0 },
-      { "fixup_Hexagon_LD_GOT_32",        0,    32,   0 },
-      { "fixup_Hexagon_LD_GOT_16",        0,    32,   0 },
-      { "fixup_Hexagon_IE_LO16",          0,    32,   0 },
-      { "fixup_Hexagon_IE_HI16",          0,    32,   0 },
-      { "fixup_Hexagon_IE_32",            0,    32,   0 },
-      { "fixup_Hexagon_IE_16",            0,    32,   0 },
-      { "fixup_Hexagon_IE_GOT_LO16",      0,    32,   0 },
-      { "fixup_Hexagon_IE_GOT_HI16",      0,    32,   0 },
-      { "fixup_Hexagon_IE_GOT_32",        0,    32,   0 },
-      { "fixup_Hexagon_IE_GOT_16",        0,    32,   0 },
-      { "fixup_Hexagon_TPREL_LO16",       0,    32,   0 },
-      { "fixup_Hexagon_TPREL_HI16",       0,    32,   0 },
-      { "fixup_Hexagon_TPREL_32",         0,    32,   0 },
-      { "fixup_Hexagon_TPREL_16",         0,    32,   0 },
-      { "fixup_Hexagon_6_PCREL_X",        0,    32,   MCFixupKindInfo::FKF_IsPCRel },
-      { "fixup_Hexagon_GOTREL_32_6_X",    0,    32,   0 },
-      { "fixup_Hexagon_GOTREL_16_X",      0,    32,   0 },
-      { "fixup_Hexagon_GOTREL_11_X",      0,    32,   0 },
-      { "fixup_Hexagon_GOT_32_6_X",       0,    32,   0 },
-      { "fixup_Hexagon_GOT_16_X",         0,    32,   0 },
-      { "fixup_Hexagon_GOT_11_X",         0,    32,   0 },
-      { "fixup_Hexagon_DTPREL_32_6_X",    0,    32,   0 },
-      { "fixup_Hexagon_DTPREL_16_X",      0,    32,   0 },
-      { "fixup_Hexagon_DTPREL_11_X",      0,    32,   0 },
-      { "fixup_Hexagon_GD_GOT_32_6_X",    0,    32,   0 },
-      { "fixup_Hexagon_GD_GOT_16_X",      0,    32,   0 },
-      { "fixup_Hexagon_GD_GOT_11_X",      0,    32,   0 },
-      { "fixup_Hexagon_LD_GOT_32_6_X",    0,    32,   0 },
-      { "fixup_Hexagon_LD_GOT_16_X",      0,    32,   0 },
-      { "fixup_Hexagon_LD_GOT_11_X",      0,    32,   0 },
-      { "fixup_Hexagon_IE_32_6_X",        0,    32,   0 },
-      { "fixup_Hexagon_IE_16_X",          0,    32,   0 },
-      { "fixup_Hexagon_IE_GOT_32_6_X",    0,    32,   0 },
-      { "fixup_Hexagon_IE_GOT_16_X",      0,    32,   0 },
-      { "fixup_Hexagon_IE_GOT_11_X",      0,    32,   0 },
-      { "fixup_Hexagon_TPREL_32_6_X",     0,    32,   0 },
-      { "fixup_Hexagon_TPREL_16_X",       0,    32,   0 },
-      { "fixup_Hexagon_TPREL_11_X",       0,    32,   0 }
+      // namei                          offset  bits    flags
+      { "fixup_Hexagon_B22_PCREL",      0,      32,     MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_B15_PCREL",      0,      32,     MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_B7_PCREL",       0,      32,     MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_LO16",           0,      32,     0 },
+      { "fixup_Hexagon_HI16",           0,      32,     0 },
+      { "fixup_Hexagon_32",             0,      32,     0 },
+      { "fixup_Hexagon_16",             0,      32,     0 },
+      { "fixup_Hexagon_8",              0,      32,     0 },
+      { "fixup_Hexagon_GPREL16_0",      0,      32,     0 },
+      { "fixup_Hexagon_GPREL16_1",      0,      32,     0 },
+      { "fixup_Hexagon_GPREL16_2",      0,      32,     0 },
+      { "fixup_Hexagon_GPREL16_3",      0,      32,     0 },
+      { "fixup_Hexagon_HL16",           0,      32,     0 },
+      { "fixup_Hexagon_B13_PCREL",      0,      32,     MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_B9_PCREL",       0,      32,     MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_B32_PCREL_X",    0,      32,     MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_32_6_X",         0,      32,     0 },
+      { "fixup_Hexagon_B22_PCREL_X",    0,      32,     MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_B15_PCREL_X",    0,      32,     MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_B13_PCREL_X",    0,      32,     MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_B9_PCREL_X",     0,      32,     MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_B7_PCREL_X",     0,      32,     MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_16_X",           0,      32,     0 },
+      { "fixup_Hexagon_12_X",           0,      32,     0 },
+      { "fixup_Hexagon_11_X",           0,      32,     0 },
+      { "fixup_Hexagon_10_X",           0,      32,     0 },
+      { "fixup_Hexagon_9_X",            0,      32,     0 },
+      { "fixup_Hexagon_8_X",            0,      32,     0 },
+      { "fixup_Hexagon_7_X",            0,      32,     0 },
+      { "fixup_Hexagon_6_X",            0,      32,     0 },
+      { "fixup_Hexagon_32_PCREL",       0,      32,     MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_COPY",           0,      32,     0 },
+      { "fixup_Hexagon_GLOB_DAT",       0,      32,     0 },
+      { "fixup_Hexagon_JMP_SLOT",       0,      32,     0 },
+      { "fixup_Hexagon_RELATIVE",       0,      32,     0 },
+      { "fixup_Hexagon_PLT_B22_PCREL",  0,      32,     MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_GOTREL_LO16",    0,      32,     0 },
+      { "fixup_Hexagon_GOTREL_HI16",    0,      32,     0 },
+      { "fixup_Hexagon_GOTREL_32",      0,      32,     0 },
+      { "fixup_Hexagon_GOT_LO16",       0,      32,     0 },
+      { "fixup_Hexagon_GOT_HI16",       0,      32,     0 },
+      { "fixup_Hexagon_GOT_32",         0,      32,     0 },
+      { "fixup_Hexagon_GOT_16",         0,      32,     0 },
+      { "fixup_Hexagon_DTPMOD_32",      0,      32,     0 },
+      { "fixup_Hexagon_DTPREL_LO16",    0,      32,     0 },
+      { "fixup_Hexagon_DTPREL_HI16",    0,      32,     0 },
+      { "fixup_Hexagon_DTPREL_32",      0,      32,     0 },
+      { "fixup_Hexagon_DTPREL_16",      0,      32,     0 },
+      { "fixup_Hexagon_GD_PLT_B22_PCREL",0,     32,     MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_LD_PLT_B22_PCREL",0,     32,     MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_GD_GOT_LO16",    0,      32,     0 },
+      { "fixup_Hexagon_GD_GOT_HI16",    0,      32,     0 },
+      { "fixup_Hexagon_GD_GOT_32",      0,      32,     0 },
+      { "fixup_Hexagon_GD_GOT_16",      0,      32,     0 },
+      { "fixup_Hexagon_LD_GOT_LO16",    0,      32,     0 },
+      { "fixup_Hexagon_LD_GOT_HI16",    0,      32,     0 },
+      { "fixup_Hexagon_LD_GOT_32",      0,      32,     0 },
+      { "fixup_Hexagon_LD_GOT_16",      0,      32,     0 },
+      { "fixup_Hexagon_IE_LO16",        0,      32,     0 },
+      { "fixup_Hexagon_IE_HI16",        0,      32,     0 },
+      { "fixup_Hexagon_IE_32",          0,      32,     0 },
+      { "fixup_Hexagon_IE_16",          0,      32,     0 },
+      { "fixup_Hexagon_IE_GOT_LO16",    0,      32,     0 },
+      { "fixup_Hexagon_IE_GOT_HI16",    0,      32,     0 },
+      { "fixup_Hexagon_IE_GOT_32",      0,      32,     0 },
+      { "fixup_Hexagon_IE_GOT_16",      0,      32,     0 },
+      { "fixup_Hexagon_TPREL_LO16",     0,      32,     0 },
+      { "fixup_Hexagon_TPREL_HI16",     0,      32,     0 },
+      { "fixup_Hexagon_TPREL_32",       0,      32,     0 },
+      { "fixup_Hexagon_TPREL_16",       0,      32,     0 },
+      { "fixup_Hexagon_6_PCREL_X",      0,      32,     MCFixupKindInfo::FKF_IsPCRel },
+      { "fixup_Hexagon_GOTREL_32_6_X",  0,      32,     0 },
+      { "fixup_Hexagon_GOTREL_16_X",    0,      32,     0 },
+      { "fixup_Hexagon_GOTREL_11_X",    0,      32,     0 },
+      { "fixup_Hexagon_GOT_32_6_X",     0,      32,     0 },
+      { "fixup_Hexagon_GOT_16_X",       0,      32,     0 },
+      { "fixup_Hexagon_GOT_11_X",       0,      32,     0 },
+      { "fixup_Hexagon_DTPREL_32_6_X",  0,      32,     0 },
+      { "fixup_Hexagon_DTPREL_16_X",    0,      32,     0 },
+      { "fixup_Hexagon_DTPREL_11_X",    0,      32,     0 },
+      { "fixup_Hexagon_GD_GOT_32_6_X",  0,      32,     0 },
+      { "fixup_Hexagon_GD_GOT_16_X",    0,      32,     0 },
+      { "fixup_Hexagon_GD_GOT_11_X",    0,      32,     0 },
+      { "fixup_Hexagon_LD_GOT_32_6_X",  0,      32,     0 },
+      { "fixup_Hexagon_LD_GOT_16_X",    0,      32,     0 },
+      { "fixup_Hexagon_LD_GOT_11_X",    0,      32,     0 },
+      { "fixup_Hexagon_IE_32_6_X",      0,      32,     0 },
+      { "fixup_Hexagon_IE_16_X",        0,      32,     0 },
+      { "fixup_Hexagon_IE_GOT_32_6_X",  0,      32,     0 },
+      { "fixup_Hexagon_IE_GOT_16_X",    0,      32,     0 },
+      { "fixup_Hexagon_IE_GOT_11_X",    0,      32,     0 },
+      { "fixup_Hexagon_TPREL_32_6_X",   0,      32,     0 },
+      { "fixup_Hexagon_TPREL_16_X",     0,      32,     0 },
+      { "fixup_Hexagon_TPREL_11_X",     0,      32,     0 }
     };
 
     if (Kind < FirstTargetFixupKind)
@@ -401,7 +402,8 @@ public:
   /// data fragment, at the offset specified by the fixup and following the
   /// fixup kind as appropriate.
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t FixupValue, bool IsPCRel) const override {
+                  uint64_t FixupValue, bool IsPCRel,
+                  MCContext &Ctx) const override {
 
     // When FixupValue is 0 the relocation is external and there
     // is nothing for us to do.
@@ -524,10 +526,9 @@ public:
     bool Relaxable = false;
     // Branches and loop-setup insns are handled as necessary by relaxation.
     if (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeJ ||
-        (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) ==
-             HexagonII::TypeCOMPOUND &&
+        (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeCJ &&
          MCID.isBranch()) ||
-        (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeNV &&
+        (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeNCJ &&
          MCID.isBranch()) ||
         (llvm::HexagonMCInstrInfo::getType(*MCII, HMI) == HexagonII::TypeCR &&
          HMI.getOpcode() != Hexagon::C4_addipc))
@@ -724,7 +725,8 @@ public:
                   Size = 0;
                 }
               }
-              bool Error = HexagonMCShuffle(*MCII, RF.getSubtargetInfo(), Inst);
+              bool Error = HexagonMCShuffle(true, *MCII, RF.getSubtargetInfo(),
+                                            Inst);
               //assert(!Error);
               (void)Error;
               ReplaceInstruction(Asm.getEmitter(), RF, Inst);
@@ -739,15 +741,17 @@ public:
       }
     }
   }
-};
-} // end anonymous namespace
+}; // class HexagonAsmBackend
 
-namespace llvm {
-MCAsmBackend *createHexagonAsmBackend(Target const &T,
+} // namespace
+
+// MCAsmBackend
+MCAsmBackend *llvm::createHexagonAsmBackend(Target const &T,
                                       MCRegisterInfo const & /*MRI*/,
                                       const Triple &TT, StringRef CPU,
                                       const MCTargetOptions &Options) {
   uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(TT.getOS());
-  return new HexagonAsmBackend(T, OSABI, CPU);
-}
+
+  StringRef CPUString = Hexagon_MC::selectHexagonCPU(TT, CPU);
+  return new HexagonAsmBackend(T, TT, OSABI, CPUString);
 }
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index 4292f6b3faa4c6763a67a81e048ef6f4b62de054..9c80312b790da159f156b360a92aad89e33df9d2 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -17,6 +17,7 @@
 #ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONBASEINFO_H
 #define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONBASEINFO_H
 
+#include "HexagonDepITypes.h"
 #include "HexagonMCTargetDesc.h"
 #include "llvm/Support/ErrorHandling.h"
 #include <stdint.h>
@@ -27,57 +28,14 @@ namespace llvm {
 /// instruction info tracks.
 ///
 namespace HexagonII {
-  // *** The code below must match HexagonInstrFormat*.td *** //
-
-  // Insn types.
-  // *** Must match HexagonInstrFormat*.td ***
-  enum Type {
-    TypePSEUDO  = 0,
-    TypeALU32   = 1,
-    TypeCR      = 2,
-    TypeJR      = 3,
-    TypeJ       = 4,
-    TypeLD      = 5,
-    TypeST      = 6,
-    TypeSYSTEM  = 7,
-    TypeXTYPE   = 8,
-    TypeV4LDST  = 9,
-    TypeNV      = 10,
-    TypeDUPLEX  = 11,
-    TypeCOMPOUND = 12,
-    TypeCVI_FIRST     = 13,
-    TypeCVI_VA        = TypeCVI_FIRST,
-    TypeCVI_VA_DV     = 14,
-    TypeCVI_VX        = 15,
-    TypeCVI_VX_DV     = 16,
-    TypeCVI_VP        = 17,
-    TypeCVI_VP_VS     = 18,
-    TypeCVI_VS        = 19,
-    TypeCVI_VINLANESAT= 20,
-    TypeCVI_VM_LD     = 21,
-    TypeCVI_VM_TMP_LD = 22,
-    TypeCVI_VM_CUR_LD = 23,
-    TypeCVI_VM_VP_LDU = 24,
-    TypeCVI_VM_ST     = 25,
-    TypeCVI_VM_NEW_ST = 26,
-    TypeCVI_VM_STU    = 27,
-    TypeCVI_HIST      = 28,
-    TypeCVI_LAST      = TypeCVI_HIST,
-    TypePREFIX  = 30, // Such as extenders.
-    TypeENDLOOP = 31  // Such as end of a HW loop.
-  };
+  unsigned const TypeCVI_FIRST = TypeCVI_HIST;
+  unsigned const TypeCVI_LAST = TypeCVI_VX_DV;
 
   enum SubTarget {
-    HasV2SubT     = 0xf,
-    HasV2SubTOnly = 0x1,
-    NoV2SubT      = 0x0,
-    HasV3SubT     = 0xe,
-    HasV3SubTOnly = 0x2,
-    NoV3SubT      = 0x1,
-    HasV4SubT     = 0xc,
-    NoV4SubT      = 0x3,
-    HasV5SubT     = 0x8,
-    NoV5SubT      = 0x7
+    HasV4SubT     = 0x3f,
+    HasV5SubT     = 0x3e,
+    HasV55SubT    = 0x3c,
+    HasV60SubT    = 0x38,
   };
 
   enum AddrMode {
@@ -107,102 +65,101 @@ namespace HexagonII {
   enum {
     // This 5-bit field describes the insn type.
     TypePos  = 0,
-    TypeMask = 0x1f,
+    TypeMask = 0x3f,
 
     // Solo instructions.
-    SoloPos  = 5,
+    SoloPos  = 6,
     SoloMask = 0x1,
     // Packed only with A or X-type instructions.
-    SoloAXPos  = 6,
+    SoloAXPos  = 7,
     SoloAXMask = 0x1,
     // Only A-type instruction in first slot or nothing.
-    SoloAin1Pos  = 7,
+    SoloAin1Pos  = 8,
     SoloAin1Mask = 0x1,
 
     // Predicated instructions.
-    PredicatedPos  = 8,
+    PredicatedPos  = 9,
     PredicatedMask = 0x1,
-    PredicatedFalsePos  = 9,
+    PredicatedFalsePos  = 10,
     PredicatedFalseMask = 0x1,
-    PredicatedNewPos  = 10,
+    PredicatedNewPos  = 11,
     PredicatedNewMask = 0x1,
-    PredicateLatePos  = 11,
+    PredicateLatePos  = 12,
     PredicateLateMask = 0x1,
 
     // New-Value consumer instructions.
-    NewValuePos  = 12,
+    NewValuePos  = 13,
     NewValueMask = 0x1,
     // New-Value producer instructions.
-    hasNewValuePos  = 13,
+    hasNewValuePos  = 14,
     hasNewValueMask = 0x1,
     // Which operand consumes or produces a new value.
-    NewValueOpPos  = 14,
+    NewValueOpPos  = 15,
     NewValueOpMask = 0x7,
     // Stores that can become new-value stores.
-    mayNVStorePos  = 17,
+    mayNVStorePos  = 18,
     mayNVStoreMask = 0x1,
     // New-value store instructions.
-    NVStorePos  = 18,
+    NVStorePos  = 19,
     NVStoreMask = 0x1,
     // Loads that can become current-value loads.
-    mayCVLoadPos  = 19,
+    mayCVLoadPos  = 20,
     mayCVLoadMask = 0x1,
     // Current-value load instructions.
-    CVLoadPos  = 20,
+    CVLoadPos  = 21,
     CVLoadMask = 0x1,
 
     // Extendable insns.
-    ExtendablePos  = 21,
+    ExtendablePos  = 22,
     ExtendableMask = 0x1,
     // Insns must be extended.
-    ExtendedPos  = 22,
+    ExtendedPos  = 23,
     ExtendedMask = 0x1,
     // Which operand may be extended.
-    ExtendableOpPos  = 23,
+    ExtendableOpPos  = 24,
     ExtendableOpMask = 0x7,
     // Signed or unsigned range.
-    ExtentSignedPos  = 26,
+    ExtentSignedPos  = 27,
     ExtentSignedMask = 0x1,
     // Number of bits of range before extending operand.
-    ExtentBitsPos  = 27,
+    ExtentBitsPos  = 28,
     ExtentBitsMask = 0x1f,
     // Alignment power-of-two before extending operand.
-    ExtentAlignPos  = 32,
+    ExtentAlignPos  = 33,
     ExtentAlignMask = 0x3,
 
     // Valid subtargets
-    validSubTargetPos  = 34,
-    validSubTargetMask = 0xf,
+    validSubTargetPos  = 35,
+    validSubTargetMask = 0x3f,
 
     // Addressing mode for load/store instructions.
-    AddrModePos  = 40,
+    AddrModePos  = 41,
     AddrModeMask = 0x7,
     // Access size for load/store instructions.
-    MemAccessSizePos = 43,
+    MemAccessSizePos = 44,
     MemAccesSizeMask = 0xf,
 
     // Branch predicted taken.
-    TakenPos = 47,
+    TakenPos = 48,
     TakenMask = 0x1,
 
     // Floating-point instructions.
-    FPPos  = 48,
+    FPPos  = 49,
     FPMask = 0x1,
 
     // New-Value producer-2 instructions.
-    hasNewValuePos2  = 50,
+    hasNewValuePos2  = 51,
     hasNewValueMask2 = 0x1,
-
     // Which operand consumes or produces a new value.
-    NewValueOpPos2  = 51,
+    NewValueOpPos2  = 52,
     NewValueOpMask2 = 0x7,
 
     // Accumulator instructions.
-    AccumulatorPos = 54,
+    AccumulatorPos = 55,
     AccumulatorMask = 0x1,
 
     // Complex XU, prevent xu competition by preferring slot3
-    PrefersSlot3Pos = 55,
+    PrefersSlot3Pos = 56,
     PrefersSlot3Mask = 0x1,
 
     CofMax1Pos = 60,
@@ -217,8 +174,6 @@ namespace HexagonII {
     // Hexagon Specific MachineOperand flags.
     MO_NO_FLAG,
 
-    HMOTF_ConstExtended = 1,
-
     /// MO_PCREL - On a symbol operand, indicates a PC-relative relocation
     /// Used for computing a global address for PIC compilations
     MO_PCREL,
@@ -250,7 +205,13 @@ namespace HexagonII {
 
     // MO_TPREL - indicates relocation for TLS
     // local Executable method
-    MO_TPREL
+    MO_TPREL,
+
+    // HMOTF_ConstExtended
+    // Addendum to above, indicates a const extended op
+    // Can be used as a mask.
+    HMOTF_ConstExtended = 0x80
+
   };
 
   // Hexagon Sub-instruction classes.
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
index 42fcc5a6aa89a718a32bb8e18c43082391e9da0a..dd790fd41257d40b4fc4ee4f5a769189c01fd95b 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.cpp
@@ -125,46 +125,6 @@ void HexagonInstPrinter::printNOneImmOperand(MCInst const *MI, unsigned OpNo,
   O << -1;
 }
 
-void HexagonInstPrinter::prints3_6ImmOperand(MCInst const *MI, unsigned OpNo,
-                                             raw_ostream &O) const {
-  int64_t Imm;
-  bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
-  Imm = SignExtend64<9>(Imm);
-  assert(Success); (void)Success;
-  assert(((Imm & 0x3f) == 0) && "Lower 6 bits must be ZERO.");
-  O << formatImm(Imm/64);
-}
-
-void HexagonInstPrinter::prints3_7ImmOperand(MCInst const *MI, unsigned OpNo,
-                                             raw_ostream &O) const {
-  int64_t Imm;
-  bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
-  Imm = SignExtend64<10>(Imm);
-  assert(Success); (void)Success;
-  assert(((Imm & 0x7f) == 0) && "Lower 7 bits must be ZERO.");
-  O << formatImm(Imm/128);
-}
-
-void HexagonInstPrinter::prints4_6ImmOperand(MCInst const *MI, unsigned OpNo,
-                                             raw_ostream &O) const {
-  int64_t Imm;
-  bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
-  Imm = SignExtend64<10>(Imm);
-  assert(Success); (void)Success;
-  assert(((Imm & 0x3f) == 0) && "Lower 6 bits must be ZERO.");
-  O << formatImm(Imm/64);
-}
-
-void HexagonInstPrinter::prints4_7ImmOperand(MCInst const *MI, unsigned OpNo,
-                                             raw_ostream &O) const {
-  int64_t Imm;
-  bool Success = MI->getOperand(OpNo).getExpr()->evaluateAsAbsolute(Imm);
-  Imm = SignExtend64<11>(Imm);
-  assert(Success); (void)Success;
-  assert(((Imm & 0x7f) == 0) && "Lower 7 bits must be ZERO.");
-  O << formatImm(Imm/128);
-}
-
 void HexagonInstPrinter::printGlobalOperand(MCInst const *MI, unsigned OpNo,
                                             raw_ostream &O) const {
   printOperand(MI, OpNo, O);
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
index 5f421184b20a1b7af7eefdb3597fa60d8f93cee6..ac8e391905e0703d3aeca1adba40432a8cd7bdf7 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonInstPrinter.h
@@ -44,14 +44,6 @@ public:
                           raw_ostream &O) const;
   void printNOneImmOperand(MCInst const *MI, unsigned OpNo,
                            raw_ostream &O) const;
-  void prints3_6ImmOperand(MCInst const *MI, unsigned OpNo,
-                           raw_ostream &O) const;
-  void prints3_7ImmOperand(MCInst const *MI, unsigned OpNo,
-                           raw_ostream &O) const;
-  void prints4_6ImmOperand(MCInst const *MI, unsigned OpNo,
-                           raw_ostream &O) const;
-  void prints4_7ImmOperand(MCInst const *MI, unsigned OpNo,
-                           raw_ostream &O) const;
   void printBranchOperand(MCInst const *MI, unsigned OpNo,
                           raw_ostream &O) const;
   void printCallOperand(MCInst const *MI, unsigned OpNo, raw_ostream &O) const;
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index c619c36164cfc2b308f5219b9bf02234f9f7141d..446b3b2ce668a77af128b30f2cdf78bb7c15fa6c 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -23,6 +23,7 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(const Triple &TT) {
   Data32bitsDirective = "\t.word\t";
   Data64bitsDirective = nullptr;  // .xword is only supported by V9.
   CommentString = "//";
+  SupportsDebugInformation = true;
 
   LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
   InlineAsmStart = "# InlineAsm Start";
@@ -30,8 +31,8 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(const Triple &TT) {
   ZeroDirective = "\t.space\t";
   AscizDirective = "\t.string\t";
 
-  SupportsDebugInformation = true;
   MinInstAlignment = 4;
   UsesELFSectionDirectiveForBSS  = true;
   ExceptionsType = ExceptionHandling::DwarfCFI;
+  UseLogicalShr = false;
 }
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
index 07c9ad96a0d776936e2bd0d2d87bc644fda78687..62b21c419f30334bf75d890b63b0fbcc5e94f94a 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
@@ -47,12 +47,40 @@ void HexagonMCChecker::init() {
   if (HexagonMCInstrInfo::isBundle(MCB))
     // Unfurl a bundle.
     for (auto const&I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
-      init(*I.getInst());
+      MCInst const &Inst = *I.getInst();
+      if (HexagonMCInstrInfo::isDuplex(MCII, Inst)) {
+        init(*Inst.getOperand(0).getInst());
+        init(*Inst.getOperand(1).getInst());
+      }
+      else
+        init(Inst);
     }
   else
     init(MCB);
 }
 
+void HexagonMCChecker::initReg(MCInst const &MCI, unsigned R, unsigned &PredReg,
+                               bool &isTrue) {
+  if (HexagonMCInstrInfo::isPredicated(MCII, MCI) && isPredicateRegister(R)) {
+    // Note a used predicate register.
+    PredReg = R;
+    isTrue = HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI);
+
+    // Note use of new predicate register.
+    if (HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
+      NewPreds.insert(PredReg);
+  }
+  else
+    // Note register use.  Super-registers are not tracked directly,
+    // but their components.
+    for(MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
+        SRI.isValid();
+        ++SRI)
+      if (!MCSubRegIterator(*SRI, &RI).isValid())
+        // Skip super-registers used indirectly.
+        Uses.insert(*SRI);
+}
+
 void HexagonMCChecker::init(MCInst const& MCI) {
   const MCInstrDesc& MCID = HexagonMCInstrInfo::getDesc(MCII, MCI);
   unsigned PredReg = Hexagon::NoRegister;
@@ -60,28 +88,10 @@ void HexagonMCChecker::init(MCInst const& MCI) {
 
   // Get used registers.
   for (unsigned i = MCID.getNumDefs(); i < MCID.getNumOperands(); ++i)
-    if (MCI.getOperand(i).isReg()) {
-      unsigned R = MCI.getOperand(i).getReg();
-
-      if (HexagonMCInstrInfo::isPredicated(MCII, MCI) && isPredicateRegister(R)) {
-        // Note an used predicate register.
-        PredReg = R;
-        isTrue = HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI);
-
-        // Note use of new predicate register.
-        if (HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
-          NewPreds.insert(PredReg);
-      }
-      else
-        // Note register use.  Super-registers are not tracked directly,
-        // but their components.
-        for(MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
-           SRI.isValid();
-           ++SRI)
-         if (!MCSubRegIterator(*SRI, &RI).isValid())
-           // Skip super-registers used indirectly.
-           Uses.insert(*SRI);
-    }
+    if (MCI.getOperand(i).isReg())
+      initReg(MCI, MCI.getOperand(i).getReg(), PredReg, isTrue);
+  for (unsigned i = 0; i < MCID.getNumImplicitUses(); ++i)
+    initReg(MCI, MCID.getImplicitUses()[i], PredReg, isTrue);
 
   // Get implicit register definitions.
   if (const MCPhysReg *ImpDef = MCID.getImplicitDefs())
@@ -216,9 +226,11 @@ void HexagonMCChecker::init(MCInst const& MCI) {
     if (!MCSubRegIterator(N, &RI).isValid()) {
       // Super-registers cannot use new values.
       if (MCID.isBranch())
-        NewUses[N] = NewSense::Jmp(llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNV);
+        NewUses[N] = NewSense::Jmp(
+          llvm::HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNCJ);
       else
-        NewUses[N] = NewSense::Use(PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI));
+        NewUses[N] = NewSense::Use(
+          PredReg, HexagonMCInstrInfo::isPredicatedTrue(MCII, MCI));
     }
   }
 }
@@ -230,14 +242,18 @@ HexagonMCChecker::HexagonMCChecker(MCInstrInfo const &MCII, MCSubtargetInfo cons
   init();
 }
 
-bool HexagonMCChecker::check() {
+bool HexagonMCChecker::check(bool FullCheck) {
   bool chkB = checkBranches();
   bool chkP = checkPredicates();
   bool chkNV = checkNewValues();
   bool chkR = checkRegisters();
   bool chkS = checkSolo();
-  bool chkSh = checkShuffle();
-  bool chkSl = checkSlots();
+  bool chkSh = true;
+  if (FullCheck)
+   chkSh = checkShuffle();
+  bool chkSl = true;
+  if (FullCheck)
+   chkSl = checkSlots();
   bool chk = chkB && chkP && chkNV && chkR && chkS && chkSh && chkSl;
 
   return chk;
@@ -271,8 +287,8 @@ bool HexagonMCChecker::checkBranches() {
   HexagonMCErrInfo errInfo;
   if (HexagonMCInstrInfo::isBundle(MCB)) {
     bool hasConditional = false;
-    unsigned Branches = 0, Returns = 0, NewIndirectBranches = 0,
-             NewValueBranches = 0, Conditional = HEXAGON_PRESHUFFLE_PACKET_SIZE,
+    unsigned Branches = 0,
+             Conditional = HEXAGON_PRESHUFFLE_PACKET_SIZE,
              Unconditional = HEXAGON_PRESHUFFLE_PACKET_SIZE;
 
     for (unsigned i = HexagonMCInstrInfo::bundleInstructionsOffset;
@@ -284,12 +300,6 @@ bool HexagonMCChecker::checkBranches() {
       if (HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch() ||
           HexagonMCInstrInfo::getDesc(MCII, MCI).isCall()) {
         ++Branches;
-        if (HexagonMCInstrInfo::getDesc(MCII, MCI).isIndirectBranch() &&
-            HexagonMCInstrInfo::isPredicatedNew(MCII, MCI))
-          ++NewIndirectBranches;
-        if (HexagonMCInstrInfo::isNewValue(MCII, MCI))
-          ++NewValueBranches;
-
         if (HexagonMCInstrInfo::isPredicated(MCII, MCI) ||
             HexagonMCInstrInfo::isPredicatedNew(MCII, MCI)) {
           hasConditional = true;
@@ -298,9 +308,6 @@ bool HexagonMCChecker::checkBranches() {
           Unconditional = i; // Record the position of the unconditional branch.
         }
       }
-      if (HexagonMCInstrInfo::getDesc(MCII, MCI).isReturn() &&
-          HexagonMCInstrInfo::getDesc(MCII, MCI).mayLoad())
-        ++Returns;
     }
 
     if (Branches) // FIXME: should "Defs.count(Hexagon::PC)" be here too?
@@ -504,7 +511,7 @@ bool HexagonMCChecker::checkShuffle() {
   HexagonMCErrInfo errInfo;
   // Branch info is lost when duplexing. The unduplexed insns must be
   // checked and only branch errors matter for this case.
-  HexagonMCShuffler MCS(MCII, STI, MCB);
+  HexagonMCShuffler MCS(true, MCII, STI, MCB);
   if (!MCS.check()) {
     if (MCS.getError() == HexagonShuffler::SHUFFLE_ERROR_BRANCHES) {
       errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SHUFFLE);
@@ -513,7 +520,7 @@ bool HexagonMCChecker::checkShuffle() {
       return false;
     }
   }
-  HexagonMCShuffler MCSDX(MCII, STI, MCBDX);
+  HexagonMCShuffler MCSDX(true, MCII, STI, MCBDX);
   if (!MCSDX.check()) {
     errInfo.setError(HexagonMCErrInfo::CHECK_ERROR_SHUFFLE);
     errInfo.setShuffleError(MCSDX.getError());
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
index 33e22798c9544929189b5382f4d51fc4385fed29..c3b3d4c14c8812089a0769a60e1c4b8710981e41 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
@@ -168,6 +168,7 @@ class HexagonMCChecker {
 
   void init();
   void init(MCInst const&);
+  void initReg(MCInst const &, unsigned, unsigned &PredReg, bool &isTrue);
 
   // Checks performed.
   bool checkBranches();
@@ -177,6 +178,7 @@ class HexagonMCChecker {
   bool checkSolo();
   bool checkShuffle();
   bool checkSlots();
+  bool checkSize();
 
   static void compoundRegisterMap(unsigned&);
 
@@ -196,7 +198,7 @@ class HexagonMCChecker {
   explicit HexagonMCChecker(MCInstrInfo const &MCII, MCSubtargetInfo const &STI, MCInst& mcb, MCInst &mcbdx,
                             const MCRegisterInfo& ri);
 
-  bool check();
+  bool check(bool FullCheck = true);
 
   /// add a new error/warning
   void addErrInfo(HexagonMCErrInfo &err) { ErrInfoQ.push(err.s); };
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
index 2645a17b9bd081814980bd71f2b62a6a55d8841a..c0956520de738a6b6eacbc010bc8a0dd1c6c5032 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
@@ -35,38 +35,40 @@ STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
 HexagonMCCodeEmitter::HexagonMCCodeEmitter(MCInstrInfo const &aMII,
                                            MCContext &aMCT)
     : MCT(aMCT), MCII(aMII), Addend(new unsigned(0)),
-      Extended(new bool(false)), CurrentBundle(new MCInst const *) {}
+      Extended(new bool(false)), CurrentBundle(new MCInst const *),
+      CurrentIndex(new size_t(0)) {}
 
-uint32_t HexagonMCCodeEmitter::parseBits(size_t Instruction, size_t Last,
+uint32_t HexagonMCCodeEmitter::parseBits(size_t Last,
                                          MCInst const &MCB,
                                          MCInst const &MCI) const {
   bool Duplex = HexagonMCInstrInfo::isDuplex(MCII, MCI);
-  if (Instruction == 0) {
+  if (*CurrentIndex == 0) {
     if (HexagonMCInstrInfo::isInnerLoop(MCB)) {
       assert(!Duplex);
-      assert(Instruction != Last);
+      assert(*CurrentIndex != Last);
       return HexagonII::INST_PARSE_LOOP_END;
     }
   }
-  if (Instruction == 1) {
+  if (*CurrentIndex == 1) {
     if (HexagonMCInstrInfo::isOuterLoop(MCB)) {
       assert(!Duplex);
-      assert(Instruction != Last);
+      assert(*CurrentIndex != Last);
       return HexagonII::INST_PARSE_LOOP_END;
     }
   }
   if (Duplex) {
-    assert(Instruction == Last);
+    assert(*CurrentIndex == Last);
     return HexagonII::INST_PARSE_DUPLEX;
   }
-  if(Instruction == Last)
+  if(*CurrentIndex == Last)
     return HexagonII::INST_PARSE_PACKET_END;
   return HexagonII::INST_PARSE_NOT_END;
 }
 
-void HexagonMCCodeEmitter::encodeInstruction(MCInst const &MI, raw_ostream &OS,
+/// encodeInstruction - Emit the bundle
+void HexagonMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
                                              SmallVectorImpl<MCFixup> &Fixups,
-                                             MCSubtargetInfo const &STI) const {
+                                             const MCSubtargetInfo &STI) const {
   MCInst &HMB = const_cast<MCInst &>(MI);
 
   assert(HexagonMCInstrInfo::isBundle(HMB));
@@ -74,7 +76,7 @@ void HexagonMCCodeEmitter::encodeInstruction(MCInst const &MI, raw_ostream &OS,
   *Addend = 0;
   *Extended = false;
   *CurrentBundle = &MI;
-  size_t Instruction = 0;
+  *CurrentIndex = 0;
   size_t Last = HexagonMCInstrInfo::bundleSize(HMB) - 1;
   for (auto &I : HexagonMCInstrInfo::bundleInstructions(HMB)) {
     MCInst &HMI = const_cast<MCInst &>(*I.getInst());
@@ -82,11 +84,10 @@ void HexagonMCCodeEmitter::encodeInstruction(MCInst const &MI, raw_ostream &OS,
                                 computeAvailableFeatures(STI.getFeatureBits()));
 
     EncodeSingleInstruction(HMI, OS, Fixups, STI,
-                            parseBits(Instruction, Last, HMB, HMI),
-                            Instruction);
+                            parseBits(Last, HMB, HMI));
     *Extended = HexagonMCInstrInfo::isImmext(HMI);
     *Addend += HEXAGON_INSTR_SIZE;
-    ++Instruction;
+    ++*CurrentIndex;
   }
   return;
 }
@@ -107,165 +108,44 @@ static bool RegisterMatches(unsigned Consumer, unsigned Producer,
 /// EncodeSingleInstruction - Emit a single
 void HexagonMCCodeEmitter::EncodeSingleInstruction(
     const MCInst &MI, raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
-    const MCSubtargetInfo &STI, uint32_t Parse, size_t Index) const {
-  MCInst HMB = MI;
-  assert(!HexagonMCInstrInfo::isBundle(HMB));
+    const MCSubtargetInfo &STI, uint32_t Parse) const {
+  assert(!HexagonMCInstrInfo::isBundle(MI));
   uint64_t Binary;
 
-  // Compound instructions are limited to using registers 0-7 and 16-23
-  // and here we make a map 16-23 to 8-15 so they can be correctly encoded.
-  static unsigned RegMap[8] = {Hexagon::R8,  Hexagon::R9,  Hexagon::R10,
-                               Hexagon::R11, Hexagon::R12, Hexagon::R13,
-                               Hexagon::R14, Hexagon::R15};
-
   // Pseudo instructions don't get encoded and shouldn't be here
   // in the first place!
-  assert(!HexagonMCInstrInfo::getDesc(MCII, HMB).isPseudo() &&
+  assert(!HexagonMCInstrInfo::getDesc(MCII, MI).isPseudo() &&
          "pseudo-instruction found");
   DEBUG(dbgs() << "Encoding insn"
-                  " `" << HexagonMCInstrInfo::getName(MCII, HMB) << "'"
+                  " `" << HexagonMCInstrInfo::getName(MCII, MI) << "'"
                                                                     "\n");
 
-  if (llvm::HexagonMCInstrInfo::getType(MCII, HMB) == HexagonII::TypeCOMPOUND) {
-    for (unsigned i = 0; i < HMB.getNumOperands(); ++i)
-      if (HMB.getOperand(i).isReg()) {
-        unsigned Reg =
-            MCT.getRegisterInfo()->getEncodingValue(HMB.getOperand(i).getReg());
-        if ((Reg <= 23) && (Reg >= 16))
-          HMB.getOperand(i).setReg(RegMap[Reg - 16]);
-      }
-  }
-
-  if (HexagonMCInstrInfo::isNewValue(MCII, HMB)) {
-    // Calculate the new value distance to the associated producer
-    MCOperand &MCO =
-        HMB.getOperand(HexagonMCInstrInfo::getNewValueOp(MCII, HMB));
-    unsigned SOffset = 0;
-    unsigned VOffset = 0;
-    unsigned Register = MCO.getReg();
-    unsigned Register1;
-    unsigned Register2;
-    auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
-    auto i = Instructions.begin() + Index - 1;
-    for (;; --i) {
-      assert(i != Instructions.begin() - 1 && "Couldn't find producer");
-      MCInst const &Inst = *i->getInst();
-      if (HexagonMCInstrInfo::isImmext(Inst))
-        continue;
-      ++SOffset;
-      if (HexagonMCInstrInfo::isVector(MCII, Inst))
-        // Vector instructions don't count scalars
-        ++VOffset;
-      Register1 =
-          HexagonMCInstrInfo::hasNewValue(MCII, Inst)
-              ? HexagonMCInstrInfo::getNewValueOperand(MCII, Inst).getReg()
-              : static_cast<unsigned>(Hexagon::NoRegister);
-      Register2 =
-          HexagonMCInstrInfo::hasNewValue2(MCII, Inst)
-              ? HexagonMCInstrInfo::getNewValueOperand2(MCII, Inst).getReg()
-              : static_cast<unsigned>(Hexagon::NoRegister);
-      if (!RegisterMatches(Register, Register1, Register2))
-        // This isn't the register we're looking for
-        continue;
-      if (!HexagonMCInstrInfo::isPredicated(MCII, Inst))
-        // Producer is unpredicated
-        break;
-      assert(HexagonMCInstrInfo::isPredicated(MCII, HMB) &&
-             "Unpredicated consumer depending on predicated producer");
-      if (HexagonMCInstrInfo::isPredicatedTrue(MCII, Inst) ==
-          HexagonMCInstrInfo::isPredicatedTrue(MCII, HMB))
-        // Producer predicate sense matched ours
-        break;
-    }
-    // Hexagon PRM 10.11 Construct Nt from distance
-    unsigned Offset =
-        HexagonMCInstrInfo::isVector(MCII, HMB) ? VOffset : SOffset;
-    Offset <<= 1;
-    Offset |=
-        HexagonMCInstrInfo::SubregisterBit(Register, Register1, Register2);
-    MCO.setReg(Offset + Hexagon::R0);
-  }
-
-  Binary = getBinaryCodeForInstr(HMB, Fixups, STI);
+  Binary = getBinaryCodeForInstr(MI, Fixups, STI);
   // Check for unimplemented instructions. Immediate extenders
   // are encoded as zero, so they need to be accounted for.
-  if ((!Binary) &&
-      ((HMB.getOpcode() != DuplexIClass0) && (HMB.getOpcode() != A4_ext) &&
-       (HMB.getOpcode() != A4_ext_b) && (HMB.getOpcode() != A4_ext_c) &&
-       (HMB.getOpcode() != A4_ext_g))) {
+  if (!Binary &&
+      MI.getOpcode() != DuplexIClass0 &&
+      MI.getOpcode() != A4_ext) {
     DEBUG(dbgs() << "Unimplemented inst: "
-                    " `" << HexagonMCInstrInfo::getName(MCII, HMB) << "'"
+                    " `" << HexagonMCInstrInfo::getName(MCII, MI) << "'"
                                                                       "\n");
     llvm_unreachable("Unimplemented Instruction");
   }
   Binary |= Parse;
 
   // if we need to emit a duplexed instruction
-  if (HMB.getOpcode() >= Hexagon::DuplexIClass0 &&
-      HMB.getOpcode() <= Hexagon::DuplexIClassF) {
+  if (MI.getOpcode() >= Hexagon::DuplexIClass0 &&
+      MI.getOpcode() <= Hexagon::DuplexIClassF) {
     assert(Parse == HexagonII::INST_PARSE_DUPLEX &&
            "Emitting duplex without duplex parse bits");
-    unsigned dupIClass;
-    switch (HMB.getOpcode()) {
-    case Hexagon::DuplexIClass0:
-      dupIClass = 0;
-      break;
-    case Hexagon::DuplexIClass1:
-      dupIClass = 1;
-      break;
-    case Hexagon::DuplexIClass2:
-      dupIClass = 2;
-      break;
-    case Hexagon::DuplexIClass3:
-      dupIClass = 3;
-      break;
-    case Hexagon::DuplexIClass4:
-      dupIClass = 4;
-      break;
-    case Hexagon::DuplexIClass5:
-      dupIClass = 5;
-      break;
-    case Hexagon::DuplexIClass6:
-      dupIClass = 6;
-      break;
-    case Hexagon::DuplexIClass7:
-      dupIClass = 7;
-      break;
-    case Hexagon::DuplexIClass8:
-      dupIClass = 8;
-      break;
-    case Hexagon::DuplexIClass9:
-      dupIClass = 9;
-      break;
-    case Hexagon::DuplexIClassA:
-      dupIClass = 10;
-      break;
-    case Hexagon::DuplexIClassB:
-      dupIClass = 11;
-      break;
-    case Hexagon::DuplexIClassC:
-      dupIClass = 12;
-      break;
-    case Hexagon::DuplexIClassD:
-      dupIClass = 13;
-      break;
-    case Hexagon::DuplexIClassE:
-      dupIClass = 14;
-      break;
-    case Hexagon::DuplexIClassF:
-      dupIClass = 15;
-      break;
-    default:
-      llvm_unreachable("Unimplemented DuplexIClass");
-      break;
-    }
+    unsigned dupIClass = MI.getOpcode() - Hexagon::DuplexIClass0;
     // 29 is the bit position.
     // 0b1110 =0xE bits are masked off and down shifted by 1 bit.
     // Last bit is moved to bit position 13
     Binary = ((dupIClass & 0xE) << (29 - 1)) | ((dupIClass & 0x1) << 13);
 
-    const MCInst *subInst0 = HMB.getOperand(0).getInst();
-    const MCInst *subInst1 = HMB.getOperand(1).getInst();
+    const MCInst *subInst0 = MI.getOperand(0).getInst();
+    const MCInst *subInst1 = MI.getOperand(1).getInst();
 
     // get subinstruction slot 0
     unsigned subInstSlot0Bits = getBinaryCodeForInstr(*subInst0, Fixups, STI);
@@ -293,14 +173,13 @@ void raise_relocation_error(unsigned bits, unsigned kind) {
 /// getFixupNoBits - Some insns are not extended and thus have no
 /// bits.  These cases require a more brute force method for determining
 /// the correct relocation.
-namespace {
-Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
-                                      const MCOperand &MO,
-                                      const MCSymbolRefExpr::VariantKind kind) {
+Hexagon::Fixups HexagonMCCodeEmitter::getFixupNoBits(
+    MCInstrInfo const &MCII, const MCInst &MI, const MCOperand &MO,
+    const MCSymbolRefExpr::VariantKind kind) const {
   const MCInstrDesc &MCID = HexagonMCInstrInfo::getDesc(MCII, MI);
   unsigned insnType = llvm::HexagonMCInstrInfo::getType(MCII, MI);
 
-  if (insnType == HexagonII::TypePREFIX) {
+  if (insnType == HexagonII::TypeEXTENDER) {
     switch (kind) {
     case MCSymbolRefExpr::VK_GOTREL:
       return Hexagon::fixup_Hexagon_GOTREL_32_6_X;
@@ -319,11 +198,21 @@ Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
     case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
       return Hexagon::fixup_Hexagon_IE_GOT_32_6_X;
     case MCSymbolRefExpr::VK_Hexagon_PCREL:
-    case MCSymbolRefExpr::VK_None:
-      if (MCID.isBranch())
-        return Hexagon::fixup_Hexagon_B32_PCREL_X;
-      else
-        return Hexagon::fixup_Hexagon_32_6_X;
+      return Hexagon::fixup_Hexagon_B32_PCREL_X;
+    case MCSymbolRefExpr::VK_None: {
+      auto Insts = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
+      for (auto I = Insts.begin(), N = Insts.end(); I != N; ++I) {
+        if (I->getInst() == &MI) {
+          const MCInst &NextI = *(I+1)->getInst();
+          const MCInstrDesc &D = HexagonMCInstrInfo::getDesc(MCII, NextI);
+          if (D.isBranch() || D.isCall() ||
+              HexagonMCInstrInfo::getType(MCII, NextI) == HexagonII::TypeCR)
+            return Hexagon::fixup_Hexagon_B32_PCREL_X;
+          return Hexagon::fixup_Hexagon_32_6_X;
+        }
+      }
+      raise_relocation_error(0, kind);
+    }
     default:
       raise_relocation_error(0, kind);
     }
@@ -406,7 +295,6 @@ Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
   }
   llvm_unreachable("Relocation exit not taken");
 }
-}
 
 namespace llvm {
 extern const MCInstrDesc HexagonInsts[];
@@ -450,7 +338,8 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI,
   int64_t Value;
   if (ME->evaluateAsAbsolute(Value))
     return Value;
-  assert(ME->getKind() == MCExpr::SymbolRef || ME->getKind() == MCExpr::Binary);
+  assert(ME->getKind() == MCExpr::SymbolRef ||
+         ME->getKind() == MCExpr::Binary);
   if (ME->getKind() == MCExpr::Binary) {
     MCBinaryExpr const *Binary = cast<MCBinaryExpr>(ME);
     getExprOpValue(MI, MO, Binary->getLHS(), Fixups, STI);
@@ -581,7 +470,30 @@ unsigned HexagonMCCodeEmitter::getExprOpValue(const MCInst &MI,
         if (HexagonMCInstrInfo::s23_2_reloc(*MO.getExpr()))
           FixupKind = Hexagon::fixup_Hexagon_23_REG;
         else
-          raise_relocation_error(bits, kind);
+          if (MCID.mayStore() || MCID.mayLoad()) { // memory access insns only
+            for (const MCPhysReg *ImpUses = MCID.getImplicitUses();
+                 ImpUses && *ImpUses; ++ImpUses) { // null if no implicit uses
+              if (*ImpUses != Hexagon::GP) // only a GP use selects GPREL16
+                continue;
+              switch (HexagonMCInstrInfo::getAccessSize(MCII, MI)) {
+              case HexagonII::MemAccessSize::ByteAccess:
+                FixupKind = fixup_Hexagon_GPREL16_0;
+                break;
+              case HexagonII::MemAccessSize::HalfWordAccess:
+                FixupKind = fixup_Hexagon_GPREL16_1;
+                break;
+              case HexagonII::MemAccessSize::WordAccess:
+                FixupKind = fixup_Hexagon_GPREL16_2;
+                break;
+              case HexagonII::MemAccessSize::DoubleWordAccess:
+                FixupKind = fixup_Hexagon_GPREL16_3;
+                break;
+              default: // no GPREL16 fixup for any other access size
+                raise_relocation_error(bits, kind);
+              }
+            }
+          } else
+            raise_relocation_error(bits, kind);
         break;
       }
       case MCSymbolRefExpr::VK_DTPREL:
@@ -795,10 +707,71 @@ unsigned
 HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO,
                                         SmallVectorImpl<MCFixup> &Fixups,
                                         MCSubtargetInfo const &STI) const {
+#ifndef NDEBUG
+  size_t OperandNumber = ~0U;
+  for (unsigned i = 0, n = MI.getNumOperands(); i < n; ++i)
+    if (&MI.getOperand(i) == &MO) {
+      OperandNumber = i;
+      break;
+    }
+  assert((OperandNumber != ~0U) && "Operand not found");
+#endif
+
+  if (HexagonMCInstrInfo::isNewValue(MCII, MI) &&
+      &MO == &MI.getOperand(HexagonMCInstrInfo::getNewValueOp(MCII, MI))) {
+    // Calculate the new value distance to the associated producer
+    MCOperand const &MCO =
+      MI.getOperand(HexagonMCInstrInfo::getNewValueOp(MCII, MI));
+    unsigned SOffset = 0;
+    unsigned VOffset = 0;
+    unsigned Register = MCO.getReg();
+    unsigned Register1;
+    unsigned Register2;
+    auto Instructions = HexagonMCInstrInfo::bundleInstructions(**CurrentBundle);
+    auto i = Instructions.begin() + *CurrentIndex - 1;
+    for (;; --i) {
+      assert(i != Instructions.begin() - 1 && "Couldn't find producer");
+      MCInst const &Inst = *i->getInst();
+      if (HexagonMCInstrInfo::isImmext(Inst))
+        continue;
+      ++SOffset;
+      if (HexagonMCInstrInfo::isVector(MCII, Inst))
+        // Vector instructions don't count scalars
+        ++VOffset;
+      Register1 =
+        HexagonMCInstrInfo::hasNewValue(MCII, Inst)
+        ? HexagonMCInstrInfo::getNewValueOperand(MCII, Inst).getReg()
+        : static_cast<unsigned>(Hexagon::NoRegister);
+      Register2 =
+        HexagonMCInstrInfo::hasNewValue2(MCII, Inst)
+        ? HexagonMCInstrInfo::getNewValueOperand2(MCII, Inst).getReg()
+        : static_cast<unsigned>(Hexagon::NoRegister);
+      if (!RegisterMatches(Register, Register1, Register2))
+        // This isn't the register we're looking for
+        continue;
+      if (!HexagonMCInstrInfo::isPredicated(MCII, Inst))
+        // Producer is unpredicated
+        break;
+      assert(HexagonMCInstrInfo::isPredicated(MCII, MI) &&
+        "Unpredicated consumer depending on predicated producer");
+      if (HexagonMCInstrInfo::isPredicatedTrue(MCII, Inst) ==
+        HexagonMCInstrInfo::isPredicatedTrue(MCII, MI))
+        // Producer predicate sense matched ours
+        break;
+    }
+    // Hexagon PRM 10.11 Construct Nt from distance
+    unsigned Offset =
+      HexagonMCInstrInfo::isVector(MCII, MI) ? VOffset : SOffset;
+    Offset <<= 1;
+    Offset |=
+      HexagonMCInstrInfo::SubregisterBit(Register, Register1, Register2);
+    return Offset;
+  }
   assert(!MO.isImm());
   if (MO.isReg()) {
     unsigned Reg = MO.getReg();
-    if (HexagonMCInstrInfo::isSubInstruction(MI))
+    if (HexagonMCInstrInfo::isSubInstruction(MI) ||
+        llvm::HexagonMCInstrInfo::getType(MCII, MI) == HexagonII::TypeCJ)
       return HexagonMCInstrInfo::getDuplexRegisterNumbering(Reg);
     switch(MI.getOpcode()){
     case Hexagon::A2_tfrrcr:
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
index 8e0667d9ac8e96eb00a49ce26b5c972c31430f36..c3a4beec313fb2f22318a0f87b02f7b4d8a99a08 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.h
@@ -15,6 +15,7 @@
 #ifndef HEXAGONMCCODEEMITTER_H
 #define HEXAGONMCCODEEMITTER_H
 
+#include "MCTargetDesc/HexagonFixupKinds.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
@@ -31,18 +32,22 @@ class HexagonMCCodeEmitter : public MCCodeEmitter {
   std::unique_ptr<unsigned> Addend;
   std::unique_ptr<bool> Extended;
   std::unique_ptr<MCInst const *> CurrentBundle;
+  std::unique_ptr<size_t> CurrentIndex;
 
   // helper routine for getMachineOpValue()
   unsigned getExprOpValue(const MCInst &MI, const MCOperand &MO,
                           const MCExpr *ME, SmallVectorImpl<MCFixup> &Fixups,
                           const MCSubtargetInfo &STI) const;
 
+  Hexagon::Fixups getFixupNoBits(MCInstrInfo const &MCII, const MCInst &MI,
+                                 const MCOperand &MO,
+                                 const MCSymbolRefExpr::VariantKind kind) const;
+
 public:
   HexagonMCCodeEmitter(MCInstrInfo const &aMII, MCContext &aMCT);
 
   // Return parse bits for instruction `MCI' inside bundle `MCB'
-  uint32_t parseBits(size_t Instruction, size_t Last, MCInst const &MCB,
-                    MCInst const &MCI) const;
+  uint32_t parseBits(size_t Last, MCInst const &MCB, MCInst const &MCI) const;
 
   void encodeInstruction(MCInst const &MI, raw_ostream &OS,
                          SmallVectorImpl<MCFixup> &Fixups,
@@ -51,7 +56,7 @@ public:
   void EncodeSingleInstruction(const MCInst &MI, raw_ostream &OS,
                                SmallVectorImpl<MCFixup> &Fixups,
                                const MCSubtargetInfo &STI,
-                               uint32_t Parse, size_t Index) const;
+                               uint32_t Parse) const;
 
   // \brief TableGen'erated function for getting the
   // binary encoding for an instruction.
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
index 9a09a17767a640775d53526d79cf8d06f4a18504..ffa980ca6563cd6c9d975c44ecee24dbc7e8b06d 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCCompound.cpp
@@ -14,6 +14,7 @@
 #include "Hexagon.h"
 #include "MCTargetDesc/HexagonBaseInfo.h"
 #include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCShuffler.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/Support/Debug.h"
@@ -396,7 +397,7 @@ static bool lookForCompound(MCInstrInfo const &MCII, MCContext &Context,
 /// is found update the contents fo the bundle with the compound insn.
 /// If a compound instruction is found then the bundle will have one
 /// additional slot.
-void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII,
+void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
                                      MCContext &Context, MCInst &MCI) {
   assert(HexagonMCInstrInfo::isBundle(MCI) &&
          "Non-Bundle where Bundle expected");
@@ -405,8 +406,23 @@ void HexagonMCInstrInfo::tryCompound(MCInstrInfo const &MCII,
   if (MCI.size() < 2)
     return;
 
+  bool StartedValid = llvm::HexagonMCShuffle(false, MCII, STI, MCI);
+
+  // Copy the bundle; needed to keep the order of jump instructions.
+  MCInst CheckList(MCI);
+
   // Look for compounds until none are found, only update the bundle when
   // a compound is found.
-  while (lookForCompound(MCII, Context, MCI))
-    ;
+  while (lookForCompound(MCII, Context, CheckList)) {
+    // Keep the original bundle around in case the shuffle fails.
+    MCInst OriginalBundle(MCI);
+
+    // Need to update the bundle.
+    MCI = CheckList;
+
+    if (StartedValid && !llvm::HexagonMCShuffle(false, MCII, STI, MCI)) {
+      DEBUG(dbgs() << "Found ERROR\n");
+      MCI = OriginalBundle;
+    }
+  }
 }
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
index 413f052aa4bdc2d76daab16bdedc1fcf71f14c3a..e8f154a1fa5335676721a326d5fb6a8d8f75cd12 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -15,6 +15,7 @@
 #include "MCTargetDesc/HexagonMCInstrInfo.h"
 
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -262,6 +263,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
   case Hexagon::EH_RETURN_JMPR:
 
   case Hexagon::J2_jumpr:
+  case Hexagon::PS_jmpret:
     // jumpr r31
     // Actual form JMPR %PC<imp-def>, %R31<imp-use>, %R0<imp-use,internal>.
     DstReg = MCI.getOperand(0).getReg();
@@ -275,6 +277,12 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
   case Hexagon::J2_jumprfnew:
   case Hexagon::J2_jumprtnewpt:
   case Hexagon::J2_jumprfnewpt:
+  case Hexagon::PS_jmprett:
+  case Hexagon::PS_jmpretf:
+  case Hexagon::PS_jmprettnew:
+  case Hexagon::PS_jmpretfnew:
+  case Hexagon::PS_jmprettnewpt:
+  case Hexagon::PS_jmpretfnewpt:
     DstReg = MCI.getOperand(1).getReg();
     SrcReg = MCI.getOperand(0).getReg();
     // [if ([!]p0[.new])] jumpr r31
@@ -284,15 +292,10 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
     }
     break;
   case Hexagon::L4_return_t:
-
   case Hexagon::L4_return_f:
-
   case Hexagon::L4_return_tnew_pnt:
-
   case Hexagon::L4_return_fnew_pnt:
-
   case Hexagon::L4_return_tnew_pt:
-
   case Hexagon::L4_return_fnew_pt:
     // [if ([!]p0[.new])] dealloc_return
     SrcReg = MCI.getOperand(0).getReg();
@@ -565,7 +568,8 @@ bool HexagonMCInstrInfo::subInstWouldBeExtended(MCInst const &potentialDuplex) {
 bool HexagonMCInstrInfo::isOrderedDuplexPair(MCInstrInfo const &MCII,
                                              MCInst const &MIa, bool ExtendedA,
                                              MCInst const &MIb, bool ExtendedB,
-                                             bool bisReversable) {
+                                             bool bisReversable,
+                                             MCSubtargetInfo const &STI) {
   // Slot 1 cannot be extended in duplexes PRM 10.5
   if (ExtendedA)
     return false;
@@ -625,11 +629,16 @@ bool HexagonMCInstrInfo::isOrderedDuplexPair(MCInstrInfo const &MCII,
       return false;
   }
 
-  // If a store appears, it must be in slot 0 (MIa) 1st, and then slot 1 (MIb);
-  //   therefore, not duplexable if slot 1 is a store, and slot 0 is not.
-  if ((MIbG == HexagonII::HSIG_S1) || (MIbG == HexagonII::HSIG_S2)) {
-    if ((MIaG != HexagonII::HSIG_S1) && (MIaG != HexagonII::HSIG_S2))
-      return false;
+  if (STI.getCPU().equals_lower("hexagonv4") ||
+      STI.getCPU().equals_lower("hexagonv5") ||
+      STI.getCPU().equals_lower("hexagonv55") ||
+      STI.getCPU().equals_lower("hexagonv60")) {
+    // If a store appears, it must be in slot 0 (MIa) 1st, and then slot 1 (MIb);
+    //   therefore, not duplexable if slot 1 is a store, and slot 0 is not.
+    if ((MIbG == HexagonII::HSIG_S1) || (MIbG == HexagonII::HSIG_S2)) {
+      if ((MIaG != HexagonII::HSIG_S1) && (MIaG != HexagonII::HSIG_S2))
+        return false;
+    }
   }
 
   return (isDuplexPairMatch(MIaG, MIbG));
@@ -703,6 +712,7 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
       Result.setOpcode(Hexagon::SA1_dec);
       addOps(Result, Inst, 0);
       addOps(Result, Inst, 1);
+      addOps(Result, Inst, 2);
       break;
     } //  1,2 SUBInst $Rd = add($Rs,#-1)
     else if (Inst.getOperand(1).getReg() == Hexagon::R29) {
@@ -806,20 +816,27 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
     break; //    none  SUBInst deallocframe
   case Hexagon::EH_RETURN_JMPR:
   case Hexagon::J2_jumpr:
+  case Hexagon::PS_jmpret:
     Result.setOpcode(Hexagon::SL2_jumpr31);
     break; //    none  SUBInst jumpr r31
   case Hexagon::J2_jumprf:
+  case Hexagon::PS_jmpretf:
     Result.setOpcode(Hexagon::SL2_jumpr31_f);
     break; //    none  SUBInst if (!p0) jumpr r31
   case Hexagon::J2_jumprfnew:
   case Hexagon::J2_jumprfnewpt:
+  case Hexagon::PS_jmpretfnewpt:
+  case Hexagon::PS_jmpretfnew:
     Result.setOpcode(Hexagon::SL2_jumpr31_fnew);
     break; //    none  SUBInst if (!p0.new) jumpr:nt r31
   case Hexagon::J2_jumprt:
+  case Hexagon::PS_jmprett:
     Result.setOpcode(Hexagon::SL2_jumpr31_t);
     break; //    none  SUBInst if (p0) jumpr r31
   case Hexagon::J2_jumprtnew:
   case Hexagon::J2_jumprtnewpt:
+  case Hexagon::PS_jmprettnewpt:
+  case Hexagon::PS_jmprettnew:
     Result.setOpcode(Hexagon::SL2_jumpr31_tnew);
     break; //    none  SUBInst if (p0.new) jumpr:nt r31
   case Hexagon::L2_loadrb_io:
@@ -966,6 +983,7 @@ MCInst HexagonMCInstrInfo::deriveSubInst(MCInst const &Inst) {
     if (Absolute && Value == -1) {
       Result.setOpcode(Hexagon::SA1_setin1);
       addOps(Result, Inst, 0);
+      addOps(Result, Inst, 1);
       break; //  2 1 SUBInst $Rd = #-1
     } else {
       Result.setOpcode(Hexagon::SA1_seti);
@@ -1005,6 +1023,7 @@ static bool isStoreInst(unsigned opCode) {
 
 SmallVector<DuplexCandidate, 8>
 HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII,
+                                          MCSubtargetInfo const &STI,
                                           MCInst const &MCB) {
   assert(isBundle(MCB));
   SmallVector<DuplexCandidate, 8> duplexToTry;
@@ -1033,7 +1052,7 @@ HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII,
               HexagonMCInstrInfo::hasExtenderForIndex(MCB, k - 1),
               *MCB.getOperand(j).getInst(),
               HexagonMCInstrInfo::hasExtenderForIndex(MCB, j - 1),
-              bisReversable)) {
+              bisReversable, STI)) {
         // Get iClass.
         unsigned iClass = iClassOfDuplexPair(
             getDuplexCandidateGroup(*MCB.getOperand(k).getInst()),
@@ -1058,7 +1077,7 @@ HexagonMCInstrInfo::getDuplexPossibilties(MCInstrInfo const &MCII,
                 HexagonMCInstrInfo::hasExtenderForIndex(MCB, j - 1),
                 *MCB.getOperand(k).getInst(),
                 HexagonMCInstrInfo::hasExtenderForIndex(MCB, k - 1),
-                bisReversable)) {
+                bisReversable, STI)) {
           // Get iClass.
           unsigned iClass = iClassOfDuplexPair(
               getDuplexCandidateGroup(*MCB.getOperand(j).getInst()),
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
index 226470cfbced995e3e9315bc15f49a3ade8dcdc2..09819ccedd8fb4007f57fe33448d8a2913a97cdb 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -37,30 +37,19 @@
 
 using namespace llvm;
 
-static cl::opt<unsigned>
-    GPSize("gpsize", cl::NotHidden,
-           cl::desc("Global Pointer Addressing Size.  The default size is 8."),
-           cl::Prefix, cl::init(8));
+static cl::opt<unsigned> GPSize
+  ("gpsize", cl::NotHidden,
+   cl::desc("Global Pointer Addressing Size.  The default size is 8."),
+   cl::Prefix,
+   cl::init(8));
 
-void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK,
+void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCB,
                                            const MCSubtargetInfo &STI) {
-  MCInst HMI = HexagonMCInstrInfo::createBundle();
-  MCInst *MCB;
-
-  if (MCK.getOpcode() != Hexagon::BUNDLE) {
-    HMI.addOperand(MCOperand::createInst(&MCK));
-    MCB = &HMI;
-  } else
-    MCB = const_cast<MCInst *>(&MCK);
-
-  // Examines packet and pad the packet, if needed, when an
-  // end-loop is in the bundle.
-  HexagonMCInstrInfo::padEndloop(getContext(), *MCB);
-  HexagonMCShuffle(*MCII, STI, *MCB);
-
-  assert(HexagonMCInstrInfo::bundleSize(*MCB) <= HEXAGON_PACKET_SIZE);
+  assert(MCB.getOpcode() == Hexagon::BUNDLE);
+  assert(HexagonMCInstrInfo::bundleSize(MCB) <= HEXAGON_PACKET_SIZE);
+  assert(HexagonMCInstrInfo::bundleSize(MCB) > 0);
   bool Extended = false;
-  for (auto &I : HexagonMCInstrInfo::bundleInstructions(*MCB)) {
+  for (auto &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
     MCInst *MCI = const_cast<MCInst *>(I.getInst());
     if (Extended) {
       if (HexagonMCInstrInfo::isDuplex(*MCII, *MCI)) {
@@ -77,11 +66,12 @@ void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCK,
 
   // At this point, MCB is a bundle
   // Iterate through the bundle and assign addends for the instructions
-  for (auto const &I : HexagonMCInstrInfo::bundleInstructions(*MCB)) {
+  for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
     MCInst *MCI = const_cast<MCInst *>(I.getInst());
     EmitSymbol(*MCI);
   }
-  MCObjectStreamer::EmitInstruction(*MCB, STI);
+
+  MCObjectStreamer::EmitInstruction(MCB, STI);
 }
 
 void HexagonMCELFStreamer::EmitSymbol(const MCInst &Inst) {
@@ -119,9 +109,11 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol,
     MCSectionSubPair P = getCurrentSection();
     SwitchSection(&Section);
 
-    EmitValueToAlignment(ByteAlignment, 0, 1, 0);
-    EmitLabel(Symbol);
-    EmitZeros(Size);
+    if (ELFSymbol->isUndefined(false)) {
+      EmitValueToAlignment(ByteAlignment, 0, 1, 0);
+      EmitLabel(Symbol);
+      EmitZeros(Size);
+    }
 
     // Update the maximum alignment of the section if necessary.
     if (ByteAlignment > Section.getAlignment())
@@ -144,9 +136,10 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol,
   ELFSymbol->setSize(MCConstantExpr::create(Size, getContext()));
 }
 
-void HexagonMCELFStreamer::HexagonMCEmitLocalCommonSymbol(
-    MCSymbol *Symbol, uint64_t Size, unsigned ByteAlignment,
-    unsigned AccessSize) {
+void HexagonMCELFStreamer::HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol,
+                                                         uint64_t Size,
+                                                         unsigned ByteAlignment,
+                                                         unsigned AccessSize) {
   getAssembler().registerSymbol(*Symbol);
   auto ELFSymbol = cast<MCSymbolELF>(Symbol);
   ELFSymbol->setBinding(ELF::STB_LOCAL);
@@ -154,11 +147,12 @@ void HexagonMCELFStreamer::HexagonMCEmitLocalCommonSymbol(
   HexagonMCEmitCommonSymbol(Symbol, Size, ByteAlignment, AccessSize);
 }
 
-namespace llvm {
 
-MCStreamer *createHexagonELFStreamer(MCContext &Context, MCAsmBackend &MAB,
-                                     raw_pwrite_stream &OS, MCCodeEmitter *CE) {
-  return new HexagonMCELFStreamer(Context, MAB, OS, CE);
-}
+namespace llvm {
+  MCStreamer *createHexagonELFStreamer(Triple const &TT, MCContext &Context,
+                                       MCAsmBackend &MAB,
+                                       raw_pwrite_stream &OS, MCCodeEmitter *CE) {
+    return new HexagonMCELFStreamer(Context, MAB, OS, CE); // TT is not used.
+  }
 
 } // end namespace llvm
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
index 0ac1a68d4ef9c92e9b3d936943f90fd3d8258a55..5cb84a48a3136dbb2c7d7b8a72314d8c8cb63311 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
@@ -27,6 +27,13 @@ public:
       : MCELFStreamer(Context, TAB, OS, Emitter),
         MCII(createHexagonMCInstrInfo()) {}
 
+  HexagonMCELFStreamer(MCContext &Context,
+                       MCAsmBackend &TAB,
+                       raw_pwrite_stream &OS, MCCodeEmitter *Emitter,
+                       MCAssembler *Assembler) :
+  MCELFStreamer(Context, TAB, OS, Emitter),
+  MCII (createHexagonMCInstrInfo()) {} // NOTE: Assembler is not used here.
+
   void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
   void EmitSymbol(const MCInst &Inst);
   void HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
@@ -36,8 +43,9 @@ public:
                                  unsigned ByteAlignment, unsigned AccessSize);
 };
 
-MCStreamer *createHexagonELFStreamer(MCContext &Context, MCAsmBackend &MAB,
-                                     raw_pwrite_stream &OS, MCCodeEmitter *CE);
+MCStreamer *createHexagonELFStreamer(Triple const &TT, MCContext &Context,
+                                     MCAsmBackend &MAB, raw_pwrite_stream &OS,
+                                     MCCodeEmitter *CE);
 
 } // end namespace llvm
 
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
index e93906a0a3969cca62036ee04c52e7b3aa19d604..14300edc7e1b295bed426649c96d9b49ba2f3c68 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.cpp
@@ -11,7 +11,9 @@
 #include "HexagonMCExpr.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
 #include "llvm/MC/MCValue.h"
+#include "llvm/Object/ELF.h"
 #include "llvm/Support/raw_ostream.h"
 
 using namespace llvm;
@@ -36,7 +38,47 @@ MCFragment *llvm::HexagonMCExpr::findAssociatedFragment() const {
   return Expr->findAssociatedFragment();
 }
 
-void HexagonMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {}
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+  switch (Expr->getKind()) {
+  case MCExpr::Target:
+    llvm_unreachable("Cannot handle nested target MCExpr");
+    break;
+  case MCExpr::Constant:
+    break; // Nothing to update for a constant.
+  // Binary and unary expressions are walked recursively; Asm is only forwarded.
+  case MCExpr::Binary: {
+    const MCBinaryExpr *be = cast<MCBinaryExpr>(Expr);
+    fixELFSymbolsInTLSFixupsImpl(be->getLHS(), Asm);
+    fixELFSymbolsInTLSFixupsImpl(be->getRHS(), Asm);
+    break;
+  }
+  case MCExpr::SymbolRef: {
+    const MCSymbolRefExpr &symRef = *cast<MCSymbolRefExpr>(Expr);
+    switch (symRef.getKind()) {
+    default:
+      return; // Any other variant kind leaves the symbol type untouched.
+    case MCSymbolRefExpr::VK_Hexagon_GD_GOT:
+    case MCSymbolRefExpr::VK_Hexagon_LD_GOT:
+    case MCSymbolRefExpr::VK_Hexagon_GD_PLT:
+    case MCSymbolRefExpr::VK_Hexagon_LD_PLT:
+    case MCSymbolRefExpr::VK_Hexagon_IE:
+    case MCSymbolRefExpr::VK_Hexagon_IE_GOT:
+    case MCSymbolRefExpr::VK_TPREL:
+      break; // The kinds listed above continue to the setType() call below.
+    }
+    cast<MCSymbolELF>(symRef.getSymbol()).setType(ELF::STT_TLS);
+    break;
+  }
+  case MCExpr::Unary:
+    fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
+    break;
+  }
+}
+
+void HexagonMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+  auto expr = getExpr(); // The expression wrapped by this HexagonMCExpr.
+  fixELFSymbolsInTLSFixupsImpl(expr, Asm);
+}
 
 MCExpr const *HexagonMCExpr::getExpr() const { return Expr; }
 
@@ -75,4 +117,4 @@ void HexagonMCExpr::setSignMismatch(bool Val) {
 
 bool HexagonMCExpr::signMismatch() const {
   return SignMismatch;
-}
\ No newline at end of file
+}
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
index e627f026c8ad626ae6ae4cd861dbe059b2aac172..553ffba508a1ffffff5c23e6a28078cf368eed53 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
@@ -16,10 +16,9 @@
 #include "Hexagon.h"
 #include "HexagonBaseInfo.h"
 #include "HexagonMCChecker.h"
-
 #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCInstrItineraries.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 
 namespace llvm {
@@ -59,31 +58,36 @@ bool HexagonMCInstrInfo::canonicalizePacket(MCInstrInfo const &MCII,
                                             MCSubtargetInfo const &STI,
                                             MCContext &Context, MCInst &MCB,
                                             HexagonMCChecker *Check) {
-  // Examine the packet and convert pairs of instructions to compound
-  // instructions when possible.
-  if (!HexagonDisableCompound)
-    HexagonMCInstrInfo::tryCompound(MCII, Context, MCB);
   // Check the bundle for errors.
-  bool CheckOk = Check ? Check->check() : true;
+  bool CheckOk = Check ? Check->check(false) : true;
   if (!CheckOk)
     return false;
-  HexagonMCShuffle(MCII, STI, MCB);
+  // Examine the packet and convert pairs of instructions to compound
+  // instructions when possible.
+  if (!HexagonDisableCompound)
+    HexagonMCInstrInfo::tryCompound(MCII, STI, Context, MCB);
+  HexagonMCShuffle(false, MCII, STI, MCB);
   // Examine the packet and convert pairs of instructions to duplex
   // instructions when possible.
   MCInst InstBundlePreDuplex = MCInst(MCB);
   if (!HexagonDisableDuplex) {
     SmallVector<DuplexCandidate, 8> possibleDuplexes;
-    possibleDuplexes = HexagonMCInstrInfo::getDuplexPossibilties(MCII, MCB);
+    possibleDuplexes =
+        HexagonMCInstrInfo::getDuplexPossibilties(MCII, STI, MCB);
     HexagonMCShuffle(MCII, STI, Context, MCB, possibleDuplexes);
   }
   // Examines packet and pad the packet, if needed, when an
   // end-loop is in the bundle.
-  HexagonMCInstrInfo::padEndloop(Context, MCB);
+  HexagonMCInstrInfo::padEndloop(MCB, Context);
   // If compounding and duplexing didn't reduce the size below
   // 4 or less we have a packet that is too big.
   if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE)
     return false;
-  HexagonMCShuffle(MCII, STI, MCB);
+  // Check the bundle for errors.
+  CheckOk = Check ? Check->check(true) : true;
+  if (!CheckOk)
+    return false;
+  HexagonMCShuffle(true, MCII, STI, MCB);
   return true;
 }
 
@@ -111,32 +115,14 @@ MCInst HexagonMCInstrInfo::createBundle() {
   return Result;
 }
 
-MCInst *HexagonMCInstrInfo::deriveDuplex(MCContext &Context, unsigned iClass,
-                                         MCInst const &inst0,
-                                         MCInst const &inst1) {
-  assert((iClass <= 0xf) && "iClass must have range of 0 to 0xf");
-  MCInst *duplexInst = new (Context) MCInst;
-  duplexInst->setOpcode(Hexagon::DuplexIClass0 + iClass);
-
-  MCInst *SubInst0 = new (Context) MCInst(deriveSubInst(inst0));
-  MCInst *SubInst1 = new (Context) MCInst(deriveSubInst(inst1));
-  duplexInst->addOperand(MCOperand::createInst(SubInst0));
-  duplexInst->addOperand(MCOperand::createInst(SubInst1));
-  return duplexInst;
-}
-
 MCInst HexagonMCInstrInfo::deriveExtender(MCInstrInfo const &MCII,
                                           MCInst const &Inst,
                                           MCOperand const &MO) {
   assert(HexagonMCInstrInfo::isExtendable(MCII, Inst) ||
          HexagonMCInstrInfo::isExtended(MCII, Inst));
 
-  MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, Inst);
   MCInst XMI;
-  XMI.setOpcode((Desc.isBranch() || Desc.isCall() ||
-                 HexagonMCInstrInfo::getType(MCII, Inst) == HexagonII::TypeCR)
-                    ? Hexagon::A4_ext_b
-                    : Hexagon::A4_ext);
+  XMI.setOpcode(Hexagon::A4_ext);
   if (MO.isImm())
     XMI.addOperand(MCOperand::createImm(MO.getImm() & (~0x3f)));
   else if (MO.isExpr())
@@ -146,6 +132,20 @@ MCInst HexagonMCInstrInfo::deriveExtender(MCInstrInfo const &MCII,
   return XMI;
 }
 
+MCInst *HexagonMCInstrInfo::deriveDuplex(MCContext &Context, unsigned iClass,
+                                         MCInst const &inst0,
+                                         MCInst const &inst1) {
+  assert((iClass <= 0xf) && "iClass must have range of 0 to 0xf");
+  MCInst *duplexInst = new (Context) MCInst;
+  duplexInst->setOpcode(Hexagon::DuplexIClass0 + iClass);
+  // The sub-instructions derived from inst0 and inst1 become operands 0 and 1.
+  MCInst *SubInst0 = new (Context) MCInst(deriveSubInst(inst0));
+  MCInst *SubInst1 = new (Context) MCInst(deriveSubInst(inst1));
+  duplexInst->addOperand(MCOperand::createInst(SubInst0));
+  duplexInst->addOperand(MCOperand::createInst(SubInst1));
+  return duplexInst;
+}
+
 MCInst const *HexagonMCInstrInfo::extenderForIndex(MCInst const &MCB,
                                                    size_t Index) {
   assert(Index <= bundleSize(MCB));
@@ -173,22 +173,9 @@ HexagonMCInstrInfo::getAccessSize(MCInstrInfo const &MCII, MCInst const &MCI) {
                                    HexagonII::MemAccesSizeMask));
 }
 
-unsigned HexagonMCInstrInfo::getBitCount(MCInstrInfo const &MCII,
-                                         MCInst const &MCI) {
-  uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
-  return ((F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask);
-}
-
-// Return constant extended operand number.
-unsigned short HexagonMCInstrInfo::getCExtOpNum(MCInstrInfo const &MCII,
-                                                MCInst const &MCI) {
-  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
-  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask);
-}
-
 MCInstrDesc const &HexagonMCInstrInfo::getDesc(MCInstrInfo const &MCII,
                                                MCInst const &MCI) {
-  return (MCII.get(MCI.getOpcode()));
+  return MCII.get(MCI.getOpcode());
 }
 
 unsigned HexagonMCInstrInfo::getDuplexRegisterNumbering(unsigned Reg) {
@@ -276,34 +263,32 @@ unsigned HexagonMCInstrInfo::getExtentBits(MCInstrInfo const &MCII,
   return ((F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask);
 }
 
-// Return the max value that a constant extendable operand can have
-// without being extended.
+/// Return the maximum value of an extendable operand.
 int HexagonMCInstrInfo::getMaxValue(MCInstrInfo const &MCII,
                                     MCInst const &MCI) {
-  uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
-  unsigned isSigned =
-      (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
-  unsigned bits = (F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask;
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  bool S = (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
 
-  if (isSigned) // if value is signed
-    return ~(-1U << (bits - 1));
-  else
-    return ~(-1U << bits);
+  assert(HexagonMCInstrInfo::isExtendable(MCII, MCI) ||
+         HexagonMCInstrInfo::isExtended(MCII, MCI));
+
+  if (S) // if value is signed
+    return (1 << (HexagonMCInstrInfo::getExtentBits(MCII, MCI) - 1)) - 1;
+  return (1 << HexagonMCInstrInfo::getExtentBits(MCII, MCI)) - 1;
 }
 
-// Return the min value that a constant extendable operand can have
-// without being extended.
+/// Return the minimum value of an extendable operand.
 int HexagonMCInstrInfo::getMinValue(MCInstrInfo const &MCII,
                                     MCInst const &MCI) {
-  uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
-  unsigned isSigned =
-      (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
-  unsigned bits = (F >> HexagonII::ExtentBitsPos) & HexagonII::ExtentBitsMask;
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  bool S = (F >> HexagonII::ExtentSignedPos) & HexagonII::ExtentSignedMask;
 
-  if (isSigned) // if value is signed
-    return -1U << (bits - 1);
-  else
-    return 0;
+  assert(HexagonMCInstrInfo::isExtendable(MCII, MCI) ||
+         HexagonMCInstrInfo::isExtended(MCII, MCI));
+
+  if (S) // if value is signed
+    return -(1 << (HexagonMCInstrInfo::getExtentBits(MCII, MCI) - 1));
+  return 0;
 }
 
 StringRef HexagonMCInstrInfo::getName(MCInstrInfo const &MCII,
@@ -319,9 +304,7 @@ unsigned short HexagonMCInstrInfo::getNewValueOp(MCInstrInfo const &MCII,
 
 MCOperand const &HexagonMCInstrInfo::getNewValueOperand(MCInstrInfo const &MCII,
                                                         MCInst const &MCI) {
-  uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
-  unsigned const O =
-      (F >> HexagonII::NewValueOpPos) & HexagonII::NewValueOpMask;
+  unsigned O = HexagonMCInstrInfo::getNewValueOp(MCII, MCI);
   MCOperand const &MCO = MCI.getOperand(O);
 
   assert((HexagonMCInstrInfo::isNewValue(MCII, MCI) ||
@@ -349,6 +332,13 @@ HexagonMCInstrInfo::getNewValueOperand2(MCInstrInfo const &MCII,
   return (MCO);
 }
 
+/// Return the Hexagon ISA class for the insn.
+unsigned HexagonMCInstrInfo::getType(MCInstrInfo const &MCII,
+                                     MCInst const &MCI) {
+  const uint64_t F = MCII.get(MCI.getOpcode()).TSFlags;
+  return ((F >> HexagonII::TypePos) & HexagonII::TypeMask);
+}
+
 int HexagonMCInstrInfo::getSubTarget(MCInstrInfo const &MCII,
                                      MCInst const &MCI) {
   const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
@@ -361,33 +351,55 @@ int HexagonMCInstrInfo::getSubTarget(MCInstrInfo const &MCII,
     return Hexagon::ArchV4;
   case HexagonII::HasV5SubT:
     return Hexagon::ArchV5;
+  case HexagonII::HasV55SubT:
+    return Hexagon::ArchV55;
+  case HexagonII::HasV60SubT:
+    return Hexagon::ArchV60;
   }
 }
 
-// Return the Hexagon ISA class for the insn.
-unsigned HexagonMCInstrInfo::getType(MCInstrInfo const &MCII,
-                                     MCInst const &MCI) {
-  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
-
-  return ((F >> HexagonII::TypePos) & HexagonII::TypeMask);
-}
-
+/// Return the slots this instruction can execute out of
 unsigned HexagonMCInstrInfo::getUnits(MCInstrInfo const &MCII,
                                       MCSubtargetInfo const &STI,
                                       MCInst const &MCI) {
-
   const InstrItinerary *II = STI.getSchedModel().InstrItineraries;
   int SchedClass = HexagonMCInstrInfo::getDesc(MCII, MCI).getSchedClass();
   return ((II[SchedClass].FirstStage + HexagonStages)->getUnits());
 }
 
-bool HexagonMCInstrInfo::hasImmExt(MCInst const &MCI) {
+/// Return the slots this instruction consumes in addition to
+/// the slot(s) it can execute out of, as the OR of their unit masks.
+/// Returns 0 when no additional slots are consumed.
+unsigned HexagonMCInstrInfo::getOtherReservedSlots(MCInstrInfo const &MCII,
+                                                   MCSubtargetInfo const &STI,
+                                                   MCInst const &MCI) {
+  const InstrItinerary *II = STI.getSchedModel().InstrItineraries;
+  int SchedClass = HexagonMCInstrInfo::getDesc(MCII, MCI).getSchedClass();
+  unsigned Slots = 0;
+
+  // FirstStage are slots that this instruction can execute in.
+  // FirstStage+1 are slots that are also consumed by this instruction.
+  // For example: vmemu can only execute in slot 0 but also consumes slot 1.
+  for (unsigned Stage = II[SchedClass].FirstStage + 1;
+       Stage < II[SchedClass].LastStage; ++Stage) {
+    unsigned Units = (Stage + HexagonStages)->getUnits();
+    if (Units > HexagonGetLastSlot())
+      break;
+    // fyi: getUnits() will return 0x1, 0x2, 0x4 or 0x8
+    Slots |= Units;
+  }
+
+  // if 0 is returned, then no additional slots are consumed by this inst.
+  return Slots;
+}
+
+bool HexagonMCInstrInfo::hasDuplex(MCInstrInfo const &MCII, MCInst const &MCI) {
   if (!HexagonMCInstrInfo::isBundle(MCI))
     return false;
 
   for (const auto &I : HexagonMCInstrInfo::bundleInstructions(MCI)) {
     auto MI = I.getInst();
-    if (isImmext(*MI))
+    if (HexagonMCInstrInfo::isDuplex(MCII, *MI))
       return true;
   }
 
@@ -398,7 +410,20 @@ bool HexagonMCInstrInfo::hasExtenderForIndex(MCInst const &MCB, size_t Index) {
   return extenderForIndex(MCB, Index) != nullptr;
 }
 
-// Return whether the instruction is a legal new-value producer.
+bool HexagonMCInstrInfo::hasImmExt(MCInst const &MCI) {
+  if (!HexagonMCInstrInfo::isBundle(MCI))
+    return false; // Only bundles are searched.
+
+  for (const auto &I : HexagonMCInstrInfo::bundleInstructions(MCI)) {
+    auto MI = I.getInst();
+    if (isImmext(*MI))
+      return true; // At least one bundled instruction satisfies isImmext().
+  }
+
+  return false;
+}
+
+/// Return whether the insn produces a value.
 bool HexagonMCInstrInfo::hasNewValue(MCInstrInfo const &MCII,
                                      MCInst const &MCI) {
   const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
@@ -418,46 +443,19 @@ MCInst const &HexagonMCInstrInfo::instruction(MCInst const &MCB, size_t Index) {
   return *MCB.getOperand(bundleInstructionsOffset + Index).getInst();
 }
 
+/// Return whether the instruction is an accumulator.
+bool HexagonMCInstrInfo::isAccumulator(MCInstrInfo const &MCII,
+                                       MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::AccumulatorPos) & HexagonII::AccumulatorMask);
+}
+
 bool HexagonMCInstrInfo::isBundle(MCInst const &MCI) {
   auto Result = Hexagon::BUNDLE == MCI.getOpcode();
   assert(!Result || (MCI.size() > 0 && MCI.getOperand(0).isImm()));
   return Result;
 }
 
-// Return whether the insn is an actual insn.
-bool HexagonMCInstrInfo::isCanon(MCInstrInfo const &MCII, MCInst const &MCI) {
-  return (!HexagonMCInstrInfo::getDesc(MCII, MCI).isPseudo() &&
-          !HexagonMCInstrInfo::isPrefix(MCII, MCI) &&
-          HexagonMCInstrInfo::getType(MCII, MCI) != HexagonII::TypeENDLOOP);
-}
-
-bool HexagonMCInstrInfo::isCofMax1(MCInstrInfo const &MCII, MCInst const &MCI) {
-  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
-  return ((F >> HexagonII::CofMax1Pos) & HexagonII::CofMax1Mask);
-}
-
-bool HexagonMCInstrInfo::isCompound(MCInstrInfo const &MCII,
-                                    MCInst const &MCI) {
-  return (getType(MCII, MCI) == HexagonII::TypeCOMPOUND);
-}
-
-bool HexagonMCInstrInfo::isDblRegForSubInst(unsigned Reg) {
-  return ((Reg >= Hexagon::D0 && Reg <= Hexagon::D3) ||
-          (Reg >= Hexagon::D8 && Reg <= Hexagon::D11));
-}
-
-bool HexagonMCInstrInfo::isDuplex(MCInstrInfo const &MCII, MCInst const &MCI) {
-  return HexagonII::TypeDUPLEX == HexagonMCInstrInfo::getType(MCII, MCI);
-}
-
-// Return whether the instruction needs to be constant extended.
-// 1) Always return true if the instruction has 'isExtended' flag set.
-//
-// isExtendable:
-// 2) For immediate extended operands, return true only if the value is
-//    out-of-range.
-// 3) For global address, always return true.
-
 bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII,
                                          MCInst const &MCI) {
   if (HexagonMCInstrInfo::isExtended(MCII, MCI))
@@ -470,9 +468,9 @@ bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII,
     return true;
   // Branch insns are handled as necessary by relaxation.
   if ((HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeJ) ||
-      (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCOMPOUND &&
+      (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCJ &&
        HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch()) ||
-      (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNV &&
+      (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeNCJ &&
        HexagonMCInstrInfo::getDesc(MCII, MCI).isBranch()))
     return false;
   // Otherwise loop instructions and other CR insts are handled by relaxation
@@ -492,6 +490,30 @@ bool HexagonMCInstrInfo::isConstExtended(MCInstrInfo const &MCII,
   return (MinValue > Value || Value > MaxValue);
 }
 
+bool HexagonMCInstrInfo::isCanon(MCInstrInfo const &MCII, MCInst const &MCI) {
+  return !HexagonMCInstrInfo::getDesc(MCII, MCI).isPseudo() &&
+         !HexagonMCInstrInfo::isPrefix(MCII, MCI);
+}
+
+bool HexagonMCInstrInfo::isCofMax1(MCInstrInfo const &MCII, MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::CofMax1Pos) & HexagonII::CofMax1Mask);
+}
+
+bool HexagonMCInstrInfo::isCompound(MCInstrInfo const &MCII,
+                                    MCInst const &MCI) {
+  return (getType(MCII, MCI) == HexagonII::TypeCJ);
+}
+
+bool HexagonMCInstrInfo::isDblRegForSubInst(unsigned Reg) {
+  return ((Reg >= Hexagon::D0 && Reg <= Hexagon::D3) ||
+          (Reg >= Hexagon::D8 && Reg <= Hexagon::D11));
+}
+
+bool HexagonMCInstrInfo::isDuplex(MCInstrInfo const &MCII, MCInst const &MCI) {
+  return HexagonII::TypeDUPLEX == HexagonMCInstrInfo::getType(MCII, MCI);
+}
+
 bool HexagonMCInstrInfo::isExtendable(MCInstrInfo const &MCII,
                                       MCInst const &MCI) {
   uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
@@ -510,9 +532,7 @@ bool HexagonMCInstrInfo::isFloat(MCInstrInfo const &MCII, MCInst const &MCI) {
 }
 
 bool HexagonMCInstrInfo::isImmext(MCInst const &MCI) {
-  auto Op = MCI.getOpcode();
-  return (Op == Hexagon::A4_ext_b || Op == Hexagon::A4_ext_c ||
-          Op == Hexagon::A4_ext_g || Op == Hexagon::A4_ext);
+  return MCI.getOpcode() == Hexagon::A4_ext;
 }
 
 bool HexagonMCInstrInfo::isInnerLoop(MCInst const &MCI) {
@@ -530,20 +550,17 @@ bool HexagonMCInstrInfo::isIntRegForSubInst(unsigned Reg) {
           (Reg >= Hexagon::R16 && Reg <= Hexagon::R23));
 }
 
-// Return whether the insn is a new-value consumer.
+/// Return whether the insn expects a newly produced value.
 bool HexagonMCInstrInfo::isNewValue(MCInstrInfo const &MCII,
                                     MCInst const &MCI) {
   const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
   return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask);
 }
 
-// Return whether the operand can be constant extended.
-bool HexagonMCInstrInfo::isOperandExtended(MCInstrInfo const &MCII,
-                                           MCInst const &MCI,
-                                           unsigned short OperandNum) {
-  uint64_t const F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
-  return ((F >> HexagonII::ExtendableOpPos) & HexagonII::ExtendableOpMask) ==
-         OperandNum;
+/// Return whether the operand is extendable.
+bool HexagonMCInstrInfo::isOpExtendable(MCInstrInfo const &MCII,
+                                        MCInst const &MCI, unsigned short O) {
+  return (O == HexagonMCInstrInfo::getExtendableOp(MCII, MCI));
 }
 
 bool HexagonMCInstrInfo::isOuterLoop(MCInst const &MCI) {
@@ -558,6 +575,10 @@ bool HexagonMCInstrInfo::isPredicated(MCInstrInfo const &MCII,
   return ((F >> HexagonII::PredicatedPos) & HexagonII::PredicatedMask);
 }
 
+bool HexagonMCInstrInfo::isPrefix(MCInstrInfo const &MCII, MCInst const &MCI) {
+  return HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeEXTENDER;
+}
+
 bool HexagonMCInstrInfo::isPredicateLate(MCInstrInfo const &MCII,
                                          MCInst const &MCI) {
   const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
@@ -582,12 +603,22 @@ bool HexagonMCInstrInfo::isPredReg(unsigned Reg) {
   return (Reg >= Hexagon::P0 && Reg <= Hexagon::P3_0);
 }
 
-bool HexagonMCInstrInfo::isPrefix(MCInstrInfo const &MCII, MCInst const &MCI) {
-  return (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypePREFIX);
+/// Return whether the insn can be packaged only with A and X-type insns.
+bool HexagonMCInstrInfo::isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI) {
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::SoloAXPos) & HexagonII::SoloAXMask);
 }
 
-bool HexagonMCInstrInfo::isSolo(MCInstrInfo const &MCII, MCInst const &MCI) {
+/// Return whether the insn can be packaged only with an A-type insn in slot #1.
+bool HexagonMCInstrInfo::isSoloAin1(MCInstrInfo const &MCII,
+                                    MCInst const &MCI) {
   const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return ((F >> HexagonII::SoloAin1Pos) & HexagonII::SoloAin1Mask);
+}
+
+/// Return whether the insn is solo, i.e., cannot be in a packet.
+bool HexagonMCInstrInfo::isSolo(MCInstrInfo const &MCII, MCInst const &MCI) {
+  const uint64_t F = MCII.get(MCI.getOpcode()).TSFlags;
   return ((F >> HexagonII::SoloPos) & HexagonII::SoloMask);
 }
 
@@ -663,17 +694,6 @@ bool HexagonMCInstrInfo::isSubInstruction(MCInst const &MCI) {
   }
 }
 
-bool HexagonMCInstrInfo::isSoloAX(MCInstrInfo const &MCII, MCInst const &MCI) {
-  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
-  return ((F >> HexagonII::SoloAXPos) & HexagonII::SoloAXMask);
-}
-
-bool HexagonMCInstrInfo::isSoloAin1(MCInstrInfo const &MCII,
-                                    MCInst const &MCI) {
-  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
-  return ((F >> HexagonII::SoloAin1Pos) & HexagonII::SoloAin1Mask);
-}
-
 bool HexagonMCInstrInfo::isVector(MCInstrInfo const &MCII, MCInst const &MCI) {
   if ((getType(MCII, MCI) <= HexagonII::TypeCVI_LAST) &&
       (getType(MCII, MCI) >= HexagonII::TypeCVI_FIRST))
@@ -705,16 +725,26 @@ bool HexagonMCInstrInfo::mustExtend(MCExpr const &Expr) {
   return HExpr.mustExtend();
 }
 void HexagonMCInstrInfo::setMustNotExtend(MCExpr const &Expr, bool Val) {
-  HexagonMCExpr &HExpr =
-      const_cast<HexagonMCExpr &>(cast<HexagonMCExpr>(Expr));
+  HexagonMCExpr &HExpr = const_cast<HexagonMCExpr &>(cast<HexagonMCExpr>(Expr));
   HExpr.setMustNotExtend(Val);
 }
 bool HexagonMCInstrInfo::mustNotExtend(MCExpr const &Expr) {
   HexagonMCExpr const &HExpr = cast<HexagonMCExpr>(Expr);
   return HExpr.mustNotExtend();
 }
+void HexagonMCInstrInfo::setS23_2_reloc(MCExpr const &Expr, bool Val) {
+  // Expr must be a HexagonMCExpr; constness is cast away to update its flag.
+  auto const &Wrapped = *llvm::cast<HexagonMCExpr>(&Expr);
+  const_cast<HexagonMCExpr &>(Wrapped).setS23_2_reloc(Val);
+}
+bool HexagonMCInstrInfo::s23_2_reloc(MCExpr const &Expr) {
+  // Any expression that is not a HexagonMCExpr reports false.
+  if (auto const *HExpr = llvm::dyn_cast<HexagonMCExpr>(&Expr))
+    return HExpr->s23_2_reloc();
+  return false;
+}
 
-void HexagonMCInstrInfo::padEndloop(MCContext &Context, MCInst &MCB) {
+void HexagonMCInstrInfo::padEndloop(MCInst &MCB, MCContext &Context) {
   MCInst Nop;
   Nop.setOpcode(Hexagon::A2_nop);
   assert(isBundle(MCB));
@@ -727,22 +757,8 @@ void HexagonMCInstrInfo::padEndloop(MCContext &Context, MCInst &MCB) {
 
 bool HexagonMCInstrInfo::prefersSlot3(MCInstrInfo const &MCII,
                                       MCInst const &MCI) {
-  if (HexagonMCInstrInfo::getType(MCII, MCI) == HexagonII::TypeCR)
-    return false;
-
-  unsigned SchedClass = HexagonMCInstrInfo::getDesc(MCII, MCI).getSchedClass();
-  switch (SchedClass) {
-  case Hexagon::Sched::ALU32_3op_tc_2_SLOT0123:
-  case Hexagon::Sched::ALU64_tc_2_SLOT23:
-  case Hexagon::Sched::ALU64_tc_3x_SLOT23:
-  case Hexagon::Sched::M_tc_2_SLOT23:
-  case Hexagon::Sched::M_tc_3x_SLOT23:
-  case Hexagon::Sched::S_2op_tc_2_SLOT23:
-  case Hexagon::Sched::S_3op_tc_2_SLOT23:
-  case Hexagon::Sched::S_3op_tc_3x_SLOT23:
-    return true;
-  }
-  return false;
+  const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+  return (F >> HexagonII::PrefersSlot3Pos) & HexagonII::PrefersSlot3Mask;
 }
 
 void HexagonMCInstrInfo::replaceDuplex(MCContext &Context, MCInst &MCB,
@@ -778,15 +794,6 @@ void HexagonMCInstrInfo::setMemStoreReorderEnabled(MCInst &MCI) {
   Operand.setImm(Operand.getImm() | memStoreReorderEnabledMask);
   assert(isMemStoreReorderEnabled(MCI));
 }
-void HexagonMCInstrInfo::setS23_2_reloc(MCExpr const &Expr, bool Val) {
-  HexagonMCExpr &HExpr =
-      const_cast<HexagonMCExpr &>(*llvm::cast<HexagonMCExpr>(&Expr));
-  HExpr.setS23_2_reloc(Val);
-}
-bool HexagonMCInstrInfo::s23_2_reloc(MCExpr const &Expr) {
-  HexagonMCExpr const &HExpr = *llvm::cast<HexagonMCExpr>(&Expr);
-  return HExpr.s23_2_reloc();
-}
 
 void HexagonMCInstrInfo::setOuterLoop(MCInst &MCI) {
   assert(isBundle(MCI));
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
index d701c3ade69e93315d9353199f24694dedc338f4..2e989adb5ccbf0830b73a5128934edec325b7110 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
@@ -19,11 +19,8 @@
 
 namespace llvm {
 class HexagonMCChecker;
-class MCContext;
 class MCInstrDesc;
 class MCInstrInfo;
-class MCInst;
-class MCOperand;
 class MCSubtargetInfo;
 namespace HexagonII {
 enum class MemAccessSize;
@@ -67,16 +64,6 @@ bool canonicalizePacket(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
                         MCContext &Context, MCInst &MCB,
                         HexagonMCChecker *Checker);
 
-// Clamp off upper 26 bits of extendable operand for emission
-void clampExtended(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI);
-
-MCInst createBundle();
-
-// Return the extender for instruction at Index or nullptr if none
-MCInst const *extenderForIndex(MCInst const &MCB, size_t Index);
-void extendIfNeeded(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB,
-                    MCInst const &MCI);
-
 // Create a duplex instruction given the two subinsts
 MCInst *deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0,
                      MCInst const &inst1);
@@ -86,27 +73,28 @@ MCInst deriveExtender(MCInstrInfo const &MCII, MCInst const &Inst,
 // Convert this instruction in to a duplex subinst
 MCInst deriveSubInst(MCInst const &Inst);
 
+// Clamp off upper 26 bits of extendable operand for emission
+void clampExtended(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI);
+
+MCInst createBundle();
+
 // Return the extender for instruction at Index or nullptr if none
 MCInst const *extenderForIndex(MCInst const &MCB, size_t Index);
+void extendIfNeeded(MCContext &Context, MCInstrInfo const &MCII, MCInst &MCB,
+                    MCInst const &MCI);
 
 // Return memory access size
 HexagonII::MemAccessSize getAccessSize(MCInstrInfo const &MCII,
                                        MCInst const &MCI);
-
-// Return number of bits in the constant extended operand.
-unsigned getBitCount(MCInstrInfo const &MCII, MCInst const &MCI);
-
-// Return constant extended operand number.
-unsigned short getCExtOpNum(MCInstrInfo const &MCII, MCInst const &MCI);
-
 MCInstrDesc const &getDesc(MCInstrInfo const &MCII, MCInst const &MCI);
 
 // Return which duplex group this instruction belongs to
 unsigned getDuplexCandidateGroup(MCInst const &MI);
 
 // Return a list of all possible instruction duplex combinations
-SmallVector<DuplexCandidate, 8> getDuplexPossibilties(MCInstrInfo const &MCII,
-                                                      MCInst const &MCB);
+SmallVector<DuplexCandidate, 8>
+getDuplexPossibilties(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                      MCInst const &MCB);
 unsigned getDuplexRegisterNumbering(unsigned Reg);
 
 MCExpr const &getExpr(MCExpr const &Expr);
@@ -143,7 +131,6 @@ MCOperand const &getNewValueOperand(MCInstrInfo const &MCII, MCInst const &MCI);
 unsigned short getNewValueOp2(MCInstrInfo const &MCII, MCInst const &MCI);
 MCOperand const &getNewValueOperand2(MCInstrInfo const &MCII,
                                      MCInst const &MCI);
-
 int getSubTarget(MCInstrInfo const &MCII, MCInst const &MCI);
 
 // Return the Hexagon ISA class for the insn.
@@ -152,6 +139,9 @@ unsigned getType(MCInstrInfo const &MCII, MCInst const &MCI);
 /// Return the slots used by the insn.
 unsigned getUnits(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
                   MCInst const &MCI);
+unsigned getOtherReservedSlots(MCInstrInfo const &MCII,
+                               MCSubtargetInfo const &STI, MCInst const &MCI);
+bool hasDuplex(MCInstrInfo const &MCII, MCInst const &MCI);
 
 // Does the packet have an extender for the instruction at Index
 bool hasExtenderForIndex(MCInst const &MCB, size_t Index);
@@ -161,19 +151,6 @@ bool hasImmExt(MCInst const &MCI);
 // Return whether the instruction is a legal new-value producer.
 bool hasNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
 bool hasNewValue2(MCInstrInfo const &MCII, MCInst const &MCI);
-
-// Return the instruction at Index
-MCInst const &instruction(MCInst const &MCB, size_t Index);
-
-// Returns whether this MCInst is a wellformed bundle
-bool isBundle(MCInst const &MCI);
-
-// Return whether the insn is an actual insn.
-bool isCanon(MCInstrInfo const &MCII, MCInst const &MCI);
-bool isCofMax1(MCInstrInfo const &MCII, MCInst const &MCI);
-bool isCompound(MCInstrInfo const &MCII, MCInst const &MCI);
-
-// Return the duplex iclass given the two duplex classes
 unsigned iClassOfDuplexPair(unsigned Ga, unsigned Gb);
 
 int64_t minConstant(MCInst const &MCI, size_t Index);
@@ -189,6 +166,18 @@ template <unsigned N> bool inRange(MCInst const &MCI, size_t Index) {
   return isUInt<N>(minConstant(MCI, Index));
 }
 
+// Return the instruction at Index
+MCInst const &instruction(MCInst const &MCB, size_t Index);
+bool isAccumulator(MCInstrInfo const &MCII, MCInst const &MCI);
+
+// Returns whether this MCInst is a wellformed bundle
+bool isBundle(MCInst const &MCI);
+
+// Return whether the insn is an actual insn.
+bool isCanon(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isCofMax1(MCInstrInfo const &MCII, MCInst const &MCI);
+bool isCompound(MCInstrInfo const &MCII, MCInst const &MCI);
+
 // Return whether the instruction needs to be constant extended.
 bool isConstExtended(MCInstrInfo const &MCII, MCInst const &MCI);
 
@@ -229,15 +218,12 @@ bool isMemStoreReorderEnabled(MCInst const &MCI);
 
 // Return whether the insn is a new-value consumer.
 bool isNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
-
-// Return true if the operand can be constant extended.
-bool isOperandExtended(MCInstrInfo const &MCII, MCInst const &MCI,
-                       unsigned short OperandNum);
+bool isOpExtendable(MCInstrInfo const &MCII, MCInst const &MCI, unsigned short);
 
 // Can these two instructions be duplexed
 bool isOrderedDuplexPair(MCInstrInfo const &MCII, MCInst const &MIa,
                          bool ExtendedA, MCInst const &MIb, bool ExtendedB,
-                         bool bisReversable);
+                         bool bisReversable, MCSubtargetInfo const &STI);
 
 // Returns whether this bundle is an endloop1
 bool isOuterLoop(MCInst const &MCI);
@@ -270,12 +256,11 @@ bool mustExtend(MCExpr const &Expr);
 bool mustNotExtend(MCExpr const &Expr);
 
 // Pad the bundle with nops to satisfy endloop requirements
-void padEndloop(MCContext &Context, MCInst &MCI);
-
+void padEndloop(MCInst &MCI, MCContext &Context);
 bool prefersSlot3(MCInstrInfo const &MCII, MCInst const &MCI);
 
 // Replace the instructions inside MCB, represented by Candidate
-void replaceDuplex(MCContext &Context, MCInst &MCB, DuplexCandidate Candidate);
+void replaceDuplex(MCContext &Context, MCInst &MCI, DuplexCandidate Candidate);
 
 bool s23_2_reloc(MCExpr const &Expr);
 // Marks a bundle as endloop0
@@ -295,7 +280,8 @@ unsigned SubregisterBit(unsigned Consumer, unsigned Producer,
                         unsigned Producer2);
 
 // Attempt to find and replace compound pairs
-void tryCompound(MCInstrInfo const &MCII, MCContext &Context, MCInst &MCI);
+void tryCompound(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+                 MCContext &Context, MCInst &MCI);
 }
 }
 
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
index 7f8e7a4edb0cc4ad56ea12fd08a3d8676d1a0207..529a5fd5ed82c09606d6b90b52eaafc80c53c36b 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.cpp
@@ -33,42 +33,39 @@ void HexagonMCShuffler::init(MCInst &MCB) {
     MCInst const *Extender = nullptr;
     // Copy the bundle for the shuffling.
     for (const auto &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
-      assert(!HexagonMCInstrInfo::getDesc(MCII, *I.getInst()).isPseudo());
-      MCInst *MI = const_cast<MCInst *>(I.getInst());
+      MCInst &MI = *const_cast<MCInst *>(I.getInst());
+      DEBUG(dbgs() << "Shuffling: " << MCII.getName(MI.getOpcode()) << '\n');
+      assert(!HexagonMCInstrInfo::getDesc(MCII, MI).isPseudo());
 
-      if (!HexagonMCInstrInfo::isImmext(*MI)) {
-        append(MI, Extender, HexagonMCInstrInfo::getUnits(MCII, STI, *MI),
-               false);
+      if (!HexagonMCInstrInfo::isImmext(MI)) {
+        append(MI, Extender, HexagonMCInstrInfo::getUnits(MCII, STI, MI));
         Extender = nullptr;
       } else
-        Extender = MI;
+        Extender = &MI;
     }
   }
 
   BundleFlags = MCB.getOperand(0).getImm();
 }
 
-void HexagonMCShuffler::init(MCInst &MCB, MCInst const *AddMI,
+void HexagonMCShuffler::init(MCInst &MCB, MCInst const &AddMI,
                              bool bInsertAtFront) {
   if (HexagonMCInstrInfo::isBundle(MCB)) {
-    if (bInsertAtFront && AddMI)
-      append(AddMI, nullptr, HexagonMCInstrInfo::getUnits(MCII, STI, *AddMI),
-             false);
+    if (bInsertAtFront)
+      append(AddMI, nullptr, HexagonMCInstrInfo::getUnits(MCII, STI, AddMI));
     MCInst const *Extender = nullptr;
     // Copy the bundle for the shuffling.
     for (auto const &I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
       assert(!HexagonMCInstrInfo::getDesc(MCII, *I.getInst()).isPseudo());
-      MCInst *MI = const_cast<MCInst *>(I.getInst());
-      if (!HexagonMCInstrInfo::isImmext(*MI)) {
-        append(MI, Extender, HexagonMCInstrInfo::getUnits(MCII, STI, *MI),
-               false);
+      MCInst &MI = *const_cast<MCInst *>(I.getInst());
+      if (!HexagonMCInstrInfo::isImmext(MI)) {
+        append(MI, Extender, HexagonMCInstrInfo::getUnits(MCII, STI, MI));
         Extender = nullptr;
       } else
-        Extender = MI;
+        Extender = &MI;
     }
-    if (!bInsertAtFront && AddMI)
-      append(AddMI, nullptr, HexagonMCInstrInfo::getUnits(MCII, STI, *AddMI),
-             false);
+    if (!bInsertAtFront)
+      append(AddMI, nullptr, HexagonMCInstrInfo::getUnits(MCII, STI, AddMI));
   }
 
   BundleFlags = MCB.getOperand(0).getImm();
@@ -80,11 +77,11 @@ void HexagonMCShuffler::copyTo(MCInst &MCB) {
   // Copy the results into the bundle.
   for (HexagonShuffler::iterator I = begin(); I != end(); ++I) {
 
-    MCInst const *MI = I->getDesc();
+    MCInst const &MI = I->getDesc();
     MCInst const *Extender = I->getExtender();
     if (Extender)
       MCB.addOperand(MCOperand::createInst(Extender));
-    MCB.addOperand(MCOperand::createInst(MI));
+    MCB.addOperand(MCOperand::createInst(&MI));
   }
 }
 
@@ -98,9 +95,9 @@ bool HexagonMCShuffler::reshuffleTo(MCInst &MCB) {
   return (!getError());
 }
 
-bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
-                            MCInst &MCB) {
-  HexagonMCShuffler MCS(MCII, STI, MCB);
+bool llvm::HexagonMCShuffle(bool Fatal, MCInstrInfo const &MCII,
+                            MCSubtargetInfo const &STI, MCInst &MCB) {
+  HexagonMCShuffler MCS(true, MCII, STI, MCB);
 
   if (DisableShuffle)
     // Ignore if user chose so.
@@ -124,6 +121,18 @@ bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
   if (!MCS.reshuffleTo(MCB)) {
     // Unless there is any error, which should not happen at this point.
     unsigned shuffleError = MCS.getError();
+
+    if (!Fatal && (shuffleError !=  HexagonShuffler::SHUFFLE_SUCCESS))
+      return false;
+    if (shuffleError !=  HexagonShuffler::SHUFFLE_SUCCESS) {
+      errs() << "\nFailing packet:\n";
+      for (const auto& I : HexagonMCInstrInfo::bundleInstructions(MCB)) {
+        MCInst *MI = const_cast<MCInst *>(I.getInst());
+        errs() << HexagonMCInstrInfo::getName(MCII, *MI) << ' ' << HexagonMCInstrInfo::getDesc(MCII, *MI).getOpcode() << '\n';
+      }
+      errs() << '\n';
+    }
+
     switch (shuffleError) {
     default:
       llvm_unreachable("unknown error");
@@ -176,7 +185,7 @@ llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
     DuplexCandidate duplexToTry = possibleDuplexes.pop_back_val();
     MCInst Attempt(MCB);
     HexagonMCInstrInfo::replaceDuplex(Context, Attempt, duplexToTry);
-    HexagonMCShuffler MCS(MCII, STI, Attempt); // copy packet to the shuffler
+    HexagonMCShuffler MCS(true, MCII, STI, Attempt); // copy packet to the shuffler
     if (MCS.size() == 1) {                     // case of one duplex
       // copy the created duplex in the shuffler to the bundle
       MCS.copyTo(MCB);
@@ -191,7 +200,7 @@ llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
   }
 
   if (doneShuffling == false) {
-    HexagonMCShuffler MCS(MCII, STI, MCB);
+    HexagonMCShuffler MCS(true, MCII, STI, MCB);
     doneShuffling = MCS.reshuffleTo(MCB); // shuffle
     shuffleError = MCS.getError();
   }
@@ -202,8 +211,8 @@ llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
 }
 
 bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
-                            MCInst &MCB, MCInst const *AddMI, int fixupCount) {
-  if (!HexagonMCInstrInfo::isBundle(MCB) || !AddMI)
+                            MCInst &MCB, MCInst const &AddMI, int fixupCount) {
+  if (!HexagonMCInstrInfo::isBundle(MCB))
     return false;
 
   // if fixups present, make sure we don't insert too many nops that would
@@ -211,8 +220,15 @@ bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
   unsigned int bundleSize = HexagonMCInstrInfo::bundleSize(MCB);
   if (bundleSize >= HEXAGON_PACKET_SIZE)
     return false;
+  bool bhasDuplex = HexagonMCInstrInfo::hasDuplex(MCII, MCB);
   if (fixupCount >= 2) {
-    return false;
+    if (bhasDuplex) {
+      if (bundleSize >= HEXAGON_PACKET_SIZE - 1) {
+        return false;
+      }
+    } else {
+      return false;
+    }
   } else {
     if (bundleSize == HEXAGON_PACKET_SIZE - 1 && fixupCount)
       return false;
@@ -221,7 +237,16 @@ bool llvm::HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
   if (DisableShuffle)
     return false;
 
-  HexagonMCShuffler MCS(MCII, STI, MCB, AddMI);
+  // mgl: temporary code. The shuffler doesn't take into account the fact that
+  // a duplex takes up two slots; for example, 3 nops can be put into a packet
+  // containing a duplex, oversubscribing slots by 1.
+  unsigned maxBundleSize = (HexagonMCInstrInfo::hasImmExt(MCB))
+                               ? HEXAGON_PACKET_SIZE
+                               : HEXAGON_PACKET_SIZE - 1;
+  if (bhasDuplex && bundleSize >= maxBundleSize)
+    return false;
+
+  HexagonMCShuffler MCS(MCII, STI, MCB, AddMI, false);
   if (!MCS.reshuffleTo(MCB)) {
     unsigned shuffleError = MCS.getError();
     switch (shuffleError) {
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
index a21cce1fc2409ab5dcda6171502a96b65dffdd61..14bbfda4c914a8ab191effae0ca4b9d9f9929e20 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCShuffler.h
@@ -27,16 +27,16 @@ class HexagonMCShuffler : public HexagonShuffler {
   bool duplex_present;
 
 public:
-  HexagonMCShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
-                    MCInst &MCB)
+  HexagonMCShuffler(bool Fatal, MCInstrInfo const &MCII,
+                    MCSubtargetInfo const &STI, MCInst &MCB)
       : HexagonShuffler(MCII, STI) {
     init(MCB);
   };
   HexagonMCShuffler(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
-                    MCInst &MCB, const MCInst *AddMI,
-                    bool bInsertAtFront = false)
+                    MCInst &MCB, MCInst const &AddMI,
+                    bool InsertAtFront)
       : HexagonShuffler(MCII, STI) {
-    init(MCB, AddMI, bInsertAtFront);
+    init(MCB, AddMI, InsertAtFront);
   };
 
   // Copy reordered bundle to another.
@@ -49,14 +49,14 @@ public:
 
 private:
   void init(MCInst &MCB);
-  void init(MCInst &MCB, const MCInst *AddMI, bool bInsertAtFront = false);
+  void init(MCInst &MCB, MCInst const &AddMI, bool InsertAtFront);
 };
 
 // Invocation of the shuffler.
+bool HexagonMCShuffle(bool Fatal, MCInstrInfo const &MCII,
+                      MCSubtargetInfo const &STI, MCInst &);
 bool HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
-                      MCInst &);
-bool HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
-                      MCInst &, const MCInst *, int);
+                      MCInst &, MCInst const &, int);
 unsigned HexagonMCShuffle(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
                           MCContext &Context, MCInst &,
                           SmallVector<DuplexCandidate, 8>);
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index 694cf582f8d9675ca51451e75264ecdaccd4f378..bb98c2bbef6d7134a28547264550e27a1a0c528e 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -22,6 +22,7 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDwarf.h"
 #include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInstrAnalysis.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
@@ -66,6 +67,12 @@ static cl::opt<bool> HexagonV55ArchVariant("mv55", cl::Hidden, cl::init(false),
 static cl::opt<bool> HexagonV60ArchVariant("mv60", cl::Hidden, cl::init(false),
   cl::desc("Build for Hexagon V60"));
 
+static cl::opt<bool> HexagonV62ArchVariant("mv62", cl::Hidden, cl::init(false),
+  cl::desc("Build for Hexagon V62"));
+
+static cl::opt<bool> EnableHVX("mhvx", cl::Hidden, cl::init(false),
+  cl::desc("Enable Hexagon Vector Extension (HVX)"));
+
 static StringRef DefaultArch = "hexagonv60";
 
 static StringRef HexagonGetArchVariant() {
@@ -77,6 +84,8 @@ static StringRef HexagonGetArchVariant() {
     return "hexagonv55";
   if (HexagonV60ArchVariant)
     return "hexagonv60";
+  if (HexagonV62ArchVariant)
+    return "hexagonv62";
   return "";
 }
 
@@ -95,31 +104,16 @@ StringRef Hexagon_MC::selectHexagonCPU(const Triple &TT, StringRef CPU) {
   return ArchV;
 }
 
-MCInstrInfo *llvm::createHexagonMCInstrInfo() {
-  MCInstrInfo *X = new MCInstrInfo();
-  InitHexagonMCInstrInfo(X);
-  return X;
-}
-
-static MCRegisterInfo *createHexagonMCRegisterInfo(const Triple &TT) {
-  MCRegisterInfo *X = new MCRegisterInfo();
-  InitHexagonMCRegisterInfo(X, Hexagon::R31);
-  return X;
-}
-
-static MCSubtargetInfo *
-createHexagonMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
-  CPU = Hexagon_MC::selectHexagonCPU(TT, CPU);
-  return createHexagonMCSubtargetInfoImpl(TT, CPU, FS);
-}
+unsigned llvm::HexagonGetLastSlot() { return HexagonItinerariesV4FU::SLOT3; }
 
 namespace {
 
 class HexagonTargetAsmStreamer : public HexagonTargetStreamer {
 public:
   HexagonTargetAsmStreamer(MCStreamer &S,
-                           formatted_raw_ostream &, bool,
-                           MCInstPrinter &)
+                           formatted_raw_ostream &OS,
+                           bool isVerboseAsm,
+                           MCInstPrinter &IP)
       : HexagonTargetStreamer(S) {}
 
   void prettyPrintAsm(MCInstPrinter &InstPrinter, raw_ostream &OS,
@@ -156,24 +150,15 @@ public:
 
 class HexagonTargetELFStreamer : public HexagonTargetStreamer {
 public:
+  MCELFStreamer &getStreamer() {
+    return static_cast<MCELFStreamer &>(Streamer);
+  }
   HexagonTargetELFStreamer(MCStreamer &S, MCSubtargetInfo const &STI)
       : HexagonTargetStreamer(S) {
-    auto Bits = STI.getFeatureBits();
-    unsigned Flags = 0;
-    if (Bits[Hexagon::ArchV60])
-      Flags = ELF::EF_HEXAGON_MACH_V60;
-    else if (Bits[Hexagon::ArchV55])
-      Flags = ELF::EF_HEXAGON_MACH_V55;
-    else if (Bits[Hexagon::ArchV5])
-      Flags = ELF::EF_HEXAGON_MACH_V5;
-    else if (Bits[Hexagon::ArchV4])
-      Flags = ELF::EF_HEXAGON_MACH_V4;
-    getStreamer().getAssembler().setELFHeaderEFlags(Flags);
+    MCAssembler &MCA = getStreamer().getAssembler();
+    MCA.setELFHeaderEFlags(Hexagon_MC::GetELFFlags(STI));
   }
 
-  MCELFStreamer &getStreamer() {
-    return static_cast<MCELFStreamer &>(Streamer);
-  }
 
   void EmitCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
                               unsigned ByteAlignment,
@@ -196,13 +181,26 @@ public:
 
 } // end anonymous namespace
 
+llvm::MCInstrInfo *llvm::createHexagonMCInstrInfo() {
+  auto *Info = new MCInstrInfo(); // handed back to the caller, not freed here
+  InitHexagonMCInstrInfo(Info);
+  return Info;
+}
+
+static MCRegisterInfo *createHexagonMCRegisterInfo(const Triple &TT) {
+  auto *RegInfo = new MCRegisterInfo(); // TT is not consulted here
+  InitHexagonMCRegisterInfo(RegInfo, Hexagon::R31);
+  return RegInfo;
+}
+
 static MCAsmInfo *createHexagonMCAsmInfo(const MCRegisterInfo &MRI,
                                          const Triple &TT) {
   MCAsmInfo *MAI = new HexagonMCAsmInfo(TT);
 
   // VirtualFP = (R30 + #0).
   MCCFIInstruction Inst =
-      MCCFIInstruction::createDefCfa(nullptr, Hexagon::R30, 0);
+      MCCFIInstruction::createDefCfa(nullptr,
+          MRI.getDwarfRegNum(Hexagon::R30, true), 0);
   MAI->addInitialFrameState(Inst);
 
   return MAI;
@@ -212,31 +210,138 @@ static MCInstPrinter *createHexagonMCInstPrinter(const Triple &T,
                                                  unsigned SyntaxVariant,
                                                  const MCAsmInfo &MAI,
                                                  const MCInstrInfo &MII,
-                                                 const MCRegisterInfo &MRI) {
+                                                 const MCRegisterInfo &MRI)
+{
   if (SyntaxVariant == 0)
-    return (new HexagonInstPrinter(MAI, MII, MRI));
+    return new HexagonInstPrinter(MAI, MII, MRI);
   else
     return nullptr;
 }
 
-static MCTargetStreamer *createMCAsmTargetStreamer(MCStreamer &S,
-                                                   formatted_raw_ostream &OS,
-                                                   MCInstPrinter *InstPrint,
-                                                   bool IsVerboseAsm) {
-  return new HexagonTargetAsmStreamer(S,  OS, IsVerboseAsm, *InstPrint);
+static MCTargetStreamer *
+createMCAsmTargetStreamer(MCStreamer &S, formatted_raw_ostream &OS,
+                          MCInstPrinter *IP, bool IsVerboseAsm) {
+  return new HexagonTargetAsmStreamer(S, OS, IsVerboseAsm, *IP);
 }
 
-static MCStreamer *createMCStreamer(Triple const &T, MCContext &Context,
-                                    MCAsmBackend &MAB, raw_pwrite_stream &OS,
-                                    MCCodeEmitter *Emitter, bool RelaxAll) {
-  return createHexagonELFStreamer(Context, MAB, OS, Emitter);
+static MCStreamer *createMCStreamer(Triple const &T,
+                                    MCContext &Context,
+                                    MCAsmBackend &MAB,
+                                    raw_pwrite_stream &OS,
+                                    MCCodeEmitter *Emitter,
+                                    bool RelaxAll) {
+  return createHexagonELFStreamer(T, Context, MAB, OS, Emitter);
 }
 
 static MCTargetStreamer *
-createHexagonObjectTargetStreamer(MCStreamer &S, MCSubtargetInfo const &STI) {
+createHexagonObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
   return new HexagonTargetELFStreamer(S, STI);
 }
 
+static void LLVM_ATTRIBUTE_UNUSED clearFeature(MCSubtargetInfo* STI, uint64_t F) {
+  // Test bit F directly; to_ullong()/(1ULL << F) break for indices above 63.
+  if (STI->getFeatureBits()[F])
+    STI->ToggleFeature(F);
+}
+
+static bool LLVM_ATTRIBUTE_UNUSED checkFeature(MCSubtargetInfo* STI, uint64_t F) {
+  // Test bit F directly; to_ullong()/(1ULL << F) break for indices above 63.
+  return STI->getFeatureBits()[F];
+}
+
+/// Derive the default feature string for the CPU selected from \p TT/\p CPU.
+StringRef Hexagon_MC::ParseHexagonTriple(const Triple &TT, StringRef CPU) {
+  // "+hvx" is produced only when -mhvx is given and the CPU is v60 or v62.
+  StringRef CPUName = Hexagon_MC::selectHexagonCPU(TT, CPU);
+  if (!EnableHVX)
+    return "";
+  bool SupportsHVX = CPUName.equals_lower("hexagonv60") ||
+                     CPUName.equals_lower("hexagonv62");
+  return SupportsHVX ? "+hvx" : "";
+}
+
+/// Return true if \p CPU names one of the supported Hexagon CPUs (exact,
+/// case-sensitive match).
+static bool isCPUValid(const std::string &CPU) {
+  // Function-local static so the table is built only once.
+  static const std::vector<std::string> Table{
+      "hexagonv4",
+      "hexagonv5",
+      "hexagonv55",
+      "hexagonv60",
+      "hexagonv62",
+  };
+  return std::find(Table.begin(), Table.end(), CPU) != Table.end();
+}
+
+MCSubtargetInfo *Hexagon_MC::createHexagonMCSubtargetInfo(const Triple &TT,
+                                                          StringRef CPU,
+                                                          StringRef FS) {
+  // A non-empty FS is used as given; otherwise derive a default one.
+  StringRef ArchFS = FS.empty() ? Hexagon_MC::ParseHexagonTriple(TT, CPU) : FS;
+  StringRef CPUName = Hexagon_MC::selectHexagonCPU(TT, CPU);
+  std::string const CPUStr = CPUName.str();
+  if (!isCPUValid(CPUStr)) {
+    errs() << "error: invalid CPU \"" << CPUStr.c_str() << "\" specified\n";
+    return nullptr;
+  }
+  MCSubtargetInfo *STI = createHexagonMCSubtargetInfoImpl(TT, CPUName, ArchFS);
+  // If ExtensionHVXDbl is set, also turn on ExtensionHVX.
+  llvm::FeatureBitset Features = STI->getFeatureBits();
+  if (Features[Hexagon::ExtensionHVXDbl])
+    STI->setFeatureBits(Features.set(Hexagon::ExtensionHVX));
+  return STI;
+}
+
+unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) {
+  // Lookup table from CPU name to the matching ELF header machine flag.
+  static std::map<StringRef, unsigned> CPUToFlags = {
+    {"hexagonv4",  ELF::EF_HEXAGON_MACH_V4},
+    {"hexagonv5",  ELF::EF_HEXAGON_MACH_V5},
+    {"hexagonv55", ELF::EF_HEXAGON_MACH_V55},
+    {"hexagonv60", ELF::EF_HEXAGON_MACH_V60},
+    {"hexagonv62", ELF::EF_HEXAGON_MACH_V62},
+  };
+  auto Entry = CPUToFlags.find(STI.getCPU());
+  assert(Entry != CPUToFlags.end() && "Unrecognized Architecture");
+  return Entry->second;
+}
+
+namespace {
+class HexagonMCInstrAnalysis : public MCInstrAnalysis {
+public:
+  HexagonMCInstrAnalysis(MCInstrInfo const *Info) : MCInstrAnalysis(Info) {}
+
+  // Defers to the generic MCInstrAnalysis query.
+  bool isUnconditionalBranch(MCInst const &Inst) const override {
+    return MCInstrAnalysis::isUnconditionalBranch(Inst);
+  }
+
+  // Defers to the generic MCInstrAnalysis query.
+  bool isConditionalBranch(MCInst const &Inst) const override {
+    return MCInstrAnalysis::isConditionalBranch(Inst);
+  }
+
+  // Target is the extendable operand's value when it folds to a constant.
+  bool evaluateBranch(MCInst const &Inst, uint64_t Addr,
+                      uint64_t Size, uint64_t &Target) const override {
+    if (!HexagonMCInstrInfo::isExtendable(*Info, Inst))
+      return false;
+    auto const &Op = HexagonMCInstrInfo::getExtendableOperand(*Info, Inst);
+    assert(Op.isExpr());
+    int64_t Absolute;
+    if (!Op.getExpr()->evaluateAsAbsolute(Absolute))
+      return false;
+    Target = Absolute;
+    return true;
+  }
+};
+}
+
+static MCInstrAnalysis *createHexagonMCInstrAnalysis(const MCInstrInfo *Info) {
+  return new HexagonMCInstrAnalysis(Info); // factory given to TargetRegistry
+}
+
 // Force static initialization.
 extern "C" void LLVMInitializeHexagonTargetMC() {
   // Register the MC asm info.
@@ -252,7 +357,7 @@ extern "C" void LLVMInitializeHexagonTargetMC() {
 
   // Register the MC subtarget info.
   TargetRegistry::RegisterMCSubtargetInfo(getTheHexagonTarget(),
-                                          createHexagonMCSubtargetInfo);
+    Hexagon_MC::createHexagonMCSubtargetInfo);
 
   // Register the MC Code Emitter
   TargetRegistry::RegisterMCCodeEmitter(getTheHexagonTarget(),
@@ -262,8 +367,18 @@ extern "C" void LLVMInitializeHexagonTargetMC() {
   TargetRegistry::RegisterMCAsmBackend(getTheHexagonTarget(),
                                        createHexagonAsmBackend);
 
+
+  // Register the MC instruction analyzer.
+  TargetRegistry::RegisterMCInstrAnalysis(getTheHexagonTarget(),
+                                          createHexagonMCInstrAnalysis);
+
   // Register the obj streamer
-  TargetRegistry::RegisterELFStreamer(getTheHexagonTarget(), createMCStreamer);
+  TargetRegistry::RegisterELFStreamer(getTheHexagonTarget(),
+                                      createMCStreamer);
+
+  // Register the obj target streamer
+  TargetRegistry::RegisterObjectTargetStreamer(getTheHexagonTarget(),
+                                      createHexagonObjectTargetStreamer);
 
   // Register the asm streamer
   TargetRegistry::RegisterAsmTargetStreamer(getTheHexagonTarget(),
@@ -272,7 +387,4 @@ extern "C" void LLVMInitializeHexagonTargetMC() {
   // Register the MC Inst Printer
   TargetRegistry::RegisterMCInstPrinter(getTheHexagonTarget(),
                                         createHexagonMCInstPrinter);
-
-  TargetRegistry::RegisterObjectTargetStreamer(
-      getTheHexagonTarget(), createHexagonObjectTargetStreamer);
 }
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
index 6e677e9d9f8673d500851ba8c8bd596abab11d8b..6bb69be6142e5989e269a2dd94b3ea39a66d0062 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
@@ -41,6 +41,18 @@ extern cl::opt<bool> HexagonDisableDuplex;
 extern const InstrStage HexagonStages[];
 
 MCInstrInfo *createHexagonMCInstrInfo();
+MCRegisterInfo *createHexagonMCRegisterInfo(StringRef TT);
+
+namespace Hexagon_MC {
+  StringRef ParseHexagonTriple(const Triple &TT, StringRef CPU);
+  StringRef selectHexagonCPU(const Triple &TT, StringRef CPU);
+
+  /// Create a Hexagon MCSubtargetInfo instance. This is exposed so Asm parser,
+  /// etc. do not need to go through TargetRegistry.
+  MCSubtargetInfo *createHexagonMCSubtargetInfo(const Triple &TT, StringRef CPU,
+                                                StringRef FS);
+  unsigned GetELFFlags(const MCSubtargetInfo &STI);
+}
 
 MCCodeEmitter *createHexagonMCCodeEmitter(const MCInstrInfo &MCII,
                                           const MCRegisterInfo &MRI,
@@ -54,13 +66,9 @@ MCAsmBackend *createHexagonAsmBackend(const Target &T,
 MCObjectWriter *createHexagonELFObjectWriter(raw_pwrite_stream &OS,
                                              uint8_t OSABI, StringRef CPU);
 
-namespace Hexagon_MC {
-
-  StringRef selectHexagonCPU(const Triple &TT, StringRef CPU);
-
-} // end namespace Hexagon_MC
+unsigned HexagonGetLastSlot();
 
-} // end namespace llvm
+} // End llvm namespace
 
 // Define symbolic names for Hexagon registers.  This defines a mapping from
 // register name to register number.
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
index 88f37d620dcf8cf928968def112222b41a96fb66..853f76213d38dc5a3ef0c26208e6da3264ae4d98 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
@@ -22,6 +22,7 @@
 #include "MCTargetDesc/HexagonMCInstrInfo.h"
 #include "HexagonShuffler.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -37,16 +38,16 @@ class HexagonBid {
   unsigned Bid;
 
 public:
-  HexagonBid() : Bid(0){};
-  HexagonBid(unsigned B) { Bid = B ? MAX / countPopulation(B) : 0; };
+  HexagonBid() : Bid(0){}
+  HexagonBid(unsigned B) { Bid = B ? MAX / countPopulation(B) : 0; }
 
   // Check if the insn priority is overflowed.
-  bool isSold() const { return (Bid >= MAX); };
+  bool isSold() const { return (Bid >= MAX); }
 
   HexagonBid &operator+=(const HexagonBid &B) {
     Bid += B.Bid;
     return *this;
-  };
+  }
 };
 
 // Slot shuffling allocation.
@@ -56,7 +57,7 @@ class HexagonUnitAuction {
   unsigned isSold : HEXAGON_PACKET_SIZE;
 
 public:
-  HexagonUnitAuction() : isSold(0){};
+  HexagonUnitAuction(unsigned cs = 0) : isSold(cs){};
 
   // Allocate slots.
   bool bid(unsigned B) {
@@ -70,29 +71,29 @@ public:
           isSold |= Scores[i].isSold() << i;
         }
       return true;
-      ;
     } else
       // Error if the desired slots are already full.
       return false;
-  };
+  }
 };
 } // end anonymous namespace
 
 unsigned HexagonResource::setWeight(unsigned s) {
   const unsigned SlotWeight = 8;
   const unsigned MaskWeight = SlotWeight - 1;
-  bool Key = (1 << s) & getUnits();
-
-  // TODO: Improve this API so that we can prevent misuse statically.
-  assert(SlotWeight * s < 32 && "Argument to setWeight too large.");
+  unsigned Units = getUnits();
+  bool InRange = s < 32 / SlotWeight; // keeps both shifts by s below defined
 
   // Calculate relative weight of the insn for the given slot, weighing it the
   // heavier the more restrictive the insn is and the lowest the slots that the
   // insn may be executed in.
-  Weight =
-      (Key << (SlotWeight * s)) * ((MaskWeight - countPopulation(getUnits()))
-                                   << countTrailingZeros(getUnits()));
-  return (Weight);
+  // The weight is 0 when s is out of range or slot s is not among the units.
+  if (!InRange || ((1u << s) & Units) == 0)
+    return Weight = 0;
+
+  unsigned Ctpop = countPopulation(Units);
+  unsigned Cttz = countTrailingZeros(Units);
+  return Weight = (1u << (SlotWeight * s)) * ((MaskWeight - Ctpop) << Cttz);
 }
 
 void HexagonCVIResource::SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU) {
@@ -104,7 +105,10 @@ void HexagonCVIResource::SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU) {
   (*TUL)[HexagonII::TypeCVI_VP] = UnitsAndLanes(CVI_XLANE, 1);
   (*TUL)[HexagonII::TypeCVI_VP_VS] = UnitsAndLanes(CVI_XLANE, 2);
   (*TUL)[HexagonII::TypeCVI_VS] = UnitsAndLanes(CVI_SHIFT, 1);
-  (*TUL)[HexagonII::TypeCVI_VINLANESAT] = UnitsAndLanes(CVI_SHIFT, 1);
+  (*TUL)[HexagonII::TypeCVI_VINLANESAT] =
+      (CPU == "hexagonv60" || CPU == "hexagonv61" || CPU == "hexagonv61v1") ?
+      UnitsAndLanes(CVI_SHIFT, 1) :
+      UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
   (*TUL)[HexagonII::TypeCVI_VM_LD] =
       UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
   (*TUL)[HexagonII::TypeCVI_VM_TMP_LD] = UnitsAndLanes(CVI_NONE, 0);
@@ -141,6 +145,40 @@ HexagonCVIResource::HexagonCVIResource(TypeUnitsAndLanes *TUL,
   }
 }
 
+struct CVIUnits {
+  unsigned Units; // unit bitmask, filled from CVI.getUnits() in check()
+  unsigned Lanes; // lane count, filled from CVI.getLanes() in check()
+};
+typedef SmallVector<struct CVIUnits, 8> HVXInstsT;
+
+/// OR \p startBit with itself shifted left by one, Lanes - 1 times; for a
+/// single-bit input this yields Lanes consecutive set bits.
+static unsigned makeAllBits(unsigned startBit, unsigned Lanes) {
+  for (unsigned Step = 1; Step < Lanes; ++Step)
+    startBit |= startBit << 1;
+  return startBit;
+}
+
+/// Backtracking search: returns true if every entry of hvxInsts from startIdx
+/// on can be given a run of Lanes unit bits that is disjoint from usedUnits.
+static bool checkHVXPipes(const HVXInstsT &hvxInsts, unsigned startIdx,
+                          unsigned usedUnits) {
+  if (startIdx >= hvxInsts.size())
+    return true; // every instruction has been placed
+  const struct CVIUnits &Cur = hvxInsts[startIdx];
+  if (!Cur.Units)
+    return checkHVXPipes(hvxInsts, startIdx + 1, usedUnits);
+  for (unsigned Bit = 0x1; Bit <= 0x8; Bit <<= 1) {
+    if ((Cur.Units & Bit) == 0)
+      continue;
+    unsigned Claimed = makeAllBits(Bit, Cur.Lanes);
+    if ((Claimed & usedUnits) == 0 &&
+        checkHVXPipes(hvxInsts, startIdx + 1, usedUnits | Claimed))
+      return true;
+  }
+  return false;
+}
+
 HexagonShuffler::HexagonShuffler(MCInstrInfo const &MCII,
                                  MCSubtargetInfo const &STI)
     : MCII(MCII), STI(STI) {
@@ -154,21 +192,82 @@ void HexagonShuffler::reset() {
   Error = SHUFFLE_SUCCESS;
 }
 
-void HexagonShuffler::append(MCInst const *ID, MCInst const *Extender,
-                             unsigned S, bool X) {
-  HexagonInstr PI(&TUL, MCII, ID, Extender, S, X);
+void HexagonShuffler::append(MCInst const &ID, MCInst const *Extender,
+                             unsigned S) {
+  HexagonInstr PI(&TUL, MCII, &ID, Extender, S);
 
   Packet.push_back(PI);
 }
 
+static struct {
+  unsigned first;
+  unsigned second;
+} jumpSlots[] = { {8, 4}, {8, 2}, {8, 1}, {4, 2}, {4, 1}, {2, 1} };
+#define MAX_JUMP_SLOTS (sizeof(jumpSlots)/sizeof(jumpSlots[0]))
+
+namespace {
+/// True if Opcode is one of the SA1_* duplex sub-instruction opcodes below.
+bool isDuplexAGroup(unsigned Opcode) {
+  switch (Opcode) {
+  case Hexagon::SA1_addi:
+  case Hexagon::SA1_addrx:
+  case Hexagon::SA1_addsp:
+  case Hexagon::SA1_and1:
+  case Hexagon::SA1_clrf:
+  case Hexagon::SA1_clrfnew:
+  case Hexagon::SA1_clrt:
+  case Hexagon::SA1_clrtnew:
+  case Hexagon::SA1_cmpeqi:
+  case Hexagon::SA1_combine0i:
+  case Hexagon::SA1_combine1i:
+  case Hexagon::SA1_combine2i:
+  case Hexagon::SA1_combine3i:
+  case Hexagon::SA1_combinerz:
+  case Hexagon::SA1_combinezr:
+  case Hexagon::SA1_dec:
+  case Hexagon::SA1_inc:
+  case Hexagon::SA1_seti:
+  case Hexagon::SA1_setin1:
+  case Hexagon::SA1_sxtb:
+  case Hexagon::SA1_sxth:
+  case Hexagon::SA1_tfr:
+  case Hexagon::SA1_zxtb:
+  case Hexagon::SA1_zxth:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// Count insns in ID (each sub-insn of a duplex) outside the A and X groups.
+unsigned countNeitherAnorX(MCInstrInfo const &MCII, MCInst const &ID) {
+  unsigned Type = HexagonMCInstrInfo::getType(MCII, ID);
+  if (Type == HexagonII::TypeDUPLEX) {
+    unsigned Op0 = ID.getOperand(0).getInst()->getOpcode();
+    unsigned Op1 = ID.getOperand(1).getInst()->getOpcode();
+    return !isDuplexAGroup(Op0) + !isDuplexAGroup(Op1);
+  }
+  // The A/X types listed here never count; TypeM counts only when float.
+  bool IsAorX = Type == HexagonII::TypeALU32_2op ||
+                Type == HexagonII::TypeALU32_3op ||
+                Type == HexagonII::TypeALU32_ADDI ||
+                Type == HexagonII::TypeS_2op ||
+                Type == HexagonII::TypeS_3op ||
+                Type == HexagonII::TypeALU64;
+  if (IsAorX)
+    return 0;
+  return Type != HexagonII::TypeM || HexagonMCInstrInfo::isFloat(MCII, ID);
+}
+} // end anonymous namespace
+
 /// Check that the packet is legal and enforce relative insn order.
 bool HexagonShuffler::check() {
   // Descriptive slot masks.
   const unsigned slotSingleLoad = 0x1, slotSingleStore = 0x1, slotOne = 0x2,
-                 slotThree = 0x8, slotFirstJump = 0x8, slotLastJump = 0x4,
+                 slotThree = 0x8, //slotFirstJump = 0x8,
                  slotFirstLoadStore = 0x2, slotLastLoadStore = 0x1;
   // Highest slots for branches and stores used to keep their original order.
-  unsigned slotJump = slotFirstJump;
+  //unsigned slotJump = slotFirstJump;
   unsigned slotLoadStore = slotFirstLoadStore;
   // Number of branches, solo branches, indirect branches.
   unsigned jumps = 0, jump1 = 0;
@@ -188,36 +287,41 @@ bool HexagonShuffler::check() {
   unsigned onlyNo1 = 0;
   unsigned xtypeFloat = 0;
   unsigned pSlot3Cnt = 0;
+  unsigned memops = 0;
+  unsigned deallocs = 0;
   iterator slot3ISJ = end();
+  std::vector<iterator> foundBranches;
+  unsigned reservedSlots = 0;
 
   // Collect information from the insns in the packet.
   for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
-    MCInst const *ID = ISJ->getDesc();
-
-    if (HexagonMCInstrInfo::isSolo(MCII, *ID))
-      solo += !ISJ->isSoloException();
-    else if (HexagonMCInstrInfo::isSoloAX(MCII, *ID))
-      onlyAX += !ISJ->isSoloException();
-    else if (HexagonMCInstrInfo::isSoloAin1(MCII, *ID))
-      onlyAin1 += !ISJ->isSoloException();
-    if (HexagonMCInstrInfo::getType(MCII, *ID) != HexagonII::TypeALU32 &&
-        HexagonMCInstrInfo::getType(MCII, *ID) != HexagonII::TypeXTYPE)
-      ++neitherAnorX;
-    if (HexagonMCInstrInfo::prefersSlot3(MCII, *ID)) {
+    MCInst const &ID = ISJ->getDesc();
+
+    if (HexagonMCInstrInfo::isSolo(MCII, ID))
+      solo++;
+    else if (HexagonMCInstrInfo::isSoloAX(MCII, ID))
+      onlyAX++;
+    else if (HexagonMCInstrInfo::isSoloAin1(MCII, ID))
+      onlyAin1++;
+    neitherAnorX += countNeitherAnorX(MCII, ID);
+    if (HexagonMCInstrInfo::prefersSlot3(MCII, ID)) {
       ++pSlot3Cnt;
       slot3ISJ = ISJ;
     }
-    if (HexagonMCInstrInfo::isCofMax1(MCII, *ID))
+    reservedSlots |= HexagonMCInstrInfo::getOtherReservedSlots(MCII, STI, ID);
+    if (HexagonMCInstrInfo::isCofMax1(MCII, ID))
       ++jump1;
 
-    switch (HexagonMCInstrInfo::getType(MCII, *ID)) {
-    case HexagonII::TypeXTYPE:
-      if (HexagonMCInstrInfo::isFloat(MCII, *ID))
+    switch (HexagonMCInstrInfo::getType(MCII, ID)) {
+    case HexagonII::TypeS_2op:
+    case HexagonII::TypeS_3op:
+    case HexagonII::TypeALU64:
+      if (HexagonMCInstrInfo::isFloat(MCII, ID))
         ++xtypeFloat;
       break;
-    case HexagonII::TypeJR:
     case HexagonII::TypeJ:
       ++jumps;
+      foundBranches.push_back(ISJ);
       break;
     case HexagonII::TypeCVI_VM_VP_LDU:
       ++onlyNo1;
@@ -228,10 +332,14 @@ bool HexagonShuffler::check() {
     case HexagonII::TypeLD:
       ++loads;
       ++memory;
-      if (ISJ->Core.getUnits() == slotSingleLoad)
+      if (ISJ->Core.getUnits() == slotSingleLoad ||
+          HexagonMCInstrInfo::getType(MCII, ID) ==
+              HexagonII::TypeCVI_VM_VP_LDU)
         ++load0;
-      if (HexagonMCInstrInfo::getDesc(MCII, *ID).isReturn())
-        ++jumps, ++jump1; // DEALLOC_RETURN is of type LD.
+      if (HexagonMCInstrInfo::getDesc(MCII, ID).isReturn()) {
+        ++deallocs, ++jumps, ++jump1; // DEALLOC_RETURN is of type LD.
+        foundBranches.push_back(ISJ);
+      }
       break;
     case HexagonII::TypeCVI_VM_STU:
       ++onlyNo1;
@@ -241,27 +349,66 @@ bool HexagonShuffler::check() {
     case HexagonII::TypeST:
       ++stores;
       ++memory;
-      if (ISJ->Core.getUnits() == slotSingleStore)
+      if (ISJ->Core.getUnits() == slotSingleStore ||
+          HexagonMCInstrInfo::getType(MCII, ID) == HexagonII::TypeCVI_VM_STU)
         ++store0;
       break;
     case HexagonII::TypeV4LDST:
       ++loads;
       ++stores;
       ++store1;
+      ++memops;
       ++memory;
       break;
-    case HexagonII::TypeNV:
+    case HexagonII::TypeNCJ:
       ++memory; // NV insns are memory-like.
-      if (HexagonMCInstrInfo::getDesc(MCII, *ID).isBranch())
+      if (HexagonMCInstrInfo::getDesc(MCII, ID).isBranch()) {
         ++jumps, ++jump1;
+        foundBranches.push_back(ISJ);
+      }
+      break;
+    case HexagonII::TypeV2LDST:
+      if(HexagonMCInstrInfo::getDesc(MCII, ID).mayLoad()) {
+        ++loads;
+        ++memory;
+        if (ISJ->Core.getUnits() == slotSingleLoad ||
+            HexagonMCInstrInfo::getType(MCII,ID) ==
+                HexagonII::TypeCVI_VM_VP_LDU)
+          ++load0;
+      }
+      else {
+        assert(HexagonMCInstrInfo::getDesc(MCII, ID).mayStore());
+        ++memory;
+        ++stores;
+      }
       break;
     case HexagonII::TypeCR:
     // Legacy conditional branch predicated on a register.
-    case HexagonII::TypeSYSTEM:
-      if (HexagonMCInstrInfo::getDesc(MCII, *ID).mayLoad())
-        ++loads;
+    case HexagonII::TypeCJ:
+      if (HexagonMCInstrInfo::getDesc(MCII, ID).isBranch()) {
+        ++jumps;
+        foundBranches.push_back(ISJ);
+      }
+      break;
+    case HexagonII::TypeDUPLEX: {
+      ++duplex;
+      MCInst const &Inst0 = *ID.getOperand(0).getInst();
+      MCInst const &Inst1 = *ID.getOperand(1).getInst();
+      if (HexagonMCInstrInfo::isCofMax1(MCII, Inst0))
+        ++jump1;
+      if (HexagonMCInstrInfo::isCofMax1(MCII, Inst1))
+        ++jump1;
+      if (HexagonMCInstrInfo::getDesc(MCII, Inst0).isBranch()) {
+        ++jumps;
+        foundBranches.push_back(ISJ);
+      }
+      if (HexagonMCInstrInfo::getDesc(MCII, Inst1).isBranch()) {
+        ++jumps;
+        foundBranches.push_back(ISJ);
+      }
       break;
     }
+    }
   }
 
   // Check if the packet is legal.
@@ -277,12 +424,20 @@ bool HexagonShuffler::check() {
     Error = SHUFFLE_ERROR_BRANCHES;
     return false;
   }
+  if (memops && stores > 1) {
+    Error = SHUFFLE_ERROR_STORE_LOAD_CONFLICT;
+    return false;
+  }
+  if (deallocs && stores) {
+    Error = SHUFFLE_ERROR_STORE_LOAD_CONFLICT;
+    return false;
+  }
 
   // Modify packet accordingly.
   // TODO: need to reserve slots #0 and #1 for duplex insns.
   bool bOnlySlot3 = false;
   for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
-    MCInst const *ID = ISJ->getDesc();
+    MCInst const &ID = ISJ->getDesc();
 
     if (!ISJ->Core.getUnits()) {
       // Error if insn may not be executed in any slot.
@@ -291,40 +446,26 @@ bool HexagonShuffler::check() {
     }
 
     // Exclude from slot #1 any insn but A2_nop.
-    if (HexagonMCInstrInfo::getDesc(MCII, *ID).getOpcode() != Hexagon::A2_nop)
+    if (HexagonMCInstrInfo::getDesc(MCII, ID).getOpcode() != Hexagon::A2_nop)
       if (onlyNo1)
         ISJ->Core.setUnits(ISJ->Core.getUnits() & ~slotOne);
 
     // Exclude from slot #1 any insn but A-type.
-    if (HexagonMCInstrInfo::getType(MCII, *ID) != HexagonII::TypeALU32)
+    if (HexagonMCInstrInfo::getType(MCII, ID) != HexagonII::TypeALU32_2op &&
+        HexagonMCInstrInfo::getType(MCII, ID) != HexagonII::TypeALU32_3op &&
+        HexagonMCInstrInfo::getType(MCII, ID) != HexagonII::TypeALU32_ADDI)
       if (onlyAin1)
         ISJ->Core.setUnits(ISJ->Core.getUnits() & ~slotOne);
 
-    // Branches must keep the original order.
-    if (HexagonMCInstrInfo::getDesc(MCII, *ID).isBranch() ||
-        HexagonMCInstrInfo::getDesc(MCII, *ID).isCall())
-      if (jumps > 1) {
-        if (slotJump < slotLastJump) {
-          // Error if indirect branch with another branch or
-          // no more slots available for branches.
-          Error = SHUFFLE_ERROR_BRANCHES;
-          return false;
-        }
-        // Pin the branch to the highest slot available to it.
-        ISJ->Core.setUnits(ISJ->Core.getUnits() & slotJump);
-        // Update next highest slot available to branches.
-        slotJump >>= 1;
-      }
-
     // A single load must use slot #0.
-    if (HexagonMCInstrInfo::getDesc(MCII, *ID).mayLoad()) {
-      if (loads == 1 && loads == memory)
+    if (HexagonMCInstrInfo::getDesc(MCII, ID).mayLoad()) {
+      if (loads == 1 && loads == memory && memops == 0)
         // Pin the load to slot #0.
         ISJ->Core.setUnits(ISJ->Core.getUnits() & slotSingleLoad);
     }
 
     // A single store must use slot #0.
-    if (HexagonMCInstrInfo::getDesc(MCII, *ID).mayStore()) {
+    if (HexagonMCInstrInfo::getDesc(MCII, ID).mayStore()) {
       if (!store0) {
         if (stores == 1)
           ISJ->Core.setUnits(ISJ->Core.getUnits() & slotSingleStore);
@@ -347,7 +488,7 @@ bool HexagonShuffler::check() {
       }
     }
 
-    // flag if an instruction can only be executed in slot 3
+    // flag if an instruction requires to be in slot 3
     if (ISJ->Core.getUnits() == slotThree)
       bOnlySlot3 = true;
 
@@ -358,14 +499,61 @@ bool HexagonShuffler::check() {
     }
   }
 
+  // preserve branch order
   bool validateSlots = true;
-  if (bOnlySlot3 == false && pSlot3Cnt == 1 && slot3ISJ != end()) {
+  if (jumps > 1) {
+    if (foundBranches.size() > 2) {
+      Error = SHUFFLE_ERROR_BRANCHES;
+      return false;
+    }
+
+    // try all possible choices
+    for (unsigned int i = 0 ; i < MAX_JUMP_SLOTS ; ++i) {
+      // validate first jump with this slot rule
+      if (!(jumpSlots[i].first & foundBranches[0]->Core.getUnits()))
+        continue;
+
+      // validate second jump with this slot rule
+      if (!(jumpSlots[i].second & foundBranches[1]->Core.getUnits()))
+        continue;
+
+      // both valid for this configuration, set new slot rules
+      PacketSave = Packet;
+      foundBranches[0]->Core.setUnits(jumpSlots[i].first);
+      foundBranches[1]->Core.setUnits(jumpSlots[i].second);
+
+      HexagonUnitAuction AuctionCore(reservedSlots);
+      std::sort(begin(), end(), HexagonInstr::lessCore);
+
+      // see if the auction succeeds with both branches pinned to these slots
+      bool bFail = false;
+      for (iterator I = begin(); I != end() && bFail != true; ++I)
+        if (!AuctionCore.bid(I->Core.getUnits()))
+          bFail = true;
+
+      // if yes, great, if not then restore original slot mask
+      if (!bFail) {
+        validateSlots = false; // all good, no need to re-do auction
+        break;
+      }
+      else
+        // restore original values
+        Packet = PacketSave;
+    }
+    if (validateSlots == true) {
+      Error = SHUFFLE_ERROR_NOSLOTS;
+      return false;
+    }
+  }
+
+  if (jumps <= 1 && bOnlySlot3 == false && pSlot3Cnt == 1 && slot3ISJ != end()) {
+    validateSlots = true;
     // save off slot mask of instruction marked with A_PREFER_SLOT3
     // and then pin it to slot #3
     unsigned saveUnits = slot3ISJ->Core.getUnits();
     slot3ISJ->Core.setUnits(saveUnits & slotThree);
 
-    HexagonUnitAuction AuctionCore;
+    HexagonUnitAuction AuctionCore(reservedSlots);
     std::sort(begin(), end(), HexagonInstr::lessCore);
 
     // see if things ok with that instruction being pinned to slot #3
@@ -379,16 +567,16 @@ bool HexagonShuffler::check() {
       validateSlots = false; // all good, no need to re-do auction
     else
       for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
-        MCInst const *ID = ISJ->getDesc();
-        if (HexagonMCInstrInfo::prefersSlot3(MCII, *ID))
+        MCInst const &ID = ISJ->getDesc();
+        if (HexagonMCInstrInfo::prefersSlot3(MCII, ID))
           ISJ->Core.setUnits(saveUnits);
       }
   }
 
-  // Check if any slot, core, is over-subscribed.
+  // Check if any slot, core or CVI, is over-subscribed.
   // Verify the core slot subscriptions.
   if (validateSlots) {
-    HexagonUnitAuction AuctionCore;
+    HexagonUnitAuction AuctionCore(reservedSlots);
 
     std::sort(begin(), end(), HexagonInstr::lessCore);
 
@@ -399,17 +587,27 @@ bool HexagonShuffler::check() {
       }
   }
   // Verify the CVI slot subscriptions.
-  {
-    HexagonUnitAuction AuctionCVI;
-
-    std::sort(begin(), end(), HexagonInstr::lessCVI);
-
-    for (iterator I = begin(); I != end(); ++I)
-      for (unsigned i = 0; i < I->CVI.getLanes(); ++i) // TODO: I->CVI.isValid?
-        if (!AuctionCVI.bid(I->CVI.getUnits() << i)) {
-          Error = SHUFFLE_ERROR_SLOTS;
-          return false;
-        }
+  std::sort(begin(), end(), HexagonInstr::lessCVI);
+  // create vector of hvx instructions to check
+  HVXInstsT hvxInsts;
+  hvxInsts.clear();
+  for (iterator I = begin(); I != end(); ++I) {
+    struct CVIUnits inst;
+    inst.Units = I->CVI.getUnits();
+    inst.Lanes = I->CVI.getLanes();
+    if (inst.Units == 0)
+      continue; // not an hvx inst or an hvx inst that doesn't use any pipes
+    hvxInsts.push_back(inst);
+  }
+  // if there are any hvx instructions in this packet, check pipe usage
+  if (hvxInsts.size() > 0) {
+    unsigned startIdx, usedUnits;
+    startIdx = usedUnits = 0x0;
+    if (checkHVXPipes(hvxInsts, startIdx, usedUnits) == false) {
+      // too many pipes used to be valid
+      Error = SHUFFLE_ERROR_SLOTS;
+      return false;
+    }
   }
 
   Error = SHUFFLE_SUCCESS;
@@ -452,10 +650,12 @@ bool HexagonShuffler::shuffle() {
     }
 
   for (iterator ISJ = begin(); ISJ != end(); ++ISJ)
-    DEBUG(dbgs().write_hex(ISJ->Core.getUnits());
-          dbgs() << ':'
-                 << HexagonMCInstrInfo::getDesc(MCII, *ISJ->getDesc())
-                        .getOpcode();
+    DEBUG(dbgs().write_hex(ISJ->Core.getUnits()); if (ISJ->CVI.isValid()) {
+      dbgs() << '/';
+      dbgs().write_hex(ISJ->CVI.getUnits()) << '|';
+      dbgs() << ISJ->CVI.getLanes();
+    } dbgs() << ':'
+             << HexagonMCInstrInfo::getDesc(MCII, ISJ->getDesc()).getOpcode();
           dbgs() << '\n');
   DEBUG(dbgs() << '\n');
 
diff --git a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
index a093f85451323f228f0fbcc772e90e4efabc87f1..36e8fa19d4671772ad0ba73ec83f8126d07b6d4e 100644
--- a/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
+++ b/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
@@ -35,7 +35,8 @@ public:
   HexagonResource(unsigned s) { setUnits(s); };
 
   void setUnits(unsigned s) {
-    Slots = s & ~(~0U << HEXAGON_PACKET_SIZE);
+    Slots = s & ((1u << HEXAGON_PACKET_SIZE) - 1);
+    setWeight(s);
   };
   unsigned setWeight(unsigned s);
 
@@ -44,7 +45,8 @@ public:
 
   // Check if the resources are in ascending slot order.
   static bool lessUnits(const HexagonResource &A, const HexagonResource &B) {
-    return (countPopulation(A.getUnits()) < countPopulation(B.getUnits()));
+    return (countPopulation(A.getUnits()) <
+            countPopulation(B.getUnits()));
   };
   // Check if the resources are in ascending weight order.
   static bool lessWeight(const HexagonResource &A, const HexagonResource &B) {
@@ -86,10 +88,10 @@ public:
                      unsigned s, MCInst const *id);
   static void SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU);
 
-  bool isValid() const { return (Valid); };
-  unsigned getLanes() const { return (Lanes); };
-  bool mayLoad() const { return (Load); };
-  bool mayStore() const { return (Store); };
+  bool isValid() const { return Valid; };
+  unsigned getLanes() const { return Lanes; };
+  bool mayLoad() const { return Load; };
+  bool mayStore() const { return Store; };
 };
 
 // Handle to an insn used by the shuffling algorithm.
@@ -100,21 +102,17 @@ class HexagonInstr {
   MCInst const *Extender;
   HexagonResource Core;
   HexagonCVIResource CVI;
-  bool SoloException;
 
 public:
   HexagonInstr(HexagonCVIResource::TypeUnitsAndLanes *T,
                MCInstrInfo const &MCII, MCInst const *id,
-               MCInst const *Extender, unsigned s, bool x = false)
-      : ID(id), Extender(Extender), Core(s), CVI(T, MCII, s, id),
-        SoloException(x) {};
+               MCInst const *Extender, unsigned s)
+      : ID(id), Extender(Extender), Core(s), CVI(T, MCII, s, id) {}
 
-  MCInst const *getDesc() const { return (ID); };
+  MCInst const &getDesc() const { return *ID; };
 
   MCInst const *getExtender() const { return Extender; }
 
-  unsigned isSoloException() const { return (SoloException); };
-
   // Check if the handles are in ascending order for shuffling purposes.
   bool operator<(const HexagonInstr &B) const {
     return (HexagonResource::lessWeight(B.Core, Core));
@@ -136,6 +134,7 @@ class HexagonShuffler {
 
   // Insn handles in a bundle.
   HexagonPacket Packet;
+  HexagonPacket PacketSave;
 
   // Shuffling error code.
   unsigned Error;
@@ -178,8 +177,7 @@ public:
   iterator end() { return (Packet.end()); };
 
   // Add insn handle to the bundle .
-  void append(MCInst const *ID, MCInst const *Extender, unsigned S,
-              bool X = false);
+  void append(MCInst const &ID, MCInst const *Extender, unsigned S);
 
   // Return the error code for the last check or shuffling of the bundle.
   void setError(unsigned Err) { Error = Err; };
diff --git a/lib/Target/Hexagon/RDFCopy.cpp b/lib/Target/Hexagon/RDFCopy.cpp
index 392871628d98a4ac1669e687a1ab5d98bfad698a..57ce9fabc5e3afa32e9bad50b33418d9b7726f76 100644
--- a/lib/Target/Hexagon/RDFCopy.cpp
+++ b/lib/Target/Hexagon/RDFCopy.cpp
@@ -11,6 +11,7 @@
 
 #include "RDFCopy.h"
 #include "RDFGraph.h"
+#include "RDFLiveness.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -53,47 +54,12 @@ bool CopyPropagation::interpretAsCopy(const MachineInstr *MI, EqualityMap &EM) {
 void CopyPropagation::recordCopy(NodeAddr<StmtNode*> SA, EqualityMap &EM) {
   CopyMap.insert(std::make_pair(SA.Id, EM));
   Copies.push_back(SA.Id);
-
-  for (auto I : EM) {
-    auto FS = DefM.find(I.second.Reg);
-    if (FS == DefM.end() || FS->second.empty())
-      continue; // Undefined source
-    RDefMap[I.second][SA.Id] = FS->second.top()->Id;
-    // Insert DstR into the map.
-    RDefMap[I.first];
-  }
-}
-
-
-void CopyPropagation::updateMap(NodeAddr<InstrNode*> IA) {
-  RegisterSet RRs;
-  for (NodeAddr<RefNode*> RA : IA.Addr->members(DFG))
-    RRs.insert(RA.Addr->getRegRef(DFG));
-  bool Common = false;
-  for (auto &R : RDefMap) {
-    if (!RRs.count(R.first))
-      continue;
-    Common = true;
-    break;
-  }
-  if (!Common)
-    return;
-
-  for (auto &R : RDefMap) {
-    if (!RRs.count(R.first))
-      continue;
-    auto F = DefM.find(R.first.Reg);
-    if (F == DefM.end() || F->second.empty())
-      continue;
-    R.second[IA.Id] = F->second.top()->Id;
-  }
 }
 
 
 bool CopyPropagation::scanBlock(MachineBasicBlock *B) {
   bool Changed = false;
   auto BA = DFG.getFunc().Addr->findBlock(B, DFG);
-  DFG.markBlock(BA.Id, DefM);
 
   for (NodeAddr<InstrNode*> IA : BA.Addr->members(DFG)) {
     if (DFG.IsCode<NodeAttrs::Stmt>(IA)) {
@@ -102,20 +68,30 @@ bool CopyPropagation::scanBlock(MachineBasicBlock *B) {
       if (interpretAsCopy(SA.Addr->getCode(), EM))
         recordCopy(SA, EM);
     }
-
-    updateMap(IA);
-    DFG.pushDefs(IA, DefM);
   }
 
   MachineDomTreeNode *N = MDT.getNode(B);
   for (auto I : *N)
     Changed |= scanBlock(I->getBlock());
 
-  DFG.releaseBlock(BA.Id, DefM);
   return Changed;
 }
 
 
+/// Return the id of the def reaching RefRR at IA, taken from the nearest
+/// aliased reference found by Liveness; 0 if there is none.
+NodeId CopyPropagation::getLocalReachingDef(RegisterRef RefRR,
+      NodeAddr<InstrNode*> IA) {
+  NodeAddr<RefNode*> Nearest = L.getNearestAliasedRef(RefRR, IA);
+  if (Nearest.Id == 0)
+    return 0;
+  if (Nearest.Addr->getKind() == NodeAttrs::Def)
+    return Nearest.Id;
+  assert(Nearest.Addr->getKind() == NodeAttrs::Use);
+  return Nearest.Addr->getReachingDef();
+}
+
+
 bool CopyPropagation::run() {
   scanBlock(&DFG.getMF().front());
 
@@ -129,14 +105,6 @@ bool CopyPropagation::run() {
                << Print<RegisterRef>(J.second, DFG);
       dbgs() << " }\n";
     }
-    dbgs() << "\nRDef map:\n";
-    for (auto R : RDefMap) {
-      dbgs() << Print<RegisterRef>(R.first, DFG) << " -> {";
-      for (auto &M : R.second)
-        dbgs() << ' ' << Print<NodeId>(M.first, DFG) << ':'
-               << Print<NodeId>(M.second, DFG);
-      dbgs() << " }\n";
-    }
   }
 
   bool Changed = false;
@@ -176,8 +144,7 @@ bool CopyPropagation::run() {
       if (DR == SR)
         continue;
 
-      auto &RDefSR = RDefMap[SR];
-      NodeId RDefSR_SA = RDefSR[SA.Id];
+      NodeId AtCopy = getLocalReachingDef(SR, SA);
 
       for (NodeId N = DA.Addr->getReachedUse(), NextN; N; N = NextN) {
         auto UA = DFG.addr<UseNode*>(N);
@@ -190,7 +157,8 @@ bool CopyPropagation::run() {
 
         NodeAddr<InstrNode*> IA = UA.Addr->getOwner(DFG);
         assert(DFG.IsCode<NodeAttrs::Stmt>(IA));
-        if (RDefSR[IA.Id] != RDefSR_SA)
+        NodeId AtUse = getLocalReachingDef(SR, IA);
+        if (AtCopy != AtUse)
           continue;
 
         MachineOperand &Op = UA.Addr->getOp();
@@ -206,8 +174,8 @@ bool CopyPropagation::run() {
         Op.setReg(NewReg);
         Op.setSubReg(0);
         DFG.unlinkUse(UA, false);
-        if (RDefSR_SA != 0) {
-          UA.Addr->linkToDef(UA.Id, DFG.addr<DefNode*>(RDefSR_SA));
+        if (AtCopy != 0) {
+          UA.Addr->linkToDef(UA.Id, DFG.addr<DefNode*>(AtCopy));
         } else {
           UA.Addr->setReachingDef(0);
           UA.Addr->setSibling(0);
diff --git a/lib/Target/Hexagon/RDFCopy.h b/lib/Target/Hexagon/RDFCopy.h
index 5ece11bd5ce47fec13e2f39e3b8dca3d0385cb45..bbd625c5f5f6ed7a634ff42b388612f99187a119 100644
--- a/lib/Target/Hexagon/RDFCopy.h
+++ b/lib/Target/Hexagon/RDFCopy.h
@@ -11,6 +11,9 @@
 #define LLVM_LIB_TARGET_HEXAGON_RDFCOPY_H
 
 #include "RDFGraph.h"
+#include "RDFLiveness.h"
+#include "llvm/CodeGen/MachineFunction.h"
+
 #include <map>
 #include <vector>
 
@@ -24,7 +27,7 @@ namespace rdf {
 
   struct CopyPropagation {
     CopyPropagation(DataFlowGraph &dfg) : MDT(dfg.getDT()), DFG(dfg),
-        Trace(false) {}
+        L(dfg.getMF().getRegInfo(), dfg), Trace(false) {}
 
     virtual ~CopyPropagation() = default;
 
@@ -39,18 +42,16 @@ namespace rdf {
   private:
     const MachineDominatorTree &MDT;
     DataFlowGraph &DFG;
-    DataFlowGraph::DefStackMap DefM;
+    Liveness L;
     bool Trace;
 
-    // map: register -> (map: stmt -> reaching def)
-    std::map<RegisterRef,std::map<NodeId,NodeId>> RDefMap;
     // map: statement -> (map: dst reg -> src reg)
     std::map<NodeId, EqualityMap> CopyMap;
     std::vector<NodeId> Copies;
 
     void recordCopy(NodeAddr<StmtNode*> SA, EqualityMap &EM);
-    void updateMap(NodeAddr<InstrNode*> IA);
     bool scanBlock(MachineBasicBlock *B);
+    NodeId getLocalReachingDef(RegisterRef RefRR, NodeAddr<InstrNode*> IA);
   };
 
 } // end namespace rdf
diff --git a/lib/Target/Hexagon/RDFDeadCode.cpp b/lib/Target/Hexagon/RDFDeadCode.cpp
index 63177d51cada056e6fc350bef1d813b62c5ea538..9aa8ad68e07e2260427383eb2a284a0e0d1cef8a 100644
--- a/lib/Target/Hexagon/RDFDeadCode.cpp
+++ b/lib/Target/Hexagon/RDFDeadCode.cpp
@@ -62,9 +62,19 @@ bool DeadCodeElimination::isLiveInstr(const MachineInstr *MI) const {
     return true;
   if (MI->isPHI())
     return false;
-  for (auto &Op : MI->operands())
+  for (auto &Op : MI->operands()) {
     if (Op.isReg() && MRI.isReserved(Op.getReg()))
       return true;
+    if (Op.isRegMask()) {
+      const uint32_t *BM = Op.getRegMask();
+      for (unsigned R = 0, RN = DFG.getTRI().getNumRegs(); R != RN; ++R) {
+        if (BM[R/32] & (1u << (R%32)))
+          continue;
+        if (MRI.isReserved(R))
+          return true;
+      }
+    }
+  }
   return false;
 }
 
diff --git a/lib/Target/Hexagon/RDFGraph.cpp b/lib/Target/Hexagon/RDFGraph.cpp
index 4b5c212a7f8c8e442262dc711441db70e61291dd..2253969290b78af6aece04dbbbabe493ff1a4cf9 100644
--- a/lib/Target/Hexagon/RDFGraph.cpp
+++ b/lib/Target/Hexagon/RDFGraph.cpp
@@ -617,8 +617,12 @@ bool TargetOperandInfo::isPreserving(const MachineInstr &In, unsigned OpNum)
 // Check if the definition of RR produces an unspecified value.
 bool TargetOperandInfo::isClobbering(const MachineInstr &In, unsigned OpNum)
       const {
+  const MachineOperand &Op = In.getOperand(OpNum);
+  if (Op.isRegMask())
+    return true;
+  assert(Op.isReg());
   if (In.isCall())
-    if (In.getOperand(OpNum).isImplicit())
+    if (Op.isDef() && Op.isDead())
       return true;
   return false;
 }
@@ -903,8 +907,10 @@ void DataFlowGraph::build(unsigned Options) {
   assert(EntryB.pred_empty() && "Function entry block has predecessors");
   for (auto I = MRI.livein_begin(), E = MRI.livein_end(); I != E; ++I)
     LiveIns.insert(RegisterRef(I->first));
-  for (auto I : EntryB.liveins())
-    LiveIns.insert(RegisterRef(I.PhysReg, I.LaneMask));
+  if (MRI.tracksLiveness()) {
+    for (auto I : EntryB.liveins())
+      LiveIns.insert(RegisterRef(I.PhysReg, I.LaneMask));
+  }
 
   // Add function-entry phi nodes for the live-in registers.
   for (std::pair<RegisterId,LaneBitmask> P : LiveIns) {
@@ -981,30 +987,14 @@ RegisterRef DataFlowGraph::makeRegRef(const MachineOperand &Op) const {
   return RegisterRef(PRI.getRegMaskId(Op.getRegMask()), LaneBitmask::getAll());
 }
 
-RegisterRef DataFlowGraph::normalizeRef(RegisterRef RR) const {
-  // FIXME copied from RegisterAggr
-  if (PhysicalRegisterInfo::isRegMaskId(RR.Reg))
-    return RR;
-  const TargetRegisterClass *RC = PRI.RegInfos[RR.Reg].RegClass;
-  LaneBitmask RCMask = RC != nullptr ? RC->LaneMask : LaneBitmask(0x00000001);
-  LaneBitmask Common = RR.Mask & RCMask;
-
-  RegisterId SuperReg = PRI.RegInfos[RR.Reg].MaxSuper;
-// Ex: IP/EIP/RIP
-//  assert(RC != nullptr || RR.Reg == SuperReg);
-  uint32_t Sub = PRI.getTRI().getSubRegIndex(SuperReg, RR.Reg);
-  LaneBitmask SuperMask = PRI.getTRI().composeSubRegIndexLaneMask(Sub, Common);
-  return RegisterRef(SuperReg, SuperMask);
-}
-
 RegisterRef DataFlowGraph::restrictRef(RegisterRef AR, RegisterRef BR) const {
   if (AR.Reg == BR.Reg) {
     LaneBitmask M = AR.Mask & BR.Mask;
     return M.any() ? RegisterRef(AR.Reg, M) : RegisterRef();
   }
 #ifndef NDEBUG
-  RegisterRef NAR = normalizeRef(AR);
-  RegisterRef NBR = normalizeRef(BR);
+  RegisterRef NAR = PRI.normalize(AR);
+  RegisterRef NBR = PRI.normalize(BR);
   assert(NAR.Reg != NBR.Reg);
 #endif
   // This isn't strictly correct, because the overlap may happen in the
@@ -1038,13 +1028,63 @@ void DataFlowGraph::releaseBlock(NodeId B, DefStackMap &DefM) {
   }
 }
 
+// Push all definitions from the instruction node IA to an appropriate
+// stack in DefM.
+void DataFlowGraph::pushAllDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
+  pushClobbers(IA, DefM);
+  pushDefs(IA, DefM);
+}
+
+// Push all clobbering definitions from the instruction node IA to an
+// appropriate stack in DefM.
+void DataFlowGraph::pushClobbers(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
+  NodeSet Visited;
+  std::set<RegisterId> Defined;
+
+  // The important objectives of this function are:
+  // - to be able to handle instructions both while the graph is being
+  //   constructed, and after the graph has been constructed, and
+  // - maintain proper ordering of definitions on the stack for each
+  //   register reference:
+  //   - if there are two or more related defs in IA (i.e. coming from
+  //     the same machine operand), then only push one def on the stack,
+  //   - if there are multiple unrelated defs of non-overlapping
+  //     subregisters of S, then the stack for S will have both (in an
+  //     unspecified order), but the order does not matter from the data-
+  //     -flow perspective.
+
+  for (NodeAddr<DefNode*> DA : IA.Addr->members_if(IsDef, *this)) {
+    if (Visited.count(DA.Id))
+      continue;
+    if (!(DA.Addr->getFlags() & NodeAttrs::Clobbering))
+      continue;
+
+    NodeList Rel = getRelatedRefs(IA, DA);
+    NodeAddr<DefNode*> PDA = Rel.front();
+    RegisterRef RR = PDA.Addr->getRegRef(*this);
+
+    // Push the definition on the stack for the register and all aliases.
+    // The def stack traversal in linkNodeUp will check the exact aliasing.
+    DefM[RR.Reg].push(DA);
+    Defined.insert(RR.Reg);
+    for (RegisterId A : PRI.getAliasSet(RR.Reg)) {
+      // Check that we don't push the same def twice.
+      assert(A != RR.Reg);
+      if (!Defined.count(A))
+        DefM[A].push(DA);
+    }
+    // Mark all the related defs as visited.
+    for (NodeAddr<NodeBase*> T : Rel)
+      Visited.insert(T.Id);
+  }
+}
+
 // Push all definitions from the instruction node IA to an appropriate
 // stack in DefM.
 void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
-  NodeList Defs = IA.Addr->members_if(IsDef, *this);
   NodeSet Visited;
 #ifndef NDEBUG
-  RegisterSet Defined;
+  std::set<RegisterId> Defined;
 #endif
 
   // The important objectives of this function are:
@@ -1059,9 +1099,11 @@ void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
   //     unspecified order), but the order does not matter from the data-
   //     -flow perspective.
 
-  for (NodeAddr<DefNode*> DA : Defs) {
+  for (NodeAddr<DefNode*> DA : IA.Addr->members_if(IsDef, *this)) {
     if (Visited.count(DA.Id))
       continue;
+    if (DA.Addr->getFlags() & NodeAttrs::Clobbering)
+      continue;
 
     NodeList Rel = getRelatedRefs(IA, DA);
     NodeAddr<DefNode*> PDA = Rel.front();
@@ -1069,7 +1111,7 @@ void DataFlowGraph::pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DefM) {
 #ifndef NDEBUG
     // Assert if the register is defined in two or more unrelated defs.
     // This could happen if there are two or more def operands defining it.
-    if (!Defined.insert(RR).second) {
+    if (!Defined.insert(RR.Reg).second) {
       MachineInstr *MI = NodeAddr<StmtNode*>(IA).Addr->getCode();
       dbgs() << "Multiple definitions of register: "
              << Print<RegisterRef>(RR, *this) << " in\n  " << *MI
@@ -1627,13 +1669,15 @@ void DataFlowGraph::linkRefUp(NodeAddr<InstrNode*> IA, NodeAddr<T> TA,
 }
 
 // Create data-flow links for all reference nodes in the statement node SA.
-void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA) {
+template <typename Predicate>
+void DataFlowGraph::linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA,
+      Predicate P) {
 #ifndef NDEBUG
   RegisterSet Defs;
 #endif
 
   // Link all nodes (upwards in the data-flow) with their reaching defs.
-  for (NodeAddr<RefNode*> RA : SA.Addr->members(*this)) {
+  for (NodeAddr<RefNode*> RA : SA.Addr->members_if(P, *this)) {
     uint16_t Kind = RA.Addr->getKind();
     assert(Kind == NodeAttrs::Def || Kind == NodeAttrs::Use);
     RegisterRef RR = RA.Addr->getRegRef(*this);
@@ -1662,6 +1706,13 @@ void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) {
   // Push block delimiters.
   markBlock(BA.Id, DefM);
 
+  auto IsClobber = [] (NodeAddr<RefNode*> RA) -> bool {
+    return IsDef(RA) && (RA.Addr->getFlags() & NodeAttrs::Clobbering);
+  };
+  auto IsNoClobber = [] (NodeAddr<RefNode*> RA) -> bool {
+    return IsDef(RA) && !(RA.Addr->getFlags() & NodeAttrs::Clobbering);
+  };
+
   assert(BA.Addr && "block node address is needed to create a data-flow link");
   // For each non-phi instruction in the block, link all the defs and uses
   // to their reaching defs. For any member of the block (including phis),
@@ -1669,10 +1720,17 @@ void DataFlowGraph::linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA) {
   for (NodeAddr<InstrNode*> IA : BA.Addr->members(*this)) {
     // Ignore phi nodes here. They will be linked part by part from the
     // predecessors.
-    if (IA.Addr->getKind() == NodeAttrs::Stmt)
-      linkStmtRefs(DefM, IA);
+    if (IA.Addr->getKind() == NodeAttrs::Stmt) {
+      linkStmtRefs(DefM, IA, IsUse);
+      linkStmtRefs(DefM, IA, IsClobber);
+    }
 
     // Push the definitions on the stack.
+    pushClobbers(IA, DefM);
+
+    if (IA.Addr->getKind() == NodeAttrs::Stmt)
+      linkStmtRefs(DefM, IA, IsNoClobber);
+
     pushDefs(IA, DefM);
   }
 
diff --git a/lib/Target/Hexagon/RDFGraph.h b/lib/Target/Hexagon/RDFGraph.h
index a10dad587569d4c6d1ef31cde095ef37812a2f16..d5faca4cd6f4b5be0321e25055a5076e637fa210 100644
--- a/lib/Target/Hexagon/RDFGraph.h
+++ b/lib/Target/Hexagon/RDFGraph.h
@@ -729,7 +729,7 @@ namespace rdf {
     typedef std::unordered_map<RegisterId,DefStack> DefStackMap;
 
     void build(unsigned Options = BuildOptions::None);
-    void pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM);
+    void pushAllDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM);
     void markBlock(NodeId B, DefStackMap &DefM);
     void releaseBlock(NodeId B, DefStackMap &DefM);
 
@@ -745,7 +745,6 @@ namespace rdf {
 
     RegisterRef makeRegRef(unsigned Reg, unsigned Sub) const;
     RegisterRef makeRegRef(const MachineOperand &Op) const;
-    RegisterRef normalizeRef(RegisterRef RR) const;
     RegisterRef restrictRef(RegisterRef AR, RegisterRef BR) const;
 
     NodeAddr<RefNode*> getNextRelated(NodeAddr<InstrNode*> IA,
@@ -762,6 +761,10 @@ namespace rdf {
     NodeList getRelatedRefs(NodeAddr<InstrNode*> IA,
         NodeAddr<RefNode*> RA) const;
 
+    NodeAddr<BlockNode*> findBlock(MachineBasicBlock *BB) const {
+      return BlockNodes.at(BB);
+    }
+
     void unlinkUse(NodeAddr<UseNode*> UA, bool RemoveFromOwner) {
       unlinkUseDF(UA);
       if (RemoveFromOwner)
@@ -845,9 +848,12 @@ namespace rdf {
         NodeAddr<BlockNode*> BA);
     void removeUnusedPhis();
 
+    void pushClobbers(NodeAddr<InstrNode*> IA, DefStackMap &DM);
+    void pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM);
     template <typename T> void linkRefUp(NodeAddr<InstrNode*> IA,
         NodeAddr<T> TA, DefStack &DS);
-    void linkStmtRefs(DefStackMap &DefM, NodeAddr<StmtNode*> SA);
+    template <typename Predicate> void linkStmtRefs(DefStackMap &DefM,
+        NodeAddr<StmtNode*> SA, Predicate P);
     void linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA);
 
     void unlinkUseDF(NodeAddr<UseNode*> UA);
@@ -858,10 +864,6 @@ namespace rdf {
       IA.Addr->removeMember(RA, *this);
     }
 
-    NodeAddr<BlockNode*> findBlock(MachineBasicBlock *BB) {
-      return BlockNodes[BB];
-    }
-
     MachineFunction &MF;
     const TargetInstrInfo &TII;
     const TargetRegisterInfo &TRI;
diff --git a/lib/Target/Hexagon/RDFLiveness.cpp b/lib/Target/Hexagon/RDFLiveness.cpp
index 0dab15e9a690ba3157b076eaa5e78f6fd08334e9..25a4c5f7058d58ad6c0fad59c69297bfba4195cb 100644
--- a/lib/Target/Hexagon/RDFLiveness.cpp
+++ b/lib/Target/Hexagon/RDFLiveness.cpp
@@ -31,11 +31,15 @@
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
 
 using namespace llvm;
 using namespace rdf;
 
+static cl::opt<unsigned> MaxRecNest("rdf-liveness-max-rec", cl::init(25),
+  cl::Hidden, cl::desc("Maximum recursion level"));
+
 namespace llvm {
 namespace rdf {
   template<>
@@ -85,7 +89,8 @@ namespace rdf {
 // the data-flow.
 
 NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
-      NodeAddr<RefNode*> RefA, bool FullChain, const RegisterAggr &DefRRs) {
+      NodeAddr<RefNode*> RefA, bool TopShadows, bool FullChain,
+      const RegisterAggr &DefRRs) {
   NodeList RDefs; // Return value.
   SetVector<NodeId> DefQ;
   SetVector<NodeId> Owners;
@@ -105,6 +110,11 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
   auto SNA = DFG.addr<RefNode*>(Start);
   if (NodeId RD = SNA.Addr->getReachingDef())
     DefQ.insert(RD);
+  if (TopShadows) {
+    for (auto S : DFG.getRelatedRefs(RefA.Addr->getOwner(DFG), RefA))
+      if (NodeId RD = NodeAddr<RefNode*>(S).Addr->getReachingDef())
+        DefQ.insert(RD);
+  }
 
   // Collect all the reaching defs, going up until a phi node is encountered,
   // or there are no more reaching defs. From this set, the actual set of
@@ -241,8 +251,18 @@ NodeList Liveness::getAllReachingDefs(RegisterRef RefRR,
 }
 
 
-NodeSet Liveness::getAllReachingDefsRec(RegisterRef RefRR,
-      NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs) {
+std::pair<NodeSet,bool>
+Liveness::getAllReachingDefsRec(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
+      NodeSet &Visited, const NodeSet &Defs) {
+  return getAllReachingDefsRecImpl(RefRR, RefA, Visited, Defs, 0, MaxRecNest);
+}
+
+
+std::pair<NodeSet,bool>
+Liveness::getAllReachingDefsRecImpl(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
+      NodeSet &Visited, const NodeSet &Defs, unsigned Nest, unsigned MaxNest) {
+  if (Nest > MaxNest)
+    return { NodeSet(), false };
   // Collect all defined registers. Do not consider phis to be defining
   // anything, only collect "real" definitions.
   RegisterAggr DefRRs(PRI);
@@ -252,9 +272,9 @@ NodeSet Liveness::getAllReachingDefsRec(RegisterRef RefRR,
       DefRRs.insert(DA.Addr->getRegRef(DFG));
   }
 
-  NodeList RDs = getAllReachingDefs(RefRR, RefA, true, DefRRs);
+  NodeList RDs = getAllReachingDefs(RefRR, RefA, false, true, DefRRs);
   if (RDs.empty())
-    return Defs;
+    return { Defs, true };
 
   // Make a copy of the preexisting definitions and add the newly found ones.
   NodeSet TmpDefs = Defs;
@@ -273,12 +293,74 @@ NodeSet Liveness::getAllReachingDefsRec(RegisterRef RefRR,
     Visited.insert(PA.Id);
     // Go over all phi uses and get the reaching defs for each use.
     for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) {
-      const auto &T = getAllReachingDefsRec(RefRR, U, Visited, TmpDefs);
-      Result.insert(T.begin(), T.end());
+      const auto &T = getAllReachingDefsRecImpl(RefRR, U, Visited, TmpDefs,
+                                                Nest+1, MaxNest);
+      if (!T.second)
+        return { T.first, false };
+      Result.insert(T.first.begin(), T.first.end());
     }
   }
 
-  return Result;
+  return { Result, true };
+}
+
+/// Find the nearest ref node aliased to RefRR, going upwards in the data
+/// flow, starting from the instruction immediately preceding IA.
+NodeAddr<RefNode*> Liveness::getNearestAliasedRef(RegisterRef RefRR,
+      NodeAddr<InstrNode*> IA) {
+  NodeAddr<BlockNode*> BA = IA.Addr->getOwner(DFG);
+  NodeList Ins = BA.Addr->members(DFG);
+  NodeId FindId = IA.Id;
+  auto E = Ins.rend();
+  auto B = std::find_if(Ins.rbegin(), E,
+                        [FindId] (const NodeAddr<InstrNode*> T) {
+                          return T.Id == FindId;
+                        });
+  // Do not scan IA (which is what B would point to).
+  if (B != E)
+    ++B;
+
+  do {
+    // Process the range of instructions from B to E.
+    for (NodeAddr<InstrNode*> I : make_range(B, E)) {
+      NodeList Refs = I.Addr->members(DFG);
+      NodeAddr<RefNode*> Clob, Use;
+      // Scan all the refs in I aliased to RefRR, and return the one that
+      // is the closest to the output of I, i.e. def > clobber > use.
+      for (NodeAddr<RefNode*> R : Refs) {
+        if (!PRI.alias(R.Addr->getRegRef(DFG), RefRR))
+          continue;
+        if (DFG.IsDef(R)) {
+          // If it's a non-clobbering def, just return it.
+          if (!(R.Addr->getFlags() & NodeAttrs::Clobbering))
+            return R;
+          Clob = R;
+        } else {
+          Use = R;
+        }
+      }
+      if (Clob.Id != 0)
+        return Clob;
+      if (Use.Id != 0)
+        return Use;
+    }
+
+    // Go up to the immediate dominator, if any.
+    MachineBasicBlock *BB = BA.Addr->getCode();
+    BA = NodeAddr<BlockNode*>();
+    if (MachineDomTreeNode *N = MDT.getNode(BB)) {
+      if ((N = N->getIDom()))
+        BA = DFG.findBlock(N->getBlock());
+    }
+    if (!BA.Id)
+      break;
+
+    Ins = BA.Addr->members(DFG);
+    B = Ins.rbegin();
+    E = Ins.rend();
+  } while (true);
+
+  return NodeAddr<RefNode*>();
 }
 
 
@@ -377,7 +459,7 @@ void Liveness::computePhiInfo() {
         NodeAddr<UseNode*> A = DFG.addr<UseNode*>(UN);
         uint16_t F = A.Addr->getFlags();
         if ((F & (NodeAttrs::Undef | NodeAttrs::PhiRef)) == 0) {
-          RegisterRef R = DFG.normalizeRef(getRestrictedRegRef(A));
+          RegisterRef R = PRI.normalize(A.Addr->getRegRef(DFG));
           RealUses[R.Reg].insert({A.Id,R.Mask});
         }
         UN = A.Addr->getSibling();
@@ -426,15 +508,11 @@ void Liveness::computePhiInfo() {
         assert((UA.Addr->getFlags() & NodeAttrs::Undef) == 0);
         RegisterRef R(UI->first, I->second);
         NodeList RDs = getAllReachingDefs(R, UA);
-        if (any_of(RDs, InPhiDefs))
-          ++I;
-        else
-          I = Uses.erase(I);
+        // If none of the reaching defs of R are from this phi, remove this
+        // use of R.
+        I = any_of(RDs, InPhiDefs) ? std::next(I) : Uses.erase(I);
       }
-      if (Uses.empty())
-        UI = RealUses.erase(UI);
-      else
-        ++UI;
+      UI = Uses.empty() ? RealUses.erase(UI) : std::next(UI);
     }
 
     // If this phi reaches some "real" uses, add it to the queue for upward
@@ -452,32 +530,29 @@ void Liveness::computePhiInfo() {
     for (auto I : PhiRefs) {
       if (!DFG.IsRef<NodeAttrs::Use>(I) || SeenUses.count(I.Id))
         continue;
-      NodeAddr<UseNode*> UA = I;
-
-      // Given a phi use UA, traverse all related phi uses (including UA).
-      // The related phi uses may reach different phi nodes or may reach the
-      // same phi node. If multiple uses reach the same phi P, the intervening
-      // defs must be accumulated for all such uses. To group all such uses
-      // into one set, map their node ids to the first use id that reaches P.
-      std::map<NodeId,NodeId> FirstUse; // Phi reached up -> first phi use.
-
-      for (NodeAddr<UseNode*> VA : DFG.getRelatedRefs(PhiA, UA)) {
-        SeenUses.insert(VA.Id);
-        RegisterAggr DefRRs(PRI);
-        for (NodeAddr<DefNode*> DA : getAllReachingDefs(VA)) {
-          if (DA.Addr->getFlags() & NodeAttrs::PhiRef) {
-            NodeId RP = DA.Addr->getOwner(DFG).Id;
-            NodeId FU = FirstUse.insert({RP,VA.Id}).first->second;
-            std::map<NodeId,RegisterAggr> &M = PhiUp[FU];
-            auto F = M.find(RP);
-            if (F == M.end())
-              M.insert(std::make_pair(RP, DefRRs));
-            else
-              F->second.insert(DefRRs);
-          }
-          DefRRs.insert(DA.Addr->getRegRef(DFG));
+      NodeAddr<PhiUseNode*> PUA = I;
+      if (PUA.Addr->getReachingDef() == 0)
+        continue;
+
+      RegisterRef UR = PUA.Addr->getRegRef(DFG);
+      NodeList Ds = getAllReachingDefs(UR, PUA, true, false, NoRegs);
+      RegisterAggr DefRRs(PRI);
+
+      for (NodeAddr<DefNode*> D : Ds) {
+        if (D.Addr->getFlags() & NodeAttrs::PhiRef) {
+          NodeId RP = D.Addr->getOwner(DFG).Id;
+          std::map<NodeId,RegisterAggr> &M = PhiUp[PUA.Id];
+          auto F = M.find(RP);
+          if (F == M.end())
+            M.insert(std::make_pair(RP, DefRRs));
+          else
+            F->second.insert(DefRRs);
         }
+        DefRRs.insert(D.Addr->getRegRef(DFG));
       }
+
+      for (NodeAddr<PhiUseNode*> T : DFG.getRelatedRefs(PhiA, PUA))
+        SeenUses.insert(T.Id);
     }
   }
 
@@ -522,7 +597,7 @@ void Liveness::computePhiInfo() {
 
     for (NodeAddr<UseNode*> UA : PUs) {
       std::map<NodeId,RegisterAggr> &PUM = PhiUp[UA.Id];
-      RegisterRef UR = DFG.normalizeRef(getRestrictedRegRef(UA));
+      RegisterRef UR = PRI.normalize(UA.Addr->getRegRef(DFG));
       for (const std::pair<NodeId,RegisterAggr> &P : PUM) {
         bool Changed = false;
         const RegisterAggr &MidDefs = P.second;
@@ -645,30 +720,43 @@ void Liveness::computeLiveIns() {
       if (RUs.empty())
         continue;
 
+      NodeSet SeenUses;
       for (auto U : PA.Addr->members_if(DFG.IsRef<NodeAttrs::Use>, DFG)) {
+        if (!SeenUses.insert(U.Id).second)
+          continue;
         NodeAddr<PhiUseNode*> PUA = U;
         if (PUA.Addr->getReachingDef() == 0)
           continue;
 
-        // Mark all reached "real" uses of P as live on exit in the
-        // predecessor.
-        // Remap all the RUs so that they have a correct reaching def.
+        // Each phi has some set (possibly empty) of reached "real" uses,
+        // that is, uses that are part of the compiled program. Such a use
+        // may be located in some farther block, but following a chain of
+        // reaching defs will eventually lead to this phi.
+        // Any chain of reaching defs may fork at a phi node, but there
+        // will be a path upwards that will lead to this phi. Now, this
+        // chain will need to fork at this phi, since some of the reached
+        // uses may have definitions joining in from multiple predecessors.
+        // For each reached "real" use, identify the set of reaching defs
+        // coming from each predecessor P, and add them to PhiLOX[P].
+        //
         auto PrA = DFG.addr<BlockNode*>(PUA.Addr->getPredecessor());
         RefMap &LOX = PhiLOX[PrA.Addr->getCode()];
 
-        RegisterRef UR = DFG.normalizeRef(getRestrictedRegRef(PUA));
-        for (const std::pair<RegisterId,NodeRefSet> &T : RUs) {
-          // Check if T.first aliases UR?
-          LaneBitmask M;
-          for (std::pair<NodeId,LaneBitmask> P : T.second)
-            M |= P.second;
-
-          RegisterRef S = DFG.restrictRef(RegisterRef(T.first, M), UR);
-          if (!S)
-            continue;
-          for (NodeAddr<DefNode*> D : getAllReachingDefs(S, PUA))
-            LOX[S.Reg].insert({D.Id, S.Mask});
+        for (const std::pair<RegisterId,NodeRefSet> &RS : RUs) {
+          // We need to visit each individual use.
+          for (std::pair<NodeId,LaneBitmask> P : RS.second) {
+            // Create a register ref corresponding to the use, and find
+            // all reaching defs starting from the phi use, and treating
+            // all related shadows as a single use cluster.
+            RegisterRef S(RS.first, P.second);
+            NodeList Ds = getAllReachingDefs(S, PUA, true, false, NoRegs);
+            for (NodeAddr<DefNode*> D : Ds)
+              LOX[S.Reg].insert({D.Id, S.Mask});
+          }
         }
+
+        for (NodeAddr<PhiUseNode*> T : DFG.getRelatedRefs(PA, PUA))
+          SeenUses.insert(T.Id);
       }  // for U : phi uses
     }  // for P : Phis
   }  // for B : Blocks
@@ -789,7 +877,7 @@ void Liveness::resetKills(MachineBasicBlock *B) {
         Live.reset(*SR);
     }
     for (auto &Op : MI->operands()) {
-      if (!Op.isReg() || !Op.isUse())
+      if (!Op.isReg() || !Op.isUse() || Op.isUndef())
         continue;
       unsigned R = Op.getReg();
       if (!TargetRegisterInfo::isPhysicalRegister(R))
@@ -810,17 +898,6 @@ void Liveness::resetKills(MachineBasicBlock *B) {
 }
 
 
-RegisterRef Liveness::getRestrictedRegRef(NodeAddr<RefNode*> RA) const {
-  assert(DFG.IsRef<NodeAttrs::Use>(RA));
-  if (RA.Addr->getFlags() & NodeAttrs::Shadow) {
-    NodeId RD = RA.Addr->getReachingDef();
-    assert(RD);
-    RA = DFG.addr<DefNode*>(RD);
-  }
-  return RA.Addr->getRegRef(DFG);
-}
-
-
 // Helper function to obtain the basic block containing the reaching def
 // of the given use.
 MachineBasicBlock *Liveness::getBlockWithRef(NodeId RN) const {
@@ -980,7 +1057,7 @@ void Liveness::traverse(MachineBasicBlock *B, RefMap &LiveIn) {
     for (NodeAddr<UseNode*> UA : IA.Addr->members_if(DFG.IsUse, DFG)) {
       if (UA.Addr->getFlags() & NodeAttrs::Undef)
         continue;
-      RegisterRef RR = DFG.normalizeRef(UA.Addr->getRegRef(DFG));
+      RegisterRef RR = PRI.normalize(UA.Addr->getRegRef(DFG));
       for (NodeAddr<DefNode*> D : getAllReachingDefs(UA))
         if (getBlockWithRef(D.Id) != B)
           LiveIn[RR.Reg].insert({D.Id,RR.Mask});
diff --git a/lib/Target/Hexagon/RDFLiveness.h b/lib/Target/Hexagon/RDFLiveness.h
index 756977fb38616f252d850f8dee56f0300509c42b..6f2615b7c4f37ca91494fac13a368d87a77340b5 100644
--- a/lib/Target/Hexagon/RDFLiveness.h
+++ b/lib/Target/Hexagon/RDFLiveness.h
@@ -50,25 +50,30 @@ namespace rdf {
 
     Liveness(MachineRegisterInfo &mri, const DataFlowGraph &g)
       : DFG(g), TRI(g.getTRI()), PRI(g.getPRI()), MDT(g.getDT()),
-        MDF(g.getDF()), MRI(mri), LiveMap(g.getPRI()), Empty(),
+        MDF(g.getDF()), LiveMap(g.getPRI()), Empty(),
         NoRegs(g.getPRI()), Trace(false) {}
 
     NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
-        bool FullChain, const RegisterAggr &DefRRs);
+        bool TopShadows, bool FullChain, const RegisterAggr &DefRRs);
     NodeList getAllReachingDefs(NodeAddr<RefNode*> RefA) {
-      return getAllReachingDefs(RefA.Addr->getRegRef(DFG), RefA, false, NoRegs);
+      return getAllReachingDefs(RefA.Addr->getRegRef(DFG), RefA, false,
+                                false, NoRegs);
     }
     NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA) {
-      return getAllReachingDefs(RefRR, RefA, false, NoRegs);
+      return getAllReachingDefs(RefRR, RefA, false, false, NoRegs);
     }
-    NodeSet getAllReachingDefsRec(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
-        NodeSet &Visited, const NodeSet &Defs);
     NodeSet getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode*> DefA,
         const RegisterAggr &DefRRs);
     NodeSet getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode*> DefA) {
       return getAllReachedUses(RefRR, DefA, NoRegs);
     }
 
+    std::pair<NodeSet,bool> getAllReachingDefsRec(RegisterRef RefRR,
+        NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs);
+
+    NodeAddr<RefNode*> getNearestAliasedRef(RegisterRef RefRR,
+        NodeAddr<InstrNode*> IA);
+
     LiveMapType &getLiveMap() { return LiveMap; }
     const LiveMapType &getLiveMap() const { return LiveMap; }
     const RefMap &getRealUses(NodeId P) const {
@@ -90,7 +95,6 @@ namespace rdf {
     const PhysicalRegisterInfo &PRI;
     const MachineDominatorTree &MDT;
     const MachineDominanceFrontier &MDF;
-    MachineRegisterInfo &MRI;
     LiveMapType LiveMap;
     const RefMap Empty;
     const RegisterAggr NoRegs;
@@ -122,12 +126,13 @@ namespace rdf {
     // the dominator tree), create a map: block -> set of uses live on exit.
     std::map<MachineBasicBlock*,RefMap> PhiLOX;
 
-    bool isRestrictedToRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA,
-        RegisterRef RR) const;
-    RegisterRef getRestrictedRegRef(NodeAddr<RefNode*> RA) const;
     MachineBasicBlock *getBlockWithRef(NodeId RN) const;
     void traverse(MachineBasicBlock *B, RefMap &LiveIn);
     void emptify(RefMap &M);
+
+    std::pair<NodeSet,bool> getAllReachingDefsRecImpl(RegisterRef RefRR,
+        NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs,
+        unsigned Nest, unsigned MaxNest);
   };
 } // namespace rdf
 } // namespace llvm
diff --git a/lib/Target/Hexagon/RDFRegisters.cpp b/lib/Target/Hexagon/RDFRegisters.cpp
index fc5ad0aedb101618f68274ca91fa3b9e393a7bd2..7a7933c187a5943d9d78071fd565333597dc40ff 100644
--- a/lib/Target/Hexagon/RDFRegisters.cpp
+++ b/lib/Target/Hexagon/RDFRegisters.cpp
@@ -33,10 +33,21 @@ PhysicalRegisterInfo::PhysicalRegisterInfo(const TargetRegisterInfo &tri,
     }
   }
 
+  auto HasPartialOverlaps = [this] (uint32_t Reg) -> bool {
+    for (MCRegAliasIterator A(Reg, &TRI, false); A.isValid(); ++A)
+      if (!TRI.isSubRegister(Reg, *A) && !TRI.isSubRegister(*A, Reg))
+        return true;
+    return false;
+  };
+
+  for (MCPhysReg R = 1, NR = TRI.getNumRegs(); R != NR; ++R)
+    RegInfos[R].Partial = HasPartialOverlaps(R);
+
   for (MCPhysReg R = 1, NR = TRI.getNumRegs(); R != NR; ++R) {
     MCPhysReg SuperR = R;
     for (MCSuperRegIterator S(R, &TRI, false); S.isValid(); ++S)
-      SuperR = *S;
+      if (!RegInfos[*S].Partial)
+        SuperR = *S;
     RegInfos[R].MaxSuper = SuperR;
   }
 
@@ -49,71 +60,78 @@ PhysicalRegisterInfo::PhysicalRegisterInfo(const TargetRegisterInfo &tri,
           RegMasks.insert(Op.getRegMask());
 }
 
+RegisterRef PhysicalRegisterInfo::normalize(RegisterRef RR) const {
+  if (PhysicalRegisterInfo::isRegMaskId(RR.Reg))
+    return RR;
+  RegisterId SuperReg = RegInfos[RR.Reg].MaxSuper;
+  if (RR.Reg == SuperReg)
+    return RR;
+
+  const TargetRegisterClass *RC = RegInfos[RR.Reg].RegClass;
+  LaneBitmask RCMask = RC != nullptr ? RC->LaneMask : LaneBitmask(0x00000001);
+  LaneBitmask Common = RR.Mask & RCMask;
+
+// Ex: IP/EIP/RIP
+//  assert(RC != nullptr || RR.Reg == SuperReg);
+  uint32_t Sub = TRI.getSubRegIndex(SuperReg, RR.Reg);
+  LaneBitmask SuperMask = TRI.composeSubRegIndexLaneMask(Sub, Common);
+  assert(RR.Mask.none() || SuperMask.any());
+  return RegisterRef(SuperReg, SuperMask);
+}
+
 std::set<RegisterId> PhysicalRegisterInfo::getAliasSet(RegisterId Reg) const {
   // Do not include RR in the alias set.
   std::set<RegisterId> AS;
   assert(isRegMaskId(Reg) || TargetRegisterInfo::isPhysicalRegister(Reg));
   if (isRegMaskId(Reg)) {
     // XXX SLOW
-    // XXX Add other regmasks to the set.
     const uint32_t *MB = getRegMaskBits(Reg);
     for (unsigned i = 1, e = TRI.getNumRegs(); i != e; ++i) {
       if (MB[i/32] & (1u << (i%32)))
         continue;
       AS.insert(i);
     }
+    for (const uint32_t *RM : RegMasks) {
+      RegisterId MI = getRegMaskId(RM);
+      if (MI != Reg && aliasMM(RegisterRef(Reg), RegisterRef(MI)))
+        AS.insert(MI);
+    }
     return AS;
   }
 
   for (MCRegAliasIterator AI(Reg, &TRI, false); AI.isValid(); ++AI)
     AS.insert(*AI);
+  for (const uint32_t *RM : RegMasks) {
+    RegisterId MI = getRegMaskId(RM);
+    if (aliasRM(RegisterRef(Reg), RegisterRef(MI)))
+      AS.insert(MI);
+  }
   return AS;
 }
 
 bool PhysicalRegisterInfo::aliasRR(RegisterRef RA, RegisterRef RB) const {
   assert(TargetRegisterInfo::isPhysicalRegister(RA.Reg));
   assert(TargetRegisterInfo::isPhysicalRegister(RB.Reg));
+
   MCRegUnitMaskIterator UMA(RA.Reg, &TRI);
   MCRegUnitMaskIterator UMB(RB.Reg, &TRI);
   // Reg units are returned in the numerical order.
   while (UMA.isValid() && UMB.isValid()) {
-    std::pair<uint32_t,LaneBitmask> PA = *UMA;
-    std::pair<uint32_t,LaneBitmask> PB = *UMB;
-    if (PA.first == PB.first) {
-      // Lane mask of 0 (given by the iterator) should be treated as "full".
-      // This can happen when the register has only one unit, or when the
-      // unit corresponds to explicit aliasing. In such cases, the lane mask
-      // from RegisterRef should be ignored.
-      if (PA.second.none() || PB.second.none())
-        return true;
-
-      // At this point the common unit corresponds to a subregister. The lane
-      // masks correspond to the lane mask of that unit within the original
-      // register, for example assuming register quadruple q0 = r3:0, and
-      // a register pair d1 = r3:2, the lane mask of r2 in q0 may be 0b0100,
-      // while the lane mask of r2 in d1 may be 0b0001.
-      LaneBitmask LA = PA.second & RA.Mask;
-      LaneBitmask LB = PB.second & RB.Mask;
-      if (LA.any() && LB.any()) {
-        unsigned Root = *MCRegUnitRootIterator(PA.first, &TRI);
-        // If register units were guaranteed to only have 1 bit in any lane
-        // mask, the code below would not be necessary. This is because LA
-        // and LB would have at most 1 bit set each, and that bit would be
-        // guaranteed to correspond to the given register unit.
-        uint32_t SubA = TRI.getSubRegIndex(RA.Reg, Root);
-        uint32_t SubB = TRI.getSubRegIndex(RB.Reg, Root);
-        const TargetRegisterClass *RC = RegInfos[Root].RegClass;
-        LaneBitmask RCMask = RC != nullptr ? RC->LaneMask : LaneBitmask(0x1);
-        LaneBitmask MaskA = TRI.reverseComposeSubRegIndexLaneMask(SubA, LA);
-        LaneBitmask MaskB = TRI.reverseComposeSubRegIndexLaneMask(SubB, LB);
-        if ((MaskA & MaskB & RCMask).any())
-          return true;
-      }
-
+    // Skip units that are masked off in RA.
+    std::pair<RegisterId,LaneBitmask> PA = *UMA;
+    if (PA.second.any() && (PA.second & RA.Mask).none()) {
       ++UMA;
+      continue;
+    }
+    // Skip units that are masked off in RB.
+    std::pair<RegisterId,LaneBitmask> PB = *UMB;
+    if (PB.second.any() && (PB.second & RB.Mask).none()) {
       ++UMB;
       continue;
     }
+
+    if (PA.first == PB.first)
+      return true;
     if (PA.first < PB.first)
       ++UMA;
     else if (PB.first < PA.first)
@@ -130,10 +148,10 @@ bool PhysicalRegisterInfo::aliasRM(RegisterRef RR, RegisterRef RM) const {
   // is a superset of the lane mask from the register class, check the regmask
   // bit directly.
   if (RR.Mask == LaneBitmask::getAll())
-    return Preserved;
+    return !Preserved;
   const TargetRegisterClass *RC = RegInfos[RR.Reg].RegClass;
   if (RC != nullptr && (RR.Mask & RC->LaneMask) == RC->LaneMask)
-    return Preserved;
+    return !Preserved;
 
   // Otherwise, check all subregisters whose lane mask overlaps the given
   // mask. For each such register, if it is preserved by the regmask, then
@@ -186,21 +204,6 @@ bool PhysicalRegisterInfo::aliasMM(RegisterRef RM, RegisterRef RN) const {
 }
 
 
-RegisterRef RegisterAggr::normalize(RegisterRef RR) const {
-  if (PhysicalRegisterInfo::isRegMaskId(RR.Reg))
-    return RR;
-  const TargetRegisterClass *RC = PRI.RegInfos[RR.Reg].RegClass;
-  LaneBitmask RCMask = RC != nullptr ? RC->LaneMask : LaneBitmask(0x00000001);
-  LaneBitmask Common = RR.Mask & RCMask;
-
-  RegisterId SuperReg = PRI.RegInfos[RR.Reg].MaxSuper;
-// Ex: IP/EIP/RIP
-//  assert(RC != nullptr || RR.Reg == SuperReg);
-  uint32_t Sub = PRI.getTRI().getSubRegIndex(SuperReg, RR.Reg);
-  LaneBitmask SuperMask = PRI.getTRI().composeSubRegIndexLaneMask(Sub, Common);
-  return RegisterRef(SuperReg, SuperMask);
-}
-
 bool RegisterAggr::hasAliasOf(RegisterRef RR) const {
   if (PhysicalRegisterInfo::isRegMaskId(RR.Reg)) {
     // XXX SLOW
@@ -211,18 +214,22 @@ bool RegisterAggr::hasAliasOf(RegisterRef RR) const {
       if (hasAliasOf(RegisterRef(i, LaneBitmask::getAll())))
         return true;
     }
+    return false;
   }
 
-  RegisterRef NR = normalize(RR);
+  RegisterRef NR = PRI.normalize(RR);
   auto F = Masks.find(NR.Reg);
   if (F != Masks.end()) {
     if ((F->second & NR.Mask).any())
       return true;
   }
-  if (CheckUnits) {
-    for (MCRegUnitIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U)
-      if (ExpAliasUnits.test(*U))
-        return true;
+  if (CheckUnits || PRI.hasPartialOverlaps(NR.Reg)) {
+    for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
+      std::pair<RegisterId,LaneBitmask> P = *U;
+      if (P.second.none() || (P.second & RR.Mask).any())
+        if (ExpUnits.test(P.first))
+          return true;
+    }
   }
   return false;
 }
@@ -241,13 +248,24 @@ bool RegisterAggr::hasCoverOf(RegisterRef RR) const {
   }
 
   // Always have a cover for empty lane mask.
-  RegisterRef NR = normalize(RR);
+  RegisterRef NR = PRI.normalize(RR);
   if (NR.Mask.none())
     return true;
   auto F = Masks.find(NR.Reg);
-  if (F == Masks.end())
-    return false;
-  return (NR.Mask & F->second) == NR.Mask;
+  if (F != Masks.end()) {
+    if ((NR.Mask & F->second) == NR.Mask)
+      return true;
+  }
+  if (CheckUnits || PRI.hasPartialOverlaps(NR.Reg)) {
+    for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
+      std::pair<RegisterId,LaneBitmask> P = *U;
+      if (P.second.none() || (P.second & RR.Mask).any())
+        if (!ExpUnits.test(P.first))
+          return false;
+    }
+    return true;
+  }
+  return false;
 }
 
 RegisterAggr &RegisterAggr::insert(RegisterRef RR) {
@@ -262,23 +280,24 @@ RegisterAggr &RegisterAggr::insert(RegisterRef RR) {
     return *this;
   }
 
-  RegisterRef NR = normalize(RR);
+  RegisterRef NR = PRI.normalize(RR);
   auto F = Masks.find(NR.Reg);
   if (F == Masks.end())
     Masks.insert({NR.Reg, NR.Mask});
   else
     F->second |= NR.Mask;
 
-  // Visit all register units to see if there are any that were created
-  // by explicit aliases. Add those that were to the bit vector.
-  const TargetRegisterInfo &TRI = PRI.getTRI();
-  for (MCRegUnitIterator U(RR.Reg, &TRI); U.isValid(); ++U) {
-    MCRegUnitRootIterator R(*U, &TRI);
-    ++R;
-    if (!R.isValid())
-      continue;
-    ExpAliasUnits.set(*U);
-    CheckUnits = true;
+  // If the register has any partial overlaps, the mask will not be sufficient
+  // to accurately represent aliasing/covering information. Add all units to
+  // the bit vector.
+  if (PRI.hasPartialOverlaps(NR.Reg)) {
+    for (MCRegUnitMaskIterator U(RR.Reg, &PRI.getTRI()); U.isValid(); ++U) {
+      std::pair<RegisterId,LaneBitmask> P = *U;
+      if (P.second.none() || (P.second & RR.Mask).none())
+        continue;
+      ExpUnits.set(P.first);
+      CheckUnits = true;
+    }
   }
   return *this;
 }
@@ -301,7 +320,7 @@ RegisterAggr &RegisterAggr::clear(RegisterRef RR) {
     return *this;
   }
 
-  RegisterRef NR = normalize(RR);
+  RegisterRef NR = PRI.normalize(RR);
   auto F = Masks.find(NR.Reg);
   if (F == Masks.end())
     return *this;
diff --git a/lib/Target/Hexagon/RDFRegisters.h b/lib/Target/Hexagon/RDFRegisters.h
index bad0e70091a567f4aa59d3c21ae1f51b8409a9f2..621a6e2ff7737a4272461d4fb7626f5b76d26268 100644
--- a/lib/Target/Hexagon/RDFRegisters.h
+++ b/lib/Target/Hexagon/RDFRegisters.h
@@ -51,6 +51,10 @@ namespace rdf {
       return F - Map.begin() + 1;
     }
 
+    typedef typename std::vector<T>::const_iterator const_iterator;
+    const_iterator begin() const { return Map.begin(); }
+    const_iterator end() const { return Map.end(); }
+
   private:
     std::vector<T> Map;
   };
@@ -91,6 +95,7 @@ namespace rdf {
     const uint32_t *getRegMaskBits(RegisterId R) const {
       return RegMasks.get(TargetRegisterInfo::stackSlot2Index(R));
     }
+    RegisterRef normalize(RegisterRef RR) const;
 
     bool alias(RegisterRef RA, RegisterRef RB) const {
       if (!isRegMaskId(RA.Reg))
@@ -98,13 +103,17 @@ namespace rdf {
       return !isRegMaskId(RB.Reg) ? aliasRM(RB, RA) : aliasMM(RA, RB);
     }
     std::set<RegisterId> getAliasSet(RegisterId Reg) const;
+    bool hasPartialOverlaps(RegisterId Reg) const {
+      return RegInfos[Reg].Partial;
+    }
 
     const TargetRegisterInfo &getTRI() const { return TRI; }
 
-//  private:
+  private:
     struct RegInfo {
       unsigned MaxSuper = 0;
       const TargetRegisterClass *RegClass = nullptr;
+      bool Partial = false;
     };
 
     const TargetRegisterInfo &TRI;
@@ -119,7 +128,7 @@ namespace rdf {
 
   struct RegisterAggr {
     RegisterAggr(const PhysicalRegisterInfo &pri)
-        : ExpAliasUnits(pri.getTRI().getNumRegUnits()), PRI(pri) {}
+        : ExpUnits(pri.getTRI().getNumRegUnits()), PRI(pri) {}
     RegisterAggr(const RegisterAggr &RG) = default;
 
     bool empty() const { return Masks.empty(); }
@@ -146,11 +155,10 @@ namespace rdf {
     typedef MapType::const_iterator iterator;
     iterator begin() const { return Masks.begin(); }
     iterator end() const { return Masks.end(); }
-    RegisterRef normalize(RegisterRef RR) const;
 
   private:
     MapType Masks;
-    BitVector ExpAliasUnits; // Register units for explicit aliases.
+    BitVector ExpUnits; // Register units for explicit checks.
     bool CheckUnits = false;
     const PhysicalRegisterInfo &PRI;
   };
diff --git a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
index 57ead973b56ea0936bd2fd556731665263e95eef..1d6c07974beb492c2c5fae45d8864cc14f754f4c 100644
--- a/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
+++ b/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -1096,7 +1096,7 @@ StringRef LanaiAsmParser::splitMnemonic(StringRef Name, SMLoc NameLoc,
   return Mnemonic;
 }
 
-bool IsMemoryAssignmentError(const OperandVector &Operands) {
+static bool IsMemoryAssignmentError(const OperandVector &Operands) {
   // Detects if a memory operation has an erroneous base register modification.
   // Memory operations are detected by matching the types of operands.
   //
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
index a04fe8112fb99692c6aa65873601b44a08824864..0ef1401ef531a6af92562a0481531dc6b1963da1 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
@@ -50,7 +50,7 @@ public:
       : MCAsmBackend(), OSType(OST) {}
 
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value, bool IsPCRel) const override;
+                  uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
 
   MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
 
@@ -90,7 +90,7 @@ bool LanaiAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
 
 void LanaiAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
                                  unsigned /*DataSize*/, uint64_t Value,
-                                 bool /*IsPCRel*/) const {
+                                 bool /*IsPCRel*/, MCContext & /*Ctx*/) const {
   MCFixupKind Kind = Fixup.getKind();
   Value = adjustFixupValue(static_cast<unsigned>(Kind), Value);
 
diff --git a/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp b/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
index f5b5335bb98951cb023b209f7b24a407f49ddf20..10254677a5ad103e62c4c77037d7ec64f544b7f6 100644
--- a/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
+++ b/lib/Target/Lanai/MCTargetDesc/LanaiMCCodeEmitter.cpp
@@ -89,7 +89,7 @@ public:
 
 } // end anonymous namespace
 
-Lanai::Fixups FixupKind(const MCExpr *Expr) {
+static Lanai::Fixups FixupKind(const MCExpr *Expr) {
   if (isa<MCSymbolRefExpr>(Expr))
     return Lanai::FIXUP_LANAI_21;
   if (const LanaiMCExpr *McExpr = dyn_cast<LanaiMCExpr>(Expr)) {
@@ -134,8 +134,8 @@ unsigned LanaiMCCodeEmitter::getMachineOpValue(
 }
 
 // Helper function to adjust P and Q bits on load and store instructions.
-unsigned adjustPqBits(const MCInst &Inst, unsigned Value, unsigned PBitShift,
-                      unsigned QBitShift) {
+static unsigned adjustPqBits(const MCInst &Inst, unsigned Value,
+                             unsigned PBitShift, unsigned QBitShift) {
   const MCOperand AluOp = Inst.getOperand(3);
   unsigned AluCode = AluOp.getImm();
 
diff --git a/lib/Target/MSP430/MSP430CallingConv.td b/lib/Target/MSP430/MSP430CallingConv.td
index b38f5781c84a1e01ae2f6e65be798479b1a6c281..0434f8abfbf46679cd004bb39cf293ab8165efc8 100644
--- a/lib/Target/MSP430/MSP430CallingConv.td
+++ b/lib/Target/MSP430/MSP430CallingConv.td
@@ -13,11 +13,11 @@
 // MSP430 Return Value Calling Convention
 //===----------------------------------------------------------------------===//
 def RetCC_MSP430 : CallingConv<[
-  // i8 are returned in registers R15B, R14B, R13B, R12B
-  CCIfType<[i8], CCAssignToReg<[R15B, R14B, R13B, R12B]>>,
+  // i8 are returned in registers R12B, R13B, R14B, R15B
+  CCIfType<[i8], CCAssignToReg<[R12B, R13B, R14B, R15B]>>,
 
-  // i16 are returned in registers R15, R14, R13, R12
-  CCIfType<[i16], CCAssignToReg<[R15, R14, R13, R12]>>
+  // i16 are returned in registers R12, R13, R14, R15
+  CCIfType<[i16], CCAssignToReg<[R12, R13, R14, R15]>>
 ]>;
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp
index 73346b9ce41d7971d83f332af23db0442a82fb98..40b1dd3cc2ebfc94396812a67ca270d06a07bc1e 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -245,13 +245,20 @@ MSP430TargetLowering::getRegForInlineAsmConstraint(
 template<typename ArgT>
 static void ParseFunctionArgs(const SmallVectorImpl<ArgT> &Args,
                               SmallVectorImpl<unsigned> &Out) {
-  unsigned CurrentArgIndex = ~0U;
-  for (unsigned i = 0, e = Args.size(); i != e; i++) {
-    if (CurrentArgIndex == Args[i].OrigArgIndex) {
-      Out.back()++;
+  unsigned CurrentArgIndex;
+
+  if (Args.empty())
+    return;
+
+  CurrentArgIndex = Args[0].OrigArgIndex;
+  Out.push_back(0);
+
+  for (auto &Arg : Args) {
+    if (CurrentArgIndex == Arg.OrigArgIndex) {
+      Out.back() += 1;
     } else {
       Out.push_back(1);
-      CurrentArgIndex++;
+      CurrentArgIndex = Arg.OrigArgIndex;
     }
   }
 }
@@ -275,7 +282,7 @@ static void AnalyzeArguments(CCState &State,
                              SmallVectorImpl<CCValAssign> &ArgLocs,
                              const SmallVectorImpl<ArgT> &Args) {
   static const MCPhysReg RegList[] = {
-    MSP430::R15, MSP430::R14, MSP430::R13, MSP430::R12
+    MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15
   };
   static const unsigned NbRegs = array_lengthof(RegList);
 
@@ -288,7 +295,7 @@ static void AnalyzeArguments(CCState &State,
   ParseFunctionArgs(Args, ArgsParts);
 
   unsigned RegsLeft = NbRegs;
-  bool UseStack = false;
+  bool UsedStack = false;
   unsigned ValNo = 0;
 
   for (unsigned i = 0, e = ArgsParts.size(); i != e; i++) {
@@ -316,20 +323,22 @@ static void AnalyzeArguments(CCState &State,
 
     unsigned Parts = ArgsParts[i];
 
-    if (!UseStack && Parts <= RegsLeft) {
-      unsigned FirstVal = ValNo;
+    if (!UsedStack && Parts == 2 && RegsLeft == 1) {
+      // Special case for 32-bit register split, see EABI section 3.3.3
+      unsigned Reg = State.AllocateReg(RegList);
+      State.addLoc(CCValAssign::getReg(ValNo++, ArgVT, Reg, LocVT, LocInfo));
+      RegsLeft -= 1;
+
+      UsedStack = true;
+      CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, State);
+    } else if (Parts <= RegsLeft) {
       for (unsigned j = 0; j < Parts; j++) {
         unsigned Reg = State.AllocateReg(RegList);
         State.addLoc(CCValAssign::getReg(ValNo++, ArgVT, Reg, LocVT, LocInfo));
         RegsLeft--;
       }
-
-      // Reverse the order of the pieces to agree with the "big endian" format
-      // required in the calling convention ABI.
-      SmallVectorImpl<CCValAssign>::iterator B = ArgLocs.begin() + FirstVal;
-      std::reverse(B, B + Parts);
     } else {
-      UseStack = true;
+      UsedStack = true;
       for (unsigned j = 0; j < Parts; j++)
         CC_MSP430_AssignStack(ValNo++, ArgVT, LocVT, LocInfo, ArgFlags, State);
     }
@@ -351,10 +360,6 @@ static void AnalyzeReturnValues(CCState &State,
                                 SmallVectorImpl<CCValAssign> &RVLocs,
                                 const SmallVectorImpl<ArgT> &Args) {
   AnalyzeRetResult(State, Args);
-
-  // Reverse splitted return values to get the "big endian" format required
-  // to agree with the calling convention ABI.
-  std::reverse(RVLocs.begin(), RVLocs.end());
 }
 
 SDValue MSP430TargetLowering::LowerFormalArguments(
@@ -496,9 +501,33 @@ SDValue MSP430TargetLowering::LowerCCCArguments(
     }
   }
 
+  for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+    if (Ins[i].Flags.isSRet()) {
+      unsigned Reg = FuncInfo->getSRetReturnReg();
+      if (!Reg) {
+        Reg = MF.getRegInfo().createVirtualRegister(
+            getRegClassFor(MVT::i16));
+        FuncInfo->setSRetReturnReg(Reg);
+      }
+      SDValue Copy = DAG.getCopyToReg(DAG.getEntryNode(), dl, Reg, InVals[i]);
+      Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Copy, Chain);
+    }
+  }
+
   return Chain;
 }
 
+bool
+MSP430TargetLowering::CanLowerReturn(CallingConv::ID CallConv,
+                                     MachineFunction &MF,
+                                     bool IsVarArg,
+                                     const SmallVectorImpl<ISD::OutputArg> &Outs,
+                                     LLVMContext &Context) const {
+  SmallVector<CCValAssign, 16> RVLocs;
+  CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
+  return CCInfo.CheckReturn(Outs, RetCC_MSP430);
+}
+
 SDValue
 MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                   bool isVarArg,
@@ -506,6 +535,8 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                   const SmallVectorImpl<SDValue> &OutVals,
                                   const SDLoc &dl, SelectionDAG &DAG) const {
 
+  MachineFunction &MF = DAG.getMachineFunction();
+
   // CCValAssign - represent the assignment of the return value to a location
   SmallVector<CCValAssign, 16> RVLocs;
 
@@ -537,6 +568,22 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
     RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
   }
 
+  if (MF.getFunction()->hasStructRetAttr()) {
+    MSP430MachineFunctionInfo *FuncInfo = MF.getInfo<MSP430MachineFunctionInfo>();
+    unsigned Reg = FuncInfo->getSRetReturnReg();
+
+    if (!Reg)
+      llvm_unreachable("sret virtual register not created in entry block");
+
+    SDValue Val =
+      DAG.getCopyFromReg(Chain, dl, Reg, getPointerTy(DAG.getDataLayout()));
+    unsigned R12 = MSP430::R12;
+
+    Chain = DAG.getCopyToReg(Chain, dl, R12, Val, Flag);
+    Flag = Chain.getValue(1);
+    RetOps.push_back(DAG.getRegister(R12, getPointerTy(DAG.getDataLayout())));
+  }
+
   unsigned Opc = (CallConv == CallingConv::MSP430_INTR ?
                   MSP430ISD::RETI_FLAG : MSP430ISD::RET_FLAG);
 
@@ -1219,7 +1266,7 @@ MSP430TargetLowering::EmitShiftInstr(MachineInstr &MI,
                 BB->end());
   RemBB->transferSuccessorsAndUpdatePHIs(BB);
 
-  // Add adges BB => LoopBB => RemBB, BB => RemBB, LoopBB => LoopBB
+  // Add edges BB => LoopBB => RemBB, BB => RemBB, LoopBB => LoopBB
   BB->addSuccessor(LoopBB);
   BB->addSuccessor(RemBB);
   LoopBB->addSuccessor(RemBB);
diff --git a/lib/Target/MSP430/MSP430ISelLowering.h b/lib/Target/MSP430/MSP430ISelLowering.h
index 8864807e999e88d17797fd713bd41db6d6c00c7e..3a729623c99a89f4ff6fafefad4fbd74852d0f96 100644
--- a/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/lib/Target/MSP430/MSP430ISelLowering.h
@@ -158,6 +158,12 @@ namespace llvm {
       LowerCall(TargetLowering::CallLoweringInfo &CLI,
                 SmallVectorImpl<SDValue> &InVals) const override;
 
+    bool CanLowerReturn(CallingConv::ID CallConv,
+                        MachineFunction &MF,
+                        bool IsVarArg,
+                        const SmallVectorImpl<ISD::OutputArg> &Outs,
+                        LLVMContext &Context) const override;
+
     SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
                         const SmallVectorImpl<ISD::OutputArg> &Outs,
                         const SmallVectorImpl<SDValue> &OutVals,
diff --git a/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/lib/Target/MSP430/MSP430MachineFunctionInfo.h
index 2d937318c7e5f2654646a59a41f76210bd926eda..fcaa8a1d6c728ee72076574c27bfae7843720094 100644
--- a/lib/Target/MSP430/MSP430MachineFunctionInfo.h
+++ b/lib/Target/MSP430/MSP430MachineFunctionInfo.h
@@ -33,15 +33,23 @@ class MSP430MachineFunctionInfo : public MachineFunctionInfo {
   /// VarArgsFrameIndex - FrameIndex for start of varargs area.
   int VarArgsFrameIndex;
 
+  /// SRetReturnReg - Some subtargets require that sret lowering includes
+  /// returning the value of the returned struct in a register. This field
+  /// holds the virtual register into which the sret argument is passed.
+  unsigned SRetReturnReg;
+
 public:
   MSP430MachineFunctionInfo() : CalleeSavedFrameSize(0) {}
 
   explicit MSP430MachineFunctionInfo(MachineFunction &MF)
-    : CalleeSavedFrameSize(0), ReturnAddrIndex(0) {}
+    : CalleeSavedFrameSize(0), ReturnAddrIndex(0), SRetReturnReg(0) {}
 
   unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
   void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
 
+  unsigned getSRetReturnReg() const { return SRetReturnReg; }
+  void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+
   int getRAIndex() const { return ReturnAddrIndex; }
   void setRAIndex(int Index) { ReturnAddrIndex = Index; }
 
diff --git a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index f40d33c8eaa742101bca546a96d72f627c543b56..d407774574be119344cbdbd29a1b80e6cd282550 100644
--- a/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -169,6 +169,8 @@ class MipsAsmParser : public MCTargetAsmParser {
 
   bool parseBracketSuffix(StringRef Name, OperandVector &Operands);
 
+  bool mnemonicIsValid(StringRef Mnemonic, unsigned VariantID);
+
   bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
                         SMLoc NameLoc, OperandVector &Operands) override;
 
@@ -274,6 +276,18 @@ class MipsAsmParser : public MCTargetAsmParser {
   bool expandAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
                  const MCSubtargetInfo *STI);
 
+  bool expandMulImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                    const MCSubtargetInfo *STI);
+
+  bool expandMulO(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                  const MCSubtargetInfo *STI);
+
+  bool expandMulOU(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                   const MCSubtargetInfo *STI);
+
+  bool expandDMULMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                       const MCSubtargetInfo *STI);
+
   bool expandLoadStoreDMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
                              const MCSubtargetInfo *STI, bool IsLoad);
 
@@ -361,6 +375,8 @@ class MipsAsmParser : public MCTargetAsmParser {
   /// This should be used in pseudo-instruction expansions which need AT.
   unsigned getATReg(SMLoc Loc);
 
+  bool canUseATReg();
+
   bool processInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
                           const MCSubtargetInfo *STI);
 
@@ -952,6 +968,16 @@ public:
   /// Render the operand to an MCInst as a GPR32
   /// Asserts if the wrong number of operands are requested, or the operand
   /// is not a k_RegisterIndex compatible with RegKind_GPR
+  void addGPR32ZeroAsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getGPR32Reg()));
+  }
+
+  void addGPR32NonZeroAsmRegOperands(MCInst &Inst, unsigned N) const {
+    assert(N == 1 && "Invalid number of operands!");
+    Inst.addOperand(MCOperand::createReg(getGPR32Reg()));
+  }
+
   void addGPR32AsmRegOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     Inst.addOperand(MCOperand::createReg(getGPR32Reg()));
@@ -1508,6 +1534,15 @@ public:
     return Op;
   }
 
+ bool isGPRZeroAsmReg() const {
+    return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index == 0;
+  }
+
+ bool isGPRNonZeroAsmReg() const {
+   return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index > 0 &&
+          RegIdx.Index <= 31;
+  }
+
   bool isGPRAsmReg() const {
     return isRegIdx() && RegIdx.Kind & RegKind_GPR && RegIdx.Index <= 31;
   }
@@ -1867,6 +1902,61 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
     }
   }
 
+  // Warn on division by zero. We're checking here as all instructions get
+  // processed here, not just the macros that need expansion.
+  //
+  // The MIPS backend models most of the division instructions and macros as
+  // three operand instructions. The pre-R6 divide instructions however have
+  // two operands and explicitly define HI/LO as part of the instruction,
+  // not in the operands.
+  unsigned FirstOp = 1;
+  unsigned SecondOp = 2;
+  switch (Inst.getOpcode()) {
+  default:
+    break;
+  case Mips::SDivIMacro:
+  case Mips::UDivIMacro:
+  case Mips::DSDivIMacro:
+  case Mips::DUDivIMacro:
+    if (Inst.getOperand(2).getImm() == 0) {
+      if (Inst.getOperand(1).getReg() == Mips::ZERO ||
+          Inst.getOperand(1).getReg() == Mips::ZERO_64)
+        Warning(IDLoc, "dividing zero by zero");
+      else
+        Warning(IDLoc, "division by zero");
+    }
+    break;
+  case Mips::DSDIV:
+  case Mips::SDIV:
+  case Mips::UDIV:
+  case Mips::DUDIV:
+  case Mips::UDIV_MM:
+  case Mips::SDIV_MM:
+    FirstOp = 0;
+    SecondOp = 1;
+  case Mips::SDivMacro:
+  case Mips::DSDivMacro:
+  case Mips::UDivMacro:
+  case Mips::DUDivMacro:
+  case Mips::DIV:
+  case Mips::DIVU:
+  case Mips::DDIV:
+  case Mips::DDIVU:
+  case Mips::DIVU_MMR6:
+  case Mips::DDIVU_MM64R6:
+  case Mips::DIV_MMR6:
+  case Mips::DDIV_MM64R6:
+    if (Inst.getOperand(SecondOp).getReg() == Mips::ZERO ||
+        Inst.getOperand(SecondOp).getReg() == Mips::ZERO_64) {
+      if (Inst.getOperand(FirstOp).getReg() == Mips::ZERO ||
+          Inst.getOperand(FirstOp).getReg() == Mips::ZERO_64)
+        Warning(IDLoc, "dividing zero by zero");
+      else
+        Warning(IDLoc, "division by zero");
+    }
+    break;
+  }
+
   // For PIC code convert unconditional jump to unconditional branch.
   if ((Inst.getOpcode() == Mips::J || Inst.getOpcode() == Mips::J_MM) &&
       inPicMode()) {
@@ -2217,6 +2307,8 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
     return expandJalWithRegs(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
   case Mips::BneImm:
   case Mips::BeqImm:
+  case Mips::BEQLImmMacro:
+  case Mips::BNELImmMacro:
     return expandBranchImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
   case Mips::BLT:
   case Mips::BLE:
@@ -2252,15 +2344,19 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
   case Mips::BGTULImmMacro:
     return expandCondBranches(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
   case Mips::SDivMacro:
+  case Mips::SDivIMacro:
     return expandDiv(Inst, IDLoc, Out, STI, false, true) ? MER_Fail
                                                          : MER_Success;
   case Mips::DSDivMacro:
+  case Mips::DSDivIMacro:
     return expandDiv(Inst, IDLoc, Out, STI, true, true) ? MER_Fail
                                                         : MER_Success;
   case Mips::UDivMacro:
+  case Mips::UDivIMacro:
     return expandDiv(Inst, IDLoc, Out, STI, false, false) ? MER_Fail
                                                           : MER_Success;
   case Mips::DUDivMacro:
+  case Mips::DUDivIMacro:
     return expandDiv(Inst, IDLoc, Out, STI, true, false) ? MER_Fail
                                                          : MER_Success;
   case Mips::PseudoTRUNC_W_S:
@@ -2282,11 +2378,24 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
   case Mips::Usw:
     return expandUxw(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
   case Mips::NORImm:
+  case Mips::NORImm64:
+    return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::SLTImm64:
+    if (isInt<16>(Inst.getOperand(2).getImm())) {
+      Inst.setOpcode(Mips::SLTi64);
+      return MER_NotAMacro;
+    }
     return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
-  case Mips::ADDi:
-  case Mips::ADDiu:
-  case Mips::SLTi:
-  case Mips::SLTiu:
+  case Mips::SLTUImm64:
+    if (isInt<16>(Inst.getOperand(2).getImm())) {
+      Inst.setOpcode(Mips::SLTiu64);
+      return MER_NotAMacro;
+    }
+    return expandAliasImmediate(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::ADDi:   case Mips::ADDi_MM:
+  case Mips::ADDiu:  case Mips::ADDiu_MM:
+  case Mips::SLTi:   case Mips::SLTi_MM:
+  case Mips::SLTiu:  case Mips::SLTiu_MM:
     if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() &&
         Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) {
       int64_t ImmValue = Inst.getOperand(2).getImm();
@@ -2296,9 +2405,9 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
                                                          : MER_Success;
     }
     return MER_NotAMacro;
-  case Mips::ANDi:
-  case Mips::ORi:
-  case Mips::XORi:
+  case Mips::ANDi:  case Mips::ANDi_MM:  case Mips::ANDi64:
+  case Mips::ORi:   case Mips::ORi_MM:   case Mips::ORi64:
+  case Mips::XORi:  case Mips::XORi_MM:  case Mips::XORi64:
     if ((Inst.getNumOperands() == 3) && Inst.getOperand(0).isReg() &&
         Inst.getOperand(1).isReg() && Inst.getOperand(2).isImm()) {
       int64_t ImmValue = Inst.getOperand(2).getImm();
@@ -2322,6 +2431,17 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
     return expandDRotationImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
   case Mips::ABSMacro:
     return expandAbs(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::MULImmMacro:
+  case Mips::DMULImmMacro:
+    return expandMulImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::MULOMacro:
+  case Mips::DMULOMacro:
+    return expandMulO(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::MULOUMacro:
+  case Mips::DMULOUMacro:
+    return expandMulOU(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+  case Mips::DMULMacro:
+    return expandDMULMacro(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
   case Mips::LDMacro:
   case Mips::SDMacro:
     return expandLoadStoreDMacro(Inst, IDLoc, Out, STI,
@@ -2474,7 +2594,6 @@ bool MipsAsmParser::loadImmediate(int64_t ImmValue, unsigned DstReg,
 
     uint16_t Bits31To16 = (ImmValue >> 16) & 0xffff;
     uint16_t Bits15To0 = ImmValue & 0xffff;
-
     if (!Is32BitImm && !isInt<32>(ImmValue)) {
       // Traditional behaviour seems to special case this particular value. It's
       // not clear why other masks are handled differently.
@@ -2700,20 +2819,24 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
 
   // This is the 64-bit symbol address expansion.
   if (ABI.ArePtrs64bit() && isGP64bit()) {
-    // We always need AT for the 64-bit expansion.
-    // If it is not available we exit.
-    unsigned ATReg = getATReg(IDLoc);
-    if (!ATReg)
-      return true;
+    // We need AT for the 64-bit expansion in the cases where the optional
+    // source register is the destination register and for the superscalar
+    // scheduled form.
+    //
+    // If it is not available we exit if the destination is the same as the
+    // source register.
 
     const MipsMCExpr *HighestExpr =
         MipsMCExpr::create(MipsMCExpr::MEK_HIGHEST, SymExpr, getContext());
     const MipsMCExpr *HigherExpr =
         MipsMCExpr::create(MipsMCExpr::MEK_HIGHER, SymExpr, getContext());
 
-    if (UseSrcReg &&
-        getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg,
-                                                               SrcReg)) {
+    bool RdRegIsRsReg =
+        getContext().getRegisterInfo()->isSuperOrSubRegisterEq(DstReg, SrcReg);
+
+    if (canUseATReg() && UseSrcReg && RdRegIsRsReg) {
+      unsigned ATReg = getATReg(IDLoc);
+
       // If $rs is the same as $rd:
       // (d)la $rd, sym($rd) => lui    $at, %highest(sym)
       //                        daddiu $at, $at, %higher(sym)
@@ -2735,29 +2858,65 @@ bool MipsAsmParser::loadAndAddSymbolAddress(const MCExpr *SymExpr,
       TOut.emitRRR(Mips::DADDu, DstReg, ATReg, SrcReg, IDLoc, STI);
 
       return false;
-    }
+    } else if (canUseATReg() && !RdRegIsRsReg) {
+      unsigned ATReg = getATReg(IDLoc);
 
-    // Otherwise, if the $rs is different from $rd or if $rs isn't specified:
-    // (d)la $rd, sym/sym($rs) => lui    $rd, %highest(sym)
-    //                            lui    $at, %hi(sym)
-    //                            daddiu $rd, $rd, %higher(sym)
-    //                            daddiu $at, $at, %lo(sym)
-    //                            dsll32 $rd, $rd, 0
-    //                            daddu  $rd, $rd, $at
-    //                            (daddu  $rd, $rd, $rs)
-    TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc,
-                STI);
-    TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, STI);
-    TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
-                 MCOperand::createExpr(HigherExpr), IDLoc, STI);
-    TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr),
-                 IDLoc, STI);
-    TOut.emitRRI(Mips::DSLL32, DstReg, DstReg, 0, IDLoc, STI);
-    TOut.emitRRR(Mips::DADDu, DstReg, DstReg, ATReg, IDLoc, STI);
-    if (UseSrcReg)
-      TOut.emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, STI);
+      // If the $rs is different from $rd or if $rs isn't specified and we
+      // have $at available:
+      // (d)la $rd, sym/sym($rs) => lui    $rd, %highest(sym)
+      //                            lui    $at, %hi(sym)
+      //                            daddiu $rd, $rd, %higher(sym)
+      //                            daddiu $at, $at, %lo(sym)
+      //                            dsll32 $rd, $rd, 0
+      //                            daddu  $rd, $rd, $at
+      //                            (daddu  $rd, $rd, $rs)
+      //
+      // Which is preferred for superscalar issue.
+      TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc,
+                  STI);
+      TOut.emitRX(Mips::LUi, ATReg, MCOperand::createExpr(HiExpr), IDLoc, STI);
+      TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
+                   MCOperand::createExpr(HigherExpr), IDLoc, STI);
+      TOut.emitRRX(Mips::DADDiu, ATReg, ATReg, MCOperand::createExpr(LoExpr),
+                   IDLoc, STI);
+      TOut.emitRRI(Mips::DSLL32, DstReg, DstReg, 0, IDLoc, STI);
+      TOut.emitRRR(Mips::DADDu, DstReg, DstReg, ATReg, IDLoc, STI);
+      if (UseSrcReg)
+        TOut.emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, STI);
 
-    return false;
+      return false;
+    } else if (!canUseATReg() && !RdRegIsRsReg) {
+      // Otherwise, synthesize the address in the destination register
+      // serially:
+      // (d)la $rd, sym/sym($rs) => lui    $rd, %highest(sym)
+      //                            daddiu $rd, $rd, %higher(sym)
+      //                            dsll   $rd, $rd, 16
+      //                            daddiu $rd, $rd, %hi(sym)
+      //                            dsll   $rd, $rd, 16
+      //                            daddiu $rd, $rd, %lo(sym)
+      TOut.emitRX(Mips::LUi, DstReg, MCOperand::createExpr(HighestExpr), IDLoc,
+                  STI);
+      TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
+                   MCOperand::createExpr(HigherExpr), IDLoc, STI);
+      TOut.emitRRI(Mips::DSLL, DstReg, DstReg, 16, IDLoc, STI);
+      TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
+                   MCOperand::createExpr(HiExpr), IDLoc, STI);
+      TOut.emitRRI(Mips::DSLL, DstReg, DstReg, 16, IDLoc, STI);
+      TOut.emitRRX(Mips::DADDiu, DstReg, DstReg,
+                   MCOperand::createExpr(LoExpr), IDLoc, STI);
+      if (UseSrcReg)
+        TOut.emitRRR(Mips::DADDu, DstReg, DstReg, SrcReg, IDLoc, STI);
+
+      return false;
+    } else {
+      // We have a case where SrcReg == DstReg and we don't have $at
+      // available. We can't expand this case, so error out appropriately.
+      assert(SrcReg == DstReg && !canUseATReg() &&
+             "Could have expanded dla but didn't?");
+      reportParseError(IDLoc,
+                     "pseudo-instruction requires $at, which is not available");
+      return true;
+    }
   }
 
   // And now, the 32-bit symbol address expansion:
@@ -2851,6 +3010,8 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
   assert((MemOffsetOp.isImm() || MemOffsetOp.isExpr()) &&
          "expected immediate or expression operand");
 
+  bool IsLikely = false;
+
   unsigned OpCode = 0;
   switch(Inst.getOpcode()) {
     case Mips::BneImm:
@@ -2859,16 +3020,29 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
     case Mips::BeqImm:
       OpCode = Mips::BEQ;
       break;
+    case Mips::BEQLImmMacro:
+      OpCode = Mips::BEQL;
+      IsLikely = true;
+      break;
+    case Mips::BNELImmMacro:
+      OpCode = Mips::BNEL;
+      IsLikely = true;
+      break;
     default:
       llvm_unreachable("Unknown immediate branch pseudo-instruction.");
       break;
   }
 
   int64_t ImmValue = ImmOp.getImm();
-  if (ImmValue == 0)
-    TOut.emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO, MemOffsetOp, IDLoc,
-                 STI);
-  else {
+  if (ImmValue == 0) {
+    if (IsLikely) {
+      TOut.emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO,
+                   MCOperand::createExpr(MemOffsetOp.getExpr()), IDLoc, STI);
+      TOut.emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI);
+    } else
+      TOut.emitRRX(OpCode, DstRegOp.getReg(), Mips::ZERO, MemOffsetOp, IDLoc,
+              STI);
+  } else {
     warnIfNoMacro(IDLoc);
 
     unsigned ATReg = getATReg(IDLoc);
@@ -2879,7 +3053,12 @@ bool MipsAsmParser::expandBranchImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
                       IDLoc, Out, STI))
       return true;
 
-    TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg, MemOffsetOp, IDLoc, STI);
+    if (IsLikely) {
+      TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg,
+              MCOperand::createExpr(MemOffsetOp.getExpr()), IDLoc, STI);
+      TOut.emitRRI(Mips::SLL, Mips::ZERO, Mips::ZERO, 0, IDLoc, STI);
+    } else
+      TOut.emitRRX(OpCode, DstRegOp.getReg(), ATReg, MemOffsetOp, IDLoc, STI);
   }
   return false;
 }
@@ -3267,6 +3446,14 @@ bool MipsAsmParser::expandCondBranches(MCInst &Inst, SMLoc IDLoc,
   return false;
 }
 
+// Expand an integer division macro.
+//
+// Notably we don't have to emit a warning when encountering $rt as the $zero
+// register, or 0 as an immediate. processInstruction() has already done that.
+//
+// The destination register can only be $zero when expanding (S)DivIMacro or
+// D(S)DivMacro.
+
 bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
                               const MCSubtargetInfo *STI, const bool IsMips64,
                               const bool Signed) {
@@ -3282,67 +3469,88 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
   assert(RsRegOp.isReg() && "expected register operand kind");
   unsigned RsReg = RsRegOp.getReg();
 
-  const MCOperand &RtRegOp = Inst.getOperand(2);
-  assert(RtRegOp.isReg() && "expected register operand kind");
-  unsigned RtReg = RtRegOp.getReg();
+  unsigned RtReg;
+  int64_t ImmValue;
+
+  const MCOperand &RtOp = Inst.getOperand(2);
+  assert((RtOp.isReg() || RtOp.isImm()) &&
+         "expected register or immediate operand kind");
+  if (RtOp.isReg())
+    RtReg = RtOp.getReg();
+  else
+    ImmValue = RtOp.getImm();
+
   unsigned DivOp;
   unsigned ZeroReg;
+  unsigned SubOp;
 
   if (IsMips64) {
     DivOp = Signed ? Mips::DSDIV : Mips::DUDIV;
     ZeroReg = Mips::ZERO_64;
+    SubOp = Mips::DSUB;
   } else {
     DivOp = Signed ? Mips::SDIV : Mips::UDIV;
     ZeroReg = Mips::ZERO;
+    SubOp = Mips::SUB;
   }
 
   bool UseTraps = useTraps();
 
-  if (RsReg == Mips::ZERO || RsReg == Mips::ZERO_64) {
-    if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64)
-      Warning(IDLoc, "dividing zero by zero");
-    if (IsMips64) {
-      if (Signed && (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64)) {
-        if (UseTraps) {
-          TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI);
-          return false;
-        }
+  if (RtOp.isImm()) {
+    unsigned ATReg = getATReg(IDLoc);
+    if (!ATReg)
+      return true;
 
+    if (ImmValue == 0) {
+      if (UseTraps)
+        TOut.emitRRI(Mips::TEQ, ZeroReg, ZeroReg, 0x7, IDLoc, STI);
+      else
         TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
-        return false;
-      }
+      return false;
+    }
+
+    if (ImmValue == 1) {
+      TOut.emitRRR(Mips::OR, RdReg, RsReg, Mips::ZERO, IDLoc, STI);
+      return false;
+    } else if (Signed && ImmValue == -1) {
+      TOut.emitRRR(SubOp, RdReg, ZeroReg, RsReg, IDLoc, STI);
+      return false;
     } else {
-      TOut.emitRR(DivOp, RsReg, RtReg, IDLoc, STI);
+      if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, isInt<32>(ImmValue),
+                        false, Inst.getLoc(), Out, STI))
+        return true;
+      TOut.emitRR(DivOp, RsReg, ATReg, IDLoc, STI);
+      TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
       return false;
     }
+    return true;
   }
 
+  // If the macro expansion of (d)div(u) would always trap or break, insert
+  // the trap/break and exit. This gives a different result to GAS. GAS has
+  // an inconsistency/missed optimization in that not all cases are handled
+  // equivalently. As the observed behaviour is the same, we're ok.
   if (RtReg == Mips::ZERO || RtReg == Mips::ZERO_64) {
-    Warning(IDLoc, "division by zero");
-    if (Signed) {
-      if (UseTraps) {
-        TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI);
-        return false;
-      }
-
-      TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
+    if (UseTraps) {
+      TOut.emitRRI(Mips::TEQ, ZeroReg, ZeroReg, 0x7, IDLoc, STI);
       return false;
     }
+    TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
+    return false;
   }
 
-  // FIXME: The values for these two BranchTarget variables may be different in
-  // micromips. These magic numbers need to be removed.
-  unsigned BranchTargetNoTraps;
-  unsigned BranchTarget;
+  // Temporary label for the first branch target.
+  MCContext &Context = TOut.getStreamer().getContext();
+  MCSymbol *BrTarget;
+  MCOperand LabelOp;
 
   if (UseTraps) {
-    BranchTarget = IsMips64 ? 12 : 8;
     TOut.emitRRI(Mips::TEQ, RtReg, ZeroReg, 0x7, IDLoc, STI);
   } else {
-    BranchTarget = IsMips64 ? 20 : 16;
-    BranchTargetNoTraps = 8;
     // Branch to the li instruction.
-    TOut.emitRRI(Mips::BNE, RtReg, ZeroReg, BranchTargetNoTraps, IDLoc, STI);
+    BrTarget = Context.createTempSymbol();
+    LabelOp = MCOperand::createExpr(MCSymbolRefExpr::create(BrTarget, Context));
+    TOut.emitRRX(Mips::BNE, RtReg, ZeroReg, LabelOp, IDLoc, STI);
   }
 
   TOut.emitRR(DivOp, RsReg, RtReg, IDLoc, STI);
@@ -3351,6 +3559,9 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
     TOut.emitII(Mips::BREAK, 0x7, 0, IDLoc, STI);
 
   if (!Signed) {
+    if (!UseTraps)
+      TOut.getStreamer().EmitLabel(BrTarget);
+
     TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
     return false;
   }
@@ -3359,15 +3570,23 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
   if (!ATReg)
     return true;
 
+  if (!UseTraps)
+    TOut.getStreamer().EmitLabel(BrTarget);
+
   TOut.emitRRI(Mips::ADDiu, ATReg, ZeroReg, -1, IDLoc, STI);
+
+  // Temporary label for the second branch target.
+  MCSymbol *BrTargetEnd = Context.createTempSymbol();
+  MCOperand LabelOpEnd =
+      MCOperand::createExpr(MCSymbolRefExpr::create(BrTargetEnd, Context));
+
+  // Branch to the mflo instruction.
+  TOut.emitRRX(Mips::BNE, RtReg, ATReg, LabelOpEnd, IDLoc, STI);
+
   if (IsMips64) {
-    // Branch to the mflo instruction.
-    TOut.emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, STI);
     TOut.emitRRI(Mips::ADDiu, ATReg, ZeroReg, 1, IDLoc, STI);
     TOut.emitRRI(Mips::DSLL32, ATReg, ATReg, 0x1f, IDLoc, STI);
   } else {
-    // Branch to the mflo instruction.
-    TOut.emitRRI(Mips::BNE, RtReg, ATReg, BranchTarget, IDLoc, STI);
     TOut.emitRI(Mips::LUi, ATReg, (uint16_t)0x8000, IDLoc, STI);
   }
 
@@ -3375,10 +3594,12 @@ bool MipsAsmParser::expandDiv(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
     TOut.emitRRI(Mips::TEQ, RsReg, ATReg, 0x6, IDLoc, STI);
   else {
     // Branch to the mflo instruction.
-    TOut.emitRRI(Mips::BNE, RsReg, ATReg, BranchTargetNoTraps, IDLoc, STI);
+    TOut.emitRRX(Mips::BNE, RsReg, ATReg, LabelOpEnd, IDLoc, STI);
     TOut.emitRRI(Mips::SLL, ZeroReg, ZeroReg, 0, IDLoc, STI);
     TOut.emitII(Mips::BREAK, 0x6, 0, IDLoc, STI);
   }
+
+  TOut.getStreamer().EmitLabel(BrTargetEnd);
   TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
   return false;
 }
@@ -3596,7 +3817,7 @@ bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
   unsigned SrcReg = Inst.getOperand(1).getReg();
   int64_t ImmValue = Inst.getOperand(2).getImm();
 
-  bool Is32Bit = isInt<32>(ImmValue) || isUInt<32>(ImmValue);
+  bool Is32Bit = isInt<32>(ImmValue) || (!isGP64bit() && isUInt<32>(ImmValue));
 
   unsigned FinalOpcode = Inst.getOpcode();
 
@@ -3612,30 +3833,69 @@ bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
     switch (FinalOpcode) {
     default:
       llvm_unreachable("unimplemented expansion");
-    case (Mips::ADDi):
+    case Mips::ADDi:
       FinalOpcode = Mips::ADD;
       break;
-    case (Mips::ADDiu):
+    case Mips::ADDiu:
       FinalOpcode = Mips::ADDu;
       break;
-    case (Mips::ANDi):
+    case Mips::ANDi:
       FinalOpcode = Mips::AND;
       break;
-    case (Mips::NORImm):
+    case Mips::NORImm:
       FinalOpcode = Mips::NOR;
       break;
-    case (Mips::ORi):
+    case Mips::ORi:
       FinalOpcode = Mips::OR;
       break;
-    case (Mips::SLTi):
+    case Mips::SLTi:
       FinalOpcode = Mips::SLT;
       break;
-    case (Mips::SLTiu):
+    case Mips::SLTiu:
       FinalOpcode = Mips::SLTu;
       break;
-    case (Mips::XORi):
+    case Mips::XORi:
       FinalOpcode = Mips::XOR;
       break;
+    case Mips::ADDi_MM:
+      FinalOpcode = Mips::ADD_MM;
+      break;
+    case Mips::ADDiu_MM:
+      FinalOpcode = Mips::ADDu_MM;
+      break;
+    case Mips::ANDi_MM:
+      FinalOpcode = Mips::AND_MM;
+      break;
+    case Mips::ORi_MM:
+      FinalOpcode = Mips::OR_MM;
+      break;
+    case Mips::SLTi_MM:
+      FinalOpcode = Mips::SLT_MM;
+      break;
+    case Mips::SLTiu_MM:
+      FinalOpcode = Mips::SLTu_MM;
+      break;
+    case Mips::XORi_MM:
+      FinalOpcode = Mips::XOR_MM;
+      break;
+    case Mips::ANDi64:
+      FinalOpcode = Mips::AND64;
+      break;
+    case Mips::NORImm64:
+      FinalOpcode = Mips::NOR64;
+      break;
+    case Mips::ORi64:
+      FinalOpcode = Mips::OR64;
+      break;
+    case Mips::SLTImm64:
+      FinalOpcode = Mips::SLT64;
+      break;
+    case Mips::SLTUImm64:
+      FinalOpcode = Mips::SLTu64;
+      break;
+    case Mips::XORi64:
+      FinalOpcode = Mips::XOR64;
+      break;
     }
 
     if (FinalDstReg == Mips::NoRegister)
@@ -3945,6 +4205,119 @@ bool MipsAsmParser::expandAbs(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
   return false;
 }
 
+/// Expand MULImmMacro / DMULImmMacro: load the (32-bit) immediate into $at,
+/// multiply with MULT / DMULT and move LO into the destination register.
+/// Returns true on failure ($at unavailable or the immediate load failed).
+bool MipsAsmParser::expandMulImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                                 const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  unsigned DstReg = Inst.getOperand(0).getReg();
+  unsigned SrcReg = Inst.getOperand(1).getReg();
+  int32_t ImmValue = Inst.getOperand(2).getImm();
+
+  unsigned ATReg = getATReg(IDLoc);
+  if (!ATReg)
+    return true;
+  if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, true, false, IDLoc, Out,
+                    STI))
+    return true;
+  TOut.emitRR(Inst.getOpcode() == Mips::MULImmMacro ? Mips::MULT : Mips::DMULT,
+              SrcReg, ATReg, IDLoc, STI);
+  TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
+  return false;
+}
+
+/// Expand MULOMacro / DMULOMacro: multiply, then trap (TNE) or break
+/// (BREAK 6) when HI differs from the sign bits of LO, and finally move LO
+/// into the destination. Returns true if $at is not available.
+bool MipsAsmParser::expandMulO(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                               const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  unsigned DstReg = Inst.getOperand(0).getReg();
+  unsigned SrcReg = Inst.getOperand(1).getReg();
+  unsigned TmpReg = Inst.getOperand(2).getReg();
+  const bool Is32Bit = Inst.getOpcode() == Mips::MULOMacro;
+
+  unsigned ATReg = getATReg(IDLoc);
+  if (!ATReg)
+    return true;
+
+  TOut.emitRR(Is32Bit ? Mips::MULT : Mips::DMULT, SrcReg, TmpReg, IDLoc, STI);
+  TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
+  // Arithmetic-shift LO so the destination holds only copies of its sign bit.
+  TOut.emitRRI(Is32Bit ? Mips::SRA : Mips::DSRA32, DstReg, DstReg, 0x1F, IDLoc,
+               STI);
+  TOut.emitR(Mips::MFHI, ATReg, IDLoc, STI);
+
+  if (useTraps()) {
+    TOut.emitRRI(Mips::TNE, DstReg, ATReg, 6, IDLoc, STI);
+  } else {
+    MCContext &Context = TOut.getStreamer().getContext();
+    MCSymbol *BrTarget = Context.createTempSymbol();
+    MCOperand LabelOp =
+        MCOperand::createExpr(MCSymbolRefExpr::create(BrTarget, Context));
+
+    // Branch over the break when HI matches the sign bits of LO.
+    TOut.emitRRX(Mips::BEQ, DstReg, ATReg, LabelOp, IDLoc, STI);
+    if (AssemblerOptions.back()->isReorder())
+      TOut.emitNop(IDLoc, STI);
+    TOut.emitII(Mips::BREAK, 6, 0, IDLoc, STI);
+    TOut.getStreamer().EmitLabel(BrTarget);
+  }
+  TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
+
+  return false;
+}
+
+/// Expand (D)MULOUMacro: unsigned multiply, then trap (TNE) or break (BREAK 6)
+/// if HI is non-zero. Returns true if $at is not available.
+bool MipsAsmParser::expandMulOU(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                                const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  const unsigned RdReg = Inst.getOperand(0).getReg();
+  const unsigned RsReg = Inst.getOperand(1).getReg();
+  const unsigned RtReg = Inst.getOperand(2).getReg();
+
+  const unsigned ATReg = getATReg(IDLoc);
+  if (ATReg == 0)
+    return true;
+
+  unsigned MultOp =
+      Inst.getOpcode() == Mips::MULOUMacro ? Mips::MULTu : Mips::DMULTu;
+  TOut.emitRR(MultOp, RsReg, RtReg, IDLoc, STI);
+  TOut.emitR(Mips::MFHI, ATReg, IDLoc, STI);
+  TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
+  if (useTraps()) {
+    TOut.emitRRI(Mips::TNE, ATReg, Mips::ZERO, 6, IDLoc, STI);
+    return false;
+  }
+
+  // Without traps: branch over a 'break 6' when HI is zero.
+  MCContext &Ctx = TOut.getStreamer().getContext();
+  MCSymbol *SkipBreak = Ctx.createTempSymbol();
+  TOut.emitRRX(Mips::BEQ, ATReg, Mips::ZERO,
+               MCOperand::createExpr(MCSymbolRefExpr::create(SkipBreak, Ctx)),
+               IDLoc, STI);
+  if (AssemblerOptions.back()->isReorder())
+    TOut.emitNop(IDLoc, STI);
+  TOut.emitII(Mips::BREAK, 6, 0, IDLoc, STI);
+  TOut.getStreamer().EmitLabel(SkipBreak);
+  return false;
+}
+
+/// Expand DMULMacro into a DMULTu of the two sources followed by an MFLO.
+bool MipsAsmParser::expandDMULMacro(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+                                    const MCSubtargetInfo *STI) {
+  MipsTargetStreamer &TOut = getTargetStreamer();
+  const unsigned RdReg = Inst.getOperand(0).getReg();
+  const unsigned RsReg = Inst.getOperand(1).getReg();
+  const unsigned RtReg = Inst.getOperand(2).getReg();
+
+  TOut.emitRR(Mips::DMULTu, RsReg, RtReg, IDLoc, STI);
+  TOut.emitR(Mips::MFLO, RdReg, IDLoc, STI);
+  return false; // Always succeeds.
+}
+
 static unsigned nextReg(unsigned Reg) {
   switch (Reg) {
   case Mips::ZERO: return Mips::AT;
@@ -4579,6 +4952,10 @@ int MipsAsmParser::matchMSA128CtrlRegisterName(StringRef Name) {
   return CC;
 }
 
+bool MipsAsmParser::canUseATReg() { // Query only; reports no diagnostic.
+  return AssemblerOptions.back()->getATRegIndex() != 0; // Index 0: no $at.
+}
+
 unsigned MipsAsmParser::getATReg(SMLoc Loc) {
   unsigned ATIndex = AssemblerOptions.back()->getATRegIndex();
   if (ATIndex == 0) {
@@ -5981,6 +6358,14 @@ bool MipsAsmParser::parseDirectiveSet() {
     return parseSetAtDirective();
   } else if (Tok.getString() == "arch") {
     return parseSetArchDirective();
+  } else if (Tok.getString() == "bopt") {
+    Warning(Tok.getLoc(), "'bopt' feature is unsupported");
+    getParser().Lex();
+    return false;
+  } else if (Tok.getString() == "nobopt") {
+    // We're already running in nobopt mode, so nothing to do.
+    getParser().Lex();
+    return false;
   } else if (Tok.getString() == "fp") {
     return parseSetFpDirective();
   } else if (Tok.getString() == "oddspreg") {
@@ -6840,3 +7225,15 @@ extern "C" void LLVMInitializeMipsAsmParser() {
 #define GET_REGISTER_MATCHER
 #define GET_MATCHER_IMPLEMENTATION
 #include "MipsGenAsmMatcher.inc"
+
+/// Returns true if \p Mnemonic appears in the match table for \p VariantID.
+bool MipsAsmParser::mnemonicIsValid(StringRef Mnemonic, unsigned VariantID) {
+  // Variant 0 is the only asm variant with a match table.
+  if (VariantID != 0)
+    llvm_unreachable("invalid variant!");
+  const MatchEntry *TableBegin = std::begin(MatchTable0);
+  const MatchEntry *TableEnd = std::end(MatchTable0);
+  // Look for the range of entries equal to the mnemonic under LessOpcode.
+  auto Hits = std::equal_range(TableBegin, TableEnd, Mnemonic, LessOpcode());
+  return Hits.first != Hits.second;
+}
diff --git a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
index f80efb18507b1e0d479864157f68568756831640..ecdf6b0de6e7f101d57bc4f4887dd91f907169c0 100644
--- a/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
+++ b/lib/Target/Mips/Disassembler/MipsDisassembler.cpp
@@ -1,4 +1,4 @@
-//===- MipsDisassembler.cpp - Disassembler for Mips -------------*- C++ -*-===//
+//===- MipsDisassembler.cpp - Disassembler for Mips -----------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -12,15 +12,21 @@
 //===----------------------------------------------------------------------===//
 
 #include "Mips.h"
-#include "MipsRegisterInfo.h"
-#include "MipsSubtarget.h"
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCDisassembler/MCDisassembler.h"
 #include "llvm/MC/MCFixedLenDisassembler.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/TargetRegistry.h"
+#include <cassert>
+#include <cstdint>
 
 using namespace llvm;
 
@@ -33,6 +39,7 @@ namespace {
 class MipsDisassembler : public MCDisassembler {
   bool IsMicroMips;
   bool IsBigEndian;
+
 public:
   MipsDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx, bool IsBigEndian)
       : MCDisassembler(STI, Ctx),
@@ -42,9 +49,11 @@ public:
   bool hasMips2() const { return STI.getFeatureBits()[Mips::FeatureMips2]; }
   bool hasMips3() const { return STI.getFeatureBits()[Mips::FeatureMips3]; }
   bool hasMips32() const { return STI.getFeatureBits()[Mips::FeatureMips32]; }
+
   bool hasMips32r6() const {
     return STI.getFeatureBits()[Mips::FeatureMips32r6];
   }
+
   bool isFP64() const { return STI.getFeatureBits()[Mips::FeatureFP64Bit]; }
 
   bool isGP64() const { return STI.getFeatureBits()[Mips::FeatureGP64Bit]; }
@@ -527,11 +536,13 @@ static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned Insn,
                                        const void *Decoder);
 
 namespace llvm {
+
 Target &getTheMipselTarget();
 Target &getTheMipsTarget();
 Target &getTheMips64Target();
 Target &getTheMips64elTarget();
-}
+
+} // end namespace llvm
 
 static MCDisassembler *createMipsDisassembler(
                        const Target &T,
@@ -1106,6 +1117,7 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
                                               raw_ostream &CStream) const {
   uint32_t Insn;
   DecodeStatus Result;
+  Size = 0;
 
   if (IsMicroMips) {
     Result = readInstruction16(Bytes, Address, Size, Insn, IsBigEndian);
@@ -1168,98 +1180,88 @@ DecodeStatus MipsDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
       }
     }
 
-    // This is an invalid instruction. Let the disassembler move forward by the
-    // minimum instruction size.
+    // This is an invalid instruction. Claim that the Size is 2 bytes. Since
+    // microMIPS instructions have a minimum alignment of 2, the next 2 bytes
+    // could form a valid instruction. The two bytes we rejected as an
+    // instruction could have actually been an inline constant pool that is
+    // unconditionally branched over.
     Size = 2;
     return MCDisassembler::Fail;
   }
 
+  // Attempt to read the instruction so that we can attempt to decode it. If
+  // the buffer is not 4 bytes long, let the higher level logic figure out
+  // what to do with a size of zero and MCDisassembler::Fail.
   Result = readInstruction32(Bytes, Address, Size, Insn, IsBigEndian, false);
-  if (Result == MCDisassembler::Fail) {
-    Size = 4;
+  if (Result == MCDisassembler::Fail)
     return MCDisassembler::Fail;
-  }
+
+  // The only instruction size for standard encoded MIPS.
+  Size = 4;
 
   if (hasCOP3()) {
     DEBUG(dbgs() << "Trying COP3_ table (32-bit opcodes):\n");
     Result =
         decodeInstruction(DecoderTableCOP3_32, Instr, Insn, Address, this, STI);
-    if (Result != MCDisassembler::Fail) {
-      Size = 4;
+    if (Result != MCDisassembler::Fail)
       return Result;
-    }
   }
 
   if (hasMips32r6() && isGP64()) {
     DEBUG(dbgs() << "Trying Mips32r6_64r6 (GPR64) table (32-bit opcodes):\n");
     Result = decodeInstruction(DecoderTableMips32r6_64r6_GP6432, Instr, Insn,
                                Address, this, STI);
-    if (Result != MCDisassembler::Fail) {
-      Size = 4;
+    if (Result != MCDisassembler::Fail)
       return Result;
-    }
   }
 
   if (hasMips32r6() && isPTR64()) {
     DEBUG(dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
     Result = decodeInstruction(DecoderTableMips32r6_64r6_PTR6432, Instr, Insn,
                                Address, this, STI);
-    if (Result != MCDisassembler::Fail) {
-      Size = 4;
+    if (Result != MCDisassembler::Fail)
       return Result;
-    }
   }
 
   if (hasMips32r6()) {
     DEBUG(dbgs() << "Trying Mips32r6_64r6 table (32-bit opcodes):\n");
     Result = decodeInstruction(DecoderTableMips32r6_64r632, Instr, Insn,
                                Address, this, STI);
-    if (Result != MCDisassembler::Fail) {
-      Size = 4;
+    if (Result != MCDisassembler::Fail)
       return Result;
-    }
   }
 
   if (hasMips2() && isPTR64()) {
     DEBUG(dbgs() << "Trying Mips32r6_64r6 (PTR64) table (32-bit opcodes):\n");
     Result = decodeInstruction(DecoderTableMips32_64_PTR6432, Instr, Insn,
                                Address, this, STI);
-    if (Result != MCDisassembler::Fail) {
-      Size = 4;
+    if (Result != MCDisassembler::Fail)
       return Result;
-    }
   }
 
   if (hasCnMips()) {
     DEBUG(dbgs() << "Trying CnMips table (32-bit opcodes):\n");
     Result = decodeInstruction(DecoderTableCnMips32, Instr, Insn,
                                Address, this, STI);
-    if (Result != MCDisassembler::Fail) {
-      Size = 4;
+    if (Result != MCDisassembler::Fail)
       return Result;
-    }
   }
 
   if (isGP64()) {
     DEBUG(dbgs() << "Trying Mips64 (GPR64) table (32-bit opcodes):\n");
     Result = decodeInstruction(DecoderTableMips6432, Instr, Insn,
                                Address, this, STI);
-    if (Result != MCDisassembler::Fail) {
-      Size = 4;
+    if (Result != MCDisassembler::Fail)
       return Result;
-    }
   }
 
   DEBUG(dbgs() << "Trying Mips table (32-bit opcodes):\n");
   // Calling the auto-generated decoder function.
   Result =
       decodeInstruction(DecoderTableMips32, Instr, Insn, Address, this, STI);
-  if (Result != MCDisassembler::Fail) {
-    Size = 4;
+  if (Result != MCDisassembler::Fail)
     return Result;
-  }
 
-  Size = 4;
   return MCDisassembler::Fail;
 }
 
@@ -1267,16 +1269,13 @@ static DecodeStatus DecodeCPU16RegsRegisterClass(MCInst &Inst,
                                                  unsigned RegNo,
                                                  uint64_t Address,
                                                  const void *Decoder) {
-
   return MCDisassembler::Fail;
-
 }
 
 static DecodeStatus DecodeGPR64RegisterClass(MCInst &Inst,
                                              unsigned RegNo,
                                              uint64_t Address,
                                              const void *Decoder) {
-
   if (RegNo > 31)
     return MCDisassembler::Fail;
 
@@ -1620,7 +1619,7 @@ static DecodeStatus DecodeMSA128Mem(MCInst &Inst, unsigned Insn,
   switch(Inst.getOpcode())
   {
   default:
-    assert (0 && "Unexpected instruction");
+    assert(false && "Unexpected instruction");
     return MCDisassembler::Fail;
     break;
   case Mips::LD_B:
@@ -1980,7 +1979,6 @@ static DecodeStatus DecodeAFGR64RegisterClass(MCInst &Inst,
   if (RegNo > 30 || RegNo %2)
     return MCDisassembler::Fail;
 
-  ;
   unsigned Reg = getReg(Decoder, Mips::AFGR64RegClassID, RegNo /2);
   Inst.addOperand(MCOperand::createReg(Reg));
   return MCDisassembler::Success;
@@ -2128,7 +2126,6 @@ static DecodeStatus DecodeJumpTarget(MCInst &Inst,
                                      unsigned Insn,
                                      uint64_t Address,
                                      const void *Decoder) {
-
   unsigned JumpOffset = fieldFromInstruction(Insn, 0, 26) << 2;
   Inst.addOperand(MCOperand::createImm(JumpOffset));
   return MCDisassembler::Success;
@@ -2267,7 +2264,14 @@ static DecodeStatus DecodeInsSize(MCInst &Inst,
                                   const void *Decoder) {
   // First we need to grab the pos(lsb) from MCInst.
   int Pos = Inst.getOperand(2).getImm();
-  int Size = (int) Insn - Pos + 1;
+  if (Inst.getOpcode() == Mips::DINSU)
+    Pos += 32;
+  int Size;
+  if (Inst.getOpcode() == Mips::DINSM ||
+      Inst.getOpcode() == Mips::DINSU)
+    Size = (int) Insn - Pos + 33;
+  else
+    Size = (int) Insn - Pos + 1;
   Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Size)));
   return MCDisassembler::Success;
 }
@@ -2363,7 +2367,6 @@ static DecodeStatus DecodeRegListOperand16(MCInst &Inst, unsigned Insn,
 
 static DecodeStatus DecodeMovePRegPair(MCInst &Inst, unsigned Insn,
                                        uint64_t Address, const void *Decoder) {
-
   unsigned RegPair = fieldFromInstruction(Insn, 7, 3);
 
   switch (RegPair) {
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
index 932d38a0b9fe29cf58c45de991b406af270036b5..4a2b75b9ae4602736bed1fe28f504000cc74fd8d 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
@@ -1,4 +1,4 @@
-//===-- MipsABIFlagsSection.cpp - Mips ELF ABI Flags Section ---*- C++ -*--===//
+//===- MipsABIFlagsSection.cpp - Mips ELF ABI Flags Section ---------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,7 +7,11 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MipsABIFlagsSection.h"
+#include "MCTargetDesc/MipsABIFlagsSection.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MipsABIFlags.h"
 
 using namespace llvm;
 
@@ -51,6 +55,7 @@ uint8_t MipsABIFlagsSection::getCPR1SizeValue() {
 }
 
 namespace llvm {
+
 MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection) {
   // Write out a Elf_Internal_ABIFlags_v0 struct
   OS.EmitIntValue(ABIFlagsSection.getVersionValue(), 2);      // version
@@ -66,4 +71,5 @@ MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection) {
   OS.EmitIntValue(ABIFlagsSection.getFlags2Value(), 4);       // flags2
   return OS;
 }
-}
+
+} // end namespace llvm
diff --git a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
index 3966cae9fe33db89df7935116603d178a11b9385..f3854102702310ab0704dffad9e93f6ba6f5cd52 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.h
@@ -1,4 +1,4 @@
-//===-- MipsABIFlagsSection.h - Mips ELF ABI Flags Section -----*- C++ -*--===//
+//===- MipsABIFlagsSection.h - Mips ELF ABI Flags Section -------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,9 +10,10 @@
 #ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIFLAGSSECTION_H
 #define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIFLAGSSECTION_H
 
-#include "llvm/MC/MCStreamer.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MipsABIFlags.h"
+#include <cstdint>
 
 namespace llvm {
 
@@ -23,36 +24,32 @@ struct MipsABIFlagsSection {
   enum class FpABIKind { ANY, XX, S32, S64, SOFT };
 
   // Version of flags structure.
-  uint16_t Version;
+  uint16_t Version = 0;
   // The level of the ISA: 1-5, 32, 64.
-  uint8_t ISALevel;
+  uint8_t ISALevel = 0;
   // The revision of ISA: 0 for MIPS V and below, 1-n otherwise.
-  uint8_t ISARevision;
+  uint8_t ISARevision = 0;
   // The size of general purpose registers.
-  Mips::AFL_REG GPRSize;
+  Mips::AFL_REG GPRSize = Mips::AFL_REG_NONE;
   // The size of co-processor 1 registers.
-  Mips::AFL_REG CPR1Size;
+  Mips::AFL_REG CPR1Size = Mips::AFL_REG_NONE;
   // The size of co-processor 2 registers.
-  Mips::AFL_REG CPR2Size;
+  Mips::AFL_REG CPR2Size = Mips::AFL_REG_NONE;
   // Processor-specific extension.
-  Mips::AFL_EXT ISAExtension;
+  Mips::AFL_EXT ISAExtension = Mips::AFL_EXT_NONE;
   // Mask of ASEs used.
-  uint32_t ASESet;
+  uint32_t ASESet = 0;
 
-  bool OddSPReg;
+  bool OddSPReg = false;
 
-  bool Is32BitABI;
+  bool Is32BitABI = false;
 
 protected:
   // The floating-point ABI.
-  FpABIKind FpABI;
+  FpABIKind FpABI = FpABIKind::ANY;
 
 public:
-  MipsABIFlagsSection()
-      : Version(0), ISALevel(0), ISARevision(0), GPRSize(Mips::AFL_REG_NONE),
-        CPR1Size(Mips::AFL_REG_NONE), CPR2Size(Mips::AFL_REG_NONE),
-        ISAExtension(Mips::AFL_EXT_NONE), ASESet(0), OddSPReg(false),
-        Is32BitABI(false), FpABI(FpABIKind::ANY) {}
+  MipsABIFlagsSection() = default;
 
   uint16_t getVersionValue() { return (uint16_t)Version; }
   uint8_t getISALevelValue() { return (uint8_t)ISALevel; }
@@ -80,6 +77,7 @@ public:
     FpABI = Value;
     Is32BitABI = IsABI32Bit;
   }
+
   StringRef getFpABIString(FpABIKind Value);
 
   template <class PredicateLibrary>
@@ -195,6 +193,7 @@ public:
 };
 
 MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection);
-}
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSABIFLAGSSECTION_H
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
index 38b11f78e36d928bb1cf3abe13720b46e0b34035..3304449efb91ee2a486eb789fb46c31a022260d1 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp
@@ -34,7 +34,7 @@ using namespace llvm;
 
 // Prepare value for the target space for it
 static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
-                                 MCContext *Ctx = nullptr) {
+                                 MCContext &Ctx) {
 
   unsigned Kind = Fixup.getKind();
 
@@ -74,8 +74,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     // address range. Forcing a signed division because Value can be negative.
     Value = (int64_t)Value / 4;
     // We now check if Value can be encoded as a 16-bit signed immediate.
-    if (!isInt<16>(Value) && Ctx) {
-      Ctx->reportError(Fixup.getLoc(), "out of range PC16 fixup");
+    if (!isInt<16>(Value)) {
+      Ctx.reportError(Fixup.getLoc(), "out of range PC16 fixup");
       return 0;
     }
     break;
@@ -84,8 +84,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     // Forcing a signed division because Value can be negative.
     Value = (int64_t)Value / 4;
     // We now check if Value can be encoded as a 19-bit signed immediate.
-    if (!isInt<19>(Value) && Ctx) {
-      Ctx->reportError(Fixup.getLoc(), "out of range PC19 fixup");
+    if (!isInt<19>(Value)) {
+      Ctx.reportError(Fixup.getLoc(), "out of range PC19 fixup");
       return 0;
     }
     break;
@@ -121,8 +121,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     // Forcing a signed division because Value can be negative.
     Value = (int64_t) Value / 2;
     // We now check if Value can be encoded as a 7-bit signed immediate.
-    if (!isInt<7>(Value) && Ctx) {
-      Ctx->reportError(Fixup.getLoc(), "out of range PC7 fixup");
+    if (!isInt<7>(Value)) {
+      Ctx.reportError(Fixup.getLoc(), "out of range PC7 fixup");
       return 0;
     }
     break;
@@ -131,8 +131,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     // Forcing a signed division because Value can be negative.
     Value = (int64_t) Value / 2;
     // We now check if Value can be encoded as a 10-bit signed immediate.
-    if (!isInt<10>(Value) && Ctx) {
-      Ctx->reportError(Fixup.getLoc(), "out of range PC10 fixup");
+    if (!isInt<10>(Value)) {
+      Ctx.reportError(Fixup.getLoc(), "out of range PC10 fixup");
       return 0;
     }
     break;
@@ -141,8 +141,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     // Forcing a signed division because Value can be negative.
     Value = (int64_t)Value / 2;
     // We now check if Value can be encoded as a 16-bit signed immediate.
-    if (!isInt<16>(Value) && Ctx) {
-      Ctx->reportError(Fixup.getLoc(), "out of range PC16 fixup");
+    if (!isInt<16>(Value)) {
+      Ctx.reportError(Fixup.getLoc(), "out of range PC16 fixup");
       return 0;
     }
     break;
@@ -150,21 +150,21 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     // Forcing a signed division because Value can be negative.
     Value = (int64_t)Value / 8;
     // We now check if Value can be encoded as a 18-bit signed immediate.
-    if (!isInt<18>(Value) && Ctx) {
-      Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup");
+    if (!isInt<18>(Value)) {
+      Ctx.reportError(Fixup.getLoc(), "out of range PC18 fixup");
       return 0;
     }
     break;
   case Mips::fixup_MICROMIPS_PC18_S3:
     // Check alignment.
-    if ((Value & 7) && Ctx) {
-      Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup");
+    if ((Value & 7)) {
+      Ctx.reportError(Fixup.getLoc(), "out of range PC18 fixup");
     }
     // Forcing a signed division because Value can be negative.
     Value = (int64_t)Value / 8;
     // We now check if Value can be encoded as a 18-bit signed immediate.
-    if (!isInt<18>(Value) && Ctx) {
-      Ctx->reportError(Fixup.getLoc(), "out of range PC18 fixup");
+    if (!isInt<18>(Value)) {
+      Ctx.reportError(Fixup.getLoc(), "out of range PC18 fixup");
       return 0;
     }
     break;
@@ -172,8 +172,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     // Forcing a signed division because Value can be negative.
     Value = (int64_t) Value / 4;
     // We now check if Value can be encoded as a 21-bit signed immediate.
-    if (!isInt<21>(Value) && Ctx) {
-      Ctx->reportError(Fixup.getLoc(), "out of range PC21 fixup");
+    if (!isInt<21>(Value)) {
+      Ctx.reportError(Fixup.getLoc(), "out of range PC21 fixup");
       return 0;
     }
     break;
@@ -181,8 +181,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     // Forcing a signed division because Value can be negative.
     Value = (int64_t) Value / 4;
     // We now check if Value can be encoded as a 26-bit signed immediate.
-    if (!isInt<26>(Value) && Ctx) {
-      Ctx->reportError(Fixup.getLoc(), "out of range PC26 fixup");
+    if (!isInt<26>(Value)) {
+      Ctx.reportError(Fixup.getLoc(), "out of range PC26 fixup");
       return 0;
     }
     break;
@@ -190,8 +190,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     // Forcing a signed division because Value can be negative.
     Value = (int64_t)Value / 2;
     // We now check if Value can be encoded as a 26-bit signed immediate.
-    if (!isInt<26>(Value) && Ctx) {
-      Ctx->reportFatalError(Fixup.getLoc(), "out of range PC26 fixup");
+    if (!isInt<26>(Value)) {
+      Ctx.reportFatalError(Fixup.getLoc(), "out of range PC26 fixup");
       return 0;
     }
     break;
@@ -199,8 +199,8 @@ static unsigned adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
     // Forcing a signed division because Value can be negative.
     Value = (int64_t)Value / 2;
     // We now check if Value can be encoded as a 21-bit signed immediate.
-    if (!isInt<21>(Value) && Ctx) {
-      Ctx->reportError(Fixup.getLoc(), "out of range PC21 fixup");
+    if (!isInt<21>(Value)) {
+      Ctx.reportError(Fixup.getLoc(), "out of range PC21 fixup");
       return 0;
     }
     break;
@@ -236,10 +236,10 @@ static unsigned calculateMMLEIndex(unsigned i) {
 /// data fragment, at the offset specified by the fixup and following the
 /// fixup kind as appropriate.
 void MipsAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
-                                unsigned DataSize, uint64_t Value,
-                                bool IsPCRel) const {
+                                unsigned DataSize, uint64_t Value, bool IsPCRel,
+                                MCContext &Ctx) const {
   MCFixupKind Kind = Fixup.getKind();
-  Value = adjustFixupValue(Fixup, Value);
+  Value = adjustFixupValue(Fixup, Value, Ctx);
 
   if (!Value)
     return; // Doesn't change encoding.
@@ -471,24 +471,6 @@ bool MipsAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
   return true;
 }
 
-/// processFixupValue - Target hook to process the literal value of a fixup
-/// if necessary.
-void MipsAsmBackend::processFixupValue(const MCAssembler &Asm,
-                                       const MCAsmLayout &Layout,
-                                       const MCFixup &Fixup,
-                                       const MCFragment *DF,
-                                       const MCValue &Target,
-                                       uint64_t &Value,
-                                       bool &IsResolved) {
-  // At this point we'll ignore the value returned by adjustFixupValue as
-  // we are only checking if the fixup can be applied correctly. We have
-  // access to MCContext from here which allows us to report a fatal error
-  // with *possibly* a source code location.
-  // The caller will also ignore any changes we make to Value
-  // (recordRelocation() overwrites it with it's own calculation).
-  (void)adjustFixupValue(Fixup, Value, &Asm.getContext());
-}
-
 // MCAsmBackend
 MCAsmBackend *llvm::createMipsAsmBackendEL32(const Target &T,
                                              const MCRegisterInfo &MRI,
diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index f260cfa566c9026bacbe1b8802f94d0416e2047f..4b3cc6e21f4cd15c87bb48a1966b22c39268cc78 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -39,7 +39,7 @@ public:
   MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
 
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value, bool IsPCRel) const override;
+                  uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
 
   Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
   const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
@@ -82,11 +82,6 @@ public:
 
   bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
 
-  void processFixupValue(const MCAssembler &Asm, const MCAsmLayout &Layout,
-                         const MCFixup &Fixup, const MCFragment *DF,
-                         const MCValue &Target, uint64_t &Value,
-                         bool &IsResolved) override;
-
 }; // class MipsAsmBackend
 
 } // namespace
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index e7d687e89a8ac3d5372a12c18d0ffd1a8a303416..4eeccc3995fd42f7c92ff7cbbf006f0ed671475d 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -8,9 +8,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "MipsELFStreamer.h"
+#include "MipsOptionRecord.h"
 #include "MipsTargetStreamer.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ELF.h"
 
 using namespace llvm;
@@ -51,7 +55,7 @@ void MipsELFStreamer::createPendingLabelRelocs() {
   Labels.clear();
 }
 
-void MipsELFStreamer::EmitLabel(MCSymbol *Symbol) {
-  MCELFStreamer::EmitLabel(Symbol);
+void MipsELFStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
+  MCELFStreamer::EmitLabel(Symbol, Loc);
   Labels.push_back(Symbol);
 }
diff --git a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
index a241cdebdcc88dfa1243ea8e8fb95797b306917c..72cde1c908453d09ee79c628947a0c43c97f054f 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -1,4 +1,4 @@
-//===-------- MipsELFStreamer.h - ELF Object Output -----------------------===//
+//===- MipsELFStreamer.h - ELF Object Output --------------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -21,6 +21,7 @@
 #include <memory>
 
 namespace llvm {
+
 class MCAsmBackend;
 class MCCodeEmitter;
 class MCContext;
@@ -31,12 +32,10 @@ class MipsELFStreamer : public MCELFStreamer {
   MipsRegInfoRecord *RegInfoRecord;
   SmallVector<MCSymbol*, 4> Labels;
 
-
 public:
   MipsELFStreamer(MCContext &Context, MCAsmBackend &MAB, raw_pwrite_stream &OS,
                   MCCodeEmitter *Emitter)
       : MCELFStreamer(Context, MAB, OS, Emitter) {
-
     RegInfoRecord = new MipsRegInfoRecord(this, Context);
     MipsOptionRecords.push_back(
         std::unique_ptr<MipsRegInfoRecord>(RegInfoRecord));
@@ -51,7 +50,7 @@ public:
   /// Overriding this function allows us to record all labels that should be
   /// marked as microMIPS. Based on this data marking is done in
   /// EmitInstruction.
-  void EmitLabel(MCSymbol *Symbol) override;
+  void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
 
   /// Overriding this function allows us to dismiss all labels that are
   /// candidates for marking as microMIPS when .section directive is processed.
@@ -72,5 +71,6 @@ public:
 MCELFStreamer *createMipsELFStreamer(MCContext &Context, MCAsmBackend &MAB,
                                      raw_pwrite_stream &OS,
                                      MCCodeEmitter *Emitter, bool RelaxAll);
-} // namespace llvm.
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSELFSTREAMER_H
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index a44a35f49e5fca3d20ed5db8f5fb517e28400000..ebe3c578488826fa6d7d029e326d14e1dd75b176 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -19,9 +19,7 @@ using namespace llvm;
 void MipsMCAsmInfo::anchor() { }
 
 MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple) {
-  if ((TheTriple.getArch() == Triple::mips) ||
-      (TheTriple.getArch() == Triple::mips64))
-    IsLittleEndian = false;
+  IsLittleEndian = TheTriple.isLittleEndian();
 
   if ((TheTriple.getArch() == Triple::mips64el) ||
       (TheTriple.getArch() == Triple::mips64)) {
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 0614316d5ac7a483d6262d65d1df2b852b14e9cc..5685f0426e9b4035f9a2f40c32ef29bc4bc1dd0f 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -10,22 +10,29 @@
 // This file implements the MipsMCCodeEmitter class.
 //
 //===----------------------------------------------------------------------===//
-//
 
-#include "MipsMCCodeEmitter.h"
 #include "MCTargetDesc/MipsFixupKinds.h"
 #include "MCTargetDesc/MipsMCExpr.h"
 #include "MCTargetDesc/MipsMCTargetDesc.h"
+#include "MipsMCCodeEmitter.h"
 #include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCFixup.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
 
 #define DEBUG_TYPE "mccodeemitter"
 
@@ -34,6 +41,7 @@
 #undef GET_INSTRMAP_INFO
 
 namespace llvm {
+
 MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
                                          const MCRegisterInfo &MRI,
                                          MCContext &Ctx) {
@@ -45,12 +53,12 @@ MCCodeEmitter *createMipsMCCodeEmitterEL(const MCInstrInfo &MCII,
                                          MCContext &Ctx) {
   return new MipsMCCodeEmitter(MCII, Ctx, true);
 }
-} // End of namespace llvm.
+
+} // end namespace llvm
 
 // If the D<shift> instruction has a shift amount that is greater
 // than 31 (checked in calling routine), lower it to a D<shift>32 instruction
 static void LowerLargeShift(MCInst& Inst) {
-
   assert(Inst.getNumOperands() == 3 && "Invalid no. of operands for shift!");
   assert(Inst.getOperand(2).isImm());
 
@@ -103,24 +111,25 @@ static void LowerDins(MCInst& InstIn) {
   assert(InstIn.getOperand(3).isImm());
   int64_t size = InstIn.getOperand(3).getImm();
 
-  if (size <= 32) {
-    if (pos < 32)  // DINS, do nothing
-      return;
+  assert((pos + size) <= 64 &&
+         "DINS cannot have position plus size over 64");
+  if (pos < 32) {
+    if ((pos + size) > 0 && (pos + size) <= 32)
+      return; // DINS, do nothing
+    else if ((pos + size) > 32) {
+      // DINSM
+      InstIn.getOperand(3).setImm(size - 32);
+      InstIn.setOpcode(Mips::DINSM);
+    }
+  } else if ((pos + size) > 32 && (pos + size) <= 64) {
     // DINSU
     InstIn.getOperand(2).setImm(pos - 32);
     InstIn.setOpcode(Mips::DINSU);
-    return;
   }
-  // DINSM
-  assert(pos < 32 && "DINS cannot have both size and pos > 32");
-  InstIn.getOperand(3).setImm(size - 32);
-  InstIn.setOpcode(Mips::DINSM);
-  return;
 }
 
 // Fix a bad compact branch encoding for beqc/bnec.
 void MipsMCCodeEmitter::LowerCompactBranch(MCInst& Inst) const {
-
   // Encoding may be illegal !(rs < rt), but this situation is
   // easily fixed.
   unsigned RegOp0 = Inst.getOperand(0).getReg();
@@ -146,7 +155,6 @@ void MipsMCCodeEmitter::LowerCompactBranch(MCInst& Inst) const {
 
   Inst.getOperand(0).setReg(RegOp1);
   Inst.getOperand(1).setReg(RegOp0);
-
 }
 
 bool MipsMCCodeEmitter::isMicroMips(const MCSubtargetInfo &STI) const {
@@ -186,7 +194,6 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
                   SmallVectorImpl<MCFixup> &Fixups,
                   const MCSubtargetInfo &STI) const
 {
-
   // Non-pseudo instructions that get changed for direct object
   // only based on operand values.
   // If this list of instructions get much longer we will move
@@ -272,7 +279,6 @@ unsigned MipsMCCodeEmitter::
 getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
                        SmallVectorImpl<MCFixup> &Fixups,
                        const MCSubtargetInfo &STI) const {
-
   const MCOperand &MO = MI.getOperand(OpNo);
 
   // If the destination is an immediate, divide by 4.
@@ -295,7 +301,6 @@ unsigned MipsMCCodeEmitter::
 getBranchTargetOpValue1SImm16(const MCInst &MI, unsigned OpNo,
                               SmallVectorImpl<MCFixup> &Fixups,
                               const MCSubtargetInfo &STI) const {
-
   const MCOperand &MO = MI.getOperand(OpNo);
 
   // If the destination is an immediate, divide by 2.
@@ -318,7 +323,6 @@ unsigned MipsMCCodeEmitter::
 getBranchTargetOpValueMMR6(const MCInst &MI, unsigned OpNo,
                            SmallVectorImpl<MCFixup> &Fixups,
                            const MCSubtargetInfo &STI) const {
-
   const MCOperand &MO = MI.getOperand(OpNo);
 
   // If the destination is an immediate, divide by 2.
@@ -342,7 +346,6 @@ unsigned MipsMCCodeEmitter::
 getBranchTargetOpValueLsl2MMR6(const MCInst &MI, unsigned OpNo,
                                SmallVectorImpl<MCFixup> &Fixups,
                                const MCSubtargetInfo &STI) const {
-
   const MCOperand &MO = MI.getOperand(OpNo);
 
   // If the destination is an immediate, divide by 4.
@@ -366,7 +369,6 @@ unsigned MipsMCCodeEmitter::
 getBranchTarget7OpValueMM(const MCInst &MI, unsigned OpNo,
                           SmallVectorImpl<MCFixup> &Fixups,
                           const MCSubtargetInfo &STI) const {
-
   const MCOperand &MO = MI.getOperand(OpNo);
 
   // If the destination is an immediate, divide by 2.
@@ -388,7 +390,6 @@ unsigned MipsMCCodeEmitter::
 getBranchTargetOpValueMMPC10(const MCInst &MI, unsigned OpNo,
                              SmallVectorImpl<MCFixup> &Fixups,
                              const MCSubtargetInfo &STI) const {
-
   const MCOperand &MO = MI.getOperand(OpNo);
 
   // If the destination is an immediate, divide by 2.
@@ -410,7 +411,6 @@ unsigned MipsMCCodeEmitter::
 getBranchTargetOpValueMM(const MCInst &MI, unsigned OpNo,
                          SmallVectorImpl<MCFixup> &Fixups,
                          const MCSubtargetInfo &STI) const {
-
   const MCOperand &MO = MI.getOperand(OpNo);
 
   // If the destination is an immediate, divide by 2.
@@ -433,7 +433,6 @@ unsigned MipsMCCodeEmitter::
 getBranchTarget21OpValue(const MCInst &MI, unsigned OpNo,
                          SmallVectorImpl<MCFixup> &Fixups,
                          const MCSubtargetInfo &STI) const {
-
   const MCOperand &MO = MI.getOperand(OpNo);
 
   // If the destination is an immediate, divide by 4.
@@ -456,7 +455,6 @@ unsigned MipsMCCodeEmitter::
 getBranchTarget21OpValueMM(const MCInst &MI, unsigned OpNo,
                            SmallVectorImpl<MCFixup> &Fixups,
                            const MCSubtargetInfo &STI) const {
-
   const MCOperand &MO = MI.getOperand(OpNo);
 
   // If the destination is an immediate, divide by 4.
@@ -479,7 +477,6 @@ unsigned MipsMCCodeEmitter::
 getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo,
                          SmallVectorImpl<MCFixup> &Fixups,
                          const MCSubtargetInfo &STI) const {
-
   const MCOperand &MO = MI.getOperand(OpNo);
 
   // If the destination is an immediate, divide by 4.
@@ -501,7 +498,6 @@ getBranchTarget26OpValue(const MCInst &MI, unsigned OpNo,
 unsigned MipsMCCodeEmitter::getBranchTarget26OpValueMM(
     const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups,
     const MCSubtargetInfo &STI) const {
-
   const MCOperand &MO = MI.getOperand(OpNo);
 
   // If the destination is an immediate, divide by 2.
@@ -525,7 +521,6 @@ unsigned MipsMCCodeEmitter::
 getJumpOffset16OpValue(const MCInst &MI, unsigned OpNo,
                        SmallVectorImpl<MCFixup> &Fixups,
                        const MCSubtargetInfo &STI) const {
-
   const MCOperand &MO = MI.getOperand(OpNo);
 
   if (MO.isImm()) return MO.getImm();
@@ -544,7 +539,6 @@ unsigned MipsMCCodeEmitter::
 getJumpTargetOpValue(const MCInst &MI, unsigned OpNo,
                      SmallVectorImpl<MCFixup> &Fixups,
                      const MCSubtargetInfo &STI) const {
-
   const MCOperand &MO = MI.getOperand(OpNo);
   // If the destination is an immediate, divide by 4.
   if (MO.isImm()) return MO.getImm()>>2;
@@ -562,7 +556,6 @@ unsigned MipsMCCodeEmitter::
 getJumpTargetOpValueMM(const MCInst &MI, unsigned OpNo,
                        SmallVectorImpl<MCFixup> &Fixups,
                        const MCSubtargetInfo &STI) const {
-
   const MCOperand &MO = MI.getOperand(OpNo);
   // If the destination is an immediate, divide by 2.
   if (MO.isImm()) return MO.getImm() >> 1;
@@ -580,7 +573,6 @@ unsigned MipsMCCodeEmitter::
 getUImm5Lsl2Encoding(const MCInst &MI, unsigned OpNo,
                      SmallVectorImpl<MCFixup> &Fixups,
                      const MCSubtargetInfo &STI) const {
-
   const MCOperand &MO = MI.getOperand(OpNo);
   if (MO.isImm()) {
     // The immediate is encoded as 'immediate << 2'.
@@ -599,7 +591,6 @@ unsigned MipsMCCodeEmitter::
 getSImm3Lsa2Value(const MCInst &MI, unsigned OpNo,
                   SmallVectorImpl<MCFixup> &Fixups,
                   const MCSubtargetInfo &STI) const {
-
   const MCOperand &MO = MI.getOperand(OpNo);
   if (MO.isImm()) {
     int Value = MO.getImm();
@@ -613,7 +604,6 @@ unsigned MipsMCCodeEmitter::
 getUImm6Lsl2Encoding(const MCInst &MI, unsigned OpNo,
                      SmallVectorImpl<MCFixup> &Fixups,
                      const MCSubtargetInfo &STI) const {
-
   const MCOperand &MO = MI.getOperand(OpNo);
   if (MO.isImm()) {
     unsigned Value = MO.getImm();
@@ -627,7 +617,6 @@ unsigned MipsMCCodeEmitter::
 getSImm9AddiuspValue(const MCInst &MI, unsigned OpNo,
                      SmallVectorImpl<MCFixup> &Fixups,
                      const MCSubtargetInfo &STI) const {
-
   const MCOperand &MO = MI.getOperand(OpNo);
   if (MO.isImm()) {
     unsigned Binary = (MO.getImm() >> 2) & 0x0000ffff;
@@ -711,7 +700,7 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
     case MipsMCExpr::MEK_GPREL:
       FixupKind = Mips::fixup_Mips_GPREL16;
       break;
-    case MipsMCExpr::MEK_LO: {
+    case MipsMCExpr::MEK_LO:
       // Check for %lo(%neg(%gp_rel(X)))
       if (MipsExpr->isGpOff()) {
         FixupKind = Mips::fixup_Mips_GPOFF_LO;
@@ -720,7 +709,6 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
       FixupKind = isMicroMips(STI) ? Mips::fixup_MICROMIPS_LO16
                                    : Mips::fixup_Mips_LO16;
       break;
-    }
     case MipsMCExpr::MEK_HIGHEST:
       FixupKind = Mips::fixup_Mips_HIGHEST;
       break;
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
index 2d041dcbf0408b34dd637655d9d2b6233ae9808b..d12d3195521aabba6b1f3f39ed101cb2f9b801e8 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -1,4 +1,4 @@
-//===-- MipsMCCodeEmitter.h - Convert Mips Code to Machine Code -----------===//
+//===- MipsMCCodeEmitter.h - Convert Mips Code to Machine Code --*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,29 +10,25 @@
 // This file defines the MipsMCCodeEmitter class.
 //
 //===----------------------------------------------------------------------===//
-//
 
 #ifndef LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCCODEEMITTER_H
 #define LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCCODEEMITTER_H
 
 #include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/Support/DataTypes.h"
-
-using namespace llvm;
+#include <cstdint>
 
 namespace llvm {
+
 class MCContext;
 class MCExpr;
+class MCFixup;
 class MCInst;
 class MCInstrInfo;
-class MCFixup;
 class MCOperand;
 class MCSubtargetInfo;
 class raw_ostream;
 
 class MipsMCCodeEmitter : public MCCodeEmitter {
-  MipsMCCodeEmitter(const MipsMCCodeEmitter &) = delete;
-  void operator=(const MipsMCCodeEmitter &) = delete;
   const MCInstrInfo &MCII;
   MCContext &Ctx;
   bool IsLittleEndian;
@@ -43,8 +39,9 @@ class MipsMCCodeEmitter : public MCCodeEmitter {
 public:
   MipsMCCodeEmitter(const MCInstrInfo &mcii, MCContext &Ctx_, bool IsLittle)
       : MCII(mcii), Ctx(Ctx_), IsLittleEndian(IsLittle) {}
-
-  ~MipsMCCodeEmitter() override {}
+  MipsMCCodeEmitter(const MipsMCCodeEmitter &) = delete;
+  MipsMCCodeEmitter &operator=(const MipsMCCodeEmitter &) = delete;
+  ~MipsMCCodeEmitter() override = default;
 
   void EmitByte(unsigned char C, raw_ostream &OS) const;
 
@@ -270,9 +267,11 @@ public:
   unsigned getRegisterListOpValue16(const MCInst &MI, unsigned OpNo,
                                     SmallVectorImpl<MCFixup> &Fixups,
                                     const MCSubtargetInfo &STI) const;
-  private:
+
+private:
   void LowerCompactBranch(MCInst& Inst) const;
-}; // class MipsMCCodeEmitter
-} // namespace llvm.
+};
+
+} // end namespace llvm
 
-#endif
+#endif // LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCCODEEMITTER_H
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
index 082bb87fcb8a1a3e9621802b7da1c3008e2b76d6..be04480044d48268b3627a265641011cbe42b846 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.cpp
@@ -11,9 +11,15 @@
 #include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCObjectStreamer.h"
 #include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ELF.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cstdint>
 
 using namespace llvm;
 
diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
index d1a4334ec640ad5770dd6ac2e7e988bec2e8bdf1..495d525ccff42d92f7c76a83737d144f7c18570e 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
+++ b/lib/Target/Mips/MCTargetDesc/MipsMCExpr.h
@@ -1,4 +1,4 @@
-//===-- MipsMCExpr.h - Mips specific MC expression classes ------*- C++ -*-===//
+//===- MipsMCExpr.h - Mips specific MC expression classes -------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -70,6 +70,7 @@ public:
   bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
                                  const MCFixup *Fixup) const override;
   void visitUsedExpr(MCStreamer &Streamer) const override;
+
   MCFragment *findAssociatedFragment() const override {
     return getSubExpr()->findAssociatedFragment();
   }
@@ -86,6 +87,7 @@ public:
     return isGpOff(Kind);
   }
 };
+
 } // end namespace llvm
 
-#endif
+#endif // LLVM_LIB_TARGET_MIPS_MCTARGETDESC_MIPSMCEXPR_H
diff --git a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
index aef9bd3a8e2a68328527ab0e71316deca92f9784..8c2617a687b8f813603933cb3039b5002069ec90 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
@@ -20,7 +20,11 @@
 #include "Mips.h"
 #include "MipsELFStreamer.h"
 #include "MipsMCNaCl.h"
+#include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCELFStreamer.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
 
 using namespace llvm;
 
@@ -38,14 +42,14 @@ class MipsNaClELFStreamer : public MipsELFStreamer {
 public:
   MipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB,
                       raw_pwrite_stream &OS, MCCodeEmitter *Emitter)
-      : MipsELFStreamer(Context, TAB, OS, Emitter), PendingCall(false) {}
+      : MipsELFStreamer(Context, TAB, OS, Emitter) {}
 
-  ~MipsNaClELFStreamer() override {}
+  ~MipsNaClELFStreamer() override = default;
 
 private:
   // Whether we started the sandboxing sequence for calls.  Calls are bundled
   // with branch delays and aligned to the bundle end.
-  bool PendingCall;
+  bool PendingCall = false;
 
   bool isIndirectJump(const MCInst &MI) {
     if (MI.getOpcode() == Mips::JALR) {
@@ -265,4 +269,4 @@ MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context, MCAsmBackend &TAB,
   return S;
 }
 
-}
+} // end namespace llvm
diff --git a/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
index 24b602810d6e6bd6721249bca9c5054f032cb532..74d5e4cc98419bddaf2e32fc09bcb9481dbfe636 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
@@ -1,4 +1,4 @@
-//===-- MipsOptionRecord.cpp - Abstraction for storing information --------===//
+//===- MipsOptionRecord.cpp - Abstraction for storing information ---------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,10 +7,16 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MipsOptionRecord.h"
+#include "MipsABIInfo.h"
 #include "MipsELFStreamer.h"
+#include "MipsOptionRecord.h"
 #include "MipsTargetStreamer.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSectionELF.h"
+#include "llvm/Support/ELF.h"
+#include <cassert>
 
 using namespace llvm;
 
diff --git a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 2ab5aea4bfa046f3e4114b537f075e3ba18a8641..2d4083b27ed17481e0c6f61659892de5b826f292 100644
--- a/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -11,6 +11,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "MCTargetDesc/MipsABIInfo.h"
 #include "MipsTargetStreamer.h"
 #include "InstPrinter/MipsInstPrinter.h"
 #include "MipsELFStreamer.h"
@@ -685,6 +686,17 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
   // issues as well.
   unsigned EFlags = MCA.getELFHeaderEFlags();
 
+  // FIXME: Fix a dependency issue by instantiating the ABI object to some
+  // default based off the triple. The triple doesn't describe the target
+  // fully, but any external user of the API that uses the MCTargetStreamer
+  // would otherwise crash on assertion failure.
+
+  ABI = MipsABIInfo(
+      STI.getTargetTriple().getArch() == Triple::ArchType::mipsel ||
+              STI.getTargetTriple().getArch() == Triple::ArchType::mips
+          ? MipsABIInfo::O32()
+          : MipsABIInfo::N64());
+
   // Architecture
   if (Features[Mips::FeatureMips64r6])
     EFlags |= ELF::EF_MIPS_ARCH_64R6;
@@ -726,14 +738,13 @@ MipsTargetELFStreamer::MipsTargetELFStreamer(MCStreamer &S,
 
 void MipsTargetELFStreamer::emitLabel(MCSymbol *S) {
   auto *Symbol = cast<MCSymbolELF>(S);
-  if (!isMicroMipsEnabled())
-    return;
   getStreamer().getAssembler().registerSymbol(*Symbol);
   uint8_t Type = Symbol->getType();
   if (Type != ELF::STT_FUNC)
     return;
 
-  Symbol->setOther(ELF::STO_MIPS_MICROMIPS);
+  if (isMicroMipsEnabled())
+    Symbol->setOther(ELF::STO_MIPS_MICROMIPS);
 }
 
 void MipsTargetELFStreamer::finish() {
@@ -903,10 +914,10 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) {
   const MCExpr *Size = MCBinaryExpr::createSub(
       MCSymbolRefExpr::create(CurPCSym, MCSymbolRefExpr::VK_None, Context),
       ExprRef, Context);
-  int64_t AbsSize;
-  if (!Size->evaluateAsAbsolute(AbsSize, MCA))
-    llvm_unreachable("Function size must be evaluatable as absolute");
-  Size = MCConstantExpr::create(AbsSize, Context);
+
+  // The ELFObjectWriter can determine the absolute size as it has access to
+  // the layout information of the assembly file, so a size expression rather
+  // than an absolute value is ok here.
   static_cast<MCSymbolELF *>(Sym)->setSize(Size);
 }
 
diff --git a/lib/Target/Mips/MicroMipsInstrInfo.td b/lib/Target/Mips/MicroMipsInstrInfo.td
index c0de9e7390a4cb5819d1a3fa451aad4312f21e67..ee554bc7f69a96ae4a4aaa22718bc59ee874129a 100644
--- a/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -1136,12 +1136,6 @@ let Predicates = [InMicroMips] in {
   def : MipsInstAlias<
           "sgtu $rs, $rt",
           (SLTu_MM GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
-  def : MipsInstAlias<"slt $rs, $rt, $imm",
-                      (SLTi_MM GPR32Opnd:$rs, GPR32Opnd:$rt,
-                               simm32_relaxed:$imm), 0>;
-  def : MipsInstAlias<"sltu $rs, $rt, $imm",
-                      (SLTiu_MM GPR32Opnd:$rs, GPR32Opnd:$rt,
-                                simm32_relaxed:$imm), 0>;
   def : MipsInstAlias<"sll $rd, $rt, $rs",
                       (SLLV_MM GPR32Opnd:$rd, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
   def : MipsInstAlias<"sra $rd, $rt, $rs",
@@ -1163,18 +1157,21 @@ let Predicates = [InMicroMips] in {
   def : MipsInstAlias<"rotr $rt, $imm",
                       (ROTR_MM GPR32Opnd:$rt, GPR32Opnd:$rt, uimm5:$imm), 0>;
   def : MipsInstAlias<"syscall", (SYSCALL_MM 0), 1>;
-  def : MipsInstAlias<"and $rs, $rt, $imm",
-                      (ANDi_MM GPR32Opnd:$rs, GPR32Opnd:$rt, simm16:$imm), 0>;
-  def : MipsInstAlias<"and $rs, $imm",
-                      (ANDi_MM GPR32Opnd:$rs, GPR32Opnd:$rs, simm16:$imm), 0>;
-  def : MipsInstAlias<"or $rs, $rt, $imm",
-                      (ORi_MM GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
-  def : MipsInstAlias<"or $rs, $imm",
-                      (ORi_MM GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>;
-  def : MipsInstAlias<"xor $rs, $rt, $imm",
-                      (XORi_MM GPR32Opnd:$rs, GPR32Opnd:$rt, uimm16:$imm), 0>;
-  def : MipsInstAlias<"xor $rs, $imm",
-                      (XORi_MM GPR32Opnd:$rs, GPR32Opnd:$rs, uimm16:$imm), 0>;
+
+  defm : OneOrTwoOperandMacroImmediateAlias<"add", ADDi_MM>;
+
+  defm : OneOrTwoOperandMacroImmediateAlias<"addu", ADDiu_MM>;
+
+  defm : OneOrTwoOperandMacroImmediateAlias<"and", ANDi_MM>;
+
+  defm : OneOrTwoOperandMacroImmediateAlias<"or", ORi_MM>;
+
+  defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi_MM>;
+
+  defm : OneOrTwoOperandMacroImmediateAlias<"slt", SLTi_MM>;
+
+  defm : OneOrTwoOperandMacroImmediateAlias<"sltu", SLTiu_MM>;
+
   def : MipsInstAlias<"not $rt, $rs",
                       (NOR_MM GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
   def : MipsInstAlias<"not $rt",
diff --git a/lib/Target/Mips/Mips16HardFloat.cpp b/lib/Target/Mips/Mips16HardFloat.cpp
index 191006d6463c8bb6f3011d930c18db6660156520..a71b161b24ccff001f3372b7913e76c148f059d4 100644
--- a/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/lib/Target/Mips/Mips16HardFloat.cpp
@@ -405,7 +405,7 @@ static bool fixupFPReturnAndCall(Function &F, Module *M,
           "__mips16_ret_dc"
         };
         const char *Name = Helper[RV];
-        AttributeSet A;
+        AttributeList A;
         Value *Params[] = {RVal};
         Modified = true;
         //
@@ -414,13 +414,13 @@ static bool fixupFPReturnAndCall(Function &F, Module *M,
         // during call setup, the proper call lowering to the helper
         // functions will take place.
         //
-        A = A.addAttribute(C, AttributeSet::FunctionIndex,
+        A = A.addAttribute(C, AttributeList::FunctionIndex,
                            "__Mips16RetHelper");
-        A = A.addAttribute(C, AttributeSet::FunctionIndex,
+        A = A.addAttribute(C, AttributeList::FunctionIndex,
                            Attribute::ReadNone);
-        A = A.addAttribute(C, AttributeSet::FunctionIndex,
+        A = A.addAttribute(C, AttributeList::FunctionIndex,
                            Attribute::NoInline);
-        Value *F = (M->getOrInsertFunction(Name, A, MyVoid, T, nullptr));
+        Value *F = (M->getOrInsertFunction(Name, A, MyVoid, T));
         CallInst::Create(F, Params, "", &I);
       } else if (const CallInst *CI = dyn_cast<CallInst>(&I)) {
         FunctionType *FT = CI->getFunctionType();
@@ -490,15 +490,15 @@ static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
 // remove the use-soft-float attribute
 //
 static void removeUseSoftFloat(Function &F) {
-  AttributeSet A;
+  AttributeList A;
   DEBUG(errs() << "removing -use-soft-float\n");
-  A = A.addAttribute(F.getContext(), AttributeSet::FunctionIndex,
+  A = A.addAttribute(F.getContext(), AttributeList::FunctionIndex,
                      "use-soft-float", "false");
-  F.removeAttributes(AttributeSet::FunctionIndex, A);
+  F.removeAttributes(AttributeList::FunctionIndex, A);
   if (F.hasFnAttribute("use-soft-float")) {
     DEBUG(errs() << "still has -use-soft-float\n");
   }
-  F.addAttributes(AttributeSet::FunctionIndex, A);
+  F.addAttributes(AttributeList::FunctionIndex, A);
 }
 
 
diff --git a/lib/Target/Mips/Mips16InstrInfo.td b/lib/Target/Mips/Mips16InstrInfo.td
index 021fb86786869f99fd8741d28072fd9801fa53af..52bf690a8083556bcb76191674fed6cc0f7af8ce 100644
--- a/lib/Target/Mips/Mips16InstrInfo.td
+++ b/lib/Target/Mips/Mips16InstrInfo.td
@@ -766,6 +766,7 @@ def JrRa16: FRR16_JALRC_RA_only_ins<0, 0, "jr", IIM16Alu> {
   let hasDelaySlot = 1;
   let isTerminator=1;
   let isBarrier=1;
+  let isReturn=1;
 }
 
 def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIM16Alu> {
@@ -773,6 +774,7 @@ def JrcRa16: FRR16_JALRC_RA_only_ins<1, 1, "jrc", IIM16Alu> {
   let isIndirectBranch = 1;
   let isTerminator=1;
   let isBarrier=1;
+  let isReturn=1;
 }
 
 def JrcRx16: FRR16_JALRC_ins<1, 1, 0, "jrc", IIM16Alu> {
diff --git a/lib/Target/Mips/Mips32r6InstrInfo.td b/lib/Target/Mips/Mips32r6InstrInfo.td
index 1b4d73b798958331c274ad6c2527cd2ab4b6f9f6..3272319ad50f4e7bac665b3d2afc938b24dc9b34 100644
--- a/lib/Target/Mips/Mips32r6InstrInfo.td
+++ b/lib/Target/Mips/Mips32r6InstrInfo.td
@@ -917,6 +917,12 @@ def : MipsInstAlias<"jrc $rs", (JIC GPR32Opnd:$rs, 0), 1>, ISA_MIPS32R6, GPR_32;
 let AdditionalPredicates = [NotInMicroMips] in {
 def : MipsInstAlias<"jalrc $rs", (JIALC GPR32Opnd:$rs, 0), 1>, ISA_MIPS32R6, GPR_32;
 }
+
+def : MipsInstAlias<"div $rs, $rt", (DIV GPR32Opnd:$rs, GPR32Opnd:$rs,
+                                         GPR32Opnd:$rt)>, ISA_MIPS32R6;
+def : MipsInstAlias<"divu $rs, $rt", (DIVU GPR32Opnd:$rs, GPR32Opnd:$rs,
+                                           GPR32Opnd:$rt)>, ISA_MIPS32R6;
+
 //===----------------------------------------------------------------------===//
 //
 // Patterns and Pseudo Instructions
diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td
index 03882dadbe1642201c19de78ba485d8ddedf0aba..99025fe1341dab1d069451462327c7d500e5e7c6 100644
--- a/lib/Target/Mips/Mips64InstrInfo.td
+++ b/lib/Target/Mips/Mips64InstrInfo.td
@@ -326,6 +326,14 @@ let AdditionalPredicates = [NotInMicroMips] in {
               EXT_FM<5>, ISA_MIPS64R2;
 }
 
+let isCodeGenOnly = 1, AdditionalPredicates = [NotInMicroMips] in {
+  def DEXT64_32 : InstSE<(outs GPR64Opnd:$rt),
+                         (ins GPR32Opnd:$rs, uimm5_report_uimm6:$pos,
+                              uimm5_plus1:$size),
+                         "dext $rt, $rs, $pos, $size", [], II_EXT, FrmR, "dext">,
+                  EXT_FM<3>, ISA_MIPS64R2;
+}
+
 let isCodeGenOnly = 1, rs = 0, shamt = 0 in {
   def DSLL64_32 : FR<0x00, 0x3c, (outs GPR64:$rd), (ins GPR32:$rt),
                      "dsll\t$rd, $rt, 32", [], II_DSLL>;
@@ -356,11 +364,11 @@ class Count1s<string opstr, RegisterOperand RO>:
   let TwoOperandAliasConstraint = "$rd = $rs";
 }
 
-class ExtsCins<string opstr, InstrItinClass itin,
-               SDPatternOperator Op = null_frag>:
-  InstSE<(outs GPR64Opnd:$rt), (ins GPR64Opnd:$rs, uimm5:$pos, uimm5:$lenm1),
-         !strconcat(opstr, " $rt, $rs, $pos, $lenm1"),
-         [(set GPR64Opnd:$rt, (Op GPR64Opnd:$rs, imm:$pos, imm:$lenm1))],
+class ExtsCins<string opstr, InstrItinClass itin, RegisterOperand RO,
+               PatFrag PosImm, SDPatternOperator Op = null_frag>:
+  InstSE<(outs RO:$rt), (ins RO:$rs, uimm5:$pos, uimm5:$lenm1),
+         !strconcat(opstr, "\t$rt, $rs, $pos, $lenm1"),
+         [(set RO:$rt, (Op RO:$rs, PosImm:$pos, imm:$lenm1))],
          itin, FrmR, opstr> {
   let TwoOperandAliasConstraint = "$rt = $rs";
 }
@@ -424,13 +432,28 @@ def DMUL  : ArithLogicR<"dmul", GPR64Opnd, 1, II_DMUL, mul>,
   let Defs = [HI0, LO0, P0, P1, P2];
 }
 
-// Extract a signed bit field /+32
-def EXTS  : ExtsCins<"exts", II_EXT>, EXTS_FM<0x3a>, ASE_CNMIPS;
-def EXTS32: ExtsCins<"exts32", II_EXT>, EXTS_FM<0x3b>, ASE_CNMIPS;
-
-// Clear and insert a bit field /+32
-def CINS  : ExtsCins<"cins", II_INS>, EXTS_FM<0x32>, ASE_CNMIPS;
-def CINS32: ExtsCins<"cins32", II_INS>, EXTS_FM<0x33>, ASE_CNMIPS;
+let AdditionalPredicates = [NotInMicroMips] in {
+  // Extract a signed bit field /+32
+  def EXTS  : ExtsCins<"exts", II_EXT, GPR64Opnd, immZExt5>, EXTS_FM<0x3a>,
+              ASE_MIPS64_CNMIPS;
+  def EXTS32: ExtsCins<"exts32", II_EXT, GPR64Opnd, immZExt5Plus32>,
+              EXTS_FM<0x3b>, ASE_MIPS64_CNMIPS;
+
+  // Clear and insert a bit field /+32
+  def CINS  : ExtsCins<"cins", II_INS, GPR64Opnd, immZExt5, MipsCIns>,
+              EXTS_FM<0x32>, ASE_MIPS64_CNMIPS;
+  def CINS32: ExtsCins<"cins32", II_INS, GPR64Opnd, immZExt5Plus32, MipsCIns>,
+              EXTS_FM<0x33>, ASE_MIPS64_CNMIPS;
+  let isCodeGenOnly = 1 in {
+    def CINS_i32 : ExtsCins<"cins", II_INS, GPR32Opnd, immZExt5, MipsCIns>,
+                   EXTS_FM<0x32>, ASE_MIPS64_CNMIPS;
+    def CINS64_32 :InstSE<(outs GPR64Opnd:$rt),
+                          (ins GPR32Opnd:$rs, uimm5:$pos, uimm5:$lenm1),
+                          "cins\t$rt, $rs, $pos, $lenm1", [], II_INS, FrmR,
+                          "cins">,
+                   EXTS_FM<0x32>, ASE_MIPS64_CNMIPS;
+  }
+}
 
 // Move to multiplier/product register
 def MTM0   : MoveToLOHI<"mtm0", GPR64Opnd, [MPL0, P0, P1, P2]>, MTMR_FM<0x08>,
@@ -646,6 +669,14 @@ def : MipsPat<(i64 (anyext GPR32:$src)),
 def : MipsPat<(i64 (zext GPR32:$src)), (DSRL (DSLL64_32 GPR32:$src), 32)>;
 def : MipsPat<(i64 (sext GPR32:$src)), (SLL64_32 GPR32:$src)>;
 
+let AdditionalPredicates = [NotInMicroMips] in {
+  def : MipsPat<(i64 (zext GPR32:$src)), (DEXT64_32 GPR32:$src, 0, 32)>,
+        ISA_MIPS64R2;
+  def : MipsPat<(i64 (zext (i32 (shl GPR32:$rt, immZExt5:$imm)))),
+                (CINS64_32 GPR32:$rt, imm:$imm, (immZExt5To31 imm:$imm))>,
+        ASE_MIPS64_CNMIPS;
+}
+
 // Sign extend in register
 def : MipsPat<(i64 (sext_inreg GPR64:$src, i32)),
               (SLL64_64 GPR64:$src)>;
@@ -707,6 +738,15 @@ let AdditionalPredicates = [NotInMicroMips] in {
   def : MipsInstAlias<"daddu $rs, $imm",
                       (DADDiu GPR64Opnd:$rs, GPR64Opnd:$rs, simm16_64:$imm),
                       0>, ISA_MIPS3;
+
+  defm : OneOrTwoOperandMacroImmediateAlias<"and", ANDi64, GPR64Opnd, imm64>,
+         GPR_64;
+
+  defm : OneOrTwoOperandMacroImmediateAlias<"or", ORi64, GPR64Opnd, imm64>,
+         GPR_64;
+
+  defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi64, GPR64Opnd, imm64>,
+         GPR_64;
 }
 def : MipsInstAlias<"dsll $rd, $rt, $rs",
                     (DSLLV GPR64Opnd:$rd, GPR64Opnd:$rt, GPR32Opnd:$rs), 0>,
@@ -787,21 +827,21 @@ def : MipsInstAlias<"bbit1 $rs, $p, $offset",
 def : MipsInstAlias<"exts $rt, $rs, $pos, $lenm1",
                     (EXTS32 GPR64Opnd:$rt, GPR64Opnd:$rs,
                             uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
-      ASE_CNMIPS;
+      ASE_MIPS64_CNMIPS;
 def : MipsInstAlias<"exts $rt, $pos, $lenm1",
                     (EXTS32 GPR64Opnd:$rt, GPR64Opnd:$rt,
                             uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
-      ASE_CNMIPS;
+      ASE_MIPS64_CNMIPS;
 
 // cins with $pos 32-63 in converted to cins32 with $pos 0-31
 def : MipsInstAlias<"cins $rt, $rs, $pos, $lenm1",
                     (CINS32 GPR64Opnd:$rt, GPR64Opnd:$rs,
                             uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
-      ASE_CNMIPS;
+      ASE_MIPS64_CNMIPS;
 def : MipsInstAlias<"cins $rt, $pos, $lenm1",
                     (CINS32 GPR64Opnd:$rt, GPR64Opnd:$rt,
                             uimm5_plus32_normalize:$pos, uimm5:$lenm1), 0>,
-      ASE_CNMIPS;
+      ASE_MIPS64_CNMIPS;
 
 //===----------------------------------------------------------------------===//
 // Assembler Pseudo Instructions
@@ -816,3 +856,81 @@ def LoadAddrReg64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rt), (ins mem:$addr),
                                        "dla\t$rt, $addr">;
 def LoadAddrImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rt), (ins imm64:$imm64),
                                        "dla\t$rt, $imm64">;
+
+def DMULImmMacro : MipsAsmPseudoInst<(outs), (ins GPR64Opnd:$rs, GPR64Opnd:$rt,
+                                                  simm32_relaxed:$imm),
+                                     "dmul\t$rs, $rt, $imm">,
+                   ISA_MIPS3_NOT_32R6_64R6;
+def DMULOMacro : MipsAsmPseudoInst<(outs), (ins GPR64Opnd:$rs, GPR64Opnd:$rt,
+                                                GPR64Opnd:$rd),
+                                   "dmulo\t$rs, $rt, $rd">,
+                 ISA_MIPS3_NOT_32R6_64R6;
+def DMULOUMacro : MipsAsmPseudoInst<(outs), (ins GPR64Opnd:$rs, GPR64Opnd:$rt,
+                                                 GPR64Opnd:$rd),
+                                    "dmulou\t$rs, $rt, $rd">,
+                  ISA_MIPS3_NOT_32R6_64R6;
+
+def DMULMacro : MipsAsmPseudoInst<(outs), (ins GPR64Opnd:$rs, GPR64Opnd:$rt,
+                                               GPR64Opnd:$rd),
+                                  "dmul\t$rs, $rt, $rd"> {
+  let InsnPredicates = [HasMips3, NotMips64r6, NotCnMips];
+}
+
+let AdditionalPredicates = [NotInMicroMips] in {
+  def DSDivMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+                                     (ins GPR64Opnd:$rs, GPR64Opnd:$rt),
+                                     "ddiv\t$rd, $rs, $rt">,
+                   ISA_MIPS3_NOT_32R6_64R6;
+  def DSDivIMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+                                      (ins GPR64Opnd:$rs, imm64:$imm),
+                                      "ddiv\t$rd, $rs, $imm">,
+                    ISA_MIPS3_NOT_32R6_64R6;
+  def DUDivMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+                                     (ins GPR64Opnd:$rs, GPR64Opnd:$rt),
+                                     "ddivu\t$rd, $rs, $rt">,
+                   ISA_MIPS3_NOT_32R6_64R6;
+  def DUDivIMacro : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+                                      (ins GPR64Opnd:$rs, imm64:$imm),
+                                      "ddivu\t$rd, $rs, $imm">,
+                    ISA_MIPS3_NOT_32R6_64R6;
+
+  // GAS expands 'div' and 'ddiv' differently when the destination
+  // register is $zero and the instruction is in the two operand
+  // form. 'ddiv' gets expanded, while 'div' is not expanded.
+
+  def : MipsInstAlias<"ddiv $rs, $rt", (DSDivMacro GPR64Opnd:$rs,
+                                               GPR64Opnd:$rs,
+                                               GPR64Opnd:$rt), 0>,
+        ISA_MIPS3_NOT_32R6_64R6;
+  def : MipsInstAlias<"ddiv $rd, $imm", (DSDivIMacro GPR64Opnd:$rd,
+                                                     GPR64Opnd:$rd,
+                                                     imm64:$imm), 0>,
+        ISA_MIPS3_NOT_32R6_64R6;
+
+  // GAS expands 'divu' and 'ddivu' differently when the destination
+  // register is $zero and the instruction is in the two operand
+  // form. 'ddivu' gets expanded, while 'divu' is not expanded.
+
+  def : MipsInstAlias<"ddivu $rt, $rs", (DUDivMacro GPR64Opnd:$rt,
+                                                    GPR64Opnd:$rt,
+                                                    GPR64Opnd:$rs), 0>,
+        ISA_MIPS3_NOT_32R6_64R6;
+  def : MipsInstAlias<"ddivu $rd, $imm", (DUDivIMacro GPR64Opnd:$rd,
+                                                      GPR64Opnd:$rd,
+                                                      imm64:$imm), 0>,
+        ISA_MIPS3_NOT_32R6_64R6;
+}
+
+def NORImm64 : NORIMM_DESC_BASE<GPR64Opnd, imm64>, GPR_64;
+def : MipsInstAlias<"nor\t$rs, $imm", (NORImm64 GPR64Opnd:$rs, GPR64Opnd:$rs,
+                                                imm64:$imm)>, GPR_64;
+def SLTImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rs),
+                                 (ins GPR64Opnd:$rt, imm64:$imm),
+                                 "slt\t$rs, $rt, $imm">, GPR_64;
+def : MipsInstAlias<"slt\t$rs, $imm", (SLTImm64 GPR64Opnd:$rs, GPR64Opnd:$rs,
+                                                imm64:$imm)>, GPR_64;
+def SLTUImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rs),
+                                  (ins GPR64Opnd:$rt, imm64:$imm),
+                                  "sltu\t$rs, $rt, $imm">, GPR_64;
+def : MipsInstAlias<"sltu\t$rs, $imm", (SLTUImm64 GPR64Opnd:$rs, GPR64Opnd:$rs,
+                                                  imm64:$imm)>, GPR_64;
diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp
index 04c5d96673faade5802f50d97e3c2a5ea5df1fe0..2a9d96205eb96e8d26111b7be22fc7b43494dc38 100644
--- a/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -39,6 +39,7 @@
 #include "llvm/MC/MCELFStreamer.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstBuilder.h"
 #include "llvm/MC/MCSection.h"
 #include "llvm/MC/MCSectionELF.h"
 #include "llvm/MC/MCSymbolELF.h"
@@ -79,6 +80,9 @@ bool MipsAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
     NaClAlignIndirectJumpTargets(MF);
 
   AsmPrinter::runOnMachineFunction(MF);
+
+  EmitXRayTable();
+
   return true;
 }
 
@@ -132,6 +136,7 @@ void MipsAsmPrinter::emitPseudoIndirectBranch(MCStreamer &OutStreamer,
 
 void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   MipsTargetStreamer &TS = getTargetStreamer();
+  unsigned Opc = MI->getOpcode();
   TS.forbidModuleDirective();
 
   if (MI->isDebugValue()) {
@@ -143,20 +148,20 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   }
 
   // If we just ended a constant pool, mark it as such.
-  if (InConstantPool && MI->getOpcode() != Mips::CONSTPOOL_ENTRY) {
+  if (InConstantPool && Opc != Mips::CONSTPOOL_ENTRY) {
     OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
     InConstantPool = false;
   }
-  if (MI->getOpcode() == Mips::CONSTPOOL_ENTRY) {
+  if (Opc == Mips::CONSTPOOL_ENTRY) {
     // CONSTPOOL_ENTRY - This instruction represents a floating
-    //constant pool in the function.  The first operand is the ID#
+    // constant pool in the function.  The first operand is the ID#
     // for this instruction, the second is the index into the
     // MachineConstantPool that this is, the third is the size in
     // bytes of this constant pool entry.
     // The required alignment is specified on the basic block holding this MI.
     //
     unsigned LabelId = (unsigned)MI->getOperand(0).getImm();
-    unsigned CPIdx   = (unsigned)MI->getOperand(1).getIndex();
+    unsigned CPIdx = (unsigned)MI->getOperand(1).getIndex();
 
     // If this is the first entry of the pool, mark it.
     if (!InConstantPool) {
@@ -174,6 +179,17 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
     return;
   }
 
+  switch (Opc) {
+  case Mips::PATCHABLE_FUNCTION_ENTER:
+    LowerPATCHABLE_FUNCTION_ENTER(*MI);
+    return;
+  case Mips::PATCHABLE_FUNCTION_EXIT:
+    LowerPATCHABLE_FUNCTION_EXIT(*MI);
+    return;
+  case Mips::PATCHABLE_TAIL_CALL:
+    LowerPATCHABLE_TAIL_CALL(*MI);
+    return;
+  }
 
   MachineBasicBlock::const_instr_iterator I = MI->getIterator();
   MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
@@ -1034,6 +1050,149 @@ void MipsAsmPrinter::EmitEndOfAsmFile(Module &M) {
   OutStreamer->SwitchSection(OutContext.getObjectFileInfo()->getTextSection());
 }
 
+void MipsAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) {
+  const uint8_t NoopsInSledCount = Subtarget->isGP64bit() ? 15 : 11;
+  // For mips32 we want to emit the following pattern:
+  //
+  // .Lxray_sled_N:
+  //   ALIGN
+  //   B .tmpN
+  //   11 NOP instructions (44 bytes)
+  // .tmpN
+  //   ADDIU T9, T9, 52
+  //
+  // We need the 44 bytes (11 instructions) because at runtime, we'd
+  // be patching over the full 48 bytes (12 instructions) with the following
+  // pattern:
+  //
+  //   ADDIU	SP, SP, -8
+  //   NOP
+  //   SW	RA, 4(SP)
+  //   SW       T9, 0(SP)
+  //   LUI      T9, %hi(__xray_FunctionEntry/Exit)
+  //   ORI      T9, T9, %lo(__xray_FunctionEntry/Exit)
+  //   LUI      T0, %hi(function_id)
+  //   JALR	T9
+  //   ORI	T0, T0, %lo(function_id)
+  //   LW	T9, 0(SP)
+  //   LW       RA, 4(SP)
+  //   ADDIU    SP, SP, 8
+  //
+  // We add 52 bytes to t9 because we want to adjust the function pointer to
+  // the actual start of function i.e. the address just after the noop sled.
+  // We do this because the gp displacement relocation is emitted at the start
+  // of the function, i.e. after the nop sled, and to correctly calculate the
+  // global offset table address, t9 must hold the address of the instruction
+  // containing the gp displacement relocation.
+  // FIXME: Is this correct for the static relocation model?
+  //
+  // For mips64 we want to emit the following pattern:
+  //
+  // .Lxray_sled_N:
+  //   ALIGN
+  //   B .tmpN
+  //   15 NOP instructions (60 bytes)
+  // .tmpN
+  //
+  // We need the 60 bytes (15 instructions) because at runtime, we'd
+  // be patching over the full 64 bytes (16 instructions) with the following
+  // pattern:
+  //
+  //   DADDIU   SP, SP, -16
+  //   NOP
+  //   SD       RA, 8(SP)
+  //   SD       T9, 0(SP)
+  //   LUI      T9, %highest(__xray_FunctionEntry/Exit)
+  //   ORI      T9, T9, %higher(__xray_FunctionEntry/Exit)
+  //   DSLL     T9, T9, 16
+  //   ORI      T9, T9, %hi(__xray_FunctionEntry/Exit)
+  //   DSLL     T9, T9, 16
+  //   ORI      T9, T9, %lo(__xray_FunctionEntry/Exit)
+  //   LUI      T0, %hi(function_id)
+  //   JALR     T9
+  //   ADDIU    T0, T0, %lo(function_id)
+  //   LD       T9, 0(SP)
+  //   LD       RA, 8(SP)
+  //   DADDIU   SP, SP, 16
+  //
+  OutStreamer->EmitCodeAlignment(4);
+  auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
+  OutStreamer->EmitLabel(CurSled);
+  auto Target = OutContext.createTempSymbol();
+
+  // Emit "B .tmpN" instruction, which jumps over the nop sled to the actual
+  // start of function
+  const MCExpr *TargetExpr = MCSymbolRefExpr::create(
+      Target, MCSymbolRefExpr::VariantKind::VK_None, OutContext);
+  EmitToStreamer(*OutStreamer, MCInstBuilder(Mips::BEQ)
+                                   .addReg(Mips::ZERO)
+                                   .addReg(Mips::ZERO)
+                                   .addExpr(TargetExpr));
+
+  for (int8_t I = 0; I < NoopsInSledCount; I++)
+    EmitToStreamer(*OutStreamer, MCInstBuilder(Mips::SLL)
+                                     .addReg(Mips::ZERO)
+                                     .addReg(Mips::ZERO)
+                                     .addImm(0));
+
+  OutStreamer->EmitLabel(Target);
+
+  if (!Subtarget->isGP64bit()) {
+    EmitToStreamer(*OutStreamer,
+                   MCInstBuilder(Mips::ADDiu)
+                       .addReg(Mips::T9)
+                       .addReg(Mips::T9)
+                       .addImm(0x34));
+  }
+
+  recordSled(CurSled, MI, Kind);
+}
+
+void MipsAsmPrinter::EmitXRayTable() {
+  if (Sleds.empty())
+    return;
+  if (Subtarget->isTargetELF()) {
+    auto PrevSection = OutStreamer->getCurrentSectionOnly();
+    auto Fn = MF->getFunction();
+    MCSection *Section;
+
+    if (Fn->hasComdat())
+      Section = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+                                         ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
+                                         Fn->getComdat()->getName());
+    else
+      Section =
+          OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+                                   ELF::SHF_ALLOC, 0, CurrentFnSym->getName());
+
+    OutStreamer->SwitchSection(Section);
+    for (const auto &Sled : Sleds) {
+      OutStreamer->EmitSymbolValue(Sled.Sled, Subtarget->isGP64bit() ? 8 : 4);
+      OutStreamer->EmitSymbolValue(CurrentFnSym, Subtarget->isGP64bit() ? 8 : 4);
+      auto Kind = static_cast<uint8_t>(Sled.Kind);
+      OutStreamer->EmitBytes(
+          StringRef(reinterpret_cast<const char *>(&Kind), 1));
+      OutStreamer->EmitBytes(
+          StringRef(reinterpret_cast<const char *>(&Sled.AlwaysInstrument), 1));
+      OutStreamer->EmitZeros(Subtarget->isGP64bit() ? 14 : 6);
+    }
+    OutStreamer->SwitchSection(PrevSection);
+  }
+  Sleds.clear();
+}
+
+void MipsAsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI) {
+  EmitSled(MI, SledKind::FUNCTION_ENTER);
+}
+
+void MipsAsmPrinter::LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI) {
+  EmitSled(MI, SledKind::FUNCTION_EXIT);
+}
+
+void MipsAsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI) {
+  EmitSled(MI, SledKind::TAIL_CALL);
+}
+
 void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
                                            raw_ostream &OS) {
   // TODO: implement
@@ -1041,7 +1200,7 @@ void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
 
 // Emit .dtprelword or .dtpreldword directive
 // and value for debug thread local expression.
-void MipsAsmPrinter::EmitDebugValue(const MCExpr *Value,
+void MipsAsmPrinter::EmitDebugThreadLocal(const MCExpr *Value,
                                           unsigned Size) const {
   switch (Size) {
   case 4:
diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h
index c5cf5241c236a4041fa2319e3febf05cf36d50ff..4699e1b0bd3bb07c6e2a8bf40e21671e939532a5 100644
--- a/lib/Target/Mips/MipsAsmPrinter.h
+++ b/lib/Target/Mips/MipsAsmPrinter.h
@@ -35,7 +35,21 @@ class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter {
 
   void EmitInstrWithMacroNoAT(const MachineInstr *MI);
 
+  //===------------------------------------------------------------------===//
+  // XRay implementation
+  //===------------------------------------------------------------------===//
+public:
+  // XRay-specific lowering for Mips.
+  void LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI);
+  void LowerPATCHABLE_FUNCTION_EXIT(const MachineInstr &MI);
+  void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI);
+  // Helper function that emits the XRay sleds we've collected for a particular
+  // function.
+  void EmitXRayTable();
+
 private:
+  void EmitSled(const MachineInstr &MI, SledKind Kind);
+
   // tblgen'erated function.
   bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
                                    const MachineInstr *MI);
@@ -140,7 +154,7 @@ public:
   void EmitStartOfAsmFile(Module &M) override;
   void EmitEndOfAsmFile(Module &M) override;
   void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
-  void EmitDebugValue(const MCExpr *Value, unsigned Size) const override;
+  void EmitDebugThreadLocal(const MCExpr *Value, unsigned Size) const override;
 };
 }
 
diff --git a/lib/Target/Mips/MipsDelaySlotFiller.cpp b/lib/Target/Mips/MipsDelaySlotFiller.cpp
index c821084f68cfb8bccd307185fced59bf1ec39560..ae58c26e145aba8bcf8982180cb21c909361cbb5 100644
--- a/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -14,21 +14,39 @@
 #include "MCTargetDesc/MipsMCNaCl.h"
 #include "Mips.h"
 #include "MipsInstrInfo.h"
+#include "MipsSubtarget.h"
 #include "MipsTargetMachine.h"
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/PointerUnion.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/PseudoSourceValue.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetRegisterInfo.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <memory>
+#include <utility>
 
 using namespace llvm;
 
@@ -84,6 +102,7 @@ static cl::opt<CompactBranchPolicy> MipsCompactBranchPolicy(
 );
 
 namespace {
+
   typedef MachineBasicBlock::iterator Iter;
   typedef MachineBasicBlock::reverse_iterator ReverseIter;
   typedef SmallDenseMap<MachineBasicBlock*, MachineInstr*, 2> BB2BrMap;
@@ -91,6 +110,7 @@ namespace {
   class RegDefsUses {
   public:
     RegDefsUses(const TargetRegisterInfo &TRI);
+
     void init(const MachineInstr &MI);
 
     /// This function sets all caller-saved registers in Defs.
@@ -120,18 +140,18 @@ namespace {
   /// Base class for inspecting loads and stores.
   class InspectMemInstr {
   public:
-    InspectMemInstr(bool ForbidMemInstr_)
-      : OrigSeenLoad(false), OrigSeenStore(false), SeenLoad(false),
-        SeenStore(false), ForbidMemInstr(ForbidMemInstr_) {}
+    InspectMemInstr(bool ForbidMemInstr_) : ForbidMemInstr(ForbidMemInstr_) {}
+    virtual ~InspectMemInstr() = default;
 
     /// Return true if MI cannot be moved to delay slot.
     bool hasHazard(const MachineInstr &MI);
 
-    virtual ~InspectMemInstr() {}
-
   protected:
     /// Flags indicating whether loads or stores have been seen.
-    bool OrigSeenLoad, OrigSeenStore, SeenLoad, SeenStore;
+    bool OrigSeenLoad = false;
+    bool OrigSeenStore = false;
+    bool SeenLoad = false;
+    bool SeenStore = false;
 
     /// Memory instructions are not allowed to move to delay slot if this flag
     /// is true.
@@ -145,6 +165,7 @@ namespace {
   class NoMemInstr : public InspectMemInstr {
   public:
     NoMemInstr() : InspectMemInstr(true) {}
+
   private:
     bool hasHazard_(const MachineInstr &MI) override { return true; }
   };
@@ -153,6 +174,7 @@ namespace {
   class LoadFromStackOrConst : public InspectMemInstr {
   public:
     LoadFromStackOrConst() : InspectMemInstr(false) {}
+
   private:
     bool hasHazard_(const MachineInstr &MI) override;
   };
@@ -183,7 +205,8 @@ namespace {
 
     /// Flags indicating whether loads or stores with no underlying objects have
     /// been seen.
-    bool SeenNoObjLoad, SeenNoObjStore;
+    bool SeenNoObjLoad = false;
+    bool SeenNoObjStore = false;
   };
 
   class Filler : public MachineFunctionPass {
@@ -271,8 +294,10 @@ namespace {
 
     static char ID;
   };
+
   char Filler::ID = 0;
-} // end of anonymous namespace
+
+} // end anonymous namespace
 
 static bool hasUnoccupiedSlot(const MachineInstr *MI) {
   return MI->hasDelaySlot() && !MI->isBundledWithSucc();
@@ -458,8 +483,7 @@ bool LoadFromStackOrConst::hasHazard_(const MachineInstr &MI) {
 }
 
 MemDefsUses::MemDefsUses(const DataLayout &DL, const MachineFrameInfo *MFI_)
-    : InspectMemInstr(false), MFI(MFI_), DL(DL), SeenNoObjLoad(false),
-      SeenNoObjStore(false) {}
+    : InspectMemInstr(false), MFI(MFI_), DL(DL) {}
 
 bool MemDefsUses::hasHazard_(const MachineInstr &MI) {
   bool HasHazard = false;
@@ -646,12 +670,6 @@ bool Filler::runOnMachineBasicBlock(MachineBasicBlock &MBB) {
   return Changed;
 }
 
-/// createMipsDelaySlotFillerPass - Returns a pass that fills in delay
-/// slots in Mips MachineFunctions
-FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) {
-  return new Filler(tm);
-}
-
 template<typename IterTy>
 bool Filler::searchRange(MachineBasicBlock &MBB, IterTy Begin, IterTy End,
                          RegDefsUses &RegDU, InspectMemInstr& IM, Iter Slot,
@@ -889,3 +907,9 @@ bool Filler::terminateSearch(const MachineInstr &Candidate) const {
           Candidate.isPosition() || Candidate.isInlineAsm() ||
           Candidate.hasUnmodeledSideEffects());
 }
+
+/// createMipsDelaySlotFillerPass - Returns a pass that fills in delay
+/// slots in Mips MachineFunctions
+FunctionPass *llvm::createMipsDelaySlotFillerPass(MipsTargetMachine &tm) {
+  return new Filler(tm);
+}
diff --git a/lib/Target/Mips/MipsHazardSchedule.cpp b/lib/Target/Mips/MipsHazardSchedule.cpp
index 31b86124bc8d0b9ca275c332136517512b75a517..f6fcf6ec938510fdc2f26a50eddaf589c786bf20 100644
--- a/lib/Target/Mips/MipsHazardSchedule.cpp
+++ b/lib/Target/Mips/MipsHazardSchedule.cpp
@@ -36,7 +36,7 @@
 ///
 /// A) A previous pass has created a compact branch directly.
 /// B) Transforming a delay slot branch into compact branch. This case can be
-///    difficult to process as lookahead for hazards is insufficent, as
+///    difficult to process as lookahead for hazards is insufficient, as
 ///    backwards delay slot fillling can also produce hazards in previously
 ///    processed instuctions.
 ///
@@ -103,23 +103,24 @@ static Iter getNextMachineInstrInBB(Iter Position) {
 
 // Find the next real instruction from the current position, looking through
 // basic block boundaries.
-static Iter getNextMachineInstr(Iter Position, MachineBasicBlock *Parent) {
+static std::pair<Iter, bool> getNextMachineInstr(Iter Position, MachineBasicBlock * Parent) {
   if (Position == Parent->end()) {
-    MachineBasicBlock *Succ = Parent->getNextNode();
-    if (Succ != nullptr && Parent->isSuccessor(Succ)) {
-      Position = Succ->begin();
-      Parent = Succ;
-    } else {
-      llvm_unreachable(
-          "Should have identified the end of the function earlier!");
-    }
+    do {
+      MachineBasicBlock *Succ = Parent->getNextNode();
+      if (Succ != nullptr && Parent->isSuccessor(Succ)) {
+        Position = Succ->begin();
+        Parent = Succ;
+      } else {
+        return std::make_pair(Position, true);
+      }
+    } while (Parent->empty());
   }
 
   Iter Instr = getNextMachineInstrInBB(Position);
   if (Instr == Parent->end()) {
     return getNextMachineInstr(Instr, Parent);
   }
-  return Instr;
+  return std::make_pair(Instr, false);
 }
 
 bool MipsHazardSchedule::runOnMachineFunction(MachineFunction &MF) {
@@ -145,7 +146,9 @@ bool MipsHazardSchedule::runOnMachineFunction(MachineFunction &MF) {
       bool LastInstInFunction =
           std::next(I) == FI->end() && std::next(FI) == MF.end();
       if (!LastInstInFunction) {
-        Inst = getNextMachineInstr(std::next(I), &*FI);
+        std::pair<Iter, bool> Res = getNextMachineInstr(std::next(I), &*FI);
+        LastInstInFunction |= Res.second;
+        Inst = Res.first;
       }
 
       if (LastInstInFunction || !TII->SafeInForbiddenSlot(*Inst)) {
diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp
index f0f2424f7224cd4e329fe49c7915816e88ec9b12..93c5f496ce9716234ee7ca9ca40e9b70d536a130 100644
--- a/lib/Target/Mips/MipsISelLowering.cpp
+++ b/lib/Target/Mips/MipsISelLowering.cpp
@@ -147,6 +147,7 @@ const char *MipsTargetLowering::getTargetNodeName(unsigned Opcode) const {
   case MipsISD::Sync:              return "MipsISD::Sync";
   case MipsISD::Ext:               return "MipsISD::Ext";
   case MipsISD::Ins:               return "MipsISD::Ins";
+  case MipsISD::CIns:              return "MipsISD::CIns";
   case MipsISD::LWL:               return "MipsISD::LWL";
   case MipsISD::LWR:               return "MipsISD::LWR";
   case MipsISD::SWL:               return "MipsISD::SWL";
@@ -428,6 +429,7 @@ MipsTargetLowering::MipsTargetLowering(const MipsTargetMachine &TM,
   setTargetDAGCombine(ISD::OR);
   setTargetDAGCombine(ISD::ADD);
   setTargetDAGCombine(ISD::AssertZext);
+  setTargetDAGCombine(ISD::SHL);
 
   if (ABI.IsO32()) {
     // These libcalls are not available in 32-bit.
@@ -702,41 +704,81 @@ static SDValue performCMovFPCombine(SDNode *N, SelectionDAG &DAG,
 static SDValue performANDCombine(SDNode *N, SelectionDAG &DAG,
                                  TargetLowering::DAGCombinerInfo &DCI,
                                  const MipsSubtarget &Subtarget) {
-  // Pattern match EXT.
-  //  $dst = and ((sra or srl) $src , pos), (2**size - 1)
-  //  => ext $dst, $src, size, pos
   if (DCI.isBeforeLegalizeOps() || !Subtarget.hasExtractInsert())
     return SDValue();
 
-  SDValue ShiftRight = N->getOperand(0), Mask = N->getOperand(1);
-  unsigned ShiftRightOpc = ShiftRight.getOpcode();
-
-  // Op's first operand must be a shift right.
-  if (ShiftRightOpc != ISD::SRA && ShiftRightOpc != ISD::SRL)
-    return SDValue();
+  SDValue FirstOperand = N->getOperand(0);
+  unsigned FirstOperandOpc = FirstOperand.getOpcode();
+  SDValue Mask = N->getOperand(1);
+  EVT ValTy = N->getValueType(0);
+  SDLoc DL(N);
 
-  // The second operand of the shift must be an immediate.
+  uint64_t Pos = 0, SMPos, SMSize;
   ConstantSDNode *CN;
-  if (!(CN = dyn_cast<ConstantSDNode>(ShiftRight.getOperand(1))))
-    return SDValue();
-
-  uint64_t Pos = CN->getZExtValue();
-  uint64_t SMPos, SMSize;
+  SDValue NewOperand;
+  unsigned Opc;
 
   // Op's second operand must be a shifted mask.
   if (!(CN = dyn_cast<ConstantSDNode>(Mask)) ||
       !isShiftedMask(CN->getZExtValue(), SMPos, SMSize))
     return SDValue();
 
-  // Return if the shifted mask does not start at bit 0 or the sum of its size
-  // and Pos exceeds the word's size.
-  EVT ValTy = N->getValueType(0);
-  if (SMPos != 0 || Pos + SMSize > ValTy.getSizeInBits())
-    return SDValue();
+  if (FirstOperandOpc == ISD::SRA || FirstOperandOpc == ISD::SRL) {
+    // Pattern match EXT.
+    //  $dst = and ((sra or srl) $src , pos), (2**size - 1)
+    //  => ext $dst, $src, pos, size
+
+    // The second operand of the shift must be an immediate.
+    if (!(CN = dyn_cast<ConstantSDNode>(FirstOperand.getOperand(1))))
+      return SDValue();
+
+    Pos = CN->getZExtValue();
+
+    // Return if the shifted mask does not start at bit 0 or the sum of its size
+    // and Pos exceeds the word's size.
+    if (SMPos != 0 || Pos + SMSize > ValTy.getSizeInBits())
+      return SDValue();
+
+    Opc = MipsISD::Ext;
+    NewOperand = FirstOperand.getOperand(0);
+  } else if (FirstOperandOpc == ISD::SHL && Subtarget.hasCnMips()) {
+    // Pattern match CINS.
+    //  $dst = and (shl $src , pos), mask
+    //  => cins $dst, $src, pos, size
+    // mask is a shifted mask with consecutive 1's, pos = shift amount,
+    // size = population count.
+
+    // The second operand of the shift must be an immediate.
+    if (!(CN = dyn_cast<ConstantSDNode>(FirstOperand.getOperand(1))))
+      return SDValue();
+
+    Pos = CN->getZExtValue();
+
+    if (SMPos != Pos || Pos >= ValTy.getSizeInBits() || SMSize >= 32 ||
+        Pos + SMSize > ValTy.getSizeInBits())
+      return SDValue();
+
+    NewOperand = FirstOperand.getOperand(0);
+    // SMSize is 'location' (position) in this case, not size.
+    SMSize--;
+    Opc = MipsISD::CIns;
+  } else {
+    // Pattern match EXT.
+    //  $dst = and $src, (2**size - 1) , if size > 16
+    //  => ext $dst, $src, pos, size , pos = 0
 
-  SDLoc DL(N);
-  return DAG.getNode(MipsISD::Ext, DL, ValTy,
-                     ShiftRight.getOperand(0),
+    // If the mask is <= 0xffff, andi can be used instead.
+    if (CN->getZExtValue() <= 0xffff)
+      return SDValue();
+
+    // Return if the mask doesn't start at position 0.
+    if (SMPos)
+      return SDValue();
+
+    Opc = MipsISD::Ext;
+    NewOperand = FirstOperand;
+  }
+  return DAG.getNode(Opc, DL, ValTy, NewOperand,
                      DAG.getConstant(Pos, DL, MVT::i32),
                      DAG.getConstant(SMSize, DL, MVT::i32));
 }
@@ -855,6 +897,58 @@ static SDValue performAssertZextCombine(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+
+static SDValue performSHLCombine(SDNode *N, SelectionDAG &DAG,
+                                 TargetLowering::DAGCombinerInfo &DCI,
+                                 const MipsSubtarget &Subtarget) {
+  // Pattern match CINS.
+  //  $dst = shl (and $src , (2**size - 1)), pos
+  //  => cins $dst, $src, pos, size - 1
+
+  if (DCI.isBeforeLegalizeOps() || !Subtarget.hasCnMips())
+    return SDValue();
+
+  SDValue FirstOperand = N->getOperand(0);
+  unsigned FirstOperandOpc = FirstOperand.getOpcode();
+  SDValue SecondOperand = N->getOperand(1);
+  EVT ValTy = N->getValueType(0);
+  SDLoc DL(N);
+
+  uint64_t Pos = 0, SMPos, SMSize;
+  ConstantSDNode *CN;
+  SDValue NewOperand;
+
+  // The second operand of the shift must be an immediate.
+  if (!(CN = dyn_cast<ConstantSDNode>(SecondOperand)))
+    return SDValue();
+
+  Pos = CN->getZExtValue();
+
+  if (Pos >= ValTy.getSizeInBits())
+    return SDValue();
+
+  if (FirstOperandOpc != ISD::AND)
+    return SDValue();
+
+  // AND's second operand must be a shifted mask.
+  if (!(CN = dyn_cast<ConstantSDNode>(FirstOperand.getOperand(1))) ||
+      !isShiftedMask(CN->getZExtValue(), SMPos, SMSize))
+    return SDValue();
+
+  // Bail out unless the mask starts at bit 0, spans at most 32 bits, and
+  // Pos + SMSize fits within the value type's bit width.
+  if (SMPos != 0 || SMSize > 32 || Pos + SMSize > ValTy.getSizeInBits())
+    return SDValue();
+
+  NewOperand = FirstOperand.getOperand(0);
+  // The CIns node takes (size - 1) as its last operand, not the size.
+  SMSize--;
+
+  return DAG.getNode(MipsISD::CIns, DL, ValTy, NewOperand,
+                     DAG.getConstant(Pos, DL, MVT::i32),
+                     DAG.getConstant(SMSize, DL, MVT::i32));
+}
+
 SDValue  MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
   const {
   SelectionDAG &DAG = DCI.DAG;
@@ -878,6 +972,8 @@ SDValue  MipsTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
     return performADDCombine(N, DAG, DCI, Subtarget);
   case ISD::AssertZext:
     return performAssertZextCombine(N, DAG, DCI, Subtarget);
+  case ISD::SHL:
+    return performSHLCombine(N, DAG, DCI, Subtarget);
   }
 
   return SDValue();
@@ -1826,8 +1922,9 @@ lowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const
     Args.push_back(Entry);
 
     TargetLowering::CallLoweringInfo CLI(DAG);
-    CLI.setDebugLoc(DL).setChain(DAG.getEntryNode())
-      .setCallee(CallingConv::C, PtrTy, TlsGetAddr, std::move(Args));
+    CLI.setDebugLoc(DL)
+        .setChain(DAG.getEntryNode())
+        .setLibCallee(CallingConv::C, PtrTy, TlsGetAddr, std::move(Args));
     std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
 
     SDValue Ret = CallResult.first;
diff --git a/lib/Target/Mips/MipsISelLowering.h b/lib/Target/Mips/MipsISelLowering.h
index abc34be63779389d342eb2a8dc6b2fd9d7d7ffde..2dcafd51061a281f33032703d7c23ae0af57d571 100644
--- a/lib/Target/Mips/MipsISelLowering.h
+++ b/lib/Target/Mips/MipsISelLowering.h
@@ -116,6 +116,7 @@ namespace llvm {
 
       Ext,
       Ins,
+      CIns,
 
       // EXTR.W instrinsic nodes.
       EXTP,
diff --git a/lib/Target/Mips/MipsInstrInfo.cpp b/lib/Target/Mips/MipsInstrInfo.cpp
index 86bf103711873c0da7a4acd4604fdde89f88ec4a..df62c66b75a323519968deb51be766b49717b7f5 100644
--- a/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/lib/Target/Mips/MipsInstrInfo.cpp
@@ -501,3 +501,31 @@ MipsInstrInfo::genInstrWithNewOpc(unsigned NewOpc,
   MIB.setMemRefs(I->memoperands_begin(), I->memoperands_end());
   return MIB;
 }
+
+bool MipsInstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+                                          unsigned &SrcOpIdx2) const {
+  assert(!MI.isBundle() &&
+         "TargetInstrInfo::findCommutedOpIndices() can't handle bundles");
+
+  const MCInstrDesc &MCID = MI.getDesc();
+  if (!MCID.isCommutable())
+    return false;
+
+  switch (MI.getOpcode()) {
+  case Mips::DPADD_U_H:
+  case Mips::DPADD_U_W:
+  case Mips::DPADD_U_D:
+  case Mips::DPADD_S_H:
+  case Mips::DPADD_S_W:
+  case Mips::DPADD_S_D: {
+    // The first operand is both input and output, so it should not commute
+    if (!fixCommutedOpIndices(SrcOpIdx1, SrcOpIdx2, 2, 3))
+      return false;
+
+    if (!MI.getOperand(SrcOpIdx1).isReg() || !MI.getOperand(SrcOpIdx2).isReg())
+      return false;
+    return true;
+  }
+  }
+  return TargetInstrInfo::findCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
+}
diff --git a/lib/Target/Mips/MipsInstrInfo.h b/lib/Target/Mips/MipsInstrInfo.h
index 347b9187d08c0b2d448e154426a058d801dd78c7..45d700d8afd64b79beab54e7b25c9b375a638983 100644
--- a/lib/Target/Mips/MipsInstrInfo.h
+++ b/lib/Target/Mips/MipsInstrInfo.h
@@ -135,6 +135,9 @@ public:
   MachineInstrBuilder genInstrWithNewOpc(unsigned NewOpc,
                                          MachineBasicBlock::iterator I) const;
 
+  bool findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
+                             unsigned &SrcOpIdx2) const override;
+
 protected:
   bool isZeroImm(const MachineOperand &op) const;
 
diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td
index 883dac3549fb7385680a3d228d4f2af520306c49..b90077d7807d8a1c618ee5af3e8a9a8f194b95f1 100644
--- a/lib/Target/Mips/MipsInstrInfo.td
+++ b/lib/Target/Mips/MipsInstrInfo.td
@@ -138,6 +138,7 @@ def MipsSync : SDNode<"MipsISD::Sync", SDT_Sync, [SDNPHasChain,SDNPSideEffect]>;
 
 def MipsExt :  SDNode<"MipsISD::Ext", SDT_Ext>;
 def MipsIns :  SDNode<"MipsISD::Ins", SDT_Ins>;
+def MipsCIns : SDNode<"MipsISD::CIns", SDT_Ext>;
 
 def MipsLWL : SDNode<"MipsISD::LWL", SDTMipsLoadLR,
                      [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
@@ -541,7 +542,7 @@ def UImm32CoercedAsmOperandClass : UImmAnyAsmOperandClass<33, []> {
 def SImm32RelaxedAsmOperandClass
     : SImmAsmOperandClass<32, [UImm32CoercedAsmOperandClass]> {
   let Name = "SImm32_Relaxed";
-  let PredicateMethod = "isAnyImm<32>";
+  let PredicateMethod = "isAnyImm<33>";
   let DiagnosticType = "SImm32_Relaxed";
 }
 def SImm32AsmOperandClass
@@ -1172,6 +1173,10 @@ def immZExt5Plus33 : PatLeaf<(imm), [{
   return isUInt<5>(N->getZExtValue() - 33);
 }]>;
 
+def immZExt5To31 : SDNodeXForm<imm, [{
+  return getImm(N, 31 - N->getZExtValue());
+}]>;
+
 // True if (N + 1) fits in 16-bit field.
 def immSExt16Plus1 : PatLeaf<(imm), [{
   return isInt<17>(N->getSExtValue()) && isInt<16>(N->getSExtValue() + 1);
@@ -2303,9 +2308,38 @@ def SEQIMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
 def : MipsInstAlias<"seq $rd, $imm",
                     (SEQIMacro GPR32Opnd:$rd, GPR32Opnd:$rd, simm32:$imm), 0>,
                     NOT_ASE_CNMIPS;
+
+def MULImmMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rd, GPR32Opnd:$rs,
+                                                 simm32_relaxed:$imm),
+                                    "mul\t$rd, $rs, $imm">,
+                  ISA_MIPS1_NOT_32R6_64R6;
+def MULOMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rd, GPR32Opnd:$rs,
+                                               GPR32Opnd:$rt),
+                                  "mulo\t$rd, $rs, $rt">,
+                ISA_MIPS1_NOT_32R6_64R6;
+def MULOUMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rd, GPR32Opnd:$rs,
+                                                GPR32Opnd:$rt),
+                                   "mulou\t$rd, $rs, $rt">,
+                 ISA_MIPS1_NOT_32R6_64R6;
+
 //===----------------------------------------------------------------------===//
 // Instruction aliases
 //===----------------------------------------------------------------------===//
+
+multiclass OneOrTwoOperandMacroImmediateAlias<string Mnemonic,
+                                              Instruction Opcode,
+                                              RegisterOperand RO = GPR32Opnd,
+                                              Operand Imm = simm32_relaxed> {
+  def : MipsInstAlias<!strconcat(Mnemonic, " $rs, $rt, $imm"),
+                                (Opcode RO:$rs,
+                                        RO:$rt,
+                                        Imm:$imm), 0>;
+  def : MipsInstAlias<!strconcat(Mnemonic, " $rs, $imm"),
+                                (Opcode RO:$rs,
+                                        RO:$rs,
+                                        Imm:$imm), 0>;
+}
+
 def : MipsInstAlias<"move $dst, $src",
                     (OR GPR32Opnd:$dst, GPR32Opnd:$src, ZERO), 1>,
       GPR_32 {
@@ -2318,26 +2352,7 @@ def : MipsInstAlias<"move $dst, $src",
 }
 def : MipsInstAlias<"bal $offset", (BGEZAL ZERO, brtarget:$offset), 0>,
       ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<
-          "addu $rs, $rt, $imm",
-          (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
-def : MipsInstAlias<
-          "addu $rs, $imm",
-          (ADDiu GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
-def : MipsInstAlias<
-          "add $rs, $rt, $imm",
-          (ADDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>,
-          ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<
-          "add $rs, $imm",
-          (ADDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>,
-          ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<
-          "and $rs, $rt, $imm",
-          (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
-def : MipsInstAlias<
-          "and $rs, $imm",
-          (ANDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
+
 def : MipsInstAlias<"j $rs", (JR GPR32Opnd:$rs), 0>;
 let Predicates = [NotInMicroMips] in {
 def : MipsInstAlias<"jalr $rs", (JALR RA, GPR32Opnd:$rs), 0>;
@@ -2364,30 +2379,6 @@ let AdditionalPredicates = [NotInMicroMips] in {
   def : MipsInstAlias<
           "sgtu $$rs, $rt",
           (SLTu GPR32Opnd:$rs, GPR32Opnd:$rt, GPR32Opnd:$rs), 0>;
-  def : MipsInstAlias<
-          "slt $rs, $rt, $imm",
-          (SLTi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
-  def : MipsInstAlias<
-          "sltu $rt, $rs, $imm",
-          (SLTiu GPR32Opnd:$rt, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
-  def : MipsInstAlias<
-          "and $rs, $rt, $imm",
-          (ANDi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
-  def : MipsInstAlias<
-          "and $rs, $imm",
-          (ANDi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
-  def : MipsInstAlias<
-          "xor $rs, $rt, $imm",
-          (XORi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
-  def : MipsInstAlias<
-          "xor $rs, $imm",
-          (XORi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
-  def : MipsInstAlias<
-          "or $rs, $rt, $imm",
-          (ORi GPR32Opnd:$rs, GPR32Opnd:$rt, simm32_relaxed:$imm), 0>;
-  def : MipsInstAlias<
-          "or $rs, $imm",
-          (ORi GPR32Opnd:$rs, GPR32Opnd:$rs, simm32_relaxed:$imm), 0>;
   def : MipsInstAlias<
           "not $rt, $rs",
           (NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>;
@@ -2395,6 +2386,20 @@ let AdditionalPredicates = [NotInMicroMips] in {
           "not $rt",
           (NOR GPR32Opnd:$rt, GPR32Opnd:$rt, ZERO), 0>;
   def : MipsInstAlias<"nop", (SLL ZERO, ZERO, 0), 1>;
+
+  defm : OneOrTwoOperandMacroImmediateAlias<"add", ADDi>, ISA_MIPS1_NOT_32R6_64R6;
+
+  defm : OneOrTwoOperandMacroImmediateAlias<"addu", ADDiu>;
+
+  defm : OneOrTwoOperandMacroImmediateAlias<"and", ANDi>, GPR_32;
+
+  defm : OneOrTwoOperandMacroImmediateAlias<"or", ORi>, GPR_32;
+
+  defm : OneOrTwoOperandMacroImmediateAlias<"xor", XORi>, GPR_32;
+
+  defm : OneOrTwoOperandMacroImmediateAlias<"slt", SLTi>, GPR_32;
+
+  defm : OneOrTwoOperandMacroImmediateAlias<"sltu", SLTiu>, GPR_32;
 }
 def : MipsInstAlias<"mfc0 $rt, $rd", (MFC0 GPR32Opnd:$rt, COP0Opnd:$rd, 0), 0>;
 def : MipsInstAlias<"mtc0 $rt, $rd", (MTC0 COP0Opnd:$rd, GPR32Opnd:$rt, 0), 0>;
@@ -2467,6 +2472,14 @@ let AdditionalPredicates = [NotInMicroMips] in {
 def : MipsInstAlias<"sdbbp", (SDBBP 0)>, ISA_MIPS32_NOT_32R6_64R6;
 def : MipsInstAlias<"sync",
                     (SYNC 0), 1>, ISA_MIPS2;
+
+def : MipsInstAlias<"mulo $rs, $rt",
+                    (MULOMacro GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>,
+                    ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"mulou $rs, $rt",
+                    (MULOUMacro GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>,
+                    ISA_MIPS1_NOT_32R6_64R6;
+
 //===----------------------------------------------------------------------===//
 // Assembler Pseudo Instructions
 //===----------------------------------------------------------------------===//
@@ -2494,9 +2507,12 @@ def JalTwoReg : MipsAsmPseudoInst<(outs GPR32Opnd:$rd), (ins GPR32Opnd:$rs),
 def JalOneReg : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs),
                       "jal\t$rs"> ;
 
-def NORImm : MipsAsmPseudoInst<
-                 (outs), (ins GPR32Opnd:$rs, GPR32Opnd:$rt, simm32:$imm),
-                 "nor\t$rs, $rt, $imm"> ;
+class NORIMM_DESC_BASE<RegisterOperand RO, DAGOperand Imm> :
+   MipsAsmPseudoInst<(outs RO:$rs), (ins RO:$rt, Imm:$imm),
+                      "nor\t$rs, $rt, $imm">;
+def NORImm : NORIMM_DESC_BASE<GPR32Opnd, simm32_relaxed>, GPR_32;
+def : MipsInstAlias<"nor\t$rs, $imm", (NORImm GPR32Opnd:$rs, GPR32Opnd:$rs,
+                                              simm32_relaxed:$imm)>, GPR_32;
 
 let hasDelaySlot = 1, isCTI = 1 in {
 def BneImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rt),
@@ -2534,6 +2550,9 @@ class CondBranchImmPseudo<string instr_asm> :
   MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rs, imm64:$imm, brtarget:$offset),
                     !strconcat(instr_asm, "\t$rs, $imm, $offset")>;
 
+def BEQLImmMacro : CondBranchImmPseudo<"beql">, ISA_MIPS2_NOT_32R6_64R6;
+def BNELImmMacro : CondBranchImmPseudo<"bnel">, ISA_MIPS2_NOT_32R6_64R6;
+
 def BLTImmMacro  : CondBranchImmPseudo<"blt">;
 def BLEImmMacro  : CondBranchImmPseudo<"ble">;
 def BGEImmMacro  : CondBranchImmPseudo<"bge">;
@@ -2557,34 +2576,46 @@ def BGTULImmMacro : CondBranchImmPseudo<"bgtul">, ISA_MIPS2_NOT_32R6_64R6;
 // Once the tablegen-erated errors are made better, this needs to be fixed and
 // predicates needs to be restored.
 
-def SDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+def SDivMacro : MipsAsmPseudoInst<(outs GPR32NonZeroOpnd:$rd),
                                   (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
                                   "div\t$rd, $rs, $rt">,
                 ISA_MIPS1_NOT_32R6_64R6;
+def SDivIMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+                                   (ins GPR32Opnd:$rs, simm32:$imm),
+                                   "div\t$rd, $rs, $imm">,
+                 ISA_MIPS1_NOT_32R6_64R6;
 def UDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
                                   (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
                                   "divu\t$rd, $rs, $rt">,
                 ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<"div $rt, $rs", (SDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
-                                               GPR32Opnd:$rs), 0>,
+def UDivIMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+                                   (ins GPR32Opnd:$rs, simm32:$imm),
+                                   "divu\t$rd, $rs, $imm">,
+                 ISA_MIPS1_NOT_32R6_64R6;
+
+
+def : MipsInstAlias<"div $rs, $rt", (SDIV GPR32ZeroOpnd:$rs,
+                                          GPR32Opnd:$rt), 0>,
+     ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"div $rs, $rt", (SDivMacro GPR32NonZeroOpnd:$rs,
+                                               GPR32NonZeroOpnd:$rs,
+                                               GPR32Opnd:$rt), 0>,
+     ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"div $rd, $imm", (SDivIMacro GPR32Opnd:$rd, GPR32Opnd:$rd,
+                                                 simm32:$imm), 0>,
       ISA_MIPS1_NOT_32R6_64R6;
-def : MipsInstAlias<"divu $rt, $rs", (UDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
+
+def : MipsInstAlias<"divu $rt, $rs", (UDIV GPR32ZeroOpnd:$rt,
+                                           GPR32Opnd:$rs), 0>,
+      ISA_MIPS1_NOT_32R6_64R6;
+def : MipsInstAlias<"divu $rt, $rs", (UDivMacro GPR32NonZeroOpnd:$rt,
+                                                GPR32NonZeroOpnd:$rt,
                                                 GPR32Opnd:$rs), 0>,
       ISA_MIPS1_NOT_32R6_64R6;
-def DSDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
-                                   (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
-                                   "ddiv\t$rd, $rs, $rt">,
-                 ISA_MIPS64_NOT_64R6;
-def DUDivMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
-                                   (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
-                                   "ddivu\t$rd, $rs, $rt">,
-                 ISA_MIPS64_NOT_64R6;
-def : MipsInstAlias<"ddiv $rt, $rs", (DSDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
-                                                 GPR32Opnd:$rs), 0>,
-      ISA_MIPS64_NOT_64R6;
-def : MipsInstAlias<"ddivu $rt, $rs", (DUDivMacro GPR32Opnd:$rt, GPR32Opnd:$rt,
-                                                  GPR32Opnd:$rs), 0>,
-      ISA_MIPS64_NOT_64R6;
+
+def : MipsInstAlias<"divu $rd, $imm", (UDivIMacro GPR32Opnd:$rd, GPR32Opnd:$rd,
+                                                  simm32:$imm), 0>,
+      ISA_MIPS1_NOT_32R6_64R6;
 
 def Ulh : MipsAsmPseudoInst<(outs GPR32Opnd:$rt), (ins mem:$addr),
                             "ulh\t$rt, $addr">; //, ISA_MIPS1_NOT_32R6_64R6;
diff --git a/lib/Target/Mips/MipsLongBranch.cpp b/lib/Target/Mips/MipsLongBranch.cpp
index 1087d0e0140e9e2ae6475626722442c50f8e3f10..100503700a720fa01c958150907e431f07ce81a3 100644
--- a/lib/Target/Mips/MipsLongBranch.cpp
+++ b/lib/Target/Mips/MipsLongBranch.cpp
@@ -13,20 +13,31 @@
 // FIXME: Fix pc-region jump instructions which cross 256MB segment boundaries.
 //===----------------------------------------------------------------------===//
 
-#include "Mips.h"
+#include "MCTargetDesc/MipsABIInfo.h"
 #include "MCTargetDesc/MipsBaseInfo.h"
 #include "MCTargetDesc/MipsMCNaCl.h"
+#include "Mips.h"
+#include "MipsInstrInfo.h"
 #include "MipsMachineFunction.h"
+#include "MipsSubtarget.h"
 #include "MipsTargetMachine.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/IR/Function.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/IR/DebugLoc.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
-#include "llvm/Target/TargetInstrInfo.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <iterator>
 
 using namespace llvm;
 
@@ -47,21 +58,23 @@ static cl::opt<bool> ForceLongBranch(
   cl::Hidden);
 
 namespace {
+
   typedef MachineBasicBlock::iterator Iter;
   typedef MachineBasicBlock::reverse_iterator ReverseIter;
 
   struct MBBInfo {
-    uint64_t Size, Address;
-    bool HasLongBranch;
-    MachineInstr *Br;
+    uint64_t Size = 0;
+    uint64_t Address = 0;
+    bool HasLongBranch = false;
+    MachineInstr *Br = nullptr;
 
-    MBBInfo() : Size(0), HasLongBranch(false), Br(nullptr) {}
+    MBBInfo() = default;
   };
 
   class MipsLongBranch : public MachineFunctionPass {
-
   public:
     static char ID;
+
     MipsLongBranch(TargetMachine &tm)
         : MachineFunctionPass(ID), TM(tm), IsPIC(TM.isPositionIndependent()),
           ABI(static_cast<const MipsTargetMachine &>(TM).getABI()) {}
@@ -92,13 +105,8 @@ namespace {
   };
 
   char MipsLongBranch::ID = 0;
-} // end of anonymous namespace
 
-/// createMipsLongBranchPass - Returns a pass that converts branches to long
-/// branches.
-FunctionPass *llvm::createMipsLongBranchPass(MipsTargetMachine &tm) {
-  return new MipsLongBranch(tm);
-}
+} // end anonymous namespace
 
 /// Iterate over list of Br's operands and search for a MachineBasicBlock
 /// operand.
@@ -530,3 +538,9 @@ bool MipsLongBranch::runOnMachineFunction(MachineFunction &F) {
 
   return true;
 }
+
+/// createMipsLongBranchPass - Returns a pass that converts branches to long
+/// branches.
+FunctionPass *llvm::createMipsLongBranchPass(MipsTargetMachine &tm) {
+  return new MipsLongBranch(tm);
+}
diff --git a/lib/Target/Mips/MipsOptionRecord.h b/lib/Target/Mips/MipsOptionRecord.h
index 23f0b7070d623d3ca2ee86d07b70c89d713ab0d5..4708784063d3dfcc854db16e9448d81be1b1d496 100644
--- a/lib/Target/Mips/MipsOptionRecord.h
+++ b/lib/Target/Mips/MipsOptionRecord.h
@@ -1,4 +1,4 @@
-//===-- MipsOptionRecord.h - Abstraction for storing information ----------===//
+//===- MipsOptionRecord.h - Abstraction for storing information -*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -23,14 +23,16 @@
 #include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include <cstdint>
 
 namespace llvm {
+
 class MipsELFStreamer;
-class MCSubtargetInfo;
 
 class MipsOptionRecord {
 public:
-  virtual ~MipsOptionRecord(){};
+  virtual ~MipsOptionRecord() = default;
+
   virtual void EmitMipsOptionRecord() = 0;
 };
 
@@ -53,7 +55,8 @@ public:
     COP2RegClass = &(TRI->getRegClass(Mips::COP2RegClassID));
     COP3RegClass = &(TRI->getRegClass(Mips::COP3RegClassID));
   }
-  ~MipsRegInfoRecord() override {}
+
+  ~MipsRegInfoRecord() override = default;
 
   void EmitMipsOptionRecord() override;
   void SetPhysRegUsed(unsigned Reg, const MCRegisterInfo *MCRegInfo);
@@ -74,5 +77,7 @@ private:
   uint32_t ri_cprmask[4];
   int64_t ri_gp_value;
 };
-} // namespace llvm
-#endif
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_MIPS_MIPSOPTIONRECORD_H
diff --git a/lib/Target/Mips/MipsOs16.cpp b/lib/Target/Mips/MipsOs16.cpp
index 51ac5620f585d08bb6fa532eb4d135397ebadc9a..670b6c96e78ef02af652c70fe6fe880a6e2ea242 100644
--- a/lib/Target/Mips/MipsOs16.cpp
+++ b/lib/Target/Mips/MipsOs16.cpp
@@ -57,7 +57,7 @@ static  bool needsFPFromSig(Function &F) {
     ;
   }
   if (F.arg_size() >=1) {
-    Argument &Arg = F.getArgumentList().front();
+    Argument &Arg = *F.arg_begin();
     switch (Arg.getType()->getTypeID()) {
     case Type::FloatTyID:
     case Type::DoubleTyID:
diff --git a/lib/Target/Mips/MipsRegisterInfo.td b/lib/Target/Mips/MipsRegisterInfo.td
index 8c82239ebbd349c1f2618f1149ce9edb0b98a02c..ccfdcc89b078a7161ae23f5af83d83c380424922 100644
--- a/lib/Target/Mips/MipsRegisterInfo.td
+++ b/lib/Target/Mips/MipsRegisterInfo.td
@@ -290,6 +290,25 @@ class GPR32Class<list<ValueType> regTypes> :
   K0, K1, GP, SP, FP, RA)>;
 
 def GPR32 : GPR32Class<[i32]>;
+
+def GPR32ZERO : RegisterClass<"Mips", [i32], 32, (add
+  // Reserved
+  ZERO)>;
+
+def GPR32NONZERO : RegisterClass<"Mips", [i32], 32, (add
+  // Reserved
+  AT,
+  // Return Values and Arguments
+  V0, V1, A0, A1, A2, A3,
+  // Not preserved across procedure calls
+  T0, T1, T2, T3, T4, T5, T6, T7,
+  // Callee save
+  S0, S1, S2, S3, S4, S5, S6, S7,
+  // Not preserved across procedure calls
+  T8, T9,
+  // Reserved
+  K0, K1, GP, SP, FP, RA)>;
+
 def DSPR  : GPR32Class<[v4i8, v2i16]>;
 
 def GPRMM16 : RegisterClass<"Mips", [i32], 32, (add
@@ -317,7 +336,7 @@ def GPRMM16MoveP : RegisterClass<"Mips", [i32], 32, (add
   S0, S2, S3, S4)>;
 
 def GPR64 : RegisterClass<"Mips", [i64], 64, (add
-// Reserved
+  // Reserved
   ZERO_64, AT_64,
   // Return Values and Arguments
   V0_64, V1_64, A0_64, A1_64, A2_64, A3_64,
@@ -479,6 +498,16 @@ def GPR64AsmOperand : MipsAsmRegOperand {
   let PredicateMethod = "isGPRAsmReg";
 }
 
+def GPR32ZeroAsmOperand : MipsAsmRegOperand {
+  let Name = "GPR32ZeroAsmReg";
+  let PredicateMethod = "isGPRZeroAsmReg";
+}
+
+def GPR32NonZeroAsmOperand : MipsAsmRegOperand {
+  let Name = "GPR32NonZeroAsmReg";
+  let PredicateMethod = "isGPRNonZeroAsmReg";
+}
+
 def GPR32AsmOperand : MipsAsmRegOperand {
   let Name = "GPR32AsmReg";
   let PredicateMethod = "isGPRAsmReg";
@@ -550,6 +579,14 @@ def MSACtrlAsmOperand : MipsAsmRegOperand {
   let Name = "MSACtrlAsmReg";
 }
 
+def GPR32ZeroOpnd : RegisterOperand<GPR32ZERO> {
+  let ParserMatchClass = GPR32ZeroAsmOperand;
+}
+
+def GPR32NonZeroOpnd : RegisterOperand<GPR32NONZERO> {
+  let ParserMatchClass = GPR32NonZeroAsmOperand;
+}
+
 def GPR32Opnd : RegisterOperand<GPR32> {
   let ParserMatchClass = GPR32AsmOperand;
 }
diff --git a/lib/Target/Mips/MipsSEFrameLowering.cpp b/lib/Target/Mips/MipsSEFrameLowering.cpp
index 4996d070eb2902e6a959bec13eeb3bfae012bc6c..ef8d18c6deb14c0546b8a243e3073340bf61f76a 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -11,27 +11,42 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MipsSEFrameLowering.h"
-#include "MCTargetDesc/MipsBaseInfo.h"
+#include "MCTargetDesc/MipsABIInfo.h"
 #include "MipsMachineFunction.h"
+#include "MipsRegisterInfo.h"
+#include "MipsSEFrameLowering.h"
 #include "MipsSEInstrInfo.h"
 #include "MipsSubtarget.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/StringSwitch.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
-#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Function.h"
-#include "llvm/Target/TargetOptions.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MachineLocation.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <cassert>
+#include <cstdint>
+#include <utility>
+#include <vector>
 
 using namespace llvm;
 
-namespace {
-typedef MachineBasicBlock::iterator Iter;
-
 static std::pair<unsigned, unsigned> getMFHiLoOpc(unsigned Src) {
   if (Mips::ACC64RegClass.contains(Src))
     return std::make_pair((unsigned)Mips::PseudoMFHI,
@@ -47,6 +62,8 @@ static std::pair<unsigned, unsigned> getMFHiLoOpc(unsigned Src) {
   return std::make_pair(0, 0);
 }
 
+namespace {
+
 /// Helper class to expand pseudos.
 class ExpandPseudo {
 public:
@@ -54,6 +71,8 @@ public:
   bool expand();
 
 private:
+  typedef MachineBasicBlock::iterator Iter;
+
   bool expandInstr(MachineBasicBlock &MBB, Iter I);
   void expandLoadCCond(MachineBasicBlock &MBB, Iter I);
   void expandStoreCCond(MachineBasicBlock &MBB, Iter I);
@@ -74,7 +93,8 @@ private:
   const MipsSEInstrInfo &TII;
   const MipsRegisterInfo &RegInfo;
 };
-}
+
+} // end anonymous namespace
 
 ExpandPseudo::ExpandPseudo(MachineFunction &MF_)
     : MF(MF_), MRI(MF.getRegInfo()),
@@ -419,7 +439,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
 
   const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
 
-  if (CSI.size()) {
+  if (!CSI.empty()) {
     // Find the instruction past the last instruction that saves a callee-saved
     // register to the stack.
     for (unsigned i = 0; i < CSI.size(); ++i)
@@ -471,7 +491,7 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
       } else {
         // Reg is either in GPR32 or FGR32.
         unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
-            nullptr, MRI->getDwarfRegNum(Reg, 1), Offset));
+            nullptr, MRI->getDwarfRegNum(Reg, true), Offset));
         BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
             .addCFIIndex(CFIIndex);
       }
@@ -534,7 +554,6 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
 
 void MipsSEFrameLowering::emitInterruptPrologueStub(
     MachineFunction &MF, MachineBasicBlock &MBB) const {
-
   MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
   MachineBasicBlock::iterator MBBI = MBB.begin();
   DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
@@ -722,7 +741,6 @@ void MipsSEFrameLowering::emitEpilogue(MachineFunction &MF,
 
 void MipsSEFrameLowering::emitInterruptEpilogueStub(
     MachineFunction &MF, MachineBasicBlock &MBB) const {
-
   MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
   MipsFunctionInfo *MipsFI = MF.getInfo<MipsFunctionInfo>();
   DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
@@ -820,7 +838,6 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
 bool
 MipsSEFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
   const MachineFrameInfo &MFI = MF.getFrameInfo();
-
   // Reserve call frame if the size of the maximum call frame fits into 16-bit
   // immediate field and there are no variable sized objects on the stack.
   // Make sure the second register scavenger spill slot can be accessed with one
diff --git a/lib/Target/Mips/MipsSEFrameLowering.h b/lib/Target/Mips/MipsSEFrameLowering.h
index 63cd3cebc56a1d568da6fb9d9b284cc9840a5cd8..bf30deb1905e9b371de7c91f8fd43c4b4e335829 100644
--- a/lib/Target/Mips/MipsSEFrameLowering.h
+++ b/lib/Target/Mips/MipsSEFrameLowering.h
@@ -1,4 +1,4 @@
-//===-- MipsSEFrameLowering.h - Mips32/64 frame lowering --------*- C++ -*-===//
+//===- MipsSEFrameLowering.h - Mips32/64 frame lowering ---------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -15,6 +15,8 @@
 #define LLVM_LIB_TARGET_MIPS_MIPSSEFRAMELOWERING_H
 
 #include "MipsFrameLowering.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include <vector>
 
 namespace llvm {
 
@@ -47,6 +49,7 @@ private:
   void emitInterruptPrologueStub(MachineFunction &MF,
                                  MachineBasicBlock &MBB) const;
 };
-} // End llvm namespace
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_MIPS_MIPSSEFRAMELOWERING_H
diff --git a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index edec1073c043a11e7d61514e7db526e5988909c9..c9cf9363b8c96100ca4e744f1f9f6599b36712ea 100644
--- a/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -692,7 +692,7 @@ bool MipsSEDAGToDAGISel::selectVSplatMaskL(SDValue N, SDValue &Imm) const {
     // as the original value.
     if (ImmValue == ~(~ImmValue & ~(~ImmValue + 1))) {
 
-      Imm = CurDAG->getTargetConstant(ImmValue.countPopulation(), SDLoc(N),
+      Imm = CurDAG->getTargetConstant(ImmValue.countPopulation() - 1, SDLoc(N),
                                       EltTy);
       return true;
     }
@@ -724,7 +724,7 @@ bool MipsSEDAGToDAGISel::selectVSplatMaskR(SDValue N, SDValue &Imm) const {
     // Extract the run of set bits starting with bit zero, and test that the
     // result is the same as the original value
     if (ImmValue == (ImmValue & ~(ImmValue + 1))) {
-      Imm = CurDAG->getTargetConstant(ImmValue.countPopulation(), SDLoc(N),
+      Imm = CurDAG->getTargetConstant(ImmValue.countPopulation() - 1, SDLoc(N),
                                       EltTy);
       return true;
     }
@@ -934,6 +934,9 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
     // same set/ of registers. Similarly, ldi.h isn't capable of producing {
     // 0x00000000, 0x00000001, 0x00000000, 0x00000001 } but 'ldi.d wd, 1' can.
 
+    const MipsABIInfo &ABI =
+        static_cast<const MipsTargetMachine &>(TM).getABI();
+
     BuildVectorSDNode *BVN = cast<BuildVectorSDNode>(Node);
     APInt SplatValue, SplatUndef;
     unsigned SplatBitSize;
@@ -971,13 +974,233 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
       break;
     }
 
-    if (!SplatValue.isSignedIntN(10))
-      return false;
-
-    SDValue Imm = CurDAG->getTargetConstant(SplatValue, DL,
-                                            ViaVecTy.getVectorElementType());
+    SDNode *Res;
 
-    SDNode *Res = CurDAG->getMachineNode(LdiOp, DL, ViaVecTy, Imm);
+    // If we have a signed 10 bit integer, we can splat it directly.
+    //
+    // If we have something bigger we can synthesize the value into a GPR and
+    // splat from there.
+    if (SplatValue.isSignedIntN(10)) {
+      SDValue Imm = CurDAG->getTargetConstant(SplatValue, DL,
+                                              ViaVecTy.getVectorElementType());
+
+      Res = CurDAG->getMachineNode(LdiOp, DL, ViaVecTy, Imm);
+    } else if (SplatValue.isSignedIntN(16) &&
+               ((ABI.IsO32() && SplatBitSize < 64) ||
+                (ABI.IsN32() || ABI.IsN64()))) {
+      // Only handle signed 16 bit values when the element size is GPR width.
+      // MIPS64 can handle all the cases but MIPS32 would need to handle
+      // negative cases specifically here. Instead, handle those cases as
+      // 64bit values.
+
+      bool Is32BitSplat = ABI.IsO32() || SplatBitSize < 64;
+      const unsigned ADDiuOp = Is32BitSplat ? Mips::ADDiu : Mips::DADDiu;
+      const MVT SplatMVT = Is32BitSplat ? MVT::i32 : MVT::i64;
+      SDValue ZeroVal = CurDAG->getRegister(
+          Is32BitSplat ? Mips::ZERO : Mips::ZERO_64, SplatMVT);
+
+      const unsigned FILLOp =
+          SplatBitSize == 16
+              ? Mips::FILL_H
+              : (SplatBitSize == 32 ? Mips::FILL_W
+                                    : (SplatBitSize == 64 ? Mips::FILL_D : 0));
+
+      assert(FILLOp != 0 && "Unknown FILL Op for splat synthesis!");
+      assert((!ABI.IsO32() || (FILLOp != Mips::FILL_D)) &&
+             "Attempting to use fill.d on MIPS32!");
+
+      const unsigned Lo = SplatValue.getLoBits(16).getZExtValue();
+      SDValue LoVal = CurDAG->getTargetConstant(Lo, DL, SplatMVT);
+
+      Res = CurDAG->getMachineNode(ADDiuOp, DL, SplatMVT, ZeroVal, LoVal);
+      Res = CurDAG->getMachineNode(FILLOp, DL, ViaVecTy, SDValue(Res, 0));
+
+    } else if (SplatValue.isSignedIntN(32) && SplatBitSize == 32) {
+      // Only handle the cases where the splat size agrees with the size
+      // of the SplatValue here.
+      const unsigned Lo = SplatValue.getLoBits(16).getZExtValue();
+      const unsigned Hi = SplatValue.lshr(16).getLoBits(16).getZExtValue();
+      SDValue ZeroVal = CurDAG->getRegister(Mips::ZERO, MVT::i32);
+
+      SDValue LoVal = CurDAG->getTargetConstant(Lo, DL, MVT::i32);
+      SDValue HiVal = CurDAG->getTargetConstant(Hi, DL, MVT::i32);
+
+      if (Hi)
+        Res = CurDAG->getMachineNode(Mips::LUi, DL, MVT::i32, HiVal);
+
+      if (Lo)
+        Res = CurDAG->getMachineNode(Mips::ORi, DL, MVT::i32,
+                                     Hi ? SDValue(Res, 0) : ZeroVal, LoVal);
+
+      assert((Hi || Lo) && "Zero case reached 32 bit case splat synthesis!");
+      Res = CurDAG->getMachineNode(Mips::FILL_W, DL, MVT::v4i32, SDValue(Res, 0));
+
+    } else if (SplatValue.isSignedIntN(32) && SplatBitSize == 64 &&
+               (ABI.IsN32() || ABI.IsN64())) {
+      // N32 and N64 can perform some tricks that O32 can't for signed 32 bit
+      // integers due to having 64bit registers. lui will cause the necessary
+      // zero/sign extension.
+      const unsigned Lo = SplatValue.getLoBits(16).getZExtValue();
+      const unsigned Hi = SplatValue.lshr(16).getLoBits(16).getZExtValue();
+      SDValue ZeroVal = CurDAG->getRegister(Mips::ZERO, MVT::i32);
+
+      SDValue LoVal = CurDAG->getTargetConstant(Lo, DL, MVT::i32);
+      SDValue HiVal = CurDAG->getTargetConstant(Hi, DL, MVT::i32);
+
+      if (Hi)
+        Res = CurDAG->getMachineNode(Mips::LUi, DL, MVT::i32, HiVal);
+
+      if (Lo)
+        Res = CurDAG->getMachineNode(Mips::ORi, DL, MVT::i32,
+                                     Hi ? SDValue(Res, 0) : ZeroVal, LoVal);
+
+      Res = CurDAG->getMachineNode(
+              Mips::SUBREG_TO_REG, DL, MVT::i64,
+              CurDAG->getTargetConstant(((Hi >> 15) & 0x1), DL, MVT::i64),
+              SDValue(Res, 0),
+              CurDAG->getTargetConstant(Mips::sub_32, DL, MVT::i64));
+
+      Res =
+          CurDAG->getMachineNode(Mips::FILL_D, DL, MVT::v2i64, SDValue(Res, 0));
+
+    } else if (SplatValue.isSignedIntN(64)) {
+      // If we have a 64 bit Splat value, we perform a similar sequence to the
+      // above:
+      //
+      // MIPS32:                            MIPS64:
+      //   lui $res, %highest(val)            lui $res, %highest(val)
+      //   ori $res, $res, %higher(val)       ori $res, $res, %higher(val)
+      //   lui $res2, %hi(val)                lui $res2, %hi(val)
+      //   ori $res2, $res2, %lo(val)         ori $res2, $res2, %lo(val)
+      //   $res3 = fill $res2                 dinsu $res, $res2, 0, 32
+      //   $res4 = insert.w $res3[1], $res    fill.d $res
+      //   splat.d $res4, 0
+      //
+      // The ability to use dinsu is guaranteed as MSA requires MIPSR5. This saves
+      // having to materialize the value by shifts and ors.
+      //
+      // FIXME: Implement the preferred sequence for MIPS64R6:
+      //
+      // MIPS64R6:
+      //   ori $res, $zero, %lo(val)
+      //   daui $res, $res, %hi(val)
+      //   dahi $res, $res, %higher(val)
+      //   dati $res, $res, %highest(val)
+      //   fill.d $res
+      //
+
+      const unsigned Lo = SplatValue.getLoBits(16).getZExtValue();
+      const unsigned Hi = SplatValue.lshr(16).getLoBits(16).getZExtValue();
+      const unsigned Higher = SplatValue.lshr(32).getLoBits(16).getZExtValue();
+      const unsigned Highest = SplatValue.lshr(48).getLoBits(16).getZExtValue();
+
+      SDValue LoVal = CurDAG->getTargetConstant(Lo, DL, MVT::i32);
+      SDValue HiVal = CurDAG->getTargetConstant(Hi, DL, MVT::i32);
+      SDValue HigherVal = CurDAG->getTargetConstant(Higher, DL, MVT::i32);
+      SDValue HighestVal = CurDAG->getTargetConstant(Highest, DL, MVT::i32);
+      SDValue ZeroVal = CurDAG->getRegister(Mips::ZERO, MVT::i32);
+
+      // Independent of whether we're targeting MIPS64 or not, the basic
+      // operations are the same. Also, directly use the $zero register if
+      // the 16 bit chunk is zero.
+      //
+      // For optimization purposes we always synthesize the splat value as
+      // an i32 value, then if we're targeting MIPS64, use SUBREG_TO_REG
+      // just before combining the values with dinsu to produce an i64. This
+      // enables SelectionDAG to aggressively share components of splat values
+      // where possible.
+      //
+      // FIXME: This is the general constant synthesis problem. This code
+      //        should be factored out into a class shared between all the
+      //        classes that need it. Specifically, for a splat size of 64
+      //        bits that's a negative number we can do better than LUi/ORi
+      //        for the upper 32bits.
+
+      if (Hi)
+        Res = CurDAG->getMachineNode(Mips::LUi, DL, MVT::i32, HiVal);
+
+      if (Lo)
+        Res = CurDAG->getMachineNode(Mips::ORi, DL, MVT::i32,
+                                     Hi ? SDValue(Res, 0) : ZeroVal, LoVal);
+
+      SDNode *HiRes;
+      if (Highest)
+        HiRes = CurDAG->getMachineNode(Mips::LUi, DL, MVT::i32, HighestVal);
+
+      if (Higher)
+        HiRes = CurDAG->getMachineNode(Mips::ORi, DL, MVT::i32,
+                                       Highest ? SDValue(HiRes, 0) : ZeroVal,
+                                       HigherVal);
+
+
+      if (ABI.IsO32()) {
+        Res = CurDAG->getMachineNode(Mips::FILL_W, DL, MVT::v4i32,
+                                     (Hi || Lo) ? SDValue(Res, 0) : ZeroVal);
+
+        Res = CurDAG->getMachineNode(
+            Mips::INSERT_W, DL, MVT::v4i32, SDValue(Res, 0),
+            (Highest || Higher) ? SDValue(HiRes, 0) : ZeroVal,
+            CurDAG->getTargetConstant(1, DL, MVT::i32));
+
+        const TargetLowering *TLI = getTargetLowering();
+        const TargetRegisterClass *RC =
+            TLI->getRegClassFor(ViaVecTy.getSimpleVT());
+
+        Res = CurDAG->getMachineNode(
+            Mips::COPY_TO_REGCLASS, DL, ViaVecTy, SDValue(Res, 0),
+            CurDAG->getTargetConstant(RC->getID(), DL, MVT::i32));
+
+        Res = CurDAG->getMachineNode(
+            Mips::SPLATI_D, DL, MVT::v2i64, SDValue(Res, 0),
+            CurDAG->getTargetConstant(0, DL, MVT::i32));
+      } else if (ABI.IsN64() || ABI.IsN32()) {
+
+        SDValue Zero64Val = CurDAG->getRegister(Mips::ZERO_64, MVT::i64);
+        const bool HiResNonZero = Highest || Higher;
+        const bool ResNonZero = Hi || Lo;
+
+        if (HiResNonZero)
+          HiRes = CurDAG->getMachineNode(
+              Mips::SUBREG_TO_REG, DL, MVT::i64,
+              CurDAG->getTargetConstant(((Highest >> 15) & 0x1), DL, MVT::i64),
+              SDValue(HiRes, 0),
+              CurDAG->getTargetConstant(Mips::sub_32, DL, MVT::i64));
+
+        if (ResNonZero)
+          Res = CurDAG->getMachineNode(
+              Mips::SUBREG_TO_REG, DL, MVT::i64,
+              CurDAG->getTargetConstant(((Hi >> 15) & 0x1), DL, MVT::i64),
+              SDValue(Res, 0),
+              CurDAG->getTargetConstant(Mips::sub_32, DL, MVT::i64));
+
+        // We have 3 cases:
+        //   The HiRes is nonzero but Res is $zero  => dsll32 HiRes, 0
+        //   The Res is nonzero but HiRes is $zero  => dinsu Res, $zero, 32, 32
+        //   Both are non zero                      => dinsu Res, HiRes, 32, 32
+        //
+        // The obvious "missing" case is when both are zero, but that case is
+        // handled by the ldi case.
+        if (ResNonZero) {
+          SDValue Ops[4] = {HiResNonZero ? SDValue(HiRes, 0) : Zero64Val,
+                            CurDAG->getTargetConstant(64, DL, MVT::i32),
+                            CurDAG->getTargetConstant(32, DL, MVT::i32),
+                            SDValue(Res, 0)};
+
+          Res = CurDAG->getMachineNode(Mips::DINSU, DL, MVT::i64, Ops);
+        } else if (HiResNonZero) {
+          Res = CurDAG->getMachineNode(
+              Mips::DSLL32, DL, MVT::i64, SDValue(HiRes, 0),
+              CurDAG->getTargetConstant(0, DL, MVT::i32));
+        } else
+          llvm_unreachable(
+              "Zero splat value handled by non-zero 64bit splat synthesis!");
+
+        Res = CurDAG->getMachineNode(Mips::FILL_D, DL, MVT::v2i64, SDValue(Res, 0));
+      } else
+        llvm_unreachable("Unknown ABI in MipsISelDAGToDAG!");
+
+    } else
+      return false;
 
     if (ResVecTy != ViaVecTy) {
       // If LdiOp is writing to a different register class to ResVecTy, then
diff --git a/lib/Target/Mips/MipsSEISelLowering.cpp b/lib/Target/Mips/MipsSEISelLowering.cpp
index 69162d4f6e86c88b208d9c7a53ef5024e8bfa74b..e2da8477295b7d17e3936276cfc8484368d3bc6b 100644
--- a/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -1123,7 +1123,8 @@ MipsSETargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const {
   case ISD::MUL:
     return performMULCombine(N, DAG, DCI, this);
   case ISD::SHL:
-    return performSHLCombine(N, DAG, DCI, Subtarget);
+    Val = performSHLCombine(N, DAG, DCI, Subtarget);
+    break;
   case ISD::SRA:
     return performSRACombine(N, DAG, DCI, Subtarget);
   case ISD::SRL:
@@ -1643,7 +1644,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
     if (Op->getConstantOperandVal(3) >= EltTy.getSizeInBits())
       report_fatal_error("Immediate out of range");
     APInt Mask = APInt::getHighBitsSet(EltTy.getSizeInBits(),
-                                       Op->getConstantOperandVal(3));
+                                       Op->getConstantOperandVal(3) + 1);
     return DAG.getNode(ISD::VSELECT, DL, VecTy,
                        DAG.getConstant(Mask, DL, VecTy, true),
                        Op->getOperand(2), Op->getOperand(1));
@@ -1658,7 +1659,7 @@ SDValue MipsSETargetLowering::lowerINTRINSIC_WO_CHAIN(SDValue Op,
     if (Op->getConstantOperandVal(3) >= EltTy.getSizeInBits())
       report_fatal_error("Immediate out of range");
     APInt Mask = APInt::getLowBitsSet(EltTy.getSizeInBits(),
-                                      Op->getConstantOperandVal(3));
+                                      Op->getConstantOperandVal(3) + 1);
     return DAG.getNode(ISD::VSELECT, DL, VecTy,
                        DAG.getConstant(Mask, DL, VecTy, true),
                        Op->getOperand(2), Op->getOperand(1));
@@ -2529,11 +2530,10 @@ SDValue MipsSETargetLowering::lowerBUILD_VECTOR(SDValue Op,
         SplatBitSize != 64)
       return SDValue();
 
-    // If the value fits into a simm10 then we can use ldi.[bhwd]
-    // However, if it isn't an integer type we will have to bitcast from an
-    // integer type first. Also, if there are any undefs, we must lower them
-    // to defined values first.
-    if (ResTy.isInteger() && !HasAnyUndefs && SplatValue.isSignedIntN(10))
+    // If the value isn't an integer type we will have to bitcast
+    // from an integer type first. Also, if there are any undefs, we must
+    // lower them to defined values first.
+    if (ResTy.isInteger() && !HasAnyUndefs)
       return Op;
 
     EVT ViaVecTy;
diff --git a/lib/Target/Mips/MipsSEInstrInfo.cpp b/lib/Target/Mips/MipsSEInstrInfo.cpp
index ea703d0edd9638a764293257b922f8059eb6bb8a..91e712a7a54e8d15bd55bee542eb2a6fecab0c54 100644
--- a/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -540,11 +540,20 @@ unsigned MipsSEInstrInfo::getAnalyzableBrOpc(unsigned Opc) const {
 
 void MipsSEInstrInfo::expandRetRA(MachineBasicBlock &MBB,
                                   MachineBasicBlock::iterator I) const {
+
+  MachineInstrBuilder MIB;
   if (Subtarget.isGP64bit())
-    BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn64))
-        .addReg(Mips::RA_64);
+    MIB = BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn64))
+              .addReg(Mips::RA_64, RegState::Undef);
   else
-    BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn)).addReg(Mips::RA);
+    MIB = BuildMI(MBB, I, I->getDebugLoc(), get(Mips::PseudoReturn))
+              .addReg(Mips::RA, RegState::Undef);
+
+  // Retain any imp-use flags.
+  for (auto & MO : I->operands()) {
+    if (MO.isImplicit())
+      MIB.add(MO);
+  }
 }
 
 void MipsSEInstrInfo::expandERet(MachineBasicBlock &MBB,
diff --git a/lib/Target/Mips/MipsSubtarget.h b/lib/Target/Mips/MipsSubtarget.h
index 5f6fc7ca3ce581ad77f7013184c67ed331efd4c0..cca2cb8a46608f6ca9edb1c9bedf716c7ecb5385 100644
--- a/lib/Target/Mips/MipsSubtarget.h
+++ b/lib/Target/Mips/MipsSubtarget.h
@@ -236,6 +236,7 @@ public:
     return (HasSym32 && isABI_N64()) || isABI_N32() || isABI_O32();
   }
   bool isSingleFloat() const { return IsSingleFloat; }
+  bool isTargetELF() const { return TargetTriple.isOSBinFormatELF(); }
   bool hasVFPU() const { return HasVFPU; }
   bool inMips16Mode() const { return InMips16Mode; }
   bool inMips16ModeDefault() const {
@@ -277,6 +278,8 @@ public:
 
   bool isTargetNaCl() const { return TargetTriple.isOSNaCl(); }
 
+  bool isXRaySupported() const override { return true; }
+
   // for now constant islands are on for the whole compilation unit but we only
   // really use them if in addition we are in mips16 mode
   static bool useConstantIslands();
diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp
index bb48188e3b87bb7ebae30e05793f6c2b54536321..a45a9c4b41c37b62911fa64f1146884460f0820d 100644
--- a/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/lib/Target/Mips/MipsTargetMachine.cpp
@@ -11,27 +11,30 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MipsTargetMachine.h"
+#include "MCTargetDesc/MipsABIInfo.h"
+#include "MCTargetDesc/MipsMCTargetDesc.h"
 #include "Mips.h"
-#include "Mips16FrameLowering.h"
 #include "Mips16ISelDAGToDAG.h"
-#include "Mips16ISelLowering.h"
-#include "Mips16InstrInfo.h"
-#include "MipsFrameLowering.h"
-#include "MipsInstrInfo.h"
-#include "MipsSEFrameLowering.h"
 #include "MipsSEISelDAGToDAG.h"
-#include "MipsSEISelLowering.h"
-#include "MipsSEInstrInfo.h"
+#include "MipsSubtarget.h"
 #include "MipsTargetObjectFile.h"
+#include "MipsTargetMachine.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/BasicTTIImpl.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/CodeGen.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/TargetRegistry.h"
 #include "llvm/Support/raw_ostream.h"
-#include "llvm/Transforms/Scalar.h"
+#include "llvm/Target/TargetOptions.h"
+#include <string>
 
 using namespace llvm;
 
@@ -48,7 +51,7 @@ extern "C" void LLVMInitializeMipsTarget() {
 static std::string computeDataLayout(const Triple &TT, StringRef CPU,
                                      const TargetOptions &Options,
                                      bool isLittle) {
-  std::string Ret = "";
+  std::string Ret;
   MipsABIInfo ABI = MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions);
 
   // There are both little and big endian mips.
@@ -102,7 +105,7 @@ MipsTargetMachine::MipsTargetMachine(const Target &T, const Triple &TT,
     : LLVMTargetMachine(T, computeDataLayout(TT, CPU, Options, isLittle), TT,
                         CPU, FS, Options, getEffectiveRelocModel(CM, RM), CM,
                         OL),
-      isLittle(isLittle), TLOF(make_unique<MipsTargetObjectFile>()),
+      isLittle(isLittle), TLOF(llvm::make_unique<MipsTargetObjectFile>()),
       ABI(MipsABIInfo::computeTargetABI(TT, CPU, Options.MCOptions)),
       Subtarget(nullptr), DefaultSubtarget(TT, CPU, FS, isLittle, *this),
       NoMips16Subtarget(TT, CPU, FS.empty() ? "-mips16" : FS.str() + ",-mips16",
@@ -113,9 +116,9 @@ MipsTargetMachine::MipsTargetMachine(const Target &T, const Triple &TT,
   initAsmInfo();
 }
 
-MipsTargetMachine::~MipsTargetMachine() {}
+MipsTargetMachine::~MipsTargetMachine() = default;
 
-void MipsebTargetMachine::anchor() { }
+void MipsebTargetMachine::anchor() {}
 
 MipsebTargetMachine::MipsebTargetMachine(const Target &T, const Triple &TT,
                                          StringRef CPU, StringRef FS,
@@ -125,7 +128,7 @@ MipsebTargetMachine::MipsebTargetMachine(const Target &T, const Triple &TT,
                                          CodeGenOpt::Level OL)
     : MipsTargetMachine(T, TT, CPU, FS, Options, RM, CM, OL, false) {}
 
-void MipselTargetMachine::anchor() { }
+void MipselTargetMachine::anchor() {}
 
 MipselTargetMachine::MipselTargetMachine(const Target &T, const Triple &TT,
                                          StringRef CPU, StringRef FS,
@@ -182,10 +185,10 @@ void MipsTargetMachine::resetSubtarget(MachineFunction *MF) {
 
   Subtarget = const_cast<MipsSubtarget *>(getSubtargetImpl(*MF->getFunction()));
   MF->setSubtarget(Subtarget);
-  return;
 }
 
 namespace {
+
 /// Mips Code Generator Pass Configuration Options.
 class MipsPassConfig : public TargetPassConfig {
 public:
@@ -209,11 +212,10 @@ public:
   void addIRPasses() override;
   bool addInstSelector() override;
   void addPreEmitPass() override;
-
   void addPreRegAlloc() override;
-
 };
-} // namespace
+
+} // end anonymous namespace
 
 TargetPassConfig *MipsTargetMachine::createPassConfig(PassManagerBase &PM) {
   return new MipsPassConfig(this, PM);
diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h
index e4cf17e2abd85552218cea16b0951ef852067e4f..140d7133f879bf232056c195cacc3666ed17289d 100644
--- a/lib/Target/Mips/MipsTargetMachine.h
+++ b/lib/Target/Mips/MipsTargetMachine.h
@@ -1,4 +1,4 @@
-//===-- MipsTargetMachine.h - Define TargetMachine for Mips -----*- C++ -*-===//
+//===- MipsTargetMachine.h - Define TargetMachine for Mips ------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -16,15 +16,14 @@
 
 #include "MCTargetDesc/MipsABIInfo.h"
 #include "MipsSubtarget.h"
-#include "llvm/CodeGen/BasicTTIImpl.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/SelectionDAGISel.h"
-#include "llvm/Target/TargetFrameLowering.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/Support/CodeGen.h"
 #include "llvm/Target/TargetMachine.h"
+#include <memory>
 
 namespace llvm {
-class formatted_raw_ostream;
-class MipsRegisterInfo;
 
 class MipsTargetMachine : public LLVMTargetMachine {
   bool isLittle;
@@ -73,6 +72,7 @@ public:
 ///
 class MipsebTargetMachine : public MipsTargetMachine {
   virtual void anchor();
+
 public:
   MipsebTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                       StringRef FS, const TargetOptions &Options,
@@ -84,6 +84,7 @@ public:
 ///
 class MipselTargetMachine : public MipsTargetMachine {
   virtual void anchor();
+
 public:
   MipselTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
                       StringRef FS, const TargetOptions &Options,
@@ -91,6 +92,6 @@ public:
                       CodeGenOpt::Level OL);
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_LIB_TARGET_MIPS_MIPSTARGETMACHINE_H
diff --git a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
index 04ae3c2533d1f4075a352345b3cd8dec59e680fe..b774fe169d7109ecff915b91eb83a71fcc18c777 100644
--- a/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
+++ b/lib/Target/NVPTX/InstPrinter/NVPTXInstPrinter.cpp
@@ -64,6 +64,9 @@ void NVPTXInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
   case 7:
     OS << "%h";
     break;
+  case 8:
+    OS << "%hh";
+    break;
   }
 
   unsigned VReg = RegNo & 0x0FFFFFFF;
diff --git a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index b9473514e5e44e81244a041ca0afe0e06fded3fe..307ca6b99ffcb3778504f8adf469bee618a88ca0 100644
--- a/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -363,6 +363,8 @@ unsigned NVPTXAsmPrinter::encodeVirtualRegister(unsigned Reg) {
       Ret = (6 << 28);
     } else if (RC == &NVPTX::Float16RegsRegClass) {
       Ret = (7 << 28);
+    } else if (RC == &NVPTX::Float16x2RegsRegClass) {
+      Ret = (8 << 28);
     } else {
       report_fatal_error("Bad register class");
     }
@@ -1491,7 +1493,7 @@ void NVPTXAsmPrinter::printParamName(Function::const_arg_iterator I,
 
 void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
   const DataLayout &DL = getDataLayout();
-  const AttributeSet &PAL = F->getAttributes();
+  const AttributeList &PAL = F->getAttributes();
   const TargetLowering *TLI = nvptxSubtarget->getTargetLowering();
   Function::const_arg_iterator I, E;
   unsigned paramIndex = 0;
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 2aef67b9caf7b009a804046f525260377737313b..2749772540464d2634694422b900eea3359220da 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -84,6 +84,14 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) {
     if (tryStore(N))
       return;
     break;
+  case ISD::EXTRACT_VECTOR_ELT:
+    if (tryEXTRACT_VECTOR_ELEMENT(N))
+      return;
+    break;
+  case NVPTXISD::SETP_F16X2:
+    SelectSETP_F16X2(N);
+    return;
+
   case NVPTXISD::LoadV2:
   case NVPTXISD::LoadV4:
     if (tryLoadVector(N))
@@ -516,6 +524,127 @@ bool NVPTXDAGToDAGISel::tryConstantFP16(SDNode *N) {
   return true;
 }
 
+// Map an ISD::CondCode value to the CmpMode expected by
+// NVPTXInstPrinter::printCmpMode(), OR-ing in FTZ_FLAG when FTZ is set.
+static unsigned getPTXCmpMode(const CondCodeSDNode &CondCode, bool FTZ) {
+  using NVPTX::PTXCmpMode::CmpMode;
+  unsigned PTXCmpMode = [](ISD::CondCode CC) {
+    switch (CC) {
+    default:
+      llvm_unreachable("Unexpected condition code.");
+    case ISD::SETOEQ:
+      return CmpMode::EQ;
+    case ISD::SETOGT:
+      return CmpMode::GT;
+    case ISD::SETOGE:
+      return CmpMode::GE;
+    case ISD::SETOLT:
+      return CmpMode::LT;
+    case ISD::SETOLE:
+      return CmpMode::LE;
+    case ISD::SETONE:
+      return CmpMode::NE;
+    case ISD::SETO:
+      return CmpMode::NUM;
+    case ISD::SETUO:
+      return CmpMode::NotANumber;
+    case ISD::SETUEQ:
+      return CmpMode::EQU;
+    case ISD::SETUGT:
+      return CmpMode::GTU;
+    case ISD::SETUGE:
+      return CmpMode::GEU;
+    case ISD::SETULT:
+      return CmpMode::LTU;
+    case ISD::SETULE:
+      return CmpMode::LEU;
+    case ISD::SETUNE:
+      return CmpMode::NEU;
+    case ISD::SETEQ:
+      return CmpMode::EQ;
+    case ISD::SETGT:
+      return CmpMode::GT;
+    case ISD::SETGE:
+      return CmpMode::GE;
+    case ISD::SETLT:
+      return CmpMode::LT;
+    case ISD::SETLE:
+      return CmpMode::LE;
+    case ISD::SETNE:
+      return CmpMode::NE;
+    }
+  }(CondCode.get());
+
+  if (FTZ)
+    PTXCmpMode |= NVPTX::PTXCmpMode::FTZ_FLAG;
+
+  return PTXCmpMode;
+}
+
+bool NVPTXDAGToDAGISel::SelectSETP_F16X2(SDNode *N) {
+  unsigned PTXCmpMode =
+      getPTXCmpMode(*cast<CondCodeSDNode>(N->getOperand(2)), useF32FTZ());
+  SDLoc DL(N);
+  SDNode *SetP = CurDAG->getMachineNode(
+      NVPTX::SETP_f16x2rr, DL, MVT::i1, MVT::i1, N->getOperand(0),
+      N->getOperand(1), CurDAG->getTargetConstant(PTXCmpMode, DL, MVT::i32));
+  ReplaceNode(N, SetP);
+  return true;
+}
+
+// Find all instances of extract_vector_elt that use this v2f16 vector
+// and coalesce them into a scattering move instruction.
+bool NVPTXDAGToDAGISel::tryEXTRACT_VECTOR_ELEMENT(SDNode *N) {
+  SDValue Vector = N->getOperand(0);
+
+  // We only care about f16x2 as it's the only real vector type we
+  // need to deal with.
+  if (Vector.getSimpleValueType() != MVT::v2f16)
+    return false;
+
+  // Find and record all uses of this vector that extract element 0 or 1.
+  SmallVector<SDNode *, 4> E0, E1;
+  for (const auto &U : Vector.getNode()->uses()) {
+    if (U->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      continue;
+    if (U->getOperand(0) != Vector)
+      continue;
+    if (const ConstantSDNode *IdxConst =
+            dyn_cast<ConstantSDNode>(U->getOperand(1))) {
+      if (IdxConst->getZExtValue() == 0)
+        E0.push_back(U);
+      else if (IdxConst->getZExtValue() == 1)
+        E1.push_back(U);
+      else
+        llvm_unreachable("Invalid vector index.");
+    }
+  }
+
+  // There's no point scattering f16x2 if we only ever access one
+  // element of it.
+  if (E0.empty() || E1.empty())
+    return false;
+
+  unsigned Op = NVPTX::SplitF16x2;
+  // If the vector has been BITCAST'ed from i32, we can use the original
+  // value directly and avoid a register-to-register move.
+  SDValue Source = Vector;
+  if (Vector->getOpcode() == ISD::BITCAST) {
+    Op = NVPTX::SplitI32toF16x2;
+    Source = Vector->getOperand(0);
+  }
+  // Merge (f16 extractelt(V, 0), f16 extractelt(V, 1))
+  // into f16,f16 SplitF16x2(V) (or SplitI32toF16x2 for a BITCAST source).
+  SDNode *ScatterOp =
+      CurDAG->getMachineNode(Op, SDLoc(N), MVT::f16, MVT::f16, Source);
+  for (auto *Node : E0)
+    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 0));
+  for (auto *Node : E1)
+    ReplaceUses(SDValue(Node, 0), SDValue(ScatterOp, 1));
+
+  return true;
+}
+
 static unsigned int getCodeAddrSpace(MemSDNode *N) {
   const Value *Src = N->getMemOperand()->getValue();
 
@@ -661,6 +790,35 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) {
   }
 }
 
+// Helper function to reduce the amount of boilerplate code for opcode
+// selection. Returns None if no opcode is provided for the given VT.
+static Optional<unsigned> pickOpcodeForVT(
+    MVT::SimpleValueType VT, unsigned Opcode_i8, unsigned Opcode_i16,
+    unsigned Opcode_i32, Optional<unsigned> Opcode_i64, unsigned Opcode_f16,
+    unsigned Opcode_f16x2, unsigned Opcode_f32, Optional<unsigned> Opcode_f64) {
+  switch (VT) {
+  case MVT::i1:
+  case MVT::i8:
+    return Opcode_i8;
+  case MVT::i16:
+    return Opcode_i16;
+  case MVT::i32:
+    return Opcode_i32;
+  case MVT::i64:
+    return Opcode_i64;
+  case MVT::f16:
+    return Opcode_f16;
+  case MVT::v2f16:
+    return Opcode_f16x2;
+  case MVT::f32:
+    return Opcode_f32;
+  case MVT::f64:
+    return Opcode_f64;
+  default:
+    return None;
+  }
+}
+
 bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   SDLoc dl(N);
   LoadSDNode *LD = cast<LoadSDNode>(N);
@@ -689,29 +847,26 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
       codeAddrSpace != NVPTX::PTXLdStInstCode::GENERIC)
     isVolatile = false;
 
-  // Vector Setting
-  MVT SimpleVT = LoadedVT.getSimpleVT();
-  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
-  if (SimpleVT.isVector()) {
-    unsigned num = SimpleVT.getVectorNumElements();
-    if (num == 2)
-      vecType = NVPTX::PTXLdStInstCode::V2;
-    else if (num == 4)
-      vecType = NVPTX::PTXLdStInstCode::V4;
-    else
-      return false;
-  }
-
   // Type Setting: fromType + fromTypeWidth
   //
   // Sign   : ISD::SEXTLOAD
   // Unsign : ISD::ZEXTLOAD, ISD::NON_EXTLOAD or ISD::EXTLOAD and the
   //          type is integer
   // Float  : ISD::NON_EXTLOAD or ISD::EXTLOAD and the type is float
+  MVT SimpleVT = LoadedVT.getSimpleVT();
   MVT ScalarVT = SimpleVT.getScalarType();
   // Read at least 8 bits (predicates are stored as 8-bit values)
   unsigned fromTypeWidth = std::max(8U, ScalarVT.getSizeInBits());
   unsigned int fromType;
+
+  // Vector Setting
+  unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
+  if (SimpleVT.isVector()) {
+    assert(LoadedVT == MVT::v2f16 && "Unexpected vector type");
+    // v2f16 is loaded using ld.b32
+    fromTypeWidth = 32;
+  }
+
   if ((LD->getExtensionType() == ISD::SEXTLOAD))
     fromType = NVPTX::PTXLdStInstCode::Signed;
   else if (ScalarVT.isFloatingPoint())
@@ -726,187 +881,72 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   SDValue N1 = N->getOperand(1);
   SDValue Addr;
   SDValue Offset, Base;
-  unsigned Opcode;
+  Optional<unsigned> Opcode;
   MVT::SimpleValueType TargetVT = LD->getSimpleValueType(0).SimpleTy;
 
   if (SelectDirectAddr(N1, Addr)) {
-    switch (TargetVT) {
-    case MVT::i8:
-      Opcode = NVPTX::LD_i8_avar;
-      break;
-    case MVT::i16:
-      Opcode = NVPTX::LD_i16_avar;
-      break;
-    case MVT::i32:
-      Opcode = NVPTX::LD_i32_avar;
-      break;
-    case MVT::i64:
-      Opcode = NVPTX::LD_i64_avar;
-      break;
-    case MVT::f16:
-      Opcode = NVPTX::LD_f16_avar;
-      break;
-    case MVT::f32:
-      Opcode = NVPTX::LD_f32_avar;
-      break;
-    case MVT::f64:
-      Opcode = NVPTX::LD_f64_avar;
-      break;
-    default:
+    Opcode = pickOpcodeForVT(
+        TargetVT, NVPTX::LD_i8_avar, NVPTX::LD_i16_avar, NVPTX::LD_i32_avar,
+        NVPTX::LD_i64_avar, NVPTX::LD_f16_avar, NVPTX::LD_f16x2_avar,
+        NVPTX::LD_f32_avar, NVPTX::LD_f64_avar);
+    if (!Opcode)
       return false;
-    }
     SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
                       getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                       getI32Imm(fromTypeWidth, dl), Addr, Chain };
-    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
+                                     MVT::Other, Ops);
   } else if (TM.is64Bit() ? SelectADDRsi64(N1.getNode(), N1, Base, Offset)
                           : SelectADDRsi(N1.getNode(), N1, Base, Offset)) {
-    switch (TargetVT) {
-    case MVT::i8:
-      Opcode = NVPTX::LD_i8_asi;
-      break;
-    case MVT::i16:
-      Opcode = NVPTX::LD_i16_asi;
-      break;
-    case MVT::i32:
-      Opcode = NVPTX::LD_i32_asi;
-      break;
-    case MVT::i64:
-      Opcode = NVPTX::LD_i64_asi;
-      break;
-    case MVT::f16:
-      Opcode = NVPTX::LD_f16_asi;
-      break;
-    case MVT::f32:
-      Opcode = NVPTX::LD_f32_asi;
-      break;
-    case MVT::f64:
-      Opcode = NVPTX::LD_f64_asi;
-      break;
-    default:
+    Opcode = pickOpcodeForVT(TargetVT, NVPTX::LD_i8_asi, NVPTX::LD_i16_asi,
+                                 NVPTX::LD_i32_asi, NVPTX::LD_i64_asi,
+                                 NVPTX::LD_f16_asi, NVPTX::LD_f16x2_asi,
+                                 NVPTX::LD_f32_asi, NVPTX::LD_f64_asi);
+    if (!Opcode)
       return false;
-    }
     SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
                       getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                       getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
-    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
+                                     MVT::Other, Ops);
   } else if (TM.is64Bit() ? SelectADDRri64(N1.getNode(), N1, Base, Offset)
                           : SelectADDRri(N1.getNode(), N1, Base, Offset)) {
-    if (TM.is64Bit()) {
-      switch (TargetVT) {
-      case MVT::i8:
-        Opcode = NVPTX::LD_i8_ari_64;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::LD_i16_ari_64;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::LD_i32_ari_64;
-        break;
-      case MVT::i64:
-        Opcode = NVPTX::LD_i64_ari_64;
-        break;
-      case MVT::f16:
-        Opcode = NVPTX::LD_f16_ari_64;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::LD_f32_ari_64;
-        break;
-      case MVT::f64:
-        Opcode = NVPTX::LD_f64_ari_64;
-        break;
-      default:
-        return false;
-      }
-    } else {
-      switch (TargetVT) {
-      case MVT::i8:
-        Opcode = NVPTX::LD_i8_ari;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::LD_i16_ari;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::LD_i32_ari;
-        break;
-      case MVT::i64:
-        Opcode = NVPTX::LD_i64_ari;
-        break;
-      case MVT::f16:
-        Opcode = NVPTX::LD_f16_ari;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::LD_f32_ari;
-        break;
-      case MVT::f64:
-        Opcode = NVPTX::LD_f64_ari;
-        break;
-      default:
-        return false;
-      }
-    }
+    if (TM.is64Bit())
+      Opcode = pickOpcodeForVT(
+          TargetVT, NVPTX::LD_i8_ari_64, NVPTX::LD_i16_ari_64,
+          NVPTX::LD_i32_ari_64, NVPTX::LD_i64_ari_64, NVPTX::LD_f16_ari_64,
+          NVPTX::LD_f16x2_ari_64, NVPTX::LD_f32_ari_64, NVPTX::LD_f64_ari_64);
+    else
+      Opcode = pickOpcodeForVT(
+          TargetVT, NVPTX::LD_i8_ari, NVPTX::LD_i16_ari, NVPTX::LD_i32_ari,
+          NVPTX::LD_i64_ari, NVPTX::LD_f16_ari, NVPTX::LD_f16x2_ari,
+          NVPTX::LD_f32_ari, NVPTX::LD_f64_ari);
+    if (!Opcode)
+      return false;
     SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
                       getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                       getI32Imm(fromTypeWidth, dl), Base, Offset, Chain };
-    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
+                                     MVT::Other, Ops);
   } else {
-    if (TM.is64Bit()) {
-      switch (TargetVT) {
-      case MVT::i8:
-        Opcode = NVPTX::LD_i8_areg_64;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::LD_i16_areg_64;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::LD_i32_areg_64;
-        break;
-      case MVT::i64:
-        Opcode = NVPTX::LD_i64_areg_64;
-        break;
-      case MVT::f16:
-        Opcode = NVPTX::LD_f16_areg_64;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::LD_f32_areg_64;
-        break;
-      case MVT::f64:
-        Opcode = NVPTX::LD_f64_areg_64;
-        break;
-      default:
-        return false;
-      }
-    } else {
-      switch (TargetVT) {
-      case MVT::i8:
-        Opcode = NVPTX::LD_i8_areg;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::LD_i16_areg;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::LD_i32_areg;
-        break;
-      case MVT::i64:
-        Opcode = NVPTX::LD_i64_areg;
-        break;
-      case MVT::f16:
-        Opcode = NVPTX::LD_f16_areg;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::LD_f32_areg;
-        break;
-      case MVT::f64:
-        Opcode = NVPTX::LD_f64_areg;
-        break;
-      default:
-        return false;
-      }
-    }
+    if (TM.is64Bit())
+      Opcode = pickOpcodeForVT(
+          TargetVT, NVPTX::LD_i8_areg_64, NVPTX::LD_i16_areg_64,
+          NVPTX::LD_i32_areg_64, NVPTX::LD_i64_areg_64, NVPTX::LD_f16_areg_64,
+          NVPTX::LD_f16x2_areg_64, NVPTX::LD_f32_areg_64,
+          NVPTX::LD_f64_areg_64);
+    else
+      Opcode = pickOpcodeForVT(
+          TargetVT, NVPTX::LD_i8_areg, NVPTX::LD_i16_areg, NVPTX::LD_i32_areg,
+          NVPTX::LD_i64_areg, NVPTX::LD_f16_areg, NVPTX::LD_f16x2_areg,
+          NVPTX::LD_f32_areg, NVPTX::LD_f64_areg);
+    if (!Opcode)
+      return false;
     SDValue Ops[] = { getI32Imm(isVolatile, dl), getI32Imm(codeAddrSpace, dl),
                       getI32Imm(vecType, dl), getI32Imm(fromType, dl),
                       getI32Imm(fromTypeWidth, dl), N1, Chain };
-    NVPTXLD = CurDAG->getMachineNode(Opcode, dl, TargetVT, MVT::Other, Ops);
+    NVPTXLD = CurDAG->getMachineNode(Opcode.getValue(), dl, TargetVT,
+                                     MVT::Other, Ops);
   }
 
   if (!NVPTXLD)
@@ -925,7 +965,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
   SDValue Chain = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
   SDValue Addr, Offset, Base;
-  unsigned Opcode;
+  Optional<unsigned> Opcode;
   SDLoc DL(N);
   SDNode *LD;
   MemSDNode *MemSD = cast<MemSDNode>(N);
@@ -968,7 +1008,8 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
   if (ExtensionType == ISD::SEXTLOAD)
     FromType = NVPTX::PTXLdStInstCode::Signed;
   else if (ScalarVT.isFloatingPoint())
-    FromType = NVPTX::PTXLdStInstCode::Float;
+    FromType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
+                                             : NVPTX::PTXLdStInstCode::Float;
   else
     FromType = NVPTX::PTXLdStInstCode::Unsigned;
 
@@ -987,111 +1028,67 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
 
   EVT EltVT = N->getValueType(0);
 
+  // v8f16 is a special case. PTX doesn't have an ld.v8.f16
+  // instruction. Instead, we split the vector into v2f16 chunks and
+  // load them with ld.v4.b32.
+  if (EltVT == MVT::v2f16) {
+    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
+    EltVT = MVT::i32;
+    FromType = NVPTX::PTXLdStInstCode::Untyped;
+    FromTypeWidth = 32;
+  }
+
   if (SelectDirectAddr(Op1, Addr)) {
     switch (N->getOpcode()) {
     default:
       return false;
     case NVPTXISD::LoadV2:
-      switch (EltVT.getSimpleVT().SimpleTy) {
-      default:
-        return false;
-      case MVT::i8:
-        Opcode = NVPTX::LDV_i8_v2_avar;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::LDV_i16_v2_avar;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::LDV_i32_v2_avar;
-        break;
-      case MVT::i64:
-        Opcode = NVPTX::LDV_i64_v2_avar;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::LDV_f32_v2_avar;
-        break;
-      case MVT::f64:
-        Opcode = NVPTX::LDV_f64_v2_avar;
-        break;
-      }
+      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                               NVPTX::LDV_i8_v2_avar, NVPTX::LDV_i16_v2_avar,
+                               NVPTX::LDV_i32_v2_avar, NVPTX::LDV_i64_v2_avar,
+                               NVPTX::LDV_f16_v2_avar, NVPTX::LDV_f16x2_v2_avar,
+                               NVPTX::LDV_f32_v2_avar, NVPTX::LDV_f64_v2_avar);
       break;
     case NVPTXISD::LoadV4:
-      switch (EltVT.getSimpleVT().SimpleTy) {
-      default:
-        return false;
-      case MVT::i8:
-        Opcode = NVPTX::LDV_i8_v4_avar;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::LDV_i16_v4_avar;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::LDV_i32_v4_avar;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::LDV_f32_v4_avar;
-        break;
-      }
+      Opcode =
+          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_avar,
+                          NVPTX::LDV_i16_v4_avar, NVPTX::LDV_i32_v4_avar, None,
+                          NVPTX::LDV_f16_v4_avar, NVPTX::LDV_f16x2_v4_avar,
+                          NVPTX::LDV_f32_v4_avar, None);
       break;
     }
-
+    if (!Opcode)
+      return false;
     SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                       getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                       getI32Imm(FromTypeWidth, DL), Addr, Chain };
-    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
   } else if (TM.is64Bit() ? SelectADDRsi64(Op1.getNode(), Op1, Base, Offset)
                           : SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
     switch (N->getOpcode()) {
     default:
       return false;
     case NVPTXISD::LoadV2:
-      switch (EltVT.getSimpleVT().SimpleTy) {
-      default:
-        return false;
-      case MVT::i8:
-        Opcode = NVPTX::LDV_i8_v2_asi;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::LDV_i16_v2_asi;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::LDV_i32_v2_asi;
-        break;
-      case MVT::i64:
-        Opcode = NVPTX::LDV_i64_v2_asi;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::LDV_f32_v2_asi;
-        break;
-      case MVT::f64:
-        Opcode = NVPTX::LDV_f64_v2_asi;
-        break;
-      }
+      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                               NVPTX::LDV_i8_v2_asi, NVPTX::LDV_i16_v2_asi,
+                               NVPTX::LDV_i32_v2_asi, NVPTX::LDV_i64_v2_asi,
+                               NVPTX::LDV_f16_v2_asi, NVPTX::LDV_f16x2_v2_asi,
+                               NVPTX::LDV_f32_v2_asi, NVPTX::LDV_f64_v2_asi);
       break;
     case NVPTXISD::LoadV4:
-      switch (EltVT.getSimpleVT().SimpleTy) {
-      default:
-        return false;
-      case MVT::i8:
-        Opcode = NVPTX::LDV_i8_v4_asi;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::LDV_i16_v4_asi;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::LDV_i32_v4_asi;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::LDV_f32_v4_asi;
-        break;
-      }
+      Opcode =
+          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_asi,
+                          NVPTX::LDV_i16_v4_asi, NVPTX::LDV_i32_v4_asi, None,
+                          NVPTX::LDV_f16_v4_asi, NVPTX::LDV_f16x2_v4_asi,
+                          NVPTX::LDV_f32_v4_asi, None);
       break;
     }
-
+    if (!Opcode)
+      return false;
     SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                       getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                       getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
-    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
   } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                           : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
     if (TM.is64Bit()) {
@@ -1099,46 +1096,19 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
       default:
         return false;
       case NVPTXISD::LoadV2:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::LDV_i8_v2_ari_64;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::LDV_i16_v2_ari_64;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::LDV_i32_v2_ari_64;
-          break;
-        case MVT::i64:
-          Opcode = NVPTX::LDV_i64_v2_ari_64;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::LDV_f32_v2_ari_64;
-          break;
-        case MVT::f64:
-          Opcode = NVPTX::LDV_f64_v2_ari_64;
-          break;
-        }
+        Opcode = pickOpcodeForVT(
+            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_ari_64,
+            NVPTX::LDV_i16_v2_ari_64, NVPTX::LDV_i32_v2_ari_64,
+            NVPTX::LDV_i64_v2_ari_64, NVPTX::LDV_f16_v2_ari_64,
+            NVPTX::LDV_f16x2_v2_ari_64, NVPTX::LDV_f32_v2_ari_64,
+            NVPTX::LDV_f64_v2_ari_64);
         break;
       case NVPTXISD::LoadV4:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::LDV_i8_v4_ari_64;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::LDV_i16_v4_ari_64;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::LDV_i32_v4_ari_64;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::LDV_f32_v4_ari_64;
-          break;
-        }
+        Opcode = pickOpcodeForVT(
+            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari_64,
+            NVPTX::LDV_i16_v4_ari_64, NVPTX::LDV_i32_v4_ari_64, None,
+            NVPTX::LDV_f16_v4_ari_64, NVPTX::LDV_f16x2_v4_ari_64,
+            NVPTX::LDV_f32_v4_ari_64, None);
         break;
       }
     } else {
@@ -1146,101 +1116,47 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
       default:
         return false;
       case NVPTXISD::LoadV2:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::LDV_i8_v2_ari;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::LDV_i16_v2_ari;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::LDV_i32_v2_ari;
-          break;
-        case MVT::i64:
-          Opcode = NVPTX::LDV_i64_v2_ari;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::LDV_f32_v2_ari;
-          break;
-        case MVT::f64:
-          Opcode = NVPTX::LDV_f64_v2_ari;
-          break;
-        }
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                 NVPTX::LDV_i8_v2_ari, NVPTX::LDV_i16_v2_ari,
+                                 NVPTX::LDV_i32_v2_ari, NVPTX::LDV_i64_v2_ari,
+                                 NVPTX::LDV_f16_v2_ari, NVPTX::LDV_f16x2_v2_ari,
+                                 NVPTX::LDV_f32_v2_ari, NVPTX::LDV_f64_v2_ari);
         break;
       case NVPTXISD::LoadV4:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::LDV_i8_v4_ari;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::LDV_i16_v4_ari;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::LDV_i32_v4_ari;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::LDV_f32_v4_ari;
-          break;
-        }
+        Opcode =
+            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_ari,
+                            NVPTX::LDV_i16_v4_ari, NVPTX::LDV_i32_v4_ari, None,
+                            NVPTX::LDV_f16_v4_ari, NVPTX::LDV_f16x2_v4_ari,
+                            NVPTX::LDV_f32_v4_ari, None);
         break;
       }
     }
-
+    if (!Opcode)
+      return false;
     SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                       getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                       getI32Imm(FromTypeWidth, DL), Base, Offset, Chain };
 
-    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
   } else {
     if (TM.is64Bit()) {
       switch (N->getOpcode()) {
       default:
         return false;
       case NVPTXISD::LoadV2:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::LDV_i8_v2_areg_64;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::LDV_i16_v2_areg_64;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::LDV_i32_v2_areg_64;
-          break;
-        case MVT::i64:
-          Opcode = NVPTX::LDV_i64_v2_areg_64;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::LDV_f32_v2_areg_64;
-          break;
-        case MVT::f64:
-          Opcode = NVPTX::LDV_f64_v2_areg_64;
-          break;
-        }
+        Opcode = pickOpcodeForVT(
+            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg_64,
+            NVPTX::LDV_i16_v2_areg_64, NVPTX::LDV_i32_v2_areg_64,
+            NVPTX::LDV_i64_v2_areg_64, NVPTX::LDV_f16_v2_areg_64,
+            NVPTX::LDV_f16x2_v2_areg_64, NVPTX::LDV_f32_v2_areg_64,
+            NVPTX::LDV_f64_v2_areg_64);
         break;
       case NVPTXISD::LoadV4:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::LDV_i8_v4_areg_64;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::LDV_i16_v4_areg_64;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::LDV_i32_v4_areg_64;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::LDV_f32_v4_areg_64;
-          break;
-        }
+        Opcode = pickOpcodeForVT(
+            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg_64,
+            NVPTX::LDV_i16_v4_areg_64, NVPTX::LDV_i32_v4_areg_64, None,
+            NVPTX::LDV_f16_v4_areg_64, NVPTX::LDV_f16x2_v4_areg_64,
+            NVPTX::LDV_f32_v4_areg_64, None);
         break;
       }
     } else {
@@ -1248,54 +1164,28 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
       default:
         return false;
       case NVPTXISD::LoadV2:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::LDV_i8_v2_areg;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::LDV_i16_v2_areg;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::LDV_i32_v2_areg;
-          break;
-        case MVT::i64:
-          Opcode = NVPTX::LDV_i64_v2_areg;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::LDV_f32_v2_areg;
-          break;
-        case MVT::f64:
-          Opcode = NVPTX::LDV_f64_v2_areg;
-          break;
-        }
+        Opcode =
+            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v2_areg,
+                            NVPTX::LDV_i16_v2_areg, NVPTX::LDV_i32_v2_areg,
+                            NVPTX::LDV_i64_v2_areg, NVPTX::LDV_f16_v2_areg,
+                            NVPTX::LDV_f16x2_v2_areg, NVPTX::LDV_f32_v2_areg,
+                            NVPTX::LDV_f64_v2_areg);
         break;
       case NVPTXISD::LoadV4:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::LDV_i8_v4_areg;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::LDV_i16_v4_areg;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::LDV_i32_v4_areg;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::LDV_f32_v4_areg;
-          break;
-        }
+        Opcode = pickOpcodeForVT(
+            EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4_areg,
+            NVPTX::LDV_i16_v4_areg, NVPTX::LDV_i32_v4_areg, None,
+            NVPTX::LDV_f16_v4_areg, NVPTX::LDV_f16x2_v4_areg,
+            NVPTX::LDV_f32_v4_areg, None);
         break;
       }
     }
-
+    if (!Opcode)
+      return false;
     SDValue Ops[] = { getI32Imm(IsVolatile, DL), getI32Imm(CodeAddrSpace, DL),
                       getI32Imm(VecType, DL), getI32Imm(FromType, DL),
                       getI32Imm(FromTypeWidth, DL), Op1, Chain };
-    LD = CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops);
+    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, N->getVTList(), Ops);
   }
 
   MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
@@ -1338,7 +1228,7 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
     Mem = cast<MemSDNode>(N);
   }
 
-  unsigned Opcode;
+  Optional<unsigned> Opcode;
   SDLoc DL(N);
   SDNode *LD;
   SDValue Base, Offset, Addr;
@@ -1366,142 +1256,72 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
     default:
       return false;
     case ISD::INTRINSIC_W_CHAIN:
-      if (IsLDG) {
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8avar;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16avar;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32avar;
-          break;
-        case MVT::i64:
-          Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64avar;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32avar;
-          break;
-        case MVT::f64:
-          Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64avar;
-          break;
-        }
-      } else {
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8avar;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16avar;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32avar;
-          break;
-        case MVT::i64:
-          Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64avar;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32avar;
-          break;
-        case MVT::f64:
-          Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64avar;
-          break;
-        }
-      }
+      if (IsLDG)
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                     NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
+                                     NVPTX::INT_PTX_LDG_GLOBAL_i16avar,
+                                     NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
+                                     NVPTX::INT_PTX_LDG_GLOBAL_i64avar,
+                                     NVPTX::INT_PTX_LDG_GLOBAL_f16avar,
+                                     NVPTX::INT_PTX_LDG_GLOBAL_f16x2avar,
+                                     NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
+                                     NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
+      else
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                     NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
+                                     NVPTX::INT_PTX_LDU_GLOBAL_i16avar,
+                                     NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
+                                     NVPTX::INT_PTX_LDU_GLOBAL_i64avar,
+                                     NVPTX::INT_PTX_LDU_GLOBAL_f16avar,
+                                     NVPTX::INT_PTX_LDU_GLOBAL_f16x2avar,
+                                     NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
+                                     NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
       break;
     case NVPTXISD::LDGV2:
-      switch (EltVT.getSimpleVT().SimpleTy) {
-      default:
-        return false;
-      case MVT::i8:
-        Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar;
-        break;
-      case MVT::i64:
-        Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar;
-        break;
-      case MVT::f64:
-        Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar;
-        break;
-      }
+      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                   NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
+                                   NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
+                                   NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
+                                   NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
+                                   NVPTX::INT_PTX_LDG_G_v2f16_ELE_avar,
+                                   NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_avar,
+                                   NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
+                                   NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
       break;
     case NVPTXISD::LDUV2:
-      switch (EltVT.getSimpleVT().SimpleTy) {
-      default:
-        return false;
-      case MVT::i8:
-        Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar;
-        break;
-      case MVT::i64:
-        Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar;
-        break;
-      case MVT::f64:
-        Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar;
-        break;
-      }
+      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                   NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
+                                   NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
+                                   NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
+                                   NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
+                                   NVPTX::INT_PTX_LDU_G_v2f16_ELE_avar,
+                                   NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_avar,
+                                   NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
+                                   NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
       break;
     case NVPTXISD::LDGV4:
-      switch (EltVT.getSimpleVT().SimpleTy) {
-      default:
-        return false;
-      case MVT::i8:
-        Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar;
-        break;
-      }
+      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                               NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
+                               NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
+                               NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, None,
+                               NVPTX::INT_PTX_LDG_G_v4f16_ELE_avar,
+                               NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_avar,
+                               NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, None);
       break;
     case NVPTXISD::LDUV4:
-      switch (EltVT.getSimpleVT().SimpleTy) {
-      default:
-        return false;
-      case MVT::i8:
-        Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar;
-        break;
-      }
+      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                               NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
+                               NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
+                               NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, None,
+                               NVPTX::INT_PTX_LDU_G_v4f16_ELE_avar,
+                               NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_avar,
+                               NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, None);
       break;
     }
-
+    if (!Opcode)
+      return false;
     SDValue Ops[] = { Addr, Chain };
-    LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
+    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
   } else if (TM.is64Bit() ? SelectADDRri64(Op1.getNode(), Op1, Base, Offset)
                           : SelectADDRri(Op1.getNode(), Op1, Base, Offset)) {
     if (TM.is64Bit()) {
@@ -1510,139 +1330,68 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
         return false;
       case ISD::LOAD:
       case ISD::INTRINSIC_W_CHAIN:
-        if (IsLDG) {
-          switch (EltVT.getSimpleVT().SimpleTy) {
-          default:
-            return false;
-          case MVT::i8:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari64;
-            break;
-          case MVT::i16:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari64;
-            break;
-          case MVT::i32:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari64;
-            break;
-          case MVT::i64:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari64;
-            break;
-          case MVT::f32:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari64;
-            break;
-          case MVT::f64:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari64;
-            break;
-          }
-        } else {
-          switch (EltVT.getSimpleVT().SimpleTy) {
-          default:
-            return false;
-          case MVT::i8:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari64;
-            break;
-          case MVT::i16:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari64;
-            break;
-          case MVT::i32:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari64;
-            break;
-          case MVT::i64:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari64;
-            break;
-          case MVT::f32:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari64;
-            break;
-          case MVT::f64:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari64;
-            break;
-          }
-        }
+        if (IsLDG)
+          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_i8ari64,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_i16ari64,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_i32ari64,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_i64ari64,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_f16ari64,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari64,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_f32ari64,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_f64ari64);
+        else
+          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_i8ari64,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_i16ari64,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_i32ari64,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_i64ari64,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_f16ari64,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari64,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_f32ari64,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_f64ari64);
         break;
       case NVPTXISD::LoadV2:
       case NVPTXISD::LDGV2:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64;
-          break;
-        case MVT::i64:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64;
-          break;
-        case MVT::f64:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64;
-          break;
-        }
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                     NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari64,
+                                     NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari64,
+                                     NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari64,
+                                     NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari64,
+                                     NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari64,
+                                     NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari64,
+                                     NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari64,
+                                     NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari64);
         break;
       case NVPTXISD::LDUV2:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64;
-          break;
-        case MVT::i64:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64;
-          break;
-        case MVT::f64:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64;
-          break;
-        }
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                     NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari64,
+                                     NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari64,
+                                     NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari64,
+                                     NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari64,
+                                     NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari64,
+                                     NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari64,
+                                     NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari64,
+                                     NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari64);
         break;
       case NVPTXISD::LoadV4:
       case NVPTXISD::LDGV4:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64;
-          break;
-        }
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                 NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari64,
+                                 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari64,
+                                 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari64, None,
+                                 NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari64,
+                                 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari64,
+                                 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari64, None);
         break;
       case NVPTXISD::LDUV4:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64;
-          break;
-        }
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                 NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari64,
+                                 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari64,
+                                 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari64, None,
+                                 NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari64,
+                                 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari64,
+                                 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari64, None);
         break;
       }
     } else {
@@ -1651,146 +1400,75 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
         return false;
       case ISD::LOAD:
       case ISD::INTRINSIC_W_CHAIN:
-        if (IsLDG) {
-          switch (EltVT.getSimpleVT().SimpleTy) {
-          default:
-            return false;
-          case MVT::i8:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8ari;
-            break;
-          case MVT::i16:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16ari;
-            break;
-          case MVT::i32:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32ari;
-            break;
-          case MVT::i64:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64ari;
-            break;
-          case MVT::f32:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32ari;
-            break;
-          case MVT::f64:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64ari;
-            break;
-          }
-        } else {
-          switch (EltVT.getSimpleVT().SimpleTy) {
-          default:
-            return false;
-          case MVT::i8:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8ari;
-            break;
-          case MVT::i16:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16ari;
-            break;
-          case MVT::i32:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32ari;
-            break;
-          case MVT::i64:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64ari;
-            break;
-          case MVT::f32:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32ari;
-            break;
-          case MVT::f64:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64ari;
-            break;
-          }
-        }
+        if (IsLDG)
+          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_i8ari,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_i16ari,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_i32ari,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_i64ari,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_f16ari,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_f16x2ari,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_f32ari,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_f64ari);
+        else
+          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_i8ari,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_i16ari,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_i32ari,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_i64ari,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_f16ari,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_f16x2ari,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_f32ari,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_f64ari);
         break;
       case NVPTXISD::LoadV2:
       case NVPTXISD::LDGV2:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32;
-          break;
-        case MVT::i64:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32;
-          break;
-        case MVT::f64:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32;
-          break;
-        }
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                     NVPTX::INT_PTX_LDG_G_v2i8_ELE_ari32,
+                                     NVPTX::INT_PTX_LDG_G_v2i16_ELE_ari32,
+                                     NVPTX::INT_PTX_LDG_G_v2i32_ELE_ari32,
+                                     NVPTX::INT_PTX_LDG_G_v2i64_ELE_ari32,
+                                     NVPTX::INT_PTX_LDG_G_v2f16_ELE_ari32,
+                                     NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_ari32,
+                                     NVPTX::INT_PTX_LDG_G_v2f32_ELE_ari32,
+                                     NVPTX::INT_PTX_LDG_G_v2f64_ELE_ari32);
         break;
       case NVPTXISD::LDUV2:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32;
-          break;
-        case MVT::i64:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32;
-          break;
-        case MVT::f64:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32;
-          break;
-        }
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                     NVPTX::INT_PTX_LDU_G_v2i8_ELE_ari32,
+                                     NVPTX::INT_PTX_LDU_G_v2i16_ELE_ari32,
+                                     NVPTX::INT_PTX_LDU_G_v2i32_ELE_ari32,
+                                     NVPTX::INT_PTX_LDU_G_v2i64_ELE_ari32,
+                                     NVPTX::INT_PTX_LDU_G_v2f16_ELE_ari32,
+                                     NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_ari32,
+                                     NVPTX::INT_PTX_LDU_G_v2f32_ELE_ari32,
+                                     NVPTX::INT_PTX_LDU_G_v2f64_ELE_ari32);
         break;
       case NVPTXISD::LoadV4:
       case NVPTXISD::LDGV4:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32;
-          break;
-        }
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                 NVPTX::INT_PTX_LDG_G_v4i8_ELE_ari32,
+                                 NVPTX::INT_PTX_LDG_G_v4i16_ELE_ari32,
+                                 NVPTX::INT_PTX_LDG_G_v4i32_ELE_ari32, None,
+                                 NVPTX::INT_PTX_LDG_G_v4f16_ELE_ari32,
+                                 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_ari32,
+                                 NVPTX::INT_PTX_LDG_G_v4f32_ELE_ari32, None);
         break;
       case NVPTXISD::LDUV4:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32;
-          break;
-        }
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                 NVPTX::INT_PTX_LDU_G_v4i8_ELE_ari32,
+                                 NVPTX::INT_PTX_LDU_G_v4i16_ELE_ari32,
+                                 NVPTX::INT_PTX_LDU_G_v4i32_ELE_ari32, None,
+                                 NVPTX::INT_PTX_LDU_G_v4f16_ELE_ari32,
+                                 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_ari32,
+                                 NVPTX::INT_PTX_LDU_G_v4f32_ELE_ari32, None);
         break;
       }
     }
-
-    SDValue Ops[] = { Base, Offset, Chain };
-
-    LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
+    if (!Opcode)
+      return false;
+    SDValue Ops[] = {Base, Offset, Chain};
+    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
   } else {
     if (TM.is64Bit()) {
       switch (N->getOpcode()) {
@@ -1798,139 +1476,68 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
         return false;
       case ISD::LOAD:
       case ISD::INTRINSIC_W_CHAIN:
-        if (IsLDG) {
-          switch (EltVT.getSimpleVT().SimpleTy) {
-          default:
-            return false;
-          case MVT::i8:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg64;
-            break;
-          case MVT::i16:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg64;
-            break;
-          case MVT::i32:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg64;
-            break;
-          case MVT::i64:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg64;
-            break;
-          case MVT::f32:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg64;
-            break;
-          case MVT::f64:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg64;
-            break;
-          }
-        } else {
-          switch (EltVT.getSimpleVT().SimpleTy) {
-          default:
-            return false;
-          case MVT::i8:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg64;
-            break;
-          case MVT::i16:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg64;
-            break;
-          case MVT::i32:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg64;
-            break;
-          case MVT::i64:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg64;
-            break;
-          case MVT::f32:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg64;
-            break;
-          case MVT::f64:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg64;
-            break;
-          }
-        }
+        if (IsLDG)
+          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_i8areg64,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_i16areg64,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_i32areg64,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_i64areg64,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_f16areg64,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg64,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_f32areg64,
+                                       NVPTX::INT_PTX_LDG_GLOBAL_f64areg64);
+        else
+          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_i8areg64,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_i16areg64,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_i32areg64,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_i64areg64,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_f16areg64,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg64,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_f32areg64,
+                                       NVPTX::INT_PTX_LDU_GLOBAL_f64areg64);
         break;
       case NVPTXISD::LoadV2:
       case NVPTXISD::LDGV2:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64;
-          break;
-        case MVT::i64:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64;
-          break;
-        case MVT::f64:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64;
-          break;
-        }
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                     NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg64,
+                                     NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg64,
+                                     NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg64,
+                                     NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg64,
+                                     NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg64,
+                                     NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg64,
+                                     NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg64,
+                                     NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg64);
         break;
       case NVPTXISD::LDUV2:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64;
-          break;
-        case MVT::i64:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64;
-          break;
-        case MVT::f64:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64;
-          break;
-        }
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                     NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg64,
+                                     NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg64,
+                                     NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg64,
+                                     NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg64,
+                                     NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg64,
+                                     NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg64,
+                                     NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg64,
+                                     NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg64);
         break;
       case NVPTXISD::LoadV4:
       case NVPTXISD::LDGV4:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64;
-          break;
-        }
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                 NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg64,
+                                 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg64,
+                                 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg64, None,
+                                 NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg64,
+                                 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg64,
+                                 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg64, None);
         break;
       case NVPTXISD::LDUV4:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64;
-          break;
-        }
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                 NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg64,
+                                 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg64,
+                                 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg64, None,
+                                 NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg64,
+                                 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg64,
+                                 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg64, None);
         break;
       }
     } else {
@@ -1939,145 +1546,75 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
         return false;
       case ISD::LOAD:
       case ISD::INTRINSIC_W_CHAIN:
-        if (IsLDG) {
-          switch (EltVT.getSimpleVT().SimpleTy) {
-          default:
-            return false;
-          case MVT::i8:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i8areg;
-            break;
-          case MVT::i16:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i16areg;
-            break;
-          case MVT::i32:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i32areg;
-            break;
-          case MVT::i64:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_i64areg;
-            break;
-          case MVT::f32:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f32areg;
-            break;
-          case MVT::f64:
-            Opcode = NVPTX::INT_PTX_LDG_GLOBAL_f64areg;
-            break;
-          }
-        } else {
-          switch (EltVT.getSimpleVT().SimpleTy) {
-          default:
-            return false;
-          case MVT::i8:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i8areg;
-            break;
-          case MVT::i16:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i16areg;
-            break;
-          case MVT::i32:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i32areg;
-            break;
-          case MVT::i64:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_i64areg;
-            break;
-          case MVT::f32:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f32areg;
-            break;
-          case MVT::f64:
-            Opcode = NVPTX::INT_PTX_LDU_GLOBAL_f64areg;
-            break;
-          }
-        }
+        if (IsLDG)
+          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                   NVPTX::INT_PTX_LDG_GLOBAL_i8areg,
+                                   NVPTX::INT_PTX_LDG_GLOBAL_i16areg,
+                                   NVPTX::INT_PTX_LDG_GLOBAL_i32areg,
+                                   NVPTX::INT_PTX_LDG_GLOBAL_i64areg,
+                                   NVPTX::INT_PTX_LDG_GLOBAL_f16areg,
+                                   NVPTX::INT_PTX_LDG_GLOBAL_f16x2areg,
+                                   NVPTX::INT_PTX_LDG_GLOBAL_f32areg,
+                                   NVPTX::INT_PTX_LDG_GLOBAL_f64areg);
+        else
+          Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                   NVPTX::INT_PTX_LDU_GLOBAL_i8areg,
+                                   NVPTX::INT_PTX_LDU_GLOBAL_i16areg,
+                                   NVPTX::INT_PTX_LDU_GLOBAL_i32areg,
+                                   NVPTX::INT_PTX_LDU_GLOBAL_i64areg,
+                                   NVPTX::INT_PTX_LDU_GLOBAL_f16areg,
+                                   NVPTX::INT_PTX_LDU_GLOBAL_f16x2areg,
+                                   NVPTX::INT_PTX_LDU_GLOBAL_f32areg,
+                                   NVPTX::INT_PTX_LDU_GLOBAL_f64areg);
         break;
       case NVPTXISD::LoadV2:
       case NVPTXISD::LDGV2:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32;
-          break;
-        case MVT::i64:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32;
-          break;
-        case MVT::f64:
-          Opcode = NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32;
-          break;
-        }
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                 NVPTX::INT_PTX_LDG_G_v2i8_ELE_areg32,
+                                 NVPTX::INT_PTX_LDG_G_v2i16_ELE_areg32,
+                                 NVPTX::INT_PTX_LDG_G_v2i32_ELE_areg32,
+                                 NVPTX::INT_PTX_LDG_G_v2i64_ELE_areg32,
+                                 NVPTX::INT_PTX_LDG_G_v2f16_ELE_areg32,
+                                 NVPTX::INT_PTX_LDG_G_v2f16x2_ELE_areg32,
+                                 NVPTX::INT_PTX_LDG_G_v2f32_ELE_areg32,
+                                 NVPTX::INT_PTX_LDG_G_v2f64_ELE_areg32);
         break;
       case NVPTXISD::LDUV2:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32;
-          break;
-        case MVT::i64:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32;
-          break;
-        case MVT::f64:
-          Opcode = NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32;
-          break;
-        }
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                 NVPTX::INT_PTX_LDU_G_v2i8_ELE_areg32,
+                                 NVPTX::INT_PTX_LDU_G_v2i16_ELE_areg32,
+                                 NVPTX::INT_PTX_LDU_G_v2i32_ELE_areg32,
+                                 NVPTX::INT_PTX_LDU_G_v2i64_ELE_areg32,
+                                 NVPTX::INT_PTX_LDU_G_v2f16_ELE_areg32,
+                                 NVPTX::INT_PTX_LDU_G_v2f16x2_ELE_areg32,
+                                 NVPTX::INT_PTX_LDU_G_v2f32_ELE_areg32,
+                                 NVPTX::INT_PTX_LDU_G_v2f64_ELE_areg32);
         break;
       case NVPTXISD::LoadV4:
       case NVPTXISD::LDGV4:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32;
-          break;
-        }
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                 NVPTX::INT_PTX_LDG_G_v4i8_ELE_areg32,
+                                 NVPTX::INT_PTX_LDG_G_v4i16_ELE_areg32,
+                                 NVPTX::INT_PTX_LDG_G_v4i32_ELE_areg32, None,
+                                 NVPTX::INT_PTX_LDG_G_v4f16_ELE_areg32,
+                                 NVPTX::INT_PTX_LDG_G_v4f16x2_ELE_areg32,
+                                 NVPTX::INT_PTX_LDG_G_v4f32_ELE_areg32, None);
         break;
       case NVPTXISD::LDUV4:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32;
-          break;
-        }
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                 NVPTX::INT_PTX_LDU_G_v4i8_ELE_areg32,
+                                 NVPTX::INT_PTX_LDU_G_v4i16_ELE_areg32,
+                                 NVPTX::INT_PTX_LDU_G_v4i32_ELE_areg32, None,
+                                 NVPTX::INT_PTX_LDU_G_v4f16_ELE_areg32,
+                                 NVPTX::INT_PTX_LDU_G_v4f16x2_ELE_areg32,
+                                 NVPTX::INT_PTX_LDU_G_v4f32_ELE_areg32, None);
         break;
       }
     }
-
+    if (!Opcode)
+      return false;
     SDValue Ops[] = { Op1, Chain };
-    LD = CurDAG->getMachineNode(Opcode, DL, InstVTList, Ops);
+    LD = CurDAG->getMachineNode(Opcode.getValue(), DL, InstVTList, Ops);
   }
 
   MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
@@ -2151,21 +1688,18 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
   // Vector Setting
   MVT SimpleVT = StoreVT.getSimpleVT();
   unsigned vecType = NVPTX::PTXLdStInstCode::Scalar;
-  if (SimpleVT.isVector()) {
-    unsigned num = SimpleVT.getVectorNumElements();
-    if (num == 2)
-      vecType = NVPTX::PTXLdStInstCode::V2;
-    else if (num == 4)
-      vecType = NVPTX::PTXLdStInstCode::V4;
-    else
-      return false;
-  }
 
   // Type Setting: toType + toTypeWidth
   // - for integer type, always use 'u'
   //
   MVT ScalarVT = SimpleVT.getScalarType();
   unsigned toTypeWidth = ScalarVT.getSizeInBits();
+  if (SimpleVT.isVector()) {
+    assert(StoreVT == MVT::v2f16 && "Unexpected vector type");
+    // v2f16 is stored using st.b32
+    toTypeWidth = 32;
+  }
+
   unsigned int toType;
   if (ScalarVT.isFloatingPoint())
     // f16 uses .b16 as its storage type.
@@ -2180,191 +1714,73 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) {
   SDValue N2 = N->getOperand(2);
   SDValue Addr;
   SDValue Offset, Base;
-  unsigned Opcode;
+  Optional<unsigned> Opcode;
   MVT::SimpleValueType SourceVT = N1.getNode()->getSimpleValueType(0).SimpleTy;
 
   if (SelectDirectAddr(N2, Addr)) {
-    switch (SourceVT) {
-    case MVT::i8:
-      Opcode = NVPTX::ST_i8_avar;
-      break;
-    case MVT::i16:
-      Opcode = NVPTX::ST_i16_avar;
-      break;
-    case MVT::i32:
-      Opcode = NVPTX::ST_i32_avar;
-      break;
-    case MVT::i64:
-      Opcode = NVPTX::ST_i64_avar;
-      break;
-    case MVT::f16:
-      Opcode = NVPTX::ST_f16_avar;
-      break;
-    case MVT::f32:
-      Opcode = NVPTX::ST_f32_avar;
-      break;
-    case MVT::f64:
-      Opcode = NVPTX::ST_f64_avar;
-      break;
-    default:
+    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_avar, NVPTX::ST_i16_avar,
+                             NVPTX::ST_i32_avar, NVPTX::ST_i64_avar,
+                             NVPTX::ST_f16_avar, NVPTX::ST_f16x2_avar,
+                             NVPTX::ST_f32_avar, NVPTX::ST_f64_avar);
+    if (!Opcode)
       return false;
-    }
     SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
                       getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
                       getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Addr,
                       Chain };
-    NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+    NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
   } else if (TM.is64Bit() ? SelectADDRsi64(N2.getNode(), N2, Base, Offset)
                           : SelectADDRsi(N2.getNode(), N2, Base, Offset)) {
-    switch (SourceVT) {
-    case MVT::i8:
-      Opcode = NVPTX::ST_i8_asi;
-      break;
-    case MVT::i16:
-      Opcode = NVPTX::ST_i16_asi;
-      break;
-    case MVT::i32:
-      Opcode = NVPTX::ST_i32_asi;
-      break;
-    case MVT::i64:
-      Opcode = NVPTX::ST_i64_asi;
-      break;
-    case MVT::f16:
-      Opcode = NVPTX::ST_f16_asi;
-      break;
-    case MVT::f32:
-      Opcode = NVPTX::ST_f32_asi;
-      break;
-    case MVT::f64:
-      Opcode = NVPTX::ST_f64_asi;
-      break;
-    default:
+    Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_asi, NVPTX::ST_i16_asi,
+                             NVPTX::ST_i32_asi, NVPTX::ST_i64_asi,
+                             NVPTX::ST_f16_asi, NVPTX::ST_f16x2_asi,
+                             NVPTX::ST_f32_asi, NVPTX::ST_f64_asi);
+    if (!Opcode)
       return false;
-    }
     SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
                       getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
                       getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
                       Offset, Chain };
-    NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+    NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
   } else if (TM.is64Bit() ? SelectADDRri64(N2.getNode(), N2, Base, Offset)
                           : SelectADDRri(N2.getNode(), N2, Base, Offset)) {
-    if (TM.is64Bit()) {
-      switch (SourceVT) {
-      case MVT::i8:
-        Opcode = NVPTX::ST_i8_ari_64;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::ST_i16_ari_64;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::ST_i32_ari_64;
-        break;
-      case MVT::i64:
-        Opcode = NVPTX::ST_i64_ari_64;
-        break;
-      case MVT::f16:
-        Opcode = NVPTX::ST_f16_ari_64;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::ST_f32_ari_64;
-        break;
-      case MVT::f64:
-        Opcode = NVPTX::ST_f64_ari_64;
-        break;
-      default:
-        return false;
-      }
-    } else {
-      switch (SourceVT) {
-      case MVT::i8:
-        Opcode = NVPTX::ST_i8_ari;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::ST_i16_ari;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::ST_i32_ari;
-        break;
-      case MVT::i64:
-        Opcode = NVPTX::ST_i64_ari;
-        break;
-      case MVT::f16:
-        Opcode = NVPTX::ST_f16_ari;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::ST_f32_ari;
-        break;
-      case MVT::f64:
-        Opcode = NVPTX::ST_f64_ari;
-        break;
-      default:
-        return false;
-      }
-    }
+    if (TM.is64Bit())
+      Opcode = pickOpcodeForVT(
+          SourceVT, NVPTX::ST_i8_ari_64, NVPTX::ST_i16_ari_64,
+          NVPTX::ST_i32_ari_64, NVPTX::ST_i64_ari_64, NVPTX::ST_f16_ari_64,
+          NVPTX::ST_f16x2_ari_64, NVPTX::ST_f32_ari_64, NVPTX::ST_f64_ari_64);
+    else
+      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_ari, NVPTX::ST_i16_ari,
+                               NVPTX::ST_i32_ari, NVPTX::ST_i64_ari,
+                               NVPTX::ST_f16_ari, NVPTX::ST_f16x2_ari,
+                               NVPTX::ST_f32_ari, NVPTX::ST_f64_ari);
+    if (!Opcode)
+      return false;
+
     SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
                       getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
                       getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), Base,
                       Offset, Chain };
-    NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+    NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
   } else {
-    if (TM.is64Bit()) {
-      switch (SourceVT) {
-      case MVT::i8:
-        Opcode = NVPTX::ST_i8_areg_64;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::ST_i16_areg_64;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::ST_i32_areg_64;
-        break;
-      case MVT::i64:
-        Opcode = NVPTX::ST_i64_areg_64;
-        break;
-      case MVT::f16:
-        Opcode = NVPTX::ST_f16_areg_64;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::ST_f32_areg_64;
-        break;
-      case MVT::f64:
-        Opcode = NVPTX::ST_f64_areg_64;
-        break;
-      default:
-        return false;
-      }
-    } else {
-      switch (SourceVT) {
-      case MVT::i8:
-        Opcode = NVPTX::ST_i8_areg;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::ST_i16_areg;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::ST_i32_areg;
-        break;
-      case MVT::i64:
-        Opcode = NVPTX::ST_i64_areg;
-        break;
-      case MVT::f16:
-        Opcode = NVPTX::ST_f16_areg;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::ST_f32_areg;
-        break;
-      case MVT::f64:
-        Opcode = NVPTX::ST_f64_areg;
-        break;
-      default:
-        return false;
-      }
-    }
+    if (TM.is64Bit())
+      Opcode =
+          pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg_64, NVPTX::ST_i16_areg_64,
+                          NVPTX::ST_i32_areg_64, NVPTX::ST_i64_areg_64,
+                          NVPTX::ST_f16_areg_64, NVPTX::ST_f16x2_areg_64,
+                          NVPTX::ST_f32_areg_64, NVPTX::ST_f64_areg_64);
+    else
+      Opcode = pickOpcodeForVT(SourceVT, NVPTX::ST_i8_areg, NVPTX::ST_i16_areg,
+                               NVPTX::ST_i32_areg, NVPTX::ST_i64_areg,
+                               NVPTX::ST_f16_areg, NVPTX::ST_f16x2_areg,
+                               NVPTX::ST_f32_areg, NVPTX::ST_f64_areg);
+    if (!Opcode)
+      return false;
     SDValue Ops[] = { N1, getI32Imm(isVolatile, dl),
                       getI32Imm(codeAddrSpace, dl), getI32Imm(vecType, dl),
                       getI32Imm(toType, dl), getI32Imm(toTypeWidth, dl), N2,
                       Chain };
-    NVPTXST = CurDAG->getMachineNode(Opcode, dl, MVT::Other, Ops);
+    NVPTXST = CurDAG->getMachineNode(Opcode.getValue(), dl, MVT::Other, Ops);
   }
 
   if (!NVPTXST)
@@ -2381,7 +1797,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
   SDValue Chain = N->getOperand(0);
   SDValue Op1 = N->getOperand(1);
   SDValue Addr, Offset, Base;
-  unsigned Opcode;
+  Optional<unsigned> Opcode;
   SDLoc DL(N);
   SDNode *ST;
   EVT EltVT = Op1.getValueType();
@@ -2411,7 +1827,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
   unsigned ToTypeWidth = ScalarVT.getSizeInBits();
   unsigned ToType;
   if (ScalarVT.isFloatingPoint())
-    ToType = NVPTX::PTXLdStInstCode::Float;
+    ToType = ScalarVT.SimpleTy == MVT::f16 ? NVPTX::PTXLdStInstCode::Untyped
+                                           : NVPTX::PTXLdStInstCode::Float;
   else
     ToType = NVPTX::PTXLdStInstCode::Unsigned;
 
@@ -2438,6 +1855,16 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
     return false;
   }
 
+  // v8f16 is a special case. PTX doesn't have st.v8.f16
+  // instruction. Instead, we split the vector into v2f16 chunks and
+  // store them with st.v4.b32.
+  if (EltVT == MVT::v2f16) {
+    assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
+    EltVT = MVT::i32;
+    ToType = NVPTX::PTXLdStInstCode::Untyped;
+    ToTypeWidth = 32;
+  }
+
   StOps.push_back(getI32Imm(IsVolatile, DL));
   StOps.push_back(getI32Imm(CodeAddrSpace, DL));
   StOps.push_back(getI32Imm(VecType, DL));
@@ -2449,46 +1876,18 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
     default:
       return false;
     case NVPTXISD::StoreV2:
-      switch (EltVT.getSimpleVT().SimpleTy) {
-      default:
-        return false;
-      case MVT::i8:
-        Opcode = NVPTX::STV_i8_v2_avar;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::STV_i16_v2_avar;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::STV_i32_v2_avar;
-        break;
-      case MVT::i64:
-        Opcode = NVPTX::STV_i64_v2_avar;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::STV_f32_v2_avar;
-        break;
-      case MVT::f64:
-        Opcode = NVPTX::STV_f64_v2_avar;
-        break;
-      }
+      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                               NVPTX::STV_i8_v2_avar, NVPTX::STV_i16_v2_avar,
+                               NVPTX::STV_i32_v2_avar, NVPTX::STV_i64_v2_avar,
+                               NVPTX::STV_f16_v2_avar, NVPTX::STV_f16x2_v2_avar,
+                               NVPTX::STV_f32_v2_avar, NVPTX::STV_f64_v2_avar);
       break;
     case NVPTXISD::StoreV4:
-      switch (EltVT.getSimpleVT().SimpleTy) {
-      default:
-        return false;
-      case MVT::i8:
-        Opcode = NVPTX::STV_i8_v4_avar;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::STV_i16_v4_avar;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::STV_i32_v4_avar;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::STV_f32_v4_avar;
-        break;
-      }
+      Opcode =
+          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_avar,
+                          NVPTX::STV_i16_v4_avar, NVPTX::STV_i32_v4_avar, None,
+                          NVPTX::STV_f16_v4_avar, NVPTX::STV_f16x2_v4_avar,
+                          NVPTX::STV_f32_v4_avar, None);
       break;
     }
     StOps.push_back(Addr);
@@ -2498,46 +1897,18 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
     default:
       return false;
     case NVPTXISD::StoreV2:
-      switch (EltVT.getSimpleVT().SimpleTy) {
-      default:
-        return false;
-      case MVT::i8:
-        Opcode = NVPTX::STV_i8_v2_asi;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::STV_i16_v2_asi;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::STV_i32_v2_asi;
-        break;
-      case MVT::i64:
-        Opcode = NVPTX::STV_i64_v2_asi;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::STV_f32_v2_asi;
-        break;
-      case MVT::f64:
-        Opcode = NVPTX::STV_f64_v2_asi;
-        break;
-      }
+      Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                               NVPTX::STV_i8_v2_asi, NVPTX::STV_i16_v2_asi,
+                               NVPTX::STV_i32_v2_asi, NVPTX::STV_i64_v2_asi,
+                               NVPTX::STV_f16_v2_asi, NVPTX::STV_f16x2_v2_asi,
+                               NVPTX::STV_f32_v2_asi, NVPTX::STV_f64_v2_asi);
       break;
     case NVPTXISD::StoreV4:
-      switch (EltVT.getSimpleVT().SimpleTy) {
-      default:
-        return false;
-      case MVT::i8:
-        Opcode = NVPTX::STV_i8_v4_asi;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::STV_i16_v4_asi;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::STV_i32_v4_asi;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::STV_f32_v4_asi;
-        break;
-      }
+      Opcode =
+          pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_asi,
+                          NVPTX::STV_i16_v4_asi, NVPTX::STV_i32_v4_asi, None,
+                          NVPTX::STV_f16_v4_asi, NVPTX::STV_f16x2_v4_asi,
+                          NVPTX::STV_f32_v4_asi, None);
       break;
     }
     StOps.push_back(Base);
@@ -2549,46 +1920,19 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
       default:
         return false;
       case NVPTXISD::StoreV2:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::STV_i8_v2_ari_64;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::STV_i16_v2_ari_64;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::STV_i32_v2_ari_64;
-          break;
-        case MVT::i64:
-          Opcode = NVPTX::STV_i64_v2_ari_64;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::STV_f32_v2_ari_64;
-          break;
-        case MVT::f64:
-          Opcode = NVPTX::STV_f64_v2_ari_64;
-          break;
-        }
+        Opcode = pickOpcodeForVT(
+            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_ari_64,
+            NVPTX::STV_i16_v2_ari_64, NVPTX::STV_i32_v2_ari_64,
+            NVPTX::STV_i64_v2_ari_64, NVPTX::STV_f16_v2_ari_64,
+            NVPTX::STV_f16x2_v2_ari_64, NVPTX::STV_f32_v2_ari_64,
+            NVPTX::STV_f64_v2_ari_64);
         break;
       case NVPTXISD::StoreV4:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::STV_i8_v4_ari_64;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::STV_i16_v4_ari_64;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::STV_i32_v4_ari_64;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::STV_f32_v4_ari_64;
-          break;
-        }
+        Opcode = pickOpcodeForVT(
+            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari_64,
+            NVPTX::STV_i16_v4_ari_64, NVPTX::STV_i32_v4_ari_64, None,
+            NVPTX::STV_f16_v4_ari_64, NVPTX::STV_f16x2_v4_ari_64,
+            NVPTX::STV_f32_v4_ari_64, None);
         break;
       }
     } else {
@@ -2596,46 +1940,18 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
       default:
         return false;
       case NVPTXISD::StoreV2:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::STV_i8_v2_ari;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::STV_i16_v2_ari;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::STV_i32_v2_ari;
-          break;
-        case MVT::i64:
-          Opcode = NVPTX::STV_i64_v2_ari;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::STV_f32_v2_ari;
-          break;
-        case MVT::f64:
-          Opcode = NVPTX::STV_f64_v2_ari;
-          break;
-        }
+        Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
+                                 NVPTX::STV_i8_v2_ari, NVPTX::STV_i16_v2_ari,
+                                 NVPTX::STV_i32_v2_ari, NVPTX::STV_i64_v2_ari,
+                                 NVPTX::STV_f16_v2_ari, NVPTX::STV_f16x2_v2_ari,
+                                 NVPTX::STV_f32_v2_ari, NVPTX::STV_f64_v2_ari);
         break;
       case NVPTXISD::StoreV4:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::STV_i8_v4_ari;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::STV_i16_v4_ari;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::STV_i32_v4_ari;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::STV_f32_v4_ari;
-          break;
-        }
+        Opcode =
+            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_ari,
+                            NVPTX::STV_i16_v4_ari, NVPTX::STV_i32_v4_ari, None,
+                            NVPTX::STV_f16_v4_ari, NVPTX::STV_f16x2_v4_ari,
+                            NVPTX::STV_f32_v4_ari, None);
         break;
       }
     }
@@ -2647,46 +1963,19 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
       default:
         return false;
       case NVPTXISD::StoreV2:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::STV_i8_v2_areg_64;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::STV_i16_v2_areg_64;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::STV_i32_v2_areg_64;
-          break;
-        case MVT::i64:
-          Opcode = NVPTX::STV_i64_v2_areg_64;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::STV_f32_v2_areg_64;
-          break;
-        case MVT::f64:
-          Opcode = NVPTX::STV_f64_v2_areg_64;
-          break;
-        }
+        Opcode = pickOpcodeForVT(
+            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg_64,
+            NVPTX::STV_i16_v2_areg_64, NVPTX::STV_i32_v2_areg_64,
+            NVPTX::STV_i64_v2_areg_64, NVPTX::STV_f16_v2_areg_64,
+            NVPTX::STV_f16x2_v2_areg_64, NVPTX::STV_f32_v2_areg_64,
+            NVPTX::STV_f64_v2_areg_64);
         break;
       case NVPTXISD::StoreV4:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::STV_i8_v4_areg_64;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::STV_i16_v4_areg_64;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::STV_i32_v4_areg_64;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::STV_f32_v4_areg_64;
-          break;
-        }
+        Opcode = pickOpcodeForVT(
+            EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg_64,
+            NVPTX::STV_i16_v4_areg_64, NVPTX::STV_i32_v4_areg_64, None,
+            NVPTX::STV_f16_v4_areg_64, NVPTX::STV_f16x2_v4_areg_64,
+            NVPTX::STV_f32_v4_areg_64, None);
         break;
       }
     } else {
@@ -2694,55 +1983,31 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
       default:
         return false;
       case NVPTXISD::StoreV2:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::STV_i8_v2_areg;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::STV_i16_v2_areg;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::STV_i32_v2_areg;
-          break;
-        case MVT::i64:
-          Opcode = NVPTX::STV_i64_v2_areg;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::STV_f32_v2_areg;
-          break;
-        case MVT::f64:
-          Opcode = NVPTX::STV_f64_v2_areg;
-          break;
-        }
+        Opcode =
+            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v2_areg,
+                            NVPTX::STV_i16_v2_areg, NVPTX::STV_i32_v2_areg,
+                            NVPTX::STV_i64_v2_areg, NVPTX::STV_f16_v2_areg,
+                            NVPTX::STV_f16x2_v2_areg, NVPTX::STV_f32_v2_areg,
+                            NVPTX::STV_f64_v2_areg);
         break;
       case NVPTXISD::StoreV4:
-        switch (EltVT.getSimpleVT().SimpleTy) {
-        default:
-          return false;
-        case MVT::i8:
-          Opcode = NVPTX::STV_i8_v4_areg;
-          break;
-        case MVT::i16:
-          Opcode = NVPTX::STV_i16_v4_areg;
-          break;
-        case MVT::i32:
-          Opcode = NVPTX::STV_i32_v4_areg;
-          break;
-        case MVT::f32:
-          Opcode = NVPTX::STV_f32_v4_areg;
-          break;
-        }
+        Opcode =
+            pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4_areg,
+                            NVPTX::STV_i16_v4_areg, NVPTX::STV_i32_v4_areg, None,
+                            NVPTX::STV_f16_v4_areg, NVPTX::STV_f16x2_v4_areg,
+                            NVPTX::STV_f32_v4_areg, None);
         break;
       }
     }
     StOps.push_back(N2);
   }
 
+  if (!Opcode)
+    return false;
+
   StOps.push_back(Chain);
 
-  ST = CurDAG->getMachineNode(Opcode, DL, MVT::Other, StOps);
+  ST = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, StOps);
 
   MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
   MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
@@ -2777,90 +2042,36 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
   EVT EltVT = Node->getValueType(0);
   EVT MemVT = Mem->getMemoryVT();
 
-  unsigned Opc = 0;
+  Optional<unsigned> Opcode;
 
   switch (VecSize) {
   default:
     return false;
   case 1:
-    switch (MemVT.getSimpleVT().SimpleTy) {
-    default:
-      return false;
-    case MVT::i1:
-      Opc = NVPTX::LoadParamMemI8;
-      break;
-    case MVT::i8:
-      Opc = NVPTX::LoadParamMemI8;
-      break;
-    case MVT::i16:
-      Opc = NVPTX::LoadParamMemI16;
-      break;
-    case MVT::i32:
-      Opc = NVPTX::LoadParamMemI32;
-      break;
-    case MVT::i64:
-      Opc = NVPTX::LoadParamMemI64;
-      break;
-    case MVT::f16:
-      Opc = NVPTX::LoadParamMemF16;
-      break;
-    case MVT::f32:
-      Opc = NVPTX::LoadParamMemF32;
-      break;
-    case MVT::f64:
-      Opc = NVPTX::LoadParamMemF64;
-      break;
-    }
+    Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy,
+                             NVPTX::LoadParamMemI8, NVPTX::LoadParamMemI16,
+                             NVPTX::LoadParamMemI32, NVPTX::LoadParamMemI64,
+                             NVPTX::LoadParamMemF16, NVPTX::LoadParamMemF16x2,
+                             NVPTX::LoadParamMemF32, NVPTX::LoadParamMemF64);
     break;
   case 2:
-    switch (MemVT.getSimpleVT().SimpleTy) {
-    default:
-      return false;
-    case MVT::i1:
-      Opc = NVPTX::LoadParamMemV2I8;
-      break;
-    case MVT::i8:
-      Opc = NVPTX::LoadParamMemV2I8;
-      break;
-    case MVT::i16:
-      Opc = NVPTX::LoadParamMemV2I16;
-      break;
-    case MVT::i32:
-      Opc = NVPTX::LoadParamMemV2I32;
-      break;
-    case MVT::i64:
-      Opc = NVPTX::LoadParamMemV2I64;
-      break;
-    case MVT::f32:
-      Opc = NVPTX::LoadParamMemV2F32;
-      break;
-    case MVT::f64:
-      Opc = NVPTX::LoadParamMemV2F64;
-      break;
-    }
+    Opcode =
+        pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV2I8,
+                        NVPTX::LoadParamMemV2I16, NVPTX::LoadParamMemV2I32,
+                        NVPTX::LoadParamMemV2I64, NVPTX::LoadParamMemV2F16,
+                        NVPTX::LoadParamMemV2F16x2, NVPTX::LoadParamMemV2F32,
+                        NVPTX::LoadParamMemV2F64);
     break;
   case 4:
-    switch (MemVT.getSimpleVT().SimpleTy) {
-    default:
-      return false;
-    case MVT::i1:
-      Opc = NVPTX::LoadParamMemV4I8;
-      break;
-    case MVT::i8:
-      Opc = NVPTX::LoadParamMemV4I8;
-      break;
-    case MVT::i16:
-      Opc = NVPTX::LoadParamMemV4I16;
-      break;
-    case MVT::i32:
-      Opc = NVPTX::LoadParamMemV4I32;
-      break;
-    case MVT::f32:
-      Opc = NVPTX::LoadParamMemV4F32;
-      break;
-    }
+    Opcode = pickOpcodeForVT(
+        MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8,
+        NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, None,
+        NVPTX::LoadParamMemV4F16, NVPTX::LoadParamMemV4F16x2,
+        NVPTX::LoadParamMemV4F32, None);
     break;
   }
+  if (!Opcode)
+    return false;
 
   SDVTList VTs;
   if (VecSize == 1) {
@@ -2879,7 +2090,7 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) {
   Ops.push_back(Chain);
   Ops.push_back(Flag);
 
-  ReplaceNode(Node, CurDAG->getMachineNode(Opc, DL, VTs, Ops));
+  ReplaceNode(Node, CurDAG->getMachineNode(Opcode.getValue(), DL, VTs, Ops));
   return true;
 }
 
@@ -2916,92 +2127,36 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) {
   // Determine target opcode
   // If we have an i1, use an 8-bit store. The lowering code in
   // NVPTXISelLowering will have already emitted an upcast.
-  unsigned Opcode = 0;
+  Optional<unsigned> Opcode = 0;
   switch (NumElts) {
   default:
     return false;
   case 1:
-    switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
-    default:
-      return false;
-    case MVT::i1:
-      Opcode = NVPTX::StoreRetvalI8;
-      break;
-    case MVT::i8:
-      Opcode = NVPTX::StoreRetvalI8;
-      break;
-    case MVT::i16:
-      Opcode = NVPTX::StoreRetvalI16;
-      break;
-    case MVT::i32:
-      Opcode = NVPTX::StoreRetvalI32;
-      break;
-    case MVT::i64:
-      Opcode = NVPTX::StoreRetvalI64;
-      break;
-    case MVT::f16:
-      Opcode = NVPTX::StoreRetvalF16;
-      break;
-    case MVT::f32:
-      Opcode = NVPTX::StoreRetvalF32;
-      break;
-    case MVT::f64:
-      Opcode = NVPTX::StoreRetvalF64;
-      break;
-    }
+    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+                             NVPTX::StoreRetvalI8, NVPTX::StoreRetvalI16,
+                             NVPTX::StoreRetvalI32, NVPTX::StoreRetvalI64,
+                             NVPTX::StoreRetvalF16, NVPTX::StoreRetvalF16x2,
+                             NVPTX::StoreRetvalF32, NVPTX::StoreRetvalF64);
     break;
   case 2:
-    switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
-    default:
-      return false;
-    case MVT::i1:
-      Opcode = NVPTX::StoreRetvalV2I8;
-      break;
-    case MVT::i8:
-      Opcode = NVPTX::StoreRetvalV2I8;
-      break;
-    case MVT::i16:
-      Opcode = NVPTX::StoreRetvalV2I16;
-      break;
-    case MVT::i32:
-      Opcode = NVPTX::StoreRetvalV2I32;
-      break;
-    case MVT::i64:
-      Opcode = NVPTX::StoreRetvalV2I64;
-      break;
-    case MVT::f32:
-      Opcode = NVPTX::StoreRetvalV2F32;
-      break;
-    case MVT::f64:
-      Opcode = NVPTX::StoreRetvalV2F64;
-      break;
-    }
+    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+                             NVPTX::StoreRetvalV2I8, NVPTX::StoreRetvalV2I16,
+                             NVPTX::StoreRetvalV2I32, NVPTX::StoreRetvalV2I64,
+                             NVPTX::StoreRetvalV2F16, NVPTX::StoreRetvalV2F16x2,
+                             NVPTX::StoreRetvalV2F32, NVPTX::StoreRetvalV2F64);
     break;
   case 4:
-    switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
-    default:
-      return false;
-    case MVT::i1:
-      Opcode = NVPTX::StoreRetvalV4I8;
-      break;
-    case MVT::i8:
-      Opcode = NVPTX::StoreRetvalV4I8;
-      break;
-    case MVT::i16:
-      Opcode = NVPTX::StoreRetvalV4I16;
-      break;
-    case MVT::i32:
-      Opcode = NVPTX::StoreRetvalV4I32;
-      break;
-    case MVT::f32:
-      Opcode = NVPTX::StoreRetvalV4F32;
-      break;
-    }
+    Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+                             NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16,
+                             NVPTX::StoreRetvalV4I32, None,
+                             NVPTX::StoreRetvalV4F16, NVPTX::StoreRetvalV4F16x2,
+                             NVPTX::StoreRetvalV4F32, None);
     break;
   }
+  if (!Opcode)
+    return false;
 
-  SDNode *Ret =
-      CurDAG->getMachineNode(Opcode, DL, MVT::Other, Ops);
+  SDNode *Ret = CurDAG->getMachineNode(Opcode.getValue(), DL, MVT::Other, Ops);
   MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
   MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
   cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
@@ -3050,91 +2205,36 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
   // Determine target opcode
   // If we have an i1, use an 8-bit store. The lowering code in
   // NVPTXISelLowering will have already emitted an upcast.
-  unsigned Opcode = 0;
+  Optional<unsigned> Opcode = 0;
   switch (N->getOpcode()) {
   default:
     switch (NumElts) {
     default:
       return false;
     case 1:
-      switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
-      default:
-        return false;
-      case MVT::i1:
-        Opcode = NVPTX::StoreParamI8;
-        break;
-      case MVT::i8:
-        Opcode = NVPTX::StoreParamI8;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::StoreParamI16;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::StoreParamI32;
-        break;
-      case MVT::i64:
-        Opcode = NVPTX::StoreParamI64;
-        break;
-      case MVT::f16:
-        Opcode = NVPTX::StoreParamF16;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::StoreParamF32;
-        break;
-      case MVT::f64:
-        Opcode = NVPTX::StoreParamF64;
-        break;
-      }
+      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+                               NVPTX::StoreParamI8, NVPTX::StoreParamI16,
+                               NVPTX::StoreParamI32, NVPTX::StoreParamI64,
+                               NVPTX::StoreParamF16, NVPTX::StoreParamF16x2,
+                               NVPTX::StoreParamF32, NVPTX::StoreParamF64);
       break;
     case 2:
-      switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
-      default:
-        return false;
-      case MVT::i1:
-        Opcode = NVPTX::StoreParamV2I8;
-        break;
-      case MVT::i8:
-        Opcode = NVPTX::StoreParamV2I8;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::StoreParamV2I16;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::StoreParamV2I32;
-        break;
-      case MVT::i64:
-        Opcode = NVPTX::StoreParamV2I64;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::StoreParamV2F32;
-        break;
-      case MVT::f64:
-        Opcode = NVPTX::StoreParamV2F64;
-        break;
-      }
+      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+                               NVPTX::StoreParamV2I8, NVPTX::StoreParamV2I16,
+                               NVPTX::StoreParamV2I32, NVPTX::StoreParamV2I64,
+                               NVPTX::StoreParamV2F16, NVPTX::StoreParamV2F16x2,
+                               NVPTX::StoreParamV2F32, NVPTX::StoreParamV2F64);
       break;
     case 4:
-      switch (Mem->getMemoryVT().getSimpleVT().SimpleTy) {
-      default:
-        return false;
-      case MVT::i1:
-        Opcode = NVPTX::StoreParamV4I8;
-        break;
-      case MVT::i8:
-        Opcode = NVPTX::StoreParamV4I8;
-        break;
-      case MVT::i16:
-        Opcode = NVPTX::StoreParamV4I16;
-        break;
-      case MVT::i32:
-        Opcode = NVPTX::StoreParamV4I32;
-        break;
-      case MVT::f32:
-        Opcode = NVPTX::StoreParamV4F32;
-        break;
-      }
+      Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy,
+                               NVPTX::StoreParamV4I8, NVPTX::StoreParamV4I16,
+                               NVPTX::StoreParamV4I32, None,
+                               NVPTX::StoreParamV4F16, NVPTX::StoreParamV4F16x2,
+                               NVPTX::StoreParamV4F32, None);
       break;
     }
+    if (!Opcode)
+      return false;
     break;
   // Special case: if we have a sign-extend/zero-extend node, insert the
   // conversion instruction first, and use that as the value operand to
@@ -3161,7 +2261,7 @@ bool NVPTXDAGToDAGISel::tryStoreParam(SDNode *N) {
 
   SDVTList RetVTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
   SDNode *Ret =
-      CurDAG->getMachineNode(Opcode, DL, RetVTs, Ops);
+      CurDAG->getMachineNode(Opcode.getValue(), DL, RetVTs, Ops);
   MachineSDNode::mmo_iterator MemRefs0 = MF->allocateMemRefsArray(1);
   MemRefs0[0] = cast<MemSDNode>(N)->getMemOperand();
   cast<MachineSDNode>(Ret)->setMemRefs(MemRefs0, MemRefs0 + 1);
diff --git a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
index 889575cdf7cbabaa750b9e9556205ff77262fca4..8fc38e7c461223345519ec0358d024ab0882d2ca 100644
--- a/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
+++ b/lib/Target/NVPTX/NVPTXISelDAGToDAG.h
@@ -71,6 +71,8 @@ private:
   bool trySurfaceIntrinsic(SDNode *N);
   bool tryBFE(SDNode *N);
   bool tryConstantFP16(SDNode *N);
+  bool SelectSETP_F16X2(SDNode *N);
+  bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
 
   inline SDValue getI32Imm(unsigned Imm, const SDLoc &DL) {
     return CurDAG->getTargetConstant(Imm, DL, MVT::i32);
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 1fb42496d95525101053219a8f86c0bfd70d8b74..36e4382777cf23c773128920633acdcbaef9c191 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -146,6 +146,9 @@ static bool IsPTXVectorType(MVT VT) {
   case MVT::v2i32:
   case MVT::v4i32:
   case MVT::v2i64:
+  case MVT::v2f16:
+  case MVT::v4f16:
+  case MVT::v8f16: // <4 x f16x2>
   case MVT::v2f32:
   case MVT::v4f32:
   case MVT::v2f64:
@@ -170,13 +173,24 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
   for (unsigned i = 0, e = TempVTs.size(); i != e; ++i) {
     EVT VT = TempVTs[i];
     uint64_t Off = TempOffsets[i];
-    if (VT.isVector())
-      for (unsigned j = 0, je = VT.getVectorNumElements(); j != je; ++j) {
-        ValueVTs.push_back(VT.getVectorElementType());
+    // Split vectors into individual elements, except for v2f16, which
+    // we will pass as a single scalar.
+    if (VT.isVector()) {
+      unsigned NumElts = VT.getVectorNumElements();
+      EVT EltVT = VT.getVectorElementType();
+      // Vectors with an even number of f16 elements will be passed to
+      // us as an array of v2f16 elements. We must match this so we
+      // stay in sync with Ins/Outs.
+      if (EltVT == MVT::f16 && NumElts % 2 == 0) {
+        EltVT = MVT::v2f16;
+        NumElts /= 2;
+      }
+      for (unsigned j = 0; j != NumElts; ++j) {
+        ValueVTs.push_back(EltVT);
         if (Offsets)
-          Offsets->push_back(Off+j*VT.getVectorElementType().getStoreSize());
+          Offsets->push_back(Off + j * EltVT.getStoreSize());
       }
-    else {
+    } else {
       ValueVTs.push_back(VT);
       if (Offsets)
         Offsets->push_back(Off);
@@ -184,6 +198,125 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
   }
 }
 
+// Check whether the loads/stores of several consecutive pieces of a
+// flattened function parameter or return value can be merged into a
+// single vector load/store.
+//
+// The flattened parameter is described by the parallel lists ValueVTs
+// and Offsets, and the whole structure is aligned to ParamAlignment.
+// Returns the number of pieces (2 or 4) that one vector op of
+// AccessSize bytes, starting at piece Idx, would cover. Returns 1 if
+// the pieces cannot be merged (insufficient alignment, mismatched
+// types, non-contiguous offsets, or an unsupported element count).
+static unsigned CanMergeParamLoadStoresStartingAt(
+    unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
+    const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) {
+  assert(isPowerOf2_32(AccessSize) && "must be a power of 2!");
+
+  // Can't vectorize if the parameter is aligned to less than AccessSize.
+  if (AccessSize > ParamAlignment)
+    return 1;
+  // Can't vectorize if the first piece's offset is not AccessSize-aligned.
+  if (Offsets[Idx] & (AccessSize - 1))
+    return 1;
+
+  EVT EltVT = ValueVTs[Idx];
+  unsigned EltSize = EltVT.getStoreSize();
+
+  // A single element already fills (or exceeds) the access; nothing to merge.
+  if (EltSize >= AccessSize)
+    return 1;
+
+  unsigned NumElts = AccessSize / EltSize;
+  // Can't vectorize if AccessSize is not a multiple of EltSize.
+  if (AccessSize != EltSize * NumElts)
+    return 1;
+
+  // Not enough pieces left, starting at Idx, to fill the vector.
+  if (Idx + NumElts > ValueVTs.size())
+    return 1;
+
+  // PTX ISA can only deal with 2- and 4-element vector ops.
+  if (NumElts != 4 && NumElts != 2)
+    return 1;
+
+  for (unsigned j = Idx + 1; j < Idx + NumElts; ++j) {
+    // Every piece must have the same type as the first one.
+    if (ValueVTs[j] != EltVT)
+      return 1;
+
+    // Pieces must be laid out back to back, EltSize bytes apart.
+    if (Offsets[j] - Offsets[j - 1] != EltSize)
+      return 1;
+  }
+  // OK. We can vectorize ValueVTs[Idx..Idx+NumElts)
+  return NumElts;
+}
+
+// Flags marking where each piece of a flattened function parameter or
+// return value sits within the vector load/store that covers it.
+enum ParamVectorizationFlags {
+  PVF_INNER = 0x0, // Middle element of a vector (neither first nor last).
+  PVF_FIRST = 0x1, // First element of the vector.
+  PVF_LAST = 0x2,  // Last element of the vector.
+  // A scalar is a 1-element vector: it is both the first and last element.
+  PVF_SCALAR = PVF_FIRST | PVF_LAST
+};
+
+// Decides, for every piece of a flattened function parameter or
+// return value, whether it is loaded/stored on its own or as part of
+// a 2- or 4-element vector op.
+//
+// ValueVTs and Offsets describe the flattened pieces; the whole
+// parameter is aligned to ParamAlignment bytes. The result has one
+// ParamVectorizationFlags entry per piece: PVF_SCALAR for a lone
+// piece, otherwise PVF_FIRST / PVF_INNER / PVF_LAST within a vector.
+static SmallVector<ParamVectorizationFlags, 16>
+VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
+                     const SmallVectorImpl<uint64_t> &Offsets,
+                     unsigned ParamAlignment) {
+  // Set vector size to match ValueVTs and mark all elements as
+  // scalars by default.
+  SmallVector<ParamVectorizationFlags, 16> VectorInfo;
+  VectorInfo.assign(ValueVTs.size(), PVF_SCALAR);
+
+  // Check what we can vectorize using 128/64/32/16-bit accesses.
+  for (int I = 0, E = ValueVTs.size(); I != E; ++I) {
+    // Pieces already folded into a vector are skipped via the I += N below.
+    assert(VectorInfo[I] == PVF_SCALAR && "Unexpected vector info state.");
+    for (unsigned AccessSize : {16, 8, 4, 2}) {
+      unsigned NumElts = CanMergeParamLoadStoresStartingAt(
+          I, AccessSize, ValueVTs, Offsets, ParamAlignment);
+      // Mark vectorized elements.
+      switch (NumElts) {
+      default:
+        llvm_unreachable("Unexpected return value");
+      case 1:
+        // Can't vectorize using this size, try next smaller size.
+        continue;
+      case 2:
+        assert(I + 1 < E && "Not enough elements.");
+        VectorInfo[I] = PVF_FIRST;
+        VectorInfo[I + 1] = PVF_LAST;
+        I += 1;
+        break;
+      case 4:
+        assert(I + 3 < E && "Not enough elements.");
+        VectorInfo[I] = PVF_FIRST;
+        VectorInfo[I + 1] = PVF_INNER;
+        VectorInfo[I + 2] = PVF_INNER;
+        VectorInfo[I + 3] = PVF_LAST;
+        I += 3;
+        break;
+      }
+      // Sizes are tried largest first, so the first success is the widest
+      // usable access; stop trying smaller sizes for this piece.
+      break;
+    }
+  }
+  return VectorInfo;
+}
+
 // NVPTXTargetLowering Constructor.
 NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
                                          const NVPTXSubtarget &STI)
@@ -212,6 +345,11 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   else
     setSchedulingPreference(Sched::Source);
 
+  auto setFP16OperationAction = [&](unsigned Op, MVT VT, LegalizeAction Action,
+                                    LegalizeAction NoF16Action) {
+    setOperationAction(Op, VT, STI.allowFP16Math() ? Action : NoF16Action);
+  };
+
   addRegisterClass(MVT::i1, &NVPTX::Int1RegsRegClass);
   addRegisterClass(MVT::i16, &NVPTX::Int16RegsRegClass);
   addRegisterClass(MVT::i32, &NVPTX::Int32RegsRegClass);
@@ -219,13 +357,20 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   addRegisterClass(MVT::f32, &NVPTX::Float32RegsRegClass);
   addRegisterClass(MVT::f64, &NVPTX::Float64RegsRegClass);
   addRegisterClass(MVT::f16, &NVPTX::Float16RegsRegClass);
+  addRegisterClass(MVT::v2f16, &NVPTX::Float16x2RegsRegClass);
 
-  setOperationAction(ISD::SETCC, MVT::f16,
-                     STI.allowFP16Math() ? Legal : Promote);
+  // Conversion to/from FP16/FP16x2 is always legal.
+  setOperationAction(ISD::SINT_TO_FP, MVT::f16, Legal);
+  setOperationAction(ISD::FP_TO_SINT, MVT::f16, Legal);
+  setOperationAction(ISD::BUILD_VECTOR, MVT::v2f16, Custom);
+  setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v2f16, Custom);
+
+  setFP16OperationAction(ISD::SETCC, MVT::f16, Legal, Promote);
+  setFP16OperationAction(ISD::SETCC, MVT::v2f16, Legal, Expand);
 
   // Operations not directly supported by NVPTX.
-  setOperationAction(ISD::SELECT_CC, MVT::f16,
-                     STI.allowFP16Math() ? Expand : Promote);
+  setOperationAction(ISD::SELECT_CC, MVT::f16, Expand);
+  setOperationAction(ISD::SELECT_CC, MVT::v2f16, Expand);
   setOperationAction(ISD::SELECT_CC, MVT::f32, Expand);
   setOperationAction(ISD::SELECT_CC, MVT::f64, Expand);
   setOperationAction(ISD::SELECT_CC, MVT::i1, Expand);
@@ -233,8 +378,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   setOperationAction(ISD::SELECT_CC, MVT::i16, Expand);
   setOperationAction(ISD::SELECT_CC, MVT::i32, Expand);
   setOperationAction(ISD::SELECT_CC, MVT::i64, Expand);
-  setOperationAction(ISD::BR_CC, MVT::f16,
-                     STI.allowFP16Math() ? Expand : Promote);
+  setOperationAction(ISD::BR_CC, MVT::f16, Expand);
+  setOperationAction(ISD::BR_CC, MVT::v2f16, Expand);
   setOperationAction(ISD::BR_CC, MVT::f32, Expand);
   setOperationAction(ISD::BR_CC, MVT::f64, Expand);
   setOperationAction(ISD::BR_CC, MVT::i1, Expand);
@@ -374,58 +519,53 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   setTargetDAGCombine(ISD::SREM);
   setTargetDAGCombine(ISD::UREM);
 
-  if (!STI.allowFP16Math()) {
-    // Promote fp16 arithmetic if fp16 hardware isn't available or the
-    // user passed --nvptx-no-fp16-math. The flag is useful because,
-    // although sm_53+ GPUs have some sort of FP16 support in
-    // hardware, only sm_53 and sm_60 have full implementation. Others
-    // only have token amount of hardware and are likely to run faster
-    // by using fp32 units instead.
-    setOperationAction(ISD::FADD, MVT::f16, Promote);
-    setOperationAction(ISD::FMUL, MVT::f16, Promote);
-    setOperationAction(ISD::FSUB, MVT::f16, Promote);
-    setOperationAction(ISD::FMA, MVT::f16, Promote);
+  // setcc for f16x2 needs special handling to prevent legalizer's
+  // attempt to scalarize it due to v2i1 not being legal.
+  if (STI.allowFP16Math())
+    setTargetDAGCombine(ISD::SETCC);
+
+  // Promote fp16 arithmetic if fp16 hardware isn't available or the
+  // user passed --nvptx-no-fp16-math. The flag is useful because,
+  // although sm_53+ GPUs have some sort of FP16 support in
+  // hardware, only sm_53 and sm_60 have full implementation. Others
+  // only have token amount of hardware and are likely to run faster
+  // by using fp32 units instead.
+  for (const auto &Op : {ISD::FADD, ISD::FMUL, ISD::FSUB, ISD::FMA}) {
+    setFP16OperationAction(Op, MVT::f16, Legal, Promote);
+    setFP16OperationAction(Op, MVT::v2f16, Legal, Expand);
   }
-  // There's no neg.f16 instruction.
+
+  // There's no neg.f16 instruction. Expand to (0-x).
   setOperationAction(ISD::FNEG, MVT::f16, Expand);
+  setOperationAction(ISD::FNEG, MVT::v2f16, Expand);
 
-  // Library functions.  These default to Expand, but we have instructions
-  // for them.
-  setOperationAction(ISD::FCEIL,  MVT::f16, Legal);
-  setOperationAction(ISD::FCEIL,  MVT::f32, Legal);
-  setOperationAction(ISD::FCEIL,  MVT::f64, Legal);
-  setOperationAction(ISD::FFLOOR, MVT::f16, Legal);
-  setOperationAction(ISD::FFLOOR, MVT::f32, Legal);
-  setOperationAction(ISD::FFLOOR, MVT::f64, Legal);
-  setOperationAction(ISD::FNEARBYINT, MVT::f32, Legal);
-  setOperationAction(ISD::FNEARBYINT, MVT::f64, Legal);
-  setOperationAction(ISD::FRINT, MVT::f16, Legal);
-  setOperationAction(ISD::FRINT,  MVT::f32, Legal);
-  setOperationAction(ISD::FRINT,  MVT::f64, Legal);
-  setOperationAction(ISD::FROUND, MVT::f16, Legal);
-  setOperationAction(ISD::FROUND, MVT::f32, Legal);
-  setOperationAction(ISD::FROUND, MVT::f64, Legal);
-  setOperationAction(ISD::FTRUNC, MVT::f16, Legal);
-  setOperationAction(ISD::FTRUNC, MVT::f32, Legal);
-  setOperationAction(ISD::FTRUNC, MVT::f64, Legal);
-  setOperationAction(ISD::FMINNUM, MVT::f32, Legal);
-  setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
-  setOperationAction(ISD::FMAXNUM, MVT::f32, Legal);
-  setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+  // (would be) Library functions.
+
+  // These map to conversion instructions for scalar FP types.
+  for (const auto &Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FRINT,
+                         ISD::FROUND, ISD::FTRUNC}) {
+    setOperationAction(Op, MVT::f16, Legal);
+    setOperationAction(Op, MVT::f32, Legal);
+    setOperationAction(Op, MVT::f64, Legal);
+    setOperationAction(Op, MVT::v2f16, Expand);
+  }
 
   // 'Expand' implements FCOPYSIGN without calling an external library.
   setOperationAction(ISD::FCOPYSIGN, MVT::f16, Expand);
+  setOperationAction(ISD::FCOPYSIGN, MVT::v2f16, Expand);
   setOperationAction(ISD::FCOPYSIGN, MVT::f32, Expand);
   setOperationAction(ISD::FCOPYSIGN, MVT::f64, Expand);
 
-  // FP16 does not support these nodes in hardware, but we can perform
-  // these ops using single-precision hardware.
-  setOperationAction(ISD::FDIV, MVT::f16, Promote);
-  setOperationAction(ISD::FREM, MVT::f16, Promote);
-  setOperationAction(ISD::FSQRT, MVT::f16, Promote);
-  setOperationAction(ISD::FSIN, MVT::f16, Promote);
-  setOperationAction(ISD::FCOS, MVT::f16, Promote);
-  setOperationAction(ISD::FABS, MVT::f16, Promote);
+  // These map to corresponding instructions for f32/f64. f16 must be
+  // promoted to f32. v2f16 is expanded to f16, which is then promoted
+  // to f32.
+  for (const auto &Op : {ISD::FDIV, ISD::FREM, ISD::FSQRT, ISD::FSIN, ISD::FCOS,
+                         ISD::FABS, ISD::FMINNUM, ISD::FMAXNUM}) {
+    setOperationAction(Op, MVT::f16, Promote);
+    setOperationAction(Op, MVT::f32, Legal);
+    setOperationAction(Op, MVT::f64, Legal);
+    setOperationAction(Op, MVT::v2f16, Expand);
+  }
   setOperationAction(ISD::FMINNUM, MVT::f16, Promote);
   setOperationAction(ISD::FMAXNUM, MVT::f16, Promote);
   setOperationAction(ISD::FMINNAN, MVT::f16, Promote);
@@ -541,6 +681,8 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const {
     return "NVPTXISD::FUN_SHFR_CLAMP";
   case NVPTXISD::IMAD:
     return "NVPTXISD::IMAD";
+  case NVPTXISD::SETP_F16X2:
+    return "NVPTXISD::SETP_F16X2";
   case NVPTXISD::Dummy:
     return "NVPTXISD::Dummy";
   case NVPTXISD::MUL_WIDE_SIGNED:
@@ -1039,10 +1181,60 @@ TargetLoweringBase::LegalizeTypeAction
 NVPTXTargetLowering::getPreferredVectorAction(EVT VT) const {
   if (VT.getVectorNumElements() != 1 && VT.getScalarType() == MVT::i1)
     return TypeSplitVector;
-
+  if (VT == MVT::v2f16)
+    return TypeLegal;
   return TargetLoweringBase::getPreferredVectorAction(VT);
 }
 
+SDValue NVPTXTargetLowering::getSqrtEstimate(SDValue Operand, SelectionDAG &DAG,
+                                             int Enabled, int &ExtraSteps,
+                                             bool &UseOneConst,
+                                             bool Reciprocal) const {
+  if (!(Enabled == ReciprocalEstimate::Enabled ||
+        (Enabled == ReciprocalEstimate::Unspecified && !usePrecSqrtF32())))
+    return SDValue();
+
+  if (ExtraSteps == ReciprocalEstimate::Unspecified)
+    ExtraSteps = 0;
+
+  SDLoc DL(Operand);
+  EVT VT = Operand.getValueType();
+  bool Ftz = useF32FTZ(DAG.getMachineFunction());
+
+  auto MakeIntrinsicCall = [&](Intrinsic::ID IID) {
+    return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
+                       DAG.getConstant(IID, DL, MVT::i32), Operand);
+  };
+
+  // The sqrt and rsqrt refinement processes assume we always start out with an
+  // approximation of the rsqrt.  Therefore, if we're going to do any refinement
+  // (i.e. ExtraSteps > 0), we must return an rsqrt.  But if we're *not* doing
+  // any refinement, we must return a regular sqrt.
+  if (Reciprocal || ExtraSteps > 0) {
+    if (VT == MVT::f32)
+      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_rsqrt_approx_ftz_f
+                                   : Intrinsic::nvvm_rsqrt_approx_f);
+    else if (VT == MVT::f64)
+      return MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d);
+    else
+      return SDValue();
+  } else {
+    if (VT == MVT::f32)
+      return MakeIntrinsicCall(Ftz ? Intrinsic::nvvm_sqrt_approx_ftz_f
+                                   : Intrinsic::nvvm_sqrt_approx_f);
+    else {
+      // There's no sqrt.approx.f64 instruction, so emit reciprocal(rsqrt(x)).
+      // This is faster than select(x == 0, 0, x * rsqrt(x)).  (In fact, it's
+      // faster than plain x * rsqrt(x).)
+      // NOTE(review): taken for every non-f32 VT, not only f64 -- confirm.
+      return DAG.getNode(
+          ISD::INTRINSIC_WO_CHAIN, DL, VT,
+          DAG.getConstant(Intrinsic::nvvm_rcp_approx_ftz_d, DL, MVT::i32),
+          MakeIntrinsicCall(Intrinsic::nvvm_rsqrt_approx_d));
+    }
+  }
+}
+
 SDValue
 NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
   SDLoc dl(Op);
@@ -1227,21 +1419,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   SDValue Callee = CLI.Callee;
   bool &isTailCall = CLI.IsTailCall;
   ArgListTy &Args = CLI.getArgs();
-  Type *retTy = CLI.RetTy;
+  Type *RetTy = CLI.RetTy;
   ImmutableCallSite *CS = CLI.CS;
+  const DataLayout &DL = DAG.getDataLayout();
 
   bool isABI = (STI.getSmVersion() >= 20);
   assert(isABI && "Non-ABI compilation is not supported");
   if (!isABI)
     return Chain;
-  MachineFunction &MF = DAG.getMachineFunction();
-  const Function *F = MF.getFunction();
-  auto &DL = MF.getDataLayout();
 
   SDValue tempChain = Chain;
-  Chain = DAG.getCALLSEQ_START(Chain,
-                               DAG.getIntPtrConstant(uniqueCallSite, dl, true),
-                               dl);
+  Chain = DAG.getCALLSEQ_START(
+      Chain, DAG.getIntPtrConstant(uniqueCallSite, dl, true), dl);
   SDValue InFlag = Chain.getValue(1);
 
   unsigned paramCount = 0;
@@ -1262,244 +1451,124 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     Type *Ty = Args[i].Ty;
 
     if (!Outs[OIdx].Flags.isByVal()) {
-      if (Ty->isAggregateType()) {
-        // aggregate
-        SmallVector<EVT, 16> vtparts;
-        SmallVector<uint64_t, 16> Offsets;
-        ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &Offsets,
-                           0);
-
-        unsigned align =
-            getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
+      SmallVector<EVT, 16> VTs;
+      SmallVector<uint64_t, 16> Offsets;
+      ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
+      unsigned ArgAlign =
+          getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
+      unsigned AllocSize = DL.getTypeAllocSize(Ty);
+      SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
+      bool NeedAlign; // Does argument declaration specify alignment?
+      if (Ty->isAggregateType() || Ty->isVectorTy()) {
         // declare .param .align <align> .b8 .param<n>[<size>];
-        unsigned sz = DL.getTypeAllocSize(Ty);
-        SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-        SDValue DeclareParamOps[] = { Chain, DAG.getConstant(align, dl,
-                                                             MVT::i32),
-                                      DAG.getConstant(paramCount, dl, MVT::i32),
-                                      DAG.getConstant(sz, dl, MVT::i32),
-                                      InFlag };
+        SDValue DeclareParamOps[] = {
+            Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
+            DAG.getConstant(paramCount, dl, MVT::i32),
+            DAG.getConstant(AllocSize, dl, MVT::i32), InFlag};
         Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
                             DeclareParamOps);
-        InFlag = Chain.getValue(1);
-        for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
-          EVT elemtype = vtparts[j];
-          unsigned ArgAlign = GreatestCommonDivisor64(align, Offsets[j]);
-          if (elemtype.isInteger() && (sz < 8))
-            sz = 8;
-          SDValue StVal = OutVals[OIdx];
-          if (elemtype.getSizeInBits() < 16) {
-            StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
-          }
-          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-          SDValue CopyParamOps[] = { Chain,
-                                     DAG.getConstant(paramCount, dl, MVT::i32),
-                                     DAG.getConstant(Offsets[j], dl, MVT::i32),
-                                     StVal, InFlag };
-          Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
-                                          CopyParamVTs, CopyParamOps,
-                                          elemtype, MachinePointerInfo(),
-                                          ArgAlign);
-          InFlag = Chain.getValue(1);
-          ++OIdx;
+        NeedAlign = true;
+      } else {
+        // declare .param .b<size> .param<n>;
+        if ((VT.isInteger() || VT.isFloatingPoint()) && AllocSize < 4) {
+          // PTX ABI requires integral types to be at least 32 bits in
+          // size. FP16 is loaded/stored using i16, so it's handled
+          // here as well.
+          AllocSize = 4;
         }
-        if (vtparts.size() > 0)
-          --OIdx;
-        ++paramCount;
-        continue;
+        SDValue DeclareScalarParamOps[] = {
+            Chain, DAG.getConstant(paramCount, dl, MVT::i32),
+            DAG.getConstant(AllocSize * 8, dl, MVT::i32),
+            DAG.getConstant(0, dl, MVT::i32), InFlag};
+        Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
+                            DeclareScalarParamOps);
+        NeedAlign = false;
       }
-      if (Ty->isVectorTy()) {
-        EVT ObjectVT = getValueType(DL, Ty);
-        unsigned align =
-            getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
-        // declare .param .align <align> .b8 .param<n>[<size>];
-        unsigned sz = DL.getTypeAllocSize(Ty);
-        SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-        SDValue DeclareParamOps[] = { Chain,
-                                      DAG.getConstant(align, dl, MVT::i32),
-                                      DAG.getConstant(paramCount, dl, MVT::i32),
-                                      DAG.getConstant(sz, dl, MVT::i32),
-                                      InFlag };
-        Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
-                            DeclareParamOps);
-        InFlag = Chain.getValue(1);
-        unsigned NumElts = ObjectVT.getVectorNumElements();
-        EVT EltVT = ObjectVT.getVectorElementType();
-        EVT MemVT = EltVT;
-        bool NeedExtend = false;
-        if (EltVT.getSizeInBits() < 16) {
-          NeedExtend = true;
-          EltVT = MVT::i16;
+      InFlag = Chain.getValue(1);
+
+      // PTX Interoperability Guide 3.3(A): [Integer] Values shorter
+      // than 32-bits are sign extended or zero extended, depending on
+      // whether they are signed or unsigned types. This case applies
+      // only to scalar parameters and not to aggregate values.
+      bool ExtendIntegerParam =
+          Ty->isIntegerTy() && DL.getTypeAllocSizeInBits(Ty) < 32;
+
+      auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, ArgAlign);
+      SmallVector<SDValue, 6> StoreOperands;
+      for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
+        // New store.
+        if (VectorInfo[j] & PVF_FIRST) {
+          assert(StoreOperands.empty() && "Unfinished preceeding store.");
+          StoreOperands.push_back(Chain);
+          StoreOperands.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
+          StoreOperands.push_back(DAG.getConstant(Offsets[j], dl, MVT::i32));
         }
 
-        // V1 store
-        if (NumElts == 1) {
-          SDValue Elt = OutVals[OIdx++];
-          if (NeedExtend)
-            Elt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt);
-
-          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-          SDValue CopyParamOps[] = { Chain,
-                                     DAG.getConstant(paramCount, dl, MVT::i32),
-                                     DAG.getConstant(0, dl, MVT::i32), Elt,
-                                     InFlag };
-          Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl,
-                                          CopyParamVTs, CopyParamOps,
-                                          MemVT, MachinePointerInfo());
-          InFlag = Chain.getValue(1);
-        } else if (NumElts == 2) {
-          SDValue Elt0 = OutVals[OIdx++];
-          SDValue Elt1 = OutVals[OIdx++];
-          if (NeedExtend) {
-            Elt0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt0);
-            Elt1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Elt1);
+        EVT EltVT = VTs[j];
+        SDValue StVal = OutVals[OIdx];
+        if (ExtendIntegerParam) {
+          assert(VTs.size() == 1 && "Scalar can't have multiple parts.");
+          // zext/sext to i32
+          StVal = DAG.getNode(Outs[OIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
+                                                        : ISD::ZERO_EXTEND,
+                              dl, MVT::i32, StVal);
+        } else if (EltVT.getSizeInBits() < 16) {
+          // Use 16-bit registers for small stores as it's the
+          // smallest general purpose register size supported by NVPTX.
+          StVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, StVal);
+        }
+
+        // Record the value to store.
+        StoreOperands.push_back(StVal);
+
+        if (VectorInfo[j] & PVF_LAST) {
+          unsigned NumElts = StoreOperands.size() - 3;
+          NVPTXISD::NodeType Op;
+          switch (NumElts) {
+          case 1:
+            Op = NVPTXISD::StoreParam;
+            break;
+          case 2:
+            Op = NVPTXISD::StoreParamV2;
+            break;
+          case 4:
+            Op = NVPTXISD::StoreParamV4;
+            break;
+          default:
+            llvm_unreachable("Invalid vector info.");
           }
 
-          SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-          SDValue CopyParamOps[] = { Chain,
-                                     DAG.getConstant(paramCount, dl, MVT::i32),
-                                     DAG.getConstant(0, dl, MVT::i32), Elt0,
-                                     Elt1, InFlag };
-          Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParamV2, dl,
-                                          CopyParamVTs, CopyParamOps,
-                                          MemVT, MachinePointerInfo());
-          InFlag = Chain.getValue(1);
-        } else {
-          unsigned curOffset = 0;
-          // V4 stores
-          // We have at least 4 elements (<3 x Ty> expands to 4 elements) and
-          // the
-          // vector will be expanded to a power of 2 elements, so we know we can
-          // always round up to the next multiple of 4 when creating the vector
-          // stores.
-          // e.g.  4 elem => 1 st.v4
-          //       6 elem => 2 st.v4
-          //       8 elem => 2 st.v4
-          //      11 elem => 3 st.v4
-          unsigned VecSize = 4;
-          if (EltVT.getSizeInBits() == 64)
-            VecSize = 2;
-
-          // This is potentially only part of a vector, so assume all elements
-          // are packed together.
-          unsigned PerStoreOffset = MemVT.getStoreSizeInBits() / 8 * VecSize;
-
-          for (unsigned i = 0; i < NumElts; i += VecSize) {
-            // Get values
-            SDValue StoreVal;
-            SmallVector<SDValue, 8> Ops;
-            Ops.push_back(Chain);
-            Ops.push_back(DAG.getConstant(paramCount, dl, MVT::i32));
-            Ops.push_back(DAG.getConstant(curOffset, dl, MVT::i32));
-
-            unsigned Opc = NVPTXISD::StoreParamV2;
-
-            StoreVal = OutVals[OIdx++];
-            if (NeedExtend)
-              StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
-            Ops.push_back(StoreVal);
-
-            if (i + 1 < NumElts) {
-              StoreVal = OutVals[OIdx++];
-              if (NeedExtend)
-                StoreVal =
-                    DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
-            } else {
-              StoreVal = DAG.getUNDEF(EltVT);
-            }
-            Ops.push_back(StoreVal);
-
-            if (VecSize == 4) {
-              Opc = NVPTXISD::StoreParamV4;
-              if (i + 2 < NumElts) {
-                StoreVal = OutVals[OIdx++];
-                if (NeedExtend)
-                  StoreVal =
-                      DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
-              } else {
-                StoreVal = DAG.getUNDEF(EltVT);
-              }
-              Ops.push_back(StoreVal);
-
-              if (i + 3 < NumElts) {
-                StoreVal = OutVals[OIdx++];
-                if (NeedExtend)
-                  StoreVal =
-                      DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
-              } else {
-                StoreVal = DAG.getUNDEF(EltVT);
-              }
-              Ops.push_back(StoreVal);
-            }
+          StoreOperands.push_back(InFlag);
 
-            Ops.push_back(InFlag);
+          // Adjust type of the store op if we've extended the scalar
+          // integer parameter to i32.
+          EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j];
+          unsigned EltAlign =
+              NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0;
 
-            SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-            Chain = DAG.getMemIntrinsicNode(Opc, dl, CopyParamVTs, Ops,
-                                            MemVT, MachinePointerInfo());
-            InFlag = Chain.getValue(1);
-            curOffset += PerStoreOffset;
-          }
+          Chain = DAG.getMemIntrinsicNode(
+              Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
+              TheStoreType, MachinePointerInfo(), EltAlign);
+          InFlag = Chain.getValue(1);
+
+          // Cleanup.
+          StoreOperands.clear();
         }
-        ++paramCount;
-        --OIdx;
-        continue;
+        ++OIdx;
       }
-      // Plain scalar
-      // for ABI,    declare .param .b<size> .param<n>;
-      unsigned sz = VT.getSizeInBits();
-      bool needExtend = false;
-      if (VT.isInteger()) {
-        if (sz < 16)
-          needExtend = true;
-        if (sz < 32)
-          sz = 32;
-      } else if (VT.isFloatingPoint() && sz < 32)
-        // PTX ABI requires all scalar parameters to be at least 32
-        // bits in size.  fp16 normally uses .b16 as its storage type
-        // in PTX, so its size must be adjusted here, too.
-        sz = 32;
-      SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-      SDValue DeclareParamOps[] = { Chain,
-                                    DAG.getConstant(paramCount, dl, MVT::i32),
-                                    DAG.getConstant(sz, dl, MVT::i32),
-                                    DAG.getConstant(0, dl, MVT::i32), InFlag };
-      Chain = DAG.getNode(NVPTXISD::DeclareScalarParam, dl, DeclareParamVTs,
-                          DeclareParamOps);
-      InFlag = Chain.getValue(1);
-      SDValue OutV = OutVals[OIdx];
-      if (needExtend) {
-        // zext/sext i1 to i16
-        unsigned opc = ISD::ZERO_EXTEND;
-        if (Outs[OIdx].Flags.isSExt())
-          opc = ISD::SIGN_EXTEND;
-        OutV = DAG.getNode(opc, dl, MVT::i16, OutV);
-      }
-      SDVTList CopyParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-      SDValue CopyParamOps[] = { Chain,
-                                 DAG.getConstant(paramCount, dl, MVT::i32),
-                                 DAG.getConstant(0, dl, MVT::i32), OutV,
-                                 InFlag };
-
-      unsigned opcode = NVPTXISD::StoreParam;
-      if (Outs[OIdx].Flags.isZExt() && VT.getSizeInBits() < 32)
-        opcode = NVPTXISD::StoreParamU32;
-      else if (Outs[OIdx].Flags.isSExt() && VT.getSizeInBits() < 32)
-        opcode = NVPTXISD::StoreParamS32;
-      Chain = DAG.getMemIntrinsicNode(opcode, dl, CopyParamVTs, CopyParamOps,
-                                      VT, MachinePointerInfo());
-
-      InFlag = Chain.getValue(1);
+      assert(StoreOperands.empty() && "Unfinished parameter store.");
+      if (VTs.size() > 0)
+        --OIdx;
       ++paramCount;
       continue;
     }
-    // struct or vector
-    SmallVector<EVT, 16> vtparts;
+
+    // ByVal arguments
+    SmallVector<EVT, 16> VTs;
     SmallVector<uint64_t, 16> Offsets;
     auto *PTy = dyn_cast<PointerType>(Args[i].Ty);
     assert(PTy && "Type of a byval parameter should be pointer");
-    ComputePTXValueVTs(*this, DAG.getDataLayout(), PTy->getElementType(),
-                       vtparts, &Offsets, 0);
+    ComputePTXValueVTs(*this, DL, PTy->getElementType(), VTs, &Offsets, 0);
 
     // declare .param .align <align> .b8 .param<n>[<size>];
     unsigned sz = Outs[OIdx].Flags.getByValSize();
@@ -1520,11 +1589,11 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
                         DeclareParamOps);
     InFlag = Chain.getValue(1);
-    for (unsigned j = 0, je = vtparts.size(); j != je; ++j) {
-      EVT elemtype = vtparts[j];
+    for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
+      EVT elemtype = VTs[j];
       int curOffset = Offsets[j];
       unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
-      auto PtrVT = getPointerTy(DAG.getDataLayout());
+      auto PtrVT = getPointerTy(DL);
       SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
                                     DAG.getConstant(curOffset, dl, PtrVT));
       SDValue theVal = DAG.getLoad(elemtype, dl, tempChain, srcAddr,
@@ -1552,18 +1621,18 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   // Handle Result
   if (Ins.size() > 0) {
     SmallVector<EVT, 16> resvtparts;
-    ComputeValueVTs(*this, DL, retTy, resvtparts);
+    ComputeValueVTs(*this, DL, RetTy, resvtparts);
 
     // Declare
     //  .param .align 16 .b8 retval0[<size-in-bytes>], or
     //  .param .b<size-in-bits> retval0
-    unsigned resultsz = DL.getTypeAllocSizeInBits(retTy);
+    unsigned resultsz = DL.getTypeAllocSizeInBits(RetTy);
     // Emit ".param .b<size-in-bits> retval0" instead of byte arrays only for
     // these three types to match the logic in
     // NVPTXAsmPrinter::printReturnValStr and NVPTXTargetLowering::getPrototype.
     // Plus, this behavior is consistent with nvcc's.
-    if (retTy->isFloatingPointTy() || retTy->isIntegerTy() ||
-        retTy->isPointerTy()) {
+    if (RetTy->isFloatingPointTy() || RetTy->isIntegerTy() ||
+        RetTy->isPointerTy()) {
       // Scalar needs to be at least 32bit wide
       if (resultsz < 32)
         resultsz = 32;
@@ -1575,7 +1644,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                           DeclareRetOps);
       InFlag = Chain.getValue(1);
     } else {
-      retAlignment = getArgumentAlignment(Callee, CS, retTy, 0, DL);
+      retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
       SDValue DeclareRetOps[] = { Chain,
                                   DAG.getConstant(retAlignment, dl, MVT::i32),
@@ -1596,8 +1665,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     // The prototype is embedded in a string and put as the operand for a
     // CallPrototype SDNode which will print out to the value of the string.
     SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
-    std::string Proto =
-        getPrototype(DAG.getDataLayout(), retTy, Args, Outs, retAlignment, CS);
+    std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS);
     const char *ProtoStr =
       nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
     SDValue ProtoOps[] = {
@@ -1662,175 +1730,84 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // Generate loads from param memory/moves from registers for result
   if (Ins.size() > 0) {
-    if (retTy && retTy->isVectorTy()) {
-      EVT ObjectVT = getValueType(DL, retTy);
-      unsigned NumElts = ObjectVT.getVectorNumElements();
-      EVT EltVT = ObjectVT.getVectorElementType();
-      assert(STI.getTargetLowering()->getNumRegisters(F->getContext(),
-                                                      ObjectVT) == NumElts &&
-             "Vector was not scalarized");
-      unsigned sz = EltVT.getSizeInBits();
-      bool needTruncate = sz < 8;
-
-      if (NumElts == 1) {
-        // Just a simple load
-        SmallVector<EVT, 4> LoadRetVTs;
-        if (EltVT == MVT::i1 || EltVT == MVT::i8) {
-          // If loading i1/i8 result, generate
-          //   load.b8 i16
-          //   if i1
-          //   trunc i16 to i1
-          LoadRetVTs.push_back(MVT::i16);
-        } else
-          LoadRetVTs.push_back(EltVT);
-        LoadRetVTs.push_back(MVT::Other);
-        LoadRetVTs.push_back(MVT::Glue);
-        SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
-                                DAG.getConstant(0, dl, MVT::i32), InFlag};
-        SDValue retval = DAG.getMemIntrinsicNode(
-            NVPTXISD::LoadParam, dl,
-            DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
-        Chain = retval.getValue(1);
-        InFlag = retval.getValue(2);
-        SDValue Ret0 = retval;
-        if (needTruncate)
-          Ret0 = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Ret0);
-        InVals.push_back(Ret0);
-      } else if (NumElts == 2) {
-        // LoadV2
-        SmallVector<EVT, 4> LoadRetVTs;
-        if (EltVT == MVT::i1 || EltVT == MVT::i8) {
-          // If loading i1/i8 result, generate
-          //   load.b8 i16
-          //   if i1
-          //   trunc i16 to i1
-          LoadRetVTs.push_back(MVT::i16);
-          LoadRetVTs.push_back(MVT::i16);
-        } else {
-          LoadRetVTs.push_back(EltVT);
-          LoadRetVTs.push_back(EltVT);
-        }
-        LoadRetVTs.push_back(MVT::Other);
-        LoadRetVTs.push_back(MVT::Glue);
-        SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
-                                DAG.getConstant(0, dl, MVT::i32), InFlag};
-        SDValue retval = DAG.getMemIntrinsicNode(
-            NVPTXISD::LoadParamV2, dl,
-            DAG.getVTList(LoadRetVTs), LoadRetOps, EltVT, MachinePointerInfo());
-        Chain = retval.getValue(2);
-        InFlag = retval.getValue(3);
-        SDValue Ret0 = retval.getValue(0);
-        SDValue Ret1 = retval.getValue(1);
-        if (needTruncate) {
-          Ret0 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret0);
-          InVals.push_back(Ret0);
-          Ret1 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Ret1);
-          InVals.push_back(Ret1);
-        } else {
-          InVals.push_back(Ret0);
-          InVals.push_back(Ret1);
-        }
-      } else {
-        // Split into N LoadV4
-        unsigned Ofst = 0;
-        unsigned VecSize = 4;
-        unsigned Opc = NVPTXISD::LoadParamV4;
-        if (EltVT.getSizeInBits() == 64) {
-          VecSize = 2;
-          Opc = NVPTXISD::LoadParamV2;
-        }
-        EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
-        for (unsigned i = 0; i < NumElts; i += VecSize) {
-          SmallVector<EVT, 8> LoadRetVTs;
-          if (EltVT == MVT::i1 || EltVT == MVT::i8) {
-            // If loading i1/i8 result, generate
-            //   load.b8 i16
-            //   if i1
-            //   trunc i16 to i1
-            for (unsigned j = 0; j < VecSize; ++j)
-              LoadRetVTs.push_back(MVT::i16);
-          } else {
-            for (unsigned j = 0; j < VecSize; ++j)
-              LoadRetVTs.push_back(EltVT);
-          }
-          LoadRetVTs.push_back(MVT::Other);
-          LoadRetVTs.push_back(MVT::Glue);
-          SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
-                                  DAG.getConstant(Ofst, dl, MVT::i32), InFlag};
-          SDValue retval = DAG.getMemIntrinsicNode(
-              Opc, dl, DAG.getVTList(LoadRetVTs),
-              LoadRetOps, EltVT, MachinePointerInfo());
-          if (VecSize == 2) {
-            Chain = retval.getValue(2);
-            InFlag = retval.getValue(3);
-          } else {
-            Chain = retval.getValue(4);
-            InFlag = retval.getValue(5);
-          }
+    SmallVector<EVT, 16> VTs;
+    SmallVector<uint64_t, 16> Offsets;
+    ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
+    assert(VTs.size() == Ins.size() && "Bad value decomposition");
+
+    unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
+    auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
+
+    SmallVector<EVT, 6> LoadVTs;
+    int VecIdx = -1; // Index of the first element of the vector.
+
+    // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
+    // 32-bits are sign extended or zero extended, depending on whether
+    // they are signed or unsigned types.
+    bool ExtendIntegerRetVal =
+        RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
+
+    for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+      bool needTruncate = false;
+      EVT TheLoadType = VTs[i];
+      EVT EltType = Ins[i].VT;
+      unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]);
+      if (ExtendIntegerRetVal) {
+        TheLoadType = MVT::i32;
+        EltType = MVT::i32;
+        needTruncate = true;
+      } else if (TheLoadType.getSizeInBits() < 16) {
+        if (VTs[i].isInteger())
+          needTruncate = true;
+        EltType = MVT::i16;
+      }
 
-          for (unsigned j = 0; j < VecSize; ++j) {
-            if (i + j >= NumElts)
-              break;
-            SDValue Elt = retval.getValue(j);
-            if (needTruncate)
-              Elt = DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
-            InVals.push_back(Elt);
-          }
-          Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
-        }
+      // Record index of the very first element of the vector.
+      if (VectorInfo[i] & PVF_FIRST) {
+        assert(VecIdx == -1 && LoadVTs.empty() && "Orphaned operand list.");
+        VecIdx = i;
       }
-    } else {
-      SmallVector<EVT, 16> VTs;
-      SmallVector<uint64_t, 16> Offsets;
-      auto &DL = DAG.getDataLayout();
-      ComputePTXValueVTs(*this, DL, retTy, VTs, &Offsets, 0);
-      assert(VTs.size() == Ins.size() && "Bad value decomposition");
-      unsigned RetAlign = getArgumentAlignment(Callee, CS, retTy, 0, DL);
-      for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
-        unsigned sz = VTs[i].getSizeInBits();
-        unsigned AlignI = GreatestCommonDivisor64(RetAlign, Offsets[i]);
-        bool needTruncate = false;
-        if (VTs[i].isInteger() && sz < 8) {
-          sz = 8;
-          needTruncate = true;
+
+      LoadVTs.push_back(EltType);
+
+      if (VectorInfo[i] & PVF_LAST) {
+        unsigned NumElts = LoadVTs.size();
+        LoadVTs.push_back(MVT::Other);
+        LoadVTs.push_back(MVT::Glue);
+        NVPTXISD::NodeType Op;
+        switch (NumElts) {
+        case 1:
+          Op = NVPTXISD::LoadParam;
+          break;
+        case 2:
+          Op = NVPTXISD::LoadParamV2;
+          break;
+        case 4:
+          Op = NVPTXISD::LoadParamV4;
+          break;
+        default:
+          llvm_unreachable("Invalid vector info.");
         }
 
-        SmallVector<EVT, 4> LoadRetVTs;
-        EVT TheLoadType = VTs[i];
-        if (retTy->isIntegerTy() && DL.getTypeAllocSizeInBits(retTy) < 32) {
-          // This is for integer types only, and specifically not for
-          // aggregates.
-          LoadRetVTs.push_back(MVT::i32);
-          TheLoadType = MVT::i32;
-          needTruncate = true;
-        } else if (sz < 16) {
-          // If loading i1/i8 result, generate
-          //   load i8 (-> i16)
-          //   trunc i16 to i1/i8
-
-          // FIXME: Do we need to set needTruncate to true here, too?  We could
-          // not figure out what this branch is for in D17872, so we left it
-          // alone.  The comment above about loading i1/i8 may be wrong, as the
-          // branch above seems to cover integers of size < 32.
-          LoadRetVTs.push_back(MVT::i16);
-        } else
-          LoadRetVTs.push_back(Ins[i].VT);
-        LoadRetVTs.push_back(MVT::Other);
-        LoadRetVTs.push_back(MVT::Glue);
-
-        SDValue LoadRetOps[] = {Chain, DAG.getConstant(1, dl, MVT::i32),
-                                DAG.getConstant(Offsets[i], dl, MVT::i32),
-                                InFlag};
-        SDValue retval = DAG.getMemIntrinsicNode(
-            NVPTXISD::LoadParam, dl,
-            DAG.getVTList(LoadRetVTs), LoadRetOps,
-            TheLoadType, MachinePointerInfo(), AlignI);
-        Chain = retval.getValue(1);
-        InFlag = retval.getValue(2);
-        SDValue Ret0 = retval.getValue(0);
-        if (needTruncate)
-          Ret0 = DAG.getNode(ISD::TRUNCATE, dl, Ins[i].VT, Ret0);
-        InVals.push_back(Ret0);
+        SDValue LoadOperands[] = {
+            Chain, DAG.getConstant(1, dl, MVT::i32),
+            DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag};
+        SDValue RetVal = DAG.getMemIntrinsicNode(
+            Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType,
+            MachinePointerInfo(), EltAlign);
+
+        for (unsigned j = 0; j < NumElts; ++j) {
+          SDValue Ret = RetVal.getValue(j);
+          if (needTruncate)
+            Ret = DAG.getNode(ISD::TRUNCATE, dl, Ins[VecIdx + j].VT, Ret);
+          InVals.push_back(Ret);
+        }
+        Chain = RetVal.getValue(NumElts);
+        InFlag = RetVal.getValue(NumElts + 1);
+
+        // Cleanup
+        VecIdx = -1;
+        LoadVTs.clear();
       }
     }
   }
@@ -1870,6 +1847,55 @@ NVPTXTargetLowering::LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const {
   return DAG.getBuildVector(Node->getValueType(0), dl, Ops);
 }
 
+// We can init constant f16x2 with a single .b32 move.  Normally it
+// would get lowered as two constant loads and vector-packing move.
+//        mov.b16         %h1, 0x4000;
+//        mov.b16         %h2, 0x3C00;
+//        mov.b32         %hh2, {%h2, %h1};
+// Instead we want just a constant move:
+//        mov.b32         %hh2, 0x40003C00
+//
+// This results in better SASS code with CUDA 7.x. Ptxas in CUDA 8.0
+// generates good SASS in both cases.
+SDValue NVPTXTargetLowering::LowerBUILD_VECTOR(SDValue Op,
+                                               SelectionDAG &DAG) const {
+  // Only a v2f16 vector built from two FP constants is folded below.
+  if (!(Op->getValueType(0) == MVT::v2f16 &&
+        isa<ConstantFPSDNode>(Op->getOperand(0)) &&
+        isa<ConstantFPSDNode>(Op->getOperand(1))))
+    return Op;
+
+  APInt E0 =
+      cast<ConstantFPSDNode>(Op->getOperand(0))->getValueAPF().bitcastToAPInt();
+  APInt E1 =
+      cast<ConstantFPSDNode>(Op->getOperand(1))->getValueAPF().bitcastToAPInt();
+  SDValue Const =
+      DAG.getConstant(E1.zext(32).shl(16) | E0.zext(32), SDLoc(Op), MVT::i32);
+  return DAG.getNode(ISD::BITCAST, SDLoc(Op), MVT::v2f16, Const);
+}
+
+SDValue NVPTXTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
+                                                     SelectionDAG &DAG) const {
+  SDValue Index = Op->getOperand(1);
+  // Constant index will be matched by tablegen.
+  if (isa<ConstantSDNode>(Index.getNode()))
+    return Op;
+
+  // Extract individual elements and select one of them.
+  SDValue Vector = Op->getOperand(0);
+  EVT VectorVT = Vector.getValueType();
+  assert(VectorVT == MVT::v2f16 && "Unexpected vector type.");
+  EVT EltVT = VectorVT.getVectorElementType();
+
+  SDLoc dl(Op.getNode());
+  SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
+                           DAG.getIntPtrConstant(0, dl));
+  SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Vector,
+                           DAG.getIntPtrConstant(1, dl));
+  return DAG.getSelectCC(dl, Index, DAG.getIntPtrConstant(0, dl), E0, E1,
+                         ISD::CondCode::SETEQ);
+}
+
 /// LowerShiftRightParts - Lower SRL_PARTS, SRA_PARTS, which
 /// 1) returns two i32 values and take a 2 x i32 value to shift plus a shift
 ///    amount, or
@@ -2003,8 +2029,11 @@ NVPTXTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::INTRINSIC_W_CHAIN:
     return Op;
   case ISD::BUILD_VECTOR:
+    return LowerBUILD_VECTOR(Op, DAG);
   case ISD::EXTRACT_SUBVECTOR:
     return Op;
+  case ISD::EXTRACT_VECTOR_ELT:
+    return LowerEXTRACT_VECTOR_ELT(Op, DAG);
   case ISD::CONCAT_VECTORS:
     return LowerCONCAT_VECTORS(Op, DAG);
   case ISD::STORE:
@@ -2042,8 +2071,21 @@ SDValue NVPTXTargetLowering::LowerSelect(SDValue Op, SelectionDAG &DAG) const {
 SDValue NVPTXTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   if (Op.getValueType() == MVT::i1)
     return LowerLOADi1(Op, DAG);
-  else
-    return SDValue();
+
+  // v2f16 is legal, so we can't rely on legalizer to handle unaligned
+  // loads and have to handle it here.
+  if (Op.getValueType() == MVT::v2f16) {
+    LoadSDNode *Load = cast<LoadSDNode>(Op);
+    EVT MemVT = Load->getMemoryVT();
+    if (!allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
+                            Load->getAddressSpace(), Load->getAlignment())) {
+      SDValue Ops[2];
+      std::tie(Ops[0], Ops[1]) = expandUnalignedLoad(Load, DAG);
+      return DAG.getMergeValues(Ops, SDLoc(Op));
+    }
+  }
+
+  return SDValue();
 }
 
 // v = ld i1* addr
@@ -2069,16 +2111,23 @@ SDValue NVPTXTargetLowering::LowerLOADi1(SDValue Op, SelectionDAG &DAG) const {
 }
 
 SDValue NVPTXTargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
-  EVT ValVT = Op.getOperand(1).getValueType();
-  switch (ValVT.getSimpleVT().SimpleTy) {
-  case MVT::i1:
+  StoreSDNode *Store = cast<StoreSDNode>(Op);
+  EVT VT = Store->getMemoryVT();
+
+  if (VT == MVT::i1)
     return LowerSTOREi1(Op, DAG);
-  default:
-    if (ValVT.isVector())
-      return LowerSTOREVector(Op, DAG);
-    else
-      return SDValue();
-  }
+
+  // v2f16 is legal, so we can't rely on legalizer to handle unaligned
+  // stores and have to handle it here.
+  if (VT == MVT::v2f16 &&
+      !allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+                          Store->getAddressSpace(), Store->getAlignment()))
+    return expandUnalignedStore(Store, DAG);
+
+  if (VT.isVector())
+    return LowerSTOREVector(Op, DAG);
+
+  return SDValue();
 }
 
 SDValue
@@ -2101,12 +2150,15 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
     case MVT::v2i16:
     case MVT::v2i32:
     case MVT::v2i64:
+    case MVT::v2f16:
     case MVT::v2f32:
     case MVT::v2f64:
     case MVT::v4i8:
     case MVT::v4i16:
     case MVT::v4i32:
+    case MVT::v4f16:
     case MVT::v4f32:
+    case MVT::v8f16: // <4 x f16x2>
       // This is a "native" vector type
       break;
     }
@@ -2137,6 +2189,7 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
     if (EltVT.getSizeInBits() < 16)
       NeedExt = true;
 
+    bool StoreF16x2 = false;
     switch (NumElts) {
     default:
       return SDValue();
@@ -2146,6 +2199,14 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
     case 4:
       Opcode = NVPTXISD::StoreV4;
       break;
+    case 8:
+      // v8f16 is a special case. PTX doesn't have st.v8.f16
+      // instruction. Instead, we split the vector into v2f16 chunks and
+      // store them with st.v4.b32.
+      assert(EltVT == MVT::f16 && "Wrong type for the vector.");
+      Opcode = NVPTXISD::StoreV4;
+      StoreF16x2 = true;
+      break;
     }
 
     SmallVector<SDValue, 8> Ops;
@@ -2153,23 +2214,36 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
     // First is the chain
     Ops.push_back(N->getOperand(0));
 
-    // Then the split values
-    for (unsigned i = 0; i < NumElts; ++i) {
-      SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
-                                   DAG.getIntPtrConstant(i, DL));
-      if (NeedExt)
-        ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
-      Ops.push_back(ExtVal);
+    if (StoreF16x2) {
+      // Combine f16,f16 -> v2f16
+      NumElts /= 2;
+      for (unsigned i = 0; i < NumElts; ++i) {
+        SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
+                                 DAG.getIntPtrConstant(i * 2, DL));
+        SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f16, Val,
+                                 DAG.getIntPtrConstant(i * 2 + 1, DL));
+        SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v2f16, E0, E1);
+        Ops.push_back(V2);
+      }
+    } else {
+      // Then the split values
+      for (unsigned i = 0; i < NumElts; ++i) {
+        SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
+                                     DAG.getIntPtrConstant(i, DL));
+        if (NeedExt)
+          ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
+        Ops.push_back(ExtVal);
+      }
     }
 
     // Then any remaining arguments
     Ops.append(N->op_begin() + 2, N->op_end());
 
-    SDValue NewSt = DAG.getMemIntrinsicNode(
-        Opcode, DL, DAG.getVTList(MVT::Other), Ops,
-        MemSD->getMemoryVT(), MemSD->getMemOperand());
+    SDValue NewSt =
+        DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
+                                MemSD->getMemoryVT(), MemSD->getMemOperand());
 
-    //return DCI.CombineTo(N, NewSt, true);
+    // return DCI.CombineTo(N, NewSt, true);
     return NewSt;
   }
 
@@ -2241,7 +2315,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
   auto PtrVT = getPointerTy(DAG.getDataLayout());
 
   const Function *F = MF.getFunction();
-  const AttributeSet &PAL = F->getAttributes();
+  const AttributeList &PAL = F->getAttributes();
   const TargetLowering *TLI = STI.getTargetLowering();
 
   SDValue Root = DAG.getRoot();
@@ -2322,176 +2396,79 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
     // appear in the same order as their order of appearance
     // in the original function. "idx+1" holds that order.
     if (!PAL.hasAttribute(i + 1, Attribute::ByVal)) {
-      if (Ty->isAggregateType()) {
-        SmallVector<EVT, 16> vtparts;
-        SmallVector<uint64_t, 16> offsets;
+      bool aggregateIsPacked = false;
+      if (StructType *STy = dyn_cast<StructType>(Ty))
+        aggregateIsPacked = STy->isPacked();
 
-        // NOTE: Here, we lose the ability to issue vector loads for vectors
-        // that are a part of a struct.  This should be investigated in the
-        // future.
-        ComputePTXValueVTs(*this, DAG.getDataLayout(), Ty, vtparts, &offsets,
-                           0);
-        assert(vtparts.size() > 0 && "empty aggregate type not expected");
-        bool aggregateIsPacked = false;
-        if (StructType *STy = dyn_cast<StructType>(Ty))
-          aggregateIsPacked = STy->isPacked();
+      SmallVector<EVT, 16> VTs;
+      SmallVector<uint64_t, 16> Offsets;
+      ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
+      assert(VTs.size() > 0 && "Unexpected empty type.");
+      auto VectorInfo =
+          VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty));
 
-        SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
-        for (unsigned parti = 0, parte = vtparts.size(); parti != parte;
-             ++parti) {
-          EVT partVT = vtparts[parti];
-          Value *srcValue = Constant::getNullValue(
-              PointerType::get(partVT.getTypeForEVT(F->getContext()),
-                               ADDRESS_SPACE_PARAM));
-          SDValue srcAddr =
-              DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
-                          DAG.getConstant(offsets[parti], dl, PtrVT));
-          unsigned partAlign = aggregateIsPacked
-                                   ? 1
-                                   : DL.getABITypeAlignment(
-                                         partVT.getTypeForEVT(F->getContext()));
-          SDValue p;
-          if (Ins[InsIdx].VT.getSizeInBits() > partVT.getSizeInBits()) {
-            ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? 
-                                     ISD::SEXTLOAD : ISD::ZEXTLOAD;
-            p = DAG.getExtLoad(ExtOp, dl, Ins[InsIdx].VT, Root, srcAddr,
-                               MachinePointerInfo(srcValue), partVT, partAlign);
-          } else {
-            p = DAG.getLoad(partVT, dl, Root, srcAddr,
-                            MachinePointerInfo(srcValue), partAlign);
-          }
-          if (p.getNode())
-            p.getNode()->setIROrder(idx + 1);
-          InVals.push_back(p);
-          ++InsIdx;
+      SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
+      int VecIdx = -1; // Index of the first element of the current vector.
+      for (unsigned parti = 0, parte = VTs.size(); parti != parte; ++parti) {
+        if (VectorInfo[parti] & PVF_FIRST) {
+          assert(VecIdx == -1 && "Orphaned vector.");
+          VecIdx = parti;
         }
-        if (vtparts.size() > 0)
-          --InsIdx;
-        continue;
-      }
-      if (Ty->isVectorTy()) {
-        EVT ObjectVT = getValueType(DL, Ty);
-        SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
-        unsigned NumElts = ObjectVT.getVectorNumElements();
-        assert(TLI->getNumRegisters(F->getContext(), ObjectVT) == NumElts &&
-               "Vector was not scalarized");
-        EVT EltVT = ObjectVT.getVectorElementType();
-
-        // V1 load
-        // f32 = load ...
-        if (NumElts == 1) {
-          // We only have one element, so just directly load it
-          Value *SrcValue = Constant::getNullValue(PointerType::get(
-              EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
-          SDValue P = DAG.getLoad(
-              EltVT, dl, Root, Arg, MachinePointerInfo(SrcValue),
-              DL.getABITypeAlignment(EltVT.getTypeForEVT(F->getContext())),
-              MachineMemOperand::MODereferenceable |
-                  MachineMemOperand::MOInvariant);
-          if (P.getNode())
-            P.getNode()->setIROrder(idx + 1);
 
-          if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
-            P = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, P);
-          InVals.push_back(P);
-          ++InsIdx;
-        } else if (NumElts == 2) {
-          // V2 load
-          // f32,f32 = load ...
-          EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, 2);
-          Value *SrcValue = Constant::getNullValue(PointerType::get(
-              VecVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
-          SDValue P = DAG.getLoad(
-              VecVT, dl, Root, Arg, MachinePointerInfo(SrcValue),
-              DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())),
-              MachineMemOperand::MODereferenceable |
-                  MachineMemOperand::MOInvariant);
+        // That's the last element of this load op.
+        if (VectorInfo[parti] & PVF_LAST) {
+          unsigned NumElts = parti - VecIdx + 1;
+          EVT EltVT = VTs[parti];
+          // i1 is loaded/stored as i8.
+          EVT LoadVT = EltVT;
+          if (EltVT == MVT::i1)
+            LoadVT = MVT::i8;
+          else if (EltVT == MVT::v2f16)
+            // getLoad needs a vector type, but it can't handle
+            // vectors which contain v2f16 elements. So we must load
+            // using i32 here and then bitcast back.
+            LoadVT = MVT::i32;
+
+          EVT VecVT = EVT::getVectorVT(F->getContext(), LoadVT, NumElts);
+          SDValue VecAddr =
+              DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
+                          DAG.getConstant(Offsets[VecIdx], dl, PtrVT));
+          Value *srcValue = Constant::getNullValue(PointerType::get(
+              EltVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
+          SDValue P =
+              DAG.getLoad(VecVT, dl, Root, VecAddr,
+                          MachinePointerInfo(srcValue), aggregateIsPacked,
+                          MachineMemOperand::MODereferenceable |
+                              MachineMemOperand::MOInvariant);
           if (P.getNode())
             P.getNode()->setIROrder(idx + 1);
-
-          SDValue Elt0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
-                                     DAG.getIntPtrConstant(0, dl));
-          SDValue Elt1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
-                                     DAG.getIntPtrConstant(1, dl));
-
-          if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits()) {
-            Elt0 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt0);
-            Elt1 = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt1);
-          }
-
-          InVals.push_back(Elt0);
-          InVals.push_back(Elt1);
-          InsIdx += 2;
-        } else {
-          // V4 loads
-          // We have at least 4 elements (<3 x Ty> expands to 4 elements) and
-          // the vector will be expanded to a power of 2 elements, so we know we
-          // can always round up to the next multiple of 4 when creating the
-          // vector loads.
-          // e.g.  4 elem => 1 ld.v4
-          //       6 elem => 2 ld.v4
-          //       8 elem => 2 ld.v4
-          //      11 elem => 3 ld.v4
-          unsigned VecSize = 4;
-          if (EltVT.getSizeInBits() == 64) {
-            VecSize = 2;
-          }
-          EVT VecVT = EVT::getVectorVT(F->getContext(), EltVT, VecSize);
-          unsigned Ofst = 0;
-          for (unsigned i = 0; i < NumElts; i += VecSize) {
-            Value *SrcValue = Constant::getNullValue(
-                PointerType::get(VecVT.getTypeForEVT(F->getContext()),
-                                 ADDRESS_SPACE_PARAM));
-            SDValue SrcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Arg,
-                                          DAG.getConstant(Ofst, dl, PtrVT));
-            SDValue P = DAG.getLoad(
-                VecVT, dl, Root, SrcAddr, MachinePointerInfo(SrcValue),
-                DL.getABITypeAlignment(VecVT.getTypeForEVT(F->getContext())),
-                MachineMemOperand::MODereferenceable |
-                    MachineMemOperand::MOInvariant);
-            if (P.getNode())
-              P.getNode()->setIROrder(idx + 1);
-
-            for (unsigned j = 0; j < VecSize; ++j) {
-              if (i + j >= NumElts)
-                break;
-              SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, P,
-                                        DAG.getIntPtrConstant(j, dl));
-              if (Ins[InsIdx].VT.getSizeInBits() > EltVT.getSizeInBits())
-                Elt = DAG.getNode(ISD::ANY_EXTEND, dl, Ins[InsIdx].VT, Elt);
-              InVals.push_back(Elt);
+          for (unsigned j = 0; j < NumElts; ++j) {
+            SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, LoadVT, P,
+                                      DAG.getIntPtrConstant(j, dl));
+            // We've loaded i1 as an i8 and now must truncate it back to i1
+            if (EltVT == MVT::i1)
+              Elt = DAG.getNode(ISD::TRUNCATE, dl, MVT::i1, Elt);
+            // v2f16 was loaded as an i32. Now we must bitcast it back.
+            else if (EltVT == MVT::v2f16)
+              Elt = DAG.getNode(ISD::BITCAST, dl, MVT::v2f16, Elt);
+            // Extend the element if necessary (e.g. an i8 is loaded
+            // into an i16 register)
+            if (Ins[InsIdx].VT.isInteger() &&
+                Ins[InsIdx].VT.getSizeInBits() > LoadVT.getSizeInBits()) {
+              unsigned Extend = Ins[InsIdx].Flags.isSExt() ? ISD::SIGN_EXTEND
+                                                           : ISD::ZERO_EXTEND;
+              Elt = DAG.getNode(Extend, dl, Ins[InsIdx].VT, Elt);
             }
-            Ofst += DL.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
+            InVals.push_back(Elt);
           }
-          InsIdx += NumElts;
-        }
 
-        if (NumElts > 0)
-          --InsIdx;
-        continue;
-      }
-      // A plain scalar.
-      EVT ObjectVT = getValueType(DL, Ty);
-      // If ABI, load from the param symbol
-      SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
-      Value *srcValue = Constant::getNullValue(PointerType::get(
-          ObjectVT.getTypeForEVT(F->getContext()), ADDRESS_SPACE_PARAM));
-      SDValue p;
-       if (ObjectVT.getSizeInBits() < Ins[InsIdx].VT.getSizeInBits()) {
-        ISD::LoadExtType ExtOp = Ins[InsIdx].Flags.isSExt() ? 
-                                       ISD::SEXTLOAD : ISD::ZEXTLOAD;
-        p = DAG.getExtLoad(
-            ExtOp, dl, Ins[InsIdx].VT, Root, Arg, MachinePointerInfo(srcValue),
-            ObjectVT,
-            DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
-      } else {
-        p = DAG.getLoad(
-            Ins[InsIdx].VT, dl, Root, Arg, MachinePointerInfo(srcValue),
-            DL.getABITypeAlignment(ObjectVT.getTypeForEVT(F->getContext())));
+          // Reset vector tracking state.
+          VecIdx = -1;
+        }
+        ++InsIdx;
       }
-      if (p.getNode())
-        p.getNode()->setIROrder(idx + 1);
-      InVals.push_back(p);
+      if (VTs.size() > 0)
+        --InsIdx;
       continue;
     }
 
@@ -2533,165 +2510,77 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
                                  const SmallVectorImpl<SDValue> &OutVals,
                                  const SDLoc &dl, SelectionDAG &DAG) const {
   MachineFunction &MF = DAG.getMachineFunction();
-  const Function *F = MF.getFunction();
-  Type *RetTy = F->getReturnType();
-  const DataLayout &TD = DAG.getDataLayout();
+  Type *RetTy = MF.getFunction()->getReturnType();
 
   bool isABI = (STI.getSmVersion() >= 20);
   assert(isABI && "Non-ABI compilation is not supported");
   if (!isABI)
     return Chain;
 
-  if (VectorType *VTy = dyn_cast<VectorType>(RetTy)) {
-    // If we have a vector type, the OutVals array will be the scalarized
-    // components and we have combine them into 1 or more vector stores.
-    unsigned NumElts = VTy->getNumElements();
-    assert(NumElts == Outs.size() && "Bad scalarization of return value");
+  const DataLayout DL = DAG.getDataLayout();
+  SmallVector<EVT, 16> VTs;
+  SmallVector<uint64_t, 16> Offsets;
+  ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets);
+  assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
+
+  auto VectorInfo = VectorizePTXValueVTs(
+      VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1);
+
+  // PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
+  // 32-bits are sign extended or zero extended, depending on whether
+  // they are signed or unsigned types.
+  bool ExtendIntegerRetVal =
+      RetTy->isIntegerTy() && DL.getTypeAllocSizeInBits(RetTy) < 32;
+
+  SmallVector<SDValue, 6> StoreOperands;
+  for (unsigned i = 0, e = VTs.size(); i != e; ++i) {
+    // New load/store. Record chain and offset operands.
+    if (VectorInfo[i] & PVF_FIRST) {
+      assert(StoreOperands.empty() && "Orphaned operand list.");
+      StoreOperands.push_back(Chain);
+      StoreOperands.push_back(DAG.getConstant(Offsets[i], dl, MVT::i32));
+    }
 
-    // const_cast can be removed in later LLVM versions
-    EVT EltVT = getValueType(TD, RetTy).getVectorElementType();
-    bool NeedExtend = false;
-    if (EltVT.getSizeInBits() < 16)
-      NeedExtend = true;
-
-    // V1 store
-    if (NumElts == 1) {
-      SDValue StoreVal = OutVals[0];
-      // We only have one element, so just directly store it
-      if (NeedExtend)
-        StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal);
-      SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal };
-      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
-                                      DAG.getVTList(MVT::Other), Ops,
-                                      EltVT, MachinePointerInfo());
-    } else if (NumElts == 2) {
-      // V2 store
-      SDValue StoreVal0 = OutVals[0];
-      SDValue StoreVal1 = OutVals[1];
-
-      if (NeedExtend) {
-        StoreVal0 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal0);
-        StoreVal1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, StoreVal1);
-      }
+    SDValue RetVal = OutVals[i];
+    if (ExtendIntegerRetVal) {
+      RetVal = DAG.getNode(Outs[i].Flags.isSExt() ? ISD::SIGN_EXTEND
+                                                  : ISD::ZERO_EXTEND,
+                           dl, MVT::i32, RetVal);
+    } else if (RetVal.getValueSizeInBits() < 16) {
+      // Use 16-bit registers for small load-stores as it's the
+      // smallest general purpose register size supported by NVPTX.
+      RetVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, RetVal);
+    }
 
-      SDValue Ops[] = { Chain, DAG.getConstant(0, dl, MVT::i32), StoreVal0,
-                        StoreVal1 };
-      Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetvalV2, dl,
-                                      DAG.getVTList(MVT::Other), Ops,
-                                      EltVT, MachinePointerInfo());
-    } else {
-      // V4 stores
-      // We have at least 4 elements (<3 x Ty> expands to 4 elements) and the
-      // vector will be expanded to a power of 2 elements, so we know we can
-      // always round up to the next multiple of 4 when creating the vector
-      // stores.
-      // e.g.  4 elem => 1 st.v4
-      //       6 elem => 2 st.v4
-      //       8 elem => 2 st.v4
-      //      11 elem => 3 st.v4
-
-      unsigned VecSize = 4;
-      if (OutVals[0].getValueSizeInBits() == 64)
-        VecSize = 2;
-
-      unsigned Offset = 0;
-
-      EVT VecVT =
-          EVT::getVectorVT(F->getContext(), EltVT, VecSize);
-      unsigned PerStoreOffset =
-          TD.getTypeAllocSize(VecVT.getTypeForEVT(F->getContext()));
-
-      for (unsigned i = 0; i < NumElts; i += VecSize) {
-        // Get values
-        SDValue StoreVal;
-        SmallVector<SDValue, 8> Ops;
-        Ops.push_back(Chain);
-        Ops.push_back(DAG.getConstant(Offset, dl, MVT::i32));
-        unsigned Opc = NVPTXISD::StoreRetvalV2;
-        EVT ExtendedVT = (NeedExtend) ? MVT::i16 : OutVals[0].getValueType();
-
-        StoreVal = OutVals[i];
-        if (NeedExtend)
-          StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
-        Ops.push_back(StoreVal);
-
-        if (i + 1 < NumElts) {
-          StoreVal = OutVals[i + 1];
-          if (NeedExtend)
-            StoreVal = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
-        } else {
-          StoreVal = DAG.getUNDEF(ExtendedVT);
-        }
-        Ops.push_back(StoreVal);
-
-        if (VecSize == 4) {
-          Opc = NVPTXISD::StoreRetvalV4;
-          if (i + 2 < NumElts) {
-            StoreVal = OutVals[i + 2];
-            if (NeedExtend)
-              StoreVal =
-                  DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
-          } else {
-            StoreVal = DAG.getUNDEF(ExtendedVT);
-          }
-          Ops.push_back(StoreVal);
-
-          if (i + 3 < NumElts) {
-            StoreVal = OutVals[i + 3];
-            if (NeedExtend)
-              StoreVal =
-                  DAG.getNode(ISD::ZERO_EXTEND, dl, ExtendedVT, StoreVal);
-          } else {
-            StoreVal = DAG.getUNDEF(ExtendedVT);
-          }
-          Ops.push_back(StoreVal);
-        }
+    // Record the value to return.
+    StoreOperands.push_back(RetVal);
 
-        // Chain = DAG.getNode(Opc, dl, MVT::Other, &Ops[0], Ops.size());
-        Chain =
-            DAG.getMemIntrinsicNode(Opc, dl, DAG.getVTList(MVT::Other), Ops,
-                                    EltVT, MachinePointerInfo());
-        Offset += PerStoreOffset;
-      }
-    }
-  } else {
-    SmallVector<EVT, 16> ValVTs;
-    SmallVector<uint64_t, 16> Offsets;
-    ComputePTXValueVTs(*this, DAG.getDataLayout(), RetTy, ValVTs, &Offsets, 0);
-    assert(ValVTs.size() == OutVals.size() && "Bad return value decomposition");
-
-    for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
-      SDValue theVal = OutVals[i];
-      EVT TheValType = theVal.getValueType();
-      unsigned numElems = 1;
-      if (TheValType.isVector())
-        numElems = TheValType.getVectorNumElements();
-      for (unsigned j = 0, je = numElems; j != je; ++j) {
-        SDValue TmpVal = theVal;
-        if (TheValType.isVector())
-          TmpVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
-                               TheValType.getVectorElementType(), TmpVal,
-                               DAG.getIntPtrConstant(j, dl));
-        EVT TheStoreType = ValVTs[i];
-        if (RetTy->isIntegerTy() && TD.getTypeAllocSizeInBits(RetTy) < 32) {
-          // The following zero-extension is for integer types only, and
-          // specifically not for aggregates.
-          TmpVal = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, TmpVal);
-          TheStoreType = MVT::i32;
-        } else if (RetTy->isHalfTy()) {
-          TheStoreType = MVT::f16;
-        } else if (TmpVal.getValueSizeInBits() < 16)
-          TmpVal = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i16, TmpVal);
-
-        SDValue Ops[] = {
-          Chain,
-          DAG.getConstant(Offsets[i], dl, MVT::i32),
-          TmpVal };
-        Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreRetval, dl,
-                                        DAG.getVTList(MVT::Other), Ops,
-                                        TheStoreType,
-                                        MachinePointerInfo());
+    // That's the last element of this store op.
+    if (VectorInfo[i] & PVF_LAST) {
+      NVPTXISD::NodeType Op;
+      unsigned NumElts = StoreOperands.size() - 2;
+      switch (NumElts) {
+      case 1:
+        Op = NVPTXISD::StoreRetval;
+        break;
+      case 2:
+        Op = NVPTXISD::StoreRetvalV2;
+        break;
+      case 4:
+        Op = NVPTXISD::StoreRetvalV4;
+        break;
+      default:
+        llvm_unreachable("Invalid vector info.");
       }
+
+      // Adjust type of load/store op if we've extended the scalar
+      // return value.
+      EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
+      Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other),
+                                      StoreOperands, TheStoreType,
+                                      MachinePointerInfo(), 1);
+      // Cleanup vector state.
+      StoreOperands.clear();
     }
   }
 
@@ -4413,6 +4302,27 @@ static SDValue PerformSHLCombine(SDNode *N,
   return SDValue();
 }
 
+static SDValue PerformSETCCCombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI) {
+  EVT CCType = N->getValueType(0);
+  SDValue A = N->getOperand(0);
+  SDValue B = N->getOperand(1);
+
+  if (CCType != MVT::v2i1 || A.getValueType() != MVT::v2f16)
+    return SDValue();
+
+  SDLoc DL(N);
+  // setp.f16x2 returns two scalar predicates, which we need to
+  // convert back to v2i1. The returned result will be scalarized by
+  // the legalizer, but the comparison will remain a single vector
+  // instruction.
+  SDValue CCNode = DCI.DAG.getNode(NVPTXISD::SETP_F16X2, DL,
+                                   DCI.DAG.getVTList(MVT::i1, MVT::i1),
+                                   {A, B, N->getOperand(2)});
+  return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, CCType, CCNode.getValue(0),
+                         CCNode.getValue(1));
+}
+
 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   CodeGenOpt::Level OptLevel = getTargetMachine().getOptLevel();
@@ -4430,6 +4340,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
     case ISD::UREM:
     case ISD::SREM:
       return PerformREMCombine(N, DCI, OptLevel);
+    case ISD::SETCC:
+      return PerformSETCCCombine(N, DCI);
   }
   return SDValue();
 }
@@ -4453,12 +4365,15 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
   case MVT::v2i16:
   case MVT::v2i32:
   case MVT::v2i64:
+  case MVT::v2f16:
   case MVT::v2f32:
   case MVT::v2f64:
   case MVT::v4i8:
   case MVT::v4i16:
   case MVT::v4i32:
+  case MVT::v4f16:
   case MVT::v4f32:
+  case MVT::v8f16: // <4 x f16x2>
     // This is a "native" vector type
     break;
   }
@@ -4492,6 +4407,7 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
 
   unsigned Opcode = 0;
   SDVTList LdResVTs;
+  bool LoadF16x2 = false;
 
   switch (NumElts) {
   default:
@@ -4506,6 +4422,18 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
     LdResVTs = DAG.getVTList(ListVTs);
     break;
   }
+  case 8: {
+    // v8f16 is a special case. PTX doesn't have ld.v8.f16
+    // instruction. Instead, we split the vector into v2f16 chunks and
+    // load them with ld.v4.b32.
+    assert(EltVT == MVT::f16 && "Unsupported v8 vector type.");
+    LoadF16x2 = true;
+    Opcode = NVPTXISD::LoadV4;
+    EVT ListVTs[] = {MVT::v2f16, MVT::v2f16, MVT::v2f16, MVT::v2f16,
+                     MVT::Other};
+    LdResVTs = DAG.getVTList(ListVTs);
+    break;
+  }
   }
 
   // Copy regular operands
@@ -4519,13 +4447,26 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                                           LD->getMemoryVT(),
                                           LD->getMemOperand());
 
-  SmallVector<SDValue, 4> ScalarRes;
-
-  for (unsigned i = 0; i < NumElts; ++i) {
-    SDValue Res = NewLD.getValue(i);
-    if (NeedTrunc)
-      Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
-    ScalarRes.push_back(Res);
+  SmallVector<SDValue, 8> ScalarRes;
+  if (LoadF16x2) {
+    // Split v2f16 subvectors back into individual elements.
+    NumElts /= 2;
+    for (unsigned i = 0; i < NumElts; ++i) {
+      SDValue SubVector = NewLD.getValue(i);
+      SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
+                               DAG.getIntPtrConstant(0, DL));
+      SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
+                               DAG.getIntPtrConstant(1, DL));
+      ScalarRes.push_back(E0);
+      ScalarRes.push_back(E1);
+    }
+  } else {
+    for (unsigned i = 0; i < NumElts; ++i) {
+      SDValue Res = NewLD.getValue(i);
+      if (NeedTrunc)
+        Res = DAG.getNode(ISD::TRUNCATE, DL, ResVT.getVectorElementType(), Res);
+      ScalarRes.push_back(Res);
+    }
   }
 
   SDValue LoadChain = NewLD.getValue(NumElts);
diff --git a/lib/Target/NVPTX/NVPTXISelLowering.h b/lib/Target/NVPTX/NVPTXISelLowering.h
index 05c54018b739786e4c6f422d722f8437c57cdaef..9d7b70d80c1178a7efb0452fe200b96be6a2c4ef 100644
--- a/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -56,6 +56,7 @@ enum NodeType : unsigned {
   MUL_WIDE_SIGNED,
   MUL_WIDE_UNSIGNED,
   IMAD,
+  SETP_F16X2,
   Dummy,
 
   LoadV2 = ISD::FIRST_TARGET_MEMORY_OPCODE,
@@ -73,7 +74,7 @@ enum NodeType : unsigned {
   StoreParamV2,
   StoreParamV4,
   StoreParamS32, // to sext and store a <32bit value, not used currently
-  StoreParamU32, // to zext and store a <32bit value, not used currently 
+  StoreParamU32, // to zext and store a <32bit value, not used currently
   StoreRetval,
   StoreRetvalV2,
   StoreRetvalV4,
@@ -526,6 +527,12 @@ public:
   // to sign-preserving zero.
   bool useF32FTZ(const MachineFunction &MF) const;
 
+  SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+                          int &ExtraSteps, bool &UseOneConst,
+                          bool Reciprocal) const override;
+
+  unsigned combineRepeatedFPDivisors() const override { return 2; }
+
   bool allowFMA(MachineFunction &MF, CodeGenOpt::Level OptLevel) const;
   bool allowUnsafeFPMath(MachineFunction &MF) const;
 
@@ -543,14 +550,15 @@ private:
   const NVPTXSubtarget &STI; // cache the subtarget here
   SDValue getParamSymbol(SelectionDAG &DAG, int idx, EVT) const;
 
+  SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerLOAD(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerLOADi1(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTOREi1(SDValue Op, SelectionDAG &DAG) const;
-  SDValue LowerSTOREf16(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const;
 
   SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
index 8d00bbb5e9c262cde5de4265a3385d3df3fdd25b..f12ed81b6d9fc9024291ddde1a245680ae53e200 100644
--- a/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
+++ b/lib/Target/NVPTX/NVPTXImageOptimizer.cpp
@@ -96,9 +96,7 @@ bool NVPTXImageOptimizer::replaceIsTypePSampler(Instruction &I) {
     // This is an OpenCL sampler, so it must be a samplerref
     replaceWith(&I, ConstantInt::getTrue(I.getContext()));
     return true;
-  } else if (isImageWriteOnly(*TexHandle) ||
-             isImageReadWrite(*TexHandle) ||
-             isImageReadOnly(*TexHandle)) {
+  } else if (isImage(*TexHandle)) {
     // This is an OpenCL image, so it cannot be a samplerref
     replaceWith(&I, ConstantInt::getFalse(I.getContext()));
     return true;
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index 67e6e252eb9a31e1090c057d7100ff8885662088..3026f0be242dd07fa8aa7c684fc458cd0cd7f026 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -55,6 +55,8 @@ void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
   } else if (DestRC == &NVPTX::Float16RegsRegClass) {
     Op = (SrcRC == &NVPTX::Float16RegsRegClass ? NVPTX::FMOV16rr
                                                : NVPTX::BITCONVERT_16_I2F);
+  } else if (DestRC == &NVPTX::Float16x2RegsRegClass) {
+    Op = NVPTX::IMOV32rr;
   } else if (DestRC == &NVPTX::Float32RegsRegClass) {
     Op = (SrcRC == &NVPTX::Float32RegsRegClass ? NVPTX::FMOV32rr
                                                : NVPTX::BITCONVERT_32_I2F);
diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td
index 8b703bd196e725fc004d55c9a0cd36d03fbf53fb..2b847414b8a8aef2c79cb1eedbcf95d188a1456b 100644
--- a/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -102,6 +102,9 @@ def CmpNAN_FTZ  : PatLeaf<(i32 0x111)>;
 def CmpMode : Operand<i32> {
   let PrintMethod = "printCmpMode";
 }
+def VecElement : Operand<i32> {
+  let PrintMethod = "printVecElement";
+}
 
 //===----------------------------------------------------------------------===//
 // NVPTX Instruction Predicate Definitions
@@ -305,6 +308,19 @@ multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
                [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
                Requires<[useFP16Math, allowFMA]>;
 
+   def f16x2rr_ftz :
+     NVPTXInst<(outs Float16x2Regs:$dst),
+               (ins Float16x2Regs:$a, Float16x2Regs:$b),
+               !strconcat(OpcStr, ".ftz.f16x2 \t$dst, $a, $b;"),
+               [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+               Requires<[useFP16Math, allowFMA, doF32FTZ]>;
+   def f16x2rr :
+     NVPTXInst<(outs Float16x2Regs:$dst),
+               (ins Float16x2Regs:$a, Float16x2Regs:$b),
+               !strconcat(OpcStr, ".f16x2 \t$dst, $a, $b;"),
+               [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+               Requires<[useFP16Math, allowFMA]>;
+
    // These have strange names so we don't perturb existing mir tests.
    def _rnf64rr :
      NVPTXInst<(outs Float64Regs:$dst),
@@ -354,6 +370,18 @@ multiclass F3_fma_component<string OpcStr, SDNode OpNode> {
                !strconcat(OpcStr, ".rn.f16 \t$dst, $a, $b;"),
                [(set Float16Regs:$dst, (OpNode Float16Regs:$a, Float16Regs:$b))]>,
                Requires<[useFP16Math, noFMA]>;
+   def _rnf16x2rr_ftz :
+     NVPTXInst<(outs Float16x2Regs:$dst),
+               (ins Float16x2Regs:$a, Float16x2Regs:$b),
+               !strconcat(OpcStr, ".rn.ftz.f16x2 \t$dst, $a, $b;"),
+               [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+               Requires<[useFP16Math, noFMA, doF32FTZ]>;
+   def _rnf16x2rr :
+     NVPTXInst<(outs Float16x2Regs:$dst),
+               (ins Float16x2Regs:$a, Float16x2Regs:$b),
+               !strconcat(OpcStr, ".rn.f16x2 \t$dst, $a, $b;"),
+               [(set Float16x2Regs:$dst, (OpNode Float16x2Regs:$a, Float16x2Regs:$b))]>,
+               Requires<[useFP16Math, noFMA]>;
 }
 
 // Template for operations which take two f32 or f64 operands.  Provides three
@@ -489,7 +517,7 @@ multiclass ADD_SUB_i1<SDNode OpNode> {
 defm ADD_i1 : ADD_SUB_i1<add>;
 defm SUB_i1 : ADD_SUB_i1<sub>;
 
-// int16, int32, and int64 signed addition.  Since nvptx is 2's compliment, we
+// int16, int32, and int64 signed addition.  Since nvptx is 2's complement, we
 // also use these for unsigned arithmetic.
 defm ADD : I3<"add.s", add>;
 defm SUB : I3<"sub.s", sub>;
@@ -966,18 +994,9 @@ def FDIV32ri_prec :
             Requires<[reqPTX20]>;
 
 //
-// F32 rsqrt
+// FMA
 //
 
-def RSQRTF32approx1r : NVPTXInst<(outs Float32Regs:$dst), (ins Float32Regs:$b),
-                       "rsqrt.approx.f32 \t$dst, $b;", []>;
-
-// Convert 1.0f/sqrt(x) to rsqrt.approx.f32.  (There is an rsqrt.approx.f64, but
-// it's emulated in software.)
-def: Pat<(fdiv FloatConst1, (int_nvvm_sqrt_f Float32Regs:$b)),
-         (RSQRTF32approx1r Float32Regs:$b)>,
-         Requires<[do_DIVF32_FULL, do_SQRTF32_APPROX, doNoF32FTZ]>;
-
 multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
    def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                        !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
@@ -1000,15 +1019,17 @@ multiclass FMA<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred>
                        Requires<[Pred]>;
 }
 
-multiclass FMA_F16<string OpcStr, RegisterClass RC, Operand ImmCls, Predicate Pred> {
+multiclass FMA_F16<string OpcStr, RegisterClass RC, Predicate Pred> {
    def rrr : NVPTXInst<(outs RC:$dst), (ins RC:$a, RC:$b, RC:$c),
                        !strconcat(OpcStr, " \t$dst, $a, $b, $c;"),
                        [(set RC:$dst, (fma RC:$a, RC:$b, RC:$c))]>,
                        Requires<[useFP16Math, Pred]>;
 }
 
-defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", Float16Regs, f16imm, doF32FTZ>;
-defm FMA16     : FMA_F16<"fma.rn.f16", Float16Regs, f16imm, true>;
+defm FMA16_ftz : FMA_F16<"fma.rn.ftz.f16", Float16Regs, doF32FTZ>;
+defm FMA16     : FMA_F16<"fma.rn.f16", Float16Regs, true>;
+defm FMA16x2_ftz : FMA_F16<"fma.rn.ftz.f16x2", Float16x2Regs, doF32FTZ>;
+defm FMA16x2     : FMA_F16<"fma.rn.f16x2", Float16x2Regs, true>;
 defm FMA32_ftz : FMA<"fma.rn.ftz.f32", Float32Regs, f32imm, doF32FTZ>;
 defm FMA32     : FMA<"fma.rn.f32", Float32Regs, f32imm, true>;
 defm FMA64     : FMA<"fma.rn.f64", Float64Regs, f64imm, true>;
@@ -1399,9 +1420,17 @@ defm SETP_f64 : SETP<"f64", Float64Regs, f64imm>;
 def SETP_f16rr :
       NVPTXInst<(outs Int1Regs:$dst),
                 (ins Float16Regs:$a, Float16Regs:$b, CmpMode:$cmp),
-                "setp${cmp:base}${cmp:ftz}.f16	$dst, $a, $b;",
+                "setp${cmp:base}${cmp:ftz}.f16 \t$dst, $a, $b;",
                 []>, Requires<[useFP16Math]>;
 
+def SETP_f16x2rr :
+      NVPTXInst<(outs Int1Regs:$p, Int1Regs:$q),
+                (ins Float16x2Regs:$a, Float16x2Regs:$b, CmpMode:$cmp),
+                "setp${cmp:base}${cmp:ftz}.f16x2 \t$p|$q, $a, $b;",
+                []>,
+                Requires<[useFP16Math]>;
+
+
 // FIXME: This doesn't appear to be correct.  The "set" mnemonic has the form
 // "set.CmpOp{.ftz}.dtype.stype", where dtype is the type of the destination
 // reg, either u32, s32, or f32.  Anyway these aren't used at the moment.
@@ -1497,6 +1526,13 @@ defm SELP_f16 : SELP_PATTERN<"b16", Float16Regs, f16imm, fpimm>;
 defm SELP_f32 : SELP_PATTERN<"f32", Float32Regs, f32imm, fpimm>;
 defm SELP_f64 : SELP_PATTERN<"f64", Float64Regs, f64imm, fpimm>;
 
+def SELP_f16x2rr :
+    NVPTXInst<(outs Float16x2Regs:$dst),
+              (ins Float16x2Regs:$a, Float16x2Regs:$b, Int1Regs:$p),
+              "selp.b32 \t$dst, $a, $b, $p;",
+              [(set Float16x2Regs:$dst,
+                    (select Int1Regs:$p, Float16x2Regs:$a, Float16x2Regs:$b))]>;
+
 //-----------------------------------
 // Data Movement (Load / Store, Move)
 //-----------------------------------
@@ -2070,10 +2106,15 @@ def LoadParamMemV4I32  : LoadParamV4MemInst<Int32Regs, ".b32">;
 def LoadParamMemV4I16  : LoadParamV4MemInst<Int16Regs, ".b16">;
 def LoadParamMemV4I8   : LoadParamV4MemInst<Int16Regs, ".b8">;
 def LoadParamMemF16    : LoadParamMemInst<Float16Regs, ".b16">;
+def LoadParamMemF16x2  : LoadParamMemInst<Float16x2Regs, ".b32">;
 def LoadParamMemF32    : LoadParamMemInst<Float32Regs, ".f32">;
 def LoadParamMemF64    : LoadParamMemInst<Float64Regs, ".f64">;
+def LoadParamMemV2F16  : LoadParamV2MemInst<Float16Regs, ".b16">;
+def LoadParamMemV2F16x2: LoadParamV2MemInst<Float16x2Regs, ".b32">;
 def LoadParamMemV2F32  : LoadParamV2MemInst<Float32Regs, ".f32">;
 def LoadParamMemV2F64  : LoadParamV2MemInst<Float64Regs, ".f64">;
+def LoadParamMemV4F16  : LoadParamV4MemInst<Float16Regs, ".b16">;
+def LoadParamMemV4F16x2: LoadParamV4MemInst<Float16x2Regs, ".b32">;
 def LoadParamMemV4F32  : LoadParamV4MemInst<Float32Regs, ".f32">;
 
 def StoreParamI64    : StoreParamInst<Int64Regs, ".b64">;
@@ -2091,10 +2132,15 @@ def StoreParamV4I16  : StoreParamV4Inst<Int16Regs, ".b16">;
 def StoreParamV4I8   : StoreParamV4Inst<Int16Regs, ".b8">;
 
 def StoreParamF16      : StoreParamInst<Float16Regs, ".b16">;
+def StoreParamF16x2    : StoreParamInst<Float16x2Regs, ".b32">;
 def StoreParamF32      : StoreParamInst<Float32Regs, ".f32">;
 def StoreParamF64      : StoreParamInst<Float64Regs, ".f64">;
+def StoreParamV2F16    : StoreParamV2Inst<Float16Regs, ".b16">;
+def StoreParamV2F16x2  : StoreParamV2Inst<Float16x2Regs, ".b32">;
 def StoreParamV2F32    : StoreParamV2Inst<Float32Regs, ".f32">;
 def StoreParamV2F64    : StoreParamV2Inst<Float64Regs, ".f64">;
+def StoreParamV4F16    : StoreParamV4Inst<Float16Regs, ".b16">;
+def StoreParamV4F16x2  : StoreParamV4Inst<Float16x2Regs, ".b32">;
 def StoreParamV4F32    : StoreParamV4Inst<Float32Regs, ".f32">;
 
 def StoreRetvalI64    : StoreRetvalInst<Int64Regs, ".b64">;
@@ -2112,9 +2158,14 @@ def StoreRetvalV4I8   : StoreRetvalV4Inst<Int16Regs, ".b8">;
 def StoreRetvalF64    : StoreRetvalInst<Float64Regs, ".f64">;
 def StoreRetvalF32    : StoreRetvalInst<Float32Regs, ".f32">;
 def StoreRetvalF16    : StoreRetvalInst<Float16Regs, ".b16">;
+def StoreRetvalF16x2  : StoreRetvalInst<Float16x2Regs, ".b32">;
 def StoreRetvalV2F64  : StoreRetvalV2Inst<Float64Regs, ".f64">;
 def StoreRetvalV2F32  : StoreRetvalV2Inst<Float32Regs, ".f32">;
+def StoreRetvalV2F16  : StoreRetvalV2Inst<Float16Regs, ".b16">;
+def StoreRetvalV2F16x2: StoreRetvalV2Inst<Float16x2Regs, ".b32">;
 def StoreRetvalV4F32  : StoreRetvalV4Inst<Float32Regs, ".f32">;
+def StoreRetvalV4F16  : StoreRetvalV4Inst<Float16Regs, ".b16">;
+def StoreRetvalV4F16x2: StoreRetvalV4Inst<Float16x2Regs, ".b32">;
 
 def CallArgBeginInst : NVPTXInst<(outs), (ins), "(", [(CallArgBegin)]>;
 def CallArgEndInst1  : NVPTXInst<(outs), (ins), ");", [(CallArgEnd (i32 1))]>;
@@ -2261,6 +2312,7 @@ let mayLoad=1, hasSideEffects=0 in {
   defm LD_i32 : LD<Int32Regs>;
   defm LD_i64 : LD<Int64Regs>;
   defm LD_f16 : LD<Float16Regs>;
+  defm LD_f16x2 : LD<Float16x2Regs>;
   defm LD_f32 : LD<Float32Regs>;
   defm LD_f64 : LD<Float64Regs>;
 }
@@ -2310,6 +2362,7 @@ let mayStore=1, hasSideEffects=0 in {
   defm ST_i32 : ST<Int32Regs>;
   defm ST_i64 : ST<Int64Regs>;
   defm ST_f16 : ST<Float16Regs>;
+  defm ST_f16x2 : ST<Float16x2Regs>;
   defm ST_f32 : ST<Float32Regs>;
   defm ST_f64 : ST<Float64Regs>;
 }
@@ -2396,6 +2449,8 @@ let mayLoad=1, hasSideEffects=0 in {
   defm LDV_i16 : LD_VEC<Int16Regs>;
   defm LDV_i32 : LD_VEC<Int32Regs>;
   defm LDV_i64 : LD_VEC<Int64Regs>;
+  defm LDV_f16 : LD_VEC<Float16Regs>;
+  defm LDV_f16x2 : LD_VEC<Float16x2Regs>;
   defm LDV_f32 : LD_VEC<Float32Regs>;
   defm LDV_f64 : LD_VEC<Float64Regs>;
 }
@@ -2489,17 +2544,18 @@ let mayStore=1, hasSideEffects=0 in {
   defm STV_i16 : ST_VEC<Int16Regs>;
   defm STV_i32 : ST_VEC<Int32Regs>;
   defm STV_i64 : ST_VEC<Int64Regs>;
+  defm STV_f16 : ST_VEC<Float16Regs>;
+  defm STV_f16x2 : ST_VEC<Float16x2Regs>;
   defm STV_f32 : ST_VEC<Float32Regs>;
   defm STV_f64 : ST_VEC<Float64Regs>;
 }
 
-
 //---- Conversion ----
 
 class F_BITCONVERT<string SzStr, NVPTXRegClass regclassIn,
   NVPTXRegClass regclassOut> :
            NVPTXInst<(outs regclassOut:$d), (ins regclassIn:$a),
-           !strconcat("mov.b", !strconcat(SzStr, " \t $d, $a;")),
+           !strconcat("mov.b", !strconcat(SzStr, " \t$d, $a;")),
      [(set regclassOut:$d, (bitconvert regclassIn:$a))]>;
 
 def BITCONVERT_16_I2F : F_BITCONVERT<"16", Int16Regs, Float16Regs>;
@@ -2508,6 +2564,8 @@ def BITCONVERT_32_I2F : F_BITCONVERT<"32", Int32Regs, Float32Regs>;
 def BITCONVERT_32_F2I : F_BITCONVERT<"32", Float32Regs, Int32Regs>;
 def BITCONVERT_64_I2F : F_BITCONVERT<"64", Int64Regs, Float64Regs>;
 def BITCONVERT_64_F2I : F_BITCONVERT<"64", Float64Regs, Int64Regs>;
+def BITCONVERT_32_I2F16x2 : F_BITCONVERT<"32", Int32Regs, Float16x2Regs>;
+def BITCONVERT_32_F16x22I : F_BITCONVERT<"32", Float16x2Regs, Int32Regs>;
 
 // NOTE: pred->fp are currently sub-optimal due to an issue in TableGen where
 // we cannot specify floating-point literals in isel patterns.  Therefore, we
@@ -2750,6 +2808,9 @@ def : Pat<(select Int32Regs:$pred, Int32Regs:$a, Int32Regs:$b),
 def : Pat<(select Int32Regs:$pred, Int64Regs:$a, Int64Regs:$b),
           (SELP_b64rr Int64Regs:$a, Int64Regs:$b,
           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
+def : Pat<(select Int32Regs:$pred, Float16Regs:$a, Float16Regs:$b),
+          (SELP_f16rr Float16Regs:$a, Float16Regs:$b,
+          (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
 def : Pat<(select Int32Regs:$pred, Float32Regs:$a, Float32Regs:$b),
           (SELP_f32rr Float32Regs:$a, Float32Regs:$b,
           (SETP_b32ri (ANDb32ri Int32Regs:$pred, 1), 1, CmpEQ))>;
@@ -2788,6 +2849,49 @@ let hasSideEffects = 0 in {
   def F64toV2F32 : NVPTXInst<(outs Float32Regs:$d1, Float32Regs:$d2),
                              (ins Float64Regs:$s),
                              "mov.b64 \t{{$d1, $d2}}, $s;", []>;
+
+}
+
+let hasSideEffects = 0 in {
+  // Extract element of f16x2 register. PTX does not provide any way
+  // to access elements of f16x2 vector directly, so we need to
+  // extract it using a temporary register.
+  def F16x2toF16_0 : NVPTXInst<(outs Float16Regs:$dst),
+                               (ins Float16x2Regs:$src),
+                               "{{ .reg .b16 \t%tmp_hi;\n\t"
+                               "  mov.b32 \t{$dst, %tmp_hi}, $src; }}",
+                               [(set Float16Regs:$dst,
+                                 (extractelt (v2f16 Float16x2Regs:$src), 0))]>;
+  def F16x2toF16_1 : NVPTXInst<(outs Float16Regs:$dst),
+                               (ins Float16x2Regs:$src),
+                               "{{ .reg .b16 \t%tmp_lo;\n\t"
+                               "  mov.b32 \t{%tmp_lo, $dst}, $src; }}",
+                               [(set Float16Regs:$dst,
+                                 (extractelt (v2f16 Float16x2Regs:$src), 1))]>;
+
+  // Coalesce two f16 registers into f16x2
+  def BuildF16x2 : NVPTXInst<(outs Float16x2Regs:$dst),
+                             (ins Float16Regs:$a, Float16Regs:$b),
+                             "mov.b32 \t$dst, {{$a, $b}};",
+                             [(set Float16x2Regs:$dst,
+                               (build_vector (f16 Float16Regs:$a), (f16 Float16Regs:$b)))]>;
+
+  // Directly initializing the underlying b32 register is one less SASS
+  // instruction than a vector-packing move.
+  def BuildF16x2i : NVPTXInst<(outs Float16x2Regs:$dst), (ins i32imm:$src),
+                              "mov.b32 \t$dst, $src;",
+                              []>;
+
+  // Split f16x2 into two f16 registers.
+  def SplitF16x2  : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
+                              (ins Float16x2Regs:$src),
+                              "mov.b32 \t{{$lo, $hi}}, $src;",
+                              []>;
+  // Split an i32 into two f16
+  def SplitI32toF16x2  : NVPTXInst<(outs Float16Regs:$lo, Float16Regs:$hi),
+                                   (ins Int32Regs:$src),
+                                   "mov.b32 \t{{$lo, $hi}}, $src;",
+                                   []>;
 }
 
 // Count leading zeros
diff --git a/lib/Target/NVPTX/NVPTXIntrinsics.td b/lib/Target/NVPTX/NVPTXIntrinsics.td
index 509a253d24a025be3f9ded0bfeaae751969bfae3..8d228a9eeb74d8b8a9aa90fda13974a9f273021d 100644
--- a/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -1503,6 +1503,8 @@ defm INT_PTX_LDU_GLOBAL_i8  : LDU_G<"u8 \t$result, [$src];", Int16Regs>;
 defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>;
 defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
 defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
+defm INT_PTX_LDU_GLOBAL_f16 : LDU_G<"b16 \t$result, [$src];", Float16Regs>;
+defm INT_PTX_LDU_GLOBAL_f16x2 : LDU_G<"b32 \t$result, [$src];", Float16x2Regs>;
 defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>;
 defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
 defm INT_PTX_LDU_GLOBAL_p32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
@@ -1553,6 +1555,10 @@ defm INT_PTX_LDU_G_v2i16_ELE
   : VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
 defm INT_PTX_LDU_G_v2i32_ELE
   : VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
+defm INT_PTX_LDU_G_v2f16_ELE
+  : VLDU_G_ELE_V2<"v2.b16 \t{{$dst1, $dst2}}, [$src];", Float16Regs>;
+defm INT_PTX_LDU_G_v2f16x2_ELE
+  : VLDU_G_ELE_V2<"v2.b32 \t{{$dst1, $dst2}}, [$src];", Float16x2Regs>;
 defm INT_PTX_LDU_G_v2f32_ELE
   : VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
 defm INT_PTX_LDU_G_v2i64_ELE
@@ -1567,6 +1573,12 @@ defm INT_PTX_LDU_G_v4i16_ELE
 defm INT_PTX_LDU_G_v4i32_ELE
   : VLDU_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
     Int32Regs>;
+defm INT_PTX_LDU_G_v4f16_ELE
+  : VLDU_G_ELE_V4<"v4.b16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+    Float16Regs>;
+defm INT_PTX_LDU_G_v4f16x2_ELE
+  : VLDU_G_ELE_V4<"v4.b32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
+    Float16x2Regs>;
 defm INT_PTX_LDU_G_v4f32_ELE
   : VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
     Float32Regs>;
@@ -1606,6 +1618,10 @@ defm INT_PTX_LDG_GLOBAL_i32
   : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
 defm INT_PTX_LDG_GLOBAL_i64
   : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
+defm INT_PTX_LDG_GLOBAL_f16
+  : LDG_G<"b16 \t$result, [$src];", Float16Regs>;
+defm INT_PTX_LDG_GLOBAL_f16x2
+  : LDG_G<"b32 \t$result, [$src];", Float16x2Regs>;
 defm INT_PTX_LDG_GLOBAL_f32
   : LDG_G<"f32 \t$result, [$src];", Float32Regs>;
 defm INT_PTX_LDG_GLOBAL_f64
@@ -1661,6 +1677,10 @@ defm INT_PTX_LDG_G_v2i16_ELE
   : VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
 defm INT_PTX_LDG_G_v2i32_ELE
   : VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
+defm INT_PTX_LDG_G_v2f16_ELE
+  : VLDG_G_ELE_V2<"v2.b16 \t{{$dst1, $dst2}}, [$src];", Float16Regs>;
+defm INT_PTX_LDG_G_v2f16x2_ELE
+  : VLDG_G_ELE_V2<"v2.b32 \t{{$dst1, $dst2}}, [$src];", Float16x2Regs>;
 defm INT_PTX_LDG_G_v2f32_ELE
   : VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
 defm INT_PTX_LDG_G_v2i64_ELE
@@ -1673,6 +1693,10 @@ defm INT_PTX_LDG_G_v4i16_ELE
   : VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
 defm INT_PTX_LDG_G_v4i32_ELE
   : VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>;
+defm INT_PTX_LDG_G_v4f16_ELE
+  : VLDG_G_ELE_V4<"v4.b16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float16Regs>;
+defm INT_PTX_LDG_G_v4f16x2_ELE
+  : VLDG_G_ELE_V4<"v4.b32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float16x2Regs>;
 defm INT_PTX_LDG_G_v4f32_ELE
   : VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>;
 
diff --git a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
index b925b632ee4a7647c7ed294f15dd51568dd94bfb..3be291b48b8f29d4778ae1e56702664219afb55d 100644
--- a/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -26,6 +26,7 @@
 #include "llvm/IR/Module.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
 
 #define DEBUG_TYPE "nvptx"
 
@@ -54,188 +55,6 @@ struct NVPTXLowerAggrCopies : public FunctionPass {
 
 char NVPTXLowerAggrCopies::ID = 0;
 
-// Lower memcpy to loop.
-void convertMemCpyToLoop(Instruction *ConvertedInst, Value *SrcAddr,
-                         Value *DstAddr, Value *CopyLen, bool SrcIsVolatile,
-                         bool DstIsVolatile, LLVMContext &Context,
-                         Function &F) {
-  Type *TypeOfCopyLen = CopyLen->getType();
-
-  BasicBlock *OrigBB = ConvertedInst->getParent();
-  BasicBlock *NewBB =
-      ConvertedInst->getParent()->splitBasicBlock(ConvertedInst, "split");
-  BasicBlock *LoopBB = BasicBlock::Create(Context, "loadstoreloop", &F, NewBB);
-
-  OrigBB->getTerminator()->setSuccessor(0, LoopBB);
-  IRBuilder<> Builder(OrigBB->getTerminator());
-
-  // SrcAddr and DstAddr are expected to be pointer types,
-  // so no check is made here.
-  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
-  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
-
-  // Cast pointers to (char *)
-  SrcAddr = Builder.CreateBitCast(SrcAddr, Builder.getInt8PtrTy(SrcAS));
-  DstAddr = Builder.CreateBitCast(DstAddr, Builder.getInt8PtrTy(DstAS));
-
-  IRBuilder<> LoopBuilder(LoopBB);
-  PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
-  LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
-
-  // load from SrcAddr+LoopIndex
-  // TODO: we can leverage the align parameter of llvm.memcpy for more efficient
-  // word-sized loads and stores.
-  Value *Element =
-      LoopBuilder.CreateLoad(LoopBuilder.CreateInBoundsGEP(
-                                 LoopBuilder.getInt8Ty(), SrcAddr, LoopIndex),
-                             SrcIsVolatile);
-  // store at DstAddr+LoopIndex
-  LoopBuilder.CreateStore(Element,
-                          LoopBuilder.CreateInBoundsGEP(LoopBuilder.getInt8Ty(),
-                                                        DstAddr, LoopIndex),
-                          DstIsVolatile);
-
-  // The value for LoopIndex coming from backedge is (LoopIndex + 1)
-  Value *NewIndex =
-      LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
-  LoopIndex->addIncoming(NewIndex, LoopBB);
-
-  LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
-                           NewBB);
-}
-
-// Lower memmove to IR. memmove is required to correctly copy overlapping memory
-// regions; therefore, it has to check the relative positions of the source and
-// destination pointers and choose the copy direction accordingly.
-//
-// The code below is an IR rendition of this C function:
-//
-// void* memmove(void* dst, const void* src, size_t n) {
-//   unsigned char* d = dst;
-//   const unsigned char* s = src;
-//   if (s < d) {
-//     // copy backwards
-//     while (n--) {
-//       d[n] = s[n];
-//     }
-//   } else {
-//     // copy forward
-//     for (size_t i = 0; i < n; ++i) {
-//       d[i] = s[i];
-//     }
-//   }
-//   return dst;
-// }
-void convertMemMoveToLoop(Instruction *ConvertedInst, Value *SrcAddr,
-                          Value *DstAddr, Value *CopyLen, bool SrcIsVolatile,
-                          bool DstIsVolatile, LLVMContext &Context,
-                          Function &F) {
-  Type *TypeOfCopyLen = CopyLen->getType();
-  BasicBlock *OrigBB = ConvertedInst->getParent();
-
-  // Create the a comparison of src and dst, based on which we jump to either
-  // the forward-copy part of the function (if src >= dst) or the backwards-copy
-  // part (if src < dst).
-  // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
-  // structure. Its block terminators (unconditional branches) are replaced by
-  // the appropriate conditional branches when the loop is built.
-  ICmpInst *PtrCompare = new ICmpInst(ConvertedInst, ICmpInst::ICMP_ULT,
-                                      SrcAddr, DstAddr, "compare_src_dst");
-  TerminatorInst *ThenTerm, *ElseTerm;
-  SplitBlockAndInsertIfThenElse(PtrCompare, ConvertedInst, &ThenTerm,
-                                &ElseTerm);
-
-  // Each part of the function consists of two blocks:
-  //   copy_backwards:        used to skip the loop when n == 0
-  //   copy_backwards_loop:   the actual backwards loop BB
-  //   copy_forward:          used to skip the loop when n == 0
-  //   copy_forward_loop:     the actual forward loop BB
-  BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
-  CopyBackwardsBB->setName("copy_backwards");
-  BasicBlock *CopyForwardBB = ElseTerm->getParent();
-  CopyForwardBB->setName("copy_forward");
-  BasicBlock *ExitBB = ConvertedInst->getParent();
-  ExitBB->setName("memmove_done");
-
-  // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
-  // between both backwards and forward copy clauses.
-  ICmpInst *CompareN =
-      new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, CopyLen,
-                   ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
-
-  // Copying backwards.
-  BasicBlock *LoopBB =
-      BasicBlock::Create(Context, "copy_backwards_loop", &F, CopyForwardBB);
-  IRBuilder<> LoopBuilder(LoopBB);
-  PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
-  Value *IndexPtr = LoopBuilder.CreateSub(
-      LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
-  Value *Element = LoopBuilder.CreateLoad(
-      LoopBuilder.CreateInBoundsGEP(SrcAddr, IndexPtr), "element");
-  LoopBuilder.CreateStore(Element,
-                          LoopBuilder.CreateInBoundsGEP(DstAddr, IndexPtr));
-  LoopBuilder.CreateCondBr(
-      LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
-      ExitBB, LoopBB);
-  LoopPhi->addIncoming(IndexPtr, LoopBB);
-  LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
-  BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm);
-  ThenTerm->eraseFromParent();
-
-  // Copying forward.
-  BasicBlock *FwdLoopBB =
-      BasicBlock::Create(Context, "copy_forward_loop", &F, ExitBB);
-  IRBuilder<> FwdLoopBuilder(FwdLoopBB);
-  PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
-  Value *FwdElement = FwdLoopBuilder.CreateLoad(
-      FwdLoopBuilder.CreateInBoundsGEP(SrcAddr, FwdCopyPhi), "element");
-  FwdLoopBuilder.CreateStore(
-      FwdElement, FwdLoopBuilder.CreateInBoundsGEP(DstAddr, FwdCopyPhi));
-  Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
-      FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
-  FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
-                              ExitBB, FwdLoopBB);
-  FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
-  FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
-
-  BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm);
-  ElseTerm->eraseFromParent();
-}
-
-// Lower memset to loop.
-void convertMemSetToLoop(Instruction *ConvertedInst, Value *DstAddr,
-                         Value *CopyLen, Value *SetValue, LLVMContext &Context,
-                         Function &F) {
-  BasicBlock *OrigBB = ConvertedInst->getParent();
-  BasicBlock *NewBB =
-      ConvertedInst->getParent()->splitBasicBlock(ConvertedInst, "split");
-  BasicBlock *LoopBB = BasicBlock::Create(Context, "loadstoreloop", &F, NewBB);
-
-  OrigBB->getTerminator()->setSuccessor(0, LoopBB);
-  IRBuilder<> Builder(OrigBB->getTerminator());
-
-  // Cast pointer to the type of value getting stored
-  unsigned dstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
-  DstAddr = Builder.CreateBitCast(DstAddr,
-                                  PointerType::get(SetValue->getType(), dstAS));
-
-  IRBuilder<> LoopBuilder(LoopBB);
-  PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLen->getType(), 0);
-  LoopIndex->addIncoming(ConstantInt::get(CopyLen->getType(), 0), OrigBB);
-
-  LoopBuilder.CreateStore(
-      SetValue,
-      LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
-      false);
-
-  Value *NewIndex =
-      LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLen->getType(), 1));
-  LoopIndex->addIncoming(NewIndex, LoopBB);
-
-  LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
-                           NewBB);
-}
-
 bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
   SmallVector<LoadInst *, 4> AggrLoads;
   SmallVector<MemIntrinsic *, 4> MemCalls;
@@ -287,13 +106,13 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
     unsigned NumLoads = DL.getTypeStoreSize(LI->getType());
     Value *CopyLen = ConstantInt::get(Type::getInt32Ty(Context), NumLoads);
 
-    convertMemCpyToLoop(/* ConvertedInst */ SI,
-                        /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
-                        /* CopyLen */ CopyLen,
-                        /* SrcIsVolatile */ LI->isVolatile(),
-                        /* DstIsVolatile */ SI->isVolatile(),
-                        /* Context */ Context,
-                        /* Function F */ F);
+    createMemCpyLoop(/* ConvertedInst */ SI,
+                     /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
+                     /* CopyLen */ CopyLen,
+                     /* SrcAlign */ LI->getAlignment(),
+                     /* DestAlign */ SI->getAlignment(),
+                     /* SrcIsVolatile */ LI->isVolatile(),
+                     /* DstIsVolatile */ SI->isVolatile());
 
     SI->eraseFromParent();
     LI->eraseFromParent();
@@ -302,31 +121,11 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
   // Transform mem* intrinsic calls.
   for (MemIntrinsic *MemCall : MemCalls) {
     if (MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(MemCall)) {
-      convertMemCpyToLoop(/* ConvertedInst */ Memcpy,
-                          /* SrcAddr */ Memcpy->getRawSource(),
-                          /* DstAddr */ Memcpy->getRawDest(),
-                          /* CopyLen */ Memcpy->getLength(),
-                          /* SrcIsVolatile */ Memcpy->isVolatile(),
-                          /* DstIsVolatile */ Memcpy->isVolatile(),
-                          /* Context */ Context,
-                          /* Function F */ F);
+      expandMemCpyAsLoop(Memcpy);
     } else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) {
-      convertMemMoveToLoop(/* ConvertedInst */ Memmove,
-                           /* SrcAddr */ Memmove->getRawSource(),
-                           /* DstAddr */ Memmove->getRawDest(),
-                           /* CopyLen */ Memmove->getLength(),
-                           /* SrcIsVolatile */ Memmove->isVolatile(),
-                           /* DstIsVolatile */ Memmove->isVolatile(),
-                           /* Context */ Context,
-                           /* Function F */ F);
-
+      expandMemMoveAsLoop(Memmove);
     } else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) {
-      convertMemSetToLoop(/* ConvertedInst */ Memset,
-                          /* DstAddr */ Memset->getRawDest(),
-                          /* CopyLen */ Memset->getLength(),
-                          /* SetValue */ Memset->getValue(),
-                          /* Context */ Context,
-                          /* Function F */ F);
+      expandMemSetAsLoop(Memset);
     }
     MemCall->eraseFromParent();
   }
diff --git a/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 3f0c7be7863d10c339bc1d49b6a3e0eee92ef2dd..5b626cbcd5ba12ebd52bb17038d4810bad0cde98 100644
--- a/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -159,7 +159,8 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
   assert(PType && "Expecting pointer type in handleByValParam");
 
   Type *StructType = PType->getElementType();
-  AllocaInst *AllocA = new AllocaInst(StructType, Arg->getName(), FirstInst);
+  unsigned AS = Func->getParent()->getDataLayout().getAllocaAddrSpace();
+  AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
   // Set the alignment to alignment of the byval parameter. This is because,
   // later load/stores assume that alignment, and we are going to replace
   // the use of the byval parameter with this alloca instruction.
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
index 9caedfb0fef2f7cb1a98800db522591faac4e9f2..8d46694fbe50abacfd1beb2ee2b6b9f33d195f64 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.cpp
@@ -35,6 +35,8 @@ std::string getNVPTXRegClassName(TargetRegisterClass const *RC) {
     // accepted for all supported fp16 instructions on all GPU
     // variants, so we can use them instead.
     return ".b16";
+  if (RC == &NVPTX::Float16x2RegsRegClass)
+    return ".b32";
   if (RC == &NVPTX::Float64RegsRegClass)
     return ".f64";
   if (RC == &NVPTX::Int64RegsRegClass)
@@ -73,6 +75,8 @@ std::string getNVPTXRegClassStr(TargetRegisterClass const *RC) {
     return "%f";
   if (RC == &NVPTX::Float16RegsRegClass)
     return "%h";
+  if (RC == &NVPTX::Float16x2RegsRegClass)
+    return "%hh";
   if (RC == &NVPTX::Float64RegsRegClass)
     return "%fd";
   if (RC == &NVPTX::Int64RegsRegClass)
diff --git a/lib/Target/NVPTX/NVPTXRegisterInfo.td b/lib/Target/NVPTX/NVPTXRegisterInfo.td
index fd255bdb6d26214db8a3b4e6d7686f4afc08ff36..f04764a9e9a39a36356623e881b859ac7edf541c 100644
--- a/lib/Target/NVPTX/NVPTXRegisterInfo.td
+++ b/lib/Target/NVPTX/NVPTXRegisterInfo.td
@@ -37,6 +37,7 @@ foreach i = 0-4 in {
   def R#i  : NVPTXReg<"%r"#i>;  // 32-bit
   def RL#i : NVPTXReg<"%rd"#i>; // 64-bit
   def H#i  : NVPTXReg<"%h"#i>;  // 16-bit float
+  def HH#i : NVPTXReg<"%hh"#i>; // 2x16-bit float
   def F#i  : NVPTXReg<"%f"#i>;  // 32-bit float
   def FL#i : NVPTXReg<"%fd"#i>; // 64-bit float
 
@@ -59,6 +60,7 @@ def Int16Regs : NVPTXRegClass<[i16], 16, (add (sequence "RS%u", 0, 4))>;
 def Int32Regs : NVPTXRegClass<[i32], 32, (add (sequence "R%u", 0, 4))>;
 def Int64Regs : NVPTXRegClass<[i64], 64, (add (sequence "RL%u", 0, 4))>;
 def Float16Regs : NVPTXRegClass<[f16], 16, (add (sequence "H%u", 0, 4))>;
+def Float16x2Regs : NVPTXRegClass<[v2f16], 32, (add (sequence "HH%u", 0, 4))>;
 def Float32Regs : NVPTXRegClass<[f32], 32, (add (sequence "F%u", 0, 4))>;
 def Float64Regs : NVPTXRegClass<[f64], 64, (add (sequence "FL%u", 0, 4))>;
 def Int32ArgRegs : NVPTXRegClass<[i32], 32, (add (sequence "ia%u", 0, 4))>;
diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 5847b3a52bfc5a9e74bf42427a527eab34cb1ab0..4863ac54273666555e2ccc13a90dfc192a1efffa 100644
--- a/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -114,7 +114,7 @@ public:
   }
 
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value, bool IsPCRel) const override {
+                  uint64_t Value, bool IsPCRel, MCContext &Ctx) const override {
     Value = adjustFixupValue(Fixup.getKind(), Value);
     if (!Value) return;           // Doesn't change encoding.
 
diff --git a/lib/Target/PowerPC/PPCAsmPrinter.cpp b/lib/Target/PowerPC/PPCAsmPrinter.cpp
index f0e0ebc4946c2707dfc3347a34e3eddaee5be211..1f181d007f637aadb18c9e6c73e0aec5d799e2ff 100644
--- a/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -112,7 +112,9 @@ public:
     void EmitTlsCall(const MachineInstr *MI, MCSymbolRefExpr::VariantKind VK);
     bool runOnMachineFunction(MachineFunction &MF) override {
       Subtarget = &MF.getSubtarget<PPCSubtarget>();
-      return AsmPrinter::runOnMachineFunction(MF);
+      bool Changed = AsmPrinter::runOnMachineFunction(MF);
+      emitXRayTable();
+      return Changed;
     }
   };
 
@@ -134,6 +136,7 @@ public:
 
     void EmitFunctionBodyStart() override;
     void EmitFunctionBodyEnd() override;
+    void EmitInstruction(const MachineInstr *MI) override;
   };
 
   /// PPCDarwinAsmPrinter - PowerPC assembly printer, customized for Darwin/Mac
@@ -402,7 +405,7 @@ void PPCAsmPrinter::LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI) {
                                       .addImm(CallTarget & 0xFFFF));
 
       // Save the current TOC pointer before the remote call.
-      int TOCSaveOffset = Subtarget->isELFv2ABI() ? 24 : 40;
+      int TOCSaveOffset = Subtarget->getFrameLowering()->getTOCSaveOffset();
       EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::STD)
                                       .addReg(PPC::X2)
                                       .addImm(TOCSaveOffset)
@@ -1046,6 +1049,97 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
   EmitToStreamer(*OutStreamer, TmpInst);
 }
 
+/// Emit \p MI, expanding the XRay PATCHABLE_FUNCTION_ENTER/EXIT pseudo-ops
+/// into sleds on PPC64. Other instructions are handled by PPCAsmPrinter.
+void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+  if (!Subtarget->isPPC64())
+    return PPCAsmPrinter::EmitInstruction(MI);
+
+  switch (MI->getOpcode()) {
+  default:
+    return PPCAsmPrinter::EmitInstruction(MI);
+  case TargetOpcode::PATCHABLE_FUNCTION_ENTER: {
+    // .begin:
+    //   b .end # lis 0, FuncId[16..32]
+    //   nop    # li  0, FuncId[0..15]
+    //   std 0, -8(1)
+    //   mflr 0
+    //   bl __xray_FunctionEntry
+    //   mtlr 0
+    // .end:
+    //
+    // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when the number
+    // of instructions changes.
+    MCSymbol *BeginOfSled = OutContext.createTempSymbol();
+    MCSymbol *EndOfSled = OutContext.createTempSymbol();
+    OutStreamer->EmitLabel(BeginOfSled);
+    EmitToStreamer(*OutStreamer,
+                   MCInstBuilder(PPC::B).addExpr(
+                       MCSymbolRefExpr::create(EndOfSled, OutContext)));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::STD)
+                       .addReg(PPC::X0).addImm(-8).addReg(PPC::X1));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR8).addReg(PPC::X0));
+    EmitToStreamer(*OutStreamer,
+                   MCInstBuilder(PPC::BL8_NOP)
+                       .addExpr(MCSymbolRefExpr::create(
+                           OutContext.getOrCreateSymbol("__xray_FunctionEntry"),
+                           OutContext)));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTLR8).addReg(PPC::X0));
+    OutStreamer->EmitLabel(EndOfSled);
+    recordSled(BeginOfSled, *MI, SledKind::FUNCTION_ENTER);
+    break;
+  }
+  case TargetOpcode::PATCHABLE_FUNCTION_EXIT: {
+    // .p2align 3
+    // .begin:
+    //   b(lr)? # lis 0, FuncId[16..32]
+    //   nop    # li  0, FuncId[0..15]
+    //   std 0, -8(1)
+    //   mflr 0
+    //   bl __xray_FunctionExit
+    //   mtlr 0
+    // .end:
+    //   b(lr)?
+    //
+    // Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when the number
+    // of instructions changes.
+    const MachineInstr *Next = [&] {
+      MachineBasicBlock::const_iterator It(MI);
+      assert(It != MI->getParent()->end());
+      ++It;
+      assert(It->isReturn());
+      return &*It;
+    }();
+    OutStreamer->EmitCodeAlignment(8);
+    MCSymbol *BeginOfSled = OutContext.createTempSymbol();
+    OutStreamer->EmitLabel(BeginOfSled);
+    MCInst TmpInst;
+    LowerPPCMachineInstrToMCInst(Next, TmpInst, *this, false);
+    EmitToStreamer(*OutStreamer, TmpInst);
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::STD)
+                       .addReg(PPC::X0).addImm(-8).addReg(PPC::X1));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR8).addReg(PPC::X0));
+    EmitToStreamer(*OutStreamer,
+                   MCInstBuilder(PPC::BL8_NOP)
+                       .addExpr(MCSymbolRefExpr::create(
+                           OutContext.getOrCreateSymbol("__xray_FunctionExit"),
+                           OutContext)));
+    EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTLR8).addReg(PPC::X0));
+    recordSled(BeginOfSled, *MI, SledKind::FUNCTION_EXIT);
+    break;
+  }
+  case TargetOpcode::PATCHABLE_TAIL_CALL:
+  case TargetOpcode::PATCHABLE_RET:
+    // PPC's tail call instruction, e.g. PPC::TCRETURNdi8, doesn't really
+    // lower to a PPC::B instruction. The PPC::B instruction is generated
+    // before it, and handled by the normal case.
+    llvm_unreachable("Tail call is handled in the normal case. See comments "
+                     "around this assert.");
+  }
+}
+
 void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) {
   if (static_cast<const PPCTargetMachine &>(TM).isELFv2ABI()) {
     PPCTargetStreamer *TS =
diff --git a/lib/Target/PowerPC/PPCCTRLoops.cpp b/lib/Target/PowerPC/PPCCTRLoops.cpp
index 197be8b7db930da4c7952d6766150a5e77d980e9..70c4170653aef0e28bc554dbbfb4504eb9a741f3 100644
--- a/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -298,15 +298,17 @@ bool PPCCTRLoops::mightUseCTR(const Triple &TT, BasicBlock *BB) {
               return true;
             else
               continue; // ISD::FCOPYSIGN is never a library call.
-          case Intrinsic::sqrt:      Opcode = ISD::FSQRT;      break;
-          case Intrinsic::floor:     Opcode = ISD::FFLOOR;     break;
-          case Intrinsic::ceil:      Opcode = ISD::FCEIL;      break;
-          case Intrinsic::trunc:     Opcode = ISD::FTRUNC;     break;
-          case Intrinsic::rint:      Opcode = ISD::FRINT;      break;
-          case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
-          case Intrinsic::round:     Opcode = ISD::FROUND;     break;
-          case Intrinsic::minnum:    Opcode = ISD::FMINNUM;    break;
-          case Intrinsic::maxnum:    Opcode = ISD::FMAXNUM;    break;
+          case Intrinsic::sqrt:               Opcode = ISD::FSQRT;      break;
+          case Intrinsic::floor:              Opcode = ISD::FFLOOR;     break;
+          case Intrinsic::ceil:               Opcode = ISD::FCEIL;      break;
+          case Intrinsic::trunc:              Opcode = ISD::FTRUNC;     break;
+          case Intrinsic::rint:               Opcode = ISD::FRINT;      break;
+          case Intrinsic::nearbyint:          Opcode = ISD::FNEARBYINT; break;
+          case Intrinsic::round:              Opcode = ISD::FROUND;     break;
+          case Intrinsic::minnum:             Opcode = ISD::FMINNUM;    break;
+          case Intrinsic::maxnum:             Opcode = ISD::FMAXNUM;    break;
+          case Intrinsic::umul_with_overflow: Opcode = ISD::UMULO;      break;
+          case Intrinsic::smul_with_overflow: Opcode = ISD::SMULO;      break;
           }
         }
 
diff --git a/lib/Target/PowerPC/PPCExpandISEL.cpp b/lib/Target/PowerPC/PPCExpandISEL.cpp
index accb84a8094c3d2bcb7736e1966dcbcf99b87989..ebd414baf1d2160de00485aeea0480f1c5c5f384 100644
--- a/lib/Target/PowerPC/PPCExpandISEL.cpp
+++ b/lib/Target/PowerPC/PPCExpandISEL.cpp
@@ -42,6 +42,7 @@ static cl::opt<bool>
                  cl::desc("Enable generating the ISEL instruction."),
                  cl::init(true), cl::Hidden);
 
+namespace {
 class PPCExpandISEL : public MachineFunctionPass {
   DebugLoc dl;
   MachineFunction *MF;
@@ -143,6 +144,7 @@ public:
     return true;
   }
 };
+} // end anonymous namespace
 
 void PPCExpandISEL::initialize(MachineFunction &MFParam) {
   MF = &MFParam;
diff --git a/lib/Target/PowerPC/PPCFrameLowering.cpp b/lib/Target/PowerPC/PPCFrameLowering.cpp
index e786ef9aee0e0f24ef3743ad04995f2c3e38b700..4c9430a2eca07cf571ebd539e9561108e9e054c5 100644
--- a/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -433,8 +433,7 @@ unsigned PPCFrameLowering::determineFrameLayout(MachineFunction &MF,
   unsigned MaxAlign = MFI.getMaxAlignment(); // algmt required by data in frame
   unsigned AlignMask = std::max(MaxAlign, TargetAlign) - 1;
 
-  const PPCRegisterInfo *RegInfo =
-      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+  const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
 
   // If we are a leaf function, and use up to 224 bytes of stack space,
   // don't have a frame pointer, calls, or dynamic alloca then we do not need
@@ -519,8 +518,7 @@ void PPCFrameLowering::replaceFPWithRealFP(MachineFunction &MF) const {
   unsigned FPReg  = is31 ? PPC::R31 : PPC::R1;
   unsigned FP8Reg = is31 ? PPC::X31 : PPC::X1;
 
-  const PPCRegisterInfo *RegInfo =
-      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+  const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   bool HasBP = RegInfo->hasBasePointer(MF);
   unsigned BPReg  = HasBP ? (unsigned) RegInfo->getBaseRegister(MF) : FPReg;
   unsigned BP8Reg = HasBP ? (unsigned) PPC::X30 : FPReg;
@@ -616,8 +614,7 @@ PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
     return true;
 
   // Get the list of callee-saved registers for the target.
-  const PPCRegisterInfo *RegInfo =
-      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+  const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(MBB->getParent());
 
   // Get all the available registers in the block.
@@ -663,8 +660,7 @@ PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
 // and the stack frame is large, we need two scratch registers.
 bool
 PPCFrameLowering::twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const {
-  const PPCRegisterInfo *RegInfo =
-      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+  const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   MachineFunction &MF = *(MBB->getParent());
   bool HasBP = RegInfo->hasBasePointer(MF);
   unsigned FrameSize = determineFrameLayout(MF, false);
@@ -694,10 +690,8 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
                                     MachineBasicBlock &MBB) const {
   MachineBasicBlock::iterator MBBI = MBB.begin();
   MachineFrameInfo &MFI = MF.getFrameInfo();
-  const PPCInstrInfo &TII =
-      *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
-  const PPCRegisterInfo *RegInfo =
-      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+  const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
+  const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
 
   MachineModuleInfo &MMI = MF.getMMI();
   const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
@@ -1221,10 +1215,8 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
   if (MBBI != MBB.end())
     dl = MBBI->getDebugLoc();
   
-  const PPCInstrInfo &TII =
-      *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
-  const PPCRegisterInfo *RegInfo =
-      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+  const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
+  const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
 
   // Get alignment info so we know how to restore the SP.
   const MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -1550,8 +1542,7 @@ void PPCFrameLowering::createTailCallBranchInstr(MachineBasicBlock &MBB) const {
   if (MBBI != MBB.end())
     dl = MBBI->getDebugLoc();
 
-  const PPCInstrInfo &TII =
-      *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+  const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
 
   // Create branch instruction for pseudo tail call return instruction
   unsigned RetOpcode = MBBI->getOpcode();
@@ -1589,8 +1580,7 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF,
                                             RegScavenger *RS) const {
   TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
 
-  const PPCRegisterInfo *RegInfo =
-      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+  const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
 
   //  Save and clear the LR state.
   PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
@@ -1793,8 +1783,7 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
     MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
   }
 
-  const PPCRegisterInfo *RegInfo =
-      static_cast<const PPCRegisterInfo *>(Subtarget.getRegisterInfo());
+  const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
   if (RegInfo->hasBasePointer(MF)) {
     HasGPSaveArea = true;
 
@@ -1941,8 +1930,7 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
     return false;
 
   MachineFunction *MF = MBB.getParent();
-  const PPCInstrInfo &TII =
-      *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+  const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
   DebugLoc DL;
   bool CRSpilled = false;
   MachineInstrBuilder CRMIB;
@@ -2083,8 +2071,7 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
     return false;
 
   MachineFunction *MF = MBB.getParent();
-  const PPCInstrInfo &TII =
-      *static_cast<const PPCInstrInfo *>(Subtarget.getInstrInfo());
+  const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
   bool CR2Spilled = false;
   bool CR3Spilled = false;
   bool CR4Spilled = false;
diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index bef89c7b411365a71a70458c210317dd156bc788..9c72638023bb30115de6f9d30c205e1de2f6140f 100644
--- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -2714,6 +2714,19 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
       CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops);
       return;
     }
+    // If this is a negated 64-bit zero-extension mask,
+    // i.e. the immediate is a sequence of ones from most significant side
+    // and all zeros for the remainder, we should use rldicr.
+    if (isInt64Immediate(N->getOperand(1).getNode(), Imm64) &&
+        isMask_64(~Imm64)) {
+      SDValue Val = N->getOperand(0);
+      MB = 63 - countTrailingOnes(~Imm64);
+      SH = 0;
+      SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) };
+      CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, Ops);
+      return;
+    }
+
     // AND X, 0 -> 0, not "rlwinm 32".
     if (isInt32Immediate(N->getOperand(1), Imm) && (Imm == 0)) {
       ReplaceUses(SDValue(N, 0), N->getOperand(1));
diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp
index 53e7dd1cb0face822d9579bf97c1a51c780abee9..f7663d8e5185c3eeeb838c4eb68bd46b5404f55e 100644
--- a/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -2647,10 +2647,9 @@ SDValue PPCTargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
 
   // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg)
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(dl).setChain(Chain)
-    .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
-               DAG.getExternalSymbol("__trampoline_setup", PtrVT),
-               std::move(Args));
+  CLI.setDebugLoc(dl).setChain(Chain).setLibCallee(
+      CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+      DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args));
 
   std::pair<SDValue, SDValue> CallResult = LowerCallTo(CLI);
   return CallResult.second;
@@ -2782,7 +2781,7 @@ bool llvm::CC_PPC32_SVR4_Custom_AlignArgRegs(unsigned &ValNo, MVT &ValVT,
   return false;
 }
 
-bool 
+bool
 llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT,
                                                   MVT &LocVT,
                                                   CCValAssign::LocInfo &LocInfo,
@@ -2797,7 +2796,7 @@ llvm::CC_PPC32_SVR4_Custom_SkipLastArgRegsPPCF128(unsigned &ValNo, MVT &ValVT,
   unsigned RegNum = State.getFirstUnallocated(ArgRegs);
   int RegsLeft = NumArgRegs - RegNum;
 
-  // Skip if there is not enough registers left for long double type (4 gpr regs 
+  // Skip if there is not enough registers left for long double type (4 gpr regs
   // in soft float mode) and put long double argument on the stack.
   if (RegNum != NumArgRegs && RegsLeft < 4) {
     for (int i = 0; i < RegsLeft; i++) {
@@ -4111,7 +4110,7 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget,
 
 static bool
 hasSameArgumentList(const Function *CallerFn, ImmutableCallSite *CS) {
-  if (CS->arg_size() != CallerFn->getArgumentList().size())
+  if (CS->arg_size() != CallerFn->arg_size())
     return false;
 
   ImmutableCallSite::arg_iterator CalleeArgIter = CS->arg_begin();
@@ -5147,10 +5146,30 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
   };
 
   const unsigned NumGPRs = array_lengthof(GPR);
-  const unsigned NumFPRs = 13;
+  const unsigned NumFPRs = useSoftFloat() ? 0 : 13;
   const unsigned NumVRs  = array_lengthof(VR);
   const unsigned NumQFPRs = NumFPRs;
 
+  // On ELFv2, we can avoid allocating the parameter area if all the arguments
+  // can be passed to the callee in registers.
+  // For the fast calling convention, there is another check below.
+  // Note: This should be kept consistent with LowerFormalArguments_64SVR4()
+  bool HasParameterArea = !isELFv2ABI || isVarArg || CallConv == CallingConv::Fast;
+  if (!HasParameterArea) {
+    unsigned ParamAreaSize = NumGPRs * PtrByteSize;
+    unsigned AvailableFPRs = NumFPRs;
+    unsigned AvailableVRs = NumVRs;
+    unsigned NumBytesTmp = NumBytes;
+    for (unsigned i = 0; i != NumOps; ++i) {
+      if (Outs[i].Flags.isNest()) continue;
+      if (CalculateStackSlotUsed(Outs[i].VT, Outs[i].ArgVT, Outs[i].Flags,
+                                PtrByteSize, LinkageSize, ParamAreaSize,
+                                NumBytesTmp, AvailableFPRs, AvailableVRs,
+                                Subtarget.hasQPX()))
+        HasParameterArea = true;
+    }
+  }
+
   // When using the fast calling convention, we don't provide backing for
   // arguments that will be in registers.
   unsigned NumGPRsUsed = 0, NumFPRsUsed = 0, NumVRsUsed = 0;
@@ -5218,13 +5237,18 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
 
   unsigned NumBytesActuallyUsed = NumBytes;
 
-  // The prolog code of the callee may store up to 8 GPR argument registers to
+  // In the old ELFv1 ABI,
+  // the prolog code of the callee may store up to 8 GPR argument registers to
   // the stack, allowing va_start to index over them in memory if its varargs.
   // Because we cannot tell if this is needed on the caller side, we have to
   // conservatively assume that it is needed.  As such, make sure we have at
   // least enough stack space for the caller to store the 8 GPRs.
-  // FIXME: On ELFv2, it may be unnecessary to allocate the parameter area.
-  NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+  // In the ELFv2 ABI, we allocate the parameter area iff a callee
+  // really requires memory operands, e.g. a vararg function.
+  if (HasParameterArea)
+    NumBytes = std::max(NumBytes, LinkageSize + 8 * PtrByteSize);
+  else
+    NumBytes = LinkageSize;
 
   // Tail call needs the stack to be aligned.
   if (getTargetMachine().Options.GuaranteedTailCallOpt &&
@@ -5443,6 +5467,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
         if (CallConv == CallingConv::Fast)
           ComputePtrOff();
 
+        assert(HasParameterArea &&
+               "Parameter area must exist to pass an argument in memory.");
         LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                          true, isTailCall, false, MemOpChains,
                          TailCallArguments, dl);
@@ -5528,6 +5554,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
           PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, PtrOff, ConstFour);
         }
 
+        assert(HasParameterArea &&
+               "Parameter area must exist to pass an argument in memory.");
         LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                          true, isTailCall, false, MemOpChains,
                          TailCallArguments, dl);
@@ -5562,6 +5590,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
       // GPRs when within range.  For now, we always put the value in both
       // locations (or even all three).
       if (isVarArg) {
+        assert(HasParameterArea &&
+               "Parameter area must exist if we have a varargs call.");
         // We could elide this store in the case where the object fits
         // entirely in R registers.  Maybe later.
         SDValue Store =
@@ -5594,6 +5624,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
         if (CallConv == CallingConv::Fast)
           ComputePtrOff();
 
+        assert(HasParameterArea &&
+               "Parameter area must exist to pass an argument in memory.");
         LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                          true, isTailCall, true, MemOpChains,
                          TailCallArguments, dl);
@@ -5614,6 +5646,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
     case MVT::v4i1: {
       bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32;
       if (isVarArg) {
+        assert(HasParameterArea &&
+               "Parameter area must exist if we have a varargs call.");
         // We could elide this store in the case where the object fits
         // entirely in R registers.  Maybe later.
         SDValue Store =
@@ -5646,6 +5680,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
         if (CallConv == CallingConv::Fast)
           ComputePtrOff();
 
+        assert(HasParameterArea &&
+               "Parameter area must exist to pass an argument in memory.");
         LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
                          true, isTailCall, true, MemOpChains,
                          TailCallArguments, dl);
@@ -5660,7 +5696,8 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
     }
   }
 
-  assert(NumBytesActuallyUsed == ArgOffset);
+  assert((!HasParameterArea || NumBytesActuallyUsed == ArgOffset) &&
+         "mismatch in size of parameter area");
   (void)NumBytesActuallyUsed;
 
   if (!MemOpChains.empty())
@@ -11353,9 +11390,20 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
       if (BSwapOp.getValueType() == MVT::i16)
         BSwapOp = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, BSwapOp);
 
+      // If the type of BSWAP operand is wider than stored memory width
+      // it needs to be shifted to the right side before STBRX.
+      EVT mVT = cast<StoreSDNode>(N)->getMemoryVT();
+      if (Op1VT.bitsGT(mVT)) {
+        int Shift = Op1VT.getSizeInBits() - mVT.getSizeInBits();
+        BSwapOp = DAG.getNode(ISD::SRL, dl, Op1VT, BSwapOp,
+                              DAG.getConstant(Shift, dl, MVT::i32));
+        // Need to truncate if this is a bswap of i64 stored as i32/i16.
+        if (Op1VT == MVT::i64)
+          BSwapOp = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, BSwapOp);
+      }
+
       SDValue Ops[] = {
-        N->getOperand(0), BSwapOp, N->getOperand(2),
-        DAG.getValueType(N->getOperand(1).getValueType())
+        N->getOperand(0), BSwapOp, N->getOperand(2), DAG.getValueType(mVT)
       };
       return
         DAG.getMemIntrinsicNode(PPCISD::STBRX, dl, DAG.getVTList(MVT::Other),
@@ -11969,6 +12017,7 @@ PPCTargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
 void PPCTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                       APInt &KnownZero,
                                                       APInt &KnownOne,
+                                                      const APInt &DemandedElts,
                                                       const SelectionDAG &DAG,
                                                       unsigned Depth) const {
   KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h
index d12cd4ab1f38e87b5b932aac347ac08779bfe114..6113eb58f421150d23e60f39dde3c0999a90b970 100644
--- a/lib/Target/PowerPC/PPCISelLowering.h
+++ b/lib/Target/PowerPC/PPCISelLowering.h
@@ -531,6 +531,10 @@ namespace llvm {
       return true;
     }
 
+    bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
+      return VT.isScalarInteger();
+    }
+
     bool supportSplitCSR(MachineFunction *MF) const override {
       return
         MF->getFunction()->getCallingConv() == CallingConv::CXX_FAST_TLS &&
@@ -604,6 +608,7 @@ namespace llvm {
     void computeKnownBitsForTargetNode(const SDValue Op,
                                        APInt &KnownZero,
                                        APInt &KnownOne,
+                                       const APInt &DemandedElts,
                                        const SelectionDAG &DAG,
                                        unsigned Depth = 0) const override;
 
@@ -711,6 +716,10 @@ namespace llvm {
     bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                            Type *Ty) const override;
 
+    bool convertSelectOfConstantsToMath() const override {
+      return true;
+    }
+
     bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
 
     bool getTgtMemIntrinsic(IntrinsicInfo &Info,
diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td
index 0f12aa1aaa875b29405ea207f9492eb06d21855c..997b96ca6ec8b15adfee74486e61106b065646ce 100644
--- a/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -1232,6 +1232,10 @@ def : Pat<(srl i64:$rS, i32:$rB),
 def : Pat<(shl i64:$rS, i32:$rB),
           (SLD $rS, $rB)>;
 
+// SUBFIC
+def : Pat<(sub imm64SExt16:$imm, i64:$in),
+          (SUBFIC8 $in, imm:$imm)>;
+
 // SHL/SRL
 def : Pat<(shl i64:$in, (i32 imm:$imm)),
           (RLDICR $in, imm:$imm, (SHL64 imm:$imm))>;
diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td
index 9bff61e3460e1d46a6dc0c0b8cb42f1783a76537..c380766e9f5c44f9f86c0ad097e335f0d5412a03 100644
--- a/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -851,6 +851,10 @@ def V_SETALLONES  : VXForm_3<908, (outs vrrc:$vD), (ins),
 // Additional Altivec Patterns
 //
 
+// Extended mnemonics
+def : InstAlias<"vmr $vD, $vA", (VOR vrrc:$vD, vrrc:$vA, vrrc:$vA)>;
+def : InstAlias<"vnot $vD, $vA", (VNOR vrrc:$vD, vrrc:$vA, vrrc:$vA)>;
+
 // Loads.
 def : Pat<(v4i32 (load xoaddr:$src)), (LVX xoaddr:$src)>;
 
diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp
index 315911db4e30de40a3867cefa326a9840f9cf81c..8e159f47ea2eedc4c6e39ac04feb60880060097a 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -65,7 +65,9 @@ UseOldLatencyCalc("ppc-old-latency-calc", cl::Hidden,
 void PPCInstrInfo::anchor() {}
 
 PPCInstrInfo::PPCInstrInfo(PPCSubtarget &STI)
-    : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP),
+    : PPCGenInstrInfo(PPC::ADJCALLSTACKDOWN, PPC::ADJCALLSTACKUP,
+                      /* CatchRetOpcode */ -1,
+                      STI.isPPC64() ? PPC::BLR8 : PPC::BLR),
       Subtarget(STI), RI(STI.getTargetMachine()) {}
 
 /// CreateTargetHazardRecognizer - Return the hazard recognizer to use for
@@ -1491,7 +1493,7 @@ bool PPCInstrInfo::DefinesPredicate(MachineInstr &MI,
   return Found;
 }
 
-bool PPCInstrInfo::isPredicable(MachineInstr &MI) const {
+bool PPCInstrInfo::isPredicable(const MachineInstr &MI) const {
   unsigned OpC = MI.getOpcode();
   switch (OpC) {
   default:
@@ -1834,8 +1836,7 @@ unsigned PPCInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
     PatchPointOpers Opers(&MI);
     return Opers.getNumPatchBytes();
   } else {
-    const MCInstrDesc &Desc = get(Opcode);
-    return Desc.getSize();
+    return get(Opcode).getSize();
   }
 }
 
diff --git a/lib/Target/PowerPC/PPCInstrInfo.h b/lib/Target/PowerPC/PPCInstrInfo.h
index 32b2f009a3f5d7ca5b16d966441174e1a60e52c3..f11aed8fa268f25edbcd09cd74822edc12f0e952 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/lib/Target/PowerPC/PPCInstrInfo.h
@@ -253,7 +253,7 @@ public:
   bool DefinesPredicate(MachineInstr &MI,
                         std::vector<MachineOperand> &Pred) const override;
 
-  bool isPredicable(MachineInstr &MI) const override;
+  bool isPredicable(const MachineInstr &MI) const override;
 
   // Comparison optimization.
 
diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td
index d34ee2e5099481117d1bab0aa28faa19b7caa517..f004ce49cac0dc6be403543d00fed6be3a394338 100644
--- a/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/lib/Target/PowerPC/PPCInstrInfo.td
@@ -770,9 +770,10 @@ def spe2dis : Operand<iPTR> {   // SPE displacement where the imm is 2-aligned.
 }
 
 // A single-register address. This is used with the SjLj
-// pseudo-instructions.
+// pseudo-instructions which translate to LD/LWZ.  These instructions require
+// G8RC_NOX0 registers.
 def memr : Operand<iPTR> {
-  let MIOperandInfo = (ops ptr_rc:$ptrreg);
+  let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg);
 }
 def PPCTLSRegOperand : AsmOperandClass {
   let Name = "TLSReg"; let PredicateMethod = "isTLSReg";
diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td
index 2c5fa200272f69f7345180bc5dc7e0f47b0987d6..13603732397ad3d29616215dedd476edbc85d984 100644
--- a/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/lib/Target/PowerPC/PPCInstrVSX.td
@@ -1410,6 +1410,11 @@ let Predicates = [HasDirectMove] in {
                               "mfvsrd $rA, $XT", IIC_VecGeneral,
                               [(set i64:$rA, (PPCmfvsr f64:$XT))]>,
       Requires<[In64BitMode]>;
+  let isCodeGenOnly = 1 in
+  def MFVRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vrrc:$XT),
+                             "mfvsrd $rA, $XT", IIC_VecGeneral,
+                             []>,
+      Requires<[In64BitMode]>;
   def MFVSRWZ : XX1_RS6_RD5_XO<31, 115, (outs gprc:$rA), (ins vsfrc:$XT),
                                "mfvsrwz $rA, $XT", IIC_VecGeneral,
                                [(set i32:$rA, (PPCmfvsr f64:$XT))]>;
@@ -1440,6 +1445,13 @@ let Predicates = [IsISA3_0, HasDirectMove] in {
 } // IsISA3_0, HasDirectMove
 } // UseVSXReg = 1
 
+// We want to parse this from asm, but we don't want to emit this as it would
+// be emitted with a VSX reg. So leave Emit = 0 here.
+def : InstAlias<"mfvrd $rA, $XT",
+                (MFVRD g8rc:$rA, vrrc:$XT), 0>;
+def : InstAlias<"mffprd $rA, $src",
+                (MFVSRD g8rc:$rA, f8rc:$src)>;
+
 /*  Direct moves of various widths from GPR's into VSR's. Each move lines
     the value up into element 0 (both BE and LE). Namely, entities smaller than
     a doubleword are shifted left and moved for BE. For LE, they're moved, then
@@ -2186,7 +2198,7 @@ let AddedComplexity = 400, Predicates = [HasP9Vector] in {
   } // UseVSXReg = 1
 
   // Pattern for matching Vector HP -> Vector SP intrinsic. Defined as a
-  // seperate pattern so that it can convert the input register class from
+  // separate pattern so that it can convert the input register class from
   // VRRC(v8i16) to VSRC.
   def : Pat<(v4f32 (int_ppc_vsx_xvcvhpsp v8i16:$A)),
             (v4f32 (XVCVHPSP (COPY_TO_REGCLASS $A, VSRC)))>;
diff --git a/lib/Target/PowerPC/PPCScheduleP8.td b/lib/Target/PowerPC/PPCScheduleP8.td
index 8e52da583a0d4a569a45868e297766cf581e3949..79963dd6a3e9da8254849c858f921146e23238fa 100644
--- a/lib/Target/PowerPC/PPCScheduleP8.td
+++ b/lib/Target/PowerPC/PPCScheduleP8.td
@@ -377,7 +377,7 @@ def P8Itineraries : ProcessorItineraries<
                                    InstrStage<1, [P8_FPU1, P8_FPU2]>],
                                   [7, 1, 1]>,
   InstrItinData<IIC_VecPerm     , [InstrStage<1, [P8_DU1, P8_DU2], 0>,
-                                   InstrStage<1, [P8_FPU2, P8_FPU2]>],
+                                   InstrStage<1, [P8_FPU1, P8_FPU2]>],
                                   [3, 1, 1]>
 ]>;
 
diff --git a/lib/Target/PowerPC/PPCSubtarget.h b/lib/Target/PowerPC/PPCSubtarget.h
index 48d6365d3e742b617265d2c4d66a1c4f0993ad79..5a97f595ad8cf6bf111b531ce5c08634d057ac79 100644
--- a/lib/Target/PowerPC/PPCSubtarget.h
+++ b/lib/Target/PowerPC/PPCSubtarget.h
@@ -318,6 +318,8 @@ public:
   /// classifyGlobalReference - Classify a global variable reference for the
   /// current subtarget accourding to how we should reference it.
   unsigned char classifyGlobalReference(const GlobalValue *GV) const;
+
+  bool isXRaySupported() const override { return IsPPC64 && IsLittleEndian; }
 };
 } // End llvm namespace
 
diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp
index 5d68f32ccc5e1afb0de2e92c7ed6853fedb9adef..7806d45b54575711cdbbf4e074b6f97eccdfd7b8 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -218,9 +218,7 @@ PPCTargetMachine::PPCTargetMachine(const Target &T, const Triple &TT,
                         computeFSAdditions(FS, OL, TT), Options,
                         getEffectiveRelocModel(TT, RM), CM, OL),
       TLOF(createTLOF(getTargetTriple())),
-      TargetABI(computeTargetABI(TT, Options)),
-      Subtarget(TargetTriple, CPU, computeFSAdditions(FS, OL, TT), *this) {
-
+      TargetABI(computeTargetABI(TT, Options)) {
   initAsmInfo();
 }
 
diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h
index 59b4f1e30c0e5af5695c451166a2eb6d7ccf3c31..f2838351cee56b001c698a7f50e9f7f755385de7 100644
--- a/lib/Target/PowerPC/PPCTargetMachine.h
+++ b/lib/Target/PowerPC/PPCTargetMachine.h
@@ -29,7 +29,6 @@ public:
 private:
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
   PPCABI TargetABI;
-  PPCSubtarget Subtarget;
 
   mutable StringMap<std::unique_ptr<PPCSubtarget>> SubtargetMap;
 
diff --git a/lib/Target/PowerPC/PPCTargetStreamer.h b/lib/Target/PowerPC/PPCTargetStreamer.h
index dbe7617d3542dc5b7809d8c91bb6778af374067a..310fea9ef09ff637dc3feb1937aaea32c29982d6 100644
--- a/lib/Target/PowerPC/PPCTargetStreamer.h
+++ b/lib/Target/PowerPC/PPCTargetStreamer.h
@@ -1,4 +1,4 @@
-//===-- PPCTargetStreamer.h - PPC Target Streamer --s-----------*- C++ -*--===//
+//===- PPCTargetStreamer.h - PPC Target Streamer ----------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,18 +10,26 @@
 #ifndef LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H
 #define LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H
 
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCStreamer.h"
 
 namespace llvm {
+
+class MCExpr;
+class MCSymbol;
+class MCSymbolELF;
+
 class PPCTargetStreamer : public MCTargetStreamer {
 public:
   PPCTargetStreamer(MCStreamer &S);
   ~PPCTargetStreamer() override;
+
   virtual void emitTCEntry(const MCSymbol &S) = 0;
   virtual void emitMachine(StringRef CPU) = 0;
   virtual void emitAbiVersion(int AbiVersion) = 0;
   virtual void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) = 0;
 };
-}
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_POWERPC_PPCTARGETSTREAMER_H
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index f94d1eab097db82c6fda53870e7a5856ec834fbd..7ee1317bf72f2d3b6a4214f05fdd0e0e3e34bb1a 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -302,14 +302,16 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
   return LT.first;
 }
 
-int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                                 const Instruction *I) {
   assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
 
   return BaseT::getCastInstrCost(Opcode, Dst, Src);
 }
 
-int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
-  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                                   const Instruction *I) {
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
 }
 
 int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
@@ -352,7 +354,7 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
 }
 
 int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                                unsigned AddressSpace) {
+                                unsigned AddressSpace, const Instruction *I) {
   // Legalize the type.
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
   assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
@@ -401,6 +403,10 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
   if (IsVSXType || (ST->hasVSX() && IsAltivecType))
     return Cost;
 
+  // Newer PPC supports unaligned memory access.
+  if (TLI->allowsMisalignedMemoryAccesses(LT.second, 0))
+    return Cost;
+
   // PPC in general does not support unaligned loads and stores. They'll need
   // to be decomposed based on the alignment factor.
 
diff --git a/lib/Target/PowerPC/PPCTargetTransformInfo.h b/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 30ee2814aba1aff05c140aa79c89a83ace592b66..6ce70fbd8778e29c420f5b8d3c5b6238798440ea 100644
--- a/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -74,11 +74,13 @@ public:
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
       ArrayRef<const Value *> Args = ArrayRef<const Value *>());
   int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
-  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
-  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                       const Instruction *I = nullptr);
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                         const Instruction *I = nullptr);
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
   int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                      unsigned AddressSpace);
+                      unsigned AddressSpace, const Instruction *I = nullptr);
   int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
                                  unsigned Factor,
                                  ArrayRef<unsigned> Indices,
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index f8ef142255c860358f6bd60b01d9db2051050498..d6f2672271e9b6e33b1c8a65a5db92fb139ac508 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -33,7 +33,7 @@ public:
   ~RISCVAsmBackend() override {}
 
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value, bool IsPCRel) const override;
+                  uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
 
   MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
 
@@ -71,7 +71,7 @@ bool RISCVAsmBackend::writeNopData(uint64_t Count, MCObjectWriter *OW) const {
 
 void RISCVAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
                                  unsigned DataSize, uint64_t Value,
-                                 bool IsPCRel) const {
+                                 bool IsPCRel, MCContext &Ctx) const {
   return;
 }
 
diff --git a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index 4fc69a7fcabad1b238fa48e1bf6ce1a1b30b7b9c..41be0a2084b37369309a86a0330166c2f031bf1d 100644
--- a/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -44,13 +44,12 @@ static MCRegisterInfo *createRISCVMCRegisterInfo(const Triple &TT) {
 
 static MCAsmInfo *createRISCVMCAsmInfo(const MCRegisterInfo &MRI,
                                        const Triple &TT) {
-  MCAsmInfo *MAI = new RISCVMCAsmInfo(TT);
-  return MAI;
+  return new RISCVMCAsmInfo(TT);
 }
 
 extern "C" void LLVMInitializeRISCVTargetMC() {
   for (Target *T : {&getTheRISCV32Target(), &getTheRISCV64Target()}) {
-    RegisterMCAsmInfoFn X(*T, createRISCVMCAsmInfo);
+    TargetRegistry::RegisterMCAsmInfo(*T, createRISCVMCAsmInfo);
     TargetRegistry::RegisterMCInstrInfo(*T, createRISCVMCInstrInfo);
     TargetRegistry::RegisterMCRegInfo(*T, createRISCVMCRegisterInfo);
     TargetRegistry::RegisterMCAsmBackend(*T, createRISCVAsmBackend);
diff --git a/lib/Target/RISCV/RISCVInstrFormats.td b/lib/Target/RISCV/RISCVInstrFormats.td
index 1e9bc3bf9bc5e6771f8c3f98f4e9830c79e8f56d..3fab7122f6f1a51a8c2089d2e8aead9eb3f0a18e 100644
--- a/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/lib/Target/RISCV/RISCVInstrFormats.td
@@ -44,8 +44,9 @@ class RISCVInst<dag outs, dag ins, string asmstr, list<dag> pattern>
 
 // Pseudo instructions
 class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern>
-    : RISCVInst<outs, ins, asmstr, pattern> {
+    : RISCVInst<outs, ins, "", pattern> {
   let isPseudo = 1;
+  let isCodeGenOnly = 1;
 }
 
 class FR<bits<7> funct7, bits<3> funct3, bits<7> opcode, dag outs, dag ins,
diff --git a/lib/Target/RISCV/RISCVTargetMachine.cpp b/lib/Target/RISCV/RISCVTargetMachine.cpp
index afbbe004186e70117aa8c02b87fede2ddac36e38..a20331cd0a3edadfb4553bdee8b186ae2950dc57 100644
--- a/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -32,7 +32,7 @@ static std::string computeDataLayout(const Triple &TT) {
     return "e-m:e-i64:64-n32:64-S128";
   } else {
     assert(TT.isArch32Bit() && "only RV32 and RV64 are currently supported");
-    return "e-m:e-i64:64-n32-S128";
+    return "e-m:e-p:32:32-i64:64-n32-S128";
   }
 }
 
@@ -51,7 +51,9 @@ RISCVTargetMachine::RISCVTargetMachine(const Target &T, const Triple &TT,
                                        CodeGenOpt::Level OL)
     : LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
                         getEffectiveRelocModel(TT, RM), CM, OL),
-      TLOF(make_unique<TargetLoweringObjectFileELF>()) {}
+      TLOF(make_unique<TargetLoweringObjectFileELF>()) {
+  initAsmInfo();
+}
 
 TargetPassConfig *RISCVTargetMachine::createPassConfig(PassManagerBase &PM) {
   return new TargetPassConfig(this, PM);
diff --git a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index e775aa607b530bac130b2fb6175bbe6e84d66f7b..7e6dff6b7894850c07ca49634171e3a2c37dc365 100644
--- a/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -9,32 +9,49 @@
 
 #include "MCTargetDesc/SparcMCExpr.h"
 #include "MCTargetDesc/SparcMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
 #include "llvm/MC/MCParser/MCTargetAsmParser.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/Support/TargetRegistry.h"
+#include <algorithm>
+#include <cassert>
+#include <cstdint>
+#include <memory>
 
 using namespace llvm;
 
 // The generated AsmMatcher SparcGenAsmMatcher uses "Sparc" as the target
 // namespace. But SPARC backend uses "SP" as its namespace.
 namespace llvm {
-  namespace Sparc {
+namespace Sparc {
+
     using namespace SP;
-  }
-}
+
+} // end namespace Sparc
+} // end namespace llvm
 
 namespace {
+
 class SparcOperand;
-class SparcAsmParser : public MCTargetAsmParser {
 
+class SparcAsmParser : public MCTargetAsmParser {
   MCAsmParser &Parser;
 
   /// @name Auto-generated Match Functions
@@ -95,9 +112,10 @@ public:
     // Initialize the set of available features.
     setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
   }
-
 };
 
+} // end anonymous namespace
+
   static const MCPhysReg IntRegs[32] = {
     Sparc::G0, Sparc::G1, Sparc::G2, Sparc::G3,
     Sparc::G4, Sparc::G5, Sparc::G6, Sparc::G7,
@@ -166,6 +184,8 @@ public:
     Sparc::C16_C17, Sparc::C18_C19, Sparc::C20_C21, Sparc::C22_C23,
     Sparc::C24_C25, Sparc::C26_C27, Sparc::C28_C29, Sparc::C30_C31};
   
+namespace {
+
 /// SparcOperand - Instances of this class represent a parsed Sparc machine
 /// instruction.
 class SparcOperand : public MCParsedAsmOperand {
@@ -219,6 +239,7 @@ private:
     struct ImmOp Imm;
     struct MemOp Mem;
   };
+
 public:
   SparcOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
 
@@ -464,7 +485,7 @@ public:
   }
 };
 
-} // end namespace
+} // end anonymous namespace
 
 bool SparcAsmParser::expandSET(MCInst &Inst, SMLoc IDLoc,
                                SmallVectorImpl<MCInst> &Instructions) {
@@ -591,9 +612,8 @@ bool SparcAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
   llvm_unreachable("Implement any new match types added!");
 }
 
-bool SparcAsmParser::
-ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc)
-{
+bool SparcAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+                                   SMLoc &EndLoc) {
   const AsmToken &Tok = Parser.getTok();
   StartLoc = Tok.getLoc();
   EndLoc = Tok.getEndLoc();
@@ -695,7 +715,7 @@ ParseDirective(AsmToken DirectiveID)
 
 bool SparcAsmParser:: parseDirectiveWord(unsigned Size, SMLoc L) {
   if (getLexer().isNot(AsmToken::EndOfStatement)) {
-    for (;;) {
+    while (true) {
       const MCExpr *Value;
       if (getParser().parseExpression(Value))
         return true;
@@ -717,7 +737,6 @@ bool SparcAsmParser:: parseDirectiveWord(unsigned Size, SMLoc L) {
 
 OperandMatchResultTy
 SparcAsmParser::parseMEMOperand(OperandVector &Operands) {
-
   SMLoc S, E;
   unsigned BaseReg = 0;
 
@@ -824,7 +843,6 @@ SparcAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
 OperandMatchResultTy
 SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op,
                                      bool isCall) {
-
   SMLoc S = Parser.getTok().getLoc();
   SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
   const MCExpr *EVal;
@@ -910,11 +928,9 @@ SparcAsmParser::parseSparcAsmOperand(std::unique_ptr<SparcOperand> &Op,
 
 OperandMatchResultTy
 SparcAsmParser::parseBranchModifiers(OperandVector &Operands) {
-
   // parse (,a|,pn|,pt)+
 
   while (getLexer().is(AsmToken::Comma)) {
-
     Parser.Lex(); // Eat the comma
 
     if (!getLexer().is(AsmToken::Identifier))
@@ -929,10 +945,8 @@ SparcAsmParser::parseBranchModifiers(OperandVector &Operands) {
   return MatchOperand_Success;
 }
 
-bool SparcAsmParser::matchRegisterName(const AsmToken &Tok,
-                                       unsigned &RegNo,
-                                       unsigned &RegKind)
-{
+bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, unsigned &RegNo,
+                                       unsigned &RegKind) {
   int64_t intVal = 0;
   RegNo = 0;
   RegKind = SparcOperand::rk_None;
@@ -1211,8 +1225,7 @@ static bool hasGOTReference(const MCExpr *Expr) {
 
 const SparcMCExpr *
 SparcAsmParser::adjustPICRelocation(SparcMCExpr::VariantKind VK,
-                                    const MCExpr *subExpr)
-{
+                                    const MCExpr *subExpr) {
   // When in PIC mode, "%lo(...)" and "%hi(...)" behave differently.
   // If the expression refers contains _GLOBAL_OFFSETE_TABLE, it is
   // actually a %pc10 or %pc22 relocation. Otherwise, they are interpreted
@@ -1236,8 +1249,7 @@ SparcAsmParser::adjustPICRelocation(SparcMCExpr::VariantKind VK,
 }
 
 bool SparcAsmParser::matchSparcAsmModifiers(const MCExpr *&EVal,
-                                            SMLoc &EndLoc)
-{
+                                            SMLoc &EndLoc) {
   AsmToken Tok = Parser.getTok();
   if (!Tok.is(AsmToken::Identifier))
     return false;
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index 6106a6c32dc89c767f9c39e929228a80a43b540d..cc07547ede2c2a603f4610db0a127fb6d7b82f9d 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -274,7 +274,8 @@ namespace {
       SparcAsmBackend(T), OSType(OSType) { }
 
     void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                    uint64_t Value, bool IsPCRel) const override {
+                    uint64_t Value, bool IsPCRel,
+                    MCContext &Ctx) const override {
 
       Value = adjustFixupValue(Fixup.getKind(), Value);
       if (!Value) return;           // Doesn't change encoding.
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 280c6d7937b2b4b4a04d266b334e124437e161a0..3ed09898fb78de6b8b4c84cc3ec92d03b083fd2b 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -1,4 +1,4 @@
-//===-- SparcMCAsmInfo.cpp - Sparc asm properties -------------------------===//
+//===- SparcMCAsmInfo.cpp - Sparc asm properties --------------------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -14,7 +14,10 @@
 #include "SparcMCAsmInfo.h"
 #include "SparcMCExpr.h"
 #include "llvm/ADT/Triple.h"
+#include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/Dwarf.h"
 
 using namespace llvm;
 
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
index ad441227600eeb396c0a0ba7e61761fd3188af03..5e8d0cb5031297dfe97a77caa47d1b6dcb090e1a 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.h
@@ -1,4 +1,4 @@
-//===-- SparcMCAsmInfo.h - Sparc asm properties ----------------*- C++ -*--===//
+//===- SparcMCAsmInfo.h - Sparc asm properties -----------------*- C++ -*--===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -17,6 +17,7 @@
 #include "llvm/MC/MCAsmInfoELF.h"
 
 namespace llvm {
+
 class Triple;
 
 class SparcELFMCAsmInfo : public MCAsmInfoELF {
@@ -24,6 +25,7 @@ class SparcELFMCAsmInfo : public MCAsmInfoELF {
 
 public:
   explicit SparcELFMCAsmInfo(const Triple &TheTriple);
+
   const MCExpr*
   getExprForPersonalitySymbol(const MCSymbol *Sym, unsigned Encoding,
                               MCStreamer &Streamer) const override;
@@ -33,6 +35,6 @@ public:
 
 };
 
-} // namespace llvm
+} // end namespace llvm
 
-#endif
+#endif // LLVM_LIB_TARGET_SPARC_MCTARGETDESC_SPARCMCASMINFO_H
diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
index 86341c61d1e244399e1f209f426401a6b59dd356..684f66970dbe92229d416a279a39dd66b67d75a6 100644
--- a/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
+++ b/lib/Target/Sparc/MCTargetDesc/SparcMCCodeEmitter.cpp
@@ -11,20 +11,29 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "SparcMCExpr.h"
 #include "MCTargetDesc/SparcFixupKinds.h"
+#include "SparcMCExpr.h"
 #include "SparcMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
-#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Endian.h"
 #include "llvm/Support/EndianStream.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
 
 using namespace llvm;
 
@@ -33,17 +42,17 @@ using namespace llvm;
 STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
 
 namespace {
+
 class SparcMCCodeEmitter : public MCCodeEmitter {
-  SparcMCCodeEmitter(const SparcMCCodeEmitter &) = delete;
-  void operator=(const SparcMCCodeEmitter &) = delete;
   const MCInstrInfo &MCII;
   MCContext &Ctx;
 
 public:
   SparcMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
       : MCII(mcii), Ctx(ctx) {}
-
-  ~SparcMCCodeEmitter() override {}
+  SparcMCCodeEmitter(const SparcMCCodeEmitter &) = delete;
+  SparcMCCodeEmitter &operator=(const SparcMCCodeEmitter &) = delete;
+  ~SparcMCCodeEmitter() override = default;
 
   void encodeInstruction(const MCInst &MI, raw_ostream &OS,
                          SmallVectorImpl<MCFixup> &Fixups,
@@ -79,13 +88,8 @@ private:
   void verifyInstructionPredicates(const MCInst &MI,
                                    uint64_t AvailableFeatures) const;
 };
-} // end anonymous namespace
 
-MCCodeEmitter *llvm::createSparcMCCodeEmitter(const MCInstrInfo &MCII,
-                                              const MCRegisterInfo &MRI,
-                                              MCContext &Ctx) {
-  return new SparcMCCodeEmitter(MCII, Ctx);
-}
+} // end anonymous namespace
 
 void SparcMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
                                            SmallVectorImpl<MCFixup> &Fixups,
@@ -121,12 +125,10 @@ void SparcMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
   ++MCNumEmitted;  // Keep track of the # of mi's emitted.
 }
 
-
 unsigned SparcMCCodeEmitter::
 getMachineOpValue(const MCInst &MI, const MCOperand &MO,
                   SmallVectorImpl<MCFixup> &Fixups,
                   const MCSubtargetInfo &STI) const {
-
   if (MO.isReg())
     return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
 
@@ -209,6 +211,7 @@ getBranchPredTargetOpValue(const MCInst &MI, unsigned OpNo,
                                    (MCFixupKind)Sparc::fixup_sparc_br19));
   return 0;
 }
+
 unsigned SparcMCCodeEmitter::
 getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo,
                            SmallVectorImpl<MCFixup> &Fixups,
@@ -227,3 +230,9 @@ getBranchOnRegTargetOpValue(const MCInst &MI, unsigned OpNo,
 
 #define ENABLE_INSTR_PREDICATE_VERIFIER
 #include "SparcGenMCCodeEmitter.inc"
+
+MCCodeEmitter *llvm::createSparcMCCodeEmitter(const MCInstrInfo &MCII,
+                                              const MCRegisterInfo &MRI,
+                                              MCContext &Ctx) {
+  return new SparcMCCodeEmitter(MCII, Ctx);
+}
diff --git a/lib/Target/Sparc/SparcFrameLowering.cpp b/lib/Target/Sparc/SparcFrameLowering.cpp
index 122f830e0dc5fe557111235d43c44073ee57fed3..c07cc213c3ed718005a4c6008952c1074657d0ed 100644
--- a/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -288,11 +288,11 @@ static bool LLVM_ATTRIBUTE_UNUSED verifyLeafProcRegUse(MachineRegisterInfo *MRI)
 {
 
   for (unsigned reg = SP::I0; reg <= SP::I7; ++reg)
-    if (!MRI->reg_nodbg_empty(reg))
+    if (MRI->isPhysRegUsed(reg))
       return false;
 
   for (unsigned reg = SP::L0; reg <= SP::L7; ++reg)
-    if (!MRI->reg_nodbg_empty(reg))
+    if (MRI->isPhysRegUsed(reg))
       return false;
 
   return true;
@@ -305,8 +305,8 @@ bool SparcFrameLowering::isLeafProc(MachineFunction &MF) const
   MachineFrameInfo    &MFI = MF.getFrameInfo();
 
   return !(MFI.hasCalls()                  // has calls
-           || !MRI.reg_nodbg_empty(SP::L0) // Too many registers needed
-           || !MRI.reg_nodbg_empty(SP::O6) // %SP is used
+           || MRI.isPhysRegUsed(SP::L0)    // Too many registers needed
+           || MRI.isPhysRegUsed(SP::O6)    // %SP is used
            || hasFP(MF));                  // need %FP
 }
 
@@ -314,11 +314,10 @@ void SparcFrameLowering::remapRegsForLeafProc(MachineFunction &MF) const {
   MachineRegisterInfo &MRI = MF.getRegInfo();
   // Remap %i[0-7] to %o[0-7].
   for (unsigned reg = SP::I0; reg <= SP::I7; ++reg) {
-    if (MRI.reg_nodbg_empty(reg))
+    if (!MRI.isPhysRegUsed(reg))
       continue;
 
     unsigned mapped_reg = reg - SP::I0 + SP::O0;
-    assert(MRI.reg_nodbg_empty(mapped_reg));
 
     // Replace I register with O register.
     MRI.replaceRegWith(reg, mapped_reg);
diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp
index 2ac9aae2471bab4eb3ccea266318b4ca5b174ade..455d1ee1564a891fa367114ddd87d01d295f6301 100644
--- a/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/lib/Target/Sparc/SparcISelLowering.cpp
@@ -1877,6 +1877,7 @@ void SparcTargetLowering::computeKnownBitsForTargetNode
                                 (const SDValue Op,
                                  APInt &KnownZero,
                                  APInt &KnownOne,
+                                 const APInt &DemandedElts,
                                  const SelectionDAG &DAG,
                                  unsigned Depth) const {
   APInt KnownZero2, KnownOne2;
@@ -2177,8 +2178,8 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
     Entry.Node = RetPtr;
     Entry.Ty   = PointerType::getUnqual(RetTy);
     if (!Subtarget->is64Bit())
-      Entry.isSRet = true;
-    Entry.isReturned = false;
+      Entry.IsSRet = true;
+    Entry.IsReturned = false;
     Args.push_back(Entry);
     RetTyABI = Type::getVoidTy(*DAG.getContext());
   }
diff --git a/lib/Target/Sparc/SparcISelLowering.h b/lib/Target/Sparc/SparcISelLowering.h
index e0a421b837126e80e06409c1de5edbdf07881369..90d03984060cd029af2d5e045470a1d28043fca2 100644
--- a/lib/Target/Sparc/SparcISelLowering.h
+++ b/lib/Target/Sparc/SparcISelLowering.h
@@ -68,6 +68,7 @@ namespace llvm {
     void computeKnownBitsForTargetNode(const SDValue Op,
                                        APInt &KnownZero,
                                        APInt &KnownOne,
+                                       const APInt &DemandedElts,
                                        const SelectionDAG &DAG,
                                        unsigned Depth = 0) const override;
 
diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index a94717c934563a917cb947139b3341c49e717128..3f91ca9035a61da5c7b955f8928fcd971c33eb3b 100644
--- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -8,16 +8,31 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/SystemZMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstBuilder.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
 #include "llvm/MC/MCParser/MCTargetAsmParser.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SMLoc.h"
 #include "llvm/Support/TargetRegistry.h"
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
+#include <memory>
+#include <string>
 
 using namespace llvm;
 
@@ -31,6 +46,7 @@ static bool inRange(const MCExpr *Expr, int64_t MinValue, int64_t MaxValue) {
 }
 
 namespace {
+
 enum RegisterKind {
   GR32Reg,
   GRH32Reg,
@@ -56,7 +72,6 @@ enum MemoryKind {
 };
 
 class SystemZOperand : public MCParsedAsmOperand {
-public:
 private:
   enum OperandKind {
     KindInvalid,
@@ -140,12 +155,14 @@ public:
                                                        SMLoc EndLoc) {
     return make_unique<SystemZOperand>(KindInvalid, StartLoc, EndLoc);
   }
+
   static std::unique_ptr<SystemZOperand> createToken(StringRef Str, SMLoc Loc) {
     auto Op = make_unique<SystemZOperand>(KindToken, Loc, Loc);
     Op->Token.Data = Str.data();
     Op->Token.Length = Str.size();
     return Op;
   }
+
   static std::unique_ptr<SystemZOperand>
   createReg(RegisterKind Kind, unsigned Num, SMLoc StartLoc, SMLoc EndLoc) {
     auto Op = make_unique<SystemZOperand>(KindReg, StartLoc, EndLoc);
@@ -153,12 +170,14 @@ public:
     Op->Reg.Num = Num;
     return Op;
   }
+
   static std::unique_ptr<SystemZOperand>
   createImm(const MCExpr *Expr, SMLoc StartLoc, SMLoc EndLoc) {
     auto Op = make_unique<SystemZOperand>(KindImm, StartLoc, EndLoc);
     Op->Imm = Expr;
     return Op;
   }
+
   static std::unique_ptr<SystemZOperand>
   createMem(MemoryKind MemKind, RegisterKind RegKind, unsigned Base,
             const MCExpr *Disp, unsigned Index, const MCExpr *LengthImm,
@@ -175,6 +194,7 @@ public:
       Op->Mem.Length.Reg = LengthReg;
     return Op;
   }
+
   static std::unique_ptr<SystemZOperand>
   createImmTLS(const MCExpr *Imm, const MCExpr *Sym,
                SMLoc StartLoc, SMLoc EndLoc) {
@@ -503,6 +523,7 @@ public:
     return parsePCRel(Operands, -(1LL << 32), (1LL << 32) - 1, true);
   }
 };
+
 } // end anonymous namespace
 
 #define GET_REGISTER_MATCHER
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
index 1207c7b327e879bb586a3a5182429a6cf0a209a7..6cd12e13e220c02c51d85c5c5e3a17a77fc2faa1 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.cpp
@@ -1,4 +1,4 @@
-//===-- SystemZInstPrinter.cpp - Convert SystemZ MCInst to assembly syntax ===//
+//===- SystemZInstPrinter.cpp - Convert SystemZ MCInst to assembly syntax -===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -10,10 +10,13 @@
 #include "SystemZInstPrinter.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
 
 using namespace llvm;
 
diff --git a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
index 6336f5ee0efa890da37e50f2a77096ab21a66a08..d65c661545eb5749e16e0f9da4a2c17ccc182926 100644
--- a/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
+++ b/lib/Target/SystemZ/InstPrinter/SystemZInstPrinter.h
@@ -15,8 +15,10 @@
 #define LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H
 
 #include "llvm/MC/MCInstPrinter.h"
+#include <cstdint>
 
 namespace llvm {
+
 class MCOperand;
 
 class SystemZInstPrinter : public MCInstPrinter {
@@ -70,6 +72,7 @@ private:
   // This forms part of the instruction name rather than the operand list.
   void printCond4Operand(const MCInst *MI, int OpNum, raw_ostream &O);
 };
+
 } // end namespace llvm
 
-#endif
+#endif // LLVM_LIB_TARGET_SYSTEMZ_INSTPRINTER_SYSTEMZINSTPRINTER_H
diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index 9192448afd0478ffda1f1342938ca4857956ba66..23b7d5b5d50132ab219ef8321ee284aacfb474e0 100644
--- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -51,7 +51,7 @@ public:
   }
   const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value, bool IsPCRel) const override;
+                  uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
   bool mayNeedRelaxation(const MCInst &Inst) const override {
     return false;
   }
@@ -91,7 +91,7 @@ SystemZMCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
 
 void SystemZMCAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
                                      unsigned DataSize, uint64_t Value,
-                                     bool IsPCRel) const {
+                                     bool IsPCRel, MCContext &Ctx) const {
   MCFixupKind Kind = Fixup.getKind();
   unsigned Offset = Fixup.getOffset();
   unsigned BitSize = getFixupKindInfo(Kind).TargetSize;
diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp
index e5477f8c9c1c1d0f2c1dcdc14d44658b5c21b10c..84d3c7bed50a2f1d58ea322cb7f713a62b00cab8 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -194,6 +194,9 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
       setOperationAction(ISD::UMUL_LOHI, VT, Custom);
 
       // Only z196 and above have native support for conversions to unsigned.
+      // On z10, promoting to i64 doesn't generate an inexact condition for
+      // values that are outside the i32 range but in the i64 range, so use
+      // the default expansion.
       if (!Subtarget.hasFPExtension())
         setOperationAction(ISD::FP_TO_UINT, VT, Expand);
     }
@@ -344,9 +347,13 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
     // There should be no need to check for float types other than v2f64
     // since <2 x f32> isn't a legal type.
     setOperationAction(ISD::FP_TO_SINT, MVT::v2i64, Legal);
+    setOperationAction(ISD::FP_TO_SINT, MVT::v2f64, Legal);
     setOperationAction(ISD::FP_TO_UINT, MVT::v2i64, Legal);
+    setOperationAction(ISD::FP_TO_UINT, MVT::v2f64, Legal);
     setOperationAction(ISD::SINT_TO_FP, MVT::v2i64, Legal);
+    setOperationAction(ISD::SINT_TO_FP, MVT::v2f64, Legal);
     setOperationAction(ISD::UINT_TO_FP, MVT::v2i64, Legal);
+    setOperationAction(ISD::UINT_TO_FP, MVT::v2f64, Legal);
   }
 
   // Handle floating-point types.
@@ -2789,8 +2796,9 @@ SDValue SystemZTargetLowering::lowerBITCAST(SDValue Op,
   // but we need this case for bitcasts that are created during lowering
   // and which are then lowered themselves.
   if (auto *LoadN = dyn_cast<LoadSDNode>(In))
-    return DAG.getLoad(ResVT, DL, LoadN->getChain(), LoadN->getBasePtr(),
-                       LoadN->getMemOperand());
+    if (ISD::isNormalLoad(LoadN))
+      return DAG.getLoad(ResVT, DL, LoadN->getChain(), LoadN->getBasePtr(),
+                         LoadN->getMemOperand());
 
   if (InVT == MVT::i32 && ResVT == MVT::f32) {
     SDValue In64;
@@ -4732,9 +4740,12 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
 }
 
 // Return true if VT is a vector whose elements are a whole number of bytes
-// in width.
-static bool canTreatAsByteVector(EVT VT) {
-  return VT.isVector() && VT.getScalarSizeInBits() % 8 == 0;
+// in width. Also require vector support and a simple (non-extended) VT.
+bool SystemZTargetLowering::canTreatAsByteVector(EVT VT) const {
+  if (!Subtarget.hasVector())
+    return false;
+
+  return VT.isVector() && VT.getScalarSizeInBits() % 8 == 0 && VT.isSimple();
 }
 
 // Try to simplify an EXTRACT_VECTOR_ELT from a vector of type VecVT
@@ -4996,6 +5007,10 @@ SDValue SystemZTargetLowering::combineSTORE(
 
 SDValue SystemZTargetLowering::combineEXTRACT_VECTOR_ELT(
     SDNode *N, DAGCombinerInfo &DCI) const {
+
+  if (!Subtarget.hasVector())
+    return SDValue();
+
   // Try to simplify a vector extraction.
   if (auto *IndexN = dyn_cast<ConstantSDNode>(N->getOperand(1))) {
     SDValue Op0 = N->getOperand(0);
diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h
index 7a21a474c1192e165b88316e985f7eab0c0d0ae6..7d92a7355877879c0cc1df14d0573bf6570f0e18 100644
--- a/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/lib/Target/SystemZ/SystemZISelLowering.h
@@ -537,6 +537,7 @@ private:
                                  unsigned UnpackHigh) const;
   SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
 
+  bool canTreatAsByteVector(EVT VT) const;
   SDValue combineExtract(const SDLoc &DL, EVT ElemVT, EVT VecVT, SDValue OrigOp,
                          unsigned Index, DAGCombinerInfo &DCI,
                          bool Force) const;
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.cpp b/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 0e2ed5988650b6129d59811394e964e5eb311e27..c8ff9558cc8826b1f5b5962b33e04e20c03cbb4d 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -23,7 +23,6 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineMemOperand.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -80,12 +79,25 @@ void SystemZInstrInfo::splitMove(MachineBasicBlock::iterator MI,
   MachineInstr *EarlierMI = MF.CloneMachineInstr(&*MI);
   MBB->insert(MI, EarlierMI);
 
-  // Set up the two 64-bit registers.
+  // Set up the two 64-bit registers and remember super reg and its flags.
   MachineOperand &HighRegOp = EarlierMI->getOperand(0);
   MachineOperand &LowRegOp = MI->getOperand(0);
+  unsigned Reg128 = LowRegOp.getReg();
+  unsigned Reg128Killed = getKillRegState(LowRegOp.isKill());
+  unsigned Reg128Undef  = getUndefRegState(LowRegOp.isUndef());
   HighRegOp.setReg(RI.getSubReg(HighRegOp.getReg(), SystemZ::subreg_h64));
   LowRegOp.setReg(RI.getSubReg(LowRegOp.getReg(), SystemZ::subreg_l64));
 
+  if (MI->mayStore()) {
+    // Add implicit uses of the super register in case one of the subregs is
+    // undefined. We could track liveness and skip storing an undefined
+    // subreg, but this is hopefully rare (discovered with llvm-stress).
+    // If Reg128 was killed, set kill flag on MI.
+    unsigned Reg128UndefImpl = (Reg128Undef | RegState::Implicit);
+    MachineInstrBuilder(MF, EarlierMI).addReg(Reg128, Reg128UndefImpl);
+    MachineInstrBuilder(MF, MI).addReg(Reg128, (Reg128UndefImpl | Reg128Killed));
+  }
+
   // The address in the first (high) instruction is already correct.
   // Adjust the offset in the second (low) instruction.
   MachineOperand &HighOffsetOp = EarlierMI->getOperand(2);
@@ -208,9 +220,15 @@ void SystemZInstrInfo::expandLOCRPseudo(MachineInstr &MI, unsigned LowOpcode,
 // are low registers, otherwise use RISB[LH]G.
 void SystemZInstrInfo::expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode,
                                         unsigned Size) const {
-  emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(),
-                MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), LowOpcode,
-                Size, MI.getOperand(1).isKill(), MI.getOperand(1).isUndef());
+  MachineInstrBuilder MIB =
+    emitGRX32Move(*MI.getParent(), MI, MI.getDebugLoc(),
+               MI.getOperand(0).getReg(), MI.getOperand(1).getReg(), LowOpcode,
+               Size, MI.getOperand(1).isKill(), MI.getOperand(1).isUndef());
+
+  // Keep the remaining operands as-is.
+  for (unsigned I = 2; I < MI.getNumOperands(); ++I)
+    MIB.add(MI.getOperand(I));
+
   MI.eraseFromParent();
 }
 
@@ -250,12 +268,13 @@ void SystemZInstrInfo::expandLoadStackGuard(MachineInstr *MI) const {
 // are low registers, otherwise use RISB[LH]G.  Size is the number of bits
 // taken from the low end of SrcReg (8 for LLCR, 16 for LLHR and 32 for LR).
 // KillSrc is true if this move is the last use of SrcReg.
-void SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB,
-                                     MachineBasicBlock::iterator MBBI,
-                                     const DebugLoc &DL, unsigned DestReg,
-                                     unsigned SrcReg, unsigned LowLowOpcode,
-                                     unsigned Size, bool KillSrc,
-                                     bool UndefSrc) const {
+MachineInstrBuilder
+SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB,
+                                MachineBasicBlock::iterator MBBI,
+                                const DebugLoc &DL, unsigned DestReg,
+                                unsigned SrcReg, unsigned LowLowOpcode,
+                                unsigned Size, bool KillSrc,
+                                bool UndefSrc) const {
   unsigned Opcode;
   bool DestIsHigh = isHighReg(DestReg);
   bool SrcIsHigh = isHighReg(SrcReg);
@@ -266,12 +285,11 @@ void SystemZInstrInfo::emitGRX32Move(MachineBasicBlock &MBB,
   else if (!DestIsHigh && SrcIsHigh)
     Opcode = SystemZ::RISBLH;
   else {
-    BuildMI(MBB, MBBI, DL, get(LowLowOpcode), DestReg)
+    return BuildMI(MBB, MBBI, DL, get(LowLowOpcode), DestReg)
       .addReg(SrcReg, getKillRegState(KillSrc) | getUndefRegState(UndefSrc));
-    return;
   }
   unsigned Rotate = (DestIsHigh != SrcIsHigh ? 32 : 0);
-  BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
+  return BuildMI(MBB, MBBI, DL, get(Opcode), DestReg)
     .addReg(DestReg, RegState::Undef)
     .addReg(SrcReg, getKillRegState(KillSrc) | getUndefRegState(UndefSrc))
     .addImm(32 - Size).addImm(128 + 31).addImm(Rotate);
@@ -661,6 +679,12 @@ void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB,
     else {
       Opc = SystemZ::LOCR;
       MRI.constrainRegClass(DstReg, &SystemZ::GR32BitRegClass);
+      unsigned TReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass);
+      unsigned FReg = MRI.createVirtualRegister(&SystemZ::GR32BitRegClass);
+      BuildMI(MBB, I, DL, get(TargetOpcode::COPY), TReg).addReg(TrueReg);
+      BuildMI(MBB, I, DL, get(TargetOpcode::COPY), FReg).addReg(FalseReg);
+      TrueReg = TReg;
+      FalseReg = FReg;
     }
   } else if (SystemZ::GR64BitRegClass.hasSubClassEq(RC))
     Opc = SystemZ::LOCGR;
@@ -727,7 +751,7 @@ bool SystemZInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
   return true;
 }
 
-bool SystemZInstrInfo::isPredicable(MachineInstr &MI) const {
+bool SystemZInstrInfo::isPredicable(const MachineInstr &MI) const {
   unsigned Opcode = MI.getOpcode();
   if (Opcode == SystemZ::Return ||
       Opcode == SystemZ::Trap ||
@@ -1127,12 +1151,12 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
     // destination register instead.
     if (OpNum == 1) {
       unsigned LoadOpcode = Op0IsGPR ? SystemZ::LG : SystemZ::LD;
-      unsigned Dest = MI.getOperand(0).getReg();
       return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
-                     get(LoadOpcode), Dest)
-          .addFrameIndex(FrameIndex)
-          .addImm(0)
-          .addReg(0);
+                     get(LoadOpcode))
+        .add(MI.getOperand(0))
+        .addFrameIndex(FrameIndex)
+        .addImm(0)
+        .addReg(0);
     }
   }
 
diff --git a/lib/Target/SystemZ/SystemZInstrInfo.h b/lib/Target/SystemZ/SystemZInstrInfo.h
index b07f101d4cb91bf46a1d86d53ccc2e7dda6b767d..b8be1f5f39212752224f8435ea63ba27fcc85dfe 100644
--- a/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -19,6 +19,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/Target/TargetInstrInfo.h"
 #include <cstdint>
 
@@ -160,10 +161,13 @@ class SystemZInstrInfo : public SystemZGenInstrInfo {
   void expandZExtPseudo(MachineInstr &MI, unsigned LowOpcode,
                         unsigned Size) const;
   void expandLoadStackGuard(MachineInstr *MI) const;
-  void emitGRX32Move(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
-                     const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
-                     unsigned LowLowOpcode, unsigned Size, bool KillSrc,
-                     bool UndefSrc) const;
+
+  MachineInstrBuilder
+  emitGRX32Move(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+                const DebugLoc &DL, unsigned DestReg, unsigned SrcReg,
+                unsigned LowLowOpcode, unsigned Size, bool KillSrc,
+                bool UndefSrc) const;
+
   virtual void anchor();
 
 protected:
@@ -215,7 +219,7 @@ public:
                     unsigned FalseReg) const override;
   bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
                      MachineRegisterInfo *MRI) const override;
-  bool isPredicable(MachineInstr &MI) const override;
+  bool isPredicable(const MachineInstr &MI) const override;
   bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
                            unsigned ExtraPredCycles,
                            BranchProbability Probability) const override;
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index b10c0e09a0d4551300ab234aff26d29c6b5b9c7a..e74c9a80515d84d8238900e166f6e4061c49f595 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -259,11 +259,8 @@ void SystemZTTIImpl::getUnrollingPreferences(Loop *L,
         }
       }
       if (isa<StoreInst>(&I)) {
-        NumStores++;
         Type *MemAccessTy = I.getOperand(0)->getType();
-        if((MemAccessTy->isIntegerTy() || MemAccessTy->isFloatingPointTy()) &&
-           (getDataLayout().getTypeSizeInBits(MemAccessTy) == 128))
-          NumStores++;  // 128 bit fp/int stores get split.
+        NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, 0, 0);
       }
     }
 
@@ -313,3 +310,547 @@ unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) {
   return 0;
 }
 
+// Return the cost of arithmetic instruction Opcode on type Ty. Cases that
+// are not special-cased below fall back to BaseT::getArithmeticInstrCost().
+int SystemZTTIImpl::getArithmeticInstrCost(
+    unsigned Opcode, Type *Ty,  
+    TTI::OperandValueKind Op1Info, TTI::OperandValueKind Op2Info,
+    TTI::OperandValueProperties Opd1PropInfo,
+    TTI::OperandValueProperties Opd2PropInfo,
+    ArrayRef<const Value *> Args) {
+  // TODO: return a good value for BB-VECTORIZER that includes the
+  // immediate loads, which we do not want to count for the loop
+  // vectorizer, since they are hopefully hoisted out of the loop. This
+  // would require a new parameter 'InLoop', but not sure if constant
+  // args are common enough to motivate this.
+
+  unsigned ScalarBits = Ty->getScalarSizeInBits();
+
+  if (Ty->isVectorTy()) {
+    assert (ST->hasVector() && "getArithmeticInstrCost() called with vector type.");
+    unsigned VF = Ty->getVectorNumElements();
+    unsigned NumVectors = getNumberOfParts(Ty);
+
+    // These vector operations are custom handled, but are still supported
+    // with one instruction per vector, regardless of element size.
+    if (Opcode == Instruction::Shl || Opcode == Instruction::LShr ||
+        Opcode == Instruction::AShr) {
+      return NumVectors;
+    }
+
+    // These FP operations are supported with a single vector instruction for
+    // double (base implementation assumes float generally costs 2). For
+    // FP128, the scalar cost is 1, and there is no overhead since the values
+    // are already in scalar registers.
+    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
+        Opcode == Instruction::FMul || Opcode == Instruction::FDiv) {
+      switch (ScalarBits) {
+      case 32: {
+        // Return the cost of VF scalar invocations plus the cost of
+        // inserting and extracting the values.
+        unsigned ScalarCost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
+        unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args);
+        // FIXME: VF 2 for these FP operations is currently just as
+        // expensive as for VF 4.
+        if (VF == 2)
+          Cost *= 2;
+        return Cost;
+      }
+      case 64:
+      case 128:
+        return NumVectors;
+      default:
+        break;
+      }
+    }
+
+    // There is no native support for FRem: VF libcalls plus scalarization.
+    if (Opcode == Instruction::FRem) {
+      unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(Ty, Args);
+      // FIXME: VF 2 for float is currently just as expensive as for VF 4.
+      if (VF == 2 && ScalarBits == 32)
+        Cost *= 2;
+      return Cost;
+    }
+  }
+  else {  // Scalar:
+    // These FP operations are supported with a dedicated instruction for
+    // float, double and fp128 (base implementation assumes float generally
+    // costs 2).
+    if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
+        Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
+      return 1;
+
+    // There is no native support for FRem.
+    if (Opcode == Instruction::FRem)
+      return LIBCALL_COST;
+
+    if (Opcode == Instruction::LShr || Opcode == Instruction::AShr)
+      return (ScalarBits >= 32 ? 1 : 2 /*ext*/);
+
+    // Or requires one instruction, although it has custom handling for i64.
+    if (Opcode == Instruction::Or)
+      return 1;
+
+    if (Opcode == Instruction::Xor && ScalarBits == 1)
+      // 2 * ipm sequences ; xor ; shift ; compare
+      return 7;
+
+    // An extra extension for narrow types is needed.
+    if ((Opcode == Instruction::SDiv || Opcode == Instruction::SRem))
+      // sext of op(s) for narrow types
+      return (ScalarBits < 32 ? 4 : (ScalarBits == 32 ? 2 : 1));
+
+    if (Opcode == Instruction::UDiv || Opcode == Instruction::URem)
+      // Clearing of low 64 bit reg + sext of op(s) for narrow types + dl[g]r
+      return (ScalarBits < 32 ? 4 : 2);
+  }
+
+  // Fall back to the default implementation.
+  return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+                                       Opd1PropInfo, Opd2PropInfo, Args);
+}
+
+// Return the cost of shuffle Kind on vector type Tp. Index is only used for
+// SK_ExtractSubvector, where it is the start offset. SubTp is not used.
+int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
+                                   Type *SubTp) {
+  assert (Tp->isVectorTy());
+  assert (ST->hasVector() && "getShuffleCost() called.");
+  unsigned NumVectors = getNumberOfParts(Tp);
+
+  // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
+
+  // FP128 values are always in scalar registers, so there is no work
+  // involved with a shuffle, except for broadcast. In that case register
+  // moves are done with a single instruction per element.
+  if (Tp->getScalarType()->isFP128Ty())
+    return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
+
+  switch (Kind) {
+  case TargetTransformInfo::SK_ExtractSubvector:
+    // Index is the start offset; extracting from the first index is a noop.
+    return (Index == 0 ? 0 : NumVectors);
+
+  case TargetTransformInfo::SK_Broadcast:
+    // Loop vectorizer calls here to figure out the extra cost of
+    // broadcasting a loaded value to all elements of a vector. Since vlrep
+    // loads and replicates with a single instruction, adjust the returned
+    // value.
+    return NumVectors - 1;
+
+  default:
+    break;
+  }
+
+  // SystemZ supports single instruction permutation / replication for every
+  // other kind, so the base implementation is never consulted.
+  return NumVectors;
+}
+
+// Return the difference between the log2 of the element sizes of the two
+// vector types, always subtracting the smaller element's log2.
+static unsigned getElSizeLog2Diff(Type *Ty0, Type *Ty1) {
+  unsigned ElBits0 = Ty0->getScalarSizeInBits();
+  unsigned ElBits1 = Ty1->getScalarSizeInBits();
+  unsigned Log2El0 = Log2_32(ElBits0);
+  unsigned Log2El1 = Log2_32(ElBits1);
+
+  return (ElBits1 > ElBits0 ? Log2El1 - Log2El0 : Log2El0 - Log2El1);
+}
+
+// Return the number of instructions needed to truncate SrcTy to DstTy.
+unsigned SystemZTTIImpl::
+getVectorTruncCost(Type *SrcTy, Type *DstTy) {
+  assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
+  assert (SrcTy->getPrimitiveSizeInBits() > DstTy->getPrimitiveSizeInBits() &&
+          "Packing must reduce size of vector type.");
+  assert (SrcTy->getVectorNumElements() == DstTy->getVectorNumElements() &&
+          "Packing should not change number of elements.");
+
+  // TODO: Since fp32 is expanded, the extract cost should always be 0.
+
+  // Up to 2 vector registers can be truncated efficiently with pack or
+  // permute. The latter requires an immediate mask to be loaded, which
+  // typically gets hoisted out of a loop.  TODO: return a good value for
+  // BB-VECTORIZER that includes the immediate loads, which we do not want
+  // to count for the loop vectorizer.
+  unsigned PartsLeft = getNumberOfParts(SrcTy);
+  if (PartsLeft <= 2)
+    return 1;
+
+  // For each halving of the element width, halve the number of parts (but
+  // never below one) and add that many instructions.
+  unsigned NumInstrs = 0;
+  for (unsigned Step = getElSizeLog2Diff(SrcTy, DstTy); Step != 0; --Step) {
+    PartsLeft = (PartsLeft > 1 ? PartsLeft / 2 : PartsLeft);
+    NumInstrs += PartsLeft;
+  }
+
+  // Currently, a general mix of permutes and pack instructions is output by
+  // isel, which follow the cost computation above except for this case which
+  // is one instruction less:
+  bool IsV8I64ToI8 = SrcTy->getVectorNumElements() == 8 &&
+                     SrcTy->getScalarSizeInBits() == 64 &&
+                     DstTy->getScalarSizeInBits() == 8;
+  if (IsV8I64ToI8)
+    NumInstrs--;
+  return NumInstrs;
+}
+
+// Return the cost of converting a vector bitmask produced by a compare
+// (SrcTy), to the type of the select or extend instruction (DstTy).
+unsigned SystemZTTIImpl::
+getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy) {
+  assert (SrcTy->isVectorTy() && DstTy->isVectorTy() &&
+          "Should only be called with vector types.");
+
+  unsigned SrcElBits = SrcTy->getScalarSizeInBits();
+  unsigned DstElBits = DstTy->getScalarSizeInBits();
+
+  // The bitmask will be truncated.
+  if (SrcElBits > DstElBits)
+    return getVectorTruncCost(SrcTy, DstTy);
+
+  // Equal element widths: no conversion cost is added.
+  if (SrcElBits == DstElBits)
+    return 0;
+
+  // Each vector select needs its part of the bitmask unpacked, plus an
+  // extra cost for moving part of the mask before unpacking.
+  unsigned DstNumParts = getNumberOfParts(DstTy);
+  unsigned NumUnpacks = getElSizeLog2Diff(SrcTy, DstTy) * DstNumParts;
+  return NumUnpacks + (DstNumParts - 1);
+}
+
+// Return the type of the compared operands. This is needed to compute the
+// cost for a Select / ZExt or SExt instruction. Returns nullptr unless
+// operand 0 of I is a compare, or a two-operand instruction of two compares.
+static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
+  Type *OpTy = nullptr;
+  if (CmpInst *CI = dyn_cast<CmpInst>(I->getOperand(0)))
+    OpTy = CI->getOperand(0)->getType();
+  else if (Instruction *LogicI = dyn_cast<Instruction>(I->getOperand(0)))
+    // Guard the operand accesses: e.g. a single-entry PHI has no operand 1.
+    if (LogicI->getNumOperands() == 2)
+      if (CmpInst *CI0 = dyn_cast<CmpInst>(LogicI->getOperand(0)))
+        if (isa<CmpInst>(LogicI->getOperand(1)))
+          OpTy = CI0->getOperand(0)->getType();
+
+  if (OpTy == nullptr)
+    return nullptr;
+  if (VF == 1) {
+    assert (!OpTy->isVectorTy() && "Expected scalar type");
+    return OpTy;
+  }
+  // Return the potentially vectorized type based on 'I' and 'VF'.  'I' may
+  // be either scalar or already vectorized with a same or lesser VF.
+  return VectorType::get(OpTy->getScalarType(), VF);
+}
+
+// Return the cost of cast Opcode from Src to Dst. I, if non-null, is used to
+// look up the type of the compared operands when an i1 value is extended.
+int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                                     const Instruction *I) {
+  unsigned DstScalarBits = Dst->getScalarSizeInBits();
+  unsigned SrcScalarBits = Src->getScalarSizeInBits();
+  if (Src->isVectorTy()) {
+    assert (ST->hasVector() && "getCastInstrCost() called with vector type.");
+    assert (Dst->isVectorTy());
+    unsigned VF = Src->getVectorNumElements();
+    unsigned NumDstVectors = getNumberOfParts(Dst);
+    unsigned NumSrcVectors = getNumberOfParts(Src);
+
+    if (Opcode == Instruction::Trunc) {
+      if (Src->getScalarSizeInBits() == Dst->getScalarSizeInBits())
+        return 0; // Check for NOOP conversions.
+      return getVectorTruncCost(Src, Dst);
+    }
+
+    if (Opcode == Instruction::ZExt || Opcode == Instruction::SExt) {
+      if (SrcScalarBits >= 8) {
+        // ZExt/SExt will be handled with one unpack per doubling of width.
+        unsigned NumUnpacks = getElSizeLog2Diff(Src, Dst);
+
+        // For types that span multiple vector registers, some additional
+        // instructions are used to set up the unpacking.
+        unsigned NumSrcVectorOps =
+          (NumUnpacks > 1 ? (NumDstVectors - NumSrcVectors)
+                          : (NumDstVectors / 2));
+
+        return (NumUnpacks * NumDstVectors) + NumSrcVectorOps;
+      }
+      else if (SrcScalarBits == 1) {
+        // This should be extension of a compare i1 result.
+        // If we know the widths of the compared operands, get the
+        // cost of converting it to Dst. Otherwise assume same widths.
+        unsigned Cost = 0;
+        Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
+        if (CmpOpTy != nullptr)
+          Cost = getVectorBitmaskConversionCost(CmpOpTy, Dst);
+        if (Opcode == Instruction::ZExt)
+          // One 'vn' per dst vector with an immediate mask.
+          Cost += NumDstVectors;
+        return Cost;
+      }
+    }
+  
+    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP ||
+        Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) {
+      // TODO: Fix base implementation which could simplify things a bit here
+      // (seems to miss on differentiating on scalar/vector types).
+
+      // Only 64 bit vector conversions are natively supported.
+      if (SrcScalarBits == 64 && DstScalarBits == 64)
+        return NumDstVectors;
+
+      // Return the cost of VF scalar invocations plus the cost of
+      // inserting and extracting the values. Base implementation does not
+      // realize float->int gets scalarized.
+      unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(),
+                                             Src->getScalarType());
+      unsigned TotCost = VF * ScalarCost;
+      bool NeedsInserts = true, NeedsExtracts = true;
+      // FP128 registers do not get inserted or extracted.
+      if (DstScalarBits == 128 &&
+          (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP))
+        NeedsInserts = false;
+      if (SrcScalarBits == 128 &&
+          (Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
+        NeedsExtracts = false;
+
+      TotCost += getScalarizationOverhead(Dst, NeedsInserts, NeedsExtracts);
+
+      // FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
+      if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
+        TotCost *= 2;
+      return TotCost;
+    }
+
+    if (Opcode == Instruction::FPTrunc) {
+      if (SrcScalarBits == 128)  // fp128 -> double/float + inserts of elements.
+        return VF /*ldxbr/lexbr*/ + getScalarizationOverhead(Dst, true, false);
+      else // double -> float
+        return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
+    }
+
+    if (Opcode == Instruction::FPExt) {
+      if (SrcScalarBits == 32 && DstScalarBits == 64) {
+        // float -> double is very rare and currently unoptimized. Instead of
+        // using vldeb, which can do two at a time, all conversions are
+        // scalarized.
+        return VF * 2;
+      }
+      // -> fp128.  VF * lxdb/lxeb + extraction of elements.
+      return VF + getScalarizationOverhead(Src, false, true);
+    }
+  }
+  else { // Scalar
+    assert (!Dst->isVectorTy());
+
+    if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP)
+      return (SrcScalarBits >= 32 ? 1 : 2 /*i8/i16 extend*/);
+    
+    if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
+        Src->isIntegerTy(1)) {
+      // This should be extension of a compare i1 result, which is done with
+      // ipm and a varying sequence of instructions.
+      unsigned Cost = 0;
+      if (Opcode == Instruction::SExt)
+        Cost = (DstScalarBits < 64 ? 3 : 4);
+      if (Opcode == Instruction::ZExt)
+        Cost = 3;
+      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
+      if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
+        // If operands of an fp-type were compared, this costs +1.
+        Cost++;
+
+      return Cost;
+    }
+  }
+
+  // Anything not handled above is costed by the base implementation.
+  return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
+}
+
+// Cost of a compare or select on ValTy.  If I is given it supplies the compare
+// predicate, or lets the compare feeding a select be looked up.
+int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                                       const Instruction *I) {
+  if (ValTy->isVectorTy()) {
+    assert (ST->hasVector() && "getCmpSelInstrCost() called with vector type.");
+    assert (CondTy == nullptr || CondTy->isVectorTy());
+    unsigned VF = ValTy->getVectorNumElements();
+
+    // Called with a compare instruction.
+    if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
+      unsigned PredicateExtraCost = 0;
+      if (I != nullptr) {
+        // Some predicates cost 1-2 extra instructions (I must be a CmpInst).
+        switch (cast<CmpInst>(I)->getPredicate()) {
+        case CmpInst::Predicate::ICMP_NE:
+        case CmpInst::Predicate::ICMP_UGE:
+        case CmpInst::Predicate::ICMP_ULE:
+        case CmpInst::Predicate::ICMP_SGE:
+        case CmpInst::Predicate::ICMP_SLE:
+          PredicateExtraCost = 1;
+          break;
+        case CmpInst::Predicate::FCMP_ONE:
+        case CmpInst::Predicate::FCMP_ORD:
+        case CmpInst::Predicate::FCMP_UEQ:
+        case CmpInst::Predicate::FCMP_UNO:
+          PredicateExtraCost = 2;
+          break;
+        default:
+          break;
+        }
+      }
+
+      // Float is handled with 2*vmr[lh]f + 2*vldeb + vfchdb for each pair of
+      // floats.  FIXME: <2 x float> generates same code as <4 x float>.
+      unsigned CmpCostPerVector = (ValTy->getScalarType()->isFloatTy() ? 10 : 1);
+      unsigned NumVecs_cmp = getNumberOfParts(ValTy);
+
+      return NumVecs_cmp * (CmpCostPerVector + PredicateExtraCost);
+    }
+    else { // Called with a select instruction.
+      assert (Opcode == Instruction::Select);
+
+      // We can figure out the extra cost of packing / unpacking if the
+      // instruction was passed and the compare instruction is found.
+      unsigned PackCost = 0;
+      Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I, VF) : nullptr);
+      if (CmpOpTy != nullptr)
+        PackCost = getVectorBitmaskConversionCost(CmpOpTy, ValTy);
+
+      return getNumberOfParts(ValTy) /*vsel*/ + PackCost;
+    }
+  }
+  else { // Scalar
+    switch (Opcode) {
+    case Instruction::ICmp: {
+      unsigned Cost = 1;
+      if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
+        Cost += 2; // extend both operands
+      return Cost;
+    }
+    case Instruction::Select:
+      if (ValTy->isFloatingPointTy())
+        return 4; // No load on condition for FP, so this costs a conditional jump.
+      return 1; // Load On Condition.
+    }
+  }
+
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, nullptr);
+}
+
+// Cost of inserting or extracting the element at Index of vector type Val;
+// any other case is deferred to the base implementation.
+int SystemZTTIImpl::
+getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
+  switch (Opcode) {
+  case Instruction::InsertElement:
+    // vlvgp will insert two grs into a vector register, so only every other
+    // i64 element insertion is counted as an instruction.
+    if (Val->getScalarType()->isIntegerTy(64))
+      return (Index % 2 != 0) ? 0 : 1;
+    break;
+  case Instruction::ExtractElement: {
+    int Cost = (Val->getScalarSizeInBits() == 1) ? 2 /*+test-under-mask*/ : 1;
+    // Give a slight penalty for moving out of vector pipeline to FXU unit.
+    const bool ToFXU = Index == 0 && Val->getScalarType()->isIntegerTy();
+    return ToFXU ? Cost + 1 : Cost;
+  }
+  }
+  return BaseT::getVectorInstrCost(Opcode, Val, Index);
+}
+
+// Cost of a memory operation on Src.  A scalar 32 or 64 bit load with a single
+// use by an integer compare, arithmetic or logical instruction is costed as
+// free, since it is expected to be folded into that user.
+int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
+                                    unsigned Alignment, unsigned AddressSpace,
+                                    const Instruction *I) {
+  assert(!Src->isVoidTy() && "Invalid type");
+
+  // See if this is a scalar load with a single user which could fold it.  The
+  // user's opcode and the load's width decide whether folding is assumed.
+  const bool IsScalarLoad = !Src->isVectorTy() && Opcode == Instruction::Load;
+  if (IsScalarLoad && I != nullptr && I->hasOneUse()) {
+    const Instruction *UserI = cast<Instruction>(*I->user_begin());
+    const unsigned Bits = Src->getScalarSizeInBits();
+    bool FoldsLoad = false;
+
+    switch (UserI->getOpcode()) {
+    case Instruction::ICmp:
+    case Instruction::Add:
+    case Instruction::Sub:
+    case Instruction::Mul:
+    case Instruction::SDiv:
+    case Instruction::UDiv:
+    case Instruction::And:
+    case Instruction::Or:
+    case Instruction::Xor:
+      // This also makes sense for float operations (FCmp, FAdd, FSub, FMul and
+      // FDiv), but those are disabled for now due to regressions.
+      FoldsLoad = (Bits == 32 || Bits == 64);
+      break;
+    }
+
+    if (FoldsLoad) {
+      assert (UserI->getNumOperands() == 2 &&
+              "Expected to only handle binops.");
+
+      // UserI can't fold two loads.  If the other operand is a single-use load
+      // as well, only the load in operand position 0 is costed as free.
+      for (unsigned OpIdx = 0; OpIdx != 2; ++OpIdx) {
+        const Value *Op = UserI->getOperand(OpIdx);
+        if (Op == I)
+          continue;
+        const LoadInst *OtherLd = dyn_cast<LoadInst>(Op);
+        if (OtherLd != nullptr && OtherLd->hasOneUse())
+          return OpIdx == 0 ? 1 : 0;
+      }
+
+      return 0;
+    }
+  }
+
+  const unsigned Parts = getNumberOfParts(Src);
+
+  // 128 bit scalars are held in a pair of two 64 bit registers.
+  if (Src->getScalarSizeInBits() == 128)
+    return Parts * 2;
+  return Parts;
+}
+
+// Cost of an interleaved load or store group on VecTy: the wide memory
+// operations plus the permutes needed to (de)interleave the members.
+int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                               unsigned Factor,
+                                               ArrayRef<unsigned> Indices,
+                                               unsigned Alignment,
+                                               unsigned AddressSpace) {
+  assert(isa<VectorType>(VecTy) &&
+         "Expect a vector type for interleaved memory op");
+
+  // The total width in bits; pointer elements are counted as 64 bits each.
+  const bool IsPtrVec = VecTy->isPtrOrPtrVectorTy();
+  unsigned WideBits = IsPtrVec ? 64U * VecTy->getVectorNumElements()
+                               : VecTy->getPrimitiveSizeInBits();
+  assert (WideBits > 0 && "Could not compute size of vector");
+  // Number of 128 bit parts, rounded up.
+  int NumWideParts = (WideBits / 128U) + ((WideBits % 128U) ? 1 : 0);
+
+  // How many source vectors are handled to produce a vectorized operand?
+  int NumElsPerVector = (VecTy->getVectorNumElements() / NumWideParts);
+  int NumSrcParts = std::min(NumWideParts, NumElsPerVector);
+  // Each needed permute takes two vectors as input.
+  if (NumSrcParts > 1)
+    --NumSrcParts;
+
+  // A Load group may have gaps.
+  unsigned NumOperands = Opcode == Instruction::Load ? Indices.size() : Factor;
+  int NumPermutes = NumSrcParts * NumOperands;
+  // Cost of load/store operations and the permutations needed.
+  return NumWideParts + NumPermutes;
+}
diff --git a/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index f7d2d827f11b0b1c083dcecd33927d4508631e72..3766ed45b8c4e47c1e5281e2602bf188cc963696 100644
--- a/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -27,6 +27,8 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> {
   const SystemZSubtarget *getST() const { return ST; }
   const SystemZTargetLowering *getTLI() const { return TLI; }
 
+  unsigned const LIBCALL_COST = 30;
+
 public:
   explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F)
       : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
@@ -53,6 +55,32 @@ public:
   unsigned getNumberOfRegisters(bool Vector);
   unsigned getRegisterBitWidth(bool Vector);
 
+  bool supportsEfficientVectorElementLoadStore() { return true; }
+  bool enableInterleavedAccessVectorization() { return true; }
+
+  int getArithmeticInstrCost(
+      unsigned Opcode, Type *Ty,
+      TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
+      TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
+      TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
+      TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
+      ArrayRef<const Value *> Args = ArrayRef<const Value *>());
+  int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+  unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy);
+  unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy);
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                       const Instruction *I = nullptr);
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                         const Instruction *I = nullptr);
+  int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+  int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
+                      unsigned AddressSpace, const Instruction *I = nullptr);
+
+  int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
+                                 unsigned Factor,
+                                 ArrayRef<unsigned> Indices,
+                                 unsigned Alignment,
+                                 unsigned AddressSpace);
   /// @}
 };
 
diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp
index 4beecb4dbb689b065e623b380d4581de3c501a53..e8fe0a2b218ee4760370c30f8bc855abb86e3693 100644
--- a/lib/Target/TargetMachine.cpp
+++ b/lib/Target/TargetMachine.cpp
@@ -35,10 +35,6 @@ cl::opt<bool> EnableIPRA("enable-ipra", cl::init(false), cl::Hidden,
                          cl::desc("Enable interprocedural register allocation "
                                   "to reduce load/store at procedure calls."));
 
-cl::opt<bool> DebugInfoForProfiling(
-    "debug-info-for-profiling", cl::init(false), cl::Hidden,
-    cl::desc("Emit extra debug info to make sample profile more accurate."));
-
 //---------------------------------------------------------------------------
 // TargetMachine Class
 //
@@ -51,8 +47,6 @@ TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString,
       RequireStructuredCFG(false), DefaultOptions(Options), Options(Options) {
   if (EnableIPRA.getNumOccurrences())
     this->Options.EnableIPRA = EnableIPRA;
-  if (DebugInfoForProfiling.getNumOccurrences())
-    this->Options.DebugInfoForProfiling = DebugInfoForProfiling;
 }
 
 TargetMachine::~TargetMachine() {
@@ -80,7 +74,6 @@ void TargetMachine::resetTargetOptions(const Function &F) const {
       Options.X = DefaultOptions.X;                                            \
   } while (0)
 
-  RESET_OPTION(LessPreciseFPMADOption, "less-precise-fpmad");
   RESET_OPTION(UnsafeFPMath, "unsafe-fp-math");
   RESET_OPTION(NoInfsFPMath, "no-infs-fp-math");
   RESET_OPTION(NoNaNsFPMath, "no-nans-fp-math");
diff --git a/lib/Target/WebAssembly/CMakeLists.txt b/lib/Target/WebAssembly/CMakeLists.txt
index d9c53ecc8d084ae35ff7ba9f814f051f4d26fabb..78b2cdb61b76521cd4922c9996bc690f2103e399 100644
--- a/lib/Target/WebAssembly/CMakeLists.txt
+++ b/lib/Target/WebAssembly/CMakeLists.txt
@@ -14,6 +14,7 @@ add_llvm_target(WebAssemblyCodeGen
   WebAssemblyAsmPrinter.cpp
   WebAssemblyCallIndirectFixup.cpp
   WebAssemblyCFGStackify.cpp
+  WebAssemblyCFGSort.cpp
   WebAssemblyExplicitLocals.cpp
   WebAssemblyFastISel.cpp
   WebAssemblyFixIrreducibleControlFlow.cpp
@@ -35,6 +36,7 @@ add_llvm_target(WebAssemblyCodeGen
   WebAssemblyRegNumbering.cpp
   WebAssemblyRegStackify.cpp
   WebAssemblyReplacePhysRegs.cpp
+  WebAssemblyRuntimeLibcallSignatures.cpp
   WebAssemblySelectionDAGInfo.cpp
   WebAssemblySetP2AlignOperands.cpp
   WebAssemblyStoreResults.cpp
diff --git a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
index b4763ca60ab62555d4653a58df21e0205b0c5c9d..b5f53114d3e16fc2605b699a12b249f09dc71c75 100644
--- a/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
+++ b/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -63,89 +63,8 @@ extern "C" void LLVMInitializeWebAssemblyDisassembler() {
 MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
     MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t /*Address*/,
     raw_ostream &OS, raw_ostream &CS) const {
-  Size = 0;
-  uint64_t Pos = 0;
 
-  // Read the opcode.
-  if (Pos + sizeof(uint64_t) > Bytes.size())
-    return MCDisassembler::Fail;
-  uint64_t Opcode = support::endian::read64le(Bytes.data() + Pos);
-  Pos += sizeof(uint64_t);
+  // TODO: Implement disassembly.
 
-  if (Opcode >= WebAssembly::INSTRUCTION_LIST_END)
-    return MCDisassembler::Fail;
-
-  MI.setOpcode(Opcode);
-  const MCInstrDesc &Desc = MCII->get(Opcode);
-  unsigned NumFixedOperands = Desc.NumOperands;
-
-  // If it's variadic, read the number of extra operands.
-  unsigned NumExtraOperands = 0;
-  if (Desc.isVariadic()) {
-    if (Pos + sizeof(uint64_t) > Bytes.size())
-      return MCDisassembler::Fail;
-    NumExtraOperands = support::endian::read64le(Bytes.data() + Pos);
-    Pos += sizeof(uint64_t);
-  }
-
-  // Read the fixed operands. These are described by the MCInstrDesc.
-  for (unsigned i = 0; i < NumFixedOperands; ++i) {
-    const MCOperandInfo &Info = Desc.OpInfo[i];
-    switch (Info.OperandType) {
-    case MCOI::OPERAND_IMMEDIATE:
-    case WebAssembly::OPERAND_LOCAL:
-    case WebAssembly::OPERAND_P2ALIGN:
-    case WebAssembly::OPERAND_BASIC_BLOCK: {
-      if (Pos + sizeof(uint64_t) > Bytes.size())
-        return MCDisassembler::Fail;
-      uint64_t Imm = support::endian::read64le(Bytes.data() + Pos);
-      Pos += sizeof(uint64_t);
-      MI.addOperand(MCOperand::createImm(Imm));
-      break;
-    }
-    case MCOI::OPERAND_REGISTER: {
-      if (Pos + sizeof(uint64_t) > Bytes.size())
-        return MCDisassembler::Fail;
-      uint64_t Reg = support::endian::read64le(Bytes.data() + Pos);
-      Pos += sizeof(uint64_t);
-      MI.addOperand(MCOperand::createReg(Reg));
-      break;
-    }
-    case WebAssembly::OPERAND_F32IMM:
-    case WebAssembly::OPERAND_F64IMM: {
-      // TODO: MC converts all floating point immediate operands to double.
-      // This is fine for numeric values, but may cause NaNs to change bits.
-      if (Pos + sizeof(uint64_t) > Bytes.size())
-        return MCDisassembler::Fail;
-      uint64_t Bits = support::endian::read64le(Bytes.data() + Pos);
-      Pos += sizeof(uint64_t);
-      double Imm;
-      memcpy(&Imm, &Bits, sizeof(Imm));
-      MI.addOperand(MCOperand::createFPImm(Imm));
-      break;
-    }
-    default:
-      llvm_unreachable("unimplemented operand kind");
-    }
-  }
-
-  // Read the extra operands.
-  assert(NumExtraOperands == 0 || Desc.isVariadic());
-  for (unsigned i = 0; i < NumExtraOperands; ++i) {
-    if (Pos + sizeof(uint64_t) > Bytes.size())
-      return MCDisassembler::Fail;
-    if (Desc.TSFlags & WebAssemblyII::VariableOpIsImmediate) {
-      // Decode extra immediate operands.
-      uint64_t Imm = support::endian::read64le(Bytes.data() + Pos);
-      MI.addOperand(MCOperand::createImm(Imm));
-    } else {
-      // Decode extra register operands.
-      uint64_t Reg = support::endian::read64le(Bytes.data() + Pos);
-      MI.addOperand(MCOperand::createReg(Reg));
-    }
-    Pos += sizeof(uint64_t);
-  }
-
-  Size = Pos;
-  return MCDisassembler::Success;
+  return MCDisassembler::Fail;
 }
diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
index 0af13cffdb04a690b1075423722403276579a37e..f31dde0ce48f1f7557199cc854d84bc0f0d91dd4 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.cpp
@@ -242,3 +242,17 @@ const char *llvm::WebAssembly::TypeToString(MVT Ty) {
     llvm_unreachable("unsupported type");
   }
 }
+
+// Returns the name string used for a wasm value type ("i32", "i64", "f32" or
+// "f64").
+const char *llvm::WebAssembly::TypeToString(wasm::ValType Type) {
+  switch (Type) {
+  case wasm::ValType::I32: return "i32";
+  case wasm::ValType::I64: return "i64";
+  case wasm::ValType::F32: return "f32";
+  case wasm::ValType::F64: return "f64";
+  }
+
+  // Only reached for a value that is none of the enumerators handled above.
+  llvm_unreachable("unsupported type");
+}
diff --git a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
index d11f99c1ff3912a0e0ede4abd1a6bb85b1d8d3fa..c6158720d62f1a4e62d8ef927ad2c2d31b7084d1 100644
--- a/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
+++ b/lib/Target/WebAssembly/InstPrinter/WebAssemblyInstPrinter.h
@@ -18,6 +18,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/MC/MCInstPrinter.h"
+#include "llvm/Support/Wasm.h"
 
 namespace llvm {
 
@@ -50,6 +51,7 @@ public:
 namespace WebAssembly {
 
 const char *TypeToString(MVT Ty);
+const char *TypeToString(wasm::ValType Type);
 
 } // end namespace WebAssembly
 
diff --git a/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt b/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt
index fd41df7b96355445a111e99880d05f8dee1e41c6..13c0fe91590840e14ae6698522f499e6767c1e92 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt
+++ b/lib/Target/WebAssembly/MCTargetDesc/CMakeLists.txt
@@ -5,4 +5,5 @@ add_llvm_library(LLVMWebAssemblyDesc
   WebAssemblyMCCodeEmitter.cpp
   WebAssemblyMCTargetDesc.cpp
   WebAssemblyTargetStreamer.cpp
+  WebAssemblyWasmObjectWriter.cpp
 )
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
index 97454a824a341871f36f2d119d4d16e3f6f6cea6..7c78285fbda4557632a506390b34616ed1596639 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "MCTargetDesc/WebAssemblyFixupKinds.h"
 #include "llvm/MC/MCAsmBackend.h"
 #include "llvm/MC/MCAssembler.h"
 #include "llvm/MC/MCDirectives.h"
@@ -22,21 +23,22 @@
 #include "llvm/MC/MCObjectWriter.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCWasmObjectWriter.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
 namespace {
-class WebAssemblyAsmBackend final : public MCAsmBackend {
+class WebAssemblyAsmBackendELF final : public MCAsmBackend {
   bool Is64Bit;
 
 public:
-  explicit WebAssemblyAsmBackend(bool Is64Bit)
+  explicit WebAssemblyAsmBackendELF(bool Is64Bit)
       : MCAsmBackend(), Is64Bit(Is64Bit) {}
-  ~WebAssemblyAsmBackend() override {}
+  ~WebAssemblyAsmBackendELF() override {}
 
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value, bool IsPCRel) const override;
+                  uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
 
   MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
 
@@ -61,6 +63,95 @@ public:
   bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
 };
 
+// Assembler backend that knows the WebAssembly-specific fixup kinds; the
+// out-of-line members below create the object writer and apply fixups.
+class WebAssemblyAsmBackend final : public MCAsmBackend {
+  bool Is64Bit;
+
+public:
+  explicit WebAssemblyAsmBackend(bool Is64Bit)
+      : MCAsmBackend(), Is64Bit(Is64Bit) {}
+  ~WebAssemblyAsmBackend() override = default;
+
+  MCObjectWriter *createObjectWriter(raw_pwrite_stream &OS) const override;
+
+  // Fixup handling.
+  unsigned getNumFixupKinds() const override {
+    return WebAssembly::NumTargetFixupKinds;
+  }
+
+  const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
+  void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
+                  uint64_t Value, bool IsPCRel, MCContext &Ctx) const override;
+
+  // Relaxation hooks: no instruction requires relaxation.
+  bool mayNeedRelaxation(const MCInst &Inst) const override { return false; }
+  bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+                            const MCRelaxableFragment *DF,
+                            const MCAsmLayout &Layout) const override {
+    return false;
+  }
+  void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
+                        MCInst &Res) const override {}
+
+  bool writeNopData(uint64_t Count, MCObjectWriter *OW) const override;
+};
+
+// Writes WebAssembly::Nop as a single byte Count times; always returns true.
+bool WebAssemblyAsmBackendELF::writeNopData(uint64_t Count,
+                                            MCObjectWriter *OW) const {
+  for (uint64_t Remaining = Count; Remaining != 0; --Remaining)
+    OW->write8(WebAssembly::Nop);
+  return true;
+}
+
+// Masks the shifted fixup Value into the bytes of Data that the fixup covers.
+void WebAssemblyAsmBackendELF::applyFixup(const MCFixup &Fixup, char *Data,
+                                          unsigned DataSize, uint64_t Value,
+                                          bool IsPCRel, MCContext &Ctx) const {
+  const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind());
+  assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags");
+
+  // A zero value doesn't change the encoding.
+  if (Value == 0)
+    return;
+
+  // Shift the value into position.
+  const uint64_t Shifted = Value << Info.TargetOffset;
+  const unsigned NumBytes = alignTo(Info.TargetSize, 8) / 8;
+  const unsigned Offset = Fixup.getOffset();
+  assert(Offset + NumBytes <= DataSize && "Invalid fixup offset!");
+  // Mask in the value's bytes, least significant byte first.
+  for (unsigned Byte = 0; Byte != NumBytes; ++Byte)
+    Data[Offset + Byte] |= uint8_t((Shifted >> (Byte * 8)) & 0xff);
+}
+
+// Creates the WebAssembly ELF object writer for this backend's word size.
+MCObjectWriter *
+WebAssemblyAsmBackendELF::createObjectWriter(raw_pwrite_stream &OS) const {
+  return createWebAssemblyELFObjectWriter(OS, Is64Bit, 0);
+}
+
+const MCFixupKindInfo &
+WebAssemblyAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+  const static MCFixupKindInfo Infos[WebAssembly::NumTargetFixupKinds] = {
+    // Rows *must* be in the order that the fixup_* kinds are defined in
+    // WebAssemblyFixupKinds.h.  NOTE(review): fixup_code_global_index has no
+    // row here, so Infos[3] is value-initialized (null Name, size 0): confirm.
+    // Name                     Offset (bits) Size (bits)     Flags
+    { "fixup_code_sleb128_i32", 0,            5*8,            0 },
+    { "fixup_code_sleb128_i64", 0,            10*8,           0 },
+    { "fixup_code_uleb128_i32", 0,            5*8,            0 },
+  };
+  // Kinds below FirstTargetFixupKind are delegated to MCAsmBackend.
+  if (Kind < FirstTargetFixupKind)
+    return MCAsmBackend::getFixupKindInfo(Kind);
+
+  assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+         "Invalid kind!");
+  return Infos[Kind - FirstTargetFixupKind];
+}
+
 bool WebAssemblyAsmBackend::writeNopData(uint64_t Count,
                                          MCObjectWriter *OW) const {
   if (Count == 0)
@@ -74,11 +165,11 @@ bool WebAssemblyAsmBackend::writeNopData(uint64_t Count,
 
 void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
                                        unsigned DataSize, uint64_t Value,
-                                       bool IsPCRel) const {
+                                       bool IsPCRel, MCContext &Ctx) const {
   const MCFixupKindInfo &Info = getFixupKindInfo(Fixup.getKind());
   assert(Info.Flags == 0 && "WebAssembly does not use MCFixupKindInfo flags");
 
-  unsigned NumBytes = (Info.TargetSize + 7) / 8;
+  unsigned NumBytes = alignTo(Info.TargetSize, 8) / 8;
   if (Value == 0)
     return; // Doesn't change encoding.
 
@@ -96,10 +187,12 @@ void WebAssemblyAsmBackend::applyFixup(const MCFixup &Fixup, char *Data,
 
 MCObjectWriter *
 WebAssemblyAsmBackend::createObjectWriter(raw_pwrite_stream &OS) const {
-  return createWebAssemblyELFObjectWriter(OS, Is64Bit, 0);
+  return createWebAssemblyWasmObjectWriter(OS, Is64Bit);
 }
 } // end anonymous namespace
 
 MCAsmBackend *llvm::createWebAssemblyAsmBackend(const Triple &TT) {
+  if (TT.isOSBinFormatELF())
+    return new WebAssemblyAsmBackendELF(TT.isArch64Bit());
   return new WebAssemblyAsmBackend(TT.isArch64Bit());
 }
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h
new file mode 100644
index 0000000000000000000000000000000000000000..b0af63c924bd600b531805c5dd703b9b2a16d574
--- /dev/null
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h
@@ -0,0 +1,31 @@
+//=- WebAssemblyFixupKinds.h - WebAssembly Specific Fixup Entries -*- C++ -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYFIXUPKINDS_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace WebAssembly {
+enum Fixups {
+  fixup_code_sleb128_i32 = FirstTargetFixupKind,      // 32-bit signed LEB128
+  fixup_code_sleb128_i64,                             // 64-bit signed LEB128
+  fixup_code_uleb128_i32,                             // 32-bit unsigned LEB128
+  // NOTE(review): presumably the index of a wasm global -- confirm encoding.
+  fixup_code_global_index,                            // 32-bit unsigned
+
+  // Marker: one past the last target fixup kind.
+  LastTargetFixupKind,
+  NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+} // end namespace WebAssembly
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
index d8c39216c53b5f3b79a87c4030e44eec043c376b..2dcec5263fa1e0715924f4060bfa477845f324e4 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.cpp
@@ -19,9 +19,9 @@ using namespace llvm;
 
 #define DEBUG_TYPE "wasm-mc-asm-info"
 
-WebAssemblyMCAsmInfo::~WebAssemblyMCAsmInfo() {}
+WebAssemblyMCAsmInfoELF::~WebAssemblyMCAsmInfoELF() {}
 
-WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
+WebAssemblyMCAsmInfoELF::WebAssemblyMCAsmInfoELF(const Triple &T) {
   PointerSize = CalleeSaveStackSlotSize = T.isArch64Bit() ? 8 : 4;
 
   // TODO: What should MaxInstLength be?
@@ -51,3 +51,33 @@ WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
   // WebAssembly's stack is never executable.
   UsesNonexecutableStackSection = false;
 }
+
+WebAssemblyMCAsmInfo::~WebAssemblyMCAsmInfo() {}
+
+// Sets the pointer size, assembler directives and feature flags for triple T.
+WebAssemblyMCAsmInfo::WebAssemblyMCAsmInfo(const Triple &T) {
+  const unsigned PtrBytes = T.isArch64Bit() ? 8 : 4;
+  CalleeSaveStackSlotSize = PtrBytes;
+  PointerSize = PtrBytes;
+
+  // TODO: What should MaxInstLength be?
+
+  UseDataRegionDirectives = true;
+  SupportsDebugInformation = true;
+
+  AlignmentIsInBytes = false;
+  COMMDirectiveAlignmentIsInBytes = false;
+  LCOMMDirectiveAlignmentType = LCOMM::Log2Alignment;
+
+  // Use .skip instead of .zero because .zero is confusing when used with two
+  // arguments (it doesn't actually zero things out).
+  ZeroDirective = "\t.skip\t";
+  Data8bitsDirective = "\t.int8\t";
+  Data16bitsDirective = "\t.int16\t";
+  Data32bitsDirective = "\t.int32\t";
+  Data64bitsDirective = "\t.int64\t";
+
+  // For now, WebAssembly does not support exceptions.
+  ExceptionsType = ExceptionHandling::None;
+  // TODO: UseIntegratedAssembler?
+}
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
index 2dcf2cd3c892aae2553fc490244f776b025385ad..d9547096190ead5293933f3c08d5c26f1d996383 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCAsmInfo.h
@@ -16,12 +16,19 @@
 #define LLVM_LIB_TARGET_WEBASSEMBLY_MCTARGETDESC_WEBASSEMBLYMCASMINFO_H
 
 #include "llvm/MC/MCAsmInfoELF.h"
+#include "llvm/MC/MCAsmInfoWasm.h"
 
 namespace llvm {
 
 class Triple;
 
-class WebAssemblyMCAsmInfo final : public MCAsmInfoELF {
+class WebAssemblyMCAsmInfoELF final : public MCAsmInfoELF {
+public:
+  explicit WebAssemblyMCAsmInfoELF(const Triple &T);
+  ~WebAssemblyMCAsmInfoELF() override;
+};
+
+class WebAssemblyMCAsmInfo final : public MCAsmInfoWasm {
 public:
   explicit WebAssemblyMCAsmInfo(const Triple &T);
   ~WebAssemblyMCAsmInfo() override;
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
index d0e0eecd3002cac66fd75ece91a119fae5980df2..a0b008947491a1cbfb276d124ecd5ef24ebd6d2c 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "MCTargetDesc/WebAssemblyFixupKinds.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/MC/MCCodeEmitter.h"
@@ -35,6 +36,7 @@ STATISTIC(MCNumFixups, "Number of MC fixups created.");
 namespace {
 class WebAssemblyMCCodeEmitter final : public MCCodeEmitter {
   const MCInstrInfo &MCII;
+  MCContext &Ctx;
 
   // Implementation generated by tablegen.
   uint64_t getBinaryCodeForInstr(const MCInst &MI,
@@ -46,12 +48,14 @@ class WebAssemblyMCCodeEmitter final : public MCCodeEmitter {
                          const MCSubtargetInfo &STI) const override;
 
 public:
-  explicit WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii) : MCII(mcii) {}
+  WebAssemblyMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+      : MCII(mcii), Ctx(ctx) {}
 };
 } // end anonymous namespace
 
-MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII) {
-  return new WebAssemblyMCCodeEmitter(MCII);
+MCCodeEmitter *llvm::createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII,
+                                                    MCContext &Ctx) {
+  return new WebAssemblyMCCodeEmitter(MCII, Ctx);
 }
 
 void WebAssemblyMCCodeEmitter::encodeInstruction(
@@ -63,6 +67,13 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
   assert(Binary < UINT8_MAX && "Multi-byte opcodes not supported yet");
   OS << uint8_t(Binary);
 
+  // For br_table instructions, encode the size of the table. In the MCInst,
+  // there's an index operand, one operand for each table entry, and the
+  // default operand.
+  if (MI.getOpcode() == WebAssembly::BR_TABLE_I32 ||
+      MI.getOpcode() == WebAssembly::BR_TABLE_I64)
+    encodeULEB128(MI.getNumOperands() - 2, OS);
+
   const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
   for (unsigned i = 0, e = MI.getNumOperands(); i < e; ++i) {
     const MCOperand &MO = MI.getOperand(i);
@@ -77,6 +88,12 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
           encodeSLEB128(int32_t(MO.getImm()), OS);
         } else if (Info.OperandType == WebAssembly::OPERAND_I64IMM) {
           encodeSLEB128(int64_t(MO.getImm()), OS);
+        } else if (Info.OperandType == WebAssembly::OPERAND_GLOBAL) {
+          Fixups.push_back(MCFixup::create(
+              OS.tell() - Start, MCConstantExpr::create(MO.getImm(), Ctx),
+              MCFixupKind(WebAssembly::fixup_code_global_index), MI.getLoc()));
+          ++MCNumFixups;
+          encodeULEB128(uint64_t(MO.getImm()), OS);
         } else {
           encodeULEB128(uint64_t(MO.getImm()), OS);
         }
@@ -102,14 +119,28 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
         support::endian::Writer<support::little>(OS).write<double>(d);
       }
     } else if (MO.isExpr()) {
+      const MCOperandInfo &Info = Desc.OpInfo[i];
+      llvm::MCFixupKind FixupKind;
+      size_t PaddedSize;
+      if (Info.OperandType == WebAssembly::OPERAND_I32IMM) {
+        FixupKind = MCFixupKind(WebAssembly::fixup_code_sleb128_i32);
+        PaddedSize = 5;
+      } else if (Info.OperandType == WebAssembly::OPERAND_I64IMM) {
+        FixupKind = MCFixupKind(WebAssembly::fixup_code_sleb128_i64);
+        PaddedSize = 10;
+      } else if (Info.OperandType == WebAssembly::OPERAND_FUNCTION32 ||
+                 Info.OperandType == WebAssembly::OPERAND_OFFSET32 ||
+                 Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) {
+        FixupKind = MCFixupKind(WebAssembly::fixup_code_uleb128_i32);
+        PaddedSize = 5;
+      } else {
+        llvm_unreachable("unexpected symbolic operand kind");
+      }
       Fixups.push_back(MCFixup::create(
           OS.tell() - Start, MO.getExpr(),
-          STI.getTargetTriple().isArch64Bit() ? FK_Data_8 : FK_Data_4,
-          MI.getLoc()));
+          FixupKind, MI.getLoc()));
       ++MCNumFixups;
-      encodeULEB128(STI.getTargetTriple().isArch64Bit() ? UINT64_MAX
-                                                        : uint64_t(UINT32_MAX),
-                    OS);
+      encodeULEB128(0, OS, PaddedSize - 1);
     } else {
       llvm_unreachable("unexpected operand kind");
     }
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
index 3dc1ded1711641cdc620dd20e72fa5d9f18ddbf9..9fd3ec81c258f4209e972611c462eb8f191671f3 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.cpp
@@ -36,6 +36,8 @@ using namespace llvm;
 
 static MCAsmInfo *createMCAsmInfo(const MCRegisterInfo & /*MRI*/,
                                   const Triple &TT) {
+  if (TT.isOSBinFormatELF())
+    return new WebAssemblyMCAsmInfoELF(TT);
   return new WebAssemblyMCAsmInfo(TT);
 }
 
@@ -71,8 +73,8 @@ static MCInstPrinter *createMCInstPrinter(const Triple & /*T*/,
 
 static MCCodeEmitter *createCodeEmitter(const MCInstrInfo &MCII,
                                         const MCRegisterInfo & /*MRI*/,
-                                        MCContext & /*Ctx*/) {
-  return createWebAssemblyMCCodeEmitter(MCII);
+                                        MCContext &Ctx) {
+  return createWebAssemblyMCCodeEmitter(MCII, Ctx);
 }
 
 static MCAsmBackend *createAsmBackend(const Target & /*T*/,
@@ -88,8 +90,12 @@ static MCSubtargetInfo *createMCSubtargetInfo(const Triple &TT, StringRef CPU,
 }
 
 static MCTargetStreamer *
-createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo & /*STI*/) {
-  return new WebAssemblyTargetELFStreamer(S);
+createObjectTargetStreamer(MCStreamer &S, const MCSubtargetInfo &STI) {
+  const Triple &TT = STI.getTargetTriple();
+  if (TT.isOSBinFormatELF())
+    return new WebAssemblyTargetELFStreamer(S);
+
+  return new WebAssemblyTargetWasmStreamer(S);
 }
 
 static MCTargetStreamer *createAsmTargetStreamer(MCStreamer &S,
@@ -135,12 +141,12 @@ extern "C" void LLVMInitializeWebAssemblyTargetMC() {
   }
 }
 
-WebAssembly::ValType WebAssembly::toValType(const MVT &Ty) {
+wasm::ValType WebAssembly::toValType(const MVT &Ty) {
   switch (Ty.SimpleTy) {
-  case MVT::i32: return WebAssembly::ValType::I32;
-  case MVT::i64: return WebAssembly::ValType::I64;
-  case MVT::f32: return WebAssembly::ValType::F32;
-  case MVT::f64: return WebAssembly::ValType::F64;
+  case MVT::i32: return wasm::ValType::I32;
+  case MVT::i64: return wasm::ValType::I64;
+  case MVT::f32: return wasm::ValType::F32;
+  case MVT::f64: return wasm::ValType::F64;
   default: llvm_unreachable("unexpected type");
   }
 }
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index 8583b772deab2b7bb59afc910173d2ac54ba1be2..795658ca96b4c0dd2ac53e866ea117a985157fdf 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -17,6 +17,7 @@
 
 #include "llvm/MC/MCInstrDesc.h"
 #include "llvm/Support/DataTypes.h"
+#include "llvm/Support/Wasm.h"
 
 namespace llvm {
 
@@ -34,19 +35,25 @@ class raw_pwrite_stream;
 Target &getTheWebAssemblyTarget32();
 Target &getTheWebAssemblyTarget64();
 
-MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII);
+MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII,
+                                              MCContext &Ctx);
 
 MCAsmBackend *createWebAssemblyAsmBackend(const Triple &TT);
 
 MCObjectWriter *createWebAssemblyELFObjectWriter(raw_pwrite_stream &OS,
                                                  bool Is64Bit, uint8_t OSABI);
 
+MCObjectWriter *createWebAssemblyWasmObjectWriter(raw_pwrite_stream &OS,
+                                                  bool Is64Bit);
+
 namespace WebAssembly {
 enum OperandType {
   /// Basic block label in a branch construct.
   OPERAND_BASIC_BLOCK = MCOI::OPERAND_FIRST_TARGET,
   /// Local index.
   OPERAND_LOCAL,
+  /// Global index.
+  OPERAND_GLOBAL,
   /// 32-bit integer immediates.
   OPERAND_I32IMM,
   /// 64-bit integer immediates.
@@ -62,7 +69,9 @@ enum OperandType {
   /// p2align immediate for load and store address alignment.
   OPERAND_P2ALIGN,
   /// signature immediate for block/loop.
-  OPERAND_SIGNATURE
+  OPERAND_SIGNATURE,
+  /// type signature immediate for call_indirect.
+  OPERAND_TYPEINDEX,
 };
 } // end namespace WebAssembly
 
@@ -141,40 +150,25 @@ static const unsigned StoreP2AlignOperandNo = 0;
 
 /// This is used to indicate block signatures.
 enum class ExprType {
-  Void    = 0x40,
-  I32     = 0x7f,
-  I64     = 0x7e,
-  F32     = 0x7d,
-  F64     = 0x7c,
-  I8x16   = 0x7b,
-  I16x8   = 0x7a,
-  I32x4   = 0x79,
-  F32x4   = 0x78,
-  B8x16   = 0x77,
-  B16x8   = 0x76,
-  B32x4   = 0x75
-};
-
-/// This is used to indicate local types.
-enum class ValType {
-  I32     = 0x7f,
-  I64     = 0x7e,
-  F32     = 0x7d,
-  F64     = 0x7c,
-  I8x16   = 0x7b,
-  I16x8   = 0x7a,
-  I32x4   = 0x79,
-  F32x4   = 0x78,
-  B8x16   = 0x77,
-  B16x8   = 0x76,
-  B32x4   = 0x75
+  Void    = -0x40,
+  I32     = -0x01,
+  I64     = -0x02,
+  F32     = -0x03,
+  F64     = -0x04,
+  I8x16   = -0x05,
+  I16x8   = -0x06,
+  I32x4   = -0x07,
+  F32x4   = -0x08,
+  B8x16   = -0x09,
+  B16x8   = -0x0a,
+  B32x4   = -0x0b
 };
 
 /// Instruction opcodes emitted via means other than CodeGen.
 static const unsigned Nop = 0x01;
 static const unsigned End = 0x0b;
 
-ValType toValType(const MVT &Ty);
+wasm::ValType toValType(const MVT &Ty);
 
 } // end namespace WebAssembly
 } // end namespace llvm
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
index 3cee8b2a184442585ba48432beb142336e6922d0..ad59f2f405879e90cb8470c116b47243f1af3427 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
@@ -18,9 +18,11 @@
 #include "WebAssemblyMCTargetDesc.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCSectionELF.h"
+#include "llvm/MC/MCSectionWasm.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbolELF.h"
-#include "llvm/Support/ELF.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormattedStream.h"
 using namespace llvm;
@@ -28,6 +30,10 @@ using namespace llvm;
 WebAssemblyTargetStreamer::WebAssemblyTargetStreamer(MCStreamer &S)
     : MCTargetStreamer(S) {}
 
+void WebAssemblyTargetStreamer::emitValueType(wasm::ValType Type) {
+  Streamer.EmitSLEB128IntValue(int32_t(Type));
+}
+
 WebAssemblyTargetAsmStreamer::WebAssemblyTargetAsmStreamer(
     MCStreamer &S, formatted_raw_ostream &OS)
     : WebAssemblyTargetStreamer(S), OS(OS) {}
@@ -35,6 +41,9 @@ WebAssemblyTargetAsmStreamer::WebAssemblyTargetAsmStreamer(
 WebAssemblyTargetELFStreamer::WebAssemblyTargetELFStreamer(MCStreamer &S)
     : WebAssemblyTargetStreamer(S) {}
 
+WebAssemblyTargetWasmStreamer::WebAssemblyTargetWasmStreamer(MCStreamer &S)
+    : WebAssemblyTargetStreamer(S) {}
+
 static void PrintTypes(formatted_raw_ostream &OS, ArrayRef<MVT> Types) {
   bool First = true;
   for (MVT Type : Types) {
@@ -47,14 +56,28 @@ static void PrintTypes(formatted_raw_ostream &OS, ArrayRef<MVT> Types) {
   OS << '\n';
 }
 
-void WebAssemblyTargetAsmStreamer::emitParam(ArrayRef<MVT> Types) {
-  OS << "\t.param  \t";
-  PrintTypes(OS, Types);
+void WebAssemblyTargetAsmStreamer::emitParam(MCSymbol *Symbol,
+                                             ArrayRef<MVT> Types) {
+  if (!Types.empty()) {
+    OS << "\t.param  \t";
+
+    // FIXME: Currently this applies to the "current" function; it may
+    // be cleaner to specify an explicit symbol as part of the directive.
+
+    PrintTypes(OS, Types);
+  }
 }
 
-void WebAssemblyTargetAsmStreamer::emitResult(ArrayRef<MVT> Types) {
-  OS << "\t.result \t";
-  PrintTypes(OS, Types);
+void WebAssemblyTargetAsmStreamer::emitResult(MCSymbol *Symbol,
+                                              ArrayRef<MVT> Types) {
+  if (!Types.empty()) {
+    OS << "\t.result \t";
+
+    // FIXME: Currently this applies to the "current" function; it may
+    // be cleaner to specify an explicit symbol as part of the directive.
+
+    PrintTypes(OS, Types);
+  }
 }
 
 void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<MVT> Types) {
@@ -64,6 +87,31 @@ void WebAssemblyTargetAsmStreamer::emitLocal(ArrayRef<MVT> Types) {
   }
 }
 
+void WebAssemblyTargetAsmStreamer::emitGlobal(
+    ArrayRef<wasm::Global> Globals) {
+  if (!Globals.empty()) {
+    OS << "\t.globalvar  \t";
+
+    bool First = true;
+    for (const wasm::Global &G : Globals) {
+      if (First)
+        First = false;
+      else
+        OS << ", ";
+      OS << WebAssembly::TypeToString(G.Type);
+      if (!G.InitialModule.empty())
+        OS << '=' << G.InitialModule << ':' << G.InitialName;
+      else
+        OS << '=' << G.InitialValue;
+    }
+    OS << '\n';
+  }
+}
+
+void WebAssemblyTargetAsmStreamer::emitStackPointer(uint32_t Index) {
+  OS << "\t.stack_pointer\t" << Index << '\n';
+}
+
 void WebAssemblyTargetAsmStreamer::emitEndFunc() { OS << "\t.endfunc\n"; }
 
 void WebAssemblyTargetAsmStreamer::emitIndirectFunctionType(
@@ -88,18 +136,30 @@ void WebAssemblyTargetAsmStreamer::emitIndIdx(const MCExpr *Value) {
   OS << "\t.indidx  \t" << *Value << '\n';
 }
 
-void WebAssemblyTargetELFStreamer::emitParam(ArrayRef<MVT> Types) {
+void WebAssemblyTargetELFStreamer::emitParam(MCSymbol *Symbol,
+                                             ArrayRef<MVT> Types) {
   // Nothing to emit; params are declared as part of the function signature.
 }
 
-void WebAssemblyTargetELFStreamer::emitResult(ArrayRef<MVT> Types) {
+void WebAssemblyTargetELFStreamer::emitResult(MCSymbol *Symbol,
+                                              ArrayRef<MVT> Types) {
   // Nothing to emit; results are declared as part of the function signature.
 }
 
 void WebAssemblyTargetELFStreamer::emitLocal(ArrayRef<MVT> Types) {
   Streamer.EmitULEB128IntValue(Types.size());
   for (MVT Type : Types)
-    Streamer.EmitIntValue(int64_t(WebAssembly::toValType(Type)), 1);
+    emitValueType(WebAssembly::toValType(Type));
+}
+
+void WebAssemblyTargetELFStreamer::emitGlobal(
+    ArrayRef<wasm::Global> Globals) {
+  llvm_unreachable(".globalvar encoding not yet implemented");
+}
+
+void WebAssemblyTargetELFStreamer::emitStackPointer(
+    uint32_t Index) {
+  llvm_unreachable(".stack_pointer encoding not yet implemented");
 }
 
 void WebAssemblyTargetELFStreamer::emitEndFunc() {
@@ -117,4 +177,88 @@ void WebAssemblyTargetELFStreamer::emitIndirectFunctionType(
 }
 
 void WebAssemblyTargetELFStreamer::emitGlobalImport(StringRef name) {
-}
\ No newline at end of file
+}
+
+void WebAssemblyTargetWasmStreamer::emitParam(MCSymbol *Symbol,
+                                              ArrayRef<MVT> Types) {
+  SmallVector<wasm::ValType, 4> Params;
+  for (MVT Ty : Types)
+    Params.push_back(WebAssembly::toValType(Ty));
+
+  cast<MCSymbolWasm>(Symbol)->setParams(std::move(Params));
+}
+
+void WebAssemblyTargetWasmStreamer::emitResult(MCSymbol *Symbol,
+                                               ArrayRef<MVT> Types) {
+  SmallVector<wasm::ValType, 4> Returns;
+  for (MVT Ty : Types)
+    Returns.push_back(WebAssembly::toValType(Ty));
+
+  cast<MCSymbolWasm>(Symbol)->setReturns(std::move(Returns));
+}
+
+void WebAssemblyTargetWasmStreamer::emitLocal(ArrayRef<MVT> Types) {
+  SmallVector<std::pair<MVT, uint32_t>, 4> Grouped;
+  for (MVT Type : Types) {
+    if (Grouped.empty() || Grouped.back().first != Type)
+      Grouped.push_back(std::make_pair(Type, 1));
+    else
+      ++Grouped.back().second;
+  }
+
+  Streamer.EmitULEB128IntValue(Grouped.size());
+  for (auto Pair : Grouped) {
+    Streamer.EmitULEB128IntValue(Pair.second);
+    emitValueType(WebAssembly::toValType(Pair.first));
+  }
+}
+
+void WebAssemblyTargetWasmStreamer::emitGlobal(
+    ArrayRef<wasm::Global> Globals) {
+  // Encode the globals used by the function into the special .global_variables
+  // section. This will later be decoded and turned into contents for the
+  // Globals Section.
+  Streamer.PushSection();
+  Streamer.SwitchSection(Streamer.getContext()
+                                 .getWasmSection(".global_variables", 0, 0));
+  for (const wasm::Global &G : Globals) {
+    Streamer.EmitIntValue(int32_t(G.Type), 1);
+    Streamer.EmitIntValue(G.Mutable, 1);
+    if (G.InitialModule.empty()) {
+      Streamer.EmitIntValue(0, 1); // indicate that we have an int value
+      Streamer.EmitSLEB128IntValue(G.InitialValue);
+    } else {
+      Streamer.EmitIntValue(1, 1); // indicate that we have a module import
+      Streamer.EmitBytes(G.InitialModule);
+      Streamer.EmitIntValue(0, 1); // nul-terminate
+      Streamer.EmitBytes(G.InitialName);
+      Streamer.EmitIntValue(0, 1); // nul-terminate
+    }
+  }
+  Streamer.PopSection();
+}
+
+void WebAssemblyTargetWasmStreamer::emitStackPointer(uint32_t Index) {
+  Streamer.PushSection();
+  Streamer.SwitchSection(Streamer.getContext()
+                                 .getWasmSection(".stack_pointer", 0, 0));
+  Streamer.EmitIntValue(Index, 4);
+  Streamer.PopSection();
+}
+
+void WebAssemblyTargetWasmStreamer::emitEndFunc() {
+  llvm_unreachable(".end_func is not needed for direct wasm output");
+}
+
+void WebAssemblyTargetWasmStreamer::emitIndIdx(const MCExpr *Value) {
+  llvm_unreachable(".indidx encoding not yet implemented");
+}
+
+void WebAssemblyTargetWasmStreamer::emitIndirectFunctionType(
+    StringRef name, SmallVectorImpl<MVT> &Params, SmallVectorImpl<MVT> &Results) {
+  // Nothing to emit here. TODO: Re-design how linking works and re-evaluate
+  // whether it's necessary for .o files to declare indirect function types.
+}
+
+void WebAssemblyTargetWasmStreamer::emitGlobalImport(StringRef name) {
+}
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
index 23ac3190243ad126178f92b1c8c118de845fa954..68d6747298dfc537e8e4bfd6f6a759bc33b7f587 100644
--- a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
@@ -18,10 +18,12 @@
 
 #include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/MC/MCStreamer.h"
+#include "llvm/Support/Wasm.h"
 
 namespace llvm {
 
 class MCELFStreamer;
+class MCWasmStreamer;
 
 /// WebAssembly-specific streamer interface, to implement support
 /// WebAssembly-specific assembly directives.
@@ -30,11 +32,15 @@ public:
   explicit WebAssemblyTargetStreamer(MCStreamer &S);
 
   /// .param
-  virtual void emitParam(ArrayRef<MVT> Types) = 0;
+  virtual void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) = 0;
   /// .result
-  virtual void emitResult(ArrayRef<MVT> Types) = 0;
+  virtual void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) = 0;
   /// .local
   virtual void emitLocal(ArrayRef<MVT> Types) = 0;
+  /// .globalvar
+  virtual void emitGlobal(ArrayRef<wasm::Global> Globals) = 0;
+  /// .stack_pointer
+  virtual void emitStackPointer(uint32_t Index) = 0;
   /// .endfunc
   virtual void emitEndFunc() = 0;
   /// .functype
@@ -47,6 +53,9 @@ public:
   virtual void emitIndIdx(const MCExpr *Value) = 0;
   /// .import_global
   virtual void emitGlobalImport(StringRef name) = 0;
+
+protected:
+  void emitValueType(wasm::ValType Type);
 };
 
 /// This part is for ascii assembly output
@@ -56,9 +65,11 @@ class WebAssemblyTargetAsmStreamer final : public WebAssemblyTargetStreamer {
 public:
   WebAssemblyTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
 
-  void emitParam(ArrayRef<MVT> Types) override;
-  void emitResult(ArrayRef<MVT> Types) override;
+  void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
+  void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
   void emitLocal(ArrayRef<MVT> Types) override;
+  void emitGlobal(ArrayRef<wasm::Global> Globals) override;
+  void emitStackPointer(uint32_t Index) override;
   void emitEndFunc() override;
   void emitIndirectFunctionType(StringRef name,
                                 SmallVectorImpl<MVT> &Params,
@@ -72,9 +83,29 @@ class WebAssemblyTargetELFStreamer final : public WebAssemblyTargetStreamer {
 public:
   explicit WebAssemblyTargetELFStreamer(MCStreamer &S);
 
-  void emitParam(ArrayRef<MVT> Types) override;
-  void emitResult(ArrayRef<MVT> Types) override;
+  void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
+  void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
+  void emitLocal(ArrayRef<MVT> Types) override;
+  void emitGlobal(ArrayRef<wasm::Global> Globals) override;
+  void emitStackPointer(uint32_t Index) override;
+  void emitEndFunc() override;
+  void emitIndirectFunctionType(StringRef name,
+                                SmallVectorImpl<MVT> &Params,
+                                SmallVectorImpl<MVT> &Results) override;
+  void emitIndIdx(const MCExpr *Value) override;
+  void emitGlobalImport(StringRef name) override;
+};
+
+/// This part is for Wasm object output
+class WebAssemblyTargetWasmStreamer final : public WebAssemblyTargetStreamer {
+public:
+  explicit WebAssemblyTargetWasmStreamer(MCStreamer &S);
+
+  void emitParam(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
+  void emitResult(MCSymbol *Symbol, ArrayRef<MVT> Types) override;
   void emitLocal(ArrayRef<MVT> Types) override;
+  void emitGlobal(ArrayRef<wasm::Global> Globals) override;
+  void emitStackPointer(uint32_t Index) override;
   void emitEndFunc() override;
   void emitIndirectFunctionType(StringRef name,
                                 SmallVectorImpl<MVT> &Params,
diff --git a/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..2846ec5e933773e0349d415e74bf91d2010d9998
--- /dev/null
+++ b/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
@@ -0,0 +1,92 @@
+//===-- WebAssemblyWasmObjectWriter.cpp - WebAssembly Wasm Writer ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file handles Wasm-specific object emission, converting LLVM's
+/// internal fixups into the appropriate relocations.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "MCTargetDesc/WebAssemblyFixupKinds.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCSymbolWasm.h"
+#include "llvm/MC/MCWasmObjectWriter.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Wasm.h"
+using namespace llvm;
+
+namespace {
+class WebAssemblyWasmObjectWriter final : public MCWasmObjectTargetWriter {
+public:
+  explicit WebAssemblyWasmObjectWriter(bool Is64Bit);
+
+private:
+  unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                        const MCFixup &Fixup, bool IsPCRel) const override;
+};
+} // end anonymous namespace
+
+WebAssemblyWasmObjectWriter::WebAssemblyWasmObjectWriter(bool Is64Bit)
+    : MCWasmObjectTargetWriter(Is64Bit) {}
+
+// Test whether the given expression computes a function address.
+static bool IsFunctionExpr(const MCExpr *Expr) {
+  if (const MCSymbolRefExpr *SyExp =
+          dyn_cast<MCSymbolRefExpr>(Expr))
+    return cast<MCSymbolWasm>(SyExp->getSymbol()).isFunction();
+
+  if (const MCBinaryExpr *BinOp =
+          dyn_cast<MCBinaryExpr>(Expr))
+    return IsFunctionExpr(BinOp->getLHS()) != IsFunctionExpr(BinOp->getRHS());
+
+  if (const MCUnaryExpr *UnOp =
+          dyn_cast<MCUnaryExpr>(Expr))
+    return IsFunctionExpr(UnOp->getSubExpr());
+
+  return false;
+}
+
+unsigned WebAssemblyWasmObjectWriter::getRelocType(MCContext &Ctx,
+                                                   const MCValue &Target,
+                                                   const MCFixup &Fixup,
+                                                   bool IsPCRel) const {
+  // WebAssembly functions are not allocated in the data address space. To
+  // resolve a pointer to a function, we must use a special relocation type.
+  bool IsFunction = IsFunctionExpr(Fixup.getValue());
+
+  assert(!IsPCRel);
+  switch (unsigned(Fixup.getKind())) {
+  case WebAssembly::fixup_code_sleb128_i32:
+    if (IsFunction)
+      return wasm::R_WEBASSEMBLY_TABLE_INDEX_SLEB;
+    return wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB;
+  case WebAssembly::fixup_code_sleb128_i64:
+    llvm_unreachable("fixup_sleb128_i64 not implemented yet");
+  case WebAssembly::fixup_code_uleb128_i32:
+    if (IsFunction)
+      return wasm::R_WEBASSEMBLY_FUNCTION_INDEX_LEB;
+    return wasm::R_WEBASSEMBLY_GLOBAL_ADDR_LEB;
+  case FK_Data_4:
+    if (IsFunction)
+      return wasm::R_WEBASSEMBLY_TABLE_INDEX_I32;
+    return wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32;
+  case FK_Data_8:
+    llvm_unreachable("FK_Data_8 not implemented yet");
+  default:
+    llvm_unreachable("unimplemented fixup kind");
+  }
+}
+
+MCObjectWriter *llvm::createWebAssemblyWasmObjectWriter(raw_pwrite_stream &OS,
+                                                        bool Is64Bit) {
+  MCWasmObjectTargetWriter *MOTW = new WebAssemblyWasmObjectWriter(Is64Bit);
+  return createWasmObjectWriter(MOTW, OS);
+}
diff --git a/lib/Target/WebAssembly/README.txt b/lib/Target/WebAssembly/README.txt
index 64991ad14071c0e1748f5aee08828e057e6f97e6..3433b1553e8c3a4e793045f5a0bfb193b8b57dd0 100644
--- a/lib/Target/WebAssembly/README.txt
+++ b/lib/Target/WebAssembly/README.txt
@@ -145,3 +145,24 @@ WebAssemblyRegStackify could be extended, or possibly rewritten, to take
 advantage of the new opportunities.
 
 //===---------------------------------------------------------------------===//
+
+Add support for mergeable sections in the Wasm writer, such as for strings and
+floating-point constants.
+
+//===---------------------------------------------------------------------===//
+
+The function @dynamic_alloca_redzone in test/CodeGen/WebAssembly/userstack.ll
+ends up with a tee_local in its prologue which has an unused result, requiring
+an extra drop:
+
+    get_global  $push8=, 0
+    tee_local   $push9=, 1, $pop8
+    drop        $pop9
+    [...]
+
+The prologue code initially thinks it needs an FP register, but later it
+turns out to be unneeded, so one could either approach this by being more
+clever about not inserting code for an FP in the first place, or optimizing
+away the copy later.
+
+//===---------------------------------------------------------------------===//
diff --git a/lib/Target/WebAssembly/WebAssembly.h b/lib/Target/WebAssembly/WebAssembly.h
index 8738263ad84736d2099f67c3b8cfe0bed15a6259..e04c4db19c8c71236f64696ee6afcb41c866c4e5 100644
--- a/lib/Target/WebAssembly/WebAssembly.h
+++ b/lib/Target/WebAssembly/WebAssembly.h
@@ -46,6 +46,7 @@ FunctionPass *createWebAssemblyRegStackify();
 FunctionPass *createWebAssemblyRegColoring();
 FunctionPass *createWebAssemblyExplicitLocals();
 FunctionPass *createWebAssemblyFixIrreducibleControlFlow();
+FunctionPass *createWebAssemblyCFGSort();
 FunctionPass *createWebAssemblyCFGStackify();
 FunctionPass *createWebAssemblyLowerBrUnless();
 FunctionPass *createWebAssemblyRegNumbering();
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index 5b4b82eb56037b97aa82ec515235abfb93a50348..d9c2dba5bace33733cd095457d6a414a4be62fc3 100644
--- a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -14,6 +14,7 @@
 ///
 //===----------------------------------------------------------------------===//
 
+#include "WebAssemblyAsmPrinter.h"
 #include "InstPrinter/WebAssemblyInstPrinter.h"
 #include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
 #include "MCTargetDesc/WebAssemblyTargetStreamer.h"
@@ -21,13 +22,14 @@
 #include "WebAssemblyMCInstLower.h"
 #include "WebAssemblyMachineFunctionInfo.h"
 #include "WebAssemblyRegisterInfo.h"
-#include "WebAssemblySubtarget.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/CodeGen/Analysis.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineConstantPool.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/GlobalVariable.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSymbol.h"
@@ -38,56 +40,6 @@ using namespace llvm;
 
 #define DEBUG_TYPE "asm-printer"
 
-namespace {
-
-class WebAssemblyAsmPrinter final : public AsmPrinter {
-  const MachineRegisterInfo *MRI;
-  WebAssemblyFunctionInfo *MFI;
-
-public:
-  WebAssemblyAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
-      : AsmPrinter(TM, std::move(Streamer)), MRI(nullptr), MFI(nullptr) {}
-
-private:
-  StringRef getPassName() const override {
-    return "WebAssembly Assembly Printer";
-  }
-
-  //===------------------------------------------------------------------===//
-  // MachineFunctionPass Implementation.
-  //===------------------------------------------------------------------===//
-
-  bool runOnMachineFunction(MachineFunction &MF) override {
-    MRI = &MF.getRegInfo();
-    MFI = MF.getInfo<WebAssemblyFunctionInfo>();
-    return AsmPrinter::runOnMachineFunction(MF);
-  }
-
-  //===------------------------------------------------------------------===//
-  // AsmPrinter Implementation.
-  //===------------------------------------------------------------------===//
-
-  void EmitEndOfAsmFile(Module &M) override;
-  void EmitJumpTableInfo() override;
-  void EmitConstantPool() override;
-  void EmitFunctionBodyStart() override;
-  void EmitFunctionBodyEnd() override;
-  void EmitInstruction(const MachineInstr *MI) override;
-  const MCExpr *lowerConstant(const Constant *CV) override;
-  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
-                       unsigned AsmVariant, const char *ExtraCode,
-                       raw_ostream &OS) override;
-  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
-                             unsigned AsmVariant, const char *ExtraCode,
-                             raw_ostream &OS) override;
-
-  MVT getRegType(unsigned RegNo) const;
-  std::string regToString(const MachineOperand &MO);
-  WebAssemblyTargetStreamer *getTargetStreamer();
-};
-
-} // end anonymous namespace
-
 //===----------------------------------------------------------------------===//
 // Helpers.
 //===----------------------------------------------------------------------===//
@@ -135,9 +87,19 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
   }
   for (const auto &G : M.globals()) {
     if (!G.hasInitializer() && G.hasExternalLinkage()) {
+      uint64_t Size = M.getDataLayout().getTypeAllocSize(G.getValueType());
       getTargetStreamer()->emitGlobalImport(G.getGlobalIdentifier());
+      OutStreamer->emitELFSize(getSymbol(&G),
+                               MCConstantExpr::create(Size, OutContext));
     }
   }
+
+  if (!TM.getTargetTriple().isOSBinFormatELF()) {
+    MachineModuleInfoWasm &MMIW = MMI->getObjFileInfo<MachineModuleInfoWasm>();
+    getTargetStreamer()->emitGlobal(MMIW.getGlobals());
+    if (MMIW.hasStackPointerGlobal())
+      getTargetStreamer()->emitStackPointer(MMIW.getStackPointerGlobal());
+  }
 }
 
 void WebAssemblyAsmPrinter::EmitConstantPool() {
@@ -150,8 +112,7 @@ void WebAssemblyAsmPrinter::EmitJumpTableInfo() {
 }
 
 void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
-  if (!MFI->getParams().empty())
-    getTargetStreamer()->emitParam(MFI->getParams());
+  getTargetStreamer()->emitParam(CurrentFnSym, MFI->getParams());
 
   SmallVector<MVT, 4> ResultVTs;
   const Function &F(*MF->getFunction());
@@ -169,23 +130,26 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
   // If the return type needs to be legalized it will get converted into
   // passing a pointer.
   if (ResultVTs.size() == 1)
-    getTargetStreamer()->emitResult(ResultVTs);
-
-  // FIXME: When ExplicitLocals is enabled by default, we won't need
-  // to define the locals here (and MFI can go back to being pointer-to-const).
-  for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) {
-    unsigned VReg = TargetRegisterInfo::index2VirtReg(Idx);
-    unsigned WAReg = MFI->getWAReg(VReg);
-    // Don't declare unused registers.
-    if (WAReg == WebAssemblyFunctionInfo::UnusedReg)
-      continue;
-    // Don't redeclare parameters.
-    if (WAReg < MFI->getParams().size())
-      continue;
-    // Don't declare stackified registers.
-    if (int(WAReg) < 0)
-      continue;
-    MFI->addLocal(getRegType(VReg));
+    getTargetStreamer()->emitResult(CurrentFnSym, ResultVTs);
+  else
+    getTargetStreamer()->emitResult(CurrentFnSym, ArrayRef<MVT>());
+
+  if (TM.getTargetTriple().isOSBinFormatELF()) {
+    assert(MFI->getLocals().empty());
+    for (unsigned Idx = 0, IdxE = MRI->getNumVirtRegs(); Idx != IdxE; ++Idx) {
+      unsigned VReg = TargetRegisterInfo::index2VirtReg(Idx);
+      unsigned WAReg = MFI->getWAReg(VReg);
+      // Don't declare unused registers.
+      if (WAReg == WebAssemblyFunctionInfo::UnusedReg)
+        continue;
+      // Don't redeclare parameters.
+      if (WAReg < MFI->getParams().size())
+        continue;
+      // Don't declare stackified registers.
+      if (int(WAReg) < 0)
+        continue;
+      MFI->addLocal(getRegType(VReg));
+    }
   }
 
   getTargetStreamer()->emitLocal(MFI->getLocals());
@@ -194,7 +158,8 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
 }
 
 void WebAssemblyAsmPrinter::EmitFunctionBodyEnd() {
-  getTargetStreamer()->emitEndFunc();
+  if (TM.getTargetTriple().isOSBinFormatELF())
+    getTargetStreamer()->emitEndFunc();
 }
 
 void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) {
diff --git a/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8917b8d7e48a83d8503a0fc27ee0858ea43ad9c
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
@@ -0,0 +1,77 @@
+// WebAssemblyAsmPrinter.h - WebAssembly implementation of AsmPrinter-*- C++ -*-
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYASMPRINTER_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_WEBASSEMBLYASMPRINTER_H
+
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/AsmPrinter.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/Target/TargetMachine.h"
+
+namespace llvm {
+class MCSymbol;
+class WebAssemblyFunctionInfo;
+class WebAssemblyTargetStreamer;
+class WebAssemblyMCInstLower;
+
+class LLVM_LIBRARY_VISIBILITY WebAssemblyAsmPrinter final : public AsmPrinter {
+  const WebAssemblySubtarget *Subtarget;
+  const MachineRegisterInfo *MRI;
+  WebAssemblyFunctionInfo *MFI;
+
+public:
+  explicit WebAssemblyAsmPrinter(TargetMachine &TM,
+                                 std::unique_ptr<MCStreamer> Streamer)
+      : AsmPrinter(TM, std::move(Streamer)),
+        Subtarget(nullptr), MRI(nullptr), MFI(nullptr) {}
+
+  StringRef getPassName() const override {
+    return "WebAssembly Assembly Printer";
+  }
+
+  const WebAssemblySubtarget &getSubtarget() const { return *Subtarget; }
+
+  //===------------------------------------------------------------------===//
+  // MachineFunctionPass Implementation.
+  //===------------------------------------------------------------------===//
+
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    Subtarget = &MF.getSubtarget<WebAssemblySubtarget>();
+    MRI = &MF.getRegInfo();
+    MFI = MF.getInfo<WebAssemblyFunctionInfo>();
+    return AsmPrinter::runOnMachineFunction(MF);
+  }
+
+  //===------------------------------------------------------------------===//
+  // AsmPrinter Implementation.
+  //===------------------------------------------------------------------===//
+
+  void EmitEndOfAsmFile(Module &M) override;
+  void EmitJumpTableInfo() override;
+  void EmitConstantPool() override;
+  void EmitFunctionBodyStart() override;
+  void EmitFunctionBodyEnd() override;
+  void EmitInstruction(const MachineInstr *MI) override;
+  const MCExpr *lowerConstant(const Constant *CV) override;
+  bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
+                       unsigned AsmVariant, const char *ExtraCode,
+                       raw_ostream &OS) override;
+  bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
+                             unsigned AsmVariant, const char *ExtraCode,
+                             raw_ostream &OS) override;
+
+  MVT getRegType(unsigned RegNo) const;
+  std::string regToString(const MachineOperand &MO);
+  WebAssemblyTargetStreamer *getTargetStreamer();
+};
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..40e1928197bcf77e52fd312aa137934a7f1de794
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
@@ -0,0 +1,277 @@
+//===-- WebAssemblyCFGSort.cpp - CFG Sorting ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file implements a CFG sorting pass.
+///
+/// This pass reorders the blocks in a function to put them into topological
+/// order, ignoring loop backedges, and without any loop being interrupted
+/// by a block not dominated by the loop header, with special care to keep the
+/// order as similar as possible to the original order.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssembly.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-cfg-sort"
+
+namespace {
+class WebAssemblyCFGSort final : public MachineFunctionPass {
+  StringRef getPassName() const override { return "WebAssembly CFG Sort"; }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<MachineDominatorTree>();
+    AU.addPreserved<MachineDominatorTree>();
+    AU.addRequired<MachineLoopInfo>();
+    AU.addPreserved<MachineLoopInfo>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  WebAssemblyCFGSort() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyCFGSort::ID = 0;
+FunctionPass *llvm::createWebAssemblyCFGSort() {
+  return new WebAssemblyCFGSort();
+}
+
+static void MaybeUpdateTerminator(MachineBasicBlock *MBB) {
+#ifndef NDEBUG
+  bool AnyBarrier = false;
+#endif
+  bool AllAnalyzable = true;
+  for (const MachineInstr &Term : MBB->terminators()) {
+#ifndef NDEBUG
+    AnyBarrier |= Term.isBarrier();
+#endif
+    AllAnalyzable &= Term.isBranch() && !Term.isIndirectBranch();
+  }
+  assert((AnyBarrier || AllAnalyzable) &&
+         "AnalyzeBranch needs to analyze any block with a fallthrough");
+  if (AllAnalyzable)
+    MBB->updateTerminator();
+}
+
+namespace {
+/// Sort blocks by their number.
+struct CompareBlockNumbers {
+  bool operator()(const MachineBasicBlock *A,
+                  const MachineBasicBlock *B) const {
+    return A->getNumber() > B->getNumber();
+  }
+};
+/// Sort blocks by their number in the opposite order.
+struct CompareBlockNumbersBackwards {
+  bool operator()(const MachineBasicBlock *A,
+                  const MachineBasicBlock *B) const {
+    return A->getNumber() < B->getNumber();
+  }
+};
+/// Bookkeeping for a loop to help ensure that we don't mix blocks not dominated
+/// by the loop header among the loop's blocks.
+struct Entry {
+  const MachineLoop *Loop;
+  unsigned NumBlocksLeft;
+
+  /// List of blocks not dominated by Loop's header that are deferred until
+  /// after all of Loop's blocks have been seen.
+  std::vector<MachineBasicBlock *> Deferred;
+
+  explicit Entry(const MachineLoop *L)
+      : Loop(L), NumBlocksLeft(L->getNumBlocks()) {}
+};
+} // end anonymous namespace
+
+/// Sort the blocks, taking special care to make sure that loops are not
+/// interrupted by blocks not dominated by their header.
+/// TODO: There are many opportunities for improving the heuristics here.
+/// Explore them.
+static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
+                       const MachineDominatorTree &MDT) {
+  // Prepare for a topological sort: Record the number of predecessors each
+  // block has, ignoring loop backedges.
+  MF.RenumberBlocks();
+  SmallVector<unsigned, 16> NumPredsLeft(MF.getNumBlockIDs(), 0);
+  for (MachineBasicBlock &MBB : MF) {
+    unsigned N = MBB.pred_size();
+    if (MachineLoop *L = MLI.getLoopFor(&MBB))
+      if (L->getHeader() == &MBB)
+        for (const MachineBasicBlock *Pred : MBB.predecessors())
+          if (L->contains(Pred))
+            --N;
+    NumPredsLeft[MBB.getNumber()] = N;
+  }
+
+  // Topological sort the CFG, with additional constraints:
+  //  - Between a loop header and the last block in the loop, there can be
+  //    no blocks not dominated by the loop header.
+  //  - It's desirable to preserve the original block order when possible.
+  // We use two ready lists; Preferred and Ready. Preferred has recently
+  // processed successors, to help preserve block sequences from the original
+  // order. Ready has the remaining ready blocks.
+  PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
+                CompareBlockNumbers>
+      Preferred;
+  PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
+                CompareBlockNumbersBackwards>
+      Ready;
+  SmallVector<Entry, 4> Loops;
+  for (MachineBasicBlock *MBB = &MF.front();;) {
+    const MachineLoop *L = MLI.getLoopFor(MBB);
+    if (L) {
+      // If MBB is a loop header, add it to the active loop list. We can't put
+      // any blocks that it doesn't dominate until we see the end of the loop.
+      if (L->getHeader() == MBB)
+        Loops.push_back(Entry(L));
+      // For each active loop the block is in, decrement the count. If MBB is
+      // the last block in an active loop, take it off the list and pick up any
+      // blocks deferred because the header didn't dominate them.
+      for (Entry &E : Loops)
+        if (E.Loop->contains(MBB) && --E.NumBlocksLeft == 0)
+          for (auto DeferredBlock : E.Deferred)
+            Ready.push(DeferredBlock);
+      while (!Loops.empty() && Loops.back().NumBlocksLeft == 0)
+        Loops.pop_back();
+    }
+    // The main topological sort logic.
+    for (MachineBasicBlock *Succ : MBB->successors()) {
+      // Ignore backedges.
+      if (MachineLoop *SuccL = MLI.getLoopFor(Succ))
+        if (SuccL->getHeader() == Succ && SuccL->contains(MBB))
+          continue;
+      // Decrement the predecessor count. If it's now zero, it's ready.
+      if (--NumPredsLeft[Succ->getNumber()] == 0)
+        Preferred.push(Succ);
+    }
+    // Determine the block to follow MBB. First try to find a preferred block,
+    // to preserve the original block order when possible.
+    MachineBasicBlock *Next = nullptr;
+    while (!Preferred.empty()) {
+      Next = Preferred.top();
+      Preferred.pop();
+      // If Next isn't dominated by the top active loop header, defer it until
+      // that loop is done.
+      if (!Loops.empty() &&
+          !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
+        Loops.back().Deferred.push_back(Next);
+        Next = nullptr;
+        continue;
+      }
+      // If Next was originally ordered before MBB, and it isn't because it was
+      // loop-rotated above the header, it's not preferred.
+      if (Next->getNumber() < MBB->getNumber() &&
+          (!L || !L->contains(Next) ||
+           L->getHeader()->getNumber() < Next->getNumber())) {
+        Ready.push(Next);
+        Next = nullptr;
+        continue;
+      }
+      break;
+    }
+    // If we didn't find a suitable block in the Preferred list, check the
+    // general Ready list.
+    if (!Next) {
+      // If there are no more blocks to process, we're done.
+      if (Ready.empty()) {
+        MaybeUpdateTerminator(MBB);
+        break;
+      }
+      for (;;) {
+        Next = Ready.top();
+        Ready.pop();
+        // If Next isn't dominated by the top active loop header, defer it until
+        // that loop is done.
+        if (!Loops.empty() &&
+            !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
+          Loops.back().Deferred.push_back(Next);
+          continue;
+        }
+        break;
+      }
+    }
+    // Move the next block into place and iterate.
+    Next->moveAfter(MBB);
+    MaybeUpdateTerminator(MBB);
+    MBB = Next;
+  }
+  assert(Loops.empty() && "Active loop list not finished");
+  MF.RenumberBlocks();
+
+#ifndef NDEBUG
+  SmallSetVector<MachineLoop *, 8> OnStack;
+
+  // Insert a sentinel representing the degenerate loop that starts at the
+  // function entry block and includes the entire function as a "loop" that
+  // executes once.
+  OnStack.insert(nullptr);
+
+  for (auto &MBB : MF) {
+    assert(MBB.getNumber() >= 0 && "Renumbered blocks should be non-negative.");
+
+    MachineLoop *Loop = MLI.getLoopFor(&MBB);
+    if (Loop && &MBB == Loop->getHeader()) {
+      // Loop header. The loop predecessor should be sorted above, and the other
+      // predecessors should be backedges below.
+      for (auto Pred : MBB.predecessors())
+        assert(
+            (Pred->getNumber() < MBB.getNumber() || Loop->contains(Pred)) &&
+            "Loop header predecessors must be loop predecessors or backedges");
+      assert(OnStack.insert(Loop) && "Loops should be declared at most once.");
+    } else {
+      // Not a loop header. All predecessors should be sorted above.
+      for (auto Pred : MBB.predecessors())
+        assert(Pred->getNumber() < MBB.getNumber() &&
+               "Non-loop-header predecessors should be topologically sorted");
+      assert(OnStack.count(MLI.getLoopFor(&MBB)) &&
+             "Blocks must be nested in their loops");
+    }
+    while (OnStack.size() > 1 && &MBB == LoopBottom(OnStack.back()))
+      OnStack.pop_back();
+  }
+  assert(OnStack.pop_back_val() == nullptr &&
+         "The function entry block shouldn't actually be a loop header");
+  assert(OnStack.empty() &&
+         "Control flow stack pushes and pops should be balanced.");
+#endif
+}
+
+bool WebAssemblyCFGSort::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** CFG Sorting **********\n"
+                  "********** Function: "
+               << MF.getName() << '\n');
+
+  const auto &MLI = getAnalysis<MachineLoopInfo>();
+  auto &MDT = getAnalysis<MachineDominatorTree>();
+  // Liveness is not tracked for VALUE_STACK physreg.
+  MF.getRegInfo().invalidateLiveness();
+
+  // Sort the blocks, with contiguous loops.
+  SortBlocks(MF, MLI, MDT);
+
+  return true;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index 49b9754e6b62c1e30f3edbb13cb8000f937231bd..bd11d1b469063f69836b036433f673629d433492 100644
--- a/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -10,12 +10,7 @@
 /// \file
 /// \brief This file implements a CFG stacking pass.
 ///
-/// This pass reorders the blocks in a function to put them into topological
-/// order, ignoring loop backedges, and without any loop being interrupted
-/// by a block not dominated by the loop header, with special care to keep the
-/// order as similar as possible to the original order.
-///
-/// Then, it inserts BLOCK and LOOP markers to mark the start of scopes, since
+/// This pass inserts BLOCK and LOOP markers to mark the start of scopes, since
 /// scope boundaries serve as the labels for WebAssembly's control transfers.
 ///
 /// This is sufficient to convert arbitrary CFGs into a form that works on
@@ -28,8 +23,6 @@
 #include "WebAssemblyMachineFunctionInfo.h"
 #include "WebAssemblySubtarget.h"
 #include "WebAssemblyUtilities.h"
-#include "llvm/ADT/PriorityQueue.h"
-#include "llvm/ADT/SetVector.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -68,217 +61,6 @@ FunctionPass *llvm::createWebAssemblyCFGStackify() {
   return new WebAssemblyCFGStackify();
 }
 
-/// Return the "bottom" block of a loop. This differs from
-/// MachineLoop::getBottomBlock in that it works even if the loop is
-/// discontiguous.
-static MachineBasicBlock *LoopBottom(const MachineLoop *Loop) {
-  MachineBasicBlock *Bottom = Loop->getHeader();
-  for (MachineBasicBlock *MBB : Loop->blocks())
-    if (MBB->getNumber() > Bottom->getNumber())
-      Bottom = MBB;
-  return Bottom;
-}
-
-static void MaybeUpdateTerminator(MachineBasicBlock *MBB) {
-#ifndef NDEBUG
-  bool AnyBarrier = false;
-#endif
-  bool AllAnalyzable = true;
-  for (const MachineInstr &Term : MBB->terminators()) {
-#ifndef NDEBUG
-    AnyBarrier |= Term.isBarrier();
-#endif
-    AllAnalyzable &= Term.isBranch() && !Term.isIndirectBranch();
-  }
-  assert((AnyBarrier || AllAnalyzable) &&
-         "AnalyzeBranch needs to analyze any block with a fallthrough");
-  if (AllAnalyzable)
-    MBB->updateTerminator();
-}
-
-namespace {
-/// Sort blocks by their number.
-struct CompareBlockNumbers {
-  bool operator()(const MachineBasicBlock *A,
-                  const MachineBasicBlock *B) const {
-    return A->getNumber() > B->getNumber();
-  }
-};
-/// Sort blocks by their number in the opposite order..
-struct CompareBlockNumbersBackwards {
-  bool operator()(const MachineBasicBlock *A,
-                  const MachineBasicBlock *B) const {
-    return A->getNumber() < B->getNumber();
-  }
-};
-/// Bookkeeping for a loop to help ensure that we don't mix blocks not dominated
-/// by the loop header among the loop's blocks.
-struct Entry {
-  const MachineLoop *Loop;
-  unsigned NumBlocksLeft;
-
-  /// List of blocks not dominated by Loop's header that are deferred until
-  /// after all of Loop's blocks have been seen.
-  std::vector<MachineBasicBlock *> Deferred;
-
-  explicit Entry(const MachineLoop *L)
-      : Loop(L), NumBlocksLeft(L->getNumBlocks()) {}
-};
-}
-
-/// Sort the blocks, taking special care to make sure that loops are not
-/// interrupted by blocks not dominated by their header.
-/// TODO: There are many opportunities for improving the heuristics here.
-/// Explore them.
-static void SortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
-                       const MachineDominatorTree &MDT) {
-  // Prepare for a topological sort: Record the number of predecessors each
-  // block has, ignoring loop backedges.
-  MF.RenumberBlocks();
-  SmallVector<unsigned, 16> NumPredsLeft(MF.getNumBlockIDs(), 0);
-  for (MachineBasicBlock &MBB : MF) {
-    unsigned N = MBB.pred_size();
-    if (MachineLoop *L = MLI.getLoopFor(&MBB))
-      if (L->getHeader() == &MBB)
-        for (const MachineBasicBlock *Pred : MBB.predecessors())
-          if (L->contains(Pred))
-            --N;
-    NumPredsLeft[MBB.getNumber()] = N;
-  }
-
-  // Topological sort the CFG, with additional constraints:
-  //  - Between a loop header and the last block in the loop, there can be
-  //    no blocks not dominated by the loop header.
-  //  - It's desirable to preserve the original block order when possible.
-  // We use two ready lists; Preferred and Ready. Preferred has recently
-  // processed sucessors, to help preserve block sequences from the original
-  // order. Ready has the remaining ready blocks.
-  PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
-                CompareBlockNumbers>
-      Preferred;
-  PriorityQueue<MachineBasicBlock *, std::vector<MachineBasicBlock *>,
-                CompareBlockNumbersBackwards>
-      Ready;
-  SmallVector<Entry, 4> Loops;
-  for (MachineBasicBlock *MBB = &MF.front();;) {
-    const MachineLoop *L = MLI.getLoopFor(MBB);
-    if (L) {
-      // If MBB is a loop header, add it to the active loop list. We can't put
-      // any blocks that it doesn't dominate until we see the end of the loop.
-      if (L->getHeader() == MBB)
-        Loops.push_back(Entry(L));
-      // For each active loop the block is in, decrement the count. If MBB is
-      // the last block in an active loop, take it off the list and pick up any
-      // blocks deferred because the header didn't dominate them.
-      for (Entry &E : Loops)
-        if (E.Loop->contains(MBB) && --E.NumBlocksLeft == 0)
-          for (auto DeferredBlock : E.Deferred)
-            Ready.push(DeferredBlock);
-      while (!Loops.empty() && Loops.back().NumBlocksLeft == 0)
-        Loops.pop_back();
-    }
-    // The main topological sort logic.
-    for (MachineBasicBlock *Succ : MBB->successors()) {
-      // Ignore backedges.
-      if (MachineLoop *SuccL = MLI.getLoopFor(Succ))
-        if (SuccL->getHeader() == Succ && SuccL->contains(MBB))
-          continue;
-      // Decrement the predecessor count. If it's now zero, it's ready.
-      if (--NumPredsLeft[Succ->getNumber()] == 0)
-        Preferred.push(Succ);
-    }
-    // Determine the block to follow MBB. First try to find a preferred block,
-    // to preserve the original block order when possible.
-    MachineBasicBlock *Next = nullptr;
-    while (!Preferred.empty()) {
-      Next = Preferred.top();
-      Preferred.pop();
-      // If X isn't dominated by the top active loop header, defer it until that
-      // loop is done.
-      if (!Loops.empty() &&
-          !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
-        Loops.back().Deferred.push_back(Next);
-        Next = nullptr;
-        continue;
-      }
-      // If Next was originally ordered before MBB, and it isn't because it was
-      // loop-rotated above the header, it's not preferred.
-      if (Next->getNumber() < MBB->getNumber() &&
-          (!L || !L->contains(Next) ||
-           L->getHeader()->getNumber() < Next->getNumber())) {
-        Ready.push(Next);
-        Next = nullptr;
-        continue;
-      }
-      break;
-    }
-    // If we didn't find a suitable block in the Preferred list, check the
-    // general Ready list.
-    if (!Next) {
-      // If there are no more blocks to process, we're done.
-      if (Ready.empty()) {
-        MaybeUpdateTerminator(MBB);
-        break;
-      }
-      for (;;) {
-        Next = Ready.top();
-        Ready.pop();
-        // If Next isn't dominated by the top active loop header, defer it until
-        // that loop is done.
-        if (!Loops.empty() &&
-            !MDT.dominates(Loops.back().Loop->getHeader(), Next)) {
-          Loops.back().Deferred.push_back(Next);
-          continue;
-        }
-        break;
-      }
-    }
-    // Move the next block into place and iterate.
-    Next->moveAfter(MBB);
-    MaybeUpdateTerminator(MBB);
-    MBB = Next;
-  }
-  assert(Loops.empty() && "Active loop list not finished");
-  MF.RenumberBlocks();
-
-#ifndef NDEBUG
-  SmallSetVector<MachineLoop *, 8> OnStack;
-
-  // Insert a sentinel representing the degenerate loop that starts at the
-  // function entry block and includes the entire function as a "loop" that
-  // executes once.
-  OnStack.insert(nullptr);
-
-  for (auto &MBB : MF) {
-    assert(MBB.getNumber() >= 0 && "Renumbered blocks should be non-negative.");
-
-    MachineLoop *Loop = MLI.getLoopFor(&MBB);
-    if (Loop && &MBB == Loop->getHeader()) {
-      // Loop header. The loop predecessor should be sorted above, and the other
-      // predecessors should be backedges below.
-      for (auto Pred : MBB.predecessors())
-        assert(
-            (Pred->getNumber() < MBB.getNumber() || Loop->contains(Pred)) &&
-            "Loop header predecessors must be loop predecessors or backedges");
-      assert(OnStack.insert(Loop) && "Loops should be declared at most once.");
-    } else {
-      // Not a loop header. All predecessors should be sorted above.
-      for (auto Pred : MBB.predecessors())
-        assert(Pred->getNumber() < MBB.getNumber() &&
-               "Non-loop-header predecessors should be topologically sorted");
-      assert(OnStack.count(MLI.getLoopFor(&MBB)) &&
-             "Blocks must be nested in their loops");
-    }
-    while (OnStack.size() > 1 && &MBB == LoopBottom(OnStack.back()))
-      OnStack.pop_back();
-  }
-  assert(OnStack.pop_back_val() == nullptr &&
-         "The function entry block shouldn't actually be a loop header");
-  assert(OnStack.empty() &&
-         "Control flow stack pushes and pops should be balanced.");
-#endif
-}
-
 /// Test whether Pred has any terminators explicitly branching to MBB, as
 /// opposed to falling through. Note that it's possible (eg. in unoptimized
 /// code) for a branch instruction to both branch to a block and fallthrough
@@ -488,6 +270,15 @@ static void FixEndsAtEndOfFunction(
   }
 }
 
+// WebAssembly functions end with an end instruction, as if the function body
+// were a block.
+static void AppendEndToFunction(
+    MachineFunction &MF,
+    const WebAssemblyInstrInfo &TII) {
+  BuildMI(MF.back(), MF.back().end(), DebugLoc(),
+          TII.get(WebAssembly::END_FUNCTION));
+}
+
 /// Insert LOOP and BLOCK markers at appropriate places.
 static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI,
                          const WebAssemblyInstrInfo &TII,
@@ -555,6 +346,11 @@ static void PlaceMarkers(MachineFunction &MF, const MachineLoopInfo &MLI,
   // Fix up block/loop signatures at the end of the function to conform to
   // WebAssembly's rules.
   FixEndsAtEndOfFunction(MF, MFI, BlockTops, LoopTops);
+
+  // Add an end instruction at the end of the function body.
+  if (!MF.getSubtarget<WebAssemblySubtarget>()
+        .getTargetTriple().isOSBinFormatELF())
+    AppendEndToFunction(MF, TII);
 }
 
 bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
@@ -569,9 +365,6 @@ bool WebAssemblyCFGStackify::runOnMachineFunction(MachineFunction &MF) {
   WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
   MF.getRegInfo().invalidateLiveness();
 
-  // Sort the blocks, with contiguous loops.
-  SortBlocks(MF, MLI, MDT);
-
   // Place the BLOCK and LOOP markers to indicate the beginnings of scopes.
   PlaceMarkers(MF, MLI, TII, MDT, MFI);
 
diff --git a/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp b/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
index fc0a01ca30e505ffe2f466fe6fb884a141b38199..bc6360aafd61c9d4bf0786afc2ed0b8a8037f30d 100644
--- a/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
@@ -97,15 +97,28 @@ bool WebAssemblyCallIndirectFixup::runOnMachineFunction(MachineFunction &MF) {
         MI.setDesc(Desc);
 
         // Rewrite argument order
-        auto Uses = MI.explicit_uses();
-        MachineInstr::mop_iterator it = Uses.begin();
-        const MachineOperand MO = *it;
+        SmallVector<MachineOperand, 8> Ops;
+
+        // Set up a placeholder for the type signature immediate.
+        Ops.push_back(MachineOperand::CreateImm(0));
 
         // Set up the flags immediate, which currently has no defined flags
         // so it's always zero.
-        it->ChangeToImmediate(0);
-
-        MI.addOperand(MF, MO);
+        Ops.push_back(MachineOperand::CreateImm(0));
+
+        for (const MachineOperand &MO :
+                 make_range(MI.operands_begin() +
+                                MI.getDesc().getNumDefs() + 1,
+                            MI.operands_begin() +
+                                MI.getNumExplicitOperands()))
+          Ops.push_back(MO);
+        Ops.push_back(MI.getOperand(MI.getDesc().getNumDefs()));
+
+        // Replace the instruction's operands.
+        while (MI.getNumOperands() > MI.getDesc().getNumDefs())
+          MI.RemoveOperand(MI.getNumOperands() - 1);
+        for (const MachineOperand &MO : Ops)
+          MI.addOperand(MO);
 
         DEBUG(dbgs() << "  After transform: " << MI);
         Changed = true;
diff --git a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index 04ede7ff110c8586f37829ea044bf7d3126ae849..41249117ae0eb3222b405da23ea6b1b56a530dad 100644
--- a/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -31,6 +31,14 @@ using namespace llvm;
 
 #define DEBUG_TYPE "wasm-explicit-locals"
 
+// A command-line option to disable this pass. Note that this produces output
+// which is not valid WebAssembly, though it may be more convenient for writing
+// LLVM unit tests with.
+static cl::opt<bool> DisableWebAssemblyExplicitLocals(
+    "disable-wasm-explicit-locals", cl::ReallyHidden,
+    cl::desc("WebAssembly: Disable emission of get_local/set_local."),
+    cl::init(false));
+
 namespace {
 class WebAssemblyExplicitLocals final : public MachineFunctionPass {
   StringRef getPassName() const override {
@@ -60,7 +68,25 @@ FunctionPass *llvm::createWebAssemblyExplicitLocals() {
 /// if it doesn't yet have one.
 static unsigned getLocalId(DenseMap<unsigned, unsigned> &Reg2Local,
                            unsigned &CurLocal, unsigned Reg) {
-  return Reg2Local.insert(std::make_pair(Reg, CurLocal++)).first->second;
+  auto P = Reg2Local.insert(std::make_pair(Reg, CurLocal));
+  if (P.second)
+    ++CurLocal;
+  return P.first->second;
+}
+
+/// Get the appropriate drop opcode for the given register class.
+static unsigned getDropOpcode(const TargetRegisterClass *RC) {
+  if (RC == &WebAssembly::I32RegClass)
+    return WebAssembly::DROP_I32;
+  if (RC == &WebAssembly::I64RegClass)
+    return WebAssembly::DROP_I64;
+  if (RC == &WebAssembly::F32RegClass)
+    return WebAssembly::DROP_F32;
+  if (RC == &WebAssembly::F64RegClass)
+    return WebAssembly::DROP_F64;
+  if (RC == &WebAssembly::V128RegClass)
+    return WebAssembly::DROP_V128;
+  llvm_unreachable("Unexpected register class");
 }
 
 /// Get the appropriate get_local opcode for the given register class.
@@ -146,6 +172,10 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
                   "********** Function: "
                << MF.getName() << '\n');
 
+  // Disable this pass if directed to do so.
+  if (DisableWebAssemblyExplicitLocals)
+    return false;
+
   // Disable this pass if we aren't doing direct wasm object emission.
   if (MF.getSubtarget<WebAssemblySubtarget>()
         .getTargetTriple().isOSBinFormatELF())
@@ -176,6 +206,12 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
   // Start assigning local numbers after the last parameter.
   unsigned CurLocal = MFI.getParams().size();
 
+  // Precompute the set of registers that are unused, so that we can insert
+  // drops to their defs.
+  BitVector UseEmpty(MRI.getNumVirtRegs());
+  for (unsigned i = 0, e = MRI.getNumVirtRegs(); i < e; ++i)
+    UseEmpty[i] = MRI.use_empty(TargetRegisterInfo::index2VirtReg(i));
+
   // Visit each instruction in the function.
   for (MachineBasicBlock &MBB : MF) {
     for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;) {
@@ -224,15 +260,26 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
       assert(MI.getDesc().getNumDefs() <= 1);
       if (MI.getDesc().getNumDefs() == 1) {
         unsigned OldReg = MI.getOperand(0).getReg();
-        if (!MFI.isVRegStackified(OldReg) && !MRI.use_empty(OldReg)) {
-          unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+        if (!MFI.isVRegStackified(OldReg)) {
           const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
           unsigned NewReg = MRI.createVirtualRegister(RC);
           auto InsertPt = std::next(MachineBasicBlock::iterator(&MI));
-          unsigned Opc = getSetLocalOpcode(RC);
-          BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc))
-              .addImm(LocalId)
-              .addReg(NewReg);
+          if (MI.getOpcode() == WebAssembly::IMPLICIT_DEF) {
+            MI.eraseFromParent();
+            Changed = true;
+            continue;
+          }
+          if (UseEmpty[TargetRegisterInfo::virtReg2Index(OldReg)]) {
+            unsigned Opc = getDropOpcode(RC);
+            BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc))
+                .addReg(NewReg);
+          } else {
+            unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+            unsigned Opc = getSetLocalOpcode(RC);
+            BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc))
+                .addImm(LocalId)
+                .addReg(NewReg);
+          }
           MI.getOperand(0).setReg(NewReg);
           MFI.stackifyVReg(NewReg);
           Changed = true;
@@ -278,13 +325,16 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
   }
 
   // Define the locals.
+  // TODO: Sort the locals for better compression.
+  MFI.setNumLocals(CurLocal - MFI.getParams().size());
   for (size_t i = 0, e = MRI.getNumVirtRegs(); i < e; ++i) {
     unsigned Reg = TargetRegisterInfo::index2VirtReg(i);
     auto I = Reg2Local.find(Reg);
     if (I == Reg2Local.end() || I->second < MFI.getParams().size())
       continue;
 
-    MFI.addLocal(typeForRegClass(MRI.getRegClass(Reg)));
+    MFI.setLocal(I->second - MFI.getParams().size(),
+                 typeForRegClass(MRI.getRegClass(Reg)));
     Changed = true;
   }
 
diff --git a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index bc7020fded8c9729cf292f08164432ea9639b247..e7fd4ef33e1c5385d58eb5624c922fcb5398a52b 100644
--- a/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -116,6 +116,8 @@ private:
     case MVT::f32:
     case MVT::f64:
       return VT;
+    case MVT::f16:
+      return MVT::f32;
     case MVT::v16i8:
     case MVT::v8i16:
     case MVT::v4i32:
@@ -594,7 +596,7 @@ bool WebAssemblyFastISel::fastLowerArguments() {
 
   unsigned i = 0;
   for (auto const &Arg : F->args()) {
-    const AttributeSet &Attrs = F->getAttributes();
+    const AttributeList &Attrs = F->getAttributes();
     if (Attrs.hasAttribute(i+1, Attribute::ByVal) ||
         Attrs.hasAttribute(i+1, Attribute::SwiftSelf) ||
         Attrs.hasAttribute(i+1, Attribute::SwiftError) ||
@@ -744,7 +746,7 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
     if (ArgTy == MVT::INVALID_SIMPLE_VALUE_TYPE)
       return false;
 
-    const AttributeSet &Attrs = Call->getAttributes();
+    const AttributeList &Attrs = Call->getAttributes();
     if (Attrs.hasAttribute(i+1, Attribute::ByVal) ||
         Attrs.hasAttribute(i+1, Attribute::SwiftSelf) ||
         Attrs.hasAttribute(i+1, Attribute::SwiftError) ||
diff --git a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
index cdbd9ea4b098563f4317e0dcd07e1cb4d9f1e12d..76a2ff3f9803b6d8f745d4cd656569545496d6af 100644
--- a/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
@@ -84,7 +84,7 @@ static void FindUses(Value *V, Function &F,
 //  - Call with fewer arguments than needed: arguments are filled in with undef
 //  - Return value is not needed: drop it
 //  - Return value needed but not present: supply an undef
-//  
+//
 // For now, return nullptr without creating a wrapper if the wrapper cannot
 // be generated due to incompatible types.
 static Function *CreateWrapper(Function *F, FunctionType *Ty) {
diff --git a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
index a6a2c0bf06ae29c3749353720414aff4e0abe763..4209bc333f230640d83efbc83a48743312defce1 100644
--- a/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
@@ -24,10 +24,11 @@
 #include "WebAssemblyMachineFunctionInfo.h"
 #include "WebAssemblySubtarget.h"
 #include "WebAssemblyTargetMachine.h"
+#include "WebAssemblyUtilities.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Support/Debug.h"
 using namespace llvm;
@@ -101,25 +102,35 @@ static void writeSPToMemory(unsigned SrcReg, MachineFunction &MF,
                             MachineBasicBlock::iterator &InsertAddr,
                             MachineBasicBlock::iterator &InsertStore,
                             const DebugLoc &DL) {
-  const char *ES = "__stack_pointer";
-  auto *SPSymbol = MF.createExternalSymbolName(ES);
-  MachineRegisterInfo &MRI = MF.getRegInfo();
-  const TargetRegisterClass *PtrRC =
-      MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
-  unsigned Zero = MRI.createVirtualRegister(PtrRC);
   const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
 
-  BuildMI(MBB, InsertAddr, DL, TII->get(WebAssembly::CONST_I32), Zero)
-      .addImm(0);
-  MachineMemOperand *MMO = MF.getMachineMemOperand(
-      MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
-      MachineMemOperand::MOStore, 4, 4);
-  BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::STORE_I32))
-      .addImm(2)  // p2align
-      .addExternalSymbol(SPSymbol)
-      .addReg(Zero)
-      .addReg(SrcReg)
-      .addMemOperand(MMO);
+  if (MF.getSubtarget<WebAssemblySubtarget>()
+        .getTargetTriple().isOSBinFormatELF()) {
+    const char *ES = "__stack_pointer";
+    auto *SPSymbol = MF.createExternalSymbolName(ES);
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    const TargetRegisterClass *PtrRC =
+        MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
+    unsigned Zero = MRI.createVirtualRegister(PtrRC);
+
+    BuildMI(MBB, InsertAddr, DL, TII->get(WebAssembly::CONST_I32), Zero)
+        .addImm(0);
+    MachineMemOperand *MMO = MF.getMachineMemOperand(
+        MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
+        MachineMemOperand::MOStore, 4, 4);
+    BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::STORE_I32))
+        .addImm(2)  // p2align
+        .addExternalSymbol(SPSymbol)
+        .addReg(Zero)
+        .addReg(SrcReg)
+        .addMemOperand(MMO);
+  } else {
+    MachineModuleInfoWasm &MMIW =
+        MF.getMMI().getObjFileInfo<MachineModuleInfoWasm>();
+    BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::SET_GLOBAL_I32))
+        .addImm(MMIW.getStackPointerGlobal())
+        .addReg(SrcReg);
+  }
 }
 
 MachineBasicBlock::iterator
@@ -151,27 +162,50 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
   auto &MRI = MF.getRegInfo();
 
   auto InsertPt = MBB.begin();
+  while (InsertPt != MBB.end() && WebAssembly::isArgument(*InsertPt))
+    ++InsertPt;
   DebugLoc DL;
 
   const TargetRegisterClass *PtrRC =
       MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
-  unsigned Zero = MRI.createVirtualRegister(PtrRC);
   unsigned SPReg = WebAssembly::SP32;
   if (StackSize)
     SPReg = MRI.createVirtualRegister(PtrRC);
-  const char *ES = "__stack_pointer";
-  auto *SPSymbol = MF.createExternalSymbolName(ES);
-  BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), Zero)
-      .addImm(0);
-  MachineMemOperand *LoadMMO = MF.getMachineMemOperand(
-      MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
-      MachineMemOperand::MOLoad, 4, 4);
-  // Load the SP value.
-  BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::LOAD_I32), SPReg)
-      .addImm(2)       // p2align
-      .addExternalSymbol(SPSymbol)
-      .addReg(Zero)    // addr
-      .addMemOperand(LoadMMO);
+  if (MF.getSubtarget<WebAssemblySubtarget>()
+        .getTargetTriple().isOSBinFormatELF()) {
+    const char *ES = "__stack_pointer";
+    auto *SPSymbol = MF.createExternalSymbolName(ES);
+    unsigned Zero = MRI.createVirtualRegister(PtrRC);
+
+    BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), Zero)
+        .addImm(0);
+    MachineMemOperand *LoadMMO = MF.getMachineMemOperand(
+        MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES)),
+        MachineMemOperand::MOLoad, 4, 4);
+    // Load the SP value.
+    BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::LOAD_I32), SPReg)
+        .addImm(2)       // p2align
+        .addExternalSymbol(SPSymbol)
+        .addReg(Zero)    // addr
+        .addMemOperand(LoadMMO);
+  } else {
+    auto &MMIW = MF.getMMI().getObjFileInfo<MachineModuleInfoWasm>();
+    if (!MMIW.hasStackPointerGlobal()) {
+      MMIW.setStackPointerGlobal(MMIW.getGlobals().size());
+
+      // Create the stack-pointer global. For now, just use the
+      // Emscripten/Binaryen ABI names.
+      wasm::Global G;
+      G.Type = wasm::ValType::I32;
+      G.Mutable = true;
+      G.InitialValue = 0;
+      G.InitialModule = "env";
+      G.InitialName = "STACKTOP";
+      MMIW.addGlobal(G);
+    }
+    BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::GET_GLOBAL_I32), SPReg)
+        .addImm(MMIW.getStackPointerGlobal());
+  }
 
   bool HasBP = hasBP(MF);
   if (HasBP) {
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 6a7f75a6b3a18b082426a79890d82d49c2ebf2e6..31a5ca1f4cc2729806cd1e44f751acc7f5fa3472 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -95,6 +95,11 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
     // Support minnan and maxnan, which otherwise default to expand.
     setOperationAction(ISD::FMINNAN, T, Legal);
     setOperationAction(ISD::FMAXNAN, T, Legal);
+    // WebAssembly currently has no builtin f16 support.
+    setOperationAction(ISD::FP16_TO_FP, T, Expand);
+    setOperationAction(ISD::FP_TO_FP16, T, Expand);
+    setLoadExtAction(ISD::EXTLOAD, T, MVT::f16, Expand);
+    setTruncStoreAction(T, MVT::f16, Expand);
   }
 
   for (auto T : {MVT::i32, MVT::i64}) {
@@ -253,7 +258,8 @@ bool WebAssemblyTargetLowering::allowsMisalignedMemoryAccesses(
   return true;
 }
 
-bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT,
+                                              AttributeList Attr) const {
   // The current thinking is that wasm engines will perform this optimization,
   // so we can save on code size.
   return true;
diff --git a/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index 5bc723028e63bf7c2738d0e0d10b2de2c0bed6db..99d3d0d558f5fd905938a5890d006317beb5e8dd 100644
--- a/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -58,7 +58,7 @@ class WebAssemblyTargetLowering final : public TargetLowering {
                              unsigned AS) const override;
   bool allowsMisalignedMemoryAccesses(EVT, unsigned AddrSpace, unsigned Align,
                                       bool *Fast) const override;
-  bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+  bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
 
   SDValue LowerCall(CallLoweringInfo &CLI,
                     SmallVectorImpl<SDValue> &InVals) const override;
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
index 047f4be066c0b1c63f119ba7e849bc67da44bc7a..73d1d4be293ba251d9ee1f2826cdef396b33b599 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -30,13 +30,15 @@ multiclass CALL<WebAssemblyRegClass vt, string prefix> {
                    [(set vt:$dst, (WebAssemblycall1 (i32 imm:$callee)))],
                    !strconcat(prefix, "call\t$dst, $callee"),
                    0x10>;
+
   let isCodeGenOnly = 1 in {
     def PCALL_INDIRECT_#vt : I<(outs vt:$dst), (ins I32:$callee, variable_ops),
                               [(set vt:$dst, (WebAssemblycall1 I32:$callee))],
                               "PSEUDO CALL INDIRECT\t$callee">;
   } // isCodeGenOnly = 1
 
-  def CALL_INDIRECT_#vt : I<(outs vt:$dst), (ins i32imm:$flags, variable_ops),
+  def CALL_INDIRECT_#vt : I<(outs vt:$dst),
+                            (ins TypeIndex:$type, i32imm:$flags, variable_ops),
                             [],
                             !strconcat(prefix, "call_indirect\t$dst"),
                             0x11>;
@@ -48,6 +50,7 @@ multiclass SIMD_CALL<ValueType vt, string prefix> {
                                (WebAssemblycall1 (i32 imm:$callee)))],
                          !strconcat(prefix, "call\t$dst, $callee"),
                          0x10>;
+
   let isCodeGenOnly = 1 in {
     def PCALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
                                     (ins I32:$callee, variable_ops),
@@ -57,7 +60,8 @@ multiclass SIMD_CALL<ValueType vt, string prefix> {
   } // isCodeGenOnly = 1
 
   def CALL_INDIRECT_#vt : SIMD_I<(outs V128:$dst),
-                                  (ins i32imm:$flags, variable_ops),
+                                  (ins TypeIndex:$type, i32imm:$flags,
+                                       variable_ops),
                                   [],
                                   !strconcat(prefix, "call_indirect\t$dst"),
                                   0x11>;
@@ -76,13 +80,15 @@ let Uses = [SP32, SP64], isCall = 1 in {
   def CALL_VOID : I<(outs), (ins function32_op:$callee, variable_ops),
                     [(WebAssemblycall0 (i32 imm:$callee))],
                     "call    \t$callee", 0x10>;
+
   let isCodeGenOnly = 1 in {
     def PCALL_INDIRECT_VOID : I<(outs), (ins I32:$callee, variable_ops),
                       [(WebAssemblycall0 I32:$callee)],
                       "PSEUDO CALL INDIRECT\t$callee">;
   } // isCodeGenOnly = 1
 
-  def CALL_INDIRECT_VOID : I<(outs), (ins i32imm:$flags, variable_ops),
+  def CALL_INDIRECT_VOID : I<(outs),
+                             (ins TypeIndex:$type, i32imm:$flags, variable_ops),
                              [],
                              "call_indirect\t", 0x11>;
 } // Uses = [SP32,SP64], isCall = 1
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index 1146431e6b77acac30ef6f9896e487fb92f1de45..39cb1ca336f2d10d75bd3eb9dbc4430622fe7c9d 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -64,9 +64,12 @@ let Uses = [VALUE_STACK], Defs = [VALUE_STACK] in {
 def BLOCK     : I<(outs), (ins Signature:$sig), [], "block   \t$sig", 0x02>;
 def LOOP      : I<(outs), (ins Signature:$sig), [], "loop    \t$sig", 0x03>;
 
-// END_BLOCK and END_LOOP are represented with the same opcode in wasm.
+// END_BLOCK, END_LOOP, and END_FUNCTION are represented with the same opcode
+// in wasm.
 def END_BLOCK : I<(outs), (ins), [], "end_block", 0x0b>;
 def END_LOOP  : I<(outs), (ins), [], "end_loop", 0x0b>;
+let isTerminator = 1, isBarrier = 1 in
+def END_FUNCTION : I<(outs), (ins), [], "end_function", 0x0b>;
 } // Uses = [VALUE_STACK], Defs = [VALUE_STACK]
 
 multiclass RETURN<WebAssemblyRegClass vt> {
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
index 030be0862a5663224463b9dcfeded03b06a1f2af..03c9c1f8d5c022924611aa4a677197b54d760921 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrFloat.td
@@ -55,8 +55,8 @@ defm EQ : ComparisonFP<SETOEQ, "eq  ", 0x5b, 0x61>;
 defm NE : ComparisonFP<SETUNE, "ne  ", 0x5c, 0x62>;
 } // isCommutable = 1
 defm LT : ComparisonFP<SETOLT, "lt  ", 0x5d, 0x63>;
-defm LE : ComparisonFP<SETOLE, "le  ", 0x5e, 0x64>;
-defm GT : ComparisonFP<SETOGT, "gt  ", 0x5f, 0x65>;
+defm LE : ComparisonFP<SETOLE, "le  ", 0x5f, 0x65>;
+defm GT : ComparisonFP<SETOGT, "gt  ", 0x5e, 0x64>;
 defm GE : ComparisonFP<SETOGE, "ge  ", 0x60, 0x66>;
 
 } // Defs = [ARGUMENTS]
diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index dcfd1a42c6aac65ec27c8e16cdaf84dd2f4408e3..a601b575f5791474927c6fecdcf45c25b506448c 100644
--- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -74,6 +74,9 @@ def bb_op : Operand<OtherVT>;
 let OperandType = "OPERAND_LOCAL" in
 def local_op : Operand<i32>;
 
+let OperandType = "OPERAND_GLOBAL" in
+def global_op : Operand<i32>;
+
 let OperandType = "OPERAND_I32IMM" in
 def i32imm_op : Operand<i32>;
 
@@ -104,6 +107,9 @@ def Signature : Operand<i32> {
 }
 } // OperandType = "OPERAND_SIGNATURE"
 
+let OperandType = "OPERAND_TYPEINDEX" in
+def TypeIndex : Operand<i32>;
+
 } // OperandNamespace = "WebAssembly"
 
 //===----------------------------------------------------------------------===//
@@ -178,6 +184,18 @@ let hasSideEffects = 0 in {
   def TEE_LOCAL_#vt : I<(outs vt:$res), (ins local_op:$local, vt:$src), [],
                          "tee_local\t$res, $local, $src", 0x22>;
 
+  // Unused values must be dropped in some contexts.
+  def DROP_#vt : I<(outs), (ins vt:$src), [],
+                   "drop\t$src", 0x1a>;
+
+  let mayLoad = 1 in
+  def GET_GLOBAL_#vt : I<(outs vt:$res), (ins global_op:$local), [],
+                         "get_global\t$res, $local", 0x23>;
+
+  let mayStore = 1 in
+  def SET_GLOBAL_#vt : I<(outs), (ins global_op:$local, vt:$src), [],
+                         "set_global\t$local, $src", 0x24>;
+
 } // hasSideEffects = 0
 }
 defm : LOCAL<I32>;
diff --git a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index 72cb1ccbe668dca3cb113f5a589ebff7985bc6ed..e1b2f79c81ccaf907be27dacb94ee191e31c79a0 100644
--- a/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -412,7 +412,7 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) {
   if (CI->doesNotReturn()) {
     if (auto *F = dyn_cast<Function>(CI->getCalledValue()))
       F->removeFnAttr(Attribute::NoReturn);
-    CI->removeAttribute(AttributeSet::FunctionIndex, Attribute::NoReturn);
+    CI->removeAttribute(AttributeList::FunctionIndex, Attribute::NoReturn);
   }
 
   IRBuilder<> IRB(C);
@@ -436,24 +436,22 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) {
   // Because we added the pointer to the callee as first argument, all
   // argument attribute indices have to be incremented by one.
   SmallVector<AttributeSet, 8> AttributesVec;
-  const AttributeSet &InvokePAL = CI->getAttributes();
-  CallSite::arg_iterator AI = CI->arg_begin();
-  unsigned i = 1; // Argument attribute index starts from 1
-  for (unsigned e = CI->getNumArgOperands(); i <= e; ++AI, ++i) {
-    if (InvokePAL.hasAttributes(i)) {
-      AttrBuilder B(InvokePAL, i);
-      AttributesVec.push_back(AttributeSet::get(C, i + 1, B));
-    }
-  }
+  const AttributeList &InvokeAL = CI->getAttributes();
+
   // Add any return attributes.
-  if (InvokePAL.hasAttributes(AttributeSet::ReturnIndex))
-    AttributesVec.push_back(AttributeSet::get(C, InvokePAL.getRetAttributes()));
+  AttributesVec.push_back(InvokeAL.getRetAttributes());
+  // No attributes for the callee pointer.
+  AttributesVec.push_back(AttributeSet());
+  // Copy the argument attributes from the original call.
+  for (unsigned i = 1, e = CI->getNumArgOperands(); i <= e; ++i) {
+    AttributesVec.push_back(InvokeAL.getParamAttributes(i));
+  }
+
   // Add any function attributes.
-  if (InvokePAL.hasAttributes(AttributeSet::FunctionIndex))
-    AttributesVec.push_back(AttributeSet::get(C, InvokePAL.getFnAttributes()));
+  AttributesVec.push_back(InvokeAL.getFnAttributes());
   // Reconstruct the AttributesList based on the vector we constructed.
-  AttributeSet NewCallPAL = AttributeSet::get(C, AttributesVec);
-  NewCall->setAttributes(NewCallPAL);
+  AttributeList NewCallAL = AttributeList::get(C, AttributesVec);
+  NewCall->setAttributes(NewCallAL);
 
   CI->replaceAllUsesWith(NewCall);
 
@@ -624,7 +622,7 @@ void WebAssemblyLowerEmscriptenEHSjLj::createSetThrewFunction(Module &M) {
   Function *F =
       Function::Create(FTy, GlobalValue::ExternalLinkage, SetThrewFName, &M);
   Argument *Arg1 = &*(F->arg_begin());
-  Argument *Arg2 = &*(++F->arg_begin());
+  Argument *Arg2 = &*std::next(F->arg_begin());
   Arg1->setName("threw");
   Arg2->setName("value");
   BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index f3384210b0e9e249a93fbfd7e8cd45c5abe29bcd..ff186eb915039b4e5af88d9514cb3f664170df35 100644
--- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -14,7 +14,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "WebAssemblyMCInstLower.h"
+#include "WebAssemblyAsmPrinter.h"
 #include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblyRuntimeLibcallSignatures.h"
+#include "WebAssemblyUtilities.h"
 #include "llvm/CodeGen/AsmPrinter.h"
 #include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/IR/Constants.h"
@@ -22,18 +25,85 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/MC/MCSymbolWasm.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
 using namespace llvm;
 
 MCSymbol *
 WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
-  return Printer.getSymbol(MO.getGlobal());
+  const GlobalValue *Global = MO.getGlobal();
+  MCSymbol *Sym = Printer.getSymbol(Global);
+  if (isa<MCSymbolELF>(Sym))
+    return Sym;
+
+  MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
+
+  if (const auto *FuncTy = dyn_cast<FunctionType>(Global->getValueType())) {
+    const MachineFunction &MF = *MO.getParent()->getParent()->getParent();
+    const TargetMachine &TM = MF.getTarget();
+    const Function &CurrentFunc = *MF.getFunction();
+
+    SmallVector<wasm::ValType, 4> Returns;
+    SmallVector<wasm::ValType, 4> Params;
+
+    wasm::ValType iPTR =
+        MF.getSubtarget<WebAssemblySubtarget>().hasAddr64() ?
+        wasm::ValType::I64 :
+        wasm::ValType::I32;
+
+    SmallVector<MVT, 4> ResultMVTs;
+    ComputeLegalValueVTs(CurrentFunc, TM, FuncTy->getReturnType(), ResultMVTs);
+    // WebAssembly can't currently handle returning tuples.
+    if (ResultMVTs.size() <= 1)
+      for (MVT ResultMVT : ResultMVTs)
+        Returns.push_back(WebAssembly::toValType(ResultMVT));
+    else
+      Params.push_back(iPTR);
+
+    for (Type *Ty : FuncTy->params()) {
+      SmallVector<MVT, 4> ParamMVTs;
+      ComputeLegalValueVTs(CurrentFunc, TM, Ty, ParamMVTs);
+      for (MVT ParamMVT : ParamMVTs)
+        Params.push_back(WebAssembly::toValType(ParamMVT));
+    }
+
+    if (FuncTy->isVarArg())
+      Params.push_back(iPTR);
+
+    WasmSym->setReturns(std::move(Returns));
+    WasmSym->setParams(std::move(Params));
+    WasmSym->setIsFunction(true);
+  }
+
+  return WasmSym;
 }
 
 MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol(
     const MachineOperand &MO) const {
-  return Printer.GetExternalSymbolSymbol(MO.getSymbolName());
+  const char *Name = MO.getSymbolName();
+  MCSymbol *Sym = Printer.GetExternalSymbolSymbol(Name);
+  if (isa<MCSymbolELF>(Sym))
+    return Sym;
+
+  MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
+  const WebAssemblySubtarget &Subtarget = Printer.getSubtarget();
+
+  // __stack_pointer is a global variable; all other external symbols used by
+  // CodeGen are functions.
+  if (strcmp(Name, "__stack_pointer") == 0)
+    return WasmSym;
+
+  SmallVector<wasm::ValType, 4> Returns;
+  SmallVector<wasm::ValType, 4> Params;
+  GetSignature(Subtarget, Name, Returns, Params);
+
+  WasmSym->setReturns(std::move(Returns));
+  WasmSym->setParams(std::move(Params));
+  WasmSym->setIsFunction(true);
+
+  return WasmSym;
 }
 
 MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym,
@@ -42,6 +112,9 @@ MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym,
   MCSymbolRefExpr::VariantKind VK =
       IsFunc ? MCSymbolRefExpr::VK_WebAssembly_FUNCTION
              : MCSymbolRefExpr::VK_None;
+  if (!isa<MCSymbolELF>(Sym))
+    cast<MCSymbolWasm>(Sym)->setIsFunction(IsFunc);
+
   const MCExpr *Expr = MCSymbolRefExpr::create(Sym, VK, Ctx);
 
   if (Offset != 0) {
@@ -54,10 +127,24 @@ MCOperand WebAssemblyMCInstLower::LowerSymbolOperand(MCSymbol *Sym,
   return MCOperand::createExpr(Expr);
 }
 
+// Return the WebAssembly type associated with the given register class.
+static wasm::ValType getType(const TargetRegisterClass *RC) {
+  if (RC == &WebAssembly::I32RegClass)
+    return wasm::ValType::I32;
+  if (RC == &WebAssembly::I64RegClass)
+    return wasm::ValType::I64;
+  if (RC == &WebAssembly::F32RegClass)
+    return wasm::ValType::F32;
+  if (RC == &WebAssembly::F64RegClass)
+    return wasm::ValType::F64;
+  llvm_unreachable("Unexpected register class");
+}
+
 void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
                                    MCInst &OutMI) const {
   OutMI.setOpcode(MI->getOpcode());
 
+  const MCInstrDesc &Desc = MI->getDesc();
   for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
     const MachineOperand &MO = MI->getOperand(i);
 
@@ -80,6 +167,41 @@ void WebAssemblyMCInstLower::Lower(const MachineInstr *MI,
       break;
     }
     case MachineOperand::MO_Immediate:
+      if (i < Desc.NumOperands) {
+        const MCOperandInfo &Info = Desc.OpInfo[i];
+        if (Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) {
+          MCSymbol *Sym = Printer.createTempSymbol("typeindex");
+          if (!isa<MCSymbolELF>(Sym)) {
+            SmallVector<wasm::ValType, 4> Returns;
+            SmallVector<wasm::ValType, 4> Params;
+
+            const MachineRegisterInfo &MRI =
+                MI->getParent()->getParent()->getRegInfo();
+            for (const MachineOperand &MO : MI->defs())
+              Returns.push_back(getType(MRI.getRegClass(MO.getReg())));
+            for (const MachineOperand &MO : MI->explicit_uses())
+              if (MO.isReg())
+                Params.push_back(getType(MRI.getRegClass(MO.getReg())));
+
+            // call_indirect instructions have a callee operand at the end which
+            // doesn't count as a param.
+            if (WebAssembly::isCallIndirect(*MI))
+              Params.pop_back();
+
+            MCSymbolWasm *WasmSym = cast<MCSymbolWasm>(Sym);
+            WasmSym->setReturns(std::move(Returns));
+            WasmSym->setParams(std::move(Params));
+            WasmSym->setIsFunction(true);
+
+            const MCExpr *Expr =
+                MCSymbolRefExpr::create(WasmSym,
+                                        MCSymbolRefExpr::VK_WebAssembly_TYPEINDEX,
+                                        Ctx);
+            MCOp = MCOperand::createExpr(Expr);
+            break;
+          }
+        }
+      }
       MCOp = MCOperand::createImm(MO.getImm());
       break;
     case MachineOperand::MO_FPImmediate: {
diff --git a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
index ab4ba1c28d53c6ab4e8448acbde44a99443a30aa..d1d2794c3b8f6bbe374276aa536c336af9f1a9e6 100644
--- a/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
+++ b/lib/Target/WebAssembly/WebAssemblyMCInstLower.h
@@ -20,7 +20,7 @@
 #include "llvm/Support/Compiler.h"
 
 namespace llvm {
-class AsmPrinter;
+class WebAssemblyAsmPrinter;
 class MCContext;
 class MCSymbol;
 class MachineInstr;
@@ -29,7 +29,7 @@ class MachineOperand;
 /// This class is used to lower an MachineInstr into an MCInst.
 class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower {
   MCContext &Ctx;
-  AsmPrinter &Printer;
+  WebAssemblyAsmPrinter &Printer;
 
   MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const;
   MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const;
@@ -37,7 +37,7 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyMCInstLower {
                                bool IsFunc) const;
 
 public:
-  WebAssemblyMCInstLower(MCContext &ctx, AsmPrinter &printer)
+  WebAssemblyMCInstLower(MCContext &ctx, WebAssemblyAsmPrinter &printer)
       : Ctx(ctx), Printer(printer) {}
   void Lower(const MachineInstr *MI, MCInst &OutMI) const;
 };
diff --git a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index 756619bebbed0e1d4b2e0b3b98a8cda0028c8b3c..1fcbb7791d4e5f7ccff9f3ae3587d483cc8eba23 100644
--- a/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -60,6 +60,8 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
   void addResult(MVT VT) { Results.push_back(VT); }
   const std::vector<MVT> &getResults() const { return Results; }
 
+  void setNumLocals(size_t NumLocals) { Locals.resize(NumLocals, MVT::i32); }
+  void setLocal(size_t i, MVT VT) { Locals[i] = VT; }
   void addLocal(MVT VT) { Locals.push_back(VT); }
   const std::vector<MVT> &getLocals() const { return Locals; }
 
diff --git a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
index 1f2c4cecfe50cbd6b8738e0b9b6ecd8f19a5e3f0..d2fbc5a22308fdc30f63717c85c24b60b5c3ff54 100644
--- a/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@@ -80,19 +80,31 @@ static bool MaybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB,
     return false;
   if (&MBB != &MF.back())
     return false;
-  if (&MI != &MBB.back())
-    return false;
+  if (MF.getSubtarget<WebAssemblySubtarget>()
+        .getTargetTriple().isOSBinFormatELF()) {
+    if (&MI != &MBB.back())
+      return false;
+  } else {
+    MachineBasicBlock::iterator End = MBB.end();
+    --End;
+    assert(End->getOpcode() == WebAssembly::END_FUNCTION);
+    --End;
+    if (&MI != &*End)
+      return false;
+  }
 
-  // If the operand isn't stackified, insert a COPY to read the operand and
-  // stackify it.
-  MachineOperand &MO = MI.getOperand(0);
-  unsigned Reg = MO.getReg();
-  if (!MFI.isVRegStackified(Reg)) {
-    unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
-    BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(CopyLocalOpc), NewReg)
-        .addReg(Reg);
-    MO.setReg(NewReg);
-    MFI.stackifyVReg(NewReg);
+  if (FallthroughOpc != WebAssembly::FALLTHROUGH_RETURN_VOID) {
+    // If the operand isn't stackified, insert a COPY to read the operand and
+    // stackify it.
+    MachineOperand &MO = MI.getOperand(0);
+    unsigned Reg = MO.getReg();
+    if (!MFI.isVRegStackified(Reg)) {
+      unsigned NewReg = MRI.createVirtualRegister(MRI.getRegClass(Reg));
+      BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(CopyLocalOpc), NewReg)
+          .addReg(Reg);
+      MO.setReg(NewReg);
+      MFI.stackifyVReg(NewReg);
+    }
   }
 
   // Rewrite the return.
@@ -188,9 +200,9 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
             WebAssembly::COPY_V128);
         break;
       case WebAssembly::RETURN_VOID:
-        if (!DisableWebAssemblyFallthroughReturnOpt &&
-            &MBB == &MF.back() && &MI == &MBB.back())
-          MI.setDesc(TII.get(WebAssembly::FALLTHROUGH_RETURN_VOID));
+        Changed |= MaybeRewriteToFallthrough(
+            MI, MBB, MF, MFI, MRI, TII, WebAssembly::FALLTHROUGH_RETURN_VOID,
+            WebAssembly::INSTRUCTION_LIST_END);
         break;
       }
 
diff --git a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index 32ee09e457960eabf53395e74f5597aba2ff0042..57d454746b06849282141fbd0d85426efbf0c28f 100644
--- a/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -30,6 +30,7 @@
 #include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfoImpls.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/Support/Debug.h"
@@ -152,7 +153,7 @@ static void QueryCallee(const MachineInstr &MI, unsigned CalleeOpNo, bool &Read,
 }
 
 // Determine whether MI reads memory, writes memory, has side effects,
-// and/or uses the __stack_pointer value.
+// and/or uses the stack pointer value.
 static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
                   bool &Write, bool &Effects, bool &StackPointer) {
   assert(!MI.isPosition());
@@ -169,15 +170,28 @@ static void Query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
   if (MI.mayStore()) {
     Write = true;
 
-    // Check for stores to __stack_pointer.
-    for (auto MMO : MI.memoperands()) {
-      const MachinePointerInfo &MPI = MMO->getPointerInfo();
-      if (MPI.V.is<const PseudoSourceValue *>()) {
-        auto PSV = MPI.V.get<const PseudoSourceValue *>();
-        if (const ExternalSymbolPseudoSourceValue *EPSV =
-                dyn_cast<ExternalSymbolPseudoSourceValue>(PSV))
-          if (StringRef(EPSV->getSymbol()) == "__stack_pointer")
-            StackPointer = true;
+    const MachineFunction &MF = *MI.getParent()->getParent();
+    if (MF.getSubtarget<WebAssemblySubtarget>()
+          .getTargetTriple().isOSBinFormatELF()) {
+      // Check for stores to __stack_pointer.
+      for (auto MMO : MI.memoperands()) {
+        const MachinePointerInfo &MPI = MMO->getPointerInfo();
+        if (MPI.V.is<const PseudoSourceValue *>()) {
+          auto PSV = MPI.V.get<const PseudoSourceValue *>();
+          if (const ExternalSymbolPseudoSourceValue *EPSV =
+                  dyn_cast<ExternalSymbolPseudoSourceValue>(PSV))
+            if (StringRef(EPSV->getSymbol()) == "__stack_pointer")
+              StackPointer = true;
+        }
+      }
+    } else {
+      // Check for set_global writes to the stack-pointer global.
+      const MachineModuleInfoWasm &MMIW =
+          MF.getMMI().getObjFileInfo<MachineModuleInfoWasm>();
+      if ((MI.getOpcode() == WebAssembly::SET_GLOBAL_I32 ||
+           MI.getOpcode() == WebAssembly::SET_GLOBAL_I64) &&
+          MI.getOperand(0).getImm() == MMIW.getStackPointerGlobal()) {
+        StackPointer = true;
+      }
     }
   } else if (MI.hasOrderedMemoryRef()) {
diff --git a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c02ef4a1c399b85c04e39b5b0d62d67eb76ced5d
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -0,0 +1,1302 @@
+// WebAssemblyRuntimeLibcallSignatures.cpp - Libcall Signatures -*- C++ -*-
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file contains signature information for runtime libcalls.
+///
+/// CodeGen uses external symbols, which it refers to by name. The WebAssembly
+/// target needs type information for all functions. This file contains a big
+/// table providing type signatures for all runtime library functions that LLVM
+/// uses.
+///
+/// This is currently a fairly heavy-handed solution.
+///
+//===----------------------------------------------------------------------===//
+
+#include "WebAssemblyRuntimeLibcallSignatures.h"
+#include "WebAssemblySubtarget.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+
+using namespace llvm;
+
+namespace {
+
+enum RuntimeLibcallSignature { // Shapes used by RuntimeLibcallSignatures.
+  func, // Names appear to read <result types>_func_<param types>.
+  f32_func_f32,
+  f32_func_f64,
+  f32_func_i32,
+  f32_func_i64,
+  f32_func_i16,
+  f64_func_f32,
+  f64_func_f64,
+  f64_func_i32,
+  f64_func_i64,
+  i32_func_f32,
+  i32_func_f64,
+  i32_func_i32,
+  i64_func_f32,
+  i64_func_f64,
+  i64_func_i64,
+  f32_func_f32_f32,
+  f32_func_f32_i32,
+  f32_func_i64_i64,
+  f64_func_f64_f64,
+  f64_func_f64_i32,
+  f64_func_i64_i64,
+  i16_func_f32,
+  i8_func_i8_i8,
+  func_f32_iPTR_iPTR,
+  func_f64_iPTR_iPTR,
+  i16_func_i16_i16,
+  i32_func_f32_f32,
+  i32_func_f64_f64,
+  i32_func_i32_i32,
+  i64_func_i64_i64,
+  i64_i64_func_f32,
+  i64_i64_func_f64,
+  i16_i16_func_i16_i16,
+  i32_i32_func_i32_i32,
+  i64_i64_func_i64_i64,
+  i64_i64_func_i64_i64_i64_i64,
+  i64_i64_i64_i64_func_i64_i64_i64_i64,
+  i64_i64_func_i64_i64_i32,
+  iPTR_func_iPTR_i32_iPTR,
+  iPTR_func_iPTR_iPTR_iPTR,
+  f32_func_f32_f32_f32,
+  f64_func_f64_f64_f64,
+  func_i64_i64_iPTR_iPTR,
+  func_iPTR_f32,
+  func_iPTR_f64,
+  func_iPTR_i32,
+  func_iPTR_i64,
+  func_iPTR_i64_i64,
+  func_iPTR_i64_i64_i64_i64,
+  func_iPTR_i64_i64_i64_i64_i64_i64,
+  i32_func_i64_i64,
+  i32_func_i64_i64_i64_i64,
+  unsupported // Placeholder for table slots that are given no signature.
+};
+
+} // end anonymous namespace
+
+static const RuntimeLibcallSignature
+RuntimeLibcallSignatures[RTLIB::UNKNOWN_LIBCALL] = {
+// Integer. (Entries are positional; /* NAME */ labels are only comments.)
+/* SHL_I16 */ i16_func_i16_i16,
+/* SHL_I32 */ i32_func_i32_i32,
+/* SHL_I64 */ i64_func_i64_i64,
+/* SHL_I128 */ i64_i64_func_i64_i64_i32,
+/* SRL_I16 */ i16_func_i16_i16,
+/* SRL_I32 */ i32_func_i32_i32,
+/* SRL_I64 */ i64_func_i64_i64,
+/* SRL_I128 */ i64_i64_func_i64_i64_i32,
+/* SRA_I16 */ i16_func_i16_i16,
+/* SRA_I32 */ i32_func_i32_i32,
+/* SRA_I64 */ i64_func_i64_i64,
+/* SRA_I128 */ i64_i64_func_i64_i64_i32,
+/* MUL_I8 */ i8_func_i8_i8,
+/* MUL_I16 */ i16_func_i16_i16,
+/* MUL_I32 */ i32_func_i32_i32,
+/* MUL_I64 */ i64_func_i64_i64,
+/* MUL_I128 */ i64_i64_func_i64_i64_i64_i64,
+/* MULO_I32 */ i32_func_i32_i32,
+/* MULO_I64 */ i64_func_i64_i64,
+/* MULO_I128 */ i64_i64_func_i64_i64_i64_i64,
+/* SDIV_I8 */ i8_func_i8_i8,
+/* SDIV_I16 */ i16_func_i16_i16,
+/* SDIV_I32 */ i32_func_i32_i32,
+/* SDIV_I64 */ i64_func_i64_i64,
+/* SDIV_I128 */ i64_i64_func_i64_i64_i64_i64,
+/* UDIV_I8 */ i8_func_i8_i8,
+/* UDIV_I16 */ i16_func_i16_i16,
+/* UDIV_I32 */ i32_func_i32_i32,
+/* UDIV_I64 */ i64_func_i64_i64,
+/* UDIV_I128 */ i64_i64_func_i64_i64_i64_i64,
+/* SREM_I8 */ i8_func_i8_i8,
+/* SREM_I16 */ i16_func_i16_i16,
+/* SREM_I32 */ i32_func_i32_i32,
+/* SREM_I64 */ i64_func_i64_i64,
+/* SREM_I128 */ i64_i64_func_i64_i64_i64_i64,
+/* UREM_I8 */ i8_func_i8_i8,
+/* UREM_I16 */ i16_func_i16_i16,
+/* UREM_I32 */ i32_func_i32_i32,
+/* UREM_I64 */ i64_func_i64_i64,
+/* UREM_I128 */ i64_i64_func_i64_i64_i64_i64,
+/* SDIVREM_I8 */ i8_func_i8_i8,
+/* SDIVREM_I16 */ i16_i16_func_i16_i16,
+/* SDIVREM_I32 */ i32_i32_func_i32_i32,
+/* SDIVREM_I64 */ i64_func_i64_i64,
+/* SDIVREM_I128 */ i64_i64_i64_i64_func_i64_i64_i64_i64,
+/* UDIVREM_I8 */ i8_func_i8_i8,
+/* UDIVREM_I16 */ i16_i16_func_i16_i16,
+/* UDIVREM_I32 */ i32_i32_func_i32_i32,
+/* UDIVREM_I64 */ i64_i64_func_i64_i64,
+/* UDIVREM_I128 */ i64_i64_i64_i64_func_i64_i64_i64_i64,
+/* NEG_I32 */ i32_func_i32,
+/* NEG_I64 */ i64_func_i64,
+
+// FLOATING POINT
+/* ADD_F32 */ f32_func_f32_f32,
+/* ADD_F64 */ f64_func_f64_f64,
+/* ADD_F80 */ unsupported,
+/* ADD_F128 */ func_iPTR_i64_i64_i64_i64,
+/* ADD_PPCF128 */ unsupported,
+/* SUB_F32 */ f32_func_f32_f32,
+/* SUB_F64 */ f64_func_f64_f64,
+/* SUB_F80 */ unsupported,
+/* SUB_F128 */ func_iPTR_i64_i64_i64_i64,
+/* SUB_PPCF128 */ unsupported,
+/* MUL_F32 */ f32_func_f32_f32,
+/* MUL_F64 */ f64_func_f64_f64,
+/* MUL_F80 */ unsupported,
+/* MUL_F128 */ func_iPTR_i64_i64_i64_i64,
+/* MUL_PPCF128 */ unsupported,
+/* DIV_F32 */ f32_func_f32_f32,
+/* DIV_F64 */ f64_func_f64_f64,
+/* DIV_F80 */ unsupported,
+/* DIV_F128 */ func_iPTR_i64_i64_i64_i64,
+/* DIV_PPCF128 */ unsupported,
+/* REM_F32 */ f32_func_f32_f32,
+/* REM_F64 */ f64_func_f64_f64,
+/* REM_F80 */ unsupported,
+/* REM_F128 */ func_iPTR_i64_i64_i64_i64,
+/* REM_PPCF128 */ unsupported,
+/* FMA_F32 */ f32_func_f32_f32_f32,
+/* FMA_F64 */ f64_func_f64_f64_f64,
+/* FMA_F80 */ unsupported,
+/* FMA_F128 */ func_iPTR_i64_i64_i64_i64_i64_i64,
+/* FMA_PPCF128 */ unsupported,
+/* POWI_F32 */ f32_func_f32_i32,
+/* POWI_F64 */ f64_func_f64_i32,
+/* POWI_F80 */ unsupported,
+/* POWI_F128 */ func_iPTR_i64_i64_i64_i64,
+/* POWI_PPCF128 */ unsupported,
+/* SQRT_F32 */ f32_func_f32,
+/* SQRT_F64 */ f64_func_f64,
+/* SQRT_F80 */ unsupported,
+/* SQRT_F128 */ func_iPTR_i64_i64,
+/* SQRT_PPCF128 */ unsupported,
+/* LOG_F32 */ f32_func_f32,
+/* LOG_F64 */ f64_func_f64,
+/* LOG_F80 */ unsupported,
+/* LOG_F128 */ func_iPTR_i64_i64,
+/* LOG_PPCF128 */ unsupported,
+/* LOG2_F32 */ f32_func_f32,
+/* LOG2_F64 */ f64_func_f64,
+/* LOG2_F80 */ unsupported,
+/* LOG2_F128 */ func_iPTR_i64_i64,
+/* LOG2_PPCF128 */ unsupported,
+/* LOG10_F32 */ f32_func_f32,
+/* LOG10_F64 */ f64_func_f64,
+/* LOG10_F80 */ unsupported,
+/* LOG10_F128 */ func_iPTR_i64_i64,
+/* LOG10_PPCF128 */ unsupported,
+/* EXP_F32 */ f32_func_f32,
+/* EXP_F64 */ f64_func_f64,
+/* EXP_F80 */ unsupported,
+/* EXP_F128 */ func_iPTR_i64_i64,
+/* EXP_PPCF128 */ unsupported,
+/* EXP2_F32 */ f32_func_f32,
+/* EXP2_F64 */ f64_func_f64,
+/* EXP2_F80 */ unsupported,
+/* EXP2_F128 */ func_iPTR_i64_i64,
+/* EXP2_PPCF128 */ unsupported,
+/* SIN_F32 */ f32_func_f32,
+/* SIN_F64 */ f64_func_f64,
+/* SIN_F80 */ unsupported,
+/* SIN_F128 */ func_iPTR_i64_i64,
+/* SIN_PPCF128 */ unsupported,
+/* COS_F32 */ f32_func_f32,
+/* COS_F64 */ f64_func_f64,
+/* COS_F80 */ unsupported,
+/* COS_F128 */ func_iPTR_i64_i64,
+/* COS_PPCF128 */ unsupported,
+/* SINCOS_F32 */ func_f32_iPTR_iPTR,
+/* SINCOS_F64 */ func_f64_iPTR_iPTR,
+/* SINCOS_F80 */ unsupported,
+/* SINCOS_F128 */ func_i64_i64_iPTR_iPTR,
+/* SINCOS_PPCF128 */ unsupported,
+/* POW_F32 */ f32_func_f32_f32,
+/* POW_F64 */ f64_func_f64_f64,
+/* POW_F80 */ unsupported,
+/* POW_F128 */ func_iPTR_i64_i64_i64_i64,
+/* POW_PPCF128 */ unsupported,
+/* CEIL_F32 */ f32_func_f32,
+/* CEIL_F64 */ f64_func_f64,
+/* CEIL_F80 */ unsupported,
+/* CEIL_F128 */ func_iPTR_i64_i64,
+/* CEIL_PPCF128 */ unsupported,
+/* TRUNC_F32 */ f32_func_f32,
+/* TRUNC_F64 */ f64_func_f64,
+/* TRUNC_F80 */ unsupported,
+/* TRUNC_F128 */ func_iPTR_i64_i64,
+/* TRUNC_PPCF128 */ unsupported,
+/* RINT_F32 */ f32_func_f32,
+/* RINT_F64 */ f64_func_f64,
+/* RINT_F80 */ unsupported,
+/* RINT_F128 */ func_iPTR_i64_i64,
+/* RINT_PPCF128 */ unsupported,
+/* NEARBYINT_F32 */ f32_func_f32,
+/* NEARBYINT_F64 */ f64_func_f64,
+/* NEARBYINT_F80 */ unsupported,
+/* NEARBYINT_F128 */ func_iPTR_i64_i64,
+/* NEARBYINT_PPCF128 */ unsupported,
+/* ROUND_F32 */ f32_func_f32,
+/* ROUND_F64 */ f64_func_f64,
+/* ROUND_F80 */ unsupported,
+/* ROUND_F128 */ func_iPTR_i64_i64,
+/* ROUND_PPCF128 */ unsupported,
+/* FLOOR_F32 */ f32_func_f32,
+/* FLOOR_F64 */ f64_func_f64,
+/* FLOOR_F80 */ unsupported,
+/* FLOOR_F128 */ func_iPTR_i64_i64,
+/* FLOOR_PPCF128 */ unsupported,
+/* COPYSIGN_F32 */ f32_func_f32_f32,
+/* COPYSIGN_F64 */ f64_func_f64_f64,
+/* COPYSIGN_F80 */ unsupported,
+/* COPYSIGN_F128 */ func_iPTR_i64_i64_i64_i64,
+/* COPYSIGN_PPCF128 */ unsupported,
+/* FMIN_F32 */ f32_func_f32_f32,
+/* FMIN_F64 */ f64_func_f64_f64,
+/* FMIN_F80 */ unsupported,
+/* FMIN_F128 */ func_iPTR_i64_i64_i64_i64,
+/* FMIN_PPCF128 */ unsupported,
+/* FMAX_F32 */ f32_func_f32_f32,
+/* FMAX_F64 */ f64_func_f64_f64,
+/* FMAX_F80 */ unsupported,
+/* FMAX_F128 */ func_iPTR_i64_i64_i64_i64,
+/* FMAX_PPCF128 */ unsupported,
+
+// CONVERSION
+/* FPEXT_F32_PPCF128 */ unsupported,
+/* FPEXT_F64_PPCF128 */ unsupported,
+/* FPEXT_F64_F128 */ func_iPTR_f64,
+/* FPEXT_F32_F128 */ func_iPTR_f32,
+/* FPEXT_F32_F64 */ f64_func_f32,
+/* FPEXT_F16_F32 */ f32_func_i16,
+/* FPROUND_F32_F16 */ i16_func_f32,
+/* FPROUND_F64_F16 */ unsupported,
+/* FPROUND_F80_F16 */ unsupported,
+/* FPROUND_F128_F16 */ unsupported,
+/* FPROUND_PPCF128_F16 */ unsupported,
+/* FPROUND_F64_F32 */ f32_func_f64,
+/* FPROUND_F80_F32 */ unsupported,
+/* FPROUND_F128_F32 */ f32_func_i64_i64,
+/* FPROUND_PPCF128_F32 */ unsupported,
+/* FPROUND_F80_F64 */ unsupported,
+/* FPROUND_F128_F64 */ f64_func_i64_i64,
+/* FPROUND_PPCF128_F64 */ unsupported,
+/* FPTOSINT_F32_I32 */ i32_func_f32,
+/* FPTOSINT_F32_I64 */ i64_func_f32,
+/* FPTOSINT_F32_I128 */ i64_i64_func_f32,
+/* FPTOSINT_F64_I32 */ i32_func_f64,
+/* FPTOSINT_F64_I64 */ i64_func_f64,
+/* FPTOSINT_F64_I128 */ i64_i64_func_f64,
+/* FPTOSINT_F80_I32 */ unsupported,
+/* FPTOSINT_F80_I64 */ unsupported,
+/* FPTOSINT_F80_I128 */ unsupported,
+/* FPTOSINT_F128_I32 */ i32_func_i64_i64,
+/* FPTOSINT_F128_I64 */ i64_func_i64_i64,
+/* FPTOSINT_F128_I128 */ i64_i64_func_i64_i64,
+/* FPTOSINT_PPCF128_I32 */ unsupported,
+/* FPTOSINT_PPCF128_I64 */ unsupported,
+/* FPTOSINT_PPCF128_I128 */ unsupported,
+/* FPTOUINT_F32_I32 */ i32_func_f32,
+/* FPTOUINT_F32_I64 */ i64_func_f32,
+/* FPTOUINT_F32_I128 */ i64_i64_func_f32,
+/* FPTOUINT_F64_I32 */ i32_func_f64,
+/* FPTOUINT_F64_I64 */ i64_func_f64,
+/* FPTOUINT_F64_I128 */ i64_i64_func_f64,
+/* FPTOUINT_F80_I32 */ unsupported,
+/* FPTOUINT_F80_I64 */ unsupported,
+/* FPTOUINT_F80_I128 */ unsupported,
+/* FPTOUINT_F128_I32 */ i32_func_i64_i64,
+/* FPTOUINT_F128_I64 */ i64_func_i64_i64,
+/* FPTOUINT_F128_I128 */ i64_i64_func_i64_i64,
+/* FPTOUINT_PPCF128_I32 */ unsupported,
+/* FPTOUINT_PPCF128_I64 */ unsupported,
+/* FPTOUINT_PPCF128_I128 */ unsupported,
+/* SINTTOFP_I32_F32 */ f32_func_i32,
+/* SINTTOFP_I32_F64 */ f64_func_i32,
+/* SINTTOFP_I32_F80 */ unsupported,
+/* SINTTOFP_I32_F128 */ func_iPTR_i32,
+/* SINTTOFP_I32_PPCF128 */ unsupported,
+/* SINTTOFP_I64_F32 */ f32_func_i64,
+/* SINTTOFP_I64_F64 */ f64_func_i64,
+/* SINTTOFP_I64_F80 */ unsupported,
+/* SINTTOFP_I64_F128 */ func_iPTR_i64,
+/* SINTTOFP_I64_PPCF128 */ unsupported,
+/* SINTTOFP_I128_F32 */ f32_func_i64_i64,
+/* SINTTOFP_I128_F64 */ f64_func_i64_i64,
+/* SINTTOFP_I128_F80 */ unsupported,
+/* SINTTOFP_I128_F128 */ func_iPTR_i64_i64,
+/* SINTTOFP_I128_PPCF128 */ unsupported,
+/* UINTTOFP_I32_F32 */ f32_func_i32,
+/* UINTTOFP_I32_F64 */ f64_func_i32,
+/* UINTTOFP_I32_F80 */ unsupported,
+/* UINTTOFP_I32_F128 */ func_iPTR_i32,
+/* UINTTOFP_I32_PPCF128 */ unsupported,
+/* UINTTOFP_I64_F32 */ f32_func_i64,
+/* UINTTOFP_I64_F64 */ f64_func_i64,
+/* UINTTOFP_I64_F80 */ unsupported,
+/* UINTTOFP_I64_F128 */ func_iPTR_i64,
+/* UINTTOFP_I64_PPCF128 */ unsupported,
+/* UINTTOFP_I128_F32 */ f32_func_i64_i64,
+/* UINTTOFP_I128_F64 */ f64_func_i64_i64,
+/* UINTTOFP_I128_F80 */ unsupported,
+/* UINTTOFP_I128_F128 */ func_iPTR_i64_i64,
+/* UINTTOFP_I128_PPCF128 */ unsupported,
+
+// COMPARISON
+/* OEQ_F32 */ i32_func_f32_f32,
+/* OEQ_F64 */ i32_func_f64_f64,
+/* OEQ_F128 */ i32_func_i64_i64_i64_i64,
+/* OEQ_PPCF128 */ unsupported,
+/* UNE_F32 */ i32_func_f32_f32,
+/* UNE_F64 */ i32_func_f64_f64,
+/* UNE_F128 */ i32_func_i64_i64_i64_i64,
+/* UNE_PPCF128 */ unsupported,
+/* OGE_F32 */ i32_func_f32_f32,
+/* OGE_F64 */ i32_func_f64_f64,
+/* OGE_F128 */ i32_func_i64_i64_i64_i64,
+/* OGE_PPCF128 */ unsupported,
+/* OLT_F32 */ i32_func_f32_f32,
+/* OLT_F64 */ i32_func_f64_f64,
+/* OLT_F128 */ i32_func_i64_i64_i64_i64,
+/* OLT_PPCF128 */ unsupported,
+/* OLE_F32 */ i32_func_f32_f32,
+/* OLE_F64 */ i32_func_f64_f64,
+/* OLE_F128 */ i32_func_i64_i64_i64_i64,
+/* OLE_PPCF128 */ unsupported,
+/* OGT_F32 */ i32_func_f32_f32,
+/* OGT_F64 */ i32_func_f64_f64,
+/* OGT_F128 */ i32_func_i64_i64_i64_i64,
+/* OGT_PPCF128 */ unsupported,
+/* UO_F32 */ i32_func_f32_f32,
+/* UO_F64 */ i32_func_f64_f64,
+/* UO_F128 */ i32_func_i64_i64_i64_i64,
+/* UO_PPCF128 */ unsupported,
+/* O_F32 */ i32_func_f32_f32,
+/* O_F64 */ i32_func_f64_f64,
+/* O_F128 */ i32_func_i64_i64_i64_i64,
+/* O_PPCF128 */ unsupported,
+
+// MEMORY
+/* MEMCPY */ iPTR_func_iPTR_iPTR_iPTR,
+/* MEMSET */ iPTR_func_iPTR_i32_iPTR,
+/* MEMMOVE */ iPTR_func_iPTR_iPTR_iPTR,
+
+// ELEMENT-WISE ATOMIC MEMORY
+/* MEMCPY_ELEMENT_ATOMIC_1 */ iPTR_func_iPTR_iPTR_iPTR,
+/* MEMCPY_ELEMENT_ATOMIC_2 */ iPTR_func_iPTR_iPTR_iPTR,
+/* MEMCPY_ELEMENT_ATOMIC_4 */ iPTR_func_iPTR_iPTR_iPTR,
+/* MEMCPY_ELEMENT_ATOMIC_8 */ iPTR_func_iPTR_iPTR_iPTR,
+/* MEMCPY_ELEMENT_ATOMIC_16 */ iPTR_func_iPTR_iPTR_iPTR,
+
+// EXCEPTION HANDLING
+/* UNWIND_RESUME */ unsupported,
+
+// Note: there are two sets of atomics libcalls; see
+// <http://llvm.org/docs/Atomics.html> for more info on the
+// difference between them.
+
+// Atomic '__sync_*' libcalls.
+/* SYNC_VAL_COMPARE_AND_SWAP_1 */ unsupported,
+/* SYNC_VAL_COMPARE_AND_SWAP_2 */ unsupported,
+/* SYNC_VAL_COMPARE_AND_SWAP_4 */ unsupported,
+/* SYNC_VAL_COMPARE_AND_SWAP_8 */ unsupported,
+/* SYNC_VAL_COMPARE_AND_SWAP_16 */ unsupported,
+/* SYNC_LOCK_TEST_AND_SET_1 */ unsupported,
+/* SYNC_LOCK_TEST_AND_SET_2 */ unsupported,
+/* SYNC_LOCK_TEST_AND_SET_4 */ unsupported,
+/* SYNC_LOCK_TEST_AND_SET_8 */ unsupported,
+/* SYNC_LOCK_TEST_AND_SET_16 */ unsupported,
+/* SYNC_FETCH_AND_ADD_1 */ unsupported,
+/* SYNC_FETCH_AND_ADD_2 */ unsupported,
+/* SYNC_FETCH_AND_ADD_4 */ unsupported,
+/* SYNC_FETCH_AND_ADD_8 */ unsupported,
+/* SYNC_FETCH_AND_ADD_16 */ unsupported,
+/* SYNC_FETCH_AND_SUB_1 */ unsupported,
+/* SYNC_FETCH_AND_SUB_2 */ unsupported,
+/* SYNC_FETCH_AND_SUB_4 */ unsupported,
+/* SYNC_FETCH_AND_SUB_8 */ unsupported,
+/* SYNC_FETCH_AND_SUB_16 */ unsupported,
+/* SYNC_FETCH_AND_AND_1 */ unsupported,
+/* SYNC_FETCH_AND_AND_2 */ unsupported,
+/* SYNC_FETCH_AND_AND_4 */ unsupported,
+/* SYNC_FETCH_AND_AND_8 */ unsupported,
+/* SYNC_FETCH_AND_AND_16 */ unsupported,
+/* SYNC_FETCH_AND_OR_1 */ unsupported,
+/* SYNC_FETCH_AND_OR_2 */ unsupported,
+/* SYNC_FETCH_AND_OR_4 */ unsupported,
+/* SYNC_FETCH_AND_OR_8 */ unsupported,
+/* SYNC_FETCH_AND_OR_16 */ unsupported,
+/* SYNC_FETCH_AND_XOR_1 */ unsupported,
+/* SYNC_FETCH_AND_XOR_2 */ unsupported,
+/* SYNC_FETCH_AND_XOR_4 */ unsupported,
+/* SYNC_FETCH_AND_XOR_8 */ unsupported,
+/* SYNC_FETCH_AND_XOR_16 */ unsupported,
+/* SYNC_FETCH_AND_NAND_1 */ unsupported,
+/* SYNC_FETCH_AND_NAND_2 */ unsupported,
+/* SYNC_FETCH_AND_NAND_4 */ unsupported,
+/* SYNC_FETCH_AND_NAND_8 */ unsupported,
+/* SYNC_FETCH_AND_NAND_16 */ unsupported,
+/* SYNC_FETCH_AND_MAX_1 */ unsupported,
+/* SYNC_FETCH_AND_MAX_2 */ unsupported,
+/* SYNC_FETCH_AND_MAX_4 */ unsupported,
+/* SYNC_FETCH_AND_MAX_8 */ unsupported,
+/* SYNC_FETCH_AND_MAX_16 */ unsupported,
+/* SYNC_FETCH_AND_UMAX_1 */ unsupported,
+/* SYNC_FETCH_AND_UMAX_2 */ unsupported,
+/* SYNC_FETCH_AND_UMAX_4 */ unsupported,
+/* SYNC_FETCH_AND_UMAX_8 */ unsupported,
+/* SYNC_FETCH_AND_UMAX_16 */ unsupported,
+/* SYNC_FETCH_AND_MIN_1 */ unsupported,
+/* SYNC_FETCH_AND_MIN_2 */ unsupported,
+/* SYNC_FETCH_AND_MIN_4 */ unsupported,
+/* SYNC_FETCH_AND_MIN_8 */ unsupported,
+/* SYNC_FETCH_AND_MIN_16 */ unsupported,
+/* SYNC_FETCH_AND_UMIN_1 */ unsupported,
+/* SYNC_FETCH_AND_UMIN_2 */ unsupported,
+/* SYNC_FETCH_AND_UMIN_4 */ unsupported,
+/* SYNC_FETCH_AND_UMIN_8 */ unsupported,
+/* SYNC_FETCH_AND_UMIN_16 */ unsupported,
+
+// Atomic '__atomic_*' libcalls.
+/* ATOMIC_LOAD */ unsupported,
+/* ATOMIC_LOAD_1 */ unsupported,
+/* ATOMIC_LOAD_2 */ unsupported,
+/* ATOMIC_LOAD_4 */ unsupported,
+/* ATOMIC_LOAD_8 */ unsupported,
+/* ATOMIC_LOAD_16 */ unsupported,
+
+/* ATOMIC_STORE */ unsupported,
+/* ATOMIC_STORE_1 */ unsupported,
+/* ATOMIC_STORE_2 */ unsupported,
+/* ATOMIC_STORE_4 */ unsupported,
+/* ATOMIC_STORE_8 */ unsupported,
+/* ATOMIC_STORE_16 */ unsupported,
+
+/* ATOMIC_EXCHANGE */ unsupported,
+/* ATOMIC_EXCHANGE_1 */ unsupported,
+/* ATOMIC_EXCHANGE_2 */ unsupported,
+/* ATOMIC_EXCHANGE_4 */ unsupported,
+/* ATOMIC_EXCHANGE_8 */ unsupported,
+/* ATOMIC_EXCHANGE_16 */ unsupported,
+
+/* ATOMIC_COMPARE_EXCHANGE */ unsupported,
+/* ATOMIC_COMPARE_EXCHANGE_1 */ unsupported,
+/* ATOMIC_COMPARE_EXCHANGE_2 */ unsupported,
+/* ATOMIC_COMPARE_EXCHANGE_4 */ unsupported,
+/* ATOMIC_COMPARE_EXCHANGE_8 */ unsupported,
+/* ATOMIC_COMPARE_EXCHANGE_16 */ unsupported,
+
+/* ATOMIC_FETCH_ADD_1 */ unsupported,
+/* ATOMIC_FETCH_ADD_2 */ unsupported,
+/* ATOMIC_FETCH_ADD_4 */ unsupported,
+/* ATOMIC_FETCH_ADD_8 */ unsupported,
+/* ATOMIC_FETCH_ADD_16 */ unsupported,
+
+/* ATOMIC_FETCH_SUB_1 */ unsupported,
+/* ATOMIC_FETCH_SUB_2 */ unsupported,
+/* ATOMIC_FETCH_SUB_4 */ unsupported,
+/* ATOMIC_FETCH_SUB_8 */ unsupported,
+/* ATOMIC_FETCH_SUB_16 */ unsupported,
+
+/* ATOMIC_FETCH_AND_1 */ unsupported,
+/* ATOMIC_FETCH_AND_2 */ unsupported,
+/* ATOMIC_FETCH_AND_4 */ unsupported,
+/* ATOMIC_FETCH_AND_8 */ unsupported,
+/* ATOMIC_FETCH_AND_16 */ unsupported,
+
+/* ATOMIC_FETCH_OR_1 */ unsupported,
+/* ATOMIC_FETCH_OR_2 */ unsupported,
+/* ATOMIC_FETCH_OR_4 */ unsupported,
+/* ATOMIC_FETCH_OR_8 */ unsupported,
+/* ATOMIC_FETCH_OR_16 */ unsupported,
+
+/* ATOMIC_FETCH_XOR_1 */ unsupported,
+/* ATOMIC_FETCH_XOR_2 */ unsupported,
+/* ATOMIC_FETCH_XOR_4 */ unsupported,
+/* ATOMIC_FETCH_XOR_8 */ unsupported,
+/* ATOMIC_FETCH_XOR_16 */ unsupported,
+
+/* ATOMIC_FETCH_NAND_1 */ unsupported,
+/* ATOMIC_FETCH_NAND_2 */ unsupported,
+/* ATOMIC_FETCH_NAND_4 */ unsupported,
+/* ATOMIC_FETCH_NAND_8 */ unsupported,
+/* ATOMIC_FETCH_NAND_16 */ unsupported,
+
+// Stack Protector Fail.
+/* STACKPROTECTOR_CHECK_FAIL */ func,
+
+// Deoptimization.
+/* DEOPTIMIZE */ unsupported,
+
+};
+
+static const char *
+RuntimeLibcallNames[RTLIB::UNKNOWN_LIBCALL] = { // nullptr = no name
+/* SHL_I16 */ "__ashlhi3",
+/* SHL_I32 */ "__ashlsi3",
+/* SHL_I64 */ "__ashldi3",
+/* SHL_I128 */ "__ashlti3",
+/* SRL_I16 */ "__lshrhi3",
+/* SRL_I32 */ "__lshrsi3",
+/* SRL_I64 */ "__lshrdi3",
+/* SRL_I128 */ "__lshrti3",
+/* SRA_I16 */ "__ashrhi3",
+/* SRA_I32 */ "__ashrsi3",
+/* SRA_I64 */ "__ashrdi3",
+/* SRA_I128 */ "__ashrti3",
+/* MUL_I8 */ "__mulqi3",
+/* MUL_I16 */ "__mulhi3",
+/* MUL_I32 */ "__mulsi3",
+/* MUL_I64 */ "__muldi3",
+/* MUL_I128 */ "__multi3",
+/* MULO_I32 */ "__mulosi4",
+/* MULO_I64 */ "__mulodi4",
+/* MULO_I128 */ "__muloti4",
+/* SDIV_I8 */ "__divqi3",
+/* SDIV_I16 */ "__divhi3",
+/* SDIV_I32 */ "__divsi3",
+/* SDIV_I64 */ "__divdi3",
+/* SDIV_I128 */ "__divti3",
+/* UDIV_I8 */ "__udivqi3",
+/* UDIV_I16 */ "__udivhi3",
+/* UDIV_I32 */ "__udivsi3",
+/* UDIV_I64 */ "__udivdi3",
+/* UDIV_I128 */ "__udivti3",
+/* SREM_I8 */ "__modqi3",
+/* SREM_I16 */ "__modhi3",
+/* SREM_I32 */ "__modsi3",
+/* SREM_I64 */ "__moddi3",
+/* SREM_I128 */ "__modti3",
+/* UREM_I8 */ "__umodqi3",
+/* UREM_I16 */ "__umodhi3",
+/* UREM_I32 */ "__umodsi3",
+/* UREM_I64 */ "__umoddi3",
+/* UREM_I128 */ "__umodti3",
+/* SDIVREM_I8 */ nullptr,
+/* SDIVREM_I16 */ nullptr,
+/* SDIVREM_I32 */ nullptr,
+/* SDIVREM_I64 */ nullptr,
+/* SDIVREM_I128 */ nullptr,
+/* UDIVREM_I8 */ nullptr,
+/* UDIVREM_I16 */ nullptr,
+/* UDIVREM_I32 */ nullptr,
+/* UDIVREM_I64 */ nullptr,
+/* UDIVREM_I128 */ nullptr,
+/* NEG_I32 */ "__negsi2",
+/* NEG_I64 */ "__negdi2",
+/* ADD_F32 */ "__addsf3",
+/* ADD_F64 */ "__adddf3",
+/* ADD_F80 */ nullptr,
+/* ADD_F128 */ "__addtf3",
+/* ADD_PPCF128 */ nullptr,
+/* SUB_F32 */ "__subsf3",
+/* SUB_F64 */ "__subdf3",
+/* SUB_F80 */ nullptr,
+/* SUB_F128 */ "__subtf3",
+/* SUB_PPCF128 */ nullptr,
+/* MUL_F32 */ "__mulsf3",
+/* MUL_F64 */ "__muldf3",
+/* MUL_F80 */ nullptr,
+/* MUL_F128 */ "__multf3",
+/* MUL_PPCF128 */ nullptr,
+/* DIV_F32 */ "__divsf3",
+/* DIV_F64 */ "__divdf3",
+/* DIV_F80 */ nullptr,
+/* DIV_F128 */ "__divtf3",
+/* DIV_PPCF128 */ nullptr,
+/* REM_F32 */ "fmodf",
+/* REM_F64 */ "fmod",
+/* REM_F80 */ nullptr,
+/* REM_F128 */ "fmodl",
+/* REM_PPCF128 */ nullptr,
+/* FMA_F32 */ "fmaf",
+/* FMA_F64 */ "fma",
+/* FMA_F80 */ nullptr,
+/* FMA_F128 */ "fmal",
+/* FMA_PPCF128 */ nullptr,
+/* POWI_F32 */ "__powisf2",
+/* POWI_F64 */ "__powidf2",
+/* POWI_F80 */ nullptr,
+/* POWI_F128 */ "__powitf2",
+/* POWI_PPCF128 */ nullptr,
+/* SQRT_F32 */ "sqrtf",
+/* SQRT_F64 */ "sqrt",
+/* SQRT_F80 */ nullptr,
+/* SQRT_F128 */ "sqrtl",
+/* SQRT_PPCF128 */ nullptr,
+/* LOG_F32 */ "logf",
+/* LOG_F64 */ "log",
+/* LOG_F80 */ nullptr,
+/* LOG_F128 */ "logl",
+/* LOG_PPCF128 */ nullptr,
+/* LOG2_F32 */ "log2f",
+/* LOG2_F64 */ "log2",
+/* LOG2_F80 */ nullptr,
+/* LOG2_F128 */ "log2l",
+/* LOG2_PPCF128 */ nullptr,
+/* LOG10_F32 */ "log10f",
+/* LOG10_F64 */ "log10",
+/* LOG10_F80 */ nullptr,
+/* LOG10_F128 */ "log10l",
+/* LOG10_PPCF128 */ nullptr,
+/* EXP_F32 */ "expf",
+/* EXP_F64 */ "exp",
+/* EXP_F80 */ nullptr,
+/* EXP_F128 */ "expl",
+/* EXP_PPCF128 */ nullptr,
+/* EXP2_F32 */ "exp2f",
+/* EXP2_F64 */ "exp2",
+/* EXP2_F80 */ nullptr,
+/* EXP2_F128 */ "exp2l",
+/* EXP2_PPCF128 */ nullptr,
+/* SIN_F32 */ "sinf",
+/* SIN_F64 */ "sin",
+/* SIN_F80 */ nullptr,
+/* SIN_F128 */ "sinl",
+/* SIN_PPCF128 */ nullptr,
+/* COS_F32 */ "cosf",
+/* COS_F64 */ "cos",
+/* COS_F80 */ nullptr,
+/* COS_F128 */ "cosl",
+/* COS_PPCF128 */ nullptr,
+/* SINCOS_F32 */ "sincosf",
+/* SINCOS_F64 */ "sincos",
+/* SINCOS_F80 */ nullptr,
+/* SINCOS_F128 */ "sincosl",
+/* SINCOS_PPCF128 */ nullptr,
+/* POW_F32 */ "powf",
+/* POW_F64 */ "pow",
+/* POW_F80 */ nullptr,
+/* POW_F128 */ "powl",
+/* POW_PPCF128 */ nullptr,
+/* CEIL_F32 */ "ceilf",
+/* CEIL_F64 */ "ceil",
+/* CEIL_F80 */ nullptr,
+/* CEIL_F128 */ "ceill",
+/* CEIL_PPCF128 */ nullptr,
+/* TRUNC_F32 */ "truncf",
+/* TRUNC_F64 */ "trunc",
+/* TRUNC_F80 */ nullptr,
+/* TRUNC_F128 */ "truncl",
+/* TRUNC_PPCF128 */ nullptr,
+/* RINT_F32 */ "rintf",
+/* RINT_F64 */ "rint",
+/* RINT_F80 */ nullptr,
+/* RINT_F128 */ "rintl",
+/* RINT_PPCF128 */ nullptr,
+/* NEARBYINT_F32 */ "nearbyintf",
+/* NEARBYINT_F64 */ "nearbyint",
+/* NEARBYINT_F80 */ nullptr,
+/* NEARBYINT_F128 */ "nearbyintl",
+/* NEARBYINT_PPCF128 */ nullptr,
+/* ROUND_F32 */ "roundf",
+/* ROUND_F64 */ "round",
+/* ROUND_F80 */ nullptr,
+/* ROUND_F128 */ "roundl",
+/* ROUND_PPCF128 */ nullptr,
+/* FLOOR_F32 */ "floorf",
+/* FLOOR_F64 */ "floor",
+/* FLOOR_F80 */ nullptr,
+/* FLOOR_F128 */ "floorl",
+/* FLOOR_PPCF128 */ nullptr,
+/* COPYSIGN_F32 */ "copysignf",
+/* COPYSIGN_F64 */ "copysign",
+/* COPYSIGN_F80 */ nullptr,
+/* COPYSIGN_F128 */ "copysignl",
+/* COPYSIGN_PPCF128 */ nullptr,
+/* FMIN_F32 */ "fminf",
+/* FMIN_F64 */ "fmin",
+/* FMIN_F80 */ nullptr,
+/* FMIN_F128 */ "fminl",
+/* FMIN_PPCF128 */ nullptr,
+/* FMAX_F32 */ "fmaxf",
+/* FMAX_F64 */ "fmax",
+/* FMAX_F80 */ nullptr,
+/* FMAX_F128 */ "fmaxl",
+/* FMAX_PPCF128 */ nullptr,
+/* FPEXT_F32_PPCF128 */ nullptr,
+/* FPEXT_F64_PPCF128 */ nullptr,
+/* FPEXT_F64_F128 */ "__extenddftf2",
+/* FPEXT_F32_F128 */ "__extendsftf2",
+/* FPEXT_F32_F64 */ "__extendsfdf2",
+/* FPEXT_F16_F32 */ "__gnu_h2f_ieee",
+/* FPROUND_F32_F16 */ "__gnu_f2h_ieee",
+/* FPROUND_F64_F16 */ nullptr,
+/* FPROUND_F80_F16 */ nullptr,
+/* FPROUND_F128_F16 */ nullptr,
+/* FPROUND_PPCF128_F16 */ nullptr,
+/* FPROUND_F64_F32 */ "__truncdfsf2",
+/* FPROUND_F80_F32 */ "__truncxfsf2",
+/* FPROUND_F128_F32 */ "__trunctfsf2",
+/* FPROUND_PPCF128_F32 */ nullptr,
+/* FPROUND_F80_F64 */ "__truncxfdf2",
+/* FPROUND_F128_F64 */ "__trunctfdf2",
+/* FPROUND_PPCF128_F64 */ nullptr,
+/* FPTOSINT_F32_I32 */ "__fixsfsi",
+/* FPTOSINT_F32_I64 */ "__fixsfdi",
+/* FPTOSINT_F32_I128 */ "__fixsfti",
+/* FPTOSINT_F64_I32 */ "__fixdfsi",
+/* FPTOSINT_F64_I64 */ "__fixdfdi",
+/* FPTOSINT_F64_I128 */ "__fixdfti",
+/* FPTOSINT_F80_I32 */ "__fixxfsi",
+/* FPTOSINT_F80_I64 */ "__fixxfdi",
+/* FPTOSINT_F80_I128 */ "__fixxfti",
+/* FPTOSINT_F128_I32 */ "__fixtfsi",
+/* FPTOSINT_F128_I64 */ "__fixtfdi",
+/* FPTOSINT_F128_I128 */ "__fixtfti",
+/* FPTOSINT_PPCF128_I32 */ nullptr,
+/* FPTOSINT_PPCF128_I64 */ nullptr,
+/* FPTOSINT_PPCF128_I128 */ nullptr,
+/* FPTOUINT_F32_I32 */ "__fixunssfsi",
+/* FPTOUINT_F32_I64 */ "__fixunssfdi",
+/* FPTOUINT_F32_I128 */ "__fixunssfti",
+/* FPTOUINT_F64_I32 */ "__fixunsdfsi",
+/* FPTOUINT_F64_I64 */ "__fixunsdfdi",
+/* FPTOUINT_F64_I128 */ "__fixunsdfti",
+/* FPTOUINT_F80_I32 */ "__fixunsxfsi",
+/* FPTOUINT_F80_I64 */ "__fixunsxfdi",
+/* FPTOUINT_F80_I128 */ "__fixunsxfti",
+/* FPTOUINT_F128_I32 */ "__fixunstfsi",
+/* FPTOUINT_F128_I64 */ "__fixunstfdi",
+/* FPTOUINT_F128_I128 */ "__fixunstfti",
+/* FPTOUINT_PPCF128_I32 */ nullptr,
+/* FPTOUINT_PPCF128_I64 */ nullptr,
+/* FPTOUINT_PPCF128_I128 */ nullptr,
+/* SINTTOFP_I32_F32 */ "__floatsisf",
+/* SINTTOFP_I32_F64 */ "__floatsidf",
+/* SINTTOFP_I32_F80 */ nullptr,
+/* SINTTOFP_I32_F128 */ "__floatsitf",
+/* SINTTOFP_I32_PPCF128 */ nullptr,
+/* SINTTOFP_I64_F32 */ "__floatdisf",
+/* SINTTOFP_I64_F64 */ "__floatdidf",
+/* SINTTOFP_I64_F80 */ nullptr,
+/* SINTTOFP_I64_F128 */ "__floatditf",
+/* SINTTOFP_I64_PPCF128 */ nullptr,
+/* SINTTOFP_I128_F32 */ "__floattisf",
+/* SINTTOFP_I128_F64 */ "__floattidf",
+/* SINTTOFP_I128_F80 */ nullptr,
+/* SINTTOFP_I128_F128 */ "__floattitf",
+/* SINTTOFP_I128_PPCF128 */ nullptr,
+/* UINTTOFP_I32_F32 */ "__floatunsisf",
+/* UINTTOFP_I32_F64 */ "__floatunsidf",
+/* UINTTOFP_I32_F80 */ nullptr,
+/* UINTTOFP_I32_F128 */ "__floatunsitf",
+/* UINTTOFP_I32_PPCF128 */ nullptr,
+/* UINTTOFP_I64_F32 */ "__floatundisf",
+/* UINTTOFP_I64_F64 */ "__floatundidf",
+/* UINTTOFP_I64_F80 */ nullptr,
+/* UINTTOFP_I64_F128 */ "__floatunditf",
+/* UINTTOFP_I64_PPCF128 */ nullptr,
+/* UINTTOFP_I128_F32 */ "__floatuntisf",
+/* UINTTOFP_I128_F64 */ "__floatuntidf",
+/* UINTTOFP_I128_F80 */ nullptr,
+/* UINTTOFP_I128_F128 */ "__floatuntitf",
+/* UINTTOFP_I128_PPCF128 */ nullptr,
+/* OEQ_F32 */ "__eqsf2",
+/* OEQ_F64 */ "__eqdf2",
+/* OEQ_F128 */ "__eqtf2",
+/* OEQ_PPCF128 */ nullptr,
+/* UNE_F32 */ "__nesf2",
+/* UNE_F64 */ "__nedf2",
+/* UNE_F128 */ "__netf2",
+/* UNE_PPCF128 */ nullptr,
+/* OGE_F32 */ "__gesf2",
+/* OGE_F64 */ "__gedf2",
+/* OGE_F128 */ "__getf2",
+/* OGE_PPCF128 */ nullptr,
+/* OLT_F32 */ "__ltsf2",
+/* OLT_F64 */ "__ltdf2",
+/* OLT_F128 */ "__lttf2",
+/* OLT_PPCF128 */ nullptr,
+/* OLE_F32 */ "__lesf2",
+/* OLE_F64 */ "__ledf2",
+/* OLE_F128 */ "__letf2",
+/* OLE_PPCF128 */ nullptr,
+/* OGT_F32 */ "__gtsf2",
+/* OGT_F64 */ "__gtdf2",
+/* OGT_F128 */ "__gttf2",
+/* OGT_PPCF128 */ nullptr,
+/* UO_F32 */ "__unordsf2",
+/* UO_F64 */ "__unorddf2",
+/* UO_F128 */ "__unordtf2",
+/* UO_PPCF128 */ nullptr,
+/* O_F32 */ "__unordsf2",
+/* O_F64 */ "__unorddf2",
+/* O_F128 */ "__unordtf2",
+/* O_PPCF128 */ nullptr,
+/* MEMCPY */ "memcpy",
+/* MEMSET */ "memset",
+/* MEMMOVE */ "memmove",
+/* MEMCPY_ELEMENT_ATOMIC_1 */ "MEMCPY_ELEMENT_ATOMIC_1",
+/* MEMCPY_ELEMENT_ATOMIC_2 */ "MEMCPY_ELEMENT_ATOMIC_2",
+/* MEMCPY_ELEMENT_ATOMIC_4 */ "MEMCPY_ELEMENT_ATOMIC_4",
+/* MEMCPY_ELEMENT_ATOMIC_8 */ "MEMCPY_ELEMENT_ATOMIC_8",
+/* MEMCPY_ELEMENT_ATOMIC_16 */ "MEMCPY_ELEMENT_ATOMIC_16",
+/* UNWIND_RESUME */ "_Unwind_Resume",
+/* SYNC_VAL_COMPARE_AND_SWAP_1 */ "__sync_val_compare_and_swap_1",
+/* SYNC_VAL_COMPARE_AND_SWAP_2 */ "__sync_val_compare_and_swap_2",
+/* SYNC_VAL_COMPARE_AND_SWAP_4 */ "__sync_val_compare_and_swap_4",
+/* SYNC_VAL_COMPARE_AND_SWAP_8 */ "__sync_val_compare_and_swap_8",
+/* SYNC_VAL_COMPARE_AND_SWAP_16 */ "__sync_val_compare_and_swap_16",
+/* SYNC_LOCK_TEST_AND_SET_1 */ "__sync_lock_test_and_set_1",
+/* SYNC_LOCK_TEST_AND_SET_2 */ "__sync_lock_test_and_set_2",
+/* SYNC_LOCK_TEST_AND_SET_4 */ "__sync_lock_test_and_set_4",
+/* SYNC_LOCK_TEST_AND_SET_8 */ "__sync_lock_test_and_set_8",
+/* SYNC_LOCK_TEST_AND_SET_16 */ "__sync_lock_test_and_set_16",
+/* SYNC_FETCH_AND_ADD_1 */ "__sync_fetch_and_add_1",
+/* SYNC_FETCH_AND_ADD_2 */ "__sync_fetch_and_add_2",
+/* SYNC_FETCH_AND_ADD_4 */ "__sync_fetch_and_add_4",
+/* SYNC_FETCH_AND_ADD_8 */ "__sync_fetch_and_add_8",
+/* SYNC_FETCH_AND_ADD_16 */ "__sync_fetch_and_add_16",
+/* SYNC_FETCH_AND_SUB_1 */ "__sync_fetch_and_sub_1",
+/* SYNC_FETCH_AND_SUB_2 */ "__sync_fetch_and_sub_2",
+/* SYNC_FETCH_AND_SUB_4 */ "__sync_fetch_and_sub_4",
+/* SYNC_FETCH_AND_SUB_8 */ "__sync_fetch_and_sub_8",
+/* SYNC_FETCH_AND_SUB_16 */ "__sync_fetch_and_sub_16",
+/* SYNC_FETCH_AND_AND_1 */ "__sync_fetch_and_and_1",
+/* SYNC_FETCH_AND_AND_2 */ "__sync_fetch_and_and_2",
+/* SYNC_FETCH_AND_AND_4 */ "__sync_fetch_and_and_4",
+/* SYNC_FETCH_AND_AND_8 */ "__sync_fetch_and_and_8",
+/* SYNC_FETCH_AND_AND_16 */ "__sync_fetch_and_and_16",
+/* SYNC_FETCH_AND_OR_1 */ "__sync_fetch_and_or_1",
+/* SYNC_FETCH_AND_OR_2 */ "__sync_fetch_and_or_2",
+/* SYNC_FETCH_AND_OR_4 */ "__sync_fetch_and_or_4",
+/* SYNC_FETCH_AND_OR_8 */ "__sync_fetch_and_or_8",
+/* SYNC_FETCH_AND_OR_16 */ "__sync_fetch_and_or_16",
+/* SYNC_FETCH_AND_XOR_1 */ "__sync_fetch_and_xor_1",
+/* SYNC_FETCH_AND_XOR_2 */ "__sync_fetch_and_xor_2",
+/* SYNC_FETCH_AND_XOR_4 */ "__sync_fetch_and_xor_4",
+/* SYNC_FETCH_AND_XOR_8 */ "__sync_fetch_and_xor_8",
+/* SYNC_FETCH_AND_XOR_16 */ "__sync_fetch_and_xor_16",
+/* SYNC_FETCH_AND_NAND_1 */ "__sync_fetch_and_nand_1",
+/* SYNC_FETCH_AND_NAND_2 */ "__sync_fetch_and_nand_2",
+/* SYNC_FETCH_AND_NAND_4 */ "__sync_fetch_and_nand_4",
+/* SYNC_FETCH_AND_NAND_8 */ "__sync_fetch_and_nand_8",
+/* SYNC_FETCH_AND_NAND_16 */ "__sync_fetch_and_nand_16",
+/* SYNC_FETCH_AND_MAX_1 */ "__sync_fetch_and_max_1",
+/* SYNC_FETCH_AND_MAX_2 */ "__sync_fetch_and_max_2",
+/* SYNC_FETCH_AND_MAX_4 */ "__sync_fetch_and_max_4",
+/* SYNC_FETCH_AND_MAX_8 */ "__sync_fetch_and_max_8",
+/* SYNC_FETCH_AND_MAX_16 */ "__sync_fetch_and_max_16",
+/* SYNC_FETCH_AND_UMAX_1 */ "__sync_fetch_and_umax_1",
+/* SYNC_FETCH_AND_UMAX_2 */ "__sync_fetch_and_umax_2",
+/* SYNC_FETCH_AND_UMAX_4 */ "__sync_fetch_and_umax_4",
+/* SYNC_FETCH_AND_UMAX_8 */ "__sync_fetch_and_umax_8",
+/* SYNC_FETCH_AND_UMAX_16 */ "__sync_fetch_and_umax_16",
+/* SYNC_FETCH_AND_MIN_1 */ "__sync_fetch_and_min_1",
+/* SYNC_FETCH_AND_MIN_2 */ "__sync_fetch_and_min_2",
+/* SYNC_FETCH_AND_MIN_4 */ "__sync_fetch_and_min_4",
+/* SYNC_FETCH_AND_MIN_8 */ "__sync_fetch_and_min_8",
+/* SYNC_FETCH_AND_MIN_16 */ "__sync_fetch_and_min_16",
+/* SYNC_FETCH_AND_UMIN_1 */ "__sync_fetch_and_umin_1",
+/* SYNC_FETCH_AND_UMIN_2 */ "__sync_fetch_and_umin_2",
+/* SYNC_FETCH_AND_UMIN_4 */ "__sync_fetch_and_umin_4",
+/* SYNC_FETCH_AND_UMIN_8 */ "__sync_fetch_and_umin_8",
+/* SYNC_FETCH_AND_UMIN_16 */ "__sync_fetch_and_umin_16",
+
+/* ATOMIC_LOAD */ "__atomic_load",
+/* ATOMIC_LOAD_1 */ "__atomic_load_1",
+/* ATOMIC_LOAD_2 */ "__atomic_load_2",
+/* ATOMIC_LOAD_4 */ "__atomic_load_4",
+/* ATOMIC_LOAD_8 */ "__atomic_load_8",
+/* ATOMIC_LOAD_16 */ "__atomic_load_16",
+
+/* ATOMIC_STORE */ "__atomic_store",
+/* ATOMIC_STORE_1 */ "__atomic_store_1",
+/* ATOMIC_STORE_2 */ "__atomic_store_2",
+/* ATOMIC_STORE_4 */ "__atomic_store_4",
+/* ATOMIC_STORE_8 */ "__atomic_store_8",
+/* ATOMIC_STORE_16 */ "__atomic_store_16",
+
+/* ATOMIC_EXCHANGE */ "__atomic_exchange",
+/* ATOMIC_EXCHANGE_1 */ "__atomic_exchange_1",
+/* ATOMIC_EXCHANGE_2 */ "__atomic_exchange_2",
+/* ATOMIC_EXCHANGE_4 */ "__atomic_exchange_4",
+/* ATOMIC_EXCHANGE_8 */ "__atomic_exchange_8",
+/* ATOMIC_EXCHANGE_16 */ "__atomic_exchange_16",
+
+/* ATOMIC_COMPARE_EXCHANGE */ "__atomic_compare_exchange",
+/* ATOMIC_COMPARE_EXCHANGE_1 */ "__atomic_compare_exchange_1",
+/* ATOMIC_COMPARE_EXCHANGE_2 */ "__atomic_compare_exchange_2",
+/* ATOMIC_COMPARE_EXCHANGE_4 */ "__atomic_compare_exchange_4",
+/* ATOMIC_COMPARE_EXCHANGE_8 */ "__atomic_compare_exchange_8",
+/* ATOMIC_COMPARE_EXCHANGE_16 */ "__atomic_compare_exchange_16",
+
+/* ATOMIC_FETCH_ADD_1 */ "__atomic_fetch_add_1",
+/* ATOMIC_FETCH_ADD_2 */ "__atomic_fetch_add_2",
+/* ATOMIC_FETCH_ADD_4 */ "__atomic_fetch_add_4",
+/* ATOMIC_FETCH_ADD_8 */ "__atomic_fetch_add_8",
+/* ATOMIC_FETCH_ADD_16 */ "__atomic_fetch_add_16",
+/* ATOMIC_FETCH_SUB_1 */ "__atomic_fetch_sub_1",
+/* ATOMIC_FETCH_SUB_2 */ "__atomic_fetch_sub_2",
+/* ATOMIC_FETCH_SUB_4 */ "__atomic_fetch_sub_4",
+/* ATOMIC_FETCH_SUB_8 */ "__atomic_fetch_sub_8",
+/* ATOMIC_FETCH_SUB_16 */ "__atomic_fetch_sub_16",
+/* ATOMIC_FETCH_AND_1 */ "__atomic_fetch_and_1",
+/* ATOMIC_FETCH_AND_2 */ "__atomic_fetch_and_2",
+/* ATOMIC_FETCH_AND_4 */ "__atomic_fetch_and_4",
+/* ATOMIC_FETCH_AND_8 */ "__atomic_fetch_and_8",
+/* ATOMIC_FETCH_AND_16 */ "__atomic_fetch_and_16",
+/* ATOMIC_FETCH_OR_1 */ "__atomic_fetch_or_1",
+/* ATOMIC_FETCH_OR_2 */ "__atomic_fetch_or_2",
+/* ATOMIC_FETCH_OR_4 */ "__atomic_fetch_or_4",
+/* ATOMIC_FETCH_OR_8 */ "__atomic_fetch_or_8",
+/* ATOMIC_FETCH_OR_16 */ "__atomic_fetch_or_16",
+/* ATOMIC_FETCH_XOR_1 */ "__atomic_fetch_xor_1",
+/* ATOMIC_FETCH_XOR_2 */ "__atomic_fetch_xor_2",
+/* ATOMIC_FETCH_XOR_4 */ "__atomic_fetch_xor_4",
+/* ATOMIC_FETCH_XOR_8 */ "__atomic_fetch_xor_8",
+/* ATOMIC_FETCH_XOR_16 */ "__atomic_fetch_xor_16",
+/* ATOMIC_FETCH_NAND_1 */ "__atomic_fetch_nand_1",
+/* ATOMIC_FETCH_NAND_2 */ "__atomic_fetch_nand_2",
+/* ATOMIC_FETCH_NAND_4 */ "__atomic_fetch_nand_4",
+/* ATOMIC_FETCH_NAND_8 */ "__atomic_fetch_nand_8",
+/* ATOMIC_FETCH_NAND_16 */ "__atomic_fetch_nand_16",
+
+/* STACKPROTECTOR_CHECK_FAIL */ "__stack_chk_fail",
+
+/* DEOPTIMIZE */ "__llvm_deoptimize",
+};
+
+void llvm::GetSignature(const WebAssemblySubtarget &Subtarget,
+                        RTLIB::Libcall LC, SmallVectorImpl<wasm::ValType> &Rets,
+                        SmallVectorImpl<wasm::ValType> &Params) {
+  assert(Rets.empty());
+  assert(Params.empty());
+
+  WebAssembly::ExprType iPTR = Subtarget.hasAddr64() ?
+                               WebAssembly::ExprType::I64 :
+                               WebAssembly::ExprType::I32;
+
+  switch (RuntimeLibcallSignatures[LC]) {
+  case func:
+    break;
+  case f32_func_f32:
+    Rets.push_back(wasm::ValType::F32);
+    Params.push_back(wasm::ValType::F32);
+    break;
+  case f32_func_f64:
+    Rets.push_back(wasm::ValType::F32);
+    Params.push_back(wasm::ValType::F64);
+    break;
+  case f32_func_i32:
+    Rets.push_back(wasm::ValType::F32);
+    Params.push_back(wasm::ValType::I32);
+    break;
+  case f32_func_i64:
+    Rets.push_back(wasm::ValType::F32);
+    Params.push_back(wasm::ValType::I64);
+    break;
+  case f32_func_i16:
+    Rets.push_back(wasm::ValType::F32);
+    Params.push_back(wasm::ValType::I32);
+    break;
+  case f64_func_f32:
+    Rets.push_back(wasm::ValType::F64);
+    Params.push_back(wasm::ValType::F32);
+    break;
+  case f64_func_f64:
+    Rets.push_back(wasm::ValType::F64);
+    Params.push_back(wasm::ValType::F64);
+    break;
+  case f64_func_i32:
+    Rets.push_back(wasm::ValType::F64);
+    Params.push_back(wasm::ValType::I32);
+    break;
+  case f64_func_i64:
+    Rets.push_back(wasm::ValType::F64);
+    Params.push_back(wasm::ValType::I64);
+    break;
+  case i32_func_f32:
+    Rets.push_back(wasm::ValType::I32);
+    Params.push_back(wasm::ValType::F32);
+    break;
+  case i32_func_f64:
+    Rets.push_back(wasm::ValType::I32);
+    Params.push_back(wasm::ValType::F64);
+    break;
+  case i32_func_i32:
+    Rets.push_back(wasm::ValType::I32);
+    Params.push_back(wasm::ValType::I32);
+    break;
+  case i64_func_f32:
+    Rets.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::F32);
+    break;
+  case i64_func_f64:
+    Rets.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::F64);
+    break;
+  case i64_func_i64:
+    Rets.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    break;
+  case f32_func_f32_f32:
+    Rets.push_back(wasm::ValType::F32);
+    Params.push_back(wasm::ValType::F32);
+    Params.push_back(wasm::ValType::F32);
+    break;
+  case f32_func_f32_i32:
+    Rets.push_back(wasm::ValType::F32);
+    Params.push_back(wasm::ValType::F32);
+    Params.push_back(wasm::ValType::I32);
+    break;
+  case f32_func_i64_i64:
+    Rets.push_back(wasm::ValType::F32);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    break;
+  case f64_func_f64_f64:
+    Rets.push_back(wasm::ValType::F64);
+    Params.push_back(wasm::ValType::F64);
+    Params.push_back(wasm::ValType::F64);
+    break;
+  case f64_func_f64_i32:
+    Rets.push_back(wasm::ValType::F64);
+    Params.push_back(wasm::ValType::F64);
+    Params.push_back(wasm::ValType::I32);
+    break;
+  case f64_func_i64_i64:
+    Rets.push_back(wasm::ValType::F64);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    break;
+  case i16_func_f32:
+    Rets.push_back(wasm::ValType::I32);
+    Params.push_back(wasm::ValType::F32);
+    break;
+  case i8_func_i8_i8:
+    Rets.push_back(wasm::ValType::I32);
+    Params.push_back(wasm::ValType::I32);
+    Params.push_back(wasm::ValType::I32);
+    break;
+  case func_f32_iPTR_iPTR:
+    Params.push_back(wasm::ValType::F32);
+    Params.push_back(wasm::ValType(iPTR));
+    Params.push_back(wasm::ValType(iPTR));
+    break;
+  case func_f64_iPTR_iPTR:
+    Params.push_back(wasm::ValType::F64);
+    Params.push_back(wasm::ValType(iPTR));
+    Params.push_back(wasm::ValType(iPTR));
+    break;
+  case i16_func_i16_i16:
+    Rets.push_back(wasm::ValType::I32);
+    Params.push_back(wasm::ValType::I32);
+    Params.push_back(wasm::ValType::I32);
+    break;
+  case i32_func_f32_f32:
+    Rets.push_back(wasm::ValType::I32);
+    Params.push_back(wasm::ValType::F32);
+    Params.push_back(wasm::ValType::F32);
+    break;
+  case i32_func_f64_f64:
+    Rets.push_back(wasm::ValType::I32);
+    Params.push_back(wasm::ValType::F64);
+    Params.push_back(wasm::ValType::F64);
+    break;
+  case i32_func_i32_i32:
+    Rets.push_back(wasm::ValType::I32);
+    Params.push_back(wasm::ValType::I32);
+    Params.push_back(wasm::ValType::I32);
+    break;
+  case i64_func_i64_i64:
+    Rets.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    break;
+  case i64_i64_func_f32:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+    Rets.push_back(wasm::ValType::I64);
+    Rets.push_back(wasm::ValType::I64);
+#else
+    Params.push_back(wasm::ValType(iPTR));
+#endif
+    Params.push_back(wasm::ValType::F32);
+    break;
+  case i64_i64_func_f64:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+    Rets.push_back(wasm::ValType::I64);
+    Rets.push_back(wasm::ValType::I64);
+#else
+    Params.push_back(wasm::ValType(iPTR));
+#endif
+    Params.push_back(wasm::ValType::F64);
+    break;
+  case i16_i16_func_i16_i16:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+    Rets.push_back(wasm::ValType::I32);
+    Rets.push_back(wasm::ValType::I32);
+#else
+    Params.push_back(wasm::ValType(iPTR));
+#endif
+    Params.push_back(wasm::ValType::I32);
+    Params.push_back(wasm::ValType::I32);
+    break;
+  case i32_i32_func_i32_i32:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+    Rets.push_back(wasm::ValType::I32);
+    Rets.push_back(wasm::ValType::I32);
+#else
+    Params.push_back(wasm::ValType(iPTR));
+#endif
+    Params.push_back(wasm::ValType::I32);
+    Params.push_back(wasm::ValType::I32);
+    break;
+  case i64_i64_func_i64_i64:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+    Rets.push_back(wasm::ValType::I64);
+    Rets.push_back(wasm::ValType::I64);
+#else
+    Params.push_back(wasm::ValType(iPTR));
+#endif
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    break;
+  case i64_i64_func_i64_i64_i64_i64:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+    Rets.push_back(wasm::ValType::I64);
+    Rets.push_back(wasm::ValType::I64);
+#else
+    Params.push_back(wasm::ValType(iPTR));
+#endif
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    break;
+  case i64_i64_i64_i64_func_i64_i64_i64_i64:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+    Rets.push_back(wasm::ValType::I64);
+    Rets.push_back(wasm::ValType::I64);
+    Rets.push_back(wasm::ValType::I64);
+    Rets.push_back(wasm::ValType::I64);
+#else
+    Params.push_back(wasm::ValType(iPTR));
+#endif
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    break;
+  case i64_i64_func_i64_i64_i32:
+#if 0 // TODO: Enable this when wasm gets multiple-return-value support.
+    // Two i64 results, as the i64_i64_func_ prefix and sibling cases show.
+    Rets.push_back(wasm::ValType::I64);
+    Rets.push_back(wasm::ValType::I64);
+#else
+    // Extra leading iPTR parameter (presumably the result out-pointer).
+    Params.push_back(wasm::ValType(iPTR));
+#endif
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I32);
+    break;
+  case iPTR_func_iPTR_i32_iPTR:
+    Rets.push_back(wasm::ValType(iPTR));
+    Params.push_back(wasm::ValType(iPTR));
+    Params.push_back(wasm::ValType::I32);
+    Params.push_back(wasm::ValType(iPTR));
+    break;
+  case iPTR_func_iPTR_iPTR_iPTR:
+    Rets.push_back(wasm::ValType(iPTR));
+    Params.push_back(wasm::ValType(iPTR));
+    Params.push_back(wasm::ValType(iPTR));
+    Params.push_back(wasm::ValType(iPTR));
+    break;
+  case f32_func_f32_f32_f32:
+    Rets.push_back(wasm::ValType::F32);
+    Params.push_back(wasm::ValType::F32);
+    Params.push_back(wasm::ValType::F32);
+    Params.push_back(wasm::ValType::F32);
+    break;
+  case f64_func_f64_f64_f64:
+    Rets.push_back(wasm::ValType::F64);
+    Params.push_back(wasm::ValType::F64);
+    Params.push_back(wasm::ValType::F64);
+    Params.push_back(wasm::ValType::F64);
+    break;
+  case func_i64_i64_iPTR_iPTR:
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType(iPTR));
+    Params.push_back(wasm::ValType(iPTR));
+    break;
+  case func_iPTR_f32:
+    Params.push_back(wasm::ValType(iPTR));
+    Params.push_back(wasm::ValType::F32);
+    break;
+  case func_iPTR_f64:
+    Params.push_back(wasm::ValType(iPTR));
+    Params.push_back(wasm::ValType::F64);
+    break;
+  case func_iPTR_i32:
+    Params.push_back(wasm::ValType(iPTR));
+    Params.push_back(wasm::ValType::I32);
+    break;
+  case func_iPTR_i64:
+    Params.push_back(wasm::ValType(iPTR));
+    Params.push_back(wasm::ValType::I64);
+    break;
+  case func_iPTR_i64_i64:
+    Params.push_back(wasm::ValType(iPTR));
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    break;
+  case func_iPTR_i64_i64_i64_i64:
+    Params.push_back(wasm::ValType(iPTR));
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    break;
+  case func_iPTR_i64_i64_i64_i64_i64_i64:
+    Params.push_back(wasm::ValType(iPTR));
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    break;
+  case i32_func_i64_i64:
+    Rets.push_back(wasm::ValType::I32);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    break;
+  case i32_func_i64_i64_i64_i64:
+    Rets.push_back(wasm::ValType::I32);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    Params.push_back(wasm::ValType::I64);
+    break;
+  case unsupported:
+    llvm_unreachable("unsupported runtime library signature");
+  }
+}
+
+void llvm::GetSignature(const WebAssemblySubtarget &Subtarget, const char *Name,
+                        SmallVectorImpl<wasm::ValType> &Rets,
+                        SmallVectorImpl<wasm::ValType> &Params) {
+  assert(strcmp(RuntimeLibcallNames[RTLIB::DEOPTIMIZE], "__llvm_deoptimize") ==
+         0);
+
+  for (size_t i = 0, e = RTLIB::UNKNOWN_LIBCALL; i < e; ++i)
+    if (RuntimeLibcallNames[i] && strcmp(RuntimeLibcallNames[i], Name) == 0)
+      return GetSignature(Subtarget, RTLIB::Libcall(i), Rets, Params);
+
+  llvm_unreachable("unexpected runtime library name");
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
new file mode 100644
index 0000000000000000000000000000000000000000..129067604784f813ac88438dcff3deb74e57611e
--- /dev/null
+++ b/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.h
@@ -0,0 +1,37 @@
+// CodeGen/RuntimeLibcallSignatures.h - R.T. Lib. Call Signatures -*- C++ -*--//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief This file provides signature information for runtime libcalls.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_WEBASSEMBLY_RUNTIME_LIBCALL_SIGNATURES_H
+#define LLVM_LIB_TARGET_WEBASSEMBLY_RUNTIME_LIBCALL_SIGNATURES_H
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/RuntimeLibcalls.h"
+
+namespace llvm {
+
+class WebAssemblySubtarget;
+
+extern void GetSignature(const WebAssemblySubtarget &Subtarget,
+                         RTLIB::Libcall LC,
+                         SmallVectorImpl<wasm::ValType> &Rets,
+                         SmallVectorImpl<wasm::ValType> &Params);
+
+extern void GetSignature(const WebAssemblySubtarget &Subtarget,
+                         const char *Name, SmallVectorImpl<wasm::ValType> &Rets,
+                         SmallVectorImpl<wasm::ValType> &Params);
+
+} // end namespace llvm
+
+#endif
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index f5ef35a2ad40de5650c23128ea57e60ba8559e5c..44c794ef5da1976c6ac0b319489bf9ab80ee38eb 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -74,13 +74,25 @@ WebAssemblyTargetMachine::WebAssemblyTargetMachine(
                                          : "e-m:e-p:32:32-i64:64-n32:64-S128",
                         TT, CPU, FS, Options, getEffectiveRelocModel(RM),
                         CM, OL),
-      TLOF(make_unique<WebAssemblyTargetObjectFile>()) {
+      TLOF(TT.isOSBinFormatELF() ?
+              static_cast<TargetLoweringObjectFile*>(
+                  new WebAssemblyTargetObjectFileELF()) :
+              static_cast<TargetLoweringObjectFile*>(
+                  new WebAssemblyTargetObjectFile())) {
   // WebAssembly type-checks instructions, but a noreturn function with a return
   // type that doesn't match the context will cause a check failure. So we lower
   // LLVM 'unreachable' to ISD::TRAP and then lower that to WebAssembly's
   // 'unreachable' instructions which is meant for that case.
   this->Options.TrapUnreachable = true;
 
+  // WebAssembly treats each function as an independent unit. Force
+  // -ffunction-sections, effectively, so that we can emit them independently.
+  if (!TT.isOSBinFormatELF()) {
+    this->Options.FunctionSections = true;
+    this->Options.DataSections = true;
+    this->Options.UniqueSectionNames = true;
+  }
+
   initAsmInfo();
 
   // Note that we don't use setRequiresStructuredCFG(true). It disables
@@ -260,13 +272,19 @@ void WebAssemblyPassConfig::addPreEmitPass() {
     addPass(createWebAssemblyRegColoring());
   }
 
+  // Eliminate multiple-entry loops. Do this before inserting explicit get_local
+  // and set_local operators because we create a new variable that we want
+  // converted into a local.
+  addPass(createWebAssemblyFixIrreducibleControlFlow());
+
   // Insert explicit get_local and set_local operators.
   addPass(createWebAssemblyExplicitLocals());
 
-  // Eliminate multiple-entry loops.
-  addPass(createWebAssemblyFixIrreducibleControlFlow());
+  // Sort the blocks of the CFG into topological order, a prerequisite for
+  // BLOCK and LOOP markers.
+  addPass(createWebAssemblyCFGSort());
 
-  // Put the CFG in structured form; insert BLOCK and LOOP markers.
+  // Insert BLOCK and LOOP markers.
   addPass(createWebAssemblyCFGStackify());
 
   // Lower br_unless into br_if.
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
index 74e33b93e00d951aef3cfc996742a5cf629d5019..b1fd108bc249439eeb46e9b55a92ce2199115d52 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.cpp
@@ -17,8 +17,14 @@
 #include "WebAssemblyTargetMachine.h"
 using namespace llvm;
 
-void WebAssemblyTargetObjectFile::Initialize(MCContext &Ctx,
-                                             const TargetMachine &TM) {
+void WebAssemblyTargetObjectFileELF::Initialize(MCContext &Ctx,
+                                                const TargetMachine &TM) {
   TargetLoweringObjectFileELF::Initialize(Ctx, TM);
   InitializeELF(TM.Options.UseInitArray);
 }
+
+void WebAssemblyTargetObjectFile::Initialize(MCContext &Ctx,
+                                             const TargetMachine &TM) {
+  TargetLoweringObjectFileWasm::Initialize(Ctx, TM);
+  InitializeWasm();
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
index 39e50c9c575d8b46621c48d69df5e9b511259e42..ace87c9e442fc59ac4565b3049736dc57812539c 100644
--- a/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
+++ b/lib/Target/WebAssembly/WebAssemblyTargetObjectFile.h
@@ -20,7 +20,13 @@
 
 namespace llvm {
 
-class WebAssemblyTargetObjectFile final : public TargetLoweringObjectFileELF {
+class WebAssemblyTargetObjectFileELF final
+    : public TargetLoweringObjectFileELF {
+public:
+  void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+};
+
+class WebAssemblyTargetObjectFile final : public TargetLoweringObjectFileWasm {
 public:
   void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
 };
diff --git a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index a0049c147d2c08e9646aa2290717656e47578342..e32772d491cf717e7d429d46fcafd47e0f4b7560 100644
--- a/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -15,6 +15,7 @@
 #include "WebAssemblyUtilities.h"
 #include "WebAssemblyMachineFunctionInfo.h"
 #include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
 using namespace llvm;
 
 bool WebAssembly::isArgument(const MachineInstr &MI) {
@@ -69,3 +70,28 @@ bool WebAssembly::isChild(const MachineInstr &MI,
   return TargetRegisterInfo::isVirtualRegister(Reg) &&
          MFI.isVRegStackified(Reg);
 }
+
+bool WebAssembly::isCallIndirect(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  case WebAssembly::CALL_INDIRECT_VOID:
+  case WebAssembly::CALL_INDIRECT_I32:
+  case WebAssembly::CALL_INDIRECT_I64:
+  case WebAssembly::CALL_INDIRECT_F32:
+  case WebAssembly::CALL_INDIRECT_F64:
+  case WebAssembly::CALL_INDIRECT_v16i8:
+  case WebAssembly::CALL_INDIRECT_v8i16:
+  case WebAssembly::CALL_INDIRECT_v4i32:
+  case WebAssembly::CALL_INDIRECT_v4f32:
+    return true;
+  default:
+    return false;
+  }
+}
+
+MachineBasicBlock *llvm::LoopBottom(const MachineLoop *Loop) {
+  MachineBasicBlock *Bottom = Loop->getHeader();
+  for (MachineBasicBlock *MBB : Loop->blocks())
+    if (MBB->getNumber() > Bottom->getNumber())
+      Bottom = MBB;
+  return Bottom;
+}
diff --git a/lib/Target/WebAssembly/WebAssemblyUtilities.h b/lib/Target/WebAssembly/WebAssemblyUtilities.h
index eb114403d14e8568be13e7758ad893cc6c3838a4..595491f1bf5b13938999d4bf4e1ee726ab4beb04 100644
--- a/lib/Target/WebAssembly/WebAssemblyUtilities.h
+++ b/lib/Target/WebAssembly/WebAssemblyUtilities.h
@@ -18,7 +18,9 @@
 
 namespace llvm {
 
+class MachineBasicBlock;
 class MachineInstr;
+class MachineLoop;
 class WebAssemblyFunctionInfo;
 
 namespace WebAssembly {
@@ -27,8 +29,15 @@ bool isArgument(const MachineInstr &MI);
 bool isCopy(const MachineInstr &MI);
 bool isTee(const MachineInstr &MI);
 bool isChild(const MachineInstr &MI, const WebAssemblyFunctionInfo &MFI);
+bool isCallIndirect(const MachineInstr &MI);
 
 } // end namespace WebAssembly
+
+/// Return the "bottom" block of a loop. This differs from
+/// MachineLoop::getBottomBlock in that it works even if the loop is
+/// discontiguous.
+MachineBasicBlock *LoopBottom(const MachineLoop *Loop);
+
 } // end namespace llvm
 
 #endif
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
index c38a7d1dd44df1d23f92f029c457de567d11cae2..788fac62626b71339472df5fe3989cb44c3d8bf4 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.cpp
@@ -1,4 +1,4 @@
-//===-- X86AsmInstrumentation.cpp - Instrument X86 inline assembly C++ -*-===//
+//===-- X86AsmInstrumentation.cpp - Instrument X86 inline assembly --------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -7,24 +7,31 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "MCTargetDesc/X86MCTargetDesc.h"
 #include "X86AsmInstrumentation.h"
-#include "MCTargetDesc/X86BaseInfo.h"
 #include "X86Operand.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/ADT/Triple.h"
-#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCInstBuilder.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
 #include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCStreamer.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCTargetOptions.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SMLoc.h"
 #include <algorithm>
 #include <cassert>
+#include <cstdint>
+#include <limits>
+#include <memory>
 #include <vector>
 
 // Following comment describes how assembly instrumentation works.
@@ -91,30 +98,35 @@
 //   register as a frame register and temprorary override current CFA
 //   register.
 
-namespace llvm {
-namespace {
+using namespace llvm;
 
 static cl::opt<bool> ClAsanInstrumentAssembly(
     "asan-instrument-assembly",
     cl::desc("instrument assembly with AddressSanitizer checks"), cl::Hidden,
     cl::init(false));
 
-const int64_t MinAllowedDisplacement = std::numeric_limits<int32_t>::min();
-const int64_t MaxAllowedDisplacement = std::numeric_limits<int32_t>::max();
+static const int64_t MinAllowedDisplacement =
+    std::numeric_limits<int32_t>::min();
+static const int64_t MaxAllowedDisplacement =
+    std::numeric_limits<int32_t>::max();
 
-int64_t ApplyDisplacementBounds(int64_t Displacement) {
+static int64_t ApplyDisplacementBounds(int64_t Displacement) {
   return std::max(std::min(MaxAllowedDisplacement, Displacement),
                   MinAllowedDisplacement);
 }
 
-void CheckDisplacementBounds(int64_t Displacement) {
+static void CheckDisplacementBounds(int64_t Displacement) {
   assert(Displacement >= MinAllowedDisplacement &&
          Displacement <= MaxAllowedDisplacement);
 }
 
-bool IsStackReg(unsigned Reg) { return Reg == X86::RSP || Reg == X86::ESP; }
+static bool IsStackReg(unsigned Reg) {
+  return Reg == X86::RSP || Reg == X86::ESP;
+}
 
-bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; }
+static bool IsSmallMemAccess(unsigned AccessSize) { return AccessSize < 8; }
+
+namespace {
 
 class X86AddressSanitizer : public X86AsmInstrumentation {
 public:
@@ -178,7 +190,7 @@ public:
   X86AddressSanitizer(const MCSubtargetInfo *&STI)
       : X86AsmInstrumentation(STI), RepPrefix(false), OrigSPOffset(0) {}
 
-  ~X86AddressSanitizer() override {}
+  ~X86AddressSanitizer() override = default;
 
   // X86AsmInstrumentation implementation:
   void InstrumentAndEmitInstruction(const MCInst &Inst,
@@ -255,9 +267,11 @@ protected:
   bool is64BitMode() const {
     return STI->getFeatureBits()[X86::Mode64Bit];
   }
+
   bool is32BitMode() const {
     return STI->getFeatureBits()[X86::Mode32Bit];
   }
+
   bool is16BitMode() const {
     return STI->getFeatureBits()[X86::Mode16Bit];
   }
@@ -498,7 +512,7 @@ public:
   X86AddressSanitizer32(const MCSubtargetInfo *&STI)
       : X86AddressSanitizer(STI) {}
 
-  ~X86AddressSanitizer32() override {}
+  ~X86AddressSanitizer32() override = default;
 
   unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
     unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
@@ -604,9 +618,9 @@ private:
     EmitInstruction(
         Out, MCInstBuilder(X86::PUSH32r).addReg(RegCtx.AddressReg(32)));
 
-    MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") +
+    MCSymbol *FnSym = Ctx.getOrCreateSymbol(Twine("__asan_report_") +
                                             (IsWrite ? "store" : "load") +
-                                            llvm::Twine(AccessSize));
+                                            Twine(AccessSize));
     const MCSymbolRefExpr *FnExpr =
         MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
     EmitInstruction(Out, MCInstBuilder(X86::CALLpcrel32).addExpr(FnExpr));
@@ -756,7 +770,7 @@ public:
   X86AddressSanitizer64(const MCSubtargetInfo *&STI)
       : X86AddressSanitizer(STI) {}
 
-  ~X86AddressSanitizer64() override {}
+  ~X86AddressSanitizer64() override = default;
 
   unsigned GetFrameReg(const MCContext &Ctx, MCStreamer &Out) {
     unsigned FrameReg = GetFrameRegGeneric(Ctx, Out);
@@ -875,15 +889,17 @@ private:
       EmitInstruction(Out, MCInstBuilder(X86::MOV64rr).addReg(X86::RDI).addReg(
                                RegCtx.AddressReg(64)));
     }
-    MCSymbol *FnSym = Ctx.getOrCreateSymbol(llvm::Twine("__asan_report_") +
+    MCSymbol *FnSym = Ctx.getOrCreateSymbol(Twine("__asan_report_") +
                                             (IsWrite ? "store" : "load") +
-                                            llvm::Twine(AccessSize));
+                                            Twine(AccessSize));
     const MCSymbolRefExpr *FnExpr =
         MCSymbolRefExpr::create(FnSym, MCSymbolRefExpr::VK_PLT, Ctx);
     EmitInstruction(Out, MCInstBuilder(X86::CALL64pcrel32).addExpr(FnExpr));
   }
 };
 
+} // end anonymous namespace
+
 void X86AddressSanitizer64::InstrumentMemOperandSmall(
     X86Operand &Op, unsigned AccessSize, bool IsWrite,
     const RegisterContext &RegCtx, MCContext &Ctx, MCStreamer &Out) {
@@ -1022,12 +1038,10 @@ void X86AddressSanitizer64::InstrumentMOVSImpl(unsigned AccessSize,
   RestoreFlags(Out);
 }
 
-} // End anonymous namespace
-
 X86AsmInstrumentation::X86AsmInstrumentation(const MCSubtargetInfo *&STI)
-    : STI(STI), InitialFrameReg(0) {}
+    : STI(STI) {}
 
-X86AsmInstrumentation::~X86AsmInstrumentation() {}
+X86AsmInstrumentation::~X86AsmInstrumentation() = default;
 
 void X86AsmInstrumentation::InstrumentAndEmitInstruction(
     const MCInst &Inst, OperandVector &Operands, MCContext &Ctx,
@@ -1060,8 +1074,9 @@ unsigned X86AsmInstrumentation::GetFrameRegGeneric(const MCContext &Ctx,
 }
 
 X86AsmInstrumentation *
-CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
-                            const MCContext &Ctx, const MCSubtargetInfo *&STI) {
+llvm::CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
+                                  const MCContext &Ctx,
+                                  const MCSubtargetInfo *&STI) {
   Triple T(STI->getTargetTriple());
   const bool hasCompilerRTSupport = T.isOSLinux();
   if (ClAsanInstrumentAssembly && hasCompilerRTSupport &&
@@ -1073,5 +1088,3 @@ CreateX86AsmInstrumentation(const MCTargetOptions &MCOptions,
   }
   return new X86AsmInstrumentation(STI);
 }
-
-} // end llvm namespace
diff --git a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
index 470ceadb0aa6bc9b1b59f2e85d4b94e20bb83d4c..97a55cd8ad9836fefa675fc9696538dac5e30841 100644
--- a/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
+++ b/lib/Target/X86/AsmParser/X86AsmInstrumentation.h
@@ -1,4 +1,4 @@
-//===- X86AsmInstrumentation.h - Instrument X86 inline assembly *- C++ -*-===//
+//===- X86AsmInstrumentation.h - Instrument X86 inline assembly -*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,7 +11,6 @@
 #define LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
 
 #include "llvm/ADT/SmallVector.h"
-
 #include <memory>
 
 namespace llvm {
@@ -23,7 +22,6 @@ class MCParsedAsmOperand;
 class MCStreamer;
 class MCSubtargetInfo;
 class MCTargetOptions;
-
 class X86AsmInstrumentation;
 
 X86AsmInstrumentation *
@@ -43,7 +41,7 @@ public:
   // Tries to instrument and emit instruction.
   virtual void InstrumentAndEmitInstruction(
       const MCInst &Inst,
-      SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand> > &Operands,
+      SmallVectorImpl<std::unique_ptr<MCParsedAsmOperand>> &Operands,
       MCContext &Ctx, const MCInstrInfo &MII, MCStreamer &Out);
 
 protected:
@@ -60,9 +58,9 @@ protected:
 
   const MCSubtargetInfo *&STI;
 
-  unsigned InitialFrameReg;
+  unsigned InitialFrameReg = 0;
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_LIB_TARGET_X86_ASMPARSER_X86ASMINSTRUMENTATION_H
diff --git a/lib/Target/X86/AsmParser/X86AsmParser.cpp b/lib/Target/X86/AsmParser/X86AsmParser.cpp
index f416d8956ff6ad8a5a3a82166083732e3bc05269..324da650e74e71fd515c1402080d9508fd573b06 100644
--- a/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -98,6 +98,14 @@ private:
     IC_REGISTER
   };
 
+  enum IntelOperatorKind {
+    IOK_INVALID = 0,
+    IOK_LENGTH,
+    IOK_SIZE,
+    IOK_TYPE,
+    IOK_OFFSET
+  };
+
   class InfixCalculator {
     typedef std::pair< InfixCalculatorTok, int64_t > ICToken;
     SmallVector<InfixCalculatorTok, 4> InfixOperatorStack;
@@ -704,10 +712,12 @@ private:
   std::unique_ptr<X86Operand> ParseIntelOperand();
   std::unique_ptr<X86Operand> ParseIntelOffsetOfOperator();
   bool ParseIntelDotOperator(const MCExpr *Disp, const MCExpr *&NewDisp);
-  std::unique_ptr<X86Operand> ParseIntelOperator(unsigned OpKind);
+  unsigned IdentifyIntelOperator(StringRef Name);
+  unsigned ParseIntelOperator(unsigned OpKind);
   std::unique_ptr<X86Operand>
   ParseIntelSegmentOverride(unsigned SegReg, SMLoc Start, unsigned Size);
   std::unique_ptr<X86Operand> ParseRoundingModeOp(SMLoc Start, SMLoc End);
+  bool ParseIntelNamedOperator(StringRef Name, IntelExprStateMachine &SM);
   bool ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End);
   std::unique_ptr<X86Operand>
   ParseIntelBracExpression(unsigned SegReg, SMLoc Start, int64_t ImmDisp,
@@ -814,6 +824,7 @@ private:
   /// }
 
 public:
+
   X86AsmParser(const MCSubtargetInfo &sti, MCAsmParser &Parser,
                const MCInstrInfo &mii, const MCTargetOptions &Options)
       : MCTargetAsmParser(Options, sti), MII(mii), InstInfo(nullptr),
@@ -1266,10 +1277,12 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites,
     }
   }
   // Remove all the ImmPrefix rewrites within the brackets.
+  // We may have some Imm rewrites as a result of an operator applying,
+  // remove them as well.
   for (AsmRewrite &AR : AsmRewrites) {
     if (AR.Loc.getPointer() < StartInBrac.getPointer())
       continue;
-    if (AR.Kind == AOK_ImmPrefix)
+    if (AR.Kind == AOK_ImmPrefix || AR.Kind == AOK_Imm)
       AR.Kind = AOK_Delete;
   }
   const char *SymLocPtr = SymName.data();
@@ -1286,6 +1299,30 @@ RewriteIntelBracExpression(SmallVectorImpl<AsmRewrite> &AsmRewrites,
   }
 }
 
+// If Name is a named operator (not/or/shl/shr/xor/and, spelled all in lower
+// or all in upper case), invoke its handler on SM and return true.
+bool X86AsmParser::ParseIntelNamedOperator(StringRef Name,
+                                           IntelExprStateMachine &SM) {
+  const std::string Lowered = Name.lower();
+  if (Name != Lowered && Name != Name.upper())
+    return false; // Mixed-case spellings are never treated as operators.
+  if (Lowered == "not")
+    SM.onNot();
+  else if (Lowered == "or")
+    SM.onOr();
+  else if (Lowered == "shl")
+    SM.onLShift();
+  else if (Lowered == "shr")
+    SM.onRShift();
+  else if (Lowered == "xor")
+    SM.onXor();
+  else if (Lowered == "and")
+    SM.onAnd();
+  else
+    return false;
+  return true;
+}
+
 bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
   MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
@@ -1324,31 +1361,36 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
       const MCExpr *Val;
       SMLoc IdentLoc = Tok.getLoc();
       StringRef Identifier = Tok.getString();
+      UpdateLocLex = false;
       if (TK != AsmToken::String && !ParseRegister(TmpReg, IdentLoc, End)) {
         SM.onRegister(TmpReg);
-        UpdateLocLex = false;
-        break;
+      } else if (ParseIntelNamedOperator(Identifier, SM)) {
+        UpdateLocLex = true;
+      } else if (!isParsingInlineAsm()) {
+        if (getParser().parsePrimaryExpr(Val, End))
+          return Error(Tok.getLoc(), "Unexpected identifier!");
+        SM.onIdentifierExpr(Val, Identifier);
+      } else if (unsigned OpKind = IdentifyIntelOperator(Identifier)) {
+        if (OpKind == IOK_OFFSET) // OFFSET is not handled inside expressions.
+          return Error(IdentLoc, "Dealing OFFSET operator as part of "
+            "a compound immediate expression is yet to be supported");
+        int64_t Val = ParseIntelOperator(OpKind);
+        if (!Val)
+          return true;
+        StringRef ErrMsg;
+        if (SM.onInteger(Val, ErrMsg))
+          return Error(IdentLoc, ErrMsg);
+      } else if (Identifier.find('.') != StringRef::npos &&
+            PrevTK == AsmToken::RBrac) {
+          return false;
       } else {
-        if (!isParsingInlineAsm()) {
-          if (getParser().parsePrimaryExpr(Val, End))
-            return Error(Tok.getLoc(), "Unexpected identifier!");
-        } else {
-          // This is a dot operator, not an adjacent identifier.
-          if (Identifier.find('.') != StringRef::npos &&
-              PrevTK == AsmToken::RBrac) {
-            return false;
-          } else {
-            InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
-            if (ParseIntelIdentifier(Val, Identifier, Info,
-                                     /*Unevaluated=*/false, End))
-              return true;
-          }
-        }
+        InlineAsmIdentifierInfo &Info = SM.getIdentifierInfo();
+        if (ParseIntelIdentifier(Val, Identifier, Info,
+                                 /*Unevaluated=*/false, End))
+          return true;
         SM.onIdentifierExpr(Val, Identifier);
-        UpdateLocLex = false;
-        break;
       }
-      return Error(Tok.getLoc(), "Unexpected identifier!");
+      break;
     }
     case AsmToken::Integer: {
       StringRef ErrMsg;
@@ -1715,11 +1757,16 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOffsetOfOperator() {
                                OffsetOfLoc, Identifier, Info.OpDecl);
 }
 
-enum IntelOperatorKind {
-  IOK_LENGTH,
-  IOK_SIZE,
-  IOK_TYPE
-};
+// Classify Name as one of the Intel assembly operators. Only the all-upper and
+// all-lower spellings are recognized; anything else yields IOK_INVALID.
+unsigned X86AsmParser::IdentifyIntelOperator(StringRef Name) {
+  return StringSwitch<unsigned>(Name)
+      .Cases("offset", "OFFSET", IOK_OFFSET)
+      .Cases("length", "LENGTH", IOK_LENGTH)
+      .Cases("size", "SIZE", IOK_SIZE)
+      .Cases("type", "TYPE", IOK_TYPE)
+      .Default(IOK_INVALID);
+}
 
 /// Parse the 'LENGTH', 'TYPE' and 'SIZE' operators.  The LENGTH operator
 /// returns the number of elements in an array.  It returns the value 1 for
@@ -1727,7 +1774,7 @@ enum IntelOperatorKind {
 /// variable.  A variable's size is the product of its LENGTH and TYPE.  The
 /// TYPE operator returns the size of a C or C++ type or variable. If the
 /// variable is an array, TYPE returns the size of a single element.
-std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) {
+unsigned X86AsmParser::ParseIntelOperator(unsigned OpKind) {
   MCAsmParser &Parser = getParser();
   const AsmToken &Tok = Parser.getTok();
   SMLoc TypeLoc = Tok.getLoc();
@@ -1739,11 +1786,13 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) {
   StringRef Identifier = Tok.getString();
   if (ParseIntelIdentifier(Val, Identifier, Info,
                            /*Unevaluated=*/true, End))
-    return nullptr;
-
-  if (!Info.OpDecl)
-    return ErrorOperand(Start, "unable to lookup expression");
+    return 0;
 
+  if (!Info.OpDecl) {
+    Error(Start, "unable to lookup expression");
+    return 0;
+  }
+  
   unsigned CVal = 0;
   switch(OpKind) {
   default: llvm_unreachable("Unexpected operand kind!");
@@ -1757,8 +1806,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperator(unsigned OpKind) {
   unsigned Len = End.getPointer() - TypeLoc.getPointer();
   InstInfo->AsmRewrites->emplace_back(AOK_Imm, TypeLoc, Len, CVal);
 
-  const MCExpr *Imm = MCConstantExpr::create(CVal, getContext());
-  return X86Operand::CreateImm(Imm, Start, End);
+  return CVal;
 }
 
 std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
@@ -1766,18 +1814,12 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
   const AsmToken &Tok = Parser.getTok();
   SMLoc Start, End;
 
-  // Offset, length, type and size operators.
-  if (isParsingInlineAsm()) {
-    StringRef AsmTokStr = Tok.getString();
-    if (AsmTokStr == "offset" || AsmTokStr == "OFFSET")
+  // FIXME: The OFFSET operator should be handled as part of an immediate
+  // expression, like the other operators. Currently it is only supported as a
+  // stand-alone operand.
+  if (isParsingInlineAsm())
+    if (IdentifyIntelOperator(Tok.getString()) == IOK_OFFSET)
       return ParseIntelOffsetOfOperator();
-    if (AsmTokStr == "length" || AsmTokStr == "LENGTH")
-      return ParseIntelOperator(IOK_LENGTH);
-    if (AsmTokStr == "size" || AsmTokStr == "SIZE")
-      return ParseIntelOperator(IOK_SIZE);
-    if (AsmTokStr == "type" || AsmTokStr == "TYPE")
-      return ParseIntelOperator(IOK_TYPE);
-  }
 
   bool PtrInOperand = false;
   unsigned Size = getIntelMemOperandSize(Tok.getString());
diff --git a/lib/Target/X86/AsmParser/X86Operand.h b/lib/Target/X86/AsmParser/X86Operand.h
index 6672cbfe98151f2e22b873623701c2fbbba30db1..9f1fa6c65907044bc918f99315e92400b76c2485 100644
--- a/lib/Target/X86/AsmParser/X86Operand.h
+++ b/lib/Target/X86/AsmParser/X86Operand.h
@@ -1,4 +1,4 @@
-//===-- X86Operand.h - Parsed X86 machine instruction --------------------===//
+//===- X86Operand.h - Parsed X86 machine instruction ------------*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -11,12 +11,17 @@
 #define LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
 
 #include "X86AsmParserCommon.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCParser/MCParsedAsmOperand.h"
-#include "llvm/ADT/STLExtras.h"
-#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/SMLoc.h"
+#include <cassert>
+#include <memory>
 
 namespace llvm {
 
@@ -74,11 +79,14 @@ struct X86Operand : public MCParsedAsmOperand {
 
   /// getStartLoc - Get the location of the first token of this operand.
   SMLoc getStartLoc() const override { return StartLoc; }
+
   /// getEndLoc - Get the location of the last token of this operand.
   SMLoc getEndLoc() const override { return EndLoc; }
+
   /// getLocRange - Get the range between the first and last token of this
   /// operand.
   SMRange getLocRange() const { return SMRange(StartLoc, EndLoc); }
+
   /// getOffsetOfLoc - Get the location of the offset operator.
   SMLoc getOffsetOfLoc() const override { return OffsetOfLoc; }
 
@@ -422,10 +430,12 @@ struct X86Operand : public MCParsedAsmOperand {
       RegNo = getGR32FromGR64(RegNo);
     Inst.addOperand(MCOperand::createReg(RegNo));
   }
+
   void addAVX512RCOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     addExpr(Inst, getImm());
   }
+
   void addImmOperands(MCInst &Inst, unsigned N) const {
     assert(N == 1 && "Invalid number of operands!");
     addExpr(Inst, getImm());
@@ -454,6 +464,7 @@ struct X86Operand : public MCParsedAsmOperand {
     Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
     Inst.addOperand(MCOperand::createReg(getMemSegReg()));
   }
+
   void addDstIdxOperands(MCInst &Inst, unsigned N) const {
     assert((N == 1) && "Invalid number of operands!");
     Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
@@ -544,6 +555,6 @@ struct X86Operand : public MCParsedAsmOperand {
   }
 };
 
-} // End of namespace llvm
+} // end namespace llvm
 
-#endif
+#endif // LLVM_LIB_TARGET_X86_ASMPARSER_X86OPERAND_H
diff --git a/lib/Target/X86/CMakeLists.txt b/lib/Target/X86/CMakeLists.txt
index 9dfd09022bdcdc8e12dd899a6549af8b74be3c7a..fc4adddc149baf0f3133829a65df57a08ec59432 100644
--- a/lib/Target/X86/CMakeLists.txt
+++ b/lib/Target/X86/CMakeLists.txt
@@ -10,11 +10,20 @@ tablegen(LLVM X86GenDAGISel.inc -gen-dag-isel)
 tablegen(LLVM X86GenFastISel.inc -gen-fast-isel)
 tablegen(LLVM X86GenCallingConv.inc -gen-callingconv)
 tablegen(LLVM X86GenSubtargetInfo.inc -gen-subtarget)
+tablegen(LLVM X86GenEVEX2VEXTables.inc -gen-x86-EVEX2VEX-tables)
+if(LLVM_BUILD_GLOBAL_ISEL)
+  tablegen(LLVM X86GenRegisterBank.inc -gen-register-bank)
+  tablegen(LLVM X86GenGlobalISel.inc -gen-global-isel)
+endif()
+
 add_public_tablegen_target(X86CommonTableGen)
 
 # Add GlobalISel files if the build option was enabled.
 set(GLOBAL_ISEL_FILES
   X86CallLowering.cpp
+  X86LegalizerInfo.cpp
+  X86RegisterBankInfo.cpp
+  X86InstructionSelector.cpp
   )
 
 if(LLVM_BUILD_GLOBAL_ISEL)
@@ -43,6 +52,7 @@ set(sources
   X86EvexToVex.cpp
   X86MCInstLower.cpp
   X86MachineFunctionInfo.cpp
+  X86MacroFusion.cpp
   X86OptimizeLEAs.cpp
   X86PadShortFunction.cpp
   X86RegisterInfo.cpp
diff --git a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
index 88fab4b61a92f3248f6a3bba84c574b953074fb6..b7f637e9a8cd7ac4fac8d25d979ee7fbc31858d2 100644
--- a/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
+++ b/lib/Target/X86/Disassembler/X86DisassemblerDecoder.cpp
@@ -650,11 +650,6 @@ static int readPrefixes(struct InternalInstruction* insn) {
       insn->addressSize        = (hasAdSize ? 4 : 8);
       insn->displacementSize   = 4;
       insn->immediateSize      = 4;
-    } else if (insn->rexPrefix) {
-      insn->registerSize       = (hasOpSize ? 2 : 4);
-      insn->addressSize        = (hasAdSize ? 4 : 8);
-      insn->displacementSize   = (hasOpSize ? 2 : 4);
-      insn->immediateSize      = (hasOpSize ? 2 : 4);
     } else {
       insn->registerSize       = (hasOpSize ? 2 : 4);
       insn->addressSize        = (hasAdSize ? 4 : 8);
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
index 76cb5188f0e7358dea9bbb385eaea30fbe7908bd..6aa7003067440c997273d9698686b73a029b1a73 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.cpp
@@ -12,19 +12,22 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "X86ATTInstPrinter.h"
 #include "MCTargetDesc/X86BaseInfo.h"
-#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "X86ATTInstPrinter.h"
 #include "X86InstComments.h"
-#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCInstrInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/Format.h"
-#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cinttypes>
+#include <cstdint>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "asm-printer"
@@ -146,6 +149,7 @@ void X86ATTInstPrinter::printRoundingControl(const MCInst *MI, unsigned Op,
   case 3: O << "{rz-sae}"; break;
   }
 }
+
 /// printPCRelImm - This is used to print an immediate value that ends up
 /// being encoded as a pc-relative value (e.g. for jumps and calls).  These
 /// print slightly differently than normal immediates.  For example, a $ is not
diff --git a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
index bbb30907661076b315145cb7f5c71eb013771e6e..946c1c73f088ac487b072ca152031d5f5bc1500d 100644
--- a/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86ATTInstPrinter.h
@@ -1,4 +1,4 @@
-//==- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax -*- C++ -*-=//
+//=- X86ATTInstPrinter.h - Convert X86 MCInst to assembly syntax --*- C++ -*-=//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -137,6 +137,7 @@ public:
 private:
   bool HasCustomInstComment;
 };
-}
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86ATTINSTPRINTER_H
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
index 4443edb8e342bff041c985943de479761eb3876b..a8c631ae282f9bf6a973d088253bc37bba5a806f 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.cpp
@@ -12,16 +12,18 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "X86IntelInstPrinter.h"
 #include "MCTargetDesc/X86BaseInfo.h"
-#include "MCTargetDesc/X86MCTargetDesc.h"
 #include "X86InstComments.h"
+#include "X86IntelInstPrinter.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCInstrInfo.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FormattedStream.h"
-#include <cctype>
+#include <cassert>
+#include <cstdint>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "asm-printer"
diff --git a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
index 20cd7ffb2e638881d057f6e2b8062d0d80877b20..ace31186a0544153a2a3b617579cf03f49958cdd 100644
--- a/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
+++ b/lib/Target/X86/InstPrinter/X86IntelInstPrinter.h
@@ -157,6 +157,6 @@ public:
   }
 };
 
-}
+} // end namespace llvm
 
-#endif
+#endif // LLVM_LIB_TARGET_X86_INSTPRINTER_X86INTELINSTPRINTER_H
diff --git a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index e83ec9f4045ad94cf78dea7d75f964ec989d6760..a713af6aadb5afb40909f2a203a263d3a3102278 100644
--- a/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -109,7 +109,7 @@ public:
   }
 
   void applyFixup(const MCFixup &Fixup, char *Data, unsigned DataSize,
-                  uint64_t Value, bool IsPCRel) const override {
+                  uint64_t Value, bool IsPCRel, MCContext &Ctx) const override {
     unsigned Size = 1 << getFixupKindLog2Size(Fixup.getKind());
 
     assert(Fixup.getOffset() + Size <= DataSize &&
diff --git a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index aab552547fac42df0d5a658191816ccdbb64c5c9..d8953da4abb2dae86b750a7b5f5c40c9fd1e95ea 100644
--- a/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -212,7 +212,12 @@ namespace X86II {
     /// the offset from beginning of section.
     ///
     /// This is the TLS offset for the COFF/Windows TLS mechanism.
-    MO_SECREL
+    MO_SECREL,
+
+    /// MO_ABS8 - On a symbol operand this indicates that the symbol is known
+    /// to be an absolute symbol in range [0,128), so we can use the @ABS8
+    /// symbol modifier.
+    MO_ABS8,
   };
 
   enum : uint64_t {
diff --git a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index da69da51df1085df93ff28a1f63a329bcae24d5a..0b73df3a2ff8c251dab8a16fbc298942cb9116e7 100644
--- a/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -13,24 +13,28 @@
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCELFObjectWriter.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/Support/ELF.h"
 #include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
 
 using namespace llvm;
 
 namespace {
-  class X86ELFObjectWriter : public MCELFObjectTargetWriter {
-  public:
-    X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine);
 
-    ~X86ELFObjectWriter() override;
+class X86ELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+  X86ELFObjectWriter(bool IsELF64, uint8_t OSABI, uint16_t EMachine);
+  ~X86ELFObjectWriter() override = default;
 
-  protected:
-    unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
-                          const MCFixup &Fixup, bool IsPCRel) const override;
-  };
-}
+protected:
+  unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+                        const MCFixup &Fixup, bool IsPCRel) const override;
+};
+
+} // end anonymous namespace
 
 X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
                                        uint16_t EMachine)
@@ -40,9 +44,6 @@ X86ELFObjectWriter::X86ELFObjectWriter(bool IsELF64, uint8_t OSABI,
                               (EMachine != ELF::EM_386) &&
                                   (EMachine != ELF::EM_IAMCU)) {}
 
-X86ELFObjectWriter::~X86ELFObjectWriter()
-{}
-
 enum X86_64RelType { RT64_64, RT64_32, RT64_32S, RT64_16, RT64_8 };
 
 static X86_64RelType getType64(unsigned Kind,
@@ -96,6 +97,7 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc,
   default:
     llvm_unreachable("Unimplemented");
   case MCSymbolRefExpr::VK_None:
+  case MCSymbolRefExpr::VK_X86_ABS8:
     switch (Type) {
     case RT64_64:
       return IsPCRel ? ELF::R_X86_64_PC64 : ELF::R_X86_64_64;
@@ -219,6 +221,7 @@ static unsigned getRelocType32(MCContext &Ctx,
   default:
     llvm_unreachable("Unimplemented");
   case MCSymbolRefExpr::VK_None:
+  case MCSymbolRefExpr::VK_X86_ABS8:
     switch (Type) {
     case RT32_32:
       return IsPCRel ? ELF::R_386_PC32 : ELF::R_386_32;
diff --git a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 8045e7c6d87295df88c2bf0f76f9410a95c0cbd5..10e2bbc64d3cfbb2066fc0dff57c06411ad028aa 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -11,35 +11,43 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "MCTargetDesc/X86MCTargetDesc.h"
 #include "MCTargetDesc/X86BaseInfo.h"
 #include "MCTargetDesc/X86FixupKinds.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/MC/MCCodeEmitter.h"
 #include "llvm/MC/MCContext.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
 #include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/MC/MCSubtargetInfo.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+#include <cstdlib>
 
 using namespace llvm;
 
 #define DEBUG_TYPE "mccodeemitter"
 
 namespace {
+
 class X86MCCodeEmitter : public MCCodeEmitter {
-  X86MCCodeEmitter(const X86MCCodeEmitter &) = delete;
-  void operator=(const X86MCCodeEmitter &) = delete;
   const MCInstrInfo &MCII;
   MCContext &Ctx;
+
 public:
   X86MCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
     : MCII(mcii), Ctx(ctx) {
   }
-
-  ~X86MCCodeEmitter() override {}
+  X86MCCodeEmitter(const X86MCCodeEmitter &) = delete;
+  X86MCCodeEmitter &operator=(const X86MCCodeEmitter &) = delete;
+  ~X86MCCodeEmitter() override = default;
 
   bool is64BitMode(const MCSubtargetInfo &STI) const {
     return STI.getFeatureBits()[X86::Mode64Bit];
@@ -106,8 +114,7 @@ public:
                      SmallVectorImpl<MCFixup> &Fixups,
                      int ImmOffset = 0) const;
 
-  inline static uint8_t ModRMByte(unsigned Mod, unsigned RegOpcode,
-                                  unsigned RM) {
+  static uint8_t ModRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) {
     assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
     return RM | (RegOpcode << 3) | (Mod << 6);
   }
@@ -149,12 +156,6 @@ public:
 
 } // end anonymous namespace
 
-MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
-                                            const MCRegisterInfo &MRI,
-                                            MCContext &Ctx) {
-  return new X86MCCodeEmitter(MCII, Ctx);
-}
-
 /// isDisp8 - Return true if this signed displacement fits in a 8-bit
 /// sign-extended field.
 static bool isDisp8(int Value) {
@@ -1436,7 +1437,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
   case X86II::MRM0r: case X86II::MRM1r:
   case X86II::MRM2r: case X86II::MRM3r:
   case X86II::MRM4r: case X86II::MRM5r:
-  case X86II::MRM6r: case X86II::MRM7r: {
+  case X86II::MRM6r: case X86II::MRM7r:
     if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
       ++CurOp;
     if (HasEVEX_K) // Skip writemask
@@ -1446,13 +1447,12 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
                      (Form == X86II::MRMXr) ? 0 : Form-X86II::MRM0r,
                      CurByte, OS);
     break;
-  }
 
   case X86II::MRMXm:
   case X86II::MRM0m: case X86II::MRM1m:
   case X86II::MRM2m: case X86II::MRM3m:
   case X86II::MRM4m: case X86II::MRM5m:
-  case X86II::MRM6m: case X86II::MRM7m: {
+  case X86II::MRM6m: case X86II::MRM7m:
     if (HasVEX_4V) // Skip the register dst (which is encoded in VEX_VVVV).
       ++CurOp;
     if (HasEVEX_K) // Skip writemask
@@ -1463,7 +1463,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
                      Rex, CurByte, OS, Fixups, STI);
     CurOp += X86::AddrNumOperands;
     break;
-  }
+
   case X86II::MRM_C0: case X86II::MRM_C1: case X86II::MRM_C2:
   case X86II::MRM_C3: case X86II::MRM_C4: case X86II::MRM_C5:
   case X86II::MRM_C6: case X86II::MRM_C7: case X86II::MRM_C8:
@@ -1527,3 +1527,9 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
   }
 #endif
 }
+
+MCCodeEmitter *llvm::createX86MCCodeEmitter(const MCInstrInfo &MCII,
+                                            const MCRegisterInfo &MRI,
+                                            MCContext &Ctx) {
+  return new X86MCCodeEmitter(MCII, Ctx); // MRI is not forwarded.
+}
diff --git a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
index 33376b6d1b90685f9c4d2c9320b43adde0474c68..d6777fc8aa6aeb9e553618f3c4d221e515cec61b 100644
--- a/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp
@@ -10,6 +10,7 @@
 #include "MCTargetDesc/X86FixupKinds.h"
 #include "MCTargetDesc/X86MCTargetDesc.h"
 #include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
 #include "llvm/MC/MCValue.h"
 #include "llvm/MC/MCWinCOFFObjectWriter.h"
 #include "llvm/Support/COFF.h"
@@ -17,28 +18,24 @@
 
 using namespace llvm;
 
-namespace llvm {
-  class MCObjectWriter;
-}
-
 namespace {
-  class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
-  public:
-    X86WinCOFFObjectWriter(bool Is64Bit);
-    ~X86WinCOFFObjectWriter() override;
 
-    unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
-                          bool IsCrossSection,
-                          const MCAsmBackend &MAB) const override;
-  };
-}
+class X86WinCOFFObjectWriter : public MCWinCOFFObjectTargetWriter {
+public:
+  X86WinCOFFObjectWriter(bool Is64Bit);
+  ~X86WinCOFFObjectWriter() override = default;
+
+  unsigned getRelocType(const MCValue &Target, const MCFixup &Fixup,
+                        bool IsCrossSection,
+                        const MCAsmBackend &MAB) const override;
+};
+
+} // end anonymous namespace
 
 X86WinCOFFObjectWriter::X86WinCOFFObjectWriter(bool Is64Bit)
     : MCWinCOFFObjectTargetWriter(Is64Bit ? COFF::IMAGE_FILE_MACHINE_AMD64
                                           : COFF::IMAGE_FILE_MACHINE_I386) {}
 
-X86WinCOFFObjectWriter::~X86WinCOFFObjectWriter() {}
-
 unsigned X86WinCOFFObjectWriter::getRelocType(const MCValue &Target,
                                               const MCFixup &Fixup,
                                               bool IsCrossSection,
diff --git a/lib/Target/X86/X86.h b/lib/Target/X86/X86.h
index 2cb80a482d06fe5ad30942e7a8525f849d67b772..fdcc7e1ab7b0545096cea2785abf8e0fd505e91d 100644
--- a/lib/Target/X86/X86.h
+++ b/lib/Target/X86/X86.h
@@ -21,7 +21,10 @@ namespace llvm {
 
 class FunctionPass;
 class ImmutablePass;
+class InstructionSelector;
 class PassRegistry;
+class X86RegisterBankInfo;
+class X86Subtarget;
 class X86TargetMachine;
 
 /// This pass converts a legalized DAG into a X86-specific DAG, ready for
@@ -92,6 +95,9 @@ void initializeFixupBWInstPassPass(PassRegistry &);
 /// encoding when possible in order to reduce code size.
 FunctionPass *createX86EvexToVexInsts();
 
+InstructionSelector *createX86InstructionSelector(X86Subtarget &,
+                                                  X86RegisterBankInfo &);
+
 void initializeEvexToVexInstPassPass(PassRegistry &);
 
 } // End llvm namespace
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 83a23d4ad680ecadffe229b8b08e8d2870d5bfea..8fcc8e31d5d44152b611f04a6e1d11b1c93acb58 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -187,8 +187,6 @@ def FeatureBMI2    : SubtargetFeature<"bmi2", "HasBMI2", "true",
                                       "Support BMI2 instructions">;
 def FeatureRTM     : SubtargetFeature<"rtm", "HasRTM", "true",
                                       "Support RTM instructions">;
-def FeatureHLE     : SubtargetFeature<"hle", "HasHLE", "true",
-                                      "Support HLE">;
 def FeatureADX     : SubtargetFeature<"adx", "HasADX", "true",
                                       "Support ADX instructions">;
 def FeatureSHA     : SubtargetFeature<"sha", "HasSHA", "true",
@@ -202,6 +200,8 @@ def FeatureLAHFSAHF : SubtargetFeature<"sahf", "HasLAHFSAHF", "true",
                                        "Support LAHF and SAHF instructions">;
 def FeatureMWAITX  : SubtargetFeature<"mwaitx", "HasMWAITX", "true",
                                       "Enable MONITORX/MWAITX timer functionality">;
+def FeatureCLZERO  : SubtargetFeature<"clzero", "HasCLZERO", "true",
+                                      "Enable Cache Line Zero">;
 def FeatureMPX     : SubtargetFeature<"mpx", "HasMPX", "true",
                                       "Support MPX instructions">;
 def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
@@ -215,18 +215,10 @@ def FeatureSlowDivide64 : SubtargetFeature<"idivq-to-divl",
 def FeaturePadShortFunctions : SubtargetFeature<"pad-short-functions",
                                      "PadShortFunctions", "true",
                                      "Pad short functions">;
-def FeatureINVPCID : SubtargetFeature<"invpcid", "HasInvPCId", "true",
-                                      "Invalidate Process-Context Identifier">;
-def FeatureVMFUNC  : SubtargetFeature<"vmfunc", "HasVMFUNC", "true",
-                                      "VM Functions">;
-def FeatureSMAP    : SubtargetFeature<"smap", "HasSMAP", "true",
-                                      "Supervisor Mode Access Protection">;
 def FeatureSGX     : SubtargetFeature<"sgx", "HasSGX", "true",
                                       "Enable Software Guard Extensions">;
 def FeatureCLFLUSHOPT : SubtargetFeature<"clflushopt", "HasCLFLUSHOPT", "true",
                                       "Flush A Cache Line Optimized">;
-def FeaturePCOMMIT : SubtargetFeature<"pcommit", "HasPCOMMIT", "true",
-                                      "Enable Persistent Commit">;
 def FeatureCLWB    : SubtargetFeature<"clwb", "HasCLWB", "true",
                                       "Cache Line Write Back">;
 // TODO: This feature ought to be renamed.
@@ -246,11 +238,12 @@ def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
 def FeatureSoftFloat
     : SubtargetFeature<"soft-float", "UseSoftFloat", "true",
                        "Use software floating point features.">;
-// On at least some AMD processors, there is no performance hazard to writing
-// only the lower parts of a YMM register without clearing the upper part.
-def FeatureFastPartialYMMWrite
-    : SubtargetFeature<"fast-partial-ymm-write", "HasFastPartialYMMWrite",
-                       "true", "Partial writes to YMM registers are fast">;
+// On some X86 processors, there is no performance hazard to writing only the
+// lower parts of a YMM or ZMM register without clearing the upper part.
+def FeatureFastPartialYMMorZMMWrite
+    : SubtargetFeature<"fast-partial-ymm-or-zmm-write",
+                       "HasFastPartialYMMorZMMWrite",
+                       "true", "Partial writes to YMM/ZMM registers are fast">;
 // FeatureFastScalarFSQRT should be enabled if scalar FSQRT has shorter latency
 // than the corresponding NR code. FeatureFastVectorFSQRT should be enabled if
 // vector FSQRT has higher throughput than the corresponding NR code.
@@ -271,6 +264,15 @@ def FeatureFastLZCNT
           "fast-lzcnt", "HasFastLZCNT", "true",
           "LZCNT instructions are as fast as most simple integer ops">;
 
+
+// Sandy Bridge and newer processors can use SHLD with the same source on both
+// inputs to implement rotate to avoid the partial flag update of the normal
+// rotate instructions.
+def FeatureFastSHLDRotate
+    : SubtargetFeature<
+          "fast-shld-rotate", "HasFastSHLDRotate", "true",
+          "SHLD can be used as a faster rotate">;
+
 //===----------------------------------------------------------------------===//
 // X86 processors supported.
 //===----------------------------------------------------------------------===//
@@ -466,7 +468,8 @@ def SNBFeatures : ProcessorFeatures<[], [
   FeatureXSAVE,
   FeatureXSAVEOPT,
   FeatureLAHFSAHF,
-  FeatureFastScalarFSQRT
+  FeatureFastScalarFSQRT,
+  FeatureFastSHLDRotate
 ]>;
 
 class SandyBridgeProc<string Name> : ProcModel<Name, SandyBridgeModel,
@@ -498,10 +501,6 @@ def HSWFeatures : ProcessorFeatures<IVBFeatures.Value, [
   FeatureFMA,
   FeatureLZCNT,
   FeatureMOVBE,
-  FeatureINVPCID,
-  FeatureVMFUNC,
-  FeatureRTM,
-  FeatureHLE,
   FeatureSlowIncDec
 ]>;
 
@@ -512,8 +511,7 @@ def : HaswellProc<"core-avx2">; // Legacy alias.
 
 def BDWFeatures : ProcessorFeatures<HSWFeatures.Value, [
   FeatureADX,
-  FeatureRDSEED,
-  FeatureSMAP
+  FeatureRDSEED
 ]>;
 class BroadwellProc<string Name> : ProcModel<Name, HaswellModel,
                                              BDWFeatures.Value, []>;
@@ -521,6 +519,7 @@ def : BroadwellProc<"broadwell">;
 
 def SKLFeatures : ProcessorFeatures<BDWFeatures.Value, [
   FeatureMPX,
+  FeatureRTM,
   FeatureXSAVEC,
   FeatureXSAVES,
   FeatureSGX,
@@ -547,7 +546,8 @@ class KnightsLandingProc<string Name> : ProcModel<Name, HaswellModel,
   FeatureLZCNT,
   FeatureBMI,
   FeatureBMI2,
-  FeatureFMA
+  FeatureFMA,
+  FeatureFastPartialYMMorZMMWrite
 ]>;
 def : KnightsLandingProc<"knl">;
 
@@ -558,7 +558,6 @@ def SKXFeatures : ProcessorFeatures<SKLFeatures.Value, [
   FeatureBWI,
   FeatureVLX,
   FeaturePKU,
-  FeaturePCOMMIT,
   FeatureCLWB
 ]>;
 
@@ -662,7 +661,7 @@ def : ProcessorModel<"btver2", BtVer2Model, [
   FeatureXSAVEOPT,
   FeatureSlowSHLD,
   FeatureLAHFSAHF,
-  FeatureFastPartialYMMWrite
+  FeatureFastPartialYMMorZMMWrite
 ]>;
 
 // Bulldozer
@@ -771,6 +770,7 @@ def: ProcessorModel<"znver1", BtVer2Model, [
   FeatureBMI,
   FeatureBMI2,
   FeatureCLFLUSHOPT,
+  FeatureCLZERO,
   FeatureCMPXCHG16B,
   FeatureF16C,
   FeatureFMA,
@@ -788,7 +788,6 @@ def: ProcessorModel<"znver1", BtVer2Model, [
   FeatureRDRAND,
   FeatureRDSEED,
   FeatureSHA,
-  FeatureSMAP,
   FeatureSSE4A,
   FeatureSlowSHLD,
   FeatureX87,
@@ -824,6 +823,7 @@ def : ProcessorModel<"x86-64", SandyBridgeModel,
 //===----------------------------------------------------------------------===//
 
 include "X86RegisterInfo.td"
+include "X86RegisterBanks.td"
 
 //===----------------------------------------------------------------------===//
 // Instruction Descriptions
diff --git a/lib/Target/X86/X86AsmPrinter.h b/lib/Target/X86/X86AsmPrinter.h
index 6798253d0f6aa07075f3ba532ed7803afc167404..44bc373b0394c5afd724cdc622913f346cebc9d4 100644
--- a/lib/Target/X86/X86AsmPrinter.h
+++ b/lib/Target/X86/X86AsmPrinter.h
@@ -81,7 +81,7 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
   void LowerSTACKMAP(const MachineInstr &MI);
   void LowerPATCHPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
   void LowerSTATEPOINT(const MachineInstr &MI, X86MCInstLower &MCIL);
-  void LowerFAULTING_LOAD_OP(const MachineInstr &MI, X86MCInstLower &MCIL);
+  void LowerFAULTING_OP(const MachineInstr &MI, X86MCInstLower &MCIL);
   void LowerPATCHABLE_OP(const MachineInstr &MI, X86MCInstLower &MCIL);
 
   void LowerTlsAddr(X86MCInstLower &MCInstLowering, const MachineInstr &MI);
@@ -92,6 +92,8 @@ class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
   void LowerPATCHABLE_RET(const MachineInstr &MI, X86MCInstLower &MCIL);
   void LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
 
+  void LowerFENTRY_CALL(const MachineInstr &MI, X86MCInstLower &MCIL);
+
   // Helper function that emits the XRay sleds we've collected for a particular
   // function.
   void EmitXRayTable();
diff --git a/lib/Target/X86/X86CallFrameOptimization.cpp b/lib/Target/X86/X86CallFrameOptimization.cpp
index 23606a39d1a28bb7f8dcbf1b5d2f64a9d0785e94..b8f088dfbe589fe76de9d6b5ac1c3602d0bf7cb7 100644
--- a/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -17,22 +17,35 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include <algorithm>
-
-#include "X86.h"
+#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86FrameLowering.h"
 #include "X86InstrInfo.h"
 #include "X86MachineFunctionInfo.h"
+#include "X86RegisterInfo.h"
 #include "X86Subtarget.h"
-#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DebugLoc.h"
 #include "llvm/IR/Function.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iterator>
 
 using namespace llvm;
 
@@ -44,6 +57,7 @@ static cl::opt<bool>
                cl::init(false), cl::Hidden);
 
 namespace {
+
 class X86CallFrameOptimization : public MachineFunctionPass {
 public:
   X86CallFrameOptimization() : MachineFunctionPass(ID) {}
@@ -53,30 +67,28 @@ public:
 private:
   // Information we know about a particular call site
   struct CallContext {
-    CallContext()
-        : FrameSetup(nullptr), Call(nullptr), SPCopy(nullptr), ExpectedDist(0),
-          MovVector(4, nullptr), NoStackParams(false), UsePush(false) {}
+    CallContext() : FrameSetup(nullptr), MovVector(4, nullptr) {}
 
     // Iterator referring to the frame setup instruction
     MachineBasicBlock::iterator FrameSetup;
 
     // Actual call instruction
-    MachineInstr *Call;
+    MachineInstr *Call = nullptr;
 
     // A copy of the stack pointer
-    MachineInstr *SPCopy;
+    MachineInstr *SPCopy = nullptr;
 
     // The total displacement of all passed parameters
-    int64_t ExpectedDist;
+    int64_t ExpectedDist = 0;
 
     // The sequence of movs used to pass the parameters
     SmallVector<MachineInstr *, 4> MovVector;
 
     // True if this call site has no stack parameters
-    bool NoStackParams;
+    bool NoStackParams = false;
 
     // True if this call site can use push instructions
-    bool UsePush;
+    bool UsePush = false;
   };
 
   typedef SmallVector<CallContext, 8> ContextVector;
@@ -112,11 +124,8 @@ private:
 };
 
 char X86CallFrameOptimization::ID = 0;
-} // end anonymous namespace
 
-FunctionPass *llvm::createX86CallFrameOptimization() {
-  return new X86CallFrameOptimization();
-}
+} // end anonymous namespace
 
 // This checks whether the transformation is legal.
 // Also returns false in cases where it's potentially legal, but
@@ -485,7 +494,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
       Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).add(PushOp);
       break;
     case X86::MOV32mr:
-    case X86::MOV64mr:
+    case X86::MOV64mr: {
       unsigned int Reg = PushOp.getReg();
 
       // If storing a 32-bit vreg on 64-bit targets, extend to a 64-bit vreg
@@ -524,6 +533,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
       }
       break;
     }
+    }
 
     // For debugging, when using SP-based CFA, we need to adjust the CFA
     // offset after each push.
@@ -583,3 +593,7 @@ MachineInstr *X86CallFrameOptimization::canFoldIntoRegPush(
 
   return &DefMI;
 }
+
+FunctionPass *llvm::createX86CallFrameOptimization() {
+  return new X86CallFrameOptimization();
+}
diff --git a/lib/Target/X86/X86CallLowering.cpp b/lib/Target/X86/X86CallLowering.cpp
index b648b90666e5bedf8265f4444480d2dd489922d8..137ef166aaeb00897734bafbc5dd212691f3bef4 100644
--- a/lib/Target/X86/X86CallLowering.cpp
+++ b/lib/Target/X86/X86CallLowering.cpp
@@ -14,14 +14,14 @@
 //===----------------------------------------------------------------------===//
 
 #include "X86CallLowering.h"
+#include "X86CallingConv.h"
 #include "X86ISelLowering.h"
 #include "X86InstrInfo.h"
 #include "X86TargetMachine.h"
-#include "X86CallingConv.h"
 
 #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
-#include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
@@ -35,17 +35,94 @@ using namespace llvm;
 X86CallLowering::X86CallLowering(const X86TargetLowering &TLI)
     : CallLowering(&TLI) {}
 
+void X86CallLowering::splitToValueTypes(const ArgInfo &OrigArg,
+                                        SmallVectorImpl<ArgInfo> &SplitArgs,
+                                        const DataLayout &DL,
+                                        MachineRegisterInfo &MRI,
+                                        SplitArgTy PerformArgSplit) const {
+  // Append to SplitArgs one ArgInfo per register needed to hold OrigArg.
+  const X86TargetLowering &TLI = *getTLI<X86TargetLowering>();
+  LLVMContext &Context = OrigArg.Ty->getContext();
+  EVT VT = TLI.getValueType(DL, OrigArg.Ty);
+  unsigned NumParts = TLI.getNumRegisters(Context, VT);
+
+  if (NumParts == 1) {
+    // Single register: keep the original vreg but replace its type with the
+    // one derived from VT (e.g. pointer -> GPR); no split callback is made.
+    SplitArgs.emplace_back(OrigArg.Reg, VT.getTypeForEVT(Context),
+                           OrigArg.Flags, OrigArg.IsFixed);
+    return;
+  }
+
+  // Multiple registers: create one generic vreg of the register type per part
+  // and report each part's bit offset within the original value to the caller.
+  EVT PartVT = TLI.getRegisterType(Context, VT);
+  Type *PartTy = PartVT.getTypeForEVT(Context);
+
+  for (unsigned i = 0; i < NumParts; ++i) {
+    ArgInfo Info =
+        ArgInfo{MRI.createGenericVirtualRegister(getLLTForType(*PartTy, DL)),
+                PartTy, OrigArg.Flags};
+    SplitArgs.push_back(Info);
+    PerformArgSplit(Info.Reg, PartVT.getSizeInBits() * i);
+  }
+}
+
+namespace {
+struct FuncReturnHandler : public CallLowering::ValueHandler {
+  FuncReturnHandler(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
+                    MachineInstrBuilder &MIB, CCAssignFn *AssignFn)
+      : ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB) {}
+  // Return values assigned to stack locations are not supported yet.
+  unsigned getStackAddress(uint64_t Size, int64_t Offset,
+                           MachinePointerInfo &MPO) override {
+    llvm_unreachable("Don't know how to get a stack address yet");
+  }
+  // Copy the extended value into PhysReg; MIB gets PhysReg as implicit use.
+  void assignValueToReg(unsigned ValVReg, unsigned PhysReg,
+                        CCValAssign &VA) override {
+    MIB.addUse(PhysReg, RegState::Implicit);
+    unsigned ExtReg = extendRegister(ValVReg, VA);
+    MIRBuilder.buildCopy(PhysReg, ExtReg);
+  }
+  // Storing return values to memory is not supported yet.
+  void assignValueToAddress(unsigned ValVReg, unsigned Addr, uint64_t Size,
+                            MachinePointerInfo &MPO, CCValAssign &VA) override {
+    llvm_unreachable("Don't know how to assign a value to an address yet");
+  }
+  // Instruction builder that accumulates the implicit register uses.
+  MachineInstrBuilder &MIB;
+};
+} // end anonymous namespace
+
 bool X86CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
                                   const Value *Val, unsigned VReg) const {
-  // TODO: handle functions returning non-void values.
-  if (Val)
-    return false;
 
-  // silence unused-function warning, remove after the function implementation.
-  (void)RetCC_X86;
+  assert(((Val && VReg) || (!Val && !VReg)) && "Return value without a vreg");
+
+  auto MIB = MIRBuilder.buildInstrNoInsert(X86::RET).addImm(0);
+
+  if (VReg) {
+    MachineFunction &MF = MIRBuilder.getMF();
+    MachineRegisterInfo &MRI = MF.getRegInfo();
+    auto &DL = MF.getDataLayout();
+    const Function &F = *MF.getFunction();
 
-  MIRBuilder.buildInstr(X86::RET).addImm(0);
+    ArgInfo OrigArg{VReg, Val->getType()};
+    setArgFlags(OrigArg, AttributeList::ReturnIndex, DL, F);
 
+    SmallVector<ArgInfo, 8> SplitArgs;
+    splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
+                      [&](unsigned Reg, uint64_t Offset) {
+                        MIRBuilder.buildExtract(Reg, VReg, Offset);
+                      });
+
+    FuncReturnHandler Handler(MIRBuilder, MRI, MIB, RetCC_X86);
+    if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+      return false;
+  }
+
+  MIRBuilder.insertInstr(MIB);
   return true;
 }
 
@@ -62,9 +139,8 @@ struct FormalArgHandler : public CallLowering::ValueHandler {
     int FI = MFI.CreateFixedObject(Size, Offset, true);
     MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
 
-    unsigned AddrReg =
-        MRI.createGenericVirtualRegister(LLT::pointer(0,
-                                         DL.getPointerSizeInBits(0)));
+    unsigned AddrReg = MRI.createGenericVirtualRegister(
+        LLT::pointer(0, DL.getPointerSizeInBits(0)));
     MIRBuilder.buildFrameIndex(AddrReg, FI);
     return AddrReg;
   }
@@ -86,7 +162,7 @@ struct FormalArgHandler : public CallLowering::ValueHandler {
 
   const DataLayout &DL;
 };
-}
+} // namespace
 
 bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
                                            const Function &F,
@@ -94,22 +170,48 @@ bool X86CallLowering::lowerFormalArguments(MachineIRBuilder &MIRBuilder,
   if (F.arg_empty())
     return true;
 
-  //TODO: handle variadic function
+  // TODO: handle variadic function
   if (F.isVarArg())
     return false;
 
-  auto DL = MIRBuilder.getMF().getDataLayout();
+  MachineFunction &MF = MIRBuilder.getMF();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  const DataLayout &DL = MF.getDataLayout(); // Bind by reference; no copy.
 
-  SmallVector<ArgInfo, 8> ArgInfos;
+  SmallVector<ArgInfo, 8> SplitArgs;
   unsigned Idx = 0;
-  for (auto &Arg : F.getArgumentList()) {
-    ArgInfo AInfo(VRegs[Idx], Arg.getType());
-    setArgFlags(AInfo, Idx + 1, DL, F);
-    ArgInfos.push_back(AInfo);
+  for (auto &Arg : F.args()) {
+    ArgInfo OrigArg(VRegs[Idx], Arg.getType());
+    setArgFlags(OrigArg, Idx + 1, DL, F);
+    LLT Ty = MRI.getType(VRegs[Idx]);
+    unsigned Dst = VRegs[Idx];
+    bool Split = false;
+    splitToValueTypes(OrigArg, SplitArgs, DL, MRI,
+                      [&](unsigned Reg, uint64_t Offset) {
+                        if (!Split) {
+                          Split = true;
+                          Dst = MRI.createGenericVirtualRegister(Ty);
+                          MIRBuilder.buildUndef(Dst);
+                        }
+                        unsigned Tmp = MRI.createGenericVirtualRegister(Ty);
+                        MIRBuilder.buildInsert(Tmp, Dst, Reg, Offset);
+                        Dst = Tmp;
+                      });
+    if (Dst != VRegs[Idx])
+      MIRBuilder.buildCopy(VRegs[Idx], Dst);
     Idx++;
   }
 
-  FormalArgHandler ArgHandler(MIRBuilder, MIRBuilder.getMF().getRegInfo(),
-                              CC_X86, DL);
-  return handleAssignments(MIRBuilder, ArgInfos, ArgHandler);
+  MachineBasicBlock &MBB = MIRBuilder.getMBB();
+  if (!MBB.empty())
+    MIRBuilder.setInstr(*MBB.begin());
+
+  FormalArgHandler Handler(MIRBuilder, MRI, CC_X86, DL);
+  if (!handleAssignments(MIRBuilder, SplitArgs, Handler))
+    return false;
+
+  // Move back to the end of the basic block.
+  MIRBuilder.setMBB(MBB);
+
+  return true;
 }
diff --git a/lib/Target/X86/X86CallLowering.h b/lib/Target/X86/X86CallLowering.h
index f2672f09d8558da23ed485828e03edd5ac0e0431..204e6974c702e3601cdddf8a8c33ffa8d1d71728 100644
--- a/lib/Target/X86/X86CallLowering.h
+++ b/lib/Target/X86/X86CallLowering.h
@@ -34,6 +34,14 @@ public:
 
   bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
                             ArrayRef<unsigned> VRegs) const override;
+private:
+  /// A function of this type is used to perform value split action.
+  typedef std::function<void(unsigned, uint64_t)> SplitArgTy;
+
+  void splitToValueTypes(const ArgInfo &OrigArgInfo,
+                         SmallVectorImpl<ArgInfo> &SplitArgs,
+                         const DataLayout &DL, MachineRegisterInfo &MRI,
+                         SplitArgTy SplitArg) const;
 };
 } // End of namespace llvm;
 #endif
diff --git a/lib/Target/X86/X86CallingConv.td b/lib/Target/X86/X86CallingConv.td
index cf7bc981b8a587ed771f8537cd58709391363340..6781d761a1c4fca81d0073959095900996aa5846 100644
--- a/lib/Target/X86/X86CallingConv.td
+++ b/lib/Target/X86/X86CallingConv.td
@@ -1074,6 +1074,8 @@ def CSR_32_AllRegs_AVX512 : CalleeSavedRegs<(add CSR_32_AllRegs,
                                                  (sequence "K%u", 0, 7))>;
 
 def CSR_64_AllRegs     : CalleeSavedRegs<(add CSR_64_MostRegs, RAX)>;
+def CSR_64_AllRegs_NoSSE : CalleeSavedRegs<(add RAX, RBX, RCX, RDX, RSI, RDI, R8, R9,
+                                                R10, R11, R12, R13, R14, R15, RBP)>;
 def CSR_64_AllRegs_AVX : CalleeSavedRegs<(sub (add CSR_64_MostRegs, RAX,
                                                    (sequence "YMM%u", 0, 15)),
                                               (sequence "XMM%u", 0, 15))>;
diff --git a/lib/Target/X86/X86EvexToVex.cpp b/lib/Target/X86/X86EvexToVex.cpp
index bdd1ab537bb2e977a45a703d516857dd0d1c3ad3..6472bbbc90169a00d74a794cf5483299a6643bfa 100755
--- a/lib/Target/X86/X86EvexToVex.cpp
+++ b/lib/Target/X86/X86EvexToVex.cpp
@@ -20,16 +20,30 @@
 //===---------------------------------------------------------------------===//
 
 #include "InstPrinter/X86InstComments.h"
+#include "MCTargetDesc/X86BaseInfo.h"
 #include "X86.h"
-#include "X86InstrBuilder.h"
 #include "X86InstrInfo.h"
-#include "X86InstrTablesInfo.h"
-#include "X86MachineFunctionInfo.h"
 #include "X86Subtarget.h"
-#include "X86TargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/Pass.h"
+#include <cassert>
+#include <cstdint>
 
 using namespace llvm;
 
+// Including the generated EVEX2VEX tables.
+struct X86EvexToVexCompressTableEntry {
+  uint16_t EvexOpcode;
+  uint16_t VexOpcode;
+};
+#include "X86GenEVEX2VEXTables.inc"
+
 #define EVEX2VEX_DESC "Compressing EVEX instrs to VEX encoding when possible"
 #define EVEX2VEX_NAME "x86-evex-to-vex-compress"
 
@@ -56,8 +70,6 @@ class EvexToVexInstPass : public MachineFunctionPass {
 public:
   static char ID;
 
-  StringRef getPassName() const override { return EVEX2VEX_DESC; }
-
   EvexToVexInstPass() : MachineFunctionPass(ID) {
     initializeEvexToVexInstPassPass(*PassRegistry::getPassRegistry());
 
@@ -72,6 +84,8 @@ public:
     }
   }
 
+  StringRef getPassName() const override { return EVEX2VEX_DESC; }
+
   /// Loop over all of the basic blocks, replacing EVEX instructions
   /// by equivalent VEX instructions when possible for reducing code size.
   bool runOnMachineFunction(MachineFunction &MF) override;
@@ -88,13 +102,8 @@ private:
 };
 
 char EvexToVexInstPass::ID = 0;
-}
 
-INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false)
-
-FunctionPass *llvm::createX86EvexToVexInsts() {
-  return new EvexToVexInstPass();
-}
+} // end anonymous namespace
 
 bool EvexToVexInstPass::runOnMachineFunction(MachineFunction &MF) {
   TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
@@ -125,7 +134,6 @@ void EvexToVexInstPass::AddTableEntry(EvexToVexTableType &EvexToVexTable,
 // For EVEX instructions that can be encoded using VEX encoding
 // replace them by the VEX encoding in order to reduce size.
 bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
-
   // VEX format.
   // # of bytes: 0,2,3  1      1      0,1   0,1,2,4  0,1
   //  [Prefixes] [VEX]  OPCODE ModR/M [SIB] [DISP]  [IMM]
@@ -211,3 +219,9 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
   MI.setAsmPrinterFlag(AC_EVEX_2_VEX);
   return true; 
 }
+
+INITIALIZE_PASS(EvexToVexInstPass, EVEX2VEX_NAME, EVEX2VEX_DESC, false, false)
+
+FunctionPass *llvm::createX86EvexToVexInsts() {
+  return new EvexToVexInstPass();
+}
diff --git a/lib/Target/X86/X86ExpandPseudo.cpp b/lib/Target/X86/X86ExpandPseudo.cpp
index c4bc29e963e7babdfbdccb60a4fe7918c74498cd..5dfd95f713015f7a1405c2ef2e7dc8a9fa6de476 100644
--- a/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/lib/Target/X86/X86ExpandPseudo.cpp
@@ -122,8 +122,9 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
         Op = X86::TAILJMPd_CC;
         break;
       case X86::TCRETURNdi64cc:
-        assert(!IsWin64 && "Conditional tail calls confuse the Win64 unwinder.");
-        // TODO: We could do it for Win64 "leaf" functions though; PR30337.
+        assert(!MBB.getParent()->hasWinCFI() &&
+               "Conditional tail calls confuse "
+               "the Win64 unwinder.");
         Op = X86::TAILJMPd64_CC;
         break;
       default:
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index c890fdd1e519824c5c9278675157fae2d63d595a..e82f43c1e0e123c62501219621fa975fa47a0c6c 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -367,6 +367,10 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
   switch (VT.getSimpleVT().SimpleTy) {
   default: return false;
   case MVT::i1:
+    // TODO: Support this properly.
+    if (Subtarget->hasAVX512())
+      return false;
+    LLVM_FALLTHROUGH;
   case MVT::i8:
     Opc = X86::MOV8rm;
     RC  = &X86::GR8RegClass;
@@ -524,6 +528,7 @@ bool X86FastISel::X86FastEmitLoad(EVT VT, X86AddressMode &AM,
 bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
                                    X86AddressMode &AM,
                                    MachineMemOperand *MMO, bool Aligned) {
+  bool HasSSE1 = Subtarget->hasSSE1();
   bool HasSSE2 = Subtarget->hasSSE2();
   bool HasSSE4A = Subtarget->hasSSE4A();
   bool HasAVX = Subtarget->hasAVX();
@@ -537,6 +542,16 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
   case MVT::f80: // No f80 support yet.
   default: return false;
   case MVT::i1: {
+    // In case ValReg is a K register, COPY to a GPR
+    if (MRI.getRegClass(ValReg) == &X86::VK1RegClass) {
+      unsigned KValReg = ValReg;
+      ValReg = createResultReg(&X86::GR32RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ValReg)
+          .addReg(KValReg);
+      ValReg = fastEmitInst_extractsubreg(MVT::i8, ValReg, /*Kill=*/true,
+                                          X86::sub_8bit);
+    }
     // Mask out all but lowest bit.
     unsigned AndResult = createResultReg(&X86::GR8RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -574,6 +589,9 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
     } else
       Opc = X86::ST_Fp64m;
     break;
+  case MVT::x86mmx:
+    Opc = (IsNonTemporal && HasSSE1) ? X86::MMX_MOVNTQmr : X86::MMX_MOVQ64mr;
+    break;
   case MVT::v4f32:
     if (Aligned) {
       if (IsNonTemporal)
@@ -1268,6 +1286,16 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
       if (SrcVT == MVT::i1) {
         if (Outs[0].Flags.isSExt())
           return false;
+        // In case SrcReg is a K register, COPY to a GPR
+        if (MRI.getRegClass(SrcReg) == &X86::VK1RegClass) {
+          unsigned KSrcReg = SrcReg;
+          SrcReg = createResultReg(&X86::GR32RegClass);
+          BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                  TII.get(TargetOpcode::COPY), SrcReg)
+              .addReg(KSrcReg);
+          SrcReg = fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true,
+                                              X86::sub_8bit);
+        }
         SrcReg = fastEmitZExtFromI1(MVT::i8, SrcReg, /*TODO: Kill=*/false);
         SrcVT = MVT::i8;
       }
@@ -1559,6 +1587,17 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
   // Handle zero-extension from i1 to i8, which is common.
   MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
   if (SrcVT == MVT::i1) {
+    // In case ResultReg is a K register, COPY to a GPR
+    if (MRI.getRegClass(ResultReg) == &X86::VK1RegClass) {
+      unsigned KResultReg = ResultReg;
+      ResultReg = createResultReg(&X86::GR32RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(TargetOpcode::COPY), ResultReg)
+          .addReg(KResultReg);
+      ResultReg = fastEmitInst_extractsubreg(MVT::i8, ResultReg, /*Kill=*/true,
+                                             X86::sub_8bit);
+    }
+
     // Set the high bits to zero.
     ResultReg = fastEmitZExtFromI1(MVT::i8, ResultReg, /*TODO: Kill=*/false);
     SrcVT = MVT::i8;
@@ -1740,10 +1779,12 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
   // In case OpReg is a K register, COPY to a GPR
   if (MRI.getRegClass(OpReg) == &X86::VK1RegClass) {
     unsigned KOpReg = OpReg;
-    OpReg = createResultReg(&X86::GR8RegClass);
+    OpReg = createResultReg(&X86::GR32RegClass);
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
             TII.get(TargetOpcode::COPY), OpReg)
         .addReg(KOpReg);
+    OpReg = fastEmitInst_extractsubreg(MVT::i8, OpReg, /*Kill=*/true,
+                                       X86::sub_8bit);
   }
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
       .addReg(OpReg)
@@ -2084,10 +2125,12 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
     // In case OpReg is a K register, COPY to a GPR
     if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
       unsigned KCondReg = CondReg;
-      CondReg = createResultReg(&X86::GR8RegClass);
+      CondReg = createResultReg(&X86::GR32RegClass);
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(TargetOpcode::COPY), CondReg)
           .addReg(KCondReg, getKillRegState(CondIsKill));
+      CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Kill=*/true,
+                                           X86::sub_8bit);
     }
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
         .addReg(CondReg, getKillRegState(CondIsKill))
@@ -2297,10 +2340,12 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
     // In case OpReg is a K register, COPY to a GPR
     if (MRI.getRegClass(CondReg) == &X86::VK1RegClass) {
       unsigned KCondReg = CondReg;
-      CondReg = createResultReg(&X86::GR8RegClass);
+      CondReg = createResultReg(&X86::GR32RegClass);
       BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
               TII.get(TargetOpcode::COPY), CondReg)
           .addReg(KCondReg, getKillRegState(CondIsKill));
+      CondReg = fastEmitInst_extractsubreg(MVT::i8, CondReg, /*Kill=*/true,
+                                           X86::sub_8bit);
     }
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::TEST8ri))
         .addReg(CondReg, getKillRegState(CondIsKill))
@@ -2423,12 +2468,22 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
   if (OpReg == 0)
     return false;
 
+  // On AVX an IMPLICIT_DEF vreg is passed as the extra leading source operand.
+  unsigned ImplicitDefReg = 0;
+  if (Subtarget->hasAVX()) {
+    ImplicitDefReg = createResultReg(RC);
+    BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+            TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
+  }
+
   unsigned ResultReg = createResultReg(RC);
   MachineInstrBuilder MIB;
   MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc),
                 ResultReg);
+
   if (Subtarget->hasAVX())
-    MIB.addReg(OpReg);
+    MIB.addReg(ImplicitDefReg);
+
   MIB.addReg(OpReg);
   updateValueMap(I, ResultReg);
   return true;
@@ -2461,7 +2516,8 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) {
   EVT DstVT = TLI.getValueType(DL, I->getType());
 
   // This code only handles truncation to byte.
-  if (DstVT != MVT::i8 && DstVT != MVT::i1)
+  // TODO: Support truncate to i1 with AVX512.
+  if (DstVT != MVT::i8 && (DstVT != MVT::i1 || Subtarget->hasAVX512()))
     return false;
   if (!TLI.isTypeLegal(SrcVT))
     return false;
@@ -3266,6 +3322,16 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
 
       // Handle zero-extension from i1 to i8, which is common.
       if (ArgVT == MVT::i1) {
+        // In case ArgReg is a K register, COPY to a GPR
+        if (MRI.getRegClass(ArgReg) == &X86::VK1RegClass) {
+          unsigned KArgReg = ArgReg;
+          ArgReg = createResultReg(&X86::GR32RegClass);
+          BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                  TII.get(TargetOpcode::COPY), ArgReg)
+              .addReg(KArgReg);
+          ArgReg = fastEmitInst_extractsubreg(MVT::i8, ArgReg, /*Kill=*/true,
+                                              X86::sub_8bit);
+        }
         // Set the high bits to zero.
         ArgReg = fastEmitZExtFromI1(MVT::i8, ArgReg, /*TODO: Kill=*/false);
         ArgVT = MVT::i8;
@@ -3463,6 +3529,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
     CCValAssign &VA = RVLocs[i];
     EVT CopyVT = VA.getValVT();
     unsigned CopyReg = ResultReg + i;
+    unsigned SrcReg = VA.getLocReg();
 
     // If this is x86-64, and we disabled SSE, we can't return FP values
     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64) &&
@@ -3470,9 +3537,19 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
       report_fatal_error("SSE register return with SSE disabled");
     }
 
+    // If the return value is an i1 and AVX-512 is enabled, we need
+    // to do a fixup to make the copy legal.
+    if (CopyVT == MVT::i1 && SrcReg == X86::AL && Subtarget->hasAVX512()) {
+      // Need to copy to a GR32 first.
+      // TODO: MOVZX isn't great here. We don't care about the upper bits.
+      SrcReg = createResultReg(&X86::GR32RegClass);
+      BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+              TII.get(X86::MOVZX32rr8), SrcReg).addReg(X86::AL);
+    }
+
     // If we prefer to use the value in xmm registers, copy it out as f80 and
     // use a truncate to move it from fp stack reg to xmm reg.
-    if ((VA.getLocReg() == X86::FP0 || VA.getLocReg() == X86::FP1) &&
+    if ((SrcReg == X86::FP0 || SrcReg == X86::FP1) &&
         isScalarFPTypeInSSEReg(VA.getValVT())) {
       CopyVT = MVT::f80;
       CopyReg = createResultReg(&X86::RFP80RegClass);
@@ -3480,7 +3557,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
 
     // Copy out the result.
     BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
-            TII.get(TargetOpcode::COPY), CopyReg).addReg(VA.getLocReg());
+            TII.get(TargetOpcode::COPY), CopyReg).addReg(SrcReg);
     InRegs.push_back(VA.getLocReg());
 
     // Round the f80 to the right size, which also moves it to the appropriate
@@ -3601,6 +3678,13 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
     switch (VT.SimpleTy) {
     default: llvm_unreachable("Unexpected value type");
     case MVT::i1:
+      if (Subtarget->hasAVX512()) { // Need to copy to a VK1 register.
+        unsigned ResultReg = createResultReg(&X86::VK1RegClass);
+        BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+                TII.get(TargetOpcode::COPY), ResultReg).addReg(SrcReg);
+        return ResultReg;
+      }
+      LLVM_FALLTHROUGH; // Without AVX512, i1 is materialized like i8.
     case MVT::i8:
       return fastEmitInst_extractsubreg(MVT::i8, SrcReg, /*Kill=*/true,
                                         X86::sub_8bit);
@@ -3622,7 +3706,12 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
   unsigned Opc = 0;
   switch (VT.SimpleTy) {
   default: llvm_unreachable("Unexpected value type");
-  case MVT::i1:  VT = MVT::i8;       LLVM_FALLTHROUGH;
+  case MVT::i1:
+    // TODO: Support this properly.
+    if (Subtarget->hasAVX512())
+      return 0;
+    VT = MVT::i8;
+    LLVM_FALLTHROUGH;
   case MVT::i8:  Opc = X86::MOV8ri;  break;
   case MVT::i16: Opc = X86::MOV16ri; break;
   case MVT::i32: Opc = X86::MOV32ri; break;
diff --git a/lib/Target/X86/X86FixupBWInsts.cpp b/lib/Target/X86/X86FixupBWInsts.cpp
index 3583980ff4f314d6a38f6955e97ee269d0ce6791..c28746f96439baba63b502543d4c6a0be627a986 100644
--- a/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/lib/Target/X86/X86FixupBWInsts.cpp
@@ -95,10 +95,9 @@ class FixupBWInstPass : public MachineFunctionPass {
 
   // Change the MachineInstr \p MI into an eqivalent 32 bit instruction if
   // possible.  Return the replacement instruction if OK, return nullptr
-  // otherwise. Set WasCandidate to true or false depending on whether the
-  // MI was a candidate for this sort of transformation.
-  MachineInstr *tryReplaceInstr(MachineInstr *MI, MachineBasicBlock &MBB,
-                                bool &WasCandidate) const;
+  // otherwise.
+  MachineInstr *tryReplaceInstr(MachineInstr *MI, MachineBasicBlock &MBB) const;
+
 public:
   static char ID;
 
@@ -269,12 +268,8 @@ MachineInstr *FixupBWInstPass::tryReplaceCopy(MachineInstr *MI) const {
   return MIB;
 }
 
-MachineInstr *FixupBWInstPass::tryReplaceInstr(
-                  MachineInstr *MI, MachineBasicBlock &MBB,
-                  bool &WasCandidate) const {
-  MachineInstr *NewMI = nullptr;
-  WasCandidate = false;
-
+MachineInstr *FixupBWInstPass::tryReplaceInstr(MachineInstr *MI,
+                                               MachineBasicBlock &MBB) const {
   // See if this is an instruction of the type we are currently looking for.
   switch (MI->getOpcode()) {
 
@@ -282,12 +277,9 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(
     // Only replace 8 bit loads with the zero extending versions if
     // in an inner most loop and not optimizing for size. This takes
     // an extra byte to encode, and provides limited performance upside.
-    if (MachineLoop *ML = MLI->getLoopFor(&MBB)) {
-      if (ML->begin() == ML->end() && !OptForSize) {
-        NewMI = tryReplaceLoad(X86::MOVZX32rm8, MI);
-        WasCandidate = true;
-      }
-    }
+    if (MachineLoop *ML = MLI->getLoopFor(&MBB))
+      if (ML->begin() == ML->end() && !OptForSize)
+        return tryReplaceLoad(X86::MOVZX32rm8, MI);
     break;
 
   case X86::MOV16rm:
@@ -295,9 +287,7 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(
     // Code size is the same, and there is sometimes a perf advantage
     // from eliminating a false dependence on the upper portion of
     // the register.
-    NewMI = tryReplaceLoad(X86::MOVZX32rm16, MI);
-    WasCandidate = true;
-    break;
+    return tryReplaceLoad(X86::MOVZX32rm16, MI);
 
   case X86::MOV8rr:
   case X86::MOV16rr:
@@ -305,16 +295,14 @@ MachineInstr *FixupBWInstPass::tryReplaceInstr(
     // Code size is either less (16) or equal (8), and there is sometimes a
     // perf advantage from eliminating a false dependence on the upper portion
     // of the register.
-    NewMI = tryReplaceCopy(MI);
-    WasCandidate = true;
-    break;
+    return tryReplaceCopy(MI);
 
   default:
     // nothing to do here.
     break;
   }
 
-  return NewMI;
+  return nullptr;
 }
 
 void FixupBWInstPass::processBasicBlock(MachineFunction &MF,
@@ -338,18 +326,11 @@ void FixupBWInstPass::processBasicBlock(MachineFunction &MF,
   // We run after PEI, so we need to AddPristinesAndCSRs.
   LiveRegs.addLiveOuts(MBB);
 
-  bool WasCandidate = false;
-
   for (auto I = MBB.rbegin(); I != MBB.rend(); ++I) {
     MachineInstr *MI = &*I;
     
-    MachineInstr *NewMI = tryReplaceInstr(MI, MBB, WasCandidate);
-
-    // Add this to replacements if it was a candidate, even if NewMI is
-    // nullptr.  We will revisit that in a bit.
-    if (WasCandidate) {
+    if (MachineInstr *NewMI = tryReplaceInstr(MI, MBB))
       MIReplacements.push_back(std::make_pair(MI, NewMI));
-    }
 
     // We're done with this instruction, update liveness for the next one.
     LiveRegs.stepBackward(*MI);
@@ -359,9 +340,7 @@ void FixupBWInstPass::processBasicBlock(MachineFunction &MF,
     MachineInstr *MI = MIReplacements.back().first;
     MachineInstr *NewMI = MIReplacements.back().second;
     MIReplacements.pop_back();
-    if (NewMI) {
-      MBB.insert(MI, NewMI);
-      MBB.erase(MI);
-    }
+    MBB.insert(MI, NewMI);
+    MBB.erase(MI);
   }
 }
diff --git a/lib/Target/X86/X86FrameLowering.cpp b/lib/Target/X86/X86FrameLowering.cpp
index cd690442bb9f83f20285452af4ef673a02e4a3b0..fc5c9ac2e25f374aee21418906128520c0c9afac 100644
--- a/lib/Target/X86/X86FrameLowering.cpp
+++ b/lib/Target/X86/X86FrameLowering.cpp
@@ -252,40 +252,76 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
                                     int64_t NumBytes, bool InEpilogue) const {
   bool isSub = NumBytes < 0;
   uint64_t Offset = isSub ? -NumBytes : NumBytes;
+  MachineInstr::MIFlag Flag =
+      isSub ? MachineInstr::FrameSetup : MachineInstr::FrameDestroy;
 
   uint64_t Chunk = (1LL << 31) - 1;
   DebugLoc DL = MBB.findDebugLoc(MBBI);
 
-  while (Offset) {
-    if (Offset > Chunk) {
-      // Rather than emit a long series of instructions for large offsets,
-      // load the offset into a register and do one sub/add
-      unsigned Reg = 0;
+  if (Offset > Chunk) {
+    // Rather than emit a long series of instructions for large offsets,
+    // load the offset into a register and do one sub/add
+    unsigned Reg = 0;
+    unsigned Rax = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
 
-      if (isSub && !isEAXLiveIn(MBB))
-        Reg = (unsigned)(Is64Bit ? X86::RAX : X86::EAX);
+    if (isSub && !isEAXLiveIn(MBB))
+      Reg = Rax;
+    else
+      Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
+
+    unsigned MovRIOpc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
+    unsigned AddSubRROpc =
+        isSub ? getSUBrrOpcode(Is64Bit) : getADDrrOpcode(Is64Bit);
+    if (Reg) {
+      BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Reg)
+          .addImm(Offset)
+          .setMIFlag(Flag);
+      MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(AddSubRROpc), StackPtr)
+                             .addReg(StackPtr)
+                             .addReg(Reg);
+      MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+      return;
+    } else if (Offset > 8 * Chunk) {
+      // If we would need more than 8 add or sub instructions (a >16GB stack
+      // frame), it's worth spilling RAX to materialize this immediate.
+      //   pushq %rax
+      //   movabsq +-$Offset+-SlotSize, %rax
+      //   addq %rsp, %rax
+      //   xchg %rax, (%rsp)
+      //   movq (%rsp), %rsp
+      assert(Is64Bit && "can't have 32-bit 16GB stack frame");
+      BuildMI(MBB, MBBI, DL, TII.get(X86::PUSH64r))
+          .addReg(Rax, RegState::Kill)
+          .setMIFlag(Flag);
+      // Subtract is not commutative, so negate the offset and always use add.
+      // Subtract 8 less and add 8 more to account for the PUSH we just did.
+      if (isSub)
+        Offset = -(Offset - SlotSize);
       else
-        Reg = findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
-
-      if (Reg) {
-        unsigned Opc = Is64Bit ? X86::MOV64ri : X86::MOV32ri;
-        BuildMI(MBB, MBBI, DL, TII.get(Opc), Reg)
-          .addImm(Offset);
-        Opc = isSub
-          ? getSUBrrOpcode(Is64Bit)
-          : getADDrrOpcode(Is64Bit);
-        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
-          .addReg(StackPtr)
-          .addReg(Reg);
-        MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
-        Offset = 0;
-        continue;
-      }
+        Offset = Offset + SlotSize;
+      BuildMI(MBB, MBBI, DL, TII.get(MovRIOpc), Rax)
+          .addImm(Offset)
+          .setMIFlag(Flag);
+      MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(X86::ADD64rr), Rax)
+                             .addReg(Rax)
+                             .addReg(StackPtr);
+      MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+      // Exchange the new SP in RAX with the top of the stack.
+      addRegOffset(
+          BuildMI(MBB, MBBI, DL, TII.get(X86::XCHG64rm), Rax).addReg(Rax),
+          StackPtr, false, 0);
+      // Load new SP from the top of the stack into RSP.
+      addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), StackPtr),
+                   StackPtr, false, 0);
+      return;
     }
+  }
 
+  while (Offset) {
     uint64_t ThisVal = std::min(Offset, Chunk);
-    if (ThisVal == (Is64Bit ? 8 : 4)) {
-      // Use push / pop instead.
+    if (ThisVal == SlotSize) {
+      // Use push / pop for slot sized adjustments as a size optimization. We
+      // need to find a dead register when using pop.
       unsigned Reg = isSub
         ? (unsigned)(Is64Bit ? X86::RAX : X86::EAX)
         : findDeadCallerSavedReg(MBB, MBBI, TRI, Is64Bit);
@@ -293,23 +329,16 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
         unsigned Opc = isSub
           ? (Is64Bit ? X86::PUSH64r : X86::PUSH32r)
           : (Is64Bit ? X86::POP64r  : X86::POP32r);
-        MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc))
-          .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub));
-        if (isSub)
-          MI->setFlag(MachineInstr::FrameSetup);
-        else
-          MI->setFlag(MachineInstr::FrameDestroy);
+        BuildMI(MBB, MBBI, DL, TII.get(Opc))
+            .addReg(Reg, getDefRegState(!isSub) | getUndefRegState(isSub))
+            .setMIFlag(Flag);
         Offset -= ThisVal;
         continue;
       }
     }
 
-    MachineInstrBuilder MI = BuildStackAdjustment(
-        MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue);
-    if (isSub)
-      MI.setMIFlag(MachineInstr::FrameSetup);
-    else
-      MI.setMIFlag(MachineInstr::FrameDestroy);
+    BuildStackAdjustment(MBB, MBBI, DL, isSub ? -ThisVal : ThisVal, InEpilogue)
+        .setMIFlag(Flag);
 
     Offset -= ThisVal;
   }
@@ -959,6 +988,16 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
         .getValueAsString()
         .getAsInteger(0, StackProbeSize);
 
+  // Re-align the stack on 64-bit if the x86-interrupt calling convention is
+  // used and an error code was pushed, since the x86-64 ABI requires a 16-byte
+  // stack alignment.
+  if (Fn->getCallingConv() == CallingConv::X86_INTR && Is64Bit &&
+      Fn->arg_size() == 2) {
+    StackSize += 8;
+    MFI.setStackSize(StackSize);
+    emitSPUpdate(MBB, MBBI, -8, /*InEpilogue=*/false);
+  }
+
   // If this is x86-64 and the Red Zone is not disabled, if we are a leaf
   // function, and use up to 128 bytes of stack space, don't have a frame
   // pointer, calls, or dynamic alloca then we do not need to adjust the
diff --git a/lib/Target/X86/X86GenRegisterBankInfo.def b/lib/Target/X86/X86GenRegisterBankInfo.def
new file mode 100644
index 0000000000000000000000000000000000000000..06be142432f72bccfa6f4c81838c636c33ff8619
--- /dev/null
+++ b/lib/Target/X86/X86GenRegisterBankInfo.def
@@ -0,0 +1,104 @@
+//===- X86GenRegisterBankInfo.def ----------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file defines all the static objects used by X86RegisterBankInfo.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+#ifdef GET_TARGET_REGBANK_INFO_IMPL
+RegisterBankInfo::PartialMapping X86GenRegisterBankInfo::PartMappings[]{
+    /* StartIdx, Length, RegBank */
+    // GPR value
+    {0, 8, X86::GPRRegBank},   // :0
+    {0, 16, X86::GPRRegBank},  // :1
+    {0, 32, X86::GPRRegBank},  // :2
+    {0, 64, X86::GPRRegBank},  // :3
+    // FR32/64, xmm registers
+    {0, 32, X86::VECRRegBank},  // :4
+    {0, 64, X86::VECRRegBank},  // :5
+    // VR128/256/512
+    {0, 128, X86::VECRRegBank}, // :6
+    {0, 256, X86::VECRRegBank}, // :7
+    {0, 512, X86::VECRRegBank}, // :8   
+};
+#endif // GET_TARGET_REGBANK_INFO_IMPL
+
+#ifdef GET_TARGET_REGBANK_INFO_CLASS
+enum PartialMappingIdx {
+  PMI_None = -1,
+  PMI_GPR8,
+  PMI_GPR16,
+  PMI_GPR32,
+  PMI_GPR64,
+  PMI_FP32,
+  PMI_FP64,
+  PMI_VEC128,
+  PMI_VEC256,
+  PMI_VEC512
+};
+#endif // GET_TARGET_REGBANK_INFO_CLASS
+
+#ifdef GET_TARGET_REGBANK_INFO_IMPL
+#define INSTR_3OP(INFO) INFO, INFO, INFO,
+#define BREAKDOWN(INDEX, NUM)                                                  \
+  { &X86GenRegisterBankInfo::PartMappings[INDEX], NUM }
+// ValueMappings.
+RegisterBankInfo::ValueMapping X86GenRegisterBankInfo::ValMappings[]{
+    /* BreakDown, NumBreakDowns */
+    // 3-operand instructions (all binary operations should end up with one of
+    // these mappings).
+    INSTR_3OP(BREAKDOWN(PMI_GPR8, 1))  // 0: GPR_8
+    INSTR_3OP(BREAKDOWN(PMI_GPR16, 1)) // 3: GPR_16
+    INSTR_3OP(BREAKDOWN(PMI_GPR32, 1)) // 6: GPR_32
+    INSTR_3OP(BREAKDOWN(PMI_GPR64, 1)) // 9: GPR_64    
+    INSTR_3OP(BREAKDOWN(PMI_FP32, 1))   // 12: Fp32
+    INSTR_3OP(BREAKDOWN(PMI_FP64, 1))   // 15: Fp64
+    INSTR_3OP(BREAKDOWN(PMI_VEC128, 1)) // 18: Vec128
+    INSTR_3OP(BREAKDOWN(PMI_VEC256, 1)) // 21: Vec256
+    INSTR_3OP(BREAKDOWN(PMI_VEC512, 1)) // 24: Vec512    
+};
+#undef INSTR_3OP
+#undef BREAKDOWN
+#endif // GET_TARGET_REGBANK_INFO_IMPL
+
+#ifdef GET_TARGET_REGBANK_INFO_CLASS
+enum ValueMappingIdx {
+  VMI_None = -1,
+  VMI_3OpsGpr8Idx =  PMI_GPR8  * 3,
+  VMI_3OpsGpr16Idx = PMI_GPR16 * 3,
+  VMI_3OpsGpr32Idx = PMI_GPR32 * 3,
+  VMI_3OpsGpr64Idx = PMI_GPR64 * 3,  
+  VMI_3OpsFp32Idx = PMI_FP32 * 3,
+  VMI_3OpsFp64Idx = PMI_FP64 * 3,
+  VMI_3OpsVec128Idx = PMI_VEC128 * 3,
+  VMI_3OpsVec256Idx = PMI_VEC256 * 3,
+  VMI_3OpsVec512Idx = PMI_VEC512 * 3,
+};
+#undef GET_TARGET_REGBANK_INFO_CLASS
+#endif // GET_TARGET_REGBANK_INFO_CLASS
+
+#ifdef GET_TARGET_REGBANK_INFO_IMPL
+#undef GET_TARGET_REGBANK_INFO_IMPL
+/// Return the first of the three ValMappings entries reserved for \p Idx.
+const RegisterBankInfo::ValueMapping *
+X86GenRegisterBankInfo::getValueMapping(PartialMappingIdx Idx,
+                                        unsigned NumOperands) {
+  // The 3-operand table serves every request with at most three operands.
+  bool IdxInRange = Idx >= PMI_GPR8 && Idx <= PMI_VEC512;
+  if (NumOperands > 3 || !IdxInRange)
+    llvm_unreachable("Unsupported PartialMappingIdx.");
+  return &ValMappings[(unsigned)Idx * 3];
+}
+
+#endif // GET_TARGET_REGBANK_INFO_IMPL
+
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 8ab4c0616880c6ebf3bb43d0b96dac5505640ef6..eb5c56ff2ff9113c1a31ecab4f54d2382a409036 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -188,7 +188,6 @@ namespace {
 
   private:
     void Select(SDNode *N) override;
-    bool tryGather(SDNode *N, unsigned Opc);
 
     bool foldOffsetIntoAddress(uint64_t Offset, X86ISelAddressMode &AM);
     bool matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM);
@@ -384,6 +383,16 @@ namespace {
     bool ComplexPatternFuncMutatesDAG() const override {
       return true;
     }
+
+    bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
+
+    /// Returns whether N is a constant that fits a signed Width-bit integer,
+    /// or an absolute symbol reference accepted by isSExtAbsoluteSymbolRef.
+    template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
+      if (auto *CN = dyn_cast<ConstantSDNode>(N))
+        return isInt<Width>(CN->getSExtValue());
+      return isSExtAbsoluteSymbolRef(Width, N);
+    }
   };
 }
 
@@ -709,7 +718,8 @@ bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
   // For more information see http://people.redhat.com/drepper/tls.pdf
   if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Address))
     if (C->getSExtValue() == 0 && AM.Segment.getNode() == nullptr &&
-        Subtarget->isTargetGlibc())
+        (Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
+         Subtarget->isTargetFuchsia()))
       switch (N->getPointerInfo().getAddrSpace()) {
       case 256:
         AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
@@ -1325,8 +1335,8 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
     AM.Scale = 1;
 
     // Insert the new nodes into the topological ordering.
-    insertDAGNode(*CurDAG, N, Zero);
-    insertDAGNode(*CurDAG, N, Neg);
+    insertDAGNode(*CurDAG, Handle.getValue(), Zero);
+    insertDAGNode(*CurDAG, Handle.getValue(), Neg);
     return false;
   }
 
@@ -1789,6 +1799,21 @@ SDNode *X86DAGToDAGISel::getGlobalBaseReg() {
   return CurDAG->getRegister(GlobalBaseReg, TLI->getPointerTy(DL)).getNode();
 }
 
+bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
+  if (N->getOpcode() == ISD::TRUNCATE) // Look through a truncate, if any.
+    N = N->getOperand(0).getNode();
+  if (N->getOpcode() != X86ISD::Wrapper)
+    return false;
+  // Only a wrapped global address can supply an absolute symbol range.
+  auto *GA = dyn_cast<GlobalAddressSDNode>(N->getOperand(0));
+  if (!GA)
+    return false;
+  // True only if the global has such a range and it fits the Width bounds.
+  Optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
+  return CR && CR->getSignedMin().sge(-1ull << Width) &&
+         CR->getSignedMax().slt(1ull << Width);
+}
+
 /// Test whether the given X86ISD::CMP node has any uses which require the SF
 /// or OF bits to be accurate.
 static bool hasNoSignedComparisonUses(SDNode *N) {
@@ -1905,6 +1930,8 @@ static bool isLoadIncOrDecStore(StoreSDNode *StoreNode, unsigned Opc,
       SDValue Op = Chain.getOperand(i);
       if (Op == Load.getValue(1)) {
         ChainCheck = true;
+        // Drop Load, but keep its chain. No cycle check necessary.
+        ChainOps.push_back(Load.getOperand(0));
         continue;
       }
 
@@ -1954,39 +1981,6 @@ static unsigned getFusedLdStOpcode(EVT &LdVT, unsigned Opc) {
   llvm_unreachable("unrecognized size for LdVT");
 }
 
-/// Customized ISel for GATHER operations.
-bool X86DAGToDAGISel::tryGather(SDNode *Node, unsigned Opc) {
-  // Operands of Gather: VSrc, Base, VIdx, VMask, Scale
-  SDValue Chain = Node->getOperand(0);
-  SDValue VSrc = Node->getOperand(2);
-  SDValue Base = Node->getOperand(3);
-  SDValue VIdx = Node->getOperand(4);
-  SDValue VMask = Node->getOperand(5);
-  ConstantSDNode *Scale = dyn_cast<ConstantSDNode>(Node->getOperand(6));
-  if (!Scale)
-    return false;
-
-  SDVTList VTs = CurDAG->getVTList(VSrc.getValueType(), VSrc.getValueType(),
-                                   MVT::Other);
-
-  SDLoc DL(Node);
-
-  // Memory Operands: Base, Scale, Index, Disp, Segment
-  SDValue Disp = CurDAG->getTargetConstant(0, DL, MVT::i32);
-  SDValue Segment = CurDAG->getRegister(0, MVT::i32);
-  const SDValue Ops[] = { VSrc, Base, getI8Imm(Scale->getSExtValue(), DL), VIdx,
-                          Disp, Segment, VMask, Chain};
-  SDNode *ResNode = CurDAG->getMachineNode(Opc, DL, VTs, Ops);
-  // Node has 2 outputs: VDst and MVT::Other.
-  // ResNode has 3 outputs: VDst, VMask_wb, and MVT::Other.
-  // We replace VDst of Node with VDst of ResNode, and Other of Node with Other
-  // of ResNode.
-  ReplaceUses(SDValue(Node, 0), SDValue(ResNode, 0));
-  ReplaceUses(SDValue(Node, 1), SDValue(ResNode, 2));
-  CurDAG->RemoveDeadNode(Node);
-  return true;
-}
-
 void X86DAGToDAGISel::Select(SDNode *Node) {
   MVT NVT = Node->getSimpleValueType(0);
   unsigned Opc, MOpc;
@@ -2024,55 +2018,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
     }
     break;
   }
-  case ISD::INTRINSIC_W_CHAIN: {
-    unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
-    switch (IntNo) {
-    default: break;
-    case Intrinsic::x86_avx2_gather_d_pd:
-    case Intrinsic::x86_avx2_gather_d_pd_256:
-    case Intrinsic::x86_avx2_gather_q_pd:
-    case Intrinsic::x86_avx2_gather_q_pd_256:
-    case Intrinsic::x86_avx2_gather_d_ps:
-    case Intrinsic::x86_avx2_gather_d_ps_256:
-    case Intrinsic::x86_avx2_gather_q_ps:
-    case Intrinsic::x86_avx2_gather_q_ps_256:
-    case Intrinsic::x86_avx2_gather_d_q:
-    case Intrinsic::x86_avx2_gather_d_q_256:
-    case Intrinsic::x86_avx2_gather_q_q:
-    case Intrinsic::x86_avx2_gather_q_q_256:
-    case Intrinsic::x86_avx2_gather_d_d:
-    case Intrinsic::x86_avx2_gather_d_d_256:
-    case Intrinsic::x86_avx2_gather_q_d:
-    case Intrinsic::x86_avx2_gather_q_d_256: {
-      if (!Subtarget->hasAVX2())
-        break;
-      unsigned Opc;
-      switch (IntNo) {
-      default: llvm_unreachable("Impossible intrinsic");
-      case Intrinsic::x86_avx2_gather_d_pd:     Opc = X86::VGATHERDPDrm;  break;
-      case Intrinsic::x86_avx2_gather_d_pd_256: Opc = X86::VGATHERDPDYrm; break;
-      case Intrinsic::x86_avx2_gather_q_pd:     Opc = X86::VGATHERQPDrm;  break;
-      case Intrinsic::x86_avx2_gather_q_pd_256: Opc = X86::VGATHERQPDYrm; break;
-      case Intrinsic::x86_avx2_gather_d_ps:     Opc = X86::VGATHERDPSrm;  break;
-      case Intrinsic::x86_avx2_gather_d_ps_256: Opc = X86::VGATHERDPSYrm; break;
-      case Intrinsic::x86_avx2_gather_q_ps:     Opc = X86::VGATHERQPSrm;  break;
-      case Intrinsic::x86_avx2_gather_q_ps_256: Opc = X86::VGATHERQPSYrm; break;
-      case Intrinsic::x86_avx2_gather_d_q:      Opc = X86::VPGATHERDQrm;  break;
-      case Intrinsic::x86_avx2_gather_d_q_256:  Opc = X86::VPGATHERDQYrm; break;
-      case Intrinsic::x86_avx2_gather_q_q:      Opc = X86::VPGATHERQQrm;  break;
-      case Intrinsic::x86_avx2_gather_q_q_256:  Opc = X86::VPGATHERQQYrm; break;
-      case Intrinsic::x86_avx2_gather_d_d:      Opc = X86::VPGATHERDDrm;  break;
-      case Intrinsic::x86_avx2_gather_d_d_256:  Opc = X86::VPGATHERDDYrm; break;
-      case Intrinsic::x86_avx2_gather_q_d:      Opc = X86::VPGATHERQDrm;  break;
-      case Intrinsic::x86_avx2_gather_q_d_256:  Opc = X86::VPGATHERQDYrm; break;
-      }
-      if (tryGather(Node, Opc))
-        return;
-      break;
-    }
-    }
-    break;
-  }
   case X86ISD::GlobalBaseReg:
     ReplaceNode(Node, getGlobalBaseReg());
     return;
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9dec066e8eb44a258647e0403cb89770cb8197b5..5c8a95963c3bef8cf13bd40b446b50c6d3a28cb6 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -53,6 +53,7 @@
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetLowering.h"
 #include "llvm/Target/TargetOptions.h"
 #include <algorithm>
 #include <bitset>
@@ -70,6 +71,13 @@ static cl::opt<bool> ExperimentalVectorWideningLegalization(
              "rather than promotion."),
     cl::Hidden);
 
+static cl::opt<int> ExperimentalPrefLoopAlignment(
+    "x86-experimental-pref-loop-alignment", cl::init(4),
+    cl::desc("Sets the preferable loop alignment for experiments "
+             "(the last x86-experimental-pref-loop-alignment bits"
+             " of the loop header PC will be 0)."),
+    cl::Hidden);
+
 X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
                                      const X86Subtarget &STI)
     : TargetLowering(TM), Subtarget(STI) {
@@ -427,7 +435,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::ExternalSymbol  , VT, Custom);
     setOperationAction(ISD::BlockAddress    , VT, Custom);
   }
-  // 64-bit addm sub, shl, sra, srl (iff 32-bit x86)
+
+  // 64-bit shl, sra, srl (iff 32-bit x86)
   for (auto VT : { MVT::i32, MVT::i64 }) {
     if (VT == MVT::i64 && !Subtarget.is64Bit())
       continue;
@@ -889,6 +898,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   }
 
   if (!Subtarget.useSoftFloat() && Subtarget.hasSSSE3()) {
+    setOperationAction(ISD::ABS,                MVT::v16i8, Legal);
+    setOperationAction(ISD::ABS,                MVT::v8i16, Legal);
+    setOperationAction(ISD::ABS,                MVT::v4i32, Legal);
     setOperationAction(ISD::BITREVERSE,         MVT::v16i8, Custom);
     setOperationAction(ISD::CTLZ,               MVT::v16i8, Custom);
     setOperationAction(ISD::CTLZ,               MVT::v8i16, Custom);
@@ -923,6 +935,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
 
     // SSE41 brings specific instructions for doing vector sign extend even in
     // cases where we don't have SRA.
+    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v2i64, Legal);
+    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v4i32, Legal);
+    setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v8i16, Legal);
+
+    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v2i64, Legal);
+    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v4i32, Legal);
+    setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v8i16, Legal);
+
     for (MVT VT : MVT::integer_vector_valuetypes()) {
       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i8, Custom);
       setLoadExtAction(ISD::SEXTLOAD, VT, MVT::v2i16, Custom);
@@ -1066,6 +1086,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::MULHS,     MVT::v32i8,  Custom);
 
     for (auto VT : { MVT::v32i8, MVT::v16i16, MVT::v8i32 }) {
+      setOperationAction(ISD::ABS,  VT, HasInt256 ? Legal : Custom);
       setOperationAction(ISD::SMAX, VT, HasInt256 ? Legal : Custom);
       setOperationAction(ISD::UMAX, VT, HasInt256 ? Legal : Custom);
       setOperationAction(ISD::SMIN, VT, HasInt256 ? Legal : Custom);
@@ -1127,7 +1148,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::INSERT_VECTOR_ELT,  VT, Custom);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
       setOperationAction(ISD::SCALAR_TO_VECTOR,   VT, Custom);
-      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR,   VT, Legal);
       setOperationAction(ISD::CONCAT_VECTORS,     VT, Custom);
     }
 
@@ -1272,6 +1293,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       }
     }
     if (Subtarget.hasVLX()) {
+      setOperationAction(ISD::ABS,              MVT::v4i64, Legal);
+      setOperationAction(ISD::ABS,              MVT::v2i64, Legal);
       setOperationAction(ISD::SINT_TO_FP,       MVT::v8i32, Legal);
       setOperationAction(ISD::UINT_TO_FP,       MVT::v8i32, Legal);
       setOperationAction(ISD::FP_TO_SINT,       MVT::v8i32, Legal);
@@ -1368,6 +1391,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::MUL,                MVT::v16i32, Legal);
 
     for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
+      setOperationAction(ISD::ABS, VT, Legal);
       setOperationAction(ISD::SRL, VT, Custom);
       setOperationAction(ISD::SHL, VT, Custom);
       setOperationAction(ISD::SRA, VT, Custom);
@@ -1442,7 +1466,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
       setOperationAction(ISD::VSELECT,             VT, Legal);
       setOperationAction(ISD::EXTRACT_VECTOR_ELT,  VT, Custom);
       setOperationAction(ISD::SCALAR_TO_VECTOR,    VT, Custom);
-      setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Custom);
+      setOperationAction(ISD::INSERT_SUBVECTOR,    VT, Legal);
       setOperationAction(ISD::MLOAD,               VT, Legal);
       setOperationAction(ISD::MSTORE,              VT, Legal);
       setOperationAction(ISD::MGATHER,             VT, Legal);
@@ -1480,8 +1504,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     setOperationAction(ISD::CONCAT_VECTORS,     MVT::v64i8, Custom);
     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i1, Custom);
     setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i1, Custom);
-    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i16, Custom);
-    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i8, Custom);
+    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v32i16, Legal);
+    setOperationAction(ISD::INSERT_SUBVECTOR,   MVT::v64i8, Legal);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
     setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i1,  Custom);
@@ -1547,6 +1571,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
     for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
       setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
       setOperationAction(ISD::VSELECT,      VT, Legal);
+      setOperationAction(ISD::ABS,          VT, Legal);
       setOperationAction(ISD::SRL,          VT, Custom);
       setOperationAction(ISD::SHL,          VT, Custom);
       setOperationAction(ISD::SRA,          VT, Custom);
@@ -1672,6 +1697,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   // We have target-specific dag combine patterns for the following nodes:
   setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
   setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
+  setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
   setTargetDAGCombine(ISD::BITCAST);
   setTargetDAGCombine(ISD::VSELECT);
   setTargetDAGCombine(ISD::SELECT);
@@ -1697,6 +1723,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   setTargetDAGCombine(ISD::ANY_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND);
   setTargetDAGCombine(ISD::SIGN_EXTEND_INREG);
+  setTargetDAGCombine(ISD::SIGN_EXTEND_VECTOR_INREG);
+  setTargetDAGCombine(ISD::ZERO_EXTEND_VECTOR_INREG);
   setTargetDAGCombine(ISD::SINT_TO_FP);
   setTargetDAGCombine(ISD::UINT_TO_FP);
   setTargetDAGCombine(ISD::SETCC);
@@ -1713,7 +1741,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
   MaxStoresPerMemcpyOptSize = 4;
   MaxStoresPerMemmove = 8; // For @llvm.memmove -> sequence of stores
   MaxStoresPerMemmoveOptSize = 4;
-  setPrefLoopAlignment(4); // 2^4 bytes.
+  // Set loop alignment to 2^ExperimentalPrefLoopAlignment bytes (default: 2^4).
+  setPrefLoopAlignment(ExperimentalPrefLoopAlignment);
 
   // An out-of-order CPU can speculatively execute past a predictable branch,
   // but a conditional move could be stalled by an expensive earlier operation.
@@ -1934,6 +1963,34 @@ bool X86TargetLowering::useSoftFloat() const {
   return Subtarget.useSoftFloat();
 }
 
+/// Flag leading integer/pointer libcall arguments as inreg on 32-bit x86.
+void X86TargetLowering::markLibCallAttributes(MachineFunction *MF, unsigned CC,
+                                              ArgListTy &Args) const {
+  // Nothing to relabel on x86-64 or for CCs other than C / stdcall.
+  bool IsCOrStdCall = CC == CallingConv::C || CC == CallingConv::X86_StdCall;
+  if (Subtarget.is64Bit() || !IsCOrStdCall)
+    return;
+
+  // The register budget comes from the module and is zero without one.
+  auto *M = MF->getFunction()->getParent();
+  unsigned RegsLeft = M ? M->getNumberRegisterParameters() : 0;
+  const auto &DL = MF->getDataLayout();
+  for (auto &Arg : Args) {
+    Type *Ty = Arg.Ty;
+    if (!Ty->isPointerTy() && !Ty->isIntegerTy())
+      continue;
+    auto AllocSize = DL.getTypeAllocSize(Ty);
+    if (AllocSize > 8)
+      continue; // Too wide for a register pair; keep scanning.
+    // Anything wider than 4 bytes takes two registers.
+    unsigned Needed = AllocSize > 4 ? 2 : 1;
+    if (RegsLeft < Needed)
+      return; // Budget exhausted: stop marking altogether.
+    RegsLeft -= Needed;
+    Arg.IsInReg = true;
+  }
+}
+
 const MCExpr *
 X86TargetLowering::LowerCustomJumpTableEntry(const MachineJumpTableInfo *MJTI,
                                              const MachineBasicBlock *MBB,
@@ -2002,21 +2059,37 @@ unsigned X86TargetLowering::getAddressSpace() const {
   return 256;
 }
 
-Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
-  // glibc has a special slot for the stack guard in tcbhead_t, use it instead
-  // of the usual global variable (see sysdeps/{i386,x86_64}/nptl/tls.h)
-  if (!Subtarget.isTargetGlibc())
-    return TargetLowering::getIRStackGuard(IRB);
-
-  // %fs:0x28, unless we're using a Kernel code model, in which case it's %gs:
-  // %gs:0x14 on i386
-  unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
-  unsigned AddressSpace = getAddressSpace();
+static bool hasStackGuardSlotTLS(const Triple &TargetTriple) {
+  return TargetTriple.isOSGlibc() || TargetTriple.isOSFuchsia() ||
+         (TargetTriple.isAndroid() && !TargetTriple.isAndroidVersionLT(17));
+}
+
+static Constant* SegmentOffset(IRBuilder<> &IRB,
+                               unsigned Offset, unsigned AddressSpace) {
   return ConstantExpr::getIntToPtr(
       ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
       Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
 }
 
+Value *X86TargetLowering::getIRStackGuard(IRBuilder<> &IRB) const {
+  // glibc, bionic, and Fuchsia have a special slot for the stack guard in
+  // tcbhead_t; use it instead of the usual global variable (see
+  // sysdeps/{i386,x86_64}/nptl/tls.h)
+  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple())) {
+    if (Subtarget.isTargetFuchsia()) {
+      // <magenta/tls.h> defines MX_TLS_STACK_GUARD_OFFSET with this value.
+      return SegmentOffset(IRB, 0x10, getAddressSpace());
+    } else {
+      // %fs:0x28, unless we're using a Kernel code model, in which case
+      // it's %gs:0x28.  gs:0x14 on i386.
+      unsigned Offset = (Subtarget.is64Bit()) ? 0x28 : 0x14;
+      return SegmentOffset(IRB, Offset, getAddressSpace());
+    }
+  }
+
+  return TargetLowering::getIRStackGuard(IRB);
+}
+
 void X86TargetLowering::insertSSPDeclarations(Module &M) const {
   // MSVC CRT provides functionalities for stack protection.
   if (Subtarget.getTargetTriple().isOSMSVCRT()) {
@@ -2028,13 +2101,13 @@ void X86TargetLowering::insertSSPDeclarations(Module &M) const {
     auto *SecurityCheckCookie = cast<Function>(
         M.getOrInsertFunction("__security_check_cookie",
                               Type::getVoidTy(M.getContext()),
-                              Type::getInt8PtrTy(M.getContext()), nullptr));
+                              Type::getInt8PtrTy(M.getContext())));
     SecurityCheckCookie->setCallingConv(CallingConv::X86_FastCall);
     SecurityCheckCookie->addAttribute(1, Attribute::AttrKind::InReg);
     return;
   }
-  // glibc has a special slot for the stack guard.
-  if (Subtarget.isTargetGlibc())
+  // glibc, bionic, and Fuchsia have a special slot for the stack guard.
+  if (hasStackGuardSlotTLS(Subtarget.getTargetTriple()))
     return;
   TargetLowering::insertSSPDeclarations(M);
 }
@@ -2057,21 +2130,23 @@ Value *X86TargetLowering::getSafeStackPointerLocation(IRBuilder<> &IRB) const {
   if (Subtarget.getTargetTriple().isOSContiki())
     return getDefaultSafeStackPointerLocation(IRB, false);
 
-  if (!Subtarget.isTargetAndroid())
-    return TargetLowering::getSafeStackPointerLocation(IRB);
-
   // Android provides a fixed TLS slot for the SafeStack pointer. See the
   // definition of TLS_SLOT_SAFESTACK in
   // https://android.googlesource.com/platform/bionic/+/master/libc/private/bionic_tls.h
-  unsigned AddressSpace, Offset;
+  if (Subtarget.isTargetAndroid()) {
+    // %fs:0x48, unless we're using a Kernel code model, in which case it's
+    // %gs:0x48.  %gs:0x24 on i386.
+    unsigned Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
+    return SegmentOffset(IRB, Offset, getAddressSpace());
+  }
 
-  // %fs:0x48, unless we're using a Kernel code model, in which case it's %gs:
-  // %gs:0x24 on i386
-  Offset = (Subtarget.is64Bit()) ? 0x48 : 0x24;
-  AddressSpace = getAddressSpace();
-  return ConstantExpr::getIntToPtr(
-      ConstantInt::get(Type::getInt32Ty(IRB.getContext()), Offset),
-      Type::getInt8PtrTy(IRB.getContext())->getPointerTo(AddressSpace));
+  // Fuchsia is similar.
+  if (Subtarget.isTargetFuchsia()) {
+    // <magenta/tls.h> defines MX_TLS_UNSAFE_SP_OFFSET with this value.
+    return SegmentOffset(IRB, 0x18, getAddressSpace());
+  }
+
+  return TargetLowering::getSafeStackPointerLocation(IRB);
 }
 
 bool X86TargetLowering::isNoopAddrSpaceCast(unsigned SrcAS,
@@ -2180,6 +2255,11 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
        ++I, ++OutsIndex) {
     CCValAssign &VA = RVLocs[I];
     assert(VA.isRegLoc() && "Can only return in registers!");
+
+    // Add the register to the CalleeSaveDisableRegs list.
+    if (CallConv == CallingConv::X86_RegCall)
+      MF.getRegInfo().disableCalleeSavedRegister(VA.getLocReg());
+
     SDValue ValToCopy = OutVals[OutsIndex];
     EVT ValVT = ValToCopy.getValueType();
 
@@ -2254,6 +2334,10 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
 
       assert(2 == RegsToPass.size() &&
              "Expecting two registers after Pass64BitArgInRegs");
+
+      // Add the second register to the CalleeSaveDisableRegs list.
+      if (CallConv == CallingConv::X86_RegCall)
+        MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
     } else {
       RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
     }
@@ -2310,6 +2394,10 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
     // RAX/EAX now acts like a return value.
     RetOps.push_back(
         DAG.getRegister(RetValReg, getPointerTy(DAG.getDataLayout())));
+
+    // Add the returned register to the CalleeSaveDisableRegs list.
+    if (CallConv == CallingConv::X86_RegCall)
+      MF.getRegInfo().disableCalleeSavedRegister(RetValReg);
   }
 
   const X86RegisterInfo *TRI = Subtarget.getRegisterInfo();
@@ -2445,7 +2533,7 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
   // Convert the i32 type into v32i1 type
   Hi = DAG.getBitcast(MVT::v32i1, ArgValueHi);
 
-  // Concantenate the two values together
+  // Concatenate the two values together
   return DAG.getNode(ISD::CONCAT_VECTORS, Dl, MVT::v64i1, Lo, Hi);
 }
 
@@ -2489,8 +2577,10 @@ static SDValue lowerRegToMasks(const SDValue &ValArg, const EVT &ValVT,
 SDValue X86TargetLowering::LowerCallResult(
     SDValue Chain, SDValue InFlag, CallingConv::ID CallConv, bool isVarArg,
     const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
-    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
+    SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
+    uint32_t *RegMask) const {
 
+  const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
   // Assign locations to each value returned by this call.
   SmallVector<CCValAssign, 16> RVLocs;
   bool Is64Bit = Subtarget.is64Bit();
@@ -2504,6 +2594,14 @@ SDValue X86TargetLowering::LowerCallResult(
     CCValAssign &VA = RVLocs[I];
     EVT CopyVT = VA.getLocVT();
 
+    // In some calling conventions we need to remove the used registers
+    // from the register mask.
+    if (RegMask && CallConv == CallingConv::X86_RegCall) {
+      for (MCSubRegIterator SubRegs(VA.getLocReg(), TRI, /*IncludeSelf=*/true);
+           SubRegs.isValid(); ++SubRegs)
+        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
+    }
+
     // If this is x86-64, and we disabled SSE, we can't return FP values
     if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) &&
         ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) {
@@ -2670,6 +2768,7 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
       CallConv, DAG.getTarget().Options.GuaranteedTailCallOpt);
   bool isImmutable = !AlwaysUseMutable && !Flags.isByVal();
   EVT ValVT;
+  MVT PtrVT = getPointerTy(DAG.getDataLayout());
 
   // If value is passed by pointer we have address passed instead of the value
   // itself. No need to extend if the mask value and location share the same
@@ -2687,13 +2786,16 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
   // taken by a return address.
   int Offset = 0;
   if (CallConv == CallingConv::X86_INTR) {
-    const X86Subtarget& Subtarget =
-        static_cast<const X86Subtarget&>(DAG.getSubtarget());
     // X86 interrupts may take one or two arguments.
     // On the stack there will be no return address as in regular call.
     // Offset of last argument need to be set to -4/-8 bytes.
     // Where offset of the first argument out of two, should be set to 0 bytes.
     Offset = (Subtarget.is64Bit() ? 8 : 4) * ((i + 1) % Ins.size() - 1);
+    if (Subtarget.is64Bit() && Ins.size() == 2) {
+      // The stack pointer needs to be realigned for 64 bit handlers with error
+      // code, so the argument offset changes by 8 bytes.
+      Offset += 8;
+    }
   }
 
   // FIXME: For now, all byval parameter objects are marked mutable. This can be
@@ -2708,30 +2810,71 @@ X86TargetLowering::LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
     if (CallConv == CallingConv::X86_INTR) {
       MFI.setObjectOffset(FI, Offset);
     }
-    return DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
-  } else {
-    int FI = MFI.CreateFixedObject(ValVT.getSizeInBits()/8,
-                                   VA.getLocMemOffset(), isImmutable);
-
-    // Set SExt or ZExt flag.
-    if (VA.getLocInfo() == CCValAssign::ZExt) {
-      MFI.setObjectZExt(FI, true);
-    } else if (VA.getLocInfo() == CCValAssign::SExt) {
-      MFI.setObjectSExt(FI, true);
+    return DAG.getFrameIndex(FI, PtrVT);
+  }
+
+  // This is an argument in memory. We might be able to perform copy elision.
+  if (Flags.isCopyElisionCandidate()) {
+    EVT ArgVT = Ins[i].ArgVT;
+    SDValue PartAddr;
+    if (Ins[i].PartOffset == 0) {
+      // If this is a one-part value or the first part of a multi-part value,
+      // create a stack object for the entire argument value type and return a
+      // load from our portion of it. This assumes that if the first part of an
+      // argument is in memory, the rest will also be in memory.
+      int FI = MFI.CreateFixedObject(ArgVT.getStoreSize(), VA.getLocMemOffset(),
+                                     /*Immutable=*/false);
+      PartAddr = DAG.getFrameIndex(FI, PtrVT);
+      return DAG.getLoad(
+          ValVT, dl, Chain, PartAddr,
+          MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+    } else {
+      // This is not the first piece of an argument in memory. See if there is
+      // already a fixed stack object including this offset. If so, assume it
+      // was created by the PartOffset == 0 branch above and create a load from
+      // the appropriate offset into it.
+      int64_t PartBegin = VA.getLocMemOffset();
+      int64_t PartEnd = PartBegin + ValVT.getSizeInBits() / 8;
+      int FI = MFI.getObjectIndexBegin();
+      for (; MFI.isFixedObjectIndex(FI); ++FI) {
+        int64_t ObjBegin = MFI.getObjectOffset(FI);
+        int64_t ObjEnd = ObjBegin + MFI.getObjectSize(FI);
+        if (ObjBegin <= PartBegin && PartEnd <= ObjEnd)
+          break;
+      }
+      if (MFI.isFixedObjectIndex(FI)) {
+        SDValue Addr =
+            DAG.getNode(ISD::ADD, dl, PtrVT, DAG.getFrameIndex(FI, PtrVT),
+                        DAG.getIntPtrConstant(Ins[i].PartOffset, dl));
+        return DAG.getLoad(
+            ValVT, dl, Chain, Addr,
+            MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI,
+                                              Ins[i].PartOffset));
+      }
     }
+  }
 
-    // Adjust SP offset of interrupt parameter.
-    if (CallConv == CallingConv::X86_INTR) {
-      MFI.setObjectOffset(FI, Offset);
-    }
+  int FI = MFI.CreateFixedObject(ValVT.getSizeInBits() / 8,
+                                 VA.getLocMemOffset(), isImmutable);
 
-    SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
-    SDValue Val = DAG.getLoad(
-        ValVT, dl, Chain, FIN,
-        MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
-    return ExtendedInMem ?
-      DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val) : Val;
+  // Set SExt or ZExt flag.
+  if (VA.getLocInfo() == CCValAssign::ZExt) {
+    MFI.setObjectZExt(FI, true);
+  } else if (VA.getLocInfo() == CCValAssign::SExt) {
+    MFI.setObjectSExt(FI, true);
   }
+
+  // Adjust SP offset of interrupt parameter.
+  if (CallConv == CallingConv::X86_INTR) {
+    MFI.setObjectOffset(FI, Offset);
+  }
+
+  SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+  SDValue Val = DAG.getLoad(
+      ValVT, dl, Chain, FIN,
+      MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+  return ExtendedInMem ? DAG.getNode(ISD::TRUNCATE, dl, VA.getValVT(), Val)
+                       : Val;
 }
 
 // FIXME: Get this from tablegen.
@@ -2782,12 +2925,14 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
   return makeArrayRef(std::begin(XMMArgRegs64Bit), std::end(XMMArgRegs64Bit));
 }
 
+#ifndef NDEBUG
 static bool isSortedByValueNo(const SmallVectorImpl<CCValAssign> &ArgLocs) {
   return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
                         [](const CCValAssign &A, const CCValAssign &B) -> bool {
                           return A.getValNo() < B.getValNo();
                         });
 }
+#endif
 
 SDValue X86TargetLowering::LowerFormalArguments(
     SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
@@ -2837,8 +2982,8 @@ SDValue X86TargetLowering::LowerFormalArguments(
 
   // The next loop assumes that the locations are in the same order of the
   // input arguments.
-  if (!isSortedByValueNo(ArgLocs))
-    llvm_unreachable("Argument Location list must be sorted before lowering");
+  assert(isSortedByValueNo(ArgLocs) &&
+         "Argument Location list must be sorted before lowering");
 
   SDValue ArgValue;
   for (unsigned I = 0, InsIndex = 0, E = ArgLocs.size(); I != E;
@@ -2854,7 +2999,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
             "Currently the only custom case is when we split v64i1 to 2 regs");
 
         // v64i1 values, in regcall calling convention, that are
-        // compiled to 32 bit arch, are splited up into two registers.
+        // compiled to 32 bit arch, are split up into two registers.
         ArgValue =
             getv64i1Argument(VA, ArgLocs[++I], Chain, DAG, dl, Subtarget);
       } else {
@@ -3108,8 +3253,9 @@ SDValue X86TargetLowering::LowerFormalArguments(
                        MF.getTarget().Options.GuaranteedTailCallOpt)) {
     FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
   } else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
-    // X86 interrupts must pop the error code if present
-    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 8 : 4);
+    // X86 interrupts must pop the error code (and the alignment padding) if
+    // present.
+    FuncInfo->setBytesToPopOnReturn(Is64Bit ? 16 : 4);
   } else {
     FuncInfo->setBytesToPopOnReturn(0); // Callee pops nothing.
     // If this is an sret function, the return should pop the hidden pointer.
@@ -3147,6 +3293,12 @@ SDValue X86TargetLowering::LowerFormalArguments(
     }
   }
 
+  if (CallConv == CallingConv::X86_RegCall) {
+    const MachineRegisterInfo &MRI = MF.getRegInfo();
+    for (const auto &Pair : make_range(MRI.livein_begin(), MRI.livein_end()))
+      MF.getRegInfo().disableCalleeSavedRegister(Pair.first);
+  }
+
   return Chain;
 }
 
@@ -3349,8 +3501,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // The next loop assumes that the locations are in the same order of the
   // input arguments.
-  if (!isSortedByValueNo(ArgLocs))
-    llvm_unreachable("Argument Location list must be sorted before lowering");
+  assert(isSortedByValueNo(ArgLocs) &&
+         "Argument Location list must be sorted before lowering");
 
   // Walk the register/memloc assignments, inserting copies/loads.  In the case
   // of tail call optimization arguments are handle later.
@@ -3518,7 +3670,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       if (VA.isRegLoc()) {
         if (VA.needsCustom()) {
           assert((CallConv == CallingConv::X86_RegCall) &&
-                 "Expecting custome case only in regcall calling convention");
+                 "Expecting custom case only in regcall calling convention");
           // This means that we are in special case where one argument was
           // passed through two register locations - Skip the next location
           ++I;
@@ -3663,7 +3815,32 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       Mask = RegInfo->getNoPreservedMask();
   }
 
-  Ops.push_back(DAG.getRegisterMask(Mask));
+  // Define a new register mask from the existing mask.
+  uint32_t *RegMask = nullptr;
+
+  // In some calling conventions we need to remove the used physical registers
+  // from the reg mask.
+  if (CallConv == CallingConv::X86_RegCall) {
+    const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
+
+    // Allocate a new Reg Mask and copy Mask.
+    RegMask = MF.allocateRegisterMask(TRI->getNumRegs());
+    unsigned RegMaskSize = (TRI->getNumRegs() + 31) / 32;
+    memcpy(RegMask, Mask, sizeof(uint32_t) * RegMaskSize);
+
+    // Make sure all sub registers of the argument registers are reset
+    // in the RegMask.
+    for (auto const &RegPair : RegsToPass)
+      for (MCSubRegIterator SubRegs(RegPair.first, TRI, /*IncludeSelf=*/true);
+           SubRegs.isValid(); ++SubRegs)
+        RegMask[*SubRegs / 32] &= ~(1u << (*SubRegs % 32));
+
+    // Create the RegMask Operand according to our updated mask.
+    Ops.push_back(DAG.getRegisterMask(RegMask));
+  } else {
+    // Create the RegMask Operand according to the static mask.
+    Ops.push_back(DAG.getRegisterMask(Mask));
+  }
 
   if (InFlag.getNode())
     Ops.push_back(InFlag);
@@ -3716,8 +3893,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
 
   // Handle result values, copying them out of physregs into vregs that we
   // return.
-  return LowerCallResult(Chain, InFlag, CallConv, isVarArg,
-                         Ins, dl, DAG, InVals);
+  return LowerCallResult(Chain, InFlag, CallConv, isVarArg, Ins, dl, DAG,
+                         InVals, RegMask);
 }
 
 //===----------------------------------------------------------------------===//
@@ -4450,6 +4627,11 @@ bool X86TargetLowering::isCtlzFast() const {
   return Subtarget.hasFastLZCNT();
 }
 
+bool X86TargetLowering::isMaskAndCmp0FoldingBeneficial(
+    const Instruction &AndI) const {
+  return true;
+}
+
 bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
   if (!Subtarget.hasBMI())
     return false;
@@ -4462,6 +4644,26 @@ bool X86TargetLowering::hasAndNotCompare(SDValue Y) const {
   return true;
 }
 
+MVT X86TargetLowering::hasFastEqualityCompare(unsigned NumBits) const {
+  MVT VT = MVT::getIntegerVT(NumBits);
+  if (isTypeLegal(VT))
+    return VT;
+
+  // PMOVMSKB can handle this.
+  if (NumBits == 128 && isTypeLegal(MVT::v16i8))
+    return MVT::v16i8;
+
+  // VPMOVMSKB can handle this.
+  if (NumBits == 256 && isTypeLegal(MVT::v32i8))
+    return MVT::v32i8;
+
+  // TODO: Allow 64-bit type for 32-bit target.
+  // TODO: 512-bit types should be allowed, but make sure that those
+  // cases are handled in combineVectorSizedSetCCEquality().
+
+  return MVT::INVALID_SIMPLE_VALUE_TYPE;
+}
+
 /// Val is the undef sentinel value or equal to the specified value.
 static bool isUndefOrEqual(int Val, int CmpVal) {
   return ((Val == SM_SentinelUndef) || (Val == CmpVal));
@@ -4557,28 +4759,30 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
                                     SmallVectorImpl<int> &WidenedMask) {
   WidenedMask.assign(Mask.size() / 2, 0);
   for (int i = 0, Size = Mask.size(); i < Size; i += 2) {
+    int M0 = Mask[i];
+    int M1 = Mask[i + 1];
+
     // If both elements are undef, its trivial.
-    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] == SM_SentinelUndef) {
+    if (M0 == SM_SentinelUndef && M1 == SM_SentinelUndef) {
       WidenedMask[i / 2] = SM_SentinelUndef;
       continue;
     }
 
     // Check for an undef mask and a mask value properly aligned to fit with
     // a pair of values. If we find such a case, use the non-undef mask's value.
-    if (Mask[i] == SM_SentinelUndef && Mask[i + 1] >= 0 &&
-        Mask[i + 1] % 2 == 1) {
-      WidenedMask[i / 2] = Mask[i + 1] / 2;
+    if (M0 == SM_SentinelUndef && M1 >= 0 && (M1 % 2) == 1) {
+      WidenedMask[i / 2] = M1 / 2;
       continue;
     }
-    if (Mask[i + 1] == SM_SentinelUndef && Mask[i] >= 0 && Mask[i] % 2 == 0) {
-      WidenedMask[i / 2] = Mask[i] / 2;
+    if (M1 == SM_SentinelUndef && M0 >= 0 && (M0 % 2) == 0) {
+      WidenedMask[i / 2] = M0 / 2;
       continue;
     }
 
     // When zeroing, we need to spread the zeroing across both lanes to widen.
-    if (Mask[i] == SM_SentinelZero || Mask[i + 1] == SM_SentinelZero) {
-      if ((Mask[i] == SM_SentinelZero || Mask[i] == SM_SentinelUndef) &&
-          (Mask[i + 1] == SM_SentinelZero || Mask[i + 1] == SM_SentinelUndef)) {
+    if (M0 == SM_SentinelZero || M1 == SM_SentinelZero) {
+      if ((M0 == SM_SentinelZero || M0 == SM_SentinelUndef) &&
+          (M1 == SM_SentinelZero || M1 == SM_SentinelUndef)) {
         WidenedMask[i / 2] = SM_SentinelZero;
         continue;
       }
@@ -4587,9 +4791,8 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask,
 
     // Finally check if the two mask values are adjacent and aligned with
     // a pair.
-    if (Mask[i] != SM_SentinelUndef && Mask[i] % 2 == 0 &&
-        Mask[i] + 1 == Mask[i + 1]) {
-      WidenedMask[i / 2] = Mask[i] / 2;
+    if (M0 != SM_SentinelUndef && (M0 % 2) == 0 && (M0 + 1) == M1) {
+      WidenedMask[i / 2] = M0 / 2;
       continue;
     }
 
@@ -4772,9 +4975,10 @@ static SDValue getConstVector(ArrayRef<int> Values, MVT VT, SelectionDAG &DAG,
   return ConstsNode;
 }
 
-static SDValue getConstVector(ArrayRef<APInt> Bits, SmallBitVector &Undefs,
+static SDValue getConstVector(ArrayRef<APInt> Bits, APInt &Undefs,
                               MVT VT, SelectionDAG &DAG, const SDLoc &dl) {
-  assert(Bits.size() == Undefs.size() && "Unequal constant and undef arrays");
+  assert(Bits.size() == Undefs.getBitWidth() &&
+         "Unequal constant and undef arrays");
   SmallVector<SDValue, 32> Ops;
   bool Split = false;
 
@@ -4916,50 +5120,6 @@ static SDValue insertSubVector(SDValue Result, SDValue Vec, unsigned IdxVal,
 static SDValue insert128BitVector(SDValue Result, SDValue Vec, unsigned IdxVal,
                                   SelectionDAG &DAG, const SDLoc &dl) {
   assert(Vec.getValueType().is128BitVector() && "Unexpected vector size!");
-
-  // For insertion into the zero index (low half) of a 256-bit vector, it is
-  // more efficient to generate a blend with immediate instead of an insert*128.
-  // We are still creating an INSERT_SUBVECTOR below with an undef node to
-  // extend the subvector to the size of the result vector. Make sure that
-  // we are not recursing on that node by checking for undef here.
-  if (IdxVal == 0 && Result.getValueType().is256BitVector() &&
-      !Result.isUndef()) {
-    EVT ResultVT = Result.getValueType();
-    SDValue ZeroIndex = DAG.getIntPtrConstant(0, dl);
-    SDValue Undef = DAG.getUNDEF(ResultVT);
-    SDValue Vec256 = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, ResultVT, Undef,
-                                 Vec, ZeroIndex);
-
-    // The blend instruction, and therefore its mask, depend on the data type.
-    MVT ScalarType = ResultVT.getVectorElementType().getSimpleVT();
-    if (ScalarType.isFloatingPoint()) {
-      // Choose either vblendps (float) or vblendpd (double).
-      unsigned ScalarSize = ScalarType.getSizeInBits();
-      assert((ScalarSize == 64 || ScalarSize == 32) && "Unknown float type");
-      unsigned MaskVal = (ScalarSize == 64) ? 0x03 : 0x0f;
-      SDValue Mask = DAG.getConstant(MaskVal, dl, MVT::i8);
-      return DAG.getNode(X86ISD::BLENDI, dl, ResultVT, Result, Vec256, Mask);
-    }
-
-    const X86Subtarget &Subtarget =
-    static_cast<const X86Subtarget &>(DAG.getSubtarget());
-
-    // AVX2 is needed for 256-bit integer blend support.
-    // Integers must be cast to 32-bit because there is only vpblendd;
-    // vpblendw can't be used for this because it has a handicapped mask.
-
-    // If we don't have AVX2, then cast to float. Using a wrong domain blend
-    // is still more efficient than using the wrong domain vinsertf128 that
-    // will be created by InsertSubVector().
-    MVT CastVT = Subtarget.hasAVX2() ? MVT::v8i32 : MVT::v8f32;
-
-    SDValue Mask = DAG.getConstant(0x0f, dl, MVT::i8);
-    Result = DAG.getBitcast(CastVT, Result);
-    Vec256 = DAG.getBitcast(CastVT, Vec256);
-    Vec256 = DAG.getNode(X86ISD::BLENDI, dl, CastVT, Result, Vec256, Mask);
-    return DAG.getBitcast(ResultVT, Vec256);
-  }
-
   return insertSubVector(Result, Vec, IdxVal, DAG, dl, 128);
 }
 
@@ -5095,8 +5255,7 @@ static SDValue concat256BitVectors(SDValue V1, SDValue V2, EVT VT,
 /// Returns a vector of specified type with all bits set.
 /// Always build ones vectors as <4 x i32>, <8 x i32> or <16 x i32>.
 /// Then bitcast to their original type, ensuring they get CSE'd.
-static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
-                             SelectionDAG &DAG, const SDLoc &dl) {
+static SDValue getOnesVector(EVT VT, SelectionDAG &DAG, const SDLoc &dl) {
   assert((VT.is128BitVector() || VT.is256BitVector() || VT.is512BitVector()) &&
          "Expected a 128/256/512-bit vector type");
 
@@ -5106,6 +5265,26 @@ static SDValue getOnesVector(EVT VT, const X86Subtarget &Subtarget,
   return DAG.getBitcast(VT, Vec);
 }
 
+static SDValue getExtendInVec(unsigned Opc, const SDLoc &DL, EVT VT, SDValue In,
+                              SelectionDAG &DAG) {
+  EVT InVT = In.getValueType();
+  assert((X86ISD::VSEXT == Opc || X86ISD::VZEXT == Opc) && "Unexpected opcode");
+
+  if (VT.is128BitVector() && InVT.is128BitVector())
+    return X86ISD::VSEXT == Opc ? DAG.getSignExtendVectorInReg(In, DL, VT)
+                                : DAG.getZeroExtendVectorInReg(In, DL, VT);
+
+  // For 256-bit vectors, we only need the lower (128-bit) input half.
+  // For 512-bit vectors, we only need the lower input half or quarter.
+  if (VT.getSizeInBits() > 128 && InVT.getSizeInBits() > 128) {
+    int Scale = VT.getScalarSizeInBits() / InVT.getScalarSizeInBits();
+    In = extractSubVector(In, 0, DAG, DL,
+                          std::max(128, (int)VT.getSizeInBits() / Scale));
+  }
+
+  return DAG.getNode(Opc, DL, VT, In);
+}
+
 /// Generate unpacklo/unpackhi shuffle mask.
 static void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
                                     bool Unary) {
@@ -5191,9 +5370,10 @@ static const Constant *getTargetConstantFromNode(SDValue Op) {
 
 // Extract raw constant bits from constant pools.
 static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
-                                          SmallBitVector &UndefElts,
-                                          SmallVectorImpl<APInt> &EltBits) {
-  assert(UndefElts.empty() && "Expected an empty UndefElts vector");
+                                          APInt &UndefElts,
+                                          SmallVectorImpl<APInt> &EltBits,
+                                          bool AllowWholeUndefs = true,
+                                          bool AllowPartialUndefs = true) {
   assert(EltBits.empty() && "Expected an empty EltBits vector");
 
   Op = peekThroughBitcasts(Op);
@@ -5203,56 +5383,83 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
   assert((SizeInBits % EltSizeInBits) == 0 && "Can't split constant!");
   unsigned NumElts = SizeInBits / EltSizeInBits;
 
+  unsigned SrcEltSizeInBits = VT.getScalarSizeInBits();
+  unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
+
   // Extract all the undef/constant element data and pack into single bitsets.
   APInt UndefBits(SizeInBits, 0);
   APInt MaskBits(SizeInBits, 0);
 
   // Split the undef/constant single bitset data into the target elements.
   auto SplitBitData = [&]() {
-    UndefElts = SmallBitVector(NumElts, false);
+    // Don't split if we don't allow undef bits.
+    bool AllowUndefs = AllowWholeUndefs || AllowPartialUndefs;
+    if (UndefBits.getBoolValue() && !AllowUndefs)
+      return false;
+
+    UndefElts = APInt(NumElts, 0);
     EltBits.resize(NumElts, APInt(EltSizeInBits, 0));
 
     for (unsigned i = 0; i != NumElts; ++i) {
-      APInt UndefEltBits = UndefBits.lshr(i * EltSizeInBits);
-      UndefEltBits = UndefEltBits.zextOrTrunc(EltSizeInBits);
+      unsigned BitOffset = i * EltSizeInBits;
+      APInt UndefEltBits = UndefBits.extractBits(EltSizeInBits, BitOffset);
 
-      // Only treat an element as UNDEF if all bits are UNDEF, otherwise
-      // treat it as zero.
+      // Only treat an element as UNDEF if all bits are UNDEF.
       if (UndefEltBits.isAllOnesValue()) {
-        UndefElts[i] = true;
+        if (!AllowWholeUndefs)
+          return false;
+        UndefElts.setBit(i);
         continue;
       }
 
-      APInt Bits = MaskBits.lshr(i * EltSizeInBits);
-      Bits = Bits.zextOrTrunc(EltSizeInBits);
+      // If only some bits are UNDEF then treat them as zero (or bail if not
+      // supported).
+      if (UndefEltBits.getBoolValue() && !AllowPartialUndefs)
+        return false;
+
+      APInt Bits = MaskBits.extractBits(EltSizeInBits, BitOffset);
       EltBits[i] = Bits.getZExtValue();
     }
     return true;
   };
 
-  auto ExtractConstantBits = [SizeInBits](const Constant *Cst, APInt &Mask,
-                                          APInt &Undefs) {
+  // Collect constant bits and insert into mask/undef bit masks.
+  auto CollectConstantBits = [](const Constant *Cst, APInt &Mask, APInt &Undefs,
+                                unsigned BitOffset) {
     if (!Cst)
       return false;
     unsigned CstSizeInBits = Cst->getType()->getPrimitiveSizeInBits();
     if (isa<UndefValue>(Cst)) {
-      Mask = APInt::getNullValue(SizeInBits);
-      Undefs = APInt::getLowBitsSet(SizeInBits, CstSizeInBits);
+      Undefs.setBits(BitOffset, BitOffset + CstSizeInBits);
       return true;
     }
     if (auto *CInt = dyn_cast<ConstantInt>(Cst)) {
-      Mask = CInt->getValue().zextOrTrunc(SizeInBits);
-      Undefs = APInt::getNullValue(SizeInBits);
+      Mask.insertBits(CInt->getValue(), BitOffset);
       return true;
     }
     if (auto *CFP = dyn_cast<ConstantFP>(Cst)) {
-      Mask = CFP->getValueAPF().bitcastToAPInt().zextOrTrunc(SizeInBits);
-      Undefs = APInt::getNullValue(SizeInBits);
+      Mask.insertBits(CFP->getValueAPF().bitcastToAPInt(), BitOffset);
       return true;
     }
     return false;
   };
 
+  // Extract constant bits from build vector.
+  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
+    for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+      const SDValue &Src = Op.getOperand(i);
+      unsigned BitOffset = i * SrcEltSizeInBits;
+      if (Src.isUndef()) {
+        UndefBits.setBits(BitOffset, BitOffset + SrcEltSizeInBits);
+        continue;
+      }
+      auto *Cst = cast<ConstantSDNode>(Src);
+      APInt Bits = Cst->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
+      MaskBits.insertBits(Bits, BitOffset);
+    }
+    return SplitBitData();
+  }
+
   // Extract constant bits from constant pool vector.
   if (auto *Cst = getTargetConstantFromNode(Op)) {
     Type *CstTy = Cst->getType();
@@ -5260,117 +5467,59 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
       return false;
 
     unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
-    for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i) {
-      APInt Bits, Undefs;
-      if (!ExtractConstantBits(Cst->getAggregateElement(i), Bits, Undefs))
+    for (unsigned i = 0, e = CstTy->getVectorNumElements(); i != e; ++i)
+      if (!CollectConstantBits(Cst->getAggregateElement(i), MaskBits, UndefBits,
+                               i * CstEltSizeInBits))
         return false;
-      MaskBits |= Bits.shl(i * CstEltSizeInBits);
-      UndefBits |= Undefs.shl(i * CstEltSizeInBits);
-    }
 
     return SplitBitData();
   }
 
   // Extract constant bits from a broadcasted constant pool scalar.
   if (Op.getOpcode() == X86ISD::VBROADCAST &&
-      EltSizeInBits <= Op.getScalarValueSizeInBits()) {
+      EltSizeInBits <= SrcEltSizeInBits) {
     if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
-      APInt Bits, Undefs;
-      if (ExtractConstantBits(Broadcast, Bits, Undefs)) {
-        unsigned NumBroadcastBits = Op.getScalarValueSizeInBits();
-        unsigned NumBroadcastElts = SizeInBits / NumBroadcastBits;
-        for (unsigned i = 0; i != NumBroadcastElts; ++i) {
-          MaskBits |= Bits.shl(i * NumBroadcastBits);
-          UndefBits |= Undefs.shl(i * NumBroadcastBits);
+      APInt Bits(SizeInBits, 0);
+      APInt Undefs(SizeInBits, 0);
+      if (CollectConstantBits(Broadcast, Bits, Undefs, 0)) {
+        for (unsigned i = 0; i != NumSrcElts; ++i) {
+          MaskBits |= Bits.shl(i * SrcEltSizeInBits);
+          UndefBits |= Undefs.shl(i * SrcEltSizeInBits);
         }
         return SplitBitData();
       }
     }
   }
 
+  // Extract a rematerialized scalar constant insertion.
+  if (Op.getOpcode() == X86ISD::VZEXT_MOVL &&
+      Op.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR &&
+      isa<ConstantSDNode>(Op.getOperand(0).getOperand(0))) {
+    auto *CN = cast<ConstantSDNode>(Op.getOperand(0).getOperand(0));
+    MaskBits = CN->getAPIntValue().zextOrTrunc(SrcEltSizeInBits);
+    MaskBits = MaskBits.zext(SizeInBits);
+    return SplitBitData();
+  }
+
   return false;
 }
 
-// TODO: Merge more of this with getTargetConstantBitsFromNode.
 static bool getTargetShuffleMaskIndices(SDValue MaskNode,
                                         unsigned MaskEltSizeInBits,
                                         SmallVectorImpl<uint64_t> &RawMask) {
-  MaskNode = peekThroughBitcasts(MaskNode);
-
-  MVT VT = MaskNode.getSimpleValueType();
-  assert(VT.isVector() && "Can't produce a non-vector with a build_vector!");
-  unsigned NumMaskElts = VT.getSizeInBits() / MaskEltSizeInBits;
-
-  // Split an APInt element into MaskEltSizeInBits sized pieces and
-  // insert into the shuffle mask.
-  auto SplitElementToMask = [&](APInt Element) {
-    // Note that this is x86 and so always little endian: the low byte is
-    // the first byte of the mask.
-    int Split = VT.getScalarSizeInBits() / MaskEltSizeInBits;
-    for (int i = 0; i < Split; ++i) {
-      APInt RawElt = Element.getLoBits(MaskEltSizeInBits);
-      Element = Element.lshr(MaskEltSizeInBits);
-      RawMask.push_back(RawElt.getZExtValue());
-    }
-  };
-
-  if (MaskNode.getOpcode() == X86ISD::VBROADCAST) {
-    // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
-    // TODO: Handle (VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0
-    if (VT.getScalarSizeInBits() != MaskEltSizeInBits)
-      return false;
-    if (auto *CN = dyn_cast<ConstantSDNode>(MaskNode.getOperand(0))) {
-      const APInt &MaskElement = CN->getAPIntValue();
-      for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
-        APInt RawElt = MaskElement.getLoBits(MaskEltSizeInBits);
-        RawMask.push_back(RawElt.getZExtValue());
-      }
-    }
-    return false;
-  }
-
-  if (MaskNode.getOpcode() == X86ISD::VZEXT_MOVL &&
-      MaskNode.getOperand(0).getOpcode() == ISD::SCALAR_TO_VECTOR) {
-    SDValue MaskOp = MaskNode.getOperand(0).getOperand(0);
-    if (auto *CN = dyn_cast<ConstantSDNode>(MaskOp)) {
-      if ((MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0) {
-        RawMask.push_back(CN->getZExtValue());
-        RawMask.append(NumMaskElts - 1, 0);
-        return true;
-      }
-
-      if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) == 0) {
-        unsigned ElementSplit = VT.getScalarSizeInBits() / MaskEltSizeInBits;
-        SplitElementToMask(CN->getAPIntValue());
-        RawMask.append((VT.getVectorNumElements() - 1) * ElementSplit, 0);
-        return true;
-      }
-    }
+  APInt UndefElts;
+  SmallVector<APInt, 64> EltBits;
+
+  // Extract the raw target constant bits.
+  // FIXME: We currently don't support UNDEF bits or mask entries.
+  if (!getTargetConstantBitsFromNode(MaskNode, MaskEltSizeInBits, UndefElts,
+                                     EltBits, /* AllowWholeUndefs */ false,
+                                     /* AllowPartialUndefs */ false))
     return false;
-  }
 
-  if (MaskNode.getOpcode() != ISD::BUILD_VECTOR)
-    return false;
-
-  // We can always decode if the buildvector is all zero constants,
-  // but can't use isBuildVectorAllZeros as it might contain UNDEFs.
-  if (all_of(MaskNode->ops(), X86::isZeroNode)) {
-    RawMask.append(NumMaskElts, 0);
-    return true;
-  }
-
-  // TODO: Handle (MaskEltSizeInBits % VT.getScalarSizeInBits()) == 0
-  if ((VT.getScalarSizeInBits() % MaskEltSizeInBits) != 0)
-    return false;
-
-  for (SDValue Op : MaskNode->ops()) {
-    if (auto *CN = dyn_cast<ConstantSDNode>(Op.getNode()))
-      SplitElementToMask(CN->getAPIntValue());
-    else if (auto *CFN = dyn_cast<ConstantFPSDNode>(Op.getNode()))
-      SplitElementToMask(CFN->getValueAPF().bitcastToAPInt());
-    else
-      return false;
-  }
+  // Insert the extracted elements into the mask.
+  for (APInt Elt : EltBits)
+    RawMask.push_back(Elt.getZExtValue());
 
   return true;
 }
@@ -5397,6 +5546,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
   case X86ISD::BLENDI:
     ImmN = N->getOperand(N->getNumOperands()-1);
     DecodeBLENDMask(VT, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+    IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
     break;
   case X86ISD::SHUFP:
     ImmN = N->getOperand(N->getNumOperands()-1);
@@ -5465,8 +5615,18 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
     IsUnary = true;
     break;
   case X86ISD::VBROADCAST: {
-    // We only decode broadcasts of same-sized vectors at the moment.
-    if (N->getOperand(0).getValueType() == VT) {
+    SDValue N0 = N->getOperand(0);
+    // See if we're broadcasting from index 0 of an EXTRACT_SUBVECTOR. If so,
+    // add the pre-extracted value to the Ops vector.
+    if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+        N0.getOperand(0).getValueType() == VT &&
+        N0.getConstantOperandVal(1) == 0)
+      Ops.push_back(N0.getOperand(0));
+
+    // We only decode broadcasts of same-sized vectors, unless the broadcast
+    // came from an extract from the original width. If we found one, we
+    // pushed it to the Ops vector above.
+    if (N0.getValueType() == VT || !Ops.empty()) {
       DecodeVectorBroadcast(VT, Mask);
       IsUnary = true;
       break;
@@ -5661,6 +5821,19 @@ static bool setTargetShuffleZeroElements(SDValue N,
   V1 = peekThroughBitcasts(V1);
   V2 = peekThroughBitcasts(V2);
 
+  assert((VT.getSizeInBits() % Mask.size()) == 0 &&
+         "Illegal split of shuffle value type");
+  unsigned EltSizeInBits = VT.getSizeInBits() / Mask.size();
+
+  // Extract known constant input data.
+  APInt UndefSrcElts[2];
+  SmallVector<APInt, 32> SrcEltBits[2];
+  bool IsSrcConstant[2] = {
+      getTargetConstantBitsFromNode(V1, EltSizeInBits, UndefSrcElts[0],
+                                    SrcEltBits[0], true, false),
+      getTargetConstantBitsFromNode(V2, EltSizeInBits, UndefSrcElts[1],
+                                    SrcEltBits[1], true, false)};
+
   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
     int M = Mask[i];
 
@@ -5669,6 +5842,7 @@ static bool setTargetShuffleZeroElements(SDValue N,
       continue;
 
     // Determine shuffle input and normalize the mask.
+    unsigned SrcIdx = M / Size;
     SDValue V = M < Size ? V1 : V2;
     M %= Size;
 
@@ -5679,47 +5853,26 @@ static bool setTargetShuffleZeroElements(SDValue N,
     }
 
     // SCALAR_TO_VECTOR - only the first element is defined, and the rest UNDEF.
+    // TODO: We currently only set UNDEF for integer types - floats use the same
+    // registers as vectors and many of the scalar folded loads rely on the
+    // SCALAR_TO_VECTOR pattern.
     if (V.getOpcode() == ISD::SCALAR_TO_VECTOR &&
         (Size % V.getValueType().getVectorNumElements()) == 0) {
       int Scale = Size / V.getValueType().getVectorNumElements();
-      if (((M / Scale) == 0) && X86::isZeroNode(V.getOperand(0)))
-        Mask[i] = SM_SentinelZero;
-      continue;
-    }
-
-    // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements.
-    if (V.getOpcode() != ISD::BUILD_VECTOR)
-      continue;
-
-    // If the BUILD_VECTOR has fewer elements then the (larger) source
-    // element must be UNDEF/ZERO.
-    // TODO: Is it worth testing the individual bits of a constant?
-    if ((Size % V.getNumOperands()) == 0) {
-      int Scale = Size / V->getNumOperands();
-      SDValue Op = V.getOperand(M / Scale);
-      if (Op.isUndef())
+      int Idx = M / Scale;
+      if (Idx != 0 && !VT.isFloatingPoint())
         Mask[i] = SM_SentinelUndef;
-      else if (X86::isZeroNode(Op))
+      else if (Idx == 0 && X86::isZeroNode(V.getOperand(0)))
         Mask[i] = SM_SentinelZero;
       continue;
     }
 
-    // If the BUILD_VECTOR has more elements then all the (smaller) source
-    // elements must be all UNDEF or all ZERO.
-    if ((V.getNumOperands() % Size) == 0) {
-      int Scale = V->getNumOperands() / Size;
-      bool AllUndef = true;
-      bool AllZero = true;
-      for (int j = 0; j < Scale; ++j) {
-        SDValue Op = V.getOperand((M * Scale) + j);
-        AllUndef &= Op.isUndef();
-        AllZero &= X86::isZeroNode(Op);
-      }
-      if (AllUndef)
+    // Attempt to extract from the source's constant bits.
+    if (IsSrcConstant[SrcIdx]) {
+      if (UndefSrcElts[SrcIdx][M])
         Mask[i] = SM_SentinelUndef;
-      else if (AllZero)
+      else if (SrcEltBits[SrcIdx][M] == 0)
         Mask[i] = SM_SentinelZero;
-      continue;
     }
   }
 
@@ -5748,7 +5901,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
   case ISD::AND:
   case X86ISD::ANDNP: {
     // Attempt to decode as a per-byte mask.
-    SmallBitVector UndefElts;
+    APInt UndefElts;
     SmallVector<APInt, 32> EltBits;
     SDValue N0 = N.getOperand(0);
     SDValue N1 = N.getOperand(1);
@@ -5769,15 +5922,41 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
     Ops.push_back(IsAndN ? N1 : N0);
     return true;
   }
+  case ISD::SCALAR_TO_VECTOR: {
+    // Match against a scalar_to_vector of an extract from a similar vector.
+    SDValue N0 = N.getOperand(0);
+    if (N0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+        N0.getOperand(0).getValueType() != VT ||
+        !isa<ConstantSDNode>(N0.getOperand(1)) ||
+        NumElts <= N0.getConstantOperandVal(1) ||
+        !N->isOnlyUserOf(N0.getNode()))
+      return false;
+    Ops.push_back(N0.getOperand(0));
+    Mask.push_back(N0.getConstantOperandVal(1));
+    Mask.append(NumElts - 1, SM_SentinelUndef);
+    return true;
+  }
+  case X86ISD::PINSRB:
   case X86ISD::PINSRW: {
-    // Attempt to recognise a PINSRW(ASSERTZEXT(PEXTRW)) shuffle pattern.
-    // TODO: Expand this to support PINSRB/INSERT_VECTOR_ELT/etc.
     SDValue InVec = N.getOperand(0);
     SDValue InScl = N.getOperand(1);
     uint64_t InIdx = N.getConstantOperandVal(2);
     assert(InIdx < NumElts && "Illegal insertion index");
+
+    // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
+    if (X86::isZeroNode(InScl)) {
+      Ops.push_back(InVec);
+      for (unsigned i = 0; i != NumElts; ++i)
+        Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
+      return true;
+    }
+
+    // Attempt to recognise a PINSR*(ASSERTZEXT(PEXTR*)) shuffle pattern.
+    // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
+    unsigned ExOp =
+        (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
     if (InScl.getOpcode() != ISD::AssertZext ||
-        InScl.getOperand(0).getOpcode() != X86ISD::PEXTRW)
+        InScl.getOperand(0).getOpcode() != ExOp)
       return false;
 
     SDValue ExVec = InScl.getOperand(0).getOperand(0);
@@ -5821,6 +6000,7 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
     }
     return true;
   }
+  case ISD::ZERO_EXTEND_VECTOR_INREG:
   case X86ISD::VZEXT: {
     // TODO - add support for VPMOVZX with smaller input vector types.
     SDValue Src = N.getOperand(0);
@@ -5836,37 +6016,38 @@ static bool getFauxShuffleMask(SDValue N, SmallVectorImpl<int> &Mask,
   return false;
 }
 
-/// Removes unused shuffle source ops and adjusts the shuffle mask accordingly.
-static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Ops,
+/// Removes unused shuffle source inputs and adjusts the shuffle mask accordingly.
+static void resolveTargetShuffleInputsAndMask(SmallVectorImpl<SDValue> &Inputs,
                                               SmallVectorImpl<int> &Mask) {
   int MaskWidth = Mask.size();
-  SmallVector<SDValue, 8> UsedOps;
-  for (int i = 0, e = Ops.size(); i < e; ++i) {
-    int lo = UsedOps.size() * MaskWidth;
+  SmallVector<SDValue, 16> UsedInputs;
+  for (int i = 0, e = Inputs.size(); i < e; ++i) {
+    int lo = UsedInputs.size() * MaskWidth;
     int hi = lo + MaskWidth;
     if (any_of(Mask, [lo, hi](int i) { return (lo <= i) && (i < hi); })) {
-      UsedOps.push_back(Ops[i]);
+      UsedInputs.push_back(Inputs[i]);
       continue;
     }
     for (int &M : Mask)
       if (lo <= M)
         M -= MaskWidth;
   }
-  Ops = UsedOps;
+  Inputs = UsedInputs;
 }
 
 /// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs
 /// and set the SM_SentinelUndef and SM_SentinelZero values. Then check the
 /// remaining input indices in case we now have a unary shuffle and adjust the
-/// Op0/Op1 inputs accordingly.
+/// inputs accordingly.
 /// Returns true if the target shuffle mask was decoded.
-static bool resolveTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Ops,
+static bool resolveTargetShuffleInputs(SDValue Op,
+                                       SmallVectorImpl<SDValue> &Inputs,
                                        SmallVectorImpl<int> &Mask) {
-  if (!setTargetShuffleZeroElements(Op, Mask, Ops))
-    if (!getFauxShuffleMask(Op, Mask, Ops))
+  if (!setTargetShuffleZeroElements(Op, Mask, Inputs))
+    if (!getFauxShuffleMask(Op, Mask, Inputs))
       return false;
 
-  resolveTargetShuffleInputsAndMask(Ops, Mask);
+  resolveTargetShuffleInputsAndMask(Inputs, Mask);
   return true;
 }
 
@@ -5941,10 +6122,9 @@ static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
 
 /// Custom lower build_vector of v16i8.
 static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
-                                       unsigned NumNonZero, unsigned NumZero,
-                                       SelectionDAG &DAG,
-                                       const X86Subtarget &Subtarget,
-                                       const TargetLowering &TLI) {
+                                     unsigned NumNonZero, unsigned NumZero,
+                                     SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget) {
   if (NumNonZero > 8)
     return SDValue();
 
@@ -5955,18 +6135,26 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
   // SSE4.1 - use PINSRB to insert each byte directly.
   if (Subtarget.hasSSE41()) {
     for (unsigned i = 0; i < 16; ++i) {
-      bool isNonZero = (NonZeros & (1 << i)) != 0;
-      if (isNonZero) {
+      bool IsNonZero = (NonZeros & (1 << i)) != 0;
+      if (IsNonZero) {
+        // If the build vector contains zeros or our first insertion is not the
+        // first index then insert into zero vector to break any register
+        // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
         if (First) {
-          if (NumZero)
-            V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
-          else
-            V = DAG.getUNDEF(MVT::v16i8);
           First = false;
+          if (NumZero || 0 != i)
+            V = getZeroVector(MVT::v16i8, Subtarget, DAG, dl);
+          else {
+            assert(0 == i && "Expected insertion into zero-index");
+            V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
+            V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+            V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+            V = DAG.getBitcast(MVT::v16i8, V);
+            continue;
+          }
         }
-        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
-                        MVT::v16i8, V, Op.getOperand(i),
-                        DAG.getIntPtrConstant(i, dl));
+        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v16i8, V,
+                        Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
       }
     }
 
@@ -5985,24 +6173,35 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
     }
 
     if ((i & 1) != 0) {
+      // FIXME: Investigate extending to i32 instead of just i16.
+      // FIXME: Investigate combining the first 4 bytes as an i32 instead.
       SDValue ThisElt, LastElt;
-      bool LastIsNonZero = (NonZeros & (1 << (i-1))) != 0;
+      bool LastIsNonZero = (NonZeros & (1 << (i - 1))) != 0;
       if (LastIsNonZero) {
-        LastElt = DAG.getNode(ISD::ZERO_EXTEND, dl,
-                              MVT::i16, Op.getOperand(i-1));
+        LastElt =
+            DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i - 1));
       }
       if (ThisIsNonZero) {
         ThisElt = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i16, Op.getOperand(i));
-        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16,
-                              ThisElt, DAG.getConstant(8, dl, MVT::i8));
+        ThisElt = DAG.getNode(ISD::SHL, dl, MVT::i16, ThisElt,
+                              DAG.getConstant(8, dl, MVT::i8));
         if (LastIsNonZero)
           ThisElt = DAG.getNode(ISD::OR, dl, MVT::i16, ThisElt, LastElt);
       } else
         ThisElt = LastElt;
 
-      if (ThisElt.getNode())
-        V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
-                        DAG.getIntPtrConstant(i/2, dl));
+      if (ThisElt) {
+        if (1 == i) {
+          V = NumZero ? DAG.getZExtOrTrunc(ThisElt, dl, MVT::i32)
+                      : DAG.getAnyExtOrTrunc(ThisElt, dl, MVT::i32);
+          V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+          V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+          V = DAG.getBitcast(MVT::v8i16, V);
+        } else {
+          V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V, ThisElt,
+                          DAG.getIntPtrConstant(i / 2, dl));
+        }
+      }
     }
   }
 
@@ -6013,8 +6212,7 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
 static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
                                      unsigned NumNonZero, unsigned NumZero,
                                      SelectionDAG &DAG,
-                                     const X86Subtarget &Subtarget,
-                                     const TargetLowering &TLI) {
+                                     const X86Subtarget &Subtarget) {
   if (NumNonZero > 4)
     return SDValue();
 
@@ -6022,18 +6220,26 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
   SDValue V;
   bool First = true;
   for (unsigned i = 0; i < 8; ++i) {
-    bool isNonZero = (NonZeros & (1 << i)) != 0;
-    if (isNonZero) {
+    bool IsNonZero = (NonZeros & (1 << i)) != 0;
+    if (IsNonZero) {
+      // If the build vector contains zeros or our first insertion is not the
+      // first index then insert into zero vector to break any register
+      // dependency else use SCALAR_TO_VECTOR/VZEXT_MOVL.
       if (First) {
-        if (NumZero)
-          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
-        else
-          V = DAG.getUNDEF(MVT::v8i16);
         First = false;
+        if (NumZero || 0 != i)
+          V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
+        else {
+          assert(0 == i && "Expected insertion into zero-index");
+          V = DAG.getAnyExtOrTrunc(Op.getOperand(i), dl, MVT::i32);
+          V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, V);
+          V = DAG.getNode(X86ISD::VZEXT_MOVL, dl, MVT::v4i32, V);
+          V = DAG.getBitcast(MVT::v8i16, V);
+          continue;
+        }
       }
-      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
-                      MVT::v8i16, V, Op.getOperand(i),
-                      DAG.getIntPtrConstant(i, dl));
+      V = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16, V,
+                      Op.getOperand(i), DAG.getIntPtrConstant(i, dl));
     }
   }
 
@@ -6042,8 +6248,7 @@ static SDValue LowerBuildVectorv8i16(SDValue Op, unsigned NonZeros,
 
 /// Custom lower build_vector of v4i32 or v4f32.
 static SDValue LowerBuildVectorv4x32(SDValue Op, SelectionDAG &DAG,
-                                     const X86Subtarget &Subtarget,
-                                     const TargetLowering &TLI) {
+                                     const X86Subtarget &Subtarget) {
   // Find all zeroable elements.
   std::bitset<4> Zeroable;
   for (int i=0; i < 4; ++i) {
@@ -6239,7 +6444,7 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
 ///
 /// Example: <load i32 *a, load i32 *a+4, zero, undef> -> zextload a
 static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
-                                        SDLoc &DL, SelectionDAG &DAG,
+                                        const SDLoc &DL, SelectionDAG &DAG,
                                         bool isAfterLegalize) {
   unsigned NumElems = Elts.size();
 
@@ -6410,7 +6615,7 @@ static Constant *getConstantVector(MVT VT, const APInt &SplatValue,
 
   SmallVector<Constant *, 32> ConstantVec;
   for (unsigned i = 0; i < NumElm; i++) {
-    APInt Val = SplatValue.lshr(ScalarSize * i).trunc(ScalarSize);
+    APInt Val = SplatValue.extractBits(ScalarSize, ScalarSize * i);
     Constant *Const;
     if (VT.isFloatingPoint()) {
       assert((ScalarSize == 32 || ScalarSize == 64) &&
@@ -6691,6 +6896,7 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
 
     SDValue ExtractedFromVec = Op.getOperand(i).getOperand(0);
     SDValue ExtIdx = Op.getOperand(i).getOperand(1);
+
     // Quit if non-constant index.
     if (!isa<ConstantSDNode>(ExtIdx))
       return SDValue();
@@ -6721,11 +6927,10 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
 
   VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
   SDValue NV = DAG.getVectorShuffle(VT, DL, VecIn1, VecIn2, Mask);
-  for (unsigned i = 0, e = InsertIndices.size(); i != e; ++i) {
-    unsigned Idx = InsertIndices[i];
+
+  for (unsigned Idx : InsertIndices)
     NV = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT, NV, Op.getOperand(Idx),
                      DAG.getIntPtrConstant(Idx, DL));
-  }
 
   return NV;
 }
@@ -7374,7 +7579,7 @@ static SDValue materializeVectorConstant(SDValue Op, SelectionDAG &DAG,
         (VT == MVT::v8i32 && Subtarget.hasInt256()))
       return Op;
 
-    return getOnesVector(VT, Subtarget, DAG, DL);
+    return getOnesVector(VT, DAG, DL);
   }
 
   return SDValue();
@@ -7445,7 +7650,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
     // a constant pool load than it is to do a movd + shuffle.
     if (ExtVT == MVT::i64 && !Subtarget.is64Bit() &&
         (!IsAllConstants || Idx == 0)) {
-      if (DAG.MaskedValueIsZero(Item, APInt::getBitsSet(64, 32, 64))) {
+      if (DAG.MaskedValueIsZero(Item, APInt::getHighBitsSet(64, 32))) {
         // Handle SSE only.
         assert(VT == MVT::v2i64 && "Expected an SSE value type!");
         MVT VecVT = MVT::v4i32;
@@ -7588,17 +7793,17 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
   // If element VT is < 32 bits, convert it to inserts into a zero vector.
   if (EVTBits == 8 && NumElems == 16)
     if (SDValue V = LowerBuildVectorv16i8(Op, NonZeros, NumNonZero, NumZero,
-                                          DAG, Subtarget, *this))
+                                          DAG, Subtarget))
       return V;
 
   if (EVTBits == 16 && NumElems == 8)
     if (SDValue V = LowerBuildVectorv8i16(Op, NonZeros, NumNonZero, NumZero,
-                                          DAG, Subtarget, *this))
+                                          DAG, Subtarget))
       return V;
 
   // If element VT is == 32 bits and has 4 elems, try to generate an INSERTPS
   if (EVTBits == 32 && NumElems == 4)
-    if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget, *this))
+    if (SDValue V = LowerBuildVectorv4x32(Op, DAG, Subtarget))
       return V;
 
   // If element VT is == 32 bits, turn it into a number of shuffles.
@@ -7983,7 +8188,7 @@ static bool isShuffleEquivalent(SDValue V1, SDValue V2, ArrayRef<int> Mask,
               ExpectedBV->getOperand(ExpectedMask[i] % Size))
         return false;
     }
-}
+  }
 
   return true;
 }
@@ -8013,6 +8218,41 @@ static bool isTargetShuffleEquivalent(ArrayRef<int> Mask,
   return true;
 }
 
+// Merges a general DAG shuffle mask and zeroable bit mask into a target shuffle
+// mask.
+static SmallVector<int, 64> createTargetShuffleMask(ArrayRef<int> Mask,
+                                                    const APInt &Zeroable) {
+  int NumElts = Mask.size();
+  assert(NumElts == (int)Zeroable.getBitWidth() && "Mismatch mask sizes");
+
+  SmallVector<int, 64> TargetMask(NumElts, SM_SentinelUndef);
+  for (int i = 0; i != NumElts; ++i) {
+    int M = Mask[i];
+    if (M == SM_SentinelUndef)
+      continue;
+    assert(0 <= M && M < (2 * NumElts) && "Out of range shuffle index");
+    TargetMask[i] = (Zeroable[i] ? SM_SentinelZero : M);
+  }
+  return TargetMask;
+}
+
+// Check if the shuffle mask is suitable for the AVX vpunpcklwd or vpunpckhwd
+// instructions.
+static bool isUnpackWdShuffleMask(ArrayRef<int> Mask, MVT VT) {
+  if (VT != MVT::v8i32 && VT != MVT::v8f32)
+    return false;
+
+  SmallVector<int, 8> Unpcklwd;
+  createUnpackShuffleMask(MVT::v8i16, Unpcklwd, /* Lo = */ true,
+                          /* Unary = */ false);
+  SmallVector<int, 8> Unpckhwd;
+  createUnpackShuffleMask(MVT::v8i16, Unpckhwd, /* Lo = */ false,
+                          /* Unary = */ false);
+  bool IsUnpackwdMask = (isTargetShuffleEquivalent(Mask, Unpcklwd) ||
+                         isTargetShuffleEquivalent(Mask, Unpckhwd));
+  return IsUnpackwdMask;
+}
+
 /// \brief Get a 4-lane 8-bit shuffle immediate for a mask.
 ///
 /// This helper function produces an 8-bit shuffle immediate corresponding to
@@ -8049,9 +8289,9 @@ static SDValue getV4X86ShuffleImm8ForMask(ArrayRef<int> Mask, const SDLoc &DL,
 /// zero. Many x86 shuffles can zero lanes cheaply and we often want to handle
 /// as many lanes with this technique as possible to simplify the remaining
 /// shuffle.
-static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
-                                                     SDValue V1, SDValue V2) {
-  SmallBitVector Zeroable(Mask.size(), false);
+static APInt computeZeroableShuffleElements(ArrayRef<int> Mask,
+                                            SDValue V1, SDValue V2) {
+  APInt Zeroable(Mask.size(), 0);
   V1 = peekThroughBitcasts(V1);
   V2 = peekThroughBitcasts(V2);
 
@@ -8066,7 +8306,7 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
     int M = Mask[i];
     // Handle the easy cases.
     if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
-      Zeroable[i] = true;
+      Zeroable.setBit(i);
       continue;
     }
 
@@ -8084,17 +8324,19 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
       int Scale = Size / V->getNumOperands();
       SDValue Op = V.getOperand(M / Scale);
       if (Op.isUndef() || X86::isZeroNode(Op))
-        Zeroable[i] = true;
+        Zeroable.setBit(i);
       else if (ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op)) {
         APInt Val = Cst->getAPIntValue();
         Val = Val.lshr((M % Scale) * ScalarSizeInBits);
         Val = Val.getLoBits(ScalarSizeInBits);
-        Zeroable[i] = (Val == 0);
+        if (Val == 0)
+          Zeroable.setBit(i);
       } else if (ConstantFPSDNode *Cst = dyn_cast<ConstantFPSDNode>(Op)) {
         APInt Val = Cst->getValueAPF().bitcastToAPInt();
         Val = Val.lshr((M % Scale) * ScalarSizeInBits);
         Val = Val.getLoBits(ScalarSizeInBits);
-        Zeroable[i] = (Val == 0);
+        if (Val == 0)
+          Zeroable.setBit(i);
       }
       continue;
     }
@@ -8108,7 +8350,8 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
         SDValue Op = V.getOperand((M * Scale) + j);
         AllZeroable &= (Op.isUndef() || X86::isZeroNode(Op));
       }
-      Zeroable[i] = AllZeroable;
+      if (AllZeroable)
+        Zeroable.setBit(i);
       continue;
     }
   }
@@ -8123,19 +8366,20 @@ static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
 //
 // The function looks for a sub-mask that the nonzero elements are in
 // increasing order. If such sub-mask exist. The function returns true.
-static bool isNonZeroElementsInOrder(const SmallBitVector &Zeroable,
+static bool isNonZeroElementsInOrder(const APInt &Zeroable,
                                      ArrayRef<int> Mask, const EVT &VectorType,
                                      bool &IsZeroSideLeft) {
   int NextElement = -1;
   // Check if the Mask's nonzero elements are in increasing order.
-  for (int i = 0, e = Zeroable.size(); i < e; i++) {
+  for (int i = 0, e = Mask.size(); i < e; i++) {
     // Checks if the mask's zeros elements are built from only zeros.
-    if (Mask[i] == -1)
+    assert(Mask[i] >= -1 && "Out of bound mask element!");
+    if (Mask[i] < 0)
       return false;
     if (Zeroable[i])
       continue;
     // Find the lowest non zero element
-    if (NextElement == -1) {
+    if (NextElement < 0) {
       NextElement = Mask[i] != 0 ? VectorType.getVectorNumElements() : 0;
       IsZeroSideLeft = NextElement != 0;
     }
@@ -8151,7 +8395,7 @@ static bool isNonZeroElementsInOrder(const SmallBitVector &Zeroable,
 static SDValue lowerVectorShuffleWithPSHUFB(const SDLoc &DL, MVT VT,
                                             ArrayRef<int> Mask, SDValue V1,
                                             SDValue V2,
-                                            const SmallBitVector &Zeroable,
+                                            const APInt &Zeroable,
                                             const X86Subtarget &Subtarget,
                                             SelectionDAG &DAG) {
   int Size = Mask.size();
@@ -8206,19 +8450,9 @@ static SDValue getMaskNode(SDValue Mask, MVT MaskVT,
                            const X86Subtarget &Subtarget, SelectionDAG &DAG,
                            const SDLoc &dl);
 
-// Function convertBitVectorToUnsigned - The function gets SmallBitVector
-// as argument and convert him to unsigned.
-// The output of the function is not(zeroable)
-static unsigned convertBitVectorToUnsiged(const SmallBitVector &Zeroable) {
-  unsigned convertBit = 0;
-  for (int i = 0, e = Zeroable.size(); i < e; i++)
-    convertBit |= !(Zeroable[i]) << i;
-  return convertBit;
-}
-
 // X86 has dedicated shuffle that can be lowered to VEXPAND
 static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
-                                          const SmallBitVector &Zeroable,
+                                          const APInt &Zeroable,
                                           ArrayRef<int> Mask, SDValue &V1,
                                           SDValue &V2, SelectionDAG &DAG,
                                           const X86Subtarget &Subtarget) {
@@ -8226,7 +8460,7 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
   if (!isNonZeroElementsInOrder(Zeroable, Mask, V1.getValueType(),
                                 IsLeftZeroSide))
     return SDValue();
-  unsigned VEXPANDMask = convertBitVectorToUnsiged(Zeroable);
+  unsigned VEXPANDMask = (~Zeroable).getZExtValue();
   MVT IntegerType =
       MVT::getIntegerVT(std::max((int)VT.getVectorNumElements(), 8));
   SDValue MaskNode = DAG.getConstant(VEXPANDMask, DL, IntegerType);
@@ -8242,6 +8476,91 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT,
                      ZeroVector);
 }
 
+static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2,
+                                        unsigned &UnpackOpcode, bool IsUnary,
+                                        ArrayRef<int> TargetMask,
+                                        const SDLoc &DL, SelectionDAG &DAG,
+                                        const X86Subtarget &Subtarget) {
+  int NumElts = VT.getVectorNumElements();
+  // Undef1/Zero1 describe the even mask entries, Undef2/Zero2 the odd ones.
+  bool Undef1 = true, Undef2 = true, Zero1 = true, Zero2 = true;
+  for (int i = 0; i != NumElts; i += 2) {
+    int M1 = TargetMask[i + 0];
+    int M2 = TargetMask[i + 1];
+    Undef1 &= (SM_SentinelUndef == M1);
+    Undef2 &= (SM_SentinelUndef == M2);
+    Zero1 &= isUndefOrZero(M1);
+    Zero2 &= isUndefOrZero(M2);
+  }
+  assert(!((Undef1 || Zero1) && (Undef2 || Zero2)) &&
+         "Zeroable shuffle detected");
+
+  // Attempt to match the target mask against the unpack lo/hi mask patterns.
+  SmallVector<int, 64> Unpckl, Unpckh;
+  createUnpackShuffleMask(VT, Unpckl, /* Lo = */ true, IsUnary);
+  if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
+    UnpackOpcode = X86ISD::UNPCKL;
+    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
+    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
+    return true;
+  }
+
+  createUnpackShuffleMask(VT, Unpckh, /* Lo = */ false, IsUnary);
+  if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
+    UnpackOpcode = X86ISD::UNPCKH;
+    V2 = (Undef2 ? DAG.getUNDEF(VT) : (IsUnary ? V1 : V2));
+    V1 = (Undef1 ? DAG.getUNDEF(VT) : V1);
+    return true;
+  }
+
+  // If a unary shuffle, attempt to match as an unpack lo/hi with zero.
+  if (IsUnary && (Zero1 || Zero2)) {
+    // Don't bother if we can blend instead.
+    if ((Subtarget.hasSSE41() || VT == MVT::v2i64 || VT == MVT::v2f64) &&
+        isSequentialOrUndefOrZeroInRange(TargetMask, 0, NumElts, 0))
+      return false;
+
+    bool MatchLo = true, MatchHi = true;
+    for (int i = 0; (i != NumElts) && (MatchLo || MatchHi); ++i) {
+      int M = TargetMask[i];
+
+      // Ignore if the input is known to be zero or the index is undef.
+      if ((((i & 1) == 0) && Zero1) || (((i & 1) == 1) && Zero2) ||
+          (M == SM_SentinelUndef))
+        continue;
+
+      MatchLo &= (M == Unpckl[i]);
+      MatchHi &= (M == Unpckh[i]);
+    }
+
+    if (MatchLo || MatchHi) {
+      UnpackOpcode = MatchLo ? X86ISD::UNPCKL : X86ISD::UNPCKH;
+      V2 = Zero2 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
+      V1 = Zero1 ? getZeroVector(VT, Subtarget, DAG, DL) : V1;
+      return true;
+    }
+  }
+
+  // If a binary shuffle, commute and try again.
+  if (!IsUnary) {
+    ShuffleVectorSDNode::commuteMask(Unpckl);
+    if (isTargetShuffleEquivalent(TargetMask, Unpckl)) {
+      UnpackOpcode = X86ISD::UNPCKL;
+      std::swap(V1, V2);
+      return true;
+    }
+
+    ShuffleVectorSDNode::commuteMask(Unpckh);
+    if (isTargetShuffleEquivalent(TargetMask, Unpckh)) {
+      UnpackOpcode = X86ISD::UNPCKH;
+      std::swap(V1, V2);
+      return true;
+    }
+  }
+  // No unpack pattern matched; V1, V2 and UnpackOpcode are left untouched.
+  return false;
+}
+
 // X86 has dedicated unpack instructions that can handle specific blend
 // operations: UNPCKH and UNPCKL.
 static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
@@ -8275,13 +8594,12 @@ static SDValue lowerVectorShuffleWithUNPCK(const SDLoc &DL, MVT VT,
 /// one of the inputs being zeroable.
 static SDValue lowerVectorShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
-                                           const SmallBitVector &Zeroable,
+                                           const APInt &Zeroable,
                                            SelectionDAG &DAG) {
   assert(!VT.isFloatingPoint() && "Floating point types are not supported");
   MVT EltVT = VT.getVectorElementType();
   SDValue Zero = DAG.getConstant(0, DL, EltVT);
-  SDValue AllOnes =
-      DAG.getConstant(APInt::getAllOnesValue(EltVT.getSizeInBits()), DL, EltVT);
+  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
   SmallVector<SDValue, 16> VMaskOps(Mask.size(), Zero);
   SDValue V;
   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
@@ -8313,10 +8631,8 @@ static SDValue lowerVectorShuffleAsBitBlend(const SDLoc &DL, MVT VT, SDValue V1,
                                             SelectionDAG &DAG) {
   assert(VT.isInteger() && "Only supports integer vector types!");
   MVT EltVT = VT.getVectorElementType();
-  int NumEltBits = EltVT.getSizeInBits();
   SDValue Zero = DAG.getConstant(0, DL, EltVT);
-  SDValue AllOnes = DAG.getConstant(APInt::getAllOnesValue(NumEltBits), DL,
-                                    EltVT);
+  SDValue AllOnes = DAG.getAllOnesConstant(DL, EltVT);
   SmallVector<SDValue, 16> MaskOps;
   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
     if (Mask[i] >= 0 && Mask[i] != i && Mask[i] != i + Size)
@@ -8339,29 +8655,25 @@ static SDValue getVectorMaskingNode(SDValue Op, SDValue Mask,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG);
 
-/// \brief Try to emit a blend instruction for a shuffle.
-///
-/// This doesn't do any checks for the availability of instructions for blending
-/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
-/// be matched in the backend with the type given. What it does check for is
-/// that the shuffle mask is a blend, or convertible into a blend with zero.
-static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
-                                         SDValue V2, ArrayRef<int> Original,
-                                         const SmallBitVector &Zeroable,
-                                         const X86Subtarget &Subtarget,
-                                         SelectionDAG &DAG) {
-  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
-  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
-  SmallVector<int, 8> Mask(Original.begin(), Original.end());
-  bool ForceV1Zero = false, ForceV2Zero = false;
+static bool matchVectorShuffleAsBlend(SDValue V1, SDValue V2,
+                                      MutableArrayRef<int> TargetMask,
+                                      bool &ForceV1Zero, bool &ForceV2Zero,
+                                      uint64_t &BlendMask) {
+  bool V1IsZeroOrUndef =
+      V1.isUndef() || ISD::isBuildVectorAllZeros(V1.getNode());
+  bool V2IsZeroOrUndef =
+      V2.isUndef() || ISD::isBuildVectorAllZeros(V2.getNode());
+
+  BlendMask = 0;
+  ForceV1Zero = false, ForceV2Zero = false;
+  assert(TargetMask.size() <= 64 && "Shuffle mask too big for blend mask");
 
   // Attempt to generate the binary blend mask. If an input is zero then
   // we can use any lane.
   // TODO: generalize the zero matching to any scalar like isShuffleEquivalent.
-  uint64_t BlendMask = 0;
-  for (int i = 0, Size = Mask.size(); i < Size; ++i) {
-    int M = Mask[i];
-    if (M < 0)
+  for (int i = 0, Size = TargetMask.size(); i < Size; ++i) {
+    int M = TargetMask[i];
+    if (M == SM_SentinelUndef)
       continue;
     if (M == i)
       continue;
@@ -8369,21 +8681,50 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
       BlendMask |= 1ull << i;
       continue;
     }
-    if (Zeroable[i]) {
-      if (V1IsZero) {
+    if (M == SM_SentinelZero) {
+      if (V1IsZeroOrUndef) {
         ForceV1Zero = true;
-        Mask[i] = i;
+        TargetMask[i] = i;
         continue;
       }
-      if (V2IsZero) {
+      if (V2IsZeroOrUndef) {
         ForceV2Zero = true;
         BlendMask |= 1ull << i;
-        Mask[i] = i + Size;
+        TargetMask[i] = i + Size;
         continue;
       }
     }
-    return SDValue(); // Shuffled input!
+    return false;
   }
+  return true;
+}
+
+uint64_t scaleVectorShuffleBlendMask(uint64_t BlendMask, int Size, int Scale) {
+  // Widen each set bit i of BlendMask into Scale set bits at bit i * Scale.
+  uint64_t Out = 0;
+  for (int i = 0; i != Size; ++i)
+    Out |= ((BlendMask >> i) & 1) ? ((1ull << Scale) - 1) << (i * Scale) : 0;
+  return Out;
+}
+
+/// \brief Try to emit a blend instruction for a shuffle.
+///
+/// This doesn't do any checks for the availability of instructions for blending
+/// these values. It relies on the availability of the X86ISD::BLENDI pattern to
+/// be matched in the backend with the type given. What it does check for is
+/// that the shuffle mask is a blend, or convertible into a blend with zero.
+static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
+                                         SDValue V2, ArrayRef<int> Original,
+                                         const APInt &Zeroable,
+                                         const X86Subtarget &Subtarget,
+                                         SelectionDAG &DAG) {
+  SmallVector<int, 64> Mask = createTargetShuffleMask(Original, Zeroable);
+
+  uint64_t BlendMask = 0;
+  bool ForceV1Zero = false, ForceV2Zero = false;
+  if (!matchVectorShuffleAsBlend(V1, V2, Mask, ForceV1Zero, ForceV2Zero,
+                                 BlendMask))
+    return SDValue();
 
   // Create a REAL zero vector - ISD::isBuildVectorAllZeros allows UNDEFs.
   if (ForceV1Zero)
@@ -8391,14 +8732,6 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
   if (ForceV2Zero)
     V2 = getZeroVector(VT, Subtarget, DAG, DL);
 
-  auto ScaleBlendMask = [](uint64_t BlendMask, int Size, int Scale) {
-    uint64_t ScaledMask = 0;
-    for (int i = 0; i != Size; ++i)
-      if (BlendMask & (1ull << i))
-        ScaledMask |= ((1ull << Scale) - 1) << (i * Scale);
-    return ScaledMask;
-  };
-
   switch (VT.SimpleTy) {
   case MVT::v2f64:
   case MVT::v4f32:
@@ -8418,7 +8751,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
     if (Subtarget.hasAVX2()) {
       // Scale the blend by the number of 32-bit dwords per element.
       int Scale =  VT.getScalarSizeInBits() / 32;
-      BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
+      BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
       MVT BlendVT = VT.getSizeInBits() > 128 ? MVT::v8i32 : MVT::v4i32;
       V1 = DAG.getBitcast(BlendVT, V1);
       V2 = DAG.getBitcast(BlendVT, V2);
@@ -8431,7 +8764,7 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
     // For integer shuffles we need to expand the mask and cast the inputs to
     // v8i16s prior to blending.
     int Scale = 8 / VT.getVectorNumElements();
-    BlendMask = ScaleBlendMask(BlendMask, Mask.size(), Scale);
+    BlendMask = scaleVectorShuffleBlendMask(BlendMask, Mask.size(), Scale);
     V1 = DAG.getBitcast(MVT::v8i16, V1);
     V2 = DAG.getBitcast(MVT::v8i16, V2);
     return DAG.getBitcast(VT,
@@ -8551,7 +8884,7 @@ static SDValue lowerVectorShuffleAsBlendAndPermute(const SDLoc &DL, MVT VT,
   return DAG.getVectorShuffle(VT, DL, V, DAG.getUNDEF(VT), PermuteMask);
 }
 
-/// \brief Generic routine to decompose a shuffle and blend into indepndent
+/// \brief Generic routine to decompose a shuffle and blend into independent
 /// blends and permutes.
 ///
 /// This matches the extremely common pattern for handling combined
@@ -8805,7 +9138,7 @@ static SDValue lowerVectorShuffleAsRotate(const SDLoc &DL, MVT VT,
 static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
                                      unsigned ScalarSizeInBits,
                                      ArrayRef<int> Mask, int MaskOffset,
-                                     const SmallBitVector &Zeroable,
+                                     const APInt &Zeroable,
                                      const X86Subtarget &Subtarget) {
   int Size = Mask.size();
   unsigned SizeInBits = Size * ScalarSizeInBits;
@@ -8867,7 +9200,7 @@ static int matchVectorShuffleAsShift(MVT &ShiftVT, unsigned &Opcode,
 
 static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
                                          SDValue V2, ArrayRef<int> Mask,
-                                         const SmallBitVector &Zeroable,
+                                         const APInt &Zeroable,
                                          const X86Subtarget &Subtarget,
                                          SelectionDAG &DAG) {
   int Size = Mask.size();
@@ -8903,12 +9236,12 @@ static SDValue lowerVectorShuffleAsShift(const SDLoc &DL, MVT VT, SDValue V1,
 /// \brief Try to lower a vector shuffle using SSE4a EXTRQ/INSERTQ.
 static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
                                            SDValue V2, ArrayRef<int> Mask,
-                                           const SmallBitVector &Zeroable,
+                                           const APInt &Zeroable,
                                            SelectionDAG &DAG) {
   int Size = Mask.size();
   int HalfSize = Size / 2;
   assert(Size == (int)VT.getVectorNumElements() && "Unexpected mask size");
-  assert(!Zeroable.all() && "Fully zeroable shuffle mask");
+  assert(!Zeroable.isAllOnesValue() && "Fully zeroable shuffle mask");
 
   // Upper half must be undefined.
   if (!isUndefInRange(Mask, HalfSize, HalfSize))
@@ -9035,7 +9368,7 @@ static SDValue lowerVectorShuffleWithSSE4A(const SDLoc &DL, MVT VT, SDValue V1,
 /// Given a specific number of elements, element bit width, and extension
 /// stride, produce either a zero or any extension based on the available
 /// features of the subtarget. The extended elements are consecutive and
-/// begin and can start from an offseted element index in the input; to
+/// can start from an offset element index in the input; to
 /// avoid excess shuffling the offset must either being in the bottom lane
 /// or at the start of a higher lane. All extended elements must be from
 /// the same lane.
@@ -9075,21 +9408,14 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
   // Found a valid zext mask! Try various lowering strategies based on the
   // input type and available ISA extensions.
   if (Subtarget.hasSSE41()) {
-    // Not worth offseting 128-bit vectors if scale == 2, a pattern using
+    // Not worth offsetting 128-bit vectors if scale == 2, a pattern using
     // PUNPCK will catch this in a later shuffle match.
     if (Offset && Scale == 2 && VT.is128BitVector())
       return SDValue();
     MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(EltBits * Scale),
                                  NumElements / Scale);
     InputV = ShuffleOffset(InputV);
-
-    // For 256-bit vectors, we only need the lower (128-bit) input half.
-    // For 512-bit vectors, we only need the lower input half or quarter.
-    if (VT.getSizeInBits() > 128)
-      InputV = extractSubVector(InputV, 0, DAG, DL,
-                                std::max(128, (int)VT.getSizeInBits() / Scale));
-
-    InputV = DAG.getNode(X86ISD::VZEXT, DL, ExtVT, InputV);
+    InputV = getExtendInVec(X86ISD::VZEXT, DL, ExtVT, InputV, DAG);
     return DAG.getBitcast(VT, InputV);
   }
 
@@ -9206,7 +9532,7 @@ static SDValue lowerVectorShuffleAsSpecificZeroOrAnyExtend(
 /// are both incredibly common and often quite performance sensitive.
 static SDValue lowerVectorShuffleAsZeroOrAnyExtend(
     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
-    const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
+    const APInt &Zeroable, const X86Subtarget &Subtarget,
     SelectionDAG &DAG) {
   int Bits = VT.getSizeInBits();
   int NumLanes = Bits / 128;
@@ -9362,7 +9688,7 @@ static bool isShuffleFoldableLoad(SDValue V) {
 /// across all subtarget feature sets.
 static SDValue lowerVectorShuffleAsElementInsertion(
     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
-    const SmallBitVector &Zeroable, const X86Subtarget &Subtarget,
+    const APInt &Zeroable, const X86Subtarget &Subtarget,
     SelectionDAG &DAG) {
   MVT ExtVT = VT;
   MVT EltVT = VT.getVectorElementType();
@@ -9660,7 +9986,16 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
     if (((BroadcastIdx * EltSize) % 128) != 0)
       return SDValue();
 
-    MVT ExtVT = MVT::getVectorVT(VT.getScalarType(), 128 / EltSize);
+    // The shuffle input might have been a bitcast we looked through; look at
+    // the original input vector.  Emit an EXTRACT_SUBVECTOR of that type; we'll
+    // later bitcast it to BroadcastVT.
+    MVT SrcVT = V.getSimpleValueType();
+    assert(SrcVT.getScalarSizeInBits() == BroadcastVT.getScalarSizeInBits() &&
+           "Unexpected vector element size");
+    assert((SrcVT.is256BitVector() || SrcVT.is512BitVector()) &&
+           "Unexpected vector size");
+
+    MVT ExtVT = MVT::getVectorVT(SrcVT.getScalarType(), 128 / EltSize);
     V = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ExtVT, V,
                     DAG.getIntPtrConstant(BroadcastIdx, DL));
   }
@@ -9690,6 +10025,12 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
     BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
   }
 
+  // We only support broadcasting from 128-bit vectors to minimize the
+  // number of patterns we need to deal with in isel. So extract down to
+  // 128 bits.
+  if (SrcVT.getSizeInBits() > 128)
+    V = extract128BitVector(V, 0, DAG, DL);
+
   return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
 }
 
@@ -9701,7 +10042,7 @@ static SDValue lowerVectorShuffleAsBroadcast(const SDLoc &DL, MVT VT,
 // elements are zeroable.
 static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
                                          unsigned &InsertPSMask,
-                                         const SmallBitVector &Zeroable,
+                                         const APInt &Zeroable,
                                          ArrayRef<int> Mask,
                                          SelectionDAG &DAG) {
   assert(V1.getSimpleValueType().is128BitVector() && "Bad operand type!");
@@ -9790,7 +10131,7 @@ static bool matchVectorShuffleAsInsertPS(SDValue &V1, SDValue &V2,
 
 static SDValue lowerVectorShuffleAsInsertPS(const SDLoc &DL, SDValue V1,
                                             SDValue V2, ArrayRef<int> Mask,
-                                            const SmallBitVector &Zeroable,
+                                            const APInt &Zeroable,
                                             SelectionDAG &DAG) {
   assert(V1.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
   assert(V2.getSimpleValueType() == MVT::v4f32 && "Bad operand type!");
@@ -9925,7 +10266,7 @@ static SDValue lowerVectorShuffleAsPermuteAndUnpack(const SDLoc &DL, MVT VT,
 /// it is better to avoid lowering through this for integer vectors where
 /// possible.
 static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                       const SmallBitVector &Zeroable,
+                                       const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
@@ -10007,7 +10348,7 @@ static SDValue lowerV2F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 /// it falls back to the floating point shuffle operation with appropriate bit
 /// casting.
 static SDValue lowerV2I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                       const SmallBitVector &Zeroable,
+                                       const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
@@ -10226,7 +10567,7 @@ static SDValue lowerVectorShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
 /// domain crossing penalties, as these are sufficient to implement all v4f32
 /// shuffles.
 static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                       const SmallBitVector &Zeroable,
+                                       const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
@@ -10309,7 +10650,7 @@ static SDValue lowerV4F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 /// We try to handle these with integer-domain shuffles where we can, but for
 /// blends we use the floating point domain blend instructions.
 static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                       const SmallBitVector &Zeroable,
+                                       const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
@@ -10401,7 +10742,7 @@ static SDValue lowerV4I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
   // We implement this with SHUFPS because it can blend from two vectors.
   // Because we're going to eventually use SHUFPS, we use SHUFPS even to build
-  // up the inputs, bypassing domain shift penalties that we would encur if we
+  // up the inputs, bypassing domain shift penalties that we would incur if we
   // directly used PSHUFD on Nehalem and older. For newer chips, this isn't
   // relevant.
   SDValue CastV1 = DAG.getBitcast(MVT::v4f32, V1);
@@ -10432,18 +10773,16 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
   assert(VT.getVectorElementType() == MVT::i16 && "Bad input type!");
   MVT PSHUFDVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() / 2);
 
-  assert(Mask.size() == 8 && "Shuffle mask length doen't match!");
+  assert(Mask.size() == 8 && "Shuffle mask length doesn't match!");
   MutableArrayRef<int> LoMask = Mask.slice(0, 4);
   MutableArrayRef<int> HiMask = Mask.slice(4, 4);
 
   SmallVector<int, 4> LoInputs;
-  std::copy_if(LoMask.begin(), LoMask.end(), std::back_inserter(LoInputs),
-               [](int M) { return M >= 0; });
+  copy_if(LoMask, std::back_inserter(LoInputs), [](int M) { return M >= 0; });
   std::sort(LoInputs.begin(), LoInputs.end());
   LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()), LoInputs.end());
   SmallVector<int, 4> HiInputs;
-  std::copy_if(HiMask.begin(), HiMask.end(), std::back_inserter(HiInputs),
-               [](int M) { return M >= 0; });
+  copy_if(HiMask, std::back_inserter(HiInputs), [](int M) { return M >= 0; });
   std::sort(HiInputs.begin(), HiInputs.end());
   HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()), HiInputs.end());
   int NumLToL =
@@ -10622,7 +10961,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
   };
   if ((NumLToL == 3 && NumHToL == 1) || (NumLToL == 1 && NumHToL == 3))
     return balanceSides(LToLInputs, HToLInputs, HToHInputs, LToHInputs, 0, 4);
-  else if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
+  if ((NumHToH == 3 && NumLToH == 1) || (NumHToH == 1 && NumLToH == 3))
     return balanceSides(HToHInputs, LToHInputs, LToLInputs, HToLInputs, 4, 0);
 
   // At this point there are at most two inputs to the low and high halves from
@@ -10878,7 +11217,7 @@ static SDValue lowerV8I16GeneralSingleInputVectorShuffle(
 /// blend if only one input is used.
 static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
     const SDLoc &DL, MVT VT, SDValue V1, SDValue V2, ArrayRef<int> Mask,
-    const SmallBitVector &Zeroable, SelectionDAG &DAG, bool &V1InUse,
+    const APInt &Zeroable, SelectionDAG &DAG, bool &V1InUse,
     bool &V2InUse) {
   SDValue V1Mask[16];
   SDValue V2Mask[16];
@@ -10939,7 +11278,7 @@ static SDValue lowerVectorShuffleAsBlendOfPSHUFBs(
 /// halves of the inputs separately (making them have relatively few inputs)
 /// and then concatenate them.
 static SDValue lowerV8I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                       const SmallBitVector &Zeroable,
+                                       const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
@@ -11123,7 +11462,7 @@ static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
 /// the existing lowering for v8i16 blends on each half, finally PACK-ing them
 /// back together.
 static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                       const SmallBitVector &Zeroable,
+                                       const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
@@ -11180,14 +11519,13 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
       if (!canWidenViaDuplication(Mask))
         return SDValue();
       SmallVector<int, 4> LoInputs;
-      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(LoInputs),
-                   [](int M) { return M >= 0 && M < 8; });
+      copy_if(Mask, std::back_inserter(LoInputs),
+              [](int M) { return M >= 0 && M < 8; });
       std::sort(LoInputs.begin(), LoInputs.end());
       LoInputs.erase(std::unique(LoInputs.begin(), LoInputs.end()),
                      LoInputs.end());
       SmallVector<int, 4> HiInputs;
-      std::copy_if(Mask.begin(), Mask.end(), std::back_inserter(HiInputs),
-                   [](int M) { return M >= 8; });
+      copy_if(Mask, std::back_inserter(HiInputs), [](int M) { return M >= 8; });
       std::sort(HiInputs.begin(), HiInputs.end());
       HiInputs.erase(std::unique(HiInputs.begin(), HiInputs.end()),
                      HiInputs.end());
@@ -11241,7 +11579,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
             PostDupI16Shuffle[i / 2] = MappedMask;
           else
             assert(PostDupI16Shuffle[i / 2] == MappedMask &&
-                   "Conflicting entrties in the original shuffle!");
+                   "Conflicting entries in the original shuffle!");
         }
       return DAG.getBitcast(
           MVT::v16i8,
@@ -11413,7 +11751,7 @@ static SDValue lowerV16I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 /// dispatches to the lowering routines accordingly.
 static SDValue lower128BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                         MVT VT, SDValue V1, SDValue V2,
-                                        const SmallBitVector &Zeroable,
+                                        const APInt &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
   switch (VT.SimpleTy) {
@@ -11669,7 +12007,7 @@ static SDValue lowerVectorShuffleAsLanePermuteAndBlend(const SDLoc &DL, MVT VT,
 /// \brief Handle lowering 2-lane 128-bit shuffles.
 static SDValue lowerV2X128VectorShuffle(const SDLoc &DL, MVT VT, SDValue V1,
                                         SDValue V2, ArrayRef<int> Mask,
-                                        const SmallBitVector &Zeroable,
+                                        const APInt &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
   SmallVector<int, 4> WidenedMask;
@@ -12139,7 +12477,7 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
                                          unsigned &ShuffleImm,
                                          ArrayRef<int> Mask) {
   int NumElts = VT.getVectorNumElements();
-  assert(VT.getScalarType() == MVT::f64 &&
+  assert(VT.getScalarSizeInBits() == 64 &&
          (NumElts == 2 || NumElts == 4 || NumElts == 8) &&
          "Unexpected data type for VSHUFPD");
 
@@ -12175,6 +12513,9 @@ static bool matchVectorShuffleWithSHUFPD(MVT VT, SDValue &V1, SDValue &V2,
 static SDValue lowerVectorShuffleWithSHUFPD(const SDLoc &DL, MVT VT,
                                             ArrayRef<int> Mask, SDValue V1,
                                             SDValue V2, SelectionDAG &DAG) {
+  assert((VT == MVT::v2f64 || VT == MVT::v4f64 || VT == MVT::v8f64)&&
+         "Unexpected data type for VSHUFPD");
+
   unsigned Immediate = 0;
   if (!matchVectorShuffleWithSHUFPD(VT, V1, V2, Immediate, Mask))
     return SDValue();
@@ -12201,7 +12542,7 @@ static SDValue lowerVectorShuffleWithPERMV(const SDLoc &DL, MVT VT,
 /// Also ends up handling lowering of 4-lane 64-bit integer shuffles when AVX2
 /// isn't available.
 static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                       const SmallBitVector &Zeroable,
+                                       const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
@@ -12298,7 +12639,7 @@ static SDValue lowerV4F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 /// This routine is only called when we have AVX2 and thus a reasonable
 /// instruction set for v4i64 shuffling..
 static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                       const SmallBitVector &Zeroable,
+                                       const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
@@ -12386,7 +12727,7 @@ static SDValue lowerV4I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 /// Also ends up handling lowering of 8-lane 32-bit integer shuffles when AVX2
 /// isn't available.
 static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                       const SmallBitVector &Zeroable,
+                                       const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
@@ -12462,6 +12803,14 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                                V1, V2, DAG, Subtarget))
       return V;
 
+  // For non-AVX512, if the Mask is of 16-bit elements in lane then try to
+  // split, since after the split we get more efficient code using vpunpcklwd
+  // and vpunpckhwd instrs than with vblend.
+  if (!Subtarget.hasAVX512() && isUnpackWdShuffleMask(Mask, MVT::v8f32))
+    if (SDValue V = lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8f32, V1, V2,
+                                                     Mask, DAG))
+      return V;
+
   // If we have AVX2 then we always want to lower with a blend because at v8 we
   // can fully permute the elements.
   if (Subtarget.hasAVX2())
@@ -12477,7 +12826,7 @@ static SDValue lowerV8F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 /// This routine is only called when we have AVX2 and thus a reasonable
 /// instruction set for v8i32 shuffling..
 static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                       const SmallBitVector &Zeroable,
+                                       const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
@@ -12493,6 +12842,15 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
           DL, MVT::v8i32, V1, V2, Mask, Zeroable, Subtarget, DAG))
     return ZExt;
 
+  // For non-AVX512, if the Mask is of 16-bit elements in lane then try to
+  // split, since after the split we get more efficient code than vblend by
+  // using vpunpcklwd and vpunpckhwd instrs.
+  if (isUnpackWdShuffleMask(Mask, MVT::v8i32) && !V2.isUndef() &&
+      !Subtarget.hasAVX512())
+    if (SDValue V =
+            lowerVectorShuffleAsSplitOrBlend(DL, MVT::v8i32, V1, V2, Mask, DAG))
+      return V;
+
   if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8i32, V1, V2, Mask,
                                                 Zeroable, Subtarget, DAG))
     return Blend;
@@ -12581,7 +12939,7 @@ static SDValue lowerV8I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 /// This routine is only called when we have AVX2 and thus a reasonable
 /// instruction set for v16i16 shuffling..
 static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                        const SmallBitVector &Zeroable,
+                                        const APInt &Zeroable,
                                         SDValue V1, SDValue V2,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
@@ -12667,7 +13025,7 @@ static SDValue lowerV16I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 /// This routine is only called when we have AVX2 and thus a reasonable
 /// instruction set for v32i8 shuffling..
 static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                       const SmallBitVector &Zeroable,
+                                       const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
@@ -12740,7 +13098,7 @@ static SDValue lowerV32I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 /// together based on the available instructions.
 static SDValue lower256BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                         MVT VT, SDValue V1, SDValue V2,
-                                        const SmallBitVector &Zeroable,
+                                        const APInt &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
   // If we have a single input to the zero element, insert that into V1 if we
@@ -12892,7 +13250,7 @@ static SDValue lowerV4X128VectorShuffle(const SDLoc &DL, MVT VT,
 
 /// \brief Handle lowering of 8-lane 64-bit floating point shuffles.
 static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                       const SmallBitVector &Zeroable,
+                                       const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
@@ -12948,7 +13306,7 @@ static SDValue lowerV8F64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
 /// \brief Handle lowering of 16-lane 32-bit floating point shuffles.
 static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                        const SmallBitVector &Zeroable,
+                                        const APInt &Zeroable,
                                         SDValue V1, SDValue V2,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
@@ -12994,7 +13352,7 @@ static SDValue lowerV16F32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
 /// \brief Handle lowering of 8-lane 64-bit integer shuffles.
 static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                       const SmallBitVector &Zeroable,
+                                       const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
@@ -13059,7 +13417,7 @@ static SDValue lowerV8I64VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
 /// \brief Handle lowering of 16-lane 32-bit integer shuffles.
 static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                        const SmallBitVector &Zeroable,
+                                        const APInt &Zeroable,
                                         SDValue V1, SDValue V2,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
@@ -13130,7 +13488,7 @@ static SDValue lowerV16I32VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
 /// \brief Handle lowering of 32-lane 16-bit integer shuffles.
 static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                        const SmallBitVector &Zeroable,
+                                        const APInt &Zeroable,
                                         SDValue V1, SDValue V2,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
@@ -13181,7 +13539,7 @@ static SDValue lowerV32I16VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 
 /// \brief Handle lowering of 64-lane 8-bit integer shuffles.
 static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
-                                       const SmallBitVector &Zeroable,
+                                       const APInt &Zeroable,
                                        SDValue V1, SDValue V2,
                                        const X86Subtarget &Subtarget,
                                        SelectionDAG &DAG) {
@@ -13241,7 +13599,7 @@ static SDValue lowerV64I8VectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
 /// together based on the available instructions.
 static SDValue lower512BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
                                         MVT VT, SDValue V1, SDValue V2,
-                                        const SmallBitVector &Zeroable,
+                                        const APInt &Zeroable,
                                         const X86Subtarget &Subtarget,
                                         SelectionDAG &DAG) {
   assert(Subtarget.hasAVX512() &&
@@ -13322,7 +13680,7 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   if (ISD::isBuildVectorAllZeros(V1.getNode()))
     V1 = getZeroVector(ExtVT, Subtarget, DAG, DL);
   else if (ISD::isBuildVectorAllOnes(V1.getNode()))
-    V1 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+    V1 = getOnesVector(ExtVT, DAG, DL);
   else
     V1 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V1);
 
@@ -13331,7 +13689,7 @@ static SDValue lower1BitVectorShuffle(const SDLoc &DL, ArrayRef<int> Mask,
   else if (ISD::isBuildVectorAllZeros(V2.getNode()))
     V2 = getZeroVector(ExtVT, Subtarget, DAG, DL);
   else if (ISD::isBuildVectorAllOnes(V2.getNode()))
-    V2 = getOnesVector(ExtVT, Subtarget, DAG, DL);
+    V2 = getOnesVector(ExtVT, DAG, DL);
   else
     V2 = DAG.getNode(ISD::SIGN_EXTEND, DL, ExtVT, V2);
 
@@ -13463,8 +13821,8 @@ static SDValue lowerVectorShuffle(SDValue Op, const X86Subtarget &Subtarget,
   // We actually see shuffles that are entirely re-arrangements of a set of
   // zero inputs. This mostly happens while decomposing complex shuffles into
   // simple ones. Directly lower these as a buildvector of zeros.
-  SmallBitVector Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
-  if (Zeroable.all())
+  APInt Zeroable = computeZeroableShuffleElements(Mask, V1, V2);
+  if (Zeroable.isAllOnesValue())
     return getZeroVector(VT, Subtarget, DAG, DL);
 
   // Try to collapse shuffles into using a vector type with fewer elements but
@@ -13640,10 +13998,14 @@ X86TargetLowering::ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG) const
          "Unexpected vector type in ExtractBitFromMaskVector");
 
   // variable index can't be handled in mask registers,
-  // extend vector to VR512
+  // extend vector to VR512/128
   if (!isa<ConstantSDNode>(Idx)) {
-    MVT ExtVT = (VecVT == MVT::v8i1 ?  MVT::v8i64 : MVT::v16i32);
-    SDValue Ext = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, Vec);
+    unsigned NumElts = VecVT.getVectorNumElements();
+    // Extending v8i1/v16i1 to 512-bit gets better performance on KNL
+    // than extending to 128/256bit.
+    unsigned VecSize = (NumElts <= 4 ? 128 : 512);
+    MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(VecSize/NumElts), NumElts);
+    SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND, dl, ExtVT, Vec);
     SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl,
                               ExtVT.getVectorElementType(), Ext, Idx);
     return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
@@ -13681,24 +14043,36 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
     return ExtractBitFromMaskVector(Op, DAG);
 
   if (!isa<ConstantSDNode>(Idx)) {
-    if (VecVT.is512BitVector() ||
-        (VecVT.is256BitVector() && Subtarget.hasInt256() &&
-         VecVT.getScalarSizeInBits() == 32)) {
-
-      MVT MaskEltVT =
-        MVT::getIntegerVT(VecVT.getScalarSizeInBits());
-      MVT MaskVT = MVT::getVectorVT(MaskEltVT, VecVT.getSizeInBits() /
-                                    MaskEltVT.getSizeInBits());
+    // It's more profitable to go through memory (1 cycle throughput)
+    // than to use a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
+    // The IACA tool was used to get the performance estimates
+    // (https://software.intel.com/en-us/articles/intel-architecture-code-analyzer)
+    //
+    // example : extractelement <16 x i8> %a, i32 %i
+    //
+    // Block Throughput: 3.00 Cycles
+    // Throughput Bottleneck: Port5
+    //
+    // | Num Of |   Ports pressure in cycles  |    |
+    // |  Uops  |  0  - DV  |  5  |  6  |  7  |    |
+    // ---------------------------------------------
+    // |   1    |           | 1.0 |     |     | CP | vmovd xmm1, edi
+    // |   1    |           | 1.0 |     |     | CP | vpshufb xmm0, xmm0, xmm1
+    // |   2    | 1.0       | 1.0 |     |     | CP | vpextrb eax, xmm0, 0x0
+    // Total Num Of Uops: 4
+    //
+    //
+    // Block Throughput: 1.00 Cycles
+    // Throughput Bottleneck: PORT2_AGU, PORT3_AGU, Port4
+    //
+    // |    |  Ports pressure in cycles   |  |
+    // |Uops| 1 | 2 - D  |3 -  D  | 4 | 5 |  |
+    // ---------------------------------------------------------
+    // |2^  |   | 0.5    | 0.5    |1.0|   |CP| vmovaps xmmword ptr [rsp-0x18], xmm0
+    // |1   |0.5|        |        |   |0.5|  | lea rax, ptr [rsp-0x18]
+    // |1   |   |0.5, 0.5|0.5, 0.5|   |   |CP| mov al, byte ptr [rdi+rax*1]
+    // Total Num Of Uops: 4
 
-      Idx = DAG.getZExtOrTrunc(Idx, dl, MaskEltVT);
-      auto PtrVT = getPointerTy(DAG.getDataLayout());
-      SDValue Mask = DAG.getNode(X86ISD::VINSERT, dl, MaskVT,
-                                 getZeroVector(MaskVT, Subtarget, DAG, dl), Idx,
-                                 DAG.getConstant(0, dl, PtrVT));
-      SDValue Perm = DAG.getNode(X86ISD::VPERMV, dl, VecVT, Mask, Vec);
-      return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, Op.getValueType(), Perm,
-                         DAG.getConstant(0, dl, PtrVT));
-    }
     return SDValue();
   }
 
@@ -13746,7 +14120,33 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
     if (SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG))
       return Res;
 
-  // TODO: handle v16i8.
+  // TODO: We only extract a single element from v16i8, we can probably afford
+  // to be more aggressive here before using the default approach of spilling to
+  // stack.
+  if (VT.getSizeInBits() == 8 && Op->isOnlyUserOf(Vec.getNode())) {
+    // Extract either the lowest i32 or any i16, and extract the sub-byte.
+    int DWordIdx = IdxVal / 4;
+    if (DWordIdx == 0) {
+      SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+                                DAG.getBitcast(MVT::v4i32, Vec),
+                                DAG.getIntPtrConstant(DWordIdx, dl));
+      int ShiftVal = (IdxVal % 4) * 8;
+      if (ShiftVal != 0)
+        Res = DAG.getNode(ISD::SRL, dl, MVT::i32, Res,
+                          DAG.getConstant(ShiftVal, dl, MVT::i32));
+      return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+    }
+
+    int WordIdx = IdxVal / 2;
+    SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16,
+                              DAG.getBitcast(MVT::v8i16, Vec),
+                              DAG.getIntPtrConstant(WordIdx, dl));
+    int ShiftVal = (IdxVal % 2) * 8;
+    if (ShiftVal != 0)
+      Res = DAG.getNode(ISD::SRL, dl, MVT::i16, Res,
+                        DAG.getConstant(ShiftVal, dl, MVT::i16));
+    return DAG.getNode(ISD::TRUNCATE, dl, VT, Res);
+  }
 
   if (VT.getSizeInBits() == 32) {
     if (IdxVal == 0)
@@ -13861,17 +14261,21 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
   auto *N2C = cast<ConstantSDNode>(N2);
   unsigned IdxVal = N2C->getZExtValue();
 
-  // If we are clearing out a element, we do this more efficiently with a
-  // blend shuffle than a costly integer insertion.
-  // TODO: would other rematerializable values (e.g. allbits) benefit as well?
+  bool IsZeroElt = X86::isZeroNode(N1);
+  bool IsAllOnesElt = VT.isInteger() && llvm::isAllOnesConstant(N1);
+
+  // If we are inserting an element, see if we can do this more efficiently with
+  // a blend shuffle with a rematerializable vector than a costly integer
+  // insertion.
   // TODO: pre-SSE41 targets will tend to use bit masking - this could still
   // be beneficial if we are inserting several zeros and can combine the masks.
-  if (X86::isZeroNode(N1) && Subtarget.hasSSE41() && NumElts <= 8) {
-    SmallVector<int, 8> ClearMask;
+  if ((IsZeroElt || IsAllOnesElt) && Subtarget.hasSSE41() && NumElts <= 8) {
+    SmallVector<int, 8> BlendMask;
     for (unsigned i = 0; i != NumElts; ++i)
-      ClearMask.push_back(i == IdxVal ? i + NumElts : i);
-    SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, dl);
-    return DAG.getVectorShuffle(VT, dl, N0, ZeroVector, ClearMask);
+      BlendMask.push_back(i == IdxVal ? i + NumElts : i);
+    SDValue CstVector = IsZeroElt ? getZeroVector(VT, Subtarget, DAG, dl)
+                                  : DAG.getConstant(-1, dl, VT);
+    return DAG.getVectorShuffle(VT, dl, N0, CstVector, BlendMask);
   }
 
   // If the vector is wider than 128 bits, extract the 128-bit subvector, insert
@@ -13908,25 +14312,27 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
   }
   assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
 
-  if (Subtarget.hasSSE41()) {
-    if (EltVT.getSizeInBits() == 8 || EltVT.getSizeInBits() == 16) {
-      unsigned Opc;
-      if (VT == MVT::v8i16) {
-        Opc = X86ISD::PINSRW;
-      } else {
-        assert(VT == MVT::v16i8);
-        Opc = X86ISD::PINSRB;
-      }
-
-      // Transform it so it match pinsr{b,w} which expects a GR32 as its second
-      // argument.
-      if (N1.getValueType() != MVT::i32)
-        N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
-      if (N2.getValueType() != MVT::i32)
-        N2 = DAG.getIntPtrConstant(IdxVal, dl);
-      return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+  // Transform it so it matches pinsr{b,w} which expects a GR32 as its second
+  // argument. SSE41 required for pinsrb.
+  if (VT == MVT::v8i16 || (VT == MVT::v16i8 && Subtarget.hasSSE41())) {
+    unsigned Opc;
+    if (VT == MVT::v8i16) {
+      assert(Subtarget.hasSSE2() && "SSE2 required for PINSRW");
+      Opc = X86ISD::PINSRW;
+    } else {
+      assert(VT == MVT::v16i8 && "PINSRB requires v16i8 vector");
+      assert(Subtarget.hasSSE41() && "SSE41 required for PINSRB");
+      Opc = X86ISD::PINSRB;
     }
 
+    if (N1.getValueType() != MVT::i32)
+      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
+    if (N2.getValueType() != MVT::i32)
+      N2 = DAG.getIntPtrConstant(IdxVal, dl);
+    return DAG.getNode(Opc, dl, VT, N0, N1, N2);
+  }
+
+  if (Subtarget.hasSSE41()) {
     if (EltVT == MVT::f32) {
       // Bits [7:6] of the constant are the source select. This will always be
       //   zero here. The DAG Combiner may combine an extract_elt index into
@@ -13956,24 +14362,11 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
       return DAG.getNode(X86ISD::INSERTPS, dl, VT, N0, N1, N2);
     }
 
-    if (EltVT == MVT::i32 || EltVT == MVT::i64) {
-      // PINSR* works with constant index.
+    // PINSR* works with constant index.
+    if (EltVT == MVT::i32 || EltVT == MVT::i64)
       return Op;
-    }
   }
 
-  if (EltVT == MVT::i8)
-    return SDValue();
-
-  if (EltVT.getSizeInBits() == 16) {
-    // Transform it so it match pinsrw which expects a 16-bit value in a GR32
-    // as its second argument.
-    if (N1.getValueType() != MVT::i32)
-      N1 = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, N1);
-    if (N2.getValueType() != MVT::i32)
-      N2 = DAG.getIntPtrConstant(IdxVal, dl);
-    return DAG.getNode(X86ISD::PINSRW, dl, VT, N0, N1, N2);
-  }
   return SDValue();
 }
 
@@ -14028,20 +14421,14 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
           In.getSimpleValueType().is512BitVector()) &&
          "Can only extract from 256-bit or 512-bit vectors");
 
-  if (ResVT.is128BitVector())
-    return extract128BitVector(In, IdxVal, DAG, dl);
-  if (ResVT.is256BitVector())
-    return extract256BitVector(In, IdxVal, DAG, dl);
-
-  llvm_unreachable("Unimplemented!");
-}
+  // If the input is a buildvector just emit a smaller one.
+  unsigned ElemsPerChunk = ResVT.getVectorNumElements();
+  if (In.getOpcode() == ISD::BUILD_VECTOR)
+    return DAG.getNode(ISD::BUILD_VECTOR, dl, ResVT,
+                       makeArrayRef(In->op_begin() + IdxVal, ElemsPerChunk));
 
-static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) {
-  for (SDNode::use_iterator I = N->use_begin(), E = N->use_end(); I != E; ++I)
-    if (llvm::all_of(ValidUsers,
-                     [&I](SDValue V) { return V.getNode() != *I; }))
-      return false;
-  return true;
+  // Everything else is legal.
+  return Op;
 }
 
 // Lower a node with an INSERT_SUBVECTOR opcode.  This may result in a
@@ -14049,83 +14436,9 @@ static bool areOnlyUsersOf(SDNode *N, ArrayRef<SDValue> ValidUsers) {
 // the upper bits of a vector.
 static SDValue LowerINSERT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
                                      SelectionDAG &DAG) {
-  assert(Subtarget.hasAVX() && "INSERT_SUBVECTOR requires AVX");
-
-  SDLoc dl(Op);
-  SDValue Vec = Op.getOperand(0);
-  SDValue SubVec = Op.getOperand(1);
-  SDValue Idx = Op.getOperand(2);
-
-  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
-  MVT OpVT = Op.getSimpleValueType();
-  MVT SubVecVT = SubVec.getSimpleValueType();
-
-  if (OpVT.getVectorElementType() == MVT::i1)
-    return insert1BitVector(Op, DAG, Subtarget);
-
-  assert((OpVT.is256BitVector() || OpVT.is512BitVector()) &&
-         "Can only insert into 256-bit or 512-bit vectors");
-
-  // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
-  // load:
-  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
-  //                   (load16 addr + 16), Elts/2)
-  // --> load32 addr
-  // or:
-  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
-  //                   (load32 addr + 32), Elts/2)
-  // --> load64 addr
-  // or a 16-byte or 32-byte broadcast:
-  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
-  //                   (load16 addr), Elts/2)
-  // --> X86SubVBroadcast(load16 addr)
-  // or:
-  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
-  //                   (load32 addr), Elts/2)
-  // --> X86SubVBroadcast(load32 addr)
-  if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
-      Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
-      OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
-    auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
-    if (Idx2 && Idx2->getZExtValue() == 0) {
-      SDValue SubVec2 = Vec.getOperand(1);
-      // If needed, look through bitcasts to get to the load.
-      if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
-        bool Fast;
-        unsigned Alignment = FirstLd->getAlignment();
-        unsigned AS = FirstLd->getAddressSpace();
-        const X86TargetLowering *TLI = Subtarget.getTargetLowering();
-        if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
-                                    OpVT, AS, Alignment, &Fast) && Fast) {
-          SDValue Ops[] = {SubVec2, SubVec};
-          if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
-            return Ld;
-        }
-      }
-      // If lower/upper loads are the same and the only users of the load, then
-      // lower to a VBROADCASTF128/VBROADCASTI128/etc.
-      if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
-        if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
-            areOnlyUsersOf(SubVec2.getNode(), {Op, Vec})) {
-          return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
-        }
-      }
-      // If this is subv_broadcast insert into both halves, use a larger
-      // subv_broadcast.
-      if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
-        return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
-                           SubVec.getOperand(0));
-      }
-    }
-  }
+  assert(Op.getSimpleValueType().getVectorElementType() == MVT::i1);
 
-  if (SubVecVT.is128BitVector())
-    return insert128BitVector(Vec, SubVec, IdxVal, DAG, dl);
-
-  if (SubVecVT.is256BitVector())
-    return insert256BitVector(Vec, SubVec, IdxVal, DAG, dl);
-
-  llvm_unreachable("Unimplemented!");
+  return insert1BitVector(Op, DAG, Subtarget);
 }
 
 // Returns the appropriate wrapper opcode for a global reference.
@@ -14143,7 +14456,7 @@ unsigned X86TargetLowering::getGlobalWrapperKind(const GlobalValue *GV) const {
 }
 
 // ConstantPool, JumpTable, GlobalAddress, and ExternalSymbol are lowered as
-// their target countpart wrapped in the X86ISD::Wrapper node. Suppose N is
+// their target counterpart wrapped in the X86ISD::Wrapper node. Suppose N is
 // one of the above mentioned nodes. It has to be wrapped because otherwise
 // Select(N) returns N. So the raw TargetGlobalAddress nodes, etc. can only
 // be used to form addressing mode. These wrapped nodes will be selected
@@ -14519,7 +14832,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
       Subtarget.isTargetWindowsItanium() ||
       Subtarget.isTargetWindowsGNU()) {
     // Just use the implicit TLS architecture
-    // Need to generate someting similar to:
+    // Need to generate something similar to:
     //   mov     rdx, qword [gs:abs 58H]; Load pointer to ThreadLocalStorage
     //                                  ; from TEB
     //   mov     ecx, dword [rel _tls_index]: Load index (from C runtime)
@@ -15570,32 +15883,21 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
     // word to byte only under BWI
     if (InVT == MVT::v16i16 && !Subtarget.hasBWI()) // v16i16 -> v16i8
       return DAG.getNode(X86ISD::VTRUNC, DL, VT,
-                         DAG.getNode(X86ISD::VSEXT, DL, MVT::v16i32, In));
+                         getExtendInVec(X86ISD::VSEXT, DL, MVT::v16i32, In, DAG));
     return DAG.getNode(X86ISD::VTRUNC, DL, VT, In);
   }
 
-  // Truncate with PACKSS if we are truncating a vector comparison result.
-  // TODO: We should be able to support other operations as long as we
-  // we are saturating+packing zero/all bits only.
-  auto IsPackableComparison = [](SDValue V) {
-    unsigned Opcode = V.getOpcode();
-    return (Opcode == X86ISD::PCMPGT || Opcode == X86ISD::PCMPEQ ||
-            Opcode == X86ISD::CMPP);
-  };
-
-  if (IsPackableComparison(In) || (In.getOpcode() == ISD::CONCAT_VECTORS &&
-                                   all_of(In->ops(), IsPackableComparison))) {
+  // Truncate with PACKSS if we are truncating a vector zero/all-bits result.
+  if (InVT.getScalarSizeInBits() == DAG.ComputeNumSignBits(In))
     if (SDValue V = truncateVectorCompareWithPACKSS(VT, In, DL, DAG, Subtarget))
       return V;
-  }
 
   if ((VT == MVT::v4i32) && (InVT == MVT::v4i64)) {
     // On AVX2, v4i64 -> v4i32 becomes VPERMD.
     if (Subtarget.hasInt256()) {
       static const int ShufMask[] = {0, 2, 4, 6, -1, -1, -1, -1};
       In = DAG.getBitcast(MVT::v8i32, In);
-      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, DAG.getUNDEF(MVT::v8i32),
-                                ShufMask);
+      In = DAG.getVectorShuffle(MVT::v8i32, DL, In, In, ShufMask);
       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, In,
                          DAG.getIntPtrConstant(0, DL));
     }
@@ -15611,30 +15913,20 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
   }
 
   if ((VT == MVT::v8i16) && (InVT == MVT::v8i32)) {
-    // On AVX2, v8i32 -> v8i16 becomed PSHUFB.
+    // On AVX2, v8i32 -> v8i16 becomes PSHUFB.
     if (Subtarget.hasInt256()) {
       In = DAG.getBitcast(MVT::v32i8, In);
 
-      SmallVector<SDValue,32> pshufbMask;
-      for (unsigned i = 0; i < 2; ++i) {
-        pshufbMask.push_back(DAG.getConstant(0x0, DL, MVT::i8));
-        pshufbMask.push_back(DAG.getConstant(0x1, DL, MVT::i8));
-        pshufbMask.push_back(DAG.getConstant(0x4, DL, MVT::i8));
-        pshufbMask.push_back(DAG.getConstant(0x5, DL, MVT::i8));
-        pshufbMask.push_back(DAG.getConstant(0x8, DL, MVT::i8));
-        pshufbMask.push_back(DAG.getConstant(0x9, DL, MVT::i8));
-        pshufbMask.push_back(DAG.getConstant(0xc, DL, MVT::i8));
-        pshufbMask.push_back(DAG.getConstant(0xd, DL, MVT::i8));
-        for (unsigned j = 0; j < 8; ++j)
-          pshufbMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
-      }
-      SDValue BV = DAG.getBuildVector(MVT::v32i8, DL, pshufbMask);
-      In = DAG.getNode(X86ISD::PSHUFB, DL, MVT::v32i8, In, BV);
+      // The PSHUFB mask:
+      static const int ShufMask1[] = { 0,  1,  4,  5,  8,  9, 12, 13,
+                                      -1, -1, -1, -1, -1, -1, -1, -1,
+                                      16, 17, 20, 21, 24, 25, 28, 29,
+                                      -1, -1, -1, -1, -1, -1, -1, -1 };
+      In = DAG.getVectorShuffle(MVT::v32i8, DL, In, In, ShufMask1);
       In = DAG.getBitcast(MVT::v4i64, In);
 
-      static const int ShufMask[] = {0,  2,  -1,  -1};
-      In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, DAG.getUNDEF(MVT::v4i64),
-                                ShufMask);
+      static const int ShufMask2[] = {0,  2,  -1,  -1};
+      In = DAG.getVectorShuffle(MVT::v4i64, DL,  In, In, ShufMask2);
       In = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MVT::v2i64, In,
                        DAG.getIntPtrConstant(0, DL));
       return DAG.getBitcast(VT, In);
@@ -15653,9 +15945,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
     static const int ShufMask1[] = {0,  1,  4,  5,  8,  9, 12, 13,
                                    -1, -1, -1, -1, -1, -1, -1, -1};
 
-    SDValue Undef = DAG.getUNDEF(MVT::v16i8);
-    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, Undef, ShufMask1);
-    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, Undef, ShufMask1);
+    OpLo = DAG.getVectorShuffle(MVT::v16i8, DL, OpLo, OpLo, ShufMask1);
+    OpHi = DAG.getVectorShuffle(MVT::v16i8, DL, OpHi, OpHi, ShufMask1);
 
     OpLo = DAG.getBitcast(MVT::v4i32, OpLo);
     OpHi = DAG.getBitcast(MVT::v4i32, OpHi);
@@ -15679,17 +15970,14 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
   // Prepare truncation shuffle mask
   for (unsigned i = 0; i != NumElems; ++i)
     MaskVec[i] = i * 2;
-  SDValue V = DAG.getVectorShuffle(NVT, DL, DAG.getBitcast(NVT, In),
-                                   DAG.getUNDEF(NVT), MaskVec);
+  In = DAG.getBitcast(NVT, In);
+  SDValue V = DAG.getVectorShuffle(NVT, DL, In, In, MaskVec);
   return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V,
                      DAG.getIntPtrConstant(0, DL));
 }
 
-SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
-                                          const X86Subtarget &Subtarget,
-                                          SelectionDAG &DAG) const {
+SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
   bool IsSigned = Op.getOpcode() == ISD::FP_TO_SINT;
-
   MVT VT = Op.getSimpleValueType();
 
   if (VT.isVector()) {
@@ -15697,8 +15985,7 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op,
     SDValue Src = Op.getOperand(0);
     SDLoc dl(Op);
     if (VT == MVT::v2i64 && Src.getSimpleValueType() == MVT::v2f32) {
-      return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI,
-                         dl, VT,
+      return DAG.getNode(IsSigned ? X86ISD::CVTTP2SI : X86ISD::CVTTP2UI, dl, VT,
                          DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
                                      DAG.getUNDEF(MVT::v2f32)));
     }
@@ -15972,7 +16259,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
   for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
     VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
 
-  // If more than one full vectors are evaluated, OR them first before PTEST.
+  // If more than one full vector is evaluated, OR them first before PTEST.
   for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
     // Each iteration will OR 2 nodes and append the result until there is only
     // 1 node left, i.e. the final OR'd value of all vectors.
@@ -15981,8 +16268,7 @@ static SDValue LowerVectorAllZeroTest(SDValue Op, const X86Subtarget &Subtarget,
     VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
   }
 
-  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32,
-                     VecIns.back(), VecIns.back());
+  return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
 }
 
 /// \brief return true if \c Op has a use that doesn't just read flags.
@@ -16447,7 +16733,7 @@ SDValue X86TargetLowering::getRecipEstimate(SDValue Op, SelectionDAG &DAG,
 }
 
 /// If we have at least two divisions that use the same divisor, convert to
-/// multplication by a reciprocal. This may need to be adjusted for a given
+/// multiplication by a reciprocal. This may need to be adjusted for a given
 /// CPU if a division's cost is not at least twice the cost of a multiplication.
 /// This is because we still need one division to calculate the reciprocal and
 /// then we need two multiplies by that reciprocal as replacements for the
@@ -17543,17 +17829,10 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
 
   // SKX processor
   if ((InVTElt == MVT::i1) &&
-      (((Subtarget.hasBWI() && Subtarget.hasVLX() &&
-        VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() <= 16)) ||
-
-       ((Subtarget.hasBWI() && VT.is512BitVector() &&
-        VTElt.getSizeInBits() <= 16)) ||
+      (((Subtarget.hasBWI() && VTElt.getSizeInBits() <= 16)) ||
 
-       ((Subtarget.hasDQI() && Subtarget.hasVLX() &&
-        VT.getSizeInBits() <= 256 && VTElt.getSizeInBits() >= 32)) ||
+       ((Subtarget.hasDQI() && VTElt.getSizeInBits() >= 32))))
 
-       ((Subtarget.hasDQI() && VT.is512BitVector() &&
-        VTElt.getSizeInBits() >= 32))))
     return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
 
   unsigned NumElts = VT.getVectorNumElements();
@@ -17561,8 +17840,8 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
   if (VT.is512BitVector() && InVTElt != MVT::i1 &&
       (NumElts == 8 || NumElts == 16 || Subtarget.hasBWI())) {
     if (In.getOpcode() == X86ISD::VSEXT || In.getOpcode() == X86ISD::VZEXT)
-      return DAG.getNode(In.getOpcode(), dl, VT, In.getOperand(0));
-    return DAG.getNode(X86ISD::VSEXT, dl, VT, In);
+      return getExtendInVec(In.getOpcode(), dl, VT, In.getOperand(0), DAG);
+    return getExtendInVec(X86ISD::VSEXT, dl, VT, In, DAG);
   }
 
   if (InVTElt != MVT::i1)
@@ -17574,10 +17853,10 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
 
   SDValue V;
   if (Subtarget.hasDQI()) {
-    V = DAG.getNode(X86ISD::VSEXT, dl, ExtVT, In);
+    V = getExtendInVec(X86ISD::VSEXT, dl, ExtVT, In, DAG);
     assert(!VT.is512BitVector() && "Unexpected vector type");
   } else {
-    SDValue NegOne = getOnesVector(ExtVT, Subtarget, DAG, dl);
+    SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
     SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
     V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
     if (ExtVT == VT)
@@ -17626,11 +17905,15 @@ static SDValue LowerEXTEND_VECTOR_INREG(SDValue Op,
   assert((Op.getOpcode() != ISD::ZERO_EXTEND_VECTOR_INREG ||
           InVT == MVT::v64i8) && "Zero extend only for v64i8 input!");
 
-  // SSE41 targets can use the pmovsx* instructions directly.
-  unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
-                      X86ISD::VSEXT : X86ISD::VZEXT;
-  if (Subtarget.hasSSE41())
+  // SSE41 targets can use the pmovsx* instructions directly for 128-bit results,
+  // so are legal and shouldn't occur here. AVX2/AVX512 pmovsx* instructions still
+  // need to be handled here for 256/512-bit results.
+  if (Subtarget.hasInt256()) {
+    assert(VT.getSizeInBits() > 128 && "Unexpected 128-bit vector extension");
+    unsigned ExtOpc = Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ?
+                        X86ISD::VSEXT : X86ISD::VZEXT;
     return DAG.getNode(ExtOpc, dl, VT, In);
+  }
 
   // We should only get here for sign extend.
   assert(Op.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG &&
@@ -17715,8 +17998,8 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
   MVT HalfVT = MVT::getVectorVT(VT.getVectorElementType(),
                                 VT.getVectorNumElements() / 2);
 
-  OpLo = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpLo);
-  OpHi = DAG.getNode(X86ISD::VSEXT, dl, HalfVT, OpHi);
+  OpLo = DAG.getSignExtendVectorInReg(OpLo, dl, HalfVT);
+  OpHi = DAG.getSignExtendVectorInReg(OpHi, dl, HalfVT);
 
   return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, OpLo, OpHi);
 }
@@ -17794,7 +18077,8 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op,
   MVT VT = Op.getValueType().getSimpleVT();
   unsigned NumElts = VT.getVectorNumElements();
 
-  if ((Subtarget.hasVLX() && Subtarget.hasBWI() && Subtarget.hasDQI()) ||
+  if ((Subtarget.hasBWI() && NumElts >= 32) ||
+      (Subtarget.hasDQI() && NumElts < 16) ||
       NumElts == 16) {
     // Load and extend - everything is legal
     if (NumElts < 8) {
@@ -17823,7 +18107,7 @@ static SDValue LowerExtended1BitVectorLoad(SDValue Op,
 
   if (NumElts <= 8) {
     // A subset, assume that we have only AVX-512F
-    unsigned NumBitsToLoad = NumElts < 8 ? 8 : NumElts;
+    unsigned NumBitsToLoad = 8;
     MVT TypeToLoad = MVT::getIntegerVT(NumBitsToLoad);
     SDValue Load = DAG.getLoad(TypeToLoad, dl, Ld->getChain(),
                               Ld->getBasePtr(),
@@ -18031,7 +18315,7 @@ static SDValue LowerExtendedLoad(SDValue Op, const X86Subtarget &Subtarget,
   if (Ext == ISD::SEXTLOAD) {
     // If we have SSE4.1, we can directly emit a VSEXT node.
     if (Subtarget.hasSSE41()) {
-      SDValue Sext = DAG.getNode(X86ISD::VSEXT, dl, RegVT, SlicedVec);
+      SDValue Sext = getExtendInVec(X86ISD::VSEXT, dl, RegVT, SlicedVec, DAG);
       DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), TF);
       return Sext;
     }
@@ -18589,6 +18873,11 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
                                           SelectionDAG &DAG) {
   MVT ElementType = VT.getVectorElementType();
 
+  // Bitcast the source vector to the output type, this is mainly necessary for
+  // vXi8/vXi64 shifts.
+  if (VT != SrcOp.getSimpleValueType())
+    SrcOp = DAG.getBitcast(VT, SrcOp);
+
   // Fold this packed shift into its first operand if ShiftAmt is 0.
   if (ShiftAmt == 0)
     return SrcOp;
@@ -18605,9 +18894,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
          && "Unknown target vector shift-by-constant node");
 
   // Fold this packed vector shift into a build vector if SrcOp is a
-  // vector of Constants or UNDEFs, and SrcOp valuetype is the same as VT.
-  if (VT == SrcOp.getSimpleValueType() &&
-      ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
+  // vector of Constants or UNDEFs.
+  if (ISD::isBuildVectorOfConstantSDNodes(SrcOp.getNode())) {
     SmallVector<SDValue, 8> Elts;
     unsigned NumElts = SrcOp->getNumOperands();
     ConstantSDNode *ND;
@@ -18698,11 +18986,11 @@ static SDValue getTargetVShiftNode(unsigned Opc, const SDLoc &dl, MVT VT,
            ShAmt.getOperand(0).getSimpleValueType() == MVT::i16) {
     ShAmt = ShAmt.getOperand(0);
     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v8i16, ShAmt);
-    ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt);
+    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
   } else if (Subtarget.hasSSE41() &&
              ShAmt.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
     ShAmt = DAG.getNode(ISD::SCALAR_TO_VECTOR, SDLoc(ShAmt), MVT::v4i32, ShAmt);
-    ShAmt = DAG.getNode(X86ISD::VZEXT, SDLoc(ShAmt), MVT::v2i64, ShAmt);
+    ShAmt = DAG.getZeroExtendVectorInReg(ShAmt, SDLoc(ShAmt), MVT::v2i64);
   } else {
     SmallVector<SDValue, 4> ShOps = {ShAmt, DAG.getConstant(0, dl, SVT),
                                      DAG.getUNDEF(SVT), DAG.getUNDEF(SVT)};
@@ -18973,6 +19261,14 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
       SDValue Src2 = Op.getOperand(2);
       SDValue passThru = Op.getOperand(3);
       SDValue Mask = Op.getOperand(4);
+      unsigned IntrWithRoundingModeOpcode = IntrData->Opc1;
+      if (IntrWithRoundingModeOpcode != 0) {
+        SDValue Rnd = Op.getOperand(5);
+        if (!isRoundModeCurDirection(Rnd))
+          return getScalarMaskingNode(DAG.getNode(IntrWithRoundingModeOpcode,
+                                                  dl, VT, Src1, Src2, Rnd),
+                                      Mask, passThru, Subtarget, DAG);
+      }
       return getScalarMaskingNode(DAG.getNode(IntrData->Opc0, dl, VT, Src1, Src2),
                                   Mask, passThru, Subtarget, DAG);
     }
@@ -19426,6 +19722,15 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
                                 Src2, Src1);
       return DAG.getBitcast(VT, Res);
     }
+    case MASK_BINOP: {
+      MVT VT = Op.getSimpleValueType();
+      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getSizeInBits());
+
+      SDValue Src1 = getMaskNode(Op.getOperand(1), MaskVT, Subtarget, DAG, dl);
+      SDValue Src2 = getMaskNode(Op.getOperand(2), MaskVT, Subtarget, DAG, dl);
+      SDValue Res = DAG.getNode(IntrData->Opc0, dl, MaskVT, Src1, Src2);
+      return DAG.getBitcast(VT, Res);
+    }
     case FIXUPIMMS:
     case FIXUPIMMS_MASKZ:
     case FIXUPIMM:
@@ -19598,6 +19903,33 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
     return DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, SetCC);
   }
 
+  case Intrinsic::x86_avx512_knot_w: {
+    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+    SDValue RHS = DAG.getConstant(1, dl, MVT::v16i1);
+    SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
+    return DAG.getBitcast(MVT::i16, Res);
+  }
+
+  case Intrinsic::x86_avx512_kandn_w: {
+    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+    // Invert LHS for the not.
+    LHS = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS,
+                      DAG.getConstant(1, dl, MVT::v16i1));
+    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
+    SDValue Res = DAG.getNode(ISD::AND, dl, MVT::v16i1, LHS, RHS);
+    return DAG.getBitcast(MVT::i16, Res);
+  }
+
+  case Intrinsic::x86_avx512_kxnor_w: {
+    SDValue LHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(1));
+    SDValue RHS = DAG.getBitcast(MVT::v16i1, Op.getOperand(2));
+    SDValue Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, LHS, RHS);
+    // Invert result for the not.
+    Res = DAG.getNode(ISD::XOR, dl, MVT::v16i1, Res,
+                      DAG.getConstant(1, dl, MVT::v16i1));
+    return DAG.getBitcast(MVT::i16, Res);
+  }
+
   case Intrinsic::x86_sse42_pcmpistria128:
   case Intrinsic::x86_sse42_pcmpestria128:
   case Intrinsic::x86_sse42_pcmpistric128:
@@ -19723,6 +20055,28 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
   }
 }
 
+static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
+                                 SDValue Src, SDValue Mask, SDValue Base,
+                                 SDValue Index, SDValue ScaleOp, SDValue Chain,
+                                 const X86Subtarget &Subtarget) {
+  SDLoc dl(Op);
+  auto *C = cast<ConstantSDNode>(ScaleOp); // asserts a constant scale
+  SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl, MVT::i8);
+  EVT MaskVT = Mask.getValueType();
+  SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+  SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32); // no displacement
+  SDValue Segment = DAG.getRegister(0, MVT::i32);
+  // If source is undef or we know it won't be used (the mask is a build
+  // vector of all ones), use a zero vector to break register dependency.
+  // TODO: use undef instead and let ExecutionDepsFix deal with it?
+  if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
+    Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
+  SDValue Ops[] = {Src, Base, Scale, Index, Disp, Segment, Mask, Chain};
+  SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
+  SDValue RetOps[] = { SDValue(Res, 0), SDValue(Res, 2) }; // omit MaskVT one
+  return DAG.getMergeValues(RetOps, dl);
+}
+
 static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
                               SDValue Src, SDValue Mask, SDValue Base,
                               SDValue Index, SDValue ScaleOp, SDValue Chain,
@@ -19737,7 +20091,10 @@ static SDValue getGatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
   SDValue Disp = DAG.getTargetConstant(0, dl, MVT::i32);
   SDValue Segment = DAG.getRegister(0, MVT::i32);
-  if (Src.isUndef())
+  // If source is undef or we know it won't be used, use a zero vector
+  // to break register dependency.
+  // TODO: use undef instead and let ExecutionDepsFix deal with it?
+  if (Src.isUndef() || ISD::isBuildVectorAllOnes(VMask.getNode()))
     Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
   SDValue Ops[] = {Src, VMask, Base, Scale, Index, Disp, Segment, Chain};
   SDNode *Res = DAG.getMachineNode(Opc, dl, VTs, Ops);
@@ -19776,7 +20133,6 @@ static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
   MVT MaskVT =
     MVT::getVectorVT(MVT::i1, Index.getSimpleValueType().getVectorNumElements());
   SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
-  //SDVTList VTs = DAG.getVTList(MVT::Other);
   SDValue Ops[] = {VMask, Base, Scale, Index, Disp, Segment, Chain};
   SDNode *Res = DAG.getMachineNode(Opc, dl, MVT::Other, Ops);
   return SDValue(Res, 0);
@@ -20048,6 +20404,16 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
     return DAG.getNode(ISD::MERGE_VALUES, dl, Op->getVTList(), Result, isValid,
                        SDValue(Result.getNode(), 2));
   }
+  case GATHER_AVX2: {
+    SDValue Chain = Op.getOperand(0);
+    SDValue Src   = Op.getOperand(2);
+    SDValue Base  = Op.getOperand(3);
+    SDValue Index = Op.getOperand(4);
+    SDValue Mask  = Op.getOperand(5);
+    SDValue Scale = Op.getOperand(6);
+    return getAVX2GatherNode(IntrData->Opc0, Op, DAG, Src, Mask, Base, Index,
+                             Scale, Chain, Subtarget);
+  }
   case GATHER: {
   //gather(v1, mask, index, base, scale);
     SDValue Chain = Op.getOperand(0);
@@ -20073,8 +20439,9 @@ static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
   case PREFETCH: {
     SDValue Hint = Op.getOperand(6);
     unsigned HintVal = cast<ConstantSDNode>(Hint)->getZExtValue();
-    assert(HintVal < 2 && "Wrong prefetch hint in intrinsic: should be 0 or 1");
-    unsigned Opcode = (HintVal ? IntrData->Opc1 : IntrData->Opc0);
+    assert((HintVal == 2 || HintVal == 3) &&
+           "Wrong prefetch hint in intrinsic: should be 2 or 3");
+    unsigned Opcode = (HintVal == 2 ? IntrData->Opc1 : IntrData->Opc0);
     SDValue Chain = Op.getOperand(0);
     SDValue Mask  = Op.getOperand(2);
     SDValue Index = Op.getOperand(3);
@@ -20488,7 +20855,7 @@ SDValue X86TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op,
 
       // Check that ECX wasn't needed by an 'inreg' parameter.
       FunctionType *FTy = Func->getFunctionType();
-      const AttributeSet &Attrs = Func->getAttributes();
+      const AttributeList &Attrs = Func->getAttributes();
 
       if (!Attrs.isEmpty() && !Func->isVarArg()) {
         unsigned InRegCount = 0;
@@ -20933,6 +21300,25 @@ static SDValue LowerADD_SUB(SDValue Op, SelectionDAG &DAG) {
   return Lower256IntArith(Op, DAG);
 }
 
+static SDValue LowerABS(SDValue Op, SelectionDAG &DAG) {
+  assert(Op.getSimpleValueType().is256BitVector() &&
+         Op.getSimpleValueType().isInteger() &&
+         "Only handle AVX 256-bit vector integer operation");
+  MVT VT = Op.getSimpleValueType();
+  unsigned NumElems = VT.getVectorNumElements();
+  // Split the 256-bit source operand into its low and high 128-bit halves.
+  SDLoc dl(Op);
+  SDValue Src = Op.getOperand(0);
+  SDValue Lo = extract128BitVector(Src, 0, DAG, dl);
+  SDValue Hi = extract128BitVector(Src, NumElems / 2, DAG, dl);
+  // Take ISD::ABS of each half-width vector, then concatenate back to VT.
+  MVT EltVT = VT.getVectorElementType();
+  MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
+  return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+                     DAG.getNode(ISD::ABS, dl, NewVT, Lo),
+                     DAG.getNode(ISD::ABS, dl, NewVT, Hi));
+}
+
 static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
   assert(Op.getSimpleValueType().is256BitVector() &&
          Op.getSimpleValueType().isInteger() &&
@@ -20985,8 +21371,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
     // Extract the lo parts and sign extend to i16
     SDValue ALo, BLo;
     if (Subtarget.hasSSE41()) {
-      ALo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, A);
-      BLo = DAG.getNode(X86ISD::VSEXT, dl, ExVT, B);
+      ALo = DAG.getSignExtendVectorInReg(A, dl, ExVT);
+      BLo = DAG.getSignExtendVectorInReg(B, dl, ExVT);
     } else {
       const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
                               -1, 4, -1, 5, -1, 6, -1, 7};
@@ -21005,8 +21391,8 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
                               -1, -1, -1, -1, -1, -1, -1, -1};
       AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
       BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
-      AHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, AHi);
-      BHi = DAG.getNode(X86ISD::VSEXT, dl, ExVT, BHi);
+      AHi = DAG.getSignExtendVectorInReg(AHi, dl, ExVT);
+      BHi = DAG.getSignExtendVectorInReg(BHi, dl, ExVT);
     } else {
       const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
                               -1, 12, -1, 13, -1, 14, -1, 15};
@@ -21167,8 +21553,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
                          DAG.getVectorShuffle(MVT::v16i16, dl, Lo, Hi, HiMask));
     }
 
-    SDValue ExA = DAG.getNode(ExSSE41, dl, MVT::v16i16, A);
-    SDValue ExB = DAG.getNode(ExSSE41, dl, MVT::v16i16, B);
+    SDValue ExA = getExtendInVec(ExSSE41, dl, MVT::v16i16, A, DAG);
+    SDValue ExB = getExtendInVec(ExSSE41, dl, MVT::v16i16, B, DAG);
     SDValue Mul = DAG.getNode(ISD::MUL, dl, MVT::v16i16, ExA, ExB);
     SDValue MulH = DAG.getNode(ISD::SRL, dl, MVT::v16i16, Mul,
                                DAG.getConstant(8, dl, MVT::v16i16));
@@ -21184,8 +21570,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
   // Extract the lo parts and zero/sign extend to i16.
   SDValue ALo, BLo;
   if (Subtarget.hasSSE41()) {
-    ALo = DAG.getNode(ExSSE41, dl, ExVT, A);
-    BLo = DAG.getNode(ExSSE41, dl, ExVT, B);
+    ALo = getExtendInVec(ExSSE41, dl, ExVT, A, DAG);
+    BLo = getExtendInVec(ExSSE41, dl, ExVT, B, DAG);
   } else {
     const int ShufMask[] = {-1, 0, -1, 1, -1, 2, -1, 3,
                             -1, 4, -1, 5, -1, 6, -1, 7};
@@ -21204,8 +21590,8 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
                             -1, -1, -1, -1, -1, -1, -1, -1};
     AHi = DAG.getVectorShuffle(VT, dl, A, A, ShufMask);
     BHi = DAG.getVectorShuffle(VT, dl, B, B, ShufMask);
-    AHi = DAG.getNode(ExSSE41, dl, ExVT, AHi);
-    BHi = DAG.getNode(ExSSE41, dl, ExVT, BHi);
+    AHi = getExtendInVec(ExSSE41, dl, ExVT, AHi, DAG);
+    BHi = getExtendInVec(ExSSE41, dl, ExVT, BHi, DAG);
   } else {
     const int ShufMask[] = {-1, 8,  -1, 9,  -1, 10, -1, 11,
                             -1, 12, -1, 13, -1, 14, -1, 15};
@@ -21259,8 +21645,8 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
                            MachinePointerInfo(), /* Alignment = */ 16);
     Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
     Entry.Ty = PointerType::get(ArgTy,0);
-    Entry.isSExt = false;
-    Entry.isZExt = false;
+    Entry.IsSExt = false;
+    Entry.IsZExt = false;
     Args.push_back(Entry);
   }
 
@@ -21268,11 +21654,15 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
                                          getPointerTy(DAG.getDataLayout()));
 
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(dl).setChain(InChain)
-    .setCallee(getLibcallCallingConv(LC),
-               static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()),
-               Callee, std::move(Args))
-    .setInRegister().setSExtResult(isSigned).setZExtResult(!isSigned);
+  CLI.setDebugLoc(dl)
+      .setChain(InChain)
+      .setLibCallee(
+          getLibcallCallingConv(LC),
+          static_cast<EVT>(MVT::v2i64).getTypeForEVT(*DAG.getContext()), Callee,
+          std::move(Args))
+      .setInRegister()
+      .setSExtResult(isSigned)
+      .setZExtResult(!isSigned);
 
   std::pair<SDValue, SDValue> CallInfo = LowerCallTo(CLI);
   return DAG.getBitcast(VT, CallInfo.first);
@@ -21380,15 +21770,15 @@ static bool SupportedVectorShiftWithImm(MVT VT, const X86Subtarget &Subtarget,
   if (VT.getScalarSizeInBits() < 16)
     return false;
 
-  if (VT.is512BitVector() &&
+  if (VT.is512BitVector() && Subtarget.hasAVX512() &&
       (VT.getScalarSizeInBits() > 16 || Subtarget.hasBWI()))
     return true;
 
-  bool LShift = VT.is128BitVector() ||
-    (VT.is256BitVector() && Subtarget.hasInt256());
+  bool LShift = (VT.is128BitVector() && Subtarget.hasSSE2()) ||
+                (VT.is256BitVector() && Subtarget.hasInt256());
 
-  bool AShift = LShift && (Subtarget.hasVLX() ||
-    (VT != MVT::v2i64 && VT != MVT::v4i64));
+  bool AShift = LShift && (Subtarget.hasAVX512() ||
+                           (VT != MVT::v2i64 && VT != MVT::v4i64));
   return (Opcode == ISD::SRA) ? AShift : LShift;
 }
 
@@ -22173,10 +22563,10 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
     // A subtract of one will be selected as a INC. Note that INC doesn't
     // set CF, so we can't do this for UADDO.
     if (isOneConstant(RHS)) {
-        BaseOp = X86ISD::INC;
-        Cond = X86::COND_O;
-        break;
-      }
+      BaseOp = X86ISD::INC;
+      Cond = X86::COND_O;
+      break;
+    }
     BaseOp = X86ISD::ADD;
     Cond = X86::COND_O;
     break;
@@ -22188,10 +22578,10 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
     // A subtract of one will be selected as a DEC. Note that DEC doesn't
     // set CF, so we can't do this for USUBO.
     if (isOneConstant(RHS)) {
-        BaseOp = X86ISD::DEC;
-        Cond = X86::COND_O;
-        break;
-      }
+      BaseOp = X86ISD::DEC;
+      Cond = X86::COND_O;
+      break;
+    }
     BaseOp = X86ISD::SUB;
     Cond = X86::COND_O;
     break;
@@ -22581,7 +22971,7 @@ static SDValue LowerVectorCTPOPInRegLUT(SDValue Op, const SDLoc &DL,
   // index into a in-register pre-computed pop count table. We then split up the
   // input vector in two new ones: (1) a vector with only the shifted-right
   // higher nibbles for each byte and (2) a vector with the lower nibbles (and
-  // masked out higher ones) for each byte. PSHUB is used separately with both
+  // masked out higher ones) for each byte. PSHUFB is used separately with both
   // to index the in-register table. Next, both are added and the result is a
   // i8 vector where each element contains the pop count for input byte.
   //
@@ -22978,8 +23368,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
 
   Entry.Node = Arg;
   Entry.Ty = ArgTy;
-  Entry.isSExt = false;
-  Entry.isZExt = false;
+  Entry.IsSExt = false;
+  Entry.IsZExt = false;
   Args.push_back(Entry);
 
   bool isF64 = ArgVT == MVT::f64;
@@ -22996,8 +23386,9 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
     : (Type*)VectorType::get(ArgTy, 4);
 
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(dl).setChain(DAG.getEntryNode())
-    .setCallee(CallingConv::C, RetTy, Callee, std::move(Args));
+  CLI.setDebugLoc(dl)
+      .setChain(DAG.getEntryNode())
+      .setLibCallee(CallingConv::C, RetTy, Callee, std::move(Args));
 
   std::pair<SDValue, SDValue> CallResult = TLI.LowerCallTo(CLI);
 
@@ -23197,7 +23588,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
   // Mask element has to be i1.
   MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
   assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
-         "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
+         "We handle 4x32, 4x64 and 2x64 vectors only in this case");
 
   MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
 
@@ -23253,7 +23644,7 @@ static SDValue LowerMSTORE(SDValue Op, const X86Subtarget &Subtarget,
   // Mask element has to be i1.
   MVT MaskEltTy = Mask.getSimpleValueType().getScalarType();
   assert((MaskEltTy == MVT::i1 || VT.getVectorNumElements() <= 4) &&
-         "We handle 4x32, 4x64 and 2x64 vectors only in this casse");
+         "We handle 4x32, 4x64 and 2x64 vectors only in this case");
 
   MVT WideMaskVT = MVT::getVectorVT(MaskEltTy, NumEltsInWideVec);
 
@@ -23313,7 +23704,7 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
     Mask = ExtendToType(Mask, ExtMaskVT, DAG, true);
     Mask = DAG.getNode(ISD::TRUNCATE, dl, MaskBitVT, Mask);
 
-    // The pass-thru value
+    // The pass-through value
     MVT NewVT = MVT::getVectorVT(VT.getScalarType(), NumElts);
     Src0 = ExtendToType(Src0, NewVT, DAG);
 
@@ -23414,7 +23805,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SIGN_EXTEND_VECTOR_INREG:
     return LowerEXTEND_VECTOR_INREG(Op, Subtarget, DAG);
   case ISD::FP_TO_SINT:
-  case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, Subtarget, DAG);
+  case ISD::FP_TO_UINT:         return LowerFP_TO_INT(Op, DAG);
   case ISD::FP_EXTEND:          return LowerFP_EXTEND(Op, DAG);
   case ISD::LOAD:               return LowerExtendedLoad(Op, Subtarget, DAG);
   case ISD::FABS:
@@ -23477,6 +23868,7 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   case ISD::SMIN:
   case ISD::UMAX:
   case ISD::UMIN:               return LowerMINMAX(Op, DAG);
+  case ISD::ABS:                return LowerABS(Op, DAG);
   case ISD::FSINCOS:            return LowerFSINCOS(Op, Subtarget, DAG);
   case ISD::MLOAD:              return LowerMLOAD(Op, Subtarget, DAG);
   case ISD::MSTORE:             return LowerMSTORE(Op, Subtarget, DAG);
@@ -23879,7 +24271,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::INSERTPS:           return "X86ISD::INSERTPS";
   case X86ISD::PINSRB:             return "X86ISD::PINSRB";
   case X86ISD::PINSRW:             return "X86ISD::PINSRW";
-  case X86ISD::MMX_PINSRW:         return "X86ISD::MMX_PINSRW";
   case X86ISD::PSHUFB:             return "X86ISD::PSHUFB";
   case X86ISD::ANDNP:              return "X86ISD::ANDNP";
   case X86ISD::BLENDI:             return "X86ISD::BLENDI";
@@ -23890,16 +24281,19 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::HSUB:               return "X86ISD::HSUB";
   case X86ISD::FHADD:              return "X86ISD::FHADD";
   case X86ISD::FHSUB:              return "X86ISD::FHSUB";
-  case X86ISD::ABS:                return "X86ISD::ABS";
   case X86ISD::CONFLICT:           return "X86ISD::CONFLICT";
   case X86ISD::FMAX:               return "X86ISD::FMAX";
+  case X86ISD::FMAXS:              return "X86ISD::FMAXS";
   case X86ISD::FMAX_RND:           return "X86ISD::FMAX_RND";
+  case X86ISD::FMAXS_RND:          return "X86ISD::FMAXS_RND";
   case X86ISD::FMIN:               return "X86ISD::FMIN";
+  case X86ISD::FMINS:              return "X86ISD::FMINS";
   case X86ISD::FMIN_RND:           return "X86ISD::FMIN_RND";
+  case X86ISD::FMINS_RND:          return "X86ISD::FMINS_RND";
   case X86ISD::FMAXC:              return "X86ISD::FMAXC";
   case X86ISD::FMINC:              return "X86ISD::FMINC";
   case X86ISD::FRSQRT:             return "X86ISD::FRSQRT";
-  case X86ISD::FRSQRTS:             return "X86ISD::FRSQRTS";
+  case X86ISD::FRSQRTS:            return "X86ISD::FRSQRTS";
   case X86ISD::FRCP:               return "X86ISD::FRCP";
   case X86ISD::FRCPS:              return "X86ISD::FRCPS";
   case X86ISD::EXTRQI:             return "X86ISD::EXTRQI";
@@ -23938,7 +24332,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::VTRUNCSTOREUS:      return "X86ISD::VTRUNCSTOREUS";
   case X86ISD::VMTRUNCSTORES:      return "X86ISD::VMTRUNCSTORES";
   case X86ISD::VMTRUNCSTOREUS:     return "X86ISD::VMTRUNCSTOREUS";
-  case X86ISD::VINSERT:            return "X86ISD::VINSERT";
   case X86ISD::VFPEXT:             return "X86ISD::VFPEXT";
   case X86ISD::VFPEXT_RND:         return "X86ISD::VFPEXT_RND";
   case X86ISD::VFPEXTS_RND:        return "X86ISD::VFPEXTS_RND";
@@ -24089,9 +24482,13 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
   case X86ISD::RSQRT28:            return "X86ISD::RSQRT28";
   case X86ISD::RSQRT28S:           return "X86ISD::RSQRT28S";
   case X86ISD::FADD_RND:           return "X86ISD::FADD_RND";
+  case X86ISD::FADDS_RND:          return "X86ISD::FADDS_RND";
   case X86ISD::FSUB_RND:           return "X86ISD::FSUB_RND";
+  case X86ISD::FSUBS_RND:          return "X86ISD::FSUBS_RND";
   case X86ISD::FMUL_RND:           return "X86ISD::FMUL_RND";
+  case X86ISD::FMULS_RND:          return "X86ISD::FMULS_RND";
   case X86ISD::FDIV_RND:           return "X86ISD::FDIV_RND";
+  case X86ISD::FDIVS_RND:          return "X86ISD::FDIVS_RND";
   case X86ISD::FSQRT_RND:          return "X86ISD::FSQRT_RND";
   case X86ISD::FSQRTS_RND:         return "X86ISD::FSQRTS_RND";
   case X86ISD::FGETEXP_RND:        return "X86ISD::FGETEXP_RND";
@@ -24526,6 +24923,26 @@ static MachineBasicBlock *emitMonitor(MachineInstr &MI, MachineBasicBlock *BB,
   return BB;
 }
 
+static MachineBasicBlock *emitClzero(MachineInstr *MI, MachineBasicBlock *BB,
+                                      const X86Subtarget &Subtarget) {
+  DebugLoc dl = MI->getDebugLoc();
+  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  // Compute the pseudo's address operands into RAX/EAX with an LEA.
+  unsigned MemOpc = Subtarget.is64Bit() ? X86::LEA64r : X86::LEA32r;
+  unsigned MemReg = Subtarget.is64Bit() ? X86::RAX : X86::EAX;
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, dl, TII->get(MemOpc), MemReg);
+  for (int i = 0; i < X86::AddrNumOperands; ++i) // copy address operands
+    MIB.add(MI->getOperand(i));
+
+  // The instruction doesn't actually take any operands though.
+  BuildMI(*BB, MI, dl, TII->get(X86::CLZEROr));
+
+  MI->eraseFromParent(); // The pseudo is gone now.
+  return BB;
+}
+
+
+
 MachineBasicBlock *
 X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
                                                  MachineBasicBlock *MBB) const {
@@ -24980,7 +25397,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
   //
   //   (CMOV (CMOV F, T, cc1), T, cc2)
   //
-  // to two successives branches.  For that, we look for another CMOV as the
+  // to two successive branches.  For that, we look for another CMOV as the
   // following instruction.
   //
   // Without this, we would add a PHI between the two jumps, which ends up
@@ -25738,7 +26155,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
   DebugLoc DL = MI.getDebugLoc();
   MachineFunction *MF = MBB->getParent();
   MachineRegisterInfo *MRI = &MF->getRegInfo();
-  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const X86InstrInfo *TII = Subtarget.getInstrInfo();
 
   MVT PVT = getPointerTy(MF->getDataLayout());
   assert((PVT == MVT::i64 || PVT == MVT::i32) && "Invalid Pointer Size!");
@@ -25757,8 +26174,6 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
     VR = MRI->createVirtualRegister(TRC);
     Op = (PVT == MVT::i64) ? X86::MOV64mr : X86::MOV32mr;
 
-    /* const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII); */
-
     if (Subtarget.is64Bit())
       BuildMI(*MBB, MI, DL, TII->get(X86::LEA64r), VR)
           .addReg(X86::RIP)
@@ -25768,7 +26183,7 @@ void X86TargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
           .addReg(0);
     else
       BuildMI(*MBB, MI, DL, TII->get(X86::LEA32r), VR)
-          .addReg(0) /* XII->getGlobalBaseReg(MF) */
+          .addReg(0) /* TII->getGlobalBaseReg(MF) */
           .addImm(1)
           .addReg(0)
           .addMBB(DispatchBB, Subtarget.classifyBlockAddressReference())
@@ -25790,7 +26205,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
   MachineFunction *MF = BB->getParent();
   MachineFrameInfo &MFI = MF->getFrameInfo();
   MachineRegisterInfo *MRI = &MF->getRegInfo();
-  const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+  const X86InstrInfo *TII = Subtarget.getInstrInfo();
   int FI = MFI.getFunctionContextIndex();
 
   // Get a mapping of the call site numbers to all of the landing pads they're
@@ -25862,9 +26277,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
       MF->getOrCreateJumpTableInfo(getJumpTableEncoding());
   unsigned MJTI = JTI->createJumpTableIndex(LPadList);
 
-  const X86InstrInfo *XII = static_cast<const X86InstrInfo *>(TII);
-  const X86RegisterInfo &RI = XII->getRegisterInfo();
-
+  const X86RegisterInfo &RI = TII->getRegisterInfo();
   // Add a register mask with no preserved registers.  This results in all
   // registers being marked as clobbered.
   if (RI.hasBasePointer(*MF)) {
@@ -25912,8 +26325,7 @@ X86TargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
 
   // N.B. the order the invoke BBs are processed in doesn't matter here.
   SmallVector<MachineBasicBlock *, 64> MBBLPads;
-  const MCPhysReg *SavedRegs =
-      Subtarget.getRegisterInfo()->getCalleeSavedRegs(MF);
+  const MCPhysReg *SavedRegs = MF->getRegInfo().getCalleeSavedRegs();
   for (MachineBasicBlock *MBB : InvokeBBs) {
     // Remove the landing pad successor from the invoke block and replace it
     // with the new dispatch block.
@@ -26146,6 +26558,11 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
     return emitMonitor(MI, BB, Subtarget, X86::MONITORrrr);
   case X86::MONITORX:
     return emitMonitor(MI, BB, Subtarget, X86::MONITORXrrr);
+
+  // Cache line zero
+  case X86::CLZERO:
+    return emitClzero(&MI, BB, Subtarget);
+
   // PKU feature
   case X86::WRPKRU:
     return emitWRPKRU(MI, BB, Subtarget);
@@ -26250,10 +26667,12 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
 void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                       APInt &KnownZero,
                                                       APInt &KnownOne,
+                                                      const APInt &DemandedElts,
                                                       const SelectionDAG &DAG,
                                                       unsigned Depth) const {
   unsigned BitWidth = KnownZero.getBitWidth();
   unsigned Opc = Op.getOpcode();
+  EVT VT = Op.getValueType();
   assert((Opc >= ISD::BUILTIN_OP_END ||
           Opc == ISD::INTRINSIC_WO_CHAIN ||
           Opc == ISD::INTRINSIC_W_CHAIN ||
@@ -26280,44 +26699,91 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
       break;
     LLVM_FALLTHROUGH;
   case X86ISD::SETCC:
-    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - 1);
+    KnownZero.setBits(1, BitWidth);
     break;
   case X86ISD::MOVMSK: {
     unsigned NumLoBits = Op.getOperand(0).getValueType().getVectorNumElements();
-    KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - NumLoBits);
+    KnownZero.setBits(NumLoBits, BitWidth);
+    break;
+  }
+  case X86ISD::VSHLI:
+  case X86ISD::VSRLI: {
+    if (auto *ShiftImm = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+      if (ShiftImm->getAPIntValue().uge(VT.getScalarSizeInBits())) {
+        KnownZero = APInt::getAllOnesValue(BitWidth);
+        break;
+      }
+
+      DAG.computeKnownBits(Op.getOperand(0), KnownZero, KnownOne, Depth + 1);
+      unsigned ShAmt = ShiftImm->getZExtValue();
+      if (Opc == X86ISD::VSHLI) {
+        KnownZero = KnownZero << ShAmt;
+        KnownOne = KnownOne << ShAmt;
+        // Low bits are known zero.
+        KnownZero.setLowBits(ShAmt);
+      } else {
+        KnownZero = KnownZero.lshr(ShAmt);
+        KnownOne = KnownOne.lshr(ShAmt);
+        // High bits are known zero.
+        KnownZero.setHighBits(ShAmt);
+      }
+    }
     break;
   }
   case X86ISD::VZEXT: {
     SDValue N0 = Op.getOperand(0);
-    unsigned NumElts = Op.getValueType().getVectorNumElements();
-    unsigned InNumElts = N0.getValueType().getVectorNumElements();
-    unsigned InBitWidth = N0.getValueType().getScalarSizeInBits();
+    unsigned NumElts = VT.getVectorNumElements();
+
+    EVT SrcVT = N0.getValueType();
+    unsigned InNumElts = SrcVT.getVectorNumElements();
+    unsigned InBitWidth = SrcVT.getScalarSizeInBits();
+    assert(InNumElts >= NumElts && "Illegal VZEXT input");
 
     KnownZero = KnownOne = APInt(InBitWidth, 0);
-    APInt DemandedElts = APInt::getLowBitsSet(InNumElts, NumElts);
-    DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedElts, Depth + 1);
+    APInt DemandedSrcElts = APInt::getLowBitsSet(InNumElts, NumElts);
+    DAG.computeKnownBits(N0, KnownZero, KnownOne, DemandedSrcElts, Depth + 1);
     KnownOne = KnownOne.zext(BitWidth);
     KnownZero = KnownZero.zext(BitWidth);
-    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - InBitWidth);
+    KnownZero.setBits(InBitWidth, BitWidth);
     break;
   }
   }
 }
 
 unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
-    SDValue Op, const SelectionDAG &DAG, unsigned Depth) const {
-  // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
-  if (Op.getOpcode() == X86ISD::SETCC_CARRY)
-    return Op.getScalarValueSizeInBits();
+    SDValue Op, const APInt &DemandedElts, const SelectionDAG &DAG,
+    unsigned Depth) const {
+  // Returns the known sign-bit count per scalar element; 1 if unknown.
+  unsigned VTBits = Op.getScalarValueSizeInBits();
+  switch (Op.getOpcode()) {
+  case X86ISD::SETCC_CARRY:
+    // SETCC_CARRY sets the dest to ~0 for true or 0 for false.
+    return VTBits;
 
-  if (Op.getOpcode() == X86ISD::VSEXT) {
-    EVT VT = Op.getValueType();
-    EVT SrcVT = Op.getOperand(0).getValueType();
-    unsigned Tmp = DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
-    Tmp += VT.getScalarSizeInBits() - SrcVT.getScalarSizeInBits();
+  case X86ISD::VSEXT: {
+    SDValue Src = Op.getOperand(0);
+    unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
+    Tmp += VTBits - Src.getScalarValueSizeInBits();
     return Tmp;
   }
 
+  case X86ISD::VSRAI: {
+    // Sign bits = source sign bits + shift immediate, clamped to VTBits. Sum
+    // in 64 bits: adding to the immediate's own-width APInt could wrap around.
+    uint64_t ShiftVal = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
+    ShiftVal += DAG.ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+    return ShiftVal >= VTBits ? VTBits : (unsigned)ShiftVal;
+  }
+
+  case X86ISD::PCMPGT:
+  case X86ISD::PCMPEQ:
+  case X86ISD::CMPP:
+  case X86ISD::VPCOM:
+  case X86ISD::VPCOMU:
+    // Vector compares return zero/all-bits result values.
+    return VTBits;
+  }
+
   // Fallback case.
   return 1;
 }
@@ -26341,24 +26807,17 @@ bool X86TargetLowering::isGAPlusOffset(SDNode *N,
 // instructions.
 // TODO: Investigate sharing more of this with shuffle lowering.
 static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
-                                    bool FloatDomain,
+                                    bool AllowFloatDomain, bool AllowIntDomain,
+                                    SDValue &V1, SDLoc &DL, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget,
                                     unsigned &Shuffle, MVT &SrcVT, MVT &DstVT) {
   unsigned NumMaskElts = Mask.size();
   unsigned MaskEltSize = MaskVT.getScalarSizeInBits();
 
-  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
-  if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
-      isUndefOrEqual(Mask[0], 0) &&
-      isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
-    Shuffle = X86ISD::VZEXT_MOVL;
-    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
-    return true;
-  }
-
-  // Match against a VZEXT instruction.
-  // TODO: Add 256/512-bit vector support.
-  if (!FloatDomain && MaskVT.is128BitVector() && Subtarget.hasSSE41()) {
+  // Match against a ZERO_EXTEND_VECTOR_INREG/VZEXT instruction.
+  // TODO: Add 512-bit vector support (split AVX512F and AVX512BW).
+  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE41()) ||
+                         (MaskVT.is256BitVector() && Subtarget.hasInt256()))) {
     unsigned MaxScale = 64 / MaskEltSize;
     for (unsigned Scale = 2; Scale <= MaxScale; Scale *= 2) {
       bool Match = true;
@@ -26368,19 +26827,32 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
         Match &= isUndefOrZeroInRange(Mask, (i * Scale) + 1, Scale - 1);
       }
       if (Match) {
-        SrcVT = MaskVT;
+        unsigned SrcSize = std::max(128u, NumDstElts * MaskEltSize);
+        SrcVT = MVT::getVectorVT(MaskVT.getScalarType(), SrcSize / MaskEltSize);
+        if (SrcVT != MaskVT)
+          V1 = extractSubVector(V1, 0, DAG, DL, SrcSize);
         DstVT = MVT::getIntegerVT(Scale * MaskEltSize);
         DstVT = MVT::getVectorVT(DstVT, NumDstElts);
-        Shuffle = X86ISD::VZEXT;
+        Shuffle = SrcVT != MaskVT ? unsigned(X86ISD::VZEXT)
+                                  : unsigned(ISD::ZERO_EXTEND_VECTOR_INREG);
         return true;
       }
     }
   }
 
+  // Match against a VZEXT_MOVL instruction, SSE1 only supports 32-bits (MOVSS).
+  if (((MaskEltSize == 32) || (MaskEltSize == 64 && Subtarget.hasSSE2())) &&
+      isUndefOrEqual(Mask[0], 0) &&
+      isUndefOrZeroInRange(Mask, 1, NumMaskElts - 1)) {
+    Shuffle = X86ISD::VZEXT_MOVL;
+    SrcVT = DstVT = !Subtarget.hasSSE2() ? MVT::v4f32 : MaskVT;
+    return true;
+  }
+
   // Check if we have SSE3 which will let us use MOVDDUP etc. The
   // instructions are no slower than UNPCKLPD but has the option to
   // fold the input operand into even an unaligned memory load.
-  if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && FloatDomain) {
+  if (MaskVT.is128BitVector() && Subtarget.hasSSE3() && AllowFloatDomain) {
     if (isTargetShuffleEquivalent(Mask, {0, 0})) {
       Shuffle = X86ISD::MOVDDUP;
       SrcVT = DstVT = MVT::v2f64;
@@ -26398,7 +26870,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
     }
   }
 
-  if (MaskVT.is256BitVector() && FloatDomain) {
+  if (MaskVT.is256BitVector() && AllowFloatDomain) {
     assert(Subtarget.hasAVX() && "AVX required for 256-bit vector shuffles");
     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2})) {
       Shuffle = X86ISD::MOVDDUP;
@@ -26417,7 +26889,7 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
     }
   }
 
-  if (MaskVT.is512BitVector() && FloatDomain) {
+  if (MaskVT.is512BitVector() && AllowFloatDomain) {
     assert(Subtarget.hasAVX512() &&
            "AVX512 required for 512-bit vector shuffles");
     if (isTargetShuffleEquivalent(Mask, {0, 0, 2, 2, 4, 4, 6, 6})) {
@@ -26456,24 +26928,26 @@ static bool matchUnaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
 // permute instructions.
 // TODO: Investigate sharing more of this with shuffle lowering.
 static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
-                                           bool FloatDomain,
+                                           bool AllowFloatDomain,
+                                           bool AllowIntDomain,
                                            const X86Subtarget &Subtarget,
                                            unsigned &Shuffle, MVT &ShuffleVT,
                                            unsigned &PermuteImm) {
   unsigned NumMaskElts = Mask.size();
 
   bool ContainsZeros = false;
-  SmallBitVector Zeroable(NumMaskElts, false);
+  APInt Zeroable(NumMaskElts, false);
   for (unsigned i = 0; i != NumMaskElts; ++i) {
     int M = Mask[i];
-    Zeroable[i] = isUndefOrZero(M);
+    if (isUndefOrZero(M))
+      Zeroable.setBit(i);
     ContainsZeros |= (M == SM_SentinelZero);
   }
 
   // Attempt to match against byte/bit shifts.
   // FIXME: Add 512-bit support.
-  if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
-                       (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
     int ShiftAmt = matchVectorShuffleAsShift(ShuffleVT, Shuffle,
                                              MaskVT.getScalarSizeInBits(), Mask,
                                              0, Zeroable, Subtarget);
@@ -26536,19 +27010,21 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
 
   // AVX introduced the VPERMILPD/VPERMILPS float permutes, before then we
   // had to use 2-input SHUFPD/SHUFPS shuffles (not handled here).
-  if (FloatDomain && !Subtarget.hasAVX())
+  if ((AllowFloatDomain && !AllowIntDomain) && !Subtarget.hasAVX())
     return false;
 
   // Pre-AVX2 we must use float shuffles on 256-bit vectors.
-  if (MaskVT.is256BitVector() && !Subtarget.hasAVX2())
-    FloatDomain = true;
+  if (MaskVT.is256BitVector() && !Subtarget.hasAVX2()) {
+    AllowFloatDomain = true;
+    AllowIntDomain = false;
+  }
 
   // Check for lane crossing permutes.
   if (is128BitLaneCrossingShuffleMask(MaskEltVT, Mask)) {
     // PERMPD/PERMQ permutes within a 256-bit vector (AVX2+).
     if (Subtarget.hasAVX2() && MaskVT.is256BitVector() && Mask.size() == 4) {
       Shuffle = X86ISD::VPERMI;
-      ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
+      ShuffleVT = (AllowFloatDomain ? MVT::v4f64 : MVT::v4i64);
       PermuteImm = getV4X86ShuffleImm(Mask);
       return true;
     }
@@ -26556,7 +27032,7 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
       SmallVector<int, 4> RepeatedMask;
       if (is256BitLaneRepeatedShuffleMask(MVT::v8f64, Mask, RepeatedMask)) {
         Shuffle = X86ISD::VPERMI;
-        ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
+        ShuffleVT = (AllowFloatDomain ? MVT::v8f64 : MVT::v8i64);
         PermuteImm = getV4X86ShuffleImm(RepeatedMask);
         return true;
       }
@@ -26565,7 +27041,7 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
   }
 
   // VPERMILPD can permute with a non-repeating shuffle.
-  if (FloatDomain && MaskScalarSizeInBits == 64) {
+  if (AllowFloatDomain && MaskScalarSizeInBits == 64) {
     Shuffle = X86ISD::VPERMILPI;
     ShuffleVT = MVT::getVectorVT(MVT::f64, Mask.size());
     PermuteImm = 0;
@@ -26589,8 +27065,8 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
   if (MaskScalarSizeInBits == 64)
     scaleShuffleMask(2, RepeatedMask, WordMask);
 
-  Shuffle = (FloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
-  ShuffleVT = (FloatDomain ? MVT::f32 : MVT::i32);
+  Shuffle = (AllowFloatDomain ? X86ISD::VPERMILPI : X86ISD::PSHUFD);
+  ShuffleVT = (AllowFloatDomain ? MVT::f32 : MVT::i32);
   ShuffleVT = MVT::getVectorVT(ShuffleVT, InputSizeInBits / 32);
   PermuteImm = getV4X86ShuffleImm(WordMask);
   return true;
@@ -26600,34 +27076,36 @@ static bool matchUnaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
 // shuffle instructions.
 // TODO: Investigate sharing more of this with shuffle lowering.
 static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
-                                     bool FloatDomain, SDValue &V1, SDValue &V2,
+                                     bool AllowFloatDomain, bool AllowIntDomain,
+                                     SDValue &V1, SDValue &V2, SDLoc &DL,
+                                     SelectionDAG &DAG,
                                      const X86Subtarget &Subtarget,
                                      unsigned &Shuffle, MVT &ShuffleVT,
                                      bool IsUnary) {
   unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
 
   if (MaskVT.is128BitVector()) {
-    if (isTargetShuffleEquivalent(Mask, {0, 0}) && FloatDomain) {
+    if (isTargetShuffleEquivalent(Mask, {0, 0}) && AllowFloatDomain) {
       V2 = V1;
       Shuffle = X86ISD::MOVLHPS;
       ShuffleVT = MVT::v4f32;
       return true;
     }
-    if (isTargetShuffleEquivalent(Mask, {1, 1}) && FloatDomain) {
+    if (isTargetShuffleEquivalent(Mask, {1, 1}) && AllowFloatDomain) {
       V2 = V1;
       Shuffle = X86ISD::MOVHLPS;
       ShuffleVT = MVT::v4f32;
       return true;
     }
     if (isTargetShuffleEquivalent(Mask, {0, 3}) && Subtarget.hasSSE2() &&
-        (FloatDomain || !Subtarget.hasSSE41())) {
+        (AllowFloatDomain || !Subtarget.hasSSE41())) {
       std::swap(V1, V2);
       Shuffle = X86ISD::MOVSD;
       ShuffleVT = MaskVT;
       return true;
     }
     if (isTargetShuffleEquivalent(Mask, {4, 1, 2, 3}) &&
-        (FloatDomain || !Subtarget.hasSSE41())) {
+        (AllowFloatDomain || !Subtarget.hasSSE41())) {
       Shuffle = X86ISD::MOVSS;
       ShuffleVT = MaskVT;
       return true;
@@ -26640,57 +27118,12 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
       (MaskVT.is256BitVector() && 32 <= EltSizeInBits && Subtarget.hasAVX()) ||
       (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
       (MaskVT.is512BitVector() && Subtarget.hasAVX512())) {
-    MVT LegalVT = MaskVT;
-    if (LegalVT.is256BitVector() && !Subtarget.hasAVX2())
-      LegalVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
-
-    SmallVector<int, 64> Unpckl, Unpckh;
-    if (IsUnary) {
-      createUnpackShuffleMask(MaskVT, Unpckl, true, true);
-      if (isTargetShuffleEquivalent(Mask, Unpckl)) {
-        V2 = V1;
-        Shuffle = X86ISD::UNPCKL;
-        ShuffleVT = LegalVT;
-        return true;
-      }
-
-      createUnpackShuffleMask(MaskVT, Unpckh, false, true);
-      if (isTargetShuffleEquivalent(Mask, Unpckh)) {
-        V2 = V1;
-        Shuffle = X86ISD::UNPCKH;
-        ShuffleVT = LegalVT;
-        return true;
-      }
-    } else {
-      createUnpackShuffleMask(MaskVT, Unpckl, true, false);
-      if (isTargetShuffleEquivalent(Mask, Unpckl)) {
-        Shuffle = X86ISD::UNPCKL;
-        ShuffleVT = LegalVT;
-        return true;
-      }
-
-      createUnpackShuffleMask(MaskVT, Unpckh, false, false);
-      if (isTargetShuffleEquivalent(Mask, Unpckh)) {
-        Shuffle = X86ISD::UNPCKH;
-        ShuffleVT = LegalVT;
-        return true;
-      }
-
-      ShuffleVectorSDNode::commuteMask(Unpckl);
-      if (isTargetShuffleEquivalent(Mask, Unpckl)) {
-        std::swap(V1, V2);
-        Shuffle = X86ISD::UNPCKL;
-        ShuffleVT = LegalVT;
-        return true;
-      }
-
-      ShuffleVectorSDNode::commuteMask(Unpckh);
-      if (isTargetShuffleEquivalent(Mask, Unpckh)) {
-        std::swap(V1, V2);
-        Shuffle = X86ISD::UNPCKH;
-        ShuffleVT = LegalVT;
-        return true;
-      }
+    if (matchVectorShuffleWithUNPCK(MaskVT, V1, V2, Shuffle, IsUnary, Mask, DL,
+                                    DAG, Subtarget)) {
+      ShuffleVT = MaskVT;
+      if (ShuffleVT.is256BitVector() && !Subtarget.hasAVX2())
+        ShuffleVT = (32 == EltSizeInBits ? MVT::v8f32 : MVT::v4f64);
+      return true;
     }
   }
 
@@ -26698,17 +27131,19 @@ static bool matchBinaryVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
 }
 
 static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
-                                            bool FloatDomain,
-                                            SDValue &V1, SDValue &V2,
-                                            SDLoc &DL, SelectionDAG &DAG,
+                                            bool AllowFloatDomain,
+                                            bool AllowIntDomain,
+                                            SDValue &V1, SDValue &V2, SDLoc &DL,
+                                            SelectionDAG &DAG,
                                             const X86Subtarget &Subtarget,
                                             unsigned &Shuffle, MVT &ShuffleVT,
                                             unsigned &PermuteImm) {
   unsigned NumMaskElts = Mask.size();
+  unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
 
   // Attempt to match against PALIGNR byte rotate.
-  if (!FloatDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
-                       (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+  if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
+                         (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
     int ByteRotation = matchVectorShuffleAsByteRotate(MaskVT, V1, V2, Mask);
     if (0 < ByteRotation) {
       Shuffle = X86ISD::PALIGNR;
@@ -26719,77 +27154,74 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
   }
 
   // Attempt to combine to X86ISD::BLENDI.
-  if (NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
-                           (Subtarget.hasAVX() && MaskVT.is256BitVector()))) {
-    // Determine a type compatible with X86ISD::BLENDI.
-    // TODO - add 16i16 support (requires lane duplication).
-    MVT BlendVT = MaskVT;
-    if (Subtarget.hasAVX2()) {
-      if (BlendVT == MVT::v4i64)
-        BlendVT = MVT::v8i32;
-      else if (BlendVT == MVT::v2i64)
-        BlendVT = MVT::v4i32;
-    } else {
-      if (BlendVT == MVT::v2i64 || BlendVT == MVT::v4i32)
-        BlendVT = MVT::v8i16;
-      else if (BlendVT == MVT::v4i64)
-        BlendVT = MVT::v4f64;
-      else if (BlendVT == MVT::v8i32)
-        BlendVT = MVT::v8f32;
-    }
-
-    unsigned BlendSize = BlendVT.getVectorNumElements();
-    unsigned MaskRatio = BlendSize / NumMaskElts;
-
-    // Can we blend with zero?
-    if (isSequentialOrUndefOrZeroInRange(Mask, /*Pos*/ 0, /*Size*/ NumMaskElts,
-                                         /*Low*/ 0) &&
-        NumMaskElts <= BlendVT.getVectorNumElements()) {
-      PermuteImm = 0;
-      for (unsigned i = 0; i != BlendSize; ++i)
-        if (Mask[i / MaskRatio] < 0)
-          PermuteImm |= 1u << i;
-
-      V2 = getZeroVector(BlendVT, Subtarget, DAG, DL);
-      Shuffle = X86ISD::BLENDI;
-      ShuffleVT = BlendVT;
-      return true;
-    }
-
-    // Attempt to match as a binary blend.
-    if (NumMaskElts <= BlendVT.getVectorNumElements()) {
-      bool MatchBlend = true;
-      for (int i = 0; i != (int)NumMaskElts; ++i) {
-        int M = Mask[i];
-        if (M == SM_SentinelUndef)
-          continue;
-        else if (M == SM_SentinelZero)
-          MatchBlend = false;
-        else if ((M != i) && (M != (i + (int)NumMaskElts)))
-          MatchBlend = false;
-      }
+  if ((NumMaskElts <= 8 && ((Subtarget.hasSSE41() && MaskVT.is128BitVector()) ||
+                            (Subtarget.hasAVX() && MaskVT.is256BitVector()))) ||
+      (MaskVT == MVT::v16i16 && Subtarget.hasAVX2())) {
+    uint64_t BlendMask = 0;
+    bool ForceV1Zero = false, ForceV2Zero = false;
+    SmallVector<int, 8> TargetMask(Mask.begin(), Mask.end());
+    if (matchVectorShuffleAsBlend(V1, V2, TargetMask, ForceV1Zero, ForceV2Zero,
+                                  BlendMask)) {
+      if (MaskVT == MVT::v16i16) {
+        // We can only use v16i16 PBLENDW if the lanes are repeated.
+        SmallVector<int, 8> RepeatedMask;
+        if (isRepeatedTargetShuffleMask(128, MaskVT, TargetMask,
+                                        RepeatedMask)) {
+          assert(RepeatedMask.size() == 8 &&
+                 "Repeated mask size doesn't match!");
+          PermuteImm = 0;
+          for (int i = 0; i < 8; ++i)
+            if (RepeatedMask[i] >= 8)
+              PermuteImm |= 1 << i;
+          V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+          V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
+          Shuffle = X86ISD::BLENDI;
+          ShuffleVT = MaskVT;
+          return true;
+        }
+      } else {
+        // Determine a type compatible with X86ISD::BLENDI.
+        ShuffleVT = MaskVT;
+        if (Subtarget.hasAVX2()) {
+          if (ShuffleVT == MVT::v4i64)
+            ShuffleVT = MVT::v8i32;
+          else if (ShuffleVT == MVT::v2i64)
+            ShuffleVT = MVT::v4i32;
+        } else {
+          if (ShuffleVT == MVT::v2i64 || ShuffleVT == MVT::v4i32)
+            ShuffleVT = MVT::v8i16;
+          else if (ShuffleVT == MVT::v4i64)
+            ShuffleVT = MVT::v4f64;
+          else if (ShuffleVT == MVT::v8i32)
+            ShuffleVT = MVT::v8f32;
+        }
 
-      if (MatchBlend) {
-        PermuteImm = 0;
-        for (unsigned i = 0; i != BlendSize; ++i)
-          if ((int)NumMaskElts <= Mask[i / MaskRatio])
-            PermuteImm |= 1u << i;
+        if (!ShuffleVT.isFloatingPoint()) {
+          int Scale = EltSizeInBits / ShuffleVT.getScalarSizeInBits();
+          BlendMask =
+              scaleVectorShuffleBlendMask(BlendMask, NumMaskElts, Scale);
+          ShuffleVT = MVT::getIntegerVT(EltSizeInBits / Scale);
+          ShuffleVT = MVT::getVectorVT(ShuffleVT, NumMaskElts * Scale);
+        }
 
+        V1 = ForceV1Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V1;
+        V2 = ForceV2Zero ? getZeroVector(MaskVT, Subtarget, DAG, DL) : V2;
+        PermuteImm = (unsigned)BlendMask;
         Shuffle = X86ISD::BLENDI;
-        ShuffleVT = BlendVT;
         return true;
       }
     }
   }
 
   // Attempt to combine to INSERTPS.
-  if (Subtarget.hasSSE41() && MaskVT == MVT::v4f32) {
-    SmallBitVector Zeroable(4, false);
+  if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
+      MaskVT.is128BitVector()) {
+    APInt Zeroable(4, 0);
     for (unsigned i = 0; i != NumMaskElts; ++i)
       if (Mask[i] < 0)
-        Zeroable[i] = true;
+        Zeroable.setBit(i);
 
-    if (Zeroable.any() &&
+    if (Zeroable.getBoolValue() &&
         matchVectorShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
       Shuffle = X86ISD::INSERTPS;
       ShuffleVT = MVT::v4f32;
@@ -26798,22 +27230,26 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
   }
 
   // Attempt to combine to SHUFPD.
-  if ((MaskVT == MVT::v2f64 && Subtarget.hasSSE2()) ||
-      (MaskVT == MVT::v4f64 && Subtarget.hasAVX()) ||
-      (MaskVT == MVT::v8f64 && Subtarget.hasAVX512())) {
+  if (AllowFloatDomain && EltSizeInBits == 64 &&
+      ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+       (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
+       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
     if (matchVectorShuffleWithSHUFPD(MaskVT, V1, V2, PermuteImm, Mask)) {
       Shuffle = X86ISD::SHUFP;
-      ShuffleVT = MaskVT;
+      ShuffleVT = MVT::getVectorVT(MVT::f64, MaskVT.getSizeInBits() / 64);
       return true;
     }
   }
 
   // Attempt to combine to SHUFPS.
-  if ((MaskVT == MVT::v4f32 && Subtarget.hasSSE1()) ||
-      (MaskVT == MVT::v8f32 && Subtarget.hasAVX()) ||
-      (MaskVT == MVT::v16f32 && Subtarget.hasAVX512())) {
+  if (AllowFloatDomain && EltSizeInBits == 32 &&
+      ((MaskVT.is128BitVector() && Subtarget.hasSSE1()) ||
+       (MaskVT.is256BitVector() && Subtarget.hasAVX()) ||
+       (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
     SmallVector<int, 4> RepeatedMask;
     if (isRepeatedTargetShuffleMask(128, MaskVT, Mask, RepeatedMask)) {
+      // Match each half of the repeated mask, to determine if it's just
+      // referencing one of the vectors, is zeroable or entirely undef.
       auto MatchHalf = [&](unsigned Offset, int &S0, int &S1) {
         int M0 = RepeatedMask[Offset];
         int M1 = RepeatedMask[Offset + 1];
@@ -26845,7 +27281,7 @@ static bool matchBinaryPermuteVectorShuffle(MVT MaskVT, ArrayRef<int> Mask,
         V1 = Lo;
         V2 = Hi;
         Shuffle = X86ISD::SHUFP;
-        ShuffleVT = MaskVT;
+        ShuffleVT = MVT::getVectorVT(MVT::f32, MaskVT.getSizeInBits() / 32);
         PermuteImm = getV4X86ShuffleImm(ShufMask);
         return true;
       }
@@ -26877,7 +27313,8 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   // here, we're not going to remove the operands we find.
   bool UnaryShuffle = (Inputs.size() == 1);
   SDValue V1 = peekThroughBitcasts(Inputs[0]);
-  SDValue V2 = (UnaryShuffle ? V1 : peekThroughBitcasts(Inputs[1]));
+  SDValue V2 = (UnaryShuffle ? DAG.getUNDEF(V1.getValueType())
+                             : peekThroughBitcasts(Inputs[1]));
 
   MVT VT1 = V1.getSimpleValueType();
   MVT VT2 = V2.getSimpleValueType();
@@ -26966,6 +27403,11 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
   MVT ShuffleSrcVT, ShuffleVT;
   unsigned Shuffle, PermuteImm;
 
+  // Which shuffle domains are permitted?
+  // Permit domain crossing at higher combine depths.
+  bool AllowFloatDomain = FloatDomain || (Depth > 3);
+  bool AllowIntDomain = !FloatDomain || (Depth > 3);
+
   if (UnaryShuffle) {
     // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
     // directly if we don't shuffle the lower element and we shuffle the upper
@@ -26982,8 +27424,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
       }
     }
 
-    if (matchUnaryVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget, Shuffle,
-                                ShuffleSrcVT, ShuffleVT)) {
+    if (matchUnaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
+                                V1, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
+                                ShuffleVT)) {
       if (Depth == 1 && Root.getOpcode() == Shuffle)
         return false; // Nothing to do!
       if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
@@ -26997,8 +27440,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
       return true;
     }
 
-    if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, Subtarget,
-                                       Shuffle, ShuffleVT, PermuteImm)) {
+    if (matchUnaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
+                                       AllowIntDomain, Subtarget, Shuffle,
+                                       ShuffleVT, PermuteImm)) {
       if (Depth == 1 && Root.getOpcode() == Shuffle)
         return false; // Nothing to do!
       if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
@@ -27014,8 +27458,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     }
   }
 
-  if (matchBinaryVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, Subtarget,
-                               Shuffle, ShuffleVT, UnaryShuffle)) {
+  if (matchBinaryVectorShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain,
+                               V1, V2, DL, DAG, Subtarget, Shuffle, ShuffleVT,
+                               UnaryShuffle)) {
     if (Depth == 1 && Root.getOpcode() == Shuffle)
       return false; // Nothing to do!
     if (IsEVEXShuffle && (NumRootElts != ShuffleVT.getVectorNumElements()))
@@ -27031,8 +27476,9 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
     return true;
   }
 
-  if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, FloatDomain, V1, V2, DL,
-                                      DAG, Subtarget, Shuffle, ShuffleVT,
+  if (matchBinaryPermuteVectorShuffle(MaskVT, Mask, AllowFloatDomain,
+                                      AllowIntDomain, V1, V2, DL, DAG,
+                                      Subtarget, Shuffle, ShuffleVT,
                                       PermuteImm)) {
     if (Depth == 1 && Root.getOpcode() == Shuffle)
       return false; // Nothing to do!
@@ -27152,12 +27598,12 @@ static bool combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
       DAG.getTargetLoweringInfo().isTypeLegal(MaskVT)) {
     APInt Zero = APInt::getNullValue(MaskEltSizeInBits);
     APInt AllOnes = APInt::getAllOnesValue(MaskEltSizeInBits);
-    SmallBitVector UndefElts(NumMaskElts, false);
+    APInt UndefElts(NumMaskElts, 0);
     SmallVector<APInt, 64> EltBits(NumMaskElts, Zero);
     for (unsigned i = 0; i != NumMaskElts; ++i) {
       int M = Mask[i];
       if (M == SM_SentinelUndef) {
-        UndefElts[i] = true;
+        UndefElts.setBit(i);
         continue;
       }
       if (M == SM_SentinelZero)
@@ -27341,8 +27787,8 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
 
   // Extract constant bits from each source op.
   bool OneUseConstantOp = false;
-  SmallVector<SmallBitVector, 4> UndefEltsOps(NumOps);
-  SmallVector<SmallVector<APInt, 8>, 4> RawBitsOps(NumOps);
+  SmallVector<APInt, 16> UndefEltsOps(NumOps);
+  SmallVector<SmallVector<APInt, 16>, 16> RawBitsOps(NumOps);
   for (unsigned i = 0; i != NumOps; ++i) {
     SDValue SrcOp = Ops[i];
     OneUseConstantOp |= SrcOp.hasOneUse();
@@ -27358,18 +27804,18 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
     return false;
 
   // Shuffle the constant bits according to the mask.
-  SmallBitVector UndefElts(NumMaskElts, false);
-  SmallBitVector ZeroElts(NumMaskElts, false);
-  SmallBitVector ConstantElts(NumMaskElts, false);
+  APInt UndefElts(NumMaskElts, 0);
+  APInt ZeroElts(NumMaskElts, 0);
+  APInt ConstantElts(NumMaskElts, 0);
   SmallVector<APInt, 8> ConstantBitData(NumMaskElts,
                                         APInt::getNullValue(MaskSizeInBits));
   for (unsigned i = 0; i != NumMaskElts; ++i) {
     int M = Mask[i];
     if (M == SM_SentinelUndef) {
-      UndefElts[i] = true;
+      UndefElts.setBit(i);
       continue;
     } else if (M == SM_SentinelZero) {
-      ZeroElts[i] = true;
+      ZeroElts.setBit(i);
       continue;
     }
     assert(0 <= M && M < (int)(NumMaskElts * NumOps));
@@ -27379,21 +27825,21 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
 
     auto &SrcUndefElts = UndefEltsOps[SrcOpIdx];
     if (SrcUndefElts[SrcMaskIdx]) {
-      UndefElts[i] = true;
+      UndefElts.setBit(i);
       continue;
     }
 
     auto &SrcEltBits = RawBitsOps[SrcOpIdx];
     APInt &Bits = SrcEltBits[SrcMaskIdx];
     if (!Bits) {
-      ZeroElts[i] = true;
+      ZeroElts.setBit(i);
       continue;
     }
 
-    ConstantElts[i] = true;
+    ConstantElts.setBit(i);
     ConstantBitData[i] = Bits;
   }
-  assert((UndefElts | ZeroElts | ConstantElts).count() == NumMaskElts);
+  assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
 
   // Create the constant data.
   MVT MaskSVT;
@@ -27443,6 +27889,7 @@ static bool combineX86ShufflesConstants(const SmallVectorImpl<SDValue> &Ops,
 static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
                                           int SrcOpIndex, SDValue Root,
                                           ArrayRef<int> RootMask,
+                                          ArrayRef<const SDNode*> SrcNodes,
                                           int Depth, bool HasVariableMask,
                                           SelectionDAG &DAG,
                                           TargetLowering::DAGCombinerInfo &DCI,
@@ -27466,7 +27913,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
          "Can only combine shuffles of the same vector register size.");
 
   // Extract target shuffle mask and resolve sentinels and inputs.
-  SmallVector<int, 16> OpMask;
+  SmallVector<int, 64> OpMask;
   SmallVector<SDValue, 2> OpInputs;
   if (!resolveTargetShuffleInputs(Op, OpInputs, OpMask))
     return false;
@@ -27476,7 +27923,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
   SDValue Input1 = (OpInputs.size() > 1 ? OpInputs[1] : SDValue());
 
   // Add the inputs to the Ops list, avoiding duplicates.
-  SmallVector<SDValue, 8> Ops(SrcOps.begin(), SrcOps.end());
+  SmallVector<SDValue, 16> Ops(SrcOps.begin(), SrcOps.end());
 
   int InputIdx0 = -1, InputIdx1 = -1;
   for (int i = 0, e = Ops.size(); i < e; ++i) {
@@ -27509,8 +27956,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
           (RootRatio == 1) != (OpRatio == 1)) &&
          "Must not have a ratio for both incoming and op masks!");
 
-  SmallVector<int, 16> Mask;
-  Mask.reserve(MaskWidth);
+  SmallVector<int, 64> Mask((unsigned)MaskWidth, SM_SentinelUndef);
 
   // Merge this shuffle operation's mask into our accumulated mask. Note that
   // this shuffle's mask will be the first applied to the input, followed by the
@@ -27520,7 +27966,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
     int RootIdx = i / RootRatio;
     if (RootMask[RootIdx] < 0) {
       // This is a zero or undef lane, we're done.
-      Mask.push_back(RootMask[RootIdx]);
+      Mask[i] = RootMask[RootIdx];
       continue;
     }
 
@@ -27530,7 +27976,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
     // than the SrcOp we're currently inserting.
     if ((RootMaskedIdx < (SrcOpIndex * MaskWidth)) ||
         (((SrcOpIndex + 1) * MaskWidth) <= RootMaskedIdx)) {
-      Mask.push_back(RootMaskedIdx);
+      Mask[i] = RootMaskedIdx;
       continue;
     }
 
@@ -27540,7 +27986,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
     if (OpMask[OpIdx] < 0) {
       // The incoming lanes are zero or undef, it doesn't matter which ones we
       // are using.
-      Mask.push_back(OpMask[OpIdx]);
+      Mask[i] = OpMask[OpIdx];
       continue;
     }
 
@@ -27556,7 +28002,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
       OpMaskedIdx += InputIdx1 * MaskWidth;
     }
 
-    Mask.push_back(OpMaskedIdx);
+    Mask[i] = OpMaskedIdx;
   }
 
   // Handle the all undef/zero cases early.
@@ -27579,11 +28025,20 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
 
   HasVariableMask |= isTargetShuffleVariableMask(Op.getOpcode());
 
-  // See if we can recurse into each shuffle source op (if it's a target shuffle).
+  // Update the list of shuffle nodes that have been combined so far.
+  SmallVector<const SDNode *, 16> CombinedNodes(SrcNodes.begin(),
+                                                SrcNodes.end());
+  CombinedNodes.push_back(Op.getNode());
+
+  // See if we can recurse into each shuffle source op (if it's a target
+  // shuffle). The source op should only be combined if it either has a
+  // single use (i.e. current Op) or all its users have already been combined.
   for (int i = 0, e = Ops.size(); i < e; ++i)
-    if (Ops[i].getNode()->hasOneUse() || Op->isOnlyUserOf(Ops[i].getNode()))
-      if (combineX86ShufflesRecursively(Ops, i, Root, Mask, Depth + 1,
-                                        HasVariableMask, DAG, DCI, Subtarget))
+    if (Ops[i].getNode()->hasOneUse() ||
+        SDNode::areOnlyUsersOf(CombinedNodes, Ops[i].getNode()))
+      if (combineX86ShufflesRecursively(Ops, i, Root, Mask, CombinedNodes,
+                                        Depth + 1, HasVariableMask, DAG, DCI,
+                                        Subtarget))
         return true;
 
   // Attempt to constant fold all of the constant source ops.
@@ -27600,7 +28055,7 @@ static bool combineX86ShufflesRecursively(ArrayRef<SDValue> SrcOps,
   // elements, and shrink them to the half-width mask. It does this in a loop
   // so it will reduce the size of the mask to the minimal width mask which
   // performs an equivalent shuffle.
-  SmallVector<int, 16> WidenedMask;
+  SmallVector<int, 64> WidenedMask;
   while (Mask.size() > 1 && canWidenShuffleElements(Mask, WidenedMask)) {
     Mask = std::move(WidenedMask);
   }
@@ -27666,8 +28121,7 @@ static SmallVector<int, 4> getPSHUFShuffleMask(SDValue N) {
 /// altering anything.
 static SDValue
 combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
-                             SelectionDAG &DAG,
-                             TargetLowering::DAGCombinerInfo &DCI) {
+                             SelectionDAG &DAG) {
   assert(N.getOpcode() == X86ISD::PSHUFD &&
          "Called with something other than an x86 128-bit half shuffle!");
   SDLoc DL(N);
@@ -27947,19 +28401,20 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
   }
   case X86ISD::MOVSD:
   case X86ISD::MOVSS: {
-    bool isFloat = VT.isFloatingPoint();
     SDValue V0 = peekThroughBitcasts(N->getOperand(0));
     SDValue V1 = peekThroughBitcasts(N->getOperand(1));
-    bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
-    bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
     bool isZero0 = ISD::isBuildVectorAllZeros(V0.getNode());
     bool isZero1 = ISD::isBuildVectorAllZeros(V1.getNode());
-    assert(!(isZero0 && isZero1) && "Zeroable shuffle detected.");
+    if (isZero0 && isZero1)
+      return SDValue();
 
     // We often lower to MOVSD/MOVSS from integer as well as native float
     // types; remove unnecessary domain-crossing bitcasts if we can to make it
     // easier to combine shuffles later on. We've already accounted for the
     // domain switching cost when we decided to lower with it.
+    bool isFloat = VT.isFloatingPoint();
+    bool isFloat0 = V0.getSimpleValueType().isFloatingPoint();
+    bool isFloat1 = V1.getSimpleValueType().isFloatingPoint();
     if ((isFloat != isFloat0 || isZero0) && (isFloat != isFloat1 || isZero1)) {
       MVT NewVT = isFloat ? (X86ISD::MOVSD == Opcode ? MVT::v2i64 : MVT::v4i32)
                           : (X86ISD::MOVSD == Opcode ? MVT::v2f64 : MVT::v4f32);
@@ -28130,7 +28585,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
     break;
 
   case X86ISD::PSHUFD:
-    if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG, DCI))
+    if (SDValue NewN = combineRedundantDWordShuffle(N, Mask, DAG))
       return NewN;
 
     break;
@@ -28278,12 +28733,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
   SDLoc dl(N);
   EVT VT = N->getValueType(0);
-
-  // Don't create instructions with illegal types after legalize types has run.
   const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-  if (!DCI.isBeforeLegalize() && !TLI.isTypeLegal(VT.getVectorElementType()))
-    return SDValue();
-
   // If we have legalized the vector types, look for blends of FADD and FSUB
   // nodes that we can fuse into an ADDSUB node.
   if (TLI.isTypeLegal(VT))
@@ -28354,11 +28804,18 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
   // load4, <0, 1, 2, 3> into a 128-bit load if the load addresses are
   // consecutive, non-overlapping, and in the right order.
   SmallVector<SDValue, 16> Elts;
-  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i)
-    Elts.push_back(getShuffleScalarElt(N, i, DAG, 0));
+  for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+    if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
+      Elts.push_back(Elt);
+      continue;
+    }
+    Elts.clear();
+    break;
+  }
 
-  if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
-    return LD;
+  if (Elts.size() == VT.getVectorNumElements())
+    if (SDValue LD = EltsFromConsecutiveLoads(VT, Elts, dl, DAG, true))
+      return LD;
 
   // For AVX2, we sometimes want to combine
   // (vector_shuffle <mask> (concat_vectors t1, undef)
@@ -28381,7 +28838,7 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
     // a particular chain.
     SmallVector<int, 1> NonceMask; // Just a placeholder.
     NonceMask.push_back(0);
-    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                       /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                       DCI, Subtarget))
       return SDValue(); // This routine will use CombineTo to replace N.
@@ -28408,18 +28865,13 @@ static SDValue XFormVExtractWithShuffleIntoLoad(SDNode *N, SelectionDAG &DAG,
 
   EVT OriginalVT = InVec.getValueType();
 
-  if (InVec.getOpcode() == ISD::BITCAST) {
-    // Don't duplicate a load with other uses.
-    if (!InVec.hasOneUse())
-      return SDValue();
-    EVT BCVT = InVec.getOperand(0).getValueType();
-    if (!BCVT.isVector() ||
-        BCVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
-      return SDValue();
-    InVec = InVec.getOperand(0);
-  }
+  // Peek through bitcasts, don't duplicate a load with other uses.
+  InVec = peekThroughOneUseBitcasts(InVec);
 
   EVT CurrentVT = InVec.getValueType();
+  if (!CurrentVT.isVector() ||
+      CurrentVT.getVectorNumElements() != OriginalVT.getVectorNumElements())
+    return SDValue();
 
   if (!isTargetShuffle(InVec.getOpcode()))
     return SDValue();
@@ -28498,19 +28950,41 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
                               const X86Subtarget &Subtarget) {
   SDValue N0 = N->getOperand(0);
   EVT VT = N->getValueType(0);
+  EVT SrcVT = N0.getValueType();
 
-  // Detect bitcasts between i32 to x86mmx low word. Since MMX types are
-  // special and don't usually play with other vector types, it's better to
-  // handle them early to be sure we emit efficient code by avoiding
-  // store-load conversions.
+  // Since MMX types are special and don't usually play with other vector types,
+  // it's better to handle them early to be sure we emit efficient code by
+  // avoiding store-load conversions.
+
+  // Detect bitcasts between i32 to x86mmx low word.
   if (VT == MVT::x86mmx && N0.getOpcode() == ISD::BUILD_VECTOR &&
-      N0.getValueType() == MVT::v2i32 &&
-      isNullConstant(N0.getOperand(1))) {
+      SrcVT == MVT::v2i32 && isNullConstant(N0.getOperand(1))) {
     SDValue N00 = N0->getOperand(0);
     if (N00.getValueType() == MVT::i32)
       return DAG.getNode(X86ISD::MMX_MOVW2D, SDLoc(N00), VT, N00);
   }
 
+  // Detect bitcasts between element or subvector extraction to x86mmx.
+  if (VT == MVT::x86mmx &&
+      (N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+       N0.getOpcode() == ISD::EXTRACT_SUBVECTOR) &&
+      isNullConstant(N0.getOperand(1))) {
+    SDValue N00 = N0->getOperand(0);
+    if (N00.getValueType().is128BitVector())
+      return DAG.getNode(X86ISD::MOVDQ2Q, SDLoc(N00), VT,
+                         DAG.getBitcast(MVT::v2i64, N00));
+  }
+
+  // Detect bitcasts from FP_TO_SINT to x86mmx.
+  if (VT == MVT::x86mmx && SrcVT == MVT::v2i32 &&
+      N0.getOpcode() == ISD::FP_TO_SINT) {
+    SDLoc DL(N0);
+    SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v4i32, N0,
+                              DAG.getUNDEF(MVT::v2i32));
+    return DAG.getNode(X86ISD::MOVDQ2Q, DL, VT,
+                       DAG.getBitcast(MVT::v2i64, Res));
+  }
+
   // Convert a bitcasted integer logic operation that has one bitcasted
   // floating-point operand into a floating-point logic operation. This may
   // create a load of a constant, but that is cheaper than materializing the
@@ -28616,12 +29090,18 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
   if (SetCC.getOpcode() != ISD::SETCC)
     return false;
   ISD::CondCode CC = cast<CondCodeSDNode>(SetCC.getOperand(2))->get();
-  if (CC != ISD::SETGT)
+  if (CC != ISD::SETGT && CC != ISD::SETLT)
     return false;
 
   SDValue SelectOp1 = Select->getOperand(1);
   SDValue SelectOp2 = Select->getOperand(2);
 
+  // The following instructions assume SelectOp1 is the subtraction operand
+  // and SelectOp2 is the negation operand.
+  // In the case of SETLT this is the other way around.
+  if (CC == ISD::SETLT)
+    std::swap(SelectOp1, SelectOp2);
+
   // The second operand of the select should be the negation of the first
   // operand, which is implemented as 0 - SelectOp1.
   if (!(SelectOp2.getOpcode() == ISD::SUB &&
@@ -28634,8 +29114,17 @@ static bool detectZextAbsDiff(const SDValue &Select, SDValue &Op0,
   if (SetCC.getOperand(0) != SelectOp1)
     return false;
 
-  // The second operand of the comparison can be either -1 or 0.
-  if (!(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
+  // In the SETLT case, the second comparison operand can be either 1 or 0.
+  APInt SplatVal;
+  if ((CC == ISD::SETLT) &&
+      !((ISD::isConstantSplatVector(SetCC.getOperand(1).getNode(), SplatVal) &&
+         SplatVal == 1) ||
+        (ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()))))
+    return false;
+
+  // In the SETGT case, the second comparison operand can be either -1 or 0.
+  if ((CC == ISD::SETGT) &&
+      !(ISD::isBuildVectorAllZeros(SetCC.getOperand(1).getNode()) ||
         ISD::isBuildVectorAllOnes(SetCC.getOperand(1).getNode())))
     return false;
 
@@ -28681,17 +29170,92 @@ static SDValue createPSADBW(SelectionDAG &DAG, const SDValue &Zext0,
   return DAG.getNode(X86ISD::PSADBW, DL, SadVT, SadOp0, SadOp1);
 }
 
+// Attempt to replace an all_of/any_of style horizontal reduction with a MOVMSK.
+static SDValue combineHorizontalPredicateResult(SDNode *Extract,
+                                                SelectionDAG &DAG,
+                                                const X86Subtarget &Subtarget) {
+  // Bail without SSE2 or with AVX512VL (which uses predicate registers).
+  if (!Subtarget.hasSSE2() || Subtarget.hasVLX())
+    return SDValue();
+
+  EVT ExtractVT = Extract->getValueType(0);
+  unsigned BitWidth = ExtractVT.getSizeInBits();
+  if (ExtractVT != MVT::i64 && ExtractVT != MVT::i32 && ExtractVT != MVT::i16 &&
+      ExtractVT != MVT::i8)
+    return SDValue();
+
+  // Check for OR(any_of) and AND(all_of) horizontal reduction patterns.
+  for (ISD::NodeType Op : {ISD::OR, ISD::AND}) {
+    SDValue Match = matchBinOpReduction(Extract, Op);
+    if (!Match)
+      continue;
+
+    // EXTRACT_VECTOR_ELT can require implicit extension of the vector element
+    // which we can't support here for now.
+    if (Match.getScalarValueSizeInBits() != BitWidth)
+      continue;
+
+    // 256-bit vectors need AVX for 32/64-bit elements, else AVX2 (PMOVMSKB).
+    unsigned MatchSizeInBits = Match.getValueSizeInBits();
+    if (!(MatchSizeInBits == 128 ||
+          (MatchSizeInBits == 256 &&
+           ((Subtarget.hasAVX() && BitWidth >= 32) || Subtarget.hasAVX2()))))
+      return SDValue();
+
+    // Don't bother performing this for 2-element vectors.
+    if (Match.getValueType().getVectorNumElements() <= 2)
+      return SDValue();
+
+    // Check that we are extracting a reduction of all sign bits.
+    if (DAG.ComputeNumSignBits(Match) != BitWidth)
+      return SDValue();
+
+    // For 32/64 bit comparisons use MOVMSKPS/MOVMSKPD, else PMOVMSKB.
+    MVT MaskVT;
+    if (64 == BitWidth || 32 == BitWidth)
+      MaskVT = MVT::getVectorVT(MVT::getFloatingPointVT(BitWidth),
+                                MatchSizeInBits / BitWidth);
+    else
+      MaskVT = MVT::getVectorVT(MVT::i8, MatchSizeInBits / 8);
+
+    APInt CompareBits;
+    ISD::CondCode CondCode;
+    if (Op == ISD::OR) {
+      // any_of -> MOVMSK != 0
+      CompareBits = APInt::getNullValue(32);
+      CondCode = ISD::CondCode::SETNE;
+    } else {
+      // all_of -> MOVMSK == ((1 << NumElts) - 1)
+      CompareBits = APInt::getLowBitsSet(32, MaskVT.getVectorNumElements());
+      CondCode = ISD::CondCode::SETEQ;
+    }
+
+    // Perform the select as i32/i64 and then truncate to avoid partial register
+    // stalls.
+    unsigned ResWidth = std::max(BitWidth, 32u);
+    EVT ResVT = EVT::getIntegerVT(*DAG.getContext(), ResWidth);
+    SDLoc DL(Extract);
+    SDValue Zero = DAG.getConstant(0, DL, ResVT);
+    SDValue Ones = DAG.getAllOnesConstant(DL, ResVT);
+    SDValue Res = DAG.getBitcast(MaskVT, Match);
+    Res = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Res);
+    Res = DAG.getSelectCC(DL, Res, DAG.getConstant(CompareBits, DL, MVT::i32),
+                          Ones, Zero, CondCode);
+    return DAG.getSExtOrTrunc(Res, DL, ExtractVT);
+  }
+
+  return SDValue();
+}
+
 static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
                                       const X86Subtarget &Subtarget) {
   // PSADBW is only supported on SSE2 and up.
   if (!Subtarget.hasSSE2())
     return SDValue();
 
-  // Verify the type we're extracting from is appropriate
-  // TODO: There's nothing special about i32, any integer type above i16 should
-  // work just as well.
+  // Verify the type we're extracting from is any integer type above i16.
   EVT VT = Extract->getOperand(0).getValueType();
-  if (!VT.isSimple() || !(VT.getVectorElementType() == MVT::i32))
+  if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
     return SDValue();
 
   unsigned RegSize = 128;
@@ -28700,15 +29264,28 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
   else if (Subtarget.hasAVX2())
     RegSize = 256;
 
-  // We only handle v16i32 for SSE2 / v32i32 for AVX2 / v64i32 for AVX512.
+  // We handle up to v16i* for SSE2 / v32i* for AVX2 / v64i* for AVX512.
   // TODO: We should be able to handle larger vectors by splitting them before
   // feeding them into several SADs, and then reducing over those.
-  if (VT.getSizeInBits() / 4 > RegSize)
+  if (RegSize / VT.getVectorNumElements() < 8)
     return SDValue();
 
   // Match shuffle + add pyramid.
   SDValue Root = matchBinOpReduction(Extract, ISD::ADD);
 
+  // The operand is expected to be zero-extended from i8
+  // (verified in detectZextAbsDiff).
+  // To produce a result of i64 or wider, an additional any/zero/sign
+  // extend is expected on top of that.
+  // A zero extend from 32 bits has no mathematical effect on the result,
+  // and a sign extend behaves like a zero extend here
+  // (it extends the sign bit, which is zero).
+  // So it is correct to skip the any/sign/zero extend instruction.
+  if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
+	  Root.getOpcode() == ISD::ZERO_EXTEND ||
+	  Root.getOpcode() == ISD::ANY_EXTEND))
+    Root = Root.getOperand(0);
+
   // If there was a match, we want Root to be a select that is the root of an
   // abs-diff pattern.
   if (!Root || (Root.getOpcode() != ISD::VSELECT))
@@ -28719,7 +29296,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
   if (!detectZextAbsDiff(Root, Zext0, Zext1))
     return SDValue();
 
-  // Create the SAD instruction
+  // Create the SAD instruction.
   SDLoc DL(Extract);
   SDValue SAD = createPSADBW(DAG, Zext0, Zext1, DL);
 
@@ -28741,13 +29318,103 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
     }
   }
 
-  // Return the lowest i32.
-  MVT ResVT = MVT::getVectorVT(MVT::i32, SadVT.getSizeInBits() / 32);
+  MVT Type = Extract->getSimpleValueType(0);
+  unsigned TypeSizeInBits = Type.getSizeInBits();
+  // Return the lowest TypeSizeInBits bits.
+  MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
   SAD = DAG.getNode(ISD::BITCAST, DL, ResVT, SAD);
-  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, SAD,
+  return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
                      Extract->getOperand(1));
 }
 
+// Attempt to peek through a target shuffle and extract the scalar from the
+// source; returns an empty SDValue when no fold applies.
+static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
+                                         TargetLowering::DAGCombinerInfo &DCI,
+                                         const X86Subtarget &Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue Src = N->getOperand(0);
+  SDValue Idx = N->getOperand(1);
+
+  EVT VT = N->getValueType(0);
+  EVT SrcVT = Src.getValueType();
+  EVT SrcSVT = SrcVT.getVectorElementType();
+  unsigned NumSrcElts = SrcVT.getVectorNumElements();
+
+  // Don't attempt this for boolean mask vectors or unknown extraction indices.
+  if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
+    return SDValue();
+
+  // Resolve the target shuffle inputs and mask.
+  SmallVector<int, 16> Mask;
+  SmallVector<SDValue, 2> Ops;
+  if (!resolveTargetShuffleInputs(peekThroughBitcasts(Src), Ops, Mask))
+    return SDValue();
+
+  // Attempt to narrow/widen the shuffle mask to the correct size.
+  if (Mask.size() != NumSrcElts) {
+    if ((NumSrcElts % Mask.size()) == 0) {
+      SmallVector<int, 16> ScaledMask;
+      int Scale = NumSrcElts / Mask.size();
+      scaleShuffleMask(Scale, Mask, ScaledMask);
+      Mask = std::move(ScaledMask);
+    } else if ((Mask.size() % NumSrcElts) == 0) {
+      SmallVector<int, 16> WidenedMask;
+      while (Mask.size() > NumSrcElts &&
+             canWidenShuffleElements(Mask, WidenedMask))
+        Mask = std::move(WidenedMask);
+      // TODO - investigate support for wider shuffle masks with known upper
+      // undef/zero elements for implicit zero-extension.
+    }
+  }
+
+  // Bail if narrowing/widening failed or the extraction index is out of range.
+  if (Mask.size() != NumSrcElts || N->getConstantOperandVal(1) >= NumSrcElts)
+    return SDValue();
+
+  int SrcIdx = Mask[N->getConstantOperandVal(1)];
+  SDLoc dl(N);
+
+  // If the shuffle source element is undef/zero then we can just accept it.
+  if (SrcIdx == SM_SentinelUndef)
+    return DAG.getUNDEF(VT);
+
+  if (SrcIdx == SM_SentinelZero)
+    return VT.isFloatingPoint() ? DAG.getConstantFP(0.0, dl, VT)
+                                : DAG.getConstant(0, dl, VT);
+
+  SDValue SrcOp = Ops[SrcIdx / Mask.size()];
+  SrcOp = DAG.getBitcast(SrcVT, SrcOp);
+  SrcIdx = SrcIdx % Mask.size();
+
+  // We can only extract other elements from 128-bit vectors and in certain
+  // circumstances, depending on SSE-level.
+  // TODO: Investigate using extract_subvector for larger vectors.
+  // TODO: Investigate float/double extraction if it will be just stored.
+  if ((SrcVT == MVT::v4i32 || SrcVT == MVT::v2i64) &&
+      ((SrcIdx == 0 && Subtarget.hasSSE2()) || Subtarget.hasSSE41())) {
+    assert(SrcSVT == VT && "Unexpected extraction type");
+    return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SrcSVT, SrcOp,
+                       DAG.getIntPtrConstant(SrcIdx, dl));
+  }
+
+  if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
+      (SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
+    assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
+           "Unexpected extraction type");
+    unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
+    SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
+                                DAG.getIntPtrConstant(SrcIdx, dl));
+    SDValue Assert = DAG.getNode(ISD::AssertZext, dl, MVT::i32, ExtOp,
+                                 DAG.getValueType(SrcSVT));
+    return DAG.getZExtOrTrunc(Assert, dl, VT);
+  }
+
+  return SDValue();
+}
+
 /// Detect vector gather/scatter index generation and convert it from being a
 /// bunch of shuffles and extracts into a somewhat faster sequence.
 /// For i686, the best sequence is apparently storing the value and loading
@@ -28758,14 +29425,29 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
   if (SDValue NewOp = XFormVExtractWithShuffleIntoLoad(N, DAG, DCI))
     return NewOp;
 
+  if (SDValue NewOp = combineExtractWithShuffle(N, DAG, DCI, Subtarget))
+    return NewOp;
+
   SDValue InputVector = N->getOperand(0);
+  SDValue EltIdx = N->getOperand(1);
+
+  EVT SrcVT = InputVector.getValueType();
+  EVT VT = N->getValueType(0);
   SDLoc dl(InputVector);
+
+  // Detect mmx extraction of all bits as an i64. It works better as a bitcast.
+  if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
+      VT == MVT::i64 && SrcVT == MVT::v1i64 && isNullConstant(EltIdx)) {
+    SDValue MMXSrc = InputVector.getOperand(0);
+
+    // The bitcast source is a direct mmx result.
+    if (MMXSrc.getValueType() == MVT::x86mmx)
+      return DAG.getBitcast(VT, InputVector);
+  }
+
   // Detect mmx to i32 conversion through a v2i32 elt extract.
   if (InputVector.getOpcode() == ISD::BITCAST && InputVector.hasOneUse() &&
-      N->getValueType(0) == MVT::i32 &&
-      InputVector.getValueType() == MVT::v2i32 &&
-      isa<ConstantSDNode>(N->getOperand(1)) &&
-      N->getConstantOperandVal(1) == 0) {
+      VT == MVT::i32 && SrcVT == MVT::v2i32 && isNullConstant(EltIdx)) {
     SDValue MMXSrc = InputVector.getOperand(0);
 
     // The bitcast source is a direct mmx result.
@@ -28773,15 +29455,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
       return DAG.getNode(X86ISD::MMX_MOVD2W, dl, MVT::i32, MMXSrc);
   }
 
-  EVT VT = N->getValueType(0);
-
-  if (VT == MVT::i1 && isa<ConstantSDNode>(N->getOperand(1)) &&
-      InputVector.getOpcode() == ISD::BITCAST &&
+  if (VT == MVT::i1 && InputVector.getOpcode() == ISD::BITCAST &&
+      isa<ConstantSDNode>(EltIdx) &&
       isa<ConstantSDNode>(InputVector.getOperand(0))) {
-    uint64_t ExtractedElt =
-        cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
-    uint64_t InputValue =
-        cast<ConstantSDNode>(InputVector.getOperand(0))->getZExtValue();
+    uint64_t ExtractedElt = N->getConstantOperandVal(1);
+    uint64_t InputValue = InputVector.getConstantOperandVal(0);
     uint64_t Res = (InputValue >> ExtractedElt) & 1;
     return DAG.getConstant(Res, dl, MVT::i1);
   }
@@ -28792,9 +29470,13 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
   if (SDValue SAD = combineBasicSADPattern(N, DAG, Subtarget))
     return SAD;
 
+  // Attempt to replace an all_of/any_of horizontal reduction with a MOVMSK.
+  if (SDValue Cmp = combineHorizontalPredicateResult(N, DAG, Subtarget))
+    return Cmp;
+
   // Only operate on vectors of 4 elements, where the alternative shuffling
   // gets to be more expensive.
-  if (InputVector.getValueType() != MVT::v4i32)
+  if (SrcVT != MVT::v4i32)
     return SDValue();
 
   // Check whether every use of InputVector is an EXTRACT_VECTOR_ELT with a
@@ -28822,9 +29504,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
       return SDValue();
 
     // Record which element was extracted.
-    ExtractedElements |=
-      1 << cast<ConstantSDNode>(Extract->getOperand(1))->getZExtValue();
-
+    ExtractedElements |= 1 << Extract->getConstantOperandVal(1);
     Uses.push_back(Extract);
   }
 
@@ -28857,11 +29537,11 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
       DAG.getNode(ISD::SRA, dl, MVT::i64, TopHalf, ShAmt));
   } else {
     // Store the value to a temporary stack slot.
-    SDValue StackPtr = DAG.CreateStackTemporary(InputVector.getValueType());
+    SDValue StackPtr = DAG.CreateStackTemporary(SrcVT);
     SDValue Ch = DAG.getStore(DAG.getEntryNode(), dl, InputVector, StackPtr,
                               MachinePointerInfo());
 
-    EVT ElementType = InputVector.getValueType().getVectorElementType();
+    EVT ElementType = SrcVT.getVectorElementType();
     unsigned EltSize = ElementType.getSizeInBits() / 8;
 
     // Replace each use (extract) with a load of the appropriate element.
@@ -28884,8 +29564,7 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
     UE = Uses.end(); UI != UE; ++UI) {
     SDNode *Extract = *UI;
 
-    SDValue Idx = Extract->getOperand(1);
-    uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+    uint64_t IdxVal = Extract->getConstantOperandVal(1);
     DAG.ReplaceAllUsesOfValueWith(SDValue(Extract, 0), Vals[IdxVal]);
   }
 
@@ -28893,6 +29572,16 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+// TODO - merge with combineExtractVectorElt once it can handle the implicit
+// zero-extension of X86ISD::PEXTRW/X86ISD::PEXTRB in:
+// XFormVExtractWithShuffleIntoLoad, combineHorizontalPredicateResult and
+// combineBasicSADPattern.
+static SDValue combineExtractVectorElt_SSE(SDNode *N, SelectionDAG &DAG,
+                                           TargetLowering::DAGCombinerInfo &DCI,
+                                           const X86Subtarget &Subtarget) {
+  return combineExtractWithShuffle(N, DAG, DCI, Subtarget);
+}
+
 /// If a vector select has an operand that is -1 or 0, try to simplify the
 /// select to a bitwise logic operation.
 static SDValue
@@ -28917,12 +29606,11 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
   // This situation only applies to avx512.
   if (FValIsAllZeros  && Subtarget.hasAVX512() && Cond.hasOneUse() &&
       CondVT.getVectorElementType() == MVT::i1) {
-      //Invert the cond to not(cond) : xor(op,allones)=not(op)
-      SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
-        DAG.getConstant(APInt::getAllOnesValue(CondVT.getScalarSizeInBits()),
-                        DL, CondVT));
-      //Vselect cond, op1, op2 = Vselect not(cond), op2, op1
-      return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
+    // Invert the cond to not(cond) : xor(op,allones)=not(op)
+    SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
+                                  DAG.getAllOnesConstant(DL, CondVT));
+    // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
+    return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
   }
 
   // To use the condition operand as a bitwise mask, it must have elements that
@@ -29025,18 +29713,6 @@ static SDValue combineSelectOfTwoConstants(SDNode *N, SelectionDAG &DAG) {
                        DAG.getConstant(ShAmt, DL, MVT::i8));
   }
 
-  // Optimize Cond ? cst+1 : cst -> zext(setcc(C)+cst.
-  if (FalseC->getAPIntValue() + 1 == TrueC->getAPIntValue()) {
-    if (NeedsCondInvert) // Invert the condition if needed.
-      Cond = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
-                         DAG.getConstant(1, DL, Cond.getValueType()));
-
-    // Zero extend the condition if needed.
-    Cond = DAG.getNode(ISD::ZERO_EXTEND, DL, FalseC->getValueType(0), Cond);
-    return DAG.getNode(ISD::ADD, DL, Cond.getValueType(), Cond,
-                       SDValue(FalseC, 0));
-  }
-
   // Optimize cases that will turn into an LEA instruction.  This requires
   // an i32 or i64 and an efficient multiplier (1, 2, 3, 4, 5, 8, 9).
   if (N->getValueType(0) == MVT::i32 || N->getValueType(0) == MVT::i64) {
@@ -29491,8 +30167,8 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
 
   // If this is a *dynamic* select (non-constant condition) and we can match
   // this node with one of the variable blend instructions, restructure the
-  // condition so that the blends can use the high bit of each element and use
-  // SimplifyDemandedBits to simplify the condition operand.
+  // condition so that blends can use the high (sign) bit of each element and
+  // use SimplifyDemandedBits to simplify the condition operand.
   if (N->getOpcode() == ISD::VSELECT && DCI.isBeforeLegalizeOps() &&
       !DCI.isBeforeLegalize() &&
       !ISD::isBuildVectorOfConstantSDNodes(Cond.getNode())) {
@@ -29527,49 +30203,45 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
       return SDValue();
 
     assert(BitWidth >= 8 && BitWidth <= 64 && "Invalid mask size");
-    APInt DemandedMask = APInt::getHighBitsSet(BitWidth, 1);
-
+    APInt DemandedMask(APInt::getSignBit(BitWidth));
     APInt KnownZero, KnownOne;
     TargetLowering::TargetLoweringOpt TLO(DAG, DCI.isBeforeLegalize(),
                                           DCI.isBeforeLegalizeOps());
     if (TLO.ShrinkDemandedConstant(Cond, DemandedMask) ||
         TLI.SimplifyDemandedBits(Cond, DemandedMask, KnownZero, KnownOne,
                                  TLO)) {
-      // If we changed the computation somewhere in the DAG, this change
-      // will affect all users of Cond.
-      // Make sure it is fine and update all the nodes so that we do not
-      // use the generic VSELECT anymore. Otherwise, we may perform
-      // wrong optimizations as we messed up with the actual expectation
+      // If we changed the computation somewhere in the DAG, this change will
+      // affect all users of Cond. Make sure it is fine and update all the nodes
+      // so that we do not use the generic VSELECT anymore. Otherwise, we may
+      // perform wrong optimizations as we messed with the actual expectation
       // for the vector boolean values.
       if (Cond != TLO.Old) {
-        // Check all uses of that condition operand to check whether it will be
-        // consumed by non-BLEND instructions, which may depend on all bits are
-        // set properly.
-        for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
-             I != E; ++I)
-          if (I->getOpcode() != ISD::VSELECT)
-            // TODO: Add other opcodes eventually lowered into BLEND.
+        // Check all uses of the condition operand to check whether it will be
+        // consumed by non-BLEND instructions. Those may require that all bits
+        // are set properly.
+        for (SDNode *U : Cond->uses()) {
+          // TODO: Add other opcodes eventually lowered into BLEND.
+          if (U->getOpcode() != ISD::VSELECT)
             return SDValue();
+        }
 
-        // Update all the users of the condition, before committing the change,
-        // so that the VSELECT optimizations that expect the correct vector
-        // boolean value will not be triggered.
-        for (SDNode::use_iterator I = Cond->use_begin(), E = Cond->use_end();
-             I != E; ++I)
-          DAG.ReplaceAllUsesOfValueWith(
-              SDValue(*I, 0),
-              DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(*I), I->getValueType(0),
-                          Cond, I->getOperand(1), I->getOperand(2)));
+        // Update all users of the condition before committing the change, so
+        // that the VSELECT optimizations that expect the correct vector boolean
+        // value will not be triggered.
+        for (SDNode *U : Cond->uses()) {
+          SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(U),
+                                   U->getValueType(0), Cond, U->getOperand(1),
+                                   U->getOperand(2));
+          DAG.ReplaceAllUsesOfValueWith(SDValue(U, 0), SB);
+        }
         DCI.CommitTargetLoweringOpt(TLO);
         return SDValue();
       }
-      // At this point, only Cond is changed. Change the condition
-      // just for N to keep the opportunity to optimize all other
-      // users their own way.
-      DAG.ReplaceAllUsesOfValueWith(
-          SDValue(N, 0),
-          DAG.getNode(X86ISD::SHRUNKBLEND, SDLoc(N), N->getValueType(0),
-                      TLO.New, N->getOperand(1), N->getOperand(2)));
+      // Only Cond (rather than other nodes in the computation chain) was
+      // changed. Change the condition just for N to keep the opportunity to
+      // optimize all other users their own way.
+      SDValue SB = DAG.getNode(X86ISD::SHRUNKBLEND, DL, VT, TLO.New, LHS, RHS);
+      DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), SB);
       return SDValue();
     }
   }
@@ -30532,21 +31204,24 @@ static SDValue combineShift(SDNode* N, SelectionDAG &DAG,
   return SDValue();
 }
 
-static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
-                                  TargetLowering::DAGCombinerInfo &DCI,
-                                  const X86Subtarget &Subtarget) {
+static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
+                                     TargetLowering::DAGCombinerInfo &DCI,
+                                     const X86Subtarget &Subtarget) {
   unsigned Opcode = N->getOpcode();
   assert((X86ISD::VSHLI == Opcode || X86ISD::VSRAI == Opcode ||
           X86ISD::VSRLI == Opcode) &&
          "Unexpected shift opcode");
   bool LogicalShift = X86ISD::VSHLI == Opcode || X86ISD::VSRLI == Opcode;
   EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
   unsigned NumBitsPerElt = VT.getScalarSizeInBits();
-  assert((NumBitsPerElt % 8) == 0);
+  assert(VT == N0.getValueType() && (NumBitsPerElt % 8) == 0 &&
+         "Unexpected value type");
 
   // Out of range logical bit shifts are guaranteed to be zero.
   // Out of range arithmetic bit shifts splat the sign bit.
-  APInt ShiftVal = cast<ConstantSDNode>(N->getOperand(1))->getAPIntValue();
+  APInt ShiftVal = cast<ConstantSDNode>(N1)->getAPIntValue();
   if (ShiftVal.zextOrTrunc(8).uge(NumBitsPerElt)) {
     if (LogicalShift)
       return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
@@ -30554,8 +31229,6 @@ static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
       ShiftVal = NumBitsPerElt - 1;
   }
 
-  SDValue N0 = N->getOperand(0);
-
   // Shift N0 by zero -> N0.
   if (!ShiftVal)
     return N0;
@@ -30564,19 +31237,26 @@ static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
   if (ISD::isBuildVectorAllZeros(N0.getNode()))
     return getZeroVector(VT.getSimpleVT(), Subtarget, DAG, SDLoc(N));
 
+  // fold (VSRLI (VSRAI X, Y), EltBits-1) -> (VSRLI X, EltBits-1).
+  // This VSRLI only looks at the sign bit, which is unmodified by VSRAI.
+  // TODO - support other sra opcodes as needed.
+  if (Opcode == X86ISD::VSRLI && (ShiftVal + 1) == NumBitsPerElt &&
+      N0.getOpcode() == X86ISD::VSRAI)
+    return DAG.getNode(X86ISD::VSRLI, SDLoc(N), VT, N0.getOperand(0), N1);
+
   // We can decode 'whole byte' logical bit shifts as shuffles.
   if (LogicalShift && (ShiftVal.getZExtValue() % 8) == 0) {
     SDValue Op(N, 0);
     SmallVector<int, 1> NonceMask; // Just a placeholder.
     NonceMask.push_back(0);
-    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                       /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                       DCI, Subtarget))
       return SDValue(); // This routine will use CombineTo to replace N.
   }
 
   // Constant Folding.
-  SmallBitVector UndefElts;
+  APInt UndefElts;
   SmallVector<APInt, 32> EltBits;
   if (N->isOnlyUserOf(N0.getNode()) &&
       getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
@@ -30597,6 +31277,25 @@ static SDValue combineVectorShift(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const X86Subtarget &Subtarget) {
+  assert(
+      ((N->getOpcode() == X86ISD::PINSRB && N->getValueType(0) == MVT::v16i8) ||
+       (N->getOpcode() == X86ISD::PINSRW &&
+        N->getValueType(0) == MVT::v8i16)) &&
+      "Unexpected vector insertion");
+
+  // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
+  SDValue Op(N, 0);
+  SmallVector<int, 1> NonceMask; // Just a placeholder.
+  NonceMask.push_back(0);
+  combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
+                                /*Depth*/ 1, /*HasVarMask*/ false, DAG,
+                                DCI, Subtarget);
+  return SDValue();
+}
+
 /// Recognize the distinctive (AND (setcc ...) (setcc ..)) where both setccs
 /// reference the same FP CMP, and rewrite for CMPEQSS and friends. Likewise for
 /// OR -> CMPNEQSS.
@@ -30840,38 +31539,34 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-/// If this is a PCMPEQ or PCMPGT result that is bitwise-anded with 1 (this is
-/// the x86 lowering of a SETCC + ZEXT), replace the 'and' with a shift-right to
-/// eliminate loading the vector constant mask value. This relies on the fact
-/// that a PCMP always creates an all-ones or all-zeros bitmask per element.
-static SDValue combinePCMPAnd1(SDNode *N, SelectionDAG &DAG) {
+/// If this is a zero/all-bits result that is bitwise-anded with a low bits
+/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
+/// with a shift-right to eliminate loading the vector constant mask value.
+static SDValue combineAndMaskToShift(SDNode *N, SelectionDAG &DAG,
+                                     const X86Subtarget &Subtarget) {
   SDValue Op0 = peekThroughBitcasts(N->getOperand(0));
   SDValue Op1 = peekThroughBitcasts(N->getOperand(1));
+  EVT VT0 = Op0.getValueType();
+  EVT VT1 = Op1.getValueType();
 
-  // TODO: Use AssertSext to mark any nodes that have the property of producing
-  // all-ones or all-zeros. Then check for that node rather than particular
-  // opcodes.
-  if (Op0.getOpcode() != X86ISD::PCMPEQ && Op0.getOpcode() != X86ISD::PCMPGT)
+  if (VT0 != VT1 || !VT0.isSimple() || !VT0.isInteger())
     return SDValue();
 
-  // The existence of the PCMP node guarantees that we have the required SSE2 or
-  // AVX2 for a shift of this vector type, but there is no vector shift by
-  // immediate for a vector with byte elements (PSRLB). 512-bit vectors use the
-  // masked compare nodes, so they should not make it here.
-  EVT VT0 = Op0.getValueType();
-  EVT VT1 = Op1.getValueType();
-  unsigned EltBitWidth = VT0.getScalarSizeInBits();
-  if (VT0 != VT1 || EltBitWidth == 8)
+  APInt SplatVal;
+  if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) ||
+      !SplatVal.isMask())
     return SDValue();
 
-  assert(VT0.getSizeInBits() == 128 || VT0.getSizeInBits() == 256);
+  if (!SupportedVectorShiftWithImm(VT0.getSimpleVT(), Subtarget, ISD::SRL))
+    return SDValue();
 
-  APInt SplatVal;
-  if (!ISD::isConstantSplatVector(Op1.getNode(), SplatVal) || SplatVal != 1)
+  unsigned EltBitWidth = VT0.getScalarSizeInBits();
+  if (EltBitWidth != DAG.ComputeNumSignBits(Op0))
     return SDValue();
 
   SDLoc DL(N);
-  SDValue ShAmt = DAG.getConstant(EltBitWidth - 1, DL, MVT::i8);
+  unsigned ShiftVal = SplatVal.countTrailingOnes();
+  SDValue ShAmt = DAG.getConstant(EltBitWidth - ShiftVal, DL, MVT::i8);
   SDValue Shift = DAG.getNode(X86ISD::VSRLI, DL, VT0, Op0, ShAmt);
   return DAG.getBitcast(N->getValueType(0), Shift);
 }
@@ -30891,7 +31586,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
   if (SDValue R = combineANDXORWithAllOnesIntoANDNP(N, DAG))
     return R;
 
-  if (SDValue ShiftRight = combinePCMPAnd1(N, DAG))
+  if (SDValue ShiftRight = combineAndMaskToShift(N, DAG, Subtarget))
     return ShiftRight;
 
   EVT VT = N->getValueType(0);
@@ -30904,7 +31599,7 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
     SDValue Op(N, 0);
     SmallVector<int, 1> NonceMask; // Just a placeholder.
     NonceMask.push_back(0);
-    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                       /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                       DCI, Subtarget))
       return SDValue(); // This routine will use CombineTo to replace N.
@@ -31113,7 +31808,7 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
     return N->getOpcode() == X86ISD::SETCC && N->hasOneUse() &&
            X86::CondCode(N->getConstantOperandVal(0)) == X86::COND_E &&
            N->getOperand(1).getOpcode() == X86ISD::CMP &&
-           N->getOperand(1).getConstantOperandVal(1) == 0 &&
+           isNullConstant(N->getOperand(1).getOperand(1)) &&
            N->getOperand(1).getValueType().bitsGE(MVT::i32);
   };
 
@@ -31454,7 +32149,7 @@ static SDValue detectUSatPattern(SDValue In, EVT VT) {
   if (ISD::isConstantSplatVector(In.getOperand(1).getNode(), C)) {
     // C should be equal to UINT32_MAX / UINT16_MAX / UINT8_MAX according
     // the element size of the destination type.
-    return APIntOps::isMask(VT.getScalarSizeInBits(), C) ? In.getOperand(0) :
+    return C.isMask(VT.getScalarSizeInBits()) ? In.getOperand(0) :
       SDValue();
   }
   return SDValue();
@@ -31876,7 +32571,7 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
                                      Mld->getBasePtr(), NewMask, WideSrc0,
                                      Mld->getMemoryVT(), Mld->getMemOperand(),
                                      ISD::NON_EXTLOAD);
-  SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
+  SDValue NewVec = getExtendInVec(X86ISD::VSEXT, dl, VT, WideLd, DAG);
   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
 }
 
@@ -33062,7 +33757,7 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
     SDValue Op(N, 0);
     SmallVector<int, 1> NonceMask; // Just a placeholder.
     NonceMask.push_back(0);
-    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask,
+    if (combineX86ShufflesRecursively({Op}, 0, Op, NonceMask, {},
                                       /*Depth*/ 1, /*HasVarMask*/ false, DAG,
                                       DCI, Subtarget))
       return SDValue(); // This routine will use CombineTo to replace N.
@@ -33332,13 +34027,22 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
   if (!DCI.isBeforeLegalizeOps()) {
     if (InVT == MVT::i1) {
       SDValue Zero = DAG.getConstant(0, DL, VT);
-      SDValue AllOnes =
-          DAG.getConstant(APInt::getAllOnesValue(VT.getSizeInBits()), DL, VT);
+      SDValue AllOnes = DAG.getAllOnesConstant(DL, VT);
       return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero);
     }
     return SDValue();
   }
 
+  if (InVT == MVT::i1 && N0.getOpcode() == ISD::XOR &&
+      isAllOnesConstant(N0.getOperand(1)) && N0.hasOneUse()) {
+    // Inverting and sign-extending a boolean is the same as zero-extending it
+    // and subtracting 1, because 0 becomes -1 and 1 becomes 0. The subtract is
+    // efficiently lowered with an LEA or a DEC. Same as: select Bool, 0, -1.
+    // sext (xor Bool, -1) --> sub (zext Bool), 1
+    SDValue Zext = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N0.getOperand(0));
+    return DAG.getNode(ISD::SUB, DL, VT, Zext, DAG.getConstant(1, DL, VT));
+  }
+
   if (SDValue V = combineToExtendVectorInReg(N, DAG, DCI, Subtarget))
     return V;
 
@@ -33479,8 +34183,47 @@ static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-/// Optimize x == -y --> x+y == 0
-///          x != -y --> x+y != 0
+/// Try to map a 128-bit or larger integer comparison to vector instructions
+/// before type legalization splits it up into chunks.
+static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
+                                               const X86Subtarget &Subtarget) {
+  ISD::CondCode CC = cast<CondCodeSDNode>(SetCC->getOperand(2))->get();
+  assert((CC == ISD::SETNE || CC == ISD::SETEQ) && "Bad comparison predicate");
+
+  // We're looking for an oversized integer equality comparison, but ignore a
+  // comparison with zero because that gets special treatment in EmitTest().
+  SDValue X = SetCC->getOperand(0);
+  SDValue Y = SetCC->getOperand(1);
+  EVT OpVT = X.getValueType();
+  unsigned OpSize = OpVT.getSizeInBits();
+  if (!OpVT.isScalarInteger() || OpSize < 128 || isNullConstant(Y))
+    return SDValue();
+
+  // TODO: Use PXOR + PTEST for SSE4.1 or later?
+  // TODO: Add support for AVX-512.
+  EVT VT = SetCC->getValueType(0);
+  SDLoc DL(SetCC);
+  if ((OpSize == 128 && Subtarget.hasSSE2()) ||
+      (OpSize == 256 && Subtarget.hasAVX2())) {
+    EVT VecVT = OpSize == 128 ? MVT::v16i8 : MVT::v32i8;
+    SDValue VecX = DAG.getBitcast(VecVT, X);
+    SDValue VecY = DAG.getBitcast(VecVT, Y);
+
+    // If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
+    // setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
+    // setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
+    // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
+    // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
+    SDValue Cmp = DAG.getNode(X86ISD::PCMPEQ, DL, VecVT, VecX, VecY);
+    SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
+    SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
+                                    MVT::i32);
+    return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
+  }
+
+  return SDValue();
+}
+
 static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
                             const X86Subtarget &Subtarget) {
   ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
@@ -33489,21 +34232,27 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
   EVT VT = N->getValueType(0);
   SDLoc DL(N);
 
-  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && LHS.getOpcode() == ISD::SUB)
-    if (isNullConstant(LHS.getOperand(0)) && LHS.hasOneUse()) {
-      SDValue addV = DAG.getNode(ISD::ADD, DL, LHS.getValueType(), RHS,
-                                 LHS.getOperand(1));
-      return DAG.getSetCC(DL, N->getValueType(0), addV,
-                          DAG.getConstant(0, DL, addV.getValueType()), CC);
+  if (CC == ISD::SETNE || CC == ISD::SETEQ) {
+    EVT OpVT = LHS.getValueType();
+    // 0-x == y --> x+y == 0
+    // 0-x != y --> x+y != 0
+    if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
+        LHS.hasOneUse()) {
+      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
+      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
     }
-  if ((CC == ISD::SETNE || CC == ISD::SETEQ) && RHS.getOpcode() == ISD::SUB)
-    if (isNullConstant(RHS.getOperand(0)) && RHS.hasOneUse()) {
-      SDValue addV = DAG.getNode(ISD::ADD, DL, RHS.getValueType(), LHS,
-                                 RHS.getOperand(1));
-      return DAG.getSetCC(DL, N->getValueType(0), addV,
-                          DAG.getConstant(0, DL, addV.getValueType()), CC);
+    // x == 0-y --> x+y == 0
+    // x != 0-y --> x+y != 0
+    if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
+        RHS.hasOneUse()) {
+      SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
+      return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
     }
 
+    if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
+      return V;
+  }
+
   if (VT.getScalarType() == MVT::i1 &&
       (CC == ISD::SETNE || CC == ISD::SETEQ || ISD::isSignedIntSetCC(CC))) {
     bool IsSEXT0 =
@@ -33560,56 +34309,13 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG) {
   return SDValue();
 }
 
-// Helper function of performSETCCCombine. It is to materialize "setb reg"
-// as "sbb reg,reg", since it can be extended without zext and produces
-// an all-ones bit which is more useful than 0/1 in some cases.
-static SDValue MaterializeSETB(const SDLoc &DL, SDValue EFLAGS,
-                               SelectionDAG &DAG, MVT VT) {
-  if (VT == MVT::i8)
-    return DAG.getNode(ISD::AND, DL, VT,
-                       DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
-                                   DAG.getConstant(X86::COND_B, DL, MVT::i8),
-                                   EFLAGS),
-                       DAG.getConstant(1, DL, VT));
-  assert (VT == MVT::i1 && "Unexpected type for SECCC node");
-  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1,
-                     DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8,
-                                 DAG.getConstant(X86::COND_B, DL, MVT::i8),
-                                 EFLAGS));
-}
-
 // Optimize  RES = X86ISD::SETCC CONDCODE, EFLAG_INPUT
 static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
-                               TargetLowering::DAGCombinerInfo &DCI,
                                const X86Subtarget &Subtarget) {
   SDLoc DL(N);
   X86::CondCode CC = X86::CondCode(N->getConstantOperandVal(0));
   SDValue EFLAGS = N->getOperand(1);
 
-  if (CC == X86::COND_A) {
-    // Try to convert COND_A into COND_B in an attempt to facilitate
-    // materializing "setb reg".
-    //
-    // Do not flip "e > c", where "c" is a constant, because Cmp instruction
-    // cannot take an immediate as its first operand.
-    //
-    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
-        EFLAGS.getValueType().isInteger() &&
-        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
-      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
-                                   EFLAGS.getNode()->getVTList(),
-                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
-      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
-      return MaterializeSETB(DL, NewEFLAGS, DAG, N->getSimpleValueType(0));
-    }
-  }
-
-  // Materialize "setb reg" as "sbb reg,reg", since it can be extended without
-  // a zext and produces an all-ones bit which is more useful than 0/1 in some
-  // cases.
-  if (CC == X86::COND_B)
-    return MaterializeSETB(DL, EFLAGS, DAG, N->getSimpleValueType(0));
-
   // Try to simplify the EFLAGS and condition code operands.
   if (SDValue Flags = combineSetCCEFLAGS(EFLAGS, CC, DAG))
     return getSETCC(CC, Flags, DL, DAG);
@@ -33619,7 +34325,6 @@ static SDValue combineX86SetCC(SDNode *N, SelectionDAG &DAG,
 
 /// Optimize branch condition evaluation.
 static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
-                             TargetLowering::DAGCombinerInfo &DCI,
                              const X86Subtarget &Subtarget) {
   SDLoc DL(N);
   SDValue EFLAGS = N->getOperand(3);
@@ -33805,45 +34510,159 @@ static SDValue combineADC(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
-/// fold (add Y, (sete  X, 0)) -> adc  0, Y
-///      (add Y, (setne X, 0)) -> sbb -1, Y
-///      (sub (sete  X, 0), Y) -> sbb  0, Y
-///      (sub (setne X, 0), Y) -> adc -1, Y
-static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
+/// Materialize "setb reg" as "sbb reg,reg", since it produces an all-ones bit
+/// which is more useful than 0/1 in some cases.
+static SDValue materializeSBB(SDNode *N, SDValue EFLAGS, SelectionDAG &DAG) {
   SDLoc DL(N);
+  // "Condition code B" is also known as "the carry flag" (CF).
+  SDValue CF = DAG.getConstant(X86::COND_B, DL, MVT::i8);
+  SDValue SBB = DAG.getNode(X86ISD::SETCC_CARRY, DL, MVT::i8, CF, EFLAGS);
+  MVT VT = N->getSimpleValueType(0);
+  if (VT == MVT::i8)
+    return DAG.getNode(ISD::AND, DL, VT, SBB, DAG.getConstant(1, DL, VT));
 
-  // Look through ZExts.
-  SDValue Ext = N->getOperand(N->getOpcode() == ISD::SUB ? 1 : 0);
-  if (Ext.getOpcode() != ISD::ZERO_EXTEND || !Ext.hasOneUse())
-    return SDValue();
+  assert(VT == MVT::i1 && "Unexpected type for SETCC node");
+  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, SBB);
+}
+
+/// If this is an add or subtract where one operand is produced by a cmp+setcc,
+/// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB}
+/// with CMP+{ADC, SBB}.
+static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
+  bool IsSub = N->getOpcode() == ISD::SUB;
+  SDValue X = N->getOperand(0);
+  SDValue Y = N->getOperand(1);
+
+  // If this is an add, canonicalize a zext operand to the RHS.
+  // TODO: Incomplete? What if both sides are zexts?
+  if (!IsSub && X.getOpcode() == ISD::ZERO_EXTEND &&
+      Y.getOpcode() != ISD::ZERO_EXTEND)
+    std::swap(X, Y);
+
+  // Look through a one-use zext.
+  bool PeekedThroughZext = false;
+  if (Y.getOpcode() == ISD::ZERO_EXTEND && Y.hasOneUse()) {
+    Y = Y.getOperand(0);
+    PeekedThroughZext = true;
+  }
 
-  SDValue SetCC = Ext.getOperand(0);
-  if (SetCC.getOpcode() != X86ISD::SETCC || !SetCC.hasOneUse())
+  // If this is an add, canonicalize a setcc operand to the RHS.
+  // TODO: Incomplete? What if both sides are setcc?
+  // TODO: Should we allow peeking through a zext of the other operand?
+  if (!IsSub && !PeekedThroughZext && X.getOpcode() == X86ISD::SETCC &&
+      Y.getOpcode() != X86ISD::SETCC)
+    std::swap(X, Y);
+
+  if (Y.getOpcode() != X86ISD::SETCC || !Y.hasOneUse())
     return SDValue();
 
-  X86::CondCode CC = (X86::CondCode)SetCC.getConstantOperandVal(0);
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  X86::CondCode CC = (X86::CondCode)Y.getConstantOperandVal(0);
+
+  if (CC == X86::COND_B) {
+    // X + SETB Z --> X + (mask SBB Z, Z)
+    // X - SETB Z --> X - (mask SBB Z, Z)
+    // TODO: Produce ADC/SBB here directly and avoid SETCC_CARRY?
+    SDValue SBB = materializeSBB(Y.getNode(), Y.getOperand(1), DAG);
+    if (SBB.getValueSizeInBits() != VT.getSizeInBits())
+      SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
+    return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
+  }
+
+  if (CC == X86::COND_A) {
+    SDValue EFLAGS = Y->getOperand(1);
+    // Try to convert COND_A into COND_B in an attempt to facilitate
+    // materializing "setb reg".
+    //
+    // Do not flip "e > c", where "c" is a constant, because Cmp instruction
+    // cannot take an immediate as its first operand.
+    //
+    if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.hasOneUse() &&
+        EFLAGS.getValueType().isInteger() &&
+        !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+      SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
+                                   EFLAGS.getNode()->getVTList(),
+                                   EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+      SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+      SDValue SBB = materializeSBB(Y.getNode(), NewEFLAGS, DAG);
+      if (SBB.getValueSizeInBits() != VT.getSizeInBits())
+        SBB = DAG.getZExtOrTrunc(SBB, DL, VT);
+      return DAG.getNode(IsSub ? ISD::SUB : ISD::ADD, DL, VT, X, SBB);
+    }
+  }
+
   if (CC != X86::COND_E && CC != X86::COND_NE)
     return SDValue();
 
-  SDValue Cmp = SetCC.getOperand(1);
+  SDValue Cmp = Y.getOperand(1);
   if (Cmp.getOpcode() != X86ISD::CMP || !Cmp.hasOneUse() ||
       !X86::isZeroNode(Cmp.getOperand(1)) ||
       !Cmp.getOperand(0).getValueType().isInteger())
     return SDValue();
 
-  SDValue CmpOp0 = Cmp.getOperand(0);
-  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, CmpOp0,
-                               DAG.getConstant(1, DL, CmpOp0.getValueType()));
+  // (cmp Z, 1) sets the carry flag if Z is 0.
+  SDValue Z = Cmp.getOperand(0);
+  SDValue NewCmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z,
+                               DAG.getConstant(1, DL, Z.getValueType()));
+
+  SDVTList VTs = DAG.getVTList(N->getValueType(0), MVT::i32);
 
-  SDValue OtherVal = N->getOperand(N->getOpcode() == ISD::SUB ? 0 : 1);
+  // X - (Z != 0) --> sub X, (zext(setne Z, 0)) --> adc X, -1, (cmp Z, 1)
+  // X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
   if (CC == X86::COND_NE)
-    return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::ADC : X86ISD::SBB,
-                       DL, OtherVal.getValueType(), OtherVal,
-                       DAG.getConstant(-1ULL, DL, OtherVal.getValueType()),
-                       NewCmp);
-  return DAG.getNode(N->getOpcode() == ISD::SUB ? X86ISD::SBB : X86ISD::ADC,
-                     DL, OtherVal.getValueType(), OtherVal,
-                     DAG.getConstant(0, DL, OtherVal.getValueType()), NewCmp);
+    return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
+                       DAG.getConstant(-1ULL, DL, VT), NewCmp);
+
+  // X - (Z == 0) --> sub X, (zext(sete  Z, 0)) --> sbb X, 0, (cmp Z, 1)
+  // X + (Z == 0) --> add X, (zext(sete  Z, 0)) --> adc X, 0, (cmp Z, 1)
+  return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
+                     DAG.getConstant(0, DL, VT), NewCmp);
+}
+
+static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
+                                      const X86Subtarget &Subtarget) {
+  SDValue MulOp = N->getOperand(0);
+  SDValue Phi = N->getOperand(1);
+
+  if (MulOp.getOpcode() != ISD::MUL)
+    std::swap(MulOp, Phi);
+  if (MulOp.getOpcode() != ISD::MUL)
+    return SDValue();
+
+  ShrinkMode Mode;
+  if (!canReduceVMulWidth(MulOp.getNode(), DAG, Mode))
+    return SDValue();
+
+  EVT VT = N->getValueType(0);
+
+  unsigned RegSize = 128;
+  if (Subtarget.hasBWI())
+    RegSize = 512;
+  else if (Subtarget.hasAVX2())
+    RegSize = 256;
+  unsigned VectorSize = VT.getVectorNumElements() * 16;
+  // If the vector size is less than 128, or greater than the supported RegSize,
+  // do not use PMADD.
+  if (VectorSize < 128 || VectorSize > RegSize)
+    return SDValue();
+
+  SDLoc DL(N);
+  EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+                                   VT.getVectorNumElements());
+  EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
+                                VT.getVectorNumElements() / 2);
+
+  // Shrink the operands of mul.
+  SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
+  SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
+
+  // Madd vector size is half of the original vector size
+  SDValue Madd = DAG.getNode(X86ISD::VPMADDWD, DL, MAddVT, N0, N1);
+  // Fill the rest of the output with 0
+  SDValue Zero = getZeroVector(Madd.getSimpleValueType(), Subtarget, DAG, DL);
+  SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
+  return DAG.getNode(ISD::ADD, DL, VT, Concat, Phi);
 }
 
 static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
@@ -33923,6 +34742,8 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
   if (Flags->hasVectorReduction()) {
     if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
       return Sad;
+    if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
+      return MAdd;
   }
   EVT VT = N->getValueType(0);
   SDValue Op0 = N->getOperand(0);
@@ -33934,7 +34755,7 @@ static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
       isHorizontalBinOp(Op0, Op1, true))
     return DAG.getNode(X86ISD::HADD, SDLoc(N), VT, Op0, Op1);
 
-  return OptimizeConditionalInDecrement(N, DAG);
+  return combineAddOrSubToADCOrSBB(N, DAG);
 }
 
 static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
@@ -33967,36 +34788,44 @@ static SDValue combineSub(SDNode *N, SelectionDAG &DAG,
       isHorizontalBinOp(Op0, Op1, false))
     return DAG.getNode(X86ISD::HSUB, SDLoc(N), VT, Op0, Op1);
 
-  return OptimizeConditionalInDecrement(N, DAG);
+  return combineAddOrSubToADCOrSBB(N, DAG);
 }
 
 static SDValue combineVSZext(SDNode *N, SelectionDAG &DAG,
                              TargetLowering::DAGCombinerInfo &DCI,
                              const X86Subtarget &Subtarget) {
+  if (DCI.isBeforeLegalize())
+    return SDValue();
+
   SDLoc DL(N);
   unsigned Opcode = N->getOpcode();
   MVT VT = N->getSimpleValueType(0);
   MVT SVT = VT.getVectorElementType();
+  unsigned NumElts = VT.getVectorNumElements();
+  unsigned EltSizeInBits = SVT.getSizeInBits();
+
   SDValue Op = N->getOperand(0);
   MVT OpVT = Op.getSimpleValueType();
   MVT OpEltVT = OpVT.getVectorElementType();
-  unsigned InputBits = OpEltVT.getSizeInBits() * VT.getVectorNumElements();
+  unsigned OpEltSizeInBits = OpEltVT.getSizeInBits();
+  unsigned InputBits = OpEltSizeInBits * NumElts;
 
   // Perform any constant folding.
   // FIXME: Reduce constant pool usage and don't fold when OptSize is enabled.
-  if (ISD::isBuildVectorOfConstantSDNodes(Op.getNode())) {
-    unsigned NumDstElts = VT.getVectorNumElements();
-    SmallBitVector Undefs(NumDstElts, false);
-    SmallVector<APInt, 4> Vals(NumDstElts, APInt(SVT.getSizeInBits(), 0));
-    for (unsigned i = 0; i != NumDstElts; ++i) {
-      SDValue OpElt = Op.getOperand(i);
-      if (OpElt.getOpcode() == ISD::UNDEF) {
-        Undefs[i] = true;
+  APInt UndefElts;
+  SmallVector<APInt, 64> EltBits;
+  if (getTargetConstantBitsFromNode(Op, OpEltSizeInBits, UndefElts, EltBits)) {
+    APInt Undefs(NumElts, 0);
+    SmallVector<APInt, 4> Vals(NumElts, APInt(EltSizeInBits, 0));
+    bool IsZEXT =
+        (Opcode == X86ISD::VZEXT) || (Opcode == ISD::ZERO_EXTEND_VECTOR_INREG);
+    for (unsigned i = 0; i != NumElts; ++i) {
+      if (UndefElts[i]) {
+        Undefs.setBit(i);
         continue;
       }
-      APInt Cst = cast<ConstantSDNode>(OpElt.getNode())->getAPIntValue();
-      Vals[i] = Opcode == X86ISD::VZEXT ? Cst.zextOrTrunc(SVT.getSizeInBits())
-                                        : Cst.sextOrTrunc(SVT.getSizeInBits());
+      Vals[i] = IsZEXT ? EltBits[i].zextOrTrunc(EltSizeInBits)
+                       : EltBits[i].sextOrTrunc(EltSizeInBits);
     }
     return getConstVector(Vals, Undefs, VT, DAG, DL);
   }
@@ -34096,7 +34925,7 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
 
   if (N->getOperand(0) == N->getOperand(1)) {
     if (N->getOpcode() == X86ISD::PCMPEQ)
-      return getOnesVector(VT, Subtarget, DAG, DL);
+      return getOnesVector(VT, DAG, DL);
     if (N->getOpcode() == X86ISD::PCMPGT)
       return getZeroVector(VT, Subtarget, DAG, DL);
   }
@@ -34104,6 +34933,98 @@ static SDValue combineVectorCompare(SDNode *N, SelectionDAG &DAG,
   return SDValue();
 }
 
+static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
+                                      TargetLowering::DAGCombinerInfo &DCI,
+                                      const X86Subtarget &Subtarget) {
+  if (DCI.isBeforeLegalizeOps())
+    return SDValue();
+  // Unpack the node: destination vector, inserted subvector, insert index.
+  SDLoc dl(N);
+  SDValue Vec = N->getOperand(0);
+  SDValue SubVec = N->getOperand(1);
+  SDValue Idx = N->getOperand(2);
+  // cast<> (not dyn_cast<>): the index is required to be a ConstantSDNode.
+  unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+  MVT OpVT = N->getSimpleValueType(0);
+  MVT SubVecVT = SubVec.getSimpleValueType();
+
+  // If this is an insert of an extract, combine to a shuffle. Don't do this
+  // if the insert or extract can be represented with a subvector operation.
+  if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      SubVec.getOperand(0).getSimpleValueType() == OpVT &&
+      (IdxVal != 0 || !Vec.isUndef())) {
+    int ExtIdxVal = cast<ConstantSDNode>(SubVec.getOperand(1))->getZExtValue();
+    if (ExtIdxVal != 0) {
+      int VecNumElts = OpVT.getVectorNumElements();
+      int SubVecNumElts = SubVecVT.getVectorNumElements();
+      SmallVector<int, 64> Mask(VecNumElts);
+      // First create an identity shuffle mask.
+      for (int i = 0; i != VecNumElts; ++i)
+        Mask[i] = i;
+      // Now insert the extracted portion.
+      for (int i = 0; i != SubVecNumElts; ++i)
+        Mask[i + IdxVal] = i + ExtIdxVal + VecNumElts;
+
+      return DAG.getVectorShuffle(OpVT, dl, Vec, SubVec.getOperand(0), Mask);
+    }
+  }
+
+  // Fold two 16-byte or 32-byte subvector loads into one 32-byte or 64-byte
+  // load:
+  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+  //                   (load16 addr + 16), Elts/2)
+  // --> load32 addr
+  // or:
+  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+  //                   (load32 addr + 32), Elts/2)
+  // --> load64 addr
+  // or a 16-byte or 32-byte broadcast:
+  // (insert_subvector (insert_subvector undef, (load16 addr), 0),
+  //                   (load16 addr), Elts/2)
+  // --> X86SubVBroadcast(load16 addr)
+  // or:
+  // (insert_subvector (insert_subvector undef, (load32 addr), 0),
+  //                   (load32 addr), Elts/2)
+  // --> X86SubVBroadcast(load32 addr)
+  if ((IdxVal == OpVT.getVectorNumElements() / 2) &&
+      Vec.getOpcode() == ISD::INSERT_SUBVECTOR &&
+      OpVT.getSizeInBits() == SubVecVT.getSizeInBits() * 2) {
+    auto *Idx2 = dyn_cast<ConstantSDNode>(Vec.getOperand(2));
+    if (Idx2 && Idx2->getZExtValue() == 0) {
+      SDValue SubVec2 = Vec.getOperand(1);
+      // If needed, look through bitcasts to get to the load.
+      if (auto *FirstLd = dyn_cast<LoadSDNode>(peekThroughBitcasts(SubVec2))) {
+        bool Fast;
+        unsigned Alignment = FirstLd->getAlignment();
+        unsigned AS = FirstLd->getAddressSpace();
+        const X86TargetLowering *TLI = Subtarget.getTargetLowering();
+        if (TLI->allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(),
+                                    OpVT, AS, Alignment, &Fast) && Fast) {
+          SDValue Ops[] = {SubVec2, SubVec};
+          if (SDValue Ld = EltsFromConsecutiveLoads(OpVT, Ops, dl, DAG, false))
+            return Ld;
+        }
+      }
+      // If lower/upper loads are the same and the only users of the load, then
+      // lower to a VBROADCASTF128/VBROADCASTI128/etc.
+      if (auto *Ld = dyn_cast<LoadSDNode>(peekThroughOneUseBitcasts(SubVec2))) {
+        if (SubVec2 == SubVec && ISD::isNormalLoad(Ld) &&
+            SDNode::areOnlyUsersOf({N, Vec.getNode()}, SubVec2.getNode())) {
+          return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT, SubVec);
+        }
+      }
+      // If this is subv_broadcast insert into both halves, use a larger
+      // subv_broadcast.
+      if (SubVec.getOpcode() == X86ISD::SUBV_BROADCAST && SubVec == SubVec2) {
+        return DAG.getNode(X86ISD::SUBV_BROADCAST, dl, OpVT,
+                           SubVec.getOperand(0));
+      }
+    }
+  }
+  // None of the folds above applied.
+  return SDValue();
+}
+
 
 SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
                                              DAGCombinerInfo &DCI) const {
@@ -34112,6 +35033,11 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   default: break;
   case ISD::EXTRACT_VECTOR_ELT:
     return combineExtractVectorElt(N, DAG, DCI, Subtarget);
+  case X86ISD::PEXTRW:
+  case X86ISD::PEXTRB:
+    return combineExtractVectorElt_SSE(N, DAG, DCI, Subtarget);
+  case ISD::INSERT_SUBVECTOR:
+    return combineInsertSubvector(N, DAG, DCI, Subtarget);
   case ISD::VSELECT:
   case ISD::SELECT:
   case X86ISD::SHRUNKBLEND: return combineSelect(N, DAG, DCI, Subtarget);
@@ -34152,13 +35078,18 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
   case ISD::SIGN_EXTEND:    return combineSext(N, DAG, DCI, Subtarget);
   case ISD::SIGN_EXTEND_INREG: return combineSignExtendInReg(N, DAG, Subtarget);
   case ISD::SETCC:          return combineSetCC(N, DAG, Subtarget);
-  case X86ISD::SETCC:       return combineX86SetCC(N, DAG, DCI, Subtarget);
-  case X86ISD::BRCOND:      return combineBrCond(N, DAG, DCI, Subtarget);
+  case X86ISD::SETCC:       return combineX86SetCC(N, DAG, Subtarget);
+  case X86ISD::BRCOND:      return combineBrCond(N, DAG, Subtarget);
   case X86ISD::VSHLI:
   case X86ISD::VSRAI:
-  case X86ISD::VSRLI:       return combineVectorShift(N, DAG, DCI, Subtarget);
+  case X86ISD::VSRLI:
+    return combineVectorShiftImm(N, DAG, DCI, Subtarget);
+  case ISD::SIGN_EXTEND_VECTOR_INREG:
+  case ISD::ZERO_EXTEND_VECTOR_INREG:
   case X86ISD::VSEXT:
   case X86ISD::VZEXT:       return combineVSZext(N, DAG, DCI, Subtarget);
+  case X86ISD::PINSRB:
+  case X86ISD::PINSRW:      return combineVectorInsert(N, DAG, DCI, Subtarget);
   case X86ISD::SHUFP:       // Handle all target specific shuffles
   case X86ISD::INSERTPS:
   case X86ISD::PALIGNR:
@@ -35081,7 +36012,7 @@ int X86TargetLowering::getScalingFactorCost(const DataLayout &DL,
   return -1;
 }
 
-bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
+bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
   // Integer division on x86 is expensive. However, when aggressively optimizing
   // for code size, we prefer to use a div instruction, as it is usually smaller
   // than the alternative sequence.
@@ -35089,8 +36020,8 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeSet Attr) const {
   // integer division, leaving the division as-is is a loss even in terms of
   // size, because it will have to be scalarized, while the alternative code
   // sequence can be performed in vector form.
-  bool OptSize = Attr.hasAttribute(AttributeSet::FunctionIndex,
-                                   Attribute::MinSize);
+  bool OptSize =
+      Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
   return OptSize && !VT.isVector();
 }
 
diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h
index 9969c9093473b88cdecbc4d713e4a2b6f4b75860..ab4910daca02b9dc5d7fe6917513e7e4b4792a3b 100644
--- a/lib/Target/X86/X86ISelLowering.h
+++ b/lib/Target/X86/X86ISelLowering.h
@@ -149,8 +149,7 @@ namespace llvm {
       WrapperRIP,
 
       /// Copies a 64-bit value from the low word of an XMM vector
-      /// to an MMX vector.  If you think this is too close to the previous
-      /// mnemonic, so do I; blame Intel.
+      /// to an MMX vector.
       MOVDQ2Q,
 
       /// Copies a 32-bit value from the low word of a MMX
@@ -179,7 +178,7 @@ namespace llvm {
 
       /// Insert the lower 16-bits of a 32-bit value to a vector,
       /// corresponds to X86::PINSRW.
-      PINSRW, MMX_PINSRW,
+      PINSRW,
 
       /// Shuffle 16 8-bit values within a vector.
       PSHUFB,
@@ -195,21 +194,21 @@ namespace llvm {
       /// Blend where the selector is an immediate.
       BLENDI,
 
-      /// Blend where the condition has been shrunk.
-      /// This is used to emphasize that the condition mask is
-      /// no more valid for generic VSELECT optimizations.
+      /// Dynamic (non-constant condition) vector blend where only the sign bits
+      /// of the condition elements are used. This is used to enforce that the
+      /// condition mask is not valid for generic VSELECT optimizations.
       SHRUNKBLEND,
 
       /// Combined add and sub on an FP vector.
       ADDSUB,
 
       //  FP vector ops with rounding mode.
-      FADD_RND,
-      FSUB_RND,
-      FMUL_RND,
-      FDIV_RND,
-      FMAX_RND,
-      FMIN_RND,
+      FADD_RND, FADDS_RND,
+      FSUB_RND, FSUBS_RND,
+      FMUL_RND, FMULS_RND,
+      FDIV_RND, FDIVS_RND,
+      FMAX_RND, FMAXS_RND,
+      FMIN_RND, FMINS_RND,
       FSQRT_RND, FSQRTS_RND,
 
       // FP vector get exponent.
@@ -239,9 +238,6 @@ namespace llvm {
       FHADD,
       FHSUB,
 
-      // Integer absolute value
-      ABS,
-
       // Detect Conflicts Within a Vector
       CONFLICT,
 
@@ -251,6 +247,9 @@ namespace llvm {
       /// Commutative FMIN and FMAX.
       FMAXC, FMINC,
 
+      /// Scalar intrinsic floating point max and min.
+      FMAXS, FMINS,
+
       /// Floating point reciprocal-sqrt and reciprocal approximation.
       /// Note that these typically require refinement
       /// in order to obtain suitable precision.
@@ -446,8 +445,7 @@ namespace llvm {
       // Broadcast subvector to vector.
       SUBV_BROADCAST,
 
-      // Insert/Extract vector element.
-      VINSERT,
+      // Extract vector element.
       VEXTRACT,
 
       /// SSE4A Extraction and Insertion.
@@ -689,6 +687,9 @@ namespace llvm {
     unsigned getJumpTableEncoding() const override;
     bool useSoftFloat() const override;
 
+    void markLibCallAttributes(MachineFunction *MF, unsigned CC,
+                               ArgListTy &Args) const override;
+
     MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
       return MVT::i8;
     }
@@ -809,8 +810,17 @@ namespace llvm {
       return false;
     }
 
+    bool isMaskAndCmp0FoldingBeneficial(const Instruction &AndI) const override;
+
     bool hasAndNotCompare(SDValue Y) const override;
 
+    bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
+      return VT.isScalarInteger();
+    }
+
+    /// Vector-sized comparisons are fast using PCMPEQ + PMOVMSK or PTEST.
+    MVT hasFastEqualityCompare(unsigned NumBits) const override;
+
     /// Return the value type to use for ISD::SETCC.
     EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
                            EVT VT) const override;
@@ -820,11 +830,13 @@ namespace llvm {
     void computeKnownBitsForTargetNode(const SDValue Op,
                                        APInt &KnownZero,
                                        APInt &KnownOne,
+                                       const APInt &DemandedElts,
                                        const SelectionDAG &DAG,
                                        unsigned Depth = 0) const override;
 
     /// Determine the number of bits in the operation that are sign bits.
     unsigned ComputeNumSignBitsForTargetNode(SDValue Op,
+                                             const APInt &DemandedElts,
                                              const SelectionDAG &DAG,
                                              unsigned Depth) const override;
 
@@ -987,6 +999,10 @@ namespace llvm {
     bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                            Type *Ty) const override;
 
+    bool convertSelectOfConstantsToMath() const override {
+      return true;
+    }
+
     /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
     /// with this index.
     bool isExtractSubvectorCheap(EVT ResVT, unsigned Index) const override;
@@ -1038,7 +1054,7 @@ namespace llvm {
     /// \brief Customize the preferred legalization strategy for certain types.
     LegalizeTypeAction getPreferredVectorAction(EVT VT) const override;
 
-    bool isIntDivCheap(EVT VT, AttributeSet Attr) const override;
+    bool isIntDivCheap(EVT VT, AttributeList Attr) const override;
 
     bool supportSwiftError() const override;
 
@@ -1079,7 +1095,8 @@ namespace llvm {
                             CallingConv::ID CallConv, bool isVarArg,
                             const SmallVectorImpl<ISD::InputArg> &Ins,
                             const SDLoc &dl, SelectionDAG &DAG,
-                            SmallVectorImpl<SDValue> &InVals) const;
+                            SmallVectorImpl<SDValue> &InVals,
+                            uint32_t *RegMask) const;
     SDValue LowerMemArgument(SDValue Chain, CallingConv::ID CallConv,
                              const SmallVectorImpl<ISD::InputArg> &ArgInfo,
                              const SDLoc &dl, SelectionDAG &DAG,
@@ -1141,8 +1158,7 @@ namespace llvm {
     SDValue LowerUINT_TO_FP_i32(SDValue Op, SelectionDAG &DAG) const;
     SDValue lowerUINT_TO_FP_vec(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
-    SDValue LowerFP_TO_INT(SDValue Op, const X86Subtarget &Subtarget,
-                           SelectionDAG &DAG) const;
+    SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
     SDValue LowerToBT(SDValue And, ISD::CondCode CC, const SDLoc &dl,
                       SelectionDAG &DAG) const;
     SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
diff --git a/lib/Target/X86/X86Instr3DNow.td b/lib/Target/X86/X86Instr3DNow.td
index ba1aede3c1a0c60b0773ebad5e5c8cd673d52074..08b501ff20bf0abccc3160e1301068f733876a3b 100644
--- a/lib/Target/X86/X86Instr3DNow.td
+++ b/lib/Target/X86/X86Instr3DNow.td
@@ -38,7 +38,9 @@ multiclass I3DNow_binop_rm<bits<8> opc, string Mn> {
   def rm : I3DNow_binop<opc, MRMSrcMem, (ins VR64:$src1, i64mem:$src2), Mn, []>;
 }
 
-multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, string Ver = ""> {
+multiclass I3DNow_binop_rm_int<bits<8> opc, string Mn, bit Commutable = 0,
+                               string Ver = ""> {
+  let isCommutable = Commutable in
   def rr : I3DNow_binop<opc, MRMSrcReg, (ins VR64:$src1, VR64:$src2), Mn,
     [(set VR64:$dst, (!cast<Intrinsic>(
       !strconcat("int_x86_3dnow", Ver, "_", Mn)) VR64:$src1, VR64:$src2))]>;
@@ -63,25 +65,25 @@ multiclass I3DNow_conv_rm_int<bits<8> opc, string Mn, string Ver = ""> {
         (bitconvert (load_mmx addr:$src))))]>;
 }
 
-defm PAVGUSB  : I3DNow_binop_rm_int<0xBF, "pavgusb">;
+defm PAVGUSB  : I3DNow_binop_rm_int<0xBF, "pavgusb", 1>;
 defm PF2ID    : I3DNow_conv_rm_int<0x1D, "pf2id">;
 defm PFACC    : I3DNow_binop_rm_int<0xAE, "pfacc">;
-defm PFADD    : I3DNow_binop_rm_int<0x9E, "pfadd">;
-defm PFCMPEQ  : I3DNow_binop_rm_int<0xB0, "pfcmpeq">;
+defm PFADD    : I3DNow_binop_rm_int<0x9E, "pfadd", 1>;
+defm PFCMPEQ  : I3DNow_binop_rm_int<0xB0, "pfcmpeq", 1>;
 defm PFCMPGE  : I3DNow_binop_rm_int<0x90, "pfcmpge">;
 defm PFCMPGT  : I3DNow_binop_rm_int<0xA0, "pfcmpgt">;
 defm PFMAX    : I3DNow_binop_rm_int<0xA4, "pfmax">;
 defm PFMIN    : I3DNow_binop_rm_int<0x94, "pfmin">;
-defm PFMUL    : I3DNow_binop_rm_int<0xB4, "pfmul">;
+defm PFMUL    : I3DNow_binop_rm_int<0xB4, "pfmul", 1>;
 defm PFRCP    : I3DNow_conv_rm_int<0x96, "pfrcp">;
 defm PFRCPIT1 : I3DNow_binop_rm_int<0xA6, "pfrcpit1">;
 defm PFRCPIT2 : I3DNow_binop_rm_int<0xB6, "pfrcpit2">;
 defm PFRSQIT1 : I3DNow_binop_rm_int<0xA7, "pfrsqit1">;
 defm PFRSQRT  : I3DNow_conv_rm_int<0x97, "pfrsqrt">;
-defm PFSUB    : I3DNow_binop_rm_int<0x9A, "pfsub">;
-defm PFSUBR   : I3DNow_binop_rm_int<0xAA, "pfsubr">;
+defm PFSUB    : I3DNow_binop_rm_int<0x9A, "pfsub", 1>;
+defm PFSUBR   : I3DNow_binop_rm_int<0xAA, "pfsubr", 1>;
 defm PI2FD    : I3DNow_conv_rm_int<0x0D, "pi2fd">;
-defm PMULHRW  : I3DNow_binop_rm_int<0xB7, "pmulhrw">;
+defm PMULHRW  : I3DNow_binop_rm_int<0xB7, "pmulhrw", 1>;
 
 
 def FEMMS : I3DNow<0x0E, RawFrm, (outs), (ins), "femms",
@@ -98,6 +100,6 @@ def PREFETCHW : I<0x0D, MRM1m, (outs), (ins i8mem:$addr), "prefetchw\t$addr",
 // "3DNowA" instructions
 defm PF2IW    : I3DNow_conv_rm_int<0x1C, "pf2iw", "a">;
 defm PI2FW    : I3DNow_conv_rm_int<0x0C, "pi2fw", "a">;
-defm PFNACC   : I3DNow_binop_rm_int<0x8A, "pfnacc", "a">;
-defm PFPNACC  : I3DNow_binop_rm_int<0x8E, "pfpnacc", "a">;
+defm PFNACC   : I3DNow_binop_rm_int<0x8A, "pfnacc", 0, "a">;
+defm PFPNACC  : I3DNow_binop_rm_int<0x8E, "pfpnacc", 0, "a">;
 defm PSWAPD   : I3DNow_conv_rm_int<0xBB, "pswapd", "a">;
diff --git a/lib/Target/X86/X86InstrAVX512.td b/lib/Target/X86/X86InstrAVX512.td
index d58a93eed851fd40b9fb8a4e1fee9e60aaf54529..78c44050c6102eb56338668e22d2f12807596135 100644
--- a/lib/Target/X86/X86InstrAVX512.td
+++ b/lib/Target/X86/X86InstrAVX512.td
@@ -34,13 +34,6 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
   ValueType KVT = !cast<ValueType>(!if (!eq (NumElts, 1), "i1",
                                                           "v" # NumElts # "i1"));
 
-  // The GPR register class that can hold the write mask.  Use GR8 for fewer
-  // than 8 elements.  Use shift-right and equal to work around the lack of
-  // !lt in tablegen.
-  RegisterClass MRC =
-    !cast<RegisterClass>("GR" #
-                         !if (!eq (!srl(NumElts, 3), 0), 8, NumElts));
-
   // Suffix used in the instruction mnemonic.
   string Suffix = suffix;
 
@@ -69,6 +62,9 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
   // The corresponding memory operand, e.g. i512mem for VR512.
   X86MemOperand MemOp = !cast<X86MemOperand>(TypeVariantName # Size # "mem");
   X86MemOperand ScalarMemOp = !cast<X86MemOperand>(EltVT # "mem");
+  // FP scalar memory operand for intrinsics - ssmem/sdmem.
+  Operand IntScalarMemOp = !if (!eq (EltTypeName, "f32"), !cast<Operand>("ssmem"),
+                           !if (!eq (EltTypeName, "f64"), !cast<Operand>("sdmem"), ?));
 
   // Load patterns
   // Note: For 128/256-bit integer VT we choose loadv2i64/loadv4i64
@@ -89,6 +85,12 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
 
   PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
 
+  ComplexPattern ScalarIntMemCPat = !if (!eq (EltTypeName, "f32"),
+                                          !cast<ComplexPattern>("sse_load_f32"),
+                                    !if (!eq (EltTypeName, "f64"),
+                                          !cast<ComplexPattern>("sse_load_f64"),
+                                    ?));
+
   // The corresponding float type, e.g. v16f32 for v16i32
   // Note: For EltSize < 32, FloatVT is illegal and TableGen
   //       fails to compile, so we choose FloatVT = VT
@@ -207,7 +209,7 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
                        Pattern, itin>;
 
   // Prefer over VMOV*rrk Pat<>
-  let AddedComplexity = 20, isCommutable = IsKCommutable in
+  let isCommutable = IsKCommutable in
     def NAME#k: AVX512<O, F, Outs, MaskingIns,
                        OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}}|"#
                                      "$dst {${mask}}, "#IntelSrcAsm#"}",
@@ -219,7 +221,7 @@ multiclass AVX512_maskable_custom<bits<8> O, Format F,
 
   // Zero mask does not add any restrictions to commute operands transformation.
   // So, it is Ok to use IsCommutable instead of IsKCommutable.
-  let AddedComplexity = 30, isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<>
+  let isCommutable = IsCommutable in // Prefer over VMOV*rrkz Pat<>
     def NAME#kz: AVX512<O, F, Outs, ZeroMaskingIns,
                        OpcodeStr#"\t{"#AttSrcAsm#", $dst {${mask}} {z}|"#
                                      "$dst {${mask}} {z}, "#IntelSrcAsm#"}",
@@ -250,6 +252,23 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
                          MaskingConstraint, NoItinerary, IsCommutable,
                          IsKCommutable>;
 
+// Scalar variant of AVX512_maskable_common: no patterns; Select/itin unused.
+multiclass AVX512_maskable_fp_common<bits<8> O, Format F, X86VectorVTInfo _,
+                                  dag Outs,
+                                  dag Ins, dag MaskingIns, dag ZeroMaskingIns,
+                                  string OpcodeStr,
+                                  string AttSrcAsm, string IntelSrcAsm,
+                                  SDNode Select = vselect,
+                                  string MaskingConstraint = "",
+                                  InstrItinClass itin = NoItinerary,
+                                  bit IsCommutable = 0,
+                                  bit IsKCommutable = 0> :
+  AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
+                         AttSrcAsm, IntelSrcAsm,
+                         [], [], [],
+                         MaskingConstraint, NoItinerary, IsCommutable,
+                         IsKCommutable>;
+
 // This multiclass generates the unconditional/non-masking, the masking and
 // the zero-masking variant of the vector instruction.  In the masking case, the
 // perserved vector elements come from a new dummy input operand tied to $dst.
@@ -484,7 +503,7 @@ multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To
                                                        PatFrag vinsert_insert> {
   let ExeDomain = To.ExeDomain in {
     defm rr : AVX512_maskable<Opcode, MRMSrcReg, To, (outs To.RC:$dst),
-                   (ins To.RC:$src1, From.RC:$src2, i32u8imm:$src3),
+                   (ins To.RC:$src1, From.RC:$src2, u8imm:$src3),
                    "vinsert" # From.EltTypeName # "x" # From.NumElts,
                    "$src3, $src2, $src1", "$src1, $src2, $src3",
                    (vinsert_insert:$src3 (To.VT To.RC:$src1),
@@ -492,7 +511,7 @@ multiclass vinsert_for_size<int Opcode, X86VectorVTInfo From, X86VectorVTInfo To
                                          (iPTR imm))>, AVX512AIi8Base, EVEX_4V;
 
     defm rm : AVX512_maskable<Opcode, MRMSrcMem, To, (outs To.RC:$dst),
-                   (ins To.RC:$src1, From.MemOp:$src2, i32u8imm:$src3),
+                   (ins To.RC:$src1, From.MemOp:$src2, u8imm:$src3),
                    "vinsert" # From.EltTypeName # "x" # From.NumElts,
                    "$src3, $src2, $src1", "$src1, $src2, $src3",
                    (vinsert_insert:$src3 (To.VT To.RC:$src1),
@@ -625,14 +644,14 @@ multiclass vextract_for_size<int Opcode,
     // vextract_extract), we interesting only in patterns without mask,
     // intrinsics pattern match generated bellow.
     defm rr : AVX512_maskable_in_asm<Opcode, MRMDestReg, To, (outs To.RC:$dst),
-                (ins From.RC:$src1, i32u8imm:$idx),
+                (ins From.RC:$src1, u8imm:$idx),
                 "vextract" # To.EltTypeName # "x" # To.NumElts,
                 "$idx, $src1", "$src1, $idx",
                 [(set To.RC:$dst, (vextract_extract:$idx (From.VT From.RC:$src1),
                                                          (iPTR imm)))]>,
               AVX512AIi8Base, EVEX;
     def mr  : AVX512AIi8<Opcode, MRMDestMem, (outs),
-                    (ins To.MemOp:$dst, From.RC:$src1, i32u8imm:$idx),
+                    (ins To.MemOp:$dst, From.RC:$src1, u8imm:$idx),
                     "vextract" # To.EltTypeName # "x" # To.NumElts #
                         "\t{$idx, $src1, $dst|$dst, $src1, $idx}",
                     [(store (To.VT (vextract_extract:$idx
@@ -642,7 +661,7 @@ multiclass vextract_for_size<int Opcode,
     let mayStore = 1, hasSideEffects = 0 in
     def mrk : AVX512AIi8<Opcode, MRMDestMem, (outs),
                     (ins To.MemOp:$dst, To.KRCWM:$mask,
-                                        From.RC:$src1, i32u8imm:$idx),
+                                        From.RC:$src1, u8imm:$idx),
                      "vextract" # To.EltTypeName # "x" # To.NumElts #
                           "\t{$idx, $src1, $dst {${mask}}|"
                           "$dst {${mask}}, $src1, $idx}",
@@ -880,7 +899,6 @@ multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
                           (SrcInfo.VT (scalar_to_vector
                                        (SrcInfo.ScalarLdFrag addr:$src))))),
             (!cast<Instruction>(NAME#DestInfo.ZSuffix#m) addr:$src)>;
-  let AddedComplexity = 20 in
   def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
                           (X86VBroadcast
                            (SrcInfo.VT (scalar_to_vector
@@ -888,7 +906,6 @@ multiclass avx512_broadcast_rm<bits<8> opc, string OpcodeStr,
                           DestInfo.RC:$src0)),
             (!cast<Instruction>(NAME#DestInfo.ZSuffix#mk)
              DestInfo.RC:$src0, DestInfo.KRCWM:$mask, addr:$src)>;
-  let AddedComplexity = 30 in
   def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
                           (X86VBroadcast
                            (SrcInfo.VT (scalar_to_vector
@@ -939,39 +956,42 @@ def : Pat<(int_x86_avx512_vbroadcast_sd_512 addr:$src),
           (VBROADCASTSDZm addr:$src)>;
 
 multiclass avx512_int_broadcast_reg<bits<8> opc, X86VectorVTInfo _,
+                                    SDPatternOperator OpNode,
                                     RegisterClass SrcRC> {
+  let ExeDomain = _.ExeDomain in
   defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
                          (ins SrcRC:$src),
                          "vpbroadcast"##_.Suffix, "$src", "$src",
-                         (_.VT (X86VBroadcast SrcRC:$src))>, T8PD, EVEX;
+                         (_.VT (OpNode SrcRC:$src))>, T8PD, EVEX;
 }
 
 multiclass avx512_int_broadcast_reg_vl<bits<8> opc, AVX512VLVectorVTInfo _,
+                                       SDPatternOperator OpNode,
                                        RegisterClass SrcRC, Predicate prd> {
   let Predicates = [prd] in
-    defm Z : avx512_int_broadcast_reg<opc, _.info512, SrcRC>, EVEX_V512;
+    defm Z : avx512_int_broadcast_reg<opc, _.info512, OpNode, SrcRC>, EVEX_V512;
   let Predicates = [prd, HasVLX] in {
-    defm Z256 : avx512_int_broadcast_reg<opc, _.info256, SrcRC>, EVEX_V256;
-    defm Z128 : avx512_int_broadcast_reg<opc, _.info128, SrcRC>, EVEX_V128;
+    defm Z256 : avx512_int_broadcast_reg<opc, _.info256, OpNode, SrcRC>, EVEX_V256;
+    defm Z128 : avx512_int_broadcast_reg<opc, _.info128, OpNode, SrcRC>, EVEX_V128;
   }
 }
 
 let isCodeGenOnly = 1 in {
-defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info, GR8,
-                                                 HasBWI>;
-defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info, GR16,
-                                                 HasBWI>;
+defm VPBROADCASTBr : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info,
+                                                 X86VBroadcast, GR8, HasBWI>;
+defm VPBROADCASTWr : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info,
+                                                 X86VBroadcast, GR16, HasBWI>;
 }
 let isAsmParserOnly = 1 in {
   defm VPBROADCASTBr_Alt : avx512_int_broadcast_reg_vl<0x7A, avx512vl_i8_info,
-                                                       GR32, HasBWI>;
+                                                       null_frag, GR32, HasBWI>;
   defm VPBROADCASTWr_Alt : avx512_int_broadcast_reg_vl<0x7B, avx512vl_i16_info,
-                                                       GR32, HasBWI>;
+                                                       null_frag, GR32, HasBWI>;
 }
-defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info, GR32,
-                                                 HasAVX512>;
-defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info, GR64,
-                                                 HasAVX512>, VEX_W;
+defm VPBROADCASTDr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i32_info,
+                                                 X86VBroadcast, GR32, HasAVX512>;
+defm VPBROADCASTQr : avx512_int_broadcast_reg_vl<0x7C, avx512vl_i64_info,
+                                                 X86VBroadcast, GR64, HasAVX512>, VEX_W;
 
 def : Pat <(v16i32 (X86vzext VK16WM:$mask)),
            (VPBROADCASTDrZrkz VK16WM:$mask, (i32 (MOV32ri 0x1)))>;
@@ -1023,7 +1043,24 @@ multiclass avx512_subvec_broadcast_rm<bits<8> opc, string OpcodeStr,
                             AVX5128IBase, EVEX;
 }
 
+let Predicates = [HasAVX512] in {
+  // 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
+  def : Pat<(v8i64 (X86VBroadcast (v8i64 (X86vzload addr:$src)))),
+            (VPBROADCASTQZm addr:$src)>;
+}
+
+// Qword broadcasts do not use byte/word elements, so gate them on VLX alone
+// instead of the VLX+BWI block below. NOTE(review): confirm that
+// VPBROADCASTQZ128m/VPBROADCASTQZ256m themselves do not require BWI.
+let Predicates = [HasVLX] in {
+  // 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
+  def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+            (VPBROADCASTQZ128m addr:$src)>;
+  def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))),
+            (VPBROADCASTQZ256m addr:$src)>;
+}
+
 let Predicates = [HasVLX, HasBWI] in {
   // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
   // This means we'll encounter truncated i32 loads; match that here.
   def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
@@ -1517,11 +1548,10 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>
                               imm:$cc)>, EVEX_4V;
   defm  rm_Int  : AVX512_maskable_cmp<0xC2, MRMSrcMem, _,
                     (outs _.KRC:$dst),
-                    (ins _.RC:$src1, _.ScalarMemOp:$src2, AVXCC:$cc),
+                    (ins _.RC:$src1, _.IntScalarMemOp:$src2, AVXCC:$cc),
                     "vcmp${cc}"#_.Suffix,
                     "$src2, $src1", "$src1, $src2",
-                    (OpNode (_.VT _.RC:$src1),
-                        (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
+                    (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
                         imm:$cc)>, EVEX_4V, EVEX_CD8<_.EltSize, CD8VT1>;
 
   defm  rrb_Int  : AVX512_maskable_cmp<0xC2, MRMSrcReg, _,
@@ -1578,8 +1608,10 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeRnd>
 }
 
 let Predicates = [HasAVX512] in {
+  let ExeDomain = SSEPackedSingle in
   defm VCMPSSZ : avx512_cmp_scalar<f32x_info, X86cmpms, X86cmpmsRnd>,
                                    AVX512XSIi8Base;
+  let ExeDomain = SSEPackedDouble in
   defm VCMPSDZ : avx512_cmp_scalar<f64x_info, X86cmpms, X86cmpmsRnd>,
                                    AVX512XDIi8Base, VEX_W;
 }
@@ -1998,22 +2030,20 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr, SDNode OpNode,
                       [(set _.KRC:$dst,(or _.KRCWM:$mask,
                                       (OpNode (_.VT _.RC:$src1),
                                       (i32 imm:$src2))))], NoItinerary>, EVEX_K;
-    let AddedComplexity = 20 in {
-      def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
-                      (ins _.MemOp:$src1, i32u8imm:$src2),
-                      OpcodeStr##_.Suffix##
-                                "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                      [(set _.KRC:$dst,
-                            (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
-                                    (i32 imm:$src2)))], NoItinerary>;
-      def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
-                      (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
-                      OpcodeStr##_.Suffix##
-                      "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
-                      [(set _.KRC:$dst,(or _.KRCWM:$mask,
+    def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+                    (ins _.MemOp:$src1, i32u8imm:$src2),
+                    OpcodeStr##_.Suffix##
+                              "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                    [(set _.KRC:$dst,
                           (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
-                              (i32 imm:$src2))))], NoItinerary>, EVEX_K;
-    }
+                                  (i32 imm:$src2)))], NoItinerary>;
+    def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
+                    (ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
+                    OpcodeStr##_.Suffix##
+                    "\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
+                    [(set _.KRC:$dst,(or _.KRCWM:$mask,
+                        (OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+                            (i32 imm:$src2))))], NoItinerary>, EVEX_K;
   }
 }
 
@@ -2153,28 +2183,26 @@ let Predicates = [HasBWI] in {
 
 // GR from/to mask register
 def : Pat<(v16i1 (bitconvert (i16 GR16:$src))),
-          (COPY_TO_REGCLASS GR16:$src, VK16)>;
+          (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR16:$src, sub_16bit)), VK16)>;
 def : Pat<(i16 (bitconvert (v16i1 VK16:$src))),
-          (COPY_TO_REGCLASS VK16:$src, GR16)>;
+          (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK16:$src, GR32)), sub_16bit)>;
 
 def : Pat<(v8i1 (bitconvert (i8 GR8:$src))),
-          (COPY_TO_REGCLASS GR8:$src, VK8)>;
+          (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$src, sub_8bit)), VK8)>;
 def : Pat<(i8 (bitconvert (v8i1 VK8:$src))),
-          (COPY_TO_REGCLASS VK8:$src, GR8)>;
+          (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)), sub_8bit)>;
 
 def : Pat<(i32 (zext (i16 (bitconvert (v16i1 VK16:$src))))),
           (KMOVWrk VK16:$src)>;
 def : Pat<(i32 (anyext (i16 (bitconvert (v16i1 VK16:$src))))),
-          (i32 (INSERT_SUBREG (IMPLICIT_DEF),
-                (i16 (COPY_TO_REGCLASS VK16:$src, GR16)), sub_16bit))>;
+          (COPY_TO_REGCLASS VK16:$src, GR32)>;
 
 def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
-          (MOVZX32rr8 (COPY_TO_REGCLASS VK8:$src, GR8))>, Requires<[NoDQI]>;
+          (MOVZX32rr8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)), sub_8bit))>, Requires<[NoDQI]>;
 def : Pat<(i32 (zext (i8 (bitconvert (v8i1 VK8:$src))))),
           (KMOVBrk VK8:$src)>, Requires<[HasDQI]>;
 def : Pat<(i32 (anyext (i8 (bitconvert (v8i1 VK8:$src))))),
-          (i32 (INSERT_SUBREG (IMPLICIT_DEF),
-                (i8 (COPY_TO_REGCLASS VK8:$src, GR8)), sub_8bit))>;
+          (COPY_TO_REGCLASS VK8:$src, GR32)>;
 
 def : Pat<(v32i1 (bitconvert (i32 GR32:$src))),
           (COPY_TO_REGCLASS GR32:$src, VK32)>;
@@ -2207,20 +2235,20 @@ let Predicates = [HasDQI] in {
 let Predicates = [HasAVX512, NoDQI] in {
   def : Pat<(store VK1:$src, addr:$dst),
             (MOV8mr addr:$dst,
-             (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)),
-              sub_8bit))>;
+             (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)),
+              sub_8bit)))>;
   def : Pat<(store VK2:$src, addr:$dst),
             (MOV8mr addr:$dst,
-             (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK2:$src, VK16)),
-              sub_8bit))>;
+             (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK2:$src, GR32)),
+              sub_8bit)))>;
   def : Pat<(store VK4:$src, addr:$dst),
             (MOV8mr addr:$dst,
-             (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK4:$src, VK16)),
-              sub_8bit))>;
+             (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK4:$src, GR32)),
+              sub_8bit)))>;
   def : Pat<(store VK8:$src, addr:$dst),
             (MOV8mr addr:$dst,
-             (EXTRACT_SUBREG (KMOVWrk (COPY_TO_REGCLASS VK8:$src, VK16)),
-              sub_8bit))>;
+             (i8 (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK8:$src, GR32)),
+              sub_8bit)))>;
 
   def : Pat<(v8i1 (load addr:$src)),
             (COPY_TO_REGCLASS (MOVZX32rm8 addr:$src), VK8)>;
@@ -2251,44 +2279,41 @@ let Predicates = [HasBWI] in {
 
 let Predicates = [HasAVX512] in {
   def : Pat<(i1 (trunc (i64 GR64:$src))),
-            (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit),
-                                        (i32 1))), VK1)>;
+            (COPY_TO_REGCLASS (AND32ri8 (EXTRACT_SUBREG $src, sub_32bit),
+                                        (i32 1)), VK1)>;
 
   def : Pat<(i1 (trunc (i32 GR32:$src))),
-            (COPY_TO_REGCLASS (KMOVWkr (AND32ri8 $src, (i32 1))), VK1)>;
+            (COPY_TO_REGCLASS (AND32ri8 $src, (i32 1)), VK1)>;
 
   def : Pat<(i1 (trunc (i32 (assertzext_i1 GR32:$src)))),
             (COPY_TO_REGCLASS GR32:$src, VK1)>;
 
   def : Pat<(i1 (trunc (i8 GR8:$src))),
        (COPY_TO_REGCLASS
-        (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
-                                          GR8:$src, sub_8bit), (i32 1))),
-       VK1)>;
+        (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+                                 GR8:$src, sub_8bit), (i32 1)), VK1)>;
 
   def : Pat<(i1 (trunc (i16 GR16:$src))),
        (COPY_TO_REGCLASS
-        (KMOVWkr (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
-                                          GR16:$src, sub_16bit), (i32 1))),
-       VK1)>;
+        (AND32ri8 (INSERT_SUBREG (i32 (IMPLICIT_DEF)),
+                                 GR16:$src, sub_16bit), (i32 1)), VK1)>;
 
   def : Pat<(i32 (zext VK1:$src)),
-            (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1))>;
+            (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1))>;
 
   def : Pat<(i32 (anyext VK1:$src)),
             (COPY_TO_REGCLASS VK1:$src, GR32)>;
 
   def : Pat<(i8 (zext VK1:$src)),
             (EXTRACT_SUBREG
-             (AND32ri8 (KMOVWrk
-                        (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)), sub_8bit)>;
+             (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_8bit)>;
 
   def : Pat<(i8 (anyext VK1:$src)),
             (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_8bit)>;
 
   def : Pat<(i64 (zext VK1:$src)),
-            (AND64ri8 (SUBREG_TO_REG (i64 0),
-             (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), sub_32bit), (i64 1))>;
+            (SUBREG_TO_REG (i64 0),
+             (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_32bit)>;
 
   def : Pat<(i64 (anyext VK1:$src)),
             (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
@@ -2296,8 +2321,7 @@ let Predicates = [HasAVX512] in {
 
   def : Pat<(i16 (zext VK1:$src)),
             (EXTRACT_SUBREG
-             (AND32ri8 (KMOVWrk (COPY_TO_REGCLASS VK1:$src, VK16)), (i32 1)),
-              sub_16bit)>;
+             (AND32ri8 (COPY_TO_REGCLASS VK1:$src, GR32), (i32 1)), sub_16bit)>;
 
   def : Pat<(i16 (anyext VK1:$src)),
             (EXTRACT_SUBREG (i32 (COPY_TO_REGCLASS VK1:$src, GR32)), sub_16bit)>;
@@ -2351,15 +2375,6 @@ multiclass avx512_mask_unop_all<bits<8> opc, string OpcodeStr,
 
 defm KNOT : avx512_mask_unop_all<0x44, "knot", vnot>;
 
-multiclass avx512_mask_unop_int<string IntName, string InstName> {
-  let Predicates = [HasAVX512] in
-    def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
-                (i16 GR16:$src)),
-              (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
-              (v16i1 (COPY_TO_REGCLASS GR16:$src, VK16))), GR16)>;
-}
-defm : avx512_mask_unop_int<"knot", "KNOT">;
-
 // KNL does not support KMOVB, 8-bit mask is promoted to 16-bit
 let Predicates = [HasAVX512, NoDQI] in
 def : Pat<(vnot VK8:$src),
@@ -2408,21 +2423,6 @@ defm KXOR  : avx512_mask_binop_all<0x47, "kxor",  xor,   1>;
 defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, 0>;
 defm KADD  : avx512_mask_binop_all<0x4A, "kadd",  add,   1, HasDQI>;
 
-multiclass avx512_mask_binop_int<string IntName, string InstName> {
-  let Predicates = [HasAVX512] in
-    def : Pat<(!cast<Intrinsic>("int_x86_avx512_"##IntName##"_w")
-                (i16 GR16:$src1), (i16 GR16:$src2)),
-              (COPY_TO_REGCLASS (!cast<Instruction>(InstName##"Wrr")
-              (v16i1 (COPY_TO_REGCLASS GR16:$src1, VK16)),
-              (v16i1 (COPY_TO_REGCLASS GR16:$src2, VK16))), GR16)>;
-}
-
-defm : avx512_mask_binop_int<"kand",  "KAND">;
-defm : avx512_mask_binop_int<"kandn", "KANDN">;
-defm : avx512_mask_binop_int<"kor",   "KOR">;
-defm : avx512_mask_binop_int<"kxnor", "KXNOR">;
-defm : avx512_mask_binop_int<"kxor",  "KXOR">;
-
 multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode,
                             Instruction Inst> {
   // With AVX512F, 8-bit mask is promoted to 16-bit mask,
@@ -2552,9 +2552,11 @@ let Predicates = [HasAVX512] in {
   def : Pat<(v8i1 immAllOnesV),  (COPY_TO_REGCLASS (KSET1W), VK8)>;
   def : Pat<(v4i1 immAllOnesV),  (COPY_TO_REGCLASS (KSET1W), VK4)>;
   def : Pat<(v2i1 immAllOnesV),  (COPY_TO_REGCLASS (KSET1W), VK2)>;
-  def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>;
-  def : Pat<(i1 1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
-  def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
+  let AddedComplexity = 10 in { // To optimize isel table.
+    def : Pat<(i1 0), (COPY_TO_REGCLASS (KSET0W), VK1)>;
+    def : Pat<(i1 1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
+    def : Pat<(i1 -1), (COPY_TO_REGCLASS (KSHIFTRWri (KSET1W), (i8 15)), VK1)>;
+  }
 }
 
 // Patterns for kmask insert_subvector/extract_subvector to/from index=0
@@ -3070,6 +3072,10 @@ def VMOV64toSDZrr : AVX512BI<0x6E, MRMSrcReg, (outs FR64X:$dst), (ins GR64:$src)
                        "vmovq\t{$src, $dst|$dst, $src}",
                        [(set FR64X:$dst, (bitconvert GR64:$src))],
                        IIC_SSE_MOVDQ>, EVEX, VEX_W, Sched<[WriteMove]>;
+def VMOV64toSDZrm : AVX512XSI<0x7E, MRMSrcMem, (outs FR64X:$dst), (ins i64mem:$src),
+                      "vmovq\t{$src, $dst|$dst, $src}",
+                      [(set FR64X:$dst, (bitconvert (loadi64 addr:$src)))]>,
+                      EVEX, VEX_W, EVEX_CD8<8, CD8VT8>;
 def VMOVSDto64Zrr : AVX512BI<0x7E, MRMDestReg, (outs GR64:$dst), (ins FR64X:$src),
                          "vmovq\t{$src, $dst|$dst, $src}",
                          [(set GR64:$dst, (bitconvert FR64X:$src))],
@@ -3182,20 +3188,22 @@ multiclass avx512_move_scalar<string asm, SDNode OpNode,
                                     (scalar_to_vector _.FRC:$src2))))],
              _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V;
   def rrkz : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
-              (ins _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+              (ins _.KRCWM:$mask, _.RC:$src1, _.FRC:$src2),
               !strconcat(asm, "\t{$src2, $src1, $dst {${mask}} {z}|",
               "$dst {${mask}} {z}, $src1, $src2}"),
               [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
-                                      (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+                                      (_.VT (OpNode _.RC:$src1,
+                                            (scalar_to_vector _.FRC:$src2))),
                                       _.ImmAllZerosV)))],
               _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_KZ;
   let Constraints = "$src0 = $dst"  in
   def rrk : AVX512PI<0x10, MRMSrcReg, (outs _.RC:$dst),
-             (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.RC:$src2),
+             (ins _.RC:$src0, _.KRCWM:$mask, _.RC:$src1, _.FRC:$src2),
              !strconcat(asm, "\t{$src2, $src1, $dst {${mask}}|",
              "$dst {${mask}}, $src1, $src2}"),
              [(set _.RC:$dst, (_.VT (X86selects _.KRCWM:$mask,
-                                     (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+                                     (_.VT (OpNode _.RC:$src1,
+                                           (scalar_to_vector _.FRC:$src2))),
                                      (_.VT _.RC:$src0))))],
              _.ExeDomain,IIC_SSE_MOV_S_RR>, EVEX_4V, EVEX_K;
   let canFoldAsLoad = 1, isReMaterializable = 1 in
@@ -3245,8 +3253,7 @@ def : Pat<(_.VT (OpNode _.RC:$src0,
           (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrk)
                                           (COPY_TO_REGCLASS _.FRC:$src2, _.RC),
                                           (COPY_TO_REGCLASS GR32:$mask, VK1WM),
-                                          (_.VT _.RC:$src0),
-                                          (COPY_TO_REGCLASS _.FRC:$src1, _.RC)),
+                                          (_.VT _.RC:$src0), _.FRC:$src1),
                             _.RC)>;
 
 def : Pat<(_.VT (OpNode _.RC:$src0,
@@ -3256,10 +3263,8 @@ def : Pat<(_.VT (OpNode _.RC:$src0,
                                                        (_.EltVT ZeroFP))))))),
           (COPY_TO_REGCLASS (!cast<Instruction>(InstrStr#rrkz)
                                           (COPY_TO_REGCLASS GR32:$mask, VK1WM),
-                                          (_.VT _.RC:$src0),
-                                          (COPY_TO_REGCLASS _.FRC:$src1, _.RC)),
+                                          (_.VT _.RC:$src0), _.FRC:$src1),
                             _.RC)>;
-
 }
 
 multiclass avx512_store_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
@@ -3269,14 +3274,31 @@ def : Pat<(masked_store addr:$dst, Mask,
              (_.info512.VT (insert_subvector undef,
                                (_.info256.VT (insert_subvector undef,
                                                  (_.info128.VT _.info128.RC:$src),
-                                                 (i64 0))),
-                               (i64 0)))),
+                                                 (iPTR 0))),
+                               (iPTR 0)))),
           (!cast<Instruction>(InstrStr#mrk) addr:$dst,
                       (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
                       (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
 
 }
 
+multiclass avx512_store_scalar_lowering_subreg<string InstrStr,
+                                               AVX512VLVectorVTInfo _,
+                                               dag Mask, RegisterClass MaskRC,
+                                               SubRegIndex subreg> {
+// Masked store -> InstrStr#mrk; MaskRC:$mask is widened to i32 via subreg.
+def : Pat<(masked_store addr:$dst, Mask,
+             (_.info512.VT (insert_subvector undef,
+                               (_.info256.VT (insert_subvector undef,
+                                                 (_.info128.VT _.info128.RC:$src),
+                                                 (iPTR 0))),
+                               (iPTR 0)))),
+          (!cast<Instruction>(InstrStr#mrk) addr:$dst,
+                      (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)),
+                      (COPY_TO_REGCLASS _.info128.RC:$src, _.info128.FRC))>;
+
+}
+
 multiclass avx512_load_scalar_lowering<string InstrStr, AVX512VLVectorVTInfo _,
                                        dag Mask, RegisterClass MaskRC> {
 
@@ -3284,7 +3306,7 @@ def : Pat<(_.info128.VT (extract_subvector
                          (_.info512.VT (masked_load addr:$srcAddr, Mask,
                                         (_.info512.VT (bitconvert
                                                        (v16i32 immAllZerosV))))),
-                           (i64 0))),
+                           (iPTR 0))),
           (!cast<Instruction>(InstrStr#rmkz)
                       (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
                       addr:$srcAddr)>;
@@ -3294,53 +3316,81 @@ def : Pat<(_.info128.VT (extract_subvector
                       (_.info512.VT (insert_subvector undef,
                             (_.info256.VT (insert_subvector undef,
                                   (_.info128.VT (X86vzmovl _.info128.RC:$src)),
-                                  (i64 0))),
-                            (i64 0))))),
-                (i64 0))),
+                                  (iPTR 0))),
+                            (iPTR 0))))),
+                (iPTR 0))),
           (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
                       (i1 (COPY_TO_REGCLASS MaskRC:$mask, VK1WM)),
                       addr:$srcAddr)>;
 
 }
 
+multiclass avx512_load_scalar_lowering_subreg<string InstrStr,
+                                              AVX512VLVectorVTInfo _,
+                                              dag Mask, RegisterClass MaskRC,
+                                              SubRegIndex subreg> {
+// Zero-masking form: all-zeros passthrough selects InstrStr#rmkz.
+def : Pat<(_.info128.VT (extract_subvector
+                         (_.info512.VT (masked_load addr:$srcAddr, Mask,
+                                        (_.info512.VT (bitconvert
+                                                       (v16i32 immAllZerosV))))),
+                           (iPTR 0))),
+          (!cast<Instruction>(InstrStr#rmkz)
+                      (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)),
+                      addr:$srcAddr)>;
+// Merge form: passthrough is the widened X86vzmovl of $src -> InstrStr#rmk.
+def : Pat<(_.info128.VT (extract_subvector
+                (_.info512.VT (masked_load addr:$srcAddr, Mask,
+                      (_.info512.VT (insert_subvector undef,
+                            (_.info256.VT (insert_subvector undef,
+                                  (_.info128.VT (X86vzmovl _.info128.RC:$src)),
+                                  (iPTR 0))),
+                            (iPTR 0))))),
+                (iPTR 0))),
+          (!cast<Instruction>(InstrStr#rmk) _.info128.RC:$src,
+                      (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), MaskRC:$mask, subreg)), VK1WM)),
+                      addr:$srcAddr)>;
+
+}
+
 defm : avx512_move_scalar_lowering<"VMOVSSZ", X86Movss, fp32imm0, v4f32x_info>;
 defm : avx512_move_scalar_lowering<"VMOVSDZ", X86Movsd, fp64imm0, v2f64x_info>;
 
 defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
                    (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
-defm : avx512_store_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
-                   (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16>;
-defm : avx512_store_scalar_lowering<"VMOVSDZ", avx512vl_f64_info,
-                   (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8>;
+defm : avx512_store_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
+                   (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16, sub_16bit>;
+defm : avx512_store_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
+                   (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
 
 defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
                    (v16i1 (bitconvert (i16 (trunc (and GR32:$mask, (i32 1)))))), GR32>;
-defm : avx512_load_scalar_lowering<"VMOVSSZ", avx512vl_f32_info,
-                   (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16>;
-defm : avx512_load_scalar_lowering<"VMOVSDZ", avx512vl_f64_info,
-                   (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8>;
+defm : avx512_load_scalar_lowering_subreg<"VMOVSSZ", avx512vl_f32_info,
+                   (v16i1 (bitconvert (i16 (and GR16:$mask, (i16 1))))), GR16, sub_16bit>;
+defm : avx512_load_scalar_lowering_subreg<"VMOVSDZ", avx512vl_f64_info,
+                   (v8i1 (bitconvert (i8 (and GR8:$mask, (i8 1))))), GR8, sub_8bit>;
 
 def : Pat<(f32 (X86selects VK1WM:$mask, (f32 FR32X:$src1), (f32 FR32X:$src2))),
           (COPY_TO_REGCLASS (VMOVSSZrrk (COPY_TO_REGCLASS FR32X:$src2, VR128X),
-           VK1WM:$mask, (v4f32 (IMPLICIT_DEF)),(COPY_TO_REGCLASS FR32X:$src1, VR128X)), FR32X)>;
+           VK1WM:$mask, (v4f32 (IMPLICIT_DEF)), FR32X:$src1), FR32X)>;
 
 def : Pat<(f64 (X86selects VK1WM:$mask, (f64 FR64X:$src1), (f64 FR64X:$src2))),
           (COPY_TO_REGCLASS (VMOVSDZrrk (COPY_TO_REGCLASS FR64X:$src2, VR128X),
-           VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), (COPY_TO_REGCLASS FR64X:$src1, VR128X)), FR64X)>;
+           VK1WM:$mask, (v2f64 (IMPLICIT_DEF)), FR64X:$src1), FR64X)>;
 
 def : Pat<(int_x86_avx512_mask_store_ss addr:$dst, VR128X:$src, GR8:$mask),
-          (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS GR8:$mask, VK1WM)),
+          (VMOVSSZmrk addr:$dst, (i1 (COPY_TO_REGCLASS (i32 (INSERT_SUBREG (IMPLICIT_DEF), GR8:$mask, sub_8bit)), VK1WM)),
            (COPY_TO_REGCLASS VR128X:$src, FR32X))>;
 
 let hasSideEffects = 0 in
 defm VMOVSSZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f32x_info,
-                           (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2),
+                           (outs VR128X:$dst), (ins VR128X:$src1, FR32X:$src2),
                            "vmovss.s", "$src2, $src1", "$src1, $src2", []>,
                            XS, EVEX_4V, VEX_LIG;
 
 let hasSideEffects = 0 in
-defm VMOVSSDrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info,
-                           (outs VR128X:$dst), (ins VR128X:$src1, VR128X:$src2),
+defm VMOVSDZrr_REV : AVX512_maskable_in_asm<0x11, MRMDestReg, f64x_info,
+                           (outs VR128X:$dst), (ins VR128X:$src1, FR64X:$src2),
                            "vmovsd.s", "$src2, $src1", "$src1, $src2", []>,
                            XD, EVEX_4V, VEX_LIG, VEX_W;
 
@@ -3492,10 +3542,6 @@ let Predicates = [HasAVX512] in {
             (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
   def : Pat<(v2f64 (X86Movsd VR128X:$src1, VR128X:$src2)),
             (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
-  def : Pat<(v4f32 (X86Movsd VR128X:$src1, VR128X:$src2)),
-            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
-  def : Pat<(v4i32 (X86Movsd VR128X:$src1, VR128X:$src2)),
-            (VMOVSDZrr VR128X:$src1, (COPY_TO_REGCLASS VR128X:$src2, FR64X))>;
 
   // 256-bit variants
   def : Pat<(v4i64 (X86Movsd VR256X:$src1, VR256X:$src2)),
@@ -3545,6 +3591,8 @@ let Predicates = [HasAVX512] in {
   }
   // AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
   let AddedComplexity = 20 in {
+    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
+              (VMOVDI2PDIZrm addr:$src)>;
     def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
               (VMOVDI2PDIZrm addr:$src)>;
     def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
@@ -3579,19 +3627,6 @@ let Predicates = [HasAVX512] in {
   def : Pat<(v8i64 (X86vzload addr:$src)),
             (SUBREG_TO_REG (i64 0), (VMOVQI2PQIZrm addr:$src), sub_xmm)>;
 }
-
-def : Pat<(v16i32 (X86Vinsert (v16i32 immAllZerosV), GR32:$src2, (iPTR 0))),
-        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v8i64 (X86Vinsert (bc_v8i64 (v16i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
-        (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
-
-def : Pat<(v16i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
-        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIZrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v8i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
-        (SUBREG_TO_REG (i32 0), (VMOV64toPQIZrr GR64:$src2), sub_xmm)>;
-
 //===----------------------------------------------------------------------===//
 // AVX-512 - Non-temporals
 //===----------------------------------------------------------------------===//
@@ -4150,16 +4185,16 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
   defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                            (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
                            "$src2, $src1", "$src1, $src2",
-                           (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
-                           (i32 FROUND_CURRENT)),
+                           (_.VT (VecNode _.RC:$src1, _.RC:$src2,
+                                          (i32 FROUND_CURRENT))),
                            itins.rr>;
 
   defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
-                         (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
+                         (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
                          "$src2, $src1", "$src1, $src2",
-                         (VecNode (_.VT _.RC:$src1),
-                          (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
-                           (i32 FROUND_CURRENT)),
+                         (_.VT (VecNode _.RC:$src1,
+                                        _.ScalarIntMemCPat:$src2,
+                                        (i32 FROUND_CURRENT))),
                          itins.rm>;
   let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
   def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
@@ -4189,13 +4224,43 @@ multiclass avx512_fp_scalar_round<bits<8> opc, string OpcodeStr,X86VectorVTInfo
                           EVEX_B, EVEX_RC;
 }
 multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
-                         SDNode VecNode, OpndItins itins, bit IsCommutable> {
-  let ExeDomain = _.ExeDomain in
+                                SDNode OpNode, SDNode VecNode, SDNode SaeNode,
+                                OpndItins itins, bit IsCommutable> {
+  let ExeDomain = _.ExeDomain in {
+  defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+                           (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
+                           "$src2, $src1", "$src1, $src2",
+                           (_.VT (VecNode _.RC:$src1, _.RC:$src2)),
+                           itins.rr>;
+
+  defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                         (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
+                         "$src2, $src1", "$src1, $src2",
+                         (_.VT (VecNode _.RC:$src1,
+                                        _.ScalarIntMemCPat:$src2)),
+                         itins.rm>;
+
+  let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
+  def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
+                         (ins _.FRC:$src1, _.FRC:$src2),
+                          OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                          [(set _.FRC:$dst, (OpNode _.FRC:$src1, _.FRC:$src2))],
+                          itins.rr> {
+    let isCommutable = IsCommutable;
+  }
+  def rm : I< opc, MRMSrcMem, (outs _.FRC:$dst),
+                         (ins _.FRC:$src1, _.ScalarMemOp:$src2),
+                         OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+                         [(set _.FRC:$dst, (OpNode _.FRC:$src1,
+                         (_.ScalarLdFrag addr:$src2)))], itins.rm>;
+  }
+
   defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                             (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
                             "{sae}, $src2, $src1", "$src1, $src2, {sae}",
-                            (VecNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
+                            (SaeNode (_.VT _.RC:$src1), (_.VT _.RC:$src2),
                             (i32 FROUND_NO_EXC))>, EVEX_B;
+  }
 }
 
 multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -4214,31 +4279,29 @@ multiclass avx512_binop_s_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
 }
 
 multiclass avx512_binop_s_sae<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                                  SDNode VecNode,
+                                  SDNode VecNode, SDNode SaeNode,
                                   SizeItins itins, bit IsCommutable> {
-  defm SSZ : avx512_fp_scalar<opc, OpcodeStr#"ss", f32x_info, OpNode, VecNode,
-                              itins.s, IsCommutable>,
-             avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, VecNode,
-                              itins.s, IsCommutable>,
+  defm SSZ : avx512_fp_scalar_sae<opc, OpcodeStr#"ss", f32x_info, OpNode,
+                              VecNode, SaeNode, itins.s, IsCommutable>,
                               XS, EVEX_4V, VEX_LIG,  EVEX_CD8<32, CD8VT1>;
-  defm SDZ : avx512_fp_scalar<opc, OpcodeStr#"sd", f64x_info, OpNode, VecNode,
-                              itins.d,                  IsCommutable>,
-             avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, VecNode,
-                              itins.d, IsCommutable>,
+  defm SDZ : avx512_fp_scalar_sae<opc, OpcodeStr#"sd", f64x_info, OpNode,
+                              VecNode, SaeNode, itins.d, IsCommutable>,
                               XD, VEX_W, EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>;
 }
-defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnd, SSE_ALU_ITINS_S, 1>;
-defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnd, SSE_MUL_ITINS_S, 1>;
-defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnd, SSE_ALU_ITINS_S, 0>;
-defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnd, SSE_DIV_ITINS_S, 0>;
-defm VMIN : avx512_binop_s_sae  <0x5D, "vmin", X86fmin, X86fminRnd, SSE_ALU_ITINS_S, 0>;
-defm VMAX : avx512_binop_s_sae  <0x5F, "vmax", X86fmax, X86fmaxRnd, SSE_ALU_ITINS_S, 0>;
+defm VADD : avx512_binop_s_round<0x58, "vadd", fadd, X86faddRnds, SSE_ALU_ITINS_S, 1>;
+defm VMUL : avx512_binop_s_round<0x59, "vmul", fmul, X86fmulRnds, SSE_MUL_ITINS_S, 1>;
+defm VSUB : avx512_binop_s_round<0x5C, "vsub", fsub, X86fsubRnds, SSE_ALU_ITINS_S, 0>;
+defm VDIV : avx512_binop_s_round<0x5E, "vdiv", fdiv, X86fdivRnds, SSE_DIV_ITINS_S, 0>;
+defm VMIN : avx512_binop_s_sae  <0x5D, "vmin", X86fmin, X86fmins, X86fminRnds,
+                                 SSE_ALU_ITINS_S, 0>;
+defm VMAX : avx512_binop_s_sae  <0x5F, "vmax", X86fmax, X86fmaxs, X86fmaxRnds,
+                                 SSE_ALU_ITINS_S, 0>;
 
 // MIN/MAX nodes are commutable under "unsafe-fp-math". In this case we use
 // X86fminc and X86fmaxc instead of X86fmin and X86fmax
 multiclass avx512_comutable_binop_s<bits<8> opc, string OpcodeStr,
                           X86VectorVTInfo _, SDNode OpNode, OpndItins itins> {
-  let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
+  let isCodeGenOnly = 1, Predicates = [HasAVX512], ExeDomain = _.ExeDomain in {
   def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
                          (ins _.FRC:$src1, _.FRC:$src2),
                           OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
@@ -4496,6 +4559,7 @@ let Predicates = [HasVLX,HasDQI] in {
 
 multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                             X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in {
   defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
                   (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
                   "$src2, $src1", "$src1, $src2",
@@ -4511,10 +4575,12 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                    (OpNode  _.RC:$src1, (_.VT (X86VBroadcast
                                               (_.ScalarLdFrag addr:$src2))), (i32 FROUND_CURRENT))>,
                    EVEX_4V, EVEX_B;
+  }
 }
 
 multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                             X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in {
   defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                   (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
                   "$src2, $src1", "$src1, $src2",
@@ -4525,6 +4591,7 @@ multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
                   (OpNode _.RC:$src1,
                           (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
                           (i32 FROUND_CURRENT))>;
+  }
 }
 
 multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr, SDNode OpNode, SDNode OpNodeScal> {
@@ -4797,6 +4864,33 @@ defm VPSLL : avx512_shift_types<0xF2, 0xF3, 0xF1, "vpsll", X86vshl>;
 defm VPSRA : avx512_shift_types<0xE2, 0xE2, 0xE1, "vpsra", X86vsra>;
 defm VPSRL : avx512_shift_types<0xD2, 0xD3, 0xD1, "vpsrl", X86vsrl>;
 
+// Without VLX, lower v2i64/v4i64 VPSRA/VPSRAI through the 512-bit instruction.
+let Predicates = [HasAVX512, NoVLX] in {
+  def : Pat<(v4i64 (X86vsra (v4i64 VR256X:$src1), (v2i64 VR128X:$src2))),
+            (EXTRACT_SUBREG (v8i64
+              (VPSRAQZrr
+                (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+                 VR128X:$src2)), sub_ymm)>;
+
+  def : Pat<(v2i64 (X86vsra (v2i64 VR128X:$src1), (v2i64 VR128X:$src2))),
+            (EXTRACT_SUBREG (v8i64
+              (VPSRAQZrr
+                (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+                 VR128X:$src2)), sub_xmm)>;
+
+  def : Pat<(v4i64 (X86vsrai (v4i64 VR256X:$src1), (i8 imm:$src2))),
+            (EXTRACT_SUBREG (v8i64
+              (VPSRAQZri
+                (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR256X:$src1, sub_ymm)),
+                 imm:$src2)), sub_ymm)>;
+
+  def : Pat<(v2i64 (X86vsrai (v2i64 VR128X:$src1), (i8 imm:$src2))),
+            (EXTRACT_SUBREG (v8i64
+              (VPSRAQZri
+                (v8i64 (INSERT_SUBREG (IMPLICIT_DEF), VR128X:$src1, sub_xmm)),
+                 imm:$src2)), sub_xmm)>;
+}
+
 //===-------------------------------------------------------------------===//
 // Variable Bit Shifts
 //===-------------------------------------------------------------------===//
@@ -4915,7 +5009,6 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
     def : Pat<(_.VT (X86vsrav _.RC:$src1, (bitconvert (_.LdFrag addr:$src2)))),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rm)
                _.RC:$src1, addr:$src2)>;
-    let AddedComplexity = 20 in {
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
                      (X86vsrav _.RC:$src1, _.RC:$src2), _.RC:$src0)),
               (!cast<Instruction>(InstrStr#_.ZSuffix#rrk) _.RC:$src0,
@@ -4925,8 +5018,6 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
                      _.RC:$src0)),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmk) _.RC:$src0,
                _.KRC:$mask, _.RC:$src1, addr:$src2)>;
-    }
-    let AddedComplexity = 30 in {
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
                      (X86vsrav _.RC:$src1, _.RC:$src2), _.ImmAllZerosV)),
               (!cast<Instruction>(InstrStr#_.ZSuffix#rrkz) _.KRC:$mask,
@@ -4936,7 +5027,6 @@ multiclass avx512_var_shift_int_lowering<string InstrStr, X86VectorVTInfo _,
                      _.ImmAllZerosV)),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmkz) _.KRC:$mask,
                _.RC:$src1, addr:$src2)>;
-    }
   }
 }
 
@@ -4948,14 +5038,12 @@ multiclass avx512_var_shift_int_lowering_mb<string InstrStr, X86VectorVTInfo _,
                      (X86VBroadcast (_.ScalarLdFrag addr:$src2)))),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmb)
                _.RC:$src1, addr:$src2)>;
-    let AddedComplexity = 20 in
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
                      (X86vsrav _.RC:$src1,
                       (X86VBroadcast (_.ScalarLdFrag addr:$src2))),
                      _.RC:$src0)),
               (!cast<Instruction>(InstrStr#_.ZSuffix##rmbk) _.RC:$src0,
                _.KRC:$mask, _.RC:$src1, addr:$src2)>;
-    let AddedComplexity = 30 in
     def : Pat<(_.VT (vselect _.KRCWM:$mask,
                      (X86vsrav _.RC:$src1,
                       (X86VBroadcast (_.ScalarLdFrag addr:$src2))),
@@ -5153,6 +5241,7 @@ let Predicates = [HasAVX512] in {
 //===----------------------------------------------------------------------===//
 multiclass avx512_mov_hilo_packed<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                   X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in
   def rm : AVX512<opc, MRMSrcMem, (outs _.RC:$dst),
                   (ins _.RC:$src1, f64mem:$src2),
                   !strconcat(OpcodeStr,
@@ -5501,7 +5590,7 @@ multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
           "$src3, $src2", "$src2, $src3", RHS_VEC_r, 1, 1>, AVX512FMA3Base;
 
   defm m_Int: AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
-          (ins _.RC:$src2, _.ScalarMemOp:$src3), OpcodeStr,
+          (ins _.RC:$src2, _.IntScalarMemOp:$src3), OpcodeStr,
           "$src3, $src2", "$src2, $src3", RHS_VEC_m, 1, 1>, AVX512FMA3Base;
 
   defm rb_Int: AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -5527,13 +5616,13 @@ multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
 multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
                             string OpcodeStr, SDNode OpNode, SDNode OpNodeRnds1,
                             SDNode OpNodeRnds3, X86VectorVTInfo _ , string SUFF> {
-
+  let ExeDomain = _.ExeDomain in {
   defm NAME#213#SUFF#Z: avx512_fma3s_common<opc213, OpcodeStr#"213"#_.Suffix , _ ,
                 // Operands for intrinsic are in 123 order to preserve passthu
                 // semantics.
                 (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3, (i32 FROUND_CURRENT))),
                 (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2,
-                         (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))), (i32 FROUND_CURRENT))),
+                         _.ScalarIntMemCPat:$src3, (i32 FROUND_CURRENT))),
                 (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src2, _.RC:$src3,
                          (i32 imm:$rc))),
                 (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src2, _.FRC:$src1,
@@ -5543,8 +5632,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
 
   defm NAME#231#SUFF#Z: avx512_fma3s_common<opc231, OpcodeStr#"231"#_.Suffix , _ ,
                 (_.VT (OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 FROUND_CURRENT))),
-                (_.VT (OpNodeRnds3 _.RC:$src2,
-                       (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
+                (_.VT (OpNodeRnds3 _.RC:$src2, _.ScalarIntMemCPat:$src3,
                               _.RC:$src1, (i32 FROUND_CURRENT))),
                 (_.VT ( OpNodeRnds3 _.RC:$src2, _.RC:$src3, _.RC:$src1,
                                   (i32 imm:$rc))),
@@ -5555,8 +5643,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
 
   defm NAME#132#SUFF#Z: avx512_fma3s_common<opc132, OpcodeStr#"132"#_.Suffix , _ ,
                 (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 FROUND_CURRENT))),
-                (_.VT (OpNodeRnds1 _.RC:$src1,
-                       (_.VT (scalar_to_vector(_.ScalarLdFrag addr:$src3))),
+                (_.VT (OpNodeRnds1 _.RC:$src1, _.ScalarIntMemCPat:$src3,
                               _.RC:$src2, (i32 FROUND_CURRENT))),
                 (_.VT (OpNodeRnds1 _.RC:$src1, _.RC:$src3, _.RC:$src2,
                          (i32 imm:$rc))),
@@ -5564,6 +5651,7 @@ multiclass avx512_fma3s_all<bits<8> opc213, bits<8> opc231, bits<8> opc132,
                          _.FRC:$src2))),
                 (set _.FRC:$dst, (_.EltVT (OpNode _.FRC:$src1,
                           (_.ScalarLdFrag addr:$src3), _.FRC:$src2)))>;
+  }
 }
 
 multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
@@ -5594,6 +5682,7 @@ defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub,
 let Constraints = "$src1 = $dst" in {
 multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                                             X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in {
   defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
           (ins _.RC:$src2, _.RC:$src3),
           OpcodeStr, "$src3, $src2", "$src2, $src3",
@@ -5613,6 +5702,7 @@ multiclass avx512_pmadd52_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
             (OpNode _.RC:$src1,
              _.RC:$src2,(_.VT (X86VBroadcast (_.ScalarLdFrag addr:$src3))))>,
             AVX512FMA3Base, EVEX_B;
+  }
 }
 } // Constraints = "$src1 = $dst"
 
@@ -5780,10 +5870,10 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT ,
                 !strconcat(asm,"\t{$rc, $src, $dst|$dst, $src, $rc}"),
                 [(set DstVT.RC:$dst, (OpNode (SrcVT.VT SrcVT.RC:$src),(i32 imm:$rc)))]>,
                 EVEX, VEX_LIG, EVEX_B, EVEX_RC;
-    def rm : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.ScalarMemOp:$src),
+    def rm : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src),
                 !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
                 [(set DstVT.RC:$dst, (OpNode
-                      (SrcVT.VT (scalar_to_vector (SrcVT.ScalarLdFrag addr:$src))),
+                      (SrcVT.VT SrcVT.ScalarIntMemCPat:$src),
                       (i32 FROUND_CURRENT)))]>,
                 EVEX, VEX_LIG;
   } // Predicates = [HasAVX512]
@@ -5820,20 +5910,20 @@ defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info,
 let Predicates = [HasAVX512] in {
   def : Pat<(i32 (int_x86_sse_cvtss2si (v4f32 VR128X:$src))),
             (VCVTSS2SIZrr VR128X:$src)>;
-  def : Pat<(i32 (int_x86_sse_cvtss2si (sse_load_f32 addr:$src))),
-            (VCVTSS2SIZrm addr:$src)>;
+  def : Pat<(i32 (int_x86_sse_cvtss2si sse_load_f32:$src)),
+            (VCVTSS2SIZrm sse_load_f32:$src)>;
   def : Pat<(i64 (int_x86_sse_cvtss2si64 (v4f32 VR128X:$src))),
             (VCVTSS2SI64Zrr VR128X:$src)>;
-  def : Pat<(i64 (int_x86_sse_cvtss2si64 (sse_load_f32 addr:$src))),
-            (VCVTSS2SI64Zrm addr:$src)>;
+  def : Pat<(i64 (int_x86_sse_cvtss2si64 sse_load_f32:$src)),
+            (VCVTSS2SI64Zrm sse_load_f32:$src)>;
   def : Pat<(i32 (int_x86_sse2_cvtsd2si (v2f64 VR128X:$src))),
             (VCVTSD2SIZrr VR128X:$src)>;
-  def : Pat<(i32 (int_x86_sse2_cvtsd2si (sse_load_f64 addr:$src))),
-            (VCVTSD2SIZrm addr:$src)>;
+  def : Pat<(i32 (int_x86_sse2_cvtsd2si sse_load_f64:$src)),
+            (VCVTSD2SIZrm sse_load_f64:$src)>;
   def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (v2f64 VR128X:$src))),
             (VCVTSD2SI64Zrr VR128X:$src)>;
-  def : Pat<(i64 (int_x86_sse2_cvtsd2si64 (sse_load_f64 addr:$src))),
-            (VCVTSD2SI64Zrm addr:$src)>;
+  def : Pat<(i64 (int_x86_sse2_cvtsd2si64 sse_load_f64:$src)),
+            (VCVTSD2SI64Zrm sse_load_f64:$src)>;
 } // HasAVX512
 
 let Predicates = [HasAVX512] in {
@@ -5920,7 +6010,7 @@ let Predicates = [HasAVX512] in {
                                     EVEX,VEX_LIG , EVEX_B;
     let mayLoad = 1, hasSideEffects = 0 in
       def rm_Int : AVX512<opc, MRMSrcMem, (outs _DstRC.RC:$dst),
-                  (ins _SrcRC.MemOp:$src),
+                  (ins _SrcRC.IntScalarMemOp:$src),
                   !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
                   []>, EVEX, VEX_LIG;
 
@@ -5957,47 +6047,58 @@ defm VCVTTSD2USI64Z: avx512_cvt_s_all<0x78, "vcvttsd2usi", f64x_info, i64x_info,
 let Predicates = [HasAVX512] in {
   def : Pat<(i32 (int_x86_sse_cvttss2si (v4f32 VR128X:$src))),
             (VCVTTSS2SIZrr_Int VR128X:$src)>;
-  def : Pat<(i32 (int_x86_sse_cvttss2si (sse_load_f32 addr:$src))),
-            (VCVTTSS2SIZrm_Int addr:$src)>;
+  def : Pat<(i32 (int_x86_sse_cvttss2si sse_load_f32:$src)),
+            (VCVTTSS2SIZrm_Int ssmem:$src)>;
   def : Pat<(i64 (int_x86_sse_cvttss2si64 (v4f32 VR128X:$src))),
             (VCVTTSS2SI64Zrr_Int VR128X:$src)>;
-  def : Pat<(i64 (int_x86_sse_cvttss2si64 (sse_load_f32 addr:$src))),
-            (VCVTTSS2SI64Zrm_Int addr:$src)>;
+  def : Pat<(i64 (int_x86_sse_cvttss2si64 sse_load_f32:$src)),
+            (VCVTTSS2SI64Zrm_Int ssmem:$src)>;
   def : Pat<(i32 (int_x86_sse2_cvttsd2si (v2f64 VR128X:$src))),
             (VCVTTSD2SIZrr_Int VR128X:$src)>;
-  def : Pat<(i32 (int_x86_sse2_cvttsd2si (sse_load_f64 addr:$src))),
-            (VCVTTSD2SIZrm_Int addr:$src)>;
+  def : Pat<(i32 (int_x86_sse2_cvttsd2si sse_load_f64:$src)),
+            (VCVTTSD2SIZrm_Int sdmem:$src)>;
   def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (v2f64 VR128X:$src))),
             (VCVTTSD2SI64Zrr_Int VR128X:$src)>;
-  def : Pat<(i64 (int_x86_sse2_cvttsd2si64 (sse_load_f64 addr:$src))),
-            (VCVTTSD2SI64Zrm_Int addr:$src)>;
+  def : Pat<(i64 (int_x86_sse2_cvttsd2si64 sse_load_f64:$src)),
+            (VCVTTSD2SI64Zrm_Int sdmem:$src)>;
 } // HasAVX512
 //===----------------------------------------------------------------------===//
 // AVX-512  Convert form float to double and back
 //===----------------------------------------------------------------------===//
 multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                          X86VectorVTInfo _Src, SDNode OpNode> {
-  defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+  defm rr_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                          (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
                          "$src2, $src1", "$src1, $src2",
                          (_.VT (OpNode (_.VT _.RC:$src1),
                                        (_Src.VT _Src.RC:$src2),
                                        (i32 FROUND_CURRENT)))>,
                          EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
-  defm rm : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
-                         (ins _Src.RC:$src1, _Src.ScalarMemOp:$src2), OpcodeStr,
+  defm rm_Int : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
+                         (ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr,
                          "$src2, $src1", "$src1, $src2",
                          (_.VT (OpNode (_.VT _.RC:$src1),
-                                  (_Src.VT (scalar_to_vector
-                                            (_Src.ScalarLdFrag addr:$src2))),
+                                  (_Src.VT _Src.ScalarIntMemCPat:$src2),
                                   (i32 FROUND_CURRENT)))>,
                          EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+
+  let isCodeGenOnly = 1, hasSideEffects = 0 in {
+    def rr : I<opc, MRMSrcReg, (outs _.FRC:$dst),
+               (ins _.FRC:$src1, _Src.FRC:$src2),
+               OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+               EVEX_4V, VEX_LIG, Sched<[WriteCvtF2F]>;
+    let mayLoad = 1 in
+    def rm : I<opc, MRMSrcMem, (outs _.FRC:$dst),
+               (ins _.FRC:$src1, _Src.ScalarMemOp:$src2),
+               OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>,
+               EVEX_4V, VEX_LIG, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+  }
 }
 
 // Scalar Coversion with SAE - suppress all exceptions
 multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                          X86VectorVTInfo _Src, SDNode OpNodeRnd> {
-  defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+  defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                         (ins _.RC:$src1, _Src.RC:$src2), OpcodeStr,
                         "{sae}, $src2, $src1", "$src1, $src2, {sae}",
                         (_.VT (OpNodeRnd (_.VT _.RC:$src1),
@@ -6009,7 +6110,7 @@ multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTIn
 // Scalar Conversion with rounding control (RC)
 multiclass avx512_cvt_fp_rc_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                          X86VectorVTInfo _Src, SDNode OpNodeRnd> {
-  defm rrb : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
+  defm rrb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                         (ins _.RC:$src1, _Src.RC:$src2, AVX512RC:$rc), OpcodeStr,
                         "$rc, $src2, $src1", "$src1, $src2, $rc",
                         (_.VT (OpNodeRnd (_.VT _.RC:$src1),
@@ -6042,39 +6143,36 @@ defm VCVTSS2SD : avx512_cvt_fp_scalar_ss2sd<0x5A, "vcvtss2sd",
                                           X86fpextRnd,f32x_info, f64x_info >;
 
 def : Pat<(f64 (fpextend FR32X:$src)),
-          (COPY_TO_REGCLASS (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, VR128X),
-                               (COPY_TO_REGCLASS FR32X:$src, VR128X)), VR128X)>,
+          (VCVTSS2SDZrr (COPY_TO_REGCLASS FR32X:$src, FR64X), FR32X:$src)>,
           Requires<[HasAVX512]>;
 def : Pat<(f64 (fpextend (loadf32 addr:$src))),
-          (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+          (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
           Requires<[HasAVX512]>;
 
 def : Pat<(f64 (extloadf32 addr:$src)),
-      (COPY_TO_REGCLASS (VCVTSS2SDZrm (v4f32 (IMPLICIT_DEF)), addr:$src), VR128X)>,
+          (VCVTSS2SDZrm (f64 (IMPLICIT_DEF)), addr:$src)>,
       Requires<[HasAVX512, OptForSize]>;
 
 def : Pat<(f64 (extloadf32 addr:$src)),
-          (COPY_TO_REGCLASS (VCVTSS2SDZrr (v4f32 (IMPLICIT_DEF)),
-                    (COPY_TO_REGCLASS (VMOVSSZrm addr:$src), VR128X)), VR128X)>,
+          (VCVTSS2SDZrr (f64 (IMPLICIT_DEF)), (VMOVSSZrm addr:$src))>,
           Requires<[HasAVX512, OptForSpeed]>;
 
 def : Pat<(f32 (fpround FR64X:$src)),
-          (COPY_TO_REGCLASS (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, VR128X),
-                    (COPY_TO_REGCLASS FR64X:$src, VR128X)), VR128X)>,
+          (VCVTSD2SSZrr (COPY_TO_REGCLASS FR64X:$src, FR32X), FR64X:$src)>,
            Requires<[HasAVX512]>;
 
 def : Pat<(v4f32 (X86Movss
                    (v4f32 VR128X:$dst),
                    (v4f32 (scalar_to_vector
                      (f32 (fpround (f64 (extractelt VR128X:$src, (iPTR 0))))))))),
-          (VCVTSD2SSZrr VR128X:$dst, VR128X:$src)>,
+          (VCVTSD2SSZrr_Int VR128X:$dst, VR128X:$src)>,
           Requires<[HasAVX512]>;
 
 def : Pat<(v2f64 (X86Movsd
                    (v2f64 VR128X:$dst),
                    (v2f64 (scalar_to_vector
                      (f64 (fpextend (f32 (extractelt VR128X:$src, (iPTR 0))))))))),
-          (VCVTSS2SDZrr VR128X:$dst, VR128X:$src)>,
+          (VCVTSS2SDZrr_Int VR128X:$dst, VR128X:$src)>,
           Requires<[HasAVX512]>;
 
 //===----------------------------------------------------------------------===//
@@ -6710,7 +6808,7 @@ let Predicates = [HasAVX512] in {
   let Predicates = [HasVLX] in {
     defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem>,
                         EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
-    defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f128mem>,
+    defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem>,
                         EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
   }
 }
@@ -6819,7 +6917,7 @@ let Defs = [EFLAGS], Predicates = [HasAVX512] in {
 /// avx512_fp14_s rcp14ss, rcp14sd, rsqrt14ss, rsqrt14sd
 multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                             X86VectorVTInfo _> {
-  let AddedComplexity = 20 , Predicates = [HasAVX512] in {
+  let Predicates = [HasAVX512], ExeDomain = _.ExeDomain in {
   defm rr : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                            (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
                            "$src2, $src1", "$src1, $src2",
@@ -6844,6 +6942,7 @@ defm VRSQRT14SD   : avx512_fp14_s<0x4F, "vrsqrt14sd", X86frsqrt14s, f64x_info>,
 /// avx512_fp14_p rcp14ps, rcp14pd, rsqrt14ps, rsqrt14pd
 multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in {
   defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
                          (ins _.RC:$src), OpcodeStr, "$src", "$src",
                          (_.FloatVT (OpNode _.RC:$src))>, EVEX, T8PD;
@@ -6857,6 +6956,7 @@ multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           (OpNode (_.FloatVT
                             (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
                           EVEX, T8PD, EVEX_B;
+  }
 }
 
 multiclass avx512_fp14_p_vl_all<bits<8> opc, string OpcodeStr, SDNode OpNode> {
@@ -6888,7 +6988,7 @@ defm VRCP14 : avx512_fp14_p_vl_all<0x4C, "vrcp14", X86frcp>;
 /// avx512_fp28_s rcp28ss, rcp28sd, rsqrt28ss, rsqrt28sd
 multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
                          SDNode OpNode> {
-
+  let ExeDomain = _.ExeDomain in {
   defm r : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                            (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
                            "$src2, $src1", "$src1, $src2",
@@ -6907,6 +7007,7 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
                          (OpNode (_.VT _.RC:$src1),
                           (_.VT (scalar_to_vector (_.ScalarLdFrag addr:$src2))),
                          (i32 FROUND_CURRENT))>;
+  }
 }
 
 multiclass avx512_eri_s<bits<8> opc, string OpcodeStr, SDNode OpNode> {
@@ -6926,7 +7027,7 @@ defm VGETEXP   : avx512_eri_s<0x43, "vgetexp", X86fgetexpRnds>, T8PD, EVEX_4V;
 
 multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                          SDNode OpNode> {
-
+  let ExeDomain = _.ExeDomain in {
   defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
                          (ins _.RC:$src), OpcodeStr, "$src", "$src",
                          (OpNode (_.VT _.RC:$src), (i32 FROUND_CURRENT))>;
@@ -6943,9 +7044,11 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                          (OpNode (_.FloatVT
                                   (X86VBroadcast (_.ScalarLdFrag addr:$src))),
                                  (i32 FROUND_CURRENT))>, EVEX_B;
+  }
 }
 multiclass avx512_fp28_p_round<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
                          SDNode OpNode> {
+  let ExeDomain = _.ExeDomain in
   defm rb : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
                         (ins _.RC:$src), OpcodeStr,
                         "{sae}, $src", "$src, {sae}",
@@ -6986,6 +7089,7 @@ defm VGETEXP   : avx512_eri<0x42, "vgetexp", X86fgetexpRnd>,
 
 multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
                               SDNode OpNodeRnd, X86VectorVTInfo _>{
+  let ExeDomain = _.ExeDomain in
   defm rb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
                          (ins _.RC:$src, AVX512RC:$rc), OpcodeStr, "$rc, $src", "$src, $rc",
                          (_.VT (OpNodeRnd _.RC:$src, (i32 imm:$rc)))>,
@@ -6994,6 +7098,7 @@ multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
 
 multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
                               SDNode OpNode, X86VectorVTInfo _>{
+  let ExeDomain = _.ExeDomain in {
   defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
                          (ins _.RC:$src), OpcodeStr, "$src", "$src",
                          (_.FloatVT (OpNode _.RC:$src))>, EVEX;
@@ -7008,6 +7113,7 @@ multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
                           (OpNode (_.FloatVT
                             (X86VBroadcast (_.ScalarLdFrag addr:$src))))>,
                           EVEX, EVEX_B;
+  }
 }
 
 multiclass avx512_sqrt_packed_all<bits<8> opc, string OpcodeStr,
@@ -7045,7 +7151,7 @@ multiclass avx512_sqrt_packed_all_round<bits<8> opc, string OpcodeStr,
 
 multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
                               string SUFF, SDNode OpNode, SDNode OpNodeRnd> {
-
+  let ExeDomain = _.ExeDomain in {
   defm r_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                          (ins _.RC:$src1, _.RC:$src2), OpcodeStr,
                          "$src2, $src1", "$src1, $src2",
@@ -7078,6 +7184,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
                  (ins _.FRC:$src1, _.ScalarMemOp:$src2),
                  OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}", []>;
   }
+  }
 
   def : Pat<(_.EltVT (OpNode _.FRC:$src)),
             (!cast<Instruction>(NAME#SUFF#Zr)
@@ -7382,11 +7489,11 @@ multiclass avx512_extend_common<bits<8> opc, string OpcodeStr,
 }
 
 multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr,
-          SDPatternOperator OpNode,
+          SDPatternOperator OpNode, SDPatternOperator InVecNode,
           string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
   let Predicates = [HasVLX, HasBWI] in {
     defm Z128:  avx512_extend_common<opc, OpcodeStr, v8i16x_info,
-                    v16i8x_info, i64mem, LdFrag, OpNode>,
+                    v16i8x_info, i64mem, LdFrag, InVecNode>,
                      EVEX_CD8<8, CD8VH>, T8PD, EVEX_V128;
 
     defm Z256:  avx512_extend_common<opc, OpcodeStr, v16i16x_info,
@@ -7401,11 +7508,11 @@ multiclass avx512_extend_BW<bits<8> opc, string OpcodeStr,
 }
 
 multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr,
-          SDPatternOperator OpNode,
+          SDPatternOperator OpNode, SDPatternOperator InVecNode,
           string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
   let Predicates = [HasVLX, HasAVX512] in {
     defm Z128:  avx512_extend_common<opc, OpcodeStr, v4i32x_info,
-                   v16i8x_info, i32mem, LdFrag, OpNode>,
+                   v16i8x_info, i32mem, LdFrag, InVecNode>,
                          EVEX_CD8<8, CD8VQ>, T8PD, EVEX_V128;
 
     defm Z256:  avx512_extend_common<opc, OpcodeStr, v8i32x_info,
@@ -7420,11 +7527,11 @@ multiclass avx512_extend_BD<bits<8> opc, string OpcodeStr,
 }
 
 multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr,
-          SDPatternOperator OpNode,
+          SDPatternOperator OpNode, SDPatternOperator InVecNode,
           string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi8")> {
   let Predicates = [HasVLX, HasAVX512] in {
     defm Z128:  avx512_extend_common<opc, OpcodeStr, v2i64x_info,
-                   v16i8x_info, i16mem, LdFrag, OpNode>,
+                   v16i8x_info, i16mem, LdFrag, InVecNode>,
                      EVEX_CD8<8, CD8VO>, T8PD, EVEX_V128;
 
     defm Z256:  avx512_extend_common<opc, OpcodeStr, v4i64x_info,
@@ -7439,11 +7546,11 @@ multiclass avx512_extend_BQ<bits<8> opc, string OpcodeStr,
 }
 
 multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr,
-         SDPatternOperator OpNode,
+         SDPatternOperator OpNode, SDPatternOperator InVecNode,
          string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
   let Predicates = [HasVLX, HasAVX512] in {
     defm Z128:  avx512_extend_common<opc, OpcodeStr, v4i32x_info,
-                   v8i16x_info, i64mem, LdFrag, OpNode>,
+                   v8i16x_info, i64mem, LdFrag, InVecNode>,
                      EVEX_CD8<16, CD8VH>, T8PD, EVEX_V128;
 
     defm Z256:  avx512_extend_common<opc, OpcodeStr, v8i32x_info,
@@ -7458,11 +7565,11 @@ multiclass avx512_extend_WD<bits<8> opc, string OpcodeStr,
 }
 
 multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr,
-         SDPatternOperator OpNode,
+         SDPatternOperator OpNode, SDPatternOperator InVecNode,
          string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi16")> {
   let Predicates = [HasVLX, HasAVX512] in {
     defm Z128:  avx512_extend_common<opc, OpcodeStr, v2i64x_info,
-                   v8i16x_info, i32mem, LdFrag, OpNode>,
+                   v8i16x_info, i32mem, LdFrag, InVecNode>,
                      EVEX_CD8<16, CD8VQ>, T8PD, EVEX_V128;
 
     defm Z256:  avx512_extend_common<opc, OpcodeStr, v4i64x_info,
@@ -7477,12 +7584,12 @@ multiclass avx512_extend_WQ<bits<8> opc, string OpcodeStr,
 }
 
 multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr,
-         SDPatternOperator OpNode,
+         SDPatternOperator OpNode, SDPatternOperator InVecNode,
          string ExtTy,PatFrag LdFrag = !cast<PatFrag>(ExtTy#"extloadvi32")> {
 
   let Predicates = [HasVLX, HasAVX512] in {
     defm Z128:  avx512_extend_common<opc, OpcodeStr, v2i64x_info,
-                   v4i32x_info, i64mem, LdFrag, OpNode>,
+                   v4i32x_info, i64mem, LdFrag, InVecNode>,
                      EVEX_CD8<32, CD8VH>, T8PD, EVEX_V128;
 
     defm Z256:  avx512_extend_common<opc, OpcodeStr, v4i64x_info,
@@ -7496,19 +7603,19 @@ multiclass avx512_extend_DQ<bits<8> opc, string OpcodeStr,
   }
 }
 
-defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, "z">;
-defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, "z">;
-defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, "z">;
-defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, "z">;
-defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, "z">;
-defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, "z">;
+defm VPMOVZXBW : avx512_extend_BW<0x30, "vpmovzxbw", X86vzext, zext_invec, "z">;
+defm VPMOVZXBD : avx512_extend_BD<0x31, "vpmovzxbd", X86vzext, zext_invec, "z">;
+defm VPMOVZXBQ : avx512_extend_BQ<0x32, "vpmovzxbq", X86vzext, zext_invec, "z">;
+defm VPMOVZXWD : avx512_extend_WD<0x33, "vpmovzxwd", X86vzext, zext_invec, "z">;
+defm VPMOVZXWQ : avx512_extend_WQ<0x34, "vpmovzxwq", X86vzext, zext_invec, "z">;
+defm VPMOVZXDQ : avx512_extend_DQ<0x35, "vpmovzxdq", X86vzext, zext_invec, "z">;
 
-defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, "s">;
-defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, "s">;
-defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, "s">;
-defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, "s">;
-defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, "s">;
-defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, "s">;
+defm VPMOVSXBW: avx512_extend_BW<0x20, "vpmovsxbw", X86vsext, sext_invec, "s">;
+defm VPMOVSXBD: avx512_extend_BD<0x21, "vpmovsxbd", X86vsext, sext_invec, "s">;
+defm VPMOVSXBQ: avx512_extend_BQ<0x22, "vpmovsxbq", X86vsext, sext_invec, "s">;
+defm VPMOVSXWD: avx512_extend_WD<0x23, "vpmovsxwd", X86vsext, sext_invec, "s">;
+defm VPMOVSXWQ: avx512_extend_WQ<0x24, "vpmovsxwq", X86vsext, sext_invec, "s">;
+defm VPMOVSXDQ: avx512_extend_DQ<0x25, "vpmovsxdq", X86vsext, sext_invec, "s">;
 
 // EXTLOAD patterns, implemented using vpmovz
 multiclass avx512_ext_lowering<string InstrStr, X86VectorVTInfo To,
@@ -7551,69 +7658,69 @@ let Predicates = [HasAVX512] in {
   defm : avx512_ext_lowering<"DQZ",    v8i64_info,   v8i32x_info,  extloadvi32>;
 }
 
-multiclass AVX512_pmovx_patterns<string OpcPrefix,
-                                 SDNode ExtOp, PatFrag ExtLoad16> {
+multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
+                                 SDNode InVecOp, PatFrag ExtLoad16> {
   // 128-bit patterns
   let Predicates = [HasVLX, HasBWI] in {
-  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+  def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
             (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
-  def : Pat<(v8i16 (ExtOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+  def : Pat<(v8i16 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
             (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
-  def : Pat<(v8i16 (ExtOp (v16i8 (vzmovl_v2i64 addr:$src)))),
+  def : Pat<(v8i16 (InVecOp (v16i8 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
-  def : Pat<(v8i16 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v8i16 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
-  def : Pat<(v8i16 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v8i16 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BWZ128rm) addr:$src)>;
   }
   let Predicates = [HasVLX] in {
-  def : Pat<(v4i32 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+  def : Pat<(v4i32 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
             (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+  def : Pat<(v4i32 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
             (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v4i32 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BDZ128rm) addr:$src)>;
 
-  def : Pat<(v2i64 (ExtOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
+  def : Pat<(v2i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (ExtLoad16 addr:$src)))))),
             (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (v16i8 (vzmovl_v4i32 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (v16i8 (vzmovl_v4i32 addr:$src)))),
             (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (v16i8 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (v16i8 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (bc_v16i8 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (bc_v16i8 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#BQZ128rm) addr:$src)>;
 
-  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+  def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
             (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+  def : Pat<(v4i32 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
             (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (v8i16 (vzmovl_v2i64 addr:$src)))),
+  def : Pat<(v4i32 (InVecOp (v8i16 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v4i32 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
-  def : Pat<(v4i32 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v4i32 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WDZ128rm) addr:$src)>;
 
-  def : Pat<(v2i64 (ExtOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
+  def : Pat<(v2i64 (InVecOp (bc_v8i16 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
             (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (v8i16 (vzmovl_v4i32 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (v8i16 (vzmovl_v4i32 addr:$src)))),
             (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (v8i16 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (v8i16 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (bc_v8i16 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (bc_v8i16 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#WQZ128rm) addr:$src)>;
 
-  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
+  def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
             (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+  def : Pat<(v2i64 (InVecOp (bc_v4i32 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
             (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (v4i32 (vzmovl_v2i64 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (v4i32 (vzmovl_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (v4i32 (vzload_v2i64 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (v4i32 (vzload_v2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
-  def : Pat<(v2i64 (ExtOp (bc_v4i32 (loadv2i64 addr:$src)))),
+  def : Pat<(v2i64 (InVecOp (bc_v4i32 (loadv2i64 addr:$src)))),
             (!cast<I>(OpcPrefix#DQZ128rm) addr:$src)>;
   }
   // 256-bit patterns
@@ -7692,8 +7799,8 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix,
   }
 }
 
-defm : AVX512_pmovx_patterns<"VPMOVSX", X86vsext, extloadi32i16>;
-defm : AVX512_pmovx_patterns<"VPMOVZX", X86vzext, loadi16_anyext>;
+defm : AVX512_pmovx_patterns<"VPMOVSX", X86vsext, sext_invec, extloadi32i16>;
+defm : AVX512_pmovx_patterns<"VPMOVZX", X86vzext, zext_invec, loadi16_anyext>;
 
 //===----------------------------------------------------------------------===//
 // GATHER - SCATTER Operations
@@ -7884,6 +7991,17 @@ def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
                   [(set Vec.RC:$dst, (Vec.VT (X86vsext Vec.KRC:$src)))]>, EVEX;
 }
 
+// Use 512bit version to implement 128/256 bit in case NoVLX.
+multiclass avx512_convert_mask_to_vector_lowering<X86VectorVTInfo X86Info,
+                                                            X86VectorVTInfo _> {
+
+  def : Pat<(X86Info.VT (X86vsext (X86Info.KVT X86Info.KRC:$src))),
+            (X86Info.VT (EXTRACT_SUBREG
+                           (_.VT (!cast<Instruction>(NAME#"Zrr")
+                             (_.KVT (COPY_TO_REGCLASS X86Info.KRC:$src,_.KRC)))),
+                           X86Info.SubRegIdx))>;
+}
+
 multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo,
                                  string OpcodeStr, Predicate prd> {
 let Predicates = [prd] in
@@ -7893,20 +8011,17 @@ let Predicates = [prd] in
     defm Z256 : cvt_by_vec_width<opc, VTInfo.info256, OpcodeStr>, EVEX_V256;
     defm Z128 : cvt_by_vec_width<opc, VTInfo.info128, OpcodeStr>, EVEX_V128;
   }
-}
+let Predicates = [prd, NoVLX] in {
+   defm Z256_Alt :   avx512_convert_mask_to_vector_lowering<VTInfo.info256,VTInfo.info512>;
+   defm Z128_Alt :   avx512_convert_mask_to_vector_lowering<VTInfo.info128,VTInfo.info512>;
+  }
 
-multiclass avx512_convert_mask_to_vector<string OpcodeStr> {
-  defm NAME##B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info,  OpcodeStr,
-                                       HasBWI>;
-  defm NAME##W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, OpcodeStr,
-                                       HasBWI>, VEX_W;
-  defm NAME##D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, OpcodeStr,
-                                       HasDQI>;
-  defm NAME##Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, OpcodeStr,
-                                       HasDQI>, VEX_W;
 }
 
-defm VPMOVM2 : avx512_convert_mask_to_vector<"vpmovm2">;
+defm VPMOVM2B : cvt_mask_by_elt_width<0x28, avx512vl_i8_info, "vpmovm2" , HasBWI>;
+defm VPMOVM2W : cvt_mask_by_elt_width<0x28, avx512vl_i16_info, "vpmovm2", HasBWI> , VEX_W;
+defm VPMOVM2D : cvt_mask_by_elt_width<0x38, avx512vl_i32_info, "vpmovm2", HasDQI>;
+defm VPMOVM2Q : cvt_mask_by_elt_width<0x38, avx512vl_i64_info, "vpmovm2", HasDQI> , VEX_W;
 
 multiclass convert_vector_to_mask_common<bits<8> opc, X86VectorVTInfo _, string OpcodeStr > {
     def rr : AVX512XS8I<opc, MRMSrcReg, (outs _.KRC:$dst), (ins _.RC:$src),
@@ -8221,6 +8336,7 @@ multiclass avx512_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
 //handle scalar instruction  reg_vec1 = op(reg_vec2,reg_vec3,imm),{sae}
 multiclass avx512_fp_sae_scalar_imm<bits<8> opc, string OpcodeStr,
                                              SDNode OpNode, X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in
   defm NAME#rrib : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
                       (ins _.RC:$src1, _.RC:$src2, i32u8imm:$src3),
                       OpcodeStr, "$src3, {sae}, $src2, $src1",
@@ -8438,6 +8554,7 @@ defm VDBPSADBW: avx512_common_3Op_rm_imm8<0x42, X86dbpsadbw, "vdbpsadbw" ,
 
 multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                            X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in {
   defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
                     (ins _.RC:$src1), OpcodeStr,
                     "$src1", "$src1",
@@ -8448,6 +8565,7 @@ multiclass avx512_unary_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
                   "$src1", "$src1",
                   (_.VT (OpNode (bitconvert (_.LdFrag addr:$src1))))>,
             EVEX, AVX5128IBase, EVEX_CD8<_.EltSize, CD8VF>;
+  }
 }
 
 multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -8512,66 +8630,7 @@ multiclass avx512_unary_rm_vl_all<bits<8> opc_b, bits<8> opc_w,
                                     HasBWI>;
 }
 
-defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", X86Abs>;
-
-def avx512_v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
-                                                      VR128X:$src))>;
-def avx512_v8i1sextv8i16 : PatLeaf<(v8i16 (X86vsrai VR128X:$src, (i8 15)))>;
-def avx512_v4i1sextv4i32  : PatLeaf<(v4i32 (X86vsrai VR128X:$src, (i8 31)))>;
-def avx512_v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
-                                                      VR256X:$src))>;
-def avx512_v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256X:$src, (i8 15)))>;
-def avx512_v8i1sextv8i32  : PatLeaf<(v8i32 (X86vsrai VR256X:$src, (i8 31)))>;
-
-let Predicates = [HasBWI, HasVLX] in {
-  def : Pat<(xor
-            (bc_v2i64 (avx512_v16i1sextv16i8)),
-            (bc_v2i64 (add (v16i8 VR128X:$src), (avx512_v16i1sextv16i8)))),
-            (VPABSBZ128rr VR128X:$src)>;
-  def : Pat<(xor
-            (bc_v2i64 (avx512_v8i1sextv8i16)),
-            (bc_v2i64 (add (v8i16 VR128X:$src), (avx512_v8i1sextv8i16)))),
-            (VPABSWZ128rr VR128X:$src)>;
-  def : Pat<(xor
-            (bc_v4i64 (avx512_v32i1sextv32i8)),
-            (bc_v4i64 (add (v32i8 VR256X:$src), (avx512_v32i1sextv32i8)))),
-            (VPABSBZ256rr VR256X:$src)>;
-  def : Pat<(xor
-            (bc_v4i64 (avx512_v16i1sextv16i16)),
-            (bc_v4i64 (add (v16i16 VR256X:$src), (avx512_v16i1sextv16i16)))),
-            (VPABSWZ256rr VR256X:$src)>;
-}
-let Predicates = [HasAVX512, HasVLX] in {
-  def : Pat<(xor
-            (bc_v2i64 (avx512_v4i1sextv4i32)),
-            (bc_v2i64 (add (v4i32 VR128X:$src), (avx512_v4i1sextv4i32)))),
-            (VPABSDZ128rr VR128X:$src)>;
-  def : Pat<(xor
-            (bc_v4i64 (avx512_v8i1sextv8i32)),
-            (bc_v4i64 (add (v8i32 VR256X:$src), (avx512_v8i1sextv8i32)))),
-            (VPABSDZ256rr VR256X:$src)>;
-}
-
-let Predicates = [HasAVX512] in {
-def : Pat<(xor
-          (bc_v8i64 (v16i1sextv16i32)),
-          (bc_v8i64 (add (v16i32 VR512:$src), (v16i1sextv16i32)))),
-          (VPABSDZrr VR512:$src)>;
-def : Pat<(xor
-          (bc_v8i64 (v8i1sextv8i64)),
-          (bc_v8i64 (add (v8i64 VR512:$src), (v8i1sextv8i64)))),
-          (VPABSQZrr VR512:$src)>;
-}
-let Predicates = [HasBWI] in {
-def : Pat<(xor
-          (bc_v8i64 (v64i1sextv64i8)),
-          (bc_v8i64 (add (v64i8 VR512:$src), (v64i1sextv64i8)))),
-          (VPABSBZrr VR512:$src)>;
-def : Pat<(xor
-          (bc_v8i64 (v32i1sextv32i16)),
-          (bc_v8i64 (add (v32i16 VR512:$src), (v32i1sextv32i16)))),
-          (VPABSWZrr VR512:$src)>;
-}
+defm VPABS : avx512_unary_rm_vl_all<0x1C, 0x1D, 0x1E, 0x1F, "vpabs", abs>;
 
 multiclass avx512_ctlz<bits<8> opc, string OpcodeStr, Predicate prd>{
 
@@ -8598,6 +8657,7 @@ defm VMOVSLDUP : avx512_replicate<0x12, "vmovsldup", X86Movsldup>;
 
 multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                                             X86VectorVTInfo _> {
+  let ExeDomain = _.ExeDomain in {
   defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
                    (ins _.RC:$src), OpcodeStr, "$src", "$src",
                    (_.VT (OpNode (_.VT _.RC:$src)))>, EVEX;
@@ -8606,6 +8666,7 @@ multiclass avx512_movddup_128<bits<8> opc, string OpcodeStr, SDNode OpNode,
                  (_.VT (OpNode (_.VT (scalar_to_vector
                                        (_.ScalarLdFrag addr:$src)))))>,
                  EVEX, EVEX_CD8<_.EltSize, CD8VH>;
+  }
 }
 
 multiclass avx512_movddup_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -8882,6 +8943,68 @@ multiclass avx512_psadbw_packed_all<bits<8> opc, SDNode OpNode,
 defm VPSADBW : avx512_psadbw_packed_all<0xf6, X86psadbw, "vpsadbw",
                                        HasBWI>, EVEX_4V;
 
+// Transforms to swizzle an immediate to enable better matching when
+// memory operand isn't in the right place.
+def VPTERNLOG321_imm8 : SDNodeXForm<imm, [{
+  // Convert a VPTERNLOG immediate by swapping operand 0 and operand 2.
+  uint8_t Imm = N->getZExtValue();
+  // Swap bits 1/4 and 3/6.
+  uint8_t NewImm = Imm & 0xa5;
+  if (Imm & 0x02) NewImm |= 0x10;
+  if (Imm & 0x10) NewImm |= 0x02;
+  if (Imm & 0x08) NewImm |= 0x40;
+  if (Imm & 0x40) NewImm |= 0x08;
+  return getI8Imm(NewImm, SDLoc(N));
+}]>;
+def VPTERNLOG213_imm8 : SDNodeXForm<imm, [{
+  // Convert a VPTERNLOG immediate by swapping operand 0 and operand 1.
+  uint8_t Imm = N->getZExtValue();
+  // Swap bits 2/4 and 3/5.
+  uint8_t NewImm = Imm & 0xc3;
+  if (Imm & 0x04) NewImm |= 0x10;
+  if (Imm & 0x10) NewImm |= 0x04;
+  if (Imm & 0x08) NewImm |= 0x20;
+  if (Imm & 0x20) NewImm |= 0x08;
+  return getI8Imm(NewImm, SDLoc(N));
+}]>;
+def VPTERNLOG132_imm8 : SDNodeXForm<imm, [{
+  // Convert a VPTERNLOG immediate by swapping operand 1 and operand 2.
+  uint8_t Imm = N->getZExtValue();
+  // Swap bits 1/2 and 5/6.
+  uint8_t NewImm = Imm & 0x99;
+  if (Imm & 0x02) NewImm |= 0x04;
+  if (Imm & 0x04) NewImm |= 0x02;
+  if (Imm & 0x20) NewImm |= 0x40;
+  if (Imm & 0x40) NewImm |= 0x20;
+  return getI8Imm(NewImm, SDLoc(N));
+}]>;
+def VPTERNLOG231_imm8 : SDNodeXForm<imm, [{
+  // Convert a VPTERNLOG immediate for node operand order (src2, src3, src1).
+  uint8_t Imm = N->getZExtValue();
+  // Move bits 1->4, 2->1, 3->5, 4->2, 5->6, 6->3
+  uint8_t NewImm = Imm & 0x81;
+  if (Imm & 0x02) NewImm |= 0x10;
+  if (Imm & 0x04) NewImm |= 0x02;
+  if (Imm & 0x08) NewImm |= 0x20;
+  if (Imm & 0x10) NewImm |= 0x04;
+  if (Imm & 0x20) NewImm |= 0x40;
+  if (Imm & 0x40) NewImm |= 0x08;
+  return getI8Imm(NewImm, SDLoc(N));
+}]>;
+def VPTERNLOG312_imm8 : SDNodeXForm<imm, [{
+  // Convert a VPTERNLOG immediate for node operand order (src3, src1, src2).
+  uint8_t Imm = N->getZExtValue();
+  // Move bits 1->2, 2->4, 3->6, 4->1, 5->3, 6->5
+  uint8_t NewImm = Imm & 0x81;
+  if (Imm & 0x02) NewImm |= 0x04;
+  if (Imm & 0x04) NewImm |= 0x10;
+  if (Imm & 0x08) NewImm |= 0x40;
+  if (Imm & 0x10) NewImm |= 0x02;
+  if (Imm & 0x20) NewImm |= 0x08;
+  if (Imm & 0x40) NewImm |= 0x20;
+  return getI8Imm(NewImm, SDLoc(N));
+}]>;
+
 multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
                           X86VectorVTInfo _>{
   let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain in {
@@ -8910,6 +9033,141 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
                             (i8 imm:$src4)), 1, 0>, EVEX_B,
                     AVX512AIi8Base, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>;
   }// Constraints = "$src1 = $dst"
+
+  // Additional patterns for matching passthru operand in other positions.
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+                   _.RC:$src1)),
+            (!cast<Instruction>(NAME#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
+             _.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 imm:$src4)),
+                   _.RC:$src1)),
+            (!cast<Instruction>(NAME#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
+             _.RC:$src2, _.RC:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
+
+  // Additional patterns for matching loads in other positions.
+  def : Pat<(_.VT (OpNode (bitconvert (_.LdFrag addr:$src3)),
+                          _.RC:$src2, _.RC:$src1, (i8 imm:$src4))),
+            (!cast<Instruction>(NAME#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
+                                   addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+  def : Pat<(_.VT (OpNode _.RC:$src1,
+                          (bitconvert (_.LdFrag addr:$src3)),
+                          _.RC:$src2, (i8 imm:$src4))),
+            (!cast<Instruction>(NAME#_.ZSuffix#rmi) _.RC:$src1, _.RC:$src2,
+                                   addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+
+  // Additional patterns for matching zero masking with loads in other
+  // positions.
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode (bitconvert (_.LdFrag addr:$src3)),
+                    _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+                   _.ImmAllZerosV)),
+            (!cast<Instruction>(NAME#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
+             _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
+                    _.RC:$src2, (i8 imm:$src4)),
+                   _.ImmAllZerosV)),
+            (!cast<Instruction>(NAME#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
+             _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+
+  // Additional patterns for matching masked loads with different
+  // operand orders.
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
+                    _.RC:$src2, (i8 imm:$src4)),
+                   _.RC:$src1)),
+            (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+             _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode (bitconvert (_.LdFrag addr:$src3)),
+                    _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+                   _.RC:$src1)),
+            (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+             _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode _.RC:$src2, _.RC:$src1,
+                    (bitconvert (_.LdFrag addr:$src3)), (i8 imm:$src4)),
+                   _.RC:$src1)),
+            (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+             _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode _.RC:$src2, (bitconvert (_.LdFrag addr:$src3)),
+                    _.RC:$src1, (i8 imm:$src4)),
+                   _.RC:$src1)),
+            (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+             _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode (bitconvert (_.LdFrag addr:$src3)),
+                    _.RC:$src1, _.RC:$src2, (i8 imm:$src4)),
+                   _.RC:$src1)),
+            (!cast<Instruction>(NAME#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
+             _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>;
+
+  // Additional patterns for matching broadcasts in other positions.
+  def : Pat<(_.VT (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+                          _.RC:$src2, _.RC:$src1, (i8 imm:$src4))),
+            (!cast<Instruction>(NAME#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
+                                   addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+  def : Pat<(_.VT (OpNode _.RC:$src1,
+                          (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+                          _.RC:$src2, (i8 imm:$src4))),
+            (!cast<Instruction>(NAME#_.ZSuffix#rmbi) _.RC:$src1, _.RC:$src2,
+                                   addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+
+  // Additional patterns for matching zero masking with broadcasts in other
+  // positions.
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+                    _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+                   _.ImmAllZerosV)),
+            (!cast<Instruction>(NAME#_.ZSuffix#rmbikz) _.RC:$src1,
+             _.KRCWM:$mask, _.RC:$src2, addr:$src3,
+             (VPTERNLOG321_imm8 imm:$src4))>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode _.RC:$src1,
+                    (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+                    _.RC:$src2, (i8 imm:$src4)),
+                   _.ImmAllZerosV)),
+            (!cast<Instruction>(NAME#_.ZSuffix#rmbikz) _.RC:$src1,
+             _.KRCWM:$mask, _.RC:$src2, addr:$src3,
+             (VPTERNLOG132_imm8 imm:$src4))>;
+
+  // Additional patterns for matching masked broadcasts with different
+  // operand orders.
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode _.RC:$src1,
+                    (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+                    _.RC:$src2, (i8 imm:$src4)),
+                   _.RC:$src1)),
+            (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+             _.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 imm:$src4))>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+                    _.RC:$src2, _.RC:$src1, (i8 imm:$src4)),
+                   _.RC:$src1)),
+            (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+             _.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 imm:$src4))>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode _.RC:$src2, _.RC:$src1,
+                    (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+                    (i8 imm:$src4)), _.RC:$src1)),
+            (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+             _.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 imm:$src4))>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode _.RC:$src2,
+                    (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+                    _.RC:$src1, (i8 imm:$src4)),
+                   _.RC:$src1)),
+            (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+             _.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 imm:$src4))>;
+  def : Pat<(_.VT (vselect _.KRCWM:$mask,
+                   (OpNode (X86VBroadcast (_.ScalarLdFrag addr:$src3)),
+                    _.RC:$src1, _.RC:$src2, (i8 imm:$src4)),
+                   _.RC:$src1)),
+            (!cast<Instruction>(NAME#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
+             _.RC:$src2, addr:$src3, (VPTERNLOG312_imm8 imm:$src4))>;
 }
 
 multiclass avx512_common_ternlog<string OpcodeStr, AVX512VLVectorVTInfo _>{
diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td
index 3c27eb8077d0f41413bb9c470c46b1e999ddc441..e592c2b3c0aa581c1291504d8918f33f5e746ac7 100644
--- a/lib/Target/X86/X86InstrCompiler.td
+++ b/lib/Target/X86/X86InstrCompiler.td
@@ -259,20 +259,20 @@ def MORESTACK_RET_RESTORE_R10 : I<0, Pseudo, (outs), (ins),
 // Alias instruction mapping movr0 to xor.
 // FIXME: remove when we can teach regalloc that xor reg, reg is ok.
 let Defs = [EFLAGS], isReMaterializable = 1, isAsCheapAsAMove = 1,
-    isPseudo = 1, AddedComplexity = 20 in
+    isPseudo = 1, AddedComplexity = 10 in
 def MOV32r0  : I<0, Pseudo, (outs GR32:$dst), (ins), "",
                  [(set GR32:$dst, 0)], IIC_ALU_NONMEM>, Sched<[WriteZero]>;
 
 // Other widths can also make use of the 32-bit xor, which may have a smaller
 // encoding and avoid partial register updates.
+let AddedComplexity = 10 in {
 def : Pat<(i8 0), (EXTRACT_SUBREG (MOV32r0), sub_8bit)>;
 def : Pat<(i16 0), (EXTRACT_SUBREG (MOV32r0), sub_16bit)>;
-def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)> {
-  let AddedComplexity = 20;
+def : Pat<(i64 0), (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit)>;
 }
 
 let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
-    AddedComplexity = 15 in {
+    AddedComplexity = 10 in {
   // Pseudo instructions for materializing 1 and -1 using XOR+INC/DEC,
   // which only require 3 bytes compared to MOV32ri which requires 5.
   let Defs = [EFLAGS], isReMaterializable = 1, isPseudo = 1 in {
@@ -287,7 +287,7 @@ let Predicates = [OptForSize, NotSlowIncDec, Not64BitMode],
   def : Pat<(i16 -1), (EXTRACT_SUBREG (MOV32r_1), sub_16bit)>;
 }
 
-let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 10 in {
+let isReMaterializable = 1, isPseudo = 1, AddedComplexity = 5 in {
 // AddedComplexity higher than MOV64ri but lower than MOV32r0 and MOV32r1.
 // FIXME: Add itinerary class and Schedule.
 def MOV32ImmSExti8 : I<0, Pseudo, (outs GR32:$dst), (ins i32i8imm:$src), "",
@@ -772,11 +772,11 @@ defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b",
 // the pseudo. The argument feeding EBX is ebx_input.
 //
 // The additional argument, $ebx_save, is a temporary register used to
-// save the value of RBX accross the actual instruction.
+// save the value of RBX across the actual instruction.
 //
 // To make sure the register assigned to $ebx_save does not interfere with
 // the definition of the actual instruction, we use a definition $dst which
-// is tied to $rbx_save. That way, the live-range of $rbx_save spans accross
+// is tied to $rbx_save. That way, the live-range of $rbx_save spans across
 // the instruction and we are sure we will have a valid register to restore
 // the value of RBX.
 let Defs = [EAX, EDX, EBX, EFLAGS], Uses = [EAX, ECX, EDX],
@@ -1743,6 +1743,12 @@ def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
 def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
 def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
 
+// sub reg, relocImm
+def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt8_su:$src2),
+          (SUB64ri8 GR64:$src1, i64relocImmSExt8_su:$src2)>;
+def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt32_su:$src2),
+          (SUB64ri32 GR64:$src1, i64relocImmSExt32_su:$src2)>;
+
 // mul reg, reg
 def : Pat<(mul GR16:$src1, GR16:$src2),
           (IMUL16rr GR16:$src1, GR16:$src2)>;
diff --git a/lib/Target/X86/X86InstrFMA.td b/lib/Target/X86/X86InstrFMA.td
index 4b19f801dae1d0d305b24a7c9828d792822d4bbc..1941ae57f0f1f5980eff828ca29ff0183fe8b7ca 100644
--- a/lib/Target/X86/X86InstrFMA.td
+++ b/lib/Target/X86/X86InstrFMA.td
@@ -191,13 +191,15 @@ multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
 multiclass fma3s_forms<bits<8> opc132, bits<8> opc213, bits<8> opc231,
                        string OpStr, string PackTy, string Suff,
                        SDNode OpNode, RegisterClass RC,
-                       X86MemOperand x86memop> {
-  defm NAME#132#Suff : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
-                                x86memop, RC>;
-  defm NAME#213#Suff : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
-                                x86memop, RC, OpNode>;
-  defm NAME#231#Suff : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
-                                x86memop, RC>;
+                       X86MemOperand x86memop> { 
+  let Predicates = [HasFMA, NoAVX512] in {
+    defm NAME#132#Suff : fma3s_rm<opc132, !strconcat(OpStr, "132", PackTy),
+                                  x86memop, RC>;
+    defm NAME#213#Suff : fma3s_rm<opc213, !strconcat(OpStr, "213", PackTy),
+                                  x86memop, RC, OpNode>;
+    defm NAME#231#Suff : fma3s_rm<opc231, !strconcat(OpStr, "231", PackTy),
+                                  x86memop, RC>;
+  }
 }
 
 // The FMA 213 form is created for lowering of scalar FMA intrinscis
diff --git a/lib/Target/X86/X86InstrFMA3Info.cpp b/lib/Target/X86/X86InstrFMA3Info.cpp
index db83497ee69dfd587b1977872b26c3c2c13e62d8..00ef65cdb6bd77a4b07f3cd50b3848ccce6b86dd 100644
--- a/lib/Target/X86/X86InstrFMA3Info.cpp
+++ b/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -16,11 +16,14 @@
 #include "X86InstrInfo.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/Threading.h"
+#include <cassert>
+#include <cstdint>
+
 using namespace llvm;
 
 /// This flag is used in the method llvm::call_once() used below to make the
 /// initialization of the map 'OpcodeToGroup' thread safe.
-LLVM_DEFINE_ONCE_FLAG(InitGroupsOnceFlag);
+static llvm::once_flag InitGroupsOnceFlag;
 
 static ManagedStatic<X86InstrFMA3Info> X86InstrFMA3InfoObj;
 X86InstrFMA3Info *X86InstrFMA3Info::getX86InstrFMA3Info() {
diff --git a/lib/Target/X86/X86InstrFMA3Info.h b/lib/Target/X86/X86InstrFMA3Info.h
index 025cee3b2b909e71905215dd47fccf44637358e0..e3568160da46973674abbb5f211fa541fcbe8de8 100644
--- a/lib/Target/X86/X86InstrFMA3Info.h
+++ b/lib/Target/X86/X86InstrFMA3Info.h
@@ -1,4 +1,4 @@
-//===-- X86InstrFMA3Info.h - X86 FMA3 Instruction Information -------------===//
+//===- X86InstrFMA3Info.h - X86 FMA3 Instruction Information ----*- C++ -*-===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -18,9 +18,11 @@
 #include "X86.h"
 #include "llvm/ADT/DenseMap.h"
 #include <cassert>
+#include <cstdint>
 #include <set>
 
 namespace llvm {
+
 /// This class is used to group {132, 213, 231} forms of FMA opcodes together.
 /// Each of the groups has either 3 register opcodes, 3 memory opcodes,
 /// or 6 register and memory opcodes. Also, each group has an attrubutes field
@@ -201,7 +203,7 @@ public:
   static X86InstrFMA3Info *getX86InstrFMA3Info();
 
   /// Constructor. Just creates an object of the class.
-  X86InstrFMA3Info() {}
+  X86InstrFMA3Info() = default;
 
   /// Destructor. Deallocates the memory used for FMA3 Groups.
   ~X86InstrFMA3Info() {
@@ -310,6 +312,7 @@ public:
     return rm_iterator(getX86InstrFMA3Info()->OpcodeToGroup.end());
   }
 };
-} // namespace llvm
 
-#endif
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
diff --git a/lib/Target/X86/X86InstrFPStack.td b/lib/Target/X86/X86InstrFPStack.td
index c005f5b3798e456f42e0f40efe7cdeb62b9fdd19..11b1d070ef2f995456be066d6987f8791ac3e55f 100644
--- a/lib/Target/X86/X86InstrFPStack.td
+++ b/lib/Target/X86/X86InstrFPStack.td
@@ -668,15 +668,16 @@ def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", [], IIC_FCOMPP>;
 
 let Predicates = [HasFXSR] in {
   def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
-                 "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB;
+               "fxsave\t$dst", [(int_x86_fxsave addr:$dst)], IIC_FXSAVE>, TB;
   def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaque512mem:$dst),
-                    "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)],
-                    IIC_FXSAVE>, TB, Requires<[In64BitMode]>;
+                 "fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)],
+                 IIC_FXSAVE>, TB, Requires<[In64BitMode]>;
   def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
-                "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, TB;
+                "fxrstor\t$src", [(int_x86_fxrstor addr:$src)], IIC_FXRSTOR>, 
+                TB;
   def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaque512mem:$src),
-                     "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)],
-                     IIC_FXRSTOR>, TB, Requires<[In64BitMode]>;
+                  "fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)],
+                  IIC_FXRSTOR>, TB, Requires<[In64BitMode]>;
 } // Predicates = [FeatureFXSR]
 } // SchedRW
 
diff --git a/lib/Target/X86/X86InstrFormats.td b/lib/Target/X86/X86InstrFormats.td
index 610756aa37da7f7976b86498b96997c621d5e779..c2fe786732dcdd5fd7f57e47600c5b513f9eb270 100644
--- a/lib/Target/X86/X86InstrFormats.td
+++ b/lib/Target/X86/X86InstrFormats.td
@@ -199,7 +199,8 @@ class TAPS : TA { Prefix OpPrefix = PS; }
 class TAPD : TA { Prefix OpPrefix = PD; }
 class TAXD : TA { Prefix OpPrefix = XD; }
 class VEX    { Encoding OpEnc = EncVEX; }
-class VEX_W  { bit hasVEX_WPrefix = 1; }
+class VEX_W    { bits<2> VEX_WPrefix = 1; }
+class VEX_WIG  { bits<2> VEX_WPrefix = 2; }
 class VEX_4V : VEX { bit hasVEX_4V = 1; }
 class VEX_L  { bit hasVEX_L = 1; }
 class VEX_LIG { bit ignoresVEX_L = 1; }
@@ -270,7 +271,7 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
   bit hasREPPrefix = 0;     // Does this inst have a REP prefix?
   Encoding OpEnc = EncNormal; // Encoding used by this instruction
   bits<2> OpEncBits = OpEnc.Value;
-  bit hasVEX_WPrefix = 0;   // Does this inst set the VEX_W field?
+  bits<2> VEX_WPrefix = 0;  // Does this inst set the VEX_W field?
   bit hasVEX_4V = 0;        // Does this inst require the VEX.VVVV field?
   bit hasVEX_L = 0;         // Does this inst use large (256-bit) registers?
   bit ignoresVEX_L = 0;     // Does this instruction ignore the L-bit
@@ -317,7 +318,8 @@ class X86Inst<bits<8> opcod, Format f, ImmType i, dag outs, dag ins,
   let TSFlags{28-27} = ExeDomain.Value;
   let TSFlags{30-29} = OpEncBits;
   let TSFlags{38-31} = Opcode;
-  let TSFlags{39}    = hasVEX_WPrefix;
+  // Currently no need for second bit in TSFlags - W Ignore is equivalent to 0.
+  let TSFlags{39}    = VEX_WPrefix{0};
   let TSFlags{40}    = hasVEX_4V;
   let TSFlags{41}    = hasVEX_L;
   let TSFlags{42}    = hasEVEX_K;
@@ -453,7 +455,7 @@ class SI_Int<bits<8> o, Format F, dag outs, dag ins, string asm,
          Domain d = GenericDomain>
       : I<o, F, outs, ins, asm, pattern, itin, d> {
   let Predicates = !if(!eq(OpEnc.Value, EncEVEX.Value), [HasAVX512],
-                   !if(!eq(OpEnc.Value, EncVEX.Value), [HasAVX],
+                   !if(!eq(OpEnc.Value, EncVEX.Value), [UseAVX],
                    !if(!eq(OpPrefix.Value, XS.Value), [UseSSE1],
                    !if(!eq(OpPrefix.Value, XD.Value), [UseSSE2],
                    !if(!eq(OpPrefix.Value, PD.Value), [UseSSE2],
diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td
index 196ba39e8acf3c1c525df3321aab0988630971de..9867ba84bb9ba5d1cf117c25ef31f6e9942330e8 100644
--- a/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -27,21 +27,19 @@ def MMX_X86movw2d : SDNode<"X86ISD::MMX_MOVW2D", SDTypeProfile<1, 1,
 //===----------------------------------------------------------------------===//
 
 def load_mmx : PatFrag<(ops node:$ptr), (x86mmx (load node:$ptr))>;
-def load_mvmmx : PatFrag<(ops node:$ptr),
-                         (x86mmx (MMX_X86movw2d (load node:$ptr)))>;
 
 //===----------------------------------------------------------------------===//
 // SSE specific DAG Nodes.
 //===----------------------------------------------------------------------===//
 
-def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>,
-                                       SDTCisFP<1>, SDTCisVT<3, i8>,
-                                       SDTCisVec<1>]>;
-def SDTX86CmpTestSae : SDTypeProfile<1, 3, [SDTCisVT<0, i32>, 
-                                     SDTCisSameAs<1, 2>, SDTCisInt<3>]>;
+def SDTX86VFCMP : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>,
+                                       SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
+                                       SDTCisVT<3, i8>]>;
 
 def X86fmin    : SDNode<"X86ISD::FMIN",      SDTFPBinOp>;
 def X86fmax    : SDNode<"X86ISD::FMAX",      SDTFPBinOp>;
+def X86fmins   : SDNode<"X86ISD::FMINS",     SDTFPBinOp>;
+def X86fmaxs   : SDNode<"X86ISD::FMAXS",     SDTFPBinOp>;
 
 // Commutative and Associative FMIN and FMAX.
 def X86fminc    : SDNode<"X86ISD::FMINC", SDTFPBinOp,
@@ -239,10 +237,11 @@ def X86vpermil2 : SDNode<"X86ISD::VPERMIL2",
                                              SDTCisSameAs<0,2>,
                                              SDTCisSameSizeAs<0,3>,
                                              SDTCisSameNumEltsAs<0, 3>,
+                                             SDTCisFP<0>, SDTCisInt<3>,
                                              SDTCisVT<4, i8>]>>;
 def X86vpperm : SDNode<"X86ISD::VPPERM",
                         SDTypeProfile<1, 3, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>,
-                                             SDTCisSameAs<0,2>]>>;
+                                             SDTCisSameAs<0,2>, SDTCisSameAs<0, 3>]>>;
 
 def SDTX86CmpPTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>,
                                           SDTCisVec<1>,
@@ -309,13 +308,17 @@ def SDTShuff2Op : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
 
 def SDTShuff2OpM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                         SDTCisSameSizeAs<0,2>,
-                                        SDTCisSameNumEltsAs<0,2>]>;
+                                        SDTCisSameNumEltsAs<0,2>,
+                                        SDTCisFP<0>, SDTCisInt<2>]>;
 def SDTShuff2OpI : SDTypeProfile<1, 2, [SDTCisVec<0>,
                                  SDTCisSameAs<0,1>, SDTCisVT<2, i8>]>;
 def SDTShuff3OpI : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                                  SDTCisSameAs<0,2>, SDTCisVT<3, i8>]>;
-def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
-                             SDTCisSameAs<0,2>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>;
+def SDTFPBinOpImmRound: SDTypeProfile<1, 4, [SDTCisFP<0>, SDTCisVec<0>,
+                                             SDTCisSameAs<0,1>,
+                                             SDTCisSameAs<0,2>,
+                                             SDTCisVT<3, i32>,
+                                             SDTCisVT<4, i32>]>;
 def SDTFPTernaryOpImmRound: SDTypeProfile<1, 5, [SDTCisFP<0>, SDTCisSameAs<0,1>,
                                                  SDTCisSameAs<0,2>,
                                                  SDTCisInt<3>,
@@ -323,8 +326,10 @@ def SDTFPTernaryOpImmRound: SDTypeProfile<1, 5, [SDTCisFP<0>, SDTCisSameAs<0,1>,
                                                  SDTCisSameNumEltsAs<0, 3>,
                                                  SDTCisVT<4, i32>,
                                                  SDTCisVT<5, i32>]>;
-def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
-                              SDTCisVT<2, i32>, SDTCisVT<3, i32>]>;
+def SDTFPUnaryOpImmRound: SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisVec<0>,
+                                               SDTCisSameAs<0,1>,
+                                               SDTCisVT<2, i32>,
+                                               SDTCisVT<3, i32>]>;
 
 def SDTVBroadcast  : SDTypeProfile<1, 1, [SDTCisVec<0>]>;
 def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>,
@@ -333,9 +338,9 @@ def SDTVBroadcastm : SDTypeProfile<1, 1, [SDTCisVec<0>,
 def SDTBlend : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
                              SDTCisSameAs<1,2>, SDTCisVT<3, i8>]>;
 
-def SDTTernlog  : SDTypeProfile<1, 4, [SDTCisVec<0>, SDTCisSameAs<0,1>,
-                                SDTCisSameAs<0,2>, SDTCisSameAs<0,3>,
-                                SDTCisVT<4, i8>]>;
+def SDTTernlog  : SDTypeProfile<1, 4, [SDTCisInt<0>, SDTCisVec<0>,
+                                       SDTCisSameAs<0,1>, SDTCisSameAs<0,2>,
+                                       SDTCisSameAs<0,3>, SDTCisVT<4, i8>]>;
 
 def SDTFPBinOpRound : SDTypeProfile<1, 3, [      // fadd_round, fmul_round, etc.
   SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisFP<0>, SDTCisVT<3, i32>]>;
@@ -343,16 +348,13 @@ def SDTFPBinOpRound : SDTypeProfile<1, 3, [      // fadd_round, fmul_round, etc.
 def SDTFPUnaryOpRound : SDTypeProfile<1, 2, [      // fsqrt_round, fgetexp_round, etc.
   SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisVT<2, i32>]>;
 
-def SDTFma : SDTypeProfile<1, 3, [SDTCisSameAs<0,1>,
-                           SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
 def SDTFmaRound : SDTypeProfile<1, 4, [SDTCisSameAs<0,1>,
                            SDTCisSameAs<1,2>, SDTCisSameAs<1,3>,
-                           SDTCisVT<4, i32>]>;
+                           SDTCisFP<0>, SDTCisVT<4, i32>]>;
 
 def X86PAlignr : SDNode<"X86ISD::PALIGNR", SDTShuff3OpI>;
 def X86VAlign  : SDNode<"X86ISD::VALIGN", SDTShuff3OpI>;
 
-def X86Abs      : SDNode<"X86ISD::ABS", SDTIntUnaryOp>;
 def X86Conflict : SDNode<"X86ISD::CONFLICT", SDTIntUnaryOp>;
 
 def X86PShufd  : SDNode<"X86ISD::PSHUFD", SDTShuff2OpI>;
@@ -376,17 +378,28 @@ def X86Movhlps : SDNode<"X86ISD::MOVHLPS", SDTShuff2Op>;
 def X86Movlps : SDNode<"X86ISD::MOVLPS", SDTShuff2Op>;
 def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>;
 
-def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+def SDTPack : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisInt<0>,
+                                   SDTCisVec<1>, SDTCisInt<1>,
                                    SDTCisSameSizeAs<0,1>,
-                                   SDTCisSameAs<1,2>]>;
+                                   SDTCisSameAs<1,2>,
+                                   SDTCisOpSmallerThanOp<0, 1>]>;
 def X86Packss : SDNode<"X86ISD::PACKSS", SDTPack>;
 def X86Packus : SDNode<"X86ISD::PACKUS", SDTPack>;
 
 def X86Unpckl : SDNode<"X86ISD::UNPCKL", SDTShuff2Op>;
 def X86Unpckh : SDNode<"X86ISD::UNPCKH", SDTShuff2Op>;
 
-def X86vpmaddubsw  : SDNode<"X86ISD::VPMADDUBSW" , SDTPack>;
-def X86vpmaddwd    : SDNode<"X86ISD::VPMADDWD"   , SDTPack, [SDNPCommutative]>;
+def X86vpmaddubsw  : SDNode<"X86ISD::VPMADDUBSW",
+                            SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
+                                                 SDTCVecEltisVT<1, i8>,
+                                                 SDTCisSameSizeAs<0,1>,
+                                                 SDTCisSameAs<1,2>]>>;
+def X86vpmaddwd    : SDNode<"X86ISD::VPMADDWD",
+                            SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i32>,
+                                                 SDTCVecEltisVT<1, i16>,
+                                                 SDTCisSameSizeAs<0,1>,
+                                                 SDTCisSameAs<1,2>]>,
+                            [SDNPCommutative]>;
 
 def X86VPermilpv  : SDNode<"X86ISD::VPERMILPV", SDTShuff2OpM>;
 def X86VPermilpi  : SDNode<"X86ISD::VPERMILPI", SDTShuff2OpI>;
@@ -423,8 +436,8 @@ def X86VReduce     : SDNode<"X86ISD::VREDUCE",   SDTFPUnaryOpImmRound>;
 def X86VRndScale   : SDNode<"X86ISD::VRNDSCALE", SDTFPUnaryOpImmRound>;
 def X86VGetMant    : SDNode<"X86ISD::VGETMANT",  SDTFPUnaryOpImmRound>;
 def X86Vfpclass    : SDNode<"X86ISD::VFPCLASS",
-                       SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCVecEltisVT<0, i1>,
-                                            SDTCisVec<1>, SDTCisFP<1>,
+                       SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i1>,
+                                            SDTCisFP<1>,
                                             SDTCisSameNumEltsAs<0,1>,
                                             SDTCisVT<2, i32>]>, []>;
 def X86Vfpclasss   : SDNode<"X86ISD::VFPCLASSS",
@@ -437,9 +450,6 @@ def X86SubVBroadcast : SDNode<"X86ISD::SUBV_BROADCAST",
 
 def X86VBroadcast : SDNode<"X86ISD::VBROADCAST", SDTVBroadcast>;
 def X86VBroadcastm : SDNode<"X86ISD::VBROADCASTM", SDTVBroadcastm>;
-def X86Vinsert   : SDNode<"X86ISD::VINSERT",  SDTypeProfile<1, 3,
-                              [SDTCisSameAs<0, 1>, SDTCisEltOfVec<2, 1>,
-                               SDTCisPtrTy<3>]>, []>;
 def X86Vextract   : SDNode<"X86ISD::VEXTRACT",  SDTypeProfile<1, 2,
                               [SDTCisEltOfVec<0, 1>, SDTCisVec<1>,
                                SDTCisPtrTy<2>]>, []>;
@@ -449,24 +459,30 @@ def X86Blendi    : SDNode<"X86ISD::BLENDI",   SDTBlend>;
 def X86Addsub    : SDNode<"X86ISD::ADDSUB", SDTFPBinOp>;
 
 def X86faddRnd   : SDNode<"X86ISD::FADD_RND",  SDTFPBinOpRound>;
+def X86faddRnds  : SDNode<"X86ISD::FADDS_RND", SDTFPBinOpRound>;
 def X86fsubRnd   : SDNode<"X86ISD::FSUB_RND",  SDTFPBinOpRound>;
+def X86fsubRnds  : SDNode<"X86ISD::FSUBS_RND", SDTFPBinOpRound>;
 def X86fmulRnd   : SDNode<"X86ISD::FMUL_RND",  SDTFPBinOpRound>;
+def X86fmulRnds  : SDNode<"X86ISD::FMULS_RND", SDTFPBinOpRound>;
 def X86fdivRnd   : SDNode<"X86ISD::FDIV_RND",  SDTFPBinOpRound>;
-def X86fmaxRnd   : SDNode<"X86ISD::FMAX_RND",       SDTFPBinOpRound>;
+def X86fdivRnds  : SDNode<"X86ISD::FDIVS_RND", SDTFPBinOpRound>;
+def X86fmaxRnd   : SDNode<"X86ISD::FMAX_RND",  SDTFPBinOpRound>;
+def X86fmaxRnds  : SDNode<"X86ISD::FMAXS_RND", SDTFPBinOpRound>;
+def X86fminRnd   : SDNode<"X86ISD::FMIN_RND",  SDTFPBinOpRound>;
+def X86fminRnds  : SDNode<"X86ISD::FMINS_RND", SDTFPBinOpRound>;
 def X86scalef    : SDNode<"X86ISD::SCALEF",         SDTFPBinOpRound>;
 def X86scalefs   : SDNode<"X86ISD::SCALEFS",        SDTFPBinOpRound>;
-def X86fminRnd   : SDNode<"X86ISD::FMIN_RND",       SDTFPBinOpRound>;
 def X86fsqrtRnd     : SDNode<"X86ISD::FSQRT_RND",   SDTFPUnaryOpRound>;
 def X86fsqrtRnds    : SDNode<"X86ISD::FSQRTS_RND", SDTFPBinOpRound>;
 def X86fgetexpRnd   : SDNode<"X86ISD::FGETEXP_RND", SDTFPUnaryOpRound>;
 def X86fgetexpRnds  : SDNode<"X86ISD::FGETEXPS_RND", SDTFPBinOpRound>;
 
-def X86Fmadd     : SDNode<"X86ISD::FMADD",     SDTFma>;
-def X86Fnmadd    : SDNode<"X86ISD::FNMADD",    SDTFma>;
-def X86Fmsub     : SDNode<"X86ISD::FMSUB",     SDTFma>;
-def X86Fnmsub    : SDNode<"X86ISD::FNMSUB",    SDTFma>;
-def X86Fmaddsub  : SDNode<"X86ISD::FMADDSUB",  SDTFma>;
-def X86Fmsubadd  : SDNode<"X86ISD::FMSUBADD",  SDTFma>;
+def X86Fmadd     : SDNode<"X86ISD::FMADD",     SDTFPTernaryOp>;
+def X86Fnmadd    : SDNode<"X86ISD::FNMADD",    SDTFPTernaryOp>;
+def X86Fmsub     : SDNode<"X86ISD::FMSUB",     SDTFPTernaryOp>;
+def X86Fnmsub    : SDNode<"X86ISD::FNMSUB",    SDTFPTernaryOp>;
+def X86Fmaddsub  : SDNode<"X86ISD::FMADDSUB",  SDTFPTernaryOp>;
+def X86Fmsubadd  : SDNode<"X86ISD::FMSUBADD",  SDTFPTernaryOp>;
 
 def X86FmaddRnd     : SDNode<"X86ISD::FMADD_RND",     SDTFmaRound>;
 def X86FnmaddRnd    : SDNode<"X86ISD::FNMADD_RND",    SDTFmaRound>;
@@ -487,8 +503,10 @@ def X86FnmaddRnds3  : SDNode<"X86ISD::FNMADDS3_RND",    SDTFmaRound>;
 def X86FmsubRnds3   : SDNode<"X86ISD::FMSUBS3_RND",     SDTFmaRound>;
 def X86FnmsubRnds3  : SDNode<"X86ISD::FNMSUBS3_RND",    SDTFmaRound>;
 
-def x86vpmadd52l     : SDNode<"X86ISD::VPMADD52L",     SDTFma>;
-def x86vpmadd52h     : SDNode<"X86ISD::VPMADD52H",     SDTFma>;
+def SDTIFma : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<0,1>,
+                           SDTCisSameAs<1,2>, SDTCisSameAs<1,3>]>;
+def x86vpmadd52l     : SDNode<"X86ISD::VPMADD52L",     SDTIFma>;
+def x86vpmadd52h     : SDNode<"X86ISD::VPMADD52H",     SDTIFma>;
 
 def X86rsqrt28   : SDNode<"X86ISD::RSQRT28",  SDTFPUnaryOpRound>;
 def X86rcp28     : SDNode<"X86ISD::RCP28",    SDTFPUnaryOpRound>;
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 856fbf7803700ce4525be8621650de5e7b155e78..722fb12fadd53a042893ff38f65b438d1786f445 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -414,17 +414,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VEXTRACTI64x2Zrr,X86::VEXTRACTI64x2Zmr, TB_FOLDED_STORE },
     { X86::VEXTRACTI64x4Zrr,X86::VEXTRACTI64x4Zmr, TB_FOLDED_STORE },
     { X86::VEXTRACTPSZrr,   X86::VEXTRACTPSZmr,    TB_FOLDED_STORE },
-    { X86::VMOVPDI2DIZrr,   X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
     { X86::VMOVAPDZrr,      X86::VMOVAPDZmr,    TB_FOLDED_STORE | TB_ALIGN_64 },
     { X86::VMOVAPSZrr,      X86::VMOVAPSZmr,    TB_FOLDED_STORE | TB_ALIGN_64 },
     { X86::VMOVDQA32Zrr,    X86::VMOVDQA32Zmr,  TB_FOLDED_STORE | TB_ALIGN_64 },
     { X86::VMOVDQA64Zrr,    X86::VMOVDQA64Zmr,  TB_FOLDED_STORE | TB_ALIGN_64 },
-    { X86::VMOVUPDZrr,      X86::VMOVUPDZmr,    TB_FOLDED_STORE },
-    { X86::VMOVUPSZrr,      X86::VMOVUPSZmr,    TB_FOLDED_STORE },
     { X86::VMOVDQU8Zrr,     X86::VMOVDQU8Zmr,   TB_FOLDED_STORE },
     { X86::VMOVDQU16Zrr,    X86::VMOVDQU16Zmr,  TB_FOLDED_STORE },
     { X86::VMOVDQU32Zrr,    X86::VMOVDQU32Zmr,  TB_FOLDED_STORE },
     { X86::VMOVDQU64Zrr,    X86::VMOVDQU64Zmr,  TB_FOLDED_STORE },
+    { X86::VMOVPDI2DIZrr,   X86::VMOVPDI2DIZmr, TB_FOLDED_STORE },
+    { X86::VMOVPQIto64Zrr,  X86::VMOVPQI2QIZmr, TB_FOLDED_STORE },
+    { X86::VMOVSDto64Zrr,   X86::VMOVSDto64Zmr, TB_FOLDED_STORE },
+    { X86::VMOVSS2DIZrr,    X86::VMOVSS2DIZmr,  TB_FOLDED_STORE },
+    { X86::VMOVUPDZrr,      X86::VMOVUPDZmr,    TB_FOLDED_STORE },
+    { X86::VMOVUPSZrr,      X86::VMOVUPSZmr,    TB_FOLDED_STORE },
+    { X86::VPEXTRDZrr,      X86::VPEXTRDZmr,    TB_FOLDED_STORE },
+    { X86::VPEXTRQZrr,      X86::VPEXTRQZmr,    TB_FOLDED_STORE },
     { X86::VPMOVDBZrr,      X86::VPMOVDBZmr,    TB_FOLDED_STORE },
     { X86::VPMOVDWZrr,      X86::VPMOVDWZmr,    TB_FOLDED_STORE },
     { X86::VPMOVQDZrr,      X86::VPMOVQDZmr,    TB_FOLDED_STORE },
@@ -869,7 +874,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VBROADCASTSSZr,   X86::VBROADCASTSSZm,     TB_NO_REVERSE },
     { X86::VBROADCASTSDZr,   X86::VBROADCASTSDZm,     TB_NO_REVERSE },
     { X86::VMOV64toPQIZrr,   X86::VMOVQI2PQIZrm,      0 },
-    { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm,      TB_NO_REVERSE },
+    { X86::VMOV64toSDZrr,    X86::VMOV64toSDZrm,      0 },
+    { X86::VMOVDI2PDIZrr,    X86::VMOVDI2PDIZrm,      0 },
     { X86::VMOVDI2SSZrr,     X86::VMOVDI2SSZrm,       0 },
     { X86::VMOVAPDZrr,       X86::VMOVAPDZrm,         TB_ALIGN_64 },
     { X86::VMOVAPSZrr,       X86::VMOVAPSZrm,         TB_ALIGN_64 },
@@ -881,8 +887,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMOVDQU64Zrr,     X86::VMOVDQU64Zrm,       0 },
     { X86::VMOVUPDZrr,       X86::VMOVUPDZrm,         0 },
     { X86::VMOVUPSZrr,       X86::VMOVUPSZrm,         0 },
+    { X86::VMOVZPQILo2PQIZrr,X86::VMOVQI2PQIZrm,      TB_NO_REVERSE },
+    { X86::VPABSBZrr,        X86::VPABSBZrm,          0 },
     { X86::VPABSDZrr,        X86::VPABSDZrm,          0 },
     { X86::VPABSQZrr,        X86::VPABSQZrm,          0 },
+    { X86::VPABSWZrr,        X86::VPABSWZrm,          0 },
     { X86::VPERMILPDZri,     X86::VPERMILPDZmi,       0 },
     { X86::VPERMILPSZri,     X86::VPERMILPSZmi,       0 },
     { X86::VPERMPDZri,       X86::VPERMPDZmi,         0 },
@@ -902,6 +911,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPSHUFDZri,       X86::VPSHUFDZmi,         0 },
     { X86::VPSHUFHWZri,      X86::VPSHUFHWZmi,        0 },
     { X86::VPSHUFLWZri,      X86::VPSHUFLWZmi,        0 },
+    { X86::VPSLLDQZ512rr,    X86::VPSLLDQZ512rm,      0 },
+    { X86::VPSLLDZri,        X86::VPSLLDZmi,          0 },
+    { X86::VPSLLQZri,        X86::VPSLLQZmi,          0 },
+    { X86::VPSLLWZri,        X86::VPSLLWZmi,          0 },
+    { X86::VPSRADZri,        X86::VPSRADZmi,          0 },
+    { X86::VPSRAQZri,        X86::VPSRAQZmi,          0 },
+    { X86::VPSRAWZri,        X86::VPSRAWZmi,          0 },
+    { X86::VPSRLDQZ512rr,    X86::VPSRLDQZ512rm,      0 },
+    { X86::VPSRLDZri,        X86::VPSRLDZmi,          0 },
+    { X86::VPSRLQZri,        X86::VPSRLQZmi,          0 },
+    { X86::VPSRLWZri,        X86::VPSRLWZmi,          0 },
 
     // AVX-512 foldable instructions (256-bit versions)
     { X86::VBROADCASTSSZ256r,    X86::VBROADCASTSSZ256m,    TB_NO_REVERSE },
@@ -916,6 +936,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMOVDQU64Z256rr,      X86::VMOVDQU64Z256rm,      0 },
     { X86::VMOVUPDZ256rr,        X86::VMOVUPDZ256rm,        0 },
     { X86::VMOVUPSZ256rr,        X86::VMOVUPSZ256rm,        0 },
+    { X86::VPABSBZ256rr,         X86::VPABSBZ256rm,         0 },
+    { X86::VPABSDZ256rr,         X86::VPABSDZ256rm,         0 },
+    { X86::VPABSQZ256rr,         X86::VPABSQZ256rm,         0 },
+    { X86::VPABSWZ256rr,         X86::VPABSWZ256rm,         0 },
     { X86::VPERMILPDZ256ri,      X86::VPERMILPDZ256mi,      0 },
     { X86::VPERMILPSZ256ri,      X86::VPERMILPSZ256mi,      0 },
     { X86::VPERMPDZ256ri,        X86::VPERMPDZ256mi,        0 },
@@ -935,6 +959,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPSHUFDZ256ri,        X86::VPSHUFDZ256mi,        0 },
     { X86::VPSHUFHWZ256ri,       X86::VPSHUFHWZ256mi,       0 },
     { X86::VPSHUFLWZ256ri,       X86::VPSHUFLWZ256mi,       0 },
+    { X86::VPSLLDQZ256rr,        X86::VPSLLDQZ256rm,        0 },
+    { X86::VPSLLDZ256ri,         X86::VPSLLDZ256mi,         0 },
+    { X86::VPSLLQZ256ri,         X86::VPSLLQZ256mi,         0 },
+    { X86::VPSLLWZ256ri,         X86::VPSLLWZ256mi,         0 },
+    { X86::VPSRADZ256ri,         X86::VPSRADZ256mi,         0 },
+    { X86::VPSRAQZ256ri,         X86::VPSRAQZ256mi,         0 },
+    { X86::VPSRAWZ256ri,         X86::VPSRAWZ256mi,         0 },
+    { X86::VPSRLDQZ256rr,        X86::VPSRLDQZ256rm,        0 },
+    { X86::VPSRLDZ256ri,         X86::VPSRLDZ256mi,         0 },
+    { X86::VPSRLQZ256ri,         X86::VPSRLQZ256mi,         0 },
+    { X86::VPSRLWZ256ri,         X86::VPSRLWZ256mi,         0 },
 
     // AVX-512 foldable instructions (128-bit versions)
     { X86::VBROADCASTSSZ128r,    X86::VBROADCASTSSZ128m,    TB_NO_REVERSE },
@@ -948,6 +983,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMOVDQU64Z128rr,      X86::VMOVDQU64Z128rm,      0 },
     { X86::VMOVUPDZ128rr,        X86::VMOVUPDZ128rm,        0 },
     { X86::VMOVUPSZ128rr,        X86::VMOVUPSZ128rm,        0 },
+    { X86::VPABSBZ128rr,         X86::VPABSBZ128rm,         0 },
+    { X86::VPABSDZ128rr,         X86::VPABSDZ128rm,         0 },
+    { X86::VPABSQZ128rr,         X86::VPABSQZ128rm,         0 },
+    { X86::VPABSWZ128rr,         X86::VPABSWZ128rm,         0 },
     { X86::VPERMILPDZ128ri,      X86::VPERMILPDZ128mi,      0 },
     { X86::VPERMILPSZ128ri,      X86::VPERMILPSZ128mi,      0 },
     { X86::VPMOVSXBDZ128rr,      X86::VPMOVSXBDZ128rm,      TB_NO_REVERSE },
@@ -965,6 +1004,17 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPSHUFDZ128ri,        X86::VPSHUFDZ128mi,        0 },
     { X86::VPSHUFHWZ128ri,       X86::VPSHUFHWZ128mi,       0 },
     { X86::VPSHUFLWZ128ri,       X86::VPSHUFLWZ128mi,       0 },
+    { X86::VPSLLDQZ128rr,        X86::VPSLLDQZ128rm,        0 },
+    { X86::VPSLLDZ128ri,         X86::VPSLLDZ128mi,         0 },
+    { X86::VPSLLQZ128ri,         X86::VPSLLQZ128mi,         0 },
+    { X86::VPSLLWZ128ri,         X86::VPSLLWZ128mi,         0 },
+    { X86::VPSRADZ128ri,         X86::VPSRADZ128mi,         0 },
+    { X86::VPSRAQZ128ri,         X86::VPSRAQZ128mi,         0 },
+    { X86::VPSRAWZ128ri,         X86::VPSRAWZ128mi,         0 },
+    { X86::VPSRLDQZ128rr,        X86::VPSRLDQZ128rm,        0 },
+    { X86::VPSRLDZ128ri,         X86::VPSRLDZ128mi,         0 },
+    { X86::VPSRLQZ128ri,         X86::VPSRLQZ128mi,         0 },
+    { X86::VPSRLWZ128ri,         X86::VPSRLWZ128mi,         0 },
 
     // F16C foldable instructions
     { X86::VCVTPH2PSrr,        X86::VCVTPH2PSrm,            0 },
@@ -1165,18 +1215,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::PINSRWrri,       X86::PINSRWrmi,     0 },
     { X86::PMADDUBSWrr,     X86::PMADDUBSWrm,   TB_ALIGN_16 },
     { X86::PMADDWDrr,       X86::PMADDWDrm,     TB_ALIGN_16 },
+    { X86::PMAXSBrr,        X86::PMAXSBrm,      TB_ALIGN_16 },
+    { X86::PMAXSDrr,        X86::PMAXSDrm,      TB_ALIGN_16 },
     { X86::PMAXSWrr,        X86::PMAXSWrm,      TB_ALIGN_16 },
     { X86::PMAXUBrr,        X86::PMAXUBrm,      TB_ALIGN_16 },
-    { X86::PMINSWrr,        X86::PMINSWrm,      TB_ALIGN_16 },
-    { X86::PMINUBrr,        X86::PMINUBrm,      TB_ALIGN_16 },
+    { X86::PMAXUDrr,        X86::PMAXUDrm,      TB_ALIGN_16 },
+    { X86::PMAXUWrr,        X86::PMAXUWrm,      TB_ALIGN_16 },
     { X86::PMINSBrr,        X86::PMINSBrm,      TB_ALIGN_16 },
     { X86::PMINSDrr,        X86::PMINSDrm,      TB_ALIGN_16 },
+    { X86::PMINSWrr,        X86::PMINSWrm,      TB_ALIGN_16 },
+    { X86::PMINUBrr,        X86::PMINUBrm,      TB_ALIGN_16 },
     { X86::PMINUDrr,        X86::PMINUDrm,      TB_ALIGN_16 },
     { X86::PMINUWrr,        X86::PMINUWrm,      TB_ALIGN_16 },
-    { X86::PMAXSBrr,        X86::PMAXSBrm,      TB_ALIGN_16 },
-    { X86::PMAXSDrr,        X86::PMAXSDrm,      TB_ALIGN_16 },
-    { X86::PMAXUDrr,        X86::PMAXUDrm,      TB_ALIGN_16 },
-    { X86::PMAXUWrr,        X86::PMAXUWrm,      TB_ALIGN_16 },
     { X86::PMULDQrr,        X86::PMULDQrm,      TB_ALIGN_16 },
     { X86::PMULHRSWrr,      X86::PMULHRSWrm,    TB_ALIGN_16 },
     { X86::PMULHUWrr,       X86::PMULHUWrm,     TB_ALIGN_16 },
@@ -1335,8 +1385,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::PMULHRWrr,         X86::PMULHRWrm,         0 },
 
     // AVX 128-bit versions of foldable instructions
-    { X86::VCVTSD2SSrr,       X86::VCVTSD2SSrm,        0 },
-    { X86::Int_VCVTSD2SSrr,   X86::Int_VCVTSD2SSrm,    TB_NO_REVERSE },
     { X86::VCVTSI2SD64rr,     X86::VCVTSI2SD64rm,      0 },
     { X86::Int_VCVTSI2SD64rr, X86::Int_VCVTSI2SD64rm,  0 },
     { X86::VCVTSI2SDrr,       X86::VCVTSI2SDrm,        0 },
@@ -1345,8 +1393,6 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::Int_VCVTSI2SS64rr, X86::Int_VCVTSI2SS64rm,  0 },
     { X86::VCVTSI2SSrr,       X86::VCVTSI2SSrm,        0 },
     { X86::Int_VCVTSI2SSrr,   X86::Int_VCVTSI2SSrm,    0 },
-    { X86::VCVTSS2SDrr,       X86::VCVTSS2SDrm,        0 },
-    { X86::Int_VCVTSS2SDrr,   X86::Int_VCVTSS2SDrm,    TB_NO_REVERSE },
     { X86::VADDPDrr,          X86::VADDPDrm,           0 },
     { X86::VADDPSrr,          X86::VADDPSrm,           0 },
     { X86::VADDSDrr,          X86::VADDSDrm,           0 },
@@ -1453,18 +1499,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPINSRWrri,        X86::VPINSRWrmi,         0 },
     { X86::VPMADDUBSWrr,      X86::VPMADDUBSWrm,       0 },
     { X86::VPMADDWDrr,        X86::VPMADDWDrm,         0 },
+    { X86::VPMAXSBrr,         X86::VPMAXSBrm,          0 },
+    { X86::VPMAXSDrr,         X86::VPMAXSDrm,          0 },
     { X86::VPMAXSWrr,         X86::VPMAXSWrm,          0 },
     { X86::VPMAXUBrr,         X86::VPMAXUBrm,          0 },
-    { X86::VPMINSWrr,         X86::VPMINSWrm,          0 },
-    { X86::VPMINUBrr,         X86::VPMINUBrm,          0 },
+    { X86::VPMAXUDrr,         X86::VPMAXUDrm,          0 },
+    { X86::VPMAXUWrr,         X86::VPMAXUWrm,          0 },
     { X86::VPMINSBrr,         X86::VPMINSBrm,          0 },
     { X86::VPMINSDrr,         X86::VPMINSDrm,          0 },
+    { X86::VPMINSWrr,         X86::VPMINSWrm,          0 },
+    { X86::VPMINUBrr,         X86::VPMINUBrm,          0 },
     { X86::VPMINUDrr,         X86::VPMINUDrm,          0 },
     { X86::VPMINUWrr,         X86::VPMINUWrm,          0 },
-    { X86::VPMAXSBrr,         X86::VPMAXSBrm,          0 },
-    { X86::VPMAXSDrr,         X86::VPMAXSDrm,          0 },
-    { X86::VPMAXUDrr,         X86::VPMAXUDrm,          0 },
-    { X86::VPMAXUWrr,         X86::VPMAXUWrm,          0 },
     { X86::VPMULDQrr,         X86::VPMULDQrm,          0 },
     { X86::VPMULHRSWrr,       X86::VPMULHRSWrm,        0 },
     { X86::VPMULHUWrr,        X86::VPMULHUWrm,         0 },
@@ -1621,18 +1667,18 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPHSUBWYrr,        X86::VPHSUBWYrm,         0 },
     { X86::VPMADDUBSWYrr,     X86::VPMADDUBSWYrm,      0 },
     { X86::VPMADDWDYrr,       X86::VPMADDWDYrm,        0 },
+    { X86::VPMAXSBYrr,        X86::VPMAXSBYrm,         0 },
+    { X86::VPMAXSDYrr,        X86::VPMAXSDYrm,         0 },
     { X86::VPMAXSWYrr,        X86::VPMAXSWYrm,         0 },
     { X86::VPMAXUBYrr,        X86::VPMAXUBYrm,         0 },
-    { X86::VPMINSWYrr,        X86::VPMINSWYrm,         0 },
-    { X86::VPMINUBYrr,        X86::VPMINUBYrm,         0 },
+    { X86::VPMAXUDYrr,        X86::VPMAXUDYrm,         0 },
+    { X86::VPMAXUWYrr,        X86::VPMAXUWYrm,         0 },
     { X86::VPMINSBYrr,        X86::VPMINSBYrm,         0 },
     { X86::VPMINSDYrr,        X86::VPMINSDYrm,         0 },
+    { X86::VPMINSWYrr,        X86::VPMINSWYrm,         0 },
+    { X86::VPMINUBYrr,        X86::VPMINUBYrm,         0 },
     { X86::VPMINUDYrr,        X86::VPMINUDYrm,         0 },
     { X86::VPMINUWYrr,        X86::VPMINUWYrm,         0 },
-    { X86::VPMAXSBYrr,        X86::VPMAXSBYrm,         0 },
-    { X86::VPMAXSDYrr,        X86::VPMAXSDYrm,         0 },
-    { X86::VPMAXUDYrr,        X86::VPMAXUDYrm,         0 },
-    { X86::VPMAXUWYrr,        X86::VPMAXUWYrm,         0 },
     { X86::VMPSADBWYrri,      X86::VMPSADBWYrmi,       0 },
     { X86::VPMULDQYrr,        X86::VPMULDQYrm,         0 },
     { X86::VPMULHRSWYrr,      X86::VPMULHRSWYrm,       0 },
@@ -1727,7 +1773,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
 
     // XOP foldable instructions
     { X86::VPCMOVrrr,         X86::VPCMOVrmr,           0 },
-    { X86::VPCMOVrrrY,        X86::VPCMOVrmrY,          0 },
+    { X86::VPCMOVYrrr,        X86::VPCMOVYrmr,          0 },
     { X86::VPCOMBri,          X86::VPCOMBmi,            0 },
     { X86::VPCOMDri,          X86::VPCOMDmi,            0 },
     { X86::VPCOMQri,          X86::VPCOMQmi,            0 },
@@ -1737,9 +1783,9 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPCOMUQri,         X86::VPCOMUQmi,           0 },
     { X86::VPCOMUWri,         X86::VPCOMUWmi,           0 },
     { X86::VPERMIL2PDrr,      X86::VPERMIL2PDmr,        0 },
-    { X86::VPERMIL2PDrrY,     X86::VPERMIL2PDmrY,       0 },
+    { X86::VPERMIL2PDYrr,     X86::VPERMIL2PDYmr,       0 },
     { X86::VPERMIL2PSrr,      X86::VPERMIL2PSmr,        0 },
-    { X86::VPERMIL2PSrrY,     X86::VPERMIL2PSmrY,       0 },
+    { X86::VPERMIL2PSYrr,     X86::VPERMIL2PSYmr,       0 },
     { X86::VPMACSDDrr,        X86::VPMACSDDrm,          0 },
     { X86::VPMACSDQHrr,       X86::VPMACSDQHrm,         0 },
     { X86::VPMACSDQLrr,       X86::VPMACSDQLrm,         0 },
@@ -1835,6 +1881,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMINSDZrr_Int,     X86::VMINSDZrm_Int,       TB_NO_REVERSE },
     { X86::VMINSSZrr,         X86::VMINSSZrm,           0 },
     { X86::VMINSSZrr_Int,     X86::VMINSSZrm_Int,       TB_NO_REVERSE },
+    { X86::VMOVLHPSZrr,       X86::VMOVHPSZ128rm,       TB_NO_REVERSE },
     { X86::VMULPDZrr,         X86::VMULPDZrm,           0 },
     { X86::VMULPSZrr,         X86::VMULPSZrm,           0 },
     { X86::VMULSDZrr,         X86::VMULSDZrm,           0 },
@@ -1843,6 +1890,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMULSSZrr_Int,     X86::VMULSSZrm_Int,       TB_NO_REVERSE },
     { X86::VORPDZrr,          X86::VORPDZrm,            0 },
     { X86::VORPSZrr,          X86::VORPSZrm,            0 },
+    { X86::VPACKSSDWZrr,      X86::VPACKSSDWZrm,        0 },
+    { X86::VPACKSSWBZrr,      X86::VPACKSSWBZrm,        0 },
+    { X86::VPACKUSDWZrr,      X86::VPACKUSDWZrm,        0 },
+    { X86::VPACKUSWBZrr,      X86::VPACKUSWBZrm,        0 },
     { X86::VPADDBZrr,         X86::VPADDBZrm,           0 },
     { X86::VPADDDZrr,         X86::VPADDDZrm,           0 },
     { X86::VPADDQZrr,         X86::VPADDQZrm,           0 },
@@ -1856,6 +1907,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPANDNDZrr,        X86::VPANDNDZrm,          0 },
     { X86::VPANDNQZrr,        X86::VPANDNQZrm,          0 },
     { X86::VPANDQZrr,         X86::VPANDQZrm,           0 },
+    { X86::VPAVGBZrr,         X86::VPAVGBZrm,           0 },
+    { X86::VPAVGWZrr,         X86::VPAVGWZrm,           0 },
     { X86::VPCMPBZrri,        X86::VPCMPBZrmi,          0 },
     { X86::VPCMPDZrri,        X86::VPCMPDZrmi,          0 },
     { X86::VPCMPEQBZrr,       X86::VPCMPEQBZrm,         0 },
@@ -1880,26 +1933,55 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPERMPSZrr,        X86::VPERMPSZrm,          0 },
     { X86::VPERMQZrr,         X86::VPERMQZrm,           0 },
     { X86::VPERMWZrr,         X86::VPERMWZrm,           0 },
+    { X86::VPINSRBZrr,        X86::VPINSRBZrm,          0 },
+    { X86::VPINSRDZrr,        X86::VPINSRDZrm,          0 },
+    { X86::VPINSRQZrr,        X86::VPINSRQZrm,          0 },
+    { X86::VPINSRWZrr,        X86::VPINSRWZrm,          0 },
     { X86::VPMADDUBSWZrr,     X86::VPMADDUBSWZrm,       0 },
     { X86::VPMADDWDZrr,       X86::VPMADDWDZrm,         0 },
+    { X86::VPMAXSBZrr,        X86::VPMAXSBZrm,          0 },
     { X86::VPMAXSDZrr,        X86::VPMAXSDZrm,          0 },
     { X86::VPMAXSQZrr,        X86::VPMAXSQZrm,          0 },
+    { X86::VPMAXSWZrr,        X86::VPMAXSWZrm,          0 },
+    { X86::VPMAXUBZrr,        X86::VPMAXUBZrm,          0 },
     { X86::VPMAXUDZrr,        X86::VPMAXUDZrm,          0 },
     { X86::VPMAXUQZrr,        X86::VPMAXUQZrm,          0 },
+    { X86::VPMAXUWZrr,        X86::VPMAXUWZrm,          0 },
+    { X86::VPMINSBZrr,        X86::VPMINSBZrm,          0 },
     { X86::VPMINSDZrr,        X86::VPMINSDZrm,          0 },
     { X86::VPMINSQZrr,        X86::VPMINSQZrm,          0 },
+    { X86::VPMINSWZrr,        X86::VPMINSWZrm,          0 },
+    { X86::VPMINUBZrr,        X86::VPMINUBZrm,          0 },
     { X86::VPMINUDZrr,        X86::VPMINUDZrm,          0 },
     { X86::VPMINUQZrr,        X86::VPMINUQZrm,          0 },
+    { X86::VPMINUWZrr,        X86::VPMINUWZrm,          0 },
     { X86::VPMULDQZrr,        X86::VPMULDQZrm,          0 },
+    { X86::VPMULLDZrr,        X86::VPMULLDZrm,          0 },
+    { X86::VPMULLQZrr,        X86::VPMULLQZrm,          0 },
+    { X86::VPMULLWZrr,        X86::VPMULLWZrm,          0 },
     { X86::VPMULUDQZrr,       X86::VPMULUDQZrm,         0 },
     { X86::VPORDZrr,          X86::VPORDZrm,            0 },
     { X86::VPORQZrr,          X86::VPORQZrm,            0 },
+    { X86::VPSADBWZ512rr,     X86::VPSADBWZ512rm,       0 },
     { X86::VPSHUFBZrr,        X86::VPSHUFBZrm,          0 },
+    { X86::VPSLLDZrr,         X86::VPSLLDZrm,           0 },
+    { X86::VPSLLQZrr,         X86::VPSLLQZrm,           0 },
     { X86::VPSLLVDZrr,        X86::VPSLLVDZrm,          0 },
     { X86::VPSLLVQZrr,        X86::VPSLLVQZrm,          0 },
+    { X86::VPSLLVWZrr,        X86::VPSLLVWZrm,          0 },
+    { X86::VPSLLWZrr,         X86::VPSLLWZrm,           0 },
+    { X86::VPSRADZrr,         X86::VPSRADZrm,           0 },
+    { X86::VPSRAQZrr,         X86::VPSRAQZrm,           0 },
     { X86::VPSRAVDZrr,        X86::VPSRAVDZrm,          0 },
+    { X86::VPSRAVQZrr,        X86::VPSRAVQZrm,          0 },
+    { X86::VPSRAVWZrr,        X86::VPSRAVWZrm,          0 },
+    { X86::VPSRAWZrr,         X86::VPSRAWZrm,           0 },
+    { X86::VPSRLDZrr,         X86::VPSRLDZrm,           0 },
+    { X86::VPSRLQZrr,         X86::VPSRLQZrm,           0 },
     { X86::VPSRLVDZrr,        X86::VPSRLVDZrm,          0 },
     { X86::VPSRLVQZrr,        X86::VPSRLVQZrm,          0 },
+    { X86::VPSRLVWZrr,        X86::VPSRLVWZrm,          0 },
+    { X86::VPSRLWZrr,         X86::VPSRLWZrm,           0 },
     { X86::VPSUBBZrr,         X86::VPSUBBZrm,           0 },
     { X86::VPSUBDZrr,         X86::VPSUBDZrm,           0 },
     { X86::VPSUBQZrr,         X86::VPSUBQZrm,           0 },
@@ -1986,6 +2068,14 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VORPDZ256rr,       X86::VORPDZ256rm,         0 },
     { X86::VORPSZ128rr,       X86::VORPSZ128rm,         0 },
     { X86::VORPSZ256rr,       X86::VORPSZ256rm,         0 },
+    { X86::VPACKSSDWZ256rr,   X86::VPACKSSDWZ256rm,     0 },
+    { X86::VPACKSSDWZ128rr,   X86::VPACKSSDWZ128rm,     0 },
+    { X86::VPACKSSWBZ256rr,   X86::VPACKSSWBZ256rm,     0 },
+    { X86::VPACKSSWBZ128rr,   X86::VPACKSSWBZ128rm,     0 },
+    { X86::VPACKUSDWZ256rr,   X86::VPACKUSDWZ256rm,     0 },
+    { X86::VPACKUSDWZ128rr,   X86::VPACKUSDWZ128rm,     0 },
+    { X86::VPACKUSWBZ256rr,   X86::VPACKUSWBZ256rm,     0 },
+    { X86::VPACKUSWBZ128rr,   X86::VPACKUSWBZ128rm,     0 },
     { X86::VPADDBZ128rr,      X86::VPADDBZ128rm,        0 },
     { X86::VPADDBZ256rr,      X86::VPADDBZ256rm,        0 },
     { X86::VPADDDZ128rr,      X86::VPADDDZ128rm,        0 },
@@ -2012,6 +2102,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPANDNQZ256rr,     X86::VPANDNQZ256rm,       0 },
     { X86::VPANDQZ128rr,      X86::VPANDQZ128rm,        0 },
     { X86::VPANDQZ256rr,      X86::VPANDQZ256rm,        0 },
+    { X86::VPAVGBZ128rr,      X86::VPAVGBZ128rm,        0 },
+    { X86::VPAVGBZ256rr,      X86::VPAVGBZ256rm,        0 },
+    { X86::VPAVGWZ128rr,      X86::VPAVGWZ128rm,        0 },
+    { X86::VPAVGWZ256rr,      X86::VPAVGWZ256rm,        0 },
     { X86::VPCMPBZ128rri,     X86::VPCMPBZ128rmi,       0 },
     { X86::VPCMPBZ256rri,     X86::VPCMPBZ256rmi,       0 },
     { X86::VPCMPDZ128rri,     X86::VPCMPDZ128rmi,       0 },
@@ -2060,12 +2154,92 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPMADDUBSWZ256rr,  X86::VPMADDUBSWZ256rm,    0 },
     { X86::VPMADDWDZ128rr,    X86::VPMADDWDZ128rm,      0 },
     { X86::VPMADDWDZ256rr,    X86::VPMADDWDZ256rm,      0 },
+    { X86::VPMAXSBZ128rr,     X86::VPMAXSBZ128rm,       0 },
+    { X86::VPMAXSBZ256rr,     X86::VPMAXSBZ256rm,       0 },
+    { X86::VPMAXSDZ128rr,     X86::VPMAXSDZ128rm,       0 },
+    { X86::VPMAXSDZ256rr,     X86::VPMAXSDZ256rm,       0 },
+    { X86::VPMAXSQZ128rr,     X86::VPMAXSQZ128rm,       0 },
+    { X86::VPMAXSQZ256rr,     X86::VPMAXSQZ256rm,       0 },
+    { X86::VPMAXSWZ128rr,     X86::VPMAXSWZ128rm,       0 },
+    { X86::VPMAXSWZ256rr,     X86::VPMAXSWZ256rm,       0 },
+    { X86::VPMAXUBZ128rr,     X86::VPMAXUBZ128rm,       0 },
+    { X86::VPMAXUBZ256rr,     X86::VPMAXUBZ256rm,       0 },
+    { X86::VPMAXUDZ128rr,     X86::VPMAXUDZ128rm,       0 },
+    { X86::VPMAXUDZ256rr,     X86::VPMAXUDZ256rm,       0 },
+    { X86::VPMAXUQZ128rr,     X86::VPMAXUQZ128rm,       0 },
+    { X86::VPMAXUQZ256rr,     X86::VPMAXUQZ256rm,       0 },
+    { X86::VPMAXUWZ128rr,     X86::VPMAXUWZ128rm,       0 },
+    { X86::VPMAXUWZ256rr,     X86::VPMAXUWZ256rm,       0 },
+    { X86::VPMINSBZ128rr,     X86::VPMINSBZ128rm,       0 },
+    { X86::VPMINSBZ256rr,     X86::VPMINSBZ256rm,       0 },
+    { X86::VPMINSDZ128rr,     X86::VPMINSDZ128rm,       0 },
+    { X86::VPMINSDZ256rr,     X86::VPMINSDZ256rm,       0 },
+    { X86::VPMINSQZ128rr,     X86::VPMINSQZ128rm,       0 },
+    { X86::VPMINSQZ256rr,     X86::VPMINSQZ256rm,       0 },
+    { X86::VPMINSWZ128rr,     X86::VPMINSWZ128rm,       0 },
+    { X86::VPMINSWZ256rr,     X86::VPMINSWZ256rm,       0 },
+    { X86::VPMINUBZ128rr,     X86::VPMINUBZ128rm,       0 },
+    { X86::VPMINUBZ256rr,     X86::VPMINUBZ256rm,       0 },
+    { X86::VPMINUDZ128rr,     X86::VPMINUDZ128rm,       0 },
+    { X86::VPMINUDZ256rr,     X86::VPMINUDZ256rm,       0 },
+    { X86::VPMINUQZ128rr,     X86::VPMINUQZ128rm,       0 },
+    { X86::VPMINUQZ256rr,     X86::VPMINUQZ256rm,       0 },
+    { X86::VPMINUWZ128rr,     X86::VPMINUWZ128rm,       0 },
+    { X86::VPMINUWZ256rr,     X86::VPMINUWZ256rm,       0 },
+    { X86::VPMULDQZ128rr,     X86::VPMULDQZ128rm,       0 },
+    { X86::VPMULDQZ256rr,     X86::VPMULDQZ256rm,       0 },
+    { X86::VPMULLDZ128rr,     X86::VPMULLDZ128rm,       0 },
+    { X86::VPMULLDZ256rr,     X86::VPMULLDZ256rm,       0 },
+    { X86::VPMULLQZ128rr,     X86::VPMULLQZ128rm,       0 },
+    { X86::VPMULLQZ256rr,     X86::VPMULLQZ256rm,       0 },
+    { X86::VPMULLWZ128rr,     X86::VPMULLWZ128rm,       0 },
+    { X86::VPMULLWZ256rr,     X86::VPMULLWZ256rm,       0 },
+    { X86::VPMULUDQZ128rr,    X86::VPMULUDQZ128rm,      0 },
+    { X86::VPMULUDQZ256rr,    X86::VPMULUDQZ256rm,      0 },
     { X86::VPORDZ128rr,       X86::VPORDZ128rm,         0 },
     { X86::VPORDZ256rr,       X86::VPORDZ256rm,         0 },
     { X86::VPORQZ128rr,       X86::VPORQZ128rm,         0 },
     { X86::VPORQZ256rr,       X86::VPORQZ256rm,         0 },
+    { X86::VPSADBWZ128rr,     X86::VPSADBWZ128rm,       0 },
+    { X86::VPSADBWZ256rr,     X86::VPSADBWZ256rm,       0 },
     { X86::VPSHUFBZ128rr,     X86::VPSHUFBZ128rm,       0 },
     { X86::VPSHUFBZ256rr,     X86::VPSHUFBZ256rm,       0 },
+    { X86::VPSLLDZ128rr,      X86::VPSLLDZ128rm,        0 },
+    { X86::VPSLLDZ256rr,      X86::VPSLLDZ256rm,        0 },
+    { X86::VPSLLQZ128rr,      X86::VPSLLQZ128rm,        0 },
+    { X86::VPSLLQZ256rr,      X86::VPSLLQZ256rm,        0 },
+    { X86::VPSLLVDZ128rr,     X86::VPSLLVDZ128rm,       0 },
+    { X86::VPSLLVDZ256rr,     X86::VPSLLVDZ256rm,       0 },
+    { X86::VPSLLVQZ128rr,     X86::VPSLLVQZ128rm,       0 },
+    { X86::VPSLLVQZ256rr,     X86::VPSLLVQZ256rm,       0 },
+    { X86::VPSLLVWZ128rr,     X86::VPSLLVWZ128rm,       0 },
+    { X86::VPSLLVWZ256rr,     X86::VPSLLVWZ256rm,       0 },
+    { X86::VPSLLWZ128rr,      X86::VPSLLWZ128rm,        0 },
+    { X86::VPSLLWZ256rr,      X86::VPSLLWZ256rm,        0 },
+    { X86::VPSRADZ128rr,      X86::VPSRADZ128rm,        0 },
+    { X86::VPSRADZ256rr,      X86::VPSRADZ256rm,        0 },
+    { X86::VPSRAQZ128rr,      X86::VPSRAQZ128rm,        0 },
+    { X86::VPSRAQZ256rr,      X86::VPSRAQZ256rm,        0 },
+    { X86::VPSRAVDZ128rr,     X86::VPSRAVDZ128rm,       0 },
+    { X86::VPSRAVDZ256rr,     X86::VPSRAVDZ256rm,       0 },
+    { X86::VPSRAVQZ128rr,     X86::VPSRAVQZ128rm,       0 },
+    { X86::VPSRAVQZ256rr,     X86::VPSRAVQZ256rm,       0 },
+    { X86::VPSRAVWZ128rr,     X86::VPSRAVWZ128rm,       0 },
+    { X86::VPSRAVWZ256rr,     X86::VPSRAVWZ256rm,       0 },
+    { X86::VPSRAWZ128rr,      X86::VPSRAWZ128rm,        0 },
+    { X86::VPSRAWZ256rr,      X86::VPSRAWZ256rm,        0 },
+    { X86::VPSRLDZ128rr,      X86::VPSRLDZ128rm,        0 },
+    { X86::VPSRLDZ256rr,      X86::VPSRLDZ256rm,        0 },
+    { X86::VPSRLQZ128rr,      X86::VPSRLQZ128rm,        0 },
+    { X86::VPSRLQZ256rr,      X86::VPSRLQZ256rm,        0 },
+    { X86::VPSRLVDZ128rr,     X86::VPSRLVDZ128rm,       0 },
+    { X86::VPSRLVDZ256rr,     X86::VPSRLVDZ256rm,       0 },
+    { X86::VPSRLVQZ128rr,     X86::VPSRLVQZ128rm,       0 },
+    { X86::VPSRLVQZ256rr,     X86::VPSRLVQZ256rm,       0 },
+    { X86::VPSRLVWZ128rr,     X86::VPSRLVWZ128rm,       0 },
+    { X86::VPSRLVWZ256rr,     X86::VPSRLVWZ256rm,       0 },
+    { X86::VPSRLWZ128rr,      X86::VPSRLWZ128rm,        0 },
+    { X86::VPSRLWZ256rr,      X86::VPSRLWZ256rm,        0 },
     { X86::VPSUBBZ128rr,      X86::VPSUBBZ128rm,        0 },
     { X86::VPSUBBZ256rr,      X86::VPSUBBZ256rm,        0 },
     { X86::VPSUBDZ128rr,      X86::VPSUBDZ128rm,        0 },
@@ -2102,6 +2276,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPXORDZ256rr,      X86::VPXORDZ256rm,        0 },
     { X86::VPXORQZ128rr,      X86::VPXORQZ128rm,        0 },
     { X86::VPXORQZ256rr,      X86::VPXORQZ256rm,        0 },
+    { X86::VSHUFPDZ128rri,    X86::VSHUFPDZ128rmi,      0 },
+    { X86::VSHUFPDZ256rri,    X86::VSHUFPDZ256rmi,      0 },
+    { X86::VSHUFPSZ128rri,    X86::VSHUFPSZ128rmi,      0 },
+    { X86::VSHUFPSZ256rri,    X86::VSHUFPSZ256rmi,      0 },
     { X86::VSUBPDZ128rr,      X86::VSUBPDZ128rm,        0 },
     { X86::VSUBPDZ256rr,      X86::VSUBPDZ256rm,        0 },
     { X86::VSUBPSZ128rr,      X86::VSUBPSZ128rm,        0 },
@@ -2122,6 +2300,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     // AVX-512 masked foldable instructions
     { X86::VBROADCASTSSZrkz,  X86::VBROADCASTSSZmkz,    TB_NO_REVERSE },
     { X86::VBROADCASTSDZrkz,  X86::VBROADCASTSDZmkz,    TB_NO_REVERSE },
+    { X86::VPABSBZrrkz,       X86::VPABSBZrmkz,         0 },
+    { X86::VPABSDZrrkz,       X86::VPABSDZrmkz,         0 },
+    { X86::VPABSQZrrkz,       X86::VPABSQZrmkz,         0 },
+    { X86::VPABSWZrrkz,       X86::VPABSWZrmkz,         0 },
     { X86::VPERMILPDZrikz,    X86::VPERMILPDZmikz,      0 },
     { X86::VPERMILPSZrikz,    X86::VPERMILPSZmikz,      0 },
     { X86::VPERMPDZrikz,      X86::VPERMPDZmikz,        0 },
@@ -2141,10 +2323,23 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPSHUFDZrikz,      X86::VPSHUFDZmikz,        0 },
     { X86::VPSHUFHWZrikz,     X86::VPSHUFHWZmikz,       0 },
     { X86::VPSHUFLWZrikz,     X86::VPSHUFLWZmikz,       0 },
+    { X86::VPSLLDZrikz,       X86::VPSLLDZmikz,         0 },
+    { X86::VPSLLQZrikz,       X86::VPSLLQZmikz,         0 },
+    { X86::VPSLLWZrikz,       X86::VPSLLWZmikz,         0 },
+    { X86::VPSRADZrikz,       X86::VPSRADZmikz,         0 },
+    { X86::VPSRAQZrikz,       X86::VPSRAQZmikz,         0 },
+    { X86::VPSRAWZrikz,       X86::VPSRAWZmikz,         0 },
+    { X86::VPSRLDZrikz,       X86::VPSRLDZmikz,         0 },
+    { X86::VPSRLQZrikz,       X86::VPSRLQZmikz,         0 },
+    { X86::VPSRLWZrikz,       X86::VPSRLWZmikz,         0 },
 
     // AVX-512VL 256-bit masked foldable instructions
     { X86::VBROADCASTSDZ256rkz,  X86::VBROADCASTSDZ256mkz,      TB_NO_REVERSE },
     { X86::VBROADCASTSSZ256rkz,  X86::VBROADCASTSSZ256mkz,      TB_NO_REVERSE },
+    { X86::VPABSBZ256rrkz,    X86::VPABSBZ256rmkz,      0 },
+    { X86::VPABSDZ256rrkz,    X86::VPABSDZ256rmkz,      0 },
+    { X86::VPABSQZ256rrkz,    X86::VPABSQZ256rmkz,      0 },
+    { X86::VPABSWZ256rrkz,    X86::VPABSWZ256rmkz,      0 },
     { X86::VPERMILPDZ256rikz, X86::VPERMILPDZ256mikz,   0 },
     { X86::VPERMILPSZ256rikz, X86::VPERMILPSZ256mikz,   0 },
     { X86::VPERMPDZ256rikz,   X86::VPERMPDZ256mikz,     0 },
@@ -2164,9 +2359,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPSHUFDZ256rikz,   X86::VPSHUFDZ256mikz,     0 },
     { X86::VPSHUFHWZ256rikz,  X86::VPSHUFHWZ256mikz,    0 },
     { X86::VPSHUFLWZ256rikz,  X86::VPSHUFLWZ256mikz,    0 },
+    { X86::VPSLLDZ256rikz,    X86::VPSLLDZ256mikz,      0 },
+    { X86::VPSLLQZ256rikz,    X86::VPSLLQZ256mikz,      0 },
+    { X86::VPSLLWZ256rikz,    X86::VPSLLWZ256mikz,      0 },
+    { X86::VPSRADZ256rikz,    X86::VPSRADZ256mikz,      0 },
+    { X86::VPSRAQZ256rikz,    X86::VPSRAQZ256mikz,      0 },
+    { X86::VPSRAWZ256rikz,    X86::VPSRAWZ256mikz,      0 },
+    { X86::VPSRLDZ256rikz,    X86::VPSRLDZ256mikz,      0 },
+    { X86::VPSRLQZ256rikz,    X86::VPSRLQZ256mikz,      0 },
+    { X86::VPSRLWZ256rikz,    X86::VPSRLWZ256mikz,      0 },
 
     // AVX-512VL 128-bit masked foldable instructions
     { X86::VBROADCASTSSZ128rkz,  X86::VBROADCASTSSZ128mkz,      TB_NO_REVERSE },
+    { X86::VPABSBZ128rrkz,    X86::VPABSBZ128rmkz,      0 },
+    { X86::VPABSDZ128rrkz,    X86::VPABSDZ128rmkz,      0 },
+    { X86::VPABSQZ128rrkz,    X86::VPABSQZ128rmkz,      0 },
+    { X86::VPABSWZ128rrkz,    X86::VPABSWZ128rmkz,      0 },
     { X86::VPERMILPDZ128rikz, X86::VPERMILPDZ128mikz,   0 },
     { X86::VPERMILPSZ128rikz, X86::VPERMILPSZ128mikz,   0 },
     { X86::VPMOVSXBDZ128rrkz, X86::VPMOVSXBDZ128rmkz,   TB_NO_REVERSE },
@@ -2184,6 +2392,15 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPSHUFDZ128rikz,   X86::VPSHUFDZ128mikz,     0 },
     { X86::VPSHUFHWZ128rikz,  X86::VPSHUFHWZ128mikz,    0 },
     { X86::VPSHUFLWZ128rikz,  X86::VPSHUFLWZ128mikz,    0 },
+    { X86::VPSLLDZ128rikz,    X86::VPSLLDZ128mikz,      0 },
+    { X86::VPSLLQZ128rikz,    X86::VPSLLQZ128mikz,      0 },
+    { X86::VPSLLWZ128rikz,    X86::VPSLLWZ128mikz,      0 },
+    { X86::VPSRADZ128rikz,    X86::VPSRADZ128mikz,      0 },
+    { X86::VPSRAQZ128rikz,    X86::VPSRAQZ128mikz,      0 },
+    { X86::VPSRAWZ128rikz,    X86::VPSRAWZ128mikz,      0 },
+    { X86::VPSRLDZ128rikz,    X86::VPSRLDZ128mikz,      0 },
+    { X86::VPSRLQZ128rikz,    X86::VPSRLQZ128mikz,      0 },
+    { X86::VPSRLWZ128rikz,    X86::VPSRLWZ128mikz,      0 },
 
     // AES foldable instructions
     { X86::AESDECLASTrr,      X86::AESDECLASTrm,        TB_ALIGN_16 },
@@ -2257,11 +2474,11 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
 
     // XOP foldable instructions
     { X86::VPCMOVrrr,             X86::VPCMOVrrm,             0 },
-    { X86::VPCMOVrrrY,            X86::VPCMOVrrmY,            0 },
+    { X86::VPCMOVYrrr,            X86::VPCMOVYrrm,            0 },
     { X86::VPERMIL2PDrr,          X86::VPERMIL2PDrm,          0 },
-    { X86::VPERMIL2PDrrY,         X86::VPERMIL2PDrmY,         0 },
+    { X86::VPERMIL2PDYrr,         X86::VPERMIL2PDYrm,         0 },
     { X86::VPERMIL2PSrr,          X86::VPERMIL2PSrm,          0 },
-    { X86::VPERMIL2PSrrY,         X86::VPERMIL2PSrmY,         0 },
+    { X86::VPERMIL2PSYrr,         X86::VPERMIL2PSYrm,         0 },
     { X86::VPPERMrrr,             X86::VPPERMrrm,             0 },
 
     // AVX-512 instructions with 3 source operands.
@@ -2315,6 +2532,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     // AVX-512 masked instructions
     { X86::VADDPDZrrkz,           X86::VADDPDZrmkz,           0 },
     { X86::VADDPSZrrkz,           X86::VADDPSZrmkz,           0 },
+    { X86::VADDSDZrr_Intkz,       X86::VADDSDZrm_Intkz,       TB_NO_REVERSE },
+    { X86::VADDSSZrr_Intkz,       X86::VADDSSZrm_Intkz,       TB_NO_REVERSE },
     { X86::VALIGNDZrrikz,         X86::VALIGNDZrmikz,         0 },
     { X86::VALIGNQZrrikz,         X86::VALIGNQZrmikz,         0 },
     { X86::VANDNPDZrrkz,          X86::VANDNPDZrmkz,          0 },
@@ -2323,6 +2542,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VANDPSZrrkz,           X86::VANDPSZrmkz,           0 },
     { X86::VDIVPDZrrkz,           X86::VDIVPDZrmkz,           0 },
     { X86::VDIVPSZrrkz,           X86::VDIVPSZrmkz,           0 },
+    { X86::VDIVSDZrr_Intkz,       X86::VDIVSDZrm_Intkz,       TB_NO_REVERSE },
+    { X86::VDIVSSZrr_Intkz,       X86::VDIVSSZrm_Intkz,       TB_NO_REVERSE },
     { X86::VINSERTF32x4Zrrkz,     X86::VINSERTF32x4Zrmkz,     0 },
     { X86::VINSERTF32x8Zrrkz,     X86::VINSERTF32x8Zrmkz,     0 },
     { X86::VINSERTF64x2Zrrkz,     X86::VINSERTF64x2Zrmkz,     0 },
@@ -2335,14 +2556,24 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMAXCPSZrrkz,          X86::VMAXCPSZrmkz,          0 },
     { X86::VMAXPDZrrkz,           X86::VMAXPDZrmkz,           0 },
     { X86::VMAXPSZrrkz,           X86::VMAXPSZrmkz,           0 },
+    { X86::VMAXSDZrr_Intkz,       X86::VMAXSDZrm_Intkz,       0 },
+    { X86::VMAXSSZrr_Intkz,       X86::VMAXSSZrm_Intkz,       0 },
     { X86::VMINCPDZrrkz,          X86::VMINCPDZrmkz,          0 },
     { X86::VMINCPSZrrkz,          X86::VMINCPSZrmkz,          0 },
     { X86::VMINPDZrrkz,           X86::VMINPDZrmkz,           0 },
     { X86::VMINPSZrrkz,           X86::VMINPSZrmkz,           0 },
+    { X86::VMINSDZrr_Intkz,       X86::VMINSDZrm_Intkz,       0 },
+    { X86::VMINSSZrr_Intkz,       X86::VMINSSZrm_Intkz,       0 },
     { X86::VMULPDZrrkz,           X86::VMULPDZrmkz,           0 },
     { X86::VMULPSZrrkz,           X86::VMULPSZrmkz,           0 },
+    { X86::VMULSDZrr_Intkz,       X86::VMULSDZrm_Intkz,       TB_NO_REVERSE },
+    { X86::VMULSSZrr_Intkz,       X86::VMULSSZrm_Intkz,       TB_NO_REVERSE },
     { X86::VORPDZrrkz,            X86::VORPDZrmkz,            0 },
     { X86::VORPSZrrkz,            X86::VORPSZrmkz,            0 },
+    { X86::VPACKSSDWZrrkz,        X86::VPACKSSDWZrmkz,        0 },
+    { X86::VPACKSSWBZrrkz,        X86::VPACKSSWBZrmkz,        0 },
+    { X86::VPACKUSDWZrrkz,        X86::VPACKUSDWZrmkz,        0 },
+    { X86::VPACKUSWBZrrkz,        X86::VPACKUSWBZrmkz,        0 },
     { X86::VPADDBZrrkz,           X86::VPADDBZrmkz,           0 },
     { X86::VPADDDZrrkz,           X86::VPADDDZrmkz,           0 },
     { X86::VPADDQZrrkz,           X86::VPADDQZrmkz,           0 },
@@ -2356,6 +2587,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPANDNDZrrkz,          X86::VPANDNDZrmkz,          0 },
     { X86::VPANDNQZrrkz,          X86::VPANDNQZrmkz,          0 },
     { X86::VPANDQZrrkz,           X86::VPANDQZrmkz,           0 },
+    { X86::VPAVGBZrrkz,           X86::VPAVGBZrmkz,           0 },
+    { X86::VPAVGWZrrkz,           X86::VPAVGWZrmkz,           0 },
     { X86::VPERMBZrrkz,           X86::VPERMBZrmkz,           0 },
     { X86::VPERMDZrrkz,           X86::VPERMDZrmkz,           0 },
     { X86::VPERMILPDZrrkz,        X86::VPERMILPDZrmkz,        0 },
@@ -2366,9 +2599,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPERMWZrrkz,           X86::VPERMWZrmkz,           0 },
     { X86::VPMADDUBSWZrrkz,       X86::VPMADDUBSWZrmkz,       0 },
     { X86::VPMADDWDZrrkz,         X86::VPMADDWDZrmkz,         0 },
+    { X86::VPMAXSBZrrkz,          X86::VPMAXSBZrmkz,          0 },
+    { X86::VPMAXSDZrrkz,          X86::VPMAXSDZrmkz,          0 },
+    { X86::VPMAXSQZrrkz,          X86::VPMAXSQZrmkz,          0 },
+    { X86::VPMAXSWZrrkz,          X86::VPMAXSWZrmkz,          0 },
+    { X86::VPMAXUBZrrkz,          X86::VPMAXUBZrmkz,          0 },
+    { X86::VPMAXUDZrrkz,          X86::VPMAXUDZrmkz,          0 },
+    { X86::VPMAXUQZrrkz,          X86::VPMAXUQZrmkz,          0 },
+    { X86::VPMAXUWZrrkz,          X86::VPMAXUWZrmkz,          0 },
+    { X86::VPMINSBZrrkz,          X86::VPMINSBZrmkz,          0 },
+    { X86::VPMINSDZrrkz,          X86::VPMINSDZrmkz,          0 },
+    { X86::VPMINSQZrrkz,          X86::VPMINSQZrmkz,          0 },
+    { X86::VPMINSWZrrkz,          X86::VPMINSWZrmkz,          0 },
+    { X86::VPMINUBZrrkz,          X86::VPMINUBZrmkz,          0 },
+    { X86::VPMINUDZrrkz,          X86::VPMINUDZrmkz,          0 },
+    { X86::VPMINUQZrrkz,          X86::VPMINUQZrmkz,          0 },
+    { X86::VPMINUWZrrkz,          X86::VPMINUWZrmkz,          0 },
+    { X86::VPMULLDZrrkz,          X86::VPMULLDZrmkz,          0 },
+    { X86::VPMULLQZrrkz,          X86::VPMULLQZrmkz,          0 },
+    { X86::VPMULLWZrrkz,          X86::VPMULLWZrmkz,          0 },
+    { X86::VPMULDQZrrkz,          X86::VPMULDQZrmkz,          0 },
+    { X86::VPMULUDQZrrkz,         X86::VPMULUDQZrmkz,         0 },
     { X86::VPORDZrrkz,            X86::VPORDZrmkz,            0 },
     { X86::VPORQZrrkz,            X86::VPORQZrmkz,            0 },
     { X86::VPSHUFBZrrkz,          X86::VPSHUFBZrmkz,          0 },
+    { X86::VPSLLDZrrkz,           X86::VPSLLDZrmkz,           0 },
+    { X86::VPSLLQZrrkz,           X86::VPSLLQZrmkz,           0 },
+    { X86::VPSLLVDZrrkz,          X86::VPSLLVDZrmkz,          0 },
+    { X86::VPSLLVQZrrkz,          X86::VPSLLVQZrmkz,          0 },
+    { X86::VPSLLVWZrrkz,          X86::VPSLLVWZrmkz,          0 },
+    { X86::VPSLLWZrrkz,           X86::VPSLLWZrmkz,           0 },
+    { X86::VPSRADZrrkz,           X86::VPSRADZrmkz,           0 },
+    { X86::VPSRAQZrrkz,           X86::VPSRAQZrmkz,           0 },
+    { X86::VPSRAVDZrrkz,          X86::VPSRAVDZrmkz,          0 },
+    { X86::VPSRAVQZrrkz,          X86::VPSRAVQZrmkz,          0 },
+    { X86::VPSRAVWZrrkz,          X86::VPSRAVWZrmkz,          0 },
+    { X86::VPSRAWZrrkz,           X86::VPSRAWZrmkz,           0 },
+    { X86::VPSRLDZrrkz,           X86::VPSRLDZrmkz,           0 },
+    { X86::VPSRLQZrrkz,           X86::VPSRLQZrmkz,           0 },
+    { X86::VPSRLVDZrrkz,          X86::VPSRLVDZrmkz,          0 },
+    { X86::VPSRLVQZrrkz,          X86::VPSRLVQZrmkz,          0 },
+    { X86::VPSRLVWZrrkz,          X86::VPSRLVWZrmkz,          0 },
+    { X86::VPSRLWZrrkz,           X86::VPSRLWZrmkz,           0 },
     { X86::VPSUBBZrrkz,           X86::VPSUBBZrmkz,           0 },
     { X86::VPSUBDZrrkz,           X86::VPSUBDZrmkz,           0 },
     { X86::VPSUBQZrrkz,           X86::VPSUBQZrmkz,           0 },
@@ -2387,8 +2659,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPUNPCKLWDZrrkz,       X86::VPUNPCKLWDZrmkz,       0 },
     { X86::VPXORDZrrkz,           X86::VPXORDZrmkz,           0 },
     { X86::VPXORQZrrkz,           X86::VPXORQZrmkz,           0 },
+    { X86::VSHUFPDZrrikz,         X86::VSHUFPDZrmikz,         0 },
+    { X86::VSHUFPSZrrikz,         X86::VSHUFPSZrmikz,         0 },
     { X86::VSUBPDZrrkz,           X86::VSUBPDZrmkz,           0 },
     { X86::VSUBPSZrrkz,           X86::VSUBPSZrmkz,           0 },
+    { X86::VSUBSDZrr_Intkz,       X86::VSUBSDZrm_Intkz,       TB_NO_REVERSE },
+    { X86::VSUBSSZrr_Intkz,       X86::VSUBSSZrm_Intkz,       TB_NO_REVERSE },
     { X86::VUNPCKHPDZrrkz,        X86::VUNPCKHPDZrmkz,        0 },
     { X86::VUNPCKHPSZrrkz,        X86::VUNPCKHPSZrmkz,        0 },
     { X86::VUNPCKLPDZrrkz,        X86::VUNPCKLPDZrmkz,        0 },
@@ -2423,6 +2699,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMULPSZ256rrkz,        X86::VMULPSZ256rmkz,        0 },
     { X86::VORPDZ256rrkz,         X86::VORPDZ256rmkz,         0 },
     { X86::VORPSZ256rrkz,         X86::VORPSZ256rmkz,         0 },
+    { X86::VPACKSSDWZ256rrkz,     X86::VPACKSSDWZ256rmkz,     0 },
+    { X86::VPACKSSWBZ256rrkz,     X86::VPACKSSWBZ256rmkz,     0 },
+    { X86::VPACKUSDWZ256rrkz,     X86::VPACKUSDWZ256rmkz,     0 },
+    { X86::VPACKUSWBZ256rrkz,     X86::VPACKUSWBZ256rmkz,     0 },
     { X86::VPADDBZ256rrkz,        X86::VPADDBZ256rmkz,        0 },
     { X86::VPADDDZ256rrkz,        X86::VPADDDZ256rmkz,        0 },
     { X86::VPADDQZ256rrkz,        X86::VPADDQZ256rmkz,        0 },
@@ -2436,6 +2716,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPANDNDZ256rrkz,       X86::VPANDNDZ256rmkz,       0 },
     { X86::VPANDNQZ256rrkz,       X86::VPANDNQZ256rmkz,       0 },
     { X86::VPANDQZ256rrkz,        X86::VPANDQZ256rmkz,        0 },
+    { X86::VPAVGBZ256rrkz,        X86::VPAVGBZ256rmkz,        0 },
+    { X86::VPAVGWZ256rrkz,        X86::VPAVGWZ256rmkz,        0 },
     { X86::VPERMBZ256rrkz,        X86::VPERMBZ256rmkz,        0 },
     { X86::VPERMDZ256rrkz,        X86::VPERMDZ256rmkz,        0 },
     { X86::VPERMILPDZ256rrkz,     X86::VPERMILPDZ256rmkz,     0 },
@@ -2446,9 +2728,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPERMWZ256rrkz,        X86::VPERMWZ256rmkz,        0 },
     { X86::VPMADDUBSWZ256rrkz,    X86::VPMADDUBSWZ256rmkz,    0 },
     { X86::VPMADDWDZ256rrkz,      X86::VPMADDWDZ256rmkz,      0 },
+    { X86::VPMAXSBZ256rrkz,       X86::VPMAXSBZ256rmkz,       0 },
+    { X86::VPMAXSDZ256rrkz,       X86::VPMAXSDZ256rmkz,       0 },
+    { X86::VPMAXSQZ256rrkz,       X86::VPMAXSQZ256rmkz,       0 },
+    { X86::VPMAXSWZ256rrkz,       X86::VPMAXSWZ256rmkz,       0 },
+    { X86::VPMAXUBZ256rrkz,       X86::VPMAXUBZ256rmkz,       0 },
+    { X86::VPMAXUDZ256rrkz,       X86::VPMAXUDZ256rmkz,       0 },
+    { X86::VPMAXUQZ256rrkz,       X86::VPMAXUQZ256rmkz,       0 },
+    { X86::VPMAXUWZ256rrkz,       X86::VPMAXUWZ256rmkz,       0 },
+    { X86::VPMINSBZ256rrkz,       X86::VPMINSBZ256rmkz,       0 },
+    { X86::VPMINSDZ256rrkz,       X86::VPMINSDZ256rmkz,       0 },
+    { X86::VPMINSQZ256rrkz,       X86::VPMINSQZ256rmkz,       0 },
+    { X86::VPMINSWZ256rrkz,       X86::VPMINSWZ256rmkz,       0 },
+    { X86::VPMINUBZ256rrkz,       X86::VPMINUBZ256rmkz,       0 },
+    { X86::VPMINUDZ256rrkz,       X86::VPMINUDZ256rmkz,       0 },
+    { X86::VPMINUQZ256rrkz,       X86::VPMINUQZ256rmkz,       0 },
+    { X86::VPMINUWZ256rrkz,       X86::VPMINUWZ256rmkz,       0 },
+    { X86::VPMULDQZ256rrkz,       X86::VPMULDQZ256rmkz,       0 },
+    { X86::VPMULLDZ256rrkz,       X86::VPMULLDZ256rmkz,       0 },
+    { X86::VPMULLQZ256rrkz,       X86::VPMULLQZ256rmkz,       0 },
+    { X86::VPMULLWZ256rrkz,       X86::VPMULLWZ256rmkz,       0 },
+    { X86::VPMULUDQZ256rrkz,      X86::VPMULUDQZ256rmkz,      0 },
     { X86::VPORDZ256rrkz,         X86::VPORDZ256rmkz,         0 },
     { X86::VPORQZ256rrkz,         X86::VPORQZ256rmkz,         0 },
     { X86::VPSHUFBZ256rrkz,       X86::VPSHUFBZ256rmkz,       0 },
+    { X86::VPSLLDZ256rrkz,        X86::VPSLLDZ256rmkz,        0 },
+    { X86::VPSLLQZ256rrkz,        X86::VPSLLQZ256rmkz,        0 },
+    { X86::VPSLLVDZ256rrkz,       X86::VPSLLVDZ256rmkz,       0 },
+    { X86::VPSLLVQZ256rrkz,       X86::VPSLLVQZ256rmkz,       0 },
+    { X86::VPSLLVWZ256rrkz,       X86::VPSLLVWZ256rmkz,       0 },
+    { X86::VPSLLWZ256rrkz,        X86::VPSLLWZ256rmkz,        0 },
+    { X86::VPSRADZ256rrkz,        X86::VPSRADZ256rmkz,        0 },
+    { X86::VPSRAQZ256rrkz,        X86::VPSRAQZ256rmkz,        0 },
+    { X86::VPSRAVDZ256rrkz,       X86::VPSRAVDZ256rmkz,       0 },
+    { X86::VPSRAVQZ256rrkz,       X86::VPSRAVQZ256rmkz,       0 },
+    { X86::VPSRAVWZ256rrkz,       X86::VPSRAVWZ256rmkz,       0 },
+    { X86::VPSRAWZ256rrkz,        X86::VPSRAWZ256rmkz,        0 },
+    { X86::VPSRLDZ256rrkz,        X86::VPSRLDZ256rmkz,        0 },
+    { X86::VPSRLQZ256rrkz,        X86::VPSRLQZ256rmkz,        0 },
+    { X86::VPSRLVDZ256rrkz,       X86::VPSRLVDZ256rmkz,       0 },
+    { X86::VPSRLVQZ256rrkz,       X86::VPSRLVQZ256rmkz,       0 },
+    { X86::VPSRLVWZ256rrkz,       X86::VPSRLVWZ256rmkz,       0 },
+    { X86::VPSRLWZ256rrkz,        X86::VPSRLWZ256rmkz,        0 },
     { X86::VPSUBBZ256rrkz,        X86::VPSUBBZ256rmkz,        0 },
     { X86::VPSUBDZ256rrkz,        X86::VPSUBDZ256rmkz,        0 },
     { X86::VPSUBQZ256rrkz,        X86::VPSUBQZ256rmkz,        0 },
@@ -2467,6 +2788,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPUNPCKLWDZ256rrkz,    X86::VPUNPCKLWDZ256rmkz,    0 },
     { X86::VPXORDZ256rrkz,        X86::VPXORDZ256rmkz,        0 },
     { X86::VPXORQZ256rrkz,        X86::VPXORQZ256rmkz,        0 },
+    { X86::VSHUFPDZ256rrikz,      X86::VSHUFPDZ256rmikz,      0 },
+    { X86::VSHUFPSZ256rrikz,      X86::VSHUFPSZ256rmikz,      0 },
     { X86::VSUBPDZ256rrkz,        X86::VSUBPDZ256rmkz,        0 },
     { X86::VSUBPSZ256rrkz,        X86::VSUBPSZ256rmkz,        0 },
     { X86::VUNPCKHPDZ256rrkz,     X86::VUNPCKHPDZ256rmkz,     0 },
@@ -2499,6 +2822,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMULPSZ128rrkz,        X86::VMULPSZ128rmkz,        0 },
     { X86::VORPDZ128rrkz,         X86::VORPDZ128rmkz,         0 },
     { X86::VORPSZ128rrkz,         X86::VORPSZ128rmkz,         0 },
+    { X86::VPACKSSDWZ128rrkz,     X86::VPACKSSDWZ128rmkz,     0 },
+    { X86::VPACKSSWBZ128rrkz,     X86::VPACKSSWBZ128rmkz,     0 },
+    { X86::VPACKUSDWZ128rrkz,     X86::VPACKUSDWZ128rmkz,     0 },
+    { X86::VPACKUSWBZ128rrkz,     X86::VPACKUSWBZ128rmkz,     0 },
     { X86::VPADDBZ128rrkz,        X86::VPADDBZ128rmkz,        0 },
     { X86::VPADDDZ128rrkz,        X86::VPADDDZ128rmkz,        0 },
     { X86::VPADDQZ128rrkz,        X86::VPADDQZ128rmkz,        0 },
@@ -2512,15 +2839,56 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPANDNDZ128rrkz,       X86::VPANDNDZ128rmkz,       0 },
     { X86::VPANDNQZ128rrkz,       X86::VPANDNQZ128rmkz,       0 },
     { X86::VPANDQZ128rrkz,        X86::VPANDQZ128rmkz,        0 },
+    { X86::VPAVGBZ128rrkz,        X86::VPAVGBZ128rmkz,        0 },
+    { X86::VPAVGWZ128rrkz,        X86::VPAVGWZ128rmkz,        0 },
     { X86::VPERMBZ128rrkz,        X86::VPERMBZ128rmkz,        0 },
     { X86::VPERMILPDZ128rrkz,     X86::VPERMILPDZ128rmkz,     0 },
     { X86::VPERMILPSZ128rrkz,     X86::VPERMILPSZ128rmkz,     0 },
     { X86::VPERMWZ128rrkz,        X86::VPERMWZ128rmkz,        0 },
     { X86::VPMADDUBSWZ128rrkz,    X86::VPMADDUBSWZ128rmkz,    0 },
     { X86::VPMADDWDZ128rrkz,      X86::VPMADDWDZ128rmkz,      0 },
+    { X86::VPMAXSBZ128rrkz,       X86::VPMAXSBZ128rmkz,       0 },
+    { X86::VPMAXSDZ128rrkz,       X86::VPMAXSDZ128rmkz,       0 },
+    { X86::VPMAXSQZ128rrkz,       X86::VPMAXSQZ128rmkz,       0 },
+    { X86::VPMAXSWZ128rrkz,       X86::VPMAXSWZ128rmkz,       0 },
+    { X86::VPMAXUBZ128rrkz,       X86::VPMAXUBZ128rmkz,       0 },
+    { X86::VPMAXUDZ128rrkz,       X86::VPMAXUDZ128rmkz,       0 },
+    { X86::VPMAXUQZ128rrkz,       X86::VPMAXUQZ128rmkz,       0 },
+    { X86::VPMAXUWZ128rrkz,       X86::VPMAXUWZ128rmkz,       0 },
+    { X86::VPMINSBZ128rrkz,       X86::VPMINSBZ128rmkz,       0 },
+    { X86::VPMINSDZ128rrkz,       X86::VPMINSDZ128rmkz,       0 },
+    { X86::VPMINSQZ128rrkz,       X86::VPMINSQZ128rmkz,       0 },
+    { X86::VPMINSWZ128rrkz,       X86::VPMINSWZ128rmkz,       0 },
+    { X86::VPMINUBZ128rrkz,       X86::VPMINUBZ128rmkz,       0 },
+    { X86::VPMINUDZ128rrkz,       X86::VPMINUDZ128rmkz,       0 },
+    { X86::VPMINUQZ128rrkz,       X86::VPMINUQZ128rmkz,       0 },
+    { X86::VPMINUWZ128rrkz,       X86::VPMINUWZ128rmkz,       0 },
+    { X86::VPMULDQZ128rrkz,       X86::VPMULDQZ128rmkz,       0 },
+    { X86::VPMULLDZ128rrkz,       X86::VPMULLDZ128rmkz,       0 },
+    { X86::VPMULLQZ128rrkz,       X86::VPMULLQZ128rmkz,       0 },
+    { X86::VPMULLWZ128rrkz,       X86::VPMULLWZ128rmkz,       0 },
+    { X86::VPMULUDQZ128rrkz,      X86::VPMULUDQZ128rmkz,      0 },
     { X86::VPORDZ128rrkz,         X86::VPORDZ128rmkz,         0 },
     { X86::VPORQZ128rrkz,         X86::VPORQZ128rmkz,         0 },
     { X86::VPSHUFBZ128rrkz,       X86::VPSHUFBZ128rmkz,       0 },
+    { X86::VPSLLDZ128rrkz,        X86::VPSLLDZ128rmkz,        0 },
+    { X86::VPSLLQZ128rrkz,        X86::VPSLLQZ128rmkz,        0 },
+    { X86::VPSLLVDZ128rrkz,       X86::VPSLLVDZ128rmkz,       0 },
+    { X86::VPSLLVQZ128rrkz,       X86::VPSLLVQZ128rmkz,       0 },
+    { X86::VPSLLVWZ128rrkz,       X86::VPSLLVWZ128rmkz,       0 },
+    { X86::VPSLLWZ128rrkz,        X86::VPSLLWZ128rmkz,        0 },
+    { X86::VPSRADZ128rrkz,        X86::VPSRADZ128rmkz,        0 },
+    { X86::VPSRAQZ128rrkz,        X86::VPSRAQZ128rmkz,        0 },
+    { X86::VPSRAVDZ128rrkz,       X86::VPSRAVDZ128rmkz,       0 },
+    { X86::VPSRAVQZ128rrkz,       X86::VPSRAVQZ128rmkz,       0 },
+    { X86::VPSRAVWZ128rrkz,       X86::VPSRAVWZ128rmkz,       0 },
+    { X86::VPSRAWZ128rrkz,        X86::VPSRAWZ128rmkz,        0 },
+    { X86::VPSRLDZ128rrkz,        X86::VPSRLDZ128rmkz,        0 },
+    { X86::VPSRLQZ128rrkz,        X86::VPSRLQZ128rmkz,        0 },
+    { X86::VPSRLVDZ128rrkz,       X86::VPSRLVDZ128rmkz,       0 },
+    { X86::VPSRLVQZ128rrkz,       X86::VPSRLVQZ128rmkz,       0 },
+    { X86::VPSRLVWZ128rrkz,       X86::VPSRLVWZ128rmkz,       0 },
+    { X86::VPSRLWZ128rrkz,        X86::VPSRLWZ128rmkz,        0 },
     { X86::VPSUBBZ128rrkz,        X86::VPSUBBZ128rmkz,        0 },
     { X86::VPSUBDZ128rrkz,        X86::VPSUBDZ128rmkz,        0 },
     { X86::VPSUBQZ128rrkz,        X86::VPSUBQZ128rmkz,        0 },
@@ -2539,6 +2907,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPUNPCKLWDZ128rrkz,    X86::VPUNPCKLWDZ128rmkz,    0 },
     { X86::VPXORDZ128rrkz,        X86::VPXORDZ128rmkz,        0 },
     { X86::VPXORQZ128rrkz,        X86::VPXORQZ128rmkz,        0 },
+    { X86::VSHUFPDZ128rrikz,      X86::VSHUFPDZ128rmikz,      0 },
+    { X86::VSHUFPSZ128rrikz,      X86::VSHUFPSZ128rmikz,      0 },
     { X86::VSUBPDZ128rrkz,        X86::VSUBPDZ128rmkz,        0 },
     { X86::VSUBPSZ128rrkz,        X86::VSUBPSZ128rmkz,        0 },
     { X86::VUNPCKHPDZ128rrkz,     X86::VUNPCKHPDZ128rmkz,     0 },
@@ -2551,6 +2921,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     // AVX-512 masked foldable instructions
     { X86::VBROADCASTSSZrk,       X86::VBROADCASTSSZmk,       TB_NO_REVERSE },
     { X86::VBROADCASTSDZrk,       X86::VBROADCASTSDZmk,       TB_NO_REVERSE },
+    { X86::VPABSBZrrk,            X86::VPABSBZrmk,            0 },
+    { X86::VPABSDZrrk,            X86::VPABSDZrmk,            0 },
+    { X86::VPABSQZrrk,            X86::VPABSQZrmk,            0 },
+    { X86::VPABSWZrrk,            X86::VPABSWZrmk,            0 },
     { X86::VPERMILPDZrik,         X86::VPERMILPDZmik,         0 },
     { X86::VPERMILPSZrik,         X86::VPERMILPSZmik,         0 },
     { X86::VPERMPDZrik,           X86::VPERMPDZmik,           0 },
@@ -2570,10 +2944,23 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPSHUFDZrik,           X86::VPSHUFDZmik,           0 },
     { X86::VPSHUFHWZrik,          X86::VPSHUFHWZmik,          0 },
     { X86::VPSHUFLWZrik,          X86::VPSHUFLWZmik,          0 },
+    { X86::VPSLLDZrik,            X86::VPSLLDZmik,            0 },
+    { X86::VPSLLQZrik,            X86::VPSLLQZmik,            0 },
+    { X86::VPSLLWZrik,            X86::VPSLLWZmik,            0 },
+    { X86::VPSRADZrik,            X86::VPSRADZmik,            0 },
+    { X86::VPSRAQZrik,            X86::VPSRAQZmik,            0 },
+    { X86::VPSRAWZrik,            X86::VPSRAWZmik,            0 },
+    { X86::VPSRLDZrik,            X86::VPSRLDZmik,            0 },
+    { X86::VPSRLQZrik,            X86::VPSRLQZmik,            0 },
+    { X86::VPSRLWZrik,            X86::VPSRLWZmik,            0 },
 
     // AVX-512VL 256-bit masked foldable instructions
     { X86::VBROADCASTSSZ256rk,    X86::VBROADCASTSSZ256mk,    TB_NO_REVERSE },
     { X86::VBROADCASTSDZ256rk,    X86::VBROADCASTSDZ256mk,    TB_NO_REVERSE },
+    { X86::VPABSBZ256rrk,         X86::VPABSBZ256rmk,         0 },
+    { X86::VPABSDZ256rrk,         X86::VPABSDZ256rmk,         0 },
+    { X86::VPABSQZ256rrk,         X86::VPABSQZ256rmk,         0 },
+    { X86::VPABSWZ256rrk,         X86::VPABSWZ256rmk,         0 },
     { X86::VPERMILPDZ256rik,      X86::VPERMILPDZ256mik,      0 },
     { X86::VPERMILPSZ256rik,      X86::VPERMILPSZ256mik,      0 },
     { X86::VPERMPDZ256rik,        X86::VPERMPDZ256mik,        0 },
@@ -2593,9 +2980,22 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPSHUFDZ256rik,        X86::VPSHUFDZ256mik,        0 },
     { X86::VPSHUFHWZ256rik,       X86::VPSHUFHWZ256mik,       0 },
     { X86::VPSHUFLWZ256rik,       X86::VPSHUFLWZ256mik,       0 },
+    { X86::VPSLLDZ256rik,         X86::VPSLLDZ256mik,         0 },
+    { X86::VPSLLQZ256rik,         X86::VPSLLQZ256mik,         0 },
+    { X86::VPSLLWZ256rik,         X86::VPSLLWZ256mik,         0 },
+    { X86::VPSRADZ256rik,         X86::VPSRADZ256mik,         0 },
+    { X86::VPSRAQZ256rik,         X86::VPSRAQZ256mik,         0 },
+    { X86::VPSRAWZ256rik,         X86::VPSRAWZ256mik,         0 },
+    { X86::VPSRLDZ256rik,         X86::VPSRLDZ256mik,         0 },
+    { X86::VPSRLQZ256rik,         X86::VPSRLQZ256mik,         0 },
+    { X86::VPSRLWZ256rik,         X86::VPSRLWZ256mik,         0 },
 
     // AVX-512VL 128-bit masked foldable instructions
     { X86::VBROADCASTSSZ128rk,    X86::VBROADCASTSSZ128mk,    TB_NO_REVERSE },
+    { X86::VPABSBZ128rrk,         X86::VPABSBZ128rmk,         0 },
+    { X86::VPABSDZ128rrk,         X86::VPABSDZ128rmk,         0 },
+    { X86::VPABSQZ128rrk,         X86::VPABSQZ128rmk,         0 },
+    { X86::VPABSWZ128rrk,         X86::VPABSWZ128rmk,         0 },
     { X86::VPERMILPDZ128rik,      X86::VPERMILPDZ128mik,      0 },
     { X86::VPERMILPSZ128rik,      X86::VPERMILPSZ128mik,      0 },
     { X86::VPMOVSXBDZ128rrk,      X86::VPMOVSXBDZ128rmk,      TB_NO_REVERSE },
@@ -2613,6 +3013,15 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPSHUFDZ128rik,        X86::VPSHUFDZ128mik,        0 },
     { X86::VPSHUFHWZ128rik,       X86::VPSHUFHWZ128mik,       0 },
     { X86::VPSHUFLWZ128rik,       X86::VPSHUFLWZ128mik,       0 },
+    { X86::VPSLLDZ128rik,         X86::VPSLLDZ128mik,         0 },
+    { X86::VPSLLQZ128rik,         X86::VPSLLQZ128mik,         0 },
+    { X86::VPSLLWZ128rik,         X86::VPSLLWZ128mik,         0 },
+    { X86::VPSRADZ128rik,         X86::VPSRADZ128mik,         0 },
+    { X86::VPSRAQZ128rik,         X86::VPSRAQZ128mik,         0 },
+    { X86::VPSRAWZ128rik,         X86::VPSRAWZ128mik,         0 },
+    { X86::VPSRLDZ128rik,         X86::VPSRLDZ128mik,         0 },
+    { X86::VPSRLQZ128rik,         X86::VPSRLQZ128mik,         0 },
+    { X86::VPSRLWZ128rik,         X86::VPSRLWZ128mik,         0 },
   };
 
   for (X86MemoryFoldTableEntry Entry : MemoryFoldTable3) {
@@ -2642,6 +3051,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     // AVX-512 foldable masked instructions
     { X86::VADDPDZrrk,         X86::VADDPDZrmk,           0 },
     { X86::VADDPSZrrk,         X86::VADDPSZrmk,           0 },
+    { X86::VADDSDZrr_Intk,     X86::VADDSDZrm_Intk,       TB_NO_REVERSE },
+    { X86::VADDSSZrr_Intk,     X86::VADDSSZrm_Intk,       TB_NO_REVERSE },
     { X86::VALIGNDZrrik,       X86::VALIGNDZrmik,         0 },
     { X86::VALIGNQZrrik,       X86::VALIGNQZrmik,         0 },
     { X86::VANDNPDZrrk,        X86::VANDNPDZrmk,          0 },
@@ -2650,6 +3061,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VANDPSZrrk,         X86::VANDPSZrmk,           0 },
     { X86::VDIVPDZrrk,         X86::VDIVPDZrmk,           0 },
     { X86::VDIVPSZrrk,         X86::VDIVPSZrmk,           0 },
+    { X86::VDIVSDZrr_Intk,     X86::VDIVSDZrm_Intk,       TB_NO_REVERSE },
+    { X86::VDIVSSZrr_Intk,     X86::VDIVSSZrm_Intk,       TB_NO_REVERSE },
     { X86::VINSERTF32x4Zrrk,   X86::VINSERTF32x4Zrmk,     0 },
     { X86::VINSERTF32x8Zrrk,   X86::VINSERTF32x8Zrmk,     0 },
     { X86::VINSERTF64x2Zrrk,   X86::VINSERTF64x2Zrmk,     0 },
@@ -2662,14 +3075,24 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMAXCPSZrrk,        X86::VMAXCPSZrmk,          0 },
     { X86::VMAXPDZrrk,         X86::VMAXPDZrmk,           0 },
     { X86::VMAXPSZrrk,         X86::VMAXPSZrmk,           0 },
+    { X86::VMAXSDZrr_Intk,     X86::VMAXSDZrm_Intk,       0 },
+    { X86::VMAXSSZrr_Intk,     X86::VMAXSSZrm_Intk,       0 },
     { X86::VMINCPDZrrk,        X86::VMINCPDZrmk,          0 },
     { X86::VMINCPSZrrk,        X86::VMINCPSZrmk,          0 },
     { X86::VMINPDZrrk,         X86::VMINPDZrmk,           0 },
     { X86::VMINPSZrrk,         X86::VMINPSZrmk,           0 },
+    { X86::VMINSDZrr_Intk,     X86::VMINSDZrm_Intk,       0 },
+    { X86::VMINSSZrr_Intk,     X86::VMINSSZrm_Intk,       0 },
     { X86::VMULPDZrrk,         X86::VMULPDZrmk,           0 },
     { X86::VMULPSZrrk,         X86::VMULPSZrmk,           0 },
+    { X86::VMULSDZrr_Intk,     X86::VMULSDZrm_Intk,       TB_NO_REVERSE },
+    { X86::VMULSSZrr_Intk,     X86::VMULSSZrm_Intk,       TB_NO_REVERSE },
     { X86::VORPDZrrk,          X86::VORPDZrmk,            0 },
     { X86::VORPSZrrk,          X86::VORPSZrmk,            0 },
+    { X86::VPACKSSDWZrrk,      X86::VPACKSSDWZrmk,        0 },
+    { X86::VPACKSSWBZrrk,      X86::VPACKSSWBZrmk,        0 },
+    { X86::VPACKUSDWZrrk,      X86::VPACKUSDWZrmk,        0 },
+    { X86::VPACKUSWBZrrk,      X86::VPACKUSWBZrmk,        0 },
     { X86::VPADDBZrrk,         X86::VPADDBZrmk,           0 },
     { X86::VPADDDZrrk,         X86::VPADDDZrmk,           0 },
     { X86::VPADDQZrrk,         X86::VPADDQZrmk,           0 },
@@ -2683,6 +3106,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPANDNDZrrk,        X86::VPANDNDZrmk,          0 },
     { X86::VPANDNQZrrk,        X86::VPANDNQZrmk,          0 },
     { X86::VPANDQZrrk,         X86::VPANDQZrmk,           0 },
+    { X86::VPAVGBZrrk,         X86::VPAVGBZrmk,           0 },
+    { X86::VPAVGWZrrk,         X86::VPAVGWZrmk,           0 },
     { X86::VPERMBZrrk,         X86::VPERMBZrmk,           0 },
     { X86::VPERMDZrrk,         X86::VPERMDZrmk,           0 },
     { X86::VPERMI2Brrk,        X86::VPERMI2Brmk,          0 },
@@ -2705,9 +3130,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPERMWZrrk,         X86::VPERMWZrmk,           0 },
     { X86::VPMADDUBSWZrrk,     X86::VPMADDUBSWZrmk,       0 },
     { X86::VPMADDWDZrrk,       X86::VPMADDWDZrmk,         0 },
+    { X86::VPMAXSBZrrk,        X86::VPMAXSBZrmk,          0 },
+    { X86::VPMAXSDZrrk,        X86::VPMAXSDZrmk,          0 },
+    { X86::VPMAXSQZrrk,        X86::VPMAXSQZrmk,          0 },
+    { X86::VPMAXSWZrrk,        X86::VPMAXSWZrmk,          0 },
+    { X86::VPMAXUBZrrk,        X86::VPMAXUBZrmk,          0 },
+    { X86::VPMAXUDZrrk,        X86::VPMAXUDZrmk,          0 },
+    { X86::VPMAXUQZrrk,        X86::VPMAXUQZrmk,          0 },
+    { X86::VPMAXUWZrrk,        X86::VPMAXUWZrmk,          0 },
+    { X86::VPMINSBZrrk,        X86::VPMINSBZrmk,          0 },
+    { X86::VPMINSDZrrk,        X86::VPMINSDZrmk,          0 },
+    { X86::VPMINSQZrrk,        X86::VPMINSQZrmk,          0 },
+    { X86::VPMINSWZrrk,        X86::VPMINSWZrmk,          0 },
+    { X86::VPMINUBZrrk,        X86::VPMINUBZrmk,          0 },
+    { X86::VPMINUDZrrk,        X86::VPMINUDZrmk,          0 },
+    { X86::VPMINUQZrrk,        X86::VPMINUQZrmk,          0 },
+    { X86::VPMINUWZrrk,        X86::VPMINUWZrmk,          0 },
+    { X86::VPMULDQZrrk,        X86::VPMULDQZrmk,          0 },
+    { X86::VPMULLDZrrk,        X86::VPMULLDZrmk,          0 },
+    { X86::VPMULLQZrrk,        X86::VPMULLQZrmk,          0 },
+    { X86::VPMULLWZrrk,        X86::VPMULLWZrmk,          0 },
+    { X86::VPMULUDQZrrk,       X86::VPMULUDQZrmk,         0 },
     { X86::VPORDZrrk,          X86::VPORDZrmk,            0 },
     { X86::VPORQZrrk,          X86::VPORQZrmk,            0 },
     { X86::VPSHUFBZrrk,        X86::VPSHUFBZrmk,          0 },
+    { X86::VPSLLDZrrk,         X86::VPSLLDZrmk,           0 },
+    { X86::VPSLLQZrrk,         X86::VPSLLQZrmk,           0 },
+    { X86::VPSLLVDZrrk,        X86::VPSLLVDZrmk,          0 },
+    { X86::VPSLLVQZrrk,        X86::VPSLLVQZrmk,          0 },
+    { X86::VPSLLVWZrrk,        X86::VPSLLVWZrmk,          0 },
+    { X86::VPSLLWZrrk,         X86::VPSLLWZrmk,           0 },
+    { X86::VPSRADZrrk,         X86::VPSRADZrmk,           0 },
+    { X86::VPSRAQZrrk,         X86::VPSRAQZrmk,           0 },
+    { X86::VPSRAVDZrrk,        X86::VPSRAVDZrmk,          0 },
+    { X86::VPSRAVQZrrk,        X86::VPSRAVQZrmk,          0 },
+    { X86::VPSRAVWZrrk,        X86::VPSRAVWZrmk,          0 },
+    { X86::VPSRAWZrrk,         X86::VPSRAWZrmk,           0 },
+    { X86::VPSRLDZrrk,         X86::VPSRLDZrmk,           0 },
+    { X86::VPSRLQZrrk,         X86::VPSRLQZrmk,           0 },
+    { X86::VPSRLVDZrrk,        X86::VPSRLVDZrmk,          0 },
+    { X86::VPSRLVQZrrk,        X86::VPSRLVQZrmk,          0 },
+    { X86::VPSRLVWZrrk,        X86::VPSRLVWZrmk,          0 },
+    { X86::VPSRLWZrrk,         X86::VPSRLWZrmk,           0 },
     { X86::VPSUBBZrrk,         X86::VPSUBBZrmk,           0 },
     { X86::VPSUBDZrrk,         X86::VPSUBDZrmk,           0 },
     { X86::VPSUBQZrrk,         X86::VPSUBQZrmk,           0 },
@@ -2727,8 +3191,12 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPUNPCKLWDZrrk,     X86::VPUNPCKLWDZrmk,       0 },
     { X86::VPXORDZrrk,         X86::VPXORDZrmk,           0 },
     { X86::VPXORQZrrk,         X86::VPXORQZrmk,           0 },
+    { X86::VSHUFPDZrrik,       X86::VSHUFPDZrmik,         0 },
+    { X86::VSHUFPSZrrik,       X86::VSHUFPSZrmik,         0 },
     { X86::VSUBPDZrrk,         X86::VSUBPDZrmk,           0 },
     { X86::VSUBPSZrrk,         X86::VSUBPSZrmk,           0 },
+    { X86::VSUBSDZrr_Intk,     X86::VSUBSDZrm_Intk,       TB_NO_REVERSE },
+    { X86::VSUBSSZrr_Intk,     X86::VSUBSSZrm_Intk,       TB_NO_REVERSE },
     { X86::VUNPCKHPDZrrk,      X86::VUNPCKHPDZrmk,        0 },
     { X86::VUNPCKHPSZrrk,      X86::VUNPCKHPSZrmk,        0 },
     { X86::VUNPCKLPDZrrk,      X86::VUNPCKLPDZrmk,        0 },
@@ -2763,6 +3231,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMULPSZ256rrk,      X86::VMULPSZ256rmk,        0 },
     { X86::VORPDZ256rrk,       X86::VORPDZ256rmk,         0 },
     { X86::VORPSZ256rrk,       X86::VORPSZ256rmk,         0 },
+    { X86::VPACKSSDWZ256rrk,   X86::VPACKSSDWZ256rmk,     0 },
+    { X86::VPACKSSWBZ256rrk,   X86::VPACKSSWBZ256rmk,     0 },
+    { X86::VPACKUSDWZ256rrk,   X86::VPACKUSDWZ256rmk,     0 },
+    { X86::VPACKUSWBZ256rrk,   X86::VPACKUSWBZ256rmk,     0 },
     { X86::VPADDBZ256rrk,      X86::VPADDBZ256rmk,        0 },
     { X86::VPADDDZ256rrk,      X86::VPADDDZ256rmk,        0 },
     { X86::VPADDQZ256rrk,      X86::VPADDQZ256rmk,        0 },
@@ -2776,6 +3248,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPANDNDZ256rrk,     X86::VPANDNDZ256rmk,       0 },
     { X86::VPANDNQZ256rrk,     X86::VPANDNQZ256rmk,       0 },
     { X86::VPANDQZ256rrk,      X86::VPANDQZ256rmk,        0 },
+    { X86::VPAVGBZ256rrk,      X86::VPAVGBZ256rmk,        0 },
+    { X86::VPAVGWZ256rrk,      X86::VPAVGWZ256rmk,        0 },
     { X86::VPERMBZ256rrk,      X86::VPERMBZ256rmk,        0 },
     { X86::VPERMDZ256rrk,      X86::VPERMDZ256rmk,        0 },
     { X86::VPERMI2B256rrk,     X86::VPERMI2B256rmk,       0 },
@@ -2798,9 +3272,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPERMWZ256rrk,      X86::VPERMWZ256rmk,        0 },
     { X86::VPMADDUBSWZ256rrk,  X86::VPMADDUBSWZ256rmk,    0 },
     { X86::VPMADDWDZ256rrk,    X86::VPMADDWDZ256rmk,      0 },
+    { X86::VPMAXSBZ256rrk,     X86::VPMAXSBZ256rmk,       0 },
+    { X86::VPMAXSDZ256rrk,     X86::VPMAXSDZ256rmk,       0 },
+    { X86::VPMAXSQZ256rrk,     X86::VPMAXSQZ256rmk,       0 },
+    { X86::VPMAXSWZ256rrk,     X86::VPMAXSWZ256rmk,       0 },
+    { X86::VPMAXUBZ256rrk,     X86::VPMAXUBZ256rmk,       0 },
+    { X86::VPMAXUDZ256rrk,     X86::VPMAXUDZ256rmk,       0 },
+    { X86::VPMAXUQZ256rrk,     X86::VPMAXUQZ256rmk,       0 },
+    { X86::VPMAXUWZ256rrk,     X86::VPMAXUWZ256rmk,       0 },
+    { X86::VPMINSBZ256rrk,     X86::VPMINSBZ256rmk,       0 },
+    { X86::VPMINSDZ256rrk,     X86::VPMINSDZ256rmk,       0 },
+    { X86::VPMINSQZ256rrk,     X86::VPMINSQZ256rmk,       0 },
+    { X86::VPMINSWZ256rrk,     X86::VPMINSWZ256rmk,       0 },
+    { X86::VPMINUBZ256rrk,     X86::VPMINUBZ256rmk,       0 },
+    { X86::VPMINUDZ256rrk,     X86::VPMINUDZ256rmk,       0 },
+    { X86::VPMINUQZ256rrk,     X86::VPMINUQZ256rmk,       0 },
+    { X86::VPMINUWZ256rrk,     X86::VPMINUWZ256rmk,       0 },
+    { X86::VPMULDQZ256rrk,     X86::VPMULDQZ256rmk,       0 },
+    { X86::VPMULLDZ256rrk,     X86::VPMULLDZ256rmk,       0 },
+    { X86::VPMULLQZ256rrk,     X86::VPMULLQZ256rmk,       0 },
+    { X86::VPMULLWZ256rrk,     X86::VPMULLWZ256rmk,       0 },
+    { X86::VPMULUDQZ256rrk,    X86::VPMULUDQZ256rmk,      0 },
     { X86::VPORDZ256rrk,       X86::VPORDZ256rmk,         0 },
     { X86::VPORQZ256rrk,       X86::VPORQZ256rmk,         0 },
     { X86::VPSHUFBZ256rrk,     X86::VPSHUFBZ256rmk,       0 },
+    { X86::VPSLLDZ256rrk,      X86::VPSLLDZ256rmk,        0 },
+    { X86::VPSLLQZ256rrk,      X86::VPSLLQZ256rmk,        0 },
+    { X86::VPSLLVDZ256rrk,     X86::VPSLLVDZ256rmk,       0 },
+    { X86::VPSLLVQZ256rrk,     X86::VPSLLVQZ256rmk,       0 },
+    { X86::VPSLLVWZ256rrk,     X86::VPSLLVWZ256rmk,       0 },
+    { X86::VPSLLWZ256rrk,      X86::VPSLLWZ256rmk,        0 },
+    { X86::VPSRADZ256rrk,      X86::VPSRADZ256rmk,        0 },
+    { X86::VPSRAQZ256rrk,      X86::VPSRAQZ256rmk,        0 },
+    { X86::VPSRAVDZ256rrk,     X86::VPSRAVDZ256rmk,       0 },
+    { X86::VPSRAVQZ256rrk,     X86::VPSRAVQZ256rmk,       0 },
+    { X86::VPSRAVWZ256rrk,     X86::VPSRAVWZ256rmk,       0 },
+    { X86::VPSRAWZ256rrk,      X86::VPSRAWZ256rmk,        0 },
+    { X86::VPSRLDZ256rrk,      X86::VPSRLDZ256rmk,        0 },
+    { X86::VPSRLQZ256rrk,      X86::VPSRLQZ256rmk,        0 },
+    { X86::VPSRLVDZ256rrk,     X86::VPSRLVDZ256rmk,       0 },
+    { X86::VPSRLVQZ256rrk,     X86::VPSRLVQZ256rmk,       0 },
+    { X86::VPSRLVWZ256rrk,     X86::VPSRLVWZ256rmk,       0 },
+    { X86::VPSRLWZ256rrk,      X86::VPSRLWZ256rmk,        0 },
     { X86::VPSUBBZ256rrk,      X86::VPSUBBZ256rmk,        0 },
     { X86::VPSUBDZ256rrk,      X86::VPSUBDZ256rmk,        0 },
     { X86::VPSUBQZ256rrk,      X86::VPSUBQZ256rmk,        0 },
@@ -2821,6 +3334,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPUNPCKLWDZ256rrk,  X86::VPUNPCKLWDZ256rmk,    0 },
     { X86::VPXORDZ256rrk,      X86::VPXORDZ256rmk,        0 },
     { X86::VPXORQZ256rrk,      X86::VPXORQZ256rmk,        0 },
+    { X86::VSHUFPDZ256rrik,    X86::VSHUFPDZ256rmik,      0 },
+    { X86::VSHUFPSZ256rrik,    X86::VSHUFPSZ256rmik,      0 },
     { X86::VSUBPDZ256rrk,      X86::VSUBPDZ256rmk,        0 },
     { X86::VSUBPSZ256rrk,      X86::VSUBPSZ256rmk,        0 },
     { X86::VUNPCKHPDZ256rrk,   X86::VUNPCKHPDZ256rmk,     0 },
@@ -2853,6 +3368,10 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VMULPSZ128rrk,      X86::VMULPSZ128rmk,        0 },
     { X86::VORPDZ128rrk,       X86::VORPDZ128rmk,         0 },
     { X86::VORPSZ128rrk,       X86::VORPSZ128rmk,         0 },
+    { X86::VPACKSSDWZ128rrk,   X86::VPACKSSDWZ128rmk,     0 },
+    { X86::VPACKSSWBZ128rrk,   X86::VPACKSSWBZ128rmk,     0 },
+    { X86::VPACKUSDWZ128rrk,   X86::VPACKUSDWZ128rmk,     0 },
+    { X86::VPACKUSWBZ128rrk,   X86::VPACKUSWBZ128rmk,     0 },
     { X86::VPADDBZ128rrk,      X86::VPADDBZ128rmk,        0 },
     { X86::VPADDDZ128rrk,      X86::VPADDDZ128rmk,        0 },
     { X86::VPADDQZ128rrk,      X86::VPADDQZ128rmk,        0 },
@@ -2866,6 +3385,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPANDNDZ128rrk,     X86::VPANDNDZ128rmk,       0 },
     { X86::VPANDNQZ128rrk,     X86::VPANDNQZ128rmk,       0 },
     { X86::VPANDQZ128rrk,      X86::VPANDQZ128rmk,        0 },
+    { X86::VPAVGBZ128rrk,      X86::VPAVGBZ128rmk,        0 },
+    { X86::VPAVGWZ128rrk,      X86::VPAVGWZ128rmk,        0 },
     { X86::VPERMBZ128rrk,      X86::VPERMBZ128rmk,        0 },
     { X86::VPERMI2B128rrk,     X86::VPERMI2B128rmk,       0 },
     { X86::VPERMI2D128rrk,     X86::VPERMI2D128rmk,       0 },
@@ -2884,9 +3405,48 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPERMWZ128rrk,      X86::VPERMWZ128rmk,        0 },
     { X86::VPMADDUBSWZ128rrk,  X86::VPMADDUBSWZ128rmk,    0 },
     { X86::VPMADDWDZ128rrk,    X86::VPMADDWDZ128rmk,      0 },
+    { X86::VPMAXSBZ128rrk,     X86::VPMAXSBZ128rmk,       0 },
+    { X86::VPMAXSDZ128rrk,     X86::VPMAXSDZ128rmk,       0 },
+    { X86::VPMAXSQZ128rrk,     X86::VPMAXSQZ128rmk,       0 },
+    { X86::VPMAXSWZ128rrk,     X86::VPMAXSWZ128rmk,       0 },
+    { X86::VPMAXUBZ128rrk,     X86::VPMAXUBZ128rmk,       0 },
+    { X86::VPMAXUDZ128rrk,     X86::VPMAXUDZ128rmk,       0 },
+    { X86::VPMAXUQZ128rrk,     X86::VPMAXUQZ128rmk,       0 },
+    { X86::VPMAXUWZ128rrk,     X86::VPMAXUWZ128rmk,       0 },
+    { X86::VPMINSBZ128rrk,     X86::VPMINSBZ128rmk,       0 },
+    { X86::VPMINSDZ128rrk,     X86::VPMINSDZ128rmk,       0 },
+    { X86::VPMINSQZ128rrk,     X86::VPMINSQZ128rmk,       0 },
+    { X86::VPMINSWZ128rrk,     X86::VPMINSWZ128rmk,       0 },
+    { X86::VPMINUBZ128rrk,     X86::VPMINUBZ128rmk,       0 },
+    { X86::VPMINUDZ128rrk,     X86::VPMINUDZ128rmk,       0 },
+    { X86::VPMINUQZ128rrk,     X86::VPMINUQZ128rmk,       0 },
+    { X86::VPMINUWZ128rrk,     X86::VPMINUWZ128rmk,       0 },
+    { X86::VPMULDQZ128rrk,     X86::VPMULDQZ128rmk,       0 },
+    { X86::VPMULLDZ128rrk,     X86::VPMULLDZ128rmk,       0 },
+    { X86::VPMULLQZ128rrk,     X86::VPMULLQZ128rmk,       0 },
+    { X86::VPMULLWZ128rrk,     X86::VPMULLWZ128rmk,       0 },
+    { X86::VPMULUDQZ128rrk,    X86::VPMULUDQZ128rmk,      0 },
     { X86::VPORDZ128rrk,       X86::VPORDZ128rmk,         0 },
     { X86::VPORQZ128rrk,       X86::VPORQZ128rmk,         0 },
     { X86::VPSHUFBZ128rrk,     X86::VPSHUFBZ128rmk,       0 },
+    { X86::VPSLLDZ128rrk,      X86::VPSLLDZ128rmk,        0 },
+    { X86::VPSLLQZ128rrk,      X86::VPSLLQZ128rmk,        0 },
+    { X86::VPSLLVDZ128rrk,     X86::VPSLLVDZ128rmk,       0 },
+    { X86::VPSLLVQZ128rrk,     X86::VPSLLVQZ128rmk,       0 },
+    { X86::VPSLLVWZ128rrk,     X86::VPSLLVWZ128rmk,       0 },
+    { X86::VPSLLWZ128rrk,      X86::VPSLLWZ128rmk,        0 },
+    { X86::VPSRADZ128rrk,      X86::VPSRADZ128rmk,        0 },
+    { X86::VPSRAQZ128rrk,      X86::VPSRAQZ128rmk,        0 },
+    { X86::VPSRAVDZ128rrk,     X86::VPSRAVDZ128rmk,       0 },
+    { X86::VPSRAVQZ128rrk,     X86::VPSRAVQZ128rmk,       0 },
+    { X86::VPSRAVWZ128rrk,     X86::VPSRAVWZ128rmk,       0 },
+    { X86::VPSRAWZ128rrk,      X86::VPSRAWZ128rmk,        0 },
+    { X86::VPSRLDZ128rrk,      X86::VPSRLDZ128rmk,        0 },
+    { X86::VPSRLQZ128rrk,      X86::VPSRLQZ128rmk,        0 },
+    { X86::VPSRLVDZ128rrk,     X86::VPSRLVDZ128rmk,       0 },
+    { X86::VPSRLVQZ128rrk,     X86::VPSRLVQZ128rmk,       0 },
+    { X86::VPSRLVWZ128rrk,     X86::VPSRLVWZ128rmk,       0 },
+    { X86::VPSRLWZ128rrk,      X86::VPSRLWZ128rmk,        0 },
     { X86::VPSUBBZ128rrk,      X86::VPSUBBZ128rmk,        0 },
     { X86::VPSUBDZ128rrk,      X86::VPSUBDZ128rmk,        0 },
     { X86::VPSUBQZ128rrk,      X86::VPSUBQZ128rmk,        0 },
@@ -2907,6 +3467,8 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
     { X86::VPUNPCKLWDZ128rrk,  X86::VPUNPCKLWDZ128rmk,    0 },
     { X86::VPXORDZ128rrk,      X86::VPXORDZ128rmk,        0 },
     { X86::VPXORQZ128rrk,      X86::VPXORQZ128rmk,        0 },
+    { X86::VSHUFPDZ128rrik,    X86::VSHUFPDZ128rmik,      0 },
+    { X86::VSHUFPSZ128rrik,    X86::VSHUFPSZ128rmik,      0 },
     { X86::VSUBPDZ128rrk,      X86::VSUBPDZ128rmk,        0 },
     { X86::VSUBPSZ128rrk,      X86::VSUBPSZ128rmk,        0 },
     { X86::VUNPCKHPDZ128rrk,   X86::VUNPCKHPDZ128rmk,     0 },
@@ -4445,6 +5007,18 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
     return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
                                                    OpIdx1, OpIdx2);
   }
+  case X86::PFSUBrr:
+  case X86::PFSUBRrr: {
+    // PFSUB  x, y: x = x - y
+    // PFSUBR x, y: x = y - x
+    // Swapping the operands therefore requires switching to the other opcode.
+    unsigned Opc =
+        (X86::PFSUBRrr == MI.getOpcode() ? X86::PFSUBrr : X86::PFSUBRrr);
+    auto &WorkingMI = cloneIfNew(MI);
+    WorkingMI.setDesc(get(Opc));
+    return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false,
+                                                   OpIdx1, OpIdx2);
+  }
   case X86::BLENDPDrri:
   case X86::BLENDPSrri:
   case X86::PBLENDWrri:
@@ -4714,18 +5288,30 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
   case X86::VPTERNLOGQZrri:      case X86::VPTERNLOGQZrmi:
   case X86::VPTERNLOGQZ128rri:   case X86::VPTERNLOGQZ128rmi:
   case X86::VPTERNLOGQZ256rri:   case X86::VPTERNLOGQZ256rmi:
-  case X86::VPTERNLOGDZrrik:     case X86::VPTERNLOGDZrmik:
-  case X86::VPTERNLOGDZ128rrik:  case X86::VPTERNLOGDZ128rmik:
-  case X86::VPTERNLOGDZ256rrik:  case X86::VPTERNLOGDZ256rmik:
-  case X86::VPTERNLOGQZrrik:     case X86::VPTERNLOGQZrmik:
-  case X86::VPTERNLOGQZ128rrik:  case X86::VPTERNLOGQZ128rmik:
-  case X86::VPTERNLOGQZ256rrik:  case X86::VPTERNLOGQZ256rmik:
+  case X86::VPTERNLOGDZrrik:
+  case X86::VPTERNLOGDZ128rrik:
+  case X86::VPTERNLOGDZ256rrik:
+  case X86::VPTERNLOGQZrrik:
+  case X86::VPTERNLOGQZ128rrik:
+  case X86::VPTERNLOGQZ256rrik:
   case X86::VPTERNLOGDZrrikz:    case X86::VPTERNLOGDZrmikz:
   case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
   case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
   case X86::VPTERNLOGQZrrikz:    case X86::VPTERNLOGQZrmikz:
   case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
-  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz: {
+  case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
+  case X86::VPTERNLOGDZ128rmbi:
+  case X86::VPTERNLOGDZ256rmbi:
+  case X86::VPTERNLOGDZrmbi:
+  case X86::VPTERNLOGQZ128rmbi:
+  case X86::VPTERNLOGQZ256rmbi:
+  case X86::VPTERNLOGQZrmbi:
+  case X86::VPTERNLOGDZ128rmbikz:
+  case X86::VPTERNLOGDZ256rmbikz:
+  case X86::VPTERNLOGDZrmbikz:
+  case X86::VPTERNLOGQZ128rmbikz:
+  case X86::VPTERNLOGQZ256rmbikz:
+  case X86::VPTERNLOGQZrmbikz: {
     auto &WorkingMI = cloneIfNew(MI);
     if (!commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2))
       return nullptr;
@@ -4906,18 +5492,30 @@ bool X86InstrInfo::findCommutedOpIndices(MachineInstr &MI, unsigned &SrcOpIdx1,
   case X86::VPTERNLOGQZrri:      case X86::VPTERNLOGQZrmi:
   case X86::VPTERNLOGQZ128rri:   case X86::VPTERNLOGQZ128rmi:
   case X86::VPTERNLOGQZ256rri:   case X86::VPTERNLOGQZ256rmi:
-  case X86::VPTERNLOGDZrrik:     case X86::VPTERNLOGDZrmik:
-  case X86::VPTERNLOGDZ128rrik:  case X86::VPTERNLOGDZ128rmik:
-  case X86::VPTERNLOGDZ256rrik:  case X86::VPTERNLOGDZ256rmik:
-  case X86::VPTERNLOGQZrrik:     case X86::VPTERNLOGQZrmik:
-  case X86::VPTERNLOGQZ128rrik:  case X86::VPTERNLOGQZ128rmik:
-  case X86::VPTERNLOGQZ256rrik:  case X86::VPTERNLOGQZ256rmik:
+  case X86::VPTERNLOGDZrrik:
+  case X86::VPTERNLOGDZ128rrik:
+  case X86::VPTERNLOGDZ256rrik:
+  case X86::VPTERNLOGQZrrik:
+  case X86::VPTERNLOGQZ128rrik:
+  case X86::VPTERNLOGQZ256rrik:
   case X86::VPTERNLOGDZrrikz:    case X86::VPTERNLOGDZrmikz:
   case X86::VPTERNLOGDZ128rrikz: case X86::VPTERNLOGDZ128rmikz:
   case X86::VPTERNLOGDZ256rrikz: case X86::VPTERNLOGDZ256rmikz:
   case X86::VPTERNLOGQZrrikz:    case X86::VPTERNLOGQZrmikz:
   case X86::VPTERNLOGQZ128rrikz: case X86::VPTERNLOGQZ128rmikz:
   case X86::VPTERNLOGQZ256rrikz: case X86::VPTERNLOGQZ256rmikz:
+  case X86::VPTERNLOGDZ128rmbi:
+  case X86::VPTERNLOGDZ256rmbi:
+  case X86::VPTERNLOGDZrmbi:
+  case X86::VPTERNLOGQZ128rmbi:
+  case X86::VPTERNLOGQZ256rmbi:
+  case X86::VPTERNLOGQZrmbi:
+  case X86::VPTERNLOGDZ128rmbikz:
+  case X86::VPTERNLOGDZ256rmbikz:
+  case X86::VPTERNLOGDZrmbikz:
+  case X86::VPTERNLOGQZ128rmbikz:
+  case X86::VPTERNLOGQZ256rmbikz:
+  case X86::VPTERNLOGQZrmbikz:
     return findThreeSrcCommutedOpIndices(MI, SrcOpIdx1, SrcOpIdx2);
   default:
     const X86InstrFMA3Group *FMA3Group =
@@ -5239,9 +5837,9 @@ bool X86InstrInfo::canMakeTailCallConditional(
     return false;
   }
 
-  if (Subtarget.isTargetWin64()) {
+  const MachineFunction *MF = TailCall.getParent()->getParent();
+  if (Subtarget.isTargetWin64() && MF->hasWinCFI()) {
     // Conditional tail calls confuse the Win64 unwinder.
-    // TODO: Allow them for "leaf" functions; PR30337.
     return false;
   }
 
@@ -5251,8 +5849,7 @@ bool X86InstrInfo::canMakeTailCallConditional(
     return false;
   }
 
-  const X86MachineFunctionInfo *X86FI =
-      TailCall.getParent()->getParent()->getInfo<X86MachineFunctionInfo>();
+  const X86MachineFunctionInfo *X86FI = MF->getInfo<X86MachineFunctionInfo>();
   if (X86FI->getTCReturnAddrDelta() != 0 ||
       TailCall.getOperand(1).getImm() != 0) {
     // A conditional tail call cannot do any stack adjustment.
@@ -5292,6 +5889,17 @@ void X86InstrInfo::replaceBranchWithTailCall(
   MIB->addOperand(BranchCond[0]); // Condition.
   MIB.copyImplicitOps(TailCall); // Regmask and (imp-used) parameters.
 
+  // Add implicit uses and defs of all live regs potentially clobbered by the
+  // call. This way they still appear live across the call.
+  LivePhysRegs LiveRegs(&getRegisterInfo());
+  LiveRegs.addLiveOuts(MBB);
+  SmallVector<std::pair<unsigned, const MachineOperand *>, 8> Clobbers;
+  LiveRegs.stepForward(*MIB, Clobbers);
+  for (const auto &C : Clobbers) {
+    MIB.addReg(C.first, RegState::Implicit);
+    MIB.addReg(C.first, RegState::Implicit | RegState::Define);
+  }
+
   I->eraseFromParent();
 }
 
@@ -5701,8 +6309,6 @@ static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg,
 
   // SrcReg(MaskReg) -> DestReg(GR64)
   // SrcReg(MaskReg) -> DestReg(GR32)
-  // SrcReg(MaskReg) -> DestReg(GR16)
-  // SrcReg(MaskReg) -> DestReg(GR8)
 
   // All KMASK RegClasses hold the same k registers, can be tested against anyone.
   if (X86::VK16RegClass.contains(SrcReg)) {
@@ -5712,20 +6318,10 @@ static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg,
     }
     if (X86::GR32RegClass.contains(DestReg))
       return Subtarget.hasBWI() ? X86::KMOVDrk : X86::KMOVWrk;
-    if (X86::GR16RegClass.contains(DestReg)) {
-      DestReg = getX86SubSuperRegister(DestReg, 32);
-      return X86::KMOVWrk;
-    }
-    if (X86::GR8RegClass.contains(DestReg)) {
-      DestReg = getX86SubSuperRegister(DestReg, 32);
-      return Subtarget.hasDQI() ? X86::KMOVBrk : X86::KMOVWrk;
-    }
   }
 
   // SrcReg(GR64) -> DestReg(MaskReg)
   // SrcReg(GR32) -> DestReg(MaskReg)
-  // SrcReg(GR16) -> DestReg(MaskReg)
-  // SrcReg(GR8)  -> DestReg(MaskReg)
 
   // All KMASK RegClasses hold the same k registers, can be tested against anyone.
   if (X86::VK16RegClass.contains(DestReg)) {
@@ -5735,14 +6331,6 @@ static unsigned CopyToFromAsymmetricReg(unsigned &DestReg, unsigned &SrcReg,
     }
     if (X86::GR32RegClass.contains(SrcReg))
       return Subtarget.hasBWI() ? X86::KMOVDkr : X86::KMOVWkr;
-    if (X86::GR16RegClass.contains(SrcReg)) {
-      SrcReg = getX86SubSuperRegister(SrcReg, 32);
-      return X86::KMOVWkr;
-    }
-    if (X86::GR8RegClass.contains(SrcReg)) {
-      SrcReg = getX86SubSuperRegister(SrcReg, 32);
-      return Subtarget.hasDQI() ? X86::KMOVBkr : X86::KMOVWkr;
-    }
   }
 
 
@@ -6204,12 +6792,14 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
   case X86::CMP16ri:
   case X86::CMP16ri8:
   case X86::CMP8ri:
-    if (!MI.getOperand(1).isImm())
-      return false;
     SrcReg = MI.getOperand(0).getReg();
     SrcReg2 = 0;
-    CmpMask = ~0;
-    CmpValue = MI.getOperand(1).getImm();
+    if (MI.getOperand(1).isImm()) {
+      CmpMask = ~0;
+      CmpValue = MI.getOperand(1).getImm();
+    } else {
+      CmpMask = CmpValue = 0;
+    }
     return true;
   // A SUB can be used to perform comparison.
   case X86::SUB64rm:
@@ -6218,7 +6808,7 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
   case X86::SUB8rm:
     SrcReg = MI.getOperand(1).getReg();
     SrcReg2 = 0;
-    CmpMask = ~0;
+    CmpMask = 0;
     CmpValue = 0;
     return true;
   case X86::SUB64rr:
@@ -6227,7 +6817,7 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
   case X86::SUB8rr:
     SrcReg = MI.getOperand(1).getReg();
     SrcReg2 = MI.getOperand(2).getReg();
-    CmpMask = ~0;
+    CmpMask = 0;
     CmpValue = 0;
     return true;
   case X86::SUB64ri32:
@@ -6237,12 +6827,14 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
   case X86::SUB16ri:
   case X86::SUB16ri8:
   case X86::SUB8ri:
-    if (!MI.getOperand(2).isImm())
-      return false;
     SrcReg = MI.getOperand(1).getReg();
     SrcReg2 = 0;
-    CmpMask = ~0;
-    CmpValue = MI.getOperand(2).getImm();
+    if (MI.getOperand(2).isImm()) {
+      CmpMask = ~0;
+      CmpValue = MI.getOperand(2).getImm();
+    } else {
+      CmpMask = CmpValue = 0;
+    }
     return true;
   case X86::CMP64rr:
   case X86::CMP32rr:
@@ -6250,7 +6842,7 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
   case X86::CMP8rr:
     SrcReg = MI.getOperand(0).getReg();
     SrcReg2 = MI.getOperand(1).getReg();
-    CmpMask = ~0;
+    CmpMask = 0;
     CmpValue = 0;
     return true;
   case X86::TEST8rr:
@@ -6276,8 +6868,8 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
 /// SrcReg, SrcRegs: register operands for FlagI.
 /// ImmValue: immediate for FlagI if it takes an immediate.
 inline static bool isRedundantFlagInstr(MachineInstr &FlagI, unsigned SrcReg,
-                                        unsigned SrcReg2, int ImmValue,
-                                        MachineInstr &OI) {
+                                        unsigned SrcReg2, int ImmMask,
+                                        int ImmValue, MachineInstr &OI) {
   if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) ||
        (FlagI.getOpcode() == X86::CMP32rr && OI.getOpcode() == X86::SUB32rr) ||
        (FlagI.getOpcode() == X86::CMP16rr && OI.getOpcode() == X86::SUB16rr) ||
@@ -6288,7 +6880,8 @@ inline static bool isRedundantFlagInstr(MachineInstr &FlagI, unsigned SrcReg,
         OI.getOperand(2).getReg() == SrcReg)))
     return true;
 
-  if (((FlagI.getOpcode() == X86::CMP64ri32 &&
+  if (ImmMask != 0 &&
+      ((FlagI.getOpcode() == X86::CMP64ri32 &&
         OI.getOpcode() == X86::SUB64ri32) ||
        (FlagI.getOpcode() == X86::CMP64ri8 &&
         OI.getOpcode() == X86::SUB64ri8) ||
@@ -6475,7 +7068,7 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
 
   // If we are comparing against zero, check whether we can use MI to update
   // EFLAGS. If MI is not in the same BB as CmpInstr, do not optimize.
-  bool IsCmpZero = (SrcReg2 == 0 && CmpValue == 0);
+  bool IsCmpZero = (CmpMask != 0 && CmpValue == 0);
   if (IsCmpZero && MI->getParent() != CmpInstr.getParent())
     return false;
 
@@ -6525,8 +7118,8 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
   for (; RI != RE; ++RI) {
     MachineInstr &Instr = *RI;
     // Check whether CmpInstr can be made redundant by the current instruction.
-    if (!IsCmpZero &&
-        isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpValue, Instr)) {
+    if (!IsCmpZero && isRedundantFlagInstr(CmpInstr, SrcReg, SrcReg2, CmpMask,
+                                           CmpValue, Instr)) {
       Sub = &Instr;
       break;
     }
@@ -7430,7 +8023,7 @@ static bool hasPartialRegUpdate(unsigned Opcode) {
   return false;
 }
 
-/// Inform the ExeDepsFix pass how many idle
+/// Inform the ExecutionDepsFix pass how many idle
 /// instructions we would like before a partial register update.
 unsigned X86InstrInfo::getPartialRegUpdateClearance(
     const MachineInstr &MI, unsigned OpNum,
@@ -7548,11 +8141,15 @@ static bool hasUndefRegUpdate(unsigned Opcode) {
   case X86::VCVTUSI642SDZrrb_Int:
   case X86::VCVTUSI642SDZrm_Int:
   case X86::VCVTSD2SSZrr:
-  case X86::VCVTSD2SSZrrb:
+  case X86::VCVTSD2SSZrr_Int:
+  case X86::VCVTSD2SSZrrb_Int:
   case X86::VCVTSD2SSZrm:
+  case X86::VCVTSD2SSZrm_Int:
   case X86::VCVTSS2SDZrr:
-  case X86::VCVTSS2SDZrrb:
+  case X86::VCVTSS2SDZrr_Int:
+  case X86::VCVTSS2SDZrrb_Int:
   case X86::VCVTSS2SDZrm:
+  case X86::VCVTSS2SDZrm_Int:
   case X86::VRNDSCALESDr:
   case X86::VRNDSCALESDrb:
   case X86::VRNDSCALESDm:
@@ -7579,8 +8176,8 @@ static bool hasUndefRegUpdate(unsigned Opcode) {
   return false;
 }
 
-/// Inform the ExeDepsFix pass how many idle instructions we would like before
-/// certain undef register reads.
+/// Inform the ExecutionDepsFix pass how many idle instructions we would like
+/// before certain undef register reads.
 ///
 /// This catches the VCVTSI2SD family of instructions:
 ///
@@ -7726,6 +8323,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int:
     case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int:
     case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
+    case X86::VADDSSZrr_Intk: case X86::VADDSSZrr_Intkz:
+    case X86::VDIVSSZrr_Intk: case X86::VDIVSSZrr_Intkz:
+    case X86::VMAXSSZrr_Intk: case X86::VMAXSSZrr_Intkz:
+    case X86::VMINSSZrr_Intk: case X86::VMINSSZrr_Intkz:
+    case X86::VMULSSZrr_Intk: case X86::VMULSSZrr_Intkz:
+    case X86::VSUBSSZrr_Intk: case X86::VSUBSSZrr_Intkz:
     case X86::VFMADDSS4rr_Int:   case X86::VFNMADDSS4rr_Int:
     case X86::VFMSUBSS4rr_Int:   case X86::VFNMSUBSS4rr_Int:
     case X86::VFMADD132SSr_Int:  case X86::VFNMADD132SSr_Int:
@@ -7740,6 +8343,18 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::VFMSUB132SSZr_Int: case X86::VFNMSUB132SSZr_Int:
     case X86::VFMSUB213SSZr_Int: case X86::VFNMSUB213SSZr_Int:
     case X86::VFMSUB231SSZr_Int: case X86::VFNMSUB231SSZr_Int:
+    case X86::VFMADD132SSZr_Intk: case X86::VFNMADD132SSZr_Intk:
+    case X86::VFMADD213SSZr_Intk: case X86::VFNMADD213SSZr_Intk:
+    case X86::VFMADD231SSZr_Intk: case X86::VFNMADD231SSZr_Intk:
+    case X86::VFMSUB132SSZr_Intk: case X86::VFNMSUB132SSZr_Intk:
+    case X86::VFMSUB213SSZr_Intk: case X86::VFNMSUB213SSZr_Intk:
+    case X86::VFMSUB231SSZr_Intk: case X86::VFNMSUB231SSZr_Intk:
+    case X86::VFMADD132SSZr_Intkz: case X86::VFNMADD132SSZr_Intkz:
+    case X86::VFMADD213SSZr_Intkz: case X86::VFNMADD213SSZr_Intkz:
+    case X86::VFMADD231SSZr_Intkz: case X86::VFNMADD231SSZr_Intkz:
+    case X86::VFMSUB132SSZr_Intkz: case X86::VFNMSUB132SSZr_Intkz:
+    case X86::VFMSUB213SSZr_Intkz: case X86::VFNMSUB213SSZr_Intkz:
+    case X86::VFMSUB231SSZr_Intkz: case X86::VFNMSUB231SSZr_Intkz:
       return false;
     default:
       return true;
@@ -7759,6 +8374,12 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int:
     case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int:
     case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
+    case X86::VADDSDZrr_Intk: case X86::VADDSDZrr_Intkz:
+    case X86::VDIVSDZrr_Intk: case X86::VDIVSDZrr_Intkz:
+    case X86::VMAXSDZrr_Intk: case X86::VMAXSDZrr_Intkz:
+    case X86::VMINSDZrr_Intk: case X86::VMINSDZrr_Intkz:
+    case X86::VMULSDZrr_Intk: case X86::VMULSDZrr_Intkz:
+    case X86::VSUBSDZrr_Intk: case X86::VSUBSDZrr_Intkz:
     case X86::VFMADDSD4rr_Int:   case X86::VFNMADDSD4rr_Int:
     case X86::VFMSUBSD4rr_Int:   case X86::VFNMSUBSD4rr_Int:
     case X86::VFMADD132SDr_Int:  case X86::VFNMADD132SDr_Int:
@@ -7773,6 +8394,18 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
     case X86::VFMSUB132SDZr_Int: case X86::VFNMSUB132SDZr_Int:
     case X86::VFMSUB213SDZr_Int: case X86::VFNMSUB213SDZr_Int:
     case X86::VFMSUB231SDZr_Int: case X86::VFNMSUB231SDZr_Int:
+    case X86::VFMADD132SDZr_Intk: case X86::VFNMADD132SDZr_Intk:
+    case X86::VFMADD213SDZr_Intk: case X86::VFNMADD213SDZr_Intk:
+    case X86::VFMADD231SDZr_Intk: case X86::VFNMADD231SDZr_Intk:
+    case X86::VFMSUB132SDZr_Intk: case X86::VFNMSUB132SDZr_Intk:
+    case X86::VFMSUB213SDZr_Intk: case X86::VFNMSUB213SDZr_Intk:
+    case X86::VFMSUB231SDZr_Intk: case X86::VFNMSUB231SDZr_Intk:
+    case X86::VFMADD132SDZr_Intkz: case X86::VFNMADD132SDZr_Intkz:
+    case X86::VFMADD213SDZr_Intkz: case X86::VFNMADD213SDZr_Intkz:
+    case X86::VFMADD231SDZr_Intkz: case X86::VFNMADD231SDZr_Intkz:
+    case X86::VFMSUB132SDZr_Intkz: case X86::VFNMSUB132SDZr_Intkz:
+    case X86::VFMSUB213SDZr_Intkz: case X86::VFNMSUB213SDZr_Intkz:
+    case X86::VFMSUB231SDZr_Intkz: case X86::VFNMSUB231SDZr_Intkz:
       return false;
     default:
       return true;
@@ -8347,28 +8980,29 @@ X86InstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
     break;
   }
 
-  // Check if chain operands and base addresses match.
-  if (Load1->getOperand(0) != Load2->getOperand(0) ||
-      Load1->getOperand(5) != Load2->getOperand(5))
+  // Lambda to check if both the loads have the same value for an operand index.
+  auto HasSameOp = [&](int I) {
+    return Load1->getOperand(I) == Load2->getOperand(I);
+  };
+
+  // All operands except the displacement should match.
+  if (!HasSameOp(X86::AddrBaseReg) || !HasSameOp(X86::AddrScaleAmt) ||
+      !HasSameOp(X86::AddrIndexReg) || !HasSameOp(X86::AddrSegmentReg))
     return false;
-  // Segment operands should match as well.
-  if (Load1->getOperand(4) != Load2->getOperand(4))
+
+  // Chain Operand must be the same.
+  if (!HasSameOp(5))
     return false;
-  // Scale should be 1, Index should be Reg0.
-  if (Load1->getOperand(1) == Load2->getOperand(1) &&
-      Load1->getOperand(2) == Load2->getOperand(2)) {
-    if (cast<ConstantSDNode>(Load1->getOperand(1))->getZExtValue() != 1)
-      return false;
 
-    // Now let's examine the displacements.
-    if (isa<ConstantSDNode>(Load1->getOperand(3)) &&
-        isa<ConstantSDNode>(Load2->getOperand(3))) {
-      Offset1 = cast<ConstantSDNode>(Load1->getOperand(3))->getSExtValue();
-      Offset2 = cast<ConstantSDNode>(Load2->getOperand(3))->getSExtValue();
-      return true;
-    }
-  }
-  return false;
+  // Now let's examine if the displacements are constants.
+  auto Disp1 = dyn_cast<ConstantSDNode>(Load1->getOperand(X86::AddrDisp));
+  auto Disp2 = dyn_cast<ConstantSDNode>(Load2->getOperand(X86::AddrDisp));
+  if (!Disp1 || !Disp2)
+    return false;
+
+  Offset1 = Disp1->getSExtValue();
+  Offset2 = Disp2->getSExtValue();
+  return true;
 }
 
 bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
@@ -8419,165 +9053,6 @@ bool X86InstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
   return true;
 }
 
-bool X86InstrInfo::shouldScheduleAdjacent(const MachineInstr &First,
-                                          const MachineInstr &Second) const {
-  // Check if this processor supports macro-fusion. Since this is a minor
-  // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
-  // proxy for SandyBridge+.
-  if (!Subtarget.hasAVX())
-    return false;
-
-  enum {
-    FuseTest,
-    FuseCmp,
-    FuseInc
-  } FuseKind;
-
-  switch (Second.getOpcode()) {
-  default:
-    return false;
-  case X86::JE_1:
-  case X86::JNE_1:
-  case X86::JL_1:
-  case X86::JLE_1:
-  case X86::JG_1:
-  case X86::JGE_1:
-    FuseKind = FuseInc;
-    break;
-  case X86::JB_1:
-  case X86::JBE_1:
-  case X86::JA_1:
-  case X86::JAE_1:
-    FuseKind = FuseCmp;
-    break;
-  case X86::JS_1:
-  case X86::JNS_1:
-  case X86::JP_1:
-  case X86::JNP_1:
-  case X86::JO_1:
-  case X86::JNO_1:
-    FuseKind = FuseTest;
-    break;
-  }
-  switch (First.getOpcode()) {
-  default:
-    return false;
-  case X86::TEST8rr:
-  case X86::TEST16rr:
-  case X86::TEST32rr:
-  case X86::TEST64rr:
-  case X86::TEST8ri:
-  case X86::TEST16ri:
-  case X86::TEST32ri:
-  case X86::TEST32i32:
-  case X86::TEST64i32:
-  case X86::TEST64ri32:
-  case X86::TEST8rm:
-  case X86::TEST16rm:
-  case X86::TEST32rm:
-  case X86::TEST64rm:
-  case X86::TEST8ri_NOREX:
-  case X86::AND16i16:
-  case X86::AND16ri:
-  case X86::AND16ri8:
-  case X86::AND16rm:
-  case X86::AND16rr:
-  case X86::AND32i32:
-  case X86::AND32ri:
-  case X86::AND32ri8:
-  case X86::AND32rm:
-  case X86::AND32rr:
-  case X86::AND64i32:
-  case X86::AND64ri32:
-  case X86::AND64ri8:
-  case X86::AND64rm:
-  case X86::AND64rr:
-  case X86::AND8i8:
-  case X86::AND8ri:
-  case X86::AND8rm:
-  case X86::AND8rr:
-    return true;
-  case X86::CMP16i16:
-  case X86::CMP16ri:
-  case X86::CMP16ri8:
-  case X86::CMP16rm:
-  case X86::CMP16rr:
-  case X86::CMP32i32:
-  case X86::CMP32ri:
-  case X86::CMP32ri8:
-  case X86::CMP32rm:
-  case X86::CMP32rr:
-  case X86::CMP64i32:
-  case X86::CMP64ri32:
-  case X86::CMP64ri8:
-  case X86::CMP64rm:
-  case X86::CMP64rr:
-  case X86::CMP8i8:
-  case X86::CMP8ri:
-  case X86::CMP8rm:
-  case X86::CMP8rr:
-  case X86::ADD16i16:
-  case X86::ADD16ri:
-  case X86::ADD16ri8:
-  case X86::ADD16ri8_DB:
-  case X86::ADD16ri_DB:
-  case X86::ADD16rm:
-  case X86::ADD16rr:
-  case X86::ADD16rr_DB:
-  case X86::ADD32i32:
-  case X86::ADD32ri:
-  case X86::ADD32ri8:
-  case X86::ADD32ri8_DB:
-  case X86::ADD32ri_DB:
-  case X86::ADD32rm:
-  case X86::ADD32rr:
-  case X86::ADD32rr_DB:
-  case X86::ADD64i32:
-  case X86::ADD64ri32:
-  case X86::ADD64ri32_DB:
-  case X86::ADD64ri8:
-  case X86::ADD64ri8_DB:
-  case X86::ADD64rm:
-  case X86::ADD64rr:
-  case X86::ADD64rr_DB:
-  case X86::ADD8i8:
-  case X86::ADD8mi:
-  case X86::ADD8mr:
-  case X86::ADD8ri:
-  case X86::ADD8rm:
-  case X86::ADD8rr:
-  case X86::SUB16i16:
-  case X86::SUB16ri:
-  case X86::SUB16ri8:
-  case X86::SUB16rm:
-  case X86::SUB16rr:
-  case X86::SUB32i32:
-  case X86::SUB32ri:
-  case X86::SUB32ri8:
-  case X86::SUB32rm:
-  case X86::SUB32rr:
-  case X86::SUB64i32:
-  case X86::SUB64ri32:
-  case X86::SUB64ri8:
-  case X86::SUB64rm:
-  case X86::SUB64rr:
-  case X86::SUB8i8:
-  case X86::SUB8ri:
-  case X86::SUB8rm:
-  case X86::SUB8rr:
-    return FuseKind == FuseCmp || FuseKind == FuseInc;
-  case X86::INC16r:
-  case X86::INC32r:
-  case X86::INC64r:
-  case X86::INC8r:
-  case X86::DEC16r:
-  case X86::DEC32r:
-  case X86::DEC64r:
-  case X86::DEC8r:
-    return FuseKind == FuseInc;
-  }
-}
-
 bool X86InstrInfo::
 reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
   assert(Cond.size() == 1 && "Invalid X86 branch condition!");
@@ -8628,6 +9103,7 @@ static const uint16_t ReplaceableInstrs[][3] = {
   { X86::MOVUPSmr,   X86::MOVUPDmr,  X86::MOVDQUmr  },
   { X86::MOVUPSrm,   X86::MOVUPDrm,  X86::MOVDQUrm  },
   { X86::MOVLPSmr,   X86::MOVLPDmr,  X86::MOVPQI2QImr },
+  { X86::MOVSDmr,    X86::MOVSDmr,   X86::MOVPQI2QImr },
   { X86::MOVSSmr,    X86::MOVSSmr,   X86::MOVPDI2DImr },
   { X86::MOVSDrm,    X86::MOVSDrm,   X86::MOVQI2PQIrm },
   { X86::MOVSSrm,    X86::MOVSSrm,   X86::MOVDI2PDIrm },
@@ -8647,6 +9123,7 @@ static const uint16_t ReplaceableInstrs[][3] = {
   { X86::VMOVUPSmr,  X86::VMOVUPDmr,  X86::VMOVDQUmr  },
   { X86::VMOVUPSrm,  X86::VMOVUPDrm,  X86::VMOVDQUrm  },
   { X86::VMOVLPSmr,  X86::VMOVLPDmr,  X86::VMOVPQI2QImr },
+  { X86::VMOVSDmr,   X86::VMOVSDmr,   X86::VMOVPQI2QImr },
   { X86::VMOVSSmr,   X86::VMOVSSmr,   X86::VMOVPDI2DImr },
   { X86::VMOVSDrm,   X86::VMOVSDrm,   X86::VMOVQI2PQIrm },
   { X86::VMOVSSrm,   X86::VMOVSSrm,   X86::VMOVDI2PDIrm },
@@ -8669,7 +9146,7 @@ static const uint16_t ReplaceableInstrs[][3] = {
   // AVX512 support
   { X86::VMOVLPSZ128mr,  X86::VMOVLPDZ128mr,  X86::VMOVPQI2QIZmr  },
   { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr },
-  { X86::VMOVNTPSZ128mr, X86::VMOVNTPDZ128mr, X86::VMOVNTDQZ128mr },
+  { X86::VMOVNTPSZ256mr, X86::VMOVNTPDZ256mr, X86::VMOVNTDQZ256mr },
   { X86::VMOVNTPSZmr,    X86::VMOVNTPDZmr,    X86::VMOVNTDQZmr    },
   { X86::VMOVSDZmr,      X86::VMOVSDZmr,      X86::VMOVPQI2QIZmr  },
   { X86::VMOVSSZmr,      X86::VMOVSSZmr,      X86::VMOVPDI2DIZmr  },
@@ -8697,10 +9174,6 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
   { X86::VORPSYrr,     X86::VORPDYrr,     X86::VPORYrr     },
   { X86::VXORPSYrm,    X86::VXORPDYrm,    X86::VPXORYrm    },
   { X86::VXORPSYrr,    X86::VXORPDYrr,    X86::VPXORYrr    },
-  { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
-  { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr },
-  { X86::VINSERTF128rm,  X86::VINSERTF128rm,  X86::VINSERTI128rm },
-  { X86::VINSERTF128rr,  X86::VINSERTF128rr,  X86::VINSERTI128rr },
   { X86::VPERM2F128rm,   X86::VPERM2F128rm,   X86::VPERM2I128rm },
   { X86::VPERM2F128rr,   X86::VPERM2F128rr,   X86::VPERM2I128rr },
   { X86::VBROADCASTSSrm, X86::VBROADCASTSSrm, X86::VPBROADCASTDrm},
@@ -8712,6 +9185,14 @@ static const uint16_t ReplaceableInstrsAVX2[][3] = {
   { X86::VBROADCASTF128,  X86::VBROADCASTF128,  X86::VBROADCASTI128 },
 };
 
+static const uint16_t ReplaceableInstrsAVX2InsertExtract[][3] = {
+  //PackedSingle       PackedDouble       PackedInt
+  { X86::VEXTRACTF128mr, X86::VEXTRACTF128mr, X86::VEXTRACTI128mr },
+  { X86::VEXTRACTF128rr, X86::VEXTRACTF128rr, X86::VEXTRACTI128rr },
+  { X86::VINSERTF128rm,  X86::VINSERTF128rm,  X86::VINSERTI128rm },
+  { X86::VINSERTF128rr,  X86::VINSERTF128rr,  X86::VINSERTI128rr },
+};
+
 static const uint16_t ReplaceableInstrsAVX512[][4] = {
   // Two integer columns for 64-bit and 32-bit elements.
   //PackedSingle        PackedDouble        PackedInt             PackedInt
@@ -8973,16 +9454,25 @@ X86InstrInfo::getExecutionDomain(const MachineInstr &MI) const {
       validDomains = 0xe;
     } else if (lookup(opcode, domain, ReplaceableInstrsAVX2)) {
       validDomains = Subtarget.hasAVX2() ? 0xe : 0x6;
+    } else if (lookup(opcode, domain, ReplaceableInstrsAVX2InsertExtract)) {
+      // Insert/extract instructions should only affect domain if AVX2
+      // is enabled.
+      if (!Subtarget.hasAVX2())
+        return std::make_pair(0, 0);
+      validDomains = 0xe;
     } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512)) {
       validDomains = 0xe;
-    } else if (lookupAVX512(opcode, domain, ReplaceableInstrsAVX512DQ)) {
-      validDomains = Subtarget.hasDQI() ? 0xe : 0x8;
-    } else if (const uint16_t *table = lookupAVX512(opcode, domain,
+    } else if (Subtarget.hasDQI() && lookupAVX512(opcode, domain,
+                                                  ReplaceableInstrsAVX512DQ)) {
+      validDomains = 0xe;
+    } else if (Subtarget.hasDQI()) {
+      if (const uint16_t *table = lookupAVX512(opcode, domain,
                                              ReplaceableInstrsAVX512DQMasked)) {
-      if (domain == 1 || (domain == 3 && table[3] == opcode))
-        validDomains = Subtarget.hasDQI() ? 0xa : 0x8;
-      else
-        validDomains = Subtarget.hasDQI() ? 0xc : 0x8;
+        if (domain == 1 || (domain == 3 && table[3] == opcode))
+          validDomains = 0xa;
+        else
+          validDomains = 0xc;
+      }
     }
   }
   return std::make_pair(domain, validDomains);
@@ -8998,6 +9488,11 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
            "256-bit vector operations only available in AVX2");
     table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2);
   }
+  if (!table) { // try the other table
+    assert(Subtarget.hasAVX2() &&
+           "256-bit insert/extract only available in AVX2");
+    table = lookup(MI.getOpcode(), dom, ReplaceableInstrsAVX2InsertExtract);
+  }
   if (!table) { // try the AVX512 table
     assert(Subtarget.hasAVX512() && "Requires AVX-512");
     table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512);
@@ -9661,28 +10156,6 @@ X86InstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
   return makeArrayRef(TargetFlags);
 }
 
-bool X86InstrInfo::isTailCall(const MachineInstr &Inst) const {
-  switch (Inst.getOpcode()) {
-    case X86::TCRETURNdi:
-    case X86::TCRETURNmi:
-    case X86::TCRETURNri:
-    case X86::TCRETURNdi64:
-    case X86::TCRETURNmi64:
-    case X86::TCRETURNri64:
-    case X86::TAILJMPd:
-    case X86::TAILJMPm:
-    case X86::TAILJMPr:
-    case X86::TAILJMPd64:
-    case X86::TAILJMPm64:
-    case X86::TAILJMPr64:
-    case X86::TAILJMPm64_REX:
-    case X86::TAILJMPr64_REX:
-      return true;
-    default:
-      return false;
-  }
-}
-
 namespace {
   /// Create Global Base Reg pass. This initializes the PIC
   /// global base register for x86-32.
@@ -9869,3 +10342,124 @@ namespace {
 char LDTLSCleanup::ID = 0;
 FunctionPass*
 llvm::createCleanupLocalDynamicTLSPass() { return new LDTLSCleanup(); }
+
+unsigned X86InstrInfo::getOutliningBenefit(size_t SequenceSize,
+                                           size_t Occurrences,
+                                           bool CanBeTailCall) const {
+  // Cost, in instructions, of leaving every occurrence of the sequence inline.
+  unsigned NotOutlinedSize = SequenceSize * Occurrences;
+
+  // Cost after outlining: one copy of the sequence plus one call per
+  // occurrence. A sequence that can be tail called already ends in a return;
+  // any other sequence needs one extra return instruction appended.
+  unsigned OutlinedSize = SequenceSize + Occurrences;
+  if (!CanBeTailCall)
+    OutlinedSize += 1;
+
+  // Report the number of instructions saved, or zero when outlining would not
+  // shrink the code.
+  if (NotOutlinedSize <= OutlinedSize)
+    return 0;
+
+  return NotOutlinedSize - OutlinedSize;
+}
+
+bool X86InstrInfo::isFunctionSafeToOutlineFrom(MachineFunction &MF) const {
+  return MF.getFunction()->hasFnAttribute(Attribute::NoRedZone);
+}
+
+X86GenInstrInfo::MachineOutlinerInstrType
+X86InstrInfo::getOutliningType(MachineInstr &MI) const {
+  // Classify MI for the MachineOutliner. Debug values are Invisible; an
+  // indirect debug value is still a debug value, so one check covers both.
+  if (MI.isDebugValue())
+    return MachineOutlinerInstrType::Invisible;
+
+  // Is this a tail call? If yes, we can outline as a tail call.
+  if (isTailCall(MI))
+    return MachineOutlinerInstrType::Legal;
+
+  // Is this the terminator of a basic block?
+  if (MI.isTerminator() || MI.isReturn()) {
+
+    // A terminator is fine when its parent block has no successors.
+    if (MI.getParent()->succ_empty())
+      return MachineOutlinerInstrType::Legal;
+
+    // The block has successors, so we can't tail call it.
+    return MachineOutlinerInstrType::Illegal;
+  }
+
+  // Don't outline anything that modifies or reads from the stack pointer.
+  //
+  // FIXME: There are instructions which are being manually built without
+  // explicit uses/defs so we also have to check the MCInstrDesc. We should be
+  // able to remove the extra checks once those are fixed up. For example,
+  // sometimes we might get something like %RAX<def> = POP64r 1. This won't be
+  // caught by modifiesRegister or readsRegister even though the instruction
+  // really ought to be formed so that modifiesRegister/readsRegister would
+  // catch it.
+  if (MI.modifiesRegister(X86::RSP, &RI) || MI.readsRegister(X86::RSP, &RI) ||
+      MI.getDesc().hasImplicitUseOfPhysReg(X86::RSP) ||
+      MI.getDesc().hasImplicitDefOfPhysReg(X86::RSP))
+    return MachineOutlinerInstrType::Illegal;
+
+  // Outlined calls change the instruction pointer, so don't read from it.
+  if (MI.readsRegister(X86::RIP, &RI) ||
+      MI.getDesc().hasImplicitUseOfPhysReg(X86::RIP) ||
+      MI.getDesc().hasImplicitDefOfPhysReg(X86::RIP))
+    return MachineOutlinerInstrType::Illegal;
+
+  // Positions can't safely be outlined.
+  if (MI.isPosition())
+    return MachineOutlinerInstrType::Illegal;
+
+  // Make sure none of the operands of this instruction do anything tricky.
+  for (const MachineOperand &MOP : MI.operands())
+    if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
+        MOP.isTargetIndex())
+      return MachineOutlinerInstrType::Illegal;
+
+  return MachineOutlinerInstrType::Legal;
+}
+
+void X86InstrInfo::insertOutlinerEpilogue(MachineBasicBlock &MBB,
+                                          MachineFunction &MF,
+                                          bool IsTailCall) const {
+  // A tail-called sequence already finishes with its own return, so there is
+  // nothing to add in that case.
+  //
+  // A sequence reached through a normal call has no return of its own: append
+  // a RETQ to the end of MBB.
+  if (!IsTailCall) {
+    MachineInstr *Ret = BuildMI(MF, DebugLoc(), get(X86::RETQ));
+    MBB.insert(MBB.end(), Ret);
+  }
+}
+
+void X86InstrInfo::insertOutlinerPrologue(MachineBasicBlock &MBB,
+                                          MachineFunction &MF,
+                                          bool IsTailCall) const {
+  // Intentionally empty: this hook inserts nothing into MBB.
+}
+
+MachineBasicBlock::iterator
+X86InstrInfo::insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
+                                 MachineBasicBlock::iterator &It,
+                                 MachineFunction &MF,
+                                 bool IsTailCall) const {
+  // Both kinds of occurrence transfer control to the same symbol: the global
+  // value that M associates with MF's name.
+  GlobalValue *Callee = M.getNamedValue(MF.getName());
+
+  // A tail call reaches the outlined function with a plain JMP; any other
+  // occurrence uses a CALL.
+  unsigned Opc = IsTailCall ? X86::JMP_1 : X86::CALL64pcrel32;
+
+  // Build the branch and place it in MBB at It. It is updated in place to
+  // refer to the new instruction, and that same iterator is returned.
+  MachineInstr *Branch =
+      BuildMI(MF, DebugLoc(), get(Opc)).addGlobalAddress(Callee);
+  It = MBB.insert(It, Branch);
+  return It;
+}
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 8d746172dcbc88fcfe912f3d9e6b655cd98ca003..582515dc1154baa9bd71f24029c156b492117da8 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -443,9 +443,6 @@ public:
                                int64_t Offset1, int64_t Offset2,
                                unsigned NumLoads) const override;
 
-  bool shouldScheduleAdjacent(const MachineInstr &First,
-                              const MachineInstr &Second) const override;
-
   void getNoopForMachoTarget(MCInst &NopInst) const override;
 
   bool
@@ -546,8 +543,28 @@ public:
   ArrayRef<std::pair<unsigned, const char *>>
   getSerializableDirectMachineOperandTargetFlags() const override;
 
-  bool isTailCall(const MachineInstr &Inst) const override;
+  unsigned getOutliningBenefit(size_t SequenceSize,
+                               size_t Occurrences,
+                               bool CanBeTailCall) const override;
+
+  bool isFunctionSafeToOutlineFrom(MachineFunction &MF) const override;
+
+  llvm::X86GenInstrInfo::MachineOutlinerInstrType
+  getOutliningType(MachineInstr &MI) const override;
+
+  /// Appends a RETQ to \p MBB unless \p IsTailCall is set.
+  void insertOutlinerEpilogue(MachineBasicBlock &MBB, MachineFunction &MF,
+                              bool IsTailCall) const override;
+
+  /// Outliner prologue hook; the X86 implementation inserts no instructions.
+  void insertOutlinerPrologue(MachineBasicBlock &MBB, MachineFunction &MF,
+                              bool IsTailCall) const override;
 
+  /// Inserts a JMP_1 (tail call) or CALL64pcrel32 naming \p MF before \p It.
+  MachineBasicBlock::iterator
+  insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
+                     MachineBasicBlock::iterator &It, MachineFunction &MF,
+                     bool IsTailCall) const override;
 protected:
   /// Commutes the operands in the given instruction by changing the operands
   /// order and/or changing the instruction's opcode and/or the immediate value
diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td
index cb6cc935de8cf7a2fc936548caa9a3c32a821d70..163f4eef72ed7077f3df76bf9483647a4b8cb888 100644
--- a/lib/Target/X86/X86InstrInfo.td
+++ b/lib/Target/X86/X86InstrInfo.td
@@ -833,7 +833,6 @@ def HasXSAVEC    : Predicate<"Subtarget->hasXSAVEC()">;
 def HasXSAVES    : Predicate<"Subtarget->hasXSAVES()">;
 def HasPCLMUL    : Predicate<"Subtarget->hasPCLMUL()">;
 def HasFMA       : Predicate<"Subtarget->hasFMA()">;
-def UseFMAOnAVX  : Predicate<"Subtarget->hasFMA() && !Subtarget->hasAVX512()">;
 def HasFMA4      : Predicate<"Subtarget->hasFMA4()">;
 def HasXOP       : Predicate<"Subtarget->hasXOP()">;
 def HasTBM       : Predicate<"Subtarget->hasTBM()">;
@@ -850,8 +849,6 @@ def HasVBMI      : Predicate<"Subtarget->hasVBMI()">,
 def HasIFMA      : Predicate<"Subtarget->hasIFMA()">,
                      AssemblerPredicate<"FeatureIFMA", "AVX-512 IFMA ISA">;
 def HasRTM       : Predicate<"Subtarget->hasRTM()">;
-def HasHLE       : Predicate<"Subtarget->hasHLE()">;
-def HasTSX       : Predicate<"Subtarget->hasRTM() || Subtarget->hasHLE()">;
 def HasADX       : Predicate<"Subtarget->hasADX()">;
 def HasSHA       : Predicate<"Subtarget->hasSHA()">;
 def HasPRFCHW    : Predicate<"Subtarget->hasPRFCHW()">;
@@ -859,9 +856,11 @@ def HasRDSEED    : Predicate<"Subtarget->hasRDSEED()">;
 def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
 def HasLAHFSAHF  : Predicate<"Subtarget->hasLAHFSAHF()">;
 def HasMWAITX    : Predicate<"Subtarget->hasMWAITX()">;
+def HasCLZERO    : Predicate<"Subtarget->hasCLZERO()">;
 def FPStackf32   : Predicate<"!Subtarget->hasSSE1()">;
 def FPStackf64   : Predicate<"!Subtarget->hasSSE2()">;
 def HasMPX       : Predicate<"Subtarget->hasMPX()">;
+def HasCLFLUSHOPT : Predicate<"Subtarget->hasCLFLUSHOPT()">;
 def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
 def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
                              AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
@@ -897,6 +896,7 @@ def FavorMemIndirectCall  : Predicate<"!Subtarget->callRegIndirect()">;
 def NotSlowIncDec : Predicate<"!Subtarget->slowIncDec()">;
 def HasFastMem32 : Predicate<"!Subtarget->isUnalignedMem32Slow()">;
 def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
+def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
 def HasMFence    : Predicate<"Subtarget->hasMFence()">;
 
 //===----------------------------------------------------------------------===//
@@ -933,6 +933,15 @@ def i32immSExt8  : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
 def i64immSExt8  : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>;
 def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
 
+// FIXME: Ideally we would just replace the above i*immSExt* matchers with
+// relocImm-based matchers, but then FastISel would be unable to use them.
+def i64relocImmSExt8 : PatLeaf<(i64 relocImm), [{
+  return isSExtRelocImm<8>(N);
+}]>;
+def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{
+  return isSExtRelocImm<32>(N);
+}]>;
+
 // If we have multiple users of an immediate, it's much smaller to reuse
 // the register, rather than encode the immediate in every instruction.
 // This has the risk of increasing register pressure from stretched live
@@ -973,6 +982,13 @@ def i64immSExt8_su : PatLeaf<(i64immSExt8), [{
     return !shouldAvoidImmediateInstFormsForSize(N);
 }]>;
 
+def i64relocImmSExt8_su : PatLeaf<(i64relocImmSExt8), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i64relocImmSExt32_su : PatLeaf<(i64relocImmSExt32), [{
+    return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
 // i64immZExt32 predicate - True if the 64-bit immediate fits in a 32-bit
 // unsigned field.
 def i64immZExt32 : ImmLeaf<i64, [{ return isUInt<32>(Imm); }]>;
@@ -2455,8 +2471,19 @@ def : InstAlias<"monitorx\t{%rax, %rcx, %rdx|rdx, rcx, rax}", (MONITORXrrr)>,
 //===----------------------------------------------------------------------===//
 // CLZERO Instruction
 //
-let Uses = [EAX] in
-def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", []>, TB;
+let SchedRW = [WriteSystem] in {
+  let Uses = [EAX] in
+  def CLZEROr : I<0x01, MRM_FC, (outs), (ins), "clzero", [], IIC_SSE_CLZERO>,
+                TB, Requires<[HasCLZERO]>;
+
+  let usesCustomInserter = 1 in {
+  def CLZERO : PseudoI<(outs), (ins i32mem:$src1),
+                       [(int_x86_clzero addr:$src1)]>, Requires<[HasCLZERO]>;
+  }
+} // SchedRW
+
+def : InstAlias<"clzero\t{%eax|eax}", (CLZEROr)>, Requires<[Not64BitMode]>;
+def : InstAlias<"clzero\t{%rax|rax}", (CLZEROr)>, Requires<[In64BitMode]>;
 
 //===----------------------------------------------------------------------===//
 // Pattern fragments to auto generate TBM instructions.
@@ -2529,10 +2556,10 @@ let Predicates = [HasTBM] in {
 // Memory Instructions
 //
 
+let Predicates = [HasCLFLUSHOPT] in
 def CLFLUSHOPT : I<0xAE, MRM7m, (outs), (ins i8mem:$src),
                    "clflushopt\t$src", [(int_x86_clflushopt addr:$src)]>, PD;
 def CLWB       : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src", []>, PD;
-def PCOMMIT    : I<0xAE, MRM_F8, (outs), (ins), "pcommit", []>, PD;
 
 
 //===----------------------------------------------------------------------===//
diff --git a/lib/Target/X86/X86InstrMMX.td b/lib/Target/X86/X86InstrMMX.td
index 0bb10682398358246a43e01c9112579687400234..dc3800ce381b0a5e6de4ec9176078d80e3fadaad 100644
--- a/lib/Target/X86/X86InstrMMX.td
+++ b/lib/Target/X86/X86InstrMMX.td
@@ -294,6 +294,7 @@ def MMX_MOVQ64rm : MMXI<0x6F, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
                         [(set VR64:$dst, (load_mmx addr:$src))],
                         IIC_MMX_MOVQ_RM>;
 } // SchedRW
+
 let SchedRW = [WriteStore] in
 def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
                         "movq\t{$src, $dst|$dst, $src}",
@@ -378,7 +379,6 @@ defm MMX_PHADD   : SS3I_binop_rm_int_mm<0x02, "phaddd", int_x86_ssse3_phadd_d,
 defm MMX_PHADDSW : SS3I_binop_rm_int_mm<0x03, "phaddsw",int_x86_ssse3_phadd_sw,
                                    MMX_PHADDSUBW>;
 
-
 // -- Subtraction
 defm MMX_PSUBB : MMXI_binop_rm_int<0xF8, "psubb", int_x86_mmx_psub_b,
                                    MMX_INTALU_ITINS>;
@@ -479,13 +479,6 @@ defm MMX_PSRLQ : MMXI_binop_rmi_int<0xD3, 0x73, MRM2r, "psrlq",
                                     int_x86_mmx_psrl_q, int_x86_mmx_psrli_q,
                                     MMX_SHIFT_ITINS>;
 
-def : Pat<(int_x86_mmx_psrl_w VR64:$src1, (load_mvmmx addr:$src2)),
-          (MMX_PSRLWrm VR64:$src1, addr:$src2)>;
-def : Pat<(int_x86_mmx_psrl_d VR64:$src1, (load_mvmmx addr:$src2)),
-          (MMX_PSRLDrm VR64:$src1, addr:$src2)>;
-def : Pat<(int_x86_mmx_psrl_q VR64:$src1, (load_mvmmx addr:$src2)),
-          (MMX_PSRLQrm VR64:$src1, addr:$src2)>;
-
 defm MMX_PSLLW : MMXI_binop_rmi_int<0xF1, 0x71, MRM6r, "psllw",
                                     int_x86_mmx_psll_w, int_x86_mmx_pslli_w,
                                     MMX_SHIFT_ITINS>;
@@ -496,13 +489,6 @@ defm MMX_PSLLQ : MMXI_binop_rmi_int<0xF3, 0x73, MRM6r, "psllq",
                                     int_x86_mmx_psll_q, int_x86_mmx_pslli_q,
                                     MMX_SHIFT_ITINS>;
 
-def : Pat<(int_x86_mmx_psll_w VR64:$src1, (load_mvmmx addr:$src2)),
-          (MMX_PSLLWrm VR64:$src1, addr:$src2)>;
-def : Pat<(int_x86_mmx_psll_d VR64:$src1, (load_mvmmx addr:$src2)),
-          (MMX_PSLLDrm VR64:$src1, addr:$src2)>;
-def : Pat<(int_x86_mmx_psll_q VR64:$src1, (load_mvmmx addr:$src2)),
-          (MMX_PSLLQrm VR64:$src1, addr:$src2)>;
-
 defm MMX_PSRAW : MMXI_binop_rmi_int<0xE1, 0x71, MRM4r, "psraw",
                                     int_x86_mmx_psra_w, int_x86_mmx_psrai_w,
                                     MMX_SHIFT_ITINS>;
@@ -510,11 +496,6 @@ defm MMX_PSRAD : MMXI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad",
                                     int_x86_mmx_psra_d, int_x86_mmx_psrai_d,
                                     MMX_SHIFT_ITINS>;
 
-def : Pat<(int_x86_mmx_psra_w VR64:$src1, (load_mvmmx addr:$src2)),
-          (MMX_PSRAWrm VR64:$src1, addr:$src2)>;
-def : Pat<(int_x86_mmx_psra_d VR64:$src1, (load_mvmmx addr:$src2)),
-          (MMX_PSRADrm VR64:$src1, addr:$src2)>;
-
 // Comparison Instructions
 defm MMX_PCMPEQB : MMXI_binop_rm_int<0x74, "pcmpeqb", int_x86_mmx_pcmpeq_b,
                                      MMX_INTALU_ITINS>;
@@ -576,9 +557,6 @@ def MMX_PSHUFWmi : MMXIi8<0x70, MRMSrcMem,
                                                    imm:$src2))],
                           IIC_MMX_PSHUF>, Sched<[WriteShuffleLd]>;
 
-
-
-
 // -- Conversion Instructions
 defm MMX_CVTPS2PI : sse12_cvt_pint<0x2D, VR128, VR64, int_x86_sse_cvtps2pi,
                       f64mem, load, "cvtps2pi\t{$src, $dst|$dst, $src}",
@@ -639,7 +617,6 @@ def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
                           [(set GR32orGR64:$dst,
                                 (int_x86_mmx_pmovmskb VR64:$src))]>;
 
-
 // Low word of XMM to MMX.
 def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
                             [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
@@ -670,6 +647,16 @@ def : Pat<(f64 (bitconvert (x86mmx VR64:$src))),
           (MMX_MOVQ2FR64rr VR64:$src)>;
 def : Pat<(x86mmx (bitconvert (f64 FR64:$src))),
           (MMX_MOVFR642Qrr FR64:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q
+                   (bc_v2i64 (v4i32 (int_x86_sse2_cvtps2dq VR128:$src))))),
+          (MMX_CVTPS2PIirr VR128:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q
+                   (bc_v2i64 (v4i32 (fp_to_sint (v4f32 VR128:$src)))))),
+          (MMX_CVTTPS2PIirr VR128:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q
+                   (bc_v2i64 (v4i32 (X86cvtp2Int (v2f64 VR128:$src)))))),
+          (MMX_CVTPD2PIirr VR128:$src)>;
+def : Pat<(x86mmx (MMX_X86movdq2q
+                   (bc_v2i64 (v4i32 (X86cvttp2si (v2f64 VR128:$src)))))),
+          (MMX_CVTTPD2PIirr VR128:$src)>;
 }
-
-
diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td
index 922fa69343b2aff5e83a725a2dac0de82be3b61b..f4fc87f80e3d14bdabfce5cec93a0904a8344a46 100644
--- a/lib/Target/X86/X86InstrSSE.td
+++ b/lib/Target/X86/X86InstrSSE.td
@@ -259,8 +259,8 @@ multiclass sse12_fp_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
 
 /// sse12_fp_scalar_int - SSE 1 & 2 scalar instructions intrinsics class
 multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
-                               SDPatternOperator Int, RegisterClass RC,
-                               string asm, Operand memopr,
+                               SDPatternOperator OpNode, RegisterClass RC,
+                               ValueType VT, string asm, Operand memopr,
                                ComplexPattern mem_cpat, Domain d,
                                OpndItins itins, bit Is2Addr = 1> {
 let isCodeGenOnly = 1, hasSideEffects = 0 in {
@@ -268,14 +268,14 @@ let isCodeGenOnly = 1, hasSideEffects = 0 in {
        !if(Is2Addr,
            !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set RC:$dst, (Int RC:$src1, RC:$src2))], itins.rr, d>,
+       [(set RC:$dst, (VT (OpNode RC:$src1, RC:$src2)))], itins.rr, d>,
        Sched<[itins.Sched]>;
   let mayLoad = 1 in
   def rm_Int : SI_Int<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, memopr:$src2),
        !if(Is2Addr,
            !strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
            !strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
-       [(set RC:$dst, (Int RC:$src1, mem_cpat:$src2))], itins.rm, d>,
+       [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], itins.rm, d>,
        Sched<[itins.Sched.Folded, ReadAfterLd]>;
 }
 }
@@ -526,12 +526,12 @@ multiclass sse12_move<RegisterClass RC, SDNode OpNode, ValueType vt,
   // AVX
   defm V#NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
                               "\t{$src2, $src1, $dst|$dst, $src1, $src2}", d>,
-                              VEX_4V, VEX_LIG;
+                              VEX_4V, VEX_LIG, VEX_WIG;
 
   def V#NAME#mr : SI<0x11, MRMDestMem, (outs), (ins x86memop:$dst, RC:$src),
                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(store RC:$src, addr:$dst)], IIC_SSE_MOV_S_MR, d>,
-                     VEX, VEX_LIG, Sched<[WriteStore]>;
+                     VEX, VEX_LIG, Sched<[WriteStore]>, VEX_WIG;
   // SSE1 & 2
   let Constraints = "$src1 = $dst" in {
     defm NAME : sse12_move_rr<RC, OpNode, vt, x86memop, OpcodeStr,
@@ -551,7 +551,7 @@ multiclass sse12_move_rm<RegisterClass RC, X86MemOperand x86memop,
   def V#NAME#rm : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (mem_pat addr:$src))],
-                     IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>;
+                     IIC_SSE_MOV_S_RM, d>, VEX, VEX_LIG, Sched<[WriteLoad]>, VEX_WIG;
   def NAME#rm   : SI<0x10, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
                      !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
                      [(set RC:$dst, (mem_pat addr:$src))],
@@ -643,10 +643,6 @@ let Predicates = [UseAVX] in {
             (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
   def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
             (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
-  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
-            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
-  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
-            (VMOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
 
   // 256-bit variants
   def : Pat<(v4i64 (X86Movsd VR256:$src1, VR256:$src2)),
@@ -737,10 +733,6 @@ let Predicates = [UseSSE2] in {
             (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
   def : Pat<(v2f64 (X86Movsd VR128:$src1, VR128:$src2)),
             (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
-  def : Pat<(v4f32 (X86Movsd VR128:$src1, VR128:$src2)),
-            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
-  def : Pat<(v4i32 (X86Movsd VR128:$src1, VR128:$src2)),
-            (MOVSDrr VR128:$src1, (COPY_TO_REGCLASS VR128:$src2, FR64))>;
 
   // FIXME: Instead of a X86Movlps there should be a X86Movsd here, the problem
   // is during lowering, where it's not possible to recognize the fold because
@@ -785,29 +777,29 @@ let canFoldAsLoad = 1, isReMaterializable = 1 in
 let Predicates = [HasAVX, NoVLX] in {
 defm VMOVAPS : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv4f32,
                               "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
-                              PS, VEX;
+                              PS, VEX, VEX_WIG;
 defm VMOVAPD : sse12_mov_packed<0x28, VR128, f128mem, alignedloadv2f64,
                               "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
-                              PD, VEX;
+                              PD, VEX, VEX_WIG;
 defm VMOVUPS : sse12_mov_packed<0x10, VR128, f128mem, loadv4f32,
                               "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
-                              PS, VEX;
+                              PS, VEX, VEX_WIG;
 defm VMOVUPD : sse12_mov_packed<0x10, VR128, f128mem, loadv2f64,
                               "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
-                              PD, VEX;
+                              PD, VEX, VEX_WIG;
 
 defm VMOVAPSY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv8f32,
                               "movaps", SSEPackedSingle, SSE_MOVA_ITINS>,
-                              PS, VEX, VEX_L;
+                              PS, VEX, VEX_L, VEX_WIG;
 defm VMOVAPDY : sse12_mov_packed<0x28, VR256, f256mem, alignedloadv4f64,
                               "movapd", SSEPackedDouble, SSE_MOVA_ITINS>,
-                              PD, VEX, VEX_L;
+                              PD, VEX, VEX_L, VEX_WIG;
 defm VMOVUPSY : sse12_mov_packed<0x10, VR256, f256mem, loadv8f32,
                               "movups", SSEPackedSingle, SSE_MOVU_ITINS>,
-                              PS, VEX, VEX_L;
+                              PS, VEX, VEX_L, VEX_WIG;
 defm VMOVUPDY : sse12_mov_packed<0x10, VR256, f256mem, loadv4f64,
                               "movupd", SSEPackedDouble, SSE_MOVU_ITINS>,
-                              PD, VEX, VEX_L;
+                              PD, VEX, VEX_L, VEX_WIG;
 }
 
 let Predicates = [UseSSE1] in {
@@ -831,35 +823,35 @@ let SchedRW = [WriteStore], Predicates = [HasAVX, NoVLX]  in {
 def VMOVAPSmr : VPSI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movaps\t{$src, $dst|$dst, $src}",
                    [(alignedstore (v4f32 VR128:$src), addr:$dst)],
-                   IIC_SSE_MOVA_P_MR>, VEX;
+                   IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG;
 def VMOVAPDmr : VPDI<0x29, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movapd\t{$src, $dst|$dst, $src}",
                    [(alignedstore (v2f64 VR128:$src), addr:$dst)],
-                   IIC_SSE_MOVA_P_MR>, VEX;
+                   IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG;
 def VMOVUPSmr : VPSI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movups\t{$src, $dst|$dst, $src}",
                    [(store (v4f32 VR128:$src), addr:$dst)],
-                   IIC_SSE_MOVU_P_MR>, VEX;
+                   IIC_SSE_MOVU_P_MR>, VEX, VEX_WIG;
 def VMOVUPDmr : VPDI<0x11, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
                    "movupd\t{$src, $dst|$dst, $src}",
                    [(store (v2f64 VR128:$src), addr:$dst)],
-                   IIC_SSE_MOVU_P_MR>, VEX;
+                   IIC_SSE_MOVU_P_MR>, VEX, VEX_WIG;
 def VMOVAPSYmr : VPSI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                    "movaps\t{$src, $dst|$dst, $src}",
                    [(alignedstore256 (v8f32 VR256:$src), addr:$dst)],
-                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
+                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L, VEX_WIG;
 def VMOVAPDYmr : VPDI<0x29, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                    "movapd\t{$src, $dst|$dst, $src}",
                    [(alignedstore256 (v4f64 VR256:$src), addr:$dst)],
-                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L;
+                   IIC_SSE_MOVA_P_MR>, VEX, VEX_L, VEX_WIG;
 def VMOVUPSYmr : VPSI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                    "movups\t{$src, $dst|$dst, $src}",
                    [(store (v8f32 VR256:$src), addr:$dst)],
-                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
+                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L, VEX_WIG;
 def VMOVUPDYmr : VPDI<0x11, MRMDestMem, (outs), (ins f256mem:$dst, VR256:$src),
                    "movupd\t{$src, $dst|$dst, $src}",
                    [(store (v4f64 VR256:$src), addr:$dst)],
-                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L;
+                   IIC_SSE_MOVU_P_MR>, VEX, VEX_L, VEX_WIG;
 } // SchedRW
 
 // For disassembler
@@ -868,35 +860,35 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
   def VMOVAPSrr_REV : VPSI<0x29, MRMDestReg, (outs VR128:$dst),
                           (ins VR128:$src),
                           "movaps\t{$src, $dst|$dst, $src}", [],
-                          IIC_SSE_MOVA_P_RR>, VEX;
+                          IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG;
   def VMOVAPDrr_REV : VPDI<0x29, MRMDestReg, (outs VR128:$dst),
                            (ins VR128:$src),
                            "movapd\t{$src, $dst|$dst, $src}", [],
-                           IIC_SSE_MOVA_P_RR>, VEX;
+                           IIC_SSE_MOVA_P_RR>, VEX, VEX_WIG;
   def VMOVUPSrr_REV : VPSI<0x11, MRMDestReg, (outs VR128:$dst),
                            (ins VR128:$src),
                            "movups\t{$src, $dst|$dst, $src}", [],
-                           IIC_SSE_MOVU_P_RR>, VEX;
+                           IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG;
   def VMOVUPDrr_REV : VPDI<0x11, MRMDestReg, (outs VR128:$dst),
                            (ins VR128:$src),
                            "movupd\t{$src, $dst|$dst, $src}", [],
-                           IIC_SSE_MOVU_P_RR>, VEX;
+                           IIC_SSE_MOVU_P_RR>, VEX, VEX_WIG;
   def VMOVAPSYrr_REV : VPSI<0x29, MRMDestReg, (outs VR256:$dst),
                             (ins VR256:$src),
                             "movaps\t{$src, $dst|$dst, $src}", [],
-                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG;
   def VMOVAPDYrr_REV : VPDI<0x29, MRMDestReg, (outs VR256:$dst),
                             (ins VR256:$src),
                             "movapd\t{$src, $dst|$dst, $src}", [],
-                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+                            IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG;
   def VMOVUPSYrr_REV : VPSI<0x11, MRMDestReg, (outs VR256:$dst),
                             (ins VR256:$src),
                             "movups\t{$src, $dst|$dst, $src}", [],
-                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG;
   def VMOVUPDYrr_REV : VPDI<0x11, MRMDestReg, (outs VR256:$dst),
                             (ins VR256:$src),
                             "movupd\t{$src, $dst|$dst, $src}", [],
-                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+                            IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG;
 }
 
 // Aliases to help the assembler pick two byte VEX encodings by swapping the
@@ -954,24 +946,10 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
                          IIC_SSE_MOVU_P_RR>;
 }
 
-// Use vmovaps/vmovups for AVX integer load/store.
 let Predicates = [HasAVX, NoVLX] in {
-  // 128-bit load/store
-  def : Pat<(alignedloadv2i64 addr:$src),
-            (VMOVAPSrm addr:$src)>;
-  def : Pat<(loadv2i64 addr:$src),
-            (VMOVUPSrm addr:$src)>;
-
-  def : Pat<(alignedstore (v2i64 VR128:$src), addr:$dst),
-            (VMOVAPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
-            (VMOVAPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(store (v2i64 VR128:$src), addr:$dst),
-            (VMOVUPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
-            (VMOVUPSmr addr:$dst, VR128:$src)>;
-
-  // 256-bit load/store
+  // 256-bit integer loads/stores must use floating point load/store
+  // instructions in case we don't have AVX2. Execution domain fixing will
+  // convert them to integer if AVX2 is available and that is beneficial.
   def : Pat<(alignedloadv4i64 addr:$src),
             (VMOVAPSYrm addr:$src)>;
   def : Pat<(loadv4i64 addr:$src),
@@ -980,10 +958,18 @@ let Predicates = [HasAVX, NoVLX] in {
             (VMOVAPSYmr addr:$dst, VR256:$src)>;
   def : Pat<(alignedstore256 (v8i32 VR256:$src), addr:$dst),
             (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
+            (VMOVAPSYmr addr:$dst, VR256:$src)>;
   def : Pat<(store (v4i64 VR256:$src), addr:$dst),
             (VMOVUPSYmr addr:$dst, VR256:$src)>;
   def : Pat<(store (v8i32 VR256:$src), addr:$dst),
             (VMOVUPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
+  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
+            (VMOVUPSYmr addr:$dst, VR256:$src)>;
 
   // Special patterns for storing subvector extracts of lower 128-bits
   // Its cheaper to just use VMOVAPS/VMOVUPS instead of VEXTRACTF128mr
@@ -993,18 +979,6 @@ let Predicates = [HasAVX, NoVLX] in {
   def : Pat<(alignedstore (v4f32 (extract_subvector
                                   (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
             (VMOVAPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
-  def : Pat<(alignedstore (v2i64 (extract_subvector
-                                  (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
-            (VMOVAPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
-  def : Pat<(alignedstore (v4i32 (extract_subvector
-                                  (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
-            (VMOVAPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
-  def : Pat<(alignedstore (v8i16 (extract_subvector
-                                  (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
-            (VMOVAPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
-  def : Pat<(alignedstore (v16i8 (extract_subvector
-                                  (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
-            (VMOVAPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
 
   def : Pat<(store (v2f64 (extract_subvector
                            (v4f64 VR256:$src), (iPTR 0))), addr:$dst),
@@ -1012,40 +986,6 @@ let Predicates = [HasAVX, NoVLX] in {
   def : Pat<(store (v4f32 (extract_subvector
                            (v8f32 VR256:$src), (iPTR 0))), addr:$dst),
             (VMOVUPSmr addr:$dst, (v4f32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
-  def : Pat<(store (v2i64 (extract_subvector
-                           (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
-            (VMOVUPDmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
-  def : Pat<(store (v4i32 (extract_subvector
-                           (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
-            (VMOVUPSmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
-  def : Pat<(store (v8i16 (extract_subvector
-                           (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
-            (VMOVUPSmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
-  def : Pat<(store (v16i8 (extract_subvector
-                           (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
-            (VMOVUPSmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
-}
-
-let Predicates = [HasAVX, NoVLX] in {
-  // 128-bit load/store
-  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
-            (VMOVAPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
-            (VMOVAPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
-            (VMOVUPSmr addr:$dst, VR128:$src)>;
-  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
-            (VMOVUPSmr addr:$dst, VR128:$src)>;
-
-  // 256-bit load/store
-  def : Pat<(alignedstore256 (v16i16 VR256:$src), addr:$dst),
-            (VMOVAPSYmr addr:$dst, VR256:$src)>;
-  def : Pat<(alignedstore256 (v32i8 VR256:$src), addr:$dst),
-            (VMOVAPSYmr addr:$dst, VR256:$src)>;
-  def : Pat<(store (v16i16 VR256:$src), addr:$dst),
-            (VMOVUPSYmr addr:$dst, VR256:$src)>;
-  def : Pat<(store (v32i8 VR256:$src), addr:$dst),
-            (VMOVUPSYmr addr:$dst, VR256:$src)>;
 }
 
 // Use movaps / movups for SSE integer load / store (one byte shorter).
@@ -1106,7 +1046,7 @@ multiclass sse12_mov_hilo_packed<bits<8>opc, SDNode psnode, SDNode pdnode,
   let Predicates = [UseAVX] in
     defm V#NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
                                     "\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                                    itin>, VEX_4V;
+                                    itin>, VEX_4V, VEX_WIG;
 
   let Constraints = "$src1 = $dst" in
     defm NAME : sse12_mov_hilo_packed_base<opc, psnode, pdnode, base_opc,
@@ -1125,12 +1065,12 @@ def VMOVLPSmr : VPSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                    "movlps\t{$src, $dst|$dst, $src}",
                    [(store (f64 (extractelt (bc_v2f64 (v4f32 VR128:$src)),
                                  (iPTR 0))), addr:$dst)],
-                                 IIC_SSE_MOV_LH>, VEX;
+                                 IIC_SSE_MOV_LH>, VEX, VEX_WIG;
 def VMOVLPDmr : VPDI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                    "movlpd\t{$src, $dst|$dst, $src}",
                    [(store (f64 (extractelt (v2f64 VR128:$src),
                                  (iPTR 0))), addr:$dst)],
-                                 IIC_SSE_MOV_LH>, VEX;
+                                 IIC_SSE_MOV_LH>, VEX, VEX_WIG;
 }// UseAVX
 def MOVLPSmr : PSI<0x13, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                    "movlps\t{$src, $dst|$dst, $src}",
@@ -1237,12 +1177,12 @@ def VMOVHPSmr : VPSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                    [(store (f64 (extractelt
                                  (X86Unpckh (bc_v2f64 (v4f32 VR128:$src)),
                                             (bc_v2f64 (v4f32 VR128:$src))),
-                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
+                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX, VEX_WIG;
 def VMOVHPDmr : VPDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                    "movhpd\t{$src, $dst|$dst, $src}",
                    [(store (f64 (extractelt
                                  (v2f64 (X86Unpckh VR128:$src, VR128:$src)),
-                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX;
+                                 (iPTR 0))), addr:$dst)], IIC_SSE_MOV_LH>, VEX, VEX_WIG;
 } // UseAVX
 def MOVHPSmr : PSI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
                    "movhps\t{$src, $dst|$dst, $src}",
@@ -1342,14 +1282,14 @@ let AddedComplexity = 20, Predicates = [UseAVX] in {
                       [(set VR128:$dst,
                         (v4f32 (X86Movlhps VR128:$src1, VR128:$src2)))],
                         IIC_SSE_MOV_LH>,
-                      VEX_4V, Sched<[WriteFShuffle]>;
+                      VEX_4V, Sched<[WriteFShuffle]>, VEX_WIG;
   def VMOVHLPSrr : VPSI<0x12, MRMSrcReg, (outs VR128:$dst),
                                        (ins VR128:$src1, VR128:$src2),
                       "movhlps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [(set VR128:$dst,
                         (v4f32 (X86Movhlps VR128:$src1, VR128:$src2)))],
                         IIC_SSE_MOV_LH>,
-                      VEX_4V, Sched<[WriteFShuffle]>;
+                      VEX_4V, Sched<[WriteFShuffle]>, VEX_WIG;
 }
 let Constraints = "$src1 = $dst", AddedComplexity = 20 in {
   def MOVLHPSrr : PSI<0x16, MRMSrcReg, (outs VR128:$dst),
@@ -1724,11 +1664,11 @@ defm CVTSS2SI64 : sse12_cvt_sint<0x2D, VR128, GR64, int_x86_sse_cvtss2si64,
 defm VCVTDQ2PS   : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, loadv2i64,
                                "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                                SSEPackedSingle, SSE_CVT_PS>,
-                               PS, VEX, Requires<[HasAVX, NoVLX]>;
+                               PS, VEX, Requires<[HasAVX, NoVLX]>, VEX_WIG;
 defm VCVTDQ2PSY  : sse12_cvt_p<0x5B, VR256, i256mem, v8f32, v8i32, loadv4i64,
                                "vcvtdq2ps\t{$src, $dst|$dst, $src}",
                                SSEPackedSingle, SSE_CVT_PS>,
-                               PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>;
+                               PS, VEX, VEX_L, Requires<[HasAVX, NoVLX]>, VEX_WIG;
 
 defm CVTDQ2PS : sse12_cvt_p<0x5B, VR128, i128mem, v4f32, v4i32, memopv2i64,
                             "cvtdq2ps\t{$src, $dst|$dst, $src}",
@@ -1776,20 +1716,21 @@ def : InstAlias<"cvtsd2si{q}\t{$src, $dst|$dst, $src}",
 // Convert scalar double to scalar single
 let hasSideEffects = 0, Predicates = [UseAVX] in {
 def VCVTSD2SSrr  : VSDI<0x5A, MRMSrcReg, (outs FR32:$dst),
-                       (ins FR64:$src1, FR64:$src2),
+                       (ins FR32:$src1, FR64:$src2),
                       "cvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}", [],
                       IIC_SSE_CVT_Scalar_RR>, VEX_4V, VEX_LIG,
-                      Sched<[WriteCvtF2F]>;
+                      Sched<[WriteCvtF2F]>, VEX_WIG;
 let mayLoad = 1 in
 def VCVTSD2SSrm  : I<0x5A, MRMSrcMem, (outs FR32:$dst),
-                       (ins FR64:$src1, f64mem:$src2),
+                       (ins FR32:$src1, f64mem:$src2),
                       "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                       [], IIC_SSE_CVT_Scalar_RM>,
                       XD, Requires<[HasAVX, OptForSize]>, VEX_4V, VEX_LIG,
-                      Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+                      Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG;
 }
 
-def : Pat<(f32 (fpround FR64:$src)), (VCVTSD2SSrr FR64:$src, FR64:$src)>,
+def : Pat<(f32 (fpround FR64:$src)), 
+            (VCVTSD2SSrr (COPY_TO_REGCLASS FR64:$src, FR32), FR64:$src)>,
           Requires<[UseAVX]>;
 
 def CVTSD2SSrr  : SDI<0x5A, MRMSrcReg, (outs FR32:$dst), (ins FR64:$src),
@@ -1809,15 +1750,15 @@ def Int_VCVTSD2SSrr: I<0x5A, MRMSrcReg,
                        "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst,
                          (int_x86_sse2_cvtsd2ss VR128:$src1, VR128:$src2))],
-                       IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, Requires<[HasAVX]>,
-                       Sched<[WriteCvtF2F]>;
+                       IIC_SSE_CVT_Scalar_RR>, XD, VEX_4V, VEX_WIG,
+                       Requires<[HasAVX]>, Sched<[WriteCvtF2F]>;
 def Int_VCVTSD2SSrm: I<0x5A, MRMSrcMem,
                        (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
                        "vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                        [(set VR128:$dst, (int_x86_sse2_cvtsd2ss
                                           VR128:$src1, sse_load_f64:$src2))],
-                       IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, Requires<[HasAVX]>,
-                       Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+                       IIC_SSE_CVT_Scalar_RM>, XD, VEX_4V, VEX_WIG,
+                       Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
 
 let Constraints = "$src1 = $dst" in {
 def Int_CVTSD2SSrr: I<0x5A, MRMSrcReg,
@@ -1841,30 +1782,30 @@ def Int_CVTSD2SSrm: I<0x5A, MRMSrcMem,
 // SSE2 instructions with XS prefix
 let hasSideEffects = 0, Predicates = [UseAVX] in {
 def VCVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst),
-                    (ins FR32:$src1, FR32:$src2),
+                    (ins FR64:$src1, FR32:$src2),
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [], IIC_SSE_CVT_Scalar_RR>,
                     XS, Requires<[HasAVX]>, VEX_4V, VEX_LIG,
-                    Sched<[WriteCvtF2F]>;
+                    Sched<[WriteCvtF2F]>, VEX_WIG;
 let mayLoad = 1 in
 def VCVTSS2SDrm : I<0x5A, MRMSrcMem, (outs FR64:$dst),
-                    (ins FR32:$src1, f32mem:$src2),
+                    (ins FR64:$src1, f32mem:$src2),
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [], IIC_SSE_CVT_Scalar_RM>,
                     XS, VEX_4V, VEX_LIG, Requires<[HasAVX, OptForSize]>,
-                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+                    Sched<[WriteCvtF2FLd, ReadAfterLd]>, VEX_WIG;
 }
 
 def : Pat<(f64 (fpextend FR32:$src)),
-    (VCVTSS2SDrr FR32:$src, FR32:$src)>, Requires<[UseAVX]>;
+    (VCVTSS2SDrr (COPY_TO_REGCLASS FR32:$src, FR64), FR32:$src)>, Requires<[UseAVX]>;
 def : Pat<(fpextend (loadf32 addr:$src)),
-    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
+    (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>, Requires<[UseAVX]>;
 
 def : Pat<(extloadf32 addr:$src),
-    (VCVTSS2SDrm (f32 (IMPLICIT_DEF)), addr:$src)>,
+    (VCVTSS2SDrm (f64 (IMPLICIT_DEF)), addr:$src)>,
     Requires<[UseAVX, OptForSize]>;
 def : Pat<(extloadf32 addr:$src),
-    (VCVTSS2SDrr (f32 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
+    (VCVTSS2SDrr (f64 (IMPLICIT_DEF)), (VMOVSSrm addr:$src))>,
     Requires<[UseAVX, OptForSpeed]>;
 
 def CVTSS2SDrr : I<0x5A, MRMSrcReg, (outs FR64:$dst), (ins FR32:$src),
@@ -1894,15 +1835,15 @@ def Int_VCVTSS2SDrr: I<0x5A, MRMSrcReg,
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtss2sd VR128:$src1, VR128:$src2))],
-                    IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, Requires<[HasAVX]>,
-                    Sched<[WriteCvtF2F]>;
+                    IIC_SSE_CVT_Scalar_RR>, XS, VEX_4V, VEX_WIG,
+                    Requires<[HasAVX]>, Sched<[WriteCvtF2F]>;
 def Int_VCVTSS2SDrm: I<0x5A, MRMSrcMem,
                       (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2),
                     "vcvtss2sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                     [(set VR128:$dst,
                       (int_x86_sse2_cvtss2sd VR128:$src1, sse_load_f32:$src2))],
-                    IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, Requires<[HasAVX]>,
-                    Sched<[WriteCvtF2FLd, ReadAfterLd]>;
+                    IIC_SSE_CVT_Scalar_RM>, XS, VEX_4V, VEX_WIG,
+                    Requires<[HasAVX]>, Sched<[WriteCvtF2FLd, ReadAfterLd]>;
 let Constraints = "$src1 = $dst" in { // SSE2 instructions with XS prefix
 def Int_CVTSS2SDrr: I<0x5A, MRMSrcReg,
                       (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
@@ -1998,22 +1939,22 @@ def : Pat<(v4f32 (X86Movss
 def VCVTPS2DQrr : VPDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
-                       IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+                       IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
 def VCVTPS2DQrm : VPDI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvtps2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (int_x86_sse2_cvtps2dq (loadv4f32 addr:$src)))],
-                       IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+                       IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG;
 def VCVTPS2DQYrr : VPDI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                         "cvtps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst,
                           (int_x86_avx_cvt_ps2dq_256 VR256:$src))],
-                        IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+                        IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
 def VCVTPS2DQYrm : VPDI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                         "cvtps2dq\t{$src, $dst|$dst, $src}",
                         [(set VR256:$dst,
                           (int_x86_avx_cvt_ps2dq_256 (loadv8f32 addr:$src)))],
-                        IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+                        IIC_SSE_CVT_PS_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG;
 def CVTPS2DQrr : PDI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                      "cvtps2dq\t{$src, $dst|$dst, $src}",
                      [(set VR128:$dst, (int_x86_sse2_cvtps2dq VR128:$src))],
@@ -2034,7 +1975,7 @@ def VCVTPD2DQrr  : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (X86cvtp2Int (v2f64 VR128:$src))))]>,
-                       VEX, Sched<[WriteCvtF2I]>;
+                       VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
 
 // XMM only
 def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
@@ -2043,7 +1984,7 @@ def VCVTPD2DQrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                       "vcvtpd2dq{x}\t{$src, $dst|$dst, $src}",
                       [(set VR128:$dst,
                         (v4i32 (X86cvtp2Int (loadv2f64 addr:$src))))]>, VEX,
-                      Sched<[WriteCvtF2ILd]>;
+                      Sched<[WriteCvtF2ILd]>, VEX_WIG;
 def : InstAlias<"vcvtpd2dqx\t{$src, $dst|$dst, $src}",
                 (VCVTPD2DQrm VR128:$dst, f128mem:$src), 0>;
 
@@ -2052,12 +1993,12 @@ def VCVTPD2DQYrr : SDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                        "vcvtpd2dq\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (X86cvtp2Int (v4f64 VR256:$src))))]>,
-                       VEX, VEX_L, Sched<[WriteCvtF2I]>;
+                       VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
 def VCVTPD2DQYrm : SDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                        "vcvtpd2dq{y}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
                          (v4i32 (X86cvtp2Int (loadv4f64 addr:$src))))]>,
-                       VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+                       VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG;
 def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
                 (VCVTPD2DQYrr VR128:$dst, VR256:$src), 0>;
 def : InstAlias<"vcvtpd2dqy\t{$src, $dst|$dst, $src}",
@@ -2082,23 +2023,23 @@ def VCVTTPS2DQrr : VS2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
                            (v4i32 (fp_to_sint (v4f32 VR128:$src))))],
-                         IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>;
+                         IIC_SSE_CVT_PS_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
 def VCVTTPS2DQrm : VS2SI<0x5B, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                          "cvttps2dq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
                            (v4i32 (fp_to_sint (loadv4f32 addr:$src))))],
-                         IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+                         IIC_SSE_CVT_PS_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG;
 def VCVTTPS2DQYrr : VS2SI<0x5B, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                           "cvttps2dq\t{$src, $dst|$dst, $src}",
                           [(set VR256:$dst,
                             (v8i32 (fp_to_sint (v8f32 VR256:$src))))],
-                          IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+                          IIC_SSE_CVT_PS_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
 def VCVTTPS2DQYrm : VS2SI<0x5B, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                           "cvttps2dq\t{$src, $dst|$dst, $src}",
                           [(set VR256:$dst,
                             (v8i32 (fp_to_sint (loadv8f32 addr:$src))))],
                           IIC_SSE_CVT_PS_RM>, VEX, VEX_L,
-                          Sched<[WriteCvtF2ILd]>;
+                          Sched<[WriteCvtF2ILd]>, VEX_WIG;
 }
 
 def CVTTPS2DQrr : S2SI<0x5B, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -2117,7 +2058,7 @@ def VCVTTPD2DQrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "cvttpd2dq\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86cvttp2si (v2f64 VR128:$src))))],
-                        IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>;
+                        IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2I]>, VEX_WIG;
 
 // The assembler can recognize rr 256-bit instructions by seeing a ymm
 // register, but the same isn't true when using memory operands instead.
@@ -2131,7 +2072,7 @@ def VCVTTPD2DQrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                         "cvttpd2dq{x}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v4i32 (X86cvttp2si (loadv2f64 addr:$src))))],
-                        IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>;
+                        IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2ILd]>, VEX_WIG;
 def : InstAlias<"vcvttpd2dqx\t{$src, $dst|$dst, $src}",
                 (VCVTTPD2DQrm VR128:$dst, f128mem:$src), 0>;
 
@@ -2141,12 +2082,12 @@ def VCVTTPD2DQYrr : VPDI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                          "cvttpd2dq\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
                            (v4i32 (fp_to_sint (v4f64 VR256:$src))))],
-                         IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>;
+                         IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2I]>, VEX_WIG;
 def VCVTTPD2DQYrm : VPDI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                          "cvttpd2dq{y}\t{$src, $dst|$dst, $src}",
                          [(set VR128:$dst,
                            (v4i32 (fp_to_sint (loadv4f64 addr:$src))))],
-                         IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>;
+                         IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2ILd]>, VEX_WIG;
 }
 def : InstAlias<"vcvttpd2dqy\t{$src, $dst|$dst, $src}",
                 (VCVTTPD2DQYrr VR128:$dst, VR256:$src), 0>;
@@ -2192,19 +2133,19 @@ let Predicates = [HasAVX, NoVLX] in {
 def VCVTPS2PDrr : I<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v2f64 (X86vfpext (v4f32 VR128:$src))))],
-                    IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>;
+                    IIC_SSE_CVT_PD_RR>, PS, VEX, Sched<[WriteCvtF2F]>, VEX_WIG;
 def VCVTPS2PDrm : I<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f64mem:$src),
                     "vcvtps2pd\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v2f64 (extloadv2f32 addr:$src)))],
-                    IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>;
+                    IIC_SSE_CVT_PD_RM>, PS, VEX, Sched<[WriteCvtF2FLd]>, VEX_WIG;
 def VCVTPS2PDYrr : I<0x5A, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                      "vcvtps2pd\t{$src, $dst|$dst, $src}",
                      [(set VR256:$dst, (v4f64 (fpextend (v4f32 VR128:$src))))],
-                     IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+                     IIC_SSE_CVT_PD_RR>, PS, VEX, VEX_L, Sched<[WriteCvtF2F]>, VEX_WIG;
 def VCVTPS2PDYrm : I<0x5A, MRMSrcMem, (outs VR256:$dst), (ins f128mem:$src),
                      "vcvtps2pd\t{$src, $dst|$dst, $src}",
                      [(set VR256:$dst, (v4f64 (extloadv4f32 addr:$src)))],
-                     IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+                     IIC_SSE_CVT_PD_RM>, PS, VEX, VEX_L, Sched<[WriteCvtF2FLd]>, VEX_WIG;
 }
 
 let Predicates = [UseSSE2] in {
@@ -2224,30 +2165,30 @@ let hasSideEffects = 0, mayLoad = 1 in
 def VCVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
-                          (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))]>,
-                        VEX, Sched<[WriteCvtI2FLd]>;
+                          (v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))))]>,
+                        VEX, Sched<[WriteCvtI2FLd]>, VEX_WIG;
 def VCVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst,
                           (v2f64 (X86VSintToFP (v4i32 VR128:$src))))]>,
-                        VEX, Sched<[WriteCvtI2F]>;
+                        VEX, Sched<[WriteCvtI2F]>, VEX_WIG;
 def VCVTDQ2PDYrm  : S2SI<0xE6, MRMSrcMem, (outs VR256:$dst), (ins i128mem:$src),
                          "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (v4f64 (sint_to_fp (bc_v4i32 (loadv2i64 addr:$src)))))]>,
-                         VEX, VEX_L, Sched<[WriteCvtI2FLd]>;
+                         VEX, VEX_L, Sched<[WriteCvtI2FLd]>, VEX_WIG;
 def VCVTDQ2PDYrr  : S2SI<0xE6, MRMSrcReg, (outs VR256:$dst), (ins VR128:$src),
                          "vcvtdq2pd\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst,
                            (v4f64 (sint_to_fp (v4i32 VR128:$src))))]>,
-                         VEX, VEX_L, Sched<[WriteCvtI2F]>;
+                         VEX, VEX_L, Sched<[WriteCvtI2F]>, VEX_WIG;
 }
 
 let hasSideEffects = 0, mayLoad = 1 in
 def CVTDQ2PDrm  : S2SI<0xE6, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                        "cvtdq2pd\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst,
-                         (v2f64 (X86VSintToFP (bc_v4i32 (loadv2i64 addr:$src)))))],
+                         (v2f64 (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload addr:$src))))))],
                        IIC_SSE_CVT_PD_RR>, Sched<[WriteCvtI2FLd]>;
 def CVTDQ2PDrr  : S2SI<0xE6, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtdq2pd\t{$src, $dst|$dst, $src}",
@@ -2275,7 +2216,7 @@ let Predicates = [HasAVX, NoVLX] in
 def VCVTPD2PSrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                        "cvtpd2ps\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (X86vfpround (v2f64 VR128:$src)))],
-                       IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>;
+                       IIC_SSE_CVT_PD_RR>, VEX, Sched<[WriteCvtF2F]>, VEX_WIG;
 
 // XMM only
 def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
@@ -2284,7 +2225,7 @@ let Predicates = [HasAVX, NoVLX] in
 def VCVTPD2PSrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        "cvtpd2ps{x}\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (X86vfpround (loadv2f64 addr:$src)))],
-                       IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>;
+                       IIC_SSE_CVT_PD_RM>, VEX, Sched<[WriteCvtF2FLd]>, VEX_WIG;
 def : InstAlias<"vcvtpd2psx\t{$src, $dst|$dst, $src}",
                 (VCVTPD2PSrm VR128:$dst, f128mem:$src), 0>;
 
@@ -2293,11 +2234,11 @@ let Predicates = [HasAVX, NoVLX] in {
 def VCVTPD2PSYrr : VPDI<0x5A, MRMSrcReg, (outs VR128:$dst), (ins VR256:$src),
                         "cvtpd2ps\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (fpround VR256:$src))],
-                        IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>;
+                        IIC_SSE_CVT_PD_RR>, VEX, VEX_L, Sched<[WriteCvtF2F]>, VEX_WIG;
 def VCVTPD2PSYrm : VPDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f256mem:$src),
                         "cvtpd2ps{y}\t{$src, $dst|$dst, $src}",
                         [(set VR128:$dst, (fpround (loadv4f64 addr:$src)))],
-                        IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>;
+                        IIC_SSE_CVT_PD_RM>, VEX, VEX_L, Sched<[WriteCvtF2FLd]>, VEX_WIG;
 }
 def : InstAlias<"vcvtpd2psy\t{$src, $dst|$dst, $src}",
                 (VCVTPD2PSYrr VR128:$dst, VR256:$src), 0>;
@@ -2367,21 +2308,25 @@ multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
   }
 }
 
+let ExeDomain = SSEPackedSingle in
 defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, AVXCC, X86cmps, f32, loadf32,
                  "cmp${cc}ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
-                 SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG;
+                 SSE_ALU_F32S, i8immZExt5>, XS, VEX_4V, VEX_LIG, VEX_WIG;
+let ExeDomain = SSEPackedDouble in
 defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, AVXCC, X86cmps, f64, loadf64,
                  "cmp${cc}sd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                  "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                  SSE_ALU_F32S, i8immZExt5>, // same latency as 32 bit compare
-                 XD, VEX_4V, VEX_LIG;
+                 XD, VEX_4V, VEX_LIG, VEX_WIG;
 
 let Constraints = "$src1 = $dst" in {
+  let ExeDomain = SSEPackedSingle in
   defm CMPSS : sse12_cmp_scalar<FR32, f32mem, SSECC, X86cmps, f32, loadf32,
                   "cmp${cc}ss\t{$src2, $dst|$dst, $src2}",
                   "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}", SSE_ALU_F32S,
                   i8immZExt3>, XS;
+  let ExeDomain = SSEPackedDouble in
   defm CMPSD : sse12_cmp_scalar<FR64, f64mem, SSECC, X86cmps, f64, loadf64,
                   "cmp${cc}sd\t{$src2, $dst|$dst, $src2}",
                   "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
@@ -2407,18 +2352,22 @@ multiclass sse12_cmp_scalar_int<Operand memop, Operand CC,
 
 let isCodeGenOnly = 1 in {
   // Aliases to match intrinsics which expect XMM operand(s).
+  let ExeDomain = SSEPackedSingle in
   defm Int_VCMPSS  : sse12_cmp_scalar_int<ssmem, AVXCC, int_x86_sse_cmp_ss,
                        "cmp${cc}ss\t{$src, $src1, $dst|$dst, $src1, $src}",
                        SSE_ALU_F32S, i8immZExt5, sse_load_f32>,
                        XS, VEX_4V;
+  let ExeDomain = SSEPackedDouble in
   defm Int_VCMPSD  : sse12_cmp_scalar_int<sdmem, AVXCC, int_x86_sse2_cmp_sd,
                        "cmp${cc}sd\t{$src, $src1, $dst|$dst, $src1, $src}",
                        SSE_ALU_F32S, i8immZExt5, sse_load_f64>, // same latency as f32
                        XD, VEX_4V;
   let Constraints = "$src1 = $dst" in {
+    let ExeDomain = SSEPackedSingle in
     defm Int_CMPSS  : sse12_cmp_scalar_int<ssmem, SSECC, int_x86_sse_cmp_ss,
                          "cmp${cc}ss\t{$src, $dst|$dst, $src}",
                          SSE_ALU_F32S, i8immZExt3, sse_load_f32>, XS;
+    let ExeDomain = SSEPackedDouble in
     defm Int_CMPSD  : sse12_cmp_scalar_int<sdmem, SSECC, int_x86_sse2_cmp_sd,
                          "cmp${cc}sd\t{$src, $dst|$dst, $src}",
                          SSE_ALU_F64S, i8immZExt3, sse_load_f64>,
@@ -2463,26 +2412,26 @@ multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
 
 let Defs = [EFLAGS] in {
   defm VUCOMISS : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
-                                  "ucomiss">, PS, VEX, VEX_LIG;
+                                  "ucomiss">, PS, VEX, VEX_LIG, VEX_WIG;
   defm VUCOMISD : sse12_ord_cmp<0x2E, FR64, X86cmp, f64, f64mem, loadf64,
-                                  "ucomisd">, PD, VEX, VEX_LIG;
+                                  "ucomisd">, PD, VEX, VEX_LIG, VEX_WIG;
   let Pattern = []<dag> in {
     defm VCOMISS  : sse12_ord_cmp<0x2F, FR32, undef, f32, f32mem, loadf32,
-                                    "comiss">, PS, VEX, VEX_LIG;
+                                    "comiss">, PS, VEX, VEX_LIG, VEX_WIG;
     defm VCOMISD  : sse12_ord_cmp<0x2F, FR64, undef, f64, f64mem, loadf64,
-                                    "comisd">, PD, VEX, VEX_LIG;
+                                    "comisd">, PD, VEX, VEX_LIG, VEX_WIG;
   }
 
   let isCodeGenOnly = 1 in {
     defm Int_VUCOMISS  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v4f32, ssmem,
-                              sse_load_f32, "ucomiss">, PS, VEX;
+                              sse_load_f32, "ucomiss">, PS, VEX, VEX_WIG;
     defm Int_VUCOMISD  : sse12_ord_cmp_int<0x2E, VR128, X86ucomi, v2f64, sdmem,
-                              sse_load_f64, "ucomisd">, PD, VEX;
+                              sse_load_f64, "ucomisd">, PD, VEX, VEX_WIG;
 
     defm Int_VCOMISS  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v4f32, ssmem,
-                              sse_load_f32, "comiss">, PS, VEX;
+                              sse_load_f32, "comiss">, PS, VEX, VEX_WIG;
     defm Int_VCOMISD  : sse12_ord_cmp_int<0x2F, VR128, X86comi, v2f64, sdmem,
-                              sse_load_f64, "comisd">, PD, VEX;
+                              sse_load_f64, "comisd">, PD, VEX, VEX_WIG;
   }
   defm UCOMISS  : sse12_ord_cmp<0x2E, FR32, X86cmp, f32, f32mem, loadf32,
                                   "ucomiss">, PS;
@@ -2511,18 +2460,19 @@ let Defs = [EFLAGS] in {
 
 // sse12_cmp_packed - sse 1 & 2 compare packed instructions
 multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
-                            Operand CC, Intrinsic Int, string asm,
+                            Operand CC,  ValueType VT, string asm,
                             string asm_alt, Domain d, ImmLeaf immLeaf,
                             PatFrag ld_frag, OpndItins itins = SSE_ALU_F32P> {
   let isCommutable = 1 in
   def rri : PIi8<0xC2, MRMSrcReg,
              (outs RC:$dst), (ins RC:$src1, RC:$src2, CC:$cc), asm,
-             [(set RC:$dst, (Int RC:$src1, RC:$src2, immLeaf:$cc))],
+             [(set RC:$dst, (VT (X86cmpp RC:$src1, RC:$src2, immLeaf:$cc)))],
              itins.rr, d>,
             Sched<[WriteFAdd]>;
   def rmi : PIi8<0xC2, MRMSrcMem,
              (outs RC:$dst), (ins RC:$src1, x86memop:$src2, CC:$cc), asm,
-             [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2), immLeaf:$cc))],
+             [(set RC:$dst,
+               (VT (X86cmpp RC:$src1, (ld_frag addr:$src2), immLeaf:$cc)))],
              itins.rm, d>,
             Sched<[WriteFAddLd, ReadAfterLd]>;
 
@@ -2539,67 +2489,33 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
   }
 }
 
-defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse_cmp_ps,
+defm VCMPPS : sse12_cmp_packed<VR128, f128mem, AVXCC, v4f32,
                "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
-               SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V;
-defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, int_x86_sse2_cmp_pd,
+               SSEPackedSingle, i8immZExt5, loadv4f32>, PS, VEX_4V, VEX_WIG;
+defm VCMPPD : sse12_cmp_packed<VR128, f128mem, AVXCC, v2f64,
                "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
-               SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V;
-defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_ps_256,
+               SSEPackedDouble, i8immZExt5, loadv2f64>, PD, VEX_4V, VEX_WIG;
+defm VCMPPSY : sse12_cmp_packed<VR256, f256mem, AVXCC, v8f32,
                "cmp${cc}ps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                "cmpps\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SSEPackedSingle, i8immZExt5, loadv8f32>, PS, VEX_4V, VEX_L;
-defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, int_x86_avx_cmp_pd_256,
+defm VCMPPDY : sse12_cmp_packed<VR256, f256mem, AVXCC, v4f64,
                "cmp${cc}pd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
                "cmppd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
                SSEPackedDouble, i8immZExt5, loadv4f64>, PD, VEX_4V, VEX_L;
 let Constraints = "$src1 = $dst" in {
-  defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse_cmp_ps,
+  defm CMPPS : sse12_cmp_packed<VR128, f128mem, SSECC, v4f32,
                  "cmp${cc}ps\t{$src2, $dst|$dst, $src2}",
                  "cmpps\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SSEPackedSingle, i8immZExt5, memopv4f32, SSE_ALU_F32P>, PS;
-  defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, int_x86_sse2_cmp_pd,
+  defm CMPPD : sse12_cmp_packed<VR128, f128mem, SSECC, v2f64,
                  "cmp${cc}pd\t{$src2, $dst|$dst, $src2}",
                  "cmppd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
                  SSEPackedDouble, i8immZExt5, memopv2f64, SSE_ALU_F64P>, PD;
 }
 
-let Predicates = [HasAVX] in {
-def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
-          (VCMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
-def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (loadv4f32 addr:$src2), imm:$cc)),
-          (VCMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
-def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
-          (VCMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
-def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (loadv2f64 addr:$src2), imm:$cc)),
-          (VCMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
-
-def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), VR256:$src2, imm:$cc)),
-          (VCMPPSYrri (v8f32 VR256:$src1), (v8f32 VR256:$src2), imm:$cc)>;
-def : Pat<(v8f32 (X86cmpp (v8f32 VR256:$src1), (loadv8f32 addr:$src2), imm:$cc)),
-          (VCMPPSYrmi (v8f32 VR256:$src1), addr:$src2, imm:$cc)>;
-def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), VR256:$src2, imm:$cc)),
-          (VCMPPDYrri VR256:$src1, VR256:$src2, imm:$cc)>;
-def : Pat<(v4f64 (X86cmpp (v4f64 VR256:$src1), (loadv4f64 addr:$src2), imm:$cc)),
-          (VCMPPDYrmi VR256:$src1, addr:$src2, imm:$cc)>;
-}
-
-let Predicates = [UseSSE1] in {
-def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), VR128:$src2, imm:$cc)),
-          (CMPPSrri (v4f32 VR128:$src1), (v4f32 VR128:$src2), imm:$cc)>;
-def : Pat<(v4f32 (X86cmpp (v4f32 VR128:$src1), (memopv4f32 addr:$src2), imm:$cc)),
-          (CMPPSrmi (v4f32 VR128:$src1), addr:$src2, imm:$cc)>;
-}
-
-let Predicates = [UseSSE2] in {
-def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), VR128:$src2, imm:$cc)),
-          (CMPPDrri VR128:$src1, VR128:$src2, imm:$cc)>;
-def : Pat<(v2f64 (X86cmpp (v2f64 VR128:$src1), (memopv2f64 addr:$src2), imm:$cc)),
-          (CMPPDrmi VR128:$src1, addr:$src2, imm:$cc)>;
-}
-
 //===----------------------------------------------------------------------===//
 // SSE 1 & 2 - Shuffle Instructions
 //===----------------------------------------------------------------------===//
@@ -2623,16 +2539,16 @@ multiclass sse12_shuffle<RegisterClass RC, X86MemOperand x86memop,
 let Predicates = [HasAVX, NoVLX] in {
   defm VSHUFPS  : sse12_shuffle<VR128, f128mem, v4f32,
            "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-           loadv4f32, SSEPackedSingle>, PS, VEX_4V;
+           loadv4f32, SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
   defm VSHUFPSY : sse12_shuffle<VR256, f256mem, v8f32,
            "shufps\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-           loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L;
+           loadv8f32, SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
   defm VSHUFPD  : sse12_shuffle<VR128, f128mem, v2f64,
            "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-           loadv2f64, SSEPackedDouble>, PD, VEX_4V;
+           loadv2f64, SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
   defm VSHUFPDY : sse12_shuffle<VR256, f256mem, v4f64,
            "shufpd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
-           loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L;
+           loadv4f64, SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
 }
 let Constraints = "$src1 = $dst" in {
   defm SHUFPS : sse12_shuffle<VR128, f128mem, v4f32,
@@ -2714,29 +2630,29 @@ multiclass sse12_unpack_interleave<bits<8> opc, SDNode OpNode, ValueType vt,
 let Predicates = [HasAVX, NoVLX] in {
 defm VUNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, loadv4f32,
       VR128, f128mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                     SSEPackedSingle>, PS, VEX_4V;
+                     SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
 defm VUNPCKHPD: sse12_unpack_interleave<0x15, X86Unpckh, v2f64, loadv2f64,
       VR128, f128mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                     SSEPackedDouble>, PD, VEX_4V;
+                     SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
 defm VUNPCKLPS: sse12_unpack_interleave<0x14, X86Unpckl, v4f32, loadv4f32,
       VR128, f128mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                     SSEPackedSingle>, PS, VEX_4V;
+                     SSEPackedSingle>, PS, VEX_4V, VEX_WIG;
 defm VUNPCKLPD: sse12_unpack_interleave<0x14, X86Unpckl, v2f64, loadv2f64,
       VR128, f128mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                     SSEPackedDouble>, PD, VEX_4V;
+                     SSEPackedDouble>, PD, VEX_4V, VEX_WIG;
 
 defm VUNPCKHPSY: sse12_unpack_interleave<0x15, X86Unpckh, v8f32, loadv8f32,
       VR256, f256mem, "unpckhps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                     SSEPackedSingle>, PS, VEX_4V, VEX_L;
+                     SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
 defm VUNPCKHPDY: sse12_unpack_interleave<0x15, X86Unpckh, v4f64, loadv4f64,
       VR256, f256mem, "unpckhpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                     SSEPackedDouble>, PD, VEX_4V, VEX_L;
+                     SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
 defm VUNPCKLPSY: sse12_unpack_interleave<0x14, X86Unpckl, v8f32, loadv8f32,
       VR256, f256mem, "unpcklps\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                     SSEPackedSingle>, PS, VEX_4V, VEX_L;
+                     SSEPackedSingle>, PS, VEX_4V, VEX_L, VEX_WIG;
 defm VUNPCKLPDY: sse12_unpack_interleave<0x14, X86Unpckl, v4f64, loadv4f64,
       VR256, f256mem, "unpcklpd\t{$src2, $src1, $dst|$dst, $src1, $src2}",
-                     SSEPackedDouble>, PD, VEX_4V, VEX_L;
+                     SSEPackedDouble>, PD, VEX_4V, VEX_L, VEX_WIG;
 }// Predicates = [HasAVX, NoVLX]
 let Constraints = "$src1 = $dst" in {
   defm UNPCKHPS: sse12_unpack_interleave<0x15, X86Unpckh, v4f32, memopv4f32,
@@ -2788,13 +2704,13 @@ multiclass sse12_extr_sign_mask<RegisterClass RC, ValueType vt,
 
 let Predicates = [HasAVX] in {
   defm VMOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
-                                        SSEPackedSingle>, PS, VEX;
+                                        SSEPackedSingle>, PS, VEX, VEX_WIG;
   defm VMOVMSKPD : sse12_extr_sign_mask<VR128, v2f64, "movmskpd",
-                                        SSEPackedDouble>, PD, VEX;
+                                        SSEPackedDouble>, PD, VEX, VEX_WIG;
   defm VMOVMSKPSY : sse12_extr_sign_mask<VR256, v8f32, "movmskps",
-                                         SSEPackedSingle>, PS, VEX, VEX_L;
+                                         SSEPackedSingle>, PS, VEX, VEX_L, VEX_WIG;
   defm VMOVMSKPDY : sse12_extr_sign_mask<VR256, v4f64, "movmskpd",
-                                         SSEPackedDouble>, PD, VEX, VEX_L;
+                                         SSEPackedDouble>, PD, VEX, VEX_L, VEX_WIG;
 }
 
 defm MOVMSKPS : sse12_extr_sign_mask<VR128, v4f32, "movmskps",
@@ -2838,7 +2754,7 @@ multiclass PDI_binop_all<bits<8> opc, string OpcodeStr, SDNode Opcode,
                          OpndItins itins, bit IsCommutable = 0, Predicate prd> {
 let Predicates = [HasAVX, prd] in
   defm V#NAME : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode, OpVT128,
-                    VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V;
+                    VR128, loadv2i64, i128mem, itins, IsCommutable, 0>, VEX_4V, VEX_WIG;
 
 let Constraints = "$src1 = $dst" in
   defm NAME : PDI_binop_rm<opc, OpcodeStr, Opcode, OpVT128, VR128,
@@ -2847,7 +2763,7 @@ let Constraints = "$src1 = $dst" in
 let Predicates = [HasAVX2, prd] in
   defm V#NAME#Y : PDI_binop_rm<opc, !strconcat("v", OpcodeStr), Opcode,
                                OpVT256, VR256, loadv4i64, i256mem, itins,
-                               IsCommutable, 0>, VEX_4V, VEX_L;
+                               IsCommutable, 0>, VEX_4V, VEX_L, VEX_WIG;
 }
 
 // These are ordered here for pattern ordering requirements with the fp versions
@@ -2875,7 +2791,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
         [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
                                   (bc_v4i64 (v8f32 VR256:$src2))))],
         [(set VR256:$dst, (OpNode (bc_v4i64 (v8f32 VR256:$src1)),
-                           (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L;
+                           (loadv4i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_L, VEX_WIG;
 
   defm V#NAME#PDY : sse12_fp_packed_logical_rm<opc, VR256, SSEPackedDouble,
         !strconcat(OpcodeStr, "pd"), f256mem,
@@ -2883,14 +2799,14 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                   (bc_v4i64 (v4f64 VR256:$src2))))],
         [(set VR256:$dst, (OpNode (bc_v4i64 (v4f64 VR256:$src1)),
                                   (loadv4i64 addr:$src2)))], 0>,
-                                  PD, VEX_4V, VEX_L;
+                                  PD, VEX_4V, VEX_L, VEX_WIG;
 
   defm V#NAME#PS : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedSingle,
        !strconcat(OpcodeStr, "ps"), f128mem,
        [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
                                  (bc_v2i64 (v4f32 VR128:$src2))))],
        [(set VR128:$dst, (OpNode (bc_v2i64 (v4f32 VR128:$src1)),
-                                 (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V;
+                                 (loadv2i64 addr:$src2)))], 0>, PS, VEX_4V, VEX_WIG;
 
   defm V#NAME#PD : sse12_fp_packed_logical_rm<opc, VR128, SSEPackedDouble,
        !strconcat(OpcodeStr, "pd"), f128mem,
@@ -2898,7 +2814,7 @@ multiclass sse12_fp_packed_logical<bits<8> opc, string OpcodeStr,
                                  (bc_v2i64 (v2f64 VR128:$src2))))],
        [(set VR128:$dst, (OpNode (bc_v2i64 (v2f64 VR128:$src1)),
                                  (loadv2i64 addr:$src2)))], 0>,
-                                                 PD, VEX_4V;
+                                                 PD, VEX_4V, VEX_WIG;
   }
 
   let Constraints = "$src1 = $dst" in {
@@ -3064,17 +2980,17 @@ multiclass basic_sse12_fp_binop_p<bits<8> opc, string OpcodeStr,
   let Predicates = [HasAVX, NoVLX] in {
   defm V#NAME#PS : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"), OpNode,
                                VR128, v4f32, f128mem, loadv4f32,
-                               SSEPackedSingle, itins.s, 0>, PS, VEX_4V;
+                               SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_WIG;
   defm V#NAME#PD : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"), OpNode,
                                VR128, v2f64, f128mem, loadv2f64,
-                               SSEPackedDouble, itins.d, 0>, PD, VEX_4V;
+                               SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_WIG;
 
   defm V#NAME#PSY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "ps"),
                         OpNode, VR256, v8f32, f256mem, loadv8f32,
-                        SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L;
+                        SSEPackedSingle, itins.s, 0>, PS, VEX_4V, VEX_L, VEX_WIG;
   defm V#NAME#PDY : sse12_fp_packed<opc, !strconcat(OpcodeStr, "pd"),
                         OpNode, VR256, v4f64, f256mem, loadv4f64,
-                        SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L;
+                        SSEPackedDouble, itins.d, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
   }
 
   let Constraints = "$src1 = $dst" in {
@@ -3091,10 +3007,10 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                                   SizeItins itins> {
   defm V#NAME#SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
                          OpNode, FR32, f32mem, SSEPackedSingle, itins.s, 0>,
-                         XS, VEX_4V, VEX_LIG;
+                         XS, VEX_4V, VEX_LIG, VEX_WIG;
   defm V#NAME#SD : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "sd"),
                          OpNode, FR64, f64mem, SSEPackedDouble, itins.d, 0>,
-                         XD, VEX_4V, VEX_LIG;
+                         XD, VEX_4V, VEX_LIG, VEX_WIG;
 
   let Constraints = "$src1 = $dst" in {
     defm SS : sse12_fp_scalar<opc, !strconcat(OpcodeStr, "ss"),
@@ -3107,21 +3023,20 @@ multiclass basic_sse12_fp_binop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
 }
 
 multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
-                                      SDPatternOperator IntSS,
-                                      SDPatternOperator IntSD,
+                                      SDPatternOperator OpNode,
                                       SizeItins itins> {
-  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128,
+  defm V#NAME#SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
                    !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
-                   SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG;
-  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128,
+                   SSEPackedSingle, itins.s, 0>, XS, VEX_4V, VEX_LIG, VEX_WIG;
+  defm V#NAME#SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
                    !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
-                   SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG;
+                   SSEPackedDouble, itins.d, 0>, XD, VEX_4V, VEX_LIG, VEX_WIG;
 
   let Constraints = "$src1 = $dst" in {
-    defm SS : sse12_fp_scalar_int<opc, OpcodeStr, IntSS, VR128,
+    defm SS : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v4f32,
                    !strconcat(OpcodeStr, "ss"), ssmem, sse_load_f32,
                    SSEPackedSingle, itins.s>, XS;
-    defm SD : sse12_fp_scalar_int<opc, OpcodeStr, IntSD, VR128,
+    defm SD : sse12_fp_scalar_int<opc, OpcodeStr, OpNode, VR128, v2f64,
                    !strconcat(OpcodeStr, "sd"), sdmem, sse_load_f64,
                    SSEPackedDouble, itins.d>, XD;
   }
@@ -3130,29 +3045,23 @@ multiclass basic_sse12_fp_binop_s_int<bits<8> opc, string OpcodeStr,
 // Binary Arithmetic instructions
 defm ADD : basic_sse12_fp_binop_p<0x58, "add", fadd, SSE_ALU_ITINS_P>,
            basic_sse12_fp_binop_s<0x58, "add", fadd, SSE_ALU_ITINS_S>,
-           basic_sse12_fp_binop_s_int<0x58, "add", null_frag, null_frag,
-                                      SSE_ALU_ITINS_S>;
+           basic_sse12_fp_binop_s_int<0x58, "add", null_frag, SSE_ALU_ITINS_S>;
 defm MUL : basic_sse12_fp_binop_p<0x59, "mul", fmul, SSE_MUL_ITINS_P>,
            basic_sse12_fp_binop_s<0x59, "mul", fmul, SSE_MUL_ITINS_S>,
-           basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, null_frag,
-                                      SSE_MUL_ITINS_S>;
+           basic_sse12_fp_binop_s_int<0x59, "mul", null_frag, SSE_MUL_ITINS_S>;
 let isCommutable = 0 in {
   defm SUB : basic_sse12_fp_binop_p<0x5C, "sub", fsub, SSE_ALU_ITINS_P>,
              basic_sse12_fp_binop_s<0x5C, "sub", fsub, SSE_ALU_ITINS_S>,
-             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag, null_frag,
-                                        SSE_ALU_ITINS_S>;
+             basic_sse12_fp_binop_s_int<0x5C, "sub", null_frag,SSE_ALU_ITINS_S>;
   defm DIV : basic_sse12_fp_binop_p<0x5E, "div", fdiv, SSE_DIV_ITINS_P>,
              basic_sse12_fp_binop_s<0x5E, "div", fdiv, SSE_DIV_ITINS_S>,
-             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag, null_frag,
-                                        SSE_DIV_ITINS_S>;
+             basic_sse12_fp_binop_s_int<0x5E, "div", null_frag,SSE_DIV_ITINS_S>;
   defm MAX : basic_sse12_fp_binop_p<0x5F, "max", X86fmax, SSE_ALU_ITINS_P>,
              basic_sse12_fp_binop_s<0x5F, "max", X86fmax, SSE_ALU_ITINS_S>,
-             basic_sse12_fp_binop_s_int<0x5F, "max", int_x86_sse_max_ss,
-                                        int_x86_sse2_max_sd, SSE_ALU_ITINS_S>;
+             basic_sse12_fp_binop_s_int<0x5F, "max", X86fmaxs, SSE_ALU_ITINS_S>;
   defm MIN : basic_sse12_fp_binop_p<0x5D, "min", X86fmin, SSE_ALU_ITINS_P>,
              basic_sse12_fp_binop_s<0x5D, "min", X86fmin, SSE_ALU_ITINS_S>,
-             basic_sse12_fp_binop_s_int<0x5D, "min", int_x86_sse_min_ss,
-                                        int_x86_sse2_min_sd, SSE_ALU_ITINS_S>;
+             basic_sse12_fp_binop_s_int<0x5D, "min", X86fmins, SSE_ALU_ITINS_S>;
 }
 
 let isCodeGenOnly = 1 in {
@@ -3399,7 +3308,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
             Sched<[itins.Sched.Folded, ReadAfterLd]>,
             Requires<[target, OptForSize]>;
 
-  let isCodeGenOnly = 1, Constraints = "$src1 = $dst" in {
+  let isCodeGenOnly = 1, Constraints = "$src1 = $dst", ExeDomain = d in {
   def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2),
               !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
             []>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
@@ -3443,7 +3352,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
   def m : I<opc, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
             [], itins.rm, d>, Sched<[itins.Sched.Folded, ReadAfterLd]>;
-  let isCodeGenOnly = 1 in {
+  let isCodeGenOnly = 1, ExeDomain = d in {
   def r_Int : I<opc, MRMSrcReg, (outs VR128:$dst),
                 (ins VR128:$src1, VR128:$src2),
              !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
@@ -3464,7 +3373,7 @@ multiclass avx_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
   // which has a clobber before the rcp, vs.
   // vrcpss mem, %xmm0, %xmm0
   // TODO: In theory, we could fold the load, and avoid the stall caused by
-  // the partial register store, either in ExeDepFix or with smarter RA.
+  // the partial register store, either in ExecutionDepsFix or with smarter RA.
   let Predicates = [UseAVX] in {
    def : Pat<(OpNode RC:$src),  (!cast<Instruction>("V"#NAME#Suffix##r)
                                 (ScalarVT (IMPLICIT_DEF)), RC:$src)>;
@@ -3494,22 +3403,22 @@ let Predicates = prds in {
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR128:$dst, (v4f32 (OpNode VR128:$src)))],
-                       itins.rr>, VEX, Sched<[itins.Sched]>;
+                       itins.rr>, VEX, Sched<[itins.Sched]>, VEX_WIG;
   def V#NAME#PSm : PSI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "ps\t{$src, $dst|$dst, $src}"),
                        [(set VR128:$dst, (OpNode (loadv4f32 addr:$src)))],
-                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>, VEX_WIG;
   def V#NAME#PSYr : PSI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                         !strconcat("v", OpcodeStr,
                                    "ps\t{$src, $dst|$dst, $src}"),
                         [(set VR256:$dst, (v8f32 (OpNode VR256:$src)))],
-                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>, VEX_WIG;
   def V#NAME#PSYm : PSI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                         !strconcat("v", OpcodeStr,
                                    "ps\t{$src, $dst|$dst, $src}"),
                         [(set VR256:$dst, (OpNode (loadv8f32 addr:$src)))],
-                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>, VEX_WIG;
 }
 
   def PSr : PSI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -3530,22 +3439,22 @@ let Predicates = [HasAVX] in {
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR128:$dst, (v2f64 (OpNode VR128:$src)))],
-                       itins.rr>, VEX, Sched<[itins.Sched]>;
+                       itins.rr>, VEX, Sched<[itins.Sched]>, VEX_WIG;
   def V#NAME#PDm : PDI<opc, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
                        !strconcat("v", OpcodeStr,
                                   "pd\t{$src, $dst|$dst, $src}"),
                        [(set VR128:$dst, (OpNode (loadv2f64 addr:$src)))],
-                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>;
+                       itins.rm>, VEX, Sched<[itins.Sched.Folded]>, VEX_WIG;
   def V#NAME#PDYr : PDI<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                         !strconcat("v", OpcodeStr,
                                    "pd\t{$src, $dst|$dst, $src}"),
                         [(set VR256:$dst, (v4f64 (OpNode VR256:$src)))],
-                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>;
+                        itins.rr>, VEX, VEX_L, Sched<[itins.Sched]>, VEX_WIG;
   def V#NAME#PDYm : PDI<opc, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
                         !strconcat("v", OpcodeStr,
                                    "pd\t{$src, $dst|$dst, $src}"),
                         [(set VR256:$dst, (OpNode (loadv4f64 addr:$src)))],
-                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>;
+                        itins.rm>, VEX, VEX_L, Sched<[itins.Sched.Folded]>, VEX_WIG;
 }
 
   def PDr : PDI<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
@@ -3566,7 +3475,7 @@ multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
   defm V#NAME#SS  : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, v4f32, f32,
                       f32mem,
                       !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss), OpNode,
-                      SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG;
+                      SSEPackedSingle, itins, "SS">, XS, VEX_4V, VEX_LIG, VEX_WIG;
 }
 
 multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
@@ -3578,7 +3487,7 @@ multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
                          f64mem,
                          !cast<Intrinsic>("int_x86_sse2_"##OpcodeStr##_sd),
                          OpNode, SSEPackedDouble, itins, "SD">,
-                         XD, VEX_4V, VEX_LIG;
+                         XD, VEX_4V, VEX_LIG, VEX_WIG;
 }
 
 // Square root.
@@ -3646,41 +3555,41 @@ def VMOVNTPSmr : VPSI<0x2B, MRMDestMem, (outs),
                      "movntps\t{$src, $dst|$dst, $src}",
                      [(alignednontemporalstore (v4f32 VR128:$src),
                                                addr:$dst)],
-                                               IIC_SSE_MOVNT>, VEX;
+                                               IIC_SSE_MOVNT>, VEX, VEX_WIG;
 def VMOVNTPDmr : VPDI<0x2B, MRMDestMem, (outs),
                      (ins f128mem:$dst, VR128:$src),
                      "movntpd\t{$src, $dst|$dst, $src}",
                      [(alignednontemporalstore (v2f64 VR128:$src),
                                                addr:$dst)],
-                                               IIC_SSE_MOVNT>, VEX;
+                                               IIC_SSE_MOVNT>, VEX, VEX_WIG;
 
 let ExeDomain = SSEPackedInt in
 def VMOVNTDQmr    : VPDI<0xE7, MRMDestMem, (outs),
-                         (ins f128mem:$dst, VR128:$src),
+                         (ins i128mem:$dst, VR128:$src),
                          "movntdq\t{$src, $dst|$dst, $src}",
                          [(alignednontemporalstore (v2i64 VR128:$src),
                                                    addr:$dst)],
-                                                   IIC_SSE_MOVNT>, VEX;
+                                                   IIC_SSE_MOVNT>, VEX, VEX_WIG;
 
 def VMOVNTPSYmr : VPSI<0x2B, MRMDestMem, (outs),
                      (ins f256mem:$dst, VR256:$src),
                      "movntps\t{$src, $dst|$dst, $src}",
                      [(alignednontemporalstore (v8f32 VR256:$src),
                                                addr:$dst)],
-                                               IIC_SSE_MOVNT>, VEX, VEX_L;
+                                               IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
 def VMOVNTPDYmr : VPDI<0x2B, MRMDestMem, (outs),
                      (ins f256mem:$dst, VR256:$src),
                      "movntpd\t{$src, $dst|$dst, $src}",
                      [(alignednontemporalstore (v4f64 VR256:$src),
                                                addr:$dst)],
-                                               IIC_SSE_MOVNT>, VEX, VEX_L;
+                                               IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
 let ExeDomain = SSEPackedInt in
 def VMOVNTDQYmr : VPDI<0xE7, MRMDestMem, (outs),
-                    (ins f256mem:$dst, VR256:$src),
+                    (ins i256mem:$dst, VR256:$src),
                     "movntdq\t{$src, $dst|$dst, $src}",
                     [(alignednontemporalstore (v4i64 VR256:$src),
                                               addr:$dst)],
-                                              IIC_SSE_MOVNT>, VEX, VEX_L;
+                                              IIC_SSE_MOVNT>, VEX, VEX_L, VEX_WIG;
 }
 
 def MOVNTPSmr : PSI<0x2B, MRMDestMem, (outs), (ins f128mem:$dst, VR128:$src),
@@ -3796,20 +3705,18 @@ def : Pat<(X86MFence), (MFENCE)>;
 //===----------------------------------------------------------------------===//
 
 def VLDMXCSR : VPSI<0xAE, MRM2m, (outs), (ins i32mem:$src),
-                  "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
-                  IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>;
+               "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+               IIC_SSE_LDMXCSR>, VEX, Sched<[WriteLoad]>, VEX_WIG;
 def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
-                  "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
-                  IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>;
+               "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+               IIC_SSE_STMXCSR>, VEX, Sched<[WriteStore]>, VEX_WIG;
 
-let Predicates = [UseSSE1] in {
 def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
-                "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
-                IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>;
+              "ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)],
+              IIC_SSE_LDMXCSR>, TB, Sched<[WriteLoad]>;
 def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
-                "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
-                IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>;
-}
+              "stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)],
+              IIC_SSE_STMXCSR>, TB, Sched<[WriteStore]>;
 
 //===---------------------------------------------------------------------===//
 // SSE2 - Move Aligned/Unaligned Packed Integer Instructions
@@ -3820,16 +3727,16 @@ let ExeDomain = SSEPackedInt in { // SSE integer instructions
 let hasSideEffects = 0, SchedRW = [WriteMove] in {
 def VMOVDQArr  : VPDI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
-                    VEX;
+                    VEX, VEX_WIG;
 def VMOVDQAYrr : VPDI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RR>,
-                    VEX, VEX_L;
+                    VEX, VEX_L, VEX_WIG;
 def VMOVDQUrr  : VSSI<0x6F, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                     "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
-                    VEX;
+                    VEX, VEX_WIG;
 def VMOVDQUYrr : VSSI<0x6F, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src),
                     "movdqu\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVU_P_RR>,
-                    VEX, VEX_L;
+                    VEX, VEX_L, VEX_WIG;
 }
 
 // For Disassembler
@@ -3838,54 +3745,58 @@ let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
 def VMOVDQArr_REV  : VPDI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movdqa\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVA_P_RR>,
-                        VEX;
+                        VEX, VEX_WIG;
 def VMOVDQAYrr_REV : VPDI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                         "movdqa\t{$src, $dst|$dst, $src}", [],
-                        IIC_SSE_MOVA_P_RR>, VEX, VEX_L;
+                        IIC_SSE_MOVA_P_RR>, VEX, VEX_L, VEX_WIG;
 def VMOVDQUrr_REV  : VSSI<0x7F, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                         "movdqu\t{$src, $dst|$dst, $src}", [],
                         IIC_SSE_MOVU_P_RR>,
-                        VEX;
+                        VEX, VEX_WIG;
 def VMOVDQUYrr_REV : VSSI<0x7F, MRMDestReg, (outs VR256:$dst), (ins VR256:$src),
                         "movdqu\t{$src, $dst|$dst, $src}", [],
-                        IIC_SSE_MOVU_P_RR>, VEX, VEX_L;
+                        IIC_SSE_MOVU_P_RR>, VEX, VEX_L, VEX_WIG;
 }
 
 let canFoldAsLoad = 1, mayLoad = 1, isReMaterializable = 1,
     hasSideEffects = 0, SchedRW = [WriteLoad] in {
+let Predicates = [HasAVX,NoVLX] in
 def VMOVDQArm  : VPDI<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
-                   "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
-                   VEX;
+                   "movdqa\t{$src, $dst|$dst, $src}",
+                   [(set VR128:$dst, (alignedloadv2i64 addr:$src))],
+                   IIC_SSE_MOVA_P_RM>, VEX, VEX_WIG;
 def VMOVDQAYrm : VPDI<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                    "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_RM>,
-                   VEX, VEX_L;
-let Predicates = [HasAVX] in {
-  def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
-                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
-                    XS, VEX;
-  def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
-                    "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
-                    XS, VEX, VEX_L;
-}
+                   VEX, VEX_L, VEX_WIG;
+let Predicates = [HasAVX,NoVLX] in
+def VMOVDQUrm  : I<0x6F, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
+                  "vmovdqu\t{$src, $dst|$dst, $src}",
+                  [(set VR128:$dst, (loadv2i64 addr:$src))],
+                  IIC_SSE_MOVU_P_RM>, XS, VEX, VEX_WIG;
+def VMOVDQUYrm : I<0x6F, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
+                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_RM>,
+                  XS, VEX, VEX_L, VEX_WIG;
 }
 
 let mayStore = 1, hasSideEffects = 0, SchedRW = [WriteStore] in {
+let Predicates = [HasAVX,NoVLX] in
 def VMOVDQAmr  : VPDI<0x7F, MRMDestMem, (outs),
                      (ins i128mem:$dst, VR128:$src),
-                     "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
-                     VEX;
+                     "movdqa\t{$src, $dst|$dst, $src}",
+                     [(alignedstore (v2i64 VR128:$src), addr:$dst)],
+                     IIC_SSE_MOVA_P_MR>, VEX, VEX_WIG;
 def VMOVDQAYmr : VPDI<0x7F, MRMDestMem, (outs),
                      (ins i256mem:$dst, VR256:$src),
                      "movdqa\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVA_P_MR>,
-                     VEX, VEX_L;
-let Predicates = [HasAVX] in {
+                     VEX, VEX_L, VEX_WIG;
+let Predicates = [HasAVX,NoVLX] in
 def VMOVDQUmr  : I<0x7F, MRMDestMem, (outs), (ins i128mem:$dst, VR128:$src),
-                  "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
-                  XS, VEX;
+                  "vmovdqu\t{$src, $dst|$dst, $src}",
+                  [(store (v2i64 VR128:$src), addr:$dst)], IIC_SSE_MOVU_P_MR>,
+                  XS, VEX, VEX_WIG;
 def VMOVDQUYmr : I<0x7F, MRMDestMem, (outs), (ins i256mem:$dst, VR256:$src),
                   "vmovdqu\t{$src, $dst|$dst, $src}",[], IIC_SSE_MOVU_P_MR>,
-                  XS, VEX, VEX_L;
-}
+                  XS, VEX, VEX_L, VEX_WIG;
 }
 
 let SchedRW = [WriteMove] in {
@@ -3948,6 +3859,50 @@ def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
 def : InstAlias<"vmovdqu\t{$src, $dst|$dst, $src}",
                 (VMOVDQUYrr_REV VR256L:$dst, VR256H:$src), 0>;
 
+let Predicates = [HasAVX, NoVLX] in {
+  // Aligned/unaligned store patterns for v4i32, v8i16 and v16i8 vectors.
+  def : Pat<(alignedstore (v4i32 VR128:$src), addr:$dst),
+            (VMOVDQAmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v8i16 VR128:$src), addr:$dst),
+            (VMOVDQAmr addr:$dst, VR128:$src)>;
+  def : Pat<(alignedstore (v16i8 VR128:$src), addr:$dst),
+            (VMOVDQAmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v4i32 VR128:$src), addr:$dst),
+            (VMOVDQUmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v8i16 VR128:$src), addr:$dst),
+            (VMOVDQUmr addr:$dst, VR128:$src)>;
+  def : Pat<(store (v16i8 VR128:$src), addr:$dst),
+            (VMOVDQUmr addr:$dst, VR128:$src)>;
+
+  // Special patterns for storing subvector extracts of the lower 128 bits.
+  // It's cheaper to just use VMOVDQA/VMOVDQU instead of VEXTRACTF128mr.
+  def : Pat<(alignedstore (v2i64 (extract_subvector
+                                  (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVDQAmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v4i32 (extract_subvector
+                                  (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVDQAmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v8i16 (extract_subvector
+                                  (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVDQAmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(alignedstore (v16i8 (extract_subvector
+                                  (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVDQAmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+
+  def : Pat<(store (v2i64 (extract_subvector
+                           (v4i64 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVDQUmr addr:$dst, (v2i64 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v4i32 (extract_subvector
+                           (v8i32 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVDQUmr addr:$dst, (v4i32 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v8i16 (extract_subvector
+                           (v16i16 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVDQUmr addr:$dst, (v8i16 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+  def : Pat<(store (v16i8 (extract_subvector
+                           (v32i8 VR256:$src), (iPTR 0))), addr:$dst),
+            (VMOVDQUmr addr:$dst, (v16i8 (EXTRACT_SUBREG VR256:$src,sub_xmm)))>;
+}
+
 //===---------------------------------------------------------------------===//
 // SSE2 - Packed Integer Arithmetic Instructions
 //===---------------------------------------------------------------------===//
@@ -4036,12 +3991,12 @@ defm PAVGW   : PDI_binop_all<0xE3, "pavgw", X86avg, v8i16, v16i16,
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
 defm VPMADDWD : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
-                              loadv2i64, i128mem, SSE_PMADD, 0>, VEX_4V;
+                              loadv2i64, i128mem, SSE_PMADD, 0>, VEX_4V, VEX_WIG;
 
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
 defm VPMADDWDY : PDI_binop_rm2<0xF5, "vpmaddwd", X86vpmaddwd, v8i32, v16i16,
                                VR256, loadv4i64, i256mem, SSE_PMADD,
-                               0>, VEX_4V, VEX_L;
+                               0>, VEX_4V, VEX_L, VEX_WIG;
 let Constraints = "$src1 = $dst" in
 defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
                              memopv2i64, i128mem, SSE_PMADD>;
@@ -4049,11 +4004,11 @@ defm PMADDWD : PDI_binop_rm2<0xF5, "pmaddwd", X86vpmaddwd, v4i32, v8i16, VR128,
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
 defm VPSADBW : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v2i64, v16i8, VR128,
                              loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
-                             VEX_4V;
+                             VEX_4V, VEX_WIG;
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
 defm VPSADBWY : PDI_binop_rm2<0xF6, "vpsadbw", X86psadbw, v4i64, v32i8, VR256,
                              loadv4i64, i256mem, SSE_INTMUL_ITINS_P, 0>,
-                             VEX_4V, VEX_L;
+                             VEX_4V, VEX_L, VEX_WIG;
 let Constraints = "$src1 = $dst" in
 defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
                             memopv2i64, i128mem, SSE_INTALU_ITINS_P>;
@@ -4061,11 +4016,11 @@ defm PSADBW : PDI_binop_rm2<0xF6, "psadbw", X86psadbw, v2i64, v16i8, VR128,
 let Predicates = [HasAVX, NoVLX] in
 defm VPMULUDQ : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v2i64, v4i32, VR128,
                               loadv2i64, i128mem, SSE_INTMUL_ITINS_P, 0>,
-                              VEX_4V;
+                              VEX_4V, VEX_WIG;
 let Predicates = [HasAVX2, NoVLX] in
 defm VPMULUDQY : PDI_binop_rm2<0xF4, "vpmuludq", X86pmuludq, v4i64, v8i32,
                                VR256, loadv4i64, i256mem,
-                               SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L;
+                               SSE_INTMUL_ITINS_P, 0>, VEX_4V, VEX_L, VEX_WIG;
 let Constraints = "$src1 = $dst" in
 defm PMULUDQ : PDI_binop_rm2<0xF4, "pmuludq", X86pmuludq, v2i64, v4i32, VR128,
                              memopv2i64, i128mem, SSE_INTMUL_ITINS_P>;
@@ -4112,11 +4067,11 @@ multiclass PDI_binop_rmi_all<bits<8> opc, bits<8> opc2, Format ImmForm,
 let Predicates = [HasAVX, prd] in
   defm V#NAME : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
                               OpNode, OpNode2, VR128, DstVT128, SrcVT,
-                              loadv2i64, 0>, VEX_4V;
+                              loadv2i64, 0>, VEX_4V, VEX_WIG;
 let Predicates = [HasAVX2, prd] in
   defm V#NAME#Y : PDI_binop_rmi<opc, opc2, ImmForm, !strconcat("v", OpcodeStr),
                                 OpNode, OpNode2, VR256, DstVT256, SrcVT,
-                                loadv2i64, 0>, VEX_4V, VEX_L;
+                                loadv2i64, 0>, VEX_4V, VEX_L, VEX_WIG;
 let Constraints = "$src1 = $dst" in
   defm NAME : PDI_binop_rmi<opc, opc2, ImmForm, OpcodeStr, OpNode, OpNode2,
                            VR128, DstVT128, SrcVT, memopv2i64>;
@@ -4137,10 +4092,10 @@ multiclass PDI_binop_ri_all<bits<8> opc, Format ImmForm, string OpcodeStr,
                            SDNode OpNode> {
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in
   defm V#NAME : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
-                             VR128, v16i8, 0>, VEX_4V;
+                             VR128, v16i8, 0>, VEX_4V, VEX_WIG;
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in
   defm V#NAME#Y : PDI_binop_ri<opc, ImmForm, !strconcat("v", OpcodeStr), OpNode,
-                               VR256, v32i8, 0>, VEX_4V, VEX_L;
+                               VR256, v32i8, 0>, VEX_4V, VEX_L, VEX_WIG;
 let Constraints = "$src1 = $dst" in
   defm NAME : PDI_binop_ri<opc, ImmForm, OpcodeStr, OpNode, VR128, v16i8>;
 }
@@ -4201,7 +4156,7 @@ let Predicates = [HasAVX, prd] in {
                                  "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                       [(set VR128:$dst,
                         (vt128 (OpNode VR128:$src1, (i8 imm:$src2))))],
-                      IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>;
+                      IIC_SSE_PSHUF_RI>, VEX, Sched<[WriteShuffle]>, VEX_WIG;
   def V#NAME#mi : Ii8<0x70, MRMSrcMem, (outs VR128:$dst),
                       (ins i128mem:$src1, u8imm:$src2),
                       !strconcat("v", OpcodeStr,
@@ -4209,7 +4164,7 @@ let Predicates = [HasAVX, prd] in {
                      [(set VR128:$dst,
                        (vt128 (OpNode (bitconvert (loadv2i64 addr:$src1)),
                         (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX,
-                  Sched<[WriteShuffleLd]>;
+                  Sched<[WriteShuffleLd]>, VEX_WIG;
 }
 
 let Predicates = [HasAVX2, prd] in {
@@ -4219,7 +4174,7 @@ let Predicates = [HasAVX2, prd] in {
                                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                        [(set VR256:$dst,
                          (vt256 (OpNode VR256:$src1, (i8 imm:$src2))))],
-                       IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>;
+                       IIC_SSE_PSHUF_RI>, VEX, VEX_L, Sched<[WriteShuffle]>, VEX_WIG;
   def V#NAME#Ymi : Ii8<0x70, MRMSrcMem, (outs VR256:$dst),
                        (ins i256mem:$src1, u8imm:$src2),
                        !strconcat("v", OpcodeStr,
@@ -4227,7 +4182,7 @@ let Predicates = [HasAVX2, prd] in {
                       [(set VR256:$dst,
                         (vt256 (OpNode (bitconvert (loadv4i64 addr:$src1)),
                          (i8 imm:$src2))))], IIC_SSE_PSHUF_MI>, VEX, VEX_L,
-                   Sched<[WriteShuffleLd]>;
+                   Sched<[WriteShuffleLd]>, VEX_WIG;
 }
 
 let Predicates = [UseSSE2] in {
@@ -4256,20 +4211,6 @@ defm PSHUFHW : sse2_pshuffle<"pshufhw", v8i16, v16i16, X86PShufhw,
 defm PSHUFLW : sse2_pshuffle<"pshuflw", v8i16, v16i16, X86PShuflw,
                              NoVLX_Or_NoBWI>, XD;
 
-let Predicates = [HasAVX] in {
-  def : Pat<(v4f32 (X86PShufd (loadv4f32 addr:$src1), (i8 imm:$imm))),
-            (VPSHUFDmi addr:$src1, imm:$imm)>;
-  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
-            (VPSHUFDri VR128:$src1, imm:$imm)>;
-}
-
-let Predicates = [UseSSE2] in {
-  def : Pat<(v4f32 (X86PShufd (memopv4f32 addr:$src1), (i8 imm:$imm))),
-            (PSHUFDmi addr:$src1, imm:$imm)>;
-  def : Pat<(v4f32 (X86PShufd VR128:$src1, (i8 imm:$imm))),
-            (PSHUFDri VR128:$src1, imm:$imm)>;
-}
-
 //===---------------------------------------------------------------------===//
 // Packed Integer Pack Instructions (SSE & AVX)
 //===---------------------------------------------------------------------===//
@@ -4363,24 +4304,24 @@ multiclass sse4_pack_y<bits<8> opc, string OpcodeStr, ValueType OutVT,
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
   defm VPACKSSWB : sse2_pack<0x63, "vpacksswb", v16i8, v8i16, X86Packss,
-                             loadv2i64, 0>, VEX_4V;
+                             loadv2i64, 0>, VEX_4V, VEX_WIG;
   defm VPACKSSDW : sse2_pack<0x6B, "vpackssdw", v8i16, v4i32, X86Packss,
-                             loadv2i64, 0>, VEX_4V;
+                             loadv2i64, 0>, VEX_4V, VEX_WIG;
 
   defm VPACKUSWB : sse2_pack<0x67, "vpackuswb", v16i8, v8i16, X86Packus,
-                             loadv2i64, 0>, VEX_4V;
+                             loadv2i64, 0>, VEX_4V, VEX_WIG;
   defm VPACKUSDW : sse4_pack<0x2B, "vpackusdw", v8i16, v4i32, X86Packus,
                              loadv2i64, 0>, VEX_4V;
 }
 
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   defm VPACKSSWB : sse2_pack_y<0x63, "vpacksswb", v32i8, v16i16, X86Packss>,
-                               VEX_4V, VEX_L;
+                               VEX_4V, VEX_L, VEX_WIG;
   defm VPACKSSDW : sse2_pack_y<0x6B, "vpackssdw", v16i16, v8i32, X86Packss>,
-                               VEX_4V, VEX_L;
+                               VEX_4V, VEX_L, VEX_WIG;
 
   defm VPACKUSWB : sse2_pack_y<0x67, "vpackuswb", v32i8, v16i16, X86Packus>,
-                               VEX_4V, VEX_L;
+                               VEX_4V, VEX_L, VEX_WIG;
   defm VPACKUSDW : sse4_pack_y<0x2B, "vpackusdw", v16i16, v8i32, X86Packus>,
                                VEX_4V, VEX_L;
 }
@@ -4442,44 +4383,44 @@ multiclass sse2_unpack_y<bits<8> opc, string OpcodeStr, ValueType vt,
 
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
   defm VPUNPCKLBW  : sse2_unpack<0x60, "vpunpcklbw", v16i8, X86Unpckl,
-                                 loadv2i64, 0>, VEX_4V;
+                                 loadv2i64, 0>, VEX_4V, VEX_WIG;
   defm VPUNPCKLWD  : sse2_unpack<0x61, "vpunpcklwd", v8i16, X86Unpckl,
-                                 loadv2i64, 0>, VEX_4V;
+                                 loadv2i64, 0>, VEX_4V, VEX_WIG;
   defm VPUNPCKHBW  : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Unpckh,
-                                 loadv2i64, 0>, VEX_4V;
+                                 loadv2i64, 0>, VEX_4V, VEX_WIG;
   defm VPUNPCKHWD  : sse2_unpack<0x69, "vpunpckhwd", v8i16, X86Unpckh,
-                                 loadv2i64, 0>, VEX_4V;
+                                 loadv2i64, 0>, VEX_4V, VEX_WIG;
 }
 let Predicates = [HasAVX, NoVLX] in {
   defm VPUNPCKLDQ  : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Unpckl,
-                                 loadv2i64, 0>, VEX_4V;
+                                 loadv2i64, 0>, VEX_4V, VEX_WIG;
   defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Unpckl,
-                                 loadv2i64, 0>, VEX_4V;
+                                 loadv2i64, 0>, VEX_4V, VEX_WIG;
   defm VPUNPCKHDQ  : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Unpckh,
-                                 loadv2i64, 0>, VEX_4V;
+                                 loadv2i64, 0>, VEX_4V, VEX_WIG;
   defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Unpckh,
-                                 loadv2i64, 0>, VEX_4V;
+                                 loadv2i64, 0>, VEX_4V, VEX_WIG;
 }
 
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   defm VPUNPCKLBW  : sse2_unpack_y<0x60, "vpunpcklbw", v32i8, X86Unpckl>,
-                                   VEX_4V, VEX_L;
+                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKLWD  : sse2_unpack_y<0x61, "vpunpcklwd", v16i16, X86Unpckl>,
-                                   VEX_4V, VEX_L;
+                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHBW  : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Unpckh>,
-                                   VEX_4V, VEX_L;
+                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHWD  : sse2_unpack_y<0x69, "vpunpckhwd", v16i16, X86Unpckh>,
-                                   VEX_4V, VEX_L;
+                                   VEX_4V, VEX_L, VEX_WIG;
 }
 let Predicates = [HasAVX2, NoVLX] in {
   defm VPUNPCKLDQ  : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Unpckl>,
-                                   VEX_4V, VEX_L;
+                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Unpckl>,
-                                   VEX_4V, VEX_L;
+                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHDQ  : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Unpckh>,
-                                   VEX_4V, VEX_L;
+                                   VEX_4V, VEX_L, VEX_WIG;
   defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Unpckh>,
-                                   VEX_4V, VEX_L;
+                                   VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -4564,14 +4505,14 @@ def VPMOVMSKBrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
            (ins VR128:$src),
            "pmovmskb\t{$src, $dst|$dst, $src}",
            [(set GR32orGR64:$dst, (X86movmsk (v16i8 VR128:$src)))],
-           IIC_SSE_MOVMSK>, VEX;
+           IIC_SSE_MOVMSK>, VEX, VEX_WIG;
 
 let Predicates = [HasAVX2] in {
 def VPMOVMSKBYrr  : VPDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
            (ins VR256:$src),
            "pmovmskb\t{$src, $dst|$dst, $src}",
            [(set GR32orGR64:$dst, (X86movmsk (v32i8 VR256:$src)))]>,
-           VEX, VEX_L;
+           VEX, VEX_L, VEX_WIG;
 }
 
 def PMOVMSKBrr : PDI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst), (ins VR128:$src),
@@ -4592,13 +4533,13 @@ def VMASKMOVDQU : VPDI<0xF7, MRMSrcReg, (outs),
            (ins VR128:$src, VR128:$mask),
            "maskmovdqu\t{$mask, $src|$src, $mask}",
            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, EDI)],
-           IIC_SSE_MASKMOV>, VEX;
+           IIC_SSE_MASKMOV>, VEX, VEX_WIG;
 let Uses = [RDI], Predicates = [HasAVX,In64BitMode] in
 def VMASKMOVDQU64 : VPDI<0xF7, MRMSrcReg, (outs),
            (ins VR128:$src, VR128:$mask),
            "maskmovdqu\t{$mask, $src|$src, $mask}",
            [(int_x86_sse2_maskmov_dqu VR128:$src, VR128:$mask, RDI)],
-           IIC_SSE_MASKMOV>, VEX;
+           IIC_SSE_MASKMOV>, VEX, VEX_WIG;
 
 let Uses = [EDI], Predicates = [UseSSE2,Not64BitMode] in
 def MASKMOVDQU : PDI<0xF7, MRMSrcReg, (outs), (ins VR128:$src, VR128:$mask),
@@ -4724,19 +4665,6 @@ def MOVPDI2DImr  : S2I<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR128:$src),
                                      (iPTR 0))), addr:$dst)],
                                      IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
 } // ExeDomain = SSEPackedInt
-
-def : Pat<(v8i32 (X86Vinsert (v8i32 immAllZerosV), GR32:$src2, (iPTR 0))),
-        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v4i64 (X86Vinsert (bc_v4i64 (v8i32 immAllZerosV)), GR64:$src2, (iPTR 0))),
-        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
-
-def : Pat<(v8i32 (X86Vinsert undef, GR32:$src2, (iPTR 0))),
-        (SUBREG_TO_REG (i32 0), (VMOVDI2PDIrr GR32:$src2), sub_xmm)>;
-
-def : Pat<(v4i64 (X86Vinsert undef, GR64:$src2, (iPTR 0))),
-        (SUBREG_TO_REG (i32 0), (VMOV64toPQIrr GR64:$src2), sub_xmm)>;
-
 //===---------------------------------------------------------------------===//
 // Move Packed Doubleword Int first element to Doubleword Int
 //
@@ -4757,12 +4685,12 @@ def MOVPQIto64rr : RS2I<0x7E, MRMDestReg, (outs GR64:$dst), (ins VR128:$src),
 } //SchedRW
 
 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
-def VMOVPQIto64rm : VRS2I<0x7E, MRMDestMem, (outs),
+def VMOVPQIto64mr : VRS2I<0x7E, MRMDestMem, (outs),
                           (ins i64mem:$dst, VR128:$src),
                           "movq\t{$src, $dst|$dst, $src}",
                           [], IIC_SSE_MOVDQ>, VEX, Sched<[WriteStore]>;
 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0, mayStore = 1 in
-def MOVPQIto64rm : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
+def MOVPQIto64mr : RS2I<0x7E, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                         "mov{d|q}\t{$src, $dst|$dst, $src}",
                         [], IIC_SSE_MOVDQ>, Sched<[WriteStore]>;
 } // ExeDomain = SSEPackedInt
@@ -4836,6 +4764,8 @@ let Predicates = [UseAVX] in {
   // AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
   // These instructions also write zeros in the high part of a 256-bit register.
   let AddedComplexity = 20 in {
+    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
+              (VMOVDI2PDIrm addr:$src)>;
     def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
               (VMOVDI2PDIrm addr:$src)>;
     def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
@@ -4865,6 +4795,8 @@ let Predicates = [UseSSE2] in {
               (MOV64toPQIrr GR64:$src)>;
   }
   let AddedComplexity = 20 in {
+    def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
+              (MOVDI2PDIrm addr:$src)>;
     def : Pat<(v4i32 (X86vzmovl (v4i32 (scalar_to_vector (loadi32 addr:$src))))),
               (MOVDI2PDIrm addr:$src)>;
     def : Pat<(v4i32 (X86vzmovl (bc_v4i32 (loadv4f32 addr:$src)))),
@@ -4902,7 +4834,7 @@ def VMOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
                       (v2i64 (scalar_to_vector (loadi64 addr:$src))))]>, XS,
-                    VEX, Requires<[UseAVX]>;
+                    VEX, Requires<[UseAVX]>, VEX_WIG;
 def MOVQI2PQIrm : I<0x7E, MRMSrcMem, (outs VR128:$dst), (ins i64mem:$src),
                     "movq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst,
@@ -4919,7 +4851,7 @@ def VMOVPQI2QImr : VS2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(store (i64 (extractelt (v2i64 VR128:$src),
                                     (iPTR 0))), addr:$dst)],
-                                    IIC_SSE_MOVDQ>, VEX;
+                                    IIC_SSE_MOVDQ>, VEX, VEX_WIG;
 def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
                       "movq\t{$src, $dst|$dst, $src}",
                       [(store (i64 (extractelt (v2i64 VR128:$src),
@@ -4931,7 +4863,7 @@ def MOVPQI2QImr : S2I<0xD6, MRMDestMem, (outs), (ins i64mem:$dst, VR128:$src),
 let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0,
     SchedRW = [WriteVecLogic] in {
 def VMOVPQI2QIrr : VS2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
-                     "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX;
+                     "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>, VEX, VEX_WIG;
 def MOVPQI2QIrr : S2I<0xD6, MRMDestReg, (outs VR128:$dst), (ins VR128:$src),
                       "movq\t{$src, $dst|$dst, $src}", [], IIC_SSE_MOVQ_RR>;
 }
@@ -4977,7 +4909,7 @@ def VMOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "vmovq\t{$src, $dst|$dst, $src}",
                     [(set VR128:$dst, (v2i64 (X86vzmovl (v2i64 VR128:$src))))],
                     IIC_SSE_MOVQ_RR>,
-                      XS, VEX, Requires<[UseAVX]>;
+                      XS, VEX, Requires<[UseAVX]>, VEX_WIG;
 let AddedComplexity = 15 in
 def MOVZPQILo2PQIrr : I<0x7E, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
                         "movq\t{$src, $dst|$dst, $src}",
@@ -5015,13 +4947,13 @@ def rm : S3SI<op, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
 
 let Predicates = [HasAVX, NoVLX] in {
   defm VMOVSHDUP  : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
-                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
+                                       v4f32, VR128, loadv4f32, f128mem>, VEX, VEX_WIG;
   defm VMOVSLDUP  : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
-                                       v4f32, VR128, loadv4f32, f128mem>, VEX;
+                                       v4f32, VR128, loadv4f32, f128mem>, VEX, VEX_WIG;
   defm VMOVSHDUPY : sse3_replicate_sfp<0x16, X86Movshdup, "vmovshdup",
-                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
+                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L, VEX_WIG;
   defm VMOVSLDUPY : sse3_replicate_sfp<0x12, X86Movsldup, "vmovsldup",
-                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L;
+                                 v8f32, VR256, loadv8f32, f256mem>, VEX, VEX_L, VEX_WIG;
 }
 defm MOVSHDUP : sse3_replicate_sfp<0x16, X86Movshdup, "movshdup", v4f32, VR128,
                                    memopv4f32, f128mem>;
@@ -5089,8 +5021,8 @@ def rm  : S3DI<0x12, MRMSrcMem, (outs VR256:$dst), (ins f256mem:$src),
 }
 
 let Predicates = [HasAVX, NoVLX] in {
-  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX;
-  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L;
+  defm VMOVDDUP  : sse3_replicate_dfp<"vmovddup">, VEX, VEX_WIG;
+  defm VMOVDDUPY : sse3_replicate_dfp_y<"vmovddup">, VEX, VEX_L, VEX_WIG;
 }
 
 defm MOVDDUP : sse3_replicate_dfp<"movddup">;
@@ -5127,11 +5059,11 @@ let SchedRW = [WriteLoad] in {
 let Predicates = [HasAVX] in {
   def VLDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "vlddqu\t{$src, $dst|$dst, $src}",
-                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX;
+                   [(set VR128:$dst, (int_x86_sse3_ldu_dq addr:$src))]>, VEX, VEX_WIG;
   def VLDDQUYrm : S3DI<0xF0, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                    "vlddqu\t{$src, $dst|$dst, $src}",
                    [(set VR256:$dst, (int_x86_avx_ldu_dq_256 addr:$src))]>,
-                   VEX, VEX_L;
+                   VEX, VEX_L, VEX_WIG;
 }
 def LDDQUrm : S3DI<0xF0, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                    "lddqu\t{$src, $dst|$dst, $src}",
@@ -5165,15 +5097,15 @@ multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC,
 let Predicates = [HasAVX] in {
   let ExeDomain = SSEPackedSingle in {
     defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128,
-                               f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V;
+                               f128mem, SSE_ALU_F32P, loadv4f32, 0>, XD, VEX_4V, VEX_WIG;
     defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256,
-                        f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L;
+                        f256mem, SSE_ALU_F32P, loadv8f32, 0>, XD, VEX_4V, VEX_L, VEX_WIG;
   }
   let ExeDomain = SSEPackedDouble in {
     defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128,
-                               f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V;
+                               f128mem, SSE_ALU_F64P, loadv2f64, 0>, PD, VEX_4V, VEX_WIG;
     defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256,
-                        f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L;
+                        f256mem, SSE_ALU_F64P, loadv4f64, 0>, PD, VEX_4V, VEX_L, VEX_WIG;
   }
 }
 let Constraints = "$src1 = $dst", Predicates = [UseSSE3] in {
@@ -5260,23 +5192,23 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC,
 let Predicates = [HasAVX] in {
   let ExeDomain = SSEPackedSingle in {
     defm VHADDPS  : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem,
-                            X86fhadd, loadv4f32, 0>, VEX_4V;
+                            X86fhadd, loadv4f32, 0>, VEX_4V, VEX_WIG;
     defm VHSUBPS  : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem,
-                            X86fhsub, loadv4f32, 0>, VEX_4V;
+                            X86fhsub, loadv4f32, 0>, VEX_4V, VEX_WIG;
     defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem,
-                            X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L;
+                            X86fhadd, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
     defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem,
-                            X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L;
+                            X86fhsub, loadv8f32, 0>, VEX_4V, VEX_L, VEX_WIG;
   }
   let ExeDomain = SSEPackedDouble in {
     defm VHADDPD  : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem,
-                            X86fhadd, loadv2f64, 0>, VEX_4V;
+                            X86fhadd, loadv2f64, 0>, VEX_4V, VEX_WIG;
     defm VHSUBPD  : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem,
-                            X86fhsub, loadv2f64, 0>, VEX_4V;
+                            X86fhsub, loadv2f64, 0>, VEX_4V, VEX_WIG;
     defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem,
-                            X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L;
+                            X86fhadd, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
     defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem,
-                            X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L;
+                            X86fhsub, loadv4f64, 0>, VEX_4V, VEX_L, VEX_WIG;
   }
 }
 
@@ -5334,84 +5266,24 @@ multiclass SS3I_unop_rm_y<bits<8> opc, string OpcodeStr, ValueType vt,
                   Sched<[WriteVecALULd]>;
 }
 
-// Helper fragments to match sext vXi1 to vXiY.
-def v16i1sextv16i8 : PatLeaf<(v16i8 (X86pcmpgt (bc_v16i8 (v4i32 immAllZerosV)),
-                                               VR128:$src))>;
-def v8i1sextv8i16  : PatLeaf<(v8i16 (X86vsrai VR128:$src, (i8 15)))>;
-def v4i1sextv4i32  : PatLeaf<(v4i32 (X86vsrai VR128:$src, (i8 31)))>;
-def v32i1sextv32i8 : PatLeaf<(v32i8 (X86pcmpgt (bc_v32i8 (v8i32 immAllZerosV)),
-                                               VR256:$src))>;
-def v16i1sextv16i16: PatLeaf<(v16i16 (X86vsrai VR256:$src, (i8 15)))>;
-def v8i1sextv8i32  : PatLeaf<(v8i32 (X86vsrai VR256:$src, (i8 31)))>;
-
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
-  defm VPABSB  : SS3I_unop_rm<0x1C, "vpabsb", v16i8, X86Abs, loadv2i64>, VEX;
-  defm VPABSW  : SS3I_unop_rm<0x1D, "vpabsw", v8i16, X86Abs, loadv2i64>, VEX;
+  defm VPABSB  : SS3I_unop_rm<0x1C, "vpabsb", v16i8, abs, loadv2i64>, VEX, VEX_WIG;
+  defm VPABSW  : SS3I_unop_rm<0x1D, "vpabsw", v8i16, abs, loadv2i64>, VEX, VEX_WIG;
 }
 let Predicates = [HasAVX, NoVLX] in {
-  defm VPABSD  : SS3I_unop_rm<0x1E, "vpabsd", v4i32, X86Abs, loadv2i64>, VEX;
-}
-
-let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
-  def : Pat<(xor
-            (bc_v2i64 (v16i1sextv16i8)),
-            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
-            (VPABSBrr VR128:$src)>;
-  def : Pat<(xor
-            (bc_v2i64 (v8i1sextv8i16)),
-            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
-            (VPABSWrr VR128:$src)>;
-}
-let Predicates = [HasAVX, NoVLX] in {
-  def : Pat<(xor
-            (bc_v2i64 (v4i1sextv4i32)),
-            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
-            (VPABSDrr VR128:$src)>;
-}
-
-let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
-  defm VPABSB  : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, X86Abs>, VEX, VEX_L;
-  defm VPABSW  : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, X86Abs>, VEX, VEX_L;
+  defm VPABSD  : SS3I_unop_rm<0x1E, "vpabsd", v4i32, abs, loadv2i64>, VEX, VEX_WIG;
 }
-let Predicates = [HasAVX2, NoVLX] in {
-  defm VPABSD  : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, X86Abs>, VEX, VEX_L;
-}
-
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
-  def : Pat<(xor
-            (bc_v4i64 (v32i1sextv32i8)),
-            (bc_v4i64 (add (v32i8 VR256:$src), (v32i1sextv32i8)))),
-            (VPABSBYrr VR256:$src)>;
-  def : Pat<(xor
-            (bc_v4i64 (v16i1sextv16i16)),
-            (bc_v4i64 (add (v16i16 VR256:$src), (v16i1sextv16i16)))),
-            (VPABSWYrr VR256:$src)>;
+  defm VPABSB  : SS3I_unop_rm_y<0x1C, "vpabsb", v32i8, abs>, VEX, VEX_L, VEX_WIG;
+  defm VPABSW  : SS3I_unop_rm_y<0x1D, "vpabsw", v16i16, abs>, VEX, VEX_L, VEX_WIG;
 }
 let Predicates = [HasAVX2, NoVLX] in {
-  def : Pat<(xor
-            (bc_v4i64 (v8i1sextv8i32)),
-            (bc_v4i64 (add (v8i32 VR256:$src), (v8i1sextv8i32)))),
-            (VPABSDYrr VR256:$src)>;
+  defm VPABSD  : SS3I_unop_rm_y<0x1E, "vpabsd", v8i32, abs>, VEX, VEX_L, VEX_WIG;
 }
 
-defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, X86Abs, memopv2i64>;
-defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, X86Abs, memopv2i64>;
-defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, X86Abs, memopv2i64>;
-
-let Predicates = [UseSSSE3] in {
-  def : Pat<(xor
-            (bc_v2i64 (v16i1sextv16i8)),
-            (bc_v2i64 (add (v16i8 VR128:$src), (v16i1sextv16i8)))),
-            (PABSBrr VR128:$src)>;
-  def : Pat<(xor
-            (bc_v2i64 (v8i1sextv8i16)),
-            (bc_v2i64 (add (v8i16 VR128:$src), (v8i1sextv8i16)))),
-            (PABSWrr VR128:$src)>;
-  def : Pat<(xor
-            (bc_v2i64 (v4i1sextv4i32)),
-            (bc_v2i64 (add (v4i32 VR128:$src), (v4i1sextv4i32)))),
-            (PABSDrr VR128:$src)>;
-}
+defm PABSB : SS3I_unop_rm<0x1C, "pabsb", v16i8, abs, memopv2i64>;
+defm PABSW : SS3I_unop_rm<0x1D, "pabsw", v8i16, abs, memopv2i64>;
+defm PABSD : SS3I_unop_rm<0x1E, "pabsd", v4i32, abs, memopv2i64>;
 
 //===---------------------------------------------------------------------===//
 // SSSE3 - Packed Binary Operator Instructions
@@ -5509,45 +5381,45 @@ let ImmT = NoImm, Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
 let isCommutable = 0 in {
   defm VPSHUFB    : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v16i8, v16i8,
                                   VR128, loadv2i64, i128mem,
-                                  SSE_PSHUFB, 0>, VEX_4V;
+                                  SSE_PSHUFB, 0>, VEX_4V, VEX_WIG;
   defm VPMADDUBSW : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v8i16,
                                   v16i8, VR128, loadv2i64, i128mem,
-                                  SSE_PMADD, 0>, VEX_4V;
+                                  SSE_PMADD, 0>, VEX_4V, VEX_WIG;
 }
 defm VPMULHRSW    : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v8i16, v8i16,
                                   VR128, loadv2i64, i128mem,
-                                  SSE_PMULHRSW, 0>, VEX_4V;
+                                  SSE_PMULHRSW, 0>, VEX_4V, VEX_WIG;
 }
 
 let ImmT = NoImm, Predicates = [HasAVX] in {
 let isCommutable = 0 in {
   defm VPHADDW    : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v8i16, v8i16, VR128,
                                   loadv2i64, i128mem,
-                                  SSE_PHADDSUBW, 0>, VEX_4V;
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_WIG;
   defm VPHADDD    : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v4i32, v4i32, VR128,
                                   loadv2i64, i128mem,
-                                  SSE_PHADDSUBD, 0>, VEX_4V;
+                                  SSE_PHADDSUBD, 0>, VEX_4V, VEX_WIG;
   defm VPHSUBW    : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v8i16, v8i16, VR128,
                                   loadv2i64, i128mem,
-                                  SSE_PHADDSUBW, 0>, VEX_4V;
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_WIG;
   defm VPHSUBD    : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v4i32, v4i32, VR128,
                                   loadv2i64, i128mem,
                                   SSE_PHADDSUBD, 0>, VEX_4V;
   defm VPSIGNB    : SS3I_binop_rm_int<0x08, "vpsignb",
                                       int_x86_ssse3_psign_b_128,
-                                      SSE_PSIGN, loadv2i64, 0>, VEX_4V;
+                                      SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG;
   defm VPSIGNW    : SS3I_binop_rm_int<0x09, "vpsignw",
                                       int_x86_ssse3_psign_w_128,
-                                      SSE_PSIGN, loadv2i64, 0>, VEX_4V;
+                                      SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG;
   defm VPSIGND    : SS3I_binop_rm_int<0x0A, "vpsignd",
                                       int_x86_ssse3_psign_d_128,
-                                      SSE_PSIGN, loadv2i64, 0>, VEX_4V;
+                                      SSE_PSIGN, loadv2i64, 0>, VEX_4V, VEX_WIG;
   defm VPHADDSW   : SS3I_binop_rm_int<0x03, "vphaddsw",
                                       int_x86_ssse3_phadd_sw_128,
-                                      SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
+                                      SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V, VEX_WIG;
   defm VPHSUBSW   : SS3I_binop_rm_int<0x07, "vphsubsw",
                                       int_x86_ssse3_phsub_sw_128,
-                                      SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V;
+                                      SSE_PHADDSUBSW, loadv2i64, 0>, VEX_4V, VEX_WIG;
 }
 }
 
@@ -5555,42 +5427,42 @@ let ImmT = NoImm, Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
 let isCommutable = 0 in {
   defm VPSHUFBY   : SS3I_binop_rm<0x00, "vpshufb", X86pshufb, v32i8, v32i8,
                                   VR256, loadv4i64, i256mem,
-                                  SSE_PSHUFB, 0>, VEX_4V, VEX_L;
+                                  SSE_PSHUFB, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPMADDUBSWY : SS3I_binop_rm<0x04, "vpmaddubsw", X86vpmaddubsw, v16i16,
                                    v32i8, VR256, loadv4i64, i256mem,
-                                   SSE_PMADD, 0>, VEX_4V, VEX_L;
+                                   SSE_PMADD, 0>, VEX_4V, VEX_L, VEX_WIG;
 }
 defm VPMULHRSWY   : SS3I_binop_rm<0x0B, "vpmulhrsw", X86mulhrs, v16i16, v16i16,
                                   VR256, loadv4i64, i256mem,
-                                  SSE_PMULHRSW, 0>, VEX_4V, VEX_L;
+                                  SSE_PMULHRSW, 0>, VEX_4V, VEX_L, VEX_WIG;
 }
 
 let ImmT = NoImm, Predicates = [HasAVX2] in {
 let isCommutable = 0 in {
   defm VPHADDWY   : SS3I_binop_rm<0x01, "vphaddw", X86hadd, v16i16, v16i16,
                                   VR256, loadv4i64, i256mem,
-                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPHADDDY   : SS3I_binop_rm<0x02, "vphaddd", X86hadd, v8i32, v8i32, VR256,
                                   loadv4i64, i256mem,
-                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPHSUBWY   : SS3I_binop_rm<0x05, "vphsubw", X86hsub, v16i16, v16i16,
                                   VR256, loadv4i64, i256mem,
-                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
+                                  SSE_PHADDSUBW, 0>, VEX_4V, VEX_L, VEX_WIG;
   defm VPHSUBDY   : SS3I_binop_rm<0x06, "vphsubd", X86hsub, v8i32, v8i32, VR256,
                                   loadv4i64, i256mem,
                                   SSE_PHADDSUBW, 0>, VEX_4V, VEX_L;
   defm VPSIGNBY   : SS3I_binop_rm_int_y<0x08, "vpsignb", int_x86_avx2_psign_b,
-                                        WriteVecALU>, VEX_4V, VEX_L;
+                                        WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
   defm VPSIGNWY   : SS3I_binop_rm_int_y<0x09, "vpsignw", int_x86_avx2_psign_w,
-                                        WriteVecALU>, VEX_4V, VEX_L;
+                                        WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
   defm VPSIGNDY   : SS3I_binop_rm_int_y<0x0A, "vpsignd", int_x86_avx2_psign_d,
-                                        WriteVecALU>, VEX_4V, VEX_L;
+                                        WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
   defm VPHADDSW   : SS3I_binop_rm_int_y<0x03, "vphaddsw",
                                         int_x86_avx2_phadd_sw,
-                                        WriteVecALU>, VEX_4V, VEX_L;
+                                        WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
   defm VPHSUBSW   : SS3I_binop_rm_int_y<0x07, "vphsubsw",
                                         int_x86_avx2_phsub_sw,
-                                        WriteVecALU>, VEX_4V, VEX_L;
+                                        WriteVecALU>, VEX_4V, VEX_L, VEX_WIG;
 }
 }
 
@@ -5668,9 +5540,9 @@ multiclass ssse3_palignr_y<string asm, bit Is2Addr = 1> {
 }
 
 let Predicates = [HasAVX] in
-  defm VPALIGNR : ssse3_palignr<"vpalignr", 0>, VEX_4V;
+  defm VPALIGNR : ssse3_palignr<"vpalignr", 0>, VEX_4V, VEX_WIG;
 let Predicates = [HasAVX2] in
-  defm VPALIGNR : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L;
+  defm VPALIGNR : ssse3_palignr_y<"vpalignr", 0>, VEX_4V, VEX_L, VEX_WIG;
 let Constraints = "$src1 = $dst", Predicates = [UseSSSE3] in
   defm PALIGNR : ssse3_palignr<"palignr">;
 
@@ -5761,10 +5633,10 @@ multiclass SS41I_pmovx_rm_all<bits<8> opc, string OpcodeStr,
   defm NAME : SS41I_pmovx_rrrm<opc, OpcodeStr, MemOp, VR128, VR128, SSEItins>;
   let Predicates = [HasAVX, prd] in
     defm V#NAME   : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemOp,
-                                     VR128, VR128, AVXItins>, VEX;
+                                     VR128, VR128, AVXItins>, VEX, VEX_WIG;
   let Predicates = [HasAVX2, prd] in
     defm V#NAME#Y : SS41I_pmovx_rrrm<opc, !strconcat("v", OpcodeStr), MemYOp,
-                                     VR256, VR128, AVX2Itins>, VEX, VEX_L;
+                                     VR256, VR128, AVX2Itins>, VEX, VEX_L, VEX_WIG;
 }
 
 multiclass SS41I_pmovx_rm<bits<8> opc, string OpcodeStr, X86MemOperand MemOp,
@@ -5992,12 +5864,12 @@ multiclass SS41I_pmovx_patterns<string OpcPrefix, string ExtTy,
   }
 }
 
-defm : SS41I_pmovx_patterns<"VPMOVSX", "s", X86vsext, extloadi32i16>;
-defm : SS41I_pmovx_patterns<"VPMOVZX", "z", X86vzext, loadi16_anyext>;
+defm : SS41I_pmovx_patterns<"VPMOVSX", "s", sext_invec, extloadi32i16>;
+defm : SS41I_pmovx_patterns<"VPMOVZX", "z", zext_invec, loadi16_anyext>;
 
 let Predicates = [UseSSE41] in {
-  defm : SS41I_pmovx_patterns<"PMOVSX", "s", X86vsext, extloadi32i16>;
-  defm : SS41I_pmovx_patterns<"PMOVZX", "z", X86vzext, loadi16_anyext>;
+  defm : SS41I_pmovx_patterns<"PMOVSX", "s", sext_invec, extloadi32i16>;
+  defm : SS41I_pmovx_patterns<"PMOVZX", "z", zext_invec, loadi16_anyext>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -6085,20 +5957,20 @@ multiclass SS41I_extract64<bits<8> opc, string OpcodeStr> {
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(set GR64:$dst,
                   (extractelt (v2i64 VR128:$src1), imm:$src2))]>,
-                  Sched<[WriteShuffle]>, REX_W;
+                  Sched<[WriteShuffle]>;
   let SchedRW = [WriteShuffleLd, WriteRMW] in
   def mr : SS4AIi8<opc, MRMDestMem, (outs),
                  (ins i64mem:$dst, VR128:$src1, u8imm:$src2),
                  !strconcat(OpcodeStr,
                   "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                  [(store (extractelt (v2i64 VR128:$src1), imm:$src2),
-                          addr:$dst)]>, REX_W;
+                          addr:$dst)]>;
 }
 
 let Predicates = [HasAVX, NoDQI] in
   defm VPEXTRQ : SS41I_extract64<0x16, "vpextrq">, VEX, VEX_W;
 
-defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">;
+defm PEXTRQ      : SS41I_extract64<0x16, "pextrq">, REX_W;
 
 /// SS41I_extractf32 - SSE 4.1 extract 32 bits fp value to int reg or memory
 /// destination
@@ -6122,7 +5994,7 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr,
 
 let ExeDomain = SSEPackedSingle in {
   let Predicates = [UseAVX] in
-    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX;
+    defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX, VEX_WIG;
   defm EXTRACTPS   : SS41I_extractf32<0x17, "extractps", SSE_EXTRACT_ITINS>;
 }
 
@@ -6250,7 +6122,7 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1,
 
 let ExeDomain = SSEPackedSingle in {
   let Predicates = [UseAVX] in
-    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V;
+    defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V, VEX_WIG;
   let Constraints = "$src1 = $dst" in
     defm INSERTPS : SS41I_insertf32<0x21, "insertps", 1, SSE_INSERT_ITINS>;
 }
@@ -6443,14 +6315,14 @@ let Predicates = [HasAVX] in {
   defm VROUND  : sse41_fp_unop_p<0x08, 0x09, "vround", f128mem, VR128,
                                  loadv4f32, loadv2f64,
                                  int_x86_sse41_round_ps,
-                                 int_x86_sse41_round_pd>, VEX;
+                                 int_x86_sse41_round_pd>, VEX, VEX_WIG;
   defm VROUNDY : sse41_fp_unop_p<0x08, 0x09, "vround", f256mem, VR256,
                                  loadv8f32, loadv4f64,
                                  int_x86_avx_round_ps_256,
-                                 int_x86_avx_round_pd_256>, VEX, VEX_L;
+                                 int_x86_avx_round_pd_256>, VEX, VEX_L, VEX_WIG;
   defm VROUND  : sse41_fp_binop_s<0x0A, 0x0B, "vround",
                                  int_x86_sse41_round_ss,
-                                 int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG;
+                                 int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG, VEX_WIG;
   defm VROUND  : avx_fp_unop_rm<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG;
 }
 
@@ -6588,20 +6460,20 @@ let Defs = [EFLAGS], Predicates = [HasAVX] in {
 def VPTESTrr  : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2),
                 "vptest\t{$src2, $src1|$src1, $src2}",
                 [(set EFLAGS, (X86ptest VR128:$src1, (v2i64 VR128:$src2)))]>,
-                Sched<[WriteVecLogic]>, VEX;
+                Sched<[WriteVecLogic]>, VEX, VEX_WIG;
 def VPTESTrm  : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2),
                 "vptest\t{$src2, $src1|$src1, $src2}",
                 [(set EFLAGS,(X86ptest VR128:$src1, (loadv2i64 addr:$src2)))]>,
-                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX;
+                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_WIG;
 
 def VPTESTYrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR256:$src1, VR256:$src2),
                 "vptest\t{$src2, $src1|$src1, $src2}",
                 [(set EFLAGS, (X86ptest VR256:$src1, (v4i64 VR256:$src2)))]>,
-                Sched<[WriteVecLogic]>, VEX, VEX_L;
+                Sched<[WriteVecLogic]>, VEX, VEX_L, VEX_WIG;
 def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2),
                 "vptest\t{$src2, $src1|$src1, $src2}",
                 [(set EFLAGS,(X86ptest VR256:$src1, (loadv4i64 addr:$src2)))]>,
-                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L;
+                Sched<[WriteVecLogicLd, ReadAfterLd]>, VEX, VEX_L, VEX_WIG;
 }
 
 let Defs = [EFLAGS] in {
@@ -6704,7 +6576,7 @@ multiclass SS41I_unop_rm_int_v16<bits<8> opc, string OpcodeStr,
 let Predicates = [HasAVX] in
 defm VPHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "vphminposuw",
                                          int_x86_sse41_phminposuw, loadv2i64,
-                                         WriteVecIMul>, VEX;
+                                         WriteVecIMul>, VEX, VEX_WIG;
 defm PHMINPOSUW : SS41I_unop_rm_int_v16 <0x41, "phminposuw",
                                          int_x86_sse41_phminposuw, memopv2i64,
                                          WriteVecIMul>;
@@ -6760,65 +6632,65 @@ multiclass SS48I_binop_rm2<bits<8> opc, string OpcodeStr, SDNode OpNode,
 let Predicates = [HasAVX, NoVLX] in {
   defm VPMINSD   : SS48I_binop_rm<0x39, "vpminsd", smin, v4i32, VR128,
                                   loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
-                                  VEX_4V;
+                                  VEX_4V, VEX_WIG;
   defm VPMINUD   : SS48I_binop_rm<0x3B, "vpminud", umin, v4i32, VR128,
                                   loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
-                                  VEX_4V;
+                                  VEX_4V, VEX_WIG;
   defm VPMAXSD   : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v4i32, VR128,
                                   loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
-                                  VEX_4V;
+                                  VEX_4V, VEX_WIG;
   defm VPMAXUD   : SS48I_binop_rm<0x3F, "vpmaxud", umax, v4i32, VR128,
                                   loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
-                                  VEX_4V;
+                                  VEX_4V, VEX_WIG;
   defm VPMULDQ   : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v2i64, v4i32,
                                    VR128, loadv2i64, i128mem,
-                                   SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V;
+                                   SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_WIG;
 }
 let Predicates = [HasAVX, NoVLX_Or_NoBWI] in {
   defm VPMINSB   : SS48I_binop_rm<0x38, "vpminsb", smin, v16i8, VR128,
                                   loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
-                                  VEX_4V;
+                                  VEX_4V, VEX_WIG;
   defm VPMINUW   : SS48I_binop_rm<0x3A, "vpminuw", umin, v8i16, VR128,
                                   loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
-                                  VEX_4V;
+                                  VEX_4V, VEX_WIG;
   defm VPMAXSB   : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v16i8, VR128,
                                   loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
-                                  VEX_4V;
+                                  VEX_4V, VEX_WIG;
   defm VPMAXUW   : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v8i16, VR128,
                                   loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
-                                  VEX_4V;
+                                  VEX_4V, VEX_WIG;
 }
 
 let Predicates = [HasAVX2, NoVLX] in {
   defm VPMINSDY  : SS48I_binop_rm<0x39, "vpminsd", smin, v8i32, VR256,
                                   loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
-                                  VEX_4V, VEX_L;
+                                  VEX_4V, VEX_L, VEX_WIG;
   defm VPMINUDY  : SS48I_binop_rm<0x3B, "vpminud", umin, v8i32, VR256,
                                   loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
-                                  VEX_4V, VEX_L;
+                                  VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXSDY  : SS48I_binop_rm<0x3D, "vpmaxsd", smax, v8i32, VR256,
                                   loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
-                                  VEX_4V, VEX_L;
+                                  VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXUDY  : SS48I_binop_rm<0x3F, "vpmaxud", umax, v8i32, VR256,
                                   loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
-                                  VEX_4V, VEX_L;
+                                  VEX_4V, VEX_L, VEX_WIG;
   defm VPMULDQY : SS48I_binop_rm2<0x28, "vpmuldq", X86pmuldq, v4i64, v8i32,
                                   VR256, loadv4i64, i256mem,
-                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L;
+                                  SSE_INTMUL_ITINS_P, 1, 0>, VEX_4V, VEX_L, VEX_WIG;
 }
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
   defm VPMINSBY  : SS48I_binop_rm<0x38, "vpminsb", smin, v32i8, VR256,
                                   loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
-                                  VEX_4V, VEX_L;
+                                  VEX_4V, VEX_L, VEX_WIG;
   defm VPMINUWY  : SS48I_binop_rm<0x3A, "vpminuw", umin, v16i16, VR256,
                                   loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
-                                  VEX_4V, VEX_L;
+                                  VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXSBY  : SS48I_binop_rm<0x3C, "vpmaxsb", smax, v32i8, VR256,
                                   loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
-                                  VEX_4V, VEX_L;
+                                  VEX_4V, VEX_L, VEX_WIG;
   defm VPMAXUWY  : SS48I_binop_rm<0x3E, "vpmaxuw", umax, v16i16, VR256,
                                   loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
-                                  VEX_4V, VEX_L;
+                                  VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -6846,18 +6718,18 @@ let Constraints = "$src1 = $dst" in {
 let Predicates = [HasAVX, NoVLX] in {
   defm VPMULLD  : SS48I_binop_rm<0x40, "vpmulld", mul, v4i32, VR128,
                                  loadv2i64, i128mem, 0, SSE_PMULLD_ITINS>,
-                                 VEX_4V;
+                                 VEX_4V, VEX_WIG;
   defm VPCMPEQQ : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v2i64, VR128,
                                  loadv2i64, i128mem, 0, SSE_INTALU_ITINS_P>,
-                                 VEX_4V;
+                                 VEX_4V, VEX_WIG;
 }
 let Predicates = [HasAVX2] in {
   defm VPMULLDY  : SS48I_binop_rm<0x40, "vpmulld", mul, v8i32, VR256,
                                   loadv4i64, i256mem, 0, SSE_PMULLD_ITINS>,
-                                  VEX_4V, VEX_L;
+                                  VEX_4V, VEX_L, VEX_WIG;
   defm VPCMPEQQY : SS48I_binop_rm<0x29, "vpcmpeqq", X86pcmpeq, v4i64, VR256,
                                   loadv4i64, i256mem, 0, SSE_INTALU_ITINS_P>,
-                                  VEX_4V, VEX_L;
+                                  VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -6927,52 +6799,52 @@ let Predicates = [HasAVX] in {
   let isCommutable = 0 in {
     defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw,
                                         VR128, loadv2i64, i128mem, 0,
-                                        DEFAULT_ITINS_MPSADSCHED>, VEX_4V;
+                                        DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_WIG;
   }
 
   let ExeDomain = SSEPackedSingle in {
   defm VBLENDPS : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v4f32,
                                   VR128, loadv4f32, f128mem, 0,
-                                  DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
+                                  DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG;
   defm VBLENDPSY : SS41I_binop_rmi<0x0C, "vblendps", X86Blendi, v8f32,
                                    VR256, loadv8f32, f256mem, 0,
-                                   DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
+                                   DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
   }
   let ExeDomain = SSEPackedDouble in {
   defm VBLENDPD : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v2f64,
                                   VR128, loadv2f64, f128mem, 0,
-                                  DEFAULT_ITINS_FBLENDSCHED>, VEX_4V;
+                                  DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_WIG;
   defm VBLENDPDY : SS41I_binop_rmi<0x0D, "vblendpd", X86Blendi, v4f64,
                                    VR256, loadv4f64, f256mem, 0,
-                                   DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L;
+                                   DEFAULT_ITINS_FBLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
   }
   defm VPBLENDW : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v8i16,
                                   VR128, loadv2i64, i128mem, 0,
-                                  DEFAULT_ITINS_BLENDSCHED>, VEX_4V;
+                                  DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_WIG;
 
   let ExeDomain = SSEPackedSingle in
   defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps,
                                    VR128, loadv4f32, f128mem, 0,
-                                   SSE_DPPS_ITINS>, VEX_4V;
+                                   SSE_DPPS_ITINS>, VEX_4V, VEX_WIG;
   let ExeDomain = SSEPackedDouble in
   defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd,
                                    VR128, loadv2f64, f128mem, 0,
-                                   SSE_DPPS_ITINS>, VEX_4V;
+                                   SSE_DPPS_ITINS>, VEX_4V, VEX_WIG;
   let ExeDomain = SSEPackedSingle in
   defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256,
                                     VR256, loadv8f32, i256mem, 0,
-                                    SSE_DPPS_ITINS>, VEX_4V, VEX_L;
+                                    SSE_DPPS_ITINS>, VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Predicates = [HasAVX2] in {
   let isCommutable = 0 in {
   defm VMPSADBWY : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_avx2_mpsadbw,
                                   VR256, loadv4i64, i256mem, 0,
-                                  DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L;
+                                  DEFAULT_ITINS_MPSADSCHED>, VEX_4V, VEX_L, VEX_WIG;
   }
   defm VPBLENDWY : SS41I_binop_rmi<0x0E, "vpblendw", X86Blendi, v16i16,
                                    VR256, loadv4i64, i256mem, 0,
-                                   DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L;
+                                   DEFAULT_ITINS_BLENDSCHED>, VEX_4V, VEX_L, VEX_WIG;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -7002,6 +6874,19 @@ let Constraints = "$src1 = $dst" in {
                                   SSE_DPPD_ITINS>;
 }
 
+// For insertion into the zero index (low half) of a 256-bit vector, it is
+// more efficient to generate a blend with immediate instead of an insert*128.
+let Predicates = [HasAVX] in {
+def : Pat<(insert_subvector (v4f64 VR256:$src1), (v2f64 VR128:$src2), (iPTR 0)),
+          (VBLENDPDYrri VR256:$src1,
+                        (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0x3)>;
+def : Pat<(insert_subvector (v8f32 VR256:$src1), (v4f32 VR128:$src2), (iPTR 0)),
+          (VBLENDPSYrri VR256:$src1,
+                        (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0xf)>;
+}
+
 /// SS41I_quaternary_int_avx - AVX SSE 4.1 with 4 operators
 multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr,
                                     RegisterClass RC, X86MemOperand x86memop,
@@ -7147,14 +7032,14 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in {
     def rr0 : SS48I<opc, MRMSrcReg, (outs VR128:$dst),
                     (ins VR128:$src1, VR128:$src2),
                     !strconcat(OpcodeStr,
-                     "\t{$src2, $dst|$dst, $src2}"),
+                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                     [(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0))],
                     itins.rr>, Sched<[itins.Sched]>;
 
     def rm0 : SS48I<opc, MRMSrcMem, (outs VR128:$dst),
                     (ins VR128:$src1, x86memop:$src2),
                     !strconcat(OpcodeStr,
-                     "\t{$src2, $dst|$dst, $src2}"),
+                     "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
                     [(set VR128:$dst,
                       (IntId VR128:$src1,
                        (bitconvert (mem_frag addr:$src2)), XMM0))],
@@ -7175,18 +7060,18 @@ defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", memopv2i64, i128mem,
                                   DEFAULT_ITINS_VARBLENDSCHED>;
 
 // Aliases with the implicit xmm0 argument
-def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
-                (BLENDVPDrr0 VR128:$dst, VR128:$src2)>;
-def : InstAlias<"blendvpd\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
-                (BLENDVPDrm0 VR128:$dst, f128mem:$src2)>;
-def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
-                (BLENDVPSrr0 VR128:$dst, VR128:$src2)>;
-def : InstAlias<"blendvps\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
-                (BLENDVPSrm0 VR128:$dst, f128mem:$src2)>;
-def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
-                (PBLENDVBrr0 VR128:$dst, VR128:$src2)>;
-def : InstAlias<"pblendvb\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
-                (PBLENDVBrm0 VR128:$dst, i128mem:$src2)>;
+def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
+                (BLENDVPDrr0 VR128:$dst, VR128:$src2), 0>;
+def : InstAlias<"blendvpd\t{$src2, $dst|$dst, $src2}",
+                (BLENDVPDrm0 VR128:$dst, f128mem:$src2), 0>;
+def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
+                (BLENDVPSrr0 VR128:$dst, VR128:$src2), 0>;
+def : InstAlias<"blendvps\t{$src2, $dst|$dst, $src2}",
+                (BLENDVPSrm0 VR128:$dst, f128mem:$src2), 0>;
+def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
+                (PBLENDVBrr0 VR128:$dst, VR128:$src2), 0>;
+def : InstAlias<"pblendvb\t{$src2, $dst|$dst, $src2}",
+                (PBLENDVBrm0 VR128:$dst, i128mem:$src2), 0>;
 
 let Predicates = [UseSSE41] in {
   def : Pat<(v16i8 (vselect (v16i8 XMM0), (v16i8 VR128:$src1),
@@ -7212,12 +7097,12 @@ let Predicates = [HasAVX, NoVLX] in
 def VMOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "vmovntdqa\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>,
-                       VEX;
+                       VEX, VEX_WIG;
 let Predicates = [HasAVX2, NoVLX] in
 def VMOVNTDQAYrm : SS48I<0x2A, MRMSrcMem, (outs VR256:$dst), (ins i256mem:$src),
                          "vmovntdqa\t{$src, $dst|$dst, $src}",
                          [(set VR256:$dst, (int_x86_avx2_movntdqa addr:$src))]>,
-                         VEX, VEX_L;
+                         VEX, VEX_L, VEX_WIG;
 def MOVNTDQArm : SS48I<0x2A, MRMSrcMem, (outs VR128:$dst), (ins i128mem:$src),
                        "movntdqa\t{$src, $dst|$dst, $src}",
                        [(set VR128:$dst, (int_x86_sse41_movntdqa addr:$src))]>;
@@ -7277,11 +7162,11 @@ multiclass SS42I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
 
 let Predicates = [HasAVX] in
   defm VPCMPGTQ : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v2i64, VR128,
-                                 loadv2i64, i128mem, 0>, VEX_4V;
+                                 loadv2i64, i128mem, 0>, VEX_4V, VEX_WIG;
 
 let Predicates = [HasAVX2] in
   defm VPCMPGTQY : SS42I_binop_rm<0x37, "vpcmpgtq", X86pcmpgt, v4i64, VR256,
-                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L;
+                                  loadv4i64, i256mem, 0>, VEX_4V, VEX_L, VEX_WIG;
 
 let Constraints = "$src1 = $dst" in
   defm PCMPGTQ : SS42I_binop_rm<0x37, "pcmpgtq", X86pcmpgt, v2i64, VR128,
@@ -7305,7 +7190,7 @@ multiclass pseudo_pcmpistrm<string asm, PatFrag ld_frag> {
 
 let Defs = [EFLAGS], usesCustomInserter = 1 in {
   defm VPCMPISTRM128 : pseudo_pcmpistrm<"#VPCMPISTRM128", loadv2i64>,
-                         Requires<[HasAVX]>;
+                         Requires<[HasAVX]>, VEX_WIG;
   defm PCMPISTRM128 : pseudo_pcmpistrm<"#PCMPISTRM128", memopv2i64>,
                          Requires<[UseSSE42]>;
 }
@@ -7379,7 +7264,7 @@ multiclass pseudo_pcmpistri<string asm, PatFrag ld_frag> {
 
 let Defs = [EFLAGS], usesCustomInserter = 1 in {
   defm VPCMPISTRI : pseudo_pcmpistri<"#VPCMPISTRI", loadv2i64>,
-                      Requires<[HasAVX]>;
+                      Requires<[HasAVX]>, VEX_WIG;
   defm PCMPISTRI  : pseudo_pcmpistri<"#PCMPISTRI", memopv2i64>,
                       Requires<[UseSSE42]>;
 }
@@ -7497,14 +7382,18 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
                       bit UsesXMM0 = 0> {
   def rr : I<Opc, MRMSrcReg, (outs VR128:$dst),
              (ins VR128:$src1, VR128:$src2),
-             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+             !if(UsesXMM0,
+                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
+                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
              [!if(UsesXMM0,
                   (set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
                   (set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>, T8;
 
   def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
              (ins VR128:$src1, i128mem:$src2),
-             !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
+             !if(UsesXMM0,
+                 !strconcat(OpcodeStr, "\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}"),
+                 !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}")),
              [!if(UsesXMM0,
                   (set VR128:$dst, (IntId VR128:$src1,
                     (bc_v4i32 (memopv2i64 addr:$src2)), XMM0)),
@@ -7539,10 +7428,10 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
 }
 
 // Aliases with explicit %xmm0
-def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
-                (SHA256RNDS2rr VR128:$dst, VR128:$src2)>;
-def : InstAlias<"sha256rnds2\t{%xmm0, $src2, $dst|$dst, $src2, xmm0}",
-                (SHA256RNDS2rm VR128:$dst, i128mem:$src2)>;
+def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
+                (SHA256RNDS2rr VR128:$dst, VR128:$src2), 0>;
+def : InstAlias<"sha256rnds2\t{$src2, $dst|$dst, $src2}",
+                (SHA256RNDS2rm VR128:$dst, i128mem:$src2), 0>;
 
 //===----------------------------------------------------------------------===//
 // AES-NI Instructions
@@ -7570,13 +7459,13 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr, Intrinsic IntId128,
 // Perform One Round of an AES Encryption/Decryption Flow
 let Predicates = [HasAVX, HasAES] in {
   defm VAESENC          : AESI_binop_rm_int<0xDC, "vaesenc",
-                         int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V;
+                         int_x86_aesni_aesenc, loadv2i64, 0>, VEX_4V, VEX_WIG;
   defm VAESENCLAST      : AESI_binop_rm_int<0xDD, "vaesenclast",
-                         int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V;
+                         int_x86_aesni_aesenclast, loadv2i64, 0>, VEX_4V, VEX_WIG;
   defm VAESDEC          : AESI_binop_rm_int<0xDE, "vaesdec",
-                         int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V;
+                         int_x86_aesni_aesdec, loadv2i64, 0>, VEX_4V, VEX_WIG;
   defm VAESDECLAST      : AESI_binop_rm_int<0xDF, "vaesdeclast",
-                         int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V;
+                         int_x86_aesni_aesdeclast, loadv2i64, 0>, VEX_4V, VEX_WIG;
 }
 
 let Constraints = "$src1 = $dst" in {
@@ -7597,12 +7486,12 @@ let Predicates = [HasAVX, HasAES] in {
       "vaesimc\t{$src1, $dst|$dst, $src1}",
       [(set VR128:$dst,
         (int_x86_aesni_aesimc VR128:$src1))]>, Sched<[WriteAESIMC]>,
-      VEX;
+      VEX, VEX_WIG;
   def VAESIMCrm : AES8I<0xDB, MRMSrcMem, (outs VR128:$dst),
       (ins i128mem:$src1),
       "vaesimc\t{$src1, $dst|$dst, $src1}",
       [(set VR128:$dst, (int_x86_aesni_aesimc (loadv2i64 addr:$src1)))]>,
-      Sched<[WriteAESIMCLd]>, VEX;
+      Sched<[WriteAESIMCLd]>, VEX, VEX_WIG;
 }
 def AESIMCrr : AES8I<0xDB, MRMSrcReg, (outs VR128:$dst),
   (ins VR128:$src1),
@@ -7622,13 +7511,13 @@ let Predicates = [HasAVX, HasAES] in {
       "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
       [(set VR128:$dst,
         (int_x86_aesni_aeskeygenassist VR128:$src1, imm:$src2))]>,
-      Sched<[WriteAESKeyGen]>, VEX;
+      Sched<[WriteAESKeyGen]>, VEX, VEX_WIG;
   def VAESKEYGENASSIST128rm : AESAI<0xDF, MRMSrcMem, (outs VR128:$dst),
       (ins i128mem:$src1, u8imm:$src2),
       "vaeskeygenassist\t{$src2, $src1, $dst|$dst, $src1, $src2}",
       [(set VR128:$dst,
         (int_x86_aesni_aeskeygenassist (loadv2i64 addr:$src1), imm:$src2))]>,
-      Sched<[WriteAESKeyGenLd]>, VEX;
+      Sched<[WriteAESKeyGenLd]>, VEX, VEX_WIG;
 }
 def AESKEYGENASSIST128rr : AESAI<0xDF, MRMSrcReg, (outs VR128:$dst),
   (ins VR128:$src1, u8imm:$src2),
@@ -7654,14 +7543,14 @@ def VPCLMULQDQrr : AVXPCLMULIi8<0x44, MRMSrcReg, (outs VR128:$dst),
            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
            [(set VR128:$dst,
              (int_x86_pclmulqdq VR128:$src1, VR128:$src2, imm:$src3))]>,
-           Sched<[WriteCLMul]>;
+           Sched<[WriteCLMul]>, VEX_WIG;
 
 def VPCLMULQDQrm : AVXPCLMULIi8<0x44, MRMSrcMem, (outs VR128:$dst),
            (ins VR128:$src1, i128mem:$src2, u8imm:$src3),
            "vpclmulqdq\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}",
            [(set VR128:$dst, (int_x86_pclmulqdq VR128:$src1,
                               (loadv2i64 addr:$src2), imm:$src3))]>,
-           Sched<[WriteCLMulLd, ReadAfterLd]>;
+           Sched<[WriteCLMulLd, ReadAfterLd]>, VEX_WIG;
 
 // Carry-less Multiplication instructions
 let Constraints = "$src1 = $dst" in {
@@ -8020,41 +7909,6 @@ let ExeDomain = SSEPackedDouble in {
                                loadv4i64, v4f64, v4i64>, VEX_L;
 }
 
-let Predicates = [HasAVX, NoVLX] in {
-def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (v8i32 VR256:$src2))),
-          (VPERMILPSYrr VR256:$src1, VR256:$src2)>;
-def : Pat<(v8f32 (X86VPermilpv VR256:$src1, (bc_v8i32 (loadv4i64 addr:$src2)))),
-          (VPERMILPSYrm VR256:$src1, addr:$src2)>;
-def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (v4i64 VR256:$src2))),
-          (VPERMILPDYrr VR256:$src1, VR256:$src2)>;
-def : Pat<(v4f64 (X86VPermilpv VR256:$src1, (loadv4i64 addr:$src2))),
-          (VPERMILPDYrm VR256:$src1, addr:$src2)>;
-
-def : Pat<(v8i32 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
-          (VPERMILPSYri VR256:$src1, imm:$imm)>;
-def : Pat<(v4i64 (X86VPermilpi VR256:$src1, (i8 imm:$imm))),
-          (VPERMILPDYri VR256:$src1, imm:$imm)>;
-def : Pat<(v8i32 (X86VPermilpi (bc_v8i32 (loadv4i64 addr:$src1)),
-                               (i8 imm:$imm))),
-          (VPERMILPSYmi addr:$src1, imm:$imm)>;
-def : Pat<(v4i64 (X86VPermilpi (loadv4i64 addr:$src1), (i8 imm:$imm))),
-          (VPERMILPDYmi addr:$src1, imm:$imm)>;
-
-def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (v4i32 VR128:$src2))),
-          (VPERMILPSrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v4f32 (X86VPermilpv VR128:$src1, (bc_v4i32 (loadv2i64 addr:$src2)))),
-          (VPERMILPSrm VR128:$src1, addr:$src2)>;
-def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (v2i64 VR128:$src2))),
-          (VPERMILPDrr VR128:$src1, VR128:$src2)>;
-def : Pat<(v2f64 (X86VPermilpv VR128:$src1, (loadv2i64 addr:$src2))),
-          (VPERMILPDrm VR128:$src1, addr:$src2)>;
-
-def : Pat<(v2i64 (X86VPermilpi VR128:$src1, (i8 imm:$imm))),
-          (VPERMILPDri VR128:$src1, imm:$imm)>;
-def : Pat<(v2i64 (X86VPermilpi (loadv2i64 addr:$src1), (i8 imm:$imm))),
-          (VPERMILPDmi addr:$src1, imm:$imm)>;
-}
-
 //===----------------------------------------------------------------------===//
 // VPERM2F128 - Permute Floating-Point Values in 128-bit chunks
 //
@@ -8109,15 +7963,16 @@ def : Pat<(v16i16 (X86VPerm2x128 VR256:$src1,
 //===----------------------------------------------------------------------===//
 // VZERO - Zero YMM registers
 //
+// Note, these instructions do not affect YMM16-YMM31.
 let Defs = [YMM0, YMM1, YMM2, YMM3, YMM4, YMM5, YMM6, YMM7,
             YMM8, YMM9, YMM10, YMM11, YMM12, YMM13, YMM14, YMM15] in {
   // Zero All YMM registers
   def VZEROALL : I<0x77, RawFrm, (outs), (ins), "vzeroall",
-                  [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>;
+                  [(int_x86_avx_vzeroall)]>, PS, VEX, VEX_L, Requires<[HasAVX]>, VEX_WIG;
 
   // Zero Upper bits of YMM registers
   def VZEROUPPER : I<0x77, RawFrm, (outs), (ins), "vzeroupper",
-                     [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>;
+                     [(int_x86_avx_vzeroupper)]>, PS, VEX, Requires<[HasAVX]>, VEX_WIG;
 }
 
 //===----------------------------------------------------------------------===//
@@ -8226,6 +8081,46 @@ defm VPBLENDD : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v4i32,
 defm VPBLENDDY : AVX2_binop_rmi<0x02, "vpblendd", X86Blendi, v8i32,
                                 VR256, loadv4i64, i256mem>, VEX_L;
 
+// For insertion into the zero index (low half) of a 256-bit vector, it is
+// more efficient to generate a blend with immediate instead of an insert*128.
+let Predicates = [HasAVX2] in {
+def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
+          (VPBLENDDYrri VR256:$src1,
+                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
+          (VPBLENDDYrri VR256:$src1,
+                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
+          (VPBLENDDYrri VR256:$src1,
+                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
+          (VPBLENDDYrri VR256:$src1,
+                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0xf)>;
+}
+
+let Predicates = [HasAVX1Only] in {
+def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
+          (VBLENDPSYrri VR256:$src1,
+                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v4i64 VR256:$src1), (v2i64 VR128:$src2), (iPTR 0)),
+          (VBLENDPSYrri VR256:$src1,
+                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v16i16 VR256:$src1), (v8i16 VR128:$src2), (iPTR 0)),
+          (VBLENDPSYrri VR256:$src1,
+                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0xf)>;
+def : Pat<(insert_subvector (v32i8 VR256:$src1), (v16i8 VR128:$src2), (iPTR 0)),
+          (VBLENDPSYrri VR256:$src1,
+                        (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)),
+                                       VR128:$src2, sub_xmm), 0xf)>;
+}
+
 //===----------------------------------------------------------------------===//
 // VPBROADCAST - Load from memory and broadcast to all elements of the
 //               destination operand
@@ -8273,6 +8168,11 @@ defm VPBROADCASTQ  : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64,
                                     v2i64, v4i64, NoVLX>;
 
 let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
+  // 32-bit targets will fail to load an i64 directly but can use ZEXT_LOAD.
+  def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload addr:$src)))),
+            (VPBROADCASTQrm addr:$src)>;
+  def : Pat<(v4i64 (X86VBroadcast (v4i64 (X86vzload addr:$src)))),
+            (VPBROADCASTQYrm addr:$src)>;
   // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
   // This means we'll encounter truncated i32 loads; match that here.
   def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
@@ -8334,18 +8234,13 @@ let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
 }
 let Predicates = [HasAVX2, NoVLX] in {
   def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
-            (VBROADCASTSSrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
+            (VPBROADCASTDrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
   def : Pat<(v8i32 (X86VBroadcast GR32:$src)),
-            (VBROADCASTSSYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
-  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
-            (VBROADCASTSDYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
-
-  // The patterns for VPBROADCASTD are not needed because they would match
-  // the exact same thing as VBROADCASTSS patterns.
-
+            (VPBROADCASTDYrr (COPY_TO_REGCLASS GR32:$src, VR128))>;
   def : Pat<(v2i64 (X86VBroadcast GR64:$src)),
-        (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
-  // The v4i64 pattern is not needed because VBROADCASTSDYrr already match.
+            (VPBROADCASTQrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+  def : Pat<(v4i64 (X86VBroadcast GR64:$src)),
+            (VPBROADCASTQYrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
 }
 
 // AVX1 broadcast patterns
@@ -8368,15 +8263,15 @@ let Predicates = [HasAVX, NoVLX] in {
 
 let Predicates = [HasAVX1Only] in {
   def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
-            (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
+            (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0)>;
   def : Pat<(v8f32 (X86VBroadcast FR32:$src)),
             (VINSERTF128rr (INSERT_SUBREG (v8f32 (IMPLICIT_DEF)),
-              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
-              (VPSHUFDri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
+              (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0), sub_xmm),
+              (VPERMILPSri (COPY_TO_REGCLASS FR32:$src, VR128), 0), 1)>;
   def : Pat<(v4f64 (X86VBroadcast FR64:$src)),
             (VINSERTF128rr (INSERT_SUBREG (v4f64 (IMPLICIT_DEF)),
-              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), sub_xmm),
-              (VPSHUFDri (COPY_TO_REGCLASS FR64:$src, VR128), 0x44), 1)>;
+              (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128)), sub_xmm),
+              (VMOVDDUPrr (COPY_TO_REGCLASS FR64:$src, VR128)), 1)>;
 
   def : Pat<(v4i32 (X86VBroadcast GR32:$src)),
             (VPSHUFDri (COPY_TO_REGCLASS GR32:$src, VR128), 0)>;
@@ -8390,7 +8285,7 @@ let Predicates = [HasAVX1Only] in {
               (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44), 1)>;
 
   def : Pat<(v2i64 (X86VBroadcast i64:$src)),
-              (VMOVDDUPrr (COPY_TO_REGCLASS GR64:$src, VR128))>;
+            (VPSHUFDri (COPY_TO_REGCLASS GR64:$src, VR128), 0x44)>;
 }
 
 //===----------------------------------------------------------------------===//
@@ -8398,7 +8293,8 @@ let Predicates = [HasAVX1Only] in {
 //
 
 multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
-                     ValueType OpVT, X86FoldableSchedWrite Sched> {
+                     ValueType OpVT, X86FoldableSchedWrite Sched,
+                     X86MemOperand memOp> {
   let Predicates = [HasAVX2, NoVLX] in {
     def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst),
                      (ins VR256:$src1, VR256:$src2),
@@ -8408,7 +8304,7 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                        (OpVT (X86VPermv VR256:$src1, VR256:$src2)))]>,
                      Sched<[Sched]>, VEX_4V, VEX_L;
     def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst),
-                     (ins VR256:$src1, i256mem:$src2),
+                     (ins VR256:$src1, memOp:$src2),
                      !strconcat(OpcodeStr,
                          "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                      [(set VR256:$dst,
@@ -8418,12 +8314,15 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
   }
 }
 
-defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256>;
+defm VPERMD : avx2_perm<0x36, "vpermd", loadv4i64, v8i32, WriteShuffle256,
+                        i256mem>;
 let ExeDomain = SSEPackedSingle in
-defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256>;
+defm VPERMPS : avx2_perm<0x16, "vpermps", loadv8f32, v8f32, WriteFShuffle256,
+                        f256mem>;
 
 multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
-                         ValueType OpVT, X86FoldableSchedWrite Sched> {
+                         ValueType OpVT, X86FoldableSchedWrite Sched,
+                         X86MemOperand memOp> {
   let Predicates = [HasAVX2, NoVLX] in {
     def Yri : AVX2AIi8<opc, MRMSrcReg, (outs VR256:$dst),
                        (ins VR256:$src1, u8imm:$src2),
@@ -8433,7 +8332,7 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
                          (OpVT (X86VPermi VR256:$src1, (i8 imm:$src2))))]>,
                        Sched<[Sched]>, VEX, VEX_L;
     def Ymi : AVX2AIi8<opc, MRMSrcMem, (outs VR256:$dst),
-                       (ins i256mem:$src1, u8imm:$src2),
+                       (ins memOp:$src1, u8imm:$src2),
                        !strconcat(OpcodeStr,
                            "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
                        [(set VR256:$dst,
@@ -8444,10 +8343,10 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag,
 }
 
 defm VPERMQ : avx2_perm_imm<0x00, "vpermq", loadv4i64, v4i64,
-                            WriteShuffle256>, VEX_W;
+                            WriteShuffle256, i256mem>, VEX_W;
 let ExeDomain = SSEPackedDouble in
 defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", loadv4f64, v4f64,
-                             WriteFShuffle256>, VEX_W;
+                             WriteFShuffle256, f256mem>, VEX_W;
 
 //===----------------------------------------------------------------------===//
 // VPERM2I128 - Permute Floating-Point Values in 128-bit chunks
diff --git a/lib/Target/X86/X86InstrShiftRotate.td b/lib/Target/X86/X86InstrShiftRotate.td
index b7b77e73b436bf649d80f4a8d97d39e108371391..b21f0b923da8d476d29c1c5424ffb2e02052dd09 100644
--- a/lib/Target/X86/X86InstrShiftRotate.td
+++ b/lib/Target/X86/X86InstrShiftRotate.td
@@ -662,19 +662,19 @@ def ROR64mi  : RIi8<0xC1, MRM1m, (outs), (ins i64mem:$dst, u8imm:$src),
 // Rotate by 1
 def ROR8m1   : I<0xD0, MRM1m, (outs), (ins i8mem :$dst),
                  "ror{b}\t$dst",
-               [(store (rotr (loadi8 addr:$dst), (i8 1)), addr:$dst)],
+               [(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst)],
                IIC_SR>;
 def ROR16m1  : I<0xD1, MRM1m, (outs), (ins i16mem:$dst),
                  "ror{w}\t$dst",
-              [(store (rotr (loadi16 addr:$dst), (i8 1)), addr:$dst)],
+              [(store (rotl (loadi16 addr:$dst), (i8 15)), addr:$dst)],
               IIC_SR>, OpSize16;
 def ROR32m1  : I<0xD1, MRM1m, (outs), (ins i32mem:$dst),
                  "ror{l}\t$dst",
-              [(store (rotr (loadi32 addr:$dst), (i8 1)), addr:$dst)],
+              [(store (rotl (loadi32 addr:$dst), (i8 31)), addr:$dst)],
               IIC_SR>, OpSize32;
 def ROR64m1  : RI<0xD1, MRM1m, (outs), (ins i64mem:$dst),
                  "ror{q}\t$dst",
-               [(store (rotr (loadi64 addr:$dst), (i8 1)), addr:$dst)],
+               [(store (rotl (loadi64 addr:$dst), (i8 63)), addr:$dst)],
                IIC_SR>;
 } // SchedRW
 
@@ -846,6 +846,15 @@ def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
 
 } // Defs = [EFLAGS]
 
+// Sandy Bridge and newer Intel processors support faster rotates using
+// SHLD to avoid a partial flag update on the normal rotate instructions.
+let Predicates = [HasFastSHLDRotate], AddedComplexity = 5 in {
+  def : Pat<(rotl GR32:$src, (i8 imm:$shamt)),
+            (SHLD32rri8 GR32:$src, GR32:$src, imm:$shamt)>;
+  def : Pat<(rotl GR64:$src, (i8 imm:$shamt)),
+            (SHLD64rri8 GR64:$src, GR64:$src, imm:$shamt)>;
+}
+
 def ROT32L2R_imm8  : SDNodeXForm<imm, [{
   // Convert a ROTL shamt to a ROTR shamt on 32-bit integer.
   return getI8Imm(32 - N->getZExtValue(), SDLoc(N));
diff --git a/lib/Target/X86/X86InstrTSX.td b/lib/Target/X86/X86InstrTSX.td
index 7267d752653e33eaf9cbb4b6d5cc792135ba5931..38ac8be9448323ea047d5703fea4e119431e8279 100644
--- a/lib/Target/X86/X86InstrTSX.td
+++ b/lib/Target/X86/X86InstrTSX.td
@@ -25,9 +25,9 @@ def XBEGIN : I<0, Pseudo, (outs GR32:$dst), (ins),
 
 let isBranch = 1, isTerminator = 1, Defs = [EAX] in {
 def XBEGIN_2 : Ii16PCRel<0xc7, MRM_F8, (outs), (ins brtarget16:$dst),
-                         "xbegin\t$dst", []>, OpSize16, Requires<[HasRTM]>;
+                         "xbegin\t$dst", []>, OpSize16;
 def XBEGIN_4 : Ii32PCRel<0xc7, MRM_F8, (outs), (ins brtarget32:$dst),
-                         "xbegin\t$dst", []>, OpSize32, Requires<[HasRTM]>;
+                         "xbegin\t$dst", []>, OpSize32;
 }
 
 def XEND : I<0x01, MRM_D5, (outs), (ins),
@@ -35,7 +35,7 @@ def XEND : I<0x01, MRM_D5, (outs), (ins),
 
 let Defs = [EFLAGS] in
 def XTEST : I<0x01, MRM_D6, (outs), (ins),
-              "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasTSX]>;
+              "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasRTM]>;
 
 def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
                  "xabort\t$imm",
@@ -44,7 +44,7 @@ def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
 // HLE prefixes
 
 let isAsmParserOnly = 1 in {
-def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>, Requires<[HasHLE]>;
-def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>, Requires<[HasHLE]>;
+def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>;
+def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>;
 }
 
diff --git a/lib/Target/X86/X86InstrTablesInfo.h b/lib/Target/X86/X86InstrTablesInfo.h
deleted file mode 100755
index 09e635c9dff2e701357b82a72d2b05b0cdc323ee..0000000000000000000000000000000000000000
--- a/lib/Target/X86/X86InstrTablesInfo.h
+++ /dev/null
@@ -1,1159 +0,0 @@
-//===-- X86InstrTablesInfo.h - X86 Instruction Tables -----------*- C++ -*-===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// This file contains related X86 Instruction Information Tables.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_X86_X86INSTRTABLESINFO_H
-#define LLVM_LIB_TARGET_X86_X86INSTRTABLESINFO_H
-
-using namespace llvm;
-
-struct X86EvexToVexCompressTableEntry {
-  uint16_t EvexOpcode;
-  uint16_t VexOpcode;
-};
-
-
-
-// X86 EVEX encoded instructions that have a VEX 128 encoding
-// (table format: <EVEX opcode, VEX-128 opcode>).
-static const X86EvexToVexCompressTableEntry X86EvexToVex128CompressTable[] = {
-  // EVEX scalar with corresponding VEX.
-  { X86::Int_VCOMISDZrm         ,  X86::Int_VCOMISDrm            },
-  { X86::Int_VCOMISDZrr         ,  X86::Int_VCOMISDrr            },
-  { X86::Int_VCOMISSZrm         ,  X86::Int_VCOMISSrm            },
-  { X86::Int_VCOMISSZrr         ,  X86::Int_VCOMISSrr            },
-  { X86::Int_VUCOMISDZrm        ,  X86::Int_VUCOMISDrm           },
-  { X86::Int_VUCOMISDZrr        ,  X86::Int_VUCOMISDrr           },
-  { X86::Int_VUCOMISSZrm        ,  X86::Int_VUCOMISSrm           },
-  { X86::Int_VUCOMISSZrr        ,  X86::Int_VUCOMISSrr           },
-  { X86::VADDSDZrm              ,  X86::VADDSDrm                 },
-  { X86::VADDSDZrm_Int          ,  X86::VADDSDrm_Int             },
-  { X86::VADDSDZrr              ,  X86::VADDSDrr                 },
-  { X86::VADDSDZrr_Int          ,  X86::VADDSDrr_Int             },
-  { X86::VADDSSZrm              ,  X86::VADDSSrm                 },
-  { X86::VADDSSZrm_Int          ,  X86::VADDSSrm_Int             },
-  { X86::VADDSSZrr              ,  X86::VADDSSrr                 },
-  { X86::VADDSSZrr_Int          ,  X86::VADDSSrr_Int             },
-  { X86::VCOMISDZrm             ,  X86::VCOMISDrm                },
-  { X86::VCOMISDZrr             ,  X86::VCOMISDrr                },
-  { X86::VCOMISSZrm             ,  X86::VCOMISSrm                },
-  { X86::VCOMISSZrr             ,  X86::VCOMISSrr                },
-  { X86::VCVTSD2SI64Zrm         ,  X86::VCVTSD2SI64rm            },
-  { X86::VCVTSD2SI64Zrr         ,  X86::VCVTSD2SI64rr            },
-  { X86::VCVTSD2SIZrm           ,  X86::VCVTSD2SIrm              },
-  { X86::VCVTSD2SIZrr           ,  X86::VCVTSD2SIrr              },
-  { X86::VCVTSD2SSZrm           ,  X86::VCVTSD2SSrm              },
-  { X86::VCVTSD2SSZrr           ,  X86::VCVTSD2SSrr              },
-  { X86::VCVTSI2SDZrm           ,  X86::VCVTSI2SDrm              },
-  { X86::VCVTSI2SDZrm_Int       ,  X86::Int_VCVTSI2SDrm          },
-  { X86::VCVTSI2SDZrr           ,  X86::VCVTSI2SDrr              },
-  { X86::VCVTSI2SDZrr_Int       ,  X86::Int_VCVTSI2SDrr          },
-  { X86::VCVTSI2SSZrm           ,  X86::VCVTSI2SSrm              },
-  { X86::VCVTSI2SSZrm_Int       ,  X86::Int_VCVTSI2SSrm          },
-  { X86::VCVTSI2SSZrr           ,  X86::VCVTSI2SSrr              },
-  { X86::VCVTSI2SSZrr_Int       ,  X86::Int_VCVTSI2SSrr          },
-  { X86::VCVTSS2SDZrm           ,  X86::VCVTSS2SDrm              },
-  { X86::VCVTSS2SDZrr           ,  X86::VCVTSS2SDrr              },
-  { X86::VCVTSS2SI64Zrm         ,  X86::VCVTSS2SI64rm            },
-  { X86::VCVTSS2SI64Zrr         ,  X86::VCVTSS2SI64rr            },
-  { X86::VCVTSS2SIZrm           ,  X86::VCVTSS2SIrm              },
-  { X86::VCVTSS2SIZrr           ,  X86::VCVTSS2SIrr              },
-  { X86::VCVTTSD2SI64Zrm        ,  X86::VCVTTSD2SI64rm           },
-  { X86::VCVTTSD2SI64Zrm_Int    ,  X86::Int_VCVTTSD2SI64rm       },
-  { X86::VCVTTSD2SI64Zrr        ,  X86::VCVTTSD2SI64rr           },
-  { X86::VCVTTSD2SI64Zrr_Int    ,  X86::Int_VCVTTSD2SI64rr       },
-  { X86::VCVTTSD2SIZrm          ,  X86::VCVTTSD2SIrm             },
-  { X86::VCVTTSD2SIZrm_Int      ,  X86::Int_VCVTTSD2SIrm         },
-  { X86::VCVTTSD2SIZrr          ,  X86::VCVTTSD2SIrr             },
-  { X86::VCVTTSD2SIZrr_Int      ,  X86::Int_VCVTTSD2SIrr         },
-  { X86::VCVTTSS2SI64Zrm        ,  X86::VCVTTSS2SI64rm           },
-  { X86::VCVTTSS2SI64Zrm_Int    ,  X86::Int_VCVTTSS2SI64rm       },
-  { X86::VCVTTSS2SI64Zrr        ,  X86::VCVTTSS2SI64rr           },
-  { X86::VCVTTSS2SI64Zrr_Int    ,  X86::Int_VCVTTSS2SI64rr       },
-  { X86::VCVTTSS2SIZrm          ,  X86::VCVTTSS2SIrm             },
-  { X86::VCVTTSS2SIZrm_Int      ,  X86::Int_VCVTTSS2SIrm         },
-  { X86::VCVTTSS2SIZrr          ,  X86::VCVTTSS2SIrr             },
-  { X86::VCVTTSS2SIZrr_Int      ,  X86::Int_VCVTTSS2SIrr         },
-  { X86::VDIVSDZrm              ,  X86::VDIVSDrm                 },
-  { X86::VDIVSDZrm_Int          ,  X86::VDIVSDrm_Int             },
-  { X86::VDIVSDZrr              ,  X86::VDIVSDrr                 },
-  { X86::VDIVSDZrr_Int          ,  X86::VDIVSDrr_Int             },
-  { X86::VDIVSSZrm              ,  X86::VDIVSSrm                 },
-  { X86::VDIVSSZrm_Int          ,  X86::VDIVSSrm_Int             },
-  { X86::VDIVSSZrr              ,  X86::VDIVSSrr                 },
-  { X86::VDIVSSZrr_Int          ,  X86::VDIVSSrr_Int             },
-  { X86::VFMADD132SDZm          ,  X86::VFMADD132SDm             },
-  { X86::VFMADD132SDZm_Int      ,  X86::VFMADD132SDm_Int         },
-  { X86::VFMADD132SDZr          ,  X86::VFMADD132SDr             },
-  { X86::VFMADD132SDZr_Int      ,  X86::VFMADD132SDr_Int         },
-  { X86::VFMADD132SSZm          ,  X86::VFMADD132SSm             },
-  { X86::VFMADD132SSZm_Int      ,  X86::VFMADD132SSm_Int         },
-  { X86::VFMADD132SSZr          ,  X86::VFMADD132SSr             },
-  { X86::VFMADD132SSZr_Int      ,  X86::VFMADD132SSr_Int         },
-  { X86::VFMADD213SDZm          ,  X86::VFMADD213SDm             },
-  { X86::VFMADD213SDZm_Int      ,  X86::VFMADD213SDm_Int         },
-  { X86::VFMADD213SDZr          ,  X86::VFMADD213SDr             },
-  { X86::VFMADD213SDZr_Int      ,  X86::VFMADD213SDr_Int         },
-  { X86::VFMADD213SSZm          ,  X86::VFMADD213SSm             },
-  { X86::VFMADD213SSZm_Int      ,  X86::VFMADD213SSm_Int         },
-  { X86::VFMADD213SSZr          ,  X86::VFMADD213SSr             },
-  { X86::VFMADD213SSZr_Int      ,  X86::VFMADD213SSr_Int         },
-  { X86::VFMADD231SDZm          ,  X86::VFMADD231SDm             },
-  { X86::VFMADD231SDZm_Int      ,  X86::VFMADD231SDm_Int         },
-  { X86::VFMADD231SDZr          ,  X86::VFMADD231SDr             },
-  { X86::VFMADD231SDZr_Int      ,  X86::VFMADD231SDr_Int         },
-  { X86::VFMADD231SSZm          ,  X86::VFMADD231SSm             },
-  { X86::VFMADD231SSZm_Int      ,  X86::VFMADD231SSm_Int         },
-  { X86::VFMADD231SSZr          ,  X86::VFMADD231SSr             },
-  { X86::VFMADD231SSZr_Int      ,  X86::VFMADD231SSr_Int         },
-  { X86::VFMSUB132SDZm          ,  X86::VFMSUB132SDm             },
-  { X86::VFMSUB132SDZm_Int      ,  X86::VFMSUB132SDm_Int         },
-  { X86::VFMSUB132SDZr          ,  X86::VFMSUB132SDr             },
-  { X86::VFMSUB132SDZr_Int      ,  X86::VFMSUB132SDr_Int         },
-  { X86::VFMSUB132SSZm          ,  X86::VFMSUB132SSm             },
-  { X86::VFMSUB132SSZm_Int      ,  X86::VFMSUB132SSm_Int         },
-  { X86::VFMSUB132SSZr          ,  X86::VFMSUB132SSr             },
-  { X86::VFMSUB132SSZr_Int      ,  X86::VFMSUB132SSr_Int         },
-  { X86::VFMSUB213SDZm          ,  X86::VFMSUB213SDm             },
-  { X86::VFMSUB213SDZm_Int      ,  X86::VFMSUB213SDm_Int         },
-  { X86::VFMSUB213SDZr          ,  X86::VFMSUB213SDr             },
-  { X86::VFMSUB213SDZr_Int      ,  X86::VFMSUB213SDr_Int         },
-  { X86::VFMSUB213SSZm          ,  X86::VFMSUB213SSm             },
-  { X86::VFMSUB213SSZm_Int      ,  X86::VFMSUB213SSm_Int         },
-  { X86::VFMSUB213SSZr          ,  X86::VFMSUB213SSr             },
-  { X86::VFMSUB213SSZr_Int      ,  X86::VFMSUB213SSr_Int         },
-  { X86::VFMSUB231SDZm          ,  X86::VFMSUB231SDm             },
-  { X86::VFMSUB231SDZm_Int      ,  X86::VFMSUB231SDm_Int         },
-  { X86::VFMSUB231SDZr          ,  X86::VFMSUB231SDr             },
-  { X86::VFMSUB231SDZr_Int      ,  X86::VFMSUB231SDr_Int         },
-  { X86::VFMSUB231SSZm          ,  X86::VFMSUB231SSm             },
-  { X86::VFMSUB231SSZm_Int      ,  X86::VFMSUB231SSm_Int         },
-  { X86::VFMSUB231SSZr          ,  X86::VFMSUB231SSr             },
-  { X86::VFMSUB231SSZr_Int      ,  X86::VFMSUB231SSr_Int         },
-  { X86::VFNMADD132SDZm         ,  X86::VFNMADD132SDm            },
-  { X86::VFNMADD132SDZm_Int     ,  X86::VFNMADD132SDm_Int        },
-  { X86::VFNMADD132SDZr         ,  X86::VFNMADD132SDr            },
-  { X86::VFNMADD132SDZr_Int     ,  X86::VFNMADD132SDr_Int        },
-  { X86::VFNMADD132SSZm         ,  X86::VFNMADD132SSm            },
-  { X86::VFNMADD132SSZm_Int     ,  X86::VFNMADD132SSm_Int        },
-  { X86::VFNMADD132SSZr         ,  X86::VFNMADD132SSr            },
-  { X86::VFNMADD132SSZr_Int     ,  X86::VFNMADD132SSr_Int        },
-  { X86::VFNMADD213SDZm         ,  X86::VFNMADD213SDm            },
-  { X86::VFNMADD213SDZm_Int     ,  X86::VFNMADD213SDm_Int        },
-  { X86::VFNMADD213SDZr         ,  X86::VFNMADD213SDr            },
-  { X86::VFNMADD213SDZr_Int     ,  X86::VFNMADD213SDr_Int        },
-  { X86::VFNMADD213SSZm         ,  X86::VFNMADD213SSm            },
-  { X86::VFNMADD213SSZm_Int     ,  X86::VFNMADD213SSm_Int        },
-  { X86::VFNMADD213SSZr         ,  X86::VFNMADD213SSr            },
-  { X86::VFNMADD213SSZr_Int     ,  X86::VFNMADD213SSr_Int        },
-  { X86::VFNMADD231SDZm         ,  X86::VFNMADD231SDm            },
-  { X86::VFNMADD231SDZm_Int     ,  X86::VFNMADD231SDm_Int        },
-  { X86::VFNMADD231SDZr         ,  X86::VFNMADD231SDr            },
-  { X86::VFNMADD231SDZr_Int     ,  X86::VFNMADD231SDr_Int        },
-  { X86::VFNMADD231SSZm         ,  X86::VFNMADD231SSm            },
-  { X86::VFNMADD231SSZm_Int     ,  X86::VFNMADD231SSm_Int        },
-  { X86::VFNMADD231SSZr         ,  X86::VFNMADD231SSr            },
-  { X86::VFNMADD231SSZr_Int     ,  X86::VFNMADD231SSr_Int        },
-  { X86::VFNMSUB132SDZm         ,  X86::VFNMSUB132SDm            },
-  { X86::VFNMSUB132SDZm_Int     ,  X86::VFNMSUB132SDm_Int        },
-  { X86::VFNMSUB132SDZr         ,  X86::VFNMSUB132SDr            },
-  { X86::VFNMSUB132SDZr_Int     ,  X86::VFNMSUB132SDr_Int        },
-  { X86::VFNMSUB132SSZm         ,  X86::VFNMSUB132SSm            },
-  { X86::VFNMSUB132SSZm_Int     ,  X86::VFNMSUB132SSm_Int        },
-  { X86::VFNMSUB132SSZr         ,  X86::VFNMSUB132SSr            },
-  { X86::VFNMSUB132SSZr_Int     ,  X86::VFNMSUB132SSr_Int        },
-  { X86::VFNMSUB213SDZm         ,  X86::VFNMSUB213SDm            },
-  { X86::VFNMSUB213SDZm_Int     ,  X86::VFNMSUB213SDm_Int        },
-  { X86::VFNMSUB213SDZr         ,  X86::VFNMSUB213SDr            },
-  { X86::VFNMSUB213SDZr_Int     ,  X86::VFNMSUB213SDr_Int        },
-  { X86::VFNMSUB213SSZm         ,  X86::VFNMSUB213SSm            },
-  { X86::VFNMSUB213SSZm_Int     ,  X86::VFNMSUB213SSm_Int        },
-  { X86::VFNMSUB213SSZr         ,  X86::VFNMSUB213SSr            },
-  { X86::VFNMSUB213SSZr_Int     ,  X86::VFNMSUB213SSr_Int        },
-  { X86::VFNMSUB231SDZm         ,  X86::VFNMSUB231SDm            },
-  { X86::VFNMSUB231SDZm_Int     ,  X86::VFNMSUB231SDm_Int        },
-  { X86::VFNMSUB231SDZr         ,  X86::VFNMSUB231SDr            },
-  { X86::VFNMSUB231SDZr_Int     ,  X86::VFNMSUB231SDr_Int        },
-  { X86::VFNMSUB231SSZm         ,  X86::VFNMSUB231SSm            },
-  { X86::VFNMSUB231SSZm_Int     ,  X86::VFNMSUB231SSm_Int        },
-  { X86::VFNMSUB231SSZr         ,  X86::VFNMSUB231SSr            },
-  { X86::VFNMSUB231SSZr_Int     ,  X86::VFNMSUB231SSr_Int        },
-  { X86::VMAXCSDZrm             ,  X86::VMAXCSDrm                },
-  { X86::VMAXCSDZrr             ,  X86::VMAXCSDrr                },
-  { X86::VMAXCSSZrm             ,  X86::VMAXCSSrm                },
-  { X86::VMAXCSSZrr             ,  X86::VMAXCSSrr                },
-  { X86::VMAXSDZrm              ,  X86::VMAXSDrm                 },
-  { X86::VMAXSDZrm_Int          ,  X86::VMAXSDrm_Int             },
-  { X86::VMAXSDZrr              ,  X86::VMAXSDrr                 },
-  { X86::VMAXSDZrr_Int          ,  X86::VMAXSDrr_Int             },
-  { X86::VMAXSSZrm              ,  X86::VMAXSSrm                 },
-  { X86::VMAXSSZrm_Int          ,  X86::VMAXSSrm_Int             },
-  { X86::VMAXSSZrr              ,  X86::VMAXSSrr                 },
-  { X86::VMAXSSZrr_Int          ,  X86::VMAXSSrr_Int             },
-  { X86::VMINCSDZrm             ,  X86::VMINCSDrm                },
-  { X86::VMINCSDZrr             ,  X86::VMINCSDrr                },
-  { X86::VMINCSSZrm             ,  X86::VMINCSSrm                },
-  { X86::VMINCSSZrr             ,  X86::VMINCSSrr                },
-  { X86::VMINSDZrm              ,  X86::VMINSDrm                 },
-  { X86::VMINSDZrm_Int          ,  X86::VMINSDrm_Int             },
-  { X86::VMINSDZrr              ,  X86::VMINSDrr                 },
-  { X86::VMINSDZrr_Int          ,  X86::VMINSDrr_Int             },
-  { X86::VMINSSZrm              ,  X86::VMINSSrm                 },
-  { X86::VMINSSZrm_Int          ,  X86::VMINSSrm_Int             },
-  { X86::VMINSSZrr              ,  X86::VMINSSrr                 },
-  { X86::VMINSSZrr_Int          ,  X86::VMINSSrr_Int             },
-  { X86::VMOV64toSDZrr          ,  X86::VMOV64toSDrr             },
-  { X86::VMOVDI2SSZrm           ,  X86::VMOVDI2SSrm              },
-  { X86::VMOVDI2SSZrr           ,  X86::VMOVDI2SSrr              },
-  { X86::VMOVSDZmr              ,  X86::VMOVSDmr                 },
-  { X86::VMOVSDZrm              ,  X86::VMOVSDrm                 },
-  { X86::VMOVSDZrr              ,  X86::VMOVSDrr                 },
-  { X86::VMOVSSZmr              ,  X86::VMOVSSmr                 },
-  { X86::VMOVSSZrm              ,  X86::VMOVSSrm                 },
-  { X86::VMOVSSZrr              ,  X86::VMOVSSrr                 },
-  { X86::VMOVSSZrr_REV          ,  X86::VMOVSSrr_REV             },
-  { X86::VMULSDZrm              ,  X86::VMULSDrm                 },
-  { X86::VMULSDZrm_Int          ,  X86::VMULSDrm_Int             },
-  { X86::VMULSDZrr              ,  X86::VMULSDrr                 },
-  { X86::VMULSDZrr_Int          ,  X86::VMULSDrr_Int             },
-  { X86::VMULSSZrm              ,  X86::VMULSSrm                 },
-  { X86::VMULSSZrm_Int          ,  X86::VMULSSrm_Int             },
-  { X86::VMULSSZrr              ,  X86::VMULSSrr                 },
-  { X86::VMULSSZrr_Int          ,  X86::VMULSSrr_Int             },
-  { X86::VSQRTSDZm              ,  X86::VSQRTSDm                 },
-  { X86::VSQRTSDZm_Int          ,  X86::VSQRTSDm_Int             },
-  { X86::VSQRTSDZr              ,  X86::VSQRTSDr                 },
-  { X86::VSQRTSDZr_Int          ,  X86::VSQRTSDr_Int             },
-  { X86::VSQRTSSZm              ,  X86::VSQRTSSm                 },
-  { X86::VSQRTSSZm_Int          ,  X86::VSQRTSSm_Int             },
-  { X86::VSQRTSSZr              ,  X86::VSQRTSSr                 },
-  { X86::VSQRTSSZr_Int          ,  X86::VSQRTSSr_Int             },
-  { X86::VSUBSDZrm              ,  X86::VSUBSDrm                 },
-  { X86::VSUBSDZrm_Int          ,  X86::VSUBSDrm_Int             },
-  { X86::VSUBSDZrr              ,  X86::VSUBSDrr                 },
-  { X86::VSUBSDZrr_Int          ,  X86::VSUBSDrr_Int             },
-  { X86::VSUBSSZrm              ,  X86::VSUBSSrm                 },
-  { X86::VSUBSSZrm_Int          ,  X86::VSUBSSrm_Int             },
-  { X86::VSUBSSZrr              ,  X86::VSUBSSrr                 },
-  { X86::VSUBSSZrr_Int          ,  X86::VSUBSSrr_Int             },
-  { X86::VUCOMISDZrm            ,  X86::VUCOMISDrm               },
-  { X86::VUCOMISDZrr            ,  X86::VUCOMISDrr               },
-  { X86::VUCOMISSZrm            ,  X86::VUCOMISSrm               },
-  { X86::VUCOMISSZrr            ,  X86::VUCOMISSrr               },
-
-  { X86::VMOV64toPQIZrr         ,   X86::VMOV64toPQIrr           },
-  { X86::VMOV64toSDZrr          ,   X86::VMOV64toSDrr            },
-  { X86::VMOVDI2PDIZrm          ,   X86::VMOVDI2PDIrm            },
-  { X86::VMOVDI2PDIZrr          ,   X86::VMOVDI2PDIrr            },
-  { X86::VMOVLHPSZrr            ,   X86::VMOVLHPSrr              },
-  { X86::VMOVHLPSZrr            ,   X86::VMOVHLPSrr              },
-  { X86::VMOVPDI2DIZmr          ,   X86::VMOVPDI2DImr            },
-  { X86::VMOVPDI2DIZrr          ,   X86::VMOVPDI2DIrr            },
-  { X86::VMOVPQI2QIZmr          ,   X86::VMOVPQI2QImr            },
-  { X86::VMOVPQIto64Zrr         ,   X86::VMOVPQIto64rr           },
-  { X86::VMOVQI2PQIZrm          ,   X86::VMOVQI2PQIrm            },
-  { X86::VMOVZPQILo2PQIZrr      ,   X86::VMOVZPQILo2PQIrr        },
-
-  { X86::VPEXTRBZmr             ,   X86::VPEXTRBmr               },
-  { X86::VPEXTRBZrr             ,   X86::VPEXTRBrr               },
-  { X86::VPEXTRDZmr             ,   X86::VPEXTRDmr               },
-  { X86::VPEXTRDZrr             ,   X86::VPEXTRDrr               },
-  { X86::VPEXTRQZmr             ,   X86::VPEXTRQmr               },
-  { X86::VPEXTRQZrr             ,   X86::VPEXTRQrr               },
-  { X86::VPEXTRWZmr             ,   X86::VPEXTRWmr               },
-  { X86::VPEXTRWZrr             ,   X86::VPEXTRWri               },
-
-  { X86::VPINSRBZrm             ,   X86::VPINSRBrm               },
-  { X86::VPINSRBZrr             ,   X86::VPINSRBrr               },
-  { X86::VPINSRDZrm             ,   X86::VPINSRDrm               },
-  { X86::VPINSRDZrr             ,   X86::VPINSRDrr               },
-  { X86::VPINSRQZrm             ,   X86::VPINSRQrm               },
-  { X86::VPINSRQZrr             ,   X86::VPINSRQrr               },
-  { X86::VPINSRWZrm             ,   X86::VPINSRWrmi              },
-  { X86::VPINSRWZrr             ,   X86::VPINSRWrri              },
-
-  // EVEX 128 with corresponding VEX.
-  { X86::VADDPDZ128rm           ,    X86::VADDPDrm               },
-  { X86::VADDPDZ128rr           ,    X86::VADDPDrr               },
-  { X86::VADDPSZ128rm           ,    X86::VADDPSrm               },
-  { X86::VADDPSZ128rr           ,    X86::VADDPSrr               },
-  { X86::VANDNPDZ128rm          ,    X86::VANDNPDrm              },
-  { X86::VANDNPDZ128rr          ,    X86::VANDNPDrr              },
-  { X86::VANDNPSZ128rm          ,    X86::VANDNPSrm              },
-  { X86::VANDNPSZ128rr          ,    X86::VANDNPSrr              },
-  { X86::VANDPDZ128rm           ,    X86::VANDPDrm               },
-  { X86::VANDPDZ128rr           ,    X86::VANDPDrr               },
-  { X86::VANDPSZ128rm           ,    X86::VANDPSrm               },
-  { X86::VANDPSZ128rr           ,    X86::VANDPSrr               },
-  { X86::VBROADCASTSSZ128m      ,    X86::VBROADCASTSSrm         },
-  { X86::VBROADCASTSSZ128r      ,    X86::VBROADCASTSSrr         },
-  { X86::VCVTDQ2PDZ128rm        ,    X86::VCVTDQ2PDrm            },
-  { X86::VCVTDQ2PDZ128rr        ,    X86::VCVTDQ2PDrr            },
-  { X86::VCVTDQ2PSZ128rm        ,    X86::VCVTDQ2PSrm            },
-  { X86::VCVTDQ2PSZ128rr        ,    X86::VCVTDQ2PSrr            },
-  { X86::VCVTPD2DQZ128rm        ,    X86::VCVTPD2DQrm            },
-  { X86::VCVTPD2DQZ128rr        ,    X86::VCVTPD2DQrr            },
-  { X86::VCVTPD2PSZ128rm        ,    X86::VCVTPD2PSrm            },
-  { X86::VCVTPD2PSZ128rr        ,    X86::VCVTPD2PSrr            },
-  { X86::VCVTPH2PSZ128rm        ,    X86::VCVTPH2PSrm            },
-  { X86::VCVTPH2PSZ128rr        ,    X86::VCVTPH2PSrr            },
-  { X86::VCVTPS2DQZ128rm        ,    X86::VCVTPS2DQrm            },
-  { X86::VCVTPS2DQZ128rr        ,    X86::VCVTPS2DQrr            },
-  { X86::VCVTPS2PDZ128rm        ,    X86::VCVTPS2PDrm            },
-  { X86::VCVTPS2PDZ128rr        ,    X86::VCVTPS2PDrr            },
-  { X86::VCVTPS2PHZ128mr        ,    X86::VCVTPS2PHmr            },
-  { X86::VCVTPS2PHZ128rr        ,    X86::VCVTPS2PHrr            },
-  { X86::VCVTTPD2DQZ128rm       ,    X86::VCVTTPD2DQrm           },
-  { X86::VCVTTPD2DQZ128rr       ,    X86::VCVTTPD2DQrr           },
-  { X86::VCVTTPS2DQZ128rm       ,    X86::VCVTTPS2DQrm           },
-  { X86::VCVTTPS2DQZ128rr       ,    X86::VCVTTPS2DQrr           },
-  { X86::VDIVPDZ128rm           ,    X86::VDIVPDrm               },
-  { X86::VDIVPDZ128rr           ,    X86::VDIVPDrr               },
-  { X86::VDIVPSZ128rm           ,    X86::VDIVPSrm               },
-  { X86::VDIVPSZ128rr           ,    X86::VDIVPSrr               },
-  { X86::VFMADD132PDZ128m       ,    X86::VFMADD132PDm           },
-  { X86::VFMADD132PDZ128r       ,    X86::VFMADD132PDr           },
-  { X86::VFMADD132PSZ128m       ,    X86::VFMADD132PSm           },
-  { X86::VFMADD132PSZ128r       ,    X86::VFMADD132PSr           },
-  { X86::VFMADD213PDZ128m       ,    X86::VFMADD213PDm           },
-  { X86::VFMADD213PDZ128r       ,    X86::VFMADD213PDr           },
-  { X86::VFMADD213PSZ128m       ,    X86::VFMADD213PSm           },
-  { X86::VFMADD213PSZ128r       ,    X86::VFMADD213PSr           },
-  { X86::VFMADD231PDZ128m       ,    X86::VFMADD231PDm           },
-  { X86::VFMADD231PDZ128r       ,    X86::VFMADD231PDr           },
-  { X86::VFMADD231PSZ128m       ,    X86::VFMADD231PSm           },
-  { X86::VFMADD231PSZ128r       ,    X86::VFMADD231PSr           },
-  { X86::VFMADDSUB132PDZ128m    ,    X86::VFMADDSUB132PDm        },
-  { X86::VFMADDSUB132PDZ128r    ,    X86::VFMADDSUB132PDr        },
-  { X86::VFMADDSUB132PSZ128m    ,    X86::VFMADDSUB132PSm        },
-  { X86::VFMADDSUB132PSZ128r    ,    X86::VFMADDSUB132PSr        },
-  { X86::VFMADDSUB213PDZ128m    ,    X86::VFMADDSUB213PDm        },
-  { X86::VFMADDSUB213PDZ128r    ,    X86::VFMADDSUB213PDr        },
-  { X86::VFMADDSUB213PSZ128m    ,    X86::VFMADDSUB213PSm        },
-  { X86::VFMADDSUB213PSZ128r    ,    X86::VFMADDSUB213PSr        },
-  { X86::VFMADDSUB231PDZ128m    ,    X86::VFMADDSUB231PDm        },
-  { X86::VFMADDSUB231PDZ128r    ,    X86::VFMADDSUB231PDr        },
-  { X86::VFMADDSUB231PSZ128m    ,    X86::VFMADDSUB231PSm        },
-  { X86::VFMADDSUB231PSZ128r    ,    X86::VFMADDSUB231PSr        },
-  { X86::VFMSUB132PDZ128m       ,    X86::VFMSUB132PDm           },
-  { X86::VFMSUB132PDZ128r       ,    X86::VFMSUB132PDr           },
-  { X86::VFMSUB132PSZ128m       ,    X86::VFMSUB132PSm           },
-  { X86::VFMSUB132PSZ128r       ,    X86::VFMSUB132PSr           },
-  { X86::VFMSUB213PDZ128m       ,    X86::VFMSUB213PDm           },
-  { X86::VFMSUB213PDZ128r       ,    X86::VFMSUB213PDr           },
-  { X86::VFMSUB213PSZ128m       ,    X86::VFMSUB213PSm           },
-  { X86::VFMSUB213PSZ128r       ,    X86::VFMSUB213PSr           },
-  { X86::VFMSUB231PDZ128m       ,    X86::VFMSUB231PDm           },
-  { X86::VFMSUB231PDZ128r       ,    X86::VFMSUB231PDr           },
-  { X86::VFMSUB231PSZ128m       ,    X86::VFMSUB231PSm           },
-  { X86::VFMSUB231PSZ128r       ,    X86::VFMSUB231PSr           },
-  { X86::VFMSUBADD132PDZ128m    ,    X86::VFMSUBADD132PDm        },
-  { X86::VFMSUBADD132PDZ128r    ,    X86::VFMSUBADD132PDr        },
-  { X86::VFMSUBADD132PSZ128m    ,    X86::VFMSUBADD132PSm        },
-  { X86::VFMSUBADD132PSZ128r    ,    X86::VFMSUBADD132PSr        },
-  { X86::VFMSUBADD213PDZ128m    ,    X86::VFMSUBADD213PDm        },
-  { X86::VFMSUBADD213PDZ128r    ,    X86::VFMSUBADD213PDr        },
-  { X86::VFMSUBADD213PSZ128m    ,    X86::VFMSUBADD213PSm        },
-  { X86::VFMSUBADD213PSZ128r    ,    X86::VFMSUBADD213PSr        },
-  { X86::VFMSUBADD231PDZ128m    ,    X86::VFMSUBADD231PDm        },
-  { X86::VFMSUBADD231PDZ128r    ,    X86::VFMSUBADD231PDr        },
-  { X86::VFMSUBADD231PSZ128m    ,    X86::VFMSUBADD231PSm        },
-  { X86::VFMSUBADD231PSZ128r    ,    X86::VFMSUBADD231PSr        },
-  { X86::VFNMADD132PDZ128m      ,    X86::VFNMADD132PDm          },
-  { X86::VFNMADD132PDZ128r      ,    X86::VFNMADD132PDr          },
-  { X86::VFNMADD132PSZ128m      ,    X86::VFNMADD132PSm          },
-  { X86::VFNMADD132PSZ128r      ,    X86::VFNMADD132PSr          },
-  { X86::VFNMADD213PDZ128m      ,    X86::VFNMADD213PDm          },
-  { X86::VFNMADD213PDZ128r      ,    X86::VFNMADD213PDr          },
-  { X86::VFNMADD213PSZ128m      ,    X86::VFNMADD213PSm          },
-  { X86::VFNMADD213PSZ128r      ,    X86::VFNMADD213PSr          },
-  { X86::VFNMADD231PDZ128m      ,    X86::VFNMADD231PDm          },
-  { X86::VFNMADD231PDZ128r      ,    X86::VFNMADD231PDr          },
-  { X86::VFNMADD231PSZ128m      ,    X86::VFNMADD231PSm          },
-  { X86::VFNMADD231PSZ128r      ,    X86::VFNMADD231PSr          },
-  { X86::VFNMSUB132PDZ128m      ,    X86::VFNMSUB132PDm          },
-  { X86::VFNMSUB132PDZ128r      ,    X86::VFNMSUB132PDr          },
-  { X86::VFNMSUB132PSZ128m      ,    X86::VFNMSUB132PSm          },
-  { X86::VFNMSUB132PSZ128r      ,    X86::VFNMSUB132PSr          },
-  { X86::VFNMSUB213PDZ128m      ,    X86::VFNMSUB213PDm          },
-  { X86::VFNMSUB213PDZ128r      ,    X86::VFNMSUB213PDr          },
-  { X86::VFNMSUB213PSZ128m      ,    X86::VFNMSUB213PSm          },
-  { X86::VFNMSUB213PSZ128r      ,    X86::VFNMSUB213PSr          },
-  { X86::VFNMSUB231PDZ128m      ,    X86::VFNMSUB231PDm          },
-  { X86::VFNMSUB231PDZ128r      ,    X86::VFNMSUB231PDr          },
-  { X86::VFNMSUB231PSZ128m      ,    X86::VFNMSUB231PSm          },
-  { X86::VFNMSUB231PSZ128r      ,    X86::VFNMSUB231PSr          },
-  { X86::VMAXCPDZ128rm          ,    X86::VMAXCPDrm              },
-  { X86::VMAXCPDZ128rr          ,    X86::VMAXCPDrr              },
-  { X86::VMAXCPSZ128rm          ,    X86::VMAXCPSrm              },
-  { X86::VMAXCPSZ128rr          ,    X86::VMAXCPSrr              },
-  { X86::VMAXPDZ128rm           ,    X86::VMAXPDrm               },
-  { X86::VMAXPDZ128rr           ,    X86::VMAXPDrr               },
-  { X86::VMAXPSZ128rm           ,    X86::VMAXPSrm               },
-  { X86::VMAXPSZ128rr           ,    X86::VMAXPSrr               },
-  { X86::VMINCPDZ128rm          ,    X86::VMINCPDrm              },
-  { X86::VMINCPDZ128rr          ,    X86::VMINCPDrr              },
-  { X86::VMINCPSZ128rm          ,    X86::VMINCPSrm              },
-  { X86::VMINCPSZ128rr          ,    X86::VMINCPSrr              },
-  { X86::VMINPDZ128rm           ,    X86::VMINPDrm               },
-  { X86::VMINPDZ128rr           ,    X86::VMINPDrr               },
-  { X86::VMINPSZ128rm           ,    X86::VMINPSrm               },
-  { X86::VMINPSZ128rr           ,    X86::VMINPSrr               },
-  { X86::VMOVAPDZ128mr          ,    X86::VMOVAPDmr              },
-  { X86::VMOVAPDZ128rm          ,    X86::VMOVAPDrm              },
-  { X86::VMOVAPDZ128rr          ,    X86::VMOVAPDrr              },
-  { X86::VMOVAPDZ128rr_REV      ,    X86::VMOVAPDrr_REV          },
-  { X86::VMOVAPSZ128mr          ,    X86::VMOVAPSmr              },
-  { X86::VMOVAPSZ128rm          ,    X86::VMOVAPSrm              },
-  { X86::VMOVAPSZ128rr          ,    X86::VMOVAPSrr              },
-  { X86::VMOVAPSZ128rr_REV      ,    X86::VMOVAPSrr_REV          },
-  { X86::VMOVDDUPZ128rm         ,    X86::VMOVDDUPrm             },
-  { X86::VMOVDDUPZ128rr         ,    X86::VMOVDDUPrr             },
-  { X86::VMOVDQA32Z128mr        ,    X86::VMOVDQAmr              },
-  { X86::VMOVDQA32Z128rm        ,    X86::VMOVDQArm              },
-  { X86::VMOVDQA32Z128rr        ,    X86::VMOVDQArr              },
-  { X86::VMOVDQA32Z128rr_REV    ,    X86::VMOVDQArr_REV          },
-  { X86::VMOVDQA64Z128mr        ,    X86::VMOVDQAmr              },
-  { X86::VMOVDQA64Z128rm        ,    X86::VMOVDQArm              },
-  { X86::VMOVDQA64Z128rr        ,    X86::VMOVDQArr              },
-  { X86::VMOVDQA64Z128rr_REV    ,    X86::VMOVDQArr_REV          },
-  { X86::VMOVDQU16Z128mr        ,    X86::VMOVDQUmr              },
-  { X86::VMOVDQU16Z128rm        ,    X86::VMOVDQUrm              },
-  { X86::VMOVDQU16Z128rr        ,    X86::VMOVDQUrr              },
-  { X86::VMOVDQU16Z128rr_REV    ,    X86::VMOVDQUrr_REV          },
-  { X86::VMOVDQU32Z128mr        ,    X86::VMOVDQUmr              },
-  { X86::VMOVDQU32Z128rm        ,    X86::VMOVDQUrm              },
-  { X86::VMOVDQU32Z128rr        ,    X86::VMOVDQUrr              },
-  { X86::VMOVDQU32Z128rr_REV    ,    X86::VMOVDQUrr_REV          },
-  { X86::VMOVDQU64Z128mr        ,    X86::VMOVDQUmr              },
-  { X86::VMOVDQU64Z128rm        ,    X86::VMOVDQUrm              },
-  { X86::VMOVDQU64Z128rr        ,    X86::VMOVDQUrr              },
-  { X86::VMOVDQU64Z128rr_REV    ,    X86::VMOVDQUrr_REV          },
-  { X86::VMOVDQU8Z128mr         ,    X86::VMOVDQUmr              },
-  { X86::VMOVDQU8Z128rm         ,    X86::VMOVDQUrm              },
-  { X86::VMOVDQU8Z128rr         ,    X86::VMOVDQUrr              },
-  { X86::VMOVDQU8Z128rr_REV     ,    X86::VMOVDQUrr_REV          },
-  { X86::VMOVHPDZ128mr          ,    X86::VMOVHPDmr              },
-  { X86::VMOVHPDZ128rm          ,    X86::VMOVHPDrm              },
-  { X86::VMOVHPSZ128mr          ,    X86::VMOVHPSmr              },
-  { X86::VMOVHPSZ128rm          ,    X86::VMOVHPSrm              },
-  { X86::VMOVLPDZ128mr          ,    X86::VMOVLPDmr              },
-  { X86::VMOVLPDZ128rm          ,    X86::VMOVLPDrm              },
-  { X86::VMOVLPSZ128mr          ,    X86::VMOVLPSmr              },
-  { X86::VMOVLPSZ128rm          ,    X86::VMOVLPSrm              },
-  { X86::VMOVNTDQAZ128rm        ,    X86::VMOVNTDQArm            },
-  { X86::VMOVNTDQZ128mr         ,    X86::VMOVNTDQmr             },
-  { X86::VMOVNTPDZ128mr         ,    X86::VMOVNTPDmr             },
-  { X86::VMOVNTPSZ128mr         ,    X86::VMOVNTPSmr             },
-  { X86::VMOVSHDUPZ128rm        ,    X86::VMOVSHDUPrm            },
-  { X86::VMOVSHDUPZ128rr        ,    X86::VMOVSHDUPrr            },
-  { X86::VMOVSLDUPZ128rm        ,    X86::VMOVSLDUPrm            },
-  { X86::VMOVSLDUPZ128rr        ,    X86::VMOVSLDUPrr            },
-  { X86::VMOVUPDZ128mr          ,    X86::VMOVUPDmr              },
-  { X86::VMOVUPDZ128rm          ,    X86::VMOVUPDrm              },
-  { X86::VMOVUPDZ128rr          ,    X86::VMOVUPDrr              },
-  { X86::VMOVUPDZ128rr_REV      ,    X86::VMOVUPDrr_REV          },
-  { X86::VMOVUPSZ128mr          ,    X86::VMOVUPSmr              },
-  { X86::VMOVUPSZ128rm          ,    X86::VMOVUPSrm              },
-  { X86::VMOVUPSZ128rr          ,    X86::VMOVUPSrr              },
-  { X86::VMOVUPSZ128rr_REV      ,    X86::VMOVUPSrr_REV          },
-  { X86::VMULPDZ128rm           ,    X86::VMULPDrm               },
-  { X86::VMULPDZ128rr           ,    X86::VMULPDrr               },
-  { X86::VMULPSZ128rm           ,    X86::VMULPSrm               },
-  { X86::VMULPSZ128rr           ,    X86::VMULPSrr               },
-  { X86::VORPDZ128rm            ,    X86::VORPDrm                },
-  { X86::VORPDZ128rr            ,    X86::VORPDrr                },
-  { X86::VORPSZ128rm            ,    X86::VORPSrm                },
-  { X86::VORPSZ128rr            ,    X86::VORPSrr                },
-  { X86::VPABSBZ128rm           ,    X86::VPABSBrm               },
-  { X86::VPABSBZ128rr           ,    X86::VPABSBrr               },
-  { X86::VPABSDZ128rm           ,    X86::VPABSDrm               },
-  { X86::VPABSDZ128rr           ,    X86::VPABSDrr               },
-  { X86::VPABSWZ128rm           ,    X86::VPABSWrm               },
-  { X86::VPABSWZ128rr           ,    X86::VPABSWrr               },
-  { X86::VPACKSSDWZ128rm        ,    X86::VPACKSSDWrm            },
-  { X86::VPACKSSDWZ128rr        ,    X86::VPACKSSDWrr            },
-  { X86::VPACKSSWBZ128rm        ,    X86::VPACKSSWBrm            },
-  { X86::VPACKSSWBZ128rr        ,    X86::VPACKSSWBrr            },
-  { X86::VPACKUSDWZ128rm        ,    X86::VPACKUSDWrm            },
-  { X86::VPACKUSDWZ128rr        ,    X86::VPACKUSDWrr            },
-  { X86::VPACKUSWBZ128rm        ,    X86::VPACKUSWBrm            },
-  { X86::VPACKUSWBZ128rr        ,    X86::VPACKUSWBrr            },
-  { X86::VPADDBZ128rm           ,    X86::VPADDBrm               },
-  { X86::VPADDBZ128rr           ,    X86::VPADDBrr               },
-  { X86::VPADDDZ128rm           ,    X86::VPADDDrm               },
-  { X86::VPADDDZ128rr           ,    X86::VPADDDrr               },
-  { X86::VPADDQZ128rm           ,    X86::VPADDQrm               },
-  { X86::VPADDQZ128rr           ,    X86::VPADDQrr               },
-  { X86::VPADDSBZ128rm          ,    X86::VPADDSBrm              },
-  { X86::VPADDSBZ128rr          ,    X86::VPADDSBrr              },
-  { X86::VPADDSWZ128rm          ,    X86::VPADDSWrm              },
-  { X86::VPADDSWZ128rr          ,    X86::VPADDSWrr              },
-  { X86::VPADDUSBZ128rm         ,    X86::VPADDUSBrm             },
-  { X86::VPADDUSBZ128rr         ,    X86::VPADDUSBrr             },
-  { X86::VPADDUSWZ128rm         ,    X86::VPADDUSWrm             },
-  { X86::VPADDUSWZ128rr         ,    X86::VPADDUSWrr             },
-  { X86::VPADDWZ128rm           ,    X86::VPADDWrm               },
-  { X86::VPADDWZ128rr           ,    X86::VPADDWrr               },
-  { X86::VPALIGNRZ128rmi        ,    X86::VPALIGNRrmi            },
-  { X86::VPALIGNRZ128rri        ,    X86::VPALIGNRrri            },
-  { X86::VPANDDZ128rm           ,    X86::VPANDrm                },
-  { X86::VPANDDZ128rr           ,    X86::VPANDrr                },
-  { X86::VPANDQZ128rm           ,    X86::VPANDrm                },
-  { X86::VPANDQZ128rr           ,    X86::VPANDrr                },
-  { X86::VPAVGBZ128rm           ,    X86::VPAVGBrm               },
-  { X86::VPAVGBZ128rr           ,    X86::VPAVGBrr               },
-  { X86::VPAVGWZ128rm           ,    X86::VPAVGWrm               },
-  { X86::VPAVGWZ128rr           ,    X86::VPAVGWrr               },
-  { X86::VPBROADCASTBZ128m      ,    X86::VPBROADCASTBrm         },
-  { X86::VPBROADCASTBZ128r      ,    X86::VPBROADCASTBrr         },
-  { X86::VPBROADCASTDZ128m      ,    X86::VPBROADCASTDrm         },
-  { X86::VPBROADCASTDZ128r      ,    X86::VPBROADCASTDrr         },
-  { X86::VPBROADCASTQZ128m      ,    X86::VPBROADCASTQrm         },
-  { X86::VPBROADCASTQZ128r      ,    X86::VPBROADCASTQrr         },
-  { X86::VPBROADCASTWZ128m      ,    X86::VPBROADCASTWrm         },
-  { X86::VPBROADCASTWZ128r      ,    X86::VPBROADCASTWrr         },
-  { X86::VPERMILPDZ128mi        ,    X86::VPERMILPDmi            },
-  { X86::VPERMILPDZ128ri        ,    X86::VPERMILPDri            },
-  { X86::VPERMILPDZ128rm        ,    X86::VPERMILPDrm            },
-  { X86::VPERMILPDZ128rr        ,    X86::VPERMILPDrr            },
-  { X86::VPERMILPSZ128mi        ,    X86::VPERMILPSmi            },
-  { X86::VPERMILPSZ128ri        ,    X86::VPERMILPSri            },
-  { X86::VPERMILPSZ128rm        ,    X86::VPERMILPSrm            },
-  { X86::VPERMILPSZ128rr        ,    X86::VPERMILPSrr            },
-  { X86::VPMADDUBSWZ128rm       ,    X86::VPMADDUBSWrm           },
-  { X86::VPMADDUBSWZ128rr       ,    X86::VPMADDUBSWrr           },
-  { X86::VPMADDWDZ128rm         ,    X86::VPMADDWDrm             },
-  { X86::VPMADDWDZ128rr         ,    X86::VPMADDWDrr             },
-  { X86::VPMAXSBZ128rm          ,    X86::VPMAXSBrm              },
-  { X86::VPMAXSBZ128rr          ,    X86::VPMAXSBrr              },
-  { X86::VPMAXSDZ128rm          ,    X86::VPMAXSDrm              },
-  { X86::VPMAXSDZ128rr          ,    X86::VPMAXSDrr              },
-  { X86::VPMAXSWZ128rm          ,    X86::VPMAXSWrm              },
-  { X86::VPMAXSWZ128rr          ,    X86::VPMAXSWrr              },
-  { X86::VPMAXUBZ128rm          ,    X86::VPMAXUBrm              },
-  { X86::VPMAXUBZ128rr          ,    X86::VPMAXUBrr              },
-  { X86::VPMAXUDZ128rm          ,    X86::VPMAXUDrm              },
-  { X86::VPMAXUDZ128rr          ,    X86::VPMAXUDrr              },
-  { X86::VPMAXUWZ128rm          ,    X86::VPMAXUWrm              },
-  { X86::VPMAXUWZ128rr          ,    X86::VPMAXUWrr              },
-  { X86::VPMINSBZ128rm          ,    X86::VPMINSBrm              },
-  { X86::VPMINSBZ128rr          ,    X86::VPMINSBrr              },
-  { X86::VPMINSDZ128rm          ,    X86::VPMINSDrm              },
-  { X86::VPMINSDZ128rr          ,    X86::VPMINSDrr              },
-  { X86::VPMINSWZ128rm          ,    X86::VPMINSWrm              },
-  { X86::VPMINSWZ128rr          ,    X86::VPMINSWrr              },
-  { X86::VPMINUBZ128rm          ,    X86::VPMINUBrm              },
-  { X86::VPMINUBZ128rr          ,    X86::VPMINUBrr              },
-  { X86::VPMINUDZ128rm          ,    X86::VPMINUDrm              },
-  { X86::VPMINUDZ128rr          ,    X86::VPMINUDrr              },
-  { X86::VPMINUWZ128rm          ,    X86::VPMINUWrm              },
-  { X86::VPMINUWZ128rr          ,    X86::VPMINUWrr              },
-  { X86::VPMOVSXBDZ128rm        ,    X86::VPMOVSXBDrm            },
-  { X86::VPMOVSXBDZ128rr        ,    X86::VPMOVSXBDrr            },
-  { X86::VPMOVSXBQZ128rm        ,    X86::VPMOVSXBQrm            },
-  { X86::VPMOVSXBQZ128rr        ,    X86::VPMOVSXBQrr            },
-  { X86::VPMOVSXBWZ128rm        ,    X86::VPMOVSXBWrm            },
-  { X86::VPMOVSXBWZ128rr        ,    X86::VPMOVSXBWrr            },
-  { X86::VPMOVSXDQZ128rm        ,    X86::VPMOVSXDQrm            },
-  { X86::VPMOVSXDQZ128rr        ,    X86::VPMOVSXDQrr            },
-  { X86::VPMOVSXWDZ128rm        ,    X86::VPMOVSXWDrm            },
-  { X86::VPMOVSXWDZ128rr        ,    X86::VPMOVSXWDrr            },
-  { X86::VPMOVSXWQZ128rm        ,    X86::VPMOVSXWQrm            },
-  { X86::VPMOVSXWQZ128rr        ,    X86::VPMOVSXWQrr            },
-  { X86::VPMOVZXBDZ128rm        ,    X86::VPMOVZXBDrm            },
-  { X86::VPMOVZXBDZ128rr        ,    X86::VPMOVZXBDrr            },
-  { X86::VPMOVZXBQZ128rm        ,    X86::VPMOVZXBQrm            },
-  { X86::VPMOVZXBQZ128rr        ,    X86::VPMOVZXBQrr            },
-  { X86::VPMOVZXBWZ128rm        ,    X86::VPMOVZXBWrm            },
-  { X86::VPMOVZXBWZ128rr        ,    X86::VPMOVZXBWrr            },
-  { X86::VPMOVZXDQZ128rm        ,    X86::VPMOVZXDQrm            },
-  { X86::VPMOVZXDQZ128rr        ,    X86::VPMOVZXDQrr            },
-  { X86::VPMOVZXWDZ128rm        ,    X86::VPMOVZXWDrm            },
-  { X86::VPMOVZXWDZ128rr        ,    X86::VPMOVZXWDrr            },
-  { X86::VPMOVZXWQZ128rm        ,    X86::VPMOVZXWQrm            },
-  { X86::VPMOVZXWQZ128rr        ,    X86::VPMOVZXWQrr            },
-  { X86::VPMULDQZ128rm          ,    X86::VPMULDQrm              },
-  { X86::VPMULDQZ128rr          ,    X86::VPMULDQrr              },
-  { X86::VPMULHRSWZ128rm        ,    X86::VPMULHRSWrm            },
-  { X86::VPMULHRSWZ128rr        ,    X86::VPMULHRSWrr            },
-  { X86::VPMULHUWZ128rm         ,    X86::VPMULHUWrm             },
-  { X86::VPMULHUWZ128rr         ,    X86::VPMULHUWrr             },
-  { X86::VPMULHWZ128rm          ,    X86::VPMULHWrm              },
-  { X86::VPMULHWZ128rr          ,    X86::VPMULHWrr              },
-  { X86::VPMULLDZ128rm          ,    X86::VPMULLDrm              },
-  { X86::VPMULLDZ128rr          ,    X86::VPMULLDrr              },
-  { X86::VPMULLWZ128rm          ,    X86::VPMULLWrm              },
-  { X86::VPMULLWZ128rr          ,    X86::VPMULLWrr              },
-  { X86::VPMULUDQZ128rm         ,    X86::VPMULUDQrm             },
-  { X86::VPMULUDQZ128rr         ,    X86::VPMULUDQrr             },
-  { X86::VPORDZ128rm            ,    X86::VPORrm                 },
-  { X86::VPORDZ128rr            ,    X86::VPORrr                 },
-  { X86::VPORQZ128rm            ,    X86::VPORrm                 },
-  { X86::VPORQZ128rr            ,    X86::VPORrr                 },
-  { X86::VPSADBWZ128rm          ,    X86::VPSADBWrm              },
-  { X86::VPSADBWZ128rr          ,    X86::VPSADBWrr              },
-  { X86::VPSHUFBZ128rm          ,    X86::VPSHUFBrm              },
-  { X86::VPSHUFBZ128rr          ,    X86::VPSHUFBrr              },
-  { X86::VPSHUFDZ128mi          ,    X86::VPSHUFDmi              },
-  { X86::VPSHUFDZ128ri          ,    X86::VPSHUFDri              },
-  { X86::VPSHUFHWZ128mi         ,    X86::VPSHUFHWmi             },
-  { X86::VPSHUFHWZ128ri         ,    X86::VPSHUFHWri             },
-  { X86::VPSHUFLWZ128mi         ,    X86::VPSHUFLWmi             },
-  { X86::VPSHUFLWZ128ri         ,    X86::VPSHUFLWri             },
-  { X86::VPSLLDQZ128rr          ,    X86::VPSLLDQri              },
-  { X86::VPSLLDZ128ri           ,    X86::VPSLLDri               },
-  { X86::VPSLLDZ128rm           ,    X86::VPSLLDrm               },
-  { X86::VPSLLDZ128rr           ,    X86::VPSLLDrr               },
-  { X86::VPSLLQZ128ri           ,    X86::VPSLLQri               },
-  { X86::VPSLLQZ128rm           ,    X86::VPSLLQrm               },
-  { X86::VPSLLQZ128rr           ,    X86::VPSLLQrr               },
-  { X86::VPSLLVDZ128rm          ,    X86::VPSLLVDrm              },
-  { X86::VPSLLVDZ128rr          ,    X86::VPSLLVDrr              },
-  { X86::VPSLLVQZ128rm          ,    X86::VPSLLVQrm              },
-  { X86::VPSLLVQZ128rr          ,    X86::VPSLLVQrr              },
-  { X86::VPSLLWZ128ri           ,    X86::VPSLLWri               },
-  { X86::VPSLLWZ128rm           ,    X86::VPSLLWrm               },
-  { X86::VPSLLWZ128rr           ,    X86::VPSLLWrr               },
-  { X86::VPSRADZ128ri           ,    X86::VPSRADri               },
-  { X86::VPSRADZ128rm           ,    X86::VPSRADrm               },
-  { X86::VPSRADZ128rr           ,    X86::VPSRADrr               },
-  { X86::VPSRAVDZ128rm          ,    X86::VPSRAVDrm              },
-  { X86::VPSRAVDZ128rr          ,    X86::VPSRAVDrr              },
-  { X86::VPSRAWZ128ri           ,    X86::VPSRAWri               },
-  { X86::VPSRAWZ128rm           ,    X86::VPSRAWrm               },
-  { X86::VPSRAWZ128rr           ,    X86::VPSRAWrr               },
-  { X86::VPSRLDQZ128rr          ,    X86::VPSRLDQri              },
-  { X86::VPSRLDZ128ri           ,    X86::VPSRLDri               },
-  { X86::VPSRLDZ128rm           ,    X86::VPSRLDrm               },
-  { X86::VPSRLDZ128rr           ,    X86::VPSRLDrr               },
-  { X86::VPSRLQZ128ri           ,    X86::VPSRLQri               },
-  { X86::VPSRLQZ128rm           ,    X86::VPSRLQrm               },
-  { X86::VPSRLQZ128rr           ,    X86::VPSRLQrr               },
-  { X86::VPSRLVDZ128rm          ,    X86::VPSRLVDrm              },
-  { X86::VPSRLVDZ128rr          ,    X86::VPSRLVDrr              },
-  { X86::VPSRLVQZ128rm          ,    X86::VPSRLVQrm              },
-  { X86::VPSRLVQZ128rr          ,    X86::VPSRLVQrr              },
-  { X86::VPSRLWZ128ri           ,    X86::VPSRLWri               },
-  { X86::VPSRLWZ128rm           ,    X86::VPSRLWrm               },
-  { X86::VPSRLWZ128rr           ,    X86::VPSRLWrr               },
-  { X86::VPSUBBZ128rm           ,    X86::VPSUBBrm               },
-  { X86::VPSUBBZ128rr           ,    X86::VPSUBBrr               },
-  { X86::VPSUBDZ128rm           ,    X86::VPSUBDrm               },
-  { X86::VPSUBDZ128rr           ,    X86::VPSUBDrr               },
-  { X86::VPSUBQZ128rm           ,    X86::VPSUBQrm               },
-  { X86::VPSUBQZ128rr           ,    X86::VPSUBQrr               },
-  { X86::VPSUBSBZ128rm          ,    X86::VPSUBSBrm              },
-  { X86::VPSUBSBZ128rr          ,    X86::VPSUBSBrr              },
-  { X86::VPSUBSWZ128rm          ,    X86::VPSUBSWrm              },
-  { X86::VPSUBSWZ128rr          ,    X86::VPSUBSWrr              },
-  { X86::VPSUBUSBZ128rm         ,    X86::VPSUBUSBrm             },
-  { X86::VPSUBUSBZ128rr         ,    X86::VPSUBUSBrr             },
-  { X86::VPSUBUSWZ128rm         ,    X86::VPSUBUSWrm             },
-  { X86::VPSUBUSWZ128rr         ,    X86::VPSUBUSWrr             },
-  { X86::VPSUBWZ128rm           ,    X86::VPSUBWrm               },
-  { X86::VPSUBWZ128rr           ,    X86::VPSUBWrr               },
-  { X86::VPUNPCKHBWZ128rm       ,    X86::VPUNPCKHBWrm           },
-  { X86::VPUNPCKHBWZ128rr       ,    X86::VPUNPCKHBWrr           },
-  { X86::VPUNPCKHDQZ128rm       ,    X86::VPUNPCKHDQrm           },
-  { X86::VPUNPCKHDQZ128rr       ,    X86::VPUNPCKHDQrr           },
-  { X86::VPUNPCKHQDQZ128rm      ,    X86::VPUNPCKHQDQrm          },
-  { X86::VPUNPCKHQDQZ128rr      ,    X86::VPUNPCKHQDQrr          },
-  { X86::VPUNPCKHWDZ128rm       ,    X86::VPUNPCKHWDrm           },
-  { X86::VPUNPCKHWDZ128rr       ,    X86::VPUNPCKHWDrr           },
-  { X86::VPUNPCKLBWZ128rm       ,    X86::VPUNPCKLBWrm           },
-  { X86::VPUNPCKLBWZ128rr       ,    X86::VPUNPCKLBWrr           },
-  { X86::VPUNPCKLDQZ128rm       ,    X86::VPUNPCKLDQrm           },
-  { X86::VPUNPCKLDQZ128rr       ,    X86::VPUNPCKLDQrr           },
-  { X86::VPUNPCKLQDQZ128rm      ,    X86::VPUNPCKLQDQrm          },
-  { X86::VPUNPCKLQDQZ128rr      ,    X86::VPUNPCKLQDQrr          },
-  { X86::VPUNPCKLWDZ128rm       ,    X86::VPUNPCKLWDrm           },
-  { X86::VPUNPCKLWDZ128rr       ,    X86::VPUNPCKLWDrr           },
-  { X86::VPXORDZ128rm           ,    X86::VPXORrm                },
-  { X86::VPXORDZ128rr           ,    X86::VPXORrr                },
-  { X86::VPXORQZ128rm           ,    X86::VPXORrm                },
-  { X86::VPXORQZ128rr           ,    X86::VPXORrr                },
-  { X86::VSHUFPDZ128rmi         ,    X86::VSHUFPDrmi             },
-  { X86::VSHUFPDZ128rri         ,    X86::VSHUFPDrri             },
-  { X86::VSHUFPSZ128rmi         ,    X86::VSHUFPSrmi             },
-  { X86::VSHUFPSZ128rri         ,    X86::VSHUFPSrri             },
-  { X86::VSQRTPDZ128m           ,    X86::VSQRTPDm               },
-  { X86::VSQRTPDZ128r           ,    X86::VSQRTPDr               },
-  { X86::VSQRTPSZ128m           ,    X86::VSQRTPSm               },
-  { X86::VSQRTPSZ128r           ,    X86::VSQRTPSr               },
-  { X86::VSUBPDZ128rm           ,    X86::VSUBPDrm               },
-  { X86::VSUBPDZ128rr           ,    X86::VSUBPDrr               },
-  { X86::VSUBPSZ128rm           ,    X86::VSUBPSrm               },
-  { X86::VSUBPSZ128rr           ,    X86::VSUBPSrr               },
-  { X86::VUNPCKHPDZ128rm        ,    X86::VUNPCKHPDrm            },
-  { X86::VUNPCKHPDZ128rr        ,    X86::VUNPCKHPDrr            },
-  { X86::VUNPCKHPSZ128rm        ,    X86::VUNPCKHPSrm            },
-  { X86::VUNPCKHPSZ128rr        ,    X86::VUNPCKHPSrr            },
-  { X86::VUNPCKLPDZ128rm        ,    X86::VUNPCKLPDrm            },
-  { X86::VUNPCKLPDZ128rr        ,    X86::VUNPCKLPDrr            },
-  { X86::VUNPCKLPSZ128rm        ,    X86::VUNPCKLPSrm            },
-  { X86::VUNPCKLPSZ128rr        ,    X86::VUNPCKLPSrr            },
-  { X86::VXORPDZ128rm           ,    X86::VXORPDrm               },
-  { X86::VXORPDZ128rr           ,    X86::VXORPDrr               },
-  { X86::VXORPSZ128rm           ,    X86::VXORPSrm               },
-  { X86::VXORPSZ128rr           ,    X86::VXORPSrr               },
-};
-
-
-// X86 EVEX encoded instructions that have a VEX 256 encoding
-// (table format: <EVEX opcode, VEX-256 opcode>).
- static const X86EvexToVexCompressTableEntry X86EvexToVex256CompressTable[] = {
-  { X86::VADDPDZ256rm           ,     X86::VADDPDYrm             },
-  { X86::VADDPDZ256rr           ,     X86::VADDPDYrr             },
-  { X86::VADDPSZ256rm           ,     X86::VADDPSYrm             },
-  { X86::VADDPSZ256rr           ,     X86::VADDPSYrr             },
-  { X86::VANDNPDZ256rm          ,     X86::VANDNPDYrm            },
-  { X86::VANDNPDZ256rr          ,     X86::VANDNPDYrr            },
-  { X86::VANDNPSZ256rm          ,     X86::VANDNPSYrm            },
-  { X86::VANDNPSZ256rr          ,     X86::VANDNPSYrr            },
-  { X86::VANDPDZ256rm           ,     X86::VANDPDYrm             },
-  { X86::VANDPDZ256rr           ,     X86::VANDPDYrr             },
-  { X86::VANDPSZ256rm           ,     X86::VANDPSYrm             },
-  { X86::VANDPSZ256rr           ,     X86::VANDPSYrr             },
-  { X86::VBROADCASTSDZ256m      ,     X86::VBROADCASTSDYrm       },
-  { X86::VBROADCASTSDZ256r      ,     X86::VBROADCASTSDYrr       },
-  { X86::VBROADCASTSSZ256m      ,     X86::VBROADCASTSSYrm       },
-  { X86::VBROADCASTSSZ256r      ,     X86::VBROADCASTSSYrr       },
-  { X86::VCVTDQ2PDZ256rm        ,     X86::VCVTDQ2PDYrm          },
-  { X86::VCVTDQ2PDZ256rr        ,     X86::VCVTDQ2PDYrr          },
-  { X86::VCVTDQ2PSZ256rm        ,     X86::VCVTDQ2PSYrm          },
-  { X86::VCVTDQ2PSZ256rr        ,     X86::VCVTDQ2PSYrr          },
-  { X86::VCVTPD2DQZ256rm        ,     X86::VCVTPD2DQYrm          },
-  { X86::VCVTPD2DQZ256rr        ,     X86::VCVTPD2DQYrr          },
-  { X86::VCVTPD2PSZ256rm        ,     X86::VCVTPD2PSYrm          },
-  { X86::VCVTPD2PSZ256rr        ,     X86::VCVTPD2PSYrr          },
-  { X86::VCVTPH2PSZ256rm        ,     X86::VCVTPH2PSYrm          },
-  { X86::VCVTPH2PSZ256rr        ,     X86::VCVTPH2PSYrr          },
-  { X86::VCVTPS2DQZ256rm        ,     X86::VCVTPS2DQYrm          },
-  { X86::VCVTPS2DQZ256rr        ,     X86::VCVTPS2DQYrr          },
-  { X86::VCVTPS2PDZ256rm        ,     X86::VCVTPS2PDYrm          },
-  { X86::VCVTPS2PDZ256rr        ,     X86::VCVTPS2PDYrr          },
-  { X86::VCVTPS2PHZ256mr        ,     X86::VCVTPS2PHYmr          },
-  { X86::VCVTPS2PHZ256rr        ,     X86::VCVTPS2PHYrr          },
-  { X86::VCVTTPD2DQZ256rm       ,     X86::VCVTTPD2DQYrm         },
-  { X86::VCVTTPD2DQZ256rr       ,     X86::VCVTTPD2DQYrr         },
-  { X86::VCVTTPS2DQZ256rm       ,     X86::VCVTTPS2DQYrm         },
-  { X86::VCVTTPS2DQZ256rr       ,     X86::VCVTTPS2DQYrr         },
-  { X86::VDIVPDZ256rm           ,     X86::VDIVPDYrm             },
-  { X86::VDIVPDZ256rr           ,     X86::VDIVPDYrr             },
-  { X86::VDIVPSZ256rm           ,     X86::VDIVPSYrm             },
-  { X86::VDIVPSZ256rr           ,     X86::VDIVPSYrr             },
-  { X86::VEXTRACTF32x4Z256mr    ,    X86::VEXTRACTF128mr         },
-  { X86::VEXTRACTF64x2Z256mr    ,    X86::VEXTRACTF128mr         },
-  { X86::VEXTRACTF32x4Z256rr    ,    X86::VEXTRACTF128rr         },
-  { X86::VEXTRACTF64x2Z256rr    ,    X86::VEXTRACTF128rr         },
-  { X86::VEXTRACTI32x4Z256mr    ,    X86::VEXTRACTI128mr         },
-  { X86::VEXTRACTI64x2Z256mr    ,    X86::VEXTRACTI128mr         },
-  { X86::VEXTRACTI32x4Z256rr    ,    X86::VEXTRACTI128rr         },
-  { X86::VEXTRACTI64x2Z256rr    ,    X86::VEXTRACTI128rr         },
-  { X86::VFMADD132PDZ256m       ,     X86::VFMADD132PDYm         },
-  { X86::VFMADD132PDZ256r       ,     X86::VFMADD132PDYr         },
-  { X86::VFMADD132PSZ256m       ,     X86::VFMADD132PSYm         },
-  { X86::VFMADD132PSZ256r       ,     X86::VFMADD132PSYr         },
-  { X86::VFMADD213PDZ256m       ,     X86::VFMADD213PDYm         },
-  { X86::VFMADD213PDZ256r       ,     X86::VFMADD213PDYr         },
-  { X86::VFMADD213PSZ256m       ,     X86::VFMADD213PSYm         },
-  { X86::VFMADD213PSZ256r       ,     X86::VFMADD213PSYr         },
-  { X86::VFMADD231PDZ256m       ,     X86::VFMADD231PDYm         },
-  { X86::VFMADD231PDZ256r       ,     X86::VFMADD231PDYr         },
-  { X86::VFMADD231PSZ256m       ,     X86::VFMADD231PSYm         },
-  { X86::VFMADD231PSZ256r       ,     X86::VFMADD231PSYr         },
-  { X86::VFMADDSUB132PDZ256m    ,     X86::VFMADDSUB132PDYm      },
-  { X86::VFMADDSUB132PDZ256r    ,     X86::VFMADDSUB132PDYr      },
-  { X86::VFMADDSUB132PSZ256m    ,     X86::VFMADDSUB132PSYm      },
-  { X86::VFMADDSUB132PSZ256r    ,     X86::VFMADDSUB132PSYr      },
-  { X86::VFMADDSUB213PDZ256m    ,     X86::VFMADDSUB213PDYm      },
-  { X86::VFMADDSUB213PDZ256r    ,     X86::VFMADDSUB213PDYr      },
-  { X86::VFMADDSUB213PSZ256m    ,     X86::VFMADDSUB213PSYm      },
-  { X86::VFMADDSUB213PSZ256r    ,     X86::VFMADDSUB213PSYr      },
-  { X86::VFMADDSUB231PDZ256m    ,     X86::VFMADDSUB231PDYm      },
-  { X86::VFMADDSUB231PDZ256r    ,     X86::VFMADDSUB231PDYr      },
-  { X86::VFMADDSUB231PSZ256m    ,     X86::VFMADDSUB231PSYm      },
-  { X86::VFMADDSUB231PSZ256r    ,     X86::VFMADDSUB231PSYr      },
-  { X86::VFMSUB132PDZ256m       ,     X86::VFMSUB132PDYm         },
-  { X86::VFMSUB132PDZ256r       ,     X86::VFMSUB132PDYr         },
-  { X86::VFMSUB132PSZ256m       ,     X86::VFMSUB132PSYm         },
-  { X86::VFMSUB132PSZ256r       ,     X86::VFMSUB132PSYr         },
-  { X86::VFMSUB213PDZ256m       ,     X86::VFMSUB213PDYm         },
-  { X86::VFMSUB213PDZ256r       ,     X86::VFMSUB213PDYr         },
-  { X86::VFMSUB213PSZ256m       ,     X86::VFMSUB213PSYm         },
-  { X86::VFMSUB213PSZ256r       ,     X86::VFMSUB213PSYr         },
-  { X86::VFMSUB231PDZ256m       ,     X86::VFMSUB231PDYm         },
-  { X86::VFMSUB231PDZ256r       ,     X86::VFMSUB231PDYr         },
-  { X86::VFMSUB231PSZ256m       ,     X86::VFMSUB231PSYm         },
-  { X86::VFMSUB231PSZ256r       ,     X86::VFMSUB231PSYr         },
-  { X86::VFMSUBADD132PDZ256m    ,     X86::VFMSUBADD132PDYm      },
-  { X86::VFMSUBADD132PDZ256r    ,     X86::VFMSUBADD132PDYr      },
-  { X86::VFMSUBADD132PSZ256m    ,     X86::VFMSUBADD132PSYm      },
-  { X86::VFMSUBADD132PSZ256r    ,     X86::VFMSUBADD132PSYr      },
-  { X86::VFMSUBADD213PDZ256m    ,     X86::VFMSUBADD213PDYm      },
-  { X86::VFMSUBADD213PDZ256r    ,     X86::VFMSUBADD213PDYr      },
-  { X86::VFMSUBADD213PSZ256m    ,     X86::VFMSUBADD213PSYm      },
-  { X86::VFMSUBADD213PSZ256r    ,     X86::VFMSUBADD213PSYr      },
-  { X86::VFMSUBADD231PDZ256m    ,     X86::VFMSUBADD231PDYm      },
-  { X86::VFMSUBADD231PDZ256r    ,     X86::VFMSUBADD231PDYr      },
-  { X86::VFMSUBADD231PSZ256m    ,     X86::VFMSUBADD231PSYm      },
-  { X86::VFMSUBADD231PSZ256r    ,     X86::VFMSUBADD231PSYr      },
-  { X86::VFNMADD132PDZ256m      ,     X86::VFNMADD132PDYm        },
-  { X86::VFNMADD132PDZ256r      ,     X86::VFNMADD132PDYr        },
-  { X86::VFNMADD132PSZ256m      ,     X86::VFNMADD132PSYm        },
-  { X86::VFNMADD132PSZ256r      ,     X86::VFNMADD132PSYr        },
-  { X86::VFNMADD213PDZ256m      ,     X86::VFNMADD213PDYm        },
-  { X86::VFNMADD213PDZ256r      ,     X86::VFNMADD213PDYr        },
-  { X86::VFNMADD213PSZ256m      ,     X86::VFNMADD213PSYm        },
-  { X86::VFNMADD213PSZ256r      ,     X86::VFNMADD213PSYr        },
-  { X86::VFNMADD231PDZ256m      ,     X86::VFNMADD231PDYm        },
-  { X86::VFNMADD231PDZ256r      ,     X86::VFNMADD231PDYr        },
-  { X86::VFNMADD231PSZ256m      ,     X86::VFNMADD231PSYm        },
-  { X86::VFNMADD231PSZ256r      ,     X86::VFNMADD231PSYr        },
-  { X86::VFNMSUB132PDZ256m      ,     X86::VFNMSUB132PDYm        },
-  { X86::VFNMSUB132PDZ256r      ,     X86::VFNMSUB132PDYr        },
-  { X86::VFNMSUB132PSZ256m      ,     X86::VFNMSUB132PSYm        },
-  { X86::VFNMSUB132PSZ256r      ,     X86::VFNMSUB132PSYr        },
-  { X86::VFNMSUB213PDZ256m      ,     X86::VFNMSUB213PDYm        },
-  { X86::VFNMSUB213PDZ256r      ,     X86::VFNMSUB213PDYr        },
-  { X86::VFNMSUB213PSZ256m      ,     X86::VFNMSUB213PSYm        },
-  { X86::VFNMSUB213PSZ256r      ,     X86::VFNMSUB213PSYr        },
-  { X86::VFNMSUB231PDZ256m      ,     X86::VFNMSUB231PDYm        },
-  { X86::VFNMSUB231PDZ256r      ,     X86::VFNMSUB231PDYr        },
-  { X86::VFNMSUB231PSZ256m      ,     X86::VFNMSUB231PSYm        },
-  { X86::VFNMSUB231PSZ256r      ,     X86::VFNMSUB231PSYr        },
-  { X86::VINSERTF32x4Z256rm     ,    X86::VINSERTF128rm          },
-  { X86::VINSERTF64x2Z256rm     ,    X86::VINSERTF128rm          },
-  { X86::VINSERTF32x4Z256rr     ,    X86::VINSERTF128rr          },
-  { X86::VINSERTF64x2Z256rr     ,    X86::VINSERTF128rr          },
-  { X86::VINSERTI32x4Z256rm     ,    X86::VINSERTI128rm          },
-  { X86::VINSERTI64x2Z256rm     ,    X86::VINSERTI128rm          },
-  { X86::VINSERTI32x4Z256rr     ,    X86::VINSERTI128rr          },
-  { X86::VINSERTI64x2Z256rr     ,    X86::VINSERTI128rr          },
-  { X86::VMAXCPDZ256rm          ,     X86::VMAXCPDYrm            },
-  { X86::VMAXCPDZ256rr          ,     X86::VMAXCPDYrr            },
-  { X86::VMAXCPSZ256rm          ,     X86::VMAXCPSYrm            },
-  { X86::VMAXCPSZ256rr          ,     X86::VMAXCPSYrr            },
-  { X86::VMAXPDZ256rm           ,     X86::VMAXPDYrm             },
-  { X86::VMAXPDZ256rr           ,     X86::VMAXPDYrr             },
-  { X86::VMAXPSZ256rm           ,     X86::VMAXPSYrm             },
-  { X86::VMAXPSZ256rr           ,     X86::VMAXPSYrr             },
-  { X86::VMINCPDZ256rm          ,     X86::VMINCPDYrm            },
-  { X86::VMINCPDZ256rr          ,     X86::VMINCPDYrr            },
-  { X86::VMINCPSZ256rm          ,     X86::VMINCPSYrm            },
-  { X86::VMINCPSZ256rr          ,     X86::VMINCPSYrr            },
-  { X86::VMINPDZ256rm           ,     X86::VMINPDYrm             },
-  { X86::VMINPDZ256rr           ,     X86::VMINPDYrr             },
-  { X86::VMINPSZ256rm           ,     X86::VMINPSYrm             },
-  { X86::VMINPSZ256rr           ,     X86::VMINPSYrr             },
-  { X86::VMOVAPDZ256mr          ,     X86::VMOVAPDYmr            },
-  { X86::VMOVAPDZ256rm          ,     X86::VMOVAPDYrm            },
-  { X86::VMOVAPDZ256rr          ,     X86::VMOVAPDYrr            },
-  { X86::VMOVAPDZ256rr_REV      ,     X86::VMOVAPDYrr_REV        },
-  { X86::VMOVAPSZ256mr          ,     X86::VMOVAPSYmr            },
-  { X86::VMOVAPSZ256rm          ,     X86::VMOVAPSYrm            },
-  { X86::VMOVAPSZ256rr          ,     X86::VMOVAPSYrr            },
-  { X86::VMOVAPSZ256rr_REV      ,     X86::VMOVAPSYrr_REV        },
-  { X86::VMOVDDUPZ256rm         ,     X86::VMOVDDUPYrm           },
-  { X86::VMOVDDUPZ256rr         ,     X86::VMOVDDUPYrr           },
-  { X86::VMOVDQA32Z256mr        ,     X86::VMOVDQAYmr            },
-  { X86::VMOVDQA32Z256rm        ,     X86::VMOVDQAYrm            },
-  { X86::VMOVDQA32Z256rr        ,     X86::VMOVDQAYrr            },
-  { X86::VMOVDQA32Z256rr_REV    ,     X86::VMOVDQAYrr_REV        },
-  { X86::VMOVDQA64Z256mr        ,     X86::VMOVDQAYmr            },
-  { X86::VMOVDQA64Z256rm        ,     X86::VMOVDQAYrm            },
-  { X86::VMOVDQA64Z256rr        ,     X86::VMOVDQAYrr            },
-  { X86::VMOVDQA64Z256rr_REV    ,     X86::VMOVDQAYrr_REV        },
-  { X86::VMOVDQU16Z256mr        ,     X86::VMOVDQUYmr            },
-  { X86::VMOVDQU16Z256rm        ,     X86::VMOVDQUYrm            },
-  { X86::VMOVDQU16Z256rr        ,     X86::VMOVDQUYrr            },
-  { X86::VMOVDQU16Z256rr_REV    ,     X86::VMOVDQUYrr_REV        },
-  { X86::VMOVDQU32Z256mr        ,     X86::VMOVDQUYmr            },
-  { X86::VMOVDQU32Z256rm        ,     X86::VMOVDQUYrm            },
-  { X86::VMOVDQU32Z256rr        ,     X86::VMOVDQUYrr            },
-  { X86::VMOVDQU32Z256rr_REV    ,     X86::VMOVDQUYrr_REV        },
-  { X86::VMOVDQU64Z256mr        ,     X86::VMOVDQUYmr            },
-  { X86::VMOVDQU64Z256rm        ,     X86::VMOVDQUYrm            },
-  { X86::VMOVDQU64Z256rr        ,     X86::VMOVDQUYrr            },
-  { X86::VMOVDQU64Z256rr_REV    ,     X86::VMOVDQUYrr_REV        },
-  { X86::VMOVDQU8Z256mr         ,     X86::VMOVDQUYmr            },
-  { X86::VMOVDQU8Z256rm         ,     X86::VMOVDQUYrm            },
-  { X86::VMOVDQU8Z256rr         ,     X86::VMOVDQUYrr            },
-  { X86::VMOVDQU8Z256rr_REV     ,     X86::VMOVDQUYrr_REV        },
-  { X86::VMOVNTDQAZ256rm        ,     X86::VMOVNTDQAYrm          },
-  { X86::VMOVNTDQZ256mr         ,     X86::VMOVNTDQYmr           },
-  { X86::VMOVNTPDZ256mr         ,     X86::VMOVNTPDYmr           },
-  { X86::VMOVNTPSZ256mr         ,     X86::VMOVNTPSYmr           },
-  { X86::VMOVSHDUPZ256rm        ,     X86::VMOVSHDUPYrm          },
-  { X86::VMOVSHDUPZ256rr        ,     X86::VMOVSHDUPYrr          },
-  { X86::VMOVSLDUPZ256rm        ,     X86::VMOVSLDUPYrm          },
-  { X86::VMOVSLDUPZ256rr        ,     X86::VMOVSLDUPYrr          },
-  { X86::VMOVUPDZ256mr          ,     X86::VMOVUPDYmr            },
-  { X86::VMOVUPDZ256rm          ,     X86::VMOVUPDYrm            },
-  { X86::VMOVUPDZ256rr          ,     X86::VMOVUPDYrr            },
-  { X86::VMOVUPDZ256rr_REV      ,     X86::VMOVUPDYrr_REV        },
-  { X86::VMOVUPSZ256mr          ,     X86::VMOVUPSYmr            },
-  { X86::VMOVUPSZ256rm          ,     X86::VMOVUPSYrm            },
-  { X86::VMOVUPSZ256rr          ,     X86::VMOVUPSYrr            },
-  { X86::VMOVUPSZ256rr_REV      ,     X86::VMOVUPSYrr_REV        },
-  { X86::VMULPDZ256rm           ,     X86::VMULPDYrm             },
-  { X86::VMULPDZ256rr           ,     X86::VMULPDYrr             },
-  { X86::VMULPSZ256rm           ,     X86::VMULPSYrm             },
-  { X86::VMULPSZ256rr           ,     X86::VMULPSYrr             },
-  { X86::VORPDZ256rm            ,     X86::VORPDYrm              },
-  { X86::VORPDZ256rr            ,     X86::VORPDYrr              },
-  { X86::VORPSZ256rm            ,     X86::VORPSYrm              },
-  { X86::VORPSZ256rr            ,     X86::VORPSYrr              },
-  { X86::VPABSBZ256rm           ,     X86::VPABSBYrm             },
-  { X86::VPABSBZ256rr           ,     X86::VPABSBYrr             },
-  { X86::VPABSDZ256rm           ,     X86::VPABSDYrm             },
-  { X86::VPABSDZ256rr           ,     X86::VPABSDYrr             },
-  { X86::VPABSWZ256rm           ,     X86::VPABSWYrm             },
-  { X86::VPABSWZ256rr           ,     X86::VPABSWYrr             },
-  { X86::VPACKSSDWZ256rm        ,     X86::VPACKSSDWYrm          },
-  { X86::VPACKSSDWZ256rr        ,     X86::VPACKSSDWYrr          },
-  { X86::VPACKSSWBZ256rm        ,     X86::VPACKSSWBYrm          },
-  { X86::VPACKSSWBZ256rr        ,     X86::VPACKSSWBYrr          },
-  { X86::VPACKUSDWZ256rm        ,     X86::VPACKUSDWYrm          },
-  { X86::VPACKUSDWZ256rr        ,     X86::VPACKUSDWYrr          },
-  { X86::VPACKUSWBZ256rm        ,     X86::VPACKUSWBYrm          },
-  { X86::VPACKUSWBZ256rr        ,     X86::VPACKUSWBYrr          },
-  { X86::VPADDBZ256rm           ,     X86::VPADDBYrm             },
-  { X86::VPADDBZ256rr           ,     X86::VPADDBYrr             },
-  { X86::VPADDDZ256rm           ,     X86::VPADDDYrm             },
-  { X86::VPADDDZ256rr           ,     X86::VPADDDYrr             },
-  { X86::VPADDQZ256rm           ,     X86::VPADDQYrm             },
-  { X86::VPADDQZ256rr           ,     X86::VPADDQYrr             },
-  { X86::VPADDSBZ256rm          ,     X86::VPADDSBYrm            },
-  { X86::VPADDSBZ256rr          ,     X86::VPADDSBYrr            },
-  { X86::VPADDSWZ256rm          ,     X86::VPADDSWYrm            },
-  { X86::VPADDSWZ256rr          ,     X86::VPADDSWYrr            },
-  { X86::VPADDUSBZ256rm         ,     X86::VPADDUSBYrm           },
-  { X86::VPADDUSBZ256rr         ,     X86::VPADDUSBYrr           },
-  { X86::VPADDUSWZ256rm         ,     X86::VPADDUSWYrm           },
-  { X86::VPADDUSWZ256rr         ,     X86::VPADDUSWYrr           },
-  { X86::VPADDWZ256rm           ,     X86::VPADDWYrm             },
-  { X86::VPADDWZ256rr           ,     X86::VPADDWYrr             },
-  { X86::VPALIGNRZ256rmi        ,     X86::VPALIGNRYrmi          },
-  { X86::VPALIGNRZ256rri        ,     X86::VPALIGNRYrri          },
-  { X86::VPANDDZ256rm           ,     X86::VPANDYrm              },
-  { X86::VPANDDZ256rr           ,     X86::VPANDYrr              },
-  { X86::VPANDQZ256rm           ,     X86::VPANDYrm              },
-  { X86::VPANDQZ256rr           ,     X86::VPANDYrr              },
-  { X86::VPAVGBZ256rm           ,     X86::VPAVGBYrm             },
-  { X86::VPAVGBZ256rr           ,     X86::VPAVGBYrr             },
-  { X86::VPAVGWZ256rm           ,     X86::VPAVGWYrm             },
-  { X86::VPAVGWZ256rr           ,     X86::VPAVGWYrr             },
-  { X86::VPBROADCASTBZ256m      ,     X86::VPBROADCASTBYrm       },
-  { X86::VPBROADCASTBZ256r      ,     X86::VPBROADCASTBYrr       },
-  { X86::VPBROADCASTDZ256m      ,     X86::VPBROADCASTDYrm       },
-  { X86::VPBROADCASTDZ256r      ,     X86::VPBROADCASTDYrr       },
-  { X86::VPBROADCASTQZ256m      ,     X86::VPBROADCASTQYrm       },
-  { X86::VPBROADCASTQZ256r      ,     X86::VPBROADCASTQYrr       },
-  { X86::VPBROADCASTWZ256m      ,     X86::VPBROADCASTWYrm       },
-  { X86::VPBROADCASTWZ256r      ,     X86::VPBROADCASTWYrr       },
-  { X86::VPERMDZ256rm           ,     X86::VPERMDYrm             },
-  { X86::VPERMDZ256rr           ,     X86::VPERMDYrr             },
-  { X86::VPERMILPDZ256mi        ,     X86::VPERMILPDYmi          },
-  { X86::VPERMILPDZ256ri        ,     X86::VPERMILPDYri          },
-  { X86::VPERMILPDZ256rm        ,     X86::VPERMILPDYrm          },
-  { X86::VPERMILPDZ256rr        ,     X86::VPERMILPDYrr          },
-  { X86::VPERMILPSZ256mi        ,     X86::VPERMILPSYmi          },
-  { X86::VPERMILPSZ256ri        ,     X86::VPERMILPSYri          },
-  { X86::VPERMILPSZ256rm        ,     X86::VPERMILPSYrm          },
-  { X86::VPERMILPSZ256rr        ,     X86::VPERMILPSYrr          },
-  { X86::VPERMPDZ256mi          ,     X86::VPERMPDYmi            },
-  { X86::VPERMPDZ256ri          ,     X86::VPERMPDYri            },
-  { X86::VPERMPSZ256rm          ,     X86::VPERMPSYrm            },
-  { X86::VPERMPSZ256rr          ,     X86::VPERMPSYrr            },
-  { X86::VPERMQZ256mi           ,     X86::VPERMQYmi             },
-  { X86::VPERMQZ256ri           ,     X86::VPERMQYri             },
-  { X86::VPMADDUBSWZ256rm       ,     X86::VPMADDUBSWYrm         },
-  { X86::VPMADDUBSWZ256rr       ,     X86::VPMADDUBSWYrr         },
-  { X86::VPMADDWDZ256rm         ,     X86::VPMADDWDYrm           },
-  { X86::VPMADDWDZ256rr         ,     X86::VPMADDWDYrr           },
-  { X86::VPMAXSBZ256rm          ,     X86::VPMAXSBYrm            },
-  { X86::VPMAXSBZ256rr          ,     X86::VPMAXSBYrr            },
-  { X86::VPMAXSDZ256rm          ,     X86::VPMAXSDYrm            },
-  { X86::VPMAXSDZ256rr          ,     X86::VPMAXSDYrr            },
-  { X86::VPMAXSWZ256rm          ,     X86::VPMAXSWYrm            },
-  { X86::VPMAXSWZ256rr          ,     X86::VPMAXSWYrr            },
-  { X86::VPMAXUBZ256rm          ,     X86::VPMAXUBYrm            },
-  { X86::VPMAXUBZ256rr          ,     X86::VPMAXUBYrr            },
-  { X86::VPMAXUDZ256rm          ,     X86::VPMAXUDYrm            },
-  { X86::VPMAXUDZ256rr          ,     X86::VPMAXUDYrr            },
-  { X86::VPMAXUWZ256rm          ,     X86::VPMAXUWYrm            },
-  { X86::VPMAXUWZ256rr          ,     X86::VPMAXUWYrr            },
-  { X86::VPMINSBZ256rm          ,     X86::VPMINSBYrm            },
-  { X86::VPMINSBZ256rr          ,     X86::VPMINSBYrr            },
-  { X86::VPMINSDZ256rm          ,     X86::VPMINSDYrm            },
-  { X86::VPMINSDZ256rr          ,     X86::VPMINSDYrr            },
-  { X86::VPMINSWZ256rm          ,     X86::VPMINSWYrm            },
-  { X86::VPMINSWZ256rr          ,     X86::VPMINSWYrr            },
-  { X86::VPMINUBZ256rm          ,     X86::VPMINUBYrm            },
-  { X86::VPMINUBZ256rr          ,     X86::VPMINUBYrr            },
-  { X86::VPMINUDZ256rm          ,     X86::VPMINUDYrm            },
-  { X86::VPMINUDZ256rr          ,     X86::VPMINUDYrr            },
-  { X86::VPMINUWZ256rm          ,     X86::VPMINUWYrm            },
-  { X86::VPMINUWZ256rr          ,     X86::VPMINUWYrr            },
-  { X86::VPMOVSXBDZ256rm        ,     X86::VPMOVSXBDYrm          },
-  { X86::VPMOVSXBDZ256rr        ,     X86::VPMOVSXBDYrr          },
-  { X86::VPMOVSXBQZ256rm        ,     X86::VPMOVSXBQYrm          },
-  { X86::VPMOVSXBQZ256rr        ,     X86::VPMOVSXBQYrr          },
-  { X86::VPMOVSXBWZ256rm        ,     X86::VPMOVSXBWYrm          },
-  { X86::VPMOVSXBWZ256rr        ,     X86::VPMOVSXBWYrr          },
-  { X86::VPMOVSXDQZ256rm        ,     X86::VPMOVSXDQYrm          },
-  { X86::VPMOVSXDQZ256rr        ,     X86::VPMOVSXDQYrr          },
-  { X86::VPMOVSXWDZ256rm        ,     X86::VPMOVSXWDYrm          },
-  { X86::VPMOVSXWDZ256rr        ,     X86::VPMOVSXWDYrr          },
-  { X86::VPMOVSXWQZ256rm        ,     X86::VPMOVSXWQYrm          },
-  { X86::VPMOVSXWQZ256rr        ,     X86::VPMOVSXWQYrr          },
-  { X86::VPMOVZXBDZ256rm        ,     X86::VPMOVZXBDYrm          },
-  { X86::VPMOVZXBDZ256rr        ,     X86::VPMOVZXBDYrr          },
-  { X86::VPMOVZXBQZ256rm        ,     X86::VPMOVZXBQYrm          },
-  { X86::VPMOVZXBQZ256rr        ,     X86::VPMOVZXBQYrr          },
-  { X86::VPMOVZXBWZ256rm        ,     X86::VPMOVZXBWYrm          },
-  { X86::VPMOVZXBWZ256rr        ,     X86::VPMOVZXBWYrr          },
-  { X86::VPMOVZXDQZ256rm        ,     X86::VPMOVZXDQYrm          },
-  { X86::VPMOVZXDQZ256rr        ,     X86::VPMOVZXDQYrr          },
-  { X86::VPMOVZXWDZ256rm        ,     X86::VPMOVZXWDYrm          },
-  { X86::VPMOVZXWDZ256rr        ,     X86::VPMOVZXWDYrr          },
-  { X86::VPMOVZXWQZ256rm        ,     X86::VPMOVZXWQYrm          },
-  { X86::VPMOVZXWQZ256rr        ,     X86::VPMOVZXWQYrr          },
-  { X86::VPMULDQZ256rm          ,     X86::VPMULDQYrm            },
-  { X86::VPMULDQZ256rr          ,     X86::VPMULDQYrr            },
-  { X86::VPMULHRSWZ256rm        ,     X86::VPMULHRSWYrm          },
-  { X86::VPMULHRSWZ256rr        ,     X86::VPMULHRSWYrr          },
-  { X86::VPMULHUWZ256rm         ,     X86::VPMULHUWYrm           },
-  { X86::VPMULHUWZ256rr         ,     X86::VPMULHUWYrr           },
-  { X86::VPMULHWZ256rm          ,     X86::VPMULHWYrm            },
-  { X86::VPMULHWZ256rr          ,     X86::VPMULHWYrr            },
-  { X86::VPMULLDZ256rm          ,     X86::VPMULLDYrm            },
-  { X86::VPMULLDZ256rr          ,     X86::VPMULLDYrr            },
-  { X86::VPMULLWZ256rm          ,     X86::VPMULLWYrm            },
-  { X86::VPMULLWZ256rr          ,     X86::VPMULLWYrr            },
-  { X86::VPMULUDQZ256rm         ,     X86::VPMULUDQYrm           },
-  { X86::VPMULUDQZ256rr         ,     X86::VPMULUDQYrr           },
-  { X86::VPORDZ256rm            ,     X86::VPORYrm               },
-  { X86::VPORDZ256rr            ,     X86::VPORYrr               },
-  { X86::VPORQZ256rm            ,     X86::VPORYrm               },
-  { X86::VPORQZ256rr            ,     X86::VPORYrr               },
-  { X86::VPSADBWZ256rm          ,     X86::VPSADBWYrm            },
-  { X86::VPSADBWZ256rr          ,     X86::VPSADBWYrr            },
-  { X86::VPSHUFBZ256rm          ,     X86::VPSHUFBYrm            },
-  { X86::VPSHUFBZ256rr          ,     X86::VPSHUFBYrr            },
-  { X86::VPSHUFDZ256mi          ,     X86::VPSHUFDYmi            },
-  { X86::VPSHUFDZ256ri          ,     X86::VPSHUFDYri            },
-  { X86::VPSHUFHWZ256mi         ,     X86::VPSHUFHWYmi           },
-  { X86::VPSHUFHWZ256ri         ,     X86::VPSHUFHWYri           },
-  { X86::VPSHUFLWZ256mi         ,     X86::VPSHUFLWYmi           },
-  { X86::VPSHUFLWZ256ri         ,     X86::VPSHUFLWYri           },
-  { X86::VPSLLDQZ256rr          ,     X86::VPSLLDQYri            },
-  { X86::VPSLLDZ256ri           ,     X86::VPSLLDYri             },
-  { X86::VPSLLDZ256rm           ,     X86::VPSLLDYrm             },
-  { X86::VPSLLDZ256rr           ,     X86::VPSLLDYrr             },
-  { X86::VPSLLQZ256ri           ,     X86::VPSLLQYri             },
-  { X86::VPSLLQZ256rm           ,     X86::VPSLLQYrm             },
-  { X86::VPSLLQZ256rr           ,     X86::VPSLLQYrr             },
-  { X86::VPSLLVDZ256rm          ,     X86::VPSLLVDYrm            },
-  { X86::VPSLLVDZ256rr          ,     X86::VPSLLVDYrr            },
-  { X86::VPSLLVQZ256rm          ,     X86::VPSLLVQYrm            },
-  { X86::VPSLLVQZ256rr          ,     X86::VPSLLVQYrr            },
-  { X86::VPSLLWZ256ri           ,     X86::VPSLLWYri             },
-  { X86::VPSLLWZ256rm           ,     X86::VPSLLWYrm             },
-  { X86::VPSLLWZ256rr           ,     X86::VPSLLWYrr             },
-  { X86::VPSRADZ256ri           ,     X86::VPSRADYri             },
-  { X86::VPSRADZ256rm           ,     X86::VPSRADYrm             },
-  { X86::VPSRADZ256rr           ,     X86::VPSRADYrr             },
-  { X86::VPSRAVDZ256rm          ,     X86::VPSRAVDYrm            },
-  { X86::VPSRAVDZ256rr          ,     X86::VPSRAVDYrr            },
-  { X86::VPSRAWZ256ri           ,     X86::VPSRAWYri             },
-  { X86::VPSRAWZ256rm           ,     X86::VPSRAWYrm             },
-  { X86::VPSRAWZ256rr           ,     X86::VPSRAWYrr             },
-  { X86::VPSRLDQZ256rr          ,     X86::VPSRLDQYri            },
-  { X86::VPSRLDZ256ri           ,     X86::VPSRLDYri             },
-  { X86::VPSRLDZ256rm           ,     X86::VPSRLDYrm             },
-  { X86::VPSRLDZ256rr           ,     X86::VPSRLDYrr             },
-  { X86::VPSRLQZ256ri           ,     X86::VPSRLQYri             },
-  { X86::VPSRLQZ256rm           ,     X86::VPSRLQYrm             },
-  { X86::VPSRLQZ256rr           ,     X86::VPSRLQYrr             },
-  { X86::VPSRLVDZ256rm          ,     X86::VPSRLVDYrm            },
-  { X86::VPSRLVDZ256rr          ,     X86::VPSRLVDYrr            },
-  { X86::VPSRLVQZ256rm          ,     X86::VPSRLVQYrm            },
-  { X86::VPSRLVQZ256rr          ,     X86::VPSRLVQYrr            },
-  { X86::VPSRLWZ256ri           ,     X86::VPSRLWYri             },
-  { X86::VPSRLWZ256rm           ,     X86::VPSRLWYrm             },
-  { X86::VPSRLWZ256rr           ,     X86::VPSRLWYrr             },
-  { X86::VPSUBBZ256rm           ,     X86::VPSUBBYrm             },
-  { X86::VPSUBBZ256rr           ,     X86::VPSUBBYrr             },
-  { X86::VPSUBDZ256rm           ,     X86::VPSUBDYrm             },
-  { X86::VPSUBDZ256rr           ,     X86::VPSUBDYrr             },
-  { X86::VPSUBQZ256rm           ,     X86::VPSUBQYrm             },
-  { X86::VPSUBQZ256rr           ,     X86::VPSUBQYrr             },
-  { X86::VPSUBSBZ256rm          ,     X86::VPSUBSBYrm            },
-  { X86::VPSUBSBZ256rr          ,     X86::VPSUBSBYrr            },
-  { X86::VPSUBSWZ256rm          ,     X86::VPSUBSWYrm            },
-  { X86::VPSUBSWZ256rr          ,     X86::VPSUBSWYrr            },
-  { X86::VPSUBUSBZ256rm         ,     X86::VPSUBUSBYrm           },
-  { X86::VPSUBUSBZ256rr         ,     X86::VPSUBUSBYrr           },
-  { X86::VPSUBUSWZ256rm         ,     X86::VPSUBUSWYrm           },
-  { X86::VPSUBUSWZ256rr         ,     X86::VPSUBUSWYrr           },
-  { X86::VPSUBWZ256rm           ,     X86::VPSUBWYrm             },
-  { X86::VPSUBWZ256rr           ,     X86::VPSUBWYrr             },
-  { X86::VPUNPCKHBWZ256rm       ,     X86::VPUNPCKHBWYrm         },
-  { X86::VPUNPCKHBWZ256rr       ,     X86::VPUNPCKHBWYrr         },
-  { X86::VPUNPCKHDQZ256rm       ,     X86::VPUNPCKHDQYrm         },
-  { X86::VPUNPCKHDQZ256rr       ,     X86::VPUNPCKHDQYrr         },
-  { X86::VPUNPCKHQDQZ256rm      ,     X86::VPUNPCKHQDQYrm        },
-  { X86::VPUNPCKHQDQZ256rr      ,     X86::VPUNPCKHQDQYrr        },
-  { X86::VPUNPCKHWDZ256rm       ,     X86::VPUNPCKHWDYrm         },
-  { X86::VPUNPCKHWDZ256rr       ,     X86::VPUNPCKHWDYrr         },
-  { X86::VPUNPCKLBWZ256rm       ,     X86::VPUNPCKLBWYrm         },
-  { X86::VPUNPCKLBWZ256rr       ,     X86::VPUNPCKLBWYrr         },
-  { X86::VPUNPCKLDQZ256rm       ,     X86::VPUNPCKLDQYrm         },
-  { X86::VPUNPCKLDQZ256rr       ,     X86::VPUNPCKLDQYrr         },
-  { X86::VPUNPCKLQDQZ256rm      ,     X86::VPUNPCKLQDQYrm        },
-  { X86::VPUNPCKLQDQZ256rr      ,     X86::VPUNPCKLQDQYrr        },
-  { X86::VPUNPCKLWDZ256rm       ,     X86::VPUNPCKLWDYrm         },
-  { X86::VPUNPCKLWDZ256rr       ,     X86::VPUNPCKLWDYrr         },
-  { X86::VPXORDZ256rm           ,     X86::VPXORYrm              },
-  { X86::VPXORDZ256rr           ,     X86::VPXORYrr              },
-  { X86::VPXORQZ256rm           ,     X86::VPXORYrm              },
-  { X86::VPXORQZ256rr           ,     X86::VPXORYrr              },
-  { X86::VSHUFPDZ256rmi         ,     X86::VSHUFPDYrmi           },
-  { X86::VSHUFPDZ256rri         ,     X86::VSHUFPDYrri           },
-  { X86::VSHUFPSZ256rmi         ,     X86::VSHUFPSYrmi           },
-  { X86::VSHUFPSZ256rri         ,     X86::VSHUFPSYrri           },
-  { X86::VSQRTPDZ256m           ,     X86::VSQRTPDYm             },
-  { X86::VSQRTPDZ256r           ,     X86::VSQRTPDYr             },
-  { X86::VSQRTPSZ256m           ,     X86::VSQRTPSYm             },
-  { X86::VSQRTPSZ256r           ,     X86::VSQRTPSYr             },
-  { X86::VSUBPDZ256rm           ,     X86::VSUBPDYrm             },
-  { X86::VSUBPDZ256rr           ,     X86::VSUBPDYrr             },
-  { X86::VSUBPSZ256rm           ,     X86::VSUBPSYrm             },
-  { X86::VSUBPSZ256rr           ,     X86::VSUBPSYrr             },
-  { X86::VUNPCKHPDZ256rm        ,     X86::VUNPCKHPDYrm          },
-  { X86::VUNPCKHPDZ256rr        ,     X86::VUNPCKHPDYrr          },
-  { X86::VUNPCKHPSZ256rm        ,     X86::VUNPCKHPSYrm          },
-  { X86::VUNPCKHPSZ256rr        ,     X86::VUNPCKHPSYrr          },
-  { X86::VUNPCKLPDZ256rm        ,     X86::VUNPCKLPDYrm          },
-  { X86::VUNPCKLPDZ256rr        ,     X86::VUNPCKLPDYrr          },
-  { X86::VUNPCKLPSZ256rm        ,     X86::VUNPCKLPSYrm          },
-  { X86::VUNPCKLPSZ256rr        ,     X86::VUNPCKLPSYrr          },
-  { X86::VXORPDZ256rm           ,     X86::VXORPDYrm             },
-  { X86::VXORPDZ256rr           ,     X86::VXORPDYrr             },
-  { X86::VXORPSZ256rm           ,     X86::VXORPSYrm             },
-  { X86::VXORPSZ256rr           ,     X86::VXORPSYrr             },
-};
-
-#endif
diff --git a/lib/Target/X86/X86InstrXOP.td b/lib/Target/X86/X86InstrXOP.td
index 10acdca2df749415ff1f7a3b90c9ecd9b1ce8070..53224431c0e90c481b91ba8cf178cc9efb99546c 100644
--- a/lib/Target/X86/X86InstrXOP.td
+++ b/lib/Target/X86/X86InstrXOP.td
@@ -290,159 +290,87 @@ let ExeDomain = SSEPackedInt in {
 }
 
 // Instruction where either second or third source can be memory
-multiclass xop4op_int<bits<8> opc, string OpcodeStr,
-                      Intrinsic Int128, Intrinsic Int256> {
-  // 128-bit Instruction
-  def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs VR128:$dst),
-            (ins VR128:$src1, VR128:$src2, VR128:$src3),
+multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
+                       X86MemOperand x86memop, ValueType VT> {
+  def rrr : IXOPi8Reg<opc, MRMSrcReg, (outs RC:$dst),
+            (ins RC:$src1, RC:$src2, RC:$src3),
             !strconcat(OpcodeStr,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-            [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2, VR128:$src3))]>,
-            XOP_4V;
-  def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR128:$dst),
-            (ins VR128:$src1, VR128:$src2, i128mem:$src3),
+            [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
+                                   (X86andnp RC:$src3, RC:$src2))))]>, XOP_4V;
+  def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs RC:$dst),
+            (ins RC:$src1, RC:$src2, x86memop:$src3),
             !strconcat(OpcodeStr,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-            [(set VR128:$dst,
-              (Int128 VR128:$src1, VR128:$src2,
-               (bitconvert (loadv2i64 addr:$src3))))]>,
+            [(set RC:$dst, (VT (or (and (load addr:$src3), RC:$src1),
+                                   (X86andnp (load addr:$src3), RC:$src2))))]>,
             XOP_4V, VEX_W;
-  def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs VR128:$dst),
-            (ins VR128:$src1, i128mem:$src2, VR128:$src3),
+  def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs RC:$dst),
+            (ins RC:$src1, x86memop:$src2, RC:$src3),
             !strconcat(OpcodeStr,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-            [(set VR128:$dst,
-              (Int128 VR128:$src1, (bitconvert (loadv2i64 addr:$src2)),
-               VR128:$src3))]>,
+            [(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
+                                   (X86andnp RC:$src3, (load addr:$src2)))))]>,
             XOP_4V;
   // For disassembler
   let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
-  def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR128:$dst),
-            (ins VR128:$src1, VR128:$src2, VR128:$src3),
+  def rrr_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs RC:$dst),
+            (ins RC:$src1, RC:$src2, RC:$src3),
             !strconcat(OpcodeStr,
             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
             []>, XOP_4V, VEX_W;
-
-  // 256-bit Instruction
-  def rrrY : IXOPi8Reg<opc, MRMSrcReg, (outs VR256:$dst),
-             (ins VR256:$src1, VR256:$src2, VR256:$src3),
-             !strconcat(OpcodeStr,
-             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-             [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2, VR256:$src3))]>,
-             XOP_4V, VEX_L;
-  def rrmY : IXOPi8Reg<opc, MRMSrcMemOp4, (outs VR256:$dst),
-             (ins VR256:$src1, VR256:$src2, i256mem:$src3),
-             !strconcat(OpcodeStr,
-             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-             [(set VR256:$dst,
-               (Int256 VR256:$src1, VR256:$src2,
-               (bitconvert (loadv4i64 addr:$src3))))]>,
-             XOP_4V, VEX_W, VEX_L;
-  def rmrY : IXOPi8Reg<opc, MRMSrcMem, (outs VR256:$dst),
-             (ins VR256:$src1, f256mem:$src2, VR256:$src3),
-             !strconcat(OpcodeStr,
-             "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-             [(set VR256:$dst,
-               (Int256 VR256:$src1, (bitconvert (loadv4i64 addr:$src2)),
-                VR256:$src3))]>,
-             XOP_4V, VEX_L;
-  // For disassembler
-  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
-  def rrrY_REV : IXOPi8Reg<opc, MRMSrcRegOp4, (outs VR256:$dst),
-            (ins VR256:$src1, VR256:$src2, VR256:$src3),
-            !strconcat(OpcodeStr,
-            "\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
-            []>, XOP_4V, VEX_W, VEX_L;
 }
 
 let ExeDomain = SSEPackedInt in {
-  defm VPCMOV : xop4op_int<0xA2, "vpcmov",
-                           int_x86_xop_vpcmov, int_x86_xop_vpcmov_256>;
+  defm VPCMOV : xop4op_int<0xA2, "vpcmov", VR128, i128mem, v2i64>;
+  defm VPCMOVY : xop4op_int<0xA2, "vpcmov", VR256, i256mem, v4i64>, VEX_L;
 }
 
-let Predicates = [HasXOP] in {
-  def : Pat<(v2i64 (or (and VR128:$src3, VR128:$src1),
-                       (X86andnp VR128:$src3, VR128:$src2))),
-            (VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>;
-
-  def : Pat<(v4i64 (or (and VR256:$src3, VR256:$src1),
-                       (X86andnp VR256:$src3, VR256:$src2))),
-            (VPCMOVrrrY VR256:$src1, VR256:$src2, VR256:$src3)>;
-}
-
-multiclass xop5op<bits<8> opc, string OpcodeStr, SDNode OpNode,
-                  ValueType vt128, ValueType vt256,
-                  ValueType id128, ValueType id256,
-                  PatFrag ld_128, PatFrag ld_256> {
-  def rr : IXOP5<opc, MRMSrcReg, (outs VR128:$dst),
-        (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4),
+multiclass xop_vpermil2<bits<8> Opc, string OpcodeStr, RegisterClass RC,
+                        X86MemOperand intmemop, X86MemOperand fpmemop,
+                        ValueType VT, PatFrag FPLdFrag,
+                        PatFrag IntLdFrag> {
+  def rr : IXOP5<Opc, MRMSrcReg, (outs RC:$dst),
+        (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4),
         !strconcat(OpcodeStr,
         "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
-        [(set VR128:$dst,
-           (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
-                          (id128 VR128:$src3), (i8 imm:$src4))))]>;
-  def rm : IXOP5<opc, MRMSrcMemOp4, (outs VR128:$dst),
-        (ins VR128:$src1, VR128:$src2, i128mem:$src3, u8imm:$src4),
+        [(set RC:$dst,
+           (VT (X86vpermil2 RC:$src1, RC:$src2, RC:$src3, (i8 imm:$src4))))]>;
+  def rm : IXOP5<Opc, MRMSrcMemOp4, (outs RC:$dst),
+        (ins RC:$src1, RC:$src2, intmemop:$src3, u8imm:$src4),
         !strconcat(OpcodeStr,
         "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
-        [(set VR128:$dst,
-           (vt128 (OpNode (vt128 VR128:$src1), (vt128 VR128:$src2),
-                          (id128 (bitconvert (loadv2i64 addr:$src3))),
-                          (i8 imm:$src4))))]>,
-        VEX_W;
-  def mr : IXOP5<opc, MRMSrcMem, (outs VR128:$dst),
-        (ins VR128:$src1, f128mem:$src2, VR128:$src3, u8imm:$src4),
+        [(set RC:$dst,
+          (VT (X86vpermil2 RC:$src1, RC:$src2,
+                           (bitconvert (IntLdFrag addr:$src3)),
+                           (i8 imm:$src4))))]>, VEX_W;
+  def mr : IXOP5<Opc, MRMSrcMem, (outs RC:$dst),
+        (ins RC:$src1, fpmemop:$src2, RC:$src3, u8imm:$src4),
         !strconcat(OpcodeStr,
         "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
-        [(set VR128:$dst,
-           (vt128 (OpNode (vt128 VR128:$src1),
-                          (vt128 (bitconvert (ld_128 addr:$src2))),
-                          (id128 VR128:$src3), (i8 imm:$src4))))]>;
+        [(set RC:$dst,
+          (VT (X86vpermil2 RC:$src1, (FPLdFrag addr:$src2),
+                           RC:$src3, (i8 imm:$src4))))]>;
   // For disassembler
   let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
-  def rr_REV : IXOP5<opc, MRMSrcRegOp4, (outs VR128:$dst),
-        (ins VR128:$src1, VR128:$src2, VR128:$src3, u8imm:$src4),
+  def rr_REV : IXOP5<Opc, MRMSrcRegOp4, (outs RC:$dst),
+        (ins RC:$src1, RC:$src2, RC:$src3, u8imm:$src4),
         !strconcat(OpcodeStr,
         "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
         []>, VEX_W;
-
-  def rrY : IXOP5<opc, MRMSrcReg, (outs VR256:$dst),
-        (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4),
-        !strconcat(OpcodeStr,
-        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
-        [(set VR256:$dst,
-           (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2),
-                          (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L;
-  def rmY : IXOP5<opc, MRMSrcMemOp4, (outs VR256:$dst),
-        (ins VR256:$src1, VR256:$src2, i256mem:$src3, u8imm:$src4),
-        !strconcat(OpcodeStr,
-        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
-        [(set VR256:$dst,
-           (vt256 (OpNode (vt256 VR256:$src1), (vt256 VR256:$src2),
-                          (id256 (bitconvert (loadv4i64 addr:$src3))),
-                          (i8 imm:$src4))))]>, VEX_W, VEX_L;
-  def mrY : IXOP5<opc, MRMSrcMem, (outs VR256:$dst),
-        (ins VR256:$src1, f256mem:$src2, VR256:$src3, u8imm:$src4),
-        !strconcat(OpcodeStr,
-        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
-        [(set VR256:$dst,
-           (vt256 (OpNode (vt256 VR256:$src1),
-                          (vt256 (bitconvert (ld_256 addr:$src2))),
-                          (id256 VR256:$src3), (i8 imm:$src4))))]>, VEX_L;
-  // For disassembler
-  let isCodeGenOnly = 1, ForceDisassemble = 1, hasSideEffects = 0 in
-  def rrY_REV : IXOP5<opc, MRMSrcRegOp4, (outs VR256:$dst),
-        (ins VR256:$src1, VR256:$src2, VR256:$src3, u8imm:$src4),
-        !strconcat(OpcodeStr,
-        "\t{$src4, $src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3, $src4}"),
-        []>, VEX_W, VEX_L;
 }
 
-let ExeDomain = SSEPackedDouble in
-  defm VPERMIL2PD : xop5op<0x49, "vpermil2pd", X86vpermil2, v2f64, v4f64,
-                           v2i64, v4i64, loadv2f64, loadv4f64>;
+let ExeDomain = SSEPackedDouble in {
+  defm VPERMIL2PD : xop_vpermil2<0x49, "vpermil2pd", VR128, i128mem, f128mem,
+                                 v2f64, loadv2f64, loadv2i64>;
+  defm VPERMIL2PDY : xop_vpermil2<0x49, "vpermil2pd", VR256, i256mem, f256mem,
+                                  v4f64, loadv4f64, loadv4i64>, VEX_L;
+}
 
-let ExeDomain = SSEPackedSingle in
-  defm VPERMIL2PS : xop5op<0x48, "vpermil2ps", X86vpermil2, v4f32, v8f32,
-                           v4i32, v8i32, loadv4f32, loadv8f32>;
+let ExeDomain = SSEPackedSingle in {
+  defm VPERMIL2PS : xop_vpermil2<0x48, "vpermil2ps", VR128, i128mem, f128mem,
+                                 v4f32, loadv4f32, loadv2i64>;
+  defm VPERMIL2PSY : xop_vpermil2<0x48, "vpermil2ps", VR256, i256mem, f256mem,
+                                  v8f32, loadv8f32, loadv4i64>, VEX_L;
+}
 
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6cc5e8b63597502644483cb0e4eec7826e0b6e98
--- /dev/null
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -0,0 +1,516 @@
+//===- X86InstructionSelector.cpp ----------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the InstructionSelector class for
+/// X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "X86InstrBuilder.h"
+#include "X86InstrInfo.h"
+#include "X86RegisterBankInfo.h"
+#include "X86RegisterInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "X86-isel"
+
+using namespace llvm;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+namespace {
+
+class X86InstructionSelector : public InstructionSelector {
+public:
+  X86InstructionSelector(const X86Subtarget &STI,
+                         const X86RegisterBankInfo &RBI);
+  /// Selects \p I in place; returns true when it is considered selected.
+  bool select(MachineInstr &I) const override;
+
+private:
+  /// tblgen-erated 'select' implementation for the patterns that don't require
+  /// complex C++; select() falls back to it after the hand-written selectors.
+  bool selectImpl(MachineInstr &I) const;
+
+  // TODO: remove once selectImpl supports patterns with predicates.
+  unsigned getFAddOp(LLT &Ty, const RegisterBank &RB) const;
+  unsigned getFSubOp(LLT &Ty, const RegisterBank &RB) const;
+  unsigned getAddOp(LLT &Ty, const RegisterBank &RB) const;
+  unsigned getSubOp(LLT &Ty, const RegisterBank &RB) const;
+  unsigned getLoadStoreOp(LLT &Ty, const RegisterBank &RB, unsigned Opc,
+                          uint64_t Alignment) const;
+
+  bool selectBinaryOp(MachineInstr &I, MachineRegisterInfo &MRI,
+                      MachineFunction &MF) const;
+  bool selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI,
+                         MachineFunction &MF) const;
+  bool selectFrameIndex(MachineInstr &I, MachineRegisterInfo &MRI,
+                        MachineFunction &MF) const;
+  bool selectConstant(MachineInstr &I, MachineRegisterInfo &MRI,
+                      MachineFunction &MF) const;
+
+  const X86Subtarget &STI;
+  const X86InstrInfo &TII;
+  const X86RegisterInfo &TRI;
+  const X86RegisterBankInfo &RBI;
+
+#define GET_GLOBALISEL_TEMPORARIES_DECL
+#include "X86GenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_DECL
+};
+
+} // end anonymous namespace
+
+#define GET_GLOBALISEL_IMPL
+#include "X86GenGlobalISel.inc"
+#undef GET_GLOBALISEL_IMPL
+
+X86InstructionSelector::X86InstructionSelector(const X86Subtarget &STI,
+                                               const X86RegisterBankInfo &RBI)
+    : InstructionSelector(), STI(STI), TII(*STI.getInstrInfo()),
+      TRI(*STI.getRegisterInfo()), RBI(RBI) // TII/TRI are taken from STI.
+#define GET_GLOBALISEL_TEMPORARIES_INIT
+#include "X86GenGlobalISel.inc"
+#undef GET_GLOBALISEL_TEMPORARIES_INIT
+{
+}
+
+// Returns the register class that holds a value of type \p Ty on bank \p RB.
+// FIXME: This should be target-independent, inferred from the bank's classes.
+static const TargetRegisterClass *
+getRegClassForTypeOnBank(LLT Ty, const RegisterBank &RB) {
+  const unsigned Size = Ty.getSizeInBits();
+  if (RB.getID() == X86::GPRRegBankID) {
+    switch (Size) { // 8/16-bit scalars are emitted by this selector as well.
+    case 8: return &X86::GR8RegClass;
+    case 16: return &X86::GR16RegClass;
+    case 32: return &X86::GR32RegClass;
+    case 64: return &X86::GR64RegClass;
+    default: break;
+    }
+  } else if (RB.getID() == X86::VECRRegBankID) {
+    switch (Size) {
+    case 32: return &X86::FR32XRegClass;
+    case 64: return &X86::FR64XRegClass;
+    case 128: return &X86::VR128XRegClass;
+    case 256: return &X86::VR256XRegClass;
+    case 512: return &X86::VR512RegClass;
+    default: break;
+    }
+  }
+  llvm_unreachable("Unknown RegBank!"); // ...or an unsupported size on it.
+}
+
+// Lower \p I to X86::COPY. A virtual destination register is constrained to
+// the register class matching its type and bank; when the destination is a
+// physical register the instruction is accepted without modification.
+static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
+                       MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
+                       const RegisterBankInfo &RBI) {
+  const unsigned Dst = I.getOperand(0).getReg();
+  if (TargetRegisterInfo::isPhysicalRegister(Dst)) {
+    assert(I.isCopy() && "Generic operators do not allow physical registers");
+    return true;
+  }
+
+  // Gather the facts about both sides of the copy.
+  const RegisterBank &Bank = *RBI.getRegBank(Dst, MRI, TRI);
+  const LLT DstTy = MRI.getType(Dst);
+  const unsigned DstBits = DstTy.getSizeInBits();
+  const unsigned Src = I.getOperand(1).getReg();
+  const unsigned SrcBits = RBI.getSizeInBits(Src, MRI, TRI);
+  const bool SrcIsPhys = TargetRegisterInfo::isPhysicalRegister(Src);
+
+  // These three are only read by the asserts below; the casts keep builds
+  // without assertions free of unused-variable warnings.
+  (void)DstBits;
+  (void)SrcBits;
+  (void)SrcIsPhys;
+
+  assert((!SrcIsPhys || I.isCopy()) && "No phys reg on generic operators");
+  // Copies from a physical register set up the initial types, so there the
+  // destination is allowed to be narrower than the source.
+  assert((DstBits == SrcBits || (SrcIsPhys && DstBits <= SrcBits)) &&
+         "Copy with different width?!");
+
+  // Only the GPR and vector banks are known here.
+  const unsigned BankID = Bank.getID();
+  if (BankID != X86::GPRRegBankID && BankID != X86::VECRRegBankID)
+    llvm_unreachable("Unknown RegBank!");
+  assert((BankID != X86::GPRRegBankID || DstBits <= 64) &&
+         "GPRs cannot get more than 64-bit width values.");
+  const TargetRegisterClass *RC = getRegClassForTypeOnBank(DstTy, Bank);
+
+  // No need to constrain the source register: that happens when another of
+  // its uses, or its def, is selected. Copies themselves add no constraints.
+  const TargetRegisterClass *CurRC = MRI.getRegClassOrNull(Dst);
+  const bool AlreadyFits = CurRC && RC->hasSubClassEq(CurRC);
+  if (!AlreadyFits && !RBI.constrainGenericRegister(Dst, *RC, MRI)) {
+    DEBUG(dbgs() << "Failed to constrain " << TII.getName(I.getOpcode())
+                 << " operand\n");
+    return false;
+  }
+
+  // Finally switch the instruction descriptor over to the target COPY.
+  I.setDesc(TII.get(X86::COPY));
+  return true;
+}
+
+// Driver for one instruction. Non-generic instructions are accepted as-is,
+// except COPY which goes through selectCopy(). Generic instructions are
+// offered to the hand-written selectors and lastly to selectImpl().
+bool X86InstructionSelector::select(MachineInstr &I) const {
+  assert(I.getParent() && "Instruction should be in a basic block!");
+  assert(I.getParent()->getParent() && "Instruction should be in a function!");
+
+  MachineFunction &MF = *I.getParent()->getParent();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  if (!isPreISelGenericOpcode(I.getOpcode())) {
+    // Certain non-generic instructions also need some special handling.
+    // TODO: handle more cases - LOAD_STACK_GUARD, PHI
+    if (!I.isCopy())
+      return true;
+    return selectCopy(I, TII, MRI, TRI, RBI);
+  }
+
+  assert(I.getNumOperands() == I.getNumExplicitOperands() &&
+         "Generic instruction has unexpected implicit operands\n");
+
+  // TODO: This should be implemented by tblgen, pattern with predicate not
+  // supported yet.
+  // Short-circuiting preserves the priority order: the first selector that
+  // succeeds ends the search and the later ones are never consulted.
+  const bool HandWritten = selectBinaryOp(I, MRI, MF) ||
+                           selectLoadStoreOp(I, MRI, MF) ||
+                           selectFrameIndex(I, MRI, MF) ||
+                           selectConstant(I, MRI, MF);
+  if (HandWritten)
+    return true;
+
+  // Nothing above applied; defer to the TableGen-generated selector.
+  return selectImpl(I);
+}
+
+// Choose the register-register FP add opcode for \p Ty, trying AVX-512 first,
+// then AVX, then SSE. Only s32, s64 and <4 x s32> on the vector bank are
+// covered; in every other case the generic G_FADD opcode is returned so the
+// caller can tell that nothing was picked.
+unsigned X86InstructionSelector::getFAddOp(LLT &Ty,
+                                           const RegisterBank &RB) const {
+  if (RB.getID() != X86::VECRRegBankID)
+    return TargetOpcode::G_FADD;
+
+  if (Ty == LLT::scalar(32)) {
+    if (STI.hasAVX512())
+      return X86::VADDSSZrr;
+    if (STI.hasAVX())
+      return X86::VADDSSrr;
+    if (STI.hasSSE1())
+      return X86::ADDSSrr;
+  } else if (Ty == LLT::scalar(64)) {
+    if (STI.hasAVX512())
+      return X86::VADDSDZrr;
+    if (STI.hasAVX())
+      return X86::VADDSDrr;
+    if (STI.hasSSE2())
+      return X86::ADDSDrr;
+  } else if (Ty == LLT::vector(4, 32)) {
+    if (STI.hasAVX512() && STI.hasVLX())
+      return X86::VADDPSZ128rr;
+    if (STI.hasAVX())
+      return X86::VADDPSrr;
+    if (STI.hasSSE1())
+      return X86::ADDPSrr;
+  }
+
+  return TargetOpcode::G_FADD;
+}
+
+// Choose the register-register FP subtract opcode for \p Ty, trying AVX-512
+// first, then AVX, then SSE. Only s32, s64 and <4 x s32> on the vector bank
+// are covered; in every other case the generic G_FSUB opcode is returned so
+// the caller can tell that nothing was picked.
+unsigned X86InstructionSelector::getFSubOp(LLT &Ty,
+                                           const RegisterBank &RB) const {
+  if (RB.getID() != X86::VECRRegBankID)
+    return TargetOpcode::G_FSUB;
+
+  if (Ty == LLT::scalar(32)) {
+    if (STI.hasAVX512())
+      return X86::VSUBSSZrr;
+    if (STI.hasAVX())
+      return X86::VSUBSSrr;
+    if (STI.hasSSE1())
+      return X86::SUBSSrr;
+  } else if (Ty == LLT::scalar(64)) {
+    if (STI.hasAVX512())
+      return X86::VSUBSDZrr;
+    if (STI.hasAVX())
+      return X86::VSUBSDrr;
+    if (STI.hasSSE2())
+      return X86::SUBSDrr;
+  } else if (Ty == LLT::vector(4, 32)) {
+    if (STI.hasAVX512() && STI.hasVLX())
+      return X86::VSUBPSZ128rr;
+    if (STI.hasAVX())
+      return X86::VSUBPSrr;
+    if (STI.hasSSE1())
+      return X86::SUBPSrr;
+  }
+
+  return TargetOpcode::G_FSUB;
+}
+
+// Integer add: only <4 x s32> on the vector bank has a hand-picked opcode;
+// every other combination is reported back unchanged as G_ADD.
+unsigned X86InstructionSelector::getAddOp(LLT &Ty,
+                                          const RegisterBank &RB) const {
+  const bool IsV4S32OnVec =
+      RB.getID() == X86::VECRRegBankID && Ty == LLT::vector(4, 32);
+  if (!IsV4S32OnVec)
+    return TargetOpcode::G_ADD;
+
+  if (STI.hasAVX512() && STI.hasVLX())
+    return X86::VPADDDZ128rr;
+  if (STI.hasAVX())
+    return X86::VPADDDrr;
+  if (STI.hasSSE2())
+    return X86::PADDDrr;
+
+  return TargetOpcode::G_ADD;
+}
+
+// Integer subtract: only <4 x s32> on the vector bank has a hand-picked
+// opcode; every other combination is reported back unchanged as G_SUB.
+unsigned X86InstructionSelector::getSubOp(LLT &Ty,
+                                          const RegisterBank &RB) const {
+  const bool IsV4S32OnVec =
+      RB.getID() == X86::VECRRegBankID && Ty == LLT::vector(4, 32);
+  if (!IsV4S32OnVec)
+    return TargetOpcode::G_SUB;
+
+  if (STI.hasAVX512() && STI.hasVLX())
+    return X86::VPSUBDZ128rr;
+  if (STI.hasAVX())
+    return X86::VPSUBDrr;
+  if (STI.hasSSE2())
+    return X86::PSUBDrr;
+
+  return TargetOpcode::G_SUB;
+}
+
+// Select G_FADD/G_FSUB/G_ADD/G_SUB through the get*Op helpers. Returns false,
+// leaving \p I untouched, for other opcodes or when no X86 opcode was chosen.
+bool X86InstructionSelector::selectBinaryOp(MachineInstr &I,
+                                            MachineRegisterInfo &MRI,
+                                            MachineFunction &MF) const {
+  const unsigned Opc = I.getOpcode();
+  // Check the opcode before looking at operands: select() calls this for
+  // every generic instruction, whose operand 0 may not be a banked register.
+  if (Opc != TargetOpcode::G_FADD && Opc != TargetOpcode::G_FSUB &&
+      Opc != TargetOpcode::G_ADD && Opc != TargetOpcode::G_SUB)
+    return false;
+  const unsigned DefReg = I.getOperand(0).getReg();
+  LLT Ty = MRI.getType(DefReg);
+  const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+  unsigned NewOpc = Opc;
+  switch (Opc) {
+  case TargetOpcode::G_FADD:
+    NewOpc = getFAddOp(Ty, RB);
+    break;
+  case TargetOpcode::G_FSUB:
+    NewOpc = getFSubOp(Ty, RB);
+    break;
+  case TargetOpcode::G_ADD:
+    NewOpc = getAddOp(Ty, RB);
+    break;
+  default: // G_SUB, the only opcode left after the guard above.
+    NewOpc = getSubOp(Ty, RB);
+    break;
+  }
+  if (NewOpc == Opc)
+    return false;
+  I.setDesc(TII.get(NewOpc));
+  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+// Map a G_LOAD/G_STORE of \p Ty on bank \p RB to an X86 move (aligned 128-bit
+// form needs \p Alignment >= 16); yields \p Opc when the case is unhandled.
+unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB,
+                                                unsigned Opc,
+                                                uint64_t Alignment) const {
+  const bool IsLoad = Opc == TargetOpcode::G_LOAD;
+  const unsigned Bank = RB.getID();
+  const bool HasAVX = STI.hasAVX();
+  const bool HasAVX512 = STI.hasAVX512();
+  const bool HasVLX = STI.hasVLX();
+  auto Pick = [IsLoad](unsigned Ld, unsigned St) { return IsLoad ? Ld : St; };
+
+  if (Ty == LLT::scalar(8) && Bank == X86::GPRRegBankID)
+    return Pick(X86::MOV8rm, X86::MOV8mr);
+  if (Ty == LLT::scalar(16) && Bank == X86::GPRRegBankID)
+    return Pick(X86::MOV16rm, X86::MOV16mr);
+  if (Ty == LLT::scalar(32)) {
+    if (Bank == X86::GPRRegBankID)
+      return Pick(X86::MOV32rm, X86::MOV32mr);
+    if (Bank == X86::VECRRegBankID) {
+      if (HasAVX512)
+        return Pick(X86::VMOVSSZrm, X86::VMOVSSZmr);
+      if (HasAVX)
+        return Pick(X86::VMOVSSrm, X86::VMOVSSmr);
+      return Pick(X86::MOVSSrm, X86::MOVSSmr);
+    }
+  } else if (Ty == LLT::scalar(64)) {
+    if (Bank == X86::GPRRegBankID)
+      return Pick(X86::MOV64rm, X86::MOV64mr);
+    if (Bank == X86::VECRRegBankID) {
+      if (HasAVX512)
+        return Pick(X86::VMOVSDZrm, X86::VMOVSDZmr);
+      if (HasAVX)
+        return Pick(X86::VMOVSDrm, X86::VMOVSDmr);
+      return Pick(X86::MOVSDrm, X86::MOVSDmr);
+    }
+  } else if (Ty.isVector() && Ty.getSizeInBits() == 128) {
+    const bool Aligned = Alignment >= 16;
+    if (HasVLX)
+      return Aligned ? Pick(X86::VMOVAPSZ128rm, X86::VMOVAPSZ128mr)
+                     : Pick(X86::VMOVUPSZ128rm, X86::VMOVUPSZ128mr);
+    if (HasAVX512)
+      return Aligned ? Pick(X86::VMOVAPSZ128rm_NOVLX, X86::VMOVAPSZ128mr_NOVLX)
+                     : Pick(X86::VMOVUPSZ128rm_NOVLX, X86::VMOVUPSZ128mr_NOVLX);
+    if (HasAVX)
+      return Aligned ? Pick(X86::VMOVAPSrm, X86::VMOVAPSmr)
+                     : Pick(X86::VMOVUPSrm, X86::VMOVUPSmr);
+    return Aligned ? Pick(X86::MOVAPSrm, X86::MOVAPSmr)
+                   : Pick(X86::MOVUPSrm, X86::MOVUPSmr);
+  }
+  return Opc;
+}
+
+bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
+                                               MachineRegisterInfo &MRI,
+                                               MachineFunction &MF) const {
+  // Rewrites a G_LOAD/G_STORE into the X86 move picked by getLoadStoreOp().
+  unsigned Opc = I.getOpcode();
+  if (Opc != TargetOpcode::G_STORE && Opc != TargetOpcode::G_LOAD)
+    return false;
+  // MemOp below dereferences the first memory operand; bail out if absent.
+  if (I.memoperands_empty())
+    return false;
+
+  const unsigned DefReg = I.getOperand(0).getReg();
+  LLT Ty = MRI.getType(DefReg);
+  const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI);
+  auto &MemOp = **I.memoperands_begin();
+  unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlignment());
+  if (NewOpc == Opc)
+    return false;
+  I.setDesc(TII.get(NewOpc));
+  MachineInstrBuilder MIB(MF, I);
+  if (Opc == TargetOpcode::G_LOAD)
+    addOffset(MIB, 0);
+  else {
+    // G_STORE (VAL, Addr), X86Store instruction (Addr, VAL)
+    I.RemoveOperand(0);
+    addOffset(MIB, 0).addUse(DefReg);
+  }
+  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+// Select G_FRAME_INDEX as an LEA; the LEA flavour follows the pointer width
+// (and STI.isTarget64BitILP32() for 32-bit pointers). Returns false for any
+// other opcode.
+bool X86InstructionSelector::selectFrameIndex(MachineInstr &I,
+                                              MachineRegisterInfo &MRI,
+                                              MachineFunction &MF) const {
+  if (I.getOpcode() != TargetOpcode::G_FRAME_INDEX)
+    return false;
+
+  const LLT PtrTy = MRI.getType(I.getOperand(0).getReg());
+  const bool Is64 = PtrTy == LLT::pointer(0, 64);
+  if (!Is64 && !(PtrTy == LLT::pointer(0, 32)))
+    llvm_unreachable("Can't select G_FRAME_INDEX, unsupported type.");
+
+  // Use LEA to calculate frame index.
+  const unsigned LeaOpc =
+      Is64 ? X86::LEA64r
+           : STI.isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r;
+
+  I.setDesc(TII.get(LeaOpc));
+  MachineInstrBuilder MIB(MF, I);
+  addOffset(MIB, 0);
+  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+// Select a scalar G_CONSTANT as a move-immediate whose width matches the
+// destination type; a ConstantInt operand is first rewritten as a plain
+// immediate. Returns false for any opcode other than G_CONSTANT.
+bool X86InstructionSelector::selectConstant(MachineInstr &I,
+                                            MachineRegisterInfo &MRI,
+                                            MachineFunction &MF) const {
+  if (I.getOpcode() != TargetOpcode::G_CONSTANT)
+    return false;
+
+  const LLT Ty = MRI.getType(I.getOperand(0).getReg());
+  assert(Ty.isScalar() && "invalid element type.");
+
+  // Normalise the value operand to an immediate and remember its bits.
+  MachineOperand &ValOp = I.getOperand(1);
+  uint64_t Imm = 0;
+  if (ValOp.isCImm()) {
+    Imm = ValOp.getCImm()->getZExtValue();
+    ValOp.ChangeToImmediate(Imm);
+  } else if (ValOp.isImm()) {
+    Imm = ValOp.getImm();
+  } else {
+    llvm_unreachable("Unsupported operand type.");
+  }
+
+  unsigned MovOpc;
+  switch (Ty.getSizeInBits()) {
+  case 8:
+    MovOpc = X86::MOV8ri;
+    break;
+  case 16:
+    MovOpc = X86::MOV16ri;
+    break;
+  case 32:
+    MovOpc = X86::MOV32ri;
+    break;
+  case 64:
+    // TODO: in case isUInt<32>(Imm), X86::MOV32ri can be used
+    MovOpc = isInt<32>(Imm) ? X86::MOV64ri32 : X86::MOV64ri;
+    break;
+  default:
+    llvm_unreachable("Can't select G_CONSTANT, unsupported type.");
+  }
+
+  I.setDesc(TII.get(MovOpc));
+  return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+}
+
+InstructionSelector *
+llvm::createX86InstructionSelector(X86Subtarget &Subtarget,
+                                   X86RegisterBankInfo &RBI) {
+  return new X86InstructionSelector(Subtarget, RBI); // raw new'ed pointer
+}
diff --git a/lib/Target/X86/X86InterleavedAccess.cpp b/lib/Target/X86/X86InterleavedAccess.cpp
index d9edf4676faf8b02265983749fbda23db373032b..806d6cc888f0f73fcde962a7d7a5b052eb423289 100644
--- a/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/lib/Target/X86/X86InterleavedAccess.cpp
@@ -19,6 +19,7 @@
 
 using namespace llvm;
 
+namespace {
 /// \brief This class holds necessary information to represent an interleaved
 /// access group and supports utilities to lower the group into
 /// X86-specific instructions/intrinsics.
@@ -27,7 +28,6 @@ using namespace llvm;
 ///        %wide.vec = load <8 x i32>, <8 x i32>* %ptr
 ///        %v0 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <0, 2, 4, 6>
 ///        %v1 = shuffle <8 x i32> %wide.vec, <8 x i32> undef, <1, 3, 5, 7>
-
 class X86InterleavedAccessGroup {
   /// \brief Reference to the wide-load instruction of an interleaved access
   /// group.
@@ -95,6 +95,7 @@ public:
   /// instructions/intrinsics.
   bool lowerIntoOptimizedSequence();
 };
+} // end anonymous namespace
 
 bool X86InterleavedAccessGroup::isSupported() const {
   VectorType *ShuffleVecTy = Shuffles[0]->getType();
diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h
index 63a02af02faada8b6e888b48613127994af3fff8..2a40399ba5712d515c2034c87aafe1fed400e128 100644
--- a/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/lib/Target/X86/X86IntrinsicsInfo.h
@@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t {
   TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32,
   EXPAND_FROM_MEM,
   TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS,
-  FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK
+  FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP,
 };
 
 struct IntrinsicData {
@@ -67,6 +67,23 @@ static const IntrinsicData IntrinsicsWithChain[] = {
   X86_INTRINSIC_DATA(addcarryx_u32, ADX, X86ISD::ADC, 0),
   X86_INTRINSIC_DATA(addcarryx_u64, ADX, X86ISD::ADC, 0),
 
+  X86_INTRINSIC_DATA(avx2_gather_d_d,      GATHER_AVX2, X86::VPGATHERDDrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_d_d_256,  GATHER_AVX2, X86::VPGATHERDDYrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_d_pd,     GATHER_AVX2, X86::VGATHERDPDrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_d_pd_256, GATHER_AVX2, X86::VGATHERDPDYrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_d_ps,     GATHER_AVX2, X86::VGATHERDPSrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_d_ps_256, GATHER_AVX2, X86::VGATHERDPSYrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_d_q,      GATHER_AVX2, X86::VPGATHERDQrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_d_q_256,  GATHER_AVX2, X86::VPGATHERDQYrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_q_d,      GATHER_AVX2, X86::VPGATHERQDrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_q_d_256,  GATHER_AVX2, X86::VPGATHERQDYrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_q_pd,     GATHER_AVX2, X86::VGATHERQPDrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_q_pd_256, GATHER_AVX2, X86::VGATHERQPDYrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_q_ps,     GATHER_AVX2, X86::VGATHERQPSrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_q_ps_256, GATHER_AVX2, X86::VGATHERQPSYrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_q_q,      GATHER_AVX2, X86::VPGATHERQQrm, 0),
+  X86_INTRINSIC_DATA(avx2_gather_q_q_256,  GATHER_AVX2, X86::VPGATHERQQYrm, 0),
+
   X86_INTRINSIC_DATA(avx512_gather_dpd_512, GATHER, X86::VGATHERDPDZrm, 0),
   X86_INTRINSIC_DATA(avx512_gather_dpi_512, GATHER, X86::VPGATHERDDZrm, 0),
   X86_INTRINSIC_DATA(avx512_gather_dpq_512, GATHER, X86::VPGATHERDQZrm, 0),
@@ -325,6 +342,8 @@ static const IntrinsicData* getIntrinsicWithChain(uint16_t IntNo) {
  * the alphabetical order.
  */
 static const IntrinsicData  IntrinsicsWithoutChain[] = {
+  X86_INTRINSIC_DATA(avx_cmp_pd_256,    INTR_TYPE_3OP, X86ISD::CMPP, 0),
+  X86_INTRINSIC_DATA(avx_cmp_ps_256,    INTR_TYPE_3OP, X86ISD::CMPP, 0),
   X86_INTRINSIC_DATA(avx_cvt_pd2_ps_256,CVTPD2PS, ISD::FP_ROUND, 0),
   X86_INTRINSIC_DATA(avx_cvt_pd2dq_256, INTR_TYPE_1OP, X86ISD::CVTP2SI, 0),
   X86_INTRINSIC_DATA(avx_cvtdq2_ps_256, INTR_TYPE_1OP, ISD::SINT_TO_FP, 0),
@@ -351,9 +370,9 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
   X86_INTRINSIC_DATA(avx_vpermilvar_ps,     INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
   X86_INTRINSIC_DATA(avx_vpermilvar_ps_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
-  X86_INTRINSIC_DATA(avx2_pabs_b, INTR_TYPE_1OP, X86ISD::ABS, 0),
-  X86_INTRINSIC_DATA(avx2_pabs_d, INTR_TYPE_1OP, X86ISD::ABS, 0),
-  X86_INTRINSIC_DATA(avx2_pabs_w, INTR_TYPE_1OP, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx2_pabs_b, INTR_TYPE_1OP, ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx2_pabs_d, INTR_TYPE_1OP, ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx2_pabs_w, INTR_TYPE_1OP, ISD::ABS, 0),
   X86_INTRINSIC_DATA(avx2_packssdw, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
@@ -421,18 +440,6 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_cvtd2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
   X86_INTRINSIC_DATA(avx512_cvtd2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
   X86_INTRINSIC_DATA(avx512_cvtd2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
-  X86_INTRINSIC_DATA(avx512_cvtmask2b_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
-  X86_INTRINSIC_DATA(avx512_cvtmask2b_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
-  X86_INTRINSIC_DATA(avx512_cvtmask2b_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
-  X86_INTRINSIC_DATA(avx512_cvtmask2d_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
-  X86_INTRINSIC_DATA(avx512_cvtmask2d_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
-  X86_INTRINSIC_DATA(avx512_cvtmask2d_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
-  X86_INTRINSIC_DATA(avx512_cvtmask2q_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
-  X86_INTRINSIC_DATA(avx512_cvtmask2q_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
-  X86_INTRINSIC_DATA(avx512_cvtmask2q_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
-  X86_INTRINSIC_DATA(avx512_cvtmask2w_128, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
-  X86_INTRINSIC_DATA(avx512_cvtmask2w_256, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
-  X86_INTRINSIC_DATA(avx512_cvtmask2w_512, CONVERT_MASK_TO_VEC, X86ISD::VSEXT, 0),
   X86_INTRINSIC_DATA(avx512_cvtq2mask_128, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
   X86_INTRINSIC_DATA(avx512_cvtq2mask_256, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
   X86_INTRINSIC_DATA(avx512_cvtq2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
@@ -455,18 +462,20 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_cvtw2mask_512, CONVERT_TO_MASK, X86ISD::CVT2MASK, 0),
   X86_INTRINSIC_DATA(avx512_exp2_pd, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
   X86_INTRINSIC_DATA(avx512_exp2_ps, INTR_TYPE_1OP_MASK_RM, X86ISD::EXP2, 0),
+  X86_INTRINSIC_DATA(avx512_kand_w, MASK_BINOP, ISD::AND, 0),
+  X86_INTRINSIC_DATA(avx512_kor_w, MASK_BINOP, ISD::OR, 0),
   X86_INTRINSIC_DATA(avx512_kunpck_bw, KUNPCK, ISD::CONCAT_VECTORS, 0),
   X86_INTRINSIC_DATA(avx512_kunpck_dq, KUNPCK, ISD::CONCAT_VECTORS, 0),
   X86_INTRINSIC_DATA(avx512_kunpck_wd, KUNPCK, ISD::CONCAT_VECTORS, 0),
-
+  X86_INTRINSIC_DATA(avx512_kxor_w, MASK_BINOP, ISD::XOR, 0),
   X86_INTRINSIC_DATA(avx512_mask_add_pd_512, INTR_TYPE_2OP_MASK, ISD::FADD,
   X86ISD::FADD_RND),
   X86_INTRINSIC_DATA(avx512_mask_add_ps_512, INTR_TYPE_2OP_MASK, ISD::FADD,
   X86ISD::FADD_RND),
   X86_INTRINSIC_DATA(avx512_mask_add_sd_round, INTR_TYPE_SCALAR_MASK_RM,
-                     X86ISD::FADD_RND, 0),
+                     X86ISD::FADDS_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_add_ss_round, INTR_TYPE_SCALAR_MASK_RM,
-                     X86ISD::FADD_RND, 0),
+                     X86ISD::FADDS_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_256, BRCST32x2_TO_VEC,
                      X86ISD::VBROADCAST, 0),
   X86_INTRINSIC_DATA(avx512_mask_broadcastf32x2_512, BRCST32x2_TO_VEC,
@@ -720,9 +729,9 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask_div_ps_512, INTR_TYPE_2OP_MASK, ISD::FDIV,
                      X86ISD::FDIV_RND),
   X86_INTRINSIC_DATA(avx512_mask_div_sd_round, INTR_TYPE_SCALAR_MASK_RM,
-                     X86ISD::FDIV_RND, 0),
+                     X86ISD::FDIVS_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_div_ss_round, INTR_TYPE_SCALAR_MASK_RM,
-                     X86ISD::FDIV_RND, 0),
+                     X86ISD::FDIVS_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_expand_d_128,  COMPRESS_EXPAND_IN_REG,
                      X86ISD::EXPAND, 0),
   X86_INTRINSIC_DATA(avx512_mask_expand_d_256,  COMPRESS_EXPAND_IN_REG,
@@ -795,74 +804,42 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
                      X86ISD::VGETMANTS, 0),
   X86_INTRINSIC_DATA(avx512_mask_getmant_ss, INTR_TYPE_3OP_SCALAR_MASK_RM,
                      X86ISD::VGETMANTS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_128, INTR_TYPE_1OP_MASK,
-                     ISD::CTLZ, 0),
-  X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_256, INTR_TYPE_1OP_MASK,
-                     ISD::CTLZ, 0),
-  X86_INTRINSIC_DATA(avx512_mask_lzcnt_d_512, INTR_TYPE_1OP_MASK,
-                     ISD::CTLZ, 0),
-  X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_128, INTR_TYPE_1OP_MASK,
-                     ISD::CTLZ, 0),
-  X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_256, INTR_TYPE_1OP_MASK,
-                     ISD::CTLZ, 0),
-  X86_INTRINSIC_DATA(avx512_mask_lzcnt_q_512, INTR_TYPE_1OP_MASK,
-                     ISD::CTLZ, 0),
-  X86_INTRINSIC_DATA(avx512_mask_max_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
-  X86_INTRINSIC_DATA(avx512_mask_max_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
   X86_INTRINSIC_DATA(avx512_mask_max_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
                      X86ISD::FMAX_RND),
-  X86_INTRINSIC_DATA(avx512_mask_max_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
-  X86_INTRINSIC_DATA(avx512_mask_max_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMAX, 0),
   X86_INTRINSIC_DATA(avx512_mask_max_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMAX,
                      X86ISD::FMAX_RND),
-  X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK_RM,
-                     X86ISD::FMAX_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK_RM,
-                     X86ISD::FMAX_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask_min_pd_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
-  X86_INTRINSIC_DATA(avx512_mask_min_pd_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
+  X86_INTRINSIC_DATA(avx512_mask_max_sd_round, INTR_TYPE_SCALAR_MASK,
+                     X86ISD::FMAXS, X86ISD::FMAXS_RND),
+  X86_INTRINSIC_DATA(avx512_mask_max_ss_round, INTR_TYPE_SCALAR_MASK,
+                     X86ISD::FMAXS, X86ISD::FMAXS_RND),
   X86_INTRINSIC_DATA(avx512_mask_min_pd_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
                      X86ISD::FMIN_RND),
-  X86_INTRINSIC_DATA(avx512_mask_min_ps_128, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
-  X86_INTRINSIC_DATA(avx512_mask_min_ps_256, INTR_TYPE_2OP_MASK, X86ISD::FMIN, 0),
   X86_INTRINSIC_DATA(avx512_mask_min_ps_512, INTR_TYPE_2OP_MASK, X86ISD::FMIN,
                      X86ISD::FMIN_RND),
-  X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK_RM,
-                     X86ISD::FMIN_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK_RM,
-                     X86ISD::FMIN_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_min_sd_round, INTR_TYPE_SCALAR_MASK,
+                     X86ISD::FMINS, X86ISD::FMINS_RND),
+  X86_INTRINSIC_DATA(avx512_mask_min_ss_round, INTR_TYPE_SCALAR_MASK,
+                     X86ISD::FMINS, X86ISD::FMINS_RND),
   X86_INTRINSIC_DATA(avx512_mask_mul_pd_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
                      X86ISD::FMUL_RND),
   X86_INTRINSIC_DATA(avx512_mask_mul_ps_512, INTR_TYPE_2OP_MASK, ISD::FMUL,
                      X86ISD::FMUL_RND),
   X86_INTRINSIC_DATA(avx512_mask_mul_sd_round, INTR_TYPE_SCALAR_MASK_RM,
-                     X86ISD::FMUL_RND, 0),
+                     X86ISD::FMULS_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_mul_ss_round, INTR_TYPE_SCALAR_MASK_RM,
-                     X86ISD::FMUL_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pabs_b_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pabs_b_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pabs_b_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pabs_d_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pabs_d_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pabs_d_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pabs_q_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pabs_q_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pabs_q_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pabs_w_128, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pabs_w_256, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_pabs_w_512, INTR_TYPE_1OP_MASK, X86ISD::ABS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_packssdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_packssdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_packssdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_packsswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_packsswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_packsswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKSS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_packusdw_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_packusdw_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_packusdw_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_packuswb_128, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_packuswb_256, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_packuswb_512, INTR_TYPE_2OP_MASK, X86ISD::PACKUS, 0),
+                     X86ISD::FMULS_RND, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_b_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_b_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_b_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_d_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_d_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_d_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_q_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_q_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_q_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_w_128, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_w_256, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
+  X86_INTRINSIC_DATA(avx512_mask_pabs_w_512, INTR_TYPE_1OP_MASK, ISD::ABS, 0),
   X86_INTRINSIC_DATA(avx512_mask_padds_b_128, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
   X86_INTRINSIC_DATA(avx512_mask_padds_b_256, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
   X86_INTRINSIC_DATA(avx512_mask_padds_b_512, INTR_TYPE_2OP_MASK, X86ISD::ADDS, 0),
@@ -1191,9 +1168,9 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
                      X86ISD::FSUB_RND),
   X86_INTRINSIC_DATA(avx512_mask_sub_sd_round, INTR_TYPE_SCALAR_MASK_RM,
-                     X86ISD::FSUB_RND, 0),
+                     X86ISD::FSUBS_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK_RM,
-                     X86ISD::FSUB_RND, 0),
+                     X86ISD::FSUBS_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_ucmp_b_128,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
   X86_INTRINSIC_DATA(avx512_mask_ucmp_b_256,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
   X86_INTRINSIC_DATA(avx512_mask_ucmp_b_512,    CMP_MASK_CC,  X86ISD::CMPMU, 0),
@@ -1486,6 +1463,10 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
                      X86ISD::VPMADD52L, 0),
   X86_INTRINSIC_DATA(avx512_maskz_vpmadd52l_uq_512, FMA_OP_MASKZ,
                      X86ISD::VPMADD52L, 0),
+  X86_INTRINSIC_DATA(avx512_packssdw_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+  X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
+  X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
+  X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx512_pmul_dq_512, INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
   X86_INTRINSIC_DATA(avx512_pmulu_dq_512, INTR_TYPE_2OP, X86ISD::PMULUDQ, 0),
   X86_INTRINSIC_DATA(avx512_psad_bw_512, INTR_TYPE_2OP, X86ISD::PSADBW, 0),
@@ -1613,6 +1594,7 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(fma_vfnmsub_pd_256,   INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
   X86_INTRINSIC_DATA(fma_vfnmsub_ps,       INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
   X86_INTRINSIC_DATA(fma_vfnmsub_ps_256,   INTR_TYPE_3OP, X86ISD::FNMSUB, 0),
+  X86_INTRINSIC_DATA(sse_cmp_ps,        INTR_TYPE_3OP, X86ISD::CMPP, 0),
   X86_INTRINSIC_DATA(sse_comieq_ss,     COMI, X86ISD::COMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse_comige_ss,     COMI, X86ISD::COMI, ISD::SETGE),
   X86_INTRINSIC_DATA(sse_comigt_ss,     COMI, X86ISD::COMI, ISD::SETGT),
@@ -1620,7 +1602,9 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse_comilt_ss,     COMI, X86ISD::COMI, ISD::SETLT),
   X86_INTRINSIC_DATA(sse_comineq_ss,    COMI, X86ISD::COMI, ISD::SETNE),
   X86_INTRINSIC_DATA(sse_max_ps,        INTR_TYPE_2OP, X86ISD::FMAX, 0),
+  X86_INTRINSIC_DATA(sse_max_ss,        INTR_TYPE_2OP, X86ISD::FMAXS, 0),
   X86_INTRINSIC_DATA(sse_min_ps,        INTR_TYPE_2OP, X86ISD::FMIN, 0),
+  X86_INTRINSIC_DATA(sse_min_ss,        INTR_TYPE_2OP, X86ISD::FMINS, 0),
   X86_INTRINSIC_DATA(sse_movmsk_ps,     INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(sse_rcp_ps,        INTR_TYPE_1OP, X86ISD::FRCP, 0),
   X86_INTRINSIC_DATA(sse_rsqrt_ps,      INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
@@ -1631,6 +1615,7 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse_ucomile_ss,    COMI, X86ISD::UCOMI, ISD::SETLE),
   X86_INTRINSIC_DATA(sse_ucomilt_ss,    COMI, X86ISD::UCOMI, ISD::SETLT),
   X86_INTRINSIC_DATA(sse_ucomineq_ss,   COMI, X86ISD::UCOMI, ISD::SETNE),
+  X86_INTRINSIC_DATA(sse2_cmp_pd,       INTR_TYPE_3OP, X86ISD::CMPP, 0),
   X86_INTRINSIC_DATA(sse2_comieq_sd,    COMI, X86ISD::COMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse2_comige_sd,    COMI, X86ISD::COMI, ISD::SETGE),
   X86_INTRINSIC_DATA(sse2_comigt_sd,    COMI, X86ISD::COMI, ISD::SETGT),
@@ -1643,7 +1628,9 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse2_cvttpd2dq,    INTR_TYPE_1OP, X86ISD::CVTTP2SI, 0),
   X86_INTRINSIC_DATA(sse2_cvttps2dq,    INTR_TYPE_1OP, ISD::FP_TO_SINT, 0),
   X86_INTRINSIC_DATA(sse2_max_pd,       INTR_TYPE_2OP, X86ISD::FMAX, 0),
+  X86_INTRINSIC_DATA(sse2_max_sd,       INTR_TYPE_2OP, X86ISD::FMAXS, 0),
   X86_INTRINSIC_DATA(sse2_min_pd,       INTR_TYPE_2OP, X86ISD::FMIN, 0),
+  X86_INTRINSIC_DATA(sse2_min_sd,       INTR_TYPE_2OP, X86ISD::FMINS, 0),
   X86_INTRINSIC_DATA(sse2_movmsk_pd,    INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
@@ -1696,9 +1683,9 @@ static const IntrinsicData  IntrinsicsWithoutChain[] = {
   X86_INTRINSIC_DATA(sse41_pmuldq,      INTR_TYPE_2OP, X86ISD::PMULDQ, 0),
   X86_INTRINSIC_DATA(sse4a_extrqi,      INTR_TYPE_3OP, X86ISD::EXTRQI, 0),
   X86_INTRINSIC_DATA(sse4a_insertqi,    INTR_TYPE_4OP, X86ISD::INSERTQI, 0),
-  X86_INTRINSIC_DATA(ssse3_pabs_b_128,  INTR_TYPE_1OP, X86ISD::ABS, 0),
-  X86_INTRINSIC_DATA(ssse3_pabs_d_128,  INTR_TYPE_1OP, X86ISD::ABS, 0),
-  X86_INTRINSIC_DATA(ssse3_pabs_w_128,  INTR_TYPE_1OP, X86ISD::ABS, 0),
+  X86_INTRINSIC_DATA(ssse3_pabs_b_128,  INTR_TYPE_1OP, ISD::ABS, 0),
+  X86_INTRINSIC_DATA(ssse3_pabs_d_128,  INTR_TYPE_1OP, ISD::ABS, 0),
+  X86_INTRINSIC_DATA(ssse3_pabs_w_128,  INTR_TYPE_1OP, ISD::ABS, 0),
   X86_INTRINSIC_DATA(ssse3_phadd_d_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
   X86_INTRINSIC_DATA(ssse3_phadd_w_128, INTR_TYPE_2OP, X86ISD::HADD, 0),
   X86_INTRINSIC_DATA(ssse3_phsub_d_128, INTR_TYPE_2OP, X86ISD::HSUB, 0),
diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c2dc762fec5eb30ab077329881ec8303c6841429
--- /dev/null
+++ b/lib/Target/X86/X86LegalizerInfo.cpp
@@ -0,0 +1,142 @@
+//===- X86LegalizerInfo.cpp --------------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the Machinelegalizer class for X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "X86LegalizerInfo.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/CodeGen/ValueTypes.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Type.h"
+#include "llvm/Target/TargetOpcodes.h"
+
+using namespace llvm;
+using namespace TargetOpcode;
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
+                                   const X86TargetMachine &TM)
+    : Subtarget(STI), TM(TM) {
+  // Each helper returns early unless its own subtarget check passes.
+  setLegalizerInfo32bit();
+  setLegalizerInfo64bit();
+  setLegalizerInfoSSE1();
+  setLegalizerInfoSSE2();
+  // Called last, once every setAction() issued by the helpers has run.
+  computeTables();
+}
+
+void X86LegalizerInfo::setLegalizerInfo32bit() {
+  // 32-bit rules only: skipped entirely when the subtarget is 64-bit.
+  if (Subtarget.is64Bit())
+    return;
+
+  const LLT p0 = LLT::pointer(0, 32);
+  const LLT s1 = LLT::scalar(1);
+  const LLT s8 = LLT::scalar(8);
+  const LLT s16 = LLT::scalar(16);
+  const LLT s32 = LLT::scalar(32);
+  const LLT s64 = LLT::scalar(64);
+  // Integer add/sub are legal on s8, s16 and s32.
+  for (unsigned BinOp : {G_ADD, G_SUB})
+    for (auto Ty : {s8, s16, s32})
+      setAction({BinOp, Ty}, Legal);
+  // Loads and stores of the same scalars, plus p0 pointers, are legal.
+  for (unsigned MemOp : {G_LOAD, G_STORE}) {
+    for (auto Ty : {s8, s16, s32, p0})
+      setAction({MemOp, Ty}, Legal);
+
+    // And everything's fine in addrspace 0.
+    setAction({MemOp, 1, p0}, Legal);
+  }
+
+  // Pointer-handling
+  setAction({G_FRAME_INDEX, p0}, Legal);
+
+  // Constants
+  for (auto Ty : {s8, s16, s32, p0})
+    setAction({TargetOpcode::G_CONSTANT, Ty}, Legal);
+  // s1 constants are widened and s64 constants narrowed instead of legal.
+  setAction({TargetOpcode::G_CONSTANT, s1}, WidenScalar);
+  setAction({TargetOpcode::G_CONSTANT, s64}, NarrowScalar);
+}
+
+void X86LegalizerInfo::setLegalizerInfo64bit() {
+  // 64-bit rules only: skipped entirely when the subtarget is not 64-bit.
+  if (!Subtarget.is64Bit())
+    return;
+  // p0 width is the target machine's pointer size multiplied by 8.
+  const LLT p0 = LLT::pointer(0, TM.getPointerSize() * 8);
+  const LLT s1 = LLT::scalar(1);
+  const LLT s8 = LLT::scalar(8);
+  const LLT s16 = LLT::scalar(16);
+  const LLT s32 = LLT::scalar(32);
+  const LLT s64 = LLT::scalar(64);
+  // Integer add/sub are legal on s8, s16, s32 and s64.
+  for (unsigned BinOp : {G_ADD, G_SUB})
+    for (auto Ty : {s8, s16, s32, s64})
+      setAction({BinOp, Ty}, Legal);
+
+  for (unsigned MemOp : {G_LOAD, G_STORE}) {
+    for (auto Ty : {s8, s16, s32, s64, p0})
+      setAction({MemOp, Ty}, Legal);
+
+    // And everything's fine in addrspace 0.
+    setAction({MemOp, 1, p0}, Legal);
+  }
+
+  // Pointer-handling
+  setAction({G_FRAME_INDEX, p0}, Legal);
+
+  // Constants
+  for (auto Ty : {s8, s16, s32, s64, p0})
+    setAction({TargetOpcode::G_CONSTANT, Ty}, Legal);
+  // s1 constants are widened instead of being legal.
+  setAction({TargetOpcode::G_CONSTANT, s1}, WidenScalar);
+}
+
+void X86LegalizerInfo::setLegalizerInfoSSE1() {
+  if (!Subtarget.hasSSE1())
+    return;
+
+  const LLT s32 = LLT::scalar(32);
+  const LLT v4s32 = LLT::vector(4, 32);
+  const LLT v2s64 = LLT::vector(2, 64);
+  // Floating-point add/sub/mul/div are legal on s32 and v4s32.
+  for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
+    for (auto Ty : {s32, v4s32})
+      setAction({BinOp, Ty}, Legal);
+  // Loads and stores are legal for both 128-bit vector types.
+  for (unsigned MemOp : {G_LOAD, G_STORE})
+    for (auto Ty : {v4s32, v2s64})
+      setAction({MemOp, Ty}, Legal);
+}
+
+void X86LegalizerInfo::setLegalizerInfoSSE2() {
+  if (!Subtarget.hasSSE2())
+    return;
+
+  const LLT s64 = LLT::scalar(64);
+  const LLT v4s32 = LLT::vector(4, 32);
+  const LLT v2s64 = LLT::vector(2, 64);
+  // Floating-point add/sub/mul/div are legal on s64 and v2s64.
+  for (unsigned BinOp : {G_FADD, G_FSUB, G_FMUL, G_FDIV})
+    for (auto Ty : {s64, v2s64})
+      setAction({BinOp, Ty}, Legal);
+  // Integer add/sub are legal on v4s32.
+  for (unsigned BinOp : {G_ADD, G_SUB})
+    for (auto Ty : {v4s32})
+      setAction({BinOp, Ty}, Legal);
+}
diff --git a/lib/Target/X86/X86LegalizerInfo.h b/lib/Target/X86/X86LegalizerInfo.h
new file mode 100644
index 0000000000000000000000000000000000000000..3f00898b42322b27dc65db16ad3995f84a3be628
--- /dev/null
+++ b/lib/Target/X86/X86LegalizerInfo.h
@@ -0,0 +1,43 @@
+//===- X86LegalizerInfo.h ------------------------------------------*- C++
+//-*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the Machinelegalizer class for X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86MACHINELEGALIZER_H
+#define LLVM_LIB_TARGET_X86_X86MACHINELEGALIZER_H
+
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+
+namespace llvm {
+
+class X86Subtarget;
+class X86TargetMachine;
+
+/// This class provides the legalization rules (LegalizerInfo) for X86.
+class X86LegalizerInfo : public LegalizerInfo {
+private:
+  /// Keep a reference to the X86Subtarget around so that we can
+  /// make the right decision when generating code for different targets.
+  const X86Subtarget &Subtarget;
+  const X86TargetMachine &TM; ///< Queried for the target pointer size.
+
+public:
+  X86LegalizerInfo(const X86Subtarget &STI, const X86TargetMachine &TM);
+
+private:
+  void setLegalizerInfo32bit(); // No-op on 64-bit subtargets.
+  void setLegalizerInfo64bit(); // No-op on non-64-bit subtargets.
+  void setLegalizerInfoSSE1();  // No-op without SSE1.
+  void setLegalizerInfoSSE2();  // No-op without SSE2.
+};
+} // namespace llvm
+#endif
diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp
index 8fa431412259838ea6c5162dc5672f2c92153290..55b090b67640f4b65b56203b32686d115199ca32 100644
--- a/lib/Target/X86/X86MCInstLower.cpp
+++ b/lib/Target/X86/X86MCInstLower.cpp
@@ -215,6 +215,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
   case X86II::MO_GOT:       RefKind = MCSymbolRefExpr::VK_GOT; break;
   case X86II::MO_GOTOFF:    RefKind = MCSymbolRefExpr::VK_GOTOFF; break;
   case X86II::MO_PLT:       RefKind = MCSymbolRefExpr::VK_PLT; break;
+  case X86II::MO_ABS8:      RefKind = MCSymbolRefExpr::VK_X86_ABS8; break;
   case X86II::MO_PIC_BASE_OFFSET:
   case X86II::MO_DARWIN_NONLAZY_PIC_BASE:
     Expr = MCSymbolRefExpr::create(Sym, Ctx);
@@ -893,30 +894,47 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
   SM.recordStatepoint(MI);
 }
 
-void X86AsmPrinter::LowerFAULTING_LOAD_OP(const MachineInstr &MI,
-                                       X86MCInstLower &MCIL) {
-  // FAULTING_LOAD_OP <def>, <MBB handler>, <load opcode>, <load operands>
+void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
+                                     X86MCInstLower &MCIL) {
+  // FAULTING_OP <def>, <fault kind>, <MBB handler>,
+  //             <opcode>, <operands>
 
-  unsigned LoadDefRegister = MI.getOperand(0).getReg();
-  MCSymbol *HandlerLabel = MI.getOperand(1).getMBB()->getSymbol();
-  unsigned LoadOpcode = MI.getOperand(2).getImm();
-  unsigned LoadOperandsBeginIdx = 3;
+  unsigned DefRegister = FaultingMI.getOperand(0).getReg();
+  FaultMaps::FaultKind FK =
+      static_cast<FaultMaps::FaultKind>(FaultingMI.getOperand(1).getImm());
+  MCSymbol *HandlerLabel = FaultingMI.getOperand(2).getMBB()->getSymbol();
+  unsigned Opcode = FaultingMI.getOperand(3).getImm();
+  unsigned OperandsBeginIdx = 4;
 
-  FM.recordFaultingOp(FaultMaps::FaultingLoad, HandlerLabel);
+  assert(FK < FaultMaps::FaultKindMax && "Invalid Faulting Kind!");
+  FM.recordFaultingOp(FK, HandlerLabel);
 
-  MCInst LoadMI;
-  LoadMI.setOpcode(LoadOpcode);
+  MCInst MI;
+  MI.setOpcode(Opcode);
 
-  if (LoadDefRegister != X86::NoRegister)
-    LoadMI.addOperand(MCOperand::createReg(LoadDefRegister));
+  if (DefRegister != X86::NoRegister)
+    MI.addOperand(MCOperand::createReg(DefRegister));
 
-  for (auto I = MI.operands_begin() + LoadOperandsBeginIdx,
-            E = MI.operands_end();
+  for (auto I = FaultingMI.operands_begin() + OperandsBeginIdx,
+            E = FaultingMI.operands_end();
        I != E; ++I)
-    if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, *I))
-      LoadMI.addOperand(MaybeOperand.getValue());
+    if (auto MaybeOperand = MCIL.LowerMachineOperand(&FaultingMI, *I))
+      MI.addOperand(MaybeOperand.getValue());
+
+  OutStreamer->EmitInstruction(MI, getSubtargetInfo());
+}
 
-  OutStreamer->EmitInstruction(LoadMI, getSubtargetInfo());
+void X86AsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI,
+                                     X86MCInstLower &MCIL) {
+  bool Is64Bits = Subtarget->is64Bit();
+  MCContext &Ctx = OutStreamer->getContext();
+  MCSymbol *fentry = Ctx.getOrCreateSymbol("__fentry__");
+  const MCSymbolRefExpr *Op =
+      MCSymbolRefExpr::create(fentry, MCSymbolRefExpr::VK_None, Ctx);
+  // Emit CALL64pcrel32 or CALLpcrel32 to __fentry__, chosen by bitness.
+  EmitAndCountInstruction(
+      MCInstBuilder(Is64Bits ? X86::CALL64pcrel32 : X86::CALLpcrel32)
+          .addExpr(Op));
 }
 
 void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
@@ -1374,8 +1392,11 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
   case TargetOpcode::STATEPOINT:
     return LowerSTATEPOINT(*MI, MCInstLowering);
 
-  case TargetOpcode::FAULTING_LOAD_OP:
-    return LowerFAULTING_LOAD_OP(*MI, MCInstLowering);
+  case TargetOpcode::FAULTING_OP:
+    return LowerFAULTING_OP(*MI, MCInstLowering);
+
+  case TargetOpcode::FENTRY_CALL:
+    return LowerFENTRY_CALL(*MI, MCInstLowering);
 
   case TargetOpcode::PATCHABLE_OP:
     return LowerPATCHABLE_OP(*MI, MCInstLowering);
@@ -1586,8 +1607,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
 
   case X86::VPERMIL2PDrm:
   case X86::VPERMIL2PSrm:
-  case X86::VPERMIL2PDrmY:
-  case X86::VPERMIL2PSrmY: {
+  case X86::VPERMIL2PDYrm:
+  case X86::VPERMIL2PSYrm: {
     if (!OutStreamer->isVerboseAsm())
       break;
     assert(MI->getNumOperands() >= 8 &&
@@ -1600,8 +1621,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
     unsigned ElSize;
     switch (MI->getOpcode()) {
     default: llvm_unreachable("Invalid opcode");
-    case X86::VPERMIL2PSrm: case X86::VPERMIL2PSrmY: ElSize = 32; break;
-    case X86::VPERMIL2PDrm: case X86::VPERMIL2PDrmY: ElSize = 64; break;
+    case X86::VPERMIL2PSrm: case X86::VPERMIL2PSYrm: ElSize = 32; break;
+    case X86::VPERMIL2PDrm: case X86::VPERMIL2PDYrm: ElSize = 64; break;
     }
 
     const MachineOperand &MaskOp = MI->getOperand(6);
diff --git a/lib/Target/X86/X86MachineFunctionInfo.cpp b/lib/Target/X86/X86MachineFunctionInfo.cpp
index c9e636f1eb00b78e584fa3f50c5847a0b8467617..3fcb642424adc45b0f8cba950c0ed1a757bfdaa7 100644
--- a/lib/Target/X86/X86MachineFunctionInfo.cpp
+++ b/lib/Target/X86/X86MachineFunctionInfo.cpp
@@ -9,6 +9,7 @@
 
 #include "X86MachineFunctionInfo.h"
 #include "X86RegisterInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
 
 using namespace llvm;
@@ -20,11 +21,8 @@ void X86MachineFunctionInfo::setRestoreBasePointer(const MachineFunction *MF) {
     const X86RegisterInfo *RegInfo = static_cast<const X86RegisterInfo *>(
       MF->getSubtarget().getRegisterInfo());
     unsigned SlotSize = RegInfo->getSlotSize();
-    for (const MCPhysReg *CSR =
-      RegInfo->X86RegisterInfo::getCalleeSavedRegs(MF);
-      unsigned Reg = *CSR;
-       ++CSR)
-    {
+    for (const MCPhysReg *CSR = MF->getRegInfo().getCalleeSavedRegs();
+         unsigned Reg = *CSR; ++CSR) {
       if (X86::GR64RegClass.contains(Reg) || X86::GR32RegClass.contains(Reg))
         RestoreBasePointerOffset -= SlotSize;
     }
diff --git a/lib/Target/X86/X86MacroFusion.cpp b/lib/Target/X86/X86MacroFusion.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..dd21e2b7c4a1363067b17dc0745f76e674890c0a
--- /dev/null
+++ b/lib/Target/X86/X86MacroFusion.cpp
@@ -0,0 +1,271 @@
+//===- X86MacroFusion.cpp - X86 Macro Fusion ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the X86 implementation of the DAG scheduling
+/// mutation to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86MacroFusion.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetInstrInfo.h"
+
+#define DEBUG_TYPE "misched"
+
+STATISTIC(NumFused, "Number of instr pairs fused");
+
+using namespace llvm;
+
+static cl::opt<bool> EnableMacroFusion("x86-misched-fusion", cl::Hidden,
+  cl::desc("Enable scheduling for macro fusion."), cl::init(true));
+
+namespace {
+
+/// \brief Verify that the instruction pair, First and Second,
+/// should be scheduled back to back.  If either instruction is unspecified,
+/// then verify that the other instruction may be part of a pair at all.
+static bool shouldScheduleAdjacent(const X86Subtarget &ST,
+                                   const MachineInstr *First,
+                                   const MachineInstr *Second) {
+  // Check if this processor supports macro-fusion. Since this is a minor
+  // heuristic, we haven't specifically reserved a feature. hasAVX is a decent
+  // proxy for SandyBridge+.
+  if (!ST.hasAVX())
+    return false;
+
+  enum {
+    FuseTest,
+    FuseCmp,
+    FuseInc
+  } FuseKind;
+
+  assert((First || Second) && "At least one instr must be specified");
+  unsigned FirstOpcode = First
+                         ? First->getOpcode()
+                         : static_cast<unsigned>(X86::INSTRUCTION_LIST_END);
+  unsigned SecondOpcode = Second
+                          ? Second->getOpcode()
+                          : static_cast<unsigned>(X86::INSTRUCTION_LIST_END);
+
+  switch (SecondOpcode) {
+  default:
+    return false;
+  case X86::JE_1:
+  case X86::JNE_1:
+  case X86::JL_1:
+  case X86::JLE_1:
+  case X86::JG_1:
+  case X86::JGE_1:
+    FuseKind = FuseInc;
+    break;
+  case X86::JB_1:
+  case X86::JBE_1:
+  case X86::JA_1:
+  case X86::JAE_1:
+    FuseKind = FuseCmp;
+    break;
+  case X86::JS_1:
+  case X86::JNS_1:
+  case X86::JP_1:
+  case X86::JNP_1:
+  case X86::JO_1:
+  case X86::JNO_1:
+    FuseKind = FuseTest;
+    break;
+  }
+
+  switch (FirstOpcode) {
+  default:
+    return false;
+  case X86::TEST8rr:
+  case X86::TEST16rr:
+  case X86::TEST32rr:
+  case X86::TEST64rr:
+  case X86::TEST8ri:
+  case X86::TEST16ri:
+  case X86::TEST32ri:
+  case X86::TEST32i32:
+  case X86::TEST64i32:
+  case X86::TEST64ri32:
+  case X86::TEST8rm:
+  case X86::TEST16rm:
+  case X86::TEST32rm:
+  case X86::TEST64rm:
+  case X86::TEST8ri_NOREX:
+  case X86::AND16i16:
+  case X86::AND16ri:
+  case X86::AND16ri8:
+  case X86::AND16rm:
+  case X86::AND16rr:
+  case X86::AND32i32:
+  case X86::AND32ri:
+  case X86::AND32ri8:
+  case X86::AND32rm:
+  case X86::AND32rr:
+  case X86::AND64i32:
+  case X86::AND64ri32:
+  case X86::AND64ri8:
+  case X86::AND64rm:
+  case X86::AND64rr:
+  case X86::AND8i8:
+  case X86::AND8ri:
+  case X86::AND8rm:
+  case X86::AND8rr:
+    return true;
+  case X86::CMP16i16:
+  case X86::CMP16ri:
+  case X86::CMP16ri8:
+  case X86::CMP16rm:
+  case X86::CMP16rr:
+  case X86::CMP32i32:
+  case X86::CMP32ri:
+  case X86::CMP32ri8:
+  case X86::CMP32rm:
+  case X86::CMP32rr:
+  case X86::CMP64i32:
+  case X86::CMP64ri32:
+  case X86::CMP64ri8:
+  case X86::CMP64rm:
+  case X86::CMP64rr:
+  case X86::CMP8i8:
+  case X86::CMP8ri:
+  case X86::CMP8rm:
+  case X86::CMP8rr:
+  case X86::ADD16i16:
+  case X86::ADD16ri:
+  case X86::ADD16ri8:
+  case X86::ADD16ri8_DB:
+  case X86::ADD16ri_DB:
+  case X86::ADD16rm:
+  case X86::ADD16rr:
+  case X86::ADD16rr_DB:
+  case X86::ADD32i32:
+  case X86::ADD32ri:
+  case X86::ADD32ri8:
+  case X86::ADD32ri8_DB:
+  case X86::ADD32ri_DB:
+  case X86::ADD32rm:
+  case X86::ADD32rr:
+  case X86::ADD32rr_DB:
+  case X86::ADD64i32:
+  case X86::ADD64ri32:
+  case X86::ADD64ri32_DB:
+  case X86::ADD64ri8:
+  case X86::ADD64ri8_DB:
+  case X86::ADD64rm:
+  case X86::ADD64rr:
+  case X86::ADD64rr_DB:
+  case X86::ADD8i8:
+  case X86::ADD8mi:
+  case X86::ADD8mr:
+  case X86::ADD8ri:
+  case X86::ADD8rm:
+  case X86::ADD8rr:
+  case X86::SUB16i16:
+  case X86::SUB16ri:
+  case X86::SUB16ri8:
+  case X86::SUB16rm:
+  case X86::SUB16rr:
+  case X86::SUB32i32:
+  case X86::SUB32ri:
+  case X86::SUB32ri8:
+  case X86::SUB32rm:
+  case X86::SUB32rr:
+  case X86::SUB64i32:
+  case X86::SUB64ri32:
+  case X86::SUB64ri8:
+  case X86::SUB64rm:
+  case X86::SUB64rr:
+  case X86::SUB8i8:
+  case X86::SUB8ri:
+  case X86::SUB8rm:
+  case X86::SUB8rr:
+    return FuseKind == FuseCmp || FuseKind == FuseInc;
+  case X86::INC16r:
+  case X86::INC32r:
+  case X86::INC64r:
+  case X86::INC8r:
+  case X86::DEC16r:
+  case X86::DEC32r:
+  case X86::DEC64r:
+  case X86::DEC8r:
+    return FuseKind == FuseInc;
+  case X86::INSTRUCTION_LIST_END:
+    return true;
+  }
+}
+
+/// \brief Post-process the DAG to create cluster edges between instructions
+/// that may be fused by the processor into a single operation.
+class X86MacroFusion : public ScheduleDAGMutation {
+public:
+  X86MacroFusion() {}
+
+  void apply(ScheduleDAGInstrs *DAGInstrs) override;
+};
+
+void X86MacroFusion::apply(ScheduleDAGInstrs *DAGInstrs) {
+  ScheduleDAGMI *DAG = static_cast<ScheduleDAGMI*>(DAGInstrs);
+  const X86Subtarget &ST = DAG->MF.getSubtarget<X86Subtarget>();
+
+  // For now, assume targets can only fuse with the branch.
+  SUnit &ExitSU = DAG->ExitSU;
+  MachineInstr *Branch = ExitSU.getInstr();
+  if (!Branch || !shouldScheduleAdjacent(ST, nullptr, Branch))
+    return;
+
+  for (SDep &PredDep : ExitSU.Preds) {
+    if (PredDep.isWeak())
+      continue;
+    SUnit &SU = *PredDep.getSUnit();
+    MachineInstr &Pred = *SU.getInstr();
+    if (!shouldScheduleAdjacent(ST, &Pred, Branch))
+      continue;
+
+    // Create a single weak edge from SU to ExitSU. The only effect is to cause
+    // bottom-up scheduling to heavily prioritize the clustered SU.  There is no
+    // need to copy predecessor edges from ExitSU to SU, since top-down
+    // scheduling cannot prioritize ExitSU anyway. To defer top-down scheduling
+    // of SU, we could create an artificial edge from the deepest root, but it
+    // hasn't been needed yet.
+    bool Success = DAG->addEdge(&ExitSU, SDep(&SU, SDep::Cluster));
+    (void)Success;
+    assert(Success && "No DAG nodes should be reachable from ExitSU");
+
+    // Zero the latency on every edge between SU and ExitSU, in both lists.
+    for (SDep &ExitPred : ExitSU.Preds)
+      if (ExitPred.getSUnit() == &SU)
+        ExitPred.setLatency(0);
+    for (SDep &SuccDep : SU.Succs)
+      if (SuccDep.getSUnit() == &ExitSU)
+        SuccDep.setLatency(0);
+
+    ++NumFused;
+    DEBUG(dbgs() << DAG->MF.getName() << "(): Macro fuse ";
+          SU.print(dbgs(), DAG);
+          dbgs() << " - ExitSU"
+                 << " / " << DAG->TII->getName(Pred.getOpcode()) << " - "
+                 << DAG->TII->getName(Branch->getOpcode()) << '\n';);
+
+    break; // At most one predecessor is clustered with the branch.
+  }
+}
+
+} // end namespace
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation>
+createX86MacroFusionDAGMutation () {
+  return EnableMacroFusion ? make_unique<X86MacroFusion>() : nullptr;
+}
+
+} // end namespace llvm
diff --git a/lib/Target/X86/X86MacroFusion.h b/lib/Target/X86/X86MacroFusion.h
new file mode 100644
index 0000000000000000000000000000000000000000..e630f802e8e63b8eedcbbd51ac237490da0c88ed
--- /dev/null
+++ b/lib/Target/X86/X86MacroFusion.h
@@ -0,0 +1,38 @@
+//===- X86MacroFusion.h - X86 Macro Fusion --------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the X86 definition of the DAG scheduling mutation
+/// to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86MACROFUSION_H
+#define LLVM_LIB_TARGET_X86_X86MACROFUSION_H
+
+#include "X86InstrInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
+
+//===----------------------------------------------------------------------===//
+// X86MacroFusion - DAG post-processing to encourage fusion of macro ops.
+//===----------------------------------------------------------------------===//
+
+namespace llvm {
+
+/// Create the scheduling mutation that clusters macro-fusable instruction
+/// pairs. Returns null when the -x86-misched-fusion option is disabled.
+///
+/// Note that you have to add:
+///   DAG.addMutation(createX86MacroFusionDAGMutation());
+/// to X86PassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation>
+createX86MacroFusionDAGMutation();
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_X86MACROFUSION_H
diff --git a/lib/Target/X86/X86OptimizeLEAs.cpp b/lib/Target/X86/X86OptimizeLEAs.cpp
index e1447006cd18c9d2f2c1fef29c697cecee8a5d84..debb192732e5c3c540b663a9f4c9140e5da847b1 100644
--- a/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -389,9 +389,6 @@ bool OptimizeLEAPass::isReplaceable(const MachineInstr &First,
   assert(isLEA(First) && isLEA(Last) &&
          "The function works only with LEA instructions");
 
-  // Get new address displacement.
-  AddrDispShift = getAddrDispShift(Last, 1, First, 1);
-
   // Make sure that LEA def registers belong to the same class. There may be
   // instructions (like MOV8mr_NOREX) which allow a limited set of registers to
   // be used as their operands, so we must be sure that replacing one LEA
@@ -400,10 +397,13 @@ bool OptimizeLEAPass::isReplaceable(const MachineInstr &First,
       MRI->getRegClass(Last.getOperand(0).getReg()))
     return false;
 
+  // Get new address displacement.
+  AddrDispShift = getAddrDispShift(Last, 1, First, 1);
+
   // Loop over all uses of the Last LEA to check that its def register is
   // used only as address base for memory accesses. If so, it can be
   // replaced, otherwise - no.
-  for (auto &MO : MRI->use_operands(Last.getOperand(0).getReg())) {
+  for (auto &MO : MRI->use_nodbg_operands(Last.getOperand(0).getReg())) {
     MachineInstr &MI = *MO.getParent();
 
     // Get the number of the first memory operand.
@@ -563,8 +563,9 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
         // Loop over all uses of the Last LEA and update their operands. Note
         // that the correctness of this has already been checked in the
         // isReplaceable function.
-        for (auto UI = MRI->use_begin(Last.getOperand(0).getReg()),
-                  UE = MRI->use_end();
+        unsigned LastVReg = Last.getOperand(0).getReg();
+        for (auto UI = MRI->use_nodbg_begin(LastVReg),
+                  UE = MRI->use_nodbg_end();
              UI != UE;) {
           MachineOperand &MO = *UI++;
           MachineInstr &MI = *MO.getParent();
@@ -586,6 +587,9 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
             Op.setOffset(Op.getOffset() + AddrDispShift);
         }
 
+        // Mark debug values referring to Last LEA as undefined.
+        MRI->markUsesInDebugValueAsUndef(LastVReg);
+
         // Since we can possibly extend register lifetime, clear kill flags.
         MRI->clearKillFlags(First.getOperand(0).getReg());
 
@@ -594,7 +598,7 @@ bool OptimizeLEAPass::removeRedundantLEAs(MemOpMap &LEAs) {
 
         // By this moment, all of the Last LEA's uses must be replaced. So we
         // can freely remove it.
-        assert(MRI->use_empty(Last.getOperand(0).getReg()) &&
+        assert(MRI->use_empty(LastVReg) &&
                "The LEA's def register must have no uses");
         Last.eraseFromParent();
 
diff --git a/lib/Target/X86/X86RegisterBankInfo.cpp b/lib/Target/X86/X86RegisterBankInfo.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..d395c826e6bf7a050bdd8b5ffdd9b65eb32bf3c6
--- /dev/null
+++ b/lib/Target/X86/X86RegisterBankInfo.cpp
@@ -0,0 +1,243 @@
+//===- X86RegisterBankInfo.cpp -----------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file implements the targeting of the RegisterBankInfo class for X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#include "X86RegisterBankInfo.h"
+#include "X86InstrInfo.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Target/TargetRegisterInfo.h"
+
+#define GET_TARGET_REGBANK_IMPL
+#include "X86GenRegisterBank.inc"
+
+using namespace llvm;
+// This file will be TableGen'ed at some point.
+#define GET_TARGET_REGBANK_INFO_IMPL
+#include "X86GenRegisterBankInfo.def"
+
+#ifndef LLVM_BUILD_GLOBAL_ISEL
+#error "You shouldn't build this"
+#endif
+
+X86RegisterBankInfo::X86RegisterBankInfo(const TargetRegisterInfo &TRI)
+    : X86GenRegisterBankInfo() {
+  // This constructor only sanity-checks the register banks via asserts.
+  // Validate RegBank initialization.
+  const RegisterBank &RBGPR = getRegBank(X86::GPRRegBankID);
+  (void)RBGPR;
+  assert(&X86::GPRRegBank == &RBGPR && "Incorrect RegBanks initialization.");
+
+  // The GPR register bank is fully defined by all the registers in
+  // GR64 + its subclasses.
+  assert(RBGPR.covers(*TRI.getRegClass(X86::GR64RegClassID)) &&
+         "Subclass not added?");
+  assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit");
+}
+
+const RegisterBank &X86RegisterBankInfo::getRegBankFromRegClass(
+    const TargetRegisterClass &RC) const {
+
+  if (X86::GR8RegClass.hasSubClassEq(&RC) ||
+      X86::GR16RegClass.hasSubClassEq(&RC) ||
+      X86::GR32RegClass.hasSubClassEq(&RC) ||
+      X86::GR64RegClass.hasSubClassEq(&RC))
+    return getRegBank(X86::GPRRegBankID);
+
+  if (X86::FR32XRegClass.hasSubClassEq(&RC) ||
+      X86::FR64XRegClass.hasSubClassEq(&RC) ||
+      X86::VR128XRegClass.hasSubClassEq(&RC) ||
+      X86::VR256XRegClass.hasSubClassEq(&RC) ||
+      X86::VR512RegClass.hasSubClassEq(&RC))
+    return getRegBank(X86::VECRRegBankID);
+
+  llvm_unreachable("Unsupported register kind yet.");
+}
+
+/// GPR index for pointers/non-FP scalars, FP for FP scalars, VEC otherwise.
+X86GenRegisterBankInfo::PartialMappingIdx
+X86GenRegisterBankInfo::getPartialMappingIdx(const LLT &Ty, bool isFP) {
+  if ((Ty.isScalar() && !isFP) || Ty.isPointer()) {
+    switch (Ty.getSizeInBits()) {
+    case 8:
+      return PMI_GPR8;
+    case 16:
+      return PMI_GPR16;
+    case 32:
+      return PMI_GPR32;
+    case 64:
+      return PMI_GPR64;
+    default:
+      llvm_unreachable("Unsupported register size.");
+    }
+  } else if (Ty.isScalar()) {
+    switch (Ty.getSizeInBits()) {
+    case 32:
+      return PMI_FP32;
+    case 64:
+      return PMI_FP64;
+    default:
+      llvm_unreachable("Unsupported register size.");
+    }
+  } else {
+    switch (Ty.getSizeInBits()) {
+    case 128:
+      return PMI_VEC128;
+    case 256:
+      return PMI_VEC256;
+    case 512:
+      return PMI_VEC512;
+    default:
+      llvm_unreachable("Unsupported register size.");
+    }
+  }
+
+  return PMI_None;
+}
+
+void X86RegisterBankInfo::getInstrPartialMappingIdxs(
+    const MachineInstr &MI, const MachineRegisterInfo &MRI, const bool isFP,
+    SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx) {
+
+  unsigned NumOperands = MI.getNumOperands();
+  for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
+    auto &MO = MI.getOperand(Idx);
+    if (!MO.isReg())
+      OpRegBankIdx[Idx] = PMI_None;
+    else
+      OpRegBankIdx[Idx] = getPartialMappingIdx(MRI.getType(MO.getReg()), isFP);
+  }
+}
+
+bool X86RegisterBankInfo::getInstrValueMapping(
+    const MachineInstr &MI,
+    const SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx,
+    SmallVectorImpl<const ValueMapping *> &OpdsMapping) {
+
+  unsigned NumOperands = MI.getNumOperands();
+  for (unsigned Idx = 0; Idx < NumOperands; ++Idx) {
+    if (!MI.getOperand(Idx).isReg())
+      continue;
+
+    auto Mapping = getValueMapping(OpRegBankIdx[Idx], 1);
+    if (!Mapping->isValid())
+      return false;
+
+    OpdsMapping[Idx] = Mapping;
+  }
+  return true;
+}
+
+RegisterBankInfo::InstructionMapping
+X86RegisterBankInfo::getSameOperandsMapping(const MachineInstr &MI, bool isFP) {
+  const MachineFunction &MF = *MI.getParent()->getParent();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  unsigned NumOperands = MI.getNumOperands();
+  LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+
+  if (NumOperands != 3 || (Ty != MRI.getType(MI.getOperand(1).getReg())) ||
+      (Ty != MRI.getType(MI.getOperand(2).getReg())))
+    llvm_unreachable("Unsupported operand mapping yet.");
+
+  auto Mapping = getValueMapping(getPartialMappingIdx(Ty, isFP), 3);
+  return InstructionMapping{DefaultMappingID, 1, Mapping, NumOperands};
+}
+
+/// Map \p MI: integer/FP arithmetic gets one shared operand mapping; any other
+/// opcode gets a per-operand mapping built from the non-FP partial mappings.
+RegisterBankInfo::InstructionMapping
+X86RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
+  const MachineFunction &MF = *MI.getParent()->getParent();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+  auto Opc = MI.getOpcode();
+
+  // Try the default logic for non-generic instructions that are either copies
+  // or already have some operands assigned to banks.
+  if (!isPreISelGenericOpcode(Opc)) {
+    InstructionMapping Mapping = getInstrMappingImpl(MI);
+    if (Mapping.isValid())
+      return Mapping;
+  }
+
+  switch (Opc) {
+  case TargetOpcode::G_ADD:
+  case TargetOpcode::G_SUB:
+    return getSameOperandsMapping(MI, false);
+  case TargetOpcode::G_FADD:
+  case TargetOpcode::G_FSUB:
+  case TargetOpcode::G_FMUL:
+  case TargetOpcode::G_FDIV:
+    return getSameOperandsMapping(MI, true);
+  default:
+    break;
+  }
+
+  unsigned NumOperands = MI.getNumOperands();
+
+  // Track the bank of each register, use NotFP mapping (all scalars in GPRs)
+  SmallVector<PartialMappingIdx, 4> OpRegBankIdx(NumOperands);
+  getInstrPartialMappingIdxs(MI, MRI, /* isFP */ false, OpRegBankIdx);
+
+  // Finally construct the computed mapping.
+  SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands);
+  if (!getInstrValueMapping(MI, OpRegBankIdx, OpdsMapping))
+    return InstructionMapping();
+
+  return InstructionMapping{DefaultMappingID, /* Cost */ 1,
+                            getOperandsMapping(OpdsMapping), NumOperands};
+}
+
+void X86RegisterBankInfo::applyMappingImpl(
+    const OperandsMapper &OpdMapper) const {
+  return applyDefaultMapping(OpdMapper);
+}
+
+RegisterBankInfo::InstructionMappings
+X86RegisterBankInfo::getInstrAlternativeMappings(const MachineInstr &MI) const {
+
+  const MachineFunction &MF = *MI.getParent()->getParent();
+  const TargetSubtargetInfo &STI = MF.getSubtarget();
+  const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
+  const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  switch (MI.getOpcode()) {
+  case TargetOpcode::G_LOAD:
+  case TargetOpcode::G_STORE: {
+    // Try to map 32/64-bit values to PMI_FP32/PMI_FP64.
+    unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, TRI);
+    if (Size != 32 && Size != 64)
+      break;
+
+    unsigned NumOperands = MI.getNumOperands();
+
+    // Track the bank of each register, use FP mapping (all scalars in VEC)
+    SmallVector<PartialMappingIdx, 4> OpRegBankIdx(NumOperands);
+    getInstrPartialMappingIdxs(MI, MRI, /* isFP */ true, OpRegBankIdx);
+
+    // Finally construct the computed mapping.
+    SmallVector<const ValueMapping *, 8> OpdsMapping(NumOperands);
+    if (!getInstrValueMapping(MI, OpRegBankIdx, OpdsMapping))
+      break;
+
+    RegisterBankInfo::InstructionMapping Mapping = InstructionMapping{
+        /*ID*/ 1, /*Cost*/ 1, getOperandsMapping(OpdsMapping), NumOperands};
+    InstructionMappings AltMappings;
+    AltMappings.emplace_back(std::move(Mapping));
+    return AltMappings;
+  }
+  default:
+    break;
+  }
+  return RegisterBankInfo::getInstrAlternativeMappings(MI);
+}
diff --git a/lib/Target/X86/X86RegisterBankInfo.h b/lib/Target/X86/X86RegisterBankInfo.h
new file mode 100644
index 0000000000000000000000000000000000000000..a1e01a9ab94978391d7fcc74d7c891739e72943f
--- /dev/null
+++ b/lib/Target/X86/X86RegisterBankInfo.h
@@ -0,0 +1,81 @@
+//===- X86RegisterBankInfo ---------------------------------------*- C++ -*-==//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+/// \file
+/// This file declares the targeting of the RegisterBankInfo class for X86.
+/// \todo This should be generated by TableGen.
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_X86REGISTERBANKINFO_H
+#define LLVM_LIB_TARGET_X86_X86REGISTERBANKINFO_H
+
+#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
+
+#define GET_REGBANK_DECLARATIONS
+#include "X86GenRegisterBank.inc"
+
+namespace llvm {
+
+class LLT;
+
+class X86GenRegisterBankInfo : public RegisterBankInfo {
+protected:
+#define GET_TARGET_REGBANK_CLASS
+#include "X86GenRegisterBank.inc"
+#define GET_TARGET_REGBANK_INFO_CLASS
+#include "X86GenRegisterBankInfo.def"
+
+  static RegisterBankInfo::PartialMapping PartMappings[];
+  static RegisterBankInfo::ValueMapping ValMappings[];
+
+  static PartialMappingIdx getPartialMappingIdx(const LLT &Ty, bool isFP);
+  static const RegisterBankInfo::ValueMapping *
+  getValueMapping(PartialMappingIdx Idx, unsigned NumOperands);
+};
+
+class TargetRegisterInfo;
+
+/// This class provides the information for the target register banks.
+class X86RegisterBankInfo final : public X86GenRegisterBankInfo {
+private:
+  /// Get the mapping for an instruction whose operands all share one type.
+  /// \return An InstructionMapping with a statically allocated
+  /// OperandsMapping.
+  static InstructionMapping getSameOperandsMapping(const MachineInstr &MI,
+                                                   bool isFP);
+
+  /// Record the partial-mapping index of each operand (PMI_None if not a reg).
+  static void
+  getInstrPartialMappingIdxs(const MachineInstr &MI,
+                             const MachineRegisterInfo &MRI, const bool isFP,
+                             SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx);
+
+  /// Construct the instruction ValueMapping from PartialMappingIdxs.
+  /// \return true if mapping succeeded.
+  static bool
+  getInstrValueMapping(const MachineInstr &MI,
+                       const SmallVectorImpl<PartialMappingIdx> &OpRegBankIdx,
+                       SmallVectorImpl<const ValueMapping *> &OpdsMapping);
+
+public:
+  X86RegisterBankInfo(const TargetRegisterInfo &TRI);
+
+  const RegisterBank &
+  getRegBankFromRegClass(const TargetRegisterClass &RC) const override;
+
+  InstructionMappings
+  getInstrAlternativeMappings(const MachineInstr &MI) const override;
+
+  /// See RegisterBankInfo::applyMapping.
+  void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
+
+  InstructionMapping getInstrMapping(const MachineInstr &MI) const override;
+};
+
+} // namespace llvm
+#endif
diff --git a/lib/Target/X86/X86RegisterBanks.td b/lib/Target/X86/X86RegisterBanks.td
new file mode 100644
index 0000000000000000000000000000000000000000..6d17cd53a0c14373e79b358a7f5f167ff52ca8ba
--- /dev/null
+++ b/lib/Target/X86/X86RegisterBanks.td
@@ -0,0 +1,17 @@
+//=- X86RegisterBanks.td - Describe the X86 Banks --------*- tablegen -*-=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//
+//===----------------------------------------------------------------------===//
+
+/// General Purpose Registers: RAX, RCX,...
+def GPRRegBank : RegisterBank<"GPR", [GR64]>;
+
+/// Floating Point/Vector Registers
+def VECRRegBank : RegisterBank<"VECR", [VR512]>;
diff --git a/lib/Target/X86/X86RegisterInfo.cpp b/lib/Target/X86/X86RegisterInfo.cpp
index 65f438f94b042804d078542858a6ac67ec077ae5..9bab9a4cf3ba4a649a0ea9643da938a918af2071 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -80,7 +80,7 @@ X86RegisterInfo::X86RegisterInfo(const Triple &TT)
 
 bool
 X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
-  // ExeDepsFixer and PostRAScheduler require liveness.
+  // ExecutionDepsFixer and PostRAScheduler require liveness.
   return true;
 }
 
@@ -337,7 +337,9 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
         return CSR_64_AllRegs_AVX512_SaveList;
       if (HasAVX)
         return CSR_64_AllRegs_AVX_SaveList;
-      return CSR_64_AllRegs_SaveList;
+      if (HasSSE)
+        return CSR_64_AllRegs_SaveList;
+      return CSR_64_AllRegs_NoSSE_SaveList;
     } else {
       if (HasAVX512)
         return CSR_32_AllRegs_AVX512_SaveList;
@@ -447,7 +449,9 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
         return CSR_64_AllRegs_AVX512_RegMask;
       if (HasAVX)
         return CSR_64_AllRegs_AVX_RegMask;
-      return CSR_64_AllRegs_RegMask;
+      if (HasSSE)
+        return CSR_64_AllRegs_RegMask;
+      return CSR_64_AllRegs_NoSSE_RegMask;
     } else {
       if (HasAVX512)
         return CSR_32_AllRegs_AVX512_RegMask;
diff --git a/lib/Target/X86/X86RegisterInfo.td b/lib/Target/X86/X86RegisterInfo.td
index 372a15aff15a03c9154e77bfbe97de39de46f60e..b8cae2f0bd26a5678447338ac66d50b02cbb931d 100644
--- a/lib/Target/X86/X86RegisterInfo.td
+++ b/lib/Target/X86/X86RegisterInfo.td
@@ -189,22 +189,22 @@ def XMM13: X86Reg<"xmm13", 13>, DwarfRegNum<[30, -2, -2]>;
 def XMM14: X86Reg<"xmm14", 14>, DwarfRegNum<[31, -2, -2]>;
 def XMM15: X86Reg<"xmm15", 15>, DwarfRegNum<[32, -2, -2]>;
 
-def XMM16:  X86Reg<"xmm16", 16>, DwarfRegNum<[60, -2, -2]>;
-def XMM17:  X86Reg<"xmm17", 17>, DwarfRegNum<[61, -2, -2]>;
-def XMM18:  X86Reg<"xmm18", 18>, DwarfRegNum<[62, -2, -2]>;
-def XMM19:  X86Reg<"xmm19", 19>, DwarfRegNum<[63, -2, -2]>;
-def XMM20:  X86Reg<"xmm20", 20>, DwarfRegNum<[64, -2, -2]>;
-def XMM21:  X86Reg<"xmm21", 21>, DwarfRegNum<[65, -2, -2]>;
-def XMM22:  X86Reg<"xmm22", 22>, DwarfRegNum<[66, -2, -2]>;
-def XMM23:  X86Reg<"xmm23", 23>, DwarfRegNum<[67, -2, -2]>;
-def XMM24:  X86Reg<"xmm24", 24>, DwarfRegNum<[68, -2, -2]>;
-def XMM25:  X86Reg<"xmm25", 25>, DwarfRegNum<[69, -2, -2]>;
-def XMM26:  X86Reg<"xmm26", 26>, DwarfRegNum<[70, -2, -2]>;
-def XMM27:  X86Reg<"xmm27", 27>, DwarfRegNum<[71, -2, -2]>;
-def XMM28:  X86Reg<"xmm28", 28>, DwarfRegNum<[72, -2, -2]>;
-def XMM29:  X86Reg<"xmm29", 29>, DwarfRegNum<[73, -2, -2]>;
-def XMM30:  X86Reg<"xmm30", 30>, DwarfRegNum<[74, -2, -2]>;
-def XMM31:  X86Reg<"xmm31", 31>, DwarfRegNum<[75, -2, -2]>;
+def XMM16:  X86Reg<"xmm16", 16>, DwarfRegNum<[67, -2, -2]>;
+def XMM17:  X86Reg<"xmm17", 17>, DwarfRegNum<[68, -2, -2]>;
+def XMM18:  X86Reg<"xmm18", 18>, DwarfRegNum<[69, -2, -2]>;
+def XMM19:  X86Reg<"xmm19", 19>, DwarfRegNum<[70, -2, -2]>;
+def XMM20:  X86Reg<"xmm20", 20>, DwarfRegNum<[71, -2, -2]>;
+def XMM21:  X86Reg<"xmm21", 21>, DwarfRegNum<[72, -2, -2]>;
+def XMM22:  X86Reg<"xmm22", 22>, DwarfRegNum<[73, -2, -2]>;
+def XMM23:  X86Reg<"xmm23", 23>, DwarfRegNum<[74, -2, -2]>;
+def XMM24:  X86Reg<"xmm24", 24>, DwarfRegNum<[75, -2, -2]>;
+def XMM25:  X86Reg<"xmm25", 25>, DwarfRegNum<[76, -2, -2]>;
+def XMM26:  X86Reg<"xmm26", 26>, DwarfRegNum<[77, -2, -2]>;
+def XMM27:  X86Reg<"xmm27", 27>, DwarfRegNum<[78, -2, -2]>;
+def XMM28:  X86Reg<"xmm28", 28>, DwarfRegNum<[79, -2, -2]>;
+def XMM29:  X86Reg<"xmm29", 29>, DwarfRegNum<[80, -2, -2]>;
+def XMM30:  X86Reg<"xmm30", 30>, DwarfRegNum<[81, -2, -2]>;
+def XMM31:  X86Reg<"xmm31", 31>, DwarfRegNum<[82, -2, -2]>;
 
 } // CostPerUse
 
diff --git a/lib/Target/X86/X86Schedule.td b/lib/Target/X86/X86Schedule.td
index 35257f89100caa3d49f0ec251d9abb090bcf3183..7f7efd7cad3f615eb859b86d476e3d9b464cd303 100644
--- a/lib/Target/X86/X86Schedule.td
+++ b/lib/Target/X86/X86Schedule.td
@@ -366,6 +366,7 @@ def IIC_SSE_MWAIT : InstrItinClass;
 def IIC_SSE_MONITOR : InstrItinClass;
 def IIC_SSE_MWAITX : InstrItinClass;
 def IIC_SSE_MONITORX : InstrItinClass;
+def IIC_SSE_CLZERO : InstrItinClass;
 
 def IIC_SSE_PREFETCH : InstrItinClass;
 def IIC_SSE_PAUSE : InstrItinClass;
diff --git a/lib/Target/X86/X86SelectionDAGInfo.cpp b/lib/Target/X86/X86SelectionDAGInfo.cpp
index f031a281e5dd0c7e1c8515f8cae9a8a3aa599436..9da8a18965ea60de27dfc6ac8b6fea4abfc0cb23 100644
--- a/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -85,10 +85,12 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
       Args.push_back(Entry);
 
       TargetLowering::CallLoweringInfo CLI(DAG);
-      CLI.setDebugLoc(dl).setChain(Chain)
-        .setCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
-                   DAG.getExternalSymbol(bzeroEntry, IntPtr), std::move(Args))
-        .setDiscardResult();
+      CLI.setDebugLoc(dl)
+          .setChain(Chain)
+          .setLibCallee(CallingConv::C, Type::getVoidTy(*DAG.getContext()),
+                        DAG.getExternalSymbol(bzeroEntry, IntPtr),
+                        std::move(Args))
+          .setDiscardResult();
 
       std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
       return CallResult.second;
diff --git a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index 11115524c81090d5da150619b0d9597ebea5f66b..2cebb76022ef88427f3b3198f2a3f62532e2734c 100644
--- a/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -14,7 +14,7 @@
 
 #include "X86ShuffleDecodeConstantPool.h"
 #include "Utils/X86ShuffleDecode.h"
-#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/APInt.h"
 #include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/IR/Constants.h"
 
@@ -25,7 +25,7 @@
 namespace llvm {
 
 static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
-                                SmallBitVector &UndefElts,
+                                APInt &UndefElts,
                                 SmallVectorImpl<uint64_t> &RawMask) {
   // It is not an error for shuffle masks to not be a vector of
   // MaskEltSizeInBits because the constant pool uniques constants by their
@@ -49,6 +49,33 @@ static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
   unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
   unsigned NumCstElts = CstTy->getVectorNumElements();
 
+  assert((CstSizeInBits % MaskEltSizeInBits) == 0 &&
+         "Unaligned shuffle mask size");
+
+  unsigned NumMaskElts = CstSizeInBits / MaskEltSizeInBits;
+  UndefElts = APInt(NumMaskElts, 0);
+  RawMask.resize(NumMaskElts, 0);
+
+  // Fast path - if the constants match the mask size then copy direct.
+  if (MaskEltSizeInBits == CstEltSizeInBits) {
+    assert(NumCstElts == NumMaskElts && "Unaligned shuffle mask size");
+    for (unsigned i = 0; i != NumMaskElts; ++i) {
+      Constant *COp = C->getAggregateElement(i);
+      if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
+        return false;
+
+      if (isa<UndefValue>(COp)) {
+        UndefElts.setBit(i);
+        RawMask[i] = 0;
+        continue;
+      }
+
+      auto *Elt = cast<ConstantInt>(COp);
+      RawMask[i] = Elt->getValue().getZExtValue();
+    }
+    return true;
+  }
+
   // Extract all the undef/constant element data and pack into single bitsets.
   APInt UndefBits(CstSizeInBits, 0);
   APInt MaskBits(CstSizeInBits, 0);
@@ -57,39 +84,30 @@ static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
     if (!COp || (!isa<UndefValue>(COp) && !isa<ConstantInt>(COp)))
       return false;
 
+    unsigned BitOffset = i * CstEltSizeInBits;
+
     if (isa<UndefValue>(COp)) {
-      APInt EltUndef = APInt::getLowBitsSet(CstSizeInBits, CstEltSizeInBits);
-      UndefBits |= EltUndef.shl(i * CstEltSizeInBits);
+      UndefBits.setBits(BitOffset, BitOffset + CstEltSizeInBits);
       continue;
     }
 
-    APInt EltBits = cast<ConstantInt>(COp)->getValue();
-    EltBits = EltBits.zextOrTrunc(CstSizeInBits);
-    MaskBits |= EltBits.shl(i * CstEltSizeInBits);
+    MaskBits.insertBits(cast<ConstantInt>(COp)->getValue(), BitOffset);
   }
 
   // Now extract the undef/constant bit data into the raw shuffle masks.
-  assert((CstSizeInBits % MaskEltSizeInBits) == 0 &&
-         "Unaligned shuffle mask size");
-
-  unsigned NumMaskElts = CstSizeInBits / MaskEltSizeInBits;
-  UndefElts = SmallBitVector(NumMaskElts, false);
-  RawMask.resize(NumMaskElts, 0);
-
   for (unsigned i = 0; i != NumMaskElts; ++i) {
-    APInt EltUndef = UndefBits.lshr(i * MaskEltSizeInBits);
-    EltUndef = EltUndef.zextOrTrunc(MaskEltSizeInBits);
+    unsigned BitOffset = i * MaskEltSizeInBits;
+    APInt EltUndef = UndefBits.extractBits(MaskEltSizeInBits, BitOffset);
 
     // Only treat the element as UNDEF if all bits are UNDEF, otherwise
     // treat it as zero.
     if (EltUndef.isAllOnesValue()) {
-      UndefElts[i] = true;
+      UndefElts.setBit(i);
       RawMask[i] = 0;
       continue;
     }
 
-    APInt EltBits = MaskBits.lshr(i * MaskEltSizeInBits);
-    EltBits = EltBits.zextOrTrunc(MaskEltSizeInBits);
+    APInt EltBits = MaskBits.extractBits(MaskEltSizeInBits, BitOffset);
     RawMask[i] = EltBits.getZExtValue();
   }
 
@@ -104,8 +122,8 @@ void DecodePSHUFBMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
          "Unexpected vector size.");
 
   // The shuffle mask requires a byte vector.
-  SmallBitVector UndefElts;
-  SmallVector<uint64_t, 32> RawMask;
+  APInt UndefElts;
+  SmallVector<uint64_t, 64> RawMask;
   if (!extractConstantMask(C, 8, UndefElts, RawMask))
     return;
 
@@ -145,8 +163,8 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize,
   assert((ElSize == 32 || ElSize == 64) && "Unexpected vector element size.");
 
   // The shuffle mask requires elements the same size as the target.
-  SmallBitVector UndefElts;
-  SmallVector<uint64_t, 8> RawMask;
+  APInt UndefElts;
+  SmallVector<uint64_t, 16> RawMask;
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
@@ -180,7 +198,7 @@ void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
   assert((MaskTySize == 128 || MaskTySize == 256) && "Unexpected vector size.");
 
   // The shuffle mask requires elements the same size as the target.
-  SmallBitVector UndefElts;
+  APInt UndefElts;
   SmallVector<uint64_t, 8> RawMask;
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
@@ -231,8 +249,8 @@ void DecodeVPPERMMask(const Constant *C, SmallVectorImpl<int> &ShuffleMask) {
          "Unexpected vector size.");
 
   // The shuffle mask requires a byte vector.
-  SmallBitVector UndefElts;
-  SmallVector<uint64_t, 32> RawMask;
+  APInt UndefElts;
+  SmallVector<uint64_t, 16> RawMask;
   if (!extractConstantMask(C, 8, UndefElts, RawMask))
     return;
 
@@ -286,8 +304,8 @@ void DecodeVPERMVMask(const Constant *C, unsigned ElSize,
          "Unexpected vector element size.");
 
   // The shuffle mask requires elements the same size as the target.
-  SmallBitVector UndefElts;
-  SmallVector<uint64_t, 8> RawMask;
+  APInt UndefElts;
+  SmallVector<uint64_t, 64> RawMask;
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
@@ -314,8 +332,8 @@ void DecodeVPERMV3Mask(const Constant *C, unsigned ElSize,
          "Unexpected vector element size.");
 
   // The shuffle mask requires elements the same size as the target.
-  SmallBitVector UndefElts;
-  SmallVector<uint64_t, 8> RawMask;
+  APInt UndefElts;
+  SmallVector<uint64_t, 64> RawMask;
   if (!extractConstantMask(C, ElSize, UndefElts, RawMask))
     return;
 
diff --git a/lib/Target/X86/X86Subtarget.cpp b/lib/Target/X86/X86Subtarget.cpp
index 586bb7bd7b1a5539339159700140636411071d2f..92a68759195c8850afcd0da7a013aae13232378f 100644
--- a/lib/Target/X86/X86Subtarget.cpp
+++ b/lib/Target/X86/X86Subtarget.cpp
@@ -11,19 +11,23 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "MCTargetDesc/X86BaseInfo.h"
 #include "X86Subtarget.h"
-#include "X86InstrInfo.h"
 #include "X86TargetMachine.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/IR/Attributes.h"
+#include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/Host.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
+#include <cassert>
+#include <string>
 
 #if defined(_MSC_VER)
 #include <intrin.h>
@@ -93,8 +97,17 @@ unsigned char X86Subtarget::classifyGlobalReference(const GlobalValue *GV,
     return X86II::MO_NO_FLAG;
 
   // Absolute symbols can be referenced directly.
-  if (GV && GV->isAbsoluteSymbolRef())
-    return X86II::MO_NO_FLAG;
+  if (GV) {
+    if (Optional<ConstantRange> CR = GV->getAbsoluteSymbolRange()) {
+      // See if we can use the 8-bit immediate form. Note that some instructions
+      // will sign extend the immediate operand, so to be conservative we only
+      // accept the range [0,128).
+      if (CR->getUnsignedMax().ult(128))
+        return X86II::MO_ABS8;
+      else
+        return X86II::MO_NO_FLAG;
+    }
+  }
 
   if (TM.shouldAssumeDSOLocal(M, GV))
     return classifyLocalReference(GV);
@@ -195,7 +208,6 @@ void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
       FullFS = "+sahf";
   }
 
-
   // Parse features string and set the CPU.
   ParseSubtargetFeatures(CPUName, FullFS);
 
@@ -263,7 +275,6 @@ void X86Subtarget::initializeEnvironment() {
   HasVBMI = false;
   HasIFMA = false;
   HasRTM = false;
-  HasHLE = false;
   HasERI = false;
   HasCDI = false;
   HasPFI = false;
@@ -277,6 +288,7 @@ void X86Subtarget::initializeEnvironment() {
   HasRDSEED = false;
   HasLAHFSAHF = false;
   HasMWAITX = false;
+  HasCLZERO = false;
   HasMPX = false;
   IsBTMemSlow = false;
   IsPMULLDSlow = false;
@@ -286,10 +298,11 @@ void X86Subtarget::initializeEnvironment() {
   HasSSEUnalignedMem = false;
   HasCmpxchg16b = false;
   UseLeaForSP = false;
-  HasFastPartialYMMWrite = false;
+  HasFastPartialYMMorZMMWrite = false;
   HasFastScalarFSQRT = false;
   HasFastVectorFSQRT = false;
   HasFastLZCNT = false;
+  HasFastSHLDRotate = false;
   HasSlowDivide32 = false;
   HasSlowDivide64 = false;
   PadShortFunctions = false;
@@ -321,7 +334,7 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef FS,
                   TargetTriple.getEnvironment() != Triple::CODE16),
       In16BitMode(TargetTriple.getArch() == Triple::x86 &&
                   TargetTriple.getEnvironment() == Triple::CODE16),
-      TSInfo(), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
+      InstrInfo(initializeSubtargetDependencies(CPU, FS)),
       TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) {
   // Determine the PICStyle based on the target selected.
   if (!isPositionIndependent())
@@ -359,4 +372,3 @@ const RegisterBankInfo *X86Subtarget::getRegBankInfo() const {
 bool X86Subtarget::enableEarlyIfConversion() const {
   return hasCMov() && X86EarlyIfConv;
 }
-
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index d80dc4a9b5e805012f7ca04562b95176fdc48f16..c2c95658482d96c30d7693f607219b5525e5d01d 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -18,33 +18,36 @@
 #include "X86ISelLowering.h"
 #include "X86InstrInfo.h"
 #include "X86SelectionDAGInfo.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Triple.h"
 #include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
 #include "llvm/IR/CallingConv.h"
+#include "llvm/MC/MCInstrItineraries.h"
+#include "llvm/Target/TargetMachine.h"
 #include "llvm/Target/TargetSubtargetInfo.h"
-#include <string>
+#include <memory>
 
 #define GET_SUBTARGETINFO_HEADER
 #include "X86GenSubtargetInfo.inc"
 
 namespace llvm {
+
 class GlobalValue;
-class StringRef;
-class TargetMachine;
 
 /// The X86 backend supports a number of different styles of PIC.
 ///
 namespace PICStyles {
+
 enum Style {
   StubPIC,          // Used on i386-darwin in pic mode.
   GOT,              // Used on 32 bit elf on when in pic mode.
   RIPRel,           // Used on X86-64 when in pic mode.
   None              // Set when not in pic mode.
 };
-}
 
-class X86Subtarget final : public X86GenSubtargetInfo {
+} // end namespace PICStyles
 
+class X86Subtarget final : public X86GenSubtargetInfo {
 protected:
   enum X86SSEEnum {
     NoSSE, SSE1, SSE2, SSE3, SSSE3, SSE41, SSE42, AVX, AVX2, AVX512F
@@ -96,10 +99,13 @@ protected:
 
   /// Target has XSAVE instructions
   bool HasXSAVE;
+
   /// Target has XSAVEOPT instructions
   bool HasXSAVEOPT;
+
   /// Target has XSAVEC instructions
   bool HasXSAVEC;
+
   /// Target has XSAVES instructions
   bool HasXSAVES;
 
@@ -148,9 +154,6 @@ protected:
   /// Processor has RTM instructions.
   bool HasRTM;
 
-  /// Processor has HLE.
-  bool HasHLE;
-
   /// Processor has ADX instructions.
   bool HasADX;
 
@@ -169,6 +172,9 @@ protected:
   /// Processor has MONITORX/MWAITX instructions.
   bool HasMWAITX;
 
+  /// Processor has Cache Line Zero instruction
+  bool HasCLZERO;
+
   /// Processor has Prefetch with intent to Write instruction
   bool HasPFPREFETCHWT1;
 
@@ -201,8 +207,8 @@ protected:
   bool UseLeaForSP;
 
   /// True if there is no performance penalty to writing only the lower parts
-  /// of a YMM register without clearing the upper part.
-  bool HasFastPartialYMMWrite;
+  /// of a YMM or ZMM register without clearing the upper part.
+  bool HasFastPartialYMMorZMMWrite;
 
   /// True if hardware SQRTSS instruction is at least as fast (latency) as
   /// RSQRTSS followed by a Newton-Raphson iteration.
@@ -223,6 +229,9 @@ protected:
   /// True if LZCNT instruction is fast.
   bool HasFastLZCNT;
 
+  /// True if SHLD-based rotate is fast.
+  bool HasFastSHLDRotate;
+
   /// True if the short functions should be padded to prevent
   /// a stall when returning too early.
   bool PadShortFunctions;
@@ -265,24 +274,12 @@ protected:
   /// Processor supports MPX - Memory Protection Extensions
   bool HasMPX;
 
-  /// Processor supports Invalidate Process-Context Identifier
-  bool HasInvPCId;
-
-  /// Processor has VM Functions
-  bool HasVMFUNC;
-
-  /// Processor has Supervisor Mode Access Protection
-  bool HasSMAP;
-
   /// Processor has Software Guard Extensions
   bool HasSGX;
 
   /// Processor supports Flush Cache Line instruction
   bool HasCLFLUSHOPT;
 
-  /// Processor has Persistent Commit feature
-  bool HasPCOMMIT;
-
   /// Processor supports Cache Line Write Back instruction
   bool HasCLWB;
 
@@ -307,8 +304,8 @@ protected:
   /// This is used to avoid ifndefs spreading around while GISel is
   /// an optional library.
   std::unique_ptr<GISelAccessor> GISel;
-private:
 
+private:
   /// Override the stack alignment.
   unsigned StackAlignOverride;
 
@@ -341,13 +338,17 @@ public:
   const X86TargetLowering *getTargetLowering() const override {
     return &TLInfo;
   }
+
   const X86InstrInfo *getInstrInfo() const override { return &InstrInfo; }
+
   const X86FrameLowering *getFrameLowering() const override {
     return &FrameLowering;
   }
+
   const X86SelectionDAGInfo *getSelectionDAGInfo() const override {
     return &TSInfo;
   }
+
   const X86RegisterInfo *getRegisterInfo() const override {
     return &getInstrInfo()->getRegisterInfo();
   }
@@ -370,12 +371,14 @@ public:
   const InstructionSelector *getInstructionSelector() const override;
   const LegalizerInfo *getLegalizerInfo() const override;
   const RegisterBankInfo *getRegBankInfo() const override;
+
 private:
   /// Initialize the full set of dependencies so we can use an initializer
   /// list for X86Subtarget.
   X86Subtarget &initializeSubtargetDependencies(StringRef CPU, StringRef FS);
   void initializeEnvironment();
   void initSubtargetFeatures(StringRef CPU, StringRef FS);
+
 public:
   /// Is this x86_64? (disregarding specific ABI / programming model)
   bool is64Bit() const {
@@ -432,9 +435,9 @@ public:
   bool hasPCLMUL() const { return HasPCLMUL; }
   // Prefer FMA4 to FMA - its better for commutation/memory folding and
   // has equal or better performance on all supported targets.
-  bool hasFMA() const { return HasFMA && !HasFMA4; }
+  bool hasFMA() const { return (HasFMA || hasAVX512()) && !HasFMA4; }
   bool hasFMA4() const { return HasFMA4; }
-  bool hasAnyFMA() const { return hasFMA() || hasFMA4() || hasAVX512(); }
+  bool hasAnyFMA() const { return hasFMA() || hasFMA4(); }
   bool hasXOP() const { return HasXOP; }
   bool hasTBM() const { return HasTBM; }
   bool hasMOVBE() const { return HasMOVBE; }
@@ -447,13 +450,13 @@ public:
   bool hasVBMI() const { return HasVBMI; }
   bool hasIFMA() const { return HasIFMA; }
   bool hasRTM() const { return HasRTM; }
-  bool hasHLE() const { return HasHLE; }
   bool hasADX() const { return HasADX; }
   bool hasSHA() const { return HasSHA; }
   bool hasPRFCHW() const { return HasPRFCHW; }
   bool hasRDSEED() const { return HasRDSEED; }
   bool hasLAHFSAHF() const { return HasLAHFSAHF; }
   bool hasMWAITX() const { return HasMWAITX; }
+  bool hasCLZERO() const { return HasCLZERO; }
   bool isBTMemSlow() const { return IsBTMemSlow; }
   bool isSHLDSlow() const { return IsSHLDSlow; }
   bool isPMULLDSlow() const { return IsPMULLDSlow; }
@@ -462,10 +465,13 @@ public:
   bool hasSSEUnalignedMem() const { return HasSSEUnalignedMem; }
   bool hasCmpxchg16b() const { return HasCmpxchg16b; }
   bool useLeaForSP() const { return UseLeaForSP; }
-  bool hasFastPartialYMMWrite() const { return HasFastPartialYMMWrite; }
+  bool hasFastPartialYMMorZMMWrite() const {
+    return HasFastPartialYMMorZMMWrite;
+  }
   bool hasFastScalarFSQRT() const { return HasFastScalarFSQRT; }
   bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; }
   bool hasFastLZCNT() const { return HasFastLZCNT; }
+  bool hasFastSHLDRotate() const { return HasFastSHLDRotate; }
   bool hasSlowDivide32() const { return HasSlowDivide32; }
   bool hasSlowDivide64() const { return HasSlowDivide64; }
   bool padShortFunctions() const { return PadShortFunctions; }
@@ -481,8 +487,9 @@ public:
   bool hasVLX() const { return HasVLX; }
   bool hasPKU() const { return HasPKU; }
   bool hasMPX() const { return HasMPX; }
+  bool hasCLFLUSHOPT() const { return HasCLFLUSHOPT; }
 
-  virtual bool isXRaySupported() const override { return is64Bit(); }
+  bool isXRaySupported() const override { return is64Bit(); }
 
   bool isAtom() const { return X86ProcFamily == IntelAtom; }
   bool isSLM() const { return X86ProcFamily == IntelSLM; }
@@ -513,6 +520,7 @@ public:
   bool isTargetNaCl32() const { return isTargetNaCl() && !is64Bit(); }
   bool isTargetNaCl64() const { return isTargetNaCl() && is64Bit(); }
   bool isTargetMCU() const { return TargetTriple.isOSIAMCU(); }
+  bool isTargetFuchsia() const { return TargetTriple.isOSFuchsia(); }
 
   bool isTargetWindowsMSVC() const {
     return TargetTriple.isWindowsMSVCEnvironment();
@@ -628,6 +636,6 @@ public:
   }
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_LIB_TARGET_X86_X86SUBTARGET_H
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index aa5cfc64e9ebce8b878076cf9afe7e28e64de51c..03a1958121ab8e32c7cb9ed46e148e32de01a6af 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -11,22 +11,47 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "X86TargetMachine.h"
+#include "MCTargetDesc/X86MCTargetDesc.h"
 #include "X86.h"
 #include "X86CallLowering.h"
+#include "X86LegalizerInfo.h"
+#ifdef LLVM_BUILD_GLOBAL_ISEL
+#include "X86RegisterBankInfo.h"
+#endif
+#include "X86MacroFusion.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
 #include "X86TargetObjectFile.h"
 #include "X86TargetTransformInfo.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/ExecutionDepsFix.h"
+#include "llvm/CodeGen/GlobalISel/CallLowering.h"
 #include "llvm/CodeGen/GlobalISel/GISelAccessor.h"
 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
+#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
+#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
 #include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Attributes.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
-#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
 #include "llvm/Support/CommandLine.h"
-#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetLoweringObjectFile.h"
 #include "llvm/Target/TargetOptions.h"
+#include <memory>
+#include <string>
+
 using namespace llvm;
 
 static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
@@ -34,8 +59,11 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
                                cl::init(true), cl::Hidden);
 
 namespace llvm {
+
 void initializeWinEHStatePassPass(PassRegistry &);
-}
+void initializeX86ExecutionDepsFixPass(PassRegistry &);
+
+} // end namespace llvm
 
 extern "C" void LLVMInitializeX86Target() {
   // Register the target.
@@ -47,27 +75,28 @@ extern "C" void LLVMInitializeX86Target() {
   initializeWinEHStatePassPass(PR);
   initializeFixupBWInstPassPass(PR);
   initializeEvexToVexInstPassPass(PR);
+  initializeX86ExecutionDepsFixPass(PR);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
   if (TT.isOSBinFormatMachO()) {
     if (TT.getArch() == Triple::x86_64)
-      return make_unique<X86_64MachoTargetObjectFile>();
-    return make_unique<TargetLoweringObjectFileMachO>();
+      return llvm::make_unique<X86_64MachoTargetObjectFile>();
+    return llvm::make_unique<TargetLoweringObjectFileMachO>();
   }
 
   if (TT.isOSFreeBSD())
-    return make_unique<X86FreeBSDTargetObjectFile>();
+    return llvm::make_unique<X86FreeBSDTargetObjectFile>();
   if (TT.isOSLinux() || TT.isOSNaCl())
-    return make_unique<X86LinuxNaClTargetObjectFile>();
+    return llvm::make_unique<X86LinuxNaClTargetObjectFile>();
   if (TT.isOSFuchsia())
-    return make_unique<X86FuchsiaTargetObjectFile>();
+    return llvm::make_unique<X86FuchsiaTargetObjectFile>();
   if (TT.isOSBinFormatELF())
-    return make_unique<X86ELFTargetObjectFile>();
+    return llvm::make_unique<X86ELFTargetObjectFile>();
   if (TT.isKnownWindowsMSVCEnvironment() || TT.isWindowsCoreCLREnvironment())
-    return make_unique<X86WindowsTargetObjectFile>();
+    return llvm::make_unique<X86WindowsTargetObjectFile>();
   if (TT.isOSBinFormatCOFF())
-    return make_unique<TargetLoweringObjectFileCOFF>();
+    return llvm::make_unique<TargetLoweringObjectFileCOFF>();
   llvm_unreachable("unknown subtarget type");
 }
 
@@ -177,31 +206,37 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
   initAsmInfo();
 }
 
-X86TargetMachine::~X86TargetMachine() {}
+X86TargetMachine::~X86TargetMachine() = default;
 
 #ifdef LLVM_BUILD_GLOBAL_ISEL
 namespace {
+
 struct X86GISelActualAccessor : public GISelAccessor {
-  std::unique_ptr<CallLowering> CL;
-  X86GISelActualAccessor(CallLowering* CL): CL(CL) {}
+  std::unique_ptr<CallLowering> CallLoweringInfo;
+  std::unique_ptr<LegalizerInfo> Legalizer;
+  std::unique_ptr<RegisterBankInfo> RegBankInfo;
+  std::unique_ptr<InstructionSelector> InstSelector;
+
   const CallLowering *getCallLowering() const override {
-    return CL.get();
+    return CallLoweringInfo.get();
   }
+
   const InstructionSelector *getInstructionSelector() const override {
-    //TODO: Implement
-    return nullptr;
+    return InstSelector.get();
   }
+
   const LegalizerInfo *getLegalizerInfo() const override {
-    //TODO: Implement
-    return nullptr;
+    return Legalizer.get();
   }
+
   const RegisterBankInfo *getRegBankInfo() const override {
-    //TODO: Implement
-    return nullptr;
+    return RegBankInfo.get();
   }
 };
-} // End anonymous namespace.
+
+} // end anonymous namespace
 #endif
+
 const X86Subtarget *
 X86TargetMachine::getSubtargetImpl(const Function &F) const {
   Attribute CPUAttr = F.getFnAttribute("target-cpu");
@@ -244,8 +279,14 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
 #ifndef LLVM_BUILD_GLOBAL_ISEL
     GISelAccessor *GISel = new GISelAccessor();
 #else
-    X86GISelActualAccessor *GISel = new X86GISelActualAccessor(
-        new X86CallLowering(*I->getTargetLowering()));
+    X86GISelActualAccessor *GISel = new X86GISelActualAccessor();
+
+    GISel->CallLoweringInfo.reset(new X86CallLowering(*I->getTargetLowering()));
+    GISel->Legalizer.reset(new X86LegalizerInfo(*I, *this));
+
+    auto *RBI = new X86RegisterBankInfo(*I->getRegisterInfo());
+    GISel->RegBankInfo.reset(RBI);
+    GISel->InstSelector.reset(createX86InstructionSelector(*I, *RBI));
 #endif
     I->setGISelAccessor(*GISel);
   }
@@ -270,12 +311,12 @@ TargetIRAnalysis X86TargetMachine::getTargetIRAnalysis() {
   });
 }
 
-
 //===----------------------------------------------------------------------===//
 // Pass Pipeline Configuration
 //===----------------------------------------------------------------------===//
 
 namespace {
+
 /// X86 Code Generator Pass Configuration Options.
 class X86PassConfig : public TargetPassConfig {
 public:
@@ -289,7 +330,7 @@ public:
   ScheduleDAGInstrs *
   createMachineScheduler(MachineSchedContext *C) const override {
     ScheduleDAGMILive *DAG = createGenericSchedLive(C);
-    DAG->addMutation(createMacroFusionDAGMutation(DAG->TII));
+    DAG->addMutation(createX86MacroFusionDAGMutation());
     return DAG;
   }
 
@@ -301,14 +342,28 @@ public:
   bool addRegBankSelect() override;
   bool addGlobalInstructionSelect() override;
 #endif
-bool addILPOpts() override;
+  bool addILPOpts() override;
   bool addPreISel() override;
   void addPreRegAlloc() override;
   void addPostRegAlloc() override;
   void addPreEmitPass() override;
   void addPreSched2() override;
 };
-} // namespace
+
+class X86ExecutionDepsFix : public ExecutionDepsFix {
+public:
+  static char ID; // Forwarded to the ExecutionDepsFix base constructor.
+  X86ExecutionDepsFix() : ExecutionDepsFix(ID, X86::VR128XRegClass) {}
+  StringRef getPassName() const override {
+    return "X86 Execution Dependency Fix";
+  }
+};
+char X86ExecutionDepsFix::ID; // Out-of-line definition of the static member.
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(X86ExecutionDepsFix, "x86-execution-deps-fix",
+                "X86 Execution Dependency Fix", false, false)
 
 TargetPassConfig *X86TargetMachine::createPassConfig(PassManagerBase &PM) {
   return new X86PassConfig(this, PM);
@@ -343,17 +398,17 @@ bool X86PassConfig::addIRTranslator() {
 }
 
 bool X86PassConfig::addLegalizeMachineIR() {
-  //TODO: Implement
+  addPass(new Legalizer());
   return false;
 }
 
 bool X86PassConfig::addRegBankSelect() {
-  //TODO: Implement
+  addPass(new RegBankSelect());
   return false;
 }
 
 bool X86PassConfig::addGlobalInstructionSelect() {
-  //TODO: Implement
+  addPass(new InstructionSelect());
   return false;
 }
 #endif
@@ -391,7 +446,7 @@ void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); }
 
 void X86PassConfig::addPreEmitPass() {
   if (getOptLevel() != CodeGenOpt::None)
-    addPass(createExecutionDependencyFixPass(&X86::VR128XRegClass));
+    addPass(new X86ExecutionDepsFix());
 
   if (UseVZeroUpper)
     addPass(createX86IssueVZeroUpperPass());
diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h
index d756d07926dd000cbd1e5d5451679736218bddf6..cf933f52604ef8f35f4841e7a1f6d1706b757a59 100644
--- a/lib/Target/X86/X86TargetMachine.h
+++ b/lib/Target/X86/X86TargetMachine.h
@@ -13,14 +13,20 @@
 
 #ifndef LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
 #define LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
-#include "X86InstrInfo.h"
+
 #include "X86Subtarget.h"
-#include "llvm/IR/DataLayout.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Support/CodeGen.h"
 #include "llvm/Target/TargetMachine.h"
+#include <memory>
 
 namespace llvm {
 
 class StringRef;
+class X86Subtarget;
+class X86RegisterBankInfo;
 
 class X86TargetMachine final : public LLVMTargetMachine {
   std::unique_ptr<TargetLoweringObjectFile> TLOF;
@@ -32,17 +38,19 @@ public:
                    Optional<Reloc::Model> RM, CodeModel::Model CM,
                    CodeGenOpt::Level OL);
   ~X86TargetMachine() override;
+
   const X86Subtarget *getSubtargetImpl(const Function &F) const override;
 
   TargetIRAnalysis getTargetIRAnalysis() override;
 
   // Set up the pass pipeline.
   TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
+
   TargetLoweringObjectFile *getObjFileLowering() const override {
     return TLOF.get();
   }
 };
 
-} // End llvm namespace
+} // end namespace llvm
 
-#endif
+#endif // LLVM_LIB_TARGET_X86_X86TARGETMACHINE_H
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index 3ebfdd0ede7867f598f8c1a14d419a80bceab539..b742fb472372cd9f84c6159c6a98d7610caf94a0 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -78,7 +78,7 @@ unsigned X86TTIImpl::getNumberOfRegisters(bool Vector) {
   return 8;
 }
 
-unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
+unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) const {
   if (Vector) {
     if (ST->hasAVX512())
       return 512;
@@ -95,6 +95,10 @@ unsigned X86TTIImpl::getRegisterBitWidth(bool Vector) {
   return 32;
 }
 
+unsigned X86TTIImpl::getLoadStoreVecRegBitWidth(unsigned /*unused*/) const {
+  return getRegisterBitWidth(/*Vector=*/true); // Reuse the vector width.
+}
+
 unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
   // If the loop will not be vectorized, don't interleave the loop.
   // Let regular unroll to unroll the loop, which saves the overflow
@@ -819,7 +823,14 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
     { TTI::SK_Reverse,   MVT::v32i8,  2 }, // vperm2i128 + pshufb
 
     { TTI::SK_Alternate, MVT::v16i16, 1 }, // vpblendw
-    { TTI::SK_Alternate, MVT::v32i8,  1 }  // vpblendvb
+    { TTI::SK_Alternate, MVT::v32i8,  1 }, // vpblendvb
+
+    { TTI::SK_PermuteSingleSrc, MVT::v4i64,  1 }, // vpermq
+    { TTI::SK_PermuteSingleSrc, MVT::v8i32,  1 }, // vpermd
+    { TTI::SK_PermuteSingleSrc, MVT::v16i16, 4 }, // vperm2i128 + 2 * vpshufb
+                                                  // + vpblendvb
+    { TTI::SK_PermuteSingleSrc, MVT::v32i8,  4 }  // vperm2i128 + 2 * vpshufb
+                                                  // + vpblendvb
   };
 
   if (ST->hasAVX2())
@@ -876,7 +887,10 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
     { TTI::SK_Reverse,   MVT::v16i8,  1 }, // pshufb
 
     { TTI::SK_Alternate, MVT::v8i16,  3 }, // pshufb + pshufb + por
-    { TTI::SK_Alternate, MVT::v16i8,  3 }  // pshufb + pshufb + por
+    { TTI::SK_Alternate, MVT::v16i8,  3 }, // pshufb + pshufb + por
+
+    { TTI::SK_PermuteSingleSrc, MVT::v8i16, 1 }, // pshufb
+    { TTI::SK_PermuteSingleSrc, MVT::v16i8, 1 }  // pshufb
   };
 
   if (ST->hasSSSE3())
@@ -901,7 +915,10 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
     { TTI::SK_Alternate, MVT::v2f64,  1 }, // movsd
     { TTI::SK_Alternate, MVT::v4i32,  2 }, // 2*shufps
     { TTI::SK_Alternate, MVT::v8i16,  3 }, // pand + pandn + por
-    { TTI::SK_Alternate, MVT::v16i8,  3 }  // pand + pandn + por
+    { TTI::SK_Alternate, MVT::v16i8,  3 }, // pand + pandn + por
+
+    { TTI::SK_PermuteSingleSrc, MVT::v2i64, 1 }, // pshufd
+    { TTI::SK_PermuteSingleSrc, MVT::v4i32, 1 }  // pshufd
   };
 
   if (ST->hasSSE2())
@@ -921,7 +938,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
   return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
 }
 
-int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
+int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                                 const Instruction *I) {
   int ISD = TLI->InstructionOpcodeToISD(Opcode);
   assert(ISD && "Invalid opcode");
 
@@ -1287,7 +1305,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src) {
   return BaseT::getCastInstrCost(Opcode, Dst, Src);
 }
 
-int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
+int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                                   const Instruction *I) {
   // Legalize the type.
   std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
 
@@ -1353,11 +1372,12 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy) {
     if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
       return LT.first * Entry->Cost;
 
-  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy);
+  return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
 }
 
 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                      ArrayRef<Type *> Tys, FastMathFlags FMF) {
+                                      ArrayRef<Type *> Tys, FastMathFlags FMF,
+                                      unsigned ScalarizationCostPassed) {
   // Costs should match the codegen from:
   // BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
   // BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
@@ -1433,8 +1453,8 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::FSQRT,      MVT::v4f64,  43 }, // SNB from http://www.agner.org/
   };
   static const CostTblEntry SSE42CostTbl[] = {
-    { ISD::FSQRT, MVT::f32,   18 }, // Nehalem from http://www.agner.org/
-    { ISD::FSQRT, MVT::v4f32, 18 }, // Nehalem from http://www.agner.org/
+    { ISD::FSQRT,      MVT::f32,    18 }, // Nehalem from http://www.agner.org/
+    { ISD::FSQRT,      MVT::v4f32,  18 }, // Nehalem from http://www.agner.org/
   };
   static const CostTblEntry SSSE3CostTbl[] = {
     { ISD::BITREVERSE, MVT::v2i64,   5 },
@@ -1458,6 +1478,10 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::CTTZ,       MVT::v16i8,   9 }
   };
   static const CostTblEntry SSE2CostTbl[] = {
+    { ISD::BITREVERSE, MVT::v2i64,  29 },
+    { ISD::BITREVERSE, MVT::v4i32,  27 },
+    { ISD::BITREVERSE, MVT::v8i16,  27 },
+    { ISD::BITREVERSE, MVT::v16i8,  20 },
     { ISD::BSWAP,      MVT::v2i64,   7 },
     { ISD::BSWAP,      MVT::v4i32,   7 },
     { ISD::BSWAP,      MVT::v8i16,   7 },
@@ -1477,8 +1501,16 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     { ISD::FSQRT,      MVT::v2f64,  32 }, // Nehalem from http://www.agner.org/
   };
   static const CostTblEntry SSE1CostTbl[] = {
-    { ISD::FSQRT, MVT::f32,   28 }, // Pentium III from http://www.agner.org/
-    { ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
+    { ISD::FSQRT,      MVT::f32,    28 }, // Pentium III from http://www.agner.org/
+    { ISD::FSQRT,      MVT::v4f32,  56 }, // Pentium III from http://www.agner.org/
+  };
+  static const CostTblEntry X64CostTbl[] = { // 64-bit targets
+    { ISD::BITREVERSE, MVT::i64,    14 }
+  };
+  static const CostTblEntry X86CostTbl[] = { // 32 or 64-bit targets
+    { ISD::BITREVERSE, MVT::i32,    14 },
+    { ISD::BITREVERSE, MVT::i16,    14 },
+    { ISD::BITREVERSE, MVT::i8,     11 }
   };
 
   unsigned ISD = ISD::DELETED_NODE;
@@ -1538,12 +1570,19 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
     if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
       return LT.first * Entry->Cost;
 
-  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF);
+  if (ST->is64Bit())
+    if (const auto *Entry = CostTableLookup(X64CostTbl, ISD, MTy))
+      return LT.first * Entry->Cost;
+
+  if (const auto *Entry = CostTableLookup(X86CostTbl, ISD, MTy))
+    return LT.first * Entry->Cost;
+
+  return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
 }
 
 int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                                      ArrayRef<Value *> Args, FastMathFlags FMF) {
-  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF);
+                     ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) {
+  return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
 }
 
 int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
@@ -1578,7 +1617,7 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
 }
 
 int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                                unsigned AddressSpace) {
+                                unsigned AddressSpace, const Instruction *I) {
   // Handle non-power-of-two vectors such as <3 x float>
   if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
     unsigned NumElem = VTy->getVectorNumElements();
diff --git a/lib/Target/X86/X86TargetTransformInfo.h b/lib/Target/X86/X86TargetTransformInfo.h
index 63a1493002fff250f79120ff305be32e7a02bc89..9bef9e80c395ca20cb1d5435ab554c10b9f895e4 100644
--- a/lib/Target/X86/X86TargetTransformInfo.h
+++ b/lib/Target/X86/X86TargetTransformInfo.h
@@ -51,7 +51,8 @@ public:
   /// @{
 
   unsigned getNumberOfRegisters(bool Vector);
-  unsigned getRegisterBitWidth(bool Vector);
+  unsigned getRegisterBitWidth(bool Vector) const;
+  unsigned getLoadStoreVecRegBitWidth(unsigned AS) const;
   unsigned getMaxInterleaveFactor(unsigned VF);
   int getArithmeticInstrCost(
       unsigned Opcode, Type *Ty,
@@ -61,11 +62,13 @@ public:
       TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
       ArrayRef<const Value *> Args = ArrayRef<const Value *>());
   int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
-  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src);
-  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy);
+  int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                       const Instruction *I = nullptr);
+  int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+                         const Instruction *I = nullptr);
   int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
   int getMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
-                      unsigned AddressSpace);
+                      unsigned AddressSpace, const Instruction *I = nullptr);
   int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
                             unsigned AddressSpace);
   int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
@@ -74,9 +77,11 @@ public:
                                 const SCEV *Ptr);
 
   int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                            ArrayRef<Type *> Tys, FastMathFlags FMF);
+                            ArrayRef<Type *> Tys, FastMathFlags FMF,
+                            unsigned ScalarizationCostPassed = UINT_MAX);
   int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
-                            ArrayRef<Value *> Args, FastMathFlags FMF);
+                            ArrayRef<Value *> Args, FastMathFlags FMF,
+                            unsigned VF = 1);
 
   int getReductionCost(unsigned Opcode, Type *Ty, bool IsPairwiseForm);
 
diff --git a/lib/Target/X86/X86VZeroUpper.cpp b/lib/Target/X86/X86VZeroUpper.cpp
index 9766b84be6521f0ac5041264721eba28267bf9f2..d17dfac6a99744b9115df4e588f6683e8f9d75cb 100644
--- a/lib/Target/X86/X86VZeroUpper.cpp
+++ b/lib/Target/X86/X86VZeroUpper.cpp
@@ -56,11 +56,11 @@ namespace {
 
     // Core algorithm state:
     // BlockState - Each block is either:
-    //   - PASS_THROUGH: There are neither YMM dirtying instructions nor
+    //   - PASS_THROUGH: There are neither YMM/ZMM dirtying instructions nor
     //                   vzeroupper instructions in this block.
     //   - EXITS_CLEAN: There is (or will be) a vzeroupper instruction in this
-    //                  block that will ensure that YMM is clean on exit.
-    //   - EXITS_DIRTY: An instruction in the block dirties YMM and no
+    //                  block that will ensure that YMM/ZMM is clean on exit.
+    //   - EXITS_DIRTY: An instruction in the block dirties YMM/ZMM and no
     //                  subsequent vzeroupper in the block clears it.
     //
     // AddedToDirtySuccessors - This flag is raised when a block is added to the
@@ -97,6 +97,7 @@ FunctionPass *llvm::createX86IssueVZeroUpperPass() {
   return new VZeroUpperInserter();
 }
 
+#ifndef NDEBUG
 const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
   switch (ST) {
     case PASS_THROUGH: return "Pass-through";
@@ -105,52 +106,56 @@ const char* VZeroUpperInserter::getBlockExitStateName(BlockExitState ST) {
   }
   llvm_unreachable("Invalid block exit state.");
 }
+#endif
 
-static bool isYmmReg(unsigned Reg) {
-  return (Reg >= X86::YMM0 && Reg <= X86::YMM15);
+/// VZEROUPPER cleans state that is related to Y/ZMM0-15 only.
+/// Thus, there is no need to check for Y/ZMM16 and above.
+static bool isYmmOrZmmReg(unsigned Reg) {
+  return (Reg >= X86::YMM0 && Reg <= X86::YMM15) ||
+         (Reg >= X86::ZMM0 && Reg <= X86::ZMM15);
 }
 
-static bool checkFnHasLiveInYmm(MachineRegisterInfo &MRI) {
+static bool checkFnHasLiveInYmmOrZmm(MachineRegisterInfo &MRI) {
   for (MachineRegisterInfo::livein_iterator I = MRI.livein_begin(),
        E = MRI.livein_end(); I != E; ++I)
-    if (isYmmReg(I->first))
+    if (isYmmOrZmmReg(I->first))
       return true;
 
   return false;
 }
 
-static bool clobbersAllYmmRegs(const MachineOperand &MO) {
+static bool clobbersAllYmmAndZmmRegs(const MachineOperand &MO) {
   for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
     if (!MO.clobbersPhysReg(reg))
       return false;
   }
+  for (unsigned reg = X86::ZMM0; reg <= X86::ZMM15; ++reg) {
+    if (!MO.clobbersPhysReg(reg))
+      return false;
+  }
   return true;
 }
 
-static bool hasYmmReg(MachineInstr &MI) {
+static bool hasYmmOrZmmReg(MachineInstr &MI) {
   for (const MachineOperand &MO : MI.operands()) {
-    if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmRegs(MO))
+    if (MI.isCall() && MO.isRegMask() && !clobbersAllYmmAndZmmRegs(MO))
       return true;
     if (!MO.isReg())
       continue;
     if (MO.isDebug())
       continue;
-    if (isYmmReg(MO.getReg()))
+    if (isYmmOrZmmReg(MO.getReg()))
       return true;
   }
   return false;
 }
 
-/// Check if any YMM register will be clobbered by this instruction.
-static bool callClobbersAnyYmmReg(MachineInstr &MI) {
+/// Check if given call instruction has a RegMask operand.
+static bool callHasRegMask(MachineInstr &MI) {
   assert(MI.isCall() && "Can only be called on call instructions.");
   for (const MachineOperand &MO : MI.operands()) {
-    if (!MO.isRegMask())
-      continue;
-    for (unsigned reg = X86::YMM0; reg <= X86::YMM15; ++reg) {
-      if (MO.clobbersPhysReg(reg))
-        return true;
-    }
+    if (MO.isRegMask())
+      return true;
   }
   return false;
 }
@@ -175,17 +180,20 @@ void VZeroUpperInserter::addDirtySuccessor(MachineBasicBlock &MBB) {
 /// Loop over all of the instructions in the basic block, inserting vzeroupper
 /// instructions before function calls.
 void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
-
   // Start by assuming that the block is PASS_THROUGH which implies no unguarded
   // calls.
   BlockExitState CurState = PASS_THROUGH;
   BlockStates[MBB.getNumber()].FirstUnguardedCall = MBB.end();
 
   for (MachineInstr &MI : MBB) {
+    bool IsCall = MI.isCall();
+    bool IsReturn = MI.isReturn();
+    bool IsControlFlow = IsCall || IsReturn;
+
     // No need for vzeroupper before iret in interrupt handler function,
-    // epilogue will restore YMM registers if needed.
-    bool IsReturnFromX86INTR = IsX86INTR && MI.isReturn();
-    bool IsControlFlow = MI.isCall() || MI.isReturn();
+    // epilogue will restore YMM/ZMM registers if needed.
+    if (IsX86INTR && IsReturn)
+      continue;
 
     // An existing VZERO* instruction resets the state.
     if (MI.getOpcode() == X86::VZEROALL || MI.getOpcode() == X86::VZEROUPPER) {
@@ -194,30 +202,30 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
     }
 
     // Shortcut: don't need to check regular instructions in dirty state.
-    if ((!IsControlFlow || IsReturnFromX86INTR) && CurState == EXITS_DIRTY)
+    if (!IsControlFlow && CurState == EXITS_DIRTY)
       continue;
 
-    if (hasYmmReg(MI)) {
-      // We found a ymm-using instruction; this could be an AVX instruction,
-      // or it could be control flow.
+    if (hasYmmOrZmmReg(MI)) {
+      // We found a ymm/zmm-using instruction; this could be an AVX/AVX512
+      // instruction, or it could be control flow.
       CurState = EXITS_DIRTY;
       continue;
     }
 
     // Check for control-flow out of the current function (which might
     // indirectly execute SSE instructions).
-    if (!IsControlFlow || IsReturnFromX86INTR)
+    if (!IsControlFlow)
       continue;
 
-    // If the call won't clobber any YMM register, skip it as well. It usually
-    // happens on helper function calls (such as '_chkstk', '_ftol2') where
-    // standard calling convention is not used (RegMask is not used to mark
-    // register clobbered and register usage (def/imp-def/use) is well-defined
-    // and explicitly specified.
-    if (MI.isCall() && !callClobbersAnyYmmReg(MI))
+    // If the call has no RegMask, skip it as well. It usually happens on
+    // helper function calls (such as '_chkstk', '_ftol2') where standard
+    // calling convention is not used (RegMask is not used to mark register
+    // clobbered and register usage (def/imp-def/use) is well-defined and
+    // explicitly specified).
+    if (IsCall && !callHasRegMask(MI))
       continue;
 
-    // The VZEROUPPER instruction resets the upper 128 bits of all AVX
+    // The VZEROUPPER instruction resets the upper 128 bits of YMM0-YMM15
     // registers. In addition, the processor changes back to Clean state, after
     // which execution of SSE instructions or AVX instructions has no transition
     // penalty. Add the VZEROUPPER instruction before any function call/return
@@ -226,7 +234,7 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
     // predecessor block.
     if (CurState == EXITS_DIRTY) {
       // After the inserted VZEROUPPER the state becomes clean again, but
-      // other YMM may appear before other subsequent calls or even before
+      // other YMM/ZMM may appear before other subsequent calls or even before
       // the end of the BB.
       insertVZeroUpper(MI, MBB);
       CurState = EXITS_CLEAN;
@@ -257,30 +265,32 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
 /// function calls.
 bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
   const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
-  if (!ST.hasAVX() || ST.hasAVX512() || ST.hasFastPartialYMMWrite())
+  if (!ST.hasAVX() || ST.hasFastPartialYMMorZMMWrite())
     return false;
   TII = ST.getInstrInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   EverMadeChange = false;
   IsX86INTR = MF.getFunction()->getCallingConv() == CallingConv::X86_INTR;
 
-  bool FnHasLiveInYmm = checkFnHasLiveInYmm(MRI);
-
-  // Fast check: if the function doesn't use any ymm registers, we don't need
-  // to insert any VZEROUPPER instructions.  This is constant-time, so it is
-  // cheap in the common case of no ymm use.
-  bool YMMUsed = FnHasLiveInYmm;
-  if (!YMMUsed) {
-    const TargetRegisterClass *RC = &X86::VR256RegClass;
-    for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
-         i++) {
-      if (!MRI.reg_nodbg_empty(*i)) {
-        YMMUsed = true;
-        break;
+  bool FnHasLiveInYmmOrZmm = checkFnHasLiveInYmmOrZmm(MRI);
+
+  // Fast check: if the function doesn't use any ymm/zmm registers, we don't
+  // need to insert any VZEROUPPER instructions.  This is constant-time, so it
+  // is cheap in the common case of no ymm/zmm use.
+  bool YmmOrZmmUsed = FnHasLiveInYmmOrZmm;
+  const TargetRegisterClass *RCs[2] = {&X86::VR256RegClass, &X86::VR512RegClass};
+  for (auto *RC : RCs) {
+    if (!YmmOrZmmUsed) {
+      for (TargetRegisterClass::iterator i = RC->begin(), e = RC->end(); i != e;
+           i++) {
+        if (!MRI.reg_nodbg_empty(*i)) {
+          YmmOrZmmUsed = true;
+          break;
+        }
       }
     }
   }
-  if (!YMMUsed) {
+  if (!YmmOrZmmUsed) {
     return false;
   }
 
@@ -294,9 +304,9 @@ bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
   for (MachineBasicBlock &MBB : MF)
     processBasicBlock(MBB);
 
-  // If any YMM regs are live-in to this function, add the entry block to the
-  // DirtySuccessors list
-  if (FnHasLiveInYmm)
+  // If any YMM/ZMM regs are live-in to this function, add the entry block to
+  // the DirtySuccessors list.
+  if (FnHasLiveInYmmOrZmm)
     addDirtySuccessor(MF.front());
 
   // Re-visit all blocks that are successors of EXITS_DIRTY blocks. Add
diff --git a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
index 500c84d2a41827f582fc917554bdb757380da25c..b03c1852281d149628e71227a2f50a340e85902f 100644
--- a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
+++ b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.cpp
@@ -12,13 +12,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "XCoreInstPrinter.h"
-#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCExpr.h"
 #include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCInstrInfo.h"
 #include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/Casting.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/raw_ostream.h"
+#include <cassert>
+
 using namespace llvm;
 
 #define DEBUG_TYPE "asm-printer"
diff --git a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
index dc513f7b225b176214c84e0266d330c689d87ef0..8a7efe2e39c61a00c1a1583ef178d6292a7b46ea 100644
--- a/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
+++ b/lib/Target/XCore/InstPrinter/XCoreInstPrinter.h
@@ -15,6 +15,8 @@
 
 #ifndef LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H
 #define LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H
+
+#include "llvm/ADT/StringRef.h"
 #include "llvm/MC/MCInstPrinter.h"
 
 namespace llvm {
@@ -32,12 +34,14 @@ public:
   void printRegName(raw_ostream &OS, unsigned RegNo) const override;
   void printInst(const MCInst *MI, raw_ostream &O, StringRef Annot,
                  const MCSubtargetInfo &STI) override;
+
 private:
   void printInlineJT(const MCInst *MI, int opNum, raw_ostream &O);
   void printInlineJT32(const MCInst *MI, int opNum, raw_ostream &O);
   void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
   void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O);
 };
+
 } // end namespace llvm
 
-#endif
+#endif // LLVM_LIB_TARGET_XCORE_INSTPRINTER_XCOREINSTPRINTER_H
diff --git a/lib/Target/XCore/XCoreFrameLowering.cpp b/lib/Target/XCore/XCoreFrameLowering.cpp
index e0e2e0319964077271d6473ae420adf16e77530f..a752357400b3eea3718eb4698a77eac2bfbfb887 100644
--- a/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -238,7 +238,7 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF,
     report_fatal_error("emitPrologue unsupported alignment: "
                        + Twine(MFI.getMaxAlignment()));
 
-  const AttributeSet &PAL = MF.getFunction()->getAttributes();
+  const AttributeList &PAL = MF.getFunction()->getAttributes();
   if (PAL.hasAttrSomewhere(Attribute::Nest))
     BuildMI(MBB, MBBI, dl, TII.get(XCore::LDWSP_ru6), XCore::R11).addImm(0);
     // FIX: Needs addMemOperand() but can't use getFixedStack() or getStack().
diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp
index 9244d594460ffb6f41e8f3f0376fbdd7867fc304..45437815fa371a167eba16291d8db0f3314b28b0 100644
--- a/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/lib/Target/XCore/XCoreISelLowering.cpp
@@ -483,7 +483,7 @@ LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
   Args.push_back(Entry);
 
   TargetLowering::CallLoweringInfo CLI(DAG);
-  CLI.setDebugLoc(DL).setChain(Chain).setCallee(
+  CLI.setDebugLoc(DL).setChain(Chain).setLibCallee(
       CallingConv::C, IntPtrTy,
       DAG.getExternalSymbol("__misaligned_load",
                             getPointerTy(DAG.getDataLayout())),
@@ -1824,6 +1824,7 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
 void XCoreTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
                                                         APInt &KnownZero,
                                                         APInt &KnownOne,
+                                                        const APInt &DemandedElts,
                                                         const SelectionDAG &DAG,
                                                         unsigned Depth) const {
   KnownZero = KnownOne = APInt(KnownZero.getBitWidth(), 0);
diff --git a/lib/Target/XCore/XCoreISelLowering.h b/lib/Target/XCore/XCoreISelLowering.h
index 41813bbb8156be90d36a908b0ac1d9d339c86018..188f4f1fa06b0a3250d7a59e1d29d4a102a849d6 100644
--- a/lib/Target/XCore/XCoreISelLowering.h
+++ b/lib/Target/XCore/XCoreISelLowering.h
@@ -202,6 +202,7 @@ namespace llvm {
     void computeKnownBitsForTargetNode(const SDValue Op,
                                        APInt &KnownZero,
                                        APInt &KnownOne,
+                                       const APInt &DemandedElts,
                                        const SelectionDAG &DAG,
                                        unsigned Depth = 0) const override;
 
diff --git a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
index c03b0afceba375d5559a90c737a1c3d945222d28..646309e02de8252745aa562144646a63a1afa14d 100644
--- a/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
+++ b/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@@ -35,11 +35,11 @@ SDValue XCoreSelectionDAGInfo::EmitTargetCodeForMemcpy(
     TargetLowering::CallLoweringInfo CLI(DAG);
     CLI.setDebugLoc(dl)
         .setChain(Chain)
-        .setCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
-                   Type::getVoidTy(*DAG.getContext()),
-                   DAG.getExternalSymbol("__memcpy_4",
-                                         TLI.getPointerTy(DAG.getDataLayout())),
-                   std::move(Args))
+        .setLibCallee(TLI.getLibcallCallingConv(RTLIB::MEMCPY),
+                      Type::getVoidTy(*DAG.getContext()),
+                      DAG.getExternalSymbol(
+                          "__memcpy_4", TLI.getPointerTy(DAG.getDataLayout())),
+                      std::move(Args))
         .setDiscardResult();
 
     std::pair<SDValue,SDValue> CallResult = TLI.LowerCallTo(CLI);
diff --git a/lib/Transforms/Coroutines/CoroElide.cpp b/lib/Transforms/Coroutines/CoroElide.cpp
index 99974d8da64c0ca31cc0b18fa6a2b9e7b6ec7d9f..c6ac3f614ff7eec47e489542ab566a8b5d710413 100644
--- a/lib/Transforms/Coroutines/CoroElide.cpp
+++ b/lib/Transforms/Coroutines/CoroElide.cpp
@@ -92,7 +92,7 @@ static void removeTailCallAttribute(AllocaInst *Frame, AAResults &AA) {
 
 // Given a resume function @f.resume(%f.frame* %frame), returns %f.frame type.
 static Type *getFrameType(Function *Resume) {
-  auto *ArgType = Resume->getArgumentList().front().getType();
+  auto *ArgType = Resume->arg_begin()->getType();
   return cast<PointerType>(ArgType)->getElementType();
 }
 
@@ -127,7 +127,8 @@ void Lowerer::elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA) {
   // is spilled into the coroutine frame and recreate the alignment information
   // here. Possibly we will need to do a mini SROA here and break the coroutine
   // frame into individual AllocaInst recreating the original alignment.
-  auto *Frame = new AllocaInst(FrameTy, "", InsertPt);
+  const DataLayout &DL = F->getParent()->getDataLayout();
+  auto *Frame = new AllocaInst(FrameTy, DL.getAllocaAddrSpace(), "", InsertPt);
   auto *FrameVoidPtr =
       new BitCastInst(Frame, Type::getInt8PtrTy(C), "vFrame", InsertPt);
 
diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp
index d7eaeaff87eae2c3868f0c14a39877b94a29a6c9..19e6789dfa74a09b7c01c8f0e37070c07c49c5c2 100644
--- a/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -435,6 +435,10 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) {
           // normal edge and insert the spill in the new block.
           auto NewBB = SplitEdge(II->getParent(), II->getNormalDest());
           InsertPt = NewBB->getTerminator();
+        } else if (isa<PHINode>(CurrentValue)) {
+          // Skip past the PHI nodes and any EH pad instruction.
+          InsertPt =
+              &*cast<Instruction>(E.def())->getParent()->getFirstInsertionPt();
         } else {
           // For all other values, the spill is placed immediately after
           // the definition.
@@ -701,13 +705,12 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
           Spills.emplace_back(&I, U);
 
   // Rewrite materializable instructions to be materialized at the use point.
-  std::sort(Spills.begin(), Spills.end());
   DEBUG(dump("Materializations", Spills));
   rewriteMaterializableInstructions(Builder, Spills);
 
   // Collect the spills for arguments and other not-materializable values.
   Spills.clear();
-  for (Argument &A : F.getArgumentList())
+  for (Argument &A : F.args())
     for (User *U : A.users())
       if (Checker.isDefinitionAcrossSuspend(A, U))
         Spills.emplace_back(&A, U);
@@ -733,7 +736,6 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
         Spills.emplace_back(&I, U);
       }
   }
-  std::sort(Spills.begin(), Spills.end());
   DEBUG(dump("Spills", Spills));
   moveSpillUsesAfterCoroBegin(F, Spills, Shape.CoroBegin);
   Shape.FrameTy = buildFrameType(F, Shape, Spills);
diff --git a/lib/Transforms/Coroutines/CoroSplit.cpp b/lib/Transforms/Coroutines/CoroSplit.cpp
index 7a3f4f60bae90d2d8cdf4de603a57229451e57fb..ab648f884c5b18bb7af7ef71ca222d7ed2f32008 100644
--- a/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -22,6 +22,7 @@
 #include "CoroInternal.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
 #include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/InstIterator.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Verifier.h"
@@ -144,6 +145,33 @@ static void replaceFallthroughCoroEnd(IntrinsicInst *End,
   BB->getTerminator()->eraseFromParent();
 }
 
+// In Resumers, we replace unwind coro.end with True to force the immediate
+// unwind to caller.
+static void replaceUnwindCoroEnds(coro::Shape &Shape, ValueToValueMapTy &VMap) {
+  if (Shape.CoroEnds.empty())
+    return;
+
+  LLVMContext &Context = Shape.CoroEnds.front()->getContext();
+  auto *True = ConstantInt::getTrue(Context);
+  for (CoroEndInst *CE : Shape.CoroEnds) {
+    if (!CE->isUnwind())
+      continue;
+
+    auto *NewCE = cast<IntrinsicInst>(VMap[CE]);
+
+    // If coro.end has an associated bundle, add cleanupret instruction.
+    if (auto Bundle = NewCE->getOperandBundle(LLVMContext::OB_funclet)) {
+      Value *FromPad = Bundle->Inputs[0];
+      auto *CleanupRet = CleanupReturnInst::Create(FromPad, nullptr, NewCE);
+      NewCE->getParent()->splitBasicBlock(NewCE);
+      CleanupRet->getParent()->getTerminator()->eraseFromParent();
+    }
+
+    NewCE->replaceAllUsesWith(True);
+    NewCE->eraseFromParent();
+  }
+}
+
 // Rewrite final suspend point handling. We do not use suspend index to
 // represent the final suspend point. Instead we zero-out ResumeFnAddr in the
 // coroutine frame, since it is undefined behavior to resume a coroutine
@@ -157,9 +185,9 @@ static void handleFinalSuspend(IRBuilder<> &Builder, Value *FramePtr,
                                coro::Shape &Shape, SwitchInst *Switch,
                                bool IsDestroy) {
   assert(Shape.HasFinalSuspend);
-  auto FinalCase = --Switch->case_end();
-  BasicBlock *ResumeBB = FinalCase.getCaseSuccessor();
-  Switch->removeCase(FinalCase);
+  auto FinalCaseIt = std::prev(Switch->case_end());
+  BasicBlock *ResumeBB = FinalCaseIt->getCaseSuccessor();
+  Switch->removeCase(FinalCaseIt);
   if (IsDestroy) {
     BasicBlock *OldSwitchBB = Switch->getParent();
     auto *NewSwitchBB = OldSwitchBB->splitBasicBlock(Switch, "Switch");
@@ -195,7 +223,7 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
   // Replace all args with undefs. The buildCoroutineFrame algorithm already
   // rewritten access to the args that occurs after suspend points with loads
   // and stores to/from the coroutine frame.
-  for (Argument &A : F.getArgumentList())
+  for (Argument &A : F.args())
     VMap[&A] = UndefValue::get(A.getType());
 
   SmallVector<ReturnInst *, 4> Returns;
@@ -216,9 +244,9 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
 
   // Remove old return attributes.
   NewF->removeAttributes(
-      AttributeSet::ReturnIndex,
-      AttributeSet::get(
-          NewF->getContext(), AttributeSet::ReturnIndex,
+      AttributeList::ReturnIndex,
+      AttributeList::get(
+          NewF->getContext(), AttributeList::ReturnIndex,
           AttributeFuncs::typeIncompatible(NewF->getReturnType())));
 
   // Make AllocaSpillBlock the new entry block.
@@ -236,7 +264,7 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
   IRBuilder<> Builder(&NewF->getEntryBlock().front());
 
   // Remap frame pointer.
-  Argument *NewFramePtr = &NewF->getArgumentList().front();
+  Argument *NewFramePtr = &*NewF->arg_begin();
   Value *OldFramePtr = cast<Value>(VMap[Shape.FramePtr]);
   NewFramePtr->takeName(OldFramePtr);
   OldFramePtr->replaceAllUsesWith(NewFramePtr);
@@ -270,9 +298,7 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
 
   // Remove coro.end intrinsics.
   replaceFallthroughCoroEnd(Shape.CoroEnds.front(), VMap);
-  // FIXME: coming in upcoming patches:
-  // replaceUnwindCoroEnds(Shape.CoroEnds, VMap);
-
+  replaceUnwindCoroEnds(Shape, VMap);
   // Eliminate coro.free from the clones, replacing it with 'null' in cleanup,
   // to suppress deallocation code.
   coro::replaceCoroFree(cast<CoroIdInst>(VMap[Shape.CoroBegin->getId()]),
@@ -284,8 +310,16 @@ static Function *createClone(Function &F, Twine Suffix, coro::Shape &Shape,
 }
 
 static void removeCoroEnds(coro::Shape &Shape) {
-  for (CoroEndInst *CE : Shape.CoroEnds)
+  if (Shape.CoroEnds.empty())
+    return;
+
+  LLVMContext &Context = Shape.CoroEnds.front()->getContext();
+  auto *False = ConstantInt::getFalse(Context);
+
+  for (CoroEndInst *CE : Shape.CoroEnds) {
+    CE->replaceAllUsesWith(False);
     CE->eraseFromParent();
+  }
 }
 
 static void replaceFrameSize(coro::Shape &Shape) {
diff --git a/lib/Transforms/Coroutines/Coroutines.cpp b/lib/Transforms/Coroutines/Coroutines.cpp
index 877ec34b4d3b2b64918e5912192c52b90c31ce61..ea48043f9381ffbd35c254816a63667ffde127c3 100644
--- a/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/lib/Transforms/Coroutines/Coroutines.cpp
@@ -245,9 +245,9 @@ void coro::Shape::buildFrom(Function &F) {
           if (CoroBegin)
             report_fatal_error(
                 "coroutine should have exactly one defining @llvm.coro.begin");
-          CB->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
-          CB->addAttribute(AttributeSet::ReturnIndex, Attribute::NoAlias);
-          CB->removeAttribute(AttributeSet::FunctionIndex,
+          CB->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
+          CB->addAttribute(AttributeList::ReturnIndex, Attribute::NoAlias);
+          CB->removeAttribute(AttributeList::FunctionIndex,
                               Attribute::NoDuplicate);
           CoroBegin = CB;
         }
diff --git a/lib/Transforms/IPO/ArgumentPromotion.cpp b/lib/Transforms/IPO/ArgumentPromotion.cpp
index a67332d067a855befddb6481fd9b9bc68540954d..c43557b4e1a9235577d708cca8eb94585c971a07 100644
--- a/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -29,7 +29,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/Transforms/IPO/ArgumentPromotion.h"
 #include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Analysis/AliasAnalysis.h"
@@ -37,6 +39,7 @@
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/CallGraph.h"
 #include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/LazyCallGraph.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/CFG.h"
@@ -67,9 +70,11 @@ typedef std::vector<uint64_t> IndicesVector;
 /// DoPromotion - This method actually performs the promotion of the specified
 /// arguments, and returns the new function.  At this point, we know that it's
 /// safe to do so.
-static CallGraphNode *
+static Function *
 doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
-            SmallPtrSetImpl<Argument *> &ByValArgsToTransform, CallGraph &CG) {
+            SmallPtrSetImpl<Argument *> &ByValArgsToTransform,
+            Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>>
+                ReplaceCallSite) {
 
   // Start by computing a new prototype for the function, which is the same as
   // the old function, but has modified arguments.
@@ -98,12 +103,10 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
   // that we are *not* promoting. For the ones that we do promote, the parameter
   // attributes are lost
   SmallVector<AttributeSet, 8> AttributesVec;
-  const AttributeSet &PAL = F->getAttributes();
+  const AttributeList &PAL = F->getAttributes();
 
   // Add any return attributes.
-  if (PAL.hasAttributes(AttributeSet::ReturnIndex))
-    AttributesVec.push_back(
-        AttributeSet::get(F->getContext(), PAL.getRetAttributes()));
+  AttributesVec.push_back(PAL.getRetAttributes());
 
   // First, determine the new argument list
   unsigned ArgIndex = 1;
@@ -114,16 +117,13 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
       Type *AgTy = cast<PointerType>(I->getType())->getElementType();
       StructType *STy = cast<StructType>(AgTy);
       Params.insert(Params.end(), STy->element_begin(), STy->element_end());
+      AttributesVec.insert(AttributesVec.end(), STy->getNumElements(),
+                           AttributeSet());
       ++NumByValArgsPromoted;
     } else if (!ArgsToPromote.count(&*I)) {
       // Unchanged argument
       Params.push_back(I->getType());
-      AttributeSet attrs = PAL.getParamAttributes(ArgIndex);
-      if (attrs.hasAttributes(ArgIndex)) {
-        AttrBuilder B(attrs, ArgIndex);
-        AttributesVec.push_back(
-            AttributeSet::get(F->getContext(), Params.size(), B));
-      }
+      AttributesVec.push_back(PAL.getParamAttributes(ArgIndex));
     } else if (I->use_empty()) {
       // Dead argument (which are always marked as promotable)
       ++NumArgumentsDead;
@@ -168,6 +168,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
         Params.push_back(GetElementPtrInst::getIndexedType(
             cast<PointerType>(I->getType()->getScalarType())->getElementType(),
             ArgIndex.second));
+        AttributesVec.push_back(AttributeSet());
         assert(Params.back());
       }
 
@@ -179,9 +180,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
   }
 
   // Add any function attributes.
-  if (PAL.hasAttributes(AttributeSet::FunctionIndex))
-    AttributesVec.push_back(
-        AttributeSet::get(FTy->getContext(), PAL.getFnAttributes()));
+  AttributesVec.push_back(PAL.getFnAttributes());
 
   Type *RetTy = FTy->getReturnType();
 
@@ -201,15 +200,12 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
 
   // Recompute the parameter attributes list based on the new arguments for
   // the function.
-  NF->setAttributes(AttributeSet::get(F->getContext(), AttributesVec));
+  NF->setAttributes(AttributeList::get(F->getContext(), AttributesVec));
   AttributesVec.clear();
 
   F->getParent()->getFunctionList().insert(F->getIterator(), NF);
   NF->takeName(F);
 
-  // Get a new callgraph node for NF.
-  CallGraphNode *NF_CGN = CG.getOrInsertFunction(NF);
-
   // Loop over all of the callers of the function, transforming the call sites
   // to pass in the loaded pointers.
   //
@@ -218,12 +214,10 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
     CallSite CS(F->user_back());
     assert(CS.getCalledFunction() == F);
     Instruction *Call = CS.getInstruction();
-    const AttributeSet &CallPAL = CS.getAttributes();
+    const AttributeList &CallPAL = CS.getAttributes();
 
     // Add any return attributes.
-    if (CallPAL.hasAttributes(AttributeSet::ReturnIndex))
-      AttributesVec.push_back(
-          AttributeSet::get(F->getContext(), CallPAL.getRetAttributes()));
+    AttributesVec.push_back(CallPAL.getRetAttributes());
 
     // Loop over the operands, inserting GEP and loads in the caller as
     // appropriate.
@@ -233,12 +227,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
          ++I, ++AI, ++ArgIndex)
       if (!ArgsToPromote.count(&*I) && !ByValArgsToTransform.count(&*I)) {
         Args.push_back(*AI); // Unmodified argument
-
-        if (CallPAL.hasAttributes(ArgIndex)) {
-          AttrBuilder B(CallPAL, ArgIndex);
-          AttributesVec.push_back(
-              AttributeSet::get(F->getContext(), Args.size(), B));
-        }
+        AttributesVec.push_back(CallPAL.getAttributes(ArgIndex));
       } else if (ByValArgsToTransform.count(&*I)) {
         // Emit a GEP and load for each element of the struct.
         Type *AgTy = cast<PointerType>(I->getType())->getElementType();
@@ -251,6 +240,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
               STy, *AI, Idxs, (*AI)->getName() + "." + Twine(i), Call);
           // TODO: Tell AA about the new values?
           Args.push_back(new LoadInst(Idx, Idx->getName() + ".val", Call));
+          AttributesVec.push_back(AttributeSet());
         }
       } else if (!I->use_empty()) {
         // Non-dead argument: insert GEPs and loads as appropriate.
@@ -293,23 +283,18 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
           newLoad->setAAMetadata(AAInfo);
 
           Args.push_back(newLoad);
+          AttributesVec.push_back(AttributeSet());
         }
       }
 
     // Push any varargs arguments on the list.
     for (; AI != CS.arg_end(); ++AI, ++ArgIndex) {
       Args.push_back(*AI);
-      if (CallPAL.hasAttributes(ArgIndex)) {
-        AttrBuilder B(CallPAL, ArgIndex);
-        AttributesVec.push_back(
-            AttributeSet::get(F->getContext(), Args.size(), B));
-      }
+      AttributesVec.push_back(CallPAL.getAttributes(ArgIndex));
     }
 
     // Add any function attributes.
-    if (CallPAL.hasAttributes(AttributeSet::FunctionIndex))
-      AttributesVec.push_back(
-          AttributeSet::get(Call->getContext(), CallPAL.getFnAttributes()));
+    AttributesVec.push_back(CallPAL.getFnAttributes());
 
     SmallVector<OperandBundleDef, 1> OpBundles;
     CS.getOperandBundlesAsDefs(OpBundles);
@@ -320,12 +305,12 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
                                Args, OpBundles, "", Call);
       cast<InvokeInst>(New)->setCallingConv(CS.getCallingConv());
       cast<InvokeInst>(New)->setAttributes(
-          AttributeSet::get(II->getContext(), AttributesVec));
+          AttributeList::get(II->getContext(), AttributesVec));
     } else {
       New = CallInst::Create(NF, Args, OpBundles, "", Call);
       cast<CallInst>(New)->setCallingConv(CS.getCallingConv());
       cast<CallInst>(New)->setAttributes(
-          AttributeSet::get(New->getContext(), AttributesVec));
+          AttributeList::get(New->getContext(), AttributesVec));
       cast<CallInst>(New)->setTailCallKind(
           cast<CallInst>(Call)->getTailCallKind());
     }
@@ -334,8 +319,8 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
     AttributesVec.clear();
 
     // Update the callgraph to know that the callsite has been transformed.
-    CallGraphNode *CalleeNode = CG[Call->getParent()->getParent()];
-    CalleeNode->replaceCallEdge(CS, CallSite(New), NF_CGN);
+    if (ReplaceCallSite)
+      (*ReplaceCallSite)(CS, CallSite(New));
 
     if (!Call->use_empty()) {
       Call->replaceAllUsesWith(New);
@@ -347,6 +332,8 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
     Call->eraseFromParent();
   }
 
+  const DataLayout &DL = F->getParent()->getDataLayout();
+
   // Since we have now created the new function, splice the body of the old
   // function right into the new function, leaving the old rotting hulk of the
   // function empty.
@@ -374,7 +361,8 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
 
       // Just add all the struct element types.
       Type *AgTy = cast<PointerType>(I->getType())->getElementType();
-      Value *TheAlloca = new AllocaInst(AgTy, nullptr, "", InsertPt);
+      Value *TheAlloca = new AllocaInst(AgTy, DL.getAllocaAddrSpace(), nullptr,
+                                        "", InsertPt);
       StructType *STy = cast<StructType>(AgTy);
       Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0),
                         nullptr};
@@ -463,18 +451,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
     std::advance(I2, ArgIndices.size());
   }
 
-  NF_CGN->stealCalledFunctionsFrom(CG[F]);
-
-  // Now that the old function is dead, delete it.  If there is a dangling
-  // reference to the CallgraphNode, just leave the dead function around for
-  // someone else to nuke.
-  CallGraphNode *CGN = CG[F];
-  if (CGN->getNumReferences() == 0)
-    delete CG.removeFunctionFromModule(CGN);
-  else
-    F->setLinkage(Function::ExternalLinkage);
-
-  return NF_CGN;
+  return NF;
 }
 
 /// AllCallersPassInValidPointerForArgument - Return true if we can prove that
@@ -818,14 +795,13 @@ static bool canPaddingBeAccessed(Argument *arg) {
 /// example, all callers are direct).  If safe to promote some arguments, it
 /// calls the DoPromotion method.
 ///
-static CallGraphNode *
-promoteArguments(CallGraphNode *CGN, CallGraph &CG,
-                 function_ref<AAResults &(Function &F)> AARGetter,
-                 unsigned MaxElements) {
-  Function *F = CGN->getFunction();
-
+static Function *
+promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
+                 unsigned MaxElements,
+                 Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>>
+                     ReplaceCallSite) {
   // Make sure that it is local to this module.
-  if (!F || !F->hasLocalLinkage())
+  if (!F->hasLocalLinkage())
     return nullptr;
 
   // Don't promote arguments for variadic functions. Adding, removing, or
@@ -950,7 +926,52 @@ promoteArguments(CallGraphNode *CGN, CallGraph &CG,
   if (ArgsToPromote.empty() && ByValArgsToTransform.empty())
     return nullptr;
 
-  return doPromotion(F, ArgsToPromote, ByValArgsToTransform, CG);
+  return doPromotion(F, ArgsToPromote, ByValArgsToTransform, ReplaceCallSite);
+}
+
+PreservedAnalyses ArgumentPromotionPass::run(LazyCallGraph::SCC &C,
+                                             CGSCCAnalysisManager &AM,
+                                             LazyCallGraph &CG,
+                                             CGSCCUpdateResult &UR) {
+  bool Changed = false, LocalChange;
+
+  // Iterate until we stop promoting from this SCC.
+  do {
+    LocalChange = false;
+
+    for (LazyCallGraph::Node &N : C) {
+      Function &OldF = N.getFunction();
+
+      FunctionAnalysisManager &FAM =
+          AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
+      // FIXME: This lambda must only be used with this function. We should
+      // skip the lambda and just get the AA results directly.
+      auto AARGetter = [&](Function &F) -> AAResults & {
+        assert(&F == &OldF && "Called with an unexpected function!");
+        return FAM.getResult<AAManager>(F);
+      };
+
+      Function *NewF = promoteArguments(&OldF, AARGetter, 3u, None);
+      if (!NewF)
+        continue;
+      LocalChange = true;
+
+      // Directly substitute the functions in the call graph. Note that this
+      // requires the old function to be completely dead and completely
+      // replaced by the new function. It does no call graph updates, it merely
+      // swaps out the particular function mapped to a particular node in the
+      // graph.
+      C.getOuterRefSCC().replaceNodeFunction(N, *NewF);
+      OldF.eraseFromParent();
+    }
+
+    Changed |= LocalChange;
+  } while (LocalChange);
+
+  if (!Changed)
+    return PreservedAnalyses::all();
+
+  return PreservedAnalyses::none();
 }
 
 namespace {
@@ -1001,16 +1022,7 @@ bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
   // changes.
   CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
 
-  // We compute dedicated AA results for each function in the SCC as needed. We
-  // use a lambda referencing external objects so that they live long enough to
-  // be queried, but we re-use them each time.
-  Optional<BasicAAResult> BAR;
-  Optional<AAResults> AAR;
-  auto AARGetter = [&](Function &F) -> AAResults & {
-    BAR.emplace(createLegacyPMBasicAAResult(*this, F));
-    AAR.emplace(createLegacyPMAAResults(*this, F, *BAR));
-    return *AAR;
-  };
+  LegacyAARGetter AARGetter(*this);
 
   bool Changed = false, LocalChange;
 
@@ -1019,9 +1031,31 @@ bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
     LocalChange = false;
     // Attempt to promote arguments from all functions in this SCC.
     for (CallGraphNode *OldNode : SCC) {
-      if (CallGraphNode *NewNode =
-              promoteArguments(OldNode, CG, AARGetter, MaxElements)) {
+      Function *OldF = OldNode->getFunction();
+      if (!OldF)
+        continue;
+
+      auto ReplaceCallSite = [&](CallSite OldCS, CallSite NewCS) {
+        Function *Caller = OldCS.getInstruction()->getParent()->getParent();
+        CallGraphNode *NewCalleeNode =
+            CG.getOrInsertFunction(NewCS.getCalledFunction());
+        CallGraphNode *CallerNode = CG[Caller];
+        CallerNode->replaceCallEdge(OldCS, NewCS, NewCalleeNode);
+      };
+
+      if (Function *NewF = promoteArguments(OldF, AARGetter, MaxElements,
+                                            {ReplaceCallSite})) {
         LocalChange = true;
+
+        // Update the call graph for the newly promoted function.
+        CallGraphNode *NewNode = CG.getOrInsertFunction(NewF);
+        NewNode->stealCalledFunctionsFrom(OldNode);
+        if (OldNode->getNumReferences() == 0)
+          delete CG.removeFunctionFromModule(OldNode);
+        else
+          OldF->setLinkage(Function::ExternalLinkage);
+
+        // And update the SCC we're iterating as well.
         SCC.ReplaceNode(OldNode, NewNode);
       }
     }
diff --git a/lib/Transforms/IPO/ConstantMerge.cpp b/lib/Transforms/IPO/ConstantMerge.cpp
index d75ed206ad23c69cab10326bd3c03bbf92723f86..62b5a9c9ba26614c0c5d25acb7bc19dccac84bdd 100644
--- a/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/lib/Transforms/IPO/ConstantMerge.cpp
@@ -60,6 +60,23 @@ static bool IsBetterCanonical(const GlobalVariable &A,
   return A.hasGlobalUnnamedAddr();
 }
 
+static bool hasMetadataOtherThanDebugLoc(const GlobalVariable *GV) {
+  SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+  GV->getAllMetadata(MDs);
+  for (const auto &V : MDs)
+    if (V.first != LLVMContext::MD_dbg)
+      return true;
+  return false;
+}
+
+static void copyDebugLocMetadata(const GlobalVariable *From,
+                                 GlobalVariable *To) {
+  SmallVector<DIGlobalVariableExpression *, 1> MDs;
+  From->getDebugInfo(MDs);
+  for (auto MD : MDs)
+    To->addDebugInfo(MD);
+}
+
 static unsigned getAlignment(GlobalVariable *GV) {
   unsigned Align = GV->getAlignment();
   if (Align)
@@ -113,6 +130,10 @@ static bool mergeConstants(Module &M) {
       if (GV->isWeakForLinker())
         continue;
 
+      // Don't touch globals with metadata other than !dbg.
+      if (hasMetadataOtherThanDebugLoc(GV))
+        continue;
+
       Constant *Init = GV->getInitializer();
 
       // Check to see if the initializer is already known.
@@ -155,6 +176,9 @@ static bool mergeConstants(Module &M) {
       if (!Slot->hasGlobalUnnamedAddr() && !GV->hasGlobalUnnamedAddr())
         continue;
 
+      if (hasMetadataOtherThanDebugLoc(GV))
+        continue;
+
       if (!GV->hasGlobalUnnamedAddr())
         Slot->setUnnamedAddr(GlobalValue::UnnamedAddr::None);
 
@@ -178,6 +202,8 @@ static bool mergeConstants(Module &M) {
                      getAlignment(Replacements[i].second)));
       }
 
+      copyDebugLocMetadata(Replacements[i].first, Replacements[i].second);
+
       // Eliminate any uses of the dead global.
       Replacements[i].first->replaceAllUsesWith(Replacements[i].second);
 
diff --git a/lib/Transforms/IPO/CrossDSOCFI.cpp b/lib/Transforms/IPO/CrossDSOCFI.cpp
index ba2e60dee3bcbe5b39e7b205b6e0f9680abf0859..1b111de061576e4afefcd2a4c1fee9f7237bfde1 100644
--- a/lib/Transforms/IPO/CrossDSOCFI.cpp
+++ b/lib/Transforms/IPO/CrossDSOCFI.cpp
@@ -98,8 +98,11 @@ void CrossDSOCFI::buildCFICheck(Module &M) {
   LLVMContext &Ctx = M.getContext();
   Constant *C = M.getOrInsertFunction(
       "__cfi_check", Type::getVoidTy(Ctx), Type::getInt64Ty(Ctx),
-      Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx), nullptr);
+      Type::getInt8PtrTy(Ctx), Type::getInt8PtrTy(Ctx));
   Function *F = dyn_cast<Function>(C);
+  // Take over the existing function. The frontend emits a weak stub so that the
+  // linker knows about the symbol; this pass replaces the function body.
+  F->deleteBody();
   F->setAlignment(4096);
   auto args = F->arg_begin();
   Value &CallSiteTypeId = *(args++);
@@ -117,7 +120,7 @@ void CrossDSOCFI::buildCFICheck(Module &M) {
   IRBuilder<> IRBFail(TrapBB);
   Constant *CFICheckFailFn = M.getOrInsertFunction(
       "__cfi_check_fail", Type::getVoidTy(Ctx), Type::getInt8PtrTy(Ctx),
-      Type::getInt8PtrTy(Ctx), nullptr);
+      Type::getInt8PtrTy(Ctx));
   IRBFail.CreateCall(CFICheckFailFn, {&CFICheckFailData, &Addr});
   IRBFail.CreateBr(ExitBB);
 
diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 1a5ed46922118b597ccad892d697ef48d1326002..66eb33f246ac018dc603668f77c08f0cdc723573 100644
--- a/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -166,15 +166,16 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
     Args.assign(CS.arg_begin(), CS.arg_begin() + NumArgs);
 
     // Drop any attributes that were on the vararg arguments.
-    AttributeSet PAL = CS.getAttributes();
+    AttributeList PAL = CS.getAttributes();
     if (!PAL.isEmpty() && PAL.getSlotIndex(PAL.getNumSlots() - 1) > NumArgs) {
-      SmallVector<AttributeSet, 8> AttributesVec;
+      SmallVector<AttributeList, 8> AttributesVec;
       for (unsigned i = 0; PAL.getSlotIndex(i) <= NumArgs; ++i)
         AttributesVec.push_back(PAL.getSlotAttributes(i));
-      if (PAL.hasAttributes(AttributeSet::FunctionIndex))
-        AttributesVec.push_back(AttributeSet::get(Fn.getContext(),
-                                                  PAL.getFnAttributes()));
-      PAL = AttributeSet::get(Fn.getContext(), AttributesVec);
+      if (PAL.hasAttributes(AttributeList::FunctionIndex))
+        AttributesVec.push_back(AttributeList::get(Fn.getContext(),
+                                                   AttributeList::FunctionIndex,
+                                                   PAL.getFnAttributes()));
+      PAL = AttributeList::get(Fn.getContext(), AttributesVec);
     }
 
     SmallVector<OperandBundleDef, 1> OpBundles;
@@ -194,6 +195,9 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
           cast<CallInst>(Call)->getTailCallKind());
     }
     New->setDebugLoc(Call->getDebugLoc());
+    uint64_t W;
+    if (Call->extractProfTotalWeight(W))
+      New->setProfWeight(W);
 
     Args.clear();
 
@@ -682,7 +686,11 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
 
   // Set up to build a new list of parameter attributes.
   SmallVector<AttributeSet, 8> AttributesVec;
-  const AttributeSet &PAL = F->getAttributes();
+  const AttributeList &PAL = F->getAttributes();
+
+  // Reserve an empty slot for the return value attributes, which we will
+  // compute last.
+  AttributesVec.push_back(AttributeSet());
 
   // Remember which arguments are still alive.
   SmallVector<bool, 10> ArgAlive(FTy->getNumParams(), false);
@@ -696,16 +704,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
     if (LiveValues.erase(Arg)) {
       Params.push_back(I->getType());
       ArgAlive[i] = true;
-
-      // Get the original parameter attributes (skipping the first one, that is
-      // for the return value.
-      if (PAL.hasAttributes(i + 1)) {
-        AttrBuilder B(PAL, i + 1);
-        if (B.contains(Attribute::Returned))
-          HasLiveReturnedArg = true;
-        AttributesVec.
-          push_back(AttributeSet::get(F->getContext(), Params.size(), B));
-      }
+      AttributesVec.push_back(PAL.getParamAttributes(i + 1));
+      HasLiveReturnedArg |= PAL.hasAttribute(i + 1, Attribute::Returned);
     } else {
       ++NumArgumentsEliminated;
       DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing argument " << i
@@ -779,30 +779,26 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
   assert(NRetTy && "No new return type found?");
 
   // The existing function return attributes.
-  AttributeSet RAttrs = PAL.getRetAttributes();
+  AttrBuilder RAttrs(PAL.getRetAttributes());
 
   // Remove any incompatible attributes, but only if we removed all return
   // values. Otherwise, ensure that we don't have any conflicting attributes
   // here. Currently, this should not be possible, but special handling might be
   // required when new return value attributes are added.
   if (NRetTy->isVoidTy())
-    RAttrs = RAttrs.removeAttributes(NRetTy->getContext(),
-                                     AttributeSet::ReturnIndex,
-                                     AttributeFuncs::typeIncompatible(NRetTy));
+    RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy));
   else
-    assert(!AttrBuilder(RAttrs, AttributeSet::ReturnIndex).
-             overlaps(AttributeFuncs::typeIncompatible(NRetTy)) &&
+    assert(!RAttrs.overlaps(AttributeFuncs::typeIncompatible(NRetTy)) &&
            "Return attributes no longer compatible?");
 
-  if (RAttrs.hasAttributes(AttributeSet::ReturnIndex))
-    AttributesVec.push_back(AttributeSet::get(NRetTy->getContext(), RAttrs));
+  AttributesVec[0] = AttributeSet::get(F->getContext(), RAttrs);
 
-  if (PAL.hasAttributes(AttributeSet::FunctionIndex))
-    AttributesVec.push_back(AttributeSet::get(F->getContext(),
-                                              PAL.getFnAttributes()));
+  // Transfer the function attributes, if any.
+  AttributesVec.push_back(PAL.getFnAttributes());
 
   // Reconstruct the AttributesList based on the vector we constructed.
-  AttributeSet NewPAL = AttributeSet::get(F->getContext(), AttributesVec);
+  assert(AttributesVec.size() == Params.size() + 2);
+  AttributeList NewPAL = AttributeList::get(F->getContext(), AttributesVec);
 
   // Create the new function type based on the recomputed parameters.
   FunctionType *NFTy = FunctionType::get(NRetTy, Params, FTy->isVarArg());
@@ -830,17 +826,13 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
     Instruction *Call = CS.getInstruction();
 
     AttributesVec.clear();
-    const AttributeSet &CallPAL = CS.getAttributes();
+    const AttributeList &CallPAL = CS.getAttributes();
 
-    // The call return attributes.
-    AttributeSet RAttrs = CallPAL.getRetAttributes();
-
-    // Adjust in case the function was changed to return void.
-    RAttrs = RAttrs.removeAttributes(NRetTy->getContext(),
-                                     AttributeSet::ReturnIndex,
-                        AttributeFuncs::typeIncompatible(NF->getReturnType()));
-    if (RAttrs.hasAttributes(AttributeSet::ReturnIndex))
-      AttributesVec.push_back(AttributeSet::get(NF->getContext(), RAttrs));
+    // Adjust the call return attributes in case the function was changed to
+    // return void.
+    AttrBuilder RAttrs(CallPAL.getRetAttributes());
+    RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy));
+    AttributesVec.push_back(AttributeSet::get(F->getContext(), RAttrs));
 
     // Declare these outside of the loops, so we can reuse them for the second
     // loop, which loops the varargs.
@@ -852,36 +844,33 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
       if (ArgAlive[i]) {
         Args.push_back(*I);
         // Get original parameter attributes, but skip return attributes.
-        if (CallPAL.hasAttributes(i + 1)) {
-          AttrBuilder B(CallPAL, i + 1);
+        AttributeSet Attrs = CallPAL.getParamAttributes(i + 1);
+        if (NRetTy != RetTy && Attrs.hasAttribute(Attribute::Returned)) {
           // If the return type has changed, then get rid of 'returned' on the
           // call site. The alternative is to make all 'returned' attributes on
           // call sites keep the return value alive just like 'returned'
-          // attributes on function declaration but it's less clearly a win
-          // and this is not an expected case anyway
-          if (NRetTy != RetTy && B.contains(Attribute::Returned))
-            B.removeAttribute(Attribute::Returned);
-          AttributesVec.
-            push_back(AttributeSet::get(F->getContext(), Args.size(), B));
+          // attributes on function declaration but it's less clearly a win and
+          // this is not an expected case anyway
+          AttributesVec.push_back(AttributeSet::get(
+              F->getContext(),
+              AttrBuilder(Attrs).removeAttribute(Attribute::Returned)));
+        } else {
+          // Otherwise, use the original attributes.
+          AttributesVec.push_back(Attrs);
         }
       }
 
     // Push any varargs arguments on the list. Don't forget their attributes.
     for (CallSite::arg_iterator E = CS.arg_end(); I != E; ++I, ++i) {
       Args.push_back(*I);
-      if (CallPAL.hasAttributes(i + 1)) {
-        AttrBuilder B(CallPAL, i + 1);
-        AttributesVec.
-          push_back(AttributeSet::get(F->getContext(), Args.size(), B));
-      }
+      AttributesVec.push_back(CallPAL.getParamAttributes(i + 1));
     }
 
-    if (CallPAL.hasAttributes(AttributeSet::FunctionIndex))
-      AttributesVec.push_back(AttributeSet::get(Call->getContext(),
-                                                CallPAL.getFnAttributes()));
+    AttributesVec.push_back(CallPAL.getFnAttributes());
 
     // Reconstruct the AttributesList based on the vector we constructed.
-    AttributeSet NewCallPAL = AttributeSet::get(F->getContext(), AttributesVec);
+    AttributeList NewCallPAL =
+        AttributeList::get(F->getContext(), AttributesVec);
 
     SmallVector<OperandBundleDef, 1> OpBundles;
     CS.getOperandBundlesAsDefs(OpBundles);
@@ -900,6 +889,9 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
           cast<CallInst>(Call)->getTailCallKind());
     }
     New->setDebugLoc(Call->getDebugLoc());
+    uint64_t W;
+    if (Call->extractProfTotalWeight(W))
+      New->setProfWeight(W);
 
     Args.clear();
 
diff --git a/lib/Transforms/IPO/FunctionAttrs.cpp b/lib/Transforms/IPO/FunctionAttrs.cpp
index 740e9fb8b90d31fb113411c967aaaf72905ca011..80fea977f4003727441b69746a19ad68551761c9 100644
--- a/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -49,31 +49,35 @@ STATISTIC(NumNoAlias, "Number of function returns marked noalias");
 STATISTIC(NumNonNullReturn, "Number of function returns marked nonnull");
 STATISTIC(NumNoRecurse, "Number of functions marked as norecurse");
 
-namespace {
-typedef SmallSetVector<Function *, 8> SCCNodeSet;
-}
+// FIXME: This is disabled by default to avoid exposing security vulnerabilities
+// in C/C++ code compiled by clang:
+// http://lists.llvm.org/pipermail/cfe-dev/2017-January/052066.html
+static cl::opt<bool> EnableNonnullArgPropagation(
+    "enable-nonnull-arg-prop", cl::Hidden,
+    cl::desc("Try to propagate nonnull argument attributes from callsites to "
+             "caller functions."));
 
 namespace {
-/// The three kinds of memory access relevant to 'readonly' and
-/// 'readnone' attributes.
-enum MemoryAccessKind {
-  MAK_ReadNone = 0,
-  MAK_ReadOnly = 1,
-  MAK_MayWrite = 2
-};
+typedef SmallSetVector<Function *, 8> SCCNodeSet;
 }
 
-static MemoryAccessKind checkFunctionMemoryAccess(Function &F, AAResults &AAR,
+/// Returns the memory access attribute for function F using AAR for AA results,
+/// where SCCNodes is the current SCC.
+///
+/// If ThisBody is true, this function may examine the function body and will
+/// return a result pertaining to this copy of the function. If it is false, the
+/// result will be based only on AA results for the function declaration; it
+/// will be assumed that some other (perhaps less optimized) version of the
+/// function may be selected at link time.
+static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
+                                                  AAResults &AAR,
                                                   const SCCNodeSet &SCCNodes) {
   FunctionModRefBehavior MRB = AAR.getModRefBehavior(&F);
   if (MRB == FMRB_DoesNotAccessMemory)
     // Already perfect!
     return MAK_ReadNone;
 
-  // Non-exact function definitions may not be selected at link time, and an
-  // alternative version that writes to memory may be selected.  See the comment
-  // on GlobalValue::isDefinitionExact for more details.
-  if (!F.hasExactDefinition()) {
+  if (!ThisBody) {
     if (AliasAnalysis::onlyReadsMemory(MRB))
       return MAK_ReadOnly;
 
@@ -172,9 +176,14 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, AAResults &AAR,
   return ReadsMemory ? MAK_ReadOnly : MAK_ReadNone;
 }
 
+MemoryAccessKind llvm::computeFunctionBodyMemoryAccess(Function &F,
+                                                       AAResults &AAR) {
+  return checkFunctionMemoryAccess(F, /*ThisBody=*/true, AAR, {});
+}
+
 /// Deduce readonly/readnone attributes for the SCC.
 template <typename AARGetterT>
-static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT AARGetter) {
+static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT &&AARGetter) {
   // Check if any of the functions in the SCC read or write memory.  If they
   // write memory then they can't be marked readnone or readonly.
   bool ReadsMemory = false;
@@ -182,7 +191,11 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT AARGetter) {
     // Call the callable parameter to look up AA results for this function.
     AAResults &AAR = AARGetter(*F);
 
-    switch (checkFunctionMemoryAccess(*F, AAR, SCCNodes)) {
+    // Non-exact function definitions may not be selected at link time, and an
+    // alternative version that writes to memory may be selected.  See the
+    // comment on GlobalValue::isDefinitionExact for more details.
+    switch (checkFunctionMemoryAccess(*F, F->hasExactDefinition(),
+                                      AAR, SCCNodes)) {
     case MAK_MayWrite:
       return false;
     case MAK_ReadOnly:
@@ -212,11 +225,11 @@ static bool addReadAttrs(const SCCNodeSet &SCCNodes, AARGetterT AARGetter) {
     AttrBuilder B;
     B.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone);
     F->removeAttributes(
-        AttributeSet::FunctionIndex,
-        AttributeSet::get(F->getContext(), AttributeSet::FunctionIndex, B));
+        AttributeList::FunctionIndex,
+        AttributeList::get(F->getContext(), AttributeList::FunctionIndex, B));
 
     // Add in the new attribute.
-    F->addAttribute(AttributeSet::FunctionIndex,
+    F->addAttribute(AttributeList::FunctionIndex,
                     ReadsMemory ? Attribute::ReadOnly : Attribute::ReadNone);
 
     if (ReadsMemory)
@@ -522,7 +535,7 @@ static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) {
 
     if (Value *RetArg = FindRetArg()) {
       auto *A = cast<Argument>(RetArg);
-      A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo() + 1, B));
+      A->addAttr(AttributeList::get(F->getContext(), A->getArgNo() + 1, B));
       ++NumReturned;
       Changed = true;
     }
@@ -531,6 +544,49 @@ static bool addArgumentReturnedAttrs(const SCCNodeSet &SCCNodes) {
   return Changed;
 }
 
+/// If a callsite has arguments that are also arguments to the parent function,
+/// try to propagate attributes from the callsite's arguments to the parent's
+/// arguments. This may be important because inlining can cause information loss
+/// when attribute knowledge disappears with the inlined call.
+static bool addArgumentAttrsFromCallsites(Function &F) {
+  if (!EnableNonnullArgPropagation)
+    return false;
+
+  bool Changed = false;
+
+  // For an argument attribute to transfer from a callsite to the parent, the
+  // call must be guaranteed to execute every time the parent is called.
+  // Conservatively, just check for calls in the entry block that are guaranteed
+  // to execute.
+  // TODO: This could be enhanced by testing if the callsite post-dominates the
+  // entry block or by doing simple forward walks or backward walks to the
+  // callsite.
+  BasicBlock &Entry = F.getEntryBlock();
+  for (Instruction &I : Entry) {
+    if (auto CS = CallSite(&I)) {
+      if (auto *CalledFunc = CS.getCalledFunction()) {
+        for (auto &CSArg : CalledFunc->args()) {
+          if (!CSArg.hasNonNullAttr())
+            continue;
+
+          // If the non-null callsite argument operand is an argument to 'F'
+          // (the caller) and the call is guaranteed to execute, then the value
+          // must be non-null throughout 'F'.
+          auto *FArg = dyn_cast<Argument>(CS.getArgOperand(CSArg.getArgNo()));
+          if (FArg && !FArg->hasNonNullAttr()) {
+            FArg->addAttr(Attribute::NonNull);
+            Changed = true;
+          }
+        }
+      }
+    }
+    if (!isGuaranteedToTransferExecutionToSuccessor(&I))
+      break;
+  }
+  
+  return Changed;
+}
+
 /// Deduce nocapture attributes for the SCC.
 static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
   bool Changed = false;
@@ -549,6 +605,8 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
     if (!F->hasExactDefinition())
       continue;
 
+    Changed |= addArgumentAttrsFromCallsites(*F);
+
     // Functions that are readonly (or readnone) and nounwind and don't return
     // a value can't capture arguments. Don't analyze them.
     if (F->onlyReadsMemory() && F->doesNotThrow() &&
@@ -556,7 +614,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
       for (Function::arg_iterator A = F->arg_begin(), E = F->arg_end(); A != E;
            ++A) {
         if (A->getType()->isPointerTy() && !A->hasNoCaptureAttr()) {
-          A->addAttr(AttributeSet::get(F->getContext(), A->getArgNo() + 1, B));
+          A->addAttr(AttributeList::get(F->getContext(), A->getArgNo() + 1, B));
           ++NumNoCapture;
           Changed = true;
         }
@@ -576,7 +634,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
           if (Tracker.Uses.empty()) {
             // If it's trivially not captured, mark it nocapture now.
             A->addAttr(
-                AttributeSet::get(F->getContext(), A->getArgNo() + 1, B));
+                AttributeList::get(F->getContext(), A->getArgNo() + 1, B));
             ++NumNoCapture;
             Changed = true;
           } else {
@@ -604,7 +662,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
         if (R != Attribute::None) {
           AttrBuilder B;
           B.addAttribute(R);
-          A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
+          A->addAttr(AttributeList::get(A->getContext(), A->getArgNo() + 1, B));
           Changed = true;
           R == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg;
         }
@@ -629,7 +687,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
       if (ArgumentSCC[0]->Uses.size() == 1 &&
           ArgumentSCC[0]->Uses[0] == ArgumentSCC[0]) {
         Argument *A = ArgumentSCC[0]->Definition;
-        A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
+        A->addAttr(AttributeList::get(A->getContext(), A->getArgNo() + 1, B));
         ++NumNoCapture;
         Changed = true;
       }
@@ -671,7 +729,7 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
 
     for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
       Argument *A = ArgumentSCC[i]->Definition;
-      A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
+      A->addAttr(AttributeList::get(A->getContext(), A->getArgNo() + 1, B));
       ++NumNoCapture;
       Changed = true;
     }
@@ -708,8 +766,9 @@ static bool addArgumentAttrs(const SCCNodeSet &SCCNodes) {
       for (unsigned i = 0, e = ArgumentSCC.size(); i != e; ++i) {
         Argument *A = ArgumentSCC[i]->Definition;
         // Clear out existing readonly/readnone attributes
-        A->removeAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, R));
-        A->addAttr(AttributeSet::get(A->getContext(), A->getArgNo() + 1, B));
+        A->removeAttr(
+            AttributeList::get(A->getContext(), A->getArgNo() + 1, R));
+        A->addAttr(AttributeList::get(A->getContext(), A->getArgNo() + 1, B));
         ReadAttr == Attribute::ReadOnly ? ++NumReadOnlyArg : ++NumReadNoneArg;
         Changed = true;
       }
@@ -905,7 +964,7 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
   // pointers.
   for (Function *F : SCCNodes) {
     // Already nonnull.
-    if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+    if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
                                         Attribute::NonNull))
       continue;
 
@@ -926,7 +985,7 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
         // Mark the function eagerly since we may discover a function
         // which prevents us from speculating about the entire SCC
         DEBUG(dbgs() << "Eagerly marking " << F->getName() << " as nonnull\n");
-        F->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+        F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
         ++NumNonNullReturn;
         MadeChange = true;
       }
@@ -939,13 +998,13 @@ static bool addNonNullAttrs(const SCCNodeSet &SCCNodes) {
 
   if (SCCReturnsNonNull) {
     for (Function *F : SCCNodes) {
-      if (F->getAttributes().hasAttribute(AttributeSet::ReturnIndex,
+      if (F->getAttributes().hasAttribute(AttributeList::ReturnIndex,
                                           Attribute::NonNull) ||
           !F->getReturnType()->isPointerTy())
         continue;
 
       DEBUG(dbgs() << "SCC marking " << F->getName() << " as nonnull\n");
-      F->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+      F->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
       ++NumNonNullReturn;
       MadeChange = true;
     }
@@ -1163,19 +1222,7 @@ static bool runImpl(CallGraphSCC &SCC, AARGetterT AARGetter) {
 bool PostOrderFunctionAttrsLegacyPass::runOnSCC(CallGraphSCC &SCC) {
   if (skipSCC(SCC))
     return false;
-
-  // We compute dedicated AA results for each function in the SCC as needed. We
-  // use a lambda referencing external objects so that they live long enough to
-  // be queried, but we re-use them each time.
-  Optional<BasicAAResult> BAR;
-  Optional<AAResults> AAR;
-  auto AARGetter = [&](Function &F) -> AAResults & {
-    BAR.emplace(createLegacyPMBasicAAResult(*this, F));
-    AAR.emplace(createLegacyPMAAResults(*this, F, *BAR));
-    return *AAR;
-  };
-
-  return runImpl(SCC, AARGetter);
+  return runImpl(SCC, LegacyAARGetter(*this));
 }
 
 namespace {
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index b8fc79a03b6d1d9c863a8baea326d4f13dd810e8..d1bf6e3553ace07d2869a7e3aee07ab282a0ecc8 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -75,12 +75,6 @@ static cl::opt<bool> PrintImports("print-imports", cl::init(false), cl::Hidden,
 static cl::opt<bool> ComputeDead("compute-dead", cl::init(true), cl::Hidden,
                                  cl::desc("Compute dead symbols"));
 
-// Temporary allows the function import pass to disable always linking
-// referenced discardable symbols.
-static cl::opt<bool>
-    DontForceImportReferencedDiscardableSymbols("disable-force-link-odr",
-                                                cl::init(false), cl::Hidden);
-
 static cl::opt<bool> EnableImportMetadata(
     "enable-import-metadata", cl::init(
 #if !defined(NDEBUG)
@@ -203,6 +197,15 @@ static void computeImportForFunction(
     auto GUID = Edge.first.getGUID();
     DEBUG(dbgs() << " edge -> " << GUID << " Threshold:" << Threshold << "\n");
 
+    if (Index.findGlobalValueSummaryList(GUID) == Index.end()) {
+      // For SamplePGO, the indirect call targets for local functions will
+      // have their original names annotated in the profile. We try to find
+      // the corresponding PGOFuncName as the GUID.
+      GUID = Index.getGUIDFromOriginalID(GUID);
+      if (GUID == 0)
+        continue;
+    }
+
     if (DefinedGVSummaries.count(GUID)) {
       DEBUG(dbgs() << "ignored! Target already in destination module.\n");
       continue;
@@ -604,7 +607,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
   // the current module.
   StringSet<> AsmUndefinedRefs;
   ModuleSymbolTable::CollectAsmSymbols(
-      Triple(TheModule.getTargetTriple()), TheModule.getModuleInlineAsm(),
+      TheModule,
       [&AsmUndefinedRefs](StringRef Name, object::BasicSymbolRef::Flags Flags) {
         if (Flags & object::BasicSymbolRef::SF_Undefined)
           AsmUndefinedRefs.insert(Name);
@@ -659,14 +662,12 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
 // index.
 //
 Expected<bool> FunctionImporter::importFunctions(
-    Module &DestModule, const FunctionImporter::ImportMapTy &ImportList,
-    bool ForceImportReferencedDiscardableSymbols) {
+    Module &DestModule, const FunctionImporter::ImportMapTy &ImportList) {
   DEBUG(dbgs() << "Starting import for Module "
                << DestModule.getModuleIdentifier() << "\n");
   unsigned ImportedCount = 0;
 
-  // Linker that will be used for importing function
-  Linker TheLinker(DestModule);
+  IRMover Mover(DestModule);
   // Do the actual import of functions now, one Module at a time
   std::set<StringRef> ModuleNameOrderedList;
   for (auto &FunctionsToImportPerModule : ImportList) {
@@ -690,7 +691,7 @@ Expected<bool> FunctionImporter::importFunctions(
 
     auto &ImportGUIDs = FunctionsToImportPerModule->second;
     // Find the globals to import
-    DenseSet<const GlobalValue *> GlobalsToImport;
+    SetVector<GlobalValue *> GlobalsToImport;
     for (Function &F : *SrcModule) {
       if (!F.hasName())
         continue;
@@ -729,6 +730,13 @@ Expected<bool> FunctionImporter::importFunctions(
       }
     }
     for (GlobalAlias &GA : SrcModule->aliases()) {
+      // FIXME: This should eventually be controlled entirely by the summary.
+      if (FunctionImportGlobalProcessing::doImportAsDefinition(
+              &GA, &GlobalsToImport)) {
+        GlobalsToImport.insert(&GA);
+        continue;
+      }
+
       if (!GA.hasName())
         continue;
       auto GUID = GA.getGUID();
@@ -773,12 +781,9 @@ Expected<bool> FunctionImporter::importFunctions(
                << " from " << SrcModule->getSourceFileName() << "\n";
     }
 
-    // Instruct the linker that the client will take care of linkonce resolution
-    unsigned Flags = Linker::Flags::None;
-    if (!ForceImportReferencedDiscardableSymbols)
-      Flags |= Linker::Flags::DontForceLinkLinkonceODR;
-
-    if (TheLinker.linkInModule(std::move(SrcModule), Flags, &GlobalsToImport))
+    if (Mover.move(std::move(SrcModule), GlobalsToImport.getArrayRef(),
+                   [](GlobalValue &, IRMover::ValueAdder) {},
+                   /*IsPerformingImport=*/true))
       report_fatal_error("Function Import: link error");
 
     ImportedCount += GlobalsToImport.size();
@@ -838,8 +843,7 @@ static bool doImportingForModule(Module &M) {
     return loadFile(Identifier, M.getContext());
   };
   FunctionImporter Importer(*Index, ModuleLoader);
-  Expected<bool> Result = Importer.importFunctions(
-      M, ImportList, !DontForceImportReferencedDiscardableSymbols);
+  Expected<bool> Result = Importer.importFunctions(M, ImportList);
 
   // FIXME: Probably need to propagate Errors through the pass manager.
   if (!Result) {
diff --git a/lib/Transforms/IPO/GlobalOpt.cpp b/lib/Transforms/IPO/GlobalOpt.cpp
index 484fdbed0cb86551ca20631c26a18951e0810baf..ade4f21ceb5240171c1b22cec766bfce5f73b34c 100644
--- a/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/lib/Transforms/IPO/GlobalOpt.cpp
@@ -1819,12 +1819,14 @@ static bool processInternalGlobal(
       GS.AccessingFunction->doesNotRecurse() &&
       isPointerValueDeadOnEntryToFunction(GS.AccessingFunction, GV,
                                           LookupDomTree)) {
+    const DataLayout &DL = GV->getParent()->getDataLayout();
+
     DEBUG(dbgs() << "LOCALIZING GLOBAL: " << *GV << "\n");
     Instruction &FirstI = const_cast<Instruction&>(*GS.AccessingFunction
                                                    ->getEntryBlock().begin());
     Type *ElemTy = GV->getValueType();
     // FIXME: Pass Global's alignment when globals have alignment
-    AllocaInst *Alloca = new AllocaInst(ElemTy, nullptr,
+    AllocaInst *Alloca = new AllocaInst(ElemTy, DL.getAllocaAddrSpace(), nullptr,
                                         GV->getName(), &FirstI);
     if (!isa<UndefValue>(GV->getInitializer()))
       new StoreInst(GV->getInitializer(), Alloca, &FirstI);
@@ -1977,7 +1979,7 @@ static void ChangeCalleesToFastCall(Function *F) {
   }
 }
 
-static AttributeSet StripNest(LLVMContext &C, const AttributeSet &Attrs) {
+static AttributeList StripNest(LLVMContext &C, const AttributeList &Attrs) {
   for (unsigned i = 0, e = Attrs.getNumSlots(); i != e; ++i) {
     unsigned Index = Attrs.getSlotIndex(i);
     if (!Attrs.getSlotAttributes(i).hasAttribute(Index, Attribute::Nest))
diff --git a/lib/Transforms/IPO/GlobalSplit.cpp b/lib/Transforms/IPO/GlobalSplit.cpp
index bbbd096e89c0ff38a3c33f4e6a42c07b5e54042b..4705ebe265ae120289e0097a29a55d69234a9a28 100644
--- a/lib/Transforms/IPO/GlobalSplit.cpp
+++ b/lib/Transforms/IPO/GlobalSplit.cpp
@@ -85,7 +85,16 @@ bool splitGlobal(GlobalVariable &GV) {
       uint64_t ByteOffset = cast<ConstantInt>(
               cast<ConstantAsMetadata>(Type->getOperand(0))->getValue())
               ->getZExtValue();
-      if (ByteOffset < SplitBegin || ByteOffset >= SplitEnd)
+      // Type metadata may be attached one byte after the end of the vtable, for
+      // classes without virtual methods in Itanium ABI. AFAIK, it is never
+      // attached to the first byte of a vtable. Subtract one to get the right
+      // slice.
+      // This is making an assumption that vtable groups are the only kinds of
+      // global variables that !type metadata can be attached to, and that they
+      // are either Itanium ABI vtable groups or contain a single vtable (i.e.
+      // Microsoft ABI vtables).
+      uint64_t AttachedTo = (ByteOffset == 0) ? ByteOffset : ByteOffset - 1;
+      if (AttachedTo < SplitBegin || AttachedTo >= SplitEnd)
         continue;
       SplitGV->addMetadata(
           LLVMContext::MD_type,
diff --git a/lib/Transforms/IPO/IPConstantPropagation.cpp b/lib/Transforms/IPO/IPConstantPropagation.cpp
index 916135e33cd50544c6e8c162e100ef2cfde66a14..349807496dc2c0dc0f8cd188a76aa61275d64dcc 100644
--- a/lib/Transforms/IPO/IPConstantPropagation.cpp
+++ b/lib/Transforms/IPO/IPConstantPropagation.cpp
@@ -136,7 +136,13 @@ static bool PropagateConstantReturn(Function &F) {
   // For more details, see GlobalValue::mayBeDerefined.
   if (!F.isDefinitionExact())
     return false;
-    
+
+  // Don't touch naked functions. They may contain asm returning a
+  // value we don't see, so we may end up interprocedurally propagating
+  // the return value incorrectly.
+  if (F.hasFnAttribute(Attribute::Naked))
+    return false;
+
   // Check to see if this function returns a constant.
   SmallVector<Value *,4> RetVals;
   StructType *STy = dyn_cast<StructType>(F.getReturnType());
diff --git a/lib/Transforms/IPO/InlineSimple.cpp b/lib/Transforms/IPO/InlineSimple.cpp
index 514baa810903a4a9993ee5f102d488e40bd4fd80..50e7cc89a3b32f2608418b72019db810dfebdf25 100644
--- a/lib/Transforms/IPO/InlineSimple.cpp
+++ b/lib/Transforms/IPO/InlineSimple.cpp
@@ -93,8 +93,12 @@ Pass *llvm::createFunctionInliningPass(int Threshold) {
 }
 
 Pass *llvm::createFunctionInliningPass(unsigned OptLevel,
-                                       unsigned SizeOptLevel) {
-  return new SimpleInliner(llvm::getInlineParams(OptLevel, SizeOptLevel));
+                                       unsigned SizeOptLevel,
+                                       bool DisableInlineHotCallSite) {
+  auto Param = llvm::getInlineParams(OptLevel, SizeOptLevel);
+  if (DisableInlineHotCallSite)
+    Param.HotCallSiteThreshold = 0;
+  return new SimpleInliner(Param);
 }
 
 Pass *llvm::createFunctionInliningPass(InlineParams &Params) {
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index c7c7af7acfaee66e60baf8aee0b954b6612f0db1..6c83c99ae3be5ebb83a0c3fa10a7eaee5fbade61 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -289,7 +289,7 @@ shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC,
   // treating them as truly abstract units etc.
   TotalSecondaryCost = 0;
   // The candidate cost to be imposed upon the current function.
-  int CandidateCost = IC.getCost() - (InlineConstants::CallPenalty + 1);
+  int CandidateCost = IC.getCost() - 1;
   // This bool tracks what happens if we do NOT inline C into B.
   bool callerWillBeRemoved = Caller->hasLocalLinkage();
   // This bool tracks what happens if we DO inline C into B.
@@ -326,7 +326,7 @@ shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC,
   // one is set very low by getInlineCost, in anticipation that Caller will
   // be removed entirely.  We did not account for this above unless there
   // is only one caller of Caller.
-  if (callerWillBeRemoved && !Caller->use_empty())
+  if (callerWillBeRemoved && !Caller->hasOneUse())
     TotalSecondaryCost -= InlineConstants::LastCallToStaticBonus;
 
   if (inliningPreventsSomeOuterInline && TotalSecondaryCost < IC.getCost())
@@ -636,22 +636,12 @@ bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) {
   ACT = &getAnalysis<AssumptionCacheTracker>();
   PSI = getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
   auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
-  // We compute dedicated AA results for each function in the SCC as needed. We
-  // use a lambda referencing external objects so that they live long enough to
-  // be queried, but we re-use them each time.
-  Optional<BasicAAResult> BAR;
-  Optional<AAResults> AAR;
-  auto AARGetter = [&](Function &F) -> AAResults & {
-    BAR.emplace(createLegacyPMBasicAAResult(*this, F));
-    AAR.emplace(createLegacyPMAAResults(*this, F, *BAR));
-    return *AAR;
-  };
   auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
     return ACT->getAssumptionCache(F);
   };
   return inlineCallsImpl(SCC, CG, GetAssumptionCache, PSI, TLI, InsertLifetime,
                          [this](CallSite CS) { return getInlineCost(CS); },
-                         AARGetter, ImportedFunctionsStats);
+                         LegacyAARGetter(*this), ImportedFunctionsStats);
 }
 
 /// Remove now-dead linkonce functions at the end of
@@ -756,20 +746,52 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
   Module &M = *InitialC.begin()->getFunction().getParent();
   ProfileSummaryInfo *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(M);
 
-  // We use a worklist of nodes to process so that we can handle if the SCC
-  // structure changes and some nodes are no longer part of the current SCC. We
-  // also need to use an updatable pointer for the SCC as a consequence.
-  SmallVector<LazyCallGraph::Node *, 16> Nodes;
-  for (auto &N : InitialC)
-    Nodes.push_back(&N);
+  // We use a single common worklist for calls across the entire SCC. We
+  // process these in-order and append new calls introduced during inlining to
+  // the end.
+  //
+  // Note that this particular order of processing is actually critical to
+  // avoid very bad behaviors. Consider *highly connected* call graphs where
+  // each function contains a small amount of code and a couple of calls to
+  // other functions. Because the LLVM inliner is fundamentally a bottom-up
+  // inliner, it can handle gracefully the fact that these all appear to be
+  // reasonable inlining candidates as it will flatten things until they become
+  // too big to inline, and then move on and flatten another batch.
+  //
+  // However, when processing call edges *within* an SCC we cannot rely on this
+  // bottom-up behavior. As a consequence, with heavily connected *SCCs* of
+  // functions we can end up incrementally inlining N calls into each of
+  // N functions because each incremental inlining decision looks good and we
+  // don't have a topological ordering to prevent explosions.
+  //
+  // To compensate for this, we don't process transitive edges made immediate
+  // by inlining until we've done one pass of inlining across the entire SCC.
+  // Large, highly connected SCCs still lead to some amount of code bloat in
+  // this model, but it is uniformly spread across all the functions in the SCC
+  // and eventually they all become too large to inline, rather than
+  // incrementally making a single function grow in a super linear fashion.
+  SmallVector<std::pair<CallSite, int>, 16> Calls;
+
+  // Populate the initial list of calls in this SCC.
+  for (auto &N : InitialC) {
+    // We want to generally process call sites top-down in order for
+    // simplifications stemming from replacing the call with the returned value
+    // after inlining to be visible to subsequent inlining decisions.
+    // FIXME: Using instructions sequence is a really bad way to do this.
+    // Instead we should do an actual RPO walk of the function body.
+    for (Instruction &I : instructions(N.getFunction()))
+      if (auto CS = CallSite(&I))
+        if (Function *Callee = CS.getCalledFunction())
+          if (!Callee->isDeclaration())
+            Calls.push_back({CS, -1});
+  }
+  if (Calls.empty())
+    return PreservedAnalyses::all();
+
+  // Capture updatable variables for the current SCC and RefSCC.
   auto *C = &InitialC;
   auto *RC = &C->getOuterRefSCC();
 
-  // We also use a secondary worklist of call sites within a particular node to
-  // allow quickly continuing to inline through newly inlined call sites where
-  // possible.
-  SmallVector<std::pair<CallSite, int>, 16> Calls;
-
   // When inlining a callee produces new call sites, we want to keep track of
   // the fact that they were inlined from the callee.  This allows us to avoid
   // infinite inlining in some obscure cases.  To represent this, we use an
@@ -785,11 +807,17 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
   // defer deleting these to make it easier to handle the call graph updates.
   SmallVector<Function *, 4> DeadFunctions;
 
-  do {
-    auto &N = *Nodes.pop_back_val();
+  // Loop forward over all of the calls. Note that we cannot cache the size as
+  // inlining can introduce new calls that need to be processed.
+  for (int i = 0; i < (int)Calls.size(); ++i) {
+    // We expect the calls to typically be batched with sequences of calls that
+    // have the same caller, so we first set up some shared infrastructure for
+    // this caller. We also do any pruning we can at this layer on the caller
+    // alone.
+    Function &F = *Calls[i].first.getCaller();
+    LazyCallGraph::Node &N = *CG.lookup(F);
     if (CG.lookupSCC(N) != C)
       continue;
-    Function &F = N.getFunction();
     if (F.hasFnAttribute(Attribute::OptimizeNone))
       continue;
 
@@ -823,23 +851,14 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
     // Get the remarks emission analysis for the caller.
     auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
 
-    // We want to generally process call sites top-down in order for
-    // simplifications stemming from replacing the call with the returned value
-    // after inlining to be visible to subsequent inlining decisions. So we
-    // walk the function backwards and then process the back of the vector.
-    // FIXME: Using reverse is a really bad way to do this. Instead we should
-    // do an actual PO walk of the function body.
-    for (Instruction &I : reverse(instructions(F)))
-      if (auto CS = CallSite(&I))
-        if (Function *Callee = CS.getCalledFunction())
-          if (!Callee->isDeclaration())
-            Calls.push_back({CS, -1});
-
+    // Now process as many calls as we have within this caller in the sequence.
+    // We bail out as soon as the caller has to change so we can update the
+    // call graph and prepare the context of that new caller.
     bool DidInline = false;
-    while (!Calls.empty()) {
+    for (; i < (int)Calls.size() && Calls[i].first.getCaller() == &F; ++i) {
       int InlineHistoryID;
       CallSite CS;
-      std::tie(CS, InlineHistoryID) = Calls.pop_back_val();
+      std::tie(CS, InlineHistoryID) = Calls[i];
       Function &Callee = *CS.getCalledFunction();
 
       if (InlineHistoryID != -1 &&
@@ -884,6 +903,12 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
         // made dead by this operation on other functions).
         Callee.removeDeadConstantUsers();
         if (Callee.use_empty()) {
+          Calls.erase(
+              std::remove_if(Calls.begin() + i + 1, Calls.end(),
+                             [&Callee](const std::pair<CallSite, int> &Call) {
+                               return Call.first.getCaller() == &Callee;
+                             }),
+              Calls.end());
           // Clear the body and queue the function itself for deletion when we
           // finish inlining and call graph updates.
           // Note that after this point, it is an error to do anything other
@@ -896,6 +921,10 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
       }
     }
 
+    // Back the call index up by one to put us in a good position to go around
+    // the outer loop.
+    --i;
+
     if (!DidInline)
       continue;
     Changed = true;
@@ -910,8 +939,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
     // below.
     for (Function *InlinedCallee : InlinedCallees) {
       LazyCallGraph::Node &CalleeN = *CG.lookup(*InlinedCallee);
-      for (LazyCallGraph::Edge &E : CalleeN)
-        RC->insertTrivialRefEdge(N, *E.getNode());
+      for (LazyCallGraph::Edge &E : *CalleeN)
+        RC->insertTrivialRefEdge(N, E.getNode());
     }
     InlinedCallees.clear();
 
@@ -924,7 +953,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
     C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR);
     DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n");
     RC = &C->getOuterRefSCC();
-  } while (!Nodes.empty());
+  }
 
   // Now that we've finished inlining all of the calls across this SCC, delete
   // all of the trivially dead functions, updating the call graph and the CGSCC
diff --git a/lib/Transforms/IPO/LowerTypeTests.cpp b/lib/Transforms/IPO/LowerTypeTests.cpp
index 120e93ab5b731c6c8d65fd7816287e898f4af0d6..785207efbe5c8e2df3494ddfa75ce170dafa97ba 100644
--- a/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -42,8 +42,6 @@
 using namespace llvm;
 using namespace lowertypetests;
 
-using SummaryAction = LowerTypeTestsSummaryAction;
-
 #define DEBUG_TYPE "lowertypetests"
 
 STATISTIC(ByteArraySizeBits, "Byte array size in bits");
@@ -57,13 +55,13 @@ static cl::opt<bool> AvoidReuse(
     cl::desc("Try to avoid reuse of byte array addresses using aliases"),
     cl::Hidden, cl::init(true));
 
-static cl::opt<SummaryAction> ClSummaryAction(
+static cl::opt<PassSummaryAction> ClSummaryAction(
     "lowertypetests-summary-action",
     cl::desc("What to do with the summary when running this pass"),
-    cl::values(clEnumValN(SummaryAction::None, "none", "Do nothing"),
-               clEnumValN(SummaryAction::Import, "import",
+    cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"),
+               clEnumValN(PassSummaryAction::Import, "import",
                           "Import typeid resolutions from summary and globals"),
-               clEnumValN(SummaryAction::Export, "export",
+               clEnumValN(PassSummaryAction::Export, "export",
                           "Export typeid resolutions to summary and globals")),
     cl::Hidden);
 
@@ -234,8 +232,8 @@ public:
 class LowerTypeTestsModule {
   Module &M;
 
-  SummaryAction Action;
-  ModuleSummaryIndex *Summary;
+  ModuleSummaryIndex *ExportSummary;
+  const ModuleSummaryIndex *ImportSummary;
 
   bool LinkerSubsectionsViaSymbols;
   Triple::ArchType Arch;
@@ -267,7 +265,7 @@ class LowerTypeTestsModule {
   /// regular LTO or the regular LTO phase of ThinLTO), or indirectly using type
   /// identifier summaries and external symbol references (in ThinLTO backends).
   struct TypeIdLowering {
-    TypeTestResolution::Kind TheKind;
+    TypeTestResolution::Kind TheKind = TypeTestResolution::Unsat;
 
     /// All except Unsat: the start address within the combined global.
     Constant *OffsetedGlobal;
@@ -334,8 +332,8 @@ class LowerTypeTestsModule {
   void createJumpTable(Function *F, ArrayRef<GlobalTypeMember *> Functions);
 
 public:
-  LowerTypeTestsModule(Module &M, SummaryAction Action,
-                       ModuleSummaryIndex *Summary);
+  LowerTypeTestsModule(Module &M, ModuleSummaryIndex *ExportSummary,
+                       const ModuleSummaryIndex *ImportSummary);
   bool lower();
 
   // Lower the module using the action and summary passed as command line
@@ -348,15 +346,17 @@ struct LowerTypeTests : public ModulePass {
 
   bool UseCommandLine = false;
 
-  SummaryAction Action;
-  ModuleSummaryIndex *Summary;
+  ModuleSummaryIndex *ExportSummary;
+  const ModuleSummaryIndex *ImportSummary;
 
   LowerTypeTests() : ModulePass(ID), UseCommandLine(true) {
     initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
   }
 
-  LowerTypeTests(SummaryAction Action, ModuleSummaryIndex *Summary)
-      : ModulePass(ID), Action(Action), Summary(Summary) {
+  LowerTypeTests(ModuleSummaryIndex *ExportSummary,
+                 const ModuleSummaryIndex *ImportSummary)
+      : ModulePass(ID), ExportSummary(ExportSummary),
+        ImportSummary(ImportSummary) {
     initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
   }
 
@@ -365,7 +365,7 @@ struct LowerTypeTests : public ModulePass {
       return false;
     if (UseCommandLine)
       return LowerTypeTestsModule::runForTesting(M);
-    return LowerTypeTestsModule(M, Action, Summary).lower();
+    return LowerTypeTestsModule(M, ExportSummary, ImportSummary).lower();
   }
 };
 
@@ -375,9 +375,10 @@ INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false,
                 false)
 char LowerTypeTests::ID = 0;
 
-ModulePass *llvm::createLowerTypeTestsPass(SummaryAction Action,
-                                           ModuleSummaryIndex *Summary) {
-  return new LowerTypeTests(Action, Summary);
+ModulePass *
+llvm::createLowerTypeTestsPass(ModuleSummaryIndex *ExportSummary,
+                               const ModuleSummaryIndex *ImportSummary) {
+  return new LowerTypeTests(ExportSummary, ImportSummary);
 }
 
 /// Build a bit set for TypeId using the object layouts in
@@ -501,8 +502,7 @@ Value *LowerTypeTestsModule::createBitSetTest(IRBuilder<> &B,
     return createMaskedBitTest(B, TIL.InlineBits, BitOffset);
   } else {
     Constant *ByteArray = TIL.TheByteArray;
-    if (!LinkerSubsectionsViaSymbols && AvoidReuse &&
-        Action != SummaryAction::Import) {
+    if (!LinkerSubsectionsViaSymbols && AvoidReuse && !ImportSummary) {
       // Each use of the byte array uses a different alias. This makes the
       // backend less likely to reuse previously computed byte array addresses,
       // improving the security of the CFI mechanism based on this pass.
@@ -702,7 +702,8 @@ void LowerTypeTestsModule::buildBitSetsFromGlobalVariables(
 /// information about the type identifier.
 void LowerTypeTestsModule::exportTypeId(StringRef TypeId,
                                         const TypeIdLowering &TIL) {
-  TypeTestResolution &TTRes = Summary->getTypeIdSummary(TypeId).TTRes;
+  TypeTestResolution &TTRes =
+      ExportSummary->getOrInsertTypeIdSummary(TypeId).TTRes;
   TTRes.TheKind = TIL.TheKind;
 
   auto ExportGlobal = [&](StringRef Name, Constant *C) {
@@ -740,13 +741,15 @@ void LowerTypeTestsModule::exportTypeId(StringRef TypeId,
 
 LowerTypeTestsModule::TypeIdLowering
 LowerTypeTestsModule::importTypeId(StringRef TypeId) {
-  TypeTestResolution &TTRes = Summary->getTypeIdSummary(TypeId).TTRes;
+  const TypeIdSummary *TidSummary = ImportSummary->getTypeIdSummary(TypeId);
+  if (!TidSummary)
+    return {}; // Unsat: no globals match this type id.
+  const TypeTestResolution &TTRes = TidSummary->TTRes;
 
   TypeIdLowering TIL;
   TIL.TheKind = TTRes.TheKind;
 
   auto ImportGlobal = [&](StringRef Name, unsigned AbsWidth) {
-    unsigned PtrWidth = IntPtrTy->getBitWidth();
     Constant *C =
         M.getOrInsertGlobal(("__typeid_" + TypeId + "_" + Name).str(), Int8Ty);
     auto *GV = dyn_cast<GlobalVariable>(C);
@@ -757,13 +760,12 @@ LowerTypeTestsModule::importTypeId(StringRef TypeId) {
 
     GV->setVisibility(GlobalValue::HiddenVisibility);
     auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
-      auto *T = IntegerType::get(M.getContext(), PtrWidth);
-      auto *MinC = ConstantAsMetadata::get(ConstantInt::get(T, Min));
-      auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(T, Max));
+      auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min));
+      auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max));
       GV->setMetadata(LLVMContext::MD_absolute_symbol,
                       MDNode::get(M.getContext(), {MinC, MaxC}));
     };
-    if (AbsWidth == PtrWidth)
+    if (AbsWidth == IntPtrTy->getBitWidth())
       SetAbsRange(~0ull, ~0ull); // Full set.
     else if (AbsWidth)
       SetAbsRange(0, 1ull << AbsWidth);
@@ -1294,9 +1296,11 @@ void LowerTypeTestsModule::buildBitSetsFromDisjointSet(
 }
 
 /// Lower all type tests in this module.
-LowerTypeTestsModule::LowerTypeTestsModule(Module &M, SummaryAction Action,
-                                           ModuleSummaryIndex *Summary)
-    : M(M), Action(Action), Summary(Summary) {
+LowerTypeTestsModule::LowerTypeTestsModule(
+    Module &M, ModuleSummaryIndex *ExportSummary,
+    const ModuleSummaryIndex *ImportSummary)
+    : M(M), ExportSummary(ExportSummary), ImportSummary(ImportSummary) {
+  assert(!(ExportSummary && ImportSummary));
   Triple TargetTriple(M.getTargetTriple());
   LinkerSubsectionsViaSymbols = TargetTriple.isMacOSX();
   Arch = TargetTriple.getArch();
@@ -1320,7 +1324,11 @@ bool LowerTypeTestsModule::runForTesting(Module &M) {
     ExitOnErr(errorCodeToError(In.error()));
   }
 
-  bool Changed = LowerTypeTestsModule(M, ClSummaryAction, &Summary).lower();
+  bool Changed =
+      LowerTypeTestsModule(
+          M, ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr,
+          ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr)
+          .lower();
 
   if (!ClWriteSummary.empty()) {
     ExitOnError ExitOnErr("-lowertypetests-write-summary: " + ClWriteSummary +
@@ -1339,11 +1347,10 @@ bool LowerTypeTestsModule::runForTesting(Module &M) {
 bool LowerTypeTestsModule::lower() {
   Function *TypeTestFunc =
       M.getFunction(Intrinsic::getName(Intrinsic::type_test));
-  if ((!TypeTestFunc || TypeTestFunc->use_empty()) &&
-      Action != SummaryAction::Export)
+  if ((!TypeTestFunc || TypeTestFunc->use_empty()) && !ExportSummary)
     return false;
 
-  if (Action == SummaryAction::Import) {
+  if (ImportSummary) {
     for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
          UI != UE;) {
       auto *CI = cast<CallInst>((*UI++).getUser());
@@ -1424,7 +1431,7 @@ bool LowerTypeTestsModule::lower() {
     }
   }
 
-  if (Action == SummaryAction::Export) {
+  if (ExportSummary) {
     DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID;
     for (auto &P : TypeIdInfo) {
       if (auto *TypeId = dyn_cast<MDString>(P.first))
@@ -1432,7 +1439,7 @@ bool LowerTypeTestsModule::lower() {
             TypeId);
     }
 
-    for (auto &P : *Summary) {
+    for (auto &P : *ExportSummary) {
       for (auto &S : P.second) {
         auto *FS = dyn_cast<FunctionSummary>(S.get());
         if (!FS)
@@ -1503,8 +1510,9 @@ bool LowerTypeTestsModule::lower() {
 
 PreservedAnalyses LowerTypeTestsPass::run(Module &M,
                                           ModuleAnalysisManager &AM) {
-  bool Changed =
-      LowerTypeTestsModule(M, SummaryAction::None, /*Summary=*/nullptr).lower();
+  bool Changed = LowerTypeTestsModule(M, /*ExportSummary=*/nullptr,
+                                      /*ImportSummary=*/nullptr)
+                     .lower();
   if (!Changed)
     return PreservedAnalyses::all();
   return PreservedAnalyses::none();
diff --git a/lib/Transforms/IPO/MergeFunctions.cpp b/lib/Transforms/IPO/MergeFunctions.cpp
index d530393c9a47fb62e830c342e1189b2c8129ef14..4ce4de13c93847924f837fb2064e40ccf84c2d61 100644
--- a/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/lib/Transforms/IPO/MergeFunctions.cpp
@@ -436,11 +436,11 @@ void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) {
       auto CallSiteAttrs = CS.getAttributes();
 
       CallSiteAttrs = CallSiteAttrs.addAttributes(
-          Context, AttributeSet::ReturnIndex, NewFuncAttrs.getRetAttributes());
+          Context, AttributeList::ReturnIndex, NewFuncAttrs.getRetAttributes());
 
       for (unsigned argIdx = 0; argIdx < CS.arg_size(); argIdx++) {
         AttributeSet Attrs = NewFuncAttrs.getParamAttributes(argIdx);
-        if (Attrs.getNumSlots())
+        if (Attrs.hasAttributes())
           CallSiteAttrs = CallSiteAttrs.addAttributes(Context, argIdx, Attrs);
       }
 
diff --git a/lib/Transforms/IPO/PassManagerBuilder.cpp b/lib/Transforms/IPO/PassManagerBuilder.cpp
index 3c862d55697722abcc39293c17287fce0244989b..6cc6d3b63f49c633ea99811c9b5c2cf60df164db 100644
--- a/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -168,6 +168,7 @@ PassManagerBuilder::PassManagerBuilder() {
     PGOInstrUse = RunPGOInstrUse;
     PrepareForThinLTO = EnablePrepareForThinLTO;
     PerformThinLTO = false;
+    DivergentTarget = false;
 }
 
 PassManagerBuilder::~PassManagerBuilder() {
@@ -244,7 +245,7 @@ void PassManagerBuilder::populateFunctionPassManager(
   FPM.add(createCFGSimplificationPass());
   FPM.add(createSROAPass());
   FPM.add(createEarlyCSEPass());
-  if(EnableGVNHoist)
+  if (EnableGVNHoist)
     FPM.add(createGVNHoistPass());
   FPM.add(createLowerExpectIntrinsicPass());
 }
@@ -301,13 +302,17 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
     MPM.add(createLibCallsShrinkWrapPass());
   addExtensionsToPM(EP_Peephole, MPM);
 
+  // Optimize memory intrinsic calls based on the profiled size information.
+  if (SizeLevel == 0)
+    MPM.add(createPGOMemOPSizeOptLegacyPass());
+
   MPM.add(createTailCallEliminationPass()); // Eliminate tail calls
   MPM.add(createCFGSimplificationPass());     // Merge & remove BBs
   MPM.add(createReassociatePass());           // Reassociate expressions
   // Rotate Loop - disable header duplication at -Oz
   MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1));
   MPM.add(createLICMPass());                  // Hoist loop invariants
-  MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3));
+  MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
   MPM.add(createCFGSimplificationPass());
   addInstructionCombiningPass(MPM);
   MPM.add(createIndVarSimplifyPass());        // Canonicalize indvars
@@ -320,7 +325,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
     MPM.add(createCFGSimplificationPass());
   }
   if (!DisableUnrollLoops)
-    MPM.add(createSimpleLoopUnrollPass());    // Unroll small loops
+    MPM.add(createSimpleLoopUnrollPass(OptLevel));    // Unroll small loops
   addExtensionsToPM(EP_LoopOptimizerEnd, MPM);
 
   if (OptLevel > 1) {
@@ -366,7 +371,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
 
       // BBVectorize may have significantly shortened a loop body; unroll again.
       if (!DisableUnrollLoops)
-        MPM.add(createLoopUnrollPass());
+        MPM.add(createLoopUnrollPass(OptLevel));
     }
   }
 
@@ -431,7 +436,16 @@ void PassManagerBuilder::populateModulePassManager(
   // earlier in the pass pipeline, here before globalopt. Otherwise imported
   // available_externally functions look unreferenced and are removed.
   if (PerformThinLTO)
-    MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true));
+    MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true,
+                                                     !PGOSampleUse.empty()));
+
+  // For SamplePGO in ThinLTO compile phase, we do not want to unroll loops
+  // as it will change the CFG too much and make the 2nd profile annotation
+  // in the backend more difficult.
+  bool PrepareForThinLTOUsingPGOSampleProfile =
+      PrepareForThinLTO && !PGOSampleUse.empty();
+  if (PrepareForThinLTOUsingPGOSampleProfile)
+    DisableUnrollLoops = true;
 
   if (!DisableUnitAtATime) {
     // Infer attributes about declarations if possible.
@@ -451,14 +465,18 @@ void PassManagerBuilder::populateModulePassManager(
     MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE
   }
 
-  if (!PerformThinLTO) {
+  // For SamplePGO in ThinLTO compile phase, we do not want to do indirect
+  // call promotion as it will change the CFG too much and make the 2nd
+  // profile annotation in the backend more difficult.
+  if (!PerformThinLTO && !PrepareForThinLTOUsingPGOSampleProfile) {
     /// PGO instrumentation is added during the compile phase for ThinLTO, do
     /// not run it a second time
     addPGOInstrPasses(MPM);
     // Indirect call promotion that promotes intra-module targets only.
     // For ThinLTO this is done earlier due to interactions with globalopt
     // for imported functions.
-    MPM.add(createPGOIndirectCallPromotionLegacyPass());
+    MPM.add(
+        createPGOIndirectCallPromotionLegacyPass(false, !PGOSampleUse.empty()));
   }
 
   if (EnableNonLTOGlobalsModRef)
@@ -586,7 +604,7 @@ void PassManagerBuilder::populateModulePassManager(
     MPM.add(createCorrelatedValuePropagationPass());
     addInstructionCombiningPass(MPM);
     MPM.add(createLICMPass());
-    MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3));
+    MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
     MPM.add(createCFGSimplificationPass());
     addInstructionCombiningPass(MPM);
   }
@@ -612,16 +630,16 @@ void PassManagerBuilder::populateModulePassManager(
 
       // BBVectorize may have significantly shortened a loop body; unroll again.
       if (!DisableUnrollLoops)
-        MPM.add(createLoopUnrollPass());
+        MPM.add(createLoopUnrollPass(OptLevel));
     }
   }
 
   addExtensionsToPM(EP_Peephole, MPM);
-  MPM.add(createCFGSimplificationPass());
+  MPM.add(createLateCFGSimplificationPass()); // Switches to lookup tables
   addInstructionCombiningPass(MPM);
 
   if (!DisableUnrollLoops) {
-    MPM.add(createLoopUnrollPass());    // Unroll small loops
+    MPM.add(createLoopUnrollPass(OptLevel));    // Unroll small loops
 
     // LoopUnroll may generate some redundency to cleanup.
     addInstructionCombiningPass(MPM);
@@ -681,7 +699,8 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
     // left by the earlier promotion pass that promotes intra-module targets.
     // This two-step promotion is to save the compile time. For LTO, it should
     // produce the same result as if we only do promotion here.
-    PM.add(createPGOIndirectCallPromotionLegacyPass(true));
+    PM.add(
+        createPGOIndirectCallPromotionLegacyPass(true, !PGOSampleUse.empty()));
 
     // Propagate constants at call sites into the functions they call.  This
     // opens opportunities for globalopt (and inlining) by substituting function
@@ -700,7 +719,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
   PM.add(createGlobalSplitPass());
 
   // Apply whole-program devirtualization and virtual constant propagation.
-  PM.add(createWholeProgramDevirtPass());
+  PM.add(createWholeProgramDevirtPass(ExportSummary, nullptr));
 
   // That's all we need at opt level 1.
   if (OptLevel == 1)
@@ -771,11 +790,11 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
     PM.add(createLoopInterchangePass());
 
   if (!DisableUnrollLoops)
-    PM.add(createSimpleLoopUnrollPass());   // Unroll small loops
+    PM.add(createSimpleLoopUnrollPass(OptLevel));   // Unroll small loops
   PM.add(createLoopVectorizePass(true, LoopVectorize));
   // The vectorizer may have significantly shortened a loop body; unroll again.
   if (!DisableUnrollLoops)
-    PM.add(createLoopUnrollPass());
+    PM.add(createLoopUnrollPass(OptLevel));
 
   // Now that we've optimized loops (in particular loop induction variables),
   // we may have exposed more scalar opportunities. Run parts of the scalar
@@ -829,9 +848,22 @@ void PassManagerBuilder::populateThinLTOPassManager(
   if (VerifyInput)
     PM.add(createVerifierPass());
 
-  if (Summary)
-    PM.add(
-        createLowerTypeTestsPass(LowerTypeTestsSummaryAction::Import, Summary));
+  if (ImportSummary) {
+    // These passes import type identifier resolutions for whole-program
+    // devirtualization and CFI. They must run early because other passes may
+    // disturb the specific instruction patterns that these passes look for,
+    // creating dependencies on resolutions that may not appear in the summary.
+    //
+    // For example, GVN may transform the pattern assume(type.test) appearing in
+    // two basic blocks into assume(phi(type.test, type.test)), which would
+    // transform a dependency on a WPD resolution into a dependency on a type
+    // identifier resolution for CFI.
+    //
+    // Also, WPD has access to more precise information than ICP and can
+    // devirtualize more effectively, so it should operate on the IR first.
+    PM.add(createWholeProgramDevirtPass(nullptr, ImportSummary));
+    PM.add(createLowerTypeTestsPass(nullptr, ImportSummary));
+  }
 
   populateModulePassManager(PM);
 
@@ -857,9 +889,7 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {
   // Lower type metadata and the type.test intrinsic. This pass supports Clang's
   // control flow integrity mechanisms (-fsanitize=cfi*) and needs to run at
   // link time if CFI is enabled. The pass does nothing if CFI is disabled.
-  PM.add(createLowerTypeTestsPass(Summary ? LowerTypeTestsSummaryAction::Export
-                                          : LowerTypeTestsSummaryAction::None,
-                                  Summary));
+  PM.add(createLowerTypeTestsPass(ExportSummary, nullptr));
 
   if (OptLevel != 0)
     addLateLTOOptimizationPasses(PM);
diff --git a/lib/Transforms/IPO/SampleProfile.cpp b/lib/Transforms/IPO/SampleProfile.cpp
index cba0a813c44de4d347b780a6fabc9a495752b022..bae08d9b9ee76d2c56682789313911486f8e8b12 100644
--- a/lib/Transforms/IPO/SampleProfile.cpp
+++ b/lib/Transforms/IPO/SampleProfile.cpp
@@ -52,6 +52,7 @@
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 #include <cctype>
 
@@ -162,7 +163,8 @@ protected:
   ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB);
   const FunctionSamples *findCalleeFunctionSamples(const Instruction &I) const;
   const FunctionSamples *findFunctionSamples(const Instruction &I) const;
-  bool inlineHotFunctions(Function &F);
+  bool inlineHotFunctions(Function &F,
+                          DenseSet<GlobalValue::GUID> &ImportGUIDs);
   void printEdgeWeight(raw_ostream &OS, Edge E);
   void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const;
   void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
@@ -175,7 +177,7 @@ protected:
   void buildEdges(Function &F);
   bool propagateThroughEdges(Function &F, bool UpdateBlockCount);
   void computeDominanceAndLoopInfo(Function &F);
-  unsigned getOffset(unsigned L, unsigned H) const;
+  unsigned getOffset(const DILocation *DIL) const;
   void clearFunctionData();
 
   /// \brief Map basic blocks to their computed weights.
@@ -400,15 +402,11 @@ void SampleProfileLoader::clearFunctionData() {
   CoverageTracker.clear();
 }
 
-/// \brief Returns the offset of lineno \p L to head_lineno \p H
-///
-/// \param L  Lineno
-/// \param H  Header lineno of the function
-///
-/// \returns offset to the header lineno. 16 bits are used to represent offset.
+/// Returns the line offset to the start line of the subprogram.
 /// We assume that a single function will not exceed 65535 LOC.
-unsigned SampleProfileLoader::getOffset(unsigned L, unsigned H) const {
-  return (L - H) & 0xffff;
+unsigned SampleProfileLoader::getOffset(const DILocation *DIL) const {
+  return (DIL->getLine() - DIL->getScope()->getSubprogram()->getLine()) &
+         0xffff;
 }
 
 /// \brief Print the weight of edge \p E on stream \p OS.
@@ -471,19 +469,14 @@ ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
   // If a call/invoke instruction is inlined in profile, but not inlined here,
   // it means that the inlined callsite has no sample, thus the call
   // instruction should have 0 count.
-  bool IsCall = isa<CallInst>(Inst) || isa<InvokeInst>(Inst);
-  if (IsCall && findCalleeFunctionSamples(Inst))
+  if ((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) &&
+      findCalleeFunctionSamples(Inst))
     return 0;
 
   const DILocation *DIL = DLoc;
-  unsigned Lineno = DLoc.getLine();
-  unsigned HeaderLineno = DIL->getScope()->getSubprogram()->getLine();
-
-  uint32_t LineOffset = getOffset(Lineno, HeaderLineno);
-  uint32_t Discriminator = DIL->getDiscriminator();
-  ErrorOr<uint64_t> R = IsCall
-                            ? FS->findCallSamplesAt(LineOffset, Discriminator)
-                            : FS->findSamplesAt(LineOffset, Discriminator);
+  uint32_t LineOffset = getOffset(DIL);
+  uint32_t Discriminator = DIL->getBaseDiscriminator();
+  ErrorOr<uint64_t> R = FS->findSamplesAt(LineOffset, Discriminator);
   if (R) {
     bool FirstMark =
         CoverageTracker.markSamplesUsed(FS, LineOffset, Discriminator, R.get());
@@ -496,9 +489,10 @@ ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
               " samples from profile (offset: " + Twine(LineOffset) +
               ((Discriminator) ? Twine(".") + Twine(Discriminator) : "") + ")");
     }
-    DEBUG(dbgs() << "    " << Lineno << "." << DIL->getDiscriminator() << ":"
-                 << Inst << " (line offset: " << Lineno - HeaderLineno << "."
-                 << DIL->getDiscriminator() << " - weight: " << R.get()
+    DEBUG(dbgs() << "    " << DLoc.getLine() << "."
+                 << DIL->getBaseDiscriminator() << ":" << Inst
+                 << " (line offset: " << LineOffset << "."
+                 << DIL->getBaseDiscriminator() << " - weight: " << R.get()
                  << ")\n");
   }
   return R;
@@ -565,16 +559,12 @@ SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const {
   if (!DIL) {
     return nullptr;
   }
-  DISubprogram *SP = DIL->getScope()->getSubprogram();
-  if (!SP)
-    return nullptr;
-
   const FunctionSamples *FS = findFunctionSamples(Inst);
   if (FS == nullptr)
     return nullptr;
 
-  return FS->findFunctionSamplesAt(LineLocation(
-      getOffset(DIL->getLine(), SP->getLine()), DIL->getDiscriminator()));
+  return FS->findFunctionSamplesAt(
+      LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()));
 }
 
 /// \brief Get the FunctionSamples for an instruction.
@@ -593,13 +583,8 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
   if (!DIL) {
     return Samples;
   }
-  for (DIL = DIL->getInlinedAt(); DIL; DIL = DIL->getInlinedAt()) {
-    DISubprogram *SP = DIL->getScope()->getSubprogram();
-    if (!SP)
-      return nullptr;
-    S.push_back(LineLocation(getOffset(DIL->getLine(), SP->getLine()),
-                             DIL->getDiscriminator()));
-  }
+  for (DIL = DIL->getInlinedAt(); DIL; DIL = DIL->getInlinedAt())
+    S.push_back(LineLocation(getOffset(DIL), DIL->getBaseDiscriminator()));
   if (S.size() == 0)
     return Samples;
   const FunctionSamples *FS = Samples;
@@ -614,14 +599,17 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
 /// Iteratively traverse all callsites of the function \p F, and find if
 /// the corresponding inlined instance exists and is hot in profile. If
 /// it is hot enough, inline the callsites and adds new callsites of the
-/// callee into the caller.
-///
-/// TODO: investigate the possibility of not invoking InlineFunction directly.
+/// callee into the caller. If the call is an indirect call, first promote
+/// it to a direct call. Each indirect call is limited to a single target.
 ///
 /// \param F function to perform iterative inlining.
+/// \param ImportGUIDs a set to be updated to include all GUIDs that come
+///     from a different module but were inlined in the profiled binary.
 ///
 /// \returns True if there is any inline happened.
-bool SampleProfileLoader::inlineHotFunctions(Function &F) {
+bool SampleProfileLoader::inlineHotFunctions(
+    Function &F, DenseSet<GlobalValue::GUID> &ImportGUIDs) {
+  DenseSet<Instruction *> PromotedInsns;
   bool Changed = false;
   LLVMContext &Ctx = F.getContext();
   std::function<AssumptionCache &(Function &)> GetAssumptionCache = [&](
@@ -647,18 +635,40 @@ bool SampleProfileLoader::inlineHotFunctions(Function &F) {
     }
     for (auto I : CIS) {
       InlineFunctionInfo IFI(nullptr, ACT ? &GetAssumptionCache : nullptr);
-      CallSite CS(I);
-      Function *CalledFunction = CS.getCalledFunction();
-      if (!CalledFunction || !CalledFunction->getSubprogram())
+      Function *CalledFunction = CallSite(I).getCalledFunction();
+      Instruction *DI = I;
+      if (!CalledFunction && !PromotedInsns.count(I) &&
+          CallSite(I).isIndirectCall()) {
+        auto CalleeFunctionName = findCalleeFunctionSamples(*I)->getName();
+        const char *Reason = "Callee function not available";
+        CalledFunction = F.getParent()->getFunction(CalleeFunctionName);
+        if (CalledFunction && isLegalToPromote(I, CalledFunction, &Reason)) {
+          // The indirect target was promoted and inlined in the profile, as a
+          // result, we do not have profile info for the branch probability.
+          // We set the probability to 80% taken to indicate that the static
+          // call is likely taken.
+          DI = dyn_cast<Instruction>(
+              promoteIndirectCall(I, CalledFunction, 80, 100, false)
+                  ->stripPointerCasts());
+          PromotedInsns.insert(I);
+        } else {
+          DEBUG(dbgs() << "\nFailed to promote indirect call to "
+                       << CalleeFunctionName << " because " << Reason << "\n");
+          continue;
+        }
+      }
+      if (!CalledFunction || !CalledFunction->getSubprogram()) {
+        findCalleeFunctionSamples(*I)->findImportedFunctions(
+            ImportGUIDs, F.getParent(),
+            Samples->getTotalSamples() * SampleProfileHotThreshold / 100);
         continue;
+      }
       DebugLoc DLoc = I->getDebugLoc();
-      uint64_t NumSamples = findCalleeFunctionSamples(*I)->getTotalSamples();
-      if (InlineFunction(CS, IFI)) {
+      if (InlineFunction(CallSite(DI), IFI)) {
         LocalChanged = true;
         emitOptimizationRemark(Ctx, DEBUG_TYPE, F, DLoc,
                                Twine("inlined hot callee '") +
-                                   CalledFunction->getName() + "' with " +
-                                   Twine(NumSamples) + " samples into '" +
+                                   CalledFunction->getName() + "' into '" +
                                    F.getName() + "'");
       }
     }
@@ -1035,10 +1045,6 @@ void SampleProfileLoader::propagateWeights(Function &F) {
   bool Changed = true;
   unsigned I = 0;
 
-  // Add an entry count to the function using the samples gathered
-  // at the function entry.
-  F.setEntryCount(Samples->getHeadSamples() + 1);
-
   // If BB weight is larger than its corresponding loop's header BB weight,
   // use the BB weight to replace the loop header BB weight.
   for (auto &BI : F) {
@@ -1099,9 +1105,8 @@ void SampleProfileLoader::propagateWeights(Function &F) {
           if (!DLoc)
             continue;
           const DILocation *DIL = DLoc;
-          uint32_t LineOffset = getOffset(
-              DLoc.getLine(), DIL->getScope()->getSubprogram()->getLine());
-          uint32_t Discriminator = DIL->getDiscriminator();
+          uint32_t LineOffset = getOffset(DIL);
+          uint32_t Discriminator = DIL->getBaseDiscriminator();
 
           const FunctionSamples *FS = findFunctionSamples(I);
           if (!FS)
@@ -1155,9 +1160,13 @@ void SampleProfileLoader::propagateWeights(Function &F) {
       }
     }
 
+    uint64_t TempWeight;
     // Only set weights if there is at least one non-zero weight.
     // In any other case, let the analyzer set weights.
-    if (MaxWeight > 0) {
+    // Do not set weights if the weights are present. In ThinLTO, the profile
+    // annotation is done twice. If the first annotation already set the
+    // weights, the second pass does not need to set it.
+    if (MaxWeight > 0 && !TI->extractProfTotalWeight(TempWeight)) {
       DEBUG(dbgs() << "SUCCESS. Found non-zero weights.\n");
       TI->setMetadata(llvm::LLVMContext::MD_prof,
                       MDB.createBranchWeights(Weights));
@@ -1268,12 +1277,19 @@ bool SampleProfileLoader::emitAnnotations(Function &F) {
   DEBUG(dbgs() << "Line number for the first instruction in " << F.getName()
                << ": " << getFunctionLoc(F) << "\n");
 
-  Changed |= inlineHotFunctions(F);
+  DenseSet<GlobalValue::GUID> ImportGUIDs;
+  Changed |= inlineHotFunctions(F, ImportGUIDs);
 
   // Compute basic block weights.
   Changed |= computeBlockWeights(F);
 
   if (Changed) {
+    // Add an entry count to the function using the samples gathered at the
+    // function entry. Also set the GUIDs that come from a different
+    // module but were inlined in the profiled binary. This aims to make
+    // the IR match the profiled binary before annotation.
+    F.setEntryCount(Samples->getHeadSamples() + 1, &ImportGUIDs);
+
     // Compute dominance and loop info needed for propagation.
     computeDominanceAndLoopInfo(F);
 
@@ -1369,7 +1385,7 @@ bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
 bool SampleProfileLoader::runOnFunction(Function &F) {
   F.setEntryCount(0);
   Samples = Reader->getSamplesFor(F);
-  if (!Samples->empty())
+  if (Samples && !Samples->empty())
     return emitAnnotations(F);
   return false;
 }
diff --git a/lib/Transforms/IPO/StripSymbols.cpp b/lib/Transforms/IPO/StripSymbols.cpp
index 8f6f161428e89daf665fa93a087379ee20cd5359..fb64367eef917c77ac9e2a7dfe0ff9a0d83e676d 100644
--- a/lib/Transforms/IPO/StripSymbols.cpp
+++ b/lib/Transforms/IPO/StripSymbols.cpp
@@ -323,6 +323,14 @@ bool StripDeadDebugInfo::runOnModule(Module &M) {
       LiveGVs.insert(GVE);
   }
 
+  std::set<DICompileUnit *> LiveCUs;
+  // Any CU referenced from a subprogram is live.
+  for (DISubprogram *SP : F.subprograms()) {
+    if (SP->getUnit())
+      LiveCUs.insert(SP->getUnit());
+  }
+
+  bool HasDeadCUs = false;
   for (DICompileUnit *DIC : F.compile_units()) {
     // Create our live global variable list.
     bool GlobalVariableChange = false;
@@ -341,6 +349,11 @@ bool StripDeadDebugInfo::runOnModule(Module &M) {
         GlobalVariableChange = true;
     }
 
+    if (!LiveGlobalVariables.empty())
+      LiveCUs.insert(DIC);
+    else if (!LiveCUs.count(DIC))
+      HasDeadCUs = true;
+
     // If we found dead global variables, replace the current global
     // variable list with our new live global variable list.
     if (GlobalVariableChange) {
@@ -352,5 +365,16 @@ bool StripDeadDebugInfo::runOnModule(Module &M) {
     LiveGlobalVariables.clear();
   }
 
+  if (HasDeadCUs) {
+    // Delete the old node and replace it with a new one
+    NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
+    NMD->clearOperands();
+    if (!LiveCUs.empty()) {
+      for (DICompileUnit *CU : LiveCUs)
+        NMD->addOperand(CU);
+    }
+    Changed = true;
+  }
+
   return Changed;
 }
diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index 6ec69e32198012f10103b77964dac4184dbf139c..65deb82cd2a5fbd11a13df6f5d9321c08ced3561 100644
--- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -14,16 +14,21 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Transforms/IPO.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
 #include "llvm/Analysis/TypeMetadataUtils.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/ScopedPrinter.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
 #include "llvm/Transforms/Utils/Cloning.h"
 using namespace llvm;
 
@@ -41,23 +46,14 @@ namespace {
 std::string getModuleId(Module *M) {
   MD5 Md5;
   bool ExportsSymbols = false;
-  auto AddGlobal = [&](GlobalValue &GV) {
+  for (auto &GV : M->global_values()) {
     if (GV.isDeclaration() || GV.getName().startswith("llvm.") ||
         !GV.hasExternalLinkage())
-      return;
+      continue;
     ExportsSymbols = true;
     Md5.update(GV.getName());
     Md5.update(ArrayRef<uint8_t>{0});
-  };
-
-  for (auto &F : *M)
-    AddGlobal(F);
-  for (auto &GV : M->globals())
-    AddGlobal(GV);
-  for (auto &GA : M->aliases())
-    AddGlobal(GA);
-  for (auto &IF : M->ifuncs())
-    AddGlobal(IF);
+  }
 
   if (!ExportsSymbols)
     return "";
@@ -73,15 +69,21 @@ std::string getModuleId(Module *M) {
 // Promote each local-linkage entity defined by ExportM and used by ImportM by
 // changing visibility and appending the given ModuleId.
 void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId) {
-  auto PromoteInternal = [&](GlobalValue &ExportGV) {
+  DenseMap<const Comdat *, Comdat *> RenamedComdats;
+  for (auto &ExportGV : ExportM.global_values()) {
     if (!ExportGV.hasLocalLinkage())
-      return;
+      continue;
 
-    GlobalValue *ImportGV = ImportM.getNamedValue(ExportGV.getName());
+    auto Name = ExportGV.getName();
+    GlobalValue *ImportGV = ImportM.getNamedValue(Name);
     if (!ImportGV || ImportGV->use_empty())
-      return;
+      continue;
+
+    std::string NewName = (Name + ModuleId).str();
 
-    std::string NewName = (ExportGV.getName() + ModuleId).str();
+    if (const auto *C = ExportGV.getComdat())
+      if (C->getName() == Name)
+        RenamedComdats.try_emplace(C, ExportM.getOrInsertComdat(NewName));
 
     ExportGV.setName(NewName);
     ExportGV.setLinkage(GlobalValue::ExternalLinkage);
@@ -89,16 +91,15 @@ void promoteInternals(Module &ExportM, Module &ImportM, StringRef ModuleId) {
 
     ImportGV->setName(NewName);
     ImportGV->setVisibility(GlobalValue::HiddenVisibility);
-  };
+  }
 
-  for (auto &F : ExportM)
-    PromoteInternal(F);
-  for (auto &GV : ExportM.globals())
-    PromoteInternal(GV);
-  for (auto &GA : ExportM.aliases())
-    PromoteInternal(GA);
-  for (auto &IF : ExportM.ifuncs())
-    PromoteInternal(IF);
+  if (!RenamedComdats.empty())
+    for (auto &GO : ExportM.global_objects())
+      if (auto *C = GO.getComdat()) {
+        auto Replacement = RenamedComdats.find(C);
+        if (Replacement != RenamedComdats.end())
+          GO.setComdat(Replacement->second);
+      }
 }
 
 // Promote all internal (i.e. distinct) type ids used by the module by replacing
@@ -195,6 +196,26 @@ void simplifyExternals(Module &M) {
 
 void filterModule(
     Module *M, function_ref<bool(const GlobalValue *)> ShouldKeepDefinition) {
+  for (Module::alias_iterator I = M->alias_begin(), E = M->alias_end();
+       I != E;) {
+    GlobalAlias *GA = &*I++;
+    if (ShouldKeepDefinition(GA))
+      continue;
+
+    GlobalObject *GO;
+    if (GA->getValueType()->isFunctionTy())
+      GO = Function::Create(cast<FunctionType>(GA->getValueType()),
+                            GlobalValue::ExternalLinkage, "", M);
+    else
+      GO = new GlobalVariable(
+          *M, GA->getValueType(), false, GlobalValue::ExternalLinkage,
+          (Constant *)nullptr, "", (GlobalVariable *)nullptr,
+          GA->getThreadLocalMode(), GA->getType()->getAddressSpace());
+    GO->takeName(GA);
+    GA->replaceAllUsesWith(GO);
+    GA->eraseFromParent();
+  }
+
   for (Function &F : *M) {
     if (ShouldKeepDefinition(&F))
       continue;
@@ -213,73 +234,149 @@ void filterModule(
     GV.setComdat(nullptr);
     GV.clearMetadata();
   }
+}
 
-  for (Module::alias_iterator I = M->alias_begin(), E = M->alias_end();
-       I != E;) {
-    GlobalAlias *GA = &*I++;
-    if (ShouldKeepDefinition(GA))
-      continue;
-
-    GlobalObject *GO;
-    if (I->getValueType()->isFunctionTy())
-      GO = Function::Create(cast<FunctionType>(GA->getValueType()),
-                            GlobalValue::ExternalLinkage, "", M);
-    else
-      GO = new GlobalVariable(
-          *M, GA->getValueType(), false, GlobalValue::ExternalLinkage,
-          (Constant *)nullptr, "", (GlobalVariable *)nullptr,
-          GA->getThreadLocalMode(), GA->getType()->getAddressSpace());
-    GO->takeName(GA);
-    GA->replaceAllUsesWith(GO);
-    GA->eraseFromParent();
-  }
+void forEachVirtualFunction(Constant *C, function_ref<void(Function *)> Fn) {
+  if (auto *F = dyn_cast<Function>(C))
+    return Fn(F);
+  if (isa<GlobalValue>(C))
+    return;
+  for (Value *Op : C->operands())
+    forEachVirtualFunction(cast<Constant>(Op), Fn);
 }
 
 // If it's possible to split M into regular and thin LTO parts, do so and write
 // a multi-module bitcode file with the two parts to OS. Otherwise, write only a
 // regular LTO bitcode file to OS.
-void splitAndWriteThinLTOBitcode(raw_ostream &OS, Module &M) {
+void splitAndWriteThinLTOBitcode(
+    raw_ostream &OS, raw_ostream *ThinLinkOS,
+    function_ref<AAResults &(Function &)> AARGetter, Module &M) {
   std::string ModuleId = getModuleId(&M);
   if (ModuleId.empty()) {
     // We couldn't generate a module ID for this module, just write it out as a
     // regular LTO module.
     WriteBitcodeToFile(&M, OS);
+    if (ThinLinkOS)
+      // We don't have a ThinLTO part, but still write the module to the
+      // ThinLinkOS if requested so that the expected output file is produced.
+      WriteBitcodeToFile(&M, *ThinLinkOS);
     return;
   }
 
   promoteTypeIds(M, ModuleId);
 
-  auto IsInMergedM = [&](const GlobalValue *GV) {
-    auto *GVar = dyn_cast<GlobalVariable>(GV->getBaseObject());
-    if (!GVar)
-      return false;
-
+  // Returns whether a global has attached type metadata. Such globals may
+  // participate in CFI or whole-program devirtualization, so they need to
+  // appear in the merged module instead of the thin LTO module.
+  auto HasTypeMetadata = [&](const GlobalObject *GO) {
     SmallVector<MDNode *, 1> MDs;
-    GVar->getMetadata(LLVMContext::MD_type, MDs);
+    GO->getMetadata(LLVMContext::MD_type, MDs);
     return !MDs.empty();
   };
 
+  // Collect the set of virtual functions that are eligible for virtual constant
+  // propagation. Each eligible function must not access memory, must return
+  // an integer of width <=64 bits, must take at least one argument, must not
+  // use its first argument (assumed to be "this") and all arguments other than
+  // the first one must be of <=64 bit integer type.
+  //
+  // Note that we test whether this copy of the function is readnone, rather
+  // than testing function attributes, which must hold for any copy of the
+  // function, even a less optimized version substituted at link time. This is
+  // sound because the virtual constant propagation optimizations effectively
+  // inline all implementations of the virtual function into each call site,
+  // rather than using function attributes to perform local optimization.
+  std::set<const Function *> EligibleVirtualFns;
+  // If any member of a comdat lives in MergedM, put all members of that
+  // comdat in MergedM to keep the comdat together.
+  DenseSet<const Comdat *> MergedMComdats;
+  for (GlobalVariable &GV : M.globals())
+    if (HasTypeMetadata(&GV)) {
+      if (const auto *C = GV.getComdat())
+        MergedMComdats.insert(C);
+      forEachVirtualFunction(GV.getInitializer(), [&](Function *F) {
+        auto *RT = dyn_cast<IntegerType>(F->getReturnType());
+        if (!RT || RT->getBitWidth() > 64 || F->arg_empty() ||
+            !F->arg_begin()->use_empty())
+          return;
+        for (auto &Arg : make_range(std::next(F->arg_begin()), F->arg_end())) {
+          auto *ArgT = dyn_cast<IntegerType>(Arg.getType());
+          if (!ArgT || ArgT->getBitWidth() > 64)
+            return;
+        }
+        if (computeFunctionBodyMemoryAccess(*F, AARGetter(*F)) == MAK_ReadNone)
+          EligibleVirtualFns.insert(F);
+      });
+    }
+
   ValueToValueMapTy VMap;
-  std::unique_ptr<Module> MergedM(CloneModule(&M, VMap, IsInMergedM));
+  std::unique_ptr<Module> MergedM(
+      CloneModule(&M, VMap, [&](const GlobalValue *GV) -> bool {
+        if (const auto *C = GV->getComdat())
+          if (MergedMComdats.count(C))
+            return true;
+        if (auto *F = dyn_cast<Function>(GV))
+          return EligibleVirtualFns.count(F);
+        if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject()))
+          return HasTypeMetadata(GVar);
+        return false;
+      }));
+  StripDebugInfo(*MergedM);
+
+  for (Function &F : *MergedM)
+    if (!F.isDeclaration()) {
+      // Reset the linkage of all functions eligible for virtual constant
+      // propagation. The canonical definitions live in the thin LTO module so
+      // that they can be imported.
+      F.setLinkage(GlobalValue::AvailableExternallyLinkage);
+      F.setComdat(nullptr);
+    }
 
-  filterModule(&M, [&](const GlobalValue *GV) { return !IsInMergedM(GV); });
+  // Remove all globals with type metadata, globals with comdats that live in
+  // MergedM, and aliases pointing to such globals from the thin LTO module.
+  filterModule(&M, [&](const GlobalValue *GV) {
+    if (auto *GVar = dyn_cast_or_null<GlobalVariable>(GV->getBaseObject()))
+      if (HasTypeMetadata(GVar))
+        return false;
+    if (const auto *C = GV->getComdat())
+      if (MergedMComdats.count(C))
+        return false;
+    return true;
+  });
 
   promoteInternals(*MergedM, M, ModuleId);
   promoteInternals(M, *MergedM, ModuleId);
 
   simplifyExternals(*MergedM);
 
-  SmallVector<char, 0> Buffer;
-  BitcodeWriter W(Buffer);
 
   // FIXME: Try to re-use BSI and PFI from the original module here.
   ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, nullptr);
-  W.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index,
-                /*GenerateHash=*/true);
 
-  W.writeModule(MergedM.get());
+  SmallVector<char, 0> Buffer;
 
+  BitcodeWriter W(Buffer);
+  // Save the module hash produced for the full bitcode, which will
+  // be used in the backends, and use that in the minimized bitcode
+  // produced for the thin link.
+  ModuleHash ModHash = {{0}};
+  W.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index,
+                /*GenerateHash=*/true, &ModHash);
+  W.writeModule(MergedM.get());
   OS << Buffer;
+
+  // If a minimized bitcode module was requested for the thin link,
+  // strip the debug info (the merged module was already stripped above)
+  // and write it to the given OS.
+  if (ThinLinkOS) {
+    Buffer.clear();
+    BitcodeWriter W2(Buffer);
+    StripDebugInfo(M);
+    W2.writeModule(&M, /*ShouldPreserveUseListOrder=*/false, &Index,
+                   /*GenerateHash=*/false, &ModHash);
+    W2.writeModule(MergedM.get());
+    *ThinLinkOS << Buffer;
+  }
 }
 
 // Returns whether this module needs to be split because it uses type metadata.
@@ -294,28 +391,45 @@ bool requiresSplit(Module &M) {
   return false;
 }
 
-void writeThinLTOBitcode(raw_ostream &OS, Module &M,
-                         const ModuleSummaryIndex *Index) {
+void writeThinLTOBitcode(raw_ostream &OS, raw_ostream *ThinLinkOS,
+                         function_ref<AAResults &(Function &)> AARGetter,
+                         Module &M, const ModuleSummaryIndex *Index) {
   // See if this module has any type metadata. If so, we need to split it.
   if (requiresSplit(M))
-    return splitAndWriteThinLTOBitcode(OS, M);
+    return splitAndWriteThinLTOBitcode(OS, ThinLinkOS, AARGetter, M);
 
   // Otherwise we can just write it out as a regular module.
+
+  // Save the module hash produced for the full bitcode, which will
+  // be used in the backends, and use that in the minimized bitcode
+  // produced for the thin link.
+  ModuleHash ModHash = {{0}};
   WriteBitcodeToFile(&M, OS, /*ShouldPreserveUseListOrder=*/false, Index,
-                     /*GenerateHash=*/true);
+                     /*GenerateHash=*/true, &ModHash);
+  // If a minimized bitcode module was requested for the thin link,
+  // strip the debug info and write it to the given OS.
+  if (ThinLinkOS) {
+    StripDebugInfo(M);
+    WriteBitcodeToFile(&M, *ThinLinkOS, /*ShouldPreserveUseListOrder=*/false,
+                       Index,
+                       /*GenerateHash=*/false, &ModHash);
+  }
 }
 
 class WriteThinLTOBitcode : public ModulePass {
   raw_ostream &OS; // raw_ostream to print on
+  // The output stream on which to emit a minimized module for use
+  // just in the thin link, if requested.
+  raw_ostream *ThinLinkOS;
 
 public:
   static char ID; // Pass identification, replacement for typeid
-  WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()) {
+  WriteThinLTOBitcode() : ModulePass(ID), OS(dbgs()), ThinLinkOS(nullptr) {
     initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry());
   }
 
-  explicit WriteThinLTOBitcode(raw_ostream &o)
-      : ModulePass(ID), OS(o) {
+  explicit WriteThinLTOBitcode(raw_ostream &o, raw_ostream *ThinLinkOS)
+      : ModulePass(ID), OS(o), ThinLinkOS(ThinLinkOS) {
     initializeWriteThinLTOBitcodePass(*PassRegistry::getPassRegistry());
   }
 
@@ -324,12 +438,14 @@ public:
   bool runOnModule(Module &M) override {
     const ModuleSummaryIndex *Index =
         &(getAnalysis<ModuleSummaryIndexWrapperPass>().getIndex());
-    writeThinLTOBitcode(OS, M, Index);
+    writeThinLTOBitcode(OS, ThinLinkOS, LegacyAARGetter(*this), M, Index);
     return true;
   }
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesAll();
+    AU.addRequired<AssumptionCacheTracker>();
     AU.addRequired<ModuleSummaryIndexWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
   }
 };
 } // anonymous namespace
@@ -337,10 +453,13 @@ public:
 char WriteThinLTOBitcode::ID = 0;
 INITIALIZE_PASS_BEGIN(WriteThinLTOBitcode, "write-thinlto-bitcode",
                       "Write ThinLTO Bitcode", false, true)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(ModuleSummaryIndexWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
 INITIALIZE_PASS_END(WriteThinLTOBitcode, "write-thinlto-bitcode",
                     "Write ThinLTO Bitcode", false, true)
 
-ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str) {
-  return new WriteThinLTOBitcode(Str);
+ModulePass *llvm::createWriteThinLTOBitcodePass(raw_ostream &Str,
+                                                raw_ostream *ThinLinkOS) {
+  return new WriteThinLTOBitcode(Str, ThinLinkOS);
 }
diff --git a/lib/Transforms/IPO/WholeProgramDevirt.cpp b/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 844cc0f70eed653ceb67d32e954af15c7e4b3005..cb7d487b68b0ba55c86d33f80077993dce87eb05 100644
--- a/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -25,6 +25,20 @@
 //   returns 0, or a single vtable's function returns 1, replace each virtual
 //   call with a comparison of the vptr against that vtable's address.
 //
+// This pass is intended to be used during the regular and thin LTO pipelines.
+// During regular LTO, the pass determines the best optimization for each
+// virtual call and applies the resolutions directly to virtual calls that are
+// eligible for virtual call optimization (i.e. calls that use either of the
+// llvm.assume(llvm.type.test) or llvm.type.checked.load intrinsics). During
+// ThinLTO, the pass operates in two phases:
+// - Export phase: this is run during the thin link over a single merged module
+//   that contains all vtables with !type metadata that participate in the link.
+//   The pass computes a resolution for each virtual call and stores it in the
+//   type identifier summary.
+// - Import phase: this is run during the thin backends over the individual
+//   modules. The pass applies the resolutions previously computed during the
+//   export phase to each eligible virtual call.
+//
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/IPO/WholeProgramDevirt.h"
@@ -35,6 +49,8 @@
 #include "llvm/ADT/iterator_range.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/TypeMetadataUtils.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
@@ -54,12 +70,16 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndexYAML.h"
 #include "llvm/Pass.h"
 #include "llvm/PassRegistry.h"
 #include "llvm/PassSupport.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MathExtras.h"
 #include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/FunctionAttrs.h"
 #include "llvm/Transforms/Utils/Evaluator.h"
 #include <algorithm>
 #include <cstddef>
@@ -72,6 +92,26 @@ using namespace wholeprogramdevirt;
 
 #define DEBUG_TYPE "wholeprogramdevirt"
 
+static cl::opt<PassSummaryAction> ClSummaryAction(
+    "wholeprogramdevirt-summary-action",
+    cl::desc("What to do with the summary when running this pass"),
+    cl::values(clEnumValN(PassSummaryAction::None, "none", "Do nothing"),
+               clEnumValN(PassSummaryAction::Import, "import",
+                          "Import typeid resolutions from summary and globals"),
+               clEnumValN(PassSummaryAction::Export, "export",
+                          "Export typeid resolutions to summary and globals")),
+    cl::Hidden);
+
+static cl::opt<std::string> ClReadSummary(
+    "wholeprogramdevirt-read-summary",
+    cl::desc("Read summary from given YAML file before running pass"),
+    cl::Hidden);
+
+static cl::opt<std::string> ClWriteSummary(
+    "wholeprogramdevirt-write-summary",
+    cl::desc("Write summary to given YAML file after running pass"),
+    cl::Hidden);
+
 // Find the minimum offset that we may store a value of size Size bits at. If
 // IsAfter is set, look for an offset before the object, otherwise look for an
 // offset after the object.
@@ -259,15 +299,92 @@ struct VirtualCallSite {
   }
 };
 
+// Call site information collected for a specific VTableSlot and possibly a list
+// of constant integer arguments. The grouping by arguments is handled by the
+// VTableSlotInfo class.
+struct CallSiteInfo {
+  /// The set of call sites for this slot. Used during regular LTO and the
+  /// import phase of ThinLTO (as well as the export phase of ThinLTO for any
+  /// call sites that appear in the merged module itself); in each of these
+  /// cases we are directly operating on the call sites at the IR level.
+  std::vector<VirtualCallSite> CallSites;
+
+  // These fields are used during the export phase of ThinLTO and reflect
+  // information collected from function summaries.
+
+  /// Whether any function summary contains an llvm.assume(llvm.type.test) for
+  /// this slot.
+  bool SummaryHasTypeTestAssumeUsers;
+
+  /// CFI-specific: a vector containing the list of function summaries that use
+  /// the llvm.type.checked.load intrinsic and therefore will require
+  /// resolutions for llvm.type.test in order to implement CFI checks if
+  /// devirtualization was unsuccessful. If devirtualization was successful, the
+  /// pass will clear this vector by calling markDevirt(). If at the end of the
+  /// pass the vector is non-empty, we will need to add a use of llvm.type.test
+  /// to each of the function summaries in the vector.
+  std::vector<FunctionSummary *> SummaryTypeCheckedLoadUsers;
+
+  bool isExported() const {
+    return SummaryHasTypeTestAssumeUsers ||
+           !SummaryTypeCheckedLoadUsers.empty();
+  }
+
+  /// As explained in the comment for SummaryTypeCheckedLoadUsers.
+  void markDevirt() { SummaryTypeCheckedLoadUsers.clear(); }
+};
+
+// Call site information collected for a specific VTableSlot.
+struct VTableSlotInfo {
+  // The set of call sites which do not have all constant integer arguments
+  // (excluding "this").
+  CallSiteInfo CSInfo;
+
+  // The set of call sites with all constant integer arguments (excluding
+  // "this"), grouped by argument list.
+  std::map<std::vector<uint64_t>, CallSiteInfo> ConstCSInfo;
+
+  void addCallSite(Value *VTable, CallSite CS, unsigned *NumUnsafeUses);
+
+private:
+  CallSiteInfo &findCallSiteInfo(CallSite CS);
+};
+
+CallSiteInfo &VTableSlotInfo::findCallSiteInfo(CallSite CS) {
+  std::vector<uint64_t> Args;
+  auto *CI = dyn_cast<IntegerType>(CS.getType());
+  if (!CI || CI->getBitWidth() > 64 || CS.arg_empty())
+    return CSInfo;
+  for (auto &&Arg : make_range(CS.arg_begin() + 1, CS.arg_end())) {
+    auto *CI = dyn_cast<ConstantInt>(Arg);
+    if (!CI || CI->getBitWidth() > 64)
+      return CSInfo;
+    Args.push_back(CI->getZExtValue());
+  }
+  return ConstCSInfo[Args];
+}
+
+void VTableSlotInfo::addCallSite(Value *VTable, CallSite CS,
+                                 unsigned *NumUnsafeUses) {
+  findCallSiteInfo(CS).CallSites.push_back({VTable, CS, NumUnsafeUses});
+}
+
 struct DevirtModule {
   Module &M;
+  function_ref<AAResults &(Function &)> AARGetter;
+
+  ModuleSummaryIndex *ExportSummary;
+  const ModuleSummaryIndex *ImportSummary;
+
   IntegerType *Int8Ty;
   PointerType *Int8PtrTy;
   IntegerType *Int32Ty;
+  IntegerType *Int64Ty;
+  IntegerType *IntPtrTy;
 
   bool RemarksEnabled;
 
-  MapVector<VTableSlot, std::vector<VirtualCallSite>> CallSlots;
+  MapVector<VTableSlot, VTableSlotInfo> CallSlots;
 
   // This map keeps track of the number of "unsafe" uses of a loaded function
   // pointer. The key is the associated llvm.type.test intrinsic call generated
@@ -279,11 +396,18 @@ struct DevirtModule {
   // true.
   std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest;
 
-  DevirtModule(Module &M)
-      : M(M), Int8Ty(Type::getInt8Ty(M.getContext())),
+  DevirtModule(Module &M, function_ref<AAResults &(Function &)> AARGetter,
+               ModuleSummaryIndex *ExportSummary,
+               const ModuleSummaryIndex *ImportSummary)
+      : M(M), AARGetter(AARGetter), ExportSummary(ExportSummary),
+        ImportSummary(ImportSummary), Int8Ty(Type::getInt8Ty(M.getContext())),
         Int8PtrTy(Type::getInt8PtrTy(M.getContext())),
         Int32Ty(Type::getInt32Ty(M.getContext())),
-        RemarksEnabled(areRemarksEnabled()) {}
+        Int64Ty(Type::getInt64Ty(M.getContext())),
+        IntPtrTy(M.getDataLayout().getIntPtrType(M.getContext(), 0)),
+        RemarksEnabled(areRemarksEnabled()) {
+    assert(!(ExportSummary && ImportSummary));
+  }
 
   bool areRemarksEnabled();
 
@@ -298,57 +422,169 @@ struct DevirtModule {
   tryFindVirtualCallTargets(std::vector<VirtualCallTarget> &TargetsForSlot,
                             const std::set<TypeMemberInfo> &TypeMemberInfos,
                             uint64_t ByteOffset);
+
+  void applySingleImplDevirt(VTableSlotInfo &SlotInfo, Constant *TheFn,
+                             bool &IsExported);
   bool trySingleImplDevirt(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
-                           MutableArrayRef<VirtualCallSite> CallSites);
+                           VTableSlotInfo &SlotInfo,
+                           WholeProgramDevirtResolution *Res);
+
   bool tryEvaluateFunctionsWithArgs(
       MutableArrayRef<VirtualCallTarget> TargetsForSlot,
-      ArrayRef<ConstantInt *> Args);
-  bool tryUniformRetValOpt(IntegerType *RetType,
-                           MutableArrayRef<VirtualCallTarget> TargetsForSlot,
-                           MutableArrayRef<VirtualCallSite> CallSites);
+      ArrayRef<uint64_t> Args);
+
+  void applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
+                             uint64_t TheRetVal);
+  bool tryUniformRetValOpt(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
+                           CallSiteInfo &CSInfo,
+                           WholeProgramDevirtResolution::ByArg *Res);
+
+  // Returns the global symbol name that is used to export information about the
+  // given vtable slot and list of arguments.
+  std::string getGlobalName(VTableSlot Slot, ArrayRef<uint64_t> Args,
+                            StringRef Name);
+
+  // This function is called during the export phase to create a symbol
+  // definition containing information about the given vtable slot and list of
+  // arguments.
+  void exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args, StringRef Name,
+                    Constant *C);
+
+  // This function is called during the import phase to create a reference to
+  // the symbol definition created during the export phase.
+  Constant *importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
+                         StringRef Name, unsigned AbsWidth = 0);
+
+  void applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName, bool IsOne,
+                            Constant *UniqueMemberAddr);
   bool tryUniqueRetValOpt(unsigned BitWidth,
                           MutableArrayRef<VirtualCallTarget> TargetsForSlot,
-                          MutableArrayRef<VirtualCallSite> CallSites);
+                          CallSiteInfo &CSInfo,
+                          WholeProgramDevirtResolution::ByArg *Res,
+                          VTableSlot Slot, ArrayRef<uint64_t> Args);
+
+  void applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName,
+                             Constant *Byte, Constant *Bit);
   bool tryVirtualConstProp(MutableArrayRef<VirtualCallTarget> TargetsForSlot,
-                           ArrayRef<VirtualCallSite> CallSites);
+                           VTableSlotInfo &SlotInfo,
+                           WholeProgramDevirtResolution *Res, VTableSlot Slot);
 
   void rebuildGlobal(VTableBits &B);
 
+  // Apply the summary resolution for Slot to all virtual calls in SlotInfo.
+  void importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo);
+
+  // If we were able to eliminate all unsafe uses for a type checked load,
+  // eliminate the associated type tests by replacing them with true.
+  void removeRedundantTypeTests();
+
   bool run();
+
+  // Lower the module using the action and summary passed as command line
+  // arguments. For testing purposes only.
+  static bool runForTesting(Module &M,
+                            function_ref<AAResults &(Function &)> AARGetter);
 };
 
 struct WholeProgramDevirt : public ModulePass {
   static char ID;
 
-  WholeProgramDevirt() : ModulePass(ID) {
+  bool UseCommandLine = false;
+
+  ModuleSummaryIndex *ExportSummary;
+  const ModuleSummaryIndex *ImportSummary;
+
+  WholeProgramDevirt() : ModulePass(ID), UseCommandLine(true) {
+    initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
+  }
+
+  WholeProgramDevirt(ModuleSummaryIndex *ExportSummary,
+                     const ModuleSummaryIndex *ImportSummary)
+      : ModulePass(ID), ExportSummary(ExportSummary),
+        ImportSummary(ImportSummary) {
     initializeWholeProgramDevirtPass(*PassRegistry::getPassRegistry());
   }
 
   bool runOnModule(Module &M) override {
     if (skipModule(M))
       return false;
+    if (UseCommandLine)
+      return DevirtModule::runForTesting(M, LegacyAARGetter(*this));
+    return DevirtModule(M, LegacyAARGetter(*this), ExportSummary, ImportSummary)
+        .run();
+  }
 
-    return DevirtModule(M).run();
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
   }
 };
 
 } // end anonymous namespace
 
-INITIALIZE_PASS(WholeProgramDevirt, "wholeprogramdevirt",
-                "Whole program devirtualization", false, false)
+INITIALIZE_PASS_BEGIN(WholeProgramDevirt, "wholeprogramdevirt",
+                      "Whole program devirtualization", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(WholeProgramDevirt, "wholeprogramdevirt",
+                    "Whole program devirtualization", false, false)
 char WholeProgramDevirt::ID = 0;
 
-ModulePass *llvm::createWholeProgramDevirtPass() {
-  return new WholeProgramDevirt;
+ModulePass *
+llvm::createWholeProgramDevirtPass(ModuleSummaryIndex *ExportSummary,
+                                   const ModuleSummaryIndex *ImportSummary) {
+  return new WholeProgramDevirt(ExportSummary, ImportSummary);
 }
 
 PreservedAnalyses WholeProgramDevirtPass::run(Module &M,
-                                              ModuleAnalysisManager &) {
-  if (!DevirtModule(M).run())
+                                              ModuleAnalysisManager &AM) {
+  auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+  auto AARGetter = [&](Function &F) -> AAResults & {
+    return FAM.getResult<AAManager>(F);
+  };
+  if (!DevirtModule(M, AARGetter, nullptr, nullptr).run())
     return PreservedAnalyses::all();
   return PreservedAnalyses::none();
 }
 
+bool DevirtModule::runForTesting(
+    Module &M, function_ref<AAResults &(Function &)> AARGetter) {
+  ModuleSummaryIndex Summary;
+
+  // Handle the command-line summary arguments. This code is for testing
+  // purposes only, so we handle errors directly.
+  if (!ClReadSummary.empty()) {
+    ExitOnError ExitOnErr("-wholeprogramdevirt-read-summary: " + ClReadSummary +
+                          ": ");
+    auto ReadSummaryFile =
+        ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
+
+    yaml::Input In(ReadSummaryFile->getBuffer());
+    In >> Summary;
+    ExitOnErr(errorCodeToError(In.error()));
+  }
+
+  bool Changed =
+      DevirtModule(
+          M, AARGetter,
+          ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr,
+          ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr)
+          .run();
+
+  if (!ClWriteSummary.empty()) {
+    ExitOnError ExitOnErr(
+        "-wholeprogramdevirt-write-summary: " + ClWriteSummary + ": ");
+    std::error_code EC;
+    raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::F_Text);
+    ExitOnErr(errorCodeToError(EC));
+
+    yaml::Output Out(OS);
+    Out << Summary;
+  }
+
+  return Changed;
+}
+
 void DevirtModule::buildTypeIdentifierMap(
     std::vector<VTableBits> &Bits,
     DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap) {
@@ -443,9 +679,31 @@ bool DevirtModule::tryFindVirtualCallTargets(
   return !TargetsForSlot.empty();
 }
 
+void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo,
+                                         Constant *TheFn, bool &IsExported) {
+  auto Apply = [&](CallSiteInfo &CSInfo) {
+    for (auto &&VCallSite : CSInfo.CallSites) {
+      if (RemarksEnabled)
+        VCallSite.emitRemark("single-impl", TheFn->getName());
+      VCallSite.CS.setCalledFunction(ConstantExpr::getBitCast(
+          TheFn, VCallSite.CS.getCalledValue()->getType()));
+      // This use is no longer unsafe.
+      if (VCallSite.NumUnsafeUses)
+        --*VCallSite.NumUnsafeUses;
+    }
+    if (CSInfo.isExported()) {
+      IsExported = true;
+      CSInfo.markDevirt();
+    }
+  };
+  Apply(SlotInfo.CSInfo);
+  for (auto &P : SlotInfo.ConstCSInfo)
+    Apply(P.second);
+}
+
 bool DevirtModule::trySingleImplDevirt(
     MutableArrayRef<VirtualCallTarget> TargetsForSlot,
-    MutableArrayRef<VirtualCallSite> CallSites) {
+    VTableSlotInfo &SlotInfo, WholeProgramDevirtResolution *Res) {
   // See if the program contains a single implementation of this virtual
   // function.
   Function *TheFn = TargetsForSlot[0].Fn;
@@ -453,39 +711,51 @@ bool DevirtModule::trySingleImplDevirt(
     if (TheFn != Target.Fn)
       return false;
 
+  // If so, update each call site to call that implementation directly.
   if (RemarksEnabled)
     TargetsForSlot[0].WasDevirt = true;
-  // If so, update each call site to call that implementation directly.
-  for (auto &&VCallSite : CallSites) {
-    if (RemarksEnabled)
-      VCallSite.emitRemark("single-impl", TheFn->getName());
-    VCallSite.CS.setCalledFunction(ConstantExpr::getBitCast(
-        TheFn, VCallSite.CS.getCalledValue()->getType()));
-    // This use is no longer unsafe.
-    if (VCallSite.NumUnsafeUses)
-      --*VCallSite.NumUnsafeUses;
+
+  bool IsExported = false;
+  applySingleImplDevirt(SlotInfo, TheFn, IsExported);
+  if (!IsExported)
+    return false;
+
+  // If the only implementation has local linkage, we must promote to external
+  // to make it visible to thin LTO objects. We can only get here during the
+  // ThinLTO export phase.
+  if (TheFn->hasLocalLinkage()) {
+    TheFn->setLinkage(GlobalValue::ExternalLinkage);
+    TheFn->setVisibility(GlobalValue::HiddenVisibility);
+    TheFn->setName(TheFn->getName() + "$merged");
   }
+
+  Res->TheKind = WholeProgramDevirtResolution::SingleImpl;
+  Res->SingleImplName = TheFn->getName();
+
   return true;
 }
 
 bool DevirtModule::tryEvaluateFunctionsWithArgs(
     MutableArrayRef<VirtualCallTarget> TargetsForSlot,
-    ArrayRef<ConstantInt *> Args) {
+    ArrayRef<uint64_t> Args) {
   // Evaluate each function and store the result in each target's RetVal
   // field.
   for (VirtualCallTarget &Target : TargetsForSlot) {
     if (Target.Fn->arg_size() != Args.size() + 1)
       return false;
-    for (unsigned I = 0; I != Args.size(); ++I)
-      if (Target.Fn->getFunctionType()->getParamType(I + 1) !=
-          Args[I]->getType())
-        return false;
 
     Evaluator Eval(M.getDataLayout(), nullptr);
     SmallVector<Constant *, 2> EvalArgs;
     EvalArgs.push_back(
         Constant::getNullValue(Target.Fn->getFunctionType()->getParamType(0)));
-    EvalArgs.insert(EvalArgs.end(), Args.begin(), Args.end());
+    for (unsigned I = 0; I != Args.size(); ++I) {
+      auto *ArgTy = dyn_cast<IntegerType>(
+          Target.Fn->getFunctionType()->getParamType(I + 1));
+      if (!ArgTy)
+        return false;
+      EvalArgs.push_back(ConstantInt::get(ArgTy, Args[I]));
+    }
+
     Constant *RetVal;
     if (!Eval.EvaluateFunction(Target.Fn, RetVal, EvalArgs) ||
         !isa<ConstantInt>(RetVal))
@@ -495,9 +765,18 @@ bool DevirtModule::tryEvaluateFunctionsWithArgs(
   return true;
 }
 
+void DevirtModule::applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
+                                         uint64_t TheRetVal) {
+  for (auto Call : CSInfo.CallSites)
+    Call.replaceAndErase(
+        "uniform-ret-val", FnName, RemarksEnabled,
+        ConstantInt::get(cast<IntegerType>(Call.CS.getType()), TheRetVal));
+  CSInfo.markDevirt();
+}
+
 bool DevirtModule::tryUniformRetValOpt(
-    IntegerType *RetType, MutableArrayRef<VirtualCallTarget> TargetsForSlot,
-    MutableArrayRef<VirtualCallSite> CallSites) {
+    MutableArrayRef<VirtualCallTarget> TargetsForSlot, CallSiteInfo &CSInfo,
+    WholeProgramDevirtResolution::ByArg *Res) {
   // Uniform return value optimization. If all functions return the same
   // constant, replace all calls with that constant.
   uint64_t TheRetVal = TargetsForSlot[0].RetVal;
@@ -505,19 +784,77 @@ bool DevirtModule::tryUniformRetValOpt(
     if (Target.RetVal != TheRetVal)
       return false;
 
-  auto TheRetValConst = ConstantInt::get(RetType, TheRetVal);
-  for (auto Call : CallSites)
-    Call.replaceAndErase("uniform-ret-val", TargetsForSlot[0].Fn->getName(),
-                         RemarksEnabled, TheRetValConst);
+  if (CSInfo.isExported()) {
+    Res->TheKind = WholeProgramDevirtResolution::ByArg::UniformRetVal;
+    Res->Info = TheRetVal;
+  }
+
+  applyUniformRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), TheRetVal);
   if (RemarksEnabled)
     for (auto &&Target : TargetsForSlot)
       Target.WasDevirt = true;
   return true;
 }
 
+std::string DevirtModule::getGlobalName(VTableSlot Slot,
+                                        ArrayRef<uint64_t> Args,
+                                        StringRef Name) {
+  std::string FullName = "__typeid_";
+  raw_string_ostream OS(FullName);
+  OS << cast<MDString>(Slot.TypeID)->getString() << '_' << Slot.ByteOffset;
+  for (uint64_t Arg : Args)
+    OS << '_' << Arg;
+  OS << '_' << Name;
+  return OS.str();
+}
+
+void DevirtModule::exportGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
+                                StringRef Name, Constant *C) {
+  GlobalAlias *GA = GlobalAlias::create(Int8Ty, 0, GlobalValue::ExternalLinkage,
+                                        getGlobalName(Slot, Args, Name), C, &M);
+  GA->setVisibility(GlobalValue::HiddenVisibility);
+}
+
+Constant *DevirtModule::importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
+                                     StringRef Name, unsigned AbsWidth) {
+  Constant *C = M.getOrInsertGlobal(getGlobalName(Slot, Args, Name), Int8Ty);
+  auto *GV = dyn_cast<GlobalVariable>(C);
+  // We only need to set metadata if the global is newly created, in which
+  // case it would not have hidden visibility.
+  if (!GV || GV->getVisibility() == GlobalValue::HiddenVisibility)
+    return C;
+
+  GV->setVisibility(GlobalValue::HiddenVisibility);
+  auto SetAbsRange = [&](uint64_t Min, uint64_t Max) {
+    auto *MinC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Min));
+    auto *MaxC = ConstantAsMetadata::get(ConstantInt::get(IntPtrTy, Max));
+    GV->setMetadata(LLVMContext::MD_absolute_symbol,
+                    MDNode::get(M.getContext(), {MinC, MaxC}));
+  };
+  if (AbsWidth == IntPtrTy->getBitWidth())
+    SetAbsRange(~0ull, ~0ull); // Full set.
+  else if (AbsWidth)
+    SetAbsRange(0, 1ull << AbsWidth);
+  return GV;
+}
+
+void DevirtModule::applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
+                                        bool IsOne,
+                                        Constant *UniqueMemberAddr) {
+  for (auto &&Call : CSInfo.CallSites) {
+    IRBuilder<> B(Call.CS.getInstruction());
+    Value *Cmp = B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
+                              Call.VTable, UniqueMemberAddr);
+    Cmp = B.CreateZExt(Cmp, Call.CS->getType());
+    Call.replaceAndErase("unique-ret-val", FnName, RemarksEnabled, Cmp);
+  }
+  CSInfo.markDevirt();
+}
+
 bool DevirtModule::tryUniqueRetValOpt(
     unsigned BitWidth, MutableArrayRef<VirtualCallTarget> TargetsForSlot,
-    MutableArrayRef<VirtualCallSite> CallSites) {
+    CallSiteInfo &CSInfo, WholeProgramDevirtResolution::ByArg *Res,
+    VTableSlot Slot, ArrayRef<uint64_t> Args) {
   // IsOne controls whether we look for a 0 or a 1.
   auto tryUniqueRetValOptFor = [&](bool IsOne) {
     const TypeMemberInfo *UniqueMember = nullptr;
@@ -533,16 +870,23 @@ bool DevirtModule::tryUniqueRetValOpt(
     // checked for a uniform return value in tryUniformRetValOpt.
     assert(UniqueMember);
 
-    // Replace each call with the comparison.
-    for (auto &&Call : CallSites) {
-      IRBuilder<> B(Call.CS.getInstruction());
-      Value *OneAddr = B.CreateBitCast(UniqueMember->Bits->GV, Int8PtrTy);
-      OneAddr = B.CreateConstGEP1_64(OneAddr, UniqueMember->Offset);
-      Value *Cmp = B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
-                                Call.VTable, OneAddr);
-      Call.replaceAndErase("unique-ret-val", TargetsForSlot[0].Fn->getName(),
-                           RemarksEnabled, Cmp);
+    Constant *UniqueMemberAddr =
+        ConstantExpr::getBitCast(UniqueMember->Bits->GV, Int8PtrTy);
+    UniqueMemberAddr = ConstantExpr::getGetElementPtr(
+        Int8Ty, UniqueMemberAddr,
+        ConstantInt::get(Int64Ty, UniqueMember->Offset));
+
+    if (CSInfo.isExported()) {
+      Res->TheKind = WholeProgramDevirtResolution::ByArg::UniqueRetVal;
+      Res->Info = IsOne;
+
+      exportGlobal(Slot, Args, "unique_member", UniqueMemberAddr);
     }
+
+    // Replace each call with the comparison.
+    applyUniqueRetValOpt(CSInfo, TargetsForSlot[0].Fn->getName(), IsOne,
+                         UniqueMemberAddr);
+
     // Update devirtualization statistics for targets.
     if (RemarksEnabled)
       for (auto &&Target : TargetsForSlot)
@@ -560,9 +904,30 @@ bool DevirtModule::tryUniqueRetValOpt(
   return false;
 }
 
+void DevirtModule::applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName,
+                                         Constant *Byte, Constant *Bit) {
+  for (auto Call : CSInfo.CallSites) {
+    auto *RetType = cast<IntegerType>(Call.CS.getType());
+    IRBuilder<> B(Call.CS.getInstruction());
+    Value *Addr = B.CreateGEP(Int8Ty, Call.VTable, Byte);
+    if (RetType->getBitWidth() == 1) {
+      Value *Bits = B.CreateLoad(Addr);
+      Value *BitsAndBit = B.CreateAnd(Bits, Bit);
+      auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0));
+      Call.replaceAndErase("virtual-const-prop-1-bit", FnName, RemarksEnabled,
+                           IsBitSet);
+    } else {
+      Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo());
+      Value *Val = B.CreateLoad(RetType, ValAddr);
+      Call.replaceAndErase("virtual-const-prop", FnName, RemarksEnabled, Val);
+    }
+  }
+  CSInfo.markDevirt();
+}
+
 bool DevirtModule::tryVirtualConstProp(
-    MutableArrayRef<VirtualCallTarget> TargetsForSlot,
-    ArrayRef<VirtualCallSite> CallSites) {
+    MutableArrayRef<VirtualCallTarget> TargetsForSlot, VTableSlotInfo &SlotInfo,
+    WholeProgramDevirtResolution *Res, VTableSlot Slot) {
   // This only works if the function returns an integer.
   auto RetType = dyn_cast<IntegerType>(TargetsForSlot[0].Fn->getReturnType());
   if (!RetType)
@@ -571,55 +936,38 @@ bool DevirtModule::tryVirtualConstProp(
   if (BitWidth > 64)
     return false;
 
-  // Make sure that each function does not access memory, takes at least one
-  // argument, does not use its first argument (which we assume is 'this'),
-  // and has the same return type.
+  // Make sure that each function is defined, does not access memory, takes at
+  // least one argument, does not use its first argument (which we assume is
+  // 'this'), and has the same return type.
+  //
+  // Note that we test whether this copy of the function is readnone, rather
+  // than testing function attributes, which must hold for any copy of the
+  // function, even a less optimized version substituted at link time. This is
+  // sound because the virtual constant propagation optimizations effectively
+  // inline all implementations of the virtual function into each call site,
+  // rather than using function attributes to perform local optimization.
   for (VirtualCallTarget &Target : TargetsForSlot) {
-    if (!Target.Fn->doesNotAccessMemory() || Target.Fn->arg_empty() ||
-        !Target.Fn->arg_begin()->use_empty() ||
+    if (Target.Fn->isDeclaration() ||
+        computeFunctionBodyMemoryAccess(*Target.Fn, AARGetter(*Target.Fn)) !=
+            MAK_ReadNone ||
+        Target.Fn->arg_empty() || !Target.Fn->arg_begin()->use_empty() ||
         Target.Fn->getReturnType() != RetType)
       return false;
   }
 
-  // Group call sites by the list of constant arguments they pass.
-  // The comparator ensures deterministic ordering.
-  struct ByAPIntValue {
-    bool operator()(const std::vector<ConstantInt *> &A,
-                    const std::vector<ConstantInt *> &B) const {
-      return std::lexicographical_compare(
-          A.begin(), A.end(), B.begin(), B.end(),
-          [](ConstantInt *AI, ConstantInt *BI) {
-            return AI->getValue().ult(BI->getValue());
-          });
-    }
-  };
-  std::map<std::vector<ConstantInt *>, std::vector<VirtualCallSite>,
-           ByAPIntValue>
-      VCallSitesByConstantArg;
-  for (auto &&VCallSite : CallSites) {
-    std::vector<ConstantInt *> Args;
-    if (VCallSite.CS.getType() != RetType)
-      continue;
-    for (auto &&Arg :
-         make_range(VCallSite.CS.arg_begin() + 1, VCallSite.CS.arg_end())) {
-      if (!isa<ConstantInt>(Arg))
-        break;
-      Args.push_back(cast<ConstantInt>(&Arg));
-    }
-    if (Args.size() + 1 != VCallSite.CS.arg_size())
-      continue;
-
-    VCallSitesByConstantArg[Args].push_back(VCallSite);
-  }
-
-  for (auto &&CSByConstantArg : VCallSitesByConstantArg) {
+  for (auto &&CSByConstantArg : SlotInfo.ConstCSInfo) {
     if (!tryEvaluateFunctionsWithArgs(TargetsForSlot, CSByConstantArg.first))
       continue;
 
-    if (tryUniformRetValOpt(RetType, TargetsForSlot, CSByConstantArg.second))
+    WholeProgramDevirtResolution::ByArg *ResByArg = nullptr;
+    if (Res)
+      ResByArg = &Res->ResByArg[CSByConstantArg.first];
+
+    if (tryUniformRetValOpt(TargetsForSlot, CSByConstantArg.second, ResByArg))
       continue;
 
-    if (tryUniqueRetValOpt(BitWidth, TargetsForSlot, CSByConstantArg.second))
+    if (tryUniqueRetValOpt(BitWidth, TargetsForSlot, CSByConstantArg.second,
+                           ResByArg, Slot, CSByConstantArg.first))
       continue;
 
     // Find an allocation offset in bits in all vtables associated with the
@@ -659,26 +1007,20 @@ bool DevirtModule::tryVirtualConstProp(
       for (auto &&Target : TargetsForSlot)
         Target.WasDevirt = true;
 
-    // Rewrite each call to a load from OffsetByte/OffsetBit.
-    for (auto Call : CSByConstantArg.second) {
-      IRBuilder<> B(Call.CS.getInstruction());
-      Value *Addr = B.CreateConstGEP1_64(Call.VTable, OffsetByte);
-      if (BitWidth == 1) {
-        Value *Bits = B.CreateLoad(Addr);
-        Value *Bit = ConstantInt::get(Int8Ty, 1ULL << OffsetBit);
-        Value *BitsAndBit = B.CreateAnd(Bits, Bit);
-        auto IsBitSet = B.CreateICmpNE(BitsAndBit, ConstantInt::get(Int8Ty, 0));
-        Call.replaceAndErase("virtual-const-prop-1-bit",
-                             TargetsForSlot[0].Fn->getName(),
-                             RemarksEnabled, IsBitSet);
-      } else {
-        Value *ValAddr = B.CreateBitCast(Addr, RetType->getPointerTo());
-        Value *Val = B.CreateLoad(RetType, ValAddr);
-        Call.replaceAndErase("virtual-const-prop",
-                             TargetsForSlot[0].Fn->getName(),
-                             RemarksEnabled, Val);
-      }
+    Constant *ByteConst = ConstantInt::get(Int32Ty, OffsetByte);
+    Constant *BitConst = ConstantInt::get(Int8Ty, 1ULL << OffsetBit);
+
+    if (CSByConstantArg.second.isExported()) {
+      ResByArg->TheKind = WholeProgramDevirtResolution::ByArg::VirtualConstProp;
+      exportGlobal(Slot, CSByConstantArg.first, "byte",
+                   ConstantExpr::getIntToPtr(ByteConst, Int8PtrTy));
+      exportGlobal(Slot, CSByConstantArg.first, "bit",
+                   ConstantExpr::getIntToPtr(BitConst, Int8PtrTy));
     }
+
+    // Rewrite each call to a load from OffsetByte/OffsetBit.
+    applyVirtualConstProp(CSByConstantArg.second,
+                          TargetsForSlot[0].Fn->getName(), ByteConst, BitConst);
   }
   return true;
 }
@@ -733,7 +1075,11 @@ bool DevirtModule::areRemarksEnabled() {
   if (FL.empty())
     return false;
   const Function &Fn = FL.front();
-  auto DI = OptimizationRemark(DEBUG_TYPE, Fn, DebugLoc(), "");
+
+  const auto &BBL = Fn.getBasicBlockList();
+  if (BBL.empty())
+    return false;
+  auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBL.front());
   return DI.isEnabled();
 }
 
@@ -766,8 +1112,8 @@ void DevirtModule::scanTypeTestUsers(Function *TypeTestFunc,
       Value *Ptr = CI->getArgOperand(0)->stripPointerCasts();
       if (SeenPtrs.insert(Ptr).second) {
         for (DevirtCallSite Call : DevirtCalls) {
-          CallSlots[{TypeId, Call.Offset}].push_back(
-              {CI->getArgOperand(0), Call.CS, nullptr});
+          CallSlots[{TypeId, Call.Offset}].addCallSite(CI->getArgOperand(0),
+                                                       Call.CS, nullptr);
         }
       }
     }
@@ -853,14 +1199,79 @@ void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) {
     if (HasNonCallUses)
       ++NumUnsafeUses;
     for (DevirtCallSite Call : DevirtCalls) {
-      CallSlots[{TypeId, Call.Offset}].push_back(
-          {Ptr, Call.CS, &NumUnsafeUses});
+      CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CS,
+                                                   &NumUnsafeUses);
     }
 
     CI->eraseFromParent();
   }
 }
 
+void DevirtModule::importResolution(VTableSlot Slot, VTableSlotInfo &SlotInfo) {
+  const TypeIdSummary *TidSummary =
+      ImportSummary->getTypeIdSummary(cast<MDString>(Slot.TypeID)->getString());
+  if (!TidSummary)
+    return;
+  auto ResI = TidSummary->WPDRes.find(Slot.ByteOffset);
+  if (ResI == TidSummary->WPDRes.end())
+    return;
+  const WholeProgramDevirtResolution &Res = ResI->second;
+
+  if (Res.TheKind == WholeProgramDevirtResolution::SingleImpl) {
+    // The type of the function in the declaration is irrelevant because every
+    // call site will cast it to the correct type.
+    auto *SingleImpl = M.getOrInsertFunction(
+        Res.SingleImplName, Type::getVoidTy(M.getContext()));
+
+    // This is the import phase so we should not be exporting anything.
+    bool IsExported = false;
+    applySingleImplDevirt(SlotInfo, SingleImpl, IsExported);
+    assert(!IsExported);
+  }
+
+  for (auto &CSByConstantArg : SlotInfo.ConstCSInfo) {
+    auto I = Res.ResByArg.find(CSByConstantArg.first);
+    if (I == Res.ResByArg.end())
+      continue;
+    auto &ResByArg = I->second;
+    // FIXME: We should figure out what to do about the "function name" argument
+    // to the apply* functions, as the function names are unavailable during the
+    // importing phase. For now we just pass the empty string. This does not
+    // impact correctness because the function names are just used for remarks.
+    switch (ResByArg.TheKind) {
+    case WholeProgramDevirtResolution::ByArg::UniformRetVal:
+      applyUniformRetValOpt(CSByConstantArg.second, "", ResByArg.Info);
+      break;
+    case WholeProgramDevirtResolution::ByArg::UniqueRetVal: {
+      Constant *UniqueMemberAddr =
+          importGlobal(Slot, CSByConstantArg.first, "unique_member");
+      applyUniqueRetValOpt(CSByConstantArg.second, "", ResByArg.Info,
+                           UniqueMemberAddr);
+      break;
+    }
+    case WholeProgramDevirtResolution::ByArg::VirtualConstProp: {
+      Constant *Byte = importGlobal(Slot, CSByConstantArg.first, "byte", 32);
+      Byte = ConstantExpr::getPtrToInt(Byte, Int32Ty);
+      Constant *Bit = importGlobal(Slot, CSByConstantArg.first, "bit", 8);
+      Bit = ConstantExpr::getPtrToInt(Bit, Int8Ty);
+      applyVirtualConstProp(CSByConstantArg.second, "", Byte, Bit);
+    }
+    default:
+      break;
+    }
+  }
+}
+
+void DevirtModule::removeRedundantTypeTests() {
+  auto True = ConstantInt::getTrue(M.getContext());
+  for (auto &&U : NumUnsafeUsesForTypeTest) {
+    if (U.second == 0) {
+      U.first->replaceAllUsesWith(True);
+      U.first->eraseFromParent();
+    }
+  }
+}
+
 bool DevirtModule::run() {
   Function *TypeTestFunc =
       M.getFunction(Intrinsic::getName(Intrinsic::type_test));
@@ -868,7 +1279,11 @@ bool DevirtModule::run() {
       M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load));
   Function *AssumeFunc = M.getFunction(Intrinsic::getName(Intrinsic::assume));
 
-  if ((!TypeTestFunc || TypeTestFunc->use_empty() || !AssumeFunc ||
+  // Normally if there are no users of the devirtualization intrinsics in the
+  // module, this pass has nothing to do. But if we are exporting, we also need
+  // to handle any users that appear only in the function summaries.
+  if (!ExportSummary &&
+      (!TypeTestFunc || TypeTestFunc->use_empty() || !AssumeFunc ||
        AssumeFunc->use_empty()) &&
       (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()))
     return false;
@@ -879,6 +1294,17 @@ bool DevirtModule::run() {
   if (TypeCheckedLoadFunc)
     scanTypeCheckedLoadUsers(TypeCheckedLoadFunc);
 
+  if (ImportSummary) {
+    for (auto &S : CallSlots)
+      importResolution(S.first, S.second);
+
+    removeRedundantTypeTests();
+
+    // The rest of the code is only necessary when exporting or during regular
+    // LTO, so we are done.
+    return true;
+  }
+
   // Rebuild type metadata into a map for easy lookup.
   std::vector<VTableBits> Bits;
   DenseMap<Metadata *, std::set<TypeMemberInfo>> TypeIdMap;
@@ -886,6 +1312,53 @@ bool DevirtModule::run() {
   if (TypeIdMap.empty())
     return true;
 
+  // Collect information from summary about which calls to try to devirtualize.
+  if (ExportSummary) {
+    DenseMap<GlobalValue::GUID, TinyPtrVector<Metadata *>> MetadataByGUID;
+    for (auto &P : TypeIdMap) {
+      if (auto *TypeId = dyn_cast<MDString>(P.first))
+        MetadataByGUID[GlobalValue::getGUID(TypeId->getString())].push_back(
+            TypeId);
+    }
+
+    for (auto &P : *ExportSummary) {
+      for (auto &S : P.second) {
+        auto *FS = dyn_cast<FunctionSummary>(S.get());
+        if (!FS)
+          continue;
+        // FIXME: Only add live functions.
+        for (FunctionSummary::VFuncId VF : FS->type_test_assume_vcalls()) {
+          for (Metadata *MD : MetadataByGUID[VF.GUID]) {
+            CallSlots[{MD, VF.Offset}].CSInfo.SummaryHasTypeTestAssumeUsers =
+                true;
+          }
+        }
+        for (FunctionSummary::VFuncId VF : FS->type_checked_load_vcalls()) {
+          for (Metadata *MD : MetadataByGUID[VF.GUID]) {
+            CallSlots[{MD, VF.Offset}]
+                .CSInfo.SummaryTypeCheckedLoadUsers.push_back(FS);
+          }
+        }
+        for (const FunctionSummary::ConstVCall &VC :
+             FS->type_test_assume_const_vcalls()) {
+          for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
+            CallSlots[{MD, VC.VFunc.Offset}]
+                .ConstCSInfo[VC.Args]
+                .SummaryHasTypeTestAssumeUsers = true;
+          }
+        }
+        for (const FunctionSummary::ConstVCall &VC :
+             FS->type_checked_load_const_vcalls()) {
+          for (Metadata *MD : MetadataByGUID[VC.VFunc.GUID]) {
+            CallSlots[{MD, VC.VFunc.Offset}]
+                .ConstCSInfo[VC.Args]
+                .SummaryTypeCheckedLoadUsers.push_back(FS);
+          }
+        }
+      }
+    }
+  }
+
   // For each (type, offset) pair:
   bool DidVirtualConstProp = false;
   std::map<std::string, Function*> DevirtTargets;
@@ -894,19 +1367,39 @@ bool DevirtModule::run() {
     // function implementation at offset S.first.ByteOffset, and add to
     // TargetsForSlot.
     std::vector<VirtualCallTarget> TargetsForSlot;
-    if (!tryFindVirtualCallTargets(TargetsForSlot, TypeIdMap[S.first.TypeID],
-                                   S.first.ByteOffset))
-      continue;
-
-    if (!trySingleImplDevirt(TargetsForSlot, S.second) &&
-        tryVirtualConstProp(TargetsForSlot, S.second))
+    if (tryFindVirtualCallTargets(TargetsForSlot, TypeIdMap[S.first.TypeID],
+                                  S.first.ByteOffset)) {
+      WholeProgramDevirtResolution *Res = nullptr;
+      if (ExportSummary && isa<MDString>(S.first.TypeID))
+        Res = &ExportSummary
+                   ->getOrInsertTypeIdSummary(
+                       cast<MDString>(S.first.TypeID)->getString())
+                   .WPDRes[S.first.ByteOffset];
+
+      if (!trySingleImplDevirt(TargetsForSlot, S.second, Res) &&
+          tryVirtualConstProp(TargetsForSlot, S.second, Res, S.first))
         DidVirtualConstProp = true;
 
-    // Collect functions devirtualized at least for one call site for stats.
-    if (RemarksEnabled)
-      for (const auto &T : TargetsForSlot)
-        if (T.WasDevirt)
-          DevirtTargets[T.Fn->getName()] = T.Fn;
+      // Collect functions devirtualized at least for one call site for stats.
+      if (RemarksEnabled)
+        for (const auto &T : TargetsForSlot)
+          if (T.WasDevirt)
+            DevirtTargets[T.Fn->getName()] = T.Fn;
+    }
+
+    // CFI-specific: if we are exporting and any llvm.type.checked.load
+    // intrinsics were *not* devirtualized, we need to add the resulting
+    // llvm.type.test intrinsics to the function summaries so that the
+    // LowerTypeTests pass will export them.
+    if (ExportSummary && isa<MDString>(S.first.TypeID)) {
+      auto GUID =
+          GlobalValue::getGUID(cast<MDString>(S.first.TypeID)->getString());
+      for (auto FS : S.second.CSInfo.SummaryTypeCheckedLoadUsers)
+        FS->addTypeTest(GUID);
+      for (auto &CCS : S.second.ConstCSInfo)
+        for (auto FS : CCS.second.SummaryTypeCheckedLoadUsers)
+          FS->addTypeTest(GUID);
+    }
   }
 
   if (RemarksEnabled) {
@@ -914,23 +1407,12 @@ bool DevirtModule::run() {
     for (const auto &DT : DevirtTargets) {
       Function *F = DT.second;
       DISubprogram *SP = F->getSubprogram();
-      DebugLoc DL = SP ? DebugLoc::get(SP->getScopeLine(), 0, SP) : DebugLoc();
-      emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, DL,
+      emitOptimizationRemark(F->getContext(), DEBUG_TYPE, *F, SP,
                              Twine("devirtualized ") + F->getName());
     }
   }
 
-  // If we were able to eliminate all unsafe uses for a type checked load,
-  // eliminate the type test by replacing it with true.
-  if (TypeCheckedLoadFunc) {
-    auto True = ConstantInt::getTrue(M.getContext());
-    for (auto &&U : NumUnsafeUsesForTypeTest) {
-      if (U.second == 0) {
-        U.first->replaceAllUsesWith(True);
-        U.first->eraseFromParent();
-      }
-    }
-  }
+  removeRedundantTypeTests();
 
   // Rebuild each global we touched as part of virtual constant propagation to
   // include the before and after bytes.
diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 2d34c1cc74bd8cf7516bf0fd7471ce893d8727ed..1077121f8cb2b30304bc070f919757b65fd0b984 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -902,7 +902,7 @@ bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS,
   APInt RHSKnownOne(BitWidth, 0);
   computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, &CxtI);
 
-  // Addition of two 2's compliment numbers having opposite signs will never
+  // Addition of two 2's complement numbers having opposite signs will never
   // overflow.
   if ((LHSKnownOne[BitWidth - 1] && RHSKnownZero[BitWidth - 1]) ||
       (LHSKnownZero[BitWidth - 1] && RHSKnownOne[BitWidth - 1]))
@@ -939,7 +939,7 @@ bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS,
   APInt RHSKnownOne(BitWidth, 0);
   computeKnownBits(RHS, RHSKnownZero, RHSKnownOne, 0, &CxtI);
 
-  // Subtraction of two 2's compliment numbers having identical signs will
+  // Subtraction of two 2's complement numbers having identical signs will
   // never overflow.
   if ((LHSKnownOne[BitWidth - 1] && RHSKnownOne[BitWidth - 1]) ||
       (LHSKnownZero[BitWidth - 1] && RHSKnownZero[BitWidth - 1]))
@@ -1042,43 +1042,42 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
   if (Value *V = SimplifyUsingDistributiveLaws(I))
     return replaceInstUsesWith(I, V);
 
-  const APInt *Val;
-  if (match(RHS, m_APInt(Val))) {
-    // X + (signbit) --> X ^ signbit
-    if (Val->isSignBit())
+  const APInt *RHSC;
+  if (match(RHS, m_APInt(RHSC))) {
+    if (RHSC->isSignBit()) {
+      // If wrapping is not allowed, then the addition must set the sign bit:
+      // X + (signbit) --> X | signbit
+      if (I.hasNoSignedWrap() || I.hasNoUnsignedWrap())
+        return BinaryOperator::CreateOr(LHS, RHS);
+
+      // If wrapping is allowed, then the addition flips the sign bit of LHS:
+      // X + (signbit) --> X ^ signbit
       return BinaryOperator::CreateXor(LHS, RHS);
+    }
 
     // Is this add the last step in a convoluted sext?
     Value *X;
     const APInt *C;
     if (match(LHS, m_ZExt(m_Xor(m_Value(X), m_APInt(C)))) &&
         C->isMinSignedValue() &&
-        C->sext(LHS->getType()->getScalarSizeInBits()) == *Val) {
+        C->sext(LHS->getType()->getScalarSizeInBits()) == *RHSC) {
       // add(zext(xor i16 X, -32768), -32768) --> sext X
       return CastInst::Create(Instruction::SExt, X, LHS->getType());
     }
 
-    if (Val->isNegative() &&
+    if (RHSC->isNegative() &&
         match(LHS, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C)))) &&
-        Val->sge(-C->sext(Val->getBitWidth()))) {
+        RHSC->sge(-C->sext(RHSC->getBitWidth()))) {
       // (add (zext (add nuw X, C)), Val) -> (zext (add nuw X, C+Val))
-      return CastInst::Create(
-          Instruction::ZExt,
-          Builder->CreateNUWAdd(
-              X, Constant::getIntegerValue(X->getType(),
-                                           *C + Val->trunc(C->getBitWidth()))),
-          I.getType());
+      Constant *NewC =
+          ConstantInt::get(X->getType(), *C + RHSC->trunc(C->getBitWidth()));
+      return new ZExtInst(Builder->CreateNUWAdd(X, NewC), I.getType());
     }
   }
 
   // FIXME: Use the match above instead of dyn_cast to allow these transforms
   // for splat vectors.
   if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
-    // See if SimplifyDemandedBits can simplify this.  This handles stuff like
-    // (X & 254)+1 -> (X&254)|1
-    if (SimplifyDemandedInstructionBits(I))
-      return &I;
-
     // zext(bool) + C -> bool ? C + 1 : C
     if (ZExtInst *ZI = dyn_cast<ZExtInst>(LHS))
       if (ZI->getSrcTy()->isIntegerTy(1))
@@ -1129,8 +1128,8 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
     }
   }
 
-  if (isa<Constant>(RHS) && isa<PHINode>(LHS))
-    if (Instruction *NV = FoldOpIntoPhi(I))
+  if (isa<Constant>(RHS))
+    if (Instruction *NV = foldOpWithConstantIntoOperand(I))
       return NV;
 
   if (I.getType()->getScalarType()->isIntegerTy(1))
@@ -1201,11 +1200,6 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
         return BinaryOperator::CreateAnd(NewAdd, C2);
       }
     }
-
-    // Try to fold constant add into select arguments.
-    if (SelectInst *SI = dyn_cast<SelectInst>(LHS))
-      if (Instruction *R = FoldOpIntoSelect(I, SI))
-        return R;
   }
 
   // add (select X 0 (sub n A)) A  -->  select X A n
@@ -1253,7 +1247,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
 
     // (add (sext x), (sext y)) --> (sext (add int x, y))
     if (SExtInst *RHSConv = dyn_cast<SExtInst>(RHS)) {
-      // Only do this if x/y have the same type, if at last one of them has a
+      // Only do this if x/y have the same type, if at least one of them has a
       // single use (so we don't increase the number of sexts), and if the
       // integer add will not overflow.
       if (LHSConv->getOperand(0)->getType() ==
@@ -1290,7 +1284,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
 
     // (add (zext x), (zext y)) --> (zext (add int x, y))
     if (auto *RHSConv = dyn_cast<ZExtInst>(RHS)) {
-      // Only do this if x/y have the same type, if at last one of them has a
+      // Only do this if x/y have the same type, if at least one of them has a
       // single use (so we don't increase the number of zexts), and if the
       // integer add will not overflow.
       if (LHSConv->getOperand(0)->getType() ==
@@ -1311,13 +1305,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
   {
     Value *A = nullptr, *B = nullptr;
     if (match(RHS, m_Xor(m_Value(A), m_Value(B))) &&
-        (match(LHS, m_And(m_Specific(A), m_Specific(B))) ||
-         match(LHS, m_And(m_Specific(B), m_Specific(A)))))
+        match(LHS, m_c_And(m_Specific(A), m_Specific(B))))
       return BinaryOperator::CreateOr(A, B);
 
     if (match(LHS, m_Xor(m_Value(A), m_Value(B))) &&
-        (match(RHS, m_And(m_Specific(A), m_Specific(B))) ||
-         match(RHS, m_And(m_Specific(B), m_Specific(A)))))
+        match(RHS, m_c_And(m_Specific(A), m_Specific(B))))
       return BinaryOperator::CreateOr(A, B);
   }
 
@@ -1325,8 +1317,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
   {
     Value *A = nullptr, *B = nullptr;
     if (match(RHS, m_Or(m_Value(A), m_Value(B))) &&
-        (match(LHS, m_And(m_Specific(A), m_Specific(B))) ||
-         match(LHS, m_And(m_Specific(B), m_Specific(A))))) {
+        match(LHS, m_c_And(m_Specific(A), m_Specific(B)))) {
       auto *New = BinaryOperator::CreateAdd(A, B);
       New->setHasNoSignedWrap(I.hasNoSignedWrap());
       New->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
@@ -1334,8 +1325,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
     }
 
     if (match(LHS, m_Or(m_Value(A), m_Value(B))) &&
-        (match(RHS, m_And(m_Specific(A), m_Specific(B))) ||
-         match(RHS, m_And(m_Specific(B), m_Specific(A))))) {
+        match(RHS, m_c_And(m_Specific(A), m_Specific(B)))) {
       auto *New = BinaryOperator::CreateAdd(A, B);
       New->setHasNoSignedWrap(I.hasNoSignedWrap());
       New->setHasNoUnsignedWrap(I.hasNoUnsignedWrap());
@@ -1394,6 +1384,8 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
   // Check for (fadd double (sitofp x), y), see if we can merge this into an
   // integer add followed by a promotion.
   if (SIToFPInst *LHSConv = dyn_cast<SIToFPInst>(LHS)) {
+    Value *LHSIntVal = LHSConv->getOperand(0);
+
     // (fadd double (sitofp x), fpcst) --> (sitofp (add int x, intcst))
     // ... if the constant fits in the integer value.  This is useful for things
     // like (double)(x & 1234) + 4.0 -> (double)((X & 1234)+4) which no longer
@@ -1401,12 +1393,12 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
     // instcombined.
     if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHS)) {
       Constant *CI =
-      ConstantExpr::getFPToSI(CFP, LHSConv->getOperand(0)->getType());
+      ConstantExpr::getFPToSI(CFP, LHSIntVal->getType());
       if (LHSConv->hasOneUse() &&
           ConstantExpr::getSIToFP(CI, I.getType()) == CFP &&
-          WillNotOverflowSignedAdd(LHSConv->getOperand(0), CI, I)) {
+          WillNotOverflowSignedAdd(LHSIntVal, CI, I)) {
         // Insert the new integer add.
-        Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
+        Value *NewAdd = Builder->CreateNSWAdd(LHSIntVal,
                                               CI, "addconv");
         return new SIToFPInst(NewAdd, I.getType());
       }
@@ -1414,17 +1406,17 @@ Instruction *InstCombiner::visitFAdd(BinaryOperator &I) {
 
     // (fadd double (sitofp x), (sitofp y)) --> (sitofp (add int x, y))
     if (SIToFPInst *RHSConv = dyn_cast<SIToFPInst>(RHS)) {
-      // Only do this if x/y have the same type, if at last one of them has a
+      Value *RHSIntVal = RHSConv->getOperand(0);
+
+      // Only do this if x/y have the same type, if at least one of them has a
       // single use (so we don't increase the number of int->fp conversions),
       // and if the integer add will not overflow.
-      if (LHSConv->getOperand(0)->getType() ==
-              RHSConv->getOperand(0)->getType() &&
+      if (LHSIntVal->getType() == RHSIntVal->getType() &&
           (LHSConv->hasOneUse() || RHSConv->hasOneUse()) &&
-          WillNotOverflowSignedAdd(LHSConv->getOperand(0),
-                                   RHSConv->getOperand(0), I)) {
+          WillNotOverflowSignedAdd(LHSIntVal, RHSIntVal, I)) {
         // Insert the new integer add.
-        Value *NewAdd = Builder->CreateNSWAdd(LHSConv->getOperand(0),
-                                              RHSConv->getOperand(0),"addconv");
+        Value *NewAdd = Builder->CreateNSWAdd(LHSIntVal,
+                                              RHSIntVal, "addconv");
         return new SIToFPInst(NewAdd, I.getType());
       }
     }
@@ -1562,7 +1554,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
     return Res;
   }
 
-  if (I.getType()->isIntegerTy(1))
+  if (I.getType()->getScalarType()->isIntegerTy(1))
     return BinaryOperator::CreateXor(Op0, Op1);
 
   // Replace (-1 - A) with (~A).
@@ -1585,9 +1577,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
     if (match(Op1, m_Add(m_Value(X), m_Constant(C2))))
       return BinaryOperator::CreateSub(ConstantExpr::getSub(C, C2), X);
 
-    if (SimplifyDemandedInstructionBits(I))
-      return &I;
-
     // Fold (sub 0, (zext bool to B)) --> (sext bool to B)
     if (C->isNullValue() && match(Op1, m_ZExt(m_Value(X))))
       if (X->getType()->getScalarType()->isIntegerTy(1))
@@ -1622,11 +1611,11 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
 
     // Turn this into a xor if LHS is 2^n-1 and the remaining bits are known
     // zero.
-    if ((*Op0C + 1).isPowerOf2()) {
-      APInt KnownZero(BitWidth, 0);
-      APInt KnownOne(BitWidth, 0);
-      computeKnownBits(&I, KnownZero, KnownOne, 0, &I);
-      if ((*Op0C | KnownZero).isAllOnesValue())
+    if (Op0C->isMask()) {
+      APInt RHSKnownZero(BitWidth, 0);
+      APInt RHSKnownOne(BitWidth, 0);
+      computeKnownBits(Op1, RHSKnownZero, RHSKnownOne, 0, &I);
+      if ((*Op0C | RHSKnownZero).isAllOnesValue())
         return BinaryOperator::CreateXor(Op1, Op0);
     }
   }
@@ -1634,8 +1623,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
   {
     Value *Y;
     // X-(X+Y) == -Y    X-(Y+X) == -Y
-    if (match(Op1, m_Add(m_Specific(Op0), m_Value(Y))) ||
-        match(Op1, m_Add(m_Value(Y), m_Specific(Op0))))
+    if (match(Op1, m_c_Add(m_Specific(Op0), m_Value(Y))))
       return BinaryOperator::CreateNeg(Y);
 
     // (X-Y)-X == -Y
@@ -1645,18 +1633,16 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
 
   // (sub (or A, B) (xor A, B)) --> (and A, B)
   {
-    Value *A = nullptr, *B = nullptr;
+    Value *A, *B;
     if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
-        (match(Op0, m_Or(m_Specific(A), m_Specific(B))) ||
-         match(Op0, m_Or(m_Specific(B), m_Specific(A)))))
+        match(Op0, m_c_Or(m_Specific(A), m_Specific(B))))
       return BinaryOperator::CreateAnd(A, B);
   }
 
-  if (Op0->hasOneUse()) {
-    Value *Y = nullptr;
+  {
+    Value *Y;
     // ((X | Y) - X) --> (~X & Y)
-    if (match(Op0, m_Or(m_Value(Y), m_Specific(Op1))) ||
-        match(Op0, m_Or(m_Specific(Op1), m_Value(Y))))
+    if (match(Op0, m_OneUse(m_c_Or(m_Value(Y), m_Specific(Op1)))))
       return BinaryOperator::CreateAnd(
           Y, Builder->CreateNot(Op1, Op1->getName() + ".not"));
   }
@@ -1664,7 +1650,6 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
   if (Op1->hasOneUse()) {
     Value *X = nullptr, *Y = nullptr, *Z = nullptr;
     Constant *C = nullptr;
-    Constant *CI = nullptr;
 
     // (X - (Y - Z))  -->  (X + (Z - Y)).
     if (match(Op1, m_Sub(m_Value(Y), m_Value(Z))))
@@ -1673,8 +1658,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
 
     // (X - (X & Y))   -->   (X & ~Y)
     //
-    if (match(Op1, m_And(m_Value(Y), m_Specific(Op0))) ||
-        match(Op1, m_And(m_Specific(Op0), m_Value(Y))))
+    if (match(Op1, m_c_And(m_Value(Y), m_Specific(Op0))))
       return BinaryOperator::CreateAnd(Op0,
                                   Builder->CreateNot(Y, Y->getName() + ".not"));
 
@@ -1702,14 +1686,14 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
     // X - A*-B -> X + A*B
     // X - -A*B -> X + A*B
     Value *A, *B;
-    if (match(Op1, m_Mul(m_Value(A), m_Neg(m_Value(B)))) ||
-        match(Op1, m_Mul(m_Neg(m_Value(A)), m_Value(B))))
+    Constant *CI;
+    if (match(Op1, m_c_Mul(m_Value(A), m_Neg(m_Value(B)))))
       return BinaryOperator::CreateAdd(Op0, Builder->CreateMul(A, B));
 
     // X - A*CI -> X + A*-CI
-    // X - CI*A -> X + A*-CI
-    if (match(Op1, m_Mul(m_Value(A), m_Constant(CI))) ||
-        match(Op1, m_Mul(m_Constant(CI), m_Value(A)))) {
+    // No need to handle commuted multiply because multiply handling will
+    // ensure the constant will be moved to the right hand side.
+    if (match(Op1, m_Mul(m_Value(A), m_Constant(CI)))) {
       Value *NewMul = Builder->CreateMul(A, ConstantExpr::getNeg(CI));
       return BinaryOperator::CreateAdd(Op0, NewMul);
     }
diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index b06bb1bb877374f67d3a3f54933f8ae9a0c213aa..99a983ab47479b3a8eee251cf1865822f2f1dd67 100644
--- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -137,9 +137,8 @@ Value *InstCombiner::SimplifyBSwap(BinaryOperator &I) {
 }
 
 /// This handles expressions of the form ((val OP C1) & C2).  Where
-/// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'.  Op is
-/// guaranteed to be a binary operator.
-Instruction *InstCombiner::OptAndOp(Instruction *Op,
+/// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'.
+Instruction *InstCombiner::OptAndOp(BinaryOperator *Op,
                                     ConstantInt *OpRHS,
                                     ConstantInt *AndRHS,
                                     BinaryOperator &TheAnd) {
@@ -149,6 +148,7 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
     Together = ConstantExpr::getAnd(AndRHS, OpRHS);
 
   switch (Op->getOpcode()) {
+  default: break;
   case Instruction::Xor:
     if (Op->hasOneUse()) {
       // (X ^ C1) & C2 --> (X & C2) ^ (C1&C2)
@@ -159,13 +159,6 @@ Instruction *InstCombiner::OptAndOp(Instruction *Op,
     break;
   case Instruction::Or:
     if (Op->hasOneUse()){
-      if (Together != OpRHS) {
-        // (X | C1) & C2 --> (X | (C1&C2)) & C2
-        Value *Or = Builder->CreateOr(X, Together);
-        Or->takeName(Op);
-        return BinaryOperator::CreateAnd(Or, AndRHS);
-      }
-
       ConstantInt *TogetherCI = dyn_cast<ConstantInt>(Together);
       if (TogetherCI && !TogetherCI->isZero()){
         // (X | C1) & C2 --> (X & (C2^(C1&C2))) | C1
@@ -302,178 +295,91 @@ Value *InstCombiner::insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi,
   return Builder->CreateICmp(Pred, VMinusLo, HiMinusLo);
 }
 
-/// Returns true iff Val consists of one contiguous run of 1s with any number
-/// of 0s on either side.  The 1s are allowed to wrap from LSB to MSB,
-/// so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs.  0x0F0F0000 is
-/// not, since all 1s are not contiguous.
-static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) {
-  const APInt& V = Val->getValue();
-  uint32_t BitWidth = Val->getType()->getBitWidth();
-  if (!APIntOps::isShiftedMask(BitWidth, V)) return false;
-
-  // look for the first zero bit after the run of ones
-  MB = BitWidth - ((V - 1) ^ V).countLeadingZeros();
-  // look for the first non-zero bit
-  ME = V.getActiveBits();
-  return true;
-}
-
-/// This is part of an expression (LHS +/- RHS) & Mask, where isSub determines
-/// whether the operator is a sub. If we can fold one of the following xforms:
+/// Classify (icmp eq (A & B), C) and (icmp ne (A & B), C) as matching patterns
+/// that can be simplified.
+/// One of A and B is considered the mask. The other is the value. This is
+/// described as the "AMask" or "BMask" part of the enum. If the enum contains
+/// only "Mask", then both A and B can be considered masks. If A is the mask,
+/// then it was proven that (A & C) == C. This is trivial if C == A or C == 0.
+/// If both A and C are constants, this proof is also easy.
+/// For the following explanations, we assume that A is the mask.
 ///
-/// ((A & N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == Mask
-/// ((A | N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0
-/// ((A ^ N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0
+/// "AllOnes" declares that the comparison is true only if (A & B) == A or all
+/// bits of A are set in B.
+///   Example: (icmp eq (A & 3), 3) -> AMask_AllOnes
 ///
-/// return (A +/- B).
+/// "AllZeros" declares that the comparison is true only if (A & B) == 0 or all
+/// bits of A are cleared in B.
+///   Example: (icmp eq (A & 3), 0) -> Mask_AllZeros
+///
+/// "Mixed" declares that (A & B) == C and C might or might not contain any
+/// number of one bits and zero bits.
+///   Example: (icmp eq (A & 3), 1) -> AMask_Mixed
+///
+/// "Not" means that in above descriptions "==" should be replaced by "!=".
+///   Example: (icmp ne (A & 3), 3) -> AMask_NotAllOnes
 ///
-Value *InstCombiner::FoldLogicalPlusAnd(Value *LHS, Value *RHS,
-                                        ConstantInt *Mask, bool isSub,
-                                        Instruction &I) {
-  Instruction *LHSI = dyn_cast<Instruction>(LHS);
-  if (!LHSI || LHSI->getNumOperands() != 2 ||
-      !isa<ConstantInt>(LHSI->getOperand(1))) return nullptr;
-
-  ConstantInt *N = cast<ConstantInt>(LHSI->getOperand(1));
-
-  switch (LHSI->getOpcode()) {
-  default: return nullptr;
-  case Instruction::And:
-    if (ConstantExpr::getAnd(N, Mask) == Mask) {
-      // If the AndRHS is a power of two minus one (0+1+), this is simple.
-      if ((Mask->getValue().countLeadingZeros() +
-           Mask->getValue().countPopulation()) ==
-          Mask->getValue().getBitWidth())
-        break;
-
-      // Otherwise, if Mask is 0+1+0+, and if B is known to have the low 0+
-      // part, we don't need any explicit masks to take them out of A.  If that
-      // is all N is, ignore it.
-      uint32_t MB = 0, ME = 0;
-      if (isRunOfOnes(Mask, MB, ME)) {  // begin/end bit of run, inclusive
-        uint32_t BitWidth = cast<IntegerType>(RHS->getType())->getBitWidth();
-        APInt Mask(APInt::getLowBitsSet(BitWidth, MB-1));
-        if (MaskedValueIsZero(RHS, Mask, 0, &I))
-          break;
-      }
-    }
-    return nullptr;
-  case Instruction::Or:
-  case Instruction::Xor:
-    // If the AndRHS is a power of two minus one (0+1+), and N&Mask == 0
-    if ((Mask->getValue().countLeadingZeros() +
-         Mask->getValue().countPopulation()) == Mask->getValue().getBitWidth()
-        && ConstantExpr::getAnd(N, Mask)->isNullValue())
-      break;
-    return nullptr;
-  }
-
-  if (isSub)
-    return Builder->CreateSub(LHSI->getOperand(0), RHS, "fold");
-  return Builder->CreateAdd(LHSI->getOperand(0), RHS, "fold");
-}
-
-/// enum for classifying (icmp eq (A & B), C) and (icmp ne (A & B), C)
-/// One of A and B is considered the mask, the other the value. This is
-/// described as the "AMask" or "BMask" part of the enum. If the enum
-/// contains only "Mask", then both A and B can be considered masks.
-/// If A is the mask, then it was proven, that (A & C) == C. This
-/// is trivial if C == A, or C == 0. If both A and C are constants, this
-/// proof is also easy.
-/// For the following explanations we assume that A is the mask.
-/// The part "AllOnes" declares, that the comparison is true only
-/// if (A & B) == A, or all bits of A are set in B.
-///   Example: (icmp eq (A & 3), 3) -> FoldMskICmp_AMask_AllOnes
-/// The part "AllZeroes" declares, that the comparison is true only
-/// if (A & B) == 0, or all bits of A are cleared in B.
-///   Example: (icmp eq (A & 3), 0) -> FoldMskICmp_Mask_AllZeroes
-/// The part "Mixed" declares, that (A & B) == C and C might or might not
-/// contain any number of one bits and zero bits.
-///   Example: (icmp eq (A & 3), 1) -> FoldMskICmp_AMask_Mixed
-/// The Part "Not" means, that in above descriptions "==" should be replaced
-/// by "!=".
-///   Example: (icmp ne (A & 3), 3) -> FoldMskICmp_AMask_NotAllOnes
 /// If the mask A contains a single bit, then the following is equivalent:
 ///    (icmp eq (A & B), A) equals (icmp ne (A & B), 0)
 ///    (icmp ne (A & B), A) equals (icmp eq (A & B), 0)
 enum MaskedICmpType {
-  FoldMskICmp_AMask_AllOnes           =     1,
-  FoldMskICmp_AMask_NotAllOnes        =     2,
-  FoldMskICmp_BMask_AllOnes           =     4,
-  FoldMskICmp_BMask_NotAllOnes        =     8,
-  FoldMskICmp_Mask_AllZeroes          =    16,
-  FoldMskICmp_Mask_NotAllZeroes       =    32,
-  FoldMskICmp_AMask_Mixed             =    64,
-  FoldMskICmp_AMask_NotMixed          =   128,
-  FoldMskICmp_BMask_Mixed             =   256,
-  FoldMskICmp_BMask_NotMixed          =   512
+  AMask_AllOnes           =     1,
+  AMask_NotAllOnes        =     2,
+  BMask_AllOnes           =     4,
+  BMask_NotAllOnes        =     8,
+  Mask_AllZeros           =    16,
+  Mask_NotAllZeros        =    32,
+  AMask_Mixed             =    64,
+  AMask_NotMixed          =   128,
+  BMask_Mixed             =   256,
+  BMask_NotMixed          =   512
 };
 
-/// Return the set of pattern classes (from MaskedICmpType)
-/// that (icmp SCC (A & B), C) satisfies.
-static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C,
-                                    ICmpInst::Predicate SCC)
-{
+/// Return the set of patterns (from MaskedICmpType) that
+/// (icmp Pred (A & B), C) satisfies.
+static unsigned getMaskedICmpType(Value *A, Value *B, Value *C,
+                                  ICmpInst::Predicate Pred) {
   ConstantInt *ACst = dyn_cast<ConstantInt>(A);
   ConstantInt *BCst = dyn_cast<ConstantInt>(B);
   ConstantInt *CCst = dyn_cast<ConstantInt>(C);
-  bool icmp_eq = (SCC == ICmpInst::ICMP_EQ);
-  bool icmp_abit = (ACst && !ACst->isZero() &&
-                    ACst->getValue().isPowerOf2());
-  bool icmp_bbit = (BCst && !BCst->isZero() &&
-                    BCst->getValue().isPowerOf2());
-  unsigned result = 0;
+  bool IsEq = (Pred == ICmpInst::ICMP_EQ);
+  bool IsAPow2 = (ACst && !ACst->isZero() && ACst->getValue().isPowerOf2());
+  bool IsBPow2 = (BCst && !BCst->isZero() && BCst->getValue().isPowerOf2());
+  unsigned MaskVal = 0;
   if (CCst && CCst->isZero()) {
     // if C is zero, then both A and B qualify as mask
-    result |= (icmp_eq ? (FoldMskICmp_Mask_AllZeroes |
-                          FoldMskICmp_AMask_Mixed |
-                          FoldMskICmp_BMask_Mixed)
-                       : (FoldMskICmp_Mask_NotAllZeroes |
-                          FoldMskICmp_AMask_NotMixed |
-                          FoldMskICmp_BMask_NotMixed));
-    if (icmp_abit)
-      result |= (icmp_eq ? (FoldMskICmp_AMask_NotAllOnes |
-                            FoldMskICmp_AMask_NotMixed)
-                         : (FoldMskICmp_AMask_AllOnes |
-                            FoldMskICmp_AMask_Mixed));
-    if (icmp_bbit)
-      result |= (icmp_eq ? (FoldMskICmp_BMask_NotAllOnes |
-                            FoldMskICmp_BMask_NotMixed)
-                         : (FoldMskICmp_BMask_AllOnes |
-                            FoldMskICmp_BMask_Mixed));
-    return result;
+    MaskVal |= (IsEq ? (Mask_AllZeros | AMask_Mixed | BMask_Mixed)
+                     : (Mask_NotAllZeros | AMask_NotMixed | BMask_NotMixed));
+    if (IsAPow2)
+      MaskVal |= (IsEq ? (AMask_NotAllOnes | AMask_NotMixed)
+                       : (AMask_AllOnes | AMask_Mixed));
+    if (IsBPow2)
+      MaskVal |= (IsEq ? (BMask_NotAllOnes | BMask_NotMixed)
+                       : (BMask_AllOnes | BMask_Mixed));
+    return MaskVal;
   }
+
   if (A == C) {
-    result |= (icmp_eq ? (FoldMskICmp_AMask_AllOnes |
-                          FoldMskICmp_AMask_Mixed)
-                       : (FoldMskICmp_AMask_NotAllOnes |
-                          FoldMskICmp_AMask_NotMixed));
-    if (icmp_abit)
-      result |= (icmp_eq ? (FoldMskICmp_Mask_NotAllZeroes |
-                            FoldMskICmp_AMask_NotMixed)
-                         : (FoldMskICmp_Mask_AllZeroes |
-                            FoldMskICmp_AMask_Mixed));
-  } else if (ACst && CCst &&
-             ConstantExpr::getAnd(ACst, CCst) == CCst) {
-    result |= (icmp_eq ? FoldMskICmp_AMask_Mixed
-                       : FoldMskICmp_AMask_NotMixed);
+    MaskVal |= (IsEq ? (AMask_AllOnes | AMask_Mixed)
+                     : (AMask_NotAllOnes | AMask_NotMixed));
+    if (IsAPow2)
+      MaskVal |= (IsEq ? (Mask_NotAllZeros | AMask_NotMixed)
+                       : (Mask_AllZeros | AMask_Mixed));
+  } else if (ACst && CCst && ConstantExpr::getAnd(ACst, CCst) == CCst) {
+    MaskVal |= (IsEq ? AMask_Mixed : AMask_NotMixed);
   }
+
   if (B == C) {
-    result |= (icmp_eq ? (FoldMskICmp_BMask_AllOnes |
-                          FoldMskICmp_BMask_Mixed)
-                       : (FoldMskICmp_BMask_NotAllOnes |
-                          FoldMskICmp_BMask_NotMixed));
-    if (icmp_bbit)
-      result |= (icmp_eq ? (FoldMskICmp_Mask_NotAllZeroes |
-                            FoldMskICmp_BMask_NotMixed)
-                         : (FoldMskICmp_Mask_AllZeroes |
-                            FoldMskICmp_BMask_Mixed));
-  } else if (BCst && CCst &&
-             ConstantExpr::getAnd(BCst, CCst) == CCst) {
-    result |= (icmp_eq ? FoldMskICmp_BMask_Mixed
-                       : FoldMskICmp_BMask_NotMixed);
-  }
-  return result;
+    MaskVal |= (IsEq ? (BMask_AllOnes | BMask_Mixed)
+                     : (BMask_NotAllOnes | BMask_NotMixed));
+    if (IsBPow2)
+      MaskVal |= (IsEq ? (Mask_NotAllZeros | BMask_NotMixed)
+                       : (Mask_AllZeros | BMask_Mixed));
+  } else if (BCst && CCst && ConstantExpr::getAnd(BCst, CCst) == CCst) {
+    MaskVal |= (IsEq ? BMask_Mixed : BMask_NotMixed);
+  }
+
+  return MaskVal;
 }
 
 /// Convert an analysis of a masked ICmp into its equivalent if all boolean
@@ -482,32 +388,30 @@ static unsigned getTypeOfMaskedICmp(Value* A, Value* B, Value* C,
 /// involves swapping those bits over.
 static unsigned conjugateICmpMask(unsigned Mask) {
   unsigned NewMask;
-  NewMask = (Mask & (FoldMskICmp_AMask_AllOnes | FoldMskICmp_BMask_AllOnes |
-                     FoldMskICmp_Mask_AllZeroes | FoldMskICmp_AMask_Mixed |
-                     FoldMskICmp_BMask_Mixed))
+  NewMask = (Mask & (AMask_AllOnes | BMask_AllOnes | Mask_AllZeros |
+                     AMask_Mixed | BMask_Mixed))
             << 1;
 
-  NewMask |=
-      (Mask & (FoldMskICmp_AMask_NotAllOnes | FoldMskICmp_BMask_NotAllOnes |
-               FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_AMask_NotMixed |
-               FoldMskICmp_BMask_NotMixed))
-      >> 1;
+  NewMask |= (Mask & (AMask_NotAllOnes | BMask_NotAllOnes | Mask_NotAllZeros |
+                      AMask_NotMixed | BMask_NotMixed))
+             >> 1;
 
   return NewMask;
 }
 
-/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E)
-/// Return the set of pattern classes (from MaskedICmpType)
-/// that both LHS and RHS satisfy.
-static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
-                                             Value*& B, Value*& C,
-                                             Value*& D, Value*& E,
-                                             ICmpInst *LHS, ICmpInst *RHS,
-                                             ICmpInst::Predicate &LHSCC,
-                                             ICmpInst::Predicate &RHSCC) {
-  if (LHS->getOperand(0)->getType() != RHS->getOperand(0)->getType()) return 0;
+/// Handle (icmp(A & B) ==/!= C) &/| (icmp(A & D) ==/!= E).
+/// Return the set of pattern classes (from MaskedICmpType) that both LHS and
+/// RHS satisfy.
+static unsigned getMaskedTypeForICmpPair(Value *&A, Value *&B, Value *&C,
+                                         Value *&D, Value *&E, ICmpInst *LHS,
+                                         ICmpInst *RHS,
+                                         ICmpInst::Predicate &PredL,
+                                         ICmpInst::Predicate &PredR) {
+  if (LHS->getOperand(0)->getType() != RHS->getOperand(0)->getType())
+    return 0;
   // vectors are not (yet?) supported
-  if (LHS->getOperand(0)->getType()->isVectorTy()) return 0;
+  if (LHS->getOperand(0)->getType()->isVectorTy())
+    return 0;
 
   // Here comes the tricky part:
   // LHS might be of the form L11 & L12 == X, X == L21 & L22,
@@ -517,9 +421,9 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
   // above.
   Value *L1 = LHS->getOperand(0);
   Value *L2 = LHS->getOperand(1);
-  Value *L11,*L12,*L21,*L22;
+  Value *L11, *L12, *L21, *L22;
   // Check whether the icmp can be decomposed into a bit test.
-  if (decomposeBitTestICmp(LHS, LHSCC, L11, L12, L2)) {
+  if (decomposeBitTestICmp(LHS, PredL, L11, L12, L2)) {
     L21 = L22 = L1 = nullptr;
   } else {
     // Look for ANDs in the LHS icmp.
@@ -543,22 +447,26 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
   }
 
   // Bail if LHS was a icmp that can't be decomposed into an equality.
-  if (!ICmpInst::isEquality(LHSCC))
+  if (!ICmpInst::isEquality(PredL))
     return 0;
 
   Value *R1 = RHS->getOperand(0);
   Value *R2 = RHS->getOperand(1);
-  Value *R11,*R12;
-  bool ok = false;
-  if (decomposeBitTestICmp(RHS, RHSCC, R11, R12, R2)) {
+  Value *R11, *R12;
+  bool Ok = false;
+  if (decomposeBitTestICmp(RHS, PredR, R11, R12, R2)) {
     if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
-      A = R11; D = R12;
+      A = R11;
+      D = R12;
     } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
-      A = R12; D = R11;
+      A = R12;
+      D = R11;
     } else {
       return 0;
     }
-    E = R2; R1 = nullptr; ok = true;
+    E = R2;
+    R1 = nullptr;
+    Ok = true;
   } else if (R1->getType()->isIntegerTy()) {
     if (!match(R1, m_And(m_Value(R11), m_Value(R12)))) {
       // As before, model no mask as a trivial mask if it'll let us do an
@@ -568,46 +476,62 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
     }
 
     if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
-      A = R11; D = R12; E = R2; ok = true;
+      A = R11;
+      D = R12;
+      E = R2;
+      Ok = true;
     } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
-      A = R12; D = R11; E = R2; ok = true;
+      A = R12;
+      D = R11;
+      E = R2;
+      Ok = true;
     }
   }
 
   // Bail if RHS was a icmp that can't be decomposed into an equality.
-  if (!ICmpInst::isEquality(RHSCC))
+  if (!ICmpInst::isEquality(PredR))
     return 0;
 
   // Look for ANDs on the right side of the RHS icmp.
-  if (!ok && R2->getType()->isIntegerTy()) {
+  if (!Ok && R2->getType()->isIntegerTy()) {
     if (!match(R2, m_And(m_Value(R11), m_Value(R12)))) {
       R11 = R2;
       R12 = Constant::getAllOnesValue(R2->getType());
     }
 
     if (R11 == L11 || R11 == L12 || R11 == L21 || R11 == L22) {
-      A = R11; D = R12; E = R1; ok = true;
+      A = R11;
+      D = R12;
+      E = R1;
+      Ok = true;
     } else if (R12 == L11 || R12 == L12 || R12 == L21 || R12 == L22) {
-      A = R12; D = R11; E = R1; ok = true;
+      A = R12;
+      D = R11;
+      E = R1;
+      Ok = true;
     } else {
       return 0;
     }
   }
-  if (!ok)
+  if (!Ok)
     return 0;
 
   if (L11 == A) {
-    B = L12; C = L2;
+    B = L12;
+    C = L2;
   } else if (L12 == A) {
-    B = L11; C = L2;
+    B = L11;
+    C = L2;
   } else if (L21 == A) {
-    B = L22; C = L1;
+    B = L22;
+    C = L1;
   } else if (L22 == A) {
-    B = L21; C = L1;
+    B = L21;
+    C = L1;
   }
 
-  unsigned LeftType = getTypeOfMaskedICmp(A, B, C, LHSCC);
-  unsigned RightType = getTypeOfMaskedICmp(A, D, E, RHSCC);
+  unsigned LeftType = getMaskedICmpType(A, B, C, PredL);
+  unsigned RightType = getMaskedICmpType(A, D, E, PredR);
   return LeftType & RightType;
 }
 
@@ -616,12 +540,14 @@ static unsigned foldLogOpOfMaskedICmpsHelper(Value*& A,
 static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
                                      llvm::InstCombiner::BuilderTy *Builder) {
   Value *A = nullptr, *B = nullptr, *C = nullptr, *D = nullptr, *E = nullptr;
-  ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
-  unsigned Mask = foldLogOpOfMaskedICmpsHelper(A, B, C, D, E, LHS, RHS,
-                                               LHSCC, RHSCC);
-  if (Mask == 0) return nullptr;
-  assert(ICmpInst::isEquality(LHSCC) && ICmpInst::isEquality(RHSCC) &&
-         "foldLogOpOfMaskedICmpsHelper must return an equality predicate.");
+  ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
+  unsigned Mask =
+      getMaskedTypeForICmpPair(A, B, C, D, E, LHS, RHS, PredL, PredR);
+  if (Mask == 0)
+    return nullptr;
+
+  assert(ICmpInst::isEquality(PredL) && ICmpInst::isEquality(PredR) &&
+         "Expected equality predicates for masked type of icmps.");
 
   // In full generality:
   //     (icmp (A & B) Op C) | (icmp (A & D) Op E)
@@ -642,7 +568,7 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
     Mask = conjugateICmpMask(Mask);
   }
 
-  if (Mask & FoldMskICmp_Mask_AllZeroes) {
+  if (Mask & Mask_AllZeros) {
     // (icmp eq (A & B), 0) & (icmp eq (A & D), 0)
     // -> (icmp eq (A & (B|D)), 0)
     Value *NewOr = Builder->CreateOr(B, D);
@@ -653,14 +579,14 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
     Value *Zero = Constant::getNullValue(A->getType());
     return Builder->CreateICmp(NewCC, NewAnd, Zero);
   }
-  if (Mask & FoldMskICmp_BMask_AllOnes) {
+  if (Mask & BMask_AllOnes) {
     // (icmp eq (A & B), B) & (icmp eq (A & D), D)
     // -> (icmp eq (A & (B|D)), (B|D))
     Value *NewOr = Builder->CreateOr(B, D);
     Value *NewAnd = Builder->CreateAnd(A, NewOr);
     return Builder->CreateICmp(NewCC, NewAnd, NewOr);
   }
-  if (Mask & FoldMskICmp_AMask_AllOnes) {
+  if (Mask & AMask_AllOnes) {
     // (icmp eq (A & B), A) & (icmp eq (A & D), A)
     // -> (icmp eq (A & (B&D)), A)
     Value *NewAnd1 = Builder->CreateAnd(B, D);
@@ -672,11 +598,13 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
   // their actual values. This isn't strictly necessary, just a "handle the
   // easy cases for now" decision.
   ConstantInt *BCst = dyn_cast<ConstantInt>(B);
-  if (!BCst) return nullptr;
+  if (!BCst)
+    return nullptr;
   ConstantInt *DCst = dyn_cast<ConstantInt>(D);
-  if (!DCst) return nullptr;
+  if (!DCst)
+    return nullptr;
 
-  if (Mask & (FoldMskICmp_Mask_NotAllZeroes | FoldMskICmp_BMask_NotAllOnes)) {
+  if (Mask & (Mask_NotAllZeros | BMask_NotAllOnes)) {
     // (icmp ne (A & B), 0) & (icmp ne (A & D), 0) and
     // (icmp ne (A & B), B) & (icmp ne (A & D), D)
     //     -> (icmp ne (A & B), 0) or (icmp ne (A & D), 0)
@@ -689,7 +617,8 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
     else if (NewMask == DCst->getValue())
       return RHS;
   }
-  if (Mask & FoldMskICmp_AMask_NotAllOnes) {
+
+  if (Mask & AMask_NotAllOnes) {
     // (icmp ne (A & B), B) & (icmp ne (A & D), D)
     //     -> (icmp ne (A & B), A) or (icmp ne (A & D), A)
     // Only valid if one of the masks is a superset of the other (check "B|D" is
@@ -701,7 +630,8 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
     else if (NewMask == DCst->getValue())
       return RHS;
   }
-  if (Mask & FoldMskICmp_BMask_Mixed) {
+
+  if (Mask & BMask_Mixed) {
     // (icmp eq (A & B), C) & (icmp eq (A & D), E)
     // We already know that B & C == C && D & E == E.
     // If we can prove that (B & D) & (C ^ E) == 0, that is, the bits of
@@ -713,23 +643,28 @@ static Value *foldLogOpOfMaskedICmps(ICmpInst *LHS, ICmpInst *RHS, bool IsAnd,
     //   (icmp ne (A & B), B) & (icmp eq (A & D), D)
     // with B and D, having a single bit set.
     ConstantInt *CCst = dyn_cast<ConstantInt>(C);
-    if (!CCst) return nullptr;
+    if (!CCst)
+      return nullptr;
     ConstantInt *ECst = dyn_cast<ConstantInt>(E);
-    if (!ECst) return nullptr;
-    if (LHSCC != NewCC)
+    if (!ECst)
+      return nullptr;
+    if (PredL != NewCC)
       CCst = cast<ConstantInt>(ConstantExpr::getXor(BCst, CCst));
-    if (RHSCC != NewCC)
+    if (PredR != NewCC)
       ECst = cast<ConstantInt>(ConstantExpr::getXor(DCst, ECst));
+
     // If there is a conflict, we should actually return a false for the
     // whole construct.
     if (((BCst->getValue() & DCst->getValue()) &
          (CCst->getValue() ^ ECst->getValue())) != 0)
       return ConstantInt::get(LHS->getType(), !IsAnd);
+
     Value *NewOr1 = Builder->CreateOr(B, D);
     Value *NewOr2 = ConstantExpr::getOr(CCst, ECst);
     Value *NewAnd = Builder->CreateAnd(A, NewOr1);
     return Builder->CreateICmp(NewCC, NewAnd, NewOr2);
   }
+
   return nullptr;
 }
 
@@ -791,10 +726,10 @@ Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1,
 
 /// Fold (icmp)&(icmp) if possible.
 Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
-  ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
+  ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
 
   // (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
-  if (PredicatesFoldable(LHSCC, RHSCC)) {
+  if (PredicatesFoldable(PredL, PredR)) {
     if (LHS->getOperand(0) == RHS->getOperand(1) &&
         LHS->getOperand(1) == RHS->getOperand(0))
       LHS->swapOperands();
@@ -820,85 +755,86 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
     return V;
 
   // This only handles icmp of constants: (icmp1 A, C1) & (icmp2 B, C2).
-  Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0);
-  ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
-  ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1));
-  if (!LHSCst || !RHSCst) return nullptr;
+  Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0);
+  ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
+  ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
+  if (!LHSC || !RHSC)
+    return nullptr;
 
-  if (LHSCst == RHSCst && LHSCC == RHSCC) {
+  if (LHSC == RHSC && PredL == PredR) {
     // (icmp ult A, C) & (icmp ult B, C) --> (icmp ult (A|B), C)
     // where C is a power of 2 or
     // (icmp eq A, 0) & (icmp eq B, 0) --> (icmp eq (A|B), 0)
-    if ((LHSCC == ICmpInst::ICMP_ULT && LHSCst->getValue().isPowerOf2()) ||
-        (LHSCC == ICmpInst::ICMP_EQ && LHSCst->isZero())) {
-      Value *NewOr = Builder->CreateOr(Val, Val2);
-      return Builder->CreateICmp(LHSCC, NewOr, LHSCst);
+    if ((PredL == ICmpInst::ICMP_ULT && LHSC->getValue().isPowerOf2()) ||
+        (PredL == ICmpInst::ICMP_EQ && LHSC->isZero())) {
+      Value *NewOr = Builder->CreateOr(LHS0, RHS0);
+      return Builder->CreateICmp(PredL, NewOr, LHSC);
     }
   }
 
   // (trunc x) == C1 & (and x, CA) == C2 -> (and x, CA|CMAX) == C1|C2
   // where CMAX is the all ones value for the truncated type,
   // iff the lower bits of C2 and CA are zero.
-  if (LHSCC == ICmpInst::ICMP_EQ && LHSCC == RHSCC &&
-      LHS->hasOneUse() && RHS->hasOneUse()) {
+  if (PredL == ICmpInst::ICMP_EQ && PredL == PredR && LHS->hasOneUse() &&
+      RHS->hasOneUse()) {
     Value *V;
-    ConstantInt *AndCst, *SmallCst = nullptr, *BigCst = nullptr;
+    ConstantInt *AndC, *SmallC = nullptr, *BigC = nullptr;
 
     // (trunc x) == C1 & (and x, CA) == C2
     // (and x, CA) == C2 & (trunc x) == C1
-    if (match(Val2, m_Trunc(m_Value(V))) &&
-        match(Val, m_And(m_Specific(V), m_ConstantInt(AndCst)))) {
-      SmallCst = RHSCst;
-      BigCst = LHSCst;
-    } else if (match(Val, m_Trunc(m_Value(V))) &&
-               match(Val2, m_And(m_Specific(V), m_ConstantInt(AndCst)))) {
-      SmallCst = LHSCst;
-      BigCst = RHSCst;
+    if (match(RHS0, m_Trunc(m_Value(V))) &&
+        match(LHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) {
+      SmallC = RHSC;
+      BigC = LHSC;
+    } else if (match(LHS0, m_Trunc(m_Value(V))) &&
+               match(RHS0, m_And(m_Specific(V), m_ConstantInt(AndC)))) {
+      SmallC = LHSC;
+      BigC = RHSC;
     }
 
-    if (SmallCst && BigCst) {
-      unsigned BigBitSize = BigCst->getType()->getBitWidth();
-      unsigned SmallBitSize = SmallCst->getType()->getBitWidth();
+    if (SmallC && BigC) {
+      unsigned BigBitSize = BigC->getType()->getBitWidth();
+      unsigned SmallBitSize = SmallC->getType()->getBitWidth();
 
       // Check that the low bits are zero.
       APInt Low = APInt::getLowBitsSet(BigBitSize, SmallBitSize);
-      if ((Low & AndCst->getValue()) == 0 && (Low & BigCst->getValue()) == 0) {
-        Value *NewAnd = Builder->CreateAnd(V, Low | AndCst->getValue());
-        APInt N = SmallCst->getValue().zext(BigBitSize) | BigCst->getValue();
-        Value *NewVal = ConstantInt::get(AndCst->getType()->getContext(), N);
-        return Builder->CreateICmp(LHSCC, NewAnd, NewVal);
+      if ((Low & AndC->getValue()) == 0 && (Low & BigC->getValue()) == 0) {
+        Value *NewAnd = Builder->CreateAnd(V, Low | AndC->getValue());
+        APInt N = SmallC->getValue().zext(BigBitSize) | BigC->getValue();
+        Value *NewVal = ConstantInt::get(AndC->getType()->getContext(), N);
+        return Builder->CreateICmp(PredL, NewAnd, NewVal);
       }
     }
   }
 
   // From here on, we only handle:
   //    (icmp1 A, C1) & (icmp2 A, C2) --> something simpler.
-  if (Val != Val2) return nullptr;
+  if (LHS0 != RHS0)
+    return nullptr;
 
-  // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere.
-  if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE ||
-      RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE ||
-      LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE ||
-      RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE)
+  // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere.
+  if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE ||
+      PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE ||
+      PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE ||
+      PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE)
     return nullptr;
 
   // We can't fold (ugt x, C) & (sgt x, C2).
-  if (!PredicatesFoldable(LHSCC, RHSCC))
+  if (!PredicatesFoldable(PredL, PredR))
     return nullptr;
 
   // Ensure that the larger constant is on the RHS.
   bool ShouldSwap;
-  if (CmpInst::isSigned(LHSCC) ||
-      (ICmpInst::isEquality(LHSCC) &&
-       CmpInst::isSigned(RHSCC)))
-    ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue());
+  if (CmpInst::isSigned(PredL) ||
+      (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR)))
+    ShouldSwap = LHSC->getValue().sgt(RHSC->getValue());
   else
-    ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue());
+    ShouldSwap = LHSC->getValue().ugt(RHSC->getValue());
 
   if (ShouldSwap) {
     std::swap(LHS, RHS);
-    std::swap(LHSCst, RHSCst);
-    std::swap(LHSCC, RHSCC);
+    std::swap(LHSC, RHSC);
+    std::swap(PredL, PredR);
   }
 
   // At this point, we know we have two icmp instructions
@@ -907,113 +843,104 @@ Value *InstCombiner::FoldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS) {
   // icmp eq, icmp ne, icmp [su]lt, and icmp [SU]gt here. We also know
   // (from the icmp folding check above), that the two constants
   // are not equal and that the larger constant is on the RHS
-  assert(LHSCst != RHSCst && "Compares not folded above?");
+  assert(LHSC != RHSC && "Compares not folded above?");
 
-  switch (LHSCC) {
-  default: llvm_unreachable("Unknown integer condition code!");
+  switch (PredL) {
+  default:
+    llvm_unreachable("Unknown integer condition code!");
   case ICmpInst::ICMP_EQ:
-    switch (RHSCC) {
-    default: llvm_unreachable("Unknown integer condition code!");
-    case ICmpInst::ICMP_NE:         // (X == 13 & X != 15) -> X == 13
-    case ICmpInst::ICMP_ULT:        // (X == 13 & X <  15) -> X == 13
-    case ICmpInst::ICMP_SLT:        // (X == 13 & X <  15) -> X == 13
+    switch (PredR) {
+    default:
+      llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_NE:  // (X == 13 & X != 15) -> X == 13
+    case ICmpInst::ICMP_ULT: // (X == 13 & X <  15) -> X == 13
+    case ICmpInst::ICMP_SLT: // (X == 13 & X <  15) -> X == 13
       return LHS;
     }
   case ICmpInst::ICMP_NE:
-    switch (RHSCC) {
-    default: llvm_unreachable("Unknown integer condition code!");
+    switch (PredR) {
+    default:
+      llvm_unreachable("Unknown integer condition code!");
     case ICmpInst::ICMP_ULT:
-      if (LHSCst == SubOne(RHSCst)) // (X != 13 & X u< 14) -> X < 13
-        return Builder->CreateICmpULT(Val, LHSCst);
-      if (LHSCst->isNullValue())    // (X !=  0 & X u< 14) -> X-1 u< 13
-        return insertRangeTest(Val, LHSCst->getValue() + 1, RHSCst->getValue(),
+      if (LHSC == SubOne(RHSC)) // (X != 13 & X u< 14) -> X < 13
+        return Builder->CreateICmpULT(LHS0, LHSC);
+      if (LHSC->isNullValue()) // (X !=  0 & X u< 14) -> X-1 u< 13
+        return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
                                false, true);
-      break;                        // (X != 13 & X u< 15) -> no change
+      break; // (X != 13 & X u< 15) -> no change
     case ICmpInst::ICMP_SLT:
-      if (LHSCst == SubOne(RHSCst)) // (X != 13 & X s< 14) -> X < 13
-        return Builder->CreateICmpSLT(Val, LHSCst);
-      break;                        // (X != 13 & X s< 15) -> no change
-    case ICmpInst::ICMP_EQ:         // (X != 13 & X == 15) -> X == 15
-    case ICmpInst::ICMP_UGT:        // (X != 13 & X u> 15) -> X u> 15
-    case ICmpInst::ICMP_SGT:        // (X != 13 & X s> 15) -> X s> 15
+      if (LHSC == SubOne(RHSC)) // (X != 13 & X s< 14) -> X < 13
+        return Builder->CreateICmpSLT(LHS0, LHSC);
+      break;                 // (X != 13 & X s< 15) -> no change
+    case ICmpInst::ICMP_EQ:  // (X != 13 & X == 15) -> X == 15
+    case ICmpInst::ICMP_UGT: // (X != 13 & X u> 15) -> X u> 15
+    case ICmpInst::ICMP_SGT: // (X != 13 & X s> 15) -> X s> 15
       return RHS;
     case ICmpInst::ICMP_NE:
       // Special case to get the ordering right when the values wrap around
       // zero.
-      if (LHSCst->getValue() == 0 && RHSCst->getValue().isAllOnesValue())
-        std::swap(LHSCst, RHSCst);
-      if (LHSCst == SubOne(RHSCst)){// (X != 13 & X != 14) -> X-13 >u 1
-        Constant *AddCST = ConstantExpr::getNeg(LHSCst);
-        Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off");
+      if (LHSC->getValue() == 0 && RHSC->getValue().isAllOnesValue())
+        std::swap(LHSC, RHSC);
+      if (LHSC == SubOne(RHSC)) { // (X != 13 & X != 14) -> X-13 >u 1
+        Constant *AddC = ConstantExpr::getNeg(LHSC);
+        Value *Add = Builder->CreateAdd(LHS0, AddC, LHS0->getName() + ".off");
         return Builder->CreateICmpUGT(Add, ConstantInt::get(Add->getType(), 1),
-                                      Val->getName()+".cmp");
+                                      LHS0->getName() + ".cmp");
       }
-      break;                        // (X != 13 & X != 15) -> no change
+      break; // (X != 13 & X != 15) -> no change
     }
     break;
   case ICmpInst::ICMP_ULT:
-    switch (RHSCC) {
-    default: llvm_unreachable("Unknown integer condition code!");
-    case ICmpInst::ICMP_EQ:         // (X u< 13 & X == 15) -> false
-    case ICmpInst::ICMP_UGT:        // (X u< 13 & X u> 15) -> false
+    switch (PredR) {
+    default:
+      llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:  // (X u< 13 & X == 15) -> false
+    case ICmpInst::ICMP_UGT: // (X u< 13 & X u> 15) -> false
       return ConstantInt::get(CmpInst::makeCmpResultType(LHS->getType()), 0);
-    case ICmpInst::ICMP_SGT:        // (X u< 13 & X s> 15) -> no change
-      break;
-    case ICmpInst::ICMP_NE:         // (X u< 13 & X != 15) -> X u< 13
-    case ICmpInst::ICMP_ULT:        // (X u< 13 & X u< 15) -> X u< 13
+    case ICmpInst::ICMP_NE:  // (X u< 13 & X != 15) -> X u< 13
+    case ICmpInst::ICMP_ULT: // (X u< 13 & X u< 15) -> X u< 13
       return LHS;
-    case ICmpInst::ICMP_SLT:        // (X u< 13 & X s< 15) -> no change
-      break;
     }
     break;
   case ICmpInst::ICMP_SLT:
-    switch (RHSCC) {
-    default: llvm_unreachable("Unknown integer condition code!");
-    case ICmpInst::ICMP_UGT:        // (X s< 13 & X u> 15) -> no change
-      break;
-    case ICmpInst::ICMP_NE:         // (X s< 13 & X != 15) -> X < 13
-    case ICmpInst::ICMP_SLT:        // (X s< 13 & X s< 15) -> X < 13
+    switch (PredR) {
+    default:
+      llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_NE:  // (X s< 13 & X != 15) -> X < 13
+    case ICmpInst::ICMP_SLT: // (X s< 13 & X s< 15) -> X < 13
       return LHS;
-    case ICmpInst::ICMP_ULT:        // (X s< 13 & X u< 15) -> no change
-      break;
     }
     break;
   case ICmpInst::ICMP_UGT:
-    switch (RHSCC) {
-    default: llvm_unreachable("Unknown integer condition code!");
-    case ICmpInst::ICMP_EQ:         // (X u> 13 & X == 15) -> X == 15
-    case ICmpInst::ICMP_UGT:        // (X u> 13 & X u> 15) -> X u> 15
+    switch (PredR) {
+    default:
+      llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:  // (X u> 13 & X == 15) -> X == 15
+    case ICmpInst::ICMP_UGT: // (X u> 13 & X u> 15) -> X u> 15
       return RHS;
-    case ICmpInst::ICMP_SGT:        // (X u> 13 & X s> 15) -> no change
-      break;
     case ICmpInst::ICMP_NE:
-      if (RHSCst == AddOne(LHSCst)) // (X u> 13 & X != 14) -> X u> 14
-        return Builder->CreateICmp(LHSCC, Val, RHSCst);
-      break;                        // (X u> 13 & X != 15) -> no change
-    case ICmpInst::ICMP_ULT:        // (X u> 13 & X u< 15) -> (X-14) <u 1
-      return insertRangeTest(Val, LHSCst->getValue() + 1, RHSCst->getValue(),
+      if (RHSC == AddOne(LHSC)) // (X u> 13 & X != 14) -> X u> 14
+        return Builder->CreateICmp(PredL, LHS0, RHSC);
+      break;                 // (X u> 13 & X != 15) -> no change
+    case ICmpInst::ICMP_ULT: // (X u> 13 & X u< 15) -> (X-14) <u 1
+      return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(),
                              false, true);
-    case ICmpInst::ICMP_SLT:        // (X u> 13 & X s< 15) -> no change
-      break;
     }
     break;
   case ICmpInst::ICMP_SGT:
-    switch (RHSCC) {
-    default: llvm_unreachable("Unknown integer condition code!");
-    case ICmpInst::ICMP_EQ:         // (X s> 13 & X == 15) -> X == 15
-    case ICmpInst::ICMP_SGT:        // (X s> 13 & X s> 15) -> X s> 15
+    switch (PredR) {
+    default:
+      llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:  // (X s> 13 & X == 15) -> X == 15
+    case ICmpInst::ICMP_SGT: // (X s> 13 & X s> 15) -> X s> 15
       return RHS;
-    case ICmpInst::ICMP_UGT:        // (X s> 13 & X u> 15) -> no change
-      break;
     case ICmpInst::ICMP_NE:
-      if (RHSCst == AddOne(LHSCst)) // (X s> 13 & X != 14) -> X s> 14
-        return Builder->CreateICmp(LHSCC, Val, RHSCst);
-      break;                        // (X s> 13 & X != 15) -> no change
-    case ICmpInst::ICMP_SLT:        // (X s> 13 & X s< 15) -> (X-14) s< 1
-      return insertRangeTest(Val, LHSCst->getValue() + 1, RHSCst->getValue(),
-                             true, true);
-    case ICmpInst::ICMP_ULT:        // (X s> 13 & X u< 15) -> no change
-      break;
+      if (RHSC == AddOne(LHSC)) // (X s> 13 & X != 14) -> X s> 14
+        return Builder->CreateICmp(PredL, LHS0, RHSC);
+      break;                 // (X s> 13 & X != 15) -> no change
+    case ICmpInst::ICMP_SLT: // (X s> 13 & X s< 15) -> (X-14) s< 1
+      return insertRangeTest(LHS0, LHSC->getValue() + 1, RHSC->getValue(), true,
+                             true);
     }
     break;
   }
@@ -1314,39 +1241,11 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
 
         break;
       }
-      case Instruction::Add:
-        // ((A & N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == AndRHS.
-        // ((A | N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0
-        // ((A ^ N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0
-        if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, false, I))
-          return BinaryOperator::CreateAnd(V, AndRHS);
-        if (Value *V = FoldLogicalPlusAnd(Op0RHS, Op0LHS, AndRHS, false, I))
-          return BinaryOperator::CreateAnd(V, AndRHS);  // Add commutes
-        break;
-
       case Instruction::Sub:
-        // ((A & N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == AndRHS.
-        // ((A | N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0
-        // ((A ^ N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0
-        if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, true, I))
-          return BinaryOperator::CreateAnd(V, AndRHS);
-
         // -x & 1 -> x & 1
         if (AndRHSMask == 1 && match(Op0LHS, m_Zero()))
           return BinaryOperator::CreateAnd(Op0RHS, AndRHS);
 
-        // (A - N) & AndRHS -> -N & AndRHS iff A&AndRHS==0 and AndRHS
-        // has 1's for all bits that the subtraction with A might affect.
-        if (Op0I->hasOneUse() && !match(Op0LHS, m_Zero())) {
-          uint32_t BitWidth = AndRHSMask.getBitWidth();
-          uint32_t Zeros = AndRHSMask.countLeadingZeros();
-          APInt Mask = APInt::getLowBitsSet(BitWidth, BitWidth - Zeros);
-
-          if (MaskedValueIsZero(Op0LHS, Mask, 0, &I)) {
-            Value *NewNeg = Builder->CreateNeg(Op0RHS);
-            return BinaryOperator::CreateAnd(NewNeg, AndRHS);
-          }
-        }
         break;
 
       case Instruction::Shl:
@@ -1373,8 +1272,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
       case Instruction::Sub:
         Value *X;
         ConstantInt *C1;
-        if (match(Op0I, m_BinOp(m_ZExt(m_Value(X)), m_ConstantInt(C1))) ||
-            match(Op0I, m_BinOp(m_ConstantInt(C1), m_ZExt(m_Value(X))))) {
+        if (match(Op0I, m_c_BinOp(m_ZExt(m_Value(X)), m_ConstantInt(C1)))) {
           if (AndRHSMask.isIntN(X->getType()->getScalarSizeInBits())) {
             auto *TruncC1 = ConstantExpr::getTrunc(C1, X->getType());
             Value *BinOp;
@@ -1409,10 +1307,11 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
         return BinaryOperator::CreateAnd(NewCast, C3);
       }
     }
+  }
 
+  if (isa<Constant>(Op1))
     if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
       return FoldedLogic;
-  }
 
   if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder))
     return DeMorgan;
@@ -1658,15 +1557,15 @@ static Value *matchSelectFromAndOr(Value *A, Value *C, Value *B, Value *D,
 /// Fold (icmp)|(icmp) if possible.
 Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
                                    Instruction *CxtI) {
-  ICmpInst::Predicate LHSCC = LHS->getPredicate(), RHSCC = RHS->getPredicate();
+  ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
 
   // Fold (iszero(A & K1) | iszero(A & K2)) ->  (A & (K1 | K2)) != (K1 | K2)
   // if K1 and K2 are a one-bit mask.
-  ConstantInt *LHSCst = dyn_cast<ConstantInt>(LHS->getOperand(1));
-  ConstantInt *RHSCst = dyn_cast<ConstantInt>(RHS->getOperand(1));
+  ConstantInt *LHSC = dyn_cast<ConstantInt>(LHS->getOperand(1));
+  ConstantInt *RHSC = dyn_cast<ConstantInt>(RHS->getOperand(1));
 
-  if (LHS->getPredicate() == ICmpInst::ICMP_EQ && LHSCst && LHSCst->isZero() &&
-      RHS->getPredicate() == ICmpInst::ICMP_EQ && RHSCst && RHSCst->isZero()) {
+  if (PredL == ICmpInst::ICMP_EQ && LHSC && LHSC->isZero() &&
+      PredR == ICmpInst::ICMP_EQ && RHSC && RHSC->isZero()) {
 
     BinaryOperator *LAnd = dyn_cast<BinaryOperator>(LHS->getOperand(0));
     BinaryOperator *RAnd = dyn_cast<BinaryOperator>(RHS->getOperand(0));
@@ -1708,52 +1607,52 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
   // 4) LowRange1 ^ LowRange2 and HighRange1 ^ HighRange2 are one-bit mask.
   // This implies all values in the two ranges differ by exactly one bit.
 
-  if ((LHSCC == ICmpInst::ICMP_ULT || LHSCC == ICmpInst::ICMP_ULE) &&
-      LHSCC == RHSCC && LHSCst && RHSCst && LHS->hasOneUse() &&
-      RHS->hasOneUse() && LHSCst->getType() == RHSCst->getType() &&
-      LHSCst->getValue() == (RHSCst->getValue())) {
+  if ((PredL == ICmpInst::ICMP_ULT || PredL == ICmpInst::ICMP_ULE) &&
+      PredL == PredR && LHSC && RHSC && LHS->hasOneUse() && RHS->hasOneUse() &&
+      LHSC->getType() == RHSC->getType() &&
+      LHSC->getValue() == (RHSC->getValue())) {
 
     Value *LAdd = LHS->getOperand(0);
     Value *RAdd = RHS->getOperand(0);
 
     Value *LAddOpnd, *RAddOpnd;
-    ConstantInt *LAddCst, *RAddCst;
-    if (match(LAdd, m_Add(m_Value(LAddOpnd), m_ConstantInt(LAddCst))) &&
-        match(RAdd, m_Add(m_Value(RAddOpnd), m_ConstantInt(RAddCst))) &&
-        LAddCst->getValue().ugt(LHSCst->getValue()) &&
-        RAddCst->getValue().ugt(LHSCst->getValue())) {
-
-      APInt DiffCst = LAddCst->getValue() ^ RAddCst->getValue();
-      if (LAddOpnd == RAddOpnd && DiffCst.isPowerOf2()) {
-        ConstantInt *MaxAddCst = nullptr;
-        if (LAddCst->getValue().ult(RAddCst->getValue()))
-          MaxAddCst = RAddCst;
+    ConstantInt *LAddC, *RAddC;
+    if (match(LAdd, m_Add(m_Value(LAddOpnd), m_ConstantInt(LAddC))) &&
+        match(RAdd, m_Add(m_Value(RAddOpnd), m_ConstantInt(RAddC))) &&
+        LAddC->getValue().ugt(LHSC->getValue()) &&
+        RAddC->getValue().ugt(LHSC->getValue())) {
+
+      APInt DiffC = LAddC->getValue() ^ RAddC->getValue();
+      if (LAddOpnd == RAddOpnd && DiffC.isPowerOf2()) {
+        ConstantInt *MaxAddC = nullptr;
+        if (LAddC->getValue().ult(RAddC->getValue()))
+          MaxAddC = RAddC;
         else
-          MaxAddCst = LAddCst;
+          MaxAddC = LAddC;
 
-        APInt RRangeLow = -RAddCst->getValue();
-        APInt RRangeHigh = RRangeLow + LHSCst->getValue();
-        APInt LRangeLow = -LAddCst->getValue();
-        APInt LRangeHigh = LRangeLow + LHSCst->getValue();
+        APInt RRangeLow = -RAddC->getValue();
+        APInt RRangeHigh = RRangeLow + LHSC->getValue();
+        APInt LRangeLow = -LAddC->getValue();
+        APInt LRangeHigh = LRangeLow + LHSC->getValue();
         APInt LowRangeDiff = RRangeLow ^ LRangeLow;
         APInt HighRangeDiff = RRangeHigh ^ LRangeHigh;
         APInt RangeDiff = LRangeLow.sgt(RRangeLow) ? LRangeLow - RRangeLow
                                                    : RRangeLow - LRangeLow;
 
         if (LowRangeDiff.isPowerOf2() && LowRangeDiff == HighRangeDiff &&
-            RangeDiff.ugt(LHSCst->getValue())) {
-          Value *MaskCst = ConstantInt::get(LAddCst->getType(), ~DiffCst);
+            RangeDiff.ugt(LHSC->getValue())) {
+          Value *MaskC = ConstantInt::get(LAddC->getType(), ~DiffC);
 
-          Value *NewAnd = Builder->CreateAnd(LAddOpnd, MaskCst);
-          Value *NewAdd = Builder->CreateAdd(NewAnd, MaxAddCst);
-          return (Builder->CreateICmp(LHS->getPredicate(), NewAdd, LHSCst));
+          Value *NewAnd = Builder->CreateAnd(LAddOpnd, MaskC);
+          Value *NewAdd = Builder->CreateAdd(NewAnd, MaxAddC);
+          return (Builder->CreateICmp(LHS->getPredicate(), NewAdd, LHSC));
         }
       }
     }
   }
 
   // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B)
-  if (PredicatesFoldable(LHSCC, RHSCC)) {
+  if (PredicatesFoldable(PredL, PredR)) {
     if (LHS->getOperand(0) == RHS->getOperand(1) &&
         LHS->getOperand(1) == RHS->getOperand(0))
       LHS->swapOperands();
@@ -1771,25 +1670,25 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
   if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, false, Builder))
     return V;
 
-  Value *Val = LHS->getOperand(0), *Val2 = RHS->getOperand(0);
+  Value *LHS0 = LHS->getOperand(0), *RHS0 = RHS->getOperand(0);
   if (LHS->hasOneUse() || RHS->hasOneUse()) {
     // (icmp eq B, 0) | (icmp ult A, B) -> (icmp ule A, B-1)
     // (icmp eq B, 0) | (icmp ugt B, A) -> (icmp ule A, B-1)
     Value *A = nullptr, *B = nullptr;
-    if (LHSCC == ICmpInst::ICMP_EQ && LHSCst && LHSCst->isZero()) {
-      B = Val;
-      if (RHSCC == ICmpInst::ICMP_ULT && Val == RHS->getOperand(1))
-        A = Val2;
-      else if (RHSCC == ICmpInst::ICMP_UGT && Val == Val2)
+    if (PredL == ICmpInst::ICMP_EQ && LHSC && LHSC->isZero()) {
+      B = LHS0;
+      if (PredR == ICmpInst::ICMP_ULT && LHS0 == RHS->getOperand(1))
+        A = RHS0;
+      else if (PredR == ICmpInst::ICMP_UGT && LHS0 == RHS0)
         A = RHS->getOperand(1);
     }
     // (icmp ult A, B) | (icmp eq B, 0) -> (icmp ule A, B-1)
     // (icmp ugt B, A) | (icmp eq B, 0) -> (icmp ule A, B-1)
-    else if (RHSCC == ICmpInst::ICMP_EQ && RHSCst && RHSCst->isZero()) {
-      B = Val2;
-      if (LHSCC == ICmpInst::ICMP_ULT && Val2 == LHS->getOperand(1))
-        A = Val;
-      else if (LHSCC == ICmpInst::ICMP_UGT && Val2 == Val)
+    else if (PredR == ICmpInst::ICMP_EQ && RHSC && RHSC->isZero()) {
+      B = RHS0;
+      if (PredL == ICmpInst::ICMP_ULT && RHS0 == LHS->getOperand(1))
+        A = LHS0;
+      else if (PredL == ICmpInst::ICMP_UGT && LHS0 == RHS0)
         A = LHS->getOperand(1);
     }
     if (A && B)
@@ -1807,53 +1706,54 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
     return V;
 
   // This only handles icmp of constants: (icmp1 A, C1) | (icmp2 B, C2).
-  if (!LHSCst || !RHSCst) return nullptr;
+  if (!LHSC || !RHSC)
+    return nullptr;
 
-  if (LHSCst == RHSCst && LHSCC == RHSCC) {
+  if (LHSC == RHSC && PredL == PredR) {
     // (icmp ne A, 0) | (icmp ne B, 0) --> (icmp ne (A|B), 0)
-    if (LHSCC == ICmpInst::ICMP_NE && LHSCst->isZero()) {
-      Value *NewOr = Builder->CreateOr(Val, Val2);
-      return Builder->CreateICmp(LHSCC, NewOr, LHSCst);
+    if (PredL == ICmpInst::ICMP_NE && LHSC->isZero()) {
+      Value *NewOr = Builder->CreateOr(LHS0, RHS0);
+      return Builder->CreateICmp(PredL, NewOr, LHSC);
     }
   }
 
   // (icmp ult (X + CA), C1) | (icmp eq X, C2) -> (icmp ule (X + CA), C1)
   //   iff C2 + CA == C1.
-  if (LHSCC == ICmpInst::ICMP_ULT && RHSCC == ICmpInst::ICMP_EQ) {
-    ConstantInt *AddCst;
-    if (match(Val, m_Add(m_Specific(Val2), m_ConstantInt(AddCst))))
-      if (RHSCst->getValue() + AddCst->getValue() == LHSCst->getValue())
-        return Builder->CreateICmpULE(Val, LHSCst);
+  if (PredL == ICmpInst::ICMP_ULT && PredR == ICmpInst::ICMP_EQ) {
+    ConstantInt *AddC;
+    if (match(LHS0, m_Add(m_Specific(RHS0), m_ConstantInt(AddC))))
+      if (RHSC->getValue() + AddC->getValue() == LHSC->getValue())
+        return Builder->CreateICmpULE(LHS0, LHSC);
   }
 
   // From here on, we only handle:
   //    (icmp1 A, C1) | (icmp2 A, C2) --> something simpler.
-  if (Val != Val2) return nullptr;
+  if (LHS0 != RHS0)
+    return nullptr;
 
-  // ICMP_[US][GL]E X, CST is folded to ICMP_[US][GL]T elsewhere.
-  if (LHSCC == ICmpInst::ICMP_UGE || LHSCC == ICmpInst::ICMP_ULE ||
-      RHSCC == ICmpInst::ICMP_UGE || RHSCC == ICmpInst::ICMP_ULE ||
-      LHSCC == ICmpInst::ICMP_SGE || LHSCC == ICmpInst::ICMP_SLE ||
-      RHSCC == ICmpInst::ICMP_SGE || RHSCC == ICmpInst::ICMP_SLE)
+  // ICMP_[US][GL]E X, C is folded to ICMP_[US][GL]T elsewhere.
+  if (PredL == ICmpInst::ICMP_UGE || PredL == ICmpInst::ICMP_ULE ||
+      PredR == ICmpInst::ICMP_UGE || PredR == ICmpInst::ICMP_ULE ||
+      PredL == ICmpInst::ICMP_SGE || PredL == ICmpInst::ICMP_SLE ||
+      PredR == ICmpInst::ICMP_SGE || PredR == ICmpInst::ICMP_SLE)
     return nullptr;
 
   // We can't fold (ugt x, C) | (sgt x, C2).
-  if (!PredicatesFoldable(LHSCC, RHSCC))
+  if (!PredicatesFoldable(PredL, PredR))
     return nullptr;
 
   // Ensure that the larger constant is on the RHS.
   bool ShouldSwap;
-  if (CmpInst::isSigned(LHSCC) ||
-      (ICmpInst::isEquality(LHSCC) &&
-       CmpInst::isSigned(RHSCC)))
-    ShouldSwap = LHSCst->getValue().sgt(RHSCst->getValue());
+  if (CmpInst::isSigned(PredL) ||
+      (ICmpInst::isEquality(PredL) && CmpInst::isSigned(PredR)))
+    ShouldSwap = LHSC->getValue().sgt(RHSC->getValue());
   else
-    ShouldSwap = LHSCst->getValue().ugt(RHSCst->getValue());
+    ShouldSwap = LHSC->getValue().ugt(RHSC->getValue());
 
   if (ShouldSwap) {
     std::swap(LHS, RHS);
-    std::swap(LHSCst, RHSCst);
-    std::swap(LHSCC, RHSCC);
+    std::swap(LHSC, RHSC);
+    std::swap(PredL, PredR);
   }
 
   // At this point, we know we have two icmp instructions
@@ -1862,127 +1762,118 @@ Value *InstCombiner::FoldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
   // ICMP_EQ, ICMP_NE, ICMP_LT, and ICMP_GT here. We also know (from the
   // icmp folding check above), that the two constants are not
   // equal.
-  assert(LHSCst != RHSCst && "Compares not folded above?");
+  assert(LHSC != RHSC && "Compares not folded above?");
 
-  switch (LHSCC) {
-  default: llvm_unreachable("Unknown integer condition code!");
+  switch (PredL) {
+  default:
+    llvm_unreachable("Unknown integer condition code!");
   case ICmpInst::ICMP_EQ:
-    switch (RHSCC) {
-    default: llvm_unreachable("Unknown integer condition code!");
+    switch (PredR) {
+    default:
+      llvm_unreachable("Unknown integer condition code!");
     case ICmpInst::ICMP_EQ:
       if (LHS->getOperand(0) == RHS->getOperand(0)) {
-        // if LHSCst and RHSCst differ only by one bit:
+        // if LHSC and RHSC differ only by one bit:
         // (A == C1 || A == C2) -> (A | (C1 ^ C2)) == C2
-        assert(LHSCst->getValue().ule(LHSCst->getValue()));
+        assert(LHSC->getValue().ule(LHSC->getValue()));
 
-        APInt Xor = LHSCst->getValue() ^ RHSCst->getValue();
+        APInt Xor = LHSC->getValue() ^ RHSC->getValue();
         if (Xor.isPowerOf2()) {
-          Value *Cst = Builder->getInt(Xor);
-          Value *Or = Builder->CreateOr(LHS->getOperand(0), Cst);
-          return Builder->CreateICmp(ICmpInst::ICMP_EQ, Or, RHSCst);
+          Value *C = Builder->getInt(Xor);
+          Value *Or = Builder->CreateOr(LHS->getOperand(0), C);
+          return Builder->CreateICmp(ICmpInst::ICMP_EQ, Or, RHSC);
         }
       }
 
-      if (LHSCst == SubOne(RHSCst)) {
+      if (LHSC == SubOne(RHSC)) {
         // (X == 13 | X == 14) -> X-13 <u 2
-        Constant *AddCST = ConstantExpr::getNeg(LHSCst);
-        Value *Add = Builder->CreateAdd(Val, AddCST, Val->getName()+".off");
-        AddCST = ConstantExpr::getSub(AddOne(RHSCst), LHSCst);
-        return Builder->CreateICmpULT(Add, AddCST);
+        Constant *AddC = ConstantExpr::getNeg(LHSC);
+        Value *Add = Builder->CreateAdd(LHS0, AddC, LHS0->getName() + ".off");
+        AddC = ConstantExpr::getSub(AddOne(RHSC), LHSC);
+        return Builder->CreateICmpULT(Add, AddC);
       }
 
-      break;                         // (X == 13 | X == 15) -> no change
-    case ICmpInst::ICMP_UGT:         // (X == 13 | X u> 14) -> no change
-    case ICmpInst::ICMP_SGT:         // (X == 13 | X s> 14) -> no change
+      break;                 // (X == 13 | X == 15) -> no change
+    case ICmpInst::ICMP_UGT: // (X == 13 | X u> 14) -> no change
+    case ICmpInst::ICMP_SGT: // (X == 13 | X s> 14) -> no change
       break;
-    case ICmpInst::ICMP_NE:          // (X == 13 | X != 15) -> X != 15
-    case ICmpInst::ICMP_ULT:         // (X == 13 | X u< 15) -> X u< 15
-    case ICmpInst::ICMP_SLT:         // (X == 13 | X s< 15) -> X s< 15
+    case ICmpInst::ICMP_NE:  // (X == 13 | X != 15) -> X != 15
+    case ICmpInst::ICMP_ULT: // (X == 13 | X u< 15) -> X u< 15
+    case ICmpInst::ICMP_SLT: // (X == 13 | X s< 15) -> X s< 15
       return RHS;
     }
     break;
   case ICmpInst::ICMP_NE:
-    switch (RHSCC) {
-    default: llvm_unreachable("Unknown integer condition code!");
-    case ICmpInst::ICMP_EQ:          // (X != 13 | X == 15) -> X != 13
-    case ICmpInst::ICMP_UGT:         // (X != 13 | X u> 15) -> X != 13
-    case ICmpInst::ICMP_SGT:         // (X != 13 | X s> 15) -> X != 13
+    switch (PredR) {
+    default:
+      llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:  // (X != 13 | X == 15) -> X != 13
+    case ICmpInst::ICMP_UGT: // (X != 13 | X u> 15) -> X != 13
+    case ICmpInst::ICMP_SGT: // (X != 13 | X s> 15) -> X != 13
       return LHS;
-    case ICmpInst::ICMP_NE:          // (X != 13 | X != 15) -> true
-    case ICmpInst::ICMP_ULT:         // (X != 13 | X u< 15) -> true
-    case ICmpInst::ICMP_SLT:         // (X != 13 | X s< 15) -> true
+    case ICmpInst::ICMP_NE:  // (X != 13 | X != 15) -> true
+    case ICmpInst::ICMP_ULT: // (X != 13 | X u< 15) -> true
+    case ICmpInst::ICMP_SLT: // (X != 13 | X s< 15) -> true
       return Builder->getTrue();
     }
   case ICmpInst::ICMP_ULT:
-    switch (RHSCC) {
-    default: llvm_unreachable("Unknown integer condition code!");
-    case ICmpInst::ICMP_EQ:         // (X u< 13 | X == 14) -> no change
+    switch (PredR) {
+    default:
+      llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ: // (X u< 13 | X == 14) -> no change
       break;
-    case ICmpInst::ICMP_UGT:        // (X u< 13 | X u> 15) -> (X-13) u> 2
-      // If RHSCst is [us]MAXINT, it is always false.  Not handling
+    case ICmpInst::ICMP_UGT: // (X u< 13 | X u> 15) -> (X-13) u> 2
+      // If RHSC is [us]MAXINT, it is always false.  Not handling
       // this can cause overflow.
-      if (RHSCst->isMaxValue(false))
+      if (RHSC->isMaxValue(false))
         return LHS;
-      return insertRangeTest(Val, LHSCst->getValue(), RHSCst->getValue() + 1,
+      return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1,
                              false, false);
-    case ICmpInst::ICMP_SGT:        // (X u< 13 | X s> 15) -> no change
-      break;
-    case ICmpInst::ICMP_NE:         // (X u< 13 | X != 15) -> X != 15
-    case ICmpInst::ICMP_ULT:        // (X u< 13 | X u< 15) -> X u< 15
+    case ICmpInst::ICMP_NE:  // (X u< 13 | X != 15) -> X != 15
+    case ICmpInst::ICMP_ULT: // (X u< 13 | X u< 15) -> X u< 15
       return RHS;
-    case ICmpInst::ICMP_SLT:        // (X u< 13 | X s< 15) -> no change
-      break;
     }
     break;
   case ICmpInst::ICMP_SLT:
-    switch (RHSCC) {
-    default: llvm_unreachable("Unknown integer condition code!");
-    case ICmpInst::ICMP_EQ:         // (X s< 13 | X == 14) -> no change
+    switch (PredR) {
+    default:
+      llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ: // (X s< 13 | X == 14) -> no change
       break;
-    case ICmpInst::ICMP_SGT:        // (X s< 13 | X s> 15) -> (X-13) s> 2
-      // If RHSCst is [us]MAXINT, it is always false.  Not handling
+    case ICmpInst::ICMP_SGT: // (X s< 13 | X s> 15) -> (X-13) s> 2
+      // If RHSC is [us]MAXINT, it is always false.  Not handling
       // this can cause overflow.
-      if (RHSCst->isMaxValue(true))
+      if (RHSC->isMaxValue(true))
         return LHS;
-      return insertRangeTest(Val, LHSCst->getValue(), RHSCst->getValue() + 1,
-                             true, false);
-    case ICmpInst::ICMP_UGT:        // (X s< 13 | X u> 15) -> no change
-      break;
-    case ICmpInst::ICMP_NE:         // (X s< 13 | X != 15) -> X != 15
-    case ICmpInst::ICMP_SLT:        // (X s< 13 | X s< 15) -> X s< 15
+      return insertRangeTest(LHS0, LHSC->getValue(), RHSC->getValue() + 1, true,
+                             false);
+    case ICmpInst::ICMP_NE:  // (X s< 13 | X != 15) -> X != 15
+    case ICmpInst::ICMP_SLT: // (X s< 13 | X s< 15) -> X s< 15
       return RHS;
-    case ICmpInst::ICMP_ULT:        // (X s< 13 | X u< 15) -> no change
-      break;
     }
     break;
   case ICmpInst::ICMP_UGT:
-    switch (RHSCC) {
-    default: llvm_unreachable("Unknown integer condition code!");
-    case ICmpInst::ICMP_EQ:         // (X u> 13 | X == 15) -> X u> 13
-    case ICmpInst::ICMP_UGT:        // (X u> 13 | X u> 15) -> X u> 13
+    switch (PredR) {
+    default:
+      llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:  // (X u> 13 | X == 15) -> X u> 13
+    case ICmpInst::ICMP_UGT: // (X u> 13 | X u> 15) -> X u> 13
       return LHS;
-    case ICmpInst::ICMP_SGT:        // (X u> 13 | X s> 15) -> no change
-      break;
-    case ICmpInst::ICMP_NE:         // (X u> 13 | X != 15) -> true
-    case ICmpInst::ICMP_ULT:        // (X u> 13 | X u< 15) -> true
+    case ICmpInst::ICMP_NE:  // (X u> 13 | X != 15) -> true
+    case ICmpInst::ICMP_ULT: // (X u> 13 | X u< 15) -> true
       return Builder->getTrue();
-    case ICmpInst::ICMP_SLT:        // (X u> 13 | X s< 15) -> no change
-      break;
     }
     break;
   case ICmpInst::ICMP_SGT:
-    switch (RHSCC) {
-    default: llvm_unreachable("Unknown integer condition code!");
-    case ICmpInst::ICMP_EQ:         // (X s> 13 | X == 15) -> X > 13
-    case ICmpInst::ICMP_SGT:        // (X s> 13 | X s> 15) -> X > 13
+    switch (PredR) {
+    default:
+      llvm_unreachable("Unknown integer condition code!");
+    case ICmpInst::ICMP_EQ:  // (X s> 13 | X == 15) -> X > 13
+    case ICmpInst::ICMP_SGT: // (X s> 13 | X s> 15) -> X > 13
       return LHS;
-    case ICmpInst::ICMP_UGT:        // (X s> 13 | X u> 15) -> no change
-      break;
-    case ICmpInst::ICMP_NE:         // (X s> 13 | X != 15) -> true
-    case ICmpInst::ICMP_SLT:        // (X s> 13 | X s< 15) -> true
+    case ICmpInst::ICMP_NE:  // (X s> 13 | X != 15) -> true
+    case ICmpInst::ICMP_SLT: // (X s> 13 | X s< 15) -> true
       return Builder->getTrue();
-    case ICmpInst::ICMP_ULT:        // (X s> 13 | X u< 15) -> no change
-      break;
     }
     break;
   }
@@ -2128,17 +2019,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
 
   if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
     ConstantInt *C1 = nullptr; Value *X = nullptr;
-    // (X & C1) | C2 --> (X | C2) & (C1|C2)
-    // iff (C1 & C2) == 0.
-    if (match(Op0, m_And(m_Value(X), m_ConstantInt(C1))) &&
-        (RHS->getValue() & C1->getValue()) != 0 &&
-        Op0->hasOneUse()) {
-      Value *Or = Builder->CreateOr(X, RHS);
-      Or->takeName(Op0);
-      return BinaryOperator::CreateAnd(Or,
-                             Builder->getInt(RHS->getValue() | C1->getValue()));
-    }
-
     // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2)
     if (match(Op0, m_Xor(m_Value(X), m_ConstantInt(C1))) &&
         Op0->hasOneUse()) {
@@ -2147,45 +2027,51 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
       return BinaryOperator::CreateXor(Or,
                             Builder->getInt(C1->getValue() & ~RHS->getValue()));
     }
+  }
 
+  if (isa<Constant>(Op1))
     if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
       return FoldedLogic;
-  }
 
   // Given an OR instruction, check to see if this is a bswap.
   if (Instruction *BSwap = MatchBSwap(I))
     return BSwap;
 
-  Value *A = nullptr, *B = nullptr;
-  ConstantInt *C1 = nullptr, *C2 = nullptr;
+  {
+    Value *A;
+    const APInt *C;
+    // (X^C)|Y -> (X|Y)^C iff Y&C == 0
+    if (match(Op0, m_OneUse(m_Xor(m_Value(A), m_APInt(C)))) &&
+        MaskedValueIsZero(Op1, *C, 0, &I)) {
+      Value *NOr = Builder->CreateOr(A, Op1);
+      NOr->takeName(Op0);
+      return BinaryOperator::CreateXor(NOr,
+                                       cast<Instruction>(Op0)->getOperand(1));
+    }
 
-  // (X^C)|Y -> (X|Y)^C iff Y&C == 0
-  if (Op0->hasOneUse() &&
-      match(Op0, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
-      MaskedValueIsZero(Op1, C1->getValue(), 0, &I)) {
-    Value *NOr = Builder->CreateOr(A, Op1);
-    NOr->takeName(Op0);
-    return BinaryOperator::CreateXor(NOr, C1);
+    // Y|(X^C) -> (X|Y)^C iff Y&C == 0
+    if (match(Op1, m_OneUse(m_Xor(m_Value(A), m_APInt(C)))) &&
+        MaskedValueIsZero(Op0, *C, 0, &I)) {
+      Value *NOr = Builder->CreateOr(A, Op0);
+      NOr->takeName(Op0);
+      return BinaryOperator::CreateXor(NOr,
+                                       cast<Instruction>(Op1)->getOperand(1));
+    }
   }
 
-  // Y|(X^C) -> (X|Y)^C iff Y&C == 0
-  if (Op1->hasOneUse() &&
-      match(Op1, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
-      MaskedValueIsZero(Op0, C1->getValue(), 0, &I)) {
-    Value *NOr = Builder->CreateOr(A, Op0);
-    NOr->takeName(Op0);
-    return BinaryOperator::CreateXor(NOr, C1);
-  }
+  Value *A, *B;
 
   // ((~A & B) | A) -> (A | B)
-  if (match(Op0, m_And(m_Not(m_Value(A)), m_Value(B))) &&
-      match(Op1, m_Specific(A)))
-    return BinaryOperator::CreateOr(A, B);
+  if (match(Op0, m_c_And(m_Not(m_Specific(Op1)), m_Value(A))))
+    return BinaryOperator::CreateOr(A, Op1);
+  if (match(Op1, m_c_And(m_Not(m_Specific(Op0)), m_Value(A))))
+    return BinaryOperator::CreateOr(Op0, A);
 
   // ((A & B) | ~A) -> (~A | B)
-  if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
-      match(Op1, m_Not(m_Specific(A))))
-    return BinaryOperator::CreateOr(Builder->CreateNot(A), B);
+  // The NOT is guaranteed to be in the RHS by complexity ordering.
+  if (match(Op1, m_Not(m_Value(A))) &&
+      match(Op0, m_c_And(m_Specific(A), m_Value(B))))
+    return BinaryOperator::CreateOr(Op1, B);
 
   // (A & ~B) | (A ^ B) -> (A ^ B)
   // (~B & A) | (A ^ B) -> (A ^ B)
@@ -2205,8 +2091,8 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
   if (match(Op0, m_And(m_Value(A), m_Value(C))) &&
       match(Op1, m_And(m_Value(B), m_Value(D)))) {
     Value *V1 = nullptr, *V2 = nullptr;
-    C1 = dyn_cast<ConstantInt>(C);
-    C2 = dyn_cast<ConstantInt>(D);
+    ConstantInt *C1 = dyn_cast<ConstantInt>(C);
+    ConstantInt *C2 = dyn_cast<ConstantInt>(D);
     if (C1 && C2) {  // (A & C1)|(B & C2)
       if ((C1->getValue() & C2->getValue()) == 0) {
         // ((V | N) & C1) | (V & C2) --> (V|N) & (C1|C2)
@@ -2431,6 +2317,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
   // be simplified by a later pass either, so we try swapping the inner/outer
   // ORs in the hopes that we'll be able to simplify it this way.
   // (X|C) | V --> (X|V) | C
+  ConstantInt *C1;
   if (Op0->hasOneUse() && !isa<ConstantInt>(Op1) &&
       match(Op0, m_Or(m_Value(A), m_ConstantInt(C1)))) {
     Value *Inner = Builder->CreateOr(A, Op1);
@@ -2521,23 +2408,22 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
     }
   }
 
-  if (Constant *RHS = dyn_cast<Constant>(Op1)) {
-    if (RHS->isAllOnesValue() && Op0->hasOneUse())
-      // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B
-      if (CmpInst *CI = dyn_cast<CmpInst>(Op0))
-        return CmpInst::Create(CI->getOpcode(),
-                               CI->getInversePredicate(),
-                               CI->getOperand(0), CI->getOperand(1));
+  // xor (cmp A, B), true = not (cmp A, B) = !cmp A, B
+  ICmpInst::Predicate Pred;
+  if (match(Op0, m_OneUse(m_Cmp(Pred, m_Value(), m_Value()))) &&
+      match(Op1, m_AllOnes())) {
+    cast<CmpInst>(Op0)->setPredicate(CmpInst::getInversePredicate(Pred));
+    return replaceInstUsesWith(I, Op0);
   }
 
-  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+  if (ConstantInt *RHSC = dyn_cast<ConstantInt>(Op1)) {
     // fold (xor(zext(cmp)), 1) and (xor(sext(cmp)), -1) to ext(!cmp).
     if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) {
       if (CmpInst *CI = dyn_cast<CmpInst>(Op0C->getOperand(0))) {
         if (CI->hasOneUse() && Op0C->hasOneUse()) {
           Instruction::CastOps Opcode = Op0C->getOpcode();
           if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
-              (RHS == ConstantExpr::getCast(Opcode, Builder->getTrue(),
+              (RHSC == ConstantExpr::getCast(Opcode, Builder->getTrue(),
                                             Op0C->getDestTy()))) {
             CI->setPredicate(CI->getInversePredicate());
             return CastInst::Create(Opcode, CI, Op0C->getType());
@@ -2548,26 +2434,23 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
 
     if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
       // ~(c-X) == X-c-1 == X+(-c-1)
-      if (Op0I->getOpcode() == Instruction::Sub && RHS->isAllOnesValue())
+      if (Op0I->getOpcode() == Instruction::Sub && RHSC->isAllOnesValue())
         if (Constant *Op0I0C = dyn_cast<Constant>(Op0I->getOperand(0))) {
           Constant *NegOp0I0C = ConstantExpr::getNeg(Op0I0C);
-          Constant *ConstantRHS = ConstantExpr::getSub(NegOp0I0C,
-                                      ConstantInt::get(I.getType(), 1));
-          return BinaryOperator::CreateAdd(Op0I->getOperand(1), ConstantRHS);
+          return BinaryOperator::CreateAdd(Op0I->getOperand(1),
+                                           SubOne(NegOp0I0C));
         }
 
       if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1))) {
         if (Op0I->getOpcode() == Instruction::Add) {
           // ~(X-c) --> (-c-1)-X
-          if (RHS->isAllOnesValue()) {
+          if (RHSC->isAllOnesValue()) {
             Constant *NegOp0CI = ConstantExpr::getNeg(Op0CI);
-            return BinaryOperator::CreateSub(
-                           ConstantExpr::getSub(NegOp0CI,
-                                      ConstantInt::get(I.getType(), 1)),
-                                      Op0I->getOperand(0));
-          } else if (RHS->getValue().isSignBit()) {
+            return BinaryOperator::CreateSub(SubOne(NegOp0CI),
+                                             Op0I->getOperand(0));
+          } else if (RHSC->getValue().isSignBit()) {
             // (X + C) ^ signbit -> (X + C + signbit)
-            Constant *C = Builder->getInt(RHS->getValue() + Op0CI->getValue());
+            Constant *C = Builder->getInt(RHSC->getValue() + Op0CI->getValue());
             return BinaryOperator::CreateAdd(Op0I->getOperand(0), C);
 
           }
@@ -2575,10 +2458,10 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
           // (X|C1)^C2 -> X^(C1|C2) iff X&~C1 == 0
           if (MaskedValueIsZero(Op0I->getOperand(0), Op0CI->getValue(),
                                 0, &I)) {
-            Constant *NewRHS = ConstantExpr::getOr(Op0CI, RHS);
+            Constant *NewRHS = ConstantExpr::getOr(Op0CI, RHSC);
             // Anything in both C1 and C2 is known to be zero, remove it from
             // NewRHS.
-            Constant *CommonBits = ConstantExpr::getAnd(Op0CI, RHS);
+            Constant *CommonBits = ConstantExpr::getAnd(Op0CI, RHSC);
             NewRHS = ConstantExpr::getAnd(NewRHS,
                                        ConstantExpr::getNot(CommonBits));
             Worklist.Add(Op0I);
@@ -2596,7 +2479,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
               E1->getOpcode() == Instruction::Xor &&
               (C1 = dyn_cast<ConstantInt>(E1->getOperand(1)))) {
             // fold (C1 >> C2) ^ C3
-            ConstantInt *C2 = Op0CI, *C3 = RHS;
+            ConstantInt *C2 = Op0CI, *C3 = RHSC;
             APInt FoldConst = C1->getValue().lshr(C2->getValue());
             FoldConst ^= C3->getValue();
             // Prepare the two operands.
@@ -2610,27 +2493,26 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
         }
       }
     }
+  }
 
+  if (isa<Constant>(Op1))
     if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I))
       return FoldedLogic;
-  }
 
-  BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1);
-  if (Op1I) {
+  {
     Value *A, *B;
-    if (match(Op1I, m_Or(m_Value(A), m_Value(B)))) {
-      if (A == Op0) {              // B^(B|A) == (A|B)^B
-        Op1I->swapOperands();
-        I.swapOperands();
-        std::swap(Op0, Op1);
-      } else if (B == Op0) {       // B^(A|B) == (A|B)^B
+    if (match(Op1, m_OneUse(m_Or(m_Value(A), m_Value(B))))) {
+      if (A == Op0) {                                      // A^(A|B) == A^(B|A)
+        cast<BinaryOperator>(Op1)->swapOperands();
+        std::swap(A, B);
+      }
+      if (B == Op0) {                                      // A^(B|A) == (B|A)^A
         I.swapOperands();     // Simplified below.
         std::swap(Op0, Op1);
       }
-    } else if (match(Op1I, m_And(m_Value(A), m_Value(B))) &&
-               Op1I->hasOneUse()){
+    } else if (match(Op1, m_OneUse(m_And(m_Value(A), m_Value(B))))) {
       if (A == Op0) {                                      // A^(A&B) -> A^(B&A)
-        Op1I->swapOperands();
+        cast<BinaryOperator>(Op1)->swapOperands();
         std::swap(A, B);
       }
       if (B == Op0) {                                      // A^(B&A) -> (B&A)^A
@@ -2640,65 +2522,63 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
     }
   }
 
-  BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0);
-  if (Op0I) {
+  {
     Value *A, *B;
-    if (match(Op0I, m_Or(m_Value(A), m_Value(B))) &&
-        Op0I->hasOneUse()) {
+    if (match(Op0, m_OneUse(m_Or(m_Value(A), m_Value(B))))) {
       if (A == Op1)                                  // (B|A)^B == (A|B)^B
         std::swap(A, B);
       if (B == Op1)                                  // (A|B)^B == A & ~B
         return BinaryOperator::CreateAnd(A, Builder->CreateNot(Op1));
-    } else if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
-               Op0I->hasOneUse()){
+    } else if (match(Op0, m_OneUse(m_And(m_Value(A), m_Value(B))))) {
       if (A == Op1)                                        // (A&B)^A -> (B&A)^A
         std::swap(A, B);
+      const APInt *C;
       if (B == Op1 &&                                      // (B&A)^A == ~B & A
-          !isa<ConstantInt>(Op1)) {  // Canonical form is (B&C)^C
+          !match(Op1, m_APInt(C))) {  // Canonical form is (B&C)^C
         return BinaryOperator::CreateAnd(Builder->CreateNot(A), Op1);
       }
     }
   }
 
-  if (Op0I && Op1I) {
+  {
     Value *A, *B, *C, *D;
     // (A & B)^(A | B) -> A ^ B
-    if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
-        match(Op1I, m_Or(m_Value(C), m_Value(D)))) {
+    if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+        match(Op1, m_Or(m_Value(C), m_Value(D)))) {
       if ((A == C && B == D) || (A == D && B == C))
         return BinaryOperator::CreateXor(A, B);
     }
     // (A | B)^(A & B) -> A ^ B
-    if (match(Op0I, m_Or(m_Value(A), m_Value(B))) &&
-        match(Op1I, m_And(m_Value(C), m_Value(D)))) {
+    if (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
+        match(Op1, m_And(m_Value(C), m_Value(D)))) {
       if ((A == C && B == D) || (A == D && B == C))
         return BinaryOperator::CreateXor(A, B);
     }
     // (A | ~B) ^ (~A | B) -> A ^ B
     // (~B | A) ^ (~A | B) -> A ^ B
-    if (match(Op0I, m_c_Or(m_Value(A), m_Not(m_Value(B)))) &&
-        match(Op1I, m_Or(m_Not(m_Specific(A)), m_Specific(B))))
+    if (match(Op0, m_c_Or(m_Value(A), m_Not(m_Value(B)))) &&
+        match(Op1, m_Or(m_Not(m_Specific(A)), m_Specific(B))))
       return BinaryOperator::CreateXor(A, B);
 
     // (~A | B) ^ (A | ~B) -> A ^ B
-    if (match(Op0I, m_Or(m_Not(m_Value(A)), m_Value(B))) &&
-        match(Op1I, m_Or(m_Specific(A), m_Not(m_Specific(B))))) {
+    if (match(Op0, m_Or(m_Not(m_Value(A)), m_Value(B))) &&
+        match(Op1, m_Or(m_Specific(A), m_Not(m_Specific(B))))) {
       return BinaryOperator::CreateXor(A, B);
     }
     // (A & ~B) ^ (~A & B) -> A ^ B
     // (~B & A) ^ (~A & B) -> A ^ B
-    if (match(Op0I, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
-        match(Op1I, m_And(m_Not(m_Specific(A)), m_Specific(B))))
+    if (match(Op0, m_c_And(m_Value(A), m_Not(m_Value(B)))) &&
+        match(Op1, m_And(m_Not(m_Specific(A)), m_Specific(B))))
       return BinaryOperator::CreateXor(A, B);
 
     // (~A & B) ^ (A & ~B) -> A ^ B
-    if (match(Op0I, m_And(m_Not(m_Value(A)), m_Value(B))) &&
-        match(Op1I, m_And(m_Specific(A), m_Not(m_Specific(B))))) {
+    if (match(Op0, m_And(m_Not(m_Value(A)), m_Value(B))) &&
+        match(Op1, m_And(m_Specific(A), m_Not(m_Specific(B))))) {
       return BinaryOperator::CreateXor(A, B);
     }
     // (A ^ C)^(A | B) -> ((~A) & B) ^ C
-    if (match(Op0I, m_Xor(m_Value(D), m_Value(C))) &&
-        match(Op1I, m_Or(m_Value(A), m_Value(B)))) {
+    if (match(Op0, m_Xor(m_Value(D), m_Value(C))) &&
+        match(Op1, m_Or(m_Value(A), m_Value(B)))) {
       if (D == A)
         return BinaryOperator::CreateXor(
             Builder->CreateAnd(Builder->CreateNot(A), B), C);
@@ -2707,8 +2587,8 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
             Builder->CreateAnd(Builder->CreateNot(B), A), C);
     }
     // (A | B)^(A ^ C) -> ((~A) & B) ^ C
-    if (match(Op0I, m_Or(m_Value(A), m_Value(B))) &&
-        match(Op1I, m_Xor(m_Value(D), m_Value(C)))) {
+    if (match(Op0, m_Or(m_Value(A), m_Value(B))) &&
+        match(Op1, m_Xor(m_Value(D), m_Value(C)))) {
       if (D == A)
         return BinaryOperator::CreateXor(
             Builder->CreateAnd(Builder->CreateNot(A), B), C);
@@ -2717,12 +2597,12 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
             Builder->CreateAnd(Builder->CreateNot(B), A), C);
     }
     // (A & B) ^ (A ^ B) -> (A | B)
-    if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
-        match(Op1I, m_Xor(m_Specific(A), m_Specific(B))))
+    if (match(Op0, m_And(m_Value(A), m_Value(B))) &&
+        match(Op1, m_c_Xor(m_Specific(A), m_Specific(B))))
       return BinaryOperator::CreateOr(A, B);
     // (A ^ B) ^ (A & B) -> (A | B)
-    if (match(Op0I, m_Xor(m_Value(A), m_Value(B))) &&
-        match(Op1I, m_And(m_Specific(A), m_Specific(B))))
+    if (match(Op0, m_Xor(m_Value(A), m_Value(B))) &&
+        match(Op1, m_c_And(m_Specific(A), m_Specific(B))))
       return BinaryOperator::CreateOr(A, B);
   }
 
diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 19746651d99ad9721bfb893c7a7622c39c34fed8..cdae9571851ed84da1f026ebb5e47c17031db35c 100644
--- a/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -60,6 +60,12 @@ using namespace PatternMatch;
 
 STATISTIC(NumSimplified, "Number of library calls simplified");
 
+static cl::opt<unsigned> UnfoldElementAtomicMemcpyMaxElements(
+    "unfold-element-atomic-memcpy-max-elements",
+    cl::init(16),
+    cl::desc("Maximum number of elements in atomic memcpy the optimizer is "
+             "allowed to unfold"));
+
 /// Return the specified type promoted as it would be to pass though a va_arg
 /// area.
 static Type *getPromotedType(Type *Ty) {
@@ -70,27 +76,6 @@ static Type *getPromotedType(Type *Ty) {
   return Ty;
 }
 
-/// Given an aggregate type which ultimately holds a single scalar element,
-/// like {{{type}}} or [1 x type], return type.
-static Type *reduceToSingleValueType(Type *T) {
-  while (!T->isSingleValueType()) {
-    if (StructType *STy = dyn_cast<StructType>(T)) {
-      if (STy->getNumElements() == 1)
-        T = STy->getElementType(0);
-      else
-        break;
-    } else if (ArrayType *ATy = dyn_cast<ArrayType>(T)) {
-      if (ATy->getNumElements() == 1)
-        T = ATy->getElementType();
-      else
-        break;
-    } else
-      break;
-  }
-
-  return T;
-}
-
 /// Return a constant boolean vector that has true elements in all positions
 /// where the input constant data vector has an element with the sign bit set.
 static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
@@ -108,6 +93,78 @@ static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
   return ConstantVector::get(BoolVec);
 }
 
+Instruction *
+InstCombiner::SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI) {
+  // Try to unfold this intrinsic into a sequence of explicit unordered atomic
+  // loads and stores. Returns nullptr if it is left alone, AMI once unfolded.
+  // First check that the number of elements is a compile-time constant.
+  auto *NumElementsCI = dyn_cast<ConstantInt>(AMI->getNumElements());
+  if (!NumElementsCI)
+    return nullptr;
+
+  // Check that there are not too many elements (the limit itself is excluded).
+  uint64_t NumElements = NumElementsCI->getZExtValue();
+  if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
+    return nullptr;
+
+  // Don't unfold into illegal integers (the value below is bytes * 8, i.e. bits).
+  uint64_t ElementSizeInBytes = AMI->getElementSizeInBytes() * 8;
+  if (!getDataLayout().isLegalInteger(ElementSizeInBytes))
+    return nullptr;
+
+  // Cast source and destination to the element pointer type; intrinsic
+  // arguments are usually represented as i8*. Operands are often explicitly
+  // cast to i8* already, and we could strip those casts instead of inserting
+  // new ones, but it's easier to rely on other InstCombine rules for that.
+  // NOTE(review): both casts use the source's address space -- confirm.
+  Value *Src = AMI->getRawSource();
+  Value *Dst = AMI->getRawDest();
+  Type *ElementPointerType = Type::getIntNPtrTy(
+      AMI->getContext(), ElementSizeInBytes, Src->getType()->getPointerAddressSpace());
+
+  Value *SrcCasted = Builder->CreatePointerCast(Src, ElementPointerType,
+                                                "memcpy_unfold.src_casted");
+  Value *DstCasted = Builder->CreatePointerCast(Dst, ElementPointerType,
+                                                "memcpy_unfold.dst_casted");
+
+  for (uint64_t i = 0; i < NumElements; ++i) {
+    // Get the addresses of the current source and destination elements.
+    ConstantInt *ElementIdxCI =
+        ConstantInt::get(AMI->getContext(), APInt(64, i));
+    Value *SrcElementAddr =
+        Builder->CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
+    Value *DstElementAddr =
+        Builder->CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");
+
+    // Load from the source. Transfer alignment information and mark the load
+    // as unordered atomic.
+    LoadInst *Load = Builder->CreateLoad(SrcElementAddr, "memcpy_unfold.val");
+    Load->setOrdering(AtomicOrdering::Unordered);
+    // We know the alignment of the first element. It is also guaranteed by
+    // the verifier that the element size is less than or equal to the first
+    // element's alignment and that both of these values are powers of two.
+    // This means that all subsequent accesses are at least element size
+    // aligned.
+    // TODO: We can infer better alignment but there is no evidence that this
+    // will matter.
+    Load->setAlignment(i == 0 ? AMI->getSrcAlignment()
+                              : AMI->getElementSizeInBytes());
+    Load->setDebugLoc(AMI->getDebugLoc());
+
+    // Store the loaded value via an unordered atomic store.
+    StoreInst *Store = Builder->CreateStore(Load, DstElementAddr);
+    Store->setOrdering(AtomicOrdering::Unordered);
+    Store->setAlignment(i == 0 ? AMI->getDstAlignment()
+                               : AMI->getElementSizeInBytes());
+    Store->setDebugLoc(AMI->getDebugLoc());
+  }
+
+  // Set the number of elements of the copy to 0; it will be deleted on the
+  // next iteration.
+  AMI->setNumElements(Constant::getNullValue(NumElementsCI->getType()));
+  return AMI;
+}
+
 Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
   unsigned DstAlign = getKnownAlignment(MI->getArgOperand(0), DL, MI, &AC, &DT);
   unsigned SrcAlign = getKnownAlignment(MI->getArgOperand(1), DL, MI, &AC, &DT);
@@ -144,41 +201,19 @@ Instruction *InstCombiner::SimplifyMemTransfer(MemIntrinsic *MI) {
   Type *NewSrcPtrTy = PointerType::get(IntType, SrcAddrSp);
   Type *NewDstPtrTy = PointerType::get(IntType, DstAddrSp);
 
-  // Memcpy forces the use of i8* for the source and destination.  That means
-  // that if you're using memcpy to move one double around, you'll get a cast
-  // from double* to i8*.  We'd much rather use a double load+store rather than
-  // an i64 load+store, here because this improves the odds that the source or
-  // dest address will be promotable.  See if we can find a better type than the
-  // integer datatype.
-  Value *StrippedDest = MI->getArgOperand(0)->stripPointerCasts();
+  // If the memcpy has metadata describing the members, see if we can get the
+  // TBAA tag describing our copy.
   MDNode *CopyMD = nullptr;
-  if (StrippedDest != MI->getArgOperand(0)) {
-    Type *SrcETy = cast<PointerType>(StrippedDest->getType())
-                                    ->getElementType();
-    if (SrcETy->isSized() && DL.getTypeStoreSize(SrcETy) == Size) {
-      // The SrcETy might be something like {{{double}}} or [1 x double].  Rip
-      // down through these levels if so.
-      SrcETy = reduceToSingleValueType(SrcETy);
-
-      if (SrcETy->isSingleValueType()) {
-        NewSrcPtrTy = PointerType::get(SrcETy, SrcAddrSp);
-        NewDstPtrTy = PointerType::get(SrcETy, DstAddrSp);
-
-        // If the memcpy has metadata describing the members, see if we can
-        // get the TBAA tag describing our copy.
-        if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
-          if (M->getNumOperands() == 3 && M->getOperand(0) &&
-              mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
-              mdconst::extract<ConstantInt>(M->getOperand(0))->isNullValue() &&
-              M->getOperand(1) &&
-              mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
-              mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
-                  Size &&
-              M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
-            CopyMD = cast<MDNode>(M->getOperand(2));
-        }
-      }
-    }
+  if (MDNode *M = MI->getMetadata(LLVMContext::MD_tbaa_struct)) {
+    if (M->getNumOperands() == 3 && M->getOperand(0) &&
+        mdconst::hasa<ConstantInt>(M->getOperand(0)) &&
+        mdconst::extract<ConstantInt>(M->getOperand(0))->isNullValue() &&
+        M->getOperand(1) &&
+        mdconst::hasa<ConstantInt>(M->getOperand(1)) &&
+        mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
+        Size &&
+        M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
+      CopyMD = cast<MDNode>(M->getOperand(2));
   }
 
   // If the memcpy/memmove provides better alignment info than we can
@@ -1455,6 +1490,27 @@ static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
   return true;
 }
 
+// Evaluate llvm.amdgcn.fmed3 (median of three) on constant operands: drop one
+// operand equal to the overall maximum and return the larger of the other two.
+// Operands must not be NaN: a single NaN input is expected to be folded to
+// minnum before this helper is reached, and each compare below asserts this.
+static APFloat fmed3AMDGCN(const APFloat &Src0, const APFloat &Src1,
+                           const APFloat &Src2) {
+  const APFloat Largest = maxnum(maxnum(Src0, Src1), Src2);
+
+  const APFloat::cmpResult VsSrc0 = Largest.compare(Src0);
+  assert(VsSrc0 != APFloat::cmpUnordered && "nans handled separately");
+  if (VsSrc0 == APFloat::cmpEqual)
+    return maxnum(Src1, Src2);
+
+  const APFloat::cmpResult VsSrc1 = Largest.compare(Src1);
+  assert(VsSrc1 != APFloat::cmpUnordered && "nans handled separately");
+  if (VsSrc1 == APFloat::cmpEqual)
+    return maxnum(Src0, Src2);
+
+  return maxnum(Src0, Src1);
+}
+
 // Returns true iff the 2 intrinsics have the same operands, limiting the
 // comparison to the first NumOperands.
 static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
@@ -1835,6 +1891,15 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
     if (Changed) return II;
   }
 
+  if (auto *AMI = dyn_cast<ElementAtomicMemCpyInst>(II)) {
+    if (Constant *C = dyn_cast<Constant>(AMI->getNumElements()))
+      if (C->isNullValue())
+        return eraseInstFromFunction(*AMI);
+
+    if (Instruction *I = SimplifyElementAtomicMemCpy(AMI))
+      return I;
+  }
+
   if (Instruction *I = SimplifyNVVMIntrinsic(II, *this))
     return I;
 
@@ -1957,8 +2022,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
       return replaceInstUsesWith(*II, V);
     break;
   }
-  case Intrinsic::fma:
   case Intrinsic::fmuladd: {
+    // Canonicalize fast fmuladd to the separate fmul + fadd.
+    if (II->hasUnsafeAlgebra()) {
+      BuilderTy::FastMathFlagGuard Guard(*Builder);
+      Builder->setFastMathFlags(II->getFastMathFlags());
+      Value *Mul = Builder->CreateFMul(II->getArgOperand(0),
+                                       II->getArgOperand(1));
+      Value *Add = Builder->CreateFAdd(Mul, II->getArgOperand(2));
+      Add->takeName(II);
+      return replaceInstUsesWith(*II, Add);
+    }
+
+    LLVM_FALLTHROUGH;
+  }
+  case Intrinsic::fma: {
     Value *Src0 = II->getArgOperand(0);
     Value *Src1 = II->getArgOperand(1);
 
@@ -2013,6 +2091,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
   case Intrinsic::floor:
   case Intrinsic::round:
   case Intrinsic::nearbyint:
+  case Intrinsic::rint:
   case Intrinsic::trunc: {
     Value *ExtSrc;
     if (match(II->getArgOperand(0), m_FPExt(m_Value(ExtSrc))) &&
@@ -2543,7 +2622,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
   case Intrinsic::x86_sse2_packsswb_128:
   case Intrinsic::x86_avx2_packssdw:
   case Intrinsic::x86_avx2_packsswb:
-  // TODO Add support for Intrinsic::x86_avx512_mask_packss*
+  case Intrinsic::x86_avx512_packssdw_512:
+  case Intrinsic::x86_avx512_packsswb_512:
     if (Value *V = simplifyX86pack(*II, *this, *Builder, true))
       return replaceInstUsesWith(*II, V);
     break;
@@ -2552,7 +2632,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
   case Intrinsic::x86_sse41_packusdw:
   case Intrinsic::x86_avx2_packusdw:
   case Intrinsic::x86_avx2_packuswb:
-  // TODO Add support for Intrinsic::x86_avx512_mask_packus*
+  case Intrinsic::x86_avx512_packusdw_512:
+  case Intrinsic::x86_avx512_packuswb_512:
     if (Value *V = simplifyX86pack(*II, *this, *Builder, false))
       return replaceInstUsesWith(*II, V);
     break;
@@ -2985,9 +3066,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
 
     break;
   }
-
   case Intrinsic::amdgcn_rcp: {
-    if (const ConstantFP *C = dyn_cast<ConstantFP>(II->getArgOperand(0))) {
+    Value *Src = II->getArgOperand(0);
+
+    // TODO: Move to ConstantFolding/InstSimplify?
+    if (isa<UndefValue>(Src))
+      return replaceInstUsesWith(CI, Src);
+
+    if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
       const APFloat &ArgVal = C->getValueAPF();
       APFloat Val(ArgVal.getSemantics(), 1.0);
       APFloat::opStatus Status = Val.divide(ArgVal,
@@ -3000,6 +3086,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
 
     break;
   }
+  case Intrinsic::amdgcn_rsq: {
+    Value *Src = II->getArgOperand(0);
+
+    // TODO: Move to ConstantFolding/InstSimplify?
+    if (isa<UndefValue>(Src))
+      return replaceInstUsesWith(CI, Src);
+    break;
+  }
   case Intrinsic::amdgcn_frexp_mant:
   case Intrinsic::amdgcn_frexp_exp: {
     Value *Src = II->getArgOperand(0);
@@ -3104,6 +3198,274 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
 
     return replaceInstUsesWith(*II, ConstantInt::get(II->getType(), Result));
   }
+  case Intrinsic::amdgcn_cvt_pkrtz: {
+    Value *Src0 = II->getArgOperand(0);
+    Value *Src1 = II->getArgOperand(1);
+    if (const ConstantFP *C0 = dyn_cast<ConstantFP>(Src0)) {
+      if (const ConstantFP *C1 = dyn_cast<ConstantFP>(Src1)) {
+        const fltSemantics &HalfSem
+          = II->getType()->getScalarType()->getFltSemantics();
+        bool LosesInfo;
+        APFloat Val0 = C0->getValueAPF();
+        APFloat Val1 = C1->getValueAPF();
+        Val0.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
+        Val1.convert(HalfSem, APFloat::rmTowardZero, &LosesInfo);
+
+        Constant *Folded = ConstantVector::get({
+            ConstantFP::get(II->getContext(), Val0),
+            ConstantFP::get(II->getContext(), Val1) });
+        return replaceInstUsesWith(*II, Folded);
+      }
+    }
+
+    if (isa<UndefValue>(Src0) && isa<UndefValue>(Src1))
+      return replaceInstUsesWith(*II, UndefValue::get(II->getType()));
+
+    break;
+  }
+  case Intrinsic::amdgcn_ubfe:
+  case Intrinsic::amdgcn_sbfe: {
+    // Decompose simple cases into standard shifts.
+    Value *Src = II->getArgOperand(0);
+    if (isa<UndefValue>(Src))
+      return replaceInstUsesWith(*II, Src);
+
+    unsigned Width;
+    Type *Ty = II->getType();
+    unsigned IntSize = Ty->getIntegerBitWidth();
+
+    ConstantInt *CWidth = dyn_cast<ConstantInt>(II->getArgOperand(2));
+    if (CWidth) {
+      Width = CWidth->getZExtValue();
+      if ((Width & (IntSize - 1)) == 0)
+        return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty));
+
+      if (Width >= IntSize) {
+        // Hardware ignores high bits, so remove those.
+        II->setArgOperand(2, ConstantInt::get(CWidth->getType(),
+                                              Width & (IntSize - 1)));
+        return II;
+      }
+    }
+
+    unsigned Offset;
+    ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1));
+    if (COffset) {
+      Offset = COffset->getZExtValue();
+      if (Offset >= IntSize) {
+        II->setArgOperand(1, ConstantInt::get(COffset->getType(),
+                                              Offset & (IntSize - 1)));
+        return II;
+      }
+    }
+
+    bool Signed = II->getIntrinsicID() == Intrinsic::amdgcn_sbfe;
+
+    // TODO: Also emit sub if only width is constant.
+    if (!CWidth && COffset && Offset == 0) {
+      Constant *KSize = ConstantInt::get(COffset->getType(), IntSize);
+      Value *ShiftVal = Builder->CreateSub(KSize, II->getArgOperand(2));
+      ShiftVal = Builder->CreateZExt(ShiftVal, II->getType());
+
+      Value *Shl = Builder->CreateShl(Src, ShiftVal);
+      Value *RightShift = Signed ?
+        Builder->CreateAShr(Shl, ShiftVal) :
+        Builder->CreateLShr(Shl, ShiftVal);
+      RightShift->takeName(II);
+      return replaceInstUsesWith(*II, RightShift);
+    }
+
+    if (!CWidth || !COffset)
+      break;
+
+    // TODO: This allows folding to undef when the hardware has specific
+    // behavior?
+    if (Offset + Width < IntSize) {
+      Value *Shl = Builder->CreateShl(Src, IntSize  - Offset - Width);
+      Value *RightShift = Signed ?
+        Builder->CreateAShr(Shl, IntSize - Width) :
+        Builder->CreateLShr(Shl, IntSize - Width);
+      RightShift->takeName(II);
+      return replaceInstUsesWith(*II, RightShift);
+    }
+
+    Value *RightShift = Signed ?
+      Builder->CreateAShr(Src, Offset) :
+      Builder->CreateLShr(Src, Offset);
+
+    RightShift->takeName(II);
+    return replaceInstUsesWith(*II, RightShift);
+  }
+  case Intrinsic::amdgcn_exp:
+  case Intrinsic::amdgcn_exp_compr: {
+    ConstantInt *En = dyn_cast<ConstantInt>(II->getArgOperand(1));
+    if (!En) // Illegal.
+      break;
+
+    unsigned EnBits = En->getZExtValue();
+    if (EnBits == 0xf)
+      break; // All inputs enabled.
+
+    bool IsCompr = II->getIntrinsicID() == Intrinsic::amdgcn_exp_compr;
+    bool Changed = false;
+    for (int I = 0; I < (IsCompr ? 2 : 4); ++I) {
+      if ((!IsCompr && (EnBits & (1 << I)) == 0) ||
+          (IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
+        Value *Src = II->getArgOperand(I + 2);
+        if (!isa<UndefValue>(Src)) {
+          II->setArgOperand(I + 2, UndefValue::get(Src->getType()));
+          Changed = true;
+        }
+      }
+    }
+
+    if (Changed)
+      return II;
+
+    break;
+
+  }
+  case Intrinsic::amdgcn_fmed3: {
+    // Note this does not preserve proper sNaN behavior if IEEE-mode is enabled
+    // for the shader.
+    Value *Src0 = II->getArgOperand(0);
+    Value *Src1 = II->getArgOperand(1);
+    Value *Src2 = II->getArgOperand(2);
+
+    // Canonicalize constants to RHS operands:
+    //   fmed3(c0, x, c1) -> fmed3(x, c0, c1)
+    bool Swap = false;
+    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
+      std::swap(Src0, Src1);
+      Swap = true;
+    }
+    if (isa<Constant>(Src1) && !isa<Constant>(Src2)) {
+      std::swap(Src1, Src2);
+      Swap = true;
+    }
+    if (isa<Constant>(Src0) && !isa<Constant>(Src1)) {
+      std::swap(Src0, Src1);
+      Swap = true;
+    }
+    if (Swap) {
+      II->setArgOperand(0, Src0);
+      II->setArgOperand(1, Src1);
+      II->setArgOperand(2, Src2);
+      return II;
+    }
+
+    // A NaN or undef operand in any position folds to minnum of the other two;
+    // otherwise e.g. fmed3(NaN, c1, c2) would trip the asserts in fmed3AMDGCN.
+    if (match(Src0, m_NaN()) || isa<UndefValue>(Src0))
+      std::swap(Src0, Src2);
+    else if (match(Src1, m_NaN()) || isa<UndefValue>(Src1))
+      std::swap(Src1, Src2);
+    if (match(Src2, m_NaN()) || isa<UndefValue>(Src2)) {
+      CallInst *NewCall = Builder->CreateMinNum(Src0, Src1);
+      NewCall->copyFastMathFlags(II);
+      NewCall->takeName(II);
+      return replaceInstUsesWith(*II, NewCall);
+    }
+
+    const auto *C0 = dyn_cast<ConstantFP>(Src0);
+    const auto *C1 = dyn_cast<ConstantFP>(Src1);
+    const auto *C2 = dyn_cast<ConstantFP>(Src2);
+    if (C0 && C1 && C2) {
+      APFloat Result = fmed3AMDGCN(C0->getValueAPF(), C1->getValueAPF(),
+                                   C2->getValueAPF());
+      return replaceInstUsesWith(*II,
+        ConstantFP::get(Builder->getContext(), Result));
+    }
+
+    break;
+  }
+  case Intrinsic::amdgcn_icmp:
+  case Intrinsic::amdgcn_fcmp: {
+    const ConstantInt *CC = dyn_cast<ConstantInt>(II->getArgOperand(2));
+    if (!CC)
+      break;
+
+    // Guard against invalid arguments.
+    int64_t CCVal = CC->getZExtValue();
+    bool IsInteger = II->getIntrinsicID() == Intrinsic::amdgcn_icmp;
+    if ((IsInteger && (CCVal < CmpInst::FIRST_ICMP_PREDICATE ||
+                       CCVal > CmpInst::LAST_ICMP_PREDICATE)) ||
+        (!IsInteger && (CCVal < CmpInst::FIRST_FCMP_PREDICATE ||
+                        CCVal > CmpInst::LAST_FCMP_PREDICATE)))
+      break;
+
+    Value *Src0 = II->getArgOperand(0);
+    Value *Src1 = II->getArgOperand(1);
+
+    if (auto *CSrc0 = dyn_cast<Constant>(Src0)) {
+      if (auto *CSrc1 = dyn_cast<Constant>(Src1)) {
+        Constant *CCmp = ConstantExpr::getCompare(CCVal, CSrc0, CSrc1);
+        return replaceInstUsesWith(*II,
+                                   ConstantExpr::getSExt(CCmp, II->getType()));
+      }
+
+      // Canonicalize constants to RHS.
+      CmpInst::Predicate SwapPred
+        = CmpInst::getSwappedPredicate(static_cast<CmpInst::Predicate>(CCVal));
+      II->setArgOperand(0, Src1);
+      II->setArgOperand(1, Src0);
+      II->setArgOperand(2, ConstantInt::get(CC->getType(),
+                                            static_cast<int>(SwapPred)));
+      return II;
+    }
+
+    if (CCVal != CmpInst::ICMP_EQ && CCVal != CmpInst::ICMP_NE)
+      break;
+
+    // Canonicalize compare eq with true value to compare != 0
+    // llvm.amdgcn.icmp(zext (i1 x), 1, eq)
+    //   -> llvm.amdgcn.icmp(zext (i1 x), 0, ne)
+    // llvm.amdgcn.icmp(sext (i1 x), -1, eq)
+    //   -> llvm.amdgcn.icmp(sext (i1 x), 0, ne)
+    Value *ExtSrc;
+    if (CCVal == CmpInst::ICMP_EQ &&
+        ((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) ||
+         (match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) &&
+        ExtSrc->getType()->isIntegerTy(1)) {
+      II->setArgOperand(1, ConstantInt::getNullValue(Src1->getType()));
+      II->setArgOperand(2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
+      return II;
+    }
+
+    CmpInst::Predicate SrcPred;
+    Value *SrcLHS;
+    Value *SrcRHS;
+
+    // Fold compare eq/ne with 0 from a compare result as the predicate to the
+    // intrinsic. The typical use is a wave vote function in the library, which
+    // will be fed from a user code condition compared with 0. Fold in the
+    // redundant compare.
+
+    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, ne)
+    //   -> llvm.amdgcn.[if]cmp(a, b, pred)
+    //
+    // llvm.amdgcn.icmp([sz]ext ([if]cmp pred a, b), 0, eq)
+    //   -> llvm.amdgcn.[if]cmp(a, b, inv pred)
+    if (match(Src1, m_Zero()) &&
+        match(Src0,
+              m_ZExtOrSExt(m_Cmp(SrcPred, m_Value(SrcLHS), m_Value(SrcRHS))))) {
+      if (CCVal == CmpInst::ICMP_EQ)
+        SrcPred = CmpInst::getInversePredicate(SrcPred);
+
+      Intrinsic::ID NewIID = CmpInst::isFPPredicate(SrcPred) ?
+        Intrinsic::amdgcn_fcmp : Intrinsic::amdgcn_icmp;
+
+      Value *NewF = Intrinsic::getDeclaration(II->getModule(), NewIID,
+                                              SrcLHS->getType());
+      Value *Args[] = { SrcLHS, SrcRHS,
+                        ConstantInt::get(CC->getType(), SrcPred) };
+      CallInst *NewCall = Builder->CreateCall(NewF, Args);
+      NewCall->takeName(II);
+      return replaceInstUsesWith(*II, NewCall);
+    }
+
+    break;
+  }
   case Intrinsic::stackrestore: {
     // If the save is right next to the restore, remove the restore.  This can
     // happen when variable allocas are DCE'd.
@@ -3244,7 +3606,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
 
       // isKnownNonNull -> nonnull attribute
       if (isKnownNonNullAt(DerivedPtr, II, &DT))
-        II->addAttribute(AttributeSet::ReturnIndex, Attribute::NonNull);
+        II->addAttribute(AttributeList::ReturnIndex, Attribute::NonNull);
     }
 
     // TODO: bitcast(relocate(p)) -> relocate(bitcast(p))
@@ -3255,19 +3617,36 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
   }
 
   case Intrinsic::experimental_guard: {
-    Value *IIOperand = II->getArgOperand(0);
-
-    // Remove a guard if it is immediately followed by an identical guard.
-    if (match(II->getNextNode(),
-              m_Intrinsic<Intrinsic::experimental_guard>(m_Specific(IIOperand))))
-      return eraseInstFromFunction(*II);
+    // Is this guard followed by another guard?
+    Instruction *NextInst = II->getNextNode();
+    Value *NextCond = nullptr;
+    if (match(NextInst,
+              m_Intrinsic<Intrinsic::experimental_guard>(m_Value(NextCond)))) {
+      Value *CurrCond = II->getArgOperand(0);
+
+      // Remove a guard that is immediately preceded by an identical guard.
+      if (CurrCond == NextCond)
+        return eraseInstFromFunction(*NextInst);
+
+      // Otherwise canonicalize guard(a); guard(b) -> guard(a & b).
+      II->setArgOperand(0, Builder->CreateAnd(CurrCond, NextCond));
+      return eraseInstFromFunction(*NextInst);
+    }
     break;
   }
   }
-
   return visitCallSite(II);
 }
 
+// Fence instruction simplification: a fence that is immediately followed by an
+// identical fence is redundant, so the first one of the pair is erased.
+Instruction *InstCombiner::visitFenceInst(FenceInst &FI) {
+  auto *Following = dyn_cast<FenceInst>(FI.getNextNode());
+  if (!Following || !FI.isIdenticalTo(Following))
+    return nullptr;
+  return eraseInstFromFunction(FI);
+}
+
 // InvokeInst simplification
 //
 Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
@@ -3423,7 +3802,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) {
   assert(ArgNo == CS.arg_size() && "sanity check");
 
   if (!Indices.empty()) {
-    AttributeSet AS = CS.getAttributes();
+    AttributeList AS = CS.getAttributes();
     LLVMContext &Ctx = CS.getInstruction()->getContext();
     AS = AS.addAttribute(Ctx, Indices,
                          Attribute::get(Ctx, Attribute::NonNull));
@@ -3545,7 +3924,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
     return false;
 
   Instruction *Caller = CS.getInstruction();
-  const AttributeSet &CallerPAL = CS.getAttributes();
+  const AttributeList &CallerPAL = CS.getAttributes();
 
   // Okay, this is a cast from a function to a different type.  Unless doing so
   // would cause a type conversion of one of our arguments, change this call to
@@ -3572,7 +3951,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
     }
 
     if (!CallerPAL.isEmpty() && !Caller->use_empty()) {
-      AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex);
+      AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
       if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy)))
         return false;   // Attribute not compatible with transformed value.
     }
@@ -3613,7 +3992,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
     if (!CastInst::isBitOrNoopPointerCastable(ActTy, ParamTy, DL))
       return false;   // Cannot transform this parameter value.
 
-    if (AttrBuilder(CallerPAL.getParamAttributes(i + 1), i + 1).
+    if (AttrBuilder(CallerPAL.getParamAttributes(i + 1)).
           overlaps(AttributeFuncs::typeIncompatible(ParamTy)))
       return false;   // Attribute not compatible with transformed value.
 
@@ -3622,9 +4001,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
 
     // If the parameter is passed as a byval argument, then we have to have a
     // sized type and the sized type has to have the same size as the old type.
-    if (ParamTy != ActTy &&
-        CallerPAL.getParamAttributes(i + 1).hasAttribute(i + 1,
-                                                         Attribute::ByVal)) {
+    if (ParamTy != ActTy && CallerPAL.hasAttribute(i + 1, Attribute::ByVal)) {
       PointerType *ParamPTy = dyn_cast<PointerType>(ParamTy);
       if (!ParamPTy || !ParamPTy->getElementType()->isSized())
         return false;
@@ -3669,7 +4046,7 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
         break;
 
       // Check if it has an attribute that's incompatible with varargs.
-      AttributeSet PAttrs = CallerPAL.getSlotAttributes(i - 1);
+      AttributeList PAttrs = CallerPAL.getSlotAttributes(i - 1);
       if (PAttrs.hasAttribute(Index, Attribute::StructRet))
         return false;
     }
@@ -3679,11 +4056,11 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
   // inserting cast instructions as necessary.
   std::vector<Value*> Args;
   Args.reserve(NumActualArgs);
-  SmallVector<AttributeSet, 8> attrVec;
+  SmallVector<AttributeList, 8> attrVec;
   attrVec.reserve(NumCommonArgs);
 
   // Get any return attributes.
-  AttrBuilder RAttrs(CallerPAL, AttributeSet::ReturnIndex);
+  AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
 
   // If the return value is not being used, the type may not be compatible
   // with the existing attributes.  Wipe out any problematic attributes.
@@ -3691,8 +4068,8 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
 
   // Add the new return attributes.
   if (RAttrs.hasAttributes())
-    attrVec.push_back(AttributeSet::get(Caller->getContext(),
-                                        AttributeSet::ReturnIndex, RAttrs));
+    attrVec.push_back(AttributeList::get(Caller->getContext(),
+                                         AttributeList::ReturnIndex, RAttrs));
 
   AI = CS.arg_begin();
   for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
@@ -3705,10 +4082,10 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
     }
 
     // Add any parameter attributes.
-    AttrBuilder PAttrs(CallerPAL.getParamAttributes(i + 1), i + 1);
+    AttrBuilder PAttrs(CallerPAL.getParamAttributes(i + 1));
     if (PAttrs.hasAttributes())
-      attrVec.push_back(AttributeSet::get(Caller->getContext(), i + 1,
-                                          PAttrs));
+      attrVec.push_back(
+          AttributeList::get(Caller->getContext(), i + 1, PAttrs));
   }
 
   // If the function takes more arguments than the call was taking, add them
@@ -3733,23 +4110,25 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
         }
 
         // Add any parameter attributes.
-        AttrBuilder PAttrs(CallerPAL.getParamAttributes(i + 1), i + 1);
+        AttrBuilder PAttrs(CallerPAL.getParamAttributes(i + 1));
         if (PAttrs.hasAttributes())
-          attrVec.push_back(AttributeSet::get(FT->getContext(), i + 1,
-                                              PAttrs));
+          attrVec.push_back(
+              AttributeList::get(FT->getContext(), i + 1, PAttrs));
       }
     }
   }
 
   AttributeSet FnAttrs = CallerPAL.getFnAttributes();
-  if (CallerPAL.hasAttributes(AttributeSet::FunctionIndex))
-    attrVec.push_back(AttributeSet::get(Callee->getContext(), FnAttrs));
+  if (CallerPAL.hasAttributes(AttributeList::FunctionIndex))
+    attrVec.push_back(AttributeList::get(Callee->getContext(),
+                                         AttributeList::FunctionIndex,
+                                         AttrBuilder(FnAttrs)));
 
   if (NewRetTy->isVoidTy())
     Caller->setName("");   // Void type should not have a name.
 
-  const AttributeSet &NewCallerPAL = AttributeSet::get(Callee->getContext(),
-                                                       attrVec);
+  const AttributeList &NewCallerPAL =
+      AttributeList::get(Callee->getContext(), attrVec);
 
   SmallVector<OperandBundleDef, 1> OpBundles;
   CS.getOperandBundlesAsDefs(OpBundles);
@@ -3765,6 +4144,12 @@ bool InstCombiner::transformConstExprCastCall(CallSite CS) {
     CallInst *CI = cast<CallInst>(Caller);
     NC = Builder->CreateCall(Callee, Args, OpBundles);
     NC->takeName(CI);
+    // Preserve the weight metadata for the new call instruction. The metadata
+    // is used by SamplePGO to check callsite's hotness.
+    uint64_t W;
+    if (CI->extractProfTotalWeight(W))
+      NC->setProfWeight(W);
+
     cast<CallInst>(NC)->setTailCallKind(CI->getTailCallKind());
     cast<CallInst>(NC)->setCallingConv(CI->getCallingConv());
     cast<CallInst>(NC)->setAttributes(NewCallerPAL);
@@ -3815,7 +4200,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
   Value *Callee = CS.getCalledValue();
   PointerType *PTy = cast<PointerType>(Callee->getType());
   FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
-  const AttributeSet &Attrs = CS.getAttributes();
+  AttributeList Attrs = CS.getAttributes();
 
   // If the call already has the 'nest' attribute somewhere then give up -
   // otherwise 'nest' would occur twice after splicing in the chain.
@@ -3828,7 +4213,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
   Function *NestF =cast<Function>(Tramp->getArgOperand(1)->stripPointerCasts());
   FunctionType *NestFTy = cast<FunctionType>(NestF->getValueType());
 
-  const AttributeSet &NestAttrs = NestF->getAttributes();
+  AttributeList NestAttrs = NestF->getAttributes();
   if (!NestAttrs.isEmpty()) {
     unsigned NestIdx = 1;
     Type *NestTy = nullptr;
@@ -3847,18 +4232,15 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
     if (NestTy) {
       Instruction *Caller = CS.getInstruction();
       std::vector<Value*> NewArgs;
+      std::vector<AttributeSet> NewAttrs;
       NewArgs.reserve(CS.arg_size() + 1);
-
-      SmallVector<AttributeSet, 8> NewAttrs;
-      NewAttrs.reserve(Attrs.getNumSlots() + 1);
+      NewAttrs.reserve(CS.arg_size() + 2);
 
       // Insert the nest argument into the call argument list, which may
       // mean appending it.  Likewise for attributes.
 
       // Add any result attributes.
-      if (Attrs.hasAttributes(AttributeSet::ReturnIndex))
-        NewAttrs.push_back(AttributeSet::get(Caller->getContext(),
-                                             Attrs.getRetAttributes()));
+      NewAttrs.push_back(Attrs.getRetAttributes());
 
       {
         unsigned Idx = 1;
@@ -3870,8 +4252,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
             if (NestVal->getType() != NestTy)
               NestVal = Builder->CreateBitCast(NestVal, NestTy, "nest");
             NewArgs.push_back(NestVal);
-            NewAttrs.push_back(AttributeSet::get(Caller->getContext(),
-                                                 NestAttr));
+            NewAttrs.push_back(NestAttr);
           }
 
           if (I == E)
@@ -3879,12 +4260,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
 
           // Add the original argument and attributes.
           NewArgs.push_back(*I);
-          AttributeSet Attr = Attrs.getParamAttributes(Idx);
-          if (Attr.hasAttributes(Idx)) {
-            AttrBuilder B(Attr, Idx);
-            NewAttrs.push_back(AttributeSet::get(Caller->getContext(),
-                                                 Idx + (Idx >= NestIdx), B));
-          }
+          NewAttrs.push_back(Attrs.getParamAttributes(Idx));
 
           ++Idx;
           ++I;
@@ -3892,9 +4268,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
       }
 
       // Add any function attributes.
-      if (Attrs.hasAttributes(AttributeSet::FunctionIndex))
-        NewAttrs.push_back(AttributeSet::get(FTy->getContext(),
-                                             Attrs.getFnAttributes()));
+      NewAttrs.push_back(Attrs.getFnAttributes());
 
       // The trampoline may have been bitcast to a bogus type (FTy).
       // Handle this by synthesizing a new function type, equal to FTy
@@ -3934,8 +4308,7 @@ InstCombiner::transformCallThroughTrampoline(CallSite CS,
         NestF->getType() == PointerType::getUnqual(NewFTy) ?
         NestF : ConstantExpr::getBitCast(NestF,
                                          PointerType::getUnqual(NewFTy));
-      const AttributeSet &NewPAL =
-          AttributeSet::get(FTy->getContext(), NewAttrs);
+      AttributeList NewPAL = AttributeList::get(FTy->getContext(), NewAttrs);
 
       SmallVector<OperandBundleDef, 1> OpBundles;
       CS.getOperandBundlesAsDefs(OpBundles);
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 178920678d83dc2013bd2d4fd2a3434aaf488270..e08c301ccdd0c4dfbe91724fb1cd68e5dba22a04 100644
--- a/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -278,7 +278,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
     // Don't do this if it would create a PHI node with an illegal type from a
     // legal type.
     if (!Src->getType()->isIntegerTy() || !CI.getType()->isIntegerTy() ||
-        ShouldChangeType(CI.getType(), Src->getType()))
+        shouldChangeType(CI.getType(), Src->getType()))
       if (Instruction *NV = FoldOpIntoPhi(CI))
         return NV;
   }
@@ -447,7 +447,7 @@ static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC,
 Instruction *InstCombiner::shrinkBitwiseLogic(TruncInst &Trunc) {
   Type *SrcTy = Trunc.getSrcTy();
   Type *DestTy = Trunc.getType();
-  if (isa<IntegerType>(SrcTy) && !ShouldChangeType(SrcTy, DestTy))
+  if (isa<IntegerType>(SrcTy) && !shouldChangeType(SrcTy, DestTy))
     return nullptr;
 
   BinaryOperator *LogicOp;
@@ -463,6 +463,56 @@ Instruction *InstCombiner::shrinkBitwiseLogic(TruncInst &Trunc) {
   return BinaryOperator::Create(LogicOp->getOpcode(), NarrowOp0, NarrowC);
 }
 
+/// Narrow a truncated splat shuffle by truncating the shuffle input instead.
+/// This could be generalized to any shuffle with a constant operand, but the
+/// limit avoids creating shuffle types that targets may not lower effectively.
+static Instruction *shrinkSplatShuffle(TruncInst &Trunc,
+                                       InstCombiner::BuilderTy &Builder) {
+  auto *Shuf = dyn_cast<ShuffleVectorInst>(Trunc.getOperand(0));
+  if (!Shuf || !Shuf->hasOneUse() || !isa<UndefValue>(Shuf->getOperand(1)) ||
+      !Shuf->getMask()->getSplatValue() ||
+      Shuf->getType() != Shuf->getOperand(0)->getType())
+    return nullptr;
+
+  // trunc (shuf X, Undef, SplatMask) --> shuf (trunc X), Undef, SplatMask
+  Type *NarrowTy = Trunc.getType();
+  Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), NarrowTy);
+  return new ShuffleVectorInst(NarrowOp, UndefValue::get(NarrowTy),
+                               Shuf->getMask());
+}
+
+/// Try to narrow the width of an insert element by casting the inserted scalar
+/// instead of the whole vector. This could be generalized for any vector
+/// constant, but the transform is limited to insertion into undef to avoid
+/// potential backend problems from unsupported insertion widths. It could also
+/// be extended to handle inserting a scalar constant into a vector variable.
+static Instruction *shrinkInsertElt(CastInst &Trunc,
+                                    InstCombiner::BuilderTy &Builder) {
+  Instruction::CastOps Opcode = Trunc.getOpcode();
+  assert((Opcode == Instruction::Trunc || Opcode == Instruction::FPTrunc) &&
+         "Unexpected instruction for shrinking");
+
+  // Only a single-use insertelement operand is a candidate.
+  auto *InsElt = dyn_cast<InsertElementInst>(Trunc.getOperand(0));
+  if (!InsElt || !InsElt->hasOneUse())
+    return nullptr;
+
+  // The fold is restricted to insertion into an undef vector.
+  if (!isa<UndefValue>(InsElt->getOperand(0)))
+    return nullptr;
+
+  // trunc   (inselt undef, X, Index) --> inselt undef,   (trunc X), Index
+  // fptrunc (inselt undef, X, Index) --> inselt undef, (fptrunc X), Index
+  Type *NarrowTy = Trunc.getType();
+  Value *Scalar = InsElt->getOperand(1);
+  Value *Index = InsElt->getOperand(2);
+
+  Value *NarrowOp =
+      Builder.CreateCast(Opcode, Scalar, NarrowTy->getScalarType());
+  return InsertElementInst::Create(UndefValue::get(NarrowTy), NarrowOp,
+                                   Index);
+}
+
 Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
   if (Instruction *Result = commonCastTransforms(CI))
     return Result;
@@ -488,7 +538,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
   // type.   Only do this if the dest type is a simple type, don't convert the
   // expression tree to something weird like i93 unless the source is also
   // strange.
-  if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
+  if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
       canEvaluateTruncated(Src, DestTy, *this, &CI)) {
 
     // If this cast is a truncate, evaluting in a different type always
@@ -554,8 +604,14 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
   if (Instruction *I = shrinkBitwiseLogic(CI))
     return I;
 
+  if (Instruction *I = shrinkSplatShuffle(CI, *Builder))
+    return I;
+
+  if (Instruction *I = shrinkInsertElt(CI, *Builder))
+    return I;
+
   if (Src->hasOneUse() && isa<IntegerType>(SrcTy) &&
-      ShouldChangeType(SrcTy, DestTy)) {
+      shouldChangeType(SrcTy, DestTy)) {
     // Transform "trunc (shl X, cst)" -> "shl (trunc X), cst" so long as the
     // dest type is native and cst < dest size.
     if (match(Src, m_Shl(m_Value(A), m_ConstantInt(Cst))) &&
@@ -838,11 +894,6 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
   if (Instruction *Result = commonCastTransforms(CI))
     return Result;
 
-  // See if we can simplify any instructions used by the input whose sole
-  // purpose is to compute bits we don't care about.
-  if (SimplifyDemandedInstructionBits(CI))
-    return &CI;
-
   Value *Src = CI.getOperand(0);
   Type *SrcTy = Src->getType(), *DestTy = CI.getType();
 
@@ -851,10 +902,10 @@ Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
   // expression tree to something weird like i93 unless the source is also
   // strange.
   unsigned BitsToClear;
-  if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
+  if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
       canEvaluateZExtd(Src, DestTy, BitsToClear, *this, &CI)) {
-    assert(BitsToClear < SrcTy->getScalarSizeInBits() &&
-           "Unreasonable BitsToClear");
+    assert(BitsToClear <= SrcTy->getScalarSizeInBits() &&
+           "Can't clear more bits than in SrcTy");
 
     // Okay, we can transform this!  Insert the new expression now.
     DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
@@ -1124,11 +1175,6 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
   if (Instruction *I = commonCastTransforms(CI))
     return I;
 
-  // See if we can simplify any instructions used by the input whose sole
-  // purpose is to compute bits we don't care about.
-  if (SimplifyDemandedInstructionBits(CI))
-    return &CI;
-
   Value *Src = CI.getOperand(0);
   Type *SrcTy = Src->getType(), *DestTy = CI.getType();
 
@@ -1145,7 +1191,7 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
   // type.   Only do this if the dest type is a simple type, don't convert the
   // expression tree to something weird like i93 unless the source is also
   // strange.
-  if ((DestTy->isVectorTy() || ShouldChangeType(SrcTy, DestTy)) &&
+  if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
       canEvaluateSExtd(Src, DestTy)) {
     // Okay, we can transform this!  Insert the new expression now.
     DEBUG(dbgs() << "ICE: EvaluateInDifferentType converting expression type"
@@ -1167,18 +1213,16 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
                                       ShAmt);
   }
 
-  // If this input is a trunc from our destination, then turn sext(trunc(x))
+  // If the input is a trunc from the destination type, then turn sext(trunc(x))
   // into shifts.
-  if (TruncInst *TI = dyn_cast<TruncInst>(Src))
-    if (TI->hasOneUse() && TI->getOperand(0)->getType() == DestTy) {
-      uint32_t SrcBitSize = SrcTy->getScalarSizeInBits();
-      uint32_t DestBitSize = DestTy->getScalarSizeInBits();
-
-      // We need to emit a shl + ashr to do the sign extend.
-      Value *ShAmt = ConstantInt::get(DestTy, DestBitSize-SrcBitSize);
-      Value *Res = Builder->CreateShl(TI->getOperand(0), ShAmt, "sext");
-      return BinaryOperator::CreateAShr(Res, ShAmt);
-    }
+  Value *X;
+  if (match(Src, m_OneUse(m_Trunc(m_Value(X)))) && X->getType() == DestTy) {
+    // sext(trunc(X)) --> ashr(shl(X, C), C)
+    unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
+    unsigned DestBitSize = DestTy->getScalarSizeInBits();
+    Constant *ShAmt = ConstantInt::get(DestTy, DestBitSize - SrcBitSize);
+    return BinaryOperator::CreateAShr(Builder->CreateShl(X, ShAmt), ShAmt);
+  }
 
   if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src))
     return transformSExtICmp(ICI, CI);
@@ -1225,17 +1269,15 @@ static Constant *fitsInFPType(ConstantFP *CFP, const fltSemantics &Sem) {
   return nullptr;
 }
 
-/// If this is a floating-point extension instruction, look
-/// through it until we get the source value.
+/// Look through floating-point extensions until we get the source value.
 static Value *lookThroughFPExtensions(Value *V) {
-  if (Instruction *I = dyn_cast<Instruction>(V))
-    if (I->getOpcode() == Instruction::FPExt)
-      return lookThroughFPExtensions(I->getOperand(0));
+  while (auto *FPExt = dyn_cast<FPExtInst>(V))
+    V = FPExt->getOperand(0);
 
   // If this value is a constant, return the constant in the smallest FP type
   // that can accurately represent it.  This allows us to turn
   // (float)((double)X+2.0) into x+2.0f.
-  if (ConstantFP *CFP = dyn_cast<ConstantFP>(V)) {
+  if (auto *CFP = dyn_cast<ConstantFP>(V)) {
     if (CFP->getType() == Type::getPPC_FP128Ty(V->getContext()))
       return V;  // No constant folding of this.
     // See if the value can be truncated to half and then reextended.
@@ -1400,10 +1442,22 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
     case Intrinsic::round:
     case Intrinsic::nearbyint:
     case Intrinsic::trunc: {
+      Value *Src = II->getArgOperand(0);
+      if (!Src->hasOneUse())
+        break;
+
+      // Except for fabs, this transformation requires the input of the unary FP
+      // operation to be itself an fpext from the type to which we're
+      // truncating.
+      if (II->getIntrinsicID() != Intrinsic::fabs) {
+        FPExtInst *FPExtSrc = dyn_cast<FPExtInst>(Src);
+        if (!FPExtSrc || FPExtSrc->getOperand(0)->getType() != CI.getType())
+          break;
+      }
+
       // Do unary FP operation on smaller type.
       // (fptrunc (fabs x)) -> (fabs (fptrunc x))
-      Value *InnerTrunc = Builder->CreateFPTrunc(II->getArgOperand(0),
-                                                 CI.getType());
+      Value *InnerTrunc = Builder->CreateFPTrunc(Src, CI.getType());
       Type *IntrinsicType[] = { CI.getType() };
       Function *Overload = Intrinsic::getDeclaration(
         CI.getModule(), II->getIntrinsicID(), IntrinsicType);
@@ -1420,6 +1474,9 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &CI) {
     }
   }
 
+  if (Instruction *I = shrinkInsertElt(CI, *Builder))
+    return I;
+
   return nullptr;
 }
 
diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp
index 3a1be81b7f58bf0cc8c7f57cc860ecfb741ae449..2419d4f32885d6b59526e77c551a2694dfd78501 100644
--- a/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -230,7 +230,9 @@ Instruction *InstCombiner::foldCmpLoadFromIndexedGlobal(GetElementPtrInst *GEP,
     return nullptr;
 
   uint64_t ArrayElementCount = Init->getType()->getArrayNumElements();
-  if (ArrayElementCount > 1024) return nullptr; // Don't blow up on huge arrays.
+  // Don't blow up on huge arrays.
+  if (ArrayElementCount > MaxArraySizeForCombine)
+    return nullptr;
 
   // There are many forms of this optimization we can handle, for now, just do
   // the simple index into a single-dimensional array.
@@ -884,6 +886,10 @@ static Instruction *transformToIndexedCompare(GEPOperator *GEPLHS, Value *RHS,
   if (!GEPLHS->hasAllConstantIndices())
     return nullptr;
 
+  // Make sure the pointers have the same type.
+  if (GEPLHS->getType() != RHS->getType())
+    return nullptr;
+
   Value *PtrBase, *Index;
   std::tie(PtrBase, Index) = getAsConstantIndexedAddress(GEPLHS, DL);
 
@@ -1659,7 +1665,7 @@ Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp,
       (Cmp.isEquality() || (!C1->isNegative() && !C2->isNegative()))) {
     // TODO: Is this a good transform for vectors? Wider types may reduce
     // throughput. Should this transform be limited (even for scalars) by using
-    // ShouldChangeType()?
+    // shouldChangeType()?
     if (!Cmp.getType()->isVectorTy()) {
       Type *WideType = W->getType();
       unsigned WideScalarBits = WideType->getScalarSizeInBits();
@@ -1788,6 +1794,15 @@ Instruction *InstCombiner::foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or,
                           ConstantInt::get(V->getType(), 1));
   }
 
+  // X | C == C --> X <=u C
+  // X | C != C --> X  >u C
+  //   iff C+1 is a power of 2 (C is a bitmask of the low bits)
+  if (Cmp.isEquality() && Cmp.getOperand(1) == Or->getOperand(1) &&
+      (*C + 1).isPowerOf2()) {
+    Pred = (Pred == CmpInst::ICMP_EQ) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT;
+    return new ICmpInst(Pred, Or->getOperand(0), Or->getOperand(1));
+  }
+
   if (!Cmp.isEquality() || *C != 0 || !Or->hasOneUse())
     return nullptr;
 
@@ -2366,8 +2381,24 @@ Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp,
   // Fold icmp pred (add X, C2), C.
   Value *X = Add->getOperand(0);
   Type *Ty = Add->getType();
-  auto CR =
-      ConstantRange::makeExactICmpRegion(Cmp.getPredicate(), *C).subtract(*C2);
+  CmpInst::Predicate Pred = Cmp.getPredicate();
+
+  // If the add does not wrap, we can always adjust the compare by subtracting
+  // the constants. Equality comparisons are handled elsewhere. SGE/SLE are
+  // canonicalized to SGT/SLT.
+  if (Add->hasNoSignedWrap() &&
+      (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLT)) {
+    bool Overflow;
+    APInt NewC = C->ssub_ov(*C2, Overflow);
+    // If there is overflow, the result must be true or false.
+    // TODO: Can we assert there is no overflow because InstSimplify always
+    // handles those cases?
+    if (!Overflow)
+      // icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2)
+      return new ICmpInst(Pred, X, ConstantInt::get(Ty, NewC));
+  }
+
+  auto CR = ConstantRange::makeExactICmpRegion(Pred, *C).subtract(*C2);
   const APInt &Upper = CR.getUpper();
   const APInt &Lower = CR.getLower();
   if (Cmp.isSigned()) {
@@ -2388,16 +2419,14 @@ Instruction *InstCombiner::foldICmpAddConstant(ICmpInst &Cmp,
   // X+C <u C2 -> (X & -C2) == C
   //   iff C & (C2-1) == 0
   //       C2 is a power of 2
-  if (Cmp.getPredicate() == ICmpInst::ICMP_ULT && C->isPowerOf2() &&
-      (*C2 & (*C - 1)) == 0)
+  if (Pred == ICmpInst::ICMP_ULT && C->isPowerOf2() && (*C2 & (*C - 1)) == 0)
     return new ICmpInst(ICmpInst::ICMP_EQ, Builder->CreateAnd(X, -(*C)),
                         ConstantExpr::getNeg(cast<Constant>(Y)));
 
   // X+C >u C2 -> (X & ~C2) != C
   //   iff C & C2 == 0
   //       C2+1 is a power of 2
-  if (Cmp.getPredicate() == ICmpInst::ICMP_UGT && (*C + 1).isPowerOf2() &&
-      (*C2 & *C) == 0)
+  if (Pred == ICmpInst::ICMP_UGT && (*C + 1).isPowerOf2() && (*C2 & *C) == 0)
     return new ICmpInst(ICmpInst::ICMP_NE, Builder->CreateAnd(X, ~(*C)),
                         ConstantExpr::getNeg(cast<Constant>(Y)));
 
@@ -2791,12 +2820,6 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) {
     D = BO1->getOperand(1);
   }
 
-  // icmp (X+cst) < 0 --> X < -cst
-  if (NoOp0WrapProblem && ICmpInst::isSigned(Pred) && match(Op1, m_Zero()))
-    if (ConstantInt *RHSC = dyn_cast_or_null<ConstantInt>(B))
-      if (!RHSC->isMinValue(/*isSigned=*/true))
-        return new ICmpInst(Pred, A, ConstantExpr::getNeg(RHSC));
-
   // icmp (X+Y), X -> icmp Y, 0 for equalities or if there is no overflow.
   if ((A == Op1 || B == Op1) && NoOp0WrapProblem)
     return new ICmpInst(Pred, A == Op1 ? B : A,
@@ -3936,7 +3959,7 @@ bool InstCombiner::replacedSelectWithOperand(SelectInst *SI,
   assert((SIOpd == 1 || SIOpd == 2) && "Invalid select operand!");
   if (isChainSelectCmpBranch(SI) && Icmp->getPredicate() == ICmpInst::ICMP_EQ) {
     BasicBlock *Succ = SI->getParent()->getTerminator()->getSuccessor(1);
-    // The check for the unique predecessor is not the best that can be
+    // The check for the single predecessor is not the best that can be
     // done. But it protects efficiently against cases like when SI's
     // home block has two successors, Succ and Succ1, and Succ1 predecessor
     // of Succ. Then SI can't be replaced by SIOpd because the use that gets
@@ -3944,8 +3967,10 @@ bool InstCombiner::replacedSelectWithOperand(SelectInst *SI,
     // guarantees that the path all uses of SI (outside SI's parent) are on
     // is disjoint from all other paths out of SI. But that information
     // is more expensive to compute, and the trade-off here is in favor
-    // of compile-time.
-    if (Succ->getUniquePredecessor() && dominatesAllUses(SI, Icmp, Succ)) {
+    // of compile-time. Note also that we check for a single predecessor and
+    // not only a unique one. This is to handle the situation when Succ and
+    // Succ1 point to the same basic block.
+    if (Succ->getSinglePredecessor() && dominatesAllUses(SI, Icmp, Succ)) {
       NumSel++;
       SI->replaceUsesOutsideBlock(SI->getOperand(SIOpd), SI->getParent());
       return true;
@@ -3981,12 +4006,12 @@ Instruction *InstCombiner::foldICmpUsingKnownBits(ICmpInst &I) {
   APInt Op0KnownZero(BitWidth, 0), Op0KnownOne(BitWidth, 0);
   APInt Op1KnownZero(BitWidth, 0), Op1KnownOne(BitWidth, 0);
 
-  if (SimplifyDemandedBits(I.getOperandUse(0),
+  if (SimplifyDemandedBits(&I, 0,
                            getDemandedBitsLHSMask(I, BitWidth, IsSignBit),
                            Op0KnownZero, Op0KnownOne, 0))
     return &I;
 
-  if (SimplifyDemandedBits(I.getOperandUse(1), APInt::getAllOnesValue(BitWidth),
+  if (SimplifyDemandedBits(&I, 1, APInt::getAllOnesValue(BitWidth),
                            Op1KnownZero, Op1KnownOne, 0))
     return &I;
 
diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h
index 2847ce858e79181adfbf778064c94c5abcbf3fd1..ab15fd84b24a0688feef64b7f8b487d02f36d360 100644
--- a/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -28,6 +28,9 @@
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 #include "llvm/Transforms/InstCombine/InstCombineWorklist.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/Dwarf.h"
+#include "llvm/IR/DIBuilder.h"
 
 #define DEBUG_TYPE "instcombine"
 
@@ -40,21 +43,29 @@ class DbgDeclareInst;
 class MemIntrinsic;
 class MemSetInst;
 
-/// \brief Assign a complexity or rank value to LLVM Values.
+/// Assign a complexity or rank value to LLVM Values. This is used to reduce
+/// the amount of pattern matching needed for compares and commutative
+/// instructions. For example, if we have:
+///   icmp ugt X, Constant
+/// or
+///   xor (add X, Constant), cast Z
+///
+/// We do not have to consider the commuted variants of these patterns because
+/// canonicalization based on complexity guarantees the above ordering.
 ///
 /// This routine maps IR values to various complexity ranks:
 ///   0 -> undef
 ///   1 -> Constants
 ///   2 -> Other non-instructions
 ///   3 -> Arguments
-///   3 -> Unary operations
-///   4 -> Other instructions
+///   4 -> Cast and (f)neg/not instructions
+///   5 -> Other instructions
 static inline unsigned getComplexity(Value *V) {
   if (isa<Instruction>(V)) {
-    if (BinaryOperator::isNeg(V) || BinaryOperator::isFNeg(V) ||
-        BinaryOperator::isNot(V))
-      return 3;
-    return 4;
+    if (isa<CastInst>(V) || BinaryOperator::isNeg(V) ||
+        BinaryOperator::isFNeg(V) || BinaryOperator::isNot(V))
+      return 4;
+    return 5;
   }
   if (isa<Argument>(V))
     return 3;
@@ -289,6 +300,7 @@ public:
   Instruction *visitLoadInst(LoadInst &LI);
   Instruction *visitStoreInst(StoreInst &SI);
   Instruction *visitBranchInst(BranchInst &BI);
+  Instruction *visitFenceInst(FenceInst &FI);
   Instruction *visitSwitchInst(SwitchInst &SI);
   Instruction *visitReturnInst(ReturnInst &RI);
   Instruction *visitInsertValueInst(InsertValueInst &IV);
@@ -313,9 +325,14 @@ public:
   bool replacedSelectWithOperand(SelectInst *SI, const ICmpInst *Icmp,
                                  const unsigned SIOpd);
 
+  /// Try to replace instruction \p I with value \p V, where both are pointers
+  /// but in different address spaces.
+  /// \return true if successful.
+  bool replacePointer(Instruction &I, Value *V);
+
 private:
-  bool ShouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
-  bool ShouldChangeType(Type *From, Type *To) const;
+  bool shouldChangeType(unsigned FromBitWidth, unsigned ToBitWidth) const;
+  bool shouldChangeType(Type *From, Type *To) const;
   Value *dyn_castNegVal(Value *V) const;
   Value *dyn_castFNegVal(Value *V, bool NoSignedZero = false) const;
   Type *FindElementAtOffset(PointerType *PtrTy, int64_t Offset,
@@ -456,8 +473,9 @@ public:
   /// methods should return the value returned by this function.
   Instruction *eraseInstFromFunction(Instruction &I) {
     DEBUG(dbgs() << "IC: ERASE " << I << '\n');
-
     assert(I.use_empty() && "Cannot erase instruction that is used!");
+    salvageDebugInfo(I);
+
     // Make sure that we reprocess all operands now that we reduced their
     // use counts.
     if (I.getNumOperands() < 8) {
@@ -499,6 +517,9 @@ public:
     return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT);
   }
 
+  /// Maximum size of array considered when transforming.
+  uint64_t MaxArraySizeForCombine;
+
 private:
   /// \brief Performs a few simplifications for operators which are associative
   /// or commutative.
@@ -518,7 +539,8 @@ private:
   Value *SimplifyDemandedUseBits(Value *V, APInt DemandedMask, APInt &KnownZero,
                                  APInt &KnownOne, unsigned Depth,
                                  Instruction *CxtI);
-  bool SimplifyDemandedBits(Use &U, const APInt &DemandedMask, APInt &KnownZero,
+  bool SimplifyDemandedBits(Instruction *I, unsigned Op,
+                            const APInt &DemandedMask, APInt &KnownZero,
                             APInt &KnownOne, unsigned Depth = 0);
   /// Helper routine of SimplifyDemandedUseBits. It tries to simplify demanded
   /// bit for "r1 = shr x, c1; r2 = shl r1, c2" instruction sequence.
@@ -549,7 +571,7 @@ private:
   Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI);
 
   /// This is a convenience wrapper function for the above two functions.
-  Instruction *foldOpWithConstantIntoOperand(Instruction &I);
+  Instruction *foldOpWithConstantIntoOperand(BinaryOperator &I);
 
   /// \brief Try to rotate an operation below a PHI node, using PHI nodes for
   /// its operands.
@@ -628,16 +650,16 @@ private:
                             SelectPatternFlavor SPF2, Value *C);
   Instruction *foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI);
 
-  Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS,
+  Instruction *OptAndOp(BinaryOperator *Op, ConstantInt *OpRHS,
                         ConstantInt *AndRHS, BinaryOperator &TheAnd);
 
-  Value *FoldLogicalPlusAnd(Value *LHS, Value *RHS, ConstantInt *Mask,
-                            bool isSub, Instruction &I);
   Value *insertRangeTest(Value *V, const APInt &Lo, const APInt &Hi,
                          bool isSigned, bool Inside);
   Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
   Instruction *MatchBSwap(BinaryOperator &I);
   bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
+
+  Instruction *SimplifyElementAtomicMemCpy(ElementAtomicMemCpyInst *AMI);
   Instruction *SimplifyMemTransfer(MemIntrinsic *MI);
   Instruction *SimplifyMemSet(MemSetInst *MI);
 
diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index 5fe9e5c645a4143c6204c3980850e7fee07ffd1e..6288e054f1bc57c9f285f04ac89a416d9bb9a761 100644
--- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -12,13 +12,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "InstCombineInternal.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/Loads.h"
 #include "llvm/IR/ConstantRange.h"
 #include "llvm/IR/DataLayout.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/DebugInfo.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -223,6 +225,107 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) {
   return nullptr;
 }
 
+namespace {
+// If I and V are pointers in different address spaces, it is not allowed to
+// use replaceAllUsesWith since I and V have different types. A
+// non-target-specific transformation should not use addrspacecast on V since
+// the two address spaces may be disjoint depending on the target.
+//
+// This class chases down uses of the old pointer until reaching the load
+// instructions, then replaces the old pointer in the load instructions with
+// the new pointer. If during the chasing it sees a bitcast or GEP, it will
+// create a new bitcast or GEP with the new pointer and use them in the load
+// instruction.
+class PointerReplacer {
+public:
+  explicit PointerReplacer(InstCombiner &IC) : IC(IC) {}
+  void replacePointer(Instruction &I, Value *V);
+
+private:
+  void findLoadAndReplace(Instruction &I);
+  void replace(Instruction *I);
+  Value *getReplacement(Value *V);
+
+  SmallVector<Instruction *, 4> Path; // GEP/bitcast chain being walked.
+  MapVector<Value *, Value *> WorkMap; // Original value -> its replacement.
+  InstCombiner &IC;
+};
+} // end anonymous namespace
+
+void PointerReplacer::findLoadAndReplace(Instruction &I) {
+  for (User *U : I.users()) {
+    auto *UserInst = dyn_cast<Instruction>(U);
+    if (!UserInst)
+      return;
+    DEBUG(dbgs() << "Found pointer user: " << *U << '\n');
+    if (isa<GetElementPtrInst>(UserInst) || isa<BitCastInst>(UserInst)) {
+      Path.push_back(UserInst); // Extend the chain while visiting its users.
+      findLoadAndReplace(*UserInst);
+      Path.pop_back();
+    } else if (isa<LoadInst>(UserInst)) {
+      for (Instruction *Step : Path)
+        replace(Step);
+      replace(UserInst);
+    } else {
+      return; // Unsupported user: stop walking the users of I.
+    }
+  }
+}
+
+// Return the replacement recorded for V in WorkMap, or null if V has not been
+// replaced yet.
+Value *PointerReplacer::getReplacement(Value *V) {
+  // MapVector::lookup yields a default-constructed (null) Value* on a miss.
+  return WorkMap.lookup(V);
+}
+
+void PointerReplacer::replace(Instruction *I) {
+  if (getReplacement(I))
+    return;
+  // Clone I onto its replaced operand, keeping load alignment and GEP inbounds.
+  if (auto *LT = dyn_cast<LoadInst>(I)) {
+    auto *V = getReplacement(LT->getPointerOperand());
+    assert(V && "Operand not replaced");
+    auto *NewI = new LoadInst(V, "", /*isVolatile=*/false, LT->getAlignment());
+    NewI->takeName(LT);
+    IC.InsertNewInstWith(NewI, *LT);
+    IC.replaceInstUsesWith(*LT, NewI);
+    WorkMap[LT] = NewI;
+  } else if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+    auto *V = getReplacement(GEP->getPointerOperand());
+    assert(V && "Operand not replaced");
+    SmallVector<Value *, 8> Indices(GEP->idx_begin(), GEP->idx_end());
+    auto *NewI = GetElementPtrInst::Create(
+        V->getType()->getPointerElementType(), V, Indices);
+    NewI->setIsInBounds(GEP->isInBounds());
+    IC.InsertNewInstWith(NewI, *GEP);
+    NewI->takeName(GEP);
+    WorkMap[GEP] = NewI;
+  } else if (auto *BC = dyn_cast<BitCastInst>(I)) {
+    auto *V = getReplacement(BC->getOperand(0));
+    assert(V && "Operand not replaced");
+    auto *NewT = PointerType::get(BC->getType()->getPointerElementType(),
+                                  V->getType()->getPointerAddressSpace());
+    auto *NewI = new BitCastInst(V, NewT);
+    IC.InsertNewInstWith(NewI, *BC);
+    NewI->takeName(BC);
+    WorkMap[BC] = NewI;
+  } else {
+    llvm_unreachable("should never reach here");
+  }
+}
+
+void PointerReplacer::replacePointer(Instruction &I, Value *V) {
+#ifndef NDEBUG
+  auto *PT = cast<PointerType>(I.getType());
+  auto *NT = cast<PointerType>(V->getType());
+  assert(PT != NT && PT->getElementType() == NT->getElementType() &&
+         "Invalid usage"); // Pointer types differ but share the element type.
+#endif
+  WorkMap[&I] = V; // Seed the map with the root replacement.
+  findLoadAndReplace(I);
+}
+
 Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
   if (auto *I = simplifyAllocaArraySize(*this, AI))
     return I;
@@ -293,12 +396,22 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
         for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
           eraseInstFromFunction(*ToDelete[i]);
         Constant *TheSrc = cast<Constant>(Copy->getSource());
-        Constant *Cast
-          = ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, AI.getType());
-        Instruction *NewI = replaceInstUsesWith(AI, Cast);
-        eraseInstFromFunction(*Copy);
-        ++NumGlobalCopies;
-        return NewI;
+        auto *SrcTy = TheSrc->getType();
+        auto *DestTy = PointerType::get(AI.getType()->getPointerElementType(),
+                                        SrcTy->getPointerAddressSpace());
+        Constant *Cast =
+            ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, DestTy);
+        if (AI.getType()->getPointerAddressSpace() ==
+            SrcTy->getPointerAddressSpace()) {
+          Instruction *NewI = replaceInstUsesWith(AI, Cast);
+          eraseInstFromFunction(*Copy);
+          ++NumGlobalCopies;
+          return NewI;
+        } else {
+          PointerReplacer PtrReplacer(*this);
+          PtrReplacer.replacePointer(AI, Cast);
+          ++NumGlobalCopies;
+        }
       }
     }
   }
@@ -502,7 +615,8 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
       !DL.isNonIntegralPointerType(Ty)) {
     if (all_of(LI.users(), [&LI](User *U) {
           auto *SI = dyn_cast<StoreInst>(U);
-          return SI && SI->getPointerOperand() != &LI;
+          return SI && SI->getPointerOperand() != &LI &&
+                 !SI->getPointerOperand()->isSwiftError();
         })) {
       LoadInst *NewLoad = combineLoadToNewType(
           IC, LI,
@@ -607,7 +721,7 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
     // arrays of arbitrary size but this has a terrible impact on compile time.
     // The threshold here is chosen arbitrarily, maybe needs a little bit of
     // tuning.
-    if (NumElements > 1024)
+    if (NumElements > IC.MaxArraySizeForCombine)
       return nullptr;
 
     const DataLayout &DL = IC.getDataLayout();
@@ -1112,7 +1226,7 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
     // arrays of arbitrary size but this has a terrible impact on compile time.
     // The threshold here is chosen arbitrarily, maybe needs a little bit of
     // tuning.
-    if (NumElements > 1024)
+    if (NumElements > IC.MaxArraySizeForCombine)
       return false;
 
     const DataLayout &DL = IC.getDataLayout();
@@ -1424,7 +1538,9 @@ bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
                                    SI.getOrdering(),
                                    SI.getSynchScope());
   InsertNewInstBefore(NewSI, *BBI);
-  NewSI->setDebugLoc(OtherStore->getDebugLoc());
+  // The debug locations of the original instructions might differ; merge them.
+  NewSI->setDebugLoc(DILocation::getMergedLocation(SI.getDebugLoc(),
+                                                   OtherStore->getDebugLoc()));
 
   // If the two stores had AA tags, merge them.
   AAMDNodes AATags;
diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 45a19fb0f1f267051f5a0d7da042261ccbc69901..a238f3f05056689bf393a84c6284bcae3dd10f38 100644
--- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -298,39 +298,33 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
   // (X / Y) *  Y = X - (X % Y)
   // (X / Y) * -Y = (X % Y) - X
   {
-    Value *Op1C = Op1;
-    BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0);
-    if (!BO ||
-        (BO->getOpcode() != Instruction::UDiv &&
-         BO->getOpcode() != Instruction::SDiv)) {
-      Op1C = Op0;
-      BO = dyn_cast<BinaryOperator>(Op1);
+    Value *Y = Op1;
+    BinaryOperator *Div = dyn_cast<BinaryOperator>(Op0);
+    if (!Div || (Div->getOpcode() != Instruction::UDiv &&
+                 Div->getOpcode() != Instruction::SDiv)) {
+      Y = Op0;
+      Div = dyn_cast<BinaryOperator>(Op1);
     }
-    Value *Neg = dyn_castNegVal(Op1C);
-    if (BO && BO->hasOneUse() &&
-        (BO->getOperand(1) == Op1C || BO->getOperand(1) == Neg) &&
-        (BO->getOpcode() == Instruction::UDiv ||
-         BO->getOpcode() == Instruction::SDiv)) {
-      Value *Op0BO = BO->getOperand(0), *Op1BO = BO->getOperand(1);
+    Value *Neg = dyn_castNegVal(Y);
+    if (Div && Div->hasOneUse() &&
+        (Div->getOperand(1) == Y || Div->getOperand(1) == Neg) &&
+        (Div->getOpcode() == Instruction::UDiv ||
+         Div->getOpcode() == Instruction::SDiv)) {
+      Value *X = Div->getOperand(0), *DivOp1 = Div->getOperand(1);
 
       // If the division is exact, X % Y is zero, so we end up with X or -X.
-      if (PossiblyExactOperator *SDiv = dyn_cast<PossiblyExactOperator>(BO))
-        if (SDiv->isExact()) {
-          if (Op1BO == Op1C)
-            return replaceInstUsesWith(I, Op0BO);
-          return BinaryOperator::CreateNeg(Op0BO);
-        }
-
-      Value *Rem;
-      if (BO->getOpcode() == Instruction::UDiv)
-        Rem = Builder->CreateURem(Op0BO, Op1BO);
-      else
-        Rem = Builder->CreateSRem(Op0BO, Op1BO);
-      Rem->takeName(BO);
+      if (Div->isExact()) {
+        if (DivOp1 == Y)
+          return replaceInstUsesWith(I, X);
+        return BinaryOperator::CreateNeg(X);
+      }
 
-      if (Op1BO == Op1C)
-        return BinaryOperator::CreateSub(Op0BO, Rem);
-      return BinaryOperator::CreateSub(Rem, Op0BO);
+      auto RemOpc = Div->getOpcode() == Instruction::UDiv ? Instruction::URem
+                                                          : Instruction::SRem;
+      Value *Rem = Builder->CreateBinOp(RemOpc, X, DivOp1);
+      if (DivOp1 == Y)
+        return BinaryOperator::CreateSub(X, Rem);
+      return BinaryOperator::CreateSub(Rem, X);
     }
   }
 
diff --git a/lib/Transforms/InstCombine/InstCombinePHI.cpp b/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 4cbffe9533b759b75be4ec6005a7f26f872dd139..d8574175307b3cf0614af12cc28160dcaad0492a 100644
--- a/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -507,7 +507,7 @@ Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
     // Be careful about transforming integer PHIs.  We don't want to pessimize
     // the code by turning an i32 into an i1293.
     if (PN.getType()->isIntegerTy() && CastSrcTy->isIntegerTy()) {
-      if (!ShouldChangeType(PN.getType(), CastSrcTy))
+      if (!shouldChangeType(PN.getType(), CastSrcTy))
         return nullptr;
     }
   } else if (isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)) {
diff --git a/lib/Transforms/InstCombine/InstCombineSelect.cpp b/lib/Transforms/InstCombine/InstCombineSelect.cpp
index b5718c3a9e0b0bb34956217d386f59b7bb5f8311..84dace5db760073dfbc439bc8f38cd1db458cf20 100644
--- a/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -120,6 +120,16 @@ static Constant *getSelectFoldableConstant(Instruction *I) {
 /// We have (select c, TI, FI), and we know that TI and FI have the same opcode.
 Instruction *InstCombiner::foldSelectOpOp(SelectInst &SI, Instruction *TI,
                                           Instruction *FI) {
+  // Don't break up min/max patterns. The hasOneUse checks below prevent that
+  // for most cases, but vector min/max with bitcasts can be transformed. If the
+  // one-use restrictions are eased for other patterns, we still don't want to
+  // obfuscate min/max.
+  if ((match(&SI, m_SMin(m_Value(), m_Value())) ||
+       match(&SI, m_SMax(m_Value(), m_Value())) ||
+       match(&SI, m_UMin(m_Value(), m_Value())) ||
+       match(&SI, m_UMax(m_Value(), m_Value()))))
+    return nullptr;
+
   // If this is a cast from the same type, merge.
   if (TI->getNumOperands() == 1 && TI->isCast()) {
     Type *FIOpndTy = FI->getOperand(0)->getType();
@@ -499,18 +509,16 @@ static bool adjustMinMax(SelectInst &Sel, ICmpInst &Cmp) {
   return true;
 }
 
-/// If this is an integer min/max where the select's 'true' operand is a
-/// constant, canonicalize that constant to the 'false' operand:
-/// select (icmp Pred X, C), C, X --> select (icmp Pred' X, C), X, C
+/// If this is an integer min/max (icmp + select) with a constant operand,
+/// create the canonical icmp for the min/max operation and canonicalize the
+/// constant to the 'false' operand of the select:
+/// select (icmp Pred X, C1), C2, X --> select (icmp Pred' X, C2), X, C2
+/// Note: if C1 != C2, this will change the icmp constant to the existing
+/// constant operand of the select.
 static Instruction *
 canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp,
                                InstCombiner::BuilderTy &Builder) {
-  // TODO: We should also canonicalize min/max when the select has a different
-  // constant value than the cmp constant, but we need to fix the backend first.
-  if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)) ||
-      !isa<Constant>(Sel.getTrueValue()) ||
-      isa<Constant>(Sel.getFalseValue()) ||
-      Cmp.getOperand(1) != Sel.getTrueValue())
+  if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)))
     return nullptr;
 
   // Canonicalize the compare predicate based on whether we have min or max.
@@ -525,16 +533,25 @@ canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp,
   default: return nullptr;
   }
 
-  // Canonicalize the constant to the right side.
-  if (isa<Constant>(LHS))
-    std::swap(LHS, RHS);
+  // Is this already canonical?
+  if (Cmp.getOperand(0) == LHS && Cmp.getOperand(1) == RHS &&
+      Cmp.getPredicate() == NewPred)
+    return nullptr;
 
-  Value *NewCmp = Builder.CreateICmp(NewPred, LHS, RHS);
-  SelectInst *NewSel = SelectInst::Create(NewCmp, LHS, RHS, "", nullptr, &Sel);
+  // Create the canonical compare and plug it into the select.
+  Sel.setCondition(Builder.CreateICmp(NewPred, LHS, RHS));
 
-  // We swapped the select operands, so swap the metadata too.
-  NewSel->swapProfMetadata();
-  return NewSel;
+  // If the select operands did not change, we're done.
+  if (Sel.getTrueValue() == LHS && Sel.getFalseValue() == RHS)
+    return &Sel;
+
+  // If we are swapping the select operands, swap the metadata too.
+  assert(Sel.getTrueValue() == RHS && Sel.getFalseValue() == LHS &&
+         "Unexpected results from matchSelectPattern");
+  Sel.setTrueValue(LHS);
+  Sel.setFalseValue(RHS);
+  Sel.swapProfMetadata();
+  return &Sel;
 }
 
 /// Visit a SelectInst that has an ICmpInst as its first operand.
@@ -785,7 +802,9 @@ Instruction *InstCombiner::foldSPFofSPF(Instruction *Inner,
   // This transform is performance neutral if we can elide at least one xor from
   // the set of three operands, since we'll be tacking on an xor at the very
   // end.
-  if (IsFreeOrProfitableToInvert(A, NotA, ElidesXor) &&
+  if (SelectPatternResult::isMinOrMax(SPF1) &&
+      SelectPatternResult::isMinOrMax(SPF2) &&
+      IsFreeOrProfitableToInvert(A, NotA, ElidesXor) &&
       IsFreeOrProfitableToInvert(B, NotB, ElidesXor) &&
       IsFreeOrProfitableToInvert(C, NotC, ElidesXor) && ElidesXor) {
     if (!NotA)
diff --git a/lib/Transforms/InstCombine/InstCombineShifts.cpp b/lib/Transforms/InstCombine/InstCombineShifts.cpp
index 4b6879255636d37bbe9b3cf1594a9fc2c2b45391..9aa679c60e47b358d6a3f28ffecc636707f7e9a4 100644
--- a/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -309,41 +309,6 @@ static Value *getShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
   }
 }
 
-/// Try to fold (X << C1) << C2, where the shifts are some combination of
-/// shl/ashr/lshr.
-static Instruction *
-foldShiftByConstOfShiftByConst(BinaryOperator &I, const APInt *COp1,
-                               InstCombiner::BuilderTy *Builder) {
-  Value *Op0 = I.getOperand(0);
-  unsigned TypeBits = Op0->getType()->getScalarSizeInBits();
-
-  // Find out if this is a shift of a shift by a constant.
-  BinaryOperator *ShiftOp = dyn_cast<BinaryOperator>(Op0);
-  if (!ShiftOp || !ShiftOp->isShift())
-    return nullptr;
-
-  const APInt *ShAmt1;
-  if (!match(ShiftOp->getOperand(1), m_APInt(ShAmt1)))
-    return nullptr;
-
-  // Check for (X << c1) << c2  and  (X >> c1) >> c2
-  if (I.getOpcode() == ShiftOp->getOpcode()) {
-    unsigned AmtSum = (*ShAmt1 + *COp1).getZExtValue();
-    // If this is an oversized composite shift, then unsigned shifts become
-    // zero (handled in InstSimplify) and ashr saturates.
-    if (AmtSum >= TypeBits) {
-      if (I.getOpcode() != Instruction::AShr)
-        return nullptr;
-      AmtSum = TypeBits - 1; // Saturate to 31 for i32 ashr.
-    }
-
-    return BinaryOperator::Create(I.getOpcode(), ShiftOp->getOperand(0),
-                                  ConstantInt::get(I.getType(), AmtSum));
-  }
-
-  return nullptr;
-}
-
 Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
                                                BinaryOperator &I) {
   bool isLeftShift = I.getOpcode() == Instruction::Shl;
@@ -370,13 +335,6 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
   assert(!Op1C->uge(TypeBits) &&
          "Shift over the type width should have been removed already");
 
-  // ((X*C1) << C2) == (X * (C1 << C2))
-  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0))
-    if (BO->getOpcode() == Instruction::Mul && isLeftShift)
-      if (Constant *BOOp = dyn_cast<Constant>(BO->getOperand(1)))
-        return BinaryOperator::CreateMul(BO->getOperand(0),
-                                         ConstantExpr::getShl(BOOp, Op1));
-
   if (Instruction *FoldedShift = foldOpWithConstantIntoOperand(I))
     return FoldedShift;
 
@@ -553,9 +511,6 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
     }
   }
 
-  if (Instruction *Folded = foldShiftByConstOfShiftByConst(I, Op1C, Builder))
-    return Folded;
-
   return nullptr;
 }
 
@@ -598,10 +553,10 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
     // arithmetic expressions are still recognizable by scalar evolution.
     // The inexact versions are deferred to DAGCombine, so we don't hide shl
     // behind a bit mask.
-    const APInt *ShrOp1;
-    if (match(Op0, m_CombineOr(m_Exact(m_LShr(m_Value(X), m_APInt(ShrOp1))),
-                               m_Exact(m_AShr(m_Value(X), m_APInt(ShrOp1)))))) {
-      unsigned ShrAmt = ShrOp1->getZExtValue();
+    const APInt *ShOp1;
+    if (match(Op0, m_CombineOr(m_Exact(m_LShr(m_Value(X), m_APInt(ShOp1))),
+                               m_Exact(m_AShr(m_Value(X), m_APInt(ShOp1)))))) {
+      unsigned ShrAmt = ShOp1->getZExtValue();
       if (ShrAmt < ShAmt) {
         // If C1 < C2: (X >>?,exact C1) << C2 --> X << (C2 - C1)
         Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShrAmt);
@@ -620,6 +575,14 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
       }
     }
 
+    if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1)))) {
+      unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+      // Oversized shifts are simplified to zero in InstSimplify.
+      if (AmtSum < BitWidth)
+        // (X << C1) << C2 --> X << (C1 + C2)
+        return BinaryOperator::CreateShl(X, ConstantInt::get(Ty, AmtSum));
+    }
+
     // If the shifted-out value is known-zero, then this is a NUW shift.
     if (!I.hasNoUnsignedWrap() &&
         MaskedValueIsZero(Op0, APInt::getHighBitsSet(BitWidth, ShAmt), 0, &I)) {
@@ -634,12 +597,18 @@ Instruction *InstCombiner::visitShl(BinaryOperator &I) {
     }
   }
 
-  // (C1 << A) << C2 -> (C1 << C2) << A
-  Constant *C1, *C2;
-  Value *A;
-  if (match(Op0, m_OneUse(m_Shl(m_Constant(C1), m_Value(A)))) &&
-      match(Op1, m_Constant(C2)))
-    return BinaryOperator::CreateShl(ConstantExpr::getShl(C1, C2), A);
+  Constant *C1;
+  if (match(Op1, m_Constant(C1))) {
+    Constant *C2;
+    Value *X;
+    // (C2 << X) << C1 --> (C2 << C1) << X
+    if (match(Op0, m_OneUse(m_Shl(m_Constant(C2), m_Value(X)))))
+      return BinaryOperator::CreateShl(ConstantExpr::getShl(C2, C1), X);
+
+    // (X * C2) << C1 --> X * (C2 << C1)
+    if (match(Op0, m_Mul(m_Value(X), m_Constant(C2))))
+      return BinaryOperator::CreateMul(X, ConstantExpr::getShl(C2, C1));
+  }
 
   return nullptr;
 }
@@ -675,9 +644,9 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
     }
 
     Value *X;
-    const APInt *ShlAmtAPInt;
-    if (match(Op0, m_Shl(m_Value(X), m_APInt(ShlAmtAPInt)))) {
-      unsigned ShlAmt = ShlAmtAPInt->getZExtValue();
+    const APInt *ShOp1;
+    if (match(Op0, m_Shl(m_Value(X), m_APInt(ShOp1)))) {
+      unsigned ShlAmt = ShOp1->getZExtValue();
       if (ShlAmt < ShAmt) {
         Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt);
         if (cast<BinaryOperator>(Op0)->hasNoUnsignedWrap()) {
@@ -710,6 +679,14 @@ Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
       return BinaryOperator::CreateAnd(X, ConstantInt::get(Ty, Mask));
     }
 
+    if (match(Op0, m_LShr(m_Value(X), m_APInt(ShOp1)))) {
+      unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+      // Oversized shifts are simplified to zero in InstSimplify.
+      if (AmtSum < BitWidth)
+        // (X >>u C1) >>u C2 --> X >>u (C1 + C2)
+        return BinaryOperator::CreateLShr(X, ConstantInt::get(Ty, AmtSum));
+    }
+
     // If the shifted-out value is known-zero, then this is an exact shift.
     if (!I.isExact() &&
         MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmt), 0, &I)) {
@@ -747,9 +724,9 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
 
     // We can't handle (X << C1) >>s C2. It shifts arbitrary bits in. However,
     // we can handle (X <<nsw C1) >>s C2 since it only shifts in sign bits.
-    const APInt *ShlAmtAPInt;
-    if (match(Op0, m_NSWShl(m_Value(X), m_APInt(ShlAmtAPInt)))) {
-      unsigned ShlAmt = ShlAmtAPInt->getZExtValue();
+    const APInt *ShOp1;
+    if (match(Op0, m_NSWShl(m_Value(X), m_APInt(ShOp1)))) {
+      unsigned ShlAmt = ShOp1->getZExtValue();
       if (ShlAmt < ShAmt) {
         // (X <<nsw C1) >>s C2 --> X >>s (C2 - C1)
         Constant *ShiftDiff = ConstantInt::get(Ty, ShAmt - ShlAmt);
@@ -766,6 +743,14 @@ Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
       }
     }
 
+    if (match(Op0, m_AShr(m_Value(X), m_APInt(ShOp1)))) {
+      unsigned AmtSum = ShAmt + ShOp1->getZExtValue();
+      // Oversized arithmetic shifts replicate the sign bit.
+      AmtSum = std::min(AmtSum, BitWidth - 1);
+      // (X >>s C1) >>s C2 --> X >>s (C1 + C2)
+      return BinaryOperator::CreateAShr(X, ConstantInt::get(Ty, AmtSum));
+    }
+
     // If the shifted-out value is known-zero, then this is an exact shift.
     if (!I.isExact() &&
         MaskedValueIsZero(Op0, APInt::getLowBitsSet(BitWidth, ShAmt), 0, &I)) {
diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index fb7177f1ddbd8c88de463ee449f1eb19b2618c3a..934fcfe78cb3058f626350411ef5b1ad53c90901 100644
--- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -30,18 +30,20 @@ static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo,
   assert(I && "No instruction?");
   assert(OpNo < I->getNumOperands() && "Operand index too large");
 
-  // If the operand is not a constant integer, nothing to do.
-  ConstantInt *OpC = dyn_cast<ConstantInt>(I->getOperand(OpNo));
-  if (!OpC) return false;
+  // The operand must be a constant integer or splat integer.
+  Value *Op = I->getOperand(OpNo);
+  const APInt *C;
+  if (!match(Op, m_APInt(C)))
+    return false;
 
   // If there are no bits set that aren't demanded, nothing to do.
-  Demanded = Demanded.zextOrTrunc(OpC->getValue().getBitWidth());
-  if ((~Demanded & OpC->getValue()) == 0)
+  Demanded = Demanded.zextOrTrunc(C->getBitWidth());
+  if ((~Demanded & *C) == 0)
     return false;
 
   // This instruction is producing bits that are not demanded. Shrink the RHS.
-  Demanded &= OpC->getValue();
-  I->setOperand(OpNo, ConstantInt::get(OpC->getType(), Demanded));
+  Demanded &= *C;
+  I->setOperand(OpNo, ConstantInt::get(Op->getType(), Demanded));
 
   return true;
 }
@@ -66,12 +68,13 @@ bool InstCombiner::SimplifyDemandedInstructionBits(Instruction &Inst) {
 /// This form of SimplifyDemandedBits simplifies the specified instruction
 /// operand if possible, updating it in place. It returns true if it made any
 /// change and false otherwise.
-bool InstCombiner::SimplifyDemandedBits(Use &U, const APInt &DemandedMask,
+bool InstCombiner::SimplifyDemandedBits(Instruction *I, unsigned OpNo,
+                                        const APInt &DemandedMask,
                                         APInt &KnownZero, APInt &KnownOne,
                                         unsigned Depth) {
-  auto *UserI = dyn_cast<Instruction>(U.getUser());
+  Use &U = I->getOperandUse(OpNo);
   Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, KnownZero,
-                                          KnownOne, Depth, UserI);
+                                          KnownOne, Depth, I);
   if (!NewVal) return false;
   U = NewVal;
   return true;
@@ -114,9 +117,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
       KnownOne.getBitWidth() == BitWidth &&
       "Value *V, DemandedMask, KnownZero and KnownOne "
       "must have same BitWidth");
-  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
-    // We know all of the bits for a constant!
-    KnownOne = CI->getValue() & DemandedMask;
+  const APInt *C;
+  if (match(V, m_APInt(C))) {
+    // We know all of the bits for a scalar constant or a splat vector constant!
+    KnownOne = *C & DemandedMask;
     KnownZero = ~KnownOne & DemandedMask;
     return nullptr;
   }
@@ -231,7 +235,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
   // operands.  This allows visitTruncInst (for example) to simplify the
   // operand of a trunc without duplicating all the logic below.
   if (Depth == 0 && !V->hasOneUse())
-    DemandedMask = APInt::getAllOnesValue(BitWidth);
+    DemandedMask.setAllBits();
 
   switch (I->getOpcode()) {
   default:
@@ -239,10 +243,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     break;
   case Instruction::And:
     // If either the LHS or the RHS are Zero, the result is zero.
-    if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, RHSKnownZero,
-                             RHSKnownOne, Depth + 1) ||
-        SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownZero,
-                             LHSKnownZero, LHSKnownOne, Depth + 1))
+    if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnownZero, RHSKnownOne,
+                             Depth + 1) ||
+        SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnownZero, LHSKnownZero,
+                             LHSKnownOne, Depth + 1))
       return I;
     assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
     assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
@@ -277,10 +281,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     break;
   case Instruction::Or:
     // If either the LHS or the RHS are One, the result is One.
-    if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, RHSKnownZero,
-                             RHSKnownOne, Depth + 1) ||
-        SimplifyDemandedBits(I->getOperandUse(0), DemandedMask & ~RHSKnownOne,
-                             LHSKnownZero, LHSKnownOne, Depth + 1))
+    if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnownZero, RHSKnownOne,
+                             Depth + 1) ||
+        SimplifyDemandedBits(I, 0, DemandedMask & ~RHSKnownOne, LHSKnownZero,
+                             LHSKnownOne, Depth + 1))
       return I;
     assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
     assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
@@ -319,10 +323,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     KnownOne = RHSKnownOne | LHSKnownOne;
     break;
   case Instruction::Xor: {
-    if (SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, RHSKnownZero,
-                             RHSKnownOne, Depth + 1) ||
-        SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, LHSKnownZero,
-                             LHSKnownOne, Depth + 1))
+    if (SimplifyDemandedBits(I, 1, DemandedMask, RHSKnownZero, RHSKnownOne,
+                             Depth + 1) ||
+        SimplifyDemandedBits(I, 0, DemandedMask, LHSKnownZero, LHSKnownOne,
+                             Depth + 1))
       return I;
     assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
     assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
@@ -412,10 +416,10 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     if (matchSelectPattern(I, LHS, RHS).Flavor != SPF_UNKNOWN)
       return nullptr;
 
-    if (SimplifyDemandedBits(I->getOperandUse(2), DemandedMask, RHSKnownZero,
-                             RHSKnownOne, Depth + 1) ||
-        SimplifyDemandedBits(I->getOperandUse(1), DemandedMask, LHSKnownZero,
-                             LHSKnownOne, Depth + 1))
+    if (SimplifyDemandedBits(I, 2, DemandedMask, RHSKnownZero, RHSKnownOne,
+                             Depth + 1) ||
+        SimplifyDemandedBits(I, 1, DemandedMask, LHSKnownZero, LHSKnownOne,
+                             Depth + 1))
       return I;
     assert(!(RHSKnownZero & RHSKnownOne) && "Bits known to be one AND zero?");
     assert(!(LHSKnownZero & LHSKnownOne) && "Bits known to be one AND zero?");
@@ -434,8 +438,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     DemandedMask = DemandedMask.zext(truncBf);
     KnownZero = KnownZero.zext(truncBf);
     KnownOne = KnownOne.zext(truncBf);
-    if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, KnownZero,
-                             KnownOne, Depth + 1))
+    if (SimplifyDemandedBits(I, 0, DemandedMask, KnownZero, KnownOne,
+                             Depth + 1))
       return I;
     DemandedMask = DemandedMask.trunc(BitWidth);
     KnownZero = KnownZero.trunc(BitWidth);
@@ -460,8 +464,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
       // Don't touch a vector-to-scalar bitcast.
       return nullptr;
 
-    if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, KnownZero,
-                             KnownOne, Depth + 1))
+    if (SimplifyDemandedBits(I, 0, DemandedMask, KnownZero, KnownOne,
+                             Depth + 1))
       return I;
     assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
     break;
@@ -472,15 +476,15 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     DemandedMask = DemandedMask.trunc(SrcBitWidth);
     KnownZero = KnownZero.trunc(SrcBitWidth);
     KnownOne = KnownOne.trunc(SrcBitWidth);
-    if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMask, KnownZero,
-                             KnownOne, Depth + 1))
+    if (SimplifyDemandedBits(I, 0, DemandedMask, KnownZero, KnownOne,
+                             Depth + 1))
       return I;
     DemandedMask = DemandedMask.zext(BitWidth);
     KnownZero = KnownZero.zext(BitWidth);
     KnownOne = KnownOne.zext(BitWidth);
     assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
     // The top bits are known to be zero.
-    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+    KnownZero.setBitsFrom(SrcBitWidth);
     break;
   }
   case Instruction::SExt: {
@@ -490,7 +494,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     APInt InputDemandedBits = DemandedMask &
                               APInt::getLowBitsSet(BitWidth, SrcBitWidth);
 
-    APInt NewBits(APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth));
+    APInt NewBits(APInt::getBitsSetFrom(BitWidth, SrcBitWidth));
     // If any of the sign extended bits are demanded, we know that the sign
     // bit is demanded.
     if ((NewBits & DemandedMask) != 0)
@@ -499,8 +503,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
     InputDemandedBits = InputDemandedBits.trunc(SrcBitWidth);
     KnownZero = KnownZero.trunc(SrcBitWidth);
     KnownOne = KnownOne.trunc(SrcBitWidth);
-    if (SimplifyDemandedBits(I->getOperandUse(0), InputDemandedBits, KnownZero,
-                             KnownOne, Depth + 1))
+    if (SimplifyDemandedBits(I, 0, InputDemandedBits, KnownZero, KnownOne,
+                             Depth + 1))
       return I;
     InputDemandedBits = InputDemandedBits.zext(BitWidth);
     KnownZero = KnownZero.zext(BitWidth);
@@ -530,11 +534,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
       // Right fill the mask of bits for this ADD/SUB to demand the most
       // significant bit and all those below it.
       APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
-      if (SimplifyDemandedBits(I->getOperandUse(0), DemandedFromOps,
-                               LHSKnownZero, LHSKnownOne, Depth + 1) ||
+      if (ShrinkDemandedConstant(I, 0, DemandedFromOps) ||
+          SimplifyDemandedBits(I, 0, DemandedFromOps, LHSKnownZero, LHSKnownOne,
+                               Depth + 1) ||
           ShrinkDemandedConstant(I, 1, DemandedFromOps) ||
-          SimplifyDemandedBits(I->getOperandUse(1), DemandedFromOps,
-                               LHSKnownZero, LHSKnownOne, Depth + 1)) {
+          SimplifyDemandedBits(I, 1, DemandedFromOps, RHSKnownZero, RHSKnownOne,
+                               Depth + 1)) {
         // Disable the nsw and nuw flags here: We can no longer guarantee that
         // we won't wrap after simplification. Removing the nsw/nuw flags is
         // legal here because the top bit is not demanded.
@@ -543,6 +548,15 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
         BinOP.setHasNoUnsignedWrap(false);
         return I;
       }
+
+      // If we are known to be adding/subtracting zeros to every bit below
+      // the highest demanded bit, we just return the other side.
+      if ((DemandedFromOps & RHSKnownZero) == DemandedFromOps)
+        return I->getOperand(0);
+      // We can't do this with the LHS for subtraction.
+      if (I->getOpcode() == Instruction::Add &&
+          (DemandedFromOps & LHSKnownZero) == DemandedFromOps)
+        return I->getOperand(1);
     }
 
     // Otherwise just hand the add/sub off to computeKnownBits to fill in
@@ -569,19 +583,19 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
       // If the shift is NUW/NSW, then it does demand the high bits.
       ShlOperator *IOp = cast<ShlOperator>(I);
       if (IOp->hasNoSignedWrap())
-        DemandedMaskIn |= APInt::getHighBitsSet(BitWidth, ShiftAmt+1);
+        DemandedMaskIn.setHighBits(ShiftAmt+1);
       else if (IOp->hasNoUnsignedWrap())
-        DemandedMaskIn |= APInt::getHighBitsSet(BitWidth, ShiftAmt);
+        DemandedMaskIn.setHighBits(ShiftAmt);
 
-      if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, KnownZero,
-                               KnownOne, Depth + 1))
+      if (SimplifyDemandedBits(I, 0, DemandedMaskIn, KnownZero, KnownOne,
+                               Depth + 1))
         return I;
       assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
       KnownZero <<= ShiftAmt;
       KnownOne  <<= ShiftAmt;
       // low bits known zero.
       if (ShiftAmt)
-        KnownZero |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+        KnownZero.setLowBits(ShiftAmt);
     }
     break;
   case Instruction::LShr:
@@ -595,19 +609,16 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
       // If the shift is exact, then it does demand the low bits (and knows that
       // they are zero).
       if (cast<LShrOperator>(I)->isExact())
-        DemandedMaskIn |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+        DemandedMaskIn.setLowBits(ShiftAmt);
 
-      if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, KnownZero,
-                               KnownOne, Depth + 1))
+      if (SimplifyDemandedBits(I, 0, DemandedMaskIn, KnownZero, KnownOne,
+                               Depth + 1))
         return I;
       assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
-      KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
-      KnownOne  = APIntOps::lshr(KnownOne, ShiftAmt);
-      if (ShiftAmt) {
-        // Compute the new bits that are at the top now.
-        APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
-        KnownZero |= HighBits;  // high bits known zero.
-      }
+      KnownZero = KnownZero.lshr(ShiftAmt);
+      KnownOne  = KnownOne.lshr(ShiftAmt);
+      if (ShiftAmt)
+        KnownZero.setHighBits(ShiftAmt);  // high bits known zero.
     }
     break;
   case Instruction::AShr:
@@ -640,21 +651,21 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
       // If the shift is exact, then it does demand the low bits (and knows that
       // they are zero).
       if (cast<AShrOperator>(I)->isExact())
-        DemandedMaskIn |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+        DemandedMaskIn.setLowBits(ShiftAmt);
 
-      if (SimplifyDemandedBits(I->getOperandUse(0), DemandedMaskIn, KnownZero,
-                               KnownOne, Depth + 1))
+      if (SimplifyDemandedBits(I, 0, DemandedMaskIn, KnownZero, KnownOne,
+                               Depth + 1))
         return I;
       assert(!(KnownZero & KnownOne) && "Bits known to be one AND zero?");
       // Compute the new bits that are at the top now.
       APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
-      KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
-      KnownOne  = APIntOps::lshr(KnownOne, ShiftAmt);
+      KnownZero = KnownZero.lshr(ShiftAmt);
+      KnownOne  = KnownOne.lshr(ShiftAmt);
 
       // Handle the sign bits.
       APInt SignBit(APInt::getSignBit(BitWidth));
       // Adjust to where it is now in the mask.
-      SignBit = APIntOps::lshr(SignBit, ShiftAmt);
+      SignBit = SignBit.lshr(ShiftAmt);
 
       // If the input sign bit is known to be zero, or if none of the top bits
       // are demanded, turn this into an unsigned shift right.
@@ -683,8 +694,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
 
         APInt LowBits = RA - 1;
         APInt Mask2 = LowBits | APInt::getSignBit(BitWidth);
-        if (SimplifyDemandedBits(I->getOperandUse(0), Mask2, LHSKnownZero,
-                                 LHSKnownOne, Depth + 1))
+        if (SimplifyDemandedBits(I, 0, Mask2, LHSKnownZero, LHSKnownOne,
+                                 Depth + 1))
           return I;
 
         // The low bits of LHS are unchanged by the srem.
@@ -713,21 +724,17 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
                        CxtI);
       // If it's known zero, our sign bit is also zero.
       if (LHSKnownZero.isNegative())
-        KnownZero.setBit(KnownZero.getBitWidth() - 1);
+        KnownZero.setSignBit();
     }
     break;
   case Instruction::URem: {
     APInt KnownZero2(BitWidth, 0), KnownOne2(BitWidth, 0);
     APInt AllOnes = APInt::getAllOnesValue(BitWidth);
-    if (SimplifyDemandedBits(I->getOperandUse(0), AllOnes, KnownZero2,
-                             KnownOne2, Depth + 1) ||
-        SimplifyDemandedBits(I->getOperandUse(1), AllOnes, KnownZero2,
-                             KnownOne2, Depth + 1))
+    if (SimplifyDemandedBits(I, 0, AllOnes, KnownZero2, KnownOne2, Depth + 1) ||
+        SimplifyDemandedBits(I, 1, AllOnes, KnownZero2, KnownOne2, Depth + 1))
       return I;
 
     unsigned Leaders = KnownZero2.countLeadingOnes();
-    Leaders = std::max(Leaders,
-                       KnownZero2.countLeadingOnes());
     KnownZero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
     break;
   }
@@ -792,11 +799,11 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
           return ConstantInt::getNullValue(VTy);
 
         // We know that the upper bits are set to zero.
-        KnownZero = APInt::getHighBitsSet(BitWidth, BitWidth - ArgWidth);
+        KnownZero.setBitsFrom(ArgWidth);
         return nullptr;
       }
       case Intrinsic::x86_sse42_crc32_64_64:
-        KnownZero = APInt::getHighBitsSet(64, 32);
+        KnownZero.setBitsFrom(32);
         return nullptr;
       }
     }
@@ -849,7 +856,7 @@ Value *InstCombiner::SimplifyShrShlDemandedBits(Instruction *Shr,
   unsigned ShrAmt = ShrOp1.getZExtValue();
 
   KnownOne.clearAllBits();
-  KnownZero = APInt::getBitsSet(KnownZero.getBitWidth(), 0, ShlAmt-1);
+  KnownZero.setLowBits(ShlAmt - 1);
   KnownZero &= DemandedMask;
 
   APInt BitMask1(APInt::getAllOnesValue(BitWidth));
@@ -1479,8 +1486,11 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
     case Intrinsic::x86_avx2_packssdw:
     case Intrinsic::x86_avx2_packsswb:
     case Intrinsic::x86_avx2_packusdw:
-    case Intrinsic::x86_avx2_packuswb: {
-      // TODO Add support for Intrinsic::x86_avx512_mask_pack*
+    case Intrinsic::x86_avx2_packuswb:
+    case Intrinsic::x86_avx512_packssdw_512:
+    case Intrinsic::x86_avx512_packsswb_512:
+    case Intrinsic::x86_avx512_packusdw_512:
+    case Intrinsic::x86_avx512_packuswb_512: {
       auto *Ty0 = II->getArgOperand(0)->getType();
       unsigned InnerVWidth = Ty0->getVectorNumElements();
       assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
@@ -1553,8 +1563,52 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
     case Intrinsic::x86_sse4a_extrqi:
     case Intrinsic::x86_sse4a_insertq:
     case Intrinsic::x86_sse4a_insertqi:
-      UndefElts |= APInt::getHighBitsSet(VWidth, VWidth / 2);
+      UndefElts.setHighBits(VWidth / 2);
       break;
+    case Intrinsic::amdgcn_buffer_load:
+    case Intrinsic::amdgcn_buffer_load_format: {
+      if (VWidth == 1 || !DemandedElts.isMask())
+        return nullptr;
+
+      // TODO: Handle 3 vectors when supported in code gen.
+      unsigned NewNumElts = PowerOf2Ceil(DemandedElts.countTrailingOnes());
+      if (NewNumElts == VWidth)
+        return nullptr;
+
+      Module *M = II->getParent()->getParent()->getParent();
+      Type *EltTy = V->getType()->getVectorElementType();
+
+      Type *NewTy = (NewNumElts == 1) ? EltTy :
+        VectorType::get(EltTy, NewNumElts);
+
+      Function *NewIntrin = Intrinsic::getDeclaration(M, II->getIntrinsicID(),
+                                                      NewTy);
+
+      SmallVector<Value *, 5> Args;
+      for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
+        Args.push_back(II->getArgOperand(I));
+
+      IRBuilderBase::InsertPointGuard Guard(*Builder);
+      Builder->SetInsertPoint(II);
+
+      CallInst *NewCall = Builder->CreateCall(NewIntrin, Args);
+      NewCall->takeName(II);
+      NewCall->copyMetadata(*II);
+      if (NewNumElts == 1) {
+        return Builder->CreateInsertElement(UndefValue::get(V->getType()),
+                                            NewCall, static_cast<uint64_t>(0));
+      }
+
+      SmallVector<uint32_t, 8> EltMask;
+      for (unsigned I = 0; I < VWidth; ++I)
+        EltMask.push_back(I);
+
+      Value *Shuffle = Builder->CreateShuffleVector(
+        NewCall, UndefValue::get(NewTy), EltMask);
+
+      MadeChange = true;
+      return Shuffle;
+    }
     }
     break;
   }
diff --git a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index b2477f6c8633b1dbe462ea33da58a93d72d2237b..e89b400a4afc8f3f71f0a1642b537fcdba71ecde 100644
--- a/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -645,6 +645,36 @@ static Instruction *foldInsSequenceIntoBroadcast(InsertElementInst &InsElt) {
   return new ShuffleVectorInst(InsertFirst, UndefValue::get(VT), ZeroMask);
 }
 
+/// If we have an insertelement instruction feeding into another insertelement
+/// and the 2nd is inserting a constant into the vector, canonicalize that
+/// constant insertion before the insertion of a variable:
+///
+/// insertelement (insertelement X, Y, IdxC1), ScalarC, IdxC2 -->
+/// insertelement (insertelement X, ScalarC, IdxC2), Y, IdxC1
+/// This has the potential of eliminating the 2nd insertelement instruction
+/// via constant folding of the scalar constant into a vector constant.
+static Instruction *hoistInsEltConst(InsertElementInst &InsElt2,
+                                     InstCombiner::BuilderTy &Builder) {
+  auto *InsElt1 = dyn_cast<InsertElementInst>(InsElt2.getOperand(0));
+  if (!InsElt1 || !InsElt1->hasOneUse())
+    return nullptr;
+
+  Value *X, *Y;
+  Constant *ScalarC;
+  ConstantInt *IdxC1, *IdxC2;
+  // Compare indices by value: i32 1 and i64 1 are distinct ConstantInts.
+  if (match(InsElt1->getOperand(0), m_Value(X)) &&
+      match(InsElt1->getOperand(1), m_Value(Y)) && !isa<Constant>(Y) &&
+      match(InsElt1->getOperand(2), m_ConstantInt(IdxC1)) &&
+      match(InsElt2.getOperand(1), m_Constant(ScalarC)) &&
+      match(InsElt2.getOperand(2), m_ConstantInt(IdxC2)) &&
+      !APInt::isSameValue(IdxC1->getValue(), IdxC2->getValue())) {
+    Value *NewInsElt1 = Builder.CreateInsertElement(X, ScalarC, IdxC2);
+    return InsertElementInst::Create(NewInsElt1, Y, IdxC1);
+  }
+  return nullptr;
+}
+
 /// insertelt (shufflevector X, CVec, Mask|insertelt X, C1, CIndex1), C, CIndex
 /// --> shufflevector X, CVec', Mask'
 static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) {
@@ -806,6 +836,9 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
   if (Instruction *Shuf = foldConstantInsEltIntoShuffle(IE))
     return Shuf;
 
+  if (Instruction *NewInsElt = hoistInsEltConst(IE, *Builder))
+    return NewInsElt;
+
   // Turn a sequence of inserts that broadcasts a scalar into a single
   // insert + shufflevector.
   if (Instruction *Broadcast = foldInsSequenceIntoBroadcast(IE))
@@ -1107,12 +1140,11 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   SmallVector<int, 16> Mask = SVI.getShuffleMask();
   Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
 
-  bool MadeChange = false;
-
-  // Undefined shuffle mask -> undefined value.
-  if (isa<UndefValue>(SVI.getOperand(2)))
-    return replaceInstUsesWith(SVI, UndefValue::get(SVI.getType()));
+  if (auto *V = SimplifyShuffleVectorInst(LHS, RHS, SVI.getMask(),
+                                          SVI.getType(), DL, &TLI, &DT, &AC))
+    return replaceInstUsesWith(SVI, V);
 
+  bool MadeChange = false;
   unsigned VWidth = SVI.getType()->getVectorNumElements();
 
   APInt UndefElts(VWidth, 0);
@@ -1209,7 +1241,6 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
   if (isShuffleExtractingFromLHS(SVI, Mask)) {
     Value *V = LHS;
     unsigned MaskElems = Mask.size();
-    unsigned BegIdx = Mask.front();
     VectorType *SrcTy = cast<VectorType>(V->getType());
     unsigned VecBitWidth = SrcTy->getBitWidth();
     unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType());
@@ -1223,6 +1254,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
           // Only visit bitcasts that weren't previously handled.
           BCs.push_back(BC);
     for (BitCastInst *BC : BCs) {
+      unsigned BegIdx = Mask.front();
       Type *TgtTy = BC->getDestTy();
       unsigned TgtElemBitWidth = DL.getTypeSizeInBits(TgtTy);
       if (!TgtElemBitWidth)
diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp
index 24517a848ba106815c9c37a68c0563b7e19bcdf7..f8b930f577125df9dada35cf6002a3b2e432b4ba 100644
--- a/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -82,18 +82,24 @@ static cl::opt<bool>
 EnableExpensiveCombines("expensive-combines",
                         cl::desc("Enable expensive instruction combines"));
 
+static cl::opt<unsigned>
+MaxArraySize("instcombine-maxarray-size", cl::init(1024),
+             cl::desc("Maximum array size considered when doing a combine"));
+
 Value *InstCombiner::EmitGEPOffset(User *GEP) {
   return llvm::EmitGEPOffset(Builder, DL, GEP);
 }
 
 /// Return true if it is desirable to convert an integer computation from a
 /// given bit width to a new bit width.
-/// We don't want to convert from a legal to an illegal type for example or from
-/// a smaller to a larger illegal type.
-bool InstCombiner::ShouldChangeType(unsigned FromWidth,
+/// We don't want to convert from a legal to an illegal type or from a smaller
+/// to a larger illegal type. A width of '1' is always treated as a legal type
+/// because i1 is a fundamental type in IR, and there are many specialized
+/// optimizations for i1 types.
+bool InstCombiner::shouldChangeType(unsigned FromWidth,
                                     unsigned ToWidth) const {
-  bool FromLegal = DL.isLegalInteger(FromWidth);
-  bool ToLegal = DL.isLegalInteger(ToWidth);
+  bool FromLegal = FromWidth == 1 || DL.isLegalInteger(FromWidth);
+  bool ToLegal = ToWidth == 1 || DL.isLegalInteger(ToWidth);
 
   // If this is a legal integer from type, and the result would be an illegal
   // type, don't do the transformation.
@@ -109,14 +115,16 @@ bool InstCombiner::ShouldChangeType(unsigned FromWidth,
 }
 
 /// Return true if it is desirable to convert a computation from 'From' to 'To'.
-/// We don't want to convert from a legal to an illegal type for example or from
-/// a smaller to a larger illegal type.
-bool InstCombiner::ShouldChangeType(Type *From, Type *To) const {
+/// We don't want to convert from a legal to an illegal type or from a smaller
+/// to a larger illegal type. i1 is always treated as a legal type because it is
+/// a fundamental type in IR, and there are many specialized optimizations for
+/// i1 types.
+bool InstCombiner::shouldChangeType(Type *From, Type *To) const {
   assert(From->isIntegerTy() && To->isIntegerTy());
 
   unsigned FromWidth = From->getPrimitiveSizeInBits();
   unsigned ToWidth = To->getPrimitiveSizeInBits();
-  return ShouldChangeType(FromWidth, ToWidth);
+  return shouldChangeType(FromWidth, ToWidth);
 }
 
 // Return true, if No Signed Wrap should be maintained for I.
@@ -447,16 +455,11 @@ static bool RightDistributesOverLeft(Instruction::BinaryOps LOp,
 
 /// This function returns identity value for given opcode, which can be used to
 /// factor patterns like (X * 2) + X ==> (X * 2) + (X * 1) ==> X * (2 + 1).
-static Value *getIdentityValue(Instruction::BinaryOps OpCode, Value *V) {
+static Value *getIdentityValue(Instruction::BinaryOps Opcode, Value *V) {
   if (isa<Constant>(V))
     return nullptr;
 
-  if (OpCode == Instruction::Mul)
-    return ConstantInt::get(V->getType(), 1);
-
-  // TODO: We can handle other cases e.g. Instruction::And, Instruction::Or etc.
-
-  return nullptr;
+  return ConstantExpr::getBinOpIdentity(Opcode, V->getType());
 }
 
 /// This function factors binary ops which can be combined using distributive
@@ -564,13 +567,11 @@ static Value *tryFactorization(InstCombiner::BuilderTy *Builder,
         if (isa<OverflowingBinaryOperator>(&I))
           HasNSW = I.hasNoSignedWrap();
 
-        if (BinaryOperator *Op0 = dyn_cast<BinaryOperator>(LHS))
-          if (isa<OverflowingBinaryOperator>(Op0))
-            HasNSW &= Op0->hasNoSignedWrap();
+        if (auto *LOBO = dyn_cast<OverflowingBinaryOperator>(LHS))
+          HasNSW &= LOBO->hasNoSignedWrap();
 
-        if (BinaryOperator *Op1 = dyn_cast<BinaryOperator>(RHS))
-          if (isa<OverflowingBinaryOperator>(Op1))
-            HasNSW &= Op1->hasNoSignedWrap();
+        if (auto *ROBO = dyn_cast<OverflowingBinaryOperator>(RHS))
+          HasNSW &= ROBO->hasNoSignedWrap();
 
         // We can propagate 'nsw' if we know that
         //  %Y = mul nsw i16 %X, C
@@ -720,6 +721,21 @@ Value *InstCombiner::dyn_castNegVal(Value *V) const {
     if (C->getType()->getElementType()->isIntegerTy())
       return ConstantExpr::getNeg(C);
 
+  if (ConstantVector *CV = dyn_cast<ConstantVector>(V)) {
+    for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) {
+      Constant *Elt = CV->getAggregateElement(i);
+      if (!Elt)
+        return nullptr;
+
+      if (isa<UndefValue>(Elt))
+        continue;
+
+      if (!isa<ConstantInt>(Elt))
+        return nullptr;
+    }
+    return ConstantExpr::getNeg(CV);
+  }
+
   return nullptr;
 }
 
@@ -902,7 +918,11 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
       // Beware of ConstantExpr:  it may eventually evaluate to getNullValue,
       // even if currently isNullValue gives false.
       Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i));
-      if (InC && !isa<ConstantExpr>(InC))
+      // For vector constants, we cannot use isNullValue to fold into
+      // FalseVInPred versus TrueVInPred. When we have individual nonzero
+      // elements in the vector, we will incorrectly fold InC to
+      // `TrueVInPred`.
+      if (InC && !isa<ConstantExpr>(InC) && isa<ConstantInt>(InC))
         InV = InC->isNullValue() ? FalseVInPred : TrueVInPred;
       else
         InV = Builder->CreateSelect(PN->getIncomingValue(i),
@@ -927,11 +947,15 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
     Constant *C = cast<Constant>(I.getOperand(1));
     for (unsigned i = 0; i != NumPHIValues; ++i) {
       Value *InV = nullptr;
-      if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
+      if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i))) {
         InV = ConstantExpr::get(I.getOpcode(), InC, C);
-      else
+      } else {
         InV = Builder->CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
                                    PN->getIncomingValue(i), C, "phitmp");
+        auto *FPInst = dyn_cast<Instruction>(InV);
+        if (FPInst && isa<FPMathOperator>(FPInst))
+          FPInst->copyFastMathFlags(&I);
+      }
       NewPN->addIncoming(InV, PN->getIncomingBlock(i));
     }
   } else {
@@ -957,7 +981,7 @@ Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
   return replaceInstUsesWith(I, NewPN);
 }
 
-Instruction *InstCombiner::foldOpWithConstantIntoOperand(Instruction &I) {
+Instruction *InstCombiner::foldOpWithConstantIntoOperand(BinaryOperator &I) {
   assert(isa<Constant>(I.getOperand(1)) && "Unexpected operand type");
 
   if (auto *Sel = dyn_cast<SelectInst>(I.getOperand(0))) {
@@ -1315,22 +1339,19 @@ Value *InstCombiner::SimplifyVectorOp(BinaryOperator &Inst) {
   assert(cast<VectorType>(LHS->getType())->getNumElements() == VWidth);
   assert(cast<VectorType>(RHS->getType())->getNumElements() == VWidth);
 
-  // If both arguments of binary operation are shuffles, which use the same
-  // mask and shuffle within a single vector, it is worthwhile to move the
-  // shuffle after binary operation:
+  // If both arguments of the binary operation are shuffles that use the same
+  // mask and shuffle within a single vector, move the shuffle after the binop:
   //   Op(shuffle(v1, m), shuffle(v2, m)) -> shuffle(Op(v1, v2), m)
-  if (isa<ShuffleVectorInst>(LHS) && isa<ShuffleVectorInst>(RHS)) {
-    ShuffleVectorInst *LShuf = cast<ShuffleVectorInst>(LHS);
-    ShuffleVectorInst *RShuf = cast<ShuffleVectorInst>(RHS);
-    if (isa<UndefValue>(LShuf->getOperand(1)) &&
-        isa<UndefValue>(RShuf->getOperand(1)) &&
-        LShuf->getOperand(0)->getType() == RShuf->getOperand(0)->getType() &&
-        LShuf->getMask() == RShuf->getMask()) {
-      Value *NewBO = CreateBinOpAsGiven(Inst, LShuf->getOperand(0),
-          RShuf->getOperand(0), Builder);
-      return Builder->CreateShuffleVector(NewBO,
-          UndefValue::get(NewBO->getType()), LShuf->getMask());
-    }
+  auto *LShuf = dyn_cast<ShuffleVectorInst>(LHS);
+  auto *RShuf = dyn_cast<ShuffleVectorInst>(RHS);
+  if (LShuf && RShuf && LShuf->getMask() == RShuf->getMask() &&
+      isa<UndefValue>(LShuf->getOperand(1)) &&
+      isa<UndefValue>(RShuf->getOperand(1)) &&
+      LShuf->getOperand(0)->getType() == RShuf->getOperand(0)->getType()) {
+    Value *NewBO = CreateBinOpAsGiven(Inst, LShuf->getOperand(0),
+                                      RShuf->getOperand(0), Builder);
+    return Builder->CreateShuffleVector(
+        NewBO, UndefValue::get(NewBO->getType()), LShuf->getMask());
   }
 
   // If one argument is a shuffle within one vector, the other is a constant,
@@ -1648,14 +1669,14 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
     }
   }
 
-  // Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
-  Value *StrippedPtr = PtrOp->stripPointerCasts();
-  PointerType *StrippedPtrTy = dyn_cast<PointerType>(StrippedPtr->getType());
-
   // We do not handle pointer-vector geps here.
-  if (!StrippedPtrTy)
+  if (GEP.getType()->isVectorTy())
     return nullptr;
 
+  // Handle gep(bitcast x) and gep(gep x, 0, 0, 0).
+  Value *StrippedPtr = PtrOp->stripPointerCasts();
+  PointerType *StrippedPtrTy = cast<PointerType>(StrippedPtr->getType());
+
   if (StrippedPtr != PtrOp) {
     bool HasZeroPointerIndex = false;
     if (ConstantInt *C = dyn_cast<ConstantInt>(GEP.getOperand(1)))
@@ -2233,11 +2254,11 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
   ConstantInt *AddRHS;
   if (match(Cond, m_Add(m_Value(Op0), m_ConstantInt(AddRHS)))) {
     // Change 'switch (X+4) case 1:' into 'switch (X) case -3'.
-    for (SwitchInst::CaseIt CaseIter : SI.cases()) {
-      Constant *NewCase = ConstantExpr::getSub(CaseIter.getCaseValue(), AddRHS);
+    for (auto Case : SI.cases()) {
+      Constant *NewCase = ConstantExpr::getSub(Case.getCaseValue(), AddRHS);
       assert(isa<ConstantInt>(NewCase) &&
              "Result of expression should be constant");
-      CaseIter.setValue(cast<ConstantInt>(NewCase));
+      Case.setValue(cast<ConstantInt>(NewCase));
     }
     SI.setCondition(Op0);
     return &SI;
@@ -2269,9 +2290,9 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
     Value *NewCond = Builder->CreateTrunc(Cond, Ty, "trunc");
     SI.setCondition(NewCond);
 
-    for (SwitchInst::CaseIt CaseIter : SI.cases()) {
-      APInt TruncatedCase = CaseIter.getCaseValue()->getValue().trunc(NewWidth);
-      CaseIter.setValue(ConstantInt::get(SI.getContext(), TruncatedCase));
+    for (auto Case : SI.cases()) {
+      APInt TruncatedCase = Case.getCaseValue()->getValue().trunc(NewWidth);
+      Case.setValue(ConstantInt::get(SI.getContext(), TruncatedCase));
     }
     return &SI;
   }
@@ -2928,8 +2949,8 @@ bool InstCombiner::run() {
         Result->takeName(I);
 
         // Push the new instruction and any users onto the worklist.
-        Worklist.Add(Result);
         Worklist.AddUsersToWorkList(*Result);
+        Worklist.Add(Result);
 
         // Insert the new instruction into the basic block...
         BasicBlock *InstParent = I->getParent();
@@ -2952,8 +2973,8 @@ bool InstCombiner::run() {
         if (isInstructionTriviallyDead(I, &TLI)) {
           eraseInstFromFunction(*I);
         } else {
-          Worklist.Add(I);
           Worklist.AddUsersToWorkList(*I);
+          Worklist.Add(I);
         }
       }
       MadeIRChange = true;
@@ -3016,12 +3037,11 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
         }
 
       // See if we can constant fold its operands.
-      for (User::op_iterator i = Inst->op_begin(), e = Inst->op_end(); i != e;
-           ++i) {
-        if (!isa<ConstantVector>(i) && !isa<ConstantExpr>(i))
+      for (Use &U : Inst->operands()) {
+        if (!isa<ConstantVector>(U) && !isa<ConstantExpr>(U))
           continue;
 
-        auto *C = cast<Constant>(i);
+        auto *C = cast<Constant>(U);
         Constant *&FoldRes = FoldedConstants[C];
         if (!FoldRes)
           FoldRes = ConstantFoldConstant(C, DL, TLI);
@@ -3029,7 +3049,10 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
           FoldRes = C;
 
         if (FoldRes != C) {
-          *i = FoldRes;
+          DEBUG(dbgs() << "IC: ConstFold operand of: " << *Inst
+                       << "\n    Old = " << *C
+                       << "\n    New = " << *FoldRes << '\n');
+          U = FoldRes;
           MadeIRChange = true;
         }
       }
@@ -3049,17 +3072,7 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
       }
     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
       if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) {
-        // See if this is an explicit destination.
-        for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
-             i != e; ++i)
-          if (i.getCaseValue() == Cond) {
-            BasicBlock *ReachableBB = i.getCaseSuccessor();
-            Worklist.push_back(ReachableBB);
-            continue;
-          }
-
-        // Otherwise it is the default destination.
-        Worklist.push_back(SI->getDefaultDest());
+        Worklist.push_back(SI->findCaseValue(Cond)->getCaseSuccessor());
         continue;
       }
     }
@@ -3146,6 +3159,7 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist,
 
     InstCombiner IC(Worklist, &Builder, F.optForMinSize(), ExpensiveCombines,
                     AA, AC, TLI, DT, DL, LI);
+    IC.MaxArraySizeForCombine = MaxArraySize;
     Changed |= IC.run();
 
     if (!Changed)
diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index c17484effdcf3aa481afbeddda023e66b41582df..94cfc69ed5551342f6d37545d9e3be01df0b5b46 100644
--- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -80,6 +80,7 @@ static const uint64_t kMIPS64_ShadowOffset64 = 1ULL << 37;
 static const uint64_t kAArch64_ShadowOffset64 = 1ULL << 36;
 static const uint64_t kFreeBSD_ShadowOffset32 = 1ULL << 30;
 static const uint64_t kFreeBSD_ShadowOffset64 = 1ULL << 46;
+static const uint64_t kPS4CPU_ShadowOffset64 = 1ULL << 40;
 static const uint64_t kWindowsShadowOffset32 = 3ULL << 28;
 // The shadow memory space is dynamically allocated.
 static const uint64_t kWindowsShadowOffset64 = kDynamicShadowSentinel;
@@ -380,6 +381,7 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
   bool IsAndroid = TargetTriple.isAndroid();
   bool IsIOS = TargetTriple.isiOS() || TargetTriple.isWatchOS();
   bool IsFreeBSD = TargetTriple.isOSFreeBSD();
+  bool IsPS4CPU = TargetTriple.isPS4CPU();
   bool IsLinux = TargetTriple.isOSLinux();
   bool IsPPC64 = TargetTriple.getArch() == llvm::Triple::ppc64 ||
                  TargetTriple.getArch() == llvm::Triple::ppc64le;
@@ -392,6 +394,7 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
                   TargetTriple.getArch() == llvm::Triple::mips64el;
   bool IsAArch64 = TargetTriple.getArch() == llvm::Triple::aarch64;
   bool IsWindows = TargetTriple.isOSWindows();
+  bool IsFuchsia = TargetTriple.isOSFuchsia();
 
   ShadowMapping Mapping;
 
@@ -412,12 +415,18 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
     else
       Mapping.Offset = kDefaultShadowOffset32;
   } else {  // LongSize == 64
-    if (IsPPC64)
+    // Fuchsia is always PIE, which means that the beginning of the address
+    // space is always available.
+    if (IsFuchsia)
+      Mapping.Offset = 0;
+    else if (IsPPC64)
       Mapping.Offset = kPPC64_ShadowOffset64;
     else if (IsSystemZ)
       Mapping.Offset = kSystemZ_ShadowOffset64;
     else if (IsFreeBSD)
       Mapping.Offset = kFreeBSD_ShadowOffset64;
+    else if (IsPS4CPU)
+      Mapping.Offset = kPS4CPU_ShadowOffset64;
     else if (IsLinux && IsX86_64) {
       if (IsKasan)
         Mapping.Offset = kLinuxKasan_ShadowOffset64;
@@ -456,9 +465,9 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
   // offset is not necessary 1/8-th of the address space.  On SystemZ,
   // we could OR the constant in a single instruction, but it's more
   // efficient to load it once and use indexed addressing.
-  Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 && !IsSystemZ
-                           && !(Mapping.Offset & (Mapping.Offset - 1))
-                           && Mapping.Offset != kDynamicShadowSentinel;
+  Mapping.OrShadowOffset = !IsAArch64 && !IsPPC64 && !IsSystemZ && !IsPS4CPU &&
+                           !(Mapping.Offset & (Mapping.Offset - 1)) &&
+                           Mapping.Offset != kDynamicShadowSentinel;
 
   return Mapping;
 }
@@ -567,8 +576,6 @@ struct AddressSanitizer : public FunctionPass {
   Type *IntptrTy;
   ShadowMapping Mapping;
   DominatorTree *DT;
-  Function *AsanCtorFunction = nullptr;
-  Function *AsanInitFunction = nullptr;
   Function *AsanHandleNoReturnFunc;
   Function *AsanPtrCmpFunction, *AsanPtrSubFunction;
   // This array is indexed by AccessIsWrite, Experiment and log2(AccessSize).
@@ -1013,7 +1020,9 @@ bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
        (!ClSkipPromotableAllocas || !isAllocaPromotable(&AI)) &&
        // inalloca allocas are not treated as static, and we don't want
        // dynamic alloca instrumentation for them as well.
-       !AI.isUsedWithInAlloca());
+       !AI.isUsedWithInAlloca() &&
+       // swifterror allocas are register promoted by ISel
+       !AI.isSwiftError());
 
   ProcessedAllocas[&AI] = IsInteresting;
   return IsInteresting;
@@ -1088,12 +1097,19 @@ Value *AddressSanitizer::isInterestingMemoryAccess(Instruction *I,
     }
   }
 
-  // Do not instrument acesses from different address spaces; we cannot deal
-  // with them.
   if (PtrOperand) {
+    // Do not instrument accesses from different address spaces; we cannot deal
+    // with them.
     Type *PtrTy = cast<PointerType>(PtrOperand->getType()->getScalarType());
     if (PtrTy->getPointerAddressSpace() != 0)
       return nullptr;
+
+    // Ignore swifterror addresses.
+    // swifterror memory addresses are mem2reg promoted by instruction
+    // selection. As such they cannot have regular uses like an instrumentation
+    // function and it makes no sense to track them as memory.
+    if (PtrOperand->isSwiftError())
+      return nullptr;
   }
 
   // Treat memory accesses to promotable allocas as non-interesting since they
@@ -1552,31 +1568,31 @@ void AddressSanitizerModule::initializeCallbacks(Module &M) {
 
   // Declare our poisoning and unpoisoning functions.
   AsanPoisonGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-      kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy, nullptr));
+      kAsanPoisonGlobalsName, IRB.getVoidTy(), IntptrTy));
   AsanPoisonGlobals->setLinkage(Function::ExternalLinkage);
   AsanUnpoisonGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-      kAsanUnpoisonGlobalsName, IRB.getVoidTy(), nullptr));
+      kAsanUnpoisonGlobalsName, IRB.getVoidTy()));
   AsanUnpoisonGlobals->setLinkage(Function::ExternalLinkage);
 
   // Declare functions that register/unregister globals.
   AsanRegisterGlobals = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-      kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+      kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy));
   AsanRegisterGlobals->setLinkage(Function::ExternalLinkage);
   AsanUnregisterGlobals = checkSanitizerInterfaceFunction(
       M.getOrInsertFunction(kAsanUnregisterGlobalsName, IRB.getVoidTy(),
-                            IntptrTy, IntptrTy, nullptr));
+                            IntptrTy, IntptrTy));
   AsanUnregisterGlobals->setLinkage(Function::ExternalLinkage);
 
   // Declare the functions that find globals in a shared object and then invoke
   // the (un)register function on them.
   AsanRegisterImageGlobals =
       checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-          kAsanRegisterImageGlobalsName, IRB.getVoidTy(), IntptrTy, nullptr));
+          kAsanRegisterImageGlobalsName, IRB.getVoidTy(), IntptrTy));
   AsanRegisterImageGlobals->setLinkage(Function::ExternalLinkage);
 
   AsanUnregisterImageGlobals =
       checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-          kAsanUnregisterImageGlobalsName, IRB.getVoidTy(), IntptrTy, nullptr));
+          kAsanUnregisterImageGlobalsName, IRB.getVoidTy(), IntptrTy));
   AsanUnregisterImageGlobals->setLinkage(Function::ExternalLinkage);
 }
 
@@ -1609,11 +1625,12 @@ void AddressSanitizerModule::SetComdatForGlobalMetadata(
 GlobalVariable *
 AddressSanitizerModule::CreateMetadataGlobal(Module &M, Constant *Initializer,
                                              StringRef OriginalName) {
-  GlobalVariable *Metadata =
-      new GlobalVariable(M, Initializer->getType(), false,
-                         GlobalVariable::InternalLinkage, Initializer,
-                         Twine("__asan_global_") +
-                             GlobalValue::getRealLinkageName(OriginalName));
+  auto Linkage = TargetTriple.isOSBinFormatMachO()
+                     ? GlobalVariable::InternalLinkage
+                     : GlobalVariable::PrivateLinkage;
+  GlobalVariable *Metadata = new GlobalVariable(
+      M, Initializer->getType(), false, Linkage, Initializer,
+      Twine("__asan_global_") + GlobalValue::getRealLinkageName(OriginalName));
   Metadata->setSection(getGlobalMetadataSection());
   return Metadata;
 }
@@ -1918,13 +1935,19 @@ bool AddressSanitizerModule::runOnModule(Module &M) {
   Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel);
   initializeCallbacks(M);
 
-  bool Changed = false;
+  if (CompileKernel)
+    return false;
 
+  Function *AsanCtorFunction;
+  std::tie(AsanCtorFunction, std::ignore) = createSanitizerCtorAndInitFunctions(
+      M, kAsanModuleCtorName, kAsanInitName, /*InitArgTypes=*/{},
+      /*InitArgs=*/{}, kAsanVersionCheckName);
+  appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority);
+
+  bool Changed = false;
   // TODO(glider): temporarily disabled globals instrumentation for KASan.
-  if (ClGlobals && !CompileKernel) {
-    Function *CtorFunc = M.getFunction(kAsanModuleCtorName);
-    assert(CtorFunc);
-    IRBuilder<> IRB(CtorFunc->getEntryBlock().getTerminator());
+  if (ClGlobals) {
+    IRBuilder<> IRB(AsanCtorFunction->getEntryBlock().getTerminator());
     Changed |= InstrumentGlobals(IRB, M);
   }
 
@@ -1941,49 +1964,60 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
       const std::string ExpStr = Exp ? "exp_" : "";
       const std::string SuffixStr = CompileKernel ? "N" : "_n";
       const std::string EndingStr = Recover ? "_noabort" : "";
-      Type *ExpType = Exp ? Type::getInt32Ty(*C) : nullptr;
-      AsanErrorCallbackSized[AccessIsWrite][Exp] =
-          checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-              kAsanReportErrorTemplate + ExpStr + TypeStr + SuffixStr + EndingStr,
-              IRB.getVoidTy(), IntptrTy, IntptrTy, ExpType, nullptr));
-      AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] =
-          checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-              ClMemoryAccessCallbackPrefix + ExpStr + TypeStr + "N" + EndingStr,
-              IRB.getVoidTy(), IntptrTy, IntptrTy, ExpType, nullptr));
-      for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
-           AccessSizeIndex++) {
-        const std::string Suffix = TypeStr + itostr(1ULL << AccessSizeIndex);
-        AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] =
-            checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-                kAsanReportErrorTemplate + ExpStr + Suffix + EndingStr,
-                IRB.getVoidTy(), IntptrTy, ExpType, nullptr));
-        AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] =
-            checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-                ClMemoryAccessCallbackPrefix + ExpStr + Suffix + EndingStr,
-                IRB.getVoidTy(), IntptrTy, ExpType, nullptr));
+
+      SmallVector<Type *, 3> Args2 = {IntptrTy, IntptrTy};
+      SmallVector<Type *, 2> Args1{1, IntptrTy};
+      if (Exp) {
+        Type *ExpType = Type::getInt32Ty(*C);
+        Args2.push_back(ExpType);
+        Args1.push_back(ExpType);
       }
-    }
+	    AsanErrorCallbackSized[AccessIsWrite][Exp] =
+	        checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+	            kAsanReportErrorTemplate + ExpStr + TypeStr + SuffixStr +
+	                EndingStr,
+	            FunctionType::get(IRB.getVoidTy(), Args2, false)));
+
+	    AsanMemoryAccessCallbackSized[AccessIsWrite][Exp] =
+	        checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+	            ClMemoryAccessCallbackPrefix + ExpStr + TypeStr + "N" + EndingStr,
+	            FunctionType::get(IRB.getVoidTy(), Args2, false)));
+
+	    for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
+	         AccessSizeIndex++) {
+	      const std::string Suffix = TypeStr + itostr(1ULL << AccessSizeIndex);
+	      AsanErrorCallback[AccessIsWrite][Exp][AccessSizeIndex] =
+	          checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+	              kAsanReportErrorTemplate + ExpStr + Suffix + EndingStr,
+	              FunctionType::get(IRB.getVoidTy(), Args1, false)));
+
+	      AsanMemoryAccessCallback[AccessIsWrite][Exp][AccessSizeIndex] =
+	          checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+	              ClMemoryAccessCallbackPrefix + ExpStr + Suffix + EndingStr,
+	              FunctionType::get(IRB.getVoidTy(), Args1, false)));
+	    }
+	  }
   }
 
   const std::string MemIntrinCallbackPrefix =
       CompileKernel ? std::string("") : ClMemoryAccessCallbackPrefix;
   AsanMemmove = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
       MemIntrinCallbackPrefix + "memmove", IRB.getInt8PtrTy(),
-      IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, nullptr));
+      IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy));
   AsanMemcpy = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
       MemIntrinCallbackPrefix + "memcpy", IRB.getInt8PtrTy(),
-      IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy, nullptr));
+      IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IntptrTy));
   AsanMemset = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
       MemIntrinCallbackPrefix + "memset", IRB.getInt8PtrTy(),
-      IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy, nullptr));
+      IRB.getInt8PtrTy(), IRB.getInt32Ty(), IntptrTy));
 
   AsanHandleNoReturnFunc = checkSanitizerInterfaceFunction(
-      M.getOrInsertFunction(kAsanHandleNoReturnName, IRB.getVoidTy(), nullptr));
+      M.getOrInsertFunction(kAsanHandleNoReturnName, IRB.getVoidTy()));
 
   AsanPtrCmpFunction = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-      kAsanPtrCmp, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+      kAsanPtrCmp, IRB.getVoidTy(), IntptrTy, IntptrTy));
   AsanPtrSubFunction = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-      kAsanPtrSub, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+      kAsanPtrSub, IRB.getVoidTy(), IntptrTy, IntptrTy));
   // We insert an empty inline asm after __asan_report* to avoid callback merge.
   EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
                             StringRef(""), StringRef(""),
@@ -1993,7 +2027,6 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
 // virtual
 bool AddressSanitizer::doInitialization(Module &M) {
   // Initialize the private fields. No one has accessed them before.
-
   GlobalsMD.init(M);
 
   C = &(M.getContext());
@@ -2001,13 +2034,6 @@ bool AddressSanitizer::doInitialization(Module &M) {
   IntptrTy = Type::getIntNTy(*C, LongSize);
   TargetTriple = Triple(M.getTargetTriple());
 
-  if (!CompileKernel) {
-    std::tie(AsanCtorFunction, AsanInitFunction) =
-        createSanitizerCtorAndInitFunctions(
-            M, kAsanModuleCtorName, kAsanInitName,
-            /*InitArgTypes=*/{}, /*InitArgs=*/{}, kAsanVersionCheckName);
-    appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority);
-  }
   Mapping = getShadowMapping(TargetTriple, LongSize, CompileKernel);
   return true;
 }
@@ -2026,6 +2052,8 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
   // We cannot just ignore these methods, because they may call other
   // instrumented functions.
   if (F.getName().find(" load]") != std::string::npos) {
+    Function *AsanInitFunction =
+        declareSanitizerInitFunction(*F.getParent(), kAsanInitName, {});
     IRBuilder<> IRB(&F.front(), F.front().begin());
     IRB.CreateCall(AsanInitFunction, {});
     return true;
@@ -2073,7 +2101,6 @@ void AddressSanitizer::markEscapedLocalAllocas(Function &F) {
 }
 
 bool AddressSanitizer::runOnFunction(Function &F) {
-  if (&F == AsanCtorFunction) return false;
   if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
   if (!ClDebugFunc.empty() && ClDebugFunc == F.getName()) return false;
   if (F.getName().startswith("__asan_")) return false;
@@ -2167,8 +2194,9 @@ bool AddressSanitizer::runOnFunction(Function &F) {
       (ClInstrumentationWithCallsThreshold >= 0 &&
        ToInstrument.size() > (unsigned)ClInstrumentationWithCallsThreshold);
   const DataLayout &DL = F.getParent()->getDataLayout();
-  ObjectSizeOffsetVisitor ObjSizeVis(DL, TLI, F.getContext(),
-                                     /*RoundToAlign=*/true);
+  ObjectSizeOpts ObjSizeOpts;
+  ObjSizeOpts.RoundToAlign = true;
+  ObjectSizeOffsetVisitor ObjSizeVis(DL, TLI, F.getContext(), ObjSizeOpts);
 
   // Instrument.
   int NumInstrumented = 0;
@@ -2226,18 +2254,18 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) {
     std::string Suffix = itostr(i);
     AsanStackMallocFunc[i] = checkSanitizerInterfaceFunction(
         M.getOrInsertFunction(kAsanStackMallocNameTemplate + Suffix, IntptrTy,
-                              IntptrTy, nullptr));
+                              IntptrTy));
     AsanStackFreeFunc[i] = checkSanitizerInterfaceFunction(
         M.getOrInsertFunction(kAsanStackFreeNameTemplate + Suffix,
-                              IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+                              IRB.getVoidTy(), IntptrTy, IntptrTy));
   }
   if (ASan.UseAfterScope) {
     AsanPoisonStackMemoryFunc = checkSanitizerInterfaceFunction(
         M.getOrInsertFunction(kAsanPoisonStackMemoryName, IRB.getVoidTy(),
-                              IntptrTy, IntptrTy, nullptr));
+                              IntptrTy, IntptrTy));
     AsanUnpoisonStackMemoryFunc = checkSanitizerInterfaceFunction(
         M.getOrInsertFunction(kAsanUnpoisonStackMemoryName, IRB.getVoidTy(),
-                              IntptrTy, IntptrTy, nullptr));
+                              IntptrTy, IntptrTy));
   }
 
   for (size_t Val : {0x00, 0xf1, 0xf2, 0xf3, 0xf5, 0xf8}) {
@@ -2246,14 +2274,14 @@ void FunctionStackPoisoner::initializeCallbacks(Module &M) {
     Name << std::setw(2) << std::setfill('0') << std::hex << Val;
     AsanSetShadowFunc[Val] =
         checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-            Name.str(), IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+            Name.str(), IRB.getVoidTy(), IntptrTy, IntptrTy));
   }
 
   AsanAllocaPoisonFunc = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-      kAsanAllocaPoison, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+      kAsanAllocaPoison, IRB.getVoidTy(), IntptrTy, IntptrTy));
   AsanAllocasUnpoisonFunc =
       checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-          kAsanAllocasUnpoison, IRB.getVoidTy(), IntptrTy, IntptrTy, nullptr));
+          kAsanAllocasUnpoison, IRB.getVoidTy(), IntptrTy, IntptrTy));
 }
 
 void FunctionStackPoisoner::copyToShadowInline(ArrayRef<uint8_t> ShadowMask,
diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index b34d5b8c45a719a5e577ed0c215810598eee444b..4e454f0c95b6598b69921a79b1f25ba592988216 100644
--- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -254,7 +254,7 @@ class DataFlowSanitizer : public ModulePass {
   MDNode *ColdCallWeights;
   DFSanABIList ABIList;
   DenseMap<Value *, Function *> UnwrappedFnMap;
-  AttributeSet ReadOnlyNoneAttrs;
+  AttributeList ReadOnlyNoneAttrs;
   bool DFSanRuntimeShadowMask;
 
   Value *getShadowAddress(Value *Addr, Instruction *Pos);
@@ -331,6 +331,10 @@ class DFSanVisitor : public InstVisitor<DFSanVisitor> {
   DFSanFunction &DFSF;
   DFSanVisitor(DFSanFunction &DFSF) : DFSF(DFSF) {}
 
+  const DataLayout &getDataLayout() const {
+    return DFSF.F->getParent()->getDataLayout();
+  }
+
   void visitOperandShadowInst(Instruction &I);
 
   void visitBinaryOperator(BinaryOperator &BO);
@@ -539,16 +543,17 @@ DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName,
                                     F->getParent());
   NewF->copyAttributesFrom(F);
   NewF->removeAttributes(
-    AttributeSet::ReturnIndex,
-    AttributeSet::get(F->getContext(), AttributeSet::ReturnIndex,
-                    AttributeFuncs::typeIncompatible(NewFT->getReturnType())));
+      AttributeList::ReturnIndex,
+      AttributeList::get(
+          F->getContext(), AttributeList::ReturnIndex,
+          AttributeFuncs::typeIncompatible(NewFT->getReturnType())));
 
   BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF);
   if (F->isVarArg()) {
     NewF->removeAttributes(
-        AttributeSet::FunctionIndex,
-        AttributeSet().addAttribute(*Ctx, AttributeSet::FunctionIndex,
-                                    "split-stack"));
+        AttributeList::FunctionIndex,
+        AttributeList().addAttribute(*Ctx, AttributeList::FunctionIndex,
+                                     "split-stack"));
     CallInst::Create(DFSanVarargWrapperFn,
                      IRBuilder<>(BB).CreateGlobalStringPtr(F->getName()), "",
                      BB);
@@ -580,8 +585,7 @@ Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
     Function::arg_iterator AI = F->arg_begin(); ++AI;
     for (unsigned N = FT->getNumParams(); N != 0; ++AI, --N)
       Args.push_back(&*AI);
-    CallInst *CI =
-        CallInst::Create(&F->getArgumentList().front(), Args, "", BB);
+    CallInst *CI = CallInst::Create(&*F->arg_begin(), Args, "", BB);
     ReturnInst *RI;
     if (FT->getReturnType()->isVoidTy())
       RI = ReturnInst::Create(*Ctx, BB);
@@ -595,7 +599,7 @@ Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
     DFSanVisitor(DFSF).visitCallInst(*CI);
     if (!FT->getReturnType()->isVoidTy())
       new StoreInst(DFSF.getShadow(RI->getReturnValue()),
-                    &F->getArgumentList().back(), RI);
+                    &*std::prev(F->arg_end()), RI);
   }
 
   return C;
@@ -622,26 +626,26 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
 
   DFSanUnionFn = Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy);
   if (Function *F = dyn_cast<Function>(DFSanUnionFn)) {
-    F->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
-    F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
-    F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+    F->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
+    F->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
+    F->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
     F->addAttribute(1, Attribute::ZExt);
     F->addAttribute(2, Attribute::ZExt);
   }
   DFSanCheckedUnionFn = Mod->getOrInsertFunction("dfsan_union", DFSanUnionFnTy);
   if (Function *F = dyn_cast<Function>(DFSanCheckedUnionFn)) {
-    F->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
-    F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
-    F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+    F->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
+    F->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
+    F->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
     F->addAttribute(1, Attribute::ZExt);
     F->addAttribute(2, Attribute::ZExt);
   }
   DFSanUnionLoadFn =
       Mod->getOrInsertFunction("__dfsan_union_load", DFSanUnionLoadFnTy);
   if (Function *F = dyn_cast<Function>(DFSanUnionLoadFn)) {
-    F->addAttribute(AttributeSet::FunctionIndex, Attribute::NoUnwind);
-    F->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadOnly);
-    F->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+    F->addAttribute(AttributeList::FunctionIndex, Attribute::NoUnwind);
+    F->addAttribute(AttributeList::FunctionIndex, Attribute::ReadOnly);
+    F->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
   }
   DFSanUnimplementedFn =
       Mod->getOrInsertFunction("__dfsan_unimplemented", DFSanUnimplementedFnTy);
@@ -696,7 +700,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
 
   AttrBuilder B;
   B.addAttribute(Attribute::ReadOnly).addAttribute(Attribute::ReadNone);
-  ReadOnlyNoneAttrs = AttributeSet::get(*Ctx, AttributeSet::FunctionIndex, B);
+  ReadOnlyNoneAttrs = AttributeList::get(*Ctx, AttributeList::FunctionIndex, B);
 
   // First, change the ABI of every function in the module.  ABI-listed
   // functions keep their original ABI and get a wrapper function.
@@ -717,9 +721,10 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
         Function *NewF = Function::Create(NewFT, F.getLinkage(), "", &M);
         NewF->copyAttributesFrom(&F);
         NewF->removeAttributes(
-          AttributeSet::ReturnIndex,
-          AttributeSet::get(NewF->getContext(), AttributeSet::ReturnIndex,
-                    AttributeFuncs::typeIncompatible(NewFT->getReturnType())));
+            AttributeList::ReturnIndex,
+            AttributeList::get(
+                NewF->getContext(), AttributeList::ReturnIndex,
+                AttributeFuncs::typeIncompatible(NewFT->getReturnType())));
         for (Function::arg_iterator FArg = F.arg_begin(),
                                     NewFArg = NewF->arg_begin(),
                                     FArgEnd = F.arg_end();
@@ -758,7 +763,7 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
           &F, std::string("dfsw$") + std::string(F.getName()),
           GlobalValue::LinkOnceODRLinkage, NewFT);
       if (getInstrumentedABI() == IA_TLS)
-        NewF->removeAttributes(AttributeSet::FunctionIndex, ReadOnlyNoneAttrs);
+        NewF->removeAttributes(AttributeList::FunctionIndex, ReadOnlyNoneAttrs);
 
       Value *WrappedFnCst =
           ConstantExpr::getBitCast(NewF, PointerType::getUnqual(FT));
@@ -906,7 +911,7 @@ Value *DFSanFunction::getShadow(Value *V) {
         break;
       }
       case DataFlowSanitizer::IA_Args: {
-        unsigned ArgIdx = A->getArgNo() + F->getArgumentList().size() / 2;
+        unsigned ArgIdx = A->getArgNo() + F->arg_size() / 2;
         Function::arg_iterator i = F->arg_begin();
         while (ArgIdx--)
           ++i;
@@ -983,7 +988,7 @@ Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) {
   IRBuilder<> IRB(Pos);
   if (AvoidNewBlocks) {
     CallInst *Call = IRB.CreateCall(DFS.DFSanCheckedUnionFn, {V1, V2});
-    Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+    Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
     Call->addAttribute(1, Attribute::ZExt);
     Call->addAttribute(2, Attribute::ZExt);
 
@@ -996,7 +1001,7 @@ Value *DFSanFunction::combineShadows(Value *V1, Value *V2, Instruction *Pos) {
         Ne, Pos, /*Unreachable=*/false, DFS.ColdCallWeights, &DT));
     IRBuilder<> ThenIRB(BI);
     CallInst *Call = ThenIRB.CreateCall(DFS.DFSanUnionFn, {V1, V2});
-    Call->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+    Call->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
     Call->addAttribute(1, Attribute::ZExt);
     Call->addAttribute(2, Attribute::ZExt);
 
@@ -1099,7 +1104,7 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
     CallInst *FallbackCall = FallbackIRB.CreateCall(
         DFS.DFSanUnionLoadFn,
         {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
-    FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+    FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
 
     // Compare each of the shadows stored in the loaded 64 bits to each other,
     // by computing (WideShadow rotl ShadowWidth) == WideShadow.
@@ -1156,7 +1161,7 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
   IRBuilder<> IRB(Pos);
   CallInst *FallbackCall = IRB.CreateCall(
       DFS.DFSanUnionLoadFn, {ShadowAddr, ConstantInt::get(DFS.IntptrTy, Size)});
-  FallbackCall->addAttribute(AttributeSet::ReturnIndex, Attribute::ZExt);
+  FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
   return FallbackCall;
 }
 
@@ -1446,7 +1451,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
 
           // Custom functions returning non-void will write to the return label.
           if (!FT->getReturnType()->isVoidTy()) {
-            CustomFn->removeAttributes(AttributeSet::FunctionIndex,
+            CustomFn->removeAttributes(AttributeList::FunctionIndex,
                                        DFSF.DFS.ReadOnlyNoneAttrs);
           }
         }
@@ -1481,7 +1486,8 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
           auto *LabelVATy = ArrayType::get(DFSF.DFS.ShadowTy,
                                            CS.arg_size() - FT->getNumParams());
           auto *LabelVAAlloca = new AllocaInst(
-              LabelVATy, "labelva", &DFSF.F->getEntryBlock().front());
+              LabelVATy, getDataLayout().getAllocaAddrSpace(),
+              "labelva", &DFSF.F->getEntryBlock().front());
 
           for (unsigned n = 0; i != CS.arg_end(); ++i, ++n) {
             auto LabelVAPtr = IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, n);
@@ -1494,8 +1500,9 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
         if (!FT->getReturnType()->isVoidTy()) {
           if (!DFSF.LabelReturnAlloca) {
             DFSF.LabelReturnAlloca =
-                new AllocaInst(DFSF.DFS.ShadowTy, "labelreturn",
-                               &DFSF.F->getEntryBlock().front());
+              new AllocaInst(DFSF.DFS.ShadowTy,
+                             getDataLayout().getAllocaAddrSpace(),
+                             "labelreturn", &DFSF.F->getEntryBlock().front());
           }
           Args.push_back(DFSF.LabelReturnAlloca);
         }
@@ -1574,7 +1581,8 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
       unsigned VarArgSize = CS.arg_size() - FT->getNumParams();
       ArrayType *VarArgArrayTy = ArrayType::get(DFSF.DFS.ShadowTy, VarArgSize);
       AllocaInst *VarArgShadow =
-          new AllocaInst(VarArgArrayTy, "", &DFSF.F->getEntryBlock().front());
+        new AllocaInst(VarArgArrayTy, getDataLayout().getAllocaAddrSpace(),
+                       "", &DFSF.F->getEntryBlock().front());
       Args.push_back(IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, 0));
       for (unsigned n = 0; i != e; ++i, ++n) {
         IRB.CreateStore(
@@ -1593,7 +1601,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
     }
     NewCS.setCallingConv(CS.getCallingConv());
     NewCS.setAttributes(CS.getAttributes().removeAttributes(
-        *DFSF.DFS.Ctx, AttributeSet::ReturnIndex,
+        *DFSF.DFS.Ctx, AttributeList::ReturnIndex,
         AttributeFuncs::typeIncompatible(NewCS.getInstruction()->getType())));
 
     if (Next) {
diff --git a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
index 05eba6c4dc6995196afcd5187004d989327446bf..7dea1dee756acd6747630c1d307d2dbdf3f9cfd2 100644
--- a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp
@@ -267,35 +267,35 @@ void EfficiencySanitizer::initializeCallbacks(Module &M) {
     SmallString<32> AlignedLoadName("__esan_aligned_load" + ByteSizeStr);
     EsanAlignedLoad[Idx] =
         checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-            AlignedLoadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+            AlignedLoadName, IRB.getVoidTy(), IRB.getInt8PtrTy()));
     SmallString<32> AlignedStoreName("__esan_aligned_store" + ByteSizeStr);
     EsanAlignedStore[Idx] =
         checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-            AlignedStoreName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+            AlignedStoreName, IRB.getVoidTy(), IRB.getInt8PtrTy()));
     SmallString<32> UnalignedLoadName("__esan_unaligned_load" + ByteSizeStr);
     EsanUnalignedLoad[Idx] =
         checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-            UnalignedLoadName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+            UnalignedLoadName, IRB.getVoidTy(), IRB.getInt8PtrTy()));
     SmallString<32> UnalignedStoreName("__esan_unaligned_store" + ByteSizeStr);
     EsanUnalignedStore[Idx] =
         checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-            UnalignedStoreName, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+            UnalignedStoreName, IRB.getVoidTy(), IRB.getInt8PtrTy()));
   }
   EsanUnalignedLoadN = checkSanitizerInterfaceFunction(
       M.getOrInsertFunction("__esan_unaligned_loadN", IRB.getVoidTy(),
-                            IRB.getInt8PtrTy(), IntptrTy, nullptr));
+                            IRB.getInt8PtrTy(), IntptrTy));
   EsanUnalignedStoreN = checkSanitizerInterfaceFunction(
       M.getOrInsertFunction("__esan_unaligned_storeN", IRB.getVoidTy(),
-                            IRB.getInt8PtrTy(), IntptrTy, nullptr));
+                            IRB.getInt8PtrTy(), IntptrTy));
   MemmoveFn = checkSanitizerInterfaceFunction(
       M.getOrInsertFunction("memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
-                            IRB.getInt8PtrTy(), IntptrTy, nullptr));
+                            IRB.getInt8PtrTy(), IntptrTy));
   MemcpyFn = checkSanitizerInterfaceFunction(
       M.getOrInsertFunction("memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
-                            IRB.getInt8PtrTy(), IntptrTy, nullptr));
+                            IRB.getInt8PtrTy(), IntptrTy));
   MemsetFn = checkSanitizerInterfaceFunction(
       M.getOrInsertFunction("memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
-                            IRB.getInt32Ty(), IntptrTy, nullptr));
+                            IRB.getInt32Ty(), IntptrTy));
 }
 
 bool EfficiencySanitizer::shouldIgnoreStructType(StructType *StructTy) {
@@ -533,7 +533,7 @@ void EfficiencySanitizer::createDestructor(Module &M, Constant *ToolInfoArg) {
   IRBuilder<> IRB_Dtor(EsanDtorFunction->getEntryBlock().getTerminator());
   Function *EsanExit = checkSanitizerInterfaceFunction(
       M.getOrInsertFunction(EsanExitName, IRB_Dtor.getVoidTy(),
-                            Int8PtrTy, nullptr));
+                            Int8PtrTy));
   EsanExit->setLinkage(Function::ExternalLinkage);
   IRB_Dtor.CreateCall(EsanExit, {ToolInfoArg});
   appendToGlobalDtors(M, EsanDtorFunction, EsanCtorAndDtorPriority);
@@ -757,7 +757,7 @@ bool EfficiencySanitizer::instrumentGetElementPtr(Instruction *I, Module &M) {
     return false;
   }
   Type *SourceTy = GepInst->getSourceElementType();
-  StructType *StructTy;
+  StructType *StructTy = nullptr;
   ConstantInt *Idx;
   // Check if GEP calculates address from a struct array.
   if (isa<StructType>(SourceTy)) {
diff --git a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index 384407b7129c6be0a5c01fc723b989f680bc7c0e..61d627673c907076dee6d15df4125bb6523332cf 100644
--- a/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -1,4 +1,4 @@
-//===-- IndirectCallPromotion.cpp - Promote indirect calls to direct calls ===//
+//===-- IndirectCallPromotion.cpp - Optimizations based on value profiling ===//
 //
 //                      The LLVM Compiler Infrastructure
 //
@@ -17,6 +17,8 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
 #include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
 #include "llvm/Analysis/IndirectCallSiteVisitor.h"
 #include "llvm/IR/BasicBlock.h"
@@ -40,6 +42,7 @@
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
 #include "llvm/Transforms/Instrumentation.h"
 #include "llvm/Transforms/PGOInstrumentation.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -53,6 +56,8 @@ using namespace llvm;
 
 STATISTIC(NumOfPGOICallPromotion, "Number of indirect call promotions.");
 STATISTIC(NumOfPGOICallsites, "Number of indirect call candidate sites.");
+STATISTIC(NumOfPGOMemOPOpt, "Number of memop intrinsics optimized.");
+STATISTIC(NumOfPGOMemOPAnnotate, "Number of memop intrinsics annotated.");
 
 // Command line option to disable indirect-call promotion with the default as
 // false. This is for debug purpose.
@@ -80,6 +85,12 @@ static cl::opt<bool> ICPLTOMode("icp-lto", cl::init(false), cl::Hidden,
                                 cl::desc("Run indirect-call promotion in LTO "
                                          "mode"));
 
+// Set if the pass is called in SamplePGO mode. The difference for SamplePGO
+// mode is that it will add prof metadata to the created direct call.
+static cl::opt<bool>
+    ICPSamplePGOMode("icp-samplepgo", cl::init(false), cl::Hidden,
+                     cl::desc("Run indirect-call promotion in SamplePGO mode"));
+
 // If the option is set to true, only call instructions will be considered for
 // transformation -- invoke instructions will be ignored.
 static cl::opt<bool>
@@ -100,13 +111,51 @@ static cl::opt<bool>
     ICPDUMPAFTER("icp-dumpafter", cl::init(false), cl::Hidden,
                  cl::desc("Dump IR after transformation happens"));
 
+// The minimum call count to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+    MemOPCountThreshold("pgo-memop-count-threshold", cl::Hidden, cl::ZeroOrMore,
+                        cl::init(1000),
+                        cl::desc("The minimum count to optimize memory "
+                                 "intrinsic calls"));
+
+// Command line option to disable memory intrinsic optimization. The default is
+// false. This is for debug purpose.
+static cl::opt<bool> DisableMemOPOPT("disable-memop-opt", cl::init(false),
+                                     cl::Hidden, cl::desc("Disable optimize"));
+
+// The percent threshold to optimize memory intrinsic calls.
+static cl::opt<unsigned>
+    MemOPPercentThreshold("pgo-memop-percent-threshold", cl::init(40),
+                          cl::Hidden, cl::ZeroOrMore,
+                          cl::desc("The percentage threshold for the "
+                                   "memory intrinsic calls optimization"));
+
+// Maximum number of versions for optimizing memory intrinsic call.
+static cl::opt<unsigned>
+    MemOPMaxVersion("pgo-memop-max-version", cl::init(3), cl::Hidden,
+                    cl::ZeroOrMore,
+                    cl::desc("The max version for the optimized memory "
+                             " intrinsic calls"));
+
+// Scale the counts from the annotation using the BB count value.
+static cl::opt<bool>
+    MemOPScaleCount("pgo-memop-scale-count", cl::init(true), cl::Hidden,
+                    cl::desc("Scale the memop size counts using the basic "
+                             " block count value"));
+
+// This option sets the range of precise profile memop sizes.
+extern cl::opt<std::string> MemOPSizeRange;
+
+// This option sets the value that groups large memop sizes.
+extern cl::opt<unsigned> MemOPSizeLarge;
+
 namespace {
 class PGOIndirectCallPromotionLegacyPass : public ModulePass {
 public:
   static char ID;
 
-  PGOIndirectCallPromotionLegacyPass(bool InLTO = false)
-      : ModulePass(ID), InLTO(InLTO) {
+  PGOIndirectCallPromotionLegacyPass(bool InLTO = false, bool SamplePGO = false)
+      : ModulePass(ID), InLTO(InLTO), SamplePGO(SamplePGO) {
     initializePGOIndirectCallPromotionLegacyPassPass(
         *PassRegistry::getPassRegistry());
   }
@@ -119,6 +168,28 @@ private:
   // If this pass is called in LTO. We need to special handling the PGOFuncName
   // for the static variables due to LTO's internalization.
   bool InLTO;
+
+  // If this pass is called in SamplePGO mode, we need to add the prof metadata
+  // to the promoted direct call.
+  bool SamplePGO;
+};
+
+class PGOMemOPSizeOptLegacyPass : public FunctionPass {
+public:
+  static char ID;
+
+  PGOMemOPSizeOptLegacyPass() : FunctionPass(ID) {
+    initializePGOMemOPSizeOptLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  StringRef getPassName() const override { return "PGOMemOPSize"; }
+
+private:
+  bool runOnFunction(Function &F) override;
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<BlockFrequencyInfoWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+  }
 };
 } // end anonymous namespace
 
@@ -128,8 +199,22 @@ INITIALIZE_PASS(PGOIndirectCallPromotionLegacyPass, "pgo-icall-prom",
                 "direct calls.",
                 false, false)
 
-ModulePass *llvm::createPGOIndirectCallPromotionLegacyPass(bool InLTO) {
-  return new PGOIndirectCallPromotionLegacyPass(InLTO);
+ModulePass *llvm::createPGOIndirectCallPromotionLegacyPass(bool InLTO,
+                                                           bool SamplePGO) {
+  return new PGOIndirectCallPromotionLegacyPass(InLTO, SamplePGO);
+}
+
+char PGOMemOPSizeOptLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+                      "Optimize memory intrinsic using its size value profile",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
+                    "Optimize memory intrinsic using its size value profile",
+                    false, false)
+
+FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
+  return new PGOMemOPSizeOptLegacyPass();
 }
 
 namespace {
@@ -144,6 +229,8 @@ private:
   // defines.
   InstrProfSymtab *Symtab;
 
+  bool SamplePGO;
+
   // Test if we can legally promote this direct-call of Target.
   bool isPromotionLegal(Instruction *Inst, uint64_t Target, Function *&F,
                         const char **Reason = nullptr);
@@ -175,9 +262,9 @@ private:
   ICallPromotionFunc &operator=(const ICallPromotionFunc &other) = delete;
 
 public:
-  ICallPromotionFunc(Function &Func, Module *Modu, InstrProfSymtab *Symtab)
-      : F(Func), M(Modu), Symtab(Symtab) {
-  }
+  ICallPromotionFunc(Function &Func, Module *Modu, InstrProfSymtab *Symtab,
+                     bool SamplePGO)
+      : F(Func), M(Modu), Symtab(Symtab), SamplePGO(SamplePGO) {}
 
   bool processFunction();
 };
@@ -509,10 +596,14 @@ static void insertCallRetPHI(Instruction *Inst, Instruction *CallResult,
 //     Ret = phi(Ret1, Ret2);
 // It adds type casts for the args do not match the parameters and the return
 // value. Branch weights metadata also updated.
+// If \p AttachProfToDirectCall is true, a prof metadata is attached to the
+// new direct call to contain \p Count. This is used by SamplePGO inliner to
+// check callsite hotness.
 // Returns the promoted direct call instruction.
 Instruction *llvm::promoteIndirectCall(Instruction *Inst,
                                        Function *DirectCallee, uint64_t Count,
-                                       uint64_t TotalCount) {
+                                       uint64_t TotalCount,
+                                       bool AttachProfToDirectCall) {
   assert(DirectCallee != nullptr);
   BasicBlock *BB = Inst->getParent();
   // Just to suppress the non-debug build warning.
@@ -527,6 +618,14 @@ Instruction *llvm::promoteIndirectCall(Instruction *Inst,
   Instruction *NewInst =
       createDirectCallInst(Inst, DirectCallee, DirectCallBB, MergeBB);
 
+  if (AttachProfToDirectCall) {
+    SmallVector<uint32_t, 1> Weights;
+    Weights.push_back(Count);
+    MDBuilder MDB(NewInst->getContext());
+    dyn_cast<Instruction>(NewInst->stripPointerCasts())
+        ->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+  }
+
   // Move Inst from MergeBB to IndirectCallBB.
   Inst->removeFromParent();
   IndirectCallBB->getInstList().insert(IndirectCallBB->getFirstInsertionPt(),
@@ -569,7 +668,7 @@ uint32_t ICallPromotionFunc::tryToPromote(
 
   for (auto &C : Candidates) {
     uint64_t Count = C.Count;
-    promoteIndirectCall(Inst, C.TargetFunction, Count, TotalCount);
+    promoteIndirectCall(Inst, C.TargetFunction, Count, TotalCount, SamplePGO);
     assert(TotalCount >= Count);
     TotalCount -= Count;
     NumOfPGOICallPromotion++;
@@ -610,7 +709,7 @@ bool ICallPromotionFunc::processFunction() {
 }
 
 // A wrapper function that does the actual work.
-static bool promoteIndirectCalls(Module &M, bool InLTO) {
+static bool promoteIndirectCalls(Module &M, bool InLTO, bool SamplePGO) {
   if (DisableICP)
     return false;
   InstrProfSymtab Symtab;
@@ -621,7 +720,7 @@ static bool promoteIndirectCalls(Module &M, bool InLTO) {
       continue;
     if (F.hasFnAttribute(Attribute::OptimizeNone))
       continue;
-    ICallPromotionFunc ICallPromotion(F, &M, &Symtab);
+    ICallPromotionFunc ICallPromotion(F, &M, &Symtab, SamplePGO);
     bool FuncChanged = ICallPromotion.processFunction();
     if (ICPDUMPAFTER && FuncChanged) {
       DEBUG(dbgs() << "\n== IR Dump After =="; F.print(dbgs()));
@@ -638,12 +737,289 @@ static bool promoteIndirectCalls(Module &M, bool InLTO) {
 
 bool PGOIndirectCallPromotionLegacyPass::runOnModule(Module &M) {
   // Command-line option has the priority for InLTO.
-  return promoteIndirectCalls(M, InLTO | ICPLTOMode);
+  return promoteIndirectCalls(M, InLTO | ICPLTOMode,
+                              SamplePGO | ICPSamplePGOMode);
 }
 
-PreservedAnalyses PGOIndirectCallPromotion::run(Module &M, ModuleAnalysisManager &AM) {
-  if (!promoteIndirectCalls(M, InLTO | ICPLTOMode))
+PreservedAnalyses PGOIndirectCallPromotion::run(Module &M,
+                                                ModuleAnalysisManager &AM) {
+  if (!promoteIndirectCalls(M, InLTO | ICPLTOMode,
+                            SamplePGO | ICPSamplePGOMode))
     return PreservedAnalyses::all();
 
   return PreservedAnalyses::none();
 }
+
+namespace {
+// Collects variable-length mem intrinsics and versions them by profiled size.
+class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
+public:
+  MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI)
+      : Func(Func), BFI(BFI), Changed(false) {
+    ValueDataArray =
+        llvm::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
+    // Get the MemOPSize range information from option MemOPSizeRange.
+    getMemOPSizeRangeFromOption(MemOPSizeRange, PreciseRangeStart,
+                                PreciseRangeLast);
+  }
+  bool isChanged() const { return Changed; }
+  void perform() {
+    WorkList.clear();
+    visit(Func);
+
+    for (auto &MI : WorkList) {
+      ++NumOfPGOMemOPAnnotate;
+      if (perform(MI)) {
+        Changed = true;
+        ++NumOfPGOMemOPOpt;
+        DEBUG(dbgs() << "MemOP calls: " << MI->getCalledFunction()->getName()
+                     << " is Transformed.\n");
+      }
+    }
+  }
+
+  void visitMemIntrinsic(MemIntrinsic &MI) {
+    // Only variable-length calls are candidates; skip constant lengths.
+    if (isa<ConstantInt>(MI.getLength()))
+      return;
+    WorkList.push_back(&MI);
+  }
+
+private:
+  Function &Func;
+  BlockFrequencyInfo &BFI;
+  bool Changed;
+  std::vector<MemIntrinsic *> WorkList;
+  // Start of the precise range.
+  int64_t PreciseRangeStart;
+  // Last value of the precise range.
+  int64_t PreciseRangeLast;
+  // The space to read the profile annotation.
+  std::unique_ptr<InstrProfValueData[]> ValueDataArray;
+  bool perform(MemIntrinsic *MI);
+
+  // This kind shows which group the value falls in. For PreciseValue, we have
+  // the profile count for that value. LargeGroup groups the values that are in
+  // range [LargeValue, +inf). NonLargeGroup groups the rest of values.
+  enum MemOPSizeKind { PreciseValue, NonLargeGroup, LargeGroup };
+
+  MemOPSizeKind getMemOPSizeKind(int64_t Value) const {
+    if (Value == MemOPSizeLarge && MemOPSizeLarge != 0)
+      return LargeGroup;
+    if (Value == PreciseRangeLast + 1)
+      return NonLargeGroup;
+    return PreciseValue;
+  }
+};
+
+// Maps a memory intrinsic to the name used for it in optimization remarks.
+static const char *getMIName(const MemIntrinsic *MI) {
+  const auto ID = MI->getIntrinsicID();
+  if (ID == Intrinsic::memcpy)
+    return "memcpy";
+  if (ID == Intrinsic::memmove)
+    return "memmove";
+  if (ID == Intrinsic::memset)
+    return "memset";
+  // Every other intrinsic ID gets the generic label.
+  return "unknown";
+}
+
+// True when Count meets both the absolute and the percentage thresholds.
+static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
+  assert(Count <= TotalCount);
+  const bool MeetsAbsolute = Count >= MemOPCountThreshold;
+  const bool MeetsPercent =
+      Count >= TotalCount * MemOPPercentThreshold / 100;
+  return MeetsAbsolute && MeetsPercent;
+}
+
+// Returns Count * Num / Denom when MemOPScaleCount is set, otherwise Count.
+// The product saturates instead of wrapping around on overflow.
+static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
+                                      uint64_t Denom) {
+  if (!MemOPScaleCount)
+    return Count;
+  return SaturatingMultiply(Count, Num) / Denom;
+}
+
+// Versions MI on its hot profiled sizes via a switch; true if IR changed.
+bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
+  assert(MI);
+  if (MI->getIntrinsicID() == Intrinsic::memmove)
+    return false;
+
+  uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
+  uint64_t TotalCount;
+  if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions,
+                                ValueDataArray.get(), NumVals, TotalCount))
+    return false;
+
+  uint64_t ActualCount = TotalCount;
+  uint64_t SavedTotalCount = TotalCount;
+  if (MemOPScaleCount) {
+    auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent());
+    if (!BBEdgeCount)
+      return false;
+    ActualCount = *BBEdgeCount;
+  }
+
+  if (ActualCount < MemOPCountThreshold)
+    return false;
+
+  ArrayRef<InstrProfValueData> VDs(ValueDataArray.get(), NumVals);
+  TotalCount = ActualCount;
+  if (MemOPScaleCount)
+    DEBUG(dbgs() << "Scale counts: numerator = " << ActualCount
+                 << " denominator = " << SavedTotalCount << "\n");
+
+  // Keeping track of the count of the default case:
+  uint64_t RemainCount = TotalCount;
+  SmallVector<uint64_t, 16> SizeIds;
+  SmallVector<uint64_t, 16> CaseCounts;
+  uint64_t MaxCount = 0;
+  unsigned Version = 0;
+  // Default case is in the front -- save the slot here.
+  CaseCounts.push_back(0);
+  for (auto &VD : VDs) {
+    int64_t V = VD.Value;
+    uint64_t C = VD.Count;
+    if (MemOPScaleCount)
+      C = getScaledCount(C, ActualCount, SavedTotalCount);
+
+    // Only care precise value here.
+    if (getMemOPSizeKind(V) != PreciseValue)
+      continue;
+
+    // ValueCounts are sorted on the count. Break at the first un-profitable
+    // value.
+    if (!isProfitable(C, RemainCount))
+      break;
+
+    SizeIds.push_back(V);
+    CaseCounts.push_back(C);
+    if (C > MaxCount)
+      MaxCount = C;
+
+    assert(RemainCount >= C);
+    RemainCount -= C;
+
+    if (++Version > MemOPMaxVersion && MemOPMaxVersion != 0)
+      break;
+  }
+
+  if (Version == 0)
+    return false;
+
+  CaseCounts[0] = RemainCount;
+  if (RemainCount > MaxCount)
+    MaxCount = RemainCount;
+
+  uint64_t SumForOpt = TotalCount - RemainCount;
+  DEBUG(dbgs() << "Read one memory intrinsic profile: " << SumForOpt << " vs "
+               << TotalCount << "\n");
+  DEBUG(
+      for (auto &VD
+           : VDs) { dbgs() << "  (" << VD.Value << "," << VD.Count << ")\n"; });
+
+  DEBUG(dbgs() << "Optimize one memory intrinsic call to " << Version
+               << " Versions\n");
+
+  // mem_op(..., size)
+  // ==>
+  // switch (size) {
+  //   case s1:
+  //      mem_op(..., s1);
+  //      goto merge_bb;
+  //   case s2:
+  //      mem_op(..., s2);
+  //      goto merge_bb;
+  //   ...
+  //   default:
+  //      mem_op(..., size);
+  //      goto merge_bb;
+  // }
+  // merge_bb:
+
+  BasicBlock *BB = MI->getParent();
+  DEBUG(dbgs() << "\n\n== Basic Block Before ==\n" << *BB << "\n");
+
+  BasicBlock *DefaultBB = SplitBlock(BB, MI);
+  BasicBlock::iterator It(*MI);
+  ++It;
+  assert(It != DefaultBB->end());
+  BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It));
+  DefaultBB->setName("MemOP.Default");
+  MergeBB->setName("MemOP.Merge");
+
+  auto &Ctx = Func.getContext();
+  IRBuilder<> IRB(BB);
+  BB->getTerminator()->eraseFromParent();
+  Value *SizeVar = MI->getLength();
+  SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
+
+  // Clear the value profile data.
+  MI->setMetadata(LLVMContext::MD_prof, nullptr);
+
+  DEBUG(dbgs() << "\n\n== Basic Block After==\n");
+
+  for (uint64_t SizeId : SizeIds) {
+    // Case values must have the same integer type as the switched-on length.
+    ConstantInt *CaseSizeId =
+        ConstantInt::get(cast<IntegerType>(SizeVar->getType()), SizeId);
+    BasicBlock *CaseBB = BasicBlock::Create(
+        Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
+    Instruction *NewInst = MI->clone();
+    // Fix the argument.
+    cast<MemIntrinsic>(NewInst)->setLength(CaseSizeId);
+    CaseBB->getInstList().push_back(NewInst);
+    IRBuilder<> IRBCase(CaseBB);
+    IRBCase.CreateBr(MergeBB);
+    SI->addCase(CaseSizeId, CaseBB);
+    DEBUG(dbgs() << *CaseBB << "\n");
+  }
+  setProfMetadata(Func.getParent(), SI, CaseCounts, MaxCount);
+
+  DEBUG(dbgs() << *BB << "\n" << *DefaultBB << "\n" << *MergeBB << "\n");
+
+  emitOptimizationRemark(Func.getContext(), "memop-opt", Func,
+                         MI->getDebugLoc(),
+                         Twine("optimize ") + getMIName(MI) + " with count " +
+                             Twine(SumForOpt) + " out of " + Twine(TotalCount) +
+                             " for " + Twine(Version) + " versions");
+
+  return true;
+}
+} // namespace
+
+// Shared driver for both pass managers; returns true if F was modified.
+static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI) {
+  // Skip when the option disables the pass or F is optimized for size.
+  if (DisableMemOPOPT || F.hasFnAttribute(Attribute::OptimizeForSize))
+    return false;
+
+  MemOPSizeOpt Optimizer(F, BFI);
+  Optimizer.perform();
+  return Optimizer.isChanged();
+}
+
+// Legacy pass entry: fetch BFI from the wrapper pass and run the optimizer.
+bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
+  auto &BFIPass = getAnalysis<BlockFrequencyInfoWrapperPass>();
+  return PGOMemOPSizeOptImpl(F, BFIPass.getBFI());
+}
+
+namespace llvm {
+char &PGOMemOPSizeOptID = PGOMemOPSizeOptLegacyPass::ID;
+
+// New pass manager entry point; only GlobalsAA is preserved after a change.
+PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
+                                       FunctionAnalysisManager &FAM) {
+  auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+  if (!PGOMemOPSizeOptImpl(F, BFI))
+    return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  PA.preserve<GlobalsAA>();
+  return PA;
+}
+} // namespace llvm
diff --git a/lib/Transforms/Instrumentation/InstrProfiling.cpp b/lib/Transforms/Instrumentation/InstrProfiling.cpp
index be5c2c711f0e462cb284a78c99200147ca3ecd93..1f8bcb9a330ea1e3f32f4e9142f5e9bf17cc7096 100644
--- a/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -51,6 +51,21 @@ using namespace llvm;
 
 #define DEBUG_TYPE "instrprof"
 
+// The start and end values of precise value profile range for memory
+// intrinsic sizes
+cl::opt<std::string> MemOPSizeRange(
+    "memop-size-range",
+    cl::desc("Set the range of size in memory intrinsic calls to be profiled "
+             "precisely, in a format of <start_val>:<end_val>"),
+    cl::init(""));
+
+// The value that is considered to be a large value in memory intrinsic.
+cl::opt<unsigned> MemOPSizeLarge(
+    "memop-size-large",
+    cl::desc("Set large value threshold in memory intrinsic size profiling. "
+             "Value of 0 disables the large value profiling."),
+    cl::init(8192));
+
 namespace {
 
 cl::opt<bool> DoNameCompression("enable-name-compression",
@@ -165,6 +180,8 @@ bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
   NamesSize = 0;
   ProfileDataMap.clear();
   UsedVars.clear();
+  getMemOPSizeRangeFromOption(MemOPSizeRange, MemOPSizeRangeStart,
+                              MemOPSizeRangeLast);
 
   // We did not know how many value sites there would be inside
   // the instrumented function. This is counting the number of instrumented
@@ -217,17 +234,34 @@ bool InstrProfiling::run(Module &M, const TargetLibraryInfo &TLI) {
 }
 
 static Constant *getOrInsertValueProfilingCall(Module &M,
-                                               const TargetLibraryInfo &TLI) {
+                                               const TargetLibraryInfo &TLI,
+                                               bool IsRange = false) {
   LLVMContext &Ctx = M.getContext();
   auto *ReturnTy = Type::getVoidTy(M.getContext());
-  Type *ParamTypes[] = {
+
+  Constant *Res;
+  if (!IsRange) {
+    Type *ParamTypes[] = {
 #define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType
 #include "llvm/ProfileData/InstrProfData.inc"
-  };
-  auto *ValueProfilingCallTy =
-      FunctionType::get(ReturnTy, makeArrayRef(ParamTypes), false);
-  Constant *Res = M.getOrInsertFunction(getInstrProfValueProfFuncName(),
-                                        ValueProfilingCallTy);
+    };
+    auto *ValueProfilingCallTy =
+        FunctionType::get(ReturnTy, makeArrayRef(ParamTypes), false);
+    Res = M.getOrInsertFunction(getInstrProfValueProfFuncName(),
+                                ValueProfilingCallTy);
+  } else {
+    Type *RangeParamTypes[] = {
+#define VALUE_RANGE_PROF 1
+#define VALUE_PROF_FUNC_PARAM(ParamType, ParamName, ParamLLVMType) ParamLLVMType
+#include "llvm/ProfileData/InstrProfData.inc"
+#undef VALUE_RANGE_PROF
+    };
+    auto *ValueRangeProfilingCallTy =
+        FunctionType::get(ReturnTy, makeArrayRef(RangeParamTypes), false);
+    Res = M.getOrInsertFunction(getInstrProfValueRangeProfFuncName(),
+                                ValueRangeProfilingCallTy);
+  }
+
   if (Function *FunRes = dyn_cast<Function>(Res)) {
     if (auto AK = TLI.getExtAttrForI32Param(false))
       FunRes->addAttribute(3, AK);
@@ -261,11 +295,25 @@ void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
     Index += It->second.NumValueSites[Kind];
 
   IRBuilder<> Builder(Ind);
-  Value *Args[3] = {Ind->getTargetValue(),
-                    Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
-                    Builder.getInt32(Index)};
-  CallInst *Call = Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI),
-                                      Args);
+  bool IsRange = (Ind->getValueKind()->getZExtValue() ==
+                  llvm::InstrProfValueKind::IPVK_MemOPSize);
+  CallInst *Call = nullptr;
+  if (!IsRange) {
+    Value *Args[3] = {Ind->getTargetValue(),
+                      Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
+                      Builder.getInt32(Index)};
+    Call = Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI), Args);
+  } else {
+    Value *Args[6] = {
+        Ind->getTargetValue(),
+        Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
+        Builder.getInt32(Index),
+        Builder.getInt64(MemOPSizeRangeStart),
+        Builder.getInt64(MemOPSizeRangeLast),
+        Builder.getInt64(MemOPSizeLarge == 0 ? INT64_MIN : MemOPSizeLarge)};
+    Call =
+        Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI, true), Args);
+  }
   if (auto AK = TLI->getExtAttrForI32Param(false))
     Call->addAttribute(3, AK);
   Ind->replaceAllUsesWith(Call);
@@ -295,7 +343,9 @@ void InstrProfiling::lowerCoverageData(GlobalVariable *CoverageNamesVar) {
 
     Name->setLinkage(GlobalValue::PrivateLinkage);
     ReferencedNames.push_back(Name);
+    NC->dropAllReferences();
   }
+  CoverageNamesVar->eraseFromParent();
 }
 
 /// Get the name of a profiling variable for a particular function.
@@ -532,6 +582,9 @@ void InstrProfiling::emitNameData() {
   NamesSize = CompressedNameStr.size();
   NamesVar->setSection(getNameSection());
   UsedVars.push_back(NamesVar);
+
+  for (auto *NamePtr : ReferencedNames)
+    NamePtr->eraseFromParent();
 }
 
 void InstrProfiling::emitRegistration() {
diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp
index 2963d08752c4651bef36999629d5eb49adc0519a..7bb62d2c8455f18b9822be3162f37df219945837 100644
--- a/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -63,6 +63,7 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) {
   initializePGOInstrumentationGenLegacyPassPass(Registry);
   initializePGOInstrumentationUseLegacyPassPass(Registry);
   initializePGOIndirectCallPromotionLegacyPassPass(Registry);
+  initializePGOMemOPSizeOptLegacyPassPass(Registry);
   initializeInstrProfilingLegacyPassPass(Registry);
   initializeMemorySanitizerPass(Registry);
   initializeThreadSanitizerPass(Registry);
diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index fafb0fcbd01724008dfc3fb29093937d07460247..ddc594bed8a4233deee7dcdd44bff9b18c458093 100644
--- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -425,7 +425,7 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
   // which is not yet implemented.
   StringRef WarningFnName = Recover ? "__msan_warning"
                                     : "__msan_warning_noreturn";
-  WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy(), nullptr);
+  WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy());
 
   for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
        AccessSizeIndex++) {
@@ -433,31 +433,31 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
     std::string FunctionName = "__msan_maybe_warning_" + itostr(AccessSize);
     MaybeWarningFn[AccessSizeIndex] = M.getOrInsertFunction(
         FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8),
-        IRB.getInt32Ty(), nullptr);
+        IRB.getInt32Ty());
 
     FunctionName = "__msan_maybe_store_origin_" + itostr(AccessSize);
     MaybeStoreOriginFn[AccessSizeIndex] = M.getOrInsertFunction(
         FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8),
-        IRB.getInt8PtrTy(), IRB.getInt32Ty(), nullptr);
+        IRB.getInt8PtrTy(), IRB.getInt32Ty());
   }
 
   MsanSetAllocaOrigin4Fn = M.getOrInsertFunction(
     "__msan_set_alloca_origin4", IRB.getVoidTy(), IRB.getInt8PtrTy(), IntptrTy,
-    IRB.getInt8PtrTy(), IntptrTy, nullptr);
+    IRB.getInt8PtrTy(), IntptrTy);
   MsanPoisonStackFn =
       M.getOrInsertFunction("__msan_poison_stack", IRB.getVoidTy(),
-                            IRB.getInt8PtrTy(), IntptrTy, nullptr);
+                            IRB.getInt8PtrTy(), IntptrTy);
   MsanChainOriginFn = M.getOrInsertFunction(
-    "__msan_chain_origin", IRB.getInt32Ty(), IRB.getInt32Ty(), nullptr);
+    "__msan_chain_origin", IRB.getInt32Ty(), IRB.getInt32Ty());
   MemmoveFn = M.getOrInsertFunction(
     "__msan_memmove", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
-    IRB.getInt8PtrTy(), IntptrTy, nullptr);
+    IRB.getInt8PtrTy(), IntptrTy);
   MemcpyFn = M.getOrInsertFunction(
     "__msan_memcpy", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
-    IntptrTy, nullptr);
+    IntptrTy);
   MemsetFn = M.getOrInsertFunction(
     "__msan_memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
-    IntptrTy, nullptr);
+    IntptrTy);
 
   // Create globals.
   RetvalTLS = new GlobalVariable(
@@ -1037,15 +1037,19 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     OriginMap[V] = Origin;
   }
 
+  /// \brief Create a clean (all-zero) shadow constant for type \p OrigTy.
+  Constant *getCleanShadow(Type *OrigTy) {
+    if (Type *ShadowTy = getShadowTy(OrigTy))
+      return Constant::getNullValue(ShadowTy);
+    return nullptr;
+  }
+
   /// \brief Create a clean shadow value for a given value.
   ///
   /// Clean shadow (all zeroes) means all bits of the value are defined
   /// (initialized).
   Constant *getCleanShadow(Value *V) {
-    Type *ShadowTy = getShadowTy(V);
-    if (!ShadowTy)
-      return nullptr;
-    return Constant::getNullValue(ShadowTy);
+    return getCleanShadow(V->getType());
   }
 
   /// \brief Create a dirty shadow of a given shadow type.
@@ -1942,7 +1946,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     if (ClCheckAccessAddress)
       insertShadowCheck(Addr, &I);
 
-    // FIXME: use ClStoreCleanOrigin
     // FIXME: factor out common code from materializeStores
     if (MS.TrackOrigins)
       IRB.CreateStore(getOrigin(&I, 1), getOriginPtr(Addr, IRB, 1));
@@ -2325,11 +2328,49 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOriginForNaryOp(I);
   }
 
+  // Instruments stmxcsr: stores a clean 32-bit shadow for the written word.
+  void handleStmxcsr(IntrinsicInst &I) {
+    IRBuilder<> IRB(&I);
+    Value *Addr = I.getArgOperand(0);
+    Type *Ty = IRB.getInt32Ty();
+    Value *ShadowPtr = getShadowPtr(Addr, Ty, IRB);
+    Value *TypedShadowPtr =
+        IRB.CreatePointerCast(ShadowPtr, Ty->getPointerTo());
+    IRB.CreateStore(getCleanShadow(Ty), TypedShadowPtr);
+    if (ClCheckAccessAddress)
+      insertShadowCheck(Addr, &I);
+  }
+
+  // Instruments ldmxcsr: checks the shadow of the 32-bit word it reads.
+  void handleLdmxcsr(IntrinsicInst &I) {
+    if (!InsertChecks) return;
+    IRBuilder<> IRB(&I);
+    Value *Addr = I.getArgOperand(0);
+    Type *Ty = IRB.getInt32Ty();
+    const unsigned Alignment = 1;
+    if (ClCheckAccessAddress)
+      insertShadowCheck(Addr, &I);
+    Value *ShadowAddr = getShadowPtr(Addr, Ty, IRB);
+    Value *Shadow = IRB.CreateAlignedLoad(ShadowAddr, Alignment, "_ldmxcsr");
+    Value *Origin;
+    if (MS.TrackOrigins)
+      Origin = IRB.CreateLoad(getOriginPtr(Addr, IRB, Alignment));
+    else
+      Origin = getCleanOrigin();
+    insertShadowCheck(Shadow, Origin, &I);
+  }
+
   void visitIntrinsicInst(IntrinsicInst &I) {
     switch (I.getIntrinsicID()) {
     case llvm::Intrinsic::bswap:
       handleBswap(I);
       break;
+    case llvm::Intrinsic::x86_sse_stmxcsr:
+      handleStmxcsr(I);
+      break;
+    case llvm::Intrinsic::x86_sse_ldmxcsr:
+      handleLdmxcsr(I);
+      break;
     case llvm::Intrinsic::x86_avx512_vcvtsd2usi64:
     case llvm::Intrinsic::x86_avx512_vcvtsd2usi32:
     case llvm::Intrinsic::x86_avx512_vcvtss2usi64:
@@ -2566,10 +2607,10 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
         AttrBuilder B;
         B.addAttribute(Attribute::ReadOnly)
           .addAttribute(Attribute::ReadNone);
-        Func->removeAttributes(AttributeSet::FunctionIndex,
-                               AttributeSet::get(Func->getContext(),
-                                                 AttributeSet::FunctionIndex,
-                                                 B));
+        Func->removeAttributes(AttributeList::FunctionIndex,
+                               AttributeList::get(Func->getContext(),
+                                                  AttributeList::FunctionIndex,
+                                                  B));
       }
 
       maybeMarkSanitizerLibraryCallNoBuiltin(Call, TLI);
@@ -2690,7 +2731,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     } else {
       Value *Shadow = getShadow(RetVal);
       IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
-      // FIXME: make it conditional if ClStoreCleanOrigin==0
       if (MS.TrackOrigins)
         IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB));
     }
@@ -2717,15 +2757,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     setOrigin(&I, getCleanOrigin());
     IRBuilder<> IRB(I.getNextNode());
     const DataLayout &DL = F.getParent()->getDataLayout();
-    uint64_t Size = DL.getTypeAllocSize(I.getAllocatedType());
+    uint64_t TypeSize = DL.getTypeAllocSize(I.getAllocatedType());
+    Value *Len = ConstantInt::get(MS.IntptrTy, TypeSize);
+    if (I.isArrayAllocation())
+      Len = IRB.CreateMul(Len, I.getArraySize());
     if (PoisonStack && ClPoisonStackWithCall) {
       IRB.CreateCall(MS.MsanPoisonStackFn,
-                     {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()),
-                      ConstantInt::get(MS.IntptrTy, Size)});
+                     {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len});
     } else {
       Value *ShadowBase = getShadowPtr(&I, Type::getInt8PtrTy(*MS.C), IRB);
       Value *PoisonValue = IRB.getInt8(PoisonStack ? ClPoisonStackPattern : 0);
-      IRB.CreateMemSet(ShadowBase, PoisonValue, Size, I.getAlignment());
+      IRB.CreateMemSet(ShadowBase, PoisonValue, Len, I.getAlignment());
     }
 
     if (PoisonStack && MS.TrackOrigins) {
@@ -2742,8 +2784,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
                                                StackDescription.str());
 
       IRB.CreateCall(MS.MsanSetAllocaOrigin4Fn,
-                     {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()),
-                      ConstantInt::get(MS.IntptrTy, Size),
+                     {IRB.CreatePointerCast(&I, IRB.getInt8PtrTy()), Len,
                       IRB.CreatePointerCast(Descr, IRB.getInt8PtrTy()),
                       IRB.CreatePointerCast(&F, MS.IntptrTy)});
     }
@@ -3618,9 +3659,9 @@ bool MemorySanitizer::runOnFunction(Function &F) {
   AttrBuilder B;
   B.addAttribute(Attribute::ReadOnly)
     .addAttribute(Attribute::ReadNone);
-  F.removeAttributes(AttributeSet::FunctionIndex,
-                     AttributeSet::get(F.getContext(),
-                                       AttributeSet::FunctionIndex, B));
+  F.removeAttributes(
+      AttributeList::FunctionIndex,
+      AttributeList::get(F.getContext(), AttributeList::FunctionIndex, B));
 
   return Visitor.runOnFunction();
 }
diff --git a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index b31d27151fc334c3fd2f139133a62faa4ae2af8e..990bcec109de7e3768df763fe5ff2270090abab7 100644
--- a/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -91,6 +91,7 @@ using namespace llvm;
 
 STATISTIC(NumOfPGOInstrument, "Number of edges instrumented.");
 STATISTIC(NumOfPGOSelectInsts, "Number of select instruction instrumented.");
+STATISTIC(NumOfPGOMemIntrinsics, "Number of mem intrinsics instrumented.");
 STATISTIC(NumOfPGOEdge, "Number of edges.");
 STATISTIC(NumOfPGOBB, "Number of basic-blocks.");
 STATISTIC(NumOfPGOSplit, "Number of critical edge splits.");
@@ -120,6 +121,13 @@ static cl::opt<unsigned> MaxNumAnnotations(
     cl::desc("Max number of annotations for a single indirect "
              "call callsite"));
 
+// Command line option to set the maximum number of value annotations
+// to write to the metadata for a single memop intrinsic.
+static cl::opt<unsigned> MaxNumMemOPAnnotations(
+    "memop-max-annotations", cl::init(4), cl::Hidden, cl::ZeroOrMore,
+    cl::desc("Max number of precise value annotations for a single memop "
+             "intrinsic"));
+
 // Command line option to control appending FunctionHash to the name of a COMDAT
 // function. This is to avoid the hash mismatch caused by the preinliner.
 static cl::opt<bool> DoComdatRenaming(
@@ -129,39 +137,59 @@ static cl::opt<bool> DoComdatRenaming(
 
 // Command line option to enable/disable the warning about missing profile
 // information.
-static cl::opt<bool> PGOWarnMissing("pgo-warn-missing-function",
-                                     cl::init(false),
-                                     cl::Hidden);
+static cl::opt<bool>
+    PGOWarnMissing("pgo-warn-missing-function", cl::init(false), cl::Hidden,
+                   cl::desc("Use this option to turn on/off "
+                            "warnings about missing profile data for "
+                            "functions."));
 
 // Command line option to enable/disable the warning about a hash mismatch in
 // the profile data.
-static cl::opt<bool> NoPGOWarnMismatch("no-pgo-warn-mismatch", cl::init(false),
-                                       cl::Hidden);
+static cl::opt<bool>
+    NoPGOWarnMismatch("no-pgo-warn-mismatch", cl::init(false), cl::Hidden,
+                      cl::desc("Use this option to turn off/on "
+                               "warnings about profile cfg mismatch."));
 
 // Command line option to enable/disable the warning about a hash mismatch in
 // the profile data for Comdat functions, which often turns out to be false
 // positive due to the pre-instrumentation inline.
-static cl::opt<bool> NoPGOWarnMismatchComdat("no-pgo-warn-mismatch-comdat",
-                                             cl::init(true), cl::Hidden);
+static cl::opt<bool>
+    NoPGOWarnMismatchComdat("no-pgo-warn-mismatch-comdat", cl::init(true),
+                            cl::Hidden,
+                            cl::desc("The option is used to turn on/off "
+                                     "warnings about hash mismatch for comdat "
+                                     "functions."));
 
 // Command line option to enable/disable select instruction instrumentation.
-static cl::opt<bool> PGOInstrSelect("pgo-instr-select", cl::init(true),
-                                    cl::Hidden);
-
-// Command line option to specify the name of the function for CFG dump
-static cl::opt<std::string>
-    PGOViewFunction("pgo-view-function", cl::Hidden,
-                    cl::desc("The option to specify "
-                             "the name of the function "
-                             "whose CFG will be displayed."));
+static cl::opt<bool>
+    PGOInstrSelect("pgo-instr-select", cl::init(true), cl::Hidden,
+                   cl::desc("Use this option to turn on/off SELECT "
+                            "instruction instrumentation. "));
 
 // Command line option to turn on CFG dot dump of raw profile counts
-static cl::opt<bool> PGOViewRawCounts("pgo-view-raw-counts", cl::init(false),
-                                      cl::Hidden);
+static cl::opt<bool>
+    PGOViewRawCounts("pgo-view-raw-counts", cl::init(false), cl::Hidden,
+                     cl::desc("A boolean option to show CFG dag "
+                              "with raw profile counts from "
+                              "profile data. See also option "
+                              "-pgo-view-counts. To limit graph "
+                              "display to only one function, use "
+                              "filtering option -view-bfi-func-name."));
+
+// Command line option to enable/disable memop intrinsic call size profiling.
+static cl::opt<bool>
+    PGOInstrMemOP("pgo-instr-memop", cl::init(true), cl::Hidden,
+                  cl::desc("Use this option to turn on/off "
+                           "memory intrinsic size profiling."));
 
 // Command line option to turn on CFG dot dump after profile annotation.
+// Defined in Analysis/BlockFrequencyInfo.cpp:  -pgo-view-counts
 extern cl::opt<bool> PGOViewCounts;
 
+// Command line option to specify the name of the function for CFG dump
+// Defined in Analysis/BlockFrequencyInfo.cpp:  -view-bfi-func-name=
+extern cl::opt<std::string> ViewBlockFreqFuncName;
+
 namespace {
 
 /// The select instruction visitor plays three roles specified
@@ -186,6 +214,7 @@ struct SelectInstVisitor : public InstVisitor<SelectInstVisitor> {
   SelectInstVisitor(Function &Func) : F(Func) {}
 
   void countSelects(Function &Func) {
+    NSIs = 0;
     Mode = VM_counting;
     visit(Func);
   }
@@ -215,9 +244,54 @@ struct SelectInstVisitor : public InstVisitor<SelectInstVisitor> {
   void annotateOneSelectInst(SelectInst &SI);
   // Visit \p SI instruction and perform tasks according to visit mode.
   void visitSelectInst(SelectInst &SI);
+  // Return the number of select instructions. This needs to be called after
+  // countSelects().
   unsigned getNumOfSelectInsts() const { return NSIs; }
 };
 
+/// Instruction Visitor class to visit memory intrinsic calls.
+struct MemIntrinsicVisitor : public InstVisitor<MemIntrinsicVisitor> {
+  Function &F;
+  unsigned NMemIs = 0;          // Number of memIntrinsics instrumented.
+  VisitMode Mode = VM_counting; // Visiting mode.
+  unsigned CurCtrId = 0;        // Current counter index.
+  unsigned TotalNumCtrs = 0;    // Total number of counters
+  GlobalVariable *FuncNameVar = nullptr;
+  uint64_t FuncHash = 0;
+  PGOUseFunc *UseFunc = nullptr;
+  std::vector<Instruction *> Candidates;
+
+  MemIntrinsicVisitor(Function &Func) : F(Func) {}
+
+  void countMemIntrinsics(Function &Func) {
+    NMemIs = 0;
+    Mode = VM_counting;
+    visit(Func);
+  }
+
+  void instrumentMemIntrinsics(Function &Func, unsigned TotalNC,
+                               GlobalVariable *FNV, uint64_t FHash) {
+    Mode = VM_instrument;
+    TotalNumCtrs = TotalNC;
+    FuncHash = FHash;
+    FuncNameVar = FNV;
+    visit(Func);
+  }
+
+  std::vector<Instruction *> findMemIntrinsics(Function &Func) {
+    Candidates.clear();
+    Mode = VM_annotate;
+    visit(Func);
+    return Candidates;
+  }
+
+  // Insert a value-profiling call for the length of mem intrinsic \p MI.
+  void instrumentOneMemIntrinsic(MemIntrinsic &MI);
+  // Visit \p MI instruction and perform tasks according to visit mode.
+  void visitMemIntrinsic(MemIntrinsic &SI);
+  unsigned getNumOfMemIntrinsics() const { return NMemIs; }
+};
+
 class PGOInstrumentationGenLegacyPass : public ModulePass {
 public:
   static char ID;
@@ -335,8 +409,9 @@ private:
   std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers;
 
 public:
-  std::vector<Instruction *> IndirectCallSites;
+  std::vector<std::vector<Instruction *>> ValueSites;
   SelectInstVisitor SIVisitor;
+  MemIntrinsicVisitor MIVisitor;
   std::string FuncName;
   GlobalVariable *FuncNameVar;
   // CFG hash value for this function.
@@ -366,13 +441,16 @@ public:
       std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
       bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr,
       BlockFrequencyInfo *BFI = nullptr)
-      : F(Func), ComdatMembers(ComdatMembers), SIVisitor(Func), FunctionHash(0),
-        MST(F, BPI, BFI) {
+      : F(Func), ComdatMembers(ComdatMembers), ValueSites(IPVK_Last + 1),
+        SIVisitor(Func), MIVisitor(Func), FunctionHash(0), MST(F, BPI, BFI) {
 
     // This should be done before CFG hash computation.
     SIVisitor.countSelects(Func);
+    MIVisitor.countMemIntrinsics(Func);
     NumOfPGOSelectInsts += SIVisitor.getNumOfSelectInsts();
-    IndirectCallSites = findIndirectCallSites(Func);
+    NumOfPGOMemIntrinsics += MIVisitor.getNumOfMemIntrinsics();
+    ValueSites[IPVK_IndirectCallTarget] = findIndirectCallSites(Func);
+    ValueSites[IPVK_MemOPSize] = MIVisitor.findMemIntrinsics(Func);
 
     FuncName = getPGOFuncName(F);
     computeCFGHash();
@@ -424,7 +502,7 @@ void FuncPGOInstrumentation<Edge, BBInfo>::computeCFGHash() {
   }
   JC.update(Indexes);
   FunctionHash = (uint64_t)SIVisitor.getNumOfSelectInsts() << 56 |
-                 (uint64_t)IndirectCallSites.size() << 48 |
+                 (uint64_t)ValueSites[IPVK_IndirectCallTarget].size() << 48 |
                  (uint64_t)MST.AllEdges.size() << 32 | JC.getCRC();
 }
 
@@ -571,7 +649,7 @@ static void instrumentOneFunc(
     return;
 
   unsigned NumIndirectCallSites = 0;
-  for (auto &I : FuncInfo.IndirectCallSites) {
+  for (auto &I : FuncInfo.ValueSites[IPVK_IndirectCallTarget]) {
     CallSite CS(I);
     Value *Callee = CS.getCalledValue();
     DEBUG(dbgs() << "Instrument one indirect call: CallSite Index = "
@@ -584,10 +662,14 @@ static void instrumentOneFunc(
         {llvm::ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),
          Builder.getInt64(FuncInfo.FunctionHash),
          Builder.CreatePtrToInt(Callee, Builder.getInt64Ty()),
-         Builder.getInt32(llvm::InstrProfValueKind::IPVK_IndirectCallTarget),
+         Builder.getInt32(IPVK_IndirectCallTarget),
          Builder.getInt32(NumIndirectCallSites++)});
   }
   NumOfPGOICall += NumIndirectCallSites;
+
+  // Now instrument memop intrinsic calls.
+  FuncInfo.MIVisitor.instrumentMemIntrinsics(
+      F, NumCounters, FuncInfo.FuncNameVar, FuncInfo.FunctionHash);
 }
 
 // This class represents a CFG edge in profile use compilation.
@@ -672,8 +754,11 @@ public:
   // Set the branch weights based on the count values.
   void setBranchWeights();
 
-  // Annotate the indirect call sites.
-  void annotateIndirectCallSites();
+  // Annotate the value profile call sites for all value kinds.
+  void annotateValueSites();
+
+  // Annotate the value profile call sites for one value kind.
+  void annotateValueSites(uint32_t Kind);
 
   // The hotness of the function from the profile count.
   enum FuncFreqAttr { FFA_Normal, FFA_Cold, FFA_Hot };
@@ -782,7 +867,7 @@ void PGOUseFunc::setInstrumentedCounts(
     NewEdge1.InMST = true;
     getBBInfo(InstrBB).setBBInfoCount(CountValue);
   }
-  ProfileCountSize =  CountFromProfile.size();
+  ProfileCountSize = CountFromProfile.size();
   CountPosition = I;
 }
 
@@ -953,21 +1038,6 @@ void PGOUseFunc::populateCounters() {
   DEBUG(FuncInfo.dumpInfo("after reading profile."));
 }
 
-static void setProfMetadata(Module *M, Instruction *TI,
-                            ArrayRef<uint64_t> EdgeCounts, uint64_t MaxCount) {
-  MDBuilder MDB(M->getContext());
-  assert(MaxCount > 0 && "Bad max count");
-  uint64_t Scale = calculateCountScale(MaxCount);
-  SmallVector<unsigned, 4> Weights;
-  for (const auto &ECI : EdgeCounts)
-    Weights.push_back(scaleBranchCount(ECI, Scale));
-
-  DEBUG(dbgs() << "Weight is: ";
-        for (const auto &W : Weights) { dbgs() << W << " "; } 
-        dbgs() << "\n";);
-  TI->setMetadata(llvm::LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
-}
-
 // Assign the scaled count values to the BB with multiple out edges.
 void PGOUseFunc::setBranchWeights() {
   // Generate MD_prof metadata for every branch instruction.
@@ -1011,8 +1081,8 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) {
   Builder.CreateCall(
       Intrinsic::getDeclaration(M, Intrinsic::instrprof_increment_step),
       {llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy),
-       Builder.getInt64(FuncHash),
-       Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step});
+       Builder.getInt64(FuncHash), Builder.getInt32(TotalNumCtrs),
+       Builder.getInt32(*CurCtrIdx), Step});
   ++(*CurCtrIdx);
 }
 
@@ -1041,9 +1111,9 @@ void SelectInstVisitor::visitSelectInst(SelectInst &SI) {
   if (SI.getCondition()->getType()->isVectorTy())
     return;
 
-  NSIs++;
   switch (Mode) {
   case VM_counting:
+    NSIs++;
     return;
   case VM_instrument:
     instrumentOneSelectInst(SI);
@@ -1056,35 +1126,79 @@ void SelectInstVisitor::visitSelectInst(SelectInst &SI) {
   llvm_unreachable("Unknown visiting mode");
 }
 
-// Traverse all the indirect callsites and annotate the instructions.
-void PGOUseFunc::annotateIndirectCallSites() {
+void MemIntrinsicVisitor::instrumentOneMemIntrinsic(MemIntrinsic &MI) {
+  Module *M = F.getParent();
+  IRBuilder<> Builder(&MI);
+  Type *Int64Ty = Builder.getInt64Ty();
+  Type *I8PtrTy = Builder.getInt8PtrTy();
+  Value *Length = MI.getLength();
+  assert(!dyn_cast<ConstantInt>(Length));
+  Builder.CreateCall(
+      Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile),
+      {llvm::ConstantExpr::getBitCast(FuncNameVar, I8PtrTy),
+       Builder.getInt64(FuncHash), Builder.CreatePtrToInt(Length, Int64Ty),
+       Builder.getInt32(IPVK_MemOPSize), Builder.getInt32(CurCtrId)});
+  ++CurCtrId;
+}
+
+void MemIntrinsicVisitor::visitMemIntrinsic(MemIntrinsic &MI) {
+  if (!PGOInstrMemOP)
+    return;
+  Value *Length = MI.getLength();
+  // Do not instrument calls with a constant length.
+  if (dyn_cast<ConstantInt>(Length))
+    return;
+
+  switch (Mode) {
+  case VM_counting:
+    NMemIs++;
+    return;
+  case VM_instrument:
+    instrumentOneMemIntrinsic(MI);
+    return;
+  case VM_annotate:
+    Candidates.push_back(&MI);
+    return;
+  }
+  llvm_unreachable("Unknown visiting mode");
+}
+
+// Traverse all value sites and annotate the instructions for all value kinds.
+void PGOUseFunc::annotateValueSites() {
   if (DisableValueProfiling)
     return;
 
   // Create the PGOFuncName meta data.
   createPGOFuncNameMetadata(F, FuncInfo.FuncName);
 
-  unsigned IndirectCallSiteIndex = 0;
-  auto &IndirectCallSites = FuncInfo.IndirectCallSites;
-  unsigned NumValueSites =
-      ProfileRecord.getNumValueSites(IPVK_IndirectCallTarget);
-  if (NumValueSites != IndirectCallSites.size()) {
-    std::string Msg =
-        std::string("Inconsistent number of indirect call sites: ") +
-        F.getName().str();
+  for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind)
+    annotateValueSites(Kind);
+}
+
+// Annotate the instructions for a specific value kind.
+void PGOUseFunc::annotateValueSites(uint32_t Kind) {
+  unsigned ValueSiteIndex = 0;
+  auto &ValueSites = FuncInfo.ValueSites[Kind];
+  unsigned NumValueSites = ProfileRecord.getNumValueSites(Kind);
+  if (NumValueSites != ValueSites.size()) {
     auto &Ctx = M->getContext();
-    Ctx.diagnose(
-        DiagnosticInfoPGOProfile(M->getName().data(), Msg, DS_Warning));
+    Ctx.diagnose(DiagnosticInfoPGOProfile(
+        M->getName().data(),
+        Twine("Inconsistent number of value sites for kind = ") + Twine(Kind) +
+            " in " + F.getName().str(),
+        DS_Warning));
     return;
   }
 
-  for (auto &I : IndirectCallSites) {
-    DEBUG(dbgs() << "Read one indirect call instrumentation: Index="
-                 << IndirectCallSiteIndex << " out of " << NumValueSites
-                 << "\n");
-    annotateValueSite(*M, *I, ProfileRecord, IPVK_IndirectCallTarget,
-                      IndirectCallSiteIndex, MaxNumAnnotations);
-    IndirectCallSiteIndex++;
+  for (auto &I : ValueSites) {
+    DEBUG(dbgs() << "Read one value site profile (kind = " << Kind
+                 << "): Index = " << ValueSiteIndex << " out of "
+                 << NumValueSites << "\n");
+    annotateValueSite(*M, *I, ProfileRecord,
+                      static_cast<InstrProfValueKind>(Kind), ValueSiteIndex,
+                      Kind == IPVK_MemOPSize ? MaxNumMemOPAnnotations
+                                             : MaxNumAnnotations);
+    ValueSiteIndex++;
   }
 }
 } // end anonymous namespace
@@ -1217,15 +1331,14 @@ static bool annotateAllFunctions(
       continue;
     Func.populateCounters();
     Func.setBranchWeights();
-    Func.annotateIndirectCallSites();
+    Func.annotateValueSites();
     PGOUseFunc::FuncFreqAttr FreqAttr = Func.getFuncFreqAttr();
     if (FreqAttr == PGOUseFunc::FFA_Cold)
       ColdFunctions.push_back(&F);
     else if (FreqAttr == PGOUseFunc::FFA_Hot)
       HotFunctions.push_back(&F);
-#ifndef NDEBUG
-    if (PGOViewCounts &&
-        (PGOViewFunction.empty() || F.getName().equals(PGOViewFunction))) {
+    if (PGOViewCounts && (ViewBlockFreqFuncName.empty() ||
+                          F.getName().equals(ViewBlockFreqFuncName))) {
       LoopInfo LI{DominatorTree(F)};
       std::unique_ptr<BranchProbabilityInfo> NewBPI =
           llvm::make_unique<BranchProbabilityInfo>(F, LI);
@@ -1234,10 +1347,9 @@ static bool annotateAllFunctions(
 
       NewBFI->view();
     }
-#endif
-    if (PGOViewRawCounts &&
-        (PGOViewFunction.empty() || F.getName().equals(PGOViewFunction))) {
-      if (PGOViewFunction.empty())
+    if (PGOViewRawCounts && (ViewBlockFreqFuncName.empty() ||
+                             F.getName().equals(ViewBlockFreqFuncName))) {
+      if (ViewBlockFreqFuncName.empty())
         WriteGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
       else
         ViewGraph(&Func, Twine("PGORawCounts_") + Func.getFunc().getName());
@@ -1299,6 +1411,21 @@ bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) {
 }
 
 namespace llvm {
+void setProfMetadata(Module *M, Instruction *TI, ArrayRef<uint64_t> EdgeCounts,
+                     uint64_t MaxCount) {
+  MDBuilder MDB(M->getContext());
+  assert(MaxCount > 0 && "Bad max count");
+  uint64_t Scale = calculateCountScale(MaxCount);
+  SmallVector<unsigned, 4> Weights;
+  for (const auto &ECI : EdgeCounts)
+    Weights.push_back(scaleBranchCount(ECI, Scale));
+
+  DEBUG(dbgs() << "Weight is: ";
+        for (const auto &W : Weights) { dbgs() << W << " "; }
+        dbgs() << "\n";);
+  TI->setMetadata(llvm::LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
+}
+
 template <> struct GraphTraits<PGOUseFunc *> {
   typedef const BasicBlock *NodeRef;
   typedef succ_const_iterator ChildIteratorType;
@@ -1319,6 +1446,16 @@ template <> struct GraphTraits<PGOUseFunc *> {
   }
 };
 
+static std::string getSimpleNodeName(const BasicBlock *Node) {
+  if (!Node->getName().empty())
+    return Node->getName();
+
+  std::string SimpleNodeName;
+  raw_string_ostream OS(SimpleNodeName);
+  Node->printAsOperand(OS, false);
+  return OS.str();
+}
+
 template <> struct DOTGraphTraits<PGOUseFunc *> : DefaultDOTGraphTraits {
   explicit DOTGraphTraits(bool isSimple = false)
       : DefaultDOTGraphTraits(isSimple) {}
@@ -1330,13 +1467,32 @@ template <> struct DOTGraphTraits<PGOUseFunc *> : DefaultDOTGraphTraits {
   std::string getNodeLabel(const BasicBlock *Node, const PGOUseFunc *Graph) {
     std::string Result;
     raw_string_ostream OS(Result);
-    OS << Node->getName().str() << " : ";
+
+    OS << getSimpleNodeName(Node) << ":\\l";
     UseBBInfo *BI = Graph->findBBInfo(Node);
+    OS << "Count : ";
     if (BI && BI->CountValid)
-      OS << BI->CountValue;
+      OS << BI->CountValue << "\\l";
     else
-      OS << "Unknown";
+      OS << "Unknown\\l";
+
+    if (!PGOInstrSelect)
+      return Result;
+
+    for (auto BI = Node->begin(); BI != Node->end(); ++BI) {
+      auto *I = &*BI;
+      if (!isa<SelectInst>(I))
+        continue;
+      // Display scaled counts for SELECT instruction:
+      OS << "SELECT : { T = ";
+      uint64_t TC, FC;
+      bool HasProf = I->extractProfMetadata(TC, FC);
+      if (!HasProf)
+        OS << "Unknown, F = Unknown }\\l";
+      else
+        OS << TC << ", F = " << FC << " }\\l";
+    }
     return Result;
   }
 };
-}
+} // namespace llvm
diff --git a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index e34dd611b259bc4009fefafa74a646eb82037aa3..fa0c7cc5a4c53710a15cbea956b06de98e184ba9 100644
--- a/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -78,7 +78,6 @@ static const char *const SanCovTraceSwitchName = "__sanitizer_cov_trace_switch";
 static const char *const SanCovModuleCtorName = "sancov.module_ctor";
 static const uint64_t SanCtorAndDtorPriority = 2;
 
-static const char *const SanCovTracePCGuardSection = "__sancov_guards";
 static const char *const SanCovTracePCGuardName =
     "__sanitizer_cov_trace_pc_guard";
 static const char *const SanCovTracePCGuardInitName =
@@ -216,6 +215,9 @@ private:
            SanCovWithCheckFunction->getNumUses() + SanCovTraceBB->getNumUses() +
            SanCovTraceEnter->getNumUses();
   }
+  StringRef getSanCovTracePCGuardSection() const;
+  StringRef getSanCovTracePCGuardSectionStart() const;
+  StringRef getSanCovTracePCGuardSectionEnd() const;
   Function *SanCovFunction;
   Function *SanCovWithCheckFunction;
   Function *SanCovIndirCallFunction, *SanCovTracePCIndir;
@@ -227,6 +229,7 @@ private:
   InlineAsm *EmptyAsm;
   Type *IntptrTy, *IntptrPtrTy, *Int64Ty, *Int64PtrTy, *Int32Ty, *Int32PtrTy;
   Module *CurModule;
+  Triple TargetTriple;
   LLVMContext *C;
   const DataLayout *DL;
 
@@ -246,6 +249,7 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
   C = &(M.getContext());
   DL = &M.getDataLayout();
   CurModule = &M;
+  TargetTriple = Triple(M.getTargetTriple());
   HasSancovGuardsSection = false;
   IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits());
   IntptrPtrTy = PointerType::getUnqual(IntptrTy);
@@ -258,39 +262,39 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
   Int32Ty = IRB.getInt32Ty();
 
   SanCovFunction = checkSanitizerInterfaceFunction(
-      M.getOrInsertFunction(SanCovName, VoidTy, Int32PtrTy, nullptr));
+      M.getOrInsertFunction(SanCovName, VoidTy, Int32PtrTy));
   SanCovWithCheckFunction = checkSanitizerInterfaceFunction(
-      M.getOrInsertFunction(SanCovWithCheckName, VoidTy, Int32PtrTy, nullptr));
+      M.getOrInsertFunction(SanCovWithCheckName, VoidTy, Int32PtrTy));
   SanCovTracePCIndir = checkSanitizerInterfaceFunction(
-      M.getOrInsertFunction(SanCovTracePCIndirName, VoidTy, IntptrTy, nullptr));
+      M.getOrInsertFunction(SanCovTracePCIndirName, VoidTy, IntptrTy));
   SanCovIndirCallFunction =
       checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-          SanCovIndirCallName, VoidTy, IntptrTy, IntptrTy, nullptr));
+          SanCovIndirCallName, VoidTy, IntptrTy, IntptrTy));
   SanCovTraceCmpFunction[0] =
       checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-          SanCovTraceCmp1, VoidTy, IRB.getInt8Ty(), IRB.getInt8Ty(), nullptr));
+          SanCovTraceCmp1, VoidTy, IRB.getInt8Ty(), IRB.getInt8Ty()));
   SanCovTraceCmpFunction[1] = checkSanitizerInterfaceFunction(
       M.getOrInsertFunction(SanCovTraceCmp2, VoidTy, IRB.getInt16Ty(),
-                            IRB.getInt16Ty(), nullptr));
+                            IRB.getInt16Ty()));
   SanCovTraceCmpFunction[2] = checkSanitizerInterfaceFunction(
       M.getOrInsertFunction(SanCovTraceCmp4, VoidTy, IRB.getInt32Ty(),
-                            IRB.getInt32Ty(), nullptr));
+                            IRB.getInt32Ty()));
   SanCovTraceCmpFunction[3] =
       checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-          SanCovTraceCmp8, VoidTy, Int64Ty, Int64Ty, nullptr));
+          SanCovTraceCmp8, VoidTy, Int64Ty, Int64Ty));
 
   SanCovTraceDivFunction[0] =
       checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-          SanCovTraceDiv4, VoidTy, IRB.getInt32Ty(), nullptr));
+          SanCovTraceDiv4, VoidTy, IRB.getInt32Ty()));
   SanCovTraceDivFunction[1] =
       checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-          SanCovTraceDiv8, VoidTy, Int64Ty, nullptr));
+          SanCovTraceDiv8, VoidTy, Int64Ty));
   SanCovTraceGepFunction =
       checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-          SanCovTraceGep, VoidTy, IntptrTy, nullptr));
+          SanCovTraceGep, VoidTy, IntptrTy));
   SanCovTraceSwitchFunction =
       checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-          SanCovTraceSwitchName, VoidTy, Int64Ty, Int64PtrTy, nullptr));
+          SanCovTraceSwitchName, VoidTy, Int64Ty, Int64PtrTy));
 
   // We insert an empty inline asm after cov callbacks to avoid callback merge.
   EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
@@ -298,13 +302,13 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
                             /*hasSideEffects=*/true);
 
   SanCovTracePC = checkSanitizerInterfaceFunction(
-      M.getOrInsertFunction(SanCovTracePCName, VoidTy, nullptr));
+      M.getOrInsertFunction(SanCovTracePCName, VoidTy));
   SanCovTracePCGuard = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-      SanCovTracePCGuardName, VoidTy, Int32PtrTy, nullptr));
+      SanCovTracePCGuardName, VoidTy, Int32PtrTy));
   SanCovTraceEnter = checkSanitizerInterfaceFunction(
-      M.getOrInsertFunction(SanCovTraceEnterName, VoidTy, Int32PtrTy, nullptr));
+      M.getOrInsertFunction(SanCovTraceEnterName, VoidTy, Int32PtrTy));
   SanCovTraceBB = checkSanitizerInterfaceFunction(
-      M.getOrInsertFunction(SanCovTraceBBName, VoidTy, Int32PtrTy, nullptr));
+      M.getOrInsertFunction(SanCovTraceBBName, VoidTy, Int32PtrTy));
 
   // At this point we create a dummy array of guards because we don't
   // know how many elements we will need.
@@ -363,22 +367,28 @@ bool SanitizerCoverageModule::runOnModule(Module &M) {
   if (Options.TracePCGuard) {
     if (HasSancovGuardsSection) {
       Function *CtorFunc;
-      std::string SectionName(SanCovTracePCGuardSection);
-      GlobalVariable *Bounds[2];
-      const char *Prefix[2] = {"__start_", "__stop_"};
-      for (int i = 0; i < 2; i++) {
-        Bounds[i] = new GlobalVariable(M, Int32PtrTy, false,
-                                       GlobalVariable::ExternalLinkage, nullptr,
-                                       Prefix[i] + SectionName);
-        Bounds[i]->setVisibility(GlobalValue::HiddenVisibility);
-      }
+      GlobalVariable *SecStart = new GlobalVariable(
+          M, Int32PtrTy, false, GlobalVariable::ExternalLinkage, nullptr,
+          getSanCovTracePCGuardSectionStart());
+      SecStart->setVisibility(GlobalValue::HiddenVisibility);
+      GlobalVariable *SecEnd = new GlobalVariable(
+          M, Int32PtrTy, false, GlobalVariable::ExternalLinkage, nullptr,
+          getSanCovTracePCGuardSectionEnd());
+      SecEnd->setVisibility(GlobalValue::HiddenVisibility);
+
       std::tie(CtorFunc, std::ignore) = createSanitizerCtorAndInitFunctions(
           M, SanCovModuleCtorName, SanCovTracePCGuardInitName,
           {Int32PtrTy, Int32PtrTy},
-          {IRB.CreatePointerCast(Bounds[0], Int32PtrTy),
-            IRB.CreatePointerCast(Bounds[1], Int32PtrTy)});
-
-      appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority);
+          {IRB.CreatePointerCast(SecStart, Int32PtrTy),
+            IRB.CreatePointerCast(SecEnd, Int32PtrTy)});
+
+      if (TargetTriple.supportsCOMDAT()) {
+        // Use comdat to dedup CtorFunc.
+        CtorFunc->setComdat(M.getOrInsertComdat(SanCovModuleCtorName));
+        appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority, CtorFunc);
+      } else {
+        appendToGlobalCtors(M, CtorFunc, SanCtorAndDtorPriority);
+      }
     }
   } else if (!Options.TracePC) {
     Function *CtorFunc;
@@ -435,6 +445,11 @@ static bool shouldInstrumentBlock(const Function& F, const BasicBlock *BB, const
   if (isa<UnreachableInst>(BB->getTerminator()))
     return false;
 
+  // Don't insert coverage into blocks without a valid insertion point
+  // (catchswitch blocks).
+  if (BB->getFirstInsertionPt() == BB->end())
+    return false;
+
   if (!ClPruneBlocks || &F.getEntryBlock() == BB)
     return true;
 
@@ -517,7 +532,7 @@ void SanitizerCoverageModule::CreateFunctionGuardArray(size_t NumGuards,
       Constant::getNullValue(ArrayOfInt32Ty), "__sancov_gen_");
   if (auto Comdat = F.getComdat())
     FunctionGuardArray->setComdat(Comdat);
-  FunctionGuardArray->setSection(SanCovTracePCGuardSection);
+  FunctionGuardArray->setSection(getSanCovTracePCGuardSection());
 }
 
 bool SanitizerCoverageModule::InjectCoverage(Function &F,
@@ -755,6 +770,27 @@ void SanitizerCoverageModule::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
   }
 }
 
+StringRef SanitizerCoverageModule::getSanCovTracePCGuardSection() const {
+  if (TargetTriple.getObjectFormat() == Triple::COFF)
+    return ".SCOV$M";
+  if (TargetTriple.isOSBinFormatMachO())
+    return "__DATA,__sancov_guards";
+  return "__sancov_guards";
+}
+
+StringRef SanitizerCoverageModule::getSanCovTracePCGuardSectionStart() const {
+  if (TargetTriple.isOSBinFormatMachO())
+    return "\1section$start$__DATA$__sancov_guards";
+  return "__start___sancov_guards";
+}
+
+StringRef SanitizerCoverageModule::getSanCovTracePCGuardSectionEnd() const {
+  if (TargetTriple.isOSBinFormatMachO())
+    return "\1section$end$__DATA$__sancov_guards";
+  return "__stop___sancov_guards";
+}
+
+
 char SanitizerCoverageModule::ID = 0;
 INITIALIZE_PASS_BEGIN(SanitizerCoverageModule, "sancov",
                       "SanitizerCoverage: TODO."
diff --git a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index d9659694da462fb4a5de43eb4347f8483327869d..2ec6d09594dee4207d966858b7c1a3a06d2c5e94 100644
--- a/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -155,17 +155,18 @@ FunctionPass *llvm::createThreadSanitizerPass() {
 
 void ThreadSanitizer::initializeCallbacks(Module &M) {
   IRBuilder<> IRB(M.getContext());
-  AttributeSet Attr;
-  Attr = Attr.addAttribute(M.getContext(), AttributeSet::FunctionIndex, Attribute::NoUnwind);
+  AttributeList Attr;
+  Attr = Attr.addAttribute(M.getContext(), AttributeList::FunctionIndex,
+                           Attribute::NoUnwind);
   // Initialize the callbacks.
   TsanFuncEntry = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-      "__tsan_func_entry", Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+      "__tsan_func_entry", Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
   TsanFuncExit = checkSanitizerInterfaceFunction(
-      M.getOrInsertFunction("__tsan_func_exit", Attr, IRB.getVoidTy(), nullptr));
+      M.getOrInsertFunction("__tsan_func_exit", Attr, IRB.getVoidTy()));
   TsanIgnoreBegin = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-      "__tsan_ignore_thread_begin", Attr, IRB.getVoidTy(), nullptr));
+      "__tsan_ignore_thread_begin", Attr, IRB.getVoidTy()));
   TsanIgnoreEnd = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-      "__tsan_ignore_thread_end", Attr, IRB.getVoidTy(), nullptr));
+      "__tsan_ignore_thread_end", Attr, IRB.getVoidTy()));
   OrdTy = IRB.getInt32Ty();
   for (size_t i = 0; i < kNumberOfAccessSizes; ++i) {
     const unsigned ByteSize = 1U << i;
@@ -174,31 +175,31 @@ void ThreadSanitizer::initializeCallbacks(Module &M) {
     std::string BitSizeStr = utostr(BitSize);
     SmallString<32> ReadName("__tsan_read" + ByteSizeStr);
     TsanRead[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-        ReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+        ReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
 
     SmallString<32> WriteName("__tsan_write" + ByteSizeStr);
     TsanWrite[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-        WriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+        WriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
 
     SmallString<64> UnalignedReadName("__tsan_unaligned_read" + ByteSizeStr);
     TsanUnalignedRead[i] =
         checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-            UnalignedReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+            UnalignedReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
 
     SmallString<64> UnalignedWriteName("__tsan_unaligned_write" + ByteSizeStr);
     TsanUnalignedWrite[i] =
         checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-            UnalignedWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+            UnalignedWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
 
     Type *Ty = Type::getIntNTy(M.getContext(), BitSize);
     Type *PtrTy = Ty->getPointerTo();
     SmallString<32> AtomicLoadName("__tsan_atomic" + BitSizeStr + "_load");
     TsanAtomicLoad[i] = checkSanitizerInterfaceFunction(
-        M.getOrInsertFunction(AtomicLoadName, Attr, Ty, PtrTy, OrdTy, nullptr));
+        M.getOrInsertFunction(AtomicLoadName, Attr, Ty, PtrTy, OrdTy));
 
     SmallString<32> AtomicStoreName("__tsan_atomic" + BitSizeStr + "_store");
     TsanAtomicStore[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-        AtomicStoreName, Attr, IRB.getVoidTy(), PtrTy, Ty, OrdTy, nullptr));
+        AtomicStoreName, Attr, IRB.getVoidTy(), PtrTy, Ty, OrdTy));
 
     for (int op = AtomicRMWInst::FIRST_BINOP;
         op <= AtomicRMWInst::LAST_BINOP; ++op) {
@@ -222,33 +223,33 @@ void ThreadSanitizer::initializeCallbacks(Module &M) {
         continue;
       SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart);
       TsanAtomicRMW[op][i] = checkSanitizerInterfaceFunction(
-          M.getOrInsertFunction(RMWName, Attr, Ty, PtrTy, Ty, OrdTy, nullptr));
+          M.getOrInsertFunction(RMWName, Attr, Ty, PtrTy, Ty, OrdTy));
     }
 
     SmallString<32> AtomicCASName("__tsan_atomic" + BitSizeStr +
                                   "_compare_exchange_val");
     TsanAtomicCAS[i] = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-        AtomicCASName, Attr, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy, nullptr));
+        AtomicCASName, Attr, Ty, PtrTy, Ty, Ty, OrdTy, OrdTy));
   }
   TsanVptrUpdate = checkSanitizerInterfaceFunction(
       M.getOrInsertFunction("__tsan_vptr_update", Attr, IRB.getVoidTy(),
-                            IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), nullptr));
+                            IRB.getInt8PtrTy(), IRB.getInt8PtrTy()));
   TsanVptrLoad = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-      "__tsan_vptr_read", Attr, IRB.getVoidTy(), IRB.getInt8PtrTy(), nullptr));
+      "__tsan_vptr_read", Attr, IRB.getVoidTy(), IRB.getInt8PtrTy()));
   TsanAtomicThreadFence = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-      "__tsan_atomic_thread_fence", Attr, IRB.getVoidTy(), OrdTy, nullptr));
+      "__tsan_atomic_thread_fence", Attr, IRB.getVoidTy(), OrdTy));
   TsanAtomicSignalFence = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-      "__tsan_atomic_signal_fence", Attr, IRB.getVoidTy(), OrdTy, nullptr));
+      "__tsan_atomic_signal_fence", Attr, IRB.getVoidTy(), OrdTy));
 
   MemmoveFn = checkSanitizerInterfaceFunction(
       M.getOrInsertFunction("memmove", Attr, IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
-                            IRB.getInt8PtrTy(), IntptrTy, nullptr));
+                            IRB.getInt8PtrTy(), IntptrTy));
   MemcpyFn = checkSanitizerInterfaceFunction(
       M.getOrInsertFunction("memcpy", Attr, IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
-                            IRB.getInt8PtrTy(), IntptrTy, nullptr));
+                            IRB.getInt8PtrTy(), IntptrTy));
   MemsetFn = checkSanitizerInterfaceFunction(
       M.getOrInsertFunction("memset", Attr, IRB.getInt8PtrTy(), IRB.getInt8PtrTy(),
-                            IRB.getInt32Ty(), IntptrTy, nullptr));
+                            IRB.getInt32Ty(), IntptrTy));
 }
 
 bool ThreadSanitizer::doInitialization(Module &M) {
@@ -488,6 +489,13 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I,
   Value *Addr = IsWrite
       ? cast<StoreInst>(I)->getPointerOperand()
       : cast<LoadInst>(I)->getPointerOperand();
+
+  // swifterror memory addresses are mem2reg promoted by instruction selection.
+  // As such they cannot have regular uses like an instrumentation function and
+  // it makes no sense to track them as memory.
+  if (Addr->isSwiftError())
+    return false;
+
   int Idx = getMemoryAccessFuncIndex(Addr, DL);
   if (Idx < 0)
     return false;
diff --git a/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
index c74827210364e2dcf72270730780efd05da4fc4f..c541fa4c8bee7255f760cb2a77523ce7b98cfda9 100644
--- a/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
+++ b/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
@@ -127,9 +127,8 @@ private:
 
     LLVMContext &C = TheModule->getContext();
     Type *Params[] = { PointerType::getUnqual(Type::getInt8Ty(C)) };
-    AttributeSet Attr =
-      AttributeSet().addAttribute(C, AttributeSet::FunctionIndex,
-                                  Attribute::NoUnwind);
+    AttributeList Attr = AttributeList().addAttribute(
+        C, AttributeList::FunctionIndex, Attribute::NoUnwind);
     FunctionType *Fty = FunctionType::get(Type::getVoidTy(C), Params,
                                           /*isVarArg=*/false);
     return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr);
@@ -144,10 +143,10 @@ private:
     Type *I8X = PointerType::getUnqual(Type::getInt8Ty(C));
     Type *Params[] = { I8X };
     FunctionType *Fty = FunctionType::get(I8X, Params, /*isVarArg=*/false);
-    AttributeSet Attr = AttributeSet();
+    AttributeList Attr = AttributeList();
 
     if (NoUnwind)
-      Attr = Attr.addAttribute(C, AttributeSet::FunctionIndex,
+      Attr = Attr.addAttribute(C, AttributeList::FunctionIndex,
                                Attribute::NoUnwind);
 
     return Decl = TheModule->getOrInsertFunction(Name, Fty, Attr);
@@ -162,9 +161,8 @@ private:
     Type *I8XX = PointerType::getUnqual(I8X);
     Type *Params[] = { I8XX, I8X };
 
-    AttributeSet Attr =
-      AttributeSet().addAttribute(C, AttributeSet::FunctionIndex,
-                                  Attribute::NoUnwind);
+    AttributeList Attr = AttributeList().addAttribute(
+        C, AttributeList::FunctionIndex, Attribute::NoUnwind);
     Attr = Attr.addAttribute(C, 1, Attribute::NoCapture);
 
     FunctionType *Fty = FunctionType::get(Type::getVoidTy(C), Params,
diff --git a/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index 23c1f5990ba528ae3eb0b46041138d671085f1cd..a86eaaec76412ede8872866eb1351d479c650083 100644
--- a/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -394,6 +394,7 @@ void ObjCARCContract::tryToContractReleaseIntoStoreStrong(Instruction *Release,
 
   DEBUG(llvm::dbgs() << "        New Store Strong: " << *StoreStrong << "\n");
 
+  if (&*Iter == Retain) ++Iter;
   if (&*Iter == Store) ++Iter;
   Store->eraseFromParent();
   Release->eraseFromParent();
diff --git a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index 136d54a6cb7594cb70483df72d98e71dedf86e63..3c73376c990680db13eceb113ca3797624f7c12d 100644
--- a/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -85,41 +85,6 @@ static const Value *FindSingleUseIdentifiedObject(const Value *Arg) {
   return nullptr;
 }
 
-/// This is a wrapper around getUnderlyingObjCPtr along the lines of
-/// GetUnderlyingObjects except that it returns early when it sees the first
-/// alloca.
-static inline bool AreAnyUnderlyingObjectsAnAlloca(const Value *V,
-                                                   const DataLayout &DL) {
-  SmallPtrSet<const Value *, 4> Visited;
-  SmallVector<const Value *, 4> Worklist;
-  Worklist.push_back(V);
-  do {
-    const Value *P = Worklist.pop_back_val();
-    P = GetUnderlyingObjCPtr(P, DL);
-
-    if (isa<AllocaInst>(P))
-      return true;
-
-    if (!Visited.insert(P).second)
-      continue;
-
-    if (const SelectInst *SI = dyn_cast<const SelectInst>(P)) {
-      Worklist.push_back(SI->getTrueValue());
-      Worklist.push_back(SI->getFalseValue());
-      continue;
-    }
-
-    if (const PHINode *PN = dyn_cast<const PHINode>(P)) {
-      for (Value *IncValue : PN->incoming_values())
-        Worklist.push_back(IncValue);
-      continue;
-    }
-  } while (!Worklist.empty());
-
-  return false;
-}
-
-
 /// @}
 ///
 /// \defgroup ARCOpt ARC Optimization.
@@ -481,9 +446,6 @@ namespace {
     /// MDKind identifiers.
     ARCMDKindCache MDKindCache;
 
-    // This is used to track if a pointer is stored into an alloca.
-    DenseSet<const Value *> MultiOwnersSet;
-
     /// A flag indicating whether this optimization pass should run.
     bool Run;
 
@@ -524,8 +486,7 @@ namespace {
     PairUpRetainsAndReleases(DenseMap<const BasicBlock *, BBState> &BBStates,
                              BlotMapVector<Value *, RRInfo> &Retains,
                              DenseMap<Value *, RRInfo> &Releases, Module *M,
-                             SmallVectorImpl<Instruction *> &NewRetains,
-                             SmallVectorImpl<Instruction *> &NewReleases,
+                             Instruction * Retain,
                              SmallVectorImpl<Instruction *> &DeadInsts,
                              RRInfo &RetainsToMove, RRInfo &ReleasesToMove,
                              Value *Arg, bool KnownSafe,
@@ -1155,29 +1116,6 @@ bool ObjCARCOpt::VisitInstructionBottomUp(
   case ARCInstKind::None:
     // These are irrelevant.
     return NestingDetected;
-  case ARCInstKind::User:
-    // If we have a store into an alloca of a pointer we are tracking, the
-    // pointer has multiple owners implying that we must be more conservative.
-    //
-    // This comes up in the context of a pointer being ``KnownSafe''. In the
-    // presence of a block being initialized, the frontend will emit the
-    // objc_retain on the original pointer and the release on the pointer loaded
-    // from the alloca. The optimizer will through the provenance analysis
-    // realize that the two are related, but since we only require KnownSafe in
-    // one direction, will match the inner retain on the original pointer with
-    // the guard release on the original pointer. This is fixed by ensuring that
-    // in the presence of allocas we only unconditionally remove pointers if
-    // both our retain and our release are KnownSafe.
-    if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
-      const DataLayout &DL = BB->getModule()->getDataLayout();
-      if (AreAnyUnderlyingObjectsAnAlloca(SI->getPointerOperand(), DL)) {
-        auto I = MyStates.findPtrBottomUpState(
-            GetRCIdentityRoot(SI->getValueOperand()));
-        if (I != MyStates.bottom_up_ptr_end())
-          MultiOwnersSet.insert(I->first);
-      }
-    }
-    break;
   default:
     break;
   }
@@ -1540,8 +1478,7 @@ bool ObjCARCOpt::PairUpRetainsAndReleases(
     DenseMap<const BasicBlock *, BBState> &BBStates,
     BlotMapVector<Value *, RRInfo> &Retains,
     DenseMap<Value *, RRInfo> &Releases, Module *M,
-    SmallVectorImpl<Instruction *> &NewRetains,
-    SmallVectorImpl<Instruction *> &NewReleases,
+    Instruction *Retain,
     SmallVectorImpl<Instruction *> &DeadInsts, RRInfo &RetainsToMove,
     RRInfo &ReleasesToMove, Value *Arg, bool KnownSafe,
     bool &AnyPairsCompletelyEliminated) {
@@ -1549,7 +1486,6 @@ bool ObjCARCOpt::PairUpRetainsAndReleases(
   // is already incremented, we can similarly ignore possible decrements unless
   // we are dealing with a retainable object with multiple provenance sources.
   bool KnownSafeTD = true, KnownSafeBU = true;
-  bool MultipleOwners = false;
   bool CFGHazardAfflicted = false;
 
   // Connect the dots between the top-down-collected RetainsToMove and
@@ -1561,14 +1497,13 @@ bool ObjCARCOpt::PairUpRetainsAndReleases(
   unsigned OldCount = 0;
   unsigned NewCount = 0;
   bool FirstRelease = true;
-  for (;;) {
+  for (SmallVector<Instruction *, 4> NewRetains{Retain};;) {
+    SmallVector<Instruction *, 4> NewReleases;
     for (Instruction *NewRetain : NewRetains) {
       auto It = Retains.find(NewRetain);
       assert(It != Retains.end());
       const RRInfo &NewRetainRRI = It->second;
       KnownSafeTD &= NewRetainRRI.KnownSafe;
-      MultipleOwners =
-        MultipleOwners || MultiOwnersSet.count(GetArgRCIdentityRoot(NewRetain));
       for (Instruction *NewRetainRelease : NewRetainRRI.Calls) {
         auto Jt = Releases.find(NewRetainRelease);
         if (Jt == Releases.end())
@@ -1691,7 +1626,6 @@ bool ObjCARCOpt::PairUpRetainsAndReleases(
         }
       }
     }
-    NewReleases.clear();
     if (NewRetains.empty()) break;
   }
 
@@ -1745,10 +1679,6 @@ bool ObjCARCOpt::PerformCodePlacement(
   DEBUG(dbgs() << "\n== ObjCARCOpt::PerformCodePlacement ==\n");
 
   bool AnyPairsCompletelyEliminated = false;
-  RRInfo RetainsToMove;
-  RRInfo ReleasesToMove;
-  SmallVector<Instruction *, 4> NewRetains;
-  SmallVector<Instruction *, 4> NewReleases;
   SmallVector<Instruction *, 8> DeadInsts;
 
   // Visit each retain.
@@ -1780,9 +1710,10 @@ bool ObjCARCOpt::PerformCodePlacement(
 
     // Connect the dots between the top-down-collected RetainsToMove and
     // bottom-up-collected ReleasesToMove to form sets of related calls.
-    NewRetains.push_back(Retain);
+    RRInfo RetainsToMove, ReleasesToMove;
+
     bool PerformMoveCalls = PairUpRetainsAndReleases(
-        BBStates, Retains, Releases, M, NewRetains, NewReleases, DeadInsts,
+        BBStates, Retains, Releases, M, Retain, DeadInsts,
         RetainsToMove, ReleasesToMove, Arg, KnownSafe,
         AnyPairsCompletelyEliminated);
 
@@ -1792,12 +1723,6 @@ bool ObjCARCOpt::PerformCodePlacement(
       MoveCalls(Arg, RetainsToMove, ReleasesToMove,
                 Retains, Releases, DeadInsts, M);
     }
-
-    // Clean up state for next retain.
-    NewReleases.clear();
-    NewRetains.clear();
-    RetainsToMove.clear();
-    ReleasesToMove.clear();
   }
 
   // Now that we're done moving everything, we can delete the newly dead
@@ -1987,9 +1912,6 @@ bool ObjCARCOpt::OptimizeSequences(Function &F) {
                                                            Releases,
                                                            F.getParent());
 
-  // Cleanup.
-  MultiOwnersSet.clear();
-
   return AnyPairsCompletelyEliminated && NestingDetected;
 }
 
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
index 571e70c746ec6a8072bcbae71127eca4d0deb4ce..5b467dc9fe1251d4528c50aeb822c04f49131589 100644
--- a/lib/Transforms/Scalar/ADCE.cpp
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -41,8 +41,8 @@ using namespace llvm;
 STATISTIC(NumRemoved, "Number of instructions removed");
 STATISTIC(NumBranchesRemoved, "Number of branch instructions removed");
 
-// This is a tempoary option until we change the interface
-// to this pass based on optimization level.
+// This is a temporary option until we change the interface to this pass based
+// on optimization level.
 static cl::opt<bool> RemoveControlFlowFlag("adce-remove-control-flow",
                                            cl::init(true), cl::Hidden);
 
@@ -110,7 +110,7 @@ class AggressiveDeadCodeElimination {
 
   /// The set of blocks which we have determined whose control
   /// dependence sources must be live and which have not had
-  /// those dependences analyized.
+  /// those dependences analyzed.
   SmallPtrSet<BasicBlock *, 16> NewLiveBlocks;
 
   /// Set up auxiliary data structures for Instructions and BasicBlocks and
@@ -145,7 +145,7 @@ class AggressiveDeadCodeElimination {
   /// was removed.
   bool removeDeadInstructions();
 
-  /// Identify connected sections of the control flow grap which have
+  /// Identify connected sections of the control flow graph which have
   /// dead terminators and rewrite the control flow graph to remove them.
   void updateDeadRegions();
 
@@ -253,25 +253,6 @@ void AggressiveDeadCodeElimination::initialize() {
     }
   }
 
-  // Mark blocks live if there is no path from the block to the
-  // return of the function or a successor for which this is true.
-  // This protects IDFCalculator which cannot handle such blocks.
-  for (auto &BBInfoPair : BlockInfo) {
-    auto &BBInfo = BBInfoPair.second;
-    if (BBInfo.terminatorIsLive())
-      continue;
-    auto *BB = BBInfo.BB;
-    if (!PDT.getNode(BB)) {
-      markLive(BBInfo.Terminator);
-      continue;
-    }
-    for (auto *Succ : successors(BB))
-      if (!PDT.getNode(Succ)) {
-        markLive(BBInfo.Terminator);
-        break;
-      }
-  }
-
   // Mark blocks live if there is no path from the block to the
   // return of the function or a successor for which this is true.
   // This protects IDFCalculator which cannot handle such blocks.
@@ -579,7 +560,7 @@ void AggressiveDeadCodeElimination::updateDeadRegions() {
         PreferredSucc = Info;
     }
     assert((PreferredSucc && PreferredSucc->PostOrder > 0) &&
-           "Failed to find safe successor for dead branc");
+           "Failed to find safe successor for dead branch");
     bool First = true;
     for (auto *Succ : successors(BB)) {
       if (!First || Succ != PreferredSucc->BB)
@@ -595,12 +576,12 @@ void AggressiveDeadCodeElimination::updateDeadRegions() {
 // reverse top-sort order
 void AggressiveDeadCodeElimination::computeReversePostOrder() {
 
-  // This provides a post-order numbering of the reverse conrtol flow graph
+  // This provides a post-order numbering of the reverse control flow graph
   // Note that it is incomplete in the presence of infinite loops but we don't
   // need numbers blocks which don't reach the end of the functions since
   // all branches in those blocks are forced live.
 
-  // For each block without successors, extend the DFS from the bloack
+  // For each block without successors, extend the DFS from the block
   // backward through the graph
   SmallPtrSet<BasicBlock*, 16> Visited;
   unsigned PostOrder = 0;
diff --git a/lib/Transforms/Scalar/ConstantHoisting.cpp b/lib/Transforms/Scalar/ConstantHoisting.cpp
index ebe35aac0986197727d462db6027472ab387e5f1..ee6333e88716b8920cdb94402b0c8d8d6958c03b 100644
--- a/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -136,8 +136,16 @@ Instruction *ConstantHoistingPass::findMatInsertPt(Instruction *Inst,
   if (Idx != ~0U && isa<PHINode>(Inst))
     return cast<PHINode>(Inst)->getIncomingBlock(Idx)->getTerminator();
 
-  BasicBlock *IDom = DT->getNode(Inst->getParent())->getIDom()->getBlock();
-  return IDom->getTerminator();
+  // This must be an EH pad. Iterate over immediate dominators until we find a
+  // non-EH pad. We need to skip over catchswitch blocks, which are both EH pads
+  // and terminators.
+  auto IDom = DT->getNode(Inst->getParent())->getIDom();
+  while (IDom->getBlock()->isEHPad()) {
+    assert(Entry != IDom->getBlock() && "eh pad in entry block");
+    IDom = IDom->getIDom();
+  }
+
+  return IDom->getBlock()->getTerminator();
 }
 
 /// \brief Find an insertion point that dominates all uses.
diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 60786a3373feb4683c055983c68d04cee40978de..ed5ad002f601348b6ff74809b7825c5e8d240085 100644
--- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -41,6 +41,8 @@ STATISTIC(NumSDivs,     "Number of sdiv converted to udiv");
 STATISTIC(NumAShrs,     "Number of ashr converted to lshr");
 STATISTIC(NumSRems,     "Number of srem converted to urem");
 
+static cl::opt<bool> DontProcessAdds("cvp-dont-process-adds", cl::init(true));
+
 namespace {
   class CorrelatedValuePropagation : public FunctionPass {
   public:
@@ -233,9 +235,8 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
   // Analyse each switch case in turn.  This is done in reverse order so that
   // removing a case doesn't cause trouble for the iteration.
   bool Changed = false;
-  for (SwitchInst::CaseIt CI = SI->case_end(), CE = SI->case_begin(); CI-- != CE;
-       ) {
-    ConstantInt *Case = CI.getCaseValue();
+  for (auto CI = SI->case_begin(), CE = SI->case_end(); CI != CE;) {
+    ConstantInt *Case = CI->getCaseValue();
 
     // Check to see if the switch condition is equal to/not equal to the case
     // value on every incoming edge, equal/not equal being the same each time.
@@ -268,8 +269,9 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
 
     if (State == LazyValueInfo::False) {
       // This case never fires - remove it.
-      CI.getCaseSuccessor()->removePredecessor(BB);
-      SI->removeCase(CI); // Does not invalidate the iterator.
+      CI->getCaseSuccessor()->removePredecessor(BB);
+      CI = SI->removeCase(CI);
+      CE = SI->case_end();
 
       // The condition can be modified by removePredecessor's PHI simplification
       // logic.
@@ -277,7 +279,9 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
 
       ++NumDeadCases;
       Changed = true;
-    } else if (State == LazyValueInfo::True) {
+      continue;
+    }
+    if (State == LazyValueInfo::True) {
       // This case always fires.  Arrange for the switch to be turned into an
       // unconditional branch by replacing the switch condition with the case
       // value.
@@ -286,6 +290,9 @@ static bool processSwitch(SwitchInst *SI, LazyValueInfo *LVI) {
       Changed = true;
       break;
     }
+
+    // Increment the case iterator since we didn't delete it.
+    ++CI;
   }
 
   if (Changed)
@@ -320,7 +327,7 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
   if (Indices.empty())
     return false;
 
-  AttributeSet AS = CS.getAttributes();
+  AttributeList AS = CS.getAttributes();
   LLVMContext &Ctx = CS.getInstruction()->getContext();
   AS = AS.addAttribute(Ctx, Indices, Attribute::get(Ctx, Attribute::NonNull));
   CS.setAttributes(AS);
@@ -405,6 +412,9 @@ static bool processAShr(BinaryOperator *SDI, LazyValueInfo *LVI) {
 static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) {
   typedef OverflowingBinaryOperator OBO;
 
+  if (DontProcessAdds)
+    return false;
+
   if (AddOp->getType()->isVectorTy() || hasLocalDefs(AddOp))
     return false;
 
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
index c3c0295bee569e3c3c65e4265e1fb99a80752d21..1ec38e56aa4cbb1db6ff1d420126c84ca22c239b 100644
--- a/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -287,19 +287,14 @@ static uint64_t getPointerSize(const Value *V, const DataLayout &DL,
 }
 
 namespace {
-enum OverwriteResult {
-  OverwriteBegin,
-  OverwriteComplete,
-  OverwriteEnd,
-  OverwriteUnknown
-};
+enum OverwriteResult { OW_Begin, OW_Complete, OW_End, OW_Unknown };
 }
 
-/// Return 'OverwriteComplete' if a store to the 'Later' location completely
-/// overwrites a store to the 'Earlier' location, 'OverwriteEnd' if the end of
-/// the 'Earlier' location is completely overwritten by 'Later',
-/// 'OverwriteBegin' if the beginning of the 'Earlier' location is overwritten
-/// by 'Later', or 'OverwriteUnknown' if nothing can be determined.
+/// Return 'OW_Complete' if a store to the 'Later' location completely
+/// overwrites a store to the 'Earlier' location, 'OW_End' if the end of the
+/// 'Earlier' location is completely overwritten by 'Later', 'OW_Begin' if the
+/// beginning of the 'Earlier' location is overwritten by 'Later', or
+/// 'OW_Unknown' if nothing can be determined.
 static OverwriteResult isOverwrite(const MemoryLocation &Later,
                                    const MemoryLocation &Earlier,
                                    const DataLayout &DL,
@@ -310,7 +305,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
   // If we don't know the sizes of either access, then we can't do a comparison.
   if (Later.Size == MemoryLocation::UnknownSize ||
       Earlier.Size == MemoryLocation::UnknownSize)
-    return OverwriteUnknown;
+    return OW_Unknown;
 
   const Value *P1 = Earlier.Ptr->stripPointerCasts();
   const Value *P2 = Later.Ptr->stripPointerCasts();
@@ -320,7 +315,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
   if (P1 == P2) {
     // Make sure that the Later size is >= the Earlier size.
     if (Later.Size >= Earlier.Size)
-      return OverwriteComplete;
+      return OW_Complete;
   }
 
   // Check to see if the later store is to the entire object (either a global,
@@ -332,13 +327,13 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
   // If we can't resolve the same pointers to the same object, then we can't
   // analyze them at all.
   if (UO1 != UO2)
-    return OverwriteUnknown;
+    return OW_Unknown;
 
   // If the "Later" store is to a recognizable object, get its size.
   uint64_t ObjectSize = getPointerSize(UO2, DL, TLI);
   if (ObjectSize != MemoryLocation::UnknownSize)
     if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size)
-      return OverwriteComplete;
+      return OW_Complete;
 
   // Okay, we have stores to two completely different pointers.  Try to
   // decompose the pointer into a "base + constant_offset" form.  If the base
@@ -350,7 +345,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
 
   // If the base pointers still differ, we have two completely different stores.
   if (BP1 != BP2)
-    return OverwriteUnknown;
+    return OW_Unknown;
 
   // The later store completely overlaps the earlier store if:
   //
@@ -370,7 +365,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
   if (EarlierOff >= LaterOff &&
       Later.Size >= Earlier.Size &&
       uint64_t(EarlierOff - LaterOff) + Earlier.Size <= Later.Size)
-    return OverwriteComplete;
+    return OW_Complete;
 
   // We may now overlap, although the overlap is not complete. There might also
   // be other incomplete overlaps, and together, they might cover the complete
@@ -428,7 +423,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
                       ") Composite Later [" <<
                       ILI->second << ", " << ILI->first << ")\n");
       ++NumCompletePartials;
-      return OverwriteComplete;
+      return OW_Complete;
     }
   }
 
@@ -443,7 +438,7 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
   if (!EnablePartialOverwriteTracking &&
       (LaterOff > EarlierOff && LaterOff < int64_t(EarlierOff + Earlier.Size) &&
        int64_t(LaterOff + Later.Size) >= int64_t(EarlierOff + Earlier.Size)))
-    return OverwriteEnd;
+    return OW_End;
 
   // Finally, we also need to check if the later store overwrites the beginning
   // of the earlier store.
@@ -458,11 +453,11 @@ static OverwriteResult isOverwrite(const MemoryLocation &Later,
       (LaterOff <= EarlierOff && int64_t(LaterOff + Later.Size) > EarlierOff)) {
     assert(int64_t(LaterOff + Later.Size) <
                int64_t(EarlierOff + Earlier.Size) &&
-           "Expect to be handled as OverwriteComplete");
-    return OverwriteBegin;
+           "Expect to be handled as OW_Complete");
+    return OW_Begin;
   }
   // Otherwise, they don't completely overlap.
-  return OverwriteUnknown;
+  return OW_Unknown;
 }
 
 /// If 'Inst' might be a self read (i.e. a noop copy of a
@@ -551,7 +546,7 @@ static bool memoryIsNotModifiedBetween(Instruction *FirstI,
       Instruction *I = &*BI;
       if (I->mayWriteToMemory() && I != SecondI) {
         auto Res = AA->getModRefInfo(I, MemLoc);
-        if (Res != MRI_NoModRef)
+        if (Res & MRI_Mod)
           return false;
       }
     }
@@ -909,7 +904,7 @@ static bool tryToShortenBegin(Instruction *EarlierWrite,
 
   if (LaterStart <= EarlierStart && LaterStart + LaterSize > EarlierStart) {
     assert(LaterStart + LaterSize < EarlierStart + EarlierSize &&
-           "Should have been handled as OverwriteComplete");
+           "Should have been handled as OW_Complete");
     if (tryToShorten(EarlierWrite, EarlierStart, EarlierSize, LaterStart,
                      LaterSize, false)) {
       IntervalMap.erase(OII);
@@ -1105,7 +1100,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
         OverwriteResult OR =
             isOverwrite(Loc, DepLoc, DL, *TLI, DepWriteOffset, InstWriteOffset,
                         DepWrite, IOL);
-        if (OR == OverwriteComplete) {
+        if (OR == OW_Complete) {
           DEBUG(dbgs() << "DSE: Remove Dead Store:\n  DEAD: "
                 << *DepWrite << "\n  KILLER: " << *Inst << '\n');
 
@@ -1117,15 +1112,15 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
           // We erased DepWrite; start over.
           InstDep = MD->getDependency(Inst);
           continue;
-        } else if ((OR == OverwriteEnd && isShortenableAtTheEnd(DepWrite)) ||
-                   ((OR == OverwriteBegin &&
+        } else if ((OR == OW_End && isShortenableAtTheEnd(DepWrite)) ||
+                   ((OR == OW_Begin &&
                      isShortenableAtTheBeginning(DepWrite)))) {
           assert(!EnablePartialOverwriteTracking && "Do not expect to perform "
                                                     "when partial-overwrite "
                                                     "tracking is enabled");
           int64_t EarlierSize = DepLoc.Size;
           int64_t LaterSize = Loc.Size;
-          bool IsOverwriteEnd = (OR == OverwriteEnd);
+          bool IsOverwriteEnd = (OR == OW_End);
           MadeChange |= tryToShorten(DepWrite, DepWriteOffset, EarlierSize,
                                     InstWriteOffset, LaterSize, IsOverwriteEnd);
         }
diff --git a/lib/Transforms/Scalar/EarlyCSE.cpp b/lib/Transforms/Scalar/EarlyCSE.cpp
index 5fc0dab90473925580010aa03397e5313f38d024..04479b6e49ac8513a468c14ba8ededc9712fcbd9 100644
--- a/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -19,6 +19,8 @@
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/IR/DataLayout.h"
@@ -32,7 +34,6 @@
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
 #include <deque>
 using namespace llvm;
 using namespace llvm::PatternMatch;
@@ -253,6 +254,7 @@ public:
   DominatorTree &DT;
   AssumptionCache &AC;
   MemorySSA *MSSA;
+  std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
   typedef RecyclingAllocator<
       BumpPtrAllocator, ScopedHashTableVal<SimpleValue, Value *>> AllocatorTy;
   typedef ScopedHashTable<SimpleValue, Value *, DenseMapInfo<SimpleValue>,
@@ -315,7 +317,9 @@ public:
   /// \brief Set up the EarlyCSE runner for a particular function.
   EarlyCSE(const TargetLibraryInfo &TLI, const TargetTransformInfo &TTI,
            DominatorTree &DT, AssumptionCache &AC, MemorySSA *MSSA)
-      : TLI(TLI), TTI(TTI), DT(DT), AC(AC), MSSA(MSSA), CurrentGeneration(0) {}
+      : TLI(TLI), TTI(TTI), DT(DT), AC(AC), MSSA(MSSA),
+        MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)), CurrentGeneration(0) {
+  }
 
   bool run();
 
@@ -388,7 +392,7 @@ private:
     ParseMemoryInst(Instruction *Inst, const TargetTransformInfo &TTI)
       : IsTargetMemInst(false), Inst(Inst) {
       if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst))
-        if (TTI.getTgtMemIntrinsic(II, Info) && Info.NumMemRefs == 1)
+        if (TTI.getTgtMemIntrinsic(II, Info))
           IsTargetMemInst = true;
     }
     bool isLoad() const {
@@ -400,17 +404,14 @@ private:
       return isa<StoreInst>(Inst);
     }
     bool isAtomic() const {
-      if (IsTargetMemInst) {
-        assert(Info.IsSimple && "need to refine IsSimple in TTI");
-        return false;
-      }
+      if (IsTargetMemInst)
+        return Info.Ordering != AtomicOrdering::NotAtomic;
       return Inst->isAtomic();
     }
     bool isUnordered() const {
-      if (IsTargetMemInst) {
-        assert(Info.IsSimple && "need to refine IsSimple in TTI");
-        return true;
-      }
+      if (IsTargetMemInst)
+        return Info.isUnordered();
+
       if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
         return LI->isUnordered();
       } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
@@ -421,10 +422,9 @@ private:
     }
 
     bool isVolatile() const {
-      if (IsTargetMemInst) {
-        assert(Info.IsSimple && "need to refine IsSimple in TTI");
-        return false;
-      }
+      if (IsTargetMemInst)
+        return Info.IsVolatile;
+
       if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
         return LI->isVolatile();
       } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
@@ -517,7 +517,7 @@ private:
           if (MemoryPhi *MP = dyn_cast<MemoryPhi>(U))
             PhisToCheck.push_back(MP);
 
-        MSSA->removeMemoryAccess(WI);
+        MSSAUpdater->removeMemoryAccess(WI);
 
         for (MemoryPhi *MP : PhisToCheck) {
           MemoryAccess *FirstIn = MP->getIncomingValue(0);
@@ -587,27 +587,28 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
   // which reaches this block where the condition might hold a different
   // value.  Since we're adding this to the scoped hash table (like any other
   // def), it will have been popped if we encounter a future merge block.
-  if (BasicBlock *Pred = BB->getSinglePredecessor())
-    if (auto *BI = dyn_cast<BranchInst>(Pred->getTerminator()))
-      if (BI->isConditional())
-        if (auto *CondInst = dyn_cast<Instruction>(BI->getCondition()))
-          if (SimpleValue::canHandle(CondInst)) {
-            assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
-            auto *ConditionalConstant = (BI->getSuccessor(0) == BB) ?
-              ConstantInt::getTrue(BB->getContext()) :
-              ConstantInt::getFalse(BB->getContext());
-            AvailableValues.insert(CondInst, ConditionalConstant);
-            DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
-                  << CondInst->getName() << "' as " << *ConditionalConstant
-                  << " in " << BB->getName() << "\n");
-            // Replace all dominated uses with the known value.
-            if (unsigned Count =
-                    replaceDominatedUsesWith(CondInst, ConditionalConstant, DT,
-                                             BasicBlockEdge(Pred, BB))) {
-              Changed = true;
-              NumCSECVP = NumCSECVP + Count;
-            }
-          }
+  if (BasicBlock *Pred = BB->getSinglePredecessor()) {
+    auto *BI = dyn_cast<BranchInst>(Pred->getTerminator());
+    if (BI && BI->isConditional()) {
+      auto *CondInst = dyn_cast<Instruction>(BI->getCondition());
+      if (CondInst && SimpleValue::canHandle(CondInst)) {
+        assert(BI->getSuccessor(0) == BB || BI->getSuccessor(1) == BB);
+        auto *TorF = (BI->getSuccessor(0) == BB)
+                         ? ConstantInt::getTrue(BB->getContext())
+                         : ConstantInt::getFalse(BB->getContext());
+        AvailableValues.insert(CondInst, TorF);
+        DEBUG(dbgs() << "EarlyCSE CVP: Add conditional value for '"
+                     << CondInst->getName() << "' as " << *TorF << " in "
+                     << BB->getName() << "\n");
+        // Replace all dominated uses with the known value.
+        if (unsigned Count = replaceDominatedUsesWith(
+                CondInst, TorF, DT, BasicBlockEdge(Pred, BB))) {
+          Changed = true;
+          NumCSECVP = NumCSECVP + Count;
+        }
+      }
+    }
+  }
 
   /// LastStore - Keep track of the last non-volatile store that we saw... for
   /// as long as there in no instruction that reads memory.  If we see a store
diff --git a/lib/Transforms/Scalar/GVN.cpp b/lib/Transforms/Scalar/GVN.cpp
index 91b27aa7235b0223f87c1f0560dc740a4405c33d..be696df548d52f5681ebe3f0b6844b5f2b1bcb97 100644
--- a/lib/Transforms/Scalar/GVN.cpp
+++ b/lib/Transforms/Scalar/GVN.cpp
@@ -36,7 +36,6 @@
 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
 #include "llvm/Analysis/PHITransAddr.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/GlobalVariable.h"
@@ -51,9 +50,12 @@
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
+#include "llvm/Transforms/Utils/VNCoercion.h"
+
 #include <vector>
 using namespace llvm;
 using namespace llvm::gvn;
+using namespace llvm::VNCoercion;
 using namespace PatternMatch;
 
 #define DEBUG_TYPE "gvn"
@@ -595,6 +597,7 @@ PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) {
   PreservedAnalyses PA;
   PA.preserve<DominatorTreeAnalysis>();
   PA.preserve<GlobalsAA>();
+  PA.preserve<TargetLibraryAnalysis>();
   return PA;
 }
 
@@ -691,442 +694,6 @@ SpeculationFailure:
 }
 
 
-/// Return true if CoerceAvailableValueToLoadType will succeed.
-static bool CanCoerceMustAliasedValueToLoad(Value *StoredVal,
-                                            Type *LoadTy,
-                                            const DataLayout &DL) {
-  // If the loaded or stored value is an first class array or struct, don't try
-  // to transform them.  We need to be able to bitcast to integer.
-  if (LoadTy->isStructTy() || LoadTy->isArrayTy() ||
-      StoredVal->getType()->isStructTy() ||
-      StoredVal->getType()->isArrayTy())
-    return false;
-
-  // The store has to be at least as big as the load.
-  if (DL.getTypeSizeInBits(StoredVal->getType()) <
-        DL.getTypeSizeInBits(LoadTy))
-    return false;
-
-  return true;
-}
-
-/// If we saw a store of a value to memory, and
-/// then a load from a must-aliased pointer of a different type, try to coerce
-/// the stored value.  LoadedTy is the type of the load we want to replace.
-/// IRB is IRBuilder used to insert new instructions.
-///
-/// If we can't do it, return null.
-static Value *CoerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
-                                             IRBuilder<> &IRB,
-                                             const DataLayout &DL) {
-  assert(CanCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
-         "precondition violation - materialization can't fail");
-
-  if (auto *C = dyn_cast<Constant>(StoredVal))
-    if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
-      StoredVal = FoldedStoredVal;
-
-  // If this is already the right type, just return it.
-  Type *StoredValTy = StoredVal->getType();
-
-  uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy);
-  uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
-
-  // If the store and reload are the same size, we can always reuse it.
-  if (StoredValSize == LoadedValSize) {
-    // Pointer to Pointer -> use bitcast.
-    if (StoredValTy->getScalarType()->isPointerTy() &&
-        LoadedTy->getScalarType()->isPointerTy()) {
-      StoredVal = IRB.CreateBitCast(StoredVal, LoadedTy);
-    } else {
-      // Convert source pointers to integers, which can be bitcast.
-      if (StoredValTy->getScalarType()->isPointerTy()) {
-        StoredValTy = DL.getIntPtrType(StoredValTy);
-        StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy);
-      }
-
-      Type *TypeToCastTo = LoadedTy;
-      if (TypeToCastTo->getScalarType()->isPointerTy())
-        TypeToCastTo = DL.getIntPtrType(TypeToCastTo);
-
-      if (StoredValTy != TypeToCastTo)
-        StoredVal = IRB.CreateBitCast(StoredVal, TypeToCastTo);
-
-      // Cast to pointer if the load needs a pointer type.
-      if (LoadedTy->getScalarType()->isPointerTy())
-        StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy);
-    }
-
-    if (auto *C = dyn_cast<ConstantExpr>(StoredVal))
-      if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
-        StoredVal = FoldedStoredVal;
-
-    return StoredVal;
-  }
-
-  // If the loaded value is smaller than the available value, then we can
-  // extract out a piece from it.  If the available value is too small, then we
-  // can't do anything.
-  assert(StoredValSize >= LoadedValSize &&
-         "CanCoerceMustAliasedValueToLoad fail");
-
-  // Convert source pointers to integers, which can be manipulated.
-  if (StoredValTy->getScalarType()->isPointerTy()) {
-    StoredValTy = DL.getIntPtrType(StoredValTy);
-    StoredVal = IRB.CreatePtrToInt(StoredVal, StoredValTy);
-  }
-
-  // Convert vectors and fp to integer, which can be manipulated.
-  if (!StoredValTy->isIntegerTy()) {
-    StoredValTy = IntegerType::get(StoredValTy->getContext(), StoredValSize);
-    StoredVal = IRB.CreateBitCast(StoredVal, StoredValTy);
-  }
-
-  // If this is a big-endian system, we need to shift the value down to the low
-  // bits so that a truncate will work.
-  if (DL.isBigEndian()) {
-    uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy) -
-                        DL.getTypeStoreSizeInBits(LoadedTy);
-    StoredVal = IRB.CreateLShr(StoredVal, ShiftAmt, "tmp");
-  }
-
-  // Truncate the integer to the right size now.
-  Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadedValSize);
-  StoredVal  = IRB.CreateTrunc(StoredVal, NewIntTy, "trunc");
-
-  if (LoadedTy != NewIntTy) {
-    // If the result is a pointer, inttoptr.
-    if (LoadedTy->getScalarType()->isPointerTy())
-      StoredVal = IRB.CreateIntToPtr(StoredVal, LoadedTy, "inttoptr");
-    else
-      // Otherwise, bitcast.
-      StoredVal = IRB.CreateBitCast(StoredVal, LoadedTy, "bitcast");
-  }
-
-  if (auto *C = dyn_cast<Constant>(StoredVal))
-    if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
-      StoredVal = FoldedStoredVal;
-
-  return StoredVal;
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering memory write (store,
-/// memset, memcpy, memmove).  This means that the write *may* provide bits used
-/// by the load but we can't be sure because the pointers don't mustalias.
-///
-/// Check this case to see if there is anything more we can do before we give
-/// up.  This returns -1 if we have to give up, or a byte number in the stored
-/// value of the piece that feeds the load.
-static int AnalyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
-                                          Value *WritePtr,
-                                          uint64_t WriteSizeInBits,
-                                          const DataLayout &DL) {
-  // If the loaded or stored value is a first class array or struct, don't try
-  // to transform them.  We need to be able to bitcast to integer.
-  if (LoadTy->isStructTy() || LoadTy->isArrayTy())
-    return -1;
-
-  int64_t StoreOffset = 0, LoadOffset = 0;
-  Value *StoreBase =
-      GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL);
-  Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, DL);
-  if (StoreBase != LoadBase)
-    return -1;
-
-  // If the load and store are to the exact same address, they should have been
-  // a must alias.  AA must have gotten confused.
-  // FIXME: Study to see if/when this happens.  One case is forwarding a memset
-  // to a load from the base of the memset.
-
-  // If the load and store don't overlap at all, the store doesn't provide
-  // anything to the load.  In this case, they really don't alias at all, AA
-  // must have gotten confused.
-  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy);
-
-  if ((WriteSizeInBits & 7) | (LoadSize & 7))
-    return -1;
-  uint64_t StoreSize = WriteSizeInBits / 8;  // Convert to bytes.
-  LoadSize /= 8;
-
-
-  bool isAAFailure = false;
-  if (StoreOffset < LoadOffset)
-    isAAFailure = StoreOffset+int64_t(StoreSize) <= LoadOffset;
-  else
-    isAAFailure = LoadOffset+int64_t(LoadSize) <= StoreOffset;
-
-  if (isAAFailure)
-    return -1;
-
-  // If the Load isn't completely contained within the stored bits, we don't
-  // have all the bits to feed it.  We could do something crazy in the future
-  // (issue a smaller load then merge the bits in) but this seems unlikely to be
-  // valuable.
-  if (StoreOffset > LoadOffset ||
-      StoreOffset+StoreSize < LoadOffset+LoadSize)
-    return -1;
-
-  // Okay, we can do this transformation.  Return the number of bytes into the
-  // store that the load is.
-  return LoadOffset-StoreOffset;
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering store.
-static int AnalyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
-                                          StoreInst *DepSI) {
-  // Cannot handle reading from store of first-class aggregate yet.
-  if (DepSI->getValueOperand()->getType()->isStructTy() ||
-      DepSI->getValueOperand()->getType()->isArrayTy())
-    return -1;
-
-  const DataLayout &DL = DepSI->getModule()->getDataLayout();
-  Value *StorePtr = DepSI->getPointerOperand();
-  uint64_t StoreSize =DL.getTypeSizeInBits(DepSI->getValueOperand()->getType());
-  return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
-                                        StorePtr, StoreSize, DL);
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being clobbered by another load.  See if
-/// the other load can feed into the second load.
-static int AnalyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr,
-                                         LoadInst *DepLI, const DataLayout &DL){
-  // Cannot handle reading from store of first-class aggregate yet.
-  if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
-    return -1;
-
-  Value *DepPtr = DepLI->getPointerOperand();
-  uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType());
-  int R = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
-  if (R != -1) return R;
-
-  // If we have a load/load clobber an DepLI can be widened to cover this load,
-  // then we should widen it!
-  int64_t LoadOffs = 0;
-  const Value *LoadBase =
-      GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
-  unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
-
-  unsigned Size = MemoryDependenceResults::getLoadLoadClobberFullWidthSize(
-      LoadBase, LoadOffs, LoadSize, DepLI);
-  if (Size == 0) return -1;
-
-  // Check non-obvious conditions enforced by MDA which we rely on for being
-  // able to materialize this potentially available value
-  assert(DepLI->isSimple() && "Cannot widen volatile/atomic load!");
-  assert(DepLI->getType()->isIntegerTy() && "Can't widen non-integer load");
-
-  return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size*8, DL);
-}
-
-
-
-static int AnalyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
-                                            MemIntrinsic *MI,
-                                            const DataLayout &DL) {
-  // If the mem operation is a non-constant size, we can't handle it.
-  ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
-  if (!SizeCst) return -1;
-  uint64_t MemSizeInBits = SizeCst->getZExtValue()*8;
-
-  // If this is memset, we just need to see if the offset is valid in the size
-  // of the memset..
-  if (MI->getIntrinsicID() == Intrinsic::memset)
-    return AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
-                                          MemSizeInBits, DL);
-
-  // If we have a memcpy/memmove, the only case we can handle is if this is a
-  // copy from constant memory.  In that case, we can read directly from the
-  // constant memory.
-  MemTransferInst *MTI = cast<MemTransferInst>(MI);
-
-  Constant *Src = dyn_cast<Constant>(MTI->getSource());
-  if (!Src) return -1;
-
-  GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, DL));
-  if (!GV || !GV->isConstant()) return -1;
-
-  // See if the access is within the bounds of the transfer.
-  int Offset = AnalyzeLoadFromClobberingWrite(LoadTy, LoadPtr,
-                                              MI->getDest(), MemSizeInBits, DL);
-  if (Offset == -1)
-    return Offset;
-
-  unsigned AS = Src->getType()->getPointerAddressSpace();
-  // Otherwise, see if we can constant fold a load from the constant with the
-  // offset applied as appropriate.
-  Src = ConstantExpr::getBitCast(Src,
-                                 Type::getInt8PtrTy(Src->getContext(), AS));
-  Constant *OffsetCst =
-    ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
-  Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
-                                       OffsetCst);
-  Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
-  if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL))
-    return Offset;
-  return -1;
-}
-
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering store.  This means
-/// that the store provides bits used by the load but we the pointers don't
-/// mustalias.  Check this case to see if there is anything more we can do
-/// before we give up.
-static Value *GetStoreValueForLoad(Value *SrcVal, unsigned Offset,
-                                   Type *LoadTy,
-                                   Instruction *InsertPt, const DataLayout &DL){
-  LLVMContext &Ctx = SrcVal->getType()->getContext();
-
-  uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
-  uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
-
-  IRBuilder<> Builder(InsertPt);
-
-  // Compute which bits of the stored value are being used by the load.  Convert
-  // to an integer type to start with.
-  if (SrcVal->getType()->getScalarType()->isPointerTy())
-    SrcVal = Builder.CreatePtrToInt(SrcVal,
-        DL.getIntPtrType(SrcVal->getType()));
-  if (!SrcVal->getType()->isIntegerTy())
-    SrcVal = Builder.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize*8));
-
-  // Shift the bits to the least significant depending on endianness.
-  unsigned ShiftAmt;
-  if (DL.isLittleEndian())
-    ShiftAmt = Offset*8;
-  else
-    ShiftAmt = (StoreSize-LoadSize-Offset)*8;
-
-  if (ShiftAmt)
-    SrcVal = Builder.CreateLShr(SrcVal, ShiftAmt);
-
-  if (LoadSize != StoreSize)
-    SrcVal = Builder.CreateTrunc(SrcVal, IntegerType::get(Ctx, LoadSize*8));
-
-  return CoerceAvailableValueToLoadType(SrcVal, LoadTy, Builder, DL);
-}
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering load.  This means
-/// that the load *may* provide bits used by the load but we can't be sure
-/// because the pointers don't mustalias.  Check this case to see if there is
-/// anything more we can do before we give up.
-static Value *GetLoadValueForLoad(LoadInst *SrcVal, unsigned Offset,
-                                  Type *LoadTy, Instruction *InsertPt,
-                                  GVN &gvn) {
-  const DataLayout &DL = SrcVal->getModule()->getDataLayout();
-  // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
-  // widen SrcVal out to a larger load.
-  unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
-  unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
-  if (Offset+LoadSize > SrcValStoreSize) {
-    assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
-    assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
-    // If we have a load/load clobber an DepLI can be widened to cover this
-    // load, then we should widen it to the next power of 2 size big enough!
-    unsigned NewLoadSize = Offset+LoadSize;
-    if (!isPowerOf2_32(NewLoadSize))
-      NewLoadSize = NextPowerOf2(NewLoadSize);
-
-    Value *PtrVal = SrcVal->getPointerOperand();
-
-    // Insert the new load after the old load.  This ensures that subsequent
-    // memdep queries will find the new load.  We can't easily remove the old
-    // load completely because it is already in the value numbering table.
-    IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
-    Type *DestPTy =
-      IntegerType::get(LoadTy->getContext(), NewLoadSize*8);
-    DestPTy = PointerType::get(DestPTy,
-                               PtrVal->getType()->getPointerAddressSpace());
-    Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
-    PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
-    LoadInst *NewLoad = Builder.CreateLoad(PtrVal);
-    NewLoad->takeName(SrcVal);
-    NewLoad->setAlignment(SrcVal->getAlignment());
-
-    DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
-    DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
-
-    // Replace uses of the original load with the wider load.  On a big endian
-    // system, we need to shift down to get the relevant bits.
-    Value *RV = NewLoad;
-    if (DL.isBigEndian())
-      RV = Builder.CreateLShr(RV, (NewLoadSize - SrcValStoreSize) * 8);
-    RV = Builder.CreateTrunc(RV, SrcVal->getType());
-    SrcVal->replaceAllUsesWith(RV);
-
-    // We would like to use gvn.markInstructionForDeletion here, but we can't
-    // because the load is already memoized into the leader map table that GVN
-    // tracks.  It is potentially possible to remove the load from the table,
-    // but then there all of the operations based on it would need to be
-    // rehashed.  Just leave the dead load around.
-    gvn.getMemDep().removeInstruction(SrcVal);
-    SrcVal = NewLoad;
-  }
-
-  return GetStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, DL);
-}
-
-
-/// This function is called when we have a
-/// memdep query of a load that ends up being a clobbering mem intrinsic.
-static Value *GetMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
-                                     Type *LoadTy, Instruction *InsertPt,
-                                     const DataLayout &DL){
-  LLVMContext &Ctx = LoadTy->getContext();
-  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy)/8;
-
-  IRBuilder<> Builder(InsertPt);
-
-  // We know that this method is only called when the mem transfer fully
-  // provides the bits for the load.
-  if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) {
-    // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
-    // independently of what the offset is.
-    Value *Val = MSI->getValue();
-    if (LoadSize != 1)
-      Val = Builder.CreateZExt(Val, IntegerType::get(Ctx, LoadSize*8));
-
-    Value *OneElt = Val;
-
-    // Splat the value out to the right number of bits.
-    for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize; ) {
-      // If we can double the number of bytes set, do it.
-      if (NumBytesSet*2 <= LoadSize) {
-        Value *ShVal = Builder.CreateShl(Val, NumBytesSet*8);
-        Val = Builder.CreateOr(Val, ShVal);
-        NumBytesSet <<= 1;
-        continue;
-      }
-
-      // Otherwise insert one byte at a time.
-      Value *ShVal = Builder.CreateShl(Val, 1*8);
-      Val = Builder.CreateOr(OneElt, ShVal);
-      ++NumBytesSet;
-    }
-
-    return CoerceAvailableValueToLoadType(Val, LoadTy, Builder, DL);
-  }
-
-  // Otherwise, this is a memcpy/memmove from a constant global.
-  MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
-  Constant *Src = cast<Constant>(MTI->getSource());
-  unsigned AS = Src->getType()->getPointerAddressSpace();
-
-  // Otherwise, see if we can constant fold a load from the constant with the
-  // offset applied as appropriate.
-  Src = ConstantExpr::getBitCast(Src,
-                                 Type::getInt8PtrTy(Src->getContext(), AS));
-  Constant *OffsetCst =
-    ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
-  Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
-                                       OffsetCst);
-  Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
-  return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL);
-}
 
 
 /// Given a set of loads specified by ValuesPerBlock,
@@ -1172,7 +739,7 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
   if (isSimpleValue()) {
     Res = getSimpleValue();
     if (Res->getType() != LoadTy) {
-      Res = GetStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
+      Res = getStoreValueForLoad(Res, Offset, LoadTy, InsertPt, DL);
 
       DEBUG(dbgs() << "GVN COERCED NONLOCAL VAL:\nOffset: " << Offset << "  "
                    << *getSimpleValue() << '\n'
@@ -1183,14 +750,20 @@ Value *AvailableValue::MaterializeAdjustedValue(LoadInst *LI,
     if (Load->getType() == LoadTy && Offset == 0) {
       Res = Load;
     } else {
-      Res = GetLoadValueForLoad(Load, Offset, LoadTy, InsertPt, gvn);
-
+      Res = getLoadValueForLoad(Load, Offset, LoadTy, InsertPt, DL);
+      // We would like to use gvn.markInstructionForDeletion here, but we can't
+      // because the load is already memoized into the leader map table that GVN
+      // tracks.  It is potentially possible to remove the load from the table,
+      // but then all of the operations based on it would need to be
+      // rehashed.  Just leave the dead load around.
+      gvn.getMemDep().removeInstruction(Load);
       DEBUG(dbgs() << "GVN COERCED NONLOCAL LOAD:\nOffset: " << Offset << "  "
                    << *getCoercedLoadValue() << '\n'
-                   << *Res << '\n' << "\n\n\n");
+                   << *Res << '\n'
+                   << "\n\n\n");
     }
   } else if (isMemIntrinValue()) {
-    Res = GetMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
+    Res = getMemInstValueForLoad(getMemIntrinValue(), Offset, LoadTy,
                                  InsertPt, DL);
     DEBUG(dbgs() << "GVN COERCED NONLOCAL MEM INTRIN:\nOffset: " << Offset
                  << "  " << *getMemIntrinValue() << '\n'
@@ -1259,7 +832,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
       // Can't forward from non-atomic to atomic without violating memory model.
       if (Address && LI->isAtomic() <= DepSI->isAtomic()) {
         int Offset =
-          AnalyzeLoadFromClobberingStore(LI->getType(), Address, DepSI);
+          analyzeLoadFromClobberingStore(LI->getType(), Address, DepSI, DL);
         if (Offset != -1) {
           Res = AvailableValue::get(DepSI->getValueOperand(), Offset);
           return true;
@@ -1277,7 +850,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
       // Can't forward from non-atomic to atomic without violating memory model.
       if (DepLI != LI && Address && LI->isAtomic() <= DepLI->isAtomic()) {
         int Offset =
-          AnalyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
+          analyzeLoadFromClobberingLoad(LI->getType(), Address, DepLI, DL);
 
         if (Offset != -1) {
           Res = AvailableValue::getLoad(DepLI, Offset);
@@ -1290,7 +863,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
     // forward a value on from it.
     if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInfo.getInst())) {
       if (Address && !LI->isAtomic()) {
-        int Offset = AnalyzeLoadFromClobberingMemInst(LI->getType(), Address,
+        int Offset = analyzeLoadFromClobberingMemInst(LI->getType(), Address,
                                                       DepMI, DL);
         if (Offset != -1) {
           Res = AvailableValue::getMI(DepMI, Offset);
@@ -1335,7 +908,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
     // different types if we have to. If the stored value is larger or equal to
     // the loaded value, we can reuse it.
     if (S->getValueOperand()->getType() != LI->getType() &&
-        !CanCoerceMustAliasedValueToLoad(S->getValueOperand(),
+        !canCoerceMustAliasedValueToLoad(S->getValueOperand(),
                                          LI->getType(), DL))
       return false;
 
@@ -1352,7 +925,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
     // If the stored value is larger or equal to the loaded value, we can reuse
     // it.
     if (LD->getType() != LI->getType() &&
-        !CanCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
+        !canCoerceMustAliasedValueToLoad(LD, LI->getType(), DL))
       return false;
 
     // Can't forward from non-atomic to atomic without violating memory model.
@@ -1714,7 +1287,7 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
       // If instruction I has debug info, then we should not update it.
       // Also, if I has a null DebugLoc, then it is still potentially incorrect
       // to propagate LI's DebugLoc because LI may not post-dominate I.
-      if (LI->getDebugLoc() && ValuesPerBlock.size() != 1)
+      if (LI->getDebugLoc() && LI->getParent() == I->getParent())
         I->setDebugLoc(LI->getDebugLoc());
     if (V->getType()->getScalarType()->isPointerTy())
       MD->invalidateCachedPointerInfo(V);
@@ -1796,7 +1369,7 @@ static void patchReplacementInstruction(Instruction *I, Value *Repl) {
 
   // Patch the replacement so that it is not more restrictive than the value
   // being replaced.
-  // Note that if 'I' is a load being replaced by some operation, 
+  // Note that if 'I' is a load being replaced by some operation,
   // for example, by an arithmetic operation, then andIRFlags()
   // would just erase all math flags from the original arithmetic
   // operation, which is clearly not wanted and not needed.
@@ -2188,11 +1761,11 @@ bool GVN::processInstruction(Instruction *I) {
 
     for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
          i != e; ++i) {
-      BasicBlock *Dst = i.getCaseSuccessor();
+      BasicBlock *Dst = i->getCaseSuccessor();
       // If there is only a single edge, propagate the case value into it.
       if (SwitchEdges.lookup(Dst) == 1) {
         BasicBlockEdge E(Parent, Dst);
-        Changed |= propagateEquality(SwitchCond, i.getCaseValue(), E, true);
+        Changed |= propagateEquality(SwitchCond, i->getCaseValue(), E, true);
       }
     }
     return Changed;
@@ -2582,21 +2155,12 @@ bool GVN::iterateOnFunction(Function &F) {
 
   // Top-down walk of the dominator tree
   bool Changed = false;
-  // Save the blocks this function have before transformation begins. GVN may
-  // split critical edge, and hence may invalidate the RPO/DT iterator.
-  //
-  std::vector<BasicBlock *> BBVect;
-  BBVect.reserve(256);
   // Needed for value numbering with phi construction to work.
+  // RPOT walks the graph in its constructor and will not be invalidated during
+  // processBlock.
   ReversePostOrderTraversal<Function *> RPOT(&F);
-  for (ReversePostOrderTraversal<Function *>::rpo_iterator RI = RPOT.begin(),
-                                                           RE = RPOT.end();
-       RI != RE; ++RI)
-    BBVect.push_back(*RI);
-
-  for (std::vector<BasicBlock *>::iterator I = BBVect.begin(), E = BBVect.end();
-       I != E; I++)
-    Changed |= processBlock(*I);
+  for (BasicBlock *BB : RPOT)
+    Changed |= processBlock(BB);
 
   return Changed;
 }
@@ -2784,6 +2348,7 @@ public:
 
     AU.addPreserved<DominatorTreeWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.addPreserved<TargetLibraryInfoWrapperPass>();
     AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
   }
 
diff --git a/lib/Transforms/Scalar/GVNHoist.cpp b/lib/Transforms/Scalar/GVNHoist.cpp
index 90c26e13db78af5aabf8457f2133cd54021aeca8..6adfe130d148b6e41ae488cb0f5e41d34dfea3da 100644
--- a/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/lib/Transforms/Scalar/GVNHoist.cpp
@@ -17,16 +17,39 @@
 // is disabled in the following cases.
 // 1. Scalars across calls.
 // 2. geps when corresponding load/store cannot be hoisted.
+//
+// TODO: Hoist from >2 successors. Currently GVNHoist will not hoist stores
+// in this case because it works on two instructions at a time.
+// entry:
+//   switch i32 %c1, label %exit1 [
+//     i32 0, label %sw0
+//     i32 1, label %sw1
+//   ]
+//
+// sw0:
+//   store i32 1, i32* @G
+//   br label %exit
+//
+// sw1:
+//   store i32 1, i32* @G
+//   br label %exit
+//
+// exit1:
+//   store i32 1, i32* @G
+//   ret void
+// exit:
+//   ret void
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Scalar/GVN.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallPtrSet.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
 
 using namespace llvm;
 
@@ -60,7 +83,7 @@ static cl::opt<int>
                    cl::desc("Maximum length of dependent chains to hoist "
                             "(default = 10, unlimited = -1)"));
 
-namespace {
+namespace llvm {
 
 // Provides a sorting function based on the execution order of two instructions.
 struct SortByDFSIn {
@@ -72,13 +95,6 @@ public:
 
   // Returns true when A executes before B.
   bool operator()(const Instruction *A, const Instruction *B) const {
-    // FIXME: libc++ has a std::sort() algorithm that will call the compare
-    // function on the same element.  Once PR20837 is fixed and some more years
-    // pass by and all the buildbots have moved to a corrected std::sort(),
-    // enable the following assert:
-    //
-    // assert(A != B);
-
     const BasicBlock *BA = A->getParent();
     const BasicBlock *BB = B->getParent();
     unsigned ADFS, BDFS;
@@ -200,13 +216,12 @@ static void combineKnownMetadata(Instruction *ReplInst, Instruction *I) {
 class GVNHoist {
 public:
   GVNHoist(DominatorTree *DT, AliasAnalysis *AA, MemoryDependenceResults *MD,
-           MemorySSA *MSSA, bool OptForMinSize)
-      : DT(DT), AA(AA), MD(MD), MSSA(MSSA), OptForMinSize(OptForMinSize),
-        HoistingGeps(OptForMinSize), HoistedCtr(0) {
-      // Hoist as far as possible when optimizing for code-size.
-      if (OptForMinSize)
-        MaxNumberOfBBSInPath = -1;
-  }
+           MemorySSA *MSSA)
+      : DT(DT), AA(AA), MD(MD), MSSA(MSSA),
+        MSSAUpdater(make_unique<MemorySSAUpdater>(MSSA)),
+        HoistingGeps(false),
+        HoistedCtr(0)
+  { }
 
   bool run(Function &F) {
     VN.setDomTree(DT);
@@ -251,10 +266,11 @@ private:
   AliasAnalysis *AA;
   MemoryDependenceResults *MD;
   MemorySSA *MSSA;
-  const bool OptForMinSize;
+  std::unique_ptr<MemorySSAUpdater> MSSAUpdater;
   const bool HoistingGeps;
   DenseMap<const Value *, unsigned> DFSNumber;
   BBSideEffectsSet BBSideEffects;
+  DenseSet<const BasicBlock*> HoistBarrier;
   int HoistedCtr;
 
   enum InsKind { Unknown, Scalar, Load, Store };
@@ -310,8 +326,8 @@ private:
         continue;
       }
 
-      // Check for end of function, calls that do not return, etc.
-      if (!isGuaranteedToTransferExecutionToSuccessor(BB->getTerminator()))
+      // We reached the leaf Basic Block => not all paths have this instruction.
+      if (!BB->getTerminator()->getNumSuccessors())
         return false;
 
       // When reaching the back-edge of a loop, there may be a path through the
@@ -363,7 +379,7 @@ private:
             ReachedNewPt = true;
           }
         }
-        if (defClobbersUseOrDef(Def, MU, *AA))
+        if (MemorySSAUtil::defClobbersUseOrDef(Def, MU, *AA))
           return true;
       }
 
@@ -390,7 +406,8 @@ private:
     // executed between the execution of NewBB and OldBB. Hoisting an expression
     // from OldBB into NewBB has to be safe on all execution paths.
     for (auto I = idf_begin(OldBB), E = idf_end(OldBB); I != E;) {
-      if (*I == NewBB) {
+      const BasicBlock *BB = *I;
+      if (BB == NewBB) {
         // Stop traversal when reaching HoistPt.
         I.skipChildren();
         continue;
@@ -401,11 +418,17 @@ private:
         return true;
 
       // Impossible to hoist with exceptions on the path.
-      if (hasEH(*I))
+      if (hasEH(BB))
+        return true;
+
+      // Instructions after a HoistBarrier in a basic block are never selected
+      // for hoisting, so OldBB's own barrier is irrelevant here; a barrier in
+      // any other block on the path prevents hoisting.
+      if ((BB != OldBB) && HoistBarrier.count(BB))
         return true;
 
       // Check that we do not move a store past loads.
-      if (hasMemoryUse(NewPt, Def, *I))
+      if (hasMemoryUse(NewPt, Def, BB))
         return true;
 
       // -1 is unlimited number of blocks on all paths.
@@ -422,17 +445,18 @@ private:
   // Decrement by 1 NBBsOnAllPaths for each block between HoistPt and BB, and
   // return true when the counter NBBsOnAllPaths reaches 0, except when it is
   // initialized to -1 which is unlimited.
-  bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *BB,
+  bool hasEHOnPath(const BasicBlock *HoistPt, const BasicBlock *SrcBB,
                    int &NBBsOnAllPaths) {
-    assert(DT->dominates(HoistPt, BB) && "Invalid path");
+    assert(DT->dominates(HoistPt, SrcBB) && "Invalid path");
 
     // Walk all basic blocks reachable in depth-first iteration on
     // the inverse CFG from BBInsn to NewHoistPt. These blocks are all the
     // blocks that may be executed between the execution of NewHoistPt and
     // BBInsn. Hoisting an expression from BBInsn into NewHoistPt has to be safe
     // on all execution paths.
-    for (auto I = idf_begin(BB), E = idf_end(BB); I != E;) {
-      if (*I == HoistPt) {
+    for (auto I = idf_begin(SrcBB), E = idf_end(SrcBB); I != E;) {
+      const BasicBlock *BB = *I;
+      if (BB == HoistPt) {
         // Stop traversal when reaching NewHoistPt.
         I.skipChildren();
         continue;
@@ -443,7 +467,13 @@ private:
         return true;
 
       // Impossible to hoist with exceptions on the path.
-      if (hasEH(*I))
+      if (hasEH(BB))
+        return true;
+
+      // Instructions after a hoist barrier are never selected for hoisting, so
+      // a barrier in the source block itself is harmless; a barrier in any
+      // other block on the path prevents hoisting.
+      if ((BB != SrcBB) && HoistBarrier.count(BB))
         return true;
 
       // -1 is unlimited number of blocks on all paths.
@@ -505,11 +535,6 @@ private:
   bool safeToHoistScalar(const BasicBlock *HoistBB,
                          SmallPtrSetImpl<const BasicBlock *> &WL,
                          int &NBBsOnAllPaths) {
-    // Enable scalar hoisting at -Oz as it is safe to hoist scalars to a place
-    // where they are partially needed.
-    if (OptForMinSize)
-      return true;
-
     // Check that the hoisted expression is needed on all paths.
     if (!hoistingFromAllPaths(HoistBB, WL))
       return false;
@@ -634,6 +659,8 @@ private:
       // Compute the insertion point and the list of expressions to be hoisted.
       SmallVecInsn InstructionsToHoist;
       for (auto I : V)
+        // We don't need to check for hoist-barriers here because if
+        // I->getParent() is a barrier then I precedes the barrier.
         if (!hasEH(I->getParent()))
           InstructionsToHoist.push_back(I);
 
@@ -817,9 +844,9 @@ private:
           // legal when the ld/st is not moved past its current definition.
           MemoryAccess *Def = OldMemAcc->getDefiningAccess();
           NewMemAcc =
-              MSSA->createMemoryAccessInBB(Repl, Def, HoistPt, MemorySSA::End);
+            MSSAUpdater->createMemoryAccessInBB(Repl, Def, HoistPt, MemorySSA::End);
           OldMemAcc->replaceAllUsesWith(NewMemAcc);
-          MSSA->removeMemoryAccess(OldMemAcc);
+          MSSAUpdater->removeMemoryAccess(OldMemAcc);
         }
       }
 
@@ -858,7 +885,7 @@ private:
             // Update the uses of the old MSSA access with NewMemAcc.
             MemoryAccess *OldMA = MSSA->getMemoryAccess(I);
             OldMA->replaceAllUsesWith(NewMemAcc);
-            MSSA->removeMemoryAccess(OldMA);
+            MSSAUpdater->removeMemoryAccess(OldMA);
           }
 
           Repl->andIRFlags(I);
@@ -880,7 +907,7 @@ private:
           auto In = Phi->incoming_values();
           if (all_of(In, [&](Use &U) { return U == NewMemAcc; })) {
             Phi->replaceAllUsesWith(NewMemAcc);
-            MSSA->removeMemoryAccess(Phi);
+            MSSAUpdater->removeMemoryAccess(Phi);
           }
         }
       }
@@ -904,6 +931,12 @@ private:
     for (BasicBlock *BB : depth_first(&F.getEntryBlock())) {
       int InstructionNb = 0;
       for (Instruction &I1 : *BB) {
+        // If I1 cannot guarantee progress, subsequent instructions
+        // in BB cannot be hoisted anyways.
+        if (!isGuaranteedToTransferExecutionToSuccessor(&I1)) {
+           HoistBarrier.insert(BB);
+           break;
+        }
         // Only hoist the first instructions in BB up to MaxDepthInBB. Hoisting
         // deeper may increase the register pressure and compilation time.
         if (MaxDepthInBB != -1 && InstructionNb++ >= MaxDepthInBB)
@@ -923,13 +956,8 @@ private:
                 Intr->getIntrinsicID() == Intrinsic::assume)
               continue;
           }
-          if (Call->mayHaveSideEffects()) {
-            if (!OptForMinSize)
-              break;
-            // We may continue hoisting across calls which write to memory.
-            if (Call->mayThrow())
-              break;
-          }
+          if (Call->mayHaveSideEffects())
+            break;
 
           if (Call->isConvergent())
             break;
@@ -971,7 +999,7 @@ public:
     auto &MD = getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
     auto &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
 
-    GVNHoist G(&DT, &AA, &MD, &MSSA, F.optForMinSize());
+    GVNHoist G(&DT, &AA, &MD, &MSSA);
     return G.run(F);
   }
 
@@ -991,7 +1019,7 @@ PreservedAnalyses GVNHoistPass::run(Function &F, FunctionAnalysisManager &AM) {
   AliasAnalysis &AA = AM.getResult<AAManager>(F);
   MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F);
   MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
-  GVNHoist G(&DT, &AA, &MD, &MSSA, F.optForMinSize());
+  GVNHoist G(&DT, &AA, &MD, &MSSA);
   if (!G.run(F))
     return PreservedAnalyses::all();
 
diff --git a/lib/Transforms/Scalar/GuardWidening.cpp b/lib/Transforms/Scalar/GuardWidening.cpp
index f4a0832d7de6c6e40974b43a55841f65036272b5..7019287954a15ff7aac442139dc77c834f3cb33c 100644
--- a/lib/Transforms/Scalar/GuardWidening.cpp
+++ b/lib/Transforms/Scalar/GuardWidening.cpp
@@ -568,8 +568,7 @@ bool GuardWideningImpl::combineRangeChecks(
       return RC.getBase() == CurrentBase && RC.getLength() == CurrentLength;
     };
 
-    std::copy_if(Checks.begin(), Checks.end(),
-                 std::back_inserter(CurrentChecks), IsCurrentCheck);
+    copy_if(Checks, std::back_inserter(CurrentChecks), IsCurrentCheck);
     Checks.erase(remove_if(Checks, IsCurrentCheck), Checks.end());
 
     assert(CurrentChecks.size() != 0 && "We know we have at least one!");
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
index 0aa5f2d9a46563441d981ffffdb18e51b1d3bb80..dcb2a4a0c6e6bba92857e2850a7bdeddd568395e 100644
--- a/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -231,8 +231,9 @@ static bool ConvertToSInt(const APFloat &APF, int64_t &IntVal) {
   bool isExact = false;
   // See if we can convert this to an int64_t
   uint64_t UIntVal;
-  if (APF.convertToInteger(&UIntVal, 64, true, APFloat::rmTowardZero,
-                           &isExact) != APFloat::opOK || !isExact)
+  if (APF.convertToInteger(makeMutableArrayRef(UIntVal), 64, true,
+                           APFloat::rmTowardZero, &isExact) != APFloat::opOK ||
+      !isExact)
     return false;
   IntVal = UIntVal;
   return true;
@@ -906,7 +907,7 @@ class WidenIV {
   SmallVector<NarrowIVDefUse, 8> NarrowIVUsers;
 
   enum ExtendKind { ZeroExtended, SignExtended, Unknown };
-  // A map tracking the kind of extension used to widen each narrow IV 
+  // A map tracking the kind of extension used to widen each narrow IV
   // and narrow IV user.
   // Key: pointer to a narrow IV or IV user.
   // Value: the kind of extension used to widen this Instruction.
@@ -1608,7 +1609,7 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef,
       return;
 
     CmpInst::Predicate P =
-            TrueDest ? Pred : CmpInst::getInversePredicate(Pred);  
+            TrueDest ? Pred : CmpInst::getInversePredicate(Pred);
 
     auto CmpRHSRange = SE->getSignedRange(SE->getSCEV(CmpRHS));
     auto CmpConstrainedLHSRange =
@@ -1634,7 +1635,7 @@ void WidenIV::calculatePostIncRange(Instruction *NarrowDef,
   UpdateRangeFromGuards(NarrowUser);
 
   BasicBlock *NarrowUserBB = NarrowUser->getParent();
-  // If NarrowUserBB is statically unreachable asking dominator queries may 
+  // If NarrowUserBB is statically unreachable asking dominator queries may
   // yield surprising results. (e.g. the block may not have a dom tree node)
   if (!DT->isReachableFromEntry(NarrowUserBB))
     return;
@@ -2152,6 +2153,8 @@ linearFunctionTestReplace(Loop *L,
   Value *CmpIndVar = IndVar;
   const SCEV *IVCount = BackedgeTakenCount;
 
+  assert(L->getLoopLatch() && "Loop no longer in simplified form?");
+
   // If the exiting block is the same as the backedge block, we prefer to
   // compare against the post-incremented value, otherwise we must compare
   // against the preincremented value.
@@ -2376,6 +2379,7 @@ bool IndVarSimplify::run(Loop *L) {
   //    Loop::getCanonicalInductionVariable only supports loops with preheaders,
   //    and we're in trouble if we can't find the induction variable even when
   //    we've manually inserted one.
+  //  - LFTR relies on having a single backedge.
   if (!L->isLoopSimplifyForm())
     return false;
 
diff --git a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index 8e81541c233786afc19457108e6d3dc650a139bf..85db6e5e11052ad483026755ef3c44ab260d64be 100644
--- a/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -446,6 +446,15 @@ struct LoopStructure {
   BasicBlock *LatchExit;
   unsigned LatchBrExitIdx;
 
+  // The loop represented by this instance of LoopStructure is semantically
+  // equivalent to:
+  //
+  // intN_ty inc = IndVarIncreasing ? 1 : -1;
+  // pred_ty predicate = IndVarIncreasing ? ICMP_SLT : ICMP_SGT;
+  //
+  // for (intN_ty iv = IndVarStart; predicate(iv, LoopExitAt); iv = IndVarNext)
+  //   ... body ...
+
   Value *IndVarNext;
   Value *IndVarStart;
   Value *LoopExitAt;
@@ -789,6 +798,10 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
     return None;
   }
 
+  const SCEV *StartNext = IndVarNext->getStart();
+  const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE));
+  const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
+
   ConstantInt *One = ConstantInt::get(IndVarTy, 1);
   // TODO: generalize the predicates here to also match their unsigned variants.
   if (IsIncreasing) {
@@ -809,10 +822,22 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
         return None;
       }
 
+      if (!SE.isLoopEntryGuardedByCond(
+              &L, CmpInst::ICMP_SLT, IndVarStart,
+              SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType())))) {
+        FailureReason = "Induction variable start not bounded by upper limit";
+        return None;
+      }
+
       IRBuilder<> B(Preheader->getTerminator());
       RightValue = B.CreateAdd(RightValue, One);
+    } else {
+      if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SLT, IndVarStart,
+                                       RightSCEV)) {
+        FailureReason = "Induction variable start not bounded by upper limit";
+        return None;
+      }
     }
-
   } else {
     bool FoundExpectedPred =
         (Pred == ICmpInst::ICMP_SGT && LatchBrExitIdx == 1) ||
@@ -831,15 +856,24 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE, BranchProbabilityInfo &BP
         return None;
       }
 
+      if (!SE.isLoopEntryGuardedByCond(
+              &L, CmpInst::ICMP_SGT, IndVarStart,
+              SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType())))) {
+        FailureReason = "Induction variable start not bounded by lower limit";
+        return None;
+      }
+
       IRBuilder<> B(Preheader->getTerminator());
       RightValue = B.CreateSub(RightValue, One);
+    } else {
+      if (!SE.isLoopEntryGuardedByCond(&L, CmpInst::ICMP_SGT, IndVarStart,
+                                       RightSCEV)) {
+        FailureReason = "Induction variable start not bounded by lower limit";
+        return None;
+      }
     }
   }
 
-  const SCEV *StartNext = IndVarNext->getStart();
-  const SCEV *Addend = SE.getNegativeSCEV(IndVarNext->getStepRecurrence(SE));
-  const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
-
   BasicBlock *LatchExit = LatchBr->getSuccessor(LatchBrExitIdx);
 
   assert(SE.getLoopDisposition(LatchCount, &L) ==
diff --git a/lib/Transforms/Scalar/InferAddressSpaces.cpp b/lib/Transforms/Scalar/InferAddressSpaces.cpp
index 16f2fc0a302b229bf5fd5f1df35bd1baa90c841d..5d8701431a2ce7242eeb4877c3995425e41e6623 100644
--- a/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -108,12 +108,12 @@
 using namespace llvm;
 
 namespace {
-static const unsigned UnknownAddressSpace = ~0u;
+static const unsigned UninitializedAddressSpace = ~0u;
 
 using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
 
 /// \brief InferAddressSpaces
-class InferAddressSpaces: public FunctionPass {
+class InferAddressSpaces : public FunctionPass {
   /// Target specific address space which uses of should be replaced if
   /// possible.
   unsigned FlatAddrSpace;
@@ -141,6 +141,8 @@ private:
   void inferAddressSpaces(const std::vector<Value *> &Postorder,
                           ValueToAddrSpaceMapTy *InferredAddrSpace) const;
 
+  bool isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const;
+
   // Changes the flat address expressions in function F to point to specific
   // address spaces if InferredAddrSpace says so. Postorder is the postorder of
   // all flat expressions in the use-def graph of function F.
@@ -153,7 +155,15 @@ private:
     Value *V, std::vector<std::pair<Value *, bool>> *PostorderStack,
     DenseSet<Value *> *Visited) const;
 
+  bool rewriteIntrinsicOperands(IntrinsicInst *II,
+                                Value *OldV, Value *NewV) const;
+  void collectRewritableIntrinsicOperands(
+    IntrinsicInst *II,
+    std::vector<std::pair<Value *, bool>> *PostorderStack,
+    DenseSet<Value *> *Visited) const;
+
   std::vector<Value *> collectFlatAddressExpressions(Function &F) const;
+
   Value *cloneValueWithNewAddressSpace(
     Value *V, unsigned NewAddrSpace,
     const ValueToValueMapTy &ValueWithNewAddrSpace,
@@ -183,6 +193,7 @@ static bool isAddressExpression(const Value &V) {
   case Instruction::BitCast:
   case Instruction::AddrSpaceCast:
   case Instruction::GetElementPtr:
+  case Instruction::Select:
     return true;
   default:
     return false;
@@ -194,7 +205,7 @@ static bool isAddressExpression(const Value &V) {
 // Precondition: V is an address expression.
 static SmallVector<Value *, 2> getPointerOperands(const Value &V) {
   assert(isAddressExpression(V));
-  const Operator& Op = cast<Operator>(V);
+  const Operator &Op = cast<Operator>(V);
   switch (Op.getOpcode()) {
   case Instruction::PHI: {
     auto IncomingValues = cast<PHINode>(Op).incoming_values();
@@ -205,16 +216,64 @@ static SmallVector<Value *, 2> getPointerOperands(const Value &V) {
   case Instruction::AddrSpaceCast:
   case Instruction::GetElementPtr:
     return {Op.getOperand(0)};
+  case Instruction::Select:
+    return {Op.getOperand(1), Op.getOperand(2)};
   default:
     llvm_unreachable("Unexpected instruction type.");
   }
 }
 
+// TODO: Move logic to TTI?
+bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
+                                                  Value *OldV,
+                                                  Value *NewV) const {
+  Module *M = II->getParent()->getParent()->getParent();
+
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::amdgcn_atomic_inc:
+  case Intrinsic::amdgcn_atomic_dec:{
+    const ConstantInt *IsVolatile = dyn_cast<ConstantInt>(II->getArgOperand(4));
+    if (!IsVolatile || !IsVolatile->isNullValue())
+      return false;
+
+    LLVM_FALLTHROUGH;
+  }
+  case Intrinsic::objectsize: {
+    Type *DestTy = II->getType();
+    Type *SrcTy = NewV->getType();
+    Function *NewDecl =
+        Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
+    II->setArgOperand(0, NewV);
+    II->setCalledFunction(NewDecl);
+    return true;
+  }
+  default:
+    return false;
+  }
+}
+
+// TODO: Move logic to TTI?
+void InferAddressSpaces::collectRewritableIntrinsicOperands(
+    IntrinsicInst *II, std::vector<std::pair<Value *, bool>> *PostorderStack,
+    DenseSet<Value *> *Visited) const {
+  switch (II->getIntrinsicID()) {
+  case Intrinsic::objectsize:
+  case Intrinsic::amdgcn_atomic_inc:
+  case Intrinsic::amdgcn_atomic_dec:
+    appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
+                                                 PostorderStack, Visited);
+    break;
+  default:
+    break;
+  }
+}
+
+// Helper used by collectFlatAddressExpressions while walking function F:
 // If V is an unvisited flat address expression, appends V to PostorderStack
 // and marks it as visited.
 void InferAddressSpaces::appendsFlatAddressExpressionToPostorderStack(
-  Value *V, std::vector<std::pair<Value *, bool>> *PostorderStack,
-  DenseSet<Value *> *Visited) const {
+    Value *V, std::vector<std::pair<Value *, bool>> *PostorderStack,
+    DenseSet<Value *> *Visited) const {
   assert(V->getType()->isPointerTy());
   if (isAddressExpression(*V) &&
       V->getType()->getPointerAddressSpace() == FlatAddrSpace) {
@@ -224,18 +283,18 @@ void InferAddressSpaces::appendsFlatAddressExpressionToPostorderStack(
 }
 
 // Returns all flat address expressions in function F. The elements are ordered
-// in postorder.
+// in postorder.
 std::vector<Value *>
 InferAddressSpaces::collectFlatAddressExpressions(Function &F) const {
   // This function implements a non-recursive postorder traversal of a partial
   // use-def graph of function F.
-  std::vector<std::pair<Value*, bool>> PostorderStack;
+  std::vector<std::pair<Value *, bool>> PostorderStack;
   // The set of visited expressions.
-  DenseSet<Value*> Visited;
+  DenseSet<Value *> Visited;
 
   auto PushPtrOperand = [&](Value *Ptr) {
-    appendsFlatAddressExpressionToPostorderStack(
-      Ptr, &PostorderStack, &Visited);
+    appendsFlatAddressExpressionToPostorderStack(Ptr, &PostorderStack,
+                                                 &Visited);
   };
 
   // We only explore address expressions that are reachable from loads and
@@ -249,8 +308,22 @@ InferAddressSpaces::collectFlatAddressExpressions(Function &F) const {
       PushPtrOperand(RMW->getPointerOperand());
     else if (auto *CmpX = dyn_cast<AtomicCmpXchgInst>(&I))
       PushPtrOperand(CmpX->getPointerOperand());
-
-    // TODO: Support intrinsics
+    else if (auto *MI = dyn_cast<MemIntrinsic>(&I)) {
+      // For memset/memcpy/memmove, any pointer operand can be replaced.
+      PushPtrOperand(MI->getRawDest());
+
+      // Handle 2nd operand for memcpy/memmove.
+      if (auto *MTI = dyn_cast<MemTransferInst>(MI))
+        PushPtrOperand(MTI->getRawSource());
+    } else if (auto *II = dyn_cast<IntrinsicInst>(&I))
+      collectRewritableIntrinsicOperands(II, &PostorderStack, &Visited);
+    else if (ICmpInst *Cmp = dyn_cast<ICmpInst>(&I)) {
+      // FIXME: Handle vectors of pointers
+      if (Cmp->getOperand(0)->getType()->isPointerTy()) {
+        PushPtrOperand(Cmp->getOperand(0));
+        PushPtrOperand(Cmp->getOperand(1));
+      }
+    }
   }
 
   std::vector<Value *> Postorder; // The resultant postorder.
@@ -265,8 +338,8 @@ InferAddressSpaces::collectFlatAddressExpressions(Function &F) const {
     // Otherwise, adds its operands to the stack and explores them.
     PostorderStack.back().second = true;
     for (Value *PtrOperand : getPointerOperands(*PostorderStack.back().first)) {
-      appendsFlatAddressExpressionToPostorderStack(
-        PtrOperand, &PostorderStack, &Visited);
+      appendsFlatAddressExpressionToPostorderStack(PtrOperand, &PostorderStack,
+                                                   &Visited);
     }
   }
   return Postorder;
@@ -276,16 +349,22 @@ InferAddressSpaces::collectFlatAddressExpressions(Function &F) const {
 // of OperandUse.get() in the new address space. If the clone is not ready yet,
 // returns an undef in the new address space as a placeholder.
 static Value *operandWithNewAddressSpaceOrCreateUndef(
-  const Use &OperandUse, unsigned NewAddrSpace,
-  const ValueToValueMapTy &ValueWithNewAddrSpace,
-  SmallVectorImpl<const Use *> *UndefUsesToFix) {
+    const Use &OperandUse, unsigned NewAddrSpace,
+    const ValueToValueMapTy &ValueWithNewAddrSpace,
+    SmallVectorImpl<const Use *> *UndefUsesToFix) {
   Value *Operand = OperandUse.get();
+
+  Type *NewPtrTy =
+      Operand->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+
+  if (Constant *C = dyn_cast<Constant>(Operand))
+    return ConstantExpr::getAddrSpaceCast(C, NewPtrTy);
+
   if (Value *NewOperand = ValueWithNewAddrSpace.lookup(Operand))
     return NewOperand;
 
   UndefUsesToFix->push_back(&OperandUse);
-  return UndefValue::get(
-    Operand->getType()->getPointerElementType()->getPointerTo(NewAddrSpace));
+  return UndefValue::get(NewPtrTy);
 }
 
 // Returns a clone of `I` with its operands converted to those specified in
@@ -298,11 +377,11 @@ static Value *operandWithNewAddressSpaceOrCreateUndef(
 // from a pointer whose type already matches. Therefore, this function returns a
 // Value* instead of an Instruction*.
 static Value *cloneInstructionWithNewAddressSpace(
-  Instruction *I, unsigned NewAddrSpace,
-  const ValueToValueMapTy &ValueWithNewAddrSpace,
-  SmallVectorImpl<const Use *> *UndefUsesToFix) {
+    Instruction *I, unsigned NewAddrSpace,
+    const ValueToValueMapTy &ValueWithNewAddrSpace,
+    SmallVectorImpl<const Use *> *UndefUsesToFix) {
   Type *NewPtrType =
-    I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
+      I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
 
   if (I->getOpcode() == Instruction::AddrSpaceCast) {
     Value *Src = I->getOperand(0);
@@ -342,11 +421,16 @@ static Value *cloneInstructionWithNewAddressSpace(
   case Instruction::GetElementPtr: {
     GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
     GetElementPtrInst *NewGEP = GetElementPtrInst::Create(
-      GEP->getSourceElementType(), NewPointerOperands[0],
-      SmallVector<Value *, 4>(GEP->idx_begin(), GEP->idx_end()));
+        GEP->getSourceElementType(), NewPointerOperands[0],
+        SmallVector<Value *, 4>(GEP->idx_begin(), GEP->idx_end()));
     NewGEP->setIsInBounds(GEP->isInBounds());
     return NewGEP;
   }
+  case Instruction::Select: {
+    assert(I->getType()->isPointerTy());
+    return SelectInst::Create(I->getOperand(0), NewPointerOperands[1],
+                              NewPointerOperands[2], "", nullptr, I);
+  }
   default:
     llvm_unreachable("Unexpected opcode");
   }
@@ -370,6 +454,24 @@ static Value *cloneConstantExprWithNewAddressSpace(
     return ConstantExpr::getBitCast(CE->getOperand(0), TargetType);
   }
 
+  if (CE->getOpcode() == Instruction::BitCast) {
+    if (Value *NewOperand = ValueWithNewAddrSpace.lookup(CE->getOperand(0)))
+      return ConstantExpr::getBitCast(cast<Constant>(NewOperand), TargetType);
+    return ConstantExpr::getAddrSpaceCast(CE, TargetType);
+  }
+
+  if (CE->getOpcode() == Instruction::Select) {
+    Constant *Src0 = CE->getOperand(1);
+    Constant *Src1 = CE->getOperand(2);
+    if (Src0->getType()->getPointerAddressSpace() ==
+        Src1->getType()->getPointerAddressSpace()) {
+
+      return ConstantExpr::getSelect(
+          CE->getOperand(0), ConstantExpr::getAddrSpaceCast(Src0, TargetType),
+          ConstantExpr::getAddrSpaceCast(Src1, TargetType));
+    }
+  }
+
   // Computes the operands of the new constant expression.
   SmallVector<Constant *, 4> NewOperands;
   for (unsigned Index = 0; Index < CE->getNumOperands(); ++Index) {
@@ -434,9 +536,9 @@ unsigned InferAddressSpaces::joinAddressSpaces(unsigned AS1,
   if (AS1 == FlatAddrSpace || AS2 == FlatAddrSpace)
     return FlatAddrSpace;
 
-  if (AS1 == UnknownAddressSpace)
+  if (AS1 == UninitializedAddressSpace)
     return AS2;
-  if (AS2 == UnknownAddressSpace)
+  if (AS2 == UninitializedAddressSpace)
     return AS1;
 
   // The join of two different specific address spaces is flat.
@@ -447,9 +549,10 @@ bool InferAddressSpaces::runOnFunction(Function &F) {
   if (skipFunction(F))
     return false;
 
-  const TargetTransformInfo &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+  const TargetTransformInfo &TTI =
+      getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
   FlatAddrSpace = TTI.getFlatAddressSpace();
-  if (FlatAddrSpace == UnknownAddressSpace)
+  if (FlatAddrSpace == UninitializedAddressSpace)
     return false;
 
   // Collects all flat address expressions in postorder.
@@ -466,15 +569,15 @@ bool InferAddressSpaces::runOnFunction(Function &F) {
 }
 
 void InferAddressSpaces::inferAddressSpaces(
-  const std::vector<Value *> &Postorder,
-  ValueToAddrSpaceMapTy *InferredAddrSpace) const {
+    const std::vector<Value *> &Postorder,
+    ValueToAddrSpaceMapTy *InferredAddrSpace) const {
   SetVector<Value *> Worklist(Postorder.begin(), Postorder.end());
   // Initially, all expressions are in the uninitialized address space.
   for (Value *V : Postorder)
-    (*InferredAddrSpace)[V] = UnknownAddressSpace;
+    (*InferredAddrSpace)[V] = UninitializedAddressSpace;
 
   while (!Worklist.empty()) {
-    Value* V = Worklist.pop_back_val();
+    Value *V = Worklist.pop_back_val();
 
     // Tries to update the address space of the stack top according to the
     // address spaces of its operands.
@@ -510,23 +613,53 @@ void InferAddressSpaces::inferAddressSpaces(
 }
 
 Optional<unsigned> InferAddressSpaces::updateAddressSpace(
-  const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) const {
+    const Value &V, const ValueToAddrSpaceMapTy &InferredAddrSpace) const {
   assert(InferredAddrSpace.count(&V));
 
   // The new inferred address space equals the join of the address spaces
   // of all its pointer operands.
-  unsigned NewAS = UnknownAddressSpace;
-  for (Value *PtrOperand : getPointerOperands(V)) {
-    unsigned OperandAS;
-    if (InferredAddrSpace.count(PtrOperand))
-      OperandAS = InferredAddrSpace.lookup(PtrOperand);
+  unsigned NewAS = UninitializedAddressSpace;
+
+  const Operator &Op = cast<Operator>(V);
+  if (Op.getOpcode() == Instruction::Select) {
+    Value *Src0 = Op.getOperand(1);
+    Value *Src1 = Op.getOperand(2);
+
+    auto I = InferredAddrSpace.find(Src0);
+    unsigned Src0AS = (I != InferredAddrSpace.end()) ?
+      I->second : Src0->getType()->getPointerAddressSpace();
+
+    auto J = InferredAddrSpace.find(Src1);
+    unsigned Src1AS = (J != InferredAddrSpace.end()) ?
+      J->second : Src1->getType()->getPointerAddressSpace();
+
+    auto *C0 = dyn_cast<Constant>(Src0);
+    auto *C1 = dyn_cast<Constant>(Src1);
+
+    // If one of the inputs is a constant, we may be able to do a constant
+    // addrspacecast of it. Defer inferring the address space until the input
+    // address space is known.
+    if ((C1 && Src0AS == UninitializedAddressSpace) ||
+        (C0 && Src1AS == UninitializedAddressSpace))
+      return None;
+
+    if (C0 && isSafeToCastConstAddrSpace(C0, Src1AS))
+      NewAS = Src1AS;
+    else if (C1 && isSafeToCastConstAddrSpace(C1, Src0AS))
+      NewAS = Src0AS;
     else
-      OperandAS = PtrOperand->getType()->getPointerAddressSpace();
-    NewAS = joinAddressSpaces(NewAS, OperandAS);
-
-    // join(flat, *) = flat. So we can break if NewAS is already flat.
-    if (NewAS == FlatAddrSpace)
-      break;
+      NewAS = joinAddressSpaces(Src0AS, Src1AS);
+  } else {
+    for (Value *PtrOperand : getPointerOperands(V)) {
+      auto I = InferredAddrSpace.find(PtrOperand);
+      unsigned OperandAS = I != InferredAddrSpace.end() ?
+        I->second : PtrOperand->getType()->getPointerAddressSpace();
+
+      // join(flat, *) = flat. So we can break if NewAS is already flat.
+      NewAS = joinAddressSpaces(NewAS, OperandAS);
+      if (NewAS == FlatAddrSpace)
+        break;
+    }
   }
 
   unsigned OldAS = InferredAddrSpace.lookup(&V);
@@ -560,6 +693,93 @@ static bool isSimplePointerUseValidToReplace(Use &U) {
   return false;
 }
 
+/// Update memory intrinsic uses that require more complex processing than
+/// simple memory instructions. These require re-mangling and may have multiple
+/// pointer operands.
+static bool handleMemIntrinsicPtrUse(MemIntrinsic *MI, Value *OldV,
+                                     Value *NewV) {
+  IRBuilder<> B(MI);
+  MDNode *TBAA = MI->getMetadata(LLVMContext::MD_tbaa);
+  MDNode *ScopeMD = MI->getMetadata(LLVMContext::MD_alias_scope);
+  MDNode *NoAliasMD = MI->getMetadata(LLVMContext::MD_noalias);
+
+  if (auto *MSI = dyn_cast<MemSetInst>(MI)) {
+    B.CreateMemSet(NewV, MSI->getValue(),
+                   MSI->getLength(), MSI->getAlignment(),
+                   false, // isVolatile
+                   TBAA, ScopeMD, NoAliasMD);
+  } else if (auto *MTI = dyn_cast<MemTransferInst>(MI)) {
+    Value *Src = MTI->getRawSource();
+    Value *Dest = MTI->getRawDest();
+
+    // Be careful in case this is a self-to-self copy.
+    if (Src == OldV)
+      Src = NewV;
+
+    if (Dest == OldV)
+      Dest = NewV;
+
+    if (isa<MemCpyInst>(MTI)) {
+      MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
+      B.CreateMemCpy(Dest, Src, MTI->getLength(),
+                     MTI->getAlignment(),
+                     false, // isVolatile
+                     TBAA, TBAAStruct, ScopeMD, NoAliasMD);
+    } else {
+      assert(isa<MemMoveInst>(MTI));
+      B.CreateMemMove(Dest, Src, MTI->getLength(),
+                      MTI->getAlignment(),
+                      false, // isVolatile
+                      TBAA, ScopeMD, NoAliasMD);
+    }
+  } else
+    llvm_unreachable("unhandled MemIntrinsic");
+
+  MI->eraseFromParent();
+  return true;
+}
+
+// \returns true if it is OK to change the address space of constant \p C with
+// a ConstantExpr addrspacecast.
+bool InferAddressSpaces::isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const {
+  assert(NewAS != UninitializedAddressSpace);
+
+  unsigned SrcAS = C->getType()->getPointerAddressSpace();
+  if (SrcAS == NewAS || isa<UndefValue>(C))
+    return true;
+
+  // Prevent illegal casts between different non-flat address spaces.
+  if (SrcAS != FlatAddrSpace && NewAS != FlatAddrSpace)
+    return false;
+
+  if (isa<ConstantPointerNull>(C))
+    return true;
+
+  if (auto *Op = dyn_cast<Operator>(C)) {
+    // If we already have a constant addrspacecast, it should be safe to cast it
+    // off.
+    if (Op->getOpcode() == Instruction::AddrSpaceCast)
+      return isSafeToCastConstAddrSpace(cast<Constant>(Op->getOperand(0)), NewAS);
+
+    if (Op->getOpcode() == Instruction::IntToPtr &&
+        Op->getType()->getPointerAddressSpace() == FlatAddrSpace)
+      return true;
+  }
+
+  return false;
+}
+
+static Value::use_iterator skipToNextUser(Value::use_iterator I,
+                                          Value::use_iterator End) {
+  User *CurUser = I->getUser();
+  ++I;
+
+  while (I != End && I->getUser() == CurUser)
+    ++I;
+
+  return I;
+}
+
 bool InferAddressSpaces::rewriteWithNewAddressSpaces(
   const std::vector<Value *> &Postorder,
   const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const {
@@ -581,7 +801,7 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces(
     return false;
 
   // Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace.
-  for (const Use* UndefUse : UndefUsesToFix) {
+  for (const Use *UndefUse : UndefUsesToFix) {
     User *V = UndefUse->getUser();
     User *NewV = cast<User>(ValueWithNewAddrSpace.lookup(V));
     unsigned OperandNo = UndefUse->getOperandNo();
@@ -595,40 +815,82 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces(
     if (NewV == nullptr)
       continue;
 
-    SmallVector<Use *, 4> Uses;
-    for (Use &U : V->uses())
-      Uses.push_back(&U);
-
     DEBUG(dbgs() << "Replacing the uses of " << *V
                  << "\n  with\n  " << *NewV << '\n');
 
-    for (Use *U : Uses) {
-      if (isSimplePointerUseValidToReplace(*U)) {
+    Value::use_iterator I, E, Next;
+    for (I = V->use_begin(), E = V->use_end(); I != E; ) {
+      Use &U = *I;
+
+      // Some users may see the same pointer operand in multiple operands. Skip
+      // to the next instruction.
+      I = skipToNextUser(I, E);
+
+      if (isSimplePointerUseValidToReplace(U)) {
         // If V is used as the pointer operand of a compatible memory operation,
         // sets the pointer operand to NewV. This replacement does not change
         // the element type, so the resultant load/store is still valid.
-        U->set(NewV);
-      } else if (isa<Instruction>(U->getUser())) {
+        U.set(NewV);
+        continue;
+      }
+
+      User *CurUser = U.getUser();
+      // Handle more complex cases like intrinsic that need to be remangled.
+      if (auto *MI = dyn_cast<MemIntrinsic>(CurUser)) {
+        if (!MI->isVolatile() && handleMemIntrinsicPtrUse(MI, V, NewV))
+          continue;
+      }
+
+      if (auto *II = dyn_cast<IntrinsicInst>(CurUser)) {
+        if (rewriteIntrinsicOperands(II, V, NewV))
+          continue;
+      }
+
+      if (isa<Instruction>(CurUser)) {
+        if (ICmpInst *Cmp = dyn_cast<ICmpInst>(CurUser)) {
+          // If we can infer that both pointers are in the same addrspace,
+          // transform e.g.
+          //   %cmp = icmp eq float* %p, %q
+          // into
+          //   %cmp = icmp eq float addrspace(3)* %new_p, %new_q
+
+          unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+          int SrcIdx = U.getOperandNo();
+          int OtherIdx = (SrcIdx == 0) ? 1 : 0;
+          Value *OtherSrc = Cmp->getOperand(OtherIdx);
+
+          if (Value *OtherNewV = ValueWithNewAddrSpace.lookup(OtherSrc)) {
+            if (OtherNewV->getType()->getPointerAddressSpace() == NewAS) {
+              Cmp->setOperand(OtherIdx, OtherNewV);
+              Cmp->setOperand(SrcIdx, NewV);
+              continue;
+            }
+          }
+
+          // Even if the type mismatches, we can cast the constant.
+          if (auto *KOtherSrc = dyn_cast<Constant>(OtherSrc)) {
+            if (isSafeToCastConstAddrSpace(KOtherSrc, NewAS)) {
+              Cmp->setOperand(SrcIdx, NewV);
+              Cmp->setOperand(OtherIdx,
+                ConstantExpr::getAddrSpaceCast(KOtherSrc, NewV->getType()));
+              continue;
+            }
+          }
+        }
+
         // Otherwise, replaces the use with flat(NewV).
-        // TODO: Some optimization opportunities are missed. For example, in
-        //   %0 = icmp eq float* %p, %q
-        // if both p and q are inferred to be shared, we can rewrite %0 as
-        //   %0 = icmp eq float addrspace(3)* %new_p, %new_q
-        // instead of currently
-        //   %flat_p = addrspacecast float addrspace(3)* %new_p to float*
-        //   %flat_q = addrspacecast float addrspace(3)* %new_q to float*
-        //   %0 = icmp eq float* %flat_p, %flat_q
         if (Instruction *I = dyn_cast<Instruction>(V)) {
           BasicBlock::iterator InsertPos = std::next(I->getIterator());
           while (isa<PHINode>(InsertPos))
             ++InsertPos;
-          U->set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
+          U.set(new AddrSpaceCastInst(NewV, V->getType(), "", &*InsertPos));
         } else {
-          U->set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
-                                                V->getType()));
+          U.set(ConstantExpr::getAddrSpaceCast(cast<Constant>(NewV),
+                                               V->getType()));
         }
       }
     }
+
     if (V->use_empty())
       RecursivelyDeleteTriviallyDeadInstructions(V);
   }
diff --git a/lib/Transforms/Scalar/JumpThreading.cpp b/lib/Transforms/Scalar/JumpThreading.cpp
index ecc6a0afbad89f03f22be143a1c11d85cc5fd849..08eb95a1a3d3e8396777b85b57dd39f433be4f01 100644
--- a/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/lib/Transforms/Scalar/JumpThreading.cpp
@@ -17,6 +17,7 @@
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/CFG.h"
 #include "llvm/Analysis/BlockFrequencyInfoImpl.h"
@@ -30,11 +31,13 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Metadata.h"
+#include "llvm/IR/PatternMatch.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include <algorithm>
@@ -89,6 +92,7 @@ namespace {
     bool runOnFunction(Function &F) override;
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<AAResultsWrapperPass>();
       AU.addRequired<LazyValueInfoWrapperPass>();
       AU.addPreserved<LazyValueInfoWrapperPass>();
       AU.addPreserved<GlobalsAAWrapperPass>();
@@ -104,6 +108,7 @@ INITIALIZE_PASS_BEGIN(JumpThreading, "jump-threading",
                 "Jump Threading", false, false)
 INITIALIZE_PASS_DEPENDENCY(LazyValueInfoWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
 INITIALIZE_PASS_END(JumpThreading, "jump-threading",
                 "Jump Threading", false, false)
 
@@ -121,6 +126,7 @@ bool JumpThreading::runOnFunction(Function &F) {
     return false;
   auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
   auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
+  auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
   std::unique_ptr<BlockFrequencyInfo> BFI;
   std::unique_ptr<BranchProbabilityInfo> BPI;
   bool HasProfileData = F.getEntryCount().hasValue();
@@ -129,7 +135,8 @@ bool JumpThreading::runOnFunction(Function &F) {
     BPI.reset(new BranchProbabilityInfo(F, LI));
     BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
   }
-  return Impl.runImpl(F, TLI, LVI, HasProfileData, std::move(BFI),
+
+  return Impl.runImpl(F, TLI, LVI, AA, HasProfileData, std::move(BFI),
                       std::move(BPI));
 }
 
@@ -138,6 +145,8 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
 
   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
   auto &LVI = AM.getResult<LazyValueAnalysis>(F);
+  auto &AA = AM.getResult<AAManager>(F);
+
   std::unique_ptr<BlockFrequencyInfo> BFI;
   std::unique_ptr<BranchProbabilityInfo> BPI;
   bool HasProfileData = F.getEntryCount().hasValue();
@@ -146,8 +155,9 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
     BPI.reset(new BranchProbabilityInfo(F, LI));
     BFI.reset(new BlockFrequencyInfo(F, *BPI, LI));
   }
-  bool Changed =
-      runImpl(F, &TLI, &LVI, HasProfileData, std::move(BFI), std::move(BPI));
+
+  bool Changed = runImpl(F, &TLI, &LVI, &AA, HasProfileData, std::move(BFI),
+                         std::move(BPI));
 
   if (!Changed)
     return PreservedAnalyses::all();
@@ -157,18 +167,23 @@ PreservedAnalyses JumpThreadingPass::run(Function &F,
 }
 
 bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
-                                LazyValueInfo *LVI_, bool HasProfileData_,
+                                LazyValueInfo *LVI_, AliasAnalysis *AA_,
+                                bool HasProfileData_,
                                 std::unique_ptr<BlockFrequencyInfo> BFI_,
                                 std::unique_ptr<BranchProbabilityInfo> BPI_) {
 
   DEBUG(dbgs() << "Jump threading on function '" << F.getName() << "'\n");
   TLI = TLI_;
   LVI = LVI_;
+  AA = AA_;
   BFI.reset();
   BPI.reset();
   // When profile data is available, we need to update edge weights after
   // successful jump threading, which requires both BPI and BFI being available.
   HasProfileData = HasProfileData_;
+  auto *GuardDecl = F.getParent()->getFunction(
+      Intrinsic::getName(Intrinsic::experimental_guard));
+  HasGuards = GuardDecl && !GuardDecl->use_empty();
   if (HasProfileData) {
     BPI = std::move(BPI_);
     BFI = std::move(BFI_);
@@ -222,26 +237,13 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
           BB != &BB->getParent()->getEntryBlock() &&
           // If the terminator is the only non-phi instruction, try to nuke it.
           BB->getFirstNonPHIOrDbg()->isTerminator() && !LoopHeaders.count(BB)) {
-        // Since TryToSimplifyUncondBranchFromEmptyBlock may delete the
-        // block, we have to make sure it isn't in the LoopHeaders set.  We
-        // reinsert afterward if needed.
-        bool ErasedFromLoopHeaders = LoopHeaders.erase(BB);
-        BasicBlock *Succ = BI->getSuccessor(0);
-
         // FIXME: It is always conservatively correct to drop the info
         // for a block even if it doesn't get erased.  This isn't totally
         // awesome, but it allows us to use AssertingVH to prevent nasty
         // dangling pointer issues within LazyValueInfo.
         LVI->eraseBlock(BB);
-        if (TryToSimplifyUncondBranchFromEmptyBlock(BB)) {
+        if (TryToSimplifyUncondBranchFromEmptyBlock(BB))
           Changed = true;
-          // If we deleted BB and BB was the header of a loop, then the
-          // successor is now the header of the loop.
-          BB = Succ;
-        }
-
-        if (ErasedFromLoopHeaders)
-          LoopHeaders.insert(BB);
       }
     }
     EverChanged |= Changed;
@@ -251,10 +253,13 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
   return EverChanged;
 }
 
-/// getJumpThreadDuplicationCost - Return the cost of duplicating this block to
-/// thread across it. Stop scanning the block when passing the threshold.
-static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
+/// Return the cost of duplicating the instructions of this block from the
+/// first non-phi up to (but excluding) StopAt in order to thread across it.
+/// Stops once the threshold is exceeded; returns ~0U if it cannot duplicate.
+static unsigned getJumpThreadDuplicationCost(BasicBlock *BB,
+                                             Instruction *StopAt,
                                              unsigned Threshold) {
+  assert(StopAt->getParent() == BB && "Not an instruction from proper BB?");
   /// Ignore PHI nodes, these will be flattened when duplication happens.
   BasicBlock::const_iterator I(BB->getFirstNonPHI());
 
@@ -262,15 +267,17 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
   // branch, so they shouldn't count against the duplication cost.
 
   unsigned Bonus = 0;
-  const TerminatorInst *BBTerm = BB->getTerminator();
-  // Threading through a switch statement is particularly profitable.  If this
-  // block ends in a switch, decrease its cost to make it more likely to happen.
-  if (isa<SwitchInst>(BBTerm))
-    Bonus = 6;
-
-  // The same holds for indirect branches, but slightly more so.
-  if (isa<IndirectBrInst>(BBTerm))
-    Bonus = 8;
+  if (BB->getTerminator() == StopAt) {
+    // Threading through a switch statement is particularly profitable.  If this
+    // block ends in a switch, decrease its cost to make it more likely to
+    // happen.
+    if (isa<SwitchInst>(StopAt))
+      Bonus = 6;
+
+    // The same holds for indirect branches, but slightly more so.
+    if (isa<IndirectBrInst>(StopAt))
+      Bonus = 8;
+  }
 
   // Bump the threshold up so the early exit from the loop doesn't skip the
   // terminator-based Size adjustment at the end.
@@ -279,7 +286,7 @@ static unsigned getJumpThreadDuplicationCost(const BasicBlock *BB,
   // Sum up the cost of each instruction until we get to the terminator.  Don't
   // include the terminator because the copy won't include it.
   unsigned Size = 0;
-  for (; !isa<TerminatorInst>(I); ++I) {
+  for (; &*I != StopAt; ++I) {
 
     // Stop scanning the block if we've reached the threshold.
     if (Size > Threshold)
@@ -725,6 +732,10 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
   if (TryToUnfoldSelectInCurrBB(BB))
     return true;
 
+  // Look if we can propagate guards to predecessors.
+  if (HasGuards && ProcessGuards(BB))
+    return true;
+
   // What kind of constant we're looking for.
   ConstantPreference Preference = WantInteger;
 
@@ -800,7 +811,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
     return false;
   }
 
-
   if (CmpInst *CondCmp = dyn_cast<CmpInst>(CondInst)) {
     // If we're branching on a conditional, LVI might be able to determine
     // it's value at the branch instruction.  We only handle comparisons
@@ -808,7 +818,12 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
     // TODO: This should be extended to handle switches as well.
     BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
     Constant *CondConst = dyn_cast<Constant>(CondCmp->getOperand(1));
-    if (CondBr && CondConst && CondBr->isConditional()) {
+    if (CondBr && CondConst) {
+      // We should have returned as soon as we turned a conditional branch
+      // into an unconditional one, because it is no longer interesting as far
+      // as jump threading is concerned.
+      assert(CondBr->isConditional() && "Threading on unconditional terminator");
+
       LazyValueInfo::Tristate Ret =
         LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
                             CondConst, CondBr);
@@ -831,10 +846,12 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
         }
         return true;
       }
-    }
 
-    if (CondBr && CondConst && TryToUnfoldSelect(CondCmp, BB))
-      return true;
+      // We did not manage to simplify this branch, try to see whether
+      // CondCmp depends on a known phi-select pattern.
+      if (TryToUnfoldSelect(CondCmp, BB))
+        return true;
+    }
   }
 
   // Check for some cases that are worth simplifying.  Right now we want to look
@@ -853,7 +870,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
     if (SimplifyPartiallyRedundantLoad(LI))
       return true;
 
-
   // Handle a variety of cases where we are branching on something derived from
   // a PHI node in the current block.  If we can prove that any predecessors
   // compute a predictable value based on a PHI node, thread those predecessors.
@@ -867,7 +883,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
     if (PN->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
       return ProcessBranchOnPHI(PN);
 
-
   // If this is an otherwise-unfoldable branch on a XOR, see if we can simplify.
   if (CondInst->getOpcode() == Instruction::Xor &&
       CondInst->getParent() == BB && isa<BranchInst>(BB->getTerminator()))
@@ -916,6 +931,14 @@ bool JumpThreadingPass::ProcessImpliedCondition(BasicBlock *BB) {
   return false;
 }
 
+/// Report whether \p Op is an Instruction whose parent block is \p BB.
+/// Any value that is not an Instruction yields false.
+static bool isOpDefinedInBlock(Value *Op, BasicBlock *BB) {
+  // dyn_cast produces null when Op is not an Instruction.
+  auto *Def = dyn_cast<Instruction>(Op);
+  return Def && Def->getParent() == BB;
+}
+
 /// SimplifyPartiallyRedundantLoad - If LI is an obviously partially redundant
 /// load instruction, eliminate it by replacing it with a PHI node.  This is an
 /// important optimization that encourages jump threading, and needs to be run
@@ -938,18 +961,17 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
 
   Value *LoadedPtr = LI->getOperand(0);
 
-  // If the loaded operand is defined in the LoadBB, it can't be available.
-  // TODO: Could do simple PHI translation, that would be fun :)
-  if (Instruction *PtrOp = dyn_cast<Instruction>(LoadedPtr))
-    if (PtrOp->getParent() == LoadBB)
-      return false;
+  // If the loaded operand is defined in the LoadBB and it's not a phi,
+  // it can't be available in predecessors.
+  if (isOpDefinedInBlock(LoadedPtr, LoadBB) && !isa<PHINode>(LoadedPtr))
+    return false;
 
   // Scan a few instructions up from the load, to see if it is obviously live at
   // the entry to its block.
   BasicBlock::iterator BBIt(LI);
   bool IsLoadCSE;
-  if (Value *AvailableVal =
-        FindAvailableLoadedValue(LI, LoadBB, BBIt, DefMaxInstsToScan, nullptr, &IsLoadCSE)) {
+  if (Value *AvailableVal = FindAvailableLoadedValue(
+          LI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) {
     // If the value of the load is locally available within the block, just use
     // it.  This frequently occurs for reg2mem'd allocas.
 
@@ -993,12 +1015,34 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
     if (!PredsScanned.insert(PredBB).second)
       continue;
 
-    // Scan the predecessor to see if the value is available in the pred.
     BBIt = PredBB->end();
-    Value *PredAvailable = FindAvailableLoadedValue(LI, PredBB, BBIt,
-                                                    DefMaxInstsToScan,
-                                                    nullptr,
-                                                    &IsLoadCSE);
+    unsigned NumScanedInst = 0;
+    Value *PredAvailable = nullptr;
+    // NOTE: We don't CSE load that is volatile or anything stronger than
+    // unordered, that should have been checked when we entered the function.
+    assert(LI->isUnordered() && "Attempting to CSE volatile or atomic loads");
+    // If this is a load on a phi pointer, phi-translate it and search
+    // for available load/store to the pointer in predecessors.
+    Value *Ptr = LoadedPtr->DoPHITranslation(LoadBB, PredBB);
+    PredAvailable = FindAvailablePtrLoadStore(
+        Ptr, LI->getType(), LI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan,
+        AA, &IsLoadCSE, &NumScanedInst);
+
+    // If PredBB has a single predecessor, continue scanning through the
+    // single predecessor.
+    BasicBlock *SinglePredBB = PredBB;
+    while (!PredAvailable && SinglePredBB && BBIt == SinglePredBB->begin() &&
+           NumScanedInst < DefMaxInstsToScan) {
+      SinglePredBB = SinglePredBB->getSinglePredecessor();
+      if (SinglePredBB) {
+        BBIt = SinglePredBB->end();
+        PredAvailable = FindAvailablePtrLoadStore(
+            Ptr, LI->getType(), LI->isAtomic(), SinglePredBB, BBIt,
+            (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE,
+            &NumScanedInst);
+      }
+    }
+
     if (!PredAvailable) {
       OneUnavailablePred = PredBB;
       continue;
@@ -1058,10 +1102,10 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LI) {
   if (UnavailablePred) {
     assert(UnavailablePred->getTerminator()->getNumSuccessors() == 1 &&
            "Can't handle critical edge here!");
-    LoadInst *NewVal =
-        new LoadInst(LoadedPtr, LI->getName() + ".pr", false,
-                     LI->getAlignment(), LI->getOrdering(), LI->getSynchScope(),
-                     UnavailablePred->getTerminator());
+    LoadInst *NewVal = new LoadInst(
+        LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
+        LI->getName() + ".pr", false, LI->getAlignment(), LI->getOrdering(),
+        LI->getSynchScope(), UnavailablePred->getTerminator());
     NewVal->setDebugLoc(LI->getDebugLoc());
     if (AATags)
       NewVal->setAAMetadata(AATags);
@@ -1225,7 +1269,7 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
     else if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()))
       DestBB = BI->getSuccessor(cast<ConstantInt>(Val)->isZero());
     else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
-      DestBB = SI->findCaseValue(cast<ConstantInt>(Val)).getCaseSuccessor();
+      DestBB = SI->findCaseValue(cast<ConstantInt>(Val))->getCaseSuccessor();
     } else {
       assert(isa<IndirectBrInst>(BB->getTerminator())
               && "Unexpected terminator");
@@ -1464,7 +1508,8 @@ bool JumpThreadingPass::ThreadEdge(BasicBlock *BB,
     return false;
   }
 
-  unsigned JumpThreadCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+  unsigned JumpThreadCost =
+      getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
   if (JumpThreadCost > BBDupThreshold) {
     DEBUG(dbgs() << "  Not threading BB '" << BB->getName()
           << "' - Cost is too high: " << JumpThreadCost << "\n");
@@ -1752,7 +1797,8 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
     return false;
   }
 
-  unsigned DuplicationCost = getJumpThreadDuplicationCost(BB, BBDupThreshold);
+  unsigned DuplicationCost =
+      getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
   if (DuplicationCost > BBDupThreshold) {
     DEBUG(dbgs() << "  Not duplicating BB '" << BB->getName()
           << "' - Cost is too high: " << DuplicationCost << "\n");
@@ -1884,10 +1930,10 @@ bool JumpThreadingPass::DuplicateCondBranchOnPHIIntoPred(
 /// TryToUnfoldSelect - Look for blocks of the form
 /// bb1:
 ///   %a = select
-///   br bb
+///   br bb2
 ///
 /// bb2:
-///   %p = phi [%a, %bb] ...
+///   %p = phi [%a, %bb1] ...
 ///   %c = icmp %p
 ///   br i1 %c
 ///
@@ -2017,3 +2063,130 @@ bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
   
   return false;
 }
+
+/// Try to propagate a guard from the current BB into one of its predecessors
+/// when the branch leading to the other predecessor implies that the
+/// condition of this guard is always true. Currently we only process the
+/// simplest case that looks like:
+///
+/// Start:
+///   %cond = ...
+///   br i1 %cond, label %T1, label %F1
+/// T1:
+///   br label %Merge
+/// F1:
+///   br label %Merge
+/// Merge:
+///   %condGuard = ...
+///   call void(i1, ...) @llvm.experimental.guard( i1 %condGuard )[ "deopt"() ]
+///
+/// where either cond or !cond implies condGuard. In this case all the
+/// instructions before the guard can be duplicated in both branches, and the
+/// guard is then threaded to one of them.
+bool JumpThreadingPass::ProcessGuards(BasicBlock *BB) {
+  using namespace PatternMatch;
+  // We only want to deal with two predecessors.
+  BasicBlock *Pred1, *Pred2;
+  auto PI = pred_begin(BB), PE = pred_end(BB);
+  if (PI == PE)
+    return false;
+  Pred1 = *PI++;
+  if (PI == PE)
+    return false;
+  Pred2 = *PI++;
+  if (PI != PE)
+    return false;
+  if (Pred1 == Pred2)
+    return false;
+
+  // Try to thread one of the guards of the block.
+  // TODO: Look up deeper than to immediate predecessor?
+  auto *Parent = Pred1->getSinglePredecessor();
+  if (!Parent || Parent != Pred2->getSinglePredecessor())
+    return false;
+
+  if (auto *BI = dyn_cast<BranchInst>(Parent->getTerminator()))
+    for (auto &I : *BB)
+      if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>()))
+        if (ThreadGuard(BB, cast<IntrinsicInst>(&I), BI))
+          return true;
+
+  return false;
+}
+
+/// Try to propagate the guard from BB, which is the lower block of a diamond,
+/// to one of its branches, when the diamond's condition (or its negation)
+/// implies the guard's condition.
+bool JumpThreadingPass::ThreadGuard(BasicBlock *BB, IntrinsicInst *Guard,
+                                    BranchInst *BI) {
+  assert(BI->getNumSuccessors() == 2 && "Wrong number of successors?");
+  assert(BI->isConditional() && "Unconditional branch has 2 successors?");
+  Value *GuardCond = Guard->getArgOperand(0);
+  Value *BranchCond = BI->getCondition();
+  BasicBlock *TrueDest = BI->getSuccessor(0);
+  BasicBlock *FalseDest = BI->getSuccessor(1);
+
+  auto &DL = BB->getModule()->getDataLayout();
+  bool TrueDestIsSafe = false;
+  bool FalseDestIsSafe = false;
+
+  // True dest is safe if BranchCond => GuardCond.
+  auto Impl = isImpliedCondition(BranchCond, GuardCond, DL);
+  if (Impl && *Impl)
+    TrueDestIsSafe = true;
+  else {
+    // False dest is safe if !BranchCond => GuardCond.
+    Impl =
+        isImpliedCondition(BranchCond, GuardCond, DL, /* InvertAPred */ true);
+    if (Impl && *Impl)
+      FalseDestIsSafe = true;
+  }
+
+  if (!TrueDestIsSafe && !FalseDestIsSafe)
+    return false;
+
+  BasicBlock *UnguardedBlock = TrueDestIsSafe ? TrueDest : FalseDest;
+  BasicBlock *GuardedBlock = FalseDestIsSafe ? TrueDest : FalseDest;
+
+  ValueToValueMapTy UnguardedMapping, GuardedMapping;
+  Instruction *AfterGuard = Guard->getNextNode();
+  unsigned Cost = getJumpThreadDuplicationCost(BB, AfterGuard, BBDupThreshold);
+  if (Cost > BBDupThreshold)
+    return false;
+  // Duplicate all instructions before the guard and the guard itself to the
+  // branch where implication is not proved.
+  GuardedBlock = DuplicateInstructionsInSplitBetween(
+      BB, GuardedBlock, AfterGuard, GuardedMapping);
+  assert(GuardedBlock && "Could not create the guarded block?");
+  // Duplicate all instructions before the guard in the unguarded branch.
+  // Since we have successfully duplicated the guarded block and this block
+  // has fewer instructions, we expect it to succeed.
+  UnguardedBlock = DuplicateInstructionsInSplitBetween(BB, UnguardedBlock,
+                                                       Guard, UnguardedMapping);
+  assert(UnguardedBlock && "Could not create the unguarded block?");
+  DEBUG(dbgs() << "Moved guard " << *Guard << " to block "
+               << GuardedBlock->getName() << "\n");
+
+  // Some instructions before the guard may still have uses. For them, we need
+  // to create Phi nodes merging their copies in both guarded and unguarded
+  // branches. Those instructions that have no uses can be just removed.
+  SmallVector<Instruction *, 4> ToRemove;
+  for (auto It = BB->begin(); &*It != AfterGuard; ++It)
+    if (!isa<PHINode>(&*It))
+      ToRemove.push_back(&*It);
+
+  Instruction *InsertionPoint = &*BB->getFirstInsertionPt();
+  assert(InsertionPoint && "Empty block?");
+  // Substitute with Phis & remove.
+  for (auto *Inst : reverse(ToRemove)) {
+    if (!Inst->use_empty()) {
+      PHINode *NewPN = PHINode::Create(Inst->getType(), 2);
+      NewPN->addIncoming(UnguardedMapping[Inst], UnguardedBlock);
+      NewPN->addIncoming(GuardedMapping[Inst], GuardedBlock);
+      NewPN->insertBefore(InsertionPoint);
+      Inst->replaceAllUsesWith(NewPN);
+    }
+    Inst->eraseFromParent();
+  }
+  return true;
+}
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
index 07a72287217e21581cc40eed6f5c63268abd2163..340c81fed0fdacaddaa6da457968abdbc3cf9e9f 100644
--- a/lib/Transforms/Scalar/LICM.cpp
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -77,10 +77,16 @@ STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
 STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
 STATISTIC(NumPromoted, "Number of memory locations promoted to registers");
 
+/// Memory promotion is enabled by default.
 static cl::opt<bool>
-    DisablePromotion("disable-licm-promotion", cl::Hidden,
+    DisablePromotion("disable-licm-promotion", cl::Hidden, cl::init(false),
                      cl::desc("Disable memory promotion in LICM pass"));
 
+static cl::opt<uint32_t> MaxNumUsesTraversed(
+    "licm-max-num-uses-traversed", cl::Hidden, cl::init(8),
+    cl::desc("Max num uses visited for identifying load "
+             "invariance in loop using invariant start (default = 8)"));
+
 static bool inSubLoop(BasicBlock *BB, Loop *CurLoop, LoopInfo *LI);
 static bool isNotUsedInLoop(const Instruction &I, const Loop *CurLoop,
                             const LoopSafetyInfo *SafetyInfo);
@@ -425,6 +431,29 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
         continue;
       }
 
+      // Attempt to remove floating point division out of the loop by converting
+      // it to a reciprocal multiplication.
+      if (I.getOpcode() == Instruction::FDiv &&
+          CurLoop->isLoopInvariant(I.getOperand(1)) &&
+          I.hasAllowReciprocal()) {
+        auto Divisor = I.getOperand(1);
+        auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
+        auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
+        ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
+        ReciprocalDivisor->insertBefore(&I);
+
+        auto Product = BinaryOperator::CreateFMul(I.getOperand(0),
+                                                  ReciprocalDivisor);
+        Product->setFastMathFlags(I.getFastMathFlags());
+        Product->insertAfter(&I);
+        I.replaceAllUsesWith(Product);
+        I.eraseFromParent();
+
+        hoist(*ReciprocalDivisor, DT, CurLoop, SafetyInfo, ORE);
+        Changed = true;
+        continue;
+      }
+
       // Try hoisting the instruction out to the preheader.  We can only do this
       // if all of the operands of the instruction are loop invariant and if it
       // is safe to hoist the instruction.
@@ -480,6 +509,59 @@ void llvm::computeLoopSafetyInfo(LoopSafetyInfo *SafetyInfo, Loop *CurLoop) {
         SafetyInfo->BlockColors = colorEHFunclets(*Fn);
 }
 
+// Return true if LI is invariant within scope of the loop. LI is invariant if
+// CurLoop is dominated by an invariant.start that covers at least the memory
+// LI loads from and that itself has no uses. An invariant.start whose size
+// argument is negative (i.e. the size is not known) is ignored.
+static bool isLoadInvariantInLoop(LoadInst *LI, DominatorTree *DT,
+                                  Loop *CurLoop) {
+  Value *Addr = LI->getOperand(0);
+  const DataLayout &DL = LI->getModule()->getDataLayout();
+  const uint64_t LocSizeInBits = DL.getTypeSizeInBits(
+      cast<PointerType>(Addr->getType())->getElementType());
+
+  // if the type is i8 addrspace(x)*, we know this is the type of
+  // llvm.invariant.start operand
+  auto *PtrInt8Ty = PointerType::get(Type::getInt8Ty(LI->getContext()),
+                                     LI->getPointerAddressSpace());
+  unsigned BitcastsVisited = 0;
+  // Look through bitcasts until we reach the i8* type (this is invariant.start
+  // operand type).
+  while (Addr->getType() != PtrInt8Ty) {
+    auto *BC = dyn_cast<BitCastInst>(Addr);
+    // Avoid traversing high number of bitcast uses.
+    if (++BitcastsVisited > MaxNumUsesTraversed || !BC)
+      return false;
+    Addr = BC->getOperand(0);
+  }
+
+  unsigned UsesVisited = 0;
+  // Traverse all uses of the load operand value, to see if invariant.start is
+  // one of the uses, and whether it dominates the load instruction.
+  for (auto *U : Addr->users()) {
+    // Avoid traversing for Load operand with high number of users.
+    if (++UsesVisited > MaxNumUsesTraversed)
+      return false;
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(U);
+    // If the invariant.start itself has uses, the load may be non-invariant.
+    if (!II || II->getIntrinsicID() != Intrinsic::invariant_start ||
+        !II->use_empty())
+      continue;
+    // A negative size (LangRef allows -1 for variable-sized objects) would
+    // wrap to a huge unsigned bit count and wrongly cover any load; skip it.
+    auto *InvariantSize = cast<ConstantInt>(II->getArgOperand(0));
+    if (InvariantSize->isNegative())
+      continue;
+    // The invariant region must contain the loaded bits, and the marker's
+    // block must properly dominate the loop header.
+    if (LocSizeInBits <= InvariantSize->getZExtValue() * 8 &&
+        DT->properlyDominates(II->getParent(), CurLoop->getHeader()))
+      return true;
+  }
+
+  return false;
+}
+
 bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
                               Loop *CurLoop, AliasSetTracker *CurAST,
                               LoopSafetyInfo *SafetyInfo,
@@ -496,6 +578,10 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
     if (LI->getMetadata(LLVMContext::MD_invariant_load))
       return true;
 
+    // This checks for an invariant.start dominating the load.
+    if (isLoadInvariantInLoop(LI, DT, CurLoop))
+      return true;
+
     // Don't hoist loads which have may-aliased stores in loop.
     uint64_t Size = 0;
     if (LI->getType()->isSized())
@@ -785,7 +871,7 @@ static bool hoist(Instruction &I, const DominatorTree *DT, const Loop *CurLoop,
   DEBUG(dbgs() << "LICM hoisting to " << Preheader->getName() << ": " << I
                << "\n");
   ORE->emit(OptimizationRemark(DEBUG_TYPE, "Hoisted", &I)
-            << "hosting " << ore::NV("Inst", &I));
+            << "hoisting " << ore::NV("Inst", &I));
 
   // Metadata can be dependent on conditions we are hoisting above.
   // Conservatively strip all metadata on the instruction unless we were
@@ -855,6 +941,7 @@ class LoopPromoter : public LoadAndStorePromoter {
   LoopInfo &LI;
   DebugLoc DL;
   int Alignment;
+  bool UnorderedAtomic;
   AAMDNodes AATags;
 
   Value *maybeInsertLCSSAPHI(Value *V, BasicBlock *BB) const {
@@ -878,10 +965,11 @@ public:
                SmallVectorImpl<BasicBlock *> &LEB,
                SmallVectorImpl<Instruction *> &LIP, PredIteratorCache &PIC,
                AliasSetTracker &ast, LoopInfo &li, DebugLoc dl, int alignment,
-               const AAMDNodes &AATags)
+               bool UnorderedAtomic, const AAMDNodes &AATags)
       : LoadAndStorePromoter(Insts, S), SomePtr(SP), PointerMustAliases(PMA),
         LoopExitBlocks(LEB), LoopInsertPts(LIP), PredCache(PIC), AST(ast),
-        LI(li), DL(std::move(dl)), Alignment(alignment), AATags(AATags) {}
+        LI(li), DL(std::move(dl)), Alignment(alignment),
+        UnorderedAtomic(UnorderedAtomic),AATags(AATags) {}
 
   bool isInstInList(Instruction *I,
                     const SmallVectorImpl<Instruction *> &) const override {
@@ -905,6 +993,8 @@ public:
       Value *Ptr = maybeInsertLCSSAPHI(SomePtr, ExitBlock);
       Instruction *InsertPos = LoopInsertPts[i];
       StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
+      if (UnorderedAtomic)
+        NewSI->setOrdering(AtomicOrdering::Unordered);
       NewSI->setAlignment(Alignment);
       NewSI->setDebugLoc(DL);
       if (AATags)
@@ -995,6 +1085,9 @@ bool llvm::promoteLoopAccessesToScalars(
   // We start with an alignment of one and try to find instructions that allow
   // us to prove better alignment.
   unsigned Alignment = 1;
+  // Keep track of which types of access we see
+  bool SawUnorderedAtomic = false; 
+  bool SawNotAtomic = false;
   AAMDNodes AATags;
 
   const DataLayout &MDL = Preheader->getModule()->getDataLayout();
@@ -1052,8 +1145,11 @@ bool llvm::promoteLoopAccessesToScalars(
       // it.
       if (LoadInst *Load = dyn_cast<LoadInst>(UI)) {
         assert(!Load->isVolatile() && "AST broken");
-        if (!Load->isSimple())
+        if (!Load->isUnordered())
           return false;
+        
+        SawUnorderedAtomic |= Load->isAtomic();
+        SawNotAtomic |= !Load->isAtomic();
 
         if (!DereferenceableInPH)
           DereferenceableInPH = isSafeToExecuteUnconditionally(
@@ -1064,9 +1160,12 @@ bool llvm::promoteLoopAccessesToScalars(
         if (UI->getOperand(1) != ASIV)
           continue;
         assert(!Store->isVolatile() && "AST broken");
-        if (!Store->isSimple())
+        if (!Store->isUnordered())
           return false;
 
+        SawUnorderedAtomic |= Store->isAtomic();
+        SawNotAtomic |= !Store->isAtomic();
+
         // If the store is guaranteed to execute, both properties are satisfied.
         // We may want to check if a store is guaranteed to execute even if we
         // already know that promotion is safe, since it may have higher
@@ -1119,6 +1218,12 @@ bool llvm::promoteLoopAccessesToScalars(
     }
   }
 
+  // If we found both an unordered atomic instruction and a non-atomic memory
+  // access, bail.  We can't blindly promote non-atomic to atomic since we
+  // might not be able to lower the result.  We can't downgrade since that
+  // would violate memory model.  Also, align 0 is an error for atomics.
+  if (SawUnorderedAtomic && SawNotAtomic)
+    return false;
 
   // If we couldn't prove we can hoist the load, bail.
   if (!DereferenceableInPH)
@@ -1162,12 +1267,15 @@ bool llvm::promoteLoopAccessesToScalars(
   SmallVector<PHINode *, 16> NewPHIs;
   SSAUpdater SSA(&NewPHIs);
   LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
-                        InsertPts, PIC, *CurAST, *LI, DL, Alignment, AATags);
+                        InsertPts, PIC, *CurAST, *LI, DL, Alignment,
+                        SawUnorderedAtomic, AATags);
 
   // Set up the preheader to have a definition of the value.  It is the live-out
   // value from the preheader that uses in the loop will use.
   LoadInst *PreheaderLoad = new LoadInst(
       SomePtr, SomePtr->getName() + ".promoted", Preheader->getTerminator());
+  if (SawUnorderedAtomic)
+    PreheaderLoad->setOrdering(AtomicOrdering::Unordered);
   PreheaderLoad->setAlignment(Alignment);
   PreheaderLoad->setDebugLoc(DL);
   if (AATags)
diff --git a/lib/Transforms/Scalar/LoadCombine.cpp b/lib/Transforms/Scalar/LoadCombine.cpp
index 389f1c595aa408b80055884e09c2947e858860f8..02215d3450c23f80d032cd0dbec30d41ed99c80b 100644
--- a/lib/Transforms/Scalar/LoadCombine.cpp
+++ b/lib/Transforms/Scalar/LoadCombine.cpp
@@ -19,6 +19,7 @@
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/TargetFolder.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
@@ -53,18 +54,20 @@ struct LoadPOPPair {
 class LoadCombine : public BasicBlockPass {
   LLVMContext *C;
   AliasAnalysis *AA;
+  DominatorTree *DT;
 
 public:
   LoadCombine() : BasicBlockPass(ID), C(nullptr), AA(nullptr) {
     initializeLoadCombinePass(*PassRegistry::getPassRegistry());
   }
-  
+
   using llvm::Pass::doInitialization;
   bool doInitialization(Function &) override;
   bool runOnBasicBlock(BasicBlock &BB) override;
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.setPreservesCFG();
     AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<DominatorTreeWrapperPass>();
     AU.addPreserved<GlobalsAAWrapperPass>();
   }
 
@@ -234,6 +237,14 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {
     return false;
 
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+  DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+  // Skip analysing dead blocks (not forward reachable from function entry).
+  if (!DT->isReachableFromEntry(&BB)) {
+    DEBUG(dbgs() << "LC: skipping unreachable " << BB.getName() <<
+          " in " << BB.getParent()->getName() << "\n");
+    return false;
+  }
 
   IRBuilder<TargetFolder> TheBuilder(
       BB.getContext(), TargetFolder(BB.getModule()->getDataLayout()));
@@ -245,13 +256,17 @@ bool LoadCombine::runOnBasicBlock(BasicBlock &BB) {
   bool Combined = false;
   unsigned Index = 0;
   for (auto &I : BB) {
-    if (I.mayThrow() || (I.mayWriteToMemory() && AST.containsUnknown(&I))) {
+    if (I.mayThrow() || AST.containsUnknown(&I)) {
       if (combineLoads(LoadMap))
         Combined = true;
       LoadMap.clear();
       AST.clear();
       continue;
     }
+    if (I.mayWriteToMemory()) {
+      AST.add(&I);
+      continue;
+    }
     LoadInst *LI = dyn_cast<LoadInst>(&I);
     if (!LI)
       continue;
diff --git a/lib/Transforms/Scalar/LoopDeletion.cpp b/lib/Transforms/Scalar/LoopDeletion.cpp
index 54d48e841afa12c7174e183d3d9ef4fb1d8d40ca..73e8ce0e1d93cfbe769f1c26848df2e89b67e90f 100644
--- a/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -77,16 +77,10 @@ static bool isLoopDead(Loop *L, ScalarEvolution &SE,
 
   // Make sure that no instructions in the block have potential side-effects.
   // This includes instructions that could write to memory, and loads that are
-  // marked volatile.  This could be made more aggressive by using aliasing
-  // information to identify readonly and readnone calls.
-  for (Loop::block_iterator LI = L->block_begin(), LE = L->block_end();
-       LI != LE; ++LI) {
-    for (Instruction &I : **LI) {
-      if (I.mayHaveSideEffects())
-        return false;
-    }
-  }
-
+  // marked volatile.
+  for (auto &I : L->blocks())
+    if (any_of(*I, [](Instruction &I) { return I.mayHaveSideEffects(); }))
+      return false;
   return true;
 }
 
@@ -100,16 +94,14 @@ static bool isLoopDead(Loop *L, ScalarEvolution &SE,
 /// This entire process relies pretty heavily on LoopSimplify form and LCSSA in
 /// order to make various safety checks work.
 ///
-/// \returns true if the loop is deleted.
-///
-/// This also sets the \p Changed output parameter to `true` if any changes
-/// were made. This may mutate the loop even if it is unable to delete it due
-/// to hoisting trivially loop invariant instructions out of the loop.
+/// \returns true if any changes were made. This may mutate the loop even if it
+/// is unable to delete it due to hoisting trivially loop invariant
+/// instructions out of the loop.
 ///
 /// This also updates the relevant analysis information in \p DT, \p SE, and \p
-/// LI.
+/// LI. It also updates the loop PM if an updater struct is provided.
 static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
-                             LoopInfo &LI, bool &Changed) {
+                             LoopInfo &LI, LPMUpdater *Updater = nullptr) {
   assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
 
   // We can only remove the loop if there is a preheader that we can
@@ -139,14 +131,15 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
     return false;
 
   // Finally, we have to check that the loop really is dead.
+  bool Changed = false;
   if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader))
-    return false;
+    return Changed;
 
   // Don't remove loops for which we can't solve the trip count.
   // They could be infinite, in which case we'd be changing program behavior.
   const SCEV *S = SE.getMaxBackedgeTakenCount(L);
   if (isa<SCEVCouldNotCompute>(S))
-    return false;
+    return Changed;
 
   // Now that we know the removal is safe, remove the loop by changing the
   // branch from the preheader to go to the single exit block.
@@ -154,6 +147,10 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
   // Because we're deleting a large chunk of code at once, the sequence in which
   // we remove things is very important to avoid invalidation issues.
 
+  // If we have an LPM updater, tell it about the loop being removed.
+  if (Updater)
+    Updater->markLoopAsDeleted(*L);
+
   // Tell ScalarEvolution that the loop is deleted. Do this before
   // deleting the loop so that ScalarEvolution can look at the loop
   // to determine what it needs to clean up.
@@ -214,8 +211,6 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
 
   // The last step is to update LoopInfo now that we've eliminated this loop.
   LI.markAsRemoved(L);
-  Changed = true;
-
   ++NumDeleted;
 
   return true;
@@ -224,15 +219,8 @@ static bool deleteLoopIfDead(Loop *L, DominatorTree &DT, ScalarEvolution &SE,
 PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
                                         LoopStandardAnalysisResults &AR,
                                         LPMUpdater &Updater) {
-  bool Changed = false;
-
-  if (deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, Changed)) {
-    assert(Changed && "Cannot delete a loop without changing something!");
-    // Need to update the LPM about this loop going away.
-    Updater.markLoopAsDeleted(L);
-  } else if (!Changed) {
+  if (!deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, &Updater))
     return PreservedAnalyses::all();
-  }
 
   return getLoopPassPreservedAnalyses();
 }
@@ -271,6 +259,5 @@ bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &) {
   ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
   LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
 
-  bool Changed = false;
-  return deleteLoopIfDead(L, DT, SE, LI, Changed) || Changed;
+  return deleteLoopIfDead(L, DT, SE, LI);
 }
diff --git a/lib/Transforms/Scalar/LoopDistribute.cpp b/lib/Transforms/Scalar/LoopDistribute.cpp
index 19716b28ad66a1e2ec94e13c32b9a6aab87a47d9..3624bba10345073d03af1bcae5dd18a11f3872f7 100644
--- a/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -812,29 +812,29 @@ private:
       const RuntimePointerChecking *RtPtrChecking) {
     SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
 
-    std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
-                 [&](const RuntimePointerChecking::PointerCheck &Check) {
-                   for (unsigned PtrIdx1 : Check.first->Members)
-                     for (unsigned PtrIdx2 : Check.second->Members)
-                       // Only include this check if there is a pair of pointers
-                       // that require checking and the pointers fall into
-                       // separate partitions.
-                       //
-                       // (Note that we already know at this point that the two
-                       // pointer groups need checking but it doesn't follow
-                       // that each pair of pointers within the two groups need
-                       // checking as well.
-                       //
-                       // In other words we don't want to include a check just
-                       // because there is a pair of pointers between the two
-                       // pointer groups that require checks and a different
-                       // pair whose pointers fall into different partitions.)
-                       if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
-                           !RuntimePointerChecking::arePointersInSamePartition(
-                               PtrToPartition, PtrIdx1, PtrIdx2))
-                         return true;
-                   return false;
-                 });
+    copy_if(AllChecks, std::back_inserter(Checks),
+            [&](const RuntimePointerChecking::PointerCheck &Check) {
+              for (unsigned PtrIdx1 : Check.first->Members)
+                for (unsigned PtrIdx2 : Check.second->Members)
+                  // Only include this check if there is a pair of pointers
+                  // that require checking and the pointers fall into
+                  // separate partitions.
+                  //
+                  // (Note that we already know at this point that the two
+                  // pointer groups need checking but it doesn't follow
+                  // that each pair of pointers within the two groups need
+                  // checking as well.
+                  //
+                  // In other words we don't want to include a check just
+                  // because there is a pair of pointers between the two
+                  // pointer groups that require checks and a different
+                  // pair whose pointers fall into different partitions.)
+                  if (RtPtrChecking->needsChecking(PtrIdx1, PtrIdx2) &&
+                      !RuntimePointerChecking::arePointersInSamePartition(
+                          PtrToPartition, PtrIdx1, PtrIdx2))
+                    return true;
+              return false;
+            });
 
     return Checks;
   }
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index d509f2928b109bc7414ce60703589ce448b99f6a..946d85d7360fd021b0885b5af08c52a474dd316b 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -823,7 +823,7 @@ bool LoopIdiomRecognize::processLoopStridedStore(
     Module *M = TheStore->getModule();
     Value *MSP =
         M->getOrInsertFunction("memset_pattern16", Builder.getVoidTy(),
-                               Int8PtrTy, Int8PtrTy, IntPtr, (void *)nullptr);
+                               Int8PtrTy, Int8PtrTy, IntPtr);
     inferLibFuncAttributes(*M->getFunction("memset_pattern16"), *TLI);
 
     // Otherwise we should form a memset_pattern16.  PatternValue is known to be
diff --git a/lib/Transforms/Scalar/LoopLoadElimination.cpp b/lib/Transforms/Scalar/LoopLoadElimination.cpp
index b44cca4a90f885490bc0c747bb369c61fd515e54..cf63cb660db8cd87a70623feb7c74a6c7c72bcf4 100644
--- a/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -374,15 +374,15 @@ public:
     const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks();
     SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
 
-    std::copy_if(AllChecks.begin(), AllChecks.end(), std::back_inserter(Checks),
-                 [&](const RuntimePointerChecking::PointerCheck &Check) {
-                   for (auto PtrIdx1 : Check.first->Members)
-                     for (auto PtrIdx2 : Check.second->Members)
-                       if (needsChecking(PtrIdx1, PtrIdx2,
-                                         PtrsWrittenOnFwdingPath, CandLoadPtrs))
-                         return true;
-                   return false;
-                 });
+    copy_if(AllChecks, std::back_inserter(Checks),
+            [&](const RuntimePointerChecking::PointerCheck &Check) {
+              for (auto PtrIdx1 : Check.first->Members)
+                for (auto PtrIdx2 : Check.second->Members)
+                  if (needsChecking(PtrIdx1, PtrIdx2, PtrsWrittenOnFwdingPath,
+                                    CandLoadPtrs))
+                    return true;
+              return false;
+            });
 
     DEBUG(dbgs() << "\nPointer Checks (count: " << Checks.size() << "):\n");
     DEBUG(LAI.getRuntimePointerChecking()->printChecks(dbgs(), Checks));
diff --git a/lib/Transforms/Scalar/LoopPredication.cpp b/lib/Transforms/Scalar/LoopPredication.cpp
index 4f6831480389a2980f8b17851856bc8539662be1..0ce60442932615cdb6f8ee6190dacf10b9efc9b0 100644
--- a/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/lib/Transforms/Scalar/LoopPredication.cpp
@@ -141,10 +141,9 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
     std::swap(LHSS, RHSS);
     Pred = ICmpInst::getSwappedPredicate(Pred);
   }
-  if (!SE->isLoopInvariant(RHSS, L))
+  if (!SE->isLoopInvariant(RHSS, L) || !isSafeToExpand(RHSS, *SE))
     return None;
 
-  Value *Bound = RHS;
   const SCEVAddRecExpr *IndexAR = dyn_cast<SCEVAddRecExpr>(LHSS);
   if (!IndexAR || IndexAR->getLoop() != L)
     return None;
@@ -164,7 +163,7 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
                             ? IndexAR->getStart()
                             : SE->getSCEVAtScope(IndexAR, L->getParentLoop());
   if (NewLHSS == IndexAR) {
-    DEBUG(dbgs() << "Can't compute NewLHSS!");
+    DEBUG(dbgs() << "Can't compute NewLHSS!\n");
     return None;
   }
 
@@ -176,9 +175,12 @@ Optional<Value *> LoopPredication::widenICmpRangeCheck(ICmpInst *ICI,
 
   DEBUG(dbgs() << "NewLHSS is loop invariant and safe to expand. Expand!\n");
 
-  Value *NewLHS = Expander.expandCodeFor(NewLHSS, Bound->getType(),
-                                         Preheader->getTerminator());
-  return Builder.CreateICmp(Pred, NewLHS, Bound);
+  Type *Ty = LHS->getType();
+  Instruction *InsertAt = Preheader->getTerminator();
+  assert(Ty == RHS->getType() && "icmp operands have different types?");
+  Value *NewLHS = Expander.expandCodeFor(NewLHSS, Ty, InsertAt);
+  Value *NewRHS = Expander.expandCodeFor(RHSS, Ty, InsertAt);
+  return Builder.CreateICmp(Pred, NewLHS, NewRHS);
 }
 
 bool LoopPredication::widenGuardConditions(IntrinsicInst *Guard,
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
index 0d57ce1e827524c057fc08a52ef23a81c8fb44de..e5689368de80d38073e51fd376630911e5a139cb 100644
--- a/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -79,7 +79,8 @@ private:
 /// to merge the two values.  Do this now.
 static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
                                             BasicBlock *OrigPreheader,
-                                            ValueToValueMapTy &ValueMap) {
+                                            ValueToValueMapTy &ValueMap,
+                                SmallVectorImpl<PHINode*> *InsertedPHIs) {
   // Remove PHI node entries that are no longer live.
   BasicBlock::iterator I, E = OrigHeader->end();
   for (I = OrigHeader->begin(); PHINode *PN = dyn_cast<PHINode>(I); ++I)
@@ -87,7 +88,7 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
 
   // Now fix up users of the instructions in OrigHeader, inserting PHI nodes
   // as necessary.
-  SSAUpdater SSA;
+  SSAUpdater SSA(InsertedPHIs);
   for (I = OrigHeader->begin(); I != E; ++I) {
     Value *OrigHeaderVal = &*I;
 
@@ -174,6 +175,38 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
   }
 }
 
+/// Propagate dbg.value intrinsics through the newly inserted Phis.
+static void insertDebugValues(BasicBlock *OrigHeader,
+                              SmallVectorImpl<PHINode*> &InsertedPHIs) {
+  ValueToValueMapTy DbgValueMap;
+
+  // Map existing PHI nodes to their dbg.values.
+  for (auto &I : *OrigHeader) {
+    if (auto DbgII = dyn_cast<DbgInfoIntrinsic>(&I)) {
+      if (auto *Loc = dyn_cast_or_null<PHINode>(DbgII->getVariableLocation()))
+        DbgValueMap.insert({Loc, DbgII});
+    }
+  }
+
+  // Then iterate through the new PHIs and look to see if they use one of the
+  // previously mapped PHIs. If so, insert a new dbg.value intrinsic that will
+  // propagate the info through the new PHI.
+  LLVMContext &C = OrigHeader->getContext();
+  for (auto PHI : InsertedPHIs) {
+    for (auto VI : PHI->operand_values()) {
+      auto V = DbgValueMap.find(VI);
+      if (V != DbgValueMap.end()) {
+        auto *DbgII = cast<DbgInfoIntrinsic>(V->second);
+        Instruction *NewDbgII = DbgII->clone();
+        auto PhiMAV = MetadataAsValue::get(C, ValueAsMetadata::get(PHI));
+        NewDbgII->setOperand(0, PhiMAV);
+        BasicBlock *Parent = PHI->getParent();
+        NewDbgII->insertBefore(Parent->getFirstNonPHIOrDbgOrLifetime());
+      }
+    }
+  }
+}
+
 /// Rotate loop LP. Return true if the loop is rotated.
 ///
 /// \param SimplifiedLatch is true if the latch was just folded into the final
@@ -347,9 +380,18 @@ bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
   // remove the corresponding incoming values from the PHI nodes in OrigHeader.
   LoopEntryBranch->eraseFromParent();
 
+
+  SmallVector<PHINode*, 2> InsertedPHIs;
   // If there were any uses of instructions in the duplicated block outside the
   // loop, update them, inserting PHI nodes as required
-  RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap);
+  RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
+                                  &InsertedPHIs);
+
+  // Attach dbg.value intrinsics to the new phis if that phi uses a value that
+  // previously had debug metadata attached. This keeps the debug info
+  // up-to-date in the loop body.
+  if (!InsertedPHIs.empty())
+    insertDebugValues(OrigHeader, InsertedPHIs);
 
   // NewHeader is now the header of the loop.
   L->moveToHeader(NewHeader);
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 97e87c5521e2ec8e2952bc66d8d3795895991899..1dad080efbff44ee51edf5ff471b13576d01847f 100644
--- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -129,6 +129,17 @@ static cl::opt<bool> EnablePhiElim(
   "enable-lsr-phielim", cl::Hidden, cl::init(true),
   cl::desc("Enable LSR phi elimination"));
 
+// The flag adds instruction count to solutions cost comparison.
+static cl::opt<bool> InsnsCost(
+  "lsr-insns-cost", cl::Hidden, cl::init(false),
+  cl::desc("Add instruction count to a LSR cost model"));
+
+// Flag to choose how to narrow complex lsr solution
+static cl::opt<bool> LSRExpNarrow(
+  "lsr-exp-narrow", cl::Hidden, cl::init(false),
+  cl::desc("Narrow LSR complex solution using"
+           " expectation of registers number"));
+
 #ifndef NDEBUG
 // Stress test IV chain generation.
 static cl::opt<bool> StressIVChain(
@@ -296,9 +307,13 @@ struct Formula {
   /// canonical representation of a formula is
   /// 1. BaseRegs.size > 1 implies ScaledReg != NULL and
   /// 2. ScaledReg != NULL implies Scale != 1 || !BaseRegs.empty().
+  /// 3. The reg containing recurrent expr related with current loop in the
+  /// formula should be put in the ScaledReg.
   /// #1 enforces that the scaled register is always used when at least two
   /// registers are needed by the formula: e.g., reg1 + reg2 is reg1 + 1 * reg2.
   /// #2 enforces that 1 * reg is reg.
+  /// #3 ensures invariant regs with respect to current loop can be combined
+  /// together in LSR codegen.
   /// This invariant can be temporarly broken while building a formula.
   /// However, every formula inserted into the LSRInstance must be in canonical
   /// form.
@@ -319,12 +334,14 @@ struct Formula {
 
   void initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE);
 
-  bool isCanonical() const;
+  bool isCanonical(const Loop &L) const;
 
-  void canonicalize();
+  void canonicalize(const Loop &L);
 
   bool unscale();
 
+  bool hasZeroEnd() const;
+
   size_t getNumRegs() const;
   Type *getType() const;
 
@@ -411,16 +428,35 @@ void Formula::initialMatch(const SCEV *S, Loop *L, ScalarEvolution &SE) {
       BaseRegs.push_back(Sum);
     HasBaseReg = true;
   }
-  canonicalize();
+  canonicalize(*L);
 }
 
 /// \brief Check whether or not this formula statisfies the canonical
 /// representation.
 /// \see Formula::BaseRegs.
-bool Formula::isCanonical() const {
-  if (ScaledReg)
-    return Scale != 1 || !BaseRegs.empty();
-  return BaseRegs.size() <= 1;
+bool Formula::isCanonical(const Loop &L) const {
+  if (!ScaledReg)
+    return BaseRegs.size() <= 1;
+
+  if (Scale != 1)
+    return true;
+
+  if (Scale == 1 && BaseRegs.empty())
+    return false;
+
+  const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+  if (SAR && SAR->getLoop() == &L)
+    return true;
+
+  // If ScaledReg is not a recurrent expr, or it is but its loop is not current
+  // loop, meanwhile BaseRegs contains a recurrent expr reg related with current
+  // loop, we want to swap the reg in BaseRegs with ScaledReg.
+  auto I =
+      find_if(make_range(BaseRegs.begin(), BaseRegs.end()), [&](const SCEV *S) {
+        return isa<const SCEVAddRecExpr>(S) &&
+               (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
+      });
+  return I == BaseRegs.end();
 }
 
 /// \brief Helper method to morph a formula into its canonical representation.
@@ -429,21 +465,33 @@ bool Formula::isCanonical() const {
 /// field. Otherwise, we would have to do special cases everywhere in LSR
 /// to treat reg1 + reg2 + ... the same way as reg1 + 1*reg2 + ...
 /// On the other hand, 1*reg should be canonicalized into reg.
-void Formula::canonicalize() {
-  if (isCanonical())
+void Formula::canonicalize(const Loop &L) {
+  if (isCanonical(L))
     return;
   // So far we did not need this case. This is easy to implement but it is
   // useless to maintain dead code. Beside it could hurt compile time.
   assert(!BaseRegs.empty() && "1*reg => reg, should not be needed.");
+
   // Keep the invariant sum in BaseRegs and one of the variant sum in ScaledReg.
-  ScaledReg = BaseRegs.back();
-  BaseRegs.pop_back();
-  Scale = 1;
-  size_t BaseRegsSize = BaseRegs.size();
-  size_t Try = 0;
-  // If ScaledReg is an invariant, try to find a variant expression.
-  while (Try < BaseRegsSize && !isa<SCEVAddRecExpr>(ScaledReg))
-    std::swap(ScaledReg, BaseRegs[Try++]);
+  if (!ScaledReg) {
+    ScaledReg = BaseRegs.back();
+    BaseRegs.pop_back();
+    Scale = 1;
+  }
+
+  // If ScaledReg is an invariant with respect to L, find the reg from
+  // BaseRegs containing the recurrent expr related with Loop L. Swap the
+  // reg with ScaledReg.
+  const SCEVAddRecExpr *SAR = dyn_cast<const SCEVAddRecExpr>(ScaledReg);
+  if (!SAR || SAR->getLoop() != &L) {
+    auto I = find_if(make_range(BaseRegs.begin(), BaseRegs.end()),
+                     [&](const SCEV *S) {
+                       return isa<const SCEVAddRecExpr>(S) &&
+                              (cast<SCEVAddRecExpr>(S)->getLoop() == &L);
+                     });
+    if (I != BaseRegs.end())
+      std::swap(ScaledReg, *I);
+  }
 }
 
 /// \brief Get rid of the scale in the formula.
@@ -459,6 +507,14 @@ bool Formula::unscale() {
   return true;
 }
 
+bool Formula::hasZeroEnd() const {
+  if (UnfoldedOffset || BaseOffset)
+    return false;
+  if (BaseRegs.size() != 1 || ScaledReg)
+    return false;
+  return true;
+}
+
 /// Return the total number of register operands used by this formula. This does
 /// not include register uses implied by non-constant addrec strides.
 size_t Formula::getNumRegs() const {
@@ -713,7 +769,7 @@ static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) {
 static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
   bool isAddress = isa<LoadInst>(Inst);
   if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
-    if (SI->getOperand(1) == OperandVal)
+    if (SI->getPointerOperand() == OperandVal)
       isAddress = true;
   } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) {
     // Addressing modes can also be folded into prefetches and a variety
@@ -725,6 +781,12 @@ static bool isAddressUse(Instruction *Inst, Value *OperandVal) {
           isAddress = true;
         break;
     }
+  } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+    if (RMW->getPointerOperand() == OperandVal)
+      isAddress = true;
+  } else if (AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+    if (CmpX->getPointerOperand() == OperandVal)
+      isAddress = true;
   }
   return isAddress;
 }
@@ -737,6 +799,10 @@ static MemAccessTy getAccessType(const Instruction *Inst) {
     AccessTy.AddrSpace = SI->getPointerAddressSpace();
   } else if (const LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
     AccessTy.AddrSpace = LI->getPointerAddressSpace();
+  } else if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) {
+    AccessTy.AddrSpace = RMW->getPointerAddressSpace();
+  } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(Inst)) {
+    AccessTy.AddrSpace = CmpX->getPointerAddressSpace();
   }
 
   // All pointers have the same requirements, so canonicalize them to an
@@ -877,7 +943,8 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  const LSRUse &LU, const Formula &F);
 // Get the cost of the scaling factor used in F for LU.
 static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
-                                     const LSRUse &LU, const Formula &F);
+                                     const LSRUse &LU, const Formula &F,
+                                     const Loop &L);
 
 namespace {
 
@@ -885,6 +952,7 @@ namespace {
 class Cost {
   /// TODO: Some of these could be merged. Also, a lexical ordering
   /// isn't always optimal.
+  unsigned Insns;
   unsigned NumRegs;
   unsigned AddRecCost;
   unsigned NumIVMuls;
@@ -895,8 +963,8 @@ class Cost {
 
 public:
   Cost()
-    : NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0), ImmCost(0),
-      SetupCost(0), ScaleCost(0) {}
+    : Insns(0), NumRegs(0), AddRecCost(0), NumIVMuls(0), NumBaseAdds(0),
+      ImmCost(0), SetupCost(0), ScaleCost(0) {}
 
   bool operator<(const Cost &Other) const;
 
@@ -905,9 +973,9 @@ public:
 #ifndef NDEBUG
   // Once any of the metrics loses, they must all remain losers.
   bool isValid() {
-    return ((NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
+    return ((Insns | NumRegs | AddRecCost | NumIVMuls | NumBaseAdds
              | ImmCost | SetupCost | ScaleCost) != ~0u)
-      || ((NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
+      || ((Insns & NumRegs & AddRecCost & NumIVMuls & NumBaseAdds
            & ImmCost & SetupCost & ScaleCost) == ~0u);
   }
 #endif
@@ -1069,7 +1137,8 @@ public:
   }
   
   bool HasFormulaWithSameRegs(const Formula &F) const;
-  bool InsertFormula(const Formula &F);
+  float getNotSelectedProbability(const SCEV *Reg) const;
+  bool InsertFormula(const Formula &F, const Loop &L);
   void DeleteFormula(Formula &F);
   void RecomputeRegs(size_t LUIdx, RegUseTracker &Reguses);
 
@@ -1085,17 +1154,23 @@ void Cost::RateRegister(const SCEV *Reg,
                         const Loop *L,
                         ScalarEvolution &SE, DominatorTree &DT) {
   if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) {
-    // If this is an addrec for another loop, don't second-guess its addrec phi
-    // nodes. LSR isn't currently smart enough to reason about more than one
-    // loop at a time. LSR has already run on inner loops, will not run on outer
-    // loops, and cannot be expected to change sibling loops.
+    // If this is an addrec for another loop, it should be an invariant
+    // with respect to L since L is the innermost loop (at least
+    // for now LSR only handles innermost loops).
     if (AR->getLoop() != L) {
       // If the AddRec exists, consider it's register free and leave it alone.
       if (isExistingPhi(AR, SE))
         return;
 
-      // Otherwise, do not consider this formula at all.
-      Lose();
+      // It is bad to allow LSR for current loop to add induction variables
+      // for its sibling loops.
+      if (!AR->getLoop()->contains(L)) {
+        Lose();
+        return;
+      }
+
+      // Otherwise, it will be an invariant with respect to Loop L.
+      ++NumRegs;
       return;
     }
     AddRecCost += 1; /// TODO: This should be a function of the stride.
@@ -1152,8 +1227,11 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
                        ScalarEvolution &SE, DominatorTree &DT,
                        const LSRUse &LU,
                        SmallPtrSetImpl<const SCEV *> *LoserRegs) {
-  assert(F.isCanonical() && "Cost is accurate only for canonical formula");
+  assert(F.isCanonical(*L) && "Cost is accurate only for canonical formula");
   // Tally up the registers.
+  unsigned PrevAddRecCost = AddRecCost;
+  unsigned PrevNumRegs = NumRegs;
+  unsigned PrevNumBaseAdds = NumBaseAdds;
   if (const SCEV *ScaledReg = F.ScaledReg) {
     if (VisitedRegs.count(ScaledReg)) {
       Lose();
@@ -1173,6 +1251,18 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
       return;
   }
 
+  // Treat every new register that exceeds TTI.getNumberOfRegisters() - 1 as
+  // an additional instruction (at least a fill).
+  unsigned TTIRegNum = TTI.getNumberOfRegisters(false) - 1;
+  if (NumRegs > TTIRegNum) {
+    // If the cost already exceeded TTIRegNum, only the newly added registers
+    // can add new instructions.
+    if (PrevNumRegs > TTIRegNum)
+      Insns += (NumRegs - PrevNumRegs);
+    else
+      Insns += (NumRegs - TTIRegNum);
+  }
+
   // Determine how many (unfolded) adds we'll need inside the loop.
   size_t NumBaseParts = F.getNumRegs();
   if (NumBaseParts > 1)
@@ -1183,7 +1273,7 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
   NumBaseAdds += (F.UnfoldedOffset != 0);
 
   // Accumulate non-free scaling amounts.
-  ScaleCost += getScalingFactorCost(TTI, LU, F);
+  ScaleCost += getScalingFactorCost(TTI, LU, F, *L);
 
   // Tally up the non-zero immediates.
   for (const LSRFixup &Fixup : LU.Fixups) {
@@ -1201,11 +1291,30 @@ void Cost::RateFormula(const TargetTransformInfo &TTI,
         !TTI.isFoldableMemAccessOffset(Fixup.UserInst, Offset))
       NumBaseAdds++;
   }
+
+  // If an ICmpZero formula does not end with 0, it cannot be replaced by
+  // just an add or sub. We'll need to compare the final result of the AddRec.
+  // That means we'll need an additional instruction.
+  // For -10 + {0, +, 1}:
+  // i = i + 1;
+  // cmp i, 10
+  //
+  // For {-10, +, 1}:
+  // i = i + 1;
+  if (LU.Kind == LSRUse::ICmpZero && !F.hasZeroEnd())
+    Insns++;
+  // Each new AddRec adds 1 instruction to calculation.
+  Insns += (AddRecCost - PrevAddRecCost);
+
+  // BaseAdds adds instructions for unfolded registers.
+  if (LU.Kind != LSRUse::ICmpZero)
+    Insns += NumBaseAdds - PrevNumBaseAdds;
   assert(isValid() && "invalid cost");
 }
 
 /// Set this cost to a losing value.
 void Cost::Lose() {
+  Insns = ~0u;
   NumRegs = ~0u;
   AddRecCost = ~0u;
   NumIVMuls = ~0u;
@@ -1217,6 +1326,8 @@ void Cost::Lose() {
 
 /// Choose the lower cost.
 bool Cost::operator<(const Cost &Other) const {
+  if (InsnsCost && Insns != Other.Insns)
+    return Insns < Other.Insns;
   return std::tie(NumRegs, AddRecCost, NumIVMuls, NumBaseAdds, ScaleCost,
                   ImmCost, SetupCost) <
          std::tie(Other.NumRegs, Other.AddRecCost, Other.NumIVMuls,
@@ -1225,6 +1336,7 @@ bool Cost::operator<(const Cost &Other) const {
 }
 
 void Cost::print(raw_ostream &OS) const {
+  OS << Insns << " instruction" << (Insns == 1 ? " " : "s ");
   OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s");
   if (AddRecCost != 0)
     OS << ", with addrec cost " << AddRecCost;
@@ -1304,10 +1416,19 @@ bool LSRUse::HasFormulaWithSameRegs(const Formula &F) const {
   return Uniquifier.count(Key);
 }
 
+/// Returns the probability of selecting a formula that does not reference Reg.
+float LSRUse::getNotSelectedProbability(const SCEV *Reg) const {
+  unsigned FNum = 0;
+  for (const Formula &F : Formulae)
+    if (F.referencesReg(Reg))
+      FNum++;
+  return ((float)(Formulae.size() - FNum)) / Formulae.size();
+}
+
 /// If the given formula has not yet been inserted, add it to the list, and
 /// return true. Return false otherwise.  The formula must be in canonical form.
-bool LSRUse::InsertFormula(const Formula &F) {
-  assert(F.isCanonical() && "Invalid canonical representation");
+bool LSRUse::InsertFormula(const Formula &F, const Loop &L) {
+  assert(F.isCanonical(L) && "Invalid canonical representation");
 
   if (!Formulae.empty() && RigidFormula)
     return false;
@@ -1477,7 +1598,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
 static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
                                  int64_t MinOffset, int64_t MaxOffset,
                                  LSRUse::KindType Kind, MemAccessTy AccessTy,
-                                 const Formula &F) {
+                                 const Formula &F, const Loop &L) {
   // For the purpose of isAMCompletelyFolded either having a canonical formula
   // or a scale not equal to zero is correct.
   // Problems may arise from non canonical formulae having a scale == 0.
@@ -1485,7 +1606,7 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
   // However, when we generate the scaled formulae, we first check that the
   // scaling factor is profitable before computing the actual ScaledReg for
   // compile time sake.
-  assert((F.isCanonical() || F.Scale != 0));
+  assert((F.isCanonical(L) || F.Scale != 0));
   return isAMCompletelyFolded(TTI, MinOffset, MaxOffset, Kind, AccessTy,
                               F.BaseGV, F.BaseOffset, F.HasBaseReg, F.Scale);
 }
@@ -1520,14 +1641,15 @@ static bool isAMCompletelyFolded(const TargetTransformInfo &TTI,
 }
 
 static unsigned getScalingFactorCost(const TargetTransformInfo &TTI,
-                                     const LSRUse &LU, const Formula &F) {
+                                     const LSRUse &LU, const Formula &F,
+                                     const Loop &L) {
   if (!F.Scale)
     return 0;
 
   // If the use is not completely folded in that instruction, we will have to
   // pay an extra cost only for scale != 1.
   if (!isAMCompletelyFolded(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind,
-                            LU.AccessTy, F))
+                            LU.AccessTy, F, L))
     return F.Scale != 1;
 
   switch (LU.Kind) {
@@ -1777,6 +1899,7 @@ class LSRInstance {
   void NarrowSearchSpaceByDetectingSupersets();
   void NarrowSearchSpaceByCollapsingUnrolledCode();
   void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
+  void NarrowSearchSpaceByDeletingCostlyFormulas();
   void NarrowSearchSpaceByPickingWinnerRegs();
   void NarrowSearchSpaceUsingHeuristics();
 
@@ -2497,7 +2620,12 @@ static Value *getWideOperand(Value *Oper) {
 static bool isCompatibleIVType(Value *LVal, Value *RVal) {
   Type *LType = LVal->getType();
   Type *RType = RVal->getType();
-  return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy());
+  return (LType == RType) || (LType->isPointerTy() && RType->isPointerTy() &&
+                              // Different address spaces mean (possibly)
+                              // different types of the pointer implementation,
+                              // e.g. i16 vs i32, so disallow that.
+                              (LType->getPointerAddressSpace() ==
+                               RType->getPointerAddressSpace()));
 }
 
 /// Return an approximation of this SCEV expression's "base", or NULL for any
@@ -3115,7 +3243,8 @@ bool LSRInstance::InsertFormula(LSRUse &LU, unsigned LUIdx, const Formula &F) {
   // Do not insert formula that we will not be able to expand.
   assert(isLegalUse(TTI, LU.MinOffset, LU.MaxOffset, LU.Kind, LU.AccessTy, F) &&
          "Formula is illegal");
-  if (!LU.InsertFormula(F))
+
+  if (!LU.InsertFormula(F, *L))
     return false;
 
   CountRegisters(F, LUIdx);
@@ -3354,7 +3483,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
       F.BaseRegs.push_back(*J);
     // We may have changed the number of register in base regs, adjust the
     // formula accordingly.
-    F.canonicalize();
+    F.canonicalize(*L);
 
     if (InsertFormula(LU, LUIdx, F))
       // If that formula hadn't been seen before, recurse to find more like
@@ -3366,7 +3495,7 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
 /// Split out subexpressions from adds and the bases of addrecs.
 void LSRInstance::GenerateReassociations(LSRUse &LU, unsigned LUIdx,
                                          Formula Base, unsigned Depth) {
-  assert(Base.isCanonical() && "Input must be in the canonical form");
+  assert(Base.isCanonical(*L) && "Input must be in the canonical form");
   // Arbitrarily cap recursion to protect compile time.
   if (Depth >= 3)
     return;
@@ -3407,7 +3536,7 @@ void LSRInstance::GenerateCombinations(LSRUse &LU, unsigned LUIdx,
     // rather than proceed with zero in a register.
     if (!Sum->isZero()) {
       F.BaseRegs.push_back(Sum);
-      F.canonicalize();
+      F.canonicalize(*L);
       (void)InsertFormula(LU, LUIdx, F);
     }
   }
@@ -3464,7 +3593,7 @@ void LSRInstance::GenerateConstantOffsetsImpl(
           F.ScaledReg = nullptr;
         } else
           F.deleteBaseReg(F.BaseRegs[Idx]);
-        F.canonicalize();
+        F.canonicalize(*L);
       } else if (IsScaledReg)
         F.ScaledReg = NewG;
       else
@@ -3627,10 +3756,10 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
     if (LU.Kind == LSRUse::ICmpZero &&
         !Base.HasBaseReg && Base.BaseOffset == 0 && !Base.BaseGV)
       continue;
-    // For each addrec base reg, apply the scale, if possible.
-    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i)
-      if (const SCEVAddRecExpr *AR =
-            dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) {
+    // For each addrec base reg, if its loop is current loop, apply the scale.
+    for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) {
+      const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i]);
+      if (AR && (AR->getLoop() == L || LU.AllFixupsOutsideLoop)) {
         const SCEV *FactorS = SE.getConstant(IntTy, Factor);
         if (FactorS->isZero())
           continue;
@@ -3644,11 +3773,17 @@ void LSRInstance::GenerateScales(LSRUse &LU, unsigned LUIdx, Formula Base) {
           // The canonical representation of 1*reg is reg, which is already in
           // Base. In that case, do not try to insert the formula, it will be
           // rejected anyway.
-          if (F.Scale == 1 && F.BaseRegs.empty())
+          if (F.Scale == 1 && (F.BaseRegs.empty() ||
+                               (AR->getLoop() != L && LU.AllFixupsOutsideLoop)))
             continue;
+          // If AllFixupsOutsideLoop is true and F.Scale is 1, we may generate
+          // non canonical Formula with ScaledReg's loop not being L.
+          if (F.Scale == 1 && LU.AllFixupsOutsideLoop)
+            F.canonicalize(*L);
           (void)InsertFormula(LU, LUIdx, F);
         }
       }
+    }
   }
 }
 
@@ -3829,7 +3964,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
             continue;
 
         // OK, looks good.
-        NewF.canonicalize();
+        NewF.canonicalize(*this->L);
         (void)InsertFormula(LU, LUIdx, NewF);
       } else {
         // Use the immediate in a base register.
@@ -3861,7 +3996,7 @@ void LSRInstance::GenerateCrossUseConstantOffsets() {
                 goto skip_formula;
 
           // Ok, looks good.
-          NewF.canonicalize();
+          NewF.canonicalize(*this->L);
           (void)InsertFormula(LU, LUIdx, NewF);
           break;
         skip_formula:;
@@ -4173,6 +4308,144 @@ void LSRInstance::NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters(){
   }
 }
 
+/// This function deletes formulas with a high expected number of registers.
+/// Assuming we don't know the value of each formula (all inefficient ones
+/// have already been deleted), compute for each register the probability
+/// of it not being selected.
+/// For example,
+/// Use1:
+///  reg(a) + reg({0,+,1})
+///  reg(a) + reg({-1,+,1}) + 1
+///  reg({a,+,1})
+/// Use2:
+///  reg(b) + reg({0,+,1})
+///  reg(b) + reg({-1,+,1}) + 1
+///  reg({b,+,1})
+/// Use3:
+///  reg(c) + reg(b) + reg({0,+,1})
+///  reg(c) + reg({b,+,1})
+///
+/// Probability of not selecting
+///                 Use1   Use2    Use3
+/// reg(a)         (1/3) *   1   *   1
+/// reg(b)           1   * (1/3) * (1/2)
+/// reg({0,+,1})   (2/3) * (2/3) * (1/2)
+/// reg({-1,+,1})  (2/3) * (2/3) *   1
+/// reg({a,+,1})   (2/3) *   1   *   1
+/// reg({b,+,1})     1   * (2/3) * (1/2)
+/// reg(c)           1   *   1   *   0
+///
+/// Now compute the expected number of registers for each formula.
+/// Note that for each use we exclude probability of not selecting for the use.
+/// For example for Use1 probability for reg(a) would be just 1 * 1 (excluding
+/// probability 1/3 of not selecting for Use1).
+/// Use1:
+///  reg(a) + reg({0,+,1})          1 + 1/3       -- to be deleted
+///  reg(a) + reg({-1,+,1}) + 1     1 + 2/3       -- to be deleted
+///  reg({a,+,1})                   1
+/// Use2:
+///  reg(b) + reg({0,+,1})          1/2 + 1/3     -- to be deleted
+///  reg(b) + reg({-1,+,1}) + 1     1/2 + 2/3     -- to be deleted
+///  reg({b,+,1})                   1/2
+/// Use3:
+///  reg(c) + reg(b) + reg({0,+,1}) 1 + 1/3 + 4/9 -- to be deleted
+///  reg(c) + reg({b,+,1})          1 + 2/3
+
+void LSRInstance::NarrowSearchSpaceByDeletingCostlyFormulas() {
+  if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+    return;
+  // Ok, we have too many of formulae on our hands to conveniently handle.
+  // Use a rough heuristic to thin out the list.
+
+  // Set of Regs which will be 100% used in the final solution.
+  // Used in each formula of a solution (in the example above this is reg(c)).
+  // We can skip them in calculations.
+  SmallPtrSet<const SCEV *, 4> UniqRegs;
+  DEBUG(dbgs() << "The search space is too complex.\n");
+
+  // Map each register to probability of not selecting
+  DenseMap <const SCEV *, float> RegNumMap;
+  for (const SCEV *Reg : RegUses) {
+    if (UniqRegs.count(Reg))
+      continue;
+    float PNotSel = 1;
+    for (const LSRUse &LU : Uses) {
+      if (!LU.Regs.count(Reg))
+        continue;
+      float P = LU.getNotSelectedProbability(Reg);
+      if (P != 0.0)
+        PNotSel *= P;
+      else
+        UniqRegs.insert(Reg);
+    }
+    RegNumMap.insert(std::make_pair(Reg, PNotSel));
+  }
+
+  DEBUG(dbgs() << "Narrowing the search space by deleting costly formulas\n");
+
+  // Delete formulas where registers number expectation is high.
+  for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+    LSRUse &LU = Uses[LUIdx];
+    // If nothing to delete - continue.
+    if (LU.Formulae.size() < 2)
+      continue;
+    // This is a temporary solution to test performance. Float should be
+    // replaced with a rounding-independent type (based on integers) to avoid
+    // different results for different target builds.
+    float FMinRegNum = LU.Formulae[0].getNumRegs();
+    float FMinARegNum = LU.Formulae[0].getNumRegs();
+    size_t MinIdx = 0;
+    for (size_t i = 0, e = LU.Formulae.size(); i != e; ++i) {
+      Formula &F = LU.Formulae[i];
+      float FRegNum = 0;
+      float FARegNum = 0;
+      for (const SCEV *BaseReg : F.BaseRegs) {
+        if (UniqRegs.count(BaseReg))
+          continue;
+        FRegNum += RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+        if (isa<SCEVAddRecExpr>(BaseReg))
+          FARegNum +=
+              RegNumMap[BaseReg] / LU.getNotSelectedProbability(BaseReg);
+      }
+      if (const SCEV *ScaledReg = F.ScaledReg) {
+        if (!UniqRegs.count(ScaledReg)) {
+          FRegNum +=
+              RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+          if (isa<SCEVAddRecExpr>(ScaledReg))
+            FARegNum +=
+                RegNumMap[ScaledReg] / LU.getNotSelectedProbability(ScaledReg);
+        }
+      }
+      if (FMinRegNum > FRegNum ||
+          (FMinRegNum == FRegNum && FMinARegNum > FARegNum)) {
+        FMinRegNum = FRegNum;
+        FMinARegNum = FARegNum;
+        MinIdx = i;
+      }
+    }
+    DEBUG(dbgs() << "  The formula "; LU.Formulae[MinIdx].print(dbgs());
+          dbgs() << " with min reg num " << FMinRegNum << '\n');
+    if (MinIdx != 0)
+      std::swap(LU.Formulae[MinIdx], LU.Formulae[0]);
+    while (LU.Formulae.size() != 1) {
+      DEBUG(dbgs() << "  Deleting "; LU.Formulae.back().print(dbgs());
+            dbgs() << '\n');
+      LU.Formulae.pop_back();
+    }
+    LU.RecomputeRegs(LUIdx, RegUses);
+    assert(LU.Formulae.size() == 1 && "Should be exactly 1 min regs formula");
+    Formula &F = LU.Formulae[0];
+    DEBUG(dbgs() << "  Leaving only "; F.print(dbgs()); dbgs() << '\n');
+    // When we choose the formula, the regs become unique.
+    UniqRegs.insert(F.BaseRegs.begin(), F.BaseRegs.end());
+    if (F.ScaledReg)
+      UniqRegs.insert(F.ScaledReg);
+  }
+  DEBUG(dbgs() << "After pre-selection:\n";
+  print_uses(dbgs()));
+}
+
+
 /// Pick a register which seems likely to be profitable, and then in any use
 /// which has any reference to that register, delete all formulae which do not
 /// reference that register.
@@ -4245,7 +4518,10 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
   NarrowSearchSpaceByDetectingSupersets();
   NarrowSearchSpaceByCollapsingUnrolledCode();
   NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
-  NarrowSearchSpaceByPickingWinnerRegs();
+  if (LSRExpNarrow)
+    NarrowSearchSpaceByDeletingCostlyFormulas();
+  else
+    NarrowSearchSpaceByPickingWinnerRegs();
 }
 
 /// This is the recursive solver.
diff --git a/lib/Transforms/Scalar/LoopUnrollPass.cpp b/lib/Transforms/Scalar/LoopUnrollPass.cpp
index d827db680d213bffb1529f878b218be4f02406d3..62aa6ee48069d8e967d3f88b9012bb6e97d1422e 100644
--- a/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -110,7 +110,7 @@ static cl::opt<unsigned> FlatLoopTripCountThreshold(
              "aggressively unrolled."));
 
 static cl::opt<bool>
-    UnrollAllowPeeling("unroll-allow-peeling", cl::Hidden,
+    UnrollAllowPeeling("unroll-allow-peeling", cl::init(true), cl::Hidden,
                        cl::desc("Allows loops to be peeled when the dynamic "
                                 "trip count is known to be low."));
 
@@ -131,13 +131,14 @@ static const unsigned NoThreshold = UINT_MAX;
 /// Gather the various unrolling parameters based on the defaults, compiler
 /// flags, TTI overrides and user specified parameters.
 static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
-    Loop *L, const TargetTransformInfo &TTI, Optional<unsigned> UserThreshold,
-    Optional<unsigned> UserCount, Optional<bool> UserAllowPartial,
-    Optional<bool> UserRuntime, Optional<bool> UserUpperBound) {
+    Loop *L, const TargetTransformInfo &TTI, int OptLevel,
+    Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
+    Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
+    Optional<bool> UserUpperBound) {
   TargetTransformInfo::UnrollingPreferences UP;
 
   // Set up the defaults
-  UP.Threshold = 150;
+  UP.Threshold = OptLevel > 2 ? 300 : 150;
   UP.MaxPercentThresholdBoost = 400;
   UP.OptSizeThreshold = 0;
   UP.PartialThreshold = 150;
@@ -154,7 +155,7 @@ static TargetTransformInfo::UnrollingPreferences gatherUnrollingPreferences(
   UP.AllowExpensiveTripCount = false;
   UP.Force = false;
   UP.UpperBound = false;
-  UP.AllowPeeling = false;
+  UP.AllowPeeling = true;
 
   // Override with any target specific settings
   TTI.getUnrollingPreferences(L, UP);
@@ -508,7 +509,7 @@ analyzeLoopUnrollCost(const Loop *L, unsigned TripCount, DominatorTree &DT,
             KnownSucc = SI->getSuccessor(0);
           else if (ConstantInt *SimpleCondVal =
                        dyn_cast<ConstantInt>(SimpleCond))
-            KnownSucc = SI->findCaseValue(SimpleCondVal).getCaseSuccessor();
+            KnownSucc = SI->findCaseValue(SimpleCondVal)->getCaseSuccessor();
         }
       }
       if (KnownSucc) {
@@ -783,7 +784,15 @@ static bool computeUnrollCount(
     }
   }
 
-  // 4rd priority is partial unrolling.
+  // 4th priority is loop peeling
+  computePeelCount(L, LoopSize, UP, TripCount);
+  if (UP.PeelCount) {
+    UP.Runtime = false;
+    UP.Count = 1;
+    return ExplicitUnroll;
+  }
+
+  // 5th priority is partial unrolling.
   // Try partial unroll only when TripCount could be staticaly calculated.
   if (TripCount) {
     UP.Partial |= ExplicitUnroll;
@@ -846,14 +855,6 @@ static bool computeUnrollCount(
         << "Unable to fully unroll loop as directed by unroll(full) pragma "
            "because loop has a runtime trip count.");
 
-  // 5th priority is loop peeling
-  computePeelCount(L, LoopSize, UP);
-  if (UP.PeelCount) {
-    UP.Runtime = false;
-    UP.Count = 1;
-    return ExplicitUnroll;
-  }
-
   // 6th priority is runtime unrolling.
   // Don't unroll a runtime trip count loop when it is disabled.
   if (HasRuntimeUnrollDisablePragma(L)) {
@@ -927,7 +928,7 @@ static bool computeUnrollCount(
 static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
                             ScalarEvolution *SE, const TargetTransformInfo &TTI,
                             AssumptionCache &AC, OptimizationRemarkEmitter &ORE,
-                            bool PreserveLCSSA,
+                            bool PreserveLCSSA, int OptLevel,
                             Optional<unsigned> ProvidedCount,
                             Optional<unsigned> ProvidedThreshold,
                             Optional<bool> ProvidedAllowPartial,
@@ -947,7 +948,7 @@ static bool tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
   bool NotDuplicatable;
   bool Convergent;
   TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
-      L, TTI, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial,
+      L, TTI, OptLevel, ProvidedThreshold, ProvidedCount, ProvidedAllowPartial,
       ProvidedRuntime, ProvidedUpperBound);
   // Exit early if unrolling is disabled.
   if (UP.Threshold == 0 && (!UP.Partial || UP.PartialThreshold == 0))
@@ -1047,16 +1048,17 @@ namespace {
 class LoopUnroll : public LoopPass {
 public:
   static char ID; // Pass ID, replacement for typeid
-  LoopUnroll(Optional<unsigned> Threshold = None,
+  LoopUnroll(int OptLevel = 2, Optional<unsigned> Threshold = None,
              Optional<unsigned> Count = None,
              Optional<bool> AllowPartial = None, Optional<bool> Runtime = None,
              Optional<bool> UpperBound = None)
-      : LoopPass(ID), ProvidedCount(std::move(Count)),
+      : LoopPass(ID), OptLevel(OptLevel), ProvidedCount(std::move(Count)),
         ProvidedThreshold(Threshold), ProvidedAllowPartial(AllowPartial),
         ProvidedRuntime(Runtime), ProvidedUpperBound(UpperBound) {
     initializeLoopUnrollPass(*PassRegistry::getPassRegistry());
   }
 
+  int OptLevel;
   Optional<unsigned> ProvidedCount;
   Optional<unsigned> ProvidedThreshold;
   Optional<bool> ProvidedAllowPartial;
@@ -1081,7 +1083,7 @@ public:
     OptimizationRemarkEmitter ORE(&F);
     bool PreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
 
-    return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA,
+    return tryToUnrollLoop(L, DT, LI, SE, TTI, AC, ORE, PreserveLCSSA, OptLevel,
                            ProvidedCount, ProvidedThreshold,
                            ProvidedAllowPartial, ProvidedRuntime,
                            ProvidedUpperBound);
@@ -1107,21 +1109,22 @@ INITIALIZE_PASS_DEPENDENCY(LoopPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
 INITIALIZE_PASS_END(LoopUnroll, "loop-unroll", "Unroll loops", false, false)
 
-Pass *llvm::createLoopUnrollPass(int Threshold, int Count, int AllowPartial,
-                                 int Runtime, int UpperBound) {
+Pass *llvm::createLoopUnrollPass(int OptLevel, int Threshold, int Count,
+                                 int AllowPartial, int Runtime,
+                                 int UpperBound) {
   // TODO: It would make more sense for this function to take the optionals
   // directly, but that's dangerous since it would silently break out of tree
   // callers.
-  return new LoopUnroll(Threshold == -1 ? None : Optional<unsigned>(Threshold),
-                        Count == -1 ? None : Optional<unsigned>(Count),
-                        AllowPartial == -1 ? None
-                                           : Optional<bool>(AllowPartial),
-                        Runtime == -1 ? None : Optional<bool>(Runtime),
-                        UpperBound == -1 ? None : Optional<bool>(UpperBound));
+  return new LoopUnroll(
+      OptLevel, Threshold == -1 ? None : Optional<unsigned>(Threshold),
+      Count == -1 ? None : Optional<unsigned>(Count),
+      AllowPartial == -1 ? None : Optional<bool>(AllowPartial),
+      Runtime == -1 ? None : Optional<bool>(Runtime),
+      UpperBound == -1 ? None : Optional<bool>(UpperBound));
 }
 
-Pass *llvm::createSimpleLoopUnrollPass() {
-  return llvm::createLoopUnrollPass(-1, -1, 0, 0, 0);
+Pass *llvm::createSimpleLoopUnrollPass(int OptLevel) {
+  return llvm::createLoopUnrollPass(OptLevel, -1, -1, 0, 0, 0);
 }
 
 PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
@@ -1153,10 +1156,10 @@ PreservedAnalyses LoopUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
   Optional<bool> AllowPartialParam, RuntimeParam, UpperBoundParam;
   if (!AllowPartialUnrolling)
     AllowPartialParam = RuntimeParam = UpperBoundParam = false;
-  bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, &AR.SE, AR.TTI, AR.AC, *ORE,
-                                 /*PreserveLCSSA*/ true, /*Count*/ None,
-                                 /*Threshold*/ None, AllowPartialParam,
-                                 RuntimeParam, UpperBoundParam);
+  bool Changed = tryToUnrollLoop(
+      &L, AR.DT, &AR.LI, &AR.SE, AR.TTI, AR.AC, *ORE,
+      /*PreserveLCSSA*/ true, OptLevel, /*Count*/ None,
+      /*Threshold*/ None, AllowPartialParam, RuntimeParam, UpperBoundParam);
   if (!Changed)
     return PreservedAnalyses::all();
 
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
index d7daab4c666a6182ae94292938d926d46e3c4814..a99c9999c61912cfbc5bb9bbda52e6642178c6c4 100644
--- a/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -33,6 +33,7 @@
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/Analysis/DivergenceAnalysis.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/LoopPass.h"
@@ -47,6 +48,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/Support/CommandLine.h"
@@ -77,19 +79,6 @@ static cl::opt<unsigned>
 Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
           cl::init(100), cl::Hidden);
 
-static cl::opt<bool>
-LoopUnswitchWithBlockFrequency("loop-unswitch-with-block-frequency",
-    cl::init(false), cl::Hidden,
-    cl::desc("Enable the use of the block frequency analysis to access PGO "
-             "heuristics to minimize code growth in cold regions."));
-
-static cl::opt<unsigned>
-ColdnessThreshold("loop-unswitch-coldness-threshold", cl::init(1), cl::Hidden,
-    cl::desc("Coldness threshold in percentage. The loop header frequency "
-             "(relative to the entry frequency) is compared with this "
-             "threshold to determine if non-trivial unswitching should be "
-             "enabled."));
-
 namespace {
 
   class LUAnalysisCache {
@@ -174,13 +163,6 @@ namespace {
 
     LUAnalysisCache BranchesInfo;
 
-    bool EnabledPGO;
-
-    // BFI and ColdEntryFreq are only used when PGO and
-    // LoopUnswitchWithBlockFrequency are enabled.
-    BlockFrequencyInfo BFI;
-    BlockFrequency ColdEntryFreq;
-
     bool OptimizeForSize;
     bool redoLoop;
 
@@ -199,12 +181,14 @@ namespace {
     // NewBlocks contained cloned copy of basic blocks from LoopBlocks.
     std::vector<BasicBlock*> NewBlocks;
 
+    bool hasBranchDivergence;
+
   public:
     static char ID; // Pass ID, replacement for typeid
-    explicit LoopUnswitch(bool Os = false) :
+    explicit LoopUnswitch(bool Os = false, bool hasBranchDivergence = false) :
       LoopPass(ID), OptimizeForSize(Os), redoLoop(false),
       currentLoop(nullptr), DT(nullptr), loopHeader(nullptr),
-      loopPreheader(nullptr) {
+      loopPreheader(nullptr), hasBranchDivergence(hasBranchDivergence) {
         initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
       }
 
@@ -217,6 +201,8 @@ namespace {
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.addRequired<AssumptionCacheTracker>();
       AU.addRequired<TargetTransformInfoWrapperPass>();
+      if (hasBranchDivergence)
+        AU.addRequired<DivergenceAnalysis>();
       getLoopAnalysisUsage(AU);
     }
 
@@ -255,6 +241,11 @@ namespace {
                                         TerminatorInst *TI);
 
     void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
+
+    /// Given that the Invariant is not equal to Val, simplify instructions
+    /// in the loop.
+    Value *SimplifyInstructionWithNotEqual(Instruction *Inst, Value *Invariant,
+                                           Constant *Val);
   };
 }
 
@@ -381,16 +372,35 @@ INITIALIZE_PASS_BEGIN(LoopUnswitch, "loop-unswitch", "Unswitch loops",
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(LoopPass)
 INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DivergenceAnalysis)
 INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
                       false, false)
 
-Pass *llvm::createLoopUnswitchPass(bool Os) {
-  return new LoopUnswitch(Os);
+Pass *llvm::createLoopUnswitchPass(bool Os, bool hasBranchDivergence) {
+  return new LoopUnswitch(Os, hasBranchDivergence);
 }
 
+/// Operator chain lattice.
+enum OperatorChain {
+  OC_OpChainNone,    ///< There is no operator.
+  OC_OpChainOr,      ///< There are only ORs.
+  OC_OpChainAnd,     ///< There are only ANDs.
+  OC_OpChainMixed    ///< There are ANDs and ORs.
+};
+
 /// Cond is a condition that occurs in L. If it is invariant in the loop, or has
 /// an invariant piece, return the invariant. Otherwise, return null.
+//
+/// NOTE: FindLIVLoopCondition will not return a partial LIV by walking up a
+/// mixed operator chain, as we can not reliably find a value which will simplify
+/// the operator chain. If the chain is AND-only or OR-only, we can use 0 or ~0
+/// to simplify the chain.
+///
+/// NOTE: In case of a partial LIV and a mixed operator chain, we may be able to
+/// simplify the condition itself to a loop variant condition, but at the
+/// cost of creating an entirely new loop.
 static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
+                                   OperatorChain &ParentChain,
                                    DenseMap<Value *, Value *> &Cache) {
   auto CacheIt = Cache.find(Cond);
   if (CacheIt != Cache.end())
@@ -414,21 +424,53 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
     return Cond;
   }
 
+  // Walk up the operator chain to find partial invariant conditions.
   if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond))
     if (BO->getOpcode() == Instruction::And ||
         BO->getOpcode() == Instruction::Or) {
-      // If either the left or right side is invariant, we can unswitch on this,
-      // which will cause the branch to go away in one loop and the condition to
-      // simplify in the other one.
-      if (Value *LHS =
-              FindLIVLoopCondition(BO->getOperand(0), L, Changed, Cache)) {
-        Cache[Cond] = LHS;
-        return LHS;
+      // Given the previous operator, compute the current operator chain status.
+      OperatorChain NewChain;
+      switch (ParentChain) {
+      case OC_OpChainNone:
+        NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
+                                      OC_OpChainOr;
+        break;
+      case OC_OpChainOr:
+        NewChain = BO->getOpcode() == Instruction::Or ? OC_OpChainOr :
+                                      OC_OpChainMixed;
+        break;
+      case OC_OpChainAnd:
+        NewChain = BO->getOpcode() == Instruction::And ? OC_OpChainAnd :
+                                      OC_OpChainMixed;
+        break;
+      case OC_OpChainMixed:
+        NewChain = OC_OpChainMixed;
+        break;
       }
-      if (Value *RHS =
-              FindLIVLoopCondition(BO->getOperand(1), L, Changed, Cache)) {
-        Cache[Cond] = RHS;
-        return RHS;
+
+      // If we reach a Mixed state, we do not want to keep walking up as we can not
+      // reliably find a value that will simplify the chain. With this check, we
+      // will return null on the first sight of mixed chain and the caller will
+      // either backtrack to find partial LIV in other operand or return null.
+      if (NewChain != OC_OpChainMixed) {
+        // Update the current operator chain type before we search up the chain.
+        ParentChain = NewChain;
+        // If either the left or right side is invariant, we can unswitch on this,
+        // which will cause the branch to go away in one loop and the condition to
+        // simplify in the other one.
+        if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed,
+                                              ParentChain, Cache)) {
+          Cache[Cond] = LHS;
+          return LHS;
+        }
+        // We did not manage to find a partial LIV in operand(0). Backtrack and try
+        // operand(1).
+        ParentChain = NewChain;
+        if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed,
+                                              ParentChain, Cache)) {
+          Cache[Cond] = RHS;
+          return RHS;
+        }
       }
     }
 
@@ -436,9 +478,21 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
   return nullptr;
 }
 
-static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
+/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
+/// an invariant piece, return the invariant along with the operator chain type.
+/// Otherwise, return null.
+static std::pair<Value *, OperatorChain> FindLIVLoopCondition(Value *Cond,
+                                                              Loop *L,
+                                                              bool &Changed) {
   DenseMap<Value *, Value *> Cache;
-  return FindLIVLoopCondition(Cond, L, Changed, Cache);
+  OperatorChain OpChain = OC_OpChainNone;
+  Value *FCond = FindLIVLoopCondition(Cond, L, Changed, OpChain, Cache);
+
+  // In case we do find a LIV, it can not be obtained by walking up a mixed
+  // operator chain.
+  assert((!FCond || OpChain != OC_OpChainMixed) &&
+        "Do not expect a partial LIV with mixed operator chain");
+  return {FCond, OpChain};
 }
 
 bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
@@ -457,19 +511,6 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
   if (SanitizeMemory)
     computeLoopSafetyInfo(&SafetyInfo, L);
 
-  EnabledPGO = F->getEntryCount().hasValue();
-
-  if (LoopUnswitchWithBlockFrequency && EnabledPGO) {
-    BranchProbabilityInfo BPI(*F, *LI);
-    BFI.calculate(*L->getHeader()->getParent(), BPI, *LI);
-
-    // Use BranchProbability to compute a minimum frequency based on
-    // function entry baseline frequency. Loops with headers below this
-    // frequency are considered as cold.
-    const BranchProbability ColdProb(ColdnessThreshold, 100);
-    ColdEntryFreq = BlockFrequency(BFI.getEntryFreq()) * ColdProb;
-  }
-
   bool Changed = false;
   do {
     assert(currentLoop->isLCSSAForm(*DT));
@@ -581,19 +622,9 @@ bool LoopUnswitch::processCurrentLoop() {
       loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
     return false;
 
-  if (LoopUnswitchWithBlockFrequency && EnabledPGO) {
-    // Compute the weighted frequency of the hottest block in the
-    // loop (loopHeader in this case since inner loops should be
-    // processed before outer loop). If it is less than ColdFrequency,
-    // we should not unswitch.
-    BlockFrequency LoopEntryFreq = BFI.getBlockFreq(loopHeader);
-    if (LoopEntryFreq < ColdEntryFreq)
-      return false;
-  }
-
   for (IntrinsicInst *Guard : Guards) {
     Value *LoopCond =
-        FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed);
+        FindLIVLoopCondition(Guard->getOperand(0), currentLoop, Changed).first;
     if (LoopCond &&
         UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
       // NB! Unswitching (if successful) could have erased some of the
@@ -634,7 +665,7 @@ bool LoopUnswitch::processCurrentLoop() {
         // See if this, or some part of it, is loop invariant.  If so, we can
         // unswitch on it if we desire.
         Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
-                                               currentLoop, Changed);
+                                               currentLoop, Changed).first;
         if (LoopCond &&
             UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) {
           ++NumBranches;
@@ -642,24 +673,48 @@ bool LoopUnswitch::processCurrentLoop() {
         }
       }
     } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
-      Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
-                                             currentLoop, Changed);
+      Value *SC = SI->getCondition();
+      Value *LoopCond;
+      OperatorChain OpChain;
+      std::tie(LoopCond, OpChain) =
+        FindLIVLoopCondition(SC, currentLoop, Changed);
+
       unsigned NumCases = SI->getNumCases();
       if (LoopCond && NumCases) {
         // Find a value to unswitch on:
         // FIXME: this should chose the most expensive case!
         // FIXME: scan for a case with a non-critical edge?
         Constant *UnswitchVal = nullptr;
-
-        // Do not process same value again and again.
-        // At this point we have some cases already unswitched and
-        // some not yet unswitched. Let's find the first not yet unswitched one.
-        for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
-             i != e; ++i) {
-          Constant *UnswitchValCandidate = i.getCaseValue();
-          if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
-            UnswitchVal = UnswitchValCandidate;
-            break;
+        // Find a case value such that at least one case value is unswitched
+        // out.
+        if (OpChain == OC_OpChainAnd) {
+          // If the chain only has ANDs and the switch has a case value of 0.
+          // Dropping in a 0 to the chain will unswitch out the 0-casevalue.
+          auto *AllZero = cast<ConstantInt>(Constant::getNullValue(SC->getType()));
+          if (BranchesInfo.isUnswitched(SI, AllZero))
+            continue;
+          // We are unswitching 0 out.
+          UnswitchVal = AllZero;
+        } else if (OpChain == OC_OpChainOr) {
+          // If the chain only has ORs and the switch has a case value of ~0.
+          // Dropping in a ~0 to the chain will unswitch out the ~0-casevalue.
+          auto *AllOne = cast<ConstantInt>(Constant::getAllOnesValue(SC->getType()));
+          if (BranchesInfo.isUnswitched(SI, AllOne))
+            continue;
+          // We are unswitching ~0 out.
+          UnswitchVal = AllOne;
+        } else {
+          assert(OpChain == OC_OpChainNone && 
+                 "Expect to unswitch on trivial chain");
+          // Do not process same value again and again.
+          // At this point we have some cases already unswitched and
+          // some not yet unswitched. Let's find the first not yet unswitched one.
+          for (auto Case : SI->cases()) {
+            Constant *UnswitchValCandidate = Case.getCaseValue();
+            if (!BranchesInfo.isUnswitched(SI, UnswitchValCandidate)) {
+              UnswitchVal = UnswitchValCandidate;
+              break;
+            }
           }
         }
 
@@ -668,6 +723,11 @@ bool LoopUnswitch::processCurrentLoop() {
 
         if (UnswitchIfProfitable(LoopCond, UnswitchVal)) {
           ++NumSwitches;
+          // In case of a full LIV, UnswitchVal is the value we unswitched out.
+          // In case of a partial LIV, we only unswitch when it's an AND-chain
+          // or OR-chain. In both cases switch input value simplifies to
+          // UnswitchVal.
+          BranchesInfo.setUnswitched(SI, UnswitchVal);
           return true;
         }
       }
@@ -678,7 +738,7 @@ bool LoopUnswitch::processCurrentLoop() {
          BBI != E; ++BBI)
       if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
         Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
-                                               currentLoop, Changed);
+                                               currentLoop, Changed).first;
         if (LoopCond && UnswitchIfProfitable(LoopCond,
                                              ConstantInt::getTrue(Context))) {
           ++NumSelects;
@@ -753,6 +813,15 @@ bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
                  << ". Cost too high.\n");
     return false;
   }
+  if (hasBranchDivergence &&
+      getAnalysis<DivergenceAnalysis>().isDivergent(LoopCond)) {
+    DEBUG(dbgs() << "NOT unswitching loop %"
+                 << currentLoop->getHeader()->getName()
+                 << " at non-trivial condition '" << *Val
+                 << "' == " << *LoopCond << "\n"
+                 << ". Condition is divergent.\n");
+    return false;
+  }
 
   UnswitchNontrivialCondition(LoopCond, Val, currentLoop, TI);
   return true;
@@ -917,8 +986,8 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
       if (!Cond)
         break;
       // Find the target block we are definitely going to.
-      CurrentBB = SI->findCaseValue(Cond).getCaseSuccessor();
-    } else { 
+      CurrentBB = SI->findCaseValue(Cond)->getCaseSuccessor();
+    } else {
       // We do not understand these terminator instructions.
       break;
     }
@@ -937,7 +1006,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
       return false;
 
     Value *LoopCond = FindLIVLoopCondition(BI->getCondition(),
-                                           currentLoop, Changed);
+                                           currentLoop, Changed).first;
 
     // Unswitch only if the trivial condition itself is an LIV (not
     // partial LIV which could occur in and/or)
@@ -968,7 +1037,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
   } else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
     // If this isn't switching on an invariant condition, we can't unswitch it.
     Value *LoopCond = FindLIVLoopCondition(SI->getCondition(),
-                                           currentLoop, Changed);
+                                           currentLoop, Changed).first;
 
     // Unswitch only if the trivial condition itself is an LIV (not
     // partial LIV which could occur in and/or)
@@ -981,13 +1050,12 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
     // this.
     // Note that we can't trivially unswitch on the default case or
     // on already unswitched cases.
-    for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
-         i != e; ++i) {
+    for (auto Case : SI->cases()) {
       BasicBlock *LoopExitCandidate;
-      if ((LoopExitCandidate = isTrivialLoopExitBlock(currentLoop,
-                                               i.getCaseSuccessor()))) {
+      if ((LoopExitCandidate =
+               isTrivialLoopExitBlock(currentLoop, Case.getCaseSuccessor()))) {
         // Okay, we found a trivial case, remember the value that is trivial.
-        ConstantInt *CaseVal = i.getCaseValue();
+        ConstantInt *CaseVal = Case.getCaseValue();
 
         // Check that it was not unswitched before, since already unswitched
         // trivial vals are looks trivial too.
@@ -1006,6 +1074,9 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
 
     UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB,
                              nullptr);
+
+    // We are only unswitching full LIV.
+    BranchesInfo.setUnswitched(SI, CondVal);
     ++NumSwitches;
     return true;
   }
@@ -1261,18 +1332,38 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
     if (!UI || !L->contains(UI))
       continue;
 
-    Worklist.push_back(UI);
+    // At this point, we know LIC is definitely not Val. Try to use some simple
+    // logic to simplify the user w.r.t. to the context.
+    if (Value *Replacement = SimplifyInstructionWithNotEqual(UI, LIC, Val)) {
+      if (LI->replacementPreservesLCSSAForm(UI, Replacement)) {
+        // This in-loop instruction has been simplified w.r.t. its context,
+        // i.e. LIC != Val, make sure we propagate its replacement value to
+        // all its users.
+        //  
+        // We can not delete UI, the LIC user, just yet, because that would
+        // invalidate the LIC->users() iterator. However, we can make it dead
+        // by replacing all its uses, and push it onto the worklist so that it
+        // can be properly deleted and its operands simplified.
+        UI->replaceAllUsesWith(Replacement);
+      }
+    }
 
-    // TODO: We could do other simplifications, for example, turning
-    // 'icmp eq LIC, Val' -> false.
+    // This is a LIC user, push it into the worklist so that SimplifyCode can
+    // attempt to simplify it.
+    Worklist.push_back(UI);
 
     // If we know that LIC is not Val, use this info to simplify code.
     SwitchInst *SI = dyn_cast<SwitchInst>(UI);
     if (!SI || !isa<ConstantInt>(Val)) continue;
 
-    SwitchInst::CaseIt DeadCase = SI->findCaseValue(cast<ConstantInt>(Val));
+    // NOTE: if a case value for the switch is unswitched out, we record it
+    // after the unswitch finishes. We can not record it here as the switch
+    // is not a direct user of the partial LIV.
+    SwitchInst::CaseHandle DeadCase =
+        *SI->findCaseValue(cast<ConstantInt>(Val));
     // Default case is live for multiple values.
-    if (DeadCase == SI->case_default()) continue;
+    if (DeadCase == *SI->case_default())
+      continue;
 
     // Found a dead case value.  Don't remove PHI nodes in the
     // successor if they become single-entry, those PHI nodes may
@@ -1282,8 +1373,6 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
     BasicBlock *SISucc = DeadCase.getCaseSuccessor();
     BasicBlock *Latch = L->getLoopLatch();
 
-    BranchesInfo.setUnswitched(SI, Val);
-
     if (!SI->findCaseDest(SISucc)) continue;  // Edge is critical.
     // If the DeadCase successor dominates the loop latch, then the
     // transformation isn't safe since it will delete the sole predecessor edge
@@ -1405,3 +1494,27 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
     }
   }
 }
+
+/// Simple simplifications we can do given the information that Invariant is
+/// definitely not equal to Val. Returns the replacement, or null if none.
+Value *LoopUnswitch::SimplifyInstructionWithNotEqual(Instruction *Inst,
+                                                     Value *Invariant,
+                                                     Constant *Val) {
+  // icmp eq Invariant, Val -> false; icmp ne Invariant, Val -> true.
+  ICmpInst *CI = dyn_cast<ICmpInst>(Inst);
+  if (CI && CI->isEquality()) {
+    Value *Op0 = CI->getOperand(0);
+    Value *Op1 = CI->getOperand(1);
+    if ((Op0 == Invariant && Op1 == Val) || (Op0 == Val && Op1 == Invariant)) {
+      LLVMContext &Ctx = Inst->getContext();
+      // isEquality() leaves only ICMP_EQ and ICMP_NE.
+      if (CI->getPredicate() == CmpInst::ICMP_EQ)
+        return ConstantInt::getFalse(Ctx);
+      return ConstantInt::getTrue(Ctx);
+    }
+  }
+
+  // FIXME: there may be other opportunities, e.g. comparison with floating
+  // point, or Invariant - Val != 0, etc.
+  return nullptr;
+}
diff --git a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 52975ef351531dbd0c13b89f36a6747b182b8816..a143b9a3c645fc29774b469eee0d8b306ef9b964 100644
--- a/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -67,11 +67,11 @@ static bool handleSwitchExpect(SwitchInst &SI) {
   if (!ExpectedValue)
     return false;
 
-  SwitchInst::CaseIt Case = SI.findCaseValue(ExpectedValue);
+  SwitchInst::CaseHandle Case = *SI.findCaseValue(ExpectedValue);
   unsigned n = SI.getNumCases(); // +1 for default case.
   SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeight);
 
-  if (Case == SI.case_default())
+  if (Case == *SI.case_default())
     Weights[0] = LikelyBranchWeight;
   else
     Weights[Case.getCaseIndex() + 1] = LikelyBranchWeight;
diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index a800471375386d92f0bf528b6c4572b095119e88..a3f3f25c1e0f6c6b4cfe7eb56a7f922617453881 100644
--- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -330,49 +330,33 @@ void MemsetRanges::addRange(int64_t Start, int64_t Size, Value *Ptr,
 
 namespace {
 
-  class MemCpyOptLegacyPass : public FunctionPass {
-    MemCpyOptPass Impl;
+class MemCpyOptLegacyPass : public FunctionPass {
+  MemCpyOptPass Impl;
 
-  public:
-    static char ID; // Pass identification, replacement for typeid
-
-    MemCpyOptLegacyPass() : FunctionPass(ID) {
-      initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
-    }
+public:
+  static char ID; // Pass identification, replacement for typeid
 
-    bool runOnFunction(Function &F) override;
-
-  private:
-    // This transformation requires dominator postdominator info
-    void getAnalysisUsage(AnalysisUsage &AU) const override {
-      AU.setPreservesCFG();
-      AU.addRequired<AssumptionCacheTracker>();
-      AU.addRequired<DominatorTreeWrapperPass>();
-      AU.addRequired<MemoryDependenceWrapperPass>();
-      AU.addRequired<AAResultsWrapperPass>();
-      AU.addRequired<TargetLibraryInfoWrapperPass>();
-      AU.addPreserved<GlobalsAAWrapperPass>();
-      AU.addPreserved<MemoryDependenceWrapperPass>();
-    }
+  MemCpyOptLegacyPass() : FunctionPass(ID) {
+    initializeMemCpyOptLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
 
-    // Helper functions
-    bool processStore(StoreInst *SI, BasicBlock::iterator &BBI);
-    bool processMemSet(MemSetInst *SI, BasicBlock::iterator &BBI);
-    bool processMemCpy(MemCpyInst *M);
-    bool processMemMove(MemMoveInst *M);
-    bool performCallSlotOptzn(Instruction *cpy, Value *cpyDst, Value *cpySrc,
-                              uint64_t cpyLen, unsigned cpyAlign, CallInst *C);
-    bool processMemCpyMemCpyDependence(MemCpyInst *M, MemCpyInst *MDep);
-    bool processMemSetMemCpyDependence(MemCpyInst *M, MemSetInst *MDep);
-    bool performMemCpyToMemSetOptzn(MemCpyInst *M, MemSetInst *MDep);
-    bool processByValArgument(CallSite CS, unsigned ArgNo);
-    Instruction *tryMergingIntoMemset(Instruction *I, Value *StartPtr,
-                                      Value *ByteVal);
-
-    bool iterateOnFunction(Function &F);
-  };
+  bool runOnFunction(Function &F) override;
+
+private:
+  // This transformation requires dominator postdominator info
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<MemoryDependenceWrapperPass>();
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+    AU.addPreserved<MemoryDependenceWrapperPass>();
+  }
+};
 
-  char MemCpyOptLegacyPass::ID = 0;
+char MemCpyOptLegacyPass::ID = 0;
 
 } // end anonymous namespace
 
@@ -948,6 +932,17 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
   if (MR != MRI_NoModRef)
     return false;
 
+  // We can't create address space casts here because we don't know if they're
+  // safe for the target.
+  if (cpySrc->getType()->getPointerAddressSpace() !=
+      cpyDest->getType()->getPointerAddressSpace())
+    return false;
+  for (unsigned i = 0; i < CS.arg_size(); ++i)
+    if (CS.getArgument(i)->stripPointerCasts() == cpySrc &&
+        cpySrc->getType()->getPointerAddressSpace() !=
+        CS.getArgument(i)->getType()->getPointerAddressSpace())
+      return false;
+
   // All the checks have passed, so do the transformation.
   bool changedArgument = false;
   for (unsigned i = 0; i < CS.arg_size(); ++i)
@@ -1340,6 +1335,11 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
                                  CS.getInstruction(), &AC, &DT) < ByValAlign)
     return false;
 
+  // The address space of the memcpy source must match the byval argument
+  if (MDep->getSource()->getType()->getPointerAddressSpace() !=
+      ByValArg->getType()->getPointerAddressSpace())
+    return false;
+
   // Verify that the copied-from memory doesn't change in between the memcpy and
   // the byval call.
   //    memcpy(a <- b)
diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp
index b29331ce864326bbb6c40c52108478c21e719bc3..6e58b5f8128309e002f1efff4f6ad42d716c27f7 100644
--- a/lib/Transforms/Scalar/NewGVN.cpp
+++ b/lib/Transforms/Scalar/NewGVN.cpp
@@ -61,13 +61,9 @@
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/GlobalsModRef.h"
 #include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/Loads.h"
 #include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
 #include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/PHITransAddr.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/GlobalVariable.h"
@@ -76,23 +72,26 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/PatternMatch.h"
-#include "llvm/IR/PredIteratorCache.h"
 #include "llvm/IR/Type.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Scalar/GVNExpression.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
 #include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Transforms/Utils/MemorySSA.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include "llvm/Transforms/Utils/VNCoercion.h"
+#include <numeric>
 #include <unordered_map>
 #include <utility>
 #include <vector>
 using namespace llvm;
 using namespace PatternMatch;
 using namespace llvm::GVNExpression;
-
+using namespace llvm::VNCoercion;
 #define DEBUG_TYPE "newgvn"
 
 STATISTIC(NumGVNInstrDeleted, "Number of instructions deleted");
@@ -108,6 +107,14 @@ STATISTIC(NumGVNAvoidedSortedLeaderChanges,
 STATISTIC(NumGVNNotMostDominatingLeader,
           "Number of times a member dominated it's new classes' leader");
 STATISTIC(NumGVNDeadStores, "Number of redundant/dead stores eliminated");
+DEBUG_COUNTER(VNCounter, "newgvn-vn",
+              "Controls which instructions are value numbered")
+
+// Currently store defining access refinement is too slow due to basicaa being
+// egregiously slow.  This flag lets us keep it working while we work on this
+// issue.
+static cl::opt<bool> EnableStoreRefinement("enable-store-refinement",
+                                           cl::init(false), cl::Hidden);
 
 //===----------------------------------------------------------------------===//
 //                                GVN Pass
@@ -136,47 +143,152 @@ PHIExpression::~PHIExpression() = default;
 // For any Value in the Member set, it is valid to replace any dominated member
 // with that Value.
 //
-// Every congruence class has a leader, and the leader is used to
-// symbolize instructions in a canonical way (IE every operand of an
-// instruction that is a member of the same congruence class will
-// always be replaced with leader during symbolization).
-// To simplify symbolization, we keep the leader as a constant if class can be
-// proved to be a constant value.
-// Otherwise, the leader is a randomly chosen member of the value set, it does
-// not matter which one is chosen.
-// Each congruence class also has a defining expression,
-// though the expression may be null.  If it exists, it can be used for forward
-// propagation and reassociation of values.
-//
-struct CongruenceClass {
-  using MemberSet = SmallPtrSet<Value *, 4>;
+// Every congruence class has a leader, and the leader is used to symbolize
+// instructions in a canonical way (IE every operand of an instruction that is a
+// member of the same congruence class will always be replaced with leader
+// during symbolization).  To simplify symbolization, we keep the leader as a
+// constant if class can be proved to be a constant value.  Otherwise, the
+// leader is the member of the value set with the smallest DFS number.  Each
+// congruence class also has a defining expression, though the expression may be
+// null.  If it exists, it can be used for forward propagation and reassociation
+// of values.
+
+// For memory, we also track a representative MemoryAccess, and a set of memory
+// members for MemoryPhis (which have no real instructions). Note that for
+// memory, it seems tempting to try to split the memory members into a
+// MemoryCongruenceClass or something.  Unfortunately, this does not work
+// easily.  The value numbering of a given memory expression depends on the
+// leader of the memory congruence class, and the leader of memory congruence
+// class depends on the value numbering of a given memory expression.  This
+// leads to wasted propagation, and in some cases, missed optimization.  For
+// example: If we had value numbered two stores together before, but now do not,
+// we move them to a new value congruence class.  This in turn will move at one
+// of the memorydefs to a new memory congruence class.  Which in turn, affects
+// the value numbering of the stores we just value numbered (because the memory
+// congruence class is part of the value number).  So while theoretically
+// possible to split them up, it turns out to be *incredibly* complicated to get
+// it to work right, because of the interdependency.  While structurally
+// slightly messier, it is algorithmically much simpler and faster to do what we
+// do here, and track them both at once in the same class.
+// Note: The default iterators for this class iterate over values
+class CongruenceClass {
+public:
+  using MemberType = Value;
+  using MemberSet = SmallPtrSet<MemberType *, 4>;
+  using MemoryMemberType = MemoryPhi;
+  using MemoryMemberSet = SmallPtrSet<const MemoryMemberType *, 2>;
+
+  explicit CongruenceClass(unsigned ID) : ID(ID) {}
+  CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
+      : ID(ID), RepLeader(Leader), DefiningExpr(E) {}
+  unsigned getID() const { return ID; }
+  // True if this class has no members left.  This is mainly used for assertion
+  // purposes, and for skipping empty classes.
+  bool isDead() const {
+    // If it's both dead from a value perspective, and dead from a memory
+    // perspective, it's really dead.
+    return empty() && memory_empty();
+  }
+  // Leader functions
+  Value *getLeader() const { return RepLeader; }
+  void setLeader(Value *Leader) { RepLeader = Leader; }
+  const std::pair<Value *, unsigned int> &getNextLeader() const {
+    return NextLeader;
+  }
+  void resetNextLeader() { NextLeader = {nullptr, ~0}; }
+
+  void addPossibleNextLeader(std::pair<Value *, unsigned int> LeaderPair) {
+    if (LeaderPair.second < NextLeader.second)
+      NextLeader = LeaderPair;
+  }
+
+  Value *getStoredValue() const { return RepStoredValue; }
+  void setStoredValue(Value *Leader) { RepStoredValue = Leader; }
+  const MemoryAccess *getMemoryLeader() const { return RepMemoryAccess; }
+  void setMemoryLeader(const MemoryAccess *Leader) { RepMemoryAccess = Leader; }
+
+  // Forward propagation info
+  const Expression *getDefiningExpr() const { return DefiningExpr; }
+  void setDefiningExpr(const Expression *E) { DefiningExpr = E; }
+
+  // Value member set
+  bool empty() const { return Members.empty(); }
+  unsigned size() const { return Members.size(); }
+  MemberSet::const_iterator begin() const { return Members.begin(); }
+  MemberSet::const_iterator end() const { return Members.end(); }
+  void insert(MemberType *M) { Members.insert(M); }
+  void erase(MemberType *M) { Members.erase(M); }
+  void swap(MemberSet &Other) { Members.swap(Other); }
+
+  // Memory member set
+  bool memory_empty() const { return MemoryMembers.empty(); }
+  unsigned memory_size() const { return MemoryMembers.size(); }
+  MemoryMemberSet::const_iterator memory_begin() const {
+    return MemoryMembers.begin();
+  }
+  MemoryMemberSet::const_iterator memory_end() const {
+    return MemoryMembers.end();
+  }
+  iterator_range<MemoryMemberSet::const_iterator> memory() const {
+    return make_range(memory_begin(), memory_end());
+  }
+  void memory_insert(const MemoryMemberType *M) { MemoryMembers.insert(M); }
+  void memory_erase(const MemoryMemberType *M) { MemoryMembers.erase(M); }
+
+  // Store count
+  unsigned getStoreCount() const { return StoreCount; }
+  void incStoreCount() { ++StoreCount; }
+  void decStoreCount() {
+    assert(StoreCount != 0 && "Store count went negative");
+    --StoreCount;
+  }
+
+  // Return true if two congruence classes are equivalent to each other.  This
+  // means
+  // that every field but the ID number and the dead field are equivalent.
+  bool isEquivalentTo(const CongruenceClass *Other) const {
+    if (!Other)
+      return false;
+    if (this == Other)
+      return true;
+
+    if (std::tie(StoreCount, RepLeader, RepStoredValue, RepMemoryAccess) !=
+        std::tie(Other->StoreCount, Other->RepLeader, Other->RepStoredValue,
+                 Other->RepMemoryAccess))
+      return false;
+    if (DefiningExpr != Other->DefiningExpr)
+      if (!DefiningExpr || !Other->DefiningExpr ||
+          *DefiningExpr != *Other->DefiningExpr)
+        return false;
+    // We need some ordered set
+    std::set<Value *> AMembers(Members.begin(), Members.end());
+    std::set<Value *> BMembers(Members.begin(), Members.end());
+    return AMembers == BMembers;
+  }
+
+private:
   unsigned ID;
   // Representative leader.
   Value *RepLeader = nullptr;
-  // If this is represented by a store, the value.
+  // The most dominating leader after our current leader, because the member set
+  // is not sorted and is expensive to keep sorted all the time.
+  std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
+  // If this is represented by a store, the value of the store.
   Value *RepStoredValue = nullptr;
-  // If this class contains MemoryDefs, what is the represented memory state.
-  MemoryAccess *RepMemoryAccess = nullptr;
+  // If this class contains MemoryDefs or MemoryPhis, this is the leading memory
+  // access.
+  const MemoryAccess *RepMemoryAccess = nullptr;
   // Defining Expression.
   const Expression *DefiningExpr = nullptr;
   // Actual members of this class.
   MemberSet Members;
-
-  // True if this class has no members left.  This is mainly used for assertion
-  // purposes, and for skipping empty classes.
-  bool Dead = false;
-
+  // This is the set of MemoryPhis that exist in the class. MemoryDefs and
+  // MemoryUses have real instructions representing them, so we only need to
+  // track MemoryPhis here.
+  MemoryMemberSet MemoryMembers;
   // Number of stores in this congruence class.
   // This is used so we can detect store equivalence changes properly.
   int StoreCount = 0;
-
-  // The most dominating leader after our current leader, because the member set
-  // is not sorted and is expensive to keep sorted all the time.
-  std::pair<Value *, unsigned int> NextLeader = {nullptr, ~0U};
-
-  explicit CongruenceClass(unsigned ID) : ID(ID) {}
-  CongruenceClass(unsigned ID, Value *Leader, const Expression *E)
-      : ID(ID), RepLeader(Leader), DefiningExpr(E) {}
 };
 
 namespace llvm {
@@ -205,19 +317,30 @@ template <> struct DenseMapInfo<const Expression *> {
 };
 } // end namespace llvm
 
-class NewGVN : public FunctionPass {
+namespace {
+class NewGVN {
+  Function &F;
   DominatorTree *DT;
-  const DataLayout *DL;
-  const TargetLibraryInfo *TLI;
   AssumptionCache *AC;
+  const TargetLibraryInfo *TLI;
   AliasAnalysis *AA;
   MemorySSA *MSSA;
   MemorySSAWalker *MSSAWalker;
+  const DataLayout &DL;
+  std::unique_ptr<PredicateInfo> PredInfo;
   BumpPtrAllocator ExpressionAllocator;
   ArrayRecycler<Value *> ArgRecycler;
 
+  // Number of function arguments, used by ranking
+  unsigned int NumFuncArgs;
+
   // Congruence class info.
-  CongruenceClass *InitialClass;
+
+  // This class is called INITIAL in the paper. It is the class everything
+  // starts out in, and represents any value. Being an optimistic analysis,
+  // anything in the TOP class has the value TOP, which is indeterminate and
+  // equivalent to everything.
+  CongruenceClass *TOPClass;
   std::vector<CongruenceClass *> CongruenceClasses;
   unsigned NextCongruenceNum;
 
@@ -225,6 +348,16 @@ class NewGVN : public FunctionPass {
   DenseMap<Value *, CongruenceClass *> ValueToClass;
   DenseMap<Value *, const Expression *> ValueToExpression;
 
+  // Mapping from predicate info we used to the instructions we used it with.
+  // In order to correctly ensure propagation, we must keep track of what
+  // comparisons we used, so that when the values of the comparisons change, we
+  // propagate the information to the places we used the comparison.
+  DenseMap<const Value *, SmallPtrSet<Instruction *, 2>> PredicateToUsers;
+  // Mapping from MemoryAccess we used to the MemoryAccess we used it with.  Has
+  // the same reasoning as PredicateToUsers.  When we skip MemoryAccesses for
+  // stores, we no longer can rely solely on the def-use chains of MemorySSA.
+  DenseMap<const MemoryAccess *, SmallPtrSet<MemoryAccess *, 2>> MemoryToUsers;
+
   // A table storing which memorydefs/phis represent a memory state provably
   // equivalent to another memory state.
   // We could use the congruence class machinery, but the MemoryAccess's are
@@ -232,6 +365,19 @@ class NewGVN : public FunctionPass {
   // and not to constants, etc.
   DenseMap<const MemoryAccess *, CongruenceClass *> MemoryAccessToClass;
 
+  // We could, if we wanted, build MemoryPhiExpressions and
+  // MemoryVariableExpressions, etc, and value number them the same way we value
+  // number phi expressions.  For the moment, this seems like overkill.  They
+  // can only exist in one of three states: they can be TOP (equal to
+  // everything), Equivalent to something else, or unique.  Because we do not
+  // create expressions for them, we need to simulate leader change not just
+  // when they change class, but when they change state.  Note: We can do the
+  // same thing for phis, and avoid having phi expressions if we wanted. We
+  // should eventually unify in one direction or the other, so this is a little
+  // bit of an experiment in which turns out easier to maintain.
+  enum MemoryPhiState { MPS_Invalid, MPS_TOP, MPS_Equivalent, MPS_Unique };
+  DenseMap<const MemoryPhi *, MemoryPhiState> MemoryPhiState;
+
   // Expression to class mapping.
   using ExpressionClassMap = DenseMap<const Expression *, CongruenceClass *>;
   ExpressionClassMap ExpressionToClass;
@@ -256,8 +402,6 @@ class NewGVN : public FunctionPass {
   BitVector TouchedInstructions;
 
   DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
-  DenseMap<const DomTreeNode *, std::pair<unsigned, unsigned>>
-      DominatedInstRange;
 
 #ifndef NDEBUG
   // Debugging for how many times each block and instruction got processed.
@@ -277,47 +421,29 @@ class NewGVN : public FunctionPass {
   SmallPtrSet<Instruction *, 8> InstructionsToErase;
 
 public:
-  static char ID; // Pass identification, replacement for typeid.
-  NewGVN() : FunctionPass(ID) {
-    initializeNewGVNPass(*PassRegistry::getPassRegistry());
-  }
-
-  bool runOnFunction(Function &F) override;
-  bool runGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
-              TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA);
+  NewGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
+         TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA,
+         const DataLayout &DL)
+      : F(F), DT(DT), AC(AC), TLI(TLI), AA(AA), MSSA(MSSA), DL(DL),
+        PredInfo(make_unique<PredicateInfo>(F, *DT, *AC)) {}
+  bool runGVN();
 
 private:
-  void getAnalysisUsage(AnalysisUsage &AU) const override {
-    AU.addRequired<AssumptionCacheTracker>();
-    AU.addRequired<DominatorTreeWrapperPass>();
-    AU.addRequired<TargetLibraryInfoWrapperPass>();
-    AU.addRequired<MemorySSAWrapperPass>();
-    AU.addRequired<AAResultsWrapperPass>();
-
-    AU.addPreserved<DominatorTreeWrapperPass>();
-    AU.addPreserved<GlobalsAAWrapperPass>();
-  }
-
   // Expression handling.
-  const Expression *createExpression(Instruction *, const BasicBlock *);
-  const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *,
-                                           const BasicBlock *);
+  const Expression *createExpression(Instruction *);
+  const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *);
   PHIExpression *createPHIExpression(Instruction *);
   const VariableExpression *createVariableExpression(Value *);
   const ConstantExpression *createConstantExpression(Constant *);
-  const Expression *createVariableOrConstant(Value *V, const BasicBlock *B);
+  const Expression *createVariableOrConstant(Value *V);
   const UnknownExpression *createUnknownExpression(Instruction *);
-  const StoreExpression *createStoreExpression(StoreInst *, MemoryAccess *,
-                                               const BasicBlock *);
+  const StoreExpression *createStoreExpression(StoreInst *,
+                                               const MemoryAccess *);
   LoadExpression *createLoadExpression(Type *, Value *, LoadInst *,
-                                       MemoryAccess *, const BasicBlock *);
-
-  const CallExpression *createCallExpression(CallInst *, MemoryAccess *,
-                                             const BasicBlock *);
-  const AggregateValueExpression *
-  createAggregateValueExpression(Instruction *, const BasicBlock *);
-  bool setBasicExpressionInfo(Instruction *, BasicExpression *,
-                              const BasicBlock *);
+                                       const MemoryAccess *);
+  const CallExpression *createCallExpression(CallInst *, const MemoryAccess *);
+  const AggregateValueExpression *createAggregateValueExpression(Instruction *);
+  bool setBasicExpressionInfo(Instruction *, BasicExpression *);
 
   // Congruence class handling.
   CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) {
@@ -326,9 +452,21 @@ private:
     return result;
   }
 
+  CongruenceClass *createMemoryClass(MemoryAccess *MA) {
+    auto *CC = createCongruenceClass(nullptr, nullptr);
+    CC->setMemoryLeader(MA);
+    return CC;
+  }
+  CongruenceClass *ensureLeaderOfMemoryClass(MemoryAccess *MA) {
+    auto *CC = getMemoryClass(MA);
+    if (CC->getMemoryLeader() != MA)
+      CC = createMemoryClass(MA);
+    return CC;
+  }
+
   CongruenceClass *createSingletonCongruenceClass(Value *Member) {
     CongruenceClass *CClass = createCongruenceClass(Member, nullptr);
-    CClass->Members.insert(Member);
+    CClass->insert(Member);
     ValueToClass[Member] = CClass;
     return CClass;
   }
@@ -341,41 +479,49 @@ private:
   // Symbolic evaluation.
   const Expression *checkSimplificationResults(Expression *, Instruction *,
                                                Value *);
-  const Expression *performSymbolicEvaluation(Value *, const BasicBlock *);
-  const Expression *performSymbolicLoadEvaluation(Instruction *,
-                                                  const BasicBlock *);
-  const Expression *performSymbolicStoreEvaluation(Instruction *,
-                                                   const BasicBlock *);
-  const Expression *performSymbolicCallEvaluation(Instruction *,
-                                                  const BasicBlock *);
-  const Expression *performSymbolicPHIEvaluation(Instruction *,
-                                                 const BasicBlock *);
-  const Expression *performSymbolicAggrValueEvaluation(Instruction *,
-                                                       const BasicBlock *);
+  const Expression *performSymbolicEvaluation(Value *);
+  const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *,
+                                                Instruction *, MemoryAccess *);
+  const Expression *performSymbolicLoadEvaluation(Instruction *);
+  const Expression *performSymbolicStoreEvaluation(Instruction *);
+  const Expression *performSymbolicCallEvaluation(Instruction *);
+  const Expression *performSymbolicPHIEvaluation(Instruction *);
+  const Expression *performSymbolicAggrValueEvaluation(Instruction *);
+  const Expression *performSymbolicCmpEvaluation(Instruction *);
+  const Expression *performSymbolicPredicateInfoEvaluation(Instruction *);
 
   // Congruence finding.
-  // Templated to allow them to work both on BB's and BB-edges.
-  template <class T>
-  Value *lookupOperandLeader(Value *, const User *, const T &) const;
+  bool someEquivalentDominates(const Instruction *, const Instruction *) const;
+  Value *lookupOperandLeader(Value *) const;
   void performCongruenceFinding(Instruction *, const Expression *);
-  void moveValueToNewCongruenceClass(Instruction *, CongruenceClass *,
-                                     CongruenceClass *);
-  bool setMemoryAccessEquivTo(MemoryAccess *From, CongruenceClass *To);
-  MemoryAccess *lookupMemoryAccessEquiv(MemoryAccess *) const;
+  void moveValueToNewCongruenceClass(Instruction *, const Expression *,
+                                     CongruenceClass *, CongruenceClass *);
+  void moveMemoryToNewCongruenceClass(Instruction *, MemoryAccess *,
+                                      CongruenceClass *, CongruenceClass *);
+  Value *getNextValueLeader(CongruenceClass *) const;
+  const MemoryAccess *getNextMemoryLeader(CongruenceClass *) const;
+  bool setMemoryClass(const MemoryAccess *From, CongruenceClass *To);
+  CongruenceClass *getMemoryClass(const MemoryAccess *MA) const;
+  const MemoryAccess *lookupMemoryLeader(const MemoryAccess *) const;
   bool isMemoryAccessTop(const MemoryAccess *) const;
 
+  // Ranking
+  unsigned int getRank(const Value *) const;
+  bool shouldSwapOperands(const Value *, const Value *) const;
+
   // Reachability handling.
   void updateReachableEdge(BasicBlock *, BasicBlock *);
   void processOutgoingEdges(TerminatorInst *, BasicBlock *);
-  bool isOnlyReachableViaThisEdge(const BasicBlockEdge &) const;
-  Value *findConditionEquivalence(Value *, BasicBlock *) const;
+  Value *findConditionEquivalence(Value *) const;
 
   // Elimination.
   struct ValueDFS;
-  void convertDenseToDFSOrdered(const CongruenceClass::MemberSet &,
-                                SmallVectorImpl<ValueDFS> &);
-  void convertDenseToLoadsAndStores(const CongruenceClass::MemberSet &,
-                                    SmallVectorImpl<ValueDFS> &);
+  void convertClassToDFSOrdered(const CongruenceClass &,
+                                SmallVectorImpl<ValueDFS> &,
+                                DenseMap<const Value *, unsigned int> &,
+                                SmallPtrSetImpl<Instruction *> &) const;
+  void convertClassToLoadsAndStores(const CongruenceClass &,
+                                    SmallVectorImpl<ValueDFS> &) const;
 
   bool eliminateInstructions(Function &);
   void replaceInstruction(Instruction *, Value *);
@@ -387,35 +533,58 @@ private:
 
   // Various instruction touch utilities
   void markUsersTouched(Value *);
-  void markMemoryUsersTouched(MemoryAccess *);
-  void markLeaderChangeTouched(CongruenceClass *CC);
+  void markMemoryUsersTouched(const MemoryAccess *);
+  void markMemoryDefTouched(const MemoryAccess *);
+  void markPredicateUsersTouched(Instruction *);
+  void markValueLeaderChangeTouched(CongruenceClass *CC);
+  void markMemoryLeaderChangeTouched(CongruenceClass *CC);
+  void addPredicateUsers(const PredicateBase *, Instruction *);
+  void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U);
+
+  // Main loop of value numbering
+  void iterateTouchedInstructions();
 
   // Utilities.
   void cleanupTables();
   std::pair<unsigned, unsigned> assignDFSNumbers(BasicBlock *, unsigned);
   void updateProcessedCount(Value *V);
   void verifyMemoryCongruency() const;
+  void verifyIterationSettled(Function &F);
   bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const;
-};
+  BasicBlock *getBlockForValue(Value *V) const;
+  void deleteExpression(const Expression *E);
+  unsigned InstrToDFSNum(const Value *V) const {
+    assert(isa<Instruction>(V) && "This should not be used for MemoryAccesses");
+    return InstrDFS.lookup(V);
+  }
 
-char NewGVN::ID = 0;
+  unsigned InstrToDFSNum(const MemoryAccess *MA) const {
+    return MemoryToDFSNum(MA);
+  }
+  Value *InstrFromDFSNum(unsigned DFSNum) { return DFSToInstr[DFSNum]; }
+  // Given a MemoryAccess, return the relevant instruction DFS number.  Note:
+  // This deliberately takes a value so it can be used with Use's, which will
+  // auto-convert to Value's but not to MemoryAccess's.
+  unsigned MemoryToDFSNum(const Value *MA) const {
+    assert(isa<MemoryAccess>(MA) &&
+           "This should not be used with instructions");
+    return isa<MemoryUseOrDef>(MA)
+               ? InstrToDFSNum(cast<MemoryUseOrDef>(MA)->getMemoryInst())
+               : InstrDFS.lookup(MA);
+  }
 
-// createGVNPass - The public interface to this file.
-FunctionPass *llvm::createNewGVNPass() { return new NewGVN(); }
+  template <class T, class Range> T *getMinDFSOfRange(const Range &) const;
+  // Debug counter info.  When verifying, we have to reset the value numbering
+  // debug counter to the same state it started in to get the same results.
+  std::pair<int, int> StartingVNCounter;
+};
+} // end anonymous namespace
 
 template <typename T>
 static bool equalsLoadStoreHelper(const T &LHS, const Expression &RHS) {
-  if ((!isa<LoadExpression>(RHS) && !isa<StoreExpression>(RHS)) ||
-      !LHS.BasicExpression::equals(RHS)) {
+  if (!isa<LoadExpression>(RHS) && !isa<StoreExpression>(RHS))
     return false;
-  } else if (const auto *L = dyn_cast<LoadExpression>(&RHS)) {
-    if (LHS.getDefiningAccess() != L->getDefiningAccess())
-      return false;
-  } else if (const auto *S = dyn_cast<StoreExpression>(&RHS)) {
-    if (LHS.getDefiningAccess() != S->getDefiningAccess())
-      return false;
-  }
-  return true;
+  return LHS.MemoryExpression::equals(RHS);
 }
 
 bool LoadExpression::equals(const Expression &Other) const {
@@ -423,13 +592,13 @@ bool LoadExpression::equals(const Expression &Other) const {
 }
 
 bool StoreExpression::equals(const Expression &Other) const {
-  bool Result = equalsLoadStoreHelper(*this, Other);
+  if (!equalsLoadStoreHelper(*this, Other))
+    return false;
   // Make sure that store vs store includes the value operand.
-  if (Result)
-    if (const auto *S = dyn_cast<StoreExpression>(&Other))
-      if (getStoredValue() != S->getStoredValue())
-        return false;
-  return Result;
+  if (const auto *S = dyn_cast<StoreExpression>(&Other))
+    if (getStoredValue() != S->getStoredValue())
+      return false;
+  return true;
 }
 
 #ifndef NDEBUG
@@ -438,14 +607,25 @@ static std::string getBlockName(const BasicBlock *B) {
 }
 #endif
 
-INITIALIZE_PASS_BEGIN(NewGVN, "newgvn", "Global Value Numbering", false, false)
-INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
-INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
-INITIALIZE_PASS_END(NewGVN, "newgvn", "Global Value Numbering", false, false)
+// Get the basic block from an instruction/memory value.
+BasicBlock *NewGVN::getBlockForValue(Value *V) const {
+  if (auto *I = dyn_cast<Instruction>(V))
+    return I->getParent();
+  else if (auto *MP = dyn_cast<MemoryPhi>(V))
+    return MP->getBlock();
+  llvm_unreachable("Should have been able to figure out a block for our value");
+  return nullptr;
+}
+
+// Delete a definitely dead expression, so it can be reused by the expression
+// allocator.  Some of these are not in creation functions, so we have to accept
+// const versions.
+void NewGVN::deleteExpression(const Expression *E) {
+  assert(isa<BasicExpression>(E));
+  auto *BE = cast<BasicExpression>(E);
+  const_cast<BasicExpression *>(BE)->deallocateOperands(ArgRecycler);
+  ExpressionAllocator.Deallocate(E);
+}
 
 PHIExpression *NewGVN::createPHIExpression(Instruction *I) {
   BasicBlock *PHIBlock = I->getParent();
@@ -459,7 +639,7 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I) {
 
   // Filter out unreachable phi operands.
   auto Filtered = make_filter_range(PN->operands(), [&](const Use &U) {
-    return ReachableBlocks.count(PN->getIncomingBlock(U));
+    return ReachableEdges.count({PN->getIncomingBlock(U), PHIBlock});
   });
 
   std::transform(Filtered.begin(), Filtered.end(), op_inserter(E),
@@ -467,16 +647,14 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I) {
                    // Don't try to transform self-defined phis.
                    if (U == PN)
                      return PN;
-                   const BasicBlockEdge BBE(PN->getIncomingBlock(U), PHIBlock);
-                   return lookupOperandLeader(U, I, BBE);
+                   return lookupOperandLeader(U);
                  });
   return E;
 }
 
 // Set basic expression info (Arguments, type, opcode) for Expression
 // E from Instruction I in block B.
-bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E,
-                                    const BasicBlock *B) {
+bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) {
   bool AllConstant = true;
   if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
     E->setType(GEP->getSourceElementType());
@@ -488,7 +666,7 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E,
   // Transform the operand array into an operand leader array, and keep track of
   // whether all members are constant.
   std::transform(I->op_begin(), I->op_end(), op_inserter(E), [&](Value *O) {
-    auto Operand = lookupOperandLeader(O, I, B);
+    auto Operand = lookupOperandLeader(O);
     AllConstant &= isa<Constant>(Operand);
     return Operand;
   });
@@ -497,8 +675,7 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E,
 }
 
 const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
-                                                 Value *Arg1, Value *Arg2,
-                                                 const BasicBlock *B) {
+                                                 Value *Arg1, Value *Arg2) {
   auto *E = new (ExpressionAllocator) BasicExpression(2);
 
   E->setType(T);
@@ -509,13 +686,13 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T,
     // of their operands get the same value number by sorting the operand value
     // numbers.  Since all commutative instructions have two operands it is more
     // efficient to sort by hand rather than using, say, std::sort.
-    if (Arg1 > Arg2)
+    if (shouldSwapOperands(Arg1, Arg2))
       std::swap(Arg1, Arg2);
   }
-  E->op_push_back(lookupOperandLeader(Arg1, nullptr, B));
-  E->op_push_back(lookupOperandLeader(Arg2, nullptr, B));
+  E->op_push_back(lookupOperandLeader(Arg1));
+  E->op_push_back(lookupOperandLeader(Arg2));
 
-  Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), *DL, TLI,
+  Value *V = SimplifyBinOp(Opcode, E->getOperand(0), E->getOperand(1), DL, TLI,
                            DT, AC);
   if (const Expression *SimplifiedE = checkSimplificationResults(E, nullptr, V))
     return SimplifiedE;
@@ -538,40 +715,32 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E,
     NumGVNOpsSimplified++;
     assert(isa<BasicExpression>(E) &&
            "We should always have had a basic expression here");
-
-    cast<BasicExpression>(E)->deallocateOperands(ArgRecycler);
-    ExpressionAllocator.Deallocate(E);
+    deleteExpression(E);
     return createConstantExpression(C);
   } else if (isa<Argument>(V) || isa<GlobalVariable>(V)) {
     if (I)
       DEBUG(dbgs() << "Simplified " << *I << " to "
                    << " variable " << *V << "\n");
-    cast<BasicExpression>(E)->deallocateOperands(ArgRecycler);
-    ExpressionAllocator.Deallocate(E);
+    deleteExpression(E);
     return createVariableExpression(V);
   }
 
   CongruenceClass *CC = ValueToClass.lookup(V);
-  if (CC && CC->DefiningExpr) {
+  if (CC && CC->getDefiningExpr()) {
     if (I)
       DEBUG(dbgs() << "Simplified " << *I << " to "
                    << " expression " << *V << "\n");
     NumGVNOpsSimplified++;
-    assert(isa<BasicExpression>(E) &&
-           "We should always have had a basic expression here");
-    cast<BasicExpression>(E)->deallocateOperands(ArgRecycler);
-    ExpressionAllocator.Deallocate(E);
-    return CC->DefiningExpr;
+    deleteExpression(E);
+    return CC->getDefiningExpr();
   }
   return nullptr;
 }
 
-const Expression *NewGVN::createExpression(Instruction *I,
-                                           const BasicBlock *B) {
-
+const Expression *NewGVN::createExpression(Instruction *I) {
   auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands());
 
-  bool AllConstant = setBasicExpressionInfo(I, E, B);
+  bool AllConstant = setBasicExpressionInfo(I, E);
 
   if (I->isCommutative()) {
     // Ensure that commutative instructions that only differ by a permutation
@@ -579,7 +748,7 @@ const Expression *NewGVN::createExpression(Instruction *I,
     // numbers.  Since all commutative instructions have two operands it is more
     // efficient to sort by hand rather than using, say, std::sort.
     assert(I->getNumOperands() == 2 && "Unsupported commutative instruction!");
-    if (E->getOperand(0) > E->getOperand(1))
+    if (shouldSwapOperands(E->getOperand(0), E->getOperand(1)))
       E->swapOperands(0, 1);
   }
 
@@ -595,48 +764,43 @@ const Expression *NewGVN::createExpression(Instruction *I,
     // Sort the operand value numbers so x<y and y>x get the same value
     // number.
     CmpInst::Predicate Predicate = CI->getPredicate();
-    if (E->getOperand(0) > E->getOperand(1)) {
+    if (shouldSwapOperands(E->getOperand(0), E->getOperand(1))) {
       E->swapOperands(0, 1);
       Predicate = CmpInst::getSwappedPredicate(Predicate);
     }
     E->setOpcode((CI->getOpcode() << 8) | Predicate);
     // TODO: 25% of our time is spent in SimplifyCmpInst with pointer operands
-    // TODO: Since we noop bitcasts, we may need to check types before
-    // simplifying, so that we don't end up simplifying based on a wrong
-    // type assumption. We should clean this up so we can use constants of the
-    // wrong type
-
     assert(I->getOperand(0)->getType() == I->getOperand(1)->getType() &&
            "Wrong types on cmp instruction");
-    if ((E->getOperand(0)->getType() == I->getOperand(0)->getType() &&
-         E->getOperand(1)->getType() == I->getOperand(1)->getType())) {
-      Value *V = SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1),
-                                 *DL, TLI, DT, AC);
-      if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
-        return SimplifiedE;
-    }
+    assert((E->getOperand(0)->getType() == I->getOperand(0)->getType() &&
+            E->getOperand(1)->getType() == I->getOperand(1)->getType()));
+    Value *V = SimplifyCmpInst(Predicate, E->getOperand(0), E->getOperand(1),
+                               DL, TLI, DT, AC);
+    if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
+      return SimplifiedE;
   } else if (isa<SelectInst>(I)) {
     if (isa<Constant>(E->getOperand(0)) ||
-        (E->getOperand(1)->getType() == I->getOperand(1)->getType() &&
-         E->getOperand(2)->getType() == I->getOperand(2)->getType())) {
+        E->getOperand(0) == E->getOperand(1)) {
+      assert(E->getOperand(1)->getType() == I->getOperand(1)->getType() &&
+             E->getOperand(2)->getType() == I->getOperand(2)->getType());
       Value *V = SimplifySelectInst(E->getOperand(0), E->getOperand(1),
-                                    E->getOperand(2), *DL, TLI, DT, AC);
+                                    E->getOperand(2), DL, TLI, DT, AC);
       if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
         return SimplifiedE;
     }
   } else if (I->isBinaryOp()) {
     Value *V = SimplifyBinOp(E->getOpcode(), E->getOperand(0), E->getOperand(1),
-                             *DL, TLI, DT, AC);
+                             DL, TLI, DT, AC);
     if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
       return SimplifiedE;
   } else if (auto *BI = dyn_cast<BitCastInst>(I)) {
-    Value *V = SimplifyInstruction(BI, *DL, TLI, DT, AC);
+    Value *V = SimplifyInstruction(BI, DL, TLI, DT, AC);
     if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
       return SimplifiedE;
   } else if (isa<GetElementPtrInst>(I)) {
     Value *V = SimplifyGEPInst(E->getType(),
                                ArrayRef<Value *>(E->op_begin(), E->op_end()),
-                               *DL, TLI, DT, AC);
+                               DL, TLI, DT, AC);
     if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
       return SimplifiedE;
   } else if (AllConstant) {
@@ -651,7 +815,7 @@ const Expression *NewGVN::createExpression(Instruction *I,
     for (Value *Arg : E->operands())
       C.emplace_back(cast<Constant>(Arg));
 
-    if (Value *V = ConstantFoldInstOperands(I, C, *DL, TLI))
+    if (Value *V = ConstantFoldInstOperands(I, C, DL, TLI))
       if (const Expression *SimplifiedE = checkSimplificationResults(E, I, V))
         return SimplifiedE;
   }
@@ -659,18 +823,18 @@ const Expression *NewGVN::createExpression(Instruction *I,
 }
 
 const AggregateValueExpression *
-NewGVN::createAggregateValueExpression(Instruction *I, const BasicBlock *B) {
+NewGVN::createAggregateValueExpression(Instruction *I) {
   if (auto *II = dyn_cast<InsertValueInst>(I)) {
     auto *E = new (ExpressionAllocator)
         AggregateValueExpression(I->getNumOperands(), II->getNumIndices());
-    setBasicExpressionInfo(I, E, B);
+    setBasicExpressionInfo(I, E);
     E->allocateIntOperands(ExpressionAllocator);
     std::copy(II->idx_begin(), II->idx_end(), int_op_inserter(E));
     return E;
   } else if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
     auto *E = new (ExpressionAllocator)
         AggregateValueExpression(I->getNumOperands(), EI->getNumIndices());
-    setBasicExpressionInfo(EI, E, B);
+    setBasicExpressionInfo(EI, E);
     E->allocateIntOperands(ExpressionAllocator);
     std::copy(EI->idx_begin(), EI->idx_end(), int_op_inserter(E));
     return E;
@@ -684,12 +848,10 @@ const VariableExpression *NewGVN::createVariableExpression(Value *V) {
   return E;
 }
 
-const Expression *NewGVN::createVariableOrConstant(Value *V,
-                                                   const BasicBlock *B) {
-  auto Leader = lookupOperandLeader(V, nullptr, B);
-  if (auto *C = dyn_cast<Constant>(Leader))
+const Expression *NewGVN::createVariableOrConstant(Value *V) {
+  if (auto *C = dyn_cast<Constant>(V))
     return createConstantExpression(C);
-  return createVariableExpression(Leader);
+  return createVariableExpression(V);
 }
 
 const ConstantExpression *NewGVN::createConstantExpression(Constant *C) {
@@ -705,54 +867,90 @@ const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) {
 }
 
 const CallExpression *NewGVN::createCallExpression(CallInst *CI,
-                                                   MemoryAccess *HV,
-                                                   const BasicBlock *B) {
+                                                   const MemoryAccess *MA) {
   // FIXME: Add operand bundles for calls.
   auto *E =
-      new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, HV);
-  setBasicExpressionInfo(CI, E, B);
+      new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA);
+  setBasicExpressionInfo(CI, E);
   return E;
 }
 
+// Return true if some equivalent of instruction Inst dominates instruction U.
+bool NewGVN::someEquivalentDominates(const Instruction *Inst,
+                                     const Instruction *U) const {
+  auto *CC = ValueToClass.lookup(Inst);
+  // This must be an instruction because we are only called from phi nodes
+  // in the case that the value it needs to check against is an instruction.
+
+  // The most likely candidates for dominance are the leader and the next
+  // leader. The leader or next leader will dominate in all cases where there
+  // is an equivalent that is higher up in the dom tree.
+  // We can't *only* check them, however, because the
+  // dominator tree could have an infinite number of non-dominating siblings
+  // with instructions that are in the right congruence class.
+  //       A
+  // B C D E F G
+  // |
+  // H
+  // Instruction U could be in H,  with equivalents in every other sibling.
+  // Depending on the rpo order picked, the leader could be the equivalent in
+  // any of these siblings.
+  if (!CC)
+    return false;
+  if (DT->dominates(cast<Instruction>(CC->getLeader()), U))
+    return true;
+  if (CC->getNextLeader().first &&
+      DT->dominates(cast<Instruction>(CC->getNextLeader().first), U))
+    return true;
+  return llvm::any_of(*CC, [&](const Value *Member) {
+    return Member != CC->getLeader() &&
+           DT->dominates(cast<Instruction>(Member), U);
+  });
+}
+
 // See if we have a congruence class and leader for this operand, and if so,
 // return it. Otherwise, return the operand itself.
-template <class T>
-Value *NewGVN::lookupOperandLeader(Value *V, const User *U, const T &B) const {
+Value *NewGVN::lookupOperandLeader(Value *V) const {
   CongruenceClass *CC = ValueToClass.lookup(V);
-  if (CC && (CC != InitialClass))
-    return CC->RepStoredValue ? CC->RepStoredValue : CC->RepLeader;
+  if (CC) {
+    // Everything in TOP is represented by undef, as it can be any value.
+    // We do have to make sure we get the type right though, so we can't set the
+    // RepLeader to undef.
+    if (CC == TOPClass)
+      return UndefValue::get(V->getType());
+    return CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
+  }
+
   return V;
 }
 
-MemoryAccess *NewGVN::lookupMemoryAccessEquiv(MemoryAccess *MA) const {
-  auto *CC = MemoryAccessToClass.lookup(MA);
-  if (CC && CC->RepMemoryAccess)
-    return CC->RepMemoryAccess;
-  // FIXME: We need to audit all the places that current set a nullptr To, and
-  // fix them.  There should always be *some* congruence class, even if it is
-  // singular.  Right now, we don't bother setting congruence classes for
-  // anything but stores, which means we have to return the original access
-  // here.  Otherwise, this should be unreachable.
-  return MA;
+const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const {
+  auto *CC = getMemoryClass(MA);
+  assert(CC->getMemoryLeader() &&
+         "Every MemoryAccess should be mapped to a "
+         "congruence class with a represenative memory "
+         "access");
+  return CC->getMemoryLeader();
 }
 
 // Return true if the MemoryAccess is really equivalent to everything. This is
 // equivalent to the lattice value "TOP" in most lattices.  This is the initial
-// state of all memory accesses.
+// state of all MemoryAccesses.
 bool NewGVN::isMemoryAccessTop(const MemoryAccess *MA) const {
-  return MemoryAccessToClass.lookup(MA) == InitialClass;
+  return getMemoryClass(MA) == TOPClass;
 }
 
 LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
-                                             LoadInst *LI, MemoryAccess *DA,
-                                             const BasicBlock *B) {
-  auto *E = new (ExpressionAllocator) LoadExpression(1, LI, DA);
+                                             LoadInst *LI,
+                                             const MemoryAccess *MA) {
+  auto *E =
+      new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA));
   E->allocateOperands(ArgRecycler, ExpressionAllocator);
   E->setType(LoadType);
 
   // Give store and loads same opcode so they value number together.
   E->setOpcode(0);
-  E->op_push_back(lookupOperandLeader(PointerOp, LI, B));
+  E->op_push_back(PointerOp);
   if (LI)
     E->setAlignment(LI->getAlignment());
 
@@ -763,17 +961,16 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
 }
 
 const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
-                                                     MemoryAccess *DA,
-                                                     const BasicBlock *B) {
-  auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand(), SI, B);
+                                                     const MemoryAccess *MA) {
+  auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand());
   auto *E = new (ExpressionAllocator)
-      StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, DA);
+      StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA);
   E->allocateOperands(ArgRecycler, ExpressionAllocator);
   E->setType(SI->getValueOperand()->getType());
 
   // Give store and loads same opcode so they value number together.
   E->setOpcode(0);
-  E->op_push_back(lookupOperandLeader(SI->getPointerOperand(), SI, B));
+  E->op_push_back(lookupOperandLeader(SI->getPointerOperand()));
 
   // TODO: Value number heap versions. We may be able to discover
   // things alias analysis can't on it's own (IE that a store and a
@@ -781,24 +978,20 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI,
   return E;
 }
 
-// Utility function to check whether the congruence class has a member other
-// than the given instruction.
-bool hasMemberOtherThanUs(const CongruenceClass *CC, Instruction *I) {
-  // Either it has more than one store, in which case it must contain something
-  // other than us (because it's indexed by value), or if it only has one store
-  // right now, that member should not be us.
-  return CC->StoreCount > 1 || CC->Members.count(I) == 0;
-}
-
-const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I,
-                                                         const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) {
   // Unlike loads, we never try to eliminate stores, so we do not check if they
   // are simple and avoid value numbering them.
   auto *SI = cast<StoreInst>(I);
-  MemoryAccess *StoreAccess = MSSA->getMemoryAccess(SI);
+  auto *StoreAccess = MSSA->getMemoryAccess(SI);
   // Get the expression, if any, for the RHS of the MemoryDef.
-  MemoryAccess *StoreRHS = lookupMemoryAccessEquiv(
-      cast<MemoryDef>(StoreAccess)->getDefiningAccess());
+  const MemoryAccess *StoreRHS = StoreAccess->getDefiningAccess();
+  if (EnableStoreRefinement)
+    StoreRHS = MSSAWalker->getClobberingMemoryAccess(StoreAccess);
+  // If we bypassed the use-def chains, make sure we add a use.
+  if (StoreRHS != StoreAccess->getDefiningAccess())
+    addMemoryUsers(StoreRHS, StoreAccess);
+
+  StoreRHS = lookupMemoryLeader(StoreRHS);
   // If we are defined by ourselves, use the live on entry def.
   if (StoreRHS == StoreAccess)
     StoreRHS = MSSA->getLiveOnEntryDef();
@@ -807,33 +1000,118 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I,
     // See if we are defined by a previous store expression, it already has a
     // value, and it's the same value as our current store. FIXME: Right now, we
     // only do this for simple stores, we should expand to cover memcpys, etc.
-    const Expression *OldStore = createStoreExpression(SI, StoreRHS, B);
-    CongruenceClass *CC = ExpressionToClass.lookup(OldStore);
+    const auto *LastStore = createStoreExpression(SI, StoreRHS);
+    const auto *LastCC = ExpressionToClass.lookup(LastStore);
     // Basically, check if the congruence class the store is in is defined by a
     // store that isn't us, and has the same value.  MemorySSA takes care of
     // ensuring the store has the same memory state as us already.
     // The RepStoredValue gets nulled if all the stores disappear in a class, so
     // we don't need to check if the class contains a store besides us.
-    if (CC &&
-        CC->RepStoredValue == lookupOperandLeader(SI->getValueOperand(), SI, B))
-      return createStoreExpression(SI, StoreRHS, B);
+    if (LastCC &&
+        LastCC->getStoredValue() == lookupOperandLeader(SI->getValueOperand()))
+      return LastStore;
+    deleteExpression(LastStore);
     // Also check if our value operand is defined by a load of the same memory
-    // location, and the memory state is the same as it was then
-    // (otherwise, it could have been overwritten later. See test32 in
-    // transforms/DeadStoreElimination/simple.ll)
-    if (LoadInst *LI = dyn_cast<LoadInst>(SI->getValueOperand())) {
-      if ((lookupOperandLeader(LI->getPointerOperand(), LI, LI->getParent()) ==
-           lookupOperandLeader(SI->getPointerOperand(), SI, B)) &&
-          (lookupMemoryAccessEquiv(
-               MSSA->getMemoryAccess(LI)->getDefiningAccess()) == StoreRHS))
+    // location, and the memory state is the same as it was then (otherwise, it
+    // could have been overwritten later. See test32 in
+    // transforms/DeadStoreElimination/simple.ll).
+    if (auto *LI =
+            dyn_cast<LoadInst>(lookupOperandLeader(SI->getValueOperand()))) {
+      if ((lookupOperandLeader(LI->getPointerOperand()) ==
+           lookupOperandLeader(SI->getPointerOperand())) &&
+          (lookupMemoryLeader(MSSA->getMemoryAccess(LI)->getDefiningAccess()) ==
+           StoreRHS))
         return createVariableExpression(LI);
     }
   }
-  return createStoreExpression(SI, StoreAccess, B);
+
+  // If the store is not equivalent to anything, value number it as a store that
+  // produces a unique memory state (instead of using its MemoryUse, we use
+  // its MemoryDef).
+  return createStoreExpression(SI, StoreAccess);
+}
+
+// See if we can extract the value of a loaded pointer from a load, a store, or
+// a memory instruction.
+const Expression *
+NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
+                                    LoadInst *LI, Instruction *DepInst,
+                                    MemoryAccess *DefiningAccess) {
+  assert((!LI || LI->isSimple()) && "Not a simple load");
+  if (auto *DepSI = dyn_cast<StoreInst>(DepInst)) {
+    // Can't forward from non-atomic to atomic without violating memory model.
+    // Also don't need to coerce if they are the same type, we will just
+    // propagate.
+    if (LI->isAtomic() > DepSI->isAtomic() ||
+        LoadType == DepSI->getValueOperand()->getType())
+      return nullptr;
+    int Offset = analyzeLoadFromClobberingStore(LoadType, LoadPtr, DepSI, DL);
+    if (Offset >= 0) {
+      if (auto *C = dyn_cast<Constant>(
+              lookupOperandLeader(DepSI->getValueOperand()))) {
+        DEBUG(dbgs() << "Coercing load from store " << *DepSI << " to constant "
+                     << *C << "\n");
+        return createConstantExpression(
+            getConstantStoreValueForLoad(C, Offset, LoadType, DL));
+      }
+    }
+
+  } else if (LoadInst *DepLI = dyn_cast<LoadInst>(DepInst)) {
+    // Can't forward from non-atomic to atomic without violating memory model.
+    if (LI->isAtomic() > DepLI->isAtomic())
+      return nullptr;
+    int Offset = analyzeLoadFromClobberingLoad(LoadType, LoadPtr, DepLI, DL);
+    if (Offset >= 0) {
+      // We can coerce a constant load into a load
+      if (auto *C = dyn_cast<Constant>(lookupOperandLeader(DepLI)))
+        if (auto *PossibleConstant =
+                getConstantLoadValueForLoad(C, Offset, LoadType, DL)) {
+          DEBUG(dbgs() << "Coercing load from load " << *LI << " to constant "
+                       << *PossibleConstant << "\n");
+          return createConstantExpression(PossibleConstant);
+        }
+    }
+
+  } else if (MemIntrinsic *DepMI = dyn_cast<MemIntrinsic>(DepInst)) {
+    int Offset = analyzeLoadFromClobberingMemInst(LoadType, LoadPtr, DepMI, DL);
+    if (Offset >= 0) {
+      if (auto *PossibleConstant =
+              getConstantMemInstValueForLoad(DepMI, Offset, LoadType, DL)) {
+        DEBUG(dbgs() << "Coercing load from meminst " << *DepMI
+                     << " to constant " << *PossibleConstant << "\n");
+        return createConstantExpression(PossibleConstant);
+      }
+    }
+  }
+
+  // All of the below are only true if the loaded pointer is produced
+  // by the dependent instruction.
+  if (LoadPtr != lookupOperandLeader(DepInst) &&
+      !AA->isMustAlias(LoadPtr, DepInst))
+    return nullptr;
+  // If this load really doesn't depend on anything, then we must be loading an
+  // undef value.  This can happen when loading for a fresh allocation with no
+  // intervening stores, for example.  Note that this is only true in the case
+  // that the result of the allocation is pointer equal to the load ptr.
+  if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI)) {
+    return createConstantExpression(UndefValue::get(LoadType));
+  }
+  // If this load occurs right after a lifetime begin,
+  // then the loaded value is undefined.
+  else if (auto *II = dyn_cast<IntrinsicInst>(DepInst)) {
+    if (II->getIntrinsicID() == Intrinsic::lifetime_start)
+      return createConstantExpression(UndefValue::get(LoadType));
+  }
+  // If this load follows a calloc (which zero initializes memory),
+  // then the loaded value is zero
+  else if (isCallocLikeFn(DepInst, TLI)) {
+    return createConstantExpression(Constant::getNullValue(LoadType));
+  }
+
+  return nullptr;
 }
 
-const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I,
-                                                        const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) {
   auto *LI = cast<LoadInst>(I);
 
   // We can eliminate in favor of non-simple loads, but we won't be able to
@@ -841,7 +1119,7 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I,
   if (!LI->isSimple())
     return nullptr;
 
-  Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand(), I, B);
+  Value *LoadAddressLeader = lookupOperandLeader(LI->getPointerOperand());
   // Load of undef is undef.
   if (isa<UndefValue>(LoadAddressLeader))
     return createConstantExpression(UndefValue::get(LI->getType()));
@@ -854,66 +1132,189 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I,
       // If the defining instruction is not reachable, replace with undef.
       if (!ReachableBlocks.count(DefiningInst->getParent()))
         return createConstantExpression(UndefValue::get(LI->getType()));
+      // This will handle stores and memory insts.  We only do this if the
+      // defining access has a different type, or it is a pointer produced by
+      // certain memory operations that cause the memory to have a fixed value
+      // (IE things like calloc).
+      if (const auto *CoercionResult =
+              performSymbolicLoadCoercion(LI->getType(), LoadAddressLeader, LI,
+                                          DefiningInst, DefiningAccess))
+        return CoercionResult;
     }
   }
 
-  const Expression *E =
-      createLoadExpression(LI->getType(), LI->getPointerOperand(), LI,
-                           lookupMemoryAccessEquiv(DefiningAccess), B);
+  const Expression *E = createLoadExpression(LI->getType(), LoadAddressLeader,
+                                             LI, DefiningAccess);
   return E;
 }
 
+const Expression *
+NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) {
+  auto *PI = PredInfo->getPredicateInfoFor(I);
+  if (!PI)
+    return nullptr;
+
+  DEBUG(dbgs() << "Found predicate info from instruction !\n");
+
+  auto *PWC = dyn_cast<PredicateWithCondition>(PI);
+  if (!PWC)
+    return nullptr;
+
+  auto *CopyOf = I->getOperand(0);
+  auto *Cond = PWC->Condition;
+
+  // If this is a copy of the condition, it must be either true or false
+  // depending on the predicate info type and edge.
+  if (CopyOf == Cond) {
+    // We should not need to add predicate users because the predicate info is
+    // already a use of this operand.
+    if (isa<PredicateAssume>(PI))
+      return createConstantExpression(ConstantInt::getTrue(Cond->getType()));
+    if (auto *PBranch = dyn_cast<PredicateBranch>(PI)) {
+      if (PBranch->TrueEdge)
+        return createConstantExpression(ConstantInt::getTrue(Cond->getType()));
+      return createConstantExpression(ConstantInt::getFalse(Cond->getType()));
+    }
+    if (auto *PSwitch = dyn_cast<PredicateSwitch>(PI))
+      return createConstantExpression(cast<Constant>(PSwitch->CaseValue));
+  }
+
+  // Not a copy of the condition, so see what the predicates tell us about this
+  // value.  First, though, we check to make sure the value is actually a copy
+  // of one of the condition operands. It's possible, in certain cases, for it
+  // to be a copy of a predicateinfo copy. In particular, if two branch
+  // operations use the same condition, and one branch dominates the other, we
+  // will end up with a copy of a copy.  This is currently a small deficiency in
+  // predicateinfo.  What will end up happening here is that we will value
+  // number both copies the same anyway.
+
+  // Everything below relies on the condition being a comparison.
+  auto *Cmp = dyn_cast<CmpInst>(Cond);
+  if (!Cmp)
+    return nullptr;
+
+  if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) {
+    DEBUG(dbgs() << "Copy is not of any condition operands!");
+    return nullptr;
+  }
+  Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0));
+  Value *SecondOp = lookupOperandLeader(Cmp->getOperand(1));
+  bool SwappedOps = false;
+  // Sort the ops
+  if (shouldSwapOperands(FirstOp, SecondOp)) {
+    std::swap(FirstOp, SecondOp);
+    SwappedOps = true;
+  }
+  CmpInst::Predicate Predicate =
+      SwappedOps ? Cmp->getSwappedPredicate() : Cmp->getPredicate();
+
+  if (isa<PredicateAssume>(PI)) {
+    // If the comparison is true when the operands are equal, then we know the
+    // operands are equal, because assumes must always be true.
+    if (CmpInst::isTrueWhenEqual(Predicate)) {
+      addPredicateUsers(PI, I);
+      return createVariableOrConstant(FirstOp);
+    }
+  }
+  if (const auto *PBranch = dyn_cast<PredicateBranch>(PI)) {
+    // If we are *not* a copy of the comparison, we may be equal to the other
+    // operand when the predicate implies something about equality of
+    // operations.  In particular, if the comparison is true/false when the
+    // operands are equal, and we are on the right edge, we know this operation
+    // is equal to something.
+    if ((PBranch->TrueEdge && Predicate == CmpInst::ICMP_EQ) ||
+        (!PBranch->TrueEdge && Predicate == CmpInst::ICMP_NE)) {
+      addPredicateUsers(PI, I);
+      return createVariableOrConstant(FirstOp);
+    }
+    // Handle the special case of floating point.
+    if (((PBranch->TrueEdge && Predicate == CmpInst::FCMP_OEQ) ||
+         (!PBranch->TrueEdge && Predicate == CmpInst::FCMP_UNE)) &&
+        isa<ConstantFP>(FirstOp) && !cast<ConstantFP>(FirstOp)->isZero()) {
+      addPredicateUsers(PI, I);
+      return createConstantExpression(cast<Constant>(FirstOp));
+    }
+  }
+  return nullptr;
+}
+
 // Evaluate read only and pure calls, and create an expression result.
-const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I,
-                                                        const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) {
   auto *CI = cast<CallInst>(I);
-  if (AA->doesNotAccessMemory(CI))
-    return createCallExpression(CI, nullptr, B);
-  if (AA->onlyReadsMemory(CI)) {
+  if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+    // Intrinsics with the returned attribute are copies of arguments.
+    if (auto *ReturnedValue = II->getReturnedArgOperand()) {
+      if (II->getIntrinsicID() == Intrinsic::ssa_copy)
+        if (const auto *Result = performSymbolicPredicateInfoEvaluation(I))
+          return Result;
+      return createVariableOrConstant(ReturnedValue);
+    }
+  }
+  if (AA->doesNotAccessMemory(CI)) {
+    return createCallExpression(CI, TOPClass->getMemoryLeader());
+  } else if (AA->onlyReadsMemory(CI)) {
     MemoryAccess *DefiningAccess = MSSAWalker->getClobberingMemoryAccess(CI);
-    return createCallExpression(CI, lookupMemoryAccessEquiv(DefiningAccess), B);
+    return createCallExpression(CI, DefiningAccess);
   }
   return nullptr;
 }
 
-// Update the memory access equivalence table to say that From is equal to To,
+// Retrieve the memory class for a given MemoryAccess.
+CongruenceClass *NewGVN::getMemoryClass(const MemoryAccess *MA) const {
+
+  auto *Result = MemoryAccessToClass.lookup(MA);
+  assert(Result && "Should have found memory class");
+  return Result;
+}
+
+// Update the MemoryAccess equivalence table to say that From is equal to To,
 // and return true if this is different from what already existed in the table.
-// FIXME: We need to audit all the places that current set a nullptr To, and fix
-// them. There should always be *some* congruence class, even if it is singular.
-bool NewGVN::setMemoryAccessEquivTo(MemoryAccess *From, CongruenceClass *To) {
+bool NewGVN::setMemoryClass(const MemoryAccess *From,
+                            CongruenceClass *NewClass) {
+  assert(NewClass &&
+         "Every MemoryAccess should be getting mapped to a non-null class");
   DEBUG(dbgs() << "Setting " << *From);
-  if (To) {
-    DEBUG(dbgs() << " equivalent to congruence class ");
-    DEBUG(dbgs() << To->ID << " with current memory access leader ");
-    DEBUG(dbgs() << *To->RepMemoryAccess);
-  } else {
-    DEBUG(dbgs() << " equivalent to itself");
-  }
+  DEBUG(dbgs() << " equivalent to congruence class ");
+  DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader ");
+  DEBUG(dbgs() << *NewClass->getMemoryLeader());
   DEBUG(dbgs() << "\n");
 
   auto LookupResult = MemoryAccessToClass.find(From);
   bool Changed = false;
   // If it's already in the table, see if the value changed.
   if (LookupResult != MemoryAccessToClass.end()) {
-    if (To && LookupResult->second != To) {
+    auto *OldClass = LookupResult->second;
+    if (OldClass != NewClass) {
+      // If this is a phi, we have to handle memory member updates.
+      if (auto *MP = dyn_cast<MemoryPhi>(From)) {
+        OldClass->memory_erase(MP);
+        NewClass->memory_insert(MP);
+        // This may have killed the class if it had no non-memory members
+        if (OldClass->getMemoryLeader() == From) {
+          if (OldClass->memory_empty()) {
+            OldClass->setMemoryLeader(nullptr);
+          } else {
+            OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
+            DEBUG(dbgs() << "Memory class leader change for class "
+                         << OldClass->getID() << " to "
+                         << *OldClass->getMemoryLeader()
+                         << " due to removal of a memory member " << *From
+                         << "\n");
+            markMemoryLeaderChangeTouched(OldClass);
+          }
+        }
+      }
       // It wasn't equivalent before, and now it is.
-      LookupResult->second = To;
-      Changed = true;
-    } else if (!To) {
-      // It used to be equivalent to something, and now it's not.
-      MemoryAccessToClass.erase(LookupResult);
+      LookupResult->second = NewClass;
       Changed = true;
     }
-  } else {
-    assert(!To &&
-           "Memory equivalence should never change from nothing to something");
   }
 
   return Changed;
 }
+
 // Evaluate PHI nodes symbolically, and create an expression result.
-const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I,
-                                                       const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) {
   auto *E = cast<PHIExpression>(createPHIExpression(I));
   // We match the semantics of SimplifyPhiNode from InstructionSimplify here.
 
@@ -933,8 +1334,7 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I,
   if (Filtered.begin() == Filtered.end()) {
     DEBUG(dbgs() << "Simplified PHI node " << *I << " to undef"
                  << "\n");
-    E->deallocateOperands(ArgRecycler);
-    ExpressionAllocator.Deallocate(E);
+    deleteExpression(E);
     return createConstantExpression(UndefValue::get(I->getType()));
   }
   Value *AllSameValue = *(Filtered.begin());
@@ -955,25 +1355,20 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I,
     if (HasUndef) {
       // Only have to check for instructions
       if (auto *AllSameInst = dyn_cast<Instruction>(AllSameValue))
-        if (!DT->dominates(AllSameInst, I))
+        if (!someEquivalentDominates(AllSameInst, I))
           return E;
     }
 
     NumGVNPhisAllSame++;
     DEBUG(dbgs() << "Simplified PHI node " << *I << " to " << *AllSameValue
                  << "\n");
-    E->deallocateOperands(ArgRecycler);
-    ExpressionAllocator.Deallocate(E);
-    if (auto *C = dyn_cast<Constant>(AllSameValue))
-      return createConstantExpression(C);
-    return createVariableExpression(AllSameValue);
+    deleteExpression(E);
+    return createVariableOrConstant(AllSameValue);
   }
   return E;
 }
 
-const Expression *
-NewGVN::performSymbolicAggrValueEvaluation(Instruction *I,
-                                           const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) {
   if (auto *EI = dyn_cast<ExtractValueInst>(I)) {
     auto *II = dyn_cast<IntrinsicInst>(EI->getAggregateOperand());
     if (II && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) {
@@ -1003,19 +1398,130 @@ NewGVN::performSymbolicAggrValueEvaluation(Instruction *I,
         // expression.
         assert(II->getNumArgOperands() == 2 &&
                "Expect two args for recognised intrinsics.");
-        return createBinaryExpression(Opcode, EI->getType(),
-                                      II->getArgOperand(0),
-                                      II->getArgOperand(1), B);
+        return createBinaryExpression(
+            Opcode, EI->getType(), II->getArgOperand(0), II->getArgOperand(1));
       }
     }
   }
 
-  return createAggregateValueExpression(I, B);
+  return createAggregateValueExpression(I);
+}
+const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) {
+  auto *CI = dyn_cast<CmpInst>(I);
+  // See if our operands are equal to those of a previous predicate, and if so,
+  // if it implies true or false.
+  auto Op0 = lookupOperandLeader(CI->getOperand(0));
+  auto Op1 = lookupOperandLeader(CI->getOperand(1));
+  auto OurPredicate = CI->getPredicate();
+  if (shouldSwapOperands(Op0, Op1)) {
+    std::swap(Op0, Op1);
+    OurPredicate = CI->getSwappedPredicate();
+  }
+
+  // Avoid processing the same info twice
+  const PredicateBase *LastPredInfo = nullptr;
+  // See if we know something about the comparison itself, like it is the target
+  // of an assume.
+  auto *CmpPI = PredInfo->getPredicateInfoFor(I);
+  if (dyn_cast_or_null<PredicateAssume>(CmpPI))
+    return createConstantExpression(ConstantInt::getTrue(CI->getType()));
+
+  if (Op0 == Op1) {
+    // This condition does not depend on predicates, no need to add users
+    if (CI->isTrueWhenEqual())
+      return createConstantExpression(ConstantInt::getTrue(CI->getType()));
+    else if (CI->isFalseWhenEqual())
+      return createConstantExpression(ConstantInt::getFalse(CI->getType()));
+  }
+
+  // NOTE: Because we are comparing both operands here and below, and using
+  // previous comparisons, we rely on fact that predicateinfo knows to mark
+  // comparisons that use renamed operands as users of the earlier comparisons.
+  // It is *not* enough to just mark predicateinfo renamed operands as users of
+  // the earlier comparisons, because the *other* operand may have changed in a
+  // previous iteration.
+  // Example:
+  // icmp slt %a, %b
+  // %b.0 = ssa.copy(%b)
+  // false branch:
+  // icmp slt %c, %b.0
+
+  // %c and %a may start out equal, and thus, the code below will say the second
+  // icmp is false.  %c may become equal to something else, and in that case the
+  // second icmp *must* be reexamined, but would not if only the renamed
+  // operands are considered users of the icmp.
+
+  // *Currently* we only check one level of comparisons back, and only mark one
+  // level back as touched when changes happen.  If you modify this code to look
+  // back farther through comparisons, you *must* mark the appropriate
+  // comparisons as users in PredicateInfo.cpp, or you will cause bugs.  See if
+  // we know something just from the operands themselves
+
+  // See if our operands have predicate info, so that we may be able to derive
+  // something from a previous comparison.
+  for (const auto &Op : CI->operands()) {
+    auto *PI = PredInfo->getPredicateInfoFor(Op);
+    if (const auto *PBranch = dyn_cast_or_null<PredicateBranch>(PI)) {
+      if (PI == LastPredInfo)
+        continue;
+      LastPredInfo = PI;
+
+      // TODO: Along the false edge, we may know more things too, like icmp of
+      // same operands is false.
+      // TODO: We only handle actual comparison conditions below, not and/or.
+      auto *BranchCond = dyn_cast<CmpInst>(PBranch->Condition);
+      if (!BranchCond)
+        continue;
+      auto *BranchOp0 = lookupOperandLeader(BranchCond->getOperand(0));
+      auto *BranchOp1 = lookupOperandLeader(BranchCond->getOperand(1));
+      auto BranchPredicate = BranchCond->getPredicate();
+      if (shouldSwapOperands(BranchOp0, BranchOp1)) {
+        std::swap(BranchOp0, BranchOp1);
+        BranchPredicate = BranchCond->getSwappedPredicate();
+      }
+      if (BranchOp0 == Op0 && BranchOp1 == Op1) {
+        if (PBranch->TrueEdge) {
+          // If we know the previous predicate is true and we are in the true
+          // edge then we may be implied true or false.
+          if (CmpInst::isImpliedTrueByMatchingCmp(OurPredicate,
+                                                  BranchPredicate)) {
+            addPredicateUsers(PI, I);
+            return createConstantExpression(
+                ConstantInt::getTrue(CI->getType()));
+          }
+
+          if (CmpInst::isImpliedFalseByMatchingCmp(OurPredicate,
+                                                   BranchPredicate)) {
+            addPredicateUsers(PI, I);
+            return createConstantExpression(
+                ConstantInt::getFalse(CI->getType()));
+          }
+
+        } else {
+          // Just handle the ne and eq cases, where if we have the same
+          // operands, we may know something.
+          if (BranchPredicate == OurPredicate) {
+            addPredicateUsers(PI, I);
+            // Same predicate, same ops, we know it was false, so this is false.
+            return createConstantExpression(
+                ConstantInt::getFalse(CI->getType()));
+          } else if (BranchPredicate ==
+                     CmpInst::getInversePredicate(OurPredicate)) {
+            addPredicateUsers(PI, I);
+            // Inverse predicate, we know the other was false, so this is true.
+            return createConstantExpression(
+                ConstantInt::getTrue(CI->getType()));
+          }
+        }
+      }
+    }
+  }
+  // Create expression will take care of simplifyCmpInst
+  return createExpression(I);
 }
 
 // Substitute and symbolize the value before value numbering.
-const Expression *NewGVN::performSymbolicEvaluation(Value *V,
-                                                    const BasicBlock *B) {
+const Expression *NewGVN::performSymbolicEvaluation(Value *V) {
   const Expression *E = nullptr;
   if (auto *C = dyn_cast<Constant>(V))
     E = createConstantExpression(C);
@@ -1029,24 +1535,27 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
     switch (I->getOpcode()) {
     case Instruction::ExtractValue:
     case Instruction::InsertValue:
-      E = performSymbolicAggrValueEvaluation(I, B);
+      E = performSymbolicAggrValueEvaluation(I);
       break;
     case Instruction::PHI:
-      E = performSymbolicPHIEvaluation(I, B);
+      E = performSymbolicPHIEvaluation(I);
       break;
     case Instruction::Call:
-      E = performSymbolicCallEvaluation(I, B);
+      E = performSymbolicCallEvaluation(I);
       break;
     case Instruction::Store:
-      E = performSymbolicStoreEvaluation(I, B);
+      E = performSymbolicStoreEvaluation(I);
       break;
     case Instruction::Load:
-      E = performSymbolicLoadEvaluation(I, B);
+      E = performSymbolicLoadEvaluation(I);
       break;
     case Instruction::BitCast: {
-      E = createExpression(I, B);
+      E = createExpression(I);
+    } break;
+    case Instruction::ICmp:
+    case Instruction::FCmp: {
+      E = performSymbolicCmpEvaluation(I);
     } break;
-
     case Instruction::Add:
     case Instruction::FAdd:
     case Instruction::Sub:
@@ -1065,8 +1574,6 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
     case Instruction::And:
     case Instruction::Or:
     case Instruction::Xor:
-    case Instruction::ICmp:
-    case Instruction::FCmp:
     case Instruction::Trunc:
     case Instruction::ZExt:
     case Instruction::SExt:
@@ -1083,7 +1590,7 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
     case Instruction::InsertElement:
     case Instruction::ShuffleVector:
     case Instruction::GetElementPtr:
-      E = createExpression(I, B);
+      E = createExpression(I);
       break;
     default:
       return nullptr;
@@ -1092,169 +1599,297 @@ const Expression *NewGVN::performSymbolicEvaluation(Value *V,
   return E;
 }
 
-// There is an edge from 'Src' to 'Dst'.  Return true if every path from
-// the entry block to 'Dst' passes via this edge.  In particular 'Dst'
-// must not be reachable via another edge from 'Src'.
-bool NewGVN::isOnlyReachableViaThisEdge(const BasicBlockEdge &E) const {
-
-  // While in theory it is interesting to consider the case in which Dst has
-  // more than one predecessor, because Dst might be part of a loop which is
-  // only reachable from Src, in practice it is pointless since at the time
-  // GVN runs all such loops have preheaders, which means that Dst will have
-  // been changed to have only one predecessor, namely Src.
-  const BasicBlock *Pred = E.getEnd()->getSinglePredecessor();
-  const BasicBlock *Src = E.getStart();
-  assert((!Pred || Pred == Src) && "No edge between these basic blocks!");
-  (void)Src;
-  return Pred != nullptr;
-}
-
 void NewGVN::markUsersTouched(Value *V) {
   // Now mark the users as touched.
   for (auto *User : V->users()) {
     assert(isa<Instruction>(User) && "Use of value not within an instruction?");
-    TouchedInstructions.set(InstrDFS.lookup(User));
+    TouchedInstructions.set(InstrToDFSNum(User));
   }
 }
 
-void NewGVN::markMemoryUsersTouched(MemoryAccess *MA) {
-  for (auto U : MA->users()) {
-    if (auto *MUD = dyn_cast<MemoryUseOrDef>(U))
-      TouchedInstructions.set(InstrDFS.lookup(MUD->getMemoryInst()));
-    else
-      TouchedInstructions.set(InstrDFS.lookup(U));
+void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) {
+  DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n");
+  MemoryToUsers[To].insert(U);
+}
+
+void NewGVN::markMemoryDefTouched(const MemoryAccess *MA) {
+  TouchedInstructions.set(MemoryToDFSNum(MA));
+}
+
+void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) {
+  if (isa<MemoryUse>(MA))
+    return;
+  for (auto DirectUser : MA->users())
+    TouchedInstructions.set(MemoryToDFSNum(DirectUser));
+  const auto Recorded = MemoryToUsers.find(MA);
+  if (Recorded == MemoryToUsers.end())
+    return;
+  for (auto *ExtraUser : Recorded->second)
+    TouchedInstructions.set(MemoryToDFSNum(ExtraUser));
+  MemoryToUsers.erase(Recorded);
+}
+
+// Add I to the users of the condition behind a branch or assume predicate.
+void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) {
+  if (auto *PBranch = dyn_cast<PredicateBranch>(PB))
+    PredicateToUsers[PBranch->Condition].insert(I);
+  else if (auto *PAssume = dyn_cast<PredicateAssume>(PB))
+    PredicateToUsers[PAssume->Condition].insert(I);
+}
+
+// Touch all the predicates that depend on this instruction.
+void NewGVN::markPredicateUsersTouched(Instruction *I) {
+  const auto Result = PredicateToUsers.find(I);
+  if (Result != PredicateToUsers.end()) {
+    for (auto *User : Result->second)
+      TouchedInstructions.set(InstrToDFSNum(User));
+    PredicateToUsers.erase(Result);
   }
 }
 
+// Touch every memory member of CC so it is revisited after a leader change.
+void NewGVN::markMemoryLeaderChangeTouched(CongruenceClass *CC) {
+  for (auto M : CC->memory())
+    markMemoryDefTouched(M);
+}
+
 // Touch the instructions that need to be updated after a congruence class has a
 // leader change, and mark changed values.
-void NewGVN::markLeaderChangeTouched(CongruenceClass *CC) {
-  for (auto M : CC->Members) {
+void NewGVN::markValueLeaderChangeTouched(CongruenceClass *CC) {
+  for (auto M : *CC) {
     if (auto *I = dyn_cast<Instruction>(M))
-      TouchedInstructions.set(InstrDFS.lookup(I));
+      TouchedInstructions.set(InstrToDFSNum(I));
     LeaderChanges.insert(M);
   }
 }
 
+// Given a range of things that have instruction DFS numbers, this will return
+// the member of the range with the smallest DFS number (nullptr if empty).
+template <class T, class Range>
+T *NewGVN::getMinDFSOfRange(const Range &R) const {
+  std::pair<T *, unsigned> MinDFS = {nullptr, ~0U};
+  for (const auto X : R) {
+    auto DFSNum = InstrToDFSNum(X);
+    if (DFSNum < MinDFS.second)
+      MinDFS = {X, DFSNum};
+  }
+  return MinDFS.first;
+}
+
+// This function returns the MemoryAccess that should be the next leader of
+// congruence class CC, under the assumption that the current leader is going to
+// disappear.
+const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const {
+  // TODO: If this ends up too slow, we can maintain a next memory leader like
+  // we do for regular leaders.
+  // Make sure there will be a leader to find.
+  assert((CC->getStoreCount() > 0 || !CC->memory_empty()) &&
+         "Can't get next leader if there is none");
+  if (CC->getStoreCount() > 0) {
+    if (auto *NL = dyn_cast_or_null<StoreInst>(CC->getNextLeader().first))
+      return MSSA->getMemoryAccess(NL);
+    // Find the store with the minimum DFS number.
+    auto *V = getMinDFSOfRange<Value>(make_filter_range(
+        *CC, [&](const Value *V) { return isa<StoreInst>(V); }));
+    return MSSA->getMemoryAccess(cast<StoreInst>(V));
+  }
+  assert(CC->getStoreCount() == 0);
+
+  // Given our assertion, hitting this part must mean
+  // !CC->memory_empty()
+  if (CC->memory_size() == 1)
+    return *CC->memory_begin();
+  return getMinDFSOfRange<const MemoryPhi>(CC->memory());
+}
+
+// This function returns the next value leader of a congruence class, under the
+// assumption that the current leader is going away.  This should end up being
+// the next most dominating member.
+Value *NewGVN::getNextValueLeader(CongruenceClass *CC) const {
+  // We don't need to sort members if there is only 1, and we don't care about
+  // sorting the TOP class because everything either gets out of it or is
+  // unreachable.
+
+  if (CC->size() == 1 || CC == TOPClass) {
+    return *(CC->begin());
+  } else if (CC->getNextLeader().first) {
+    ++NumGVNAvoidedSortedLeaderChanges;
+    return CC->getNextLeader().first;
+  } else {
+    ++NumGVNSortedLeaderChanges;
+    // NOTE: If this ends up too slow, we can maintain a dual structure for
+    // member testing/insertion, or keep things mostly sorted, and sort only
+    // here, or use SparseBitVector or ....
+    return getMinDFSOfRange<Value>(*CC);
+  }
+}
+
+// Move a MemoryAccess, currently in OldClass, to NewClass, including updates to
+// the memory members, etc for the move.
+//
+// The invariants of this function are:
+//
+// I must be moving to NewClass from OldClass.  The StoreCount of OldClass and
+// NewClass is expected to have been updated for I already if it is a store.
+// The OldClass memory leader has not been updated yet if I was the leader.
+void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I,
+                                            MemoryAccess *InstMA,
+                                            CongruenceClass *OldClass,
+                                            CongruenceClass *NewClass) {
+  // If the leader is I, and we had a representative MemoryAccess, it should
+  // be the MemoryAccess of OldClass.
+  assert((!InstMA || !OldClass->getMemoryLeader() ||
+          OldClass->getLeader() != I ||
+          OldClass->getMemoryLeader() == InstMA) &&
+         "Representative MemoryAccess mismatch");
+  // First, see what happens to the new class
+  if (!NewClass->getMemoryLeader()) {
+    // Should be a new class, or a store becoming a leader of a new class.
+    assert(NewClass->size() == 1 ||
+           (isa<StoreInst>(I) && NewClass->getStoreCount() == 1));
+    NewClass->setMemoryLeader(InstMA);
+    // Mark it touched if we didn't just create a singleton
+    DEBUG(dbgs() << "Memory class leader change for class " << NewClass->getID()
+                 << " due to new memory instruction becoming leader\n");
+    markMemoryLeaderChangeTouched(NewClass);
+  }
+  setMemoryClass(InstMA, NewClass);
+  // Now, fixup the old class if necessary
+  if (OldClass->getMemoryLeader() == InstMA) {
+    if (OldClass->getStoreCount() != 0 || !OldClass->memory_empty()) {
+      OldClass->setMemoryLeader(getNextMemoryLeader(OldClass));
+      DEBUG(dbgs() << "Memory class leader change for class "
+                   << OldClass->getID() << " to "
+                   << *OldClass->getMemoryLeader()
+                   << " due to removal of old leader " << *InstMA << "\n");
+      markMemoryLeaderChangeTouched(OldClass);
+    } else
+      OldClass->setMemoryLeader(nullptr);
+  }
+}
+
 // Move a value, currently in OldClass, to be part of NewClass
-// Update OldClass for the move (including changing leaders, etc)
-void NewGVN::moveValueToNewCongruenceClass(Instruction *I,
+// Update OldClass and NewClass for the move (including changing leaders, etc).
+void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E,
                                            CongruenceClass *OldClass,
                                            CongruenceClass *NewClass) {
-  DEBUG(dbgs() << "New congruence class for " << I << " is " << NewClass->ID
-               << "\n");
-
-  if (I == OldClass->NextLeader.first)
-    OldClass->NextLeader = {nullptr, ~0U};
+  if (I == OldClass->getNextLeader().first)
+    OldClass->resetNextLeader();
 
   // It's possible, though unlikely, for us to discover equivalences such
   // that the current leader does not dominate the old one.
   // This statistic tracks how often this happens.
   // We assert on phi nodes when this happens, currently, for debugging, because
   // we want to make sure we name phi node cycles properly.
-  if (isa<Instruction>(NewClass->RepLeader) && NewClass->RepLeader &&
-      I != NewClass->RepLeader &&
-      DT->properlyDominates(
-          I->getParent(),
-          cast<Instruction>(NewClass->RepLeader)->getParent())) {
-    ++NumGVNNotMostDominatingLeader;
-    assert(!isa<PHINode>(I) &&
-           "New class for instruction should not be dominated by instruction");
+  if (NewClass->getLeader() && isa<Instruction>(NewClass->getLeader()) &&
+      I != NewClass->getLeader()) {
+    auto *IBB = I->getParent();
+    auto *NCBB = cast<Instruction>(NewClass->getLeader())->getParent();
+    bool Dominated =
+        IBB == NCBB && InstrToDFSNum(I) < InstrToDFSNum(NewClass->getLeader());
+    Dominated = Dominated || DT->properlyDominates(IBB, NCBB);
+    if (Dominated) {
+      ++NumGVNNotMostDominatingLeader;
+      assert(
+          !isa<PHINode>(I) &&
+          "New class for instruction should not be dominated by instruction");
+    }
   }
 
-  if (NewClass->RepLeader != I) {
-    auto DFSNum = InstrDFS.lookup(I);
-    if (DFSNum < NewClass->NextLeader.second)
-      NewClass->NextLeader = {I, DFSNum};
-  }
+  if (NewClass->getLeader() != I)
+    NewClass->addPossibleNextLeader({I, InstrToDFSNum(I)});
 
-  OldClass->Members.erase(I);
-  NewClass->Members.insert(I);
-  MemoryAccess *StoreAccess = nullptr;
+  OldClass->erase(I);
+  NewClass->insert(I);
+  // Handle our special casing of stores.
   if (auto *SI = dyn_cast<StoreInst>(I)) {
-    StoreAccess = MSSA->getMemoryAccess(SI);
-    --OldClass->StoreCount;
-    assert(OldClass->StoreCount >= 0);
-    ++NewClass->StoreCount;
-    assert(NewClass->StoreCount > 0);
-    if (!NewClass->RepMemoryAccess) {
-      // If we don't have a representative memory access, it better be the only
-      // store in there.
-      assert(NewClass->StoreCount == 1);
-      NewClass->RepMemoryAccess = StoreAccess;
+    OldClass->decStoreCount();
+    // Okay, so when do we want to make a store a leader of a class?
+    // If we have a store defined by an earlier load, we want the earlier load
+    // to lead the class.
+    // If we have a store defined by something else, we want the store to lead
+    // the class so everything else gets the "something else" as a value.
+    // If we have a store as the single member of the class, we want the store
+    // as the leader
+    if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) {
+      // If it's a store expression we are using, it means we are not equivalent
+      // to something earlier.
+      if (isa<StoreExpression>(E)) {
+        assert(lookupOperandLeader(SI->getValueOperand()) !=
+               NewClass->getLeader());
+        NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand()));
+        markValueLeaderChangeTouched(NewClass);
+        // Shift the new class leader to be the store
+        DEBUG(dbgs() << "Changing leader of congruence class "
+                     << NewClass->getID() << " from " << *NewClass->getLeader()
+                     << " to  " << *SI << " because store joined class\n");
+        // If we changed the leader, we have to mark it changed because we don't
+        // know what it will do to symbolic evaluation.
+        NewClass->setLeader(SI);
+      }
+      // We rely on the code below handling the MemoryAccess change.
     }
-    setMemoryAccessEquivTo(StoreAccess, NewClass);
+    NewClass->incStoreCount();
   }
-
+  // Record (before the move below can change it) whether I's MemoryDef was the
+  // memory leader of OldClass; the leader-change handling below needs it.
+
+  // If it's not a memory use, set the MemoryAccess equivalence
+  auto *InstMA = dyn_cast_or_null<MemoryDef>(MSSA->getMemoryAccess(I));
+  bool InstWasMemoryLeader = InstMA && OldClass->getMemoryLeader() == InstMA;
+  if (InstMA)
+    moveMemoryToNewCongruenceClass(I, InstMA, OldClass, NewClass);
   ValueToClass[I] = NewClass;
   // See if we destroyed the class or need to swap leaders.
-  if (OldClass->Members.empty() && OldClass != InitialClass) {
-    if (OldClass->DefiningExpr) {
-      OldClass->Dead = true;
-      DEBUG(dbgs() << "Erasing expression " << OldClass->DefiningExpr
+  if (OldClass->empty() && OldClass != TOPClass) {
+    if (OldClass->getDefiningExpr()) {
+      DEBUG(dbgs() << "Erasing expression " << OldClass->getDefiningExpr()
                    << " from table\n");
-      ExpressionToClass.erase(OldClass->DefiningExpr);
+      ExpressionToClass.erase(OldClass->getDefiningExpr());
     }
-  } else if (OldClass->RepLeader == I) {
+  } else if (OldClass->getLeader() == I) {
     // When the leader changes, the value numbering of
     // everything may change due to symbolization changes, so we need to
     // reprocess.
-    DEBUG(dbgs() << "Leader change!\n");
+    DEBUG(dbgs() << "Value class leader change for class " << OldClass->getID()
+                 << "\n");
     ++NumGVNLeaderChanges;
     // Destroy the stored value if there are no more stores to represent it.
-    if (OldClass->StoreCount == 0) {
-      if (OldClass->RepStoredValue != nullptr)
-        OldClass->RepStoredValue = nullptr;
-      if (OldClass->RepMemoryAccess != nullptr)
-        OldClass->RepMemoryAccess = nullptr;
+    // Note that this is basically clean up for the expression removal that
+    // happens below.  If we remove stores from a class, we may leave it as a
+    // class of equivalent memory phis.
+    if (OldClass->getStoreCount() == 0) {
+      if (OldClass->getStoredValue())
+        OldClass->setStoredValue(nullptr);
     }
-
-    // If we destroy the old access leader, we have to effectively destroy the
-    // congruence class.  When it comes to scalars, anything with the same value
-    // is as good as any other.  That means that one leader is as good as
-    // another, and as long as you have some leader for the value, you are
-    // good.. When it comes to *memory states*, only one particular thing really
-    // represents the definition of a given memory state.  Once it goes away, we
-    // need to re-evaluate which pieces of memory are really still
-    // equivalent. The best way to do this is to re-value number things.  The
-    // only way to really make that happen is to destroy the rest of the class.
-    // In order to effectively destroy the class, we reset ExpressionToClass for
-    // each by using the ValueToExpression mapping.  The members later get
-    // marked as touched due to the leader change.  We will create new
-    // congruence classes, and the pieces that are still equivalent will end
-    // back together in a new class.  If this becomes too expensive, it is
-    // possible to use a versioning scheme for the congruence classes to avoid
-    // the expressions finding this old class.
-    if (OldClass->StoreCount > 0 && OldClass->RepMemoryAccess == StoreAccess) {
-      DEBUG(dbgs() << "Kicking everything out of class " << OldClass->ID
-                   << " because memory access leader changed");
-      for (auto Member : OldClass->Members)
+    // If we destroy the old access leader and it's a store, we have to
+    // effectively destroy the congruence class.  When it comes to scalars,
+    // anything with the same value is as good as any other.  That means that
+    // one leader is as good as another, and as long as you have some leader for
+    // the value, you are good.  When it comes to *memory states*, only one
+    // particular thing really represents the definition of a given memory
+    // state.  Once it goes away, we need to re-evaluate which pieces of memory
+    // are really still equivalent. The best way to do this is to re-value
+    // number things.  The only way to really make that happen is to destroy the
+    // rest of the class.  In order to effectively destroy the class, we reset
+    // ExpressionToClass for each by using the ValueToExpression mapping.  The
+    // members later get marked as touched due to the leader change.  We will
+    // create new congruence classes, and the pieces that are still equivalent
+    // will end back together in a new class.  If this becomes too expensive, it
+    // is possible to use a versioning scheme for the congruence classes to
+    // avoid the expressions finding this old class.  Note that the situation is
+    // different for memory phis, because they are evaluated anew each time, and
+    // they become equal not by hashing, but by seeing if all operands are the
+    // same (or only one is reachable).
+    if (OldClass->getStoreCount() > 0 && InstWasMemoryLeader) {
+      DEBUG(dbgs() << "Kicking everything out of class " << OldClass->getID()
+                   << " because MemoryAccess leader changed");
+      for (auto Member : *OldClass)
         ExpressionToClass.erase(ValueToExpression.lookup(Member));
     }
-
-    // We don't need to sort members if there is only 1, and we don't care about
-    // sorting the initial class because everything either gets out of it or is
-    // unreachable.
-    if (OldClass->Members.size() == 1 || OldClass == InitialClass) {
-      OldClass->RepLeader = *(OldClass->Members.begin());
-    } else if (OldClass->NextLeader.first) {
-      ++NumGVNAvoidedSortedLeaderChanges;
-      OldClass->RepLeader = OldClass->NextLeader.first;
-      OldClass->NextLeader = {nullptr, ~0U};
-    } else {
-      ++NumGVNSortedLeaderChanges;
-      // TODO: If this ends up to slow, we can maintain a dual structure for
-      // member testing/insertion, or keep things mostly sorted, and sort only
-      // here, or ....
-      std::pair<Value *, unsigned> MinDFS = {nullptr, ~0U};
-      for (const auto X : OldClass->Members) {
-        auto DFSNum = InstrDFS.lookup(X);
-        if (DFSNum < MinDFS.second)
-          MinDFS = {X, DFSNum};
-      }
-      OldClass->RepLeader = MinDFS.first;
-    }
-    markLeaderChangeTouched(OldClass);
+    OldClass->setLeader(getNextValueLeader(OldClass));
+    OldClass->resetNextLeader();
+    markValueLeaderChangeTouched(OldClass);
   }
 }
 
@@ -1262,12 +1897,12 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I,
 void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
   ValueToExpression[I] = E;
   // This is guaranteed to return something, since it will at least find
-  // INITIAL.
+  // TOP.
 
   CongruenceClass *IClass = ValueToClass[I];
   assert(IClass && "Should have found a IClass");
   // Dead classes should have been eliminated from the mapping.
-  assert(!IClass->Dead && "Found a dead class");
+  assert(!IClass->isDead() && "Found a dead class");
 
   CongruenceClass *EClass;
   if (const auto *VE = dyn_cast<VariableExpression>(E)) {
@@ -1283,51 +1918,52 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) {
 
       // Constants and variables should always be made the leader.
       if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
-        NewClass->RepLeader = CE->getConstantValue();
+        NewClass->setLeader(CE->getConstantValue());
       } else if (const auto *SE = dyn_cast<StoreExpression>(E)) {
         StoreInst *SI = SE->getStoreInst();
-        NewClass->RepLeader = SI;
-        NewClass->RepStoredValue =
-            lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent());
+        NewClass->setLeader(SI);
+        NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand()));
         // The RepMemoryAccess field will be filled in properly by the
         // moveValueToNewCongruenceClass call.
       } else {
-        NewClass->RepLeader = I;
+        NewClass->setLeader(I);
       }
       assert(!isa<VariableExpression>(E) &&
              "VariableExpression should have been handled already");
 
       EClass = NewClass;
       DEBUG(dbgs() << "Created new congruence class for " << *I
-                   << " using expression " << *E << " at " << NewClass->ID
-                   << " and leader " << *(NewClass->RepLeader));
-      if (NewClass->RepStoredValue)
-        DEBUG(dbgs() << " and stored value " << *(NewClass->RepStoredValue));
+                   << " using expression " << *E << " at " << NewClass->getID()
+                   << " and leader " << *(NewClass->getLeader()));
+      if (NewClass->getStoredValue())
+        DEBUG(dbgs() << " and stored value " << *(NewClass->getStoredValue()));
       DEBUG(dbgs() << "\n");
-      DEBUG(dbgs() << "Hash value was " << E->getHashValue() << "\n");
     } else {
       EClass = lookupResult.first->second;
       if (isa<ConstantExpression>(E))
-        assert(isa<Constant>(EClass->RepLeader) &&
+        assert((isa<Constant>(EClass->getLeader()) ||
+                (EClass->getStoredValue() &&
+                 isa<Constant>(EClass->getStoredValue()))) &&
                "Any class with a constant expression should have a "
                "constant leader");
 
       assert(EClass && "Somehow don't have an eclass");
 
-      assert(!EClass->Dead && "We accidentally looked up a dead class");
+      assert(!EClass->isDead() && "We accidentally looked up a dead class");
     }
   }
   bool ClassChanged = IClass != EClass;
   bool LeaderChanged = LeaderChanges.erase(I);
   if (ClassChanged || LeaderChanged) {
-    DEBUG(dbgs() << "Found class " << EClass->ID << " for expression " << E
+    DEBUG(dbgs() << "New class " << EClass->getID() << " for expression " << *E
                  << "\n");
-
     if (ClassChanged)
-      moveValueToNewCongruenceClass(I, IClass, EClass);
+      moveValueToNewCongruenceClass(I, E, IClass, EClass);
     markUsersTouched(I);
     if (MemoryAccess *MA = MSSA->getMemoryAccess(I))
       markMemoryUsersTouched(MA);
+    if (auto *CI = dyn_cast<CmpInst>(I))
+      markPredicateUsersTouched(CI);
   }
 }
 
@@ -1351,11 +1987,11 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
       // they are the only thing that depend on new edges. Anything using their
       // values will get propagated to if necessary.
       if (MemoryAccess *MemPhi = MSSA->getMemoryAccess(To))
-        TouchedInstructions.set(InstrDFS.lookup(MemPhi));
+        TouchedInstructions.set(InstrToDFSNum(MemPhi));
 
       auto BI = To->begin();
       while (isa<PHINode>(BI)) {
-        TouchedInstructions.set(InstrDFS.lookup(&*BI));
+        TouchedInstructions.set(InstrToDFSNum(&*BI));
         ++BI;
       }
     }
@@ -1364,8 +2000,8 @@ void NewGVN::updateReachableEdge(BasicBlock *From, BasicBlock *To) {
 
 // Given a predicate condition (from a switch, cmp, or whatever) and a block,
 // see if we know some constant value for it already.
-Value *NewGVN::findConditionEquivalence(Value *Cond, BasicBlock *B) const {
-  auto Result = lookupOperandLeader(Cond, nullptr, B);
+Value *NewGVN::findConditionEquivalence(Value *Cond) const {
+  auto Result = lookupOperandLeader(Cond);
   if (isa<Constant>(Result))
     return Result;
   return nullptr;
@@ -1377,10 +2013,10 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
   BranchInst *BR;
   if ((BR = dyn_cast<BranchInst>(TI)) && BR->isConditional()) {
     Value *Cond = BR->getCondition();
-    Value *CondEvaluated = findConditionEquivalence(Cond, B);
+    Value *CondEvaluated = findConditionEquivalence(Cond);
     if (!CondEvaluated) {
       if (auto *I = dyn_cast<Instruction>(Cond)) {
-        const Expression *E = createExpression(I, B);
+        const Expression *E = createExpression(I);
         if (const auto *CE = dyn_cast<ConstantExpression>(E)) {
           CondEvaluated = CE->getConstantValue();
         }
@@ -1413,13 +2049,13 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
     SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
 
     Value *SwitchCond = SI->getCondition();
-    Value *CondEvaluated = findConditionEquivalence(SwitchCond, B);
+    Value *CondEvaluated = findConditionEquivalence(SwitchCond);
     // See if we were able to turn this switch statement into a constant.
     if (CondEvaluated && isa<ConstantInt>(CondEvaluated)) {
       auto *CondVal = cast<ConstantInt>(CondEvaluated);
       // We should be able to get case value for this.
-      auto CaseVal = SI->findCaseValue(CondVal);
-      if (CaseVal.getCaseSuccessor() == SI->getDefaultDest()) {
+      auto Case = *SI->findCaseValue(CondVal);
+      if (Case.getCaseSuccessor() == SI->getDefaultDest()) {
         // We proved the value is outside of the range of the case.
         // We can't do anything other than mark the default dest as reachable,
         // and go home.
@@ -1427,7 +2063,7 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
         return;
       }
       // Now get where it goes and mark it reachable.
-      BasicBlock *TargetBlock = CaseVal.getCaseSuccessor();
+      BasicBlock *TargetBlock = Case.getCaseSuccessor();
       updateReachableEdge(B, TargetBlock);
     } else {
       for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
@@ -1445,45 +2081,66 @@ void NewGVN::processOutgoingEdges(TerminatorInst *TI, BasicBlock *B) {
     }
 
     // This also may be a memory defining terminator, in which case, set it
-    // equivalent to nothing.
-    if (MemoryAccess *MA = MSSA->getMemoryAccess(TI))
-      setMemoryAccessEquivTo(MA, nullptr);
+    // equivalent only to itself.
+    //
+    auto *MA = MSSA->getMemoryAccess(TI);
+    if (MA && !isa<MemoryUse>(MA)) {
+      auto *CC = ensureLeaderOfMemoryClass(MA);
+      if (setMemoryClass(MA, CC))
+        markMemoryUsersTouched(MA);
+    }
   }
 }
 
-// The algorithm initially places the values of the routine in the INITIAL
-// congruence
-// class. The leader of INITIAL is the undetermined value `TOP`.
-// When the algorithm has finished, values still in INITIAL are unreachable.
+// The algorithm initially places the values of the routine in the TOP
+// congruence class. The leader of TOP is the undetermined value `undef`.
+// When the algorithm has finished, values still in TOP are unreachable.
 void NewGVN::initializeCongruenceClasses(Function &F) {
-  // FIXME now i can't remember why this is 2
-  NextCongruenceNum = 2;
-  // Initialize all other instructions to be in INITIAL class.
-  CongruenceClass::MemberSet InitialValues;
-  InitialClass = createCongruenceClass(nullptr, nullptr);
-  InitialClass->RepMemoryAccess = MSSA->getLiveOnEntryDef();
+  NextCongruenceNum = 0;
+
+  // Note that even though we use the live on entry def as a representative
+  // MemoryAccess, it is *not* the same as the actual live on entry def. We
+  // have no real equivalent to undef for MemoryAccesses, and so we really
+  // should be checking whether the MemoryAccess is top if we want to know if it
+  // is equivalent to everything.  Otherwise, what this really signifies is that
+  // the access "it reaches all the way back to the beginning of the function"
+
+  // Initialize all other instructions to be in TOP class.
+  TOPClass = createCongruenceClass(nullptr, nullptr);
+  TOPClass->setMemoryLeader(MSSA->getLiveOnEntryDef());
+  // The live on entry def gets put into its own class
+  MemoryAccessToClass[MSSA->getLiveOnEntryDef()] =
+      createMemoryClass(MSSA->getLiveOnEntryDef());
+
   for (auto &B : F) {
-    if (auto *MP = MSSA->getMemoryAccess(&B))
-      MemoryAccessToClass[MP] = InitialClass;
+    // All MemoryAccesses are equivalent to live on entry to start. They must
+    // be initialized to something so that initial changes are noticed. For
+    // the maximal answer, we initialize them all to be the same as
+    // liveOnEntry.
+    auto *MemoryBlockDefs = MSSA->getBlockDefs(&B);
+    if (MemoryBlockDefs)
+      for (const auto &Def : *MemoryBlockDefs) {
+        MemoryAccessToClass[&Def] = TOPClass;
+        auto *MD = dyn_cast<MemoryDef>(&Def);
+        // Insert the memory phis into the member list.
+        if (!MD) {
+          const MemoryPhi *MP = cast<MemoryPhi>(&Def);
+          TOPClass->memory_insert(MP);
+          MemoryPhiState.insert({MP, MPS_TOP});
+        }
 
-    for (auto &I : B) {
-      InitialValues.insert(&I);
-      ValueToClass[&I] = InitialClass;
-      // All memory accesses are equivalent to live on entry to start. They must
-      // be initialized to something so that initial changes are noticed. For
-      // the maximal answer, we initialize them all to be the same as
-      // liveOnEntry.  Note that to save time, we only initialize the
-      // MemoryDef's for stores and all MemoryPhis to be equal.  Right now, no
-      // other expression can generate a memory equivalence.  If we start
-      // handling memcpy/etc, we can expand this.
-      if (isa<StoreInst>(&I)) {
-        MemoryAccessToClass[MSSA->getMemoryAccess(&I)] = InitialClass;
-        ++InitialClass->StoreCount;
-        assert(InitialClass->StoreCount > 0);
+        if (MD && isa<StoreInst>(MD->getMemoryInst()))
+          TOPClass->incStoreCount();
       }
+    for (auto &I : B) {
+      // Don't insert void terminators into the class. We don't value number
+      // them, and they just end up sitting in TOP.
+      if (isa<TerminatorInst>(I) && I.getType()->isVoidTy())
+        continue;
+      TOPClass->insert(&I);
+      ValueToClass[&I] = TOPClass;
     }
   }
-  InitialClass->Members.swap(InitialValues);
 
   // Initialize arguments to be in their own unique congruence classes
   for (auto &FA : F.args())
@@ -1492,8 +2149,8 @@ void NewGVN::initializeCongruenceClasses(Function &F) {
 
 void NewGVN::cleanupTables() {
   for (unsigned i = 0, e = CongruenceClasses.size(); i != e; ++i) {
-    DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->ID << " has "
-                 << CongruenceClasses[i]->Members.size() << " members\n");
+    DEBUG(dbgs() << "Congruence class " << CongruenceClasses[i]->getID()
+                 << " has " << CongruenceClasses[i]->size() << " members\n");
     // Make sure we delete the congruence class (probably worth switching to
     // a unique_ptr at some point.
     delete CongruenceClasses[i];
@@ -1513,12 +2170,12 @@ void NewGVN::cleanupTables() {
 #endif
   InstrDFS.clear();
   InstructionsToErase.clear();
-
   DFSToInstr.clear();
   BlockInstRange.clear();
   TouchedInstructions.clear();
-  DominatedInstRange.clear();
   MemoryAccessToClass.clear();
+  PredicateToUsers.clear();
+  MemoryToUsers.clear();
 }
 
 std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
@@ -1530,6 +2187,16 @@ std::pair<unsigned, unsigned> NewGVN::assignDFSNumbers(BasicBlock *B,
   }
 
   for (auto &I : *B) {
+    // There's no need to call isInstructionTriviallyDead more than once on
+    // an instruction. Therefore, once we know that an instruction is dead
+    // we change its DFS number so that it doesn't get value numbered.
+    if (isInstructionTriviallyDead(&I, TLI)) {
+      InstrDFS[&I] = 0;
+      DEBUG(dbgs() << "Skipping trivially dead instruction " << I << "\n");
+      markInstructionForDeletion(&I);
+      continue;
+    }
+
     InstrDFS[&I] = End++;
     DFSToInstr.emplace_back(&I);
   }
@@ -1556,16 +2223,17 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
   // If all the arguments are the same, the MemoryPhi has the same value as the
   // argument.
   // Filter out unreachable blocks and self phis from our operands.
+  const BasicBlock *PHIBlock = MP->getBlock();
   auto Filtered = make_filter_range(MP->operands(), [&](const Use &U) {
-    return lookupMemoryAccessEquiv(cast<MemoryAccess>(U)) != MP &&
+    return lookupMemoryLeader(cast<MemoryAccess>(U)) != MP &&
            !isMemoryAccessTop(cast<MemoryAccess>(U)) &&
-           ReachableBlocks.count(MP->getIncomingBlock(U));
+           ReachableEdges.count({MP->getIncomingBlock(U), PHIBlock});
   });
   // If all that is left is nothing, our memoryphi is undef. We keep it as
   // InitialClass.  Note: The only case this should happen is if we have at
   // least one self-argument.
   if (Filtered.begin() == Filtered.end()) {
-    if (setMemoryAccessEquivTo(MP, InitialClass))
+    if (setMemoryClass(MP, TOPClass))
       markMemoryUsersTouched(MP);
     return;
   }
@@ -1573,14 +2241,14 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
   // Transform the remaining operands into operand leaders.
   // FIXME: mapped_iterator should have a range version.
   auto LookupFunc = [&](const Use &U) {
-    return lookupMemoryAccessEquiv(cast<MemoryAccess>(U));
+    return lookupMemoryLeader(cast<MemoryAccess>(U));
   };
   auto MappedBegin = map_iterator(Filtered.begin(), LookupFunc);
   auto MappedEnd = map_iterator(Filtered.end(), LookupFunc);
 
   // and now check if all the elements are equal.
   // Sadly, we can't use std::equals since these are random access iterators.
-  MemoryAccess *AllSameValue = *MappedBegin;
+  const auto *AllSameValue = *MappedBegin;
   ++MappedBegin;
   bool AllEqual = std::all_of(
       MappedBegin, MappedEnd,
@@ -1590,9 +2258,18 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
     DEBUG(dbgs() << "Memory Phi value numbered to " << *AllSameValue << "\n");
   else
     DEBUG(dbgs() << "Memory Phi value numbered to itself\n");
-
-  if (setMemoryAccessEquivTo(
-          MP, AllEqual ? MemoryAccessToClass.lookup(AllSameValue) : nullptr))
+  // If it's equal to something, it's in that class. Otherwise, it has to be in
+  // a class where it is the leader (other things may be equivalent to it, but
+  // it needs to start off in its own class, which means it must have been the
+  // leader, and it can't have stopped being the leader because it was never
+  // removed).
+  CongruenceClass *CC =
+      AllEqual ? getMemoryClass(AllSameValue) : ensureLeaderOfMemoryClass(MP);
+  auto OldState = MemoryPhiState.lookup(MP);
+  assert(OldState != MPS_Invalid && "Invalid memory phi state");
+  auto NewState = AllEqual ? MPS_Equivalent : MPS_Unique;
+  MemoryPhiState[MP] = NewState;
+  if (setMemoryClass(MP, CC) || OldState != NewState)
     markMemoryUsersTouched(MP);
 }
 
@@ -1600,26 +2277,25 @@ void NewGVN::valueNumberMemoryPhi(MemoryPhi *MP) {
 // congruence finding, and updating mappings.
 void NewGVN::valueNumberInstruction(Instruction *I) {
   DEBUG(dbgs() << "Processing instruction " << *I << "\n");
-
-  // There's no need to call isInstructionTriviallyDead more than once on
-  // an instruction. Therefore, once we know that an instruction is dead
-  // we change its DFS number so that it doesn't get numbered again.
-  if (InstrDFS[I] != 0 && isInstructionTriviallyDead(I, TLI)) {
-    InstrDFS[I] = 0;
-    DEBUG(dbgs() << "Skipping unused instruction\n");
-    markInstructionForDeletion(I);
-    return;
-  }
   if (!I->isTerminator()) {
-    const auto *Symbolized = performSymbolicEvaluation(I, I->getParent());
+    const Expression *Symbolized = nullptr;
+    if (DebugCounter::shouldExecute(VNCounter)) {
+      Symbolized = performSymbolicEvaluation(I);
+    } else {
+      // Mark the instruction as unused so we don't value number it again.
+      InstrDFS[I] = 0;
+    }
     // If we couldn't come up with a symbolic expression, use the unknown
     // expression
-    if (Symbolized == nullptr)
+    if (Symbolized == nullptr) {
       Symbolized = createUnknownExpression(I);
+    }
+
     performCongruenceFinding(I, Symbolized);
   } else {
     // Handle terminators that return values. All of them produce values we
-    // don't currently understand.
+    // don't currently understand.  We don't place non-value producing
+    // terminators in a class.
     if (!I->getType()->isVoidTy()) {
       auto *Symbolized = createUnknownExpression(I);
       performCongruenceFinding(I, Symbolized);
@@ -1634,28 +2310,33 @@ bool NewGVN::singleReachablePHIPath(const MemoryAccess *First,
                                     const MemoryAccess *Second) const {
   if (First == Second)
     return true;
-
-  if (auto *FirstDef = dyn_cast<MemoryUseOrDef>(First)) {
-    auto *DefAccess = FirstDef->getDefiningAccess();
-    return singleReachablePHIPath(DefAccess, Second);
-  } else {
-    auto *MP = cast<MemoryPhi>(First);
-    auto ReachableOperandPred = [&](const Use &U) {
-      return ReachableBlocks.count(MP->getIncomingBlock(U));
-    };
-    auto FilteredPhiArgs =
-        make_filter_range(MP->operands(), ReachableOperandPred);
-    SmallVector<const Value *, 32> OperandList;
-    std::copy(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
-              std::back_inserter(OperandList));
-    bool Okay = OperandList.size() == 1;
-    if (!Okay)
-      Okay = std::equal(OperandList.begin(), OperandList.end(),
-                        OperandList.begin());
-    if (Okay)
-      return singleReachablePHIPath(cast<MemoryAccess>(OperandList[0]), Second);
+  if (MSSA->isLiveOnEntryDef(First))
     return false;
+
+  const auto *EndDef = First;
+  for (auto *ChainDef : optimized_def_chain(First)) {
+    if (ChainDef == Second)
+      return true;
+    if (MSSA->isLiveOnEntryDef(ChainDef))
+      return false;
+    EndDef = ChainDef;
   }
+  auto *MP = cast<MemoryPhi>(EndDef);
+  auto ReachableOperandPred = [&](const Use &U) {
+    return ReachableEdges.count({MP->getIncomingBlock(U), MP->getBlock()});
+  };
+  auto FilteredPhiArgs =
+      make_filter_range(MP->operands(), ReachableOperandPred);
+  SmallVector<const Value *, 32> OperandList;
+  std::copy(FilteredPhiArgs.begin(), FilteredPhiArgs.end(),
+            std::back_inserter(OperandList));
+  bool Okay = OperandList.size() == 1;
+  if (!Okay)
+    Okay =
+        std::equal(OperandList.begin(), OperandList.end(), OperandList.begin());
+  if (Okay)
+    return singleReachablePHIPath(cast<MemoryAccess>(OperandList[0]), Second);
+  return false;
 }
 
 // Verify the that the memory equivalence table makes sense relative to the
@@ -1663,7 +2344,31 @@ bool NewGVN::singleReachablePHIPath(const MemoryAccess *First,
 // subject to very rare false negatives. It is only useful for
 // testing/debugging.
 void NewGVN::verifyMemoryCongruency() const {
-  // Anything equivalent in the memory access table should be in the same
+#ifndef NDEBUG
+  // Verify that the memory table equivalence and memory member set match
+  for (const auto *CC : CongruenceClasses) {
+    if (CC == TOPClass || CC->isDead())
+      continue;
+    if (CC->getStoreCount() != 0) {
+      assert((CC->getStoredValue() || !isa<StoreInst>(CC->getLeader())) &&
+             "Any class with a store as a "
+             "leader should have a "
+             "representative stored value\n");
+      assert(CC->getMemoryLeader() &&
+             "Any congruence class with a store should "
+             "have a representative access\n");
+    }
+
+    if (CC->getMemoryLeader())
+      assert(MemoryAccessToClass.lookup(CC->getMemoryLeader()) == CC &&
+             "Representative MemoryAccess does not appear to be reverse "
+             "mapped properly");
+    for (auto M : CC->memory())
+      assert(MemoryAccessToClass.lookup(M) == CC &&
+             "Memory member does not appear to be reverse mapped properly");
+  }
+
+  // Anything equivalent in the MemoryAccess table should be in the same
   // congruence class.
 
   // Filter out the unreachable and trivially dead entries, because they may
@@ -1673,19 +2378,21 @@ void NewGVN::verifyMemoryCongruency() const {
         bool Result = ReachableBlocks.count(Pair.first->getBlock());
         if (!Result)
           return false;
+        if (MSSA->isLiveOnEntryDef(Pair.first))
+          return true;
         if (auto *MemDef = dyn_cast<MemoryDef>(Pair.first))
           return !isInstructionTriviallyDead(MemDef->getMemoryInst());
+        if (MemoryToDFSNum(Pair.first) == 0)
+          return false;
         return true;
       };
 
   auto Filtered = make_filter_range(MemoryAccessToClass, ReachableAccessPred);
   for (auto KV : Filtered) {
-    // Unreachable instructions may not have changed because we never process
-    // them.
-    if (!ReachableBlocks.count(KV.first->getBlock()))
-      continue;
+    assert(KV.second != TOPClass &&
+           "Memory not unreachable but ended up in TOP");
     if (auto *FirstMUD = dyn_cast<MemoryUseOrDef>(KV.first)) {
-      auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second->RepMemoryAccess);
+      auto *SecondMUD = dyn_cast<MemoryUseOrDef>(KV.second->getMemoryLeader());
       if (FirstMUD && SecondMUD)
         assert((singleReachablePHIPath(FirstMUD, SecondMUD) ||
                 ValueToClass.lookup(FirstMUD->getMemoryInst()) ==
@@ -1694,11 +2401,11 @@ void NewGVN::verifyMemoryCongruency() const {
                "been in the same congruence class or reachable through"
                "a single argument phi");
     } else if (auto *FirstMP = dyn_cast<MemoryPhi>(KV.first)) {
-
       // We can only sanely verify that MemoryDefs in the operand list all have
       // the same class.
       auto ReachableOperandPred = [&](const Use &U) {
-        return ReachableBlocks.count(FirstMP->getIncomingBlock(U)) &&
+        return ReachableEdges.count(
+                   {FirstMP->getIncomingBlock(U), FirstMP->getBlock()}) &&
                isa<MemoryDef>(U);
 
       };
@@ -1716,19 +2423,127 @@ void NewGVN::verifyMemoryCongruency() const {
              "All MemoryPhi arguments should be in the same class");
     }
   }
+#endif
+}
+
+// Debug-only check that the sparse propagation reached the maximal fixpoint.
+// We snapshot the value-to-class mapping, touch every instruction, and rerun
+// the iteration; any class that is no longer equivalent trips an assertion.
+void NewGVN::verifyIterationSettled(Function &F) {
+#ifndef NDEBUG
+  DEBUG(dbgs() << "Beginning iteration verification\n");
+  if (DebugCounter::isCounterSet(VNCounter))
+    DebugCounter::setCounterValue(VNCounter, StartingVNCounter);
+
+  // Note that we have to store copies of the classes (not pointers), as we may
+  // change existing classes during iteration.  Our memory iteration propagation
+  // is not perfect, and so may waste a little work.  But it should generate
+  // exactly the same congruence classes we have now, with different IDs.
+  std::map<const Value *, CongruenceClass> BeforeIteration;
+
+  for (auto &KV : ValueToClass) {
+    if (auto *I = dyn_cast<Instruction>(KV.first))
+      // Skip unused/dead instructions (those with DFS number 0).
+      if (InstrToDFSNum(I) == 0)
+        continue;
+    BeforeIteration.insert({KV.first, *KV.second});
+  }
+
+  TouchedInstructions.set();
+  TouchedInstructions.reset(0); // Slot 0 is the dead/unused DFS number.
+  iterateTouchedInstructions();
+  DenseSet<std::pair<const CongruenceClass *, const CongruenceClass *>>
+      EqualClasses;
+  for (const auto &KV : ValueToClass) {
+    if (auto *I = dyn_cast<Instruction>(KV.first))
+      // Skip unused/dead instructions (those with DFS number 0).
+      if (InstrToDFSNum(I) == 0)
+        continue;
+    // We could sink these uses, but I think this adds a bit of clarity here as
+    // to what we are comparing.
+    auto *BeforeCC = &BeforeIteration.find(KV.first)->second;
+    auto *AfterCC = KV.second;
+    // Note that the classes can't change at this point, so we memoize the
+    // (before, after) pairs already shown to be equivalent.
+    if (!EqualClasses.count({BeforeCC, AfterCC})) {
+      assert(BeforeCC->isEquivalentTo(AfterCC) &&
+             "Value number changed after main loop completed!");
+      EqualClasses.insert({BeforeCC, AfterCC});
+    }
+  }
+#endif
+}
+
+// This is the main value numbering loop.  It iterates over the touched
+// instruction set, propagating value numbers, marking things touched, etc.,
+// until the set of touched instructions is completely empty.
+void NewGVN::iterateTouchedInstructions() {
+  unsigned int Iterations = 0;
+  // Find the first touched DFS number; its block seeds LastBlock below.
+  int FirstInstr = TouchedInstructions.find_first();
+  // Nothing set, nothing to iterate, just return.
+  if (FirstInstr == -1)
+    return;
+  BasicBlock *LastBlock = getBlockForValue(InstrFromDFSNum(FirstInstr));
+  while (TouchedInstructions.any()) {
+    ++Iterations;
+    // Walk through all the instructions in all the blocks in RPO.
+    // TODO: As we hit a new block, we should push and pop equalities into a
+    // table lookupOperandLeader can use, to catch things PredicateInfo
+    // might miss, like edge-only equivalences.
+    for (int InstrNum = TouchedInstructions.find_first(); InstrNum != -1;
+         InstrNum = TouchedInstructions.find_next(InstrNum)) {
+
+      // DFS number 0 marks an instruction found to be dead. We don't bother
+      // looking at it again.
+      if (InstrNum == 0) {
+        TouchedInstructions.reset(InstrNum);
+        continue;
+      }
+
+      Value *V = InstrFromDFSNum(InstrNum);
+      BasicBlock *CurrBlock = getBlockForValue(V);
+
+      // If we hit a new block, do reachability processing.
+      if (CurrBlock != LastBlock) {
+        LastBlock = CurrBlock;
+        bool BlockReachable = ReachableBlocks.count(CurrBlock);
+        const auto &CurrInstRange = BlockInstRange.lookup(CurrBlock);
+
+        // Not reachable: clear every touched bit in its range and move on.
+        if (!BlockReachable) {
+          TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
+          DEBUG(dbgs() << "Skipping instructions in block "
+                       << getBlockName(CurrBlock)
+                       << " because it is unreachable\n");
+          continue;
+        }
+        updateProcessedCount(CurrBlock);
+      }
+
+      if (auto *MP = dyn_cast<MemoryPhi>(V)) {
+        DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
+        valueNumberMemoryPhi(MP);
+      } else if (auto *I = dyn_cast<Instruction>(V)) {
+        valueNumberInstruction(I);
+      } else {
+        llvm_unreachable("Should have been a MemoryPhi or Instruction");
+      }
+      updateProcessedCount(V);
+      // Reset after processing (because we may mark ourselves as touched when
+      // we propagate equalities).
+      TouchedInstructions.reset(InstrNum);
+    }
+  }
+  NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations);
 }
 
 // This is the main transformation entry point.
-bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
-                    TargetLibraryInfo *_TLI, AliasAnalysis *_AA,
-                    MemorySSA *_MSSA) {
+bool NewGVN::runGVN() {
+  if (DebugCounter::isCounterSet(VNCounter))
+    StartingVNCounter = DebugCounter::getCounterValue(VNCounter);
   bool Changed = false;
-  DT = _DT;
-  AC = _AC;
-  TLI = _TLI;
-  AA = _AA;
-  MSSA = _MSSA;
-  DL = &F.getParent()->getDataLayout();
+  NumFuncArgs = F.arg_size();
   MSSAWalker = MSSA->getWalker();
 
   // Count number of instructions for sizing of hash tables, and come
@@ -1736,9 +2551,9 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
   unsigned ICount = 1;
   // Add an empty instruction to account for the fact that we start at 1
   DFSToInstr.emplace_back(nullptr);
-  // Note: We want RPO traversal of the blocks, which is not quite the same as
-  // dominator tree order, particularly with regard whether backedges get
-  // visited first or second, given a block with multiple successors.
+  // Note: We want ideal RPO traversal of the blocks, which is not quite the
+  // same as dominator tree order, particularly with regard whether backedges
+  // get visited first or second, given a block with multiple successors.
   // If we visit in the wrong order, we will end up performing N times as many
   // iterations.
   // The dominator tree does guarantee that, for a given dom tree node, it's
@@ -1783,7 +2598,6 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
   }
 
   TouchedInstructions.resize(ICount);
-  DominatedInstRange.reserve(F.size());
   // Ensure we don't end up resizing the expressionToClass map, as
   // that can be quite expensive. At most, we have one expression per
   // instruction.
@@ -1795,68 +2609,10 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
   ReachableBlocks.insert(&F.getEntryBlock());
 
   initializeCongruenceClasses(F);
-
-  unsigned int Iterations = 0;
-  // We start out in the entry block.
-  BasicBlock *LastBlock = &F.getEntryBlock();
-  while (TouchedInstructions.any()) {
-    ++Iterations;
-    // Walk through all the instructions in all the blocks in RPO.
-    for (int InstrNum = TouchedInstructions.find_first(); InstrNum != -1;
-         InstrNum = TouchedInstructions.find_next(InstrNum)) {
-
-      // This instruction was found to be dead. We don't bother looking
-      // at it again.
-      if (InstrNum == 0) {
-        TouchedInstructions.reset(InstrNum);
-        continue;
-      }
-
-      Value *V = DFSToInstr[InstrNum];
-      BasicBlock *CurrBlock = nullptr;
-
-      if (auto *I = dyn_cast<Instruction>(V))
-        CurrBlock = I->getParent();
-      else if (auto *MP = dyn_cast<MemoryPhi>(V))
-        CurrBlock = MP->getBlock();
-      else
-        llvm_unreachable("DFSToInstr gave us an unknown type of instruction");
-
-      // If we hit a new block, do reachability processing.
-      if (CurrBlock != LastBlock) {
-        LastBlock = CurrBlock;
-        bool BlockReachable = ReachableBlocks.count(CurrBlock);
-        const auto &CurrInstRange = BlockInstRange.lookup(CurrBlock);
-
-        // If it's not reachable, erase any touched instructions and move on.
-        if (!BlockReachable) {
-          TouchedInstructions.reset(CurrInstRange.first, CurrInstRange.second);
-          DEBUG(dbgs() << "Skipping instructions in block "
-                       << getBlockName(CurrBlock)
-                       << " because it is unreachable\n");
-          continue;
-        }
-        updateProcessedCount(CurrBlock);
-      }
-
-      if (auto *MP = dyn_cast<MemoryPhi>(V)) {
-        DEBUG(dbgs() << "Processing MemoryPhi " << *MP << "\n");
-        valueNumberMemoryPhi(MP);
-      } else if (auto *I = dyn_cast<Instruction>(V)) {
-        valueNumberInstruction(I);
-      } else {
-        llvm_unreachable("Should have been a MemoryPhi or Instruction");
-      }
-      updateProcessedCount(V);
-      // Reset after processing (because we may mark ourselves as touched when
-      // we propagate equalities).
-      TouchedInstructions.reset(InstrNum);
-    }
-  }
-  NumGVNMaxIterations = std::max(NumGVNMaxIterations.getValue(), Iterations);
-#ifndef NDEBUG
+  iterateTouchedInstructions();
   verifyMemoryCongruency();
-#endif
+  verifyIterationSettled(F);
+
   Changed |= eliminateInstructions(F);
 
   // Delete all instructions marked for deletion.
@@ -1883,36 +2639,6 @@ bool NewGVN::runGVN(Function &F, DominatorTree *_DT, AssumptionCache *_AC,
   return Changed;
 }
 
-bool NewGVN::runOnFunction(Function &F) {
-  if (skipFunction(F))
-    return false;
-  return runGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
-                &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
-                &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
-                &getAnalysis<AAResultsWrapperPass>().getAAResults(),
-                &getAnalysis<MemorySSAWrapperPass>().getMSSA());
-}
-
-PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) {
-  NewGVN Impl;
-
-  // Apparently the order in which we get these results matter for
-  // the old GVN (see Chandler's comment in GVN.cpp). I'll keep
-  // the same order here, just in case.
-  auto &AC = AM.getResult<AssumptionAnalysis>(F);
-  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
-  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
-  auto &AA = AM.getResult<AAManager>(F);
-  auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
-  bool Changed = Impl.runGVN(F, &DT, &AC, &TLI, &AA, &MSSA);
-  if (!Changed)
-    return PreservedAnalyses::all();
-  PreservedAnalyses PA;
-  PA.preserve<DominatorTreeAnalysis>();
-  PA.preserve<GlobalsAA>();
-  return PA;
-}
-
 // Return true if V is a value that will always be available (IE can
 // be placed anywhere) in the function.  We don't do globals here
 // because they are often worse to put in place.
@@ -1921,21 +2647,15 @@ static bool alwaysAvailable(Value *V) {
   return isa<Constant>(V) || isa<Argument>(V);
 }
 
-// Get the basic block from an instruction/value.
-static BasicBlock *getBlockForValue(Value *V) {
-  if (auto *I = dyn_cast<Instruction>(V))
-    return I->getParent();
-  return nullptr;
-}
-
 struct NewGVN::ValueDFS {
   int DFSIn = 0;
   int DFSOut = 0;
   int LocalNum = 0;
-  // Only one of these will be set.
-  Value *Val = nullptr;
+  // Only one of Def and U will be set.
+  // The bool in the Def tells us whether the Def is the stored value of a
+  // store.
+  PointerIntPair<Value *, 1, bool> Def;
   Use *U = nullptr;
-
   bool operator<(const ValueDFS &Other) const {
     // It's not enough that any given field be less than - we have sets
     // of fields that need to be evaluated together to give a proper ordering.
@@ -1975,57 +2695,68 @@ struct NewGVN::ValueDFS {
     // but .val  and .u.
     // It does not matter what order we replace these operands in.
     // You will always end up with the same IR, and this is guaranteed.
-    return std::tie(DFSIn, DFSOut, LocalNum, Val, U) <
-           std::tie(Other.DFSIn, Other.DFSOut, Other.LocalNum, Other.Val,
+    return std::tie(DFSIn, DFSOut, LocalNum, Def, U) <
+           std::tie(Other.DFSIn, Other.DFSOut, Other.LocalNum, Other.Def,
                     Other.U);
   }
 };
 
 // This function converts the set of members for a congruence class from values,
-// to sets of defs and uses with associated DFS info.
-void NewGVN::convertDenseToDFSOrdered(
-    const CongruenceClass::MemberSet &Dense,
-    SmallVectorImpl<ValueDFS> &DFSOrderedSet) {
+// to sets of defs and uses with associated DFS info.  The total number of
+// reachable uses for each value is stored in UseCounts, and instructions
+// that seem dead (i.e. have no remaining non-dead uses) are stored in
+// ProbablyDead.
+void NewGVN::convertClassToDFSOrdered(
+    const CongruenceClass &Dense, SmallVectorImpl<ValueDFS> &DFSOrderedSet,
+    DenseMap<const Value *, unsigned int> &UseCounts,
+    SmallPtrSetImpl<Instruction *> &ProbablyDead) const {
   for (auto D : Dense) {
     // First add the value.
     BasicBlock *BB = getBlockForValue(D);
     // Constants are handled prior to ever calling this function, so
     // we should only be left with instructions as members.
     assert(BB && "Should have figured out a basic block for value");
-    ValueDFS VD;
+    ValueDFS VDDef;
     DomTreeNode *DomNode = DT->getNode(BB);
-    VD.DFSIn = DomNode->getDFSNumIn();
-    VD.DFSOut = DomNode->getDFSNumOut();
-    // If it's a store, use the leader of the value operand.
+    VDDef.DFSIn = DomNode->getDFSNumIn();
+    VDDef.DFSOut = DomNode->getDFSNumOut();
+    // If it's a store, use the leader of the value operand, if it's always
+    // available, or the value operand.  TODO: We could do dominance checks to
+    // find a dominating leader, but not worth it ATM.
     if (auto *SI = dyn_cast<StoreInst>(D)) {
-      auto Leader =
-          lookupOperandLeader(SI->getValueOperand(), SI, SI->getParent());
-      VD.Val = alwaysAvailable(Leader) ? Leader : SI->getValueOperand();
+      auto Leader = lookupOperandLeader(SI->getValueOperand());
+      if (alwaysAvailable(Leader)) {
+        VDDef.Def.setPointer(Leader);
+      } else {
+        VDDef.Def.setPointer(SI->getValueOperand());
+        VDDef.Def.setInt(true);
+      }
     } else {
-      VD.Val = D;
+      VDDef.Def.setPointer(D);
     }
-
-    if (auto *I = dyn_cast<Instruction>(D))
-      VD.LocalNum = InstrDFS.lookup(I);
-    else
-      llvm_unreachable("Should have been an instruction");
-
-    DFSOrderedSet.emplace_back(VD);
-
+    assert(isa<Instruction>(D) &&
+           "The dense set member should always be an instruction");
+    VDDef.LocalNum = InstrToDFSNum(D);
+    DFSOrderedSet.emplace_back(VDDef);
+    Instruction *Def = cast<Instruction>(D);
+    unsigned int UseCount = 0;
     // Now add the uses.
-    for (auto &U : D->uses()) {
+    for (auto &U : Def->uses()) {
       if (auto *I = dyn_cast<Instruction>(U.getUser())) {
-        ValueDFS VD;
+        // Don't try to replace into dead uses
+        if (InstructionsToErase.count(I))
+          continue;
+        ValueDFS VDUse;
         // Put the phi node uses in the incoming block.
         BasicBlock *IBlock;
         if (auto *P = dyn_cast<PHINode>(I)) {
           IBlock = P->getIncomingBlock(U);
           // Make phi node users appear last in the incoming block
           // they are from.
-          VD.LocalNum = InstrDFS.size() + 1;
+          VDUse.LocalNum = InstrDFS.size() + 1;
         } else {
           IBlock = I->getParent();
-          VD.LocalNum = InstrDFS.lookup(I);
+          VDUse.LocalNum = InstrToDFSNum(I);
         }
 
         // Skip uses in unreachable blocks, as we're going
@@ -2034,20 +2765,29 @@ void NewGVN::convertDenseToDFSOrdered(
           continue;
 
         DomTreeNode *DomNode = DT->getNode(IBlock);
-        VD.DFSIn = DomNode->getDFSNumIn();
-        VD.DFSOut = DomNode->getDFSNumOut();
-        VD.U = &U;
-        DFSOrderedSet.emplace_back(VD);
+        VDUse.DFSIn = DomNode->getDFSNumIn();
+        VDUse.DFSOut = DomNode->getDFSNumOut();
+        VDUse.U = &U;
+        ++UseCount;
+        DFSOrderedSet.emplace_back(VDUse);
       }
     }
+
+    // If there are no uses, it's probably dead (but it may have side-effects,
+    // so not definitely dead).  Otherwise, store the number of uses so we can
+    // track if it becomes dead later.
+    if (UseCount == 0)
+      ProbablyDead.insert(Def);
+    else
+      UseCounts[Def] = UseCount;
   }
 }
 
 // This function converts the set of members for a congruence class from values,
 // to the set of defs for loads and stores, with associated DFS info.
-void NewGVN::convertDenseToLoadsAndStores(
-    const CongruenceClass::MemberSet &Dense,
-    SmallVectorImpl<ValueDFS> &LoadsAndStores) {
+void NewGVN::convertClassToLoadsAndStores(
+    const CongruenceClass &Dense,
+    SmallVectorImpl<ValueDFS> &LoadsAndStores) const {
   for (auto D : Dense) {
     if (!isa<LoadInst>(D) && !isa<StoreInst>(D))
       continue;
@@ -2057,11 +2797,11 @@ void NewGVN::convertDenseToLoadsAndStores(
     DomTreeNode *DomNode = DT->getNode(BB);
     VD.DFSIn = DomNode->getDFSNumIn();
     VD.DFSOut = DomNode->getDFSNumOut();
-    VD.Val = D;
+    VD.Def.setPointer(D);
 
     // If it's an instruction, use the real local dfs number.
     if (auto *I = dyn_cast<Instruction>(D))
-      VD.LocalNum = InstrDFS.lookup(I);
+      VD.LocalNum = InstrToDFSNum(I);
     else
       llvm_unreachable("Should have been an instruction");
 
@@ -2070,31 +2810,34 @@ void NewGVN::convertDenseToLoadsAndStores(
 }
 
 static void patchReplacementInstruction(Instruction *I, Value *Repl) {
+  auto *ReplInst = dyn_cast<Instruction>(Repl);
+  if (!ReplInst)
+    return;
+
   // Patch the replacement so that it is not more restrictive than the value
   // being replaced.
-  auto *Op = dyn_cast<BinaryOperator>(I);
-  auto *ReplOp = dyn_cast<BinaryOperator>(Repl);
-
-  if (Op && ReplOp)
-    ReplOp->andIRFlags(Op);
-
-  if (auto *ReplInst = dyn_cast<Instruction>(Repl)) {
-    // FIXME: If both the original and replacement value are part of the
-    // same control-flow region (meaning that the execution of one
-    // guarentees the executation of the other), then we can combine the
-    // noalias scopes here and do better than the general conservative
-    // answer used in combineMetadata().
-
-    // In general, GVN unifies expressions over different control-flow
-    // regions, and so we need a conservative combination of the noalias
-    // scopes.
-    unsigned KnownIDs[] = {
-        LLVMContext::MD_tbaa,           LLVMContext::MD_alias_scope,
-        LLVMContext::MD_noalias,        LLVMContext::MD_range,
-        LLVMContext::MD_fpmath,         LLVMContext::MD_invariant_load,
-        LLVMContext::MD_invariant_group};
-    combineMetadata(ReplInst, I, KnownIDs);
-  }
+  // Note that if 'I' is a load being replaced by some other operation,
+  // for example, by an arithmetic operation, then andIRFlags()
+  // would just erase all math flags from the replacing arithmetic
+  // operation, which is clearly not wanted and not needed.
+  if (!isa<LoadInst>(I))
+    ReplInst->andIRFlags(I);
+
+  // FIXME: If both the original and replacement value are part of the
+  // same control-flow region (meaning that the execution of one
+  // guarantees the execution of the other), then we can combine the
+  // noalias scopes here and do better than the general conservative
+  // answer used in combineMetadata().
+
+  // In general, GVN unifies expressions over different control-flow
+  // regions, and so we need a conservative combination of the noalias
+  // scopes.
+  static const unsigned KnownIDs[] = {
+      LLVMContext::MD_tbaa,           LLVMContext::MD_alias_scope,
+      LLVMContext::MD_noalias,        LLVMContext::MD_range,
+      LLVMContext::MD_fpmath,         LLVMContext::MD_invariant_load,
+      LLVMContext::MD_invariant_group};
+  combineMetadata(ReplInst, I, KnownIDs);
 }
 
 static void patchAndReplaceAllUsesWith(Instruction *I, Value *Repl) {
@@ -2228,54 +2971,57 @@ bool NewGVN::eliminateInstructions(Function &F) {
     }
   }
 
-  for (CongruenceClass *CC : CongruenceClasses) {
+  // Map to store the use counts
+  DenseMap<const Value *, unsigned int> UseCounts;
+  for (CongruenceClass *CC : reverse(CongruenceClasses)) {
     // Track the equivalent store info so we can decide whether to try
     // dead store elimination.
     SmallVector<ValueDFS, 8> PossibleDeadStores;
-
-    // FIXME: We should eventually be able to replace everything still
-    // in the initial class with undef, as they should be unreachable.
-    // Right now, initial still contains some things we skip value
-    // numbering of (UNREACHABLE's, for example).
-    if (CC == InitialClass || CC->Dead)
+    SmallPtrSet<Instruction *, 8> ProbablyDead;
+    if (CC->isDead() || CC->empty())
       continue;
-    assert(CC->RepLeader && "We should have had a leader");
+    // Everything still in the TOP class is unreachable or dead.
+    if (CC == TOPClass) {
+#ifndef NDEBUG
+      for (auto M : *CC)
+        assert((!ReachableBlocks.count(cast<Instruction>(M)->getParent()) ||
+                InstructionsToErase.count(cast<Instruction>(M))) &&
+               "Everything in TOP should be unreachable or dead at this "
+               "point");
+#endif
+      continue;
+    }
 
+    assert(CC->getLeader() && "We should have had a leader");
     // If this is a leader that is always available, and it's a
     // constant or has no equivalences, just replace everything with
     // it. We then update the congruence class with whatever members
     // are left.
-    Value *Leader = CC->RepStoredValue ? CC->RepStoredValue : CC->RepLeader;
+    Value *Leader =
+        CC->getStoredValue() ? CC->getStoredValue() : CC->getLeader();
     if (alwaysAvailable(Leader)) {
-      SmallPtrSet<Value *, 4> MembersLeft;
-      for (auto M : CC->Members) {
+      CongruenceClass::MemberSet MembersLeft;
+      for (auto M : *CC) {
         Value *Member = M;
         // Void things have no uses we can replace.
-        if (Member == CC->RepLeader || Member->getType()->isVoidTy()) {
+        if (Member == Leader || !isa<Instruction>(Member) ||
+            Member->getType()->isVoidTy()) {
           MembersLeft.insert(Member);
           continue;
         }
         DEBUG(dbgs() << "Found replacement " << *(Leader) << " for " << *Member
                      << "\n");
-        // Due to equality propagation, these may not always be
-        // instructions, they may be real values.  We don't really
-        // care about trying to replace the non-instructions.
-        if (auto *I = dyn_cast<Instruction>(Member)) {
-          assert(Leader != I && "About to accidentally remove our leader");
-          replaceInstruction(I, Leader);
-          AnythingReplaced = true;
-
-          continue;
-        } else {
-          MembersLeft.insert(I);
-        }
+        auto *I = cast<Instruction>(Member);
+        assert(Leader != I && "About to accidentally remove our leader");
+        replaceInstruction(I, Leader);
+        AnythingReplaced = true;
       }
-      CC->Members.swap(MembersLeft);
+      CC->swap(MembersLeft);
     } else {
-      DEBUG(dbgs() << "Eliminating in congruence class " << CC->ID << "\n");
+      DEBUG(dbgs() << "Eliminating in congruence class " << CC->getID()
+                   << "\n");
       // If this is a singleton, we can skip it.
-      if (CC->Members.size() != 1) {
-
+      if (CC->size() != 1) {
         // This is a stack because equality replacement/etc may place
         // constants in the middle of the member list, and we want to use
         // those constant values in preference to the current leader, over
@@ -2284,18 +3030,18 @@ bool NewGVN::eliminateInstructions(Function &F) {
 
         // Convert the members to DFS ordered sets and then merge them.
         SmallVector<ValueDFS, 8> DFSOrderedSet;
-        convertDenseToDFSOrdered(CC->Members, DFSOrderedSet);
+        convertClassToDFSOrdered(*CC, DFSOrderedSet, UseCounts, ProbablyDead);
 
         // Sort the whole thing.
         std::sort(DFSOrderedSet.begin(), DFSOrderedSet.end());
         for (auto &VD : DFSOrderedSet) {
           int MemberDFSIn = VD.DFSIn;
           int MemberDFSOut = VD.DFSOut;
-          Value *Member = VD.Val;
-          Use *MemberUse = VD.U;
-
+          Value *Def = VD.Def.getPointer();
+          bool FromStore = VD.Def.getInt();
+          Use *U = VD.U;
           // We ignore void things because we can't get a value from them.
-          if (Member && Member->getType()->isVoidTy())
+          if (Def && Def->getType()->isVoidTy())
             continue;
 
           if (EliminationStack.empty()) {
@@ -2321,78 +3067,115 @@ bool NewGVN::eliminateInstructions(Function &F) {
           // start using, we also push.
           // Otherwise, we walk along, processing members who are
           // dominated by this scope, and eliminate them.
-          bool ShouldPush =
-              Member && (EliminationStack.empty() || isa<Constant>(Member));
+          bool ShouldPush = Def && EliminationStack.empty();
           bool OutOfScope =
               !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut);
 
           if (OutOfScope || ShouldPush) {
             // Sync to our current scope.
             EliminationStack.popUntilDFSScope(MemberDFSIn, MemberDFSOut);
-            ShouldPush |= Member && EliminationStack.empty();
+            bool ShouldPush = Def && EliminationStack.empty();
             if (ShouldPush) {
-              EliminationStack.push_back(Member, MemberDFSIn, MemberDFSOut);
+              EliminationStack.push_back(Def, MemberDFSIn, MemberDFSOut);
+            }
+          }
+
+          // Skip the Def's, we only want to eliminate on their uses.  But mark
+          // dominated defs as dead.
+          if (Def) {
+            // For anything in this case, what and how we value number
+            // guarantees that any side-effects that would have occurred (ie
+            // throwing, etc) can be proven to either still occur (because it's
+            // dominated by something that has the same side-effects), or never
+            // occur.  Otherwise, we would not have been able to prove it value
+            // equivalent to something else. For these things, we can just mark
+            // it all dead.  Note that this is different from the "ProbablyDead"
+            // set, which may not be dominated by anything, and thus, are only
+            // easy to prove dead if they are also side-effect free. Note that
+            // because stores are put in terms of the stored value, we skip
+            // stored values here. If the stored value is really dead, it will
+            // still be marked for deletion when we process it in its own class.
+            if (!EliminationStack.empty() && Def != EliminationStack.back() &&
+                isa<Instruction>(Def) && !FromStore)
+              markInstructionForDeletion(cast<Instruction>(Def));
+            continue;
+          }
+          // At this point, we know it is a Use we are trying to possibly
+          // replace.
+
+          assert(isa<Instruction>(U->get()) &&
+                 "Current def should have been an instruction");
+          assert(isa<Instruction>(U->getUser()) &&
+                 "Current user should have been an instruction");
+
+          // If the thing we are replacing into is already marked to be dead,
+          // this use is dead.  Note that this is true regardless of whether
+          // we have anything dominating the use or not.  We do this here
+          // because we are already walking all the uses anyway.
+          Instruction *InstUse = cast<Instruction>(U->getUser());
+          if (InstructionsToErase.count(InstUse)) {
+            auto &UseCount = UseCounts[U->get()];
+            if (--UseCount == 0) {
+              ProbablyDead.insert(cast<Instruction>(U->get()));
             }
           }
 
           // If we get to this point, and the stack is empty we must have a use
-          // with nothing we can use to eliminate it, just skip it.
+          // with nothing we can use to eliminate this use, so just skip it.
           if (EliminationStack.empty())
             continue;
 
-          // Skip the Value's, we only want to eliminate on their uses.
-          if (Member)
-            continue;
-          Value *Result = EliminationStack.back();
+          Value *DominatingLeader = EliminationStack.back();
 
           // Don't replace our existing users with ourselves.
-          if (MemberUse->get() == Result)
+          if (U->get() == DominatingLeader)
             continue;
-
-          DEBUG(dbgs() << "Found replacement " << *Result << " for "
-                       << *MemberUse->get() << " in " << *(MemberUse->getUser())
-                       << "\n");
+          DEBUG(dbgs() << "Found replacement " << *DominatingLeader << " for "
+                       << *U->get() << " in " << *(U->getUser()) << "\n");
 
           // If we replaced something in an instruction, handle the patching of
-          // metadata.
-          if (auto *ReplacedInst = dyn_cast<Instruction>(MemberUse->get()))
-            patchReplacementInstruction(ReplacedInst, Result);
-
-          assert(isa<Instruction>(MemberUse->getUser()));
-          MemberUse->set(Result);
+          // metadata.  Skip this if we are replacing predicateinfo with its
+          // original operand, as we already know we can just drop it.
+          auto *ReplacedInst = cast<Instruction>(U->get());
+          auto *PI = PredInfo->getPredicateInfoFor(ReplacedInst);
+          if (!PI || DominatingLeader != PI->OriginalOp)
+            patchReplacementInstruction(ReplacedInst, DominatingLeader);
+          U->set(DominatingLeader);
+          // This is now a use of the dominating leader, which means if the
+          // dominating leader was dead, it's now live!
+          auto &LeaderUseCount = UseCounts[DominatingLeader];
+          // It's about to be alive again.
+          if (LeaderUseCount == 0 && isa<Instruction>(DominatingLeader))
+            ProbablyDead.erase(cast<Instruction>(DominatingLeader));
+          ++LeaderUseCount;
           AnythingReplaced = true;
         }
       }
     }
 
+    // At this point, anything still in the ProbablyDead set is actually dead if
+    // it would be trivially dead.
+    for (auto *I : ProbablyDead)
+      if (wouldInstructionBeTriviallyDead(I))
+        markInstructionForDeletion(I);
+
     // Cleanup the congruence class.
-    SmallPtrSet<Value *, 4> MembersLeft;
-    for (Value *Member : CC->Members) {
-      if (Member->getType()->isVoidTy()) {
+    CongruenceClass::MemberSet MembersLeft;
+    for (auto *Member : *CC)
+      if (!isa<Instruction>(Member) ||
+          !InstructionsToErase.count(cast<Instruction>(Member)))
         MembersLeft.insert(Member);
-        continue;
-      }
-
-      if (auto *MemberInst = dyn_cast<Instruction>(Member)) {
-        if (isInstructionTriviallyDead(MemberInst)) {
-          // TODO: Don't mark loads of undefs.
-          markInstructionForDeletion(MemberInst);
-          continue;
-        }
-      }
-      MembersLeft.insert(Member);
-    }
-    CC->Members.swap(MembersLeft);
+    CC->swap(MembersLeft);
 
     // If we have possible dead stores to look at, try to eliminate them.
-    if (CC->StoreCount > 0) {
-      convertDenseToLoadsAndStores(CC->Members, PossibleDeadStores);
+    if (CC->getStoreCount() > 0) {
+      convertClassToLoadsAndStores(*CC, PossibleDeadStores);
       std::sort(PossibleDeadStores.begin(), PossibleDeadStores.end());
       ValueDFSStack EliminationStack;
       for (auto &VD : PossibleDeadStores) {
         int MemberDFSIn = VD.DFSIn;
         int MemberDFSOut = VD.DFSOut;
-        Instruction *Member = cast<Instruction>(VD.Val);
+        Instruction *Member = cast<Instruction>(VD.Def.getPointer());
         if (EliminationStack.empty() ||
             !EliminationStack.isInScope(MemberDFSIn, MemberDFSOut)) {
           // Sync to our current scope.
@@ -2413,7 +3196,7 @@ bool NewGVN::eliminateInstructions(Function &F) {
         DEBUG(dbgs() << "Marking dead store " << *Member
                      << " that is dominated by " << *Leader << "\n");
         markInstructionForDeletion(Member);
-        CC->Members.erase(Member);
+        CC->erase(Member);
         ++NumGVNDeadStores;
       }
     }
@@ -2421,3 +3204,103 @@ bool NewGVN::eliminateInstructions(Function &F) {
 
   return AnythingReplaced;
 }
+
+// This function provides global ranking of operations so that we can place them
+// in a canonical order.  Note that rank alone is not necessarily enough for a
+// complete ordering, as constants all have the same rank.  However, generally,
+// we will simplify an operation with all constants so that it doesn't matter
+// what order they appear in.
+unsigned int NewGVN::getRank(const Value *V) const {
+  // Prefer undef to anything else
+  if (isa<UndefValue>(V))
+    return 0;
+  if (isa<Constant>(V))
+    return 1;
+  else if (auto *A = dyn_cast<Argument>(V))
+    return 2 + A->getArgNo();
+
+  // Need to shift the instruction DFS by number of arguments + 3 to account for
+  // the constant and argument ranking above.
+  unsigned Result = InstrToDFSNum(V);
+  if (Result > 0)
+    return 3 + NumFuncArgs + Result;
+  // Unreachable or something else, just return a really large number.
+  return ~0;
+}
+
+// This is a function that says whether two commutative operations should
+// have their order swapped when canonicalizing.
+bool NewGVN::shouldSwapOperands(const Value *A, const Value *B) const {
+  // Because we only care about a total ordering, and don't rewrite expressions
+  // in this order, we order by rank, which will give a strict weak ordering to
+  // everything but constants, and then we order by pointer address.
+  return std::make_pair(getRank(A), A) > std::make_pair(getRank(B), B);
+}
+
+class NewGVNLegacyPass : public FunctionPass {
+public:
+  static char ID; // Pass identification, replacement for typeid.
+  NewGVNLegacyPass() : FunctionPass(ID) {
+    initializeNewGVNLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnFunction(Function &F) override;
+
+private:
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AssumptionCacheTracker>();
+    AU.addRequired<DominatorTreeWrapperPass>();
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<MemorySSAWrapperPass>();
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addPreserved<GlobalsAAWrapperPass>();
+  }
+};
+
+bool NewGVNLegacyPass::runOnFunction(Function &F) {
+  if (skipFunction(F))
+    return false;
+  return NewGVN(F, &getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
+                &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F),
+                &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(),
+                &getAnalysis<AAResultsWrapperPass>().getAAResults(),
+                &getAnalysis<MemorySSAWrapperPass>().getMSSA(),
+                F.getParent()->getDataLayout())
+      .runGVN();
+}
+
+INITIALIZE_PASS_BEGIN(NewGVNLegacyPass, "newgvn", "Global Value Numbering",
+                      false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_END(NewGVNLegacyPass, "newgvn", "Global Value Numbering", false,
+                    false)
+
+char NewGVNLegacyPass::ID = 0;
+
+// createNewGVNPass - The public interface to this file.
+FunctionPass *llvm::createNewGVNPass() { return new NewGVNLegacyPass(); }
+
+PreservedAnalyses NewGVNPass::run(Function &F, AnalysisManager<Function> &AM) {
+  // Apparently the order in which we get these results matters for
+  // the old GVN (see Chandler's comment in GVN.cpp). I'll keep
+  // the same order here, just in case.
+  auto &AC = AM.getResult<AssumptionAnalysis>(F);
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+  auto &AA = AM.getResult<AAManager>(F);
+  auto &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+  bool Changed =
+      NewGVN(F, &DT, &AC, &TLI, &AA, &MSSA, F.getParent()->getDataLayout())
+          .runGVN();
+  if (!Changed)
+    return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  PA.preserve<DominatorTreeAnalysis>();
+  PA.preserve<GlobalsAA>();
+  return PA;
+}
diff --git a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
index 5494356a60b7bdf19550c24942b62e37c9b65d23..1bfecea2f61e6f263cfaf7e8013236f34db6d4b1 100644
--- a/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
+++ b/lib/Transforms/Scalar/PartiallyInlineLibCalls.cpp
@@ -66,7 +66,7 @@ static bool optimizeSQRT(CallInst *Call, Function *CalledFunc,
   // Add attribute "readnone" so that backend can use a native sqrt instruction
   // for this call. Insert a FP compare instruction and a conditional branch
   // at the end of CurrBB.
-  Call->addAttribute(AttributeSet::FunctionIndex, Attribute::ReadNone);
+  Call->addAttribute(AttributeList::FunctionIndex, Attribute::ReadNone);
   CurrBB.getTerminator()->eraseFromParent();
   Builder.SetInsertPoint(&CurrBB);
   Value *FCmp = Builder.CreateFCmpOEQ(Call, Call);
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
index f5b1a4428ee422847361467817441488b6dd8af7..3dcab609078960115a4bcac956f02a1a1d751e03 100644
--- a/lib/Transforms/Scalar/Reassociate.cpp
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -1069,8 +1069,7 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
 ///
 /// Ops is the top-level list of add operands we're trying to factor.
 static void FindSingleUseMultiplyFactors(Value *V,
-                                         SmallVectorImpl<Value*> &Factors,
-                                       const SmallVectorImpl<ValueEntry> &Ops) {
+                                         SmallVectorImpl<Value*> &Factors) {
   BinaryOperator *BO = isReassociableOp(V, Instruction::Mul, Instruction::FMul);
   if (!BO) {
     Factors.push_back(V);
@@ -1078,8 +1077,8 @@ static void FindSingleUseMultiplyFactors(Value *V,
   }
 
   // Otherwise, add the LHS and RHS to the list of factors.
-  FindSingleUseMultiplyFactors(BO->getOperand(1), Factors, Ops);
-  FindSingleUseMultiplyFactors(BO->getOperand(0), Factors, Ops);
+  FindSingleUseMultiplyFactors(BO->getOperand(1), Factors);
+  FindSingleUseMultiplyFactors(BO->getOperand(0), Factors);
 }
 
 /// Optimize a series of operands to an 'and', 'or', or 'xor' instruction.
@@ -1499,7 +1498,7 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
 
     // Compute all of the factors of this added value.
     SmallVector<Value*, 8> Factors;
-    FindSingleUseMultiplyFactors(BOp, Factors, Ops);
+    FindSingleUseMultiplyFactors(BOp, Factors);
     assert(Factors.size() > 1 && "Bad linearize!");
 
     // Add one to FactorOccurrences for each unique factor in this op.
@@ -1521,8 +1520,8 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
       if (ConstantInt *CI = dyn_cast<ConstantInt>(Factor)) {
         if (CI->isNegative() && !CI->isMinValue(true)) {
           Factor = ConstantInt::get(CI->getContext(), -CI->getValue());
-          assert(!Duplicates.count(Factor) &&
-                 "Shouldn't have two constant factors, missed a canonicalize");
+          if (!Duplicates.insert(Factor).second)
+            continue;
           unsigned Occ = ++FactorOccurrences[Factor];
           if (Occ > MaxOcc) {
             MaxOcc = Occ;
@@ -1534,8 +1533,8 @@ Value *ReassociatePass::OptimizeAdd(Instruction *I,
           APFloat F(CF->getValueAPF());
           F.changeSign();
           Factor = ConstantFP::get(CF->getContext(), F);
-          assert(!Duplicates.count(Factor) &&
-                 "Shouldn't have two constant factors, missed a canonicalize");
+          if (!Duplicates.insert(Factor).second)
+            continue;
           unsigned Occ = ++FactorOccurrences[Factor];
           if (Occ > MaxOcc) {
             MaxOcc = Occ;
diff --git a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index 13c3549c238b74affed6e6eb10aec5ca033f4fef..f344eb151464a6c82c5deab93256c8fa4c03afd3 100644
--- a/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -365,6 +365,11 @@ findBaseDefiningValueOfVector(Value *I) {
     // for particular sufflevector patterns.
     return BaseDefiningValueResult(I, false);
 
+  // The behavior of getelementptr instructions is the same for vector and
+  // non-vector data types.
+  if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
+    return findBaseDefiningValue(GEP->getPointerOperand());
+
   // A PHI or Select is a base defining value.  The outer findBasePointer
   // algorithm is responsible for constructing a base value for this BDV.
   assert((isa<SelectInst>(I) || isa<PHINode>(I)) &&
@@ -1123,14 +1128,14 @@ normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent,
 
 // Create new attribute set containing only attributes which can be transferred
 // from original call to the safepoint.
-static AttributeSet legalizeCallAttributes(AttributeSet AS) {
-  AttributeSet Ret;
+static AttributeList legalizeCallAttributes(AttributeList AS) {
+  AttributeList Ret;
 
   for (unsigned Slot = 0; Slot < AS.getNumSlots(); Slot++) {
     unsigned Index = AS.getSlotIndex(Slot);
 
-    if (Index == AttributeSet::ReturnIndex ||
-        Index == AttributeSet::FunctionIndex) {
+    if (Index == AttributeList::ReturnIndex ||
+        Index == AttributeList::FunctionIndex) {
 
       for (Attribute Attr : make_range(AS.begin(Slot), AS.end(Slot))) {
 
@@ -1148,7 +1153,7 @@ static AttributeSet legalizeCallAttributes(AttributeSet AS) {
 
         Ret = Ret.addAttributes(
             AS.getContext(), Index,
-            AttributeSet::get(AS.getContext(), Index, AttrBuilder(Attr)));
+            AttributeList::get(AS.getContext(), Index, AttrBuilder(Attr)));
       }
     }
 
@@ -1299,12 +1304,11 @@ static StringRef getDeoptLowering(CallSite CS) {
   const char *DeoptLowering = "deopt-lowering";
   if (CS.hasFnAttr(DeoptLowering)) {
     // FIXME: CallSite has a *really* confusing interface around attributes
-    // with values.  
-    const AttributeSet &CSAS = CS.getAttributes();
-    if (CSAS.hasAttribute(AttributeSet::FunctionIndex,
-                          DeoptLowering))
-      return CSAS.getAttribute(AttributeSet::FunctionIndex,
-                               DeoptLowering).getValueAsString();
+    // with values.
+    const AttributeList &CSAS = CS.getAttributes();
+    if (CSAS.hasAttribute(AttributeList::FunctionIndex, DeoptLowering))
+      return CSAS.getAttribute(AttributeList::FunctionIndex, DeoptLowering)
+          .getValueAsString();
     Function *F = CS.getCalledFunction();
     assert(F && F->hasFnAttribute(DeoptLowering));
     return F->getFnAttribute(DeoptLowering).getValueAsString();
@@ -1388,7 +1392,6 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
 
   // Create the statepoint given all the arguments
   Instruction *Token = nullptr;
-  AttributeSet ReturnAttrs;
   if (CS.isCall()) {
     CallInst *ToReplace = cast<CallInst>(CS.getInstruction());
     CallInst *Call = Builder.CreateGCStatepointCall(
@@ -1400,11 +1403,12 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
 
     // Currently we will fail on parameter attributes and on certain
     // function attributes.
-    AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
+    AttributeList NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
     // In case if we can handle this set of attributes - set up function attrs
     // directly on statepoint and return attrs later for gc_result intrinsic.
-    Call->setAttributes(NewAttrs.getFnAttributes());
-    ReturnAttrs = NewAttrs.getRetAttributes();
+    Call->setAttributes(AttributeList::get(Call->getContext(),
+                                           AttributeList::FunctionIndex,
+                                           NewAttrs.getFnAttributes()));
 
     Token = Call;
 
@@ -1428,11 +1432,12 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
 
     // Currently we will fail on parameter attributes and on certain
     // function attributes.
-    AttributeSet NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
+    AttributeList NewAttrs = legalizeCallAttributes(ToReplace->getAttributes());
     // In case if we can handle this set of attributes - set up function attrs
     // directly on statepoint and return attrs later for gc_result intrinsic.
-    Invoke->setAttributes(NewAttrs.getFnAttributes());
-    ReturnAttrs = NewAttrs.getRetAttributes();
+    Invoke->setAttributes(AttributeList::get(Invoke->getContext(),
+                                             AttributeList::FunctionIndex,
+                                             NewAttrs.getFnAttributes()));
 
     Token = Invoke;
 
@@ -1478,7 +1483,9 @@ makeStatepointExplicitImpl(const CallSite CS, /* to replace */
       StringRef Name =
           CS.getInstruction()->hasName() ? CS.getInstruction()->getName() : "";
       CallInst *GCResult = Builder.CreateGCResult(Token, CS.getType(), Name);
-      GCResult->setAttributes(CS.getAttributes().getRetAttributes());
+      GCResult->setAttributes(
+          AttributeList::get(GCResult->getContext(), AttributeList::ReturnIndex,
+                             CS.getAttributes().getRetAttributes()));
 
       // We cannot RAUW or delete CS.getInstruction() because it could be in the
       // live set of some other safepoint, in which case that safepoint's
@@ -1615,8 +1622,10 @@ static void relocationViaAlloca(
 
   // Emit alloca for "LiveValue" and record it in "allocaMap" and
   // "PromotableAllocas"
+  const DataLayout &DL = F.getParent()->getDataLayout();
   auto emitAllocaFor = [&](Value *LiveValue) {
-    AllocaInst *Alloca = new AllocaInst(LiveValue->getType(), "",
+    AllocaInst *Alloca = new AllocaInst(LiveValue->getType(),
+                                        DL.getAllocaAddrSpace(), "",
                                         F.getEntryBlock().getFirstNonPHI());
     AllocaMap[LiveValue] = Alloca;
     PromotableAllocas.push_back(Alloca);
@@ -1873,7 +1882,7 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
              "non noop cast is found during rematerialization");
 
       Type *SrcTy = CI->getOperand(0)->getType();
-      Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy);
+      Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy, CI);
 
     } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
       // Cost of the address calculation
@@ -2304,7 +2313,7 @@ static void RemoveNonValidAttrAtIndex(LLVMContext &Ctx, AttrHolder &AH,
 
   if (!R.empty())
     AH.setAttributes(AH.getAttributes().removeAttributes(
-        Ctx, Index, AttributeSet::get(Ctx, Index, R)));
+        Ctx, Index, AttributeList::get(Ctx, Index, R)));
 }
 
 void
@@ -2316,7 +2325,7 @@ RewriteStatepointsForGC::stripNonValidAttributesFromPrototype(Function &F) {
       RemoveNonValidAttrAtIndex(Ctx, F, A.getArgNo() + 1);
 
   if (isa<PointerType>(F.getReturnType()))
-    RemoveNonValidAttrAtIndex(Ctx, F, AttributeSet::ReturnIndex);
+    RemoveNonValidAttrAtIndex(Ctx, F, AttributeList::ReturnIndex);
 }
 
 void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) {
@@ -2351,7 +2360,7 @@ void RewriteStatepointsForGC::stripNonValidAttributesFromBody(Function &F) {
         if (isa<PointerType>(CS.getArgument(i)->getType()))
           RemoveNonValidAttrAtIndex(Ctx, CS, i + 1);
       if (isa<PointerType>(CS.getType()))
-        RemoveNonValidAttrAtIndex(Ctx, CS, AttributeSet::ReturnIndex);
+        RemoveNonValidAttrAtIndex(Ctx, CS, AttributeList::ReturnIndex);
     }
   }
 }
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
index 334642fe35dd770235c2a5e4ca628bfcfe1d4a66..8908dae2f5459a42c843cbd326ee181a09484f8d 100644
--- a/lib/Transforms/Scalar/SCCP.cpp
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -140,6 +140,14 @@ public:
     return nullptr;
   }
 
+  /// getBlockAddress - If this is a constant with a BlockAddress value, return
+  /// it, otherwise return null.
+  BlockAddress *getBlockAddress() const {
+    if (isConstant())
+      return dyn_cast<BlockAddress>(getConstant());
+    return nullptr;
+  }
+
   void markForcedConstant(Constant *V) {
     assert(isUnknown() && "Can't force a defined value!");
     Val.setInt(forcedconstant);
@@ -306,20 +314,14 @@ public:
     return MRVFunctionsTracked;
   }
 
-  void markOverdefined(Value *V) {
-    assert(!V->getType()->isStructTy() &&
-           "structs should use markAnythingOverdefined");
-    markOverdefined(ValueState[V], V);
-  }
-
-  /// markAnythingOverdefined - Mark the specified value overdefined.  This
+  /// markOverdefined - Mark the specified value overdefined.  This
   /// works with both scalars and structs.
-  void markAnythingOverdefined(Value *V) {
+  void markOverdefined(Value *V) {
     if (auto *STy = dyn_cast<StructType>(V->getType()))
       for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
         markOverdefined(getStructValueState(V, i), V);
     else
-      markOverdefined(V);
+      markOverdefined(ValueState[V], V);
   }
 
   // isStructLatticeConstant - Return true if all the lattice values
@@ -513,12 +515,12 @@ private:
   void visitCmpInst(CmpInst &I);
   void visitExtractValueInst(ExtractValueInst &EVI);
   void visitInsertValueInst(InsertValueInst &IVI);
-  void visitLandingPadInst(LandingPadInst &I) { markAnythingOverdefined(&I); }
+  void visitLandingPadInst(LandingPadInst &I) { markOverdefined(&I); }
   void visitFuncletPadInst(FuncletPadInst &FPI) {
-    markAnythingOverdefined(&FPI);
+    markOverdefined(&FPI);
   }
   void visitCatchSwitchInst(CatchSwitchInst &CPI) {
-    markAnythingOverdefined(&CPI);
+    markOverdefined(&CPI);
     visitTerminatorInst(CPI);
   }
 
@@ -538,16 +540,16 @@ private:
   void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
   void visitFenceInst     (FenceInst &I) { /*returns void*/ }
   void visitAtomicCmpXchgInst(AtomicCmpXchgInst &I) {
-    markAnythingOverdefined(&I);
+    markOverdefined(&I);
   }
   void visitAtomicRMWInst (AtomicRMWInst &I) { markOverdefined(&I); }
   void visitAllocaInst    (Instruction &I) { markOverdefined(&I); }
-  void visitVAArgInst     (Instruction &I) { markAnythingOverdefined(&I); }
+  void visitVAArgInst     (Instruction &I) { markOverdefined(&I); }
 
   void visitInstruction(Instruction &I) {
     // If a new instruction is added to LLVM that we don't handle.
     DEBUG(dbgs() << "SCCP: Don't know how to handle: " << I << '\n');
-    markAnythingOverdefined(&I);   // Just in case
+    markOverdefined(&I);   // Just in case
   }
 };
 
@@ -602,14 +604,36 @@ void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
       return;
     }
 
-    Succs[SI->findCaseValue(CI).getSuccessorIndex()] = true;
+    Succs[SI->findCaseValue(CI)->getSuccessorIndex()] = true;
     return;
   }
 
-  // TODO: This could be improved if the operand is a [cast of a] BlockAddress.
-  if (isa<IndirectBrInst>(&TI)) {
-    // Just mark all destinations executable!
-    Succs.assign(TI.getNumSuccessors(), true);
+  // In case of indirect branch and its address is a blockaddress, we mark
+  // the target as executable.
+  if (auto *IBR = dyn_cast<IndirectBrInst>(&TI)) {
+    // Casts are folded by visitCastInst.
+    LatticeVal IBRValue = getValueState(IBR->getAddress());
+    BlockAddress *Addr = IBRValue.getBlockAddress();
+    if (!Addr) {   // Overdefined or unknown condition?
+      // All destinations are executable!
+      if (!IBRValue.isUnknown())
+        Succs.assign(TI.getNumSuccessors(), true);
+      return;
+    }
+
+    BasicBlock* T = Addr->getBasicBlock();
+    assert(Addr->getFunction() == T->getParent() &&
+           "Block address of a different function ?");
+    for (unsigned i = 0; i < IBR->getNumSuccessors(); ++i) {
+      // This is the target.
+      if (IBR->getDestination(i) == T) {
+        Succs[i] = true;
+        return;
+      }
+    }
+
+    // If we didn't find our destination in the IBR successor list, then we
+    // have undefined behavior. It's OK to assume no successor is executable.
     return;
   }
 
@@ -659,13 +683,21 @@ bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
     if (!CI)
       return !SCValue.isUnknown();
 
-    return SI->findCaseValue(CI).getCaseSuccessor() == To;
+    return SI->findCaseValue(CI)->getCaseSuccessor() == To;
   }
 
-  // Just mark all destinations executable!
-  // TODO: This could be improved if the operand is a [cast of a] BlockAddress.
-  if (isa<IndirectBrInst>(TI))
-    return true;
+  // In case of indirect branch and its address is a blockaddress, we mark
+  // the target as executable.
+  if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
+    LatticeVal IBRValue = getValueState(IBR->getAddress());
+    BlockAddress *Addr = IBRValue.getBlockAddress();
+
+    if (!Addr)
+      return !IBRValue.isUnknown();
+
+    // At this point, the indirectbr is branching on a blockaddress.
+    return Addr->getBasicBlock() == To;
+  }
 
   DEBUG(dbgs() << "Unknown terminator instruction: " << *TI << '\n');
   llvm_unreachable("SCCP: Don't know how to handle this terminator!");
@@ -693,7 +725,7 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
   // If this PN returns a struct, just mark the result overdefined.
   // TODO: We could do a lot better than this if code actually uses this.
   if (PN.getType()->isStructTy())
-    return markAnythingOverdefined(&PN);
+    return markOverdefined(&PN);
 
   if (getValueState(&PN).isOverdefined())
     return;  // Quick exit
@@ -803,7 +835,7 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
   // If this returns a struct, mark all elements over defined, we don't track
   // structs in structs.
   if (EVI.getType()->isStructTy())
-    return markAnythingOverdefined(&EVI);
+    return markOverdefined(&EVI);
 
   // If this is extracting from more than one level of struct, we don't know.
   if (EVI.getNumIndices() != 1)
@@ -828,7 +860,7 @@ void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
   // If this has more than one index, we can't handle it, drive all results to
   // undef.
   if (IVI.getNumIndices() != 1)
-    return markAnythingOverdefined(&IVI);
+    return markOverdefined(&IVI);
 
   Value *Aggr = IVI.getAggregateOperand();
   unsigned Idx = *IVI.idx_begin();
@@ -857,7 +889,7 @@ void SCCPSolver::visitSelectInst(SelectInst &I) {
   // If this select returns a struct, just mark the result overdefined.
   // TODO: We could do a lot better than this if code actually uses this.
   if (I.getType()->isStructTy())
-    return markAnythingOverdefined(&I);
+    return markOverdefined(&I);
 
   LatticeVal CondValue = getValueState(I.getCondition());
   if (CondValue.isUnknown())
@@ -1028,7 +1060,7 @@ void SCCPSolver::visitStoreInst(StoreInst &SI) {
 void SCCPSolver::visitLoadInst(LoadInst &I) {
   // If this load is of a struct, just mark the result overdefined.
   if (I.getType()->isStructTy())
-    return markAnythingOverdefined(&I);
+    return markOverdefined(&I);
 
   LatticeVal PtrVal = getValueState(I.getOperand(0));
   if (PtrVal.isUnknown()) return;   // The pointer is not resolved yet!
@@ -1114,7 +1146,7 @@ CallOverdefined:
     }
 
     // Otherwise, we don't know anything about this call, mark it overdefined.
-    return markAnythingOverdefined(I);
+    return markOverdefined(I);
   }
 
   // If this is a local function that doesn't have its address taken, mark its
@@ -1490,6 +1522,31 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
       return true;
     }
 
+   if (auto *IBR = dyn_cast<IndirectBrInst>(TI)) {
+      // Indirect branch with no successors? It's OK to assume it branches
+      // to no target.
+      if (IBR->getNumSuccessors() < 1)
+        continue;
+
+      if (!getValueState(IBR->getAddress()).isUnknown())
+        continue;
+
+      // If the input to SCCP is actually branch on undef, fix the undef to
+      // the first successor of the indirect branch.
+      if (isa<UndefValue>(IBR->getAddress())) {
+        IBR->setAddress(BlockAddress::get(IBR->getSuccessor(0)));
+        markEdgeExecutable(&BB, IBR->getSuccessor(0));
+        return true;
+      }
+
+      // Otherwise, it is a branch on a symbolic value which is currently
+      // considered to be undef.  Handle this by forcing the input value to the
+      // branch to the first successor.
+      markForcedConstant(IBR->getAddress(),
+                         BlockAddress::get(IBR->getSuccessor(0)));
+      return true;
+    }
+
     if (auto *SI = dyn_cast<SwitchInst>(TI)) {
       if (!SI->getNumCases() || !getValueState(SI->getCondition()).isUnknown())
         continue;
@@ -1497,12 +1554,12 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
       // If the input to SCCP is actually switch on undef, fix the undef to
       // the first constant.
       if (isa<UndefValue>(SI->getCondition())) {
-        SI->setCondition(SI->case_begin().getCaseValue());
-        markEdgeExecutable(&BB, SI->case_begin().getCaseSuccessor());
+        SI->setCondition(SI->case_begin()->getCaseValue());
+        markEdgeExecutable(&BB, SI->case_begin()->getCaseSuccessor());
         return true;
       }
 
-      markForcedConstant(SI->getCondition(), SI->case_begin().getCaseValue());
+      markForcedConstant(SI->getCondition(), SI->case_begin()->getCaseValue());
       return true;
     }
   }
@@ -1552,7 +1609,7 @@ static bool runSCCP(Function &F, const DataLayout &DL,
 
   // Mark all arguments to the function as being overdefined.
   for (Argument &AI : F.args())
-    Solver.markAnythingOverdefined(&AI);
+    Solver.markOverdefined(&AI);
 
   // Solve for constants.
   bool ResolvedUndefs = true;
@@ -1712,7 +1769,10 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
 
     // If this is an exact definition of this function, then we can propagate
     // information about its result into callsites of it.
-    if (F.hasExactDefinition())
+    // Don't touch naked functions. They may contain asm returning a
+    // value we don't see, so we may end up interprocedurally propagating
+    // the return value incorrectly.
+    if (F.hasExactDefinition() && !F.hasFnAttribute(Attribute::Naked))
       Solver.AddTrackedFunction(&F);
 
     // If this function only has direct calls that we can see, we can track its
@@ -1732,7 +1792,7 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
 
     // Assume nothing about the incoming arguments.
     for (Argument &AI : F.args())
-      Solver.markAnythingOverdefined(&AI);
+      Solver.markOverdefined(&AI);
   }
 
   // Loop over global variables.  We inform the solver about any internal global
@@ -1821,32 +1881,9 @@ static bool runIPSCCP(Module &M, const DataLayout &DL,
         if (!I) continue;
 
         bool Folded = ConstantFoldTerminator(I->getParent());
-        if (!Folded) {
-          // The constant folder may not have been able to fold the terminator
-          // if this is a branch or switch on undef.  Fold it manually as a
-          // branch to the first successor.
-#ifndef NDEBUG
-          if (auto *BI = dyn_cast<BranchInst>(I)) {
-            assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) &&
-                   "Branch should be foldable!");
-          } else if (auto *SI = dyn_cast<SwitchInst>(I)) {
-            assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold");
-          } else {
-            llvm_unreachable("Didn't fold away reference to block!");
-          }
-#endif
-
-          // Make this an uncond branch to the first successor.
-          TerminatorInst *TI = I->getParent()->getTerminator();
-          BranchInst::Create(TI->getSuccessor(0), TI);
-
-          // Remove entries in successor phi nodes to remove edges.
-          for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
-            TI->getSuccessor(i)->removePredecessor(TI->getParent());
-
-          // Remove the old terminator.
-          TI->eraseFromParent();
-        }
+        assert(Folded &&
+              "Expect TermInst on constantint or blockaddress to be folded");
+        (void) Folded;
       }
 
       // Finally, delete the basic block.
diff --git a/lib/Transforms/Scalar/SROA.cpp b/lib/Transforms/Scalar/SROA.cpp
index 9b54679e47a48e37eafd205a70a60ecb0ebe2d35..d01e91a7f2356f4261193881a19b02605579372b 100644
--- a/lib/Transforms/Scalar/SROA.cpp
+++ b/lib/Transforms/Scalar/SROA.cpp
@@ -2295,7 +2295,8 @@ private:
 #endif
 
     return getAdjustedPtr(IRB, DL, &NewAI,
-                          APInt(DL.getPointerSizeInBits(), Offset), PointerTy,
+                          APInt(DL.getPointerTypeSizeInBits(PointerTy), Offset),
+                          PointerTy,
 #ifndef NDEBUG
                           Twine(OldName) + "."
 #else
@@ -2370,6 +2371,8 @@ private:
     Value *OldOp = LI.getOperand(0);
     assert(OldOp == OldPtr);
 
+    unsigned AS = LI.getPointerAddressSpace();
+
     Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
                              : LI.getType();
     const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize;
@@ -2388,6 +2391,10 @@ private:
                                               LI.isVolatile(), LI.getName());
       if (LI.isVolatile())
         NewLI->setAtomic(LI.getOrdering(), LI.getSynchScope());
+
+      // Try to preserve nonnull metadata
+      if (TargetTy->isPointerTy())
+        NewLI->copyMetadata(LI, LLVMContext::MD_nonnull);
       V = NewLI;
 
       // If this is an integer load past the end of the slice (which means the
@@ -2402,7 +2409,7 @@ private:
                                 "endian_shift");
           }
     } else {
-      Type *LTy = TargetTy->getPointerTo();
+      Type *LTy = TargetTy->getPointerTo(AS);
       LoadInst *NewLI = IRB.CreateAlignedLoad(getNewAllocaSlicePtr(IRB, LTy),
                                               getSliceAlign(TargetTy),
                                               LI.isVolatile(), LI.getName());
@@ -2430,7 +2437,7 @@ private:
       // the computed value, and then replace the placeholder with LI, leaving
       // LI only used for this computation.
       Value *Placeholder =
-          new LoadInst(UndefValue::get(LI.getType()->getPointerTo()));
+          new LoadInst(UndefValue::get(LI.getType()->getPointerTo(AS)));
       V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
                         "insert");
       LI.replaceAllUsesWith(V);
@@ -2543,7 +2550,8 @@ private:
       NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
                                      SI.isVolatile());
     } else {
-      Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo());
+      unsigned AS = SI.getPointerAddressSpace();
+      Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
       NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()),
                                      SI.isVolatile());
     }
@@ -3858,7 +3866,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
     if (Alignment <= DL.getABITypeAlignment(SliceTy))
       Alignment = 0;
     NewAI = new AllocaInst(
-        SliceTy, nullptr, Alignment,
+      SliceTy, AI.getType()->getAddressSpace(), nullptr, Alignment,
         AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
     ++NumNewAllocas;
   }
@@ -4185,7 +4193,7 @@ bool SROA::promoteAllocas(Function &F) {
   NumPromoted += PromotableAllocas.size();
 
   DEBUG(dbgs() << "Promoting allocas with mem2reg...\n");
-  PromoteMemToReg(PromotableAllocas, *DT, nullptr, AC);
+  PromoteMemToReg(PromotableAllocas, *DT, AC);
   PromotableAllocas.clear();
   return true;
 }
diff --git a/lib/Transforms/Scalar/Scalar.cpp b/lib/Transforms/Scalar/Scalar.cpp
index 002e125576b5f2b1f492b53efdaa501b69af843a..00e3c95f6f06dc7171efa3593b12a0d8e9ae65fe 100644
--- a/lib/Transforms/Scalar/Scalar.cpp
+++ b/lib/Transforms/Scalar/Scalar.cpp
@@ -43,7 +43,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeDSELegacyPassPass(Registry);
   initializeGuardWideningLegacyPassPass(Registry);
   initializeGVNLegacyPassPass(Registry);
-  initializeNewGVNPass(Registry);
+  initializeNewGVNLegacyPassPass(Registry);
   initializeEarlyCSELegacyPassPass(Registry);
   initializeEarlyCSEMemSSALegacyPassPass(Registry);
   initializeGVNHoistLegacyPassPass(Registry);
@@ -81,6 +81,7 @@ void llvm::initializeScalarOpts(PassRegistry &Registry) {
   initializeIPSCCPLegacyPassPass(Registry);
   initializeSROALegacyPassPass(Registry);
   initializeCFGSimplifyPassPass(Registry);
+  initializeLateCFGSimplifyPassPass(Registry);
   initializeStructurizeCFGPass(Registry);
   initializeSinkingLegacyPassPass(Registry);
   initializeTailCallElimPass(Registry);
@@ -117,6 +118,10 @@ void LLVMAddCFGSimplificationPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createCFGSimplificationPass());
 }
 
+void LLVMAddLateCFGSimplificationPass(LLVMPassManagerRef PM) {
+  unwrap(PM)->add(createLateCFGSimplificationPass());
+}
+
 void LLVMAddDeadStoreEliminationPass(LLVMPassManagerRef PM) {
   unwrap(PM)->add(createDeadStoreEliminationPass());
 }
diff --git a/lib/Transforms/Scalar/Scalarizer.cpp b/lib/Transforms/Scalar/Scalarizer.cpp
index 39969e27367f66be616f03074d41856f5a720aee..c0c09a7e43fe93abb054d4c70d4dcac1d8ff955f 100644
--- a/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/lib/Transforms/Scalar/Scalarizer.cpp
@@ -520,12 +520,25 @@ bool Scalarizer::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
   unsigned NumElems = VT->getNumElements();
   unsigned NumIndices = GEPI.getNumIndices();
 
-  Scatterer Base = scatter(&GEPI, GEPI.getOperand(0));
+  // The base pointer might be scalar even if it's a vector GEP. In those cases,
+  // splat the pointer into a vector value, and scatter that vector.
+  Value *Op0 = GEPI.getOperand(0);
+  if (!Op0->getType()->isVectorTy())
+    Op0 = Builder.CreateVectorSplat(NumElems, Op0);
+  Scatterer Base = scatter(&GEPI, Op0);
 
   SmallVector<Scatterer, 8> Ops;
   Ops.resize(NumIndices);
-  for (unsigned I = 0; I < NumIndices; ++I)
-    Ops[I] = scatter(&GEPI, GEPI.getOperand(I + 1));
+  for (unsigned I = 0; I < NumIndices; ++I) {
+    Value *Op = GEPI.getOperand(I + 1);
+
+    // The indices might be scalars even if it's a vector GEP. In those cases,
+    // splat the scalar into a vector value, and scatter that vector.
+    if (!Op->getType()->isVectorTy())
+      Op = Builder.CreateVectorSplat(NumElems, Op);
+
+    Ops[I] = scatter(&GEPI, Op);
+  }
 
   ValueVector Res;
   Res.resize(NumElems);
diff --git a/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index f2723bd7af82ec70e2d7e39545ffda416c7ad9f5..8754c714c5b285711f965f8cb0ef7cf3efe59067 100644
--- a/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -130,7 +130,8 @@ static bool mergeEmptyReturnBlocks(Function &F) {
 /// iterating until no more changes are made.
 static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
                                    AssumptionCache *AC,
-                                   unsigned BonusInstThreshold) {
+                                   unsigned BonusInstThreshold,
+                                   bool LateSimplifyCFG) {
   bool Changed = false;
   bool LocalChange = true;
 
@@ -145,7 +146,7 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
 
     // Loop over all of the basic blocks and remove them if they are unneeded.
     for (Function::iterator BBIt = F.begin(); BBIt != F.end(); ) {
-      if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC, &LoopHeaders)) {
+      if (SimplifyCFG(&*BBIt++, TTI, BonusInstThreshold, AC, &LoopHeaders, LateSimplifyCFG)) {
         LocalChange = true;
         ++NumSimpl;
       }
@@ -156,10 +157,12 @@ static bool iterativelySimplifyCFG(Function &F, const TargetTransformInfo &TTI,
 }
 
 static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
-                                AssumptionCache *AC, int BonusInstThreshold) {
+                                AssumptionCache *AC, int BonusInstThreshold,
+                                bool LateSimplifyCFG) {
   bool EverChanged = removeUnreachableBlocks(F);
   EverChanged |= mergeEmptyReturnBlocks(F);
-  EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold);
+  EverChanged |= iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold,
+                                        LateSimplifyCFG);
 
   // If neither pass changed anything, we're done.
   if (!EverChanged) return false;
@@ -173,7 +176,8 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
     return true;
 
   do {
-    EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold);
+    EverChanged = iterativelySimplifyCFG(F, TTI, AC, BonusInstThreshold,
+                                         LateSimplifyCFG);
     EverChanged |= removeUnreachableBlocks(F);
   } while (EverChanged);
 
@@ -181,17 +185,19 @@ static bool simplifyFunctionCFG(Function &F, const TargetTransformInfo &TTI,
 }
 
 SimplifyCFGPass::SimplifyCFGPass()
-    : BonusInstThreshold(UserBonusInstThreshold) {}
+    : BonusInstThreshold(UserBonusInstThreshold),
+      LateSimplifyCFG(true) {}
 
-SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold)
-    : BonusInstThreshold(BonusInstThreshold) {}
+SimplifyCFGPass::SimplifyCFGPass(int BonusInstThreshold, bool LateSimplifyCFG)
+    : BonusInstThreshold(BonusInstThreshold),
+      LateSimplifyCFG(LateSimplifyCFG) {}
 
 PreservedAnalyses SimplifyCFGPass::run(Function &F,
                                        FunctionAnalysisManager &AM) {
   auto &TTI = AM.getResult<TargetIRAnalysis>(F);
   auto &AC = AM.getResult<AssumptionAnalysis>(F);
 
-  if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold))
+  if (!simplifyFunctionCFG(F, TTI, &AC, BonusInstThreshold, LateSimplifyCFG))
     return PreservedAnalyses::all();
   PreservedAnalyses PA;
   PA.preserve<GlobalsAA>();
@@ -199,16 +205,17 @@ PreservedAnalyses SimplifyCFGPass::run(Function &F,
 }
 
 namespace {
-struct CFGSimplifyPass : public FunctionPass {
-  static char ID; // Pass identification, replacement for typeid
+struct BaseCFGSimplifyPass : public FunctionPass {
   unsigned BonusInstThreshold;
   std::function<bool(const Function &)> PredicateFtor;
+  bool LateSimplifyCFG;
 
-  CFGSimplifyPass(int T = -1,
-                  std::function<bool(const Function &)> Ftor = nullptr)
-      : FunctionPass(ID), PredicateFtor(std::move(Ftor)) {
+  BaseCFGSimplifyPass(int T, bool LateSimplifyCFG,
+                      std::function<bool(const Function &)> Ftor,
+                      char &ID)
+      : FunctionPass(ID), PredicateFtor(std::move(Ftor)),
+        LateSimplifyCFG(LateSimplifyCFG) {
     BonusInstThreshold = (T == -1) ? UserBonusInstThreshold : unsigned(T);
-    initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
   }
   bool runOnFunction(Function &F) override {
     if (skipFunction(F) || (PredicateFtor && !PredicateFtor(F)))
@@ -218,7 +225,7 @@ struct CFGSimplifyPass : public FunctionPass {
         &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
     const TargetTransformInfo &TTI =
         getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
-    return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold);
+    return simplifyFunctionCFG(F, TTI, AC, BonusInstThreshold, LateSimplifyCFG);
   }
 
   void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -227,6 +234,26 @@ struct CFGSimplifyPass : public FunctionPass {
     AU.addPreserved<GlobalsAAWrapperPass>();
   }
 };
+
+struct CFGSimplifyPass : public BaseCFGSimplifyPass {
+  static char ID; // Pass identification, replacement for typeid
+
+  CFGSimplifyPass(int T = -1,
+                  std::function<bool(const Function &)> Ftor = nullptr)
+      : BaseCFGSimplifyPass(T, /*LateSimplifyCFG=*/false, std::move(Ftor), ID) {
+    initializeCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+  }
+};
+
+struct LateCFGSimplifyPass : public BaseCFGSimplifyPass {
+  static char ID; // Pass identification, replacement for typeid
+
+  LateCFGSimplifyPass(int T = -1,
+                      std::function<bool(const Function &)> Ftor = nullptr)
+      : BaseCFGSimplifyPass(T, /*LateSimplifyCFG=*/true, std::move(Ftor), ID) {
+    initializeLateCFGSimplifyPassPass(*PassRegistry::getPassRegistry());
+  }
+};
 }
 
 char CFGSimplifyPass::ID = 0;
@@ -237,9 +264,24 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_END(CFGSimplifyPass, "simplifycfg", "Simplify the CFG", false,
                     false)
 
+char LateCFGSimplifyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(LateCFGSimplifyPass, "latesimplifycfg",
+                      "Simplify the CFG more aggressively", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(LateCFGSimplifyPass, "latesimplifycfg",
+                    "Simplify the CFG more aggressively", false, false)
+
 // Public interface to the CFGSimplification pass
 FunctionPass *
 llvm::createCFGSimplificationPass(int Threshold,
-                                  std::function<bool(const Function &)> Ftor) {
+    std::function<bool(const Function &)> Ftor) {
   return new CFGSimplifyPass(Threshold, std::move(Ftor));
 }
+
+// Public interface to the LateCFGSimplification pass
+FunctionPass *
+llvm::createLateCFGSimplificationPass(int Threshold,
+    std::function<bool(const Function &)> Ftor) {
+  return new LateCFGSimplifyPass(Threshold, std::move(Ftor));
+}
diff --git a/lib/Transforms/Scalar/Sink.cpp b/lib/Transforms/Scalar/Sink.cpp
index 504a22a229c89677151c2a9ef4779a36d13ca4e5..102e9eaeab772e8eb58be4454ae5f4706fe7c71b 100644
--- a/lib/Transforms/Scalar/Sink.cpp
+++ b/lib/Transforms/Scalar/Sink.cpp
@@ -164,13 +164,14 @@ static bool SinkInstruction(Instruction *Inst,
 
   // Instructions can only be sunk if all their uses are in blocks
   // dominated by one of the successors.
-  // Look at all the postdominators and see if we can sink it in one.
+  // Look at all the dominated blocks and see if we can sink it in one.
   DomTreeNode *DTN = DT.getNode(Inst->getParent());
   for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end();
       I != E && SuccToSinkTo == nullptr; ++I) {
     BasicBlock *Candidate = (*I)->getBlock();
-    if ((*I)->getIDom()->getBlock() == Inst->getParent() &&
-        IsAcceptableTarget(Inst, Candidate, DT, LI))
+    // A node always immediate-dominates its children on the dominator
+    // tree.
+    if (IsAcceptableTarget(Inst, Candidate, DT, LI))
       SuccToSinkTo = Candidate;
   }
 
diff --git a/lib/Transforms/Utils/AddDiscriminators.cpp b/lib/Transforms/Utils/AddDiscriminators.cpp
index 2e95926c0b3f5c6ed6905a3b6fa20c047cf5901d..4c9746b8c691e4fb9623c66a672118347312c7bf 100644
--- a/lib/Transforms/Utils/AddDiscriminators.cpp
+++ b/lib/Transforms/Utils/AddDiscriminators.cpp
@@ -102,6 +102,10 @@ FunctionPass *llvm::createAddDiscriminatorsPass() {
   return new AddDiscriminatorsLegacyPass();
 }
 
+static bool shouldHaveDiscriminator(const Instruction *I) {
+  return !isa<IntrinsicInst>(I) || isa<MemIntrinsic>(I);
+}
+
 /// \brief Assign DWARF discriminators.
 ///
 /// To assign discriminators, we examine the boundaries of every
@@ -176,7 +180,13 @@ static bool addDiscriminators(Function &F) {
   // discriminator for this instruction.
   for (BasicBlock &B : F) {
     for (auto &I : B.getInstList()) {
-      if (isa<IntrinsicInst>(&I))
+      // Not all intrinsic calls should have a discriminator.
+      // We want to avoid a non-deterministic assignment of discriminators at
+      // different debug levels. We still allow discriminators on memory
+      // intrinsic calls because those can be early expanded by SROA into
+      // pairs of loads and stores, and the expanded load/store instructions
+      // should have a valid discriminator.
+      if (!shouldHaveDiscriminator(&I))
         continue;
       const DILocation *DIL = I.getDebugLoc();
       if (!DIL)
@@ -190,8 +200,8 @@ static bool addDiscriminators(Function &F) {
       // discriminator is needed to distinguish both instructions.
       // Only the lowest 7 bits are used to represent a discriminator to fit
       // it in 1 byte ULEB128 representation.
-      unsigned Discriminator = (R.second ? ++LDM[L] : LDM[L]) & 0x7f;
-      I.setDebugLoc(DIL->cloneWithDiscriminator(Discriminator));
+      unsigned Discriminator = R.second ? ++LDM[L] : LDM[L];
+      I.setDebugLoc(DIL->setBaseDiscriminator(Discriminator));
       DEBUG(dbgs() << DIL->getFilename() << ":" << DIL->getLine() << ":"
                    << DIL->getColumn() << ":" << Discriminator << " " << I
                    << "\n");
@@ -207,6 +217,10 @@ static bool addDiscriminators(Function &F) {
     LocationSet CallLocations;
     for (auto &I : B.getInstList()) {
       CallInst *Current = dyn_cast<CallInst>(&I);
+      // We bypass intrinsic calls for the following two reasons:
+      //  1) We want to avoid a non-deterministic assignment of
+      //     discriminators.
+      //  2) We want to minimize the number of base discriminators used.
       if (!Current || isa<IntrinsicInst>(&I))
         continue;
 
@@ -216,8 +230,8 @@ static bool addDiscriminators(Function &F) {
       Location L =
           std::make_pair(CurrentDIL->getFilename(), CurrentDIL->getLine());
       if (!CallLocations.insert(L).second) {
-        Current->setDebugLoc(
-            CurrentDIL->cloneWithDiscriminator((++LDM[L]) & 0x7f));
+        unsigned Discriminator = ++LDM[L];
+        Current->setDebugLoc(CurrentDIL->setBaseDiscriminator(Discriminator));
         Changed = true;
       }
     }
diff --git a/lib/Transforms/Utils/BasicBlockUtils.cpp b/lib/Transforms/Utils/BasicBlockUtils.cpp
index b90349d3cdad1e79f55ed024bcfbc679411e4ed2..22af21d55c019767ddb6b6030b6b1ad01e4030d5 100644
--- a/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -438,7 +438,7 @@ BasicBlock *llvm::SplitBlockPredecessors(BasicBlock *BB,
 
   // The new block unconditionally branches to the old block.
   BranchInst *BI = BranchInst::Create(BB, NewBB);
-  BI->setDebugLoc(BB->getFirstNonPHI()->getDebugLoc());
+  BI->setDebugLoc(BB->getFirstNonPHIOrDbg()->getDebugLoc());
 
   // Move the edges from Preds to point to NewBB instead of BB.
   for (unsigned i = 0, e = Preds.size(); i != e; ++i) {
@@ -646,9 +646,10 @@ llvm::SplitBlockAndInsertIfThen(Value *Cond, Instruction *SplitBefore,
   }
 
   if (LI) {
-    Loop *L = LI->getLoopFor(Head);
-    L->addBasicBlockToLoop(ThenBlock, *LI);
-    L->addBasicBlockToLoop(Tail, *LI);
+    if (Loop *L = LI->getLoopFor(Head)) {
+      L->addBasicBlockToLoop(ThenBlock, *LI);
+      L->addBasicBlockToLoop(Tail, *LI);
+    }
   }
 
   return CheckTerm;
diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp
index 4f6bfcfe5245e2e7adb40ab6bc5043b138477339..6cd9f1614991afa37fd0322b4f781dd24b32d166 100644
--- a/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -96,9 +96,9 @@ static bool setDoesNotAlias(Function &F, unsigned n) {
 }
 
 static bool setNonNull(Function &F, unsigned n) {
-  assert((n != AttributeSet::ReturnIndex ||
-          F.getReturnType()->isPointerTy()) &&
-         "nonnull applies only to pointers");
+  assert(
+      (n != AttributeList::ReturnIndex || F.getReturnType()->isPointerTy()) &&
+      "nonnull applies only to pointers");
   if (F.getAttributes().hasAttribute(n, Attribute::NonNull))
     return false;
   F.addAttribute(n, Attribute::NonNull);
@@ -683,8 +683,8 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
   case LibFunc_msvc_new_array_int: // new[](unsigned int)
   case LibFunc_msvc_new_array_longlong: // new[](unsigned long long)
     // Operator new always returns a nonnull noalias pointer
-    Changed |= setNonNull(F, AttributeSet::ReturnIndex);
-    Changed |= setDoesNotAlias(F, AttributeSet::ReturnIndex);
+    Changed |= setNonNull(F, AttributeList::ReturnIndex);
+    Changed |= setDoesNotAlias(F, AttributeList::ReturnIndex);
     return Changed;
   //TODO: add LibFunc entries for:
   //case LibFunc_memset_pattern4:
@@ -723,7 +723,7 @@ Value *llvm::emitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL,
   Module *M = B.GetInsertBlock()->getModule();
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   Constant *StrLen = M->getOrInsertFunction("strlen", DL.getIntPtrType(Context),
-                                            B.getInt8PtrTy(), nullptr);
+                                            B.getInt8PtrTy());
   inferLibFuncAttributes(*M->getFunction("strlen"), *TLI);
   CallInst *CI = B.CreateCall(StrLen, castToCStr(Ptr, B), "strlen");
   if (const Function *F = dyn_cast<Function>(StrLen->stripPointerCasts()))
@@ -741,7 +741,7 @@ Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilder<> &B,
   Type *I8Ptr = B.getInt8PtrTy();
   Type *I32Ty = B.getInt32Ty();
   Constant *StrChr =
-      M->getOrInsertFunction("strchr", I8Ptr, I8Ptr, I32Ty, nullptr);
+      M->getOrInsertFunction("strchr", I8Ptr, I8Ptr, I32Ty);
   inferLibFuncAttributes(*M->getFunction("strchr"), *TLI);
   CallInst *CI = B.CreateCall(
       StrChr, {castToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, "strchr");
@@ -759,7 +759,7 @@ Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   Value *StrNCmp = M->getOrInsertFunction("strncmp", B.getInt32Ty(),
                                           B.getInt8PtrTy(), B.getInt8PtrTy(),
-                                          DL.getIntPtrType(Context), nullptr);
+                                          DL.getIntPtrType(Context));
   inferLibFuncAttributes(*M->getFunction("strncmp"), *TLI);
   CallInst *CI = B.CreateCall(
       StrNCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "strncmp");
@@ -777,7 +777,7 @@ Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
 
   Module *M = B.GetInsertBlock()->getModule();
   Type *I8Ptr = B.getInt8PtrTy();
-  Value *StrCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr, nullptr);
+  Value *StrCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr);
   inferLibFuncAttributes(*M->getFunction(Name), *TLI);
   CallInst *CI =
       B.CreateCall(StrCpy, {castToCStr(Dst, B), castToCStr(Src, B)}, Name);
@@ -794,7 +794,7 @@ Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B,
   Module *M = B.GetInsertBlock()->getModule();
   Type *I8Ptr = B.getInt8PtrTy();
   Value *StrNCpy = M->getOrInsertFunction(Name, I8Ptr, I8Ptr, I8Ptr,
-                                          Len->getType(), nullptr);
+                                          Len->getType());
   inferLibFuncAttributes(*M->getFunction(Name), *TLI);
   CallInst *CI = B.CreateCall(
       StrNCpy, {castToCStr(Dst, B), castToCStr(Src, B), Len}, "strncpy");
@@ -810,14 +810,14 @@ Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
-  AttributeSet AS;
-  AS = AttributeSet::get(M->getContext(), AttributeSet::FunctionIndex,
-                         Attribute::NoUnwind);
+  AttributeList AS;
+  AS = AttributeList::get(M->getContext(), AttributeList::FunctionIndex,
+                          Attribute::NoUnwind);
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   Value *MemCpy = M->getOrInsertFunction(
-      "__memcpy_chk", AttributeSet::get(M->getContext(), AS), B.getInt8PtrTy(),
+      "__memcpy_chk", AttributeList::get(M->getContext(), AS), B.getInt8PtrTy(),
       B.getInt8PtrTy(), B.getInt8PtrTy(), DL.getIntPtrType(Context),
-      DL.getIntPtrType(Context), nullptr);
+      DL.getIntPtrType(Context));
   Dst = castToCStr(Dst, B);
   Src = castToCStr(Src, B);
   CallInst *CI = B.CreateCall(MemCpy, {Dst, Src, Len, ObjSize});
@@ -835,7 +835,7 @@ Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   Value *MemChr = M->getOrInsertFunction("memchr", B.getInt8PtrTy(),
                                          B.getInt8PtrTy(), B.getInt32Ty(),
-                                         DL.getIntPtrType(Context), nullptr);
+                                         DL.getIntPtrType(Context));
   inferLibFuncAttributes(*M->getFunction("memchr"), *TLI);
   CallInst *CI = B.CreateCall(MemChr, {castToCStr(Ptr, B), Val, Len}, "memchr");
 
@@ -854,7 +854,7 @@ Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
   LLVMContext &Context = B.GetInsertBlock()->getContext();
   Value *MemCmp = M->getOrInsertFunction("memcmp", B.getInt32Ty(),
                                          B.getInt8PtrTy(), B.getInt8PtrTy(),
-                                         DL.getIntPtrType(Context), nullptr);
+                                         DL.getIntPtrType(Context));
   inferLibFuncAttributes(*M->getFunction("memcmp"), *TLI);
   CallInst *CI = B.CreateCall(
       MemCmp, {castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, "memcmp");
@@ -881,13 +881,13 @@ static void appendTypeSuffix(Value *Op, StringRef &Name,
 }
 
 Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
-                                  const AttributeSet &Attrs) {
+                                  const AttributeList &Attrs) {
   SmallString<20> NameBuffer;
   appendTypeSuffix(Op, Name, NameBuffer);
 
   Module *M = B.GetInsertBlock()->getModule();
   Value *Callee = M->getOrInsertFunction(Name, Op->getType(),
-                                         Op->getType(), nullptr);
+                                         Op->getType());
   CallInst *CI = B.CreateCall(Callee, Op, Name);
   CI->setAttributes(Attrs);
   if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
@@ -897,13 +897,13 @@ Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
 }
 
 Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
-                                  IRBuilder<> &B, const AttributeSet &Attrs) {
+                                   IRBuilder<> &B, const AttributeList &Attrs) {
   SmallString<20> NameBuffer;
   appendTypeSuffix(Op1, Name, NameBuffer);
 
   Module *M = B.GetInsertBlock()->getModule();
   Value *Callee = M->getOrInsertFunction(Name, Op1->getType(), Op1->getType(),
-                                         Op2->getType(), nullptr);
+                                         Op2->getType());
   CallInst *CI = B.CreateCall(Callee, {Op1, Op2}, Name);
   CI->setAttributes(Attrs);
   if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
@@ -918,8 +918,8 @@ Value *llvm::emitPutChar(Value *Char, IRBuilder<> &B,
     return nullptr;
 
   Module *M = B.GetInsertBlock()->getModule();
-  Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(),
-                                          B.getInt32Ty(), nullptr);
+  Value *PutChar = M->getOrInsertFunction("putchar", B.getInt32Ty(), B.getInt32Ty());
+  inferLibFuncAttributes(*M->getFunction("putchar"), *TLI);
   CallInst *CI = B.CreateCall(PutChar,
                               B.CreateIntCast(Char,
                               B.getInt32Ty(),
@@ -939,7 +939,7 @@ Value *llvm::emitPutS(Value *Str, IRBuilder<> &B,
 
   Module *M = B.GetInsertBlock()->getModule();
   Value *PutS =
-      M->getOrInsertFunction("puts", B.getInt32Ty(), B.getInt8PtrTy(), nullptr);
+      M->getOrInsertFunction("puts", B.getInt32Ty(), B.getInt8PtrTy());
   inferLibFuncAttributes(*M->getFunction("puts"), *TLI);
   CallInst *CI = B.CreateCall(PutS, castToCStr(Str, B), "puts");
   if (const Function *F = dyn_cast<Function>(PutS->stripPointerCasts()))
@@ -954,7 +954,7 @@ Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilder<> &B,
 
   Module *M = B.GetInsertBlock()->getModule();
   Constant *F = M->getOrInsertFunction("fputc", B.getInt32Ty(), B.getInt32Ty(),
-                                       File->getType(), nullptr);
+                                       File->getType());
   if (File->getType()->isPointerTy())
     inferLibFuncAttributes(*M->getFunction("fputc"), *TLI);
   Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/true,
@@ -974,7 +974,7 @@ Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B,
   Module *M = B.GetInsertBlock()->getModule();
   StringRef FPutsName = TLI->getName(LibFunc_fputs);
   Constant *F = M->getOrInsertFunction(
-      FPutsName, B.getInt32Ty(), B.getInt8PtrTy(), File->getType(), nullptr);
+      FPutsName, B.getInt32Ty(), B.getInt8PtrTy(), File->getType());
   if (File->getType()->isPointerTy())
     inferLibFuncAttributes(*M->getFunction(FPutsName), *TLI);
   CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, "fputs");
@@ -994,8 +994,8 @@ Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
   StringRef FWriteName = TLI->getName(LibFunc_fwrite);
   Constant *F = M->getOrInsertFunction(
       FWriteName, DL.getIntPtrType(Context), B.getInt8PtrTy(),
-      DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType(),
-      nullptr);
+      DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
+
   if (File->getType()->isPointerTy())
     inferLibFuncAttributes(*M->getFunction(FWriteName), *TLI);
   CallInst *CI =
diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp
index bc2cef26edcbc8f3578afd6aa52070876d6aa345..1cfe3bd536482fb1567b4dafec32c4ee724f4f4b 100644
--- a/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -17,6 +17,8 @@
 
 #include "llvm/Transforms/Utils/BypassSlowDivision.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
@@ -36,12 +38,21 @@ namespace {
       : SignedOp(InSignedOp), Dividend(InDividend), Divisor(InDivisor) {}
   };
 
-  struct DivPhiNodes {
-    PHINode *Quotient;
-    PHINode *Remainder;
+  struct QuotRemPair {
+    Value *Quotient;
+    Value *Remainder;
 
-    DivPhiNodes(PHINode *InQuotient, PHINode *InRemainder)
-      : Quotient(InQuotient), Remainder(InRemainder) {}
+    QuotRemPair(Value *InQuotient, Value *InRemainder)
+        : Quotient(InQuotient), Remainder(InRemainder) {}
+  };
+
+  /// A quotient and remainder, plus a BB from which they logically "originate".
+  /// If you use Quotient or Remainder in a Phi node, you should use BB as its
+  /// corresponding predecessor.
+  struct QuotRemWithBB {
+    BasicBlock *BB = nullptr;
+    Value *Quotient = nullptr;
+    Value *Remainder = nullptr;
   };
 }
 
@@ -69,159 +80,376 @@ namespace llvm {
     }
   };
 
-  typedef DenseMap<DivOpInfo, DivPhiNodes> DivCacheTy;
+  typedef DenseMap<DivOpInfo, QuotRemPair> DivCacheTy;
+  typedef DenseMap<unsigned, unsigned> BypassWidthsTy;
+  typedef SmallPtrSet<Instruction *, 4> VisitedSetTy;
 }
 
-// insertFastDiv - Substitutes the div/rem instruction with code that checks the
-// value of the operands and uses a shorter-faster div/rem instruction when
-// possible and the longer-slower div/rem instruction otherwise.
-static bool insertFastDiv(Instruction *I, IntegerType *BypassType,
-                          bool UseDivOp, bool UseSignedOp,
-                          DivCacheTy &PerBBDivCache) {
-  Function *F = I->getParent()->getParent();
-  // Get instruction operands
-  Value *Dividend = I->getOperand(0);
-  Value *Divisor = I->getOperand(1);
+namespace {
+enum ValueRange {
+  /// Operand definitely fits into BypassType. No runtime checks are needed.
+  VALRNG_KNOWN_SHORT,
+  /// A runtime check is required, as value range is unknown.
+  VALRNG_UNKNOWN,
+  /// Operand is unlikely to fit into BypassType. The bypassing should be
+  /// disabled.
+  VALRNG_LIKELY_LONG
+};
+
+class FastDivInsertionTask {
+  bool IsValidTask = false;
+  Instruction *SlowDivOrRem = nullptr;
+  IntegerType *BypassType = nullptr;
+  BasicBlock *MainBB = nullptr;
+
+  bool isHashLikeValue(Value *V, VisitedSetTy &Visited);
+  ValueRange getValueRange(Value *Op, VisitedSetTy &Visited);
+  QuotRemWithBB createSlowBB(BasicBlock *Successor);
+  QuotRemWithBB createFastBB(BasicBlock *Successor);
+  QuotRemPair createDivRemPhiNodes(QuotRemWithBB &LHS, QuotRemWithBB &RHS,
+                                   BasicBlock *PhiBB);
+  Value *insertOperandRuntimeCheck(Value *Op1, Value *Op2);
+  Optional<QuotRemPair> insertFastDivAndRem();
+
+  bool isSignedOp() {
+    return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
+           SlowDivOrRem->getOpcode() == Instruction::SRem;
+  }
+  bool isDivisionOp() {
+    return SlowDivOrRem->getOpcode() == Instruction::SDiv ||
+           SlowDivOrRem->getOpcode() == Instruction::UDiv;
+  }
+  Type *getSlowType() { return SlowDivOrRem->getType(); }
+
+public:
+  FastDivInsertionTask(Instruction *I, const BypassWidthsTy &BypassWidths);
+  Value *getReplacement(DivCacheTy &Cache);
+};
+} // anonymous namespace
+
+FastDivInsertionTask::FastDivInsertionTask(Instruction *I,
+                                           const BypassWidthsTy &BypassWidths) {
+  switch (I->getOpcode()) {
+  case Instruction::UDiv:
+  case Instruction::SDiv:
+  case Instruction::URem:
+  case Instruction::SRem:
+    SlowDivOrRem = I;
+    break;
+  default:
+    // I is not a div/rem operation.
+    return;
+  }
 
-  if (isa<ConstantInt>(Divisor)) {
-    // Division by a constant should have been been solved and replaced earlier
-    // in the pipeline.
-    return false;
+  // Skip division on vector types. Only optimize integer instructions.
+  IntegerType *SlowType = dyn_cast<IntegerType>(SlowDivOrRem->getType());
+  if (!SlowType)
+    return;
+
+  // Skip if this bitwidth is not bypassed.
+  auto BI = BypassWidths.find(SlowType->getBitWidth());
+  if (BI == BypassWidths.end())
+    return;
+
+  // Get type for div/rem instruction with bypass bitwidth.
+  IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
+  BypassType = BT;
+
+  // The original basic block.
+  MainBB = I->getParent();
+
+  // The instruction is indeed a slow div or rem operation.
+  IsValidTask = true;
+}
+
+/// Reuses a previously-computed quotient or remainder from the current BB if
+/// operands and operation are identical. Otherwise calls insertFastDivAndRem to
+/// perform the optimization and caches the resulting quotient and remainder.
+/// If no replacement can be generated, nullptr is returned.
+Value *FastDivInsertionTask::getReplacement(DivCacheTy &Cache) {
+  // First, make sure that the task is valid.
+  if (!IsValidTask)
+    return nullptr;
+
+  // Then, look for a value in Cache.
+  Value *Dividend = SlowDivOrRem->getOperand(0);
+  Value *Divisor = SlowDivOrRem->getOperand(1);
+  DivOpInfo Key(isSignedOp(), Dividend, Divisor);
+  auto CacheI = Cache.find(Key);
+
+  if (CacheI == Cache.end()) {
+    // If previous instance does not exist, try to insert fast div.
+    Optional<QuotRemPair> OptResult = insertFastDivAndRem();
+    // Bail out if insertFastDivAndRem has failed.
+    if (!OptResult)
+      return nullptr;
+    CacheI = Cache.insert({Key, *OptResult}).first;
   }
 
-  // If the numerator is a constant, bail if it doesn't fit into BypassType.
-  if (ConstantInt *ConstDividend = dyn_cast<ConstantInt>(Dividend))
-    if (ConstDividend->getValue().getActiveBits() > BypassType->getBitWidth())
+  QuotRemPair &Value = CacheI->second;
+  return isDivisionOp() ? Value.Quotient : Value.Remainder;
+}
+
+/// \brief Check if a value looks like a hash.
+///
+/// The routine is expected to detect values computed using the most common hash
+/// algorithms. Typically, hash computations end with one of the following
+/// instructions:
+///
+/// 1) MUL with a constant wider than BypassType
+/// 2) XOR instruction
+///
+/// And even if we are wrong and the value is not a hash, it is still quite
+/// unlikely that such values will fit into BypassType.
+///
+/// To detect string hash algorithms like FNV we have to look through PHI-nodes.
+/// It is implemented as a depth-first search for values that look neither long
+/// nor hash-like.
+bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I)
+    return false;
+
+  switch (I->getOpcode()) {
+  case Instruction::Xor:
+    return true;
+  case Instruction::Mul: {
+    // After Constant Hoisting pass, long constants may be represented as
+    // bitcast instructions. As a result, some constants may look like an
+    // instruction at first, and an additional check is necessary to find out if
+    // an operand is actually a constant.
+    Value *Op1 = I->getOperand(1);
+    ConstantInt *C = dyn_cast<ConstantInt>(Op1);
+    if (!C && isa<BitCastInst>(Op1))
+      C = dyn_cast<ConstantInt>(cast<BitCastInst>(Op1)->getOperand(0));
+    return C && C->getValue().getMinSignedBits() > BypassType->getBitWidth();
+  }
+  case Instruction::PHI: {
+    // Stop IR traversal in case of a crazy input code. This limits recursion
+    // depth.
+    if (Visited.size() >= 16)
       return false;
+    // Do not visit nodes that have been visited already. We return true because
+    // it means that we couldn't find any value that doesn't look hash-like.
+    if (Visited.find(I) != Visited.end())
+      return true;
+    Visited.insert(I);
+    return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
+      // Ignore undef values as they probably don't affect the division
+      // operands.
+      return getValueRange(V, Visited) == VALRNG_LIKELY_LONG ||
+             isa<UndefValue>(V);
+    });
+  }
+  default:
+    return false;
+  }
+}
+
+/// Check if an integer value fits into our bypass type.
+ValueRange FastDivInsertionTask::getValueRange(Value *V,
+                                               VisitedSetTy &Visited) {
+  unsigned ShortLen = BypassType->getBitWidth();
+  unsigned LongLen = V->getType()->getIntegerBitWidth();
+
+  assert(LongLen > ShortLen && "Value type must be wider than BypassType");
+  unsigned HiBits = LongLen - ShortLen;
+
+  const DataLayout &DL = SlowDivOrRem->getModule()->getDataLayout();
+  APInt Zeros(LongLen, 0), Ones(LongLen, 0);
 
-  // Basic Block is split before divide
-  BasicBlock *MainBB = &*I->getParent();
-  BasicBlock *SuccessorBB = MainBB->splitBasicBlock(I);
-
-  // Add new basic block for slow divide operation
-  BasicBlock *SlowBB =
-      BasicBlock::Create(F->getContext(), "", MainBB->getParent(), SuccessorBB);
-  SlowBB->moveBefore(SuccessorBB);
-  IRBuilder<> SlowBuilder(SlowBB, SlowBB->begin());
-  Value *SlowQuotientV;
-  Value *SlowRemainderV;
-  if (UseSignedOp) {
-    SlowQuotientV = SlowBuilder.CreateSDiv(Dividend, Divisor);
-    SlowRemainderV = SlowBuilder.CreateSRem(Dividend, Divisor);
+  computeKnownBits(V, Zeros, Ones, DL);
+
+  if (Zeros.countLeadingOnes() >= HiBits)
+    return VALRNG_KNOWN_SHORT;
+
+  if (Ones.countLeadingZeros() < HiBits)
+    return VALRNG_LIKELY_LONG;
+
+  // Long integer divisions are often used in hashtable implementations. It's
+  // not worth bypassing such divisions because hash values are extremely
+  // unlikely to have enough leading zeros. The call below tries to detect
+  // values that are unlikely to fit BypassType (including hashes).
+  if (isHashLikeValue(V, Visited))
+    return VALRNG_LIKELY_LONG;
+
+  return VALRNG_UNKNOWN;
+}
+
+/// Add new basic block for slow div and rem operations and put it before
+/// SuccessorBB.
+QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
+  QuotRemWithBB DivRemPair;
+  DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
+                                     MainBB->getParent(), SuccessorBB);
+  IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
+
+  Value *Dividend = SlowDivOrRem->getOperand(0);
+  Value *Divisor = SlowDivOrRem->getOperand(1);
+
+  if (isSignedOp()) {
+    DivRemPair.Quotient = Builder.CreateSDiv(Dividend, Divisor);
+    DivRemPair.Remainder = Builder.CreateSRem(Dividend, Divisor);
   } else {
-    SlowQuotientV = SlowBuilder.CreateUDiv(Dividend, Divisor);
-    SlowRemainderV = SlowBuilder.CreateURem(Dividend, Divisor);
+    DivRemPair.Quotient = Builder.CreateUDiv(Dividend, Divisor);
+    DivRemPair.Remainder = Builder.CreateURem(Dividend, Divisor);
   }
-  SlowBuilder.CreateBr(SuccessorBB);
-
-  // Add new basic block for fast divide operation
-  BasicBlock *FastBB =
-      BasicBlock::Create(F->getContext(), "", MainBB->getParent(), SuccessorBB);
-  FastBB->moveBefore(SlowBB);
-  IRBuilder<> FastBuilder(FastBB, FastBB->begin());
-  Value *ShortDivisorV = FastBuilder.CreateCast(Instruction::Trunc, Divisor,
-                                                BypassType);
-  Value *ShortDividendV = FastBuilder.CreateCast(Instruction::Trunc, Dividend,
-                                                 BypassType);
-
-  // udiv/urem because optimization only handles positive numbers
-  Value *ShortQuotientV = FastBuilder.CreateUDiv(ShortDividendV, ShortDivisorV);
-  Value *ShortRemainderV = FastBuilder.CreateURem(ShortDividendV,
-                                                  ShortDivisorV);
-  Value *FastQuotientV = FastBuilder.CreateCast(Instruction::ZExt,
-                                                ShortQuotientV,
-                                                Dividend->getType());
-  Value *FastRemainderV = FastBuilder.CreateCast(Instruction::ZExt,
-                                                 ShortRemainderV,
-                                                 Dividend->getType());
-  FastBuilder.CreateBr(SuccessorBB);
-
-  // Phi nodes for result of div and rem
-  IRBuilder<> SuccessorBuilder(SuccessorBB, SuccessorBB->begin());
-  PHINode *QuoPhi = SuccessorBuilder.CreatePHI(I->getType(), 2);
-  QuoPhi->addIncoming(SlowQuotientV, SlowBB);
-  QuoPhi->addIncoming(FastQuotientV, FastBB);
-  PHINode *RemPhi = SuccessorBuilder.CreatePHI(I->getType(), 2);
-  RemPhi->addIncoming(SlowRemainderV, SlowBB);
-  RemPhi->addIncoming(FastRemainderV, FastBB);
-
-  // Replace I with appropriate phi node
-  if (UseDivOp)
-    I->replaceAllUsesWith(QuoPhi);
-  else
-    I->replaceAllUsesWith(RemPhi);
-  I->eraseFromParent();
 
-  // Combine operands into a single value with OR for value testing below
-  MainBB->getInstList().back().eraseFromParent();
-  IRBuilder<> MainBuilder(MainBB, MainBB->end());
+  Builder.CreateBr(SuccessorBB);
+  return DivRemPair;
+}
+
+/// Add new basic block for fast div and rem operations and put it before
+/// SuccessorBB.
+QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
+  QuotRemWithBB DivRemPair;
+  DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
+                                     MainBB->getParent(), SuccessorBB);
+  IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
+
+  Value *Dividend = SlowDivOrRem->getOperand(0);
+  Value *Divisor = SlowDivOrRem->getOperand(1);
+  Value *ShortDivisorV =
+      Builder.CreateCast(Instruction::Trunc, Divisor, BypassType);
+  Value *ShortDividendV =
+      Builder.CreateCast(Instruction::Trunc, Dividend, BypassType);
+
+  // udiv/urem because this optimization only handles positive numbers.
+  Value *ShortQV = Builder.CreateUDiv(ShortDividendV, ShortDivisorV);
+  Value *ShortRV = Builder.CreateURem(ShortDividendV, ShortDivisorV);
+  DivRemPair.Quotient =
+      Builder.CreateCast(Instruction::ZExt, ShortQV, getSlowType());
+  DivRemPair.Remainder =
+      Builder.CreateCast(Instruction::ZExt, ShortRV, getSlowType());
+  Builder.CreateBr(SuccessorBB);
+
+  return DivRemPair;
+}
 
-  // We should have bailed out above if the divisor is a constant, but the
-  // dividend may still be a constant.  Set OrV to our non-constant operands
-  // OR'ed together.
-  assert(!isa<ConstantInt>(Divisor));
+/// Creates Phi nodes for result of Div and Rem.
+QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
+                                                       QuotRemWithBB &RHS,
+                                                       BasicBlock *PhiBB) {
+  IRBuilder<> Builder(PhiBB, PhiBB->begin());
+  PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
+  QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
+  QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
+  PHINode *RemPhi = Builder.CreatePHI(getSlowType(), 2);
+  RemPhi->addIncoming(LHS.Remainder, LHS.BB);
+  RemPhi->addIncoming(RHS.Remainder, RHS.BB);
+  return QuotRemPair(QuoPhi, RemPhi);
+}
+
+/// Creates a runtime check to test whether both the divisor and dividend fit
+/// into BypassType. The check is inserted at the end of MainBB. True return
+/// value means that the operands fit. Either of the operands may be NULL if it
+/// doesn't need a runtime check.
+Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
+  assert((Op1 || Op2) && "Nothing to check");
+  IRBuilder<> Builder(MainBB, MainBB->end());
 
   Value *OrV;
-  if (!isa<ConstantInt>(Dividend))
-    OrV = MainBuilder.CreateOr(Dividend, Divisor);
+  if (Op1 && Op2)
+    OrV = Builder.CreateOr(Op1, Op2);
   else
-    OrV = Divisor;
+    OrV = Op1 ? Op1 : Op2;
 
   // BitMask is inverted to check if the operands are
   // larger than the bypass type
   uint64_t BitMask = ~BypassType->getBitMask();
-  Value *AndV = MainBuilder.CreateAnd(OrV, BitMask);
-
-  // Compare operand values and branch
-  Value *ZeroV = ConstantInt::getSigned(Dividend->getType(), 0);
-  Value *CmpV = MainBuilder.CreateICmpEQ(AndV, ZeroV);
-  MainBuilder.CreateCondBr(CmpV, FastBB, SlowBB);
-
-  // Cache phi nodes to be used later in place of other instances
-  // of div or rem with the same sign, dividend, and divisor
-  DivOpInfo Key(UseSignedOp, Dividend, Divisor);
-  DivPhiNodes Value(QuoPhi, RemPhi);
-  PerBBDivCache.insert(std::pair<DivOpInfo, DivPhiNodes>(Key, Value));
-  return true;
+  Value *AndV = Builder.CreateAnd(OrV, BitMask);
+
+  // Compare operand values
+  Value *ZeroV = ConstantInt::getSigned(getSlowType(), 0);
+  return Builder.CreateICmpEQ(AndV, ZeroV);
 }
 
-// reuseOrInsertFastDiv - Reuses previously computed dividend or remainder from
-// the current BB if operands and operation are identical. Otherwise calls
-// insertFastDiv to perform the optimization and caches the resulting dividend
-// and remainder.
-static bool reuseOrInsertFastDiv(Instruction *I, IntegerType *BypassType,
-                                 bool UseDivOp, bool UseSignedOp,
-                                 DivCacheTy &PerBBDivCache) {
-  // Get instruction operands
-  DivOpInfo Key(UseSignedOp, I->getOperand(0), I->getOperand(1));
-  DivCacheTy::iterator CacheI = PerBBDivCache.find(Key);
-
-  if (CacheI == PerBBDivCache.end()) {
-    // If previous instance does not exist, insert fast div
-    return insertFastDiv(I, BypassType, UseDivOp, UseSignedOp, PerBBDivCache);
+/// Emits code that checks the value of the operands and uses a shorter-faster
+/// div/rem instruction when possible. Returns None if no bypass is generated.
+Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
+  Value *Dividend = SlowDivOrRem->getOperand(0);
+  Value *Divisor = SlowDivOrRem->getOperand(1);
+
+  if (isa<ConstantInt>(Divisor)) {
+    // Keep division by a constant for DAGCombiner.
+    return None;
   }
 
-  // Replace operation value with previously generated phi node
-  DivPhiNodes &Value = CacheI->second;
-  if (UseDivOp) {
-    // Replace all uses of div instruction with quotient phi node
-    I->replaceAllUsesWith(Value.Quotient);
+  VisitedSetTy SetL;
+  ValueRange DividendRange = getValueRange(Dividend, SetL);
+  if (DividendRange == VALRNG_LIKELY_LONG)
+    return None;
+
+  VisitedSetTy SetR;
+  ValueRange DivisorRange = getValueRange(Divisor, SetR);
+  if (DivisorRange == VALRNG_LIKELY_LONG)
+    return None;
+
+  bool DividendShort = (DividendRange == VALRNG_KNOWN_SHORT);
+  bool DivisorShort = (DivisorRange == VALRNG_KNOWN_SHORT);
+
+  if (DividendShort && DivisorShort) {
+    // If both operands are known to be short then just replace the long
+    // division with a short one in-place.
+
+    IRBuilder<> Builder(SlowDivOrRem);
+    Value *TruncDividend = Builder.CreateTrunc(Dividend, BypassType);
+    Value *TruncDivisor = Builder.CreateTrunc(Divisor, BypassType);
+    Value *TruncDiv = Builder.CreateUDiv(TruncDividend, TruncDivisor);
+    Value *TruncRem = Builder.CreateURem(TruncDividend, TruncDivisor);
+    Value *ExtDiv = Builder.CreateZExt(TruncDiv, getSlowType());
+    Value *ExtRem = Builder.CreateZExt(TruncRem, getSlowType());
+    return QuotRemPair(ExtDiv, ExtRem);
+  } else if (DividendShort && !isSignedOp()) {
+    // If the division is unsigned and Dividend is known to be short, then
+    // either
+    // 1) Divisor is less than or equal to Dividend, and the result can be
+    //    computed with a short division.
+    // 2) Divisor is greater than Dividend. In this case, no division is needed
+    //    at all: The quotient is 0 and the remainder is equal to Dividend.
+    //
+    // So instead of checking at runtime whether Divisor fits into BypassType,
+    // we emit a runtime check to differentiate between these two cases. This
+    // lets us entirely avoid a long div.
+
+    // Split the basic block before the div/rem.
+    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+    // Remove the unconditional branch from MainBB to SuccessorBB.
+    MainBB->getInstList().back().eraseFromParent();
+    QuotRemWithBB Long;
+    Long.BB = MainBB;
+    Long.Quotient = ConstantInt::get(getSlowType(), 0);
+    Long.Remainder = Dividend;
+    QuotRemWithBB Fast = createFastBB(SuccessorBB);
+    QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
+    IRBuilder<> Builder(MainBB, MainBB->end());
+    Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
+    Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
+    return Result;
   } else {
-    // Replace all uses of rem instruction with remainder phi node
-    I->replaceAllUsesWith(Value.Remainder);
+    // General case. Create both slow and fast div/rem pairs and choose one of
+    // them at runtime.
+
+    // Split the basic block before the div/rem.
+    BasicBlock *SuccessorBB = MainBB->splitBasicBlock(SlowDivOrRem);
+    // Remove the unconditional branch from MainBB to SuccessorBB.
+    MainBB->getInstList().back().eraseFromParent();
+    QuotRemWithBB Fast = createFastBB(SuccessorBB);
+    QuotRemWithBB Slow = createSlowBB(SuccessorBB);
+    QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
+    Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
+                                            DivisorShort ? nullptr : Divisor);
+    IRBuilder<> Builder(MainBB, MainBB->end());
+    Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
+    return Result;
   }
-
-  // Remove redundant operation
-  I->eraseFromParent();
-  return true;
 }
 
-// bypassSlowDivision - This optimization identifies DIV instructions in a BB
-// that can be profitably bypassed and carried out with a shorter, faster
-// divide.
-bool llvm::bypassSlowDivision(
-    BasicBlock *BB, const DenseMap<unsigned int, unsigned int> &BypassWidths) {
-  DivCacheTy DivCache;
+/// This optimization identifies DIV/REM instructions in a BB that can be
+/// profitably bypassed and carried out with a shorter, faster divide.
+bool llvm::bypassSlowDivision(BasicBlock *BB,
+                              const BypassWidthsTy &BypassWidths) {
+  DivCacheTy PerBBDivCache;
 
   bool MadeChange = false;
   Instruction* Next = &*BB->begin();
@@ -231,42 +459,20 @@ bool llvm::bypassSlowDivision(
     Instruction* I = Next;
     Next = Next->getNextNode();
 
-    // Get instruction details
-    unsigned Opcode = I->getOpcode();
-    bool UseDivOp = Opcode == Instruction::SDiv || Opcode == Instruction::UDiv;
-    bool UseRemOp = Opcode == Instruction::SRem || Opcode == Instruction::URem;
-    bool UseSignedOp = Opcode == Instruction::SDiv ||
-                       Opcode == Instruction::SRem;
-
-    // Only optimize div or rem ops
-    if (!UseDivOp && !UseRemOp)
-      continue;
-
-    // Skip division on vector types, only optimize integer instructions
-    if (!I->getType()->isIntegerTy())
-      continue;
-
-    // Get bitwidth of div/rem instruction
-    IntegerType *T = cast<IntegerType>(I->getType());
-    unsigned int bitwidth = T->getBitWidth();
-
-    // Continue if bitwidth is not bypassed
-    DenseMap<unsigned int, unsigned int>::const_iterator BI = BypassWidths.find(bitwidth);
-    if (BI == BypassWidths.end())
-      continue;
-
-    // Get type for div/rem instruction with bypass bitwidth
-    IntegerType *BT = IntegerType::get(I->getContext(), BI->second);
-
-    MadeChange |= reuseOrInsertFastDiv(I, BT, UseDivOp, UseSignedOp, DivCache);
+    FastDivInsertionTask Task(I, BypassWidths);
+    if (Value *Replacement = Task.getReplacement(PerBBDivCache)) {
+      I->replaceAllUsesWith(Replacement);
+      I->eraseFromParent();
+      MadeChange = true;
+    }
   }
 
   // Above we eagerly create divs and rems, as pairs, so that we can efficiently
   // create divrem machine instructions.  Now erase any unused divs / rems so we
   // don't leave extra instructions sitting around.
-  for (auto &KV : DivCache)
-    for (Instruction *Phi : {KV.second.Quotient, KV.second.Remainder})
-      RecursivelyDeleteTriviallyDeadInstructions(Phi);
+  for (auto &KV : PerBBDivCache)
+    for (Value *V : {KV.second.Quotient, KV.second.Remainder})
+      RecursivelyDeleteTriviallyDeadInstructions(V);
 
   return MadeChange;
 }
diff --git a/lib/Transforms/Utils/CMakeLists.txt b/lib/Transforms/Utils/CMakeLists.txt
index f8a1b4d2f49edbe9fd500d9e7a86570ff88c239c..7a21c03da221ab57d6c6234354379ea3f6e5bdb3 100644
--- a/lib/Transforms/Utils/CMakeLists.txt
+++ b/lib/Transforms/Utils/CMakeLists.txt
@@ -31,13 +31,13 @@ add_llvm_library(LLVMTransformUtils
   LoopUtils.cpp
   LoopVersioning.cpp
   LowerInvoke.cpp
+  LowerMemIntrinsics.cpp
   LowerSwitch.cpp
   Mem2Reg.cpp
-  MemorySSA.cpp
-  MemorySSAUpdater.cpp
   MetaRenamer.cpp
   ModuleUtils.cpp
   NameAnonGlobals.cpp
+  PredicateInfo.cpp
   PromoteMemoryToRegister.cpp
   StripGCRelocates.cpp
   SSAUpdater.cpp
@@ -52,6 +52,7 @@ add_llvm_library(LLVMTransformUtils
   UnifyFunctionExitNodes.cpp
   Utils.cpp
   ValueMapper.cpp
+  VNCoercion.cpp
 
   ADDITIONAL_HEADER_DIRS
   ${LLVM_MAIN_INCLUDE_DIR}/llvm/Transforms
diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp
index 4d33e22fecfbd33a34f7981ef858315c28432824..ae58d6133d91f3e340a0f9b978d09b38a9a433c0 100644
--- a/lib/Transforms/Utils/CloneFunction.cpp
+++ b/lib/Transforms/Utils/CloneFunction.cpp
@@ -90,9 +90,9 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
     assert(VMap.count(&I) && "No mapping from source argument specified!");
 #endif
 
-  // Copy all attributes other than those stored in the AttributeSet.  We need
-  // to remap the parameter indices of the AttributeSet.
-  AttributeSet NewAttrs = NewFunc->getAttributes();
+  // Copy all attributes other than those stored in the AttributeList.  We need
+  // to remap the parameter indices of the AttributeList.
+  AttributeList NewAttrs = NewFunc->getAttributes();
   NewFunc->copyAttributesFrom(OldFunc);
   NewFunc->setAttributes(NewAttrs);
 
@@ -103,22 +103,23 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc,
                  ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges,
                  TypeMapper, Materializer));
 
-  AttributeSet OldAttrs = OldFunc->getAttributes();
+  SmallVector<AttributeSet, 4> AttrVec(NewFunc->arg_size() + 2);
+  AttributeList OldAttrs = OldFunc->getAttributes();
+
+  // Copy the return attributes.
+  AttrVec[0] = OldAttrs.getRetAttributes();
+
   // Clone any argument attributes that are present in the VMap.
   for (const Argument &OldArg : OldFunc->args())
     if (Argument *NewArg = dyn_cast<Argument>(VMap[&OldArg])) {
-      AttributeSet attrs =
+      AttrVec[NewArg->getArgNo() + 1] =
           OldAttrs.getParamAttributes(OldArg.getArgNo() + 1);
-      if (attrs.getNumSlots() > 0)
-        NewArg->addAttr(attrs);
     }
 
-  NewFunc->setAttributes(
-      NewFunc->getAttributes()
-          .addAttributes(NewFunc->getContext(), AttributeSet::ReturnIndex,
-                         OldAttrs.getRetAttributes())
-          .addAttributes(NewFunc->getContext(), AttributeSet::FunctionIndex,
-                         OldAttrs.getFnAttributes()));
+  // Copy any function attributes.
+  AttrVec.back() = OldAttrs.getFnAttributes();
+
+  NewFunc->setAttributes(AttributeList::get(NewFunc->getContext(), AttrVec));
 
   SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
   OldFunc->getAllMetadata(MDs);
@@ -353,7 +354,7 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
       Cond = dyn_cast_or_null<ConstantInt>(V);
     }
     if (Cond) {     // Constant fold to uncond branch!
-      SwitchInst::ConstCaseIt Case = SI->findCaseValue(Cond);
+      SwitchInst::ConstCaseHandle Case = *SI->findCaseValue(Cond);
       BasicBlock *Dest = const_cast<BasicBlock*>(Case.getCaseSuccessor());
       VMap[OldTI] = BranchInst::Create(Dest, NewBB);
       ToClone.push_back(Dest);
@@ -747,3 +748,40 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
 
   return NewLoop;
 }
+
+/// \brief Duplicate non-Phi instructions from the beginning of block up to
+/// StopAt instruction into a split block between BB and its predecessor.
+BasicBlock *
+llvm::DuplicateInstructionsInSplitBetween(BasicBlock *BB, BasicBlock *PredBB,
+                                          Instruction *StopAt,
+                                          ValueToValueMapTy &ValueMapping) {
+  // We are going to have to map operands from the original BB block to the new
+  // copy of the block 'NewBB'.  If there are PHI nodes in BB, evaluate them to
+  // account for entry from PredBB.
+  BasicBlock::iterator BI = BB->begin();
+  for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+    ValueMapping[PN] = PN->getIncomingValueForBlock(PredBB);
+
+  BasicBlock *NewBB = SplitEdge(PredBB, BB);
+  NewBB->setName(PredBB->getName() + ".split");
+  Instruction *NewTerm = NewBB->getTerminator();
+
+  // Clone the non-phi instructions of BB into NewBB, keeping track of the
+  // mapping and using it to remap operands in the cloned instructions.
+  for (; StopAt != &*BI; ++BI) {
+    Instruction *New = BI->clone();
+    New->setName(BI->getName());
+    New->insertBefore(NewTerm);
+    ValueMapping[&*BI] = New;
+
+    // Remap operands to patch up intra-block references.
+    for (unsigned i = 0, e = New->getNumOperands(); i != e; ++i)
+      if (Instruction *Inst = dyn_cast<Instruction>(New->getOperand(i))) {
+        auto I = ValueMapping.find(Inst);
+        if (I != ValueMapping.end())
+          New->setOperand(i, I->second);
+      }
+  }
+
+  return NewBB;
+}
diff --git a/lib/Transforms/Utils/CodeExtractor.cpp b/lib/Transforms/Utils/CodeExtractor.cpp
index c514c9c9cd4a617e06886750cc0137fd39f1615b..644d93b727b3de9fc4d8ff923c675307460724f5 100644
--- a/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/lib/Transforms/Utils/CodeExtractor.cpp
@@ -362,9 +362,8 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
   //  "target-features" attribute allowing it to be lowered.
   // FIXME: This should be changed to check to see if a specific
   //           attribute can not be inherited.
-  AttributeSet OldFnAttrs = oldFunction->getAttributes().getFnAttributes();
-  AttrBuilder AB(OldFnAttrs, AttributeSet::FunctionIndex);
-  for (auto Attr : AB.td_attrs())
+  AttrBuilder AB(oldFunction->getAttributes().getFnAttributes());
+  for (const auto &Attr : AB.td_attrs())
     newFunction->addFnAttr(Attr.first, Attr.second);
 
   newFunction->getBasicBlockList().push_back(newRootNode);
@@ -440,8 +439,10 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
   // Emit a call to the new function, passing in: *pointer to struct (if
   // aggregating parameters), or plan inputs and allocated memory for outputs
   std::vector<Value*> params, StructValues, ReloadOutputs, Reloads;
-  
-  LLVMContext &Context = newFunction->getContext();
+
+  Module *M = newFunction->getParent();
+  LLVMContext &Context = M->getContext();
+  const DataLayout &DL = M->getDataLayout();
 
   // Add inputs as params, or to be filled into the struct
   for (Value *input : inputs)
@@ -456,8 +457,9 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
       StructValues.push_back(output);
     } else {
       AllocaInst *alloca =
-          new AllocaInst(output->getType(), nullptr, output->getName() + ".loc",
-                         &codeReplacer->getParent()->front().front());
+        new AllocaInst(output->getType(), DL.getAllocaAddrSpace(),
+                       nullptr, output->getName() + ".loc",
+                       &codeReplacer->getParent()->front().front());
       ReloadOutputs.push_back(alloca);
       params.push_back(alloca);
     }
@@ -473,7 +475,8 @@ emitCallAndSwitchStatement(Function *newFunction, BasicBlock *codeReplacer,
 
     // Allocate a struct at the beginning of this function
     StructArgTy = StructType::get(newFunction->getContext(), ArgTypes);
-    Struct = new AllocaInst(StructArgTy, nullptr, "structArg",
+    Struct = new AllocaInst(StructArgTy, DL.getAllocaAddrSpace(), nullptr,
+                            "structArg",
                             &codeReplacer->getParent()->front().front());
     params.push_back(Struct);
 
diff --git a/lib/Transforms/Utils/DemoteRegToStack.cpp b/lib/Transforms/Utils/DemoteRegToStack.cpp
index 75a1dde57c4cc992f42f22a946b9690854431b3f..0eee6e19efac604fe032195545d2cb6cb3043002 100644
--- a/lib/Transforms/Utils/DemoteRegToStack.cpp
+++ b/lib/Transforms/Utils/DemoteRegToStack.cpp
@@ -28,15 +28,17 @@ AllocaInst *llvm::DemoteRegToStack(Instruction &I, bool VolatileLoads,
     return nullptr;
   }
 
+  Function *F = I.getParent()->getParent();
+  const DataLayout &DL = F->getParent()->getDataLayout();
+
   // Create a stack slot to hold the value.
   AllocaInst *Slot;
   if (AllocaPoint) {
-    Slot = new AllocaInst(I.getType(), nullptr,
+    Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
                           I.getName()+".reg2mem", AllocaPoint);
   } else {
-    Function *F = I.getParent()->getParent();
-    Slot = new AllocaInst(I.getType(), nullptr, I.getName() + ".reg2mem",
-                          &F->getEntryBlock().front());
+    Slot = new AllocaInst(I.getType(), DL.getAllocaAddrSpace(), nullptr,
+                          I.getName() + ".reg2mem", &F->getEntryBlock().front());
   }
 
   // We cannot demote invoke instructions to the stack if their normal edge
@@ -110,14 +112,17 @@ AllocaInst *llvm::DemotePHIToStack(PHINode *P, Instruction *AllocaPoint) {
     return nullptr;
   }
 
+  const DataLayout &DL = P->getModule()->getDataLayout();
+
   // Create a stack slot to hold the value.
   AllocaInst *Slot;
   if (AllocaPoint) {
-    Slot = new AllocaInst(P->getType(), nullptr,
+    Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
                           P->getName()+".reg2mem", AllocaPoint);
   } else {
     Function *F = P->getParent()->getParent();
-    Slot = new AllocaInst(P->getType(), nullptr, P->getName() + ".reg2mem",
+    Slot = new AllocaInst(P->getType(), DL.getAllocaAddrSpace(), nullptr,
+                          P->getName() + ".reg2mem",
                           &F->getEntryBlock().front());
   }
 
diff --git a/lib/Transforms/Utils/Evaluator.cpp b/lib/Transforms/Utils/Evaluator.cpp
index 4adf1754253d0b951e1e46b48a71f33dacb63098..59f176e2f231d59f78f35363a36a3cdfef455e28 100644
--- a/lib/Transforms/Utils/Evaluator.cpp
+++ b/lib/Transforms/Utils/Evaluator.cpp
@@ -16,6 +16,7 @@
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/CallSite.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/IR/GlobalVariable.h"
@@ -486,7 +487,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
         ConstantInt *Val =
           dyn_cast<ConstantInt>(getVal(SI->getCondition()));
         if (!Val) return false;  // Cannot determine.
-        NextBB = SI->findCaseValue(Val).getCaseSuccessor();
+        NextBB = SI->findCaseValue(Val)->getCaseSuccessor();
       } else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(CurInst)) {
         Value *Val = getVal(IBI->getAddress())->stripPointerCasts();
         if (BlockAddress *BA = dyn_cast<BlockAddress>(Val))
diff --git a/lib/Transforms/Utils/FunctionComparator.cpp b/lib/Transforms/Utils/FunctionComparator.cpp
index 81a7c4ceffab85bed8d3c2fe7e2de04288e29f16..73a0b2737e9572d523ba653a8a8463005fde6a0a 100644
--- a/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/lib/Transforms/Utils/FunctionComparator.cpp
@@ -74,14 +74,14 @@ int FunctionComparator::cmpMem(StringRef L, StringRef R) const {
   return L.compare(R);
 }
 
-int FunctionComparator::cmpAttrs(const AttributeSet L,
-                                 const AttributeSet R) const {
+int FunctionComparator::cmpAttrs(const AttributeList L,
+                                 const AttributeList R) const {
   if (int Res = cmpNumbers(L.getNumSlots(), R.getNumSlots()))
     return Res;
 
   for (unsigned i = 0, e = L.getNumSlots(); i != e; ++i) {
-    AttributeSet::iterator LI = L.begin(i), LE = L.end(i), RI = R.begin(i),
-                           RE = R.end(i);
+    AttributeList::iterator LI = L.begin(i), LE = L.end(i), RI = R.begin(i),
+                            RE = R.end(i);
     for (; LI != LE && RI != RE; ++LI, ++RI) {
       Attribute LA = *LI;
       Attribute RA = *RI;
diff --git a/lib/Transforms/Utils/FunctionImportUtils.cpp b/lib/Transforms/Utils/FunctionImportUtils.cpp
index 9844190ef84a28a8ce8b70024d112ff94a30da04..b00f4b14068a2f152f2e7b718439528d95678d57 100644
--- a/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -21,11 +21,11 @@ using namespace llvm;
 /// Checks if we should import SGV as a definition, otherwise import as a
 /// declaration.
 bool FunctionImportGlobalProcessing::doImportAsDefinition(
-    const GlobalValue *SGV, DenseSet<const GlobalValue *> *GlobalsToImport) {
+    const GlobalValue *SGV, SetVector<GlobalValue *> *GlobalsToImport) {
 
   // For alias, we tie the definition to the base object. Extract it and recurse
   if (auto *GA = dyn_cast<GlobalAlias>(SGV)) {
-    if (GA->hasWeakAnyLinkage())
+    if (GA->isInterposable())
       return false;
     const GlobalObject *GO = GA->getBaseObject();
     if (!GO->hasLinkOnceODRLinkage())
@@ -34,7 +34,7 @@ bool FunctionImportGlobalProcessing::doImportAsDefinition(
         GO, GlobalsToImport);
   }
   // Only import the globals requested for importing.
-  if (GlobalsToImport->count(SGV))
+  if (GlobalsToImport->count(const_cast<GlobalValue *>(SGV)))
     return true;
   // Otherwise no.
   return false;
@@ -57,7 +57,8 @@ bool FunctionImportGlobalProcessing::shouldPromoteLocalToGlobal(
     return false;
 
   if (isPerformingImport()) {
-    assert((!GlobalsToImport->count(SGV) || !isNonRenamableLocal(*SGV)) &&
+    assert((!GlobalsToImport->count(const_cast<GlobalValue *>(SGV)) ||
+            !isNonRenamableLocal(*SGV)) &&
            "Attempting to promote non-renamable local");
     // We don't know for sure yet if we are importing this value (as either
     // a reference or a def), since we are simply walking all values in the
@@ -254,9 +255,8 @@ bool FunctionImportGlobalProcessing::run() {
   return false;
 }
 
-bool llvm::renameModuleForThinLTO(
-    Module &M, const ModuleSummaryIndex &Index,
-    DenseSet<const GlobalValue *> *GlobalsToImport) {
+bool llvm::renameModuleForThinLTO(Module &M, const ModuleSummaryIndex &Index,
+                                  SetVector<GlobalValue *> *GlobalsToImport) {
   FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport);
   return ThinLTOProcessing.run();
 }
diff --git a/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp b/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
index ed018bb73107486280c1852a7003a4f237bb399a..b8c12ad5ea84636e1d6202375540054e6ddc4641 100644
--- a/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
+++ b/lib/Transforms/Utils/ImportedFunctionsInliningStatistics.cpp
@@ -62,6 +62,8 @@ void ImportedFunctionsInliningStatistics::recordInline(const Function &Caller,
 void ImportedFunctionsInliningStatistics::setModuleInfo(const Module &M) {
   ModuleName = M.getName();
   for (const auto &F : M.functions()) {
+    if (F.isDeclaration())
+      continue;
     AllFunctions++;
     ImportedFunctions += int(F.getMetadata("thinlto_src_module") != nullptr);
   }
diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp
index 86f40c32dc035d7309a21dc483f4f3ef0800baff..5d6fbc3325fff5ff6c71e6e3305fb90a74a1e396 100644
--- a/lib/Transforms/Utils/InlineFunction.cpp
+++ b/lib/Transforms/Utils/InlineFunction.cpp
@@ -25,6 +25,7 @@
 #include "llvm/Analysis/CaptureTracking.h"
 #include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/CallSite.h"
@@ -1108,26 +1109,23 @@ static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) {
   bool DTCalculated = false;
 
   Function *CalledFunc = CS.getCalledFunction();
-  for (Function::arg_iterator I = CalledFunc->arg_begin(),
-                              E = CalledFunc->arg_end();
-       I != E; ++I) {
-    unsigned Align = I->getType()->isPointerTy() ? I->getParamAlignment() : 0;
-    if (Align && !I->hasByValOrInAllocaAttr() && !I->hasNUses(0)) {
+  for (Argument &Arg : CalledFunc->args()) {
+    unsigned Align = Arg.getType()->isPointerTy() ? Arg.getParamAlignment() : 0;
+    if (Align && !Arg.hasByValOrInAllocaAttr() && !Arg.hasNUses(0)) {
       if (!DTCalculated) {
-        DT.recalculate(const_cast<Function&>(*CS.getInstruction()->getParent()
-                                               ->getParent()));
+        DT.recalculate(*CS.getCaller());
         DTCalculated = true;
       }
 
       // If we can already prove the asserted alignment in the context of the
       // caller, then don't bother inserting the assumption.
-      Value *Arg = CS.getArgument(I->getArgNo());
-      if (getKnownAlignment(Arg, DL, CS.getInstruction(), AC, &DT) >= Align)
+      Value *ArgVal = CS.getArgument(Arg.getArgNo());
+      if (getKnownAlignment(ArgVal, DL, CS.getInstruction(), AC, &DT) >= Align)
         continue;
 
-      CallInst *NewAssumption = IRBuilder<>(CS.getInstruction())
-                                    .CreateAlignmentAssumption(DL, Arg, Align);
-      AC->registerAssumption(NewAssumption);
+      CallInst *NewAsmp = IRBuilder<>(CS.getInstruction())
+                              .CreateAlignmentAssumption(DL, ArgVal, Align);
+      AC->registerAssumption(NewAsmp);
     }
   }
 }
@@ -1141,7 +1139,7 @@ static void UpdateCallGraphAfterInlining(CallSite CS,
                                          ValueToValueMapTy &VMap,
                                          InlineFunctionInfo &IFI) {
   CallGraph &CG = *IFI.CG;
-  const Function *Caller = CS.getInstruction()->getParent()->getParent();
+  const Function *Caller = CS.getCaller();
   const Function *Callee = CS.getCalledFunction();
   CallGraphNode *CalleeNode = CG[Callee];
   CallGraphNode *CallerNode = CG[Caller];
@@ -1226,7 +1224,8 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
   PointerType *ArgTy = cast<PointerType>(Arg->getType());
   Type *AggTy = ArgTy->getElementType();
 
-  Function *Caller = TheCall->getParent()->getParent();
+  Function *Caller = TheCall->getFunction();
+  const DataLayout &DL = Caller->getParent()->getDataLayout();
 
   // If the called function is readonly, then it could not mutate the caller's
   // copy of the byval'd memory.  In this case, it is safe to elide the copy and
@@ -1240,31 +1239,30 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
 
     AssumptionCache *AC =
         IFI.GetAssumptionCache ? &(*IFI.GetAssumptionCache)(*Caller) : nullptr;
-    const DataLayout &DL = Caller->getParent()->getDataLayout();
 
     // If the pointer is already known to be sufficiently aligned, or if we can
     // round it up to a larger alignment, then we don't need a temporary.
     if (getOrEnforceKnownAlignment(Arg, ByValAlignment, DL, TheCall, AC) >=
         ByValAlignment)
       return Arg;
-    
+
     // Otherwise, we have to make a memcpy to get a safe alignment.  This is bad
     // for code quality, but rarely happens and is required for correctness.
   }
 
   // Create the alloca.  If we have DataLayout, use nice alignment.
-  unsigned Align =
-      Caller->getParent()->getDataLayout().getPrefTypeAlignment(AggTy);
+  unsigned Align = DL.getPrefTypeAlignment(AggTy);
 
   // If the byval had an alignment specified, we *must* use at least that
   // alignment, as it is required by the byval argument (and uses of the
   // pointer inside the callee).
   Align = std::max(Align, ByValAlignment);
-  
-  Value *NewAlloca = new AllocaInst(AggTy, nullptr, Align, Arg->getName(), 
+
+  Value *NewAlloca = new AllocaInst(AggTy, DL.getAllocaAddrSpace(),
+                                    nullptr, Align, Arg->getName(),
                                     &*Caller->begin()->begin());
   IFI.StaticAllocas.push_back(cast<AllocaInst>(NewAlloca));
-  
+
   // Uses of the argument in the function should use our new alloca
   // instead.
   return NewAlloca;
@@ -1411,9 +1409,16 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock,
       continue;
     auto *OrigBB = cast<BasicBlock>(Entry.first);
     auto *ClonedBB = cast<BasicBlock>(Entry.second);
-    ClonedBBs.insert(ClonedBB);
-    CallerBFI->setBlockFreq(ClonedBB,
-                            CalleeBFI->getBlockFreq(OrigBB).getFrequency());
+    uint64_t Freq = CalleeBFI->getBlockFreq(OrigBB).getFrequency();
+    if (!ClonedBBs.insert(ClonedBB).second) {
+      // Multiple blocks in the callee might get mapped to one cloned block in
+      // the caller since we prune the callee as we clone it. When that happens,
+      // we want to use the maximum among the original blocks' frequencies.
+      uint64_t NewFreq = CallerBFI->getBlockFreq(ClonedBB).getFrequency();
+      if (NewFreq > Freq)
+        Freq = NewFreq;
+    }
+    CallerBFI->setBlockFreq(ClonedBB, Freq);
   }
   BasicBlock *EntryClone = cast<BasicBlock>(VMap.lookup(&CalleeEntryBlock));
   CallerBFI->setBlockFreqAndScale(
@@ -1421,28 +1426,54 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock,
       ClonedBBs);
 }
 
+/// Update the branch metadata for cloned call instructions.
+static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
+                              const Optional<uint64_t> &CalleeEntryCount,
+                              const Instruction *TheCall) {
+  if (!CalleeEntryCount.hasValue() || CalleeEntryCount.getValue() < 1)
+    return;
+  Optional<uint64_t> CallSiteCount =
+      ProfileSummaryInfo::getProfileCount(TheCall, nullptr);
+  uint64_t CallCount =
+      std::min(CallSiteCount.hasValue() ? CallSiteCount.getValue() : 0,
+               CalleeEntryCount.getValue());
+
+  for (auto const &Entry : VMap)
+    if (isa<CallInst>(Entry.first))
+      if (auto *CI = dyn_cast_or_null<CallInst>(Entry.second))
+        CI->updateProfWeight(CallCount, CalleeEntryCount.getValue());
+  for (BasicBlock &BB : *Callee)
+    // No need to update the callsite if it is pruned during inlining.
+    if (VMap.count(&BB))
+      for (Instruction &I : BB)
+        if (CallInst *CI = dyn_cast<CallInst>(&I))
+          CI->updateProfWeight(CalleeEntryCount.getValue() - CallCount,
+                               CalleeEntryCount.getValue());
+}
+
 /// Update the entry count of callee after inlining.
 ///
 /// The callsite's block count is subtracted from the callee's function entry
 /// count.
-static void updateCalleeCount(BlockFrequencyInfo &CallerBFI, BasicBlock *CallBB,
-                              Function *Callee) {
+static void updateCalleeCount(BlockFrequencyInfo *CallerBFI, BasicBlock *CallBB,
+                              Instruction *CallInst, Function *Callee) {
   // If the callee has a original count of N, and the estimated count of
   // callsite is M, the new callee count is set to N - M. M is estimated from
   // the caller's entry count, its entry block frequency and the block frequency
   // of the callsite.
   Optional<uint64_t> CalleeCount = Callee->getEntryCount();
-  if (!CalleeCount)
+  if (!CalleeCount.hasValue())
     return;
-  Optional<uint64_t> CallSiteCount = CallerBFI.getBlockProfileCount(CallBB);
-  if (!CallSiteCount)
+  Optional<uint64_t> CallCount =
+      ProfileSummaryInfo::getProfileCount(CallInst, CallerBFI);
+  if (!CallCount.hasValue())
     return;
-  // Since CallSiteCount is an estimate, it could exceed the original callee
+  // Since CallCount is an estimate, it could exceed the original callee
   // count and has to be set to 0.
-  if (CallSiteCount.getValue() > CalleeCount.getValue())
+  if (CallCount.getValue() > CalleeCount.getValue())
     Callee->setEntryCount(0);
   else
-    Callee->setEntryCount(CalleeCount.getValue() - CallSiteCount.getValue());
+    Callee->setEntryCount(CalleeCount.getValue() - CallCount.getValue());
 }
 
 /// This function inlines the called function into the basic block of the
@@ -1456,8 +1487,8 @@ static void updateCalleeCount(BlockFrequencyInfo &CallerBFI, BasicBlock *CallBB,
 bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
                           AAResults *CalleeAAR, bool InsertLifetime) {
   Instruction *TheCall = CS.getInstruction();
-  assert(TheCall->getParent() && TheCall->getParent()->getParent() &&
-         "Instruction not in function!");
+  assert(TheCall->getParent() && TheCall->getFunction()
+         && "Instruction not in function!");
 
   // If IFI has any state in it, zap it before we fill it in.
   IFI.reset();
@@ -1599,7 +1630,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
     // matches up the formal to the actual argument values.
     CallSite::arg_iterator AI = CS.arg_begin();
     unsigned ArgNo = 0;
-    for (Function::const_arg_iterator I = CalledFunc->arg_begin(),
+    for (Function::arg_iterator I = CalledFunc->arg_begin(),
          E = CalledFunc->arg_end(); I != E; ++I, ++AI, ++ArgNo) {
       Value *ActualArg = *AI;
 
@@ -1632,13 +1663,14 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
     // Remember the first block that is newly cloned over.
     FirstNewBlock = LastBlock; ++FirstNewBlock;
 
-    if (IFI.CallerBFI != nullptr && IFI.CalleeBFI != nullptr) {
+    if (IFI.CallerBFI != nullptr && IFI.CalleeBFI != nullptr)
       // Update the BFI of blocks cloned into the caller.
       updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI,
                       CalledFunc->front());
-      // Update the profile count of callee.
-      updateCalleeCount(*IFI.CallerBFI, OrigBB, CalledFunc);
-    }
+
+    updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall);
+    // Update the profile count of callee.
+    updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc);
 
     // Inject byval arguments initialization.
     for (std::pair<Value*, Value*> &Init : ByValInit)
diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp
index 9e217fec20cbeb6537ef01effb95fb9a1e93310d..18b29226c2ef5adc9983c61d5d0ab01b394c3fe7 100644
--- a/lib/Transforms/Utils/Local.cpp
+++ b/lib/Transforms/Utils/Local.cpp
@@ -126,21 +126,20 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
     // If the default is unreachable, ignore it when searching for TheOnlyDest.
     if (isa<UnreachableInst>(DefaultDest->getFirstNonPHIOrDbg()) &&
         SI->getNumCases() > 0) {
-      TheOnlyDest = SI->case_begin().getCaseSuccessor();
+      TheOnlyDest = SI->case_begin()->getCaseSuccessor();
     }
 
     // Figure out which case it goes to.
-    for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end();
-         i != e; ++i) {
+    for (auto i = SI->case_begin(), e = SI->case_end(); i != e;) {
       // Found case matching a constant operand?
-      if (i.getCaseValue() == CI) {
-        TheOnlyDest = i.getCaseSuccessor();
+      if (i->getCaseValue() == CI) {
+        TheOnlyDest = i->getCaseSuccessor();
         break;
       }
 
       // Check to see if this branch is going to the same place as the default
       // dest.  If so, eliminate it as an explicit compare.
-      if (i.getCaseSuccessor() == DefaultDest) {
+      if (i->getCaseSuccessor() == DefaultDest) {
         MDNode *MD = SI->getMetadata(LLVMContext::MD_prof);
         unsigned NCases = SI->getNumCases();
         // Fold the case metadata into the default if there will be any branches
@@ -154,7 +153,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
             Weights.push_back(CI->getValue().getZExtValue());
           }
           // Merge weight of this case to the default weight.
-          unsigned idx = i.getCaseIndex();
+          unsigned idx = i->getCaseIndex();
           Weights[0] += Weights[idx+1];
           // Remove weight for this case.
           std::swap(Weights[idx+1], Weights.back());
@@ -165,15 +164,19 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
         }
         // Remove this entry.
         DefaultDest->removePredecessor(SI->getParent());
-        SI->removeCase(i);
-        --i; --e;
+        i = SI->removeCase(i);
+        e = SI->case_end();
         continue;
       }
 
       // Otherwise, check to see if the switch only branches to one destination.
       // We do this by reseting "TheOnlyDest" to null when we find two non-equal
       // destinations.
-      if (i.getCaseSuccessor() != TheOnlyDest) TheOnlyDest = nullptr;
+      if (i->getCaseSuccessor() != TheOnlyDest)
+        TheOnlyDest = nullptr;
+
+      // Increment this iterator as we haven't removed the case.
+      ++i;
     }
 
     if (CI && !TheOnlyDest) {
@@ -209,7 +212,7 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
     if (SI->getNumCases() == 1) {
       // Otherwise, we can fold this switch into a conditional branch
       // instruction if it has only one non-default destination.
-      SwitchInst::CaseIt FirstCase = SI->case_begin();
+      auto FirstCase = *SI->case_begin();
       Value *Cond = Builder.CreateICmpEQ(SI->getCondition(),
           FirstCase.getCaseValue(), "cond");
 
@@ -287,7 +290,15 @@ bool llvm::ConstantFoldTerminator(BasicBlock *BB, bool DeleteDeadConditions,
 ///
 bool llvm::isInstructionTriviallyDead(Instruction *I,
                                       const TargetLibraryInfo *TLI) {
-  if (!I->use_empty() || isa<TerminatorInst>(I)) return false;
+  if (!I->use_empty())
+    return false;
+  return wouldInstructionBeTriviallyDead(I, TLI);
+}
+
+bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
+                                           const TargetLibraryInfo *TLI) {
+  if (isa<TerminatorInst>(I))
+    return false;
 
   // We don't want the landingpad-like instructions removed by anything this
   // general.
@@ -307,7 +318,8 @@ bool llvm::isInstructionTriviallyDead(Instruction *I,
     return true;
   }
 
-  if (!I->mayHaveSideEffects()) return true;
+  if (!I->mayHaveSideEffects())
+    return true;
 
   // Special case intrinsics that "may have side effects" but can be deleted
   // when dead.
@@ -334,7 +346,8 @@ bool llvm::isInstructionTriviallyDead(Instruction *I,
     }
   }
 
-  if (isAllocLikeFn(I, TLI)) return true;
+  if (isAllocLikeFn(I, TLI))
+    return true;
 
   if (CallInst *CI = isFreeCall(I, TLI))
     if (Constant *C = dyn_cast<Constant>(CI->getArgOperand(0)))
@@ -1075,11 +1088,11 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar,
-  // Since we can't guarantee that the original dbg.declare instrinsic
+  // Since we can't guarantee that the original dbg.declare intrinsic
   // is removed by LowerDbgDeclare(), we need to make sure that we are
   // not inserting the same dbg.value intrinsic over and over.
-  DbgValueList DbgValues;
-  FindAllocaDbgValues(DbgValues, APN);
-  for (auto DVI : DbgValues) {
-    assert (DVI->getValue() == APN);
-    assert (DVI->getOffset() == 0);
+  SmallVector<DbgValueInst *, 1> DbgValues;
+  findDbgValues(DbgValues, APN);
+  for (auto *DVI : DbgValues) {
+    assert(DVI->getValue() == APN);
+    assert(DVI->getOffset() == 0);
     if ((DVI->getVariable() == DIVar) && (DVI->getExpression() == DIExpr))
       return true;
   }
@@ -1241,9 +1254,7 @@ DbgDeclareInst *llvm::FindAllocaDbgDeclare(Value *V) {
   return nullptr;
 }
 
-/// FindAllocaDbgValues - Finds the llvm.dbg.value intrinsics describing the
-/// alloca 'V', if any.
-void llvm::FindAllocaDbgValues(DbgValueList &DbgValues, Value *V) {
+void llvm::findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V) {
   if (auto *L = LocalAsMetadata::getIfExists(V))
     if (auto *MDV = MetadataAsValue::getIfExists(V->getContext(), L))
       for (User *U : MDV->users())
@@ -1251,36 +1262,32 @@ void llvm::FindAllocaDbgValues(DbgValueList &DbgValues, Value *V) {
           DbgValues.push_back(DVI);
 }
 
-static void DIExprAddDeref(SmallVectorImpl<uint64_t> &Expr) {
-  Expr.push_back(dwarf::DW_OP_deref);
-}
-
-static void DIExprAddOffset(SmallVectorImpl<uint64_t> &Expr, int Offset) {
+static void appendOffset(SmallVectorImpl<uint64_t> &Ops, int64_t Offset) {
   if (Offset > 0) {
-    Expr.push_back(dwarf::DW_OP_plus);
-    Expr.push_back(Offset);
+    Ops.push_back(dwarf::DW_OP_plus);
+    Ops.push_back(Offset);
   } else if (Offset < 0) {
-    Expr.push_back(dwarf::DW_OP_minus);
-    Expr.push_back(-Offset);
+    Ops.push_back(dwarf::DW_OP_minus);
+    Ops.push_back(-Offset);
   }
 }
 
-static DIExpression *BuildReplacementDIExpr(DIBuilder &Builder,
-                                            DIExpression *DIExpr, bool Deref,
-                                            int Offset) {
+/// Prepend \p DIExpr with a deref and offset operation.
+static DIExpression *prependDIExpr(DIBuilder &Builder, DIExpression *DIExpr,
+                                   bool Deref, int64_t Offset) {
   if (!Deref && !Offset)
     return DIExpr;
   // Create a copy of the original DIDescriptor for user variable, prepending
   // "deref" operation to a list of address elements, as new llvm.dbg.declare
   // will take a value storing address of the memory for variable, not
   // alloca itself.
-  SmallVector<uint64_t, 4> NewDIExpr;
+  SmallVector<uint64_t, 4> Ops;
   if (Deref)
-    DIExprAddDeref(NewDIExpr);
-  DIExprAddOffset(NewDIExpr, Offset);
+    Ops.push_back(dwarf::DW_OP_deref);
+  appendOffset(Ops, Offset);
   if (DIExpr)
-    NewDIExpr.append(DIExpr->elements_begin(), DIExpr->elements_end());
-  return Builder.createExpression(NewDIExpr);
+    Ops.append(DIExpr->elements_begin(), DIExpr->elements_end());
+  return Builder.createExpression(Ops);
 }
 
 bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
@@ -1294,7 +1301,7 @@ bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
   auto *DIExpr = DDI->getExpression();
   assert(DIVar && "Missing variable");
 
-  DIExpr = BuildReplacementDIExpr(Builder, DIExpr, Deref, Offset);
+  DIExpr = prependDIExpr(Builder, DIExpr, Deref, Offset);
 
   // Insert llvm.dbg.declare immediately after the original alloca, and remove
   // old llvm.dbg.declare.
@@ -1326,11 +1333,11 @@ static void replaceOneDbgValueForAlloca(DbgValueInst *DVI, Value *NewAddress,
   // Insert the offset immediately after the first deref.
   // We could just change the offset argument of dbg.value, but it's unsigned...
   if (Offset) {
-    SmallVector<uint64_t, 4> NewDIExpr;
-    DIExprAddDeref(NewDIExpr);
-    DIExprAddOffset(NewDIExpr, Offset);
-    NewDIExpr.append(DIExpr->elements_begin() + 1, DIExpr->elements_end());
-    DIExpr = Builder.createExpression(NewDIExpr);
+    SmallVector<uint64_t, 4> Ops;
+    Ops.push_back(dwarf::DW_OP_deref);
+    appendOffset(Ops, Offset);
+    Ops.append(DIExpr->elements_begin() + 1, DIExpr->elements_end());
+    DIExpr = Builder.createExpression(Ops);
   }
 
   Builder.insertDbgValueIntrinsic(NewAddress, DVI->getOffset(), DIVar, DIExpr,
@@ -1349,6 +1356,53 @@ void llvm::replaceDbgValueForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
       }
 }
 
+void llvm::salvageDebugInfo(Instruction &I) {
+  SmallVector<DbgValueInst *, 1> DbgValues;
+  auto &M = *I.getModule();
+
+  auto MDWrap = [&](Value *V) {
+    return MetadataAsValue::get(I.getContext(), ValueAsMetadata::get(V));
+  };
+
+  if (isa<BitCastInst>(&I)) {
+    findDbgValues(DbgValues, &I);
+    for (auto *DVI : DbgValues) {
+      // Bitcasts are entirely irrelevant for debug info. Rewrite the dbg.value
+      // to use the cast's source.
+      DVI->setOperand(0, MDWrap(I.getOperand(0)));
+      DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
+    }
+  } else if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
+    findDbgValues(DbgValues, &I);
+    for (auto *DVI : DbgValues) {
+      unsigned BitWidth =
+          M.getDataLayout().getPointerSizeInBits(GEP->getPointerAddressSpace());
+      APInt Offset(BitWidth, 0);
+      // Rewrite a constant GEP into a DIExpression.
+      if (GEP->accumulateConstantOffset(M.getDataLayout(), Offset)) {
+        auto *DIExpr = DVI->getExpression();
+        DIBuilder DIB(M, /*AllowUnresolved*/ false);
+        // GEP offsets are i32 and thus always fit into an int64_t.
+        DIExpr = prependDIExpr(DIB, DIExpr, NoDeref, Offset.getSExtValue());
+        DVI->setOperand(0, MDWrap(I.getOperand(0)));
+        DVI->setOperand(3, MetadataAsValue::get(I.getContext(), DIExpr));
+        DEBUG(dbgs() << "SALVAGE: " << *DVI << '\n');
+      }
+    }
+  } else if (isa<LoadInst>(&I)) {
+    findDbgValues(DbgValues, &I);
+    for (auto *DVI : DbgValues) {
+      // Rewrite the load into DW_OP_deref.
+      auto *DIExpr = DVI->getExpression();
+      DIBuilder DIB(M, /*AllowUnresolved*/ false);
+      DIExpr = prependDIExpr(DIB, DIExpr, WithDeref, 0);
+      DVI->setOperand(0, MDWrap(I.getOperand(0)));
+      DVI->setOperand(3, MetadataAsValue::get(I.getContext(), DIExpr));
+      DEBUG(dbgs() << "SALVAGE:  " << *DVI << '\n');
+    }
+  }
+}
+
 unsigned llvm::removeAllNonTerminatorAndEHPadInstructions(BasicBlock *BB) {
   unsigned NumDeadInst = 0;
   // Delete the instructions backwards, as it has a reduced likelihood of
@@ -2072,5 +2126,5 @@ void llvm::maybeMarkSanitizerLibraryCallNoBuiltin(
   if (F && !F->hasLocalLinkage() && F->hasName() &&
       TLI->getLibFunc(F->getName(), Func) && TLI->hasOptimizedCodeGen(Func) &&
       !F->doesNotAccessMemory())
-    CI->addAttribute(AttributeSet::FunctionIndex, Attribute::NoBuiltin);
+    CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoBuiltin);
 }
diff --git a/lib/Transforms/Utils/LoopSimplify.cpp b/lib/Transforms/Utils/LoopSimplify.cpp
index 85bdeecb79cc637ee1ee521c8015f237b5f011b6..e7ba19665d5917d0151559754cea897492f22d40 100644
--- a/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/lib/Transforms/Utils/LoopSimplify.cpp
@@ -645,14 +645,7 @@ ReprocessLoop:
   // loop-invariant instructions out of the way to open up more
   // opportunities, and the disadvantage of having the responsibility
   // to preserve dominator information.
-  bool UniqueExit = true;
-  if (!ExitBlocks.empty())
-    for (unsigned i = 1, e = ExitBlocks.size(); i != e; ++i)
-      if (ExitBlocks[i] != ExitBlocks[0]) {
-        UniqueExit = false;
-        break;
-      }
-  if (UniqueExit) {
+  if (ExitBlockSet.size() == 1) {
     for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
       BasicBlock *ExitingBlock = ExitingBlocks[i];
       if (!ExitingBlock->getSinglePredecessor()) continue;
diff --git a/lib/Transforms/Utils/LoopUnroll.cpp b/lib/Transforms/Utils/LoopUnroll.cpp
index 4fc7f410412560d33c1ce9e30d8048d74b82eaaa..3c669ce644e204da4e283430cf463ab993a23ebb 100644
--- a/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/lib/Transforms/Utils/LoopUnroll.cpp
@@ -27,6 +27,7 @@
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/LLVMContext.h"
@@ -215,6 +216,45 @@ const Loop* llvm::addClonedBlockToLoopInfo(BasicBlock *OriginalBB,
   }
 }
 
+/// The function chooses which type of unroll (epilog or prolog) is more
+/// profitable.
+/// Epilog unroll is more profitable when there is a PHI that starts from a
+/// constant.  In this case epilog will leave the PHI starting from a constant,
+/// but prolog will convert it to non-constant.
+///
+/// loop:
+///   PN = PHI [I, Latch], [CI, PreHeader]
+///   I = foo(PN)
+///   ...
+///
+/// Epilog unroll case.
+/// loop:
+///   PN = PHI [I2, Latch], [CI, PreHeader]
+///   I1 = foo(PN)
+///   I2 = foo(I1)
+///   ...
+/// Prolog unroll case.
+///   NewPN = PHI [PrologI, Prolog], [CI, PreHeader]
+/// loop:
+///   PN = PHI [I2, Latch], [NewPN, PreHeader]
+///   I1 = foo(PN)
+///   I2 = foo(I1)
+///   ...
+///
+static bool isEpilogProfitable(Loop *L) {
+  BasicBlock *PreHeader = L->getLoopPreheader();
+  BasicBlock *Header = L->getHeader();
+  assert(PreHeader && Header);
+  for (Instruction &BBI : *Header) {
+    PHINode *PN = dyn_cast<PHINode>(&BBI);
+    if (!PN)
+      break;
+    if (isa<ConstantInt>(PN->getIncomingValueForBlock(PreHeader)))
+      return true;
+  }
+  return false;
+}
+
 /// Unroll the given loop by Count. The loop must be in LCSSA form. Returns true
 /// if unrolling was successful, or false if the loop was unmodified. Unrolling
 /// can only fail when the loop's latch block is not terminated by a conditional
@@ -358,15 +398,19 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
                "convergent operation.");
       });
 
+  bool EpilogProfitability =
+      UnrollRuntimeEpilog.getNumOccurrences() ? UnrollRuntimeEpilog
+                                              : isEpilogProfitable(L);
+
   if (RuntimeTripCount && TripMultiple % Count != 0 &&
       !UnrollRuntimeLoopRemainder(L, Count, AllowExpensiveTripCount,
-                                  UnrollRuntimeEpilog, LI, SE, DT, 
+                                  EpilogProfitability, LI, SE, DT,
                                   PreserveLCSSA)) {
     if (Force)
       RuntimeTripCount = false;
     else {
       DEBUG(
-          dbgs() << "Wont unroll; prolog and epilog code could not be inserted "
+          dbgs() << "Wont unroll; remainder loop could not be generated"
                     "when assuming runtime trip count\n");
       return false;
     }
@@ -462,6 +506,12 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
   for (Loop *SubLoop : *L)
     LoopsToSimplify.insert(SubLoop);
 
+  if (Header->getParent()->isDebugInfoForProfiling())
+    for (BasicBlock *BB : L->getBlocks())
+      for (Instruction &I : *BB)
+        if (const DILocation *DIL = I.getDebugLoc())
+          I.setDebugLoc(DIL->cloneWithDuplicationFactor(Count));
+
   for (unsigned It = 1; It != Count; ++It) {
     std::vector<BasicBlock*> NewBlocks;
     SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
@@ -472,19 +522,16 @@ bool llvm::UnrollLoop(Loop *L, unsigned Count, unsigned TripCount, bool Force,
       BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
       Header->getParent()->getBasicBlockList().push_back(New);
 
+      assert((*BB != Header || LI->getLoopFor(*BB) == L) &&
+             "Header should not be in a sub-loop");
       // Tell LI about New.
-      if (*BB == Header) {
-        assert(LI->getLoopFor(*BB) == L && "Header should not be in a sub-loop");
-        L->addBasicBlockToLoop(New, *LI);
-      } else {
-        const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
-        if (OldLoop) {
-          LoopsToSimplify.insert(NewLoops[OldLoop]);
+      const Loop *OldLoop = addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
+      if (OldLoop) {
+        LoopsToSimplify.insert(NewLoops[OldLoop]);
 
-          // Forget the old loop, since its inputs may have changed.
-          if (SE)
-            SE->forgetLoop(OldLoop);
-        }
+        // Forget the old loop, since its inputs may have changed.
+        if (SE)
+          SE->forgetLoop(OldLoop);
       }
 
       if (*BB == Header)
diff --git a/lib/Transforms/Utils/LoopUnrollPeel.cpp b/lib/Transforms/Utils/LoopUnrollPeel.cpp
index 3cb9e6e2e02fe61a02903030eefee696f3b19f1c..73c14f5606b73035e2f046136ddd03ec191ac5ea 100644
--- a/lib/Transforms/Utils/LoopUnrollPeel.cpp
+++ b/lib/Transforms/Utils/LoopUnrollPeel.cpp
@@ -56,12 +56,20 @@ static bool canPeel(Loop *L) {
   if (!L->getExitingBlock() || !L->getUniqueExitBlock())
     return false;
 
+  // Don't try to peel loops where the latch is not the exiting block.
+  // This can be an indication of two different things:
+  // 1) The loop is not rotated.
+  // 2) The loop contains irreducible control flow that involves the latch.
+  if (L->getLoopLatch() != L->getExitingBlock())
+    return false;
+
   return true;
 }
 
 // Return the number of iterations we want to peel off.
 void llvm::computePeelCount(Loop *L, unsigned LoopSize,
-                            TargetTransformInfo::UnrollingPreferences &UP) {
+                            TargetTransformInfo::UnrollingPreferences &UP,
+                            unsigned &TripCount) {
   UP.PeelCount = 0;
   if (!canPeel(L))
     return;
@@ -70,6 +78,39 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
   if (!L->empty())
     return;
 
+  // Try to find a Phi node that has the same loop invariant as an input from
+  // its only back edge. If there is such Phi, peeling 1 iteration from the
+  // loop is profitable, because starting from 2nd iteration we will have an
+  // invariant instead of this Phi.
+  if (LoopSize <= UP.Threshold) {
+    BasicBlock *BackEdge = L->getLoopLatch();
+    assert(BackEdge && "Loop is not in simplified form?");
+    BasicBlock *Header = L->getHeader();
+    // Iterate over Phis to find one with invariant input on back edge.
+    bool FoundCandidate = false;
+    PHINode *Phi;
+    for (auto BI = Header->begin(); isa<PHINode>(&*BI); ++BI) {
+      Phi = cast<PHINode>(&*BI);
+      Value *Input = Phi->getIncomingValueForBlock(BackEdge);
+      if (L->isLoopInvariant(Input)) {
+        FoundCandidate = true;
+        break;
+      }
+    }
+    if (FoundCandidate) {
+      DEBUG(dbgs() << "Peel one iteration to get rid of " << *Phi
+                   << " because starting from 2nd iteration it is always"
+                   << " an invariant\n");
+      UP.PeelCount = 1;
+      return;
+    }
+  }
+
+  // Bail if we know the statically calculated trip count.
+  // In this case we rather prefer partial unrolling.
+  if (TripCount)
+    return;
+
   // If the user provided a peel count, use that.
   bool UserPeelCount = UnrollForcePeelCount.getNumOccurrences() > 0;
   if (UserPeelCount) {
@@ -374,6 +415,11 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
 
     cloneLoopBlocks(L, Iter, InsertTop, InsertBot, Exit,
                     NewBlocks, LoopBlocks, VMap, LVMap, DT, LI);
+
+    // Remap to use values from the current iteration instead of the
+    // previous one.
+    remapInstructionsInBlocks(NewBlocks, VMap);
+
     if (DT) {
       // Latches of the cloned loops dominate over the loop exit, so idom of the
       // latter is the first cloned loop body, as original PreHeader dominates
@@ -396,10 +442,6 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
     F->getBasicBlockList().splice(InsertTop->getIterator(),
                                   F->getBasicBlockList(),
                                   NewBlocks[0]->getIterator(), F->end());
-
-    // Remap to use values from the current iteration instead of the
-    // previous one.
-    remapInstructionsInBlocks(NewBlocks, VMap);
   }
 
   // Now adjust the phi nodes in the loop header to get their initial values
diff --git a/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index 8406c563a9c274733d2bdd020af6293eb1ab1108..85db734fb182755cda9ea1025a5f01c3c58b5cfb 100644
--- a/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -300,20 +300,10 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter,
   Function *F = Header->getParent();
   LoopBlocksDFS::RPOIterator BlockBegin = LoopBlocks.beginRPO();
   LoopBlocksDFS::RPOIterator BlockEnd = LoopBlocks.endRPO();
-  Loop *NewLoop = nullptr;
   Loop *ParentLoop = L->getParentLoop();
-  if (CreateRemainderLoop) {
-    NewLoop = new Loop();
-    if (ParentLoop)
-      ParentLoop->addChildLoop(NewLoop);
-    else
-      LI->addTopLevelLoop(NewLoop);
-  }
-
   NewLoopsMap NewLoops;
-  if (NewLoop)
-    NewLoops[L] = NewLoop;
-  else if (ParentLoop)
+  NewLoops[ParentLoop] = ParentLoop;
+  if (!CreateRemainderLoop)
     NewLoops[L] = ParentLoop;
 
   // For each block in the original loop, create a new copy,
@@ -321,7 +311,7 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter,
   for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
     BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, "." + suffix, F);
     NewBlocks.push_back(NewBB);
-   
+
     // If we're unrolling the outermost loop, there's no remainder loop,
     // and this block isn't in a nested loop, then the new block is not
     // in any loop. Otherwise, add it to loopinfo.
@@ -396,7 +386,9 @@ static void CloneLoopBlocks(Loop *L, Value *NewIter,
         NewPHI->setIncomingValue(idx, V);
     }
   }
-  if (NewLoop) {
+  if (CreateRemainderLoop) {
+    Loop *NewLoop = NewLoops[L];
+    assert(NewLoop && "L should have been cloned");
     // Add unroll disable metadata to disable future unrolling for this loop.
     SmallVector<Metadata *, 4> MDs;
     // Reserve first location for self reference to the LoopID metadata node.
diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp
index b5e6418ad3251f204ea2a7be2a92bb4f7262daf8..444bc16e0a1567ecc3d45ea9ba19fe588c2e6d76 100644
--- a/lib/Transforms/Utils/LoopUtils.cpp
+++ b/lib/Transforms/Utils/LoopUtils.cpp
@@ -553,13 +553,23 @@ bool RecurrenceDescriptor::isFirstOrderRecurrence(PHINode *Phi, Loop *TheLoop,
   if (!Previous || !TheLoop->contains(Previous) || isa<PHINode>(Previous))
     return false;
 
-  // Ensure every user of the phi node is dominated by the previous value. The
-  // dominance requirement ensures the loop vectorizer will not need to
-  // vectorize the initial value prior to the first iteration of the loop.
   for (User *U : Phi->users())
-    if (auto *I = dyn_cast<Instruction>(U))
+    if (auto *I = dyn_cast<Instruction>(U)) {
+      // Ensure every user of the phi node is dominated by the previous value.
+      // The dominance requirement ensures the loop vectorizer will not need to
+      // vectorize the initial value prior to the first iteration of the loop.
       if (!DT->dominates(Previous, I))
         return false;
+      // When the phi node has users outside the loop, the current logic for
+      // fixFirstOrderRecurrences may generate incorrect code. Specifically, we
+      // extract the last element from the vectorized phi, which would be the
+      // update to the phi before exiting the loop. However, what we want is the
+      // previous phi value before the update (i.e. the second last update
+      // before end of the vectorized loop).
+      // See added test cases in first-order-recurrence.ll
+      if (!TheLoop->contains(I))
+        return false;
+    }
 
   return true;
 }
diff --git a/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/lib/Transforms/Utils/LowerMemIntrinsics.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..c7cb561b5e21d0699a927870de7543fc57396c0c
--- /dev/null
+++ b/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -0,0 +1,231 @@
+//===- LowerMemIntrinsics.cpp ----------------------------------*- C++ -*--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/LowerMemIntrinsics.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
+
+using namespace llvm;
+
+void llvm::createMemCpyLoop(Instruction *InsertBefore,
+                            Value *SrcAddr, Value *DstAddr, Value *CopyLen,
+                            unsigned SrcAlign, unsigned DestAlign,
+                            bool SrcIsVolatile, bool DstIsVolatile) {
+  Type *TypeOfCopyLen = CopyLen->getType();
+
+  BasicBlock *OrigBB = InsertBefore->getParent();
+  Function *F = OrigBB->getParent();
+  BasicBlock *NewBB =
+    InsertBefore->getParent()->splitBasicBlock(InsertBefore, "split");
+  BasicBlock *LoopBB = BasicBlock::Create(F->getContext(), "loadstoreloop",
+                                          F, NewBB);
+
+  OrigBB->getTerminator()->setSuccessor(0, LoopBB);
+  IRBuilder<> Builder(OrigBB->getTerminator());
+
+  // SrcAddr and DstAddr are expected to be pointer types,
+  // so no check is made here.
+  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+
+  // Cast pointers to (char *)
+  SrcAddr = Builder.CreateBitCast(SrcAddr, Builder.getInt8PtrTy(SrcAS));
+  DstAddr = Builder.CreateBitCast(DstAddr, Builder.getInt8PtrTy(DstAS));
+
+  IRBuilder<> LoopBuilder(LoopBB);
+  PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+  LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
+
+  // load from SrcAddr+LoopIndex
+  // TODO: we can leverage the align parameter of llvm.memcpy for more efficient
+  // word-sized loads and stores.
+  Value *Element =
+    LoopBuilder.CreateLoad(LoopBuilder.CreateInBoundsGEP(
+                             LoopBuilder.getInt8Ty(), SrcAddr, LoopIndex),
+                           SrcIsVolatile);
+  // store at DstAddr+LoopIndex
+  LoopBuilder.CreateStore(Element,
+                          LoopBuilder.CreateInBoundsGEP(LoopBuilder.getInt8Ty(),
+                                                        DstAddr, LoopIndex),
+                          DstIsVolatile);
+
+  // The value for LoopIndex coming from backedge is (LoopIndex + 1)
+  Value *NewIndex =
+    LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
+  LoopIndex->addIncoming(NewIndex, LoopBB);
+
+  LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
+                           NewBB);
+}
+
+// Lower memmove to IR. memmove is required to correctly copy overlapping memory
+// regions; therefore, it has to check the relative positions of the source and
+// destination pointers and choose the copy direction accordingly.
+//
+// The code below is an IR rendition of this C function:
+//
+// void* memmove(void* dst, const void* src, size_t n) {
+//   unsigned char* d = dst;
+//   const unsigned char* s = src;
+//   if (s < d) {
+//     // copy backwards
+//     while (n--) {
+//       d[n] = s[n];
+//     }
+//   } else {
+//     // copy forward
+//     for (size_t i = 0; i < n; ++i) {
+//       d[i] = s[i];
+//     }
+//   }
+//   return dst;
+// }
+static void createMemMoveLoop(Instruction *InsertBefore,
+                              Value *SrcAddr, Value *DstAddr, Value *CopyLen,
+                              unsigned SrcAlign, unsigned DestAlign,
+                              bool SrcIsVolatile, bool DstIsVolatile) {
+  Type *TypeOfCopyLen = CopyLen->getType();
+  BasicBlock *OrigBB = InsertBefore->getParent();
+  Function *F = OrigBB->getParent();
+
+  // Create a comparison of src and dst, based on which we jump to either
+  // the forward-copy part of the function (if src >= dst) or the backwards-copy
+  // part (if src < dst).
+  // SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
+  // structure. Its block terminators (unconditional branches) are replaced by
+  // the appropriate conditional branches when the loop is built.
+  ICmpInst *PtrCompare = new ICmpInst(InsertBefore, ICmpInst::ICMP_ULT,
+                                      SrcAddr, DstAddr, "compare_src_dst");
+  TerminatorInst *ThenTerm, *ElseTerm;
+  SplitBlockAndInsertIfThenElse(PtrCompare, InsertBefore, &ThenTerm,
+                                &ElseTerm);
+
+  // Each part of the function consists of two blocks:
+  //   copy_backwards:        used to skip the loop when n == 0
+  //   copy_backwards_loop:   the actual backwards loop BB
+  //   copy_forward:          used to skip the loop when n == 0
+  //   copy_forward_loop:     the actual forward loop BB
+  BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
+  CopyBackwardsBB->setName("copy_backwards");
+  BasicBlock *CopyForwardBB = ElseTerm->getParent();
+  CopyForwardBB->setName("copy_forward");
+  BasicBlock *ExitBB = InsertBefore->getParent();
+  ExitBB->setName("memmove_done");
+
+  // Initial comparison of n == 0 that lets us skip the loops altogether. Shared
+  // between both backwards and forward copy clauses.
+  ICmpInst *CompareN =
+      new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, CopyLen,
+                   ConstantInt::get(TypeOfCopyLen, 0), "compare_n_to_0");
+
+  // Copying backwards.
+  BasicBlock *LoopBB =
+    BasicBlock::Create(F->getContext(), "copy_backwards_loop", F, CopyForwardBB);
+  IRBuilder<> LoopBuilder(LoopBB);
+  PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
+  Value *IndexPtr = LoopBuilder.CreateSub(
+      LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
+  Value *Element = LoopBuilder.CreateLoad(
+      LoopBuilder.CreateInBoundsGEP(SrcAddr, IndexPtr), "element");
+  LoopBuilder.CreateStore(Element,
+                          LoopBuilder.CreateInBoundsGEP(DstAddr, IndexPtr));
+  LoopBuilder.CreateCondBr(
+      LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
+      ExitBB, LoopBB);
+  LoopPhi->addIncoming(IndexPtr, LoopBB);
+  LoopPhi->addIncoming(CopyLen, CopyBackwardsBB);
+  BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm);
+  ThenTerm->eraseFromParent();
+
+  // Copying forward.
+  BasicBlock *FwdLoopBB =
+    BasicBlock::Create(F->getContext(), "copy_forward_loop", F, ExitBB);
+  IRBuilder<> FwdLoopBuilder(FwdLoopBB);
+  PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
+  Value *FwdElement = FwdLoopBuilder.CreateLoad(
+      FwdLoopBuilder.CreateInBoundsGEP(SrcAddr, FwdCopyPhi), "element");
+  FwdLoopBuilder.CreateStore(
+      FwdElement, FwdLoopBuilder.CreateInBoundsGEP(DstAddr, FwdCopyPhi));
+  Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
+      FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
+  FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
+                              ExitBB, FwdLoopBB);
+  FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
+  FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), CopyForwardBB);
+
+  BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm);
+  ElseTerm->eraseFromParent();
+}
+
+static void createMemSetLoop(Instruction *InsertBefore,
+                             Value *DstAddr, Value *CopyLen, Value *SetValue,
+                             unsigned Align, bool IsVolatile) {
+  BasicBlock *OrigBB = InsertBefore->getParent();
+  Function *F = OrigBB->getParent();
+  BasicBlock *NewBB =
+      OrigBB->splitBasicBlock(InsertBefore, "split");
+  BasicBlock *LoopBB
+    = BasicBlock::Create(F->getContext(), "loadstoreloop", F, NewBB);
+
+  OrigBB->getTerminator()->setSuccessor(0, LoopBB);
+  IRBuilder<> Builder(OrigBB->getTerminator());
+
+  // Cast pointer to the type of value getting stored
+  unsigned dstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
+  DstAddr = Builder.CreateBitCast(DstAddr,
+                                  PointerType::get(SetValue->getType(), dstAS));
+
+  IRBuilder<> LoopBuilder(LoopBB);
+  PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLen->getType(), 0);
+  LoopIndex->addIncoming(ConstantInt::get(CopyLen->getType(), 0), OrigBB);
+
+  LoopBuilder.CreateStore(
+      SetValue,
+      LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
+      IsVolatile);
+
+  Value *NewIndex =
+      LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLen->getType(), 1));
+  LoopIndex->addIncoming(NewIndex, LoopBB);
+
+  LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
+                           NewBB);
+}
+
+void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy) {
+  createMemCpyLoop(/* InsertBefore */ Memcpy,
+                   /* SrcAddr */ Memcpy->getRawSource(),
+                   /* DstAddr */ Memcpy->getRawDest(),
+                   /* CopyLen */ Memcpy->getLength(),
+                   /* SrcAlign */ Memcpy->getAlignment(),
+                   /* DestAlign */ Memcpy->getAlignment(),
+                   /* SrcIsVolatile */ Memcpy->isVolatile(),
+                   /* DstIsVolatile */ Memcpy->isVolatile());
+}
+
+void llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) {
+  createMemMoveLoop(/* InsertBefore */ Memmove,
+                    /* SrcAddr */ Memmove->getRawSource(),
+                    /* DstAddr */ Memmove->getRawDest(),
+                    /* CopyLen */ Memmove->getLength(),
+                    /* SrcAlign */ Memmove->getAlignment(),
+                    /* DestAlign */ Memmove->getAlignment(),
+                    /* SrcIsVolatile */ Memmove->isVolatile(),
+                    /* DstIsVolatile */ Memmove->isVolatile());
+}
+
+void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
+  createMemSetLoop(/* InsertBefore */ Memset,
+                   /* DstAddr */ Memset->getRawDest(),
+                   /* CopyLen */ Memset->getLength(),
+                   /* SetValue */ Memset->getValue(),
+                   /* Alignment */ Memset->getAlignment(),
+                   Memset->isVolatile());
+}
diff --git a/lib/Transforms/Utils/LowerSwitch.cpp b/lib/Transforms/Utils/LowerSwitch.cpp
index 75cd3bc8b2bfbd6da56c672290b2073e36c19fc5..b375d51005d574319b19996d4175634a7f92a189 100644
--- a/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/lib/Transforms/Utils/LowerSwitch.cpp
@@ -356,10 +356,10 @@ unsigned LowerSwitch::Clusterify(CaseVector& Cases, SwitchInst *SI) {
   unsigned numCmps = 0;
 
   // Start with "simple" cases
-  for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e; ++i)
-    Cases.push_back(CaseRange(i.getCaseValue(), i.getCaseValue(),
-                              i.getCaseSuccessor()));
-  
+  for (auto Case : SI->cases())
+    Cases.push_back(CaseRange(Case.getCaseValue(), Case.getCaseValue(),
+                              Case.getCaseSuccessor()));
+
   std::sort(Cases.begin(), Cases.end(), CaseCmp());
 
   // Merge case into clusters
diff --git a/lib/Transforms/Utils/Mem2Reg.cpp b/lib/Transforms/Utils/Mem2Reg.cpp
index 4984c3cc2a72e96c1dfd20a02b57499dac401465..b659a2e4463fffe30ceff476e8a180488d93cd5e 100644
--- a/lib/Transforms/Utils/Mem2Reg.cpp
+++ b/lib/Transforms/Utils/Mem2Reg.cpp
@@ -46,7 +46,7 @@ static bool promoteMemoryToRegister(Function &F, DominatorTree &DT,
     if (Allocas.empty())
       break;
 
-    PromoteMemToReg(Allocas, DT, nullptr, &AC);
+    PromoteMemToReg(Allocas, DT, &AC);
     NumPromoted += Allocas.size();
     Changed = true;
   }
diff --git a/lib/Transforms/Utils/MetaRenamer.cpp b/lib/Transforms/Utils/MetaRenamer.cpp
index c999bd008fefd3006a64e2fd6df2c7fffc49e94d..481c6aa29c3a1d12c38cb6f7a856d9108f6a1a10 100644
--- a/lib/Transforms/Utils/MetaRenamer.cpp
+++ b/lib/Transforms/Utils/MetaRenamer.cpp
@@ -16,6 +16,7 @@
 #include "llvm/Transforms/IPO.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/Module.h"
@@ -67,6 +68,7 @@ namespace {
     }
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
+      AU.addRequired<TargetLibraryInfoWrapperPass>();
       AU.setPreservesAll();
     }
 
@@ -110,9 +112,15 @@ namespace {
       }
 
       // Rename all functions
+      const TargetLibraryInfo &TLI =
+          getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
       for (auto &F : M) {
         StringRef Name = F.getName();
-        if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1))
+        LibFunc Tmp;
+        // Leave library functions alone because their presence or absence could
+        // affect the behavior of other passes.
+        if (Name.startswith("llvm.") || (!Name.empty() && Name[0] == 1) ||
+            TLI.getLibFunc(F, Tmp))
           continue;
 
         F.setName(renamer.newName());
@@ -139,8 +147,11 @@ namespace {
 }
 
 char MetaRenamer::ID = 0;
-INITIALIZE_PASS(MetaRenamer, "metarenamer", 
-                "Assign new names to everything", false, false)
+INITIALIZE_PASS_BEGIN(MetaRenamer, "metarenamer",
+                      "Assign new names to everything", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_END(MetaRenamer, "metarenamer",
+                    "Assign new names to everything", false, false)
 //===----------------------------------------------------------------------===//
 //
 // MetaRenamer - Rename everything with metasyntactic names.
diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp
index a53da85f79f2b185fb2503565d5a6d65f79468e3..dbe42c201dd4f77e1a803eb7cb5d2375f2025204 100644
--- a/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/lib/Transforms/Utils/ModuleUtils.cpp
@@ -138,6 +138,17 @@ Function *llvm::checkSanitizerInterfaceFunction(Constant *FuncOrBitcast) {
   report_fatal_error(Err);
 }
 
+Function *llvm::declareSanitizerInitFunction(Module &M, StringRef InitName,
+                                             ArrayRef<Type *> InitArgTypes) {
+  assert(!InitName.empty() && "Expected init function name");
+  Function *F = checkSanitizerInterfaceFunction(M.getOrInsertFunction(
+      InitName,
+      FunctionType::get(Type::getVoidTy(M.getContext()), InitArgTypes, false),
+      AttributeList()));
+  F->setLinkage(Function::ExternalLinkage);
+  return F;
+}
+
 std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions(
     Module &M, StringRef CtorName, StringRef InitName,
     ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs,
@@ -145,22 +156,19 @@ std::pair<Function *, Function *> llvm::createSanitizerCtorAndInitFunctions(
   assert(!InitName.empty() && "Expected init function name");
   assert(InitArgs.size() == InitArgTypes.size() &&
          "Sanitizer's init function expects different number of arguments");
+  Function *InitFunction =
+      declareSanitizerInitFunction(M, InitName, InitArgTypes);
   Function *Ctor = Function::Create(
       FunctionType::get(Type::getVoidTy(M.getContext()), false),
       GlobalValue::InternalLinkage, CtorName, &M);
   BasicBlock *CtorBB = BasicBlock::Create(M.getContext(), "", Ctor);
   IRBuilder<> IRB(ReturnInst::Create(M.getContext(), CtorBB));
-  Function *InitFunction =
-      checkSanitizerInterfaceFunction(M.getOrInsertFunction(
-          InitName, FunctionType::get(IRB.getVoidTy(), InitArgTypes, false),
-          AttributeSet()));
-  InitFunction->setLinkage(Function::ExternalLinkage);
   IRB.CreateCall(InitFunction, InitArgs);
   if (!VersionCheckName.empty()) {
     Function *VersionCheckFunction =
         checkSanitizerInterfaceFunction(M.getOrInsertFunction(
             VersionCheckName, FunctionType::get(IRB.getVoidTy(), {}, false),
-            AttributeSet()));
+            AttributeList()));
     IRB.CreateCall(VersionCheckFunction, {});
   }
   return std::make_pair(Ctor, InitFunction);
diff --git a/lib/Transforms/Utils/PredicateInfo.cpp b/lib/Transforms/Utils/PredicateInfo.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8877aeafecdec8ebbce57a989a4c0286dce6a7d0
--- /dev/null
+++ b/lib/Transforms/Utils/PredicateInfo.cpp
@@ -0,0 +1,782 @@
+//===-- PredicateInfo.cpp - PredicateInfo Builder--------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------===//
+//
+// This file implements the PredicateInfo class.
+//
+//===----------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/PredicateInfo.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/OrderedBasicBlock.h"
+#include "llvm/IR/AssemblyAnnotationWriter.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/FormattedStream.h"
+#include "llvm/Transforms/Scalar.h"
+#include <algorithm>
+#define DEBUG_TYPE "predicateinfo"
+using namespace llvm;
+using namespace PatternMatch;
+using namespace llvm::PredicateInfoClasses;
+
+INITIALIZE_PASS_BEGIN(PredicateInfoPrinterLegacyPass, "print-predicateinfo",
+                      "PredicateInfo Printer", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(PredicateInfoPrinterLegacyPass, "print-predicateinfo",
+                    "PredicateInfo Printer", false, false)
+static cl::opt<bool> VerifyPredicateInfo(
+    "verify-predicateinfo", cl::init(false), cl::Hidden,
+    cl::desc("Verify PredicateInfo in legacy printer pass."));
+namespace {
+DEBUG_COUNTER(RenameCounter, "predicateinfo-rename",
+              "Controls which variables are renamed with predicateinfo")
+// Given a predicate info that is a type of branching terminator, get the
+// branching block.
+const BasicBlock *getBranchBlock(const PredicateBase *PB) {
+  assert(isa<PredicateWithEdge>(PB) &&
+         "Only branches and switches should have PHIOnly defs that "
+         "require branch blocks.");
+  return cast<PredicateWithEdge>(PB)->From;
+}
+
+// Given a predicate info that is a type of branching terminator, get the
+// branching terminator.
+static Instruction *getBranchTerminator(const PredicateBase *PB) {
+  assert(isa<PredicateWithEdge>(PB) &&
+         "Not a predicate info type we know how to get a terminator from.");
+  return cast<PredicateWithEdge>(PB)->From->getTerminator();
+}
+
+// Given a predicate info that is a type of branching terminator, get the
+// edge this predicate info represents.
+const std::pair<BasicBlock *, BasicBlock *>
+getBlockEdge(const PredicateBase *PB) {
+  assert(isa<PredicateWithEdge>(PB) &&
+         "Not a predicate info type we know how to get an edge from.");
+  const auto *PEdge = cast<PredicateWithEdge>(PB);
+  return std::make_pair(PEdge->From, PEdge->To);
+}
+}
+
+namespace llvm {
+namespace PredicateInfoClasses {
+enum LocalNum {
+  // Operations that must appear first in the block.
+  LN_First,
+  // Operations that are somewhere in the middle of the block, and are sorted on
+  // demand.
+  LN_Middle,
+  // Operations that must appear last in a block, like successor phi node uses.
+  LN_Last
+};
+
+// Associate global and local DFS info with defs and uses, so we can sort them
+// into a global domination ordering.
+struct ValueDFS {
+  int DFSIn = 0;
+  int DFSOut = 0;
+  unsigned int LocalNum = LN_Middle;
+  // Only one of Def or Use will be set.
+  Value *Def = nullptr;
+  Use *U = nullptr;
+  // Neither PInfo nor EdgeOnly participate in the ordering
+  PredicateBase *PInfo = nullptr;
+  bool EdgeOnly = false;
+};
+
+// This compares ValueDFS structures, creating OrderedBasicBlocks where
+// necessary to compare uses/defs in the same block.  Doing so allows us to walk
+// the minimum number of instructions necessary to compute our def/use ordering.
+struct ValueDFS_Compare {
+  DenseMap<const BasicBlock *, std::unique_ptr<OrderedBasicBlock>> &OBBMap;
+  ValueDFS_Compare(
+      DenseMap<const BasicBlock *, std::unique_ptr<OrderedBasicBlock>> &OBBMap)
+      : OBBMap(OBBMap) {}
+  bool operator()(const ValueDFS &A, const ValueDFS &B) const {
+    if (&A == &B)
+      return false;
+    // The only case we can't directly compare them is when they are in the same
+    // block, and both have localnum == middle.  In that case, we have to use
+    // comesbefore to see what the real ordering is, because they are in the
+    // same basic block.
+
+    bool SameBlock = std::tie(A.DFSIn, A.DFSOut) == std::tie(B.DFSIn, B.DFSOut);
+
+    // We want to put the def that will get used for a given set of phi uses,
+    // before those phi uses.
+    // So we sort by edge, then by def.
+    // Note that only phi node uses and defs can come last.
+    if (SameBlock && A.LocalNum == LN_Last && B.LocalNum == LN_Last)
+      return comparePHIRelated(A, B);
+
+    if (!SameBlock || A.LocalNum != LN_Middle || B.LocalNum != LN_Middle)
+      return std::tie(A.DFSIn, A.DFSOut, A.LocalNum, A.Def, A.U) <
+             std::tie(B.DFSIn, B.DFSOut, B.LocalNum, B.Def, B.U);
+    return localComesBefore(A, B);
+  }
+
+  // For a phi use, or a non-materialized def, return the edge it represents.
+  const std::pair<BasicBlock *, BasicBlock *>
+  getBlockEdge(const ValueDFS &VD) const {
+    if (!VD.Def && VD.U) {
+      auto *PHI = cast<PHINode>(VD.U->getUser());
+      return std::make_pair(PHI->getIncomingBlock(*VD.U), PHI->getParent());
+    }
+    // This is really a non-materialized def.
+    return ::getBlockEdge(VD.PInfo);
+  }
+
+  // For two phi related values, return the ordering.
+  bool comparePHIRelated(const ValueDFS &A, const ValueDFS &B) const {
+    auto &ABlockEdge = getBlockEdge(A);
+    auto &BBlockEdge = getBlockEdge(B);
+    // Now sort by block edge and then defs before uses.
+    return std::tie(ABlockEdge, A.Def, A.U) < std::tie(BBlockEdge, B.Def, B.U);
+  }
+
+  // Get the definition of an instruction that occurs in the middle of a block.
+  Value *getMiddleDef(const ValueDFS &VD) const {
+    if (VD.Def)
+      return VD.Def;
+    // It's possible for the defs and uses to be null.  For branches, the local
+    // numbering will say the placed predicateinfos should go first (i.e.
+    // LN_First), so we won't be in this function. For assumes, we will end
+    // up here, because we need to order the def we will place relative to the
+    // assume.  So for the purpose of ordering, we pretend the def is the assume
+    // because that is where we will insert the info.
+    if (!VD.U) {
+      assert(VD.PInfo &&
+             "No def, no use, and no predicateinfo should not occur");
+      assert(isa<PredicateAssume>(VD.PInfo) &&
+             "Middle of block should only occur for assumes");
+      return cast<PredicateAssume>(VD.PInfo)->AssumeInst;
+    }
+    return nullptr;
+  }
+
+  // Return either the Def, if it's not null, or the user of the Use, if the def
+  // is null.
+  const Instruction *getDefOrUser(const Value *Def, const Use *U) const {
+    if (Def)
+      return cast<Instruction>(Def);
+    return cast<Instruction>(U->getUser());
+  }
+
+  // This performs the necessary local basic block ordering checks to tell
+  // whether A comes before B, where both are in the same basic block.
+  bool localComesBefore(const ValueDFS &A, const ValueDFS &B) const {
+    auto *ADef = getMiddleDef(A);
+    auto *BDef = getMiddleDef(B);
+
+    // See if we have real values or uses. If we have real values, we are
+    // guaranteed they are instructions or arguments. No matter what, we are
+    // guaranteed they are in the same block if they are instructions.
+    auto *ArgA = dyn_cast_or_null<Argument>(ADef);
+    auto *ArgB = dyn_cast_or_null<Argument>(BDef);
+
+    if (ArgA && !ArgB)
+      return true;
+    if (ArgB && !ArgA)
+      return false;
+    if (ArgA && ArgB)
+      return ArgA->getArgNo() < ArgB->getArgNo();
+
+    auto *AInst = getDefOrUser(ADef, A.U);
+    auto *BInst = getDefOrUser(BDef, B.U);
+
+    auto *BB = AInst->getParent();
+    auto LookupResult = OBBMap.find(BB);
+    if (LookupResult != OBBMap.end())
+      return LookupResult->second->dominates(AInst, BInst);
+
+    auto Result = OBBMap.insert({BB, make_unique<OrderedBasicBlock>(BB)});
+    return Result.first->second->dominates(AInst, BInst);
+  }
+};
+
+} // namespace PredicateInfoClasses
+
+bool PredicateInfo::stackIsInScope(const ValueDFSStack &Stack,
+                                   const ValueDFS &VDUse) const {
+  if (Stack.empty())
+    return false;
+  // If it's a phi only use, make sure it's for this phi node edge, and that the
+  // use is in a phi node.  If it's anything else, and the top of the stack is
+  // EdgeOnly, we need to pop the stack.  We deliberately sort phi uses next to
+  // the defs they must go with so that we can know it's time to pop the stack
+  // when we hit the end of the phi uses for a given def.
+  if (Stack.back().EdgeOnly) {
+    if (!VDUse.U)
+      return false;
+    auto *PHI = dyn_cast<PHINode>(VDUse.U->getUser());
+    if (!PHI)
+      return false;
+    // Check edge
+    BasicBlock *EdgePred = PHI->getIncomingBlock(*VDUse.U);
+    if (EdgePred != getBranchBlock(Stack.back().PInfo))
+      return false;
+
+    // Use dominates, which knows how to handle edge dominance.
+    return DT.dominates(getBlockEdge(Stack.back().PInfo), *VDUse.U);
+  }
+
+  return (VDUse.DFSIn >= Stack.back().DFSIn &&
+          VDUse.DFSOut <= Stack.back().DFSOut);
+}
+
+void PredicateInfo::popStackUntilDFSScope(ValueDFSStack &Stack,
+                                          const ValueDFS &VD) {
+  while (!Stack.empty() && !stackIsInScope(Stack, VD))
+    Stack.pop_back();
+}
+
+// Convert the uses of Op into a vector of uses, associating global and local
+// DFS info with each one.
+void PredicateInfo::convertUsesToDFSOrdered(
+    Value *Op, SmallVectorImpl<ValueDFS> &DFSOrderedSet) {
+  for (auto &U : Op->uses()) {
+    if (auto *I = dyn_cast<Instruction>(U.getUser())) {
+      ValueDFS VD;
+      // Put the phi node uses in the incoming block.
+      BasicBlock *IBlock;
+      if (auto *PN = dyn_cast<PHINode>(I)) {
+        IBlock = PN->getIncomingBlock(U);
+        // Make phi node users appear last in the incoming block
+        // they are from.
+        VD.LocalNum = LN_Last;
+      } else {
+        // If it's not a phi node use, it is somewhere in the middle of the
+        // block.
+        IBlock = I->getParent();
+        VD.LocalNum = LN_Middle;
+      }
+      DomTreeNode *DomNode = DT.getNode(IBlock);
+      // It's possible our use is in an unreachable block. Skip it if so.
+      if (!DomNode)
+        continue;
+      VD.DFSIn = DomNode->getDFSNumIn();
+      VD.DFSOut = DomNode->getDFSNumOut();
+      VD.U = &U;
+      DFSOrderedSet.push_back(VD);
+    }
+  }
+}
+
+// Collect relevant operations from Comparison that we may want to insert copies
+// for.
+void collectCmpOps(CmpInst *Comparison, SmallVectorImpl<Value *> &CmpOperands) {
+  auto *Op0 = Comparison->getOperand(0);
+  auto *Op1 = Comparison->getOperand(1);
+  if (Op0 == Op1)
+    return;
+  CmpOperands.push_back(Comparison);
+  // Only want real values, not constants.  Additionally, operands with one use
+  // are only being used in the comparison, which means they will not be useful
+  // for us to consider for predicateinfo.
+  //
+  if ((isa<Instruction>(Op0) || isa<Argument>(Op0)) && !Op0->hasOneUse())
+    CmpOperands.push_back(Op0);
+  if ((isa<Instruction>(Op1) || isa<Argument>(Op1)) && !Op1->hasOneUse())
+    CmpOperands.push_back(Op1);
+}
+
+// Add Op, PB to the list of value infos for Op, and mark Op to be renamed.
+void PredicateInfo::addInfoFor(SmallPtrSetImpl<Value *> &OpsToRename, Value *Op,
+                               PredicateBase *PB) {
+  OpsToRename.insert(Op);
+  auto &OperandInfo = getOrCreateValueInfo(Op);
+  AllInfos.push_back(PB);
+  OperandInfo.Infos.push_back(PB);
+}
+
+// Process an assume instruction and place relevant operations we want to rename
+// into OpsToRename.
+void PredicateInfo::processAssume(IntrinsicInst *II, BasicBlock *AssumeBB,
+                                  SmallPtrSetImpl<Value *> &OpsToRename) {
+  // See if we have a comparison we support
+  SmallVector<Value *, 8> CmpOperands;
+  SmallVector<Value *, 2> ConditionsToProcess;
+  CmpInst::Predicate Pred;
+  Value *Operand = II->getOperand(0);
+  if (m_c_And(m_Cmp(Pred, m_Value(), m_Value()),
+              m_Cmp(Pred, m_Value(), m_Value()))
+          .match(II->getOperand(0))) {
+    ConditionsToProcess.push_back(cast<BinaryOperator>(Operand)->getOperand(0));
+    ConditionsToProcess.push_back(cast<BinaryOperator>(Operand)->getOperand(1));
+    ConditionsToProcess.push_back(Operand);
+  } else if (isa<CmpInst>(Operand)) {
+
+    ConditionsToProcess.push_back(Operand);
+  }
+  for (auto Cond : ConditionsToProcess) {
+    if (auto *Cmp = dyn_cast<CmpInst>(Cond)) {
+      collectCmpOps(Cmp, CmpOperands);
+      // Now add our copy infos for our operands
+      for (auto *Op : CmpOperands) {
+        auto *PA = new PredicateAssume(Op, II, Cmp);
+        addInfoFor(OpsToRename, Op, PA);
+      }
+      CmpOperands.clear();
+    } else if (auto *BinOp = dyn_cast<BinaryOperator>(Cond)) {
+      // Otherwise, it should be an AND.
+      assert(BinOp->getOpcode() == Instruction::And &&
+             "Should have been an AND");
+      auto *PA = new PredicateAssume(BinOp, II, BinOp);
+      addInfoFor(OpsToRename, BinOp, PA);
+    } else {
+      llvm_unreachable("Unknown type of condition");
+    }
+  }
+}
+
+// Process a block terminating branch, and place relevant operations to be
+// renamed into OpsToRename.
+void PredicateInfo::processBranch(BranchInst *BI, BasicBlock *BranchBB,
+                                  SmallPtrSetImpl<Value *> &OpsToRename) {
+  BasicBlock *FirstBB = BI->getSuccessor(0);
+  BasicBlock *SecondBB = BI->getSuccessor(1);
+  SmallVector<BasicBlock *, 2> SuccsToProcess;
+  SuccsToProcess.push_back(FirstBB);
+  SuccsToProcess.push_back(SecondBB);
+  SmallVector<Value *, 2> ConditionsToProcess;
+
+  auto InsertHelper = [&](Value *Op, bool isAnd, bool isOr, Value *Cond) {
+    for (auto *Succ : SuccsToProcess) {
+      // Don't try to insert on a self-edge. This is mainly because we will
+      // eliminate during renaming anyway.
+      if (Succ == BranchBB)
+        continue;
+      bool TakenEdge = (Succ == FirstBB);
+      // For and, only insert on the true edge
+      // For or, only insert on the false edge
+      if ((isAnd && !TakenEdge) || (isOr && TakenEdge))
+        continue;
+      PredicateBase *PB =
+          new PredicateBranch(Op, BranchBB, Succ, Cond, TakenEdge);
+      addInfoFor(OpsToRename, Op, PB);
+      if (!Succ->getSinglePredecessor())
+        EdgeUsesOnly.insert({BranchBB, Succ});
+    }
+  };
+
+  // Match combinations of conditions.
+  CmpInst::Predicate Pred;
+  bool isAnd = false;
+  bool isOr = false;
+  SmallVector<Value *, 8> CmpOperands;
+  if (match(BI->getCondition(), m_And(m_Cmp(Pred, m_Value(), m_Value()),
+                                      m_Cmp(Pred, m_Value(), m_Value()))) ||
+      match(BI->getCondition(), m_Or(m_Cmp(Pred, m_Value(), m_Value()),
+                                     m_Cmp(Pred, m_Value(), m_Value())))) {
+    auto *BinOp = cast<BinaryOperator>(BI->getCondition());
+    if (BinOp->getOpcode() == Instruction::And)
+      isAnd = true;
+    else if (BinOp->getOpcode() == Instruction::Or)
+      isOr = true;
+    ConditionsToProcess.push_back(BinOp->getOperand(0));
+    ConditionsToProcess.push_back(BinOp->getOperand(1));
+    ConditionsToProcess.push_back(BI->getCondition());
+  } else if (isa<CmpInst>(BI->getCondition())) {
+    ConditionsToProcess.push_back(BI->getCondition());
+  }
+  for (auto Cond : ConditionsToProcess) {
+    if (auto *Cmp = dyn_cast<CmpInst>(Cond)) {
+      collectCmpOps(Cmp, CmpOperands);
+      // Now add our copy infos for our operands
+      for (auto *Op : CmpOperands)
+        InsertHelper(Op, isAnd, isOr, Cmp);
+    } else if (auto *BinOp = dyn_cast<BinaryOperator>(Cond)) {
+      // This must be an AND or an OR.
+      assert((BinOp->getOpcode() == Instruction::And ||
+              BinOp->getOpcode() == Instruction::Or) &&
+             "Should have been an AND or an OR");
+      // The actual value of the binop is not subject to the same restrictions
+      // as the comparison. It's either true or false on the true/false branch.
+      InsertHelper(BinOp, false, false, BinOp);
+    } else {
+      llvm_unreachable("Unknown type of condition");
+    }
+    CmpOperands.clear();
+  }
+}
+// Process a block terminating switch, and place relevant operations to be
+// renamed into OpsToRename.
+void PredicateInfo::processSwitch(SwitchInst *SI, BasicBlock *BranchBB,
+                                  SmallPtrSetImpl<Value *> &OpsToRename) {
+  Value *Op = SI->getCondition();
+  if ((!isa<Instruction>(Op) && !isa<Argument>(Op)) || Op->hasOneUse())
+    return;
+
+  // Remember how many outgoing edges there are to every successor.
+  SmallDenseMap<BasicBlock *, unsigned, 16> SwitchEdges;
+  for (unsigned i = 0, e = SI->getNumSuccessors(); i != e; ++i) {
+    BasicBlock *TargetBlock = SI->getSuccessor(i);
+    ++SwitchEdges[TargetBlock];
+  }
+
+  // Now propagate info for each case value
+  for (auto C : SI->cases()) {
+    BasicBlock *TargetBlock = C.getCaseSuccessor();
+    if (SwitchEdges.lookup(TargetBlock) == 1) {
+      PredicateSwitch *PS = new PredicateSwitch(
+          Op, SI->getParent(), TargetBlock, C.getCaseValue(), SI);
+      addInfoFor(OpsToRename, Op, PS);
+      if (!TargetBlock->getSinglePredecessor())
+        EdgeUsesOnly.insert({BranchBB, TargetBlock});
+    }
+  }
+}
+
+// Build predicate info for our function
+void PredicateInfo::buildPredicateInfo() {
+  DT.updateDFSNumbers();
+  // Collect operands to rename from all conditional branch terminators, as well
+  // as assume statements.
+  SmallPtrSet<Value *, 8> OpsToRename;
+  for (auto DTN : depth_first(DT.getRootNode())) {
+    BasicBlock *BranchBB = DTN->getBlock();
+    if (auto *BI = dyn_cast<BranchInst>(BranchBB->getTerminator())) {
+      if (!BI->isConditional())
+        continue;
+      processBranch(BI, BranchBB, OpsToRename);
+    } else if (auto *SI = dyn_cast<SwitchInst>(BranchBB->getTerminator())) {
+      processSwitch(SI, BranchBB, OpsToRename);
+    }
+  }
+  for (auto &Assume : AC.assumptions()) {
+    if (auto *II = dyn_cast_or_null<IntrinsicInst>(Assume))
+      processAssume(II, II->getParent(), OpsToRename);
+  }
+  // Now rename all our operations.
+  renameUses(OpsToRename);
+}
+
+// Given the renaming stack, make all the operands currently on the stack real
+// by inserting them into the IR.  Return the last operation's value.
+Value *PredicateInfo::materializeStack(unsigned int &Counter,
+                                       ValueDFSStack &RenameStack,
+                                       Value *OrigOp) {
+  // Find the first thing we have to materialize
+  auto RevIter = RenameStack.rbegin();
+  for (; RevIter != RenameStack.rend(); ++RevIter)
+    if (RevIter->Def)
+      break;
+
+  size_t Start = RevIter - RenameStack.rbegin();
+  // The maximum number of things we should be trying to materialize at once
+  // right now is 4, depending on if we had an assume, a branch, and both used
+  // and of conditions.
+  for (auto RenameIter = RenameStack.end() - Start;
+       RenameIter != RenameStack.end(); ++RenameIter) {
+    auto *Op =
+        RenameIter == RenameStack.begin() ? OrigOp : (RenameIter - 1)->Def;
+    ValueDFS &Result = *RenameIter;
+    auto *ValInfo = Result.PInfo;
+    // For edge predicates, we can just place the operand in the block before
+    // the terminator.  For assume, we have to place it right before the assume
+    // to ensure we dominate all of our uses.  Always insert right before the
+    // relevant instruction (terminator, assume), so that we insert in proper
+    // order in the case of multiple predicateinfo in the same block.
+    if (isa<PredicateWithEdge>(ValInfo)) {
+      IRBuilder<> B(getBranchTerminator(ValInfo));
+      Function *IF = Intrinsic::getDeclaration(
+          F.getParent(), Intrinsic::ssa_copy, Op->getType());
+      CallInst *PIC =
+          B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++));
+      PredicateMap.insert({PIC, ValInfo});
+      Result.Def = PIC;
+    } else {
+      auto *PAssume = dyn_cast<PredicateAssume>(ValInfo);
+      assert(PAssume &&
+             "Should not have gotten here without it being an assume");
+      IRBuilder<> B(PAssume->AssumeInst);
+      Function *IF = Intrinsic::getDeclaration(
+          F.getParent(), Intrinsic::ssa_copy, Op->getType());
+      CallInst *PIC = B.CreateCall(IF, Op);
+      PredicateMap.insert({PIC, ValInfo});
+      Result.Def = PIC;
+    }
+  }
+  return RenameStack.back().Def;
+}
+
+// Instead of the standard SSA renaming algorithm, which is O(Number of
+// instructions), and walks the entire dominator tree, we walk only the defs +
+// uses.  The standard SSA renaming algorithm does not really rely on the
+// dominator tree except to order the stack push/pops of the renaming stacks, so
+// that defs end up getting pushed before hitting the correct uses.  This does
+// not require the dominator tree, only the *order* of the dominator tree. The
+// complete and correct ordering of the defs and uses in the dominator tree is
+// contained in the DFS numbering of the dominator tree. So we sort the defs and
+// uses into the DFS ordering, and then just use the renaming stack as per
+// normal, pushing when we hit a def (which is a predicateinfo instruction),
+// popping when we are out of the dfs scope for that def, and replacing any uses
+// with top of stack if it exists.  In order to handle liveness without
+// propagating liveness info, we don't actually insert the predicateinfo
+// instruction def until we see a use that it would dominate.  Once we see such
+// a use, we materialize the predicateinfo instruction in the right place and
+// use it.
+//
+// TODO: Use this algorithm to perform fast single-variable renaming in
+// promotememtoreg and memoryssa.
+void PredicateInfo::renameUses(SmallPtrSetImpl<Value *> &OpsToRename) {
+  ValueDFS_Compare Compare(OBBMap);
+  // Compute liveness, and rename in O(uses) per Op.
+  for (auto *Op : OpsToRename) {
+    unsigned Counter = 0;
+    SmallVector<ValueDFS, 16> OrderedUses;
+    const auto &ValueInfo = getValueInfo(Op);
+    // Insert the possible copies into the def/use list.
+    // They will become real copies if we find a real use for them, and never
+    // created otherwise.
+    for (auto &PossibleCopy : ValueInfo.Infos) {
+      ValueDFS VD;
+      // Determine where we are going to place the copy by the copy type.
+      // The predicate info for branches always come first, they will get
+      // materialized in the split block at the top of the block.
+      // The predicate info for assumes will be somewhere in the middle,
+      // it will get materialized in front of the assume.
+      if (const auto *PAssume = dyn_cast<PredicateAssume>(PossibleCopy)) {
+        VD.LocalNum = LN_Middle;
+        DomTreeNode *DomNode = DT.getNode(PAssume->AssumeInst->getParent());
+        if (!DomNode)
+          continue;
+        VD.DFSIn = DomNode->getDFSNumIn();
+        VD.DFSOut = DomNode->getDFSNumOut();
+        VD.PInfo = PossibleCopy;
+        OrderedUses.push_back(VD);
+      } else if (isa<PredicateWithEdge>(PossibleCopy)) {
+        // If we can only do phi uses, we treat it like it's in the branch
+        // block, and handle it specially. We know that it goes last, and only
+        // dominates phi uses.
+        auto BlockEdge = getBlockEdge(PossibleCopy);
+        if (EdgeUsesOnly.count(BlockEdge)) {
+          VD.LocalNum = LN_Last;
+          auto *DomNode = DT.getNode(BlockEdge.first);
+          if (DomNode) {
+            VD.DFSIn = DomNode->getDFSNumIn();
+            VD.DFSOut = DomNode->getDFSNumOut();
+            VD.PInfo = PossibleCopy;
+            VD.EdgeOnly = true;
+            OrderedUses.push_back(VD);
+          }
+        } else {
+          // Otherwise, we are in the split block (even though we perform
+          // insertion in the branch block).
+          // Insert a possible copy at the split block and before the branch.
+          VD.LocalNum = LN_First;
+          auto *DomNode = DT.getNode(BlockEdge.second);
+          if (DomNode) {
+            VD.DFSIn = DomNode->getDFSNumIn();
+            VD.DFSOut = DomNode->getDFSNumOut();
+            VD.PInfo = PossibleCopy;
+            OrderedUses.push_back(VD);
+          }
+        }
+      }
+    }
+
+    convertUsesToDFSOrdered(Op, OrderedUses);
+    std::sort(OrderedUses.begin(), OrderedUses.end(), Compare);
+    SmallVector<ValueDFS, 8> RenameStack;
+    // For each use, sorted into dfs order, push values and replace uses with
+    // top of stack, which will represent the reaching def.
+    for (auto &VD : OrderedUses) {
+      // We currently do not materialize copy over copy, but we should decide if
+      // we want to.
+      bool PossibleCopy = VD.PInfo != nullptr;
+      if (RenameStack.empty()) {
+        DEBUG(dbgs() << "Rename Stack is empty\n");
+      } else {
+        DEBUG(dbgs() << "Rename Stack Top DFS numbers are ("
+                     << RenameStack.back().DFSIn << ","
+                     << RenameStack.back().DFSOut << ")\n");
+      }
+
+      DEBUG(dbgs() << "Current DFS numbers are (" << VD.DFSIn << ","
+                   << VD.DFSOut << ")\n");
+
+      bool ShouldPush = (VD.Def || PossibleCopy);
+      bool OutOfScope = !stackIsInScope(RenameStack, VD);
+      if (OutOfScope || ShouldPush) {
+        // Sync to our current scope.
+        popStackUntilDFSScope(RenameStack, VD);
+        if (ShouldPush) {
+          RenameStack.push_back(VD);
+        }
+      }
+      // If we get to this point, and the stack is empty we must have a use
+      // with no renaming needed, just skip it.
+      if (RenameStack.empty())
+        continue;
+      // Skip values, only want to rename the uses
+      if (VD.Def || PossibleCopy)
+        continue;
+      if (!DebugCounter::shouldExecute(RenameCounter)) {
+        DEBUG(dbgs() << "Skipping execution due to debug counter\n");
+        continue;
+      }
+      ValueDFS &Result = RenameStack.back();
+
+      // If the possible copy dominates something, materialize our stack up to
+      // this point. This ensures every comparison that affects our operation
+      // ends up with predicateinfo.
+      if (!Result.Def)
+        Result.Def = materializeStack(Counter, RenameStack, Op);
+
+      DEBUG(dbgs() << "Found replacement " << *Result.Def << " for "
+                   << *VD.U->get() << " in " << *(VD.U->getUser()) << "\n");
+      assert(DT.dominates(cast<Instruction>(Result.Def), *VD.U) &&
+             "Predicateinfo def should have dominated this use");
+      VD.U->set(Result.Def);
+    }
+  }
+}
+
+PredicateInfo::ValueInfo &PredicateInfo::getOrCreateValueInfo(Value *Operand) {
+  auto OIN = ValueInfoNums.find(Operand);
+  if (OIN == ValueInfoNums.end()) {
+    // This will grow it
+    ValueInfos.resize(ValueInfos.size() + 1);
+    // This will use the new size and give us a 0 based number of the info
+    auto InsertResult = ValueInfoNums.insert({Operand, ValueInfos.size() - 1});
+    assert(InsertResult.second && "Value info number already existed?");
+    return ValueInfos[InsertResult.first->second];
+  }
+  return ValueInfos[OIN->second];
+}
+
+const PredicateInfo::ValueInfo &
+PredicateInfo::getValueInfo(Value *Operand) const {
+  auto OINI = ValueInfoNums.lookup(Operand);
+  assert(OINI != 0 && "Operand was not really in the Value Info Numbers");
+  assert(OINI < ValueInfos.size() &&
+         "Value Info Number greater than size of Value Info Table");
+  return ValueInfos[OINI];
+}
+
+PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT,
+                             AssumptionCache &AC)
+    : F(F), DT(DT), AC(AC) {
+  // Push an empty operand info so that we can detect 0 as not finding one
+  ValueInfos.resize(1);
+  buildPredicateInfo();
+}
+
+PredicateInfo::~PredicateInfo() {}
+
+void PredicateInfo::verifyPredicateInfo() const {}
+
+char PredicateInfoPrinterLegacyPass::ID = 0;
+
+PredicateInfoPrinterLegacyPass::PredicateInfoPrinterLegacyPass()
+    : FunctionPass(ID) {
+  initializePredicateInfoPrinterLegacyPassPass(
+      *PassRegistry::getPassRegistry());
+}
+
+void PredicateInfoPrinterLegacyPass::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.setPreservesAll();
+  AU.addRequiredTransitive<DominatorTreeWrapperPass>();
+  AU.addRequired<AssumptionCacheTracker>();
+}
+
+bool PredicateInfoPrinterLegacyPass::runOnFunction(Function &F) {
+  auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+  auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+  auto PredInfo = make_unique<PredicateInfo>(F, DT, AC);
+  PredInfo->print(dbgs());
+  if (VerifyPredicateInfo)
+    PredInfo->verifyPredicateInfo();
+  return false;
+}
+
+PreservedAnalyses PredicateInfoPrinterPass::run(Function &F,
+                                                FunctionAnalysisManager &AM) {
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  auto &AC = AM.getResult<AssumptionAnalysis>(F);
+  OS << "PredicateInfo for function: " << F.getName() << "\n";
+  make_unique<PredicateInfo>(F, DT, AC)->print(OS);
+
+  return PreservedAnalyses::all();
+}
+
+/// \brief An assembly annotator class to print PredicateInfo information in
+/// comments.
+class PredicateInfoAnnotatedWriter : public AssemblyAnnotationWriter {
+  friend class PredicateInfo;
+  const PredicateInfo *PredInfo;
+
+public:
+  PredicateInfoAnnotatedWriter(const PredicateInfo *M) : PredInfo(M) {}
+
+  virtual void emitBasicBlockStartAnnot(const BasicBlock *BB,
+                                        formatted_raw_ostream &OS) {}
+
+  virtual void emitInstructionAnnot(const Instruction *I,
+                                    formatted_raw_ostream &OS) {
+    if (const auto *PI = PredInfo->getPredicateInfoFor(I)) {
+      OS << "; Has predicate info\n";
+      if (const auto *PB = dyn_cast<PredicateBranch>(PI)) {
+        OS << "; branch predicate info { TrueEdge: " << PB->TrueEdge
+           << " Comparison:" << *PB->Condition << " Edge: [";
+        PB->From->printAsOperand(OS);
+        OS << ",";
+        PB->To->printAsOperand(OS);
+        OS << "] }\n";
+      } else if (const auto *PS = dyn_cast<PredicateSwitch>(PI)) {
+        OS << "; switch predicate info { CaseValue: " << *PS->CaseValue
+           << " Switch:" << *PS->Switch << " Edge: [";
+        PS->From->printAsOperand(OS);
+        OS << ",";
+        PS->To->printAsOperand(OS);
+        OS << "] }\n";
+      } else if (const auto *PA = dyn_cast<PredicateAssume>(PI)) {
+        OS << "; assume predicate info {"
+           << " Comparison:" << *PA->Condition << " }\n";
+      }
+    }
+  }
+};
+
+void PredicateInfo::print(raw_ostream &OS) const {
+  PredicateInfoAnnotatedWriter Writer(this);
+  F.print(OS, &Writer);
+}
+
+void PredicateInfo::dump() const {
+  PredicateInfoAnnotatedWriter Writer(this);
+  F.print(dbgs(), &Writer);
+}
+
+PreservedAnalyses PredicateInfoVerifierPass::run(Function &F,
+                                                 FunctionAnalysisManager &AM) {
+  auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+  auto &AC = AM.getResult<AssumptionAnalysis>(F);
+  make_unique<PredicateInfo>(F, DT, AC)->verifyPredicateInfo();
+
+  return PreservedAnalyses::all();
+}
+}
diff --git a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index 35faa6f65efda22deb1cbefd03f533737951c76f..a33b85c4ee69ae9d08324dade18fa77b4d613bc8 100644
--- a/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -15,7 +15,6 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/STLExtras.h"
@@ -23,6 +22,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/InstructionSimplify.h"
 #include "llvm/Analysis/IteratedDominanceFrontier.h"
 #include "llvm/Analysis/ValueTracking.h"
@@ -38,6 +38,7 @@
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
 #include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
 #include <algorithm>
 using namespace llvm;
 
@@ -225,9 +226,6 @@ struct PromoteMem2Reg {
   DominatorTree &DT;
   DIBuilder DIB;
 
-  /// An AliasSetTracker object to update.  If null, don't update it.
-  AliasSetTracker *AST;
-
   /// A cache of @llvm.assume intrinsics used by SimplifyInstruction.
   AssumptionCache *AC;
 
@@ -269,10 +267,10 @@ struct PromoteMem2Reg {
 
 public:
   PromoteMem2Reg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
-                 AliasSetTracker *AST, AssumptionCache *AC)
+                 AssumptionCache *AC)
       : Allocas(Allocas.begin(), Allocas.end()), DT(DT),
         DIB(*DT.getRoot()->getParent()->getParent(), /*AllowUnresolved*/ false),
-        AST(AST), AC(AC) {}
+        AC(AC) {}
 
   void run();
 
@@ -301,6 +299,18 @@ private:
 
 } // end of anonymous namespace
 
+/// Emit assume(LI != null) right after load \p LI and register it in \p AC.
+static void addAssumeNonNull(AssumptionCache *AC, LoadInst *LI) {
+  Function *AssumeIntrinsic =
+      Intrinsic::getDeclaration(LI->getModule(), Intrinsic::assume);
+  ICmpInst *LoadNotNull = new ICmpInst(ICmpInst::ICMP_NE, LI,
+                                       Constant::getNullValue(LI->getType()));
+  LoadNotNull->insertAfter(LI);
+  CallInst *CI = CallInst::Create(AssumeIntrinsic, {LoadNotNull});
+  CI->insertAfter(LoadNotNull);
+  AC->registerAssumption(CI);
+}
+
 static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
   // Knowing that this alloca is promotable, we know that it's safe to kill all
   // instructions except for load and store.
@@ -334,9 +344,8 @@ static void removeLifetimeIntrinsicUsers(AllocaInst *AI) {
 /// and thus must be phi-ed with undef. We fall back to the standard alloca
 /// promotion algorithm in that case.
 static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
-                                     LargeBlockInfo &LBI,
-                                     DominatorTree &DT,
-                                     AliasSetTracker *AST) {
+                                     LargeBlockInfo &LBI, DominatorTree &DT,
+                                     AssumptionCache *AC) {
   StoreInst *OnlyStore = Info.OnlyStore;
   bool StoringGlobalVal = !isa<Instruction>(OnlyStore->getOperand(0));
   BasicBlock *StoreBB = OnlyStore->getParent();
@@ -387,9 +396,15 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
     // code.
     if (ReplVal == LI)
       ReplVal = UndefValue::get(LI->getType());
+
+    // If the load was marked as nonnull we don't want to lose
+    // that information when we erase this Load. So we preserve
+    // it with an assume.
+    if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+        !llvm::isKnownNonNullAt(ReplVal, LI, &DT))
+      addAssumeNonNull(AC, LI);
+
     LI->replaceAllUsesWith(ReplVal);
-    if (AST && LI->getType()->isPointerTy())
-      AST->deleteValue(LI);
     LI->eraseFromParent();
     LBI.deleteValue(LI);
   }
@@ -410,8 +425,6 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
   Info.OnlyStore->eraseFromParent();
   LBI.deleteValue(Info.OnlyStore);
 
-  if (AST)
-    AST->deleteValue(AI);
   AI->eraseFromParent();
   LBI.deleteValue(AI);
   return true;
@@ -435,7 +448,8 @@ static bool rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info,
 ///  }
 static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
                                      LargeBlockInfo &LBI,
-                                     AliasSetTracker *AST) {
+                                     DominatorTree &DT,
+                                     AssumptionCache *AC) {
   // The trickiest case to handle is when we have large blocks. Because of this,
   // this code is optimized assuming that large blocks happen.  This does not
   // significantly pessimize the small block case.  This uses LargeBlockInfo to
@@ -476,13 +490,18 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
         // There is no store before this load, bail out (load may be affected
         // by the following stores - see main comment).
         return false;
-    }
-    else
+    } else {
       // Otherwise, there was a store before this load, the load takes its value.
-      LI->replaceAllUsesWith(std::prev(I)->second->getOperand(0));
+      // Note, if the load was marked as nonnull we don't want to lose that
+      // information when we erase it. So we preserve it with an assume.
+      Value *ReplVal = std::prev(I)->second->getOperand(0);
+      if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+          !llvm::isKnownNonNullAt(ReplVal, LI, &DT))
+        addAssumeNonNull(AC, LI);
+
+      LI->replaceAllUsesWith(ReplVal);
+    }
 
-    if (AST && LI->getType()->isPointerTy())
-      AST->deleteValue(LI);
     LI->eraseFromParent();
     LBI.deleteValue(LI);
   }
@@ -499,8 +518,6 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
     LBI.deleteValue(SI);
   }
 
-  if (AST)
-    AST->deleteValue(AI);
   AI->eraseFromParent();
   LBI.deleteValue(AI);
 
@@ -517,8 +534,6 @@ static bool promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info,
 void PromoteMem2Reg::run() {
   Function &F = *DT.getRoot()->getParent();
 
-  if (AST)
-    PointerAllocaValues.resize(Allocas.size());
   AllocaDbgDeclares.resize(Allocas.size());
 
   AllocaInfo Info;
@@ -536,8 +551,6 @@ void PromoteMem2Reg::run() {
 
     if (AI->use_empty()) {
       // If there are no uses of the alloca, just delete it now.
-      if (AST)
-        AST->deleteValue(AI);
       AI->eraseFromParent();
 
       // Remove the alloca from the Allocas list, since it has been processed
@@ -553,7 +566,7 @@ void PromoteMem2Reg::run() {
     // If there is only a single store to this value, replace any loads of
     // it that are directly dominated by the definition with the value stored.
     if (Info.DefiningBlocks.size() == 1) {
-      if (rewriteSingleStoreAlloca(AI, Info, LBI, DT, AST)) {
+      if (rewriteSingleStoreAlloca(AI, Info, LBI, DT, AC)) {
         // The alloca has been processed, move on.
         RemoveFromAllocasList(AllocaNum);
         ++NumSingleStore;
@@ -564,7 +577,7 @@ void PromoteMem2Reg::run() {
     // If the alloca is only read and written in one basic block, just perform a
     // linear sweep over the block to eliminate it.
     if (Info.OnlyUsedInOneBlock &&
-        promoteSingleBlockAlloca(AI, Info, LBI, AST)) {
+        promoteSingleBlockAlloca(AI, Info, LBI, DT, AC)) {
       // The alloca has been processed, move on.
       RemoveFromAllocasList(AllocaNum);
       continue;
@@ -578,11 +591,6 @@ void PromoteMem2Reg::run() {
         BBNumbers[&BB] = ID++;
     }
 
-    // If we have an AST to keep updated, remember some pointer value that is
-    // stored into the alloca.
-    if (AST)
-      PointerAllocaValues[AllocaNum] = Info.AllocaPointerVal;
-
     // Remember the dbg.declare intrinsic describing this alloca, if any.
     if (Info.DbgDeclare)
       AllocaDbgDeclares[AllocaNum] = Info.DbgDeclare;
@@ -662,8 +670,6 @@ void PromoteMem2Reg::run() {
     // tree. Just delete the users now.
     if (!A->use_empty())
       A->replaceAllUsesWith(UndefValue::get(A->getType()));
-    if (AST)
-      AST->deleteValue(A);
     A->eraseFromParent();
   }
 
@@ -694,8 +700,6 @@ void PromoteMem2Reg::run() {
 
       // If this PHI node merges one value and/or undefs, get the value.
       if (Value *V = SimplifyInstruction(PN, DL, nullptr, &DT, AC)) {
-        if (AST && PN->getType()->isPointerTy())
-          AST->deleteValue(PN);
         PN->replaceAllUsesWith(V);
         PN->eraseFromParent();
         NewPhiNodes.erase(I++);
@@ -863,10 +867,6 @@ bool PromoteMem2Reg::QueuePhiNode(BasicBlock *BB, unsigned AllocaNo,
                        &BB->front());
   ++NumPHIInsert;
   PhiToAllocaMap[PN] = AllocaNo;
-
-  if (AST && PN->getType()->isPointerTy())
-    AST->copyValue(PointerAllocaValues[AllocaNo], PN);
-
   return true;
 }
 
@@ -940,10 +940,15 @@ NextIteration:
 
       Value *V = IncomingVals[AI->second];
 
+      // If the load was marked as nonnull we don't want to lose
+      // that information when we erase this Load. So we preserve
+      // it with an assume.
+      if (AC && LI->getMetadata(LLVMContext::MD_nonnull) &&
+          !llvm::isKnownNonNullAt(V, LI, &DT))
+        addAssumeNonNull(AC, LI);
+
       // Anything using the load now uses the current value.
       LI->replaceAllUsesWith(V);
-      if (AST && LI->getType()->isPointerTy())
-        AST->deleteValue(LI);
       BB->getInstList().erase(LI);
     } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
       // Delete this instruction and mark the name as the current holder of the
@@ -987,10 +992,10 @@ NextIteration:
 }
 
 void llvm::PromoteMemToReg(ArrayRef<AllocaInst *> Allocas, DominatorTree &DT,
-                           AliasSetTracker *AST, AssumptionCache *AC) {
+                           AssumptionCache *AC) {
   // If there is nothing to do, bail out...
   if (Allocas.empty())
     return;
 
-  PromoteMem2Reg(Allocas, DT, AST, AC).run();
+  PromoteMem2Reg(Allocas, DT, AC).run();
 }
diff --git a/lib/Transforms/Utils/SimplifyCFG.cpp b/lib/Transforms/Utils/SimplifyCFG.cpp
index 464793629e7c9196fdb30ffae8b047892034def8..127a44df5344fec7d05023ae201ae481adabb8f9 100644
--- a/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -22,6 +22,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/ConstantFolding.h"
 #include "llvm/Analysis/EHPersonalities.h"
 #include "llvm/Analysis/InstructionSimplify.h"
@@ -169,6 +170,8 @@ class SimplifyCFGOpt {
   unsigned BonusInstThreshold;
   AssumptionCache *AC;
   SmallPtrSetImpl<BasicBlock *> *LoopHeaders;
+  // See comments in SimplifyCFGOpt::SimplifySwitch.
+  bool LateSimplifyCFG;
   Value *isValueEqualityComparison(TerminatorInst *TI);
   BasicBlock *GetValueEqualityComparisonCases(
       TerminatorInst *TI, std::vector<ValueEqualityComparisonCase> &Cases);
@@ -192,9 +195,10 @@ class SimplifyCFGOpt {
 public:
   SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout &DL,
                  unsigned BonusInstThreshold, AssumptionCache *AC,
-                 SmallPtrSetImpl<BasicBlock *> *LoopHeaders)
+                 SmallPtrSetImpl<BasicBlock *> *LoopHeaders,
+                 bool LateSimplifyCFG)
       : TTI(TTI), DL(DL), BonusInstThreshold(BonusInstThreshold), AC(AC),
-        LoopHeaders(LoopHeaders) {}
+        LoopHeaders(LoopHeaders), LateSimplifyCFG(LateSimplifyCFG) {}
 
   bool run(BasicBlock *BB);
 };
@@ -710,10 +714,9 @@ BasicBlock *SimplifyCFGOpt::GetValueEqualityComparisonCases(
     TerminatorInst *TI, std::vector<ValueEqualityComparisonCase> &Cases) {
   if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
     Cases.reserve(SI->getNumCases());
-    for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e;
-         ++i)
-      Cases.push_back(
-          ValueEqualityComparisonCase(i.getCaseValue(), i.getCaseSuccessor()));
+    for (auto Case : SI->cases())
+      Cases.push_back(ValueEqualityComparisonCase(Case.getCaseValue(),
+                                                  Case.getCaseSuccessor()));
     return SI->getDefaultDest();
   }
 
@@ -846,12 +849,12 @@ bool SimplifyCFGOpt::SimplifyEqualityComparisonWithOnlyPredecessor(
       }
     for (SwitchInst::CaseIt i = SI->case_end(), e = SI->case_begin(); i != e;) {
       --i;
-      if (DeadCases.count(i.getCaseValue())) {
+      if (DeadCases.count(i->getCaseValue())) {
         if (HasWeight) {
-          std::swap(Weights[i.getCaseIndex() + 1], Weights.back());
+          std::swap(Weights[i->getCaseIndex() + 1], Weights.back());
           Weights.pop_back();
         }
-        i.getCaseSuccessor()->removePredecessor(TI->getParent());
+        i->getCaseSuccessor()->removePredecessor(TI->getParent());
         SI->removeCase(i);
       }
     }
@@ -996,8 +999,7 @@ bool SimplifyCFGOpt::FoldValueComparisonIntoPredecessors(TerminatorInst *TI,
       SmallSetVector<BasicBlock*, 4> FailBlocks;
       if (!SafeToMergeTerminators(TI, PTI, &FailBlocks)) {
         for (auto *Succ : FailBlocks) {
-          std::vector<BasicBlock*> Blocks = { TI->getParent() };
-          if (!SplitBlockPredecessors(Succ, Blocks, ".fold.split"))
+          if (!SplitBlockPredecessors(Succ, TI->getParent(), ".fold.split"))
             return false;
         }
       }
@@ -1472,29 +1474,28 @@ static bool canSinkInstructions(
       return false;
   }
 
+  // Because SROA can't handle speculating stores of selects, try not
+  // to sink loads or stores of allocas when we'd have to create a PHI for
+  // the address operand. Also, because it is likely that loads or stores
+  // of allocas will disappear when Mem2Reg/SROA is run, don't sink them.
+  // This can cause code churn which can have unintended consequences down
+  // the line - see https://llvm.org/bugs/show_bug.cgi?id=30244.
+  // FIXME: This is a workaround for a deficiency in SROA - see
+  // https://llvm.org/bugs/show_bug.cgi?id=30188
+  if (isa<StoreInst>(I0) && any_of(Insts, [](const Instruction *I) {
+        return isa<AllocaInst>(I->getOperand(1));
+      }))
+    return false;
+  if (isa<LoadInst>(I0) && any_of(Insts, [](const Instruction *I) {
+        return isa<AllocaInst>(I->getOperand(0));
+      }))
+    return false;
+
   for (unsigned OI = 0, OE = I0->getNumOperands(); OI != OE; ++OI) {
     if (I0->getOperand(OI)->getType()->isTokenTy())
       // Don't touch any operand of token type.
       return false;
 
-    // Because SROA can't handle speculating stores of selects, try not
-    // to sink loads or stores of allocas when we'd have to create a PHI for
-    // the address operand. Also, because it is likely that loads or stores
-    // of allocas will disappear when Mem2Reg/SROA is run, don't sink them.
-    // This can cause code churn which can have unintended consequences down
-    // the line - see https://llvm.org/bugs/show_bug.cgi?id=30244.
-    // FIXME: This is a workaround for a deficiency in SROA - see
-    // https://llvm.org/bugs/show_bug.cgi?id=30188
-    if (OI == 1 && isa<StoreInst>(I0) &&
-        any_of(Insts, [](const Instruction *I) {
-          return isa<AllocaInst>(I->getOperand(1));
-        }))
-      return false;
-    if (OI == 0 && isa<LoadInst>(I0) && any_of(Insts, [](const Instruction *I) {
-          return isa<AllocaInst>(I->getOperand(0));
-        }))
-      return false;
-
     auto SameAsI0 = [&I0, OI](const Instruction *I) {
       assert(I->getNumOperands() == I0->getNumOperands());
       return I->getOperand(OI) == I0->getOperand(OI);
@@ -2150,7 +2151,8 @@ static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
 /// If we have a conditional branch on a PHI node value that is defined in the
 /// same block as the branch and if any PHI entries are constants, thread edges
 /// corresponding to that entry to be branches to their ultimate destination.
-static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) {
+static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL,
+                                AssumptionCache *AC) {
   BasicBlock *BB = BI->getParent();
   PHINode *PN = dyn_cast<PHINode>(BI->getCondition());
   // NOTE: we currently cannot transform this case if the PHI node is used
@@ -2242,6 +2244,11 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) {
       // Insert the new instruction into its new home.
       if (N)
         EdgeBB->getInstList().insert(InsertPt, N);
+
+      // Register the new instruction with the assumption cache if necessary.
+      if (auto *II = dyn_cast_or_null<IntrinsicInst>(N))
+        if (II->getIntrinsicID() == Intrinsic::assume)
+          AC->registerAssumption(II);
     }
 
     // Loop over all of the edges from PredBB to BB, changing them to branch
@@ -2254,7 +2261,7 @@ static bool FoldCondBranchOnPHI(BranchInst *BI, const DataLayout &DL) {
       }
 
     // Recurse, simplifying any other constants.
-    return FoldCondBranchOnPHI(BI, DL) | true;
+    return FoldCondBranchOnPHI(BI, DL, AC) | true;
   }
 
   return false;
@@ -3436,8 +3443,8 @@ static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) {
 
   // Find the relevant condition and destinations.
   Value *Condition = Select->getCondition();
-  BasicBlock *TrueBB = SI->findCaseValue(TrueVal).getCaseSuccessor();
-  BasicBlock *FalseBB = SI->findCaseValue(FalseVal).getCaseSuccessor();
+  BasicBlock *TrueBB = SI->findCaseValue(TrueVal)->getCaseSuccessor();
+  BasicBlock *FalseBB = SI->findCaseValue(FalseVal)->getCaseSuccessor();
 
   // Get weight for TrueBB and FalseBB.
   uint32_t TrueWeight = 0, FalseWeight = 0;
@@ -3447,9 +3454,9 @@ static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) {
     GetBranchWeights(SI, Weights);
     if (Weights.size() == 1 + SI->getNumCases()) {
       TrueWeight =
-          (uint32_t)Weights[SI->findCaseValue(TrueVal).getSuccessorIndex()];
+          (uint32_t)Weights[SI->findCaseValue(TrueVal)->getSuccessorIndex()];
       FalseWeight =
-          (uint32_t)Weights[SI->findCaseValue(FalseVal).getSuccessorIndex()];
+          (uint32_t)Weights[SI->findCaseValue(FalseVal)->getSuccessorIndex()];
     }
   }
 
@@ -4151,15 +4158,16 @@ bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
         }
       }
     } else if (auto *SI = dyn_cast<SwitchInst>(TI)) {
-      for (SwitchInst::CaseIt i = SI->case_begin(), e = SI->case_end(); i != e;
-           ++i)
-        if (i.getCaseSuccessor() == BB) {
-          BB->removePredecessor(SI->getParent());
-          SI->removeCase(i);
-          --i;
-          --e;
-          Changed = true;
+      for (auto i = SI->case_begin(), e = SI->case_end(); i != e;) {
+        if (i->getCaseSuccessor() != BB) {
+          ++i;
+          continue;
         }
+        BB->removePredecessor(SI->getParent());
+        i = SI->removeCase(i);
+        e = SI->case_end();
+        Changed = true;
+      }
     } else if (auto *II = dyn_cast<InvokeInst>(TI)) {
       if (II->getUnwindDest() == BB) {
         removeUnwindEdge(TI->getParent());
@@ -4242,18 +4250,18 @@ static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
   SmallVector<ConstantInt *, 16> CasesA;
   SmallVector<ConstantInt *, 16> CasesB;
 
-  for (SwitchInst::CaseIt I : SI->cases()) {
-    BasicBlock *Dest = I.getCaseSuccessor();
+  for (auto Case : SI->cases()) {
+    BasicBlock *Dest = Case.getCaseSuccessor();
     if (!DestA)
       DestA = Dest;
     if (Dest == DestA) {
-      CasesA.push_back(I.getCaseValue());
+      CasesA.push_back(Case.getCaseValue());
       continue;
     }
     if (!DestB)
       DestB = Dest;
     if (Dest == DestB) {
-      CasesB.push_back(I.getCaseValue());
+      CasesB.push_back(Case.getCaseValue());
       continue;
     }
     return false; // More than two destinations.
@@ -4378,7 +4386,7 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
   bool HasDefault =
       !isa<UnreachableInst>(SI->getDefaultDest()->getFirstNonPHIOrDbg());
   const unsigned NumUnknownBits =
-      Bits - (KnownZero.Or(KnownOne)).countPopulation();
+      Bits - (KnownZero | KnownOne).countPopulation();
   assert(NumUnknownBits <= Bits);
   if (HasDefault && DeadCases.empty() &&
       NumUnknownBits < 64 /* avoid overflow */ &&
@@ -4403,17 +4411,17 @@ static bool EliminateDeadSwitchCases(SwitchInst *SI, AssumptionCache *AC,
 
   // Remove dead cases from the switch.
   for (ConstantInt *DeadCase : DeadCases) {
-    SwitchInst::CaseIt Case = SI->findCaseValue(DeadCase);
-    assert(Case != SI->case_default() &&
+    SwitchInst::CaseIt CaseI = SI->findCaseValue(DeadCase);
+    assert(CaseI != SI->case_default() &&
            "Case was not found. Probably mistake in DeadCases forming.");
     if (HasWeight) {
-      std::swap(Weights[Case.getCaseIndex() + 1], Weights.back());
+      std::swap(Weights[CaseI->getCaseIndex() + 1], Weights.back());
       Weights.pop_back();
     }
 
     // Prune unused values from PHI nodes.
-    Case.getCaseSuccessor()->removePredecessor(SI->getParent());
-    SI->removeCase(Case);
+    CaseI->getCaseSuccessor()->removePredecessor(SI->getParent());
+    SI->removeCase(CaseI);
   }
   if (HasWeight && Weights.size() >= 2) {
     SmallVector<uint32_t, 8> MDWeights(Weights.begin(), Weights.end());
@@ -4467,10 +4475,9 @@ static bool ForwardSwitchConditionToPHI(SwitchInst *SI) {
   typedef DenseMap<PHINode *, SmallVector<int, 4>> ForwardingNodesMap;
   ForwardingNodesMap ForwardingNodes;
 
-  for (SwitchInst::CaseIt I = SI->case_begin(), E = SI->case_end(); I != E;
-       ++I) {
-    ConstantInt *CaseValue = I.getCaseValue();
-    BasicBlock *CaseDest = I.getCaseSuccessor();
+  for (auto Case : SI->cases()) {
+    ConstantInt *CaseValue = Case.getCaseValue();
+    BasicBlock *CaseDest = Case.getCaseSuccessor();
 
     int PhiIndex;
     PHINode *PHI =
@@ -5205,8 +5212,8 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
   // common destination, as well as the min and max case values.
   assert(SI->case_begin() != SI->case_end());
   SwitchInst::CaseIt CI = SI->case_begin();
-  ConstantInt *MinCaseVal = CI.getCaseValue();
-  ConstantInt *MaxCaseVal = CI.getCaseValue();
+  ConstantInt *MinCaseVal = CI->getCaseValue();
+  ConstantInt *MaxCaseVal = CI->getCaseValue();
 
   BasicBlock *CommonDest = nullptr;
   typedef SmallVector<std::pair<ConstantInt *, Constant *>, 4> ResultListTy;
@@ -5216,7 +5223,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
   SmallVector<PHINode *, 4> PHIs;
 
   for (SwitchInst::CaseIt E = SI->case_end(); CI != E; ++CI) {
-    ConstantInt *CaseVal = CI.getCaseValue();
+    ConstantInt *CaseVal = CI->getCaseValue();
     if (CaseVal->getValue().slt(MinCaseVal->getValue()))
       MinCaseVal = CaseVal;
     if (CaseVal->getValue().sgt(MaxCaseVal->getValue()))
@@ -5225,7 +5232,7 @@ static bool SwitchToLookupTable(SwitchInst *SI, IRBuilder<> &Builder,
     // Resulting value at phi nodes for this case value.
     typedef SmallVector<std::pair<PHINode *, Constant *>, 4> ResultsTy;
     ResultsTy Results;
-    if (!GetCaseResults(SI, CaseVal, CI.getCaseSuccessor(), &CommonDest,
+    if (!GetCaseResults(SI, CaseVal, CI->getCaseSuccessor(), &CommonDest,
                         Results, DL, TTI))
       return false;
 
@@ -5506,11 +5513,10 @@ static bool ReduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
   auto *Rot = Builder.CreateOr(LShr, Shl);
   SI->replaceUsesOfWith(SI->getCondition(), Rot);
 
-  for (SwitchInst::CaseIt C = SI->case_begin(), E = SI->case_end(); C != E;
-       ++C) {
-    auto *Orig = C.getCaseValue();
+  for (auto Case : SI->cases()) {
+    auto *Orig = Case.getCaseValue();
     auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base);
-    C.setValue(
+    Case.setValue(
         cast<ConstantInt>(ConstantInt::get(Ty, Sub.lshr(ShiftC->getValue()))));
   }
   return true;
@@ -5556,7 +5562,12 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
   if (ForwardSwitchConditionToPHI(SI))
     return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
 
-  if (SwitchToLookupTable(SI, Builder, DL, TTI))
+  // The conversion from switch to lookup tables results in difficult
+  // to analyze code and makes pruning branches much harder.
+  // This is a problem if the switch expression itself can still be
+  // restricted as a result of inlining or CVP. Therefore, only apply this
+  // transformation during late steps of the optimisation chain.
+  if (LateSimplifyCFG && SwitchToLookupTable(SI, Builder, DL, TTI))
     return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
 
   if (ReduceSwitchRange(SI, Builder, DL, TTI))
@@ -5836,7 +5847,7 @@ bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
   // through this block if any PHI node entries are constants.
   if (PHINode *PN = dyn_cast<PHINode>(BI->getCondition()))
     if (PN->getParent() == BI->getParent())
-      if (FoldCondBranchOnPHI(BI, DL))
+      if (FoldCondBranchOnPHI(BI, DL, AC))
         return SimplifyCFG(BB, TTI, BonusInstThreshold, AC) | true;
 
   // Scan predecessor blocks for conditional branches.
@@ -6015,8 +6026,9 @@ bool SimplifyCFGOpt::run(BasicBlock *BB) {
 ///
 bool llvm::SimplifyCFG(BasicBlock *BB, const TargetTransformInfo &TTI,
                        unsigned BonusInstThreshold, AssumptionCache *AC,
-                       SmallPtrSetImpl<BasicBlock *> *LoopHeaders) {
+                       SmallPtrSetImpl<BasicBlock *> *LoopHeaders,
+                       bool LateSimplifyCFG) {
   return SimplifyCFGOpt(TTI, BB->getModule()->getDataLayout(),
-                        BonusInstThreshold, AC, LoopHeaders)
+                        BonusInstThreshold, AC, LoopHeaders, LateSimplifyCFG)
       .run(BB);
 }
diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp
index 6b1d3dc4133052dfd0d97908bb566c052857c65f..a4cc6a031ad4c5cc5f513f8d074e4f207eba8118 100644
--- a/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -35,6 +35,9 @@ using namespace llvm;
 STATISTIC(NumElimIdentity, "Number of IV identities eliminated");
 STATISTIC(NumElimOperand,  "Number of IV operands folded into a use");
 STATISTIC(NumElimRem     , "Number of IV remainder operations eliminated");
+STATISTIC(
+    NumSimplifiedSDiv,
+    "Number of IV signed division operations converted to unsigned division");
 STATISTIC(NumElimCmp     , "Number of IV comparisons eliminated");
 
 namespace {
@@ -75,6 +78,7 @@ namespace {
     void eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand);
     void eliminateIVRemainder(BinaryOperator *Rem, Value *IVOperand,
                               bool IsSigned);
+    bool eliminateSDiv(BinaryOperator *SDiv);
     bool strengthenOverflowingOperation(BinaryOperator *OBO, Value *IVOperand);
   };
 }
@@ -265,6 +269,33 @@ void SimplifyIndvar::eliminateIVComparison(ICmpInst *ICmp, Value *IVOperand) {
   Changed = true;
 }
 
+bool SimplifyIndvar::eliminateSDiv(BinaryOperator *SDiv) {
+  // Get the SCEVs for the sdiv operands (numerator N and denominator D).
+  auto *N = SE->getSCEV(SDiv->getOperand(0));
+  auto *D = SE->getSCEV(SDiv->getOperand(1));
+
+  // Evaluate both operands at the scope of the loop containing the sdiv.
+  const Loop *L = LI->getLoopFor(SDiv->getParent());
+  N = SE->getSCEVAtScope(N, L);
+  D = SE->getSCEVAtScope(D, L);
+
+  // Replace sdiv by udiv if both of the operands are known non-negative.
+  if (SE->isKnownNonNegative(N) && SE->isKnownNonNegative(D)) {
+    auto *UDiv = BinaryOperator::Create(
+        BinaryOperator::UDiv, SDiv->getOperand(0), SDiv->getOperand(1),
+        SDiv->getName() + ".udiv", SDiv);
+    UDiv->setIsExact(SDiv->isExact());
+    SDiv->replaceAllUsesWith(UDiv);
+    DEBUG(dbgs() << "INDVARS: Simplified sdiv: " << *SDiv << '\n');
+    ++NumSimplifiedSDiv;
+    Changed = true;
+    DeadInsts.push_back(SDiv);
+    return true;
+  }
+
+  return false;
+}
+
 /// SimplifyIVUsers helper for eliminating useless
 /// remainder operations operating on an induction variable.
 void SimplifyIndvar::eliminateIVRemainder(BinaryOperator *Rem,
@@ -426,12 +457,15 @@ bool SimplifyIndvar::eliminateIVUser(Instruction *UseInst,
     eliminateIVComparison(ICmp, IVOperand);
     return true;
   }
-  if (BinaryOperator *Rem = dyn_cast<BinaryOperator>(UseInst)) {
-    bool IsSigned = Rem->getOpcode() == Instruction::SRem;
-    if (IsSigned || Rem->getOpcode() == Instruction::URem) {
-      eliminateIVRemainder(Rem, IVOperand, IsSigned);
+  if (BinaryOperator *Bin = dyn_cast<BinaryOperator>(UseInst)) {
+    bool IsSRem = Bin->getOpcode() == Instruction::SRem;
+    if (IsSRem || Bin->getOpcode() == Instruction::URem) {
+      eliminateIVRemainder(Bin, IVOperand, IsSRem);
       return true;
     }
+
+    if (Bin->getOpcode() == Instruction::SDiv)
+      return eliminateSDiv(Bin);
   }
 
   if (auto *CI = dyn_cast<CallInst>(UseInst))
diff --git a/lib/Transforms/Utils/SimplifyInstructions.cpp b/lib/Transforms/Utils/SimplifyInstructions.cpp
index 432a2c5479b383f9c3ba3605776a88f30743b1da..f6070868de44e28f46aa719a087a33e98d425a8e 100644
--- a/lib/Transforms/Utils/SimplifyInstructions.cpp
+++ b/lib/Transforms/Utils/SimplifyInstructions.cpp
@@ -20,6 +20,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/AssumptionCache.h"
 #include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/OptimizationDiagnosticInfo.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
@@ -35,7 +36,8 @@ using namespace llvm;
 STATISTIC(NumSimplified, "Number of redundant instructions removed");
 
 static bool runImpl(Function &F, const DominatorTree *DT,
-                    const TargetLibraryInfo *TLI, AssumptionCache *AC) {
+                    const TargetLibraryInfo *TLI, AssumptionCache *AC,
+                    OptimizationRemarkEmitter *ORE) {
   const DataLayout &DL = F.getParent()->getDataLayout();
   SmallPtrSet<const Instruction *, 8> S1, S2, *ToSimplify = &S1, *Next = &S2;
   bool Changed = false;
@@ -54,7 +56,7 @@ static bool runImpl(Function &F, const DominatorTree *DT,
 
         // Don't waste time simplifying unused instructions.
         if (!I->use_empty()) {
-          if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC)) {
+          if (Value *V = SimplifyInstruction(I, DL, TLI, DT, AC, ORE)) {
             // Mark all uses for resimplification next time round the loop.
             for (User *U : I->users())
               Next->insert(cast<Instruction>(U));
@@ -95,6 +97,7 @@ namespace {
       AU.addRequired<DominatorTreeWrapperPass>();
       AU.addRequired<AssumptionCacheTracker>();
       AU.addRequired<TargetLibraryInfoWrapperPass>();
+      AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
     }
 
     /// runOnFunction - Remove instructions that simplify.
@@ -108,7 +111,10 @@ namespace {
           &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
       AssumptionCache *AC =
           &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
-      return runImpl(F, DT, TLI, AC);
+      OptimizationRemarkEmitter *ORE =
+          &getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+
+      return runImpl(F, DT, TLI, AC, ORE);
     }
   };
 }
@@ -119,6 +125,7 @@ INITIALIZE_PASS_BEGIN(InstSimplifier, "instsimplify",
 INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
 INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
 INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
 INITIALIZE_PASS_END(InstSimplifier, "instsimplify",
                     "Remove redundant instructions", false, false)
 char &llvm::InstructionSimplifierID = InstSimplifier::ID;
@@ -133,7 +140,8 @@ PreservedAnalyses InstSimplifierPass::run(Function &F,
   auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
   auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
   auto &AC = AM.getResult<AssumptionAnalysis>(F);
-  bool Changed = runImpl(F, &DT, &TLI, &AC);
+  auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+  bool Changed = runImpl(F, &DT, &TLI, &AC, &ORE);
   if (!Changed)
     return PreservedAnalyses::all();
 
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp
index ec336798199b1b7896bac3138325c13316c42c1b..aa71e3669ea27804b2b01e07d41a54a93fb96dd7 100644
--- a/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -809,7 +809,7 @@ Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) {
 
 // TODO: Does this belong in BuildLibCalls or should all of those similar
 // functions be moved here?
-static Value *emitCalloc(Value *Num, Value *Size, const AttributeSet &Attrs,
+static Value *emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
                          IRBuilder<> &B, const TargetLibraryInfo &TLI) {
   LibFunc Func;
   if (!TLI.getLibFunc("calloc", Func) || !TLI.has(Func))
@@ -819,7 +819,7 @@ static Value *emitCalloc(Value *Num, Value *Size, const AttributeSet &Attrs,
   const DataLayout &DL = M->getDataLayout();
   IntegerType *PtrType = DL.getIntPtrType((B.GetInsertBlock()->getContext()));
   Value *Calloc = M->getOrInsertFunction("calloc", Attrs, B.getInt8PtrTy(),
-                                         PtrType, PtrType, nullptr);
+                                         PtrType, PtrType);
   CallInst *CI = B.CreateCall(Calloc, { Num, Size }, "calloc");
 
   if (const auto *F = dyn_cast<Function>(Calloc->stripPointerCasts()))
@@ -1219,7 +1219,7 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) {
       Module *M = CI->getModule();
       Value *NewCallee =
           M->getOrInsertFunction(TLI->getName(LdExp), Op->getType(),
-                                 Op->getType(), B.getInt32Ty(), nullptr);
+                                 Op->getType(), B.getInt32Ty());
       CallInst *CI = B.CreateCall(NewCallee, {One, LdExpArg});
       if (const Function *F = dyn_cast<Function>(Callee->stripPointerCasts()))
         CI->setCallingConv(F->getCallingConv());
@@ -1443,7 +1443,7 @@ static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
 
   Module *M = OrigCallee->getParent();
   Value *Callee = M->getOrInsertFunction(Name, OrigCallee->getAttributes(),
-                                         ResTy, ArgTy, nullptr);
+                                         ResTy, ArgTy);
 
   if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
     // If the argument is an instruction, it must dominate all uses so put our
@@ -1625,7 +1625,7 @@ Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilder<> &B,
   // Proceedings of PACT'98, Oct. 1998, IEEE
   if (!CI->hasFnAttr(Attribute::Cold) &&
       isReportingError(Callee, CI, StreamArg)) {
-    CI->addAttribute(AttributeSet::FunctionIndex, Attribute::Cold);
+    CI->addAttribute(AttributeList::FunctionIndex, Attribute::Cold);
   }
 
   return nullptr;
@@ -2160,8 +2160,9 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
     case LibFunc_round:
       return replaceUnaryCall(CI, Builder, Intrinsic::round);
     case LibFunc_nearbyint:
-    case LibFunc_rint:
       return replaceUnaryCall(CI, Builder, Intrinsic::nearbyint);
+    case LibFunc_rint:
+      return replaceUnaryCall(CI, Builder, Intrinsic::rint);
     case LibFunc_trunc:
       return replaceUnaryCall(CI, Builder, Intrinsic::trunc);
     case LibFunc_acos:
diff --git a/lib/Transforms/Utils/Utils.cpp b/lib/Transforms/Utils/Utils.cpp
index 7b9de2eadc611381558fe733ba0bcc655b01b67f..7106483c3bd2a4037c2e458a34d07fad88bedc57 100644
--- a/lib/Transforms/Utils/Utils.cpp
+++ b/lib/Transforms/Utils/Utils.cpp
@@ -35,9 +35,8 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) {
   initializeUnifyFunctionExitNodesPass(Registry);
   initializeInstSimplifierPass(Registry);
   initializeMetaRenamerPass(Registry);
-  initializeMemorySSAWrapperPassPass(Registry);
-  initializeMemorySSAPrinterLegacyPassPass(Registry);
   initializeStripGCRelocatesPass(Registry);
+  initializePredicateInfoPrinterLegacyPassPass(Registry);
 }
 
 /// LLVMInitializeTransformUtils - C binding for initializeTransformUtilsPasses.
diff --git a/lib/Transforms/Utils/VNCoercion.cpp b/lib/Transforms/Utils/VNCoercion.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..4aeea02b1b1bfce6d6971cdd69f00332e3f2b0eb
--- /dev/null
+++ b/lib/Transforms/Utils/VNCoercion.cpp
@@ -0,0 +1,482 @@
+#include "llvm/Transforms/Utils/VNCoercion.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "vncoerce"
+namespace llvm {
+namespace VNCoercion {
+
+/// Return true if coerceAvailableValueToLoadType will succeed.
+bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
+                                     const DataLayout &DL) {
+  // If the loaded or stored value is a first-class array or struct, don't try
+  // to transform them.  We need to be able to bitcast to integer.
+  if (LoadTy->isStructTy() || LoadTy->isArrayTy() ||
+      StoredVal->getType()->isStructTy() || StoredVal->getType()->isArrayTy())
+    return false;
+
+  // The store has to be at least as big as the load.
+  if (DL.getTypeSizeInBits(StoredVal->getType()) < DL.getTypeSizeInBits(LoadTy))
+    return false;
+
+  return true;
+}
+
+template <class T, class HelperClass>
+static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
+                                               HelperClass &Helper,
+                                               const DataLayout &DL) {
+  assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
+         "precondition violation - materialization can't fail");
+  if (auto *C = dyn_cast<Constant>(StoredVal))
+    if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
+      StoredVal = FoldedStoredVal;
+
+  // Cache the stored type; an identical type falls into the same-size path.
+  Type *StoredValTy = StoredVal->getType();
+
+  uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy);
+  uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
+
+  // If the store and reload are the same size, we can always reuse it.
+  if (StoredValSize == LoadedValSize) {
+    // Pointer to Pointer -> use bitcast.
+    if (StoredValTy->getScalarType()->isPointerTy() &&
+        LoadedTy->getScalarType()->isPointerTy()) {
+      StoredVal = Helper.CreateBitCast(StoredVal, LoadedTy);
+    } else {
+      // Convert source pointers to integers, which can be bitcast.
+      if (StoredValTy->getScalarType()->isPointerTy()) {
+        StoredValTy = DL.getIntPtrType(StoredValTy);
+        StoredVal = Helper.CreatePtrToInt(StoredVal, StoredValTy);
+      }
+
+      Type *TypeToCastTo = LoadedTy;
+      if (TypeToCastTo->getScalarType()->isPointerTy())
+        TypeToCastTo = DL.getIntPtrType(TypeToCastTo);
+
+      if (StoredValTy != TypeToCastTo)
+        StoredVal = Helper.CreateBitCast(StoredVal, TypeToCastTo);
+
+      // Cast to pointer if the load needs a pointer type.
+      if (LoadedTy->getScalarType()->isPointerTy())
+        StoredVal = Helper.CreateIntToPtr(StoredVal, LoadedTy);
+    }
+
+    if (auto *C = dyn_cast<ConstantExpr>(StoredVal))
+      if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
+        StoredVal = FoldedStoredVal;
+
+    return StoredVal;
+  }
+  // If the loaded value is smaller than the available value, then we can
+  // extract out a piece from it.  If the available value is too small, then we
+  // can't do anything.
+  assert(StoredValSize >= LoadedValSize &&
+         "canCoerceMustAliasedValueToLoad fail");
+
+  // Convert source pointers to integers, which can be manipulated.
+  if (StoredValTy->getScalarType()->isPointerTy()) {
+    StoredValTy = DL.getIntPtrType(StoredValTy);
+    StoredVal = Helper.CreatePtrToInt(StoredVal, StoredValTy);
+  }
+
+  // Convert vectors and fp to integer, which can be manipulated.
+  if (!StoredValTy->isIntegerTy()) {
+    StoredValTy = IntegerType::get(StoredValTy->getContext(), StoredValSize);
+    StoredVal = Helper.CreateBitCast(StoredVal, StoredValTy);
+  }
+
+  // If this is a big-endian system, we need to shift the value down to the low
+  // bits so that a truncate will work.
+  if (DL.isBigEndian()) {
+    uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy) -
+                        DL.getTypeStoreSizeInBits(LoadedTy);
+    StoredVal = Helper.CreateLShr(
+        StoredVal, ConstantInt::get(StoredVal->getType(), ShiftAmt));
+  }
+
+  // Truncate the integer to the right size now.
+  Type *NewIntTy = IntegerType::get(StoredValTy->getContext(), LoadedValSize);
+  StoredVal = Helper.CreateTruncOrBitCast(StoredVal, NewIntTy);
+
+  if (LoadedTy != NewIntTy) {
+    // If the result is a pointer, inttoptr.
+    if (LoadedTy->getScalarType()->isPointerTy())
+      StoredVal = Helper.CreateIntToPtr(StoredVal, LoadedTy);
+    else
+      // Otherwise, bitcast.
+      StoredVal = Helper.CreateBitCast(StoredVal, LoadedTy);
+  }
+
+  if (auto *C = dyn_cast<Constant>(StoredVal))
+    if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
+      StoredVal = FoldedStoredVal;
+
+  return StoredVal;
+}
+
+/// If we saw a store of a value to memory, and then a load from a must-aliased
+/// pointer of a different type, coerce the stored value to the loaded type.
+/// LoadedTy is the type of the load we want to replace.  IRB is the IRBuilder
+/// used to insert new instructions.  The helper this forwards to asserts
+/// canCoerceMustAliasedValueToLoad() rather than returning null on failure, so
+/// callers are expected to check that predicate first.
+Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
+                                      IRBuilder<> &IRB, const DataLayout &DL) {
+  return coerceAvailableValueToLoadTypeHelper(StoredVal, LoadedTy, IRB, DL);
+}
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering memory write (store, memset, memcpy, memmove).  This
+/// means that the write *may* provide bits used by the load but we can't be
+/// sure because the pointers don't must-alias.
+///
+/// Check this case to see if there is anything more we can do before we give
+/// up.  This returns -1 if we have to give up, or a byte number in the stored
+/// value of the piece that feeds the load.
+static int analyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
+                                          Value *WritePtr,
+                                          uint64_t WriteSizeInBits,
+                                          const DataLayout &DL) {
+  // If the loaded or stored value is a first class array or struct, don't try
+  // to transform them.  We need to be able to bitcast to integer.
+  if (LoadTy->isStructTy() || LoadTy->isArrayTy())
+    return -1;
+
+  int64_t StoreOffset = 0, LoadOffset = 0;
+  Value *StoreBase =
+      GetPointerBaseWithConstantOffset(WritePtr, StoreOffset, DL);
+  Value *LoadBase = GetPointerBaseWithConstantOffset(LoadPtr, LoadOffset, DL);
+  if (StoreBase != LoadBase)
+    return -1;
+
+  // If the load and store are to the exact same address, they should have been
+  // a must alias.  AA must have gotten confused.
+  // FIXME: Study to see if/when this happens.  One case is forwarding a memset
+  // to a load from the base of the memset.
+
+  // If the load and store don't overlap at all, the store doesn't provide
+  // anything to the load.  In this case, they really don't alias at all, AA
+  // must have gotten confused.
+  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy);
+
+  if ((WriteSizeInBits & 7) | (LoadSize & 7))
+    return -1;
+  uint64_t StoreSize = WriteSizeInBits / 8; // Convert to bytes.
+  LoadSize /= 8;
+
+  bool isAAFailure = false;
+  if (StoreOffset < LoadOffset)
+    isAAFailure = StoreOffset + int64_t(StoreSize) <= LoadOffset;
+  else
+    isAAFailure = LoadOffset + int64_t(LoadSize) <= StoreOffset;
+
+  if (isAAFailure)
+    return -1;
+
+  // If the Load isn't completely contained within the stored bits, we don't
+  // have all the bits to feed it.  We could do something crazy in the future
+  // (issue a smaller load then merge the bits in) but this seems unlikely to be
+  // valuable.
+  if (StoreOffset > LoadOffset ||
+      StoreOffset + StoreSize < LoadOffset + LoadSize)
+    return -1;
+
+  // Okay, we can do this transformation.  Return the number of bytes into the
+  // store that the load is.
+  return LoadOffset - StoreOffset;
+}
+
+/// This function is called when we have a
+/// memdep query of a load that ends up being a clobbering store.
+int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
+                                   StoreInst *DepSI, const DataLayout &DL) {
+  // Cannot handle reading from store of first-class aggregate yet.
+  if (DepSI->getValueOperand()->getType()->isStructTy() ||
+      DepSI->getValueOperand()->getType()->isArrayTy())
+    return -1;
+
+  Value *StorePtr = DepSI->getPointerOperand();
+  uint64_t StoreSize =
+      DL.getTypeSizeInBits(DepSI->getValueOperand()->getType());
+  return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, StorePtr, StoreSize,
+                                        DL);
+}
+
+/// This function is called when we have a
+/// memdep query of a load that ends up being clobbered by another load.  See if
+/// the other load can feed into the second load.
+int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
+                                  const DataLayout &DL) {
+  // Cannot handle reading from a load of a first-class aggregate yet.
+  if (DepLI->getType()->isStructTy() || DepLI->getType()->isArrayTy())
+    return -1;
+
+  Value *DepPtr = DepLI->getPointerOperand();
+  uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType());
+  int R = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
+  if (R != -1)
+    return R;
+
+  // If we have a load/load clobber and DepLI can be widened to cover this load,
+  // then we should widen it!
+  int64_t LoadOffs = 0;
+  const Value *LoadBase =
+      GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
+  unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+
+  unsigned Size = MemoryDependenceResults::getLoadLoadClobberFullWidthSize(
+      LoadBase, LoadOffs, LoadSize, DepLI);
+  if (Size == 0)
+    return -1;
+
+  // Check non-obvious conditions enforced by MDA which we rely on for being
+  // able to materialize this potentially available value.
+  assert(DepLI->isSimple() && "Cannot widen volatile/atomic load!");
+  assert(DepLI->getType()->isIntegerTy() && "Can't widen non-integer load");
+
+  return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, Size * 8, DL);
+}
+
+int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
+                                     MemIntrinsic *MI, const DataLayout &DL) {
+  // If the mem operation is a non-constant size, we can't handle it.
+  ConstantInt *SizeCst = dyn_cast<ConstantInt>(MI->getLength());
+  if (!SizeCst)
+    return -1;
+  uint64_t MemSizeInBits = SizeCst->getZExtValue() * 8;
+
+  // If this is memset, we just need to see if the offset is valid in the size
+  // of the memset.
+  if (MI->getIntrinsicID() == Intrinsic::memset)
+    return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+                                          MemSizeInBits, DL);
+
+  // If we have a memcpy/memmove, the only case we can handle is if this is a
+  // copy from constant memory.  In that case, we can read directly from the
+  // constant memory.
+  MemTransferInst *MTI = cast<MemTransferInst>(MI);
+
+  Constant *Src = dyn_cast<Constant>(MTI->getSource());
+  if (!Src)
+    return -1;
+
+  GlobalVariable *GV = dyn_cast<GlobalVariable>(GetUnderlyingObject(Src, DL));
+  if (!GV || !GV->isConstant())
+    return -1;
+
+  // See if the access is within the bounds of the transfer.
+  int Offset = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, MI->getDest(),
+                                              MemSizeInBits, DL);
+  if (Offset == -1)
+    return Offset;
+
+  unsigned AS = Src->getType()->getPointerAddressSpace();
+  // Otherwise, see if we can constant fold a load from the constant with the
+  // offset applied as appropriate.
+  Src =
+      ConstantExpr::getBitCast(Src, Type::getInt8PtrTy(Src->getContext(), AS));
+  Constant *OffsetCst =
+      ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+  Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
+                                       OffsetCst);
+  Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
+  if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL))
+    return Offset;
+  return -1;
+}
+
+template <class T, class HelperClass>
+static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy,
+                                     HelperClass &Helper,
+                                     const DataLayout &DL) {
+  LLVMContext &Ctx = SrcVal->getType()->getContext();
+
+  uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
+  uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
+  // Compute which bits of the stored value are being used by the load.  Convert
+  // to an integer type to start with.
+  if (SrcVal->getType()->getScalarType()->isPointerTy())
+    SrcVal = Helper.CreatePtrToInt(SrcVal, DL.getIntPtrType(SrcVal->getType()));
+  if (!SrcVal->getType()->isIntegerTy())
+    SrcVal = Helper.CreateBitCast(SrcVal, IntegerType::get(Ctx, StoreSize * 8));
+
+  // Shift the bits to the least significant depending on endianness.
+  unsigned ShiftAmt;
+  if (DL.isLittleEndian())
+    ShiftAmt = Offset * 8;
+  else
+    ShiftAmt = (StoreSize - LoadSize - Offset) * 8;
+  if (ShiftAmt)
+    SrcVal = Helper.CreateLShr(SrcVal,
+                               ConstantInt::get(SrcVal->getType(), ShiftAmt));
+
+  if (LoadSize != StoreSize)
+    SrcVal = Helper.CreateTruncOrBitCast(SrcVal,
+                                         IntegerType::get(Ctx, LoadSize * 8));
+  return SrcVal;
+}
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering store.  This means that the store provides bits used by
+/// the load but the pointers don't must-alias.  Check this case to see if
+/// there is anything more we can do before we give up.
+Value *getStoreValueForLoad(Value *SrcVal, unsigned Offset, Type *LoadTy,
+                            Instruction *InsertPt, const DataLayout &DL) {
+
+  IRBuilder<> Builder(InsertPt);
+  SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, Builder, DL);
+  return coerceAvailableValueToLoadTypeHelper(SrcVal, LoadTy, Builder, DL);
+}
+
+Constant *getConstantStoreValueForLoad(Constant *SrcVal, unsigned Offset,
+                                       Type *LoadTy, const DataLayout &DL) {
+  ConstantFolder F;
+  SrcVal = getStoreValueForLoadHelper(SrcVal, Offset, LoadTy, F, DL);
+  return coerceAvailableValueToLoadTypeHelper(SrcVal, LoadTy, F, DL);
+}
+
+/// This function is called when we have a memdep query of a load that ends up
+/// being a clobbering load.  This means that the load *may* provide bits used
+/// by the load but we can't be sure because the pointers don't must-alias.
+/// Check this case to see if there is anything more we can do before we give
+/// up.
+Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy,
+                           Instruction *InsertPt, const DataLayout &DL) {
+  // If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
+  // widen SrcVal out to a larger load.
+  unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
+  unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+  if (Offset + LoadSize > SrcValStoreSize) {
+    assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
+    assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
+    // If we have a load/load clobber and SrcVal can be widened to cover this
+    // load, then we should widen it to the next power of 2 size big enough!
+    unsigned NewLoadSize = Offset + LoadSize;
+    if (!isPowerOf2_32(NewLoadSize))
+      NewLoadSize = NextPowerOf2(NewLoadSize);
+
+    Value *PtrVal = SrcVal->getPointerOperand();
+    // Insert the new load after the old load.  This ensures that subsequent
+    // memdep queries will find the new load.  We can't easily remove the old
+    // load completely because it is already in the value numbering table.
+    IRBuilder<> Builder(SrcVal->getParent(), ++BasicBlock::iterator(SrcVal));
+    Type *DestPTy = IntegerType::get(LoadTy->getContext(), NewLoadSize * 8);
+    DestPTy =
+        PointerType::get(DestPTy, PtrVal->getType()->getPointerAddressSpace());
+    Builder.SetCurrentDebugLocation(SrcVal->getDebugLoc());
+    PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
+    LoadInst *NewLoad = Builder.CreateLoad(PtrVal);
+    NewLoad->takeName(SrcVal);
+    NewLoad->setAlignment(SrcVal->getAlignment());
+
+    DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
+    DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
+
+    // Replace uses of the original load with the wider load.  On a big endian
+    // system, we need to shift down to get the relevant bits.
+    Value *RV = NewLoad;
+    if (DL.isBigEndian())
+      RV = Builder.CreateLShr(RV, (NewLoadSize - SrcValStoreSize) * 8);
+    RV = Builder.CreateTrunc(RV, SrcVal->getType());
+    SrcVal->replaceAllUsesWith(RV);
+
+    SrcVal = NewLoad;
+  }
+
+  return getStoreValueForLoad(SrcVal, Offset, LoadTy, InsertPt, DL);
+}
+
+Constant *getConstantLoadValueForLoad(Constant *SrcVal, unsigned Offset,
+                                      Type *LoadTy, const DataLayout &DL) {
+  unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
+  unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+  if (Offset + LoadSize > SrcValStoreSize)
+    return nullptr;
+  return getConstantStoreValueForLoad(SrcVal, Offset, LoadTy, DL);
+}
+
+template <class T, class HelperClass>
+T *getMemInstValueForLoadHelper(MemIntrinsic *SrcInst, unsigned Offset,
+                                Type *LoadTy, HelperClass &Helper,
+                                const DataLayout &DL) {
+  LLVMContext &Ctx = LoadTy->getContext();
+  uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy) / 8;
+
+  // We know that this method is only called when the mem transfer fully
+  // provides the bits for the load.
+  if (MemSetInst *MSI = dyn_cast<MemSetInst>(SrcInst)) {
+    // memset(P, 'x', 1234) -> splat('x'), even if x is a variable, and
+    // independently of what the offset is.
+    T *Val = cast<T>(MSI->getValue());
+    if (LoadSize != 1)
+      Val =
+          Helper.CreateZExtOrBitCast(Val, IntegerType::get(Ctx, LoadSize * 8));
+    T *OneElt = Val;
+
+    // Splat the value out to the right number of bits.
+    for (unsigned NumBytesSet = 1; NumBytesSet != LoadSize;) {
+      // If we can double the number of bytes set, do it.
+      if (NumBytesSet * 2 <= LoadSize) {
+        T *ShVal = Helper.CreateShl(
+            Val, ConstantInt::get(Val->getType(), NumBytesSet * 8));
+        Val = Helper.CreateOr(Val, ShVal);
+        NumBytesSet <<= 1;
+        continue;
+      }
+
+      // Otherwise insert one byte at a time.
+      T *ShVal = Helper.CreateShl(Val, ConstantInt::get(Val->getType(), 1 * 8));
+      Val = Helper.CreateOr(OneElt, ShVal);
+      ++NumBytesSet;
+    }
+
+    return coerceAvailableValueToLoadTypeHelper(Val, LoadTy, Helper, DL);
+  }
+
+  // Otherwise, this is a memcpy/memmove from a constant global.
+  MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
+  Constant *Src = cast<Constant>(MTI->getSource());
+  unsigned AS = Src->getType()->getPointerAddressSpace();
+
+  // Otherwise, see if we can constant fold a load from the constant with the
+  // offset applied as appropriate.
+  Src =
+      ConstantExpr::getBitCast(Src, Type::getInt8PtrTy(Src->getContext(), AS));
+  Constant *OffsetCst =
+      ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+  Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
+                                       OffsetCst);
+  Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
+  return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL);
+}
+
+/// This function is called when we have a
+/// memdep query of a load that ends up being a clobbering mem intrinsic.
+Value *getMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
+                              Type *LoadTy, Instruction *InsertPt,
+                              const DataLayout &DL) {
+  IRBuilder<> Builder(InsertPt);
+  return getMemInstValueForLoadHelper<Value, IRBuilder<>>(SrcInst, Offset,
+                                                          LoadTy, Builder, DL);
+}
+
+Constant *getConstantMemInstValueForLoad(MemIntrinsic *SrcInst, unsigned Offset,
+                                         Type *LoadTy, const DataLayout &DL) {
+  // The only case analyzeLoadFromClobberingMemInst cannot be converted to a
+  // constant is when it's a memset of a non-constant.
+  if (auto *MSI = dyn_cast<MemSetInst>(SrcInst))
+    if (!isa<Constant>(MSI->getValue()))
+      return nullptr;
+  ConstantFolder F;
+  return getMemInstValueForLoadHelper<Constant, ConstantFolder>(SrcInst, Offset,
+                                                                LoadTy, F, DL);
+}
+} // namespace VNCoercion
+} // namespace llvm
diff --git a/lib/Transforms/Vectorize/BBVectorize.cpp b/lib/Transforms/Vectorize/BBVectorize.cpp
index c01740b27d59bfb58a4424e44901cd28b163f5ec..c83b3f7b225bc243b3ebb09517d1475d27fe66c3 100644
--- a/lib/Transforms/Vectorize/BBVectorize.cpp
+++ b/lib/Transforms/Vectorize/BBVectorize.cpp
@@ -494,13 +494,13 @@ namespace {
       if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
         // For stores, it is the value type, not the pointer type that matters
         // because the value is what will come from a vector register.
-  
+
         Value *IVal = SI->getValueOperand();
         T1 = IVal->getType();
       } else {
         T1 = I->getType();
       }
-  
+
       if (CastInst *CI = dyn_cast<CastInst>(I))
         T2 = CI->getSrcTy();
       else
@@ -547,10 +547,11 @@ namespace {
     // Returns the cost of the provided instruction using TTI.
     // This does not handle loads and stores.
     unsigned getInstrCost(unsigned Opcode, Type *T1, Type *T2,
-                          TargetTransformInfo::OperandValueKind Op1VK = 
+                          TargetTransformInfo::OperandValueKind Op1VK =
                               TargetTransformInfo::OK_AnyValue,
                           TargetTransformInfo::OperandValueKind Op2VK =
-                              TargetTransformInfo::OK_AnyValue) {
+                              TargetTransformInfo::OK_AnyValue,
+                          const Instruction *I = nullptr) {
       switch (Opcode) {
       default: break;
       case Instruction::GetElementPtr:
@@ -584,7 +585,7 @@ namespace {
       case Instruction::Select:
       case Instruction::ICmp:
       case Instruction::FCmp:
-        return TTI->getCmpSelInstrCost(Opcode, T1, T2);
+        return TTI->getCmpSelInstrCost(Opcode, T1, T2, I);
       case Instruction::ZExt:
       case Instruction::SExt:
       case Instruction::FPToUI:
@@ -598,7 +599,7 @@ namespace {
       case Instruction::FPTrunc:
       case Instruction::BitCast:
       case Instruction::ShuffleVector:
-        return TTI->getCastInstrCost(Opcode, T1, T2);
+        return TTI->getCastInstrCost(Opcode, T1, T2, I);
       }
 
       return 1;
@@ -894,7 +895,7 @@ namespace {
       // vectors that has a scalar condition results in a malformed select.
       // FIXME: We could probably be smarter about this by rewriting the select
       // with different types instead.
-      return (SI->getCondition()->getType()->isVectorTy() == 
+      return (SI->getCondition()->getType()->isVectorTy() ==
               SI->getTrueValue()->getType()->isVectorTy());
     } else if (isa<CmpInst>(I)) {
       if (!Config.VectorizeCmp)
@@ -1044,14 +1045,14 @@ namespace {
         return false;
       }
     } else if (TTI) {
-      unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2);
-      unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2);
-      Type *VT1 = getVecTypeForPair(IT1, JT1),
-           *VT2 = getVecTypeForPair(IT2, JT2);
       TargetTransformInfo::OperandValueKind Op1VK =
           TargetTransformInfo::OK_AnyValue;
       TargetTransformInfo::OperandValueKind Op2VK =
           TargetTransformInfo::OK_AnyValue;
+      unsigned ICost = getInstrCost(I->getOpcode(), IT1, IT2, Op1VK, Op2VK, I);
+      unsigned JCost = getInstrCost(J->getOpcode(), JT1, JT2, Op1VK, Op2VK, J);
+      Type *VT1 = getVecTypeForPair(IT1, JT1),
+           *VT2 = getVecTypeForPair(IT2, JT2);
 
       // On some targets (example X86) the cost of a vector shift may vary
       // depending on whether the second operand is a Uniform or
@@ -1090,7 +1091,7 @@ namespace {
       // but this cost is ignored (because insert and extract element
       // instructions are assigned a zero depth factor and are not really
       // fused in general).
-      unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK);
+      unsigned VCost = getInstrCost(I->getOpcode(), VT1, VT2, Op1VK, Op2VK, I);
 
       if (VCost > ICost + JCost)
         return false;
@@ -1127,39 +1128,51 @@ namespace {
         FastMathFlags FMFCI;
         if (auto *FPMOCI = dyn_cast<FPMathOperator>(CI))
           FMFCI = FPMOCI->getFastMathFlags();
+        SmallVector<Value *, 4> IArgs(CI->arg_operands());
+        unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, IArgs, FMFCI);
 
-        SmallVector<Type*, 4> Tys;
-        for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i)
-          Tys.push_back(CI->getArgOperand(i)->getType());
-        unsigned ICost = TTI->getIntrinsicInstrCost(IID, IT1, Tys, FMFCI);
-
-        Tys.clear();
         CallInst *CJ = cast<CallInst>(J);
 
         FastMathFlags FMFCJ;
         if (auto *FPMOCJ = dyn_cast<FPMathOperator>(CJ))
           FMFCJ = FPMOCJ->getFastMathFlags();
 
-        for (unsigned i = 0, ie = CJ->getNumArgOperands(); i != ie; ++i)
-          Tys.push_back(CJ->getArgOperand(i)->getType());
-        unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, Tys, FMFCJ);
+        SmallVector<Value *, 4> JArgs(CJ->arg_operands());
+        unsigned JCost = TTI->getIntrinsicInstrCost(IID, JT1, JArgs, FMFCJ);
 
-        Tys.clear();
         assert(CI->getNumArgOperands() == CJ->getNumArgOperands() &&
                "Intrinsic argument counts differ");
+        SmallVector<Type*, 4> Tys;
+        SmallVector<Value *, 4> VecArgs;
         for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
           if ((IID == Intrinsic::powi || IID == Intrinsic::ctlz ||
-               IID == Intrinsic::cttz) && i == 1)
+               IID == Intrinsic::cttz) && i == 1) {
             Tys.push_back(CI->getArgOperand(i)->getType());
-          else
+            VecArgs.push_back(CI->getArgOperand(i));
+          }
+          else {
             Tys.push_back(getVecTypeForPair(CI->getArgOperand(i)->getType(),
                                             CJ->getArgOperand(i)->getType()));
+            // Add both operands, and then count their scalarization overhead
+            // with VF 1.
+            VecArgs.push_back(CI->getArgOperand(i));
+            VecArgs.push_back(CJ->getArgOperand(i));
+          }
         }
 
+        // Compute the scalarization cost here with the original operands (to
+        // check for uniqueness etc), and then call getIntrinsicInstrCost()
+        // with the constructed vector types.
+        Type *RetTy = getVecTypeForPair(IT1, JT1);
+        unsigned ScalarizationCost = 0;
+        if (!RetTy->isVoidTy())
+          ScalarizationCost += TTI->getScalarizationOverhead(RetTy, true, false);
+        ScalarizationCost += TTI->getOperandsScalarizationOverhead(VecArgs, 1);
+
         FastMathFlags FMFV = FMFCI;
         FMFV &= FMFCJ;
-        Type *RetTy = getVecTypeForPair(IT1, JT1);
-        unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV);
+        unsigned VCost = TTI->getIntrinsicInstrCost(IID, RetTy, Tys, FMFV,
+                                                    ScalarizationCost);
 
         if (VCost > ICost + JCost)
           return false;
@@ -2502,7 +2515,7 @@ namespace {
         if (I2 == I1 || isa<UndefValue>(I2))
           I2 = nullptr;
       }
-  
+
       if (HEE) {
         Value *I3 = HEE->getOperand(0);
         if (!I2 && I3 != I1)
@@ -2693,14 +2706,14 @@ namespace {
         // so extend the smaller vector to be the same length as the larger one.
         Instruction *NLOp;
         if (numElemL > 1) {
-  
+
           std::vector<Constant *> Mask(numElemH);
           unsigned v = 0;
           for (; v < numElemL; ++v)
             Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
           for (; v < numElemH; ++v)
             Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
-    
+
           NLOp = new ShuffleVectorInst(LOp, UndefValue::get(ArgTypeL),
                                        ConstantVector::get(Mask),
                                        getReplacementName(IBeforeJ ? I : J,
@@ -2710,7 +2723,7 @@ namespace {
                                            getReplacementName(IBeforeJ ? I : J,
                                                               true, o, 1));
         }
-  
+
         NLOp->insertBefore(IBeforeJ ? J : I);
         LOp = NLOp;
       }
@@ -2720,7 +2733,7 @@ namespace {
       if (numElemH == 1 && expandIEChain(Context, I, J, o, LOp, numElemL,
                                          ArgTypeH, VArgType, IBeforeJ)) {
         Instruction *S =
-          InsertElementInst::Create(LOp, HOp, 
+          InsertElementInst::Create(LOp, HOp,
                                     ConstantInt::get(Type::getInt32Ty(Context),
                                                      numElemL),
                                     getReplacementName(IBeforeJ ? I : J,
@@ -2737,7 +2750,7 @@ namespace {
             Mask[v] = ConstantInt::get(Type::getInt32Ty(Context), v);
           for (; v < numElemL; ++v)
             Mask[v] = UndefValue::get(Type::getInt32Ty(Context));
-    
+
           NHOp = new ShuffleVectorInst(HOp, UndefValue::get(ArgTypeH),
                                        ConstantVector::get(Mask),
                                        getReplacementName(IBeforeJ ? I : J,
diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 20fbcf4533d4ea26d8b5958d17edf626c9c9382b..4409d7a404f8b8ddc1b4593c6bffa52ff8854a6f 100644
--- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -432,9 +432,12 @@ Vectorizer::splitOddVectorElts(ArrayRef<Instruction *> Chain,
   unsigned ElementSizeBytes = ElementSizeBits / 8;
   unsigned SizeBytes = ElementSizeBytes * Chain.size();
   unsigned NumLeft = (SizeBytes - (SizeBytes % 4)) / ElementSizeBytes;
-  if (NumLeft == Chain.size())
-    --NumLeft;
-  else if (NumLeft == 0)
+  if (NumLeft == Chain.size()) {
+    if ((NumLeft & 1) == 0)
+      NumLeft /= 2; // Split even in half
+    else
+      --NumLeft;    // Split off last element
+  } else if (NumLeft == 0)
     NumLeft = 1;
   return std::make_pair(Chain.slice(0, NumLeft), Chain.slice(NumLeft));
 }
diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp
index 4645714faeba6ba23324368bde0f7a21e7d2e754..b0ced809d94eb55d0bec394b7f86fd5b5bf87e47 100644
--- a/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -50,6 +50,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/Hashing.h"
 #include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Optional.h"
 #include "llvm/ADT/SCCIterator.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -267,21 +268,6 @@ static bool hasCyclesInLoopBody(const Loop &L) {
   return false;
 }
 
-/// \brief This modifies LoopAccessReport to initialize message with
-/// loop-vectorizer-specific part.
-class VectorizationReport : public LoopAccessReport {
-public:
-  VectorizationReport(Instruction *I = nullptr)
-      : LoopAccessReport("loop not vectorized: ", I) {}
-
-  /// \brief This allows promotion of the loop-access analysis report into the
-  /// loop-vectorizer report.  It modifies the message to add the
-  /// loop-vectorizer-specific part of the message.
-  explicit VectorizationReport(const LoopAccessReport &R)
-      : LoopAccessReport(Twine("loop not vectorized: ") + R.str(),
-                         R.getInstr()) {}
-};
-
 /// A helper function for converting Scalar types to vector types.
 /// If the incoming type is void, we return void. If the VF is 1, we return
 /// the scalar type.
@@ -291,31 +277,9 @@ static Type *ToVectorTy(Type *Scalar, unsigned VF) {
   return VectorType::get(Scalar, VF);
 }
 
-/// A helper function that returns GEP instruction and knows to skip a
-/// 'bitcast'. The 'bitcast' may be skipped if the source and the destination
-/// pointee types of the 'bitcast' have the same size.
-/// For example:
-///   bitcast double** %var to i64* - can be skipped
-///   bitcast double** %var to i8*  - can not
-static GetElementPtrInst *getGEPInstruction(Value *Ptr) {
-
-  if (isa<GetElementPtrInst>(Ptr))
-    return cast<GetElementPtrInst>(Ptr);
-
-  if (isa<BitCastInst>(Ptr) &&
-      isa<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0))) {
-    Type *BitcastTy = Ptr->getType();
-    Type *GEPTy = cast<BitCastInst>(Ptr)->getSrcTy();
-    if (!isa<PointerType>(BitcastTy) || !isa<PointerType>(GEPTy))
-      return nullptr;
-    Type *Pointee1Ty = cast<PointerType>(BitcastTy)->getPointerElementType();
-    Type *Pointee2Ty = cast<PointerType>(GEPTy)->getPointerElementType();
-    const DataLayout &DL = cast<BitCastInst>(Ptr)->getModule()->getDataLayout();
-    if (DL.getTypeSizeInBits(Pointee1Ty) == DL.getTypeSizeInBits(Pointee2Ty))
-      return cast<GetElementPtrInst>(cast<BitCastInst>(Ptr)->getOperand(0));
-  }
-  return nullptr;
-}
+// FIXME: The following helper functions have multiple implementations
+// in the project. They can be effectively organized in a common Load/Store
+// utilities unit.
 
 /// A helper function that returns the pointer operand of a load or store
 /// instruction.
@@ -327,6 +291,34 @@ static Value *getPointerOperand(Value *I) {
   return nullptr;
 }
 
+/// A helper function that returns the type of loaded or stored value.
+static Type *getMemInstValueType(Value *I) {
+  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+         "Expected Load or Store instruction");
+  if (auto *LI = dyn_cast<LoadInst>(I))
+    return LI->getType();
+  return cast<StoreInst>(I)->getValueOperand()->getType();
+}
+
+/// A helper function that returns the alignment of load or store instruction.
+static unsigned getMemInstAlignment(Value *I) {
+  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+         "Expected Load or Store instruction");
+  if (auto *LI = dyn_cast<LoadInst>(I))
+    return LI->getAlignment();
+  return cast<StoreInst>(I)->getAlignment();
+}
+
+/// A helper function that returns the address space of the pointer operand of
+/// load or store instruction.
+static unsigned getMemInstAddressSpace(Value *I) {
+  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+         "Expected Load or Store instruction");
+  if (auto *LI = dyn_cast<LoadInst>(I))
+    return LI->getPointerAddressSpace();
+  return cast<StoreInst>(I)->getPointerAddressSpace();
+}
+
 /// A helper function that returns true if the given type is irregular. The
 /// type is irregular if its allocated size doesn't equal the store size of an
 /// element of the corresponding vector type at the given vectorization factor.
@@ -352,6 +344,23 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
 ///       we always assume predicated blocks have a 50% chance of executing.
 static unsigned getReciprocalPredBlockProb() { return 2; }
 
+/// A helper function that adds a 'fast' flag to floating-point operations.
+static Value *addFastMathFlag(Value *V) {
+  if (isa<FPMathOperator>(V)) {
+    FastMathFlags Flags;
+    Flags.setUnsafeAlgebra();
+    cast<Instruction>(V)->setFastMathFlags(Flags);
+  }
+  return V;
+}
+
+/// A helper function that returns an integer or floating-point constant with
+/// value C.
+static Constant *getSignedIntOrFpConstant(Type *Ty, int64_t C) {
+  return Ty->isIntegerTy() ? ConstantInt::getSigned(Ty, C)
+                           : ConstantFP::get(Ty, C);
+}
+
 /// InnerLoopVectorizer vectorizes loops which contain only one basic
 /// block to a specified vectorization factor (VF).
 /// This class performs the widening of scalars into vectors, or multiple
@@ -429,10 +438,17 @@ protected:
   /// Copy and widen the instructions from the old loop.
   virtual void vectorizeLoop();
 
+  /// Handle all cross-iteration phis in the header.
+  void fixCrossIterationPHIs();
+
   /// Fix a first-order recurrence. This is the second phase of vectorizing
   /// this phi node.
   void fixFirstOrderRecurrence(PHINode *Phi);
 
+  /// Fix a reduction cross-iteration phi. This is the second phase of
+  /// vectorizing this phi node.
+  void fixReduction(PHINode *Phi);
+
   /// \brief The Loop exit block may have single value PHI nodes where the
   /// incoming value is 'Undef'. While vectorizing we only handled real values
   /// that were defined inside the loop. Here we fix the 'undef case'.
@@ -464,13 +480,12 @@ protected:
   VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst);
 
   /// A helper function to vectorize a single BB within the innermost loop.
-  void vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV);
+  void vectorizeBlockInLoop(BasicBlock *BB);
 
   /// Vectorize a single PHINode in a block. This method handles the induction
   /// variable canonicalization. It supports both VF = 1 for unrolled loops and
   /// arbitrary length vectors.
-  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF,
-                           PhiVector *PV);
+  void widenPHIInstruction(Instruction *PN, unsigned UF, unsigned VF);
 
   /// Insert the new loop to the loop hierarchy and pass manager
   /// and update the analysis passes.
@@ -505,20 +520,21 @@ protected:
   /// \p EntryVal is the value from the original loop that maps to the steps.
   /// Note that \p EntryVal doesn't have to be an induction variable (e.g., it
   /// can be a truncate instruction).
-  void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal);
-
-  /// Create a vector induction phi node based on an existing scalar one. This
-  /// currently only works for integer induction variables with a constant
-  /// step. \p EntryVal is the value from the original loop that maps to the
-  /// vector phi node. If \p EntryVal is a truncate instruction, instead of
-  /// widening the original IV, we widen a version of the IV truncated to \p
-  /// EntryVal's type.
-  void createVectorIntInductionPHI(const InductionDescriptor &II,
-                                   Instruction *EntryVal);
-
-  /// Widen an integer induction variable \p IV. If \p Trunc is provided, the
-  /// induction variable will first be truncated to the corresponding type.
-  void widenIntInduction(PHINode *IV, TruncInst *Trunc = nullptr);
+  void buildScalarSteps(Value *ScalarIV, Value *Step, Value *EntryVal,
+                        const InductionDescriptor &ID);
+
+  /// Create a vector induction phi node based on an existing scalar one. \p
+  /// EntryVal is the value from the original loop that maps to the vector phi
+  /// node, and \p Step is the loop-invariant step. If \p EntryVal is a
+  /// truncate instruction, instead of widening the original IV, we widen a
+  /// version of the IV truncated to \p EntryVal's type.
+  void createVectorIntOrFpInductionPHI(const InductionDescriptor &II,
+                                       Value *Step, Instruction *EntryVal);
+
+  /// Widen an integer or floating-point induction variable \p IV. If \p Trunc
+  /// is provided, the integer induction variable will first be truncated to
+  /// the corresponding type.
+  void widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc = nullptr);
 
   /// Returns true if an instruction \p I should be scalarized instead of
   /// vectorized for the chosen vectorization factor.
@@ -584,6 +600,10 @@ protected:
   /// vector of instructions.
   void addMetadata(ArrayRef<Value *> To, Instruction *From);
 
+  /// \brief Set the debug location in the builder using the debug location in
+  /// the instruction.
+  void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr);
+
   /// This is a helper class for maintaining vectorization state. It's used for
   /// mapping values from the original loop to their corresponding values in
   /// the new loop. Two mappings are maintained: one for vectorized values and
@@ -804,8 +824,6 @@ public:
                             UnrollFactor, LVL, CM) {}
 
 private:
-  void scalarizeInstruction(Instruction *Instr,
-                            bool IfPredicateInstr = false) override;
   void vectorizeMemoryInstruction(Instruction *Instr) override;
   Value *getBroadcastInstrs(Value *V) override;
   Value *getStepVector(Value *Val, int StartIdx, Value *Step,
@@ -833,12 +851,14 @@ static Instruction *getDebugLocFromInstOrOperands(Instruction *I) {
   return I;
 }
 
-/// \brief Set the debug location in the builder using the debug location in the
-/// instruction.
-static void setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
-  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr))
-    B.SetCurrentDebugLocation(Inst->getDebugLoc());
-  else
+void InnerLoopVectorizer::setDebugLocFromInst(IRBuilder<> &B, const Value *Ptr) {
+  if (const Instruction *Inst = dyn_cast_or_null<Instruction>(Ptr)) {
+    const DILocation *DIL = Inst->getDebugLoc();
+    if (DIL && Inst->getFunction()->isDebugInfoForProfiling())
+      B.SetCurrentDebugLocation(DIL->cloneWithDuplicationFactor(UF * VF));
+    else
+      B.SetCurrentDebugLocation(DIL);
+  } else
     B.SetCurrentDebugLocation(DebugLoc());
 }
 
@@ -1498,14 +1518,6 @@ private:
   OptimizationRemarkEmitter &ORE;
 };
 
-static void emitAnalysisDiag(const Loop *TheLoop,
-                             const LoopVectorizeHints &Hints,
-                             OptimizationRemarkEmitter &ORE,
-                             const LoopAccessReport &Message) {
-  const char *Name = Hints.vectorizeAnalysisPassName();
-  LoopAccessReport::emitAnalysis(Message, TheLoop, Name, ORE);
-}
-
 static void emitMissedWarning(Function *F, Loop *L,
                               const LoopVectorizeHints &LH,
                               OptimizationRemarkEmitter *ORE) {
@@ -1513,13 +1525,17 @@ static void emitMissedWarning(Function *F, Loop *L,
 
   if (LH.getForce() == LoopVectorizeHints::FK_Enabled) {
     if (LH.getWidth() != 1)
-      emitLoopVectorizeWarning(
-          F->getContext(), *F, L->getStartLoc(),
-          "failed explicitly specified loop vectorization");
+      ORE->emit(DiagnosticInfoOptimizationFailure(
+                    DEBUG_TYPE, "FailedRequestedVectorization",
+                    L->getStartLoc(), L->getHeader())
+                << "loop not vectorized: "
+                << "failed explicitly specified loop vectorization");
     else if (LH.getInterleave() != 1)
-      emitLoopInterleaveWarning(
-          F->getContext(), *F, L->getStartLoc(),
-          "failed explicitly specified loop interleaving");
+      ORE->emit(DiagnosticInfoOptimizationFailure(
+                    DEBUG_TYPE, "FailedRequestedInterleaving", L->getStartLoc(),
+                    L->getHeader())
+                << "loop not interleaved: "
+                << "failed explicitly specified loop interleaving");
   }
 }
 
@@ -1547,7 +1563,7 @@ public:
       LoopVectorizeHints *H)
       : NumPredStores(0), TheLoop(L), PSE(PSE), TLI(TLI), TTI(TTI), DT(DT),
         GetLAA(GetLAA), LAI(nullptr), ORE(ORE), InterleaveInfo(PSE, L, DT, LI),
-        Induction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
+        PrimaryInduction(nullptr), WidestIndTy(nullptr), HasFunNoNaNAttr(false),
         Requirements(R), Hints(H) {}
 
   /// ReductionList contains the reduction descriptors for all
@@ -1567,8 +1583,8 @@ public:
   /// loop, only that it is legal to do so.
   bool canVectorize();
 
-  /// Returns the Induction variable.
-  PHINode *getInduction() { return Induction; }
+  /// Returns the primary induction variable.
+  PHINode *getPrimaryInduction() { return PrimaryInduction; }
 
   /// Returns the reduction variables found in the loop.
   ReductionList *getReductionVars() { return &Reductions; }
@@ -1608,12 +1624,6 @@ public:
   /// Returns true if the value V is uniform within the loop.
   bool isUniform(Value *V);
 
-  /// Returns true if \p I is known to be uniform after vectorization.
-  bool isUniformAfterVectorization(Instruction *I) { return Uniforms.count(I); }
-
-  /// Returns true if \p I is known to be scalar after vectorization.
-  bool isScalarAfterVectorization(Instruction *I) { return Scalars.count(I); }
-
   /// Returns the information that we collected about runtime memory check.
   const RuntimePointerChecking *getRuntimePointerChecking() const {
     return LAI->getRuntimePointerChecking();
@@ -1690,15 +1700,9 @@ public:
   /// instructions that may divide by zero.
   bool isScalarWithPredication(Instruction *I);
 
-  /// Returns true if \p I is a memory instruction that has a consecutive or
-  /// consecutive-like pointer operand. Consecutive-like pointers are pointers
-  /// that are treated like consecutive pointers during vectorization. The
-  /// pointer operands of interleaved accesses are an example.
-  bool hasConsecutiveLikePtrOperand(Instruction *I);
-
-  /// Returns true if \p I is a memory instruction that must be scalarized
-  /// during vectorization.
-  bool memoryInstructionMustBeScalarized(Instruction *I, unsigned VF = 1);
+  /// Returns true if \p I is a memory instruction with consecutive memory
+  /// access that can be widened.
+  bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1);
 
 private:
   /// Check if a single basic block loop is vectorizable.
@@ -1716,24 +1720,6 @@ private:
   /// transformation.
   bool canVectorizeWithIfConvert();
 
-  /// Collect the instructions that are uniform after vectorization. An
-  /// instruction is uniform if we represent it with a single scalar value in
-  /// the vectorized loop corresponding to each vector iteration. Examples of
-  /// uniform instructions include pointer operands of consecutive or
-  /// interleaved memory accesses. Note that although uniformity implies an
-  /// instruction will be scalar, the reverse is not true. In general, a
-  /// scalarized instruction will be represented by VF scalar values in the
-  /// vectorized loop, each corresponding to an iteration of the original
-  /// scalar loop.
-  void collectLoopUniforms();
-
-  /// Collect the instructions that are scalar after vectorization. An
-  /// instruction is scalar if it is known to be uniform or will be scalarized
-  /// during vectorization. Non-uniform scalarized instructions will be
-  /// represented by VF values in the vectorized loop, each corresponding to an
-  /// iteration of the original scalar loop.
-  void collectLoopScalars();
-
   /// Return true if all of the instructions in the block can be speculatively
   /// executed. \p SafePtrs is a list of addresses that are known to be legal
   /// and we know that we can read from them without segfault.
@@ -1745,14 +1731,6 @@ private:
   void addInductionPhi(PHINode *Phi, const InductionDescriptor &ID,
                        SmallPtrSetImpl<Value *> &AllowedExit);
 
-  /// Report an analysis message to assist the user in diagnosing loops that are
-  /// not vectorized.  These are handled as LoopAccessReport rather than
-  /// VectorizationReport because the << operator of VectorizationReport returns
-  /// LoopAccessReport.
-  void emitAnalysis(const LoopAccessReport &Message) const {
-    emitAnalysisDiag(TheLoop, *Hints, *ORE, Message);
-  }
-
   /// Create an analysis remark that explains why vectorization failed
   ///
   /// \p RemarkName is the identifier for the remark.  If \p I is passed it is
@@ -1805,9 +1783,9 @@ private:
 
   //  ---  vectorization state --- //
 
-  /// Holds the integer induction variable. This is the counter of the
+  /// Holds the primary induction variable. This is the counter of the
   /// loop.
-  PHINode *Induction;
+  PHINode *PrimaryInduction;
   /// Holds the reduction variables.
   ReductionList Reductions;
   /// Holds all of the induction variables that we found in the loop.
@@ -1823,12 +1801,6 @@ private:
   /// vars which can be accessed from outside the loop.
   SmallPtrSet<Value *, 4> AllowedExit;
 
-  /// Holds the instructions known to be uniform after vectorization.
-  SmallPtrSet<Instruction *, 4> Uniforms;
-
-  /// Holds the instructions known to be scalar after vectorization.
-  SmallPtrSet<Instruction *, 4> Scalars;
-
   /// Can we assume the absence of NaNs.
   bool HasFunNoNaNAttr;
 
@@ -1862,16 +1834,26 @@ public:
       : TheLoop(L), PSE(PSE), LI(LI), Legal(Legal), TTI(TTI), TLI(TLI), DB(DB),
         AC(AC), ORE(ORE), TheFunction(F), Hints(Hints) {}
 
+  /// \return An upper bound for the vectorization factor, or None if
+  /// vectorization should be avoided up front.
+  Optional<unsigned> computeMaxVF(bool OptForSize);
+
   /// Information about vectorization costs
   struct VectorizationFactor {
     unsigned Width; // Vector width with best cost
     unsigned Cost;  // Cost of the loop with that width
   };
   /// \return The most profitable vectorization factor and the cost of that VF.
-  /// This method checks every power of two up to VF. If UserVF is not ZERO
+  /// This method checks every power of two up to MaxVF. If UserVF is not ZERO
   /// then this vectorization factor will be selected if vectorization is
   /// possible.
-  VectorizationFactor selectVectorizationFactor(bool OptForSize);
+  VectorizationFactor selectVectorizationFactor(unsigned MaxVF);
+
+  /// Setup cost-based decisions for user vectorization factor.
+  void selectUserVectorizationFactor(unsigned UserVF) {
+    collectUniformsAndScalars(UserVF);
+    collectInstsToScalarize(UserVF);
+  }
 
   /// \return The size (in bits) of the smallest and widest types in the code
   /// that needs to be vectorized. We ignore values that remain scalar such as
@@ -1885,6 +1867,15 @@ public:
   unsigned selectInterleaveCount(bool OptForSize, unsigned VF,
                                  unsigned LoopCost);
 
+  /// A memory access instruction may be vectorized in more than one way.
+  /// The form of the instruction after vectorization depends on cost.
+  /// This function makes cost-based decisions for Load/Store instructions
+  /// and collects them in a map. This decisions map is used for building
+  /// the lists of loop-uniform and loop-scalar instructions.
+  /// The calculated cost is saved with the widening decision in order to
+  /// avoid redundant calculations.
+  void setCostBasedWideningDecision(unsigned VF);
+
   /// \brief A struct that represents some properties of the register usage
   /// of a loop.
   struct RegisterUsage {
@@ -1919,14 +1910,118 @@ public:
     return Scalars->second.count(I);
   }
 
+  /// Returns true if \p I is known to be uniform after vectorization.
+  bool isUniformAfterVectorization(Instruction *I, unsigned VF) const {
+    if (VF == 1)
+      return true;
+    assert(Uniforms.count(VF) && "VF not yet analyzed for uniformity");
+    auto UniformsPerVF = Uniforms.find(VF);
+    return UniformsPerVF->second.count(I);
+  }
+
+  /// Returns true if \p I is known to be scalar after vectorization.
+  bool isScalarAfterVectorization(Instruction *I, unsigned VF) const {
+    if (VF == 1)
+      return true;
+    assert(Scalars.count(VF) && "Scalar values are not calculated for VF");
+    auto ScalarsPerVF = Scalars.find(VF);
+    return ScalarsPerVF->second.count(I);
+  }
+
   /// \returns True if instruction \p I can be truncated to a smaller bitwidth
   /// for vectorization factor \p VF.
   bool canTruncateToMinimalBitwidth(Instruction *I, unsigned VF) const {
     return VF > 1 && MinBWs.count(I) && !isProfitableToScalarize(I, VF) &&
-           !Legal->isScalarAfterVectorization(I);
+           !isScalarAfterVectorization(I, VF);
+  }
+
+  /// Decision that was taken during cost calculation for memory instruction.
+  enum InstWidening {
+    CM_Unknown,
+    CM_Widen,
+    CM_Interleave,
+    CM_GatherScatter,
+    CM_Scalarize
+  };
+
+  /// Save vectorization decision \p W and \p Cost taken by the cost model for
+  /// instruction \p I and vector width \p VF.
+  void setWideningDecision(Instruction *I, unsigned VF, InstWidening W,
+                           unsigned Cost) {
+    assert(VF >= 2 && "Expected VF >=2");
+    WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+  }
+
+  /// Save vectorization decision \p W and \p Cost taken by the cost model for
+  /// interleaving group \p Grp and vector width \p VF.
+  void setWideningDecision(const InterleaveGroup *Grp, unsigned VF,
+                           InstWidening W, unsigned Cost) {
+    assert(VF >= 2 && "Expected VF >=2");
+    /// Broadcast this decision to all instructions inside the group.
+    /// But the cost will be assigned to one instruction only.
+    for (unsigned i = 0; i < Grp->getFactor(); ++i) {
+      if (auto *I = Grp->getMember(i)) {
+        if (Grp->getInsertPos() == I)
+          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, Cost);
+        else
+          WideningDecisions[std::make_pair(I, VF)] = std::make_pair(W, 0);
+      }
+    }
+  }
+
+  /// Return the cost model decision for the given instruction \p I and vector
+  /// width \p VF. Return CM_Unknown if this instruction did not pass
+  /// through the cost modeling.
+  InstWidening getWideningDecision(Instruction *I, unsigned VF) {
+    assert(VF >= 2 && "Expected VF >=2");
+    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+    auto Itr = WideningDecisions.find(InstOnVF);
+    if (Itr == WideningDecisions.end())
+      return CM_Unknown;
+    return Itr->second.first;
+  }
+
+  /// Return the vectorization cost for the given instruction \p I and vector
+  /// width \p VF.
+  unsigned getWideningCost(Instruction *I, unsigned VF) {
+    assert(VF >= 2 && "Expected VF >=2");
+    std::pair<Instruction *, unsigned> InstOnVF = std::make_pair(I, VF);
+    assert(WideningDecisions.count(InstOnVF) && "The cost is not calculated");
+    return WideningDecisions[InstOnVF].second;
+  }
+
+  /// Return True if instruction \p I is an optimizable truncate whose operand
+  /// is an induction variable. Such a truncate will be removed by adding a new
+  /// induction variable with the destination type.
+  bool isOptimizableIVTruncate(Instruction *I, unsigned VF) {
+
+    // If the instruction is not a truncate, return false.
+    auto *Trunc = dyn_cast<TruncInst>(I);
+    if (!Trunc)
+      return false;
+
+    // Get the source and destination types of the truncate.
+    Type *SrcTy = ToVectorTy(cast<CastInst>(I)->getSrcTy(), VF);
+    Type *DestTy = ToVectorTy(cast<CastInst>(I)->getDestTy(), VF);
+
+    // If the truncate is free for the given types, return false. Replacing a
+    // free truncate with an induction variable would add an induction variable
+    // update instruction to each iteration of the loop. We exclude from this
+    // check the primary induction variable since it will need an update
+    // instruction regardless.
+    Value *Op = Trunc->getOperand(0);
+    if (Op != Legal->getPrimaryInduction() && TTI.isTruncateFree(SrcTy, DestTy))
+      return false;
+
+    // If the truncated value is not an induction variable, return false.
+    return Legal->isInductionVariable(Op);
   }
 
 private:
+  /// \return An upper bound for the vectorization factor, larger than zero.
+  /// One is returned if vectorization should best be avoided due to cost.
+  unsigned computeFeasibleMaxVF(bool OptForSize);
+
   /// The vectorization cost is a combination of the cost itself and a boolean
   /// indicating whether any of the contributing operations will actually
   /// operate on
@@ -1950,6 +2045,26 @@ private:
   /// the vector type as an output parameter.
   unsigned getInstructionCost(Instruction *I, unsigned VF, Type *&VectorTy);
 
+  /// Calculate vectorization cost of memory instruction \p I.
+  unsigned getMemoryInstructionCost(Instruction *I, unsigned VF);
+
+  /// The cost computation for scalarized memory instruction.
+  unsigned getMemInstScalarizationCost(Instruction *I, unsigned VF);
+
+  /// The cost computation for interleaving group of memory instructions.
+  unsigned getInterleaveGroupCost(Instruction *I, unsigned VF);
+
+  /// The cost computation for Gather/Scatter instruction.
+  unsigned getGatherScatterCost(Instruction *I, unsigned VF);
+
+  /// The cost computation for widening instruction \p I with consecutive
+  /// memory access.
+  unsigned getConsecutiveMemOpCost(Instruction *I, unsigned VF);
+
+  /// The cost calculation for Load instruction \p I with uniform pointer -
+  /// scalar load + broadcast.
+  unsigned getUniformMemOpCost(Instruction *I, unsigned VF);
+
   /// Returns whether the instruction is a load or store and will be a emitted
   /// as a vector operation.
   bool isConsecutiveLoadOrStore(Instruction *I);
@@ -1973,12 +2088,24 @@ private:
   /// pairs.
   typedef DenseMap<Instruction *, unsigned> ScalarCostsTy;
 
+  /// A set containing all BasicBlocks that are known to be present after
+  /// vectorization as a predicated block.
+  SmallPtrSet<BasicBlock *, 4> PredicatedBBsAfterVectorization;
+
   /// A map holding scalar costs for different vectorization factors. The
   /// presence of a cost for an instruction in the mapping indicates that the
   /// instruction will be scalarized when vectorizing with the associated
   /// vectorization factor. The entries are VF-ScalarCostTy pairs.
   DenseMap<unsigned, ScalarCostsTy> InstsToScalarize;
 
+  /// Holds the instructions known to be uniform after vectorization.
+  /// The data is collected per VF.
+  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Uniforms;
+
+  /// Holds the instructions known to be scalar after vectorization.
+  /// The data is collected per VF.
+  DenseMap<unsigned, SmallPtrSet<Instruction *, 4>> Scalars;
+
   /// Returns the expected difference in cost from scalarizing the expression
   /// feeding a predicated instruction \p PredInst. The instructions to
   /// scalarize and their scalar costs are collected in \p ScalarCosts. A
@@ -1991,6 +2118,44 @@ private:
   /// the loop.
   void collectInstsToScalarize(unsigned VF);
 
+  /// Collect the instructions that are uniform after vectorization. An
+  /// instruction is uniform if we represent it with a single scalar value in
+  /// the vectorized loop corresponding to each vector iteration. Examples of
+  /// uniform instructions include pointer operands of consecutive or
+  /// interleaved memory accesses. Note that although uniformity implies an
+  /// instruction will be scalar, the reverse is not true. In general, a
+  /// scalarized instruction will be represented by VF scalar values in the
+  /// vectorized loop, each corresponding to an iteration of the original
+  /// scalar loop.
+  void collectLoopUniforms(unsigned VF);
+
+  /// Collect the instructions that are scalar after vectorization. An
+  /// instruction is scalar if it is known to be uniform or will be scalarized
+  /// during vectorization. Non-uniform scalarized instructions will be
+  /// represented by VF values in the vectorized loop, each corresponding to an
+  /// iteration of the original scalar loop.
+  void collectLoopScalars(unsigned VF);
+
+  /// Collect Uniform and Scalar values for the given \p VF.
+  /// The sets depend on CM decision for Load/Store instructions
+  /// that may be vectorized as interleave, gather-scatter or scalarized.
+  void collectUniformsAndScalars(unsigned VF) {
+    // Do the analysis once.
+    if (VF == 1 || Uniforms.count(VF))
+      return;
+    setCostBasedWideningDecision(VF);
+    collectLoopUniforms(VF);
+    collectLoopScalars(VF);
+  }
+
+  /// Keeps cost model vectorization decision and cost for instructions.
+  /// Right now it is used for memory instructions only.
+  typedef DenseMap<std::pair<Instruction *, unsigned>,
+                   std::pair<InstWidening, unsigned>>
+      DecisionList;
+
+  DecisionList WideningDecisions;
+
 public:
   /// The loop that we evaluate.
   Loop *TheLoop;
@@ -2020,6 +2185,23 @@ public:
   SmallPtrSet<const Value *, 16> VecValuesToIgnore;
 };
 
+/// LoopVectorizationPlanner - drives the vectorization process after having
+/// passed Legality checks.
+class LoopVectorizationPlanner {
+public:
+  LoopVectorizationPlanner(LoopVectorizationCostModel &CM) : CM(CM) {}
+
+  ~LoopVectorizationPlanner() {}
+
+  /// Plan how to best vectorize, return the best VF and its cost.
+  LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize,
+                                                       unsigned UserVF);
+
+private:
+  /// The profitability analysis.
+  LoopVectorizationCostModel &CM;
+};
+
 /// \brief This holds vectorization requirements that must be verified late in
 /// the process. The requirements are set by legalize and costmodel. Once
 /// vectorization has been determined to be possible and profitable the
@@ -2155,7 +2337,7 @@ struct LoopVectorize : public FunctionPass {
 
 //===----------------------------------------------------------------------===//
 // Implementation of LoopVectorizationLegality, InnerLoopVectorizer and
-// LoopVectorizationCostModel.
+// LoopVectorizationCostModel and LoopVectorizationPlanner.
 //===----------------------------------------------------------------------===//
 
 Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
@@ -2175,27 +2357,51 @@ Value *InnerLoopVectorizer::getBroadcastInstrs(Value *V) {
   return Shuf;
 }
 
-void InnerLoopVectorizer::createVectorIntInductionPHI(
-    const InductionDescriptor &II, Instruction *EntryVal) {
+void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
+    const InductionDescriptor &II, Value *Step, Instruction *EntryVal) {
   Value *Start = II.getStartValue();
-  ConstantInt *Step = II.getConstIntStepValue();
-  assert(Step && "Can not widen an IV with a non-constant step");
 
   // Construct the initial value of the vector IV in the vector loop preheader
   auto CurrIP = Builder.saveIP();
   Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
   if (isa<TruncInst>(EntryVal)) {
+    assert(Start->getType()->isIntegerTy() &&
+           "Truncation requires an integer type");
     auto *TruncType = cast<IntegerType>(EntryVal->getType());
-    Step = ConstantInt::getSigned(TruncType, Step->getSExtValue());
+    Step = Builder.CreateTrunc(Step, TruncType);
     Start = Builder.CreateCast(Instruction::Trunc, Start, TruncType);
   }
   Value *SplatStart = Builder.CreateVectorSplat(VF, Start);
-  Value *SteppedStart = getStepVector(SplatStart, 0, Step);
+  Value *SteppedStart =
+      getStepVector(SplatStart, 0, Step, II.getInductionOpcode());
+
+  // We create vector phi nodes for both integer and floating-point induction
+  // variables. Here, we determine the kind of arithmetic we will perform.
+  Instruction::BinaryOps AddOp;
+  Instruction::BinaryOps MulOp;
+  if (Step->getType()->isIntegerTy()) {
+    AddOp = Instruction::Add;
+    MulOp = Instruction::Mul;
+  } else {
+    AddOp = II.getInductionOpcode();
+    MulOp = Instruction::FMul;
+  }
+
+  // Multiply the vectorization factor by the step using integer or
+  // floating-point arithmetic as appropriate.
+  Value *ConstVF = getSignedIntOrFpConstant(Step->getType(), VF);
+  Value *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, Step, ConstVF));
+
+  // Create a vector splat to use in the induction update.
+  //
+  // FIXME: If the step is non-constant, we create the vector splat with
+  //        IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
+  //        handle a constant vector splat.
+  Value *SplatVF = isa<Constant>(Mul)
+                       ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
+                       : Builder.CreateVectorSplat(VF, Mul);
   Builder.restoreIP(CurrIP);
 
-  Value *SplatVF =
-      ConstantVector::getSplat(VF, ConstantInt::getSigned(Start->getType(),
-                               VF * Step->getSExtValue()));
   // We may need to add the step a number of times, depending on the unroll
   // factor. The last of those goes into the PHI.
   PHINode *VecInd = PHINode::Create(SteppedStart->getType(), 2, "vec.ind",
@@ -2204,8 +2410,8 @@ void InnerLoopVectorizer::createVectorIntInductionPHI(
   VectorParts Entry(UF);
   for (unsigned Part = 0; Part < UF; ++Part) {
     Entry[Part] = LastInduction;
-    LastInduction = cast<Instruction>(
-        Builder.CreateAdd(LastInduction, SplatVF, "step.add"));
+    LastInduction = cast<Instruction>(addFastMathFlag(
+        Builder.CreateBinOp(AddOp, LastInduction, SplatVF, "step.add")));
   }
   VectorLoopValueMap.initVector(EntryVal, Entry);
   if (isa<TruncInst>(EntryVal))
@@ -2224,7 +2430,7 @@ void InnerLoopVectorizer::createVectorIntInductionPHI(
 }
 
 bool InnerLoopVectorizer::shouldScalarizeInstruction(Instruction *I) const {
-  return Legal->isScalarAfterVectorization(I) ||
+  return Cost->isScalarAfterVectorization(I, VF) ||
          Cost->isProfitableToScalarize(I, VF);
 }
 
@@ -2238,7 +2444,10 @@ bool InnerLoopVectorizer::needsScalarInduction(Instruction *IV) const {
   return any_of(IV->users(), isScalarInst);
 }
 
-void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
+void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
+
+  assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
+         "Primary induction variable must have an integer type");
 
   auto II = Legal->getInductionVars()->find(IV);
   assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
@@ -2250,9 +2459,6 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
   // induction variable.
   Value *ScalarIV = nullptr;
 
-  // The step of the induction.
-  Value *Step = nullptr;
-
   // The value from the original loop to which we are mapping the new induction
   // variable.
   Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
@@ -2265,45 +2471,49 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
   // least one user in the loop that is not widened.
   auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
 
-  // If the induction variable has a constant integer step value, go ahead and
-  // get it now.
-  if (ID.getConstIntStepValue())
-    Step = ID.getConstIntStepValue();
+  // Generate code for the induction step. Note that induction steps are
+  // required to be loop-invariant.
+  assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
+         "Induction step should be loop invariant");
+  auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
+  Value *Step = nullptr;
+  if (PSE.getSE()->isSCEVable(IV->getType())) {
+    SCEVExpander Exp(*PSE.getSE(), DL, "induction");
+    Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
+                             LoopVectorPreHeader->getTerminator());
+  } else {
+    Step = cast<SCEVUnknown>(ID.getStep())->getValue();
+  }
 
   // Try to create a new independent vector induction variable. If we can't
   // create the phi node, we will splat the scalar induction variable in each
   // loop iteration.
-  if (VF > 1 && IV->getType() == Induction->getType() && Step &&
-      !shouldScalarizeInstruction(EntryVal)) {
-    createVectorIntInductionPHI(ID, EntryVal);
+  if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
+    createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
     VectorizedIV = true;
   }
 
   // If we haven't yet vectorized the induction variable, or if we will create
   // a scalar one, we need to define the scalar induction variable and step
   // values. If we were given a truncation type, truncate the canonical
-  // induction variable and constant step. Otherwise, derive these values from
-  // the induction descriptor.
+  // induction variable and step. Otherwise, derive these values from the
+  // induction descriptor.
   if (!VectorizedIV || NeedsScalarIV) {
+    ScalarIV = Induction;
+    if (IV != OldInduction) {
+      ScalarIV = IV->getType()->isIntegerTy()
+                     ? Builder.CreateSExtOrTrunc(Induction, IV->getType())
+                     : Builder.CreateCast(Instruction::SIToFP, Induction,
+                                          IV->getType());
+      ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
+      ScalarIV->setName("offset.idx");
+    }
     if (Trunc) {
       auto *TruncType = cast<IntegerType>(Trunc->getType());
-      assert(Step && "Truncation requires constant integer step");
-      auto StepInt = cast<ConstantInt>(Step)->getSExtValue();
-      ScalarIV = Builder.CreateCast(Instruction::Trunc, Induction, TruncType);
-      Step = ConstantInt::getSigned(TruncType, StepInt);
-    } else {
-      ScalarIV = Induction;
-      auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
-      if (IV != OldInduction) {
-        ScalarIV = Builder.CreateSExtOrTrunc(ScalarIV, IV->getType());
-        ScalarIV = ID.transform(Builder, ScalarIV, PSE.getSE(), DL);
-        ScalarIV->setName("offset.idx");
-      }
-      if (!Step) {
-        SCEVExpander Exp(*PSE.getSE(), DL, "induction");
-        Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
-                                 &*Builder.GetInsertPoint());
-      }
+      assert(Step->getType()->isIntegerTy() &&
+             "Truncation requires an integer step");
+      ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
+      Step = Builder.CreateTrunc(Step, TruncType);
     }
   }
 
@@ -2313,7 +2523,8 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
     Value *Broadcasted = getBroadcastInstrs(ScalarIV);
     VectorParts Entry(UF);
     for (unsigned Part = 0; Part < UF; ++Part)
-      Entry[Part] = getStepVector(Broadcasted, VF * Part, Step);
+      Entry[Part] =
+          getStepVector(Broadcasted, VF * Part, Step, ID.getInductionOpcode());
     VectorLoopValueMap.initVector(EntryVal, Entry);
     if (Trunc)
       addMetadata(Entry, Trunc);
@@ -2326,7 +2537,7 @@ void InnerLoopVectorizer::widenIntInduction(PHINode *IV, TruncInst *Trunc) {
   // in the loop in the common case prior to InstCombine. We will be trading
   // one vector extract for each scalar step.
   if (NeedsScalarIV)
-    buildScalarSteps(ScalarIV, Step, EntryVal);
+    buildScalarSteps(ScalarIV, Step, EntryVal, ID);
 }
 
 Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
@@ -2386,30 +2597,43 @@ Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
 }
 
 void InnerLoopVectorizer::buildScalarSteps(Value *ScalarIV, Value *Step,
-                                           Value *EntryVal) {
+                                           Value *EntryVal,
+                                           const InductionDescriptor &ID) {
 
   // We shouldn't have to build scalar steps if we aren't vectorizing.
   assert(VF > 1 && "VF should be greater than one");
 
   // Get the value type and ensure it and the step have the same integer type.
   Type *ScalarIVTy = ScalarIV->getType()->getScalarType();
-  assert(ScalarIVTy->isIntegerTy() && ScalarIVTy == Step->getType() &&
-         "Val and Step should have the same integer type");
+  assert(ScalarIVTy == Step->getType() &&
+         "Val and Step should have the same type");
+
+  // We build scalar steps for both integer and floating-point induction
+  // variables. Here, we determine the kind of arithmetic we will perform.
+  Instruction::BinaryOps AddOp;
+  Instruction::BinaryOps MulOp;
+  if (ScalarIVTy->isIntegerTy()) {
+    AddOp = Instruction::Add;
+    MulOp = Instruction::Mul;
+  } else {
+    AddOp = ID.getInductionOpcode();
+    MulOp = Instruction::FMul;
+  }
 
   // Determine the number of scalars we need to generate for each unroll
   // iteration. If EntryVal is uniform, we only need to generate the first
   // lane. Otherwise, we generate all VF values.
   unsigned Lanes =
-      Legal->isUniformAfterVectorization(cast<Instruction>(EntryVal)) ? 1 : VF;
+    Cost->isUniformAfterVectorization(cast<Instruction>(EntryVal), VF) ? 1 : VF;
 
   // Compute the scalar steps and save the results in VectorLoopValueMap.
   ScalarParts Entry(UF);
   for (unsigned Part = 0; Part < UF; ++Part) {
     Entry[Part].resize(VF);
     for (unsigned Lane = 0; Lane < Lanes; ++Lane) {
-      auto *StartIdx = ConstantInt::get(ScalarIVTy, VF * Part + Lane);
-      auto *Mul = Builder.CreateMul(StartIdx, Step);
-      auto *Add = Builder.CreateAdd(ScalarIV, Mul);
+      auto *StartIdx = getSignedIntOrFpConstant(ScalarIVTy, VF * Part + Lane);
+      auto *Mul = addFastMathFlag(Builder.CreateBinOp(MulOp, StartIdx, Step));
+      auto *Add = addFastMathFlag(Builder.CreateBinOp(AddOp, ScalarIV, Mul));
       Entry[Part][Lane] = Add;
     }
   }
@@ -2468,7 +2692,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
     // known to be uniform after vectorization, this corresponds to lane zero
     // of the last unroll iteration. Otherwise, the last instruction is the one
     // we created for the last vector lane of the last unroll iteration.
-    unsigned LastLane = Legal->isUniformAfterVectorization(I) ? 0 : VF - 1;
+    unsigned LastLane = Cost->isUniformAfterVectorization(I, VF) ? 0 : VF - 1;
     auto *LastInst = cast<Instruction>(getScalarValue(V, UF - 1, LastLane));
 
     // Set the insert point after the last scalarized instruction. This ensures
@@ -2485,7 +2709,7 @@ InnerLoopVectorizer::getVectorValue(Value *V) {
     // VectorLoopValueMap, we will only generate the insertelements once.
     for (unsigned Part = 0; Part < UF; ++Part) {
       Value *VectorValue = nullptr;
-      if (Legal->isUniformAfterVectorization(I)) {
+      if (Cost->isUniformAfterVectorization(I, VF)) {
         VectorValue = getBroadcastInstrs(getScalarValue(V, Part, 0));
       } else {
         VectorValue = UndefValue::get(VectorType::get(V->getType(), VF));
@@ -2514,8 +2738,9 @@ Value *InnerLoopVectorizer::getScalarValue(Value *V, unsigned Part,
   if (OrigLoop->isLoopInvariant(V))
     return V;
 
-  assert(Lane > 0 ? !Legal->isUniformAfterVectorization(cast<Instruction>(V))
-                  : true && "Uniform values only have lane zero");
+  assert(Lane > 0 ?
+         !Cost->isUniformAfterVectorization(cast<Instruction>(V), VF)
+         : true && "Uniform values only have lane zero");
 
   // If the value from the original loop has not been vectorized, it is
   // represented by UF x VF scalar values in the new loop. Return the requested
@@ -2550,102 +2775,6 @@ Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
                                      "reverse");
 }
 
-// Get a mask to interleave \p NumVec vectors into a wide vector.
-// I.e.  <0, VF, VF*2, ..., VF*(NumVec-1), 1, VF+1, VF*2+1, ...>
-// E.g. For 2 interleaved vectors, if VF is 4, the mask is:
-//      <0, 4, 1, 5, 2, 6, 3, 7>
-static Constant *getInterleavedMask(IRBuilder<> &Builder, unsigned VF,
-                                    unsigned NumVec) {
-  SmallVector<Constant *, 16> Mask;
-  for (unsigned i = 0; i < VF; i++)
-    for (unsigned j = 0; j < NumVec; j++)
-      Mask.push_back(Builder.getInt32(j * VF + i));
-
-  return ConstantVector::get(Mask);
-}
-
-// Get the strided mask starting from index \p Start.
-// I.e.  <Start, Start + Stride, ..., Start + Stride*(VF-1)>
-static Constant *getStridedMask(IRBuilder<> &Builder, unsigned Start,
-                                unsigned Stride, unsigned VF) {
-  SmallVector<Constant *, 16> Mask;
-  for (unsigned i = 0; i < VF; i++)
-    Mask.push_back(Builder.getInt32(Start + i * Stride));
-
-  return ConstantVector::get(Mask);
-}
-
-// Get a mask of two parts: The first part consists of sequential integers
-// starting from 0, The second part consists of UNDEFs.
-// I.e. <0, 1, 2, ..., NumInt - 1, undef, ..., undef>
-static Constant *getSequentialMask(IRBuilder<> &Builder, unsigned NumInt,
-                                   unsigned NumUndef) {
-  SmallVector<Constant *, 16> Mask;
-  for (unsigned i = 0; i < NumInt; i++)
-    Mask.push_back(Builder.getInt32(i));
-
-  Constant *Undef = UndefValue::get(Builder.getInt32Ty());
-  for (unsigned i = 0; i < NumUndef; i++)
-    Mask.push_back(Undef);
-
-  return ConstantVector::get(Mask);
-}
-
-// Concatenate two vectors with the same element type. The 2nd vector should
-// not have more elements than the 1st vector. If the 2nd vector has less
-// elements, extend it with UNDEFs.
-static Value *ConcatenateTwoVectors(IRBuilder<> &Builder, Value *V1,
-                                    Value *V2) {
-  VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());
-  VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
-  assert(VecTy1 && VecTy2 &&
-         VecTy1->getScalarType() == VecTy2->getScalarType() &&
-         "Expect two vectors with the same element type");
-
-  unsigned NumElts1 = VecTy1->getNumElements();
-  unsigned NumElts2 = VecTy2->getNumElements();
-  assert(NumElts1 >= NumElts2 && "Unexpect the first vector has less elements");
-
-  if (NumElts1 > NumElts2) {
-    // Extend with UNDEFs.
-    Constant *ExtMask =
-        getSequentialMask(Builder, NumElts2, NumElts1 - NumElts2);
-    V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask);
-  }
-
-  Constant *Mask = getSequentialMask(Builder, NumElts1 + NumElts2, 0);
-  return Builder.CreateShuffleVector(V1, V2, Mask);
-}
-
-// Concatenate vectors in the given list. All vectors have the same type.
-static Value *ConcatenateVectors(IRBuilder<> &Builder,
-                                 ArrayRef<Value *> InputList) {
-  unsigned NumVec = InputList.size();
-  assert(NumVec > 1 && "Should be at least two vectors");
-
-  SmallVector<Value *, 8> ResList;
-  ResList.append(InputList.begin(), InputList.end());
-  do {
-    SmallVector<Value *, 8> TmpList;
-    for (unsigned i = 0; i < NumVec - 1; i += 2) {
-      Value *V0 = ResList[i], *V1 = ResList[i + 1];
-      assert((V0->getType() == V1->getType() || i == NumVec - 2) &&
-             "Only the last vector may have a different type");
-
-      TmpList.push_back(ConcatenateTwoVectors(Builder, V0, V1));
-    }
-
-    // Push the last vector if the total number of vectors is odd.
-    if (NumVec % 2 != 0)
-      TmpList.push_back(ResList[NumVec - 1]);
-
-    ResList = TmpList;
-    NumVec = ResList.size();
-  } while (NumVec > 1);
-
-  return ResList[0];
-}
-
 // Try to vectorize the interleave group that \p Instr belongs to.
 //
 // E.g. Translate following interleaved load group (factor = 3):
@@ -2682,15 +2811,13 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
   if (Instr != Group->getInsertPos())
     return;
 
-  LoadInst *LI = dyn_cast<LoadInst>(Instr);
-  StoreInst *SI = dyn_cast<StoreInst>(Instr);
   Value *Ptr = getPointerOperand(Instr);
 
   // Prepare for the vector type of the interleaved load/store.
-  Type *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+  Type *ScalarTy = getMemInstValueType(Instr);
   unsigned InterleaveFactor = Group->getFactor();
   Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
-  Type *PtrTy = VecTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());
+  Type *PtrTy = VecTy->getPointerTo(getMemInstAddressSpace(Instr));
 
   // Prepare for the new pointers.
   setDebugLocFromInst(Builder, Ptr);
@@ -2730,7 +2857,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
   Value *UndefVec = UndefValue::get(VecTy);
 
   // Vectorize the interleaved load group.
-  if (LI) {
+  if (isa<LoadInst>(Instr)) {
 
     // For each unroll part, create a wide load for the group.
     SmallVector<Value *, 2> NewLoads;
@@ -2751,7 +2878,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
         continue;
 
       VectorParts Entry(UF);
-      Constant *StrideMask = getStridedMask(Builder, I, InterleaveFactor, VF);
+      Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
       for (unsigned Part = 0; Part < UF; Part++) {
         Value *StridedVec = Builder.CreateShuffleVector(
             NewLoads[Part], UndefVec, StrideMask, "strided.vec");
@@ -2795,10 +2922,10 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr) {
     }
 
     // Concatenate all vectors into a wide vector.
-    Value *WideVec = ConcatenateVectors(Builder, StoredVecs);
+    Value *WideVec = concatenateVectors(Builder, StoredVecs);
 
     // Interleave the elements in the wide vector.
-    Constant *IMask = getInterleavedMask(Builder, VF, InterleaveFactor);
+    Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
     Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
                                               "interleaved.vec");
 
@@ -2815,103 +2942,44 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr) {
 
   assert((LI || SI) && "Invalid Load/Store instruction");
 
-  // Try to vectorize the interleave group if this access is interleaved.
-  if (Legal->isAccessInterleaved(Instr))
+  LoopVectorizationCostModel::InstWidening Decision =
+      Cost->getWideningDecision(Instr, VF);
+  assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
+         "CM decision should be taken at this point");
+  if (Decision == LoopVectorizationCostModel::CM_Interleave)
     return vectorizeInterleaveGroup(Instr);
 
-  Type *ScalarDataTy = LI ? LI->getType() : SI->getValueOperand()->getType();
+  Type *ScalarDataTy = getMemInstValueType(Instr);
   Type *DataTy = VectorType::get(ScalarDataTy, VF);
   Value *Ptr = getPointerOperand(Instr);
-  unsigned Alignment = LI ? LI->getAlignment() : SI->getAlignment();
+  unsigned Alignment = getMemInstAlignment(Instr);
   // An alignment of 0 means target abi alignment. We need to use the scalar's
   // target abi alignment in such a case.
   const DataLayout &DL = Instr->getModule()->getDataLayout();
   if (!Alignment)
     Alignment = DL.getABITypeAlignment(ScalarDataTy);
-  unsigned AddressSpace = Ptr->getType()->getPointerAddressSpace();
+  unsigned AddressSpace = getMemInstAddressSpace(Instr);
 
   // Scalarize the memory instruction if necessary.
-  if (Legal->memoryInstructionMustBeScalarized(Instr, VF))
+  if (Decision == LoopVectorizationCostModel::CM_Scalarize)
     return scalarizeInstruction(Instr, Legal->isScalarWithPredication(Instr));
 
   // Determine if the pointer operand of the access is either consecutive or
   // reverse consecutive.
   int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
   bool Reverse = ConsecutiveStride < 0;
-
-  // Determine if either a gather or scatter operation is legal.
   bool CreateGatherScatter =
-      !ConsecutiveStride && Legal->isLegalGatherOrScatter(Instr);
+      (Decision == LoopVectorizationCostModel::CM_GatherScatter);
 
   VectorParts VectorGep;
 
   // Handle consecutive loads/stores.
-  GetElementPtrInst *Gep = getGEPInstruction(Ptr);
   if (ConsecutiveStride) {
-    if (Gep) {
-      unsigned NumOperands = Gep->getNumOperands();
-#ifndef NDEBUG
-      // The original GEP that identified as a consecutive memory access
-      // should have only one loop-variant operand.
-      unsigned NumOfLoopVariantOps = 0;
-      for (unsigned i = 0; i < NumOperands; ++i)
-        if (!PSE.getSE()->isLoopInvariant(PSE.getSCEV(Gep->getOperand(i)),
-                                          OrigLoop))
-          NumOfLoopVariantOps++;
-      assert(NumOfLoopVariantOps == 1 &&
-             "Consecutive GEP should have only one loop-variant operand");
-#endif
-      GetElementPtrInst *Gep2 = cast<GetElementPtrInst>(Gep->clone());
-      Gep2->setName("gep.indvar");
-
-      // A new GEP is created for a 0-lane value of the first unroll iteration.
-      // The GEPs for the rest of the unroll iterations are computed below as an
-      // offset from this GEP.
-      for (unsigned i = 0; i < NumOperands; ++i)
-        // We can apply getScalarValue() for all GEP indices. It returns an
-        // original value for loop-invariant operand and 0-lane for consecutive
-        // operand.
-        Gep2->setOperand(i, getScalarValue(Gep->getOperand(i),
-                                           0, /* First unroll iteration */
-                                           0  /* 0-lane of the vector */ ));
-      setDebugLocFromInst(Builder, Gep);
-      Ptr = Builder.Insert(Gep2);
-
-    } else { // No GEP
-      setDebugLocFromInst(Builder, Ptr);
-      Ptr = getScalarValue(Ptr, 0, 0);
-    }
+    Ptr = getScalarValue(Ptr, 0, 0);
   } else {
     // At this point we should vector version of GEP for Gather or Scatter
     assert(CreateGatherScatter && "The instruction should be scalarized");
-    if (Gep) {
-      // Vectorizing GEP, across UF parts. We want to get a vector value for base
-      // and each index that's defined inside the loop, even if it is
-      // loop-invariant but wasn't hoisted out. Otherwise we want to keep them
-      // scalar.
-      SmallVector<VectorParts, 4> OpsV;
-      for (Value *Op : Gep->operands()) {
-        Instruction *SrcInst = dyn_cast<Instruction>(Op);
-        if (SrcInst && OrigLoop->contains(SrcInst))
-          OpsV.push_back(getVectorValue(Op));
-        else
-          OpsV.push_back(VectorParts(UF, Op));
-      }
-      for (unsigned Part = 0; Part < UF; ++Part) {
-        SmallVector<Value *, 4> Ops;
-        Value *GEPBasePtr = OpsV[0][Part];
-        for (unsigned i = 1; i < Gep->getNumOperands(); i++)
-          Ops.push_back(OpsV[i][Part]);
-        Value *NewGep =  Builder.CreateGEP(GEPBasePtr, Ops, "VectorGep");
-        cast<GetElementPtrInst>(NewGep)->setIsInBounds(Gep->isInBounds());
-        assert(NewGep->getType()->isVectorTy() && "Expected vector GEP");
-
-        NewGep =
-            Builder.CreateBitCast(NewGep, VectorType::get(Ptr->getType(), VF));
-        VectorGep.push_back(NewGep);
-      }
-    } else
-      VectorGep = getVectorValue(Ptr);
+    VectorGep = getVectorValue(Ptr);
   }
 
   VectorParts Mask = createBlockInMask(Instr->getParent());
@@ -3026,7 +3094,7 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
   // Determine the number of scalars we need to generate for each unroll
   // iteration. If the instruction is uniform, we only need to generate the
   // first lane. Otherwise, we generate all VF values.
-  unsigned Lanes = Legal->isUniformAfterVectorization(Instr) ? 1 : VF;
+  unsigned Lanes = Cost->isUniformAfterVectorization(Instr, VF) ? 1 : VF;
 
   // For each vector unroll 'part':
   for (unsigned Part = 0; Part < UF; ++Part) {
@@ -3037,7 +3105,9 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
       // Start if-block.
       Value *Cmp = nullptr;
       if (IfPredicateInstr) {
-        Cmp = Builder.CreateExtractElement(Cond[Part], Builder.getInt32(Lane));
+        Cmp = Cond[Part];
+        if (Cmp->getType()->isVectorTy())
+          Cmp = Builder.CreateExtractElement(Cmp, Builder.getInt32(Lane));
         Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cmp,
                                  ConstantInt::get(Cmp->getType(), 1));
       }
@@ -3345,7 +3415,7 @@ void InnerLoopVectorizer::createEmptyLoop() {
   //   - counts from zero, stepping by one
   //   - is the size of the widest induction variable type
   // then we create a new one.
-  OldInduction = Legal->getInduction();
+  OldInduction = Legal->getPrimaryInduction();
   Type *IdxTy = Legal->getWidestInductionType();
 
   // Split the single block loop into the two loop structure described above.
@@ -3542,7 +3612,7 @@ void InnerLoopVectorizer::fixupIVUsers(PHINode *OrigPhi,
 
 namespace {
 struct CSEDenseMapInfo {
-  static bool canHandle(Instruction *I) {
+  static bool canHandle(const Instruction *I) {
     return isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
            isa<ShuffleVectorInst>(I) || isa<GetElementPtrInst>(I);
   }
@@ -3552,12 +3622,12 @@ struct CSEDenseMapInfo {
   static inline Instruction *getTombstoneKey() {
     return DenseMapInfo<Instruction *>::getTombstoneKey();
   }
-  static unsigned getHashValue(Instruction *I) {
+  static unsigned getHashValue(const Instruction *I) {
     assert(canHandle(I) && "Unknown instruction!");
     return hash_combine(I->getOpcode(), hash_combine_range(I->value_op_begin(),
                                                            I->value_op_end()));
   }
-  static bool isEqual(Instruction *LHS, Instruction *RHS) {
+  static bool isEqual(const Instruction *LHS, const Instruction *RHS) {
     if (LHS == getEmptyKey() || RHS == getEmptyKey() ||
         LHS == getTombstoneKey() || RHS == getTombstoneKey())
       return LHS == RHS;
@@ -3588,16 +3658,6 @@ static void cse(BasicBlock *BB) {
   }
 }
 
-/// \brief Adds a 'fast' flag to floating point operations.
-static Value *addFastMathFlag(Value *V) {
-  if (isa<FPMathOperator>(V)) {
-    FastMathFlags Flags;
-    Flags.setUnsafeAlgebra();
-    cast<Instruction>(V)->setFastMathFlags(Flags);
-  }
-  return V;
-}
-
 /// \brief Estimate the overhead of scalarizing an instruction. This is a
 /// convenience wrapper for the type-based getScalarizationOverhead API.
 static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
@@ -3607,13 +3667,17 @@ static unsigned getScalarizationOverhead(Instruction *I, unsigned VF,
 
   unsigned Cost = 0;
   Type *RetTy = ToVectorTy(I->getType(), VF);
-  if (!RetTy->isVoidTy())
+  if (!RetTy->isVoidTy() &&
+      (!isa<LoadInst>(I) ||
+       !TTI.supportsEfficientVectorElementLoadStore()))
     Cost += TTI.getScalarizationOverhead(RetTy, true, false);
 
   if (CallInst *CI = dyn_cast<CallInst>(I)) {
     SmallVector<const Value *, 4> Operands(CI->arg_operands());
     Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
-  } else {
+  }
+  else if (!isa<StoreInst>(I) ||
+           !TTI.supportsEfficientVectorElementLoadStore()) {
     SmallVector<const Value *, 4> Operands(I->operand_values());
     Cost += TTI.getOperandsScalarizationOverhead(Operands, VF);
   }
@@ -3679,16 +3743,12 @@ static unsigned getVectorIntrinsicCost(CallInst *CI, unsigned VF,
   Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
   assert(ID && "Expected intrinsic call!");
 
-  Type *RetTy = ToVectorTy(CI->getType(), VF);
-  SmallVector<Type *, 4> Tys;
-  for (Value *ArgOperand : CI->arg_operands())
-    Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
-
   FastMathFlags FMF;
   if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
     FMF = FPMO->getFastMathFlags();
 
-  return TTI.getIntrinsicInstrCost(ID, RetTy, Tys, FMF);
+  SmallVector<Value *, 4> Operands(CI->arg_operands());
+  return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
 }
 
 static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
@@ -3831,16 +3891,6 @@ void InnerLoopVectorizer::vectorizeLoop() {
   // the cost-model.
   //
   //===------------------------------------------------===//
-  Constant *Zero = Builder.getInt32(0);
-
-  // In order to support recurrences we need to be able to vectorize Phi nodes.
-  // Phi nodes have cycles, so we need to vectorize them in two stages. First,
-  // we create a new vector PHI node with no incoming edges. We use this value
-  // when we vectorize all of the instructions that use the PHI. Next, after
-  // all of the instructions in the block are complete we add the new incoming
-  // edges to the PHI. At this point all of the instructions in the basic block
-  // are vectorized, so we can use them to construct the PHI.
-  PhiVector PHIsToFix;
 
   // Collect instructions from the original loop that will become trivially
   // dead in the vectorized loop. We don't need to vectorize these
@@ -3854,7 +3904,7 @@ void InnerLoopVectorizer::vectorizeLoop() {
 
   // Vectorize all of the blocks in the original loop.
   for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO()))
-    vectorizeBlockInLoop(BB, &PHIsToFix);
+    vectorizeBlockInLoop(BB);
 
   // Insert truncates and extends for any truncated instructions as hints to
   // InstCombine.
@@ -3862,221 +3912,10 @@ void InnerLoopVectorizer::vectorizeLoop() {
     truncateToMinimalBitwidths();
 
   // At this point every instruction in the original loop is widened to a
-  // vector form. Now we need to fix the recurrences in PHIsToFix. These PHI
+  // vector form. Now we need to fix the recurrences in the loop. These PHI
   // nodes are currently empty because we did not want to introduce cycles.
   // This is the second stage of vectorizing recurrences.
-  for (PHINode *Phi : PHIsToFix) {
-    assert(Phi && "Unable to recover vectorized PHI");
-
-    // Handle first-order recurrences that need to be fixed.
-    if (Legal->isFirstOrderRecurrence(Phi)) {
-      fixFirstOrderRecurrence(Phi);
-      continue;
-    }
-
-    // If the phi node is not a first-order recurrence, it must be a reduction.
-    // Get it's reduction variable descriptor.
-    assert(Legal->isReductionVariable(Phi) &&
-           "Unable to find the reduction variable");
-    RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
-
-    RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
-    TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
-    Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
-    RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
-        RdxDesc.getMinMaxRecurrenceKind();
-    setDebugLocFromInst(Builder, ReductionStartValue);
-
-    // We need to generate a reduction vector from the incoming scalar.
-    // To do so, we need to generate the 'identity' vector and override
-    // one of the elements with the incoming scalar reduction. We need
-    // to do it in the vector-loop preheader.
-    Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
-
-    // This is the vector-clone of the value that leaves the loop.
-    const VectorParts &VectorExit = getVectorValue(LoopExitInst);
-    Type *VecTy = VectorExit[0]->getType();
-
-    // Find the reduction identity variable. Zero for addition, or, xor,
-    // one for multiplication, -1 for And.
-    Value *Identity;
-    Value *VectorStart;
-    if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
-        RK == RecurrenceDescriptor::RK_FloatMinMax) {
-      // MinMax reduction have the start value as their identify.
-      if (VF == 1) {
-        VectorStart = Identity = ReductionStartValue;
-      } else {
-        VectorStart = Identity =
-            Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
-      }
-    } else {
-      // Handle other reduction kinds:
-      Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
-          RK, VecTy->getScalarType());
-      if (VF == 1) {
-        Identity = Iden;
-        // This vector is the Identity vector where the first element is the
-        // incoming scalar reduction.
-        VectorStart = ReductionStartValue;
-      } else {
-        Identity = ConstantVector::getSplat(VF, Iden);
-
-        // This vector is the Identity vector where the first element is the
-        // incoming scalar reduction.
-        VectorStart =
-            Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
-      }
-    }
-
-    // Fix the vector-loop phi.
-
-    // Reductions do not have to start at zero. They can start with
-    // any loop invariant values.
-    const VectorParts &VecRdxPhi = getVectorValue(Phi);
-    BasicBlock *Latch = OrigLoop->getLoopLatch();
-    Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
-    const VectorParts &Val = getVectorValue(LoopVal);
-    for (unsigned part = 0; part < UF; ++part) {
-      // Make sure to add the reduction stat value only to the
-      // first unroll part.
-      Value *StartVal = (part == 0) ? VectorStart : Identity;
-      cast<PHINode>(VecRdxPhi[part])
-          ->addIncoming(StartVal, LoopVectorPreHeader);
-      cast<PHINode>(VecRdxPhi[part])
-          ->addIncoming(Val[part], LoopVectorBody);
-    }
-
-    // Before each round, move the insertion point right between
-    // the PHIs and the values we are going to write.
-    // This allows us to write both PHINodes and the extractelement
-    // instructions.
-    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
-
-    VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst);
-    setDebugLocFromInst(Builder, LoopExitInst);
-
-    // If the vector reduction can be performed in a smaller type, we truncate
-    // then extend the loop exit value to enable InstCombine to evaluate the
-    // entire expression in the smaller type.
-    if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
-      Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
-      Builder.SetInsertPoint(LoopVectorBody->getTerminator());
-      for (unsigned part = 0; part < UF; ++part) {
-        Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
-        Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
-                                          : Builder.CreateZExt(Trunc, VecTy);
-        for (Value::user_iterator UI = RdxParts[part]->user_begin();
-             UI != RdxParts[part]->user_end();)
-          if (*UI != Trunc) {
-            (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd);
-            RdxParts[part] = Extnd;
-          } else {
-            ++UI;
-          }
-      }
-      Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
-      for (unsigned part = 0; part < UF; ++part)
-        RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
-    }
-
-    // Reduce all of the unrolled parts into a single vector.
-    Value *ReducedPartRdx = RdxParts[0];
-    unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
-    setDebugLocFromInst(Builder, ReducedPartRdx);
-    for (unsigned part = 1; part < UF; ++part) {
-      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
-        // Floating point operations had to be 'fast' to enable the reduction.
-        ReducedPartRdx = addFastMathFlag(
-            Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
-                                ReducedPartRdx, "bin.rdx"));
-      else
-        ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
-            Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);
-    }
-
-    if (VF > 1) {
-      // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
-      // and vector ops, reducing the set of values being computed by half each
-      // round.
-      assert(isPowerOf2_32(VF) &&
-             "Reduction emission only supported for pow2 vectors!");
-      Value *TmpVec = ReducedPartRdx;
-      SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
-      for (unsigned i = VF; i != 1; i >>= 1) {
-        // Move the upper half of the vector to the lower half.
-        for (unsigned j = 0; j != i / 2; ++j)
-          ShuffleMask[j] = Builder.getInt32(i / 2 + j);
-
-        // Fill the rest of the mask with undef.
-        std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
-                  UndefValue::get(Builder.getInt32Ty()));
-
-        Value *Shuf = Builder.CreateShuffleVector(
-            TmpVec, UndefValue::get(TmpVec->getType()),
-            ConstantVector::get(ShuffleMask), "rdx.shuf");
-
-        if (Op != Instruction::ICmp && Op != Instruction::FCmp)
-          // Floating point operations had to be 'fast' to enable the reduction.
-          TmpVec = addFastMathFlag(Builder.CreateBinOp(
-              (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
-        else
-          TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
-                                                        TmpVec, Shuf);
-      }
-
-      // The result is in the first element of the vector.
-      ReducedPartRdx =
-          Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
-
-      // If the reduction can be performed in a smaller type, we need to extend
-      // the reduction to the wider type before we branch to the original loop.
-      if (Phi->getType() != RdxDesc.getRecurrenceType())
-        ReducedPartRdx =
-            RdxDesc.isSigned()
-                ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
-                : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
-    }
-
-    // Create a phi node that merges control-flow from the backedge-taken check
-    // block and the middle block.
-    PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
-                                          LoopScalarPreHeader->getTerminator());
-    for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
-      BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
-    BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
-
-    // Now, we need to fix the users of the reduction variable
-    // inside and outside of the scalar remainder loop.
-    // We know that the loop is in LCSSA form. We need to update the
-    // PHI nodes in the exit blocks.
-    for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
-                              LEE = LoopExitBlock->end();
-         LEI != LEE; ++LEI) {
-      PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
-      if (!LCSSAPhi)
-        break;
-
-      // All PHINodes need to have a single entry edge, or two if
-      // we already fixed them.
-      assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
-
-      // We found a reduction value exit-PHI. Update it with the
-      // incoming bypass edge.
-      if (LCSSAPhi->getIncomingValue(0) == LoopExitInst)
-        LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
-    } // end of the LCSSA phi scan.
-
-    // Fix the scalar loop reduction variable with the incoming reduction sum
-    // from the vector body and from the backedge value.
-    int IncomingEdgeBlockIdx =
-        Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
-    assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
-    // Pick the other block.
-    int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
-    Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
-    Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
-  } // end of for each Phi in PHIsToFix.
+  fixCrossIterationPHIs();
 
   // Update the dominator tree.
   //
@@ -4101,6 +3940,25 @@ void InnerLoopVectorizer::vectorizeLoop() {
   cse(LoopVectorBody);
 }
 
+void InnerLoopVectorizer::fixCrossIterationPHIs() {
+  // In order to support recurrences we need to be able to vectorize Phi nodes.
+  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
+  // stage #2: We now need to fix the recurrences by adding incoming edges to
+  // the currently empty PHI nodes. At this point every instruction in the
+  // original loop is widened to a vector form so we can use them to construct
+  // the incoming edges.
+  for (Instruction &I : *OrigLoop->getHeader()) {
+    PHINode *Phi = dyn_cast<PHINode>(&I);
+    if (!Phi)
+      break;
+    // Handle first-order recurrences and reductions that need to be fixed.
+    if (Legal->isFirstOrderRecurrence(Phi))
+      fixFirstOrderRecurrence(Phi);
+    else if (Legal->isReductionVariable(Phi))
+      fixReduction(Phi);
+  }
+}
+
 void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
 
   // This is the second phase of vectorizing first-order recurrences. An
@@ -4179,15 +4037,17 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
   auto *VecPhi = Builder.CreatePHI(VectorInit->getType(), 2, "vector.recur");
   VecPhi->addIncoming(VectorInit, LoopVectorPreHeader);
 
-  // Get the vectorized previous value. We ensured the previous values was an
-  // instruction when detecting the recurrence.
+  // Get the vectorized previous value.
   auto &PreviousParts = getVectorValue(Previous);
 
-  // Set the insertion point to be after this instruction. We ensured the
-  // previous value dominated all uses of the phi when detecting the
-  // recurrence.
-  Builder.SetInsertPoint(
-      &*++BasicBlock::iterator(cast<Instruction>(PreviousParts[UF - 1])));
+  // Set the insertion point after the previous value if it is an instruction.
+  // Note that the previous value may have been constant-folded so it is not
+  // guaranteed to be an instruction in the vector loop.
+  if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]))
+    Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt());
+  else
+    Builder.SetInsertPoint(
+        &*++BasicBlock::iterator(cast<Instruction>(PreviousParts[UF - 1])));
 
   // We will construct a vector for the recurrence by combining the values for
   // the current and previous iterations. This is the required shuffle mask.
@@ -4217,7 +4077,12 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
   VecPhi->addIncoming(Incoming, LI->getLoopFor(LoopVectorBody)->getLoopLatch());
 
   // Extract the last vector element in the middle block. This will be the
-  // initial value for the recurrence when jumping to the scalar loop.
+  // initial value for the recurrence when jumping to the scalar loop.
+  // FIXME: Note that the last vector element need not always be the correct
+  // one: consider a loop where we have phi uses outside the loop - we need the
+  // second last iteration value and not the last one. For now, we avoid
+  // considering such cases as firstOrderRecurrences (see
+  // isFirstOrderRecurrence).
   auto *Extract = Incoming;
   if (VF > 1) {
     Builder.SetInsertPoint(LoopMiddleBlock->getTerminator());
@@ -4252,11 +4117,217 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
   }
 }
 
-void InnerLoopVectorizer::fixLCSSAPHIs() {
-  for (Instruction &LEI : *LoopExitBlock) {
-    auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
-    if (!LCSSAPhi)
-      break;
+void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
+  Constant *Zero = Builder.getInt32(0);
+
+  // Get its reduction variable descriptor.
+  assert(Legal->isReductionVariable(Phi) &&
+         "Unable to find the reduction variable");
+  RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
+
+  RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
+  TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
+  Instruction *LoopExitInst = RdxDesc.getLoopExitInstr();
+  RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind =
+    RdxDesc.getMinMaxRecurrenceKind();
+  setDebugLocFromInst(Builder, ReductionStartValue);
+
+  // We need to generate a reduction vector from the incoming scalar.
+  // To do so, we need to generate the 'identity' vector and override
+  // one of the elements with the incoming scalar reduction. We need
+  // to do it in the vector-loop preheader.
+  Builder.SetInsertPoint(LoopBypassBlocks[1]->getTerminator());
+
+  // This is the vector-clone of the value that leaves the loop.
+  const VectorParts &VectorExit = getVectorValue(LoopExitInst);
+  Type *VecTy = VectorExit[0]->getType();
+
+  // Find the reduction identity variable. Zero for addition, or, xor,
+  // one for multiplication, -1 for And.
+  Value *Identity;
+  Value *VectorStart;
+  if (RK == RecurrenceDescriptor::RK_IntegerMinMax ||
+      RK == RecurrenceDescriptor::RK_FloatMinMax) {
+    // MinMax reductions have the start value as their identity.
+    if (VF == 1) {
+      VectorStart = Identity = ReductionStartValue;
+    } else {
+      VectorStart = Identity =
+        Builder.CreateVectorSplat(VF, ReductionStartValue, "minmax.ident");
+    }
+  } else {
+    // Handle other reduction kinds:
+    Constant *Iden = RecurrenceDescriptor::getRecurrenceIdentity(
+        RK, VecTy->getScalarType());
+    if (VF == 1) {
+      Identity = Iden;
+      // With VF == 1 there is no vector to build: the start value is simply
+      // the incoming scalar reduction.
+      VectorStart = ReductionStartValue;
+    } else {
+      Identity = ConstantVector::getSplat(VF, Iden);
+
+      // This vector is the Identity vector where the first element is the
+      // incoming scalar reduction.
+      VectorStart =
+        Builder.CreateInsertElement(Identity, ReductionStartValue, Zero);
+    }
+  }
+
+  // Fix the vector-loop phi.
+
+  // Reductions do not have to start at zero. They can start with
+  // any loop invariant values.
+  const VectorParts &VecRdxPhi = getVectorValue(Phi);
+  BasicBlock *Latch = OrigLoop->getLoopLatch();
+  Value *LoopVal = Phi->getIncomingValueForBlock(Latch);
+  const VectorParts &Val = getVectorValue(LoopVal);
+  for (unsigned part = 0; part < UF; ++part) {
+    // Make sure to add the reduction start value only to the
+    // first unroll part.
+    Value *StartVal = (part == 0) ? VectorStart : Identity;
+    cast<PHINode>(VecRdxPhi[part])
+      ->addIncoming(StartVal, LoopVectorPreHeader);
+    cast<PHINode>(VecRdxPhi[part])
+      ->addIncoming(Val[part], LoopVectorBody);
+  }
+
+  // Before each round, move the insertion point right between
+  // the PHIs and the values we are going to write.
+  // This allows us to write both PHINodes and the extractelement
+  // instructions.
+  Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+
+  VectorParts &RdxParts = VectorLoopValueMap.getVector(LoopExitInst);
+  setDebugLocFromInst(Builder, LoopExitInst);
+
+  // If the vector reduction can be performed in a smaller type, we truncate
+  // then extend the loop exit value to enable InstCombine to evaluate the
+  // entire expression in the smaller type.
+  if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
+    Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
+    Builder.SetInsertPoint(LoopVectorBody->getTerminator());
+    for (unsigned part = 0; part < UF; ++part) {
+      Value *Trunc = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
+      Value *Extnd = RdxDesc.isSigned() ? Builder.CreateSExt(Trunc, VecTy)
+        : Builder.CreateZExt(Trunc, VecTy);
+      for (Value::user_iterator UI = RdxParts[part]->user_begin();
+           UI != RdxParts[part]->user_end();)
+        if (*UI != Trunc) {
+          (*UI++)->replaceUsesOfWith(RdxParts[part], Extnd);
+          RdxParts[part] = Extnd;
+        } else {
+          ++UI;
+        }
+    }
+    Builder.SetInsertPoint(&*LoopMiddleBlock->getFirstInsertionPt());
+    for (unsigned part = 0; part < UF; ++part)
+      RdxParts[part] = Builder.CreateTrunc(RdxParts[part], RdxVecTy);
+  }
+
+  // Reduce all of the unrolled parts into a single vector.
+  Value *ReducedPartRdx = RdxParts[0];
+  unsigned Op = RecurrenceDescriptor::getRecurrenceBinOp(RK);
+  setDebugLocFromInst(Builder, ReducedPartRdx);
+  for (unsigned part = 1; part < UF; ++part) {
+    if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+      // Floating point operations had to be 'fast' to enable the reduction.
+      ReducedPartRdx = addFastMathFlag(
+          Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxParts[part],
+                              ReducedPartRdx, "bin.rdx"));
+    else
+      ReducedPartRdx = RecurrenceDescriptor::createMinMaxOp(
+          Builder, MinMaxKind, ReducedPartRdx, RdxParts[part]);
+  }
+
+  if (VF > 1) {
+    // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
+    // and vector ops, reducing the set of values being computed by half each
+    // round.
+    assert(isPowerOf2_32(VF) &&
+           "Reduction emission only supported for pow2 vectors!");
+    Value *TmpVec = ReducedPartRdx;
+    SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
+    for (unsigned i = VF; i != 1; i >>= 1) {
+      // Move the upper half of the vector to the lower half.
+      for (unsigned j = 0; j != i / 2; ++j)
+        ShuffleMask[j] = Builder.getInt32(i / 2 + j);
+
+      // Fill the rest of the mask with undef.
+      std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
+                UndefValue::get(Builder.getInt32Ty()));
+
+      Value *Shuf = Builder.CreateShuffleVector(
+          TmpVec, UndefValue::get(TmpVec->getType()),
+          ConstantVector::get(ShuffleMask), "rdx.shuf");
+
+      if (Op != Instruction::ICmp && Op != Instruction::FCmp)
+        // Floating point operations had to be 'fast' to enable the reduction.
+        TmpVec = addFastMathFlag(Builder.CreateBinOp(
+                                     (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx"));
+      else
+        TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind,
+                                                      TmpVec, Shuf);
+    }
+
+    // The result is in the first element of the vector.
+    ReducedPartRdx =
+      Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
+
+    // If the reduction can be performed in a smaller type, we need to extend
+    // the reduction to the wider type before we branch to the original loop.
+    if (Phi->getType() != RdxDesc.getRecurrenceType())
+      ReducedPartRdx =
+        RdxDesc.isSigned()
+        ? Builder.CreateSExt(ReducedPartRdx, Phi->getType())
+        : Builder.CreateZExt(ReducedPartRdx, Phi->getType());
+  }
+
+  // Create a phi node that merges control-flow from the backedge-taken check
+  // block and the middle block.
+  PHINode *BCBlockPhi = PHINode::Create(Phi->getType(), 2, "bc.merge.rdx",
+                                        LoopScalarPreHeader->getTerminator());
+  for (unsigned I = 0, E = LoopBypassBlocks.size(); I != E; ++I)
+    BCBlockPhi->addIncoming(ReductionStartValue, LoopBypassBlocks[I]);
+  BCBlockPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+
+  // Now, we need to fix the users of the reduction variable
+  // inside and outside of the scalar remainder loop.
+  // We know that the loop is in LCSSA form. We need to update the
+  // PHI nodes in the exit blocks.
+  for (BasicBlock::iterator LEI = LoopExitBlock->begin(),
+         LEE = LoopExitBlock->end();
+       LEI != LEE; ++LEI) {
+    PHINode *LCSSAPhi = dyn_cast<PHINode>(LEI);
+    if (!LCSSAPhi)
+      break;
+
+    // All PHINodes need to have a single entry edge, or two if
+    // we already fixed them.
+    assert(LCSSAPhi->getNumIncomingValues() < 3 && "Invalid LCSSA PHI");
+
+    // We found a reduction value exit-PHI. Update it with the
+    // incoming bypass edge.
+    if (LCSSAPhi->getIncomingValue(0) == LoopExitInst)
+      LCSSAPhi->addIncoming(ReducedPartRdx, LoopMiddleBlock);
+  } // end of the LCSSA phi scan.
+
+  // Fix the scalar loop reduction variable with the incoming reduction sum
+  // from the vector body and from the backedge value.
+  int IncomingEdgeBlockIdx =
+    Phi->getBasicBlockIndex(OrigLoop->getLoopLatch());
+  assert(IncomingEdgeBlockIdx >= 0 && "Invalid block index");
+  // Pick the other block.
+  int SelfEdgeBlockIdx = (IncomingEdgeBlockIdx ? 0 : 1);
+  Phi->setIncomingValue(SelfEdgeBlockIdx, BCBlockPhi);
+  Phi->setIncomingValue(IncomingEdgeBlockIdx, LoopExitInst);
+}
+
+void InnerLoopVectorizer::fixLCSSAPHIs() {
+  for (Instruction &LEI : *LoopExitBlock) {
+    auto *LCSSAPhi = dyn_cast<PHINode>(&LEI);
+    if (!LCSSAPhi)
+      break;
     if (LCSSAPhi->getNumIncomingValues() == 1)
       LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()),
                             LoopMiddleBlock);
@@ -4530,9 +4601,12 @@ InnerLoopVectorizer::createBlockInMask(BasicBlock *BB) {
 }
 
 void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
-                                              unsigned VF, PhiVector *PV) {
+                                              unsigned VF) {
   PHINode *P = cast<PHINode>(PN);
-  // Handle recurrences.
+  // In order to support recurrences we need to be able to vectorize Phi nodes.
+  // Phi nodes have cycles, so we need to vectorize them in two stages. This is
+  // stage #1: We create a new vector PHI node with no incoming edges. We'll use
+  // this value when we vectorize all of the instructions that use the PHI.
   if (Legal->isReductionVariable(P) || Legal->isFirstOrderRecurrence(P)) {
     VectorParts Entry(UF);
     for (unsigned part = 0; part < UF; ++part) {
@@ -4543,7 +4617,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
           VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
     }
     VectorLoopValueMap.initVector(P, Entry);
-    PV->push_back(P);
     return;
   }
 
@@ -4598,7 +4671,8 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
   case InductionDescriptor::IK_NoInduction:
     llvm_unreachable("Unknown induction");
   case InductionDescriptor::IK_IntInduction:
-    return widenIntInduction(P);
+  case InductionDescriptor::IK_FpInduction:
+    return widenIntOrFpInduction(P);
   case InductionDescriptor::IK_PtrInduction: {
     // Handle the pointer induction variable case.
     assert(P->getType()->isPointerTy() && "Unexpected type.");
@@ -4608,7 +4682,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
     // Determine the number of scalars we need to generate for each unroll
     // iteration. If the instruction is uniform, we only need to generate the
     // first lane. Otherwise, we generate all VF values.
-    unsigned Lanes = Legal->isUniformAfterVectorization(P) ? 1 : VF;
+    unsigned Lanes = Cost->isUniformAfterVectorization(P, VF) ? 1 : VF;
     // These are the scalar results. Notice that we don't generate vector GEPs
     // because scalar GEPs result in better code.
     ScalarParts Entry(UF);
@@ -4625,30 +4699,6 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
     VectorLoopValueMap.initScalar(P, Entry);
     return;
   }
-  case InductionDescriptor::IK_FpInduction: {
-    assert(P->getType() == II.getStartValue()->getType() &&
-           "Types must match");
-    // Handle other induction variables that are now based on the
-    // canonical one.
-    assert(P != OldInduction && "Primary induction can be integer only");
-
-    Value *V = Builder.CreateCast(Instruction::SIToFP, Induction, P->getType());
-    V = II.transform(Builder, V, PSE.getSE(), DL);
-    V->setName("fp.offset.idx");
-
-    // Now we have scalar op: %fp.offset.idx = StartVal +/- Induction*StepVal
-
-    Value *Broadcasted = getBroadcastInstrs(V);
-    // After broadcasting the induction variable we need to make the vector
-    // consecutive by adding StepVal*0, StepVal*1, StepVal*2, etc.
-    Value *StepVal = cast<SCEVUnknown>(II.getStep())->getValue();
-    VectorParts Entry(UF);
-    for (unsigned part = 0; part < UF; ++part)
-      Entry[part] = getStepVector(Broadcasted, VF * part, StepVal,
-                                  II.getInductionOpcode());
-    VectorLoopValueMap.initVector(P, Entry);
-    return;
-  }
   }
 }
 
@@ -4670,7 +4720,7 @@ static bool mayDivideByZero(Instruction &I) {
   return !CInt || CInt->isZero();
 }
 
-void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
+void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB) {
   // For each instruction in the old loop.
   for (Instruction &I : *BB) {
 
@@ -4695,10 +4745,75 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
       continue;
     case Instruction::PHI: {
       // Vectorize PHINodes.
-      widenPHIInstruction(&I, UF, VF, PV);
+      widenPHIInstruction(&I, UF, VF);
       continue;
     } // End of PHI.
+    case Instruction::GetElementPtr: {
+      // Construct a vector GEP by widening the operands of the scalar GEP as
+      // necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
+      // results in a vector of pointers when at least one operand of the GEP
+      // is vector-typed. Thus, to keep the representation compact, we only use
+      // vector-typed operands for loop-varying values.
+      auto *GEP = cast<GetElementPtrInst>(&I);
+      VectorParts Entry(UF);
+
+      if (VF > 1 && OrigLoop->hasLoopInvariantOperands(GEP)) {
+        // If we are vectorizing, but the GEP has only loop-invariant operands,
+        // the GEP we build (by only using vector-typed operands for
+        // loop-varying values) would be a scalar pointer. Thus, to ensure we
+        // produce a vector of pointers, we need to either arbitrarily pick an
+        // operand to broadcast, or broadcast a clone of the original GEP.
+        // Here, we broadcast a clone of the original.
+        //
+        // TODO: If at some point we decide to scalarize instructions having
+        //       loop-invariant operands, this special case will no longer be
+        //       required. We would add the scalarization decision to
+        //       collectLoopScalars() and teach getVectorValue() to broadcast
+        //       the lane-zero scalar value.
+        auto *Clone = Builder.Insert(GEP->clone());
+        for (unsigned Part = 0; Part < UF; ++Part)
+          Entry[Part] = Builder.CreateVectorSplat(VF, Clone);
+      } else {
+        // If the GEP has at least one loop-varying operand, we are sure to
+        // produce a vector of pointers. But if we are only unrolling, we want
+        // to produce a scalar GEP for each unroll part. Thus, the GEP we
+        // produce with the code below will be scalar (if VF == 1) or vector
+        // (otherwise). Note that for the unroll-only case, we still maintain
+        // values in the vector mapping with initVector, as we do for other
+        // instructions.
+        for (unsigned Part = 0; Part < UF; ++Part) {
+
+          // The pointer operand of the new GEP. If it's loop-invariant, we
+          // won't broadcast it.
+          auto *Ptr = OrigLoop->isLoopInvariant(GEP->getPointerOperand())
+                          ? GEP->getPointerOperand()
+                          : getVectorValue(GEP->getPointerOperand())[Part];
+
+          // Collect all the indices for the new GEP. If any index is
+          // loop-invariant, we won't broadcast it.
+          SmallVector<Value *, 4> Indices;
+          for (auto &U : make_range(GEP->idx_begin(), GEP->idx_end())) {
+            if (OrigLoop->isLoopInvariant(U.get()))
+              Indices.push_back(U.get());
+            else
+              Indices.push_back(getVectorValue(U.get())[Part]);
+          }
 
+          // Create the new GEP. Note that this GEP may be a scalar if VF == 1,
+          // but it should be a vector, otherwise.
+          auto *NewGEP = GEP->isInBounds()
+                             ? Builder.CreateInBoundsGEP(Ptr, Indices)
+                             : Builder.CreateGEP(Ptr, Indices);
+          assert((VF == 1 || NewGEP->getType()->isVectorTy()) &&
+                 "NewGEP is not a pointer vector");
+          Entry[Part] = NewGEP;
+        }
+      }
+
+      VectorLoopValueMap.initVector(&I, Entry);
+      addMetadata(Entry, GEP);
+      break;
+    }
     case Instruction::UDiv:
     case Instruction::SDiv:
     case Instruction::SRem:
@@ -4822,10 +4937,9 @@ void InnerLoopVectorizer::vectorizeBlockInLoop(BasicBlock *BB, PhiVector *PV) {
       // induction variable. Notice that we can only optimize the 'trunc' case
       // because (a) FP conversions lose precision, (b) sext/zext may wrap, and
       // (c) other casts depend on pointer size.
-      auto ID = Legal->getInductionVars()->lookup(OldInduction);
-      if (isa<TruncInst>(CI) && CI->getOperand(0) == OldInduction &&
-          ID.getConstIntStepValue()) {
-        widenIntInduction(OldInduction, cast<TruncInst>(CI));
+      if (Cost->isOptimizableIVTruncate(CI, VF)) {
+        widenIntOrFpInduction(cast<PHINode>(CI->getOperand(0)),
+                              cast<TruncInst>(CI));
         break;
       }
 
@@ -5112,12 +5226,6 @@ bool LoopVectorizationLegality::canVectorize() {
   if (UseInterleaved)
     InterleaveInfo.analyzeInterleaving(*getSymbolicStrides());
 
-  // Collect all instructions that are known to be uniform after vectorization.
-  collectLoopUniforms();
-
-  // Collect all instructions that are known to be scalar after vectorization.
-  collectLoopScalars();
-
   unsigned SCEVThreshold = VectorizeSCEVCheckThreshold;
   if (Hints->getForce() == LoopVectorizeHints::FK_Enabled)
     SCEVThreshold = PragmaVectorizeSCEVCheckThreshold;
@@ -5201,8 +5309,8 @@ void LoopVectorizationLegality::addInductionPhi(
     // one if there are multiple (no good reason for doing this other
     // than it is expedient). We've checked that it begins at zero and
     // steps by one, so this is a canonical induction variable.
-    if (!Induction || PhiTy == WidestIndTy)
-      Induction = Phi;
+    if (!PrimaryInduction || PhiTy == WidestIndTy)
+      PrimaryInduction = Phi;
   }
 
   // Both the PHI node itself, and the "post-increment" value feeding
@@ -5365,7 +5473,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
     } // next instr.
   }
 
-  if (!Induction) {
+  if (!PrimaryInduction) {
     DEBUG(dbgs() << "LV: Did not find one integer induction var.\n");
     if (Inductions.empty()) {
       ORE->emit(createMissedAnalysis("NoInductionVariable")
@@ -5377,46 +5485,166 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
   // Now we know the widest induction type, check if our found induction
   // is the same size. If it's not, unset it here and InnerLoopVectorizer
   // will create another.
-  if (Induction && WidestIndTy != Induction->getType())
-    Induction = nullptr;
+  if (PrimaryInduction && WidestIndTy != PrimaryInduction->getType())
+    PrimaryInduction = nullptr;
 
   return true;
 }
 
-void LoopVectorizationLegality::collectLoopScalars() {
+void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
+
+  // We should not collect Scalars more than once per VF. Right now, this
+  // function is called from collectUniformsAndScalars(), which already does
+  // this check. Collecting Scalars for VF=1 does not make any sense.
+  assert(VF >= 2 && !Scalars.count(VF) &&
+         "This function should not be visited twice for the same VF");
+
+  SmallSetVector<Instruction *, 8> Worklist;
+
+  // These sets are used to seed the analysis with pointers used by memory
+  // accesses that will remain scalar.
+  SmallSetVector<Instruction *, 8> ScalarPtrs;
+  SmallPtrSet<Instruction *, 8> PossibleNonScalarPtrs;
+
+  // A helper that returns true if the use of Ptr by MemAccess will be scalar.
+  // The pointer operands of loads and stores will be scalar as long as the
+  // memory access is not a gather or scatter operation. The value operand of a
+  // store will remain scalar if the store is scalarized.
+  auto isScalarUse = [&](Instruction *MemAccess, Value *Ptr) {
+    InstWidening WideningDecision = getWideningDecision(MemAccess, VF);
+    assert(WideningDecision != CM_Unknown &&
+           "Widening decision should be ready at this moment");
+    if (auto *Store = dyn_cast<StoreInst>(MemAccess))
+      if (Ptr == Store->getValueOperand())
+        return WideningDecision == CM_Scalarize;
+    assert(Ptr == getPointerOperand(MemAccess) &&
+           "Ptr is neither a value or pointer operand");
+    return WideningDecision != CM_GatherScatter;
+  };
+
+  // A helper that returns true if the given value is a bitcast or
+  // getelementptr instruction contained in the loop.
+  auto isLoopVaryingBitCastOrGEP = [&](Value *V) {
+    return ((isa<BitCastInst>(V) && V->getType()->isPointerTy()) ||
+            isa<GetElementPtrInst>(V)) &&
+           !TheLoop->isLoopInvariant(V);
+  };
+
+  // A helper that evaluates a memory access's use of a pointer. If the use
+  // will be a scalar use, and the pointer is only used by memory accesses, we
+  // place the pointer in ScalarPtrs. Otherwise, the pointer is placed in
+  // PossibleNonScalarPtrs.
+  auto evaluatePtrUse = [&](Instruction *MemAccess, Value *Ptr) {
+
+    // We only care about bitcast and getelementptr instructions contained in
+    // the loop.
+    if (!isLoopVaryingBitCastOrGEP(Ptr))
+      return;
+
+    // If the pointer has already been identified as scalar (e.g., if it was
+    // also identified as uniform), there's nothing to do.
+    auto *I = cast<Instruction>(Ptr);
+    if (Worklist.count(I))
+      return;
 
-  // If an instruction is uniform after vectorization, it will remain scalar.
-  Scalars.insert(Uniforms.begin(), Uniforms.end());
+    // If the use of the pointer will be a scalar use, and all users of the
+    // pointer are memory accesses, place the pointer in ScalarPtrs. Otherwise,
+    // place the pointer in PossibleNonScalarPtrs.
+    if (isScalarUse(MemAccess, Ptr) && all_of(I->users(), [&](User *U) {
+          return isa<LoadInst>(U) || isa<StoreInst>(U);
+        }))
+      ScalarPtrs.insert(I);
+    else
+      PossibleNonScalarPtrs.insert(I);
+  };
 
-  // Collect the getelementptr instructions that will not be vectorized. A
-  // getelementptr instruction is only vectorized if it is used for a legal
-  // gather or scatter operation.
+  // We seed the scalars analysis with three classes of instructions: (1)
+  // instructions marked uniform-after-vectorization, (2) bitcast and
+  // getelementptr instructions used by memory accesses requiring a scalar use,
+  // and (3) pointer induction variables and their update instructions (we
+  // currently only scalarize these).
+  //
+  // (1) Add to the worklist all instructions that have been identified as
+  // uniform-after-vectorization.
+  Worklist.insert(Uniforms[VF].begin(), Uniforms[VF].end());
+
+  // (2) Add to the worklist all bitcast and getelementptr instructions used by
+  // memory accesses requiring a scalar use. The pointer operands of loads and
+  // stores will be scalar as long as the memory access is not a gather or
+  // scatter operation. The value operand of a store will remain scalar if the
+  // store is scalarized.
   for (auto *BB : TheLoop->blocks())
     for (auto &I : *BB) {
-      if (auto *GEP = dyn_cast<GetElementPtrInst>(&I)) {
-        Scalars.insert(GEP);
-        continue;
+      if (auto *Load = dyn_cast<LoadInst>(&I)) {
+        evaluatePtrUse(Load, Load->getPointerOperand());
+      } else if (auto *Store = dyn_cast<StoreInst>(&I)) {
+        evaluatePtrUse(Store, Store->getPointerOperand());
+        evaluatePtrUse(Store, Store->getValueOperand());
       }
-      auto *Ptr = getPointerOperand(&I);
-      if (!Ptr)
-        continue;
-      auto *GEP = getGEPInstruction(Ptr);
-      if (GEP && isLegalGatherOrScatter(&I))
-        Scalars.erase(GEP);
+    }
+  for (auto *I : ScalarPtrs)
+    if (!PossibleNonScalarPtrs.count(I)) {
+      DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
+      Worklist.insert(I);
     }
 
+  // (3) Add to the worklist all pointer induction variables and their update
+  // instructions.
+  //
+  // TODO: Once we are able to vectorize pointer induction variables we should
+  //       no longer insert them into the worklist here.
+  auto *Latch = TheLoop->getLoopLatch();
+  for (auto &Induction : *Legal->getInductionVars()) {
+    auto *Ind = Induction.first;
+    auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
+    if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
+      continue;
+    Worklist.insert(Ind);
+    Worklist.insert(IndUpdate);
+    DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+    DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
+  }
+
+  // Expand the worklist by looking through any bitcasts and getelementptr
+  // instructions we've already identified as scalar. This is similar to the
+  // expansion step in collectLoopUniforms(); however, here we're only
+  // expanding to include additional bitcasts and getelementptr instructions.
+  unsigned Idx = 0;
+  while (Idx != Worklist.size()) {
+    Instruction *Dst = Worklist[Idx++];
+    if (!isLoopVaryingBitCastOrGEP(Dst->getOperand(0)))
+      continue;
+    auto *Src = cast<Instruction>(Dst->getOperand(0));
+    if (all_of(Src->users(), [&](User *U) -> bool {
+          auto *J = cast<Instruction>(U);
+          return !TheLoop->contains(J) || Worklist.count(J) ||
+                 ((isa<LoadInst>(J) || isa<StoreInst>(J)) &&
+                  isScalarUse(J, Src));
+        })) {
+      Worklist.insert(Src);
+      DEBUG(dbgs() << "LV: Found scalar instruction: " << *Src << "\n");
+    }
+  }
+
   // An induction variable will remain scalar if all users of the induction
   // variable and induction variable update remain scalar.
-  auto *Latch = TheLoop->getLoopLatch();
-  for (auto &Induction : *getInductionVars()) {
+  for (auto &Induction : *Legal->getInductionVars()) {
     auto *Ind = Induction.first;
     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
 
+    // We already considered pointer induction variables, so there's no reason
+    // to look at their users again.
+    //
+    // TODO: Once we are able to vectorize pointer induction variables we
+    //       should no longer skip over them here.
+    if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
+      continue;
+
     // Determine if all users of the induction variable are scalar after
     // vectorization.
     auto ScalarInd = all_of(Ind->users(), [&](User *U) -> bool {
       auto *I = cast<Instruction>(U);
-      return I == IndUpdate || !TheLoop->contains(I) || Scalars.count(I);
+      return I == IndUpdate || !TheLoop->contains(I) || Worklist.count(I);
     });
     if (!ScalarInd)
       continue;
@@ -5425,23 +5653,19 @@ void LoopVectorizationLegality::collectLoopScalars() {
     // scalar after vectorization.
     auto ScalarIndUpdate = all_of(IndUpdate->users(), [&](User *U) -> bool {
       auto *I = cast<Instruction>(U);
-      return I == Ind || !TheLoop->contains(I) || Scalars.count(I);
+      return I == Ind || !TheLoop->contains(I) || Worklist.count(I);
     });
     if (!ScalarIndUpdate)
       continue;
 
     // The induction variable and its update instruction will remain scalar.
-    Scalars.insert(Ind);
-    Scalars.insert(IndUpdate);
+    Worklist.insert(Ind);
+    Worklist.insert(IndUpdate);
+    DEBUG(dbgs() << "LV: Found scalar instruction: " << *Ind << "\n");
+    DEBUG(dbgs() << "LV: Found scalar instruction: " << *IndUpdate << "\n");
   }
-}
 
-bool LoopVectorizationLegality::hasConsecutiveLikePtrOperand(Instruction *I) {
-  if (isAccessInterleaved(I))
-    return true;
-  if (auto *Ptr = getPointerOperand(I))
-    return isConsecutivePtr(Ptr);
-  return false;
+  Scalars[VF].insert(Worklist.begin(), Worklist.end());
 }
 
 bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
@@ -5461,48 +5685,48 @@ bool LoopVectorizationLegality::isScalarWithPredication(Instruction *I) {
   return false;
 }
 
-bool LoopVectorizationLegality::memoryInstructionMustBeScalarized(
-    Instruction *I, unsigned VF) {
-
-  // If the memory instruction is in an interleaved group, it will be
-  // vectorized and its pointer will remain uniform.
-  if (isAccessInterleaved(I))
-    return false;
-
+bool LoopVectorizationLegality::memoryInstructionCanBeWidened(Instruction *I,
+                                                              unsigned VF) {
   // Get and ensure we have a valid memory instruction.
   LoadInst *LI = dyn_cast<LoadInst>(I);
   StoreInst *SI = dyn_cast<StoreInst>(I);
   assert((LI || SI) && "Invalid memory instruction");
 
-  // If the pointer operand is uniform (loop invariant), the memory instruction
-  // will be scalarized.
   auto *Ptr = getPointerOperand(I);
-  if (LI && isUniform(Ptr))
-    return true;
 
-  // If the pointer operand is non-consecutive and neither a gather nor a
-  // scatter operation is legal, the memory instruction will be scalarized.
-  if (!isConsecutivePtr(Ptr) && !isLegalGatherOrScatter(I))
-    return true;
+  // In order to be widened, the pointer should be consecutive, first of all.
+  if (!isConsecutivePtr(Ptr))
+    return false;
 
   // If the instruction is a store located in a predicated block, it will be
   // scalarized.
   if (isScalarWithPredication(I))
-    return true;
+    return false;
 
   // If the instruction's allocated size doesn't equal it's type size, it
   // requires padding and will be scalarized.
   auto &DL = I->getModule()->getDataLayout();
   auto *ScalarTy = LI ? LI->getType() : SI->getValueOperand()->getType();
   if (hasIrregularType(ScalarTy, DL, VF))
-    return true;
+    return false;
 
-  // Otherwise, the memory instruction should be vectorized if the rest of the
-  // loop is.
-  return false;
+  return true;
 }
 
-void LoopVectorizationLegality::collectLoopUniforms() {
+void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
+
+  // We should not collect Uniforms more than once per VF. Right now,
+  // this function is called from collectUniformsAndScalars(), which 
+  // already does this check. Collecting Uniforms for VF=1 does not make any
+  // sense.
+
+  assert(VF >= 2 && !Uniforms.count(VF) &&
+         "This function should not be visited twice for the same VF");
+
+  // Create the Uniforms entry for this VF up front. Even if no uniform value
+  // is found, Uniforms.count(VF) will return 1 and we will not analyze again.
+  Uniforms[VF].clear();
+
   // We now know that the loop is vectorizable!
   // Collect instructions inside the loop that will remain uniform after
   // vectorization.
@@ -5535,6 +5759,14 @@ void LoopVectorizationLegality::collectLoopUniforms() {
   // Holds pointer operands of instructions that are possibly non-uniform.
   SmallPtrSet<Instruction *, 8> PossibleNonUniformPtrs;
 
+  auto isUniformDecision = [&](Instruction *I, unsigned VF) {
+    InstWidening WideningDecision = getWideningDecision(I, VF);
+    assert(WideningDecision != CM_Unknown &&
+           "Widening decision should be ready at this moment");
+
+    return (WideningDecision == CM_Widen ||
+            WideningDecision == CM_Interleave);
+  };
   // Iterate over the instructions in the loop, and collect all
   // consecutive-like pointer operands in ConsecutiveLikePtrs. If it's possible
   // that a consecutive-like pointer operand will be scalarized, we collect it
@@ -5557,25 +5789,18 @@ void LoopVectorizationLegality::collectLoopUniforms() {
         return getPointerOperand(U) == Ptr;
       });
 
-      // Ensure the memory instruction will not be scalarized, making its
-      // pointer operand non-uniform. If the pointer operand is used by some
-      // instruction other than a memory access, we're not going to check if
-      // that other instruction may be scalarized here. Thus, conservatively
-      // assume the pointer operand may be non-uniform.
-      if (!UsersAreMemAccesses || memoryInstructionMustBeScalarized(&I))
+      // Ensure the memory instruction will not be scalarized or used by
+      // gather/scatter, making its pointer operand non-uniform. If the pointer
+      // operand is used by any instruction other than a memory access, we
+      // conservatively assume the pointer operand may be non-uniform.
+      if (!UsersAreMemAccesses || !isUniformDecision(&I, VF))
         PossibleNonUniformPtrs.insert(Ptr);
 
       // If the memory instruction will be vectorized and its pointer operand
-      // is consecutive-like, the pointer operand should remain uniform.
-      else if (hasConsecutiveLikePtrOperand(&I))
-        ConsecutiveLikePtrs.insert(Ptr);
-
-      // Otherwise, if the memory instruction will be vectorized and its
-      // pointer operand is non-consecutive-like, the memory instruction should
-      // be a gather or scatter operation. Its pointer operand will be
-      // non-uniform.
+      // is consecutive-like, or interleaving - the pointer operand should
+      // remain uniform.
       else
-        PossibleNonUniformPtrs.insert(Ptr);
+        ConsecutiveLikePtrs.insert(Ptr);
     }
 
   // Add to the Worklist all consecutive and consecutive-like pointers that
@@ -5599,7 +5824,9 @@ void LoopVectorizationLegality::collectLoopUniforms() {
         continue;
       auto *OI = cast<Instruction>(OV);
       if (all_of(OI->users(), [&](User *U) -> bool {
-            return isOutOfScope(U) || Worklist.count(cast<Instruction>(U));
+            auto *J = cast<Instruction>(U);
+            return !TheLoop->contains(J) || Worklist.count(J) ||
+                   (OI == getPointerOperand(J) && isUniformDecision(J, VF));
           })) {
         Worklist.insert(OI);
         DEBUG(dbgs() << "LV: Found uniform instruction: " << *OI << "\n");
@@ -5610,7 +5837,7 @@ void LoopVectorizationLegality::collectLoopUniforms() {
   // Returns true if Ptr is the pointer operand of a memory access instruction
   // I, and I is known to not require scalarization.
   auto isVectorizedMemAccessUse = [&](Instruction *I, Value *Ptr) -> bool {
-    return getPointerOperand(I) == Ptr && !memoryInstructionMustBeScalarized(I);
+    return getPointerOperand(I) == Ptr && isUniformDecision(I, VF);
   };
 
   // For an instruction to be added into Worklist above, all its users inside
@@ -5619,7 +5846,7 @@ void LoopVectorizationLegality::collectLoopUniforms() {
   // nodes separately. An induction variable will remain uniform if all users
   // of the induction variable and induction variable update remain uniform.
   // The code below handles both pointer and non-pointer induction variables.
-  for (auto &Induction : Inductions) {
+  for (auto &Induction : *Legal->getInductionVars()) {
     auto *Ind = Induction.first;
     auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
 
@@ -5650,7 +5877,7 @@ void LoopVectorizationLegality::collectLoopUniforms() {
     DEBUG(dbgs() << "LV: Found uniform instruction: " << *IndUpdate << "\n");
   }
 
-  Uniforms.insert(Worklist.begin(), Worklist.end());
+  Uniforms[VF].insert(Worklist.begin(), Worklist.end());
 }
 
 bool LoopVectorizationLegality::canVectorizeMemory() {
@@ -5790,7 +6017,7 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
       uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
 
       // An alignment of 0 means target ABI alignment.
-      unsigned Align = LI ? LI->getAlignment() : SI->getAlignment();
+      unsigned Align = getMemInstAlignment(&I);
       if (!Align)
         Align = DL.getABITypeAlignment(PtrTy->getElementType());
 
@@ -5945,6 +6172,11 @@ void InterleavedAccessInfo::analyzeInterleaving(
       if (DesA.Stride != DesB.Stride || DesA.Size != DesB.Size)
         continue;
 
+      // Ignore A if the memory objects of A and B don't belong to the same
+      // address space.
+      if (getMemInstAddressSpace(A) != getMemInstAddressSpace(B))
+        continue;
+
       // Calculate the distance from A to B.
       const SCEVConstant *DistToB = dyn_cast<SCEVConstant>(
           PSE.getSE()->getMinusSCEV(DesA.Scev, DesB.Scev));
@@ -5988,35 +6220,35 @@ void InterleavedAccessInfo::analyzeInterleaving(
       releaseGroup(Group);
 
   // Remove interleaved groups with gaps (currently only loads) whose memory
-  // accesses may wrap around. We have to revisit the getPtrStride analysis, 
-  // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does 
+  // accesses may wrap around. We have to revisit the getPtrStride analysis,
+  // this time with ShouldCheckWrap=true, since collectConstStrideAccesses does
   // not check wrapping (see documentation there).
-  // FORNOW we use Assume=false; 
-  // TODO: Change to Assume=true but making sure we don't exceed the threshold 
+  // FORNOW we use Assume=false;
+  // TODO: Change to Assume=true but making sure we don't exceed the threshold
   // of runtime SCEV assumptions checks (thereby potentially failing to
-  // vectorize altogether). 
+  // vectorize altogether).
   // Additional optional optimizations:
-  // TODO: If we are peeling the loop and we know that the first pointer doesn't 
+  // TODO: If we are peeling the loop and we know that the first pointer doesn't
   // wrap then we can deduce that all pointers in the group don't wrap.
-  // This means that we can forcefully peel the loop in order to only have to 
-  // check the first pointer for no-wrap. When we'll change to use Assume=true 
+  // This means that we can forcefully peel the loop in order to only have to
+  // check the first pointer for no-wrap. When we'll change to use Assume=true
   // we'll only need at most one runtime check per interleaved group.
   //
   for (InterleaveGroup *Group : LoadGroups) {
 
     // Case 1: A full group. Can Skip the checks; For full groups, if the wide
-    // load would wrap around the address space we would do a memory access at 
-    // nullptr even without the transformation. 
-    if (Group->getNumMembers() == Group->getFactor()) 
+    // load would wrap around the address space we would do a memory access at
+    // nullptr even without the transformation.
+    if (Group->getNumMembers() == Group->getFactor())
       continue;
 
-    // Case 2: If first and last members of the group don't wrap this implies 
+    // Case 2: If first and last members of the group don't wrap this implies
     // that all the pointers in the group don't wrap.
     // So we check only group member 0 (which is always guaranteed to exist),
-    // and group member Factor - 1; If the latter doesn't exist we rely on 
+    // and group member Factor - 1; If the latter doesn't exist we rely on
     // peeling (if it is a non-reveresed accsess -- see Case 3).
     Value *FirstMemberPtr = getPointerOperand(Group->getMember(0));
-    if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false, 
+    if (!getPtrStride(PSE, FirstMemberPtr, TheLoop, Strides, /*Assume=*/false,
                       /*ShouldCheckWrap=*/true)) {
       DEBUG(dbgs() << "LV: Invalidate candidate interleaved group due to "
                       "first group member potentially pointer-wrapping.\n");
@@ -6032,8 +6264,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
                         "last group member potentially pointer-wrapping.\n");
         releaseGroup(Group);
       }
-    }
-    else {
+    } else {
       // Case 3: A non-reversed interleaved load group with gaps: We need
       // to execute at least one scalar epilogue iteration. This will ensure 
       // we don't speculatively access memory out-of-bounds. We only need
@@ -6049,27 +6280,62 @@ void InterleavedAccessInfo::analyzeInterleaving(
   }
 }
 
-LoopVectorizationCostModel::VectorizationFactor
-LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
-  // Width 1 means no vectorize
-  VectorizationFactor Factor = {1U, 0U};
-  if (OptForSize && Legal->getRuntimePointerChecking()->Need) {
+Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(bool OptForSize) {
+  if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
+    ORE->emit(createMissedAnalysis("ConditionalStore")
+              << "store that is conditionally executed prevents vectorization");
+    DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
+    return None;
+  }
+
+  if (!OptForSize) // Remaining checks deal with scalar loop when OptForSize.
+    return computeFeasibleMaxVF(OptForSize);
+
+  if (Legal->getRuntimePointerChecking()->Need) {
     ORE->emit(createMissedAnalysis("CantVersionLoopWithOptForSize")
               << "runtime pointer checks needed. Enable vectorization of this "
                  "loop with '#pragma clang loop vectorize(enable)' when "
                  "compiling with -Os/-Oz");
     DEBUG(dbgs()
           << "LV: Aborting. Runtime ptr check is required with -Os/-Oz.\n");
-    return Factor;
+    return None;
   }
 
-  if (!EnableCondStoresVectorization && Legal->getNumPredStores()) {
-    ORE->emit(createMissedAnalysis("ConditionalStore")
-              << "store that is conditionally executed prevents vectorization");
-    DEBUG(dbgs() << "LV: No vectorization. There are conditional stores.\n");
-    return Factor;
+  // If we optimize the program for size, avoid creating the tail loop.
+  unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
+  DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
+
+  // If we don't know the precise trip count, don't try to vectorize.
+  if (TC < 2) {
+    ORE->emit(
+        createMissedAnalysis("UnknownLoopCountComplexCFG")
+        << "unable to calculate the loop count due to complex control flow");
+    DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+    return None;
+  }
+
+  unsigned MaxVF = computeFeasibleMaxVF(OptForSize);
+
+  if (TC % MaxVF != 0) {
+    // If the trip count that we found modulo the vectorization factor is not
+    // zero then we require a tail.
+    // FIXME: look for a smaller MaxVF that does divide TC rather than give up.
+    // FIXME: return None if loop requiresScalarEpilog(<MaxVF>), or look for a
+    //        smaller MaxVF that does not require a scalar epilog.
+
+    ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
+              << "cannot optimize for size and vectorize at the "
+                 "same time. Enable vectorization of this loop "
+                 "with '#pragma clang loop vectorize(enable)' "
+                 "when compiling with -Os/-Oz");
+    DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
+    return None;
   }
 
+  return MaxVF;
+}
+
+unsigned LoopVectorizationCostModel::computeFeasibleMaxVF(bool OptForSize) {
   MinBWs = computeMinimumValueSizes(TheLoop->getBlocks(), *DB, &TTI);
   unsigned SmallestType, WidestType;
   std::tie(SmallestType, WidestType) = getSmallestAndWidestTypes();
@@ -6103,7 +6369,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
   assert(MaxVectorSize <= 64 && "Did not expect to pack so many elements"
                                 " into one vector!");
 
-  unsigned VF = MaxVectorSize;
+  unsigned MaxVF = MaxVectorSize;
   if (MaximizeBandwidth && !OptForSize) {
     // Collect all viable vectorization factors.
     SmallVector<unsigned, 8> VFs;
@@ -6119,54 +6385,16 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
     unsigned TargetNumRegisters = TTI.getNumberOfRegisters(true);
     for (int i = RUs.size() - 1; i >= 0; --i) {
       if (RUs[i].MaxLocalUsers <= TargetNumRegisters) {
-        VF = VFs[i];
+        MaxVF = VFs[i];
         break;
       }
     }
   }
+  return MaxVF;
+}
 
-  // If we optimize the program for size, avoid creating the tail loop.
-  if (OptForSize) {
-    unsigned TC = PSE.getSE()->getSmallConstantTripCount(TheLoop);
-    DEBUG(dbgs() << "LV: Found trip count: " << TC << '\n');
-
-    // If we don't know the precise trip count, don't try to vectorize.
-    if (TC < 2) {
-      ORE->emit(
-          createMissedAnalysis("UnknownLoopCountComplexCFG")
-          << "unable to calculate the loop count due to complex control flow");
-      DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
-      return Factor;
-    }
-
-    // Find the maximum SIMD width that can fit within the trip count.
-    VF = TC % MaxVectorSize;
-
-    if (VF == 0)
-      VF = MaxVectorSize;
-    else {
-      // If the trip count that we found modulo the vectorization factor is not
-      // zero then we require a tail.
-      ORE->emit(createMissedAnalysis("NoTailLoopWithOptForSize")
-                << "cannot optimize for size and vectorize at the "
-                   "same time. Enable vectorization of this loop "
-                   "with '#pragma clang loop vectorize(enable)' "
-                   "when compiling with -Os/-Oz");
-      DEBUG(dbgs() << "LV: Aborting. A tail loop is required with -Os/-Oz.\n");
-      return Factor;
-    }
-  }
-
-  int UserVF = Hints->getWidth();
-  if (UserVF != 0) {
-    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
-    DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
-
-    Factor.Width = UserVF;
-    collectInstsToScalarize(UserVF);
-    return Factor;
-  }
-
+LoopVectorizationCostModel::VectorizationFactor
+LoopVectorizationCostModel::selectVectorizationFactor(unsigned MaxVF) {
   float Cost = expectedCost(1).first;
 #ifndef NDEBUG
   const float ScalarCost = Cost;
@@ -6176,12 +6404,12 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
 
   bool ForceVectorization = Hints->getForce() == LoopVectorizeHints::FK_Enabled;
   // Ignore scalar width, because the user explicitly wants vectorization.
-  if (ForceVectorization && VF > 1) {
+  if (ForceVectorization && MaxVF > 1) {
     Width = 2;
     Cost = expectedCost(Width).first / (float)Width;
   }
 
-  for (unsigned i = 2; i <= VF; i *= 2) {
+  for (unsigned i = 2; i <= MaxVF; i *= 2) {
     // Notice that the vector loop needs to be executed less times, so
     // we need to divide the cost of the vector loops by the width of
     // the vector elements.
@@ -6205,8 +6433,7 @@ LoopVectorizationCostModel::selectVectorizationFactor(bool OptForSize) {
         << "LV: Vectorization seems to be not beneficial, "
         << "but was forced by a user.\n");
   DEBUG(dbgs() << "LV: Selecting VF: " << Width << ".\n");
-  Factor.Width = Width;
-  Factor.Cost = Width * Cost;
+  VectorizationFactor Factor = {Width, (unsigned)(Width * Cost)};
   return Factor;
 }
 
@@ -6244,9 +6471,16 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
         T = ST->getValueOperand()->getType();
 
       // Ignore loaded pointer types and stored pointer types that are not
-      // consecutive. However, we do want to take consecutive stores/loads of
-      // pointer vectors into account.
-      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I))
+      // vectorizable.
+      //
+      // FIXME: The check here attempts to predict whether a load or store will
+      //        be vectorized. We only know this for certain after a VF has
+      //        been selected. Here, we assume that if an access can be
+      //        vectorized, it will be. We should also look at extending this
+      //        optimization to non-pointer types.
+      //
+      if (T->isPointerTy() && !isConsecutiveLoadOrStore(&I) &&
+          !Legal->isAccessInterleaved(&I) && !Legal->isLegalGatherOrScatter(&I))
         continue;
 
       MinWidth = std::min(MinWidth,
@@ -6529,12 +6763,13 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
         MaxUsages[j] = std::max(MaxUsages[j], OpenIntervals.size());
         continue;
       }
-
+      collectUniformsAndScalars(VFs[j]);
       // Count the number of live intervals.
       unsigned RegUsage = 0;
       for (auto Inst : OpenIntervals) {
         // Skip ignored values for VF > 1.
-        if (VecValuesToIgnore.count(Inst))
+        if (VecValuesToIgnore.count(Inst) ||
+            isScalarAfterVectorization(Inst, VFs[j]))
           continue;
         RegUsage += GetRegUsage(Inst->getType(), VFs[j]);
       }
@@ -6595,6 +6830,9 @@ void LoopVectorizationCostModel::collectInstsToScalarize(unsigned VF) {
         ScalarCostsTy ScalarCosts;
         if (computePredInstDiscount(&I, ScalarCosts, VF) >= 0)
           ScalarCostsVF.insert(ScalarCosts.begin(), ScalarCosts.end());
+
+        // Remember that BB will remain after vectorization.
+        PredicatedBBsAfterVectorization.insert(BB);
       }
   }
 }
@@ -6603,7 +6841,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
     Instruction *PredInst, DenseMap<Instruction *, unsigned> &ScalarCosts,
     unsigned VF) {
 
-  assert(!Legal->isUniformAfterVectorization(PredInst) &&
+  assert(!isUniformAfterVectorization(PredInst, VF) &&
          "Instruction marked uniform-after-vectorization will be predicated");
 
   // Initialize the discount to zero, meaning that the scalar version and the
@@ -6624,7 +6862,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
     // already be scalar to avoid traversing chains that are unlikely to be
     // beneficial.
     if (!I->hasOneUse() || PredInst->getParent() != I->getParent() ||
-        Legal->isScalarAfterVectorization(I))
+        isScalarAfterVectorization(I, VF))
       return false;
 
     // If the instruction is scalar with predication, it will be analyzed
@@ -6644,7 +6882,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
     // the lane zero values for uniforms rather than asserting.
     for (Use &U : I->operands())
       if (auto *J = dyn_cast<Instruction>(U.get()))
-        if (Legal->isUniformAfterVectorization(J))
+        if (isUniformAfterVectorization(J, VF))
           return false;
 
     // Otherwise, we can scalarize the instruction.
@@ -6657,7 +6895,7 @@ int LoopVectorizationCostModel::computePredInstDiscount(
   // and their return values are inserted into vectors. Thus, an extract would
   // still be required.
   auto needsExtract = [&](Instruction *I) -> bool {
-    return TheLoop->contains(I) && !Legal->isScalarAfterVectorization(I);
+    return TheLoop->contains(I) && !isScalarAfterVectorization(I, VF);
   };
 
   // Compute the expected cost discount from scalarizing the entire expression
@@ -6720,6 +6958,9 @@ LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::expectedCost(unsigned VF) {
   VectorizationCostTy Cost;
 
+  // Collect Uniform and Scalar instructions after vectorization with VF.
+  collectUniformsAndScalars(VF);
+
   // Collect the instructions (and their associated costs) that will be more
   // profitable to scalarize.
   collectInstsToScalarize(VF);
@@ -6799,11 +7040,141 @@ static bool isStrideMul(Instruction *I, LoopVectorizationLegality *Legal) {
          Legal->hasStride(I->getOperand(1));
 }
 
+unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
+                                                                 unsigned VF) {
+  Type *ValTy = getMemInstValueType(I);
+  auto SE = PSE.getSE();
+  // Estimate the cost of scalarizing I: VF scalar accesses plus overhead.
+  unsigned Alignment = getMemInstAlignment(I);
+  unsigned AS = getMemInstAddressSpace(I);
+  Value *Ptr = getPointerOperand(I);
+  Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
+
+  // Figure out whether the access is strided and get the stride value
+  // if it is known at compile time.
+  const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop);
+
+  // Get the cost of the scalar memory instruction and address computation.
+  unsigned Cost = VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
+  // Each of the VF lanes is charged one scalar-typed memory operation.
+  Cost += VF *
+          TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(), Alignment,
+                              AS, I);
+
+  // Get the overhead of the extractelement and insertelement instructions
+  // we might create due to scalarization.
+  Cost += getScalarizationOverhead(I, VF, TTI);
+
+  // If we have a predicated store, it may not be executed for each vector
+  // lane. Scale the cost by the probability of executing the predicated
+  // block.
+  if (Legal->isScalarWithPredication(I))
+    Cost /= getReciprocalPredBlockProb();
+
+  return Cost;
+}
+
+unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
+                                                             unsigned VF) {
+  Type *ValTy = getMemInstValueType(I);
+  Type *VectorTy = ToVectorTy(ValTy, VF);
+  unsigned Alignment = getMemInstAlignment(I);
+  Value *Ptr = getPointerOperand(I);
+  unsigned AS = getMemInstAddressSpace(I);
+  int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+  // Only unit-stride accesses (forward or reverse) are costed here.
+  assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
+         "Stride should be 1 or -1 for consecutive memory access");
+  unsigned Cost = 0;
+  if (Legal->isMaskRequired(I))
+    Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
+  else
+    Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
+  // A reverse (stride -1) access additionally pays for a reverse shuffle.
+  bool Reverse = ConsecutiveStride < 0;
+  if (Reverse)
+    Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+  return Cost;
+}
+
+unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
+                                                         unsigned VF) {
+  LoadInst *LI = cast<LoadInst>(I);
+  Type *ValTy = LI->getType();
+  Type *VectorTy = ToVectorTy(ValTy, VF);
+  unsigned Alignment = LI->getAlignment();
+  unsigned AS = LI->getPointerAddressSpace();
+  // Cost of one scalar load (plus its address) and a broadcast of the result.
+  return TTI.getAddressComputationCost(ValTy) +
+         TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
+         TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
+}
+
+unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
+                                                          unsigned VF) {
+  Type *ValTy = getMemInstValueType(I);
+  Type *VectorTy = ToVectorTy(ValTy, VF);
+  unsigned Alignment = getMemInstAlignment(I);
+  Value *Ptr = getPointerOperand(I);
+  // Vector address computation plus the target's gather/scatter cost.
+  return TTI.getAddressComputationCost(VectorTy) +
+         TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
+                                    Legal->isMaskRequired(I), Alignment);
+}
+
+unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
+                                                            unsigned VF) {
+  Type *ValTy = getMemInstValueType(I);
+  Type *VectorTy = ToVectorTy(ValTy, VF);
+  unsigned AS = getMemInstAddressSpace(I);
+
+  auto Group = Legal->getInterleavedAccessGroup(I);
+  assert(Group && "Fail to get an interleaved access group.");
+  // The wide vector holds VF * Factor elements of the accessed value type.
+  unsigned InterleaveFactor = Group->getFactor();
+  Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
+
+  // Holds the indices of existing members in an interleaved load group.
+  // An interleaved store group doesn't need this as it doesn't allow gaps.
+  SmallVector<unsigned, 4> Indices;
+  if (isa<LoadInst>(I)) {
+    for (unsigned i = 0; i < InterleaveFactor; i++)
+      if (Group->getMember(i))
+        Indices.push_back(i);
+  }
+
+  // Calculate the cost of the whole interleaved group.
+  unsigned Cost = TTI.getInterleavedMemoryOpCost(I->getOpcode(), WideVecTy,
+                                                 Group->getFactor(), Indices,
+                                                 Group->getAlignment(), AS);
+  // A reversed group pays one reverse shuffle per member.
+  if (Group->isReverse())
+    Cost += Group->getNumMembers() *
+            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
+  return Cost;
+}
+
+unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
+                                                              unsigned VF) {
+  // Cost of the memory instruction I at vectorization factor VF. Only the
+  // scalar (VF == 1) cost is computed here; for wider VFs the cost is looked
+  // up through getWideningCost() (presumably already decided -- confirm).
+  if (VF == 1) {
+    Type *ValTy = getMemInstValueType(I);
+    unsigned Alignment = getMemInstAlignment(I);
+    unsigned AS = getMemInstAddressSpace(I);
+    // Query the target with the access's own alignment and address space.
+    return TTI.getAddressComputationCost(ValTy) +
+           TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
+  }
+  return getWideningCost(I, VF);
+}
+
 LoopVectorizationCostModel::VectorizationCostTy
 LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   // If we know that this instruction will remain uniform, check the cost of
   // the scalar version.
-  if (Legal->isUniformAfterVectorization(I))
+  if (isUniformAfterVectorization(I, VF))
     VF = 1;
 
   if (VF > 1 && isProfitableToScalarize(I, VF))
@@ -6817,6 +7188,79 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
   return VectorizationCostTy(C, TypeNotScalarized);
 }
 
+void LoopVectorizationCostModel::setCostBasedWideningDecision(unsigned VF) {
+  if (VF == 1)
+    return;
+  for (BasicBlock *BB : TheLoop->blocks()) {
+    // For each instruction in the old loop.
+    for (Instruction &I : *BB) {
+      Value *Ptr = getPointerOperand(&I);
+      if (!Ptr)
+        continue;
+      // Only instructions with a pointer operand are considered below.
+      if (isa<LoadInst>(&I) && Legal->isUniform(Ptr)) {
+        // Scalar load + broadcast
+        unsigned Cost = getUniformMemOpCost(&I, VF);
+        setWideningDecision(&I, VF, CM_Scalarize, Cost);
+        continue;
+      }
+
+      // We assume that widening is the best solution when possible.
+      if (Legal->memoryInstructionCanBeWidened(&I, VF)) {
+        unsigned Cost = getConsecutiveMemOpCost(&I, VF);
+        setWideningDecision(&I, VF, CM_Widen, Cost);
+        continue;
+      }
+
+      // Choose between Interleaving, Gather/Scatter or Scalarization.
+      unsigned InterleaveCost = UINT_MAX;
+      unsigned NumAccesses = 1;
+      if (Legal->isAccessInterleaved(&I)) {
+        auto Group = Legal->getInterleavedAccessGroup(&I);
+        assert(Group && "Fail to get an interleaved access group.");
+
+        // Make one decision for the whole group.
+        if (getWideningDecision(&I, VF) != CM_Unknown)
+          continue;
+
+        NumAccesses = Group->getNumMembers();
+        InterleaveCost = getInterleaveGroupCost(&I, VF);
+      }
+
+      unsigned GatherScatterCost =
+          Legal->isLegalGatherOrScatter(&I)
+              ? getGatherScatterCost(&I, VF) * NumAccesses
+              : UINT_MAX;
+
+      unsigned ScalarizationCost =
+          getMemInstScalarizationCost(&I, VF) * NumAccesses;
+      // On ties prefer scalarization, then interleaving, then gather/scatter.
+      // Choose better solution for the current VF,
+      // write down this decision and use it during vectorization.
+      unsigned Cost;
+      InstWidening Decision;
+      if (InterleaveCost <= GatherScatterCost &&
+          InterleaveCost < ScalarizationCost) {
+        Decision = CM_Interleave;
+        Cost = InterleaveCost;
+      } else if (GatherScatterCost < ScalarizationCost) {
+        Decision = CM_GatherScatter;
+        Cost = GatherScatterCost;
+      } else {
+        Decision = CM_Scalarize;
+        Cost = ScalarizationCost;
+      }
+      // If the instruction belongs to an interleave group, the whole group
+      // receives the same decision. The whole group receives the cost, but
+      // the cost will actually be assigned to one instruction.
+      if (auto Group = Legal->getInterleavedAccessGroup(&I))
+        setWideningDecision(Group, VF, Decision, Cost);
+      else
+        setWideningDecision(&I, VF, Decision, Cost);
+    }
+  }
+}
+
 unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
                                                         unsigned VF,
                                                         Type *&VectorTy) {
@@ -6835,7 +7279,31 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     // instruction cost.
     return 0;
   case Instruction::Br: {
-    return TTI.getCFInstrCost(I->getOpcode());
+    // In cases of scalarized and predicated instructions, there will be VF
+    // predicated blocks in the vectorized loop. Each branch around these
+    // blocks requires also an extract of its vector compare i1 element.
+    bool ScalarPredicatedBB = false;
+    BranchInst *BI = cast<BranchInst>(I);
+    if (VF > 1 && BI->isConditional() &&
+        (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
+         PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
+      ScalarPredicatedBB = true;
+
+    if (ScalarPredicatedBB) {
+      // Return cost for branches around scalarized and predicated blocks.
+      Type *Vec_i1Ty =
+          VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
+      return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
+              (TTI.getCFInstrCost(Instruction::Br) * VF));
+    } else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
+      // The back-edge branch will remain, as will all scalar branches.
+      return TTI.getCFInstrCost(Instruction::Br);
+    else
+      // This branch will be eliminated by if-conversion.
+      return 0;
+    // Note: We currently assume zero cost for an unconditional branch inside
+    // a predicated block since it will become a fall-through, although we
+    // may decide in the future to call TTI for all branches.
   }
   case Instruction::PHI: {
     auto *Phi = cast<PHINode>(I);
@@ -6936,7 +7404,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     if (!ScalarCond)
       CondTy = VectorType::get(CondTy, VF);
 
-    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy);
+    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
   }
   case Instruction::ICmp:
   case Instruction::FCmp: {
@@ -6945,130 +7413,12 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
     if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
       ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
     VectorTy = ToVectorTy(ValTy, VF);
-    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy);
+    return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
   }
   case Instruction::Store:
   case Instruction::Load: {
-    StoreInst *SI = dyn_cast<StoreInst>(I);
-    LoadInst *LI = dyn_cast<LoadInst>(I);
-    Type *ValTy = (SI ? SI->getValueOperand()->getType() : LI->getType());
-    VectorTy = ToVectorTy(ValTy, VF);
-
-    unsigned Alignment = SI ? SI->getAlignment() : LI->getAlignment();
-    unsigned AS =
-        SI ? SI->getPointerAddressSpace() : LI->getPointerAddressSpace();
-    Value *Ptr = getPointerOperand(I);
-    // We add the cost of address computation here instead of with the gep
-    // instruction because only here we know whether the operation is
-    // scalarized.
-    if (VF == 1)
-      return TTI.getAddressComputationCost(VectorTy) +
-             TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
-
-    if (LI && Legal->isUniform(Ptr)) {
-      // Scalar load + broadcast
-      unsigned Cost = TTI.getAddressComputationCost(ValTy->getScalarType());
-      Cost += TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
-                                  Alignment, AS);
-      return Cost +
-             TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, ValTy);
-    }
-
-    // For an interleaved access, calculate the total cost of the whole
-    // interleave group.
-    if (Legal->isAccessInterleaved(I)) {
-      auto Group = Legal->getInterleavedAccessGroup(I);
-      assert(Group && "Fail to get an interleaved access group.");
-
-      // Only calculate the cost once at the insert position.
-      if (Group->getInsertPos() != I)
-        return 0;
-
-      unsigned InterleaveFactor = Group->getFactor();
-      Type *WideVecTy =
-          VectorType::get(VectorTy->getVectorElementType(),
-                          VectorTy->getVectorNumElements() * InterleaveFactor);
-
-      // Holds the indices of existing members in an interleaved load group.
-      // An interleaved store group doesn't need this as it doesn't allow gaps.
-      SmallVector<unsigned, 4> Indices;
-      if (LI) {
-        for (unsigned i = 0; i < InterleaveFactor; i++)
-          if (Group->getMember(i))
-            Indices.push_back(i);
-      }
-
-      // Calculate the cost of the whole interleaved group.
-      unsigned Cost = TTI.getInterleavedMemoryOpCost(
-          I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
-          Group->getAlignment(), AS);
-
-      if (Group->isReverse())
-        Cost +=
-            Group->getNumMembers() *
-            TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
-
-      // FIXME: The interleaved load group with a huge gap could be even more
-      // expensive than scalar operations. Then we could ignore such group and
-      // use scalar operations instead.
-      return Cost;
-    }
-
-    // Check if the memory instruction will be scalarized.
-    if (Legal->memoryInstructionMustBeScalarized(I, VF)) {
-      unsigned Cost = 0;
-      Type *PtrTy = ToVectorTy(Ptr->getType(), VF);
-
-      // Figure out whether the access is strided and get the stride value
-      // if it's known in compile time
-      const SCEV *PtrSCEV = getAddressAccessSCEV(Ptr, Legal, SE, TheLoop); 
-
-      // Get the cost of the scalar memory instruction and address computation.
-      Cost += VF * TTI.getAddressComputationCost(PtrTy, SE, PtrSCEV);
-      Cost += VF *
-              TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
-                                  Alignment, AS);
-
-      // Get the overhead of the extractelement and insertelement instructions
-      // we might create due to scalarization.
-      Cost += getScalarizationOverhead(I, VF, TTI);
-
-      // If we have a predicated store, it may not be executed for each vector
-      // lane. Scale the cost by the probability of executing the predicated
-      // block.
-      if (Legal->isScalarWithPredication(I))
-        Cost /= getReciprocalPredBlockProb();
-
-      return Cost;
-    }
-
-    // Determine if the pointer operand of the access is either consecutive or
-    // reverse consecutive.
-    int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
-    bool Reverse = ConsecutiveStride < 0;
-
-    // Determine if either a gather or scatter operation is legal.
-    bool UseGatherOrScatter =
-        !ConsecutiveStride && Legal->isLegalGatherOrScatter(I);
-
-    unsigned Cost = TTI.getAddressComputationCost(VectorTy);
-    if (UseGatherOrScatter) {
-      assert(ConsecutiveStride == 0 &&
-             "Gather/Scatter are not used for consecutive stride");
-      return Cost +
-             TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
-                                        Legal->isMaskRequired(I), Alignment);
-    }
-    // Wide load/stores.
-    if (Legal->isMaskRequired(I))
-      Cost +=
-          TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
-    else
-      Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS);
-
-    if (Reverse)
-      Cost += TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy, 0);
-    return Cost;
+    VectorTy = ToVectorTy(getMemInstValueType(I), VF);
+    return getMemoryInstructionCost(I, VF);
   }
   case Instruction::ZExt:
   case Instruction::SExt:
@@ -7082,12 +7432,14 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
   case Instruction::Trunc:
   case Instruction::FPTrunc:
   case Instruction::BitCast: {
-    // We optimize the truncation of induction variable.
-    // The cost of these is the same as the scalar operation.
-    if (I->getOpcode() == Instruction::Trunc &&
-        Legal->isInductionVariable(I->getOperand(0)))
-      return TTI.getCastInstrCost(I->getOpcode(), I->getType(),
-                                  I->getOperand(0)->getType());
+    // We optimize the truncation of induction variables having constant
+    // integer steps. The cost of these truncations is the same as the scalar
+    // operation.
+    if (isOptimizableIVTruncate(I, VF)) {
+      auto *Trunc = cast<TruncInst>(I);
+      return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
+                                  Trunc->getSrcTy(), Trunc);
+    }
 
     Type *SrcScalarTy = I->getOperand(0)->getType();
     Type *SrcVecTy = ToVectorTy(SrcScalarTy, VF);
@@ -7110,7 +7462,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
       }
     }
 
-    return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy);
+    return TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
   }
   case Instruction::Call: {
     bool NeedToScalarize;
@@ -7171,81 +7523,34 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
     SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
     VecValuesToIgnore.insert(Casts.begin(), Casts.end());
   }
-
-  // Insert values known to be scalar into VecValuesToIgnore. This is a
-  // conservative estimation of the values that will later be scalarized.
-  //
-  // FIXME: Even though an instruction is not scalar-after-vectoriztion, it may
-  //        still be scalarized. For example, we may find an instruction to be
-  //        more profitable for a given vectorization factor if it were to be
-  //        scalarized. But at this point, we haven't yet computed the
-  //        vectorization factor.
-  for (auto *BB : TheLoop->getBlocks())
-    for (auto &I : *BB)
-      if (Legal->isScalarAfterVectorization(&I))
-        VecValuesToIgnore.insert(&I);
 }
 
-void InnerLoopUnroller::scalarizeInstruction(Instruction *Instr,
-                                             bool IfPredicateInstr) {
-  assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
-  // Holds vector parameters or scalars, in case of uniform vals.
-  SmallVector<VectorParts, 4> Params;
-
-  setDebugLocFromInst(Builder, Instr);
-
-  // Does this instruction return a value ?
-  bool IsVoidRetTy = Instr->getType()->isVoidTy();
-
-  // Initialize a new scalar map entry.
-  ScalarParts Entry(UF);
-
-  VectorParts Cond;
-  if (IfPredicateInstr)
-    Cond = createBlockInMask(Instr->getParent());
-
-  // For each vector unroll 'part':
-  for (unsigned Part = 0; Part < UF; ++Part) {
-    Entry[Part].resize(1);
-    // For each scalar that we create:
-
-    // Start an "if (pred) a[i] = ..." block.
-    Value *Cmp = nullptr;
-    if (IfPredicateInstr) {
-      if (Cond[Part]->getType()->isVectorTy())
-        Cond[Part] =
-            Builder.CreateExtractElement(Cond[Part], Builder.getInt32(0));
-      Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Cond[Part],
-                               ConstantInt::get(Cond[Part]->getType(), 1));
-    }
-
-    Instruction *Cloned = Instr->clone();
-    if (!IsVoidRetTy)
-      Cloned->setName(Instr->getName() + ".cloned");
-
-    // Replace the operands of the cloned instructions with their scalar
-    // equivalents in the new loop.
-    for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
-      auto *NewOp = getScalarValue(Instr->getOperand(op), Part, 0);
-      Cloned->setOperand(op, NewOp);
-    }
+LoopVectorizationCostModel::VectorizationFactor
+LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) {
 
-    // Place the cloned scalar in the new loop.
-    Builder.Insert(Cloned);
+  // Width 1 means no vectorize, cost 0 means uncomputed cost.
+  const LoopVectorizationCostModel::VectorizationFactor NoVectorization = {1U,
+                                                                           0U};
+  Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(OptForSize);
+  if (!MaybeMaxVF.hasValue()) // Cases considered too costly to vectorize.
+    return NoVectorization;
 
-    // Add the cloned scalar to the scalar map entry.
-    Entry[Part][0] = Cloned;
+  if (UserVF) {
+    DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
+    assert(isPowerOf2_32(UserVF) && "VF needs to be a power of two");
+    // Collect the instructions (and their associated costs) that will be more
+    // profitable to scalarize.
+    CM.selectUserVectorizationFactor(UserVF);
+    return {UserVF, 0};
+  }
 
-    // If we just cloned a new assumption, add it the assumption cache.
-    if (auto *II = dyn_cast<IntrinsicInst>(Cloned))
-      if (II->getIntrinsicID() == Intrinsic::assume)
-        AC->registerAssumption(II);
+  unsigned MaxVF = MaybeMaxVF.getValue();
+  assert(MaxVF != 0 && "MaxVF is zero.");
+  if (MaxVF == 1)
+    return NoVectorization;
 
-    // End if-block.
-    if (IfPredicateInstr)
-      PredicatedInstructions.push_back(std::make_pair(Cloned, Cmp));
-  }
-  VectorLoopValueMap.initScalar(Instr, Entry);
+  // Select the optimal vectorization factor.
+  return CM.selectVectorizationFactor(MaxVF);
 }
 
 void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) {
@@ -7379,11 +7684,6 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     return false;
   }
 
-  // Use the cost model.
-  LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
-                                &Hints);
-  CM.collectValuesToIgnore();
-
   // Check the function attributes to find out if this function should be
   // optimized for size.
   bool OptForSize =
@@ -7429,9 +7729,20 @@ bool LoopVectorizePass::processLoop(Loop *L) {
     return false;
   }
 
-  // Select the optimal vectorization factor.
-  const LoopVectorizationCostModel::VectorizationFactor VF =
-      CM.selectVectorizationFactor(OptForSize);
+  // Use the cost model.
+  LoopVectorizationCostModel CM(L, PSE, LI, &LVL, *TTI, TLI, DB, AC, ORE, F,
+                                &Hints);
+  CM.collectValuesToIgnore();
+
+  // Use the planner for vectorization.
+  LoopVectorizationPlanner LVP(CM);
+
+  // Get user vectorization factor.
+  unsigned UserVF = Hints.getWidth();
+
+  // Plan how to best vectorize, return the best VF and its cost.
+  LoopVectorizationCostModel::VectorizationFactor VF =
+      LVP.plan(OptForSize, UserVF);
 
   // Select the interleave count.
   unsigned IC = CM.selectInterleaveCount(OptForSize, VF.Width, VF.Cost);
@@ -7487,10 +7798,10 @@ bool LoopVectorizePass::processLoop(Loop *L) {
   const char *VAPassName = Hints.vectorizeAnalysisPassName();
   if (!VectorizeLoop && !InterleaveLoop) {
     // Do not vectorize or interleaving the loop.
-    ORE->emit(OptimizationRemarkAnalysis(VAPassName, VecDiagMsg.first,
+    ORE->emit(OptimizationRemarkMissed(VAPassName, VecDiagMsg.first,
                                          L->getStartLoc(), L->getHeader())
               << VecDiagMsg.second);
-    ORE->emit(OptimizationRemarkAnalysis(LV_NAME, IntDiagMsg.first,
+    ORE->emit(OptimizationRemarkMissed(LV_NAME, IntDiagMsg.first,
                                          L->getStartLoc(), L->getHeader())
               << IntDiagMsg.second);
     return false;
diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp
index eeaf29dceb5317a50a879427f74d57ba0ab42113..da3ac06ab464eb99ac4bd658da784836b5181a84 100644
--- a/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -39,6 +39,7 @@
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Support/GraphWriter.h"
 #include "llvm/Support/raw_ostream.h"
 #include "llvm/Transforms/Vectorize.h"
 #include <algorithm>
@@ -90,6 +91,10 @@ static cl::opt<unsigned> MinTreeSize(
     "slp-min-tree-size", cl::init(3), cl::Hidden,
     cl::desc("Only vectorize small trees if they are fully vectorizable"));
 
+static cl::opt<bool>
+    ViewSLPTree("view-slp-tree", cl::Hidden,
+                cl::desc("Display the SLP trees with Graphviz"));
+
 // Limit the number of alias checks. The limit is chosen so that
 // it has no negative effect on the llvm benchmarks.
 static const unsigned AliasedCheckLimit = 10;
@@ -212,14 +217,14 @@ static unsigned getSameOpcode(ArrayRef<Value *> VL) {
 /// Flag set: NSW, NUW, exact, and all of fast-math.
 static void propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
   if (auto *VecOp = dyn_cast<Instruction>(I)) {
-    if (auto *Intersection = dyn_cast<Instruction>(VL[0])) {
-      // Intersection is initialized to the 0th scalar,
-      // so start counting from index '1'.
+    if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
+      // VecOp is initialized to the 0th scalar, so start counting from index
+      // '1'.
+      VecOp->copyIRFlags(I0);
       for (int i = 1, e = VL.size(); i < e; ++i) {
         if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
-          Intersection->andIRFlags(Scalar);
+          VecOp->andIRFlags(Scalar);
       }
-      VecOp->copyIRFlags(Intersection);
     }
   }
 }
@@ -304,6 +309,8 @@ public:
   typedef SmallVector<Instruction *, 16> InstrList;
   typedef SmallPtrSet<Value *, 16> ValueSet;
   typedef SmallVector<StoreInst *, 8> StoreList;
+  typedef MapVector<Value *, SmallVector<Instruction *, 2>>
+      ExtraValueToDebugLocsMap;
 
   BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti,
           TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li,
@@ -330,6 +337,10 @@ public:
   /// \brief Vectorize the tree that starts with the elements in \p VL.
   /// Returns the vectorized root.
   Value *vectorizeTree();
+  /// Vectorize the tree but with the list of externally used values \p
+  /// ExternallyUsedValues. Values in this MapVector can be replaced by the
+  /// generated extractvalue instructions.
+  Value *vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues);
 
   /// \returns the cost incurred by unwanted spills and fills, caused by
   /// holding live values over call sites.
@@ -343,6 +354,13 @@ public:
   /// the purpose of scheduling and extraction in the \p UserIgnoreLst.
   void buildTree(ArrayRef<Value *> Roots,
                  ArrayRef<Value *> UserIgnoreLst = None);
+  /// Construct a vectorizable tree that starts at \p Roots, ignoring users for
+  /// the purpose of scheduling and extraction in the \p UserIgnoreLst taking
+  /// into account (and updating it, if required) list of externally used
+  /// values stored in \p ExternallyUsedValues.
+  void buildTree(ArrayRef<Value *> Roots,
+                 ExtraValueToDebugLocsMap &ExternallyUsedValues,
+                 ArrayRef<Value *> UserIgnoreLst = None);
 
   /// Clear the internal data structures that are created by 'buildTree'.
   void deleteTree() {
@@ -404,16 +422,14 @@ private:
   int getEntryCost(TreeEntry *E);
 
   /// This is the recursive part of buildTree.
-  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth);
+  void buildTree_rec(ArrayRef<Value *> Roots, unsigned Depth, int);
 
   /// \returns True if the ExtractElement/ExtractValue instructions in VL can
   /// be vectorized to use the original vector (or aggregate "bitcast" to a vector).
   bool canReuseExtract(ArrayRef<Value *> VL, unsigned Opcode) const;
 
-  /// Vectorize a single entry in the tree. VL icontains all isomorphic scalars
-  /// in order of its usage in a user program, for example ADD1, ADD2 and so on
-  /// or LOAD1 , LOAD2 etc.
-  Value *vectorizeTree(ArrayRef<Value *> VL, TreeEntry *E);
+  /// Vectorize a single entry in the tree.
+  Value *vectorizeTree(TreeEntry *E);
 
   /// Vectorize a single entry in the tree, starting in \p VL.
   Value *vectorizeTree(ArrayRef<Value *> VL);
@@ -453,8 +469,9 @@ private:
                                       SmallVectorImpl<Value *> &Left,
                                       SmallVectorImpl<Value *> &Right);
   struct TreeEntry {
-    TreeEntry() : Scalars(), VectorizedValue(nullptr),
-    NeedToGather(0), NeedToShuffle(0) {}
+    TreeEntry(std::vector<TreeEntry> &Container)
+        : Scalars(), VectorizedValue(nullptr), NeedToGather(0),
+          Container(Container) {}
 
     /// \returns true if the scalars in VL are equal to this entry.
     bool isSame(ArrayRef<Value *> VL) const {
@@ -462,15 +479,6 @@ private:
       return std::equal(VL.begin(), VL.end(), Scalars.begin());
     }
 
-    /// \returns true if the scalars in VL are found in this tree entry.
-    bool isFoundJumbled(ArrayRef<Value *> VL, const DataLayout &DL,
-                        ScalarEvolution &SE) const {
-      assert(VL.size() == Scalars.size() && "Invalid size");
-      SmallVector<Value *, 8> List;
-      sortMemAccesses(VL, DL, SE, List);
-      return std::equal(List.begin(), List.end(), Scalars.begin());
-    }
-
     /// A vector of scalars.
     ValueList Scalars;
 
@@ -480,19 +488,27 @@ private:
     /// Do we need to gather this sequence ?
     bool NeedToGather;
 
-    /// Do we need to shuffle the load ?
-    bool NeedToShuffle;
+    /// Points back to the VectorizableTree.
+    ///
+    /// Only used for Graphviz right now.  Unfortunately GraphTrait::NodeRef has
+    /// to be a pointer and needs to be able to initialize the child iterator.
+    /// Thus we need a reference back to the container to translate the indices
+    /// to entries.
+    std::vector<TreeEntry> &Container;
+
+    /// The TreeEntry index containing the user of this entry.  We can actually
+    /// have multiple users so the data structure is not truly a tree.
+    SmallVector<int, 1> UserTreeIndices;
   };
 
   /// Create a new VectorizableTree entry.
   TreeEntry *newTreeEntry(ArrayRef<Value *> VL, bool Vectorized,
-                          bool NeedToShuffle) {
-    VectorizableTree.emplace_back();
+                          int &UserTreeIdx) {
+    VectorizableTree.emplace_back(VectorizableTree);
     int idx = VectorizableTree.size() - 1;
     TreeEntry *Last = &VectorizableTree[idx];
     Last->Scalars.insert(Last->Scalars.begin(), VL.begin(), VL.end());
     Last->NeedToGather = !Vectorized;
-    Last->NeedToShuffle = NeedToShuffle;
     if (Vectorized) {
       for (int i = 0, e = VL.size(); i != e; ++i) {
         assert(!ScalarToTreeEntry.count(VL[i]) && "Scalar already in tree!");
@@ -501,6 +517,10 @@ private:
     } else {
       MustGather.insert(VL.begin(), VL.end());
     }
+
+    if (UserTreeIdx >= 0)
+      Last->UserTreeIndices.push_back(UserTreeIdx);
+    UserTreeIdx = idx;
     return Last;
   }
 
@@ -574,7 +594,9 @@ private:
   SmallVector<std::unique_ptr<Instruction>, 8> DeletedInstructions;
 
   /// A list of values that need to extracted out of the tree.
-  /// This list holds pairs of (Internal Scalar : External User).
+  /// This list holds pairs of (Internal Scalar : External User). External User
+  /// can be nullptr; this means that the Internal Scalar will be used later,
+  /// after vectorization.
   UserList ExternalUses;
 
   /// Values used only by @llvm.assume calls.
@@ -722,6 +744,8 @@ private:
     return os;
   }
 #endif
+  friend struct GraphTraits<BoUpSLP *>;
+  friend struct DOTGraphTraits<BoUpSLP *>;
 
   /// Contains all scheduling data for a basic block.
   ///
@@ -932,17 +956,98 @@ private:
   /// original width.
   MapVector<Value *, std::pair<uint64_t, bool>> MinBWs;
 };
+} // end namespace slpvectorizer
+
+template <> struct GraphTraits<BoUpSLP *> {
+  typedef BoUpSLP::TreeEntry TreeEntry;
+
+  /// NodeRef has to be a pointer per the GraphWriter.
+  typedef TreeEntry *NodeRef;
+
+  /// \brief Add the VectorizableTree to the index iterator to be able to return
+  /// TreeEntry pointers.
+  struct ChildIteratorType
+      : public iterator_adaptor_base<ChildIteratorType,
+                                     SmallVector<int, 1>::iterator> {
+
+    std::vector<TreeEntry> &VectorizableTree;
+
+    ChildIteratorType(SmallVector<int, 1>::iterator W,
+                      std::vector<TreeEntry> &VT)
+        : ChildIteratorType::iterator_adaptor_base(W), VectorizableTree(VT) {}
+
+    NodeRef operator*() { return &VectorizableTree[*I]; }
+  };
+
+  static NodeRef getEntryNode(BoUpSLP &R) { return &R.VectorizableTree[0]; }
+
+  static ChildIteratorType child_begin(NodeRef N) {
+    return {N->UserTreeIndices.begin(), N->Container};
+  }
+  static ChildIteratorType child_end(NodeRef N) {
+    return {N->UserTreeIndices.end(), N->Container};
+  }
+
+  /// For the node iterator we just need to turn the TreeEntry iterator into a
+  /// TreeEntry* iterator so that it dereferences to NodeRef.
+  typedef pointer_iterator<std::vector<TreeEntry>::iterator> nodes_iterator;
+
+  static nodes_iterator nodes_begin(BoUpSLP *R) {
+    return nodes_iterator(R->VectorizableTree.begin());
+  }
+  static nodes_iterator nodes_end(BoUpSLP *R) {
+    return nodes_iterator(R->VectorizableTree.end());
+  }
+
+  static unsigned size(BoUpSLP *R) { return R->VectorizableTree.size(); }
+};
+
+template <> struct DOTGraphTraits<BoUpSLP *> : public DefaultDOTGraphTraits {
+  typedef BoUpSLP::TreeEntry TreeEntry;
+
+  DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+
+  std::string getNodeLabel(const TreeEntry *Entry, const BoUpSLP *R) {
+    std::string Str;
+    raw_string_ostream OS(Str);
+    if (isSplat(Entry->Scalars)) {
+      OS << "<splat> " << *Entry->Scalars[0];
+      return Str;
+    }
+    for (auto V : Entry->Scalars) {
+      OS << *V;
+      if (std::any_of(
+              R->ExternalUses.begin(), R->ExternalUses.end(),
+              [&](const BoUpSLP::ExternalUser &EU) { return EU.Scalar == V; }))
+        OS << " <extract>";
+      OS << "\n";
+    }
+    return Str;
+  }
+
+  static std::string getNodeAttributes(const TreeEntry *Entry,
+                                       const BoUpSLP *) {
+    if (Entry->NeedToGather)
+      return "color=red";
+    return "";
+  }
+};
 
 } // end namespace llvm
-} // end namespace slpvectorizer
 
 void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
                         ArrayRef<Value *> UserIgnoreLst) {
+  ExtraValueToDebugLocsMap ExternallyUsedValues;
+  buildTree(Roots, ExternallyUsedValues, UserIgnoreLst);
+}
+void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
+                        ExtraValueToDebugLocsMap &ExternallyUsedValues,
+                        ArrayRef<Value *> UserIgnoreLst) {
   deleteTree();
   UserIgnoreList = UserIgnoreLst;
   if (!allSameType(Roots))
     return;
-  buildTree_rec(Roots, 0);
+  buildTree_rec(Roots, 0, -1);
 
   // Collect the values that we need to extract from the tree.
   for (TreeEntry &EIdx : VectorizableTree) {
@@ -956,6 +1061,14 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
       if (Entry->NeedToGather)
         continue;
 
+      // Check if the scalar is externally used as an extra arg.
+      auto ExtI = ExternallyUsedValues.find(Scalar);
+      if (ExtI != ExternallyUsedValues.end()) {
+        DEBUG(dbgs() << "SLP: Need to extract: Extra arg from lane " <<
+              Lane << " from " << *Scalar << ".\n");
+        ExternalUses.emplace_back(Scalar, nullptr, Lane);
+        continue;
+      }
       for (User *U : Scalar->users()) {
         DEBUG(dbgs() << "SLP: Checking user:" << *U << ".\n");
 
@@ -992,28 +1105,28 @@ void BoUpSLP::buildTree(ArrayRef<Value *> Roots,
   }
 }
 
-
-void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
+void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
+                            int UserTreeIdx) {
   bool isAltShuffle = false;
   assert((allConstant(VL) || allSameType(VL)) && "Invalid types!");
 
   if (Depth == RecursionMaxDepth) {
     DEBUG(dbgs() << "SLP: Gathering due to max recursion depth.\n");
-    newTreeEntry(VL, false, false);
+    newTreeEntry(VL, false, UserTreeIdx);
     return;
   }
 
   // Don't handle vectors.
   if (VL[0]->getType()->isVectorTy()) {
     DEBUG(dbgs() << "SLP: Gathering due to vector type.\n");
-    newTreeEntry(VL, false, false);
+    newTreeEntry(VL, false, UserTreeIdx);
     return;
   }
 
   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
     if (SI->getValueOperand()->getType()->isVectorTy()) {
       DEBUG(dbgs() << "SLP: Gathering due to store vector type.\n");
-      newTreeEntry(VL, false, false);
+      newTreeEntry(VL, false, UserTreeIdx);
       return;
     }
   unsigned Opcode = getSameOpcode(VL);
@@ -1030,7 +1143,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
   // If all of the operands are identical or constant we have a simple solution.
   if (allConstant(VL) || isSplat(VL) || !allSameBlock(VL) || !Opcode) {
     DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O. \n");
-    newTreeEntry(VL, false, false);
+    newTreeEntry(VL, false, UserTreeIdx);
     return;
   }
 
@@ -1042,7 +1155,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
     if (EphValues.count(VL[i])) {
       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
             ") is ephemeral.\n");
-      newTreeEntry(VL, false, false);
+      newTreeEntry(VL, false, UserTreeIdx);
       return;
     }
   }
@@ -1055,10 +1168,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
       DEBUG(dbgs() << "SLP: \tChecking bundle: " << *VL[i] << ".\n");
       if (E->Scalars[i] != VL[i]) {
         DEBUG(dbgs() << "SLP: Gathering due to partial overlap.\n");
-        newTreeEntry(VL, false, false);
+        newTreeEntry(VL, false, UserTreeIdx);
         return;
       }
     }
+    // Record the reuse of the tree node.  FIXME, currently this is only used to
+    // properly draw the graph rather than for the actual vectorization.
+    E->UserTreeIndices.push_back(UserTreeIdx);
     DEBUG(dbgs() << "SLP: Perfect diamond merge at " << *VL[0] << ".\n");
     return;
   }
@@ -1068,7 +1184,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
     if (ScalarToTreeEntry.count(VL[i])) {
       DEBUG(dbgs() << "SLP: The instruction (" << *VL[i] <<
             ") is already in tree.\n");
-      newTreeEntry(VL, false, false);
+      newTreeEntry(VL, false, UserTreeIdx);
       return;
     }
   }
@@ -1078,7 +1194,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
   for (unsigned i = 0, e = VL.size(); i != e; ++i) {
     if (MustGather.count(VL[i])) {
       DEBUG(dbgs() << "SLP: Gathering due to gathered scalar.\n");
-      newTreeEntry(VL, false, false);
+      newTreeEntry(VL, false, UserTreeIdx);
       return;
     }
   }
@@ -1092,7 +1208,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
     // Don't go into unreachable blocks. They may contain instructions with
     // dependency cycles which confuse the final scheduling.
     DEBUG(dbgs() << "SLP: bundle in unreachable block.\n");
-    newTreeEntry(VL, false, false);
+    newTreeEntry(VL, false, UserTreeIdx);
     return;
   }
 
@@ -1101,7 +1217,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
     for (unsigned j = i+1; j < e; ++j)
       if (VL[i] == VL[j]) {
         DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n");
-        newTreeEntry(VL, false, false);
+        newTreeEntry(VL, false, UserTreeIdx);
         return;
       }
 
@@ -1116,7 +1232,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
     assert((!BS.getScheduleData(VL[0]) ||
             !BS.getScheduleData(VL[0])->isPartOfBundle()) &&
            "tryScheduleBundle should cancelScheduling on failure");
-    newTreeEntry(VL, false, false);
+    newTreeEntry(VL, false, UserTreeIdx);
     return;
   }
   DEBUG(dbgs() << "SLP: We are able to schedule this bundle.\n");
@@ -1133,12 +1249,12 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
           if (Term) {
             DEBUG(dbgs() << "SLP: Need to swizzle PHINodes (TerminatorInst use).\n");
             BS.cancelScheduling(VL);
-            newTreeEntry(VL, false, false);
+            newTreeEntry(VL, false, UserTreeIdx);
             return;
           }
         }
 
-      newTreeEntry(VL, true, false);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a vector of PHINodes.\n");
 
       for (unsigned i = 0, e = PH->getNumIncomingValues(); i < e; ++i) {
@@ -1148,7 +1264,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
           Operands.push_back(cast<PHINode>(j)->getIncomingValueForBlock(
               PH->getIncomingBlock(i)));
 
-        buildTree_rec(Operands, Depth + 1);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1160,7 +1276,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
       } else {
         BS.cancelScheduling(VL);
       }
-      newTreeEntry(VL, Reuse, false);
+      newTreeEntry(VL, Reuse, UserTreeIdx);
       return;
     }
     case Instruction::Load: {
@@ -1176,7 +1292,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
       if (DL->getTypeSizeInBits(ScalarTy) !=
           DL->getTypeAllocSizeInBits(ScalarTy)) {
         BS.cancelScheduling(VL);
-        newTreeEntry(VL, false, false);
+        newTreeEntry(VL, false, UserTreeIdx);
         DEBUG(dbgs() << "SLP: Gathering loads of non-packed type.\n");
         return;
       }
@@ -1187,13 +1303,15 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
         LoadInst *L = cast<LoadInst>(VL[i]);
         if (!L->isSimple()) {
           BS.cancelScheduling(VL);
-          newTreeEntry(VL, false, false);
+          newTreeEntry(VL, false, UserTreeIdx);
           DEBUG(dbgs() << "SLP: Gathering non-simple loads.\n");
           return;
         }
       }
 
       // Check if the loads are consecutive, reversed, or neither.
+      // TODO: What we really want is to sort the loads, but for now, check
+      // the two likely directions.
       bool Consecutive = true;
       bool ReverseConsecutive = true;
       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i) {
@@ -1207,7 +1325,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
 
       if (Consecutive) {
         ++NumLoadsWantToKeepOrder;
-        newTreeEntry(VL, true, false);
+        newTreeEntry(VL, true, UserTreeIdx);
         DEBUG(dbgs() << "SLP: added a vector of loads.\n");
         return;
       }
@@ -1221,25 +1339,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
             break;
           }
 
-      if (VL.size() > 2 && !ReverseConsecutive) {
-        bool ShuffledLoads = true;
-        SmallVector<Value *, 8> List;
-        sortMemAccesses(VL, *DL, *SE, List);
-        auto NewVL = makeArrayRef(List.begin(), List.end());
-        for (unsigned i = 0, e = NewVL.size() - 1; i < e; ++i) {
-          if (!isConsecutiveAccess(NewVL[i], NewVL[i + 1], *DL, *SE)) {
-            ShuffledLoads = false;
-            break;
-          }
-        }
-        if (ShuffledLoads) {
-          newTreeEntry(NewVL, true, true);
-          return;
-        }
-      }
-
       BS.cancelScheduling(VL);
-      newTreeEntry(VL, false, false);
+      newTreeEntry(VL, false, UserTreeIdx);
 
       if (ReverseConsecutive) {
         ++NumLoadsWantToChangeOrder;
@@ -1262,16 +1363,16 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
     case Instruction::FPTrunc:
     case Instruction::BitCast: {
       Type *SrcTy = VL0->getOperand(0)->getType();
-      for (Value *Val : VL) {
-        Type *Ty = cast<Instruction>(Val)->getOperand(0)->getType();
+      for (unsigned i = 0; i < VL.size(); ++i) {
+        Type *Ty = cast<Instruction>(VL[i])->getOperand(0)->getType();
         if (Ty != SrcTy || !isValidElementType(Ty)) {
           BS.cancelScheduling(VL);
-          newTreeEntry(VL, false, false);
+          newTreeEntry(VL, false, UserTreeIdx);
           DEBUG(dbgs() << "SLP: Gathering casts with different src types.\n");
           return;
         }
       }
-      newTreeEntry(VL, true, false);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a vector of casts.\n");
 
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1280,7 +1381,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth+1);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1294,13 +1395,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
         if (Cmp->getPredicate() != P0 ||
             Cmp->getOperand(0)->getType() != ComparedTy) {
           BS.cancelScheduling(VL);
-          newTreeEntry(VL, false, false);
+          newTreeEntry(VL, false, UserTreeIdx);
           DEBUG(dbgs() << "SLP: Gathering cmp with different predicate.\n");
           return;
         }
       }
 
-      newTreeEntry(VL, true, false);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a vector of compares.\n");
 
       for (unsigned i = 0, e = VL0->getNumOperands(); i < e; ++i) {
@@ -1309,7 +1410,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth+1);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1332,7 +1433,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
     case Instruction::And:
     case Instruction::Or:
     case Instruction::Xor: {
-      newTreeEntry(VL, true, false);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a vector of bin op.\n");
 
       // Sort operands of the instructions so that each side is more likely to
@@ -1340,8 +1441,8 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
       if (isa<BinaryOperator>(VL0) && VL0->isCommutative()) {
         ValueList Left, Right;
         reorderInputsAccordingToOpcode(VL, Left, Right);
-        buildTree_rec(Left, Depth + 1);
-        buildTree_rec(Right, Depth + 1);
+        buildTree_rec(Left, Depth + 1, UserTreeIdx);
+        buildTree_rec(Right, Depth + 1, UserTreeIdx);
         return;
       }
 
@@ -1351,17 +1452,17 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth+1);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       }
       return;
     }
     case Instruction::GetElementPtr: {
       // We don't combine GEPs with complicated (nested) indexing.
-      for (Value *Val : VL) {
-        if (cast<Instruction>(Val)->getNumOperands() != 2) {
+      for (unsigned j = 0; j < VL.size(); ++j) {
+        if (cast<Instruction>(VL[j])->getNumOperands() != 2) {
           DEBUG(dbgs() << "SLP: not-vectorizable GEP (nested indexes).\n");
           BS.cancelScheduling(VL);
-          newTreeEntry(VL, false, false);
+          newTreeEntry(VL, false, UserTreeIdx);
           return;
         }
       }
@@ -1369,29 +1470,29 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
       // We can't combine several GEPs into one vector if they operate on
       // different types.
       Type *Ty0 = cast<Instruction>(VL0)->getOperand(0)->getType();
-      for (Value *Val : VL) {
-        Type *CurTy = cast<Instruction>(Val)->getOperand(0)->getType();
+      for (unsigned j = 0; j < VL.size(); ++j) {
+        Type *CurTy = cast<Instruction>(VL[j])->getOperand(0)->getType();
         if (Ty0 != CurTy) {
           DEBUG(dbgs() << "SLP: not-vectorizable GEP (different types).\n");
           BS.cancelScheduling(VL);
-          newTreeEntry(VL, false, false);
+          newTreeEntry(VL, false, UserTreeIdx);
           return;
         }
       }
 
       // We don't combine GEPs with non-constant indexes.
-      for (Value *Val : VL) {
-        auto Op = cast<Instruction>(Val)->getOperand(1);
+      for (unsigned j = 0; j < VL.size(); ++j) {
+        auto Op = cast<Instruction>(VL[j])->getOperand(1);
         if (!isa<ConstantInt>(Op)) {
           DEBUG(
               dbgs() << "SLP: not-vectorizable GEP (non-constant indexes).\n");
           BS.cancelScheduling(VL);
-          newTreeEntry(VL, false, false);
+          newTreeEntry(VL, false, UserTreeIdx);
           return;
         }
       }
 
-      newTreeEntry(VL, true, false);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a vector of GEPs.\n");
       for (unsigned i = 0, e = 2; i < e; ++i) {
         ValueList Operands;
@@ -1399,7 +1500,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth + 1);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1408,19 +1509,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
       for (unsigned i = 0, e = VL.size() - 1; i < e; ++i)
         if (!isConsecutiveAccess(VL[i], VL[i + 1], *DL, *SE)) {
           BS.cancelScheduling(VL);
-          newTreeEntry(VL, false, false);
+          newTreeEntry(VL, false, UserTreeIdx);
           DEBUG(dbgs() << "SLP: Non-consecutive store.\n");
           return;
         }
 
-      newTreeEntry(VL, true, false);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a vector of stores.\n");
 
       ValueList Operands;
       for (Value *j : VL)
         Operands.push_back(cast<Instruction>(j)->getOperand(0));
 
-      buildTree_rec(Operands, Depth + 1);
+      buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       return;
     }
     case Instruction::Call: {
@@ -1431,7 +1532,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
       if (!isTriviallyVectorizable(ID)) {
         BS.cancelScheduling(VL);
-        newTreeEntry(VL, false, false);
+        newTreeEntry(VL, false, UserTreeIdx);
         DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
         return;
       }
@@ -1445,7 +1546,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
             getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
             !CI->hasIdenticalOperandBundleSchema(*CI2)) {
           BS.cancelScheduling(VL);
-          newTreeEntry(VL, false, false);
+          newTreeEntry(VL, false, UserTreeIdx);
           DEBUG(dbgs() << "SLP: mismatched calls:" << *CI << "!=" << *VL[i]
                        << "\n");
           return;
@@ -1456,7 +1557,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
           Value *A1J = CI2->getArgOperand(1);
           if (A1I != A1J) {
             BS.cancelScheduling(VL);
-            newTreeEntry(VL, false, false);
+            newTreeEntry(VL, false, UserTreeIdx);
             DEBUG(dbgs() << "SLP: mismatched arguments in call:" << *CI
                          << " argument "<< A1I<<"!=" << A1J
                          << "\n");
@@ -1469,14 +1570,14 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
                         CI->op_begin() + CI->getBundleOperandsEndIndex(),
                         CI2->op_begin() + CI2->getBundleOperandsStartIndex())) {
           BS.cancelScheduling(VL);
-          newTreeEntry(VL, false, false);
+          newTreeEntry(VL, false, UserTreeIdx);
           DEBUG(dbgs() << "SLP: mismatched bundle operands in calls:" << *CI << "!="
                        << *VL[i] << '\n');
           return;
         }
       }
 
-      newTreeEntry(VL, true, false);
+      newTreeEntry(VL, true, UserTreeIdx);
       for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i) {
         ValueList Operands;
         // Prepare the operand vector.
@@ -1484,7 +1585,7 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
           CallInst *CI2 = dyn_cast<CallInst>(j);
           Operands.push_back(CI2->getArgOperand(i));
         }
-        buildTree_rec(Operands, Depth + 1);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       }
       return;
     }
@@ -1493,19 +1594,19 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
       // then do not vectorize this instruction.
       if (!isAltShuffle) {
         BS.cancelScheduling(VL);
-        newTreeEntry(VL, false, false);
+        newTreeEntry(VL, false, UserTreeIdx);
         DEBUG(dbgs() << "SLP: ShuffleVector are not vectorized.\n");
         return;
       }
-      newTreeEntry(VL, true, false);
+      newTreeEntry(VL, true, UserTreeIdx);
       DEBUG(dbgs() << "SLP: added a ShuffleVector op.\n");
 
       // Reorder operands if reordering would enable vectorization.
       if (isa<BinaryOperator>(VL0)) {
         ValueList Left, Right;
         reorderAltShuffleOperands(VL, Left, Right);
-        buildTree_rec(Left, Depth + 1);
-        buildTree_rec(Right, Depth + 1);
+        buildTree_rec(Left, Depth + 1, UserTreeIdx);
+        buildTree_rec(Right, Depth + 1, UserTreeIdx);
         return;
       }
 
@@ -1515,13 +1616,13 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth) {
         for (Value *j : VL)
           Operands.push_back(cast<Instruction>(j)->getOperand(i));
 
-        buildTree_rec(Operands, Depth + 1);
+        buildTree_rec(Operands, Depth + 1, UserTreeIdx);
       }
       return;
     }
     default:
       BS.cancelScheduling(VL);
-      newTreeEntry(VL, false, false);
+      newTreeEntry(VL, false, UserTreeIdx);
       DEBUG(dbgs() << "SLP: Gathering unknown instruction.\n");
       return;
   }
@@ -1601,6 +1702,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
   Type *ScalarTy = VL[0]->getType();
   if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
     ScalarTy = SI->getValueOperand()->getType();
+  else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
+    ScalarTy = CI->getOperand(0)->getType();
   VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
 
   // If we have computed a smaller type for the expression, update VecTy so
@@ -1630,7 +1733,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
         int DeadCost = 0;
         for (unsigned i = 0, e = VL.size(); i < e; ++i) {
           Instruction *E = cast<Instruction>(VL[i]);
-          if (E->hasOneUse())
+          // If all users are going to be vectorized, the instruction can be
+          // considered dead.
+          // Likewise, if it has one user, that user is sure to be vectorized.
+          if (E->hasOneUse() ||
+              std::all_of(E->user_begin(), E->user_end(), [this](User *U) {
+                return ScalarToTreeEntry.count(U) > 0;
+              }))
             // Take credit for instruction that will become dead.
             DeadCost +=
                 TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
@@ -1655,10 +1764,10 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
 
       // Calculate the cost of this instruction.
       int ScalarCost = VL.size() * TTI->getCastInstrCost(VL0->getOpcode(),
-                                                         VL0->getType(), SrcTy);
+                                                         VL0->getType(), SrcTy, VL0);
 
       VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
-      int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy);
+      int VecCost = TTI->getCastInstrCost(VL0->getOpcode(), VecTy, SrcVecTy, VL0);
       return VecCost - ScalarCost;
     }
     case Instruction::FCmp:
@@ -1667,8 +1776,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       // Calculate the cost of this instruction.
       VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
       int ScalarCost = VecTy->getNumElements() *
-          TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty());
-      int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy);
+          TTI->getCmpSelInstrCost(Opcode, ScalarTy, Builder.getInt1Ty(), VL0);
+      int VecCost = TTI->getCmpSelInstrCost(Opcode, VecTy, MaskTy, VL0);
       return VecCost - ScalarCost;
     }
     case Instruction::Add:
@@ -1751,22 +1860,18 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       // Cost of wide load - cost of scalar loads.
       unsigned alignment = dyn_cast<LoadInst>(VL0)->getAlignment();
       int ScalarLdCost = VecTy->getNumElements() *
-            TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0);
+          TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
       int VecLdCost = TTI->getMemoryOpCost(Instruction::Load,
-                                           VecTy, alignment, 0);
-      if (E->NeedToShuffle) {
-        VecLdCost += TTI->getShuffleCost(
-            TargetTransformInfo::SK_PermuteSingleSrc, VecTy, 0);
-      }
+                                           VecTy, alignment, 0, VL0);
       return VecLdCost - ScalarLdCost;
     }
     case Instruction::Store: {
       // We know that we can merge the stores. Calculate the cost.
       unsigned alignment = dyn_cast<StoreInst>(VL0)->getAlignment();
       int ScalarStCost = VecTy->getNumElements() *
-            TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0);
+          TTI->getMemoryOpCost(Instruction::Store, ScalarTy, alignment, 0, VL0);
       int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
-                                           VecTy, alignment, 0);
+                                           VecTy, alignment, 0, VL0);
       return VecStCost - ScalarStCost;
     }
     case Instruction::Call: {
@@ -1774,12 +1879,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
 
       // Calculate the cost of the scalar and vector calls.
-      SmallVector<Type*, 4> ScalarTys, VecTys;
-      for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op) {
+      SmallVector<Type*, 4> ScalarTys;
+      for (unsigned op = 0, opc = CI->getNumArgOperands(); op!= opc; ++op)
         ScalarTys.push_back(CI->getArgOperand(op)->getType());
-        VecTys.push_back(VectorType::get(CI->getArgOperand(op)->getType(),
-                                         VecTy->getNumElements()));
-      }
 
       FastMathFlags FMF;
       if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
@@ -1788,7 +1890,9 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
       int ScalarCallCost = VecTy->getNumElements() *
           TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
 
-      int VecCallCost = TTI->getIntrinsicInstrCost(ID, VecTy, VecTys, FMF);
+      SmallVector<Value *, 4> Args(CI->arg_operands());
+      int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
+                                                   VecTy->getNumElements());
 
       DEBUG(dbgs() << "SLP: Call cost "<< VecCallCost - ScalarCallCost
             << " (" << VecCallCost  << "-" <<  ScalarCallCost << ")"
@@ -1982,9 +2086,18 @@ int BoUpSLP::getTreeCost() {
   int SpillCost = getSpillCost();
   Cost += SpillCost + ExtractCost;
 
-  DEBUG(dbgs() << "SLP: Spill Cost = " << SpillCost << ".\n"
-               << "SLP: Extract Cost = " << ExtractCost << ".\n"
-               << "SLP: Total Cost = " << Cost << ".\n");
+  std::string Str;
+  {
+    raw_string_ostream OS(Str);
+    OS << "SLP: Spill Cost = " << SpillCost << ".\n"
+       << "SLP: Extract Cost = " << ExtractCost << ".\n"
+       << "SLP: Total Cost = " << Cost << ".\n";
+  }
+  DEBUG(dbgs() << Str);
+
+  if (ViewSLPTree)
+    ViewGraph(this, "SLP" + F->getName(), false, Str);
+
   return Cost;
 }
 
@@ -2320,8 +2433,8 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
   if (ScalarToTreeEntry.count(VL[0])) {
     int Idx = ScalarToTreeEntry[VL[0]];
     TreeEntry *E = &VectorizableTree[Idx];
-    if (E->isSame(VL) || (E->NeedToShuffle && E->isFoundJumbled(VL, *DL, *SE)))
-      return vectorizeTree(VL, E);
+    if (E->isSame(VL))
+      return vectorizeTree(E);
   }
 
   Type *ScalarTy = VL[0]->getType();
@@ -2332,10 +2445,10 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
   return Gather(VL, VecTy);
 }
 
-Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, TreeEntry *E) {
+Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
   IRBuilder<>::InsertPointGuard Guard(Builder);
 
-  if (E->VectorizedValue && !E->NeedToShuffle) {
+  if (E->VectorizedValue) {
     DEBUG(dbgs() << "SLP: Diamond merged for " << *E->Scalars[0] << ".\n");
     return E->VectorizedValue;
   }
@@ -2569,35 +2682,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, TreeEntry *E) {
       LI->setAlignment(Alignment);
       E->VectorizedValue = LI;
       ++NumVectorInstructions;
-      propagateMetadata(LI, E->Scalars);
-
-      // As program order of scalar loads are jumbled, the vectorized 'load'
-      // must be followed by a 'shuffle' with the required jumbled mask.
-      if (!VL.empty() && (E->NeedToShuffle)) {
-        assert(VL.size() == E->Scalars.size() &&
-               "Equal number of scalars expected");
-        SmallVector<Constant *, 8> Mask;
-        for (Value *Val : VL) {
-          if (ScalarToTreeEntry.count(Val)) {
-            int Idx = ScalarToTreeEntry[Val];
-            TreeEntry *E = &VectorizableTree[Idx];
-            for (unsigned Lane = 0, LE = VL.size(); Lane != LE; ++Lane) {
-              if (E->Scalars[Lane] == Val) {
-                Mask.push_back(Builder.getInt32(Lane));
-                break;
-              }
-            }
-          }
-        }
-
-        // Generate shuffle for jumbled memory access
-        Value *Undef = UndefValue::get(VecTy);
-        Value *Shuf = Builder.CreateShuffleVector((Value *)LI, Undef,
-                                                  ConstantVector::get(Mask));
-        return Shuf;
-      }
-
-      return LI;
+      return propagateMetadata(LI, E->Scalars);
     }
     case Instruction::Store: {
       StoreInst *SI = cast<StoreInst>(VL0);
@@ -2765,6 +2850,12 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL, TreeEntry *E) {
 }
 
 Value *BoUpSLP::vectorizeTree() {
+  ExtraValueToDebugLocsMap ExternallyUsedValues;
+  return vectorizeTree(ExternallyUsedValues);
+}
+
+Value *
+BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
 
   // All blocks must be scheduled before any instructions are inserted.
   for (auto &BSIter : BlocksSchedules) {
@@ -2772,7 +2863,7 @@ Value *BoUpSLP::vectorizeTree() {
   }
 
   Builder.SetInsertPoint(&F->getEntryBlock().front());
-  auto *VectorRoot = vectorizeTree(ArrayRef<Value *>(), &VectorizableTree[0]);
+  auto *VectorRoot = vectorizeTree(&VectorizableTree[0]);
 
   // If the vectorized tree can be rewritten in a smaller type, we truncate the
   // vectorized root. InstCombine will then rewrite the entire expression. We
@@ -2807,7 +2898,7 @@ Value *BoUpSLP::vectorizeTree() {
 
     // Skip users that we already RAUW. This happens when one instruction
     // has multiple uses of the same value.
-    if (!is_contained(Scalar->users(), User))
+    if (User && !is_contained(Scalar->users(), User))
       continue;
     assert(ScalarToTreeEntry.count(Scalar) && "Invalid scalar");
 
@@ -2819,6 +2910,28 @@ Value *BoUpSLP::vectorizeTree() {
     assert(Vec && "Can't find vectorizable value");
 
     Value *Lane = Builder.getInt32(ExternalUse.Lane);
+    // If User == nullptr, the Scalar is used as extra arg. Generate
+    // ExtractElement instruction and update the record for this scalar in
+    // ExternallyUsedValues.
+    if (!User) {
+      assert(ExternallyUsedValues.count(Scalar) &&
+             "Scalar with nullptr as an external user must be registered in "
+             "ExternallyUsedValues map");
+      if (auto *VecI = dyn_cast<Instruction>(Vec)) {
+        Builder.SetInsertPoint(VecI->getParent(),
+                               std::next(VecI->getIterator()));
+      } else {
+        Builder.SetInsertPoint(&F->getEntryBlock().front());
+      }
+      Value *Ex = Builder.CreateExtractElement(Vec, Lane);
+      Ex = extend(ScalarRoot, Ex, Scalar->getType());
+      CSEBlocks.insert(cast<Instruction>(Scalar)->getParent());
+      auto &Locs = ExternallyUsedValues[Scalar];
+      ExternallyUsedValues.insert({Ex, Locs});
+      ExternallyUsedValues.erase(Scalar);
+      continue;
+    }
+
     // Generate extracts for out-of-tree users.
     // Find the insertion point for the extractelement lane.
     if (auto *VecI = dyn_cast<Instruction>(Vec)) {
@@ -4186,16 +4299,10 @@ namespace {
 class HorizontalReduction {
   SmallVector<Value *, 16> ReductionOps;
   SmallVector<Value *, 32> ReducedVals;
+  // Use map vector to make stable output.
+  MapVector<Instruction *, Value *> ExtraArgs;
 
   BinaryOperator *ReductionRoot = nullptr;
-  // After successfull horizontal reduction vectorization attempt for PHI node
-  // vectorizer tries to update root binary op by combining vectorized tree and
-  // the ReductionPHI node. But during vectorization this ReductionPHI can be
-  // vectorized itself and replaced by the undef value, while the instruction
-  // itself is marked for deletion. This 'marked for deletion' PHI node then can
-  // be used in new binary operation, causing "Use still stuck around after Def
-  // is destroyed" crash upon PHI node deletion.
-  WeakVH ReductionPHI;
 
   /// The opcode of the reduction.
   Instruction::BinaryOps ReductionOpcode = Instruction::BinaryOpsEnd;
@@ -4205,6 +4312,26 @@ class HorizontalReduction {
   /// splits the vector in halves and adds those halves.
   bool IsPairwiseReduction = false;
 
+  /// Checks if the ParentStackElem.first should be marked as a reduction
+  /// operation with an extra argument or as extra argument itself.
+  void markExtraArg(std::pair<Instruction *, unsigned> &ParentStackElem,
+                    Value *ExtraArg) {
+    if (ExtraArgs.count(ParentStackElem.first)) {
+      ExtraArgs[ParentStackElem.first] = nullptr;
+      // We ran into something like:
+      // ParentStackElem.first = ExtraArgs[ParentStackElem.first] + ExtraArg.
+      // The whole ParentStackElem.first should be considered as an extra value
+      // in this case.
+      // Do not perform analysis of remaining operands of ParentStackElem.first
+      // instruction, this whole instruction is an extra argument.
+      ParentStackElem.second = ParentStackElem.first->getNumOperands();
+    } else {
+      // We ran into something like:
+      // ParentStackElem.first += ... + ExtraArg + ...
+      ExtraArgs[ParentStackElem.first] = ExtraArg;
+    }
+  }
+
 public:
   HorizontalReduction() = default;
 
@@ -4236,7 +4363,6 @@ public:
     ReductionOpcode = B->getOpcode();
     ReducedValueOpcode = 0;
     ReductionRoot = B;
-    ReductionPHI = Phi;
 
     // We currently only support adds.
     if ((ReductionOpcode != Instruction::Add &&
@@ -4257,8 +4383,23 @@ public:
       if (EdgeToVist == 2 || IsReducedValue) {
         if (IsReducedValue)
           ReducedVals.push_back(TreeN);
-        else
-          ReductionOps.push_back(TreeN);
+        else {
+          auto I = ExtraArgs.find(TreeN);
+          if (I != ExtraArgs.end() && !I->second) {
+            // Check if TreeN is an extra argument of its parent operation.
+            if (Stack.size() <= 1) {
+              // TreeN can't be an extra argument as it is a root reduction
+              // operation.
+              return false;
+            }
+            // Yes, TreeN is an extra argument, do not add it to a list of
+            // reduction operations.
+            // Stack[Stack.size() - 2] always points to the parent operation.
+            markExtraArg(Stack[Stack.size() - 2], TreeN);
+            ExtraArgs.erase(TreeN);
+          } else
+            ReductionOps.push_back(TreeN);
+        }
         // Retract.
         Stack.pop_back();
         continue;
@@ -4275,31 +4416,43 @@ public:
         if (I && (!ReducedValueOpcode || I->getOpcode() == ReducedValueOpcode ||
                   I->getOpcode() == ReductionOpcode)) {
           // Only handle trees in the current basic block.
-          if (I->getParent() != B->getParent())
-            return false;
+          if (I->getParent() != B->getParent()) {
+            // I is an extra argument for TreeN (its parent operation).
+            markExtraArg(Stack.back(), I);
+            continue;
+          }
 
           // Each tree node needs to have one user except for the ultimate
           // reduction.
-          if (!I->hasOneUse() && I != B)
-            return false;
+          if (!I->hasOneUse() && I != B) {
+            // I is an extra argument for TreeN (its parent operation).
+            markExtraArg(Stack.back(), I);
+            continue;
+          }
 
           if (I->getOpcode() == ReductionOpcode) {
             // We need to be able to reassociate the reduction operations.
-            if (!I->isAssociative())
-              return false;
+            if (!I->isAssociative()) {
+              // I is an extra argument for TreeN (its parent operation).
+              markExtraArg(Stack.back(), I);
+              continue;
+            }
           } else if (ReducedValueOpcode &&
                      ReducedValueOpcode != I->getOpcode()) {
             // Make sure that the opcodes of the operations that we are going to
             // reduce match.
-            return false;
+            // I is an extra argument for TreeN (its parent operation).
+            markExtraArg(Stack.back(), I);
+            continue;
           } else if (!ReducedValueOpcode)
             ReducedValueOpcode = I->getOpcode();
 
           Stack.push_back(std::make_pair(I, 0));
           continue;
         }
-        return false;
       }
+      // NextV is an extra argument for TreeN (its parent operation).
+      markExtraArg(Stack.back(), NextV);
     }
     return true;
   }
@@ -4326,12 +4479,17 @@ public:
     Builder.setFastMathFlags(Unsafe);
     unsigned i = 0;
 
+    BoUpSLP::ExtraValueToDebugLocsMap ExternallyUsedValues;
+    // The same extra argument may be used several times, so log each attempt
+    // to use it.
+    for (auto &Pair : ExtraArgs)
+      ExternallyUsedValues[Pair.second].push_back(Pair.first);
     while (i < NumReducedVals - ReduxWidth + 1 && ReduxWidth > 2) {
       auto VL = makeArrayRef(&ReducedVals[i], ReduxWidth);
-      V.buildTree(VL, ReductionOps);
+      V.buildTree(VL, ExternallyUsedValues, ReductionOps);
       if (V.shouldReorder()) {
         SmallVector<Value *, 8> Reversed(VL.rbegin(), VL.rend());
-        V.buildTree(Reversed, ReductionOps);
+        V.buildTree(Reversed, ExternallyUsedValues, ReductionOps);
       }
       if (V.isTreeTinyAndNotFullyVectorizable())
         break;
@@ -4349,15 +4507,16 @@ public:
 
       // Vectorize a tree.
       DebugLoc Loc = cast<Instruction>(ReducedVals[i])->getDebugLoc();
-      Value *VectorizedRoot = V.vectorizeTree();
+      Value *VectorizedRoot = V.vectorizeTree(ExternallyUsedValues);
 
       // Emit a reduction.
       Value *ReducedSubTree =
-          emitReduction(VectorizedRoot, Builder, ReduxWidth);
+          emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps);
       if (VectorizedTree) {
         Builder.SetCurrentDebugLocation(Loc);
         VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
                                              ReducedSubTree, "bin.rdx");
+        propagateIRFlags(VectorizedTree, ReductionOps);
       } else
         VectorizedTree = ReducedSubTree;
       i += ReduxWidth;
@@ -4367,18 +4526,25 @@ public:
     if (VectorizedTree) {
       // Finish the reduction.
       for (; i < NumReducedVals; ++i) {
-        Builder.SetCurrentDebugLocation(
-          cast<Instruction>(ReducedVals[i])->getDebugLoc());
-        VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
-                                             ReducedVals[i]);
+        auto *I = cast<Instruction>(ReducedVals[i]);
+        Builder.SetCurrentDebugLocation(I->getDebugLoc());
+        VectorizedTree =
+            Builder.CreateBinOp(ReductionOpcode, VectorizedTree, I);
+        propagateIRFlags(VectorizedTree, ReductionOps);
+      }
+      for (auto &Pair : ExternallyUsedValues) {
+        assert(!Pair.second.empty() &&
+               "At least one DebugLoc must be inserted");
+        // Add each externally used value to the final reduction.
+        for (auto *I : Pair.second) {
+          Builder.SetCurrentDebugLocation(I->getDebugLoc());
+          VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree,
+                                               Pair.first, "bin.extra");
+          propagateIRFlags(VectorizedTree, I);
+        }
       }
       // Update users.
-      if (ReductionPHI && !isa<UndefValue>(ReductionPHI)) {
-        assert(ReductionRoot && "Need a reduction operation");
-        ReductionRoot->setOperand(0, VectorizedTree);
-        ReductionRoot->setOperand(1, ReductionPHI);
-      } else
-        ReductionRoot->replaceAllUsesWith(VectorizedTree);
+      ReductionRoot->replaceAllUsesWith(VectorizedTree);
     }
     return VectorizedTree != nullptr;
   }
@@ -4415,7 +4581,7 @@ private:
 
   /// \brief Emit a horizontal reduction of the vectorized value.
   Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder,
-                       unsigned ReduxWidth) {
+                       unsigned ReduxWidth, ArrayRef<Value *> RedOps) {
     assert(VectorizedValue && "Need to have a vectorized tree node");
     assert(isPowerOf2_32(ReduxWidth) &&
            "We only handle power-of-two reductions for now");
@@ -4442,6 +4608,7 @@ private:
           TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf");
         TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx");
       }
+      propagateIRFlags(TmpVec, RedOps);
     }
 
     // The result is in the first element of the vector.
@@ -4493,16 +4660,19 @@ static bool findBuildVector(InsertElementInst *FirstInsertElem,
 static bool findBuildAggregate(InsertValueInst *IV,
                                SmallVectorImpl<Value *> &BuildVector,
                                SmallVectorImpl<Value *> &BuildVectorOpds) {
-  if (!IV->hasOneUse())
-    return false;
-  Value *V = IV->getAggregateOperand();
-  if (!isa<UndefValue>(V)) {
-    InsertValueInst *I = dyn_cast<InsertValueInst>(V);
-    if (!I || !findBuildAggregate(I, BuildVector, BuildVectorOpds))
+  Value *V;
+  do {
+    BuildVector.push_back(IV);
+    BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+    V = IV->getAggregateOperand();
+    if (isa<UndefValue>(V))
+      break;
+    IV = dyn_cast<InsertValueInst>(V);
+    if (!IV || !IV->hasOneUse())
       return false;
-  }
-  BuildVector.push_back(IV);
-  BuildVectorOpds.push_back(IV->getInsertedValueOperand());
+  } while (true);
+  std::reverse(BuildVector.begin(), BuildVector.end());
+  std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
   return true;
 }
 
diff --git a/lib/XRay/CMakeLists.txt b/lib/XRay/CMakeLists.txt
index 6c1acba79bfab2989c3ad902f83fbc5a466b481b..8d558209d8ee7cac1497133780bdbe45ad5f366b 100644
--- a/lib/XRay/CMakeLists.txt
+++ b/lib/XRay/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_llvm_library(LLVMXRay
+  InstrumentationMap.cpp
   Trace.cpp
 
   ADDITIONAL_HEADER_DIRS
@@ -7,7 +8,9 @@ add_llvm_library(LLVMXRay
 
   DEPENDS
   LLVMSupport
+  LLVMObject
 
   LINK_LIBS
   LLVMSupport
+  LLVMObject
   )
diff --git a/lib/XRay/InstrumentationMap.cpp b/lib/XRay/InstrumentationMap.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..431c251feb65ed045a8c3d18d6a1e5768adeca67
--- /dev/null
+++ b/lib/XRay/InstrumentationMap.cpp
@@ -0,0 +1,198 @@
+//===- InstrumentationMap.cpp - XRay Instrumentation Map ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the InstrumentationMap type for XRay sleds.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Object/Binary.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/XRay/InstrumentationMap.h"
+#include <algorithm>
+#include <cstddef>
+#include <cstdint>
+#include <system_error>
+#include <vector>
+
+using namespace llvm;
+using namespace xray;
+
+Optional<int32_t> InstrumentationMap::getFunctionId(uint64_t Addr) const {
+  auto I = FunctionIds.find(Addr);
+  if (I != FunctionIds.end())
+    return I->second;
+  return None;
+}
+
+Optional<uint64_t> InstrumentationMap::getFunctionAddr(int32_t FuncId) const {
+  auto I = FunctionAddresses.find(FuncId);
+  if (I != FunctionAddresses.end())
+    return I->second;
+  return None;
+}
+
+// Reads the "xray_instr_map" section of an x86_64/ppc64le ELF object and
+// appends its sleds to Sleds, assigning sequential function ids from 1.
+static Error
+loadELF64(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
+          InstrumentationMap::SledContainer &Sleds,
+          InstrumentationMap::FunctionAddressMap &FunctionAddresses,
+          InstrumentationMap::FunctionAddressReverseMap &FunctionIds) {
+  // Only ELF objects for x86_64 and ppc64le are handled here.
+  if (!ObjFile.getBinary()->isELF() ||
+      !(ObjFile.getBinary()->getArch() == Triple::x86_64 ||
+        ObjFile.getBinary()->getArch() == Triple::ppc64le))
+    return make_error<StringError>(
+        "File format not supported (only does ELF little endian 64-bit).",
+        std::make_error_code(std::errc::not_supported));
+
+  // Find the section named "xray_instr_map".
+  StringRef Contents = "";
+  const auto &Sections = ObjFile.getBinary()->sections();
+  auto I = llvm::find_if(Sections, [&](object::SectionRef Section) {
+    StringRef Name = "";
+    if (Section.getName(Name))
+      return false;
+    return Name == "xray_instr_map";
+  });
+
+  if (I == Sections.end())
+    return make_error<StringError>(
+        "Failed to find XRay instrumentation map.",
+        std::make_error_code(std::errc::executable_format_error));
+
+  if (I->getContents(Contents))
+    return errorCodeToError(
+        std::make_error_code(std::errc::executable_format_error));
+
+  // Copy the instrumentation map data into the Sleds data structure.
+  auto C = Contents.bytes_begin();
+  static constexpr size_t ELF64SledEntrySize = 32;
+
+  if (Contents.size() % ELF64SledEntrySize != 0)
+    return make_error<StringError>(
+        Twine("Instrumentation map entries not evenly divisible by size of "
+              "an XRay sled entry in ELF64."),
+        std::make_error_code(std::errc::executable_format_error));
+
+  int32_t FuncId = 1;
+  uint64_t CurFn = 0;
+  for (; C != Contents.bytes_end(); C += ELF64SledEntrySize) {
+    DataExtractor Extractor(
+        StringRef(reinterpret_cast<const char *>(C), ELF64SledEntrySize), true,
+        8);
+    Sleds.push_back({});
+    auto &Entry = Sleds.back();
+    uint32_t OffsetPtr = 0;
+    Entry.Address = Extractor.getU64(&OffsetPtr);
+    Entry.Function = Extractor.getU64(&OffsetPtr);
+    auto Kind = Extractor.getU8(&OffsetPtr);
+    static constexpr SledEntry::FunctionKinds Kinds[] = {
+        SledEntry::FunctionKinds::ENTRY, SledEntry::FunctionKinds::EXIT,
+        SledEntry::FunctionKinds::TAIL,
+    };
+    // Compare against the element count, not the byte size, of Kinds so an
+    // out-of-range kind byte can never index past the end of the table.
+    if (Kind >= sizeof(Kinds) / sizeof(Kinds[0]))
+      return errorCodeToError(
+          std::make_error_code(std::errc::executable_format_error));
+    Entry.Kind = Kinds[Kind];
+    Entry.AlwaysInstrument = Extractor.getU8(&OffsetPtr) != 0;
+
+    // We do replicate the function id generation scheme implemented in the
+    // XRay runtime.
+    // FIXME: Figure out how to keep this consistent with the XRay runtime.
+    // A new function address takes the next id; the first one seen keeps id 1.
+    if (CurFn == 0 || Entry.Function != CurFn) {
+      if (CurFn != 0)
+        ++FuncId;
+      CurFn = Entry.Function;
+      FunctionAddresses[FuncId] = Entry.Function;
+      FunctionIds[Entry.Function] = FuncId;
+    }
+  }
+  return Error::success();
+}
+
+static Error
+loadYAML(int Fd, size_t FileSize, StringRef Filename,
+         InstrumentationMap::SledContainer &Sleds,
+         InstrumentationMap::FunctionAddressMap &FunctionAddresses,
+         InstrumentationMap::FunctionAddressReverseMap &FunctionIds) {
+  std::error_code EC;
+  sys::fs::mapped_file_region MappedFile(
+      Fd, sys::fs::mapped_file_region::mapmode::readonly, FileSize, 0, EC);
+  if (EC)
+    return make_error<StringError>(
+        Twine("Failed memory-mapping file '") + Filename + "'.", EC);
+
+  std::vector<YAMLXRaySledEntry> YAMLSleds;
+  yaml::Input In(StringRef(MappedFile.data(), MappedFile.size()));
+  In >> YAMLSleds;
+  if (In.error())
+    return make_error<StringError>(
+        Twine("Failed loading YAML document from '") + Filename + "'.",
+        In.error());
+
+  Sleds.reserve(YAMLSleds.size());
+  for (const auto &Y : YAMLSleds) {
+    FunctionAddresses[Y.FuncId] = Y.Function;
+    FunctionIds[Y.Function] = Y.FuncId;
+    Sleds.push_back(
+        SledEntry{Y.Address, Y.Function, Y.Kind, Y.AlwaysInstrument});
+  }
+  return Error::success();
+}
+
+// FIXME: Create error types that encapsulate a bit more information than what
+// StringError instances contain.
+Expected<InstrumentationMap>
+llvm::xray::loadInstrumentationMap(StringRef Filename) {
+  // At this point we assume the file is an object file -- and if that doesn't
+  // work, we treat it as YAML.
+  // FIXME: Extend to support non-ELF and non-x86_64 binaries.
+
+  InstrumentationMap Map;
+  auto ObjectFileOrError = object::ObjectFile::createObjectFile(Filename);
+  if (!ObjectFileOrError) {
+    auto E = ObjectFileOrError.takeError();
+    // We try to load it as YAML if the ELF load didn't work.
+    int Fd;
+    if (sys::fs::openFileForRead(Filename, Fd))
+      return std::move(E);
+
+    uint64_t FileSize;
+    if (sys::fs::file_size(Filename, FileSize))
+      return std::move(E);
+
+    // If the file is empty, we return the original error.
+    if (FileSize == 0)
+      return std::move(E);
+
+    // From this point on the errors will be only for the YAML parts, so we
+    // consume the errors at this point.
+    consumeError(std::move(E));
+    if (auto E = loadYAML(Fd, FileSize, Filename, Map.Sleds,
+                          Map.FunctionAddresses, Map.FunctionIds))
+      return std::move(E);
+  } else if (auto E = loadELF64(Filename, *ObjectFileOrError, Map.Sleds,
+                                Map.FunctionAddresses, Map.FunctionIds)) {
+    return std::move(E);
+  }
+  return Map;
+}
diff --git a/lib/XRay/Trace.cpp b/lib/XRay/Trace.cpp
index 51000c777de827fafba147f567cf1b6cbebb48dd..d2984697c8a9ea251c2de8ebe86ff4d78cff405c 100644
--- a/lib/XRay/Trace.cpp
+++ b/lib/XRay/Trace.cpp
@@ -24,8 +24,8 @@ using llvm::yaml::Input;
 using XRayRecordStorage =
     std::aligned_storage<sizeof(XRayRecord), alignof(XRayRecord)>::type;
 
-Error NaiveLogLoader(StringRef Data, XRayFileHeader &FileHeader,
-                     std::vector<XRayRecord> &Records) {
+// Populates the FileHeader reference by reading the first 32 bytes of the file.
+Error readBinaryFormatHeader(StringRef Data, XRayFileHeader &FileHeader) {
   // FIXME: Maybe deduce whether the data is little or big-endian using some
   // magic bytes in the beginning of the file?
 
@@ -37,16 +37,6 @@ Error NaiveLogLoader(StringRef Data, XRayFileHeader &FileHeader,
   //   (4)   uint32 : bitfield
   //   (8)   uint64 : cycle frequency
   //   (16)  -      : padding
-  //
-  if (Data.size() < 32)
-    return make_error<StringError>(
-        "Not enough bytes for an XRay log.",
-        std::make_error_code(std::errc::invalid_argument));
-
-  if (Data.size() - 32 == 0 || Data.size() % 32 != 0)
-    return make_error<StringError>(
-        "Invalid-sized XRay data.",
-        std::make_error_code(std::errc::invalid_argument));
 
   DataExtractor HeaderExtractor(Data, true, 8);
   uint32_t OffsetPtr = 0;
@@ -56,11 +46,29 @@ Error NaiveLogLoader(StringRef Data, XRayFileHeader &FileHeader,
   FileHeader.ConstantTSC = Bitfield & 1uL;
   FileHeader.NonstopTSC = Bitfield & 1uL << 1;
   FileHeader.CycleFrequency = HeaderExtractor.getU64(&OffsetPtr);
-
+  std::memcpy(&FileHeader.FreeFormData, Data.bytes_begin() + OffsetPtr, 16);
   if (FileHeader.Version != 1)
     return make_error<StringError>(
         Twine("Unsupported XRay file version: ") + Twine(FileHeader.Version),
         std::make_error_code(std::errc::invalid_argument));
+  return Error::success();
+}
+
+Error loadNaiveFormatLog(StringRef Data, XRayFileHeader &FileHeader,
+                         std::vector<XRayRecord> &Records) {
+  // Check that there is at least a header
+  if (Data.size() < 32)
+    return make_error<StringError>(
+        "Not enough bytes for an XRay log.",
+        std::make_error_code(std::errc::invalid_argument));
+
+  if (Data.size() - 32 == 0 || Data.size() % 32 != 0)
+    return make_error<StringError>(
+        "Invalid-sized XRay data.",
+        std::make_error_code(std::errc::invalid_argument));
+
+  if (auto E = readBinaryFormatHeader(Data, FileHeader))
+    return E;
 
   // Each record after the header will be 32 bytes, in the following format:
   //
@@ -98,9 +106,327 @@ Error NaiveLogLoader(StringRef Data, XRayFileHeader &FileHeader,
   return Error::success();
 }
 
-Error YAMLLogLoader(StringRef Data, XRayFileHeader &FileHeader,
-                    std::vector<XRayRecord> &Records) {
+/// When reading from a Flight Data Recorder mode log, metadata records are
+/// sparse compared to packed function records, so we must maintain state as we
+/// read through the sequence of entries. This allows the reader to denormalize
+/// the CPUId and Thread Id onto each Function Record and transform delta
+/// encoded TSC values into absolute encodings on each record.
+struct FDRState {
+  uint16_t CPUId;
+  uint16_t ThreadId;
+  uint64_t BaseTSC;
+  /// Encode some of the state transitions for the FDR log reader as explicit
+  /// checks. These are expectations for the next Record in the stream.
+  enum class Token {
+    NEW_BUFFER_RECORD_OR_EOF,
+    WALLCLOCK_RECORD,
+    NEW_CPU_ID_RECORD,
+    FUNCTION_SEQUENCE,
+    SCAN_TO_END_OF_THREAD_BUF,
+  };
+  Token Expects;
+  // Each thread's buffer may have trailing garbage to scan over, so we track
+  // our progress.
+  uint64_t CurrentBufferSize;
+  uint64_t CurrentBufferConsumed;
+};
+
+Twine fdrStateToTwine(const FDRState::Token &state) {
+  switch (state) {
+  case FDRState::Token::NEW_BUFFER_RECORD_OR_EOF:
+    return "NEW_BUFFER_RECORD_OR_EOF";
+  case FDRState::Token::WALLCLOCK_RECORD:
+    return "WALLCLOCK_RECORD";
+  case FDRState::Token::NEW_CPU_ID_RECORD:
+    return "NEW_CPU_ID_RECORD";
+  case FDRState::Token::FUNCTION_SEQUENCE:
+    return "FUNCTION_SEQUENCE";
+  case FDRState::Token::SCAN_TO_END_OF_THREAD_BUF:
+    return "SCAN_TO_END_OF_THREAD_BUF";
+  }
+  return "UNKNOWN";
+}
+
+/// State transition when a NewBufferRecord is encountered.
+Error processFDRNewBufferRecord(FDRState &State, uint8_t RecordFirstByte,
+                                DataExtractor &RecordExtractor) {
+
+  if (State.Expects != FDRState::Token::NEW_BUFFER_RECORD_OR_EOF)
+    return make_error<StringError>(
+        "Malformed log. Read New Buffer record kind out of sequence",
+        std::make_error_code(std::errc::executable_format_error));
+  uint32_t OffsetPtr = 1; // 1 byte into record.
+  State.ThreadId = RecordExtractor.getU16(&OffsetPtr);
+  State.Expects = FDRState::Token::WALLCLOCK_RECORD;
+  return Error::success();
+}
+
+/// State transition when an EndOfBufferRecord is encountered.
+Error processFDREndOfBufferRecord(FDRState &State, uint8_t RecordFirstByte,
+                                  DataExtractor &RecordExtractor) {
+  if (State.Expects == FDRState::Token::NEW_BUFFER_RECORD_OR_EOF)
+    return make_error<StringError>(
+        "Malformed log. Received EOB message without current buffer.",
+        std::make_error_code(std::errc::executable_format_error));
+  State.Expects = FDRState::Token::SCAN_TO_END_OF_THREAD_BUF;
+  return Error::success();
+}
+
+/// State transition when a NewCPUIdRecord is encountered.
+Error processFDRNewCPUIdRecord(FDRState &State, uint8_t RecordFirstByte,
+                               DataExtractor &RecordExtractor) {
+  if (State.Expects != FDRState::Token::FUNCTION_SEQUENCE &&
+      State.Expects != FDRState::Token::NEW_CPU_ID_RECORD)
+    return make_error<StringError>(
+        "Malformed log. Read NewCPUId record kind out of sequence",
+        std::make_error_code(std::errc::executable_format_error));
+  uint32_t OffsetPtr = 1; // Read starting after the first byte.
+  State.CPUId = RecordExtractor.getU16(&OffsetPtr);
+  State.BaseTSC = RecordExtractor.getU64(&OffsetPtr);
+  State.Expects = FDRState::Token::FUNCTION_SEQUENCE;
+  return Error::success();
+}
+
+/// State transition when a TSCWrapRecord (overflow detection) is encountered.
+Error processFDRTSCWrapRecord(FDRState &State, uint8_t RecordFirstByte,
+                              DataExtractor &RecordExtractor) {
+  if (State.Expects != FDRState::Token::FUNCTION_SEQUENCE)
+    return make_error<StringError>(
+        "Malformed log. Read TSCWrap record kind out of sequence",
+        std::make_error_code(std::errc::executable_format_error));
+  uint32_t OffsetPtr = 1; // Read starting after the first byte.
+  State.BaseTSC = RecordExtractor.getU64(&OffsetPtr);
+  return Error::success();
+}
+
+/// State transition when a WallTimeMarkerRecord is encountered.
+Error processFDRWallTimeRecord(FDRState &State, uint8_t RecordFirstByte,
+                               DataExtractor &RecordExtractor) {
+  if (State.Expects != FDRState::Token::WALLCLOCK_RECORD)
+    return make_error<StringError>(
+        "Malformed log. Read Wallclock record kind out of sequence",
+        std::make_error_code(std::errc::executable_format_error));
+  // We don't encode the wall time into any of the records.
+  // XRayRecords are concerned with the TSC instead.
+  State.Expects = FDRState::Token::NEW_CPU_ID_RECORD;
+  return Error::success();
+}
+
+/// Advances the state machine for reading the FDR record type by reading one
+/// Metadata Record and updating the State appropriately based on the kind of
+/// record encountered. The RecordKind is encoded in the first byte of the
+/// Record, which the caller should pass in because they have already read it
+/// to determine that this is a metadata record as opposed to a function record.
+Error processFDRMetadataRecord(FDRState &State, uint8_t RecordFirstByte,
+                               DataExtractor &RecordExtractor) {
+  // The remaining 7 bits are the RecordKind enum.
+  uint8_t RecordKind = RecordFirstByte >> 1;
+  switch (RecordKind) {
+  case 0: // NewBuffer
+    if (auto E =
+            processFDRNewBufferRecord(State, RecordFirstByte, RecordExtractor))
+      return E;
+    break;
+  case 1: // EndOfBuffer
+    if (auto E = processFDREndOfBufferRecord(State, RecordFirstByte,
+                                             RecordExtractor))
+      return E;
+    break;
+  case 2: // NewCPUId
+    if (auto E =
+            processFDRNewCPUIdRecord(State, RecordFirstByte, RecordExtractor))
+      return E;
+    break;
+  case 3: // TSCWrap
+    if (auto E =
+            processFDRTSCWrapRecord(State, RecordFirstByte, RecordExtractor))
+      return E;
+    break;
+  case 4: // WallTimeMarker
+    if (auto E =
+            processFDRWallTimeRecord(State, RecordFirstByte, RecordExtractor))
+      return E;
+    break;
+  default:
+    // Widen the record kind to unsigned to prevent conversion to char.
+    return make_error<StringError>(
+        Twine("Illegal metadata record type: ")
+            .concat(Twine(static_cast<unsigned>(RecordKind))),
+        std::make_error_code(std::errc::executable_format_error));
+  }
+  return Error::success();
+}
+
+/// Reads a function record from an FDR format log, appending a new XRayRecord
+/// to the vector being populated and updating the State with a new reference
+/// value to interpret TSC deltas.
+///
+/// The XRayRecord constructed includes information from the function record
+/// processed here as well as Thread ID and CPU ID formerly extracted into
+/// State.
+Error processFDRFunctionRecord(FDRState &State, uint8_t RecordFirstByte,
+                               DataExtractor &RecordExtractor,
+                               std::vector<XRayRecord> &Records) {
+  switch (State.Expects) {
+  case FDRState::Token::NEW_BUFFER_RECORD_OR_EOF:
+    return make_error<StringError>(
+        "Malformed log. Received Function Record before new buffer setup.",
+        std::make_error_code(std::errc::executable_format_error));
+  case FDRState::Token::WALLCLOCK_RECORD:
+    return make_error<StringError>(
+        "Malformed log. Received Function Record when expecting wallclock.",
+        std::make_error_code(std::errc::executable_format_error));
+  case FDRState::Token::NEW_CPU_ID_RECORD:
+    return make_error<StringError>(
+        "Malformed log. Received Function Record before first CPU record.",
+        std::make_error_code(std::errc::executable_format_error));
+  default:
+    Records.emplace_back();
+    auto &Record = Records.back();
+    Record.RecordType = 0; // Record is type NORMAL.
+    // Strip off record type bit and use the next three bits.
+    uint8_t RecordType = (RecordFirstByte >> 1) & 0x07;
+    switch (RecordType) {
+    case static_cast<uint8_t>(RecordTypes::ENTER):
+      Record.Type = RecordTypes::ENTER;
+      break;
+    case static_cast<uint8_t>(RecordTypes::EXIT):
+    case 2: // TAIL_EXIT is not yet defined in RecordTypes.
+      Record.Type = RecordTypes::EXIT;
+      break;
+    default:
+      // When initializing the error, convert to unsigned so that the record
+      // type isn't interpreted as a char.
+      return make_error<StringError>(
+          Twine("Illegal function record type: ")
+              .concat(Twine(static_cast<unsigned>(RecordType))),
+          std::make_error_code(std::errc::executable_format_error));
+    }
+    Record.CPU = State.CPUId;
+    Record.TId = State.ThreadId;
+    // Back up to read the first 32 bits, including the low 4 bits that hold
+    // the record kind flag and RecordType. The remaining 28 are FunctionId.
+    uint32_t OffsetPtr = 0;
+    // Despite function Id being a signed int on XRayRecord,
+    // when it is written to an FDR format, the top bits are truncated,
+    // so it is effectively an unsigned value. When we shift off the
+    // top four bits, we want the shift to be logical, so we read as
+    // uint32_t.
+    uint32_t FuncIdBitField = RecordExtractor.getU32(&OffsetPtr);
+    Record.FuncId = FuncIdBitField >> 4;
+    // FunctionRecords have a 32 bit delta from the previous absolute TSC
+    // or TSC delta. If this would overflow, we should read a TSCWrap record
+    // with an absolute TSC reading.
+    uint64_t new_tsc = State.BaseTSC + RecordExtractor.getU32(&OffsetPtr);
+    State.BaseTSC = new_tsc;
+    Record.TSC = new_tsc;
+  }
+  return Error::success();
+}
 
+/// Reads a log in FDR mode for version 1 of this binary format. FDR mode is
+/// defined as part of the compiler-rt project in xray_fdr_logging.h, and such
+/// a log consists of the familiar 32 byte XRayHeader, followed by sequences of
+/// interspersed 16 byte Metadata Records and 8 byte Function Records.
+///
+/// The following is an attempt to document the grammar of the format, which is
+/// parsed by this function for little-endian machines. Since the format makes
+/// use of BitFields, when we support big-Endian architectures, we will need to
+/// adjust not only the endianness parameter to llvm's DataExtractor, but also
+/// the bit twiddling logic, which is consistent with the little-endian
+/// convention that BitFields within a struct will first be packed into the
+/// least significant bits of the address they belong to.
+///
+/// We expect a format complying with the grammar in the following pseudo-EBNF.
+///
+/// FDRLog: XRayFileHeader ThreadBuffer*
+/// XRayFileHeader: 32 bytes to identify the log as FDR with machine metadata.
+/// ThreadBuffer: BufSize NewBuffer WallClockTime NewCPUId FunctionSequence EOB
+/// BufSize: 8 byte unsigned integer indicating how large the buffer is.
+/// NewBuffer: 16 byte metadata record with Thread Id.
+/// WallClockTime: 16 byte metadata record with human readable time.
+/// NewCPUId: 16 byte metadata record with CPUId and a 64 bit TSC reading.
+/// EOB: 16 byte record in a thread buffer plus mem garbage to fill BufSize.
+/// FunctionSequence: NewCPUId | TSCWrap | FunctionRecord
+/// TSCWrap: 16 byte metadata record with a full 64 bit TSC reading.
+/// FunctionRecord: 8 byte record with FunctionId, entry/exit, and TSC delta.
+Error loadFDRLog(StringRef Data, XRayFileHeader &FileHeader,
+                 std::vector<XRayRecord> &Records) {
+  if (Data.size() < 32)
+    return make_error<StringError>(
+        "Not enough bytes for an XRay log.",
+        std::make_error_code(std::errc::invalid_argument));
+
+  // For an FDR log, there are records sized 16 and 8 bytes.
+  // There actually may be no records if no non-trivial functions are
+  // instrumented.
+  if (Data.size() % 8 != 0)
+    return make_error<StringError>(
+        "Invalid-sized XRay data.",
+        std::make_error_code(std::errc::invalid_argument));
+
+  if (auto E = readBinaryFormatHeader(Data, FileHeader))
+    return E;
+
+  uint64_t BufferSize = 0;
+  {
+    StringRef ExtraDataRef(FileHeader.FreeFormData, 16);
+    DataExtractor ExtraDataExtractor(ExtraDataRef, true, 8);
+    uint32_t ExtraDataOffset = 0;
+    BufferSize = ExtraDataExtractor.getU64(&ExtraDataOffset);
+  }
+  FDRState State{0,          0, 0, FDRState::Token::NEW_BUFFER_RECORD_OR_EOF,
+                 BufferSize, 0};
+  // RecordSize will tell the loop how far to seek ahead based on the record
+  // type that we have just read.
+  size_t RecordSize = 0;
+  for (auto S = Data.drop_front(32); !S.empty(); S = S.drop_front(RecordSize)) {
+    DataExtractor RecordExtractor(S, true, 8);
+    uint32_t OffsetPtr = 0;
+    if (State.Expects == FDRState::Token::SCAN_TO_END_OF_THREAD_BUF) {
+      RecordSize = State.CurrentBufferSize - State.CurrentBufferConsumed;
+      if (S.size() < State.CurrentBufferSize - State.CurrentBufferConsumed) {
+        return make_error<StringError>(
+            Twine("Incomplete thread buffer. Expected ") +
+                Twine(State.CurrentBufferSize - State.CurrentBufferConsumed) +
+                " remaining bytes but found " + Twine(S.size()),
+            make_error_code(std::errc::invalid_argument));
+      }
+      State.CurrentBufferConsumed = 0;
+      State.Expects = FDRState::Token::NEW_BUFFER_RECORD_OR_EOF;
+      continue;
+    }
+    uint8_t BitField = RecordExtractor.getU8(&OffsetPtr);
+    bool isMetadataRecord = BitField & 0x01uL;
+    if (isMetadataRecord) {
+      RecordSize = 16;
+      if (auto E = processFDRMetadataRecord(State, BitField, RecordExtractor))
+        return E;
+      State.CurrentBufferConsumed += RecordSize;
+    } else { // Process Function Record
+      RecordSize = 8;
+      if (auto E = processFDRFunctionRecord(State, BitField, RecordExtractor,
+                                            Records))
+        return E;
+      State.CurrentBufferConsumed += RecordSize;
+    }
+  }
+  // EOF is valid only before a new buffer or after a fully consumed one.
+  if (State.Expects != FDRState::Token::NEW_BUFFER_RECORD_OR_EOF &&
+      !(State.Expects == FDRState::Token::SCAN_TO_END_OF_THREAD_BUF &&
+        State.CurrentBufferSize == State.CurrentBufferConsumed))
+    return make_error<StringError>(
+        Twine("Encountered EOF with unexpected state expectation ") +
+            fdrStateToTwine(State.Expects) +
+            ". Remaining expected bytes in thread buffer total " +
+            Twine(State.CurrentBufferSize - State.CurrentBufferConsumed),
+        std::make_error_code(std::errc::executable_format_error));
+
+  return Error::success();
+}
+
+Error loadYAMLLog(StringRef Data, XRayFileHeader &FileHeader,
+                  std::vector<XRayRecord> &Records) {
   // Load the documents from the MappedFile.
   YAMLXRayTrace Trace;
   Input In(Data);
@@ -175,14 +501,21 @@ Expected<Trace> llvm::xray::loadTraceFile(StringRef Filename, bool Sort) {
   uint16_t Version = HeaderExtractor.getU16(&OffsetPtr);
   uint16_t Type = HeaderExtractor.getU16(&OffsetPtr);
 
+  enum BinaryFormatType { NAIVE_FORMAT = 0, FLIGHT_DATA_RECORDER_FORMAT = 1 };
+
   Trace T;
-  if (Version == 1 && (Type == 0 || Type == 1)) {
-    if (auto E = NaiveLogLoader(StringRef(MappedFile.data(), MappedFile.size()),
-                                T.FileHeader, T.Records))
+  if (Version == 1 && Type == NAIVE_FORMAT) {
+    if (auto E =
+            loadNaiveFormatLog(StringRef(MappedFile.data(), MappedFile.size()),
+                               T.FileHeader, T.Records))
+      return std::move(E);
+  } else if (Version == 1 && Type == FLIGHT_DATA_RECORDER_FORMAT) {
+    if (auto E = loadFDRLog(StringRef(MappedFile.data(), MappedFile.size()),
+                            T.FileHeader, T.Records))
       return std::move(E);
   } else {
-    if (auto E = YAMLLogLoader(StringRef(MappedFile.data(), MappedFile.size()),
-                               T.FileHeader, T.Records))
+    if (auto E = loadYAMLLog(StringRef(MappedFile.data(), MappedFile.size()),
+                             T.FileHeader, T.Records))
       return std::move(E);
   }
 
diff --git a/runtimes/CMakeLists.txt b/runtimes/CMakeLists.txt
index f5d16952b4064a9a8f415f512ffb6b50639547e7..8be1d9e7c523425b35422d4ac196dc9342252b3d 100644
--- a/runtimes/CMakeLists.txt
+++ b/runtimes/CMakeLists.txt
@@ -61,6 +61,10 @@ if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_CURRENT_SOURCE_DIR})
   # This variable makes sure that e.g. llvm-lit is found.
   set(LLVM_MAIN_SRC_DIR ${LLVM_BUILD_MAIN_SRC_DIR})
 
+  if(APPLE)
+   set(LLVM_ENABLE_LIBCXX ON CACHE BOOL "")
+  endif()
+
   # Handle common options used by all runtimes.
   include(AddLLVM)
   include(HandleLLVMOptions)
@@ -145,11 +149,15 @@ else() # if this is included from LLVM's CMake
   set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS ${LLVM_BINARY_DIR}/runtimes/Components.cmake)
   include(LLVMExternalProjectUtils)
 
+  if(NOT LLVM_BUILD_RUNTIMES)
+    set(EXTRA_ARGS EXCLUDE_FROM_ALL)
+  endif()
+
   # If compiler-rt is present we need to build the builtin libraries first. This
   # is required because the other runtimes need the builtin libraries present
   # before the just-built compiler can pass the configuration tests.
   if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/compiler-rt)
-    if(APPLE OR NOT LLVM_BUILTIN_TARGETS)
+    if(NOT LLVM_BUILTIN_TARGETS)
       llvm_ExternalProject_Add(builtins
                                ${CMAKE_CURRENT_SOURCE_DIR}/compiler-rt/lib/builtins
                                CMAKE_ARGS -DLLVM_LIBRARY_OUTPUT_INTDIR=${LLVM_LIBRARY_DIR}
@@ -157,11 +165,20 @@ else() # if this is included from LLVM's CMake
                                           -DCMAKE_C_COMPILER_TARGET=${TARGET_TRIPLE}
                                           -DCMAKE_ASM_COMPILER_TARGET=${TARGET_TRIPLE}
                                PASSTHROUGH_PREFIXES COMPILER_RT
-                               USE_TOOLCHAIN)
+                               USE_TOOLCHAIN
+                               ${EXTRA_ARGS})
     else()
       get_cmake_property(variableNames VARIABLES)
       add_custom_target(builtins)
       foreach(target ${LLVM_BUILTIN_TARGETS})
+        string(REPLACE "-" ";" builtin_target_list ${target})
+        foreach(item ${builtin_target_list})
+          string(TOLOWER "${item}" item_lower)
+          if(item_lower MATCHES "darwin")
+            message(FATAL_ERROR "LLVM_BUILTIN_TARGETS isn't implemented for Darwin platform!")
+          endif()
+        endforeach()
+
         foreach(variableName ${variableNames})
           if(variableName MATCHES "^BUILTINS_${target}")
             string(REPLACE "BUILTINS_${target}_" "" new_name ${variableName})
@@ -178,8 +195,10 @@ else() # if this is included from LLVM's CMake
                                           -DCMAKE_ASM_COMPILER_WORKS=On
                                           -DCOMPILER_RT_DEFAULT_TARGET_ONLY=On
                                           ${${target}_extra_args}
+                               TOOLCHAIN_TOOLS clang lld llvm-ar llvm-ranlib
                                PASSTHROUGH_PREFIXES COMPILER_RT
-                               USE_TOOLCHAIN)
+                               USE_TOOLCHAIN
+                               ${EXTRA_ARGS})
         add_dependencies(builtins builtins-${target})
       endforeach()
     endif()
@@ -238,7 +257,8 @@ else() # if this is included from LLVM's CMake
                                             ${SUB_COMPONENTS}
                                             ${SUB_COMPONENT_CHECK_TARGETS}
                                             ${SUB_INSTALL_TARGETS}
-                             USE_TOOLCHAIN)
+                             USE_TOOLCHAIN
+                             ${EXTRA_ARGS})
     
     # TODO: This is a hack needed because the libcxx headers are copied into the
     # build directory during configuration. Without that step the clang in the
diff --git a/test/Analysis/BasicAA/call-attrs.ll b/test/Analysis/BasicAA/call-attrs.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9cd17e486799e155a3cd191eb8be826e7cae6882
--- /dev/null
+++ b/test/Analysis/BasicAA/call-attrs.ll
@@ -0,0 +1,42 @@
+; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s
+
+declare void @readonly_attr(i8* readonly nocapture)
+declare void @writeonly_attr(i8* writeonly nocapture)
+declare void @readnone_attr(i8* readnone nocapture)
+
+declare void @readonly_func(i8* nocapture) readonly
+declare void @writeonly_func(i8* nocapture) writeonly
+declare void @readnone_func(i8* nocapture) readnone
+
+declare void @read_write(i8* writeonly nocapture, i8* readonly nocapture, i8* readnone nocapture)
+
+declare void @func()
+
+define void @test(i8* noalias %p) {
+entry:
+  call void @readonly_attr(i8* %p)
+  call void @readonly_func(i8* %p)
+
+  call void @writeonly_attr(i8* %p)
+  call void @writeonly_func(i8* %p)
+
+  call void @readnone_attr(i8* %p)
+  call void @readnone_func(i8* %p)
+
+  call void @read_write(i8* %p, i8* %p, i8* %p)
+
+  call void @func() ["deopt" (i8* %p)]
+  call void @writeonly_attr(i8* %p) ["deopt" (i8* %p)]
+
+  ret void
+}
+
+; CHECK:  Just Ref:  Ptr: i8* %p	<->  call void @readonly_attr(i8* %p)
+; CHECK:  Just Ref:  Ptr: i8* %p	<->  call void @readonly_func(i8* %p)
+; CHECK:  Just Mod:  Ptr: i8* %p	<->  call void @writeonly_attr(i8* %p)
+; CHECK:  Just Mod:  Ptr: i8* %p	<->  call void @writeonly_func(i8* %p)
+; CHECK:  NoModRef:  Ptr: i8* %p	<->  call void @readnone_attr(i8* %p)
+; CHECK:  NoModRef:  Ptr: i8* %p	<->  call void @readnone_func(i8* %p)
+; CHECK:  Both ModRef:  Ptr: i8* %p	<->  call void @read_write(i8* %p, i8* %p, i8* %p)
+; CHECK:  Just Ref:  Ptr: i8* %p	<->  call void @func() [ "deopt"(i8* %p) ]
+; CHECK:  Both ModRef:  Ptr: i8* %p	<->  call void @writeonly_attr(i8* %p) [ "deopt"(i8* %p) ]
diff --git a/test/Analysis/BasicAA/modref.ll b/test/Analysis/BasicAA/modref.ll
index e42793936c3dcffa8edf718312c71195f0f33761..71a3eac3a74e60193fea415f40eca6f86b40178b 100644
--- a/test/Analysis/BasicAA/modref.ll
+++ b/test/Analysis/BasicAA/modref.ll
@@ -1,7 +1,7 @@
 ; RUN: opt < %s -basicaa -gvn -dse -S | FileCheck %s
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
 
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 
 declare void @external(i32*) 
 
@@ -67,7 +67,7 @@ define void @test3(i8* %P, i8 %X) {
   %P2 = getelementptr i8, i8* %P, i32 2
   store i8 %Y, i8* %P2  ;; Not read by lifetime.end, should be removed.
 ; CHECK: store i8 2, i8* %P2
-  call void @llvm.lifetime.end(i64 1, i8* %P)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %P)
   store i8 2, i8* %P2
 ; CHECK-NOT: store
   ret void
@@ -81,7 +81,7 @@ define void @test3a(i8* %P, i8 %X) {
   %P2 = getelementptr i8, i8* %P, i32 2
   store i8 %Y, i8* %P2
 ; CHECK-NEXT: call void @llvm.lifetime.end
-  call void @llvm.lifetime.end(i64 10, i8* %P)
+  call void @llvm.lifetime.end.p0i8(i64 10, i8* %P)
   ret void
 ; CHECK-NEXT: ret void
 }
diff --git a/test/Analysis/BranchProbabilityInfo/basic.ll b/test/Analysis/BranchProbabilityInfo/basic.ll
index 67d3e9e850c3fef8444ea7c81aa7087fa303db6b..94ea5a3d1d8ea21a948c9388314348d851f17ee3 100644
--- a/test/Analysis/BranchProbabilityInfo/basic.ll
+++ b/test/Analysis/BranchProbabilityInfo/basic.ll
@@ -143,6 +143,43 @@ exit:
 
 declare i32 @regular_function(i32 %i)
 
+define i32 @test_cold_call_sites_with_prof(i32 %a, i32 %b, i1 %flag, i1 %flag2) {
+; CHECK: Printing analysis {{.*}} for function 'test_cold_call_sites_with_prof'
+entry:
+  br i1 %flag, label %then, label %else
+; CHECK: edge entry -> then probability is 0x07878788 / 0x80000000 = 5.88%
+; CHECK: edge entry -> else probability is 0x78787878 / 0x80000000 = 94.12% [HOT edge]
+
+then:
+  br i1 %flag2, label %then2, label %else2, !prof !3
+; CHECK: edge then -> then2 probability is 0x7ebb907a / 0x80000000 = 99.01% [HOT edge]
+; CHECK: edge then -> else2 probability is 0x01446f86 / 0x80000000 = 0.99%
+
+then2:
+  br label %join
+; CHECK: edge then2 -> join probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
+
+else2:
+  br label %join
+; CHECK: edge else2 -> join probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
+
+join:
+  %joinresult = phi i32 [ %a, %then2 ], [ %b, %else2 ]
+  call void @coldfunc()
+  br label %exit
+; CHECK: edge join -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
+
+else:
+  br label %exit
+; CHECK: edge else -> exit probability is 0x80000000 / 0x80000000 = 100.00% [HOT edge]
+
+exit:
+  %result = phi i32 [ %joinresult, %join ], [ %b, %else ]
+  ret i32 %result
+}
+
+!3 = !{!"branch_weights", i32 100, i32 1}
+
 define i32 @test_cold_call_sites(i32* %a) {
 ; Test that edges to blocks post-dominated by cold call sites
 ; are marked as not expected to be taken.
diff --git a/test/Analysis/ConstantFolding/gep-constanfolding-error.ll b/test/Analysis/ConstantFolding/gep-constanfolding-error.ll
new file mode 100644
index 0000000000000000000000000000000000000000..50ad61a8f100a900cb7d47020c043d92828d0de1
--- /dev/null
+++ b/test/Analysis/ConstantFolding/gep-constanfolding-error.ll
@@ -0,0 +1,52 @@
+; RUN: opt -gvn -S -o - %s | FileCheck %s
+; RUN: opt -newgvn -S -o - %s | FileCheck %s
+; Test that the constantfolding getelementptr computation results in
+; j[5][4][1] (j+239)
+; and not [1][4][4][1] (#449) which is an incorrect out-of-range error
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv7-none-eabi"
+
+@f = local_unnamed_addr global i32 2, align 4
+@t6 = local_unnamed_addr global i32 1, align 4
+@j = local_unnamed_addr global [6 x [6 x [7 x i8]]] [[6 x [7 x i8]] [[7 x i8] c"\06\00\00\00\00\00\00", [7 x i8] zeroinitializer, [7 x i8] zeroinitializer, [7 x i8] zeroinitializer, [7 x i8] zeroinitializer, [7 x i8] zeroinitializer], [6 x [7 x i8]] zeroinitializer, [6 x [7 x i8]] zeroinitializer, [6 x [7 x i8]] zeroinitializer, [6 x [7 x i8]] zeroinitializer, [6 x [7 x i8]] zeroinitializer], align 1
+@p = internal global i64 0, align 8
+@y = local_unnamed_addr global i64* @p, align 4
+@b = internal unnamed_addr global i32 0, align 4
+@h = common local_unnamed_addr global i16 0, align 2
+@a = common local_unnamed_addr global i32 0, align 4
+@k = common local_unnamed_addr global i32 0, align 4
+@t11 = common local_unnamed_addr global i32 0, align 4
+
+; Function Attrs: nounwind
+define i32 @main() local_unnamed_addr {
+entry:
+  %0 = load i32, i32* @t6, align 4
+  %inc = add nsw i32 %0, 1
+  store i32 %inc, i32* @t6, align 4
+  store i16 4, i16* @h, align 2
+  %1 = load i32, i32* @a, align 4
+  %conv = trunc i32 %1 to i8
+  store i32 1, i32* @f, align 4
+  %2 = load i64, i64* @p, align 8
+  %cmp4 = icmp slt i64 %2, 2
+  %conv6 = zext i1 %cmp4 to i8
+  %3 = load i16, i16* @h, align 2
+  %conv7 = sext i16 %3 to i32
+  %add = add nsw i32 %conv7, 1
+  %f.promoted = load i32, i32* @f, align 4
+  %4 = mul i32 %conv7, 7
+  %5 = add i32 %4, 5
+  %6 = sub i32 -1, %f.promoted
+  %7 = icmp sgt i32 %6, -2
+  %smax = select i1 %7, i32 %6, i32 -2
+  %8 = sub i32 6, %smax
+  %scevgep = getelementptr [6 x [6 x [7 x i8]]], [6 x [6 x [7 x i8]]]* @j, i32 0, i32 0, i32 %5, i32 %8
+  %9 = add i32 %f.promoted, %smax
+  %10 = add i32 %9, 2
+  call void @llvm.memset.p0i8.i32(i8* %scevgep, i8 %conv6, i32 %10, i32 1, i1 false)
+; CHECK:  call void @llvm.memset.p0i8.i32(i8* getelementptr inbounds ([6 x [6 x [7 x i8]]], [6 x [6 x [7 x i8]]]* @j, i32 0, i64 5, i64 4, i32 1), i8 %conv6, i32 1, i32 1, i1 false)
+; CHECK-NOT: call void @llvm.memset.p0i8.i32(i8* getelementptr ([6 x [6 x [7 x i8]]], [6 x [6 x [7 x i8]]]* @j, i64 1, i64 4, i64 4, i32 1)
+  ret i32 0
+}
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i32, i1)
diff --git a/test/Analysis/ConstantFolding/timeout.ll b/test/Analysis/ConstantFolding/timeout.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3d28c2adbe48c1c5a354aa7d59d3d1b72bf32619
--- /dev/null
+++ b/test/Analysis/ConstantFolding/timeout.ll
@@ -0,0 +1,73 @@
+; NOTE: This is a timeout test for some O(something silly) constant folding behaviour. It may not be the best test. Providing it finishes, it passes.
+; RUN: opt < %s -O3 -S | FileCheck %s
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv8-none-eabi"
+
+%struct.ST = type { %struct.ST* }
+
+@global = internal global [121 x i8] zeroinitializer, align 1
+
+define void @func() #0 {
+;CHECK-LABEL: func
+entry:
+  %s = alloca %struct.ST*, align 4
+  %j = alloca i32, align 4
+  store %struct.ST* bitcast ([121 x i8]* @global to %struct.ST*), %struct.ST** %s, align 4
+  store i32 0, i32* %j, align 4
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %0 = load i32, i32* %j, align 4
+  %cmp = icmp slt i32 %0, 30
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:                                         ; preds = %for.cond
+  %1 = load %struct.ST*, %struct.ST** %s, align 4
+  %2 = bitcast %struct.ST* %1 to i8*
+  %add.ptr = getelementptr inbounds i8, i8* %2, i32 4
+  %3 = ptrtoint i8* %add.ptr to i32
+  %4 = load %struct.ST*, %struct.ST** %s, align 4
+  %5 = bitcast %struct.ST* %4 to i8*
+  %add.ptr1 = getelementptr inbounds i8, i8* %5, i32 4
+  %6 = ptrtoint i8* %add.ptr1 to i32
+  %rem = urem i32 %6, 2
+  %cmp2 = icmp eq i32 %rem, 0
+  br i1 %cmp2, label %cond.true, label %cond.false
+
+cond.true:                                        ; preds = %for.body
+  br label %cond.end
+
+cond.false:                                       ; preds = %for.body
+  %7 = load %struct.ST*, %struct.ST** %s, align 4
+  %8 = bitcast %struct.ST* %7 to i8*
+  %add.ptr3 = getelementptr inbounds i8, i8* %8, i32 4
+  %9 = ptrtoint i8* %add.ptr3 to i32
+  %rem4 = urem i32 %9, 2
+  br label %cond.end
+
+cond.end:                                         ; preds = %cond.false, %cond.true
+  %cond = phi i32 [ 0, %cond.true ], [ %rem4, %cond.false ]
+  %add = add i32 %3, %cond
+  %10 = inttoptr i32 %add to %struct.ST*
+  %11 = load %struct.ST*, %struct.ST** %s, align 4
+  %next = getelementptr inbounds %struct.ST, %struct.ST* %11, i32 0, i32 0
+  store %struct.ST* %10, %struct.ST** %next, align 4
+  %12 = load %struct.ST*, %struct.ST** %s, align 4
+  %next5 = getelementptr inbounds %struct.ST, %struct.ST* %12, i32 0, i32 0
+  %13 = load %struct.ST*, %struct.ST** %next5, align 4
+  store %struct.ST* %13, %struct.ST** %s, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %cond.end
+  %14 = load i32, i32* %j, align 4
+  %inc = add nsw i32 %14, 1
+  store i32 %inc, i32* %j, align 4
+  br label %for.cond
+
+for.end:                                          ; preds = %for.cond
+  %15 = load %struct.ST*, %struct.ST** %s, align 4
+  %next6 = getelementptr inbounds %struct.ST, %struct.ST* %15, i32 0, i32 0
+  store %struct.ST* null, %struct.ST** %next6, align 4
+  ret void
+}
+
diff --git a/test/Analysis/CostModel/AMDGPU/add-sub.ll b/test/Analysis/CostModel/AMDGPU/add-sub.ll
index 76b21d26faaa43b1342462073a8fb2f9da7c29b3..6419eb11b2be5d581770c9b1d0bbe12167fdd58c 100644
--- a/test/Analysis/CostModel/AMDGPU/add-sub.ll
+++ b/test/Analysis/CostModel/AMDGPU/add-sub.ll
@@ -3,7 +3,7 @@
 
 ; CHECK: 'add_i32'
 ; CHECK: estimated cost of 1 for {{.*}} add i32
-define void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %add = add i32 %vec, %b
   store i32 %add, i32 addrspace(1)* %out
@@ -12,7 +12,7 @@ define void @add_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #
 
 ; CHECK: 'add_v2i32'
 ; CHECK: estimated cost of 2 for {{.*}} add <2 x i32>
-define void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
+define amdgpu_kernel void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
   %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
   %add = add <2 x i32> %vec, %b
   store <2 x i32> %add, <2 x i32> addrspace(1)* %out
@@ -21,7 +21,7 @@ define void @add_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %va
 
 ; CHECK: 'add_v3i32'
 ; CHECK: estimated cost of 3 for {{.*}} add <3 x i32>
-define void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
+define amdgpu_kernel void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
   %add = add <3 x i32> %vec, %b
   store <3 x i32> %add, <3 x i32> addrspace(1)* %out
@@ -30,7 +30,7 @@ define void @add_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %va
 
 ; CHECK: 'add_v4i32'
 ; CHECK: estimated cost of 4 for {{.*}} add <4 x i32>
-define void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
+define amdgpu_kernel void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
   %add = add <4 x i32> %vec, %b
   store <4 x i32> %add, <4 x i32> addrspace(1)* %out
@@ -39,7 +39,7 @@ define void @add_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %va
 
 ; CHECK: 'add_i64'
 ; CHECK: estimated cost of 2 for {{.*}} add i64
-define void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %add = add i64 %vec, %b
   store i64 %add, i64 addrspace(1)* %out
@@ -48,7 +48,7 @@ define void @add_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #
 
 ; CHECK: 'add_v2i64'
 ; CHECK: estimated cost of 4 for {{.*}} add <2 x i64>
-define void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
+define amdgpu_kernel void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
   %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
   %add = add <2 x i64> %vec, %b
   store <2 x i64> %add, <2 x i64> addrspace(1)* %out
@@ -57,7 +57,7 @@ define void @add_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %va
 
 ; CHECK: 'add_v3i64'
 ; CHECK: estimated cost of 6 for {{.*}} add <3 x i64>
-define void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
+define amdgpu_kernel void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
   %add = add <3 x i64> %vec, %b
   store <3 x i64> %add, <3 x i64> addrspace(1)* %out
@@ -66,7 +66,7 @@ define void @add_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %va
 
 ; CHECK: 'add_v4i64'
 ; CHECK: estimated cost of 8 for {{.*}} add <4 x i64>
-define void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
+define amdgpu_kernel void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
   %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
   %add = add <4 x i64> %vec, %b
   store <4 x i64> %add, <4 x i64> addrspace(1)* %out
@@ -75,7 +75,7 @@ define void @add_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %va
 
 ; CHECK: 'add_v16i64'
 ; CHECK: estimated cost of 32 for {{.*}} add <16 x i64>
-define void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %vaddr, <16 x i64> %b) #0 {
+define amdgpu_kernel void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %vaddr, <16 x i64> %b) #0 {
   %vec = load <16 x i64>, <16 x i64> addrspace(1)* %vaddr
   %add = add <16 x i64> %vec, %b
   store <16 x i64> %add, <16 x i64> addrspace(1)* %out
@@ -84,7 +84,7 @@ define void @add_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)*
 
 ; CHECK: 'add_i16'
 ; CHECK: estimated cost of 1 for {{.*}} add i16
-define void @add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
+define amdgpu_kernel void @add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
   %add = add i16 %vec, %b
   store i16 %add, i16 addrspace(1)* %out
@@ -93,7 +93,7 @@ define void @add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #
 
 ; CHECK: 'add_v2i16'
 ; CHECK: estimated cost of 2 for {{.*}} add <2 x i16>
-define void @add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
+define amdgpu_kernel void @add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
   %add = add <2 x i16> %vec, %b
   store <2 x i16> %add, <2 x i16> addrspace(1)* %out
@@ -102,7 +102,7 @@ define void @add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %va
 
 ; CHECK: 'sub_i32'
 ; CHECK: estimated cost of 1 for {{.*}} sub i32
-define void @sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %sub = sub i32 %vec, %b
   store i32 %sub, i32 addrspace(1)* %out
@@ -111,7 +111,7 @@ define void @sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #
 
 ; CHECK: 'sub_i64'
 ; CHECK: estimated cost of 2 for {{.*}} sub i64
-define void @sub_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @sub_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %sub = sub i64 %vec, %b
   store i64 %sub, i64 addrspace(1)* %out
@@ -119,7 +119,7 @@ define void @sub_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #
 }
 ; CHECK: 'sub_i16'
 ; CHECK: estimated cost of 1 for {{.*}} sub i16
-define void @sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
+define amdgpu_kernel void @sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #0 {
   %vec = load i16, i16 addrspace(1)* %vaddr
   %sub = sub i16 %vec, %b
   store i16 %sub, i16 addrspace(1)* %out
@@ -128,7 +128,7 @@ define void @sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %vaddr, i16 %b) #
 
 ; CHECK: 'sub_v2i16'
 ; CHECK: estimated cost of 2 for {{.*}} sub <2 x i16>
-define void @sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
+define amdgpu_kernel void @sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, <2 x i16> %b) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
   %sub = sub <2 x i16> %vec, %b
   store <2 x i16> %sub, <2 x i16> addrspace(1)* %out
diff --git a/test/Analysis/CostModel/AMDGPU/bit-ops.ll b/test/Analysis/CostModel/AMDGPU/bit-ops.ll
index a809dbd77bbf87dcb6d603ac5ceb99e8d92f94b5..aa70f5032cbcc602780d41086249d6697319f15a 100644
--- a/test/Analysis/CostModel/AMDGPU/bit-ops.ll
+++ b/test/Analysis/CostModel/AMDGPU/bit-ops.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: 'or_i32'
 ; CHECK: estimated cost of 1 for {{.*}} or i32
-define void @or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %or = or i32 %vec, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -11,7 +11,7 @@ define void @or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0
 
 ; CHECK: 'or_i64'
 ; CHECK: estimated cost of 2 for {{.*}} or i64
-define void @or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = or i64 %vec, %b
   store i64 %or, i64 addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0
 
 ; CHECK: 'xor_i32'
 ; CHECK: estimated cost of 1 for {{.*}} xor i32
-define void @xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %or = xor i32 %vec, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #
 
 ; CHECK: 'xor_i64'
 ; CHECK: estimated cost of 2 for {{.*}} xor i64
-define void @xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = xor i64 %vec, %b
   store i64 %or, i64 addrspace(1)* %out
@@ -39,7 +39,7 @@ define void @xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #
 
 ; CHECK: 'and_i32'
 ; CHECK: estimated cost of 1 for {{.*}} and i32
-define void @and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %or = and i32 %vec, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -48,7 +48,7 @@ define void @and_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #
 
 ; CHECK: 'and_i64'
 ; CHECK: estimated cost of 2 for {{.*}} and i64
-define void @and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = and i64 %vec, %b
   store i64 %or, i64 addrspace(1)* %out
diff --git a/test/Analysis/CostModel/AMDGPU/br.ll b/test/Analysis/CostModel/AMDGPU/br.ll
index 0b9649397563cfb102a1027a0dcde9ade8bb35af..494f8d2c8b2c3b44e703fd6f357046908dedb829 100644
--- a/test/Analysis/CostModel/AMDGPU/br.ll
+++ b/test/Analysis/CostModel/AMDGPU/br.ll
@@ -4,7 +4,7 @@
 ; CHECK: estimated cost of 10 for instruction: br i1
 ; CHECK: estimated cost of 10 for instruction: br label
 ; CHECK: estimated cost of 10 for instruction: ret void
-define void @test_br_cost(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @test_br_cost(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
 bb0:
   br i1 undef, label %bb1, label %bb2
 
@@ -21,7 +21,7 @@ bb2:
 
 ; CHECK: 'test_switch_cost'
 ; CHECK: Unknown cost for instruction:   switch
-define void @test_switch_cost(i32 %a) #0 {
+define amdgpu_kernel void @test_switch_cost(i32 %a) #0 {
 entry:
   switch i32 %a, label %default [
     i32 0, label %case0
diff --git a/test/Analysis/CostModel/AMDGPU/extractelement.ll b/test/Analysis/CostModel/AMDGPU/extractelement.ll
index c328d7686466fcf80e5b716c6564e93d2b339d4e..1efbb5873acb279c3cefa6ee0d67de19d9f990fe 100644
--- a/test/Analysis/CostModel/AMDGPU/extractelement.ll
+++ b/test/Analysis/CostModel/AMDGPU/extractelement.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: 'extractelement_v2i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i32>
-define void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
   %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
   %elt = extractelement <2 x i32> %vec, i32 1
   store i32 %elt, i32 addrspace(1)* %out
@@ -11,7 +11,7 @@ define void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)
 
 ; CHECK: 'extractelement_v2f32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x float>
-define void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %elt = extractelement <2 x float> %vec, i32 1
   store float %elt, float addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspac
 
 ; CHECK: 'extractelement_v3i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i32>
-define void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr) {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
   %elt = extractelement <3 x i32> %vec, i32 1
   store i32 %elt, i32 addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @extractelement_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)
 
 ; CHECK: 'extractelement_v4i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i32>
-define void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) {
   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
   %elt = extractelement <4 x i32> %vec, i32 1
   store i32 %elt, i32 addrspace(1)* %out
@@ -38,7 +38,7 @@ define void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)
 
 ; CHECK: 'extractelement_v8i32'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i32>
-define void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) {
   %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
   %elt = extractelement <8 x i32> %vec, i32 1
   store i32 %elt, i32 addrspace(1)* %out
@@ -48,7 +48,7 @@ define void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)
 ; FIXME: Should be non-0
 ; CHECK: 'extractelement_v8i32_dynindex'
 ; CHECK: estimated cost of 2 for {{.*}} extractelement <8 x i32>
-define void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) {
+define amdgpu_kernel void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) {
   %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr
   %elt = extractelement <8 x i32> %vec, i32 %idx
   store i32 %elt, i32 addrspace(1)* %out
@@ -57,7 +57,7 @@ define void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> add
 
 ; CHECK: 'extractelement_v2i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i64>
-define void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
   %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
   %elt = extractelement <2 x i64> %vec, i64 1
   store i64 %elt, i64 addrspace(1)* %out
@@ -66,7 +66,7 @@ define void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)
 
 ; CHECK: 'extractelement_v3i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i64>
-define void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) {
   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
   %elt = extractelement <3 x i64> %vec, i64 1
   store i64 %elt, i64 addrspace(1)* %out
@@ -75,7 +75,7 @@ define void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)
 
 ; CHECK: 'extractelement_v4i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i64>
-define void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) {
   %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
   %elt = extractelement <4 x i64> %vec, i64 1
   store i64 %elt, i64 addrspace(1)* %out
@@ -84,7 +84,7 @@ define void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)
 
 ; CHECK: 'extractelement_v8i64'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i64>
-define void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) {
   %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
   %elt = extractelement <8 x i64> %vec, i64 1
   store i64 %elt, i64 addrspace(1)* %out
@@ -93,7 +93,7 @@ define void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)
 
 ; CHECK: 'extractelement_v4i8'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i8>
-define void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) {
   %vec = load <4 x i8>, <4 x i8> addrspace(1)* %vaddr
   %elt = extractelement <4 x i8> %vec, i8 1
   store i8 %elt, i8 addrspace(1)* %out
@@ -102,7 +102,7 @@ define void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %
 
 ; CHECK: 'extractelement_v2i16'
 ; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i16>
-define void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
   %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
   %elt = extractelement <2 x i16> %vec, i16 1
   store i16 %elt, i16 addrspace(1)* %out
diff --git a/test/Analysis/CostModel/AMDGPU/fabs.ll b/test/Analysis/CostModel/AMDGPU/fabs.ll
index 9c551ec8afe51274b2a347d9bb9734001e9ad8e1..0d49e2967d2d9128af340c9f974542d53b28007d 100644
--- a/test/Analysis/CostModel/AMDGPU/fabs.ll
+++ b/test/Analysis/CostModel/AMDGPU/fabs.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: 'fabs_f32'
 ; CHECK: estimated cost of 0 for {{.*}} call float @llvm.fabs.f32
-define void @fabs_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
   %vec = load float, float addrspace(1)* %vaddr
   %fabs = call float @llvm.fabs.f32(float %vec) #1
   store float %fabs, float addrspace(1)* %out
@@ -11,7 +11,7 @@ define void @fabs_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr) #0 {
 
 ; CHECK: 'fabs_v2f32'
 ; CHECK: estimated cost of 0 for {{.*}} call <2 x float> @llvm.fabs.v2f32
-define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %vec) #1
   store <2 x float> %fabs, <2 x float> addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)
 
 ; CHECK: 'fabs_v3f32'
 ; CHECK: estimated cost of 0 for {{.*}} call <3 x float> @llvm.fabs.v3f32
-define void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %fabs = call <3 x float> @llvm.fabs.v3f32(<3 x float> %vec) #1
   store <3 x float> %fabs, <3 x float> addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @fabs_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)
 
 ; CHECK: 'fabs_f64'
 ; CHECK: estimated cost of 0 for {{.*}} call double @llvm.fabs.f64
-define void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %fabs = call double @llvm.fabs.f64(double %vec) #1
   store double %fabs, double addrspace(1)* %out
@@ -38,7 +38,7 @@ define void @fabs_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr) #0
 
 ; CHECK: 'fabs_v2f64'
 ; CHECK: estimated cost of 0 for {{.*}} call <2 x double> @llvm.fabs.v2f64
-define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %vec) #1
   store <2 x double> %fabs, <2 x double> addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(
 
 ; CHECK: 'fabs_v3f64'
 ; CHECK: estimated cost of 0 for {{.*}} call <3 x double> @llvm.fabs.v3f64
-define void @fabs_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %fabs = call <3 x double> @llvm.fabs.v3f64(<3 x double> %vec) #1
   store <3 x double> %fabs, <3 x double> addrspace(1)* %out
@@ -56,7 +56,7 @@ define void @fabs_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(
 
 ; CHECK: 'fabs_f16'
 ; CHECK: estimated cost of 0 for {{.*}} call half @llvm.fabs.f16
-define void @fabs_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
   %vec = load half, half addrspace(1)* %vaddr
   %fabs = call half @llvm.fabs.f16(half %vec) #1
   store half %fabs, half addrspace(1)* %out
@@ -65,7 +65,7 @@ define void @fabs_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr) #0 {
 
 ; CHECK: 'fabs_v2f16'
 ; CHECK: estimated cost of 0 for {{.*}} call <2 x half> @llvm.fabs.v2f16
-define void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %vec) #1
   store <2 x half> %fabs, <2 x half> addrspace(1)* %out
@@ -74,7 +74,7 @@ define void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)*
 
 ; CHECK: 'fabs_v3f16'
 ; CHECK: estimated cost of 0 for {{.*}} call <3 x half> @llvm.fabs.v3f16
-define void @fabs_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
+define amdgpu_kernel void @fabs_v3f16(<3 x half> addrspace(1)* %out, <3 x half> addrspace(1)* %vaddr) #0 {
   %vec = load <3 x half>, <3 x half> addrspace(1)* %vaddr
   %fabs = call <3 x half> @llvm.fabs.v3f16(<3 x half> %vec) #1
   store <3 x half> %fabs, <3 x half> addrspace(1)* %out
diff --git a/test/Analysis/CostModel/AMDGPU/fadd.ll b/test/Analysis/CostModel/AMDGPU/fadd.ll
index 00e91bd6223aa25a99c2751e1ce8ac3ccbcb4925..d7ac7359299841a0a444bbd7d98c080bd454d73c 100644
--- a/test/Analysis/CostModel/AMDGPU/fadd.ll
+++ b/test/Analysis/CostModel/AMDGPU/fadd.ll
@@ -3,7 +3,7 @@
 
 ; ALL: 'fadd_f32'
 ; ALL: estimated cost of 1 for {{.*}} fadd float
-define void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
+define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
   %vec = load float, float addrspace(1)* %vaddr
   %add = fadd float %vec, %b
   store float %add, float addrspace(1)* %out
@@ -12,7 +12,7 @@ define void @fadd_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, floa
 
 ; ALL: 'fadd_v2f32'
 ; ALL: estimated cost of 2 for {{.*}} fadd <2 x float>
-define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
+define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fadd <2 x float> %vec, %b
   store <2 x float> %add, <2 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@ define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)
 
 ; ALL: 'fadd_v3f32'
 ; ALL: estimated cost of 3 for {{.*}} fadd <3 x float>
-define void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
+define amdgpu_kernel void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fadd <3 x float> %vec, %b
   store <3 x float> %add, <3 x float> addrspace(1)* %out
@@ -31,7 +31,7 @@ define void @fadd_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)
 ; ALL: 'fadd_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fadd double
 ; SLOWF64: estimated cost of 3 for {{.*}} fadd double
-define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
+define amdgpu_kernel void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %add = fadd double %vec, %b
   store double %add, double addrspace(1)* %out
@@ -41,7 +41,7 @@ define void @fadd_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, do
 ; ALL: 'fadd_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fadd <2 x double>
 ; SLOWF64: estimated cost of 6 for {{.*}} fadd <2 x double>
-define void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
+define amdgpu_kernel void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %add = fadd <2 x double> %vec, %b
   store <2 x double> %add, <2 x double> addrspace(1)* %out
@@ -51,7 +51,7 @@ define void @fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(
 ; ALL: 'fadd_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fadd <3 x double>
 ; SLOWF64: estimated cost of 9 for {{.*}} fadd <3 x double>
-define void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
+define amdgpu_kernel void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %add = fadd <3 x double> %vec, %b
   store <3 x double> %add, <3 x double> addrspace(1)* %out
@@ -60,7 +60,7 @@ define void @fadd_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(
 
 ; ALL 'fadd_f16'
 ; ALL estimated cost of 1 for {{.*}} fadd half
-define void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
+define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
   %vec = load half, half addrspace(1)* %vaddr
   %add = fadd half %vec, %b
   store half %add, half addrspace(1)* %out
@@ -69,7 +69,7 @@ define void @fadd_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %
 
 ; ALL 'fadd_v2f16'
 ; ALL estimated cost of 2 for {{.*}} fadd <2 x half>
-define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
+define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %add = fadd <2 x half> %vec, %b
   store <2 x half> %add, <2 x half> addrspace(1)* %out
@@ -78,7 +78,7 @@ define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)*
 
 ; ALL 'fadd_v4f16'
 ; ALL estimated cost of 4 for {{.*}} fadd <4 x half>
-define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
+define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
   %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
   %add = fadd <4 x half> %vec, %b
   store <4 x half> %add, <4 x half> addrspace(1)* %out
diff --git a/test/Analysis/CostModel/AMDGPU/fdiv.ll b/test/Analysis/CostModel/AMDGPU/fdiv.ll
index 3f374422ad9d1009119c563e1d96036dc17ab913..caa9bff7b2a91343c44b0c88bb3b6f34e164dd0f 100644
--- a/test/Analysis/CostModel/AMDGPU/fdiv.ll
+++ b/test/Analysis/CostModel/AMDGPU/fdiv.ll
@@ -5,7 +5,7 @@
 
 ; CHECK: 'fdiv_f32'
 ; ALL: estimated cost of 10 for {{.*}} fdiv float
-define void @fdiv_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
+define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
   %vec = load float, float addrspace(1)* %vaddr
   %add = fdiv float %vec, %b
   store float %add, float addrspace(1)* %out
@@ -14,7 +14,7 @@ define void @fdiv_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, floa
 
 ; ALL: 'fdiv_v2f32'
 ; ALL: estimated cost of 20 for {{.*}} fdiv <2 x float>
-define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fdiv <2 x float> %vec, %b
   store <2 x float> %add, <2 x float> addrspace(1)* %out
@@ -23,7 +23,7 @@ define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)
 
 ; ALL: 'fdiv_v3f32'
 ; ALL: estimated cost of 30 for {{.*}} fdiv <3 x float>
-define void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fdiv <3 x float> %vec, %b
   store <3 x float> %add, <3 x float> addrspace(1)* %out
@@ -35,7 +35,7 @@ define void @fdiv_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)
 ; CISLOWF64: estimated cost of 33 for {{.*}} fdiv double
 ; SIFASTF64: estimated cost of 32 for {{.*}} fdiv double
 ; SISLOWF64: estimated cost of 36 for {{.*}} fdiv double
-define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
+define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %add = fdiv double %vec, %b
   store double %add, double addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, do
 ; CISLOWF64: estimated cost of 66 for {{.*}} fdiv <2 x double>
 ; SIFASTF64: estimated cost of 64 for {{.*}} fdiv <2 x double>
 ; SISLOWF64: estimated cost of 72 for {{.*}} fdiv <2 x double>
-define void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %add = fdiv <2 x double> %vec, %b
   store <2 x double> %add, <2 x double> addrspace(1)* %out
@@ -59,7 +59,7 @@ define void @fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(
 ; CISLOWF64: estimated cost of 99 for {{.*}} fdiv <3 x double>
 ; SIFASTF64: estimated cost of 96 for {{.*}} fdiv <3 x double>
 ; SISLOWF64: estimated cost of 108 for {{.*}} fdiv <3 x double>
-define void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
+define amdgpu_kernel void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %add = fdiv <3 x double> %vec, %b
   store <3 x double> %add, <3 x double> addrspace(1)* %out
@@ -68,7 +68,7 @@ define void @fdiv_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(
 
 ; ALL: 'fdiv_f16'
 ; ALL: estimated cost of 10 for {{.*}} fdiv half
-define void @fdiv_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
+define amdgpu_kernel void @fdiv_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
   %vec = load half, half addrspace(1)* %vaddr
   %add = fdiv half %vec, %b
   store half %add, half addrspace(1)* %out
@@ -77,7 +77,7 @@ define void @fdiv_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %
 
 ; ALL: 'fdiv_v2f16'
 ; ALL: estimated cost of 20 for {{.*}} fdiv <2 x half>
-define void @fdiv_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %add = fdiv <2 x half> %vec, %b
   store <2 x half> %add, <2 x half> addrspace(1)* %out
@@ -86,7 +86,7 @@ define void @fdiv_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)*
 
 ; ALL: 'fdiv_v4f16'
 ; ALL: estimated cost of 40 for {{.*}} fdiv <4 x half>
-define void @fdiv_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
+define amdgpu_kernel void @fdiv_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
   %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
   %add = fdiv <4 x half> %vec, %b
   store <4 x half> %add, <4 x half> addrspace(1)* %out
diff --git a/test/Analysis/CostModel/AMDGPU/fmul.ll b/test/Analysis/CostModel/AMDGPU/fmul.ll
index 6303bb7988c54cf9fac5d43a8d314f0ca4f8b6c0..915c35a23b30fa0a3456149b020d6cb8aa8656a7 100644
--- a/test/Analysis/CostModel/AMDGPU/fmul.ll
+++ b/test/Analysis/CostModel/AMDGPU/fmul.ll
@@ -3,7 +3,7 @@
 
 ; ALL: 'fmul_f32'
 ; ALL: estimated cost of 1 for {{.*}} fmul float
-define void @fmul_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
+define amdgpu_kernel void @fmul_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
   %vec = load float, float addrspace(1)* %vaddr
   %add = fmul float %vec, %b
   store float %add, float addrspace(1)* %out
@@ -12,7 +12,7 @@ define void @fmul_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, floa
 
 ; ALL: 'fmul_v2f32'
 ; ALL: estimated cost of 2 for {{.*}} fmul <2 x float>
-define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
+define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fmul <2 x float> %vec, %b
   store <2 x float> %add, <2 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@ define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)
 
 ; ALL: 'fmul_v3f32'
 ; ALL: estimated cost of 3 for {{.*}} fmul <3 x float>
-define void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
+define amdgpu_kernel void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fmul <3 x float> %vec, %b
   store <3 x float> %add, <3 x float> addrspace(1)* %out
@@ -31,7 +31,7 @@ define void @fmul_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)
 ; ALL: 'fmul_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fmul double
 ; SLOWF64: estimated cost of 3 for {{.*}} fmul double
-define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
+define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %add = fmul double %vec, %b
   store double %add, double addrspace(1)* %out
@@ -41,7 +41,7 @@ define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, do
 ; ALL: 'fmul_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fmul <2 x double>
 ; SLOWF64: estimated cost of 6 for {{.*}} fmul <2 x double>
-define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
+define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %add = fmul <2 x double> %vec, %b
   store <2 x double> %add, <2 x double> addrspace(1)* %out
@@ -51,7 +51,7 @@ define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(
 ; ALL: 'fmul_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fmul <3 x double>
 ; SLOWF64: estimated cost of 9 for {{.*}} fmul <3 x double>
-define void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
+define amdgpu_kernel void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %add = fmul <3 x double> %vec, %b
   store <3 x double> %add, <3 x double> addrspace(1)* %out
@@ -60,7 +60,7 @@ define void @fmul_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(
 
 ; ALL 'fmul_f16'
 ; ALL estimated cost of 1 for {{.*}} fmul half
-define void @fmul_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
+define amdgpu_kernel void @fmul_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
   %vec = load half, half addrspace(1)* %vaddr
   %add = fmul half %vec, %b
   store half %add, half addrspace(1)* %out
@@ -69,7 +69,7 @@ define void @fmul_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %
 
 ; ALL 'fmul_v2f16'
 ; ALL estimated cost of 2 for {{.*}} fmul <2 x half>
-define void @fmul_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
+define amdgpu_kernel void @fmul_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %add = fmul <2 x half> %vec, %b
   store <2 x half> %add, <2 x half> addrspace(1)* %out
@@ -78,7 +78,7 @@ define void @fmul_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)*
 
 ; ALL 'fmul_v4f16'
 ; ALL estimated cost of 4 for {{.*}} fmul <4 x half>
-define void @fmul_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
+define amdgpu_kernel void @fmul_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
   %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
   %add = fmul <4 x half> %vec, %b
   store <4 x half> %add, <4 x half> addrspace(1)* %out
diff --git a/test/Analysis/CostModel/AMDGPU/fsub.ll b/test/Analysis/CostModel/AMDGPU/fsub.ll
index e0850be9867ea9f26a04f81b4a37a0a91f261227..cb89d292f717620ee9c2c5aa98d5e8f5a11f49be 100644
--- a/test/Analysis/CostModel/AMDGPU/fsub.ll
+++ b/test/Analysis/CostModel/AMDGPU/fsub.ll
@@ -3,7 +3,7 @@
 
 ; ALL: 'fsub_f32'
 ; ALL: estimated cost of 1 for {{.*}} fsub float
-define void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
+define amdgpu_kernel void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, float %b) #0 {
   %vec = load float, float addrspace(1)* %vaddr
   %add = fsub float %vec, %b
   store float %add, float addrspace(1)* %out
@@ -12,7 +12,7 @@ define void @fsub_f32(float addrspace(1)* %out, float addrspace(1)* %vaddr, floa
 
 ; ALL: 'fsub_v2f32'
 ; ALL: estimated cost of 2 for {{.*}} fsub <2 x float>
-define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
+define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr, <2 x float> %b) #0 {
   %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
   %add = fsub <2 x float> %vec, %b
   store <2 x float> %add, <2 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@ define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)
 
 ; ALL: 'fsub_v3f32'
 ; ALL: estimated cost of 3 for {{.*}} fsub <3 x float>
-define void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
+define amdgpu_kernel void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %vaddr, <3 x float> %b) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %vaddr
   %add = fsub <3 x float> %vec, %b
   store <3 x float> %add, <3 x float> addrspace(1)* %out
@@ -31,7 +31,7 @@ define void @fsub_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)
 ; ALL: 'fsub_f64'
 ; FASTF64: estimated cost of 2 for {{.*}} fsub double
 ; SLOWF64: estimated cost of 3 for {{.*}} fsub double
-define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
+define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, double %b) #0 {
   %vec = load double, double addrspace(1)* %vaddr
   %add = fsub double %vec, %b
   store double %add, double addrspace(1)* %out
@@ -41,7 +41,7 @@ define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %vaddr, do
 ; ALL: 'fsub_v2f64'
 ; FASTF64: estimated cost of 4 for {{.*}} fsub <2 x double>
 ; SLOWF64: estimated cost of 6 for {{.*}} fsub <2 x double>
-define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
+define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %vaddr, <2 x double> %b) #0 {
   %vec = load <2 x double>, <2 x double> addrspace(1)* %vaddr
   %add = fsub <2 x double> %vec, %b
   store <2 x double> %add, <2 x double> addrspace(1)* %out
@@ -51,7 +51,7 @@ define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(
 ; ALL: 'fsub_v3f64'
 ; FASTF64: estimated cost of 6 for {{.*}} fsub <3 x double>
 ; SLOWF64: estimated cost of 9 for {{.*}} fsub <3 x double>
-define void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
+define amdgpu_kernel void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %vaddr, <3 x double> %b) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %vaddr
   %add = fsub <3 x double> %vec, %b
   store <3 x double> %add, <3 x double> addrspace(1)* %out
@@ -60,7 +60,7 @@ define void @fsub_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(
 
 ; ALL: 'fsub_f16'
 ; ALL: estimated cost of 1 for {{.*}} fsub half
-define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
+define amdgpu_kernel void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %b) #0 {
   %vec = load half, half addrspace(1)* %vaddr
   %add = fsub half %vec, %b
   store half %add, half addrspace(1)* %out
@@ -69,7 +69,7 @@ define void @fsub_f16(half addrspace(1)* %out, half addrspace(1)* %vaddr, half %
 
 ; ALL: 'fsub_v2f16'
 ; ALL: estimated cost of 2 for {{.*}} fsub <2 x half>
-define void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
+define amdgpu_kernel void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %vaddr, <2 x half> %b) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(1)* %vaddr
   %add = fsub <2 x half> %vec, %b
   store <2 x half> %add, <2 x half> addrspace(1)* %out
@@ -78,7 +78,7 @@ define void @fsub_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)*
 
 ; ALL: 'fsub_v4f16'
 ; ALL: estimated cost of 4 for {{.*}} fsub <4 x half>
-define void @fsub_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
+define amdgpu_kernel void @fsub_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %vaddr, <4 x half> %b) #0 {
   %vec = load <4 x half>, <4 x half> addrspace(1)* %vaddr
   %add = fsub <4 x half> %vec, %b
   store <4 x half> %add, <4 x half> addrspace(1)* %out
diff --git a/test/Analysis/CostModel/AMDGPU/insertelement.ll b/test/Analysis/CostModel/AMDGPU/insertelement.ll
index 1765afe3169e1449621ff4009613fdd73c7dfc92..6f296a3e7a3454efe10f0b0246b4d03e2b9ec1e0 100644
--- a/test/Analysis/CostModel/AMDGPU/insertelement.ll
+++ b/test/Analysis/CostModel/AMDGPU/insertelement.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: 'insertelement_v2i32'
 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i32>
-define void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
   %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
   %insert = insertelement <2 x i32> %vec, i32 1, i32 123
   store <2 x i32> %insert, <2 x i32> addrspace(1)* %out
@@ -11,7 +11,7 @@ define void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspa
 
 ; CHECK: 'insertelement_v2i64'
 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i64>
-define void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) {
   %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
   %insert = insertelement <2 x i64> %vec, i64 1, i64 123
   store <2 x i64> %insert, <2 x i64> addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspa
 
 ; CHECK: 'insertelement_v2i16'
 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i16>
-define void @insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
   %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
   %insert = insertelement <2 x i16> %vec, i16 1, i16 123
   store <2 x i16> %insert, <2 x i16> addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspa
 
 ; CHECK: 'insertelement_v2i8'
 ; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i8>
-define void @insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
+define amdgpu_kernel void @insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
   %vec = load <2 x i8>, <2 x i8> addrspace(1)* %vaddr
   %insert = insertelement <2 x i8> %vec, i8 1, i8 123
   store <2 x i8> %insert, <2 x i8> addrspace(1)* %out
diff --git a/test/Analysis/CostModel/AMDGPU/mul.ll b/test/Analysis/CostModel/AMDGPU/mul.ll
index cbc755a6e6a977c692a2e2680c34595285e390e8..aac7b68f50c281266873c525eb76e7c5f1f6fb27 100644
--- a/test/Analysis/CostModel/AMDGPU/mul.ll
+++ b/test/Analysis/CostModel/AMDGPU/mul.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: 'mul_i32'
 ; CHECK: estimated cost of 3 for {{.*}} mul i32
-define void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %mul = mul i32 %vec, %b
   store i32 %mul, i32 addrspace(1)* %out
@@ -11,7 +11,7 @@ define void @mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #
 
 ; CHECK: 'mul_v2i32'
 ; CHECK: estimated cost of 6 for {{.*}} mul <2 x i32>
-define void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
+define amdgpu_kernel void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr, <2 x i32> %b) #0 {
   %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
   %mul = mul <2 x i32> %vec, %b
   store <2 x i32> %mul, <2 x i32> addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %va
 
 ; CHECK: 'mul_v3i32'
 ; CHECK: estimated cost of 9 for {{.*}} mul <3 x i32>
-define void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
+define amdgpu_kernel void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %vaddr, <3 x i32> %b) #0 {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %vaddr
   %mul = mul <3 x i32> %vec, %b
   store <3 x i32> %mul, <3 x i32> addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @mul_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %va
 
 ; CHECK: 'mul_v4i32'
 ; CHECK: estimated cost of 12 for {{.*}} mul <4 x i32>
-define void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
+define amdgpu_kernel void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr, <4 x i32> %b) #0 {
   %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr
   %mul = mul <4 x i32> %vec, %b
   store <4 x i32> %mul, <4 x i32> addrspace(1)* %out
@@ -38,7 +38,7 @@ define void @mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %va
 
 ; CHECK: 'mul_i64'
 ; CHECK: estimated cost of 16 for {{.*}} mul i64
-define void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %mul = mul i64 %vec, %b
   store i64 %mul, i64 addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #
 
 ; CHECK: 'mul_v2i64'
 ; CHECK: estimated cost of 32 for {{.*}} mul <2 x i64>
-define void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
+define amdgpu_kernel void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr, <2 x i64> %b) #0 {
   %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
   %mul = mul <2 x i64> %vec, %b
   store <2 x i64> %mul, <2 x i64> addrspace(1)* %out
@@ -56,7 +56,7 @@ define void @mul_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %va
 
 ; CHECK: 'mul_v3i64'
 ; CHECK: estimated cost of 48 for {{.*}} mul <3 x i64>
-define void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
+define amdgpu_kernel void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr, <3 x i64> %b) #0 {
   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr
   %mul = mul <3 x i64> %vec, %b
   store <3 x i64> %mul, <3 x i64> addrspace(1)* %out
@@ -65,7 +65,7 @@ define void @mul_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %va
 
 ; CHECK: 'mul_v4i64'
 ; CHECK: estimated cost of 64 for {{.*}} mul <4 x i64>
-define void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
+define amdgpu_kernel void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr, <4 x i64> %b) #0 {
   %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr
   %mul = mul <4 x i64> %vec, %b
   store <4 x i64> %mul, <4 x i64> addrspace(1)* %out
@@ -75,7 +75,7 @@ define void @mul_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %va
 
 ; CHECK: 'mul_v8i64'
 ; CHECK: estimated cost of 128 for {{.*}} mul <8 x i64>
-define void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
+define amdgpu_kernel void @mul_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr, <8 x i64> %b) #0 {
   %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr
   %mul = mul <8 x i64> %vec, %b
   store <8 x i64> %mul, <8 x i64> addrspace(1)* %out
diff --git a/test/Analysis/CostModel/AMDGPU/shifts.ll b/test/Analysis/CostModel/AMDGPU/shifts.ll
index 003aed7b2fc8ecf9fa17aa892fee1c7967c905fd..85fb0ebe14e53e8171b078e9c6ae60a733f5ec07 100644
--- a/test/Analysis/CostModel/AMDGPU/shifts.ll
+++ b/test/Analysis/CostModel/AMDGPU/shifts.ll
@@ -3,7 +3,7 @@
 
 ; ALL: 'shl_i32'
 ; ALL: estimated cost of 1 for {{.*}} shl i32
-define void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %or = shl i32 %vec, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -13,7 +13,7 @@ define void @shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #
 ; ALL: 'shl_i64'
 ; FAST64: estimated cost of 2 for {{.*}} shl i64
 ; SLOW64: estimated cost of 3 for {{.*}} shl i64
-define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = shl i64 %vec, %b
   store i64 %or, i64 addrspace(1)* %out
@@ -22,7 +22,7 @@ define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #
 
 ; ALL: 'lshr_i32'
 ; ALL: estimated cost of 1 for {{.*}} lshr i32
-define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %or = lshr i32 %vec, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -32,7 +32,7 @@ define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b)
 ; ALL: 'lshr_i64'
 ; FAST64: estimated cost of 2 for {{.*}} lshr i64
 ; SLOW64: estimated cost of 3 for {{.*}} lshr i64
-define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = lshr i64 %vec, %b
   store i64 %or, i64 addrspace(1)* %out
@@ -41,7 +41,7 @@ define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b)
 
 ; ALL: 'ashr_i32'
 ; ALL: estimated cost of 1 for {{.*}} ashr i32
-define void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
+define amdgpu_kernel void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b) #0 {
   %vec = load i32, i32 addrspace(1)* %vaddr
   %or = ashr i32 %vec, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -51,7 +51,7 @@ define void @ashr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %vaddr, i32 %b)
 ; ALL: 'ashr_i64'
 ; FAST64: estimated cost of 2 for {{.*}} ashr i64
 ; SLOW64: estimated cost of 3 for {{.*}} ashr i64
-define void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
+define amdgpu_kernel void @ashr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %vaddr, i64 %b) #0 {
   %vec = load i64, i64 addrspace(1)* %vaddr
   %or = ashr i64 %vec, %b
   store i64 %or, i64 addrspace(1)* %out
diff --git a/test/Analysis/CostModel/PowerPC/load_store.ll b/test/Analysis/CostModel/PowerPC/load_store.ll
index d48be5b5f62be52f5e664edfefb3f4d7a5aec44d..b77dd444774f7c3386e7f1accec7c39285e58de2 100644
--- a/test/Analysis/CostModel/PowerPC/load_store.ll
+++ b/test/Analysis/CostModel/PowerPC/load_store.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s  -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=g5 | FileCheck %s
+; RUN: opt < %s  -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=g5 -disable-ppc-unaligned | FileCheck %s
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
diff --git a/test/Analysis/CostModel/PowerPC/unaligned_ld_st.ll b/test/Analysis/CostModel/PowerPC/unaligned_ld_st.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6addf25949e65731b495739902912d07f491fb3b
--- /dev/null
+++ b/test/Analysis/CostModel/PowerPC/unaligned_ld_st.ll
@@ -0,0 +1,26 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr8 -mattr=+vsx | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define i32 @test(i32 %arg) { ; align-1 loads and stores of i8, i16, i32 and i64; each one is expected to cost 1
+
+  ; CHECK: cost of 1 {{.*}} load
+  load i8, i8* undef, align 1
+  ; CHECK: cost of 1 {{.*}} load
+  load i16, i16* undef, align 1
+  ; CHECK: cost of 1 {{.*}} load
+  load i32, i32* undef, align 1
+  ; CHECK: cost of 1 {{.*}} load
+  load i64, i64* undef, align 1
+
+  ; CHECK: cost of 1 {{.*}} store
+  store i8 undef, i8* undef, align 1
+  ; CHECK: cost of 1 {{.*}} store
+  store i16 undef, i16* undef, align 1
+  ; CHECK: cost of 1 {{.*}} store
+  store i32 undef, i32* undef, align 1
+  ; CHECK: cost of 1 {{.*}} store
+  store i64 undef, i64* undef, align 1
+
+  ret i32 undef
+}
diff --git a/test/Analysis/CostModel/SystemZ/cmp-ext.ll b/test/Analysis/CostModel/SystemZ/cmp-ext.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e33587876aa754a8ff006282e3f412d789b7da88
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/cmp-ext.ll
@@ -0,0 +1,2403 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+; Cost-model estimates for a compare (icmp/fcmp) followed by an extension of its i1 result.
+
+define i8 @fun0(i8 %val1, i8 %val2) { ; i8 icmp eq; i1 result sign-extended to i8
+  %cmp = icmp eq i8 %val1, %val2
+  %v = sext i1 %cmp to i8
+  ret i8 %v
+
+; CHECK: fun0
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext i1 %cmp to i8
+}
+
+define i16 @fun1(i8 %val1, i8 %val2) { ; i8 icmp eq; i1 result sign-extended to i16
+  %cmp = icmp eq i8 %val1, %val2
+  %v = sext i1 %cmp to i16
+  ret i16 %v
+
+; CHECK: fun1
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext i1 %cmp to i16
+}
+
+define i32 @fun2(i8 %val1, i8 %val2) { ; i8 icmp eq; i1 result sign-extended to i32
+  %cmp = icmp eq i8 %val1, %val2
+  %v = sext i1 %cmp to i32
+  ret i32 %v
+
+; CHECK: fun2
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext i1 %cmp to i32
+}
+
+define i64 @fun3(i8 %val1, i8 %val2) { ; i8 icmp eq; i1 result sign-extended to i64
+  %cmp = icmp eq i8 %val1, %val2
+  %v = sext i1 %cmp to i64
+  ret i64 %v
+
+; CHECK: fun3
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i8 %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = sext i1 %cmp to i64
+}
+
+define i8 @fun4(i16 %val1, i16 %val2) { ; i16 icmp eq; i1 result sign-extended to i8
+  %cmp = icmp eq i16 %val1, %val2
+  %v = sext i1 %cmp to i8
+  ret i8 %v
+
+; CHECK: fun4
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext i1 %cmp to i8
+}
+
+define i16 @fun5(i16 %val1, i16 %val2) { ; i16 icmp eq; i1 result sign-extended to i16
+  %cmp = icmp eq i16 %val1, %val2
+  %v = sext i1 %cmp to i16
+  ret i16 %v
+
+; CHECK: fun5
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext i1 %cmp to i16
+}
+
+define i32 @fun6(i16 %val1, i16 %val2) { ; i16 icmp eq; i1 result sign-extended to i32
+  %cmp = icmp eq i16 %val1, %val2
+  %v = sext i1 %cmp to i32
+  ret i32 %v
+
+; CHECK: fun6
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext i1 %cmp to i32
+}
+
+define i64 @fun7(i16 %val1, i16 %val2) { ; i16 icmp eq; i1 result sign-extended to i64
+  %cmp = icmp eq i16 %val1, %val2
+  %v = sext i1 %cmp to i64
+  ret i64 %v
+
+; CHECK: fun7
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i16 %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = sext i1 %cmp to i64
+}
+
+define i8 @fun8(i32 %val1, i32 %val2) { ; i32 icmp eq; i1 result sign-extended to i8
+  %cmp = icmp eq i32 %val1, %val2
+  %v = sext i1 %cmp to i8
+  ret i8 %v
+
+; CHECK: fun8
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i32 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext i1 %cmp to i8
+}
+
+define i16 @fun9(i32 %val1, i32 %val2) { ; i32 icmp eq; i1 result sign-extended to i16
+  %cmp = icmp eq i32 %val1, %val2
+  %v = sext i1 %cmp to i16
+  ret i16 %v
+
+; CHECK: fun9
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i32 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext i1 %cmp to i16
+}
+
+define i32 @fun10(i32 %val1, i32 %val2) { ; i32 icmp eq; i1 result sign-extended to i32
+  %cmp = icmp eq i32 %val1, %val2
+  %v = sext i1 %cmp to i32
+  ret i32 %v
+
+; CHECK: fun10
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i32 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext i1 %cmp to i32
+}
+
+define i64 @fun11(i32 %val1, i32 %val2) { ; i32 icmp eq; i1 result sign-extended to i64
+  %cmp = icmp eq i32 %val1, %val2
+  %v = sext i1 %cmp to i64
+  ret i64 %v
+
+; CHECK: fun11
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i32 %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = sext i1 %cmp to i64
+}
+
+define i8 @fun12(i64 %val1, i64 %val2) { ; i64 icmp eq; i1 result sign-extended to i8
+  %cmp = icmp eq i64 %val1, %val2
+  %v = sext i1 %cmp to i8
+  ret i8 %v
+
+; CHECK: fun12
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i64 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext i1 %cmp to i8
+}
+
+define i16 @fun13(i64 %val1, i64 %val2) { ; i64 icmp eq; i1 result sign-extended to i16
+  %cmp = icmp eq i64 %val1, %val2
+  %v = sext i1 %cmp to i16
+  ret i16 %v
+
+; CHECK: fun13
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i64 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext i1 %cmp to i16
+}
+
+define i32 @fun14(i64 %val1, i64 %val2) { ; i64 icmp eq; i1 result sign-extended to i32
+  %cmp = icmp eq i64 %val1, %val2
+  %v = sext i1 %cmp to i32
+  ret i32 %v
+
+; CHECK: fun14
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i64 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext i1 %cmp to i32
+}
+
+define i64 @fun15(i64 %val1, i64 %val2) { ; i64 icmp eq; i1 result sign-extended to i64
+  %cmp = icmp eq i64 %val1, %val2
+  %v = sext i1 %cmp to i64
+  ret i64 %v
+
+; CHECK: fun15
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i64 %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = sext i1 %cmp to i64
+}
+
+define i8 @fun16(float %val1, float %val2) { ; float fcmp ogt; i1 result sign-extended to i8
+  %cmp = fcmp ogt float %val1, %val2
+  %v = sext i1 %cmp to i8
+  ret i8 %v
+
+; CHECK: fun16
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt float %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = sext i1 %cmp to i8
+}
+
+define i16 @fun17(float %val1, float %val2) { ; float fcmp ogt; i1 result sign-extended to i16
+  %cmp = fcmp ogt float %val1, %val2
+  %v = sext i1 %cmp to i16
+  ret i16 %v
+
+; CHECK: fun17
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt float %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = sext i1 %cmp to i16
+}
+
+define i32 @fun18(float %val1, float %val2) { ; float fcmp ogt; i1 result sign-extended to i32
+  %cmp = fcmp ogt float %val1, %val2
+  %v = sext i1 %cmp to i32
+  ret i32 %v
+
+; CHECK: fun18
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt float %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = sext i1 %cmp to i32
+}
+
+define i64 @fun19(float %val1, float %val2) { ; float fcmp ogt; i1 result sign-extended to i64
+  %cmp = fcmp ogt float %val1, %val2
+  %v = sext i1 %cmp to i64
+  ret i64 %v
+
+; CHECK: fun19
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt float %val1, %val2
+; CHECK: cost of 5 for instruction:   %v = sext i1 %cmp to i64
+}
+
+define i8 @fun20(double %val1, double %val2) { ; double fcmp ogt; i1 result sign-extended to i8
+  %cmp = fcmp ogt double %val1, %val2
+  %v = sext i1 %cmp to i8
+  ret i8 %v
+
+; CHECK: fun20
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt double %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = sext i1 %cmp to i8
+}
+
+define i16 @fun21(double %val1, double %val2) { ; double fcmp ogt; i1 result sign-extended to i16
+  %cmp = fcmp ogt double %val1, %val2
+  %v = sext i1 %cmp to i16
+  ret i16 %v
+
+; CHECK: fun21
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt double %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = sext i1 %cmp to i16
+}
+
+define i32 @fun22(double %val1, double %val2) { ; double fcmp ogt; i1 result sign-extended to i32
+  %cmp = fcmp ogt double %val1, %val2
+  %v = sext i1 %cmp to i32
+  ret i32 %v
+
+; CHECK: fun22
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt double %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = sext i1 %cmp to i32
+}
+
+define i64 @fun23(double %val1, double %val2) { ; double fcmp ogt; i1 result sign-extended to i64
+  %cmp = fcmp ogt double %val1, %val2
+  %v = sext i1 %cmp to i64
+  ret i64 %v
+
+; CHECK: fun23
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt double %val1, %val2
+; CHECK: cost of 5 for instruction:   %v = sext i1 %cmp to i64
+}
+
+define <2 x i8> @fun24(<2 x i8> %val1, <2 x i8> %val2) { ; <2 x i8> icmp eq; mask sign-extended to <2 x i8>
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i8>
+  ret <2 x i8> %v
+
+; CHECK: fun24
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i8> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <2 x i1> %cmp to <2 x i8>
+}
+
+define <2 x i16> @fun25(<2 x i8> %val1, <2 x i8> %val2) { ; <2 x i8> icmp eq; mask sign-extended to <2 x i16>
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i16>
+  ret <2 x i16> %v
+
+; CHECK: fun25
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i8> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <2 x i1> %cmp to <2 x i16>
+}
+
+define <2 x i32> @fun26(<2 x i8> %val1, <2 x i8> %val2) { ; <2 x i8> icmp eq; mask sign-extended to <2 x i32>
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i32>
+  ret <2 x i32> %v
+
+; CHECK: fun26
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i8> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = sext <2 x i1> %cmp to <2 x i32>
+}
+
+define <2 x i64> @fun27(<2 x i8> %val1, <2 x i8> %val2) { ; <2 x i8> icmp eq; mask sign-extended to <2 x i64>
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %v
+
+; CHECK: fun27
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i8> %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext <2 x i1> %cmp to <2 x i64>
+}
+
+define <2 x i8> @fun28(<2 x i16> %val1, <2 x i16> %val2) { ; <2 x i16> icmp eq; mask sign-extended to <2 x i8>
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i8>
+  ret <2 x i8> %v
+
+; CHECK: fun28
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i16> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <2 x i1> %cmp to <2 x i8>
+}
+
+define <2 x i16> @fun29(<2 x i16> %val1, <2 x i16> %val2) { ; <2 x i16> icmp eq; mask sign-extended to <2 x i16>
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i16>
+  ret <2 x i16> %v
+
+; CHECK: fun29
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i16> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <2 x i1> %cmp to <2 x i16>
+}
+
+define <2 x i32> @fun30(<2 x i16> %val1, <2 x i16> %val2) { ; <2 x i16> icmp eq; mask sign-extended to <2 x i32>
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i32>
+  ret <2 x i32> %v
+
+; CHECK: fun30
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i16> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <2 x i1> %cmp to <2 x i32>
+}
+
+define <2 x i64> @fun31(<2 x i16> %val1, <2 x i16> %val2) { ; <2 x i16> icmp eq; mask sign-extended to <2 x i64>
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %v
+
+; CHECK: fun31
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i16> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = sext <2 x i1> %cmp to <2 x i64>
+}
+
+define <2 x i8> @fun32(<2 x i32> %val1, <2 x i32> %val2) { ; <2 x i32> icmp eq; mask sign-extended to <2 x i8>
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i8>
+  ret <2 x i8> %v
+
+; CHECK: fun32
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i32> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <2 x i1> %cmp to <2 x i8>
+}
+
+define <2 x i16> @fun33(<2 x i32> %val1, <2 x i32> %val2) { ; <2 x i32> icmp eq; mask sign-extended to <2 x i16>
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i16>
+  ret <2 x i16> %v
+
+; CHECK: fun33
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i32> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <2 x i1> %cmp to <2 x i16>
+}
+
+define <2 x i32> @fun34(<2 x i32> %val1, <2 x i32> %val2) { ; <2 x i32> icmp eq; mask sign-extended to <2 x i32>
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i32>
+  ret <2 x i32> %v
+
+; CHECK: fun34
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i32> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <2 x i1> %cmp to <2 x i32>
+}
+
+define <2 x i64> @fun35(<2 x i32> %val1, <2 x i32> %val2) { ; <2 x i32> icmp eq; mask sign-extended to <2 x i64>
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %v
+
+; CHECK: fun35
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i32> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <2 x i1> %cmp to <2 x i64>
+}
+
+define <2 x i8> @fun36(<2 x i64> %val1, <2 x i64> %val2) { ; <2 x i64> icmp eq; mask sign-extended to <2 x i8>
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i8>
+  ret <2 x i8> %v
+
+; CHECK: fun36
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i64> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <2 x i1> %cmp to <2 x i8>
+}
+
+define <2 x i16> @fun37(<2 x i64> %val1, <2 x i64> %val2) { ; <2 x i64> icmp eq; mask sign-extended to <2 x i16>
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i16>
+  ret <2 x i16> %v
+
+; CHECK: fun37
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i64> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <2 x i1> %cmp to <2 x i16>
+}
+
+define <2 x i32> @fun38(<2 x i64> %val1, <2 x i64> %val2) { ; <2 x i64> icmp eq; mask sign-extended to <2 x i32>
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i32>
+  ret <2 x i32> %v
+
+; CHECK: fun38
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i64> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <2 x i1> %cmp to <2 x i32>
+}
+
+define <2 x i64> @fun39(<2 x i64> %val1, <2 x i64> %val2) { ; <2 x i64> icmp eq; mask sign-extended to <2 x i64>
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %v
+
+; CHECK: fun39
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i64> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <2 x i1> %cmp to <2 x i64>
+}
+
+define <2 x i8> @fun40(<2 x float> %val1, <2 x float> %val2) { ; <2 x float> fcmp ogt; mask sign-extended to <2 x i8>
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i8>
+  ret <2 x i8> %v
+
+; CHECK: fun40
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <2 x float> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <2 x i1> %cmp to <2 x i8>
+}
+
+define <2 x i16> @fun41(<2 x float> %val1, <2 x float> %val2) { ; <2 x float> fcmp ogt; mask sign-extended to <2 x i16>
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i16>
+  ret <2 x i16> %v
+
+; CHECK: fun41
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <2 x float> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <2 x i1> %cmp to <2 x i16>
+}
+
+define <2 x i32> @fun42(<2 x float> %val1, <2 x float> %val2) { ; <2 x float> fcmp ogt; mask sign-extended to <2 x i32>
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i32>
+  ret <2 x i32> %v
+
+; CHECK: fun42
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <2 x float> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <2 x i1> %cmp to <2 x i32>
+}
+
+define <2 x i64> @fun43(<2 x float> %val1, <2 x float> %val2) { ; <2 x float> fcmp ogt; mask sign-extended to <2 x i64>
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %v
+
+; CHECK: fun43
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <2 x float> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <2 x i1> %cmp to <2 x i64>
+}
+
+define <2 x i8> @fun44(<2 x double> %val1, <2 x double> %val2) { ; <2 x double> fcmp ogt; mask sign-extended to <2 x i8>
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i8>
+  ret <2 x i8> %v
+
+; CHECK: fun44
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt <2 x double> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <2 x i1> %cmp to <2 x i8>
+}
+
+define <2 x i16> @fun45(<2 x double> %val1, <2 x double> %val2) { ; <2 x double> fcmp ogt; mask sign-extended to <2 x i16>
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i16>
+  ret <2 x i16> %v
+
+; CHECK: fun45
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt <2 x double> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <2 x i1> %cmp to <2 x i16>
+}
+
+define <2 x i32> @fun46(<2 x double> %val1, <2 x double> %val2) { ; <2 x double> fcmp ogt; mask sign-extended to <2 x i32>
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i32>
+  ret <2 x i32> %v
+
+; CHECK: fun46
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt <2 x double> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <2 x i1> %cmp to <2 x i32>
+}
+
+define <2 x i64> @fun47(<2 x double> %val1, <2 x double> %val2) { ; <2 x double> fcmp ogt; mask sign-extended to <2 x i64>
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %v = sext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %v
+
+; CHECK: fun47
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt <2 x double> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <2 x i1> %cmp to <2 x i64>
+}
+
+define <4 x i8> @fun48(<4 x i8> %val1, <4 x i8> %val2) { ; <4 x i8> icmp eq; mask sign-extended to <4 x i8>
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i8>
+  ret <4 x i8> %v
+
+; CHECK: fun48
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i8> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <4 x i1> %cmp to <4 x i8>
+}
+
+define <4 x i16> @fun49(<4 x i8> %val1, <4 x i8> %val2) {
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i16>
+  ret <4 x i16> %v
+
+; CHECK: fun49
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i8> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <4 x i1> %cmp to <4 x i16>
+}
+
+define <4 x i32> @fun50(<4 x i8> %val1, <4 x i8> %val2) {
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %v
+
+; CHECK: fun50
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i8> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = sext <4 x i1> %cmp to <4 x i32>
+}
+
+define <4 x i64> @fun51(<4 x i8> %val1, <4 x i8> %val2) {
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i64>
+  ret <4 x i64> %v
+
+; CHECK: fun51
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i8> %val1, %val2
+; CHECK: cost of 7 for instruction:   %v = sext <4 x i1> %cmp to <4 x i64>
+}
+
+define <4 x i8> @fun52(<4 x i16> %val1, <4 x i16> %val2) {
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i8>
+  ret <4 x i8> %v
+
+; CHECK: fun52
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i16> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <4 x i1> %cmp to <4 x i8>
+}
+
+define <4 x i16> @fun53(<4 x i16> %val1, <4 x i16> %val2) {
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i16>
+  ret <4 x i16> %v
+
+; CHECK: fun53
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i16> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <4 x i1> %cmp to <4 x i16>
+}
+
+define <4 x i32> @fun54(<4 x i16> %val1, <4 x i16> %val2) {
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %v
+
+; CHECK: fun54
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i16> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <4 x i1> %cmp to <4 x i32>
+}
+
+define <4 x i64> @fun55(<4 x i16> %val1, <4 x i16> %val2) {
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i64>
+  ret <4 x i64> %v
+
+; CHECK: fun55
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i16> %val1, %val2
+; CHECK: cost of 5 for instruction:   %v = sext <4 x i1> %cmp to <4 x i64>
+}
+
+define <4 x i8> @fun56(<4 x i32> %val1, <4 x i32> %val2) {
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i8>
+  ret <4 x i8> %v
+
+; CHECK: fun56
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i32> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <4 x i1> %cmp to <4 x i8>
+}
+
+define <4 x i16> @fun57(<4 x i32> %val1, <4 x i32> %val2) {
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i16>
+  ret <4 x i16> %v
+
+; CHECK: fun57
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i32> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <4 x i1> %cmp to <4 x i16>
+}
+
+define <4 x i32> @fun58(<4 x i32> %val1, <4 x i32> %val2) {
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %v
+
+; CHECK: fun58
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i32> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <4 x i1> %cmp to <4 x i32>
+}
+
+define <4 x i64> @fun59(<4 x i32> %val1, <4 x i32> %val2) {
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i64>
+  ret <4 x i64> %v
+
+; CHECK: fun59
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i32> %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext <4 x i1> %cmp to <4 x i64>
+}
+
+define <4 x i8> @fun60(<4 x i64> %val1, <4 x i64> %val2) {
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i8>
+  ret <4 x i8> %v
+
+; CHECK: fun60
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <4 x i64> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <4 x i1> %cmp to <4 x i8>
+}
+
+define <4 x i16> @fun61(<4 x i64> %val1, <4 x i64> %val2) {
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i16>
+  ret <4 x i16> %v
+
+; CHECK: fun61
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <4 x i64> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <4 x i1> %cmp to <4 x i16>
+}
+
+define <4 x i32> @fun62(<4 x i64> %val1, <4 x i64> %val2) {
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %v
+
+; CHECK: fun62
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <4 x i64> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <4 x i1> %cmp to <4 x i32>
+}
+
+define <4 x i64> @fun63(<4 x i64> %val1, <4 x i64> %val2) {
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i64>
+  ret <4 x i64> %v
+
+; CHECK: fun63
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <4 x i64> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <4 x i1> %cmp to <4 x i64>
+}
+
+define <4 x i8> @fun64(<4 x float> %val1, <4 x float> %val2) {
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i8>
+  ret <4 x i8> %v
+
+; CHECK: fun64
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <4 x float> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <4 x i1> %cmp to <4 x i8>
+}
+
+define <4 x i16> @fun65(<4 x float> %val1, <4 x float> %val2) {
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i16>
+  ret <4 x i16> %v
+
+; CHECK: fun65
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <4 x float> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <4 x i1> %cmp to <4 x i16>
+}
+
+define <4 x i32> @fun66(<4 x float> %val1, <4 x float> %val2) {
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %v
+
+; CHECK: fun66
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <4 x float> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <4 x i1> %cmp to <4 x i32>
+}
+
+define <4 x i64> @fun67(<4 x float> %val1, <4 x float> %val2) {
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i64>
+  ret <4 x i64> %v
+
+; CHECK: fun67
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <4 x float> %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext <4 x i1> %cmp to <4 x i64>
+}
+
+define <4 x i8> @fun68(<4 x double> %val1, <4 x double> %val2) {
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i8>
+  ret <4 x i8> %v
+
+; CHECK: fun68
+; CHECK: cost of 2 for instruction:   %cmp = fcmp ogt <4 x double> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <4 x i1> %cmp to <4 x i8>
+}
+
+define <4 x i16> @fun69(<4 x double> %val1, <4 x double> %val2) {
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i16>
+  ret <4 x i16> %v
+
+; CHECK: fun69
+; CHECK: cost of 2 for instruction:   %cmp = fcmp ogt <4 x double> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <4 x i1> %cmp to <4 x i16>
+}
+
+define <4 x i32> @fun70(<4 x double> %val1, <4 x double> %val2) {
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %v
+
+; CHECK: fun70
+; CHECK: cost of 2 for instruction:   %cmp = fcmp ogt <4 x double> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <4 x i1> %cmp to <4 x i32>
+}
+
+define <4 x i64> @fun71(<4 x double> %val1, <4 x double> %val2) {
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %v = sext <4 x i1> %cmp to <4 x i64>
+  ret <4 x i64> %v
+
+; CHECK: fun71
+; CHECK: cost of 2 for instruction:   %cmp = fcmp ogt <4 x double> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <4 x i1> %cmp to <4 x i64>
+}
+
+define <8 x i8> @fun72(<8 x i8> %val1, <8 x i8> %val2) {
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i8>
+  ret <8 x i8> %v
+
+; CHECK: fun72
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i8> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <8 x i1> %cmp to <8 x i8>
+}
+
+define <8 x i16> @fun73(<8 x i8> %val1, <8 x i8> %val2) {
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %v
+
+; CHECK: fun73
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i8> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <8 x i1> %cmp to <8 x i16>
+}
+
+define <8 x i32> @fun74(<8 x i8> %val1, <8 x i8> %val2) {
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i32>
+  ret <8 x i32> %v
+
+; CHECK: fun74
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i8> %val1, %val2
+; CHECK: cost of 5 for instruction:   %v = sext <8 x i1> %cmp to <8 x i32>
+}
+
+define <8 x i64> @fun75(<8 x i8> %val1, <8 x i8> %val2) {
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i64>
+  ret <8 x i64> %v
+
+; CHECK: fun75
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i8> %val1, %val2
+; CHECK: cost of 15 for instruction:   %v = sext <8 x i1> %cmp to <8 x i64>
+}
+
+define <8 x i8> @fun76(<8 x i16> %val1, <8 x i16> %val2) {
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i8>
+  ret <8 x i8> %v
+
+; CHECK: fun76
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i16> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <8 x i1> %cmp to <8 x i8>
+}
+
+define <8 x i16> @fun77(<8 x i16> %val1, <8 x i16> %val2) {
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %v
+
+; CHECK: fun77
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i16> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <8 x i1> %cmp to <8 x i16>
+}
+
+define <8 x i32> @fun78(<8 x i16> %val1, <8 x i16> %val2) {
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i32>
+  ret <8 x i32> %v
+
+; CHECK: fun78
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i16> %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext <8 x i1> %cmp to <8 x i32>
+}
+
+define <8 x i64> @fun79(<8 x i16> %val1, <8 x i16> %val2) {
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i64>
+  ret <8 x i64> %v
+
+; CHECK: fun79
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i16> %val1, %val2
+; CHECK: cost of 11 for instruction:   %v = sext <8 x i1> %cmp to <8 x i64>
+}
+
+define <8 x i8> @fun80(<8 x i32> %val1, <8 x i32> %val2) {
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i8>
+  ret <8 x i8> %v
+
+; CHECK: fun80
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <8 x i32> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <8 x i1> %cmp to <8 x i8>
+}
+
+define <8 x i16> @fun81(<8 x i32> %val1, <8 x i32> %val2) {
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %v
+
+; CHECK: fun81
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <8 x i32> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <8 x i1> %cmp to <8 x i16>
+}
+
+define <8 x i32> @fun82(<8 x i32> %val1, <8 x i32> %val2) {
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i32>
+  ret <8 x i32> %v
+
+; CHECK: fun82
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <8 x i32> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <8 x i1> %cmp to <8 x i32>
+}
+
+define <8 x i64> @fun83(<8 x i32> %val1, <8 x i32> %val2) {
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i64>
+  ret <8 x i64> %v
+
+; CHECK: fun83
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <8 x i32> %val1, %val2
+; CHECK: cost of 7 for instruction:   %v = sext <8 x i1> %cmp to <8 x i64>
+}
+
+define <8 x i8> @fun84(<8 x i64> %val1, <8 x i64> %val2) {
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i8>
+  ret <8 x i8> %v
+
+; CHECK: fun84
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <8 x i64> %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext <8 x i1> %cmp to <8 x i8>
+}
+
+define <8 x i16> @fun85(<8 x i64> %val1, <8 x i64> %val2) {
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %v
+
+; CHECK: fun85
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <8 x i64> %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext <8 x i1> %cmp to <8 x i16>
+}
+
+define <8 x i32> @fun86(<8 x i64> %val1, <8 x i64> %val2) {
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i32>
+  ret <8 x i32> %v
+
+; CHECK: fun86
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <8 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = sext <8 x i1> %cmp to <8 x i32>
+}
+
+define <8 x i64> @fun87(<8 x i64> %val1, <8 x i64> %val2) {
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i64>
+  ret <8 x i64> %v
+
+; CHECK: fun87
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <8 x i64> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <8 x i1> %cmp to <8 x i64>
+}
+
+define <8 x i8> @fun88(<8 x float> %val1, <8 x float> %val2) {
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i8>
+  ret <8 x i8> %v
+
+; CHECK: fun88
+; CHECK: cost of 20 for instruction:   %cmp = fcmp ogt <8 x float> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <8 x i1> %cmp to <8 x i8>
+}
+
+define <8 x i16> @fun89(<8 x float> %val1, <8 x float> %val2) {
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %v
+
+; CHECK: fun89
+; CHECK: cost of 20 for instruction:   %cmp = fcmp ogt <8 x float> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <8 x i1> %cmp to <8 x i16>
+}
+
+define <8 x i32> @fun90(<8 x float> %val1, <8 x float> %val2) {
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i32>
+  ret <8 x i32> %v
+
+; CHECK: fun90
+; CHECK: cost of 20 for instruction:   %cmp = fcmp ogt <8 x float> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <8 x i1> %cmp to <8 x i32>
+}
+
+define <8 x i64> @fun91(<8 x float> %val1, <8 x float> %val2) {
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i64>
+  ret <8 x i64> %v
+
+; CHECK: fun91
+; CHECK: cost of 20 for instruction:   %cmp = fcmp ogt <8 x float> %val1, %val2
+; CHECK: cost of 7 for instruction:   %v = sext <8 x i1> %cmp to <8 x i64>
+}
+
+define <8 x i8> @fun92(<8 x double> %val1, <8 x double> %val2) {
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i8>
+  ret <8 x i8> %v
+
+; CHECK: fun92
+; CHECK: cost of 4 for instruction:   %cmp = fcmp ogt <8 x double> %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext <8 x i1> %cmp to <8 x i8>
+}
+
+define <8 x i16> @fun93(<8 x double> %val1, <8 x double> %val2) {
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %v
+
+; CHECK: fun93
+; CHECK: cost of 4 for instruction:   %cmp = fcmp ogt <8 x double> %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext <8 x i1> %cmp to <8 x i16>
+}
+
+define <8 x i32> @fun94(<8 x double> %val1, <8 x double> %val2) {
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i32>
+  ret <8 x i32> %v
+
+; CHECK: fun94
+; CHECK: cost of 4 for instruction:   %cmp = fcmp ogt <8 x double> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = sext <8 x i1> %cmp to <8 x i32>
+}
+
+define <8 x i64> @fun95(<8 x double> %val1, <8 x double> %val2) {
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %v = sext <8 x i1> %cmp to <8 x i64>
+  ret <8 x i64> %v
+
+; CHECK: fun95
+; CHECK: cost of 4 for instruction:   %cmp = fcmp ogt <8 x double> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <8 x i1> %cmp to <8 x i64>
+}
+
+define <16 x i8> @fun96(<16 x i8> %val1, <16 x i8> %val2) {
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %v
+
+; CHECK: fun96
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <16 x i8> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <16 x i1> %cmp to <16 x i8>
+}
+
+define <16 x i16> @fun97(<16 x i8> %val1, <16 x i8> %val2) {
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i16>
+  ret <16 x i16> %v
+
+; CHECK: fun97
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <16 x i8> %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext <16 x i1> %cmp to <16 x i16>
+}
+
+define <16 x i32> @fun98(<16 x i8> %val1, <16 x i8> %val2) {
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i32>
+  ret <16 x i32> %v
+
+; CHECK: fun98
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <16 x i8> %val1, %val2
+; CHECK: cost of 11 for instruction:   %v = sext <16 x i1> %cmp to <16 x i32>
+}
+
+define <16 x i64> @fun99(<16 x i8> %val1, <16 x i8> %val2) {
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i64>
+  ret <16 x i64> %v
+
+; CHECK: fun99
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <16 x i8> %val1, %val2
+; CHECK: cost of 31 for instruction:   %v = sext <16 x i1> %cmp to <16 x i64>
+}
+
+define <16 x i8> @fun100(<16 x i16> %val1, <16 x i16> %val2) {
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %v
+
+; CHECK: fun100
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <16 x i16> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = sext <16 x i1> %cmp to <16 x i8>
+}
+
+define <16 x i16> @fun101(<16 x i16> %val1, <16 x i16> %val2) {
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i16>
+  ret <16 x i16> %v
+
+; CHECK: fun101
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <16 x i16> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <16 x i1> %cmp to <16 x i16>
+}
+
+define <16 x i32> @fun102(<16 x i16> %val1, <16 x i16> %val2) {
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i32>
+  ret <16 x i32> %v
+
+; CHECK: fun102
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <16 x i16> %val1, %val2
+; CHECK: cost of 7 for instruction:   %v = sext <16 x i1> %cmp to <16 x i32>
+}
+
+define <16 x i64> @fun103(<16 x i16> %val1, <16 x i16> %val2) {
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i64>
+  ret <16 x i64> %v
+
+; CHECK: fun103
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <16 x i16> %val1, %val2
+; CHECK: cost of 23 for instruction:   %v = sext <16 x i1> %cmp to <16 x i64>
+}
+
+define <16 x i8> @fun104(<16 x i32> %val1, <16 x i32> %val2) {
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %v
+
+; CHECK: fun104
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <16 x i32> %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext <16 x i1> %cmp to <16 x i8>
+}
+
+define <16 x i16> @fun105(<16 x i32> %val1, <16 x i32> %val2) {
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i16>
+  ret <16 x i16> %v
+
+; CHECK: fun105
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <16 x i32> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = sext <16 x i1> %cmp to <16 x i16>
+}
+
+define <16 x i32> @fun106(<16 x i32> %val1, <16 x i32> %val2) {
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i32>
+  ret <16 x i32> %v
+
+; CHECK: fun106
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <16 x i32> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <16 x i1> %cmp to <16 x i32>
+}
+
+define <16 x i64> @fun107(<16 x i32> %val1, <16 x i32> %val2) {
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i64>
+  ret <16 x i64> %v
+
+; CHECK: fun107
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <16 x i32> %val1, %val2
+; CHECK: cost of 15 for instruction:   %v = sext <16 x i1> %cmp to <16 x i64>
+}
+
+define <16 x i8> @fun108(<16 x i64> %val1, <16 x i64> %val2) {
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %v
+
+; CHECK: fun108
+; CHECK: cost of 8 for instruction:   %cmp = icmp eq <16 x i64> %val1, %val2
+; CHECK: cost of 7 for instruction:   %v = sext <16 x i1> %cmp to <16 x i8>
+}
+
+define <16 x i16> @fun109(<16 x i64> %val1, <16 x i64> %val2) {
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i16>
+  ret <16 x i16> %v
+
+; CHECK: fun109
+; CHECK: cost of 8 for instruction:   %cmp = icmp eq <16 x i64> %val1, %val2
+; CHECK: cost of 6 for instruction:   %v = sext <16 x i1> %cmp to <16 x i16>
+}
+
+define <16 x i32> @fun110(<16 x i64> %val1, <16 x i64> %val2) {
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i32>
+  ret <16 x i32> %v
+
+; CHECK: fun110
+; CHECK: cost of 8 for instruction:   %cmp = icmp eq <16 x i64> %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = sext <16 x i1> %cmp to <16 x i32>
+}
+
+define <16 x i64> @fun111(<16 x i64> %val1, <16 x i64> %val2) {
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i64>
+  ret <16 x i64> %v
+
+; CHECK: fun111
+; CHECK: cost of 8 for instruction:   %cmp = icmp eq <16 x i64> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <16 x i1> %cmp to <16 x i64>
+}
+
+define <16 x i8> @fun112(<16 x float> %val1, <16 x float> %val2) {
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %v
+
+; CHECK: fun112
+; CHECK: cost of 40 for instruction:   %cmp = fcmp ogt <16 x float> %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = sext <16 x i1> %cmp to <16 x i8>
+}
+
+define <16 x i16> @fun113(<16 x float> %val1, <16 x float> %val2) {
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i16>
+  ret <16 x i16> %v
+
+; CHECK: fun113
+; CHECK: cost of 40 for instruction:   %cmp = fcmp ogt <16 x float> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = sext <16 x i1> %cmp to <16 x i16>
+}
+
+define <16 x i32> @fun114(<16 x float> %val1, <16 x float> %val2) {
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i32>
+  ret <16 x i32> %v
+
+; CHECK: fun114
+; CHECK: cost of 40 for instruction:   %cmp = fcmp ogt <16 x float> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <16 x i1> %cmp to <16 x i32>
+}
+
+define <16 x i64> @fun115(<16 x float> %val1, <16 x float> %val2) {
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i64>
+  ret <16 x i64> %v
+
+; CHECK: fun115
+; CHECK: cost of 40 for instruction:   %cmp = fcmp ogt <16 x float> %val1, %val2
+; CHECK: cost of 15 for instruction:   %v = sext <16 x i1> %cmp to <16 x i64>
+}
+
+define <16 x i8> @fun116(<16 x double> %val1, <16 x double> %val2) {
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %v
+
+; CHECK: fun116
+; CHECK: cost of 8 for instruction:   %cmp = fcmp ogt <16 x double> %val1, %val2
+; CHECK: cost of 7 for instruction:   %v = sext <16 x i1> %cmp to <16 x i8>
+}
+
+define <16 x i16> @fun117(<16 x double> %val1, <16 x double> %val2) {
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i16>
+  ret <16 x i16> %v
+
+; CHECK: fun117
+; CHECK: cost of 8 for instruction:   %cmp = fcmp ogt <16 x double> %val1, %val2
+; CHECK: cost of 6 for instruction:   %v = sext <16 x i1> %cmp to <16 x i16>
+}
+
+define <16 x i32> @fun118(<16 x double> %val1, <16 x double> %val2) {
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i32>
+  ret <16 x i32> %v
+
+; CHECK: fun118
+; CHECK: cost of 8 for instruction:   %cmp = fcmp ogt <16 x double> %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = sext <16 x i1> %cmp to <16 x i32>
+}
+
+define <16 x i64> @fun119(<16 x double> %val1, <16 x double> %val2) {
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %v = sext <16 x i1> %cmp to <16 x i64>
+  ret <16 x i64> %v
+
+; CHECK: fun119
+; CHECK: cost of 8 for instruction:   %cmp = fcmp ogt <16 x double> %val1, %val2
+; CHECK: cost of 0 for instruction:   %v = sext <16 x i1> %cmp to <16 x i64>
+}
+
+define i8 @fun120(i8 %val1, i8 %val2) {
+  %cmp = icmp eq i8 %val1, %val2
+  %v = zext i1 %cmp to i8
+  ret i8 %v
+
+; CHECK: fun120
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = zext i1 %cmp to i8
+}
+
+define i16 @fun121(i8 %val1, i8 %val2) {
+  %cmp = icmp eq i8 %val1, %val2
+  %v = zext i1 %cmp to i16
+  ret i16 %v
+
+; CHECK: fun121
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = zext i1 %cmp to i16
+}
+
+define i32 @fun122(i8 %val1, i8 %val2) {
+  %cmp = icmp eq i8 %val1, %val2
+  %v = zext i1 %cmp to i32
+  ret i32 %v
+
+; CHECK: fun122
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = zext i1 %cmp to i32
+}
+
+define i64 @fun123(i8 %val1, i8 %val2) {
+  %cmp = icmp eq i8 %val1, %val2
+  %v = zext i1 %cmp to i64
+  ret i64 %v
+
+; CHECK: fun123
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = zext i1 %cmp to i64
+}
+
+define i8 @fun124(i16 %val1, i16 %val2) {
+  %cmp = icmp eq i16 %val1, %val2
+  %v = zext i1 %cmp to i8
+  ret i8 %v
+
+; CHECK: fun124
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = zext i1 %cmp to i8
+}
+
+define i16 @fun125(i16 %val1, i16 %val2) {
+  %cmp = icmp eq i16 %val1, %val2
+  %v = zext i1 %cmp to i16
+  ret i16 %v
+
+; CHECK: fun125
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = zext i1 %cmp to i16
+}
+
+define i32 @fun126(i16 %val1, i16 %val2) {
+  %cmp = icmp eq i16 %val1, %val2
+  %v = zext i1 %cmp to i32
+  ret i32 %v
+
+; CHECK: fun126
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = zext i1 %cmp to i32
+}
+
+define i64 @fun127(i16 %val1, i16 %val2) {
+  %cmp = icmp eq i16 %val1, %val2
+  %v = zext i1 %cmp to i64
+  ret i64 %v
+
+; CHECK: fun127
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = zext i1 %cmp to i64
+}
+
+define i8 @fun128(i32 %val1, i32 %val2) {
+  %cmp = icmp eq i32 %val1, %val2
+  %v = zext i1 %cmp to i8
+  ret i8 %v
+
+; CHECK: fun128
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i32 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = zext i1 %cmp to i8
+}
+
+define i16 @fun129(i32 %val1, i32 %val2) {
+  %cmp = icmp eq i32 %val1, %val2
+  %v = zext i1 %cmp to i16
+  ret i16 %v
+
+; CHECK: fun129
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i32 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = zext i1 %cmp to i16
+}
+
+define i32 @fun130(i32 %val1, i32 %val2) {
+  %cmp = icmp eq i32 %val1, %val2
+  %v = zext i1 %cmp to i32
+  ret i32 %v
+
+; CHECK: fun130
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i32 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = zext i1 %cmp to i32
+}
+
+define i64 @fun131(i32 %val1, i32 %val2) {
+  %cmp = icmp eq i32 %val1, %val2
+  %v = zext i1 %cmp to i64
+  ret i64 %v
+
+; CHECK: fun131
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i32 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = zext i1 %cmp to i64
+}
+
+define i8 @fun132(i64 %val1, i64 %val2) {
+  %cmp = icmp eq i64 %val1, %val2
+  %v = zext i1 %cmp to i8
+  ret i8 %v
+
+; CHECK: fun132
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i64 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = zext i1 %cmp to i8
+}
+
+define i16 @fun133(i64 %val1, i64 %val2) {
+  %cmp = icmp eq i64 %val1, %val2
+  %v = zext i1 %cmp to i16
+  ret i16 %v
+
+; CHECK: fun133
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i64 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = zext i1 %cmp to i16
+}
+
+define i32 @fun134(i64 %val1, i64 %val2) {
+  %cmp = icmp eq i64 %val1, %val2
+  %v = zext i1 %cmp to i32
+  ret i32 %v
+
+; CHECK: fun134
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i64 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = zext i1 %cmp to i32
+}
+
+define i64 @fun135(i64 %val1, i64 %val2) {
+  %cmp = icmp eq i64 %val1, %val2
+  %v = zext i1 %cmp to i64
+  ret i64 %v
+
+; CHECK: fun135
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i64 %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = zext i1 %cmp to i64
+}
+
+define i8 @fun136(float %val1, float %val2) {
+  %cmp = fcmp ogt float %val1, %val2
+  %v = zext i1 %cmp to i8
+  ret i8 %v
+
+; CHECK: fun136
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt float %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext i1 %cmp to i8
+}
+
+define i16 @fun137(float %val1, float %val2) {
+  %cmp = fcmp ogt float %val1, %val2
+  %v = zext i1 %cmp to i16
+  ret i16 %v
+
+; CHECK: fun137
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt float %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext i1 %cmp to i16
+}
+
+define i32 @fun138(float %val1, float %val2) {
+  %cmp = fcmp ogt float %val1, %val2
+  %v = zext i1 %cmp to i32
+  ret i32 %v
+
+; CHECK: fun138
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt float %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext i1 %cmp to i32
+}
+
+define i64 @fun139(float %val1, float %val2) {
+  %cmp = fcmp ogt float %val1, %val2
+  %v = zext i1 %cmp to i64
+  ret i64 %v
+
+; CHECK: fun139
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt float %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext i1 %cmp to i64
+}
+
+define i8 @fun140(double %val1, double %val2) {
+  %cmp = fcmp ogt double %val1, %val2
+  %v = zext i1 %cmp to i8
+  ret i8 %v
+
+; CHECK: fun140
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt double %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext i1 %cmp to i8
+}
+
+define i16 @fun141(double %val1, double %val2) {
+  %cmp = fcmp ogt double %val1, %val2
+  %v = zext i1 %cmp to i16
+  ret i16 %v
+
+; CHECK: fun141
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt double %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext i1 %cmp to i16
+}
+
+define i32 @fun142(double %val1, double %val2) {
+  %cmp = fcmp ogt double %val1, %val2
+  %v = zext i1 %cmp to i32
+  ret i32 %v
+
+; CHECK: fun142
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt double %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext i1 %cmp to i32
+}
+
+define i64 @fun143(double %val1, double %val2) {
+  %cmp = fcmp ogt double %val1, %val2
+  %v = zext i1 %cmp to i64
+  ret i64 %v
+
+; CHECK: fun143
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt double %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext i1 %cmp to i64
+}
+
+define <2 x i8> @fun144(<2 x i8> %val1, <2 x i8> %val2) {
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i8>
+  ret <2 x i8> %v
+
+; CHECK: fun144
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i8> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = zext <2 x i1> %cmp to <2 x i8>
+}
+
+define <2 x i16> @fun145(<2 x i8> %val1, <2 x i8> %val2) {
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i16>
+  ret <2 x i16> %v
+
+; CHECK: fun145
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i8> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <2 x i1> %cmp to <2 x i16>
+}
+
+define <2 x i32> @fun146(<2 x i8> %val1, <2 x i8> %val2) {
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i32>
+  ret <2 x i32> %v
+
+; CHECK: fun146
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i8> %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = zext <2 x i1> %cmp to <2 x i32>
+}
+
+define <2 x i64> @fun147(<2 x i8> %val1, <2 x i8> %val2) {
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %v
+
+; CHECK: fun147
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i8> %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext <2 x i1> %cmp to <2 x i64>
+}
+
+define <2 x i8> @fun148(<2 x i16> %val1, <2 x i16> %val2) {
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i8>
+  ret <2 x i8> %v
+
+; CHECK: fun148
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i16> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <2 x i1> %cmp to <2 x i8>
+}
+
+define <2 x i16> @fun149(<2 x i16> %val1, <2 x i16> %val2) {
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i16>
+  ret <2 x i16> %v
+
+; CHECK: fun149
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i16> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = zext <2 x i1> %cmp to <2 x i16>
+}
+
+define <2 x i32> @fun150(<2 x i16> %val1, <2 x i16> %val2) {
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i32>
+  ret <2 x i32> %v
+
+; CHECK: fun150
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i16> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <2 x i1> %cmp to <2 x i32>
+}
+
+define <2 x i64> @fun151(<2 x i16> %val1, <2 x i16> %val2) {
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %v
+
+; CHECK: fun151
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i16> %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = zext <2 x i1> %cmp to <2 x i64>
+}
+
+define <2 x i8> @fun152(<2 x i32> %val1, <2 x i32> %val2) {
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i8>
+  ret <2 x i8> %v
+
+; CHECK: fun152
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i32> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <2 x i1> %cmp to <2 x i8>
+}
+
+define <2 x i16> @fun153(<2 x i32> %val1, <2 x i32> %val2) {
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i16>
+  ret <2 x i16> %v
+
+; CHECK: fun153
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i32> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <2 x i1> %cmp to <2 x i16>
+}
+
+define <2 x i32> @fun154(<2 x i32> %val1, <2 x i32> %val2) {
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i32>
+  ret <2 x i32> %v
+
+; CHECK: fun154
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i32> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = zext <2 x i1> %cmp to <2 x i32>
+}
+
+define <2 x i64> @fun155(<2 x i32> %val1, <2 x i32> %val2) {
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %v
+
+; CHECK: fun155
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i32> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <2 x i1> %cmp to <2 x i64>
+}
+
+define <2 x i8> @fun156(<2 x i64> %val1, <2 x i64> %val2) {
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i8>
+  ret <2 x i8> %v
+
+; CHECK: fun156
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <2 x i1> %cmp to <2 x i8>
+}
+
+define <2 x i16> @fun157(<2 x i64> %val1, <2 x i64> %val2) {
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i16>
+  ret <2 x i16> %v
+
+; CHECK: fun157
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <2 x i1> %cmp to <2 x i16>
+}
+
+define <2 x i32> @fun158(<2 x i64> %val1, <2 x i64> %val2) {
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i32>
+  ret <2 x i32> %v
+
+; CHECK: fun158
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <2 x i1> %cmp to <2 x i32>
+}
+
+define <2 x i64> @fun159(<2 x i64> %val1, <2 x i64> %val2) {
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %v
+
+; CHECK: fun159
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i64> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = zext <2 x i1> %cmp to <2 x i64>
+}
+
+define <2 x i8> @fun160(<2 x float> %val1, <2 x float> %val2) {
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i8>
+  ret <2 x i8> %v
+
+; CHECK: fun160
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <2 x float> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <2 x i1> %cmp to <2 x i8>
+}
+
+define <2 x i16> @fun161(<2 x float> %val1, <2 x float> %val2) {
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i16>
+  ret <2 x i16> %v
+
+; CHECK: fun161
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <2 x float> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <2 x i1> %cmp to <2 x i16>
+}
+
+define <2 x i32> @fun162(<2 x float> %val1, <2 x float> %val2) {
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i32>
+  ret <2 x i32> %v
+
+; CHECK: fun162
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <2 x float> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = zext <2 x i1> %cmp to <2 x i32>
+}
+
+define <2 x i64> @fun163(<2 x float> %val1, <2 x float> %val2) {
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %v
+
+; CHECK: fun163
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <2 x float> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <2 x i1> %cmp to <2 x i64>
+}
+
+define <2 x i8> @fun164(<2 x double> %val1, <2 x double> %val2) {
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i8>
+  ret <2 x i8> %v
+
+; CHECK: fun164
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt <2 x double> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <2 x i1> %cmp to <2 x i8>
+}
+
+define <2 x i16> @fun165(<2 x double> %val1, <2 x double> %val2) {
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i16>
+  ret <2 x i16> %v
+
+; CHECK: fun165
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt <2 x double> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <2 x i1> %cmp to <2 x i16>
+}
+
+define <2 x i32> @fun166(<2 x double> %val1, <2 x double> %val2) {
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i32>
+  ret <2 x i32> %v
+
+; CHECK: fun166
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt <2 x double> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <2 x i1> %cmp to <2 x i32>
+}
+
+define <2 x i64> @fun167(<2 x double> %val1, <2 x double> %val2) {
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %v = zext <2 x i1> %cmp to <2 x i64>
+  ret <2 x i64> %v
+
+; CHECK: fun167
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt <2 x double> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = zext <2 x i1> %cmp to <2 x i64>
+}
+
+define <4 x i8> @fun168(<4 x i8> %val1, <4 x i8> %val2) {
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i8>
+  ret <4 x i8> %v
+
+; CHECK: fun168
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i8> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = zext <4 x i1> %cmp to <4 x i8>
+}
+
+define <4 x i16> @fun169(<4 x i8> %val1, <4 x i8> %val2) {
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i16>
+  ret <4 x i16> %v
+
+; CHECK: fun169
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i8> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <4 x i1> %cmp to <4 x i16>
+}
+
+define <4 x i32> @fun170(<4 x i8> %val1, <4 x i8> %val2) {
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %v
+
+; CHECK: fun170
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i8> %val1, %val2
+; CHECK: cost of 3 for instruction:   %v = zext <4 x i1> %cmp to <4 x i32>
+}
+
+define <4 x i64> @fun171(<4 x i8> %val1, <4 x i8> %val2) {
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i64>
+  ret <4 x i64> %v
+
+; CHECK: fun171
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i8> %val1, %val2
+; CHECK: cost of 9 for instruction:   %v = zext <4 x i1> %cmp to <4 x i64>
+}
+
+define <4 x i8> @fun172(<4 x i16> %val1, <4 x i16> %val2) {
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i8>
+  ret <4 x i8> %v
+
+; CHECK: fun172
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i16> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <4 x i1> %cmp to <4 x i8>
+}
+
+define <4 x i16> @fun173(<4 x i16> %val1, <4 x i16> %val2) {
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i16>
+  ret <4 x i16> %v
+
+; CHECK: fun173
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i16> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = zext <4 x i1> %cmp to <4 x i16>
+}
+
+define <4 x i32> @fun174(<4 x i16> %val1, <4 x i16> %val2) {
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %v
+
+; CHECK: fun174
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i16> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <4 x i1> %cmp to <4 x i32>
+}
+
+define <4 x i64> @fun175(<4 x i16> %val1, <4 x i16> %val2) {
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i64>
+  ret <4 x i64> %v
+
+; CHECK: fun175
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i16> %val1, %val2
+; CHECK: cost of 7 for instruction:   %v = zext <4 x i1> %cmp to <4 x i64>
+}
+
+define <4 x i8> @fun176(<4 x i32> %val1, <4 x i32> %val2) {
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i8>
+  ret <4 x i8> %v
+
+; CHECK: fun176
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i32> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <4 x i1> %cmp to <4 x i8>
+}
+
+define <4 x i16> @fun177(<4 x i32> %val1, <4 x i32> %val2) {
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i16>
+  ret <4 x i16> %v
+
+; CHECK: fun177
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i32> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <4 x i1> %cmp to <4 x i16>
+}
+
+define <4 x i32> @fun178(<4 x i32> %val1, <4 x i32> %val2) {
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %v
+
+; CHECK: fun178
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i32> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = zext <4 x i1> %cmp to <4 x i32>
+}
+
+define <4 x i64> @fun179(<4 x i32> %val1, <4 x i32> %val2) {
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i64>
+  ret <4 x i64> %v
+
+; CHECK: fun179
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i32> %val1, %val2
+; CHECK: cost of 5 for instruction:   %v = zext <4 x i1> %cmp to <4 x i64>
+}
+
+define <4 x i8> @fun180(<4 x i64> %val1, <4 x i64> %val2) {
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i8>
+  ret <4 x i8> %v
+
+; CHECK: fun180
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <4 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <4 x i1> %cmp to <4 x i8>
+}
+
+define <4 x i16> @fun181(<4 x i64> %val1, <4 x i64> %val2) {
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i16>
+  ret <4 x i16> %v
+
+; CHECK: fun181
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <4 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <4 x i1> %cmp to <4 x i16>
+}
+
+define <4 x i32> @fun182(<4 x i64> %val1, <4 x i64> %val2) {
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %v
+
+; CHECK: fun182
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <4 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <4 x i1> %cmp to <4 x i32>
+}
+
+define <4 x i64> @fun183(<4 x i64> %val1, <4 x i64> %val2) {
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i64>
+  ret <4 x i64> %v
+
+; CHECK: fun183
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <4 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <4 x i1> %cmp to <4 x i64>
+}
+
+define <4 x i8> @fun184(<4 x float> %val1, <4 x float> %val2) {
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i8>
+  ret <4 x i8> %v
+
+; CHECK: fun184
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <4 x float> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <4 x i1> %cmp to <4 x i8>
+}
+
+define <4 x i16> @fun185(<4 x float> %val1, <4 x float> %val2) {
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i16>
+  ret <4 x i16> %v
+
+; CHECK: fun185
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <4 x float> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <4 x i1> %cmp to <4 x i16>
+}
+
+define <4 x i32> @fun186(<4 x float> %val1, <4 x float> %val2) {
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %v
+
+; CHECK: fun186
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <4 x float> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = zext <4 x i1> %cmp to <4 x i32>
+}
+
+define <4 x i64> @fun187(<4 x float> %val1, <4 x float> %val2) {
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i64>
+  ret <4 x i64> %v
+
+; CHECK: fun187
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <4 x float> %val1, %val2
+; CHECK: cost of 5 for instruction:   %v = zext <4 x i1> %cmp to <4 x i64>
+}
+
+define <4 x i8> @fun188(<4 x double> %val1, <4 x double> %val2) {
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i8>
+  ret <4 x i8> %v
+
+; CHECK: fun188
+; CHECK: cost of 2 for instruction:   %cmp = fcmp ogt <4 x double> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <4 x i1> %cmp to <4 x i8>
+}
+
+define <4 x i16> @fun189(<4 x double> %val1, <4 x double> %val2) {
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i16>
+  ret <4 x i16> %v
+
+; CHECK: fun189
+; CHECK: cost of 2 for instruction:   %cmp = fcmp ogt <4 x double> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <4 x i1> %cmp to <4 x i16>
+}
+
+define <4 x i32> @fun190(<4 x double> %val1, <4 x double> %val2) {
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i32>
+  ret <4 x i32> %v
+
+; CHECK: fun190
+; CHECK: cost of 2 for instruction:   %cmp = fcmp ogt <4 x double> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <4 x i1> %cmp to <4 x i32>
+}
+
+define <4 x i64> @fun191(<4 x double> %val1, <4 x double> %val2) {
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %v = zext <4 x i1> %cmp to <4 x i64>
+  ret <4 x i64> %v
+
+; CHECK: fun191
+; CHECK: cost of 2 for instruction:   %cmp = fcmp ogt <4 x double> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <4 x i1> %cmp to <4 x i64>
+}
+
+define <8 x i8> @fun192(<8 x i8> %val1, <8 x i8> %val2) {
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i8>
+  ret <8 x i8> %v
+
+; CHECK: fun192
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i8> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = zext <8 x i1> %cmp to <8 x i8>
+}
+
+define <8 x i16> @fun193(<8 x i8> %val1, <8 x i8> %val2) {
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %v
+
+; CHECK: fun193
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i8> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <8 x i1> %cmp to <8 x i16>
+}
+
+define <8 x i32> @fun194(<8 x i8> %val1, <8 x i8> %val2) {
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i32>
+  ret <8 x i32> %v
+
+; CHECK: fun194
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i8> %val1, %val2
+; CHECK: cost of 7 for instruction:   %v = zext <8 x i1> %cmp to <8 x i32>
+}
+
+define <8 x i64> @fun195(<8 x i8> %val1, <8 x i8> %val2) {
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i64>
+  ret <8 x i64> %v
+
+; CHECK: fun195
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i8> %val1, %val2
+; CHECK: cost of 19 for instruction:   %v = zext <8 x i1> %cmp to <8 x i64>
+}
+
+define <8 x i8> @fun196(<8 x i16> %val1, <8 x i16> %val2) {
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i8>
+  ret <8 x i8> %v
+
+; CHECK: fun196
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i16> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <8 x i1> %cmp to <8 x i8>
+}
+
+define <8 x i16> @fun197(<8 x i16> %val1, <8 x i16> %val2) {
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %v
+
+; CHECK: fun197
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i16> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = zext <8 x i1> %cmp to <8 x i16>
+}
+
+define <8 x i32> @fun198(<8 x i16> %val1, <8 x i16> %val2) {
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i32>
+  ret <8 x i32> %v
+
+; CHECK: fun198
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i16> %val1, %val2
+; CHECK: cost of 5 for instruction:   %v = zext <8 x i1> %cmp to <8 x i32>
+}
+
+define <8 x i64> @fun199(<8 x i16> %val1, <8 x i16> %val2) {
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i64>
+  ret <8 x i64> %v
+
+; CHECK: fun199
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i16> %val1, %val2
+; CHECK: cost of 15 for instruction:   %v = zext <8 x i1> %cmp to <8 x i64>
+}
+
+define <8 x i8> @fun200(<8 x i32> %val1, <8 x i32> %val2) {
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i8>
+  ret <8 x i8> %v
+
+; CHECK: fun200
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <8 x i32> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <8 x i1> %cmp to <8 x i8>
+}
+
+define <8 x i16> @fun201(<8 x i32> %val1, <8 x i32> %val2) {
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %v
+
+; CHECK: fun201
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <8 x i32> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <8 x i1> %cmp to <8 x i16>
+}
+
+define <8 x i32> @fun202(<8 x i32> %val1, <8 x i32> %val2) {
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i32>
+  ret <8 x i32> %v
+
+; CHECK: fun202
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <8 x i32> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <8 x i1> %cmp to <8 x i32>
+}
+
+define <8 x i64> @fun203(<8 x i32> %val1, <8 x i32> %val2) {
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i64>
+  ret <8 x i64> %v
+
+; CHECK: fun203
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <8 x i32> %val1, %val2
+; CHECK: cost of 11 for instruction:   %v = zext <8 x i1> %cmp to <8 x i64>
+}
+
+define <8 x i8> @fun204(<8 x i64> %val1, <8 x i64> %val2) {
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i8>
+  ret <8 x i8> %v
+
+; CHECK: fun204
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <8 x i64> %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext <8 x i1> %cmp to <8 x i8>
+}
+
+define <8 x i16> @fun205(<8 x i64> %val1, <8 x i64> %val2) {
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %v
+
+; CHECK: fun205
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <8 x i64> %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext <8 x i1> %cmp to <8 x i16>
+}
+
+define <8 x i32> @fun206(<8 x i64> %val1, <8 x i64> %val2) {
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i32>
+  ret <8 x i32> %v
+
+; CHECK: fun206
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <8 x i64> %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext <8 x i1> %cmp to <8 x i32>
+}
+
+define <8 x i64> @fun207(<8 x i64> %val1, <8 x i64> %val2) {
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i64>
+  ret <8 x i64> %v
+
+; CHECK: fun207
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <8 x i64> %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext <8 x i1> %cmp to <8 x i64>
+}
+
+define <8 x i8> @fun208(<8 x float> %val1, <8 x float> %val2) {
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i8>
+  ret <8 x i8> %v
+
+; CHECK: fun208
+; CHECK: cost of 20 for instruction:   %cmp = fcmp ogt <8 x float> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <8 x i1> %cmp to <8 x i8>
+}
+
+define <8 x i16> @fun209(<8 x float> %val1, <8 x float> %val2) {
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %v
+
+; CHECK: fun209
+; CHECK: cost of 20 for instruction:   %cmp = fcmp ogt <8 x float> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <8 x i1> %cmp to <8 x i16>
+}
+
+define <8 x i32> @fun210(<8 x float> %val1, <8 x float> %val2) {
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i32>
+  ret <8 x i32> %v
+
+; CHECK: fun210
+; CHECK: cost of 20 for instruction:   %cmp = fcmp ogt <8 x float> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <8 x i1> %cmp to <8 x i32>
+}
+
+define <8 x i64> @fun211(<8 x float> %val1, <8 x float> %val2) {
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i64>
+  ret <8 x i64> %v
+
+; CHECK: fun211
+; CHECK: cost of 20 for instruction:   %cmp = fcmp ogt <8 x float> %val1, %val2
+; CHECK: cost of 11 for instruction:   %v = zext <8 x i1> %cmp to <8 x i64>
+}
+
+define <8 x i8> @fun212(<8 x double> %val1, <8 x double> %val2) {
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i8>
+  ret <8 x i8> %v
+
+; CHECK: fun212
+; CHECK: cost of 4 for instruction:   %cmp = fcmp ogt <8 x double> %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext <8 x i1> %cmp to <8 x i8>
+}
+
+define <8 x i16> @fun213(<8 x double> %val1, <8 x double> %val2) {
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i16>
+  ret <8 x i16> %v
+
+; CHECK: fun213
+; CHECK: cost of 4 for instruction:   %cmp = fcmp ogt <8 x double> %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext <8 x i1> %cmp to <8 x i16>
+}
+
+define <8 x i32> @fun214(<8 x double> %val1, <8 x double> %val2) {
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i32>
+  ret <8 x i32> %v
+
+; CHECK: fun214
+; CHECK: cost of 4 for instruction:   %cmp = fcmp ogt <8 x double> %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext <8 x i1> %cmp to <8 x i32>
+}
+
+define <8 x i64> @fun215(<8 x double> %val1, <8 x double> %val2) {
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %v = zext <8 x i1> %cmp to <8 x i64>
+  ret <8 x i64> %v
+
+; CHECK: fun215
+; CHECK: cost of 4 for instruction:   %cmp = fcmp ogt <8 x double> %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext <8 x i1> %cmp to <8 x i64>
+}
+
+define <16 x i8> @fun216(<16 x i8> %val1, <16 x i8> %val2) {
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %v
+
+; CHECK: fun216
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <16 x i8> %val1, %val2
+; CHECK: cost of 1 for instruction:   %v = zext <16 x i1> %cmp to <16 x i8>
+}
+
+define <16 x i16> @fun217(<16 x i8> %val1, <16 x i8> %val2) {
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i16>
+  ret <16 x i16> %v
+
+; CHECK: fun217
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <16 x i8> %val1, %val2
+; CHECK: cost of 5 for instruction:   %v = zext <16 x i1> %cmp to <16 x i16>
+}
+
+define <16 x i32> @fun218(<16 x i8> %val1, <16 x i8> %val2) {
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i32>
+  ret <16 x i32> %v
+
+; CHECK: fun218
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <16 x i8> %val1, %val2
+; CHECK: cost of 15 for instruction:   %v = zext <16 x i1> %cmp to <16 x i32>
+}
+
+define <16 x i64> @fun219(<16 x i8> %val1, <16 x i8> %val2) {
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i64>
+  ret <16 x i64> %v
+
+; CHECK: fun219
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <16 x i8> %val1, %val2
+; CHECK: cost of 39 for instruction:   %v = zext <16 x i1> %cmp to <16 x i64>
+}
+
+define <16 x i8> @fun220(<16 x i16> %val1, <16 x i16> %val2) {
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %v
+
+; CHECK: fun220
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <16 x i16> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <16 x i1> %cmp to <16 x i8>
+}
+
+define <16 x i16> @fun221(<16 x i16> %val1, <16 x i16> %val2) {
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i16>
+  ret <16 x i16> %v
+
+; CHECK: fun221
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <16 x i16> %val1, %val2
+; CHECK: cost of 2 for instruction:   %v = zext <16 x i1> %cmp to <16 x i16>
+}
+
+define <16 x i32> @fun222(<16 x i16> %val1, <16 x i16> %val2) {
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i32>
+  ret <16 x i32> %v
+
+; CHECK: fun222
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <16 x i16> %val1, %val2
+; CHECK: cost of 11 for instruction:   %v = zext <16 x i1> %cmp to <16 x i32>
+}
+
+define <16 x i64> @fun223(<16 x i16> %val1, <16 x i16> %val2) {
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i64>
+  ret <16 x i64> %v
+
+; CHECK: fun223
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <16 x i16> %val1, %val2
+; CHECK: cost of 31 for instruction:   %v = zext <16 x i1> %cmp to <16 x i64>
+}
+
+define <16 x i8> @fun224(<16 x i32> %val1, <16 x i32> %val2) {
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %v
+
+; CHECK: fun224
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <16 x i32> %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext <16 x i1> %cmp to <16 x i8>
+}
+
+define <16 x i16> @fun225(<16 x i32> %val1, <16 x i32> %val2) {
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i16>
+  ret <16 x i16> %v
+
+; CHECK: fun225
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <16 x i32> %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext <16 x i1> %cmp to <16 x i16>
+}
+
+define <16 x i32> @fun226(<16 x i32> %val1, <16 x i32> %val2) {
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i32>
+  ret <16 x i32> %v
+
+; CHECK: fun226
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <16 x i32> %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext <16 x i1> %cmp to <16 x i32>
+}
+
+define <16 x i64> @fun227(<16 x i32> %val1, <16 x i32> %val2) {
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i64>
+  ret <16 x i64> %v
+
+; CHECK: fun227
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <16 x i32> %val1, %val2
+; CHECK: cost of 23 for instruction:   %v = zext <16 x i1> %cmp to <16 x i64>
+}
+
+define <16 x i8> @fun228(<16 x i64> %val1, <16 x i64> %val2) {
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %v
+
+; CHECK: fun228
+; CHECK: cost of 8 for instruction:   %cmp = icmp eq <16 x i64> %val1, %val2
+; CHECK: cost of 8 for instruction:   %v = zext <16 x i1> %cmp to <16 x i8>
+}
+
+define <16 x i16> @fun229(<16 x i64> %val1, <16 x i64> %val2) {
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i16>
+  ret <16 x i16> %v
+
+; CHECK: fun229
+; CHECK: cost of 8 for instruction:   %cmp = icmp eq <16 x i64> %val1, %val2
+; CHECK: cost of 8 for instruction:   %v = zext <16 x i1> %cmp to <16 x i16>
+}
+
+define <16 x i32> @fun230(<16 x i64> %val1, <16 x i64> %val2) {
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i32>
+  ret <16 x i32> %v
+
+; CHECK: fun230
+; CHECK: cost of 8 for instruction:   %cmp = icmp eq <16 x i64> %val1, %val2
+; CHECK: cost of 8 for instruction:   %v = zext <16 x i1> %cmp to <16 x i32>
+}
+
+define <16 x i64> @fun231(<16 x i64> %val1, <16 x i64> %val2) {
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i64>
+  ret <16 x i64> %v
+
+; CHECK: fun231
+; CHECK: cost of 8 for instruction:   %cmp = icmp eq <16 x i64> %val1, %val2
+; CHECK: cost of 8 for instruction:   %v = zext <16 x i1> %cmp to <16 x i64>
+}
+
+define <16 x i8> @fun232(<16 x float> %val1, <16 x float> %val2) {
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %v
+
+; CHECK: fun232
+; CHECK: cost of 40 for instruction:   %cmp = fcmp ogt <16 x float> %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext <16 x i1> %cmp to <16 x i8>
+}
+
+define <16 x i16> @fun233(<16 x float> %val1, <16 x float> %val2) {
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i16>
+  ret <16 x i16> %v
+
+; CHECK: fun233
+; CHECK: cost of 40 for instruction:   %cmp = fcmp ogt <16 x float> %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext <16 x i1> %cmp to <16 x i16>
+}
+
+define <16 x i32> @fun234(<16 x float> %val1, <16 x float> %val2) {
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i32>
+  ret <16 x i32> %v
+
+; CHECK: fun234
+; CHECK: cost of 40 for instruction:   %cmp = fcmp ogt <16 x float> %val1, %val2
+; CHECK: cost of 4 for instruction:   %v = zext <16 x i1> %cmp to <16 x i32>
+}
+
+define <16 x i64> @fun235(<16 x float> %val1, <16 x float> %val2) {
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i64>
+  ret <16 x i64> %v
+
+; CHECK: fun235
+; CHECK: cost of 40 for instruction:   %cmp = fcmp ogt <16 x float> %val1, %val2
+; CHECK: cost of 23 for instruction:   %v = zext <16 x i1> %cmp to <16 x i64>
+}
+
+define <16 x i8> @fun236(<16 x double> %val1, <16 x double> %val2) {
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i8>
+  ret <16 x i8> %v
+
+; CHECK: fun236
+; CHECK: cost of 8 for instruction:   %cmp = fcmp ogt <16 x double> %val1, %val2
+; CHECK: cost of 8 for instruction:   %v = zext <16 x i1> %cmp to <16 x i8>
+}
+
+define <16 x i16> @fun237(<16 x double> %val1, <16 x double> %val2) {
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i16>
+  ret <16 x i16> %v
+
+; CHECK: fun237
+; CHECK: cost of 8 for instruction:   %cmp = fcmp ogt <16 x double> %val1, %val2
+; CHECK: cost of 8 for instruction:   %v = zext <16 x i1> %cmp to <16 x i16>
+}
+
+define <16 x i32> @fun238(<16 x double> %val1, <16 x double> %val2) {
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i32>
+  ret <16 x i32> %v
+
+; CHECK: fun238
+; CHECK: cost of 8 for instruction:   %cmp = fcmp ogt <16 x double> %val1, %val2
+; CHECK: cost of 8 for instruction:   %v = zext <16 x i1> %cmp to <16 x i32>
+}
+
+define <16 x i64> @fun239(<16 x double> %val1, <16 x double> %val2) {
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %v = zext <16 x i1> %cmp to <16 x i64>
+  ret <16 x i64> %v
+
+; CHECK: fun239
+; CHECK: cost of 8 for instruction:   %cmp = fcmp ogt <16 x double> %val1, %val2
+; CHECK: cost of 8 for instruction:   %v = zext <16 x i1> %cmp to <16 x i64>
+}
+
diff --git a/test/Analysis/CostModel/SystemZ/cmpsel.ll b/test/Analysis/CostModel/SystemZ/cmpsel.ll
new file mode 100644
index 0000000000000000000000000000000000000000..de72ec3a8b4ea69478302444fceaba9806e8e28e
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/cmpsel.ll
@@ -0,0 +1,1987 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+; Note: Cost estimates for a select of an fp type are somewhat arbitrary, since
+; it involves a conditional jump.
+; Note: Vector fp32 is not directly supported, and its estimates are not quite
+; exact (but they are large absolute values).
+
+define i8 @fun0(i8 %val1, i8 %val2,
+                i8 %val3, i8 %val4) {
+  %cmp = icmp eq i8 %val1, %val2
+  %sel = select i1 %cmp, i8 %val3, i8 %val4
+  ret i8 %sel
+; Expected costs: 'icmp eq' on i8, then a select between i8 values on its result.
+; CHECK: fun0
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i8 %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i8 %val3, i8 %val4
+}
+
+define i16 @fun1(i8 %val1, i8 %val2,
+                 i16 %val3, i16 %val4) {
+  %cmp = icmp eq i8 %val1, %val2
+  %sel = select i1 %cmp, i16 %val3, i16 %val4
+  ret i16 %sel
+; Expected costs: 'icmp eq' on i8, then a select between i16 values on its result.
+; CHECK: fun1
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i8 %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i16 %val3, i16 %val4
+}
+
+define i32 @fun2(i8 %val1, i8 %val2,
+                 i32 %val3, i32 %val4) {
+  %cmp = icmp eq i8 %val1, %val2
+  %sel = select i1 %cmp, i32 %val3, i32 %val4
+  ret i32 %sel
+; Expected costs: 'icmp eq' on i8, then a select between i32 values on its result.
+; CHECK: fun2
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i8 %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i32 %val3, i32 %val4
+}
+
+define i64 @fun3(i8 %val1, i8 %val2,
+                 i64 %val3, i64 %val4) {
+  %cmp = icmp eq i8 %val1, %val2
+  %sel = select i1 %cmp, i64 %val3, i64 %val4
+  ret i64 %sel
+; Expected costs: 'icmp eq' on i8, then a select between i64 values on its result.
+; CHECK: fun3
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i8 %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i64 %val3, i64 %val4
+}
+
+define float @fun4(i8 %val1, i8 %val2,
+                   float %val3, float %val4) {
+  %cmp = icmp eq i8 %val1, %val2
+  %sel = select i1 %cmp, float %val3, float %val4
+  ret float %sel
+; Expected costs: 'icmp eq' on i8, then a select between float values on its result.
+; CHECK: fun4
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i8 %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select i1 %cmp, float %val3, float %val4
+}
+
+define double @fun5(i8 %val1, i8 %val2,
+                    double %val3, double %val4) {
+  %cmp = icmp eq i8 %val1, %val2
+  %sel = select i1 %cmp, double %val3, double %val4
+  ret double %sel
+; Expected costs: 'icmp eq' on i8, then a select between double values on its result.
+; CHECK: fun5
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i8 %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select i1 %cmp, double %val3, double %val4
+}
+
+define i8 @fun6(i16 %val1, i16 %val2,
+                i8 %val3, i8 %val4) {
+  %cmp = icmp eq i16 %val1, %val2
+  %sel = select i1 %cmp, i8 %val3, i8 %val4
+  ret i8 %sel
+; Expected costs: 'icmp eq' on i16, then a select between i8 values on its result.
+; CHECK: fun6
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i16 %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i8 %val3, i8 %val4
+}
+
+define i16 @fun7(i16 %val1, i16 %val2,
+                 i16 %val3, i16 %val4) {
+  %cmp = icmp eq i16 %val1, %val2
+  %sel = select i1 %cmp, i16 %val3, i16 %val4
+  ret i16 %sel
+; Expected costs: 'icmp eq' on i16, then a select between i16 values on its result.
+; CHECK: fun7
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i16 %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i16 %val3, i16 %val4
+}
+
+define i32 @fun8(i16 %val1, i16 %val2,
+                 i32 %val3, i32 %val4) {
+  %cmp = icmp eq i16 %val1, %val2
+  %sel = select i1 %cmp, i32 %val3, i32 %val4
+  ret i32 %sel
+; Expected costs: 'icmp eq' on i16, then a select between i32 values on its result.
+; CHECK: fun8
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i16 %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i32 %val3, i32 %val4
+}
+
+define i64 @fun9(i16 %val1, i16 %val2,
+                 i64 %val3, i64 %val4) {
+  %cmp = icmp eq i16 %val1, %val2
+  %sel = select i1 %cmp, i64 %val3, i64 %val4
+  ret i64 %sel
+; Expected costs: 'icmp eq' on i16, then a select between i64 values on its result.
+; CHECK: fun9
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i16 %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i64 %val3, i64 %val4
+}
+
+define float @fun10(i16 %val1, i16 %val2,
+                    float %val3, float %val4) {
+  %cmp = icmp eq i16 %val1, %val2
+  %sel = select i1 %cmp, float %val3, float %val4
+  ret float %sel
+; Expected costs: 'icmp eq' on i16, then a select between float values on its result.
+; CHECK: fun10
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i16 %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select i1 %cmp, float %val3, float %val4
+}
+
+define double @fun11(i16 %val1, i16 %val2,
+                     double %val3, double %val4) {
+  %cmp = icmp eq i16 %val1, %val2
+  %sel = select i1 %cmp, double %val3, double %val4
+  ret double %sel
+; Expected costs: 'icmp eq' on i16, then a select between double values on its result.
+; CHECK: fun11
+; CHECK: cost of 3 for instruction:   %cmp = icmp eq i16 %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select i1 %cmp, double %val3, double %val4
+}
+
+define i8 @fun12(i32 %val1, i32 %val2,
+                 i8 %val3, i8 %val4) {
+  %cmp = icmp eq i32 %val1, %val2
+  %sel = select i1 %cmp, i8 %val3, i8 %val4
+  ret i8 %sel
+; Expected costs: 'icmp eq' on i32, then a select between i8 values on its result.
+; CHECK: fun12
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i8 %val3, i8 %val4
+}
+
+define i16 @fun13(i32 %val1, i32 %val2,
+                  i16 %val3, i16 %val4) {
+  %cmp = icmp eq i32 %val1, %val2
+  %sel = select i1 %cmp, i16 %val3, i16 %val4
+  ret i16 %sel
+; Expected costs: 'icmp eq' on i32, then a select between i16 values on its result.
+; CHECK: fun13
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i16 %val3, i16 %val4
+}
+
+define i32 @fun14(i32 %val1, i32 %val2,
+                  i32 %val3, i32 %val4) {
+  %cmp = icmp eq i32 %val1, %val2
+  %sel = select i1 %cmp, i32 %val3, i32 %val4
+  ret i32 %sel
+; Expected costs: 'icmp eq' on i32, then a select between i32 values on its result.
+; CHECK: fun14
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i32 %val3, i32 %val4
+}
+
+define i64 @fun15(i32 %val1, i32 %val2,
+                  i64 %val3, i64 %val4) {
+  %cmp = icmp eq i32 %val1, %val2
+  %sel = select i1 %cmp, i64 %val3, i64 %val4
+  ret i64 %sel
+; Expected costs: 'icmp eq' on i32, then a select between i64 values on its result.
+; CHECK: fun15
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i64 %val3, i64 %val4
+}
+
+define float @fun16(i32 %val1, i32 %val2,
+                    float %val3, float %val4) {
+  %cmp = icmp eq i32 %val1, %val2
+  %sel = select i1 %cmp, float %val3, float %val4
+  ret float %sel
+; Expected costs: 'icmp eq' on i32, then a select between float values on its result.
+; CHECK: fun16
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i32 %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select i1 %cmp, float %val3, float %val4
+}
+
+define double @fun17(i32 %val1, i32 %val2,
+                     double %val3, double %val4) {
+  %cmp = icmp eq i32 %val1, %val2
+  %sel = select i1 %cmp, double %val3, double %val4
+  ret double %sel
+; Expected costs: 'icmp eq' on i32, then a select between double values on its result.
+; CHECK: fun17
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i32 %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select i1 %cmp, double %val3, double %val4
+}
+
+define i8 @fun18(i64 %val1, i64 %val2,
+                 i8 %val3, i8 %val4) {
+  %cmp = icmp eq i64 %val1, %val2
+  %sel = select i1 %cmp, i8 %val3, i8 %val4
+  ret i8 %sel
+; Expected costs: 'icmp eq' on i64, then a select between i8 values on its result.
+; CHECK: fun18
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i8 %val3, i8 %val4
+}
+
+define i16 @fun19(i64 %val1, i64 %val2,
+                  i16 %val3, i16 %val4) {
+  %cmp = icmp eq i64 %val1, %val2
+  %sel = select i1 %cmp, i16 %val3, i16 %val4
+  ret i16 %sel
+; Expected costs: 'icmp eq' on i64, then a select between i16 values on its result.
+; CHECK: fun19
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i16 %val3, i16 %val4
+}
+
+define i32 @fun20(i64 %val1, i64 %val2,
+                  i32 %val3, i32 %val4) {
+  %cmp = icmp eq i64 %val1, %val2
+  %sel = select i1 %cmp, i32 %val3, i32 %val4
+  ret i32 %sel
+; Expected costs: 'icmp eq' on i64, then a select between i32 values on its result.
+; CHECK: fun20
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i32 %val3, i32 %val4
+}
+
+define i64 @fun21(i64 %val1, i64 %val2,
+                  i64 %val3, i64 %val4) {
+  %cmp = icmp eq i64 %val1, %val2
+  %sel = select i1 %cmp, i64 %val3, i64 %val4
+  ret i64 %sel
+; Expected costs: 'icmp eq' on i64, then a select between i64 values on its result.
+; CHECK: fun21
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i64 %val3, i64 %val4
+}
+
+define float @fun22(i64 %val1, i64 %val2,
+                    float %val3, float %val4) {
+  %cmp = icmp eq i64 %val1, %val2
+  %sel = select i1 %cmp, float %val3, float %val4
+  ret float %sel
+; Expected costs: 'icmp eq' on i64, then a select between float values on its result.
+; CHECK: fun22
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i64 %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select i1 %cmp, float %val3, float %val4
+}
+
+define double @fun23(i64 %val1, i64 %val2,
+                     double %val3, double %val4) {
+  %cmp = icmp eq i64 %val1, %val2
+  %sel = select i1 %cmp, double %val3, double %val4
+  ret double %sel
+; Expected costs: 'icmp eq' on i64, then a select between double values on its result.
+; CHECK: fun23
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq i64 %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select i1 %cmp, double %val3, double %val4
+}
+
+define <2 x i8> @fun24(<2 x i8> %val1, <2 x i8> %val2,
+                       <2 x i8> %val3, <2 x i8> %val4) {
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4
+  ret <2 x i8> %sel
+; Expected costs: 'icmp eq' on <2 x i8>, then a select between <2 x i8> values on its result.
+; CHECK: fun24
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i8> %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4
+}
+
+define <2 x i16> @fun25(<2 x i8> %val1, <2 x i8> %val2,
+                        <2 x i16> %val3, <2 x i16> %val4) {
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4
+  ret <2 x i16> %sel
+; Expected costs: 'icmp eq' on <2 x i8>, then a select between <2 x i16> values on its result.
+; CHECK: fun25
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i8> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4
+}
+
+define <2 x i32> @fun26(<2 x i8> %val1, <2 x i8> %val2,
+                        <2 x i32> %val3, <2 x i32> %val4) {
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4
+  ret <2 x i32> %sel
+; Expected costs: 'icmp eq' on <2 x i8>, then a select between <2 x i32> values on its result.
+; CHECK: fun26
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i8> %val1, %val2
+; CHECK: cost of 3 for instruction:   %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4
+}
+
+define <2 x i64> @fun27(<2 x i8> %val1, <2 x i8> %val2,
+                        <2 x i64> %val3, <2 x i64> %val4) {
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4
+  ret <2 x i64> %sel
+; Expected costs: 'icmp eq' on <2 x i8>, then a select between <2 x i64> values on its result.
+; CHECK: fun27
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i8> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4
+}
+
+define <2 x float> @fun28(<2 x i8> %val1, <2 x i8> %val2,
+                          <2 x float> %val3, <2 x float> %val4) {
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
+  ret <2 x float> %sel
+; Expected costs: 'icmp eq' on <2 x i8>, then a select between <2 x float> values on its result.
+; CHECK: fun28
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i8> %val1, %val2
+; CHECK: cost of 3 for instruction:   %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
+}
+
+define <2 x double> @fun29(<2 x i8> %val1, <2 x i8> %val2,
+                           <2 x double> %val3, <2 x double> %val4) {
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
+  ret <2 x double> %sel
+; Expected costs: 'icmp eq' on <2 x i8>, then a select between <2 x double> values on its result.
+; CHECK: fun29
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i8> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
+}
+
+define <2 x i8> @fun30(<2 x i16> %val1, <2 x i16> %val2,
+                       <2 x i8> %val3, <2 x i8> %val4) {
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4
+  ret <2 x i8> %sel
+; Expected costs: 'icmp eq' on <2 x i16>, then a select between <2 x i8> values on its result.
+; CHECK: fun30
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i16> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4
+}
+
+define <2 x i16> @fun31(<2 x i16> %val1, <2 x i16> %val2,
+                        <2 x i16> %val3, <2 x i16> %val4) {
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4
+  ret <2 x i16> %sel
+; Expected costs: 'icmp eq' on <2 x i16>, then a select between <2 x i16> values on its result.
+; CHECK: fun31
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i16> %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4
+}
+
+define <2 x i32> @fun32(<2 x i16> %val1, <2 x i16> %val2,
+                        <2 x i32> %val3, <2 x i32> %val4) {
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4
+  ret <2 x i32> %sel
+; Expected costs: 'icmp eq' on <2 x i16>, then a select between <2 x i32> values on its result.
+; CHECK: fun32
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i16> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4
+}
+
+define <2 x i64> @fun33(<2 x i16> %val1, <2 x i16> %val2,
+                        <2 x i64> %val3, <2 x i64> %val4) {
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4
+  ret <2 x i64> %sel
+; Expected costs: 'icmp eq' on <2 x i16>, then a select between <2 x i64> values on its result.
+; CHECK: fun33
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i16> %val1, %val2
+; CHECK: cost of 3 for instruction:   %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4
+}
+
+define <2 x float> @fun34(<2 x i16> %val1, <2 x i16> %val2,
+                          <2 x float> %val3, <2 x float> %val4) {
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
+  ret <2 x float> %sel
+; Expected costs: 'icmp eq' on <2 x i16>, then a select between <2 x float> values on its result.
+; CHECK: fun34
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i16> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
+}
+
+define <2 x double> @fun35(<2 x i16> %val1, <2 x i16> %val2,
+                           <2 x double> %val3, <2 x double> %val4) {
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
+  ret <2 x double> %sel
+; Expected costs: 'icmp eq' on <2 x i16>, then a select between <2 x double> values on its result.
+; CHECK: fun35
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i16> %val1, %val2
+; CHECK: cost of 3 for instruction:   %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
+}
+
+define <2 x i8> @fun36(<2 x i32> %val1, <2 x i32> %val2,
+                       <2 x i8> %val3, <2 x i8> %val4) {
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4
+  ret <2 x i8> %sel
+; Expected costs: 'icmp eq' on <2 x i32>, then a select between <2 x i8> values on its result.
+; CHECK: fun36
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i32> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4
+}
+
+define <2 x i16> @fun37(<2 x i32> %val1, <2 x i32> %val2,
+                        <2 x i16> %val3, <2 x i16> %val4) {
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4
+  ret <2 x i16> %sel
+; Expected costs: 'icmp eq' on <2 x i32>, then a select between <2 x i16> values on its result.
+; CHECK: fun37
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i32> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4
+}
+
+define <2 x i32> @fun38(<2 x i32> %val1, <2 x i32> %val2,
+                        <2 x i32> %val3, <2 x i32> %val4) {
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4
+  ret <2 x i32> %sel
+; Expected costs: 'icmp eq' on <2 x i32>, then a select between <2 x i32> values on its result.
+; CHECK: fun38
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i32> %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4
+}
+
+define <2 x i64> @fun39(<2 x i32> %val1, <2 x i32> %val2,
+                        <2 x i64> %val3, <2 x i64> %val4) {
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4
+  ret <2 x i64> %sel
+; Expected costs: 'icmp eq' on <2 x i32>, then a select between <2 x i64> values on its result.
+; CHECK: fun39
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i32> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4
+}
+
+define <2 x float> @fun40(<2 x i32> %val1, <2 x i32> %val2,
+                          <2 x float> %val3, <2 x float> %val4) {
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
+  ret <2 x float> %sel
+; Expected costs: 'icmp eq' on <2 x i32>, then a select between <2 x float> values on its result.
+; CHECK: fun40
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i32> %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
+}
+
+define <2 x double> @fun41(<2 x i32> %val1, <2 x i32> %val2,
+                           <2 x double> %val3, <2 x double> %val4) {
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
+  ret <2 x double> %sel
+; Expected costs: 'icmp eq' on <2 x i32>, then a select between <2 x double> values on its result.
+; CHECK: fun41
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i32> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
+}
+
+define <2 x i8> @fun42(<2 x i64> %val1, <2 x i64> %val2,
+                       <2 x i8> %val3, <2 x i8> %val4) {
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4
+  ret <2 x i8> %sel
+; Expected costs: 'icmp eq' on <2 x i64>, then a select between <2 x i8> values on its result.
+; CHECK: fun42
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4
+}
+
+define <2 x i16> @fun43(<2 x i64> %val1, <2 x i64> %val2,
+                        <2 x i16> %val3, <2 x i16> %val4) {
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4
+  ret <2 x i16> %sel
+; Expected costs: 'icmp eq' on <2 x i64>, then a select between <2 x i16> values on its result.
+; CHECK: fun43
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4
+}
+
+define <2 x i32> @fun44(<2 x i64> %val1, <2 x i64> %val2,
+                        <2 x i32> %val3, <2 x i32> %val4) {
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4
+  ret <2 x i32> %sel
+; Expected costs: 'icmp eq' on <2 x i64>, then a select between <2 x i32> values on its result.
+; CHECK: fun44
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4
+}
+
+define <2 x i64> @fun45(<2 x i64> %val1, <2 x i64> %val2,
+                        <2 x i64> %val3, <2 x i64> %val4) {
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4
+  ret <2 x i64> %sel
+; Expected costs: 'icmp eq' on <2 x i64>, then a select between <2 x i64> values on its result.
+; CHECK: fun45
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i64> %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4
+}
+
+define <2 x float> @fun46(<2 x i64> %val1, <2 x i64> %val2,
+                          <2 x float> %val3, <2 x float> %val4) {
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
+  ret <2 x float> %sel
+; Expected costs: 'icmp eq' on <2 x i64>, then a select between <2 x float> values on its result.
+; CHECK: fun46
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
+}
+
+define <2 x double> @fun47(<2 x i64> %val1, <2 x i64> %val2,
+                           <2 x double> %val3, <2 x double> %val4) {
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
+  ret <2 x double> %sel
+; Expected costs: 'icmp eq' on <2 x i64>, then a select between <2 x double> values on its result.
+; CHECK: fun47
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <2 x i64> %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
+}
+
+define <4 x i8> @fun48(<4 x i8> %val1, <4 x i8> %val2,
+                       <4 x i8> %val3, <4 x i8> %val4) {
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4
+  ret <4 x i8> %sel
+; Expected costs: 'icmp eq' on <4 x i8>, then a select between <4 x i8> values on its result.
+; CHECK: fun48
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i8> %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4
+}
+
+define <4 x i16> @fun49(<4 x i8> %val1, <4 x i8> %val2,
+                        <4 x i16> %val3, <4 x i16> %val4) {
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4
+  ret <4 x i16> %sel
+; Expected costs: 'icmp eq' on <4 x i8>, then a select between <4 x i16> values on its result.
+; CHECK: fun49
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i8> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4
+}
+
+define <4 x i32> @fun50(<4 x i8> %val1, <4 x i8> %val2,
+                        <4 x i32> %val3, <4 x i32> %val4) {
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4
+  ret <4 x i32> %sel
+; Expected costs: 'icmp eq' on <4 x i8>, then a select between <4 x i32> values on its result.
+; CHECK: fun50
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i8> %val1, %val2
+; CHECK: cost of 3 for instruction:   %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4
+}
+
+define <4 x i64> @fun51(<4 x i8> %val1, <4 x i8> %val2,
+                        <4 x i64> %val3, <4 x i64> %val4) {
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
+  ret <4 x i64> %sel
+; Expected costs: 'icmp eq' on <4 x i8>, then a select between <4 x i64> values on its result.
+; CHECK: fun51
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i8> %val1, %val2
+; CHECK: cost of 9 for instruction:   %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
+}
+
+define <4 x float> @fun52(<4 x i8> %val1, <4 x i8> %val2,
+                          <4 x float> %val3, <4 x float> %val4) {
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %sel
+; Expected costs: 'icmp eq' on <4 x i8>, then a select between <4 x float> values on its result.
+; CHECK: fun52
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i8> %val1, %val2
+; CHECK: cost of 3 for instruction:   %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+}
+
+define <4 x double> @fun53(<4 x i8> %val1, <4 x i8> %val2,
+                           <4 x double> %val3, <4 x double> %val4) {
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
+  ret <4 x double> %sel
+; Expected costs: 'icmp eq' on <4 x i8>, then a select between <4 x double> values on its result.
+; CHECK: fun53
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i8> %val1, %val2
+; CHECK: cost of 9 for instruction:   %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
+}
+
+define <4 x i8> @fun54(<4 x i16> %val1, <4 x i16> %val2,
+                       <4 x i8> %val3, <4 x i8> %val4) {
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4
+  ret <4 x i8> %sel
+; Expected costs: 'icmp eq' on <4 x i16>, then a select between <4 x i8> values on its result.
+; CHECK: fun54
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i16> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4
+}
+
+define <4 x i16> @fun55(<4 x i16> %val1, <4 x i16> %val2,
+                        <4 x i16> %val3, <4 x i16> %val4) {
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4
+  ret <4 x i16> %sel
+; Expected costs: 'icmp eq' on <4 x i16>, then a select between <4 x i16> values on its result.
+; CHECK: fun55
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i16> %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4
+}
+
+define <4 x i32> @fun56(<4 x i16> %val1, <4 x i16> %val2,
+                        <4 x i32> %val3, <4 x i32> %val4) {
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4
+  ret <4 x i32> %sel
+; Expected costs: 'icmp eq' on <4 x i16>, then a select between <4 x i32> values on its result.
+; CHECK: fun56
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i16> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4
+}
+
+define <4 x i64> @fun57(<4 x i16> %val1, <4 x i16> %val2,
+                        <4 x i64> %val3, <4 x i64> %val4) {
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
+  ret <4 x i64> %sel
+; Expected costs: 'icmp eq' on <4 x i16>, then a select between <4 x i64> values on its result.
+; CHECK: fun57
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i16> %val1, %val2
+; CHECK: cost of 7 for instruction:   %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
+}
+
+define <4 x float> @fun58(<4 x i16> %val1, <4 x i16> %val2,
+                          <4 x float> %val3, <4 x float> %val4) {
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %sel
+; Expected costs: 'icmp eq' on <4 x i16>, then a select between <4 x float> values on its result.
+; CHECK: fun58
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i16> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+}
+
+define <4 x double> @fun59(<4 x i16> %val1, <4 x i16> %val2,
+                           <4 x double> %val3, <4 x double> %val4) {
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
+  ret <4 x double> %sel
+; Expected costs: 'icmp eq' on <4 x i16>, then a select between <4 x double> values on its result.
+; CHECK: fun59
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i16> %val1, %val2
+; CHECK: cost of 7 for instruction:   %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
+}
+
+define <4 x i8> @fun60(<4 x i32> %val1, <4 x i32> %val2,
+                       <4 x i8> %val3, <4 x i8> %val4) {
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4
+  ret <4 x i8> %sel
+; Expected costs: 'icmp eq' on <4 x i32>, then a select between <4 x i8> values on its result.
+; CHECK: fun60
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i32> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4
+}
+
+define <4 x i16> @fun61(<4 x i32> %val1, <4 x i32> %val2,
+                        <4 x i16> %val3, <4 x i16> %val4) {
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4
+  ret <4 x i16> %sel
+; Expected costs: 'icmp eq' on <4 x i32>, then a select between <4 x i16> values on its result.
+; CHECK: fun61
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i32> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4
+}
+
+define <4 x i32> @fun62(<4 x i32> %val1, <4 x i32> %val2,
+                        <4 x i32> %val3, <4 x i32> %val4) {
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4
+  ret <4 x i32> %sel
+; Expected costs: 'icmp eq' on <4 x i32>, then a select between <4 x i32> values on its result.
+; CHECK: fun62
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i32> %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4
+}
+
+define <4 x i64> @fun63(<4 x i32> %val1, <4 x i32> %val2,
+                        <4 x i64> %val3, <4 x i64> %val4) {
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
+  ret <4 x i64> %sel
+; Expected costs: 'icmp eq' on <4 x i32>, then a select between <4 x i64> values on its result.
+; CHECK: fun63
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i32> %val1, %val2
+; CHECK: cost of 5 for instruction:   %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
+}
+
+define <4 x float> @fun64(<4 x i32> %val1, <4 x i32> %val2,
+                          <4 x float> %val3, <4 x float> %val4) {
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %sel
+; Expected costs: 'icmp eq' on <4 x i32>, then a select between <4 x float> values on its result.
+; CHECK: fun64
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i32> %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+}
+
+define <4 x double> @fun65(<4 x i32> %val1, <4 x i32> %val2,
+                           <4 x double> %val3, <4 x double> %val4) {
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
+  ret <4 x double> %sel
+; Expected costs: 'icmp eq' on <4 x i32>, then a select between <4 x double> values on its result.
+; CHECK: fun65
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <4 x i32> %val1, %val2
+; CHECK: cost of 5 for instruction:   %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
+}
+
+define <4 x i8> @fun66(<4 x i64> %val1, <4 x i64> %val2,
+                       <4 x i8> %val3, <4 x i8> %val4) {
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4
+  ret <4 x i8> %sel
+; Expected costs: 'icmp eq' on <4 x i64>, then a select between <4 x i8> values on its result.
+; CHECK: fun66
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <4 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4
+}
+
+define <4 x i16> @fun67(<4 x i64> %val1, <4 x i64> %val2,
+                        <4 x i16> %val3, <4 x i16> %val4) {
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4
+  ret <4 x i16> %sel
+; Expected costs: 'icmp eq' on <4 x i64>, then a select between <4 x i16> values on its result.
+; CHECK: fun67
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <4 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4
+}
+
+define <4 x i32> @fun68(<4 x i64> %val1, <4 x i64> %val2,
+                        <4 x i32> %val3, <4 x i32> %val4) {
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4
+  ret <4 x i32> %sel
+; Expected costs: 'icmp eq' on <4 x i64>, then a select between <4 x i32> values on its result.
+; CHECK: fun68
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <4 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4
+}
+
+define <4 x i64> @fun69(<4 x i64> %val1, <4 x i64> %val2,
+                        <4 x i64> %val3, <4 x i64> %val4) {
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
+  ret <4 x i64> %sel
+; Expected costs: 'icmp eq' on <4 x i64>, then a select between <4 x i64> values on its result.
+; CHECK: fun69
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <4 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
+}
+
+define <4 x float> @fun70(<4 x i64> %val1, <4 x i64> %val2,
+                          <4 x float> %val3, <4 x float> %val4) {
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %sel
+; Expected costs: 'icmp eq' on <4 x i64>, then a select between <4 x float> values on its result.
+; CHECK: fun70
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <4 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+}
+
+define <4 x double> @fun71(<4 x i64> %val1, <4 x i64> %val2,
+                           <4 x double> %val3, <4 x double> %val4) {
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
+  ret <4 x double> %sel
+; Expected costs: 'icmp eq' on <4 x i64>, then a select between <4 x double> values on its result.
+; CHECK: fun71
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <4 x i64> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
+}
+
+define <8 x i8> @fun72(<8 x i8> %val1, <8 x i8> %val2,
+                       <8 x i8> %val3, <8 x i8> %val4) {
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4
+  ret <8 x i8> %sel
+; Expected costs: 'icmp eq' on <8 x i8>, then a select between <8 x i8> values on its result.
+; CHECK: fun72
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i8> %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4
+}
+
+define <8 x i16> @fun73(<8 x i8> %val1, <8 x i8> %val2,
+                        <8 x i16> %val3, <8 x i16> %val4) {
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4
+  ret <8 x i16> %sel
+; Expected costs: 'icmp eq' on <8 x i8>, then a select between <8 x i16> values on its result.
+; CHECK: fun73
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i8> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4
+}
+
+define <8 x i32> @fun74(<8 x i8> %val1, <8 x i8> %val2,
+                        <8 x i32> %val3, <8 x i32> %val4) {
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
+  ret <8 x i32> %sel
+; Expected costs: 'icmp eq' on <8 x i8>, then a select between <8 x i32> values on its result.
+; CHECK: fun74
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i8> %val1, %val2
+; CHECK: cost of 7 for instruction:   %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
+}
+
+define <8 x i64> @fun75(<8 x i8> %val1, <8 x i8> %val2,
+                        <8 x i64> %val3, <8 x i64> %val4) {
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4
+  ret <8 x i64> %sel
+; fun75: icmp eq on <8 x i8> feeds a select between <8 x i64> values; the check lines below expect cost 1 for the compare and cost 19 for the select.
+; CHECK: fun75
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i8> %val1, %val2
+; CHECK: cost of 19 for instruction:   %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4
+}
+
+define <8 x float> @fun76(<8 x i8> %val1, <8 x i8> %val2,
+                          <8 x float> %val3, <8 x float> %val4) {
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
+  ret <8 x float> %sel
+; fun76: icmp eq on <8 x i8> feeds a select between <8 x float> values; the check lines below expect cost 1 for the compare and cost 7 for the select.
+; CHECK: fun76
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i8> %val1, %val2
+; CHECK: cost of 7 for instruction:   %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
+}
+
+define <8 x double> @fun77(<8 x i8> %val1, <8 x i8> %val2,
+                           <8 x double> %val3, <8 x double> %val4) {
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4
+  ret <8 x double> %sel
+; fun77: icmp eq on <8 x i8> feeds a select between <8 x double> values; the check lines below expect cost 1 for the compare and cost 19 for the select.
+; CHECK: fun77
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i8> %val1, %val2
+; CHECK: cost of 19 for instruction:   %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4
+}
+
+define <8 x i8> @fun78(<8 x i16> %val1, <8 x i16> %val2,
+                       <8 x i8> %val3, <8 x i8> %val4) {
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4
+  ret <8 x i8> %sel
+; fun78: icmp eq on <8 x i16> feeds a select between <8 x i8> values; the check lines below expect cost 1 for the compare and cost 2 for the select.
+; CHECK: fun78
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i16> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4
+}
+
+define <8 x i16> @fun79(<8 x i16> %val1, <8 x i16> %val2,
+                        <8 x i16> %val3, <8 x i16> %val4) {
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4
+  ret <8 x i16> %sel
+; fun79: icmp eq on <8 x i16> feeds a select between <8 x i16> values; the check lines below expect cost 1 for the compare and cost 1 for the select.
+; CHECK: fun79
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i16> %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4
+}
+
+define <8 x i32> @fun80(<8 x i16> %val1, <8 x i16> %val2,
+                        <8 x i32> %val3, <8 x i32> %val4) {
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
+  ret <8 x i32> %sel
+; fun80: icmp eq on <8 x i16> feeds a select between <8 x i32> values; the check lines below expect cost 1 for the compare and cost 5 for the select.
+; CHECK: fun80
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i16> %val1, %val2
+; CHECK: cost of 5 for instruction:   %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
+}
+
+define <8 x i64> @fun81(<8 x i16> %val1, <8 x i16> %val2,
+                        <8 x i64> %val3, <8 x i64> %val4) {
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4
+  ret <8 x i64> %sel
+; fun81: icmp eq on <8 x i16> feeds a select between <8 x i64> values; the check lines below expect cost 1 for the compare and cost 15 for the select.
+; CHECK: fun81
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i16> %val1, %val2
+; CHECK: cost of 15 for instruction:   %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4
+}
+
+define <8 x float> @fun82(<8 x i16> %val1, <8 x i16> %val2,
+                          <8 x float> %val3, <8 x float> %val4) {
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
+  ret <8 x float> %sel
+; fun82: icmp eq on <8 x i16> feeds a select between <8 x float> values; the check lines below expect cost 1 for the compare and cost 5 for the select.
+; CHECK: fun82
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i16> %val1, %val2
+; CHECK: cost of 5 for instruction:   %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
+}
+
+define <8 x double> @fun83(<8 x i16> %val1, <8 x i16> %val2,
+                           <8 x double> %val3, <8 x double> %val4) {
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4
+  ret <8 x double> %sel
+; fun83: icmp eq on <8 x i16> feeds a select between <8 x double> values; the check lines below expect cost 1 for the compare and cost 15 for the select.
+; CHECK: fun83
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <8 x i16> %val1, %val2
+; CHECK: cost of 15 for instruction:   %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4
+}
+
+define <8 x i8> @fun84(<8 x i32> %val1, <8 x i32> %val2,
+                       <8 x i8> %val3, <8 x i8> %val4) {
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4
+  ret <8 x i8> %sel
+; fun84: icmp eq on <8 x i32> feeds a select between <8 x i8> values; the check lines below expect cost 2 for the compare and cost 2 for the select.
+; CHECK: fun84
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <8 x i32> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4
+}
+
+define <8 x i16> @fun85(<8 x i32> %val1, <8 x i32> %val2,
+                        <8 x i16> %val3, <8 x i16> %val4) {
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4
+  ret <8 x i16> %sel
+; fun85: icmp eq on <8 x i32> feeds a select between <8 x i16> values; the check lines below expect cost 2 for the compare and cost 2 for the select.
+; CHECK: fun85
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <8 x i32> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4
+}
+
+define <8 x i32> @fun86(<8 x i32> %val1, <8 x i32> %val2,
+                        <8 x i32> %val3, <8 x i32> %val4) {
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
+  ret <8 x i32> %sel
+; fun86: icmp eq on <8 x i32> feeds a select between <8 x i32> values; the check lines below expect cost 2 for the compare and cost 2 for the select.
+; CHECK: fun86
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <8 x i32> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
+}
+
+define <8 x i64> @fun87(<8 x i32> %val1, <8 x i32> %val2,
+                        <8 x i64> %val3, <8 x i64> %val4) {
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4
+  ret <8 x i64> %sel
+; fun87: icmp eq on <8 x i32> feeds a select between <8 x i64> values; the check lines below expect cost 2 for the compare and cost 11 for the select.
+; CHECK: fun87
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <8 x i32> %val1, %val2
+; CHECK: cost of 11 for instruction:   %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4
+}
+
+define <8 x float> @fun88(<8 x i32> %val1, <8 x i32> %val2,
+                          <8 x float> %val3, <8 x float> %val4) {
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
+  ret <8 x float> %sel
+; fun88: icmp eq on <8 x i32> feeds a select between <8 x float> values; the check lines below expect cost 2 for the compare and cost 2 for the select.
+; CHECK: fun88
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <8 x i32> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
+}
+
+define <8 x double> @fun89(<8 x i32> %val1, <8 x i32> %val2,
+                           <8 x double> %val3, <8 x double> %val4) {
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4
+  ret <8 x double> %sel
+; fun89: icmp eq on <8 x i32> feeds a select between <8 x double> values; the check lines below expect cost 2 for the compare and cost 11 for the select.
+; CHECK: fun89
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <8 x i32> %val1, %val2
+; CHECK: cost of 11 for instruction:   %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4
+}
+
+define <8 x i8> @fun90(<8 x i64> %val1, <8 x i64> %val2,
+                       <8 x i8> %val3, <8 x i8> %val4) {
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4
+  ret <8 x i8> %sel
+; fun90: icmp eq on <8 x i64> feeds a select between <8 x i8> values; the check lines below expect cost 4 for the compare and cost 4 for the select.
+; CHECK: fun90
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <8 x i64> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4
+}
+
+define <8 x i16> @fun91(<8 x i64> %val1, <8 x i64> %val2,
+                        <8 x i16> %val3, <8 x i16> %val4) {
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4
+  ret <8 x i16> %sel
+; fun91: icmp eq on <8 x i64> feeds a select between <8 x i16> values; the check lines below expect cost 4 for the compare and cost 4 for the select.
+; CHECK: fun91
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <8 x i64> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4
+}
+
+define <8 x i32> @fun92(<8 x i64> %val1, <8 x i64> %val2,
+                        <8 x i32> %val3, <8 x i32> %val4) {
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
+  ret <8 x i32> %sel
+; fun92: icmp eq on <8 x i64> feeds a select between <8 x i32> values; the check lines below expect cost 4 for the compare and cost 4 for the select.
+; CHECK: fun92
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <8 x i64> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
+}
+
+define <8 x i64> @fun93(<8 x i64> %val1, <8 x i64> %val2,
+                        <8 x i64> %val3, <8 x i64> %val4) {
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4
+  ret <8 x i64> %sel
+; fun93: icmp eq on <8 x i64> feeds a select between <8 x i64> values; the check lines below expect cost 4 for the compare and cost 4 for the select.
+; CHECK: fun93
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <8 x i64> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4
+}
+
+define <8 x float> @fun94(<8 x i64> %val1, <8 x i64> %val2,
+                          <8 x float> %val3, <8 x float> %val4) {
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
+  ret <8 x float> %sel
+; fun94: icmp eq on <8 x i64> feeds a select between <8 x float> values; the check lines below expect cost 4 for the compare and cost 4 for the select.
+; CHECK: fun94
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <8 x i64> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
+}
+
+define <8 x double> @fun95(<8 x i64> %val1, <8 x i64> %val2,
+                           <8 x double> %val3, <8 x double> %val4) {
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4
+  ret <8 x double> %sel
+; fun95: icmp eq on <8 x i64> feeds a select between <8 x double> values; the check lines below expect cost 4 for the compare and cost 4 for the select.
+; CHECK: fun95
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <8 x i64> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4
+}
+
+define <16 x i8> @fun96(<16 x i8> %val1, <16 x i8> %val2,
+                        <16 x i8> %val3, <16 x i8> %val4) {
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4
+  ret <16 x i8> %sel
+; fun96: icmp eq on <16 x i8> feeds a select between <16 x i8> values; the check lines below expect cost 1 for the compare and cost 1 for the select.
+; CHECK: fun96
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <16 x i8> %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4
+}
+
+define <16 x i16> @fun97(<16 x i8> %val1, <16 x i8> %val2,
+                         <16 x i16> %val3, <16 x i16> %val4) {
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
+  ret <16 x i16> %sel
+; fun97: icmp eq on <16 x i8> feeds a select between <16 x i16> values; the check lines below expect cost 1 for the compare and cost 5 for the select.
+; CHECK: fun97
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <16 x i8> %val1, %val2
+; CHECK: cost of 5 for instruction:   %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
+}
+
+define <16 x i32> @fun98(<16 x i8> %val1, <16 x i8> %val2,
+                         <16 x i32> %val3, <16 x i32> %val4) {
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4
+  ret <16 x i32> %sel
+; fun98: icmp eq on <16 x i8> feeds a select between <16 x i32> values; the check lines below expect cost 1 for the compare and cost 15 for the select.
+; CHECK: fun98
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <16 x i8> %val1, %val2
+; CHECK: cost of 15 for instruction:   %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4
+}
+
+define <16 x i64> @fun99(<16 x i8> %val1, <16 x i8> %val2,
+                         <16 x i64> %val3, <16 x i64> %val4) {
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4
+  ret <16 x i64> %sel
+; fun99: icmp eq on <16 x i8> feeds a select between <16 x i64> values; the check lines below expect cost 1 for the compare and cost 39 for the select.
+; CHECK: fun99
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <16 x i8> %val1, %val2
+; CHECK: cost of 39 for instruction:   %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4
+}
+
+define <16 x float> @fun100(<16 x i8> %val1, <16 x i8> %val2,
+                            <16 x float> %val3, <16 x float> %val4) {
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4
+  ret <16 x float> %sel
+; fun100: icmp eq on <16 x i8> feeds a select between <16 x float> values; the check lines below expect cost 1 for the compare and cost 15 for the select.
+; CHECK: fun100
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <16 x i8> %val1, %val2
+; CHECK: cost of 15 for instruction:   %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4
+}
+
+define <16 x double> @fun101(<16 x i8> %val1, <16 x i8> %val2,
+                             <16 x double> %val3, <16 x double> %val4) {
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4
+  ret <16 x double> %sel
+; fun101: icmp eq on <16 x i8> feeds a select between <16 x double> values; the check lines below expect cost 1 for the compare and cost 39 for the select.
+; CHECK: fun101
+; CHECK: cost of 1 for instruction:   %cmp = icmp eq <16 x i8> %val1, %val2
+; CHECK: cost of 39 for instruction:   %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4
+}
+
+define <16 x i8> @fun102(<16 x i16> %val1, <16 x i16> %val2,
+                         <16 x i8> %val3, <16 x i8> %val4) {
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4
+  ret <16 x i8> %sel
+; fun102: icmp eq on <16 x i16> feeds a select between <16 x i8> values; the check lines below expect cost 2 for the compare and cost 2 for the select.
+; CHECK: fun102
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <16 x i16> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4
+}
+
+define <16 x i16> @fun103(<16 x i16> %val1, <16 x i16> %val2,
+                          <16 x i16> %val3, <16 x i16> %val4) {
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
+  ret <16 x i16> %sel
+; fun103: icmp eq on <16 x i16> feeds a select between <16 x i16> values; the check lines below expect cost 2 for the compare and cost 2 for the select.
+; CHECK: fun103
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <16 x i16> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
+}
+
+define <16 x i32> @fun104(<16 x i16> %val1, <16 x i16> %val2,
+                          <16 x i32> %val3, <16 x i32> %val4) {
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4
+  ret <16 x i32> %sel
+; fun104: icmp eq on <16 x i16> feeds a select between <16 x i32> values; the check lines below expect cost 2 for the compare and cost 11 for the select.
+; CHECK: fun104
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <16 x i16> %val1, %val2
+; CHECK: cost of 11 for instruction:   %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4
+}
+
+define <16 x i64> @fun105(<16 x i16> %val1, <16 x i16> %val2,
+                          <16 x i64> %val3, <16 x i64> %val4) {
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4
+  ret <16 x i64> %sel
+; fun105: icmp eq on <16 x i16> feeds a select between <16 x i64> values; the check lines below expect cost 2 for the compare and cost 31 for the select.
+; CHECK: fun105
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <16 x i16> %val1, %val2
+; CHECK: cost of 31 for instruction:   %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4
+}
+
+define <16 x float> @fun106(<16 x i16> %val1, <16 x i16> %val2,
+                            <16 x float> %val3, <16 x float> %val4) {
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4
+  ret <16 x float> %sel
+; fun106: icmp eq on <16 x i16> feeds a select between <16 x float> values; the check lines below expect cost 2 for the compare and cost 11 for the select.
+; CHECK: fun106
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <16 x i16> %val1, %val2
+; CHECK: cost of 11 for instruction:   %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4
+}
+
+define <16 x double> @fun107(<16 x i16> %val1, <16 x i16> %val2,
+                             <16 x double> %val3, <16 x double> %val4) {
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4
+  ret <16 x double> %sel
+; fun107: icmp eq on <16 x i16> feeds a select between <16 x double> values; the check lines below expect cost 2 for the compare and cost 31 for the select.
+; CHECK: fun107
+; CHECK: cost of 2 for instruction:   %cmp = icmp eq <16 x i16> %val1, %val2
+; CHECK: cost of 31 for instruction:   %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4
+}
+
+define <16 x i8> @fun108(<16 x i32> %val1, <16 x i32> %val2,
+                         <16 x i8> %val3, <16 x i8> %val4) {
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4
+  ret <16 x i8> %sel
+; fun108: icmp eq on <16 x i32> feeds a select between <16 x i8> values; the check lines below expect cost 4 for the compare and cost 4 for the select.
+; CHECK: fun108
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <16 x i32> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4
+}
+
+define <16 x i16> @fun109(<16 x i32> %val1, <16 x i32> %val2,
+                          <16 x i16> %val3, <16 x i16> %val4) {
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
+  ret <16 x i16> %sel
+; fun109: icmp eq on <16 x i32> feeds a select between <16 x i16> values; the check lines below expect cost 4 for the compare and cost 4 for the select.
+; CHECK: fun109
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <16 x i32> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
+}
+
+define <16 x i32> @fun110(<16 x i32> %val1, <16 x i32> %val2,
+                          <16 x i32> %val3, <16 x i32> %val4) {
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4
+  ret <16 x i32> %sel
+; fun110: icmp eq on <16 x i32> feeds a select between <16 x i32> values; the check lines below expect cost 4 for the compare and cost 4 for the select.
+; CHECK: fun110
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <16 x i32> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4
+}
+
+define <16 x i64> @fun111(<16 x i32> %val1, <16 x i32> %val2,
+                          <16 x i64> %val3, <16 x i64> %val4) {
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4
+  ret <16 x i64> %sel
+; fun111: icmp eq on <16 x i32> feeds a select between <16 x i64> values; the check lines below expect cost 4 for the compare and cost 23 for the select.
+; CHECK: fun111
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <16 x i32> %val1, %val2
+; CHECK: cost of 23 for instruction:   %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4
+}
+
+define <16 x float> @fun112(<16 x i32> %val1, <16 x i32> %val2,
+                            <16 x float> %val3, <16 x float> %val4) {
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4
+  ret <16 x float> %sel
+; fun112: icmp eq on <16 x i32> feeds a select between <16 x float> values; the check lines below expect cost 4 for the compare and cost 4 for the select.
+; CHECK: fun112
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <16 x i32> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4
+}
+
+define <16 x double> @fun113(<16 x i32> %val1, <16 x i32> %val2,
+                             <16 x double> %val3, <16 x double> %val4) {
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4
+  ret <16 x double> %sel
+; fun113: icmp eq on <16 x i32> feeds a select between <16 x double> values; the check lines below expect cost 4 for the compare and cost 23 for the select.
+; CHECK: fun113
+; CHECK: cost of 4 for instruction:   %cmp = icmp eq <16 x i32> %val1, %val2
+; CHECK: cost of 23 for instruction:   %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4
+}
+
+define <16 x i8> @fun114(<16 x i64> %val1, <16 x i64> %val2,
+                         <16 x i8> %val3, <16 x i8> %val4) {
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4
+  ret <16 x i8> %sel
+; fun114: icmp eq on <16 x i64> feeds a select between <16 x i8> values; the check lines below expect cost 8 for the compare and cost 8 for the select.
+; CHECK: fun114
+; CHECK: cost of 8 for instruction:   %cmp = icmp eq <16 x i64> %val1, %val2
+; CHECK: cost of 8 for instruction:   %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4
+}
+
+define <16 x i16> @fun115(<16 x i64> %val1, <16 x i64> %val2,
+                          <16 x i16> %val3, <16 x i16> %val4) {
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
+  ret <16 x i16> %sel
+; fun115: icmp eq on <16 x i64> feeds a select between <16 x i16> values; the check lines below expect cost 8 for the compare and cost 8 for the select.
+; CHECK: fun115
+; CHECK: cost of 8 for instruction:   %cmp = icmp eq <16 x i64> %val1, %val2
+; CHECK: cost of 8 for instruction:   %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
+}
+
+define <16 x i32> @fun116(<16 x i64> %val1, <16 x i64> %val2,
+                          <16 x i32> %val3, <16 x i32> %val4) {
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4
+  ret <16 x i32> %sel
+; fun116: icmp eq on <16 x i64> feeds a select between <16 x i32> values; the check lines below expect cost 8 for the compare and cost 8 for the select.
+; CHECK: fun116
+; CHECK: cost of 8 for instruction:   %cmp = icmp eq <16 x i64> %val1, %val2
+; CHECK: cost of 8 for instruction:   %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4
+}
+
+define <16 x i64> @fun117(<16 x i64> %val1, <16 x i64> %val2,
+                          <16 x i64> %val3, <16 x i64> %val4) {
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4
+  ret <16 x i64> %sel
+; fun117: icmp eq on <16 x i64> feeds a select between <16 x i64> values; the check lines below expect cost 8 for the compare and cost 8 for the select.
+; CHECK: fun117
+; CHECK: cost of 8 for instruction:   %cmp = icmp eq <16 x i64> %val1, %val2
+; CHECK: cost of 8 for instruction:   %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4
+}
+
+define <16 x float> @fun118(<16 x i64> %val1, <16 x i64> %val2,
+                            <16 x float> %val3, <16 x float> %val4) {
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4
+  ret <16 x float> %sel
+; fun118: icmp eq on <16 x i64> feeds a select between <16 x float> values; the check lines below expect cost 8 for the compare and cost 8 for the select.
+; CHECK: fun118
+; CHECK: cost of 8 for instruction:   %cmp = icmp eq <16 x i64> %val1, %val2
+; CHECK: cost of 8 for instruction:   %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4
+}
+
+define <16 x double> @fun119(<16 x i64> %val1, <16 x i64> %val2,
+                             <16 x double> %val3, <16 x double> %val4) {
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4
+  ret <16 x double> %sel
+; fun119: icmp eq on <16 x i64> feeds a select between <16 x double> values; the check lines below expect cost 8 for the compare and cost 8 for the select.
+; CHECK: fun119
+; CHECK: cost of 8 for instruction:   %cmp = icmp eq <16 x i64> %val1, %val2
+; CHECK: cost of 8 for instruction:   %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4
+}
+
+define i8 @fun120(float %val1, float %val2,
+                  i8 %val3, i8 %val4) {
+  %cmp = fcmp ogt float %val1, %val2
+  %sel = select i1 %cmp, i8 %val3, i8 %val4
+  ret i8 %sel
+; fun120: scalar fcmp ogt on float feeds a select between i8 values; the check lines below expect cost 1 for the compare and cost 1 for the select.
+; CHECK: fun120
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i8 %val3, i8 %val4
+}
+
+define i16 @fun121(float %val1, float %val2,
+                   i16 %val3, i16 %val4) {
+  %cmp = fcmp ogt float %val1, %val2
+  %sel = select i1 %cmp, i16 %val3, i16 %val4
+  ret i16 %sel
+; fun121: scalar fcmp ogt on float feeds a select between i16 values; the check lines below expect cost 1 for the compare and cost 1 for the select.
+; CHECK: fun121
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i16 %val3, i16 %val4
+}
+
+define i32 @fun122(float %val1, float %val2,
+                   i32 %val3, i32 %val4) {
+  %cmp = fcmp ogt float %val1, %val2
+  %sel = select i1 %cmp, i32 %val3, i32 %val4
+  ret i32 %sel
+; fun122: scalar fcmp ogt on float feeds a select between i32 values; the check lines below expect cost 1 for the compare and cost 1 for the select.
+; CHECK: fun122
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i32 %val3, i32 %val4
+}
+
+define i64 @fun123(float %val1, float %val2,
+                   i64 %val3, i64 %val4) {
+  %cmp = fcmp ogt float %val1, %val2
+  %sel = select i1 %cmp, i64 %val3, i64 %val4
+  ret i64 %sel
+; fun123: scalar fcmp ogt on float feeds a select between i64 values; the check lines below expect cost 1 for the compare and cost 1 for the select.
+; CHECK: fun123
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i64 %val3, i64 %val4
+}
+
+define float @fun124(float %val1, float %val2,
+                     float %val3, float %val4) {
+  %cmp = fcmp ogt float %val1, %val2
+  %sel = select i1 %cmp, float %val3, float %val4
+  ret float %sel
+; fun124: scalar fcmp ogt on float feeds a select between float values; the check lines below expect cost 1 for the compare and cost 4 for the select.
+; CHECK: fun124
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt float %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select i1 %cmp, float %val3, float %val4
+}
+
+define double @fun125(float %val1, float %val2,
+                      double %val3, double %val4) {
+  %cmp = fcmp ogt float %val1, %val2
+  %sel = select i1 %cmp, double %val3, double %val4
+  ret double %sel
+; fun125: scalar fcmp ogt on float feeds a select between double values; the check lines below expect cost 1 for the compare and cost 4 for the select.
+; CHECK: fun125
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt float %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select i1 %cmp, double %val3, double %val4
+}
+
+define i8 @fun126(double %val1, double %val2,
+                  i8 %val3, i8 %val4) {
+  %cmp = fcmp ogt double %val1, %val2
+  %sel = select i1 %cmp, i8 %val3, i8 %val4
+  ret i8 %sel
+; fun126: scalar fcmp ogt on double feeds a select between i8 values; the check lines below expect cost 1 for the compare and cost 1 for the select.
+; CHECK: fun126
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i8 %val3, i8 %val4
+}
+
+define i16 @fun127(double %val1, double %val2,
+                   i16 %val3, i16 %val4) {
+  %cmp = fcmp ogt double %val1, %val2
+  %sel = select i1 %cmp, i16 %val3, i16 %val4
+  ret i16 %sel
+; fun127: scalar fcmp ogt on double feeds a select between i16 values; the check lines below expect cost 1 for the compare and cost 1 for the select.
+; CHECK: fun127
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i16 %val3, i16 %val4
+}
+
+define i32 @fun128(double %val1, double %val2,
+                   i32 %val3, i32 %val4) {
+  %cmp = fcmp ogt double %val1, %val2
+  %sel = select i1 %cmp, i32 %val3, i32 %val4
+  ret i32 %sel
+; fun128: scalar fcmp ogt on double feeds a select between i32 values; the check lines below expect cost 1 for the compare and cost 1 for the select.
+; CHECK: fun128
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i32 %val3, i32 %val4
+}
+
+define i64 @fun129(double %val1, double %val2,
+                   i64 %val3, i64 %val4) {
+  %cmp = fcmp ogt double %val1, %val2
+  %sel = select i1 %cmp, i64 %val3, i64 %val4
+  ret i64 %sel
+; fun129: scalar fcmp ogt on double feeds a select between i64 values; the check lines below expect cost 1 for the compare and cost 1 for the select.
+; CHECK: fun129
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select i1 %cmp, i64 %val3, i64 %val4
+}
+
+define float @fun130(double %val1, double %val2,
+                     float %val3, float %val4) {
+  %cmp = fcmp ogt double %val1, %val2
+  %sel = select i1 %cmp, float %val3, float %val4
+  ret float %sel
+; fun130: scalar fcmp ogt on double feeds a select between float values; the check lines below expect cost 1 for the compare and cost 4 for the select.
+; CHECK: fun130
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt double %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select i1 %cmp, float %val3, float %val4
+}
+
+define double @fun131(double %val1, double %val2,
+                      double %val3, double %val4) {
+  %cmp = fcmp ogt double %val1, %val2
+  %sel = select i1 %cmp, double %val3, double %val4
+  ret double %sel
+; fun131: scalar fcmp ogt on double feeds a select between double values; the check lines below expect cost 1 for the compare and cost 4 for the select.
+; CHECK: fun131
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt double %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select i1 %cmp, double %val3, double %val4
+}
+
+define <2 x i8> @fun132(<2 x float> %val1, <2 x float> %val2,
+                        <2 x i8> %val3, <2 x i8> %val4) {
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4
+  ret <2 x i8> %sel
+; fun132: fcmp ogt on <2 x float> feeds a select between <2 x i8> values; the check lines below expect cost 10 for the compare and cost 2 for the select.
+; CHECK: fun132
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <2 x float> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4
+}
+
+define <2 x i16> @fun133(<2 x float> %val1, <2 x float> %val2,
+                         <2 x i16> %val3, <2 x i16> %val4) {
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4
+  ret <2 x i16> %sel
+; fun133: fcmp ogt on <2 x float> feeds a select between <2 x i16> values; the check lines below expect cost 10 for the compare and cost 2 for the select.
+; CHECK: fun133
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <2 x float> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4
+}
+
+define <2 x i32> @fun134(<2 x float> %val1, <2 x float> %val2,
+                         <2 x i32> %val3, <2 x i32> %val4) {
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4
+  ret <2 x i32> %sel
+; fun134: fcmp ogt on <2 x float> feeds a select between <2 x i32> values; the check lines below expect cost 10 for the compare and cost 1 for the select.
+; CHECK: fun134
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <2 x float> %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4
+}
+
+define <2 x i64> @fun135(<2 x float> %val1, <2 x float> %val2,
+                         <2 x i64> %val3, <2 x i64> %val4) {
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4
+  ret <2 x i64> %sel
+; fun135: fcmp ogt on <2 x float> feeds a select between <2 x i64> values; the check lines below expect cost 10 for the compare and cost 2 for the select.
+; CHECK: fun135
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <2 x float> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4
+}
+
+define <2 x float> @fun136(<2 x float> %val1, <2 x float> %val2,
+                           <2 x float> %val3, <2 x float> %val4) {
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
+  ret <2 x float> %sel
+; fun136: fcmp ogt on <2 x float> feeds a select between <2 x float> values; the check lines below expect cost 10 for the compare and cost 1 for the select.
+; CHECK: fun136
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <2 x float> %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
+}
+
+define <2 x double> @fun137(<2 x float> %val1, <2 x float> %val2,
+                            <2 x double> %val3, <2 x double> %val4) {
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
+  ret <2 x double> %sel
+; fun137: fcmp ogt on <2 x float> feeds a select between <2 x double> values; the check lines below expect cost 10 for the compare and cost 2 for the select.
+; CHECK: fun137
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <2 x float> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
+}
+
+define <2 x i8> @fun138(<2 x double> %val1, <2 x double> %val2,
+                        <2 x i8> %val3, <2 x i8> %val4) {
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4
+  ret <2 x i8> %sel
+; fun138: fcmp ogt on <2 x double> feeds a select between <2 x i8> values; the check lines below expect cost 1 for the compare and cost 2 for the select.
+; CHECK: fun138
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt <2 x double> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4
+}
+
+define <2 x i16> @fun139(<2 x double> %val1, <2 x double> %val2,
+                         <2 x i16> %val3, <2 x i16> %val4) {
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4
+  ret <2 x i16> %sel
+; fun139: fcmp ogt on <2 x double> feeds a select between <2 x i16> values; the check lines below expect cost 1 for the compare and cost 2 for the select.
+; CHECK: fun139
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt <2 x double> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4
+}
+
+define <2 x i32> @fun140(<2 x double> %val1, <2 x double> %val2,
+                         <2 x i32> %val3, <2 x i32> %val4) {
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4
+  ret <2 x i32> %sel
+; fun140: fcmp ogt on <2 x double> feeds a select between <2 x i32> values; the check lines below expect cost 1 for the compare and cost 2 for the select.
+; CHECK: fun140
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt <2 x double> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4
+}
+
+define <2 x i64> @fun141(<2 x double> %val1, <2 x double> %val2,
+                         <2 x i64> %val3, <2 x i64> %val4) {
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4
+  ret <2 x i64> %sel
+; fun141: fcmp ogt on <2 x double> feeds a select between <2 x i64> values; the check lines below expect cost 1 for the compare and cost 1 for the select.
+; CHECK: fun141
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt <2 x double> %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4
+}
+
+define <2 x float> @fun142(<2 x double> %val1, <2 x double> %val2,
+                           <2 x float> %val3, <2 x float> %val4) {
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
+  ret <2 x float> %sel
+; fun142: fcmp ogt on <2 x double> feeds a select between <2 x float> values; the check lines below expect cost 1 for the compare and cost 2 for the select.
+; CHECK: fun142
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt <2 x double> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
+}
+
+define <2 x double> @fun143(<2 x double> %val1, <2 x double> %val2,
+                            <2 x double> %val3, <2 x double> %val4) {
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
+  ret <2 x double> %sel
+; fun143: fcmp ogt on <2 x double> feeds a select between <2 x double> values; the check lines below expect cost 1 for the compare and cost 1 for the select.
+; CHECK: fun143
+; CHECK: cost of 1 for instruction:   %cmp = fcmp ogt <2 x double> %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
+}
+
+define <4 x i8> @fun144(<4 x float> %val1, <4 x float> %val2,
+                        <4 x i8> %val3, <4 x i8> %val4) {
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4
+  ret <4 x i8> %sel
+; fun144: fcmp ogt on <4 x float> feeds a select between <4 x i8> values; the check lines below expect cost 10 for the compare and cost 2 for the select.
+; CHECK: fun144
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <4 x float> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4
+}
+
+define <4 x i16> @fun145(<4 x float> %val1, <4 x float> %val2,
+                         <4 x i16> %val3, <4 x i16> %val4) {
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4
+  ret <4 x i16> %sel
+; fun145: fcmp ogt on <4 x float> feeds a select between <4 x i16> values; the check lines below expect cost 10 for the compare and cost 2 for the select.
+; CHECK: fun145
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <4 x float> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4
+}
+
+define <4 x i32> @fun146(<4 x float> %val1, <4 x float> %val2,
+                         <4 x i32> %val3, <4 x i32> %val4) {
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4
+  ret <4 x i32> %sel
+; fcmp ogt on <4 x float> drives a select between <4 x i32> values; expected costs follow.
+; CHECK: fun146
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <4 x float> %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4
+}
+
+define <4 x i64> @fun147(<4 x float> %val1, <4 x float> %val2,
+                         <4 x i64> %val3, <4 x i64> %val4) {
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
+  ret <4 x i64> %sel
+; fcmp ogt on <4 x float> drives a select between <4 x i64> values; expected costs follow.
+; CHECK: fun147
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <4 x float> %val1, %val2
+; CHECK: cost of 5 for instruction:   %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
+}
+
+define <4 x float> @fun148(<4 x float> %val1, <4 x float> %val2,
+                           <4 x float> %val3, <4 x float> %val4) {
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %sel
+; fcmp ogt on <4 x float> drives a select between <4 x float> values; expected costs follow.
+; CHECK: fun148
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <4 x float> %val1, %val2
+; CHECK: cost of 1 for instruction:   %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+}
+
+define <4 x double> @fun149(<4 x float> %val1, <4 x float> %val2,
+                            <4 x double> %val3, <4 x double> %val4) {
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
+  ret <4 x double> %sel
+; fcmp ogt on <4 x float> drives a select between <4 x double> values; expected costs follow.
+; CHECK: fun149
+; CHECK: cost of 10 for instruction:   %cmp = fcmp ogt <4 x float> %val1, %val2
+; CHECK: cost of 5 for instruction:   %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
+}
+
+define <4 x i8> @fun150(<4 x double> %val1, <4 x double> %val2,
+                        <4 x i8> %val3, <4 x i8> %val4) {
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4
+  ret <4 x i8> %sel
+; fcmp ogt on <4 x double> drives a select between <4 x i8> values; expected costs follow.
+; CHECK: fun150
+; CHECK: cost of 2 for instruction:   %cmp = fcmp ogt <4 x double> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4
+}
+
+define <4 x i16> @fun151(<4 x double> %val1, <4 x double> %val2,
+                         <4 x i16> %val3, <4 x i16> %val4) {
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4
+  ret <4 x i16> %sel
+; fcmp ogt on <4 x double> drives a select between <4 x i16> values; expected costs follow.
+; CHECK: fun151
+; CHECK: cost of 2 for instruction:   %cmp = fcmp ogt <4 x double> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4
+}
+
+define <4 x i32> @fun152(<4 x double> %val1, <4 x double> %val2,
+                         <4 x i32> %val3, <4 x i32> %val4) {
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4
+  ret <4 x i32> %sel
+; fcmp ogt on <4 x double> drives a select between <4 x i32> values; expected costs follow.
+; CHECK: fun152
+; CHECK: cost of 2 for instruction:   %cmp = fcmp ogt <4 x double> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4
+}
+
+define <4 x i64> @fun153(<4 x double> %val1, <4 x double> %val2,
+                         <4 x i64> %val3, <4 x i64> %val4) {
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
+  ret <4 x i64> %sel
+; fcmp ogt on <4 x double> drives a select between <4 x i64> values; expected costs follow.
+; CHECK: fun153
+; CHECK: cost of 2 for instruction:   %cmp = fcmp ogt <4 x double> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
+}
+
+define <4 x float> @fun154(<4 x double> %val1, <4 x double> %val2,
+                           <4 x float> %val3, <4 x float> %val4) {
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %sel
+; fcmp ogt on <4 x double> drives a select between <4 x float> values; expected costs follow.
+; CHECK: fun154
+; CHECK: cost of 2 for instruction:   %cmp = fcmp ogt <4 x double> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+}
+
+define <4 x double> @fun155(<4 x double> %val1, <4 x double> %val2,
+                            <4 x double> %val3, <4 x double> %val4) {
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
+  ret <4 x double> %sel
+; fcmp ogt on <4 x double> drives a select between <4 x double> values; expected costs follow.
+; CHECK: fun155
+; CHECK: cost of 2 for instruction:   %cmp = fcmp ogt <4 x double> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
+}
+
+define <8 x i8> @fun156(<8 x float> %val1, <8 x float> %val2,
+                        <8 x i8> %val3, <8 x i8> %val4) {
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4
+  ret <8 x i8> %sel
+; fcmp ogt on <8 x float> drives a select between <8 x i8> values; expected costs follow.
+; CHECK: fun156
+; CHECK: cost of 20 for instruction:   %cmp = fcmp ogt <8 x float> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4
+}
+
+define <8 x i16> @fun157(<8 x float> %val1, <8 x float> %val2,
+                         <8 x i16> %val3, <8 x i16> %val4) {
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4
+  ret <8 x i16> %sel
+; fcmp ogt on <8 x float> drives a select between <8 x i16> values; expected costs follow.
+; CHECK: fun157
+; CHECK: cost of 20 for instruction:   %cmp = fcmp ogt <8 x float> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4
+}
+
+define <8 x i32> @fun158(<8 x float> %val1, <8 x float> %val2,
+                         <8 x i32> %val3, <8 x i32> %val4) {
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
+  ret <8 x i32> %sel
+; fcmp ogt on <8 x float> drives a select between <8 x i32> values; expected costs follow.
+; CHECK: fun158
+; CHECK: cost of 20 for instruction:   %cmp = fcmp ogt <8 x float> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
+}
+
+define <8 x i64> @fun159(<8 x float> %val1, <8 x float> %val2,
+                         <8 x i64> %val3, <8 x i64> %val4) {
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4
+  ret <8 x i64> %sel
+; fcmp ogt on <8 x float> drives a select between <8 x i64> values; expected costs follow.
+; CHECK: fun159
+; CHECK: cost of 20 for instruction:   %cmp = fcmp ogt <8 x float> %val1, %val2
+; CHECK: cost of 11 for instruction:   %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4
+}
+
+define <8 x float> @fun160(<8 x float> %val1, <8 x float> %val2,
+                           <8 x float> %val3, <8 x float> %val4) {
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
+  ret <8 x float> %sel
+; fcmp ogt on <8 x float> drives a select between <8 x float> values; expected costs follow.
+; CHECK: fun160
+; CHECK: cost of 20 for instruction:   %cmp = fcmp ogt <8 x float> %val1, %val2
+; CHECK: cost of 2 for instruction:   %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
+}
+
+define <8 x double> @fun161(<8 x float> %val1, <8 x float> %val2,
+                            <8 x double> %val3, <8 x double> %val4) {
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4
+  ret <8 x double> %sel
+; fcmp ogt on <8 x float> drives a select between <8 x double> values; expected costs follow.
+; CHECK: fun161
+; CHECK: cost of 20 for instruction:   %cmp = fcmp ogt <8 x float> %val1, %val2
+; CHECK: cost of 11 for instruction:   %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4
+}
+
+define <8 x i8> @fun162(<8 x double> %val1, <8 x double> %val2,
+                        <8 x i8> %val3, <8 x i8> %val4) {
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4
+  ret <8 x i8> %sel
+; fcmp ogt on <8 x double> drives a select between <8 x i8> values; expected costs follow.
+; CHECK: fun162
+; CHECK: cost of 4 for instruction:   %cmp = fcmp ogt <8 x double> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4
+}
+
+define <8 x i16> @fun163(<8 x double> %val1, <8 x double> %val2,
+                         <8 x i16> %val3, <8 x i16> %val4) {
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4
+  ret <8 x i16> %sel
+; fcmp ogt on <8 x double> drives a select between <8 x i16> values; expected costs follow.
+; CHECK: fun163
+; CHECK: cost of 4 for instruction:   %cmp = fcmp ogt <8 x double> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4
+}
+
+define <8 x i32> @fun164(<8 x double> %val1, <8 x double> %val2,
+                         <8 x i32> %val3, <8 x i32> %val4) {
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
+  ret <8 x i32> %sel
+; fcmp ogt on <8 x double> drives a select between <8 x i32> values; expected costs follow.
+; CHECK: fun164
+; CHECK: cost of 4 for instruction:   %cmp = fcmp ogt <8 x double> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
+}
+
+define <8 x i64> @fun165(<8 x double> %val1, <8 x double> %val2,
+                         <8 x i64> %val3, <8 x i64> %val4) {
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4
+  ret <8 x i64> %sel
+; fcmp ogt on <8 x double> drives a select between <8 x i64> values; expected costs follow.
+; CHECK: fun165
+; CHECK: cost of 4 for instruction:   %cmp = fcmp ogt <8 x double> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4
+}
+
+define <8 x float> @fun166(<8 x double> %val1, <8 x double> %val2,
+                           <8 x float> %val3, <8 x float> %val4) {
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
+  ret <8 x float> %sel
+; fcmp ogt on <8 x double> drives a select between <8 x float> values; expected costs follow.
+; CHECK: fun166
+; CHECK: cost of 4 for instruction:   %cmp = fcmp ogt <8 x double> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
+}
+
+define <8 x double> @fun167(<8 x double> %val1, <8 x double> %val2,
+                            <8 x double> %val3, <8 x double> %val4) {
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4
+  ret <8 x double> %sel
+; fcmp ogt on <8 x double> drives a select between <8 x double> values; expected costs follow.
+; CHECK: fun167
+; CHECK: cost of 4 for instruction:   %cmp = fcmp ogt <8 x double> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4
+}
+
+define <16 x i8> @fun168(<16 x float> %val1, <16 x float> %val2,
+                         <16 x i8> %val3, <16 x i8> %val4) {
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4
+  ret <16 x i8> %sel
+; fcmp ogt on <16 x float> drives a select between <16 x i8> values; expected costs follow.
+; CHECK: fun168
+; CHECK: cost of 40 for instruction:   %cmp = fcmp ogt <16 x float> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4
+}
+
+define <16 x i16> @fun169(<16 x float> %val1, <16 x float> %val2,
+                          <16 x i16> %val3, <16 x i16> %val4) {
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
+  ret <16 x i16> %sel
+; fcmp ogt on <16 x float> drives a select between <16 x i16> values; expected costs follow.
+; CHECK: fun169
+; CHECK: cost of 40 for instruction:   %cmp = fcmp ogt <16 x float> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
+}
+
+define <16 x i32> @fun170(<16 x float> %val1, <16 x float> %val2,
+                          <16 x i32> %val3, <16 x i32> %val4) {
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4
+  ret <16 x i32> %sel
+; fcmp ogt on <16 x float> drives a select between <16 x i32> values; expected costs follow.
+; CHECK: fun170
+; CHECK: cost of 40 for instruction:   %cmp = fcmp ogt <16 x float> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4
+}
+
+define <16 x i64> @fun171(<16 x float> %val1, <16 x float> %val2,
+                          <16 x i64> %val3, <16 x i64> %val4) {
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4
+  ret <16 x i64> %sel
+; fcmp ogt on <16 x float> drives a select between <16 x i64> values; expected costs follow.
+; CHECK: fun171
+; CHECK: cost of 40 for instruction:   %cmp = fcmp ogt <16 x float> %val1, %val2
+; CHECK: cost of 23 for instruction:   %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4
+}
+
+define <16 x float> @fun172(<16 x float> %val1, <16 x float> %val2,
+                            <16 x float> %val3, <16 x float> %val4) {
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4
+  ret <16 x float> %sel
+; fcmp ogt on <16 x float> drives a select between <16 x float> values; expected costs follow.
+; CHECK: fun172
+; CHECK: cost of 40 for instruction:   %cmp = fcmp ogt <16 x float> %val1, %val2
+; CHECK: cost of 4 for instruction:   %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4
+}
+
+define <16 x double> @fun173(<16 x float> %val1, <16 x float> %val2,
+                             <16 x double> %val3, <16 x double> %val4) {
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4
+  ret <16 x double> %sel
+; fcmp ogt on <16 x float> drives a select between <16 x double> values; expected costs follow.
+; CHECK: fun173
+; CHECK: cost of 40 for instruction:   %cmp = fcmp ogt <16 x float> %val1, %val2
+; CHECK: cost of 23 for instruction:   %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4
+}
+
+define <16 x i8> @fun174(<16 x double> %val1, <16 x double> %val2,
+                         <16 x i8> %val3, <16 x i8> %val4) {
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4
+  ret <16 x i8> %sel
+; fcmp ogt on <16 x double> drives a select between <16 x i8> values; expected costs follow.
+; CHECK: fun174
+; CHECK: cost of 8 for instruction:   %cmp = fcmp ogt <16 x double> %val1, %val2
+; CHECK: cost of 8 for instruction:   %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4
+}
+
+define <16 x i16> @fun175(<16 x double> %val1, <16 x double> %val2,
+                          <16 x i16> %val3, <16 x i16> %val4) {
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
+  ret <16 x i16> %sel
+; fcmp ogt on <16 x double> drives a select between <16 x i16> values; expected costs follow.
+; CHECK: fun175
+; CHECK: cost of 8 for instruction:   %cmp = fcmp ogt <16 x double> %val1, %val2
+; CHECK: cost of 8 for instruction:   %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
+}
+
+define <16 x i32> @fun176(<16 x double> %val1, <16 x double> %val2,
+                          <16 x i32> %val3, <16 x i32> %val4) {
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4
+  ret <16 x i32> %sel
+; fcmp ogt on <16 x double> drives a select between <16 x i32> values; expected costs follow.
+; CHECK: fun176
+; CHECK: cost of 8 for instruction:   %cmp = fcmp ogt <16 x double> %val1, %val2
+; CHECK: cost of 8 for instruction:   %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4
+}
+
+define <16 x i64> @fun177(<16 x double> %val1, <16 x double> %val2,
+                          <16 x i64> %val3, <16 x i64> %val4) {
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4
+  ret <16 x i64> %sel
+; fcmp ogt on <16 x double> drives a select between <16 x i64> values; expected costs follow.
+; CHECK: fun177
+; CHECK: cost of 8 for instruction:   %cmp = fcmp ogt <16 x double> %val1, %val2
+; CHECK: cost of 8 for instruction:   %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4
+}
+
+define <16 x float> @fun178(<16 x double> %val1, <16 x double> %val2,
+                            <16 x float> %val3, <16 x float> %val4) {
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4
+  ret <16 x float> %sel
+; fcmp ogt on <16 x double> drives a select between <16 x float> values; expected costs follow.
+; CHECK: fun178
+; CHECK: cost of 8 for instruction:   %cmp = fcmp ogt <16 x double> %val1, %val2
+; CHECK: cost of 8 for instruction:   %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4
+}
+
+define <16 x double> @fun179(<16 x double> %val1, <16 x double> %val2,
+                             <16 x double> %val3, <16 x double> %val4) {
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4
+  ret <16 x double> %sel
+; fcmp ogt on <16 x double> drives a select between <16 x double> values; expected costs follow.
+; CHECK: fun179
+; CHECK: cost of 8 for instruction:   %cmp = fcmp ogt <16 x double> %val1, %val2
+; CHECK: cost of 8 for instruction:   %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4
+}
+
diff --git a/test/Analysis/CostModel/SystemZ/ext-load.ll b/test/Analysis/CostModel/SystemZ/ext-load.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d3d501a6d2971ea5ea537d36ce63a2ab2ebfb802
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/ext-load.ll
@@ -0,0 +1,56 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+; Test that an extension of a load does not get an additional cost in cases
+; where the load performs the extension.
+
+define void @sext() {
+  %li8 = load i8, i8* undef
+  sext i8 %li8 to i16
+  sext i8 %li8 to i32
+  sext i8 %li8 to i64
+
+  %li16 = load i16, i16* undef
+  sext i16 %li16 to i32
+  sext i16 %li16 to i64
+
+  %li32 = load i32, i32* undef
+  sext i32 %li32 to i64
+
+  ret void
+; The unnamed sext results above are the %1..%6 referenced below, in program order.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li8 = load i8, i8* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %1 = sext i8 %li8 to i16
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %2 = sext i8 %li8 to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %3 = sext i8 %li8 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %4 = sext i16 %li16 to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %5 = sext i16 %li16 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %6 = sext i32 %li32 to i64
+}
+
+define void @zext() {
+  %li8 = load i8, i8* undef
+  zext i8 %li8 to i16
+  zext i8 %li8 to i32
+  zext i8 %li8 to i64
+
+  %li16 = load i16, i16* undef
+  zext i16 %li16 to i32
+  zext i16 %li16 to i64
+
+  %li32 = load i32, i32* undef
+  zext i32 %li32 to i64
+
+  ret void
+; The unnamed zext results above are the %1..%6 referenced below, in program order.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li8 = load i8, i8* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %1 = zext i8 %li8 to i16
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %2 = zext i8 %li8 to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %3 = zext i8 %li8 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li16 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %4 = zext i16 %li16 to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %5 = zext i16 %li16 to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %6 = zext i32 %li32 to i64
+}
diff --git a/test/Analysis/CostModel/SystemZ/fp-arith.ll b/test/Analysis/CostModel/SystemZ/fp-arith.ll
new file mode 100644
index 0000000000000000000000000000000000000000..08a7c291138f018b15bc789266655a0a86eda158
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/fp-arith.ll
@@ -0,0 +1,119 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+; Note: The costs of the scalarized vector instructions do not include any
+; extracts, due to the undef operands.
+;
+; Note: FRem is implemented with libcall, so not included here.
+
+define void @fadd() {
+  %res0 = fadd float undef, undef
+  %res1 = fadd double undef, undef
+  %res2 = fadd fp128 undef, undef
+  %res3 = fadd <2 x float> undef, undef
+  %res4 = fadd <2 x double> undef, undef
+  %res5 = fadd <4 x float> undef, undef
+  %res6 = fadd <4 x double> undef, undef
+  %res7 = fadd <8 x float> undef, undef
+  %res8 = fadd <8 x double> undef, undef
+  %res9 = fadd <16 x float> undef, undef
+  %res10 = fadd <16 x double> undef, undef
+; Expected cost of each scalar and vector fadd above, in program order.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = fadd float undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = fadd double undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = fadd fp128 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res3 = fadd <2 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = fadd <2 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res5 = fadd <4 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res6 = fadd <4 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %res7 = fadd <8 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res8 = fadd <8 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %res9 = fadd <16 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res10 = fadd <16 x double> undef, undef
+
+  ret void
+}
+
+define void @fsub() {
+  %res0 = fsub float undef, undef
+  %res1 = fsub double undef, undef
+  %res2 = fsub fp128 undef, undef
+  %res3 = fsub <2 x float> undef, undef
+  %res4 = fsub <2 x double> undef, undef
+  %res5 = fsub <4 x float> undef, undef
+  %res6 = fsub <4 x double> undef, undef
+  %res7 = fsub <8 x float> undef, undef
+  %res8 = fsub <8 x double> undef, undef
+  %res9 = fsub <16 x float> undef, undef
+  %res10 = fsub <16 x double> undef, undef
+; Expected cost of each scalar and vector fsub above, in program order.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = fsub float undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = fsub double undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = fsub fp128 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res3 = fsub <2 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = fsub <2 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res5 = fsub <4 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res6 = fsub <4 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %res7 = fsub <8 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res8 = fsub <8 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %res9 = fsub <16 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res10 = fsub <16 x double> undef, undef
+
+  ret void
+}
+
+define void @fmul() {
+  %res0 = fmul float undef, undef
+  %res1 = fmul double undef, undef
+  %res2 = fmul fp128 undef, undef
+  %res3 = fmul <2 x float> undef, undef
+  %res4 = fmul <2 x double> undef, undef
+  %res5 = fmul <4 x float> undef, undef
+  %res6 = fmul <4 x double> undef, undef
+  %res7 = fmul <8 x float> undef, undef
+  %res8 = fmul <8 x double> undef, undef
+  %res9 = fmul <16 x float> undef, undef
+  %res10 = fmul <16 x double> undef, undef
+; Expected cost of each scalar and vector fmul above, in program order.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = fmul float undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = fmul double undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = fmul fp128 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res3 = fmul <2 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = fmul <2 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res5 = fmul <4 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res6 = fmul <4 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %res7 = fmul <8 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res8 = fmul <8 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %res9 = fmul <16 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res10 = fmul <16 x double> undef, undef
+
+  ret void
+}
+
+define void @fdiv() {
+  %res0 = fdiv float undef, undef
+  %res1 = fdiv double undef, undef
+  %res2 = fdiv fp128 undef, undef
+  %res3 = fdiv <2 x float> undef, undef
+  %res4 = fdiv <2 x double> undef, undef
+  %res5 = fdiv <4 x float> undef, undef
+  %res6 = fdiv <4 x double> undef, undef
+  %res7 = fdiv <8 x float> undef, undef
+  %res8 = fdiv <8 x double> undef, undef
+  %res9 = fdiv <16 x float> undef, undef
+  %res10 = fdiv <16 x double> undef, undef
+; Expected cost of each scalar and vector fdiv above, in program order.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = fdiv float undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = fdiv double undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = fdiv fp128 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res3 = fdiv <2 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = fdiv <2 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res5 = fdiv <4 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res6 = fdiv <4 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %res7 = fdiv <8 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res8 = fdiv <8 x double> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %res9 = fdiv <16 x float> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res10 = fdiv <16 x double> undef, undef
+
+  ret void
+}
+
diff --git a/test/Analysis/CostModel/SystemZ/fp-cast.ll b/test/Analysis/CostModel/SystemZ/fp-cast.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4ea5a5033d737fed651faf80f7d265b699df1c92
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/fp-cast.ll
@@ -0,0 +1,541 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+; Note: The scalarized vector instructions costs are not including any
+; extracts, due to the undef operands.
+
+define void @fpext() {
+  %v0 = fpext double undef to fp128
+  %v1 = fpext float undef to fp128
+  %v2 = fpext float undef to double
+  %v3 = fpext <2 x double> undef to <2 x fp128>
+  %v4 = fpext <2 x float> undef to <2 x fp128>
+  %v5 = fpext <2 x float> undef to <2 x double>
+  %v6 = fpext <4 x double> undef to <4 x fp128>
+  %v7 = fpext <4 x float> undef to <4 x fp128>
+  %v8 = fpext <4 x float> undef to <4 x double>
+  %v9 = fpext <8 x double> undef to <8 x fp128>
+  %v10 = fpext <8 x float> undef to <8 x fp128>
+  %v11 = fpext <8 x float> undef to <8 x double>
+  %v12 = fpext <16 x float> undef to <16 x double>
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v0 = fpext double undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v1 = fpext float undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v2 = fpext float undef to double
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v3 = fpext <2 x double> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v4 = fpext <2 x float> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v5 = fpext <2 x float> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v6 = fpext <4 x double> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v7 = fpext <4 x float> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v8 = fpext <4 x float> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v9 = fpext <8 x double> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v10 = fpext <8 x float> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v11 = fpext <8 x float> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v12 = fpext <16 x float> undef to <16 x double>
+
+  ret void;
+}
+
+define void @fptosi() {
+  %v0 = fptosi fp128 undef to i64
+  %v1 = fptosi fp128 undef to i32
+  %v2 = fptosi fp128 undef to i16
+  %v3 = fptosi fp128 undef to i8
+  %v4 = fptosi double undef to i64
+  %v5 = fptosi double undef to i32
+  %v6 = fptosi double undef to i16
+  %v7 = fptosi double undef to i8
+  %v8 = fptosi float undef to i64
+  %v9 = fptosi float undef to i32
+  %v10 = fptosi float undef to i16
+  %v11 = fptosi float undef to i8
+  %v12 = fptosi <2 x fp128> undef to <2 x i64>
+  %v13 = fptosi <2 x fp128> undef to <2 x i32>
+  %v14 = fptosi <2 x fp128> undef to <2 x i16>
+  %v15 = fptosi <2 x fp128> undef to <2 x i8>
+  %v16 = fptosi <2 x double> undef to <2 x i64>
+  %v17 = fptosi <2 x double> undef to <2 x i32>
+  %v18 = fptosi <2 x double> undef to <2 x i16>
+  %v19 = fptosi <2 x double> undef to <2 x i8>
+  %v20 = fptosi <2 x float> undef to <2 x i64>
+  %v21 = fptosi <2 x float> undef to <2 x i32>
+  %v22 = fptosi <2 x float> undef to <2 x i16>
+  %v23 = fptosi <2 x float> undef to <2 x i8>
+  %v24 = fptosi <4 x fp128> undef to <4 x i64>
+  %v25 = fptosi <4 x fp128> undef to <4 x i32>
+  %v26 = fptosi <4 x fp128> undef to <4 x i16>
+  %v27 = fptosi <4 x fp128> undef to <4 x i8>
+  %v28 = fptosi <4 x double> undef to <4 x i64>
+  %v29 = fptosi <4 x double> undef to <4 x i32>
+  %v30 = fptosi <4 x double> undef to <4 x i16>
+  %v31 = fptosi <4 x double> undef to <4 x i8>
+  %v32 = fptosi <4 x float> undef to <4 x i64>
+  %v33 = fptosi <4 x float> undef to <4 x i32>
+  %v34 = fptosi <4 x float> undef to <4 x i16>
+  %v35 = fptosi <4 x float> undef to <4 x i8>
+  %v36 = fptosi <8 x fp128> undef to <8 x i64>
+  %v37 = fptosi <8 x fp128> undef to <8 x i32>
+  %v38 = fptosi <8 x fp128> undef to <8 x i16>
+  %v39 = fptosi <8 x fp128> undef to <8 x i8>
+  %v40 = fptosi <8 x double> undef to <8 x i64>
+  %v41 = fptosi <8 x double> undef to <8 x i32>
+  %v42 = fptosi <8 x double> undef to <8 x i16>
+  %v43 = fptosi <8 x double> undef to <8 x i8>
+  %v44 = fptosi <8 x float> undef to <8 x i64>
+  %v45 = fptosi <8 x float> undef to <8 x i32>
+  %v46 = fptosi <8 x float> undef to <8 x i16>
+  %v47 = fptosi <8 x float> undef to <8 x i8>
+  %v48 = fptosi <16 x double> undef to <16 x i64>
+  %v49 = fptosi <16 x double> undef to <16 x i32>
+  %v50 = fptosi <16 x double> undef to <16 x i16>
+  %v51 = fptosi <16 x double> undef to <16 x i8>
+  %v52 = fptosi <16 x float> undef to <16 x i64>
+  %v53 = fptosi <16 x float> undef to <16 x i32>
+  %v54 = fptosi <16 x float> undef to <16 x i16>
+  %v55 = fptosi <16 x float> undef to <16 x i8>
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v0 = fptosi fp128 undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v1 = fptosi fp128 undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v2 = fptosi fp128 undef to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v3 = fptosi fp128 undef to i8
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v4 = fptosi double undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v5 = fptosi double undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v6 = fptosi double undef to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v7 = fptosi double undef to i8
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v8 = fptosi float undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v9 = fptosi float undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v10 = fptosi float undef to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v11 = fptosi float undef to i8
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v12 = fptosi <2 x fp128> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v13 = fptosi <2 x fp128> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v14 = fptosi <2 x fp128> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v15 = fptosi <2 x fp128> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v16 = fptosi <2 x double> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v17 = fptosi <2 x double> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v18 = fptosi <2 x double> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v19 = fptosi <2 x double> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v20 = fptosi <2 x float> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 14 for instruction:   %v21 = fptosi <2 x float> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v22 = fptosi <2 x float> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v23 = fptosi <2 x float> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v24 = fptosi <4 x fp128> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v25 = fptosi <4 x fp128> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v26 = fptosi <4 x fp128> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v27 = fptosi <4 x fp128> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v28 = fptosi <4 x double> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v29 = fptosi <4 x double> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v30 = fptosi <4 x double> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v31 = fptosi <4 x double> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 11 for instruction:   %v32 = fptosi <4 x float> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v33 = fptosi <4 x float> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v34 = fptosi <4 x float> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v35 = fptosi <4 x float> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v36 = fptosi <8 x fp128> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v37 = fptosi <8 x fp128> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v38 = fptosi <8 x fp128> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v39 = fptosi <8 x fp128> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v40 = fptosi <8 x double> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v41 = fptosi <8 x double> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v42 = fptosi <8 x double> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v43 = fptosi <8 x double> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %v44 = fptosi <8 x float> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v45 = fptosi <8 x float> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v46 = fptosi <8 x float> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v47 = fptosi <8 x float> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v48 = fptosi <16 x double> undef to <16 x i64>
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v49 = fptosi <16 x double> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v50 = fptosi <16 x double> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v51 = fptosi <16 x double> undef to <16 x i8>
+; CHECK: Cost Model: Found an estimated cost of 41 for instruction:   %v52 = fptosi <16 x float> undef to <16 x i64>
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v53 = fptosi <16 x float> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v54 = fptosi <16 x float> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v55 = fptosi <16 x float> undef to <16 x i8>
+
+  ret void;
+}
+
+
+define void @fptoui() {
+  %v0 = fptoui fp128 undef to i64
+  %v1 = fptoui fp128 undef to i32
+  %v2 = fptoui fp128 undef to i16
+  %v3 = fptoui fp128 undef to i8
+  %v4 = fptoui double undef to i64
+  %v5 = fptoui double undef to i32
+  %v6 = fptoui double undef to i16
+  %v7 = fptoui double undef to i8
+  %v8 = fptoui float undef to i64
+  %v9 = fptoui float undef to i32
+  %v10 = fptoui float undef to i16
+  %v11 = fptoui float undef to i8
+  %v12 = fptoui <2 x fp128> undef to <2 x i64>
+  %v13 = fptoui <2 x fp128> undef to <2 x i32>
+  %v14 = fptoui <2 x fp128> undef to <2 x i16>
+  %v15 = fptoui <2 x fp128> undef to <2 x i8>
+  %v16 = fptoui <2 x double> undef to <2 x i64>
+  %v17 = fptoui <2 x double> undef to <2 x i32>
+  %v18 = fptoui <2 x double> undef to <2 x i16>
+  %v19 = fptoui <2 x double> undef to <2 x i8>
+  %v20 = fptoui <2 x float> undef to <2 x i64>
+  %v21 = fptoui <2 x float> undef to <2 x i32>
+  %v22 = fptoui <2 x float> undef to <2 x i16>
+  %v23 = fptoui <2 x float> undef to <2 x i8>
+  %v24 = fptoui <4 x fp128> undef to <4 x i64>
+  %v25 = fptoui <4 x fp128> undef to <4 x i32>
+  %v26 = fptoui <4 x fp128> undef to <4 x i16>
+  %v27 = fptoui <4 x fp128> undef to <4 x i8>
+  %v28 = fptoui <4 x double> undef to <4 x i64>
+  %v29 = fptoui <4 x double> undef to <4 x i32>
+  %v30 = fptoui <4 x double> undef to <4 x i16>
+  %v31 = fptoui <4 x double> undef to <4 x i8>
+  %v32 = fptoui <4 x float> undef to <4 x i64>
+  %v33 = fptoui <4 x float> undef to <4 x i32>
+  %v34 = fptoui <4 x float> undef to <4 x i16>
+  %v35 = fptoui <4 x float> undef to <4 x i8>
+  %v36 = fptoui <8 x fp128> undef to <8 x i64>
+  %v37 = fptoui <8 x fp128> undef to <8 x i32>
+  %v38 = fptoui <8 x fp128> undef to <8 x i16>
+  %v39 = fptoui <8 x fp128> undef to <8 x i8>
+  %v40 = fptoui <8 x double> undef to <8 x i64>
+  %v41 = fptoui <8 x double> undef to <8 x i32>
+  %v42 = fptoui <8 x double> undef to <8 x i16>
+  %v43 = fptoui <8 x double> undef to <8 x i8>
+  %v44 = fptoui <8 x float> undef to <8 x i64>
+  %v45 = fptoui <8 x float> undef to <8 x i32>
+  %v46 = fptoui <8 x float> undef to <8 x i16>
+  %v47 = fptoui <8 x float> undef to <8 x i8>
+  %v48 = fptoui <16 x double> undef to <16 x i64>
+  %v49 = fptoui <16 x double> undef to <16 x i32>
+  %v50 = fptoui <16 x double> undef to <16 x i16>
+  %v51 = fptoui <16 x double> undef to <16 x i8>
+  %v52 = fptoui <16 x float> undef to <16 x i64>
+  %v53 = fptoui <16 x float> undef to <16 x i32>
+  %v54 = fptoui <16 x float> undef to <16 x i16>
+  %v55 = fptoui <16 x float> undef to <16 x i8>
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v0 = fptoui fp128 undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v1 = fptoui fp128 undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v2 = fptoui fp128 undef to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v3 = fptoui fp128 undef to i8
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v4 = fptoui double undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v5 = fptoui double undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v6 = fptoui double undef to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v7 = fptoui double undef to i8
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v8 = fptoui float undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v9 = fptoui float undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v10 = fptoui float undef to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v11 = fptoui float undef to i8
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v12 = fptoui <2 x fp128> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v13 = fptoui <2 x fp128> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v14 = fptoui <2 x fp128> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v15 = fptoui <2 x fp128> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v16 = fptoui <2 x double> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v17 = fptoui <2 x double> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v18 = fptoui <2 x double> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v19 = fptoui <2 x double> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v20 = fptoui <2 x float> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 14 for instruction:   %v21 = fptoui <2 x float> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v22 = fptoui <2 x float> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v23 = fptoui <2 x float> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v24 = fptoui <4 x fp128> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v25 = fptoui <4 x fp128> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v26 = fptoui <4 x fp128> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v27 = fptoui <4 x fp128> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v28 = fptoui <4 x double> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v29 = fptoui <4 x double> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v30 = fptoui <4 x double> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v31 = fptoui <4 x double> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 11 for instruction:   %v32 = fptoui <4 x float> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v33 = fptoui <4 x float> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v34 = fptoui <4 x float> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 13 for instruction:   %v35 = fptoui <4 x float> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v36 = fptoui <8 x fp128> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v37 = fptoui <8 x fp128> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v38 = fptoui <8 x fp128> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v39 = fptoui <8 x fp128> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v40 = fptoui <8 x double> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v41 = fptoui <8 x double> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v42 = fptoui <8 x double> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v43 = fptoui <8 x double> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 21 for instruction:   %v44 = fptoui <8 x float> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v45 = fptoui <8 x float> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v46 = fptoui <8 x float> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 25 for instruction:   %v47 = fptoui <8 x float> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v48 = fptoui <16 x double> undef to <16 x i64>
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v49 = fptoui <16 x double> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v50 = fptoui <16 x double> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v51 = fptoui <16 x double> undef to <16 x i8>
+; CHECK: Cost Model: Found an estimated cost of 41 for instruction:   %v52 = fptoui <16 x float> undef to <16 x i64>
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v53 = fptoui <16 x float> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v54 = fptoui <16 x float> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 49 for instruction:   %v55 = fptoui <16 x float> undef to <16 x i8>
+
+  ret void;
+}
+
+define void @fptrunc() {
+  %v0 = fptrunc fp128 undef to double
+  %v1 = fptrunc fp128 undef to float
+  %v2 = fptrunc double undef to float
+  %v3 = fptrunc <2 x fp128> undef to <2 x double>
+  %v4 = fptrunc <2 x fp128> undef to <2 x float>
+  %v5 = fptrunc <2 x double> undef to <2 x float>
+  %v6 = fptrunc <4 x fp128> undef to <4 x double>
+  %v7 = fptrunc <4 x fp128> undef to <4 x float>
+  %v8 = fptrunc <4 x double> undef to <4 x float>
+  %v9 = fptrunc <8 x fp128> undef to <8 x double>
+  %v10 = fptrunc <8 x fp128> undef to <8 x float>
+  %v11 = fptrunc <8 x double> undef to <8 x float>
+  %v12 = fptrunc <16 x double> undef to <16 x float>
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v0 = fptrunc fp128 undef to double
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v1 = fptrunc fp128 undef to float
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v2 = fptrunc double undef to float
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v3 = fptrunc <2 x fp128> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v4 = fptrunc <2 x fp128> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v5 = fptrunc <2 x double> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v6 = fptrunc <4 x fp128> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v7 = fptrunc <4 x fp128> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v8 = fptrunc <4 x double> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v9 = fptrunc <8 x fp128> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v10 = fptrunc <8 x fp128> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v11 = fptrunc <8 x double> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v12 = fptrunc <16 x double> undef to <16 x float>
+
+  ret void;
+}
+
+define void @sitofp() {
+  %v0 = sitofp i64 undef to fp128
+  %v1 = sitofp i64 undef to double
+  %v2 = sitofp i64 undef to float
+  %v3 = sitofp i32 undef to fp128
+  %v4 = sitofp i32 undef to double
+  %v5 = sitofp i32 undef to float
+  %v6 = sitofp i16 undef to fp128
+  %v7 = sitofp i16 undef to double
+  %v8 = sitofp i16 undef to float
+  %v9 = sitofp i8 undef to fp128
+  %v10 = sitofp i8 undef to double
+  %v11 = sitofp i8 undef to float
+  %v12 = sitofp <2 x i64> undef to <2 x fp128>
+  %v13 = sitofp <2 x i64> undef to <2 x double>
+  %v14 = sitofp <2 x i64> undef to <2 x float>
+  %v15 = sitofp <2 x i32> undef to <2 x fp128>
+  %v16 = sitofp <2 x i32> undef to <2 x double>
+  %v17 = sitofp <2 x i32> undef to <2 x float>
+  %v18 = sitofp <2 x i16> undef to <2 x fp128>
+  %v19 = sitofp <2 x i16> undef to <2 x double>
+  %v20 = sitofp <2 x i16> undef to <2 x float>
+  %v21 = sitofp <2 x i8> undef to <2 x fp128>
+  %v22 = sitofp <2 x i8> undef to <2 x double>
+  %v23 = sitofp <2 x i8> undef to <2 x float>
+  %v24 = sitofp <4 x i64> undef to <4 x fp128>
+  %v25 = sitofp <4 x i64> undef to <4 x double>
+  %v26 = sitofp <4 x i64> undef to <4 x float>
+  %v27 = sitofp <4 x i32> undef to <4 x fp128>
+  %v28 = sitofp <4 x i32> undef to <4 x double>
+  %v29 = sitofp <4 x i32> undef to <4 x float>
+  %v30 = sitofp <4 x i16> undef to <4 x fp128>
+  %v31 = sitofp <4 x i16> undef to <4 x double>
+  %v32 = sitofp <4 x i16> undef to <4 x float>
+  %v33 = sitofp <4 x i8> undef to <4 x fp128>
+  %v34 = sitofp <4 x i8> undef to <4 x double>
+  %v35 = sitofp <4 x i8> undef to <4 x float>
+  %v36 = sitofp <8 x i64> undef to <8 x fp128>
+  %v37 = sitofp <8 x i64> undef to <8 x double>
+  %v38 = sitofp <8 x i64> undef to <8 x float>
+  %v39 = sitofp <8 x i32> undef to <8 x fp128>
+  %v40 = sitofp <8 x i32> undef to <8 x double>
+  %v41 = sitofp <8 x i32> undef to <8 x float>
+  %v42 = sitofp <8 x i16> undef to <8 x fp128>
+  %v43 = sitofp <8 x i16> undef to <8 x double>
+  %v44 = sitofp <8 x i16> undef to <8 x float>
+  %v45 = sitofp <8 x i8> undef to <8 x fp128>
+  %v46 = sitofp <8 x i8> undef to <8 x double>
+  %v47 = sitofp <8 x i8> undef to <8 x float>
+  %v48 = sitofp <16 x i64> undef to <16 x double>
+  %v49 = sitofp <16 x i64> undef to <16 x float>
+  %v50 = sitofp <16 x i32> undef to <16 x double>
+  %v51 = sitofp <16 x i32> undef to <16 x float>
+  %v52 = sitofp <16 x i16> undef to <16 x double>
+  %v53 = sitofp <16 x i16> undef to <16 x float>
+  %v54 = sitofp <16 x i8> undef to <16 x double>
+  %v55 = sitofp <16 x i8> undef to <16 x float>
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v0 = sitofp i64 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v1 = sitofp i64 undef to double
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v2 = sitofp i64 undef to float
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v3 = sitofp i32 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v4 = sitofp i32 undef to double
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v5 = sitofp i32 undef to float
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v6 = sitofp i16 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v7 = sitofp i16 undef to double
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v8 = sitofp i16 undef to float
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v9 = sitofp i8 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v10 = sitofp i8 undef to double
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v11 = sitofp i8 undef to float
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v12 = sitofp <2 x i64> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v13 = sitofp <2 x i64> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v14 = sitofp <2 x i64> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v15 = sitofp <2 x i32> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v16 = sitofp <2 x i32> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v17 = sitofp <2 x i32> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v18 = sitofp <2 x i16> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v19 = sitofp <2 x i16> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v20 = sitofp <2 x i16> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v21 = sitofp <2 x i8> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v22 = sitofp <2 x i8> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v23 = sitofp <2 x i8> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v24 = sitofp <4 x i64> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v25 = sitofp <4 x i64> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v26 = sitofp <4 x i64> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v27 = sitofp <4 x i32> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v28 = sitofp <4 x i32> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v29 = sitofp <4 x i32> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v30 = sitofp <4 x i16> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v31 = sitofp <4 x i16> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v32 = sitofp <4 x i16> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v33 = sitofp <4 x i8> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v34 = sitofp <4 x i8> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v35 = sitofp <4 x i8> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v36 = sitofp <8 x i64> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v37 = sitofp <8 x i64> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v38 = sitofp <8 x i64> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v39 = sitofp <8 x i32> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v40 = sitofp <8 x i32> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v41 = sitofp <8 x i32> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v42 = sitofp <8 x i16> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v43 = sitofp <8 x i16> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v44 = sitofp <8 x i16> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v45 = sitofp <8 x i8> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v46 = sitofp <8 x i8> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v47 = sitofp <8 x i8> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v48 = sitofp <16 x i64> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v49 = sitofp <16 x i64> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v50 = sitofp <16 x i32> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v51 = sitofp <16 x i32> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v52 = sitofp <16 x i16> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v53 = sitofp <16 x i16> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v54 = sitofp <16 x i8> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v55 = sitofp <16 x i8> undef to <16 x float>
+
+  ret void;
+}
+
+define void @uitofp() {
+  %v0 = uitofp i64 undef to fp128
+  %v1 = uitofp i64 undef to double
+  %v2 = uitofp i64 undef to float
+  %v3 = uitofp i32 undef to fp128
+  %v4 = uitofp i32 undef to double
+  %v5 = uitofp i32 undef to float
+  %v6 = uitofp i16 undef to fp128
+  %v7 = uitofp i16 undef to double
+  %v8 = uitofp i16 undef to float
+  %v9 = uitofp i8 undef to fp128
+  %v10 = uitofp i8 undef to double
+  %v11 = uitofp i8 undef to float
+  %v12 = uitofp <2 x i64> undef to <2 x fp128>
+  %v13 = uitofp <2 x i64> undef to <2 x double>
+  %v14 = uitofp <2 x i64> undef to <2 x float>
+  %v15 = uitofp <2 x i32> undef to <2 x fp128>
+  %v16 = uitofp <2 x i32> undef to <2 x double>
+  %v17 = uitofp <2 x i32> undef to <2 x float>
+  %v18 = uitofp <2 x i16> undef to <2 x fp128>
+  %v19 = uitofp <2 x i16> undef to <2 x double>
+  %v20 = uitofp <2 x i16> undef to <2 x float>
+  %v21 = uitofp <2 x i8> undef to <2 x fp128>
+  %v22 = uitofp <2 x i8> undef to <2 x double>
+  %v23 = uitofp <2 x i8> undef to <2 x float>
+  %v24 = uitofp <4 x i64> undef to <4 x fp128>
+  %v25 = uitofp <4 x i64> undef to <4 x double>
+  %v26 = uitofp <4 x i64> undef to <4 x float>
+  %v27 = uitofp <4 x i32> undef to <4 x fp128>
+  %v28 = uitofp <4 x i32> undef to <4 x double>
+  %v29 = uitofp <4 x i32> undef to <4 x float>
+  %v30 = uitofp <4 x i16> undef to <4 x fp128>
+  %v31 = uitofp <4 x i16> undef to <4 x double>
+  %v32 = uitofp <4 x i16> undef to <4 x float>
+  %v33 = uitofp <4 x i8> undef to <4 x fp128>
+  %v34 = uitofp <4 x i8> undef to <4 x double>
+  %v35 = uitofp <4 x i8> undef to <4 x float>
+  %v36 = uitofp <8 x i64> undef to <8 x fp128>
+  %v37 = uitofp <8 x i64> undef to <8 x double>
+  %v38 = uitofp <8 x i64> undef to <8 x float>
+  %v39 = uitofp <8 x i32> undef to <8 x fp128>
+  %v40 = uitofp <8 x i32> undef to <8 x double>
+  %v41 = uitofp <8 x i32> undef to <8 x float>
+  %v42 = uitofp <8 x i16> undef to <8 x fp128>
+  %v43 = uitofp <8 x i16> undef to <8 x double>
+  %v44 = uitofp <8 x i16> undef to <8 x float>
+  %v45 = uitofp <8 x i8> undef to <8 x fp128>
+  %v46 = uitofp <8 x i8> undef to <8 x double>
+  %v47 = uitofp <8 x i8> undef to <8 x float>
+  %v48 = uitofp <16 x i64> undef to <16 x double>
+  %v49 = uitofp <16 x i64> undef to <16 x float>
+  %v50 = uitofp <16 x i32> undef to <16 x double>
+  %v51 = uitofp <16 x i32> undef to <16 x float>
+  %v52 = uitofp <16 x i16> undef to <16 x double>
+  %v53 = uitofp <16 x i16> undef to <16 x float>
+  %v54 = uitofp <16 x i8> undef to <16 x double>
+  %v55 = uitofp <16 x i8> undef to <16 x float>
+; Expected cost of each uitofp above, listed in the same order; the 16-element cases cover double and float only.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v0 = uitofp i64 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v1 = uitofp i64 undef to double
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v2 = uitofp i64 undef to float
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v3 = uitofp i32 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v4 = uitofp i32 undef to double
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v5 = uitofp i32 undef to float
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v6 = uitofp i16 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v7 = uitofp i16 undef to double
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v8 = uitofp i16 undef to float
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v9 = uitofp i8 undef to fp128
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v10 = uitofp i8 undef to double
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v11 = uitofp i8 undef to float
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v12 = uitofp <2 x i64> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v13 = uitofp <2 x i64> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v14 = uitofp <2 x i64> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v15 = uitofp <2 x i32> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v16 = uitofp <2 x i32> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v17 = uitofp <2 x i32> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v18 = uitofp <2 x i16> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v19 = uitofp <2 x i16> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v20 = uitofp <2 x i16> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v21 = uitofp <2 x i8> undef to <2 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v22 = uitofp <2 x i8> undef to <2 x double>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v23 = uitofp <2 x i8> undef to <2 x float>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v24 = uitofp <4 x i64> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v25 = uitofp <4 x i64> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v26 = uitofp <4 x i64> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v27 = uitofp <4 x i32> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v28 = uitofp <4 x i32> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v29 = uitofp <4 x i32> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v30 = uitofp <4 x i16> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v31 = uitofp <4 x i16> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v32 = uitofp <4 x i16> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v33 = uitofp <4 x i8> undef to <4 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v34 = uitofp <4 x i8> undef to <4 x double>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v35 = uitofp <4 x i8> undef to <4 x float>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v36 = uitofp <8 x i64> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v37 = uitofp <8 x i64> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v38 = uitofp <8 x i64> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 16 for instruction:   %v39 = uitofp <8 x i32> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v40 = uitofp <8 x i32> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v41 = uitofp <8 x i32> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v42 = uitofp <8 x i16> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v43 = uitofp <8 x i16> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v44 = uitofp <8 x i16> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %v45 = uitofp <8 x i8> undef to <8 x fp128>
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v46 = uitofp <8 x i8> undef to <8 x double>
+; CHECK: Cost Model: Found an estimated cost of 32 for instruction:   %v47 = uitofp <8 x i8> undef to <8 x float>
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %v48 = uitofp <16 x i64> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v49 = uitofp <16 x i64> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v50 = uitofp <16 x i32> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %v51 = uitofp <16 x i32> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v52 = uitofp <16 x i16> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v53 = uitofp <16 x i16> undef to <16 x float>
+; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v54 = uitofp <16 x i8> undef to <16 x double>
+; CHECK: Cost Model: Found an estimated cost of 64 for instruction:   %v55 = uitofp <16 x i8> undef to <16 x float>
+; No result is used and nothing is returned; the ';' after 'ret void' only opens an empty comment.
+  ret void;
+}
diff --git a/test/Analysis/CostModel/SystemZ/int-arith.ll b/test/Analysis/CostModel/SystemZ/int-arith.ll
new file mode 100644
index 0000000000000000000000000000000000000000..518c9b01e4e1431c74d632f7bc1e8a9532635f34
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/int-arith.ll
@@ -0,0 +1,326 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+; Note: The costs of scalarized vector instructions do not include any
+; extracts, because the operands are undef.
+
+define void @add() {
+  %res0 = add i8 undef, undef
+  %res1 = add i16 undef, undef
+  %res2 = add i32 undef, undef
+  %res3 = add i64 undef, undef
+  %res4 = add <2 x i8> undef, undef
+  %res5 = add <2 x i16> undef, undef
+  %res6 = add <2 x i32> undef, undef
+  %res7 = add <2 x i64> undef, undef
+  %res8 = add <4 x i8> undef, undef
+  %res9 = add <4 x i16> undef, undef
+  %res10 = add <4 x i32> undef, undef
+  %res11 = add <4 x i64> undef, undef
+  %res12 = add <8 x i8> undef, undef
+  %res13 = add <8 x i16> undef, undef
+  %res14 = add <8 x i32> undef, undef
+  %res15 = add <8 x i64> undef, undef
+  %res16 = add <16 x i8> undef, undef
+  %res17 = add <16 x i16> undef, undef
+  %res18 = add <16 x i32> undef, undef
+  %res19 = add <16 x i64> undef, undef
+; Expected z13 costs (per the RUN line), in instruction order: 1 up to 128 bits wide, then 2, 4, 8 as the width doubles.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = add i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = add i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = add i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = add i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = add <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = add <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = add <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res7 = add <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res8 = add <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res9 = add <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res10 = add <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res11 = add <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res12 = add <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res13 = add <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res14 = add <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res15 = add <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res16 = add <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res17 = add <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res18 = add <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res19 = add <16 x i64> undef, undef
+; No result is used; the ';' after 'ret void' only opens an empty comment.
+  ret void;
+}
+
+define void @sub() {
+  %res0 = sub i8 undef, undef
+  %res1 = sub i16 undef, undef
+  %res2 = sub i32 undef, undef
+  %res3 = sub i64 undef, undef
+  %res4 = sub <2 x i8> undef, undef
+  %res5 = sub <2 x i16> undef, undef
+  %res6 = sub <2 x i32> undef, undef
+  %res7 = sub <2 x i64> undef, undef
+  %res8 = sub <4 x i8> undef, undef
+  %res9 = sub <4 x i16> undef, undef
+  %res10 = sub <4 x i32> undef, undef
+  %res11 = sub <4 x i64> undef, undef
+  %res12 = sub <8 x i8> undef, undef
+  %res13 = sub <8 x i16> undef, undef
+  %res14 = sub <8 x i32> undef, undef
+  %res15 = sub <8 x i64> undef, undef
+  %res16 = sub <16 x i8> undef, undef
+  %res17 = sub <16 x i16> undef, undef
+  %res18 = sub <16 x i32> undef, undef
+  %res19 = sub <16 x i64> undef, undef
+; Expected z13 costs (per the RUN line), in instruction order: 1 up to 128 bits wide, then 2, 4, 8 as the width doubles.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = sub i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = sub i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = sub i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = sub i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = sub <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = sub <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = sub <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res7 = sub <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res8 = sub <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res9 = sub <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res10 = sub <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res11 = sub <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res12 = sub <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res13 = sub <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res14 = sub <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res15 = sub <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res16 = sub <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res17 = sub <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res18 = sub <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res19 = sub <16 x i64> undef, undef
+; No result is used; the ';' after 'ret void' only opens an empty comment.
+  ret void;
+}
+
+define void @mul() {
+  %res0 = mul i8 undef, undef
+  %res1 = mul i16 undef, undef
+  %res2 = mul i32 undef, undef
+  %res3 = mul i64 undef, undef
+  %res4 = mul <2 x i8> undef, undef
+  %res5 = mul <2 x i16> undef, undef
+  %res6 = mul <2 x i32> undef, undef
+  %res7 = mul <2 x i64> undef, undef
+  %res8 = mul <4 x i8> undef, undef
+  %res9 = mul <4 x i16> undef, undef
+  %res10 = mul <4 x i32> undef, undef
+  %res11 = mul <4 x i64> undef, undef
+  %res12 = mul <8 x i8> undef, undef
+  %res13 = mul <8 x i16> undef, undef
+  %res14 = mul <8 x i32> undef, undef
+  %res15 = mul <8 x i64> undef, undef
+  %res16 = mul <16 x i8> undef, undef
+  %res17 = mul <16 x i16> undef, undef
+  %res18 = mul <16 x i32> undef, undef
+  %res19 = mul <16 x i64> undef, undef
+; Expected z13 costs (per the RUN line), in instruction order: 1 per 128 bits, except i64-element vectors at 3 per <2 x i64>.
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = mul i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = mul i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = mul i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = mul i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = mul <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = mul <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = mul <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %res7 = mul <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res8 = mul <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res9 = mul <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res10 = mul <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res11 = mul <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res12 = mul <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res13 = mul <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res14 = mul <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res15 = mul <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res16 = mul <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res17 = mul <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res18 = mul <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %res19 = mul <16 x i64> undef, undef
+; No result is used; the ';' after 'ret void' only opens an empty comment.
+  ret void;
+}
+
+define void @sdiv() {
+  %res0 = sdiv i8 undef, undef
+  %res1 = sdiv i16 undef, undef
+  %res2 = sdiv i32 undef, undef
+  %res3 = sdiv i64 undef, undef
+  %res4 = sdiv <2 x i8> undef, undef
+  %res5 = sdiv <2 x i16> undef, undef
+  %res6 = sdiv <2 x i32> undef, undef
+  %res7 = sdiv <2 x i64> undef, undef
+  %res8 = sdiv <4 x i8> undef, undef
+  %res9 = sdiv <4 x i16> undef, undef
+  %res10 = sdiv <4 x i32> undef, undef
+  %res11 = sdiv <4 x i64> undef, undef
+  %res12 = sdiv <8 x i8> undef, undef
+  %res13 = sdiv <8 x i16> undef, undef
+  %res14 = sdiv <8 x i32> undef, undef
+  %res15 = sdiv <8 x i64> undef, undef
+  %res16 = sdiv <16 x i8> undef, undef
+  %res17 = sdiv <16 x i16> undef, undef
+  %res18 = sdiv <16 x i32> undef, undef
+  %res19 = sdiv <16 x i64> undef, undef
+; Expected z13 costs (per the RUN line), in instruction order: scalars 4/4/2/1 for i8/i16/i32/i64; vector costs grow linearly with element count.
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res0 = sdiv i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res1 = sdiv i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res2 = sdiv i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = sdiv i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res4 = sdiv <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res5 = sdiv <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res6 = sdiv <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %res7 = sdiv <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res8 = sdiv <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res9 = sdiv <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res10 = sdiv <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res11 = sdiv <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res12 = sdiv <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res13 = sdiv <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %res14 = sdiv <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res15 = sdiv <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 80 for instruction:   %res16 = sdiv <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 80 for instruction:   %res17 = sdiv <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %res18 = sdiv <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %res19 = sdiv <16 x i64> undef, undef
+; No result is used; the ';' after 'ret void' only opens an empty comment.
+  ret void;
+}
+
+define void @srem() {
+  %res0 = srem i8 undef, undef
+  %res1 = srem i16 undef, undef
+  %res2 = srem i32 undef, undef
+  %res3 = srem i64 undef, undef
+  %res4 = srem <2 x i8> undef, undef
+  %res5 = srem <2 x i16> undef, undef
+  %res6 = srem <2 x i32> undef, undef
+  %res7 = srem <2 x i64> undef, undef
+  %res8 = srem <4 x i8> undef, undef
+  %res9 = srem <4 x i16> undef, undef
+  %res10 = srem <4 x i32> undef, undef
+  %res11 = srem <4 x i64> undef, undef
+  %res12 = srem <8 x i8> undef, undef
+  %res13 = srem <8 x i16> undef, undef
+  %res14 = srem <8 x i32> undef, undef
+  %res15 = srem <8 x i64> undef, undef
+  %res16 = srem <16 x i8> undef, undef
+  %res17 = srem <16 x i16> undef, undef
+  %res18 = srem <16 x i32> undef, undef
+  %res19 = srem <16 x i64> undef, undef
+; Expected z13 costs (per the RUN line), in instruction order: scalars 4/4/2/1 for i8/i16/i32/i64; vector costs grow linearly with element count.
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res0 = srem i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res1 = srem i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res2 = srem i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = srem i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res4 = srem <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res5 = srem <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res6 = srem <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %res7 = srem <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res8 = srem <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res9 = srem <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res10 = srem <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res11 = srem <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res12 = srem <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res13 = srem <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %res14 = srem <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res15 = srem <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 80 for instruction:   %res16 = srem <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 80 for instruction:   %res17 = srem <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %res18 = srem <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %res19 = srem <16 x i64> undef, undef
+; No result is used; the ';' after 'ret void' only opens an empty comment.
+  ret void;
+}
+
+define void @udiv() {
+  %res0 = udiv i8 undef, undef
+  %res1 = udiv i16 undef, undef
+  %res2 = udiv i32 undef, undef
+  %res3 = udiv i64 undef, undef
+  %res4 = udiv <2 x i8> undef, undef
+  %res5 = udiv <2 x i16> undef, undef
+  %res6 = udiv <2 x i32> undef, undef
+  %res7 = udiv <2 x i64> undef, undef
+  %res8 = udiv <4 x i8> undef, undef
+  %res9 = udiv <4 x i16> undef, undef
+  %res10 = udiv <4 x i32> undef, undef
+  %res11 = udiv <4 x i64> undef, undef
+  %res12 = udiv <8 x i8> undef, undef
+  %res13 = udiv <8 x i16> undef, undef
+  %res14 = udiv <8 x i32> undef, undef
+  %res15 = udiv <8 x i64> undef, undef
+  %res16 = udiv <16 x i8> undef, undef
+  %res17 = udiv <16 x i16> undef, undef
+  %res18 = udiv <16 x i32> undef, undef
+  %res19 = udiv <16 x i64> undef, undef
+; Expected z13 costs (per the RUN line), in instruction order: scalars 4/4/2/2 for i8/i16/i32/i64; vector costs grow linearly with element count.
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res0 = udiv i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res1 = udiv i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res2 = udiv i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res3 = udiv i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res4 = udiv <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res5 = udiv <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res6 = udiv <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %res7 = udiv <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res8 = udiv <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res9 = udiv <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res10 = udiv <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res11 = udiv <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res12 = udiv <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res13 = udiv <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %res14 = udiv <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res15 = udiv <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 80 for instruction:   %res16 = udiv <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 80 for instruction:   %res17 = udiv <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %res18 = udiv <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res19 = udiv <16 x i64> undef, undef
+; No result is used; the ';' after 'ret void' only opens an empty comment.
+  ret void;
+}
+
+define void @urem() {
+  %res0 = urem i8 undef, undef
+  %res1 = urem i16 undef, undef
+  %res2 = urem i32 undef, undef
+  %res3 = urem i64 undef, undef
+  %res4 = urem <2 x i8> undef, undef
+  %res5 = urem <2 x i16> undef, undef
+  %res6 = urem <2 x i32> undef, undef
+  %res7 = urem <2 x i64> undef, undef
+  %res8 = urem <4 x i8> undef, undef
+  %res9 = urem <4 x i16> undef, undef
+  %res10 = urem <4 x i32> undef, undef
+  %res11 = urem <4 x i64> undef, undef
+  %res12 = urem <8 x i8> undef, undef
+  %res13 = urem <8 x i16> undef, undef
+  %res14 = urem <8 x i32> undef, undef
+  %res15 = urem <8 x i64> undef, undef
+  %res16 = urem <16 x i8> undef, undef
+  %res17 = urem <16 x i16> undef, undef
+  %res18 = urem <16 x i32> undef, undef
+  %res19 = urem <16 x i64> undef, undef
+; Expected z13 costs (per the RUN line), in instruction order: scalars 4/4/2/2 for i8/i16/i32/i64; vector costs grow linearly with element count.
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res0 = urem i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res1 = urem i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res2 = urem i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res3 = urem i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res4 = urem <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res5 = urem <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %res6 = urem <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %res7 = urem <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res8 = urem <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res9 = urem <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %res10 = urem <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 10 for instruction:   %res11 = urem <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res12 = urem <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res13 = urem <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 24 for instruction:   %res14 = urem <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 20 for instruction:   %res15 = urem <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 80 for instruction:   %res16 = urem <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 80 for instruction:   %res17 = urem <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 48 for instruction:   %res18 = urem <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 40 for instruction:   %res19 = urem <16 x i64> undef, undef
+; No result is used; the ';' after 'ret void' only opens an empty comment.
+  ret void;
+}
diff --git a/test/Analysis/CostModel/SystemZ/int-cast.ll b/test/Analysis/CostModel/SystemZ/int-cast.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7764c6ff756f71c7df64abd9a9e4babe221efea0
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/int-cast.ll
@@ -0,0 +1,199 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+
+define void @sext() {
+  %v0 = sext i8 undef to i16
+  %v1 = sext i8 undef to i32
+  %v2 = sext i8 undef to i64
+  %v3 = sext i16 undef to i32
+  %v4 = sext i16 undef to i64
+  %v5 = sext i32 undef to i64
+  %v6 = sext <2 x i8> undef to <2 x i16>
+  %v7 = sext <2 x i8> undef to <2 x i32>
+  %v8 = sext <2 x i8> undef to <2 x i64>
+  %v9 = sext <2 x i16> undef to <2 x i32>
+  %v10 = sext <2 x i16> undef to <2 x i64>
+  %v11 = sext <2 x i32> undef to <2 x i64>
+  %v12 = sext <4 x i8> undef to <4 x i16>
+  %v13 = sext <4 x i8> undef to <4 x i32>
+  %v14 = sext <4 x i8> undef to <4 x i64>
+  %v15 = sext <4 x i16> undef to <4 x i32>
+  %v16 = sext <4 x i16> undef to <4 x i64>
+  %v17 = sext <4 x i32> undef to <4 x i64>
+  %v18 = sext <8 x i8> undef to <8 x i16>
+  %v19 = sext <8 x i8> undef to <8 x i32>
+  %v20 = sext <8 x i8> undef to <8 x i64>
+  %v21 = sext <8 x i16> undef to <8 x i32>
+  %v22 = sext <8 x i16> undef to <8 x i64>
+  %v23 = sext <8 x i32> undef to <8 x i64>
+  %v24 = sext <16 x i8> undef to <16 x i16>
+  %v25 = sext <16 x i8> undef to <16 x i32>
+  %v26 = sext <16 x i8> undef to <16 x i64>
+  %v27 = sext <16 x i16> undef to <16 x i32>
+  %v28 = sext <16 x i16> undef to <16 x i64>
+  %v29 = sext <16 x i32> undef to <16 x i64>
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v0 = sext i8 undef to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v1 = sext i8 undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v2 = sext i8 undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v3 = sext i16 undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v4 = sext i16 undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v5 = sext i32 undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v6 = sext <2 x i8> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v7 = sext <2 x i8> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v8 = sext <2 x i8> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v9 = sext <2 x i16> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v10 = sext <2 x i16> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v11 = sext <2 x i32> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v12 = sext <4 x i8> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v13 = sext <4 x i8> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v14 = sext <4 x i8> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v15 = sext <4 x i16> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v16 = sext <4 x i16> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v17 = sext <4 x i32> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v18 = sext <8 x i8> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v19 = sext <8 x i8> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 15 for instruction:   %v20 = sext <8 x i8> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v21 = sext <8 x i16> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 11 for instruction:   %v22 = sext <8 x i16> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v23 = sext <8 x i32> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v24 = sext <16 x i8> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 11 for instruction:   %v25 = sext <16 x i8> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 31 for instruction:   %v26 = sext <16 x i8> undef to <16 x i64>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v27 = sext <16 x i16> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 22 for instruction:   %v28 = sext <16 x i16> undef to <16 x i64>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v29 = sext <16 x i32> undef to <16 x i64>
+
+ ret void
+}
+
+define void @zext() {
+  %v0 = zext i8 undef to i16
+  %v1 = zext i8 undef to i32
+  %v2 = zext i8 undef to i64
+  %v3 = zext i16 undef to i32
+  %v4 = zext i16 undef to i64
+  %v5 = zext i32 undef to i64
+  %v6 = zext <2 x i8> undef to <2 x i16>
+  %v7 = zext <2 x i8> undef to <2 x i32>
+  %v8 = zext <2 x i8> undef to <2 x i64>
+  %v9 = zext <2 x i16> undef to <2 x i32>
+  %v10 = zext <2 x i16> undef to <2 x i64>
+  %v11 = zext <2 x i32> undef to <2 x i64>
+  %v12 = zext <4 x i8> undef to <4 x i16>
+  %v13 = zext <4 x i8> undef to <4 x i32>
+  %v14 = zext <4 x i8> undef to <4 x i64>
+  %v15 = zext <4 x i16> undef to <4 x i32>
+  %v16 = zext <4 x i16> undef to <4 x i64>
+  %v17 = zext <4 x i32> undef to <4 x i64>
+  %v18 = zext <8 x i8> undef to <8 x i16>
+  %v19 = zext <8 x i8> undef to <8 x i32>
+  %v20 = zext <8 x i8> undef to <8 x i64>
+  %v21 = zext <8 x i16> undef to <8 x i32>
+  %v22 = zext <8 x i16> undef to <8 x i64>
+  %v23 = zext <8 x i32> undef to <8 x i64>
+  %v24 = zext <16 x i8> undef to <16 x i16>
+  %v25 = zext <16 x i8> undef to <16 x i32>
+  %v26 = zext <16 x i8> undef to <16 x i64>
+  %v27 = zext <16 x i16> undef to <16 x i32>
+  %v28 = zext <16 x i16> undef to <16 x i64>
+  %v29 = zext <16 x i32> undef to <16 x i64>
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v0 = zext i8 undef to i16
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v1 = zext i8 undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v2 = zext i8 undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v3 = zext i16 undef to i32
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v4 = zext i16 undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v5 = zext i32 undef to i64
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v6 = zext <2 x i8> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v7 = zext <2 x i8> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v8 = zext <2 x i8> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v9 = zext <2 x i16> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v10 = zext <2 x i16> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v11 = zext <2 x i32> undef to <2 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v12 = zext <4 x i8> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v13 = zext <4 x i8> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v14 = zext <4 x i8> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v15 = zext <4 x i16> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v16 = zext <4 x i16> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v17 = zext <4 x i32> undef to <4 x i64>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v18 = zext <8 x i8> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 5 for instruction:   %v19 = zext <8 x i8> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 15 for instruction:   %v20 = zext <8 x i8> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v21 = zext <8 x i16> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 11 for instruction:   %v22 = zext <8 x i16> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v23 = zext <8 x i32> undef to <8 x i64>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v24 = zext <16 x i8> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 11 for instruction:   %v25 = zext <16 x i8> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 31 for instruction:   %v26 = zext <16 x i8> undef to <16 x i64>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v27 = zext <16 x i16> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 22 for instruction:   %v28 = zext <16 x i16> undef to <16 x i64>
+; CHECK: Cost Model: Found an estimated cost of 12 for instruction:   %v29 = zext <16 x i32> undef to <16 x i64>
+
+ ret void
+}
+
+define void @trunc() {
+  %v0 = trunc i16 undef to i8
+  %v1 = trunc i32 undef to i16
+  %v2 = trunc i32 undef to i8
+  %v3 = trunc i64 undef to i32
+  %v4 = trunc i64 undef to i16
+  %v5 = trunc i64 undef to i8
+  %v6 = trunc <2 x i16> undef to <2 x i8>
+  %v7 = trunc <2 x i32> undef to <2 x i16>
+  %v8 = trunc <2 x i32> undef to <2 x i8>
+  %v9 = trunc <2 x i64> undef to <2 x i32>
+  %v10 = trunc <2 x i64> undef to <2 x i16>
+  %v11 = trunc <2 x i64> undef to <2 x i8>
+  %v12 = trunc <4 x i16> undef to <4 x i8>
+  %v13 = trunc <4 x i32> undef to <4 x i16>
+  %v14 = trunc <4 x i32> undef to <4 x i8>
+  %v15 = trunc <4 x i64> undef to <4 x i32>
+  %v16 = trunc <4 x i64> undef to <4 x i16>
+  %v17 = trunc <4 x i64> undef to <4 x i8>
+  %v18 = trunc <8 x i16> undef to <8 x i8>
+  %v19 = trunc <8 x i32> undef to <8 x i16>
+  %v20 = trunc <8 x i32> undef to <8 x i8>
+  %v21 = trunc <8 x i64> undef to <8 x i32>
+  %v22 = trunc <8 x i64> undef to <8 x i16>
+  %v23 = trunc <8 x i64> undef to <8 x i8>
+  %v24 = trunc <16 x i16> undef to <16 x i8>
+  %v25 = trunc <16 x i32> undef to <16 x i16>
+  %v26 = trunc <16 x i32> undef to <16 x i8>
+  %v27 = trunc <16 x i64> undef to <16 x i32>
+  %v28 = trunc <16 x i64> undef to <16 x i16>
+  %v29 = trunc <16 x i64> undef to <16 x i8>
+
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %v0 = trunc i16 undef to i8
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %v1 = trunc i32 undef to i16
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %v2 = trunc i32 undef to i8
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %v3 = trunc i64 undef to i32
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %v4 = trunc i64 undef to i16
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %v5 = trunc i64 undef to i8
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v6 = trunc <2 x i16> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v7 = trunc <2 x i32> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v8 = trunc <2 x i32> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v9 = trunc <2 x i64> undef to <2 x i32>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v10 = trunc <2 x i64> undef to <2 x i16>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v11 = trunc <2 x i64> undef to <2 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v12 = trunc <4 x i16> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v13 = trunc <4 x i32> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v14 = trunc <4 x i32> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v15 = trunc <4 x i64> undef to <4 x i32>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v16 = trunc <4 x i64> undef to <4 x i16>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v17 = trunc <4 x i64> undef to <4 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v18 = trunc <8 x i16> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v19 = trunc <8 x i32> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v20 = trunc <8 x i32> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v21 = trunc <8 x i64> undef to <8 x i32>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v22 = trunc <8 x i64> undef to <8 x i16>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v23 = trunc <8 x i64> undef to <8 x i8>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %v24 = trunc <16 x i16> undef to <16 x i8>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %v25 = trunc <16 x i32> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %v26 = trunc <16 x i32> undef to <16 x i8>
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %v27 = trunc <16 x i64> undef to <16 x i32>
+; CHECK: Cost Model: Found an estimated cost of 6 for instruction:   %v28 = trunc <16 x i64> undef to <16 x i16>
+; CHECK: Cost Model: Found an estimated cost of 7 for instruction:   %v29 = trunc <16 x i64> undef to <16 x i8>
+
+ ret void
+}
diff --git a/test/Analysis/CostModel/SystemZ/intrinsic-cost-crash.ll b/test/Analysis/CostModel/SystemZ/intrinsic-cost-crash.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ff5b2a2053f005b748473609a859369959b7f9ac
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/intrinsic-cost-crash.ll
@@ -0,0 +1,66 @@
+; RUN: opt < %s -O3 -S -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+;
+; Regression test for a crash in getIntrinsicInstrCost().
+; Don't call getScalarizationOverhead(RetTy, true, false) if RetTy is void type.
+
+%"class.llvm::SDNode.310.1762.9990.10474.10958.11442.11926.12410.12894.13378.13862.15314.15798.16282.17734.19186.21122.25962.26930.29350.29834.30318.30802.31286.31770.32254.32738.33706.36610.38062.41642" = type <{ %"class.llvm::FoldingSetImpl::Node.298.1750.9978.10462.10946.11430.11914.12398.12882.13366.13850.15302.15786.16270.17722.19174.21110.25950.26918.29338.29822.30306.30790.31274.31758.32242.32726.33694.36598.38050.41625", %"class.llvm::ilist_node.228.300.1752.9980.10464.10948.11432.11916.12400.12884.13368.13852.15304.15788.16272.17724.19176.21112.25952.26920.29340.29824.30308.30792.31276.31760.32244.32728.33696.36600.38052.41628", i16, %union.anon.230.302.1754.9982.10466.10950.11434.11918.12402.12886.13370.13854.15306.15790.16274.17726.19178.21114.25954.26922.29342.29826.30310.30794.31278.31762.32246.32730.33698.36602.38054.41630, i32, %"class.llvm::SDUse.304.1756.9984.10468.10952.11436.11920.12404.12888.13372.13856.15308.15792.16276.17728.19180.21116.25956.26924.29344.29828.30312.30796.31280.31764.32248.32732.33700.36604.38056.41632"*, %"struct.llvm::EVT.305.1757.9985.10469.10953.11437.11921.12405.12889.13373.13857.15309.15793.16277.17729.19181.21117.25957.26925.29345.29829.30313.30797.31281.31765.32249.32733.33701.36605.38057.41637"*, %"class.llvm::SDUse.304.1756.9984.10468.10952.11436.11920.12404.12888.13372.13856.15308.15792.16276.17728.19180.21116.25956.26924.29344.29828.30312.30796.31280.31764.32248.32732.33700.36604.38056.41632"*, i16, i16, i32, %"class.llvm::DebugLoc.309.1761.9989.10473.10957.11441.11925.12409.12893.13377.13861.15313.15797.16281.17733.19185.21121.25961.26929.29349.29833.30317.30801.31285.31769.32253.32737.33705.36609.38061.41641", i16, [6 x i8] }>
+%"class.llvm::FoldingSetImpl::Node.298.1750.9978.10462.10946.11430.11914.12398.12882.13366.13850.15302.15786.16270.17722.19174.21110.25950.26918.29338.29822.30306.30790.31274.31758.32242.32726.33694.36598.38050.41625" = type { i8* }
+%"class.llvm::ilist_node.228.300.1752.9980.10464.10948.11432.11916.12400.12884.13368.13852.15304.15788.16272.17724.19176.21112.25952.26920.29340.29824.30308.30792.31276.31760.32244.32728.33696.36600.38052.41628" = type { %"class.llvm::ilist_node_impl.229.299.1751.9979.10463.10947.11431.11915.12399.12883.13367.13851.15303.15787.16271.17723.19175.21111.25951.26919.29339.29823.30307.30791.31275.31759.32243.32727.33695.36599.38051.41627" }
+%"class.llvm::ilist_node_impl.229.299.1751.9979.10463.10947.11431.11915.12399.12883.13367.13851.15303.15787.16271.17723.19175.21111.25951.26919.29339.29823.30307.30791.31275.31759.32243.32727.33695.36599.38051.41627" = type { %"class.llvm::ilist_node_base.83.1535.9763.10247.10731.11215.11699.12183.12667.13151.13635.15087.15571.16055.17507.18959.20895.25735.26703.29123.29607.30091.30575.31059.31543.32027.32511.33479.36383.37835.41626" }
+%"class.llvm::ilist_node_base.83.1535.9763.10247.10731.11215.11699.12183.12667.13151.13635.15087.15571.16055.17507.18959.20895.25735.26703.29123.29607.30091.30575.31059.31543.32027.32511.33479.36383.37835.41626" = type { %"class.llvm::ilist_node_base.83.1535.9763.10247.10731.11215.11699.12183.12667.13151.13635.15087.15571.16055.17507.18959.20895.25735.26703.29123.29607.30091.30575.31059.31543.32027.32511.33479.36383.37835.41626"*, %"class.llvm::ilist_node_base.83.1535.9763.10247.10731.11215.11699.12183.12667.13151.13635.15087.15571.16055.17507.18959.20895.25735.26703.29123.29607.30091.30575.31059.31543.32027.32511.33479.36383.37835.41626"* }
+%union.anon.230.302.1754.9982.10466.10950.11434.11918.12402.12886.13370.13854.15306.15790.16274.17726.19178.21114.25954.26922.29342.29826.30310.30794.31278.31762.32246.32730.33698.36602.38054.41630 = type { %"class.llvm::SDNode::LSBaseSDNodeBitfields.301.1753.9981.10465.10949.11433.11917.12401.12885.13369.13853.15305.15789.16273.17725.19177.21113.25953.26921.29341.29825.30309.30793.31277.31761.32245.32729.33697.36601.38053.41629" }
+%"class.llvm::SDNode::LSBaseSDNodeBitfields.301.1753.9981.10465.10949.11433.11917.12401.12885.13369.13853.15305.15789.16273.17725.19177.21113.25953.26921.29341.29825.30309.30793.31277.31761.32245.32729.33697.36601.38053.41629" = type { i16 }
+%"struct.llvm::EVT.305.1757.9985.10469.10953.11437.11921.12405.12889.13373.13857.15309.15793.16277.17729.19181.21117.25957.26925.29345.29829.30313.30797.31281.31765.32249.32733.33701.36605.38057.41637" = type { %"class.llvm::MVT.62.1514.9742.10226.10710.11194.11678.12162.12646.13130.13614.15066.15550.16034.17486.18938.20874.25714.26682.29102.29586.30070.30554.31038.31522.32006.32490.33458.36362.37814.41633", %"class.llvm::Type.77.1529.9757.10241.10725.11209.11693.12177.12661.13145.13629.15081.15565.16049.17501.18953.20889.25729.26697.29117.29601.30085.30569.31053.31537.32021.32505.33473.36377.37829.41636"* }
+%"class.llvm::MVT.62.1514.9742.10226.10710.11194.11678.12162.12646.13130.13614.15066.15550.16034.17486.18938.20874.25714.26682.29102.29586.30070.30554.31038.31522.32006.32490.33458.36362.37814.41633" = type { i8 }
+%"class.llvm::Type.77.1529.9757.10241.10725.11209.11693.12177.12661.13145.13629.15081.15565.16049.17501.18953.20889.25729.26697.29117.29601.30085.30569.31053.31537.32021.32505.33473.36377.37829.41636" = type { %"class.llvm::LLVMContext.76.1528.9756.10240.10724.11208.11692.12176.12660.13144.13628.15080.15564.16048.17500.18952.20888.25728.26696.29116.29600.30084.30568.31052.31536.32020.32504.33472.36376.37828.41635"*, i32, i32, %"class.llvm::Type.77.1529.9757.10241.10725.11209.11693.12177.12661.13145.13629.15081.15565.16049.17501.18953.20889.25729.26697.29117.29601.30085.30569.31053.31537.32021.32505.33473.36377.37829.41636"** }
+%"class.llvm::LLVMContext.76.1528.9756.10240.10724.11208.11692.12176.12660.13144.13628.15080.15564.16048.17500.18952.20888.25728.26696.29116.29600.30084.30568.31052.31536.32020.32504.33472.36376.37828.41635" = type { %"class.llvm::LLVMContextImpl.75.1527.9755.10239.10723.11207.11691.12175.12659.13143.13627.15079.15563.16047.17499.18951.20887.25727.26695.29115.29599.30083.30567.31051.31535.32019.32503.33471.36375.37827.41634"* }
+%"class.llvm::LLVMContextImpl.75.1527.9755.10239.10723.11207.11691.12175.12659.13143.13627.15079.15563.16047.17499.18951.20887.25727.26695.29115.29599.30083.30567.31051.31535.32019.32503.33471.36375.37827.41634" = type opaque
+%"class.llvm::SDUse.304.1756.9984.10468.10952.11436.11920.12404.12888.13372.13856.15308.15792.16276.17728.19180.21116.25956.26924.29344.29828.30312.30796.31280.31764.32248.32732.33700.36604.38056.41632" = type { %"class.llvm::SDValue.303.1755.9983.10467.10951.11435.11919.12403.12887.13371.13855.15307.15791.16275.17727.19179.21115.25955.26923.29343.29827.30311.30795.31279.31763.32247.32731.33699.36603.38055.41631", %"class.llvm::SDNode.310.1762.9990.10474.10958.11442.11926.12410.12894.13378.13862.15314.15798.16282.17734.19186.21122.25962.26930.29350.29834.30318.30802.31286.31770.32254.32738.33706.36610.38062.41642"*, %"class.llvm::SDUse.304.1756.9984.10468.10952.11436.11920.12404.12888.13372.13856.15308.15792.16276.17728.19180.21116.25956.26924.29344.29828.30312.30796.31280.31764.32248.32732.33700.36604.38056.41632"**, %"class.llvm::SDUse.304.1756.9984.10468.10952.11436.11920.12404.12888.13372.13856.15308.15792.16276.17728.19180.21116.25956.26924.29344.29828.30312.30796.31280.31764.32248.32732.33700.36604.38056.41632"* }
+%"class.llvm::SDValue.303.1755.9983.10467.10951.11435.11919.12403.12887.13371.13855.15307.15791.16275.17727.19179.21115.25955.26923.29343.29827.30311.30795.31279.31763.32247.32731.33699.36603.38055.41631" = type <{ %"class.llvm::SDNode.310.1762.9990.10474.10958.11442.11926.12410.12894.13378.13862.15314.15798.16282.17734.19186.21122.25962.26930.29350.29834.30318.30802.31286.31770.32254.32738.33706.36610.38062.41642"*, i32, [4 x i8] }>
+%"class.llvm::DebugLoc.309.1761.9989.10473.10957.11441.11925.12409.12893.13377.13861.15313.15797.16281.17733.19185.21121.25961.26929.29349.29833.30317.30801.31285.31769.32253.32737.33705.36609.38061.41641" = type { %"class.llvm::TypedTrackingMDRef.308.1760.9988.10472.10956.11440.11924.12408.12892.13376.13860.15312.15796.16280.17732.19184.21120.25960.26928.29348.29832.30316.30800.31284.31768.32252.32736.33704.36608.38060.41640" }
+%"class.llvm::TypedTrackingMDRef.308.1760.9988.10472.10956.11440.11924.12408.12892.13376.13860.15312.15796.16280.17732.19184.21120.25960.26928.29348.29832.30316.30800.31284.31768.32252.32736.33704.36608.38060.41640" = type { %"class.llvm::TrackingMDRef.307.1759.9987.10471.10955.11439.11923.12407.12891.13375.13859.15311.15795.16279.17731.19183.21119.25959.26927.29347.29831.30315.30799.31283.31767.32251.32735.33703.36607.38059.41639" }
+%"class.llvm::TrackingMDRef.307.1759.9987.10471.10955.11439.11923.12407.12891.13375.13859.15311.15795.16279.17731.19183.21119.25959.26927.29347.29831.30315.30799.31283.31767.32251.32735.33703.36607.38059.41639" = type { %"class.llvm::Metadata.306.1758.9986.10470.10954.11438.11922.12406.12890.13374.13858.15310.15794.16278.17730.19182.21118.25958.26926.29346.29830.30314.30798.31282.31766.32250.32734.33702.36606.38058.41638"* }
+%"class.llvm::Metadata.306.1758.9986.10470.10954.11438.11922.12406.12890.13374.13858.15310.15794.16278.17730.19182.21118.25958.26926.29346.29830.30314.30798.31282.31766.32250.32734.33702.36606.38058.41638" = type { i8, i8, i16, i32 }
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+
+; Function Attrs: nounwind ssp uwtable
+define hidden void @fun(%"class.llvm::SDNode.310.1762.9990.10474.10958.11442.11926.12410.12894.13378.13862.15314.15798.16282.17734.19186.21122.25962.26930.29350.29834.30318.30802.31286.31770.32254.32738.33706.36610.38062.41642"* %N) #1 align 2 {
+; CHECK: *
+entry:
+  %NumOperands.i = getelementptr inbounds %"class.llvm::SDNode.310.1762.9990.10474.10958.11442.11926.12410.12894.13378.13862.15314.15798.16282.17734.19186.21122.25962.26930.29350.29834.30318.30802.31286.31770.32254.32738.33706.36610.38062.41642", %"class.llvm::SDNode.310.1762.9990.10474.10958.11442.11926.12410.12894.13378.13862.15314.15798.16282.17734.19186.21122.25962.26930.29350.29834.30318.30802.31286.31770.32254.32738.33706.36610.38062.41642"* %N, i64 0, i32 8
+  %0 = load i16, i16* %NumOperands.i, align 8, !tbaa !1
+  br i1 undef, label %for.cond.cleanup, label %for.body.lr.ph
+
+for.body.lr.ph:                                   ; preds = %entry
+  %wide.trip.count192 = zext i16 %0 to i64
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.lr.ph
+  %indvars.iv190 = phi i64 [ 0, %for.body.lr.ph ], [ %indvars.iv.next191, %for.body ]
+  call void @llvm.lifetime.end(i64 16, i8* nonnull null)
+  %indvars.iv.next191 = add nuw nsw i64 %indvars.iv190, 1
+  %exitcond193 = icmp eq i64 %indvars.iv.next191, %wide.trip.count192
+  br i1 %exitcond193, label %for.cond.cleanup, label %for.body
+}
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 5.0.0 (trunk 297799) (llvm/trunk 297808)"}
+!1 = !{!2, !3, i64 56}
+!2 = !{!"_ZTSN4llvm6SDNodeE", !3, i64 24, !4, i64 26, !6, i64 28, !7, i64 32, !7, i64 40, !7, i64 48, !3, i64 56, !3, i64 58, !6, i64 60, !8, i64 64, !3, i64 72}
+!3 = !{!"short", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C++ TBAA"}
+!6 = !{!"int", !4, i64 0}
+!7 = !{!"any pointer", !4, i64 0}
+!8 = !{!"_ZTSN4llvm8DebugLocE", !9, i64 0}
+!9 = !{!"_ZTSN4llvm18TypedTrackingMDRefINS_6MDNodeEEE", !10, i64 0}
+!10 = !{!"_ZTSN4llvm13TrackingMDRefE", !7, i64 0}
diff --git a/test/Analysis/CostModel/SystemZ/lit.local.cfg b/test/Analysis/CostModel/SystemZ/lit.local.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..2f3cf7d3f0432b284957d97d9c63696f6a3e06c6
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'SystemZ' in config.root.targets:
+    config.unsupported = True
diff --git a/test/Analysis/CostModel/SystemZ/load_store.ll b/test/Analysis/CostModel/SystemZ/load_store.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1ac92292c829bab85c87b89658bde67022d8f03a
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/load_store.ll
@@ -0,0 +1,137 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+
+define void @store() {
+  store i8 undef, i8* undef
+  store i16 undef, i16* undef
+  store i32 undef, i32* undef
+  store i64 undef, i64* undef
+  store float undef, float* undef
+  store double undef, double* undef
+  store fp128 undef, fp128* undef
+  store <2 x i8> undef, <2 x i8>* undef
+  store <2 x i16> undef, <2 x i16>* undef
+  store <2 x i32> undef, <2 x i32>* undef
+  store <2 x i64> undef, <2 x i64>* undef
+  store <2 x float> undef, <2 x float>* undef
+  store <2 x double> undef, <2 x double>* undef
+  store <4 x i8> undef, <4 x i8>* undef
+  store <4 x i16> undef, <4 x i16>* undef
+  store <4 x i32> undef, <4 x i32>* undef
+  store <4 x i64> undef, <4 x i64>* undef
+  store <4 x float> undef, <4 x float>* undef
+  store <4 x double> undef, <4 x double>* undef
+  store <8 x i8> undef, <8 x i8>* undef
+  store <8 x i16> undef, <8 x i16>* undef
+  store <8 x i32> undef, <8 x i32>* undef
+  store <8 x i64> undef, <8 x i64>* undef
+  store <8 x float> undef, <8 x float>* undef
+  store <8 x double> undef, <8 x double>* undef
+  store <16 x i8> undef, <16 x i8>* undef
+  store <16 x i16> undef, <16 x i16>* undef
+  store <16 x i32> undef, <16 x i32>* undef
+  store <16 x i64> undef, <16 x i64>* undef
+  store <16 x float> undef, <16 x float>* undef
+  store <16 x double> undef, <16 x double>* undef
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store i8 undef, i8* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store i16 undef, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store i32 undef, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store i64 undef, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store float undef, float* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store double undef, double* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   store fp128 undef, fp128* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <2 x i8> undef, <2 x i8>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <2 x i16> undef, <2 x i16>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <2 x i32> undef, <2 x i32>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <2 x i64> undef, <2 x i64>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <2 x float> undef, <2 x float>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <2 x double> undef, <2 x double>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <4 x i8> undef, <4 x i8>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <4 x i16> undef, <4 x i16>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <4 x i32> undef, <4 x i32>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   store <4 x i64> undef, <4 x i64>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <4 x float> undef, <4 x float>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   store <4 x double> undef, <4 x double>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <8 x i8> undef, <8 x i8>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <8 x i16> undef, <8 x i16>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   store <8 x i32> undef, <8 x i32>* undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   store <8 x i64> undef, <8 x i64>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   store <8 x float> undef, <8 x float>* undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   store <8 x double> undef, <8 x double>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   store <16 x i8> undef, <16 x i8>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   store <16 x i16> undef, <16 x i16>* undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   store <16 x i32> undef, <16 x i32>* undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   store <16 x i64> undef, <16 x i64>* undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   store <16 x float> undef, <16 x float>* undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   store <16 x double> undef, <16 x double>* undef
+
+  ret void;
+}
+
+define void @load() {
+  load i8, i8* undef
+  load i16, i16* undef
+  load i32, i32* undef
+  load i64, i64* undef
+  load float, float* undef
+  load double, double* undef
+  load fp128, fp128* undef
+  load <2 x i8>, <2 x i8>* undef
+  load <2 x i16>, <2 x i16>* undef
+  load <2 x i32>, <2 x i32>* undef
+  load <2 x i64>, <2 x i64>* undef
+  load <2 x float>, <2 x float>* undef
+  load <2 x double>, <2 x double>* undef
+  load <4 x i8>, <4 x i8>* undef
+  load <4 x i16>, <4 x i16>* undef
+  load <4 x i32>, <4 x i32>* undef
+  load <4 x i64>, <4 x i64>* undef
+  load <4 x float>, <4 x float>* undef
+  load <4 x double>, <4 x double>* undef
+  load <8 x i8>, <8 x i8>* undef
+  load <8 x i16>, <8 x i16>* undef
+  load <8 x i32>, <8 x i32>* undef
+  load <8 x i64>, <8 x i64>* undef
+  load <8 x float>, <8 x float>* undef
+  load <8 x double>, <8 x double>* undef
+  load <16 x i8>, <16 x i8>* undef
+  load <16 x i16>, <16 x i16>* undef
+  load <16 x i32>, <16 x i32>* undef
+  load <16 x i64>, <16 x i64>* undef
+  load <16 x float>, <16 x float>* undef
+  load <16 x double>, <16 x double>* undef
+; Expected costs (results auto-numbered %1..%31): fp128 is 2, else max(1, total bits / 128).
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = load i8, i8* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = load i16, i16* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = load float, float* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = load double, double* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %7 = load fp128, fp128* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %8 = load <2 x i8>, <2 x i8>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %9 = load <2 x i16>, <2 x i16>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = load <2 x i32>, <2 x i32>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %11 = load <2 x i64>, <2 x i64>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %12 = load <2 x float>, <2 x float>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %13 = load <2 x double>, <2 x double>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %14 = load <4 x i8>, <4 x i8>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %15 = load <4 x i16>, <4 x i16>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %16 = load <4 x i32>, <4 x i32>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %17 = load <4 x i64>, <4 x i64>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %18 = load <4 x float>, <4 x float>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %19 = load <4 x double>, <4 x double>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %20 = load <8 x i8>, <8 x i8>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %21 = load <8 x i16>, <8 x i16>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %22 = load <8 x i32>, <8 x i32>* undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %23 = load <8 x i64>, <8 x i64>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %24 = load <8 x float>, <8 x float>* undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %25 = load <8 x double>, <8 x double>* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %26 = load <16 x i8>, <16 x i8>* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %27 = load <16 x i16>, <16 x i16>* undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %28 = load <16 x i32>, <16 x i32>* undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %29 = load <16 x i64>, <16 x i64>* undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %30 = load <16 x float>, <16 x float>* undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %31 = load <16 x double>, <16 x double>* undef
+
+  ret void;
+}
diff --git a/test/Analysis/CostModel/SystemZ/logical.ll b/test/Analysis/CostModel/SystemZ/logical.ll
new file mode 100644
index 0000000000000000000000000000000000000000..41984e0a29c4206107a5981b729a2e7135ac508e
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/logical.ll
@@ -0,0 +1,277 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+
+define void @and() {
+  %res0 = and i8 undef, undef
+  %res1 = and i16 undef, undef
+  %res2 = and i32 undef, undef
+  %res3 = and i64 undef, undef
+  %res4 = and <2 x i8> undef, undef
+  %res5 = and <2 x i16> undef, undef
+  %res6 = and <2 x i32> undef, undef
+  %res7 = and <2 x i64> undef, undef
+  %res8 = and <4 x i8> undef, undef
+  %res9 = and <4 x i16> undef, undef
+  %res10 = and <4 x i32> undef, undef
+  %res11 = and <4 x i64> undef, undef
+  %res12 = and <8 x i8> undef, undef
+  %res13 = and <8 x i16> undef, undef
+  %res14 = and <8 x i32> undef, undef
+  %res15 = and <8 x i64> undef, undef
+  %res16 = and <16 x i8> undef, undef
+  %res17 = and <16 x i16> undef, undef
+  %res18 = and <16 x i32> undef, undef
+  %res19 = and <16 x i64> undef, undef
+; Expected z13 costs for 'and': 1 for scalars; vectors cost max(1, total bits / 128).
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = and i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = and i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = and i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = and i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = and <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = and <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = and <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res7 = and <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res8 = and <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res9 = and <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res10 = and <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res11 = and <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res12 = and <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res13 = and <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res14 = and <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res15 = and <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res16 = and <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res17 = and <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res18 = and <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res19 = and <16 x i64> undef, undef
+
+  ret void;
+}
+
+define void @ashr() {
+  %res0 = ashr i8 undef, undef
+  %res1 = ashr i16 undef, undef
+  %res2 = ashr i32 undef, undef
+  %res3 = ashr i64 undef, undef
+  %res4 = ashr <2 x i8> undef, undef
+  %res5 = ashr <2 x i16> undef, undef
+  %res6 = ashr <2 x i32> undef, undef
+  %res7 = ashr <2 x i64> undef, undef
+  %res8 = ashr <4 x i8> undef, undef
+  %res9 = ashr <4 x i16> undef, undef
+  %res10 = ashr <4 x i32> undef, undef
+  %res11 = ashr <4 x i64> undef, undef
+  %res12 = ashr <8 x i8> undef, undef
+  %res13 = ashr <8 x i16> undef, undef
+  %res14 = ashr <8 x i32> undef, undef
+  %res15 = ashr <8 x i64> undef, undef
+  %res16 = ashr <16 x i8> undef, undef
+  %res17 = ashr <16 x i16> undef, undef
+  %res18 = ashr <16 x i32> undef, undef
+  %res19 = ashr <16 x i64> undef, undef
+; Expected z13 costs for 'ashr': 2 for i8/i16 scalars, 1 for i32/i64; vectors max(1, total bits / 128).
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res0 = ashr i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res1 = ashr i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = ashr i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = ashr i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = ashr <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = ashr <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = ashr <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res7 = ashr <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res8 = ashr <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res9 = ashr <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res10 = ashr <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res11 = ashr <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res12 = ashr <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res13 = ashr <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res14 = ashr <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res15 = ashr <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res16 = ashr <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res17 = ashr <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res18 = ashr <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res19 = ashr <16 x i64> undef, undef
+
+  ret void;
+}
+
+define void @lshr() {
+  %res0 = lshr i8 undef, undef
+  %res1 = lshr i16 undef, undef
+  %res2 = lshr i32 undef, undef
+  %res3 = lshr i64 undef, undef
+  %res4 = lshr <2 x i8> undef, undef
+  %res5 = lshr <2 x i16> undef, undef
+  %res6 = lshr <2 x i32> undef, undef
+  %res7 = lshr <2 x i64> undef, undef
+  %res8 = lshr <4 x i8> undef, undef
+  %res9 = lshr <4 x i16> undef, undef
+  %res10 = lshr <4 x i32> undef, undef
+  %res11 = lshr <4 x i64> undef, undef
+  %res12 = lshr <8 x i8> undef, undef
+  %res13 = lshr <8 x i16> undef, undef
+  %res14 = lshr <8 x i32> undef, undef
+  %res15 = lshr <8 x i64> undef, undef
+  %res16 = lshr <16 x i8> undef, undef
+  %res17 = lshr <16 x i16> undef, undef
+  %res18 = lshr <16 x i32> undef, undef
+  %res19 = lshr <16 x i64> undef, undef
+; Expected z13 costs for 'lshr': 2 for i8/i16 scalars, 1 for i32/i64; vectors max(1, total bits / 128).
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res0 = lshr i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res1 = lshr i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = lshr i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = lshr i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = lshr <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = lshr <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = lshr <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res7 = lshr <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res8 = lshr <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res9 = lshr <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res10 = lshr <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res11 = lshr <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res12 = lshr <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res13 = lshr <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res14 = lshr <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res15 = lshr <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res16 = lshr <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res17 = lshr <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res18 = lshr <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res19 = lshr <16 x i64> undef, undef
+
+  ret void;
+}
+
+define void @or() {
+  %res0 = or i8 undef, undef
+  %res1 = or i16 undef, undef
+  %res2 = or i32 undef, undef
+  %res3 = or i64 undef, undef
+  %res4 = or <2 x i8> undef, undef
+  %res5 = or <2 x i16> undef, undef
+  %res6 = or <2 x i32> undef, undef
+  %res7 = or <2 x i64> undef, undef
+  %res8 = or <4 x i8> undef, undef
+  %res9 = or <4 x i16> undef, undef
+  %res10 = or <4 x i32> undef, undef
+  %res11 = or <4 x i64> undef, undef
+  %res12 = or <8 x i8> undef, undef
+  %res13 = or <8 x i16> undef, undef
+  %res14 = or <8 x i32> undef, undef
+  %res15 = or <8 x i64> undef, undef
+  %res16 = or <16 x i8> undef, undef
+  %res17 = or <16 x i16> undef, undef
+  %res18 = or <16 x i32> undef, undef
+  %res19 = or <16 x i64> undef, undef
+; Expected z13 costs for 'or': 1 for scalars; vectors cost max(1, total bits / 128).
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = or i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = or i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = or i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = or i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = or <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = or <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = or <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res7 = or <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res8 = or <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res9 = or <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res10 = or <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res11 = or <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res12 = or <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res13 = or <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res14 = or <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res15 = or <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res16 = or <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res17 = or <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res18 = or <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res19 = or <16 x i64> undef, undef
+
+  ret void;
+}
+
+define void @shl() {
+  %res0 = shl i8 undef, undef
+  %res1 = shl i16 undef, undef
+  %res2 = shl i32 undef, undef
+  %res3 = shl i64 undef, undef
+  %res4 = shl <2 x i8> undef, undef
+  %res5 = shl <2 x i16> undef, undef
+  %res6 = shl <2 x i32> undef, undef
+  %res7 = shl <2 x i64> undef, undef
+  %res8 = shl <4 x i8> undef, undef
+  %res9 = shl <4 x i16> undef, undef
+  %res10 = shl <4 x i32> undef, undef
+  %res11 = shl <4 x i64> undef, undef
+  %res12 = shl <8 x i8> undef, undef
+  %res13 = shl <8 x i16> undef, undef
+  %res14 = shl <8 x i32> undef, undef
+  %res15 = shl <8 x i64> undef, undef
+  %res16 = shl <16 x i8> undef, undef
+  %res17 = shl <16 x i16> undef, undef
+  %res18 = shl <16 x i32> undef, undef
+  %res19 = shl <16 x i64> undef, undef
+; Expected z13 costs for 'shl': 1 for all scalars (incl. i8/i16); vectors max(1, total bits / 128).
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = shl i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = shl i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = shl i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = shl i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = shl <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = shl <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = shl <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res7 = shl <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res8 = shl <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res9 = shl <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res10 = shl <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res11 = shl <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res12 = shl <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res13 = shl <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res14 = shl <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res15 = shl <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res16 = shl <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res17 = shl <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res18 = shl <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res19 = shl <16 x i64> undef, undef
+
+  ret void;
+}
+
+define void @xor() {
+  %res0 = xor i8 undef, undef
+  %res1 = xor i16 undef, undef
+  %res2 = xor i32 undef, undef
+  %res3 = xor i64 undef, undef
+  %res4 = xor <2 x i8> undef, undef
+  %res5 = xor <2 x i16> undef, undef
+  %res6 = xor <2 x i32> undef, undef
+  %res7 = xor <2 x i64> undef, undef
+  %res8 = xor <4 x i8> undef, undef
+  %res9 = xor <4 x i16> undef, undef
+  %res10 = xor <4 x i32> undef, undef
+  %res11 = xor <4 x i64> undef, undef
+  %res12 = xor <8 x i8> undef, undef
+  %res13 = xor <8 x i16> undef, undef
+  %res14 = xor <8 x i32> undef, undef
+  %res15 = xor <8 x i64> undef, undef
+  %res16 = xor <16 x i8> undef, undef
+  %res17 = xor <16 x i16> undef, undef
+  %res18 = xor <16 x i32> undef, undef
+  %res19 = xor <16 x i64> undef, undef
+; Expected z13 costs for 'xor': 1 for scalars; vectors cost max(1, total bits / 128).
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res0 = xor i8 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res1 = xor i16 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res2 = xor i32 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res3 = xor i64 undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res4 = xor <2 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res5 = xor <2 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res6 = xor <2 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res7 = xor <2 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res8 = xor <4 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res9 = xor <4 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res10 = xor <4 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res11 = xor <4 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res12 = xor <8 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res13 = xor <8 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res14 = xor <8 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res15 = xor <8 x i64> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %res16 = xor <16 x i8> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %res17 = xor <16 x i16> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 4 for instruction:   %res18 = xor <16 x i32> undef, undef
+; CHECK: Cost Model: Found an estimated cost of 8 for instruction:   %res19 = xor <16 x i64> undef, undef
+
+  ret void;
+}
diff --git a/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll b/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1b6a50d303f252aca8c0eeba138961ead01130a0
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/memop-folding-int-arith.ll
@@ -0,0 +1,259 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+; Test that a load feeding an operation that can fold one memory operand gets
+; zero cost. When both operands are loaded, only one load can be folded, so the
+; other load keeps its normal cost.
+
+define void @add() {
+  %li32 = load i32, i32* undef
+  add i32 %li32, undef
+; Both operands loaded: the checks expect only the first load to be free (cost 0).
+  %li32_0 = load i32, i32* undef
+  %li32_1 = load i32, i32* undef
+  add i32 %li32_0, %li32_1
+
+  %li64 = load i64, i64* undef
+  add i64 %li64, undef
+
+  %li64_0 = load i64, i64* undef
+  %li64_1 = load i64, i64* undef
+  add i64 %li64_0, %li64_1
+
+  ret void;
+
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = add i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = add i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = add i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = add i64 %li64_0, %li64_1
+}
+
+define void @sub() {
+  %li32 = load i32, i32* undef
+  sub i32 %li32, undef
+; Both operands loaded: the checks expect only the first load to be free (cost 0).
+  %li32_0 = load i32, i32* undef
+  %li32_1 = load i32, i32* undef
+  sub i32 %li32_0, %li32_1
+
+  %li64 = load i64, i64* undef
+  sub i64 %li64, undef
+
+  %li64_0 = load i64, i64* undef
+  %li64_1 = load i64, i64* undef
+  sub i64 %li64_0, %li64_1
+
+  ret void;
+
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = sub i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = sub i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = sub i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = sub i64 %li64_0, %li64_1
+}
+
+define void @mul() {
+  %li32 = load i32, i32* undef
+  mul i32 %li32, undef
+; Both operands loaded: the checks expect only the first load to be free (cost 0).
+  %li32_0 = load i32, i32* undef
+  %li32_1 = load i32, i32* undef
+  mul i32 %li32_0, %li32_1
+
+  %li64 = load i64, i64* undef
+  mul i64 %li64, undef
+
+  %li64_0 = load i64, i64* undef
+  %li64_1 = load i64, i64* undef
+  mul i64 %li64_0, %li64_1
+
+  ret void;
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = mul i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = mul i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = mul i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = mul i64 %li64_0, %li64_1
+}
+
+define void @sdiv() {
+  %li32 = load i32, i32* undef
+  sdiv i32 %li32, undef
+; Both operands loaded: the checks expect only the first load to be free (cost 0).
+  %li32_0 = load i32, i32* undef
+  %li32_1 = load i32, i32* undef
+  sdiv i32 %li32_0, %li32_1
+; NOTE(review): the checks expect i64 sdiv to cost 1 but i32 sdiv to cost 2 - confirm intended.
+  %li64 = load i64, i64* undef
+  sdiv i64 %li64, undef
+
+  %li64_0 = load i64, i64* undef
+  %li64_1 = load i64, i64* undef
+  sdiv i64 %li64_0, %li64_1
+
+  ret void;
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %1 = sdiv i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %2 = sdiv i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = sdiv i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = sdiv i64 %li64_0, %li64_1
+}
+
+define void @udiv() {
+  %li32 = load i32, i32* undef
+  udiv i32 %li32, undef
+; Both operands loaded: the checks expect only the first load to be free (cost 0).
+  %li32_0 = load i32, i32* undef
+  %li32_1 = load i32, i32* undef
+  udiv i32 %li32_0, %li32_1
+
+  %li64 = load i64, i64* undef
+  udiv i64 %li64, undef
+
+  %li64_0 = load i64, i64* undef
+  %li64_1 = load i64, i64* undef
+  udiv i64 %li64_0, %li64_1
+
+  ret void;
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %1 = udiv i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %2 = udiv i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %3 = udiv i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %4 = udiv i64 %li64_0, %li64_1
+}
+
+define void @and() {
+  %li32 = load i32, i32* undef
+  and i32 %li32, undef
+; Both operands loaded: the checks expect only the first load to be free (cost 0).
+  %li32_0 = load i32, i32* undef
+  %li32_1 = load i32, i32* undef
+  and i32 %li32_0, %li32_1
+
+  %li64 = load i64, i64* undef
+  and i64 %li64, undef
+
+  %li64_0 = load i64, i64* undef
+  %li64_1 = load i64, i64* undef
+  and i64 %li64_0, %li64_1
+
+  ret void;
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = and i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = and i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = and i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = and i64 %li64_0, %li64_1
+}
+
+define void @or() {
+  %li32 = load i32, i32* undef
+  or i32 %li32, undef
+; Both operands loaded: the checks expect only the first load to be free (cost 0).
+  %li32_0 = load i32, i32* undef
+  %li32_1 = load i32, i32* undef
+  or i32 %li32_0, %li32_1
+
+  %li64 = load i64, i64* undef
+  or i64 %li64, undef
+
+  %li64_0 = load i64, i64* undef
+  %li64_1 = load i64, i64* undef
+  or i64 %li64_0, %li64_1
+
+  ret void;
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = or i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = or i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = or i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = or i64 %li64_0, %li64_1
+}
+
+define void @xor() {
+  %li32 = load i32, i32* undef
+  xor i32 %li32, undef
+; Both operands loaded: the checks expect only the first load to be free (cost 0).
+  %li32_0 = load i32, i32* undef
+  %li32_1 = load i32, i32* undef
+  xor i32 %li32_0, %li32_1
+
+  %li64 = load i64, i64* undef
+  xor i64 %li64, undef
+
+  %li64_0 = load i64, i64* undef
+  %li64_1 = load i64, i64* undef
+  xor i64 %li64_0, %li64_1
+
+  ret void;
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = xor i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = xor i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = xor i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = xor i64 %li64_0, %li64_1
+}
+
+define void @icmp() {
+  %li32 = load i32, i32* undef
+  icmp eq i32 %li32, undef
+; Both operands loaded: the checks expect only the first load to be free (cost 0).
+  %li32_0 = load i32, i32* undef
+  %li32_1 = load i32, i32* undef
+  icmp eq i32 %li32_0, %li32_1
+
+  %li64 = load i64, i64* undef
+  icmp eq i64 %li64, undef
+
+  %li64_0 = load i64, i64* undef
+  %li64_1 = load i64, i64* undef
+  icmp eq i64 %li64_0, %li64_1
+
+  ret void;
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = icmp eq i32 %li32, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li32_0 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li32_1 = load i32, i32* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = icmp eq i32 %li32_0, %li32_1
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = icmp eq i64 %li64, undef
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %li64_0 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %li64_1 = load i64, i64* undef
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = icmp eq i64 %li64_0, %li64_1
+}
diff --git a/test/Analysis/CostModel/SystemZ/scalar-cmp-cmp-log-sel.ll b/test/Analysis/CostModel/SystemZ/scalar-cmp-cmp-log-sel.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9ba980780dedf59e5fa16dbd3f686d32f9026eda
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/scalar-cmp-cmp-log-sel.ll
@@ -0,0 +1,1624 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+;
+; TODO: add more tests for differing operand types of the two compares.
+
+define i8 @fun0(i8 %val1, i8 %val2, i8 %val3, i8 %val4,
+                i8 %val5, i8 %val6) {
+  %cmp0 = icmp eq i8 %val1, %val2
+  %cmp1 = icmp eq i8 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i8 %val5, i8 %val6
+  ret i8 %sel
+; Costs of icmp eq i8 (x2) -> and -> select i8; the label scopes the matches.
+; CHECK-LABEL: fun0
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i8 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i8 %val5, i8 %val6
+}
+
+define i16 @fun1(i8 %val1, i8 %val2, i8 %val3, i8 %val4,
+                 i16 %val5, i16 %val6) {
+  %cmp0 = icmp eq i8 %val1, %val2
+  %cmp1 = icmp eq i8 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i16 %val5, i16 %val6
+  ret i16 %sel
+; Costs of icmp eq i8 (x2) -> and -> select i16; the label scopes the matches.
+; CHECK-LABEL: fun1
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i8 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i16 %val5, i16 %val6
+}
+
+define i32 @fun2(i8 %val1, i8 %val2, i8 %val3, i8 %val4,
+                 i32 %val5, i32 %val6) {
+  %cmp0 = icmp eq i8 %val1, %val2
+  %cmp1 = icmp eq i8 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i32 %val5, i32 %val6
+  ret i32 %sel
+; Costs of icmp eq i8 (x2) -> and -> select i32; the label scopes the matches.
+; CHECK-LABEL: fun2
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i8 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i32 %val5, i32 %val6
+}
+
+define i64 @fun3(i8 %val1, i8 %val2, i8 %val3, i8 %val4,
+                 i64 %val5, i64 %val6) {
+  %cmp0 = icmp eq i8 %val1, %val2
+  %cmp1 = icmp eq i8 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i64 %val5, i64 %val6
+  ret i64 %sel
+; Costs of icmp eq i8 (x2) -> and -> select i64; the label scopes the matches.
+; CHECK-LABEL: fun3
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i8 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i64 %val5, i64 %val6
+}
+
+define float @fun4(i8 %val1, i8 %val2, i8 %val3, i8 %val4,
+                   float %val5, float %val6) {
+  %cmp0 = icmp eq i8 %val1, %val2
+  %cmp1 = icmp eq i8 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, float %val5, float %val6
+  ret float %sel
+; Costs of icmp eq i8 (x2) -> and -> select float; the label scopes the matches.
+; CHECK-LABEL: fun4
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i8 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, float %val5, float %val6
+}
+
+define double @fun5(i8 %val1, i8 %val2, i8 %val3, i8 %val4,
+                    double %val5, double %val6) {
+  %cmp0 = icmp eq i8 %val1, %val2
+  %cmp1 = icmp eq i8 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, double %val5, double %val6
+  ret double %sel
+; Costs of icmp eq i8 (x2) -> and -> select double; the label scopes the matches.
+; CHECK-LABEL: fun5
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i8 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, double %val5, double %val6
+}
+
+define i8 @fun6(i16 %val1, i16 %val2, i16 %val3, i16 %val4,
+                i8 %val5, i8 %val6) {
+  %cmp0 = icmp eq i16 %val1, %val2
+  %cmp1 = icmp eq i16 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i8 %val5, i8 %val6
+  ret i8 %sel
+; Costs of icmp eq i16 (x2) -> and -> select i8; the label scopes the matches.
+; CHECK-LABEL: fun6
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i16 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i8 %val5, i8 %val6
+}
+
+define i16 @fun7(i16 %val1, i16 %val2, i16 %val3, i16 %val4,
+                 i16 %val5, i16 %val6) {
+  %cmp0 = icmp eq i16 %val1, %val2
+  %cmp1 = icmp eq i16 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i16 %val5, i16 %val6
+  ret i16 %sel
+; Costs of icmp eq i16 (x2) -> and -> select i16; the label scopes the matches.
+; CHECK-LABEL: fun7
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i16 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i16 %val5, i16 %val6
+}
+
+define i32 @fun8(i16 %val1, i16 %val2, i16 %val3, i16 %val4,
+                 i32 %val5, i32 %val6) {
+  %cmp0 = icmp eq i16 %val1, %val2
+  %cmp1 = icmp eq i16 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i32 %val5, i32 %val6
+  ret i32 %sel
+; Costs of icmp eq i16 (x2) -> and -> select i32; the label scopes the matches.
+; CHECK-LABEL: fun8
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i16 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i32 %val5, i32 %val6
+}
+
+define i64 @fun9(i16 %val1, i16 %val2, i16 %val3, i16 %val4,
+                 i64 %val5, i64 %val6) {
+  %cmp0 = icmp eq i16 %val1, %val2
+  %cmp1 = icmp eq i16 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i64 %val5, i64 %val6
+  ret i64 %sel
+; Costs of icmp eq i16 (x2) -> and -> select i64; the label scopes the matches.
+; CHECK-LABEL: fun9
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i16 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i64 %val5, i64 %val6
+}
+
+define float @fun10(i16 %val1, i16 %val2, i16 %val3, i16 %val4,
+                    float %val5, float %val6) {
+  %cmp0 = icmp eq i16 %val1, %val2
+  %cmp1 = icmp eq i16 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, float %val5, float %val6
+  ret float %sel
+; Costs of icmp eq i16 (x2) -> and -> select float; the label scopes the matches.
+; CHECK-LABEL: fun10
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i16 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, float %val5, float %val6
+}
+
+define double @fun11(i16 %val1, i16 %val2, i16 %val3, i16 %val4,
+                     double %val5, double %val6) {
+  %cmp0 = icmp eq i16 %val1, %val2
+  %cmp1 = icmp eq i16 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, double %val5, double %val6
+  ret double %sel
+; Costs of icmp eq i16 (x2) -> and -> select double; the label scopes the matches.
+; CHECK-LABEL: fun11
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i16 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, double %val5, double %val6
+}
+
+define i8 @fun12(i32 %val1, i32 %val2, i32 %val3, i32 %val4,
+                 i8 %val5, i8 %val6) {
+  %cmp0 = icmp eq i32 %val1, %val2
+  %cmp1 = icmp eq i32 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i8 %val5, i8 %val6
+  ret i8 %sel
+; Costs of icmp eq i32 (x2) -> and -> select i8; the label scopes the matches.
+; CHECK-LABEL: fun12
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i32 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i8 %val5, i8 %val6
+}
+
+define i16 @fun13(i32 %val1, i32 %val2, i32 %val3, i32 %val4,
+                  i16 %val5, i16 %val6) {
+  %cmp0 = icmp eq i32 %val1, %val2
+  %cmp1 = icmp eq i32 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i16 %val5, i16 %val6
+  ret i16 %sel
+; Costs of icmp eq i32 (x2) -> and -> select i16; the label scopes the matches.
+; CHECK-LABEL: fun13
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i32 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i16 %val5, i16 %val6
+}
+
+define i32 @fun14(i32 %val1, i32 %val2, i32 %val3, i32 %val4,
+                  i32 %val5, i32 %val6) {
+  %cmp0 = icmp eq i32 %val1, %val2
+  %cmp1 = icmp eq i32 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i32 %val5, i32 %val6
+  ret i32 %sel
+; Costs of icmp eq i32 (x2) -> and -> select i32; the label scopes the matches.
+; CHECK-LABEL: fun14
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i32 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i32 %val5, i32 %val6
+}
+
+define i64 @fun15(i32 %val1, i32 %val2, i32 %val3, i32 %val4,
+                  i64 %val5, i64 %val6) {
+  %cmp0 = icmp eq i32 %val1, %val2
+  %cmp1 = icmp eq i32 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i64 %val5, i64 %val6
+  ret i64 %sel
+; Costs of icmp eq i32 (x2) -> and -> select i64; the label scopes the matches.
+; CHECK-LABEL: fun15
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i32 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i64 %val5, i64 %val6
+}
+
+define float @fun16(i32 %val1, i32 %val2, i32 %val3, i32 %val4,
+                    float %val5, float %val6) {
+  %cmp0 = icmp eq i32 %val1, %val2
+  %cmp1 = icmp eq i32 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, float %val5, float %val6
+  ret float %sel
+; Costs of icmp eq i32 (x2) -> and -> select float; the label scopes the matches.
+; CHECK-LABEL: fun16
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i32 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, float %val5, float %val6
+}
+
+define double @fun17(i32 %val1, i32 %val2, i32 %val3, i32 %val4,
+                     double %val5, double %val6) {
+  %cmp0 = icmp eq i32 %val1, %val2
+  %cmp1 = icmp eq i32 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, double %val5, double %val6
+  ret double %sel
+; Costs of icmp eq i32 (x2) -> and -> select double; the label scopes the matches.
+; CHECK-LABEL: fun17
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i32 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, double %val5, double %val6
+}
+
+define i8 @fun18(i64 %val1, i64 %val2, i64 %val3, i64 %val4,
+                 i8 %val5, i8 %val6) {
+  %cmp0 = icmp eq i64 %val1, %val2
+  %cmp1 = icmp eq i64 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i8 %val5, i8 %val6
+  ret i8 %sel
+; Costs of icmp eq i64 (x2) -> and -> select i8; the label scopes the matches.
+; CHECK-LABEL: fun18
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i64 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i8 %val5, i8 %val6
+}
+
+define i16 @fun19(i64 %val1, i64 %val2, i64 %val3, i64 %val4,
+                  i16 %val5, i16 %val6) {
+  %cmp0 = icmp eq i64 %val1, %val2
+  %cmp1 = icmp eq i64 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i16 %val5, i16 %val6
+  ret i16 %sel
+; Costs of icmp eq i64 (x2) -> and -> select i16; the label scopes the matches.
+; CHECK-LABEL: fun19
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i64 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i16 %val5, i16 %val6
+}
+
+define i32 @fun20(i64 %val1, i64 %val2, i64 %val3, i64 %val4,
+                  i32 %val5, i32 %val6) {
+  %cmp0 = icmp eq i64 %val1, %val2
+  %cmp1 = icmp eq i64 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i32 %val5, i32 %val6
+  ret i32 %sel
+; Costs of icmp eq i64 (x2) -> and -> select i32; the label scopes the matches.
+; CHECK-LABEL: fun20
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i64 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i32 %val5, i32 %val6
+}
+
+define i64 @fun21(i64 %val1, i64 %val2, i64 %val3, i64 %val4,
+                  i64 %val5, i64 %val6) {
+  %cmp0 = icmp eq i64 %val1, %val2
+  %cmp1 = icmp eq i64 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i64 %val5, i64 %val6
+  ret i64 %sel
+; Costs of icmp eq i64 (x2) -> and -> select i64; the label scopes the matches.
+; CHECK-LABEL: fun21
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i64 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i64 %val5, i64 %val6
+}
+
+define float @fun22(i64 %val1, i64 %val2, i64 %val3, i64 %val4,
+                    float %val5, float %val6) {
+  %cmp0 = icmp eq i64 %val1, %val2
+  %cmp1 = icmp eq i64 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, float %val5, float %val6
+  ret float %sel
+; Costs of icmp eq i64 (x2) -> and -> select float; the label scopes the matches.
+; CHECK-LABEL: fun22
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i64 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, float %val5, float %val6
+}
+
+define double @fun23(i64 %val1, i64 %val2, i64 %val3, i64 %val4,
+                     double %val5, double %val6) {
+  %cmp0 = icmp eq i64 %val1, %val2
+  %cmp1 = icmp eq i64 %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, double %val5, double %val6
+  ret double %sel
+; Costs of icmp eq i64 (x2) -> and -> select double; the label scopes the matches.
+; CHECK-LABEL: fun23
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i64 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, double %val5, double %val6
+}
+
+define i8 @fun24(float %val1, float %val2, float %val3, float %val4,
+                 i8 %val5, i8 %val6) {
+  %cmp0 = fcmp ogt float %val1, %val2
+  %cmp1 = fcmp ogt float %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i8 %val5, i8 %val6
+  ret i8 %sel
+; Costs of fcmp ogt float (x2) -> and -> select i8; the label scopes the matches.
+; CHECK-LABEL: fun24
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt float %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i8 %val5, i8 %val6
+}
+
+define i16 @fun25(float %val1, float %val2, float %val3, float %val4,
+                  i16 %val5, i16 %val6) {
+  %cmp0 = fcmp ogt float %val1, %val2
+  %cmp1 = fcmp ogt float %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i16 %val5, i16 %val6
+  ret i16 %sel
+; Costs of fcmp ogt float (x2) -> and -> select i16; the label scopes the matches.
+; CHECK-LABEL: fun25
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt float %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i16 %val5, i16 %val6
+}
+
+define i32 @fun26(float %val1, float %val2, float %val3, float %val4,
+                  i32 %val5, i32 %val6) {
+  %cmp0 = fcmp ogt float %val1, %val2
+  %cmp1 = fcmp ogt float %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i32 %val5, i32 %val6
+  ret i32 %sel
+; Costs of fcmp ogt float (x2) -> and -> select i32; the label scopes the matches.
+; CHECK-LABEL: fun26
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt float %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i32 %val5, i32 %val6
+}
+
+define i64 @fun27(float %val1, float %val2, float %val3, float %val4,
+                  i64 %val5, i64 %val6) {
+  %cmp0 = fcmp ogt float %val1, %val2
+  %cmp1 = fcmp ogt float %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i64 %val5, i64 %val6
+  ret i64 %sel
+; Costs of fcmp ogt float (x2) -> and -> select i64; the label scopes the matches.
+; CHECK-LABEL: fun27
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt float %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i64 %val5, i64 %val6
+}
+
+define float @fun28(float %val1, float %val2, float %val3, float %val4,
+                    float %val5, float %val6) {
+  %cmp0 = fcmp ogt float %val1, %val2
+  %cmp1 = fcmp ogt float %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, float %val5, float %val6
+  ret float %sel
+; Costs of fcmp ogt float (x2) -> and -> select float; the label scopes the matches.
+; CHECK-LABEL: fun28
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt float %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, float %val5, float %val6
+}
+
+define double @fun29(float %val1, float %val2, float %val3, float %val4,
+                     double %val5, double %val6) {
+  %cmp0 = fcmp ogt float %val1, %val2
+  %cmp1 = fcmp ogt float %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, double %val5, double %val6
+  ret double %sel
+; Costs of fcmp ogt float (x2) -> and -> select double; the label scopes the matches.
+; CHECK-LABEL: fun29
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt float %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, double %val5, double %val6
+}
+
+define i8 @fun30(double %val1, double %val2, double %val3, double %val4,
+                 i8 %val5, i8 %val6) {
+  %cmp0 = fcmp ogt double %val1, %val2
+  %cmp1 = fcmp ogt double %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i8 %val5, i8 %val6
+  ret i8 %sel
+; Costs of fcmp ogt double (x2) -> and -> select i8; the label scopes the matches.
+; CHECK-LABEL: fun30
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt double %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i8 %val5, i8 %val6
+}
+
+define i16 @fun31(double %val1, double %val2, double %val3, double %val4,
+                  i16 %val5, i16 %val6) {
+  %cmp0 = fcmp ogt double %val1, %val2
+  %cmp1 = fcmp ogt double %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i16 %val5, i16 %val6
+  ret i16 %sel
+; Costs of fcmp ogt double (x2) -> and -> select i16; the label scopes the matches.
+; CHECK-LABEL: fun31
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt double %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i16 %val5, i16 %val6
+}
+
+define i32 @fun32(double %val1, double %val2, double %val3, double %val4,
+                  i32 %val5, i32 %val6) {
+  %cmp0 = fcmp ogt double %val1, %val2
+  %cmp1 = fcmp ogt double %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i32 %val5, i32 %val6
+  ret i32 %sel
+; Costs of fcmp ogt double (x2) -> and -> select i32; the label scopes the matches.
+; CHECK-LABEL: fun32
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt double %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i32 %val5, i32 %val6
+}
+
+define i64 @fun33(double %val1, double %val2, double %val3, double %val4,
+                  i64 %val5, i64 %val6) {
+  %cmp0 = fcmp ogt double %val1, %val2
+  %cmp1 = fcmp ogt double %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, i64 %val5, i64 %val6
+  ret i64 %sel
+; Costs of fcmp ogt double (x2) -> and -> select i64; the label scopes the matches.
+; CHECK-LABEL: fun33
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt double %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i64 %val5, i64 %val6
+}
+
+define float @fun34(double %val1, double %val2, double %val3, double %val4,
+                    float %val5, float %val6) {
+  %cmp0 = fcmp ogt double %val1, %val2
+  %cmp1 = fcmp ogt double %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, float %val5, float %val6
+  ret float %sel
+; Costs of fcmp ogt double (x2) -> and -> select float; the label scopes the matches.
+; CHECK-LABEL: fun34
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt double %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, float %val5, float %val6
+}
+
+define double @fun35(double %val1, double %val2, double %val3, double %val4,
+                     double %val5, double %val6) {
+  %cmp0 = fcmp ogt double %val1, %val2
+  %cmp1 = fcmp ogt double %val3, %val4
+  %and = and i1 %cmp0, %cmp1
+  %sel = select i1 %and, double %val5, double %val6
+  ret double %sel
+; Costs of fcmp ogt double (x2) -> and -> select double; the label scopes the matches.
+; CHECK-LABEL: fun35
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt double %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = and i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, double %val5, double %val6
+}
+
+define i8 @fun36(i8 %val1, i8 %val2, i8 %val3, i8 %val4,
+                 i8 %val5, i8 %val6) {
+  %cmp0 = icmp eq i8 %val1, %val2
+  %cmp1 = icmp eq i8 %val3, %val4
+  %and = or i1 %cmp0, %cmp1
+  %sel = select i1 %and, i8 %val5, i8 %val6
+  ret i8 %sel
+; Costs of icmp eq i8 (x2) -> or (named %and) -> select i8; label scopes matches.
+; CHECK-LABEL: fun36
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i8 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i8 %val5, i8 %val6
+}
+
+define i16 @fun37(i8 %val1, i8 %val2, i8 %val3, i8 %val4,
+                  i16 %val5, i16 %val6) {
+  %cmp0 = icmp eq i8 %val1, %val2
+  %cmp1 = icmp eq i8 %val3, %val4
+  %and = or i1 %cmp0, %cmp1
+  %sel = select i1 %and, i16 %val5, i16 %val6
+  ret i16 %sel
+; Costs of icmp eq i8 (x2) -> or (named %and) -> select i16; label scopes matches.
+; CHECK-LABEL: fun37
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i8 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i16 %val5, i16 %val6
+}
+
+define i32 @fun38(i8 %val1, i8 %val2, i8 %val3, i8 %val4,
+                  i32 %val5, i32 %val6) {
+  %cmp0 = icmp eq i8 %val1, %val2
+  %cmp1 = icmp eq i8 %val3, %val4
+  %and = or i1 %cmp0, %cmp1
+  %sel = select i1 %and, i32 %val5, i32 %val6
+  ret i32 %sel
+; Costs of icmp eq i8 (x2) -> or (named %and) -> select i32; label scopes matches.
+; CHECK-LABEL: fun38
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i8 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i32 %val5, i32 %val6
+}
+
+define i64 @fun39(i8 %val1, i8 %val2, i8 %val3, i8 %val4,
+                  i64 %val5, i64 %val6) {
+  %cmp0 = icmp eq i8 %val1, %val2
+  %cmp1 = icmp eq i8 %val3, %val4
+  %and = or i1 %cmp0, %cmp1
+  %sel = select i1 %and, i64 %val5, i64 %val6
+  ret i64 %sel
+; Costs of icmp eq i8 (x2) -> or (named %and) -> select i64; label scopes matches.
+; CHECK-LABEL: fun39
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i8 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i64 %val5, i64 %val6
+}
+
+define float @fun40(i8 %val1, i8 %val2, i8 %val3, i8 %val4,
+                    float %val5, float %val6) {
+  %cmp0 = icmp eq i8 %val1, %val2
+  %cmp1 = icmp eq i8 %val3, %val4
+  %and = or i1 %cmp0, %cmp1
+  %sel = select i1 %and, float %val5, float %val6
+  ret float %sel
+; Costs of icmp eq i8 (x2) -> or (named %and) -> select float; label scopes matches.
+; CHECK-LABEL: fun40
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i8 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, float %val5, float %val6
+}
+
+define double @fun41(i8 %val1, i8 %val2, i8 %val3, i8 %val4,
+                     double %val5, double %val6) {
+  %cmp0 = icmp eq i8 %val1, %val2
+  %cmp1 = icmp eq i8 %val3, %val4
+  %and = or i1 %cmp0, %cmp1
+  %sel = select i1 %and, double %val5, double %val6
+  ret double %sel
+; Costs of icmp eq i8 (x2) -> or (named %and) -> select double; label scopes matches.
+; CHECK-LABEL: fun41
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i8 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, double %val5, double %val6
+}
+
+define i8 @fun42(i16 %val1, i16 %val2, i16 %val3, i16 %val4,
+                 i8 %val5, i8 %val6) {
+  %cmp0 = icmp eq i16 %val1, %val2
+  %cmp1 = icmp eq i16 %val3, %val4
+  %and = or i1 %cmp0, %cmp1
+  %sel = select i1 %and, i8 %val5, i8 %val6
+  ret i8 %sel
+; Costs of icmp eq i16 (x2) -> or (named %and) -> select i8; label scopes matches.
+; CHECK-LABEL: fun42
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i16 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i8 %val5, i8 %val6
+}
+
+define i16 @fun43(i16 %val1, i16 %val2, i16 %val3, i16 %val4,
+                  i16 %val5, i16 %val6) {
+  %cmp0 = icmp eq i16 %val1, %val2
+  %cmp1 = icmp eq i16 %val3, %val4
+  %and = or i1 %cmp0, %cmp1
+  %sel = select i1 %and, i16 %val5, i16 %val6
+  ret i16 %sel
+; Costs of icmp eq i16 (x2) -> or (named %and) -> select i16; label scopes matches.
+; CHECK-LABEL: fun43
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i16 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i16 %val5, i16 %val6
+}
+
+define i32 @fun44(i16 %val1, i16 %val2, i16 %val3, i16 %val4,
+                  i32 %val5, i32 %val6) {
+  %cmp0 = icmp eq i16 %val1, %val2
+  %cmp1 = icmp eq i16 %val3, %val4
+  %and = or i1 %cmp0, %cmp1
+  %sel = select i1 %and, i32 %val5, i32 %val6
+  ret i32 %sel
+; Costs of icmp eq i16 (x2) -> or (named %and) -> select i32; label scopes matches.
+; CHECK-LABEL: fun44
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i16 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i32 %val5, i32 %val6
+}
+
+define i64 @fun45(i16 %val1, i16 %val2, i16 %val3, i16 %val4,
+                  i64 %val5, i64 %val6) {
+  %cmp0 = icmp eq i16 %val1, %val2
+  %cmp1 = icmp eq i16 %val3, %val4
+  %and = or i1 %cmp0, %cmp1
+  %sel = select i1 %and, i64 %val5, i64 %val6
+  ret i64 %sel
+; Costs of icmp eq i16 (x2) -> or (named %and) -> select i64; label scopes matches.
+; CHECK-LABEL: fun45
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i16 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i64 %val5, i64 %val6
+}
+
+define float @fun46(i16 %val1, i16 %val2, i16 %val3, i16 %val4,
+                    float %val5, float %val6) {
+  %cmp0 = icmp eq i16 %val1, %val2
+  %cmp1 = icmp eq i16 %val3, %val4
+  %and = or i1 %cmp0, %cmp1
+  %sel = select i1 %and, float %val5, float %val6
+  ret float %sel
+; Costs of icmp eq i16 (x2) -> or (named %and) -> select float; label scopes matches.
+; CHECK-LABEL: fun46
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i16 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, float %val5, float %val6
+}
+
+define double @fun47(i16 %val1, i16 %val2, i16 %val3, i16 %val4,
+                     double %val5, double %val6) {
+  %cmp0 = icmp eq i16 %val1, %val2
+  %cmp1 = icmp eq i16 %val3, %val4
+  %and = or i1 %cmp0, %cmp1
+  %sel = select i1 %and, double %val5, double %val6
+  ret double %sel
+; Costs of icmp eq i16 (x2) -> or (named %and) -> select double; label scopes matches.
+; CHECK-LABEL: fun47
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i16 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, double %val5, double %val6
+}
+
+define i8 @fun48(i32 %val1, i32 %val2, i32 %val3, i32 %val4,
+                 i8 %val5, i8 %val6) {
+  %cmp0 = icmp eq i32 %val1, %val2
+  %cmp1 = icmp eq i32 %val3, %val4
+  %and = or i1 %cmp0, %cmp1
+  %sel = select i1 %and, i8 %val5, i8 %val6
+  ret i8 %sel
+; Costs of icmp eq i32 (x2) -> or (named %and) -> select i8; label scopes matches.
+; CHECK-LABEL: fun48
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i32 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i8 %val5, i8 %val6
+}
+
+define i16 @fun49(i32 %val1, i32 %val2, i32 %val3, i32 %val4,
+                  i16 %val5, i16 %val6) {
+  %cmp0 = icmp eq i32 %val1, %val2
+  %cmp1 = icmp eq i32 %val3, %val4
+  %and = or i1 %cmp0, %cmp1
+  %sel = select i1 %and, i16 %val5, i16 %val6
+  ret i16 %sel
+; Costs of icmp eq i32 (x2) -> or (named %and) -> select i16; label scopes matches.
+; CHECK-LABEL: fun49
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i32 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i16 %val5, i16 %val6
+}
+
+define i32 @fun50(i32 %val1, i32 %val2, i32 %val3, i32 %val4,
+                  i32 %val5, i32 %val6) {
+  %cmp0 = icmp eq i32 %val1, %val2
+  %cmp1 = icmp eq i32 %val3, %val4
+  %and = or i1 %cmp0, %cmp1
+  %sel = select i1 %and, i32 %val5, i32 %val6
+  ret i32 %sel
+; Costs of icmp eq i32 (x2) -> or (named %and) -> select i32; label scopes matches.
+; CHECK-LABEL: fun50
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i32 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i32 %val5, i32 %val6
+}
+
+define i64 @fun51(i32 %val1, i32 %val2, i32 %val3, i32 %val4,
+                  i64 %val5, i64 %val6) {
+  %cmp0 = icmp eq i32 %val1, %val2
+  %cmp1 = icmp eq i32 %val3, %val4
+  %and = or i1 %cmp0, %cmp1
+  %sel = select i1 %and, i64 %val5, i64 %val6
+  ret i64 %sel
+; Costs of icmp eq i32 (x2) -> or (named %and) -> select i64; label scopes matches.
+; CHECK-LABEL: fun51
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i32 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i64 %val5, i64 %val6
+}
+
+define float @fun52(i32 %val1, i32 %val2, i32 %val3, i32 %val4,
+                    float %val5, float %val6) {
+  %cmp0 = icmp eq i32 %val1, %val2
+  %cmp1 = icmp eq i32 %val3, %val4
+  %and = or i1 %cmp0, %cmp1
+  %sel = select i1 %and, float %val5, float %val6
+  ret float %sel
+; Costs of icmp eq i32 (x2) -> or (named %and) -> select float; label scopes matches.
+; CHECK-LABEL: fun52
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i32 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, float %val5, float %val6
+}
+
+define double @fun53(i32 %val1, i32 %val2, i32 %val3, i32 %val4,
+                     double %val5, double %val6) {
+  %cmp0 = icmp eq i32 %val1, %val2
+  %cmp1 = icmp eq i32 %val3, %val4
+  %and = or i1 %cmp0, %cmp1
+  %sel = select i1 %and, double %val5, double %val6
+  ret double %sel
+; Costs of icmp eq i32 (x2) -> or (named %and) -> select double; label scopes matches.
+; CHECK-LABEL: fun53
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i32 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, double %val5, double %val6
+}
+
+define i8 @fun54(i64 %val1, i64 %val2, i64 %val3, i64 %val4,
+                 i8 %val5, i8 %val6) {
+  %cmp0 = icmp eq i64 %val1, %val2
+  %cmp1 = icmp eq i64 %val3, %val4
+  %and = or i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'or'
+  %sel = select i1 %and, i8 %val5, i8 %val6
+  ret i8 %sel
+; Two 'icmp eq i64' results are combined with 'or' to choose between two i8 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun54
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i64 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i8 %val5, i8 %val6
+}
+
+define i16 @fun55(i64 %val1, i64 %val2, i64 %val3, i64 %val4,
+                  i16 %val5, i16 %val6) {
+  %cmp0 = icmp eq i64 %val1, %val2
+  %cmp1 = icmp eq i64 %val3, %val4
+  %and = or i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'or'
+  %sel = select i1 %and, i16 %val5, i16 %val6
+  ret i16 %sel
+; Two 'icmp eq i64' results are combined with 'or' to choose between two i16 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun55
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i64 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i16 %val5, i16 %val6
+}
+
+define i32 @fun56(i64 %val1, i64 %val2, i64 %val3, i64 %val4,
+                  i32 %val5, i32 %val6) {
+  %cmp0 = icmp eq i64 %val1, %val2
+  %cmp1 = icmp eq i64 %val3, %val4
+  %and = or i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'or'
+  %sel = select i1 %and, i32 %val5, i32 %val6
+  ret i32 %sel
+; Two 'icmp eq i64' results are combined with 'or' to choose between two i32 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun56
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i64 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i32 %val5, i32 %val6
+}
+
+define i64 @fun57(i64 %val1, i64 %val2, i64 %val3, i64 %val4,
+                  i64 %val5, i64 %val6) {
+  %cmp0 = icmp eq i64 %val1, %val2
+  %cmp1 = icmp eq i64 %val3, %val4
+  %and = or i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'or'
+  %sel = select i1 %and, i64 %val5, i64 %val6
+  ret i64 %sel
+; Two 'icmp eq i64' results are combined with 'or' to choose between two i64 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun57
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i64 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i64 %val5, i64 %val6
+}
+
+define float @fun58(i64 %val1, i64 %val2, i64 %val3, i64 %val4,
+                    float %val5, float %val6) {
+  %cmp0 = icmp eq i64 %val1, %val2
+  %cmp1 = icmp eq i64 %val3, %val4
+  %and = or i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'or'
+  %sel = select i1 %and, float %val5, float %val6
+  ret float %sel
+; Two 'icmp eq i64' results are combined with 'or' to choose between two float values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun58
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i64 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, float %val5, float %val6
+}
+
+define double @fun59(i64 %val1, i64 %val2, i64 %val3, i64 %val4,
+                     double %val5, double %val6) {
+  %cmp0 = icmp eq i64 %val1, %val2
+  %cmp1 = icmp eq i64 %val3, %val4
+  %and = or i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'or'
+  %sel = select i1 %and, double %val5, double %val6
+  ret double %sel
+; Two 'icmp eq i64' results are combined with 'or' to choose between two double values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun59
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i64 %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, double %val5, double %val6
+}
+
+define i8 @fun60(float %val1, float %val2, float %val3, float %val4,
+                 i8 %val5, i8 %val6) {
+  %cmp0 = fcmp ogt float %val1, %val2
+  %cmp1 = fcmp ogt float %val3, %val4
+  %and = or i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'or'
+  %sel = select i1 %and, i8 %val5, i8 %val6
+  ret i8 %sel
+; Two 'fcmp ogt float' results are combined with 'or' to choose between two i8 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun60
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt float %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i8 %val5, i8 %val6
+}
+
+define i16 @fun61(float %val1, float %val2, float %val3, float %val4,
+                  i16 %val5, i16 %val6) {
+  %cmp0 = fcmp ogt float %val1, %val2
+  %cmp1 = fcmp ogt float %val3, %val4
+  %and = or i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'or'
+  %sel = select i1 %and, i16 %val5, i16 %val6
+  ret i16 %sel
+; Two 'fcmp ogt float' results are combined with 'or' to choose between two i16 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun61
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt float %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i16 %val5, i16 %val6
+}
+
+define i32 @fun62(float %val1, float %val2, float %val3, float %val4,
+                  i32 %val5, i32 %val6) {
+  %cmp0 = fcmp ogt float %val1, %val2
+  %cmp1 = fcmp ogt float %val3, %val4
+  %and = or i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'or'
+  %sel = select i1 %and, i32 %val5, i32 %val6
+  ret i32 %sel
+; Two 'fcmp ogt float' results are combined with 'or' to choose between two i32 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun62
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt float %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i32 %val5, i32 %val6
+}
+
+define i64 @fun63(float %val1, float %val2, float %val3, float %val4,
+                  i64 %val5, i64 %val6) {
+  %cmp0 = fcmp ogt float %val1, %val2
+  %cmp1 = fcmp ogt float %val3, %val4
+  %and = or i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'or'
+  %sel = select i1 %and, i64 %val5, i64 %val6
+  ret i64 %sel
+; Two 'fcmp ogt float' results are combined with 'or' to choose between two i64 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun63
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt float %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i64 %val5, i64 %val6
+}
+
+define float @fun64(float %val1, float %val2, float %val3, float %val4,
+                    float %val5, float %val6) {
+  %cmp0 = fcmp ogt float %val1, %val2
+  %cmp1 = fcmp ogt float %val3, %val4
+  %and = or i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'or'
+  %sel = select i1 %and, float %val5, float %val6
+  ret float %sel
+; Two 'fcmp ogt float' results are combined with 'or' to choose between two float values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun64
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt float %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, float %val5, float %val6
+}
+
+define double @fun65(float %val1, float %val2, float %val3, float %val4,
+                     double %val5, double %val6) {
+  %cmp0 = fcmp ogt float %val1, %val2
+  %cmp1 = fcmp ogt float %val3, %val4
+  %and = or i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'or'
+  %sel = select i1 %and, double %val5, double %val6
+  ret double %sel
+; Two 'fcmp ogt float' results are combined with 'or' to choose between two double values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun65
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt float %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, double %val5, double %val6
+}
+
+define i8 @fun66(double %val1, double %val2, double %val3, double %val4,
+                 i8 %val5, i8 %val6) {
+  %cmp0 = fcmp ogt double %val1, %val2
+  %cmp1 = fcmp ogt double %val3, %val4
+  %and = or i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'or'
+  %sel = select i1 %and, i8 %val5, i8 %val6
+  ret i8 %sel
+; Two 'fcmp ogt double' results are combined with 'or' to choose between two i8 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun66
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt double %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i8 %val5, i8 %val6
+}
+
+define i16 @fun67(double %val1, double %val2, double %val3, double %val4,
+                  i16 %val5, i16 %val6) {
+  %cmp0 = fcmp ogt double %val1, %val2
+  %cmp1 = fcmp ogt double %val3, %val4
+  %and = or i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'or'
+  %sel = select i1 %and, i16 %val5, i16 %val6
+  ret i16 %sel
+; Two 'fcmp ogt double' results are combined with 'or' to choose between two i16 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun67
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt double %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i16 %val5, i16 %val6
+}
+
+define i32 @fun68(double %val1, double %val2, double %val3, double %val4,
+                  i32 %val5, i32 %val6) {
+  %cmp0 = fcmp ogt double %val1, %val2
+  %cmp1 = fcmp ogt double %val3, %val4
+  %and = or i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'or'
+  %sel = select i1 %and, i32 %val5, i32 %val6
+  ret i32 %sel
+; Two 'fcmp ogt double' results are combined with 'or' to choose between two i32 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun68
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt double %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i32 %val5, i32 %val6
+}
+
+define i64 @fun69(double %val1, double %val2, double %val3, double %val4,
+                  i64 %val5, i64 %val6) {
+  %cmp0 = fcmp ogt double %val1, %val2
+  %cmp1 = fcmp ogt double %val3, %val4
+  %and = or i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'or'
+  %sel = select i1 %and, i64 %val5, i64 %val6
+  ret i64 %sel
+; Two 'fcmp ogt double' results are combined with 'or' to choose between two i64 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun69
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt double %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i64 %val5, i64 %val6
+}
+
+define float @fun70(double %val1, double %val2, double %val3, double %val4,
+                    float %val5, float %val6) {
+  %cmp0 = fcmp ogt double %val1, %val2
+  %cmp1 = fcmp ogt double %val3, %val4
+  %and = or i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'or'
+  %sel = select i1 %and, float %val5, float %val6
+  ret float %sel
+; Two 'fcmp ogt double' results are combined with 'or' to choose between two float values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun70
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt double %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, float %val5, float %val6
+}
+
+define double @fun71(double %val1, double %val2, double %val3, double %val4,
+                     double %val5, double %val6) {
+  %cmp0 = fcmp ogt double %val1, %val2
+  %cmp1 = fcmp ogt double %val3, %val4
+  %and = or i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'or'
+  %sel = select i1 %and, double %val5, double %val6
+  ret double %sel
+; Two 'fcmp ogt double' results are combined with 'or' to choose between two double values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun71
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt double %val3, %val4
+; CHECK: cost of 1 for instruction:   %and = or i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, double %val5, double %val6
+}
+
+define i8 @fun72(i8 %val1, i8 %val2, i8 %val3, i8 %val4,
+                 i8 %val5, i8 %val6) {
+  %cmp0 = icmp eq i8 %val1, %val2
+  %cmp1 = icmp eq i8 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i8 %val5, i8 %val6
+  ret i8 %sel
+; Two 'icmp eq i8' results are combined with 'xor' to choose between two i8 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun72
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i8 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i8 %val5, i8 %val6
+}
+
+define i16 @fun73(i8 %val1, i8 %val2, i8 %val3, i8 %val4,
+                  i16 %val5, i16 %val6) {
+  %cmp0 = icmp eq i8 %val1, %val2
+  %cmp1 = icmp eq i8 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i16 %val5, i16 %val6
+  ret i16 %sel
+; Two 'icmp eq i8' results are combined with 'xor' to choose between two i16 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun73
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i8 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i16 %val5, i16 %val6
+}
+
+define i32 @fun74(i8 %val1, i8 %val2, i8 %val3, i8 %val4,
+                  i32 %val5, i32 %val6) {
+  %cmp0 = icmp eq i8 %val1, %val2
+  %cmp1 = icmp eq i8 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i32 %val5, i32 %val6
+  ret i32 %sel
+; Two 'icmp eq i8' results are combined with 'xor' to choose between two i32 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun74
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i8 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i32 %val5, i32 %val6
+}
+
+define i64 @fun75(i8 %val1, i8 %val2, i8 %val3, i8 %val4,
+                  i64 %val5, i64 %val6) {
+  %cmp0 = icmp eq i8 %val1, %val2
+  %cmp1 = icmp eq i8 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i64 %val5, i64 %val6
+  ret i64 %sel
+; Two 'icmp eq i8' results are combined with 'xor' to choose between two i64 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun75
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i8 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i64 %val5, i64 %val6
+}
+
+define float @fun76(i8 %val1, i8 %val2, i8 %val3, i8 %val4,
+                    float %val5, float %val6) {
+  %cmp0 = icmp eq i8 %val1, %val2
+  %cmp1 = icmp eq i8 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, float %val5, float %val6
+  ret float %sel
+; Two 'icmp eq i8' results are combined with 'xor' to choose between two float values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun76
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i8 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, float %val5, float %val6
+}
+
+define double @fun77(i8 %val1, i8 %val2, i8 %val3, i8 %val4,
+                     double %val5, double %val6) {
+  %cmp0 = icmp eq i8 %val1, %val2
+  %cmp1 = icmp eq i8 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, double %val5, double %val6
+  ret double %sel
+; Two 'icmp eq i8' results are combined with 'xor' to choose between two double values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun77
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i8 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i8 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, double %val5, double %val6
+}
+
+define i8 @fun78(i16 %val1, i16 %val2, i16 %val3, i16 %val4,
+                 i8 %val5, i8 %val6) {
+  %cmp0 = icmp eq i16 %val1, %val2
+  %cmp1 = icmp eq i16 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i8 %val5, i8 %val6
+  ret i8 %sel
+; Two 'icmp eq i16' results are combined with 'xor' to choose between two i8 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun78
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i16 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i8 %val5, i8 %val6
+}
+
+define i16 @fun79(i16 %val1, i16 %val2, i16 %val3, i16 %val4,
+                  i16 %val5, i16 %val6) {
+  %cmp0 = icmp eq i16 %val1, %val2
+  %cmp1 = icmp eq i16 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i16 %val5, i16 %val6
+  ret i16 %sel
+; Two 'icmp eq i16' results are combined with 'xor' to choose between two i16 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun79
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i16 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i16 %val5, i16 %val6
+}
+
+define i32 @fun80(i16 %val1, i16 %val2, i16 %val3, i16 %val4,
+                  i32 %val5, i32 %val6) {
+  %cmp0 = icmp eq i16 %val1, %val2
+  %cmp1 = icmp eq i16 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i32 %val5, i32 %val6
+  ret i32 %sel
+; Two 'icmp eq i16' results are combined with 'xor' to choose between two i32 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun80
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i16 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i32 %val5, i32 %val6
+}
+
+define i64 @fun81(i16 %val1, i16 %val2, i16 %val3, i16 %val4,
+                  i64 %val5, i64 %val6) {
+  %cmp0 = icmp eq i16 %val1, %val2
+  %cmp1 = icmp eq i16 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i64 %val5, i64 %val6
+  ret i64 %sel
+; Two 'icmp eq i16' results are combined with 'xor' to choose between two i64 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun81
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i16 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i64 %val5, i64 %val6
+}
+
+define float @fun82(i16 %val1, i16 %val2, i16 %val3, i16 %val4,
+                    float %val5, float %val6) {
+  %cmp0 = icmp eq i16 %val1, %val2
+  %cmp1 = icmp eq i16 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, float %val5, float %val6
+  ret float %sel
+; Two 'icmp eq i16' results are combined with 'xor' to choose between two float values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun82
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i16 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, float %val5, float %val6
+}
+
+define double @fun83(i16 %val1, i16 %val2, i16 %val3, i16 %val4,
+                     double %val5, double %val6) {
+  %cmp0 = icmp eq i16 %val1, %val2
+  %cmp1 = icmp eq i16 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, double %val5, double %val6
+  ret double %sel
+; Two 'icmp eq i16' results are combined with 'xor' to choose between two double values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun83
+; CHECK: cost of 3 for instruction:   %cmp0 = icmp eq i16 %val1, %val2
+; CHECK: cost of 3 for instruction:   %cmp1 = icmp eq i16 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, double %val5, double %val6
+}
+
+define i8 @fun84(i32 %val1, i32 %val2, i32 %val3, i32 %val4,
+                 i8 %val5, i8 %val6) {
+  %cmp0 = icmp eq i32 %val1, %val2
+  %cmp1 = icmp eq i32 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i8 %val5, i8 %val6
+  ret i8 %sel
+; Two 'icmp eq i32' results are combined with 'xor' to choose between two i8 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun84
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i32 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i8 %val5, i8 %val6
+}
+
+define i16 @fun85(i32 %val1, i32 %val2, i32 %val3, i32 %val4,
+                  i16 %val5, i16 %val6) {
+  %cmp0 = icmp eq i32 %val1, %val2
+  %cmp1 = icmp eq i32 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i16 %val5, i16 %val6
+  ret i16 %sel
+; Two 'icmp eq i32' results are combined with 'xor' to choose between two i16 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun85
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i32 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i16 %val5, i16 %val6
+}
+
+define i32 @fun86(i32 %val1, i32 %val2, i32 %val3, i32 %val4,
+                  i32 %val5, i32 %val6) {
+  %cmp0 = icmp eq i32 %val1, %val2
+  %cmp1 = icmp eq i32 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i32 %val5, i32 %val6
+  ret i32 %sel
+; Two 'icmp eq i32' results are combined with 'xor' to choose between two i32 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun86
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i32 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i32 %val5, i32 %val6
+}
+
+define i64 @fun87(i32 %val1, i32 %val2, i32 %val3, i32 %val4,
+                  i64 %val5, i64 %val6) {
+  %cmp0 = icmp eq i32 %val1, %val2
+  %cmp1 = icmp eq i32 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i64 %val5, i64 %val6
+  ret i64 %sel
+; Two 'icmp eq i32' results are combined with 'xor' to choose between two i64 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun87
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i32 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i64 %val5, i64 %val6
+}
+
+define float @fun88(i32 %val1, i32 %val2, i32 %val3, i32 %val4,
+                    float %val5, float %val6) {
+  %cmp0 = icmp eq i32 %val1, %val2
+  %cmp1 = icmp eq i32 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, float %val5, float %val6
+  ret float %sel
+; Two 'icmp eq i32' results are combined with 'xor' to choose between two float values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun88
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i32 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, float %val5, float %val6
+}
+
+define double @fun89(i32 %val1, i32 %val2, i32 %val3, i32 %val4,
+                     double %val5, double %val6) {
+  %cmp0 = icmp eq i32 %val1, %val2
+  %cmp1 = icmp eq i32 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, double %val5, double %val6
+  ret double %sel
+; Two 'icmp eq i32' results are combined with 'xor' to choose between two double values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun89
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i32 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i32 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, double %val5, double %val6
+}
+
+define i8 @fun90(i64 %val1, i64 %val2, i64 %val3, i64 %val4,
+                 i8 %val5, i8 %val6) {
+  %cmp0 = icmp eq i64 %val1, %val2
+  %cmp1 = icmp eq i64 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i8 %val5, i8 %val6
+  ret i8 %sel
+; Two 'icmp eq i64' results are combined with 'xor' to choose between two i8 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun90
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i64 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i8 %val5, i8 %val6
+}
+
+define i16 @fun91(i64 %val1, i64 %val2, i64 %val3, i64 %val4,
+                  i16 %val5, i16 %val6) {
+  %cmp0 = icmp eq i64 %val1, %val2
+  %cmp1 = icmp eq i64 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i16 %val5, i16 %val6
+  ret i16 %sel
+; Two 'icmp eq i64' results are combined with 'xor' to choose between two i16 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun91
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i64 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i16 %val5, i16 %val6
+}
+
+define i32 @fun92(i64 %val1, i64 %val2, i64 %val3, i64 %val4,
+                  i32 %val5, i32 %val6) {
+  %cmp0 = icmp eq i64 %val1, %val2
+  %cmp1 = icmp eq i64 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i32 %val5, i32 %val6
+  ret i32 %sel
+; Two 'icmp eq i64' results are combined with 'xor' to choose between two i32 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun92
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i64 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i32 %val5, i32 %val6
+}
+
+define i64 @fun93(i64 %val1, i64 %val2, i64 %val3, i64 %val4,
+                  i64 %val5, i64 %val6) {
+  %cmp0 = icmp eq i64 %val1, %val2
+  %cmp1 = icmp eq i64 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i64 %val5, i64 %val6
+  ret i64 %sel
+; Two 'icmp eq i64' results are combined with 'xor' to choose between two i64 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun93
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i64 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i64 %val5, i64 %val6
+}
+
+define float @fun94(i64 %val1, i64 %val2, i64 %val3, i64 %val4,
+                    float %val5, float %val6) {
+  %cmp0 = icmp eq i64 %val1, %val2
+  %cmp1 = icmp eq i64 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, float %val5, float %val6
+  ret float %sel
+; Two 'icmp eq i64' results are combined with 'xor' to choose between two float values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun94
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i64 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, float %val5, float %val6
+}
+
+define double @fun95(i64 %val1, i64 %val2, i64 %val3, i64 %val4,
+                     double %val5, double %val6) {
+  %cmp0 = icmp eq i64 %val1, %val2
+  %cmp1 = icmp eq i64 %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, double %val5, double %val6
+  ret double %sel
+; Two 'icmp eq i64' results are combined with 'xor' to choose between two double values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun95
+; CHECK: cost of 1 for instruction:   %cmp0 = icmp eq i64 %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = icmp eq i64 %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, double %val5, double %val6
+}
+
+define i8 @fun96(float %val1, float %val2, float %val3, float %val4,
+                 i8 %val5, i8 %val6) {
+  %cmp0 = fcmp ogt float %val1, %val2
+  %cmp1 = fcmp ogt float %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i8 %val5, i8 %val6
+  ret i8 %sel
+; Two 'fcmp ogt float' results are combined with 'xor' to choose between two i8 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun96
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt float %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i8 %val5, i8 %val6
+}
+
+define i16 @fun97(float %val1, float %val2, float %val3, float %val4,
+                  i16 %val5, i16 %val6) {
+  %cmp0 = fcmp ogt float %val1, %val2
+  %cmp1 = fcmp ogt float %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i16 %val5, i16 %val6
+  ret i16 %sel
+; Two 'fcmp ogt float' results are combined with 'xor' to choose between two i16 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun97
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt float %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i16 %val5, i16 %val6
+}
+
+define i32 @fun98(float %val1, float %val2, float %val3, float %val4,
+                  i32 %val5, i32 %val6) {
+  %cmp0 = fcmp ogt float %val1, %val2
+  %cmp1 = fcmp ogt float %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i32 %val5, i32 %val6
+  ret i32 %sel
+; Two 'fcmp ogt float' results are combined with 'xor' to choose between two i32 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun98
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt float %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i32 %val5, i32 %val6
+}
+
+define i64 @fun99(float %val1, float %val2, float %val3, float %val4,
+                  i64 %val5, i64 %val6) {
+  %cmp0 = fcmp ogt float %val1, %val2
+  %cmp1 = fcmp ogt float %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i64 %val5, i64 %val6
+  ret i64 %sel
+; Two 'fcmp ogt float' results are combined with 'xor' to choose between two i64 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun99
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt float %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i64 %val5, i64 %val6
+}
+
+define float @fun100(float %val1, float %val2, float %val3, float %val4,
+                     float %val5, float %val6) {
+  %cmp0 = fcmp ogt float %val1, %val2
+  %cmp1 = fcmp ogt float %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, float %val5, float %val6
+  ret float %sel
+; Two 'fcmp ogt float' results are combined with 'xor' to choose between two float values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun100
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt float %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, float %val5, float %val6
+}
+
+define double @fun101(float %val1, float %val2, float %val3, float %val4,
+                      double %val5, double %val6) {
+  %cmp0 = fcmp ogt float %val1, %val2
+  %cmp1 = fcmp ogt float %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, double %val5, double %val6
+  ret double %sel
+; Two 'fcmp ogt float' results are combined with 'xor' to choose between two double values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun101
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt float %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt float %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, double %val5, double %val6
+}
+
+define i8 @fun102(double %val1, double %val2, double %val3, double %val4,
+                  i8 %val5, i8 %val6) {
+  %cmp0 = fcmp ogt double %val1, %val2
+  %cmp1 = fcmp ogt double %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i8 %val5, i8 %val6
+  ret i8 %sel
+; Two 'fcmp ogt double' results are combined with 'xor' to choose between two i8 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun102
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt double %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i8 %val5, i8 %val6
+}
+
+define i16 @fun103(double %val1, double %val2, double %val3, double %val4,
+                   i16 %val5, i16 %val6) {
+  %cmp0 = fcmp ogt double %val1, %val2
+  %cmp1 = fcmp ogt double %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i16 %val5, i16 %val6
+  ret i16 %sel
+; Two 'fcmp ogt double' results are combined with 'xor' to choose between two i16 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun103
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt double %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i16 %val5, i16 %val6
+}
+
+define i32 @fun104(double %val1, double %val2, double %val3, double %val4,
+                   i32 %val5, i32 %val6) {
+  %cmp0 = fcmp ogt double %val1, %val2
+  %cmp1 = fcmp ogt double %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i32 %val5, i32 %val6
+  ret i32 %sel
+; Two 'fcmp ogt double' results are combined with 'xor' to choose between two i32 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun104
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt double %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i32 %val5, i32 %val6
+}
+
+define i64 @fun105(double %val1, double %val2, double %val3, double %val4,
+                   i64 %val5, i64 %val6) {
+  %cmp0 = fcmp ogt double %val1, %val2
+  %cmp1 = fcmp ogt double %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, i64 %val5, i64 %val6
+  ret i64 %sel
+; Two 'fcmp ogt double' results are combined with 'xor' to choose between two i64 values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun105
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt double %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 1 for instruction:   %sel = select i1 %and, i64 %val5, i64 %val6
+}
+
+define float @fun106(double %val1, double %val2, double %val3, double %val4,
+                     float %val5, float %val6) {
+  %cmp0 = fcmp ogt double %val1, %val2
+  %cmp1 = fcmp ogt double %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, float %val5, float %val6
+  ret float %sel
+; Two 'fcmp ogt double' results are combined with 'xor' to choose between two float values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun106
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt double %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, float %val5, float %val6
+}
+
+define double @fun107(double %val1, double %val2, double %val3, double %val4,
+                      double %val5, double %val6) {
+  %cmp0 = fcmp ogt double %val1, %val2
+  %cmp1 = fcmp ogt double %val3, %val4
+  %and = xor i1 %cmp0, %cmp1  ; NOTE: named %and, but the operation is 'xor'
+  %sel = select i1 %and, double %val5, double %val6
+  ret double %sel
+; Two 'fcmp ogt double' results are combined with 'xor' to choose between two double values; each FileCheck line below pins the cost expected for one instruction.
+; CHECK: fun107
+; CHECK: cost of 1 for instruction:   %cmp0 = fcmp ogt double %val1, %val2
+; CHECK: cost of 1 for instruction:   %cmp1 = fcmp ogt double %val3, %val4
+; CHECK: cost of 7 for instruction:   %and = xor i1 %cmp0, %cmp1
+; CHECK: cost of 4 for instruction:   %sel = select i1 %and, double %val5, double %val6
+}
+
diff --git a/test/Analysis/CostModel/SystemZ/shuffle.ll b/test/Analysis/CostModel/SystemZ/shuffle.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e40dc1f09ba0cbe2deec86213349cc5fbd5bb951
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/shuffle.ll
@@ -0,0 +1,112 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+
+; CHECK: shuffle
+define void @shuffle() {
+
+  ;; Reverse shuffles
+  shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+
+  shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+
+  shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+
+  shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+
+  shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+  shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+
+  ;; Alternate shuffles
+  shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+  shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
+
+  shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+  shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+
+  shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+  shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+
+  shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 0, i32 3>
+  shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 2, i32 1>
+
+  shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 0, i32 3>
+  shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 2, i32 1>
+
+  ;; Broadcast shuffles
+  shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
+  shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> zeroinitializer
+
+  shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
+  shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
+
+  shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
+  shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
+
+  shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
+  shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
+
+  shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
+  shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
+
+  ;; Random shuffles
+  shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 4, i32 17, i32 2, i32 19, i32 0, i32 21, i32 8, i32 23, i32 6, i32 10, i32 10, i32 27, i32 29, i32 29, i32 14, i32 31>
+  shufflevector <18 x i8> undef, <18 x i8> undef, <18 x i32> <i32 4, i32 17, i32 2, i32 19, i32 0, i32 21, i32 8, i32 23, i32 6, i32 10, i32 10, i32 27, i32 29, i32 29, i32 14, i32 31, i32 0, i32 1>
+
+  shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 9, i32 9, i32 2, i32 2, i32 4, i32 13, i32 15, i32 15>
+  shufflevector <12 x i16> undef, <12 x i16> undef, <12 x i32> <i32 9, i32 9, i32 2, i32 2, i32 4, i32 13, i32 15, i32 15, i32 9, i32 2, i32 2, i32 4>
+
+  shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 4, i32 7>
+  shufflevector <6 x i32> undef, <6 x i32> undef, <6 x i32> <i32 0, i32 0, i32 4, i32 7, i32 4, i32 7>
+
+  shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 2>
+  shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 2>
+
+  shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 2, i32 1>
+  shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 2>
+
+  ret void
+
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %1 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %3 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %5 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %7 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %8 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %9 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 1, i32 0>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %11 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %12 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %13 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 0, i32 9, i32 2, i32 11, i32 4, i32 13, i32 6, i32 15>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %14 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %15 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %16 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 4, i32 1, i32 6, i32 3>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %17 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 0, i32 3>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %18 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 2, i32 1>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %19 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 0, i32 3>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %20 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 2, i32 1>
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %21 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %22 = shufflevector <32 x i8> undef, <32 x i8> undef, <32 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %23 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %24 = shufflevector <16 x i16> undef, <16 x i16> undef, <16 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %25 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %26 = shufflevector <8 x i32> undef, <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %27 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %28 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %29 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %30 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> zeroinitializer
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %31 = shufflevector <16 x i8> undef, <16 x i8> undef, <16 x i32> <i32 4, i32 17, i32 2, i32 19, i32 0, i32 21, i32 8, i32 23, i32 6, i32 10, i32 10, i32 27, i32 29, i32 29, i32 14, i32 31>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %32 = shufflevector <18 x i8> undef, <18 x i8> undef, <18 x i32> <i32 4, i32 17, i32 2, i32 19, i32 0, i32 21, i32 8, i32 23, i32 6, i32 10, i32 10, i32 27, i32 29, i32 29, i32 14, i32 31, i32 0, i32 1>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %33 = shufflevector <8 x i16> undef, <8 x i16> undef, <8 x i32> <i32 9, i32 9, i32 2, i32 2, i32 4, i32 13, i32 15, i32 15>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %34 = shufflevector <12 x i16> undef, <12 x i16> undef, <12 x i32> <i32 9, i32 9, i32 2, i32 2, i32 4, i32 13, i32 15, i32 15, i32 9, i32 2, i32 2, i32 4>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %35 = shufflevector <4 x i32> undef, <4 x i32> undef, <4 x i32> <i32 0, i32 0, i32 4, i32 7>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %36 = shufflevector <6 x i32> undef, <6 x i32> undef, <6 x i32> <i32 0, i32 0, i32 4, i32 7, i32 4, i32 7>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %37 = shufflevector <2 x i64> undef, <2 x i64> undef, <2 x i32> <i32 1, i32 2>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %38 = shufflevector <4 x i64> undef, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 0, i32 2>
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %39 = shufflevector <2 x double> undef, <2 x double> undef, <2 x i32> <i32 2, i32 1>
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %40 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 2, i32 1, i32 0, i32 2>
+}
diff --git a/test/Analysis/CostModel/SystemZ/vectorinstrs.ll b/test/Analysis/CostModel/SystemZ/vectorinstrs.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b557076512673afd2177ba752fc38dc09cefd4da
--- /dev/null
+++ b/test/Analysis/CostModel/SystemZ/vectorinstrs.ll
@@ -0,0 +1,56 @@
+; RUN: opt < %s -cost-model -analyze -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s
+
+; CHECK: vecinstrs
+define void @vecinstrs() {
+
+  ;; Extracting element 0 of an integer vector is penalized somewhat with a cost of 2; index 1, and double elements at either index, cost 1.
+  extractelement <16 x i8> undef, i32 0
+  extractelement <16 x i8> undef, i32 1
+
+  extractelement <8 x i16> undef, i32 0
+  extractelement <8 x i16> undef, i32 1
+
+  extractelement <4 x i32> undef, i32 0
+  extractelement <4 x i32> undef, i32 1
+
+  extractelement <2 x i64> undef, i32 0
+  extractelement <2 x i64> undef, i32 1
+
+  extractelement <2 x double> undef, i32 0
+  extractelement <2 x double> undef, i32 1
+
+  ; Extraction of i1 means extract + test under mask before branch.
+  extractelement <2 x i1> undef, i32 0
+  extractelement <4 x i1> undef, i32 1
+  extractelement <8 x i1> undef, i32 2
+
+  ;; Insert element
+  insertelement <16 x i8> undef, i8 undef, i32 0
+  insertelement <8 x i16> undef, i16 undef, i32 0
+  insertelement <4 x i32> undef, i32 undef, i32 0
+
+  ; vlvgp puts two GPRs into a vector register at once, so only one of the two i64 inserts is charged (index 0: cost 1, index 1: cost 0).
+  insertelement <2 x i64> undef, i64 undef, i32 0
+  insertelement <2 x i64> undef, i64 undef, i32 1
+
+  ret void
+
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %1 = extractelement <16 x i8> undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %2 = extractelement <16 x i8> undef, i32 1
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %3 = extractelement <8 x i16> undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %4 = extractelement <8 x i16> undef, i32 1
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %5 = extractelement <4 x i32> undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %6 = extractelement <4 x i32> undef, i32 1
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %7 = extractelement <2 x i64> undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %8 = extractelement <2 x i64> undef, i32 1
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %9 = extractelement <2 x double> undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %10 = extractelement <2 x double> undef, i32 1
+; CHECK: Cost Model: Found an estimated cost of 3 for instruction:   %11 = extractelement <2 x i1> undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %12 = extractelement <4 x i1> undef, i32 1
+; CHECK: Cost Model: Found an estimated cost of 2 for instruction:   %13 = extractelement <8 x i1> undef, i32 2
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %14 = insertelement <16 x i8> undef, i8 undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %15 = insertelement <8 x i16> undef, i16 undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %16 = insertelement <4 x i32> undef, i32 undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 1 for instruction:   %17 = insertelement <2 x i64> undef, i64 undef, i32 0
+; CHECK: Cost Model: Found an estimated cost of 0 for instruction:   %18 = insertelement <2 x i64> undef, i64 undef, i32 1
+}
diff --git a/test/Analysis/CostModel/X86/arith-fp.ll b/test/Analysis/CostModel/X86/arith-fp.ll
index 689442f67a130280d23e8e44a5b2e20f0d411d15..e5043010c11f1504e324eb0913971f1cddf589ca 100644
--- a/test/Analysis/CostModel/X86/arith-fp.ll
+++ b/test/Analysis/CostModel/X86/arith-fp.ll
@@ -456,20 +456,20 @@ define i32 @fma(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %F32 = call float @llvm.fma.f32
   ; AVX512: cost of 1 {{.*}} %F32 = call float @llvm.fma.f32
   %F32 = call float @llvm.fma.f32(float undef, float undef, float undef)
-  ; SSE2: cost of 52 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
-  ; SSE42: cost of 52 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
+  ; SSE2: cost of 43 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
+  ; SSE42: cost of 43 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
   ; AVX: cost of 1 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
   ; AVX2: cost of 1 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
   ; AVX512: cost of 1 {{.*}} %V4F32 = call <4 x float> @llvm.fma.v4f32
   %V4F32 = call <4 x float> @llvm.fma.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef)
-  ; SSE2: cost of 104 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
-  ; SSE42: cost of 104 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
+  ; SSE2: cost of 86 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
+  ; SSE42: cost of 86 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
   ; AVX: cost of 1 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
   ; AVX2: cost of 1 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
   ; AVX512: cost of 1 {{.*}} %V8F32 = call <8 x float> @llvm.fma.v8f32
   %V8F32 = call <8 x float> @llvm.fma.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef)
-  ; SSE2: cost of 208 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
-  ; SSE42: cost of 208 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
+  ; SSE2: cost of 172 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
+  ; SSE42: cost of 172 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
   ; AVX: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
   ; AVX2: cost of 4 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
   ; AVX512: cost of 1 {{.*}} %V16F32 = call <16 x float> @llvm.fma.v16f32
@@ -481,20 +481,20 @@ define i32 @fma(i32 %arg) {
   ; AVX2: cost of 1 {{.*}} %F64 = call double @llvm.fma.f64
   ; AVX512: cost of 1 {{.*}} %F64 = call double @llvm.fma.f64
   %F64 = call double @llvm.fma.f64(double undef, double undef, double undef)
-  ; SSE2: cost of 24 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
-  ; SSE42: cost of 24 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
+  ; SSE2: cost of 21 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
+  ; SSE42: cost of 21 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
   ; AVX: cost of 1 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
   ; AVX2: cost of 1 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
   ; AVX512: cost of 1 {{.*}} %V2F64 = call <2 x double> @llvm.fma.v2f64
   %V2F64 = call <2 x double> @llvm.fma.v2f64(<2 x double> undef, <2 x double> undef, <2 x double> undef)
-  ; SSE2: cost of 48 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
-  ; SSE42: cost of 48 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
+  ; SSE2: cost of 42 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
+  ; SSE42: cost of 42 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
   ; AVX: cost of 1 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
   ; AVX2: cost of 1 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
   ; AVX512: cost of 1 {{.*}} %V4F64 = call <4 x double> @llvm.fma.v4f64
   %V4F64 = call <4 x double> @llvm.fma.v4f64(<4 x double> undef, <4 x double> undef, <4 x double> undef)
-  ; SSE2: cost of 96 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
-  ; SSE42: cost of 96 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
+  ; SSE2: cost of 84 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
+  ; SSE42: cost of 84 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
   ; AVX: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
   ; AVX2: cost of 4 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
   ; AVX512: cost of 1 {{.*}} %V8F64 = call <8 x double> @llvm.fma.v8f64
diff --git a/test/Analysis/CostModel/X86/bitreverse.ll b/test/Analysis/CostModel/X86/bitreverse.ll
index c9eea20c3404daeddbcb65ebd4730ab5706b9f29..2eb63babdc343263460fb48407ef05a54a74669f 100644
--- a/test/Analysis/CostModel/X86/bitreverse.ll
+++ b/test/Analysis/CostModel/X86/bitreverse.ll
@@ -1,7 +1,11 @@
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=pentium4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE2
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=SSE42
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX
-; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -mcpu=pentium4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=X86 -check-prefix=SSE2
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=X86 -check-prefix=SSE42
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=X86 -check-prefix=AVX
+; RUN: opt < %s -mtriple=i686-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=X86 -check-prefix=AVX2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=pentium4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=X64 -check-prefix=SSE2
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=X64 -check-prefix=SSE42
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=X64 -check-prefix=AVX
+; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=X64 -check-prefix=AVX2
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX
 ; RUN: opt < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver4 -cost-model -analyze | FileCheck %s -check-prefix=CHECK -check-prefix=XOP -check-prefix=XOPAVX2
 
@@ -14,10 +18,8 @@ declare  i8 @llvm.bitreverse.i8(i8)
 
 define i64 @var_bitreverse_i64(i64 %a) {
 ; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_i64':
-; SSE2: Found an estimated cost of 1 for instruction:   %bitreverse
-; SSE42: Found an estimated cost of 1 for instruction:   %bitreverse
-; AVX: Found an estimated cost of 1 for instruction:   %bitreverse
-; AVX2: Found an estimated cost of 1 for instruction:   %bitreverse
+; X86: Found an estimated cost of 28 for instruction:   %bitreverse
+; X64: Found an estimated cost of 14 for instruction:   %bitreverse
 ; XOP: Found an estimated cost of 3 for instruction:   %bitreverse
   %bitreverse = call i64 @llvm.bitreverse.i64(i64 %a)
   ret i64 %bitreverse
@@ -25,10 +27,8 @@ define i64 @var_bitreverse_i64(i64 %a) {
 
 define i32 @var_bitreverse_i32(i32 %a) {
 ; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_i32':
-; SSE2: Found an estimated cost of 1 for instruction:   %bitreverse
-; SSE42: Found an estimated cost of 1 for instruction:   %bitreverse
-; AVX: Found an estimated cost of 1 for instruction:   %bitreverse
-; AVX2: Found an estimated cost of 1 for instruction:   %bitreverse
+; X86: Found an estimated cost of 14 for instruction:   %bitreverse
+; X64: Found an estimated cost of 14 for instruction:   %bitreverse
 ; XOP: Found an estimated cost of 3 for instruction:   %bitreverse
   %bitreverse = call i32 @llvm.bitreverse.i32(i32 %a)
   ret i32 %bitreverse
@@ -36,10 +36,8 @@ define i32 @var_bitreverse_i32(i32 %a) {
 
 define i16 @var_bitreverse_i16(i16 %a) {
 ; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_i16':
-; SSE2: Found an estimated cost of 1 for instruction:   %bitreverse
-; SSE42: Found an estimated cost of 1 for instruction:   %bitreverse
-; AVX: Found an estimated cost of 1 for instruction:   %bitreverse
-; AVX2: Found an estimated cost of 1 for instruction:   %bitreverse
+; X86: Found an estimated cost of 14 for instruction:   %bitreverse
+; X64: Found an estimated cost of 14 for instruction:   %bitreverse
 ; XOP: Found an estimated cost of 3 for instruction:   %bitreverse
   %bitreverse = call i16 @llvm.bitreverse.i16(i16 %a)
   ret i16 %bitreverse
@@ -47,10 +45,8 @@ define i16 @var_bitreverse_i16(i16 %a) {
 
 define i8 @var_bitreverse_i8(i8 %a) {
 ; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_i8':
-; SSE2: Found an estimated cost of 1 for instruction:   %bitreverse
-; SSE42: Found an estimated cost of 1 for instruction:   %bitreverse
-; AVX: Found an estimated cost of 1 for instruction:   %bitreverse
-; AVX2: Found an estimated cost of 1 for instruction:   %bitreverse
+; X86: Found an estimated cost of 11 for instruction:   %bitreverse
+; X64: Found an estimated cost of 11 for instruction:   %bitreverse
 ; XOP: Found an estimated cost of 3 for instruction:   %bitreverse
   %bitreverse = call i8 @llvm.bitreverse.i8(i8 %a)
   ret i8 %bitreverse
@@ -70,7 +66,7 @@ declare <32 x i8> @llvm.bitreverse.v32i8(<32 x i8>)
 
 define <2 x i64> @var_bitreverse_v2i64(<2 x i64> %a) {
 ; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_v2i64':
-; SSE2: Found an estimated cost of 6 for instruction:   %bitreverse
+; SSE2: Found an estimated cost of 29 for instruction:   %bitreverse
 ; SSE42: Found an estimated cost of 5 for instruction:   %bitreverse
 ; AVX: Found an estimated cost of 5 for instruction:   %bitreverse
 ; AVX2: Found an estimated cost of 5 for instruction:   %bitreverse
@@ -81,7 +77,7 @@ define <2 x i64> @var_bitreverse_v2i64(<2 x i64> %a) {
 
 define <4 x i64> @var_bitreverse_v4i64(<4 x i64> %a) {
 ; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_v4i64':
-; SSE2: Found an estimated cost of 12 for instruction:   %bitreverse
+; SSE2: Found an estimated cost of 58 for instruction:   %bitreverse
 ; SSE42: Found an estimated cost of 10 for instruction:   %bitreverse
 ; AVX: Found an estimated cost of 10 for instruction:   %bitreverse
 ; AVX2: Found an estimated cost of 5 for instruction:   %bitreverse
@@ -92,7 +88,7 @@ define <4 x i64> @var_bitreverse_v4i64(<4 x i64> %a) {
 
 define <4 x i32> @var_bitreverse_v4i32(<4 x i32> %a) {
 ; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_v4i32':
-; SSE2: Found an estimated cost of 12 for instruction:   %bitreverse
+; SSE2: Found an estimated cost of 27 for instruction:   %bitreverse
 ; SSE42: Found an estimated cost of 5 for instruction:   %bitreverse
 ; AVX: Found an estimated cost of 5 for instruction:   %bitreverse
 ; AVX2: Found an estimated cost of 5 for instruction:   %bitreverse
@@ -103,7 +99,7 @@ define <4 x i32> @var_bitreverse_v4i32(<4 x i32> %a) {
 
 define <8 x i32> @var_bitreverse_v8i32(<8 x i32> %a) {
 ; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_v8i32':
-; SSE2: Found an estimated cost of 24 for instruction:   %bitreverse
+; SSE2: Found an estimated cost of 54 for instruction:   %bitreverse
 ; SSE42: Found an estimated cost of 10 for instruction:   %bitreverse
 ; AVX: Found an estimated cost of 10 for instruction:   %bitreverse
 ; AVX2: Found an estimated cost of 5 for instruction:   %bitreverse
@@ -114,7 +110,7 @@ define <8 x i32> @var_bitreverse_v8i32(<8 x i32> %a) {
 
 define <8 x i16> @var_bitreverse_v8i16(<8 x i16> %a) {
 ; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_v8i16':
-; SSE2: Found an estimated cost of 24 for instruction:   %bitreverse
+; SSE2: Found an estimated cost of 27 for instruction:   %bitreverse
 ; SSE42: Found an estimated cost of 5 for instruction:   %bitreverse
 ; AVX: Found an estimated cost of 5 for instruction:   %bitreverse
 ; AVX2: Found an estimated cost of 5 for instruction:   %bitreverse
@@ -125,7 +121,7 @@ define <8 x i16> @var_bitreverse_v8i16(<8 x i16> %a) {
 
 define <16 x i16> @var_bitreverse_v16i16(<16 x i16> %a) {
 ; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_v16i16':
-; SSE2: Found an estimated cost of 48 for instruction:   %bitreverse
+; SSE2: Found an estimated cost of 54 for instruction:   %bitreverse
 ; SSE42: Found an estimated cost of 10 for instruction:   %bitreverse
 ; AVX: Found an estimated cost of 10 for instruction:   %bitreverse
 ; AVX2: Found an estimated cost of 5 for instruction:   %bitreverse
@@ -136,7 +132,7 @@ define <16 x i16> @var_bitreverse_v16i16(<16 x i16> %a) {
 
 define <16 x i8> @var_bitreverse_v16i8(<16 x i8> %a) {
 ; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_v16i8':
-; SSE2: Found an estimated cost of 48 for instruction:   %bitreverse
+; SSE2: Found an estimated cost of 20 for instruction:   %bitreverse
 ; SSE42: Found an estimated cost of 5 for instruction:   %bitreverse
 ; AVX: Found an estimated cost of 5 for instruction:   %bitreverse
 ; AVX2: Found an estimated cost of 5 for instruction:   %bitreverse
@@ -147,7 +143,7 @@ define <16 x i8> @var_bitreverse_v16i8(<16 x i8> %a) {
 
 define <32 x i8> @var_bitreverse_v32i8(<32 x i8> %a) {
 ; CHECK: 'Cost Model Analysis' for function 'var_bitreverse_v32i8':
-; SSE2: Found an estimated cost of 96 for instruction:   %bitreverse
+; SSE2: Found an estimated cost of 40 for instruction:   %bitreverse
 ; SSE42: Found an estimated cost of 10 for instruction:   %bitreverse
 ; AVX: Found an estimated cost of 10 for instruction:   %bitreverse
 ; AVX2: Found an estimated cost of 5 for instruction:   %bitreverse
diff --git a/test/Analysis/CostModel/X86/shuffle-single-src.ll b/test/Analysis/CostModel/X86/shuffle-single-src.ll
index a953ec17d80f9adf151049c5584d9bc940295164..e43e1afcdf5911539bb2a8d7c18d06dd380cf10f 100644
--- a/test/Analysis/CostModel/X86/shuffle-single-src.ll
+++ b/test/Analysis/CostModel/X86/shuffle-single-src.ll
@@ -1,30 +1,61 @@
-; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mcpu=skylake-avx512 | FileCheck %s --check-prefix=SKX
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+sse2 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE2
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+ssse3 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSSE3
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+sse4.2 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE -check-prefix=SSE42
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+avx | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX1
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX -check-prefix=AVX2
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: opt < %s -cost-model -analyze -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512 --check-prefix=AVX512BW
 
 ;
 ; Verify the cost model for 1 src shuffles
 ;
 
-; SKX-LABEL: 'test_vXf64'
+; CHECK-LABEL: 'test_vXf64'
 define void @test_vXf64(<4 x double> %src256, <8 x double> %src512, <16 x double> %src1024) {
-  ; SKX: cost of 1 {{.*}} %V256 = shufflevector
+  ; SSE2: cost of 4 {{.*}} %V256 = shufflevector
+  ; SSSE3: cost of 4 {{.*}} %V256 = shufflevector
+  ; SSE42: cost of 4 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 6 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 6 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
   %V256 = shufflevector <4 x double> %src256, <4 x double> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
 
-  ; SKX: cost of 1 {{.*}} %V512 = shufflevector
+  ; SSE2: cost of 24 {{.*}} %V512 = shufflevector
+  ; SSSE3: cost of 24 {{.*}} %V512 = shufflevector
+  ; SSE42: cost of 24 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 12 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 12 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
   %V512 = shufflevector <8 x double> %src512, <8 x double> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
 
-  ; SKX: cost of 2 {{.*}} %V1024 = shufflevector
+  ; SSE2: cost of 112 {{.*}} %V1024 = shufflevector
+  ; SSSE3: cost of 112 {{.*}} %V1024 = shufflevector
+  ; SSE42: cost of 112 {{.*}} %V1024 = shufflevector
+  ; AVX1: cost of 72 {{.*}} %V1024 = shufflevector
+  ; AVX2: cost of 72 {{.*}} %V1024 = shufflevector
+  ; AVX512: cost of 2 {{.*}} %V1024 = shufflevector
   %V1024 = shufflevector <16 x double> %src1024, <16 x double> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 
   ret void
 }
 
-; SKX-LABEL: 'test_vXi64'
+; CHECK-LABEL: 'test_vXi64'
 define void @test_vXi64(<4 x i64> %src256, <8 x i64> %src512) {
 
-  ; SKX: cost of 1 {{.*}} %V256 = shufflevector
+  ; SSE2: cost of 8 {{.*}} %V256 = shufflevector
+  ; SSSE3: cost of 8 {{.*}} %V256 = shufflevector
+  ; SSE42: cost of 8 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 8 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
   %V256 = shufflevector <4 x i64> %src256, <4 x i64> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
 
-  ; SKX: cost of 1 {{.*}} %V512 = shufflevector
+  ; SSE2: cost of 48 {{.*}} %V512 = shufflevector
+  ; SSSE3: cost of 48 {{.*}} %V512 = shufflevector
+  ; SSE42: cost of 48 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 16 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 16 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
   %V512 = shufflevector <8 x i64> %src512, <8 x i64> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
 
   ret void
@@ -33,13 +64,28 @@ define void @test_vXi64(<4 x i64> %src256, <8 x i64> %src512) {
 ; CHECK-LABEL: 'test_vXf32'
 define void @test_vXf32(<4 x float> %src128, <8 x float> %src256, <16 x float> %src512) {
 
-  ; SKX: cost of 1 {{.*}} %V128 = shufflevector
+  ; SSE2: cost of 6 {{.*}} %V128 = shufflevector
+  ; SSSE3: cost of 6 {{.*}} %V128 = shufflevector
+  ; SSE42: cost of 6 {{.*}} %V128 = shufflevector
+  ; AVX1: cost of 6 {{.*}} %V128 = shufflevector
+  ; AVX2: cost of 6 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
   %V128 = shufflevector <4 x float> %src128, <4 x float> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
 
-  ; SKX: cost of 1 {{.*}} %V256 = shufflevector
+  ; SSE2: cost of 12 {{.*}} %V256 = shufflevector
+  ; SSSE3: cost of 12 {{.*}} %V256 = shufflevector
+  ; SSE42: cost of 12 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 14 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 14 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
   %V256 = shufflevector <8 x float> %src256, <8 x float> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
 
-  ; SKX: cost of 1 {{.*}} %V512 = shufflevector
+  ; SSE2: cost of 72 {{.*}} %V512 = shufflevector
+  ; SSSE3: cost of 72 {{.*}} %V512 = shufflevector
+  ; SSE42: cost of 72 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 28 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 28 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
   %V512 = shufflevector <16 x float> %src512, <16 x float> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 
   ret void
@@ -48,16 +94,36 @@ define void @test_vXf32(<4 x float> %src128, <8 x float> %src256, <16 x float> %
 ; CHECK-LABEL: 'test_vXi32'
 define void @test_vXi32(<4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512, <32 x i32> %src1024) {
 
-  ; SKX: cost of 1 {{.*}} %V128 = shufflevector
+  ; SSE2: cost of 1 {{.*}} %V128 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX1: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
   %V128 = shufflevector <4 x i32> %src128, <4 x i32> undef, <4 x i32> <i32 3, i32 3, i32 1, i32 0>
 
-  ; SKX: cost of 1 {{.*}} %V256 = shufflevector
+  ; SSE2: cost of 16 {{.*}} %V256 = shufflevector
+  ; SSSE3: cost of 16 {{.*}} %V256 = shufflevector
+  ; SSE42: cost of 16 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 16 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V256 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V256 = shufflevector
   %V256 = shufflevector <8 x i32> %src256, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 5, i32 3, i32 2, i32 1, i32 0>
 
-  ; SKX: cost of 1 {{.*}} %V512 = shufflevector
+  ; SSE2: cost of 96 {{.*}} %V512 = shufflevector
+  ; SSSE3: cost of 96 {{.*}} %V512 = shufflevector
+  ; SSE42: cost of 96 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 32 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 32 {{.*}} %V512 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V512 = shufflevector
   %V512 = shufflevector <16 x i32> %src512, <16 x i32> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 13, i32 10, i32 9, i32 8, i32 8, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 
-  ; SKX: cost of 2 {{.*}} %V1024 = shufflevector
+  ; SSE2: cost of 448 {{.*}} %V1024 = shufflevector
+  ; SSSE3: cost of 448 {{.*}} %V1024 = shufflevector
+  ; SSE42: cost of 448 {{.*}} %V1024 = shufflevector
+  ; AVX1: cost of 192 {{.*}} %V1024 = shufflevector
+  ; AVX2: cost of 192 {{.*}} %V1024 = shufflevector
+  ; AVX512: cost of 2 {{.*}} %V1024 = shufflevector
   %V1024 = shufflevector <32 x i32> %src1024, <32 x i32> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret void
 }
@@ -65,29 +131,70 @@ define void @test_vXi32(<4 x i32> %src128, <8 x i32> %src256, <16 x i32> %src512
 ; CHECK-LABEL: 'test_vXi16'
 define void @test_vXi16(<8 x i16> %src128, <16 x i16> %src256, <32 x i16> %src512, <64 x i16> %src1024) {
 
-  ; SKX: cost of 1 {{.*}} %V128 = shufflevector
+  ; SSE2: cost of 16 {{.*}} %V128 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX1: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512F: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512BW: cost of 1 {{.*}} %V128 = shufflevector
   %V128 = shufflevector <8 x i16> %src128, <8 x i16> undef, <8 x i32> <i32 7, i32 6, i32 6, i32 4, i32 3, i32 2, i32 1, i32 0>
 
-  ; SKX: cost of 1 {{.*}} %V256 = shufflevector
+  ; SSE2: cost of 32 {{.*}} %V256 = shufflevector
+  ; SSSE3: cost of 32 {{.*}} %V256 = shufflevector
+  ; SSE42: cost of 32 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 32 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 4 {{.*}} %V256 = shufflevector
+  ; AVX512F: cost of 4 {{.*}} %V256 = shufflevector
+  ; AVX512BW: cost of 1 {{.*}} %V256 = shufflevector
   %V256 = shufflevector <16 x i16> %src256, <16 x i16> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 13, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 
-  ; SKX: cost of 1 {{.*}} %V512 = shufflevector
+  ; SSE2: cost of 192 {{.*}} %V512 = shufflevector
+  ; SSSE3: cost of 192 {{.*}} %V512 = shufflevector
+  ; SSE42: cost of 192 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 64 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 64 {{.*}} %V512 = shufflevector
+  ; AVX512F: cost of 64 {{.*}} %V512 = shufflevector
+  ; AVX512BW: cost of 1 {{.*}} %V512 = shufflevector
   %V512 = shufflevector <32 x i16> %src512, <32 x i16> undef, <32 x i32> <i32 31, i32 30, i32 20, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 11, i32 9, i32 8, i32 7, i32 11, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 
-  ; SKX: cost of 2 {{.*}} %V1024 = shufflevector
+  ; SSE2: cost of 896 {{.*}} %V1024 = shufflevector
+  ; SSSE3: cost of 896 {{.*}} %V1024 = shufflevector
+  ; SSE42: cost of 896 {{.*}} %V1024 = shufflevector
+  ; AVX1: cost of 384 {{.*}} %V1024 = shufflevector
+  ; AVX2: cost of 384 {{.*}} %V1024 = shufflevector
+  ; AVX512F: cost of 384 {{.*}} %V1024 = shufflevector
+  ; AVX512BW: cost of 2 {{.*}} %V1024 = shufflevector
   %V1024 = shufflevector <64 x i16> %src1024, <64 x i16> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   ret void
 }
 
 ; CHECK-LABEL: 'test_vXi8'
 define void @test_vXi8(<16 x i8> %src128, <32 x i8> %src256, <64 x i8> %src512) {
-  ; SKX: cost of 1 {{.*}} %V128 = shufflevector
+  ; SSE2: cost of 32 {{.*}} %V128 = shufflevector
+  ; SSSE3: cost of 1 {{.*}} %V128 = shufflevector
+  ; SSE42: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX1: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX2: cost of 1 {{.*}} %V128 = shufflevector
+  ; AVX512: cost of 1 {{.*}} %V128 = shufflevector
   %V128 = shufflevector <16 x i8> %src128, <16 x i8> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 11, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 
-  ; SKX: cost of 3 {{.*}} %V256 = shufflevector
+  ; SSE2: cost of 64 {{.*}} %V256 = shufflevector
+  ; SSSE3: cost of 64 {{.*}} %V256 = shufflevector
+  ; SSE42: cost of 64 {{.*}} %V256 = shufflevector
+  ; AVX1: cost of 64 {{.*}} %V256 = shufflevector
+  ; AVX2: cost of 4 {{.*}} %V256 = shufflevector
+  ; AVX512F: cost of 4 {{.*}} %V256 = shufflevector
+  ; AVX512BW: cost of 3 {{.*}} %V256 = shufflevector
   %V256 = shufflevector <32 x i8> %src256, <32 x i8> undef, <32 x i32> <i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 22, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 8, i32 8, i32 7, i32 6, i32 8, i32 4, i32 3, i32 2, i32 1, i32 0>
 
-  ; SKX: cost of 8 {{.*}} %V512 = shufflevector
+  ; SSE2: cost of 384 {{.*}} %V512 = shufflevector
+  ; SSSE3: cost of 384 {{.*}} %V512 = shufflevector
+  ; SSE42: cost of 384 {{.*}} %V512 = shufflevector
+  ; AVX1: cost of 128 {{.*}} %V512 = shufflevector
+  ; AVX2: cost of 128 {{.*}} %V512 = shufflevector
+  ; AVX512F: cost of 128 {{.*}} %V512 = shufflevector
+  ; AVX512BW: cost of 8 {{.*}} %V512 = shufflevector
   %V512 = shufflevector <64 x i8> %src512, <64 x i8> undef, <64 x i32> <i32 63, i32 62, i32 61, i32 60, i32 59, i32 58, i32 57, i32 56, i32 55, i32 54, i32 53, i32 52, i32 51, i32 50, i32 49, i32 48, i32 47, i32 46, i32 45, i32 44, i32 43, i32 42, i32 41, i32 40, i32 39, i32 38, i32 37, i32 36, i32 35, i32 34, i32 33, i32 32, i32 31, i32 30, i32 29, i32 28, i32 27, i32 26, i32 25, i32 24, i32 23, i32 20, i32 21, i32 20, i32 19, i32 18, i32 17, i32 16, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 
   ret void
diff --git a/test/Analysis/Delinearization/a.ll b/test/Analysis/Delinearization/a.ll
index 917fc355726ca67b6f6e50533cf33915af1d76ee..a105c205c5e6d2e7ac674993bcb94feb44cf7dc1 100644
--- a/test/Analysis/Delinearization/a.ll
+++ b/test/Analysis/Delinearization/a.ll
@@ -10,7 +10,7 @@
 ; AddRec: {{{(28 + (4 * (-4 + (3 * %m)) * %o) + %A),+,(8 * %m * %o)}<%for.i>,+,(12 * %o)}<%for.j>,+,20}<%for.k>
 ; CHECK: Base offset: %A
 ; CHECK: ArrayDecl[UnknownSize][%m][%o] with elements of 4 bytes.
-; CHECK: ArrayRef[{3,+,2}<%for.i>][{-4,+,3}<%for.j>][{7,+,5}<nw><%for.k>]
+; CHECK: ArrayRef[{3,+,2}<%for.i>][{-4,+,3}<nw><%for.j>][{7,+,5}<nw><%for.k>]
 
 define void @foo(i64 %n, i64 %m, i64 %o, i32* nocapture %A) #0 {
 entry:
diff --git a/test/Analysis/Delinearization/iv_times_constant_in_subscript.ll b/test/Analysis/Delinearization/iv_times_constant_in_subscript.ll
index 0c893bf11379b421c3ff5d935c4580b2c61f7338..bd2f34df6a16ad785ff1bb1fb0f5d1aee6db35a0 100644
--- a/test/Analysis/Delinearization/iv_times_constant_in_subscript.ll
+++ b/test/Analysis/Delinearization/iv_times_constant_in_subscript.ll
@@ -11,7 +11,7 @@
 ; AddRec: {{((%m * %b * 8) + %A),+,(2 * %m * 8)}<%for.i>,+,(2 * 8)}<%for.j>
 ; CHECK: Base offset: %A
 ; CHECK: ArrayDecl[UnknownSize][%m] with elements of 8 bytes.
-; CHECK: ArrayRef[{%b,+,2}<%for.i>][{0,+,2}<%for.j>]
+; CHECK: ArrayRef[{%b,+,2}<nsw><%for.i>][{0,+,2}<%for.j>]
 
 
 define void @foo(i64 %n, i64 %m, i64 %b, double* %A) {
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll
index 9847ad7434d5d9c4032966b74573a24666896bd2..3214dd41eeb4985a4d7760e9df38966ed957c028 100644
--- a/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll
+++ b/test/Analysis/DivergenceAnalysis/AMDGPU/atomics.ll
@@ -12,34 +12,34 @@ define {i32, i1} @test2(i32* %ptr, i32 %cmp, i32 %new) {
   ret {i32, i1} %orig
 }
 
-; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val)
+; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
 define i32 @test_atomic_inc_i32(i32 addrspace(1)* %ptr, i32 %val) #0 {
-  %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val)
+  %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
   ret i32 %ret
 }
 
-; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val)
+; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
 define i64 @test_atomic_inc_i64(i64 addrspace(1)* %ptr, i64 %val) #0 {
-  %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val)
+  %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
   ret i64 %ret
 }
 
-; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val)
+; CHECK: DIVERGENT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
 define i32 @test_atomic_dec_i32(i32 addrspace(1)* %ptr, i32 %val) #0 {
-  %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val)
+  %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 %val, i32 0, i32 0, i1 false)
   ret i32 %ret
 }
 
-; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val)
+; CHECK: DIVERGENT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
 define i64 @test_atomic_dec_i64(i64 addrspace(1)* %ptr, i64 %val) #0 {
-  %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val)
+  %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 %val, i32 0, i32 0, i1 false)
   ret i64 %ret
 }
 
-declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32) #1
-declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64) #1
-declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32) #1
-declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64) #1
+declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1
+declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1
+declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #1
+declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind argmemonly }
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/interp-intrinsics.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/interp-intrinsics.ll
deleted file mode 100644
index d1c90ba608c8d164edd22bfd4ccad1e033437be4..0000000000000000000000000000000000000000
--- a/test/Analysis/DivergenceAnalysis/AMDGPU/interp-intrinsics.ll
+++ /dev/null
@@ -1,22 +0,0 @@
-; RUN: opt  -mtriple amdgcn--- -analyze -divergence %s | FileCheck %s
-
-; CHECK-LABEL: 'fs_interp'
-; CHECK: DIVERGENT: %v = call float @llvm.SI.fs.interp(
-define amdgpu_ps void @fs_interp(i32 inreg %prim_mask, <2 x i32> %interp_param) #1 {
-  %v = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %prim_mask, <2 x i32> %interp_param)
-  store volatile float %v, float addrspace(1)* undef
-  ret void
-}
-
-; CHECK-LABEL: 'fs_constant'
-; CHECK: DIVERGENT: %v = call float @llvm.SI.fs.constant(
-define amdgpu_ps void @fs_constant(i32 inreg %prim_mask, <2 x i32> %interp_param) #1 {
-  %v = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %prim_mask)
-  store volatile float %v, float addrspace(1)* undef
-  ret void
-}
-
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #0
-declare float @llvm.SI.fs.constant(i32, i32, i32) #0
-
-attributes #0 = { nounwind readnone }
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
index 319a697dfd3d6207e9c86e6a9060e5b3da4bad97..d2266952259119e6933ab90cf0b47c37e5dd21ba 100644
--- a/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
+++ b/test/Analysis/DivergenceAnalysis/AMDGPU/intrinsics.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -mtriple=amdgcn-- -analyze -divergence %s | FileCheck %s
 
 ; CHECK: DIVERGENT: %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
-define void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 {
+define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) #0 {
   %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
   store i32 %swizzle, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
index b4fa79a6ba9f9f420359b035a85f49230ff5cbc9..6144ffea5b6112e80f5ddd34b650d51ff5e8f50d 100644
--- a/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
+++ b/test/Analysis/DivergenceAnalysis/AMDGPU/no-return-blocks.ll
@@ -5,7 +5,7 @@
 ; CHECK: DIVERGENT:  %tmp11 = load volatile float, float addrspace(1)* %tmp5, align 4
 
 ; The post dominator tree does not have a root node in this case
-define void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 {
+define amdgpu_kernel void @no_return_blocks(float addrspace(1)* noalias nocapture readonly %arg, float addrspace(1)* noalias nocapture readonly %arg1) #0 {
 bb0:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tmp2 = sext i32 %tmp to i64
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll
index ca93dda2c5739f13336452a0578396f283cb36e8..7ade8eabd451b52349cee970c03876fd9879e559 100644
--- a/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll
+++ b/test/Analysis/DivergenceAnalysis/AMDGPU/unreachable-loop-block.ll
@@ -1,7 +1,7 @@
 ; RUN: opt %s -mtriple amdgcn-- -analyze -divergence | FileCheck %s
 
 ; CHECK: DIVERGENT:  %tmp = cmpxchg volatile
-define void @unreachable_loop(i32 %tidx) #0 {
+define amdgpu_kernel void @unreachable_loop(i32 %tidx) #0 {
 entry:
   unreachable
 
diff --git a/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll b/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
index 669ee802c516b9c79fdb34f46073276bc597dbe0..98fbc88a2cfdad82e45b0a97d50bdc7610afd0f0 100644
--- a/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
+++ b/test/Analysis/DivergenceAnalysis/AMDGPU/workitem-intrinsics.ll
@@ -7,35 +7,35 @@ declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
 
 ; CHECK: DIVERGENT:  %id.x = call i32 @llvm.amdgcn.workitem.id.x()
-define void @workitem_id_x() #1 {
+define amdgpu_kernel void @workitem_id_x() #1 {
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   store volatile i32 %id.x, i32 addrspace(1)* undef
   ret void
 }
 
 ; CHECK: DIVERGENT:  %id.y = call i32 @llvm.amdgcn.workitem.id.y()
-define void @workitem_id_y() #1 {
+define amdgpu_kernel void @workitem_id_y() #1 {
   %id.y = call i32 @llvm.amdgcn.workitem.id.y()
   store volatile i32 %id.y, i32 addrspace(1)* undef
   ret void
 }
 
 ; CHECK: DIVERGENT:  %id.z = call i32 @llvm.amdgcn.workitem.id.z()
-define void @workitem_id_z() #1 {
+define amdgpu_kernel void @workitem_id_z() #1 {
   %id.z = call i32 @llvm.amdgcn.workitem.id.z()
   store volatile i32 %id.z, i32 addrspace(1)* undef
   ret void
 }
 
 ; CHECK: DIVERGENT:  %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0)
-define void @mbcnt_lo() #1 {
+define amdgpu_kernel void @mbcnt_lo() #1 {
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 0, i32 0)
   store volatile i32 %mbcnt.lo, i32 addrspace(1)* undef
   ret void
 }
 
 ; CHECK: DIVERGENT:  %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
-define void @mbcnt_hi() #1 {
+define amdgpu_kernel void @mbcnt_hi() #1 {
   %mbcnt.hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 0, i32 0)
   store volatile i32 %mbcnt.hi, i32 addrspace(1)* undef
   ret void
diff --git a/test/Analysis/LazyValueAnalysis/invalidation.ll b/test/Analysis/LazyValueAnalysis/invalidation.ll
index 21bfd2cfefa135c7be7fcc855c992f2d6200b5c5..67b6c9859396ec7543ba84baa5de126bfcc5104d 100644
--- a/test/Analysis/LazyValueAnalysis/invalidation.ll
+++ b/test/Analysis/LazyValueAnalysis/invalidation.ll
@@ -29,13 +29,13 @@ target triple = "x86_64-unknown-linux-gnu"
 
 @.str = private unnamed_addr constant [8 x i8] c"a = %l\0A\00", align 1
 
-declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
 
 declare void @hoo(i64*)
 
 declare i32 @printf(i8* nocapture readonly, ...)
 
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 
 define void @goo(i32 %N, i64* %b) {
 entry:
@@ -50,12 +50,12 @@ for.cond:                                         ; preds = %for.body, %entry
   br i1 %cmp, label %for.body, label %for.end
 
 for.body:                                         ; preds = %for.cond
-  call void @llvm.lifetime.start(i64 8, i8* %tmp)
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* %tmp)
   call void @hoo(i64* %a.i)
   call void @hoo(i64* %c)
   %tmp1 = load volatile i64, i64* %a.i, align 8
   %call.i = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i64 0, i64 0), i64 %tmp1)
-  call void @llvm.lifetime.end(i64 8, i8* %tmp)
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* %tmp)
   %inc = add nsw i32 %i.0, 1
   br label %for.cond
 
diff --git a/test/Analysis/LazyValueAnalysis/lvi-after-jumpthreading.ll b/test/Analysis/LazyValueAnalysis/lvi-after-jumpthreading.ll
new file mode 100644
index 0000000000000000000000000000000000000000..00ab21e46d5d76df7a77c0d61d19ba8b30a7be79
--- /dev/null
+++ b/test/Analysis/LazyValueAnalysis/lvi-after-jumpthreading.ll
@@ -0,0 +1,84 @@
+; RUN: opt < %s -jump-threading -print-lazy-value-info -disable-output 2>&1 | FileCheck %s
+
+; Testing LVI cache after jump-threading
+
+; Jump-threading transforms the IR below to one where
+; loop and backedge basic blocks are merged into one
+; basic block (named backedge) with the branch being:
+; %cont = icmp slt i32 %iv.next, 400
+; br i1 %cont, label %backedge, label %exit
+define i8 @test1(i32 %a, i32 %length) {
+; CHECK-LABEL: LVI for function 'test1':
+entry:
+  br label %loop
+; CHECK-LABEL: backedge:
+; CHECK-NEXT: ; CachedLatticeValues for: '  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]'
+; CHECK-DAG: ; at beginning of BasicBlock: '%backedge' LatticeVal: 'constantrange<0, 400>'
+; CHECK-NEXT: %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+; CHECK-NEXT: ; CachedLatticeValues for: '  %iv.next = add nsw i32 %iv, 1'
+; CHECK-NEXT: ; at beginning of BasicBlock: '%backedge' LatticeVal: 'constantrange<1, 401>'
+; CHECK-NEXT: %iv.next = add nsw i32 %iv, 1
+; CHECK-NEXT:  %cont = icmp slt i32 %iv.next, 400
+; CHECK-NEXT: br i1 %cont, label %backedge, label %exit
+
+; CHECK-NOT: loop
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %backedge]
+  %cnd = icmp sge i32 %iv, 0
+  br i1 %cnd, label %backedge, label %exit
+
+backedge:
+  %iv.next = add nsw i32 %iv, 1
+  %cont = icmp slt i32 %iv.next, 400
+  br i1 %cont, label %loop, label %exit
+
+exit:
+  ret i8 0
+}
+
+
+; Here JT does not transform the code, but LVICache is populated during the processing of blocks.
+define i8 @test2(i32 %n) {
+; CHECK-LABEL: LVI for function 'test2':
+; CHECK-LABEL: entry:
+; CHECK-LABEL: ; OverDefined values for block are:
+; CHECK-NEXT: ;i32 %n
+; CHECK-NEXT: br label %loop
+entry:
+  br label %loop
+
+; CHECK-LABEL: loop:
+; CHECK-LABEL: ; OverDefined values for block are:
+; CHECK-NEXT: ; %iv2 = phi i32 [ %n, %entry ], [ %iv2.next, %backedge ]
+; CHECK-NEXT: ; CachedLatticeValues for: '  %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]'
+; CHECK-DAG: ; at beginning of BasicBlock: '%loop' LatticeVal: 'constantrange<0, -2147483647>'
+; CHECK-DAG: ; at beginning of BasicBlock: '%backedge' LatticeVal: 'constantrange<0, -2147483648>'
+; CHECK-NEXT: %iv = phi i32 [ 0, %entry ], [ %iv.next, %backedge ]
+; CHECK: %cnd = and i1 %cnd1, %cnd2
+; CHECK: br i1 %cnd, label %backedge, label %exit
+loop:
+  %iv = phi i32 [0, %entry], [%iv.next, %backedge]
+  %iv2 = phi i32 [%n, %entry], [%iv2.next, %backedge]
+  %cnd1 = icmp sge i32 %iv, 0
+  %cnd2 = icmp sgt i32 %iv2, 0
+  %cnd = and i1 %cnd1, %cnd2
+  br i1 %cnd, label %backedge, label %exit
+
+; CHECK-LABEL: backedge:
+; CHECK-NEXT: ; CachedLatticeValues for: '  %iv.next = add nsw i32 %iv, 1'
+; CHECK-NEXT: ; at beginning of BasicBlock: '%backedge' LatticeVal: 'constantrange<1, -2147483647>'
+; CHECK-NEXT: %iv.next = add nsw i32 %iv, 1
+; CHECK-NEXT: %iv2.next = sub nsw i32 %iv2, 1
+; CHECK: %cont = and i1 %cont1, %cont2
+; CHECK: br i1 %cont, label %loop, label %exit
+backedge:
+  %iv.next = add nsw i32 %iv, 1
+  %iv2.next = sub nsw i32 %iv2, 1
+  %cont1 = icmp slt i32 %iv.next, 400
+  %cont2 = icmp sgt i32 %iv2.next, 0
+  %cont = and i1 %cont1, %cont2
+  br i1 %cont, label %loop, label %exit
+
+exit:
+  ret i8 0
+}
diff --git a/test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll b/test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll
index 87a6c18ab3034773cbd1584c132c84946ffbae8f..60c2a3930b5c0830c4563caf0dbc3915c6280bba 100644
--- a/test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll
+++ b/test/Analysis/LoopAccessAnalysis/multiple-strides-rt-memory-checks.ll
@@ -13,9 +13,9 @@
 ;	int v3[Z][Z];
 ; } s;
 ;
-; void slow_function (s* const obj) {
+; void slow_function (s* const obj, int z) {
 ;    for (int j=0; j<Z; j++) {
-;        for (int k=0; k<Z; k++) {
+;        for (int k=0; k<z; k++) {
 ;            int x = obj->v1[k] + obj->v2[j];
 ;            obj->v3[j][k] += x;
 ;        }
@@ -35,7 +35,7 @@ target triple = "x86_64-unknown-linux-gnu"
 
 %struct.s = type { [32 x i32], [32 x i32], [32 x [32 x i32]] }
 
-define void @Test(%struct.s* nocapture %obj) #0 {
+define void @Test(%struct.s* nocapture %obj, i64 %z) #0 {
   br label %.outer.preheader
 
 
@@ -63,6 +63,6 @@ define void @Test(%struct.s* nocapture %obj) #0 {
   %8 = add nsw i32 %5, %7
   store i32 %8, i32* %6  
   %j.next = add nuw nsw i64 %j, 1
-  %exitcond.inner = icmp eq i64 %j.next, 32
+  %exitcond.inner = icmp eq i64 %j.next, %z
   br i1 %exitcond.inner, label %.outer, label %.inner
 }
diff --git a/test/Analysis/LoopAccessAnalysis/pr31098.ll b/test/Analysis/LoopAccessAnalysis/pr31098.ll
new file mode 100644
index 0000000000000000000000000000000000000000..04b73828f5148daddaaef6b86e318c78ba068e6c
--- /dev/null
+++ b/test/Analysis/LoopAccessAnalysis/pr31098.ll
@@ -0,0 +1,99 @@
+; RUN: opt -loop-accesses -analyze < %s | FileCheck %s
+; RUN: opt -passes='require<scalar-evolution>,require<aa>,loop(print-access-info)' -disable-output  < %s 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that the compile-time-unknown dependence-distance is resolved
+; statically. Due to the non-unit stride of the accesses in this testcase
+; we are currently not able to create runtime dependence checks, and therefore
+; if we don't resolve the dependence statically we cannot vectorize the loop.
+;
+; Specifically in this example, during dependence analysis we get 6 unknown 
+; dependence distances between the 8 real/imaginary accesses below: 
+;    dist = 8*D, 4+8*D, -4+8*D, -8*D, 4-8*D, -4-8*D.
+; At compile time we can prove for all of the above that |dist|>loopBound*step
+; (where the step is 8bytes, and the loopBound is D-1), and thereby conclude 
+; that there are no dependencies (without runtime tests):
+; |8*D|>8*D-8, |4+8*D|>8*D-8, |-4+8*D|>8*D-8, etc.
+
+; #include <stdlib.h>
+; class Complex {
+; private:
+;   float real_;
+;   float imaginary_;
+;
+; public:
+;   Complex() : real_(0), imaginary_(0) { }
+;   Complex(float real, float imaginary) : real_(real), imaginary_(imaginary) { }
+;   Complex(const Complex &rhs) : real_(rhs.real()), imaginary_(rhs.imaginary()) { }
+; 
+;   inline float real() const { return real_; }
+;   inline float imaginary() const { return imaginary_; }
+; 
+;   Complex operator+(const Complex& rhs) const
+;   {
+;    return Complex(real_ + rhs.real_, imaginary_ + rhs.imaginary_);
+;   }
+;
+;   Complex operator-(const Complex& rhs) const
+;  {
+;     return Complex(real_ - rhs.real_, imaginary_ - rhs.imaginary_);
+;   }
+; };
+;
+; void Test(Complex *out, size_t size)
+; {
+;     size_t D = size / 2;
+;     for (size_t offset = 0; offset < D; ++offset)
+;     {
+;         Complex t0 = out[offset];
+;         Complex t1 = out[offset + D];
+;         out[offset] = t1 + t0;
+;         out[offset + D] = t0 - t1;
+;     }
+; }
+
+; CHECK-LABEL: Test
+; CHECK: Memory dependences are safe
+
+
+%class.Complex = type { float, float }
+
+define void @Test(%class.Complex* nocapture %out, i64 %size) local_unnamed_addr {
+entry:
+  %div = lshr i64 %size, 1
+  %cmp47 = icmp eq i64 %div, 0
+  br i1 %cmp47, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %offset.048 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %0 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %offset.048, i32 0
+  %1 = load float, float* %0, align 4
+  %imaginary_.i.i = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %offset.048, i32 1
+  %2 = load float, float* %imaginary_.i.i, align 4
+  %add = add nuw i64 %offset.048, %div
+  %3 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add, i32 0
+  %4 = load float, float* %3, align 4
+  %imaginary_.i.i28 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add, i32 1
+  %5 = load float, float* %imaginary_.i.i28, align 4
+  %add.i = fadd fast float %4, %1
+  %add4.i = fadd fast float %5, %2
+  store float %add.i, float* %0, align 4
+  store float %add4.i, float* %imaginary_.i.i, align 4
+  %sub.i = fsub fast float %1, %4
+  %sub4.i = fsub fast float %2, %5
+  store float %sub.i, float* %3, align 4
+  store float %sub4.i, float* %imaginary_.i.i28, align 4
+  %inc = add nuw nsw i64 %offset.048, 1
+  %exitcond = icmp eq i64 %inc, %div
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/test/Transforms/Util/MemorySSA/assume.ll b/test/Analysis/MemorySSA/assume.ll
similarity index 100%
rename from test/Transforms/Util/MemorySSA/assume.ll
rename to test/Analysis/MemorySSA/assume.ll
diff --git a/test/Transforms/Util/MemorySSA/atomic-clobber.ll b/test/Analysis/MemorySSA/atomic-clobber.ll
similarity index 100%
rename from test/Transforms/Util/MemorySSA/atomic-clobber.ll
rename to test/Analysis/MemorySSA/atomic-clobber.ll
diff --git a/test/Transforms/Util/MemorySSA/basicaa-memcpy.ll b/test/Analysis/MemorySSA/basicaa-memcpy.ll
similarity index 100%
rename from test/Transforms/Util/MemorySSA/basicaa-memcpy.ll
rename to test/Analysis/MemorySSA/basicaa-memcpy.ll
diff --git a/test/Transforms/Util/MemorySSA/constant-memory.ll b/test/Analysis/MemorySSA/constant-memory.ll
similarity index 100%
rename from test/Transforms/Util/MemorySSA/constant-memory.ll
rename to test/Analysis/MemorySSA/constant-memory.ll
diff --git a/test/Transforms/Util/MemorySSA/cyclicphi.ll b/test/Analysis/MemorySSA/cyclicphi.ll
similarity index 100%
rename from test/Transforms/Util/MemorySSA/cyclicphi.ll
rename to test/Analysis/MemorySSA/cyclicphi.ll
diff --git a/test/Transforms/Util/MemorySSA/forward-unreachable.ll b/test/Analysis/MemorySSA/forward-unreachable.ll
similarity index 100%
rename from test/Transforms/Util/MemorySSA/forward-unreachable.ll
rename to test/Analysis/MemorySSA/forward-unreachable.ll
diff --git a/test/Transforms/Util/MemorySSA/function-clobber.ll b/test/Analysis/MemorySSA/function-clobber.ll
similarity index 100%
rename from test/Transforms/Util/MemorySSA/function-clobber.ll
rename to test/Analysis/MemorySSA/function-clobber.ll
diff --git a/test/Transforms/Util/MemorySSA/function-mem-attrs.ll b/test/Analysis/MemorySSA/function-mem-attrs.ll
similarity index 100%
rename from test/Transforms/Util/MemorySSA/function-mem-attrs.ll
rename to test/Analysis/MemorySSA/function-mem-attrs.ll
diff --git a/test/Transforms/Util/MemorySSA/invariant-groups.ll b/test/Analysis/MemorySSA/invariant-groups.ll
similarity index 99%
rename from test/Transforms/Util/MemorySSA/invariant-groups.ll
rename to test/Analysis/MemorySSA/invariant-groups.ll
index 06797e64545de71cb9e0cf2f179bc32497ca8bbe..6e94ae178dbbfc5936decee9fb4ca01b16a59e3a 100644
--- a/test/Transforms/Util/MemorySSA/invariant-groups.ll
+++ b/test/Analysis/MemorySSA/invariant-groups.ll
@@ -19,7 +19,7 @@ define i32 @foo(i32* %a) {
   %a8 = call i8* @llvm.invariant.group.barrier(i8* %1)
   %a32 = bitcast i8* %a8 to i32*
 
-; This have to be MemoryUse(1), because we can't skip the barrier based on
+; This has to be MemoryUse(2), because we can't skip the barrier based on
 ; invariant.group.
 ; CHECK: MemoryUse(2)
 ; CHECK-NEXT: %2 = load i32
diff --git a/test/Transforms/Util/MemorySSA/lifetime-simple.ll b/test/Analysis/MemorySSA/lifetime-simple.ll
similarity index 68%
rename from test/Transforms/Util/MemorySSA/lifetime-simple.ll
rename to test/Analysis/MemorySSA/lifetime-simple.ll
index cdb36e31eb962e9049d38722b84a7d1b0d590e91..f1db15cc577d5e64dddbbd1ccbe863e735746017 100644
--- a/test/Transforms/Util/MemorySSA/lifetime-simple.ll
+++ b/test/Analysis/MemorySSA/lifetime-simple.ll
@@ -7,17 +7,17 @@
 define i8 @test(i8* %P, i8* %Q) {
 entry:
 ; CHECK:  1 = MemoryDef(liveOnEntry)
-; CHECK-NEXT:   call void @llvm.lifetime.start(i64 32, i8* %P)
-  call void @llvm.lifetime.start(i64 32, i8* %P)
-; CHECK:  MemoryUse(liveOnEntry)
+; CHECK-NEXT:   call void @llvm.lifetime.start.p0i8(i64 32, i8* %P)
+  call void @llvm.lifetime.start.p0i8(i64 32, i8* %P)
+; CHECK:  MemoryUse(1)
 ; CHECK-NEXT:   %0 = load i8, i8* %P
   %0 = load i8, i8* %P
 ; CHECK:  2 = MemoryDef(1)
 ; CHECK-NEXT:   store i8 1, i8* %P
   store i8 1, i8* %P
 ; CHECK:  3 = MemoryDef(2)
-; CHECK-NEXT:   call void @llvm.lifetime.end(i64 32, i8* %P)
-  call void @llvm.lifetime.end(i64 32, i8* %P)
+; CHECK-NEXT:   call void @llvm.lifetime.end.p0i8(i64 32, i8* %P)
+  call void @llvm.lifetime.end.p0i8(i64 32, i8* %P)
 ; CHECK:  MemoryUse(liveOnEntry)
 ; CHECK-NEXT:   %1 = load i8, i8* %P
   %1 = load i8, i8* %P
@@ -26,5 +26,5 @@ entry:
   %2 = load i8, i8* %Q
   ret i8 %1
 }
-declare void @llvm.lifetime.start(i64 %S, i8* nocapture %P) readonly
-declare void @llvm.lifetime.end(i64 %S, i8* nocapture %P)
+declare void @llvm.lifetime.start.p0i8(i64 %S, i8* nocapture %P) readonly
+declare void @llvm.lifetime.end.p0i8(i64 %S, i8* nocapture %P)
diff --git a/test/Transforms/Util/MemorySSA/load-invariant.ll b/test/Analysis/MemorySSA/load-invariant.ll
similarity index 100%
rename from test/Transforms/Util/MemorySSA/load-invariant.ll
rename to test/Analysis/MemorySSA/load-invariant.ll
diff --git a/test/Transforms/Util/MemorySSA/many-dom-backedge.ll b/test/Analysis/MemorySSA/many-dom-backedge.ll
similarity index 100%
rename from test/Transforms/Util/MemorySSA/many-dom-backedge.ll
rename to test/Analysis/MemorySSA/many-dom-backedge.ll
diff --git a/test/Transforms/Util/MemorySSA/many-doms.ll b/test/Analysis/MemorySSA/many-doms.ll
similarity index 100%
rename from test/Transforms/Util/MemorySSA/many-doms.ll
rename to test/Analysis/MemorySSA/many-doms.ll
diff --git a/test/Transforms/Util/MemorySSA/multi-edges.ll b/test/Analysis/MemorySSA/multi-edges.ll
similarity index 100%
rename from test/Transforms/Util/MemorySSA/multi-edges.ll
rename to test/Analysis/MemorySSA/multi-edges.ll
diff --git a/test/Transforms/Util/MemorySSA/multiple-backedges-hal.ll b/test/Analysis/MemorySSA/multiple-backedges-hal.ll
similarity index 100%
rename from test/Transforms/Util/MemorySSA/multiple-backedges-hal.ll
rename to test/Analysis/MemorySSA/multiple-backedges-hal.ll
diff --git a/test/Transforms/Util/MemorySSA/multiple-locations.ll b/test/Analysis/MemorySSA/multiple-locations.ll
similarity index 100%
rename from test/Transforms/Util/MemorySSA/multiple-locations.ll
rename to test/Analysis/MemorySSA/multiple-locations.ll
diff --git a/test/Transforms/Util/MemorySSA/no-disconnected.ll b/test/Analysis/MemorySSA/no-disconnected.ll
similarity index 100%
rename from test/Transforms/Util/MemorySSA/no-disconnected.ll
rename to test/Analysis/MemorySSA/no-disconnected.ll
diff --git a/test/Transforms/Util/MemorySSA/optimize-use.ll b/test/Analysis/MemorySSA/optimize-use.ll
similarity index 100%
rename from test/Transforms/Util/MemorySSA/optimize-use.ll
rename to test/Analysis/MemorySSA/optimize-use.ll
diff --git a/test/Transforms/Util/MemorySSA/phi-translation.ll b/test/Analysis/MemorySSA/phi-translation.ll
similarity index 100%
rename from test/Transforms/Util/MemorySSA/phi-translation.ll
rename to test/Analysis/MemorySSA/phi-translation.ll
diff --git a/test/Transforms/Util/MemorySSA/pr28880.ll b/test/Analysis/MemorySSA/pr28880.ll
similarity index 100%
rename from test/Transforms/Util/MemorySSA/pr28880.ll
rename to test/Analysis/MemorySSA/pr28880.ll
diff --git a/test/Analysis/MemorySSA/ptr-const-mem.ll b/test/Analysis/MemorySSA/ptr-const-mem.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a326d8d717a164924a44dfb9118e437fe4fcb024
--- /dev/null
+++ b/test/Analysis/MemorySSA/ptr-const-mem.ll
@@ -0,0 +1,23 @@
+; RUN: opt -basicaa -print-memoryssa -verify-memoryssa -analyze -memssa-check-limit=0 < %s 2>&1 | FileCheck %s
+; RUN: opt -aa-pipeline=basic-aa -passes='print<memoryssa>' -verify-memoryssa -disable-output -memssa-check-limit=0 < %s 2>&1 | FileCheck %s
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target triple = "amdgcn"
+
+@g4 = external unnamed_addr constant i8, align 1
+
+define signext i8 @cmp_constant(i8* %q, i8 %v) local_unnamed_addr {
+entry:
+
+  store i8 %v, i8* %q, align 1
+; CHECK: 1 = MemoryDef(liveOnEntry)
+; CHECK-NEXT: store i8 %v, i8* %q, align 1
+
+  %0 = load i8, i8* @g4, align 1
+; Make sure that this load is liveOnEntry just based on the fact that @g4 is
+; constant memory.
+; CHECK: MemoryUse(liveOnEntry)
+; CHECK-NEXT: load i8, i8* @g4, align 1
+
+  ret i8 %0
+}
+
diff --git a/test/Transforms/Util/MemorySSA/volatile-clobber.ll b/test/Analysis/MemorySSA/volatile-clobber.ll
similarity index 100%
rename from test/Transforms/Util/MemorySSA/volatile-clobber.ll
rename to test/Analysis/MemorySSA/volatile-clobber.ll
diff --git a/test/Analysis/RegionInfo/outgoing_edge.ll b/test/Analysis/RegionInfo/outgoing_edge.ll
new file mode 100644
index 0000000000000000000000000000000000000000..39e1a39d7e5b5246b15af15b28ffa01a935f1cdc
--- /dev/null
+++ b/test/Analysis/RegionInfo/outgoing_edge.ll
@@ -0,0 +1,33 @@
+; REQUIRES: asserts
+; RUN: opt -regions -analyze < %s | FileCheck %s
+; RUN: opt < %s -passes='print<regions>' 2>&1 | FileCheck %s
+
+; While working on improvements to the region info analysis, this test
+; case caused an incorrect region bb2 => bb3 to be detected. It is incorrect
+; because bb2 has an outgoing edge to bb4. This is interesting because
+; bb2 dom bb3 and bb3 pdom bb2, which should have been enough to prevent incoming
+; forward edges into the region and outgoing forward edges from the region.
+
+define void @meread_() nounwind {
+bb:
+   br label %bb1
+
+bb1:                                              ; preds = %bb4, %bb
+   br label %bb2
+
+bb2:                                              ; preds = %bb1
+  br i1 true, label %bb3, label %bb4
+
+bb3:                                              ; preds = %bb2
+  br i1 true, label %bb4, label %bb5
+
+bb4:                                              ; preds = %bb3, %bb2
+   br label %bb1
+
+bb5:                                              ; preds = %bb3
+   ret void
+ }
+
+; CHECK:      [0] bb => <Function Return>
+; CHECK-NEXT:   [1] bb1 => bb5
+; CHECK-NEXT: End region tree
diff --git a/test/Analysis/RegionInfo/outgoing_edge_1.ll b/test/Analysis/RegionInfo/outgoing_edge_1.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6f51131a188c5c695fe4bd49f6608c9f442945a1
--- /dev/null
+++ b/test/Analysis/RegionInfo/outgoing_edge_1.ll
@@ -0,0 +1,39 @@
+; REQUIRES: asserts
+; RUN: opt -regions -analyze < %s | FileCheck %s
+; RUN: opt < %s -passes='print<regions>' 2>&1 | FileCheck %s
+
+; While working on improvements to region info analysis, this test
+; case caused an incorrect region bb2 => bb3 to be detected.
+
+define internal i8 @main_read() nounwind {
+bb:
+   br label %bb1
+
+bb1:
+   br i1 true, label %bb2, label %bb7
+
+bb2:
+  br i1 true, label %bb4, label %bb3
+
+bb3:
+  br i1 true, label %bb4, label %bb8
+
+bb4:
+   br label %bb5
+
+bb5:
+   br label %bb6
+
+bb6:
+   br label %bb1
+
+bb7:
+   br label %bb5
+
+bb8:
+   ret i8 1
+}
+
+; CHECK:    [0] bb => <Function Return>
+; CHECK-NEXT: [1] bb1 => bb8
+; CHECK-NEXT: End region tree
diff --git a/test/Analysis/ScalarEvolution/flags-from-poison.ll b/test/Analysis/ScalarEvolution/flags-from-poison.ll
index 8e73fe4fd54cdb49a8b5157df89e7d239fcf2f2b..44ee830d9c62f313c798b551d31b8617ca451ffa 100644
--- a/test/Analysis/ScalarEvolution/flags-from-poison.ll
+++ b/test/Analysis/ScalarEvolution/flags-from-poison.ll
@@ -272,17 +272,16 @@ exit:
   ret void
 }
 
-; Without inbounds, GEP does not propagate poison in the very
-; conservative approach used here.
-define void @test-add-no-inbounds(float* %input, i32 %offset, i32 %numIterations) {
-; CHECK-LABEL: @test-add-no-inbounds
+; Any poison input makes getelementptr produce poison
+define void @test-gep-propagates-poison(float* %input, i32 %offset, i32 %numIterations) {
+; CHECK-LABEL: @test-gep-propagates-poison
 entry:
   br label %loop
 loop:
   %i = phi i32 [ %nexti, %loop ], [ 0, %entry ]
 
 ; CHECK: %index32 =
-; CHECK: --> {%offset,+,1}<nw>
+; CHECK: --> {%offset,+,1}<nsw>
   %index32 = add nsw i32 %i, %offset
 
   %ptr = getelementptr float, float* %input, i32 %index32
@@ -317,17 +316,16 @@ exit:
   ret void
 }
 
-; Multiplication by a non-constant should not propagate poison in the
-; very conservative approach used here.
-define void @test-add-mul-no-propagation(float* %input, i32 %offset, i32 %numIterations) {
-; CHECK-LABEL: @test-add-mul-no-propagation
+; Any poison input to multiplication propagates poison.
+define void @test-mul-propagates-poison(float* %input, i32 %offset, i32 %numIterations) {
+; CHECK-LABEL: @test-mul-propagates-poison
 entry:
   br label %loop
 loop:
   %i = phi i32 [ %nexti, %loop ], [ 0, %entry ]
 
 ; CHECK: %index32 =
-; CHECK: --> {%offset,+,1}<nw>
+; CHECK: --> {%offset,+,1}<nsw>
   %index32 = add nsw i32 %i, %offset
 
   %indexmul = mul nsw i32 %index32, %offset
@@ -340,17 +338,15 @@ exit:
   ret void
 }
 
-; Multiplication by a non-zero constant does not propagate poison
-; without a no-wrap flag.
-define void @test-add-mul-no-propagation2(float* %input, i32 %offset, i32 %numIterations) {
-; CHECK-LABEL: @test-add-mul-no-propagation2
+define void @test-mul-propagates-poison-2(float* %input, i32 %offset, i32 %numIterations) {
+; CHECK-LABEL: @test-mul-propagates-poison-2
 entry:
   br label %loop
 loop:
   %i = phi i32 [ %nexti, %loop ], [ 0, %entry ]
 
 ; CHECK: %index32 =
-; CHECK: --> {%offset,+,1}<nw>
+; CHECK: --> {%offset,+,1}<nsw>
   %index32 = add nsw i32 %i, %offset
 
   %indexmul = mul i32 %index32, 2
diff --git a/test/Analysis/ScalarEvolution/implied-via-addition.ll b/test/Analysis/ScalarEvolution/implied-via-addition.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c9c276cef466ce4e7425259c97315179a0035c44
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/implied-via-addition.ll
@@ -0,0 +1,50 @@
+; RUN: opt -indvars -S < %s | FileCheck %s
+
+declare void @use(i1)
+
+declare void @llvm.experimental.guard(i1, ...)
+
+define void @test_01(i8 %t) {
+; CHECK-LABEL: test_01
+ entry:
+  %st = sext i8 %t to i16
+  %cmp1 = icmp slt i16 %st, 42
+  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+  br label %loop
+
+ loop:
+; CHECK-LABEL: loop
+  %idx = phi i8 [ %t, %entry ], [ %idx.inc, %loop ]
+  %idx.inc = add i8 %idx, 1
+  %c = icmp slt i8 %idx, 42
+; CHECK: call void @use(i1 true)
+  call void @use(i1 %c)
+  %be = icmp slt i8 %idx.inc, 42
+  br i1 %be, label %loop, label %exit
+
+ exit:
+  ret void
+}
+
+define void @test_02(i8 %t) {
+; CHECK-LABEL: test_02
+ entry:
+  %t.ptr = inttoptr i8 %t to i8*
+  %p.42 = inttoptr i8 42 to i8*
+  %cmp1 = icmp slt i8* %t.ptr, %p.42
+  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+  br label %loop
+
+ loop:
+; CHECK-LABEL: loop
+  %idx = phi i8* [ %t.ptr, %entry ], [ %snext, %loop ]
+  %snext = getelementptr inbounds i8, i8* %idx, i64 1
+  %c = icmp slt i8* %idx, %p.42
+; CHECK: call void @use(i1 true)
+  call void @use(i1 %c)
+  %be = icmp slt i8* %snext, %p.42
+  br i1 %be, label %loop, label %exit
+
+ exit:
+  ret void
+}
diff --git a/test/Analysis/ScalarEvolution/implied-via-division.ll b/test/Analysis/ScalarEvolution/implied-via-division.ll
new file mode 100644
index 0000000000000000000000000000000000000000..43f4c04fa92758ab4734c23b551b71919401c35f
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/implied-via-division.ll
@@ -0,0 +1,331 @@
+; RUN: opt < %s -analyze -scalar-evolution | FileCheck %s
+
+declare void @llvm.experimental.guard(i1, ...)
+
+define void @test_1(i32 %n) nounwind {
+; Prove that (n > 1) ===> (n / 2 > 0).
+; CHECK:         Determining loop execution counts for: @test_1
+; CHECK:         Loop %header: backedge-taken count is (-1 + %n.div.2)<nsw>
+entry:
+  %cmp1 = icmp sgt i32 %n, 1
+  %n.div.2 = sdiv i32 %n, 2
+  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+  br label %header
+
+header:
+  %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ]
+  %indvar.next = add i32 %indvar, 1
+  %exitcond = icmp sgt i32 %n.div.2, %indvar.next
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_1neg(i32 %n) nounwind {
+; Prove that (n > 0) =\=> (n / 2 > 0).
+; CHECK:         Determining loop execution counts for: @test_1neg
+; CHECK:         Loop %header: backedge-taken count is (-1 + (1 smax %n.div.2))<nsw>
+entry:
+  %cmp1 = icmp sgt i32 %n, 0
+  %n.div.2 = sdiv i32 %n, 2
+  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+  br label %header
+
+header:
+  %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ]
+  %indvar.next = add i32 %indvar, 1
+  %exitcond = icmp sgt i32 %n.div.2, %indvar.next
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_2(i32 %n) nounwind {
+; Prove that (n >= 2) ===> (n / 2 > 0).
+; CHECK:         Determining loop execution counts for: @test_2
+; CHECK:         Loop %header: backedge-taken count is (-1 + %n.div.2)<nsw>
+entry:
+  %cmp1 = icmp sge i32 %n, 2
+  %n.div.2 = sdiv i32 %n, 2
+  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+  br label %header
+
+header:
+  %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ]
+  %indvar.next = add i32 %indvar, 1
+  %exitcond = icmp sgt i32 %n.div.2, %indvar.next
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_2neg(i32 %n) nounwind {
+; Prove that (n >= 1) =\=> (n / 2 > 0).
+; CHECK:         Determining loop execution counts for: @test_2neg
+; CHECK:         Loop %header: backedge-taken count is (-1 + (1 smax %n.div.2))<nsw>
+entry:
+  %cmp1 = icmp sge i32 %n, 1
+  %n.div.2 = sdiv i32 %n, 2
+  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+  br label %header
+
+header:
+  %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ]
+  %indvar.next = add i32 %indvar, 1
+  %exitcond = icmp sgt i32 %n.div.2, %indvar.next
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_3(i32 %n) nounwind {
+; Prove that (n > -2) ===> (n / 2 >= 0).
+; CHECK:         Determining loop execution counts for: @test_3
+; CHECK:         Loop %header: backedge-taken count is (1 + %n.div.2)<nsw>
+entry:
+  %cmp1 = icmp sgt i32 %n, -2
+  %n.div.2 = sdiv i32 %n, 2
+  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+  br label %header
+
+header:
+  %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ]
+  %indvar.next = add i32 %indvar, 1
+  %exitcond = icmp sge i32 %n.div.2, %indvar
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_3neg(i32 %n) nounwind {
+; Prove that (n > -3) =\=> (n / 2 >= 0).
+; CHECK:         Determining loop execution counts for: @test_3neg
+; CHECK:         Loop %header: backedge-taken count is (0 smax (1 + %n.div.2)<nsw>)
+entry:
+  %cmp1 = icmp sgt i32 %n, -3
+  %n.div.2 = sdiv i32 %n, 2
+  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+  br label %header
+
+header:
+  %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ]
+  %indvar.next = add i32 %indvar, 1
+  %exitcond = icmp sge i32 %n.div.2, %indvar
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_4(i32 %n) nounwind {
+; Prove that (n >= -1) ===> (n / 2 >= 0).
+; CHECK:         Determining loop execution counts for: @test_4
+; CHECK:         Loop %header: backedge-taken count is (1 + %n.div.2)<nsw>
+entry:
+  %cmp1 = icmp sge i32 %n, -1
+  %n.div.2 = sdiv i32 %n, 2
+  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+  br label %header
+
+header:
+  %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ]
+  %indvar.next = add i32 %indvar, 1
+  %exitcond = icmp sge i32 %n.div.2, %indvar
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_4neg(i32 %n) nounwind {
+; Prove that (n >= -2) =\=> (n / 2 >= 0).
+; CHECK:         Determining loop execution counts for: @test_4neg
+; CHECK:         Loop %header: backedge-taken count is (0 smax (1 + %n.div.2)<nsw>)
+entry:
+  %cmp1 = icmp sge i32 %n, -2
+  %n.div.2 = sdiv i32 %n, 2
+  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+  br label %header
+
+header:
+  %indvar = phi i32 [ %indvar.next, %header ], [ 0, %entry ]
+  %indvar.next = add i32 %indvar, 1
+  %exitcond = icmp sge i32 %n.div.2, %indvar
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_ext_01(i32 %n) nounwind {
+; Prove that (n > 1) ===> (n / 2 > 0).
+; CHECK:         Determining loop execution counts for: @test_ext_01
+; CHECK:         Loop %header: backedge-taken count is (-1 + (sext i32 %n.div.2 to i64))<nsw>
+entry:
+  %cmp1 = icmp sgt i32 %n, 1
+  %n.div.2 = sdiv i32 %n, 2
+  %n.div.2.ext = sext i32 %n.div.2 to i64
+  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+  br label %header
+
+header:
+  %indvar = phi i64 [ %indvar.next, %header ], [ 0, %entry ]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp sgt i64 %n.div.2.ext, %indvar.next
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_ext_01neg(i32 %n) nounwind {
+; Prove that (n > 0) =\=> (n / 2 > 0).
+; CHECK:         Determining loop execution counts for: @test_ext_01neg
+; CHECK:         Loop %header: backedge-taken count is (-1 + (1 smax (sext i32 %n.div.2 to i64)))<nsw>
+entry:
+  %cmp1 = icmp sgt i32 %n, 0
+  %n.div.2 = sdiv i32 %n, 2
+  %n.div.2.ext = sext i32 %n.div.2 to i64
+  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+  br label %header
+
+header:
+  %indvar = phi i64 [ %indvar.next, %header ], [ 0, %entry ]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp sgt i64 %n.div.2.ext, %indvar.next
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_ext_02(i32 %n) nounwind {
+; Prove that (n >= 2) ===> (n / 2 > 0).
+; CHECK:         Determining loop execution counts for: @test_ext_02
+; CHECK:         Loop %header: backedge-taken count is (-1 + (sext i32 %n.div.2 to i64))<nsw>
+entry:
+  %cmp1 = icmp sge i32 %n, 2
+  %n.div.2 = sdiv i32 %n, 2
+  %n.div.2.ext = sext i32 %n.div.2 to i64
+  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+  br label %header
+
+header:
+  %indvar = phi i64 [ %indvar.next, %header ], [ 0, %entry ]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp sgt i64 %n.div.2.ext, %indvar.next
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_ext_02neg(i32 %n) nounwind {
+; Prove that (n >= 1) =\=> (n / 2 > 0).
+; CHECK:         Determining loop execution counts for: @test_ext_02neg
+; CHECK:         Loop %header: backedge-taken count is (-1 + (1 smax (sext i32 %n.div.2 to i64)))<nsw>
+entry:
+  %cmp1 = icmp sge i32 %n, 1
+  %n.div.2 = sdiv i32 %n, 2
+  %n.div.2.ext = sext i32 %n.div.2 to i64
+  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+  br label %header
+
+header:
+  %indvar = phi i64 [ %indvar.next, %header ], [ 0, %entry ]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp sgt i64 %n.div.2.ext, %indvar.next
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_ext_03(i32 %n) nounwind {
+; Prove that (n > -2) ===> (n / 2 >= 0).
+; CHECK:         Determining loop execution counts for: @test_ext_03
+; CHECK:         Loop %header: backedge-taken count is (1 + (sext i32 %n.div.2 to i64))<nsw>
+entry:
+  %cmp1 = icmp sgt i32 %n, -2
+  %n.div.2 = sdiv i32 %n, 2
+  %n.div.2.ext = sext i32 %n.div.2 to i64
+  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+  br label %header
+
+header:
+  %indvar = phi i64 [ %indvar.next, %header ], [ 0, %entry ]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp sge i64 %n.div.2.ext, %indvar
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_ext_03neg(i32 %n) nounwind {
+; Prove that (n > -3) =\=> (n / 2 >= 0).
+; CHECK:         Determining loop execution counts for: @test_ext_03neg
+; CHECK:         Loop %header: backedge-taken count is (0 smax (1 + (sext i32 %n.div.2 to i64))<nsw>)
+entry:
+  %cmp1 = icmp sgt i32 %n, -3
+  %n.div.2 = sdiv i32 %n, 2
+  %n.div.2.ext = sext i32 %n.div.2 to i64
+  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+  br label %header
+
+header:
+  %indvar = phi i64 [ %indvar.next, %header ], [ 0, %entry ]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp sge i64 %n.div.2.ext, %indvar
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_ext_04(i32 %n) nounwind {
+; Prove that (n >= -1) ===> (n / 2 >= 0).
+; CHECK:         Determining loop execution counts for: @test_ext_04
+; CHECK:         Loop %header: backedge-taken count is (1 + (sext i32 %n.div.2 to i64))<nsw>
+entry:
+  %cmp1 = icmp sge i32 %n, -1
+  %n.div.2 = sdiv i32 %n, 2
+  %n.div.2.ext = sext i32 %n.div.2 to i64
+  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+  br label %header
+
+header:
+  %indvar = phi i64 [ %indvar.next, %header ], [ 0, %entry ]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp sge i64 %n.div.2.ext, %indvar
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
+
+define void @test_ext_04neg(i32 %n) nounwind {
+; Prove that (n >= -2) =\=> (n / 2 >= 0).
+; CHECK:         Determining loop execution counts for: @test_ext_04neg
+; CHECK:         Loop %header: backedge-taken count is (0 smax (1 + (sext i32 %n.div.2 to i64))<nsw>)
+entry:
+  %cmp1 = icmp sge i32 %n, -2
+  %n.div.2 = sdiv i32 %n, 2
+  %n.div.2.ext = sext i32 %n.div.2 to i64
+  call void(i1, ...) @llvm.experimental.guard(i1 %cmp1) [ "deopt"() ]
+  br label %header
+
+header:
+  %indvar = phi i64 [ %indvar.next, %header ], [ 0, %entry ]
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp sge i64 %n.div.2.ext, %indvar
+  br i1 %exitcond, label %header, label %exit
+
+exit:
+  ret void
+}
diff --git a/test/Analysis/ScalarEvolution/pr18606-min-zeros.ll b/test/Analysis/ScalarEvolution/pr18606-min-zeros.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f4fdf9d3932dc005438ee8ae78eab8a1c24b7eb0
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/pr18606-min-zeros.ll
@@ -0,0 +1,63 @@
+; RUN: opt -S -indvars < %s | FileCheck %s
+
+; CHECK: @test
+; CHECK: %5 = add i32 %local_6_, %local_0_
+; CHECK: %37 = mul i32 %36, %36
+
+define i32 @test(i32, i32) {
+bci_0:
+  br label %bci_30
+
+bci_68:                                           ; preds = %bci_45
+  %local_6_.lcssa = phi i32 [ %local_6_, %bci_45 ]
+  %.lcssa1.lcssa = phi i32 [ %37, %bci_45 ]
+  %.lcssa.lcssa = phi i32 [ 34, %bci_45 ]
+  %2 = add i32 %local_6_.lcssa, 262
+  %3 = add i32 %2, %.lcssa1.lcssa
+  %4 = add i32 %3, %.lcssa.lcssa
+  ret i32 %4
+
+bci_30:                                           ; preds = %bci_45, %bci_0
+  %local_0_ = phi i32 [ %0, %bci_0 ], [ %38, %bci_45 ]
+  %local_6_ = phi i32 [ 2, %bci_0 ], [ %39, %bci_45 ]
+  %5 = add i32 %local_6_, %local_0_
+  br label %bci_45
+
+bci_45:                                           ; preds = %bci_30
+  %6 = mul i32 %5, %5
+  %7 = mul i32 %6, %6
+  %8 = mul i32 %7, %7
+  %9 = mul i32 %8, %8
+  %10 = mul i32 %9, %9
+  %11 = mul i32 %10, %10
+  %12 = mul i32 %11, %11
+  %13 = mul i32 %12, %12
+  %14 = mul i32 %13, %13
+  %15 = mul i32 %14, %14
+  %16 = mul i32 %15, %15
+  %17 = mul i32 %16, %16
+  %18 = mul i32 %17, %17
+  %19 = mul i32 %18, %18
+  %20 = mul i32 %19, %19
+  %21 = mul i32 %20, %20
+  %22 = mul i32 %21, %21
+  %23 = mul i32 %22, %22
+  %24 = mul i32 %23, %23
+  %25 = mul i32 %24, %24
+  %26 = mul i32 %25, %25
+  %27 = mul i32 %26, %26
+  %28 = mul i32 %27, %27
+  %29 = mul i32 %28, %28
+  %30 = mul i32 %29, %29
+  %31 = mul i32 %30, %30
+  %32 = mul i32 %31, %31
+  %33 = mul i32 %32, %32
+  %34 = mul i32 %33, %33
+  %35 = mul i32 %34, %34
+  %36 = mul i32 %35, %35
+  %37 = mul i32 %36, %36
+  %38 = add i32 %37, -11
+  %39 = add i32 %local_6_, 1
+  %40 = icmp sgt i32 %39, 76
+  br i1 %40, label %bci_68, label %bci_30
+}
diff --git a/test/Analysis/ScalarEvolution/sext-mul.ll b/test/Analysis/ScalarEvolution/sext-mul.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ca25d9e2efad2ad3da9dc145a40ead46db6f37a4
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/sext-mul.ll
@@ -0,0 +1,89 @@
+; RUN: opt < %s -analyze -scalar-evolution | FileCheck %s
+
+; CHECK: %tmp9 = shl i64 %tmp8, 33
+; CHECK-NEXT: --> {{.*}} Exits: (-8589934592 + (8589934592 * (zext i32 %arg2 to i64)))
+; CHECK: %tmp10 = ashr exact i64 %tmp9, 32
+; CHECK-NEXT: --> {{.*}} Exits: (sext i32 (-2 + (2 * %arg2)) to i64)
+; CHECK: %tmp11 = getelementptr inbounds i32, i32* %arg, i64 %tmp10
+; CHECK-NEXT: --> {{.*}} Exits: ((4 * (sext i32 (-2 + (2 * %arg2)) to i64)) + %arg)
+; CHECK:  %tmp14 = or i64 %tmp10, 1
+; CHECK-NEXT: --> {{.*}} Exits: (1 + (sext i32 (-2 + (2 * %arg2)) to i64))<nsw>
+; CHECK: %tmp15 = getelementptr inbounds i32, i32* %arg, i64 %tmp14
+; CHECK-NEXT: --> {{.*}} Exits: (4 + (4 * (sext i32 (-2 + (2 * %arg2)) to i64)) + %arg)
+; CHECK:Loop %bb7: backedge-taken count is (-1 + (zext i32 %arg2 to i64))<nsw>
+; CHECK-NEXT:Loop %bb7: max backedge-taken count is -1
+; CHECK-NEXT:Loop %bb7: Predicated backedge-taken count is (-1 + (zext i32 %arg2 to i64))<nsw>
+
+define void @foo(i32* nocapture %arg, i32 %arg1, i32 %arg2) {
+bb:
+  %tmp = icmp sgt i32 %arg2, 0
+  br i1 %tmp, label %bb3, label %bb6
+
+bb3:                                              ; preds = %bb
+  %tmp4 = zext i32 %arg2 to i64
+  br label %bb7
+
+bb5:                                              ; preds = %bb7
+  br label %bb6
+
+bb6:                                              ; preds = %bb5, %bb
+  ret void
+
+bb7:                                              ; preds = %bb7, %bb3
+  %tmp8 = phi i64 [ %tmp18, %bb7 ], [ 0, %bb3 ]
+  %tmp9 = shl i64 %tmp8, 33
+  %tmp10 = ashr exact i64 %tmp9, 32
+  %tmp11 = getelementptr inbounds i32, i32* %arg, i64 %tmp10
+  %tmp12 = load i32, i32* %tmp11, align 4
+  %tmp13 = sub nsw i32 %tmp12, %arg1
+  store i32 %tmp13, i32* %tmp11, align 4
+  %tmp14 = or i64 %tmp10, 1
+  %tmp15 = getelementptr inbounds i32, i32* %arg, i64 %tmp14
+  %tmp16 = load i32, i32* %tmp15, align 4
+  %tmp17 = mul nsw i32 %tmp16, %arg1
+  store i32 %tmp17, i32* %tmp15, align 4
+  %tmp18 = add nuw nsw i64 %tmp8, 1
+  %tmp19 = icmp eq i64 %tmp18, %tmp4
+  br i1 %tmp19, label %bb5, label %bb7
+}
+
+; CHECK: %t10 = ashr exact i128 %t9, 1
+; CHECK-NEXT: --> {{.*}} Exits: (sext i127 (-633825300114114700748351602688 + (633825300114114700748351602688 * (zext i32 %arg5 to i127))) to i128)
+; CHECK: %t14 = or i128 %t10, 1
+; CHECK-NEXT: --> {{.*}} Exits: (1 + (sext i127 (-633825300114114700748351602688 + (633825300114114700748351602688 * (zext i32 %arg5 to i127))) to i128))<nsw>
+; CHECK: Loop %bb7: backedge-taken count is (-1 + (zext i32 %arg5 to i128))<nsw>
+; CHECK-NEXT: Loop %bb7: max backedge-taken count is -1
+; CHECK-NEXT: Loop %bb7: Predicated backedge-taken count is (-1 + (zext i32 %arg5 to i128))<nsw>
+
+define void @goo(i32* nocapture %arg3, i32 %arg4, i32 %arg5) {
+bb:
+  %t = icmp sgt i32 %arg5, 0
+  br i1 %t, label %bb3, label %bb6
+
+bb3:                                              ; preds = %bb
+  %t4 = zext i32 %arg5 to i128
+  br label %bb7
+
+bb5:                                              ; preds = %bb7
+  br label %bb6
+
+bb6:                                              ; preds = %bb5, %bb
+  ret void
+
+bb7:                                              ; preds = %bb7, %bb3
+  %t8 = phi i128 [ %t18, %bb7 ], [ 0, %bb3 ]
+  %t9 = shl i128 %t8, 100
+  %t10 = ashr exact i128 %t9, 1
+  %t11 = getelementptr inbounds i32, i32* %arg3, i128 %t10
+  %t12 = load i32, i32* %t11, align 4
+  %t13 = sub nsw i32 %t12, %arg4
+  store i32 %t13, i32* %t11, align 4
+  %t14 = or i128 %t10, 1
+  %t15 = getelementptr inbounds i32, i32* %arg3, i128 %t14
+  %t16 = load i32, i32* %t15, align 4
+  %t17 = mul nsw i32 %t16, %arg4
+  store i32 %t17, i32* %t15, align 4
+  %t18 = add nuw nsw i128 %t8, 1
+  %t19 = icmp eq i128 %t18, %t4
+  br i1 %t19, label %bb5, label %bb7
+}
diff --git a/test/Analysis/ScalarEvolution/sext-zero.ll b/test/Analysis/ScalarEvolution/sext-zero.ll
new file mode 100644
index 0000000000000000000000000000000000000000..cac42638e959289058bf3841aa1b0bfe50f327ee
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/sext-zero.ll
@@ -0,0 +1,39 @@
+; RUN: opt < %s -analyze -scalar-evolution | FileCheck %s
+
+; CHECK:  %tmp9 = shl i64 %tmp8, 33
+; CHECK-NEXT:  -->  {{.*}} Exits: (-8589934592 + (8589934592 * (zext i32 %arg2 to i64)))
+; CHECK-NEXT:  %tmp10 = ashr exact i64 %tmp9, 0
+; CHECK-NEXT:  -->  {{.*}} Exits: (-8589934592 + (8589934592 * (zext i32 %arg2 to i64)))
+
+define void @foo(i32* nocapture %arg, i32 %arg1, i32 %arg2) {
+bb:
+  %tmp = icmp sgt i32 %arg2, 0
+  br i1 %tmp, label %bb3, label %bb6
+
+bb3:                                              ; preds = %bb
+  %tmp4 = zext i32 %arg2 to i64
+  br label %bb7
+
+bb5:                                              ; preds = %bb7
+  br label %bb6
+
+bb6:                                              ; preds = %bb5, %bb
+  ret void
+
+bb7:                                              ; preds = %bb7, %bb3
+  %tmp8 = phi i64 [ %tmp18, %bb7 ], [ 0, %bb3 ]
+  %tmp9 = shl i64 %tmp8, 33
+  %tmp10 = ashr exact i64 %tmp9, 0
+  %tmp11 = getelementptr inbounds i32, i32* %arg, i64 %tmp10
+  %tmp12 = load i32, i32* %tmp11, align 4
+  %tmp13 = sub nsw i32 %tmp12, %arg1
+  store i32 %tmp13, i32* %tmp11, align 4
+  %tmp14 = or i64 %tmp10, 1
+  %tmp15 = getelementptr inbounds i32, i32* %arg, i64 %tmp14
+  %tmp16 = load i32, i32* %tmp15, align 4
+  %tmp17 = mul nsw i32 %tmp16, %arg1
+  store i32 %tmp17, i32* %tmp15, align 4
+  %tmp18 = add nuw nsw i64 %tmp8, 1
+  %tmp19 = icmp eq i64 %tmp18, %tmp4
+  br i1 %tmp19, label %bb5, label %bb7
+}
diff --git a/test/Analysis/ScalarEvolution/tripmultiple_calculation.ll b/test/Analysis/ScalarEvolution/tripmultiple_calculation.ll
new file mode 100644
index 0000000000000000000000000000000000000000..133532e31a5bef76dae20ef83eb09f008c6f7080
--- /dev/null
+++ b/test/Analysis/ScalarEvolution/tripmultiple_calculation.ll
@@ -0,0 +1,125 @@
+; RUN: opt -S -analyze -scalar-evolution < %s 2>&1 | FileCheck %s
+
+; umin is represented using -1 * umax in scalar evolution. -1 is treated as the
+; constant of the multiply expression (-1 * ((-1 + (-1 * %a)) umax (-1 + (-1 * %b)))).
+; The greatest power-of-2 divisor is obtained by evaluating the minimum number of
+; trailing zeros of the trip count expression.
+;
+; int foo(uint32_t a, uint32_t b, uint32_t *c) {
+;   for (uint32_t i = 0; i < (uint32_t)(a < b ? a : b) + 1; i++)
+;     c[i] = i;
+;   return 0;
+; }
+;
+; CHECK: Loop %for.body: Trip multiple is 1
+
+define i32 @foo(i32 %a, i32 %b, i32* %c) {
+entry:
+  %cmp = icmp ult i32 %a, %b
+  %cond = select i1 %cmp, i32 %a, i32 %b          ; unsigned minimum of %a and %b
+  %add = add i32 %cond, 1                         ; loop bound: umin(%a, %b) + 1, no nuw/nsw flags
+  %cmp18 = icmp eq i32 %add, 0                    ; skip the loop entirely when the bound is 0
+  br i1 %cmp18, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret i32 0
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %c, i32 %i.09
+  store i32 %i.09, i32* %arrayidx, align 4        ; c[i] = i
+  %inc = add nuw i32 %i.09, 1
+  %cmp1 = icmp ult i32 %inc, %add                 ; keep looping while i + 1 < bound (unsigned)
+  br i1 %cmp1, label %for.body, label %for.cond.cleanup.loopexit
+}
+
+; Overflow may happen for the multiply expression n * 3; verify that the trip
+; multiple is set to 1 if NUW/NSW are not set.
+;
+; __attribute__((noinline)) void a(unsigned n) {
+;   #pragma unroll(3)
+;   for (unsigned i = 0; i != n * 3; ++i)
+;     printf("TEST%u\n", i);
+; }
+; int main() { a(2863311531U); }
+;
+; CHECK: Loop %for.body: Trip multiple is 1
+
+@.str2 = private unnamed_addr constant [8 x i8] c"TEST%u\0A\00", align 1
+
+define void @foo2(i32 %n) {
+entry:
+  %mul = mul i32 %n, 3                            ; loop bound n * 3; no nuw/nsw flags on the multiply
+  %cmp4 = icmp eq i32 %mul, 0                     ; skip the loop entirely when the bound is 0
+  br i1 %cmp4, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.05 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %call = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str2, i32 0, i32 0), i32 %i.05)
+  %inc = add nuw i32 %i.05, 1
+  %cmp = icmp eq i32 %inc, %mul                   ; exit once i + 1 == n * 3
+  br i1 %cmp, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+declare i32 @printf(i8* nocapture readonly, ...)
+
+
+; If we cannot prove that the multiply expression 24 * n does not overflow,
+; the greatest power-of-2 divisor is returned. Even if overflow happens,
+; the trip count is still divisible by the greatest power-of-2 divisor.
+;
+; CHECK: Loop %l3: Trip multiple is 8
+
+declare void @f()
+
+define i32 @foo3(i32 %n) {
+entry:
+  %loop_ctl = mul i32 %n, 24                      ; loop bound 24 * n (24 = 8 * 3); no nuw/nsw flags
+  br label %l3
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+  call void @f()
+  %inc = add i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, %loop_ctl         ; exit once the incremented counter equals the bound
+  br i1 %exitcond, label %exit, label %l3
+
+exit:
+  ret i32 0
+}
+
+; If the trip count is a constant, verify that we obtained the trip
+; count itself. For huge trip counts, or zero, we return 1.
+;
+; CHECK: Loop %l3: Trip multiple is 3
+
+define i32 @foo4(i32 %n) {
+entry:                                            ; %n is not used; the loop bound below is a constant
+  br label %l3
+
+l3:
+  %x.0 = phi i32 [ 0, %entry ], [ %inc, %l3 ]
+  call void @f()
+  %inc = add i32 %x.0, 1
+  %exitcond = icmp eq i32 %inc, 3                 ; constant bound: the body runs for %x.0 = 0, 1, 2
+  br i1 %exitcond, label %exit, label %l3
+
+exit:
+  ret i32 0
+}
+
diff --git a/test/Analysis/ScalarEvolution/zext-wrap.ll b/test/Analysis/ScalarEvolution/zext-wrap.ll
index 5bc149e2309a2b8598c360c81ccb6873329d84a1..34462208fbb317b29e2b4ab6d81bd1de4a07753f 100644
--- a/test/Analysis/ScalarEvolution/zext-wrap.ll
+++ b/test/Analysis/ScalarEvolution/zext-wrap.ll
@@ -6,6 +6,10 @@ entry:
         br label %bb.i
 
 bb.i:           ; preds = %bb1.i, %bb.nph
+; We should be able to find the range for this expression.
+; CHECK: %l_95.0.i1 = phi i8
+; CHECK: -->  {0,+,-1}<%bb.i> U: [2,1) S: [2,1){{ *}}Exits: 2
+
         %l_95.0.i1 = phi i8 [ %tmp1, %bb.i ], [ 0, %entry ]
 
 ; This cast shouldn't be folded into the addrec.
diff --git a/test/Analysis/ValueTracking/known-nonnull-at.ll b/test/Analysis/ValueTracking/known-nonnull-at.ll
index 8a0d1f3aff3bf20cf1922b685f13ba99425bdc27..93ef4f8c4c48a49031a0381a28b881d2469fb428 100644
--- a/test/Analysis/ValueTracking/known-nonnull-at.ll
+++ b/test/Analysis/ValueTracking/known-nonnull-at.ll
@@ -8,8 +8,7 @@ declare void @bar(i8* %a, i8* nonnull %b)
 define i1 @caller1(i8* %x, i8* %y) {
 ; CHECK-LABEL: @caller1(
 ; CHECK-NEXT:    call void @bar(i8* %x, i8* %y)
-; CHECK-NEXT:    [[NULL_CHECK:%.*]] = icmp eq i8* %y, null
-; CHECK-NEXT:    ret i1 [[NULL_CHECK]]
+; CHECK-NEXT:    ret i1 false
 ;
   call void @bar(i8* %x, i8* %y)
   %null_check = icmp eq i8* %y, null
@@ -34,24 +33,68 @@ define i1 @caller2(i8* %x, i8* %y) {
 define i1 @caller3(i8* %x, i8* %y) {
 ; CHECK-LABEL: @caller3(
 ; CHECK-NEXT:    call void @bar(i8* %x, i8* %y)
-; CHECK-NEXT:    [[NULL_CHECK:%.*]] = icmp ne i8* %y, null
-; CHECK-NEXT:    ret i1 [[NULL_CHECK]]
+; CHECK-NEXT:    ret i1 true
 ;
   call void @bar(i8* %x, i8* %y)
   %null_check = icmp ne i8* %y, null
   ret i1 %null_check
 }
 
-; Don't know anything about 'y'.
+; FIXME: The call is guaranteed to execute, so 'y' must be nonnull throughout.
 
 define i1 @caller4(i8* %x, i8* %y) {
 ; CHECK-LABEL: @caller4(
-; CHECK-NEXT:    call void @bar(i8* %y, i8* %x)
 ; CHECK-NEXT:    [[NULL_CHECK:%.*]] = icmp ne i8* %y, null
+; CHECK-NEXT:    call void @bar(i8* %x, i8* %y)
 ; CHECK-NEXT:    ret i1 [[NULL_CHECK]]
 ;
-  call void @bar(i8* %y, i8* %x)
   %null_check = icmp ne i8* %y, null
+  call void @bar(i8* %x, i8* %y)
+  ret i1 %null_check
+}
+
+; The call to bar() does not dominate the null check, so no change.
+
+define i1 @caller5(i8* %x, i8* %y) {
+; CHECK-LABEL: @caller5(
+; CHECK-NEXT:    [[NULL_CHECK:%.*]] = icmp eq i8* %y, null
+; CHECK-NEXT:    br i1 [[NULL_CHECK]], label %t, label %f
+; CHECK:       t:
+; CHECK-NEXT:    ret i1 [[NULL_CHECK]]
+; CHECK:       f:
+; CHECK-NEXT:    call void @bar(i8* %x, i8* %y)
+; CHECK-NEXT:    ret i1 [[NULL_CHECK]]
+;
+  %null_check = icmp eq i8* %y, null              ; evaluated in the entry block, before any call to @bar
+  br i1 %null_check, label %t, label %f
+t:
   ret i1 %null_check
+f:
+  call void @bar(i8* %x, i8* %y)                  ; only reached on the %f path, after the compare
+  ret i1 %null_check
+}
+
+; Make sure that an invoke works similarly to a call.
+
+declare i32 @esfp(...)
+
+define i1 @caller6(i8* %x, i8* %y) personality i8* bitcast (i32 (...)* @esfp to i8*){
+; CHECK-LABEL: @caller6(
+; CHECK-NEXT:    invoke void @bar(i8* %x, i8* nonnull %y)
+; CHECK-NEXT:    to label %cont unwind label %exc
+; CHECK:       cont:
+; CHECK-NEXT:    ret i1 false
+;
+  invoke void @bar(i8* %x, i8* nonnull %y)        ; %y is marked nonnull at this call site
+    to label %cont unwind label %exc
+
+cont:                                             ; normal destination of the invoke
+  %null_check = icmp eq i8* %y, null              ; the test expects this compare to fold to false
+  ret i1 %null_check
+
+exc:                                              ; unwind destination of the invoke
+  %lp = landingpad { i8*, i32 }
+    filter [0 x i8*] zeroinitializer
+  unreachable
 }
 
diff --git a/test/Assembler/alloca-addrspace-parse-error-0.ll b/test/Assembler/alloca-addrspace-parse-error-0.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a9db43c08d2e8648698307f1428785606153ca6c
--- /dev/null
+++ b/test/Assembler/alloca-addrspace-parse-error-0.ll
@@ -0,0 +1,11 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+target datalayout = "A1"
+
+; CHECK: :8:3: error: expected metadata after comma
+define void @use_alloca() {
+  %alloca = alloca i32, addrspace(1),             ; dangling comma: nothing follows addrspace(1) on this line
+  ret void                                        ; file line 8, column 3: where the expected error is reported
+}
+
+!0 = !{}
diff --git a/test/Assembler/alloca-addrspace-parse-error-1.ll b/test/Assembler/alloca-addrspace-parse-error-1.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5209f417da63725069f0b250a56fe019a73942ed
--- /dev/null
+++ b/test/Assembler/alloca-addrspace-parse-error-1.ll
@@ -0,0 +1,12 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+target datalayout = "A1"
+
+; addrspace and align in wrong order
+; CHECK: :8:39: error: expected metadata after comma
+define void @use_alloca() {
+  %alloca = alloca i32, addrspace(1), align 4     ; `align` (column 39) after addrspace triggers the expected error
+  ret void
+}
+
+!0 = !{}
diff --git a/test/Assembler/alloca-addrspace0.ll b/test/Assembler/alloca-addrspace0.ll
new file mode 100644
index 0000000000000000000000000000000000000000..09b7a323f62fea9fa5a143e7478a84de0ad0e642
--- /dev/null
+++ b/test/Assembler/alloca-addrspace0.ll
@@ -0,0 +1,24 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+target datalayout = "A0"
+; CHECK: target datalayout = "A0"
+
+
+; CHECK: %alloca_scalar_no_align = alloca i32
+; CHECK-NEXT: %alloca_scalar_align4 = alloca i32, align 4
+; CHECK-NEXT: %alloca_scalar_no_align_metadata = alloca i32, !foo !0
+; CHECK-NEXT: %alloca_scalar_align4_metadata = alloca i32, align 4, !foo !0
+; CHECK-NEXT: %alloca_inalloca_scalar_no_align = alloca inalloca i32
+; CHECK-NEXT: %alloca_inalloca_scalar_align4_metadata = alloca inalloca i32, align 4, !foo !0
+define void @use_alloca() { ; every alloca spells out addrspace(0); the expected llvm-dis output above omits it
+  %alloca_scalar_no_align = alloca i32, addrspace(0)
+  %alloca_scalar_align4 = alloca i32, align 4, addrspace(0)
+  %alloca_scalar_no_align_metadata = alloca i32, addrspace(0), !foo !0
+  %alloca_scalar_align4_metadata = alloca i32, align 4, addrspace(0), !foo !0
+  %alloca_inalloca_scalar_no_align = alloca inalloca i32, addrspace(0)
+  %alloca_inalloca_scalar_align4_metadata = alloca inalloca i32, align 4, addrspace(0), !foo !0
+
+  ret void
+}
+
+!0 = !{}
diff --git a/test/Assembler/auto_upgrade_intrinsics.ll b/test/Assembler/auto_upgrade_intrinsics.ll
index 2f0f4f779e7c0db0c54c79e68da235b2be173de5..d00fe5882bcd5dead6f6dccade125ccd45e7d456 100644
--- a/test/Assembler/auto_upgrade_intrinsics.ll
+++ b/test/Assembler/auto_upgrade_intrinsics.ll
@@ -53,11 +53,20 @@ entry:
 
 define i32 @test.objectsize() {
 ; CHECK-LABEL: @test.objectsize(
-; CHECK: @llvm.objectsize.i32.p0i8
+; CHECK: @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false, i1 false)
   %s = call i32 @llvm.objectsize.i32(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false)
   ret i32 %s
 }
 
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1) nounwind readonly
+define i64 @test.objectsize.2() {
+; CHECK-LABEL: @test.objectsize.2(
+; CHECK: @llvm.objectsize.i64.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false, i1 false)
+  %s = call i64 @llvm.objectsize.i64.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false) ; two-operand form; the expected output has an extra trailing `i1 false`
+  ret i64 %s
+}
+
+
 declare <2 x double> @llvm.masked.load.v2f64(<2 x double>* %ptrs, i32, <2 x i1> %mask, <2 x double> %src0)
 
 define <2 x double> @tests.masked.load(<2 x double>* %ptr, <2 x i1> %mask, <2 x double> %passthru)  {
@@ -101,6 +110,25 @@ define void @test.stackprotectorcheck() {
   ret void
 }
 
+declare void  @llvm.lifetime.start(i64, i8* nocapture) nounwind readonly
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+define void @tests.lifetime.start.end() {
+  ; CHECK-LABEL: @tests.lifetime.start.end(
+  %a = alloca i8
+  call void @llvm.lifetime.start(i64 1, i8* %a)   ; unsuffixed intrinsic name; expected output uses the .p0i8 overload
+  ; CHECK: call void @llvm.lifetime.start.p0i8(i64 1, i8* %a)
+  store i8 0, i8* %a
+  call void @llvm.lifetime.end(i64 1, i8* %a)     ; unsuffixed intrinsic name; expected output uses the .p0i8 overload
+  ; CHECK: call void @llvm.lifetime.end.p0i8(i64 1, i8* %a)
+  ret void
+}
+
+
 ; This is part of @test.objectsize(), since llvm.objectsize declaration gets
 ; emitted at the end.
 ; CHECK: declare i32 @llvm.objectsize.i32.p0i8
+
+
+; CHECK: declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+; CHECK: declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
diff --git a/test/Assembler/datalayout-alloca-addrspace-mismatch-0.ll b/test/Assembler/datalayout-alloca-addrspace-mismatch-0.ll
new file mode 100644
index 0000000000000000000000000000000000000000..31920183c6595c7f35599d096d52f30b6ee644c4
--- /dev/null
+++ b/test/Assembler/datalayout-alloca-addrspace-mismatch-0.ll
@@ -0,0 +1,9 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+target datalayout = "A1"
+
+; CHECK: :7:41: error: address space must match datalayout
+define void @use_alloca() {
+  %alloca_scalar_no_align = alloca i32, addrspace(2) ; addrspace(2) vs. datalayout "A1"; error expected at column 41
+  ret void
+}
diff --git a/test/Assembler/datalayout-alloca-addrspace-mismatch-1.ll b/test/Assembler/datalayout-alloca-addrspace-mismatch-1.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8778a05291c526325ac86c1266d0a04943467e0a
--- /dev/null
+++ b/test/Assembler/datalayout-alloca-addrspace-mismatch-1.ll
@@ -0,0 +1,9 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+target datalayout = "A1"
+
+; CHECK: :7:50: error: address space must match datalayout
+define void @use_alloca() {
+  %alloca_scalar_no_align = alloca i32, align 4, addrspace(2) ; named "no_align" yet has align 4; renaming would shift the expected column (50)
+  ret void
+}
diff --git a/test/Assembler/datalayout-alloca-addrspace-mismatch-2.ll b/test/Assembler/datalayout-alloca-addrspace-mismatch-2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b6e2738a4f6ef65ff9889a0bf5ec31fc7f41fd57
--- /dev/null
+++ b/test/Assembler/datalayout-alloca-addrspace-mismatch-2.ll
@@ -0,0 +1,11 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+target datalayout = "A1"
+
+; CHECK: :7:50: error: address space must match datalayout
+define void @use_alloca() {
+  %alloca_scalar_no_align = alloca i32, align 4, addrspace(2), !foo !0 ; named "no_align" yet has align 4; renaming would shift the expected column (50)
+  ret void
+}
+
+!0 = !{}
diff --git a/test/Assembler/datalayout-alloca-addrspace.ll b/test/Assembler/datalayout-alloca-addrspace.ll
new file mode 100644
index 0000000000000000000000000000000000000000..578b7ef0b37d71dc01ae5cea991b95159bf70c9b
--- /dev/null
+++ b/test/Assembler/datalayout-alloca-addrspace.ll
@@ -0,0 +1,23 @@
+; RUN: llvm-as < %s | llvm-dis | FileCheck %s
+
+target datalayout = "A1"
+; CHECK: target datalayout = "A1"
+
+; CHECK: %alloca_scalar_no_align = alloca i32, addrspace(1)
+; CHECK-NEXT: %alloca_scalar_align4 = alloca i32, align 4, addrspace(1)
+; CHECK-NEXT: %alloca_scalar_no_align_metadata = alloca i32, addrspace(1), !foo !0
+; CHECK-NEXT: %alloca_scalar_align4_metadata = alloca i32, align 4, addrspace(1), !foo !0
+; CHECK-NEXT: %alloca_inalloca_scalar_no_align = alloca inalloca i32, addrspace(1)
+; CHECK-NEXT: %alloca_inalloca_scalar_align4_metadata = alloca inalloca i32, align 4, addrspace(1), !foo !0
+define void @use_alloca() { ; every alloca uses addrspace(1), matching datalayout "A1"; the expected output keeps it
+  %alloca_scalar_no_align = alloca i32, addrspace(1)
+  %alloca_scalar_align4 = alloca i32, align 4, addrspace(1)
+  %alloca_scalar_no_align_metadata = alloca i32, addrspace(1), !foo !0
+  %alloca_scalar_align4_metadata = alloca i32, align 4, addrspace(1), !foo !0
+  %alloca_inalloca_scalar_no_align = alloca inalloca i32, addrspace(1)
+  %alloca_inalloca_scalar_align4_metadata = alloca inalloca i32, align 4, addrspace(1), !foo !0
+
+  ret void
+}
+
+!0 = !{}
diff --git a/test/Assembler/debug-info.ll b/test/Assembler/debug-info.ll
index da19678dbf1e36ecb845dd5582ae84dc823ff64e..6be3a308e6275abfbca6b33eb862b401c3b63f07 100644
--- a/test/Assembler/debug-info.ll
+++ b/test/Assembler/debug-info.ll
@@ -37,8 +37,8 @@
 !13 = distinct !{}
 !14 = !DIFile(filename: "", directory: "")
 
-; CHECK-NEXT: !13 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !6, size: 32, align: 32)
-!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 32, align: 32)
+; CHECK-NEXT: !13 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !6, size: 32, align: 32, dwarfAddressSpace: 1)
+!15 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 32, align: 32, dwarfAddressSpace: 1)
 
 ; CHECK-NEXT: !14 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyType", file: !10, line: 2, size: 32, align: 32, identifier: "MangledMyType")
 ; CHECK-NEXT: !15 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "Base", scope: !14, file: !10, line: 3, size: 128, align: 32, offset: 64, flags: DIFlagPublic, elements: !16, runtimeLang: DW_LANG_C_plus_plus_11, vtableHolder: !15, templateParams: !18, identifier: "MangledBase")
@@ -84,4 +84,4 @@
 ; CHECK-NEXT: !33 = !DIFile(filename: "file", directory: "dir")
 !35 = !DIFile(filename: "file", directory: "dir", checksumkind: CSK_MD5, checksum: "000102030405060708090a0b0c0d0e0f")
 !36 = !DIFile(filename: "file", directory: "dir", checksumkind: CSK_None)
-!37 = !DIFile(filename: "file", directory: "dir", checksumkind: CSK_None, checksum: "")
\ No newline at end of file
+!37 = !DIFile(filename: "file", directory: "dir", checksumkind: CSK_None, checksum: "")
diff --git a/test/Assembler/diexpression.ll b/test/Assembler/diexpression.ll
index dd69c0edecc2484f56bda9b3ef68ae565c238e2b..c2fa3ee14c23482cee863b6c3479b8e6054cbff4 100644
--- a/test/Assembler/diexpression.ll
+++ b/test/Assembler/diexpression.ll
@@ -1,16 +1,18 @@
 ; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s
 ; RUN: verify-uselistorder %s
 
-; CHECK: !named = !{!0, !1, !2, !3, !4}
-!named = !{!0, !1, !2, !3, !4}
+; CHECK: !named = !{!0, !1, !2, !3, !4, !5}
+!named = !{!0, !1, !2, !3, !4, !5}
 
 ; CHECK:      !0 = !DIExpression()
 ; CHECK-NEXT: !1 = !DIExpression(DW_OP_deref)
 ; CHECK-NEXT: !2 = !DIExpression(DW_OP_plus, 3)
 ; CHECK-NEXT: !3 = !DIExpression(DW_OP_LLVM_fragment, 3, 7)
 ; CHECK-NEXT: !4 = !DIExpression(DW_OP_deref, DW_OP_plus, 3, DW_OP_LLVM_fragment, 3, 7)
+; CHECK-NEXT: !5 = !DIExpression(DW_OP_constu, 2, DW_OP_swap, DW_OP_xderef)
 !0 = !DIExpression()
 !1 = !DIExpression(DW_OP_deref)
 !2 = !DIExpression(DW_OP_plus, 3)
 !3 = !DIExpression(DW_OP_LLVM_fragment, 3, 7)
 !4 = !DIExpression(DW_OP_deref, DW_OP_plus, 3, DW_OP_LLVM_fragment, 3, 7)
+!5 = !DIExpression(DW_OP_constu, 2, DW_OP_swap, DW_OP_xderef)
diff --git a/test/Assembler/fast-math-flags.ll b/test/Assembler/fast-math-flags.ll
index f0d3ecc761d1f9c98071029708253f3636da5b88..4ef3607e1d0067e8a102630ce49edc97845ba773 100644
--- a/test/Assembler/fast-math-flags.ll
+++ b/test/Assembler/fast-math-flags.ll
@@ -74,6 +74,18 @@ entry:
   ret float %e
 }
 
+; CHECK: @contract(
+define float @contract(float %x, float %y) {
+entry:                                            ; each instruction below carries the `contract` flag and is expected to print back unchanged
+; CHECK: %a = fsub contract float %x, %y
+  %a = fsub contract float %x, %y
+; CHECK: %b = fadd contract float %x, %y
+  %b = fadd contract float %x, %y
+; CHECK: %c = fmul contract float %a, %b
+  %c = fmul contract float %a, %b
+  ret float %c
+}
+
 ; CHECK: no_nan_inf
 define float @no_nan_inf(float %x, float %y) {
 entry:
diff --git a/test/Assembler/invalid-datalayout-alloca-addrspace.ll b/test/Assembler/invalid-datalayout-alloca-addrspace.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9a0e07acf7669ee340acd99ec620d2e06e8fdd13
--- /dev/null
+++ b/test/Assembler/invalid-datalayout-alloca-addrspace.ll
@@ -0,0 +1,4 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+target datalayout = "A16777216"
+; CHECK: Invalid address space, must be a 24bit integer
diff --git a/test/Bitcode/DIGlobalVariableExpression.ll b/test/Bitcode/DIGlobalVariableExpression.ll
index 0bb0488b131f986cca52efeb8d861d2224f850d7..f6796bbdb7a054fdbe6e95c1a57a4c91fe549a8c 100644
--- a/test/Bitcode/DIGlobalVariableExpression.ll
+++ b/test/Bitcode/DIGlobalVariableExpression.ll
@@ -7,12 +7,16 @@
 ; CHECK: @h = common global i32 0, align 4, !dbg ![[H:[0-9]+]]
 ; CHECK: ![[G]] = {{.*}}!DIGlobalVariableExpression(var: ![[GVAR:[0-9]+]], expr: ![[GEXPR:[0-9]+]])
 ; CHECK: ![[GVAR]] = distinct !DIGlobalVariable(name: "g",
+; CHECK: DICompileUnit({{.*}}, imports: ![[IMPORTS:[0-9]+]]
 ; CHECK: !DIGlobalVariableExpression(var: ![[CVAR:[0-9]+]], expr: ![[CEXPR:[0-9]+]])
 ; CHECK: ![[CVAR]] = distinct !DIGlobalVariable(name: "c",
 ; CHECK: ![[CEXPR]] = !DIExpression(DW_OP_constu, 23, DW_OP_stack_value)
-; CHECK: ![[H]] = {{.*}}!DIGlobalVariableExpression(var: ![[HVAR:[0-9]+]])
-; CHECK: ![[HVAR]] = distinct !DIGlobalVariable(name: "h",
+; CHECK: ![[HVAR:[0-9]+]] = distinct !DIGlobalVariable(name: "h",
+; CHECK: ![[IMPORTS]] = !{![[CIMPORT:[0-9]+]]}
+; CHECK: ![[CIMPORT]] = !DIImportedEntity({{.*}}entity: ![[HVAR]]
 ; CHECK: ![[GEXPR]] = !DIExpression(DW_OP_plus, 1)
+; CHECK: ![[H]] = {{.*}}!DIGlobalVariableExpression(var: ![[HVAR]])
+
 @g = common global i32 0, align 4, !dbg !0
 @h = common global i32 0, align 4, !dbg !11
 
@@ -21,9 +25,9 @@
 !llvm.ident = !{!9}
 
 !0 = distinct !DIGlobalVariable(name: "g", scope: !1, file: !2, line: 1, type: !5, isLocal: false, isDefinition: true, expr: !DIExpression(DW_OP_plus, 1))
-!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !2, producer: "clang version 4.0.0 (trunk 286129) (llvm/trunk 286128)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !3, globals: !4)
+!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !2, producer: "clang version 4.0.0 (trunk 286129) (llvm/trunk 286128)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, imports: !3)
 !2 = !DIFile(filename: "a.c", directory: "/")
-!3 = !{}
+!3 = !{!12}
 !4 = !{!0, !10, !11}
 !5 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
 !6 = !{i32 2, !"Dwarf Version", i32 4}
@@ -32,3 +36,4 @@
 !9 = !{!"clang version 4.0.0 (trunk 286129) (llvm/trunk 286128)"}
 !10 = distinct !DIGlobalVariable(name: "c", scope: !1, file: !2, line: 1, type: !5, isLocal: false, isDefinition: true, expr: !DIExpression(DW_OP_constu, 23, DW_OP_stack_value))
 !11 = distinct !DIGlobalVariable(name: "h", scope: !1, file: !2, line: 2, type: !5, isLocal: false, isDefinition: true)
+!12 = !DIImportedEntity(tag: DW_TAG_imported_declaration, line: 1, scope: !1, entity: !11)
diff --git a/test/Bitcode/DIGlobalVariableExpression.ll.bc b/test/Bitcode/DIGlobalVariableExpression.ll.bc
index 54b6dbd6351a6f3fc00e8e18047e46c92c8aed52..0eeb3aaca825a55d8e9b152598e621a0603825cb 100644
Binary files a/test/Bitcode/DIGlobalVariableExpression.ll.bc and b/test/Bitcode/DIGlobalVariableExpression.ll.bc differ
diff --git a/test/Bitcode/DIGlobalVariableExpression2.ll b/test/Bitcode/DIGlobalVariableExpression2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..55974d5317dd524e5ba3e3ebf88cc95fb43aabc5
--- /dev/null
+++ b/test/Bitcode/DIGlobalVariableExpression2.ll
@@ -0,0 +1,31 @@
+; RUN: llvm-dis -o - %s.bc | FileCheck %s
+
+; CHECK: @g = common global i32 0, align 4, !dbg ![[G:[0-9]+]]
+; CHECK-DAG: ![[G]] = distinct !DIGlobalVariableExpression(var: ![[GVAR:[0-9]+]])
+; CHECK-DAG: distinct !DICompileUnit({{.*}}, globals: ![[GLOBS:[0-9]+]]
+; CHECK-DAG: ![[GLOBS]] = !{![[GEXPR:[0-9]+]]}
+; CHECK-DAG: ![[GEXPR]] = distinct !DIGlobalVariableExpression(var: ![[GVAR]])
+; CHECK-DAG: ![[GVAR]] = !DIGlobalVariable(name: "g",
+
+; Test the bitcode upgrade for DIGlobalVariable -> DIGlobalVariableExpression.
+
+; ModuleID = 'a.c'
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12.0"
+
+@g = common global i32 0, align 4
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!6, !7, !8}
+!llvm.ident = !{!9}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.7.0 (clang-stage1-configure-RA_build 241111)", isOptimized: false, runtimeVersion: 0, emissionKind: 1, enums: !2, retainedTypes: !2, subprograms: !2, globals: !3, imports: !2)
+!1 = !DIFile(filename: "a.c", directory: "/tmp")
+!2 = !{}
+!3 = !{!4}
+!4 = !DIGlobalVariable(name: "g", scope: !0, file: !1, line: 1, type: !5, isLocal: false, isDefinition: true, variable: i32* @g)
+!5 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!6 = !{i32 2, !"Dwarf Version", i32 2}
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!8 = !{i32 1, !"PIC Level", i32 2}
+!9 = !{!"clang version 3.7.0 (clang-stage1-configure-RA_build 241111)"}
diff --git a/test/Bitcode/DIGlobalVariableExpression2.ll.bc b/test/Bitcode/DIGlobalVariableExpression2.ll.bc
new file mode 100644
index 0000000000000000000000000000000000000000..5f6b398263c9eed83782b947418ba9358acd181c
Binary files /dev/null and b/test/Bitcode/DIGlobalVariableExpression2.ll.bc differ
diff --git a/test/Bitcode/compatibility-3.6.ll b/test/Bitcode/compatibility-3.6.ll
index 87958fc34183a9abc045c0181a88be8a49a28376..8d51ee11a209b1baa4f635c3e7bd006bd4c9d051 100644
--- a/test/Bitcode/compatibility-3.6.ll
+++ b/test/Bitcode/compatibility-3.6.ll
@@ -981,7 +981,7 @@ exit:
   ; CHECK: select <2 x i1> <i1 true, i1 false>, <2 x i8> <i8 2, i8 3>, <2 x i8> <i8 3, i8 2>
 
   call void @f.nobuiltin() builtin
-  ; CHECK: call void @f.nobuiltin() #33
+  ; CHECK: call void @f.nobuiltin() #34
 
   call fastcc noalias i32* @f.noalias() noinline
   ; CHECK: call fastcc noalias i32* @f.noalias() #11
@@ -1183,7 +1183,8 @@ define void @intrinsics.codegen() {
 ; CHECK: attributes #30 = { argmemonly nounwind readonly }
 ; CHECK: attributes #31 = { argmemonly nounwind }
 ; CHECK: attributes #32 = { nounwind readonly }
-; CHECK: attributes #33 = { builtin }
+; CHECK: attributes #33 = { inaccessiblemem_or_argmemonly nounwind }
+; CHECK: attributes #34 = { builtin }
 
 ;; Metadata
 
diff --git a/test/Bitcode/compatibility-3.7.ll b/test/Bitcode/compatibility-3.7.ll
index 4ae0aed2018196705c0ad6abc2767db477c0c00b..ebdf4c30587c973fcb7775d0f266287c93d77349 100644
--- a/test/Bitcode/compatibility-3.7.ll
+++ b/test/Bitcode/compatibility-3.7.ll
@@ -1022,7 +1022,7 @@ exit:
   ; CHECK: select <2 x i1> <i1 true, i1 false>, <2 x i8> <i8 2, i8 3>, <2 x i8> <i8 3, i8 2>
 
   call void @f.nobuiltin() builtin
-  ; CHECK: call void @f.nobuiltin() #36
+  ; CHECK: call void @f.nobuiltin() #37
 
   call fastcc noalias i32* @f.noalias() noinline
   ; CHECK: call fastcc noalias i32* @f.noalias() #12
@@ -1246,7 +1246,8 @@ define void @misc.metadata() {
 ; CHECK: attributes #33 = { argmemonly nounwind readonly }
 ; CHECK: attributes #34 = { argmemonly nounwind }
 ; CHECK: attributes #35 = { nounwind readonly }
-; CHECK: attributes #36 = { builtin }
+; CHECK: attributes #36 = { inaccessiblemem_or_argmemonly nounwind }
+; CHECK: attributes #37 = { builtin }
 
 ;; Metadata
 
diff --git a/test/Bitcode/compatibility-3.8.ll b/test/Bitcode/compatibility-3.8.ll
index 79c1ecfac9fc3e150e3b36d8c6807d507953adba..57ea3e068376f13ba0798665e47d312e2fe11aa5 100644
--- a/test/Bitcode/compatibility-3.8.ll
+++ b/test/Bitcode/compatibility-3.8.ll
@@ -1170,7 +1170,7 @@ exit:
   ; CHECK: select <2 x i1> <i1 true, i1 false>, <2 x i8> <i8 2, i8 3>, <2 x i8> <i8 3, i8 2>
 
   call void @f.nobuiltin() builtin
-  ; CHECK: call void @f.nobuiltin() #39
+  ; CHECK: call void @f.nobuiltin() #40
 
   call fastcc noalias i32* @f.noalias() noinline
   ; CHECK: call fastcc noalias i32* @f.noalias() #12
@@ -1556,7 +1556,8 @@ normal:
 ; CHECK: attributes #36 = { argmemonly nounwind readonly }
 ; CHECK: attributes #37 = { argmemonly nounwind }
 ; CHECK: attributes #38 = { nounwind readonly }
-; CHECK: attributes #39 = { builtin }
+; CHECK: attributes #39 = { inaccessiblemem_or_argmemonly nounwind }
+; CHECK: attributes #40 = { builtin }
 
 ;; Metadata
 
diff --git a/test/Bitcode/compatibility-3.9.ll b/test/Bitcode/compatibility-3.9.ll
index 300be3324e6adefcab839bc19c5368fe92e3ca81..2a6cfe14cdb14407a2b15c3ac7395505e4042e0f 100644
--- a/test/Bitcode/compatibility-3.9.ll
+++ b/test/Bitcode/compatibility-3.9.ll
@@ -1241,7 +1241,7 @@ exit:
   ; CHECK: select <2 x i1> <i1 true, i1 false>, <2 x i8> <i8 2, i8 3>, <2 x i8> <i8 3, i8 2>
 
   call void @f.nobuiltin() builtin
-  ; CHECK: call void @f.nobuiltin() #40
+  ; CHECK: call void @f.nobuiltin() #41
 
   call fastcc noalias i32* @f.noalias() noinline
   ; CHECK: call fastcc noalias i32* @f.noalias() #12
@@ -1588,7 +1588,7 @@ normal:
 }
 
 declare void @f.writeonly() writeonly
-; CHECK: declare void @f.writeonly() #39
+; CHECK: declare void @f.writeonly() #40
 
 ; CHECK: attributes #0 = { alignstack=4 }
 ; CHECK: attributes #1 = { alignstack=8 }
@@ -1629,8 +1629,9 @@ declare void @f.writeonly() writeonly
 ; CHECK: attributes #36 = { argmemonly nounwind readonly }
 ; CHECK: attributes #37 = { argmemonly nounwind }
 ; CHECK: attributes #38 = { nounwind readonly }
-; CHECK: attributes #39 = { writeonly }
-; CHECK: attributes #40 = { builtin }
+; CHECK: attributes #39 = { inaccessiblemem_or_argmemonly nounwind }
+; CHECK: attributes #40 = { writeonly }
+; CHECK: attributes #41 = { builtin }
 
 ;; Metadata
 
diff --git a/test/Bitcode/compatibility-4.0.ll b/test/Bitcode/compatibility-4.0.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c83c107a2927adc6be8cc91b6023c55e838f7ffc
--- /dev/null
+++ b/test/Bitcode/compatibility-4.0.ll
@@ -0,0 +1,1690 @@
+; Bitcode compatibility test for llvm 4.0.0
+;
+; N.B.: This is 4.0-compatible IR. The CHECK lines occasionally differ from
+;       the IR used to generate the bitcode, and may need to be updated.
+
+; RUN: llvm-dis < %s.bc | FileCheck %s
+
+target datalayout = "E"
+; CHECK: target datalayout = "E"
+
+target triple = "x86_64-apple-macosx10.10.0"
+; CHECK: target triple = "x86_64-apple-macosx10.10.0"
+
+;; Module-level assembly
+module asm "beep boop"
+; CHECK: module asm "beep boop"
+
+;; Comdats
+$comdat.any = comdat any
+; CHECK: $comdat.any = comdat any
+$comdat.exactmatch = comdat exactmatch
+; CHECK: $comdat.exactmatch = comdat exactmatch
+$comdat.largest = comdat largest
+; CHECK: $comdat.largest = comdat largest
+$comdat.noduplicates = comdat noduplicates
+; CHECK: $comdat.noduplicates = comdat noduplicates
+$comdat.samesize = comdat samesize
+; CHECK: $comdat.samesize = comdat samesize
+
+;; Constants
+@const.true = constant i1 true
+; CHECK: @const.true = constant i1 true
+@const.false = constant i1 false
+; CHECK: @const.false = constant i1 false
+@const.int = constant i32 zeroinitializer
+; CHECK: @const.int = constant i32 0
+@const.float = constant double 0.0
+; CHECK: @const.float = constant double 0.0
+@const.null = constant i8* null
+; CHECK: @const.null = constant i8* null
+%const.struct.type = type { i32, i8 }
+%const.struct.type.packed = type <{ i32, i8 }>
+@const.struct = constant %const.struct.type { i32 -1, i8 undef }
+; CHECK: @const.struct = constant %const.struct.type { i32 -1, i8 undef }
+@const.struct.packed = constant %const.struct.type.packed <{ i32 -1, i8 1 }>
+; CHECK: @const.struct.packed = constant %const.struct.type.packed <{ i32 -1, i8 1 }>
+
+; CHECK: @constant.array.i8  = constant [3 x i8] c"\00\01\00"
+@constant.array.i8  = constant [3 x i8] [i8 -0, i8 1, i8 0]
+; CHECK: @constant.array.i16 = constant [3 x i16] [i16 0, i16 1, i16 0]
+@constant.array.i16 = constant [3 x i16] [i16 -0, i16 1, i16 0]
+; CHECK: @constant.array.i32 = constant [3 x i32] [i32 0, i32 1, i32 0]
+@constant.array.i32 = constant [3 x i32] [i32 -0, i32 1, i32 0]
+; CHECK: @constant.array.i64 = constant [3 x i64] [i64 0, i64 1, i64 0]
+@constant.array.i64 = constant [3 x i64] [i64 -0, i64 1, i64 0]
+; CHECK: @constant.array.f16 = constant [3 x half] [half 0xH8000, half 0xH3C00, half 0xH0000]
+@constant.array.f16 = constant [3 x half] [half -0.0, half 1.0, half 0.0]
+; CHECK: @constant.array.f32 = constant [3 x float] [float -0.000000e+00, float 1.000000e+00, float 0.000000e+00]
+@constant.array.f32 = constant [3 x float] [float -0.0, float 1.0, float 0.0]
+; CHECK: @constant.array.f64 = constant [3 x double] [double -0.000000e+00, double 1.000000e+00, double 0.000000e+00]
+@constant.array.f64 = constant [3 x double] [double -0.0, double 1.0, double 0.0]
+
+; CHECK: @constant.vector.i8  = constant <3 x i8>  <i8 0, i8 1, i8 0>
+@constant.vector.i8  = constant <3 x i8>  <i8 -0, i8 1, i8 0>
+; CHECK: @constant.vector.i16 = constant <3 x i16> <i16 0, i16 1, i16 0>
+@constant.vector.i16 = constant <3 x i16> <i16 -0, i16 1, i16 0>
+; CHECK: @constant.vector.i32 = constant <3 x i32> <i32 0, i32 1, i32 0>
+@constant.vector.i32 = constant <3 x i32> <i32 -0, i32 1, i32 0>
+; CHECK: @constant.vector.i64 = constant <3 x i64> <i64 0, i64 1, i64 0>
+@constant.vector.i64 = constant <3 x i64> <i64 -0, i64 1, i64 0>
+; CHECK: @constant.vector.f16 = constant <3 x half> <half 0xH8000, half 0xH3C00, half 0xH0000>
+@constant.vector.f16 = constant <3 x half> <half -0.0, half 1.0, half 0.0>
+; CHECK: @constant.vector.f32 = constant <3 x float> <float -0.000000e+00, float 1.000000e+00, float 0.000000e+00>
+@constant.vector.f32 = constant <3 x float> <float -0.0, float 1.0, float 0.0>
+; CHECK: @constant.vector.f64 = constant <3 x double> <double -0.000000e+00, double 1.000000e+00, double 0.000000e+00>
+@constant.vector.f64 = constant <3 x double> <double -0.0, double 1.0, double 0.0>
+
+;; Global Variables
+; Format: [@<GlobalVarName> =] [Linkage] [Visibility] [DLLStorageClass]
+;         [ThreadLocal] [(unnamed_addr|local_unnamed_addr)] [AddrSpace] [ExternallyInitialized]
+;         <global | constant> <Type> [<InitializerConstant>]
+;         [, section "name"] [, comdat [($name)]] [, align <Alignment>]
+
+; Global Variables -- Simple
+@g1 = global i32 0
+; CHECK: @g1 = global i32 0
+@g2 = constant i32 0
+; CHECK: @g2 = constant i32 0
+
+; Global Variables -- Linkage
+@g.private = private global i32 0
+; CHECK: @g.private = private global i32 0
+@g.internal = internal global i32 0
+; CHECK: @g.internal = internal global i32 0
+@g.available_externally = available_externally global i32 0
+; CHECK: @g.available_externally = available_externally global i32 0
+@g.linkonce = linkonce global i32 0
+; CHECK: @g.linkonce = linkonce global i32 0
+@g.weak = weak global i32 0
+; CHECK: @g.weak = weak global i32 0
+@g.common = common global i32 0
+; CHECK: @g.common = common global i32 0
+@g.appending = appending global [4 x i8] c"test"
+; CHECK: @g.appending = appending global [4 x i8] c"test"
+@g.extern_weak = extern_weak global i32
+; CHECK: @g.extern_weak = extern_weak global i32
+@g.linkonce_odr = linkonce_odr global i32 0
+; CHECK: @g.linkonce_odr = linkonce_odr global i32 0
+@g.weak_odr = weak_odr global i32 0
+; CHECK: @g.weak_odr = weak_odr global i32 0
+@g.external = external global i32
+; CHECK: @g.external = external global i32
+
+; Global Variables -- Visibility
+@g.default = default global i32 0
+; CHECK: @g.default = global i32 0
+@g.hidden = hidden global i32 0
+; CHECK: @g.hidden = hidden global i32 0
+@g.protected = protected global i32 0
+; CHECK: @g.protected = protected global i32 0
+
+; Global Variables -- DLLStorageClass
+@g.dlldefault = default global i32 0
+; CHECK: @g.dlldefault = global i32 0
+@g.dllimport = external dllimport global i32
+; CHECK: @g.dllimport = external dllimport global i32
+@g.dllexport = dllexport global i32 0
+; CHECK: @g.dllexport = dllexport global i32 0
+
+; Global Variables -- ThreadLocal
+@g.notthreadlocal = global i32 0
+; CHECK: @g.notthreadlocal = global i32 0
+@g.generaldynamic = thread_local global i32 0
+; CHECK: @g.generaldynamic = thread_local global i32 0
+@g.localdynamic = thread_local(localdynamic) global i32 0
+; CHECK: @g.localdynamic = thread_local(localdynamic) global i32 0
+@g.initialexec = thread_local(initialexec) global i32 0
+; CHECK: @g.initialexec = thread_local(initialexec) global i32 0
+@g.localexec = thread_local(localexec) global i32 0
+; CHECK: @g.localexec = thread_local(localexec) global i32 0
+
+; Global Variables -- unnamed_addr and local_unnamed_addr
+@g.unnamed_addr = unnamed_addr global i32 0
+; CHECK: @g.unnamed_addr = unnamed_addr global i32 0
+@g.local_unnamed_addr = local_unnamed_addr global i32 0
+; CHECK: @g.local_unnamed_addr = local_unnamed_addr global i32 0
+
+; Global Variables -- AddrSpace
+@g.addrspace = addrspace(1) global i32 0
+; CHECK: @g.addrspace = addrspace(1) global i32 0
+
+; Global Variables -- ExternallyInitialized
+@g.externally_initialized = external externally_initialized global i32
+; CHECK: @g.externally_initialized = external externally_initialized global i32
+
+; Global Variables -- section
+@g.section = global i32 0, section "_DATA"
+; CHECK: @g.section = global i32 0, section "_DATA"
+
+; Global Variables -- comdat
+@comdat.any = global i32 0, comdat
+; CHECK: @comdat.any = global i32 0, comdat
+@comdat.exactmatch = global i32 0, comdat
+; CHECK: @comdat.exactmatch = global i32 0, comdat
+@comdat.largest = global i32 0, comdat
+; CHECK: @comdat.largest = global i32 0, comdat
+@comdat.noduplicates = global i32 0, comdat
+; CHECK: @comdat.noduplicates = global i32 0, comdat
+@comdat.samesize = global i32 0, comdat
+; CHECK: @comdat.samesize = global i32 0, comdat
+
+; Force two globals from different comdats into sections with the same name.
+$comdat1 = comdat any
+$comdat2 = comdat any
+@g.comdat1 = global i32 0, section "SharedSection", comdat($comdat1)
+; CHECK: @g.comdat1 = global i32 0, section "SharedSection", comdat($comdat1)
+@g.comdat2 = global i32 0, section "SharedSection", comdat($comdat2)
+; CHECK: @g.comdat2 = global i32 0, section "SharedSection", comdat($comdat2)
+
+; Global Variables -- align
+@g.align = global i32 0, align 4
+; CHECK: @g.align = global i32 0, align 4
+
+; Global Variables -- Intrinsics
+%pri.func.data = type { i32, void ()*, i8* }
+@g.used1 = global i32 0
+@g.used2 = global i32 0
+@g.used3 = global i8 0
+declare void @g.f1()
+@llvm.used = appending global [1 x i32*] [i32* @g.used1], section "llvm.metadata"
+; CHECK: @llvm.used = appending global [1 x i32*] [i32* @g.used1], section "llvm.metadata"
+@llvm.compiler.used = appending global [1 x i32*] [i32* @g.used2], section "llvm.metadata"
+; CHECK: @llvm.compiler.used = appending global [1 x i32*] [i32* @g.used2], section "llvm.metadata"
+@llvm.global_ctors = appending global [1 x %pri.func.data] [%pri.func.data { i32 0, void ()* @g.f1, i8* @g.used3 }], section "llvm.metadata"
+; CHECK: @llvm.global_ctors = appending global [1 x %pri.func.data] [%pri.func.data { i32 0, void ()* @g.f1, i8* @g.used3 }], section "llvm.metadata"
+@llvm.global_dtors = appending global [1 x %pri.func.data] [%pri.func.data { i32 0, void ()* @g.f1, i8* @g.used3 }], section "llvm.metadata"
+; CHECK: @llvm.global_dtors = appending global [1 x %pri.func.data] [%pri.func.data { i32 0, void ()* @g.f1, i8* @g.used3 }], section "llvm.metadata"
+
+;; Aliases
+; Format: @<Name> = [Linkage] [Visibility] [DLLStorageClass] [ThreadLocal]
+;                   [(unnamed_addr|local_unnamed_addr)] alias <AliaseeTy>, <AliaseeTy>* @<Aliasee>
+
+; Aliases -- Linkage
+@a.private = private alias i32, i32* @g.private
+; CHECK: @a.private = private alias i32, i32* @g.private
+@a.internal = internal alias i32, i32* @g.internal
+; CHECK: @a.internal = internal alias i32, i32* @g.internal
+@a.linkonce = linkonce alias i32, i32* @g.linkonce
+; CHECK: @a.linkonce = linkonce alias i32, i32* @g.linkonce
+@a.weak = weak alias i32, i32* @g.weak
+; CHECK: @a.weak = weak alias i32, i32* @g.weak
+@a.linkonce_odr = linkonce_odr alias i32, i32* @g.linkonce_odr
+; CHECK: @a.linkonce_odr = linkonce_odr alias i32, i32* @g.linkonce_odr
+@a.weak_odr = weak_odr alias i32, i32* @g.weak_odr
+; CHECK: @a.weak_odr = weak_odr alias i32, i32* @g.weak_odr
+@a.external = external alias i32, i32* @g1
+; CHECK: @a.external = alias i32, i32* @g1
+
+; Aliases -- Visibility
+@a.default = default alias i32, i32* @g.default
+; CHECK: @a.default = alias i32, i32* @g.default
+@a.hidden = hidden alias i32, i32* @g.hidden
+; CHECK: @a.hidden = hidden alias i32, i32* @g.hidden
+@a.protected = protected alias i32, i32* @g.protected
+; CHECK: @a.protected = protected alias i32, i32* @g.protected
+
+; Aliases -- DLLStorageClass
+@a.dlldefault = default alias i32, i32* @g.dlldefault
+; CHECK: @a.dlldefault = alias i32, i32* @g.dlldefault
+@a.dllimport = dllimport alias i32, i32* @g1
+; CHECK: @a.dllimport = dllimport alias i32, i32* @g1
+@a.dllexport = dllexport alias i32, i32* @g.dllexport
+; CHECK: @a.dllexport = dllexport alias i32, i32* @g.dllexport
+
+; Aliases -- ThreadLocal
+@a.notthreadlocal = alias i32, i32* @g.notthreadlocal
+; CHECK: @a.notthreadlocal = alias i32, i32* @g.notthreadlocal
+@a.generaldynamic = thread_local alias i32, i32* @g.generaldynamic
+; CHECK: @a.generaldynamic = thread_local alias i32, i32* @g.generaldynamic
+@a.localdynamic = thread_local(localdynamic) alias i32, i32* @g.localdynamic
+; CHECK: @a.localdynamic = thread_local(localdynamic) alias i32, i32* @g.localdynamic
+@a.initialexec = thread_local(initialexec) alias i32, i32* @g.initialexec
+; CHECK: @a.initialexec = thread_local(initialexec) alias i32, i32* @g.initialexec
+@a.localexec = thread_local(localexec) alias i32, i32* @g.localexec
+; CHECK: @a.localexec = thread_local(localexec) alias i32, i32* @g.localexec
+
+; Aliases -- unnamed_addr and local_unnamed_addr
+@a.unnamed_addr = unnamed_addr alias i32, i32* @g.unnamed_addr
+; CHECK: @a.unnamed_addr = unnamed_addr alias i32, i32* @g.unnamed_addr
+@a.local_unnamed_addr = local_unnamed_addr alias i32, i32* @g.local_unnamed_addr
+; CHECK: @a.local_unnamed_addr = local_unnamed_addr alias i32, i32* @g.local_unnamed_addr
+
+;; IFunc
+; Format: @<Name> = [Linkage] [Visibility] ifunc <IFuncTy>,
+;                  <ResolverTy>* @<Resolver>
+
+; IFunc -- Linkage
+@ifunc.external = external ifunc void (), i8* ()* @ifunc_resolver
+; CHECK: @ifunc.external = ifunc void (), i8* ()* @ifunc_resolver
+@ifunc.private = private ifunc void (), i8* ()* @ifunc_resolver
+; CHECK: @ifunc.private = private ifunc void (), i8* ()* @ifunc_resolver
+@ifunc.internal = internal ifunc void (), i8* ()* @ifunc_resolver
+; CHECK: @ifunc.internal = internal ifunc void (), i8* ()* @ifunc_resolver
+
+; IFunc -- Visibility
+@ifunc.default = default ifunc void (), i8* ()* @ifunc_resolver
+; CHECK: @ifunc.default = ifunc void (), i8* ()* @ifunc_resolver
+@ifunc.hidden = hidden ifunc void (), i8* ()* @ifunc_resolver
+; CHECK: @ifunc.hidden = hidden ifunc void (), i8* ()* @ifunc_resolver
+@ifunc.protected = protected ifunc void (), i8* ()* @ifunc_resolver
+; CHECK: @ifunc.protected = protected ifunc void (), i8* ()* @ifunc_resolver
+
+define i8* @ifunc_resolver() {
+entry:
+  ret i8* null
+}
+
+;; Functions
+; Format: define [linkage] [visibility] [DLLStorageClass]
+;         [cconv] [ret attrs]
+;         <ResultType> @<FunctionName> ([argument list])
+;         [(unnamed_addr|local_unnamed_addr)] [fn Attrs] [section "name"] [comdat [($name)]]
+;         [align N] [gc] [prefix Constant] [prologue Constant]
+;         [personality Constant] { ... }
+
+; Functions -- Simple
+declare void @f1 ()
+; CHECK: declare void @f1()
+
+define void @f2 () {
+; CHECK: define void @f2()
+entry:
+  ret void
+}
+
+; Functions -- linkage
+define private void @f.private() {
+; CHECK: define private void @f.private()
+entry:
+  ret void
+}
+define internal void @f.internal() {
+; CHECK: define internal void @f.internal()
+entry:
+  ret void
+}
+define available_externally void @f.available_externally() {
+; CHECK: define available_externally void @f.available_externally()
+entry:
+  ret void
+}
+define linkonce void @f.linkonce() {
+; CHECK: define linkonce void @f.linkonce()
+entry:
+  ret void
+}
+define weak void @f.weak() {
+; CHECK: define weak void @f.weak()
+entry:
+  ret void
+}
+define linkonce_odr void @f.linkonce_odr() {
+; CHECK: define linkonce_odr void @f.linkonce_odr()
+entry:
+  ret void
+}
+define weak_odr void @f.weak_odr() {
+; CHECK: define weak_odr void @f.weak_odr()
+entry:
+  ret void
+}
+declare external void @f.external()
+; CHECK: declare void @f.external()
+declare extern_weak void @f.extern_weak()
+; CHECK: declare extern_weak void @f.extern_weak()
+
+; Functions -- visibility
+declare default void @f.default()
+; CHECK: declare void @f.default()
+declare hidden void @f.hidden()
+; CHECK: declare hidden void @f.hidden()
+declare protected void @f.protected()
+; CHECK: declare protected void @f.protected()
+
+; Functions -- DLLStorageClass
+declare dllimport void @f.dllimport()
+; CHECK: declare dllimport void @f.dllimport()
+declare dllexport void @f.dllexport()
+; CHECK: declare dllexport void @f.dllexport()
+
+; Functions -- cconv (Calling conventions)
+declare ccc void @f.ccc()
+; CHECK: declare void @f.ccc()
+declare fastcc void @f.fastcc()
+; CHECK: declare fastcc void @f.fastcc()
+declare coldcc void @f.coldcc()
+; CHECK: declare coldcc void @f.coldcc()
+declare cc10 void @f.cc10()
+; CHECK: declare ghccc void @f.cc10()
+declare ghccc void @f.ghccc()
+; CHECK: declare ghccc void @f.ghccc()
+declare cc11 void @f.cc11()
+; CHECK: declare cc11 void @f.cc11()
+declare webkit_jscc void @f.webkit_jscc()
+; CHECK: declare webkit_jscc void @f.webkit_jscc()
+declare anyregcc void @f.anyregcc()
+; CHECK: declare anyregcc void @f.anyregcc()
+declare preserve_mostcc void @f.preserve_mostcc()
+; CHECK: declare preserve_mostcc void @f.preserve_mostcc()
+declare preserve_allcc void @f.preserve_allcc()
+; CHECK: declare preserve_allcc void @f.preserve_allcc()
+declare cc64 void @f.cc64()
+; CHECK: declare x86_stdcallcc void @f.cc64()
+declare x86_stdcallcc void @f.x86_stdcallcc()
+; CHECK: declare x86_stdcallcc void @f.x86_stdcallcc()
+declare cc65 void @f.cc65()
+; CHECK: declare x86_fastcallcc void @f.cc65()
+declare x86_fastcallcc void @f.x86_fastcallcc()
+; CHECK: declare x86_fastcallcc void @f.x86_fastcallcc()
+declare cc66 void @f.cc66()
+; CHECK: declare arm_apcscc void @f.cc66()
+declare arm_apcscc void @f.arm_apcscc()
+; CHECK: declare arm_apcscc void @f.arm_apcscc()
+declare cc67 void @f.cc67()
+; CHECK: declare arm_aapcscc void @f.cc67()
+declare arm_aapcscc void @f.arm_aapcscc()
+; CHECK: declare arm_aapcscc void @f.arm_aapcscc()
+declare cc68 void @f.cc68()
+; CHECK: declare arm_aapcs_vfpcc void @f.cc68()
+declare arm_aapcs_vfpcc void @f.arm_aapcs_vfpcc()
+; CHECK: declare arm_aapcs_vfpcc void @f.arm_aapcs_vfpcc()
+declare cc69 void @f.cc69()
+; CHECK: declare msp430_intrcc void @f.cc69()
+declare msp430_intrcc void @f.msp430_intrcc()
+; CHECK: declare msp430_intrcc void @f.msp430_intrcc()
+declare cc70 void @f.cc70()
+; CHECK: declare x86_thiscallcc void @f.cc70()
+declare x86_thiscallcc void @f.x86_thiscallcc()
+; CHECK: declare x86_thiscallcc void @f.x86_thiscallcc()
+declare cc71 void @f.cc71()
+; CHECK: declare ptx_kernel void @f.cc71()
+declare ptx_kernel void @f.ptx_kernel()
+; CHECK: declare ptx_kernel void @f.ptx_kernel()
+declare cc72 void @f.cc72()
+; CHECK: declare ptx_device void @f.cc72()
+declare ptx_device void @f.ptx_device()
+; CHECK: declare ptx_device void @f.ptx_device()
+declare cc75 void @f.cc75()
+; CHECK: declare spir_func void @f.cc75()
+declare spir_func void @f.spir_func()
+; CHECK: declare spir_func void @f.spir_func()
+declare cc76 void @f.cc76()
+; CHECK: declare spir_kernel void @f.cc76()
+declare spir_kernel void @f.spir_kernel()
+; CHECK: declare spir_kernel void @f.spir_kernel()
+declare cc77 void @f.cc77()
+; CHECK: declare intel_ocl_bicc void @f.cc77()
+declare intel_ocl_bicc void @f.intel_ocl_bicc()
+; CHECK: declare intel_ocl_bicc void @f.intel_ocl_bicc()
+declare cc78 void @f.cc78()
+; CHECK: declare x86_64_sysvcc void @f.cc78()
+declare x86_64_sysvcc void @f.x86_64_sysvcc()
+; CHECK: declare x86_64_sysvcc void @f.x86_64_sysvcc()
+declare cc79 void @f.cc79()
+; CHECK: declare x86_64_win64cc void @f.cc79()
+declare x86_64_win64cc void @f.x86_64_win64cc()
+; CHECK: declare x86_64_win64cc void @f.x86_64_win64cc()
+declare cc80 void @f.cc80()
+; CHECK: declare x86_vectorcallcc void @f.cc80()
+declare x86_vectorcallcc void @f.x86_vectorcallcc()
+; CHECK: declare x86_vectorcallcc void @f.x86_vectorcallcc()
+declare cc81 void @f.cc81()
+; CHECK: declare hhvmcc void @f.cc81()
+declare hhvmcc void @f.hhvmcc()
+; CHECK: declare hhvmcc void @f.hhvmcc()
+declare cc82 void @f.cc82()
+; CHECK: declare hhvm_ccc void @f.cc82()
+declare hhvm_ccc void @f.hhvm_ccc()
+; CHECK: declare hhvm_ccc void @f.hhvm_ccc()
+declare cc83 void @f.cc83()
+; CHECK: declare x86_intrcc void @f.cc83()
+declare x86_intrcc void @f.x86_intrcc()
+; CHECK: declare x86_intrcc void @f.x86_intrcc()
+declare cc84 void @f.cc84()
+; CHECK: declare avr_intrcc void @f.cc84()
+declare avr_intrcc void @f.avr_intrcc()
+; CHECK: declare avr_intrcc void @f.avr_intrcc()
+declare cc85 void @f.cc85()
+; CHECK: declare avr_signalcc void @f.cc85()
+declare avr_signalcc void @f.avr_signalcc()
+; CHECK: declare avr_signalcc void @f.avr_signalcc()
+declare cc87 void @f.cc87()
+; CHECK: declare amdgpu_vs void @f.cc87()
+declare amdgpu_vs void @f.amdgpu_vs()
+; CHECK: declare amdgpu_vs void @f.amdgpu_vs()
+declare cc88 void @f.cc88()
+; CHECK: declare amdgpu_gs void @f.cc88()
+declare amdgpu_gs void @f.amdgpu_gs()
+; CHECK: declare amdgpu_gs void @f.amdgpu_gs()
+declare cc89 void @f.cc89()
+; CHECK: declare amdgpu_ps void @f.cc89()
+declare amdgpu_ps void @f.amdgpu_ps()
+; CHECK: declare amdgpu_ps void @f.amdgpu_ps()
+declare cc90 void @f.cc90()
+; CHECK: declare amdgpu_cs void @f.cc90()
+declare amdgpu_cs void @f.amdgpu_cs()
+; CHECK: declare amdgpu_cs void @f.amdgpu_cs()
+declare cc91 void @f.cc91()
+; CHECK: declare amdgpu_kernel void @f.cc91()
+declare amdgpu_kernel void @f.amdgpu_kernel()
+; CHECK: declare amdgpu_kernel void @f.amdgpu_kernel()
+declare cc1023 void @f.cc1023()
+; CHECK: declare cc1023 void @f.cc1023()
+
+; Functions -- ret attrs (Return attributes)
+declare zeroext i64 @f.zeroext()
+; CHECK: declare zeroext i64 @f.zeroext()
+declare signext i64 @f.signext()
+; CHECK: declare signext i64 @f.signext()
+declare inreg i32* @f.inreg()
+; CHECK: declare inreg i32* @f.inreg()
+declare noalias i32* @f.noalias()
+; CHECK: declare noalias i32* @f.noalias()
+declare nonnull i32* @f.nonnull()
+; CHECK: declare nonnull i32* @f.nonnull()
+declare dereferenceable(4) i32* @f.dereferenceable4()
+; CHECK: declare dereferenceable(4) i32* @f.dereferenceable4()
+declare dereferenceable(8) i32* @f.dereferenceable8()
+; CHECK: declare dereferenceable(8) i32* @f.dereferenceable8()
+declare dereferenceable(16) i32* @f.dereferenceable16()
+; CHECK: declare dereferenceable(16) i32* @f.dereferenceable16()
+declare dereferenceable_or_null(4) i32* @f.dereferenceable4_or_null()
+; CHECK: declare dereferenceable_or_null(4) i32* @f.dereferenceable4_or_null()
+declare dereferenceable_or_null(8) i32* @f.dereferenceable8_or_null()
+; CHECK: declare dereferenceable_or_null(8) i32* @f.dereferenceable8_or_null()
+declare dereferenceable_or_null(16) i32* @f.dereferenceable16_or_null()
+; CHECK: declare dereferenceable_or_null(16) i32* @f.dereferenceable16_or_null()
+
+; Functions -- Parameter attributes
+declare void @f.param.zeroext(i8 zeroext)
+; CHECK: declare void @f.param.zeroext(i8 zeroext)
+declare void @f.param.signext(i8 signext)
+; CHECK: declare void @f.param.signext(i8 signext)
+declare void @f.param.inreg(i8 inreg)
+; CHECK: declare void @f.param.inreg(i8 inreg)
+declare void @f.param.byval({ i8, i8 }* byval)
+; CHECK: declare void @f.param.byval({ i8, i8 }* byval)
+declare void @f.param.inalloca(i8* inalloca)
+; CHECK: declare void @f.param.inalloca(i8* inalloca)
+declare void @f.param.sret(i8* sret)
+; CHECK: declare void @f.param.sret(i8* sret)
+declare void @f.param.noalias(i8* noalias)
+; CHECK: declare void @f.param.noalias(i8* noalias)
+declare void @f.param.nocapture(i8* nocapture)
+; CHECK: declare void @f.param.nocapture(i8* nocapture)
+declare void @f.param.nest(i8* nest)
+; CHECK: declare void @f.param.nest(i8* nest)
+declare i8* @f.param.returned(i8* returned)
+; CHECK: declare i8* @f.param.returned(i8* returned)
+declare void @f.param.nonnull(i8* nonnull)
+; CHECK: declare void @f.param.nonnull(i8* nonnull)
+declare void @f.param.dereferenceable(i8* dereferenceable(4))
+; CHECK: declare void @f.param.dereferenceable(i8* dereferenceable(4))
+declare void @f.param.dereferenceable_or_null(i8* dereferenceable_or_null(4))
+; CHECK: declare void @f.param.dereferenceable_or_null(i8* dereferenceable_or_null(4))
+
+; Functions -- unnamed_addr and local_unnamed_addr
+declare void @f.unnamed_addr() unnamed_addr
+; CHECK: declare void @f.unnamed_addr() unnamed_addr
+declare void @f.local_unnamed_addr() local_unnamed_addr
+; CHECK: declare void @f.local_unnamed_addr() local_unnamed_addr
+
+; Functions -- fn Attrs (Function attributes)
+declare void @f.alignstack4() alignstack(4)
+; CHECK: declare void @f.alignstack4() #0
+declare void @f.alignstack8() alignstack(8)
+; CHECK: declare void @f.alignstack8() #1
+declare void @f.alwaysinline() alwaysinline
+; CHECK: declare void @f.alwaysinline() #2
+declare void @f.cold() cold
+; CHECK: declare void @f.cold() #3
+declare void @f.convergent() convergent
+; CHECK: declare void @f.convergent() #4
+declare void @f.inlinehint() inlinehint
+; CHECK: declare void @f.inlinehint() #5
+declare void @f.jumptable() unnamed_addr jumptable
+; CHECK: declare void @f.jumptable() unnamed_addr #6
+declare void @f.minsize() minsize
+; CHECK: declare void @f.minsize() #7
+declare void @f.naked() naked
+; CHECK: declare void @f.naked() #8
+declare void @f.nobuiltin() nobuiltin
+; CHECK: declare void @f.nobuiltin() #9
+declare void @f.noduplicate() noduplicate
+; CHECK: declare void @f.noduplicate() #10
+declare void @f.noimplicitfloat() noimplicitfloat
+; CHECK: declare void @f.noimplicitfloat() #11
+declare void @f.noinline() noinline
+; CHECK: declare void @f.noinline() #12
+declare void @f.nonlazybind() nonlazybind
+; CHECK: declare void @f.nonlazybind() #13
+declare void @f.noredzone() noredzone
+; CHECK: declare void @f.noredzone() #14
+declare void @f.noreturn() noreturn
+; CHECK: declare void @f.noreturn() #15
+declare void @f.nounwind() nounwind
+; CHECK: declare void @f.nounwind() #16
+declare void @f.optnone() noinline optnone
+; CHECK: declare void @f.optnone() #17
+declare void @f.optsize() optsize
+; CHECK: declare void @f.optsize() #18
+declare void @f.readnone() readnone
+; CHECK: declare void @f.readnone() #19
+declare void @f.readonly() readonly
+; CHECK: declare void @f.readonly() #20
+declare void @f.returns_twice() returns_twice
+; CHECK: declare void @f.returns_twice() #21
+declare void @f.safestack() safestack
+; CHECK: declare void @f.safestack() #22
+declare void @f.sanitize_address() sanitize_address
+; CHECK: declare void @f.sanitize_address() #23
+declare void @f.sanitize_memory() sanitize_memory
+; CHECK: declare void @f.sanitize_memory() #24
+declare void @f.sanitize_thread() sanitize_thread
+; CHECK: declare void @f.sanitize_thread() #25
+declare void @f.ssp() ssp
+; CHECK: declare void @f.ssp() #26
+declare void @f.sspreq() sspreq
+; CHECK: declare void @f.sspreq() #27
+declare void @f.sspstrong() sspstrong
+; CHECK: declare void @f.sspstrong() #28
+declare void @f.thunk() "thunk"
+; CHECK: declare void @f.thunk() #29
+declare void @f.uwtable() uwtable
+; CHECK: declare void @f.uwtable() #30
+declare void @f.kvpair() "cpu"="cortex-a8"
+; CHECK: declare void @f.kvpair() #31
+declare void @f.norecurse() norecurse
+; CHECK: declare void @f.norecurse() #32
+declare void @f.inaccessiblememonly() inaccessiblememonly
+; CHECK: declare void @f.inaccessiblememonly() #33
+declare void @f.inaccessiblemem_or_argmemonly() inaccessiblemem_or_argmemonly
+; CHECK: declare void @f.inaccessiblemem_or_argmemonly() #34
+
+; Functions -- section
+declare void @f.section() section "80"
+; CHECK: declare void @f.section() section "80"
+
+; Functions -- comdat
+define void @f.comdat_any() comdat($comdat.any) {
+; CHECK: define void @f.comdat_any() comdat($comdat.any)
+entry:
+  ret void
+}
+define void @f.comdat_exactmatch() comdat($comdat.exactmatch) {
+; CHECK: define void @f.comdat_exactmatch() comdat($comdat.exactmatch)
+entry:
+  ret void
+}
+define void @f.comdat_largest() comdat($comdat.largest) {
+; CHECK: define void @f.comdat_largest() comdat($comdat.largest)
+entry:
+  ret void
+}
+define void @f.comdat_noduplicates() comdat($comdat.noduplicates) {
+; CHECK: define void @f.comdat_noduplicates() comdat($comdat.noduplicates)
+entry:
+  ret void
+}
+define void @f.comdat_samesize() comdat($comdat.samesize) {
+; CHECK: define void @f.comdat_samesize() comdat($comdat.samesize)
+entry:
+  ret void
+}
+
+; Functions -- align
+declare void @f.align2() align 2
+; CHECK: declare void @f.align2() align 2
+declare void @f.align4() align 4
+; CHECK: declare void @f.align4() align 4
+declare void @f.align8() align 8
+; CHECK: declare void @f.align8() align 8
+
+; Functions -- GC
+declare void @f.gcshadow() gc "shadow-stack"
+; CHECK: declare void @f.gcshadow() gc "shadow-stack"
+
+; Functions -- Prefix data
+declare void @f.prefixi32() prefix i32 1684365668
+; CHECK: declare void @f.prefixi32() prefix i32 1684365668
+declare void @f.prefixarray() prefix [4 x i32] [i32 0, i32 1, i32 2, i32 3]
+; CHECK: declare void @f.prefixarray() prefix [4 x i32] [i32 0, i32 1, i32 2, i32 3]
+
+; Functions -- Prologue data
+declare void @f.prologuei32() prologue i32 1684365669
+; CHECK: declare void @f.prologuei32() prologue i32 1684365669
+declare void @f.prologuearray() prologue [4 x i32] [i32 0, i32 1, i32 2, i32 3]
+; CHECK: declare void @f.prologuearray() prologue [4 x i32] [i32 0, i32 1, i32 2, i32 3]
+
+; Functions -- Personality constant
+declare void @llvm.donothing() nounwind readnone
+; CHECK: declare void @llvm.donothing() #35
+define void @f.no_personality() personality i8 3 {
+; CHECK: define void @f.no_personality() personality i8 3
+  invoke void @llvm.donothing() to label %normal unwind label %exception
+exception:
+  %cleanup = landingpad i8 cleanup
+  br label %normal
+normal:
+  ret void
+}
+
+declare i32 @f.personality_handler()
+; CHECK: declare i32 @f.personality_handler()
+define void @f.personality() personality i32 ()* @f.personality_handler {
+; CHECK: define void @f.personality() personality i32 ()* @f.personality_handler
+  invoke void @llvm.donothing() to label %normal unwind label %exception
+exception:
+  %cleanup = landingpad i32 cleanup
+  br label %normal
+normal:
+  ret void
+}
+
+;; Atomic Memory Ordering Constraints
+define void @atomics(i32* %word) {
+  %cmpxchg.0 = cmpxchg i32* %word, i32 0, i32 4 monotonic monotonic
+  ; CHECK: %cmpxchg.0 = cmpxchg i32* %word, i32 0, i32 4 monotonic monotonic
+  %cmpxchg.1 = cmpxchg i32* %word, i32 0, i32 5 acq_rel monotonic
+  ; CHECK: %cmpxchg.1 = cmpxchg i32* %word, i32 0, i32 5 acq_rel monotonic
+  %cmpxchg.2 = cmpxchg i32* %word, i32 0, i32 6 acquire monotonic
+  ; CHECK: %cmpxchg.2 = cmpxchg i32* %word, i32 0, i32 6 acquire monotonic
+  %cmpxchg.3 = cmpxchg i32* %word, i32 0, i32 7 release monotonic
+  ; CHECK: %cmpxchg.3 = cmpxchg i32* %word, i32 0, i32 7 release monotonic
+  %cmpxchg.4 = cmpxchg i32* %word, i32 0, i32 8 seq_cst monotonic
+  ; CHECK: %cmpxchg.4 = cmpxchg i32* %word, i32 0, i32 8 seq_cst monotonic
+  %cmpxchg.5 = cmpxchg weak i32* %word, i32 0, i32 9 seq_cst monotonic
+  ; CHECK: %cmpxchg.5 = cmpxchg weak i32* %word, i32 0, i32 9 seq_cst monotonic
+  %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
+  ; CHECK: %cmpxchg.6 = cmpxchg volatile i32* %word, i32 0, i32 10 seq_cst monotonic
+  %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
+  ; CHECK: %cmpxchg.7 = cmpxchg weak volatile i32* %word, i32 0, i32 11 singlethread seq_cst monotonic
+  %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
+  ; CHECK: %atomicrmw.xchg = atomicrmw xchg i32* %word, i32 12 monotonic
+  %atomicrmw.add = atomicrmw add i32* %word, i32 13 monotonic
+  ; CHECK: %atomicrmw.add = atomicrmw add i32* %word, i32 13 monotonic
+  %atomicrmw.sub = atomicrmw sub i32* %word, i32 14 monotonic
+  ; CHECK: %atomicrmw.sub = atomicrmw sub i32* %word, i32 14 monotonic
+  %atomicrmw.and = atomicrmw and i32* %word, i32 15 monotonic
+  ; CHECK: %atomicrmw.and = atomicrmw and i32* %word, i32 15 monotonic
+  %atomicrmw.nand = atomicrmw nand i32* %word, i32 16 monotonic
+  ; CHECK: %atomicrmw.nand = atomicrmw nand i32* %word, i32 16 monotonic
+  %atomicrmw.or = atomicrmw or i32* %word, i32 17 monotonic
+  ; CHECK: %atomicrmw.or = atomicrmw or i32* %word, i32 17 monotonic
+  %atomicrmw.xor = atomicrmw xor i32* %word, i32 18 monotonic
+  ; CHECK: %atomicrmw.xor = atomicrmw xor i32* %word, i32 18 monotonic
+  %atomicrmw.max = atomicrmw max i32* %word, i32 19 monotonic
+  ; CHECK: %atomicrmw.max = atomicrmw max i32* %word, i32 19 monotonic
+  %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
+  ; CHECK: %atomicrmw.min = atomicrmw volatile min i32* %word, i32 20 monotonic
+  %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
+  ; CHECK: %atomicrmw.umax = atomicrmw umax i32* %word, i32 21 singlethread monotonic
+  %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
+  ; CHECK: %atomicrmw.umin = atomicrmw volatile umin i32* %word, i32 22 singlethread monotonic
+  fence acquire
+  ; CHECK: fence acquire
+  fence release
+  ; CHECK: fence release
+  fence acq_rel
+  ; CHECK: fence acq_rel
+  fence singlethread seq_cst
+  ; CHECK: fence singlethread seq_cst
+
+  %ld.1 = load atomic i32, i32* %word monotonic, align 4
+  ; CHECK: %ld.1 = load atomic i32, i32* %word monotonic, align 4
+  %ld.2 = load atomic volatile i32, i32* %word acquire, align 8
+  ; CHECK: %ld.2 = load atomic volatile i32, i32* %word acquire, align 8
+  %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
+  ; CHECK: %ld.3 = load atomic volatile i32, i32* %word singlethread seq_cst, align 16
+
+  store atomic i32 23, i32* %word monotonic, align 4
+  ; CHECK: store atomic i32 23, i32* %word monotonic, align 4
+  store atomic volatile i32 24, i32* %word monotonic, align 4
+  ; CHECK: store atomic volatile i32 24, i32* %word monotonic, align 4
+  store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
+  ; CHECK: store atomic volatile i32 25, i32* %word singlethread monotonic, align 4
+  ret void
+}
+
+;; Fast Math Flags
+define void @fastmathflags(float %op1, float %op2) {
+  %f.nnan = fadd nnan float %op1, %op2
+  ; CHECK: %f.nnan = fadd nnan float %op1, %op2
+  %f.ninf = fadd ninf float %op1, %op2
+  ; CHECK: %f.ninf = fadd ninf float %op1, %op2
+  %f.nsz = fadd nsz float %op1, %op2
+  ; CHECK: %f.nsz = fadd nsz float %op1, %op2
+  %f.arcp = fadd arcp float %op1, %op2
+  ; CHECK: %f.arcp = fadd arcp float %op1, %op2
+  %f.fast = fadd fast float %op1, %op2
+  ; CHECK: %f.fast = fadd fast float %op1, %op2
+  ret void
+}
+
+; Check various fast math flags and floating-point types on calls.
+
+declare float @fmf1()
+declare double @fmf2()
+declare <4 x double> @fmf3()
+
+; CHECK-LABEL: fastMathFlagsForCalls(
+define void @fastMathFlagsForCalls(float %f, double %d1, <4 x double> %d2) {
+  %call.fast = call fast float @fmf1()
+  ; CHECK: %call.fast = call fast float @fmf1()
+
+  ; Throw in some other attributes to make sure those stay in the right places.
+
+  %call.nsz.arcp = notail call nsz arcp double @fmf2()
+  ; CHECK: %call.nsz.arcp = notail call nsz arcp double @fmf2()
+
+  %call.nnan.ninf = tail call nnan ninf fastcc <4 x double> @fmf3()
+  ; CHECK: %call.nnan.ninf = tail call nnan ninf fastcc <4 x double> @fmf3()
+
+  ret void
+}
+
+;; Type System
+%opaquety = type opaque
+define void @typesystem() {
+  %p0 = bitcast i8* null to i32 (i32)*
+  ; CHECK: %p0 = bitcast i8* null to i32 (i32)*
+  %p1 = bitcast i8* null to void (i8*)*
+  ; CHECK: %p1 = bitcast i8* null to void (i8*)*
+  %p2 = bitcast i8* null to i32 (i8*, ...)*
+  ; CHECK: %p2 = bitcast i8* null to i32 (i8*, ...)*
+  %p3 = bitcast i8* null to { i32, i8 } (i8*, ...)*
+  ; CHECK: %p3 = bitcast i8* null to { i32, i8 } (i8*, ...)*
+  %p4 = bitcast i8* null to <{ i32, i8 }> (i8*, ...)*
+  ; CHECK: %p4 = bitcast i8* null to <{ i32, i8 }> (i8*, ...)*
+  %p5 = bitcast i8* null to <{ i32, i8 }> (<{ i8*, i64 }>*, ...)*
+  ; CHECK: %p5 = bitcast i8* null to <{ i32, i8 }> (<{ i8*, i64 }>*, ...)*
+
+  %t0 = alloca i1942652
+  ; CHECK: %t0 = alloca i1942652
+  %t1 = alloca half
+  ; CHECK: %t1 = alloca half
+  %t2 = alloca float
+  ; CHECK: %t2 = alloca float
+  %t3 = alloca double
+  ; CHECK: %t3 = alloca double
+  %t4 = alloca fp128
+  ; CHECK: %t4 = alloca fp128
+  %t5 = alloca x86_fp80
+  ; CHECK: %t5 = alloca x86_fp80
+  %t6 = alloca ppc_fp128
+  ; CHECK: %t6 = alloca ppc_fp128
+  %t7 = alloca x86_mmx
+  ; CHECK: %t7 = alloca x86_mmx
+  %t8 = alloca %opaquety*
+  ; CHECK: %t8 = alloca %opaquety*
+
+  ret void
+}
+
+declare void @llvm.token(token)
+; CHECK: declare void @llvm.token(token)
+
+;; Inline Assembler Expressions
+define void @inlineasm(i32 %arg) {
+  call i32 asm "bswap $0", "=r,r"(i32 %arg)
+  ; CHECK: call i32 asm "bswap $0", "=r,r"(i32 %arg)
+  call i32 asm sideeffect "blt $1, $2, $3", "=r,r,rm"(i32 %arg, i32 %arg)
+  ; CHECK: call i32 asm sideeffect "blt $1, $2, $3", "=r,r,rm"(i32 %arg, i32 %arg)
+  ret void
+}
+
+;; Instructions
+
+; Instructions -- Terminators
+define void @instructions.terminators(i8 %val) personality i32 -10 {
+  br i1 false, label %iftrue, label %iffalse
+  ; CHECK: br i1 false, label %iftrue, label %iffalse
+  br label %iftrue
+  ; CHECK: br label %iftrue
+iftrue:
+  ret void
+  ; CHECK: ret void
+iffalse:
+
+  switch i8 %val, label %defaultdest [
+  ; CHECK: switch i8 %val, label %defaultdest [
+         i8 0, label %defaultdest.0
+         ; CHECK: i8 0, label %defaultdest.0
+         i8 1, label %defaultdest.1
+         ; CHECK: i8 1, label %defaultdest.1
+         i8 2, label %defaultdest.2
+         ; CHECK: i8 2, label %defaultdest.2
+  ]
+  ; CHECK: ]
+defaultdest:
+  ret void
+defaultdest.0:
+  ret void
+defaultdest.1:
+  ret void
+defaultdest.2:
+
+  indirectbr i8* blockaddress(@instructions.terminators, %defaultdest.2), [label %defaultdest.2]
+  ; CHECK: indirectbr i8* blockaddress(@instructions.terminators, %defaultdest.2), [label %defaultdest.2]
+  indirectbr i8* blockaddress(@instructions.terminators, %defaultdest.2), [label %defaultdest.2, label %defaultdest.2]
+  ; CHECK: indirectbr i8* blockaddress(@instructions.terminators, %defaultdest.2), [label %defaultdest.2, label %defaultdest.2]
+
+  invoke fastcc void @f.fastcc()
+  ; CHECK: invoke fastcc void @f.fastcc()
+         to label %defaultdest unwind label %exc
+         ; CHECK: to label %defaultdest unwind label %exc
+exc:
+  %cleanup = landingpad i32 cleanup
+
+  resume i32 undef
+  ; CHECK: resume i32 undef
+  unreachable
+  ; CHECK: unreachable
+
+  ret void
+}
+
+define i32 @instructions.win_eh.1() personality i32 -3 {
+entry:
+  %arg1 = alloca i32
+  %arg2 = alloca i32
+  invoke void @f.ccc() to label %normal unwind label %catchswitch1
+  invoke void @f.ccc() to label %normal unwind label %catchswitch2
+  invoke void @f.ccc() to label %normal unwind label %catchswitch3
+
+catchswitch1:
+  %cs1 = catchswitch within none [label %catchpad1] unwind to caller
+
+catchpad1:
+  catchpad within %cs1 []
+  br label %normal
+  ; CHECK: catchpad within %cs1 []
+  ; CHECK-NEXT: br label %normal
+
+catchswitch2:
+  %cs2 = catchswitch within none [label %catchpad2] unwind to caller
+
+catchpad2:
+  catchpad within %cs2 [i32* %arg1]
+  br label %normal
+  ; CHECK: catchpad within %cs2 [i32* %arg1]
+  ; CHECK-NEXT: br label %normal
+
+catchswitch3:
+  %cs3 = catchswitch within none [label %catchpad3] unwind label %cleanuppad1
+
+catchpad3:
+  catchpad within %cs3 [i32* %arg1, i32* %arg2]
+  br label %normal
+  ; CHECK: catchpad within %cs3 [i32* %arg1, i32* %arg2]
+  ; CHECK-NEXT: br label %normal
+
+cleanuppad1:
+  %clean.1 = cleanuppad within none []
+  unreachable
+  ; CHECK: %clean.1 = cleanuppad within none []
+  ; CHECK-NEXT: unreachable
+
+normal:
+  ret i32 0
+}
+;
+define i32 @instructions.win_eh.2() personality i32 -4 {
+entry:
+  invoke void @f.ccc() to label %invoke.cont unwind label %catchswitch
+
+invoke.cont:
+  invoke void @f.ccc() to label %continue unwind label %cleanup
+
+cleanup:
+  %clean = cleanuppad within none []
+  ; CHECK: %clean = cleanuppad within none []
+  cleanupret from %clean unwind to caller
+  ; CHECK: cleanupret from %clean unwind to caller
+
+catchswitch:
+  %cs = catchswitch within none [label %catchpad] unwind label %terminate
+
+catchpad:
+  %catch = catchpad within %cs []
+  br label %body
+  ; CHECK: %catch = catchpad within %cs []
+  ; CHECK-NEXT: br label %body
+
+body:
+  invoke void @f.ccc() [ "funclet"(token %catch) ]
+    to label %continue unwind label %terminate.inner
+  catchret from %catch to label %return
+  ; CHECK: catchret from %catch to label %return
+
+return:
+  ret i32 0
+
+terminate.inner:
+  cleanuppad within %catch []
+  unreachable
+  ; CHECK: cleanuppad within %catch []
+  ; CHECK-NEXT: unreachable
+
+terminate:
+  cleanuppad within none []
+  unreachable
+  ; CHECK: cleanuppad within none []
+  ; CHECK-NEXT: unreachable
+
+continue:
+  ret i32 0
+}
+
+; Instructions -- Binary Operations
+define void @instructions.binops(i8 %op1, i8 %op2) {
+  ; nuw x nsw
+  add i8 %op1, %op2
+  ; CHECK: add i8 %op1, %op2
+  add nuw i8 %op1, %op2
+  ; CHECK: add nuw i8 %op1, %op2
+  add nsw i8 %op1, %op2
+  ; CHECK: add nsw i8 %op1, %op2
+  add nuw nsw i8 %op1, %op2
+  ; CHECK: add nuw nsw i8 %op1, %op2
+  sub i8 %op1, %op2
+  ; CHECK: sub i8 %op1, %op2
+  sub nuw i8 %op1, %op2
+  ; CHECK: sub nuw i8 %op1, %op2
+  sub nsw i8 %op1, %op2
+  ; CHECK: sub nsw i8 %op1, %op2
+  sub nuw nsw i8 %op1, %op2
+  ; CHECK: sub nuw nsw i8 %op1, %op2
+  mul i8 %op1, %op2
+  ; CHECK: mul i8 %op1, %op2
+  mul nuw i8 %op1, %op2
+  ; CHECK: mul nuw i8 %op1, %op2
+  mul nsw i8 %op1, %op2
+  ; CHECK: mul nsw i8 %op1, %op2
+  mul nuw nsw i8 %op1, %op2
+  ; CHECK: mul nuw nsw i8 %op1, %op2
+
+  ; exact
+  udiv i8 %op1, %op2
+  ; CHECK: udiv i8 %op1, %op2
+  udiv exact i8 %op1, %op2
+  ; CHECK: udiv exact i8 %op1, %op2
+  sdiv i8 %op1, %op2
+  ; CHECK: sdiv i8 %op1, %op2
+  sdiv exact i8 %op1, %op2
+  ; CHECK: sdiv exact i8 %op1, %op2
+
+  ; none
+  urem i8 %op1, %op2
+  ; CHECK: urem i8 %op1, %op2
+  srem i8 %op1, %op2
+  ; CHECK: srem i8 %op1, %op2
+
+  ret void
+}
+
+; Instructions -- Bitwise Binary Operations
+define void @instructions.bitwise_binops(i8 %op1, i8 %op2) {
+  ; nuw x nsw
+  shl i8 %op1, %op2
+  ; CHECK: shl i8 %op1, %op2
+  shl nuw i8 %op1, %op2
+  ; CHECK: shl nuw i8 %op1, %op2
+  shl nsw i8 %op1, %op2
+  ; CHECK: shl nsw i8 %op1, %op2
+  shl nuw nsw i8 %op1, %op2
+  ; CHECK: shl nuw nsw i8 %op1, %op2
+
+  ; exact
+  lshr i8 %op1, %op2
+  ; CHECK: lshr i8 %op1, %op2
+  lshr exact i8 %op1, %op2
+  ; CHECK: lshr exact i8 %op1, %op2
+  ashr i8 %op1, %op2
+  ; CHECK: ashr i8 %op1, %op2
+  ashr exact i8 %op1, %op2
+  ; CHECK: ashr exact i8 %op1, %op2
+
+  ; none
+  and i8 %op1, %op2
+  ; CHECK: and i8 %op1, %op2
+  or i8 %op1, %op2
+  ; CHECK: or i8 %op1, %op2
+  xor i8 %op1, %op2
+  ; CHECK: xor i8 %op1, %op2
+
+  ret void
+}
+
+; Instructions -- Vector Operations
+define void @instructions.vectorops(<4 x float> %vec, <4 x float> %vec2) {
+  extractelement <4 x float> %vec, i8 0
+  ; CHECK: extractelement <4 x float> %vec, i8 0
+  insertelement <4 x float> %vec, float 3.500000e+00, i8 0
+  ; CHECK: insertelement <4 x float> %vec, float 3.500000e+00, i8 0
+  shufflevector <4 x float> %vec, <4 x float> %vec2, <2 x i32> zeroinitializer
+  ; CHECK: shufflevector <4 x float> %vec, <4 x float> %vec2, <2 x i32> zeroinitializer
+
+  ret void
+}
+
+; Instructions -- Aggregate Operations
+define void @instructions.aggregateops({ i8, i32 } %up, <{ i8, i32 }> %p,
+                                       [3 x i8] %arr, { i8, { i32 }} %n,
+                                       <2 x i8*> %pvec, <2 x i64> %offsets) {
+  extractvalue { i8, i32 } %up, 0
+  ; CHECK: extractvalue { i8, i32 } %up, 0
+  extractvalue <{ i8, i32 }> %p, 1
+  ; CHECK: extractvalue <{ i8, i32 }> %p, 1
+  extractvalue [3 x i8] %arr, 2
+  ; CHECK: extractvalue [3 x i8] %arr, 2
+  extractvalue { i8, { i32 } } %n, 1, 0
+  ; CHECK: extractvalue { i8, { i32 } } %n, 1, 0
+
+  insertvalue { i8, i32 } %up, i8 1, 0
+  ; CHECK: insertvalue { i8, i32 } %up, i8 1, 0
+  insertvalue <{ i8, i32 }> %p, i32 2, 1
+  ; CHECK: insertvalue <{ i8, i32 }> %p, i32 2, 1
+  insertvalue [3 x i8] %arr, i8 0, 0
+  ; CHECK: insertvalue [3 x i8] %arr, i8 0, 0
+  insertvalue { i8, { i32 } } %n, i32 0, 1, 0
+  ; CHECK: insertvalue { i8, { i32 } } %n, i32 0, 1, 0
+
+  %up.ptr = alloca { i8, i32 }
+  %p.ptr = alloca <{ i8, i32 }>
+  %arr.ptr = alloca [3 x i8]
+  %n.ptr = alloca { i8, { i32 } }
+
+  getelementptr { i8, i32 }, { i8, i32 }* %up.ptr, i8 0
+  ; CHECK: getelementptr { i8, i32 }, { i8, i32 }* %up.ptr, i8 0
+  getelementptr <{ i8, i32 }>, <{ i8, i32 }>* %p.ptr, i8 1
+  ; CHECK: getelementptr <{ i8, i32 }>, <{ i8, i32 }>* %p.ptr, i8 1
+  getelementptr [3 x i8], [3 x i8]* %arr.ptr, i8 2
+  ; CHECK: getelementptr [3 x i8], [3 x i8]* %arr.ptr, i8 2
+  getelementptr { i8, { i32 } }, { i8, { i32 } }* %n.ptr, i32 0, i32 1
+  ; CHECK: getelementptr { i8, { i32 } }, { i8, { i32 } }* %n.ptr, i32 0, i32 1
+  getelementptr inbounds { i8, { i32 } }, { i8, { i32 } }* %n.ptr, i32 1, i32 0
+  ; CHECK: getelementptr inbounds { i8, { i32 } }, { i8, { i32 } }* %n.ptr, i32 1, i32 0
+  getelementptr i8, <2 x i8*> %pvec, <2 x i64> %offsets
+  ; CHECK: getelementptr i8, <2 x i8*> %pvec, <2 x i64> %offsets
+
+  ret void
+}
+
+; Instructions -- Memory Access and Addressing Operations
+!7 = !{i32 1}
+!8 = !{}
+!9 = !{i64 4}
+define void @instructions.memops(i32** %base) {
+  alloca i32, i8 4, align 4
+  ; CHECK: alloca i32, i8 4, align 4
+  alloca inalloca i32, i8 4, align 4
+  ; CHECK: alloca inalloca i32, i8 4, align 4
+
+  load i32*, i32** %base, align 8, !invariant.load !7, !nontemporal !8, !nonnull !7, !dereferenceable !9, !dereferenceable_or_null !9
+  ; CHECK: load i32*, i32** %base, align 8, !invariant.load !7, !nontemporal !8, !nonnull !7, !dereferenceable !9, !dereferenceable_or_null !9
+  load volatile i32*, i32** %base, align 8, !invariant.load !7, !nontemporal !8, !nonnull !7, !dereferenceable !9, !dereferenceable_or_null !9
+  ; CHECK: load volatile i32*, i32** %base, align 8, !invariant.load !7, !nontemporal !8, !nonnull !7, !dereferenceable !9, !dereferenceable_or_null !9
+
+  store i32* null, i32** %base, align 4, !nontemporal !8
+  ; CHECK: store i32* null, i32** %base, align 4, !nontemporal !8
+  store volatile i32* null, i32** %base, align 4, !nontemporal !8
+  ; CHECK: store volatile i32* null, i32** %base, align 4, !nontemporal !8
+
+  ret void
+}
+
+; Instructions -- Conversion Operations
+define void @instructions.conversions() {
+  trunc i32 -1 to i1
+  ; CHECK: trunc i32 -1 to i1
+  zext i32 -1 to i64
+  ; CHECK: zext i32 -1 to i64
+  sext i32 -1 to i64
+  ; CHECK: sext i32 -1 to i64
+  fptrunc float undef to half
+  ; CHECK: fptrunc float undef to half
+  fpext half undef to float
+  ; CHECK: fpext half undef to float
+  fptoui float undef to i32
+  ; CHECK: fptoui float undef to i32
+  fptosi float undef to i32
+  ; CHECK: fptosi float undef to i32
+  uitofp i32 1 to float
+  ; CHECK: uitofp i32 1 to float
+  sitofp i32 -1 to float
+  ; CHECK: sitofp i32 -1 to float
+  ptrtoint i8* null to i64
+  ; CHECK: ptrtoint i8* null to i64
+  inttoptr i64 0 to i8*
+  ; CHECK: inttoptr i64 0 to i8*
+  bitcast i32 0 to i32
+  ; CHECK: bitcast i32 0 to i32
+  addrspacecast i32* null to i32 addrspace(1)*
+  ; CHECK: addrspacecast i32* null to i32 addrspace(1)*
+
+  ret void
+}
+
+; Instructions -- Other Operations
+define void @instructions.other(i32 %op1, i32 %op2, half %fop1, half %fop2) {
+entry:
+  icmp eq  i32 %op1, %op2
+  ; CHECK: icmp eq  i32 %op1, %op2
+  icmp ne  i32 %op1, %op2
+  ; CHECK: icmp ne  i32 %op1, %op2
+  icmp ugt i32 %op1, %op2
+  ; CHECK: icmp ugt i32 %op1, %op2
+  icmp uge i32 %op1, %op2
+  ; CHECK: icmp uge i32 %op1, %op2
+  icmp ult i32 %op1, %op2
+  ; CHECK: icmp ult i32 %op1, %op2
+  icmp ule i32 %op1, %op2
+  ; CHECK: icmp ule i32 %op1, %op2
+  icmp sgt i32 %op1, %op2
+  ; CHECK: icmp sgt i32 %op1, %op2
+  icmp sge i32 %op1, %op2
+  ; CHECK: icmp sge i32 %op1, %op2
+  icmp slt i32 %op1, %op2
+  ; CHECK: icmp slt i32 %op1, %op2
+  icmp sle i32 %op1, %op2
+  ; CHECK: icmp sle i32 %op1, %op2
+
+  fcmp false half %fop1, %fop2
+  ; CHECK: fcmp false half %fop1, %fop2
+  fcmp oeq   half %fop1, %fop2
+  ; CHECK: fcmp oeq   half %fop1, %fop2
+  fcmp ogt   half %fop1, %fop2
+  ; CHECK: fcmp ogt   half %fop1, %fop2
+  fcmp oge   half %fop1, %fop2
+  ; CHECK: fcmp oge   half %fop1, %fop2
+  fcmp olt   half %fop1, %fop2
+  ; CHECK: fcmp olt   half %fop1, %fop2
+  fcmp ole   half %fop1, %fop2
+  ; CHECK: fcmp ole   half %fop1, %fop2
+  fcmp one   half %fop1, %fop2
+  ; CHECK: fcmp one   half %fop1, %fop2
+  fcmp ord   half %fop1, %fop2
+  ; CHECK: fcmp ord   half %fop1, %fop2
+  fcmp ueq   half %fop1, %fop2
+  ; CHECK: fcmp ueq   half %fop1, %fop2
+  fcmp ugt   half %fop1, %fop2
+  ; CHECK: fcmp ugt   half %fop1, %fop2
+  fcmp uge   half %fop1, %fop2
+  ; CHECK: fcmp uge   half %fop1, %fop2
+  fcmp ult   half %fop1, %fop2
+  ; CHECK: fcmp ult   half %fop1, %fop2
+  fcmp ule   half %fop1, %fop2
+  ; CHECK: fcmp ule   half %fop1, %fop2
+  fcmp une   half %fop1, %fop2
+  ; CHECK: fcmp une   half %fop1, %fop2
+  fcmp uno   half %fop1, %fop2
+  ; CHECK: fcmp uno   half %fop1, %fop2
+  fcmp true  half %fop1, %fop2
+  ; CHECK: fcmp true  half %fop1, %fop2
+
+  br label %exit
+L1:
+  %v1 = add i32 %op1, %op2
+  br label %exit
+L2:
+  %v2 = add i32 %op1, %op2
+  br label %exit
+exit:
+  phi i32 [ %v1, %L1 ], [ %v2, %L2 ], [ %op1, %entry ]
+  ; CHECK: phi i32 [ %v1, %L1 ], [ %v2, %L2 ], [ %op1, %entry ]
+
+  select i1 true, i32 0, i32 1
+  ; CHECK: select i1 true, i32 0, i32 1
+  select <2 x i1> <i1 true, i1 false>, <2 x i8> <i8 2, i8 3>, <2 x i8> <i8 3, i8 2>
+  ; CHECK: select <2 x i1> <i1 true, i1 false>, <2 x i8> <i8 2, i8 3>, <2 x i8> <i8 3, i8 2>
+
+  call void @f.nobuiltin() builtin
+  ; CHECK: call void @f.nobuiltin() #41
+
+  call fastcc noalias i32* @f.noalias() noinline
+  ; CHECK: call fastcc noalias i32* @f.noalias() #12
+  tail call ghccc nonnull i32* @f.nonnull() minsize
+  ; CHECK: tail call ghccc nonnull i32* @f.nonnull() #7
+
+  ret void
+}
+
+define void @instructions.call_musttail(i8* inalloca %val) {
+  musttail call void @f.param.inalloca(i8* inalloca %val)
+  ; CHECK: musttail call void @f.param.inalloca(i8* inalloca %val)
+
+  ret void
+}
+
+define void @instructions.call_notail() {
+  notail call void @f1()
+  ; CHECK: notail call void @f1()
+
+  ret void
+}
+
+define void @instructions.landingpad() personality i32 -2 {
+  invoke void @llvm.donothing() to label %proceed unwind label %catch1
+  invoke void @llvm.donothing() to label %proceed unwind label %catch2
+  invoke void @llvm.donothing() to label %proceed unwind label %catch3
+  invoke void @llvm.donothing() to label %proceed unwind label %catch4
+
+catch1:
+  landingpad i32
+  ; CHECK: landingpad i32
+             cleanup
+             ; CHECK: cleanup
+  br label %proceed
+
+catch2:
+  landingpad i32
+  ; CHECK: landingpad i32
+             cleanup
+             ; CHECK: cleanup
+             catch i32* null
+             ; CHECK: catch i32* null
+  br label %proceed
+
+catch3:
+  landingpad i32
+  ; CHECK: landingpad i32
+             cleanup
+             ; CHECK: cleanup
+             catch i32* null
+             ; CHECK: catch i32* null
+             catch i32* null
+             ; CHECK: catch i32* null
+  br label %proceed
+
+catch4:
+  landingpad i32
+  ; CHECK: landingpad i32
+             filter [2 x i32] zeroinitializer
+             ; CHECK: filter [2 x i32] zeroinitializer
+  br label %proceed
+
+proceed:
+  ret void
+}
+
+;; Intrinsic Functions
+
+; Intrinsic Functions -- Variable Argument Handling
+declare void @llvm.va_start(i8*)
+declare void @llvm.va_copy(i8*, i8*)
+declare void @llvm.va_end(i8*)
+define void @instructions.va_arg(i8* %v, ...) {
+  %ap = alloca i8*
+  %ap2 = bitcast i8** %ap to i8*
+
+  call void @llvm.va_start(i8* %ap2)
+  ; CHECK: call void @llvm.va_start(i8* %ap2)
+
+  va_arg i8* %ap2, i32
+  ; CHECK: va_arg i8* %ap2, i32
+
+  call void @llvm.va_copy(i8* %v, i8* %ap2)
+  ; CHECK: call void @llvm.va_copy(i8* %v, i8* %ap2)
+
+  call void @llvm.va_end(i8* %ap2)
+  ; CHECK: call void @llvm.va_end(i8* %ap2)
+
+  ret void
+}
+
+; Intrinsic Functions -- Accurate Garbage Collection
+declare void @llvm.gcroot(i8**, i8*)
+declare i8* @llvm.gcread(i8*, i8**)
+declare void @llvm.gcwrite(i8*, i8*, i8**)
+define void @intrinsics.gc() gc "shadow-stack" {
+  %ptrloc = alloca i8*
+  call void @llvm.gcroot(i8** %ptrloc, i8* null)
+  ; CHECK: call void @llvm.gcroot(i8** %ptrloc, i8* null)
+
+  call i8* @llvm.gcread(i8* null, i8** %ptrloc)
+  ; CHECK: call i8* @llvm.gcread(i8* null, i8** %ptrloc)
+
+  %ref = alloca i8
+  call void @llvm.gcwrite(i8* %ref, i8* null, i8** %ptrloc)
+  ; CHECK: call void @llvm.gcwrite(i8* %ref, i8* null, i8** %ptrloc)
+
+  ret void
+}
+
+; Intrinsic Functions -- Code Generation
+declare i8* @llvm.returnaddress(i32)
+declare i8* @llvm.frameaddress(i32)
+declare i32 @llvm.read_register.i32(metadata)
+declare i64 @llvm.read_register.i64(metadata)
+declare void @llvm.write_register.i32(metadata, i32)
+declare void @llvm.write_register.i64(metadata, i64)
+declare i8* @llvm.stacksave()
+declare void @llvm.stackrestore(i8*)
+declare void @llvm.prefetch(i8*, i32, i32, i32)
+declare void @llvm.pcmarker(i32)
+declare i64 @llvm.readcyclecounter()
+declare void @llvm.clear_cache(i8*, i8*)
+declare void @llvm.instrprof_increment(i8*, i64, i32, i32)
+
+!10 = !{!"rax"}
+define void @intrinsics.codegen() {
+  call i8* @llvm.returnaddress(i32 1)
+  ; CHECK: call i8* @llvm.returnaddress(i32 1)
+  call i8* @llvm.frameaddress(i32 1)
+  ; CHECK: call i8* @llvm.frameaddress(i32 1)
+
+  call i32 @llvm.read_register.i32(metadata !10)
+  ; CHECK: call i32 @llvm.read_register.i32(metadata !10)
+  call i64 @llvm.read_register.i64(metadata !10)
+  ; CHECK: call i64 @llvm.read_register.i64(metadata !10)
+  call void @llvm.write_register.i32(metadata !10, i32 0)
+  ; CHECK: call void @llvm.write_register.i32(metadata !10, i32 0)
+  call void @llvm.write_register.i64(metadata !10, i64 0)
+  ; CHECK: call void @llvm.write_register.i64(metadata !10, i64 0)
+
+  %stack = call i8* @llvm.stacksave()
+  ; CHECK: %stack = call i8* @llvm.stacksave()
+  call void @llvm.stackrestore(i8* %stack)
+  ; CHECK: call void @llvm.stackrestore(i8* %stack)
+
+  call void @llvm.prefetch(i8* %stack, i32 0, i32 3, i32 0)
+  ; CHECK: call void @llvm.prefetch(i8* %stack, i32 0, i32 3, i32 0)
+
+  call void @llvm.pcmarker(i32 1)
+  ; CHECK: call void @llvm.pcmarker(i32 1)
+
+  call i64 @llvm.readcyclecounter()
+  ; CHECK: call i64 @llvm.readcyclecounter()
+
+  call void @llvm.clear_cache(i8* null, i8* null)
+  ; CHECK: call void @llvm.clear_cache(i8* null, i8* null)
+
+  call void @llvm.instrprof_increment(i8* null, i64 0, i32 0, i32 0)
+  ; CHECK: call void @llvm.instrprof_increment(i8* null, i64 0, i32 0, i32 0)
+
+  ret void
+}
+
+declare void @llvm.localescape(...)
+declare i8* @llvm.localrecover(i8* %func, i8* %fp, i32 %idx)
+define void @intrinsics.localescape() {
+  %static.alloca = alloca i32
+  call void (...) @llvm.localescape(i32* %static.alloca)
+  ; CHECK: call void (...) @llvm.localescape(i32* %static.alloca)
+
+  call void @intrinsics.localrecover()
+
+  ret void
+}
+define void @intrinsics.localrecover() {
+  %func = bitcast void ()* @intrinsics.localescape to i8*
+  %fp = call i8* @llvm.frameaddress(i32 1)
+  call i8* @llvm.localrecover(i8* %func, i8* %fp, i32 0)
+  ; CHECK: call i8* @llvm.localrecover(i8* %func, i8* %fp, i32 0)
+
+  ret void
+}
+
+; We need this function to provide `uses' for some metadata tests.
+define void @misc.metadata() {
+  call void @f1(), !srcloc !11
+  call void @f1(), !srcloc !12
+  call void @f1(), !srcloc !13
+  call void @f1(), !srcloc !14
+  ret void
+}
+
+declare void @op_bundle_callee_0()
+declare void @op_bundle_callee_1(i32,i32)
+
+define void @call_with_operand_bundle0(i32* %ptr) {
+; CHECK-LABEL: call_with_operand_bundle0(
+ entry:
+  %l = load i32, i32* %ptr
+  %x = add i32 42, 1
+  call void @op_bundle_callee_0() [ "foo"(i32 42, i64 100, i32 %x), "bar"(float  0.000000e+00, i64 100, i32 %l) ]
+; CHECK: call void @op_bundle_callee_0() [ "foo"(i32 42, i64 100, i32 %x), "bar"(float  0.000000e+00, i64 100, i32 %l) ]
+  ret void
+}
+
+define void @call_with_operand_bundle1(i32* %ptr) {
+; CHECK-LABEL: call_with_operand_bundle1(
+ entry:
+  %l = load i32, i32* %ptr
+  %x = add i32 42, 1
+
+  call void @op_bundle_callee_0()
+  call void @op_bundle_callee_0() [ "foo"() ]
+  call void @op_bundle_callee_0() [ "foo"(i32 42, i64 100, i32 %x), "bar"(float  0.000000e+00, i64 100, i32 %l) ]
+; CHECK: @op_bundle_callee_0(){{$}}
+; CHECK-NEXT: call void @op_bundle_callee_0() [ "foo"() ]
+; CHECK-NEXT: call void @op_bundle_callee_0() [ "foo"(i32 42, i64 100, i32 %x), "bar"(float  0.000000e+00, i64 100, i32 %l) ]
+  ret void
+}
+
+define void @call_with_operand_bundle2(i32* %ptr) {
+; CHECK-LABEL: call_with_operand_bundle2(
+ entry:
+  call void @op_bundle_callee_0() [ "foo"() ]
+; CHECK: call void @op_bundle_callee_0() [ "foo"() ]
+  ret void
+}
+
+define void @call_with_operand_bundle3(i32* %ptr) {
+; CHECK-LABEL: call_with_operand_bundle3(
+ entry:
+  %l = load i32, i32* %ptr
+  %x = add i32 42, 1
+  call void @op_bundle_callee_0() [ "foo"(i32 42, i64 100, i32 %x), "foo"(i32 42, float  0.000000e+00, i32 %l) ]
+; CHECK: call void @op_bundle_callee_0() [ "foo"(i32 42, i64 100, i32 %x), "foo"(i32 42, float  0.000000e+00, i32 %l) ]
+  ret void
+}
+
+define void @call_with_operand_bundle4(i32* %ptr) {
+; CHECK-LABEL: call_with_operand_bundle4(
+ entry:
+  %l = load i32, i32* %ptr
+  %x = add i32 42, 1
+  call void @op_bundle_callee_1(i32 10, i32 %x) [ "foo"(i32 42, i64 100, i32 %x), "foo"(i32 42, float  0.000000e+00, i32 %l) ]
+; CHECK: call void @op_bundle_callee_1(i32 10, i32 %x) [ "foo"(i32 42, i64 100, i32 %x), "foo"(i32 42, float  0.000000e+00, i32 %l) ]
+  ret void
+}
+
+; Invoke versions of the above tests:
+
+
+define void @invoke_with_operand_bundle0(i32* %ptr) personality i8 3 {
+; CHECK-LABEL: @invoke_with_operand_bundle0(
+ entry:
+  %l = load i32, i32* %ptr
+  %x = add i32 42, 1
+  invoke void @op_bundle_callee_0() [ "foo"(i32 42, i64 100, i32 %x), "bar"(float  0.000000e+00, i64 100, i32 %l) ] to label %normal unwind label %exception
+; CHECK: invoke void @op_bundle_callee_0() [ "foo"(i32 42, i64 100, i32 %x), "bar"(float  0.000000e+00, i64 100, i32 %l) ]
+
+exception:
+  %cleanup = landingpad i8 cleanup
+  br label %normal
+normal:
+  ret void
+}
+
+define void @invoke_with_operand_bundle1(i32* %ptr) personality i8 3 {
+; CHECK-LABEL: @invoke_with_operand_bundle1(
+ entry:
+  %l = load i32, i32* %ptr
+  %x = add i32 42, 1
+
+  invoke void @op_bundle_callee_0() to label %normal unwind label %exception
+; CHECK: invoke void @op_bundle_callee_0(){{$}}
+
+exception:
+  %cleanup = landingpad i8 cleanup
+  br label %normal
+
+normal:
+  invoke void @op_bundle_callee_0() [ "foo"() ] to label %normal1 unwind label %exception1
+; CHECK: invoke void @op_bundle_callee_0() [ "foo"() ]
+
+exception1:
+  %cleanup1 = landingpad i8 cleanup
+  br label %normal1
+
+normal1:
+  invoke void @op_bundle_callee_0() [ "foo"(i32 42, i64 100, i32 %x), "foo"(i32 42, float  0.000000e+00, i32 %l) ] to label %normal2 unwind label %exception2
+; CHECK: invoke void @op_bundle_callee_0() [ "foo"(i32 42, i64 100, i32 %x), "foo"(i32 42, float  0.000000e+00, i32 %l) ]
+
+exception2:
+  %cleanup2 = landingpad i8 cleanup
+  br label %normal2
+
+normal2:
+  ret void
+}
+
+define void @invoke_with_operand_bundle2(i32* %ptr) personality i8 3 {
+; CHECK-LABEL: @invoke_with_operand_bundle2(
+ entry:
+  invoke void @op_bundle_callee_0() [ "foo"() ] to label %normal unwind label %exception
+; CHECK: invoke void @op_bundle_callee_0() [ "foo"() ]
+
+exception:
+  %cleanup = landingpad i8 cleanup
+  br label %normal
+normal:
+  ret void
+}
+
+define void @invoke_with_operand_bundle3(i32* %ptr) personality i8 3 {
+; CHECK-LABEL: @invoke_with_operand_bundle3(
+ entry:
+  %l = load i32, i32* %ptr
+  %x = add i32 42, 1
+  invoke void @op_bundle_callee_0() [ "foo"(i32 42, i64 100, i32 %x), "foo"(i32 42, float  0.000000e+00, i32 %l) ] to label %normal unwind label %exception
+; CHECK: invoke void @op_bundle_callee_0() [ "foo"(i32 42, i64 100, i32 %x), "foo"(i32 42, float  0.000000e+00, i32 %l) ]
+
+exception:
+  %cleanup = landingpad i8 cleanup
+  br label %normal
+normal:
+  ret void
+}
+
+define void @invoke_with_operand_bundle4(i32* %ptr) personality i8 3 {
+; CHECK-LABEL: @invoke_with_operand_bundle4(
+ entry:
+  %l = load i32, i32* %ptr
+  %x = add i32 42, 1
+  invoke void @op_bundle_callee_1(i32 10, i32 %x) [ "foo"(i32 42, i64 100, i32 %x), "foo"(i32 42, float  0.000000e+00, i32 %l) ]
+        to label %normal unwind label %exception
+; CHECK: invoke void @op_bundle_callee_1(i32 10, i32 %x) [ "foo"(i32 42, i64 100, i32 %x), "foo"(i32 42, float  0.000000e+00, i32 %l) ]
+
+exception:
+  %cleanup = landingpad i8 cleanup
+  br label %normal
+normal:
+  ret void
+}
+
+declare void @vaargs_func(...)
+define void @invoke_with_operand_bundle_vaarg(i32* %ptr) personality i8 3 {
+; CHECK-LABEL: @invoke_with_operand_bundle_vaarg(
+ entry:
+  %l = load i32, i32* %ptr
+  %x = add i32 42, 1
+  invoke void (...) @vaargs_func(i32 10, i32 %x) [ "foo"(i32 42, i64 100, i32 %x), "foo"(i32 42, float  0.000000e+00, i32 %l) ]
+        to label %normal unwind label %exception
+; CHECK: invoke void (...) @vaargs_func(i32 10, i32 %x) [ "foo"(i32 42, i64 100, i32 %x), "foo"(i32 42, float  0.000000e+00, i32 %l) ]
+
+exception:
+  %cleanup = landingpad i8 cleanup
+  br label %normal
+normal:
+  ret void
+}
+
+
+declare void @f.writeonly() writeonly
+; CHECK: declare void @f.writeonly() #40
+
+;; Constant Expressions
+
+define i8** @constexpr() {
+  ; CHECK: ret i8** getelementptr inbounds ({ [4 x i8*], [4 x i8*] }, { [4 x i8*], [4 x i8*] }* null, i32 0, inrange i32 1, i32 2)
+  ret i8** getelementptr inbounds ({ [4 x i8*], [4 x i8*] }, { [4 x i8*], [4 x i8*] }* null, i32 0, inrange i32 1, i32 2)
+}
+
+; CHECK: attributes #0 = { alignstack=4 }
+; CHECK: attributes #1 = { alignstack=8 }
+; CHECK: attributes #2 = { alwaysinline }
+; CHECK: attributes #3 = { cold }
+; CHECK: attributes #4 = { convergent }
+; CHECK: attributes #5 = { inlinehint }
+; CHECK: attributes #6 = { jumptable }
+; CHECK: attributes #7 = { minsize }
+; CHECK: attributes #8 = { naked }
+; CHECK: attributes #9 = { nobuiltin }
+; CHECK: attributes #10 = { noduplicate }
+; CHECK: attributes #11 = { noimplicitfloat }
+; CHECK: attributes #12 = { noinline }
+; CHECK: attributes #13 = { nonlazybind }
+; CHECK: attributes #14 = { noredzone }
+; CHECK: attributes #15 = { noreturn }
+; CHECK: attributes #16 = { nounwind }
+; CHECK: attributes #17 = { noinline optnone }
+; CHECK: attributes #18 = { optsize }
+; CHECK: attributes #19 = { readnone }
+; CHECK: attributes #20 = { readonly }
+; CHECK: attributes #21 = { returns_twice }
+; CHECK: attributes #22 = { safestack }
+; CHECK: attributes #23 = { sanitize_address }
+; CHECK: attributes #24 = { sanitize_memory }
+; CHECK: attributes #25 = { sanitize_thread }
+; CHECK: attributes #26 = { ssp }
+; CHECK: attributes #27 = { sspreq }
+; CHECK: attributes #28 = { sspstrong }
+; CHECK: attributes #29 = { "thunk" }
+; CHECK: attributes #30 = { uwtable }
+; CHECK: attributes #31 = { "cpu"="cortex-a8" }
+; CHECK: attributes #32 = { norecurse }
+; CHECK: attributes #33 = { inaccessiblememonly }
+; CHECK: attributes #34 = { inaccessiblemem_or_argmemonly }
+; CHECK: attributes #35 = { nounwind readnone }
+; CHECK: attributes #36 = { argmemonly nounwind readonly }
+; CHECK: attributes #37 = { argmemonly nounwind }
+; CHECK: attributes #38 = { nounwind readonly }
+; CHECK: attributes #39 = { inaccessiblemem_or_argmemonly nounwind }
+; CHECK: attributes #40 = { writeonly }
+; CHECK: attributes #41 = { builtin }
+
+;; Metadata
+
+; Metadata -- Module flags
+!llvm.module.flags = !{!0, !1, !2, !4, !5, !6}
+; CHECK: !llvm.module.flags = !{!0, !1, !2, !4, !5, !6}
+
+!0 = !{i32 1, !"mod1", i32 0}
+; CHECK: !0 = !{i32 1, !"mod1", i32 0}
+!1 = !{i32 2, !"mod2", i32 0}
+; CHECK: !1 = !{i32 2, !"mod2", i32 0}
+!2 = !{i32 3, !"mod3", !3}
+; CHECK: !2 = !{i32 3, !"mod3", !3}
+!3 = !{!"mod6", !0}
+; CHECK: !3 = !{!"mod6", !0}
+!4 = !{i32 4, !"mod4", i32 0}
+; CHECK: !4 = !{i32 4, !"mod4", i32 0}
+!5 = !{i32 5, !"mod5", !0}
+; CHECK: !5 = !{i32 5, !"mod5", !0}
+!6 = !{i32 6, !"mod6", !0}
+; CHECK: !6 = !{i32 6, !"mod6", !0}
+
+; Metadata -- Check `distinct'
+!11 = distinct !{}
+; CHECK: !11 = distinct !{}
+!12 = distinct !{}
+; CHECK: !12 = distinct !{}
+!13 = !{!11}
+; CHECK: !13 = !{!11}
+!14 = !{!12}
+; CHECK: !14 = !{!12}
diff --git a/test/Bitcode/compatibility-4.0.ll.bc b/test/Bitcode/compatibility-4.0.ll.bc
new file mode 100644
index 0000000000000000000000000000000000000000..a2988ff95402376140f39be5e9566379398e3d1e
Binary files /dev/null and b/test/Bitcode/compatibility-4.0.ll.bc differ
diff --git a/test/Bitcode/compatibility.ll b/test/Bitcode/compatibility.ll
index e2b13f47d3b0709533f4bebc98099b85bee44339..b1f52bbe059fd7e55b920756040821603b53f291 100644
--- a/test/Bitcode/compatibility.ll
+++ b/test/Bitcode/compatibility.ll
@@ -760,6 +760,8 @@ define void @fastmathflags(float %op1, float %op2) {
   ; CHECK: %f.nsz = fadd nsz float %op1, %op2
   %f.arcp = fadd arcp float %op1, %op2
   ; CHECK: %f.arcp = fadd arcp float %op1, %op2
+  %f.contract = fadd contract float %op1, %op2
+  ; CHECK: %f.contract = fadd contract float %op1, %op2
   %f.fast = fadd fast float %op1, %op2
   ; CHECK: %f.fast = fadd fast float %op1, %op2
   ret void
@@ -1244,7 +1246,7 @@ exit:
   ; CHECK: select <2 x i1> <i1 true, i1 false>, <2 x i8> <i8 2, i8 3>, <2 x i8> <i8 3, i8 2>
 
   call void @f.nobuiltin() builtin
-  ; CHECK: call void @f.nobuiltin() #40
+  ; CHECK: call void @f.nobuiltin() #41
 
   call fastcc noalias i32* @f.noalias() noinline
   ; CHECK: call fastcc noalias i32* @f.noalias() #12
@@ -1609,7 +1611,7 @@ normal:
 
 
 declare void @f.writeonly() writeonly
-; CHECK: declare void @f.writeonly() #39
+; CHECK: declare void @f.writeonly() #40
 
 ;; Constant Expressions
 
@@ -1657,8 +1659,9 @@ define i8** @constexpr() {
 ; CHECK: attributes #36 = { argmemonly nounwind readonly }
 ; CHECK: attributes #37 = { argmemonly nounwind }
 ; CHECK: attributes #38 = { nounwind readonly }
-; CHECK: attributes #39 = { writeonly }
-; CHECK: attributes #40 = { builtin }
+; CHECK: attributes #39 = { inaccessiblemem_or_argmemonly nounwind }
+; CHECK: attributes #40 = { writeonly }
+; CHECK: attributes #41 = { builtin }
 
 ;; Metadata
 
diff --git a/test/Bitcode/dityperefs-3.8.ll b/test/Bitcode/dityperefs-3.8.ll
index b032805bd281441371e39c57e73a088f29ce962b..09225d4eba329553d8589acb227680229ea0c53c 100644
--- a/test/Bitcode/dityperefs-3.8.ll
+++ b/test/Bitcode/dityperefs-3.8.ll
@@ -18,14 +18,13 @@
 ; CHECK-NEXT: !7 = !DILocalVariable(name: "V1", scope: !6, type: !2)
 ; CHECK-NEXT: !8 = !DIObjCProperty(name: "P1", type: !1)
 ; CHECK-NEXT: !9 = !DITemplateTypeParameter(type: !1)
-; CHECK-NEXT: !10 = distinct !DIGlobalVariableExpression(var: !11)
-; CHECK-NEXT: !11 = !DIGlobalVariable(name: "G",{{.*}} type: !1,
-; CHECK-NEXT: !12 = !DITemplateValueParameter(type: !1, value: i32* @G1)
-; CHECK-NEXT: !13 = !DIImportedEntity(tag: DW_TAG_imported_module, name: "T2", scope: !0, entity: !1)
-; CHECK-NEXT: !14 = !DICompositeType(tag: DW_TAG_structure_type, name: "T3", file: !0, elements: !15, identifier: "T3")
-; CHECK-NEXT: !15 = !{!16}
-; CHECK-NEXT: !16 = !DISubprogram(scope: !14,
-; CHECK-NEXT: !17 = !DIDerivedType(tag: DW_TAG_ptr_to_member_type,{{.*}} extraData: !14)
+; CHECK-NEXT: !10 = !DIGlobalVariable(name: "G",{{.*}} type: !1,
+; CHECK-NEXT: !11 = !DITemplateValueParameter(type: !1, value: i32* @G1)
+; CHECK-NEXT: !12 = !DIImportedEntity(tag: DW_TAG_imported_module, name: "T2", scope: !0, entity: !1)
+; CHECK-NEXT: !13 = !DICompositeType(tag: DW_TAG_structure_type, name: "T3", file: !0, elements: !14, identifier: "T3")
+; CHECK-NEXT: !14 = !{!15}
+; CHECK-NEXT: !15 = !DISubprogram(scope: !13,
+; CHECK-NEXT: !16 = !DIDerivedType(tag: DW_TAG_ptr_to_member_type,{{.*}} extraData: !13)
 
 !0 = !DIFile(filename: "path/to/file", directory: "/path/to/dir")
 !1 = !DICompositeType(tag: DW_TAG_structure_type, name: "T1", file: !0, identifier: "T1")
diff --git a/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll b/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
index 9e6e72cda3aab5a4652a7325fbdd00706b33e911..3a5adea202e2a0da2b963f851d7ae4dedf168ad4 100644
--- a/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
+++ b/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll
@@ -10,7 +10,7 @@
 ; CHECK-NEXT:    <VERSION
 ; See if the call to func is registered, using the expected callsite count
 ; and profile count, with value id matching the subsequent value symbol table.
-; CHECK-NEXT:    <PERMODULE_PROFILE {{.*}} op4=[[HOT1:.*]] op5=3 op6=[[COLD:.*]] op7=1 op8=[[HOT2:.*]] op9=3 op10=[[NONE1:.*]] op11=2 op12=[[HOT3:.*]] op13=3 op14=[[NONE2:.*]] op15=2 op16=[[NONE3:.*]] op17=2/>
+; CHECK-NEXT:    <PERMODULE_PROFILE {{.*}} op4=[[HOT1:.*]] op5=3 op6=[[COLD:.*]] op7=1 op8=[[HOT2:.*]] op9=3 op10=[[HOT4:.*]] op11=3 op12=[[NONE1:.*]] op13=2 op14=[[HOT3:.*]] op15=3 op16=[[NONE2:.*]] op17=2 op18=[[NONE3:.*]] op19=2 op20=[[LEGACY:.*]] op21=3/>
 ; CHECK-NEXT:  </GLOBALVAL_SUMMARY_BLOCK>
 ; CHECK-LABEL:  <VALUE_SYMTAB
 ; CHECK-NEXT:       <FNENTRY {{.*}} record string = 'hot_function
@@ -21,6 +21,8 @@
 ; CHECK-DAG:        <ENTRY abbrevid=6 op0=[[HOT1]] {{.*}} record string = 'hot1'
 ; CHECK-DAG:        <ENTRY abbrevid=6 op0=[[HOT2]] {{.*}} record string = 'hot2'
 ; CHECK-DAG:        <ENTRY abbrevid=6 op0=[[HOT3]] {{.*}} record string = 'hot3'
+; CHECK-DAG:        <ENTRY abbrevid=6 op0=[[HOT4]] {{.*}} record string = 'hot4'
+; CHECK-DAG:        <COMBINED_ENTRY abbrevid=11 op0=[[LEGACY]] op1=123/>
 ; CHECK-LABEL:  </VALUE_SYMTAB>
 
 ; COMBINED:       <GLOBALVAL_SUMMARY_BLOCK
@@ -48,6 +50,7 @@ entry:
 Cold:           ; 1/1000 goes here
   call void @cold()
   call void @hot2()
+  call void @hot4(), !prof !15
   call void @none1()
   br label %exit
 Hot:            ; 999/1000 goes here
@@ -68,6 +71,7 @@ exit:
 declare void @hot1() #1
 declare void @hot2() #1
 declare void @hot3() #1
+declare void @hot4() #1
 declare void @cold() #1
 declare void @none1() #1
 declare void @none2() #1
@@ -80,7 +84,7 @@ declare void @none3() #1
 
 
 !llvm.module.flags = !{!1}
-!20 = !{!"function_entry_count", i64 110}
+!20 = !{!"function_entry_count", i64 110, i64 123}
 
 !1 = !{i32 1, !"ProfileSummary", !2}
 !2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
@@ -96,3 +100,4 @@ declare void @none3() #1
 !12 = !{i32 10000, i64 100, i32 1}
 !13 = !{i32 999000, i64 100, i32 1}
 !14 = !{i32 999999, i64 1, i32 2}
+!15 = !{!"branch_weights", i32 100}
diff --git a/test/Bitcode/thinlto-function-summary.ll b/test/Bitcode/thinlto-function-summary.ll
index 594aaab566d1629e7e7150579dbfdcd0acecd16c..ff61b7713f0f49b239d902f27419ebc8f4197059 100644
--- a/test/Bitcode/thinlto-function-summary.ll
+++ b/test/Bitcode/thinlto-function-summary.ll
@@ -17,7 +17,7 @@
 ; BC-NEXT: <FNENTRY {{.*}} op0=4 {{.*}}> record string = 'variadic'
 ; BC-NEXT: <FNENTRY {{.*}} op0=1 {{.*}}> record string = 'foo'
 ; BC-NEXT: <FNENTRY {{.*}} op0=2 {{.*}}> record string = 'bar'
-; BC-NEXT: <FNENTRY {{.*}} op0=5 {{.*}}> record string = 'f'
+; BC-NEXT: <ENTRY {{.*}} op0=5 {{.*}}> record string = 'f'
 ; BC-NEXT: <ENTRY {{.*}} record string = 'h'
 ; BC-NEXT: <FNENTRY {{.*}} op0=3 {{.*}}> record string = 'anon.
 
diff --git a/test/Bitcode/thinlto-type-vcalls.ll b/test/Bitcode/thinlto-type-vcalls.ll
new file mode 100644
index 0000000000000000000000000000000000000000..40d229d121484cd40aaee716c822394bb650d9b3
--- /dev/null
+++ b/test/Bitcode/thinlto-type-vcalls.ll
@@ -0,0 +1,105 @@
+; RUN: opt -module-summary %s -o %t.o
+; RUN: llvm-bcanalyzer -dump %t.o | FileCheck %s
+; RUN: llvm-lto -thinlto -o %t2 %t.o
+; RUN: llvm-bcanalyzer -dump %t2.thinlto.bc | FileCheck --check-prefix=COMBINED %s
+
+target datalayout = "e-p:64:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+; COMBINED:      <TYPE_TEST_ASSUME_VCALLS op0=6699318081062747564 op1=16/>
+; COMBINED-NEXT: <COMBINED
+; COMBINED-NEXT: <TYPE_CHECKED_LOAD_VCALLS op0=6699318081062747564 op1=16/>
+; COMBINED-NEXT: <COMBINED
+; COMBINED-NEXT: <TYPE_TEST_ASSUME_VCALLS op0=6699318081062747564 op1=24 op2=-2012135647395072713 op3=32/>
+; COMBINED-NEXT: <COMBINED
+; COMBINED-NEXT: <TYPE_TEST_ASSUME_CONST_VCALL op0=6699318081062747564 op1=16 op2=42/>
+; COMBINED-NEXT: <TYPE_TEST_ASSUME_CONST_VCALL op0=6699318081062747564 op1=24 op2=43/>
+; COMBINED-NEXT: <COMBINED
+; COMBINED-NEXT: <TYPE_CHECKED_LOAD_CONST_VCALL op0=6699318081062747564 op1=16 op2=42/>
+; COMBINED-NEXT: <COMBINED
+; COMBINED-NEXT: <TYPE_TESTS op0=7546896869197086323/>
+; COMBINED-NEXT: <COMBINED
+
+; CHECK: <TYPE_TEST_ASSUME_VCALLS op0=6699318081062747564 op1=16/>
+define void @f1([3 x i8*]* %vtable) {
+  %vtablei8 = bitcast [3 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"foo")
+  call void @llvm.assume(i1 %p)
+  %fptrptr = getelementptr [3 x i8*], [3 x i8*]* %vtable, i32 0, i32 2
+  %fptr = load i8*, i8** %fptrptr
+  %fptr_casted = bitcast i8* %fptr to void (i8*, i32)*
+  call void %fptr_casted(i8* null, i32 undef)
+  ret void
+}
+
+; CHECK: <TYPE_TEST_ASSUME_VCALLS op0=6699318081062747564 op1=24 op2=-2012135647395072713 op3=32/>
+define void @f2([3 x i8*]* %vtable, [3 x i8*]* %vtable2) {
+  %vtablei8 = bitcast [3 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"foo")
+  call void @llvm.assume(i1 %p)
+  %fptrptr = getelementptr [3 x i8*], [3 x i8*]* %vtable, i32 0, i32 3
+  %fptr = load i8*, i8** %fptrptr
+  %fptr_casted = bitcast i8* %fptr to void (i8*, i32)*
+  call void %fptr_casted(i8* null, i32 undef)
+
+  %vtablei82 = bitcast [3 x i8*]* %vtable2 to i8*
+  %p2 = call i1 @llvm.type.test(i8* %vtablei82, metadata !"bar")
+  call void @llvm.assume(i1 %p2)
+  %fptrptr2 = getelementptr [3 x i8*], [3 x i8*]* %vtable2, i32 0, i32 4
+  %fptr2 = load i8*, i8** %fptrptr2
+  %fptr_casted2 = bitcast i8* %fptr2 to void (i8*, i128)*
+  call void %fptr_casted2(i8* null, i128 0)
+
+  ret void
+}
+
+; CHECK: <TYPE_CHECKED_LOAD_VCALLS op0=6699318081062747564 op1=16/>
+define void @f3(i8* %vtable) {
+  %pair = call {i8*, i1} @llvm.type.checked.load(i8* %vtable, i32 16, metadata !"foo")
+  %fptr = extractvalue {i8*, i1} %pair, 0
+  %fptr_casted = bitcast i8* %fptr to void (i8*, i32)*
+  call void %fptr_casted(i8* null, i32 undef)
+  ret void
+}
+
+; CHECK: <TYPE_TEST_ASSUME_CONST_VCALL op0=6699318081062747564 op1=16 op2=42/>
+; CHECK-NEXT: <TYPE_TEST_ASSUME_CONST_VCALL op0=6699318081062747564 op1=24 op2=43/>
+define void @f4([3 x i8*]* %vtable, [3 x i8*]* %vtable2) {
+  %vtablei8 = bitcast [3 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"foo")
+  call void @llvm.assume(i1 %p)
+  %fptrptr = getelementptr [3 x i8*], [3 x i8*]* %vtable, i32 0, i32 2
+  %fptr = load i8*, i8** %fptrptr
+  %fptr_casted = bitcast i8* %fptr to void (i8*, i32)*
+  call void %fptr_casted(i8* null, i32 42)
+
+  %vtablei82 = bitcast [3 x i8*]* %vtable2 to i8*
+  %p2 = call i1 @llvm.type.test(i8* %vtablei82, metadata !"foo")
+  call void @llvm.assume(i1 %p2)
+  %fptrptr2 = getelementptr [3 x i8*], [3 x i8*]* %vtable2, i32 0, i32 3
+  %fptr2 = load i8*, i8** %fptrptr2
+  %fptr_casted2 = bitcast i8* %fptr2 to void (i8*, i32)*
+  call void %fptr_casted2(i8* null, i32 43)
+  ret void
+}
+
+; CHECK: <TYPE_CHECKED_LOAD_CONST_VCALL op0=6699318081062747564 op1=16 op2=42/>
+define void @f5(i8* %vtable) {
+  %pair = call {i8*, i1} @llvm.type.checked.load(i8* %vtable, i32 16, metadata !"foo")
+  %fptr = extractvalue {i8*, i1} %pair, 0
+  %fptr_casted = bitcast i8* %fptr to void (i8*, i32)*
+  call void %fptr_casted(i8* null, i32 42)
+  ret void
+}
+
+; CHECK-NOT: <TYPE_CHECKED_LOAD_CONST_VCALL op0=7546896869197086323
+; CHECK: <TYPE_TESTS op0=7546896869197086323/>
+; CHECK-NOT: <TYPE_CHECKED_LOAD_CONST_VCALL op0=7546896869197086323
+define {i8*, i1} @f6(i8* %vtable) {
+  %pair = call {i8*, i1} @llvm.type.checked.load(i8* %vtable, i32 16, metadata !"baz")
+  ret {i8*, i1} %pair
+}
+
+declare i1 @llvm.type.test(i8*, metadata) nounwind readnone
+declare void @llvm.assume(i1)
+declare {i8*, i1} @llvm.type.checked.load(i8*, i32, metadata)
diff --git a/test/Bitcode/upgrade-debug-info-for-profiling.ll b/test/Bitcode/upgrade-debug-info-for-profiling.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d50f87fa6200a85f4b88515c48af10c5041bd11e
--- /dev/null
+++ b/test/Bitcode/upgrade-debug-info-for-profiling.ll
@@ -0,0 +1,10 @@
+; RUN: llvm-dis < %s.bc | FileCheck %s
+; RUN: verify-uselistorder < %s.bc
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+
+!llvm.dbg.cu = !{!1}
+; CHECK: DICompileUnit(language: DW_LANG_C99, file: !{{[0-9]+}}, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = distinct !DICompileUnit(language: DW_LANG_C99, file: !2, emissionKind: FullDebug)
+!2 = !DIFile(filename: "foo.c", directory: "/path/to/dir")
diff --git a/test/Bitcode/upgrade-debug-info-for-profiling.ll.bc b/test/Bitcode/upgrade-debug-info-for-profiling.ll.bc
new file mode 100644
index 0000000000000000000000000000000000000000..2bb54e9b3c556fa3550ac2e7cd41c8c16a0dba6b
Binary files /dev/null and b/test/Bitcode/upgrade-debug-info-for-profiling.ll.bc differ
diff --git a/test/Bitcode/upgrade-pointer-address-space.ll b/test/Bitcode/upgrade-pointer-address-space.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8b85055651d38b93441fa55114fcb8fb689d923f
--- /dev/null
+++ b/test/Bitcode/upgrade-pointer-address-space.ll
@@ -0,0 +1,5 @@
+; RUN: llvm-dis -o - %s.bc | FileCheck %s
+
+; CHECK-DAG: !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !{{[0-9]+}}, size: {{[0-9]+}})
+; CHECK-DAG: !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !{{[0-9]+}}, size: {{[0-9]+}})
+; CHECK-DAG: !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !{{[0-9]+}}, size: {{[0-9]+}})
diff --git a/test/Bitcode/upgrade-pointer-address-space.ll.bc b/test/Bitcode/upgrade-pointer-address-space.ll.bc
new file mode 100644
index 0000000000000000000000000000000000000000..0ad735e2430be8f5cf323e21d7944b3a66f236bc
Binary files /dev/null and b/test/Bitcode/upgrade-pointer-address-space.ll.bc differ
diff --git a/test/BugPoint/invalid-debuginfo.ll b/test/BugPoint/invalid-debuginfo.ll
index 91b01493d1f975265d30cf2af293a4b996b2c689..2005a13b67578de17d186fd7f4163c21bf3cec91 100644
--- a/test/BugPoint/invalid-debuginfo.ll
+++ b/test/BugPoint/invalid-debuginfo.ll
@@ -1,6 +1,6 @@
 ; RUN: bugpoint -load %llvmshlibdir/BugpointPasses%shlibext %s -output-prefix %t -bugpoint-crash-too-many-cus -silence-passes 2>&1 | FileCheck %s
 ; REQUIRES: loadable_module
-; CHECK: All DICompileUnits must be listed in llvm.dbg.cu
+; CHECK: DICompileUnit not listed in llvm.dbg.cu
 
 ; When bugpoint hacks at this testcase it will at one point create illegal IR
 ; that won't even pass the Verifier. A bugpoint *driver* built with assertions
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll b/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll
index 350ae0e4c0828145a165bc19466f16adee609412..59b9bb49f0ee0bc50ef774367f306c34db165985 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-callingconv.ll
@@ -80,3 +80,17 @@ define void @test_varargs() {
   call void(i32, double, i64, ...) @varargs(i32 42, double 1.0, i64 12, i8 3, i16 1, i32 4, float 1.0, double 2.0)
   ret void
 }
+
+; signext/zeroext parameters on the stack: not part of any real ABI as far as I
+; know, but ELF currently allocates 8 bytes for a signext parameter on the
+; stack. The ADJCALLSTACK ops should reflect this, even if the difference is
+; theoretical.
+declare void @stack_ext_needed([8 x i64], i8 signext %in)
+; CHECK-LABEL: name: test_stack_ext_needed
+; CHECK: ADJCALLSTACKDOWN 8
+; CHECK: BL @stack_ext_needed
+; CHECK: ADJCALLSTACKUP 8
+define void @test_stack_ext_needed() {
+  call void @stack_ext_needed([8 x i64] undef, i8 signext 42)
+  ret void
+}
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
index e4a2bb2dbf548494c50114d05729f42984994b33..e40199d82c9ddb5df8e97be4d9cdb96cfa094764 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-fallback.ll
@@ -1,6 +1,6 @@
 ; RUN: not llc -O0 -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefix=ERROR
 ; RUN: llc -O0 -global-isel -global-isel-abort=0 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefix=FALLBACK
-; RUN: llc -O0 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o %t.out 2> %t.err
+; RUN: llc -O0 -global-isel -global-isel-abort=2 -pass-remarks-missed='gisel*' -verify-machineinstrs %s -o %t.out 2> %t.err
 ; RUN: FileCheck %s --check-prefix=FALLBACK-WITH-REPORT-OUT < %t.out
 ; RUN: FileCheck %s --check-prefix=FALLBACK-WITH-REPORT-ERR < %t.err
 ; This file checks that the fallback path to selection dag works.
@@ -14,10 +14,11 @@ target triple = "aarch64--"
 
 ; We use __fixunstfti as the common denominator for __fixunstfti on Linux and
 ; ___fixunstfti on iOS
-; ERROR: Unable to lower arguments
+; ERROR: unable to lower arguments: i128 (i128)* (in function: ABIi128)
 ; FALLBACK: ldr q0,
 ; FALLBACK-NEXT: bl __fixunstfti
 ;
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to lower arguments: i128 (i128)* (in function: ABIi128)
 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for ABIi128
 ; FALLBACK-WITH-REPORT-OUT-LABEL: ABIi128:
 ; FALLBACK-WITH-REPORT-OUT: ldr q0,
@@ -31,6 +32,7 @@ define i128 @ABIi128(i128 %arg1) {
 ; It happens that we don't handle ConstantArray instances yet during
 ; translation. Any other constant would be fine too.
 
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to translate constant: [1 x double] (in function: constant)
 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for constant
 ; FALLBACK-WITH-REPORT-OUT-LABEL: constant:
 ; FALLBACK-WITH-REPORT-OUT: fmov d0, #1.0
@@ -41,6 +43,7 @@ define [1 x double] @constant() {
   ; The key problem here is that we may fail to create an MBB referenced by a
   ; PHI. If so, we cannot complete the G_PHI and mustn't try or bad things
   ; happen.
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: cannot select: G_STORE %vreg4, %vreg2; mem:ST4[%addr] GPR:%vreg4,%vreg2 (in function: pending_phis)
 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for pending_phis
 ; FALLBACK-WITH-REPORT-OUT-LABEL: pending_phis:
 define i32 @pending_phis(i1 %tst, i32 %val, i32* %addr) {
@@ -60,6 +63,7 @@ false:
 }
 
   ; General legalizer inability to handle types whose size wasn't a power of 2.
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg1<def>(s42) = G_LOAD %vreg0; mem:LD6[%addr](align=8) (in function: odd_type)
 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for odd_type
 ; FALLBACK-WITH-REPORT-OUT-LABEL: odd_type:
 define void @odd_type(i42* %addr) {
@@ -67,8 +71,17 @@ define void @odd_type(i42* %addr) {
   ret void
 }
 
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg1<def>(<7 x s32>) = G_LOAD %vreg0; mem:LD28[%addr](align=32) (in function: odd_vector)
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for odd_vector
+; FALLBACK-WITH-REPORT-OUT-LABEL: odd_vector:
+define void @odd_vector(<7 x i32>* %addr) {
+  %vec = load <7 x i32>, <7 x i32>* %addr
+  ret void
+}
+
   ; RegBankSelect crashed when given invalid mappings, and AArch64's
   ; implementation produce valid-but-nonsense mappings for G_SEQUENCE.
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to map instruction
 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for sequence_mapping
 ; FALLBACK-WITH-REPORT-OUT-LABEL: sequence_mapping:
 define void @sequence_mapping([2 x i64] %in) {
@@ -76,9 +89,68 @@ define void @sequence_mapping([2 x i64] %in) {
 }
 
   ; Legalizer was asserting when it enountered an unexpected default action.
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to map instruction
 ; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for legal_default
 ; FALLBACK-WITH-REPORT-LABEL: legal_default:
-define void @legal_default(i64 %in) {
-  insertvalue [2 x i64] undef, i64 %in, 0
+define void @legal_default([8 x i8] %in) {
+  insertvalue { [4 x i8], [8 x i8], [4 x i8] } undef, [8 x i8] %in, 1
+  ret void
+}
+
+  ; AArch64 was asserting instead of returning an invalid mapping for unknown
+  ; sizes.
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to translate instruction: ret: '  ret i128 undef' (in function: sequence_sizes)
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for sequence_sizes
+; FALLBACK-WITH-REPORT-LABEL: sequence_sizes:
+define i128 @sequence_sizes([8 x i8] %in) {
+  ret i128 undef
+}
+
+; Just to make sure we don't accidentally emit a normal load/store.
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: cannot select: %vreg2<def>(s64) = G_LOAD %vreg0; mem:LD8[%addr] GPR:%vreg2,%vreg0 (in function: atomic_ops)
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for atomic_ops
+; FALLBACK-WITH-REPORT-LABEL: atomic_ops:
+define i64 @atomic_ops(i64* %addr) {
+  store atomic i64 0, i64* %addr unordered, align 8
+  %res = load atomic i64, i64* %addr seq_cst, align 8
+  ret i64 %res
+}
+
+; Make sure we don't mess up metadata arguments.
+declare void @llvm.write_register.i64(metadata, i64)
+
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to translate instruction: call: ' call void @llvm.write_register.i64(metadata !0, i64 0)' (in function: test_write_register_intrin)
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for test_write_register_intrin
+; FALLBACK-WITH-REPORT-LABEL: test_write_register_intrin:
+define void @test_write_register_intrin() {
+  call void @llvm.write_register.i64(metadata !{!"sp"}, i64 0)
+  ret void
+}
+
+@_ZTIi = external global i8*
+declare i32 @__gxx_personality_v0(...)
+
+; Check that we fallback on invoke translation failures.
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to translate instruction: invoke: '  invoke void %callee(i128 0)
+; FALLBACK-WITH-REPORT-NEXT:   to label %continue unwind label %broken' (in function: invoke_weird_type)
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for invoke_weird_type
+; FALLBACK-WITH-REPORT-OUT-LABEL: invoke_weird_type:
+define void @invoke_weird_type(void(i128)* %callee) personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+  invoke void %callee(i128 0)
+    to label %continue unwind label %broken
+
+broken:
+  landingpad { i8*, i32 } catch i8* bitcast(i8** @_ZTIi to i8*)
+  ret void
+
+continue:
   ret void
 }
+
+; Check that we fallback on invoke translation failures.
+; FALLBACK-WITH-REPORT-ERR: remark: <unknown>:0:0: unable to legalize instruction: %vreg0<def>(s128) = G_FCONSTANT quad 2
+; FALLBACK-WITH-REPORT-ERR: warning: Instruction selection used fallback path for test_quad_dump
+; FALLBACK-WITH-REPORT-OUT-LABEL: test_quad_dump:
+define fp128 @test_quad_dump() {
+  ret fp128 0xL00000000000000004000000000000000
+}
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-instructionselect.mir b/test/CodeGen/AArch64/GlobalISel/arm64-instructionselect.mir
deleted file mode 100644
index 6b8a9900039247064f9632b8747da524cc3082cf..0000000000000000000000000000000000000000
--- a/test/CodeGen/AArch64/GlobalISel/arm64-instructionselect.mir
+++ /dev/null
@@ -1,2608 +0,0 @@
-# RUN: llc -O0 -mtriple=aarch64-apple-ios -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=IOS
-# RUN: llc -O0 -mtriple=aarch64-linux-gnu -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=LINUX-DEFAULT
-# RUN: llc -O0 -mtriple=aarch64-linux-gnu -relocation-model=pic -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=LINUX-PIC
-
-# Test the instruction selector.
-# As we support more instructions, we need to split this up.
-
---- |
-  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
-
-  define void @add_s32_gpr() { ret void }
-  define void @add_s64_gpr() { ret void }
-
-  define void @sub_s32_gpr() { ret void }
-  define void @sub_s64_gpr() { ret void }
-
-  define void @or_s32_gpr() { ret void }
-  define void @or_s64_gpr() { ret void }
-  define void @or_v2s32_fpr() { ret void }
-
-  define void @xor_s32_gpr() { ret void }
-  define void @xor_s64_gpr() { ret void }
-
-  define void @and_s32_gpr() { ret void }
-  define void @and_s64_gpr() { ret void }
-
-  define void @shl_s32_gpr() { ret void }
-  define void @shl_s64_gpr() { ret void }
-
-  define void @lshr_s32_gpr() { ret void }
-  define void @lshr_s64_gpr() { ret void }
-
-  define void @ashr_s32_gpr() { ret void }
-  define void @ashr_s64_gpr() { ret void }
-
-  define void @mul_s32_gpr() { ret void }
-  define void @mul_s64_gpr() { ret void }
-
-  define void @sdiv_s32_gpr() { ret void }
-  define void @sdiv_s64_gpr() { ret void }
-
-  define void @udiv_s32_gpr() { ret void }
-  define void @udiv_s64_gpr() { ret void }
-
-  define void @fadd_s32_gpr() { ret void }
-  define void @fadd_s64_gpr() { ret void }
-
-  define void @fsub_s32_gpr() { ret void }
-  define void @fsub_s64_gpr() { ret void }
-
-  define void @fmul_s32_gpr() { ret void }
-  define void @fmul_s64_gpr() { ret void }
-
-  define void @fdiv_s32_gpr() { ret void }
-  define void @fdiv_s64_gpr() { ret void }
-
-  define void @sitofp_s32_s32_fpr() { ret void }
-  define void @sitofp_s32_s64_fpr() { ret void }
-  define void @sitofp_s64_s32_fpr() { ret void }
-  define void @sitofp_s64_s64_fpr() { ret void }
-
-  define void @uitofp_s32_s32_fpr() { ret void }
-  define void @uitofp_s32_s64_fpr() { ret void }
-  define void @uitofp_s64_s32_fpr() { ret void }
-  define void @uitofp_s64_s64_fpr() { ret void }
-
-  define void @fptosi_s32_s32_gpr() { ret void }
-  define void @fptosi_s32_s64_gpr() { ret void }
-  define void @fptosi_s64_s32_gpr() { ret void }
-  define void @fptosi_s64_s64_gpr() { ret void }
-
-  define void @fptoui_s32_s32_gpr() { ret void }
-  define void @fptoui_s32_s64_gpr() { ret void }
-  define void @fptoui_s64_s32_gpr() { ret void }
-  define void @fptoui_s64_s64_gpr() { ret void }
-
-  define void @fptrunc() { ret void }
-  define void @fpext() { ret void }
-
-  define void @unconditional_br() { ret void }
-  define void @conditional_br() { ret void }
-  define void @indirect_br() { ret void }
-
-  define void @load_s64_gpr(i64* %addr) { ret void }
-  define void @load_s32_gpr(i32* %addr) { ret void }
-  define void @load_s16_gpr(i16* %addr) { ret void }
-  define void @load_s8_gpr(i8* %addr) { ret void }
-  define void @load_s64_fpr(i64* %addr) { ret void }
-  define void @load_s32_fpr(i32* %addr) { ret void }
-  define void @load_s16_fpr(i16* %addr) { ret void }
-  define void @load_s8_fpr(i8* %addr) { ret void }
-
-  define void @store_s64_gpr(i64* %addr) { ret void }
-  define void @store_s32_gpr(i32* %addr) { ret void }
-  define void @store_s16_gpr(i16* %addr) { ret void }
-  define void @store_s8_gpr(i8* %addr) { ret void }
-  define void @store_s64_fpr(i64* %addr) { ret void }
-  define void @store_s32_fpr(i32* %addr) { ret void }
-
-  define void @frame_index() {
-    %ptr0 = alloca i64
-    ret void
-  }
-
-  define void @selected_property() { ret void }
-
-  define i32 @const_s32() { ret i32 42 }
-  define i64 @const_s64() { ret i64 1234567890123 }
-
-  define i32 @fconst_s32() { ret i32 42 }
-  define i64 @fconst_s64() { ret i64 1234567890123 }
-
-  define i8* @gep(i8* %in) { ret i8* undef }
-
-  @var_local = global i8 0
-  define i8* @global_local() { ret i8* undef }
-
-  @var_got = external global i8
-  define i8* @global_got() { ret i8* undef }
-
-  define void @trunc() { ret void }
-
-  define void @anyext_gpr() { ret void }
-  define void @zext_gpr() { ret void }
-  define void @sext_gpr() { ret void }
-
-  define void @casts() { ret void }
-
-  define void @bitcast_s32_gpr() { ret void }
-  define void @bitcast_s32_fpr() { ret void }
-  define void @bitcast_s32_gpr_fpr() { ret void }
-  define void @bitcast_s32_fpr_gpr() { ret void }
-  define void @bitcast_s64_gpr() { ret void }
-  define void @bitcast_s64_fpr() { ret void }
-  define void @bitcast_s64_gpr_fpr() { ret void }
-  define void @bitcast_s64_fpr_gpr() { ret void }
-
-  define void @icmp() { ret void }
-  define void @fcmp() { ret void }
-
-  define void @phi() { ret void }
-
-  define void @select() { ret void }
-...
-
----
-# Check that we select a 32-bit GPR G_ADD into ADDWrr on GPR32.
-# Also check that we constrain the register class of the COPY to GPR32.
-# CHECK-LABEL: name: add_s32_gpr
-name:            add_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-# CHECK-NEXT:  - { id: 2, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %1 = COPY %w1
-# CHECK:    %2 = ADDWrr %0, %1
-body:             |
-  bb.0:
-    liveins: %w0, %w1
-
-    %0(s32) = COPY %w0
-    %1(s32) = COPY %w1
-    %2(s32) = G_ADD %0, %1
-...
-
----
-# Same as add_s32_gpr, for 64-bit operations.
-# CHECK-LABEL: name: add_s64_gpr
-name:            add_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64 }
-# CHECK-NEXT:  - { id: 1, class: gpr64 }
-# CHECK-NEXT:  - { id: 2, class: gpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %x0
-# CHECK:    %1 = COPY %x1
-# CHECK:    %2 = ADDXrr %0, %1
-body:             |
-  bb.0:
-    liveins: %x0, %x1
-
-    %0(s64) = COPY %x0
-    %1(s64) = COPY %x1
-    %2(s64) = G_ADD %0, %1
-...
-
----
-# Same as add_s32_gpr, for G_SUB operations.
-# CHECK-LABEL: name: sub_s32_gpr
-name:            sub_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-# CHECK-NEXT:  - { id: 2, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %1 = COPY %w1
-# CHECK:    %2 = SUBSWrr %0, %1
-body:             |
-  bb.0:
-    liveins: %w0, %w1
-
-    %0(s32) = COPY %w0
-    %1(s32) = COPY %w1
-    %2(s32) = G_SUB %0, %1
-...
-
----
-# Same as add_s64_gpr, for G_SUB operations.
-# CHECK-LABEL: name: sub_s64_gpr
-name:            sub_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64 }
-# CHECK-NEXT:  - { id: 1, class: gpr64 }
-# CHECK-NEXT:  - { id: 2, class: gpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %x0
-# CHECK:    %1 = COPY %x1
-# CHECK:    %2 = SUBSXrr %0, %1
-body:             |
-  bb.0:
-    liveins: %x0, %x1
-
-    %0(s64) = COPY %x0
-    %1(s64) = COPY %x1
-    %2(s64) = G_SUB %0, %1
-...
-
----
-# Same as add_s32_gpr, for G_OR operations.
-# CHECK-LABEL: name: or_s32_gpr
-name:            or_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-# CHECK-NEXT:  - { id: 2, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %1 = COPY %w1
-# CHECK:    %2 = ORRWrr %0, %1
-body:             |
-  bb.0:
-    liveins: %w0, %w1
-
-    %0(s32) = COPY %w0
-    %1(s32) = COPY %w1
-    %2(s32) = G_OR %0, %1
-...
-
----
-# Same as add_s64_gpr, for G_OR operations.
-# CHECK-LABEL: name: or_s64_gpr
-name:            or_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64 }
-# CHECK-NEXT:  - { id: 1, class: gpr64 }
-# CHECK-NEXT:  - { id: 2, class: gpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %x0
-# CHECK:    %1 = COPY %x1
-# CHECK:    %2 = ORRXrr %0, %1
-body:             |
-  bb.0:
-    liveins: %x0, %x1
-
-    %0(s64) = COPY %x0
-    %1(s64) = COPY %x1
-    %2(s64) = G_OR %0, %1
-...
-
----
-# 64-bit G_OR on vector registers.
-# CHECK-LABEL: name: or_v2s32_fpr
-name:            or_v2s32_fpr
-legalized:       true
-regBankSelected: true
-#
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr64 }
-# CHECK-NEXT:  - { id: 1, class: fpr64 }
-# CHECK-NEXT:  - { id: 2, class: fpr64 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-  - { id: 2, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %d0
-# CHECK:    %1 = COPY %d1
-# The actual OR does not matter as long as it is operating
-# on 64-bit width vector.
-# CHECK:    %2 = ORRv8i8 %0, %1
-body:             |
-  bb.0:
-    liveins: %d0, %d1
-
-      %0(<2 x s32>) = COPY %d0
-      %1(<2 x s32>) = COPY %d1
-      %2(<2 x s32>) = G_OR %0, %1
-...
-
----
-# Same as add_s32_gpr, for G_XOR operations.
-# CHECK-LABEL: name: xor_s32_gpr
-name:            xor_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-# CHECK-NEXT:  - { id: 2, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %1 = COPY %w1
-# CHECK:    %2 = EORWrr %0, %1
-body:             |
-  bb.0:
-    liveins: %w0, %w1
-
-    %0(s32) = COPY %w0
-    %1(s32) = COPY %w1
-    %2(s32) = G_XOR %0, %1
-...
-
----
-# Same as add_s64_gpr, for G_XOR operations.
-# CHECK-LABEL: name: xor_s64_gpr
-name:            xor_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64 }
-# CHECK-NEXT:  - { id: 1, class: gpr64 }
-# CHECK-NEXT:  - { id: 2, class: gpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %x0
-# CHECK:    %1 = COPY %x1
-# CHECK:    %2 = EORXrr %0, %1
-body:             |
-  bb.0:
-    liveins: %x0, %x1
-
-    %0(s64) = COPY %x0
-    %1(s64) = COPY %x1
-    %2(s64) = G_XOR %0, %1
-...
-
----
-# Same as add_s32_gpr, for G_AND operations.
-# CHECK-LABEL: name: and_s32_gpr
-name:            and_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-# CHECK-NEXT:  - { id: 2, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %1 = COPY %w1
-# CHECK:    %2 = ANDWrr %0, %1
-body:             |
-  bb.0:
-    liveins: %w0, %w1
-
-    %0(s32) = COPY %w0
-    %1(s32) = COPY %w1
-    %2(s32) = G_AND %0, %1
-...
-
----
-# Same as add_s64_gpr, for G_AND operations.
-# CHECK-LABEL: name: and_s64_gpr
-name:            and_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64 }
-# CHECK-NEXT:  - { id: 1, class: gpr64 }
-# CHECK-NEXT:  - { id: 2, class: gpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %x0
-# CHECK:    %1 = COPY %x1
-# CHECK:    %2 = ANDXrr %0, %1
-body:             |
-  bb.0:
-    liveins: %x0, %x1
-
-    %0(s64) = COPY %x0
-    %1(s64) = COPY %x1
-    %2(s64) = G_AND %0, %1
-...
-
----
-# Same as add_s32_gpr, for G_SHL operations.
-# CHECK-LABEL: name: shl_s32_gpr
-name:            shl_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-# CHECK-NEXT:  - { id: 2, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %1 = COPY %w1
-# CHECK:    %2 = LSLVWr %0, %1
-body:             |
-  bb.0:
-    liveins: %w0, %w1
-
-    %0(s32) = COPY %w0
-    %1(s32) = COPY %w1
-    %2(s32) = G_SHL %0, %1
-...
-
----
-# Same as add_s64_gpr, for G_SHL operations.
-# CHECK-LABEL: name: shl_s64_gpr
-name:            shl_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64 }
-# CHECK-NEXT:  - { id: 1, class: gpr64 }
-# CHECK-NEXT:  - { id: 2, class: gpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %x0
-# CHECK:    %1 = COPY %x1
-# CHECK:    %2 = LSLVXr %0, %1
-body:             |
-  bb.0:
-    liveins: %x0, %x1
-
-    %0(s64) = COPY %x0
-    %1(s64) = COPY %x1
-    %2(s64) = G_SHL %0, %1
-...
-
----
-# Same as add_s32_gpr, for G_LSHR operations.
-# CHECK-LABEL: name: lshr_s32_gpr
-name:            lshr_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-# CHECK-NEXT:  - { id: 2, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %1 = COPY %w1
-# CHECK:    %2 = LSRVWr %0, %1
-body:             |
-  bb.0:
-    liveins: %w0, %w1
-
-    %0(s32) = COPY %w0
-    %1(s32) = COPY %w1
-    %2(s32) = G_LSHR %0, %1
-...
-
----
-# Same as add_s64_gpr, for G_LSHR operations.
-# CHECK-LABEL: name: lshr_s64_gpr
-name:            lshr_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64 }
-# CHECK-NEXT:  - { id: 1, class: gpr64 }
-# CHECK-NEXT:  - { id: 2, class: gpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %x0
-# CHECK:    %1 = COPY %x1
-# CHECK:    %2 = LSRVXr %0, %1
-body:             |
-  bb.0:
-    liveins: %x0, %x1
-
-    %0(s64) = COPY %x0
-    %1(s64) = COPY %x1
-    %2(s64) = G_LSHR %0, %1
-...
-
----
-# Same as add_s32_gpr, for G_ASHR operations.
-# CHECK-LABEL: name: ashr_s32_gpr
-name:            ashr_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-# CHECK-NEXT:  - { id: 2, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %1 = COPY %w1
-# CHECK:    %2 = ASRVWr %0, %1
-body:             |
-  bb.0:
-    liveins: %w0, %w1
-
-    %0(s32) = COPY %w0
-    %1(s32) = COPY %w1
-    %2(s32) = G_ASHR %0, %1
-...
-
----
-# Same as add_s64_gpr, for G_ASHR operations.
-# CHECK-LABEL: name: ashr_s64_gpr
-name:            ashr_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64 }
-# CHECK-NEXT:  - { id: 1, class: gpr64 }
-# CHECK-NEXT:  - { id: 2, class: gpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %x0
-# CHECK:    %1 = COPY %x1
-# CHECK:    %2 = ASRVXr %0, %1
-body:             |
-  bb.0:
-    liveins: %x0, %x1
-
-    %0(s64) = COPY %x0
-    %1(s64) = COPY %x1
-    %2(s64) = G_ASHR %0, %1
-...
-
----
-# Check that we select s32 GPR G_MUL. This is trickier than other binops because
-# there is only MADDWrrr, and we have to use the WZR physreg.
-# CHECK-LABEL: name: mul_s32_gpr
-name:            mul_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-# CHECK-NEXT:  - { id: 2, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %1 = COPY %w1
-# CHECK:    %2 = MADDWrrr %0, %1, %wzr
-body:             |
-  bb.0:
-    liveins: %w0, %w1
-
-    %0(s32) = COPY %w0
-    %1(s32) = COPY %w1
-    %2(s32) = G_MUL %0, %1
-...
-
----
-# Same as mul_s32_gpr for the s64 type.
-# CHECK-LABEL: name: mul_s64_gpr
-name:            mul_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64 }
-# CHECK-NEXT:  - { id: 1, class: gpr64 }
-# CHECK-NEXT:  - { id: 2, class: gpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %x0
-# CHECK:    %1 = COPY %x1
-# CHECK:    %2 = MADDXrrr %0, %1, %xzr
-body:             |
-  bb.0:
-    liveins: %x0, %x1
-
-    %0(s64) = COPY %x0
-    %1(s64) = COPY %x1
-    %2(s64) = G_MUL %0, %1
-...
-
----
-# Same as add_s32_gpr, for G_SDIV operations.
-# CHECK-LABEL: name: sdiv_s32_gpr
-name:            sdiv_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-# CHECK-NEXT:  - { id: 2, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %1 = COPY %w1
-# CHECK:    %2 = SDIVWr %0, %1
-body:             |
-  bb.0:
-    liveins: %w0, %w1
-
-    %0(s32) = COPY %w0
-    %1(s32) = COPY %w1
-    %2(s32) = G_SDIV %0, %1
-...
-
----
-# Same as add_s64_gpr, for G_SDIV operations.
-# CHECK-LABEL: name: sdiv_s64_gpr
-name:            sdiv_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64 }
-# CHECK-NEXT:  - { id: 1, class: gpr64 }
-# CHECK-NEXT:  - { id: 2, class: gpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %x0
-# CHECK:    %1 = COPY %x1
-# CHECK:    %2 = SDIVXr %0, %1
-body:             |
-  bb.0:
-    liveins: %x0, %x1
-
-    %0(s64) = COPY %x0
-    %1(s64) = COPY %x1
-    %2(s64) = G_SDIV %0, %1
-...
-
----
-# Same as add_s32_gpr, for G_UDIV operations.
-# CHECK-LABEL: name: udiv_s32_gpr
-name:            udiv_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-# CHECK-NEXT:  - { id: 2, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %1 = COPY %w1
-# CHECK:    %2 = UDIVWr %0, %1
-body:             |
-  bb.0:
-    liveins: %w0, %w1
-
-    %0(s32) = COPY %w0
-    %1(s32) = COPY %w1
-    %2(s32) = G_UDIV %0, %1
-...
-
----
-# Same as add_s64_gpr, for G_UDIV operations.
-# CHECK-LABEL: name: udiv_s64_gpr
-name:            udiv_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64 }
-# CHECK-NEXT:  - { id: 1, class: gpr64 }
-# CHECK-NEXT:  - { id: 2, class: gpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %x0
-# CHECK:    %1 = COPY %x1
-# CHECK:    %2 = UDIVXr %0, %1
-body:             |
-  bb.0:
-    liveins: %x0, %x1
-
-    %0(s64) = COPY %x0
-    %1(s64) = COPY %x1
-    %2(s64) = G_UDIV %0, %1
-...
-
----
-# Check that we select a s32 FPR G_FADD into FADDSrr.
-# CHECK-LABEL: name: fadd_s32_gpr
-name:            fadd_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr32 }
-# CHECK-NEXT:  - { id: 1, class: fpr32 }
-# CHECK-NEXT:  - { id: 2, class: fpr32 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-  - { id: 2, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %s0
-# CHECK:    %1 = COPY %s1
-# CHECK:    %2 = FADDSrr %0, %1
-body:             |
-  bb.0:
-    liveins: %s0, %s1
-
-    %0(s32) = COPY %s0
-    %1(s32) = COPY %s1
-    %2(s32) = G_FADD %0, %1
-...
-
----
-# CHECK-LABEL: name: fadd_s64_gpr
-name:            fadd_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr64 }
-# CHECK-NEXT:  - { id: 1, class: fpr64 }
-# CHECK-NEXT:  - { id: 2, class: fpr64 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-  - { id: 2, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %d0
-# CHECK:    %1 = COPY %d1
-# CHECK:    %2 = FADDDrr %0, %1
-body:             |
-  bb.0:
-    liveins: %d0, %d1
-
-    %0(s64) = COPY %d0
-    %1(s64) = COPY %d1
-    %2(s64) = G_FADD %0, %1
-...
-
----
-# CHECK-LABEL: name: fsub_s32_gpr
-name:            fsub_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr32 }
-# CHECK-NEXT:  - { id: 1, class: fpr32 }
-# CHECK-NEXT:  - { id: 2, class: fpr32 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-  - { id: 2, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %s0
-# CHECK:    %1 = COPY %s1
-# CHECK:    %2 = FSUBSrr %0, %1
-body:             |
-  bb.0:
-    liveins: %s0, %s1
-
-    %0(s32) = COPY %s0
-    %1(s32) = COPY %s1
-    %2(s32) = G_FSUB %0, %1
-...
-
----
-# CHECK-LABEL: name: fsub_s64_gpr
-name:            fsub_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr64 }
-# CHECK-NEXT:  - { id: 1, class: fpr64 }
-# CHECK-NEXT:  - { id: 2, class: fpr64 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-  - { id: 2, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %d0
-# CHECK:    %1 = COPY %d1
-# CHECK:    %2 = FSUBDrr %0, %1
-body:             |
-  bb.0:
-    liveins: %d0, %d1
-
-    %0(s64) = COPY %d0
-    %1(s64) = COPY %d1
-    %2(s64) = G_FSUB %0, %1
-...
-
----
-# CHECK-LABEL: name: fmul_s32_gpr
-name:            fmul_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr32 }
-# CHECK-NEXT:  - { id: 1, class: fpr32 }
-# CHECK-NEXT:  - { id: 2, class: fpr32 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-  - { id: 2, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %s0
-# CHECK:    %1 = COPY %s1
-# CHECK:    %2 = FMULSrr %0, %1
-body:             |
-  bb.0:
-    liveins: %s0, %s1
-
-    %0(s32) = COPY %s0
-    %1(s32) = COPY %s1
-    %2(s32) = G_FMUL %0, %1
-...
-
----
-# CHECK-LABEL: name: fmul_s64_gpr
-name:            fmul_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr64 }
-# CHECK-NEXT:  - { id: 1, class: fpr64 }
-# CHECK-NEXT:  - { id: 2, class: fpr64 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-  - { id: 2, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %d0
-# CHECK:    %1 = COPY %d1
-# CHECK:    %2 = FMULDrr %0, %1
-body:             |
-  bb.0:
-    liveins: %d0, %d1
-
-    %0(s64) = COPY %d0
-    %1(s64) = COPY %d1
-    %2(s64) = G_FMUL %0, %1
-...
-
----
-# CHECK-LABEL: name: fdiv_s32_gpr
-name:            fdiv_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr32 }
-# CHECK-NEXT:  - { id: 1, class: fpr32 }
-# CHECK-NEXT:  - { id: 2, class: fpr32 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-  - { id: 2, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %s0
-# CHECK:    %1 = COPY %s1
-# CHECK:    %2 = FDIVSrr %0, %1
-body:             |
-  bb.0:
-    liveins: %s0, %s1
-
-    %0(s32) = COPY %s0
-    %1(s32) = COPY %s1
-    %2(s32) = G_FDIV %0, %1
-...
-
----
-# CHECK-LABEL: name: fdiv_s64_gpr
-name:            fdiv_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr64 }
-# CHECK-NEXT:  - { id: 1, class: fpr64 }
-# CHECK-NEXT:  - { id: 2, class: fpr64 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-  - { id: 2, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %d0
-# CHECK:    %1 = COPY %d1
-# CHECK:    %2 = FDIVDrr %0, %1
-body:             |
-  bb.0:
-    liveins: %d0, %d1
-
-    %0(s64) = COPY %d0
-    %1(s64) = COPY %d1
-    %2(s64) = G_FDIV %0, %1
-...
-
----
-# CHECK-LABEL: name: sitofp_s32_s32_fpr
-name:            sitofp_s32_s32_fpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: fpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %1 = SCVTFUWSri %0
-body:             |
-  bb.0:
-    liveins: %w0
-
-    %0(s32) = COPY %w0
-    %1(s32) = G_SITOFP %0
-...
-
----
-# CHECK-LABEL: name: sitofp_s32_s64_fpr
-name:            sitofp_s32_s64_fpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64 }
-# CHECK-NEXT:  - { id: 1, class: fpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %x0
-# CHECK:    %1 = SCVTFUXSri %0
-body:             |
-  bb.0:
-    liveins: %x0
-
-    %0(s64) = COPY %x0
-    %1(s32) = G_SITOFP %0
-...
-
----
-# CHECK-LABEL: name: sitofp_s64_s32_fpr
-name:            sitofp_s64_s32_fpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: fpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %1 = SCVTFUWDri %0
-body:             |
-  bb.0:
-    liveins: %w0
-
-    %0(s32) = COPY %w0
-    %1(s64) = G_SITOFP %0
-...
-
----
-# CHECK-LABEL: name: sitofp_s64_s64_fpr
-name:            sitofp_s64_s64_fpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64 }
-# CHECK-NEXT:  - { id: 1, class: fpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %x0
-# CHECK:    %1 = SCVTFUXDri %0
-body:             |
-  bb.0:
-    liveins: %x0
-
-    %0(s64) = COPY %x0
-    %1(s64) = G_SITOFP %0
-...
-
----
-# CHECK-LABEL: name: uitofp_s32_s32_fpr
-name:            uitofp_s32_s32_fpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: fpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %1 = UCVTFUWSri %0
-body:             |
-  bb.0:
-    liveins: %w0
-
-    %0(s32) = COPY %w0
-    %1(s32) = G_UITOFP %0
-...
-
----
-# CHECK-LABEL: name: uitofp_s32_s64_fpr
-name:            uitofp_s32_s64_fpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64 }
-# CHECK-NEXT:  - { id: 1, class: fpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %x0
-# CHECK:    %1 = UCVTFUXSri %0
-body:             |
-  bb.0:
-    liveins: %x0
-
-    %0(s64) = COPY %x0
-    %1(s32) = G_UITOFP %0
-...
-
----
-# CHECK-LABEL: name: uitofp_s64_s32_fpr
-name:            uitofp_s64_s32_fpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: fpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %1 = UCVTFUWDri %0
-body:             |
-  bb.0:
-    liveins: %w0
-
-    %0(s32) = COPY %w0
-    %1(s64) = G_UITOFP %0
-...
-
----
-# CHECK-LABEL: name: uitofp_s64_s64_fpr
-name:            uitofp_s64_s64_fpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64 }
-# CHECK-NEXT:  - { id: 1, class: fpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %x0
-# CHECK:    %1 = UCVTFUXDri %0
-body:             |
-  bb.0:
-    liveins: %x0
-
-    %0(s64) = COPY %x0
-    %1(s64) = G_UITOFP %0
-...
-
----
-# CHECK-LABEL: name: fptosi_s32_s32_gpr
-name:            fptosi_s32_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %s0
-# CHECK:    %1 = FCVTZSUWSr %0
-body:             |
-  bb.0:
-    liveins: %s0
-
-    %0(s32) = COPY %s0
-    %1(s32) = G_FPTOSI %0
-...
-
----
-# CHECK-LABEL: name: fptosi_s32_s64_gpr
-name:            fptosi_s32_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr64 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %d0
-# CHECK:    %1 = FCVTZSUWDr %0
-body:             |
-  bb.0:
-    liveins: %d0
-
-    %0(s64) = COPY %d0
-    %1(s32) = G_FPTOSI %0
-...
-
----
-# CHECK-LABEL: name: fptosi_s64_s32_gpr
-name:            fptosi_s64_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr64 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %s0
-# CHECK:    %1 = FCVTZSUXSr %0
-body:             |
-  bb.0:
-    liveins: %s0
-
-    %0(s32) = COPY %s0
-    %1(s64) = G_FPTOSI %0
-...
-
----
-# CHECK-LABEL: name: fptosi_s64_s64_gpr
-name:            fptosi_s64_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr64 }
-# CHECK-NEXT:  - { id: 1, class: gpr64 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %d0
-# CHECK:    %1 = FCVTZSUXDr %0
-body:             |
-  bb.0:
-    liveins: %d0
-
-    %0(s64) = COPY %d0
-    %1(s64) = G_FPTOSI %0
-...
-
----
-# CHECK-LABEL: name: fptoui_s32_s32_gpr
-name:            fptoui_s32_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %s0
-# CHECK:    %1 = FCVTZUUWSr %0
-body:             |
-  bb.0:
-    liveins: %s0
-
-    %0(s32) = COPY %s0
-    %1(s32) = G_FPTOUI %0
-...
-
----
-# CHECK-LABEL: name: fptoui_s32_s64_gpr
-name:            fptoui_s32_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr64 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %d0
-# CHECK:    %1 = FCVTZUUWDr %0
-body:             |
-  bb.0:
-    liveins: %d0
-
-    %0(s64) = COPY %d0
-    %1(s32) = G_FPTOUI %0
-...
-
----
-# CHECK-LABEL: name: fptoui_s64_s32_gpr
-name:            fptoui_s64_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr64 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %s0
-# CHECK:    %1 = FCVTZUUXSr %0
-body:             |
-  bb.0:
-    liveins: %s0
-
-    %0(s32) = COPY %s0
-    %1(s64) = G_FPTOUI %0
-...
-
----
-# CHECK-LABEL: name: fptoui_s64_s64_gpr
-name:            fptoui_s64_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr64 }
-# CHECK-NEXT:  - { id: 1, class: gpr64 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %d0
-# CHECK:    %1 = FCVTZUUXDr %0
-body:             |
-  bb.0:
-    liveins: %d0
-
-    %0(s64) = COPY %d0
-    %1(s64) = G_FPTOUI %0
-...
-
----
-# CHECK-LABEL: name: fptrunc
-name:            fptrunc
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK: - { id: 0, class: fpr64 }
-# CHECK: - { id: 1, class: fpr32 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %d0
-# CHECK:    %1 = FCVTSDr %0
-body:             |
-  bb.0:
-    liveins: %d0
-
-    %0(s64) = COPY %d0
-    %1(s32) = G_FPTRUNC %0
-...
-
----
-# CHECK-LABEL: name: fpext
-name:            fpext
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK: - { id: 0, class: fpr32 }
-# CHECK: - { id: 1, class: fpr64 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %s0
-# CHECK:    %1 = FCVTDSr %0
-body:             |
-  bb.0:
-    liveins: %d0
-
-    %0(s32) = COPY %s0
-    %1(s64) = G_FPEXT %0
-...
-
----
-# CHECK-LABEL: name: unconditional_br
-name:            unconditional_br
-legalized:       true
-regBankSelected: true
-
-# CHECK:  body:
-# CHECK:   bb.0:
-# CHECK:    successors: %bb.0
-# CHECK:    B %bb.0
-body:             |
-  bb.0:
-    successors: %bb.0
-
-    G_BR %bb.0
-...
-
----
-# CHECK-LABEL: name: conditional_br
-name:            conditional_br
-legalized:       true
-regBankSelected: true
-
-registers:
-  - { id: 0, class: gpr }
-
-# CHECK:  body:
-# CHECK:   bb.0:
-# CHECK:    TBNZW %0, 0, %bb.1
-# CHECK:    B %bb.0
-body:             |
-  bb.0:
-    successors: %bb.0, %bb.1
-    %0(s1) = COPY %w0
-    G_BRCOND %0(s1), %bb.1
-    G_BR %bb.0
-
-  bb.1:
-...
-
----
-# CHECK-LABEL: name: indirect_br
-name:            indirect_br
-legalized:       true
-regBankSelected: true
-
-registers:
-  - { id: 0, class: gpr }
-
-# CHECK:  body:
-# CHECK:   bb.0:
-# CHECK:    %0 = COPY %x0
-# CHECK:    BR %0
-body:             |
-  bb.0:
-    successors: %bb.0, %bb.1
-    %0(p0) = COPY %x0
-    G_BRINDIRECT %0(p0)
-
-  bb.1:
-...
-
----
-# CHECK-LABEL: name: load_s64_gpr
-name:            load_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64sp }
-# CHECK-NEXT:  - { id: 1, class: gpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK: %0 = COPY %x0
-# CHECK: %1 = LDRXui %0, 0 :: (load 8 from %ir.addr)
-body:             |
-  bb.0:
-    liveins: %x0
-
-    %0(p0) = COPY %x0
-    %1(s64) = G_LOAD  %0 :: (load 8 from %ir.addr)
-
-...
-
----
-# CHECK-LABEL: name: load_s32_gpr
-name:            load_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64sp }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK: %0 = COPY %x0
-# CHECK: %1 = LDRWui %0, 0 :: (load 4 from %ir.addr)
-body:             |
-  bb.0:
-    liveins: %x0
-
-    %0(p0) = COPY %x0
-    %1(s32) = G_LOAD  %0 :: (load 4 from %ir.addr)
-
-...
-
----
-# CHECK-LABEL: name: load_s16_gpr
-name:            load_s16_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64sp }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK: %0 = COPY %x0
-# CHECK: %1 = LDRHHui %0, 0 :: (load 2 from %ir.addr)
-body:             |
-  bb.0:
-    liveins: %x0
-
-    %0(p0) = COPY %x0
-    %1(s16) = G_LOAD  %0 :: (load 2 from %ir.addr)
-
-...
-
----
-# CHECK-LABEL: name: load_s8_gpr
-name:            load_s8_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64sp }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK: %0 = COPY %x0
-# CHECK: %1 = LDRBBui %0, 0 :: (load 1 from %ir.addr)
-body:             |
-  bb.0:
-    liveins: %x0
-
-    %0(p0) = COPY %x0
-    %1(s8) = G_LOAD  %0 :: (load 1 from %ir.addr)
-
-...
-
----
-# CHECK-LABEL: name: load_s64_fpr
-name:            load_s64_fpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64sp }
-# CHECK-NEXT:  - { id: 1, class: fpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: fpr }
-
-# CHECK:  body:
-# CHECK: %0 = COPY %x0
-# CHECK: %1 = LDRDui %0, 0 :: (load 8 from %ir.addr)
-body:             |
-  bb.0:
-    liveins: %x0
-
-    %0(p0) = COPY %x0
-    %1(s64) = G_LOAD  %0 :: (load 8 from %ir.addr)
-
-...
-
----
-# CHECK-LABEL: name: load_s32_fpr
-name:            load_s32_fpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64sp }
-# CHECK-NEXT:  - { id: 1, class: fpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: fpr }
-
-# CHECK:  body:
-# CHECK: %0 = COPY %x0
-# CHECK: %1 = LDRSui %0, 0 :: (load 4 from %ir.addr)
-body:             |
-  bb.0:
-    liveins: %x0
-
-    %0(p0) = COPY %x0
-    %1(s32) = G_LOAD  %0 :: (load 4 from %ir.addr)
-
-...
-
----
-# CHECK-LABEL: name: load_s16_fpr
-name:            load_s16_fpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64sp }
-# CHECK-NEXT:  - { id: 1, class: fpr16 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: fpr }
-
-# CHECK:  body:
-# CHECK: %0 = COPY %x0
-# CHECK: %1 = LDRHui %0, 0 :: (load 2 from %ir.addr)
-body:             |
-  bb.0:
-    liveins: %x0
-
-    %0(p0) = COPY %x0
-    %1(s16) = G_LOAD  %0 :: (load 2 from %ir.addr)
-
-...
-
----
-# CHECK-LABEL: name: load_s8_fpr
-name:            load_s8_fpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64sp }
-# CHECK-NEXT:  - { id: 1, class: fpr8 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: fpr }
-
-# CHECK:  body:
-# CHECK: %0 = COPY %x0
-# CHECK: %1 = LDRBui %0, 0 :: (load 1 from %ir.addr)
-body:             |
-  bb.0:
-    liveins: %x0
-
-    %0(p0) = COPY %x0
-    %1(s8) = G_LOAD  %0 :: (load 1 from %ir.addr)
-
-...
-
----
-# CHECK-LABEL: name: store_s64_gpr
-name:            store_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64sp }
-# CHECK-NEXT:  - { id: 1, class: gpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK: %0 = COPY %x0
-# CHECK: %1 = COPY %x1
-# CHECK: STRXui %1, %0, 0 :: (store 8 into %ir.addr)
-body:             |
-  bb.0:
-    liveins: %x0, %x1
-
-    %0(p0) = COPY %x0
-    %1(s64) = COPY %x1
-    G_STORE  %1, %0 :: (store 8 into %ir.addr)
-
-...
-
----
-# CHECK-LABEL: name: store_s32_gpr
-name:            store_s32_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64sp }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK: %0 = COPY %x0
-# CHECK: %1 = COPY %w1
-# CHECK: STRWui %1, %0, 0 :: (store 4 into %ir.addr)
-body:             |
-  bb.0:
-    liveins: %x0, %w1
-
-    %0(p0) = COPY %x0
-    %1(s32) = COPY %w1
-    G_STORE  %1, %0 :: (store 4 into %ir.addr)
-
-...
-
----
-# CHECK-LABEL: name: store_s16_gpr
-name:            store_s16_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64sp }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK: %0 = COPY %x0
-# CHECK: %1 = COPY %w1
-# CHECK: STRHHui %1, %0, 0 :: (store 2 into %ir.addr)
-body:             |
-  bb.0:
-    liveins: %x0, %w1
-
-    %0(p0) = COPY %x0
-    %1(s16) = COPY %w1
-    G_STORE  %1, %0 :: (store 2 into %ir.addr)
-
-...
-
----
-# CHECK-LABEL: name: store_s8_gpr
-name:            store_s8_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64sp }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK: %0 = COPY %x0
-# CHECK: %1 = COPY %w1
-# CHECK: STRBBui %1, %0, 0 :: (store 1 into %ir.addr)
-body:             |
-  bb.0:
-    liveins: %x0, %w1
-
-    %0(p0) = COPY %x0
-    %1(s8) = COPY %w1
-    G_STORE  %1, %0 :: (store 1 into %ir.addr)
-
-...
-
----
-# CHECK-LABEL: name: store_s64_fpr
-name:            store_s64_fpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64sp }
-# CHECK-NEXT:  - { id: 1, class: fpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: fpr }
-
-# CHECK:  body:
-# CHECK: %0 = COPY %x0
-# CHECK: %1 = COPY %d1
-# CHECK: STRDui %1, %0, 0 :: (store 8 into %ir.addr)
-body:             |
-  bb.0:
-    liveins: %x0, %d1
-
-    %0(p0) = COPY %x0
-    %1(s64) = COPY %d1
-    G_STORE  %1, %0 :: (store 8 into %ir.addr)
-
-...
-
----
-# CHECK-LABEL: name: store_s32_fpr
-name:            store_s32_fpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64sp }
-# CHECK-NEXT:  - { id: 1, class: fpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: fpr }
-
-# CHECK:  body:
-# CHECK: %0 = COPY %x0
-# CHECK: %1 = COPY %s1
-# CHECK: STRSui %1, %0, 0 :: (store 4 into %ir.addr)
-body:             |
-  bb.0:
-    liveins: %x0, %s1
-
-    %0(p0) = COPY %x0
-    %1(s32) = COPY %s1
-    G_STORE  %1, %0 :: (store 4 into %ir.addr)
-
-...
-
----
-# CHECK-LABEL: name: frame_index
-name:            frame_index
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64sp }
-registers:
-  - { id: 0, class: gpr }
-
-stack:
-  - { id: 0, name: ptr0, offset: 0, size: 8, alignment: 8 }
-
-# CHECK:  body:
-# CHECK: %0 = ADDXri %stack.0.ptr0, 0, 0
-body:             |
-  bb.0:
-    %0(p0) = G_FRAME_INDEX %stack.0.ptr0
-...
-
----
-# Check that we set the "selected" property.
-# CHECK-LABEL: name: selected_property
-# CHECK: legalized: true
-# CHECK-NEXT: regBankSelected: true
-# CHECK-NEXT: selected: true
-name:            selected_property
-legalized:       true
-regBankSelected: true
-selected:        false
-body:             |
-  bb.0:
-...
-
----
-# CHECK-LABEL: name: const_s32
-name:            const_s32
-legalized:       true
-regBankSelected: true
-registers:
-  - { id: 0, class: gpr }
-
-# CHECK:  body:
-# CHECK: %0 = MOVi32imm 42
-body:             |
-  bb.0:
-    %0(s32) = G_CONSTANT i32 42
-...
-
----
-# CHECK-LABEL: name: const_s64
-name:            const_s64
-legalized:       true
-regBankSelected: true
-registers:
-  - { id: 0, class: gpr }
-
-# CHECK:  body:
-# CHECK: %0 = MOVi64imm 1234567890123
-body:             |
-  bb.0:
-    %0(s64) = G_CONSTANT i64 1234567890123
-...
-
----
-# CHECK-LABEL: name: fconst_s32
-name:            fconst_s32
-legalized:       true
-regBankSelected: true
-registers:
-  - { id: 0, class: fpr }
-
-# CHECK:  body:
-# CHECK: [[TMP:%[0-9]+]] = MOVi32imm 1080033280
-# CHECK: %0 = COPY [[TMP]]
-body:             |
-  bb.0:
-    %0(s32) = G_FCONSTANT float 3.5
-...
-
----
-# CHECK-LABEL: name: fconst_s64
-name:            fconst_s64
-legalized:       true
-regBankSelected: true
-registers:
-  - { id: 0, class: fpr }
-
-# CHECK:  body:
-# CHECK: [[TMP:%[0-9]+]] = MOVi64imm 4607182418800017408
-# CHECK: %0 = COPY [[TMP]]
-body:             |
-  bb.0:
-    %0(s64) = G_FCONSTANT double 1.0
-...
-
----
-# CHECK-LABEL: name: gep
-name:            gep
-legalized:       true
-regBankSelected: true
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-
-# CHECK:  body:
-# CHECK: %1 = MOVi64imm 42
-# CHECK: %2 = ADDXrr %0, %1
-body:             |
-  bb.0:
-      liveins: %x0
-    %0(p0) = COPY %x0
-    %1(s64) = G_CONSTANT i64 42
-    %2(p0) = G_GEP %0, %1(s64)
-...
-
----
-# Global defined in the same linkage unit so no GOT is needed
-# CHECK-LABEL: name: global_local
-name:            global_local
-legalized:       true
-regBankSelected: true
-registers:
-  - { id: 0, class: gpr }
-
-# CHECK:  body:
-# IOS: %0 = MOVaddr target-flags(aarch64-page) @var_local, target-flags(aarch64-pageoff, aarch64-nc) @var_local
-# LINUX-DEFAULT: %0 = MOVaddr target-flags(aarch64-page) @var_local, target-flags(aarch64-pageoff, aarch64-nc) @var_local
-# LINUX-PIC: %0 = LOADgot target-flags(aarch64-got) @var_local
-body:             |
-  bb.0:
-    %0(p0) = G_GLOBAL_VALUE @var_local
-...
-
----
-# CHECK-LABEL: name: global_got
-name:            global_got
-legalized:       true
-regBankSelected: true
-registers:
-  - { id: 0, class: gpr }
-
-# CHECK:  body:
-# IOS: %0 = LOADgot target-flags(aarch64-got) @var_got
-# LINUX-DEFAULT: %0 = MOVaddr target-flags(aarch64-page) @var_got, target-flags(aarch64-pageoff, aarch64-nc) @var_got
-# LINUX-PIC: %0 = LOADgot target-flags(aarch64-got) @var_got
-body:             |
-  bb.0:
-    %0(p0) = G_GLOBAL_VALUE @var_got
-...
-
----
-# CHECK-LABEL: name: trunc
-name:            trunc
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-# CHECK-NEXT:  - { id: 2, class: gpr64 }
-# CHECK-NEXT:  - { id: 3, class: gpr32 }
-# CHECK-NEXT:  - { id: 4, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-  - { id: 3, class: gpr }
-  - { id: 4, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %1 = COPY %0
-# CHECK:    %3 = COPY %2.sub_32
-# CHECK:    %4 = COPY %2.sub_32
-body:             |
-  bb.0:
-    liveins: %w0, %x0
-
-    %0(s32) = COPY %w0
-    %1(s1) = G_TRUNC %0
-
-    %2(s64) = COPY %x0
-    %3(s32) = G_TRUNC %2
-    %4(s8) = G_TRUNC %2
-...
-
----
-# CHECK-LABEL: name: anyext_gpr
-name:            anyext_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32all }
-# CHECK-NEXT:  - { id: 1, class: gpr64all }
-# CHECK-NEXT:  - { id: 2, class: gpr32all }
-# CHECK-NEXT:  - { id: 3, class: gpr32all }
-# CHECK-NEXT:  - { id: 4, class: gpr64all }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-  - { id: 3, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %4 = SUBREG_TO_REG 0, %0, 15
-# CHECK:    %1 = COPY %4
-# CHECK:    %2 = COPY %w0
-# CHECK:    %3 = COPY %2
-body:             |
-  bb.0:
-    liveins: %w0
-
-    %0(s32) = COPY %w0
-    %1(s64) = G_ANYEXT %0
-    %2(s8) = COPY %w0
-    %3(s32) = G_ANYEXT %2
-...
-
----
-# CHECK-LABEL: name: zext_gpr
-name:            zext_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr64 }
-# CHECK-NEXT:  - { id: 2, class: gpr32 }
-# CHECK-NEXT:  - { id: 3, class: gpr32 }
-# CHECK-NEXT:  - { id: 4, class: gpr32 }
-# CHECK-NEXT:  - { id: 5, class: gpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-  - { id: 3, class: gpr }
-  - { id: 4, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %5 = SUBREG_TO_REG 0, %0, 15
-# CHECK:    %1 = UBFMXri %5, 0, 31
-# CHECK:    %2 = COPY %w0
-# CHECK:    %3 = UBFMWri %2, 0, 7
-# CHECK:    %4 = UBFMWri %2, 0, 7
-body:             |
-  bb.0:
-    liveins: %w0
-
-    %0(s32) = COPY %w0
-    %1(s64) = G_ZEXT %0
-    %2(s8) = COPY %w0
-    %3(s32) = G_ZEXT %2
-    %4(s16)= G_ZEXT %2
-...
-
----
-# CHECK-LABEL: name: sext_gpr
-name:            sext_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr64 }
-# CHECK-NEXT:  - { id: 2, class: gpr32 }
-# CHECK-NEXT:  - { id: 3, class: gpr32 }
-# CHECK-NEXT:  - { id: 4, class: gpr32 }
-# CHECK-NEXT:  - { id: 5, class: gpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-  - { id: 3, class: gpr }
-  - { id: 4, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %5 = SUBREG_TO_REG 0, %0, 15
-# CHECK:    %1 = SBFMXri %5, 0, 31
-# CHECK:    %2 = COPY %w0
-# CHECK:    %3 = SBFMWri %2, 0, 7
-# CHECK:    %4 = SBFMWri %2, 0, 7
-body:             |
-  bb.0:
-    liveins: %w0
-
-    %0(s32) = COPY %w0
-    %1(s64) = G_SEXT %0
-    %2(s8) = COPY %w0
-    %3(s32) = G_SEXT %2
-    %4(s16) = G_SEXT %2
-...
-
----
-# CHECK-LABEL: name: casts
-name:            casts
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64all }
-# CHECK-NEXT:  - { id: 1, class: fpr64 }
-# CHECK-NEXT:  - { id: 2, class: gpr64 }
-# CHECK-NEXT:  - { id: 3, class: gpr64 }
-# CHECK-NEXT:  - { id: 4, class: gpr32 }
-# CHECK-NEXT:  - { id: 5, class: gpr32 }
-# CHECK-NEXT:  - { id: 6, class: gpr32 }
-# CHECK-NEXT:  - { id: 7, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: fpr }
-  - { id: 2, class: gpr }
-  - { id: 3, class: gpr }
-  - { id: 4, class: gpr }
-  - { id: 5, class: gpr }
-  - { id: 6, class: gpr }
-  - { id: 7, class: gpr }
-# CHECK:  body:
-# CHECK:    %0 = COPY %x0
-# CHECK:    %1 = COPY %0
-# CHECK:    %2 = COPY %0
-# CHECK:    %3 = COPY %2
-# CHECK:    %4 = COPY %2.sub_32
-# CHECK:    %5 = COPY %2.sub_32
-# CHECK:    %6 = COPY %2.sub_32
-# CHECK:    %7 = COPY %2.sub_32
-body:             |
-  bb.0:
-    liveins: %x0
-    %0(s64) = COPY %x0
-    %1(<8 x s8>) = G_BITCAST %0(s64)
-    %2(p0) = G_INTTOPTR %0
-
-    %3(s64) = G_PTRTOINT %2
-    %4(s32) = G_PTRTOINT %2
-    %5(s16) = G_PTRTOINT %2
-    %6(s8) = G_PTRTOINT %2
-    %7(s1) = G_PTRTOINT %2
-...
-
----
-# CHECK-LABEL: name: bitcast_s32_gpr
-name:            bitcast_s32_gpr
-legalized:       true
-regBankSelected: true
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32all }
-# CHECK-NEXT:  - { id: 1, class: gpr32all }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %1 = COPY %0
-body:             |
-  bb.0:
-    liveins: %w0
-
-    %0(s32) = COPY %w0
-    %1(s32) = G_BITCAST %0
-...
-
----
-# CHECK-LABEL: name: bitcast_s32_fpr
-name:            bitcast_s32_fpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr32 }
-# CHECK-NEXT:  - { id: 1, class: fpr32 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %s0
-# CHECK:    %1 = COPY %0
-body:             |
-  bb.0:
-    liveins: %s0
-
-    %0(s32) = COPY %s0
-    %1(s32) = G_BITCAST %0
-...
-
----
-# CHECK-LABEL: name: bitcast_s32_gpr_fpr
-name:            bitcast_s32_gpr_fpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32all }
-# CHECK-NEXT:  - { id: 1, class: fpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %w0
-# CHECK:    %1 = COPY %0
-body:             |
-  bb.0:
-    liveins: %w0
-
-    %0(s32) = COPY %w0
-    %1(s32) = G_BITCAST %0
-...
-
----
-# CHECK-LABEL: name: bitcast_s32_fpr_gpr
-name:            bitcast_s32_fpr_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr32all }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %s0
-# CHECK:    %1 = COPY %0
-body:             |
-  bb.0:
-    liveins: %s0
-
-    %0(s32) = COPY %s0
-    %1(s32) = G_BITCAST %0
-...
-
----
-# CHECK-LABEL: name: bitcast_s64_gpr
-name:            bitcast_s64_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64all }
-# CHECK-NEXT:  - { id: 1, class: gpr64all }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %x0
-# CHECK:    %1 = COPY %0
-body:             |
-  bb.0:
-    liveins: %x0
-
-    %0(s64) = COPY %x0
-    %1(s64) = G_BITCAST %0
-...
-
----
-# CHECK-LABEL: name: bitcast_s64_fpr
-name:            bitcast_s64_fpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr64 }
-# CHECK-NEXT:  - { id: 1, class: fpr64 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: fpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %d0
-# CHECK:    %1 = COPY %0
-body:             |
-  bb.0:
-    liveins: %d0
-
-    %0(s64) = COPY %d0
-    %1(s64) = G_BITCAST %0
-...
-
----
-# CHECK-LABEL: name: bitcast_s64_gpr_fpr
-name:            bitcast_s64_gpr_fpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr64all }
-# CHECK-NEXT:  - { id: 1, class: fpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: fpr }
-# CHECK:  body:
-# CHECK:    %0 = COPY %x0
-# CHECK:    %1 = COPY %0
-body:             |
-  bb.0:
-    liveins: %x0
-
-    %0(s64) = COPY %x0
-    %1(s64) = G_BITCAST %0
-...
-
----
-# CHECK-LABEL: name: bitcast_s64_fpr_gpr
-name:            bitcast_s64_fpr_gpr
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr64 }
-# CHECK-NEXT:  - { id: 1, class: gpr64all }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %0 = COPY %d0
-# CHECK:    %1 = COPY %0
-body:             |
-  bb.0:
-    liveins: %d0
-
-    %0(s64) = COPY %d0
-    %1(s64) = G_BITCAST %0
-...
-
----
-# CHECK-LABEL: name: icmp
-name:            icmp
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-# CHECK-NEXT:  - { id: 2, class: gpr64 }
-# CHECK-NEXT:  - { id: 3, class: gpr32 }
-# CHECK-NEXT:  - { id: 4, class: gpr64 }
-# CHECK-NEXT:  - { id: 5, class: gpr32 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-  - { id: 3, class: gpr }
-  - { id: 4, class: gpr }
-  - { id: 5, class: gpr }
-
-# CHECK:  body:
-# CHECK:    %wzr = SUBSWrr %0, %0, implicit-def %nzcv
-# CHECK:    %1 = CSINCWr %wzr, %wzr, 1, implicit %nzcv
-
-# CHECK:    %xzr = SUBSXrr %2, %2, implicit-def %nzcv
-# CHECK:    %3 = CSINCWr %wzr, %wzr, 3, implicit %nzcv
-
-# CHECK:    %xzr = SUBSXrr %4, %4, implicit-def %nzcv
-# CHECK:    %5 = CSINCWr %wzr, %wzr, 0, implicit %nzcv
-
-body:             |
-  bb.0:
-    liveins: %w0, %x0
-
-    %0(s32) = COPY %w0
-    %1(s1) = G_ICMP intpred(eq), %0, %0
-
-    %2(s64) = COPY %x0
-    %3(s1) = G_ICMP intpred(uge), %2, %2
-
-    %4(p0) = COPY %x0
-    %5(s1) = G_ICMP intpred(ne), %4, %4
-...
-
----
-# CHECK-LABEL: name: fcmp
-name:            fcmp
-legalized:       true
-regBankSelected: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-# CHECK-NEXT:  - { id: 2, class: fpr64 }
-# CHECK-NEXT:  - { id: 3, class: gpr32 }
-# CHECK-NEXT:  - { id: 4, class: gpr32 }
-# CHECK-NEXT:  - { id: 5, class: gpr32 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: fpr }
-  - { id: 3, class: gpr }
-
-# CHECK:  body:
-# CHECK:    FCMPSrr %0, %0, implicit-def %nzcv
-# CHECK:    [[TST_MI:%[0-9]+]] = CSINCWr %wzr, %wzr, 5, implicit %nzcv
-# CHECK:    [[TST_GT:%[0-9]+]] = CSINCWr %wzr, %wzr, 13, implicit %nzcv
-# CHECK:    %1 = ORRWrr [[TST_MI]], [[TST_GT]]
-
-# CHECK:    FCMPDrr %2, %2, implicit-def %nzcv
-# CHECK:    %3 = CSINCWr %wzr, %wzr, 4, implicit %nzcv
-
-body:             |
-  bb.0:
-    liveins: %w0, %x0
-
-    %0(s32) = COPY %s0
-    %1(s1) = G_FCMP floatpred(one), %0, %0
-
-    %2(s64) = COPY %d0
-    %3(s1) = G_FCMP floatpred(uge), %2, %2
-
-...
-
----
-# CHECK-LABEL: name: phi
-name:            phi
-legalized:       true
-regBankSelected: true
-tracksRegLiveness: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: fpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-# CHECK-NEXT:  - { id: 2, class: fpr32 }
-registers:
-  - { id: 0, class: fpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: fpr }
-
-# CHECK:  body:
-# CHECK:    bb.1:
-# CHECK:      %2 = PHI %0, %bb.0, %2, %bb.1
-
-body:             |
-  bb.0:
-    liveins: %s0, %w0
-    successors: %bb.1
-    %0(s32) = COPY %s0
-    %1(s1) = COPY %w0
-
-  bb.1:
-    successors: %bb.1, %bb.2
-    %2(s32) = PHI %0, %bb.0, %2, %bb.1
-    G_BRCOND %1, %bb.1
-
-  bb.2:
-    %s0 = COPY %2
-    RET_ReallyLR implicit %s0
-...
-
----
-# CHECK-LABEL: name: select
-name:            select
-legalized:       true
-regBankSelected: true
-tracksRegLiveness: true
-
-# CHECK:      registers:
-# CHECK-NEXT:  - { id: 0, class: gpr32 }
-# CHECK-NEXT:  - { id: 1, class: gpr32 }
-# CHECK-NEXT:  - { id: 2, class: gpr32 }
-# CHECK-NEXT:  - { id: 3, class: gpr32 }
-# CHECK-NEXT:  - { id: 4, class: gpr64 }
-# CHECK-NEXT:  - { id: 5, class: gpr64 }
-# CHECK-NEXT:  - { id: 6, class: gpr64 }
-# CHECK-NEXT:  - { id: 7, class: gpr64 }
-# CHECK-NEXT:  - { id: 8, class: gpr64 }
-# CHECK-NEXT:  - { id: 9, class: gpr64 }
-registers:
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-  - { id: 2, class: gpr }
-  - { id: 3, class: gpr }
-  - { id: 4, class: gpr }
-  - { id: 5, class: gpr }
-  - { id: 6, class: gpr }
-  - { id: 7, class: gpr }
-  - { id: 8, class: gpr }
-  - { id: 9, class: gpr }
-
-# CHECK:  body:
-# CHECK:      %wzr = ANDSWri %0, 0, implicit-def %nzcv
-# CHECK:      %3 = CSELWr %1, %2, 1, implicit %nzcv
-# CHECK:      %wzr = ANDSWri %0, 0, implicit-def %nzcv
-# CHECK:      %6 = CSELXr %4, %5, 1, implicit %nzcv
-# CHECK:      %wzr = ANDSWri %0, 0, implicit-def %nzcv
-# CHECK:      %9 = CSELXr %7, %8, 1, implicit %nzcv
-body:             |
-  bb.0:
-    liveins: %w0, %w1, %w2
-    %0(s1) = COPY %w0
-
-    %1(s32) = COPY %w1
-    %2(s32) = COPY %w2
-    %3(s32) = G_SELECT %0, %1, %2
-
-    %4(s64) = COPY %x0
-    %5(s64) = COPY %x1
-    %6(s64) = G_SELECT %0, %4, %5
-
-    %7(p0) = COPY %x0
-    %8(p0) = COPY %x1
-    %9(p0) = G_SELECT %0, %7, %8
-...
diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
index bac3017a925b53a90efadcf6eaafdb3686f20b4d..02848021dbc09a038349e90f470c042dd3ebf70f 100644
--- a/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
+++ b/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -52,18 +52,40 @@ define void @allocai64() {
 ; CHECK: body:
 ;
 ; ABI/constant lowering and IR-level entry basic block.
-; CHECK: {{bb.[0-9]+}} (%ir-block.{{[0-9]+}}):
+; CHECK: {{bb.[0-9]+}}.entry:
 ;
 ; Make sure we have one successor and only one.
-; CHECK-NEXT: successors: %[[END:bb.[0-9]+.end]](0x80000000)
+; CHECK-NEXT: successors: %[[BB2:bb.[0-9]+.bb2]](0x80000000)
 ;
 ; Check that we emit the correct branch.
-; CHECK: G_BR %[[END]]
+; CHECK: G_BR %[[BB2]]
 ;
 ; Check that end contains the return instruction.
-; CHECK: [[END]]:
+; CHECK: [[END:bb.[0-9]+.end]]:
 ; CHECK-NEXT: RET_ReallyLR
+;
+; CHECK: {{bb.[0-9]+}}.bb2:
+; CHECK-NEXT: successors: %[[END]](0x80000000)
+; CHECK: G_BR %[[END]]
 define void @uncondbr() {
+entry:
+  br label %bb2
+end:
+  ret void
+bb2:
+  br label %end
+}
+
+; CHECK-LABEL: name: uncondbr_fallthrough
+; CHECK: body:
+; CHECK: {{bb.[0-9]+}}.entry:
+; CHECK-NEXT: successors: %[[END:bb.[0-9]+.end]](0x80000000)
+; We don't emit a branch here, as we can fall through to the successor.
+; CHECK-NOT: G_BR
+; CHECK: [[END]]:
+; CHECK-NEXT: RET_ReallyLR
+define void @uncondbr_fallthrough() {
+entry:
   br label %end
 end:
   ret void
@@ -117,33 +139,35 @@ false:
 ; CHECK: G_BRCOND %[[regicmp100]](s1), %[[BB_CASE100]]
 ; CHECK: G_BR %[[BB_NOTCASE100_CHECKNEXT]]
 ;
-; CHECK: [[BB_CASE100]]:
-; CHECK-NEXT: successors: %[[BB_RET:bb.[0-9]+.return]](0x80000000)
-; CHECK: %[[regretc100:[0-9]+]](s32) = G_ADD %0, %[[reg1]]
-; CHECK: G_BR %[[BB_RET]]
 ; CHECK: [[BB_NOTCASE100_CHECKNEXT]]:
 ; CHECK-NEXT: successors: %[[BB_CASE200:bb.[0-9]+.case200]](0x40000000), %[[BB_NOTCASE200_CHECKNEXT:bb.[0-9]+.entry]](0x40000000)
 ; CHECK: %[[regicmp200:[0-9]+]](s1) = G_ICMP intpred(eq), %[[reg200]](s32), %0
 ; CHECK: G_BRCOND %[[regicmp200]](s1), %[[BB_CASE200]]
 ; CHECK: G_BR %[[BB_NOTCASE200_CHECKNEXT]]
 ;
-; CHECK: [[BB_CASE200]]:
-; CHECK-NEXT: successors: %[[BB_RET:bb.[0-9]+.return]](0x80000000)
-; CHECK: %[[regretc200:[0-9]+]](s32) = G_ADD %0, %[[reg2]]
-; CHECK: G_BR %[[BB_RET]]
 ; CHECK: [[BB_NOTCASE200_CHECKNEXT]]:
 ; CHECK-NEXT: successors: %[[BB_DEFAULT:bb.[0-9]+.default]](0x80000000)
 ; CHECK: G_BR %[[BB_DEFAULT]]
 ;
 ; CHECK: [[BB_DEFAULT]]:
-; CHECK-NEXT: successors: %[[BB_RET]](0x80000000)
+; CHECK-NEXT: successors: %[[BB_RET:bb.[0-9]+.return]](0x80000000)
 ; CHECK: %[[regretdefault:[0-9]+]](s32) = G_ADD %0, %[[reg0]]
 ; CHECK: G_BR %[[BB_RET]]
 ;
+; CHECK: [[BB_CASE100]]:
+; CHECK-NEXT: successors: %[[BB_RET]](0x80000000)
+; CHECK: %[[regretc100:[0-9]+]](s32) = G_ADD %0, %[[reg1]]
+; CHECK: G_BR %[[BB_RET]]
+;
+; CHECK: [[BB_CASE200]]:
+; CHECK-NEXT: successors: %[[BB_RET]](0x80000000)
+; CHECK: %[[regretc200:[0-9]+]](s32) = G_ADD %0, %[[reg2]]
+;
 ; CHECK: [[BB_RET]]:
 ; CHECK-NEXT: %[[regret:[0-9]+]](s32) = PHI %[[regretdefault]](s32), %[[BB_DEFAULT]], %[[regretc100]](s32), %[[BB_CASE100]]
 ; CHECK:  %w0 = COPY %[[regret]](s32)
 ; CHECK:  RET_ReallyLR implicit %w0
+;
 define i32 @switch(i32 %argc) {
 entry:
   switch i32 %argc, label %default [
@@ -172,13 +196,17 @@ return:
   ; %entry block is no longer a predecessor for the phi instruction. We need to
   ; use the correct lowered MachineBasicBlock instead.
 ; CHECK-LABEL: name: test_cfg_remap
-
-; CHECK: bb.5.entry:
-; CHECK-NEXT: successors: %[[PHI_BLOCK:bb.[0-9]+.phi.block]]
+; CHECK: {{bb.[0-9]+.entry}}:
+; CHECK-NEXT: successors: %{{bb.[0-9]+.next}}(0x40000000), %[[NOTCASE1_BLOCK:bb.[0-9]+.entry]](0x40000000)
+; CHECK: [[NOTCASE1_BLOCK]]:
+; CHECK-NEXT: successors: %{{bb.[0-9]+.other}}(0x40000000), %[[NOTCASE57_BLOCK:bb.[0-9]+.entry]](0x40000000)
+; CHECK: [[NOTCASE57_BLOCK]]:
+; CHECK-NEXT: successors: %[[PHI_BLOCK:bb.[0-9]+.phi.block]](0x80000000)
 ; CHECK: G_BR %[[PHI_BLOCK]]
-
+;
 ; CHECK: [[PHI_BLOCK]]:
-; CHECK-NEXT: PHI %{{.*}}(s32), %bb.5.entry
+; CHECK-NEXT: PHI %{{.*}}(s32), %[[NOTCASE57_BLOCK]], %{{.*}}(s32),
+;
 define i32 @test_cfg_remap(i32 %in) {
 entry:
   switch i32 %in, label %phi.block [i32 1, label %next
@@ -225,7 +253,7 @@ phi.block:
 ; CHECK: {{bb.[0-9]+.entry}}:
 ; Make sure we have one successor
 ; CHECK-NEXT: successors: %[[BB_L1:bb.[0-9]+.L1]](0x80000000)
-; CHECK: G_BR %[[BB_L1]]
+; CHECK-NOT: G_BR
 ;
 ; Check basic block L1 has 2 successors: BBL1 and BBL2
 ; CHECK: [[BB_L1]] (address-taken):
@@ -378,11 +406,11 @@ define i64* @trivial_bitcast(i8* %a) {
 ; CHECK:     [[A:%[0-9]+]](p0) = COPY %x0
 ; CHECK:     G_BR %[[CAST:bb\.[0-9]+.cast]]
 
+; CHECK: [[END:bb\.[0-9]+.end]]:
+
 ; CHECK: [[CAST]]:
 ; CHECK:     {{%[0-9]+}}(p0) = COPY [[A]]
-; CHECK:     G_BR %[[END:bb\.[0-9]+.end]]
-
-; CHECK: [[END]]:
+; CHECK:     G_BR %[[END]]
 define i64* @trivial_bitcast_with_copy(i8* %a) {
   br label %cast
 
@@ -460,7 +488,8 @@ define void @store(i64* %addr, i64 addrspace(42)* %addr42, i64 %val1, i64 %val2)
 ; CHECK-LABEL: name: intrinsics
 ; CHECK: [[CUR:%[0-9]+]](s32) = COPY %w0
 ; CHECK: [[BITS:%[0-9]+]](s32) = COPY %w1
-; CHECK: [[PTR:%[0-9]+]](p0) = G_INTRINSIC intrinsic(@llvm.returnaddress), 0
+; CHECK: [[CREG:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK: [[PTR:%[0-9]+]](p0) = G_INTRINSIC intrinsic(@llvm.returnaddress), [[CREG]]
 ; CHECK: [[PTR_VEC:%[0-9]+]](p0) = G_FRAME_INDEX %stack.0.ptr.vec
 ; CHECK: [[VEC:%[0-9]+]](<8 x s8>) = G_LOAD [[PTR_VEC]]
 ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.aarch64.neon.st2), [[VEC]](<8 x s8>), [[VEC]](<8 x s8>), [[PTR]](p0)
@@ -518,8 +547,8 @@ define void @unreachable(i32 %a) {
 ; CHECK-LABEL: name: constant_int
 ; CHECK: [[IN:%[0-9]+]](s32) = COPY %w0
 ; CHECK: [[ONE:%[0-9]+]](s32) = G_CONSTANT i32 1
-; CHECK: G_BR
 
+; CHECK: {{bb.[0-9]+}}.next:
 ; CHECK: [[SUM1:%[0-9]+]](s32) = G_ADD [[IN]], [[ONE]]
 ; CHECK: [[SUM2:%[0-9]+]](s32) = G_ADD [[IN]], [[ONE]]
 ; CHECK: [[RES:%[0-9]+]](s32) = G_ADD [[SUM1]], [[SUM2]]
@@ -881,7 +910,7 @@ define void @test_extractvalue_agg(%struct.nested* %addr, {i8, i32}* %addr2) {
 ; CHECK-LABEL: name: test_insertvalue
 ; CHECK: [[VAL:%[0-9]+]](s32) = COPY %w1
 ; CHECK: [[STRUCT:%[0-9]+]](s128) = G_LOAD
-; CHECK: [[NEWSTRUCT:%[0-9]+]](s128) = G_INSERT [[STRUCT]](s128), [[VAL]](s32), 64
+; CHECK: [[NEWSTRUCT:%[0-9]+]](s128) = G_INSERT [[STRUCT]], [[VAL]](s32), 64
 ; CHECK: G_STORE [[NEWSTRUCT]](s128),
 define void @test_insertvalue(%struct.nested* %addr, i32 %val) {
   %struct = load %struct.nested, %struct.nested* %addr
@@ -890,10 +919,30 @@ define void @test_insertvalue(%struct.nested* %addr, i32 %val) {
   ret void
 }
 
+define [1 x i64] @test_trivial_insert([1 x i64] %s, i64 %val) {
+; CHECK-LABEL: name: test_trivial_insert
+; CHECK: [[STRUCT:%[0-9]+]](s64) = COPY %x0
+; CHECK: [[VAL:%[0-9]+]](s64) = COPY %x1
+; CHECK: [[RES:%[0-9]+]](s64) = COPY [[VAL]](s64)
+; CHECK: %x0 = COPY [[RES]]
+  %res = insertvalue [1 x i64] %s, i64 %val, 0
+  ret [1 x i64] %res
+}
+
+define [1 x i8*] @test_trivial_insert_ptr([1 x i8*] %s, i8* %val) {
+; CHECK-LABEL: name: test_trivial_insert_ptr
+; CHECK: [[STRUCT:%[0-9]+]](s64) = COPY %x0
+; CHECK: [[VAL:%[0-9]+]](p0) = COPY %x1
+; CHECK: [[RES:%[0-9]+]](s64) = G_PTRTOINT [[VAL]](p0)
+; CHECK: %x0 = COPY [[RES]]
+  %res = insertvalue [1 x i8*] %s, i8* %val, 0
+  ret [1 x i8*] %res
+}
+
 ; CHECK-LABEL: name: test_insertvalue_agg
 ; CHECK: [[SMALLSTRUCT:%[0-9]+]](s64) = G_LOAD
 ; CHECK: [[STRUCT:%[0-9]+]](s128) = G_LOAD
-; CHECK: [[RES:%[0-9]+]](s128) = G_INSERT [[STRUCT]](s128), [[SMALLSTRUCT]](s64), 32
+; CHECK: [[RES:%[0-9]+]](s128) = G_INSERT [[STRUCT]], [[SMALLSTRUCT]](s64), 32
 ; CHECK: G_STORE [[RES]](s128)
 define void @test_insertvalue_agg(%struct.nested* %addr, {i8, i32}* %addr2) {
   %smallstruct = load {i8, i32}, {i8, i32}* %addr2
@@ -925,6 +974,30 @@ define i8* @test_select_ptr(i1 %tst, i8* %lhs, i8* %rhs) {
   ret i8* %res
 }
 
+; CHECK-LABEL: name: test_select_vec
+; CHECK: [[TST:%[0-9]+]](s1) = COPY %w0
+; CHECK: [[LHS:%[0-9]+]](<4 x s32>) = COPY %q0
+; CHECK: [[RHS:%[0-9]+]](<4 x s32>) = COPY %q1
+; CHECK: [[RES:%[0-9]+]](<4 x s32>) = G_SELECT [[TST]](s1), [[LHS]], [[RHS]]
+; CHECK: %q0 = COPY [[RES]]
+define <4 x i32> @test_select_vec(i1 %tst, <4 x i32> %lhs, <4 x i32> %rhs) {
+  %res = select i1 %tst, <4 x i32> %lhs, <4 x i32> %rhs
+  ret <4 x i32> %res
+}
+
+; CHECK-LABEL: name: test_vselect_vec
+; CHECK: [[TST32:%[0-9]+]](<4 x s32>) = COPY %q0
+; CHECK: [[LHS:%[0-9]+]](<4 x s32>) = COPY %q1
+; CHECK: [[RHS:%[0-9]+]](<4 x s32>) = COPY %q2
+; CHECK: [[TST:%[0-9]+]](<4 x s1>) = G_TRUNC [[TST32]](<4 x s32>)
+; CHECK: [[RES:%[0-9]+]](<4 x s32>) = G_SELECT [[TST]](<4 x s1>), [[LHS]], [[RHS]]
+; CHECK: %q0 = COPY [[RES]]
+define <4 x i32> @test_vselect_vec(<4 x i32> %tst32, <4 x i32> %lhs, <4 x i32> %rhs) {
+  %tst = trunc <4 x i32> %tst32 to <4 x i1>
+  %res = select <4 x i1> %tst, <4 x i32> %lhs, <4 x i32> %rhs
+  ret <4 x i32> %res
+}
+
 ; CHECK-LABEL: name: test_fptosi
 ; CHECK: [[FPADDR:%[0-9]+]](p0) = COPY %x0
 ; CHECK: [[FP:%[0-9]+]](s32) = G_LOAD [[FPADDR]](p0)
@@ -1012,6 +1085,19 @@ define void @float_comparison(float* %a.addr, float* %b.addr, i1* %bool.addr) {
   ret void
 }
 
+; CHECK-LABEL: name: trivial_float_comparison
+; CHECK: [[ENTRY_R1:%[0-9]+]](s1) = G_CONSTANT i1 false
+; CHECK: [[ENTRY_R2:%[0-9]+]](s1) = G_CONSTANT i1 true
+; CHECK: [[R1:%[0-9]+]](s1) = COPY [[ENTRY_R1]](s1)
+; CHECK: [[R2:%[0-9]+]](s1) = COPY [[ENTRY_R2]](s1)
+; CHECK: G_ADD [[R1]], [[R2]]
+define i1 @trivial_float_comparison(double %a, double %b) {
+  %r1 = fcmp false double %a, %b
+  %r2 = fcmp true double %a, %b
+  %sum = add i1 %r1, %r2
+  ret i1 %sum
+}
+
 @var = global i32 0
 
 define i32* @test_global() {
@@ -1117,9 +1203,341 @@ define i8* @test_const_placement() {
 ; CHECK: bb.{{[0-9]+}} (%ir-block.{{[0-9]+}}):
 ; CHECK:   [[VAL_INT:%[0-9]+]](s32) = G_CONSTANT i32 42
 ; CHECK:   [[VAL:%[0-9]+]](p0) = G_INTTOPTR [[VAL_INT]](s32)
-; CHECK:   G_BR
+; CHECK: {{bb.[0-9]+}}.next:
   br label %next
 
 next:
   ret i8* inttoptr(i32 42 to i8*)
 }
+
+declare void @llvm.va_end(i8*)
+define void @test_va_end(i8* %list) {
+; CHECK-LABEL: name: test_va_end
+; CHECK-NOT: va_end
+; CHECK-NOT: INTRINSIC
+; CHECK: RET_ReallyLR
+  call void @llvm.va_end(i8* %list)
+  ret void
+}
+
+define void @test_va_arg(i8* %list) {
+; CHECK-LABEL: name: test_va_arg
+; CHECK: [[LIST:%[0-9]+]](p0) = COPY %x0
+; CHECK: G_VAARG [[LIST]](p0), 8
+; CHECK: G_VAARG [[LIST]](p0), 1
+; CHECK: G_VAARG [[LIST]](p0), 16
+
+  %v0 = va_arg i8* %list, i64
+  %v1 = va_arg i8* %list, i8
+  %v2 = va_arg i8* %list, i128
+  ret void
+}
+
+declare float @llvm.pow.f32(float, float)
+define float @test_pow_intrin(float %l, float %r) {
+; CHECK-LABEL: name: test_pow_intrin
+; CHECK: [[LHS:%[0-9]+]](s32) = COPY %s0
+; CHECK: [[RHS:%[0-9]+]](s32) = COPY %s1
+; CHECK: [[RES:%[0-9]+]](s32) = G_FPOW [[LHS]], [[RHS]]
+; CHECK: %s0 = COPY [[RES]]
+  %res = call float @llvm.pow.f32(float %l, float %r)
+  ret float %res
+}
+
+declare void @llvm.lifetime.start.p0i8(i64, i8*)
+declare void @llvm.lifetime.end.p0i8(i64, i8*)
+define void @test_lifetime_intrin() {
+; CHECK-LABEL: name: test_lifetime_intrin
+; CHECK: RET_ReallyLR
+  %slot = alloca i8, i32 4
+  call void @llvm.lifetime.start.p0i8(i64 0, i8* %slot)
+  call void @llvm.lifetime.end.p0i8(i64 0, i8* %slot)
+  ret void
+}
+
+define void @test_load_store_atomics(i8* %addr) {
+; CHECK-LABEL: name: test_load_store_atomics
+; CHECK: [[ADDR:%[0-9]+]](p0) = COPY %x0
+; CHECK: [[V0:%[0-9]+]](s8) = G_LOAD [[ADDR]](p0) :: (load unordered 1 from %ir.addr)
+; CHECK: G_STORE [[V0]](s8), [[ADDR]](p0) :: (store monotonic 1 into %ir.addr)
+; CHECK: [[V1:%[0-9]+]](s8) = G_LOAD [[ADDR]](p0) :: (load acquire 1 from %ir.addr)
+; CHECK: G_STORE [[V1]](s8), [[ADDR]](p0) :: (store release 1 into %ir.addr)
+; CHECK: [[V2:%[0-9]+]](s8) = G_LOAD [[ADDR]](p0) :: (load singlethread seq_cst 1 from %ir.addr)
+; CHECK: G_STORE [[V2]](s8), [[ADDR]](p0) :: (store singlethread monotonic 1 into %ir.addr)
+  %v0 = load atomic i8, i8* %addr unordered, align 1
+  store atomic i8 %v0, i8* %addr monotonic, align 1
+
+  %v1 = load atomic i8, i8* %addr acquire, align 1
+  store atomic i8 %v1, i8* %addr release, align 1
+
+  %v2 = load atomic i8, i8* %addr singlethread seq_cst, align 1
+  store atomic i8 %v2, i8* %addr singlethread monotonic, align 1
+
+  ret void
+}
+
+define float @test_fneg_f32(float %x) {
+; CHECK-LABEL: name: test_fneg_f32
+; CHECK: [[ARG:%[0-9]+]](s32) = COPY %s0
+; CHECK: [[RES:%[0-9]+]](s32) = G_FNEG [[ARG]]
+; CHECK: %s0 = COPY [[RES]](s32)
+  %neg = fsub float -0.000000e+00, %x
+  ret float %neg
+}
+
+define double @test_fneg_f64(double %x) {
+; CHECK-LABEL: name: test_fneg_f64
+; CHECK: [[ARG:%[0-9]+]](s64) = COPY %d0
+; CHECK: [[RES:%[0-9]+]](s64) = G_FNEG [[ARG]]
+; CHECK: %d0 = COPY [[RES]](s64)
+  %neg = fsub double -0.000000e+00, %x
+  ret double %neg
+}
+
+define void @test_trivial_inlineasm() {
+; CHECK-LABEL: name: test_trivial_inlineasm
+; CHECK: INLINEASM $wibble, 1
+; CHECK: INLINEASM $wibble, 0
+  call void asm sideeffect "wibble", ""()
+  call void asm "wibble", ""()
+  ret void
+}
+
+define <2 x i32> @test_insertelement(<2 x i32> %vec, i32 %elt, i32 %idx){
+; CHECK-LABEL: name: test_insertelement
+; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = COPY %d0
+; CHECK: [[ELT:%[0-9]+]](s32) = COPY %w0
+; CHECK: [[IDX:%[0-9]+]](s32) = COPY %w1
+; CHECK: [[RES:%[0-9]+]](<2 x s32>) = G_INSERT_VECTOR_ELT [[VEC]], [[ELT]](s32), [[IDX]](s32)
+; CHECK: %d0 = COPY [[RES]](<2 x s32>)
+  %res = insertelement <2 x i32> %vec, i32 %elt, i32 %idx
+  ret <2 x i32> %res
+}
+
+define i32 @test_extractelement(<2 x i32> %vec, i32 %idx) {
+; CHECK-LABEL: name: test_extractelement
+; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = COPY %d0
+; CHECK: [[IDX:%[0-9]+]](s32) = COPY %w0
+; CHECK: [[RES:%[0-9]+]](s32) = G_EXTRACT_VECTOR_ELT [[VEC]](<2 x s32>), [[IDX]](s32)
+; CHECK: %w0 = COPY [[RES]](s32)
+  %res = extractelement <2 x i32> %vec, i32 %idx
+  ret i32 %res
+}
+
+define i32 @test_singleelementvector(i32 %elt){
+; CHECK-LABEL: name: test_singleelementvector
+; CHECK: [[ELT:%[0-9]+]](s32) = COPY %w0
+; CHECK-NOT: G_INSERT_VECTOR_ELT
+; CHECK-NOT: G_EXTRACT_VECTOR_ELT
+; CHECK: %w0 = COPY [[ELT]](s32)
+  %vec = insertelement <1 x i32> undef, i32 %elt, i32 0
+  %res = extractelement <1 x i32> %vec, i32 0
+  ret i32 %res
+}
+
+define <2 x i32> @test_constantaggzerovector_v2i32() {
+; CHECK-LABEL: name: test_constantaggzerovector_v2i32
+; CHECK: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = G_MERGE_VALUES [[ZERO]](s32), [[ZERO]](s32)
+; CHECK: %d0 = COPY [[VEC]](<2 x s32>)
+  ret <2 x i32> zeroinitializer
+}
+
+define <2 x float> @test_constantaggzerovector_v2f32() {
+; CHECK-LABEL: name: test_constantaggzerovector_v2f32
+; CHECK: [[ZERO:%[0-9]+]](s32) = G_FCONSTANT float 0.000000e+00
+; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = G_MERGE_VALUES [[ZERO]](s32), [[ZERO]](s32)
+; CHECK: %d0 = COPY [[VEC]](<2 x s32>)
+  ret <2 x float> zeroinitializer
+}
+
+define i32 @test_constantaggzerovector_v3i32() {
+; CHECK-LABEL: name: test_constantaggzerovector_v3i32
+; CHECK: [[ZERO:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK: [[VEC:%[0-9]+]](<3 x s32>) = G_MERGE_VALUES [[ZERO]](s32), [[ZERO]](s32), [[ZERO]](s32)
+; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<3 x s32>)
+  %elt = extractelement <3 x i32> zeroinitializer, i32 1
+  ret i32 %elt
+}
+
+define <2 x i32> @test_constantdatavector_v2i32() {
+; CHECK-LABEL: name: test_constantdatavector_v2i32
+; CHECK: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1
+; CHECK: [[C2:%[0-9]+]](s32) = G_CONSTANT i32 2
+; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C2]](s32)
+; CHECK: %d0 = COPY [[VEC]](<2 x s32>)
+  ret <2 x i32> <i32 1, i32 2>
+}
+
+define i32 @test_constantdatavector_v3i32() {
+; CHECK-LABEL: name: test_constantdatavector_v3i32
+; CHECK: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1
+; CHECK: [[C2:%[0-9]+]](s32) = G_CONSTANT i32 2
+; CHECK: [[C3:%[0-9]+]](s32) = G_CONSTANT i32 3
+; CHECK: [[VEC:%[0-9]+]](<3 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C2]](s32), [[C3]](s32)
+; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<3 x s32>)
+  %elt = extractelement <3 x i32> <i32 1, i32 2, i32 3>, i32 1
+  ret i32 %elt
+}
+
+define <4 x i32> @test_constantdatavector_v4i32() {
+; CHECK-LABEL: name: test_constantdatavector_v4i32
+; CHECK: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1
+; CHECK: [[C2:%[0-9]+]](s32) = G_CONSTANT i32 2
+; CHECK: [[C3:%[0-9]+]](s32) = G_CONSTANT i32 3
+; CHECK: [[C4:%[0-9]+]](s32) = G_CONSTANT i32 4
+; CHECK: [[VEC:%[0-9]+]](<4 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C2]](s32), [[C3]](s32), [[C4]](s32)
+; CHECK: %q0 = COPY [[VEC]](<4 x s32>)
+  ret <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+}
+
+define <2 x double> @test_constantdatavector_v2f64() {
+; CHECK-LABEL: name: test_constantdatavector_v2f64
+; CHECK: [[FC1:%[0-9]+]](s64) = G_FCONSTANT double 1.000000e+00
+; CHECK: [[FC2:%[0-9]+]](s64) = G_FCONSTANT double 2.000000e+00
+; CHECK: [[VEC:%[0-9]+]](<2 x s64>) = G_MERGE_VALUES [[FC1]](s64), [[FC2]](s64)
+; CHECK: %q0 = COPY [[VEC]](<2 x s64>)
+  ret <2 x double> <double 1.0, double 2.0>
+}
+
+define i32 @test_constantaggzerovector_v1s32(i32 %arg){
+; CHECK-LABEL: name: test_constantaggzerovector_v1s32
+; CHECK: [[ARG:%[0-9]+]](s32) = COPY %w0
+; CHECK: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK-NOT: G_MERGE_VALUES
+; CHECK: G_ADD [[ARG]], [[C0]]
+  %vec = insertelement <1 x i32> undef, i32 %arg, i32 0
+  %add = add <1 x i32> %vec, zeroinitializer
+  %res = extractelement <1 x i32> %add, i32 0
+  ret i32 %res
+}
+
+define i32 @test_constantdatavector_v1s32(i32 %arg){
+; CHECK-LABEL: name: test_constantdatavector_v1s32
+; CHECK: [[ARG:%[0-9]+]](s32) = COPY %w0
+; CHECK: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1
+; CHECK-NOT: G_MERGE_VALUES
+; CHECK: G_ADD [[ARG]], [[C1]]
+  %vec = insertelement <1 x i32> undef, i32 %arg, i32 0
+  %add = add <1 x i32> %vec, <i32 1>
+  %res = extractelement <1 x i32> %add, i32 0
+  ret i32 %res
+}
+
+declare ghccc float @different_call_conv_target(float %x)
+define float @test_different_call_conv_target(float %x) {
+; CHECK-LABEL: name: test_different_call_conv_target
+; CHECK: [[X:%[0-9]+]](s32) = COPY %s0
+; CHECK: %s8 = COPY [[X]]
+; CHECK: BL @different_call_conv_target, csr_aarch64_aapcs, implicit-def %lr, implicit %sp, implicit %s8, implicit-def %s0
+  %res = call ghccc float @different_call_conv_target(float %x)
+  ret float %res
+}
+
+define <2 x i32> @test_shufflevector_s32_v2s32(i32 %arg) {
+; CHECK-LABEL: name: test_shufflevector_s32_v2s32
+; CHECK: [[ARG:%[0-9]+]](s32) = COPY %w0
+; CHECK-DAG: [[UNDEF:%[0-9]+]](s32) = IMPLICIT_DEF
+; CHECK-DAG: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK-DAG: [[MASK:%[0-9]+]](<2 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C0]](s32)
+; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](s32), [[UNDEF]], [[MASK]](<2 x s32>)
+; CHECK: %d0 = COPY [[VEC]](<2 x s32>)
+  %vec = insertelement <1 x i32> undef, i32 %arg, i32 0
+  %res = shufflevector <1 x i32> %vec, <1 x i32> undef, <2 x i32> zeroinitializer
+  ret <2 x i32> %res
+}
+
+define i32 @test_shufflevector_v2s32_s32(<2 x i32> %arg) {
+; CHECK-LABEL: name: test_shufflevector_v2s32_s32
+; CHECK: [[ARG:%[0-9]+]](<2 x s32>) = COPY %d0
+; CHECK-DAG: [[UNDEF:%[0-9]+]](<2 x s32>) = IMPLICIT_DEF
+; CHECK-DAG: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1
+; CHECK: [[RES:%[0-9]+]](s32) = G_SHUFFLE_VECTOR [[ARG]](<2 x s32>), [[UNDEF]], [[C1]](s32)
+; CHECK: %w0 = COPY [[RES]](s32)
+  %vec = shufflevector <2 x i32> %arg, <2 x i32> undef, <1 x i32> <i32 1>
+  %res = extractelement <1 x i32> %vec, i32 0
+  ret i32 %res
+}
+
+define <2 x i32> @test_shufflevector_v2s32_v2s32(<2 x i32> %arg) {
+; CHECK-LABEL: name: test_shufflevector_v2s32_v2s32
+; CHECK: [[ARG:%[0-9]+]](<2 x s32>) = COPY %d0
+; CHECK-DAG: [[UNDEF:%[0-9]+]](<2 x s32>) = IMPLICIT_DEF
+; CHECK-DAG: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1
+; CHECK-DAG: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK-DAG: [[MASK:%[0-9]+]](<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C0]](s32)
+; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](<2 x s32>), [[UNDEF]], [[MASK]](<2 x s32>)
+; CHECK: %d0 = COPY [[VEC]](<2 x s32>)
+  %res = shufflevector <2 x i32> %arg, <2 x i32> undef, <2 x i32> <i32 1, i32 0>
+  ret <2 x i32> %res
+}
+
+define i32 @test_shufflevector_v2s32_v3s32(<2 x i32> %arg) {
+; CHECK-LABEL: name: test_shufflevector_v2s32_v3s32
+; CHECK: [[ARG:%[0-9]+]](<2 x s32>) = COPY %d0
+; CHECK-DAG: [[UNDEF:%[0-9]+]](<2 x s32>) = IMPLICIT_DEF
+; CHECK-DAG: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1
+; CHECK-DAG: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK-DAG: [[MASK:%[0-9]+]](<3 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C0]](s32), [[C1]](s32)
+; CHECK: [[VEC:%[0-9]+]](<3 x s32>) = G_SHUFFLE_VECTOR [[ARG]](<2 x s32>), [[UNDEF]], [[MASK]](<3 x s32>)
+; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<3 x s32>)
+  %vec = shufflevector <2 x i32> %arg, <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 1>
+  %res = extractelement <3 x i32> %vec, i32 0
+  ret i32 %res
+}
+
+define <4 x i32> @test_shufflevector_v2s32_v4s32(<2 x i32> %arg1, <2 x i32> %arg2) {
+; CHECK-LABEL: name: test_shufflevector_v2s32_v4s32
+; CHECK: [[ARG1:%[0-9]+]](<2 x s32>) = COPY %d0
+; CHECK: [[ARG2:%[0-9]+]](<2 x s32>) = COPY %d1
+; CHECK: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1
+; CHECK: [[C2:%[0-9]+]](s32) = G_CONSTANT i32 2
+; CHECK: [[C3:%[0-9]+]](s32) = G_CONSTANT i32 3
+; CHECK: [[MASK:%[0-9]+]](<4 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C1]](s32), [[C2]](s32), [[C3]](s32)
+; CHECK: [[VEC:%[0-9]+]](<4 x s32>) = G_SHUFFLE_VECTOR [[ARG1]](<2 x s32>), [[ARG2]], [[MASK]](<4 x s32>)
+; CHECK: %q0 = COPY [[VEC]](<4 x s32>)
+  %res = shufflevector <2 x i32> %arg1, <2 x i32> %arg2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %res
+}
+
+define <2 x i32> @test_shufflevector_v4s32_v2s32(<4 x i32> %arg) {
+; CHECK-LABEL: name: test_shufflevector_v4s32_v2s32
+; CHECK: [[ARG:%[0-9]+]](<4 x s32>) = COPY %q0
+; CHECK-DAG: [[UNDEF:%[0-9]+]](<4 x s32>) = IMPLICIT_DEF
+; CHECK-DAG: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1
+; CHECK-DAG: [[C3:%[0-9]+]](s32) = G_CONSTANT i32 3
+; CHECK-DAG: [[MASK:%[0-9]+]](<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C3]](s32)
+; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](<4 x s32>), [[UNDEF]], [[MASK]](<2 x s32>)
+; CHECK: %d0 = COPY [[VEC]](<2 x s32>)
+  %res = shufflevector <4 x i32> %arg, <4 x i32> undef, <2 x i32> <i32 1, i32 3>
+  ret <2 x i32> %res
+}
+
+
+define <16 x i8> @test_shufflevector_v8s8_v16s8(<8 x i8> %arg1, <8 x i8> %arg2) {
+; CHECK-LABEL: name: test_shufflevector_v8s8_v16s8
+; CHECK: [[ARG1:%[0-9]+]](<8 x s8>) = COPY %d0
+; CHECK: [[ARG2:%[0-9]+]](<8 x s8>) = COPY %d1
+; CHECK: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK: [[C8:%[0-9]+]](s32) = G_CONSTANT i32 8
+; CHECK: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1
+; CHECK: [[C9:%[0-9]+]](s32) = G_CONSTANT i32 9
+; CHECK: [[C2:%[0-9]+]](s32) = G_CONSTANT i32 2
+; CHECK: [[C10:%[0-9]+]](s32) = G_CONSTANT i32 10
+; CHECK: [[C3:%[0-9]+]](s32) = G_CONSTANT i32 3
+; CHECK: [[C11:%[0-9]+]](s32) = G_CONSTANT i32 11
+; CHECK: [[C4:%[0-9]+]](s32) = G_CONSTANT i32 4
+; CHECK: [[C12:%[0-9]+]](s32) = G_CONSTANT i32 12
+; CHECK: [[C5:%[0-9]+]](s32) = G_CONSTANT i32 5
+; CHECK: [[C13:%[0-9]+]](s32) = G_CONSTANT i32 13
+; CHECK: [[C6:%[0-9]+]](s32) = G_CONSTANT i32 6
+; CHECK: [[C14:%[0-9]+]](s32) = G_CONSTANT i32 14
+; CHECK: [[C7:%[0-9]+]](s32) = G_CONSTANT i32 7
+; CHECK: [[C15:%[0-9]+]](s32) = G_CONSTANT i32 15
+; CHECK: [[MASK:%[0-9]+]](<16 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C8]](s32), [[C1]](s32), [[C9]](s32), [[C2]](s32), [[C10]](s32), [[C3]](s32), [[C11]](s32), [[C4]](s32), [[C12]](s32), [[C5]](s32), [[C13]](s32), [[C6]](s32), [[C14]](s32), [[C7]](s32), [[C15]](s32)
+; CHECK: [[VEC:%[0-9]+]](<16 x s8>) = G_SHUFFLE_VECTOR [[ARG1]](<8 x s8>), [[ARG2]], [[MASK]](<16 x s32>)
+; CHECK: %q0 = COPY [[VEC]](<16 x s8>)
+  %res = shufflevector <8 x i8> %arg1, <8 x i8> %arg2, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  ret <16 x i8> %res
+}
diff --git a/test/CodeGen/AArch64/GlobalISel/call-translator.ll b/test/CodeGen/AArch64/GlobalISel/call-translator.ll
index f9c0fdb5bb201e57b2c2221203a82bbdc6f96389..f8d95c88cc8f3eec9b9ae18c63cd58da0c7efa2a 100644
--- a/test/CodeGen/AArch64/GlobalISel/call-translator.ll
+++ b/test/CodeGen/AArch64/GlobalISel/call-translator.ll
@@ -63,7 +63,13 @@ define void @test_multiple_args(i64 %in) {
 ; CHECK: [[I64:%[0-9]+]](s64) = COPY %x0
 ; CHECK: [[I8:%[0-9]+]](s8) = COPY %w1
 ; CHECK: [[ADDR:%[0-9]+]](p0) = COPY %x2
-; CHECK: [[ARG:%[0-9]+]](s192) = G_SEQUENCE [[DBL]](s64), 0, [[I64]](s64), 64, [[I8]](s8), 128
+
+; CHECK: [[UNDEF:%[0-9]+]](s192) = IMPLICIT_DEF
+; CHECK: [[ARG0:%[0-9]+]](s192) = G_INSERT [[UNDEF]], [[DBL]](s64), 0
+; CHECK: [[ARG1:%[0-9]+]](s192) = G_INSERT [[ARG0]], [[I64]](s64), 64
+; CHECK: [[ARG2:%[0-9]+]](s192) = G_INSERT [[ARG1]], [[I8]](s8), 128
+; CHECK: [[ARG:%[0-9]+]](s192) = COPY [[ARG2]]
+
 ; CHECK: G_STORE [[ARG]](s192), [[ADDR]](p0)
 ; CHECK: RET_ReallyLR
 define void @test_struct_formal({double, i64, i8} %in, {double, i64, i8}* %addr) {
@@ -75,7 +81,11 @@ define void @test_struct_formal({double, i64, i8} %in, {double, i64, i8}* %addr)
 ; CHECK-LABEL: name: test_struct_return
 ; CHECK: [[ADDR:%[0-9]+]](p0) = COPY %x0
 ; CHECK: [[VAL:%[0-9]+]](s192) = G_LOAD [[ADDR]](p0)
-; CHECK: [[DBL:%[0-9]+]](s64), [[I64:%[0-9]+]](s64), [[I32:%[0-9]+]](s32) = G_EXTRACT [[VAL]](s192), 0, 64, 128
+
+; CHECK: [[DBL:%[0-9]+]](s64) = G_EXTRACT [[VAL]](s192), 0
+; CHECK: [[I64:%[0-9]+]](s64) = G_EXTRACT [[VAL]](s192), 64
+; CHECK: [[I32:%[0-9]+]](s32) = G_EXTRACT [[VAL]](s192), 128
+
 ; CHECK: %d0 = COPY [[DBL]](s64)
 ; CHECK: %x0 = COPY [[I64]](s64)
 ; CHECK: %w1 = COPY [[I32]](s32)
@@ -86,8 +96,14 @@ define {double, i64, i32} @test_struct_return({double, i64, i32}* %addr) {
 }
 
 ; CHECK-LABEL: name: test_arr_call
+; CHECK: hasCalls: true
 ; CHECK: [[ARG:%[0-9]+]](s256) = G_LOAD
-; CHECK: [[E0:%[0-9]+]](s64), [[E1:%[0-9]+]](s64), [[E2:%[0-9]+]](s64), [[E3:%[0-9]+]](s64) = G_EXTRACT [[ARG]](s256), 0, 64, 128, 192
+
+; CHECK: [[E0:%[0-9]+]](s64) = G_EXTRACT [[ARG]](s256), 0
+; CHECK: [[E1:%[0-9]+]](s64) = G_EXTRACT [[ARG]](s256), 64
+; CHECK: [[E2:%[0-9]+]](s64) = G_EXTRACT [[ARG]](s256), 128
+; CHECK: [[E3:%[0-9]+]](s64) = G_EXTRACT [[ARG]](s256), 192
+
 ; CHECK: %x0 = COPY [[E0]](s64)
 ; CHECK: %x1 = COPY [[E1]](s64)
 ; CHECK: %x2 = COPY [[E2]](s64)
diff --git a/test/CodeGen/AArch64/GlobalISel/debug-insts.ll b/test/CodeGen/AArch64/GlobalISel/debug-insts.ll
index fcf17e2d4e073a637d8e3cebe361ba842cb87351..5a76661180f229c1c829b78bee4028675e6800b5 100644
--- a/test/CodeGen/AArch64/GlobalISel/debug-insts.ll
+++ b/test/CodeGen/AArch64/GlobalISel/debug-insts.ll
@@ -1,8 +1,10 @@
 ; RUN: llc -global-isel -mtriple=aarch64 %s -stop-after=irtranslator -o - | FileCheck %s
-
+; RUN: llc -mtriple=aarch64 -global-isel --global-isel-abort=0 %s -o /dev/null
 
 ; CHECK-LABEL: name: debug_declare
-; CHECK: DBG_VALUE %stack.0.in.addr, 0, !11, !12, debug-location !13
+; CHECK: stack:
+; CHECK:    - { id: {{.*}}, name: in.addr, offset: {{.*}}, size: {{.*}}, alignment: {{.*}}, di-variable: '!11',
+; CHECK-NEXT:   di-expression: '!12', di-location: '!13' }
 ; CHECK: DBG_VALUE debug-use %0(s32), debug-use _, !11, !12, debug-location !13
 define void @debug_declare(i32 %in) #0 !dbg !7 {
 entry:
@@ -13,15 +15,30 @@ entry:
   ret void, !dbg !14
 }
 
+; CHECK-LABEL: name: debug_declare_vla
+; CHECK: DBG_VALUE debug-use %{{[0-9]+}}(p0), debug-use _, !11, !12, debug-location !13
+define void @debug_declare_vla(i32 %in) #0 !dbg !7 {
+entry:
+  %vla.addr = alloca i32, i32 %in
+  call void @llvm.dbg.declare(metadata i32* %vla.addr, metadata !11, metadata !12), !dbg !13
+  ret void, !dbg !14
+}
+
 ; CHECK-LABEL: name: debug_value
 ; CHECK: [[IN:%[0-9]+]](s32) = COPY %w0
-; CHECK: DBG_VALUE debug-use [[IN]](s32), debug-use _, !11, !12, debug-location !13
-; CHECK: DBG_VALUE debug-use %1(p0), debug-use _, !11, !15, debug-location !13
 define void @debug_value(i32 %in) #0 !dbg !7 {
   %addr = alloca i32
+; CHECK: DBG_VALUE debug-use [[IN]](s32), debug-use _, !11, !12, debug-location !13
   call void @llvm.dbg.value(metadata i32 %in, i64 0, metadata !11, metadata !12), !dbg !13
   store i32 %in, i32* %addr
+; CHECK: DBG_VALUE debug-use %1(p0), debug-use _, !11, !15, debug-location !13
   call void @llvm.dbg.value(metadata i32* %addr, i64 0, metadata !11, metadata !15), !dbg !13
+; CHECK: DBG_VALUE 123, 0, !11, !12, debug-location !13
+  call void @llvm.dbg.value(metadata i32 123, i64 0, metadata !11, metadata !12), !dbg !13
+; CHECK: DBG_VALUE float 1.000000e+00, 0, !11, !12, debug-location !13
+  call void @llvm.dbg.value(metadata float 1.000000e+00, i64 0, metadata !11, metadata !12), !dbg !13
+; CHECK: DBG_VALUE _, 0, !11, !12, debug-location !13
+  call void @llvm.dbg.value(metadata i32* null, i64 0, metadata !11, metadata !12), !dbg !13
   ret void
 }
 
diff --git a/test/CodeGen/AArch64/GlobalISel/dynamic-alloca.ll b/test/CodeGen/AArch64/GlobalISel/dynamic-alloca.ll
new file mode 100644
index 0000000000000000000000000000000000000000..196910e96ce3e68f70d46687e9647d9301558155
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/dynamic-alloca.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mtriple=aarch64 -global-isel %s -o - -stop-after=irtranslator | FileCheck %s
+
+; CHECK-LABEL: name: test_simple_alloca
+; CHECK: [[NUMELTS:%[0-9]+]](s32) = COPY %w0
+; CHECK: [[TYPE_SIZE:%[0-9]+]](s64) = G_CONSTANT i64 -1
+; CHECK: [[NUMELTS_64:%[0-9]+]](s64) = G_ZEXT [[NUMELTS]](s32)
+; CHECK: [[NUMBYTES:%[0-9]+]](s64) = G_MUL [[NUMELTS_64]], [[TYPE_SIZE]]
+; CHECK: [[SP_TMP:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[ALLOC:%[0-9]+]](p0) = G_GEP [[SP_TMP]], [[NUMBYTES]]
+; CHECK: [[ALIGNED_ALLOC:%[0-9]+]](p0) = G_PTR_MASK [[ALLOC]], 4
+; CHECK: %sp = COPY [[ALIGNED_ALLOC]]
+; CHECK: [[ALLOC:%[0-9]+]](p0) = COPY [[ALIGNED_ALLOC]]
+; CHECK: %x0 = COPY [[ALLOC]]
+define i8* @test_simple_alloca(i32 %numelts) {
+  %addr = alloca i8, i32 %numelts
+  ret i8* %addr
+}
+
+; CHECK-LABEL: name: test_aligned_alloca
+; CHECK: [[NUMELTS:%[0-9]+]](s32) = COPY %w0
+; CHECK: [[TYPE_SIZE:%[0-9]+]](s64) = G_CONSTANT i64 -1
+; CHECK: [[NUMELTS_64:%[0-9]+]](s64) = G_ZEXT [[NUMELTS]](s32)
+; CHECK: [[NUMBYTES:%[0-9]+]](s64) = G_MUL [[NUMELTS_64]], [[TYPE_SIZE]]
+; CHECK: [[SP_TMP:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[ALLOC:%[0-9]+]](p0) = G_GEP [[SP_TMP]], [[NUMBYTES]]
+; CHECK: [[ALIGNED_ALLOC:%[0-9]+]](p0) = G_PTR_MASK [[ALLOC]], 5
+; CHECK: %sp = COPY [[ALIGNED_ALLOC]]
+; CHECK: [[ALLOC:%[0-9]+]](p0) = COPY [[ALIGNED_ALLOC]]
+; CHECK: %x0 = COPY [[ALLOC]]
+define i8* @test_aligned_alloca(i32 %numelts) {
+  %addr = alloca i8, i32 %numelts, align 32
+  ret i8* %addr
+}
+
+; CHECK-LABEL: name: test_natural_alloca
+; CHECK: [[NUMELTS:%[0-9]+]](s32) = COPY %w0
+; CHECK: [[TYPE_SIZE:%[0-9]+]](s64) = G_CONSTANT i64 -16
+; CHECK: [[NUMELTS_64:%[0-9]+]](s64) = G_ZEXT [[NUMELTS]](s32)
+; CHECK: [[NUMBYTES:%[0-9]+]](s64) = G_MUL [[NUMELTS_64]], [[TYPE_SIZE]]
+; CHECK: [[SP_TMP:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[ALLOC:%[0-9]+]](p0) = G_GEP [[SP_TMP]], [[NUMBYTES]]
+; CHECK: %sp = COPY [[ALLOC]]
+; CHECK: [[ALLOC_TMP:%[0-9]+]](p0) = COPY [[ALLOC]]
+; CHECK: %x0 = COPY [[ALLOC_TMP]]
+define i128* @test_natural_alloca(i32 %numelts) {
+  %addr = alloca i128, i32 %numelts
+  ret i128* %addr
+}
diff --git a/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll b/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3ecdb7bbedfb5b422eacbfde07f0bbf0d126fa02
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/gisel-commandline-option.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \
+; RUN:   -O0 -aarch64-enable-global-isel-at-O=0 \
+; RUN:   | FileCheck %s --check-prefix ENABLED --check-prefix NOFALLBACK
+
+; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \
+; RUN:   -O0 -aarch64-enable-global-isel-at-O=0 -global-isel-abort=2  \
+; RUN:   | FileCheck %s --check-prefix ENABLED --check-prefix FALLBACK
+
+; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \
+; RUN:   -global-isel \
+; RUN:   | FileCheck %s --check-prefix ENABLED --check-prefix NOFALLBACK
+
+; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \
+; RUN:   -global-isel -global-isel-abort=2 \
+; RUN:   | FileCheck %s --check-prefix ENABLED --check-prefix FALLBACK
+
+; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \
+; RUN:   -O1 -aarch64-enable-global-isel-at-O=3 \
+; RUN:   | FileCheck %s --check-prefix ENABLED
+
+; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \
+; RUN:   -O1 -aarch64-enable-global-isel-at-O=0 \
+; RUN:   | FileCheck %s --check-prefix DISABLED
+
+; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \
+; RUN:   -aarch64-enable-global-isel-at-O=-1 \
+; RUN:   | FileCheck %s --check-prefix DISABLED
+
+; RUN: llc -mtriple=aarch64-- -debug-pass=Structure %s -o /dev/null 2>&1 \
+; RUN:   | FileCheck %s --check-prefix DISABLED
+
+; ENABLED:       IRTranslator
+; ENABLED-NEXT:  Legalizer
+; ENABLED-NEXT:  RegBankSelect
+; ENABLED-NEXT:  InstructionSelect
+; ENABLED-NEXT:  ResetMachineFunction
+
+; FALLBACK:       AArch64 Instruction Selection
+; NOFALLBACK-NOT: AArch64 Instruction Selection
+
+; DISABLED-NOT: IRTranslator
+
+; DISABLED: AArch64 Instruction Selection
+; DISABLED: Expand ISel Pseudo-instructions
+
+define void @empty() {
+  ret void
+}
diff --git a/test/CodeGen/AArch64/GlobalISel/gisel-fail-intermediate-legalizer.ll b/test/CodeGen/AArch64/GlobalISel/gisel-fail-intermediate-legalizer.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e333f742e04dfe44f56121db21b7bc68f0726371
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/gisel-fail-intermediate-legalizer.ll
@@ -0,0 +1,8 @@
+;RUN: llc -mtriple=aarch64-unknown-unknown -o - -global-isel -global-isel-abort=2 %s 2>&1 | FileCheck %s
+; CHECK: fallback
+; CHECK-LABEL: foo
+define i16 @foo(half* %p) {
+  %tmp0 = load half, half* %p
+  %tmp1 = fptoui half %tmp0 to i16
+  ret i16 %tmp1
+}
diff --git a/test/CodeGen/AArch64/GlobalISel/inline-asm.ll b/test/CodeGen/AArch64/GlobalISel/inline-asm.ll
index 3dc8f8cb7063b5f2cfc5bc4faac8e723666e9a2e..8ff7c4495dccb461aa287455f9e49bc6d70f56f3 100644
--- a/test/CodeGen/AArch64/GlobalISel/inline-asm.ll
+++ b/test/CodeGen/AArch64/GlobalISel/inline-asm.ll
@@ -2,9 +2,9 @@
 
 ; CHECK-LABEL: test_asm:
 ; CHECK: {{APP|InlineAsm Start}}
-; CHECK: mov x0, x0
+; CHECK: mov x0, {{x[0-9]+}}
 ; CHECK: {{NO_APP|InlineAsm End}}
 define void @test_asm() {
-  call void asm sideeffect "mov x0, x0", ""()
+  call void asm sideeffect "mov x0, $0", "r"(i64 42)
   ret void
 }
diff --git a/test/CodeGen/AArch64/GlobalISel/irtranslator-bitcast.ll b/test/CodeGen/AArch64/GlobalISel/irtranslator-bitcast.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8d1b02216ea76bf9cd733b2fd9c5bc2d56c3b9b6
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/irtranslator-bitcast.ll
@@ -0,0 +1,30 @@
+; RUN: llc -O0 -mtriple=aarch64-apple-ios -global-isel -stop-after=irtranslator %s -o - | FileCheck %s
+
+; Check that we don't invalidate the vreg map.
+; This test is brittle: the invalidation only triggers when we grow the map.
+
+; CHECK-LABEL: name: test_bitcast_invalid_vreg
+define i32 @test_bitcast_invalid_vreg() {
+  %tmp0 = add i32 1, 2
+  %tmp1 = add i32 3, 4
+  %tmp2 = add i32 5, 6
+  %tmp3 = add i32 7, 8
+  %tmp4 = add i32 9, 10
+  %tmp5 = add i32 11, 12
+  %tmp6 = add i32 13, 14
+  %tmp7 = add i32 15, 16
+  %tmp8 = add i32 17, 18
+  %tmp9 = add i32 19, 20
+  %tmp10 = add i32 21, 22
+  %tmp11 = add i32 23, 24
+  %tmp12 = add i32 25, 26
+  %tmp13 = add i32 27, 28
+  %tmp14 = add i32 29, 30
+  %tmp15 = add i32 30, 30
+
+; At this point we mapped 46 values. The 'i32 100' constant will grow the map.
+; CHECK:  %46(s32) = G_CONSTANT i32 100
+; CHECK:  %w0 = COPY %46(s32)
+  %res = bitcast i32 100 to i32
+  ret i32 %res
+}
diff --git a/test/CodeGen/AArch64/GlobalISel/irtranslator-exceptions.ll b/test/CodeGen/AArch64/GlobalISel/irtranslator-exceptions.ll
index 412f70e17b2cdd1935ab6a715e5ffc8e04f782e1..ef4445111d7b28eb3ead6960069a0713bf510ccd 100644
--- a/test/CodeGen/AArch64/GlobalISel/irtranslator-exceptions.ll
+++ b/test/CodeGen/AArch64/GlobalISel/irtranslator-exceptions.ll
@@ -15,20 +15,24 @@ declare i32 @llvm.eh.typeid.for(i8*)
 ; CHECK:     BL @foo, csr_aarch64_aapcs, implicit-def %lr, implicit %sp, implicit %w0, implicit-def %w0
 ; CHECK:     {{%[0-9]+}}(s32) = COPY %w0
 ; CHECK:     EH_LABEL
+; CHECK:     G_BR %[[GOOD]]
 
 ; CHECK:   [[BAD]] (landing-pad):
 ; CHECK:     EH_LABEL
+; CHECK:     [[UNDEF:%[0-9]+]](s128) = IMPLICIT_DEF
 ; CHECK:     [[PTR:%[0-9]+]](p0) = COPY %x0
+; CHECK:     [[VAL_WITH_PTR:%[0-9]+]](s128) = G_INSERT [[UNDEF]], [[PTR]](p0), 0
 ; CHECK:     [[SEL_PTR:%[0-9]+]](p0) = COPY %x1
 ; CHECK:     [[SEL:%[0-9]+]](s32) = G_PTRTOINT [[SEL_PTR]]
-; CHECK:     [[PTR_SEL:%[0-9]+]](s128) = G_SEQUENCE [[PTR]](p0), 0, [[SEL]](s32), 64
-; CHECK:     [[PTR_RET:%[0-9]+]](s64), [[SEL_RET:%[0-9]+]](s32) = G_EXTRACT [[PTR_SEL]](s128), 0, 64
+; CHECK:     [[PTR_SEL:%[0-9]+]](s128) = G_INSERT [[VAL_WITH_PTR]], [[SEL]](s32), 64
+; CHECK:     [[PTR_RET:%[0-9]+]](s64) = G_EXTRACT [[PTR_SEL]](s128), 0
+; CHECK:     [[SEL_RET:%[0-9]+]](s32) = G_EXTRACT [[PTR_SEL]](s128), 64
 ; CHECK:     %x0 = COPY [[PTR_RET]]
 ; CHECK:     %w1 = COPY [[SEL_RET]]
 
 ; CHECK:   [[GOOD]]:
 ; CHECK:     [[SEL:%[0-9]+]](s32) = G_CONSTANT i32 1
-; CHECK:     {{%[0-9]+}}(s128) = G_INSERT {{%[0-9]+}}(s128), [[SEL]](s32), 64
+; CHECK:     {{%[0-9]+}}(s128) = G_INSERT {{%[0-9]+}}, [[SEL]](s32), 64
 
 define { i8*, i32 } @bar() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
   %res32 = invoke i32 @foo(i32 42) to label %continue unwind label %broken
@@ -57,3 +61,34 @@ broken:
 continue:
   ret void
 }
+
+; CHECK-LABEL: name: test_invoke_varargs
+
+; CHECK: [[NULL:%[0-9]+]](p0) = G_CONSTANT i64 0
+; CHECK: [[ANSWER:%[0-9]+]](s32) = G_CONSTANT i32 42
+; CHECK: [[ONE:%[0-9]+]](s32) = G_FCONSTANT float 1.0
+
+; CHECK: %x0 = COPY [[NULL]]
+
+; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[OFFSET:%[0-9]+]](s64) = G_CONSTANT i64 0
+; CHECK: [[SLOT:%[0-9]+]](p0) = G_GEP [[SP]], [[OFFSET]](s64)
+; CHECK: G_STORE [[ANSWER]](s32), [[SLOT]]
+
+; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[OFFSET:%[0-9]+]](s64) = G_CONSTANT i64 8
+; CHECK: [[SLOT:%[0-9]+]](p0) = G_GEP [[SP]], [[OFFSET]](s64)
+; CHECK: G_STORE [[ONE]](s32), [[SLOT]]
+
+; CHECK: BL @printf
+declare void @printf(i8*, ...)
+define void @test_invoke_varargs() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
+  invoke void(i8*, ...) @printf(i8* null, i32 42, float 1.0) to label %continue unwind label %broken
+
+broken:
+  landingpad { i8*, i32 } catch i8* bitcast(i8** @_ZTIi to i8*)
+  ret void
+
+continue:
+  ret void
+}
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
index 679c6b4788f2ad73ebf6fb9f66c8d60527e0b28f..9b27198b961aeed40c7e0df31a664bb693a96e0e 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-add.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
@@ -33,14 +33,14 @@ body: |
   bb.0.entry:
     liveins: %x0, %x1, %x2, %x3
     ; CHECK-LABEL: name: test_scalar_add_big
-    ; CHECK-NOT: G_EXTRACT
-    ; CHECK-NOT: G_SEQUENCE
+    ; CHECK-NOT: G_MERGE_VALUES
+    ; CHECK-NOT: G_UNMERGE_VALUES
     ; CHECK-DAG: [[CARRY0_32:%.*]](s32) = G_CONSTANT i32 0
     ; CHECK-DAG: [[CARRY0:%[0-9]+]](s1) = G_TRUNC [[CARRY0_32]]
     ; CHECK: [[RES_LO:%.*]](s64), [[CARRY:%.*]](s1) = G_UADDE %0, %2, [[CARRY0]]
     ; CHECK: [[RES_HI:%.*]](s64), {{%.*}}(s1) = G_UADDE %1, %3, [[CARRY]]
-    ; CHECK-NOT: G_EXTRACT
-    ; CHECK-NOT: G_SEQUENCE
+    ; CHECK-NOT: G_MERGE_VALUES
+    ; CHECK-NOT: G_UNMERGE_VALUES
     ; CHECK: %x0 = COPY [[RES_LO]]
     ; CHECK: %x1 = COPY [[RES_HI]]
 
@@ -48,10 +48,10 @@ body: |
     %1(s64) = COPY %x1
     %2(s64) = COPY %x2
     %3(s64) = COPY %x3
-    %4(s128) = G_SEQUENCE %0, 0, %1, 64
-    %5(s128) = G_SEQUENCE %2, 0, %3, 64
+    %4(s128) = G_MERGE_VALUES %0, %1
+    %5(s128) = G_MERGE_VALUES %2, %3
     %6(s128) = G_ADD %4, %5
-    %7(s64), %8(s64) = G_EXTRACT %6, 0, 64
+    %7(s64), %8(s64) = G_UNMERGE_VALUES %6
     %x0 = COPY %7
     %x1 = COPY %8
 ...
@@ -112,10 +112,10 @@ body: |
     %1(<2 x s64>) = COPY %q1
     %2(<2 x s64>) = COPY %q2
     %3(<2 x s64>) = COPY %q3
-    %4(<4 x s64>) = G_SEQUENCE %0, 0, %1, 128
-    %5(<4 x s64>) = G_SEQUENCE %2, 0, %3, 128
+    %4(<4 x s64>) = G_MERGE_VALUES %0, %1
+    %5(<4 x s64>) = G_MERGE_VALUES %2, %3
     %6(<4 x s64>) = G_ADD %4, %5
-    %7(<2 x s64>), %8(<2 x s64>) = G_EXTRACT %6, 0, 128
+    %7(<2 x s64>), %8(<2 x s64>) = G_UNMERGE_VALUES %6
     %q0 = COPY %7
     %q1 = COPY %8
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-and.mir b/test/CodeGen/AArch64/GlobalISel/legalize-and.mir
index cdd885cb6732ed24556ac895548013d6d930c79e..75e1d5163532d937cececff1fc6fc0039b3d6606 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-and.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-and.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir b/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir
index 926a62761ce09c177aef9574e141b9201039314f..29f83b36289587d52b75f006f7a496653a06f71d 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-cmp.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-combines.mir b/test/CodeGen/AArch64/GlobalISel/legalize-combines.mir
index d7d8ebeaf56e1c08a9c6010d23beb98b5ee1d91a..fab6dcf43346894fe72ff746fa399c23e42f3d74 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-combines.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-combines.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
@@ -8,6 +8,7 @@
   define void @test_combines_3() { ret void }
   define void @test_combines_4() { ret void }
   define void @test_combines_5() { ret void }
+  define void @test_combines_6() { ret void }
 ...
 
 ---
@@ -69,7 +70,8 @@ body: |
     ; CHECK: %5(s32) = G_ADD %0, %1
     %1:_(s32) = G_ADD %0, %0
     %2:_(s64) = G_SEQUENCE %0, 0, %1, 32
-    %3:_(s32), %4:_(s32) = G_EXTRACT %2, 0, 32
+    %3:_(s32) = G_EXTRACT %2, 0
+    %4:_(s32) = G_EXTRACT %2, 32
     %5:_(s32) = G_ADD %3, %4
 ...
 
@@ -107,3 +109,24 @@ body: |
     %4:_(s32) = G_EXTRACT %2, 32
     %5:_(s32) = G_ADD %3, %4
 ...
+
+---
+name:            test_combines_6
+body: |
+  bb.0:
+    liveins: %w0
+
+    ; CHECK-LABEL: name: test_combines_6
+    ; CHECK: %0(s32) = COPY %w0
+    %0:_(s32) = COPY %w0
+
+    ; Check that we replace all the uses of a G_EXTRACT.
+    ; CHECK-NOT: G_SEQUENCE
+    ; CHECK-NOT: G_EXTRACT
+    ; CHECK: %3(s32) = G_MUL %0, %0
+    ; CHECK: %4(s32) = G_ADD %0, %3
+    %1:_(s32) = G_SEQUENCE %0, 0
+    %2:_(s32) = G_EXTRACT %1, 0
+    %3:_(s32) = G_MUL %2, %2
+    %4:_(s32) = G_ADD %2, %3
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir b/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir
index abbac413f5cbd669c599ae22a45428e84a47a963..16d9e59698fe1d7c93b6f142fee8fb45c6937341 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-constant.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-div.mir b/test/CodeGen/AArch64/GlobalISel/legalize-div.mir
index aaef45d3c928b7c99dcb2e01caf2fb9a4dd14260..c6e0aabfd2c0f5dbeaee255455e5034bc9c83ee9 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-div.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-div.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-exceptions.ll b/test/CodeGen/AArch64/GlobalISel/legalize-exceptions.ll
index 001aea6409bbef9a1e93f42a0e6ba33bb097002e..23e7d5163e5a51cc4c11950ef34edce7997e86a7 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-exceptions.ll
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-exceptions.ll
@@ -14,12 +14,20 @@ declare void @_Unwind_Resume(i8*)
 
 ; CHECK:   [[LP]] (landing-pad):
 ; CHECK:     EH_LABEL
+
 ; CHECK:     [[PTR:%[0-9]+]](p0) = COPY %x0
+; CHECK:     [[STRUCT_PTR:%[0-9]+]](s64) = G_PTRTOINT [[PTR]](p0)
+
 ; CHECK:     [[SEL_PTR:%[0-9]+]](p0) = COPY %x1
 ; CHECK:     [[SEL:%[0-9]+]](s32) = G_PTRTOINT [[SEL_PTR]]
-; CHECK-NOT: G_SEQUENCE
-; CHECK-NOT: G_EXTRACT
+; CHECK:     [[STRUCT_SEL:%[0-9]+]](s64) = G_INSERT {{%[0-9]+}}, [[SEL]](s32), 0
+
+; CHECK:     [[STRUCT:%[0-9]+]](s128) = G_MERGE_VALUES [[STRUCT_PTR]](s64), [[STRUCT_SEL]]
+
+; CHECK:     [[PTR:%[0-9]+]](p0) = G_EXTRACT [[STRUCT]](s128), 0
 ; CHECK:     G_STORE [[PTR]](p0), {{%[0-9]+}}(p0)
+
+; CHECK:     [[SEL:%[0-9]+]](s32) = G_EXTRACT [[STRUCT]](s128), 64
 ; CHECK:     G_STORE [[SEL]](s32), {{%[0-9]+}}(p0)
 
 define void @bar() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir b/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir
index 9907f009d931711fd7cec5f5cffa506321a91cf9..70b55e4ebc66da5ee776d559f2627d47ce1bd5a9 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-ext.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir b/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir
index 72bd613fab3ae9dfc4dd9982a685cac960d230cb..8cdc7b78b1e95b36fbda17376b07567441877f1d 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-fcmp.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-fneg.mir b/test/CodeGen/AArch64/GlobalISel/legalize-fneg.mir
new file mode 100644
index 0000000000000000000000000000000000000000..8b5cbdfa55e39579ca7909c93d786b5e717344fa
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-fneg.mir
@@ -0,0 +1,48 @@
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64--"
+  define void @test_fneg_f32() {
+  entry:
+    ret void
+  }
+  define void @test_fneg_f64() {
+  entry:
+    ret void
+  }
+...
+---
+name:            test_fneg_f32
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.1:
+    liveins: %s0
+    ; CHECK-LABEL: name: test_fneg_f32
+    ; CHECK: [[VAR:%[0-9]+]](s32) = COPY %s0
+    ; CHECK: [[ZERO:%[0-9]+]](s32) = G_FCONSTANT float -0.000000e+00
+    ; CHECK: [[RES:%[0-9]+]](s32) = G_FSUB [[ZERO]], [[VAR]]
+    ; CHECK: %s0 = COPY [[RES]](s32)
+    %0(s32) = COPY %s0
+    %1(s32) = G_FNEG %0
+    %s0 = COPY %1(s32)
+...
+---
+name:            test_fneg_f64
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.1:
+    liveins: %d0
+    ; CHECK-LABEL: name: test_fneg_f64
+    ; CHECK: [[VAR:%[0-9]+]](s64) = COPY %d0
+    ; CHECK: [[ZERO:%[0-9]+]](s64) = G_FCONSTANT double -0.000000e+00
+    ; CHECK: [[RES:%[0-9]+]](s64) = G_FSUB [[ZERO]], [[VAR]]
+    ; CHECK: %d0 = COPY [[RES]](s64)
+    %0(s64) = COPY %d0
+    %1(s64) = G_FNEG %0
+    %d0 = COPY %1(s64)
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-fptoi.mir b/test/CodeGen/AArch64/GlobalISel/legalize-fptoi.mir
index 8d0af0dc447c0c4e978e3ecf5e51affdfc6925b6..f79d0382ea7c45ea2e50d9bc875882c40ac4843d 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-fptoi.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-fptoi.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-gep.mir b/test/CodeGen/AArch64/GlobalISel/legalize-gep.mir
index 3f11c123ba518e85d14f7fa71f2580100568d573..d6ec983c2067b2669c8799e778eb02b48eeb2ee2 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-gep.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-gep.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir b/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir
new file mode 100644
index 0000000000000000000000000000000000000000..917f181099ec121e74b5f16fddd9bd40b063f9b3
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-inserts.mir
@@ -0,0 +1,141 @@
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64--"
+  define void @test_inserts_1() { ret void }
+  define void @test_inserts_2() { ret void }
+  define void @test_inserts_3() { ret void }
+  define void @test_inserts_4() { ret void }
+  define void @test_inserts_5() { ret void }
+  define void @test_inserts_6() { ret void }
+...
+
+---
+name:            test_inserts_1
+body: |
+  bb.0:
+    liveins: %w0
+
+      ; Low part of insertion wipes out the old register entirely, so %0 gets
+      ; forwarded to the G_STORE. Hi part is unchanged so (split) G_LOAD gets
+      ; forwarded.
+    ; CHECK-LABEL: name: test_inserts_1
+    ; CHECK: [[LO:%[0-9]+]](s64) = G_LOAD
+    ; CHECK: [[HI:%[0-9]+]](s64) = G_LOAD
+    ; CHECK: G_STORE %0(s64)
+    ; CHECK: G_STORE [[HI]]
+    %0:_(s64) = COPY %x0
+    %1:_(s32) = COPY %w1
+    %2:_(p0) = COPY %x2
+    %3:_(s128) = G_LOAD %2(p0) :: (load 16)
+    %4:_(s128) = G_INSERT %3(s128), %0(s64), 0
+    G_STORE %4(s128), %2(p0) :: (store 16)
+    RET_ReallyLR
+...
+
+---
+name:            test_inserts_2
+body: |
+  bb.0:
+    liveins: %w0
+
+      ; Low insertion wipes out the old register entirely, so %0 gets forwarded
+      ; to the G_STORE again. Second insertion is real.
+    ; CHECK-LABEL: name: test_inserts_2
+    ; CHECK: [[LO:%[0-9]+]](s64) = G_LOAD
+    ; CHECK: [[HI:%[0-9]+]](s64) = G_LOAD
+    ; CHECK: [[NEWHI:%[0-9]+]](s64) = G_INSERT [[HI]], %1(s32), 0
+    ; CHECK: G_STORE %0(s64)
+    ; CHECK: G_STORE [[NEWHI]]
+    %0:_(s64) = COPY %x0
+    %1:_(s32) = COPY %w1
+    %2:_(p0) = COPY %x2
+    %3:_(s128) = G_LOAD %2(p0) :: (load 16)
+    %4:_(s128) = G_INSERT %3(s128), %0(s64), 0
+    %5:_(s128) = G_INSERT %4(s128), %1(s32), 64
+    G_STORE %5(s128), %2(p0) :: (store 16)
+    RET_ReallyLR
+...
+
+---
+name:            test_inserts_3
+body: |
+  bb.0:
+    liveins: %w0
+
+      ; I'm not entirely convinced inserting a p0 into an s64 is valid, but it's
+      ; certainly better than the alternative of directly forwarding the value
+      ; which would cause a nasty type mismatch.
+    ; CHECK-LABEL: name: test_inserts_3
+    ; CHECK: [[LO:%[0-9]+]](s64) = G_LOAD
+    ; CHECK: [[HI:%[0-9]+]](s64) = G_LOAD
+    ; CHECK: [[NEWLO:%[0-9]+]](s64) = G_PTRTOINT %0(p0)
+    ; CHECK: G_STORE [[NEWLO]](s64)
+    ; CHECK: G_STORE [[HI]]
+    %0:_(p0) = COPY %x0
+    %1:_(s32) = COPY %w1
+    %2:_(p0) = COPY %x2
+    %3:_(s128) = G_LOAD %2(p0) :: (load 16)
+    %4:_(s128) = G_INSERT %3(s128), %0(p0), 0
+    G_STORE %4(s128), %2(p0) :: (store 16)
+    RET_ReallyLR
+...
+
+---
+name:            test_inserts_4
+body: |
+  bb.0:
+    liveins: %w0
+
+      ; A narrow insert gets surrounded by a G_ANYEXT/G_TRUNC pair.
+    ; CHECK-LABEL: name: test_inserts_4
+    ; CHECK: [[VALEXT:%[0-9]+]](s32) = G_ANYEXT %1(s8)
+    ; CHECK: [[VAL:%[0-9]+]](s32) = G_INSERT [[VALEXT]], %0(s1), 0
+    ; CHECK: %3(s8) = G_TRUNC [[VAL]](s32)
+    %0:_(s1) = COPY %w0
+    %1:_(s8) = COPY %w1
+    %2:_(p0) = COPY %x2
+    %3:_(s8) = G_INSERT %1(s8), %0(s1), 0
+    G_STORE %3(s8), %2(p0) :: (store 1)
+    RET_ReallyLR
+...
+
+---
+name:            test_inserts_5
+body: |
+  bb.0:
+    liveins: %x0, %x1, %x2
+
+
+    ; CHECK-LABEL: name: test_inserts_5
+    ; CHECK: [[INS_LO:%[0-9]+]](s32) = G_EXTRACT %2(s64), 0
+    ; CHECK: [[VAL_LO:%[0-9]+]](s64) = G_INSERT %0, [[INS_LO]](s32), 32
+    ; CHECK: [[INS_HI:%[0-9]+]](s32) = G_EXTRACT %2(s64), 32
+    ; CHECK: [[VAL_HI:%[0-9]+]](s64) = G_INSERT %1, [[INS_HI]](s32), 0
+    ; CHECK: %4(s128) = G_MERGE_VALUES [[VAL_LO]](s64), [[VAL_HI]](s64)
+    %0:_(s64) = COPY %x0
+    %1:_(s64) = COPY %x1
+    %2:_(s64) = COPY %x2
+    %3:_(s128) = G_MERGE_VALUES %0, %1
+    %4:_(s128) = G_INSERT %3, %2, 32
+    RET_ReallyLR
+...
+
+---
+name:            test_inserts_6
+body: |
+  bb.0:
+    liveins: %x0, %x1, %x2
+
+
+    ; CHECK-LABEL: name: test_inserts_6
+    ; CHECK: [[VAL_LO:%[0-9]+]](s64) = G_INSERT %0, %2(s32), 32
+    ; CHECK: %4(s128) = G_MERGE_VALUES [[VAL_LO]](s64), %1(s64)
+    %0:_(s64) = COPY %x0
+    %1:_(s64) = COPY %x1
+    %2:_(s32) = COPY %w2
+    %3:_(s128) = G_MERGE_VALUES %0, %1
+    %4:_(s128) = G_INSERT %3, %2, 32
+    RET_ReallyLR
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-itofp.mir b/test/CodeGen/AArch64/GlobalISel/legalize-itofp.mir
index 70ffc3ea3ac647cf3a4a1a5c8d72240698e5bade..69e72bcb1f387bcccd4fd1ca232cf161c937af93 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-itofp.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-itofp.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir b/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
index 00145ad1f53503a09098af6711c871702135c506..c806b4a7060d1cbd0844f33d35cfcb271c631010 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-load-store.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
@@ -59,7 +59,7 @@ body: |
     ; CHECK: [[OFFSET1:%[0-9]+]](s64) = G_CONSTANT i64 8
     ; CHECK: [[GEP1:%[0-9]+]](p0) = G_GEP %0, [[OFFSET1]](s64)
     ; CHECK: [[LOAD1:%[0-9]+]](s64) = G_LOAD [[GEP1]](p0) :: (load 16 from %ir.addr)
-    ; CHECK: %8(s128) = G_SEQUENCE [[LOAD0]](s64), 0, [[LOAD1]](s64), 64
+    ; CHECK: %8(s128) = G_MERGE_VALUES [[LOAD0]](s64), [[LOAD1]](s64)
     %8(s128) = G_LOAD %0(p0) :: (load 16 from %ir.addr)
 ...
 
@@ -82,7 +82,7 @@ body: |
     %0(p0) = COPY %x0
     %1(s32) = COPY %w1
 
-    ; CHECK: [[BIT8:%[0-9]+]](s8) = G_ANYEXT %2(s1)
+    ; CHECK: [[BIT8:%[0-9]+]](s8) = G_ZEXT %2(s1)
     ; CHECK: G_STORE [[BIT8]](s8), %0(p0) :: (store 1 into %ir.addr)
     %2(s1) = G_TRUNC %1
     G_STORE %2, %0 :: (store 1 into %ir.addr)
@@ -112,6 +112,6 @@ body: |
     ; CHECK: [[GEP1:%[0-9]+]](p0) = G_GEP %0, [[OFFSET1]](s64)
     ; CHECK: G_STORE %6(s64), [[GEP1]](p0) :: (store 16 into %ir.addr)
     %6(s64) = G_PTRTOINT %0(p0)
-    %7(s128) = G_SEQUENCE %5, 0, %6, 64
+    %7(s128) = G_MERGE_VALUES %5, %6
     G_STORE %7, %0 :: (store 16 into %ir.addr)
 ...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir b/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir
index e56eef0bc4fb910c24da4557444282e7b25a7051..1ea6e9c292f5bff5a999c48291344b63494641bf 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-mul.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
@@ -7,6 +7,7 @@
   entry:
     ret void
   }
+  define void @test_mul_overflow() { ret void }
 ...
 
 ---
@@ -35,3 +36,22 @@ body: |
     %5(s64) = G_ANYEXT %2
     %x0 = COPY %5
 ...
+
+
+---
+name:            test_mul_overflow
+body: |
+  bb.0:
+    liveins: %x0, %x1, %w2, %w3
+
+    %0:_(s64) = COPY %x0
+    %1:_(s64) = COPY %x1
+
+    ; CHECK-LABEL: name: test_mul_overflow
+    ; CHECK: %2(s64) = G_MUL %0, %1
+    ; CHECK: [[HI:%[0-9]+]](s64) = G_SMULH %0, %1
+    ; CHECK: [[ZERO:%[0-9]+]](s64) = G_CONSTANT i64 0
+    ; CHECK: %3(s1) = G_ICMP intpred(ne), [[HI]](s64), [[ZERO]]
+    %2:_(s64), %3:_(s1) = G_SMULO %0, %1
+
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-nonpowerof2eltsvec.mir b/test/CodeGen/AArch64/GlobalISel/legalize-nonpowerof2eltsvec.mir
new file mode 100644
index 0000000000000000000000000000000000000000..9928ea54d2c98cf689eb0fc5a2fb22cb97e3e49a
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-nonpowerof2eltsvec.mir
@@ -0,0 +1,29 @@
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64--"
+  define void @test_legalize_merge_v3s32() {
+    ret void
+  }
+...
+---
+name:            test_legalize_merge_v3s32
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+body: |
+  bb.0:
+    liveins: %w0, %w1, %w2
+    ; CHECK-LABEL: name: test_legalize_merge_v3s32
+    ; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %w0
+    ; CHECK: [[ARG2:%[0-9]+]](s32) = COPY %w1
+    ; CHECK: [[ARG3:%[0-9]+]](s32) = COPY %w2
+    ; CHECK: (<3 x s32>) = G_MERGE_VALUES [[ARG1]](s32), [[ARG2]](s32), [[ARG3]](s32)
+    %0(s32) = COPY %w0
+    %1(s32) = COPY %w1
+    %2(s32) = COPY %w2
+    %3(<3 x s32>) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32)
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-or.mir b/test/CodeGen/AArch64/GlobalISel/legalize-or.mir
index 802d8ad1989c4a36e0067b9f926d5030e8e86930..e8b85098246000fc8b7c90bdb7d198716a6857ca 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-or.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-or.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-pow.mir b/test/CodeGen/AArch64/GlobalISel/legalize-pow.mir
new file mode 100644
index 0000000000000000000000000000000000000000..2becc2e134b5091374b03f57e2a76458c735b36b
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-pow.mir
@@ -0,0 +1,38 @@
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64--"
+  define void @test_pow() {
+  entry:
+    ret void
+  }
+...
+
+---
+name:            test_pow
+body: |
+  bb.0.entry:
+    liveins: %d0, %d1, %s2, %s3
+
+    ; CHECK-LABEL: name: test_pow
+    ; CHECK: hasCalls: true
+
+    %0:_(s64) = COPY %d0
+    %1:_(s64) = COPY %d1
+    %2:_(s32) = COPY %s2
+    %3:_(s32) = COPY %s3
+
+    ; CHECK: %d0 = COPY %0
+    ; CHECK: %d1 = COPY %1
+    ; CHECK: BL $pow, csr_aarch64_aapcs, implicit-def %lr, implicit %sp, implicit %d0, implicit %d1, implicit-def %d0
+    ; CHECK: %4(s64) = COPY %d0
+    %4:_(s64) = G_FPOW %0, %1
+
+    ; CHECK: %s0 = COPY %2
+    ; CHECK: %s1 = COPY %3
+    ; CHECK: BL $powf, csr_aarch64_aapcs, implicit-def %lr, implicit %sp, implicit %s0, implicit %s1, implicit-def %s0
+    ; CHECK: %5(s32) = COPY %s0
+    %5:_(s32) = G_FPOW %2, %3
+
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir b/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir
index bd8cdf4f1aeb96d891ef758438b1cfe04371e305..50a4d93cbe20c992816c50552afd5ff4b5aea75a 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-rem.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir b/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir
index 5d95c5ee2d87450c2346821b24addd59c4f3becc..f75a2982a3f2df7eca281a0dcd5be66ca986f166 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-shift.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir b/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir
index 17685fa9bf15c471d4993aee314919bab132b916..cd24bccfe77130768878d894980e691cda619510 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-simple.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
@@ -52,14 +52,24 @@ body: |
 
   bb.1.next:
 
-    ; CHECK: %7(s1) = G_SELECT %1(s1), %1, %1
-    ; CHECK: %8(s8) = G_SELECT %1(s1), %2, %2
-    ; CHECK: %9(s16) = G_SELECT %1(s1), %3, %3
-    ; CHECK: %10(s32) = G_SELECT %1(s1), %4, %4
-    ; CHECK: %11(s64) = G_SELECT %1(s1), %0, %0
+    ; CHECK: [[LHS:%[0-9]+]](s32) = G_ANYEXT %1(s1)
+    ; CHECK: [[RHS:%[0-9]+]](s32) = G_ANYEXT %1(s1)
+    ; CHECK: [[RES:%[0-9]+]](s32) = G_SELECT %1(s1), [[LHS]], [[RHS]]
+    ; CHECK: %7(s1) = G_TRUNC [[RES]](s32)
     %7(s1) = G_SELECT %1, %1, %1
+
+    ; CHECK: [[LHS:%[0-9]+]](s32) = G_ANYEXT %2(s8)
+    ; CHECK: [[RHS:%[0-9]+]](s32) = G_ANYEXT %2(s8)
+    ; CHECK: [[RES:%[0-9]+]](s32) = G_SELECT %1(s1), [[LHS]], [[RHS]]
+    ; CHECK: %8(s8) = G_TRUNC [[RES]](s32)
     %8(s8) = G_SELECT %1, %2, %2
+
+    ; CHECK: [[LHS:%[0-9]+]](s32) = G_ANYEXT %3(s16)
+    ; CHECK: [[RHS:%[0-9]+]](s32) = G_ANYEXT %3(s16)
+    ; CHECK: [[RES:%[0-9]+]](s32) = G_SELECT %1(s1), [[LHS]], [[RHS]]
+    ; CHECK: %9(s16) = G_TRUNC [[RES]](s32)
     %9(s16) = G_SELECT %1, %3, %3
+
     %10(s32) = G_SELECT %1, %4, %4
     %11(s64) = G_SELECT %1, %0, %0
 
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-sub.mir b/test/CodeGen/AArch64/GlobalISel/legalize-sub.mir
index 6652d2b4d7a0a5cf3093087c6e6a1ca6abd9122e..82a1dd09c1a122132b2462fe78d19d85ba6f4788 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-sub.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-sub.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-vaarg.mir b/test/CodeGen/AArch64/GlobalISel/legalize-vaarg.mir
new file mode 100644
index 0000000000000000000000000000000000000000..8bda08d0a1d122aa5d9f95895f0285b472d2cf31
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-vaarg.mir
@@ -0,0 +1,39 @@
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64--"
+  define void @test_vaarg() { ret void }
+...
+
+---
+name:            test_vaarg
+body: |
+  bb.0:
+    %0:_(p0) = COPY %x0
+
+    ; CHECK-LABEL: name: test_vaarg
+    ; CHECK: [[LIST:%[0-9]+]](p0) = G_LOAD %0(p0) :: (load 8)
+    ; CHECK: %1(s8) = G_LOAD [[LIST]](p0) :: (load 1, align 8)
+    ; CHECK: [[SLOTSIZE:%[0-9]+]](s64) = G_CONSTANT i64 8
+    ; CHECK: [[NEXT:%[0-9]+]](p0) = G_GEP [[LIST]], [[SLOTSIZE]](s64)
+    ; CHECK: G_STORE [[NEXT]](p0), %0(p0) :: (store 8)
+    %1:_(s8) = G_VAARG %0(p0), 1
+
+    ; CHECK: [[LIST:%[0-9]+]](p0) = G_LOAD %0(p0) :: (load 8)
+    ; CHECK: %2(s64) = G_LOAD [[LIST]](p0) :: (load 8)
+    ; CHECK: [[SLOTSIZE:%[0-9]+]](s64) = G_CONSTANT i64 8
+    ; CHECK: [[NEXT:%[0-9]+]](p0) = G_GEP [[LIST]], [[SLOTSIZE]](s64)
+    ; CHECK: G_STORE [[NEXT]](p0), %0(p0) :: (store 8)
+    %2:_(s64) = G_VAARG %0(p0), 8
+
+    ; CHECK: [[LIST:%[0-9]+]](p0) = G_LOAD %0(p0) :: (load 8)
+    ; CHECK: [[ALIGNM1:%[0-9]+]](s64) = G_CONSTANT i64 15
+    ; CHECK: [[ALIGNTMP:%[0-9]+]](p0) = G_GEP [[LIST]], [[ALIGNM1]](s64)
+    ; CHECK: [[LIST:%[0-9]+]](p0) = G_PTR_MASK [[ALIGNTMP]], 4
+    ; CHECK: %3(s64) = G_LOAD [[LIST]](p0) :: (load 8, align 16)
+    ; CHECK: [[SLOTSIZE:%[0-9]+]](s64) = G_CONSTANT i64 8
+    ; CHECK: [[NEXT:%[0-9]+]](p0) = G_GEP [[LIST]], [[SLOTSIZE]](s64)
+    ; CHECK: G_STORE [[NEXT]](p0), %0(p0) :: (store 8)
+    %3:_(s64) = G_VAARG %0(p0), 16
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/legalize-xor.mir b/test/CodeGen/AArch64/GlobalISel/legalize-xor.mir
index a2f3c8ea3b1b3aa3d7cb80d732df97a02cd4219c..460b3d16f1c0c7cb8f0aa2616627fcb7b86ac9da 100644
--- a/test/CodeGen/AArch64/GlobalISel/legalize-xor.mir
+++ b/test/CodeGen/AArch64/GlobalISel/legalize-xor.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s
 
 --- |
   target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
diff --git a/test/CodeGen/AArch64/GlobalISel/regbankselect-dbg-value.mir b/test/CodeGen/AArch64/GlobalISel/regbankselect-dbg-value.mir
new file mode 100644
index 0000000000000000000000000000000000000000..73d4d20547292598862bb99040ae85cb085a36eb
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/regbankselect-dbg-value.mir
@@ -0,0 +1,45 @@
+# RUN: llc -O0 -mtriple arm64-- -run-pass=regbankselect -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @test_dbg_value() !dbg !5 {
+    ; Keep the dbg metadata live by referencing it in the IR.
+    call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !7, metadata !9), !dbg !10
+    ret void
+  }
+
+  declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3, !4}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "llvm", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+  !1 = !DIFile(filename: "test.ll", directory: "/tmp")
+  !2 = !{}
+  !3 = !{i32 2, !"Dwarf Version", i32 4}
+  !4 = !{i32 2, !"Debug Info Version", i32 3}
+  !5 = distinct !DISubprogram(name: "test_dbg_value", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+  !6 = !DISubroutineType(types: !2)
+  !7 = !DILocalVariable(name: "in", arg: 1, scope: !5, file: !1, line: 1, type: !8)
+  !8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !9 = !DIExpression()
+  !10 = !DILocation(line: 1, column: 1, scope: !5)
+...
+
+---
+# CHECK-LABEL: name: test_dbg_value
+name:            test_dbg_value
+legalized:       true
+# CHECK: registers:
+# CHECK-NEXT:  - { id: 0, class: gpr }
+body: |
+  bb.0:
+    liveins: %w0
+    %0:_(s32) = COPY %w0
+    ; CHECK: DBG_VALUE debug-use %0(s32), debug-use _, !7, !9, debug-location !10
+    DBG_VALUE debug-use %0(s32), debug-use _, !7, !9, debug-location !10
+
+    ; CHECK: DBG_VALUE _, 0, !7, !9, debug-location !10
+    DBG_VALUE _, 0, !7, !9, debug-location !10
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/regbankselect-default.mir b/test/CodeGen/AArch64/GlobalISel/regbankselect-default.mir
index 12162eb54a839c92c10bd8e19d28e03cbb5f358f..14ee40c941bf08c5953870fce4881ed8c4d9cb63 100644
--- a/test/CodeGen/AArch64/GlobalISel/regbankselect-default.mir
+++ b/test/CodeGen/AArch64/GlobalISel/regbankselect-default.mir
@@ -622,7 +622,7 @@ body: |
     ; CHECK:      %0(p0) = COPY %x0
     ; CHECK:      %1(s32) = G_LOAD %0
     %0(p0) = COPY %x0
-    %1(s32) = G_LOAD %0
+    %1(s32) = G_LOAD %0 :: (load 4)
 ...
 
 ---
@@ -643,7 +643,7 @@ body: |
     ; CHECK:      G_STORE %1(s32), %0(p0)
     %0(p0) = COPY %x0
     %1(s32) = COPY %w1
-    G_STORE %1, %0
+    G_STORE %1, %0 :: (store 4)
 ...
 
 ---
diff --git a/test/CodeGen/AArch64/GlobalISel/regbankselect-reg_sequence.mir b/test/CodeGen/AArch64/GlobalISel/regbankselect-reg_sequence.mir
new file mode 100644
index 0000000000000000000000000000000000000000..15ccf1f5459cfd27f14f53fd0650005dc3f9b7ca
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/regbankselect-reg_sequence.mir
@@ -0,0 +1,25 @@
+# RUN: llc %s -mtriple aarch64-- -o - -run-pass regbankselect | FileCheck %s
+--- |
+  define void @foo() { ret void }
+...
+---
+# CHECK-LABEL: foo
+# Check that we produce a valid mapping for REG_SEQUENCE.
+# This used to fail the RegisterBankInfo verify because
+# we were exclusively using the type of the definition,
+# whereas REG_SEQUENCE is a kind of target opcode, so
+# its definition may not have a type.
+#
+# CHECK: id: 0, class: dd
+name: foo
+legalized: true
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: dd }
+body: |
+  bb.0:
+    liveins: %d0, %d1
+
+    %0 = REG_SEQUENCE %d0, %subreg.dsub0, %d1, %subreg.dsub1
+
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-binop.mir b/test/CodeGen/AArch64/GlobalISel/select-binop.mir
new file mode 100644
index 0000000000000000000000000000000000000000..8ae2e1b2eb7d2da8859d3dce8c13d353f6d3c874
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-binop.mir
@@ -0,0 +1,1042 @@
+# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @add_s32_gpr() { ret void }
+  define void @add_s64_gpr() { ret void }
+
+  define void @add_imm_s32_gpr() { ret void }
+  define void @add_imm_s64_gpr() { ret void }
+
+  define void @add_imm_s32_gpr_bb() { ret void }
+
+  define void @sub_s32_gpr() { ret void }
+  define void @sub_s64_gpr() { ret void }
+
+  define void @or_s32_gpr() { ret void }
+  define void @or_s64_gpr() { ret void }
+  define void @or_v2s32_fpr() { ret void }
+
+  define void @and_s32_gpr() { ret void }
+  define void @and_s64_gpr() { ret void }
+
+  define void @shl_s32_gpr() { ret void }
+  define void @shl_s64_gpr() { ret void }
+
+  define void @lshr_s32_gpr() { ret void }
+  define void @lshr_s64_gpr() { ret void }
+
+  define void @ashr_s32_gpr() { ret void }
+  define void @ashr_s64_gpr() { ret void }
+
+  define void @mul_s32_gpr() { ret void }
+  define void @mul_s64_gpr() { ret void }
+
+  define void @mulh_s64_gpr() { ret void }
+
+  define void @sdiv_s32_gpr() { ret void }
+  define void @sdiv_s64_gpr() { ret void }
+
+  define void @udiv_s32_gpr() { ret void }
+  define void @udiv_s64_gpr() { ret void }
+
+  define void @fadd_s32_fpr() { ret void }
+  define void @fadd_s64_fpr() { ret void }
+
+  define void @fsub_s32_fpr() { ret void }
+  define void @fsub_s64_fpr() { ret void }
+
+  define void @fmul_s32_fpr() { ret void }
+  define void @fmul_s64_fpr() { ret void }
+
+  define void @fdiv_s32_fpr() { ret void }
+  define void @fdiv_s64_fpr() { ret void }
+
+...
+
+---
+# Check that we select a 32-bit GPR G_ADD into ADDWrr on GPR32.
+# Also check that we constrain the register class of the COPY to GPR32.
+# CHECK-LABEL: name: add_s32_gpr
+name:            add_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+# CHECK-NEXT:  - { id: 2, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = COPY %w1
+# CHECK:    %2 = ADDWrr %0, %1
+body:             |
+  bb.0:
+    liveins: %w0, %w1
+
+    %0(s32) = COPY %w0
+    %1(s32) = COPY %w1
+    %2(s32) = G_ADD %0, %1
+    %w0 = COPY %2(s32)
+...
+
+---
+# Same as add_s32_gpr, for 64-bit operations.
+# CHECK-LABEL: name: add_s64_gpr
+name:            add_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+# CHECK-NEXT:  - { id: 2, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %x1
+# CHECK:    %2 = ADDXrr %0, %1
+body:             |
+  bb.0:
+    liveins: %x0, %x1
+
+    %0(s64) = COPY %x0
+    %1(s64) = COPY %x1
+    %2(s64) = G_ADD %0, %1
+    %x0 = COPY %2(s64)
+...
+
+---
+# CHECK-LABEL: name: add_imm_s32_gpr
+name:            add_imm_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32sp }
+# CHECK-NEXT:  - { id: 1, class: gpr }
+# CHECK-NEXT:  - { id: 2, class: gpr32sp }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %2 = ADDWri %0, 1, 0
+body:             |
+  bb.0:
+    liveins: %w0, %w1
+
+    %0(s32) = COPY %w0
+    %1(s32) = G_CONSTANT i32 1
+    %2(s32) = G_ADD %0, %1
+    %w0 = COPY %2(s32)
+...
+
+---
+# CHECK-LABEL: name: add_imm_s64_gpr
+name:            add_imm_s64_gpr
+legalized:       true
+regBankSelected: true
+# Check that s64 G_ADD of an i64 constant 1 is selected into ADDXri on GPR64sp.
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr }
+# CHECK-NEXT:  - { id: 2, class: gpr64sp }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %2 = ADDXri %0, 1, 0
+body:             |
+  bb.0:
+    liveins: %x0, %w1
+
+    %0(s64) = COPY %x0
+    %1(s64) = G_CONSTANT i64 1
+    %2(s64) = G_ADD %0, %1
+    %x0 = COPY %2(s64)
+...
+
+---
+# CHECK-LABEL: name: add_imm_s32_gpr_bb
+name:            add_imm_s32_gpr_bb
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32sp }
+# CHECK-NEXT:  - { id: 1, class: gpr }
+# CHECK-NEXT:  - { id: 2, class: gpr32sp }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:  bb.1:
+# CHECK:    %2 = ADDWri %0, 1, 0
+body:             |
+  bb.0:
+    liveins: %w0, %w1
+    successors: %bb.1
+
+    %0(s32) = COPY %w0
+    %1(s32) = G_CONSTANT i32 1
+    G_BR %bb.1
+
+  bb.1:
+    %2(s32) = G_ADD %0, %1
+    %w0 = COPY %2(s32)
+...
+
+---
+# Same as add_s32_gpr, for G_SUB operations.
+# CHECK-LABEL: name: sub_s32_gpr
+name:            sub_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+# CHECK-NEXT:  - { id: 2, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = COPY %w1
+# CHECK:    %2 = SUBSWrr %0, %1, implicit-def %nzcv
+body:             |
+  bb.0:
+    liveins: %w0, %w1
+
+    %0(s32) = COPY %w0
+    %1(s32) = COPY %w1
+    %2(s32) = G_SUB %0, %1
+    %w0 = COPY %2(s32)
+...
+
+---
+# Same as add_s64_gpr, for G_SUB operations.
+# CHECK-LABEL: name: sub_s64_gpr
+name:            sub_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+# CHECK-NEXT:  - { id: 2, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %x1
+# CHECK:    %2 = SUBSXrr %0, %1, implicit-def %nzcv
+body:             |
+  bb.0:
+    liveins: %x0, %x1
+
+    %0(s64) = COPY %x0
+    %1(s64) = COPY %x1
+    %2(s64) = G_SUB %0, %1
+    %x0 = COPY %2(s64)
+...
+
+---
+# Same as add_s32_gpr, for G_OR operations.
+# CHECK-LABEL: name: or_s32_gpr
+name:            or_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+# CHECK-NEXT:  - { id: 2, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = COPY %w1
+# CHECK:    %2 = ORRWrr %0, %1
+body:             |
+  bb.0:
+    liveins: %w0, %w1
+
+    %0(s32) = COPY %w0
+    %1(s32) = COPY %w1
+    %2(s32) = G_OR %0, %1
+    %w0 = COPY %2(s32)
+...
+
+---
+# Same as add_s64_gpr, for G_OR operations.
+# CHECK-LABEL: name: or_s64_gpr
+name:            or_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+# CHECK-NEXT:  - { id: 2, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %x1
+# CHECK:    %2 = ORRXrr %0, %1
+body:             |
+  bb.0:
+    liveins: %x0, %x1
+
+    %0(s64) = COPY %x0
+    %1(s64) = COPY %x1
+    %2(s64) = G_OR %0, %1
+    %x0 = COPY %2(s64)
+...
+
+---
+# 64-bit G_OR on vector registers.
+# CHECK-LABEL: name: or_v2s32_fpr
+name:            or_v2s32_fpr
+legalized:       true
+regBankSelected: true
+#
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr64 }
+# CHECK-NEXT:  - { id: 1, class: fpr64 }
+# CHECK-NEXT:  - { id: 2, class: fpr64 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: fpr }
+  - { id: 2, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %d0
+# CHECK:    %1 = COPY %d1
+# The actual OR does not matter as long as it is operating
+# on a 64-bit wide vector.
+# CHECK:    %2 = ORRv8i8 %0, %1
+body:             |
+  bb.0:
+    liveins: %d0, %d1
+
+      %0(<2 x s32>) = COPY %d0
+      %1(<2 x s32>) = COPY %d1
+      %2(<2 x s32>) = G_OR %0, %1
+      %d0 = COPY %2(<2 x s32>)
+...
+
+---
+# Same as add_s32_gpr, for G_AND operations.
+# CHECK-LABEL: name: and_s32_gpr
+name:            and_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+# CHECK-NEXT:  - { id: 2, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = COPY %w1
+# CHECK:    %2 = ANDWrr %0, %1
+body:             |
+  bb.0:
+    liveins: %w0, %w1
+
+    %0(s32) = COPY %w0
+    %1(s32) = COPY %w1
+    %2(s32) = G_AND %0, %1
+    %w0 = COPY %2(s32)
+...
+
+---
+# Same as add_s64_gpr, for G_AND operations.
+# CHECK-LABEL: name: and_s64_gpr
+name:            and_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+# CHECK-NEXT:  - { id: 2, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %x1
+# CHECK:    %2 = ANDXrr %0, %1
+body:             |
+  bb.0:
+    liveins: %x0, %x1
+
+    %0(s64) = COPY %x0
+    %1(s64) = COPY %x1
+    %2(s64) = G_AND %0, %1
+    %x0 = COPY %2(s64)
+...
+
+---
+# Same as add_s32_gpr, for G_SHL operations.
+# CHECK-LABEL: name: shl_s32_gpr
+name:            shl_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+# CHECK-NEXT:  - { id: 2, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = COPY %w1
+# CHECK:    %2 = LSLVWr %0, %1
+body:             |
+  bb.0:
+    liveins: %w0, %w1
+
+    %0(s32) = COPY %w0
+    %1(s32) = COPY %w1
+    %2(s32) = G_SHL %0, %1
+    %w0 = COPY %2(s32)
+...
+
+---
+# Same as add_s64_gpr, for G_SHL operations.
+# CHECK-LABEL: name: shl_s64_gpr
+name:            shl_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+# CHECK-NEXT:  - { id: 2, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %x1
+# CHECK:    %2 = LSLVXr %0, %1
+body:             |
+  bb.0:
+    liveins: %x0, %x1
+
+    %0(s64) = COPY %x0
+    %1(s64) = COPY %x1
+    %2(s64) = G_SHL %0, %1
+    %x0 = COPY %2(s64)
+...
+
+---
+# Same as add_s32_gpr, for G_LSHR operations.
+# CHECK-LABEL: name: lshr_s32_gpr
+name:            lshr_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+# CHECK-NEXT:  - { id: 2, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = COPY %w1
+# CHECK:    %2 = LSRVWr %0, %1
+body:             |
+  bb.0:
+    liveins: %w0, %w1
+
+    %0(s32) = COPY %w0
+    %1(s32) = COPY %w1
+    %2(s32) = G_LSHR %0, %1
+    %w0 = COPY %2(s32)
+...
+
+---
+# Same as add_s64_gpr, for G_LSHR operations.
+# CHECK-LABEL: name: lshr_s64_gpr
+name:            lshr_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+# CHECK-NEXT:  - { id: 2, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %x1
+# CHECK:    %2 = LSRVXr %0, %1
+body:             |
+  bb.0:
+    liveins: %x0, %x1
+
+    %0(s64) = COPY %x0
+    %1(s64) = COPY %x1
+    %2(s64) = G_LSHR %0, %1
+    %x0 = COPY %2(s64)
+...
+
+---
+# Same as add_s32_gpr, for G_ASHR operations.
+# CHECK-LABEL: name: ashr_s32_gpr
+name:            ashr_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+# CHECK-NEXT:  - { id: 2, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = COPY %w1
+# CHECK:    %2 = ASRVWr %0, %1
+body:             |
+  bb.0:
+    liveins: %w0, %w1
+
+    %0(s32) = COPY %w0
+    %1(s32) = COPY %w1
+    %2(s32) = G_ASHR %0, %1
+    %w0 = COPY %2(s32)
+...
+
+---
+# Same as add_s64_gpr, for G_ASHR operations.
+# CHECK-LABEL: name: ashr_s64_gpr
+name:            ashr_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+# CHECK-NEXT:  - { id: 2, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %x1
+# CHECK:    %2 = ASRVXr %0, %1
+body:             |
+  bb.0:
+    liveins: %x0, %x1
+
+    %0(s64) = COPY %x0
+    %1(s64) = COPY %x1
+    %2(s64) = G_ASHR %0, %1
+    %x0 = COPY %2(s64)
+...
+
+---
+# Check that we select s32 GPR G_MUL. This is trickier than other binops because
+# there is only MADDWrrr, and we have to use the WZR physreg.
+# CHECK-LABEL: name: mul_s32_gpr
+name:            mul_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+# CHECK-NEXT:  - { id: 2, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = COPY %w1
+# CHECK:    %2 = MADDWrrr %0, %1, %wzr
+body:             |
+  bb.0:
+    liveins: %w0, %w1
+
+    %0(s32) = COPY %w0
+    %1(s32) = COPY %w1
+    %2(s32) = G_MUL %0, %1
+    %w0 = COPY %2(s32)
+...
+
+---
+# Same as mul_s32_gpr for the s64 type.
+# CHECK-LABEL: name: mul_s64_gpr
+name:            mul_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+# CHECK-NEXT:  - { id: 2, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %x1
+# CHECK:    %2 = MADDXrrr %0, %1, %xzr
+body:             |
+  bb.0:
+    liveins: %x0, %x1
+
+    %0(s64) = COPY %x0
+    %1(s64) = COPY %x1
+    %2(s64) = G_MUL %0, %1
+    %x0 = COPY %2(s64)
+...
+
+---
+# Check that we select s64 GPR G_SMULH and G_UMULH into SMULHrr and UMULHrr.
+# CHECK-LABEL: name: mulh_s64_gpr
+name:            mulh_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+# CHECK-NEXT:  - { id: 2, class: gpr64 }
+# CHECK-NEXT:  - { id: 3, class: gpr64 }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %x1
+# CHECK:    %2 = SMULHrr %0, %1
+# CHECK:    %3 = UMULHrr %0, %1
+body:             |
+  bb.0:
+    liveins: %x0, %x1
+
+    %0:gpr(s64) = COPY %x0
+    %1:gpr(s64) = COPY %x1
+    %2:gpr(s64) = G_SMULH %0, %1
+    %3:gpr(s64) = G_UMULH %0, %1
+    %x0 = COPY %2(s64)
+    %x0 = COPY %3(s64)
+...
+
+---
+# Same as add_s32_gpr, for G_SDIV operations.
+# CHECK-LABEL: name: sdiv_s32_gpr
+name:            sdiv_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+# CHECK-NEXT:  - { id: 2, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = COPY %w1
+# CHECK:    %2 = SDIVWr %0, %1
+body:             |
+  bb.0:
+    liveins: %w0, %w1
+
+    %0(s32) = COPY %w0
+    %1(s32) = COPY %w1
+    %2(s32) = G_SDIV %0, %1
+    %w0 = COPY %2(s32)
+...
+
+---
+# Same as add_s64_gpr, for G_SDIV operations.
+# CHECK-LABEL: name: sdiv_s64_gpr
+name:            sdiv_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+# CHECK-NEXT:  - { id: 2, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %x1
+# CHECK:    %2 = SDIVXr %0, %1
+body:             |
+  bb.0:
+    liveins: %x0, %x1
+
+    %0(s64) = COPY %x0
+    %1(s64) = COPY %x1
+    %2(s64) = G_SDIV %0, %1
+    %x0 = COPY %2(s64)
+...
+
+---
+# Same as add_s32_gpr, for G_UDIV operations.
+# CHECK-LABEL: name: udiv_s32_gpr
+name:            udiv_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+# CHECK-NEXT:  - { id: 2, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = COPY %w1
+# CHECK:    %2 = UDIVWr %0, %1
+body:             |
+  bb.0:
+    liveins: %w0, %w1
+
+    %0(s32) = COPY %w0
+    %1(s32) = COPY %w1
+    %2(s32) = G_UDIV %0, %1
+    %w0 = COPY %2(s32)
+...
+
+---
+# Same as add_s64_gpr, for G_UDIV operations.
+# CHECK-LABEL: name: udiv_s64_gpr
+name:            udiv_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+# CHECK-NEXT:  - { id: 2, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %x1
+# CHECK:    %2 = UDIVXr %0, %1
+body:             |
+  bb.0:
+    liveins: %x0, %x1
+
+    %0(s64) = COPY %x0
+    %1(s64) = COPY %x1
+    %2(s64) = G_UDIV %0, %1
+    %x0 = COPY %2(s64)
+...
+
+---
+# Check that we select a s32 FPR G_FADD into FADDSrr.
+# CHECK-LABEL: name: fadd_s32_fpr
+name:            fadd_s32_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr32 }
+# CHECK-NEXT:  - { id: 1, class: fpr32 }
+# CHECK-NEXT:  - { id: 2, class: fpr32 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: fpr }
+  - { id: 2, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %s0
+# CHECK:    %1 = COPY %s1
+# CHECK:    %2 = FADDSrr %0, %1
+body:             |
+  bb.0:
+    liveins: %s0, %s1
+
+    %0(s32) = COPY %s0
+    %1(s32) = COPY %s1
+    %2(s32) = G_FADD %0, %1
+    %s0 = COPY %2(s32)
+...
+
+---
+# CHECK-LABEL: name: fadd_s64_fpr
+name:            fadd_s64_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr64 }
+# CHECK-NEXT:  - { id: 1, class: fpr64 }
+# CHECK-NEXT:  - { id: 2, class: fpr64 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: fpr }
+  - { id: 2, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %d0
+# CHECK:    %1 = COPY %d1
+# CHECK:    %2 = FADDDrr %0, %1
+body:             |
+  bb.0:
+    liveins: %d0, %d1
+
+    %0(s64) = COPY %d0
+    %1(s64) = COPY %d1
+    %2(s64) = G_FADD %0, %1
+    %d0 = COPY %2(s64)
+...
+
+---
+# CHECK-LABEL: name: fsub_s32_fpr
+name:            fsub_s32_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr32 }
+# CHECK-NEXT:  - { id: 1, class: fpr32 }
+# CHECK-NEXT:  - { id: 2, class: fpr32 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: fpr }
+  - { id: 2, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %s0
+# CHECK:    %1 = COPY %s1
+# CHECK:    %2 = FSUBSrr %0, %1
+body:             |
+  bb.0:
+    liveins: %s0, %s1
+
+    %0(s32) = COPY %s0
+    %1(s32) = COPY %s1
+    %2(s32) = G_FSUB %0, %1
+    %s0 = COPY %2(s32)
+...
+
+---
+# CHECK-LABEL: name: fsub_s64_fpr
+name:            fsub_s64_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr64 }
+# CHECK-NEXT:  - { id: 1, class: fpr64 }
+# CHECK-NEXT:  - { id: 2, class: fpr64 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: fpr }
+  - { id: 2, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %d0
+# CHECK:    %1 = COPY %d1
+# CHECK:    %2 = FSUBDrr %0, %1
+body:             |
+  bb.0:
+    liveins: %d0, %d1
+
+    %0(s64) = COPY %d0
+    %1(s64) = COPY %d1
+    %2(s64) = G_FSUB %0, %1
+    %d0 = COPY %2(s64)
+...
+
+---
+# CHECK-LABEL: name: fmul_s32_fpr
+name:            fmul_s32_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr32 }
+# CHECK-NEXT:  - { id: 1, class: fpr32 }
+# CHECK-NEXT:  - { id: 2, class: fpr32 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: fpr }
+  - { id: 2, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %s0
+# CHECK:    %1 = COPY %s1
+# CHECK:    %2 = FMULSrr %0, %1
+body:             |
+  bb.0:
+    liveins: %s0, %s1
+
+    %0(s32) = COPY %s0
+    %1(s32) = COPY %s1
+    %2(s32) = G_FMUL %0, %1
+    %s0 = COPY %2(s32)
+...
+
+---
+# CHECK-LABEL: name: fmul_s64_fpr
+name:            fmul_s64_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr64 }
+# CHECK-NEXT:  - { id: 1, class: fpr64 }
+# CHECK-NEXT:  - { id: 2, class: fpr64 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: fpr }
+  - { id: 2, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %d0
+# CHECK:    %1 = COPY %d1
+# CHECK:    %2 = FMULDrr %0, %1
+body:             |
+  bb.0:
+    liveins: %d0, %d1
+
+    %0(s64) = COPY %d0
+    %1(s64) = COPY %d1
+    %2(s64) = G_FMUL %0, %1
+    %d0 = COPY %2(s64)
+...
+
+---
+# CHECK-LABEL: name: fdiv_s32_fpr
+name:            fdiv_s32_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr32 }
+# CHECK-NEXT:  - { id: 1, class: fpr32 }
+# CHECK-NEXT:  - { id: 2, class: fpr32 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: fpr }
+  - { id: 2, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %s0
+# CHECK:    %1 = COPY %s1
+# CHECK:    %2 = FDIVSrr %0, %1
+body:             |
+  bb.0:
+    liveins: %s0, %s1
+
+    %0(s32) = COPY %s0
+    %1(s32) = COPY %s1
+    %2(s32) = G_FDIV %0, %1
+    %s0 = COPY %2(s32)
+...
+
+---
+# CHECK-LABEL: name: fdiv_s64_fpr
+name:            fdiv_s64_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr64 }
+# CHECK-NEXT:  - { id: 1, class: fpr64 }
+# CHECK-NEXT:  - { id: 2, class: fpr64 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: fpr }
+  - { id: 2, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %d0
+# CHECK:    %1 = COPY %d1
+# CHECK:    %2 = FDIVDrr %0, %1
+body:             |
+  bb.0:
+    liveins: %d0, %d1
+
+    %0(s64) = COPY %d0
+    %1(s64) = COPY %d1
+    %2(s64) = G_FDIV %0, %1
+    %d0 = COPY %2(s64)
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-bitcast.mir b/test/CodeGen/AArch64/GlobalISel/select-bitcast.mir
new file mode 100644
index 0000000000000000000000000000000000000000..5ca63dbc214d55ba8ce03ce3be0f3af6d6297dc9
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-bitcast.mir
@@ -0,0 +1,212 @@
+# RUN: llc -O0 -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @bitcast_s32_gpr() { ret void }
+  define void @bitcast_s32_fpr() { ret void }
+  define void @bitcast_s32_gpr_fpr() { ret void }
+  define void @bitcast_s32_fpr_gpr() { ret void }
+  define void @bitcast_s64_gpr() { ret void }
+  define void @bitcast_s64_fpr() { ret void }
+  define void @bitcast_s64_gpr_fpr() { ret void }
+  define void @bitcast_s64_fpr_gpr() { ret void }
+...
+
+---
+# CHECK-LABEL: name: bitcast_s32_gpr
+name:            bitcast_s32_gpr
+legalized:       true
+regBankSelected: true
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32all }
+# CHECK-NEXT:  - { id: 1, class: gpr32all }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = COPY %0
+body:             |
+  bb.0:
+    liveins: %w0
+
+    %0(s32) = COPY %w0
+    %1(s32) = G_BITCAST %0
+    %w0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: bitcast_s32_fpr
+name:            bitcast_s32_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr32 }
+# CHECK-NEXT:  - { id: 1, class: fpr32 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %s0
+# CHECK:    %1 = COPY %0
+body:             |
+  bb.0:
+    liveins: %s0
+
+    %0(s32) = COPY %s0
+    %1(s32) = G_BITCAST %0
+    %s0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: bitcast_s32_gpr_fpr
+name:            bitcast_s32_gpr_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32all }
+# CHECK-NEXT:  - { id: 1, class: fpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = COPY %0
+body:             |
+  bb.0:
+    liveins: %w0
+
+    %0(s32) = COPY %w0
+    %1(s32) = G_BITCAST %0
+    %s0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: bitcast_s32_fpr_gpr
+name:            bitcast_s32_fpr_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32all }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %s0
+# CHECK:    %1 = COPY %0
+body:             |
+  bb.0:
+    liveins: %s0
+
+    %0(s32) = COPY %s0
+    %1(s32) = G_BITCAST %0
+    %w0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: bitcast_s64_gpr
+name:            bitcast_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64all }
+# CHECK-NEXT:  - { id: 1, class: gpr64all }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %0
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(s64) = COPY %x0
+    %1(s64) = G_BITCAST %0
+    %x0 = COPY %1(s64)
+...
+
+---
+# CHECK-LABEL: name: bitcast_s64_fpr
+name:            bitcast_s64_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr64 }
+# CHECK-NEXT:  - { id: 1, class: fpr64 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %d0
+# CHECK:    %1 = COPY %0
+body:             |
+  bb.0:
+    liveins: %d0
+
+    %0(s64) = COPY %d0
+    %1(s64) = G_BITCAST %0
+    %d0 = COPY %1(s64)
+...
+
+---
+# CHECK-LABEL: name: bitcast_s64_gpr_fpr
+name:            bitcast_s64_gpr_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64all }
+# CHECK-NEXT:  - { id: 1, class: fpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: fpr }
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %0
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(s64) = COPY %x0
+    %1(s64) = G_BITCAST %0
+    %d0 = COPY %1(s64)
+...
+
+---
+# CHECK-LABEL: name: bitcast_s64_fpr_gpr
+name:            bitcast_s64_fpr_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr64all }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %d0
+# CHECK:    %1 = COPY %0
+body:             |
+  bb.0:
+    liveins: %d0
+
+    %0(s64) = COPY %d0
+    %1(s64) = G_BITCAST %0
+    %x0 = COPY %1(s64)
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-br.mir b/test/CodeGen/AArch64/GlobalISel/select-br.mir
new file mode 100644
index 0000000000000000000000000000000000000000..f46f190260f646bc16963f6d76b8eaf11c0ccb29
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-br.mir
@@ -0,0 +1,71 @@
+# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @unconditional_br() { ret void }
+  define void @conditional_br() { ret void }
+  define void @indirect_br() { ret void }
+...
+
+---
+# CHECK-LABEL: name: unconditional_br
+name:            unconditional_br
+legalized:       true
+regBankSelected: true
+
+# CHECK:  body:
+# CHECK:   bb.0:
+# CHECK:    successors: %bb.0
+# CHECK:    B %bb.0
+body:             |
+  bb.0:
+    successors: %bb.0
+
+    G_BR %bb.0
+...
+
+---
+# CHECK-LABEL: name: conditional_br
+name:            conditional_br
+legalized:       true
+regBankSelected: true
+
+registers:
+  - { id: 0, class: gpr }
+
+# CHECK:  body:
+# CHECK:   bb.0:
+# CHECK:    TBNZW %0, 0, %bb.1
+# CHECK:    B %bb.0
+body:             |
+  bb.0:
+    successors: %bb.0, %bb.1
+    %0(s1) = COPY %w0
+    G_BRCOND %0(s1), %bb.1
+    G_BR %bb.0
+
+  bb.1:
+...
+
+---
+# CHECK-LABEL: name: indirect_br
+name:            indirect_br
+legalized:       true
+regBankSelected: true
+
+registers:
+  - { id: 0, class: gpr }
+
+# CHECK:  body:
+# CHECK:   bb.0:
+# CHECK:    %0 = COPY %x0
+# CHECK:    BR %0
+body:             |
+  bb.0:
+    successors: %bb.0, %bb.1
+    %0(p0) = COPY %x0
+    G_BRINDIRECT %0(p0)
+
+  bb.1:
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-cbz.mir b/test/CodeGen/AArch64/GlobalISel/select-cbz.mir
new file mode 100644
index 0000000000000000000000000000000000000000..2decb994b967b463cecaeed25f00f1d4b002fa5c
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-cbz.mir
@@ -0,0 +1,108 @@
+# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+  define void @cbz_s32() { ret void }
+  define void @cbz_s64() { ret void }
+  define void @cbnz_s32() { ret void }
+  define void @cbnz_s64() { ret void }
+...
+
+---
+# CHECK-LABEL: name: cbz_s32
+name:            cbz_s32  # G_BRCOND fed by a 32-bit G_ICMP eq against constant 0 is expected to select to CBZW
+legalized:       true
+regBankSelected: true
+
+# CHECK:  body:
+# CHECK:   bb.0:
+# CHECK:    %0 = COPY %w0
+# CHECK:    CBZW %0, %bb.1
+# CHECK:    B %bb.0
+body:             |
+  bb.0:
+    liveins: %w0
+    successors: %bb.0, %bb.1
+
+    %0:gpr(s32) = COPY %w0
+    %1:gpr(s32) = G_CONSTANT i32 0
+    %2:gpr(s1) = G_ICMP intpred(eq), %0, %1
+    G_BRCOND %2(s1), %bb.1
+    G_BR %bb.0
+
+  bb.1:
+...
+
+---
+# CHECK-LABEL: name: cbz_s64
+name:            cbz_s64  # G_BRCOND fed by a 64-bit G_ICMP eq against constant 0 is expected to select to CBZX
+legalized:       true
+regBankSelected: true
+
+# CHECK:  body:
+# CHECK:   bb.0:
+# CHECK:    %0 = COPY %x0
+# CHECK:    CBZX %0, %bb.1
+# CHECK:    B %bb.0
+body:             |
+  bb.0:
+    liveins: %x0
+    successors: %bb.0, %bb.1
+
+    %0:gpr(s64) = COPY %x0
+    %1:gpr(s64) = G_CONSTANT i64 0
+    %2:gpr(s1) = G_ICMP intpred(eq), %0, %1
+    G_BRCOND %2(s1), %bb.1
+    G_BR %bb.0
+
+  bb.1:
+...
+
+---
+# CHECK-LABEL: name: cbnz_s32
+name:            cbnz_s32  # G_BRCOND fed by a 32-bit G_ICMP ne against constant 0 is expected to select to CBNZW
+legalized:       true
+regBankSelected: true
+
+# CHECK:  body:
+# CHECK:   bb.0:
+# CHECK:    %0 = COPY %w0
+# CHECK:    CBNZW %0, %bb.1
+# CHECK:    B %bb.0
+body:             |
+  bb.0:
+    liveins: %w0
+    successors: %bb.0, %bb.1
+
+    %0:gpr(s32) = COPY %w0
+    %1:gpr(s32) = G_CONSTANT i32 0
+    %2:gpr(s1) = G_ICMP intpred(ne), %0, %1
+    G_BRCOND %2(s1), %bb.1
+    G_BR %bb.0
+
+  bb.1:
+...
+
+---
+# CHECK-LABEL: name: cbnz_s64
+name:            cbnz_s64  # G_BRCOND fed by a 64-bit G_ICMP ne against constant 0 is expected to select to CBNZX
+legalized:       true
+regBankSelected: true
+
+# CHECK:  body:
+# CHECK:   bb.0:
+# CHECK:    %0 = COPY %x0
+# CHECK:    CBNZX %0, %bb.1
+# CHECK:    B %bb.0
+body:             |
+  bb.0:
+    liveins: %x0
+    successors: %bb.0, %bb.1
+
+    %0:gpr(s64) = COPY %x0
+    %1:gpr(s64) = G_CONSTANT i64 0
+    %2:gpr(s1) = G_ICMP intpred(ne), %0, %1
+    G_BRCOND %2(s1), %bb.1
+    G_BR %bb.0
+
+  bb.1:
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-constant.mir b/test/CodeGen/AArch64/GlobalISel/select-constant.mir
new file mode 100644
index 0000000000000000000000000000000000000000..1a5bac9fb7d6f4b3e9ed79c5378c414a456b1955
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-constant.mir
@@ -0,0 +1,77 @@
+# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define i32 @const_s32() { ret i32 42 }
+  define i64 @const_s64() { ret i64 1234567890123 }
+
+  define i32 @fconst_s32() { ret i32 42 }
+  define i64 @fconst_s64() { ret i64 1234567890123 }
+...
+
+---
+# CHECK-LABEL: name: const_s32
+name:            const_s32  # a 32-bit G_CONSTANT on the gpr bank is expected to select to MOVi32imm
+legalized:       true
+regBankSelected: true
+registers:
+  - { id: 0, class: gpr }  # %0: holds the s32 constant 42
+
+# CHECK:  body:
+# CHECK: %0 = MOVi32imm 42
+body:             |
+  bb.0:
+    %0(s32) = G_CONSTANT i32 42
+    %w0 = COPY %0(s32)
+...
+
+---
+# CHECK-LABEL: name: const_s64
+name:            const_s64  # a 64-bit G_CONSTANT on the gpr bank is expected to select to MOVi64imm
+legalized:       true
+regBankSelected: true
+registers:
+  - { id: 0, class: gpr }  # %0: holds the s64 constant 1234567890123
+
+# CHECK:  body:
+# CHECK: %0 = MOVi64imm 1234567890123
+body:             |
+  bb.0:
+    %0(s64) = G_CONSTANT i64 1234567890123
+    %x0 = COPY %0(s64)
+...
+
+---
+# CHECK-LABEL: name: fconst_s32
+name:            fconst_s32  # G_FCONSTANT float 3.5 on the fpr bank: expected as an integer MOVi32imm plus a COPY into %0
+legalized:       true
+regBankSelected: true
+registers:
+  - { id: 0, class: fpr }  # %0: the s32 floating-point constant
+
+# CHECK:  body:
+# CHECK: [[TMP:%[0-9]+]] = MOVi32imm 1080033280
+# CHECK: %0 = COPY [[TMP]]
+body:             |
+  bb.0:
+    %0(s32) = G_FCONSTANT float 3.5
+    %s0 = COPY %0(s32)
+...
+
+---
+# CHECK-LABEL: name: fconst_s64
+name:            fconst_s64  # G_FCONSTANT double 1.0 on the fpr bank: expected as an integer MOVi64imm plus a COPY into %0
+legalized:       true
+regBankSelected: true
+registers:
+  - { id: 0, class: fpr }  # %0: the s64 floating-point constant
+
+# CHECK:  body:
+# CHECK: [[TMP:%[0-9]+]] = MOVi64imm 4607182418800017408
+# CHECK: %0 = COPY [[TMP]]
+body:             |
+  bb.0:
+    %0(s64) = G_FCONSTANT double 1.0
+    %d0 = COPY %0(s64)
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-dbg-value.mir b/test/CodeGen/AArch64/GlobalISel/select-dbg-value.mir
new file mode 100644
index 0000000000000000000000000000000000000000..2f36ec8d2aaa9261f36e42fbb264b8fbe9b2dbe9
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-dbg-value.mir
@@ -0,0 +1,69 @@
+# RUN: llc -O0 -mtriple arm64-- -run-pass=instruction-select -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @test_dbg_value(i32 %a) !dbg !5 {
+    %tmp0 = add i32 %a, %a
+    call void @llvm.dbg.value(metadata i32 %tmp0, i64 0, metadata !7, metadata !9), !dbg !10
+    ret void
+  }
+
+  define void @test_dbg_value_dead(i32 %a) !dbg !5 {
+    call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !7, metadata !9), !dbg !10
+    ret void
+  }
+
+  declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3, !4}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "llvm", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+  !1 = !DIFile(filename: "test.ll", directory: "/tmp")
+  !2 = !{}
+  !3 = !{i32 2, !"Dwarf Version", i32 4}
+  !4 = !{i32 2, !"Debug Info Version", i32 3}
+  !5 = distinct !DISubprogram(name: "test_dbg_value", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+  !6 = !DISubroutineType(types: !2)
+  !7 = !DILocalVariable(name: "in", arg: 1, scope: !5, file: !1, line: 1, type: !8)
+  !8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !9 = !DIExpression()
+  !10 = !DILocation(line: 1, column: 1, scope: !5)
+...
+
+---
+# CHECK-LABEL: name: test_dbg_value
+name:            test_dbg_value  # the DBG_VALUE of %1 is expected to remain, directly after the selected COPY/ADDWrr/COPY sequence
+legalized:       true
+regBankSelected: true
+body: |
+  bb.0:
+    liveins: %w0
+    %0:gpr(s32) = COPY %w0
+    %1:gpr(s32) = G_ADD %0, %0
+    %w0 = COPY %1(s32)
+
+    ; CHECK:      %0 = COPY %w0
+    ; CHECK-NEXT: %1 = ADDWrr %0, %0
+    ; CHECK-NEXT: %w0 = COPY %1
+    ; CHECK-NEXT: DBG_VALUE debug-use %1, debug-use _, !7, !9, debug-location !10
+
+    DBG_VALUE debug-use %1(s32), debug-use _, !7, !9, debug-location !10
+...
+
+---
+# CHECK-LABEL: name: test_dbg_value_dead
+name:            test_dbg_value_dead  # %0 is only used by the DBG_VALUE: no COPY is expected in the output and the DBG_VALUE operand becomes _
+legalized:       true
+regBankSelected: true
+body: |
+  bb.0:
+    liveins: %w0
+    %0:gpr(s32) = COPY %w0
+
+    ; CHECK-NOT: COPY
+    ; CHECK: DBG_VALUE debug-use _, debug-use _, !7, !9, debug-location !10
+
+    DBG_VALUE debug-use %0(s32), debug-use _, !7, !9, debug-location !10
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-fp-casts.mir b/test/CodeGen/AArch64/GlobalISel/select-fp-casts.mir
new file mode 100644
index 0000000000000000000000000000000000000000..fbb11a1c7a4c6c8520ccac36651a3670cdb3f3e0
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-fp-casts.mir
@@ -0,0 +1,478 @@
+# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @fptrunc() { ret void }
+  define void @fpext() { ret void }
+
+  define void @sitofp_s32_s32_fpr() { ret void }
+  define void @sitofp_s32_s64_fpr() { ret void }
+  define void @sitofp_s64_s32_fpr() { ret void }
+  define void @sitofp_s64_s64_fpr() { ret void }
+
+  define void @uitofp_s32_s32_fpr() { ret void }
+  define void @uitofp_s32_s64_fpr() { ret void }
+  define void @uitofp_s64_s32_fpr() { ret void }
+  define void @uitofp_s64_s64_fpr() { ret void }
+
+  define void @fptosi_s32_s32_gpr() { ret void }
+  define void @fptosi_s32_s64_gpr() { ret void }
+  define void @fptosi_s64_s32_gpr() { ret void }
+  define void @fptosi_s64_s64_gpr() { ret void }
+
+  define void @fptoui_s32_s32_gpr() { ret void }
+  define void @fptoui_s32_s64_gpr() { ret void }
+  define void @fptoui_s64_s32_gpr() { ret void }
+  define void @fptoui_s64_s64_gpr() { ret void }
+...
+
+---
+# CHECK-LABEL: name: fptrunc
+name:            fptrunc  # G_FPTRUNC from s64 to s32 on the fpr bank is expected to select to FCVTSDr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK: - { id: 0, class: fpr64 }
+# CHECK: - { id: 1, class: fpr32 }
+registers:
+  - { id: 0, class: fpr }  # %0: s64 source, expected to become fpr64
+  - { id: 1, class: fpr }  # %1: s32 result, expected to become fpr32
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %d0
+# CHECK:    %1 = FCVTSDr %0
+body:             |
+  bb.0:
+    liveins: %d0
+
+    %0(s64) = COPY %d0
+    %1(s32) = G_FPTRUNC %0
+    %s0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: fpext
+name:            fpext  # G_FPEXT from s32 to s64 on the fpr bank is expected to select to FCVTDSr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK: - { id: 0, class: fpr32 }
+# CHECK: - { id: 1, class: fpr64 }
+registers:
+  - { id: 0, class: fpr }  # %0: s32 source, expected to become fpr32
+  - { id: 1, class: fpr }  # %1: s64 result, expected to become fpr64
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %s0
+# CHECK:    %1 = FCVTDSr %0
+body:             |
+  bb.0:
+    liveins: %s0
+
+    %0(s32) = COPY %s0
+    %1(s64) = G_FPEXT %0
+    %d0 = COPY %1(s64)
+...
+
+---
+# CHECK-LABEL: name: sitofp_s32_s32_fpr
+name:            sitofp_s32_s32_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: fpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = SCVTFUWSri %0
+body:             |
+  bb.0:
+    liveins: %w0
+
+    %0(s32) = COPY %w0
+    %1(s32) = G_SITOFP %0
+    %s0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: sitofp_s32_s64_fpr
+name:            sitofp_s32_s64_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: fpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = SCVTFUXSri %0
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(s64) = COPY %x0
+    %1(s32) = G_SITOFP %0
+    %s0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: sitofp_s64_s32_fpr
+name:            sitofp_s64_s32_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: fpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = SCVTFUWDri %0
+body:             |
+  bb.0:
+    liveins: %w0
+
+    %0(s32) = COPY %w0
+    %1(s64) = G_SITOFP %0
+    %d0 = COPY %1(s64)
+...
+
+---
+# CHECK-LABEL: name: sitofp_s64_s64_fpr
+name:            sitofp_s64_s64_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: fpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = SCVTFUXDri %0
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(s64) = COPY %x0
+    %1(s64) = G_SITOFP %0
+    %d0 = COPY %1(s64)
+...
+
+---
+# CHECK-LABEL: name: uitofp_s32_s32_fpr
+name:            uitofp_s32_s32_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: fpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = UCVTFUWSri %0
+body:             |
+  bb.0:
+    liveins: %w0
+
+    %0(s32) = COPY %w0
+    %1(s32) = G_UITOFP %0
+    %s0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: uitofp_s32_s64_fpr
+name:            uitofp_s32_s64_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: fpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = UCVTFUXSri %0
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(s64) = COPY %x0
+    %1(s32) = G_UITOFP %0
+    %s0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: uitofp_s64_s32_fpr
+name:            uitofp_s64_s32_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: fpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = UCVTFUWDri %0
+body:             |
+  bb.0:
+    liveins: %w0
+
+    %0(s32) = COPY %w0
+    %1(s64) = G_UITOFP %0
+    %d0 = COPY %1(s64)
+...
+
+---
+# CHECK-LABEL: name: uitofp_s64_s64_fpr
+name:            uitofp_s64_s64_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: fpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: fpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = UCVTFUXDri %0
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(s64) = COPY %x0
+    %1(s64) = G_UITOFP %0
+    %d0 = COPY %1(s64)
+...
+
+---
+# CHECK-LABEL: name: fptosi_s32_s32_gpr
+name:            fptosi_s32_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %s0
+# CHECK:    %1 = FCVTZSUWSr %0
+body:             |
+  bb.0:
+    liveins: %s0
+
+    %0(s32) = COPY %s0
+    %1(s32) = G_FPTOSI %0
+    %w0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: fptosi_s32_s64_gpr
+name:            fptosi_s32_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %d0
+# CHECK:    %1 = FCVTZSUWDr %0
+body:             |
+  bb.0:
+    liveins: %d0
+
+    %0(s64) = COPY %d0
+    %1(s32) = G_FPTOSI %0
+    %w0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: fptosi_s64_s32_gpr
+name:            fptosi_s64_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %s0
+# CHECK:    %1 = FCVTZSUXSr %0
+body:             |
+  bb.0:
+    liveins: %s0
+
+    %0(s32) = COPY %s0
+    %1(s64) = G_FPTOSI %0
+    %x0 = COPY %1(s64)
+...
+
+---
+# CHECK-LABEL: name: fptosi_s64_s64_gpr
+name:            fptosi_s64_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %d0
+# CHECK:    %1 = FCVTZSUXDr %0
+body:             |
+  bb.0:
+    liveins: %d0
+
+    %0(s64) = COPY %d0
+    %1(s64) = G_FPTOSI %0
+    %x0 = COPY %1(s64)
+...
+
+---
+# CHECK-LABEL: name: fptoui_s32_s32_gpr
+name:            fptoui_s32_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %s0
+# CHECK:    %1 = FCVTZUUWSr %0
+body:             |
+  bb.0:
+    liveins: %s0
+
+    %0(s32) = COPY %s0
+    %1(s32) = G_FPTOUI %0
+    %w0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: fptoui_s32_s64_gpr
+name:            fptoui_s32_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %d0
+# CHECK:    %1 = FCVTZUUWDr %0
+body:             |
+  bb.0:
+    liveins: %d0
+
+    %0(s64) = COPY %d0
+    %1(s32) = G_FPTOUI %0
+    %w0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: fptoui_s64_s32_gpr
+name:            fptoui_s64_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %s0
+# CHECK:    %1 = FCVTZUUXSr %0
+body:             |
+  bb.0:
+    liveins: %s0
+
+    %0(s32) = COPY %s0
+    %1(s64) = G_FPTOUI %0
+    %x0 = COPY %1(s64)
+...
+
+---
+# CHECK-LABEL: name: fptoui_s64_s64_gpr
+name:            fptoui_s64_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %d0
+# CHECK:    %1 = FCVTZUUXDr %0
+body:             |
+  bb.0:
+    liveins: %d0
+
+    %0(s64) = COPY %d0
+    %1(s64) = G_FPTOUI %0
+    %x0 = COPY %1(s64)
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-int-ext.mir b/test/CodeGen/AArch64/GlobalISel/select-int-ext.mir
new file mode 100644
index 0000000000000000000000000000000000000000..2ba8b7366252eff4a02cd94d707c54b012167794
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-int-ext.mir
@@ -0,0 +1,274 @@
+# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @anyext_s64_from_s32() { ret void }
+  define void @anyext_s32_from_s8() { ret void }
+
+  define void @zext_s64_from_s32() { ret void }
+  define void @zext_s32_from_s16() { ret void }
+  define void @zext_s32_from_s8() { ret void }
+  define void @zext_s16_from_s8() { ret void }
+
+  define void @sext_s64_from_s32() { ret void }
+  define void @sext_s32_from_s16() { ret void }
+  define void @sext_s32_from_s8() { ret void }
+  define void @sext_s16_from_s8() { ret void }
+...
+
+---
+# CHECK-LABEL: name: anyext_s64_from_s32
+name:            anyext_s64_from_s32
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32all }
+# CHECK-NEXT:  - { id: 1, class: gpr64all }
+# CHECK-NEXT:  - { id: 2, class: gpr64all }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %2 = SUBREG_TO_REG 0, %0, 15
+# CHECK:    %1 = COPY %2
+body:             |
+  bb.0:
+    liveins: %w0
+
+    %0(s32) = COPY %w0
+    %1(s64) = G_ANYEXT %0
+    %x0 = COPY %1(s64)
+...
+
+---
+# CHECK-LABEL: name: anyext_s32_from_s8
+name:            anyext_s32_from_s8
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32all }
+# CHECK-NEXT:  - { id: 1, class: gpr32all }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = COPY %0
+body:             |
+  bb.0:
+    liveins: %w0
+
+    %0(s8) = COPY %w0
+    %1(s32) = G_ANYEXT %0
+    %w0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: zext_s64_from_s32
+name:            zext_s64_from_s32
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+# CHECK-NEXT:  - { id: 2, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %2 = SUBREG_TO_REG 0, %0, 15
+# CHECK:    %1 = UBFMXri %2, 0, 31
+body:             |
+  bb.0:
+    liveins: %w0
+
+    %0(s32) = COPY %w0
+    %1(s64) = G_ZEXT %0
+    %x0 = COPY %1(s64)
+...
+
+---
+# CHECK-LABEL: name: zext_s32_from_s16
+name:            zext_s32_from_s16
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = UBFMWri %0, 0, 15
+body:             |
+  bb.0:
+    liveins: %w0
+
+    %0(s16) = COPY %w0
+    %1(s32) = G_ZEXT %0
+    %w0 = COPY %1
+...
+
+---
+# CHECK-LABEL: name: zext_s32_from_s8
+name:            zext_s32_from_s8
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = UBFMWri %0, 0, 7
+body:             |
+  bb.0:
+    liveins: %w0
+
+    %0(s8) = COPY %w0
+    %1(s32) = G_ZEXT %0
+    %w0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: zext_s16_from_s8
+name:            zext_s16_from_s8
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = UBFMWri %0, 0, 7
+body:             |
+  bb.0:
+    liveins: %w0
+
+    %0(s8) = COPY %w0
+    %1(s16) = G_ZEXT %0
+    %w0 = COPY %1(s16)
+...
+
+---
+# CHECK-LABEL: name: sext_s64_from_s32
+name:            sext_s64_from_s32
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+# CHECK-NEXT:  - { id: 2, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %2 = SUBREG_TO_REG 0, %0, 15
+# CHECK:    %1 = SBFMXri %2, 0, 31
+body:             |
+  bb.0:
+    liveins: %w0
+
+    %0(s32) = COPY %w0
+    %1(s64) = G_SEXT %0
+    %x0 = COPY %1(s64)
+...
+
+---
+# CHECK-LABEL: name: sext_s32_from_s16
+name:            sext_s32_from_s16
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = SBFMWri %0, 0, 15
+body:             |
+  bb.0:
+    liveins: %w0
+
+    %0(s16) = COPY %w0
+    %1(s32) = G_SEXT %0
+    %w0 = COPY %1
+...
+
+---
+# CHECK-LABEL: name: sext_s32_from_s8
+name:            sext_s32_from_s8
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = SBFMWri %0, 0, 7
+body:             |
+  bb.0:
+    liveins: %w0
+
+    %0(s8) = COPY %w0
+    %1(s32) = G_SEXT %0
+    %w0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: sext_s16_from_s8
+name:            sext_s16_from_s8
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = SBFMWri %0, 0, 7
+body:             |
+  bb.0:
+    liveins: %w0
+
+    %0(s8) = COPY %w0
+    %1(s16) = G_SEXT %0
+    %w0 = COPY %1(s16)
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-int-ptr-casts.mir b/test/CodeGen/AArch64/GlobalISel/select-int-ptr-casts.mir
new file mode 100644
index 0000000000000000000000000000000000000000..6537408f6d9874cc6a256351992310105de02350
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-int-ptr-casts.mir
@@ -0,0 +1,150 @@
+# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @inttoptr_p0_s64() { ret void }
+  define void @ptrtoint_s64_p0() { ret void }
+  define void @ptrtoint_s32_p0() { ret void }
+  define void @ptrtoint_s16_p0() { ret void }
+  define void @ptrtoint_s8_p0() { ret void }
+  define void @ptrtoint_s1_p0() { ret void }
+...
+
+---
+# CHECK-LABEL: name: inttoptr_p0_s64
+name:            inttoptr_p0_s64
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64all }
+# CHECK-NEXT:  - { id: 1, class: gpr64all }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %0
+body:             |
+  bb.0:
+    liveins: %x0
+    %0(s64) = COPY %x0
+    %1(p0) = G_INTTOPTR %0
+    %x0 = COPY %1(p0)
+...
+
+---
+# CHECK-LABEL: name: ptrtoint_s64_p0
+name:            ptrtoint_s64_p0
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %0
+body:             |
+  bb.0:
+    liveins: %x0
+    %0(p0) = COPY %x0
+    %1(s64) = G_PTRTOINT %0
+    %x0 = COPY %1(s64)
+...
+
+---
+# CHECK-LABEL: name: ptrtoint_s32_p0
+name:            ptrtoint_s32_p0
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %0.sub_32
+body:             |
+  bb.0:
+    liveins: %x0
+    %0(p0) = COPY %x0
+    %1(s32) = G_PTRTOINT %0
+    %w0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: ptrtoint_s16_p0
+name:            ptrtoint_s16_p0
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %0.sub_32
+body:             |
+  bb.0:
+    liveins: %x0
+    %0(p0) = COPY %x0
+    %1(s16) = G_PTRTOINT %0
+    %w0 = COPY %1(s16)
+...
+
+---
+# CHECK-LABEL: name: ptrtoint_s8_p0
+name:            ptrtoint_s8_p0
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %0.sub_32
+body:             |
+  bb.0:
+    liveins: %x0
+    %0(p0) = COPY %x0
+    %1(s8) = G_PTRTOINT %0
+    %w0 = COPY %1(s8)
+...
+
+---
+# CHECK-LABEL: name: ptrtoint_s1_p0
+name:            ptrtoint_s1_p0
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %0.sub_32
+body:             |
+  bb.0:
+    liveins: %x0
+    %0(p0) = COPY %x0
+    %1(s1) = G_PTRTOINT %0
+    %w0 = COPY %1(s1)
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-load.mir b/test/CodeGen/AArch64/GlobalISel/select-load.mir
new file mode 100644
index 0000000000000000000000000000000000000000..9188e2b0c0fcc3b10666f9ef20df17f04394c659
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-load.mir
@@ -0,0 +1,515 @@
+# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @load_s64_gpr(i64* %addr) { ret void }
+  define void @load_s32_gpr(i32* %addr) { ret void }
+  define void @load_s16_gpr(i16* %addr) { ret void }
+  define void @load_s8_gpr(i8* %addr) { ret void }
+
+  define void @load_fi_s64_gpr() {
+    %ptr0 = alloca i64
+    ret void
+  }
+
+  define void @load_gep_128_s64_gpr(i64* %addr) { ret void }
+  define void @load_gep_512_s32_gpr(i32* %addr) { ret void }
+  define void @load_gep_64_s16_gpr(i16* %addr) { ret void }
+  define void @load_gep_1_s8_gpr(i8* %addr) { ret void }
+
+  define void @load_s64_fpr(i64* %addr) { ret void }
+  define void @load_s32_fpr(i32* %addr) { ret void }
+  define void @load_s16_fpr(i16* %addr) { ret void }
+  define void @load_s8_fpr(i8* %addr) { ret void }
+
+  define void @load_gep_8_s64_fpr(i64* %addr) { ret void }
+  define void @load_gep_16_s32_fpr(i32* %addr) { ret void }
+  define void @load_gep_64_s16_fpr(i16* %addr) { ret void }
+  define void @load_gep_32_s8_fpr(i8* %addr) { ret void }
+
+...
+
+---
+# CHECK-LABEL: name: load_s64_gpr
+name:            load_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = LDRXui %0, 0 :: (load 8 from %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(p0) = COPY %x0
+    %1(s64) = G_LOAD  %0 :: (load 8 from %ir.addr)
+    %x0 = COPY %1(s64)
+...
+
+---
+# CHECK-LABEL: name: load_s32_gpr
+name:            load_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = LDRWui %0, 0 :: (load 4 from %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(p0) = COPY %x0
+    %1(s32) = G_LOAD  %0 :: (load 4 from %ir.addr)
+    %w0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: load_s16_gpr
+name:            load_s16_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = LDRHHui %0, 0 :: (load 2 from %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(p0) = COPY %x0
+    %1(s16) = G_LOAD  %0 :: (load 2 from %ir.addr)
+    %w0 = COPY %1(s16)
+...
+
+---
+# CHECK-LABEL: name: load_s8_gpr
+name:            load_s8_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = LDRBBui %0, 0 :: (load 1 from %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(p0) = COPY %x0
+    %1(s8) = G_LOAD  %0 :: (load 1 from %ir.addr)
+    %w0 = COPY %1(s8)
+...
+
+---
+# CHECK-LABEL: name: load_fi_s64_gpr
+name:            load_fi_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+stack:
+  - { id: 0, name: ptr0, offset: 0, size: 8, alignment: 8 }
+
+# CHECK:  body:
+# CHECK: %1 = LDRXui %stack.0.ptr0, 0 :: (load 8)
+# CHECK: %x0 = COPY %1
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(p0) = G_FRAME_INDEX %stack.0.ptr0
+    %1(s64) = G_LOAD %0 :: (load 8)
+    %x0 = COPY %1(s64)
+...
+
+---
+# CHECK-LABEL: name: load_gep_128_s64_gpr
+name:            load_gep_128_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr }
+# CHECK-NEXT:  - { id: 2, class: gpr }
+# CHECK-NEXT:  - { id: 3, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %3 = LDRXui %0, 16 :: (load 8 from %ir.addr)
+# CHECK: %x0 = COPY %3
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(p0) = COPY %x0
+    %1(s64) = G_CONSTANT i64 128
+    %2(p0) = G_GEP %0, %1
+    %3(s64) = G_LOAD %2 :: (load 8 from %ir.addr)
+    %x0 = COPY %3
+...
+
+---
+# CHECK-LABEL: name: load_gep_512_s32_gpr
+name:            load_gep_512_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr }
+# CHECK-NEXT:  - { id: 2, class: gpr }
+# CHECK-NEXT:  - { id: 3, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %3 = LDRWui %0, 128 :: (load 4 from %ir.addr)
+# CHECK: %w0 = COPY %3
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(p0) = COPY %x0
+    %1(s64) = G_CONSTANT i64 512
+    %2(p0) = G_GEP %0, %1
+    %3(s32) = G_LOAD %2 :: (load 4 from %ir.addr)
+    %w0 = COPY %3
+...
+
+---
+# CHECK-LABEL: name: load_gep_64_s16_gpr
+name:            load_gep_64_s16_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr }
+# CHECK-NEXT:  - { id: 2, class: gpr }
+# CHECK-NEXT:  - { id: 3, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %3 = LDRHHui %0, 32 :: (load 2 from %ir.addr)
+# CHECK: %w0 = COPY %3
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(p0) = COPY %x0
+    %1(s64) = G_CONSTANT i64 64
+    %2(p0) = G_GEP %0, %1
+    %3(s16) = G_LOAD %2 :: (load 2 from %ir.addr)
+    %w0 = COPY %3
+...
+
+---
+# CHECK-LABEL: name: load_gep_1_s8_gpr
+name:            load_gep_1_s8_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr }
+# CHECK-NEXT:  - { id: 2, class: gpr }
+# CHECK-NEXT:  - { id: 3, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %3 = LDRBBui %0, 1 :: (load 1 from %ir.addr)
+# CHECK: %w0 = COPY %3
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(p0) = COPY %x0
+    %1(s64) = G_CONSTANT i64 1
+    %2(p0) = G_GEP %0, %1
+    %3(s8) = G_LOAD %2 :: (load 1 from %ir.addr)
+    %w0 = COPY %3
+...
+
+---
+# CHECK-LABEL: name: load_s64_fpr
+name:            load_s64_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: fpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: fpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = LDRDui %0, 0 :: (load 8 from %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(p0) = COPY %x0
+    %1(s64) = G_LOAD  %0 :: (load 8 from %ir.addr)
+    %d0 = COPY %1(s64)
+...
+
+---
+# CHECK-LABEL: name: load_s32_fpr
+name:            load_s32_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: fpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: fpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = LDRSui %0, 0 :: (load 4 from %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(p0) = COPY %x0
+    %1(s32) = G_LOAD  %0 :: (load 4 from %ir.addr)
+    %s0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: load_s16_fpr
+name:            load_s16_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: fpr16 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: fpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = LDRHui %0, 0 :: (load 2 from %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(p0) = COPY %x0
+    %1(s16) = G_LOAD  %0 :: (load 2 from %ir.addr)
+    %h0 = COPY %1(s16)
+...
+
+---
+# CHECK-LABEL: name: load_s8_fpr
+name:            load_s8_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: fpr8 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: fpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = LDRBui %0, 0 :: (load 1 from %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(p0) = COPY %x0
+    %1(s8) = G_LOAD  %0 :: (load 1 from %ir.addr)
+    %b0 = COPY %1(s8)
+...
+
+---
+# CHECK-LABEL: name: load_gep_8_s64_fpr
+name:            load_gep_8_s64_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr }
+# CHECK-NEXT:  - { id: 2, class: gpr }
+# CHECK-NEXT:  - { id: 3, class: fpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: fpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %3 = LDRDui %0, 1 :: (load 8 from %ir.addr)
+# CHECK: %d0 = COPY %3
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(p0) = COPY %x0
+    %1(s64) = G_CONSTANT i64 8
+    %2(p0) = G_GEP %0, %1
+    %3(s64) = G_LOAD %2 :: (load 8 from %ir.addr)
+    %d0 = COPY %3
+...
+
+---
+# CHECK-LABEL: name: load_gep_16_s32_fpr
+name:            load_gep_16_s32_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr }
+# CHECK-NEXT:  - { id: 2, class: gpr }
+# CHECK-NEXT:  - { id: 3, class: fpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: fpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %3 = LDRSui %0, 4 :: (load 4 from %ir.addr)
+# CHECK: %s0 = COPY %3
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(p0) = COPY %x0
+    %1(s64) = G_CONSTANT i64 16
+    %2(p0) = G_GEP %0, %1
+    %3(s32) = G_LOAD %2 :: (load 4 from %ir.addr)
+    %s0 = COPY %3
+...
+
+---
+# CHECK-LABEL: name: load_gep_64_s16_fpr
+name:            load_gep_64_s16_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr }
+# CHECK-NEXT:  - { id: 2, class: gpr }
+# CHECK-NEXT:  - { id: 3, class: fpr16 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: fpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %3 = LDRHui %0, 32 :: (load 2 from %ir.addr)
+# CHECK: %h0 = COPY %3
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(p0) = COPY %x0
+    %1(s64) = G_CONSTANT i64 64
+    %2(p0) = G_GEP %0, %1
+    %3(s16) = G_LOAD %2 :: (load 2 from %ir.addr)
+    %h0 = COPY %3
+...
+
+---
+# CHECK-LABEL: name: load_gep_32_s8_fpr
+name:            load_gep_32_s8_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr }
+# CHECK-NEXT:  - { id: 2, class: gpr }
+# CHECK-NEXT:  - { id: 3, class: fpr8 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: fpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %3 = LDRBui %0, 32 :: (load 1 from %ir.addr)
+# CHECK: %b0 = COPY %3
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(p0) = COPY %x0
+    %1(s64) = G_CONSTANT i64 32
+    %2(p0) = G_GEP %0, %1
+    %3(s8) = G_LOAD %2 :: (load 1 from %ir.addr)
+    %b0 = COPY %3
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-muladd.mir b/test/CodeGen/AArch64/GlobalISel/select-muladd.mir
new file mode 100644
index 0000000000000000000000000000000000000000..7d5b43bc16d5be187a63d72ea6b997d6550c5489
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-muladd.mir
@@ -0,0 +1,50 @@
+# RUN: llc -O0 -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @SMADDLrrr_gpr() { ret void }
+...
+
+---
+# CHECK-LABEL: name: SMADDLrrr_gpr
+name:            SMADDLrrr_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+# CHECK-NEXT:  - { id: 2, class: gpr32 }
+# CHECK-NEXT:  - { id: 3, class: gpr }
+# CHECK-NEXT:  - { id: 4, class: gpr }
+# CHECK-NEXT:  - { id: 5, class: gpr }
+# CHECK-NEXT:  - { id: 6, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+  - { id: 4, class: gpr }
+  - { id: 5, class: gpr }
+  - { id: 6, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %w1
+# CHECK:    %2 = COPY %w2
+# CHECK:    %6 = SMADDLrrr %1, %2, %0
+body:             |
+  bb.0:
+    liveins: %x0, %w1, %w2
+
+    %0(s64) = COPY %x0
+    %1(s32) = COPY %w1
+    %2(s32) = COPY %w2
+    %3(s64) = G_SEXT %1
+    %4(s64) = G_SEXT %2
+    %5(s64) = G_MUL %3, %4
+    %6(s64) = G_ADD %0, %5
+    %x0 = COPY %6
+...
+
diff --git a/test/CodeGen/AArch64/GlobalISel/select-property.mir b/test/CodeGen/AArch64/GlobalISel/select-property.mir
new file mode 100644
index 0000000000000000000000000000000000000000..86961ac597e172821abe2e2eadc5f0842de94ed8
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-property.mir
@@ -0,0 +1,21 @@
+# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @selected_property() { ret void }
+...
+
+---
+# Check that we set the "selected" property.
+# CHECK-LABEL: name: selected_property
+# CHECK: legalized: true
+# CHECK-NEXT: regBankSelected: true
+# CHECK-NEXT: selected: true
+name:            selected_property
+legalized:       true
+regBankSelected: true
+selected:        false
+body:             |
+  bb.0:
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-store.mir b/test/CodeGen/AArch64/GlobalISel/select-store.mir
new file mode 100644
index 0000000000000000000000000000000000000000..9b8f5c566ce0de3d5a11cb0cdd09a1f8fdb51f07
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-store.mir
@@ -0,0 +1,463 @@
+# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @store_s64_gpr(i64* %addr) { ret void }
+  define void @store_s32_gpr(i32* %addr) { ret void }
+  define void @store_s16_gpr(i16* %addr) { ret void }
+  define void @store_s8_gpr(i8* %addr) { ret void }
+
+  define void @store_zero_s64_gpr(i64* %addr) { ret void }
+  define void @store_zero_s32_gpr(i32* %addr) { ret void }
+
+  define void @store_fi_s64_gpr() {
+    %ptr0 = alloca i64
+    ret void
+  }
+
+  define void @store_gep_128_s64_gpr(i64* %addr) { ret void }
+  define void @store_gep_512_s32_gpr(i32* %addr) { ret void }
+  define void @store_gep_64_s16_gpr(i16* %addr) { ret void }
+  define void @store_gep_1_s8_gpr(i8* %addr) { ret void }
+
+  define void @store_s64_fpr(i64* %addr) { ret void }
+  define void @store_s32_fpr(i32* %addr) { ret void }
+
+  define void @store_gep_8_s64_fpr(i64* %addr) { ret void }
+  define void @store_gep_8_s32_fpr(i32* %addr) { ret void }
+...
+
+---
+# CHECK-LABEL: name: store_s64_gpr
+name:            store_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = COPY %x1
+# CHECK: STRXui %1, %0, 0 :: (store 8 into %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0, %x1
+
+    %0(p0) = COPY %x0
+    %1(s64) = COPY %x1
+    G_STORE  %1, %0 :: (store 8 into %ir.addr)
+
+...
+
+---
+# CHECK-LABEL: name: store_s32_gpr
+name:            store_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = COPY %w1
+# CHECK: STRWui %1, %0, 0 :: (store 4 into %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0, %w1
+
+    %0(p0) = COPY %x0
+    %1(s32) = COPY %w1
+    G_STORE  %1, %0 :: (store 4 into %ir.addr)
+
+...
+
+---
+# CHECK-LABEL: name: store_s16_gpr
+name:            store_s16_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = COPY %w1
+# CHECK: STRHHui %1, %0, 0 :: (store 2 into %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0, %w1
+
+    %0(p0) = COPY %x0
+    %1(s16) = COPY %w1
+    G_STORE  %1, %0 :: (store 2 into %ir.addr)
+
+...
+
+---
+# CHECK-LABEL: name: store_s8_gpr
+name:            store_s8_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = COPY %w1
+# CHECK: STRBBui %1, %0, 0 :: (store 1 into %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0, %w1
+
+    %0(p0) = COPY %x0
+    %1(s8) = COPY %w1
+    G_STORE  %1, %0 :: (store 1 into %ir.addr)
+
+...
+
+---
+# CHECK-LABEL: name: store_zero_s64_gpr
+name:            store_zero_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: STRXui %xzr, %0, 0 :: (store 8 into %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0, %x1
+
+    %0(p0) = COPY %x0
+    %1(s64) = G_CONSTANT i64 0
+    G_STORE  %1, %0 :: (store 8 into %ir.addr)
+
+...
+
+---
+# CHECK-LABEL: name: store_zero_s32_gpr
+name:            store_zero_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: STRWui %wzr, %0, 0 :: (store 4 into %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(p0) = COPY %x0
+    %1(s32) = G_CONSTANT i32 0
+    G_STORE  %1, %0 :: (store 4 into %ir.addr)
+
+...
+
+---
+# CHECK-LABEL: name: store_fi_s64_gpr
+name:            store_fi_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+stack:
+  - { id: 0, name: ptr0, offset: 0, size: 8, alignment: 8 }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: STRXui %0, %stack.0.ptr0, 0 :: (store 8)
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(p0) = COPY %x0
+    %1(p0) = G_FRAME_INDEX %stack.0.ptr0
+    G_STORE  %0, %1 :: (store 8)
+...
+
+---
+# CHECK-LABEL: name: store_gep_128_s64_gpr
+name:            store_gep_128_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+# CHECK-NEXT:  - { id: 2, class: gpr }
+# CHECK-NEXT:  - { id: 3, class: gpr }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = COPY %x1
+# CHECK: STRXui %1, %0, 16 :: (store 8 into %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0, %x1
+
+    %0(p0) = COPY %x0
+    %1(s64) = COPY %x1
+    %2(s64) = G_CONSTANT i64 128
+    %3(p0) = G_GEP %0, %2
+    G_STORE %1, %3 :: (store 8 into %ir.addr)
+...
+
+---
+# CHECK-LABEL: name: store_gep_512_s32_gpr
+name:            store_gep_512_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+# CHECK-NEXT:  - { id: 2, class: gpr }
+# CHECK-NEXT:  - { id: 3, class: gpr }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = COPY %w1
+# CHECK: STRWui %1, %0, 128 :: (store 4 into %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0, %w1
+
+    %0(p0) = COPY %x0
+    %1(s32) = COPY %w1
+    %2(s64) = G_CONSTANT i64 512
+    %3(p0) = G_GEP %0, %2
+    G_STORE %1, %3 :: (store 4 into %ir.addr)
+...
+
+---
+# CHECK-LABEL: name: store_gep_64_s16_gpr
+name:            store_gep_64_s16_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+# CHECK-NEXT:  - { id: 2, class: gpr }
+# CHECK-NEXT:  - { id: 3, class: gpr }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = COPY %w1
+# CHECK: STRHHui %1, %0, 32 :: (store 2 into %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0, %w1
+
+    %0(p0) = COPY %x0
+    %1(s16) = COPY %w1
+    %2(s64) = G_CONSTANT i64 64
+    %3(p0) = G_GEP %0, %2
+    G_STORE %1, %3 :: (store 2 into %ir.addr)
+...
+
+---
+# CHECK-LABEL: name: store_gep_1_s8_gpr
+name:            store_gep_1_s8_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+# CHECK-NEXT:  - { id: 2, class: gpr }
+# CHECK-NEXT:  - { id: 3, class: gpr }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = COPY %w1
+# CHECK: STRBBui %1, %0, 1 :: (store 1 into %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0, %w1
+
+    %0(p0) = COPY %x0
+    %1(s8) = COPY %w1
+    %2(s64) = G_CONSTANT i64 1
+    %3(p0) = G_GEP %0, %2
+    G_STORE %1, %3 :: (store 1 into %ir.addr)
+...
+
+---
+# CHECK-LABEL: name: store_s64_fpr
+name:            store_s64_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: fpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: fpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = COPY %d1
+# CHECK: STRDui %1, %0, 0 :: (store 8 into %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0, %d1
+
+    %0(p0) = COPY %x0
+    %1(s64) = COPY %d1
+    G_STORE  %1, %0 :: (store 8 into %ir.addr)
+
+...
+
+---
+# CHECK-LABEL: name: store_s32_fpr
+name:            store_s32_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: fpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: fpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = COPY %s1
+# CHECK: STRSui %1, %0, 0 :: (store 4 into %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0, %s1
+
+    %0(p0) = COPY %x0
+    %1(s32) = COPY %s1
+    G_STORE  %1, %0 :: (store 4 into %ir.addr)
+
+...
+
+---
+# CHECK-LABEL: name: store_gep_8_s64_fpr
+name:            store_gep_8_s64_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: fpr64 }
+# CHECK-NEXT:  - { id: 2, class: gpr }
+# CHECK-NEXT:  - { id: 3, class: gpr }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: fpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = COPY %d1
+# CHECK: STRDui %1, %0, 1 :: (store 8 into %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0, %d1
+
+    %0(p0) = COPY %x0
+    %1(s64) = COPY %d1
+    %2(s64) = G_CONSTANT i64 8
+    %3(p0) = G_GEP %0, %2
+    G_STORE %1, %3 :: (store 8 into %ir.addr)
+...
+
+---
+# CHECK-LABEL: name: store_gep_8_s32_fpr
+name:            store_gep_8_s32_fpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+# CHECK-NEXT:  - { id: 1, class: fpr32 }
+# CHECK-NEXT:  - { id: 2, class: gpr }
+# CHECK-NEXT:  - { id: 3, class: gpr }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: fpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+
+# CHECK:  body:
+# CHECK: %0 = COPY %x0
+# CHECK: %1 = COPY %s1
+# CHECK: STRSui %1, %0, 2 :: (store 4 into %ir.addr)
+body:             |
+  bb.0:
+    liveins: %x0, %s1
+
+    %0(p0) = COPY %x0
+    %1(s32) = COPY %s1
+    %2(s64) = G_CONSTANT i64 8
+    %3(p0) = G_GEP %0, %2
+    G_STORE %1, %3 :: (store 4 into %ir.addr)
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-trunc.mir b/test/CodeGen/AArch64/GlobalISel/select-trunc.mir
new file mode 100644
index 0000000000000000000000000000000000000000..fc3546e777f703f60c28df53ff5cadea942e3a94
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-trunc.mir
@@ -0,0 +1,81 @@
+# RUN: llc -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @trunc_s32_s64() { ret void }
+  define void @trunc_s8_s64() { ret void }
+  define void @trunc_s1_s32() { ret void }
+...
+
+---
+# CHECK-LABEL: name: trunc_s32_s64
+name:            trunc_s32_s64
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %1 = COPY %0.sub_32
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(s64) = COPY %x0
+    %1(s32) = G_TRUNC %0
+    %w0 = COPY %1(s32)
+...
+
+---
+# CHECK-LABEL: name: trunc_s8_s64
+name:            trunc_s8_s64
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %1 = COPY %0.sub_32
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(s64) = COPY %x0
+    %1(s8) = G_TRUNC %0
+    %w0 = COPY %1(s8)
+...
+
+---
+# CHECK-LABEL: name: trunc_s1_s32
+name:            trunc_s1_s32
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %1 = COPY %0
+body:             |
+  bb.0:
+    liveins: %w0
+
+    %0(s32) = COPY %w0
+    %1(s1) = G_TRUNC %0
+    %w0 = COPY %1(s1)
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select-xor.mir b/test/CodeGen/AArch64/GlobalISel/select-xor.mir
new file mode 100644
index 0000000000000000000000000000000000000000..e787849c8d1bf4dcd55ec93863269b723faefe02
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select-xor.mir
@@ -0,0 +1,165 @@
+# RUN: llc -O0 -mtriple=aarch64-- -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @xor_s32_gpr() { ret void }
+  define void @xor_s64_gpr() { ret void }
+  define void @xor_constant_n1_s32_gpr() { ret void }
+  define void @xor_constant_n1_s64_gpr() { ret void }
+  define void @xor_constant_n1_s32_gpr_2bb() { ret void }
+
+...
+
+---
+# Check that we select a 32-bit GPR G_XOR into EORWrr on GPR32.
+# Also check that we constrain the register class of the COPY to GPR32.
+# CHECK-LABEL: name: xor_s32_gpr
+name:            xor_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+# CHECK-NEXT:  - { id: 2, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %1 = COPY %w1
+# CHECK:    %2 = EORWrr %0, %1
+body:             |
+  bb.0:
+    liveins: %w0, %w1
+
+    %0(s32) = COPY %w0
+    %1(s32) = COPY %w1
+    %2(s32) = G_XOR %0, %1
+    %w0 = COPY %2(s32)
+...
+
+---
+# Same as xor_s32_gpr, for 64-bit operations.
+# CHECK-LABEL: name: xor_s64_gpr
+name:            xor_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr64 }
+# CHECK-NEXT:  - { id: 2, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %1 = COPY %x1
+# CHECK:    %2 = EORXrr %0, %1
+body:             |
+  bb.0:
+    liveins: %x0, %x1
+
+    %0(s64) = COPY %x0
+    %1(s64) = COPY %x1
+    %2(s64) = G_XOR %0, %1
+    %x0 = COPY %2(s64)
+...
+
+---
+# Check that we select a 32-bit GPR G_XOR with a -1 constant into ORNWrr on GPR32.
+# Also check that we constrain the register class of the COPY to GPR32.
+# CHECK-LABEL: name: xor_constant_n1_s32_gpr
+name:            xor_constant_n1_s32_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr }
+# CHECK-NEXT:  - { id: 2, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %w0
+# CHECK:    %2 = ORNWrr %wzr, %0
+body:             |
+  bb.0:
+    liveins: %w0
+
+    %0(s32) = COPY %w0
+    %1(s32) = G_CONSTANT i32 -1
+    %2(s32) = G_XOR %0, %1
+    %w0 = COPY %2(s32)
+...
+
+---
+# Same as xor_constant_n1_s32_gpr, for 64-bit operations.
+# CHECK-LABEL: name: xor_constant_n1_s64_gpr
+name:            xor_constant_n1_s64_gpr
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64 }
+# CHECK-NEXT:  - { id: 1, class: gpr }
+# CHECK-NEXT:  - { id: 2, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %0 = COPY %x0
+# CHECK:    %2 = ORNXrr %xzr, %0
+body:             |
+  bb.0:
+    liveins: %x0
+
+    %0(s64) = COPY %x0
+    %1(s64) = G_CONSTANT i64 -1
+    %2(s64) = G_XOR %0, %1
+    %x0 = COPY %2(s64)
+...
+
+---
+# Check that we can obtain constants from other basic blocks.
+# CHECK-LABEL: name: xor_constant_n1_s32_gpr_2bb
+name:            xor_constant_n1_s32_gpr_2bb
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr }
+# CHECK-NEXT:  - { id: 2, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK:    B %bb.1
+# CHECK:    %0 = COPY %w0
+# CHECK:    %2 = ORNWrr %wzr, %0
+
+body:             |
+  bb.0:
+    liveins: %w0, %w1
+    successors: %bb.1
+    %1(s32) = G_CONSTANT i32 -1
+    G_BR %bb.1
+  bb.1:
+    %0(s32) = COPY %w0
+    %2(s32) = G_XOR %0, %1
+    %w0 = COPY %2(s32)
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/select.mir b/test/CodeGen/AArch64/GlobalISel/select.mir
new file mode 100644
index 0000000000000000000000000000000000000000..8bffa085fdca6348227c1e9ef59e4818efaeeffa
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/select.mir
@@ -0,0 +1,311 @@
+# RUN: llc -O0 -mtriple=aarch64-apple-ios -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=IOS
+# RUN: llc -O0 -mtriple=aarch64-linux-gnu -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=LINUX-DEFAULT
+# RUN: llc -O0 -mtriple=aarch64-linux-gnu -relocation-model=pic -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=LINUX-PIC
+
+--- |
+  target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+  define void @frame_index() {
+    %ptr0 = alloca i64
+    ret void
+  }
+
+  define i8* @gep(i8* %in) { ret i8* undef }
+
+  define i8* @ptr_mask(i8* %in) { ret i8* undef }
+
+  @var_local = global i8 0
+  define i8* @global_local() { ret i8* undef }
+
+  @var_got = external global i8
+  define i8* @global_got() { ret i8* undef }
+
+  define void @icmp() { ret void }
+  define void @fcmp() { ret void }
+
+  define void @phi() { ret void }
+
+  define void @select() { ret void }
+...
+
+---
+# CHECK-LABEL: name: frame_index
+name:            frame_index
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr64sp }
+registers:
+  - { id: 0, class: gpr }
+
+stack:
+  - { id: 0, name: ptr0, offset: 0, size: 8, alignment: 8 }
+
+# CHECK:  body:
+# CHECK: %0 = ADDXri %stack.0.ptr0, 0, 0
+body:             |
+  bb.0:
+    %0(p0) = G_FRAME_INDEX %stack.0.ptr0
+    %x0 = COPY %0(p0)
+...
+
+---
+# CHECK-LABEL: name: gep
+name:            gep
+legalized:       true
+regBankSelected: true
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+
+# CHECK:  body:
+# CHECK: %1 = MOVi64imm 42
+# CHECK: %2 = ADDXrr %0, %1
+body:             |
+  bb.0:
+      liveins: %x0
+    %0(p0) = COPY %x0
+    %1(s64) = G_CONSTANT i64 42
+    %2(p0) = G_GEP %0, %1(s64)
+    %x0 = COPY %2(p0)
+...
+
+---
+# CHECK-LABEL: name: ptr_mask
+name:            ptr_mask
+legalized:       true
+regBankSelected: true
+
+# CHECK:  body:
+# CHECK: %1 = ANDXri %0, 8060
+body:             |
+  bb.0:
+      liveins: %x0
+    %0:gpr(p0) = COPY %x0
+    %1:gpr(p0) = G_PTR_MASK %0, 3
+    %x0 = COPY %1(p0)
+...
+
+---
+# Global defined in the same linkage unit, so no GOT is needed (except with -relocation-model=pic on Linux, see LINUX-PIC).
+# CHECK-LABEL: name: global_local
+name:            global_local
+legalized:       true
+regBankSelected: true
+registers:
+  - { id: 0, class: gpr }
+
+# CHECK:  body:
+# IOS: %0 = MOVaddr target-flags(aarch64-page) @var_local, target-flags(aarch64-pageoff, aarch64-nc) @var_local
+# LINUX-DEFAULT: %0 = MOVaddr target-flags(aarch64-page) @var_local, target-flags(aarch64-pageoff, aarch64-nc) @var_local
+# LINUX-PIC: %0 = LOADgot target-flags(aarch64-got) @var_local
+body:             |
+  bb.0:
+    %0(p0) = G_GLOBAL_VALUE @var_local
+    %x0 = COPY %0(p0)
+...
+
+---
+# CHECK-LABEL: name: global_got
+name:            global_got
+legalized:       true
+regBankSelected: true
+registers:
+  - { id: 0, class: gpr }
+
+# CHECK:  body:
+# IOS: %0 = LOADgot target-flags(aarch64-got) @var_got
+# LINUX-DEFAULT: %0 = MOVaddr target-flags(aarch64-page) @var_got, target-flags(aarch64-pageoff, aarch64-nc) @var_got
+# LINUX-PIC: %0 = LOADgot target-flags(aarch64-got) @var_got
+body:             |
+  bb.0:
+    %0(p0) = G_GLOBAL_VALUE @var_got
+    %x0 = COPY %0(p0)
+...
+
+---
+# CHECK-LABEL: name: icmp
+name:            icmp
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+# CHECK-NEXT:  - { id: 2, class: gpr64 }
+# CHECK-NEXT:  - { id: 3, class: gpr32 }
+# CHECK-NEXT:  - { id: 4, class: gpr64 }
+# CHECK-NEXT:  - { id: 5, class: gpr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+  - { id: 4, class: gpr }
+  - { id: 5, class: gpr }
+
+# CHECK:  body:
+# CHECK:    %wzr = SUBSWrr %0, %0, implicit-def %nzcv
+# CHECK:    %1 = CSINCWr %wzr, %wzr, 1, implicit %nzcv
+
+# CHECK:    %xzr = SUBSXrr %2, %2, implicit-def %nzcv
+# CHECK:    %3 = CSINCWr %wzr, %wzr, 3, implicit %nzcv
+
+# CHECK:    %xzr = SUBSXrr %4, %4, implicit-def %nzcv
+# CHECK:    %5 = CSINCWr %wzr, %wzr, 0, implicit %nzcv
+
+body:             |
+  bb.0:
+    liveins: %w0, %x0
+
+    %0(s32) = COPY %w0
+    %1(s1) = G_ICMP intpred(eq), %0, %0
+    %w0 = COPY %1(s1)
+
+    %2(s64) = COPY %x0
+    %3(s1) = G_ICMP intpred(uge), %2, %2
+    %w0 = COPY %3(s1)
+
+    %4(p0) = COPY %x0
+    %5(s1) = G_ICMP intpred(ne), %4, %4
+    %w0 = COPY %5(s1)
+...
+
+---
+# CHECK-LABEL: name: fcmp
+name:            fcmp
+legalized:       true
+regBankSelected: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+# CHECK-NEXT:  - { id: 2, class: fpr64 }
+# CHECK-NEXT:  - { id: 3, class: gpr32 }
+# CHECK-NEXT:  - { id: 4, class: gpr32 }
+# CHECK-NEXT:  - { id: 5, class: gpr32 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: fpr }
+  - { id: 3, class: gpr }
+
+# CHECK:  body:
+# CHECK:    FCMPSrr %0, %0, implicit-def %nzcv
+# CHECK:    [[TST_MI:%[0-9]+]] = CSINCWr %wzr, %wzr, 5, implicit %nzcv
+# CHECK:    [[TST_GT:%[0-9]+]] = CSINCWr %wzr, %wzr, 13, implicit %nzcv
+# CHECK:    %1 = ORRWrr [[TST_MI]], [[TST_GT]]
+
+# CHECK:    FCMPDrr %2, %2, implicit-def %nzcv
+# CHECK:    %3 = CSINCWr %wzr, %wzr, 4, implicit %nzcv
+
+body:             |
+  bb.0:
+    liveins: %w0, %x0
+
+    %0(s32) = COPY %s0
+    %1(s1) = G_FCMP floatpred(one), %0, %0
+    %w0 = COPY %1(s1)
+
+    %2(s64) = COPY %d0
+    %3(s1) = G_FCMP floatpred(uge), %2, %2
+    %w0 = COPY %3(s1)
+
+...
+
+---
+# CHECK-LABEL: name: phi
+name:            phi
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: fpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+# CHECK-NEXT:  - { id: 2, class: fpr32 }
+registers:
+  - { id: 0, class: fpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: fpr }
+
+# CHECK:  body:
+# CHECK:    bb.1:
+# CHECK:      %2 = PHI %0, %bb.0, %2, %bb.1
+
+body:             |
+  bb.0:
+    liveins: %s0, %w0
+    successors: %bb.1
+    %0(s32) = COPY %s0
+    %1(s1) = COPY %w0
+
+  bb.1:
+    successors: %bb.1, %bb.2
+    %2(s32) = PHI %0, %bb.0, %2, %bb.1
+    G_BRCOND %1, %bb.1
+
+  bb.2:
+    %s0 = COPY %2
+    RET_ReallyLR implicit %s0
+...
+
+---
+# CHECK-LABEL: name: select
+name:            select
+legalized:       true
+regBankSelected: true
+tracksRegLiveness: true
+
+# CHECK:      registers:
+# CHECK-NEXT:  - { id: 0, class: gpr32 }
+# CHECK-NEXT:  - { id: 1, class: gpr32 }
+# CHECK-NEXT:  - { id: 2, class: gpr32 }
+# CHECK-NEXT:  - { id: 3, class: gpr32 }
+# CHECK-NEXT:  - { id: 4, class: gpr64 }
+# CHECK-NEXT:  - { id: 5, class: gpr64 }
+# CHECK-NEXT:  - { id: 6, class: gpr64 }
+# CHECK-NEXT:  - { id: 7, class: gpr64 }
+# CHECK-NEXT:  - { id: 8, class: gpr64 }
+# CHECK-NEXT:  - { id: 9, class: gpr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gpr }
+  - { id: 4, class: gpr }
+  - { id: 5, class: gpr }
+  - { id: 6, class: gpr }
+  - { id: 7, class: gpr }
+  - { id: 8, class: gpr }
+  - { id: 9, class: gpr }
+
+# CHECK:  body:
+# CHECK:      %wzr = ANDSWri %0, 0, implicit-def %nzcv
+# CHECK:      %3 = CSELWr %1, %2, 1, implicit %nzcv
+# CHECK:      %wzr = ANDSWri %0, 0, implicit-def %nzcv
+# CHECK:      %6 = CSELXr %4, %5, 1, implicit %nzcv
+# CHECK:      %wzr = ANDSWri %0, 0, implicit-def %nzcv
+# CHECK:      %9 = CSELXr %7, %8, 1, implicit %nzcv
+body:             |
+  bb.0:
+    liveins: %w0, %w1, %w2
+    %0(s1) = COPY %w0
+
+    %1(s32) = COPY %w1
+    %2(s32) = COPY %w2
+    %3(s32) = G_SELECT %0, %1, %2
+    %w0 = COPY %3(s32)
+
+    %4(s64) = COPY %x0
+    %5(s64) = COPY %x1
+    %6(s64) = G_SELECT %0, %4, %5
+    %x0 = COPY %6(s64)
+
+    %7(p0) = COPY %x0
+    %8(p0) = COPY %x1
+    %9(p0) = G_SELECT %0, %7, %8
+    %x0 = COPY %9(p0)
+...
diff --git a/test/CodeGen/AArch64/GlobalISel/translate-gep.ll b/test/CodeGen/AArch64/GlobalISel/translate-gep.ll
index 14dbc7c3c31a985d219754a535d0607bd1cbfbf6..e4c18757418d0a342ebca28ff1b059f8fcbd4233 100644
--- a/test/CodeGen/AArch64/GlobalISel/translate-gep.ll
+++ b/test/CodeGen/AArch64/GlobalISel/translate-gep.ll
@@ -58,8 +58,8 @@ define i32* @const_then_var(%type1* %addr, i64 %idx) {
 ; CHECK: [[BASE:%[0-9]+]](p0) = COPY %x0
 ; CHECK: [[IDX:%[0-9]+]](s64) = COPY %x1
 ; CHECK: [[OFFSET1:%[0-9]+]](s64) = G_CONSTANT i64 272
-; CHECK: [[BASE1:%[0-9]+]](p0) = G_GEP [[BASE]], [[OFFSET1]](s64)
 ; CHECK: [[SIZE:%[0-9]+]](s64) = G_CONSTANT i64 4
+; CHECK: [[BASE1:%[0-9]+]](p0) = G_GEP [[BASE]], [[OFFSET1]](s64)
 ; CHECK: [[OFFSET2:%[0-9]+]](s64) = G_MUL [[SIZE]], [[IDX]]
 ; CHECK: [[BASE2:%[0-9]+]](p0) = G_GEP [[BASE1]], [[OFFSET2]](s64)
 ; CHECK: [[RES:%[0-9]+]](p0) = COPY [[BASE2]](p0)
@@ -74,9 +74,9 @@ define i32* @var_then_const(%type1* %addr, i64 %idx) {
 ; CHECK: [[BASE:%[0-9]+]](p0) = COPY %x0
 ; CHECK: [[IDX:%[0-9]+]](s64) = COPY %x1
 ; CHECK: [[SIZE:%[0-9]+]](s64) = G_CONSTANT i64 64
+; CHECK: [[OFFSET2:%[0-9]+]](s64) = G_CONSTANT i64 40
 ; CHECK: [[OFFSET1:%[0-9]+]](s64) = G_MUL [[SIZE]], [[IDX]]
 ; CHECK: [[BASE1:%[0-9]+]](p0) = G_GEP [[BASE]], [[OFFSET1]](s64)
-; CHECK: [[OFFSET2:%[0-9]+]](s64) = G_CONSTANT i64 40
 ; CHECK: [[BASE2:%[0-9]+]](p0) = G_GEP [[BASE1]], [[OFFSET2]](s64)
 ; CHECK: %x0 = COPY [[BASE2]](p0)
 
diff --git a/test/CodeGen/AArch64/GlobalISel/varargs-ios-translator.ll b/test/CodeGen/AArch64/GlobalISel/varargs-ios-translator.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3bd56fa4cebca92cd9c65f7e31cf6ea26e8acda4
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/varargs-ios-translator.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple=aarch64-apple-ios -stop-after=instruction-select -global-isel -verify-machineinstrs %s -o - | FileCheck %s
+
+define void @test_varargs_sentinel(i8* %list, i64, i64, i64, i64, i64, i64, i64,
+                                   i32, ...) {
+; CHECK-LABEL: name: test_varargs_sentinel
+; CHECK: fixedStack:
+; CHECK:   - { id: [[VARARGS_SLOT:[0-9]+]], offset: 8
+; CHECK: body:
+; CHECK:   [[LIST:%[0-9]+]] = COPY %x0
+; CHECK:   [[VARARGS_AREA:%[0-9]+]] = ADDXri %fixed-stack.[[VARARGS_SLOT]], 0, 0
+; CHECK:   STRXui [[VARARGS_AREA]], [[LIST]], 0 :: (store 8 into %ir.list, align 0)
+  call void @llvm.va_start(i8* %list)
+  ret void
+}
+
+declare void @llvm.va_start(i8*)
diff --git a/test/CodeGen/AArch64/GlobalISel/vastart.ll b/test/CodeGen/AArch64/GlobalISel/vastart.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ae44e8fc5dea2bcc04bc96192a33c58bb1b2f266
--- /dev/null
+++ b/test/CodeGen/AArch64/GlobalISel/vastart.ll
@@ -0,0 +1,13 @@
+; RUN: llc -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o -  -mtriple=aarch64-apple-ios7.0 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-IOS %s
+; RUN: llc -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o -  -mtriple=aarch64-linux-gnu | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LINUX %s
+
+
+declare void @llvm.va_start(i8*)
+define void @test_va_start(i8* %list) {
+; CHECK-LABEL: name: test_va_start
+; CHECK: [[LIST:%[0-9]+]](p0) = COPY %x0
+; CHECK-IOS: G_VASTART [[LIST]](p0) :: (store 8 into %ir.list, align 0)
+; CHECK-LINUX: G_VASTART [[LIST]](p0) :: (store 32 into %ir.list, align 0)
+  call void @llvm.va_start(i8* %list)
+  ret void
+}
diff --git a/test/CodeGen/AArch64/aarch64-codegen-prepare-atp.ll b/test/CodeGen/AArch64/aarch64-codegen-prepare-atp.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3fe7e65bf2454a5b7bb4040a7549eb07aeb50cba
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-codegen-prepare-atp.ll
@@ -0,0 +1,68 @@
+; RUN: opt -codegenprepare < %s -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+%struct.match_state = type { i64, i64  }
+
+; %add is also promoted by forking an extra sext.
+define void @promoteTwoOne(i32 %i, i32 %j, i64* %P1, i64* %P2 ) {
+; CHECK-LABEL: @promoteTwoOne
+; CHECK-LABEL: entry:
+; CHECK: %[[SEXT1:.*]] = sext i32 %i to i64
+; CHECK: %[[SEXT2:.*]] = sext i32 %j to i64
+; CHECK: %add = add nsw i64 %[[SEXT1]], %[[SEXT2]]
+entry:
+  %add = add nsw i32 %i, %j
+  %s = sext i32 %add to i64
+  %addr1 = getelementptr inbounds i64, i64* %P1, i64 %s
+  store i64 %s, i64* %addr1
+  %s2 = sext i32 %i to i64
+  %addr2 = getelementptr inbounds i64, i64* %P2, i64 %s2
+  store i64 %s2, i64* %addr2
+  ret void
+}
+
+; Both %add1 and %add2 are promoted by forking extra sexts.
+define void @promoteTwoTwo(i32 %i, i32 %j, i32 %k, i64* %P1, i64* %P2) {
+; CHECK-LABEL: @promoteTwoTwo
+; CHECK-LABEL:entry:
+; CHECK: %[[SEXT1:.*]] = sext i32 %j to i64
+; CHECK: %[[SEXT2:.*]]  = sext i32 %i to i64
+; CHECK: %add1 = add nsw i64 %[[SEXT1]], %[[SEXT2]]
+; CHECK: %[[SEXT3:.*]] = sext i32 %k to i64
+; CHECK: %add2 = add nsw i64 %[[SEXT1]], %[[SEXT3]]
+entry:
+  %add1 = add nsw i32 %j, %i
+  %s = sext i32 %add1 to i64
+  %addr1 = getelementptr inbounds i64, i64* %P1, i64 %s
+  store i64 %s, i64* %addr1
+  %add2 = add nsw i32 %j, %k
+  %s2 = sext i32 %add2 to i64
+  %addr2 = getelementptr inbounds i64, i64* %P2, i64 %s2
+  store i64 %s2, i64* %addr2
+  ret void
+}
+
+define i64 @promoteGEPSunk(i1 %cond, i64* %base, i32 %i) {
+; CHECK-LABEL: @promoteGEPSunk
+; CHECK-LABEL: entry:
+; CHECK:  %[[SEXT:.*]] = sext i32 %i to i64
+; CHECK:  %add = add nsw i64 %[[SEXT]], 1
+; CHECK:  %add2 = add nsw i64 %[[SEXT]], 2
+entry:
+  %add = add nsw i32 %i, 1
+  %s = sext i32 %add to i64
+  %addr = getelementptr inbounds i64, i64* %base, i64 %s
+  %add2 = add nsw i32 %i,  2
+  %s2 = sext i32 %add2 to i64
+  %addr2 = getelementptr inbounds i64, i64* %base, i64 %s2
+  br i1 %cond, label %if.then, label %if.then2
+if.then:
+  %v = load i64, i64* %addr
+  %v2 = load i64, i64* %addr2
+  %r = add i64 %v, %v2
+  ret i64 %r
+if.then2:
+  ret i64 0;
+}
diff --git a/test/CodeGen/AArch64/aarch64-fold-lslfast.ll b/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0dfe04b664d0eae5985be2a6ab5572174bc3e944
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-fold-lslfast.ll
@@ -0,0 +1,74 @@
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+lsl-fast | FileCheck %s
+
+%struct.a = type [256 x i16]
+%struct.b = type [256 x i32]
+%struct.c = type [256 x i64]
+
+declare void @foo()
+define i16 @halfword(%struct.a* %ctx, i32 %xor72) nounwind {
+; CHECK-LABEL: halfword:
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
+; CHECK: ldrh [[REG1:w[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #1]
+; CHECK: strh [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #1]
+  %shr81 = lshr i32 %xor72, 9
+  %conv82 = zext i32 %shr81 to i64
+  %idxprom83 = and i64 %conv82, 255
+  %arrayidx86 = getelementptr inbounds %struct.a, %struct.a* %ctx, i64 0, i64 %idxprom83
+  %result = load i16, i16* %arrayidx86, align 2
+  call void @foo()
+  store i16 %result, i16* %arrayidx86, align 2
+  ret i16 %result
+}
+
+define i32 @word(%struct.b* %ctx, i32 %xor72) nounwind {
+; CHECK-LABEL: word:
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
+; CHECK: ldr [[REG1:w[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #2]
+; CHECK: str [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #2]
+  %shr81 = lshr i32 %xor72, 9
+  %conv82 = zext i32 %shr81 to i64
+  %idxprom83 = and i64 %conv82, 255
+  %arrayidx86 = getelementptr inbounds %struct.b, %struct.b* %ctx, i64 0, i64 %idxprom83
+  %result = load i32, i32* %arrayidx86, align 4
+  call void @foo()
+  store i32 %result, i32* %arrayidx86, align 4
+  ret i32 %result
+}
+
+define i64 @doubleword(%struct.c* %ctx, i32 %xor72) nounwind {
+; CHECK-LABEL: doubleword:
+; CHECK: ubfx [[REG:x[0-9]+]], x1, #9, #8
+; CHECK: ldr [[REG1:x[0-9]+]], [{{.*}}[[REG2:x[0-9]+]], [[REG]], lsl #3]
+; CHECK: str [[REG1]], [{{.*}}[[REG2]], [[REG]], lsl #3]
+  %shr81 = lshr i32 %xor72, 9
+  %conv82 = zext i32 %shr81 to i64
+  %idxprom83 = and i64 %conv82, 255
+  %arrayidx86 = getelementptr inbounds %struct.c, %struct.c* %ctx, i64 0, i64 %idxprom83
+  %result = load i64, i64* %arrayidx86, align 8
+  call void @foo()
+  store i64 %result, i64* %arrayidx86, align 8
+  ret i64 %result
+}
+
+define i64 @multi_use_non_memory(i64 %a, i64 %b) {
+; CHECK-LABEL: multi_use_non_memory:
+; CHECK: lsl [[REG1:x[0-9]+]], x0, #3
+; CHECK-NOT: cmp [[REG1]], x1, lsl # 3
+; CHECK-NEXT: lsl [[REG2:x[0-9]+]], x1, #3
+; CHECK-NEXT: cmp [[REG1]], [[REG2]]
+entry:
+  %mul1 = shl i64 %a, 3
+  %mul2 = shl i64 %b, 3
+  %cmp = icmp slt i64 %mul1, %mul2
+  br i1 %cmp, label %truebb, label %falsebb
+truebb:
+  tail call void @foo()
+  unreachable
+falsebb:
+  %cmp2 = icmp sgt i64 %mul1, %mul2
+  br i1 %cmp2, label %exitbb, label %endbb
+exitbb:
+ ret i64 %mul1
+endbb:
+ ret i64 %mul2
+}
diff --git a/test/CodeGen/AArch64/aarch64-gep-opt.ll b/test/CodeGen/AArch64/aarch64-gep-opt.ll
index 6e4a47b04406dd47164f492b3e09a6a97981e79c..df9534ffde0973a552f5aefcf83b599956f875fe 100644
--- a/test/CodeGen/AArch64/aarch64-gep-opt.ll
+++ b/test/CodeGen/AArch64/aarch64-gep-opt.ll
@@ -96,9 +96,13 @@ exit:
 ; CHECK-NoAA: add i64 [[TMP:%[a-zA-Z0-9]+]], 528
 ; CHECK-NoAA: add i64 [[TMP]], 532
 ; CHECK-NoAA: if.true:
-; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = add i64 [[TMP]], 532
+; CHECK-NoAA: inttoptr
+; CHECK-NoAA: bitcast
+; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = getelementptr i8, {{.*}}, i64 532
 ; CHECK-NoAA: exit:
-; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = add i64 [[TMP]], 528
+; CHECK-NoAA: inttoptr
+; CHECK-NoAA: bitcast
+; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = getelementptr i8, {{.*}}, i64 528
 
 ; CHECK-UseAA-LABEL: test_GEP_across_BB(
 ; CHECK-UseAA: [[PTR0:%[a-zA-Z0-9]+]] = getelementptr
diff --git a/test/CodeGen/AArch64/aarch64-named-reg-w18.ll b/test/CodeGen/AArch64/aarch64-named-reg-w18.ll
new file mode 100644
index 0000000000000000000000000000000000000000..341c7683dbaa44c3052f20ae383952376fdc4a40
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-named-reg-w18.ll
@@ -0,0 +1,14 @@
+; RUN: not llc -mtriple=aarch64-fuchsia -o - %s 2>&1 | FileCheck %s --check-prefix=ERROR
+; RUN: llc -mtriple=aarch64-fuchsia -mattr=+reserve-x18 -o - %s
+
+define void @set_w18(i32 %x) {
+entry:
+; FIXME: Include an allocatable-specific error message
+; ERROR: Invalid register name "w18".
+  tail call void @llvm.write_register.i32(metadata !0, i32 %x)
+  ret void
+}
+
+declare void @llvm.write_register.i32(metadata, i32) nounwind
+
+!0 = !{!"w18"}
diff --git a/test/CodeGen/AArch64/aarch64-named-reg-x18.ll b/test/CodeGen/AArch64/aarch64-named-reg-x18.ll
new file mode 100644
index 0000000000000000000000000000000000000000..eed852710ba00399676e96ffc3ef6282e96d37dd
--- /dev/null
+++ b/test/CodeGen/AArch64/aarch64-named-reg-x18.ll
@@ -0,0 +1,14 @@
+; RUN: not llc -mtriple=aarch64-fuchsia -o - %s 2>&1 | FileCheck %s --check-prefix=ERROR
+; RUN: llc -mtriple=aarch64-fuchsia -mattr=+reserve-x18 -o - %s
+
+define void @set_x18(i64 %x) {
+entry:
+; FIXME: Include an allocatable-specific error message
+; ERROR: Invalid register name "x18".
+  tail call void @llvm.write_register.i64(metadata !0, i64 %x)
+  ret void
+}
+
+declare void @llvm.write_register.i64(metadata, i64) nounwind
+
+!0 = !{!"x18"}
diff --git a/test/CodeGen/AArch64/and-sink.ll b/test/CodeGen/AArch64/and-sink.ll
new file mode 100644
index 0000000000000000000000000000000000000000..91b7bd0db1726c541dcd24ff90b258362a1d75b0
--- /dev/null
+++ b/test/CodeGen/AArch64/and-sink.ll
@@ -0,0 +1,90 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs < %s | FileCheck %s
+; RUN: opt -S -codegenprepare -mtriple=aarch64-linux %s | FileCheck --check-prefix=CHECK-CGP %s
+
+@A = global i32 zeroinitializer
+@B = global i32 zeroinitializer
+@C = global i32 zeroinitializer
+
+; Test that and is sunk into cmp block to form tbz.
+define i32 @and_sink1(i32 %a, i1 %c) {
+; CHECK-LABEL: and_sink1:
+; CHECK: tbz w1, #0
+; CHECK: str wzr, [x{{[0-9]+}}, :lo12:A]
+; CHECK: tbnz {{w[0-9]+}}, #2
+
+; CHECK-CGP-LABEL: @and_sink1(
+; CHECK-CGP-NOT: and i32
+  %and = and i32 %a, 4
+  br i1 %c, label %bb0, label %bb2
+bb0:
+; CHECK-CGP-LABEL: bb0:
+; CHECK-CGP: and i32
+; CHECK-CGP-NEXT: icmp eq i32
+; CHECK-CGP-NEXT: store
+; CHECK-CGP-NEXT: br
+  %cmp = icmp eq i32 %and, 0
+  store i32 0, i32* @A
+  br i1 %cmp, label %bb1, label %bb2
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+; Test that both 'and' and cmp get sunk to form tbz.
+define i32 @and_sink2(i32 %a, i1 %c, i1 %c2) {
+; CHECK-LABEL: and_sink2:
+; CHECK: str wzr, [x{{[0-9]+}}, :lo12:A]
+; CHECK: tbz w1, #0
+; CHECK: str wzr, [x{{[0-9]+}}, :lo12:B]
+; CHECK: tbz w2, #0
+; CHECK: str wzr, [x{{[0-9]+}}, :lo12:C]
+; CHECK: tbnz {{w[0-9]+}}, #2
+
+; CHECK-CGP-LABEL: @and_sink2(
+; CHECK-CGP-NOT: and i32
+  %and = and i32 %a, 4
+  store i32 0, i32* @A
+  br i1 %c, label %bb0, label %bb3
+bb0:
+; CHECK-CGP-LABEL: bb0:
+; CHECK-CGP-NOT: and i32
+; CHECK-CGP-NOT: icmp
+  %cmp = icmp eq i32 %and, 0
+  store i32 0, i32* @B
+  br i1 %c2, label %bb1, label %bb3
+bb1:
+; CHECK-CGP-LABEL: bb1:
+; CHECK-CGP: and i32
+; CHECK-CGP-NEXT: icmp eq i32
+; CHECK-CGP-NEXT: store
+; CHECK-CGP-NEXT: br
+  store i32 0, i32* @C
+  br i1 %cmp, label %bb2, label %bb0
+bb2:
+  ret i32 1
+bb3:
+  ret i32 0
+}
+
+; Test that 'and' is not sunk since cbz is a better alternative.
+define i32 @and_sink3(i32 %a) {
+; CHECK-LABEL: and_sink3:
+; CHECK: and [[REG:w[0-9]+]], w0, #0x3
+; CHECK: [[LOOP:.L[A-Z0-9_]+]]:
+; CHECK: str wzr, [x{{[0-9]+}}, :lo12:A]
+; CHECK: cbz [[REG]], [[LOOP]]
+
+; CHECK-CGP-LABEL: @and_sink3(
+; CHECK-CGP-NEXT: and i32
+  %and = and i32 %a, 3
+  br label %bb0
+bb0:
+; CHECK-CGP-LABEL: bb0:
+; CHECK-CGP-NOT: and i32
+  %cmp = icmp eq i32 %and, 0
+  store i32 0, i32* @A
+  br i1 %cmp, label %bb0, label %bb2
+bb2:
+  ret i32 0
+}
diff --git a/test/CodeGen/AArch64/argument-blocks.ll b/test/CodeGen/AArch64/argument-blocks.ll
index 3169abc2dcb3a7f0e009b16f68ed06f1bc60b7e1..b5374ca8ced5375460527408add6eaad2920313f 100644
--- a/test/CodeGen/AArch64/argument-blocks.ll
+++ b/test/CodeGen/AArch64/argument-blocks.ll
@@ -59,10 +59,10 @@ define i64 @test_hfa_ignores_gprs([7 x float], [2 x float] %in, i64, i64 %res) {
 }
 
 ; [2 x float] should not be promoted to double by the Darwin varargs handling,
-; but should go in an 8-byte aligned slot.
+; but should go in an 8-byte aligned slot and can be merged as integer stores.
 define void @test_varargs_stackalign() {
 ; CHECK-LABEL: test_varargs_stackalign:
-; CHECK-DARWINPCS: stp {{w[0-9]+}}, {{w[0-9]+}}, [sp, #16]
+; CHECK-DARWINPCS: str {{x[0-9]+}}, [sp, #16]
 
   call void(...) @callee([3 x float] undef, [2 x float] [float 1.0, float 2.0])
   ret void
diff --git a/test/CodeGen/AArch64/arm64-abi-varargs.ll b/test/CodeGen/AArch64/arm64-abi-varargs.ll
index a29f8c4b57ab6ed39eda2f490c712f7f5d51b529..0a79655714806b5f135bd425a3b3c847f53470e5 100644
--- a/test/CodeGen/AArch64/arm64-abi-varargs.ll
+++ b/test/CodeGen/AArch64/arm64-abi-varargs.ll
@@ -3,7 +3,7 @@
 ; rdar://13625505
 ; Here we have 9 fixed integer arguments the 9th argument in on stack, the
 ; varargs start right after at 8-byte alignment.
-define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) nounwind noinline ssp {
+define void @fn9(i32* %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, ...) nounwind noinline ssp {
 ; CHECK-LABEL: fn9:
 ; 9th fixed argument
 ; CHECK: ldr {{w[0-9]+}}, [sp, #64]
@@ -30,7 +30,6 @@ define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7,
   %a10 = alloca i32, align 4
   %a11 = alloca i32, align 4
   %a12 = alloca i32, align 4
-  store i32 %a1, i32* %1, align 4
   store i32 %a2, i32* %2, align 4
   store i32 %a3, i32* %3, align 4
   store i32 %a4, i32* %4, align 4
@@ -39,6 +38,7 @@ define void @fn9(i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7,
   store i32 %a7, i32* %7, align 4
   store i32 %a8, i32* %8, align 4
   store i32 %a9, i32* %9, align 4
+  store i32 %a9, i32* %a1
   %10 = bitcast i8** %args to i8*
   call void @llvm.va_start(i8* %10)
   %11 = va_arg i8** %args, i32
@@ -93,7 +93,7 @@ define i32 @main() nounwind ssp {
   %10 = load i32, i32* %a10, align 4
   %11 = load i32, i32* %a11, align 4
   %12 = load i32, i32* %a12, align 4
-  call void (i32, i32, i32, i32, i32, i32, i32, i32, i32, ...) @fn9(i32 %1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12)
+  call void (i32*, i32, i32, i32, i32, i32, i32, i32, i32, ...) @fn9(i32* %a1, i32 %2, i32 %3, i32 %4, i32 %5, i32 %6, i32 %7, i32 %8, i32 %9, i32 %10, i32 %11, i32 %12)
   ret i32 0
 }
 
diff --git a/test/CodeGen/AArch64/arm64-abi.ll b/test/CodeGen/AArch64/arm64-abi.ll
index fb52b1d99fc95095805e7af2733630e8262021a2..6cf0ab35b9b527c12b29d7b0365810a366ed7718 100644
--- a/test/CodeGen/AArch64/arm64-abi.ll
+++ b/test/CodeGen/AArch64/arm64-abi.ll
@@ -205,10 +205,7 @@ declare i32 @args_i32(i32, i32, i32, i32, i32, i32, i32, i32, i16 signext, i32,
 define i32 @test8(i32 %argc, i8** nocapture %argv) nounwind {
 entry:
 ; CHECK-LABEL: test8
-; CHECK: strb {{w[0-9]+}}, [sp, #3]
-; CHECK: strb wzr, [sp, #2]
-; CHECK: strb {{w[0-9]+}}, [sp, #1]
-; CHECK: strb wzr, [sp]
+; CHECK: str w8, [sp]
 ; CHECK: bl
 ; FAST-LABEL: test8
 ; FAST: strb {{w[0-9]+}}, [sp]
diff --git a/test/CodeGen/AArch64/arm64-addr-type-promotion.ll b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
index c57be5684ade4a6879039ddd53526674e46223d0..0009fe52e177a31ff17051247f9c3879fa6d476b 100644
--- a/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
+++ b/test/CodeGen/AArch64/arm64-addr-type-promotion.ll
@@ -10,14 +10,17 @@ define zeroext i8 @fullGtU(i32 %i1, i32 %i2) {
 ; CHECK: fullGtU
 ; CHECK: adrp [[PAGE:x[0-9]+]], _block@GOTPAGE
 ; CHECK: ldr [[ADDR:x[0-9]+]], {{\[}}[[PAGE]], _block@GOTPAGEOFF]
+; CHECK: sxtw [[I1:x[0-9]+]], w0
+; CHECK: sxtw [[I2:x[0-9]+]], w1
 ; CHECK-NEXT: ldr [[BLOCKBASE:x[0-9]+]], {{\[}}[[ADDR]]]
-; CHECK-NEXT: ldrb [[BLOCKVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE]],  w0, sxtw]
-; CHECK-NEXT: ldrb [[BLOCKVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE]], w1, sxtw]
+; CHECK-NEXT: ldrb [[BLOCKVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE]], [[I1]]]
+; CHECK-NEXT: ldrb [[BLOCKVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE]], [[I2]]]
+
 ; CHECK-NEXT: cmp [[BLOCKVAL1]], [[BLOCKVAL2]]
 ; CHECK-NEXT: b.ne
 ; Next BB
-; CHECK: add [[BLOCKBASE2:x[0-9]+]], [[BLOCKBASE]], w1, sxtw
-; CHECK-NEXT: add [[BLOCKBASE1:x[0-9]+]], [[BLOCKBASE]], w0, sxtw
+; CHECK: add [[BLOCKBASE2:x[0-9]+]], [[BLOCKBASE]], [[I2]]
+; CHECK-NEXT: add [[BLOCKBASE1:x[0-9]+]], [[BLOCKBASE]], [[I1]]
 ; CHECK-NEXT: ldrb [[LOADEDVAL1:w[0-9]+]], {{\[}}[[BLOCKBASE1]], #1]
 ; CHECK-NEXT: ldrb [[LOADEDVAL2:w[0-9]+]], {{\[}}[[BLOCKBASE2]], #1]
 ; CHECK-NEXT: cmp [[LOADEDVAL1]], [[LOADEDVAL2]]
diff --git a/test/CodeGen/AArch64/arm64-atomic.ll b/test/CodeGen/AArch64/arm64-atomic.ll
index c87103481adf641969d50d37e1c857394bc59f3a..2c9a3bbaa500156447fd1b6bc5beb431cf8e1be3 100644
--- a/test/CodeGen/AArch64/arm64-atomic.ll
+++ b/test/CodeGen/AArch64/arm64-atomic.ll
@@ -9,10 +9,10 @@ define i32 @val_compare_and_swap(i32* %p, i32 %cmp, i32 %new) #0 {
 ; CHECK-NEXT: b.ne   [[FAILBB:.?LBB[0-9_]+]]
 ; CHECK-NEXT: stxr   [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]]
 ; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[TRYBB]]
-; CHECK-NEXT: b      [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: ret
 ; CHECK-NEXT: [[FAILBB]]:
 ; CHECK-NEXT: clrex
-; CHECK-NEXT: [[EXITBB]]:
+; CHECK-NEXT: ret
   %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire
   %val = extractvalue { i32, i1 } %pair, 0
   ret i32 %val
@@ -27,10 +27,12 @@ define i32 @val_compare_and_swap_from_load(i32* %p, i32 %cmp, i32* %pnew) #0 {
 ; CHECK-NEXT: b.ne   [[FAILBB:.?LBB[0-9_]+]]
 ; CHECK-NEXT: stxr   [[SCRATCH_REG:w[0-9]+]], [[NEW]], [x0]
 ; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[TRYBB]]
-; CHECK-NEXT: b      [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: mov    x0, x[[ADDR]]
+; CHECK-NEXT: ret
 ; CHECK-NEXT: [[FAILBB]]:
 ; CHECK-NEXT: clrex
-; CHECK-NEXT: [[EXITBB]]:
+; CHECK-NEXT: mov    x0, x[[ADDR]]
+; CHECK-NEXT: ret
   %new = load i32, i32* %pnew
   %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acquire acquire
   %val = extractvalue { i32, i1 } %pair, 0
@@ -41,15 +43,15 @@ define i32 @val_compare_and_swap_rel(i32* %p, i32 %cmp, i32 %new) #0 {
 ; CHECK-LABEL: val_compare_and_swap_rel:
 ; CHECK-NEXT: mov    x[[ADDR:[0-9]+]], x0
 ; CHECK-NEXT: [[TRYBB:.?LBB[0-9_]+]]:
-; CHECK-NEXT: ldaxr  [[RESULT:w[0-9]+]], [x[[ADDR]]
+; CHECK-NEXT: ldaxr  [[RESULT:w[0-9]+]], [x[[ADDR]]]
 ; CHECK-NEXT: cmp    [[RESULT]], w1
 ; CHECK-NEXT: b.ne   [[FAILBB:.?LBB[0-9_]+]]
-; CHECK-NEXT: stlxr  [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]
+; CHECK-NEXT: stlxr  [[SCRATCH_REG:w[0-9]+]], w2, [x[[ADDR]]]
 ; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[TRYBB]]
-; CHECK-NEXT: b      [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: ret
 ; CHECK-NEXT: [[FAILBB]]:
 ; CHECK-NEXT: clrex
-; CHECK-NEXT: [[EXITBB]]:
+; CHECK-NEXT: ret
   %pair = cmpxchg i32* %p, i32 %cmp, i32 %new acq_rel monotonic
   %val = extractvalue { i32, i1 } %pair, 0
   ret i32 %val
@@ -64,10 +66,10 @@ define i64 @val_compare_and_swap_64(i64* %p, i64 %cmp, i64 %new) #0 {
 ; CHECK-NEXT: b.ne   [[FAILBB:.?LBB[0-9_]+]]
 ; CHECK-NEXT: stxr   [[SCRATCH_REG:w[0-9]+]], x2, [x[[ADDR]]]
 ; CHECK-NEXT: cbnz   [[SCRATCH_REG]], [[TRYBB]]
-; CHECK-NEXT: b      [[EXITBB:.?LBB[0-9_]+]]
+; CHECK-NEXT: ret
 ; CHECK-NEXT: [[FAILBB]]:
 ; CHECK-NEXT: clrex
-; CHECK-NEXT: [[EXITBB]]:
+; CHECK-NEXT: ret
   %pair = cmpxchg i64* %p, i64 %cmp, i64 %new monotonic monotonic
   %val = extractvalue { i64, i1 } %pair, 0
   ret i64 %val
diff --git a/test/CodeGen/AArch64/arm64-builtins-linux.ll b/test/CodeGen/AArch64/arm64-builtins-linux.ll
index 64239582f230d36a28cebba1080bb78af9e1800a..f86ee1afe555e9e10416763c3d14d208e9f9dca2 100644
--- a/test/CodeGen/AArch64/arm64-builtins-linux.ll
+++ b/test/CodeGen/AArch64/arm64-builtins-linux.ll
@@ -1,4 +1,6 @@
 ; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-fuchsia | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64-fuchsia -code-model=kernel | FileCheck --check-prefix=FUCHSIA-KERNEL %s
 
 ; Function Attrs: nounwind readnone
 declare i8* @llvm.thread.pointer() #1
@@ -6,6 +8,8 @@ declare i8* @llvm.thread.pointer() #1
 define i8* @thread_pointer() {
 ; CHECK: thread_pointer:
 ; CHECK: mrs {{x[0-9]+}}, TPIDR_EL0
+; FUCHSIA-KERNEL: thread_pointer:
+; FUCHSIA-KERNEL: mrs {{x[0-9]+}}, TPIDR_EL1
   %1 = tail call i8* @llvm.thread.pointer()
   ret i8* %1
 }
diff --git a/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll b/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll
index 3d9c86139edaa6db4e8df0fe354a5a8ca1cb9c65..a104b65ea861d865fb37dac3a1b6ace60af43531 100644
--- a/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll
+++ b/test/CodeGen/AArch64/arm64-codegen-prepare-extload.ll
@@ -258,8 +258,7 @@ false:
 ;    => We have one zext of %zextld left and we created one sext of %ld2.
 ; 2. We try to promote the operand of %sextaddza.
 ;    a. This creates one sext of %zexta and one of %zextld
-;    b. The sext of %zexta does not lead to any load, it stays here, even if it
-;       could have been combine with the zext of %a.
+;    b. The sext of %zexta can be combined with the zext of %a.
 ;    c. The sext of %zextld leads to %ld and can be combined with it. This is
 ;       done by promoting %zextld. This is fine with the current heuristic:
 ;       neutral.
@@ -281,16 +280,14 @@ false:
 ; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %addr1
 ; OPT-NEXT: [[ZEXTLD1_1:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
 ; OPT-NEXT: [[ZEXTLD1_2:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
-; OPT-NEXT: [[ZEXTLD1_3:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
 ; OPT-NEXT: [[LD2:%[a-zA-Z_0-9-]+]] = load i32, i32* %addr2
 ; OPT-NEXT: [[SEXTLD2:%[a-zA-Z_0-9-]+]] = sext i32 [[LD2]] to i64
-; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTLD2]], [[ZEXTLD1_1]]
-; We do not combine this one: see 2.b.
-; OPT-NEXT: [[ZEXTA:%[a-zA-Z_0-9-]+]] = zext i8 %a to i32
-; OPT-NEXT: [[SEXTZEXTA:%[a-zA-Z_0-9-]+]] = sext i32 [[ZEXTA]] to i64
-; OPT-NEXT: [[RESZA:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTZEXTA]], [[ZEXTLD1_3]]
+; OPT-NEXT: [[ZEXTLD1_3:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTLD2]], [[ZEXTLD1_3]]
+; OPT-NEXT: [[ZEXTLD1_4:%[a-zA-Z_0-9-]+]] = zext i8 %a to i64
+; OPT-NEXT: [[RESZA:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXTLD1_4]], [[ZEXTLD1_2]]
 ; OPT-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
-; OPT-NEXT: [[RESB:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTB]], [[ZEXTLD1_2]]
+; OPT-NEXT: [[RESB:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTB]], [[ZEXTLD1_1]]
 ;
 ; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32
 ; DISABLE: [[RES:%[a-zA-Z_0-9-]+]]  = sext i32 [[ADD]] to i64
diff --git a/test/CodeGen/AArch64/arm64-crc32.ll b/test/CodeGen/AArch64/arm64-crc32.ll
index 22111de5a3aa509f0103b1d6f491bed0603d0c28..df9465a6bda57b7c79b8a2d7c8952c009ad53558 100644
--- a/test/CodeGen/AArch64/arm64-crc32.ll
+++ b/test/CodeGen/AArch64/arm64-crc32.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -mtriple=arm64-eabi -mattr=+crc -o - %s | FileCheck %s
+; RUN: llc -mtriple=arm64-eabi -mcpu=cortex-a53 -mattr=+crc -o - %s | FileCheck %s
 
 define i32 @test_crc32b(i32 %cur, i8 %next) {
 ; CHECK-LABEL: test_crc32b:
diff --git a/test/CodeGen/AArch64/arm64-elf-globals.ll b/test/CodeGen/AArch64/arm64-elf-globals.ll
index b1d5524aee873f8975587256afd513043866237e..92dc8179f8ea627ebc60fe2bef85f19505dbe701 100644
--- a/test/CodeGen/AArch64/arm64-elf-globals.ll
+++ b/test/CodeGen/AArch64/arm64-elf-globals.ll
@@ -2,6 +2,10 @@
 ; RUN: llc -mtriple=arm64-linux-gnu -o - %s -O0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST
 ; RUN: llc -mtriple=arm64-linux-gnu -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-PIC
 ; RUN: llc -mtriple=arm64-linux-gnu -O0 -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST-PIC
+; RUN: llc -mtriple=aarch64-fuchsia -code-model=kernel -o - %s -mcpu=cyclone | FileCheck %s
+; RUN: llc -mtriple=aarch64-fuchsia -code-model=kernel -o - %s -O0 -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST
+; RUN: llc -mtriple=aarch64-fuchsia -code-model=kernel -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-PIC
+; RUN: llc -mtriple=aarch64-fuchsia -code-model=kernel -O0 -relocation-model=pic -o - %s -mcpu=cyclone | FileCheck %s --check-prefix=CHECK-FAST-PIC
 
 @var8 = external global i8, align 1
 @var16 = external global i16, align 2
diff --git a/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
index 071b2d0dbca41f6834c6cc76112b541935fe6aa1..d344084ef62d825a9c707ae563f3a27f85673328 100644
--- a/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
+++ b/test/CodeGen/AArch64/arm64-indexed-vector-ldst.ll
@@ -6216,11 +6216,11 @@ define <4 x i16> @test_v4i16_post_reg_ld1lane_forced_narrow(i16* %bar, i16** %pt
 declare <2 x i32> @llvm.ctpop.v2i32(<2 x i32>)
 
 ; CHECK-LABEL: test_ld1lane_build:
-; CHECK-DAG: ld1.s { [[REG0:v[0-9]+]] }[0], [x0]
-; CHECK-DAG: ld1.s { [[REG0:v[0-9]+]] }[1], [x1]
-; CHECK-DAG: ld1.s { [[REG1:v[0-9]+]] }[0], [x2]
-; CHECK-DAG: ld1.s { [[REG1:v[0-9]+]] }[1], [x3]
-; CHECK: sub.2s v[[REGNUM2:[0-9]+]], [[REG0]], [[REG1]]
+; CHECK-DAG: ldr s[[REGNUM0:[0-9]+]], [x0]
+; CHECK-DAG: ld1.s { v[[REGNUM0:[0-9]+]] }[1], [x1]
+; CHECK-DAG: ldr s[[REGNUM1:[0-9]+]], [x2]
+; CHECK-DAG: ld1.s { v[[REGNUM1:[0-9]+]] }[1], [x3]
+; CHECK: sub.2s v[[REGNUM2:[0-9]+]], v[[REGNUM0]], v[[REGNUM1]]
 ; CHECK-NEXT: str d[[REGNUM2]], [x4]
 ; CHECK-NEXT: ret
 define void @test_ld1lane_build(i32* %ptr0, i32* %ptr1, i32* %ptr2, i32* %ptr3, <2 x i32>* %out) {
diff --git a/test/CodeGen/AArch64/arm64-memset-inline.ll b/test/CodeGen/AArch64/arm64-memset-inline.ll
index 8f22f97ca0870bac56521fc5636f7d3b95b7065f..384aaa8541df245c16d5d22e498f169f59cb3ecd 100644
--- a/test/CodeGen/AArch64/arm64-memset-inline.ll
+++ b/test/CodeGen/AArch64/arm64-memset-inline.ll
@@ -13,8 +13,8 @@ define void @t2() nounwind ssp {
 entry:
 ; CHECK-LABEL: t2:
 ; CHECK: strh wzr, [sp, #32]
-; CHECK: stp xzr, xzr, [sp, #16]
-; CHECK: str xzr, [sp, #8]
+; CHECK: stp xzr, xzr, [sp, #8]
+; CHECK: str xzr, [sp, #24]
   %buf = alloca [26 x i8], align 1
   %0 = getelementptr inbounds [26 x i8], [26 x i8]* %buf, i32 0, i32 0
   call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false)
diff --git a/test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll b/test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7efb4bf6d5963831c842b77f153a98bfa447bbc8
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-opt-remarks-lazy-bfi.ll
@@ -0,0 +1,63 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -pass-remarks-analysis=asm-printer \
+; RUN:       -pass-remarks-with-hotness=1 -asm-verbose=0 \
+; RUN:       -debug-only=lazy-machine-block-freq,block-freq \
+; RUN:       -debug-pass=Executions 2>&1 | FileCheck %s -check-prefix=HOTNESS
+
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -pass-remarks-analysis=asm-printer \
+; RUN:       -pass-remarks-with-hotness=0 -asm-verbose=0 \
+; RUN:       -debug-only=lazy-machine-block-freq,block-freq \
+; RUN:       -debug-pass=Executions 2>&1 | FileCheck %s -check-prefix=NO_HOTNESS
+
+; REQUIRES: asserts
+
+
+; Verify that we don't newly populate MachineBFI for passes that already use
+; MBFI, e.g. GreedyRegAlloc.  (This hard-codes the previous pass to the
+; GreedyRegAlloc, please adjust accordingly.)
+
+; HOTNESS:      Executing Pass 'Spill Code Placement Analysis'
+; HOTNESS-NEXT: Executing Pass 'Lazy Machine Block Frequency Analysis'
+; HOTNESS-NEXT: Executing Pass 'Machine Optimization Remark Emitter'
+; HOTNESS-NEXT: MachineBlockFrequencyInfo is available
+; HOTNESS-NEXT: Executing Pass 'Greedy Register Allocator'
+
+
+; Verify that we only populate MachineBFI on behalf of ORE when hotness is
+; requested.  (This hard-codes the previous pass to the Assembly Printer,
+; please adjust accordingly.)
+
+; HOTNESS:      Executing Pass 'Implement the 'patchable-function' attribute'
+; HOTNESS-NEXT:  Freeing Pass 'Implement the 'patchable-function' attribute'
+; HOTNESS-NEXT: Executing Pass 'Lazy Machine Block Frequency Analysis'
+; HOTNESS-NEXT: Executing Pass 'Machine Optimization Remark Emitter'
+; HOTNESS-NEXT: Building MachineBlockFrequencyInfo on the fly
+; HOTNESS-NEXT: Building LoopInfo on the fly
+; HOTNESS-NEXT: Building DominatorTree on the fly
+; HOTNESS-NOT: Executing Pass
+; HOTNESS: block-frequency: empty_func
+; HOTNESS-NOT: Executing Pass
+; HOTNESS: Executing Pass 'AArch64 Assembly Printer'
+
+; HOTNESS: arm64-summary-remarks.ll:5:0: 1 instructions in function (hotness: 33)
+
+
+; NO_HOTNESS:      Executing Pass 'Implement the 'patchable-function' attribute'
+; NO_HOTNESS-NEXT:  Freeing Pass 'Implement the 'patchable-function' attribute'
+; NO_HOTNESS-NEXT: Executing Pass 'Lazy Machine Block Frequency Analysis'
+; NO_HOTNESS-NEXT: Executing Pass 'Machine Optimization Remark Emitter'
+; NO_HOTNESS-NEXT: Executing Pass 'AArch64 Assembly Printer'
+
+; NO_HOTNESS: arm64-summary-remarks.ll:5:0: 1 instructions in function{{$}}
+
+define void @empty_func() nounwind ssp !dbg !3 !prof !4 {
+  ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1)
+!1 = !DIFile(filename: "arm64-summary-remarks.ll", directory: "")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "empty_func", scope: !1, file: !1, line: 5, scopeLine: 5, unit: !0)
+!4 = !{!"function_entry_count", i64 33}
diff --git a/test/CodeGen/AArch64/arm64-regress-opt-cmp.mir b/test/CodeGen/AArch64/arm64-regress-opt-cmp.mir
index bda025af519367431ff8963a9f8dcd5c1685e86a..9ad47c721c3a06d299bcf6807b4b9069f10cf1f7 100644
--- a/test/CodeGen/AArch64/arm64-regress-opt-cmp.mir
+++ b/test/CodeGen/AArch64/arm64-regress-opt-cmp.mir
@@ -1,4 +1,4 @@
-# RUN: llc -mtriple=aarch64-linux-gnu -run-pass peephole-opt -o - %s 2>&1 | FileCheck %s
+# RUN: llc -mtriple=aarch64-linux-gnu -run-pass peephole-opt -o - %s | FileCheck %s
 # CHECK: %1 = ANDWri {{.*}}
 # CHECK-NEXT: %wzr = SUBSWri {{.*}}
 --- |
diff --git a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
index 255cd8e4a0d3c4ce28c6cc4dcb6677f650db64be..4df220eddbbbe8375f5ba7b5d985c489ee384ba6 100644
--- a/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
+++ b/test/CodeGen/AArch64/arm64-shrink-wrapping.ll
@@ -346,19 +346,15 @@ entry:
 ; CHECK-NEXT: sub w1, w1, #1
 ; CHECK-NEXT: add [[SUM]], [[SUM]], [[VA_VAL]]
 ; CHECK-NEXT: cbnz w1, [[LOOP_LABEL]]
-; DISABLE-NEXT: b [[IFEND_LABEL]]
-;
-; DISABLE: [[ELSE_LABEL]]: ; %if.else
-; DISABLE: lsl w0, w1, #1
-;
-; CHECK: [[IFEND_LABEL]]:
+; CHECK-NEXT: [[IFEND_LABEL]]:
 ; Epilogue code.
 ; CHECK: add sp, sp, #16
 ; CHECK-NEXT: ret
 ;
-; ENABLE: [[ELSE_LABEL]]: ; %if.else
-; ENABLE-NEXT: lsl w0, w1, #1
-; ENABLE_NEXT: ret
+; CHECK: [[ELSE_LABEL]]: ; %if.else
+; CHECK-NEXT: lsl w0, w1, #1
+; DISABLE-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
 define i32 @variadicFunc(i32 %cond, i32 %count, ...) #0 {
 entry:
   %ap = alloca i8*, align 8
diff --git a/test/CodeGen/AArch64/arm64-summary-remarks.ll b/test/CodeGen/AArch64/arm64-summary-remarks.ll
new file mode 100644
index 0000000000000000000000000000000000000000..70e7fdffd63db19d42360e94cd76d4fd44b09570
--- /dev/null
+++ b/test/CodeGen/AArch64/arm64-summary-remarks.ll
@@ -0,0 +1,15 @@
+; RUN: llc < %s -mtriple=arm64-apple-ios7.0 -pass-remarks-analysis=asm-printer 2>&1 | FileCheck %s
+
+; CHECK: arm64-summary-remarks.ll:5:0: 1 instructions in function
+
+define void @empty_func() nounwind ssp !dbg !3 {
+  ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C, file: !1)
+!1 = !DIFile(filename: "arm64-summary-remarks.ll", directory: "")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "empty_func", scope: !1, file: !1, line: 5, scopeLine: 5, unit: !0)
diff --git a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
index 16ddf690fe9598454edb97a5186d34bb8aa4cd73..375877c517989fba1be3f855ea2cc6972327d9d8 100644
--- a/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
+++ b/test/CodeGen/AArch64/arm64-variadic-aapcs.ll
@@ -99,7 +99,7 @@ define void @test_nospare([8 x i64], [8 x float], ...) {
 ; __stack field should point just past them.
 define void @test_offsetstack([8 x i64], [2 x i64], [3 x float], ...) {
 ; CHECK-LABEL: test_offsetstack:
-; CHECK: sub sp, sp, #80
+; CHECK: stp {{q[0-9]+}}, {{q[0-9]+}}, [sp, #-80]!
 ; CHECK: add [[STACK_TOP:x[0-9]+]], sp, #96
 ; CHECK: add x[[VAR:[0-9]+]], {{x[0-9]+}}, :lo12:var
 ; CHECK: str [[STACK_TOP]], [x[[VAR]]]
diff --git a/test/CodeGen/AArch64/br-cond-not-merge.ll b/test/CodeGen/AArch64/br-cond-not-merge.ll
index be8797176e60188bde1b9cc78d7b33739249ce4a..bf21ef30790562ca0f8f32956e61136af1150769 100644
--- a/test/CodeGen/AArch64/br-cond-not-merge.ll
+++ b/test/CodeGen/AArch64/br-cond-not-merge.ll
@@ -1,14 +1,17 @@
-; RUN: llc -mtriple=aarch64 -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK --check-prefix=OPT %s
+; RUN: llc -mtriple=aarch64 -verify-machineinstrs -O0 -fast-isel=0 < %s | FileCheck --check-prefix=CHECK --check-prefix=NOOPT %s
 
 declare void @foo()
 
 ; Check that the inverted or doesn't inhibit the splitting of the
 ; complex conditional into three branch instructions.
-; CHECK-LABEL: test_and_not
+; CHECK-LABEL: test_and_not:
 ; CHECK:       cbz w0, [[L:\.LBB[0-9_]+]]
-; CHECK:       cmp w1, #2
+; OPT:         cmp w1, #2
+; NOOPT:       subs w{{[0-9]+}}, w{{[0-9]+}}, #2
 ; CHECK:       b.lo [[L]]
-; CHECK:       cmp w2, #2
+; OPT:         cmp w2, #2
+; NOOPT:       subs w{{[0-9]+}}, w{{[0-9]+}}, #2
 ; CHECK:       b.hi [[L]]
 define void @test_and_not(i32 %a, i32 %b, i32 %c) {
 bb1:
@@ -28,5 +31,64 @@ bb3:
   ret void
 }
 
+; Check that non-canonicalized xor not is handled correctly by FindMergedConditions.
+; CHECK-LABEL: test_and_not2:
+; CHECK:       cbz w0, [[L:\.LBB[0-9_]+]]
+; OPT:         cmp w1, #2
+; NOOPT:       subs w{{[0-9]+}}, w{{[0-9]+}}, #2
+; CHECK:       b.lo [[L]]
+; OPT:         cmp w2, #2
+; NOOPT:       subs w{{[0-9]+}}, w{{[0-9]+}}, #2
+; CHECK:       b.hi [[L]]
+define void @test_and_not2(i32 %a, i32 %b, i32 %c) {
+bb1:
+  %cmp1 = icmp ult i32 %a, 1
+  %cmp2 = icmp ult i32 %b, 2
+  %cmp3 = icmp ult i32 %c, 3
+  %or = or i1 %cmp1, %cmp2
+  %not.or = xor i1 -1, %or
+  %and = and i1 %not.or, %cmp3
+  br i1 %and, label %bb2, label %bb3
+
+bb2:
+  ret void
+
+bb3:
+  call void @foo()
+  ret void
+}
+
+; Check that cmps in different blocks are handled correctly by FindMergedConditions.
+; CHECK-LABEL: test_cmp_other_block:
+; OPT: cmp w{{[0-9]+}}, #0
+; OPT: b.gt [[L:\.LBB[0-9_]+]]
+; OPT: tbz w1, #0, [[L]]
+;
+; NOOPT: subs w{{[0-9]+}}, w{{[0-9]+}}, #0
+; NOOPT: cset [[R1:w[0-9]+]], gt
+; NOOPT: str w1, [sp, #[[SLOT2:[0-9]+]]]
+; NOOPT: str [[R1]], [sp, #[[SLOT1:[0-9]+]]]
+; NOOPT: b .LBB
+; NOOPT: ldr [[R2:w[0-9]+]], [sp, #[[SLOT1]]]
+; NOOPT: tbnz [[R2]], #0, [[L:\.LBB[0-9_]+]]
+; NOOPT: ldr [[R3:w[0-9]+]], [sp, #[[SLOT2]]]
+; NOOPT: tbz [[R3]], #0, [[L]]
+define void @test_cmp_other_block(i32* %p, i1 %c) {
+entry:
+  %l = load i32, i32* %p
+  %cmp = icmp sgt i32 %l, 0
+  br label %bb1
+
+bb1:
+  %cmp.i = xor i1 %cmp, true
+  %or.cond1.i = and i1 %cmp.i, %c
+  br i1 %or.cond1.i, label %bb2, label %bb3
 
+bb2:
+  ret void
+
+bb3:
+  call void @foo()
+  ret void
+}
 
diff --git a/test/CodeGen/AArch64/branch-relax-cbz.ll b/test/CodeGen/AArch64/branch-relax-cbz.ll
index c654b94e49cf0e40be8ff3dce7c1753ffe2ca95a..d13c0f677bcb55e1d181ada49369485ca185db06 100644
--- a/test/CodeGen/AArch64/branch-relax-cbz.ll
+++ b/test/CodeGen/AArch64/branch-relax-cbz.ll
@@ -6,23 +6,22 @@
 
 ; CHECK-NEXT: ; BB#1: ; %b3
 ; CHECK: ldr [[LOAD:w[0-9]+]]
-; CHECK: cbz [[LOAD]], [[SKIP_LONG_B:LBB[0-9]+_[0-9]+]]
-; CHECK-NEXT: b [[B8:LBB[0-9]+_[0-9]+]]
-
-; CHECK-NEXT: [[SKIP_LONG_B]]:
+; CHECK: cbnz [[LOAD]], [[B8:LBB[0-9]+_[0-9]+]]
 ; CHECK-NEXT: b [[B7:LBB[0-9]+_[0-9]+]]
 
+; CHECK-NEXT: [[B8]]: ; %b8
+; CHECK-NEXT: ret
+
 ; CHECK-NEXT: [[B2]]: ; %b2
 ; CHECK: mov w{{[0-9]+}}, #93
 ; CHECK: bl _extfunc
 ; CHECK: cbz w{{[0-9]+}}, [[B7]]
-
-; CHECK-NEXT: [[B8]]: ; %b8
-; CHECK-NEXT: ret
+; CHECK-NEXT: b [[B8]]
 
 ; CHECK-NEXT: [[B7]]: ; %b7
 ; CHECK: mov w{{[0-9]+}}, #13
 ; CHECK: b _extfunc
+
 define void @split_block_no_fallthrough(i64 %val) #0 {
 bb:
   %c0 = icmp sgt i64 %val, -5
diff --git a/test/CodeGen/AArch64/cpus.ll b/test/CodeGen/AArch64/cpus.ll
index 50685cf5d3432b3a6a35376da02129e8fb4919a9..f65144def24573731d68348e914da13fe773043e 100644
--- a/test/CodeGen/AArch64/cpus.ll
+++ b/test/CodeGen/AArch64/cpus.ll
@@ -12,7 +12,7 @@
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=exynos-m3 2>&1 | FileCheck %s
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=falkor 2>&1 | FileCheck %s
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=kryo 2>&1 | FileCheck %s
-; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=vulcan 2>&1 | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=thunderx2t99 2>&1 | FileCheck %s
 ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
 
 ; CHECK-NOT: {{.*}}  is not a recognized processor for this target
diff --git a/test/CodeGen/AArch64/dag-numsignbits.ll b/test/CodeGen/AArch64/dag-numsignbits.ll
new file mode 100644
index 0000000000000000000000000000000000000000..217c3df77c9c8c84b28f1c7504ec4b8e0650b90a
--- /dev/null
+++ b/test/CodeGen/AArch64/dag-numsignbits.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -mtriple=aarch64-unknown | FileCheck %s
+
+; PR32273
+
+define void @signbits_vXi1(<4 x i16> %a1) {
+; CHECK-LABEL: signbits_vXi1
+; CHECK: cmgt v0.4h, v1.4h, v0.4h
+; CHECK-NEXT: and v0.8b, v0.8b, v2.8b
+; CHECK-NEXT: shl v0.4h, v0.4h, #15
+; CHECK-NEXT: sshr v0.4h, v0.4h, #15
+; CHECK-NEXT: umov w0, v0.h[0]
+; CHECK-NEXT: umov w3, v0.h[3]
+; CHECK-NEXT: mov w1, wzr
+; CHECK-NEXT: mov w2, wzr
+; CHECK-NEXT: b foo
+  %tmp3 = shufflevector <4 x i16> %a1, <4 x i16> undef, <4 x i32> zeroinitializer
+  %tmp5 = add <4 x i16> %tmp3, <i16 18249, i16 6701, i16 -18744, i16 -25086>
+  %tmp6 = icmp slt <4 x i16> %tmp5, <i16 1, i16 1, i16 1, i16 1>
+  %tmp7 = and <4 x i1> %tmp6, <i1 true, i1 false, i1 false, i1 true>
+  %tmp8 = sext <4 x i1> %tmp7 to <4 x i16>
+  %tmp9 = extractelement <4 x i16> %tmp8, i32 0
+  %tmp10 = zext i16 %tmp9 to i32
+  %tmp11 = extractelement <4 x i16> %tmp8, i32 1
+  %tmp12 = zext i16 %tmp11 to i32
+  %tmp13 = extractelement <4 x i16> %tmp8, i32 2
+  %tmp14 = zext i16 %tmp13 to i32
+  %tmp15 = extractelement <4 x i16> %tmp8, i32 3
+  %tmp16 = zext i16 %tmp15 to i32
+  tail call void @foo(i32 %tmp10, i32 %tmp12, i32 %tmp14, i32 %tmp16)
+  ret void
+}
+
+declare void @foo(i32, i32, i32, i32)
diff --git a/test/CodeGen/AArch64/eliminate-trunc.ll b/test/CodeGen/AArch64/eliminate-trunc.ll
index bc4ac7d7170444838a77d919cb9345511867d25e..83730d15d7f5f5e286779bb80af165ac0e9d3d2b 100644
--- a/test/CodeGen/AArch64/eliminate-trunc.ll
+++ b/test/CodeGen/AArch64/eliminate-trunc.ll
@@ -6,7 +6,7 @@
 ; CHECK-NOT: add {{x[0-9]+}}, {{x[0-9]+}}, #1
 ; CHECK: add {{w[0-9]+}}, {{w[0-9]+}}, #1
 ; CHECK-NEXT: cmp {{w[0-9]+}}, {{w[0-9]+}}
-define void @test1_signed([8 x i8]* nocapture %a, i8* nocapture readonly %box, i8 %limit) minsize {
+define void @test1_signed([8 x i8]* nocapture %a, i8* nocapture readonly %box, i8 %limit, i64 %inv) minsize {
 entry:
   %conv = zext i8 %limit to i32
   %cmp223 = icmp eq i8 %limit, 0
@@ -14,7 +14,7 @@ entry:
 
 for.body4.us:
   %indvars.iv = phi i64 [ 0, %for.body4.lr.ph.us ], [ %indvars.iv.next, %for.body4.us ]
-  %arrayidx6.us = getelementptr inbounds [8 x i8], [8 x i8]* %a, i64 %indvars.iv26, i64 %indvars.iv
+  %arrayidx6.us = getelementptr inbounds [8 x i8], [8 x i8]* %a, i64 %indvars.iv, i64 %inv
   %0 = load i8, i8* %arrayidx6.us, align 1
   %idxprom7.us = zext i8 %0 to i64
   %arrayidx8.us = getelementptr inbounds i8, i8* %box, i64 %idxprom7.us
diff --git a/test/CodeGen/AArch64/fast-isel-tail-call.ll b/test/CodeGen/AArch64/fast-isel-tail-call.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0efaa373448627eb46208e48dfa93bb9f466fa17
--- /dev/null
+++ b/test/CodeGen/AArch64/fast-isel-tail-call.ll
@@ -0,0 +1,24 @@
+; RUN: llc -fast-isel -pass-remarks-missed=isel -pass-remarks-missed=isel \
+; RUN:     -mtriple arm64-- < %s 2> %t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix MISSED
+
+%struct = type { [4 x i32] }
+
+declare %struct @external()
+
+; Check that, when fastisel falls back to SDAG, we don't emit instructions
+; that follow a tail-call and would have been dropped by pure SDAGISel.
+
+; Here, the %struct extractvalue should fail FastISel.
+
+; MISSED: FastISel missed:   %tmp1 = extractvalue %struct %tmp0, 0
+
+; CHECK-LABEL: test:
+; CHECK: b external
+; CHECK-NEXT: .Lfunc_end0:
+define i32 @test() nounwind {
+  %tmp0 = tail call %struct @external()
+  %tmp1 = extractvalue %struct %tmp0, 0
+  %tmp2 = extractvalue [4 x i32] %tmp1, 0
+  ret i32 %tmp2
+}
diff --git a/test/CodeGen/AArch64/fast-isel-tbz.ll b/test/CodeGen/AArch64/fast-isel-tbz.ll
index af817777143ddb2955ad3ccfe7adefb90faef8ef..d6d10318bf02539c191c7538326b748d33ae757a 100644
--- a/test/CodeGen/AArch64/fast-isel-tbz.ll
+++ b/test/CodeGen/AArch64/fast-isel-tbz.ll
@@ -278,8 +278,24 @@ bb2:
 ; Test that we don't fold the 'and' instruction into the compare.
 define i32 @icmp_eq_and_i32(i32 %a, i1 %c) {
 ; CHECK-LABEL: icmp_eq_and_i32
-; CHECK:       and  [[REG:w[0-9]+]], w0, #0x4
+; CHECK:       and  [[REG:w[0-9]+]], w0, #0x3
 ; CHECK-NEXT:  cbz  [[REG]], {{LBB.+_3}}
+  %1 = and i32 %a, 3
+  br i1 %c, label %bb0, label %bb2
+bb0:
+  %2 = icmp eq i32 %1, 0
+  br i1 %2, label %bb1, label %bb2, !prof !0
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+; Test that we do fold the 'and' instruction into the compare and
+; generate a tbz instruction for the conditional branch.
+define i32 @icmp_eq_and1bit_i32(i32 %a, i1 %c) {
+; CHECK-LABEL: icmp_eq_and1bit_i32
+; CHECK:       tbz  {{w[0-9]+}}, #2, {{LBB.+_3}}
   %1 = and i32 %a, 4
   br i1 %c, label %bb0, label %bb2
 bb0:
diff --git a/test/CodeGen/AArch64/ldst-opt-aa.mir b/test/CodeGen/AArch64/ldst-opt-aa.mir
new file mode 100644
index 0000000000000000000000000000000000000000..808926ae3cd1fd4dca64ee26a175d7bd592f761f
--- /dev/null
+++ b/test/CodeGen/AArch64/ldst-opt-aa.mir
@@ -0,0 +1,30 @@
+# RUN: llc -mtriple=aarch64--linux-gnu -run-pass=aarch64-ldst-opt %s -verify-machineinstrs -o - | FileCheck %s
+--- |
+  define void @ldr_str_aa(i32* noalias nocapture %x, i32* noalias nocapture readonly %y) {
+  entry:
+    %0 = load i32, i32* %y, align 4
+    store i32 %0, i32* %x, align 4
+    %arrayidx2 = getelementptr inbounds i32, i32* %y, i32 1
+    %1 = load i32, i32* %arrayidx2, align 4
+    %arrayidx3 = getelementptr inbounds i32, i32* %x, i32 1
+    store i32 %1, i32* %arrayidx3, align 4
+    ret void
+  }
+
+...
+---
+# CHECK-LABEL: name: ldr_str_aa
+# CHECK: %w8, %w9 = LDPWi %x1, 0
+# CHECK: STPWi %w8, %w9, %x0, 0
+name:            ldr_str_aa
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    liveins: %x0, %x1
+
+    %w8 = LDRWui %x1, 0 :: (load 4 from %ir.y)
+    STRWui killed %w8, %x0, 0 :: (store 4 into %ir.x)
+    %w9 = LDRWui killed %x1, 1 :: (load 4 from %ir.arrayidx2)
+    STRWui killed %w9, killed %x0, 1 :: (store 4 into %ir.arrayidx3)
+    RET undef %lr
+
diff --git a/test/CodeGen/AArch64/ldst-opt.ll b/test/CodeGen/AArch64/ldst-opt.ll
index 81e4b19e6eea2dec879473c450ca2e2a01069e38..b09fab8d8b465825c0160dc1725e8576c928886a 100644
--- a/test/CodeGen/AArch64/ldst-opt.ll
+++ b/test/CodeGen/AArch64/ldst-opt.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck --check-prefix=CHECK --check-prefix=NOSTRICTALIGN %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+strict-align -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck --check-prefix=CHECK --check-prefix=STRICTALIGN %s
 
 ; This file contains tests for the AArch64 load/store optimizer.
 
@@ -119,7 +120,7 @@ define void @load-pre-indexed-doubleword(%struct.doubleword* %ptr) nounwind {
 ; CHECK: ldr x{{[0-9]+}}, [x{{[0-9]+}}, #32]!
 entry:
   %a = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 0
-  %add = load i64, i64* %a, align 4
+  %add = load i64, i64* %a, align 8
   br label %bar
 bar:
   %c = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1
@@ -132,7 +133,7 @@ define void @store-pre-indexed-doubleword(%struct.doubleword* %ptr, i64 %val) no
 ; CHECK: str x{{[0-9]+}}, [x{{[0-9]+}}, #32]!
 entry:
   %a = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 0
-  store i64 %val, i64* %a, align 4
+  store i64 %val, i64* %a, align 8
   br label %bar
 bar:
   %c = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1
@@ -147,7 +148,7 @@ define void @load-pre-indexed-quadword(%struct.quadword* %ptr) nounwind {
 ; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}, #32]!
 entry:
   %a = getelementptr inbounds %struct.quadword, %struct.quadword* %ptr, i64 0, i32 1, i32 0
-  %add = load fp128, fp128* %a, align 4
+  %add = load fp128, fp128* %a, align 16
   br label %bar
 bar:
   %c = getelementptr inbounds %struct.quadword, %struct.quadword* %ptr, i64 0, i32 1
@@ -160,7 +161,7 @@ define void @store-pre-indexed-quadword(%struct.quadword* %ptr, fp128 %val) noun
 ; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}, #32]!
 entry:
   %a = getelementptr inbounds %struct.quadword, %struct.quadword* %ptr, i64 0, i32 1, i32 0
-  store fp128 %val, fp128* %a, align 4
+  store fp128 %val, fp128* %a, align 16
   br label %bar
 bar:
   %c = getelementptr inbounds %struct.quadword, %struct.quadword* %ptr, i64 0, i32 1
@@ -203,7 +204,7 @@ define void @load-pre-indexed-double(%struct.double* %ptr) nounwind {
 ; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+}}, #32]!
 entry:
   %a = getelementptr inbounds %struct.double, %struct.double* %ptr, i64 0, i32 1, i32 0
-  %add = load double, double* %a, align 4
+  %add = load double, double* %a, align 8
   br label %bar
 bar:
   %c = getelementptr inbounds %struct.double, %struct.double* %ptr, i64 0, i32 1
@@ -216,7 +217,7 @@ define void @store-pre-indexed-double(%struct.double* %ptr, double %val) nounwin
 ; CHECK: str d{{[0-9]+}}, [x{{[0-9]+}}, #32]!
 entry:
   %a = getelementptr inbounds %struct.double, %struct.double* %ptr, i64 0, i32 1, i32 0
-  store double %val, double* %a, align 4
+  store double %val, double* %a, align 8
   br label %bar
 bar:
   %c = getelementptr inbounds %struct.double, %struct.double* %ptr, i64 0, i32 1
@@ -1340,7 +1341,8 @@ end:
 define void @merge_zr32(i32* %p) {
 ; CHECK-LABEL: merge_zr32:
 ; CHECK: // %entry
-; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
+; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}]
+; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
 entry:
   store i32 0, i32* %p
@@ -1349,11 +1351,13 @@ entry:
   ret void
 }
 
-; Same sa merge_zr32 but the merged stores should also get paried.
+; Same as merge_zr32 but the merged stores should also get paired.
 define void @merge_zr32_2(i32* %p) {
 ; CHECK-LABEL: merge_zr32_2:
 ; CHECK: // %entry
-; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
+; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8]
 ; CHECK-NEXT: ret
 entry:
   store i32 0, i32* %p
@@ -1370,7 +1374,11 @@ entry:
 define void @merge_zr32_2_offset(i32* %p) {
 ; CHECK-LABEL: merge_zr32_2_offset:
 ; CHECK: // %entry
-; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #504]
+; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #504]
+; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #504]
+; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #508]
+; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #512]
+; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #516]
 ; CHECK-NEXT: ret
 entry:
   %p0 = getelementptr i32, i32* %p, i32 126
@@ -1390,8 +1398,12 @@ entry:
 define void @no_merge_zr32_2_offset(i32* %p) {
 ; CHECK-LABEL: no_merge_zr32_2_offset:
 ; CHECK: // %entry
-; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
-; CHECK-NEXT: str q[[REG]], [x{{[0-9]+}}, #4096]
+; NOSTRICTALIGN-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; NOSTRICTALIGN-NEXT: str q[[REG]], [x{{[0-9]+}}, #4096]
+; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4096]
+; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4100]
+; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4104]
+; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4108]
 ; CHECK-NEXT: ret
 entry:
   %p0 = getelementptr i32, i32* %p, i32 1024
@@ -1411,8 +1423,12 @@ entry:
 define void @merge_zr32_3(i32* %p) {
 ; CHECK-LABEL: merge_zr32_3:
 ; CHECK: // %entry
-; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
-; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
+; NOSTRICTALIGN-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; NOSTRICTALIGN-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
+; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
+; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8]
+; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #16]
+; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #24]
 ; CHECK-NEXT: ret
 entry:
   store i32 0, i32* %p
@@ -1437,7 +1453,8 @@ entry:
 define void @merge_zr32_2vec(<2 x i32>* %p) {
 ; CHECK-LABEL: merge_zr32_2vec:
 ; CHECK: // %entry
-; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
+; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}]
+; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
 entry:
   store <2 x i32> zeroinitializer, <2 x i32>* %p
@@ -1448,8 +1465,10 @@ entry:
 define void @merge_zr32_3vec(<3 x i32>* %p) {
 ; CHECK-LABEL: merge_zr32_3vec:
 ; CHECK: // %entry
-; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
-; CHECK-NEXT: str wzr, [x{{[0-9]+}}, #8]
+; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}]
+; NOSTRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #8]
+; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
+; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #8]
 ; CHECK-NEXT: ret
 entry:
   store <3 x i32> zeroinitializer, <3 x i32>* %p
@@ -1460,7 +1479,9 @@ entry:
 define void @merge_zr32_4vec(<4 x i32>* %p) {
 ; CHECK-LABEL: merge_zr32_4vec:
 ; CHECK: // %entry
-; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
+; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8]
 ; CHECK-NEXT: ret
 entry:
   store <4 x i32> zeroinitializer, <4 x i32>* %p
@@ -1471,7 +1492,8 @@ entry:
 define void @merge_zr32_2vecf(<2 x float>* %p) {
 ; CHECK-LABEL: merge_zr32_2vecf:
 ; CHECK: // %entry
-; CHECK-NEXT: str xzr, [x{{[0-9]+}}]
+; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}]
+; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
 entry:
   store <2 x float> zeroinitializer, <2 x float>* %p
@@ -1482,7 +1504,9 @@ entry:
 define void @merge_zr32_4vecf(<4 x float>* %p) {
 ; CHECK-LABEL: merge_zr32_4vecf:
 ; CHECK: // %entry
-; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
+; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8]
 ; CHECK-NEXT: ret
 entry:
   store <4 x float> zeroinitializer, <4 x float>* %p
@@ -1502,13 +1526,42 @@ entry:
   ret void
 }
 
+; Similar to merge_zr32, but for 64-bit values and with unaligned stores.
+define void @merge_zr64_unalign(<2 x i64>* %p) {
+; CHECK-LABEL: merge_zr64_unalign:
+; CHECK: // %entry
+; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; STRICTALIGN: strb wzr,
+; STRICTALIGN: strb
+; STRICTALIGN: strb
+; STRICTALIGN: strb
+; STRICTALIGN: strb
+; STRICTALIGN: strb
+; STRICTALIGN: strb
+; STRICTALIGN: strb
+; STRICTALIGN: strb
+; STRICTALIGN: strb
+; STRICTALIGN: strb
+; STRICTALIGN: strb
+; STRICTALIGN: strb
+; STRICTALIGN: strb
+; STRICTALIGN: strb
+; STRICTALIGN: strb
+; CHECK-NEXT: ret
+entry:
+  store <2 x i64> zeroinitializer, <2 x i64>* %p, align 1
+  ret void
+}
+
 ; Similar to merge_zr32_3, replaceZeroVectorStore should not split the
 ; vector store since the zero constant vector has multiple uses.
 define void @merge_zr64_2(i64* %p) {
 ; CHECK-LABEL: merge_zr64_2:
 ; CHECK: // %entry
-; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
-; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
+; NOSTRICTALIGN-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; NOSTRICTALIGN-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
+; STRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; STRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #16]
 ; CHECK-NEXT: ret
 entry:
   store i64 0, i64* %p
diff --git a/test/CodeGen/AArch64/ldst-opt.mir b/test/CodeGen/AArch64/ldst-opt.mir
index 8f0b71be34830d1686bc5e9470d15225de829bf1..f7641d3ffd04cdbb5d3a14d8b7eda5edf7ff253b 100644
--- a/test/CodeGen/AArch64/ldst-opt.mir
+++ b/test/CodeGen/AArch64/ldst-opt.mir
@@ -1,10 +1,4 @@
-# RUN: llc -mtriple=aarch64--linux-gnu -run-pass=aarch64-ldst-opt %s -verify-machineinstrs -o - 2>&1 | FileCheck %s
---- |
-  define void @promote-load-from-store() { ret void }
-  define void @store-pair() { ret void }
-  define void @store-pair-clearkill0() { ret void }
-  define void @store-pair-clearkill1() { ret void }
-...
+# RUN: llc -mtriple=aarch64--linux-gnu -run-pass=aarch64-ldst-opt %s -verify-machineinstrs -o - | FileCheck %s
 ---
 name: promote-load-from-store
 tracksRegLiveness: true
@@ -130,3 +124,23 @@ body: |
 # CHECK-NOT: %w2 = COPY killed %w1
 # CHECK: %w2 = COPY %w1
 # CHECK: STPWi %w1, killed %w2, killed %x0, 0
+---
+name: store-load-clearkill
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: %w1
+
+    STRWui %w1, %sp, 0 :: (store 4)
+    %wzr = COPY killed %w1 ; killing use of %w1
+    %w11 = LDRWui %sp, 0 :: (load 4)
+    HINT 0, implicit %w11 ; some use of %w11
+...
+# When replacing the load of a store-load pair with a copy, the kill flags
+# along the way need to be cleared.
+# CHECK-LABEL: name: store-load-clearkill
+# CHECK: STRWui %w1, %sp, 0 :: (store 4)
+# CHECK-NOT: COPY killed %w1
+# CHECK: %wzr = COPY %w1
+# CHECK: %w11 = ORRWrs %wzr, %w1, 0
+# CHECK: HINT 0, implicit %w11
diff --git a/test/CodeGen/AArch64/ldst-zero.ll b/test/CodeGen/AArch64/ldst-zero.ll
new file mode 100644
index 0000000000000000000000000000000000000000..95b92ac70879ffb34282f7d5183d3d878b316991
--- /dev/null
+++ b/test/CodeGen/AArch64/ldst-zero.ll
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple=aarch64 -mcpu=cortex-a53 < %s | FileCheck %s
+
+; Tests to check that zero stores which are generated as STP xzr, xzr aren't
+; scheduled incorrectly due to incorrect alias information
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+%struct.tree_common = type { i8*, i8*, i32 }
+
+; Original test case which exhibited the bug
+define void @test1(%struct.tree_common* %t, i32 %code, i8* %type) {
+; CHECK-LABEL: test1:
+; CHECK: stp xzr, xzr, [x0, #8]
+; CHECK: stp xzr, x2, [x0]
+; CHECK: str w1, [x0, #16]
+entry:
+  %0 = bitcast %struct.tree_common* %t to i8*
+  tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 24, i32 8, i1 false)
+  %code1 = getelementptr inbounds %struct.tree_common, %struct.tree_common* %t, i64 0, i32 2
+  store i32 %code, i32* %code1, align 8
+  %type2 = getelementptr inbounds %struct.tree_common, %struct.tree_common* %t, i64 0, i32 1
+  store i8* %type, i8** %type2, align 8
+  ret void
+}
+
+; Store to each struct element instead of using memset
+define void @test2(%struct.tree_common* %t, i32 %code, i8* %type) {
+; CHECK-LABEL: test2:
+; CHECK: stp xzr, xzr, [x0]
+; CHECK: str wzr, [x0, #16]
+; CHECK: str w1, [x0, #16]
+; CHECK: str x2, [x0, #8]
+entry:
+  %0 = getelementptr inbounds %struct.tree_common, %struct.tree_common* %t, i64 0, i32 0
+  %1 = getelementptr inbounds %struct.tree_common, %struct.tree_common* %t, i64 0, i32 1
+  %2 = getelementptr inbounds %struct.tree_common, %struct.tree_common* %t, i64 0, i32 2
+  store i8* zeroinitializer, i8** %0, align 8
+  store i8* zeroinitializer, i8** %1, align 8
+  store i32 zeroinitializer, i32* %2, align 8
+  store i32 %code, i32* %2, align 8
+  store i8* %type, i8** %1, align 8
+  ret void
+}
+
+; Vector store instead of memset
+define void @test3(%struct.tree_common* %t, i32 %code, i8* %type) {
+; CHECK-LABEL: test3:
+; CHECK: stp xzr, xzr, [x0, #8]
+; CHECK: stp xzr, x2, [x0]
+; CHECK: str w1, [x0, #16]
+entry:
+  %0 = bitcast %struct.tree_common* %t to <3 x i64>*
+  store <3 x i64> zeroinitializer, <3 x i64>* %0, align 8
+  %code1 = getelementptr inbounds %struct.tree_common, %struct.tree_common* %t, i64 0, i32 2
+  store i32 %code, i32* %code1, align 8
+  %type2 = getelementptr inbounds %struct.tree_common, %struct.tree_common* %t, i64 0, i32 1
+  store i8* %type, i8** %type2, align 8
+  ret void
+}
+
+; Vector store, then store to vector elements
+define void @test4(<3 x i64>* %p, i64 %x, i64 %y) {
+; CHECK-LABEL: test4:
+; CHECK: stp xzr, xzr, [x0, #8]
+; CHECK: stp xzr, x2, [x0]
+; CHECK: str x1, [x0, #16]
+entry:
+  store <3 x i64> zeroinitializer, <3 x i64>* %p, align 8
+  %0 = bitcast <3 x i64>* %p to i64*
+  %1 = getelementptr inbounds i64, i64* %0, i64 2
+  store i64 %x, i64* %1, align 8
+  %2 = getelementptr inbounds i64, i64* %0, i64 1
+  store i64 %y, i64* %2, align 8
+  ret void
+}
diff --git a/test/CodeGen/AArch64/load-combine-big-endian.ll b/test/CodeGen/AArch64/load-combine-big-endian.ll
index b19ef3ec6927afeab778323a810c7a9369d19995..918ceaeb1b4fa4dbdb6e28491f827dd66f4c4a70 100644
--- a/test/CodeGen/AArch64/load-combine-big-endian.ll
+++ b/test/CodeGen/AArch64/load-combine-big-endian.ll
@@ -191,3 +191,394 @@ define i64 @load_i64_by_i8(i64* %arg) {
   %tmp37 = or i64 %tmp33, %tmp36
   ret i64 %tmp37
 }
+
+; i8* p; // p[1] is 4 byte aligned
+; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
+define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
+; CHECK: ldur  w8, [x0, #1]
+; CHECK-NEXT: rev w0, w8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 4
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[-4] is 4 byte aligned
+; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24)
+define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_neg_offset:
+; CHECK: ldur  w8, [x0, #-4]
+; CHECK-NEXT: rev w0, w8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp2 = load i8, i8* %tmp1, align 4
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[1] is 4 byte aligned
+; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24)
+define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap:
+; CHECK: ldur w0, [x0, #1]
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp15 = load i8, i8* %tmp14, align 4
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[-4] is 4 byte aligned
+; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24)
+define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap:
+; CHECK: ldur w0, [x0, #-4]
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp15 = load i8, i8* %tmp14, align 4
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+declare i16 @llvm.bswap.i16(i16)
+
+; i16* p; // p is 4 byte aligned
+; (i32) bswap(p[0]) | ((i32) bswap(p[1]) << 16)
+define i32 @load_i32_by_bswap_i16(i32* %arg) {
+; CHECK-LABEL: load_i32_by_bswap_i16:
+; CHECK: ldr   w8, [x0]
+; CHECK-NEXT: rev w0, w8
+; CHECK-NEXT: ret
+  %tmp = bitcast i32* %arg to i16*
+  %tmp1 = load i16, i16* %tmp, align 4
+  %tmp11 = call i16 @llvm.bswap.i16(i16 %tmp1)
+  %tmp2 = zext i16 %tmp11 to i32
+  %tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1
+  %tmp4 = load i16, i16* %tmp3, align 1
+  %tmp41 = call i16 @llvm.bswap.i16(i16 %tmp4)
+  %tmp5 = zext i16 %tmp41 to i32
+  %tmp6 = shl nuw nsw i32 %tmp5, 16
+  %tmp7 = or i32 %tmp6, %tmp2
+  ret i32 %tmp7
+}
+
+; i16* p; // p is 4 byte aligned
+; (i32) p[1] | ((sext p[0] to i32) << 16)
+define i32 @load_i32_by_sext_i16(i32* %arg) {
+; CHECK-LABEL: load_i32_by_sext_i16:
+; CHECK: ldr   w0, [x0]
+; CHECK-NEXT: ret
+  %tmp = bitcast i32* %arg to i16*
+  %tmp1 = load i16, i16* %tmp, align 4
+  %tmp2 = sext i16 %tmp1 to i32
+  %tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1
+  %tmp4 = load i16, i16* %tmp3, align 1
+  %tmp5 = zext i16 %tmp4 to i32
+  %tmp6 = shl nuw nsw i32 %tmp2, 16
+  %tmp7 = or i32 %tmp6, %tmp5
+  ret i32 %tmp7
+}
+
+; i8* arg; i32 i;
+; p = arg + 12;
+; (i32) p[i] | ((i32) p[i + 1] << 8) | ((i32) p[i + 2] << 16) | ((i32) p[i + 3] << 24)
+define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) {
+; CHECK-LABEL: load_i32_by_i8_base_offset_index:
+; CHECK: add  x8, x0, w1, uxtw
+; CHECK-NEXT: ldr w8, [x8, #12]
+; CHECK-NEXT: rev w0, w8
+; CHECK-NEXT: ret
+  %tmp = add nuw nsw i32 %i, 3
+  %tmp2 = add nuw nsw i32 %i, 2
+  %tmp3 = add nuw nsw i32 %i, 1
+  %tmp4 = getelementptr inbounds i8, i8* %arg, i64 12
+  %tmp5 = zext i32 %i to i64
+  %tmp6 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp5
+  %tmp7 = load i8, i8* %tmp6, align 4
+  %tmp8 = zext i8 %tmp7 to i32
+  %tmp9 = zext i32 %tmp3 to i64
+  %tmp10 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp9
+  %tmp11 = load i8, i8* %tmp10, align 1
+  %tmp12 = zext i8 %tmp11 to i32
+  %tmp13 = shl nuw nsw i32 %tmp12, 8
+  %tmp14 = or i32 %tmp13, %tmp8
+  %tmp15 = zext i32 %tmp2 to i64
+  %tmp16 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp15
+  %tmp17 = load i8, i8* %tmp16, align 1
+  %tmp18 = zext i8 %tmp17 to i32
+  %tmp19 = shl nuw nsw i32 %tmp18, 16
+  %tmp20 = or i32 %tmp14, %tmp19
+  %tmp21 = zext i32 %tmp to i64
+  %tmp22 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp21
+  %tmp23 = load i8, i8* %tmp22, align 1
+  %tmp24 = zext i8 %tmp23 to i32
+  %tmp25 = shl nuw i32 %tmp24, 24
+  %tmp26 = or i32 %tmp20, %tmp25
+  ret i32 %tmp26
+}
+
+; i8* arg; i32 i;
+; p = arg + 12;
+; (i32) p[i + 1] | ((i32) p[i + 2] << 8) | ((i32) p[i + 3] << 16) | ((i32) p[i + 4] << 24)
+define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) {
+; CHECK-LABEL: load_i32_by_i8_base_offset_index_2:
+; CHECK: add x8, x0, w1, uxtw
+; CHECK-NEXT: ldur  w8, [x8, #13]
+; CHECK-NEXT: rev w0, w8
+; CHECK-NEXT: ret
+  %tmp = add nuw nsw i32 %i, 4
+  %tmp2 = add nuw nsw i32 %i, 3
+  %tmp3 = add nuw nsw i32 %i, 2
+  %tmp4 = getelementptr inbounds i8, i8* %arg, i64 12
+  %tmp5 = add nuw nsw i32 %i, 1
+  %tmp27 = zext i32 %tmp5 to i64
+  %tmp28 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp27
+  %tmp29 = load i8, i8* %tmp28, align 4
+  %tmp30 = zext i8 %tmp29 to i32
+  %tmp31 = zext i32 %tmp3 to i64
+  %tmp32 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp31
+  %tmp33 = load i8, i8* %tmp32, align 1
+  %tmp34 = zext i8 %tmp33 to i32
+  %tmp35 = shl nuw nsw i32 %tmp34, 8
+  %tmp36 = or i32 %tmp35, %tmp30
+  %tmp37 = zext i32 %tmp2 to i64
+  %tmp38 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp37
+  %tmp39 = load i8, i8* %tmp38, align 1
+  %tmp40 = zext i8 %tmp39 to i32
+  %tmp41 = shl nuw nsw i32 %tmp40, 16
+  %tmp42 = or i32 %tmp36, %tmp41
+  %tmp43 = zext i32 %tmp to i64
+  %tmp44 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp43
+  %tmp45 = load i8, i8* %tmp44, align 1
+  %tmp46 = zext i8 %tmp45 to i32
+  %tmp47 = shl nuw i32 %tmp46, 24
+  %tmp48 = or i32 %tmp42, %tmp47
+  ret i32 %tmp48
+}
+; i8* p; // p is 2 byte aligned
+; (i32) p[0] | ((i32) p[1] << 8)
+define i32 @zext_load_i32_by_i8(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8:
+; CHECK: ldrb  w8, [x0]
+; CHECK-NEXT: ldrb  w9, [x0, #1]
+; CHECK-NEXT: bfi w8, w9, #8, #8
+; CHECK-NEXT: mov  w0, w8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp2 = load i8, i8* %tmp1, align 2
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  ret i32 %tmp8
+}
+
+; i8* p; // p is 2 byte aligned
+; ((i32) p[0] << 8) | ((i32) p[1] << 16)
+define i32 @zext_load_i32_by_i8_shl_8(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_shl_8:
+; CHECK: ldrb  w8, [x0]
+; CHECK-NEXT: ldrb  w9, [x0, #1]
+; CHECK-NEXT: lsl w0, w8, #8
+; CHECK-NEXT: bfi w0, w9, #16, #8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp2 = load i8, i8* %tmp1, align 2
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 8
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 16
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
+
+; i8* p; // p is 2 byte aligned
+; ((i32) p[0] << 16) | ((i32) p[1] << 24)
+define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_shl_16:
+; CHECK: ldrb  w8, [x0]
+; CHECK-NEXT: ldrb  w9, [x0, #1]
+; CHECK-NEXT: lsl w0, w8, #16
+; CHECK-NEXT: bfi w0, w9, #24, #8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp2 = load i8, i8* %tmp1, align 2
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 16
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 24
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
+; i8* p; // p is 2 byte aligned
+; (i32) p[1] | ((i32) p[0] << 8)
+define i32 @zext_load_i32_by_i8_bswap(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_bswap:
+; CHECK: ldrb  w8, [x0, #1]
+; CHECK-NEXT: ldrb    w9, [x0]
+; CHECK-NEXT: bfi w8, w9, #8, #8
+; CHECK-NEXT: mov  w0, w8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp5 = load i8, i8* %tmp4, align 2
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  ret i32 %tmp8
+}
+
+; i8* p; // p is 2 byte aligned
+; ((i32) p[1] << 8) | ((i32) p[0] << 16)
+define i32 @zext_load_i32_by_i8_bswap_shl_8(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_8:
+; CHECK: ldrb  w8, [x0, #1]
+; CHECK-NEXT: ldrb    w9, [x0]
+; CHECK-NEXT: lsl w0, w8, #8
+; CHECK-NEXT: bfi w0, w9, #16, #8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 8
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp5 = load i8, i8* %tmp4, align 2
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 16
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
+
+; i8* p; // p is 2 byte aligned
+; ((i32) p[1] << 16) | ((i32) p[0] << 24)
+define i32 @zext_load_i32_by_i8_bswap_shl_16(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_16:
+; CHECK: ldrb  w8, [x0, #1]
+; CHECK-NEXT: ldrb    w9, [x0]
+; CHECK-NEXT: lsl w0, w8, #16
+; CHECK-NEXT: bfi w0, w9, #24, #8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 16
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp5 = load i8, i8* %tmp4, align 2
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 24
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
+
+; i8* p;
+; i16* p1.i16 = (i16*) p;
+; (p1.i16[0] << 8) | ((i16) p[2])
+;
+; This is essentially an i16 load from p[1], but we don't fold the pattern now
+; because in the original DAG we don't have p[1] address available
+define i16 @load_i16_from_nonzero_offset(i8* %p) {
+; CHECK-LABEL: load_i16_from_nonzero_offset:
+; CHECK:  ldrh    w8, [x0]
+; CHECK-NEXT: ldrb  w0, [x0, #2]
+; CHECK-NEXT: bfi w0, w8, #8, #24
+; CHECK-NEXT: ret
+
+  %p1.i16 = bitcast i8* %p to i16*
+  %p2.i8 = getelementptr i8, i8* %p, i64 2
+  %v1 = load i16, i16* %p1.i16
+  %v2.i8 = load i8, i8* %p2.i8
+  %v2 = zext i8 %v2.i8 to i16
+  %v1.shl = shl i16 %v1, 8
+  %res = or i16 %v1.shl, %v2
+  ret i16 %res
+}
diff --git a/test/CodeGen/AArch64/load-combine.ll b/test/CodeGen/AArch64/load-combine.ll
index 4644fa263eeda5738a33f3b7fd20149f985800ed..f0ed40357f12269e4d832c40d6e63a1eadd202d5 100644
--- a/test/CodeGen/AArch64/load-combine.ll
+++ b/test/CodeGen/AArch64/load-combine.ll
@@ -178,3 +178,371 @@ define i64 @load_i64_by_i8_bswap(i64* %arg) {
   %tmp37 = or i64 %tmp33, %tmp36
   ret i64 %tmp37
 }
+
+; i8* p; // p[1] is 4 byte aligned
+; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
+define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
+; CHECK: ldur w0, [x0, #1]
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 4
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[-4] is 4 byte aligned
+; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24)
+define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_neg_offset:
+; CHECK: ldur w0, [x0, #-4]
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp2 = load i8, i8* %tmp1, align 4
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[1] is 4 byte aligned
+; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24)
+define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap:
+; CHECK: ldur  w8, [x0, #1]
+; CHECK-NEXT: rev w0, w8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp15 = load i8, i8* %tmp14, align 4
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[-4] is 4 byte aligned
+; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24)
+define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap:
+; CHECK: ldur  w8, [x0, #-4]
+; CHECK-NEXT: rev w0, w8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp15 = load i8, i8* %tmp14, align 4
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+declare i16 @llvm.bswap.i16(i16)
+
+; i16* p; // p is 4 byte aligned
+; (i32) bswap(p[1]) | ((i32) bswap(p[0]) << 16)
+define i32 @load_i32_by_bswap_i16(i32* %arg) {
+; CHECK-LABEL: load_i32_by_bswap_i16:
+; CHECK: ldr    w8, [x0]
+; CHECK-NEXT: rev w0, w8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i16*
+  %tmp1 = load i16, i16* %tmp, align 4
+  %tmp11 = call i16 @llvm.bswap.i16(i16 %tmp1)
+  %tmp2 = zext i16 %tmp11 to i32
+  %tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1
+  %tmp4 = load i16, i16* %tmp3, align 1
+  %tmp41 = call i16 @llvm.bswap.i16(i16 %tmp4)
+  %tmp5 = zext i16 %tmp41 to i32
+  %tmp6 = shl nuw nsw i32 %tmp2, 16
+  %tmp7 = or i32 %tmp6, %tmp5
+  ret i32 %tmp7
+}
+
+; i16* p; // p is 4 byte aligned
+; (i32) p[0] | ((sext p[1] to i32) << 16)
+define i32 @load_i32_by_sext_i16(i32* %arg) {
+; CHECK-LABEL: load_i32_by_sext_i16:
+; CHECK: ldr   w0, [x0]
+; CHECK-NEXT: ret
+  %tmp = bitcast i32* %arg to i16*
+  %tmp1 = load i16, i16* %tmp, align 4
+  %tmp2 = zext i16 %tmp1 to i32
+  %tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1
+  %tmp4 = load i16, i16* %tmp3, align 1
+  %tmp5 = sext i16 %tmp4 to i32
+  %tmp6 = shl nuw nsw i32 %tmp5, 16
+  %tmp7 = or i32 %tmp6, %tmp2
+  ret i32 %tmp7
+}
+
+; i8* arg; i32 i;
+; p = arg + 12;
+; (i32) p[i] | ((i32) p[i + 1] << 8) | ((i32) p[i + 2] << 16) | ((i32) p[i + 3] << 24)
+define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) {
+; CHECK-LABEL: load_i32_by_i8_base_offset_index:
+; CHECK: add x8, x0, w1, uxtw
+; CHECK-NEXT: ldr w0, [x8, #12]
+; CHECK-NEXT: ret
+  %tmp = add nuw nsw i32 %i, 3
+  %tmp2 = add nuw nsw i32 %i, 2
+  %tmp3 = add nuw nsw i32 %i, 1
+  %tmp4 = getelementptr inbounds i8, i8* %arg, i64 12
+  %tmp5 = zext i32 %i to i64
+  %tmp6 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp5
+  %tmp7 = load i8, i8* %tmp6, align 4
+  %tmp8 = zext i8 %tmp7 to i32
+  %tmp9 = zext i32 %tmp3 to i64
+  %tmp10 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp9
+  %tmp11 = load i8, i8* %tmp10, align 1
+  %tmp12 = zext i8 %tmp11 to i32
+  %tmp13 = shl nuw nsw i32 %tmp12, 8
+  %tmp14 = or i32 %tmp13, %tmp8
+  %tmp15 = zext i32 %tmp2 to i64
+  %tmp16 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp15
+  %tmp17 = load i8, i8* %tmp16, align 1
+  %tmp18 = zext i8 %tmp17 to i32
+  %tmp19 = shl nuw nsw i32 %tmp18, 16
+  %tmp20 = or i32 %tmp14, %tmp19
+  %tmp21 = zext i32 %tmp to i64
+  %tmp22 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp21
+  %tmp23 = load i8, i8* %tmp22, align 1
+  %tmp24 = zext i8 %tmp23 to i32
+  %tmp25 = shl nuw i32 %tmp24, 24
+  %tmp26 = or i32 %tmp20, %tmp25
+  ret i32 %tmp26
+}
+
+; i8* arg; i32 i;
+; p = arg + 12;
+; (i32) p[i + 1] | ((i32) p[i + 2] << 8) | ((i32) p[i + 3] << 16) | ((i32) p[i + 4] << 24)
+define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) {
+; CHECK-LABEL: load_i32_by_i8_base_offset_index_2:
+; CHECK: add x8, x0, w1, uxtw
+; CHECK-NEXT: ldur  w0, [x8, #13]
+; CHECK-NEXT: ret
+  %tmp = add nuw nsw i32 %i, 4
+  %tmp2 = add nuw nsw i32 %i, 3
+  %tmp3 = add nuw nsw i32 %i, 2
+  %tmp4 = getelementptr inbounds i8, i8* %arg, i64 12
+  %tmp5 = add nuw nsw i32 %i, 1
+  %tmp27 = zext i32 %tmp5 to i64
+  %tmp28 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp27
+  %tmp29 = load i8, i8* %tmp28, align 4
+  %tmp30 = zext i8 %tmp29 to i32
+  %tmp31 = zext i32 %tmp3 to i64
+  %tmp32 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp31
+  %tmp33 = load i8, i8* %tmp32, align 1
+  %tmp34 = zext i8 %tmp33 to i32
+  %tmp35 = shl nuw nsw i32 %tmp34, 8
+  %tmp36 = or i32 %tmp35, %tmp30
+  %tmp37 = zext i32 %tmp2 to i64
+  %tmp38 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp37
+  %tmp39 = load i8, i8* %tmp38, align 1
+  %tmp40 = zext i8 %tmp39 to i32
+  %tmp41 = shl nuw nsw i32 %tmp40, 16
+  %tmp42 = or i32 %tmp36, %tmp41
+  %tmp43 = zext i32 %tmp to i64
+  %tmp44 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp43
+  %tmp45 = load i8, i8* %tmp44, align 1
+  %tmp46 = zext i8 %tmp45 to i32
+  %tmp47 = shl nuw i32 %tmp46, 24
+  %tmp48 = or i32 %tmp42, %tmp47
+  ret i32 %tmp48
+}
+
+; i8* p; // p is 2 byte aligned
+; (i32) p[0] | ((i32) p[1] << 8)
+define i32 @zext_load_i32_by_i8(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8:
+; CHECK: ldrb  w8, [x0]
+; CHECK-NEXT: ldrb  w9, [x0, #1]
+; CHECK-NEXT: bfi w8, w9, #8, #8
+; CHECK-NEXT: mov  w0, w8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp2 = load i8, i8* %tmp1, align 2
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  ret i32 %tmp8
+}
+
+; i8* p; // p is 2 byte aligned
+; ((i32) p[0] << 8) | ((i32) p[1] << 16)
+define i32 @zext_load_i32_by_i8_shl_8(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_shl_8:
+; CHECK: ldrb  w8, [x0]
+; CHECK-NEXT: ldrb  w9, [x0, #1]
+; CHECK-NEXT: lsl w0, w8, #8
+; CHECK-NEXT: bfi w0, w9, #16, #8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp2 = load i8, i8* %tmp1, align 2
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 8
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 16
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
+
+; i8* p; // p is 2 byte aligned
+; ((i32) p[0] << 16) | ((i32) p[1] << 24)
+define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_shl_16:
+; CHECK: ldrb  w8, [x0]
+; CHECK-NEXT: ldrb  w9, [x0, #1]
+; CHECK-NEXT: lsl w0, w8, #16
+; CHECK-NEXT: bfi w0, w9, #24, #8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp2 = load i8, i8* %tmp1, align 2
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 16
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 24
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
+; i8* p; // p is 2 byte aligned
+; (i32) p[1] | ((i32) p[0] << 8)
+define i32 @zext_load_i32_by_i8_bswap(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_bswap:
+; CHECK: ldrb  w8, [x0, #1]
+; CHECK-NEXT: ldrb    w9, [x0]
+; CHECK-NEXT: bfi w8, w9, #8, #8
+; CHECK-NEXT: mov  w0, w8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp5 = load i8, i8* %tmp4, align 2
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  ret i32 %tmp8
+}
+
+; i8* p; // p is 2 byte aligned
+; ((i32) p[1] << 8) | ((i32) p[0] << 16)
+define i32 @zext_load_i32_by_i8_bswap_shl_8(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_8:
+; CHECK: ldrb  w8, [x0, #1]
+; CHECK-NEXT: ldrb    w9, [x0]
+; CHECK-NEXT: lsl w0, w8, #8
+; CHECK-NEXT: bfi w0, w9, #16, #8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 8
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp5 = load i8, i8* %tmp4, align 2
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 16
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
+
+; i8* p; // p is 2 byte aligned
+; ((i32) p[1] << 16) | ((i32) p[0] << 24)
+define i32 @zext_load_i32_by_i8_bswap_shl_16(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_16:
+; CHECK: ldrb  w8, [x0, #1]
+; CHECK-NEXT: ldrb    w9, [x0]
+; CHECK-NEXT: lsl w0, w8, #16
+; CHECK-NEXT: bfi w0, w9, #24, #8
+; CHECK-NEXT: ret
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 16
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp5 = load i8, i8* %tmp4, align 2
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 24
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
diff --git a/test/CodeGen/AArch64/machine-combiner-madd.ll b/test/CodeGen/AArch64/machine-combiner-madd.ll
index ea3113789461b360701ae9ed8d734f027f39ed0c..4efe4e9cfb018577ce59ae47109904f1078bc773 100644
--- a/test/CodeGen/AArch64/machine-combiner-madd.ll
+++ b/test/CodeGen/AArch64/machine-combiner-madd.ll
@@ -6,7 +6,7 @@
 ; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=exynos-m1  < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=exynos-m2  < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=kryo       < %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=vulcan     < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mcpu=thunderx2t99 < %s | FileCheck %s
 
 ; Make sure that inst-combine fuses the multiply add in the addressing mode of
 ; the load.
diff --git a/test/CodeGen/AArch64/machine-copy-remove.mir b/test/CodeGen/AArch64/machine-copy-remove.mir
new file mode 100644
index 0000000000000000000000000000000000000000..6f2d3a3009b021da302983369683f84be3703dbb
--- /dev/null
+++ b/test/CodeGen/AArch64/machine-copy-remove.mir
@@ -0,0 +1,672 @@
+# RUN: llc -mtriple=aarch64--linux-gnu -run-pass=aarch64-copyelim %s -verify-machineinstrs -o - | FileCheck %s
+---
+# Check that bb.0 COPY is seen through to allow the bb.1 COPY of XZR to be removed.
+# CHECK-LABEL: name: test1
+# CHECK-NOT: COPY %xzr
+name:            test1
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: %x0, %x1
+
+    %x0 = COPY %x1
+    CBNZX %x1, %bb.2
+
+  bb.1:
+    successors: %bb.3
+
+    %x0 = COPY %xzr
+    B %bb.3
+
+  bb.2:
+    successors: %bb.3
+    liveins: %x1
+
+    %x0 = LDRXui %x1, 0
+
+  bb.3:
+    liveins: %x0
+
+    RET_ReallyLR implicit %x0
+
+...
+# Similar to test1, but with reversed COPY.
+# CHECK-LABEL: name: test2
+# CHECK-NOT: COPY %xzr
+name:            test2
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: %x0, %x1
+
+    %x1 = COPY %x0
+    CBNZX %x1, %bb.2
+
+  bb.1:
+    successors: %bb.3
+
+    %x0 = COPY %xzr
+    B %bb.3
+
+  bb.2:
+    successors: %bb.3
+    liveins: %x1
+
+    %x0 = LDRXui %x1, 0
+
+  bb.3:
+    liveins: %x0
+
+    RET_ReallyLR implicit %x0
+
+...
+# Similar to test1, but with a clobber of the COPY source (%x1) that prevents removal of the XZR COPY.
+# CHECK-LABEL: name: test3
+# CHECK: COPY %xzr
+name:            test3
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: %x0, %x1, %x2
+
+    %x0 = COPY %x1
+    %x1 = LDRXui %x1, 0
+    CBNZX %x1, %bb.2
+
+  bb.1:
+    successors: %bb.3
+
+    %x0 = COPY %xzr
+    B %bb.3
+
+  bb.2:
+    successors: %bb.3
+    liveins: %x1
+
+    %x0 = LDRXui %x1, 0
+
+  bb.3:
+    liveins: %x0
+
+    RET_ReallyLR implicit %x0
+
+...
+# Similar to test2, but with a clobber of the COPY destination (%x1) that prevents removal of the XZR COPY.
+# CHECK-LABEL: name: test4
+# CHECK: COPY %xzr
+name:            test4
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: %x0, %x1, %x2
+
+    %x1 = COPY %x0
+    %x1 = LDRXui %x1, 0
+    CBNZX %x1, %bb.2
+
+  bb.1:
+    successors: %bb.3
+
+    %x0 = COPY %xzr
+    B %bb.3
+
+  bb.2:
+    successors: %bb.3
+    liveins: %x1
+
+    %x0 = LDRXui %x1, 0
+
+  bb.3:
+    liveins: %x0
+
+    RET_ReallyLR implicit %x0
+
+...
+# Similar to test2, but with a clobber of the COPY source (%x0) that prevents removal of the XZR COPY.
+# CHECK-LABEL: name: test5
+# CHECK: COPY %xzr
+name:            test5
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: %x0, %x1, %x2
+
+    %x1 = COPY %x0
+    %x0 = LDRXui %x1, 0
+    CBNZX %x1, %bb.2
+
+  bb.1:
+    successors: %bb.3
+
+    %x0 = COPY %xzr
+    B %bb.3
+
+  bb.2:
+    successors: %bb.3
+    liveins: %x1
+
+    %x0 = LDRXui %x1, 0
+
+  bb.3:
+    liveins: %x0
+
+    RET_ReallyLR implicit %x0
+
+...
+# Similar to test1, but with two levels of COPYs.
+# CHECK-LABEL: name: test6
+# CHECK-NOT: COPY %xzr
+name:            test6
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: %x0, %x1, %x2
+
+    %x2 = COPY %x0
+    %x1 = COPY %x2
+    CBNZX %x1, %bb.2
+
+  bb.1:
+    successors: %bb.3
+
+    %x0 = COPY %xzr
+    B %bb.3
+
+  bb.2:
+    successors: %bb.3
+    liveins: %x1
+
+    %x0 = LDRXui %x1, 0
+
+  bb.3:
+    liveins: %x0
+
+    RET_ReallyLR implicit %x0
+
+...
+# Similar to test1, but with two levels of COPYs and a clobber preventing COPY of XZR removal.
+# CHECK-LABEL: name: test7
+# CHECK: COPY %xzr
+name:            test7
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: %x0, %x1, %x2
+
+    %x2 = COPY %x0
+    %x0 = LDRXui %x1, 0
+    %x1 = COPY %x2
+    CBNZX %x1, %bb.2
+
+  bb.1:
+    successors: %bb.3
+
+    %x0 = COPY %xzr
+    B %bb.3
+
+  bb.2:
+    successors: %bb.3
+    liveins: %x1
+
+    %x0 = LDRXui %x1, 0
+
+  bb.3:
+    liveins: %x0
+
+    RET_ReallyLR implicit %x0
+
+...
+# Check that the TargetRegs vector clobber update loop in
+#  AArch64RedundantCopyElimination::optimizeCopy works correctly.
+# CHECK-LABEL: name: test8
+# CHECK: x0 = COPY %xzr
+# CHECK: x1 = COPY %xzr
+name:            test8
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: %x0, %x1
+
+    %x1 = COPY %x0
+    CBNZX %x1, %bb.2
+
+  bb.1:
+    successors: %bb.3
+    liveins: %x0, %x2
+
+    %x0, %x1 = LDPXi %x2, 0
+    %x0 = COPY %xzr
+    %x1 = COPY %xzr
+    B %bb.3
+
+  bb.2:
+    successors: %bb.3
+    liveins: %x1
+
+    %x0 = LDRXui %x1, 0
+
+  bb.3:
+    liveins: %x0
+
+    RET_ReallyLR implicit %x0
+
+...
+# Check that copy isn't removed from a block with multiple predecessors.
+# CHECK-LABEL: name: test9
+# CHECK: x0 = COPY %xzr
+# CHECK-NEXT: B %bb.3
+name:            test9
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: %x0, %x1
+
+    CBNZX %x0, %bb.2
+
+  bb.1:
+    successors: %bb.3
+    liveins: %x0, %x2
+
+    %x0 = COPY %xzr
+    B %bb.3
+
+  bb.2:
+    successors: %bb.1, %bb.3
+    liveins: %x1
+
+    %x0 = LDRXui %x1, 0
+
+    CBNZX %x1, %bb.1
+
+  bb.3:
+    liveins: %x0
+
+    RET_ReallyLR implicit %x0
+
+...
+# Eliminate redundant MOVi32imm 7 in bb.1
+# Note: 32-bit compare/32-bit move imm
+# Kill marker should be removed from compare.
+# CHECK-LABEL: name: test10
+# CHECK: SUBSWri %w0, 7, 0, implicit-def %nzcv
+# CHECK: bb.1:
+# CHECK-NOT: MOVi32imm
+name:            test10
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: %w0, %x1
+
+    dead %wzr = SUBSWri killed %w0, 7, 0, implicit-def %nzcv
+    Bcc 1, %bb.2, implicit killed %nzcv
+    B %bb.1
+
+  bb.1:
+    successors: %bb.2
+    liveins: %x1
+
+    %w0 = MOVi32imm 7
+    STRWui killed %w0, killed %x1, 0
+
+  bb.2:
+    RET_ReallyLR
+...
+# Eliminate redundant MOVi32imm 7 in bb.1
+# Note: 64-bit compare/32-bit move imm w/implicit def
+# Kill marker should be removed from compare.
+# CHECK-LABEL: name: test11
+# CHECK: SUBSXri %x0, 7, 0, implicit-def %nzcv
+# CHECK: bb.1:
+# CHECK-NOT: MOVi32imm
+name:            test11
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: %x0, %x1
+
+    dead %xzr = SUBSXri killed %x0, 7, 0, implicit-def %nzcv
+    Bcc 1, %bb.2, implicit killed %nzcv
+    B %bb.1
+
+  bb.1:
+    successors: %bb.2
+    liveins: %x1
+
+    %w0 = MOVi32imm 7, implicit-def %x0
+    STRXui killed %x0, killed %x1, 0
+
+  bb.2:
+    RET_ReallyLR
+...
+# Eliminate redundant MOVi32imm 7 in bb.1
+# Note: 64-bit compare/32-bit move imm
+# Kill marker should be removed from compare.
+# CHECK-LABEL: name: test12
+# CHECK: SUBSXri %x0, 7, 0, implicit-def %nzcv
+# CHECK: bb.1:
+# CHECK-NOT: MOVi32imm
+name:            test12
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: %x0, %x1
+
+    dead %xzr = SUBSXri killed %x0, 7, 0, implicit-def %nzcv
+    Bcc 1, %bb.2, implicit killed %nzcv
+    B %bb.1
+
+  bb.1:
+    successors: %bb.2
+    liveins: %x1
+
+    %w0 = MOVi32imm 7
+    STRWui killed %w0, killed %x1, 0
+
+  bb.2:
+    RET_ReallyLR
+...
+# Don't eliminate MOVi32imm 7 in bb.1 as we don't necessarily know the upper 32 bits.
+# Note: 32-bit compare/32-bit move imm w/implicit def
+# Kill marker should remain on compare.
+# CHECK-LABEL: name: test13
+# CHECK: SUBSWri killed %w0, 7, 0, implicit-def %nzcv
+# CHECK: bb.1:
+# CHECK: MOVi32imm
+name:            test13
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: %w0, %x1
+
+    dead %wzr = SUBSWri killed %w0, 7, 0, implicit-def %nzcv
+    Bcc 1, %bb.2, implicit killed %nzcv
+    B %bb.1
+
+  bb.1:
+    successors: %bb.2
+    liveins: %x1
+
+    %w0 = MOVi32imm 7, implicit-def %x0
+    STRXui killed %x0, killed %x1, 0
+
+  bb.2:
+    RET_ReallyLR
+...
+# We can't eliminate the MOVi32imm because of the clobbering LDRWui in bb.0 (after the compare).
+# CHECK-LABEL: name: test14
+# CHECK: bb.1:
+# CHECK: MOVi32imm
+name:            test14
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: %w0, %x1, %x2
+
+    dead %wzr = SUBSWri killed %w0, 7, 0, implicit-def %nzcv
+    %w0 = LDRWui %x1, 0
+    STRWui killed %w0, killed %x2, 0
+    Bcc 1, %bb.2, implicit killed %nzcv
+    B %bb.1
+
+  bb.1:
+    successors: %bb.2
+    liveins: %x1
+
+    %w0 = MOVi32imm 7
+    STRWui killed %w0, killed %x1, 0
+
+  bb.2:
+    RET_ReallyLR
+...
+# We can't eliminate the MOVi32imm because of the clobbering LDRWui in bb.1 (before the MOVi32imm).
+# CHECK-LABEL: name: test15
+# CHECK: bb.1:
+# CHECK: MOVi32imm
+name:            test15
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: %w0, %x1, %x2
+
+    dead %wzr = SUBSWri killed %w0, 7, 0, implicit-def %nzcv
+    Bcc 1, %bb.2, implicit killed %nzcv
+    B %bb.1
+
+  bb.1:
+    successors: %bb.2
+    liveins: %x1, %x2
+
+    %w0 = LDRWui %x1, 0
+    STRWui killed %w0, killed %x2, 0
+    %w0 = MOVi32imm 7
+    STRWui killed %w0, killed %x1, 0
+
+  bb.2:
+    RET_ReallyLR
+...
+# Check that bb.0 COPY is seen through to allow the bb.1 MOVi32imm to be removed.
+# CHECK-LABEL: name: test16
+# CHECK: bb.1:
+# CHECK-NOT: MOVi32imm
+name:            test16
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: %w0, %x1
+
+    dead %wzr = SUBSWri %w0, 7, 0, implicit-def %nzcv
+    %w2 = COPY %w0
+    Bcc 1, %bb.2, implicit killed %nzcv
+    B %bb.1
+
+  bb.1:
+    successors: %bb.2
+    liveins: %x1
+
+    %w2 = MOVi32imm 7
+    STRWui killed %w2, killed %x1, 0
+
+  bb.2:
+    RET_ReallyLR
+...
+# Check that bb.1 MOVi32imm is not removed due to the self-clobbering compare (the SUBSWri redefines %w0).
+# CHECK-LABEL: name: test17
+# CHECK: bb.1:
+# CHECK: MOVi32imm
+name:            test17
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: %w0, %x1
+
+    dead %w0 = SUBSWri killed %w0, 7, 0, implicit-def %nzcv
+    Bcc 1, %bb.2, implicit killed %nzcv
+    B %bb.1
+
+  bb.1:
+    successors: %bb.2
+    liveins: %x1
+
+    %w0 = MOVi32imm 7
+    STRWui killed %w0, killed %x1, 0
+
+  bb.2:
+    RET_ReallyLR
+...
+# Make sure the MOVi64imm is not removed.  In one version of this patch the
+# MOVi64imm immediate was truncated to 32 bits and incorrectly matched because
+# the low 32 bits of 4252017623040 are all zero.
+# CHECK-LABEL: name: test18
+# CHECK: bb.1:
+# CHECK: MOVi64imm
+name:            test18
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: %x0, %x1
+
+    CBNZX killed %x0, %bb.2
+    B %bb.1
+
+  bb.1:
+    successors: %bb.2
+    liveins: %x1
+
+    %x0 = MOVi64imm 4252017623040
+    STRXui killed %x0, killed %x1, 0
+
+  bb.2:
+    RET_ReallyLR
+...
+# Eliminate redundant MOVi32imm -1 in bb.1
+# Note: 32-bit compare/32-bit move imm
+# Kill marker should be removed from compare.
+# CHECK-LABEL: name: test19
+# CHECK: ADDSWri %w0, 1, 0, implicit-def %nzcv
+# CHECK: bb.1:
+# CHECK-NOT: MOVi32imm
+name:            test19
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: %w0, %x1
+
+    dead %wzr = ADDSWri killed %w0, 1, 0, implicit-def %nzcv
+    Bcc 1, %bb.2, implicit killed %nzcv
+    B %bb.1
+
+  bb.1:
+    successors: %bb.2
+    liveins: %x1
+
+    %w0 = MOVi32imm -1
+    STRWui killed %w0, killed %x1, 0
+
+  bb.2:
+    RET_ReallyLR
+...
+# Eliminate redundant MOVi64imm -1 in bb.1
+# Note: 64-bit compare/64-bit move imm
+# Kill marker should be removed from compare.
+# CHECK-LABEL: name: test20
+# CHECK: ADDSXri %x0, 1, 0, implicit-def %nzcv
+# CHECK: bb.1:
+# CHECK-NOT: MOVi64imm
+name:            test20
+tracksRegLiveness: true
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.2
+    liveins: %x0, %x1
+
+    dead %xzr = ADDSXri killed %x0, 1, 0, implicit-def %nzcv
+    Bcc 1, %bb.2, implicit killed %nzcv
+    B %bb.1
+
+  bb.1:
+    successors: %bb.2
+    liveins: %x1
+
+    %x0 = MOVi64imm -1
+    STRXui killed %x0, killed %x1, 0
+
+  bb.2:
+    RET_ReallyLR
+...
+# Eliminate redundant MOVi32imm -1 in bb.1
+# Note: 64-bit compare/32-bit move imm
+# Kill marker should be removed from compare.
+# CHECK-LABEL: name: test21
+# CHECK: ADDSXri %x0, 1, 0, implicit-def %nzcv
+# CHECK: bb.1:
+# CHECK-NOT: MOVi32imm
+name:            test21
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: %x0, %x1
+
+    dead %xzr = ADDSXri killed %x0, 1, 0, implicit-def %nzcv
+    Bcc 1, %bb.2, implicit killed %nzcv
+    B %bb.1
+
+  bb.1:
+    successors: %bb.2
+    liveins: %x1
+
+    %w0 = MOVi32imm -1
+    STRWui killed %w0, killed %x1, 0
+
+  bb.2:
+    RET_ReallyLR
+...
+# Don't eliminate MOVi64imm -1 in bb.1 as we don't necessarily know the upper 32 bits.
+# Note: 32-bit compare/64-bit move imm
+# CHECK-LABEL: name: test22
+# CHECK: bb.1:
+# CHECK: MOVi64imm
+name:            test22
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: %w0, %x1
+
+    dead %wzr = ADDSWri killed %w0, 1, 0, implicit-def %nzcv
+    Bcc 1, %bb.2, implicit killed %nzcv
+    B %bb.1
+
+  bb.1:
+    successors: %bb.2
+    liveins: %x1
+
+    %x0 = MOVi64imm -1
+    STRXui killed %x0, killed %x1, 0
+
+  bb.2:
+    RET_ReallyLR
+...
+# Eliminate redundant MOVi32imm 4096 in bb.1 when the compare has a shifted immediate.
+# CHECK-LABEL: name: test23
+# CHECK: bb.1:
+# CHECK-NOT: MOVi32imm
+name:            test23
+tracksRegLiveness: true
+body:             |
+  bb.0.entry:
+    successors: %bb.1, %bb.2
+    liveins: %w0, %x1
+
+    dead %wzr = SUBSWri killed %w0, 1, 12, implicit-def %nzcv
+    Bcc 1, %bb.2, implicit killed %nzcv
+    B %bb.1
+
+  bb.1:
+    successors: %bb.2
+    liveins: %x1
+
+    %w0 = MOVi32imm 4096
+    STRWui killed %w0, killed %x1, 0
+
+  bb.2:
+    RET_ReallyLR
diff --git a/test/CodeGen/AArch64/machine-outliner.ll b/test/CodeGen/AArch64/machine-outliner.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b5094fe47508b1eb7f920ab81fc019c82e3ae1f1
--- /dev/null
+++ b/test/CodeGen/AArch64/machine-outliner.ll
@@ -0,0 +1,43 @@
+; RUN: llc -enable-machine-outliner -mtriple=aarch64-apple-darwin < %s | FileCheck %s
+
+define void @cat() #0 {
+; CHECK-LABEL: _cat:
+; CHECK: b l_OUTLINED_FUNCTION_0
+; CHECK-NOT: ret
+  %1 = alloca i32, align 4
+  %2 = alloca i32, align 4
+  %3 = alloca i32, align 4
+  %4 = alloca i32, align 4
+  store i32 0, i32* %1, align 4
+  store i32 1, i32* %2, align 4
+  store i32 2, i32* %3, align 4
+  store i32 3, i32* %4, align 4
+  ret void
+}
+
+define void @dog() #0 {
+; CHECK-LABEL: _dog:
+; CHECK: b l_OUTLINED_FUNCTION_0
+; CHECK-NOT: ret
+  %1 = alloca i32, align 4
+  %2 = alloca i32, align 4
+  %3 = alloca i32, align 4
+  %4 = alloca i32, align 4
+  store i32 0, i32* %1, align 4
+  store i32 1, i32* %2, align 4
+  store i32 2, i32* %3, align 4
+  store i32 3, i32* %4, align 4
+  ret void
+}
+
+; CHECK-LABEL: l_OUTLINED_FUNCTION_0:
+; CHECK:      orr w8, wzr, #0x1
+; CHECK-NEXT: stp w8, wzr, [sp, #8]
+; CHECK-NEXT: orr w8, wzr, #0x2
+; CHECK-NEXT: str w8, [sp, #4]
+; CHECK-NEXT: orr w8, wzr, #0x3
+; CHECK-NEXT: str w8, [sp], #16
+; CHECK-NEXT: ret
+
+
+attributes #0 = { noredzone nounwind ssp uwtable "no-frame-pointer-elim"="false" "target-cpu"="cyclone" }
diff --git a/test/CodeGen/AArch64/mature-mc-support.ll b/test/CodeGen/AArch64/mature-mc-support.ll
index 276c54d2cc4e4b3b9d9cab1d8f26d698ea7b6378..dbc027143f99406902e5f09e8f88927cf638d959 100644
--- a/test/CodeGen/AArch64/mature-mc-support.ll
+++ b/test/CodeGen/AArch64/mature-mc-support.ll
@@ -9,4 +9,4 @@
 
 module asm "	.this_directive_is_very_unlikely_to_exist"
 
-; CHECK: LLVM ERROR: Error parsing inline asm
+; CHECK: error: unknown directive
diff --git a/test/CodeGen/AArch64/merge-store.ll b/test/CodeGen/AArch64/merge-store.ll
index 1d0196ad521d660733c6df756c6524d6e6151853..1d26e4a42b176252d880a6be44702e4cbc465733 100644
--- a/test/CodeGen/AArch64/merge-store.ll
+++ b/test/CodeGen/AArch64/merge-store.ll
@@ -4,8 +4,7 @@
 @g0 = external global <3 x float>, align 16
 @g1 = external global <3 x float>, align 4
 
-; CHECK: ldr s[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]]{{\]}}, #4
-; CHECK: ld1{{\.?s?}} { v[[R0]]{{\.?s?}} }[1], {{\[}}[[R1]]{{\]}}
+; CHECK: ldr q[[R0:[0-9]+]], {{\[}}[[R1:x[0-9]+]], :lo12:g0
 ; CHECK: str d[[R0]]
 
 define void @blam() {
diff --git a/test/CodeGen/AArch64/misched-fusion-aes.ll b/test/CodeGen/AArch64/misched-fusion-aes.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f29dfb3a9802159dbcd3218b0f6f42cc7287724d
--- /dev/null
+++ b/test/CodeGen/AArch64/misched-fusion-aes.ll
@@ -0,0 +1,207 @@
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m1  | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKM1
+
+declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d, <16 x i8> %k)
+declare <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %d)
+declare <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %d, <16 x i8> %k)
+declare <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %d)
+
+define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, <16 x i8> %e) {
+  %d0 = load <16 x i8>, <16 x i8>* %a0
+  %a1 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 1
+  %d1 = load <16 x i8>, <16 x i8>* %a1
+  %a2 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 2
+  %d2 = load <16 x i8>, <16 x i8>* %a2
+  %a3 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 3
+  %d3 = load <16 x i8>, <16 x i8>* %a3
+  %k0 = load <16 x i8>, <16 x i8>* %b0
+  %e00 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d0, <16 x i8> %k0)
+  %f00 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e00)
+  %e01 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d1, <16 x i8> %k0)
+  %f01 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e01)
+  %e02 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d2, <16 x i8> %k0)
+  %f02 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e02)
+  %e03 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d3, <16 x i8> %k0)
+  %f03 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e03)
+  %b1 = getelementptr inbounds <16 x i8>, <16 x i8>* %b0, i64 1
+  %k1 = load <16 x i8>, <16 x i8>* %b1
+  %e10 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f00, <16 x i8> %k1)
+  %f10 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e00)
+  %e11 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f01, <16 x i8> %k1)
+  %f11 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e01)
+  %e12 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f02, <16 x i8> %k1)
+  %f12 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e02)
+  %e13 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f03, <16 x i8> %k1)
+  %f13 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e03)
+  %b2 = getelementptr inbounds <16 x i8>, <16 x i8>* %b0, i64 2
+  %k2 = load <16 x i8>, <16 x i8>* %b2
+  %e20 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f10, <16 x i8> %k2)
+  %f20 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e10)
+  %e21 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f11, <16 x i8> %k2)
+  %f21 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e11)
+  %e22 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f12, <16 x i8> %k2)
+  %f22 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e12)
+  %e23 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f13, <16 x i8> %k2)
+  %f23 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e13)
+  %b3 = getelementptr inbounds <16 x i8>, <16 x i8>* %b0, i64 3
+  %k3 = load <16 x i8>, <16 x i8>* %b3
+  %e30 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f20, <16 x i8> %k3)
+  %f30 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e20)
+  %e31 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f21, <16 x i8> %k3)
+  %f31 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e21)
+  %e32 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f22, <16 x i8> %k3)
+  %f32 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e22)
+  %e33 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f23, <16 x i8> %k3)
+  %f33 = call <16 x i8> @llvm.aarch64.crypto.aesmc(<16 x i8> %e23)
+  %g0 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f30, <16 x i8> %d)
+  %h0 = xor <16 x i8> %g0, %e
+  %g1 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f31, <16 x i8> %d)
+  %h1 = xor <16 x i8> %g1, %e
+  %g2 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f32, <16 x i8> %d)
+  %h2 = xor <16 x i8> %g2, %e
+  %g3 = call <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %f33, <16 x i8> %d)
+  %h3 = xor <16 x i8> %g3, %e
+  store <16 x i8> %h0, <16 x i8>* %c0
+  %c1 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 1
+  store <16 x i8> %h1, <16 x i8>* %c1
+  %c2 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 2
+  store <16 x i8> %h2, <16 x i8>* %c2
+  %c3 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 3
+  store <16 x i8> %h3, <16 x i8>* %c3
+  ret void
+
+; CHECK-LABEL: aesea:
+; CHECKA57: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57: aese [[VC:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VC]]
+; CHECKA57: aesmc {{v[0-7].16b}}, [[VA]]
+; CHECKA57: aese [[VD:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VD]]
+; CHECKA57: aesmc {{v[0-7].16b}}, [[VB]]
+; CHECKA57: aese [[VE:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VE]]
+; CHECKA57: aese [[VF:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VF]]
+; CHECKA57: aese [[VG:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VG]]
+; CHECKA57: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VH]]
+; CHECKM1: aese [[VA:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKM1: aesmc {{v[0-7].16b}}, [[VA]]
+; CHECKM1: aese [[VB:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VB]]
+; CHECKM1: aese {{v[0-7].16b}}, {{v[0-7].16b}}
+; CHECKM1: aese [[VC:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VC]]
+; CHECKM1: aese [[VD:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKM1: aesmc {{v[0-7].16b}}, [[VD]]
+; CHECKM1: aese [[VE:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VE]]
+; CHECKM1: aese [[VF:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VF]]
+; CHECKM1: aese [[VG:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VG]]
+; CHECKM1: aese [[VH:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKM1-NEXT: aesmc {{v[0-7].16b}}, [[VH]]
+}
+
+define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, <16 x i8> %e) {
+  %d0 = load <16 x i8>, <16 x i8>* %a0
+  %a1 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 1
+  %d1 = load <16 x i8>, <16 x i8>* %a1
+  %a2 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 2
+  %d2 = load <16 x i8>, <16 x i8>* %a2
+  %a3 = getelementptr inbounds <16 x i8>, <16 x i8>* %a0, i64 3
+  %d3 = load <16 x i8>, <16 x i8>* %a3
+  %k0 = load <16 x i8>, <16 x i8>* %b0
+  %e00 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %d0, <16 x i8> %k0)
+  %f00 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e00)
+  %e01 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %d1, <16 x i8> %k0)
+  %f01 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e01)
+  %e02 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %d2, <16 x i8> %k0)
+  %f02 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e02)
+  %e03 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %d3, <16 x i8> %k0)
+  %f03 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e03)
+  %b1 = getelementptr inbounds <16 x i8>, <16 x i8>* %b0, i64 1
+  %k1 = load <16 x i8>, <16 x i8>* %b1
+  %e10 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f00, <16 x i8> %k1)
+  %f10 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e00)
+  %e11 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f01, <16 x i8> %k1)
+  %f11 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e01)
+  %e12 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f02, <16 x i8> %k1)
+  %f12 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e02)
+  %e13 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f03, <16 x i8> %k1)
+  %f13 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e03)
+  %b2 = getelementptr inbounds <16 x i8>, <16 x i8>* %b0, i64 2
+  %k2 = load <16 x i8>, <16 x i8>* %b2
+  %e20 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f10, <16 x i8> %k2)
+  %f20 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e10)
+  %e21 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f11, <16 x i8> %k2)
+  %f21 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e11)
+  %e22 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f12, <16 x i8> %k2)
+  %f22 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e12)
+  %e23 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f13, <16 x i8> %k2)
+  %f23 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e13)
+  %b3 = getelementptr inbounds <16 x i8>, <16 x i8>* %b0, i64 3
+  %k3 = load <16 x i8>, <16 x i8>* %b3
+  %e30 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f20, <16 x i8> %k3)
+  %f30 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e20)
+  %e31 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f21, <16 x i8> %k3)
+  %f31 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e21)
+  %e32 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f22, <16 x i8> %k3)
+  %f32 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e22)
+  %e33 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f23, <16 x i8> %k3)
+  %f33 = call <16 x i8> @llvm.aarch64.crypto.aesimc(<16 x i8> %e23)
+  %g0 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f30, <16 x i8> %d)
+  %h0 = xor <16 x i8> %g0, %e
+  %g1 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f31, <16 x i8> %d)
+  %h1 = xor <16 x i8> %g1, %e
+  %g2 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f32, <16 x i8> %d)
+  %h2 = xor <16 x i8> %g2, %e
+  %g3 = call <16 x i8> @llvm.aarch64.crypto.aesd(<16 x i8> %f33, <16 x i8> %d)
+  %h3 = xor <16 x i8> %g3, %e
+  store <16 x i8> %h0, <16 x i8>* %c0
+  %c1 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 1
+  store <16 x i8> %h1, <16 x i8>* %c1
+  %c2 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 2
+  store <16 x i8> %h2, <16 x i8>* %c2
+  %c3 = getelementptr inbounds <16 x i8>, <16 x i8>* %c0, i64 3
+  store <16 x i8> %h3, <16 x i8>* %c3
+  ret void
+
+; CHECK-LABEL: aesda:
+; CHECKA57: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VC]]
+; CHECKA57: aesimc {{v[0-7].16b}}, [[VA]]
+; CHECKA57: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VD]]
+; CHECKA57: aesimc {{v[0-7].16b}}, [[VB]]
+; CHECKA57: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VE]]
+; CHECKA57: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VF]]
+; CHECKA57: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VG]]
+; CHECKA57: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VH]]
+; CHECKM1: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKM1: aesimc {{v[0-7].16b}}, [[VA]]
+; CHECKM1: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VB]]
+; CHECKM1: aesd {{v[0-7].16b}}, {{v[0-7].16b}}
+; CHECKM1: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VC]]
+; CHECKM1: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKM1: aesimc {{v[0-7].16b}}, [[VD]]
+; CHECKM1: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VE]]
+; CHECKM1: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VF]]
+; CHECKM1: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VG]]
+; CHECKM1: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}}
+; CHECKM1-NEXT: aesimc {{v[0-7].16b}}, [[VH]]
+}
diff --git a/test/CodeGen/AArch64/misched-fusion-lit.ll b/test/CodeGen/AArch64/misched-fusion-lit.ll
new file mode 100644
index 0000000000000000000000000000000000000000..45aa67ef1d5482a3a1748c679d60d7fe7a628c40
--- /dev/null
+++ b/test/CodeGen/AArch64/misched-fusion-lit.ll
@@ -0,0 +1,46 @@
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mattr=-fuse-literals | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKDONT
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mattr=+fuse-literals | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKFUSE
+; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57      | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKFUSE
+
+@g = common local_unnamed_addr global i8* null, align 8
+
+define i8* @litp(i32 %a, i32 %b) {
+entry:
+  %add = add nsw i32 %b, %a
+  %idx.ext = sext i32 %add to i64
+  %add.ptr = getelementptr i8, i8* bitcast (i8* (i32, i32)* @litp to i8*), i64 %idx.ext
+  store i8* %add.ptr, i8** @g, align 8
+  ret i8* %add.ptr
+
+; CHECK-LABEL: litp:
+; CHECK: adrp [[R:x[0-9]+]], litp
+; CHECKDONT-NEXT: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECKFUSE-NEXT: add {{x[0-9]+}}, [[R]], :lo12:litp
+}
+
+define i32 @liti(i32 %a, i32 %b) {
+entry:
+  %add = add i32 %a, -262095121
+  %add1 = add i32 %add, %b
+  ret i32 %add1
+
+; CHECK-LABEL: liti:
+; CHECK: mov [[R:w[0-9]+]], {{#[0-9]+}}
+; CHECKDONT-NEXT: add {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+; CHECKFUSE-NEXT: movk [[R]], {{#[0-9]+}}, lsl #16
+}
+
+; Function Attrs: norecurse nounwind readnone
+define i64 @litl(i64 %a, i64 %b) {
+entry:
+  %add = add i64 %a, 2208998440489107183
+  %add1 = add i64 %add, %b
+  ret i64 %add1
+
+; CHECK-LABEL: litl:
+; CHECK: mov [[R:x[0-9]+]], {{#[0-9]+}}
+; CHECK-NEXT: movk [[R]], {{#[0-9]+}}, lsl #16
+; CHECK: movk [[R]], {{#[0-9]+}}, lsl #32
+; CHECKDONT-NEXT: add {{x[0-9]+}}, {{x[0-9]+}}, {{x[0-9]+}}
+; CHECKFUSE-NEXT: movk [[R]], {{#[0-9]+}}, lsl #48
+}
diff --git a/test/CodeGen/AArch64/misched-fusion.ll b/test/CodeGen/AArch64/misched-fusion.ll
index d5dd9c757dfd7592919b5314a9bc22fa4f700998..1d504a2f19316e5b2020d32a3b6749e1aeb0c01f 100644
--- a/test/CodeGen/AArch64/misched-fusion.ll
+++ b/test/CodeGen/AArch64/misched-fusion.ll
@@ -1,22 +1,14 @@
 ; RUN: llc -o - %s -mattr=+arith-cbz-fusion | FileCheck %s
 ; RUN: llc -o - %s -mcpu=cyclone | FileCheck %s
 
-target triple = "arm64-apple-ios"
+target triple = "aarch64-unknown"
 
 declare void @foobar(i32 %v0, i32 %v1)
 
 ; Make sure sub is scheduled in front of cbnz
 ; CHECK-LABEL: test_sub_cbz:
-; CHECK: add w[[ADDRES:[0-9]+]], w1, #7
 ; CHECK: sub w[[SUBRES:[0-9]+]], w0, #13
-; CHECK-NEXT: cbnz w[[SUBRES]], [[SKIPBLOCK:LBB[0-9_]+]]
-; CHECK: mov [[REGTY:[x,w]]]0, [[REGTY]][[ADDRES]]
-; CHECK: mov [[REGTY]]1, [[REGTY]][[SUBRES]]
-; CHECK: bl _foobar
-; CHECK: [[SKIPBLOCK]]:
-; CHECK: mov [[REGTY]]0, [[REGTY]][[SUBRES]]
-; CHECK: mov [[REGTY]]1, [[REGTY]][[ADDRES]]
-; CHECK: bl _foobar
+; CHECK-NEXT: cbnz w[[SUBRES]], {{.?LBB[0-9_]+}}
 define void @test_sub_cbz(i32 %a0, i32 %a1) {
 entry:
   ; except for the fusion opportunity the sub/add should be equal so the
diff --git a/test/CodeGen/AArch64/misched-stp.ll b/test/CodeGen/AArch64/misched-stp.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4ea481cae68ef617b85278020893f97ddb269abd
--- /dev/null
+++ b/test/CodeGen/AArch64/misched-stp.ll
@@ -0,0 +1,57 @@
+; REQUIRES: asserts
+; RUN: llc < %s -mtriple=aarch64 -mcpu=cyclone -mattr=+use-aa -enable-misched -verify-misched -debug-only=misched -o - 2>&1 > /dev/null | FileCheck %s
+
+; Tests to check that the scheduler dependencies derived from alias analysis are
+; correct when we have stores that have been split up so that they can later be
+; merged into STP.
+
+; CHECK: ********** MI Scheduling **********
+; CHECK: test_splat:BB#0 entry
+; CHECK: SU({{[0-9]+}}):   STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 3; mem:ST4[%3+8]
+; CHECK: Successors:
+; CHECK-NEXT: ord  [[SU1:SU\([0-9]+\)]]
+; CHECK: SU({{[0-9]+}}):   STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 2; mem:ST4[%3+4]
+; CHECK: Successors:
+; CHECK-NEXT: ord  [[SU2:SU\([0-9]+\)]]
+; CHECK: [[SU1]]:   STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 3; mem:ST4[%2]
+; CHECK: [[SU2]]:   STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 2; mem:ST4[%1]
+define void @test_splat(i32 %x, i32 %y, i32* %p) {
+entry:
+  %val = load i32, i32* %p, align 4
+  %0 = getelementptr inbounds i32, i32* %p, i64 1
+  %1 = getelementptr inbounds i32, i32* %p, i64 2
+  %2 = getelementptr inbounds i32, i32* %p, i64 3
+  %vec0 = insertelement <4 x i32> undef, i32 %val, i32 0
+  %vec1 = insertelement <4 x i32> %vec0, i32 %val, i32 1
+  %vec2 = insertelement <4 x i32> %vec1, i32 %val, i32 2
+  %vec3 = insertelement <4 x i32> %vec2, i32 %val, i32 3
+  %3 = bitcast i32* %0 to <4 x i32>*
+  store <4 x i32> %vec3, <4 x i32>* %3, align 4
+  store i32 %x, i32* %2, align 4
+  store i32 %y, i32* %1, align 4
+  ret void
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+%struct.tree_common = type { i8*, i8*, i32 }
+
+; CHECK: ********** MI Scheduling **********
+; CHECK: test_zero:BB#0 entry
+; CHECK: SU({{[0-9]+}}):   STRXui %XZR, %vreg{{[0-9]+}}, 2; mem:ST8[%0+16]
+; CHECK: Successors:
+; CHECK-NEXT: ord  [[SU3:SU\([0-9]+\)]]
+; CHECK: SU({{[0-9]+}}):   STRXui %XZR, %vreg{{[0-9]+}}, 1; mem:ST8[%0+8]
+; CHECK: Successors:
+; CHECK-NEXT: ord  [[SU4:SU\([0-9]+\)]]
+; CHECK: [[SU3]]:   STRWui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 4; mem:ST4[%code1]
+; CHECK: [[SU4]]:   STRXui %vreg{{[0-9]+}}, %vreg{{[0-9]+}}, 1; mem:ST8[%type2]
+define void @test_zero(%struct.tree_common* %t, i32 %code, i8* %type) {
+entry:
+  %0 = bitcast %struct.tree_common* %t to i8*
+  tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 24, i32 8, i1 false)
+  %code1 = getelementptr inbounds %struct.tree_common, %struct.tree_common* %t, i64 0, i32 2
+  store i32 %code, i32* %code1, align 8
+  %type2 = getelementptr inbounds %struct.tree_common, %struct.tree_common* %t, i64 0, i32 1
+  store i8* %type, i8** %type2, align 8
+  ret void
+}
diff --git a/test/CodeGen/AArch64/movimm-wzr.mir b/test/CodeGen/AArch64/movimm-wzr.mir
index 093f85bd9319c32e7e3c6d966a7baacd18639491..60e9bfa03a96a9499a7ffa8cdd36098e6a80b945 100644
--- a/test/CodeGen/AArch64/movimm-wzr.mir
+++ b/test/CodeGen/AArch64/movimm-wzr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -run-pass=aarch64-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -run-pass=aarch64-expand-pseudo %s -o - | FileCheck %s
 
 --- |
   ; ModuleID = 'simple.ll'
diff --git a/test/CodeGen/AArch64/neon-fma-FMF.ll b/test/CodeGen/AArch64/neon-fma-FMF.ll
new file mode 100644
index 0000000000000000000000000000000000000000..25beef6592b26f449902c0e5a1eb378debc09b04
--- /dev/null
+++ b/test/CodeGen/AArch64/neon-fma-FMF.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
+
+define <2 x float> @fma(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+; CHECK-LABEL: fma:
+; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp1 = fmul contract <2 x float> %A, %B;
+	%tmp2 = fadd contract <2 x float> %C, %tmp1;
+	ret <2 x float> %tmp2
+}
+
+define <2 x float> @no_fma_1(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+; CHECK-LABEL: no_fma_1:
+; CHECK: fmul
+; CHECK: fadd
+	%tmp1 = fmul contract <2 x float> %A, %B;
+	%tmp2 = fadd <2 x float> %C, %tmp1;
+	ret <2 x float> %tmp2
+}
+
+define <2 x float> @no_fma_2(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+; CHECK-LABEL: no_fma_2:
+; CHECK: fmul
+; CHECK: fadd
+	%tmp1 = fmul <2 x float> %A, %B;
+	%tmp2 = fadd contract <2 x float> %C, %tmp1;
+	ret <2 x float> %tmp2
+}
+
+define <2 x float> @fma_sub(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+; CHECK-LABEL: fma_sub:
+; CHECK: fmls {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s
+	%tmp1 = fmul contract <2 x float> %A, %B;
+	%tmp2 = fsub contract <2 x float> %C, %tmp1;
+	ret <2 x float> %tmp2
+}
+
+define <2 x float> @no_fma_sub_1(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+; CHECK-LABEL: no_fma_sub_1:
+; CHECK: fmul
+; CHECK: fsub
+	%tmp1 = fmul contract <2 x float> %A, %B;
+	%tmp2 = fsub <2 x float> %C, %tmp1;
+	ret <2 x float> %tmp2
+}
+
+define <2 x float> @no_fma_sub_2(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
+; CHECK-LABEL: no_fma_sub_2:
+; CHECK: fmul
+; CHECK: fsub
+	%tmp1 = fmul <2 x float> %A, %B;
+	%tmp2 = fsub contract <2 x float> %C, %tmp1;
+	ret <2 x float> %tmp2
+}
diff --git a/test/CodeGen/AArch64/optimize-cond-branch.ll b/test/CodeGen/AArch64/optimize-cond-branch.ll
index 4e3ca6f16e78c703cc003f8ab3b3d30d17d0c3d2..ab4ad5e2ce93daeab0f9e1a8dab952832964483e 100644
--- a/test/CodeGen/AArch64/optimize-cond-branch.ll
+++ b/test/CodeGen/AArch64/optimize-cond-branch.ll
@@ -11,7 +11,7 @@ target triple = "arm64--"
 ;
 ; CHECK-LABEL: func
 ; CHECK-NOT: and
-; CHECK: tbnz
+; CHECK: tbz
 define void @func() {
   %c0 = icmp sgt i64 0, 0
   br i1 %c0, label %b1, label %b6
diff --git a/test/CodeGen/AArch64/pr27816.ll b/test/CodeGen/AArch64/pr27816.ll
new file mode 100644
index 0000000000000000000000000000000000000000..df15755cf3f5c90f4637c98c87742548e0a7d1fd
--- /dev/null
+++ b/test/CodeGen/AArch64/pr27816.ll
@@ -0,0 +1,48 @@
+; RUN: llc %s -mtriple=aarch64 -o - | FileCheck %s
+
+%struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8, i32 }
+
+; The existence of the final i32 value should not prevent the i8s from
+; being merged.
+
+; CHECK-LABEL: @merge_const_store
+; CHECK-NOT: strb
+; CHECK: str x8,  [x1]
+; CHECK-NOT: strb
+; CHECK: str wzr, [x1, #8]
+; CHECK-NOT: strb
+define void @merge_const_store(i32 %count, %struct.A* nocapture %p)  {
+  %1 = icmp sgt i32 %count, 0
+  br i1 %1, label %.lr.ph, label %._crit_edge
+.lr.ph:
+  %i.02 = phi i32 [ %add, %.lr.ph ], [ 0, %0 ]
+  %.01 = phi %struct.A* [ %addr, %.lr.ph ], [ %p, %0 ]
+  %a2 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 0
+  store i8 1, i8* %a2, align 1
+  %a3 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 1
+  store i8 2, i8* %a3, align 1
+  %a4 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 2
+  store i8 3, i8* %a4, align 1
+  %a5 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 3
+  store i8 4, i8* %a5, align 1
+  %a6 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 4
+  store i8 5, i8* %a6, align 1
+  %a7 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 5
+  store i8 6, i8* %a7, align 1
+  %a8 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 6
+  store i8 7, i8* %a8, align 1
+  %a9 = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 7
+  store i8 8, i8* %a9, align 1
+
+  ;
+  %addr_last = getelementptr inbounds %struct.A, %struct.A* %.01, i64 0, i32 8
+  store i32 0, i32* %addr_last, align 4
+
+
+  %add = add nsw i32 %i.02, 1
+  %addr = getelementptr inbounds %struct.A, %struct.A* %.01, i64 1
+  %exitcond = icmp eq i32 %add, %count
+  br i1 %exitcond, label %._crit_edge, label %.lr.ph
+._crit_edge:
+  ret void
+}
diff --git a/test/CodeGen/AArch64/prefixdata.ll b/test/CodeGen/AArch64/prefixdata.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f62734c16e5290082179d8cb1b4336537b8924cf
--- /dev/null
+++ b/test/CodeGen/AArch64/prefixdata.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=aarch64-apple-darwin | FileCheck --check-prefix=MACHO %s
+; RUN: llc < %s -mtriple=aarch64-pc-linux | FileCheck --check-prefix=ELF %s
+
+@i = linkonce_odr global i32 1
+
+; MACHO: ltmp0:
+; MACHO-NEXT: .long 1
+; MACHO-NEXT: .alt_entry _f
+; MACHO-NEXT: _f:
+; ELF: .type f,@function
+; ELF-NEXT: .word	1
+; ELF-NEXT: // 0x1
+; ELF-NEXT: f:
+define void @f() prefix i32 1 {
+  ret void
+}
+
+; MACHO: ltmp1:
+; MACHO-NEXT: .quad _i
+; MACHO-NEXT: .alt_entry _g
+; MACHO-NEXT: _g:
+; ELF: .type g,@function
+; ELF-NEXT: .xword	i
+; ELF-NEXT: g:
+define void @g() prefix i32* @i {
+  ret void
+}
+
+; MACHO: .subsections_via_symbols
diff --git a/test/CodeGen/AArch64/regcoal-physreg.mir b/test/CodeGen/AArch64/regcoal-physreg.mir
index 4bcabd10088abe241e99c257e77783c498063736..813106366968d8e742851916c423377ef6817998 100644
--- a/test/CodeGen/AArch64/regcoal-physreg.mir
+++ b/test/CodeGen/AArch64/regcoal-physreg.mir
@@ -2,40 +2,33 @@
 --- |
   declare void @f2()
 
-  define void @func() { ret void }
+  define void @func0() { ret void }
+  define void @func1() { ret void }
+  define void @func2() { ret void }
 ...
 ---
 # Check coalescing of COPYs from reserved physregs.
-# CHECK-LABEL: name: func
-name: func
-registers:
-  - { id: 0, class: gpr32 }
-  - { id: 1, class: gpr64 }
-  - { id: 2, class: gpr64 }
-  - { id: 3, class: gpr32 }
-  - { id: 4, class: gpr64 }
-  - { id: 5, class: gpr32 }
-  - { id: 6, class: xseqpairsclass }
-  - { id: 7, class: gpr64 }
+# CHECK-LABEL: name: func0
+name: func0
 body: |
   bb.0:
     ; We usually should not coalesce copies from allocatable physregs.
     ; CHECK: %0 = COPY %w7
     ; CHECK: STRWui %0, %x1, 0
-    %0 = COPY %w7
+    %0 : gpr32 = COPY %w7
     STRWui %0, %x1, 0
 
     ; It is fine to coalesce copies from reserved physregs
     ; CHECK-NOT: COPY
     ; CHECK: STRXui %fp, %x1, 0
-    %1 = COPY %fp
+    %1 : gpr64 = COPY %fp
     STRXui %1, %x1, 0
 
     ; It is not fine to coalesce copies from reserved physregs when they are
     ; clobbered.
     ; CHECK: %2 = COPY %fp
     ; CHECK: STRXui %2, %x1, 0
-    %2 = COPY %fp
+    %2 : gpr64 = COPY %fp
     %fp = SUBXri %fp, 4, 0
     STRXui %2, %x1, 0
 
@@ -43,7 +36,7 @@ body: |
     ; clobbered.
     ; CHECK-NOT: COPY
     ; CHECK: STRWui %wzr, %x1
-    %3 = COPY %wzr
+    %3 : gpr32 = COPY %wzr
     dead %wzr = SUBSWri %w1, 0, 0, implicit-def %nzcv
     STRWui %3, %x1, 0
 
@@ -51,13 +44,13 @@ body: |
     ; clobbered.
     ; CHECK-NOT: COPY
     ; CHECK: STRXui %xzr, %x1
-    %4 = COPY %xzr
+    %4 : gpr64 = COPY %xzr
     dead %wzr = SUBSWri %w1, 0, 0, implicit-def %nzcv
     STRXui %4, %x1, 0
 
     ; Coalescing COPYs into constant physregs.
     ; CHECK: %wzr = SUBSWri %w1, 0, 0
-    %5 = SUBSWri %w1, 0, 0, implicit-def %nzcv
+    %5 : gpr32 = SUBSWri %w1, 0, 0, implicit-def %nzcv
     %wzr = COPY %5
 
     ; Only coalesce when the source register is reserved as a whole (this is
@@ -65,7 +58,7 @@ body: |
     ; of the non-reserved part).
     ; CHECK: %6 = COPY %x28_fp
     ; CHECK: HINT 0, implicit %6
-    %6 = COPY %x28_fp
+    %6 : xseqpairsclass = COPY %x28_fp
     HINT 0, implicit %6
 
     ; It is not fine to coalesce copies from reserved physregs when they are
@@ -76,7 +69,69 @@ body: |
 
     ; Need a def of x18 so that it's not deduced as "constant".
     %x18 = COPY %xzr
-    %7 = COPY %x18
+    %7 : gpr64 = COPY %x18
     BL @f2, csr_aarch64_aapcs, implicit-def dead %lr, implicit %sp, implicit-def %sp
     STRXui %7, %x1, 0
+
+    ; This can be coalesced.
+    ; CHECK: %fp = SUBXri %fp, 4, 0
+    %8 : gpr64sp = SUBXri %fp, 4, 0
+    %fp = COPY %8
+
+    ; Cannot coalesce when there are reads of the physreg.
+    ; CHECK-NOT: %fp = SUBXri %fp, 8, 0
+    ; CHECK: %9 = SUBXri %fp, 8, 0
+    ; CHECK: STRXui %fp, %fp, 0
+    ; CHECK: %fp = COPY %9
+    %9 : gpr64sp = SUBXri %fp, 8, 0
+    STRXui %fp, %fp, 0
+    %fp = COPY %9
+...
+---
+# Check that COPYs into reserved physregs are not coalesced across control flow.
+# CHECK-LABEL: name: func1
+name: func1
+body: |
+  bb.0:
+    successors: %bb.1, %bb.2
+    ; Cannot coalesce physreg because we have reads on other CFG paths (we
+    ; currently abort for any control flow)
+    ; CHECK-NOT: %fp = SUBXri
+    ; CHECK: %0 = SUBXri %fp, 12, 0
+    ; CHECK: CBZX undef %x0, %bb.1
+    ; CHECK: B %bb.2
+    %0 : gpr64sp = SUBXri %fp, 12, 0
+    CBZX undef %x0, %bb.1
+    B %bb.2
+
+  bb.1:
+    %fp = COPY %0
+    RET_ReallyLR
+
+  bb.2:
+    STRXui %fp, %fp, 0
+    RET_ReallyLR
+...
+---
+# CHECK-LABEL: name: func2
+name: func2
+body: |
+  bb.0:
+    successors: %bb.1, %bb.2
+    ; We can coalesce copies from physreg to vreg across multiple blocks.
+    ; CHECK-NOT: COPY
+    ; CHECK: CBZX undef %x0, %bb.1
+    ; CHECK-NEXT: B %bb.2
+    %0 : gpr64sp = COPY %fp
+    CBZX undef %x0, %bb.1
+    B %bb.2
+
+  bb.1:
+    ; CHECK: STRXui undef %x0, %fp, 0
+    ; CHECK-NEXT: RET_ReallyLR
+    STRXui undef %x0, %0, 0
+    RET_ReallyLR
+
+  bb.2:
+    RET_ReallyLR
 ...
diff --git a/test/CodeGen/AArch64/remat.ll b/test/CodeGen/AArch64/remat.ll
index 5081a9da3404780d0d904525254b76b7833cbb6d..80a054beb2a521be6e103276c40f3bdb780cece9 100644
--- a/test/CodeGen/AArch64/remat.ll
+++ b/test/CodeGen/AArch64/remat.ll
@@ -8,7 +8,7 @@
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=exynos-m3 -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=falkor -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=kryo -o - %s | FileCheck %s
-; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=vulcan -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=thunderx2t99 -o - %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnuabi -mattr=+custom-cheap-as-move -o - %s | FileCheck %s
 
 %X = type { i64, i64, i64 }
diff --git a/test/CodeGen/AArch64/stack-protector-target.ll b/test/CodeGen/AArch64/stack-protector-target.ll
index d4d806289bff31c7f75b31d2e03ebf305919b219..787e4a76ec01b7192a5b4fcab6dbc7d3fd0f5777 100644
--- a/test/CodeGen/AArch64/stack-protector-target.ll
+++ b/test/CodeGen/AArch64/stack-protector-target.ll
@@ -1,5 +1,7 @@
 ; Test target-specific stack cookie location.
 ; RUN: llc -mtriple=aarch64-linux-android < %s -o - | FileCheck --check-prefix=ANDROID-AARCH64 %s
+; RUN: llc -mtriple=aarch64-fuchsia < %s -o - | FileCheck --check-prefixes=FUCHSIA-AARCH64-COMMON,FUCHSIA-AARCH64-USER %s
+; RUN: llc -mtriple=aarch64-fuchsia -code-model=kernel < %s -o - | FileCheck --check-prefixes=FUCHSIA-AARCH64-COMMON,FUCHSIA-AARCH64-KERNEL %s
 
 define void @_Z1fv() sspreq {
 entry:
@@ -17,3 +19,11 @@ declare void @_Z7CapturePi(i32*)
 ; ANDROID-AARCH64: ldr [[C:.*]], {{\[}}[[A]], #40]
 ; ANDROID-AARCH64: ldr [[D:.*]], [sp,
 ; ANDROID-AARCH64: cmp [[C]], [[D]]
+
+; FUCHSIA-AARCH64-USER: mrs [[A:.*]], TPIDR_EL0
+; FUCHSIA-AARCH64-KERNEL: mrs [[A:.*]], TPIDR_EL1
+; FUCHSIA-AARCH64-COMMON: ldur [[B:.*]], {{\[}}[[A]], #-16]
+; FUCHSIA-AARCH64-COMMON: str [[B]], [sp,
+; FUCHSIA-AARCH64-COMMON: ldur [[C:.*]], {{\[}}[[A]], #-16]
+; FUCHSIA-AARCH64-COMMON: ldr [[D:.*]], [sp,
+; FUCHSIA-AARCH64-COMMON: cmp [[C]], [[D]]
diff --git a/test/CodeGen/AArch64/stack_guard_remat.ll b/test/CodeGen/AArch64/stack_guard_remat.ll
index 08c8a4b665720df2cc8c2178ca66603c53f21d6a..2b7b3485311aee240e1e8f386e3fde32bf86f1be 100644
--- a/test/CodeGen/AArch64/stack_guard_remat.ll
+++ b/test/CodeGen/AArch64/stack_guard_remat.ll
@@ -29,20 +29,20 @@ define i32 @test_stack_guard_remat() #0 {
 entry:
   %a1 = alloca [256 x i32], align 4
   %0 = bitcast [256 x i32]* %a1 to i8*
-  call void @llvm.lifetime.start(i64 1024, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 1024, i8* %0)
   %arraydecay = getelementptr inbounds [256 x i32], [256 x i32]* %a1, i64 0, i64 0
   call void @foo3(i32* %arraydecay)
   call void asm sideeffect "foo2", "~{w0},~{w1},~{w2},~{w3},~{w4},~{w5},~{w6},~{w7},~{w8},~{w9},~{w10},~{w11},~{w12},~{w13},~{w14},~{w15},~{w16},~{w17},~{w18},~{w19},~{w20},~{w21},~{w22},~{w23},~{w24},~{w25},~{w26},~{w27},~{w28},~{w29},~{w30}"()
-  call void @llvm.lifetime.end(i64 1024, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 1024, i8* %0)
   ret i32 0
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
 
 declare void @foo3(i32*)
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 
 attributes #0 = { nounwind sspstrong "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/AArch64/swifterror.ll b/test/CodeGen/AArch64/swifterror.ll
index b15eaa923f08d67b6b290b2be29605ae6a44adca..69bf3510cc5a72594daf924763012347bebc845b 100644
--- a/test/CodeGen/AArch64/swifterror.ll
+++ b/test/CodeGen/AArch64/swifterror.ll
@@ -13,18 +13,18 @@ define float @foo(%swift_error** swifterror %error_ptr_ref) {
 ; CHECK-APPLE: malloc
 ; CHECK-APPLE: orr [[ID:w[0-9]+]], wzr, #0x1
 ; CHECK-APPLE: strb [[ID]], [x0, #8]
-; CHECK-APPLE: mov x19, x0
-; CHECK-APPLE-NOT: x19
+; CHECK-APPLE: mov x21, x0
+; CHECK-APPLE-NOT: x21
 
 ; CHECK-O0-LABEL: foo:
 ; CHECK-O0: orr w{{.*}}, wzr, #0x10
 ; CHECK-O0: malloc
-; CHECK-O0: mov x19, x0
-; CHECK-O0-NOT: x19
+; CHECK-O0: mov x21, x0
+; CHECK-O0-NOT: x21
 ; CHECK-O0: orr [[ID:w[0-9]+]], wzr, #0x1
-; CHECK-O0-NOT: x19
+; CHECK-O0-NOT: x21
 ; CHECK-O0: strb [[ID]], [x0, #8]
-; CHECK-O0-NOT: x19
+; CHECK-O0-NOT: x21
 entry:
   %call = call i8* @malloc(i64 16)
   %call.0 = bitcast i8* %call to %swift_error*
@@ -38,20 +38,20 @@ entry:
 define float @caller(i8* %error_ref) {
 ; CHECK-APPLE-LABEL: caller:
 ; CHECK-APPLE: mov [[ID:x[0-9]+]], x0
-; CHECK-APPLE: mov x19, xzr
+; CHECK-APPLE: mov x21, xzr
 ; CHECK-APPLE: bl {{.*}}foo
-; CHECK-APPLE: cbnz x19
+; CHECK-APPLE: cbnz x21
 ; Access part of the error object and save it to error_ref
-; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x19, #8]
+; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x21, #8]
 ; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]]
-; CHECK-APPLE: mov x0, x19
+; CHECK-APPLE: mov x0, x21
 ; CHECK-APPLE: bl {{.*}}free
 
 ; CHECK-O0-LABEL: caller:
-; CHECK-O0: mov x19
+; CHECK-O0: mov x21
 ; CHECK-O0: bl {{.*}}foo
-; CHECK-O0: mov [[ID:x[0-9]+]], x19
-; CHECK-O0: cbnz x19
+; CHECK-O0: mov [[ID:x[0-9]+]], x21
+; CHECK-O0: cbnz x21
 entry:
   %error_ptr_ref = alloca swifterror %swift_error*
   store %swift_error* null, %swift_error** %error_ptr_ref
@@ -75,22 +75,22 @@ define float @caller2(i8* %error_ref) {
 ; CHECK-APPLE-LABEL: caller2:
 ; CHECK-APPLE: mov [[ID:x[0-9]+]], x0
 ; CHECK-APPLE: fmov [[CMP:s[0-9]+]], #1.0
-; CHECK-APPLE: mov x19, xzr
+; CHECK-APPLE: mov x21, xzr
 ; CHECK-APPLE: bl {{.*}}foo
-; CHECK-APPLE: cbnz x19
+; CHECK-APPLE: cbnz x21
 ; CHECK-APPLE: fcmp s0, [[CMP]]
 ; CHECK-APPLE: b.le
 ; Access part of the error object and save it to error_ref
-; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x19, #8]
+; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x21, #8]
 ; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]]
-; CHECK-APPLE: mov x0, x19
+; CHECK-APPLE: mov x0, x21
 ; CHECK-APPLE: bl {{.*}}free
 
 ; CHECK-O0-LABEL: caller2:
-; CHECK-O0: mov x19
+; CHECK-O0: mov x21
 ; CHECK-O0: bl {{.*}}foo
-; CHECK-O0: mov [[ID:x[0-9]+]], x19
-; CHECK-O0: cbnz x19
+; CHECK-O0: mov [[ID:x[0-9]+]], x21
+; CHECK-O0: cbnz x21
 entry:
   %error_ptr_ref = alloca swifterror %swift_error*
   br label %bb_loop
@@ -123,24 +123,24 @@ define float @foo_if(%swift_error** swifterror %error_ptr_ref, i32 %cc) {
 ; CHECK-APPLE: malloc
 ; CHECK-APPLE: orr [[ID:w[0-9]+]], wzr, #0x1
 ; CHECK-APPLE: strb [[ID]], [x0, #8]
-; CHECK-APPLE: mov x19, x0
-; CHECK-APPLE-NOT: x19
+; CHECK-APPLE: mov x21, x0
+; CHECK-APPLE-NOT: x21
 ; CHECK-APPLE: ret
 
 ; CHECK-O0-LABEL: foo_if:
-; spill x19
-; CHECK-O0: str x19, [sp, [[SLOT:#[0-9]+]]]
+; spill x21
+; CHECK-O0: str x21, [sp, [[SLOT:#[0-9]+]]]
 ; CHECK-O0: cbz w0
 ; CHECK-O0: orr w{{.*}}, wzr, #0x10
 ; CHECK-O0: malloc
 ; CHECK-O0: mov [[ID:x[0-9]+]], x0
 ; CHECK-O0: orr [[ID2:w[0-9]+]], wzr, #0x1
 ; CHECK-O0: strb [[ID2]], [x0, #8]
-; CHECK-O0: mov x19, [[ID]]
+; CHECK-O0: mov x21, [[ID]]
 ; CHECK-O0: ret
 ; reload from stack
 ; CHECK-O0: ldr [[ID3:x[0-9]+]], [sp, [[SLOT]]]
-; CHECK-O0: mov x19, [[ID3]]
+; CHECK-O0: mov x21, [[ID3]]
 ; CHECK-O0: ret
 entry:
   %cond = icmp ne i32 %cc, 0
@@ -162,19 +162,19 @@ normal:
 ; under a certain condition inside a loop.
 define float @foo_loop(%swift_error** swifterror %error_ptr_ref, i32 %cc, float %cc2) {
 ; CHECK-APPLE-LABEL: foo_loop:
-; CHECK-APPLE: mov x0, x19
+; CHECK-APPLE: mov x0, x21
 ; CHECK-APPLE: cbz
 ; CHECK-APPLE: orr w0, wzr, #0x10
 ; CHECK-APPLE: malloc
 ; CHECK-APPLE: strb w{{.*}}, [x0, #8]
 ; CHECK-APPLE: fcmp
 ; CHECK-APPLE: b.le
-; CHECK-APPLE: mov x19, x0
+; CHECK-APPLE: mov x21, x0
 ; CHECK-APPLE: ret
 
 ; CHECK-O0-LABEL: foo_loop:
-; spill x19
-; CHECK-O0: str x19, [sp, [[SLOT:#[0-9]+]]]
+; spill x21
+; CHECK-O0: str x21, [sp, [[SLOT:#[0-9]+]]]
 ; CHECK-O0: b [[BB1:[A-Za-z0-9_]*]]
 ; CHECK-O0: [[BB1]]:
 ; CHECK-O0: ldr     x0, [sp, [[SLOT]]]
@@ -193,7 +193,7 @@ define float @foo_loop(%swift_error** swifterror %error_ptr_ref, i32 %cc, float
 ; CHECK-O0: b.le [[BB1]]
 ; reload from stack
 ; CHECK-O0: ldr [[ID3:x[0-9]+]], [sp]
-; CHECK-O0: mov x19, [[ID3]]
+; CHECK-O0: mov x21, [[ID3]]
 ; CHECK-O0: ret
 entry:
   br label %bb_loop
@@ -229,23 +229,23 @@ define void @foo_sret(%struct.S* sret %agg.result, i32 %val1, %swift_error** swi
 ; CHECK-APPLE: orr [[ID:w[0-9]+]], wzr, #0x1
 ; CHECK-APPLE: strb [[ID]], [x0, #8]
 ; CHECK-APPLE: str w{{.*}}, [{{.*}}[[SRET]], #4]
-; CHECK-APPLE: mov x19, x0
-; CHECK-APPLE-NOT: x19
+; CHECK-APPLE: mov x21, x0
+; CHECK-APPLE-NOT: x21
 
 ; CHECK-O0-LABEL: foo_sret:
 ; CHECK-O0: orr w{{.*}}, wzr, #0x10
 ; spill x8
 ; CHECK-O0-DAG: str x8
-; spill x19
-; CHECK-O0-DAG: str x19
+; spill x21
+; CHECK-O0-DAG: str x21
 ; CHECK-O0: malloc
 ; CHECK-O0: orr [[ID:w[0-9]+]], wzr, #0x1
 ; CHECK-O0: strb [[ID]], [x0, #8]
 ; reload from stack
 ; CHECK-O0: ldr [[SRET:x[0-9]+]]
 ; CHECK-O0: str w{{.*}}, [{{.*}}[[SRET]], #4]
-; CHECK-O0: mov x19
-; CHECK-O0-NOT: x19
+; CHECK-O0: mov x21
+; CHECK-O0-NOT: x21
 entry:
   %call = call i8* @malloc(i64 16)
   %call.0 = bitcast i8* %call to %swift_error*
@@ -261,22 +261,22 @@ entry:
 define float @caller3(i8* %error_ref) {
 ; CHECK-APPLE-LABEL: caller3:
 ; CHECK-APPLE: mov [[ID:x[0-9]+]], x0
-; CHECK-APPLE: mov x19, xzr
+; CHECK-APPLE: mov x21, xzr
 ; CHECK-APPLE: bl {{.*}}foo_sret
-; CHECK-APPLE: cbnz x19
+; CHECK-APPLE: cbnz x21
 ; Access part of the error object and save it to error_ref
-; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x19, #8]
+; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x21, #8]
 ; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]]
-; CHECK-APPLE: mov x0, x19
+; CHECK-APPLE: mov x0, x21
 ; CHECK-APPLE: bl {{.*}}free
 
 ; CHECK-O0-LABEL: caller3:
 ; spill x0
 ; CHECK-O0: str x0
-; CHECK-O0: mov x19
+; CHECK-O0: mov x21
 ; CHECK-O0: bl {{.*}}foo_sret
-; CHECK-O0: mov [[ID2:x[0-9]+]], x19
-; CHECK-O0: cbnz [[ID2]]
+; CHECK-O0: mov [[ID2:x[0-9]+]], x21
+; CHECK-O0: cbnz x21
 ; Access part of the error object and save it to error_ref
 ; reload from stack
 ; CHECK-O0: ldrb [[CODE:w[0-9]+]]
@@ -323,8 +323,8 @@ define float @foo_vararg(%swift_error** swifterror %error_ptr_ref, ...) {
 ; Third vararg
 ; CHECK-APPLE: ldr {{w[0-9]+}}, [{{x[0-9]+}}]
 
-; CHECK-APPLE: mov x19, x0
-; CHECK-APPLE-NOT: x19
+; CHECK-APPLE: mov x21, x0
+; CHECK-APPLE-NOT: x21
 entry:
   %call = call i8* @malloc(i64 16)
   %call.0 = bitcast i8* %call to %swift_error*
@@ -356,13 +356,13 @@ define float @caller4(i8* %error_ref) {
 ; CHECK-APPLE: stp {{x[0-9]+}}, {{x[0-9]+}}, [sp, #8]
 ; CHECK-APPLE: str {{x[0-9]+}}, [sp]
 
-; CHECK-APPLE: mov x19, xzr
+; CHECK-APPLE: mov x21, xzr
 ; CHECK-APPLE: bl {{.*}}foo_vararg
-; CHECK-APPLE: cbnz x19
+; CHECK-APPLE: cbnz x21
 ; Access part of the error object and save it to error_ref
-; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x19, #8]
+; CHECK-APPLE: ldrb [[CODE:w[0-9]+]], [x21, #8]
 ; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]]
-; CHECK-APPLE: mov x0, x19
+; CHECK-APPLE: mov x0, x21
 ; CHECK-APPLE: bl {{.*}}free
 entry:
   %error_ptr_ref = alloca swifterror %swift_error*
@@ -407,29 +407,29 @@ entry:
 }
 
 ; CHECK-APPLE-LABEL: swifterror_clobber
-; CHECK-APPLE: mov [[REG:x[0-9]+]], x19
+; CHECK-APPLE: mov [[REG:x[0-9]+]], x21
 ; CHECK-APPLE: nop
-; CHECK-APPLE: mov x19, [[REG]]
+; CHECK-APPLE: mov x21, [[REG]]
 define swiftcc void @swifterror_clobber(%swift_error** nocapture swifterror %err) {
-  call void asm sideeffect "nop", "~{x19}"()
+  call void asm sideeffect "nop", "~{x21}"()
   ret void
 }
 
 ; CHECK-APPLE-LABEL: swifterror_reg_clobber
-; CHECK-APPLE: stp {{.*}}x19
+; CHECK-APPLE: stp {{.*}}x21
 ; CHECK-APPLE: nop
-; CHECK-APPLE: ldp  {{.*}}x19
+; CHECK-APPLE: ldp  {{.*}}x21
 define swiftcc void @swifterror_reg_clobber(%swift_error** nocapture %err) {
-  call void asm sideeffect "nop", "~{x19}"()
+  call void asm sideeffect "nop", "~{x21}"()
   ret void
 }
 ; CHECK-APPLE-LABEL: params_in_reg
 ; Save callee saved registers and swifterror since it will be clobbered by the first call to params_in_reg2.
-; CHECK-APPLE:  stp     x19, x28, [sp
+; CHECK-APPLE:  stp     x21, x28, [sp
 ; CHECK-APPLE:  stp     x27, x26, [sp
 ; CHECK-APPLE:  stp     x25, x24, [sp
 ; CHECK-APPLE:  stp     x23, x22, [sp
-; CHECK-APPLE:  stp     x21, x20, [sp
+; CHECK-APPLE:  stp     x20, x19, [sp
 ; CHECK-APPLE:  stp     x29, x30, [sp
 ; CHECK-APPLE:  str     x20, [sp
 ; Store argument registers.
@@ -439,7 +439,7 @@ define swiftcc void @swifterror_reg_clobber(%swift_error** nocapture %err) {
 ; CHECK-APPLE:  mov      x26, x4
 ; CHECK-APPLE:  mov      x27, x3
 ; CHECK-APPLE:  mov      x28, x2
-; CHECK-APPLE:  mov      x21, x1
+; CHECK-APPLE:  mov      x19, x1
 ; CHECK-APPLE:  mov      x22, x0
 ; Setup call.
 ; CHECK-APPLE:  orr     w0, wzr, #0x1
@@ -451,11 +451,11 @@ define swiftcc void @swifterror_reg_clobber(%swift_error** nocapture %err) {
 ; CHECK-APPLE:  orr     w6, wzr, #0x7
 ; CHECK-APPLE:  orr     w7, wzr, #0x8
 ; CHECK-APPLE:  mov      x20, xzr
-; CHECK-APPLE:  mov      x19, xzr
+; CHECK-APPLE:  mov      x21, xzr
 ; CHECK-APPLE:  bl      _params_in_reg2
 ; Restore original arguments for next call.
 ; CHECK-APPLE:  mov      x0, x22
-; CHECK-APPLE:  mov      x1, x21
+; CHECK-APPLE:  mov      x1, x19
 ; CHECK-APPLE:  mov      x2, x28
 ; CHECK-APPLE:  mov      x3, x27
 ; CHECK-APPLE:  mov      x4, x26
@@ -463,22 +463,22 @@ define swiftcc void @swifterror_reg_clobber(%swift_error** nocapture %err) {
 ; CHECK-APPLE:  mov      x6, x24
 ; CHECK-APPLE:  mov      x7, x23
 ; Restore original swiftself argument and swifterror %err.
-; CHECK-APPLE:  ldp             x20, x19, [sp
+; CHECK-APPLE:  ldp             x20, x21, [sp
 ; CHECK-APPLE:  bl      _params_in_reg2
-; Restore calle save registers but don't clober swifterror x19.
-; CHECK-APPLE-NOT: x19
+; Restore callee-saved registers but don't clobber swifterror x21.
+; CHECK-APPLE-NOT: x21
 ; CHECK-APPLE:  ldp     x29, x30, [sp
-; CHECK-APPLE-NOT: x19
-; CHECK-APPLE:  ldp     x21, x20, [sp
-; CHECK-APPLE-NOT: x19
+; CHECK-APPLE-NOT: x21
+; CHECK-APPLE:  ldp     x20, x19, [sp
+; CHECK-APPLE-NOT: x21
 ; CHECK-APPLE:  ldp     x23, x22, [sp
-; CHECK-APPLE-NOT: x19
+; CHECK-APPLE-NOT: x21
 ; CHECK-APPLE:  ldp     x25, x24, [sp
-; CHECK-APPLE-NOT: x19
+; CHECK-APPLE-NOT: x21
 ; CHECK-APPLE:  ldp     x27, x26, [sp
-; CHECK-APPLE-NOT: x19
+; CHECK-APPLE-NOT: x21
 ; CHECK-APPLE:  ldr     x28, [sp
-; CHECK-APPLE-NOT: x19
+; CHECK-APPLE-NOT: x21
 ; CHECK-APPLE:  ret
 define swiftcc void @params_in_reg(i64, i64, i64, i64, i64, i64, i64, i64, i8* swiftself, %swift_error** nocapture swifterror %err) {
   %error_ptr_ref = alloca swifterror %swift_error*, align 8
@@ -495,17 +495,17 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, i8*
 ; CHECK-APPLE:  stp     x27, x26, [sp
 ; CHECK-APPLE:  stp     x25, x24, [sp
 ; CHECK-APPLE:  stp     x23, x22, [sp
-; CHECK-APPLE:  stp     x21, x20, [sp
+; CHECK-APPLE:  stp     x20, x19, [sp
 ; CHECK-APPLE:  stp     x29, x30, [sp
 ; Save original arguments.
-; CHECK-APPLE:  mov      x23, x19
+; CHECK-APPLE:  mov      x23, x21
 ; CHECK-APPLE:  str     x7, [sp, #16]
 ; CHECK-APPLE:  mov      x24, x6
 ; CHECK-APPLE:  mov      x25, x5
 ; CHECK-APPLE:  mov      x26, x4
 ; CHECK-APPLE:  mov      x27, x3
 ; CHECK-APPLE:  mov      x28, x2
-; CHECK-APPLE:  mov      x21, x1
+; CHECK-APPLE:  mov      x19, x1
 ; CHECK-APPLE:  mov      x22, x0
 ; Setup call arguments.
 ; CHECK-APPLE:  orr     w0, wzr, #0x1
@@ -517,23 +517,23 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, i8*
 ; CHECK-APPLE:  orr     w6, wzr, #0x7
 ; CHECK-APPLE:  orr     w7, wzr, #0x8
 ; CHECK-APPLE:  mov      x20, xzr
-; CHECK-APPLE:  mov      x19, xzr
+; CHECK-APPLE:  mov      x21, xzr
 ; CHECK-APPLE:  bl      _params_in_reg2
 ; Store swifterror %error_ptr_ref.
-; CHECK-APPLE:  str     x19, [sp, #8]
+; CHECK-APPLE:  str     x21, [sp, #8]
 ; Setup call arguments from original arguments.
 ; CHECK-APPLE:  mov      x0, x22
-; CHECK-APPLE:  mov      x1, x21
+; CHECK-APPLE:  mov      x1, x19
 ; CHECK-APPLE:  mov      x2, x28
 ; CHECK-APPLE:  mov      x3, x27
 ; CHECK-APPLE:  mov      x4, x26
 ; CHECK-APPLE:  mov      x5, x25
 ; CHECK-APPLE:  mov      x6, x24
 ; CHECK-APPLE:  ldp     x7, x20, [sp, #16]
-; CHECK-APPLE:  mov      x19, x23
+; CHECK-APPLE:  mov      x21, x23
 ; CHECK-APPLE:  bl      _params_and_return_in_reg2
 ; Store return values.
-; CHECK-APPLE:  mov      x21, x0
+; CHECK-APPLE:  mov      x19, x0
 ; CHECK-APPLE:  mov      x22, x1
 ; CHECK-APPLE:  mov      x24, x2
 ; CHECK-APPLE:  mov      x25, x3
@@ -542,7 +542,7 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, i8*
 ; CHECK-APPLE:  mov      x28, x6
 ; CHECK-APPLE:  mov      x23, x7
 ; Save swifterror %err.
-; CHECK-APPLE:  str     x19, [sp, #24]
+; CHECK-APPLE:  str     x21, [sp, #24]
 ; Setup call.
 ; CHECK-APPLE:  orr     w0, wzr, #0x1
 ; CHECK-APPLE:  orr     w1, wzr, #0x2
@@ -554,10 +554,10 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, i8*
 ; CHECK-APPLE:  orr     w7, wzr, #0x8
 ; CHECK-APPLE:  mov      x20, xzr
 ; ... setup call with swiferror %error_ptr_ref.
-; CHECK-APPLE:  ldr     x19, [sp, #8]
+; CHECK-APPLE:  ldr     x21, [sp, #8]
 ; CHECK-APPLE:  bl      _params_in_reg2
 ; Restore return values for return from this function.
-; CHECK-APPLE:  mov      x0, x21
+; CHECK-APPLE:  mov      x0, x19
 ; CHECK-APPLE:  mov      x1, x22
 ; CHECK-APPLE:  mov      x2, x24
 ; CHECK-APPLE:  mov      x3, x25
@@ -566,9 +566,9 @@ declare swiftcc void @params_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, i8*
 ; CHECK-APPLE:  mov      x6, x28
 ; CHECK-APPLE:  mov      x7, x23
 ; Restore swifterror %err and callee save registers.
-; CHECK-APPLE:  ldp     x19, x28, [sp, #24
+; CHECK-APPLE:  ldp     x21, x28, [sp, #24
 ; CHECK-APPLE:  ldp     x29, x30, [sp
-; CHECK-APPLE:  ldp     x21, x20, [sp
+; CHECK-APPLE:  ldp     x20, x19, [sp
 ; CHECK-APPLE:  ldp     x23, x22, [sp
 ; CHECK-APPLE:  ldp     x25, x24, [sp
 ; CHECK-APPLE:  ldp     x27, x26, [sp
@@ -583,3 +583,17 @@ define swiftcc { i64, i64, i64, i64, i64, i64, i64, i64 } @params_and_return_in_
 }
 
 declare swiftcc { i64, i64, i64, i64, i64, i64, i64, i64 } @params_and_return_in_reg2(i64, i64, i64, i64, i64, i64, i64, i64, i8* swiftself, %swift_error** nocapture swifterror %err)
+
+declare void @acallee(i8*)
+
+; Make sure we don't tail call if the caller returns a swifterror value. We
+; would have to move into the swifterror register before the tail call.
+; CHECK-APPLE: tailcall_from_swifterror:
+; CHECK-APPLE-NOT: b _acallee
+; CHECK-APPLE: bl _acallee
+
+define swiftcc void @tailcall_from_swifterror(%swift_error** swifterror %error_ptr_ref) {
+entry:
+  tail call void @acallee(i8* null)
+  ret void
+}
diff --git a/test/CodeGen/AArch64/swiftself.ll b/test/CodeGen/AArch64/swiftself.ll
index a60aed6b0f2b7e21887e634a1ffb95f401eaa4b9..33a49198430eca4f682781debd60233f55a7580c 100644
--- a/test/CodeGen/AArch64/swiftself.ll
+++ b/test/CodeGen/AArch64/swiftself.ll
@@ -65,3 +65,21 @@ define i8* @swiftself_notail(i8* swiftself %addr0, i8* %addr1) nounwind {
   %res = tail call i8* @swiftself_param(i8* swiftself %addr1)
   ret i8* %res
 }
+
+; We cannot pretend that 'x0' is alive across the thisreturn_attribute call as
+; we normally would. We marked the first parameter with swiftself which means it
+; will no longer be passed in x0.
+declare swiftcc i8* @thisreturn_attribute(i8* returned swiftself)
+; OPT-LABEL: swiftself_nothisreturn:
+; OPT-DAG: ldr  x20, [x20]
+; OPT-DAG: mov [[CSREG:x[1-9].*]], x8
+; OPT: bl {{_?}}thisreturn_attribute
+; OPT: str x0, {{\[}}[[CSREG]]
+; OPT: ret
+define hidden swiftcc void @swiftself_nothisreturn(i8** noalias nocapture sret, i8** noalias nocapture readonly swiftself) {
+entry:
+  %2 = load i8*, i8** %1, align 8
+  %3 = tail call swiftcc i8* @thisreturn_attribute(i8* swiftself %2)
+  store i8* %3, i8** %0, align 8
+  ret void
+}
diff --git a/test/CodeGen/AArch64/tail-dup-repeat-worklist.ll b/test/CodeGen/AArch64/tail-dup-repeat-worklist.ll
deleted file mode 100644
index c2997c50f4d45a78e4268b18eca12a39664866d1..0000000000000000000000000000000000000000
--- a/test/CodeGen/AArch64/tail-dup-repeat-worklist.ll
+++ /dev/null
@@ -1,69 +0,0 @@
-; RUN: llc -O3 -o - -verify-machineinstrs %s | FileCheck %s
-target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
-target triple = "aarch64-unknown-linux-gnu"
-
-%struct.s1 = type { %struct.s3*, %struct.s1* }
-%struct.s2 = type opaque
-%struct.s3 = type { i32 }
-
-; Function Attrs: nounwind
-define internal fastcc i32 @repeated_dup_worklist(%struct.s1** %pp1, %struct.s2* %p2, i32 %state, i1 %i1_1, i32 %i32_1) unnamed_addr #0 {
-entry:
-  br label %while.cond.outer
-
-; The loop gets laid out:
-; %while.cond.outer
-; %(null)
-; %(null)
-; %dup2
-; and then %dup1 gets chosen as the next block.
-; when dup2 is duplicated into dup1, %worklist could erroneously be placed on
-; the worklist, because all of its current predecessors are now scheduled.
-; However, after dup2 is tail-duplicated, %worklist can't be on the worklist
-; because it now has unscheduled predecessors.q
-; CHECK-LABEL: repeated_dup_worklist
-; CHECK: // %entry
-; CHECK: // %while.cond.outer
-; first %(null) block
-; CHECK: // in Loop:
-; CHECK: ldr
-; CHECK-NEXT: tbnz
-; second %(null) block
-; CHECK: // in Loop:
-; CHECK: // %dup2
-; CHECK: // %worklist
-; CHECK: // %if.then96.i
-while.cond.outer:                                 ; preds = %dup1, %entry
-  %progress.0.ph = phi i32 [ 0, %entry ], [ %progress.1, %dup1 ]
-  %inc77 = add nsw i32 %progress.0.ph, 1
-  %cmp = icmp slt i32 %progress.0.ph, %i32_1
-  br i1 %cmp, label %dup2, label %dup1
-
-dup2:                       ; preds = %if.then96.i, %worklist, %while.cond.outer
-  %progress.1.ph = phi i32 [ 0, %while.cond.outer ], [ %progress.1, %if.then96.i ], [ %progress.1, %worklist ]
-  %.pr = load %struct.s1*, %struct.s1** %pp1, align 8
-  br label %dup1
-
-dup1:                                       ; preds = %dup2, %while.cond.outer
-  %0 = phi %struct.s1* [ %.pr, %dup2 ], [ undef, %while.cond.outer ]
-  %progress.1 = phi i32 [ %progress.1.ph, %dup2 ], [ %inc77, %while.cond.outer ]
-  br i1 %i1_1, label %while.cond.outer, label %worklist
-
-worklist:                                       ; preds = %dup1
-  %snode94 = getelementptr inbounds %struct.s1, %struct.s1* %0, i64 0, i32 0
-  %1 = load %struct.s3*, %struct.s3** %snode94, align 8
-  %2 = getelementptr inbounds %struct.s3, %struct.s3* %1, i32 0, i32 0
-  %3 = load i32, i32* %2, align 4
-  %tobool95.i = icmp eq i32 %3, 0
-  br i1 %tobool95.i, label %if.then96.i, label %dup2
-
-if.then96.i:                                      ; preds = %worklist
-  call fastcc void @free_s3(%struct.s2* %p2, %struct.s3* %1) #1
-  br label %dup2
-}
-
-; Function Attrs: nounwind
-declare fastcc void @free_s3(%struct.s2*, %struct.s3*) unnamed_addr #0
-
-attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a57" "target-features"="+crc,+crypto,+neon" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind }
diff --git a/test/CodeGen/AArch64/tailcall-string-rvo.ll b/test/CodeGen/AArch64/tailcall-string-rvo.ll
new file mode 100644
index 0000000000000000000000000000000000000000..bdc09235afd9c2985cd024918527944f52aefe09
--- /dev/null
+++ b/test/CodeGen/AArch64/tailcall-string-rvo.ll
@@ -0,0 +1,47 @@
+; RUN: llc -relocation-model=static -verify-machineinstrs -O2 < %s | FileCheck %s
+
+; The call to function TestBar should be a tail call, when in C++ the string
+; `ret` is RVO returned.
+; string TestFoo() {
+;   string ret = undef;
+;   TestBar(&ret);  // tail call optimized
+;   return ret;
+; }
+
+target triple = "aarch64-linux-gnu"
+
+%class.basic_string.11.42.73 = type { %"class.__gnu_cxx::__versa_string.10.41.72" }
+%"class.__gnu_cxx::__versa_string.10.41.72" = type { %"class.__gnu_cxx::__sso_string_base.9.40.71" }
+%"class.__gnu_cxx::__sso_string_base.9.40.71" = type { %"struct.__gnu_cxx::__vstring_utility<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider.7.38.69", i64, %union.anon.8.39.70 }
+%"struct.__gnu_cxx::__vstring_utility<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider.7.38.69" = type { i8* }
+%union.anon.8.39.70 = type { i64, [8 x i8] }
+
+declare void @TestBaz(%class.basic_string.11.42.73* noalias sret %arg)
+
+define void @TestBar(%class.basic_string.11.42.73* noalias sret %arg) {
+bb:
+  call void @TestBaz(%class.basic_string.11.42.73* noalias sret %arg)
+  ret void
+}
+
+define void @TestFoo(%class.basic_string.11.42.73* noalias sret %arg) {
+; CHECK-LABEL: TestFoo:
+; CHECK: b TestBar
+bb:
+  %tmp = getelementptr inbounds %class.basic_string.11.42.73, %class.basic_string.11.42.73* %arg, i64 0, i32 0, i32 0, i32 2
+  %tmp1 = bitcast %class.basic_string.11.42.73* %arg to %union.anon.8.39.70**
+  store %union.anon.8.39.70* %tmp, %union.anon.8.39.70** %tmp1, align 8
+  %tmp2 = bitcast %union.anon.8.39.70* %tmp to i8*
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %tmp2, i8* nonnull undef, i64 13, i32 1, i1 false)
+  %tmp3 = getelementptr inbounds %class.basic_string.11.42.73, %class.basic_string.11.42.73* %arg, i64 0, i32 0, i32 0, i32 1
+  store i64 13, i64* %tmp3, align 8
+  %tmp4 = getelementptr inbounds %class.basic_string.11.42.73, %class.basic_string.11.42.73* %arg, i64 0, i32 0, i32 0, i32 2, i32 1, i64 5
+  store i8 0, i8* %tmp4, align 1
+  tail call void @TestBar(%class.basic_string.11.42.73* noalias sret %arg)
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #0
+
+attributes #0 = { argmemonly nounwind }
diff --git a/test/CodeGen/AArch64/tbz-tbnz.ll b/test/CodeGen/AArch64/tbz-tbnz.ll
index 0dd265c18ec7604b944ecca72ec321342a115d16..7ef78ca52a249fbe907f31923d188ca3a9ebfa78 100644
--- a/test/CodeGen/AArch64/tbz-tbnz.ll
+++ b/test/CodeGen/AArch64/tbz-tbnz.ll
@@ -10,7 +10,7 @@ entry:
   br i1 %cmp, label %if.then, label %if.end
 
 ; CHECK: sub [[CMP:w[0-9]+]], w0, #12
-; CHECK: tbz [[CMP]], #31
+; CHECK: tbnz [[CMP]], #31
 
 if.then:
   call void @t()
@@ -28,7 +28,7 @@ entry:
   br i1 %cmp, label %if.then, label %if.end
 
 ; CHECK: sub [[CMP:x[0-9]+]], x0, #12
-; CHECK: tbz [[CMP]], #63
+; CHECK: tbnz [[CMP]], #63
 
 if.then:
   call void @t()
@@ -118,7 +118,7 @@ entry:
   br i1 %cmp, label %if.then, label %if.end
 
 ; CHECK: sub [[CMP:w[0-9]+]], w0, #12
-; CHECK: tbz [[CMP]], #31
+; CHECK: tbnz [[CMP]], #31
 
 if.then:
   call void @t()
@@ -178,7 +178,7 @@ define void @test9(i64 %val1) {
   br i1 %tst, label %if.then, label %if.end
 
 ; CHECK-NOT: cmp
-; CHECK: tbz x0, #63
+; CHECK: tbnz x0, #63
 
 if.then:
   call void @t()
@@ -194,7 +194,7 @@ define void @test10(i64 %val1) {
   br i1 %tst, label %if.then, label %if.end
 
 ; CHECK-NOT: cmp
-; CHECK: tbz x0, #63
+; CHECK: tbnz x0, #63
 
 if.then:
   call void @t()
@@ -209,7 +209,7 @@ define void @test11(i64 %val1, i64* %ptr) {
 
 ; CHECK: ldr [[CMP:x[0-9]+]], [x1]
 ; CHECK-NOT: cmp
-; CHECK: tbz [[CMP]], #63
+; CHECK: tbnz [[CMP]], #63
 
   %val = load i64, i64* %ptr
   %tst = icmp slt i64 %val, 0
@@ -229,7 +229,7 @@ define void @test12(i64 %val1) {
   br i1 %tst, label %if.then, label %if.end
 
 ; CHECK-NOT: cmp
-; CHECK: tbz x0, #63
+; CHECK: tbnz x0, #63
 
 if.then:
   call void @t()
@@ -247,7 +247,7 @@ define void @test13(i64 %val1, i64 %val2) {
 
 ; CHECK: orr [[CMP:x[0-9]+]], x0, x1
 ; CHECK-NOT: cmp
-; CHECK: tbz [[CMP]], #63
+; CHECK: tbnz [[CMP]], #63
 
 if.then:
   call void @t()
diff --git a/test/CodeGen/AArch64/thread-pointer.ll b/test/CodeGen/AArch64/thread-pointer.ll
new file mode 100644
index 0000000000000000000000000000000000000000..91585791a58e98424355c1199e8f941ee57a2f90
--- /dev/null
+++ b/test/CodeGen/AArch64/thread-pointer.ll
@@ -0,0 +1,60 @@
+; RUN: llc -mtriple=aarch64-linux-gnu -verify-machineinstrs -o - %s | FileCheck %s
+
+@x = thread_local local_unnamed_addr global i32 0, align 4
+@y = thread_local local_unnamed_addr global i32 0, align 4
+
+; Machine LICM should hoist the mrs into the loop preheader.
+; CHECK-LABEL: @test1
+; CHECK: BB#1:
+; CHECK:   mrs x[[BASE:[0-9]+]], TPIDR_EL0
+; CHECK:   add x[[REG1:[0-9]+]], x[[BASE]], :tprel_hi12:x
+; CHECK:   add x[[REG2:[0-9]+]], x[[REG1]], :tprel_lo12_nc:x
+;
+; CHECK: .LBB0_2:
+; CHECK:   ldr w0, [x[[REG2]]]
+; CHECK:   bl bar
+; CHECK:   sub w[[REG3:[0-9]+]], w{{[0-9]+}}, #1
+; CHECK:   cbnz w[[REG3]], .LBB0_2
+
+define void @test1(i32 %n) local_unnamed_addr {
+entry:
+  %cmp3 = icmp sgt i32 %n, 0
+  br i1 %cmp3, label %bb1, label %bb2
+
+bb1:
+  br label %for.body
+
+for.body:
+  %i.04 = phi i32 [ %inc, %for.body ], [ 0, %bb1 ]
+  %0 = load i32, i32* @x, align 4
+  tail call void @bar(i32 %0) #2
+  %inc = add nuw nsw i32 %i.04, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %bb2, label %for.body
+
+bb2:
+  ret void
+}
+
+; Machine CSE should combine the mrs between the load of %x and %y.
+; CHECK-LABEL: @test2
+; CHECK: mrs x{{[0-9]+}}, TPIDR_EL0
+; CHECK-NOT: mrs x{{[0-9]+}}, TPIDR_EL0
+; CHECK: ret
+define void @test2(i32 %c) local_unnamed_addr #0 {
+entry:
+  %0 = load i32, i32* @x, align 4
+  tail call void @bar(i32 %0) #2
+  %cmp = icmp eq i32 %c, 0
+  br i1 %cmp, label %if.end, label %if.then
+
+if.then:
+  %1 = load i32, i32* @y, align 4
+  tail call void @bar(i32 %1) #2
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+declare void @bar(i32) local_unnamed_addr
diff --git a/test/CodeGen/AArch64/vector_merge_dep_check.ll b/test/CodeGen/AArch64/vector_merge_dep_check.ll
index 9220947e8362b249445bc5fdc162733217948fea..e4e64ef8c8dbf687addfe4b146527ee001f37905 100644
--- a/test/CodeGen/AArch64/vector_merge_dep_check.ll
+++ b/test/CodeGen/AArch64/vector_merge_dep_check.ll
@@ -1,5 +1,4 @@
-; RUN: llc --combiner-alias-analysis=false < %s | FileCheck %s
-; RUN: llc --combiner-alias-analysis=true  < %s | FileCheck %s
+; RUN: llc < %s | FileCheck %s
 
 ; This test checks that we do not merge stores together which have
 ; dependencies through their non-chain operands (e.g. one store is the
diff --git a/test/CodeGen/AMDGPU/32-bit-local-address-space.ll b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
index edad18e244d03186e090e79803072f56dc391885..ca661cf9a712caf8098bd89623188d75fb2187c6 100644
--- a/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
+++ b/test/CodeGen/AMDGPU/32-bit-local-address-space.ll
@@ -13,7 +13,7 @@
 ; FUNC-LABEL: {{^}}local_address_load:
 ; SI: v_mov_b32_e{{32|64}} [[PTR:v[0-9]]]
 ; SI: ds_read_b32 v{{[0-9]+}}, [[PTR]]
-define void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
   %0 = load i32, i32 addrspace(3)* %in
   store i32 %0, i32 addrspace(1)* %out
@@ -24,7 +24,7 @@ entry:
 ; SI: s_add_i32 [[SPTR:s[0-9]]]
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_read_b32 [[VPTR]]
-define void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
+define amdgpu_kernel void @local_address_gep(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %offset) {
 entry:
   %0 = getelementptr i32, i32 addrspace(3)* %in, i32 %offset
   %1 = load i32, i32 addrspace(3)* %0
@@ -35,7 +35,7 @@ entry:
 ; FUNC-LABEL: {{^}}local_address_gep_const_offset:
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
 ; SI: ds_read_b32 v{{[0-9]+}}, [[VPTR]] offset:4
-define void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_gep_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
   %0 = getelementptr i32, i32 addrspace(3)* %in, i32 1
   %1 = load i32, i32 addrspace(3)* %0
@@ -48,7 +48,7 @@ entry:
 ; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_read_b32 [[VPTR]]
-define void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
+define amdgpu_kernel void @local_address_gep_large_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) {
 entry:
   %0 = getelementptr i32, i32 addrspace(3)* %in, i32 16385
   %1 = load i32, i32 addrspace(3)* %0
@@ -60,7 +60,7 @@ entry:
 ; SI: v_cmp_ne_u32
 ; SI-NOT: v_cmp_ne_u32
 ; SI: v_cndmask_b32
-define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
+define amdgpu_kernel void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) nounwind {
   %cmp = icmp ne i32 addrspace(3)* %lds, null
   %x = select i1 %cmp, i32 123, i32 456
   store i32 %x, i32 addrspace(1)* %out
@@ -71,7 +71,7 @@ define void @null_32bit_lds_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %lds)
 ; SI: s_mul_i32
 ; SI-NEXT: s_add_i32
 ; SI: ds_read_b32
-define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
+define amdgpu_kernel void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %lds, i32 %tid) {
   %ptr = getelementptr [3 x float], [3 x float] addrspace(3)* %lds, i32 %tid, i32 0
   %val = load float, float addrspace(3)* %ptr
   store float %val, float addrspace(1)* %out
@@ -83,7 +83,7 @@ define void @mul_32bit_ptr(float addrspace(1)* %out, [3 x float] addrspace(3)* %
 ; FUNC-LABEL: {{^}}infer_ptr_alignment_global_offset:
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0
 ; SI: ds_read_b32 v{{[0-9]+}}, [[REG]]
-define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
+define amdgpu_kernel void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %tid) {
   %val = load float, float addrspace(3)* @g_lds
   store float %val, float addrspace(1)* %out
   ret void
@@ -95,14 +95,14 @@ define void @infer_ptr_alignment_global_offset(float addrspace(1)* %out, i32 %ti
 
 ; FUNC-LABEL: {{^}}global_ptr:
 ; SI: ds_write_b32
-define void @global_ptr() nounwind {
+define amdgpu_kernel void @global_ptr() nounwind {
   store i32 addrspace(3)* getelementptr ([16383 x i32], [16383 x i32] addrspace(3)* @dst, i32 0, i32 16), i32 addrspace(3)* addrspace(3)* @ptr
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_address_store:
 ; SI: ds_write_b32
-define void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
   store i32 %val, i32 addrspace(3)* %out
   ret void
 }
@@ -111,7 +111,7 @@ define void @local_address_store(i32 addrspace(3)* %out, i32 %val) {
 ; SI: s_add_i32 [[SADDR:s[0-9]+]],
 ; SI: v_mov_b32_e32 [[ADDR:v[0-9]+]], [[SADDR]]
 ; SI: ds_write_b32 [[ADDR]], v{{[0-9]+}}
-define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
+define amdgpu_kernel void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32 %offset) {
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 %offset
   store i32 %val, i32 addrspace(3)* %gep, align 4
   ret void
@@ -121,7 +121,7 @@ define void @local_address_gep_store(i32 addrspace(3)* %out, i32, i32 %val, i32
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], s{{[0-9]+}}
 ; SI: v_mov_b32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
 ; SI: ds_write_b32 [[VPTR]], [[VAL]] offset:4
-define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 1
   store i32 %val, i32 addrspace(3)* %gep, align 4
   ret void
@@ -132,7 +132,7 @@ define void @local_address_gep_const_offset_store(i32 addrspace(3)* %out, i32 %v
 ; SI: s_add_i32 [[SPTR:s[0-9]]], s{{[0-9]+}}, 0x10004
 ; SI: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; SI: ds_write_b32 [[VPTR]], v{{[0-9]+$}}
-define void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
+define amdgpu_kernel void @local_address_gep_large_const_offset_store(i32 addrspace(3)* %out, i32 %val) {
   %gep = getelementptr i32, i32 addrspace(3)* %out, i32 16385
   store i32 %val, i32 addrspace(3)* %gep, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
index c78ed11b73e19f0d3fcbcebc324c5a31b55d1ddc..56a9e7022db9c5587f5d8f1894a4d14a5ad91dc9 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir
@@ -4,7 +4,7 @@
 # REQUIRES: global-isel
 
 --- |
-  define void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
+  define amdgpu_kernel void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
 ...
 ---
 
@@ -22,6 +22,7 @@ body: |
 
     %0:vgpr(p1) = COPY %vgpr0_vgpr1
     %1:vgpr(s32) = G_LOAD %0 :: (load 4 from %ir.global0)
+    %vgpr0 = COPY %1
 
 ...
 ---
diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
index 078e71fc20fca7155a2d860cd66753aa44673979..ea2ad2ba83a5252715f01a8dec9547b10adf69dc 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-smrd.mir
@@ -5,7 +5,7 @@
 # REQUIRES: global-isel
 
 --- |
-  define void @smrd_imm(i32 addrspace(2)* %const0) { ret void }
+  define amdgpu_kernel void @smrd_imm(i32 addrspace(2)* %const0) { ret void }
 ...
 ---
 
@@ -25,25 +25,18 @@ regBankSelected: true
 # VI:   S_LOAD_DWORD_IMM [[PTR]], 1020, 0
 
 # Immediate overflow for SI
-# FIXME: The immediate gets selected twice, once into the
-# S_LOAD_DWORD instruction and once just as a normal constat.
-# SI: S_MOV_B32 1024
 # SI: [[K1024:%[0-9]+]] = S_MOV_B32 1024
 # SI: S_LOAD_DWORD_SGPR [[PTR]], [[K1024]], 0
 # CI: S_LOAD_DWORD_IMM_ci [[PTR]], 256, 0
 # VI: S_LOAD_DWORD_IMM [[PTR]], 1024, 0
 
 # Max immediate offset for VI
-# SI: S_MOV_B32 1048572
 # SI: [[K1048572:%[0-9]+]] = S_MOV_B32 1048572
 # CI: S_LOAD_DWORD_IMM_ci [[PTR]], 262143
 # VI: S_LOAD_DWORD_IMM [[PTR]], 1048572
 
 #
 # Immediate overflow for VI
-# FIXME: The immediate gets selected twice, once into the
-# S_LOAD_DWORD instruction and once just as a normal constat.
-# SIVI: S_MOV_B32 1048576
 # SIVI: [[K1048576:%[0-9]+]] = S_MOV_B32 1048576
 # SIVI: S_LOAD_DWORD_SGPR [[PTR]], [[K1048576]], 0
 # CI: S_LOAD_DWORD_IMM_ci [[PTR]], 262144, 0
@@ -52,11 +45,11 @@ regBankSelected: true
 # SIVI: [[K_LO:%[0-9]+]] = S_MOV_B32 4294967292
 # SIVI: [[K_HI:%[0-9]+]] = S_MOV_B32 3
 # SIVI: [[K:%[0-9]+]] = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2
-# SIVI: [[K_SUB0:%[0-9]+]] = COPY [[K]].sub0
-# SIVI: [[PTR_LO:%[0-9]+]] = COPY [[PTR]].sub0
+# SIVI-DAG: [[K_SUB0:%[0-9]+]] = COPY [[K]].sub0
+# SIVI-DAG: [[PTR_LO:%[0-9]+]] = COPY [[PTR]].sub0
 # SIVI: [[ADD_PTR_LO:%[0-9]+]] = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
-# SIVI: [[K_SUB1:%[0-9]+]] = COPY [[K]].sub1
-# SIVI: [[PTR_HI:%[0-9]+]] = COPY [[PTR]].sub1
+# SIVI-DAG: [[K_SUB1:%[0-9]+]] = COPY [[K]].sub1
+# SIVI-DAG: [[PTR_HI:%[0-9]+]] = COPY [[PTR]].sub1
 # SIVI: [[ADD_PTR_HI:%[0-9]+]] = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]]
 # SIVI: [[ADD_PTR:%[0-9]+]] = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2
 # SIVI: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0
@@ -66,19 +59,16 @@ regBankSelected: true
 # GCN: [[K_LO:%[0-9]+]] = S_MOV_B32 0
 # GCN: [[K_HI:%[0-9]+]] = S_MOV_B32 4
 # GCN: [[K:%[0-9]+]] = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2
-# GCN: [[K_SUB0:%[0-9]+]] = COPY [[K]].sub0
-# GCN: [[PTR_LO:%[0-9]+]] = COPY [[PTR]].sub0
+# GCN-DAG: [[K_SUB0:%[0-9]+]] = COPY [[K]].sub0
+# GCN-DAG: [[PTR_LO:%[0-9]+]] = COPY [[PTR]].sub0
 # GCN: [[ADD_PTR_LO:%[0-9]+]] = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
-# GCN: [[K_SUB1:%[0-9]+]] = COPY [[K]].sub1
-# GCN: [[PTR_HI:%[0-9]+]] = COPY [[PTR]].sub1
+# GCN-DAG: [[K_SUB1:%[0-9]+]] = COPY [[K]].sub1
+# GCN-DAG: [[PTR_HI:%[0-9]+]] = COPY [[PTR]].sub1
 # GCN: [[ADD_PTR_HI:%[0-9]+]] = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]]
 # GCN: [[ADD_PTR:%[0-9]+]] = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2
 # GCN: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0
 
 # Max 32-bit byte offset
-# FIXME: The immediate gets selected twice, once into the
-# S_LOAD_DWORD instruction and once just as a normal constat.
-# SIVI: S_MOV_B32 4294967292
 # SIVI: [[K4294967292:%[0-9]+]] = S_MOV_B32 4294967292
 # SIVI: S_LOAD_DWORD_SGPR [[PTR]], [[K4294967292]], 0
 # CI: S_LOAD_DWORD_IMM_ci [[PTR]], 1073741823, 0
@@ -87,11 +77,11 @@ regBankSelected: true
 # SIVI: [[K_LO:%[0-9]+]] = S_MOV_B32 0
 # SIVI: [[K_HI:%[0-9]+]] = S_MOV_B32 1
 # SIVI: [[K:%[0-9]+]] = REG_SEQUENCE [[K_LO]], 1, [[K_HI]], 2
-# SIVI: [[K_SUB0:%[0-9]+]] = COPY [[K]].sub0
-# SIVI: [[PTR_LO:%[0-9]+]] = COPY [[PTR]].sub0
+# SIVI-DAG: [[K_SUB0:%[0-9]+]] = COPY [[K]].sub0
+# SIVI-DAG: [[PTR_LO:%[0-9]+]] = COPY [[PTR]].sub0
 # SIVI: [[ADD_PTR_LO:%[0-9]+]] = S_ADD_U32 [[PTR_LO]], [[K_SUB0]]
-# SIVI: [[K_SUB1:%[0-9]+]] = COPY [[K]].sub1
-# SIVI: [[PTR_HI:%[0-9]+]] = COPY [[PTR]].sub1
+# SIVI-DAG: [[K_SUB1:%[0-9]+]] = COPY [[K]].sub1
+# SIVI-DAG: [[PTR_HI:%[0-9]+]] = COPY [[PTR]].sub1
 # SIVI: [[ADD_PTR_HI:%[0-9]+]] = S_ADDC_U32 [[PTR_HI]], [[K_SUB1]]
 # SIVI: [[ADD_PTR:%[0-9]+]] = REG_SEQUENCE [[ADD_PTR_LO]], 1, [[ADD_PTR_HI]], 2
 # SIVI: S_LOAD_DWORD_IMM [[ADD_PTR]], 0, 0
@@ -106,38 +96,47 @@ body: |
     %1:sgpr(s64) = G_CONSTANT i64 4
     %2:sgpr(p2) = G_GEP %0, %1
     %3:sgpr(s32) = G_LOAD %2 :: (load 4 from %ir.const0)
+    %sgpr0 = COPY %3
 
     %4:sgpr(s64) = G_CONSTANT i64 1020
     %5:sgpr(p2) = G_GEP %0, %4
     %6:sgpr(s32) = G_LOAD %5 :: (load 4 from %ir.const0)
+    %sgpr0 = COPY %6
 
     %7:sgpr(s64) = G_CONSTANT i64 1024
     %8:sgpr(p2) = G_GEP %0, %7
     %9:sgpr(s32) = G_LOAD %8 :: (load 4 from %ir.const0)
+    %sgpr0 = COPY %9
 
     %10:sgpr(s64) = G_CONSTANT i64 1048572
     %11:sgpr(p2) = G_GEP %0, %10
     %12:sgpr(s32) = G_LOAD %11 :: (load 4 from %ir.const0)
+    %sgpr0 = COPY %12
 
     %13:sgpr(s64) = G_CONSTANT i64 1048576
     %14:sgpr(p2) = G_GEP %0, %13
     %15:sgpr(s32) = G_LOAD %14 :: (load 4 from %ir.const0)
+    %sgpr0 = COPY %15
 
     %16:sgpr(s64) = G_CONSTANT i64 17179869180
     %17:sgpr(p2) = G_GEP %0, %16
     %18:sgpr(s32) = G_LOAD %17 :: (load 4 from %ir.const0)
+    %sgpr0 = COPY %18
 
     %19:sgpr(s64) = G_CONSTANT i64 17179869184
     %20:sgpr(p2) = G_GEP %0, %19
     %21:sgpr(s32) = G_LOAD %20 :: (load 4 from %ir.const0)
+    %sgpr0 = COPY %21
 
     %22:sgpr(s64) = G_CONSTANT i64 4294967292
     %23:sgpr(p2) = G_GEP %0, %22
     %24:sgpr(s32) = G_LOAD %23 :: (load 4 from %ir.const0)
+    %sgpr0 = COPY %24
 
     %25:sgpr(s64) = G_CONSTANT i64 4294967296
     %26:sgpr(p2) = G_GEP %0, %25
     %27:sgpr(s32) = G_LOAD %26 :: (load 4 from %ir.const0)
+    %sgpr0 = COPY %27
 
 ...
 ---
diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
index 070d3bc1448c434c71c69920fb17c6704dcfe023..ea435725bf25df31465ebc8096a1401d7a0dc068 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir
@@ -4,7 +4,7 @@
 # REQUIRES: global-isel
 
 --- |
-  define void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
+  define amdgpu_kernel void @global_addrspace(i32 addrspace(1)* %global0) { ret void }
 ...
 ---
 
diff --git a/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir b/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
index 94473367e0ad997d48108f623c8ae814cccdf2c3..3496b1ab71fe66ad28dbd7a3195d9e0e652cc263 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
+++ b/test/CodeGen/AMDGPU/GlobalISel/regbankselect.mir
@@ -3,12 +3,12 @@
 # REQUIRES: global-isel
 
 --- |
-  define void @load_constant(i32 addrspace(2)* %ptr0) { ret void }
-  define void @load_global_uniform(i32 addrspace(1)* %ptr1) {
+  define amdgpu_kernel void @load_constant(i32 addrspace(2)* %ptr0) { ret void }
+  define amdgpu_kernel void @load_global_uniform(i32 addrspace(1)* %ptr1) {
     %tmp0 = load i32, i32 addrspace(1)* %ptr1
     ret void
   }
-  define void @load_global_non_uniform(i32 addrspace(1)* %ptr2) {
+  define amdgpu_kernel void @load_global_non_uniform(i32 addrspace(1)* %ptr2) {
     %tmp0 = call i32 @llvm.amdgcn.workitem.id.x() #0
     %tmp1 = getelementptr i32, i32 addrspace(1)* %ptr2, i32 %tmp0
     %tmp2 = load i32, i32 addrspace(1)* %tmp1
diff --git a/test/CodeGen/AMDGPU/GlobalISel/smrd.ll b/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
index a3630b1ac92a687cab92ee4c09becaaeda275208..8a6b3df9cff8dcc4a7b680f8c4b5fcd502075e9e 100644
--- a/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
+++ b/test/CodeGen/AMDGPU/GlobalISel/smrd.ll
@@ -9,7 +9,7 @@
 ; GCN-LABEL: {{^}}smrd0:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
-define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
   %1 = load i32, i32 addrspace(2)* %0
@@ -21,7 +21,7 @@ entry:
 ; GCN-LABEL: {{^}}smrd1:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
   %1 = load i32, i32 addrspace(2)* %0
@@ -36,7 +36,7 @@ entry:
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
 ; GCN: s_endpgm
-define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
   %1 = load i32, i32 addrspace(2)* %0
@@ -51,7 +51,7 @@ entry:
 ; XSI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
 ; TODO: Add VI checks
 ; XGCN: s_endpgm
-define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
   %1 = load i32, i32 addrspace(2)* %0
@@ -65,7 +65,7 @@ entry:
 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
   %1 = load i32, i32 addrspace(2)* %0
@@ -79,7 +79,7 @@ entry:
 ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
-define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
 entry:
   %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144
   %1 = load i32, i32 addrspace(2)* %0
diff --git a/test/CodeGen/AMDGPU/add-debug.ll b/test/CodeGen/AMDGPU/add-debug.ll
index 529905dd36a2aa2a4f536ba4d5e5beeccbfe75d8..b90c20b9748233eb6cc9cd0b4f851edd4306ee4a 100644
--- a/test/CodeGen/AMDGPU/add-debug.ll
+++ b/test/CodeGen/AMDGPU/add-debug.ll
@@ -3,7 +3,7 @@
 ; REQUIRES: asserts
 
 ; Check that SelectionDAGDumper does not crash on int_SI_if.
-define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
 entry:
   %0 = icmp eq i64 %a, 0
   br i1 %0, label %if, label %else
diff --git a/test/CodeGen/AMDGPU/add.i16.ll b/test/CodeGen/AMDGPU/add.i16.ll
index 6c5cdd3877d16cec192e8cac0bcda2aea68db3e1..b65e79f14deb71238a05ad350aaa308a6195b31d 100644
--- a/test/CodeGen/AMDGPU/add.i16.ll
+++ b/test/CodeGen/AMDGPU/add.i16.ll
@@ -6,7 +6,7 @@
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -23,7 +23,7 @@ define void @v_test_add_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x7b, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -38,7 +38,7 @@ define void @v_test_add_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xfffffcb3, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -53,7 +53,7 @@ define void @v_test_add_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], -1, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -69,7 +69,7 @@ define void @v_test_add_i16_inline_neg1(i16 addrspace(1)* %out, i16 addrspace(1)
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: buffer_store_dword [[ADD]]
-define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -89,7 +89,7 @@ define void @v_test_add_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
 ; VI-DAG: v_add_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
 ; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -109,7 +109,7 @@ define void @v_test_add_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]],  [[B]], [[A]]
 ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: buffer_store_dword [[SEXT]]
-define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -130,7 +130,7 @@ define void @v_test_add_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
 ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_add_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
diff --git a/test/CodeGen/AMDGPU/add.ll b/test/CodeGen/AMDGPU/add.ll
index a6247c73524091f23f610b0ce6a9a8da1381715a..7e4546d2cfb3f0b2c54007450a4212f9f3bd4ad0 100644
--- a/test/CodeGen/AMDGPU/add.ll
+++ b/test/CodeGen/AMDGPU/add.ll
@@ -8,7 +8,7 @@
 ;SI: v_add_i32_e32 [[REG:v[0-9]+]], vcc, {{v[0-9]+, v[0-9]+}}
 ;SI-NOT: [[REG]]
 ;SI: buffer_store_dword [[REG]],
-define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -24,7 +24,7 @@ define void @test1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 
-define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
@@ -44,7 +44,7 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ;SI: v_add_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 
-define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -71,7 +71,7 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
 ; SI: s_add_i32
 ; SI: s_add_i32
 ; SI: s_add_i32
-define void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
+define amdgpu_kernel void @test8(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) {
 entry:
   %0 = add <8 x i32> %a, %b
   store <8 x i32> %0, <8 x i32> addrspace(1)* %out
@@ -112,7 +112,7 @@ entry:
 ; SI: s_add_i32
 ; SI: s_add_i32
 ; SI: s_add_i32
-define void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
+define amdgpu_kernel void @test16(<16 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) {
 entry:
   %0 = add <16 x i32> %a, %b
   store <16 x i32> %0, <16 x i32> addrspace(1)* %out
@@ -129,7 +129,7 @@ entry:
 ; EG-DAG: ADD_INT
 ; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-NOT: SUB
-define void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @add64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %0 = add i64 %a, %b
   store i64 %0, i64 addrspace(1)* %out
@@ -150,7 +150,7 @@ entry:
 ; EG-DAG: ADD_INT
 ; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-NOT: SUB
-define void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @add64_sgpr_vgpr(i64 addrspace(1)* %out, i64 %a, i64 addrspace(1)* %in) {
 entry:
   %0 = load i64, i64 addrspace(1)* %in
   %1 = add i64 %a, %0
@@ -169,7 +169,7 @@ entry:
 ; EG-DAG: ADD_INT
 ; EG-DAG: ADD_INT {{[* ]*}}
 ; EG-NOT: SUB
-define void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @add64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
 entry:
   %0 = icmp eq i64 %a, 0
   br i1 %0, label %if, label %else
diff --git a/test/CodeGen/AMDGPU/add.v2i16.ll b/test/CodeGen/AMDGPU/add.v2i16.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e137ef4bc23672555769fb1c53b9d3fba0b763dc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/add.v2i16.ll
@@ -0,0 +1,283 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_add_v2i16:
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; VI: v_add_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_add_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %add = add <2 x i16> %a, %b
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_add_v2i16:
+; GFX9: s_load_dword [[VAL0:s[0-9]+]]
+; GFX9: s_load_dword [[VAL1:s[0-9]+]]
+; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]]
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VVAL1]], [[VAL0]]
+
+; VI: s_add_i32
+; VI: s_add_i32
+define amdgpu_kernel void @s_test_add_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
+  %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
+  %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1
+  %add = add <2 x i16> %a, %b
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_add_self_v2i16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[VAL]], [[VAL]]
+
+; VI: s_add_i32
+; VI: s_add_i32
+define amdgpu_kernel void @s_test_add_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
+  %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
+  %add = add <2 x i16> %a, %a
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: VI should not scalarize arg access.
+; GCN-LABEL: {{^}}s_test_add_v2i16_kernarg:
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
+
+; VI: v_add_i32
+; VI: v_add_i32_sdwa
+define amdgpu_kernel void @s_test_add_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
+  %add = add <2 x i16> %a, %b
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_add_v2i16_constant:
+; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
+
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x7b, v{{[0-9]+}}
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x1c8, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_add_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %add = add <2 x i16> %a, <i16 123, i16 456>
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_add_v2i16_neg_constant:
+; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}}
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
+
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffcb3, v{{[0-9]+}}
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffc21, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_add_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %add = add <2 x i16> %a, <i16 -845, i16 -991>
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_add_v2i16_inline_neg1:
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, v{{[0-9]+}}, -1{{$}}
+
+; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
+; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD0]]
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, -1, [[LOAD1]]
+; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_e32
+define amdgpu_kernel void @v_test_add_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %add = add <2 x i16> %a, <i16 -1, i16 -1>
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_add_v2i16_inline_lo_zero_hi:
+; GFX9: s_mov_b32 [[K:s[0-9]+]], 32{{$}}
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
+
+; VI-NOT: v_add_u16
+; VI: v_add_u16_e32 v{{[0-9]+}}, 32, v{{[0-9]+}}
+; VI-NOT: v_add_u16
+; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_e32
+define amdgpu_kernel void @v_test_add_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %add = add <2 x i16> %a, <i16 32, i16 0>
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
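+; The high element (16256 = 0x3f80) makes the packed 32-bit constant 0x3f800000, the bit pattern of float 1.0, which the GFX9 check expects to be materialized as 1.0.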
+; GCN-LABEL: {{^}}v_test_add_v2i16_inline_fp_split:
+; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0
+; GFX9: v_pk_add_u16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
+
+; VI-NOT: v_add_u16
+; VI: v_add_u16_e32 v{{[0-9]+}}, 0x3f80, v{{[0-9]+}}
+; VI-NOT: v_add_u16
+; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_e32
+define amdgpu_kernel void @v_test_add_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %add = add <2 x i16> %a, <i16 0, i16 16256>
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_add_v2i16_zext_to_v2i32:
+; GFX9: flat_load_dword [[A:v[0-9]+]]
+; GFX9: flat_load_dword [[B:v[0-9]+]]
+
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
+; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
+; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
+
+; VI: flat_load_ushort v[[A_HI:[0-9]+]]
+; VI: flat_load_ushort v[[A_LO:[0-9]+]]
+; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+; VI: flat_load_ushort v[[B_LO:[0-9]+]]
+
+; VI: v_add_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
+; VI-NOT: and
+; VI-NOT: shl
+; VI: v_add_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
+; VI-NOT: and
+; VI-NOT: shl
+; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
+define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %add = add <2 x i16> %a, %b
+  %ext = zext <2 x i16> %add to <2 x i32>
+  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_add_v2i16_zext_to_v2i64:
+; GFX9: flat_load_dword [[A:v[0-9]+]]
+; GFX9: flat_load_dword [[B:v[0-9]+]]
+
+; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
+; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
+; GFX9: buffer_store_dwordx4
+
+; VI: flat_load_ushort v[[A_LO:[0-9]+]]
+; VI: flat_load_ushort v[[A_HI:[0-9]+]]
+; VI: flat_load_ushort v[[B_LO:[0-9]+]]
+; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+
+; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; VI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; VI: v_add_u16_e32
+; VI: v_add_u16_e32
+
+; VI: buffer_store_dwordx4
+define amdgpu_kernel void @v_test_add_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %add = add <2 x i16> %a, %b
+  %ext = zext <2 x i16> %add to <2 x i64>
+  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_add_v2i16_sext_to_v2i32:
+; GFX9: flat_load_dword [[A:v[0-9]+]]
+; GFX9: flat_load_dword [[B:v[0-9]+]]
+
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GFX9-DAG: v_bfe_i32 v[[ELT0:[0-9]+]], [[ADD]], 0, 16
+; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
+; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
+
+; VI: v_add_u16_e32
+; VI: v_add_u16_e32
+; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; VI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; VI: buffer_store_dwordx2
+define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %add = add <2 x i16> %a, %b
+  %ext = sext <2 x i16> %add to <2 x i32>
+  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_add_v2i16_sext_to_v2i64:
+; GCN: flat_load_dword
+; GCN: flat_load_dword
+
+; GFX9: v_pk_add_u16
+; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+
+; VI: v_add_u16_sdwa
+; VI: v_add_u16_e32
+
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_add_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %add = add <2 x i16> %a, %b
+  %ext = sext <2 x i16> %add to <2 x i64>
+  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/add_i128.ll b/test/CodeGen/AMDGPU/add_i128.ll
index c80157ca9c58e7b8cc28eef676c741f71ff26c3b..00a125c2e44fb16f846f8c7d6aa678edc9db341e 100644
--- a/test/CodeGen/AMDGPU/add_i128.ll
+++ b/test/CodeGen/AMDGPU/add_i128.ll
@@ -6,7 +6,7 @@
 ; GCN-NEXT: v_addc_u32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc
 ; GCN-NEXT: v_addc_u32_e32 v[[HI:[0-9]+]], vcc, v{{[0-9]+}}, v{{[0-9]+}}, vcc
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]],
-define void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %inA, i128 addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %a_ptr = getelementptr i128, i128 addrspace(1)* %inA, i32 %tid
   %b_ptr = getelementptr i128, i128 addrspace(1)* %inB, i32 %tid
@@ -23,7 +23,7 @@ define void @test_i128_vreg(i128 addrspace(1)* noalias %out, i128 addrspace(1)*
 ; GCN: v_addc_u32
 ; GCN: v_addc_u32
 ; GCN: v_addc_u32
-define void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
+define amdgpu_kernel void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
   %foo = load i128, i128 addrspace(1)* %in, align 8
   %result = add i128 %foo, %a
   store i128 %result, i128 addrspace(1)* %out
@@ -35,7 +35,7 @@ define void @sgpr_operand(i128 addrspace(1)* noalias %out, i128 addrspace(1)* no
 ; GCN: v_addc_u32
 ; GCN: v_addc_u32
 ; GCN: v_addc_u32
-define void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
+define amdgpu_kernel void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in, i128 %a) {
   %foo = load i128, i128 addrspace(1)* %in, align 8
   %result = add i128 %a, %foo
   store i128 %result, i128 addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @sgpr_operand_reversed(i128 addrspace(1)* noalias %out, i128 addrspa
 ; GCN: s_addc_u32
 ; GCN: s_addc_u32
 ; GCN: s_addc_u32
-define void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) {
+define amdgpu_kernel void @test_sreg(i128 addrspace(1)* noalias %out, i128 %a, i128 %b) {
   %result = add i128 %a, %b
   store i128 %result, i128 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/add_i64.ll b/test/CodeGen/AMDGPU/add_i64.ll
index 3d360b7d0b7ac2add278a4b45a005605aaa4f61b..62733d5bfb6c93aaa87101c5ac42b279fdc57fdd 100644
--- a/test/CodeGen/AMDGPU/add_i64.ll
+++ b/test/CodeGen/AMDGPU/add_i64.ll
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() readnone
 ; SI-LABEL: {{^}}test_i64_vreg:
 ; SI: v_add_i32
 ; SI: v_addc_u32
-define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
   %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
@@ -21,7 +21,7 @@ define void @test_i64_vreg(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noa
 ; SI-LABEL: {{^}}sgpr_operand:
 ; SI: v_add_i32
 ; SI: v_addc_u32
-define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
+define amdgpu_kernel void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 addrspace(1)* noalias %in_bar, i64 %a) {
   %foo = load i64, i64 addrspace(1)* %in, align 8
   %result = add i64 %foo, %a
   store i64 %result, i64 addrspace(1)* %out
@@ -34,7 +34,7 @@ define void @sgpr_operand(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noal
 ; SI-LABEL: {{^}}sgpr_operand_reversed:
 ; SI: v_add_i32
 ; SI: v_addc_u32
-define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
+define amdgpu_kernel void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %a) {
   %foo = load i64, i64 addrspace(1)* %in, align 8
   %result = add i64 %a, %foo
   store i64 %result, i64 addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @sgpr_operand_reversed(i64 addrspace(1)* noalias %out, i64 addrspace
 ; SI: s_addc_u32
 ; SI: s_add_u32
 ; SI: s_addc_u32
-define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
+define amdgpu_kernel void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a, <2 x i64> %b) {
   %result = add <2 x i64> %a, %b
   store <2 x i64> %result, <2 x i64> addrspace(1)* %out
   ret void
@@ -58,7 +58,7 @@ define void @test_v2i64_sreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %a,
 ; SI: v_addc_u32
 ; SI: v_add_i32
 ; SI: v_addc_u32
-define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
   %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
@@ -76,7 +76,7 @@ define void @test_v2i64_vreg(<2 x i64> addrspace(1)* noalias %out, <2 x i64> add
 ; SI-NOT: addc
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; SI: buffer_store_dword [[VRESULT]],
-define void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @trunc_i64_add_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
   %add = add i64 %b, %a
   %trunc = trunc i64 %add to i32
   store i32 %trunc, i32 addrspace(1)* %out, align 8
diff --git a/test/CodeGen/AMDGPU/addrspacecast-captured.ll b/test/CodeGen/AMDGPU/addrspacecast-captured.ll
index 481a3e2b31b0d4fee09825accbe5ce4fe534ceeb..138bc36b9e1b8bcaee5f4fb6516ec732841e3889 100644
--- a/test/CodeGen/AMDGPU/addrspacecast-captured.ll
+++ b/test/CodeGen/AMDGPU/addrspacecast-captured.ll
@@ -9,7 +9,7 @@ declare void @consume_ptr2int(i32) #0
 ; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
 ; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32
 ; CHECK: store i32 %ptr2int, i32 addrspace(1)* %out
-define void @addrspacecast_captured(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @addrspacecast_captured(i32 addrspace(1)* %out) #0 {
 entry:
   %data = alloca i32, align 4
   %cast = addrspacecast i32* %data to i32 addrspace(4)*
@@ -22,7 +22,7 @@ entry:
 ; CHECK: %data = alloca i32, align 4
 ; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
 ; CHECK: store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %out
-define void @addrspacecast_captured_store(i32 addrspace(4)* addrspace(1)* %out) #0 {
+define amdgpu_kernel void @addrspacecast_captured_store(i32 addrspace(4)* addrspace(1)* %out) #0 {
 entry:
   %data = alloca i32, align 4
   %cast = addrspacecast i32* %data to i32 addrspace(4)*
@@ -35,7 +35,7 @@ entry:
 ; CHECK: %cast = addrspacecast i32* %data to i32 addrspace(4)*
 ; CHECK: %ptr2int = ptrtoint i32 addrspace(4)* %cast to i32
 ; CHECK: call void @consume_ptr2int(i32 %ptr2int)
-define void @addrspacecast_captured_call() #0 {
+define amdgpu_kernel void @addrspacecast_captured_call() #0 {
 entry:
   %data = alloca i32, align 4
   %cast = addrspacecast i32* %data to i32 addrspace(4)*
diff --git a/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
index 67a1939992044ca1e585b61679e3694997c9ecce..8cabc7dae133e8a6b74023bcdcf23e4a92dfe413 100644
--- a/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
+++ b/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -9,57 +9,57 @@ declare void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* nocapture, i32 addrs
 @global.arr = unnamed_addr addrspace(1) global [256 x i32] undef, align 4
 
 ; HSA: @store_cast_0_flat_to_group_addrspacecast() #1
-define void @store_cast_0_flat_to_group_addrspacecast() #1 {
+define amdgpu_kernel void @store_cast_0_flat_to_group_addrspacecast() #1 {
   store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*)
   ret void
 }
 
 ; HSA: @store_cast_0_group_to_flat_addrspacecast() #2
-define void @store_cast_0_group_to_flat_addrspacecast() #1 {
+define amdgpu_kernel void @store_cast_0_group_to_flat_addrspacecast() #1 {
   store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*)
   ret void
 }
 
-; HSA: define void @store_constant_cast_group_gv_to_flat() #2
-define void @store_constant_cast_group_gv_to_flat() #1 {
+; HSA: define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #2
+define amdgpu_kernel void @store_constant_cast_group_gv_to_flat() #1 {
   store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds.i32 to i32 addrspace(4)*)
   ret void
 }
 
 ; HSA: @store_constant_cast_group_gv_gep_to_flat() #2
-define void @store_constant_cast_group_gv_gep_to_flat() #1 {
+define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat() #1 {
   store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
   ret void
 }
 
 ; HSA: @store_constant_cast_global_gv_to_flat() #1
-define void @store_constant_cast_global_gv_to_flat() #1 {
+define amdgpu_kernel void @store_constant_cast_global_gv_to_flat() #1 {
   store i32 7, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global.i32 to i32 addrspace(4)*)
   ret void
 }
 
 ; HSA: @store_constant_cast_global_gv_gep_to_flat() #1
-define void @store_constant_cast_global_gv_gep_to_flat() #1 {
+define amdgpu_kernel void @store_constant_cast_global_gv_gep_to_flat() #1 {
   store i32 7, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(1)* @global.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
   ret void
 }
 
 ; HSA: @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
-define void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @load_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
   %val = load i32, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8)
   store i32 %val, i32 addrspace(1)* %out
   ret void
 }
 
 ; HSA: @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
-define void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @atomicrmw_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
   %val = atomicrmw add i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 1 seq_cst
   store i32 %val, i32 addrspace(1)* %out
   ret void
 }
 
 ; HSA: @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
-define void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
   %val = cmpxchg i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 0, i32 1 seq_cst seq_cst
   %val0 = extractvalue { i32, i1 } %val, 0
   store i32 %val0, i32 addrspace(1)* %out
@@ -67,28 +67,28 @@ define void @cmpxchg_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out)
 }
 
 ; HSA: @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #2
-define void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @memcpy_constant_cast_group_gv_gep_to_flat(i32 addrspace(1)* %out) #1 {
   call void @llvm.memcpy.p1i32.p4i32.i32(i32 addrspace(1)* %out, i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 32, i32 4, i1 false)
   ret void
 }
 
 ; Can't just search the pointer value
 ; HSA: @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #2
-define void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 {
+define amdgpu_kernel void @store_value_constant_cast_lds_gv_gep_to_flat(i32 addrspace(4)* addrspace(1)* %out) #1 {
   store i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8), i32 addrspace(4)* addrspace(1)* %out
   ret void
 }
 
 ; Can't just search pointer types
 ; HSA: @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #2
-define void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @store_ptrtoint_value_constant_cast_lds_gv_gep_to_flat(i64 addrspace(1)* %out) #1 {
   store i64 ptrtoint (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i64), i64 addrspace(1)* %out
   ret void
 }
 
 ; Cast group to flat, do GEP, cast back to group
 ; HSA: @store_constant_cast_group_gv_gep_to_flat_to_group() #2
-define void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 {
+define amdgpu_kernel void @store_constant_cast_group_gv_gep_to_flat_to_group() #1 {
   store i32 7, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* getelementptr ([256 x i32], [256 x i32] addrspace(4)* addrspacecast ([256 x i32] addrspace(3)* @lds.arr to [256 x i32] addrspace(4)*), i64 0, i64 8) to i32 addrspace(3)*)
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/addrspacecast.ll b/test/CodeGen/AMDGPU/addrspacecast.ll
index 0a2130c96addc29f76de3dd968e66bdae9ae0244..6ec93c72ec527d4ad0f51570bd46cd8349031ce2 100644
--- a/test/CodeGen/AMDGPU/addrspacecast.ll
+++ b/test/CodeGen/AMDGPU/addrspacecast.ll
@@ -1,14 +1,23 @@
-; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=-promote-alloca -verify-machineinstrs < %s | FileCheck -check-prefix=HSA -check-prefix=GFX9 %s
 
 ; HSA-LABEL: {{^}}use_group_to_flat_addrspacecast:
 ; HSA: enable_sgpr_private_segment_buffer = 1
 ; HSA: enable_sgpr_dispatch_ptr = 0
-; HSA: enable_sgpr_queue_ptr = 1
+; CI: enable_sgpr_queue_ptr = 1
+; GFX9: enable_sgpr_queue_ptr = 0
 
-; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
-; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
+; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10{{$}}
+; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+
+; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
+; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(15, 16, 16)
+; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
+; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_SHARED_BASE]]
+
+; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_shared_base
 
-; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
 ; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 
 ; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
@@ -17,7 +26,13 @@
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
 
 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
-define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
+
+; NumSgprs is expected to have at most 2 digits, so make sure src_shared_base
+; is not counted as a high-numbered SGPR (the patterns below set no upper bound).
+
+; CI: NumSgprs: {{[0-9][0-9]+}}
+; GFX9: NumSgprs: {{[0-9]+}}
+define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
   %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
   store volatile i32 7, i32 addrspace(4)* %stof
   ret void
@@ -26,21 +41,32 @@ define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #0 {
 ; HSA-LABEL: {{^}}use_private_to_flat_addrspacecast:
 ; HSA: enable_sgpr_private_segment_buffer = 1
 ; HSA: enable_sgpr_dispatch_ptr = 0
-; HSA: enable_sgpr_queue_ptr = 1
+; CI: enable_sgpr_queue_ptr = 1
+; GFX9: enable_sgpr_queue_ptr = 0
 
-; HSA-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
-; HSA-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
+; CI-DAG: s_load_dword [[PTR:s[0-9]+]], s[6:7], 0x0{{$}}
+; CI-DAG: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11{{$}}
+; CI-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
+
+; GFX9-DAG: s_load_dword [[PTR:s[0-9]+]], s[4:5], 0x0{{$}}
+; GFX9-DAG: s_getreg_b32 [[SSRC_PRIVATE:s[0-9]+]], hwreg(15, 0, 16)
+; GFX9-DAG: s_lshl_b32 [[SSRC_PRIVATE_BASE:s[0-9]+]], [[SSRC_PRIVATE]], 16
+; GFX9-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[SSRC_PRIVATE_BASE]]
+
+; GFX9-XXX: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], src_private_base
 
-; HSA-DAG: v_mov_b32_e32 [[VAPERTURE:v[0-9]+]], [[APERTURE]]
 ; HSA-DAG: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 
-; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], -1
+; HSA-DAG: v_cmp_ne_u32_e64 vcc, [[PTR]], 0
 ; HSA-DAG: v_cndmask_b32_e32 v[[HI:[0-9]+]], 0, [[VAPERTURE]]
 ; HSA-DAG: v_cndmask_b32_e32 v[[LO:[0-9]+]], 0, [[VPTR]]
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
 
 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, [[K]]
-define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
+
+; CI: NumSgprs: {{[0-9][0-9]+}}
+; GFX9: NumSgprs: {{[0-9]+}}
+define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
   %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
   store volatile i32 7, i32 addrspace(4)* %stof
   ret void
@@ -55,7 +81,7 @@ define void @use_private_to_flat_addrspacecast(i32* %ptr) #0 {
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7
 ; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
-define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
   %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
   store volatile i32 7, i32 addrspace(4)* %stof
   ret void
@@ -67,7 +93,7 @@ define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #0 {
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRLO:[0-9]+]], s[[PTRLO]]
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
 ; HSA: flat_load_dword v{{[0-9]+}}, v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}
-define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
+define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
   %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
   %ld = load volatile i32, i32 addrspace(4)* %stof
   ret void
@@ -84,7 +110,7 @@ define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #0 {
 ; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
 ; HSA: ds_write_b32 [[CASTPTR]], v[[K]]
-define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
   store volatile i32 0, i32 addrspace(3)* %ftos
   ret void
@@ -98,10 +124,10 @@ define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #0 {
 ; HSA: s_load_dwordx2 s{{\[}}[[PTR_LO:[0-9]+]]:[[PTR_HI:[0-9]+]]{{\]}}
 ; HSA-DAG: v_cmp_ne_u64_e64 vcc, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[VPTR_LO:[0-9]+]], s[[PTR_LO]]
-; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], -1, v[[VPTR_LO]]
+; HSA-DAG: v_cndmask_b32_e32 [[CASTPTR:v[0-9]+]], 0, v[[VPTR_LO]]
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 0{{$}}
 ; HSA: buffer_store_dword v[[K]], [[CASTPTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
   store volatile i32 0, i32* %ftos
   ret void
@@ -115,7 +141,7 @@ define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #0 {
 ; HSA-DAG: v_mov_b32_e32 v[[VPTRHI:[0-9]+]], s[[PTRHI]]
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0
 ; HSA: flat_store_dword v{{\[}}[[VPTRLO]]:[[VPTRHI]]{{\]}}, [[K]]
-define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
   store volatile i32 0, i32 addrspace(1)* %ftos
   ret void
@@ -126,21 +152,27 @@ define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #0 {
 
 ; HSA: s_load_dwordx2 s{{\[}}[[PTRLO:[0-9]+]]:[[PTRHI:[0-9]+]]{{\]}}, s[4:5], 0x0
 ; HSA: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, 0x0
-define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #0 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
   load volatile i32, i32 addrspace(2)* %ftos
   ret void
 }
 
 ; HSA-LABEL: {{^}}cast_0_group_to_flat_addrspacecast:
-; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
-; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
+; CI: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x10
+; CI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
+; GFX9-DAG: s_getreg_b32 [[SSRC_SHARED:s[0-9]+]], hwreg(15, 16, 16)
+; GFX9-DAG: s_lshl_b32 [[SSRC_SHARED_BASE:s[0-9]+]], [[SSRC_SHARED]], 16
+; GFX9-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[SSRC_SHARED_BASE]]
+
+; GFX9-XXX: v_mov_b32_e32 v[[HI:[0-9]+]], src_shared_base
+
 ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
-define void @cast_0_group_to_flat_addrspacecast() #0 {
+define amdgpu_kernel void @cast_0_group_to_flat_addrspacecast() #0 {
   %cast = addrspacecast i32 addrspace(3)* null to i32 addrspace(4)*
-  store i32 7, i32 addrspace(4)* %cast
+  store volatile i32 7, i32 addrspace(4)* %cast
   ret void
 }
 
@@ -148,9 +180,9 @@ define void @cast_0_group_to_flat_addrspacecast() #0 {
 ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
 ; HSA: ds_write_b32 [[PTR]], [[K]]
-define void @cast_0_flat_to_group_addrspacecast() #0 {
+define amdgpu_kernel void @cast_0_flat_to_group_addrspacecast() #0 {
   %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(3)*
-  store i32 7, i32 addrspace(3)* %cast
+  store volatile i32 7, i32 addrspace(3)* %cast
   ret void
 }
 
@@ -159,9 +191,9 @@ define void @cast_0_flat_to_group_addrspacecast() #0 {
 ; HSA: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
 ; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
-define void @cast_neg1_group_to_flat_addrspacecast() #0 {
+define amdgpu_kernel void @cast_neg1_group_to_flat_addrspacecast() #0 {
   %cast = addrspacecast i32 addrspace(3)* inttoptr (i32 -1 to i32 addrspace(3)*) to i32 addrspace(4)*
-  store i32 7, i32 addrspace(4)* %cast
+  store volatile i32 7, i32 addrspace(4)* %cast
   ret void
 }
 
@@ -169,31 +201,34 @@ define void @cast_neg1_group_to_flat_addrspacecast() #0 {
 ; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
 ; HSA: ds_write_b32 [[PTR]], [[K]]
-define void @cast_neg1_flat_to_group_addrspacecast() #0 {
+define amdgpu_kernel void @cast_neg1_flat_to_group_addrspacecast() #0 {
   %cast = addrspacecast i32 addrspace(4)* inttoptr (i64 -1 to i32 addrspace(4)*) to i32 addrspace(3)*
-  store i32 7, i32 addrspace(3)* %cast
+  store volatile i32 7, i32 addrspace(3)* %cast
   ret void
 }
 
+; FIXME: Shouldn't need to enable queue ptr
 ; HSA-LABEL: {{^}}cast_0_private_to_flat_addrspacecast:
-; HSA: s_load_dword [[APERTURE:s[0-9]+]], s[4:5], 0x11
-; HSA-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], [[APERTURE]]
+; CI: enable_sgpr_queue_ptr = 1
+; GFX9: enable_sgpr_queue_ptr = 0
+
 ; HSA-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 v[[K:[0-9]+]], 7{{$}}
+; HSA: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; HSA: flat_store_dword v{{\[}}[[LO]]:[[HI]]{{\]}}, v[[K]]
-define void @cast_0_private_to_flat_addrspacecast() #0 {
+define amdgpu_kernel void @cast_0_private_to_flat_addrspacecast() #0 {
   %cast = addrspacecast i32* null to i32 addrspace(4)*
-  store i32 7, i32 addrspace(4)* %cast
+  store volatile i32 7, i32 addrspace(4)* %cast
   ret void
 }
 
 ; HSA-LABEL: {{^}}cast_0_flat_to_private_addrspacecast:
-; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], -1{{$}}
+; HSA-DAG: v_mov_b32_e32 [[PTR:v[0-9]+]], 0{{$}}
 ; HSA-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 7{{$}}
 ; HSA: buffer_store_dword [[K]], [[PTR]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen
-define void @cast_0_flat_to_private_addrspacecast() #0 {
+define amdgpu_kernel void @cast_0_flat_to_private_addrspacecast() #0 {
   %cast = addrspacecast i32 addrspace(4)* null to i32 addrspace(0)*
-  store i32 7, i32* %cast
+  store volatile i32 7, i32* %cast
   ret void
 }
 
@@ -203,7 +238,7 @@ define void @cast_0_flat_to_private_addrspacecast() #0 {
 ; HSA-LABEL: {{^}}branch_use_flat_i32:
 ; HSA: flat_store_dword {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}
 ; HSA: s_endpgm
-define void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
+define amdgpu_kernel void @branch_use_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %gptr, i32 addrspace(3)* %lptr, i32 %x, i32 %c) #0 {
 entry:
   %cmp = icmp ne i32 %c, 0
   br i1 %cmp, label %local, label %global
@@ -218,7 +253,7 @@ global:
 
 end:
   %fptr = phi i32 addrspace(4)* [ %flat_local, %local ], [ %flat_global, %global ]
-  store i32 %x, i32 addrspace(4)* %fptr, align 4
+  store volatile i32 %x, i32 addrspace(4)* %fptr, align 4
 ;  %val = load i32, i32 addrspace(4)* %fptr, align 4
 ;  store i32 %val, i32 addrspace(1)* %out, align 4
   ret void
@@ -226,22 +261,26 @@ end:
 
 ; Check for prologue initializing special SGPRs pointing to scratch.
 ; HSA-LABEL: {{^}}store_flat_scratch:
-; HSA-DAG: s_mov_b32 flat_scratch_lo, s9
-; HSA-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11
-; HSA: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
+; CI-DAG: s_mov_b32 flat_scratch_lo, s9
+; CI-DAG: s_add_u32 [[ADD:s[0-9]+]], s8, s11
+; CI: s_lshr_b32 flat_scratch_hi, [[ADD]], 8
+
+; GFX9: s_add_u32 flat_scratch_lo, s6, s9
+; GFX9: s_addc_u32 flat_scratch_hi, s7, 0
+
 ; HSA: flat_store_dword
 ; HSA: s_barrier
 ; HSA: flat_load_dword
-define void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
+define amdgpu_kernel void @store_flat_scratch(i32 addrspace(1)* noalias %out, i32) #0 {
   %alloca = alloca i32, i32 9, align 4
   %x = call i32 @llvm.amdgcn.workitem.id.x() #2
   %pptr = getelementptr i32, i32* %alloca, i32 %x
   %fptr = addrspacecast i32* %pptr to i32 addrspace(4)*
-  store i32 %x, i32 addrspace(4)* %fptr
+  store volatile i32 %x, i32 addrspace(4)* %fptr
   ; Dummy call
   call void @llvm.amdgcn.s.barrier() #1
-  %reload = load i32, i32 addrspace(4)* %fptr, align 4
-  store i32 %reload, i32 addrspace(1)* %out, align 4
+  %reload = load volatile i32, i32 addrspace(4)* %fptr, align 4
+  store volatile i32 %reload, i32 addrspace(1)* %out, align 4
   ret void
 }
 
diff --git a/test/CodeGen/AMDGPU/amdgcn.bitcast.ll b/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
index 87ef5978ebfcb976fa67cbab71f5f6e423c1f79c..ef742f56faec36c6d7ed2d9a75ec443144b200c0 100644
--- a/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
+++ b/test/CodeGen/AMDGPU/amdgcn.bitcast.ll
@@ -3,24 +3,20 @@
 
 ; This test just checks that the compiler doesn't crash.
 
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
 ; FUNC-LABEL: {{^}}v32i8_to_v8i32:
-; SI: s_endpgm
-define amdgpu_ps void @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 {
+define amdgpu_ps float @v32i8_to_v8i32(<32 x i8> addrspace(2)* inreg) #0 {
 entry:
   %1 = load <32 x i8>, <32 x i8> addrspace(2)* %0
   %2 = bitcast <32 x i8> %1 to <8 x i32>
   %3 = extractelement <8 x i32> %2, i32 1
   %4 = icmp ne i32 %3, 0
   %5 = select i1 %4, float 0.0, float 1.0
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %5, float %5, float %5)
-  ret void
+  ret float %5
 }
 
 ; FUNC-LABEL: {{^}}i8ptr_v16i8ptr:
 ; SI: s_endpgm
-define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)*
   %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0
@@ -28,28 +24,50 @@ entry:
   ret void
 }
 
-define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %load = load float, float addrspace(1)* %in, align 4
-  %bc = bitcast float %load to <2 x i16>
-  store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4
+  %fadd32 = fadd float %load, 1.0
+  %bc = bitcast float %fadd32 to <2 x i16>
+  %add.bitcast = add <2 x i16> %bc, <i16 2, i16 2>
+  store <2 x i16> %add.bitcast, <2 x i16> addrspace(1)* %out
   ret void
 }
 
-define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
-  %bc = bitcast <2 x i16> %load to float
-  store float %bc, float addrspace(1)* %out, align 4
+  %add.v2i16 = add <2 x i16> %load, <i16 2, i16 2>
+  %bc = bitcast <2 x i16> %add.v2i16 to float
+  %fadd.bitcast = fadd float %bc, 1.0
+  store float %fadd.bitcast, float addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @f32_to_v2f16(<2 x half> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+  %load = load float, float addrspace(1)* %in, align 4
+  %fadd32 = fadd float %load, 1.0
+  %bc = bitcast float %fadd32 to <2 x half>
+  %add.bitcast = fadd <2 x half> %bc, <half 2.0, half 2.0>
+  store <2 x half> %add.bitcast, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_kernel void @v2f16_to_f32(float addrspace(1)* %out, <2 x half> addrspace(1)* %in) nounwind {
+  %load = load <2 x half>, <2 x half> addrspace(1)* %in, align 4
+  %add.v2f16 = fadd <2 x half> %load, <half 2.0, half 2.0>
+  %bc = bitcast <2 x half> %add.v2f16 to float
+  %fadd.bitcast = fadd float %bc, 1.0
+  store float %fadd.bitcast, float addrspace(1)* %out
   ret void
 }
 
-define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   %bc = bitcast <4 x i8> %load to i32
   store i32 %bc, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %bc = bitcast i32 %load to <4 x i8>
   store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4
@@ -58,17 +76,18 @@ define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nou
 
 ; FUNC-LABEL: {{^}}bitcast_v2i32_to_f64:
 ; SI: s_endpgm
-define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
   %add = add <2 x i32> %val, <i32 4, i32 9>
   %bc = bitcast <2 x i32> %add to double
-  store double %bc, double addrspace(1)* %out, align 8
+  %fadd.bc = fadd double %bc, 1.0
+  store double %fadd.bc, double addrspace(1)* %out, align 8
   ret void
 }
 
 ; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32:
 ; SI: s_endpgm
-define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
   %val = load double, double addrspace(1)* %in, align 8
   %add = fadd double %val, 4.0
   %bc = bitcast double %add to <2 x i32>
@@ -77,7 +96,7 @@ define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace
 }
 
 ; FUNC-LABEL: {{^}}bitcast_v2i64_to_v2f64:
-define void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
+define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %end
@@ -93,7 +112,7 @@ end:
 }
 
 ; FUNC-LABEL: {{^}}bitcast_v2f64_to_v2i64:
-define void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
+define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %end
diff --git a/test/CodeGen/AMDGPU/amdgcn.private-memory.ll b/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
index a6d055891d4b6a9b2debfe462df7a84ba60f16e5..79450b97c218ae0b04ce005d1b094f6deda4469e 100644
--- a/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
+++ b/test/CodeGen/AMDGPU/amdgcn.private-memory.ll
@@ -15,7 +15,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 ; GCN-NOT: v0
 ; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, v0, v{{[0-9]+}}
 ; GCN: buffer_store_dword [[RESULT]]
-define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = alloca [2 x i32]
   %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0
diff --git a/test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll b/test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll
deleted file mode 100644
index 8d8885852afeeec80ba352dfed582a916fce317d..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/amdgcn.sendmsg-m0.ll
+++ /dev/null
@@ -1,41 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
-
-; GCN-LABEL: {{^}}main:
-; GCN: s_mov_b32 m0, s0
-; VI-NEXT: s_nop 0
-; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP)
-; GCN-NEXT: s_endpgm
-
-define amdgpu_gs void @main(i32 inreg %a) #0 {
-  call void @llvm.amdgcn.s.sendmsg(i32 3, i32 %a)
-  ret void
-}
-
-; GCN-LABEL: {{^}}main_halt:
-; GCN: s_mov_b32 m0, s0
-; VI-NEXT: s_nop 0
-; GCN-NEXT: s_sendmsghalt sendmsg(MSG_INTERRUPT)
-; GCN-NEXT: s_endpgm
-
-define  void @main_halt(i32 inreg %a) #0 {
-  call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 %a)
-  ret void
-}
-
-; GCN-LABEL: {{^}}legacy:
-; GCN: s_mov_b32 m0, s0
-; VI-NEXT: s_nop 0
-; GCN-NEXT: sendmsg(MSG_GS_DONE, GS_OP_NOP)
-; GCN-NEXT: s_endpgm
-
-define amdgpu_gs void @legacy(i32 inreg %a) #0 {
-  call void @llvm.SI.sendmsg(i32 3, i32 %a)
-  ret void
-}
-
-declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0
-declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0
-declare void @llvm.SI.sendmsg(i32, i32) #0
-
-attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/amdgcn.sendmsg.ll b/test/CodeGen/AMDGPU/amdgcn.sendmsg.ll
deleted file mode 100644
index 31f9cfca6def137508733ee397eba616ee0af968..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/amdgcn.sendmsg.ll
+++ /dev/null
@@ -1,161 +0,0 @@
-;RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
-;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-
-; CHECK-LABEL: {{^}}test_interrupt:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_INTERRUPT)
-define void @test_interrupt() {
-body:
-  call void @llvm.amdgcn.s.sendmsg(i32 1, i32 0);
-  ret void
-}
-
-; CHECK-LABEL: {{^}}test_gs_emit:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0)
-define void @test_gs_emit() {
-body:
-  call void @llvm.amdgcn.s.sendmsg(i32 34, i32 0);
-  ret void
-}
-
-; CHECK-LABEL: {{^}}test_gs_cut:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1)
-define void @test_gs_cut() {
-body:
-  call void @llvm.amdgcn.s.sendmsg(i32 274, i32 0);
-  ret void
-}
-
-; CHECK-LABEL: {{^}}test_gs_emit_cut:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
-define void @test_gs_emit_cut() {
-body:
-  call void @llvm.amdgcn.s.sendmsg(i32 562, i32 0)
-  ret void
-}
-
-; CHECK-LABEL: {{^}}test_gs_done:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
-define void @test_gs_done() {
-body:
-  call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
-  ret void
-}
-
-
-; CHECK-LABEL: {{^}}test_interrupt_halt:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsghalt sendmsg(MSG_INTERRUPT)
-define void @test_interrupt_halt() {
-body:
-  call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 0)
-  ret void
-}
-
-; CHECK-LABEL: {{^}}test_gs_emit_halt:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT, 0)
-define void @test_gs_emit_halt() {
-body:
-  call void @llvm.amdgcn.s.sendmsghalt(i32 34, i32 0)
-  ret void
-}
-
-; CHECK-LABEL: {{^}}test_gs_cut_halt:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_CUT, 1)
-define void @test_gs_cut_halt() {
-body:
-  call void @llvm.amdgcn.s.sendmsghalt(i32 274, i32 0)
-  ret void
-}
-
-; CHECK-LABEL: {{^}}test_gs_emit_cut_halt:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
-define void @test_gs_emit_cut_halt() {
-body:
-  call void @llvm.amdgcn.s.sendmsghalt(i32 562, i32 0)
-  ret void
-}
-
-; CHECK-LABEL: {{^}}test_gs_done_halt:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsghalt sendmsg(MSG_GS_DONE, GS_OP_NOP)
-define void @test_gs_done_halt() {
-body:
-  call void @llvm.amdgcn.s.sendmsghalt(i32 3, i32 0)
-  ret void
-}
-
-; Legacy
-; CHECK-LABEL: {{^}}test_legacy_interrupt:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_INTERRUPT)
-define void @test_legacy_interrupt() {
-body:
-  call void @llvm.SI.sendmsg(i32 1, i32 0)
-  ret void
-}
-
-; CHECK-LABEL: {{^}}test_legacy_gs_emit:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0)
-define void @test_legacy_gs_emit() {
-body:
-  call void @llvm.SI.sendmsg(i32 34, i32 0)
-  ret void
-}
-
-; CHECK-LABEL: {{^}}test_legacy_gs_cut:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1)
-define void @test_legacy_gs_cut() {
-body:
-  call void @llvm.SI.sendmsg(i32 274, i32 0)
-  ret void
-}
-
-; CHECK-LABEL: {{^}}test_legacy_gs_emit_cut:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
-define void @test_legacy_gs_emit_cut() {
-body:
-  call void @llvm.SI.sendmsg(i32 562, i32 0)
-  ret void
-}
-
-; CHECK-LABEL: {{^}}test_legacy_gs_done:
-; CHECK: s_mov_b32 m0, 0
-; CHECK-NOT: s_mov_b32 m0
-; CHECK: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
-define void @test_legacy_gs_done() {
-body:
-  call void @llvm.SI.sendmsg(i32 3, i32 0)
-  ret void
-}
-
-; Function Attrs: nounwind
-declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0
-declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0
-declare void @llvm.SI.sendmsg(i32, i32) #0
-
-attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll b/test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e68ed9cac93f790cf0926471b1666b4fb58ce829
--- /dev/null
+++ b/test/CodeGen/AMDGPU/amdgpu-alias-analysis.ll
@@ -0,0 +1,9 @@
+; RUN: opt -mtriple=amdgcn-- -O3 -aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s
+; RUN: opt -mtriple=r600-- -O3 -aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s
+
+; CHECK: NoAlias:      i8 addrspace(1)* %p1, i8* %p
+
+define void @test(i8* %p, i8 addrspace(1)* %p1) {
+  ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
index d78c75165be246444b28f4abf3ccf495f7d2b717..0e5605961e10cd4fb7ad308b5dee3d7c4577e6f0 100644
--- a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
+++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-fdiv.ll
@@ -4,7 +4,7 @@
 
 ; NOOP-LABEL: @noop_fdiv_fpmath(
 ; NOOP: %md.25ulp = fdiv float %a, %b, !fpmath !0
-define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
+define amdgpu_kernel void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
   %md.25ulp = fdiv float %a, %b, !fpmath !0
   store volatile float %md.25ulp, float addrspace(1)* %out
   ret void
@@ -18,7 +18,7 @@ define void @noop_fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #3 {
 ; CHECK: %md.3ulp = call float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !3
 ; CHECK: %fast.md.25ulp = call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
 ; CHECK: arcp.md.25ulp = call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
-define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
+define amdgpu_kernel void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
   %no.md = fdiv float %a, %b
   store volatile float %no.md, float addrspace(1)* %out
 
@@ -51,7 +51,7 @@ define void @fdiv_fpmath(float addrspace(1)* %out, float %a, float %b) #1 {
 ; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0
 ; CHECK: %fast.no.md = fdiv fast float 1.000000e+00, %x{{$}}
 ; CHECK: %fast.25ulp = fdiv fast float 1.000000e+00, %x, !fpmath !0
-define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
   %no.md = fdiv float 1.0, %x
   store volatile float %no.md, float addrspace(1)* %out
 
@@ -89,7 +89,7 @@ define void @rcp_fdiv_fpmath(float addrspace(1)* %out, float %x) #1 {
 ; CHECK: %[[B1:[0-9]+]] = extractelement <2 x float> %b, i64 1
 ; CHECK: %[[FDIV1:[0-9]+]] = call float @llvm.amdgcn.fdiv.fast(float %[[A1]], float %[[B1]]), !fpmath !0
 ; CHECK: %md.25ulp = insertelement <2 x float> %[[INS0]], float %[[FDIV1]], i64 1
-define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
+define amdgpu_kernel void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #1 {
   %no.md = fdiv <2 x float> %a, %b
   store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
 
@@ -120,7 +120,7 @@ define void @fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %a,
 ; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
 ; CHECK: fdiv fast float 1.000000e+00, %{{[0-9]+}}, !fpmath !0
 ; CHECK: store volatile <2 x float> %fast.25ulp, <2 x float> addrspace(1)* %out
-define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
+define amdgpu_kernel void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
   %no.md = fdiv <2 x float> <float 1.0, float 1.0>, %x
   store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
 
@@ -158,7 +158,7 @@ define void @rcp_fdiv_fpmath_vector(<2 x float> addrspace(1)* %out, <2 x float>
 ; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1
 ; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0
 ; CHECK: store volatile <2 x float> %fast.25ulp
-define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
+define amdgpu_kernel void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 {
   %no.md = fdiv <2 x float> <float 1.0, float 2.0>, %x
   store volatile <2 x float> %no.md, <2 x float> addrspace(1)* %out
 
@@ -186,7 +186,7 @@ define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2
 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %{{[0-9]+}}, float %{{[0-9]+}}), !fpmath !0
 ; CHECK: store volatile <2 x float> %fast.25ulp
-define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
+define amdgpu_kernel void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %out, <2 x float> %x, <2 x float> %y) #1 {
   %x.insert = insertelement <2 x float> %x, float 1.0, i32 0
 
   %arcp.25ulp = fdiv arcp <2 x float> %x.insert, %y, !fpmath !0
@@ -206,7 +206,7 @@ define void @rcp_fdiv_fpmath_vector_partial_constant(<2 x float> addrspace(1)* %
 ; CHECK: %md.3ulp = fdiv float %a, %b, !fpmath !3
 ; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
 ; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float %a, float %b), !fpmath !0
-define void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
+define amdgpu_kernel void @fdiv_fpmath_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
   %no.md = fdiv float %a, %b
   store volatile float %no.md, float addrspace(1)* %out
 
diff --git a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
index 13e4192ccd72d505bf2249a6454e8be57d417327..95a206e1dd00d3910c42f8faad0993e2002677a3 100644
--- a/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
+++ b/test/CodeGen/AMDGPU/amdgpu-codegenprepare-i16-to-i32.ll
@@ -6,7 +6,7 @@
 ; SI-NEXT: ret i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = add i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
 ; VI-NEXT: ret i3 %[[R_3]]
 define i3 @add_i3(i3 %a, i3 %b) {
@@ -19,7 +19,7 @@ define i3 @add_i3(i3 %a, i3 %b) {
 ; SI-NEXT: ret i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = add nsw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
 ; VI-NEXT: ret i3 %[[R_3]]
 define i3 @add_nsw_i3(i3 %a, i3 %b) {
@@ -32,7 +32,7 @@ define i3 @add_nsw_i3(i3 %a, i3 %b) {
 ; SI-NEXT: ret i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = add nuw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
 ; VI-NEXT: ret i3 %[[R_3]]
 define i3 @add_nuw_i3(i3 %a, i3 %b) {
@@ -58,7 +58,7 @@ define i3 @add_nuw_nsw_i3(i3 %a, i3 %b) {
 ; SI-NEXT: ret i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = sub i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
 ; VI-NEXT: ret i3 %[[R_3]]
 define i3 @sub_i3(i3 %a, i3 %b) {
@@ -84,7 +84,7 @@ define i3 @sub_nsw_i3(i3 %a, i3 %b) {
 ; SI-NEXT: ret i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
 ; VI-NEXT: ret i3 %[[R_3]]
 define i3 @sub_nuw_i3(i3 %a, i3 %b) {
@@ -110,7 +110,7 @@ define i3 @sub_nuw_nsw_i3(i3 %a, i3 %b) {
 ; SI-NEXT: ret i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = mul i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
 ; VI-NEXT: ret i3 %[[R_3]]
 define i3 @mul_i3(i3 %a, i3 %b) {
@@ -123,7 +123,7 @@ define i3 @mul_i3(i3 %a, i3 %b) {
 ; SI-NEXT: ret i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = mul nsw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
 ; VI-NEXT: ret i3 %[[R_3]]
 define i3 @mul_nsw_i3(i3 %a, i3 %b) {
@@ -136,7 +136,7 @@ define i3 @mul_nsw_i3(i3 %a, i3 %b) {
 ; SI-NEXT: ret i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
 ; VI-NEXT: ret i3 %[[R_3]]
 define i3 @mul_nuw_i3(i3 %a, i3 %b) {
@@ -188,7 +188,7 @@ define i3 @srem_i3(i3 %a, i3 %b) {
 ; SI-NEXT: ret i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = shl i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
 ; VI-NEXT: ret i3 %[[R_3]]
 define i3 @shl_i3(i3 %a, i3 %b) {
@@ -201,7 +201,7 @@ define i3 @shl_i3(i3 %a, i3 %b) {
 ; SI-NEXT: ret i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = shl nsw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
 ; VI-NEXT: ret i3 %[[R_3]]
 define i3 @shl_nsw_i3(i3 %a, i3 %b) {
@@ -214,7 +214,7 @@ define i3 @shl_nsw_i3(i3 %a, i3 %b) {
 ; SI-NEXT: ret i3 %r
 ; VI: %[[A_32:[0-9]+]] = zext i3 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i3 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_3:[0-9]+]] = trunc i32 %[[R_32]] to i3
 ; VI-NEXT: ret i3 %[[R_3]]
 define i3 @shl_nuw_i3(i3 %a, i3 %b) {
@@ -525,7 +525,7 @@ define i3 @bitreverse_i3(i3 %a) {
 ; SI-NEXT: ret i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = add i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
 ; VI-NEXT: ret i16 %[[R_16]]
 define i16 @add_i16(i16 %a, i16 %b) {
@@ -559,7 +559,7 @@ define i16 @constant_add_nuw_i16() {
 ; SI-NEXT: ret i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = add nsw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
 ; VI-NEXT: ret i16 %[[R_16]]
 define i16 @add_nsw_i16(i16 %a, i16 %b) {
@@ -572,7 +572,7 @@ define i16 @add_nsw_i16(i16 %a, i16 %b) {
 ; SI-NEXT: ret i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = add nuw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
 ; VI-NEXT: ret i16 %[[R_16]]
 define i16 @add_nuw_i16(i16 %a, i16 %b) {
@@ -598,7 +598,7 @@ define i16 @add_nuw_nsw_i16(i16 %a, i16 %b) {
 ; SI-NEXT: ret i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = sub i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
 ; VI-NEXT: ret i16 %[[R_16]]
 define i16 @sub_i16(i16 %a, i16 %b) {
@@ -624,7 +624,7 @@ define i16 @sub_nsw_i16(i16 %a, i16 %b) {
 ; SI-NEXT: ret i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
 ; VI-NEXT: ret i16 %[[R_16]]
 define i16 @sub_nuw_i16(i16 %a, i16 %b) {
@@ -650,7 +650,7 @@ define i16 @sub_nuw_nsw_i16(i16 %a, i16 %b) {
 ; SI-NEXT: ret i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = mul i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
 ; VI-NEXT: ret i16 %[[R_16]]
 define i16 @mul_i16(i16 %a, i16 %b) {
@@ -663,7 +663,7 @@ define i16 @mul_i16(i16 %a, i16 %b) {
 ; SI-NEXT: ret i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = mul nsw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
 ; VI-NEXT: ret i16 %[[R_16]]
 define i16 @mul_nsw_i16(i16 %a, i16 %b) {
@@ -676,7 +676,7 @@ define i16 @mul_nsw_i16(i16 %a, i16 %b) {
 ; SI-NEXT: ret i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
 ; VI-NEXT: ret i16 %[[R_16]]
 define i16 @mul_nuw_i16(i16 %a, i16 %b) {
@@ -728,7 +728,7 @@ define i16 @srem_i16(i16 %a, i16 %b) {
 ; SI-NEXT: ret i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = shl i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
 ; VI-NEXT: ret i16 %[[R_16]]
 define i16 @shl_i16(i16 %a, i16 %b) {
@@ -741,7 +741,7 @@ define i16 @shl_i16(i16 %a, i16 %b) {
 ; SI-NEXT: ret i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = shl nsw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
 ; VI-NEXT: ret i16 %[[R_16]]
 define i16 @shl_nsw_i16(i16 %a, i16 %b) {
@@ -754,7 +754,7 @@ define i16 @shl_nsw_i16(i16 %a, i16 %b) {
 ; SI-NEXT: ret i16 %r
 ; VI: %[[A_32:[0-9]+]] = zext i16 %a to i32
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext i16 %b to i32
-; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw i32 %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw i32 %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc i32 %[[R_32]] to i16
 ; VI-NEXT: ret i16 %[[R_16]]
 define i16 @shl_nuw_i16(i16 %a, i16 %b) {
@@ -1072,7 +1072,7 @@ define i16 @bitreverse_i16(i16 %a) {
 ; SI-NEXT: ret <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = add <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
 ; VI-NEXT: ret <3 x i15> %[[R_15]]
 define <3 x i15> @add_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1085,7 +1085,7 @@ define <3 x i15> @add_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-NEXT: ret <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = add nsw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
 ; VI-NEXT: ret <3 x i15> %[[R_15]]
 define <3 x i15> @add_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1098,7 +1098,7 @@ define <3 x i15> @add_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-NEXT: ret <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = add nuw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
 ; VI-NEXT: ret <3 x i15> %[[R_15]]
 define <3 x i15> @add_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1124,7 +1124,7 @@ define <3 x i15> @add_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-NEXT: ret <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = sub <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
 ; VI-NEXT: ret <3 x i15> %[[R_15]]
 define <3 x i15> @sub_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1150,7 +1150,7 @@ define <3 x i15> @sub_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-NEXT: ret <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
 ; VI-NEXT: ret <3 x i15> %[[R_15]]
 define <3 x i15> @sub_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1176,7 +1176,7 @@ define <3 x i15> @sub_nuw_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-NEXT: ret <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = mul <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
 ; VI-NEXT: ret <3 x i15> %[[R_15]]
 define <3 x i15> @mul_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1189,7 +1189,7 @@ define <3 x i15> @mul_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-NEXT: ret <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = mul nsw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
 ; VI-NEXT: ret <3 x i15> %[[R_15]]
 define <3 x i15> @mul_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1202,7 +1202,7 @@ define <3 x i15> @mul_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-NEXT: ret <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
 ; VI-NEXT: ret <3 x i15> %[[R_15]]
 define <3 x i15> @mul_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1254,7 +1254,7 @@ define <3 x i15> @srem_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-NEXT: ret <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = shl <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
 ; VI-NEXT: ret <3 x i15> %[[R_15]]
 define <3 x i15> @shl_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1267,7 +1267,7 @@ define <3 x i15> @shl_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-NEXT: ret <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = shl nsw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
 ; VI-NEXT: ret <3 x i15> %[[R_15]]
 define <3 x i15> @shl_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1280,7 +1280,7 @@ define <3 x i15> @shl_nsw_3xi15(<3 x i15> %a, <3 x i15> %b) {
 ; SI-NEXT: ret <3 x i15> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i15> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i15> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_15:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i15>
 ; VI-NEXT: ret <3 x i15> %[[R_15]]
 define <3 x i15> @shl_nuw_3xi15(<3 x i15> %a, <3 x i15> %b) {
@@ -1591,7 +1591,7 @@ define <3 x i15> @bitreverse_3xi15(<3 x i15> %a) {
 ; SI-NEXT: ret <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = add <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
 ; VI-NEXT: ret <3 x i16> %[[R_16]]
 define <3 x i16> @add_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1604,7 +1604,7 @@ define <3 x i16> @add_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-NEXT: ret <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = add nsw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
 ; VI-NEXT: ret <3 x i16> %[[R_16]]
 define <3 x i16> @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1617,7 +1617,7 @@ define <3 x i16> @add_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-NEXT: ret <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = add nuw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = add nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
 ; VI-NEXT: ret <3 x i16> %[[R_16]]
 define <3 x i16> @add_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1643,7 +1643,7 @@ define <3 x i16> @add_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-NEXT: ret <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = sub <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = sub nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
 ; VI-NEXT: ret <3 x i16> %[[R_16]]
 define <3 x i16> @sub_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1669,7 +1669,7 @@ define <3 x i16> @sub_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-NEXT: ret <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = sub nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
 ; VI-NEXT: ret <3 x i16> %[[R_16]]
 define <3 x i16> @sub_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1695,7 +1695,7 @@ define <3 x i16> @sub_nuw_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-NEXT: ret <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = mul <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
 ; VI-NEXT: ret <3 x i16> %[[R_16]]
 define <3 x i16> @mul_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1708,7 +1708,7 @@ define <3 x i16> @mul_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-NEXT: ret <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = mul nsw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
 ; VI-NEXT: ret <3 x i16> %[[R_16]]
 define <3 x i16> @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1721,7 +1721,7 @@ define <3 x i16> @mul_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-NEXT: ret <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = mul nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
 ; VI-NEXT: ret <3 x i16> %[[R_16]]
 define <3 x i16> @mul_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1773,7 +1773,7 @@ define <3 x i16> @srem_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-NEXT: ret <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = shl <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
 ; VI-NEXT: ret <3 x i16> %[[R_16]]
 define <3 x i16> @shl_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1786,7 +1786,7 @@ define <3 x i16> @shl_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-NEXT: ret <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = shl nsw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
 ; VI-NEXT: ret <3 x i16> %[[R_16]]
 define <3 x i16> @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
@@ -1799,7 +1799,7 @@ define <3 x i16> @shl_nsw_3xi16(<3 x i16> %a, <3 x i16> %b) {
 ; SI-NEXT: ret <3 x i16> %r
 ; VI: %[[A_32:[0-9]+]] = zext <3 x i16> %a to <3 x i32>
 ; VI-NEXT: %[[B_32:[0-9]+]] = zext <3 x i16> %b to <3 x i32>
-; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw <3 x i32> %[[A_32]], %[[B_32]]
+; VI-NEXT: %[[R_32:[0-9]+]] = shl nuw nsw <3 x i32> %[[A_32]], %[[B_32]]
 ; VI-NEXT: %[[R_16:[0-9]+]] = trunc <3 x i32> %[[R_32]] to <3 x i16>
 ; VI-NEXT: ret <3 x i16> %[[R_16]]
 define <3 x i16> @shl_nuw_3xi16(<3 x i16> %a, <3 x i16> %b) {
diff --git a/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll b/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
index dd16907b748c63605f3c4add3b58e9a0d08ad1a3..0ba8836b20dca746160a0e91668667f2e66e21ca 100644
--- a/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
+++ b/test/CodeGen/AMDGPU/amdgpu-shader-calling-convention.ll
@@ -13,9 +13,10 @@ define amdgpu_cs float @shader_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w
 
 ; GCN-LABEL: {{^}}kernel_cc:
 ; GCN: s_endpgm
-define float @kernel_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) {
+define amdgpu_kernel void @kernel_cc(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) {
   %vi = bitcast float %v to i32
   %x = add i32 %vi, %w
   %xf = bitcast i32 %x to float
-  ret float %xf
+  store float %xf, float addrspace(1)* undef
+  ret void
 }
diff --git a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
index ef2ca647cc30c8cc4fde0164a18f93e00aa53458..97cb9067f29a977c8717b1901e69f2e7d8bbddff 100644
--- a/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
+++ b/test/CodeGen/AMDGPU/amdgpu.private-memory.ll
@@ -1,9 +1,9 @@
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA
-; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
-; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-PROMOTE
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -march=amdgcn < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -mcpu=kaveri -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC -check-prefix=HSA-ALLOCA
+; RUN: llc -show-mc-encoding -mattr=+promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-PROMOTE -check-prefix=SI -check-prefix=FUNC
+; RUN: llc -show-mc-encoding -mattr=-promote-alloca -amdgpu-load-store-vectorizer=0 -enable-amdgpu-aa=0 -verify-machineinstrs -mtriple=amdgcn-amdhsa -march=amdgcn -mcpu=tonga -mattr=-unaligned-buffer-access < %s | FileCheck %s -check-prefix=SI-ALLOCA -check-prefix=SI -check-prefix=FUNC
 
 ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=HSAOPT -check-prefix=OPT %s
 ; RUN: opt -S -mtriple=amdgcn-unknown-unknown -mcpu=kaveri -amdgpu-promote-alloca < %s | FileCheck -check-prefix=NOHSAOPT -check-prefix=OPT %s
@@ -80,7 +80,7 @@
 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.x(), !range !0
 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.y(), !range !0
 ; NOHSAOPT: call i32 @llvm.amdgcn.workitem.id.z(), !range !0
-define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -102,7 +102,7 @@ entry:
 
 ; OPT-LABEL: @high_alignment(
 ; OPT: getelementptr inbounds [256 x [8 x i32]], [256 x [8 x i32]] addrspace(3)* @high_alignment.stack, i32 0, i32 %{{[0-9]+}}
-define void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @high_alignment(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [8 x i32], align 16
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -127,7 +127,7 @@ entry:
 ; OPT: alloca [5 x i32]
 
 ; SI-NOT: ds_write
-define void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @no_replace_inbounds_gep(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -162,7 +162,7 @@ entry:
 ; SI-NOT: v_movrel
 %struct.point = type { i32, i32 }
 
-define void @multiple_structs(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 {
 entry:
   %a = alloca %struct.point
   %b = alloca %struct.point
@@ -191,7 +191,7 @@ entry:
 ; R600-NOT: MOVA_INT
 ; SI-NOT: v_movrel
 
-define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 entry:
   %prv_array_const = alloca [2 x i32]
   %prv_array = alloca [2 x i32]
@@ -227,15 +227,15 @@ for.end:
 
 ; R600: MOVA_INT
 
-; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} ; encoding: [0x00,0x00,0x68,0xe0
-; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:2 ; encoding: [0x02,0x00,0x68,0xe0
+; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:6 ; encoding: [0x06,0x00,0x68,0xe0
+; SI-ALLOCA-DAG: buffer_store_short v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x68,0xe0
 ; Loaded value is 0 or 1, so sext will become zext, so we get buffer_load_ushort instead of buffer_load_sshort.
 ; SI-ALLOCA: buffer_load_sshort v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}}
 
 ; SI-PROMOTE: s_load_dword [[IDX:s[0-9]+]]
 ; SI-PROMOTE: s_lshl_b32 [[SCALED_IDX:s[0-9]+]], [[IDX]], 16
 ; SI-PROMOTE: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[SCALED_IDX]], 16
-define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i16]
   %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
@@ -253,12 +253,12 @@ entry:
 
 ; R600: MOVA_INT
 
-; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} ; encoding:
-; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:1 ; encoding:
+; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding:
+; SI-PROMOTE-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding:
 
-; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} ; encoding: [0x00,0x00,0x60,0xe0
-; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:1 ; encoding: [0x01,0x00,0x60,0xe0
-define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
+; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ; encoding: [0x04,0x00,0x60,0xe0
+; SI-ALLOCA-DAG: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:5 ; encoding: [0x05,0x00,0x60,0xe0
+define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i8]
   %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
@@ -281,7 +281,7 @@ entry:
 ;
 ; A total of 5 bytes should be allocated and used.
 ; SI: buffer_store_byte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4 ;
-define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %0 = alloca [3 x i8], align 1
   %1 = alloca [2 x i8], align 1
@@ -305,7 +305,7 @@ entry:
   ret void
 }
 
-define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i8]]
   %gep0 = getelementptr [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
@@ -319,7 +319,7 @@ entry:
   ret void
 }
 
-define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i32]]
   %gep0 = getelementptr [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
@@ -332,7 +332,7 @@ entry:
   ret void
 }
 
-define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i64]]
   %gep0 = getelementptr [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
@@ -347,7 +347,7 @@ entry:
 
 %struct.pair32 = type { i32, i32 }
 
-define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x %struct.pair32]]
   %gep0 = getelementptr [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
@@ -360,7 +360,7 @@ entry:
   ret void
 }
 
-define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x %struct.pair32]
   %gep0 = getelementptr [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
@@ -373,7 +373,7 @@ entry:
   ret void
 }
 
-define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
 entry:
   %tmp = alloca [2 x i32]
   %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
@@ -394,7 +394,7 @@ entry:
 ; SI-NOT: ds_write
 ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
 ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:5 ;
-define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32]
   %tmp0 = getelementptr [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   store i32 5, i32* %tmp0
@@ -410,7 +410,7 @@ define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 ; OPT-LABEL: @pointer_typed_alloca(
 ; OPT:  getelementptr inbounds [256 x i32 addrspace(1)*], [256 x i32 addrspace(1)*] addrspace(3)* @pointer_typed_alloca.A.addr, i32 0, i32 %{{[0-9]+}}
 ; OPT: load i32 addrspace(1)*, i32 addrspace(1)* addrspace(3)* %{{[0-9]+}}, align 4
-define void @pointer_typed_alloca(i32 addrspace(1)* %A) {
+define amdgpu_kernel void @pointer_typed_alloca(i32 addrspace(1)* %A) {
 entry:
   %A.addr = alloca i32 addrspace(1)*, align 4
   store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
@@ -462,7 +462,7 @@ entry:
 ; SI: buffer_load_dword
 ; SI: buffer_load_dword
 
-define void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
   %alloca = alloca [2 x <16 x i32>]
   %tmp0 = getelementptr [2 x <16 x i32>], [2 x <16 x i32>]* %alloca, i32 0, i32 %a
   %tmp5 = load <16 x i32>, <16 x i32>* %tmp0
@@ -506,7 +506,7 @@ define void @v16i32_stack(<16 x i32> addrspace(1)* %out, i32 %a) {
 ; SI: buffer_load_dword
 ; SI: buffer_load_dword
 
-define void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
   %alloca = alloca [2 x <16 x float>]
   %tmp0 = getelementptr [2 x <16 x float>], [2 x <16 x float>]* %alloca, i32 0, i32 %a
   %tmp5 = load <16 x float>, <16 x float>* %tmp0
@@ -522,7 +522,7 @@ define void @v16float_stack(<16 x float> addrspace(1)* %out, i32 %a) {
 ; SI: buffer_load_dword
 ; SI: buffer_load_dword
 
-define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
   %alloca = alloca [16 x <2 x float>]
   %tmp0 = getelementptr [16 x <2 x float>], [16 x <2 x float>]* %alloca, i32 0, i32 %a
   %tmp5 = load <2 x float>, <2 x float>* %tmp0
@@ -533,7 +533,7 @@ define void @v2float_stack(<2 x float> addrspace(1)* %out, i32 %a) {
 ; OPT-LABEL: @direct_alloca_read_0xi32(
 ; OPT: store [0 x i32] undef, [0 x i32] addrspace(3)*
 ; OPT: load [0 x i32], [0 x i32] addrspace(3)*
-define void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @direct_alloca_read_0xi32([0 x i32] addrspace(1)* %out, i32 %index) {
 entry:
   %tmp = alloca [0 x i32]
   store [0 x i32] [], [0 x i32]* %tmp
@@ -545,7 +545,7 @@ entry:
 ; OPT-LABEL: @direct_alloca_read_1xi32(
 ; OPT: store [1 x i32] zeroinitializer, [1 x i32] addrspace(3)*
 ; OPT: load [1 x i32], [1 x i32] addrspace(3)*
-define void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @direct_alloca_read_1xi32([1 x i32] addrspace(1)* %out, i32 %index) {
 entry:
   %tmp = alloca [1 x i32]
   store [1 x i32] [i32 0], [1 x i32]* %tmp
diff --git a/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll b/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
index e515ca00d184f9f9bfca5b1725b61e75d8d92e07..187320805c11d2ed7110d2dbd5f88ad11eadfb18 100644
--- a/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
+++ b/test/CodeGen/AMDGPU/amdgpu.work-item-intrinsics.deprecated.ll
@@ -12,7 +12,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].X
-define void @ngroups_x (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ngroups_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -27,7 +27,7 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].Y
-define void @ngroups_y (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ngroups_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -42,7 +42,7 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].Z
-define void @ngroups_z (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ngroups_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.ngroups.z() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -57,7 +57,7 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[0].W
-define void @global_size_x (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @global_size_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -72,7 +72,7 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].X
-define void @global_size_y (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @global_size_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -87,7 +87,7 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].Y
-define void @global_size_z (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @global_size_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.global.size.z() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -102,7 +102,7 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].Z
-define void @local_size_x (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_x (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -117,7 +117,7 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[1].W
-define void @local_size_y (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_y (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -132,7 +132,7 @@ entry:
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV {{\*? *}}[[VAL]], KC0[2].X
-define void @local_size_z (i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_z (i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.z() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -153,7 +153,7 @@ entry:
 ; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
 ; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
 ; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @tgid_x_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_x_legacy(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -165,7 +165,7 @@ entry:
 ; GCN-NOHSA: buffer_store_dword [[VVAL]]
 
 ; GCN-NOHSA: COMPUTE_PGM_RSRC2:USER_SGPR: 2
-define void @tgid_y_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_y_legacy(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -181,7 +181,7 @@ entry:
 ; GCN: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
 ; GCN: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
 ; GCN: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @tgid_z_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_z_legacy(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.z() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -194,7 +194,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}tidig_x_legacy:
 ; GCN-NOHSA: buffer_store_dword v0
-define void @tidig_x_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_x_legacy(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -208,7 +208,7 @@ entry:
 ; FUNC-LABEL: {{^}}tidig_y_legacy:
 
 ; GCN-NOHSA: buffer_store_dword v1
-define void @tidig_y_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_y_legacy(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -221,7 +221,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}tidig_z_legacy:
 ; GCN-NOHSA: buffer_store_dword v2
-define void @tidig_z_legacy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_z_legacy(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.z() #0
   store i32 %0, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/and-gcn.ll b/test/CodeGen/AMDGPU/and-gcn.ll
index dde5f8c217695de7384ed56686ed94d9fa1280c5..2aec03aff8a3aad43987050fb5a2f46babbaf3a1 100644
--- a/test/CodeGen/AMDGPU/and-gcn.ll
+++ b/test/CodeGen/AMDGPU/and-gcn.ll
@@ -4,7 +4,7 @@
 ; FUNC-LABEL: {{^}}v_and_i64_br:
 ; SI: v_and_b32
 ; SI: v_and_b32
-define void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
+define amdgpu_kernel void @v_and_i64_br(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
 entry:
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp0 = icmp eq i32 %tid, 0
diff --git a/test/CodeGen/AMDGPU/and.ll b/test/CodeGen/AMDGPU/and.ll
index 5d9dcf64debf108dbfef749accce003c8a243a8a..c356f8b87cfc6d920dcfe81a54b2bb77a288de55 100644
--- a/test/CodeGen/AMDGPU/and.ll
+++ b/test/CodeGen/AMDGPU/and.ll
@@ -11,7 +11,7 @@ declare i32 @llvm.r600.read.tidig.x() #0
 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
@@ -31,7 +31,7 @@ define void @test2(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_and_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
@@ -42,7 +42,7 @@ define void @test4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
 
 ; FUNC-LABEL: {{^}}s_and_i32:
 ; SI: s_and_b32
-define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %and = and i32 %a, %b
   store i32 %and, i32 addrspace(1)* %out, align 4
   ret void
@@ -50,7 +50,7 @@ define void @s_and_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 
 ; FUNC-LABEL: {{^}}s_and_constant_i32:
 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687
-define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
   %and = and i32 %a, 1234567
   store i32 %and, i32 addrspace(1)* %out, align 4
   ret void
@@ -66,7 +66,7 @@ define void @s_and_constant_i32(i32 addrspace(1)* %out, i32 %a) {
 ; SI-DAG: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, [[K]]
 ; SI-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], [[K]]
 ; SI: buffer_store_dword [[VK]]
-define void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %and = and i32 %a, 1234567
 
   ; Just to stop future replacement of copy to vgpr + store with VALU op.
@@ -83,7 +83,7 @@ define void @s_and_multi_use_constant_i32_0(i32 addrspace(1)* %out, i32 %a, i32
 ; SI: s_add_i32
 ; SI: s_add_i32 [[ADD:s[0-9]+]], s{{[0-9]+}}, [[K]]
 ; SI: buffer_store_dword [[VK]]
-define void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %and = and i32 %a, 1234567
   %foo = add i32 %and, 1234567
   %bar = add i32 %foo, %b
@@ -93,7 +93,7 @@ define void @s_and_multi_use_constant_i32_1(i32 addrspace(1)* %out, i32 %a, i32
 
 ; FUNC-LABEL: {{^}}v_and_i32_vgpr_vgpr:
 ; SI: v_and_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) {
+define amdgpu_kernel void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -109,7 +109,7 @@ define void @v_and_i32_vgpr_vgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
 ; SI-DAG: s_load_dword [[SA:s[0-9]+]]
 ; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]]
 ; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]]
-define void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) {
+define amdgpu_kernel void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1)* %bptr) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep.b = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -123,7 +123,7 @@ define void @v_and_i32_sgpr_vgpr(i32 addrspace(1)* %out, i32 %a, i32 addrspace(1
 ; SI-DAG: s_load_dword [[SA:s[0-9]+]]
 ; SI-DAG: {{buffer|flat}}_load_dword [[VB:v[0-9]+]]
 ; SI: v_and_b32_e32 v{{[0-9]+}}, [[SA]], [[VB]]
-define void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 %b) {
+define amdgpu_kernel void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 %b) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -135,7 +135,7 @@ define void @v_and_i32_vgpr_sgpr(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
 
 ; FUNC-LABEL: {{^}}v_and_constant_i32
 ; SI: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, v{{[0-9]+}}
-define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %and = and i32 %a, 1234567
   store i32 %and, i32 addrspace(1)* %out, align 4
@@ -144,7 +144,7 @@ define void @v_and_constant_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr)
 
 ; FUNC-LABEL: {{^}}v_and_inline_imm_64_i32
 ; SI: v_and_b32_e32 v{{[0-9]+}}, 64, v{{[0-9]+}}
-define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %and = and i32 %a, 64
   store i32 %and, i32 addrspace(1)* %out, align 4
@@ -153,7 +153,7 @@ define void @v_and_inline_imm_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %
 
 ; FUNC-LABEL: {{^}}v_and_inline_imm_neg_16_i32
 ; SI: v_and_b32_e32 v{{[0-9]+}}, -16, v{{[0-9]+}}
-define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %and = and i32 %a, -16
   store i32 %and, i32 addrspace(1)* %out, align 4
@@ -162,7 +162,7 @@ define void @v_and_inline_imm_neg_16_i32(i32 addrspace(1)* %out, i32 addrspace(1
 
 ; FUNC-LABEL: {{^}}s_and_i64
 ; SI: s_and_b64
-define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %and = and i64 %a, %b
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -171,7 +171,7 @@ define void @s_and_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 ; FIXME: Should use SGPRs
 ; FUNC-LABEL: {{^}}s_and_i1:
 ; SI: v_and_b32
-define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
+define amdgpu_kernel void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
   %and = and i1 %a, %b
   store i1 %and, i1 addrspace(1)* %out
   ret void
@@ -181,7 +181,7 @@ define void @s_and_i1(i1 addrspace(1)* %out, i1 %a, i1 %b) {
 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000{{$}}
 ; SI-DAG: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80{{$}}
 ; SI: buffer_store_dwordx2
-define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
   %and = and i64 %a, 549756338176
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -191,7 +191,7 @@ define void @s_and_constant_i64(i64 addrspace(1)* %out, i64 %a) {
 ; XSI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x80000{{$}}
 ; XSI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0x80{{$}}
 ; XSI: s_and_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}
-define void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %and0 = and i64 %a, 549756338176
   %and1 = and i64 %b, 549756338176
   store volatile i64 %and0, i64 addrspace(1)* %out
@@ -205,7 +205,7 @@ define void @s_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 %a, i64 %b
 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x12d687{{$}}
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
   %and = and i64 %a, 1234567
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -223,7 +223,7 @@ define void @s_and_32_bit_constant_i64(i64 addrspace(1)* %out, i64 %a) {
 ; SI: s_and_b32 s{{[0-9]+}}, [[B]], 62
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64 %b, i64 %c) {
   %shl.a = shl i64 %a, 1
   %shl.b = shl i64 %b, 1
   %and0 = and i64 %shl.a, 62
@@ -238,7 +238,7 @@ define void @s_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 %a, i64
 ; FUNC-LABEL: {{^}}v_and_i64:
 ; SI: v_and_b32
 ; SI: v_and_b32
-define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
+define amdgpu_kernel void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %b = load i64, i64 addrspace(1)* %bptr, align 8
   %and = and i64 %a, %b
@@ -250,7 +250,7 @@ define void @v_and_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addr
 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0xab19b207, {{v[0-9]+}}
 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, 0x11e, {{v[0-9]+}}
 ; SI: buffer_store_dwordx2
-define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %and = and i64 %a, 1231231234567
   store i64 %and, i64 addrspace(1)* %out, align 8
@@ -268,7 +268,7 @@ define void @v_and_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr)
 ; SI-DAG: v_and_b32_e32 {{v[0-9]+}}, [[KHI]], v[[HI1]]
 ; SI: buffer_store_dwordx2
 ; SI: buffer_store_dwordx2
-define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load volatile i64, i64 addrspace(1)* %aptr
   %b = load volatile i64, i64 addrspace(1)* %aptr
   %and0 = and i64 %a, 1231231234567
@@ -288,7 +288,7 @@ define void @v_and_multi_use_constant_i64(i64 addrspace(1)* %out, i64 addrspace(
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO0]]
 ; SI: buffer_store_dwordx2 v{{\[}}[[RESLO1]]
-define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load volatile i64, i64 addrspace(1)* %aptr
   %b = load volatile i64, i64 addrspace(1)* %aptr
   %and0 = and i64 %a, 63
@@ -304,7 +304,7 @@ define void @v_and_multi_use_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspac
 ; SI: v_and_b32_e32 {{v[0-9]+}}, 0x12d687, [[VAL]]
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %and = and i64 %a, 1234567
   store i64 %and, i64 addrspace(1)* %out, align 8
@@ -317,7 +317,7 @@ define void @v_and_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)*
 ; SI: v_and_b32_e32 {{v[0-9]+}}, 64, {{v[0-9]+}}
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %and = and i64 %a, 64
   store i64 %and, i64 addrspace(1)* %out, align 8
@@ -331,7 +331,7 @@ define void @v_and_inline_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %apt
 ; SI: v_and_b32_e32 v[[VAL_LO]], -8, v[[VAL_LO]]
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
-define void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %and = and i64 %a, -8
   store i64 %and, i64 addrspace(1)* %out, align 8
@@ -344,7 +344,7 @@ define void @v_and_inline_neg_imm_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 64
 ; SI-NOT: and
 ; SI: buffer_store_dword
-define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 64
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -358,7 +358,7 @@ define void @s_and_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
 ; SI-NOT: and
 ; SI: s_add_u32
 ; SI-NEXT: s_addc_u32
-define void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a, i64 %b) {
   %shl = shl i64 %a, 1
   %and = and i64 %shl, 64
   %add = add i64 %and, %b
@@ -372,7 +372,7 @@ define void @s_and_inline_imm_64_i64_noshrink(i64 addrspace(1)* %out, i64 addrsp
 ; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 1
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -387,7 +387,7 @@ define void @s_and_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3ff00000
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 4607182418800017408
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -402,7 +402,7 @@ define void @s_and_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbff00000
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 13830554455654793216
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -417,7 +417,7 @@ define void @s_and_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x3fe00000
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 4602678819172646912
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -432,7 +432,7 @@ define void @s_and_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xbfe00000
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 13826050856027422720
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -445,7 +445,7 @@ define void @s_and_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 2.0
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 4611686018427387904
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -458,7 +458,7 @@ define void @s_and_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, -2.0
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 13835058055282163712
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -473,7 +473,7 @@ define void @s_and_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0x40100000
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 4616189618054758400
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -488,7 +488,7 @@ define void @s_and_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 ; SI: s_and_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0xc0100000
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 13839561654909534208
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -505,7 +505,7 @@ define void @s_and_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(
 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 1082130432
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -518,7 +518,7 @@ define void @s_and_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(
 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, -1065353216
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -531,7 +531,7 @@ define void @s_and_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrsp
 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, 4.0
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 4647714815446351872
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
@@ -544,7 +544,7 @@ define void @s_and_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrs
 ; SI: s_and_b32 s[[K_HI:[0-9]+]], s{{[0-9]+}}, -4.0
 ; SI-NOT: and
 ; SI: buffer_store_dwordx2
-define void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %and = and i64 %a, 13871086852301127680
   store i64 %and, i64 addrspace(1)* %out, align 8
   ret void
diff --git a/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index 084a6933da26759fd5e6be6e59d3276f6d195266..e2620ce353c607f042d93bdc75373bb4cd7c4889 100644
--- a/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -11,22 +11,22 @@ declare i32 @llvm.amdgcn.workitem.id.z() #0
 declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
 declare i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
 
-; HSA: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.amdgcn.workgroup.id.x()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
-define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.amdgcn.workgroup.id.y()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
-define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workgroup.id.y()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
   %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
@@ -34,8 +34,8 @@ define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; HSA: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
-define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
   %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -43,15 +43,15 @@ define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; HSA: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
-define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
+define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.amdgcn.workgroup.id.z()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
-define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
+define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
   %val1 = call i32 @llvm.amdgcn.workgroup.id.z()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -59,8 +59,8 @@ define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; HSA: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
-define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
+define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workgroup.id.y()
   %val1 = call i32 @llvm.amdgcn.workgroup.id.z()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -68,8 +68,8 @@ define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; HSA: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
-define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
+define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workgroup.id.x()
   %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
   %val2 = call i32 @llvm.amdgcn.workgroup.id.z()
@@ -79,29 +79,29 @@ define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; HSA: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.amdgcn.workitem.id.x()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
-define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
+define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.amdgcn.workitem.id.y()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
-define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
+define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.amdgcn.workitem.id.z()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workitem.id.x()
   %val1 = call i32 @llvm.amdgcn.workgroup.id.x()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -109,8 +109,8 @@ define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; HSA: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
-define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
+define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workitem.id.y()
   %val1 = call i32 @llvm.amdgcn.workgroup.id.y()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -118,8 +118,8 @@ define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; HSA: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
-define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
+define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workitem.id.x()
   %val1 = call i32 @llvm.amdgcn.workitem.id.y()
   %val2 = call i32 @llvm.amdgcn.workitem.id.z()
@@ -129,8 +129,8 @@ define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; HSA: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
-define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
+define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.amdgcn.workitem.id.x()
   %val1 = call i32 @llvm.amdgcn.workitem.id.y()
   %val2 = call i32 @llvm.amdgcn.workitem.id.z()
@@ -146,8 +146,8 @@ define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; HSA: define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 {
-define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #10 {
+define amdgpu_kernel void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
   %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
   %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
   %val = load i32, i32 addrspace(2)* %bc
@@ -155,8 +155,8 @@ define void @use_dispatch_ptr(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; HSA: define void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 {
-define void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #11 {
+define amdgpu_kernel void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
   %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
   %bc = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
   %val = load i32, i32 addrspace(2)* %bc
@@ -164,58 +164,58 @@ define void @use_queue_ptr(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; HSA: define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
-define void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #11 {
+define amdgpu_kernel void @use_group_to_flat_addrspacecast(i32 addrspace(3)* %ptr) #1 {
   %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(4)*
   store volatile i32 0, i32 addrspace(4)* %stof
   ret void
 }
 
-; HSA: define void @use_private_to_flat_addrspacecast(i32* %ptr) #11 {
-define void @use_private_to_flat_addrspacecast(i32* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #11 {
+define amdgpu_kernel void @use_private_to_flat_addrspacecast(i32* %ptr) #1 {
   %stof = addrspacecast i32* %ptr to i32 addrspace(4)*
   store volatile i32 0, i32 addrspace(4)* %stof
   ret void
 }
 
-; HSA: define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define amdgpu_kernel void @use_flat_to_group_addrspacecast(i32 addrspace(4)* %ptr) #1 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(3)*
   store volatile i32 0, i32 addrspace(3)* %ftos
   ret void
 }
 
-; HSA: define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define amdgpu_kernel void @use_flat_to_private_addrspacecast(i32 addrspace(4)* %ptr) #1 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32*
   store volatile i32 0, i32* %ftos
   ret void
 }
 
 ; No-op addrspacecast should not use queue ptr
-; HSA: define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
-define void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_global_to_flat_addrspacecast(i32 addrspace(1)* %ptr) #1 {
   %stof = addrspacecast i32 addrspace(1)* %ptr to i32 addrspace(4)*
   store volatile i32 0, i32 addrspace(4)* %stof
   ret void
 }
 
-; HSA: define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
-define void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
+define amdgpu_kernel void @use_constant_to_flat_addrspacecast(i32 addrspace(2)* %ptr) #1 {
   %stof = addrspacecast i32 addrspace(2)* %ptr to i32 addrspace(4)*
   %ld = load volatile i32, i32 addrspace(4)* %stof
   ret void
 }
 
-; HSA: define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define amdgpu_kernel void @use_flat_to_global_addrspacecast(i32 addrspace(4)* %ptr) #1 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(1)*
   store volatile i32 0, i32 addrspace(1)* %ftos
   ret void
 }
 
-; HSA: define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
-define void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
+define amdgpu_kernel void @use_flat_to_constant_addrspacecast(i32 addrspace(4)* %ptr) #1 {
   %ftos = addrspacecast i32 addrspace(4)* %ptr to i32 addrspace(2)*
   %ld = load volatile i32, i32 addrspace(2)* %ftos
   ret void
diff --git a/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/test/CodeGen/AMDGPU/annotate-kernel-features.ll
index a4e7bb67d507d9bf42e75feabe9f016909ad04f3..09750da4cb8c72fc59f8a3cc1e4575687ecde206 100644
--- a/test/CodeGen/AMDGPU/annotate-kernel-features.ll
+++ b/test/CodeGen/AMDGPU/annotate-kernel-features.ll
@@ -12,22 +12,22 @@ declare i32 @llvm.r600.read.local.size.x() #0
 declare i32 @llvm.r600.read.local.size.y() #0
 declare i32 @llvm.r600.read.local.size.z() #0
 
-; ALL: define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tgid_x(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.tgid.x()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; ALL: define void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
-define void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @use_tgid_y(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.tgid.y()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; ALL: define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
-define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tgid.y()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
   %val1 = call i32 @llvm.r600.read.tgid.y()
@@ -35,8 +35,8 @@ define void @multi_use_tgid_y(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; ALL: define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
-define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #2 {
+define amdgpu_kernel void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tgid.x()
   %val1 = call i32 @llvm.r600.read.tgid.y()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -44,15 +44,15 @@ define void @use_tgid_x_y(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; ALL: define void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
-define void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #3 {
+define amdgpu_kernel void @use_tgid_z(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.tgid.z()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; ALL: define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
-define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #3 {
+define amdgpu_kernel void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tgid.x()
   %val1 = call i32 @llvm.r600.read.tgid.z()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -60,8 +60,8 @@ define void @use_tgid_x_z(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; ALL: define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
-define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #4 {
+define amdgpu_kernel void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tgid.y()
   %val1 = call i32 @llvm.r600.read.tgid.z()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -69,8 +69,8 @@ define void @use_tgid_y_z(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; ALL: define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
-define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #4 {
+define amdgpu_kernel void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tgid.x()
   %val1 = call i32 @llvm.r600.read.tgid.y()
   %val2 = call i32 @llvm.r600.read.tgid.z()
@@ -80,29 +80,29 @@ define void @use_tgid_x_y_z(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; ALL: define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tidig_x(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.tidig.x()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; ALL: define void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
-define void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #5 {
+define amdgpu_kernel void @use_tidig_y(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.tidig.y()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; ALL: define void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
-define void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #6 {
+define amdgpu_kernel void @use_tidig_z(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.tidig.z()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; ALL: define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tidig.x()
   %val1 = call i32 @llvm.r600.read.tgid.x()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -110,8 +110,8 @@ define void @use_tidig_x_tgid_x(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; ALL: define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
-define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #7 {
+define amdgpu_kernel void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tidig.y()
   %val1 = call i32 @llvm.r600.read.tgid.y()
   store volatile i32 %val0, i32 addrspace(1)* %ptr
@@ -119,8 +119,8 @@ define void @use_tidig_y_tgid_y(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; ALL: define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
-define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #8 {
+define amdgpu_kernel void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tidig.x()
   %val1 = call i32 @llvm.r600.read.tidig.y()
   %val2 = call i32 @llvm.r600.read.tidig.z()
@@ -130,8 +130,8 @@ define void @use_tidig_x_y_z(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; ALL: define void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
-define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
+; ALL: define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #9 {
+define amdgpu_kernel void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
   %val0 = call i32 @llvm.r600.read.tidig.x()
   %val1 = call i32 @llvm.r600.read.tidig.y()
   %val2 = call i32 @llvm.r600.read.tidig.z()
@@ -147,25 +147,25 @@ define void @use_all_workitems(i32 addrspace(1)* %ptr) #1 {
   ret void
 }
 
-; HSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 {
-; NOHSA: define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
-define void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #10 {
+; NOHSA: define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_get_local_size_x(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.local.size.x()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 {
-; NOHSA: define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
-define void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #10 {
+; NOHSA: define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_get_local_size_y(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.local.size.y()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
 }
 
-; HSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 {
-; NOHSA: define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
-define void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
+; HSA: define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #10 {
+; NOHSA: define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
+define amdgpu_kernel void @use_get_local_size_z(i32 addrspace(1)* %ptr) #1 {
   %val = call i32 @llvm.r600.read.local.size.z()
   store i32 %val, i32 addrspace(1)* %ptr
   ret void
diff --git a/test/CodeGen/AMDGPU/anonymous-gv.ll b/test/CodeGen/AMDGPU/anonymous-gv.ll
index f37b0f3382f4a4f08783655729c014dd29d21890..04fbe2ae1f948f58c555a5bba6a40204ebe4c241 100644
--- a/test/CodeGen/AMDGPU/anonymous-gv.ll
+++ b/test/CodeGen/AMDGPU/anonymous-gv.ll
@@ -6,13 +6,13 @@
 ; CHECK-LABEL: {{^}}test:
 ; CHECK: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, __unnamed_1
 ; CHECK: s_endpgm
-define void @test() {
+define amdgpu_kernel void @test() {
   store i32 1, i32 addrspace(1)* @0
   ret void
 }
 
 ; CHECK-LABEL: {{^}}__unnamed_2:
 ; CHECK: s_endpgm
-define void @1() {
+define amdgpu_kernel void @1() {
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll b/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c61c23222bc7e4aebf73c1a0dcb08447bd0a5157
--- /dev/null
+++ b/test/CodeGen/AMDGPU/any_extend_vector_inreg.ll
@@ -0,0 +1,58 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}any_extend_vector_inreg_v16i8_to_v4i32:
+; GCN: {{buffer|flat}}_load_dwordx4
+; GCN-DAG: {{buffer|flat}}_load_dwordx4
+; GCN-DAG: {{buffer|flat}}_load_dword
+
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+; GCN: {{buffer|flat}}_store_byte
+define amdgpu_kernel void @any_extend_vector_inreg_v16i8_to_v4i32(<8 x i8> addrspace(1)* nocapture readonly %arg, <16 x i8> addrspace(1)* %arg1) local_unnamed_addr #0 {
+bb:
+  %tmp = bitcast <8 x i8> addrspace(1)* %arg to <16 x i8> addrspace(1)*
+  %tmp2 = load <16 x i8>, <16 x i8> addrspace(1)* %tmp, align 16
+  %tmp3 = extractelement <16 x i8> %tmp2, i64 4
+  %tmp6 = extractelement <16 x i8> %tmp2, i64 11
+  %tmp10 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %arg, i64 2
+  %tmp11 = bitcast <8 x i8> addrspace(1)* %tmp10 to <16 x i8> addrspace(1)*
+  %tmp12 = load <16 x i8>, <16 x i8> addrspace(1)* %tmp11, align 16
+  %tmp13 = extractelement <16 x i8> %tmp12, i64 7
+  %tmp17 = extractelement <16 x i8> %tmp12, i64 12
+  %tmp21 = getelementptr inbounds <8 x i8>, <8 x i8> addrspace(1)* %arg, i64 4
+  %tmp22 = bitcast <8 x i8> addrspace(1)* %tmp21 to <16 x i8> addrspace(1)*
+  %tmp23 = load <16 x i8>, <16 x i8> addrspace(1)* %tmp22, align 16
+  %tmp24 = extractelement <16 x i8> %tmp23, i64 3
+  %tmp1 = insertelement <16 x i8> undef, i8 %tmp3, i32 2
+  %tmp4 = insertelement <16 x i8> %tmp1, i8 0, i32 3
+  %tmp5 = insertelement <16 x i8> %tmp4, i8 0, i32 4
+  %tmp7 = insertelement <16 x i8> %tmp5, i8 %tmp6, i32 5
+  %tmp8 = insertelement <16 x i8> %tmp7, i8 0, i32 6
+  %tmp9 = insertelement <16 x i8> %tmp8, i8 %tmp13, i32 7
+  %tmp14 = insertelement <16 x i8> %tmp9, i8 0, i32 8
+  %tmp15 = insertelement <16 x i8> %tmp14, i8 %tmp17, i32 9
+  %tmp16 = insertelement <16 x i8> %tmp15, i8 0, i32 10
+  %tmp18 = insertelement <16 x i8> %tmp16, i8 0, i32 11
+  %tmp19 = insertelement <16 x i8> %tmp18, i8 %tmp24, i32 12
+  store <16 x i8> %tmp19, <16 x i8> addrspace(1)* %arg1, align 1
+  ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/anyext.ll b/test/CodeGen/AMDGPU/anyext.ll
index 87b4c86427c84335c0af3b1d3175e95ae2c3a39f..3f220c40841298383e91f125897271b0c296cfe3 100644
--- a/test/CodeGen/AMDGPU/anyext.ll
+++ b/test/CodeGen/AMDGPU/anyext.ll
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
 
 ; GCN-LABEL: {{^}}anyext_i1_i32:
 ; GCN: v_cndmask_b32_e64
-define void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @anyext_i1_i32(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %tmp = icmp eq i32 %cond, 0
   %tmp1 = zext i1 %tmp to i8
@@ -22,7 +22,7 @@ entry:
 ; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], -1, [[ADD]]
 ; VI: v_and_b32_e32 [[AND:v[0-9]+]], 1, [[XOR]]
 ; VI: buffer_store_dword [[AND]]
-define void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) {
+define amdgpu_kernel void @s_anyext_i16_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %a, i16 addrspace(1)* %b) {
 entry:
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
index f190bd0cb01e0789d3406200b3df2ed83608d838..daa3442097cf2fe83c52b4671e73cc3e325a2216 100644
--- a/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
+++ b/test/CodeGen/AMDGPU/array-ptr-calc-i32.ll
@@ -12,11 +12,7 @@ declare void @llvm.amdgcn.s.barrier() #2
 
 ; SI-LABEL: {{^}}test_private_array_ptr_calc:
 
-; FIXME: We end up with zero argument for ADD, because
-; SIRegisterInfo::eliminateFrameIndex() blindly replaces the frame index
-; with the appropriate offset.  We should fold this into the store.
-
-; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 0, v{{[0-9]+}}
+; SI-ALLOCA: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 16, v{{[0-9]+}}
 ; SI-ALLOCA: buffer_store_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64
 ; SI-ALLOCA: s_barrier
 ; SI-ALLOCA: buffer_load_dword {{v[0-9]+}}, [[PTRREG]], s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:64
@@ -28,7 +24,7 @@ declare void @llvm.amdgcn.s.barrier() #2
 
 ; SI-PROMOTE: v_add_i32_e32 [[PTRREG:v[0-9]+]], vcc, 64
 ; SI-PROMOTE: ds_write_b32 [[PTRREG]]
-define void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 {
+define amdgpu_kernel void @test_private_array_ptr_calc(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) #0 {
   %alloca = alloca [16 x i32], align 16
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0);
   %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
diff --git a/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
index b914edf2928e535f44ce88bafebfd9ab3a01a426..ddeffc10a08959d6d1fbe07b8fe7f4870d658b53 100644
--- a/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
+++ b/test/CodeGen/AMDGPU/array-ptr-calc-i64.ll
@@ -7,7 +7,7 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
 ; SI-DAG: v_mul_lo_i32
 ; SI-DAG: v_mul_hi_i32
 ; SI: s_endpgm
-define void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @test_array_ptr_calc(i32 addrspace(1)* noalias %out, [1025 x i32] addrspace(1)* noalias %inA, i32 addrspace(1)* noalias %inB) {
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
   %a_ptr = getelementptr [1025 x i32], [1025 x i32] addrspace(1)* %inA, i32 %tid, i32 0
diff --git a/test/CodeGen/AMDGPU/ashr.v2i16.ll b/test/CodeGen/AMDGPU/ashr.v2i16.ll
new file mode 100644
index 0000000000000000000000000000000000000000..96a5e3b23758a68ae22187929f326c357e112abb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/ashr.v2i16.ll
@@ -0,0 +1,161 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
+
+; GCN-LABEL: {{^}}s_ashr_v2i16:
+; GFX9: s_load_dword [[LHS:s[0-9]+]]
+; GFX9: s_load_dword [[RHS:s[0-9]+]]
+; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
+; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
+
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
+; CI: v_ashrrev_i32_e32
+; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
+; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_or_b32_e32
+define amdgpu_kernel void @s_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
+  %result = ashr <2 x i16> %lhs, %rhs
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_ashr_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+
+; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_ashrrev_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
+; CI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], [[RHS]]
+; CI-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, [[LHS]]
+; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_ashrrev_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
+  %result = ashr <2 x i16> %a, %b
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}ashr_v_s_v2i16:
+; GFX9: s_load_dword [[RHS:s[0-9]+]]
+; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+define amdgpu_kernel void @ashr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %result = ashr <2 x i16> %vgpr, %sgpr
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}ashr_s_v_v2i16:
+; GFX9: s_load_dword [[LHS:s[0-9]+]]
+; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+define amdgpu_kernel void @ashr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %result = ashr <2 x i16> %sgpr, %vgpr
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}ashr_imm_v_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], [[RHS]], -4
+define amdgpu_kernel void @ashr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %result = ashr <2 x i16> <i16 -4, i16 -4>, %vgpr
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}ashr_v_imm_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GFX9: v_pk_ashrrev_i16 [[RESULT:v[0-9]+]], 8, [[LHS]]
+define amdgpu_kernel void @ashr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %result = ashr <2 x i16> %vgpr, <i16 8, i16 8>
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_ashr_v4i16:
+; GCN: {{buffer|flat}}_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_ashrrev_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_ashrrev_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_ashrrev_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; GCN: {{buffer|flat}}_store_dwordx2
+define amdgpu_kernel void @v_ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
+  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
+  %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
+  %result = ashr <4 x i16> %a, %b
+  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}ashr_v_imm_v4i16:
+; GCN: {{buffer|flat}}_load_dwordx2
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GCN: {{buffer|flat}}_store_dwordx2
+define amdgpu_kernel void @ashr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
+  %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+  %result = ashr <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
+  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll b/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
index 25eae0b41ae42e9831bdbe9e3b02fd2c65aa8d86..4f9526ddab55f64a45d1f0c030f0b2f9815bf931 100644
--- a/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
+++ b/test/CodeGen/AMDGPU/atomic_cmp_swap_local.ll
@@ -12,7 +12,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
 ; GCN: ds_cmpst_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
   %result = extractvalue { i32, i1 } %pair, 0
@@ -33,7 +33,7 @@ define void @lds_atomic_cmpxchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrs
 ; GCN: ds_cmpst_rtn_b64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
 ; GCN: buffer_store_dwordx2 [[RESULT]],
 ; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr, i64 %swap) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
   %result = extractvalue { i64, i1 } %pair, 0
@@ -45,7 +45,7 @@ define void @lds_atomic_cmpxchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrs
 ; SI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CIVI: ds_cmpst_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %swap, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -65,7 +65,7 @@ define void @lds_atomic_cmpxchg_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i3
 ; GCN-DAG: v_mov_b32_e32 [[VSWAP:v[0-9]+]], [[SWAP]]
 ; GCN: ds_cmpst_b32 [[VPTR]], [[VCMP]], [[VSWAP]] offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %swap) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i32 addrspace(3)* %gep, i32 7, i32 %swap seq_cst monotonic
   %result = extractvalue { i32, i1 } %pair, 0
@@ -84,7 +84,7 @@ define void @lds_atomic_cmpxchg_noret_i32_offset(i32 addrspace(3)* %ptr, i32 %sw
 ; GCN-DAG: v_mov_b32_e32 v[[HISWAPV:[0-9]+]], s[[HISWAP]]
 ; GCN: ds_cmpst_b64 [[VPTR]], v{{\[}}[[LOVCMP]]:[[HIVCMP]]{{\]}}, v{{\[}}[[LOSWAPV]]:[[HISWAPV]]{{\]}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind {
+define amdgpu_kernel void @lds_atomic_cmpxchg_noret_i64_offset(i64 addrspace(3)* %ptr, i64 %swap) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %pair = cmpxchg i64 addrspace(3)* %gep, i64 7, i64 %swap seq_cst monotonic
   %result = extractvalue { i64, i1 } %pair, 0
diff --git a/test/CodeGen/AMDGPU/atomic_load_add.ll b/test/CodeGen/AMDGPU/atomic_load_add.ll
index 4b014e09b630b38e8f6c05270d866f3794ae9f6a..e0fe6641fa117246078dcac6f4130ceada7b69b5 100644
--- a/test/CodeGen/AMDGPU/atomic_load_add.ll
+++ b/test/CodeGen/AMDGPU/atomic_load_add.ll
@@ -5,7 +5,7 @@
 ; FUNC-LABEL: {{^}}atomic_add_local:
 ; R600: LDS_ADD *
 ; SI: ds_add_u32
-define void @atomic_add_local(i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_add_local(i32 addrspace(3)* %local) {
    %unused = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
    ret void
 }
@@ -13,7 +13,7 @@ define void @atomic_add_local(i32 addrspace(3)* %local) {
 ; FUNC-LABEL: {{^}}atomic_add_local_const_offset:
 ; R600: LDS_ADD *
 ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
   %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
   %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
   ret void
@@ -22,7 +22,7 @@ define void @atomic_add_local_const_offset(i32 addrspace(3)* %local) {
 ; FUNC-LABEL: {{^}}atomic_add_ret_local:
 ; R600: LDS_ADD_RET *
 ; SI: ds_add_rtn_u32
-define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %val = atomicrmw volatile add i32 addrspace(3)* %local, i32 5 seq_cst
   store i32 %val, i32 addrspace(1)* %out
   ret void
@@ -31,7 +31,7 @@ define void @atomic_add_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %loc
 ; FUNC-LABEL: {{^}}atomic_add_ret_local_const_offset:
 ; R600: LDS_ADD_RET *
 ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
-define void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_add_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
   %val = atomicrmw volatile add i32 addrspace(3)* %gep, i32 5 seq_cst
   store i32 %val, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/atomic_load_sub.ll b/test/CodeGen/AMDGPU/atomic_load_sub.ll
index c6e5b1136d7ce3362adde629c18e0252f6d95bfc..a0275893919a909a0fa777997d2bd4659fde88f3 100644
--- a/test/CodeGen/AMDGPU/atomic_load_sub.ll
+++ b/test/CodeGen/AMDGPU/atomic_load_sub.ll
@@ -5,7 +5,7 @@
 ; FUNC-LABEL: {{^}}atomic_sub_local:
 ; R600: LDS_SUB *
 ; SI: ds_sub_u32
-define void @atomic_sub_local(i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_sub_local(i32 addrspace(3)* %local) {
    %unused = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
    ret void
 }
@@ -13,7 +13,7 @@ define void @atomic_sub_local(i32 addrspace(3)* %local) {
 ; FUNC-LABEL: {{^}}atomic_sub_local_const_offset:
 ; R600: LDS_SUB *
 ; SI: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
-define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
   %gep = getelementptr i32, i32 addrspace(3)* %local, i32 4
   %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
   ret void
@@ -22,7 +22,7 @@ define void @atomic_sub_local_const_offset(i32 addrspace(3)* %local) {
 ; FUNC-LABEL: {{^}}atomic_sub_ret_local:
 ; R600: LDS_SUB_RET *
 ; SI: ds_sub_rtn_u32
-define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %val = atomicrmw volatile sub i32 addrspace(3)* %local, i32 5 seq_cst
   store i32 %val, i32 addrspace(1)* %out
   ret void
@@ -31,7 +31,7 @@ define void @atomic_sub_ret_local(i32 addrspace(1)* %out, i32 addrspace(3)* %loc
 ; FUNC-LABEL: {{^}}atomic_sub_ret_local_const_offset:
 ; R600: LDS_SUB_RET *
 ; SI: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:20
-define void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
+define amdgpu_kernel void @atomic_sub_ret_local_const_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %local) {
   %gep = getelementptr i32, i32 addrspace(3)* %local, i32 5
   %val = atomicrmw volatile sub i32 addrspace(3)* %gep, i32 5 seq_cst
   store i32 %val, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
index cab377feacb27febfc4ccfd6dfa044b99444449d..63a6f6a8d32c7c81ee2daa8c3f37def8b59c381d 100644
--- a/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
+++ b/test/CodeGen/AMDGPU/attr-amdgpu-flat-work-group-size.ll
@@ -5,7 +5,7 @@
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @min_64_max_64() #0 {
+define amdgpu_kernel void @min_64_max_64() #0 {
 entry:
   ret void
 }
@@ -16,7 +16,7 @@ attributes #0 = {"amdgpu-flat-work-group-size"="64,64"}
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @min_64_max_128() #1 {
+define amdgpu_kernel void @min_64_max_128() #1 {
 entry:
   ret void
 }
@@ -27,7 +27,7 @@ attributes #1 = {"amdgpu-flat-work-group-size"="64,128"}
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @min_128_max_128() #2 {
+define amdgpu_kernel void @min_128_max_128() #2 {
 entry:
   ret void
 }
@@ -39,7 +39,7 @@ attributes #2 = {"amdgpu-flat-work-group-size"="128,128"}
 ; CHECK: NumSGPRsForWavesPerEU: 13
 ; CHECK: NumVGPRsForWavesPerEU: 32
 @var = addrspace(1) global float 0.0
-define void @min_1024_max_2048() #3 {
+define amdgpu_kernel void @min_1024_max_2048() #3 {
   %val0 = load volatile float, float addrspace(1)* @var
   %val1 = load volatile float, float addrspace(1)* @var
   %val2 = load volatile float, float addrspace(1)* @var
diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll b/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
index ca1b27e7cbc31994e1e6ec6d8ec9f46ce6e3e648..ac2f7b4a4a4b343802288b455520c22387a59066 100644
--- a/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
+++ b/test/CodeGen/AMDGPU/attr-amdgpu-num-sgpr.ll
@@ -8,16 +8,18 @@
 
 ; ALL: SGPRBlocks: 1
 ; ALL: NumSGPRsForWavesPerEU: 9
-define void @max_9_sgprs(i32 addrspace(1)* %out1,
+define amdgpu_kernel void @max_9_sgprs(i32 addrspace(1)* %out1,
 
                           i32 addrspace(1)* %out2,
                           i32 addrspace(1)* %out3,
                           i32 addrspace(1)* %out4,
-                          i32 %one, i32 %two, i32 %three, i32 %four) #0 {
+                          i32 addrspace(1)* %out5,
+                          i32 %one, i32 %two, i32 %three, i32 %four, i32 %five) #0 {
   store i32 %one, i32 addrspace(1)* %out1
   store i32 %two, i32 addrspace(1)* %out2
   store i32 %three, i32 addrspace(1)* %out3
   store i32 %four, i32 addrspace(1)* %out4
+  store i32 %five, i32 addrspace(1)* %out5
   ret void
 }
 
@@ -47,23 +49,26 @@ define void @max_9_sgprs(i32 addrspace(1)* %out1,
 
 ; TOSMEM: SGPRBlocks: 1
 ; TOSMEM: NumSGPRsForWavesPerEU: 16
-define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
+define amdgpu_kernel void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
                                         i32 addrspace(1)* %out2,
                                         i32 addrspace(1)* %out3,
                                         i32 addrspace(1)* %out4,
                                         i32 %one, i32 %two, i32 %three, i32 %four) #2 {
-  store volatile i32 0, i32* undef
   %x.0 = call i32 @llvm.amdgcn.workgroup.id.x()
-  store volatile i32 %x.0, i32 addrspace(1)* undef
   %x.1 = call i32 @llvm.amdgcn.workgroup.id.y()
-  store volatile i32 %x.0, i32 addrspace(1)* undef
   %x.2 = call i32 @llvm.amdgcn.workgroup.id.z()
-  store volatile i32 %x.0, i32 addrspace(1)* undef
   %x.3 = call i64 @llvm.amdgcn.dispatch.id()
-  store volatile i64 %x.3, i64 addrspace(1)* undef
   %x.4 = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
-  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
   %x.5 = call i8 addrspace(2)* @llvm.amdgcn.queue.ptr()
+  store volatile i32 0, i32* undef
+  br label %stores
+
+stores:
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  store volatile i32 %x.0, i32 addrspace(1)* undef
+  store volatile i64 %x.3, i64 addrspace(1)* undef
+  store volatile i8 addrspace(2)* %x.4, i8 addrspace(2)* addrspace(1)* undef
   store volatile i8 addrspace(2)* %x.5, i8 addrspace(2)* addrspace(1)* undef
 
   store i32 %one, i32 addrspace(1)* %out1
@@ -85,7 +90,7 @@ define void @max_12_sgprs_14_input_sgprs(i32 addrspace(1)* %out1,
 
 ; XALL: SGPRBlocks: 2
 ; XALL: NumSGPRsForWavesPerEU: 18
-;define void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
+;define amdgpu_kernel void @max_12_sgprs_12_input_sgprs(i32 addrspace(1)* %out1,
 ;                                        i32 addrspace(1)* %out2,
 ;                                        i32 addrspace(1)* %out3,
 ;                                        i32 addrspace(1)* %out4,
diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll b/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
index 97feb7276b7d4e33afa63e4d6271d4f51506b1bf..979665ff0a807f5cfef2056a7af6f0e0add12245 100644
--- a/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
+++ b/test/CodeGen/AMDGPU/attr-amdgpu-num-vgpr.ll
@@ -5,7 +5,7 @@
 ; CHECK-LABEL: {{^}}max_20_vgprs:
 ; CHECK: VGPRBlocks: 4
 ; CHECK: NumVGPRsForWavesPerEU: 20
-define void @max_20_vgprs() #1 {
+define amdgpu_kernel void @max_20_vgprs() #1 {
   %val0 = load volatile float, float addrspace(1)* @var
   %val1 = load volatile float, float addrspace(1)* @var
   %val2 = load volatile float, float addrspace(1)* @var
diff --git a/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll b/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
index 4f4efccc2260a4f5c189b50a0ec640f080dc0961..3dda73bc336ed8812e1d06959d9ded2ccc867bc5 100644
--- a/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
+++ b/test/CodeGen/AMDGPU/attr-amdgpu-waves-per-eu.ll
@@ -4,9 +4,9 @@
 ; CHECK-LABEL: {{^}}empty_exactly_1:
 ; CHECK: SGPRBlocks: 12
 ; CHECK: VGPRBlocks: 32
-; CHECK: NumSGPRsForWavesPerEU: 97
+; CHECK: NumSGPRsForWavesPerEU: 102
 ; CHECK: NumVGPRsForWavesPerEU: 129
-define void @empty_exactly_1() #0 {
+define amdgpu_kernel void @empty_exactly_1() #0 {
 entry:
   ret void
 }
@@ -16,9 +16,9 @@ attributes #0 = {"amdgpu-waves-per-eu"="1,1"}
 ; CHECK-LABEL: {{^}}empty_exactly_5:
 ; CHECK: SGPRBlocks: 12
 ; CHECK: VGPRBlocks: 10
-; CHECK: NumSGPRsForWavesPerEU: 97
+; CHECK: NumSGPRsForWavesPerEU: 102
 ; CHECK: NumVGPRsForWavesPerEU: 41
-define void @empty_exactly_5() #1 {
+define amdgpu_kernel void @empty_exactly_5() #1 {
 entry:
   ret void
 }
@@ -30,7 +30,7 @@ attributes #1 = {"amdgpu-waves-per-eu"="5,5"}
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_exactly_10() #2 {
+define amdgpu_kernel void @empty_exactly_10() #2 {
 entry:
   ret void
 }
@@ -42,7 +42,7 @@ attributes #2 = {"amdgpu-waves-per-eu"="10,10"}
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_at_least_1() #3 {
+define amdgpu_kernel void @empty_at_least_1() #3 {
 entry:
   ret void
 }
@@ -54,7 +54,7 @@ attributes #3 = {"amdgpu-waves-per-eu"="1"}
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_at_least_5() #4 {
+define amdgpu_kernel void @empty_at_least_5() #4 {
 entry:
   ret void
 }
@@ -66,7 +66,7 @@ attributes #4 = {"amdgpu-waves-per-eu"="5"}
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_at_least_10() #5 {
+define amdgpu_kernel void @empty_at_least_10() #5 {
 entry:
   ret void
 }
@@ -78,9 +78,9 @@ attributes #5 = {"amdgpu-waves-per-eu"="10"}
 ; CHECK-LABEL: {{^}}empty_at_most_5:
 ; CHECK: SGPRBlocks: 12
 ; CHECK: VGPRBlocks: 10
-; CHECK: NumSGPRsForWavesPerEU: 97
+; CHECK: NumSGPRsForWavesPerEU: 102
 ; CHECK: NumVGPRsForWavesPerEU: 41
-define void @empty_at_most_5() #6 {
+define amdgpu_kernel void @empty_at_most_5() #6 {
 entry:
   ret void
 }
@@ -92,7 +92,7 @@ attributes #6 = {"amdgpu-waves-per-eu"="1,5"}
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_at_most_10() #7 {
+define amdgpu_kernel void @empty_at_most_10() #7 {
 entry:
   ret void
 }
@@ -106,7 +106,7 @@ attributes #7 = {"amdgpu-waves-per-eu"="1,10"}
 ; CHECK: VGPRBlocks: 0
 ; CHECK: NumSGPRsForWavesPerEU: 1
 ; CHECK: NumVGPRsForWavesPerEU: 1
-define void @empty_between_5_and_10() #8 {
+define amdgpu_kernel void @empty_between_5_and_10() #8 {
 entry:
   ret void
 }
@@ -120,7 +120,7 @@ attributes #8 = {"amdgpu-waves-per-eu"="5,10"}
 ; CHECK: VGPRBlocks: 5
 ; CHECK: NumSGPRsForWavesPerEU: 13
 ; CHECK: NumVGPRsForWavesPerEU: 24
-define void @exactly_10() #9 {
+define amdgpu_kernel void @exactly_10() #9 {
   %val0 = load volatile float, float addrspace(1)* @var
   %val1 = load volatile float, float addrspace(1)* @var
   %val2 = load volatile float, float addrspace(1)* @var
diff --git a/test/CodeGen/AMDGPU/attr-unparseable.ll b/test/CodeGen/AMDGPU/attr-unparseable.ll
index 0282bc34c0ee03bc636168861156b975c0e906ed..17adb89900cd4cc8daba1f14a461bc3bb2393bc5 100644
--- a/test/CodeGen/AMDGPU/attr-unparseable.ll
+++ b/test/CodeGen/AMDGPU/attr-unparseable.ll
@@ -1,56 +1,56 @@
 ; RUN: not llc -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs < %s 2>&1 | FileCheck %s
 
 ; CHECK: can't parse integer attribute amdgpu-num-sgpr
-define void @unparseable_single_0() #0 {
+define amdgpu_kernel void @unparseable_single_0() #0 {
 entry:
   ret void
 }
 attributes #0 = {"amdgpu-num-sgpr"}
 
 ; CHECK: can't parse integer attribute amdgpu-num-sgpr
-define void @unparseable_single_1() #1 {
+define amdgpu_kernel void @unparseable_single_1() #1 {
 entry:
   ret void
 }
 attributes #1 = {"amdgpu-num-sgpr"="k"}
 
 ; CHECK: can't parse integer attribute amdgpu-num-sgpr
-define void @unparseable_single_2() #2 {
+define amdgpu_kernel void @unparseable_single_2() #2 {
 entry:
   ret void
 }
 attributes #2 = {"amdgpu-num-sgpr"="1,2"}
 
 ; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_0() #3 {
+define amdgpu_kernel void @unparseable_pair_0() #3 {
 entry:
   ret void
 }
 attributes #3 = {"amdgpu-flat-work-group-size"}
 
 ; CHECK: can't parse first integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_1() #4 {
+define amdgpu_kernel void @unparseable_pair_1() #4 {
 entry:
   ret void
 }
 attributes #4 = {"amdgpu-flat-work-group-size"="k"}
 
 ; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_2() #5 {
+define amdgpu_kernel void @unparseable_pair_2() #5 {
 entry:
   ret void
 }
 attributes #5 = {"amdgpu-flat-work-group-size"="1"}
 
 ; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_3() #6 {
+define amdgpu_kernel void @unparseable_pair_3() #6 {
 entry:
   ret void
 }
 attributes #6 = {"amdgpu-flat-work-group-size"="1,k"}
 
 ; CHECK: can't parse second integer attribute amdgpu-flat-work-group-size
-define void @unparseable_pair_4() #7 {
+define amdgpu_kernel void @unparseable_pair_4() #7 {
 entry:
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/barrier-elimination.ll b/test/CodeGen/AMDGPU/barrier-elimination.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c526baaab9cdacb55f8247220b3d264206f90022
--- /dev/null
+++ b/test/CodeGen/AMDGPU/barrier-elimination.ll
@@ -0,0 +1,30 @@
+; RUN: llc -march=amdgcn < %s | FileCheck %s
+
+; CHECK-LABEL: {{^}}unknown_wgs:
+; CHECK: s_barrier
+define amdgpu_kernel void @unknown_wgs() {
+  tail call void @llvm.amdgcn.s.barrier() #0
+  ret void
+}
+
+; CHECK-LABEL: {{^}}flat_wgs_attr_32_128:
+; CHECK: s_barrier
+define amdgpu_kernel void @flat_wgs_attr_32_128() #1 {
+  tail call void @llvm.amdgcn.s.barrier() #0
+  ret void
+}
+
+; CHECK-LABEL: {{^}}flat_wgs_attr_32_64:
+; CHECK: :
+; CHECK-NEXT: ; wave barrier
+; CHECK-NEXT: s_endpgm
+define amdgpu_kernel void @flat_wgs_attr_32_64() #2 {
+  tail call void @llvm.amdgcn.s.barrier() #0
+  ret void
+}
+
+declare void @llvm.amdgcn.s.barrier() #0
+
+attributes #0 = { convergent nounwind }
+attributes #1 = { nounwind "amdgpu-flat-work-group-size"="32,128" }
+attributes #2 = { nounwind "amdgpu-flat-work-group-size"="32,64" }
diff --git a/test/CodeGen/AMDGPU/basic-branch.ll b/test/CodeGen/AMDGPU/basic-branch.ll
index 104dd45e8a1adc9c24620ae095d8f6f9645c28c5..e245e4296df2af6f742bacebbd2bcde0a6ac3de9 100644
--- a/test/CodeGen/AMDGPU/basic-branch.ll
+++ b/test/CodeGen/AMDGPU/basic-branch.ll
@@ -8,17 +8,14 @@
 ; GCNNOOPT: v_writelane_b32
 ; GCN: s_cbranch_scc1 [[END:BB[0-9]+_[0-9]+]]
 
-
-; GCN: ; BB#1
 ; GCNNOOPT: v_readlane_b32
 ; GCNNOOPT: v_readlane_b32
 ; GCN: buffer_store_dword
-; GCNOPT-NEXT: s_waitcnt vmcnt(0) expcnt(0)
-; TODO: This waitcnt can be eliminated
+; GCNNOOPT: s_endpgm
 
 ; GCN: {{^}}[[END]]:
 ; GCN: s_endpgm
-define void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {
+define amdgpu_kernel void @test_branch(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) #0 {
   %cmp = icmp ne i32 %val, 0
   br i1 %cmp, label %store, label %end
 
@@ -42,7 +39,7 @@ end:
 
 ; GCN: {{^}}[[END]]:
 ; GCN: s_endpgm
-define void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
+define amdgpu_kernel void @test_brcc_i1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i1 %val) #0 {
   %cmp0 = icmp ne i1 %val, 0
   br i1 %cmp0, label %store, label %end
 
diff --git a/test/CodeGen/AMDGPU/basic-loop.ll b/test/CodeGen/AMDGPU/basic-loop.ll
index f0263caf5d6b7e9bb0b2639c8965b2152b577acb..de45190cdaa56c19f1dd907a7e188f31130ee385 100644
--- a/test/CodeGen/AMDGPU/basic-loop.ll
+++ b/test/CodeGen/AMDGPU/basic-loop.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -O0 -verify-machineinstrs -march=amdgcn -mcpu=tonga < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}test_loop:
-define void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
+define amdgpu_kernel void @test_loop(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %val) nounwind {
 entry:
   br label %loop.body
 
diff --git a/test/CodeGen/AMDGPU/bfe-patterns.ll b/test/CodeGen/AMDGPU/bfe-patterns.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5e39a6c6774b116ceb2e06c062b92510c741761d
--- /dev/null
+++ b/test/CodeGen/AMDGPU/bfe-patterns.ll
@@ -0,0 +1,163 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}v_ubfe_sub_i32:
+; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
+; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[WIDTH]]
+define amdgpu_kernel void @v_ubfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { ; unsigned extract of the low %width bits, operands loaded per work-item
+  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() ; work-item id indexes all three buffers
+  %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
+  %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+  %src = load volatile i32, i32 addrspace(1)* %in0.gep ; value to extract from
+  %width = load volatile i32, i32 addrspace(1)* %in1.gep ; field width comes from the second buffer (was %in0.gep, leaving %in1.gep dead)
+  %sub = sub i32 32, %width ; shift amount = 32 - width
+  %shl = shl i32 %src, %sub ; push the low `width` bits to the top of the register
+  %bfe = lshr i32 %shl, %sub ; logical shift back down zero-fills the upper bits
+  store i32 %bfe, i32 addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_ubfe_sub_multi_use_shl_i32:
+; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
+; GCN: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]
+
+; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
+; SI-NEXT: v_lshr_b32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]
+
+; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
+; VI-NEXT: v_lshrrev_b32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
+
+; GCN: [[BFE]]
+; GCN: [[SHL]]
+define amdgpu_kernel void @v_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { ; same shl/lshr pair as v_ubfe_sub_i32, but %shl has a second use
+  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() ; work-item id indexes all three buffers
+  %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
+  %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+  %src = load volatile i32, i32 addrspace(1)* %in0.gep ; value to extract from
+  %width = load volatile i32, i32 addrspace(1)* %in1.gep ; field width comes from the second buffer (was %in0.gep, leaving %in1.gep dead)
+  %sub = sub i32 32, %width ; shift amount = 32 - width
+  %shl = shl i32 %src, %sub
+  %bfe = lshr i32 %shl, %sub
+  store i32 %bfe, i32 addrspace(1)* %out.gep
+  store volatile i32 %shl, i32 addrspace(1)* undef ; extra volatile use keeps the intermediate %shl live
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_ubfe_sub_i32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: s_load_dword [[WIDTH:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]]
+; GCN: v_bfe_u32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
+define amdgpu_kernel void @s_ubfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { ; unsigned variant with %src and %width passed directly as kernel arguments
+  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() ; work-item id only selects the output slot
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+  %sub = sub i32 32, %width ; shift amount = 32 - width
+  %shl = shl i32 %src, %sub ; push the low `width` bits to the top of the register
+  %bfe = lshr i32 %shl, %sub ; logical shift back down zero-fills the upper bits
+  store i32 %bfe, i32 addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_ubfe_sub_multi_use_shl_i32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: s_load_dword [[WIDTH:s[0-9]+]]
+; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]]
+; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]]
+; GCN-NEXT: s_lshr_b32 s{{[0-9]+}}, [[SHL]], [[SUB]]
+define amdgpu_kernel void @s_ubfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { ; kernel-argument operands, and %shl has a second use
+  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() ; work-item id only selects the output slot
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+  %sub = sub i32 32, %width ; shift amount = 32 - width
+  %shl = shl i32 %src, %sub
+  %bfe = lshr i32 %shl, %sub
+  store i32 %bfe, i32 addrspace(1)* %out.gep
+  store volatile i32 %shl, i32 addrspace(1)* undef ; extra volatile use keeps the intermediate %shl live
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_sbfe_sub_i32:
+; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
+; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[WIDTH]]
+define amdgpu_kernel void @v_sbfe_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { ; signed extract of the low %width bits, operands loaded per work-item
+  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() ; work-item id indexes all three buffers
+  %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
+  %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+  %src = load volatile i32, i32 addrspace(1)* %in0.gep ; value to extract from
+  %width = load volatile i32, i32 addrspace(1)* %in1.gep ; field width comes from the second buffer (was %in0.gep, leaving %in1.gep dead)
+  %sub = sub i32 32, %width ; shift amount = 32 - width
+  %shl = shl i32 %src, %sub ; push the low `width` bits to the top of the register
+  %bfe = ashr i32 %shl, %sub ; arithmetic shift back down sign-extends the field
+  store i32 %bfe, i32 addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_sbfe_sub_multi_use_shl_i32:
+; GCN: {{buffer|flat}}_load_dword [[SRC:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[WIDTH:v[0-9]+]]
+; GCN: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 32, [[WIDTH]]
+
+; SI-NEXT: v_lshl_b32_e32 [[SHL:v[0-9]+]], [[SRC]], [[SUB]]
+; SI-NEXT: v_ashr_i32_e32 [[BFE:v[0-9]+]], [[SHL]], [[SUB]]
+
+; VI-NEXT: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], [[SUB]], [[SRC]]
+; VI-NEXT: v_ashrrev_i32_e32 [[BFE:v[0-9]+]], [[SUB]], [[SHL]]
+
+; GCN: [[BFE]]
+; GCN: [[SHL]]
+define amdgpu_kernel void @v_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #1 { ; same shl/ashr pair as v_sbfe_sub_i32, but %shl has a second use
+  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() ; work-item id indexes all three buffers
+  %in0.gep = getelementptr i32, i32 addrspace(1)* %in0, i32 %id.x
+  %in1.gep = getelementptr i32, i32 addrspace(1)* %in1, i32 %id.x
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+  %src = load volatile i32, i32 addrspace(1)* %in0.gep ; value to extract from
+  %width = load volatile i32, i32 addrspace(1)* %in1.gep ; field width comes from the second buffer (was %in0.gep, leaving %in1.gep dead)
+  %sub = sub i32 32, %width ; shift amount = 32 - width
+  %shl = shl i32 %src, %sub
+  %bfe = ashr i32 %shl, %sub
+  store i32 %bfe, i32 addrspace(1)* %out.gep
+  store volatile i32 %shl, i32 addrspace(1)* undef ; extra volatile use keeps the intermediate %shl live
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_sbfe_sub_i32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: s_load_dword [[WIDTH:s[0-9]+]]
+; GCN: v_mov_b32_e32 [[VWIDTH:v[0-9]+]]
+; GCN: v_bfe_i32 v{{[0-9]+}}, [[SRC]], 0, [[VWIDTH]]
+define amdgpu_kernel void @s_sbfe_sub_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { ; signed variant with %src and %width passed directly as kernel arguments
+  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() ; work-item id only selects the output slot
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+  %sub = sub i32 32, %width ; shift amount = 32 - width
+  %shl = shl i32 %src, %sub ; push the low `width` bits to the top of the register
+  %bfe = ashr i32 %shl, %sub ; arithmetic shift back down sign-extends the field
+  store i32 %bfe, i32 addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_sbfe_sub_multi_use_shl_i32:
+; GCN: s_load_dword [[SRC:s[0-9]+]]
+; GCN: s_load_dword [[WIDTH:s[0-9]+]]
+; GCN: s_sub_i32 [[SUB:s[0-9]+]], 32, [[WIDTH]]
+; GCN-NEXT: s_lshl_b32 [[SHL:s[0-9]+]], [[SRC]], [[SUB]]
+; GCN-NEXT: s_ashr_i32 s{{[0-9]+}}, [[SHL]], [[SUB]]
+define amdgpu_kernel void @s_sbfe_sub_multi_use_shl_i32(i32 addrspace(1)* %out, i32 %src, i32 %width) #1 { ; kernel-argument operands, and %shl has a second use
+  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x() ; work-item id only selects the output slot
+  %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
+  %sub = sub i32 32, %width ; shift amount = 32 - width
+  %shl = shl i32 %src, %sub
+  %bfe = ashr i32 %shl, %sub
+  store i32 %bfe, i32 addrspace(1)* %out.gep
+  store volatile i32 %shl, i32 addrspace(1)* undef ; extra volatile use keeps the intermediate %shl live
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/bfe_uint.ll b/test/CodeGen/AMDGPU/bfe_uint.ll
index 32e3fc26106f485d5431c991e6c05dfee82b5237..2c8c9a5ec932fce84d9b2cd736ec885939f3ef57 100644
--- a/test/CodeGen/AMDGPU/bfe_uint.ll
+++ b/test/CodeGen/AMDGPU/bfe_uint.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: {{^}}bfe_def:
 ; CHECK: BFE_UINT
-define void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @bfe_def(i32 addrspace(1)* %out, i32 %x) {
 entry:
   %0 = lshr i32 %x, 5
   %1 = and i32 %0, 15 ; 0xf
@@ -17,7 +17,7 @@ entry:
 
 ; CHECK: {{^}}bfe_shift:
 ; CHECK-NOT: BFE_UINT
-define void @bfe_shift(i32 addrspace(1)* %out, i32 %x) {
+define amdgpu_kernel void @bfe_shift(i32 addrspace(1)* %out, i32 %x) {
 entry:
   %0 = lshr i32 %x, 16
   %1 = and i32 %0, 65535 ; 0xffff
diff --git a/test/CodeGen/AMDGPU/bfi_int.ll b/test/CodeGen/AMDGPU/bfi_int.ll
index 5156137fd78aebd8c59cc2a3c7fe8d56f0ae44aa..7870e5f378d3046599eed9fc84875083057ae912 100644
--- a/test/CodeGen/AMDGPU/bfi_int.ll
+++ b/test/CodeGen/AMDGPU/bfi_int.ll
@@ -9,7 +9,7 @@
 ; R600: BFI_INT
 ; SI:   @bfi_def
 ; SI:   v_bfi_b32
-define void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+define amdgpu_kernel void @bfi_def(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
   %0 = xor i32 %x, -1
   %1 = and i32 %z, %0
@@ -25,7 +25,7 @@ entry:
 ; R600: BFI_INT
 ; SI:   @bfi_sha256_ch
 ; SI:   v_bfi_b32
-define void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+define amdgpu_kernel void @bfi_sha256_ch(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
   %0 = xor i32 %y, %z
   %1 = and i32 %x, %0
@@ -42,7 +42,7 @@ entry:
 ; SI: v_xor_b32_e32 [[DST:v[0-9]+]], {{s[0-9]+, v[0-9]+}}
 ; SI: v_bfi_b32 {{v[0-9]+}}, [[DST]], {{s[0-9]+, v[0-9]+}}
 
-define void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
+define amdgpu_kernel void @bfi_sha256_ma(i32 addrspace(1)* %out, i32 %x, i32 %y, i32 %z) {
 entry:
   %0 = and i32 %x, %z
   %1 = or i32 %x, %z
diff --git a/test/CodeGen/AMDGPU/bfm.ll b/test/CodeGen/AMDGPU/bfm.ll
index 790458d0d60c8c1bebec06cd3da06ca57c694490..5673995588dae18316f2d1319a5f3786c9a381be 100644
--- a/test/CodeGen/AMDGPU/bfm.ll
+++ b/test/CodeGen/AMDGPU/bfm.ll
@@ -4,7 +4,7 @@
 
 ; FUNC-LABEL: {{^}}bfm_pattern:
 ; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
-define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
   %a = shl i32 1, %x
   %b = sub i32 %a, 1
   %c = shl i32 %b, %y
@@ -14,7 +14,7 @@ define void @bfm_pattern(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
 
 ; FUNC-LABEL: {{^}}bfm_pattern_simple:
 ; SI: s_bfm_b32 {{s[0-9]+}}, {{s[0-9]+}}, 0
-define void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @bfm_pattern_simple(i32 addrspace(1)* %out, i32 %x) #0 {
   %a = shl i32 1, %x
   %b = sub i32 %a, 1
   store i32 %b, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/big_alu.ll b/test/CodeGen/AMDGPU/big_alu.ll
index 0ab22b350f50fbc1e78fb25b7089bbcc04079455..51387c8b79cbffa315541aafc4c27bba28483272 100644
--- a/test/CodeGen/AMDGPU/big_alu.ll
+++ b/test/CodeGen/AMDGPU/big_alu.ll
@@ -2,7 +2,7 @@
 
 ; This test ensures that R600 backend can handle ifcvt properly
 
-define amdgpu_ps void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) {
+define amdgpu_ps void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7, <4 x float> inreg %reg8, <4 x float> inreg %reg9) #0 {
 main_body:
   %tmp = extractelement <4 x float> %reg0, i32 0
   %tmp1 = extractelement <4 x float> %reg0, i32 1
@@ -224,28 +224,31 @@ ENDIF136:                                         ; preds = %ENDIF154, %main_bod
   %result.i = fadd float %mul.i, %one.sub.ac.i
   %tmp204 = fadd float %result.i, 0x3FF4CCCCC0000000
   %tmp205 = fmul float %tmp204, 0x3FE1C71C80000000
-  %tmp206 = call float @llvm.AMDGPU.clamp.f32(float %tmp205, float 0.000000e+00, float 1.000000e+00)
+  %max.0.i = call float @llvm.maxnum.f32(float %tmp205, float 0.000000e+00)
+  %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00)
   %tmp207 = fadd float %result.i, 0x3FF4CCCCC0000000
   %tmp208 = fmul float %tmp207, 0x3FE1C71C80000000
-  %tmp209 = call float @llvm.AMDGPU.clamp.f32(float %tmp208, float 0.000000e+00, float 1.000000e+00)
+  %max.0.i15 = call float @llvm.maxnum.f32(float %tmp208, float 0.000000e+00)
+  %clamp.i16 = call float @llvm.minnum.f32(float %max.0.i15, float 1.000000e+00)
   %tmp210 = fadd float %result.i, 2.000000e+00
   %tmp211 = fmul float %tmp210, 0x3FD611A7A0000000
-  %tmp212 = call float @llvm.AMDGPU.clamp.f32(float %tmp211, float 0.000000e+00, float 1.000000e+00)
-  %tmp213 = fmul float 2.000000e+00, %tmp206
+  %max.0.i13 = call float @llvm.maxnum.f32(float %tmp211, float 0.000000e+00)
+  %clamp.i14 = call float @llvm.minnum.f32(float %max.0.i13, float 1.000000e+00)
+  %tmp213 = fmul float 2.000000e+00, %clamp.i
   %tmp214 = fsub float -0.000000e+00, %tmp213
   %tmp215 = fadd float 3.000000e+00, %tmp214
-  %tmp216 = fmul float %tmp206, %tmp215
-  %tmp217 = fmul float %tmp206, %tmp216
-  %tmp218 = fmul float 2.000000e+00, %tmp209
+  %tmp216 = fmul float %clamp.i, %tmp215
+  %tmp217 = fmul float %clamp.i, %tmp216
+  %tmp218 = fmul float 2.000000e+00, %clamp.i16
   %tmp219 = fsub float -0.000000e+00, %tmp218
   %tmp220 = fadd float 3.000000e+00, %tmp219
-  %tmp221 = fmul float %tmp209, %tmp220
-  %tmp222 = fmul float %tmp209, %tmp221
-  %tmp223 = fmul float 2.000000e+00, %tmp212
+  %tmp221 = fmul float %clamp.i16, %tmp220
+  %tmp222 = fmul float %clamp.i16, %tmp221
+  %tmp223 = fmul float 2.000000e+00, %clamp.i14
   %tmp224 = fsub float -0.000000e+00, %tmp223
   %tmp225 = fadd float 3.000000e+00, %tmp224
-  %tmp226 = fmul float %tmp212, %tmp225
-  %tmp227 = fmul float %tmp212, %tmp226
+  %tmp226 = fmul float %clamp.i14, %tmp225
+  %tmp227 = fmul float %clamp.i14, %tmp226
   %tmp228 = fmul float %tmp26, 0x3F368B5CC0000000
   %tmp229 = fmul float %tmp27, 0x3F368B5CC0000000
   %tmp230 = insertelement <4 x float> undef, float %tmp228, i32 0
@@ -282,28 +285,31 @@ ENDIF136:                                         ; preds = %ENDIF154, %main_bod
   %tmp261 = fmul float %tmp257, 0.000000e+00
   %tmp262 = fadd float %result.i, 0x3FF4CCCCC0000000
   %tmp263 = fmul float %tmp262, 0x3FE1C71C80000000
-  %tmp264 = call float @llvm.AMDGPU.clamp.f32(float %tmp263, float 0.000000e+00, float 1.000000e+00)
+  %max.0.i11 = call float @llvm.maxnum.f32(float %tmp263, float 0.000000e+00)
+  %clamp.i12 = call float @llvm.minnum.f32(float %max.0.i11, float 1.000000e+00)
   %tmp265 = fadd float %result.i, 0x3FF4CCCCC0000000
   %tmp266 = fmul float %tmp265, 0x3FE1C71C80000000
-  %tmp267 = call float @llvm.AMDGPU.clamp.f32(float %tmp266, float 0.000000e+00, float 1.000000e+00)
+  %max.0.i9 = call float @llvm.maxnum.f32(float %tmp266, float 0.000000e+00)
+  %clamp.i10 = call float @llvm.minnum.f32(float %max.0.i9, float 1.000000e+00)
   %tmp268 = fadd float %result.i, 2.000000e+00
   %tmp269 = fmul float %tmp268, 0x3FD611A7A0000000
-  %tmp270 = call float @llvm.AMDGPU.clamp.f32(float %tmp269, float 0.000000e+00, float 1.000000e+00)
-  %tmp271 = fmul float 2.000000e+00, %tmp264
+  %max.0.i7 = call float @llvm.maxnum.f32(float %tmp269, float 0.000000e+00)
+  %clamp.i8 = call float @llvm.minnum.f32(float %max.0.i7, float 1.000000e+00)
+  %tmp271 = fmul float 2.000000e+00, %clamp.i12
   %tmp272 = fsub float -0.000000e+00, %tmp271
   %tmp273 = fadd float 3.000000e+00, %tmp272
-  %tmp274 = fmul float %tmp264, %tmp273
-  %tmp275 = fmul float %tmp264, %tmp274
-  %tmp276 = fmul float 2.000000e+00, %tmp267
+  %tmp274 = fmul float %clamp.i12, %tmp273
+  %tmp275 = fmul float %clamp.i12, %tmp274
+  %tmp276 = fmul float 2.000000e+00, %clamp.i10
   %tmp277 = fsub float -0.000000e+00, %tmp276
   %tmp278 = fadd float 3.000000e+00, %tmp277
-  %tmp279 = fmul float %tmp267, %tmp278
-  %tmp280 = fmul float %tmp267, %tmp279
-  %tmp281 = fmul float 2.000000e+00, %tmp270
+  %tmp279 = fmul float %clamp.i10, %tmp278
+  %tmp280 = fmul float %clamp.i10, %tmp279
+  %tmp281 = fmul float 2.000000e+00, %clamp.i8
   %tmp282 = fsub float -0.000000e+00, %tmp281
   %tmp283 = fadd float 3.000000e+00, %tmp282
-  %tmp284 = fmul float %tmp270, %tmp283
-  %tmp285 = fmul float %tmp270, %tmp284
+  %tmp284 = fmul float %clamp.i8, %tmp283
+  %tmp285 = fmul float %clamp.i8, %tmp284
   %tmp286 = fmul float %tmp26, 0x3F22DFD6A0000000
   %tmp287 = fmul float %tmp27, 0x3F22DFD6A0000000
   %tmp288 = insertelement <4 x float> undef, float %tmp286, i32 0
@@ -390,7 +396,8 @@ ENDIF136:                                         ; preds = %ENDIF154, %main_bod
   %tmp369 = fadd float %tmp368, %tmp367
   %tmp370 = fadd float %tmp369, 0xBFEFAE1480000000
   %tmp371 = fmul float %tmp370, 0xC023FFFFC0000000
-  %tmp372 = call float @llvm.AMDGPU.clamp.f32(float %tmp371, float 0.000000e+00, float 1.000000e+00)
+  %max.0.i5 = call float @llvm.maxnum.f32(float %tmp371, float 0.000000e+00)
+  %clamp.i6 = call float @llvm.minnum.f32(float %max.0.i5, float 1.000000e+00)
   %tmp373 = fsub float -0.000000e+00, %tmp339
   %tmp374 = fadd float %result.i, %tmp373
   %tmp375 = fadd float %tmp374, 0x3FBEB851E0000000
@@ -416,12 +423,13 @@ ENDIF136:                                         ; preds = %ENDIF154, %main_bod
   %tmp395 = fadd float %tmp394, %tmp393
   %tmp396 = fadd float %tmp395, 0xBFEFAE1480000000
   %tmp397 = fmul float %tmp396, 0xC0490001A0000000
-  %tmp398 = call float @llvm.AMDGPU.clamp.f32(float %tmp397, float 0.000000e+00, float 1.000000e+00)
-  %tmp399 = fmul float 2.000000e+00, %tmp372
+  %max.0.i3 = call float @llvm.maxnum.f32(float %tmp397, float 0.000000e+00)
+  %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00)
+  %tmp399 = fmul float 2.000000e+00, %clamp.i6
   %tmp400 = fsub float -0.000000e+00, %tmp399
   %tmp401 = fadd float 3.000000e+00, %tmp400
-  %tmp402 = fmul float %tmp372, %tmp401
-  %tmp403 = fmul float %tmp372, %tmp402
+  %tmp402 = fmul float %clamp.i6, %tmp401
+  %tmp403 = fmul float %clamp.i6, %tmp402
   %one.sub.a.i169 = fsub float 1.000000e+00, %tmp403
   %one.sub.ac.i170 = fmul float %one.sub.a.i169, %tmp349
   %mul.i171 = fmul float %tmp258, %tmp349
@@ -438,11 +446,11 @@ ENDIF136:                                         ; preds = %ENDIF154, %main_bod
   %one.sub.ac.i158 = fmul float %one.sub.a.i157, 0.000000e+00
   %mul.i159 = fmul float %tmp261, 0.000000e+00
   %result.i160 = fadd float %mul.i159, %one.sub.ac.i158
-  %tmp404 = fmul float 2.000000e+00, %tmp398
+  %tmp404 = fmul float 2.000000e+00, %clamp.i4
   %tmp405 = fsub float -0.000000e+00, %tmp404
   %tmp406 = fadd float 3.000000e+00, %tmp405
-  %tmp407 = fmul float %tmp398, %tmp406
-  %tmp408 = fmul float %tmp398, %tmp407
+  %tmp407 = fmul float %clamp.i4, %tmp406
+  %tmp408 = fmul float %clamp.i4, %tmp407
   %one.sub.a.i153 = fsub float 1.000000e+00, %tmp408
   %one.sub.ac.i154 = fmul float %one.sub.a.i153, %tmp375
   %mul.i155 = fmul float %tmp258, %tmp375
@@ -1157,12 +1165,13 @@ IF179:                                            ; preds = %ENDIF175
   %tmp882 = fadd float %tmp881, %tmp880
   %tmp883 = fadd float %tmp882, 0xBFEFAE1480000000
   %tmp884 = fmul float %tmp883, 0xC043FFFE20000000
-  %tmp885 = call float @llvm.AMDGPU.clamp.f32(float %tmp884, float 0.000000e+00, float 1.000000e+00)
-  %tmp886 = fmul float 2.000000e+00, %tmp885
+  %max.0.i1 = call float @llvm.maxnum.f32(float %tmp884, float 0.000000e+00)
+  %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00)
+  %tmp886 = fmul float 2.000000e+00, %clamp.i2
   %tmp887 = fsub float -0.000000e+00, %tmp886
   %tmp888 = fadd float 3.000000e+00, %tmp887
-  %tmp889 = fmul float %tmp885, %tmp888
-  %tmp890 = fmul float %tmp885, %tmp889
+  %tmp889 = fmul float %clamp.i2, %tmp888
+  %tmp890 = fmul float %clamp.i2, %tmp889
   %one.sub.a.i41 = fsub float 1.000000e+00, %tmp890
   %one.sub.ac.i42 = fmul float %one.sub.a.i41, %tmp866
   %mul.i43 = fmul float %temp84.5, %tmp866
@@ -1288,25 +1297,14 @@ ENDIF178:                                         ; preds = %IF179, %ENDIF175
   ret void
 }
 
-; Function Attrs: nounwind readnone
-declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #0
-
-; Function Attrs: nounwind readnone
-declare float @llvm.r600.recipsqrt.clamped.f32(float) #0
-
-; Function Attrs: nounwind readonly
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
+declare float @llvm.r600.recipsqrt.clamped.f32(float) #1
 declare float @llvm.fabs.f32(float) #1
-
-; Function Attrs: nounwind readnone
-declare float @llvm.exp2.f32(float) #0
-
-; Function Attrs: nounwind readnone
-declare float @llvm.AMDGPU.clamp.f32(float, float, float) #0
-
+declare float @llvm.exp2.f32(float) #1
 declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
+declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
 
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.r600.tex(<4 x float>, i32, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind readonly }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/bitcast-vector-extract.ll b/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
index 3a55870c2882313c4a15f74b244bb295c264d342..cf95f74afb84d887fd79ce108a3c4b76479307c3 100644
--- a/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
+++ b/test/CodeGen/AMDGPU/bitcast-vector-extract.ll
@@ -11,7 +11,7 @@
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
-define void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %out, <8 x i32> %vec) {
   %vec0.bc = bitcast <8 x i32> <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 8> to <8 x float>
   store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
 
@@ -27,7 +27,7 @@ define void @store_bitcast_constant_v8i32_to_v8f32(<8 x float> addrspace(1)* %ou
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
-define void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %out, <4 x i64> %vec) {
   %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <8 x float>
   store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
 
@@ -43,7 +43,7 @@ define void @store_bitcast_constant_v4i64_to_v8f32(<8 x float> addrspace(1)* %ou
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
-define void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %vec) {
   %vec0.bc = bitcast <4 x i64> <i64 7, i64 7, i64 7, i64 8> to <4 x double>
   store volatile <4 x double> %vec0.bc, <4 x double> addrspace(1)* %out
 
@@ -59,7 +59,7 @@ define void @store_bitcast_constant_v4i64_to_v4f64(<4 x double> addrspace(1)* %o
 ; GCN: buffer_store_dwordx4
 ; GCN-NOT: v_mov_b32
 ; GCN: buffer_store_dwordx4
-define void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) {
+define amdgpu_kernel void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %out, <16 x i16> %vec) {
   %vec0.bc = bitcast <16 x i16> <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 8> to <8 x float>
   store volatile <8 x float> %vec0.bc, <8 x float> addrspace(1)* %out
 
@@ -67,3 +67,27 @@ define void @store_bitcast_constant_v8i32_to_v16i16(<8 x float> addrspace(1)* %o
   store volatile <8 x float> %vec1.bc, <8 x float> addrspace(1)* %out
   ret void
 }
+
+; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source:
+; GCN-NOT: store_dword
+define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source(<2 x i32> addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 { ; stores a bitcast of an icmp-intrinsic result whose condition code is a kernel argument
+  %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1 ; condition code %c is a runtime value; NOTE(review) the names suggest this is expected to lower to undef - confirm
+  %bc = bitcast i64 %undef to <2 x i32> ; reinterpret the 64-bit result as two dwords
+  store volatile <2 x i32> %bc, <2 x i32> addrspace(1)* %out ; the check above this function expects no store_dword in the output
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_value_lowered_to_undef_bitcast_source_extractelt:
+; GCN-NOT: store_dword
+define amdgpu_kernel void @store_value_lowered_to_undef_bitcast_source_extractelt(i32 addrspace(1)* %out, i64 %a, i64 %b, i32 %c) #0 { ; as the previous test, but only element 1 of the bitcast vector is stored
+  %undef = call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 %c) #1 ; condition code %c is a runtime value; NOTE(review) the names suggest this is expected to lower to undef - confirm
+  %bc = bitcast i64 %undef to <2 x i32> ; reinterpret the 64-bit result as two dwords
+  %elt1 = extractelement <2 x i32> %bc, i32 1 ; take the second dword of the pair
+  store volatile i32 %elt1, i32 addrspace(1)* %out ; the check above this function expects no store_dword in the output
+  ret void
+}
+
+declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone convergent }
diff --git a/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll b/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
index f7dc1a9d37e8a70cf34a5669df2a273fdf94f48e..3616ec1f45d31394abe8c0f01221a0407daa4ed3 100644
--- a/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
+++ b/test/CodeGen/AMDGPU/bitreverse-inline-immediates.ll
@@ -7,7 +7,7 @@
 ; GCN-LABEL: {{^}}materialize_0_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_0_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_0_i32(i32 addrspace(1)* %out) {
   store i32 0, i32 addrspace(1)* %out
   ret void
 }
@@ -16,7 +16,7 @@ define void @materialize_0_i32(i32 addrspace(1)* %out) {
 ; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_0_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_0_i64(i64 addrspace(1)* %out) {
   store i64 0, i64 addrspace(1)* %out
   ret void
 }
@@ -24,7 +24,7 @@ define void @materialize_0_i64(i64 addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}materialize_neg1_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -1{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_neg1_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_neg1_i32(i32 addrspace(1)* %out) {
   store i32 -1, i32 addrspace(1)* %out
   ret void
 }
@@ -33,7 +33,7 @@ define void @materialize_neg1_i32(i32 addrspace(1)* %out) {
 ; GCN: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
 ; GCN: v_mov_b32_e32 v[[HIK:[0-9]+]], v[[LOK]]{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_neg1_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_neg1_i64(i64 addrspace(1)* %out) {
   store i64 -1, i64 addrspace(1)* %out
   ret void
 }
@@ -41,7 +41,7 @@ define void @materialize_neg1_i64(i64 addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}materialize_signbit_i32:
 ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_signbit_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_signbit_i32(i32 addrspace(1)* %out) {
   store i32 -2147483648, i32 addrspace(1)* %out
   ret void
 }
@@ -50,7 +50,7 @@ define void @materialize_signbit_i32(i32 addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_signbit_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_signbit_i64(i64 addrspace(1)* %out) {
   store i64  -9223372036854775808, i64 addrspace(1)* %out
   ret void
 }
@@ -58,7 +58,7 @@ define void @materialize_signbit_i64(i64 addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}materialize_rev_neg16_i32:
 ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], -16{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) {
   store i32 268435455, i32 addrspace(1)* %out
   ret void
 }
@@ -67,7 +67,7 @@ define void @materialize_rev_neg16_i32(i32 addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], -16{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) {
   store i64  1152921504606846975, i64 addrspace(1)* %out
   ret void
 }
@@ -75,7 +75,7 @@ define void @materialize_rev_neg16_i64(i64 addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}materialize_rev_neg17_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xf7ffffff{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) {
   store i32 -134217729, i32 addrspace(1)* %out
   ret void
 }
@@ -84,7 +84,7 @@ define void @materialize_rev_neg17_i32(i32 addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], -1{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0xf7ffffff{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) {
   store i64 -576460752303423489, i64 addrspace(1)* %out
   ret void
 }
@@ -92,7 +92,7 @@ define void @materialize_rev_neg17_i64(i64 addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}materialize_rev_64_i32:
 ; GCN: v_bfrev_b32_e32 [[K:v[0-9]+]], 64{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_64_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_64_i32(i32 addrspace(1)* %out) {
   store i32 33554432, i32 addrspace(1)* %out
   ret void
 }
@@ -101,7 +101,7 @@ define void @materialize_rev_64_i32(i32 addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HIK:[0-9]+]], 64{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_64_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_64_i64(i64 addrspace(1)* %out) {
   store i64 144115188075855872, i64 addrspace(1)* %out
   ret void
 }
@@ -109,7 +109,7 @@ define void @materialize_rev_64_i64(i64 addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}materialize_rev_65_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x82000000{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_65_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_65_i32(i32 addrspace(1)* %out) {
   store i32 -2113929216, i32 addrspace(1)* %out
   ret void
 }
@@ -118,7 +118,7 @@ define void @materialize_rev_65_i32(i32 addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0x82000000{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_65_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_65_i64(i64 addrspace(1)* %out) {
   store i64 -9079256848778919936, i64 addrspace(1)* %out
   ret void
 }
@@ -126,7 +126,7 @@ define void @materialize_rev_65_i64(i64 addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}materialize_rev_3_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], -2.0{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_3_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_3_i32(i32 addrspace(1)* %out) {
   store i32 -1073741824, i32 addrspace(1)* %out
   ret void
 }
@@ -135,7 +135,7 @@ define void @materialize_rev_3_i32(i32 addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], -2.0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_3_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_3_i64(i64 addrspace(1)* %out) {
   store i64 -4611686018427387904, i64 addrspace(1)* %out
   ret void
 }
@@ -143,7 +143,7 @@ define void @materialize_rev_3_i64(i64 addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}materialize_rev_1.0_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1fc{{$}}
 ; GCN: buffer_store_dword [[K]]
-define void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) {
   store i32 508, i32 addrspace(1)* %out
   ret void
 }
@@ -152,70 +152,70 @@ define void @materialize_rev_1.0_i32(i32 addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LOK:[0-9]+]], 0x1fc{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HIK:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOK]]:[[HIK]]{{\]}}
-define void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @materialize_rev_1.0_i64(i64 addrspace(1)* %out) {
   store i64 508, i64 addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_0_i32:
 ; GCN: s_mov_b32 s{{[0-9]+}}, 0{{$}}
-define void @s_materialize_0_i32() {
+define amdgpu_kernel void @s_materialize_0_i32() {
   call void asm sideeffect "; use $0", "s"(i32 0)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_1_i32:
 ; GCN: s_mov_b32 s{{[0-9]+}}, 1{{$}}
-define void @s_materialize_1_i32() {
+define amdgpu_kernel void @s_materialize_1_i32() {
   call void asm sideeffect "; use $0", "s"(i32 1)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_neg1_i32:
 ; GCN: s_mov_b32 s{{[0-9]+}}, -1{{$}}
-define void @s_materialize_neg1_i32() {
+define amdgpu_kernel void @s_materialize_neg1_i32() {
   call void asm sideeffect "; use $0", "s"(i32 -1)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_signbit_i32:
 ; GCN: s_brev_b32 s{{[0-9]+}}, 1{{$}}
-define void @s_materialize_signbit_i32() {
+define amdgpu_kernel void @s_materialize_signbit_i32() {
   call void asm sideeffect "; use $0", "s"(i32 -2147483648)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_rev_64_i32:
 ; GCN: s_brev_b32 s{{[0-9]+}}, 64{{$}}
-define void @s_materialize_rev_64_i32() {
+define amdgpu_kernel void @s_materialize_rev_64_i32() {
   call void asm sideeffect "; use $0", "s"(i32 33554432)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_rev_65_i32:
 ; GCN: s_mov_b32 s{{[0-9]+}}, 0x82000000{{$}}
-define void @s_materialize_rev_65_i32() {
+define amdgpu_kernel void @s_materialize_rev_65_i32() {
   call void asm sideeffect "; use $0", "s"(i32 -2113929216)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_rev_neg16_i32:
 ; GCN: s_brev_b32 s{{[0-9]+}}, -16{{$}}
-define void @s_materialize_rev_neg16_i32() {
+define amdgpu_kernel void @s_materialize_rev_neg16_i32() {
   call void asm sideeffect "; use $0", "s"(i32 268435455)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_rev_neg17_i32:
 ; GCN: s_mov_b32 s{{[0-9]+}}, 0xf7ffffff{{$}}
-define void @s_materialize_rev_neg17_i32() {
+define amdgpu_kernel void @s_materialize_rev_neg17_i32() {
   call void asm sideeffect "; use $0", "s"(i32 -134217729)
   ret void
 }
 
 ; GCN-LABEL: {{^}}s_materialize_rev_1.0_i32:
 ; GCN: s_movk_i32 s{{[0-9]+}}, 0x1fc{{$}}
-define void @s_materialize_rev_1.0_i32() {
+define amdgpu_kernel void @s_materialize_rev_1.0_i32() {
   call void asm sideeffect "; use $0", "s"(i32 508)
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/bitreverse.ll b/test/CodeGen/AMDGPU/bitreverse.ll
index 43a4200cb3bd98a52ab422dcb17aad61cf0c2ffe..539373f7bdeb4e27faffcfb68135603cc5556ef5 100644
--- a/test/CodeGen/AMDGPU/bitreverse.ll
+++ b/test/CodeGen/AMDGPU/bitreverse.ll
@@ -14,7 +14,7 @@ declare <4 x i64> @llvm.bitreverse.v4i64(<4 x i64>) #1
 
 ; FUNC-LABEL: {{^}}s_brev_i16:
 ; SI: s_brev_b32 
-define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
+define amdgpu_kernel void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
   %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
   store i16 %brev, i16 addrspace(1)* %out
   ret void
@@ -22,7 +22,7 @@ define void @s_brev_i16(i16 addrspace(1)* noalias %out, i16 %val) #0 {
 
 ; FUNC-LABEL: {{^}}v_brev_i16:
 ; SI: v_bfrev_b32_e32
-define void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) #0 {
   %val = load i16, i16 addrspace(1)* %valptr
   %brev = call i16 @llvm.bitreverse.i16(i16 %val) #1
   store i16 %brev, i16 addrspace(1)* %out
@@ -35,7 +35,7 @@ define void @v_brev_i16(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalia
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; SI: buffer_store_dword [[VRESULT]],
 ; SI: s_endpgm
-define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
+define amdgpu_kernel void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
   %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
   store i32 %brev, i32 addrspace(1)* %out
   ret void
@@ -46,7 +46,7 @@ define void @s_brev_i32(i32 addrspace(1)* noalias %out, i32 %val) #0 {
 ; SI: v_bfrev_b32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
   %val = load i32, i32 addrspace(1)* %valptr
   %brev = call i32 @llvm.bitreverse.i32(i32 %val) #1
   store i32 %brev, i32 addrspace(1)* %out
@@ -56,7 +56,7 @@ define void @v_brev_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalia
 ; FUNC-LABEL: {{^}}s_brev_v2i32:
 ; SI: s_brev_b32
 ; SI: s_brev_b32
-define void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 {
+define amdgpu_kernel void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val) #0 {
   %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
   store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
   ret void
@@ -65,7 +65,7 @@ define void @s_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> %val)
 ; FUNC-LABEL: {{^}}v_brev_v2i32:
 ; SI: v_bfrev_b32_e32
 ; SI: v_bfrev_b32_e32
-define void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) #0 {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr
   %brev = call <2 x i32> @llvm.bitreverse.v2i32(<2 x i32> %val) #1
   store <2 x i32> %brev, <2 x i32> addrspace(1)* %out
@@ -73,7 +73,7 @@ define void @v_brev_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrsp
 }
 
 ; FUNC-LABEL: {{^}}s_brev_i64:
-define void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
+define amdgpu_kernel void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
   %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
   store i64 %brev, i64 addrspace(1)* %out
   ret void
@@ -81,7 +81,7 @@ define void @s_brev_i64(i64 addrspace(1)* noalias %out, i64 %val) #0 {
 
 ; FUNC-LABEL: {{^}}v_brev_i64:
 ; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0
-define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %valptr) #0 {
   %val = load i64, i64 addrspace(1)* %valptr
   %brev = call i64 @llvm.bitreverse.i64(i64 %val) #1
   store i64 %brev, i64 addrspace(1)* %out
@@ -89,14 +89,14 @@ define void @v_brev_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalia
 }
 
 ; FUNC-LABEL: {{^}}s_brev_v2i64:
-define void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 {
+define amdgpu_kernel void @s_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> %val) #0 {
   %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
   store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_brev_v2i64:
-define void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_brev_v2i64(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %valptr) #0 {
   %val = load <2 x i64>, <2 x i64> addrspace(1)* %valptr
   %brev = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> %val) #1
   store <2 x i64> %brev, <2 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/br_cc.f16.ll b/test/CodeGen/AMDGPU/br_cc.f16.ll
index 4ae15e8ea45e7f3fb5556951eddf14be1924858e..b7a0c8738dfaf4b382e6b44e62a2c16df6071572 100644
--- a/test/CodeGen/AMDGPU/br_cc.f16.ll
+++ b/test/CodeGen/AMDGPU/br_cc.f16.ll
@@ -5,23 +5,22 @@
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
 
-; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
-; SI:  v_cmp_nlt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
+; SI:  v_cmp_nlt_f32_e32 vcc, v[[B_F32]], v[[A_F32]]
 ; VI:  v_cmp_nlt_f16_e32 vcc, v[[A_F16]], v[[B_F16]]
 ; GCN: s_cbranch_vccnz
 
 ; GCN: one{{$}}
-; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[A_F32]]
-; SI: s_branch
-; VI: buffer_store_short
-; VI: s_endpgm
+; SI: v_cvt_f16_f32_e32 v[[A_F16:[0-9]+]], v[[B_F32]]
+; GCN: buffer_store_short
+; GCN: s_endpgm
 
 ; GCN: two{{$}}
-; SI:  v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
+; SI:  v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_short v[[B_F16]]
 ; GCN: s_endpgm
-define void @br_cc_f16(
+define amdgpu_kernel void @br_cc_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -60,7 +59,7 @@ two:
 ; GCN: two{{$}}
 ; SI:  v_cvt_f16_f32_e32 v[[B_F16:[0-9]+]], v[[B_F32]]
 
-define void @br_cc_f16_imm_a(
+define amdgpu_kernel void @br_cc_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -93,7 +92,7 @@ two:
 ; VI:  v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x3800{{$}}
 ; GCN: buffer_store_short v[[B_F16]]
 ; GCN: s_endpgm
-define void @br_cc_f16_imm_b(
+define amdgpu_kernel void @br_cc_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/branch-condition-and.ll b/test/CodeGen/AMDGPU/branch-condition-and.ll
index 94616a4be8fd89481460a765db37273ae124968b..68b77ea3490e5ccc42efc77b943946677e834713 100644
--- a/test/CodeGen/AMDGPU/branch-condition-and.ll
+++ b/test/CodeGen/AMDGPU/branch-condition-and.ll
@@ -15,12 +15,16 @@
 ; GCN: s_and_b64 [[AND:s\[[0-9]+:[0-9]+\]]], vcc, [[OTHERCC]]
 ; GCN: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[AND]]
 ; GCN: s_xor_b64 {{s\[[0-9]+:[0-9]+\]}}, exec, [[SAVED]]
-;
-; TODO: The following sequence is a bug (missing s_endpgm)!
-;
-; GCN: s_branch [[BB:BB[0-9]+_[0-9]+]]
-; GCN: [[BB]]:
-; GCN-NEXT: .Lfunc_end0:
+; GCN: ; mask branch [[BB5:BB[0-9]+_[0-9]+]]
+
+; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %bb4
+; GCN: ds_write_b32
+; GCN: s_waitcnt
+
+; GCN-NEXT: [[BB5]]
+; GCN: s_or_b64 exec, exec
+; GCN-NEXT: s_endpgm
+; GCN-NEXT: .Lfunc_end
 define amdgpu_ps void @ham(float %arg, float %arg1) #0 {
 bb:
   %tmp = fcmp ogt float %arg, 0.000000e+00
@@ -29,6 +33,7 @@ bb:
   br i1 %tmp3, label %bb4, label %bb5
 
 bb4:                                              ; preds = %bb
+  store volatile i32 4, i32 addrspace(3)* undef
   unreachable
 
 bb5:                                              ; preds = %bb
diff --git a/test/CodeGen/AMDGPU/branch-relax-spill.ll b/test/CodeGen/AMDGPU/branch-relax-spill.ll
index 86b8dd89e7d0437a011d4c869ef557d109231035..ede15559c4ffa545892d9d443bec488839799da8 100644
--- a/test/CodeGen/AMDGPU/branch-relax-spill.ll
+++ b/test/CodeGen/AMDGPU/branch-relax-spill.ll
@@ -5,7 +5,7 @@
 
 ; FAIL: LLVM ERROR: Error while trying to spill VCC from class SReg_64: Cannot scavenge register without an emergency spill slot!
 
-define void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 {
+define amdgpu_kernel void @spill(i32 addrspace(1)* %arg, i32 %cnd) #0 {
 entry:
   %sgpr0 = tail call i32 asm sideeffect "s_mov_b32 s0, 0", "={SGPR0}"() #0
   %sgpr1 = tail call i32 asm sideeffect "s_mov_b32 s1, 0", "={SGPR1}"() #0
diff --git a/test/CodeGen/AMDGPU/branch-relaxation.ll b/test/CodeGen/AMDGPU/branch-relaxation.ll
index 39505404a8681b291cfa3f458ce39378cf74b358..263059d4a6ed0014f781b144434e517fce439023 100644
--- a/test/CodeGen/AMDGPU/branch-relaxation.ll
+++ b/test/CodeGen/AMDGPU/branch-relaxation.ll
@@ -26,7 +26,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
 ; GCN: buffer_store_dword [[V_CND]]
 ; GCN: s_endpgm
-define void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
+define amdgpu_kernel void @uniform_conditional_max_short_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
 bb:
   %cmp = icmp eq i32 %cnd, 0
   br i1 %cmp, label %bb3, label %bb2 ; +8 dword branch
@@ -68,7 +68,7 @@ bb3:
 ; GCN: v_mov_b32_e32 [[V_CND:v[0-9]+]], [[CND]]
 ; GCN: buffer_store_dword [[V_CND]]
 ; GCN: s_endpgm
-define void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
+define amdgpu_kernel void @uniform_conditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %cnd) #0 {
 bb0:
   %cmp = icmp eq i32 %cnd, 0
   br i1 %cmp, label %bb3, label %bb2 ; +9 dword branch
@@ -108,7 +108,7 @@ bb3:
 ; GCN: [[ENDBB]]:
 ; GCN: buffer_store_dword [[V_CND]]
 ; GCN: s_endpgm
-define void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 {
+define amdgpu_kernel void @uniform_conditional_min_long_forward_vcnd_branch(float addrspace(1)* %arg, float %cnd) #0 {
 bb0:
   %cmp = fcmp oeq float %cnd, 0.0
   br i1 %cmp, label %bb3, label %bb2 ; + 8 dword branch
@@ -141,7 +141,7 @@ bb3:
 ; GCN: s_or_b64 exec, exec, [[SAVE]]
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
-define void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @min_long_forward_vbranch(i32 addrspace(1)* %arg) #0 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = zext i32 %tid to i64
@@ -188,7 +188,7 @@ bb3:
 
 ; GCN-NEXT: [[ENDBB]]:
 ; GCN-NEXT: s_endpgm
-define void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 {
+define amdgpu_kernel void @long_backward_sbranch(i32 addrspace(1)* %arg) #0 {
 bb:
   br label %bb2
 
@@ -243,7 +243,7 @@ bb3:
 ; GCN: buffer_store_dword [[BB4_K]]
 ; GCN-NEXT: s_endpgm
 ; GCN-NEXT: .Lfunc_end{{[0-9]+}}:
-define void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
+define amdgpu_kernel void @uniform_unconditional_min_long_forward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
 bb0:
   %tmp = icmp ne i32 %arg1, 0
   br i1 %tmp, label %bb2, label %bb3
@@ -285,7 +285,7 @@ bb4:
 ; GCN-NEXT: s_subb_u32 vcc_hi, vcc_hi, 0{{$}}
 ; GCN-NEXT: s_setpc_b64 vcc
 ; GCN-NEXT .Lfunc_end{{[0-9]+}}:
-define void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
+define amdgpu_kernel void @uniform_unconditional_min_long_backward_branch(i32 addrspace(1)* %arg, i32 %arg1) {
 entry:
   br label %loop
 
@@ -335,8 +335,14 @@ loop:
 ; GCN-NEXT: ;;#ASMEND
 
 ; GCN-NEXT: [[BB3]]: ; %bb3
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: ;;#ASMEND
+; GCN-NEXT: ;;#ASMSTART
+; GCN-NEXT: v_nop_e64
+; GCN-NEXT: ;;#ASMEND
 ; GCN-NEXT: s_endpgm
-define void @expand_requires_expand(i32 %cond0) #0 {
+define amdgpu_kernel void @expand_requires_expand(i32 %cond0) #0 {
 bb0:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %cmp0 = icmp slt i32 %cond0, 0
@@ -356,6 +362,12 @@ bb2:
   br label %bb3
 
 bb3:
+; These NOPs prevent tail-duplication-based outlining from firing; if it
+; fired, the branches would no longer need expanding, defeating this test.
+  call void asm sideeffect
+   "v_nop_e64", ""() #0
+  call void asm sideeffect
+   "v_nop_e64", ""() #0
   ret void
 }
 
@@ -385,8 +397,9 @@ bb3:
 
 ; GCN-NEXT: [[ENDIF]]: ; %endif
 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
+; GCN-NEXT: s_sleep 5
 ; GCN-NEXT: s_endpgm
-define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
+define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %d_cmp = icmp ult i32 %tid, 16
@@ -402,6 +415,9 @@ if_uniform:
   br label %endif
 
 endif:
+  ; layout can remove the split branch if it can copy the return block.
+  ; This call makes the return block long enough that it doesn't get copied.
+  call void @llvm.amdgcn.s.sleep(i32 5);
   ret void
 }
 
@@ -446,7 +462,7 @@ endif:
 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
 ; GCN: buffer_store_dword
 ; GCN-NEXT: s_endpgm
-define void @analyze_mask_branch() #0 {
+define amdgpu_kernel void @analyze_mask_branch() #0 {
 entry:
   %reg = call float asm sideeffect "v_mov_b32_e64 $0, 0", "=v"()
   %cmp0 = fcmp ogt float %reg, 0.000000e+00
@@ -475,7 +491,8 @@ ret:
 
 ; GCN-LABEL: {{^}}long_branch_hang:
 ; GCN: s_cmp_lt_i32 s{{[0-9]+}}, 6
-; GCN-NEXT: s_cbranch_scc0 [[LONG_BR_0:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: s_cbranch_scc1 {{BB[0-9]+_[0-9]+}}
+; GCN-NEXT: s_branch [[LONG_BR_0:BB[0-9]+_[0-9]+]]
 ; GCN-NEXT: BB{{[0-9]+_[0-9]+}}:
 
 ; GCN: s_add_u32 vcc_lo, vcc_lo, [[LONG_BR_DEST0:BB[0-9]+_[0-9]+]]-(
diff --git a/test/CodeGen/AMDGPU/bswap.ll b/test/CodeGen/AMDGPU/bswap.ll
index c68951731098cdbff37afdbdc193b213bee11de8..d2dacd7c17b3f119a20119e92b25172f5e6e6a76 100644
--- a/test/CodeGen/AMDGPU/bswap.ll
+++ b/test/CodeGen/AMDGPU/bswap.ll
@@ -17,7 +17,7 @@ declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) nounwind readnone
 ; SI: v_bfi_b32 [[RESULT:v[0-9]+]], [[K]], [[TMP1]], [[TMP0]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %bswap = call i32 @llvm.bswap.i32(i32 %val) nounwind readnone
   store i32 %bswap, i32 addrspace(1)* %out, align 4
@@ -32,7 +32,7 @@ define void @test_bswap_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounw
 ; SI-DAG: v_alignbit_b32
 ; SI-DAG: v_bfi_b32
 ; SI: s_endpgm
-define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
   %bswap = call <2 x i32> @llvm.bswap.v2i32(<2 x i32> %val) nounwind readnone
   store <2 x i32> %bswap, <2 x i32> addrspace(1)* %out, align 8
@@ -53,7 +53,7 @@ define void @test_bswap_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(
 ; SI-DAG: v_alignbit_b32
 ; SI-DAG: v_bfi_b32
 ; SI: s_endpgm
-define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) nounwind {
   %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
   %bswap = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %val) nounwind readnone
   store <4 x i32> %bswap, <4 x i32> addrspace(1)* %out, align 16
@@ -86,7 +86,7 @@ define void @test_bswap_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(
 ; SI-DAG: v_alignbit_b32
 ; SI-DAG: v_bfi_b32
 ; SI: s_endpgm
-define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) nounwind {
   %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
   %bswap = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %val) nounwind readnone
   store <8 x i32> %bswap, <8 x i32> addrspace(1)* %out, align 32
@@ -95,21 +95,21 @@ define void @test_bswap_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(
 
 ; FUNC-LABEL: {{^}}test_bswap_i64:
 ; SI-NOT: v_or_b32_e64 v{{[0-9]+}}, 0, 0
-define void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
   %val = load i64, i64 addrspace(1)* %in, align 8
   %bswap = call i64 @llvm.bswap.i64(i64 %val) nounwind readnone
   store i64 %bswap, i64 addrspace(1)* %out, align 8
   ret void
 }
 
-define void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) nounwind {
   %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
   %bswap = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %val) nounwind readnone
   store <2 x i64> %bswap, <2 x i64> addrspace(1)* %out, align 16
   ret void
 }
 
-define void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_bswap_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) nounwind {
   %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
   %bswap = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %val) nounwind readnone
   store <4 x i64> %bswap, <4 x i64> addrspace(1)* %out, align 32
diff --git a/test/CodeGen/AMDGPU/build_vector.ll b/test/CodeGen/AMDGPU/build_vector.ll
index 0a5774c601d3b0f3dd65fc08e88457b1d9f8edde..d77b0ab9fbb65c920af71d594c67a7a34ff35efa 100644
--- a/test/CodeGen/AMDGPU/build_vector.ll
+++ b/test/CodeGen/AMDGPU/build_vector.ll
@@ -10,7 +10,7 @@
 ; SI-DAG: v_mov_b32_e32 v[[X:[0-9]]], 5
 ; SI-DAG: v_mov_b32_e32 v[[Y:[0-9]]], 6
 ; SI: buffer_store_dwordx2 v{{\[}}[[X]]:[[Y]]{{\]}}
-define void @build_vector2 (<2 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @build_vector2 (<2 x i32> addrspace(1)* %out) {
 entry:
   store <2 x i32> <i32 5, i32 6>, <2 x i32> addrspace(1)* %out
   ret void
@@ -28,7 +28,7 @@ entry:
 ; SI-DAG: v_mov_b32_e32 v[[Z:[0-9]]], 7
 ; SI-DAG: v_mov_b32_e32 v[[W:[0-9]]], 8
 ; SI: buffer_store_dwordx4 v{{\[}}[[X]]:[[W]]{{\]}}
-define void @build_vector4 (<4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @build_vector4 (<4 x i32> addrspace(1)* %out) {
 entry:
   store <4 x i32> <i32 5, i32 6, i32 7, i32 8>, <4 x i32> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/call.ll b/test/CodeGen/AMDGPU/call.ll
index 6d101e1537cc8a1f03749fbe52488fd4a6bc1a60..769c7bb3eee7ae5e3c147d8d850b21fd65fa18fe 100644
--- a/test/CodeGen/AMDGPU/call.ll
+++ b/test/CodeGen/AMDGPU/call.ll
@@ -10,7 +10,7 @@
 
 declare i32 @external_function(i32) nounwind
 
-define void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_call_external(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -25,7 +25,7 @@ define i32 @defined_function(i32 %x) nounwind noinline {
   ret i32 %y
 }
 
-define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -35,7 +35,7 @@ define void @test_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   ret void
 }
 
-define void @test_tail_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_tail_call(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
diff --git a/test/CodeGen/AMDGPU/calling-conventions.ll b/test/CodeGen/AMDGPU/calling-conventions.ll
index 57adc8be6a997a07254a2b5d6e09bb6a3c96b88f..677147b6f4e53f450733c85e52654181e672c3e9 100644
--- a/test/CodeGen/AMDGPU/calling-conventions.ll
+++ b/test/CodeGen/AMDGPU/calling-conventions.ll
@@ -1,9 +1,10 @@
-; RUN: llc < %s -march=amdgcn -mcpu=SI -verify-machineinstrs | FileCheck --check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; Make sure we don't crash or assert on spir_kernel calling convention.
 
-; SI-LABEL: {{^}}kernel:
-; SI: s_endpgm
+; GCN-LABEL: {{^}}kernel:
+; GCN: s_endpgm
 define spir_kernel void @kernel(i32 addrspace(1)* %out) {
 entry:
   store i32 0, i32 addrspace(1)* %out
@@ -11,10 +12,34 @@ entry:
 }
 
 ; FIXME: This is treated like a kernel
-; SI-LABEL: {{^}}func:
-; SI: s_endpgm
-define spir_func void @func(i32 addrspace(1)* %out) {
-entry:
-  store i32 0, i32 addrspace(1)* %out
-  ret void
+; XGCN-LABEL: {{^}}func:
+; XGCN: s_endpgm
+; define spir_func void @func(i32 addrspace(1)* %out) {
+; entry:
+;   store i32 0, i32 addrspace(1)* %out
+;   ret void
+; }
+
+; GCN-LABEL: {{^}}ps_ret_cc_f16:
+; SI: v_cvt_f16_f32_e32 v0, v0
+; SI: v_cvt_f32_f16_e32 v0, v0
+; SI: v_add_f32_e32 v0, 1.0, v0
+
+; VI: v_add_f16_e32 v0, 1.0, v0
+; VI: ; return
+define amdgpu_ps half @ps_ret_cc_f16(half %arg0) {
+  %add = fadd half %arg0, 1.0
+  ret half %add
+}
+
+; GCN-LABEL: {{^}}ps_ret_cc_inreg_f16:
+; SI: v_cvt_f16_f32_e32 v0, s0
+; SI: v_cvt_f32_f16_e32 v0, v0
+; SI: v_add_f32_e32 v0, 1.0, v0
+
+; VI: v_add_f16_e64 v0, s0, 1.0
+; VI: ; return
+define amdgpu_ps half @ps_ret_cc_inreg_f16(half inreg %arg0) {
+  %add = fadd half %arg0, 1.0
+  ret half %add
 }
diff --git a/test/CodeGen/AMDGPU/captured-frame-index.ll b/test/CodeGen/AMDGPU/captured-frame-index.ll
index 49af159581f7a2b7c64cc8b87618e2f8b1be8444..5fe1b2728506e366b62373ee44b61e80c10b6c03 100644
--- a/test/CodeGen/AMDGPU/captured-frame-index.ll
+++ b/test/CodeGen/AMDGPU/captured-frame-index.ll
@@ -1,24 +1,24 @@
 ; RUN: llc -mtriple=amdgcn-- -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}store_fi_lifetime:
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
 ; GCN: buffer_store_dword [[FI]]
-define void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @store_fi_lifetime(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %b = alloca i8
-  call void @llvm.lifetime.start(i64 1, i8* %b)
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %b)
   store volatile i8* %b, i8* addrspace(1)* undef
-  call void @llvm.lifetime.end(i64 1, i8* %b)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %b)
   ret void
 }
 
 ; GCN-LABEL: {{^}}stored_fi_to_lds:
 ; GCN: s_load_dword [[LDSPTR:s[0-9]+]]
 ; GCN: buffer_store_dword v{{[0-9]+}}, off,
-; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 [[ZERO0:v[0-9]+]], 4{{$}}
 ; GCN: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
 ; GCN: ds_write_b32  [[VLDSPTR]], [[ZERO0]]
-define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
   %tmp = alloca float
   store float 4.0, float *%tmp
   store float* %tmp, float* addrspace(3)* %ptr
@@ -27,18 +27,18 @@ define void @stored_fi_to_lds(float* addrspace(3)* %ptr) #0 {
 
 ; Offset is applied
 ; GCN-LABEL: {{^}}stored_fi_to_lds_2_small_objects:
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
+; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
 
 ; GCN-DAG: s_load_dword [[LDSPTR:s[0-9]+]]
 
 ; GCN-DAG: v_mov_b32_e32 [[VLDSPTR:v[0-9]+]], [[LDSPTR]]
 ; GCN: ds_write_b32  [[VLDSPTR]], [[ZERO]]
 
-; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN-DAG: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
 ; GCN: ds_write_b32  [[VLDSPTR]], [[FI1]]
-define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
   %tmp0 = alloca float
   %tmp1 = alloca float
   store float 4.0, float* %tmp0
@@ -51,10 +51,10 @@ define void @stored_fi_to_lds_2_small_objects(float* addrspace(3)* %ptr) #0 {
 ; Same frame index is used multiple times in the store
 ; GCN-LABEL: {{^}}stored_fi_to_self:
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4d2{{$}}
-; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
-; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
-; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
-define void @stored_fi_to_self() #0 {
+; GCN: buffer_store_dword [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
+; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 4{{$}}
+; GCN: buffer_store_dword [[ZERO]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
+define amdgpu_kernel void @stored_fi_to_self() #0 {
   %tmp = alloca i32*
 
   ; Avoid optimizing everything out
@@ -66,14 +66,14 @@ define void @stored_fi_to_self() #0 {
 
 ; GCN-LABEL: {{^}}stored_fi_to_self_offset:
 ; GCN-DAG: v_mov_b32_e32 [[K0:v[0-9]+]], 32{{$}}
-; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
+; GCN: buffer_store_dword [[K0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
 
 ; GCN-DAG: v_mov_b32_e32 [[K1:v[0-9]+]], 0x4d2{{$}}
-; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2048{{$}}
+; GCN: buffer_store_dword [[K1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}}
 
-; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x800{{$}}
-; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2048{{$}}
-define void @stored_fi_to_self_offset() #0 {
+; GCN: v_mov_b32_e32 [[OFFSETK:v[0-9]+]], 0x804{{$}}
+; GCN: buffer_store_dword [[OFFSETK]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2052{{$}}
+define amdgpu_kernel void @stored_fi_to_self_offset() #0 {
   %tmp0 = alloca [512 x i32]
   %tmp1 = alloca i32*
 
@@ -89,16 +89,16 @@ define void @stored_fi_to_self_offset() #0 {
 }
 
 ; GCN-LABEL: {{^}}stored_fi_to_fi:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}}
 
-; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
-; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
+; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
+; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}}
 
-; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
-; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
-define void @stored_fi_to_fi() #0 {
+; GCN: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
+; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
+define amdgpu_kernel void @stored_fi_to_fi() #0 {
   %tmp0 = alloca i32*
   %tmp1 = alloca i32*
   %tmp2 = alloca i32*
@@ -115,10 +115,10 @@ define void @stored_fi_to_fi() #0 {
 }
 
 ; GCN-LABEL: {{^}}stored_fi_to_global:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
 ; GCN: buffer_store_dword [[FI]]
-define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
   %tmp = alloca float
   store float 0.0, float *%tmp
   store float* %tmp, float* addrspace(1)* %ptr
@@ -127,16 +127,16 @@ define void @stored_fi_to_global(float* addrspace(1)* %ptr) #0 {
 
 ; Offset is applied
 ; GCN-LABEL: {{^}}stored_fi_to_global_2_small_objects:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8{{$}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12{{$}}
 
-; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 4{{$}}
+; GCN: v_mov_b32_e32 [[FI1:v[0-9]+]], 8{{$}}
 ; GCN: buffer_store_dword [[FI1]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 
-; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 8{{$}}
+; GCN-DAG: v_mov_b32_e32 [[FI2:v[0-9]+]], 12{{$}}
 ; GCN: buffer_store_dword [[FI2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
   %tmp0 = alloca float
   %tmp1 = alloca float
   %tmp2 = alloca float
@@ -150,10 +150,10 @@ define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
 
 ; GCN-LABEL: {{^}}stored_fi_to_global_huge_frame_offset:
 ; GCN: v_mov_b32_e32 [[BASE_0:v[0-9]+]], 0{{$}}
-; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; GCN: buffer_store_dword [[BASE_0]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4{{$}}
 
 ; FIXME: Re-initialize
-; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 [[BASE_0_1:v[0-9]+]], 4{{$}}
 
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7{{$}}
 ; GCN-DAG: v_add_i32_e32 [[BASE_1_OFF_1:v[0-9]+]], vcc, 0x3ffc, [[BASE_0_1]]
@@ -163,7 +163,7 @@ define void @stored_fi_to_global_2_small_objects(float* addrspace(1)* %ptr) #0 {
 ; GCN: buffer_store_dword [[K]], [[BASE_1_OFF_1]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
 
 ; GCN: buffer_store_dword [[BASE_1_OFF_2]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
   %tmp0 = alloca [4096 x i32]
   %tmp1 = alloca [4096 x i32]
   %gep0.tmp0 = getelementptr [4096 x i32], [4096 x i32]* %tmp0, i32 0, i32 0
@@ -184,9 +184,9 @@ define void @stored_fi_to_global_huge_frame_offset(i32* addrspace(1)* %ptr) #0 {
 ; GCN: s_getpc_b64 s{{\[}}[[PC_LO:[0-9]+]]:[[PC_HI:[0-9]+]]{{\]}}
 ; GCN: s_add_u32 s{{[0-9]+}}, s[[PC_LO]], g1@gotpcrel32@lo+4
 ; GCN: s_addc_u32 s{{[0-9]+}}, s[[PC_HI]], g1@gotpcrel32@hi+4
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
 ; GCN: buffer_store_dword [[FI]]
-define void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 {
+define amdgpu_kernel void @cannot_select_assertzext_valuetype(i32 addrspace(1)* %out, i32 %idx) #0 {
 entry:
   %b = alloca i32, align 4
   %tmp1 = load volatile i32*, i32* addrspace(1)* @g1, align 4
@@ -196,8 +196,8 @@ entry:
   ret void
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { argmemonly nounwind }
diff --git a/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
index 0d919bbf85e3c05249cde8718227390d23d6c472..697f26b83a4df46b8b6ecd9c262c323432ce56c2 100644
--- a/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
+++ b/test/CodeGen/AMDGPU/cf-loop-on-constant.ll
@@ -2,12 +2,12 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs -O0 < %s
 
 ; GCN-LABEL: {{^}}test_loop:
-; GCN: [[LABEL:BB[0-9+]_[0-9]+]]:
+; GCN: [[LABEL:BB[0-9]+_[0-9]+]]: ; %for.body{{$}}
 ; GCN: ds_read_b32
 ; GCN: ds_write_b32
 ; GCN: s_branch [[LABEL]]
 ; GCN: s_endpgm
-define void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @test_loop(float addrspace(3)* %ptr, i32 %n) nounwind {
 entry:
   %cmp = icmp eq i32 %n, -1
   br i1 %cmp, label %for.exit, label %for.body
@@ -31,7 +31,7 @@ for.body:
 ; GCN: ds_read_b32
 ; GCN: ds_write_b32
 ; GCN: s_branch [[LABEL]]
-define void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @loop_const_true(float addrspace(3)* %ptr, i32 %n) nounwind {
 entry:
   br label %for.body
 
@@ -52,7 +52,7 @@ for.body:
 ; GCN-LABEL: {{^}}loop_const_false:
 ; GCN-NOT: s_branch
 ; GCN: s_endpgm
-define void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @loop_const_false(float addrspace(3)* %ptr, i32 %n) nounwind {
 entry:
   br label %for.body
 
@@ -74,7 +74,7 @@ for.body:
 ; GCN-LABEL: {{^}}loop_const_undef:
 ; GCN-NOT: s_branch
 ; GCN: s_endpgm
-define void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind {
+define amdgpu_kernel void @loop_const_undef(float addrspace(3)* %ptr, i32 %n) nounwind {
 entry:
   br label %for.body
 
@@ -104,7 +104,7 @@ for.body:
 ; GCN: s_cbranch_vccnz [[LOOPBB]]
 ; GCN-NEXT: ; BB#2
 ; GCN-NEXT: s_endpgm
-define void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind {
+define amdgpu_kernel void @loop_arg_0(float addrspace(3)* %ptr, i32 %n, i1 %cond) nounwind {
 entry:
   br label %for.body
 
diff --git a/test/CodeGen/AMDGPU/cf-stack-bug.ll b/test/CodeGen/AMDGPU/cf-stack-bug.ll
index 75b87e486226035150746a7c5eb6ae4fca7d6543..53fe89730f3aa81969afeb824920455b3c4cc5b8 100644
--- a/test/CodeGen/AMDGPU/cf-stack-bug.ll
+++ b/test/CodeGen/AMDGPU/cf-stack-bug.ll
@@ -35,7 +35,7 @@
 ; BUG32-NOT: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
 ; FUNC-LABEL: {{^}}nested3:
-define void @nested3(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested3(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
   br i1 %0, label %if.1, label %end
@@ -68,7 +68,7 @@ end:
 ; BUG32-NOT: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
 ; FUNC-LABEL: {{^}}nested4:
-define void @nested4(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested4(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
   br i1 %0, label %if.1, label %end
@@ -109,7 +109,7 @@ end:
 ; BUG32-NOT: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
 ; FUNC-LABEL: {{^}}nested7:
-define void @nested7(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested7(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
   br i1 %0, label %if.1, label %end
@@ -174,7 +174,7 @@ end:
 ; BUG32: Applying bug work-around
 ; NOBUG-NOT: Applying bug work-around
 ; FUNC-LABEL: {{^}}nested8:
-define void @nested8(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @nested8(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp sgt i32 %cond, 0
   br i1 %0, label %if.1, label %end
diff --git a/test/CodeGen/AMDGPU/cf_end.ll b/test/CodeGen/AMDGPU/cf_end.ll
index c74ee22868d5db82c66dada971e1507736f9f050..3c990e0a4bd61718b3682597478b528c9eac7ed8 100644
--- a/test/CodeGen/AMDGPU/cf_end.ll
+++ b/test/CodeGen/AMDGPU/cf_end.ll
@@ -4,6 +4,6 @@
 
 ; EG: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x20,0x80]
 ; CM: CF_END ; encoding: [0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x88]
-define void @eop() {
+define amdgpu_kernel void @eop() {
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
index 6db9a0761a010552abefd03c835e1b424dbbdb74..cbdcf6aeaf429650fb1f996f9705619003e023af 100644
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes-flat.ll
@@ -11,7 +11,7 @@
 ; GCN-LABEL: {{^}}test_no_sink_flat_small_offset_i32:
 ; GCN: flat_load_dword
 ; GCN: {{^}}BB0_2:
-define void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
+define amdgpu_kernel void @test_no_sink_flat_small_offset_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
@@ -36,14 +36,14 @@ done:
 ; OPT-CI-NOT: getelementptr
 ; OPT: br i1
 
-; OPT-CI: ptrtoint
-; OPT-CI: add
-; OPT-CI: inttoptr
+; OPT-CI: addrspacecast
+; OPT-CI: getelementptr
+; OPT-CI: bitcast
 ; OPT: br label
 
 ; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_global_i32:
 ; CI: buffer_load_dword {{v[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
-define void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
+define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_global_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
@@ -69,14 +69,14 @@ done:
 ; OPT-CI-NOT: getelementptr
 ; OPT: br i1
 
-; OPT-CI: ptrtoint
-; OPT-CI: add
-; OPT-CI: inttoptr
+; OPT-CI: addrspacecast
+; OPT-CI: getelementptr
+; OPT-CI: bitcast
 ; OPT: br label
 
 ; GCN-LABEL: {{^}}test_sink_noop_addrspacecast_flat_to_constant_i32:
 ; CI: s_load_dword {{s[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-define void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
+define amdgpu_kernel void @test_sink_noop_addrspacecast_flat_to_constant_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %in, i32 %cond) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(4)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(4)* %in, i64 7
diff --git a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
index 2ed2857ff3401ec215b261d05a6084e41c52bf13..c1cf56e5058eca39dab6af53816c79e141b647db 100644
--- a/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
+++ b/test/CodeGen/AMDGPU/cgp-addressing-modes.ll
@@ -5,15 +5,17 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=-promote-alloca -amdgpu-sroa=0 < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
 ; OPT-LABEL: @test_sink_global_small_offset_i32(
 ; OPT-CI-NOT: getelementptr i32, i32 addrspace(1)* %in
 ; OPT-VI: getelementptr i32, i32 addrspace(1)* %in
 ; OPT: br i1
-; OPT-CI: ptrtoint
+; OPT-CI: getelementptr i8,
 
 ; GCN-LABEL: {{^}}test_sink_global_small_offset_i32:
 ; GCN: {{^}}BB0_2:
-define void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(1)* %in, i64 7
@@ -43,7 +45,7 @@ done:
 ; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
 ; GCN: {{^}}BB1_2:
 ; GCN: s_or_b64 exec
-define void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_max_i32_ds_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
   %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 65535
@@ -70,7 +72,7 @@ done:
 ; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:4095{{$}}
 ; GCN: {{^}}BB2_2:
 ; GCN: s_or_b64 exec
-define void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_max_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
   %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4095
@@ -97,7 +99,7 @@ done:
 ; GCN: buffer_load_sbyte {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
 ; GCN: {{^}}BB3_2:
 ; GCN: s_or_b64 exec
-define void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sink_global_small_max_plus_1_mubuf_offset(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 99999
   %in.gep = getelementptr i8, i8 addrspace(1)* %in, i64 4096
@@ -122,14 +124,55 @@ done:
 ; OPT-LABEL: @test_sink_scratch_small_offset_i32(
 ; OPT-NOT:  getelementptr [512 x i32]
 ; OPT: br i1
-; OPT: ptrtoint
+; OPT: getelementptr i8,
 
 ; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32:
 ; GCN: s_and_saveexec_b64
 ; GCN: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
 ; GCN: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4092{{$}}
 ; GCN: {{^}}BB4_2:
-define void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
+define amdgpu_kernel void @test_sink_scratch_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
+entry:
+  %alloca = alloca [512 x i32], align 4
+  %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
+  %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i64 999999
+  %add.arg = add i32 %arg, 8
+  %alloca.gep = getelementptr [512 x i32], [512 x i32]* %alloca, i32 0, i32 1022
+  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+  %tmp0 = icmp eq i32 %tid, 0
+  br i1 %tmp0, label %endif, label %if
+
+if:
+  store volatile i32 123, i32* %alloca.gep
+  %tmp1 = load volatile i32, i32* %alloca.gep
+  br label %endif
+
+endif:
+  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+  store i32 %x, i32 addrspace(1)* %out.gep.0
+  %load = load volatile i32, i32* %alloca.gep
+  store i32 %load, i32 addrspace(1)* %out.gep.1
+  br label %done
+
+done:
+  ret void
+}
+
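+; The access does not fit in the immediate offset field because of the 4 bytes reserved at offset 0, so the frame base (4) is materialized in a VGPR and applied via offen.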
+; OPT-LABEL: @test_sink_scratch_small_offset_i32_reserved(
+; OPT-NOT:  getelementptr [512 x i32]
+; OPT: br i1
+; OPT: getelementptr i8,
+
+; GCN-LABEL: {{^}}test_sink_scratch_small_offset_i32_reserved:
+; GCN: s_and_saveexec_b64
+; GCN: v_mov_b32_e32 [[BASE_FI0:v[0-9]+]], 4
+; GCN: buffer_store_dword {{v[0-9]+}}, [[BASE_FI0]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
+; GCN: v_mov_b32_e32 [[BASE_FI1:v[0-9]+]], 4
+; GCN: buffer_load_dword {{v[0-9]+}}, [[BASE_FI1]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen offset:4092{{$}}
+; GCN: {{^BB[0-9]+}}_2:
+
+define amdgpu_kernel void @test_sink_scratch_small_offset_i32_reserved(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
 entry:
   %alloca = alloca [512 x i32], align 4
   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
@@ -165,8 +208,8 @@ done:
 ; GCN: s_and_saveexec_b64
 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
-; GCN: {{^}}BB5_2:
-define void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
+; GCN: {{^BB[0-9]+}}_2:
+define amdgpu_kernel void @test_no_sink_scratch_large_offset_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %arg) {
 entry:
   %alloca = alloca [512 x i32], align 4
   %out.gep.0 = getelementptr i32, i32 addrspace(1)* %out, i64 999998
@@ -197,8 +240,8 @@ done:
 ; GCN: s_and_saveexec_b64
 ; CI: buffer_load_dword {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; VI: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-; GCN: {{^}}BB6_2:
-define void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
+; GCN: {{^BB[0-9]+}}_2:
+define amdgpu_kernel void @test_sink_global_vreg_sreg_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
 entry:
   %offset.ext = zext i32 %offset to i64
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
@@ -228,7 +271,7 @@ done:
 ; GCN: s_and_saveexec_b64
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x7{{$}}
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_small_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 7
@@ -257,7 +300,7 @@ done:
 ; GCN: s_and_saveexec_b64
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xff{{$}}
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 255
@@ -290,7 +333,7 @@ done:
 
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_8_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 256
@@ -322,7 +365,7 @@ done:
 ; GCN: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, 3{{$}}
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 4294967295
@@ -353,7 +396,7 @@ done:
 ; GCN: s_addc_u32
 ; SI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_32_bit_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 17179869181
@@ -383,7 +426,7 @@ done:
 ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0xffffc{{$}}
 
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262143
@@ -421,7 +464,7 @@ done:
 ; VI: s_load_dword s{{[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, [[OFFSET]]{{$}}
 
 ; GCN: s_or_b64 exec, exec
-define void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @test_sink_constant_max_20_bit_byte_offset_p1_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i64 999999
   %in.gep = getelementptr i32, i32 addrspace(2)* %in, i64 262144
@@ -445,13 +488,13 @@ done:
 %struct.foo = type { [3 x float], [3 x float] }
 
 ; OPT-LABEL: @sink_ds_address(
-; OPT: ptrtoint %struct.foo addrspace(3)* %ptr to i64
+; OPT: getelementptr i8,
 
 ; GCN-LABEL: {{^}}sink_ds_address:
 ; GCN: s_load_dword [[SREG1:s[0-9]+]],
 ; GCN: v_mov_b32_e32 [[VREG1:v[0-9]+]], [[SREG1]]
 ; GCN-DAG: ds_read2_b32 v[{{[0-9+:[0-9]+}}], [[VREG1]] offset0:3 offset1:5
-define void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
+define amdgpu_kernel void @sink_ds_address(%struct.foo addrspace(3)* nocapture %ptr) nounwind {
 entry:
   %x = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 0
   %y = getelementptr inbounds %struct.foo, %struct.foo addrspace(3)* %ptr, i32 0, i32 1, i32 2
@@ -476,9 +519,8 @@ bb34:
 ; OPT-LABEL: @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(
 ; OPT: br i1 %tmp0,
 ; OPT: if:
-; OPT: %sunkaddr = ptrtoint i8 addrspace(2)* %in to i64
-; OPT: %sunkaddr1 = add i64 %sunkaddr, 4095
-define void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
+; OPT: getelementptr i8, {{.*}} 4095
+define amdgpu_kernel void @test_sink_constant_small_max_mubuf_offset_load_i32_align_1(i32 addrspace(1)* %out, i8 addrspace(2)* %in) {
 entry:
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 1024
   %in.gep = getelementptr i8, i8 addrspace(2)* %in, i64 4095
@@ -500,7 +542,141 @@ done:
   ret void
 }
 
+; OPT-LABEL: @test_sink_local_small_offset_atomicrmw_i32(
+; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
+; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
+; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
+; OPT: %tmp1 = atomicrmw add i32 addrspace(3)* %1, i32 2 seq_cst
+define amdgpu_kernel void @test_sink_local_small_offset_atomicrmw_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+entry:
+  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
+  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+  %tmp0 = icmp eq i32 %tid, 0
+  br i1 %tmp0, label %endif, label %if
+
+if:
+  %tmp1 = atomicrmw add i32 addrspace(3)* %in.gep, i32 2 seq_cst
+  br label %endif
+
+endif:
+  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+  store i32 %x, i32 addrspace(3)* %out.gep
+  br label %done
+
+done:
+  ret void
+}
+
+; OPT-LABEL: @test_sink_local_small_offset_cmpxchg_i32(
+; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
+; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
+; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
+; OPT: %tmp1.struct = cmpxchg i32 addrspace(3)* %1, i32 undef, i32 2 seq_cst monotonic
+define amdgpu_kernel void @test_sink_local_small_offset_cmpxchg_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+entry:
+  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
+  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+  %tmp0 = icmp eq i32 %tid, 0
+  br i1 %tmp0, label %endif, label %if
+
+if:
+  %tmp1.struct = cmpxchg i32 addrspace(3)* %in.gep, i32 undef, i32 2 seq_cst monotonic
+  %tmp1 = extractvalue { i32, i1 } %tmp1.struct, 0
+  br label %endif
+
+endif:
+  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+  store i32 %x, i32 addrspace(3)* %out.gep
+  br label %done
+
+done:
+  ret void
+}
+
+; OPT-LABEL: @test_wrong_operand_local_small_offset_cmpxchg_i32(
+; OPT: %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+; OPT: br i1
+; OPT: cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
+define amdgpu_kernel void @test_wrong_operand_local_small_offset_cmpxchg_i32(i32 addrspace(3)* addrspace(3)* %out, i32 addrspace(3)* %in) {
+entry:
+  %out.gep = getelementptr i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* %out, i32 999999
+  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+  %tmp0 = icmp eq i32 %tid, 0
+  br i1 %tmp0, label %endif, label %if
+
+if:
+  %tmp1.struct = cmpxchg i32 addrspace(3)* addrspace(3)* undef, i32 addrspace(3)* %in.gep, i32 addrspace(3)* undef seq_cst monotonic
+  %tmp1 = extractvalue { i32 addrspace(3)*, i1 } %tmp1.struct, 0
+  br label %endif
+
+endif:
+  %x = phi i32 addrspace(3)* [ %tmp1, %if ], [ null, %entry ]
+  store i32 addrspace(3)* %x, i32 addrspace(3)* addrspace(3)* %out.gep
+  br label %done
+
+done:
+  ret void
+}
+
+; OPT-LABEL: @test_sink_local_small_offset_atomic_inc_i32(
+; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
+; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
+; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
+; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @test_sink_local_small_offset_atomic_inc_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+entry:
+  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
+  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+  %tmp0 = icmp eq i32 %tid, 0
+  br i1 %tmp0, label %endif, label %if
+
+if:
+  %tmp1 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false)
+  br label %endif
+
+endif:
+  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+  store i32 %x, i32 addrspace(3)* %out.gep
+  br label %done
+
+done:
+  ret void
+}
+
+; OPT-LABEL: @test_sink_local_small_offset_atomic_dec_i32(
+; OPT: %0 = bitcast i32 addrspace(3)* %in to i8 addrspace(3)*
+; OPT: %sunkaddr = getelementptr i8, i8 addrspace(3)* %0, i32 28
+; OPT: %1 = bitcast i8 addrspace(3)* %sunkaddr to i32 addrspace(3)*
+; OPT: %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %1, i32 2, i32 0, i32 0, i1 false)
+define amdgpu_kernel void @test_sink_local_small_offset_atomic_dec_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) {
+entry:
+  %out.gep = getelementptr i32, i32 addrspace(3)* %out, i32 999999
+  %in.gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
+  %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+  %tmp0 = icmp eq i32 %tid, 0
+  br i1 %tmp0, label %endif, label %if
+
+if:
+  %tmp1 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %in.gep, i32 2, i32 0, i32 0, i1 false)
+  br label %endif
+
+endif:
+  %x = phi i32 [ %tmp1, %if ], [ 0, %entry ]
+  store i32 %x, i32 addrspace(3)* %out.gep
+  br label %done
+
+done:
+  ret void
+}
+
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
 
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
+attributes #2 = { nounwind argmemonly }
diff --git a/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
index 066ef951cc31bd42532a716fb63fe5f20a22cb74..53adf09026ec5bc3177f5036dcd916f5fbf632e8 100644
--- a/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
+++ b/test/CodeGen/AMDGPU/cgp-bitfield-extract.ll
@@ -36,7 +36,7 @@
 ; GCN: BB0_3:
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
-define void @sink_ubfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
 entry:
   %shr = lshr i32 %arg1, 8
   br i1 undef, label %bb0, label %bb1
@@ -76,7 +76,7 @@ ret:
 ; OPT: ret
 
 ; GCN-LABEL: {{^}}sink_sbfe_i32:
-define void @sink_sbfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
+define amdgpu_kernel void @sink_sbfe_i32(i32 addrspace(1)* %out, i32 %arg1) #0 {
 entry:
   %shr = ashr i32 %arg1, 8
   br i1 undef, label %bb0, label %bb1
@@ -120,20 +120,21 @@ ret:
 
 ; GCN-LABEL: {{^}}sink_ubfe_i16:
 ; GCN-NOT: lshr
-; VI: s_bfe_u32 s0, s0, 0xc0004
+; VI: s_load_dword [[ARG:s[0-9]+]], s[0:1], 0x2c
+; VI: s_bfe_u32 [[BFE:s[0-9]+]], [[ARG]], 0xc0004
 ; GCN: s_cbranch_scc1
 
 ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80004
-; VI: s_and_b32 s0, s0, 0xff
+; VI: s_and_b32 s{{[0-9]+}}, [[BFE]], 0xff
 
 ; GCN: BB2_2:
 ; SI: s_bfe_u32 s{{[0-9]+}}, s{{[0-9]+}}, 0x70004
-; VI: s_and_b32 s0, s0, 0x7f
+; VI: s_and_b32 s{{[0-9]+}}, [[BFE]], 0x7f
 
 ; GCN: BB2_3:
 ; GCN: buffer_store_short
 ; GCN: s_endpgm
-define void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i16(i16 addrspace(1)* %out, i16 %arg1) #0 {
 entry:
   %shr = lshr i16 %arg1, 4
   br i1 undef, label %bb0, label %bb1
@@ -186,7 +187,7 @@ ret:
 
 ; GCN: BB3_3:
 ; GCN: buffer_store_dwordx2
-define void @sink_ubfe_i64_span_midpoint(i64 addrspace(1)* %out, i64 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i64_span_midpoint(i64 addrspace(1)* %out, i64 %arg1) #0 {
 entry:
   %shr = lshr i64 %arg1, 30
   br i1 undef, label %bb0, label %bb1
@@ -235,7 +236,7 @@ ret:
 
 ; GCN: BB4_3:
 ; GCN: buffer_store_dwordx2
-define void @sink_ubfe_i64_low32(i64 addrspace(1)* %out, i64 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i64_low32(i64 addrspace(1)* %out, i64 %arg1) #0 {
 entry:
   %shr = lshr i64 %arg1, 15
   br i1 undef, label %bb0, label %bb1
@@ -282,7 +283,7 @@ ret:
 
 ; GCN: BB5_3:
 ; GCN: buffer_store_dwordx2
-define void @sink_ubfe_i64_high32(i64 addrspace(1)* %out, i64 %arg1) #0 {
+define amdgpu_kernel void @sink_ubfe_i64_high32(i64 addrspace(1)* %out, i64 %arg1) #0 {
 entry:
   %shr = lshr i64 %arg1, 35
   br i1 undef, label %bb0, label %bb1
diff --git a/test/CodeGen/AMDGPU/clamp-modifier.ll b/test/CodeGen/AMDGPU/clamp-modifier.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c3a7d5e14d87cc216f456088e978abd42fc98092
--- /dev/null
+++ b/test/CodeGen/AMDGPU/clamp-modifier.ll
@@ -0,0 +1,222 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}v_clamp_add_src_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-NOT: [[A]]
+; GCN: v_add_f32_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}}
+define amdgpu_kernel void @v_clamp_add_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %add = fadd float %a, 1.0
+  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+  store float %clamp, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_multi_use_src_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[ADD]], [[ADD]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %add = fadd float %a, 1.0
+  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+  store float %clamp, float addrspace(1)* %out.gep
+  store volatile float %add, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_dbg_use_src_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-NOT: [[A]]
+; GCN: v_add_f32_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}}
+define amdgpu_kernel void @v_clamp_dbg_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %add = fadd float %a, 1.0
+  call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
+  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+  store float %clamp, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_add_neg_src_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_floor_f32_e32 [[FLOOR:v[0-9]+]], [[A]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[FLOOR]], -[[FLOOR]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_add_neg_src_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %floor = call float @llvm.floor.f32(float %a)
+  %neg.floor = fsub float -0.0, %floor
+  %max = call float @llvm.maxnum.f32(float %neg.floor, float 0.0)
+  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+  store float %clamp, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_non_clamp_max_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
+; GCN: v_max_f32_e32 v{{[0-9]+}}, 0, [[ADD]]{{$}}
+define amdgpu_kernel void @v_non_clamp_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %add = fadd float %a, 1.0
+  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+  store float %max, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_add_src_f32_denormals:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], [[A]], 1.0 clamp{{$}}
+define amdgpu_kernel void @v_clamp_add_src_f32_denormals(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %add = fadd float %a, 1.0
+  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+  store float %clamp, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_add_src_f16_denorm:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; VI: v_add_f16_e64 [[ADD:v[0-9]+]], [[A]], 1.0 clamp{{$}}
+
+; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
+; SI: v_add_f32_e64 [[ADD:v[0-9]+]], [[CVT]], 1.0 clamp{{$}}
+; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[ADD]]
+define amdgpu_kernel void @v_clamp_add_src_f16_denorm(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
+  %a = load half, half addrspace(1)* %gep0
+  %add = fadd half %a, 1.0
+  %max = call half @llvm.maxnum.f16(half %add, half 0.0)
+  %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
+  store half %clamp, half addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_add_src_f16_no_denormals:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; VI-NOT: [[A]]
+; VI: v_add_f16_e64 v{{[0-9]+}}, [[A]], 1.0 clamp{{$}}
+
+; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
+; SI: v_add_f32_e64 [[ADD:v[0-9]+]], [[CVT]], 1.0 clamp{{$}}
+; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[ADD]]
+define amdgpu_kernel void @v_clamp_add_src_f16_no_denormals(half addrspace(1)* %out, half addrspace(1)* %aptr) #3 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
+  %a = load half, half addrspace(1)* %gep0
+  %add = fadd half %a, 1.0
+  %max = call half @llvm.maxnum.f16(half %add, half 0.0)
+  %clamp = call half @llvm.minnum.f16(half %max, half 1.0)
+  store half %clamp, half addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_add_src_v2f32:
+; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[A:[0-9]+]]:[[B:[0-9]+]]{{\]}}
+; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, v[[A]], 1.0 clamp{{$}}
+; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, v[[B]], 1.0 clamp{{$}}
+define amdgpu_kernel void @v_clamp_add_src_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %out, i32 %tid
+  %a = load <2 x float>, <2 x float> addrspace(1)* %gep0
+  %add = fadd <2 x float> %a, <float 1.0, float 1.0>
+  %max = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %add, <2 x float> zeroinitializer)
+  %clamp = call <2 x float> @llvm.minnum.v2f32(<2 x float> %max, <2 x float> <float 1.0, float 1.0>)
+  store <2 x float> %clamp, <2 x float> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_add_src_f64:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], 1.0 clamp{{$}}
+define amdgpu_kernel void @v_clamp_add_src_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %a = load double, double addrspace(1)* %gep0
+  %add = fadd double %a, 1.0
+  %max = call double @llvm.maxnum.f64(double %add, double 0.0)
+  %clamp = call double @llvm.minnum.f64(double %max, double 1.0)
+  store double %clamp, double addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_mac_to_mad:
+; GCN: v_mad_f32 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}}
+define amdgpu_kernel void @v_clamp_mac_to_mad(float addrspace(1)* %out, float addrspace(1)* %aptr, float %a) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %b = load float, float addrspace(1)* %gep0
+  ; %a * %a + %b is clamped to [0.0, 1.0]; %b is used again by the trailing fadd.
+  %mul = fmul float %a, %a
+  %add = fadd float %mul, %b
+  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+  %res = fadd float %clamp, %b
+  store float %res, float addrspace(1)* %out.gep
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.floor.f32(float) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
+declare double @llvm.fabs.f64(double) #1
+declare double @llvm.minnum.f64(double, double) #1
+declare double @llvm.maxnum.f64(double, double) #1
+declare half @llvm.fabs.f16(half) #1
+declare half @llvm.minnum.f16(half, half) #1
+declare half @llvm.maxnum.f16(half, half) #1
+declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #1
+declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "target-features"="+fp32-denormals" }
+attributes #3 = { nounwind "target-features"="-fp64-fp16-denormals" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
+!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)
+!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!6 = !DISubroutineType(types: !7)
+!7 = !{null, !8}
+!8 = !DIBasicType(name: "float", size: 32, align: 32)
+!9 = !DIExpression()
+!10 = !DILocation(line: 1, column: 42, scope: !5)
diff --git a/test/CodeGen/AMDGPU/clamp-omod-special-case.mir b/test/CodeGen/AMDGPU/clamp-omod-special-case.mir
new file mode 100644
index 0000000000000000000000000000000000000000..fbfd0fbf930884a180204a1c847ae29717fafc68
--- /dev/null
+++ b/test/CodeGen/AMDGPU/clamp-omod-special-case.mir
@@ -0,0 +1,424 @@
+# RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-fold-operands  %s -o - | FileCheck -check-prefix=GCN %s
+--- |
+  define amdgpu_ps void @v_max_self_clamp_not_set_f32() #0 {
+    ret void
+  }
+
+  define amdgpu_ps void @v_clamp_omod_already_set_f32() #0 {
+    ret void
+  }
+
+  define amdgpu_ps void @v_omod_mul_omod_already_set_f32() #0 {
+    ret void
+  }
+
+  define amdgpu_ps void @v_omod_mul_clamp_already_set_f32() #0 {
+    ret void
+  }
+
+  define amdgpu_ps void @v_omod_add_omod_already_set_f32() #0 {
+    ret void
+  }
+
+  define amdgpu_ps void @v_omod_add_clamp_already_set_f32() #0 {
+    ret void
+  }
+
+  attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
+
+...
+---
+# GCN-LABEL: name: v_max_self_clamp_not_set_f32
+# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+# GCN-NEXT: %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 0, 0, implicit %exec
+
+name:            v_max_self_clamp_not_set_f32
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_64 }
+  - { id: 1, class: sreg_32_xm0 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64_xexec }
+  - { id: 5, class: sreg_64_xexec }
+  - { id: 6, class: sreg_32 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32_xm0 }
+  - { id: 9, class: sreg_64 }
+  - { id: 10, class: sreg_32_xm0 }
+  - { id: 11, class: sreg_32_xm0 }
+  - { id: 12, class: sgpr_64 }
+  - { id: 13, class: sgpr_128 }
+  - { id: 14, class: sreg_32_xm0 }
+  - { id: 15, class: sreg_64 }
+  - { id: 16, class: sgpr_128 }
+  - { id: 17, class: vgpr_32 }
+  - { id: 18, class: vreg_64 }
+  - { id: 19, class: vgpr_32 }
+  - { id: 20, class: vgpr_32 }
+  - { id: 21, class: vgpr_32 }
+  - { id: 22, class: vgpr_32 }
+  - { id: 23, class: vreg_64 }
+  - { id: 24, class: vgpr_32 }
+  - { id: 25, class: vreg_64 }
+  - { id: 26, class: vreg_64 }
+liveins:
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+  - { reg: '%vgpr0', virtual-reg: '%3' }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: %sgpr0_sgpr1, %vgpr0
+
+    %3 = COPY %vgpr0
+    %0 = COPY %sgpr0_sgpr1
+    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+    %25 = REG_SEQUENCE %3, 1, %24, 2
+    %10 = S_MOV_B32 61440
+    %11 = S_MOV_B32 0
+    %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+    %13 = REG_SEQUENCE killed %5, 17, %12, 18
+    %14 = S_MOV_B32 2
+    %26 = V_LSHL_B64 killed %25, 2, implicit %exec
+    %16 = REG_SEQUENCE killed %4, 17, %12, 18
+    %18 = COPY %26
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec
+    %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+    %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 0, 0, implicit %exec
+    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec
+    S_ENDPGM
+
+...
+---
+# GCN-LABEL: name: v_clamp_omod_already_set_f32
+# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+# GCN: %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 1, 3, implicit %exec
+name:            v_clamp_omod_already_set_f32
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_64 }
+  - { id: 1, class: sreg_32_xm0 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64_xexec }
+  - { id: 5, class: sreg_64_xexec }
+  - { id: 6, class: sreg_32 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32_xm0 }
+  - { id: 9, class: sreg_64 }
+  - { id: 10, class: sreg_32_xm0 }
+  - { id: 11, class: sreg_32_xm0 }
+  - { id: 12, class: sgpr_64 }
+  - { id: 13, class: sgpr_128 }
+  - { id: 14, class: sreg_32_xm0 }
+  - { id: 15, class: sreg_64 }
+  - { id: 16, class: sgpr_128 }
+  - { id: 17, class: vgpr_32 }
+  - { id: 18, class: vreg_64 }
+  - { id: 19, class: vgpr_32 }
+  - { id: 20, class: vgpr_32 }
+  - { id: 21, class: vgpr_32 }
+  - { id: 22, class: vgpr_32 }
+  - { id: 23, class: vreg_64 }
+  - { id: 24, class: vgpr_32 }
+  - { id: 25, class: vreg_64 }
+  - { id: 26, class: vreg_64 }
+liveins:
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+  - { reg: '%vgpr0', virtual-reg: '%3' }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: %sgpr0_sgpr1, %vgpr0
+
+    %3 = COPY %vgpr0
+    %0 = COPY %sgpr0_sgpr1
+    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+    %25 = REG_SEQUENCE %3, 1, %24, 2
+    %10 = S_MOV_B32 61440
+    %11 = S_MOV_B32 0
+    %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+    %13 = REG_SEQUENCE killed %5, 17, %12, 18
+    %14 = S_MOV_B32 2
+    %26 = V_LSHL_B64 killed %25, 2, implicit %exec
+    %16 = REG_SEQUENCE killed %4, 17, %12, 18
+    %18 = COPY %26
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec
+    %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+    %21 = V_MAX_F32_e64 0, killed %20, 0, killed %20, 1, 3, implicit %exec
+    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec
+    S_ENDPGM
+...
+---
+# Don't fold a mul that looks like an omod if it already has omod set
+
+# GCN-LABEL: name: v_omod_mul_omod_already_set_f32
+# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+# GCN-NEXT: %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 0, 3, implicit %exec
+name:            v_omod_mul_omod_already_set_f32
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_64 }
+  - { id: 1, class: sreg_32_xm0 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64_xexec }
+  - { id: 5, class: sreg_64_xexec }
+  - { id: 6, class: sreg_32 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32_xm0 }
+  - { id: 9, class: sreg_64 }
+  - { id: 10, class: sreg_32_xm0 }
+  - { id: 11, class: sreg_32_xm0 }
+  - { id: 12, class: sgpr_64 }
+  - { id: 13, class: sgpr_128 }
+  - { id: 14, class: sreg_32_xm0 }
+  - { id: 15, class: sreg_64 }
+  - { id: 16, class: sgpr_128 }
+  - { id: 17, class: vgpr_32 }
+  - { id: 18, class: vreg_64 }
+  - { id: 19, class: vgpr_32 }
+  - { id: 20, class: vgpr_32 }
+  - { id: 21, class: vgpr_32 }
+  - { id: 22, class: vgpr_32 }
+  - { id: 23, class: vreg_64 }
+  - { id: 24, class: vgpr_32 }
+  - { id: 25, class: vreg_64 }
+  - { id: 26, class: vreg_64 }
+liveins:
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+  - { reg: '%vgpr0', virtual-reg: '%3' }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: %sgpr0_sgpr1, %vgpr0
+
+    %3 = COPY %vgpr0
+    %0 = COPY %sgpr0_sgpr1
+    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+    %25 = REG_SEQUENCE %3, 1, %24, 2
+    %10 = S_MOV_B32 61440
+    %11 = S_MOV_B32 0
+    %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+    %13 = REG_SEQUENCE killed %5, 17, %12, 18
+    %14 = S_MOV_B32 2
+    %26 = V_LSHL_B64 killed %25, 2, implicit %exec
+    %16 = REG_SEQUENCE killed %4, 17, %12, 18
+    %18 = COPY %26
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec
+    %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+    %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 0, 3, implicit %exec
+    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec
+    S_ENDPGM
+
+...
+---
+# Don't fold a mul that looks like an omod if it already has clamp set
+# This might be OK, but would require folding the clamp at the same time.
+# GCN-LABEL: name: v_omod_mul_clamp_already_set_f32
+# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+# GCN-NEXT: %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 1, 0, implicit %exec
+
+name:            v_omod_mul_clamp_already_set_f32
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_64 }
+  - { id: 1, class: sreg_32_xm0 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64_xexec }
+  - { id: 5, class: sreg_64_xexec }
+  - { id: 6, class: sreg_32 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32_xm0 }
+  - { id: 9, class: sreg_64 }
+  - { id: 10, class: sreg_32_xm0 }
+  - { id: 11, class: sreg_32_xm0 }
+  - { id: 12, class: sgpr_64 }
+  - { id: 13, class: sgpr_128 }
+  - { id: 14, class: sreg_32_xm0 }
+  - { id: 15, class: sreg_64 }
+  - { id: 16, class: sgpr_128 }
+  - { id: 17, class: vgpr_32 }
+  - { id: 18, class: vreg_64 }
+  - { id: 19, class: vgpr_32 }
+  - { id: 20, class: vgpr_32 }
+  - { id: 21, class: vgpr_32 }
+  - { id: 22, class: vgpr_32 }
+  - { id: 23, class: vreg_64 }
+  - { id: 24, class: vgpr_32 }
+  - { id: 25, class: vreg_64 }
+  - { id: 26, class: vreg_64 }
+liveins:
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+  - { reg: '%vgpr0', virtual-reg: '%3' }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: %sgpr0_sgpr1, %vgpr0
+
+    %3 = COPY %vgpr0
+    %0 = COPY %sgpr0_sgpr1
+    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+    %25 = REG_SEQUENCE %3, 1, %24, 2
+    %10 = S_MOV_B32 61440
+    %11 = S_MOV_B32 0
+    %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+    %13 = REG_SEQUENCE killed %5, 17, %12, 18
+    %14 = S_MOV_B32 2
+    %26 = V_LSHL_B64 killed %25, 2, implicit %exec
+    %16 = REG_SEQUENCE killed %4, 17, %12, 18
+    %18 = COPY %26
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec
+    %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+    %21 = V_MUL_F32_e64 0, killed %20, 0, 1056964608, 1, 0, implicit %exec
+    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec
+    S_ENDPGM
+
+...
+
+
+
+
+
+
+
+
+
+
+
+
+
+---
+# Don't fold an add that looks like an omod if it already has omod set
+
+# GCN-LABEL: name: v_omod_add_omod_already_set_f32
+# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+# GCN-NEXT: %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 0, 3, implicit %exec
+name:            v_omod_add_omod_already_set_f32
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_64 }
+  - { id: 1, class: sreg_32_xm0 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64_xexec }
+  - { id: 5, class: sreg_64_xexec }
+  - { id: 6, class: sreg_32 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32_xm0 }
+  - { id: 9, class: sreg_64 }
+  - { id: 10, class: sreg_32_xm0 }
+  - { id: 11, class: sreg_32_xm0 }
+  - { id: 12, class: sgpr_64 }
+  - { id: 13, class: sgpr_128 }
+  - { id: 14, class: sreg_32_xm0 }
+  - { id: 15, class: sreg_64 }
+  - { id: 16, class: sgpr_128 }
+  - { id: 17, class: vgpr_32 }
+  - { id: 18, class: vreg_64 }
+  - { id: 19, class: vgpr_32 }
+  - { id: 20, class: vgpr_32 }
+  - { id: 21, class: vgpr_32 }
+  - { id: 22, class: vgpr_32 }
+  - { id: 23, class: vreg_64 }
+  - { id: 24, class: vgpr_32 }
+  - { id: 25, class: vreg_64 }
+  - { id: 26, class: vreg_64 }
+liveins:
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+  - { reg: '%vgpr0', virtual-reg: '%3' }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: %sgpr0_sgpr1, %vgpr0
+
+    %3 = COPY %vgpr0
+    %0 = COPY %sgpr0_sgpr1
+    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+    %25 = REG_SEQUENCE %3, 1, %24, 2
+    %10 = S_MOV_B32 61440
+    %11 = S_MOV_B32 0
+    %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+    %13 = REG_SEQUENCE killed %5, 17, %12, 18
+    %14 = S_MOV_B32 2
+    %26 = V_LSHL_B64 killed %25, 2, implicit %exec
+    %16 = REG_SEQUENCE killed %4, 17, %12, 18
+    %18 = COPY %26
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec
+    %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+    %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 0, 3, implicit %exec
+    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec
+    S_ENDPGM
+
+...
+---
+# Don't fold an add that looks like an omod if it already has clamp set
+# This might be OK, but would require folding the clamp at the same time.
+# GCN-LABEL: name: v_omod_add_clamp_already_set_f32
+# GCN: %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+# GCN-NEXT: %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 1, 0, implicit %exec
+
+name:            v_omod_add_clamp_already_set_f32
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_64 }
+  - { id: 1, class: sreg_32_xm0 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64_xexec }
+  - { id: 5, class: sreg_64_xexec }
+  - { id: 6, class: sreg_32 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32_xm0 }
+  - { id: 9, class: sreg_64 }
+  - { id: 10, class: sreg_32_xm0 }
+  - { id: 11, class: sreg_32_xm0 }
+  - { id: 12, class: sgpr_64 }
+  - { id: 13, class: sgpr_128 }
+  - { id: 14, class: sreg_32_xm0 }
+  - { id: 15, class: sreg_64 }
+  - { id: 16, class: sgpr_128 }
+  - { id: 17, class: vgpr_32 }
+  - { id: 18, class: vreg_64 }
+  - { id: 19, class: vgpr_32 }
+  - { id: 20, class: vgpr_32 }
+  - { id: 21, class: vgpr_32 }
+  - { id: 22, class: vgpr_32 }
+  - { id: 23, class: vreg_64 }
+  - { id: 24, class: vgpr_32 }
+  - { id: 25, class: vreg_64 }
+  - { id: 26, class: vreg_64 }
+liveins:
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+  - { reg: '%vgpr0', virtual-reg: '%3' }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: %sgpr0_sgpr1, %vgpr0
+
+    %3 = COPY %vgpr0
+    %0 = COPY %sgpr0_sgpr1
+    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %24 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+    %25 = REG_SEQUENCE %3, 1, %24, 2
+    %10 = S_MOV_B32 61440
+    %11 = S_MOV_B32 0
+    %12 = REG_SEQUENCE killed %11, 1, killed %10, 2
+    %13 = REG_SEQUENCE killed %5, 17, %12, 18
+    %14 = S_MOV_B32 2
+    %26 = V_LSHL_B64 killed %25, 2, implicit %exec
+    %16 = REG_SEQUENCE killed %4, 17, %12, 18
+    %18 = COPY %26
+    %17 = BUFFER_LOAD_DWORD_ADDR64 %26, killed %13, 0, 0, 0, 0, 0, implicit %exec
+    %20 = V_ADD_F32_e64 0, killed %17, 0, 1065353216, 0, 0, implicit %exec
+    %21 = V_ADD_F32_e64 0, killed %20, 0, killed %20, 1, 0, implicit %exec
+    BUFFER_STORE_DWORD_ADDR64 killed %21, %26, killed %16, 0, 0, 0, 0, 0, implicit %exec
+    S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/clamp.ll b/test/CodeGen/AMDGPU/clamp.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9735c7074be2abee83f663fe9bf13cf9442d4f81
--- /dev/null
+++ b/test/CodeGen/AMDGPU/clamp.ll
@@ -0,0 +1,529 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}v_clamp_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
+  %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_neg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_neg_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %fneg.a = fsub float -0.0, %a
+  %max = call float @llvm.maxnum.f32(float %fneg.a, float 0.0)
+  %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_negabs_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
+define amdgpu_kernel void @v_clamp_negabs_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %fabs.a = call float @llvm.fabs.f32(float %a)
+  %fneg.fabs.a = fsub float -0.0, %fabs.a
+
+  %max = call float @llvm.maxnum.f32(float %fneg.fabs.a, float 0.0)
+  %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_negzero_f32:
+; GCN-DAG: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN-DAG: v_bfrev_b32_e32 [[SIGNBIT:v[0-9]+]], 1
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[SIGNBIT]], 1.0
+define amdgpu_kernel void @v_clamp_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %max = call float @llvm.maxnum.f32(float %a, float -0.0)
+  %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_multi_use_max_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
+define amdgpu_kernel void @v_clamp_multi_use_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
+  %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+  store float %med, float addrspace(1)* %out.gep
+  store volatile float %max, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_f16:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; VI: v_max_f16_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+
+; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], [[A]] clamp{{$}}
+; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
+define amdgpu_kernel void @v_clamp_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
+  %a = load half, half addrspace(1)* %gep0
+  %max = call half @llvm.maxnum.f16(half %a, half 0.0)
+  %med = call half @llvm.minnum.f16(half %max, half 1.0)
+
+  store half %med, half addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_neg_f16:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; VI: v_max_f16_e64 v{{[0-9]+}}, -[[A]], -[[A]] clamp{{$}}
+
+; FIXME: Better to fold neg into max
+; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]] clamp{{$}}
+; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
+define amdgpu_kernel void @v_clamp_neg_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
+  %a = load half, half addrspace(1)* %gep0
+  %fneg.a = fsub half -0.0, %a
+  %max = call half @llvm.maxnum.f16(half %fneg.a, half 0.0)
+  %med = call half @llvm.minnum.f16(half %max, half 1.0)
+
+  store half %med, half addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_negabs_f16:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; VI: v_max_f16_e64 v{{[0-9]+}}, -|[[A]]|, -|[[A]]| clamp{{$}}
+
+; FIXME: Better to fold neg/abs into max
+
+; SI: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[A]]| clamp{{$}}
+; SI: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[CVT]]
+define amdgpu_kernel void @v_clamp_negabs_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
+  %a = load half, half addrspace(1)* %gep0
+  %fabs.a = call half @llvm.fabs.f16(half %a)
+  %fneg.fabs.a = fsub half -0.0, %fabs.a
+
+  %max = call half @llvm.maxnum.f16(half %fneg.fabs.a, half 0.0)
+  %med = call half @llvm.minnum.f16(half %max, half 1.0)
+
+  store half %med, half addrspace(1)* %out.gep
+  ret void
+}
+
+; FIXME: Do f64 instructions support clamp?
+; GCN-LABEL: {{^}}v_clamp_f64:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %a = load double, double addrspace(1)* %gep0
+  %max = call double @llvm.maxnum.f64(double %a, double 0.0)
+  %med = call double @llvm.minnum.f64(double %max, double 1.0)
+
+  store double %med, double addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_neg_f64:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -[[A]], -[[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_neg_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %a = load double, double addrspace(1)* %gep0
+  %fneg.a = fsub double -0.0, %a
+  %max = call double @llvm.maxnum.f64(double %fneg.a, double 0.0)
+  %med = call double @llvm.minnum.f64(double %max, double 1.0)
+
+  store double %med, double addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_negabs_f64:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: v_max_f64 v{{\[[0-9]+:[0-9]+\]}}, -|[[A]]|, -|[[A]]| clamp{{$}}
+define amdgpu_kernel void @v_clamp_negabs_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
+  %a = load double, double addrspace(1)* %gep0
+  %fabs.a = call double @llvm.fabs.f64(double %a)
+  %fneg.fabs.a = fsub double -0.0, %fabs.a
+
+  %max = call double @llvm.maxnum.f64(double %fneg.fabs.a, double 0.0)
+  %med = call double @llvm.minnum.f64(double %max, double 1.0)
+
+  store double %med, double addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_aby_negzero_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32
+define amdgpu_kernel void @v_clamp_med3_aby_negzero_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float -0.0, float 1.0, float %a)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_aby_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_aby_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_bay_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_bay_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_yab_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_yab_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_yba_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_yba_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_ayb_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_bya_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_bya_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constants_to_one_f32:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 1.0
+define amdgpu_kernel void @v_clamp_constants_to_one_f32(float addrspace(1)* %out) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 4.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constants_to_zero_f32:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+define amdgpu_kernel void @v_clamp_constants_to_zero_f32(float addrspace(1)* %out) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float -4.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constant_preserve_f32:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0.5
+define amdgpu_kernel void @v_clamp_constant_preserve_f32(float addrspace(1)* %out) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0.5)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constant_preserve_denorm_f32:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fffff{{$}}
+define amdgpu_kernel void @v_clamp_constant_preserve_denorm_f32(float addrspace(1)* %out) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 8388607 to float))
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+define amdgpu_kernel void @v_clamp_constant_qnan_f32(float addrspace(1)* %out) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constant_snan_f32:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+define amdgpu_kernel void @v_clamp_constant_snan_f32(float addrspace(1)* %out) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; ---------------------------------------------------------------------
+; Test non-default behaviors enabling snans and disabling dx10_clamp
+; ---------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_clamp_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
+define amdgpu_kernel void @v_clamp_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
+  %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_f32_snan_dx10clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_f32_snan_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #3 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
+  %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 1.0, [[MAX]]
+define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
+  %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_f32_snan_no_dx10clamp_nnan_src:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
+define amdgpu_kernel void @v_clamp_f32_snan_no_dx10clamp_nnan_src(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %add  = fadd nnan float %a, 1.0
+  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+  %med = call float @llvm.minnum.f32(float %max, float 1.0)
+
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_aby_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_aby_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %a)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_bay_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 v{{[0-9]+}}, [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_med3_bay_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float 0.0, float %a)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_yab_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 0, 1.0
+define amdgpu_kernel void @v_clamp_med3_yab_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 0.0, float 1.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_yba_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], 1.0, 0
+define amdgpu_kernel void @v_clamp_med3_yba_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float %a, float 1.0, float 0.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_ayb_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, 0, [[A]], 1.0
+define amdgpu_kernel void @v_clamp_med3_ayb_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %a, float 1.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_med3_bya_f32_no_dx10_clamp:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, 1.0, [[A]], 0
+define amdgpu_kernel void @v_clamp_med3_bya_f32_no_dx10_clamp(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %med = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %a, float 0.0)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constant_qnan_f32_no_dx10_clamp:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7fc00000
+define amdgpu_kernel void @v_clamp_constant_qnan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float 0x7FF8000000000000)
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_constant_snan_f32_no_dx10_clamp:
+; GCN: v_mov_b32_e32 v{{[0-9]+}}, 0x7f800001
+define amdgpu_kernel void @v_clamp_constant_snan_f32_no_dx10_clamp(float addrspace(1)* %out) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %med = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float bitcast (i32 2139095041 to float))
+  store float %med, float addrspace(1)* %out.gep
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
+declare double @llvm.fabs.f64(double) #1
+declare double @llvm.minnum.f64(double, double) #1
+declare double @llvm.maxnum.f64(double, double) #1
+declare half @llvm.fabs.f16(half) #1
+declare half @llvm.minnum.f16(half, half) #1
+declare half @llvm.maxnum.f16(half, half) #1
+
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "target-features"="-dx10-clamp,-fp-exceptions" "no-nans-fp-math"="false" }
+attributes #3 = { nounwind "target-features"="+dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }
+attributes #4 = { nounwind "target-features"="-dx10-clamp,+fp-exceptions" "no-nans-fp-math"="false" }
diff --git a/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll b/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
index 9b4b61cf728ab55ea9bc22e52b43680fb3a63051..208d97feb64283de88c1f6cb0deed9d367e60fa0 100644
--- a/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
+++ b/test/CodeGen/AMDGPU/cndmask-no-def-vcc.ll
@@ -8,7 +8,7 @@ declare i1 @llvm.amdgcn.class.f32(float, i32)
 ; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
-define void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) {
+define amdgpu_kernel void @vcc_shrink_vcc_def(float %arg, i32 %arg1, float %arg2, i32 %arg3) {
 bb0:
   %tmp = icmp sgt i32 %arg1, 4
   %c = icmp eq i32 %arg3, 0
@@ -35,7 +35,7 @@ bb2:
 ; GCN-NOT: vcc
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e64 v0, 0, 1, s{{\[[0-9]+:[0-9]+\]}}
-define void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
+define amdgpu_kernel void @preserve_condition_undef_flag(float %arg, i32 %arg1, float %arg2) {
 bb0:
   %tmp = icmp sgt i32 %arg1, 4
   %undef = call i1 @llvm.amdgcn.class.f32(float undef, i32 undef)
diff --git a/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll b/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll
index 7ff133b86e72b7b35389f5efd470c5ab2e46636e..ef1b3d25f88337a516bf1660474367b0b81425da 100644
--- a/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll
+++ b/test/CodeGen/AMDGPU/coalescer-subrange-crash.ll
@@ -1,5 +1,4 @@
-; RUN: llc -march=amdgcn < %s | FileCheck %s
-; REQUIRES: asserts
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
 ;
 ; This testcase used to cause the following crash:
 ;
@@ -18,14 +17,16 @@
 ;
 ; Test for a valid output:
 ; CHECK: image_sample_c_d_o
-
-target triple = "amdgcn--"
-
 define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg, [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg1, [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg2, [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg3, [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg4, float inreg %arg5, i32 inreg %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <3 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, <2 x i32> %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, i32 %arg20, float %arg21, i32 %arg22) #0 {
 main_body:
-  %tmp = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %arg6, <2 x i32> %arg8)
-  %tmp23 = fadd float %tmp, 0xBFA99999A0000000
-  %tmp24 = fadd float %tmp, 0x3FA99999A0000000
+  %i.i = extractelement <2 x i32> %arg8, i32 0
+  %j.i = extractelement <2 x i32> %arg8, i32 1
+  %i.f.i = bitcast i32 %i.i to float
+  %j.f.i = bitcast i32 %j.i to float
+  %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 3, i32 0, i32 %arg6) #1
+  %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 3, i32 0, i32 %arg6) #1
+  %tmp23 = fadd float %p2.i, 0xBFA99999A0000000
+  %tmp24 = fadd float %p2.i, 0x3FA99999A0000000
   %tmp25 = bitcast float %tmp23 to i32
   %tmp26 = insertelement <16 x i32> <i32 212739, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, i32 %tmp25, i32 1
   %tmp27 = insertelement <16 x i32> %tmp26, i32 undef, i32 2
@@ -35,7 +36,8 @@ main_body:
   %tmp31 = insertelement <16 x i32> %tmp30, i32 undef, i32 6
   %tmp32 = insertelement <16 x i32> %tmp31, i32 undef, i32 7
   %tmp33 = insertelement <16 x i32> %tmp32, i32 undef, i32 8
-  %tmp34 = call <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32> %tmp33, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+  %tmp33.bc = bitcast <16 x i32> %tmp33 to <16 x float>
+  %tmp34 = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v16f32.v8i32(<16 x float> %tmp33.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 true)
   %tmp35 = extractelement <4 x float> %tmp34, i32 0
   %tmp36 = bitcast float %tmp24 to i32
   %tmp37 = insertelement <16 x i32> <i32 212739, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>, i32 %tmp36, i32 1
@@ -46,7 +48,8 @@ main_body:
   %tmp42 = insertelement <16 x i32> %tmp41, i32 undef, i32 6
   %tmp43 = insertelement <16 x i32> %tmp42, i32 undef, i32 7
   %tmp44 = insertelement <16 x i32> %tmp43, i32 undef, i32 8
-  %tmp45 = call <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32> %tmp44, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
+  %tmp44.bc = bitcast <16 x i32> %tmp44 to <16 x float>
+  %tmp45 = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v16f32.v8i32(<16 x float> %tmp44.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 true)
   %tmp46 = extractelement <4 x float> %tmp45, i32 0
   %tmp47 = fmul float %tmp35, %tmp46
   %tmp48 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, float %tmp47, 14
@@ -54,9 +57,10 @@ main_body:
   ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> %tmp49
 }
 
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
-declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.d.o.v16i32(<16 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v16f32.v8i32(<16 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
 
-attributes #0 = { "InitialPSInputAddr"="36983" "target-cpu"="tonga" }
+attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/coalescer_remat.ll b/test/CodeGen/AMDGPU/coalescer_remat.ll
index 4c7875c3a0394688b6e8179de4b33c9054da9ec2..3e1b76a1df09453533c7990fea4022e572c852b1 100644
--- a/test/CodeGen/AMDGPU/coalescer_remat.ll
+++ b/test/CodeGen/AMDGPU/coalescer_remat.ll
@@ -13,7 +13,7 @@ declare float @llvm.fma.f32(float, float, float)
 ; CHECK:  v_mov_b32_e32 v{{[0-9]+}}, 0
 ; It's probably OK if this is slightly higher:
 ; CHECK: ; NumVgprs: 8
-define void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
+define amdgpu_kernel void @foobar(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %flag) {
 entry:
   %cmpflag = icmp eq i32 %flag, 1
   br i1 %cmpflag, label %loop, label %exit
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll
new file mode 100644
index 0000000000000000000000000000000000000000..88ba310a92caef8b7d4addc768149c07a3f75852
--- /dev/null
+++ b/test/CodeGen/AMDGPU/code-object-metadata-from-llvm-ir-full.ll
@@ -0,0 +1,1260 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX800 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -amdgpu-dump-comd -amdgpu-verify-comd -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -amdgpu-dump-comd -amdgpu-verify-comd -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-dump-comd -amdgpu-verify-comd -filetype=obj -o - < %s 2>&1 | FileCheck --check-prefix=PARSER %s
+
+%struct.A = type { i8, float }
+%opencl.image1d_t = type opaque
+%opencl.image2d_t = type opaque
+%opencl.image3d_t = type opaque
+%opencl.queue_t = type opaque
+%opencl.pipe_t = type opaque
+%struct.B = type { i32 addrspace(1)*}
+%opencl.clk_event_t = type opaque
+
+; CHECK: ---
+; CHECK:  Version: [ 1, 0 ]
+; CHECK:  Printf: [ '1:1:4:%d\n', '2:1:8:%g\n' ]
+; CHECK:  Kernels:
+
+; CHECK:      - Name:            test_char
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          1
+; CHECK-NEXT:       Align:         1
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      char
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_char(i8 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9
+    !kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:      - Name:            test_ushort2
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     U16
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      ushort2
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_ushort2(<2 x i16> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !10
+    !kernel_arg_base_type !10 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:      - Name:            test_int3
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          16
+; CHECK-NEXT:       Align:         16
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     I32
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      int3
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_int3(<3 x i32> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !11
+    !kernel_arg_base_type !11 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:      - Name:            test_ulong4
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          32
+; CHECK-NEXT:       Align:         32
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     U64
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      ulong4
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_ulong4(<4 x i64> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !12
+    !kernel_arg_base_type !12 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:      - Name:            test_half8
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          16
+; CHECK-NEXT:       Align:         16
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     F16
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      half8
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_half8(<8 x half> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !13
+    !kernel_arg_base_type !13 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:      - Name:            test_float16
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          64
+; CHECK-NEXT:       Align:         64
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     F32
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      float16
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_float16(<16 x float> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !14
+    !kernel_arg_base_type !14 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:      - Name:            test_double16
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          128
+; CHECK-NEXT:       Align:         128
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     F64
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      double16
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_double16(<16 x double> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !15
+    !kernel_arg_base_type !15 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:      - Name:            test_pointer
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     GlobalBuffer
+; CHECK-NEXT:       ValueType:     I32
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Global
+; CHECK-NEXT:       TypeName:      'int *'
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_pointer(i32 addrspace(1)* %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !16
+    !kernel_arg_base_type !16 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:      - Name:            test_image
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     Image
+; CHECK-NEXT:       ValueType:     Struct
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Global
+; CHECK-NEXT:       TypeName:      image2d_t
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_image(%opencl.image2d_t addrspace(1)* %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !17
+    !kernel_arg_base_type !17 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:      - Name:            test_sampler
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     Sampler
+; CHECK-NEXT:       ValueType:     I32
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      sampler_t
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_sampler(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !18
+    !kernel_arg_base_type !18 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:      - Name:            test_queue
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     Queue
+; CHECK-NEXT:       ValueType:     Struct
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Global
+; CHECK-NEXT:       TypeName:      queue_t
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_queue(%opencl.queue_t addrspace(1)* %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !19
+    !kernel_arg_base_type !19 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:      - Name:            test_struct
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     GlobalBuffer
+; CHECK-NEXT:       ValueType:     Struct
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Private
+; CHECK-NEXT:       TypeName:      struct A
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_struct(%struct.A* byval %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20
+    !kernel_arg_base_type !20 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:      - Name:            test_i128
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          16
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     Struct
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      i128
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_i128(i128 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !21
+    !kernel_arg_base_type !21 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:      - Name:            test_multi_arg
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     I32
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      int
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     I16
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      short2
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      char3
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_multi_arg(i32 %a, <2 x i16> %b, <3 x i8> %c)
+    !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !24
+    !kernel_arg_base_type !24 !kernel_arg_type_qual !25 {
+  ret void
+}
+
+; CHECK:      - Name:            test_addr_space
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     GlobalBuffer
+; CHECK-NEXT:       ValueType:     I32
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Global
+; CHECK-NEXT:       TypeName:      'int *'
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     GlobalBuffer
+; CHECK-NEXT:       ValueType:     I32
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Constant
+; CHECK-NEXT:       TypeName:      'int *'
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     DynamicSharedPointer
+; CHECK-NEXT:       ValueType:     I32
+; CHECK-NEXT:       PointeeAlign:  4
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Local
+; CHECK-NEXT:       TypeName:      'int *'
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_addr_space(i32 addrspace(1)* %g,
+                                           i32 addrspace(2)* %c,
+                                           i32 addrspace(3)* %l)
+    !kernel_arg_addr_space !50 !kernel_arg_access_qual !23 !kernel_arg_type !51
+    !kernel_arg_base_type !51 !kernel_arg_type_qual !25 {
+  ret void
+}
+
+; CHECK:      - Name:            test_type_qual
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     GlobalBuffer
+; CHECK-NEXT:       ValueType:     I32
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Global
+; CHECK-NEXT:       IsVolatile:    true
+; CHECK-NEXT:       TypeName:      'int *'
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     GlobalBuffer
+; CHECK-NEXT:       ValueType:     I32
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Global
+; CHECK-NEXT:       IsConst:       true
+; CHECK-NEXT:       IsRestrict:    true
+; CHECK-NEXT:       TypeName:      'int *'
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     Pipe
+; CHECK-NEXT:       ValueType:     Struct
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Global
+; CHECK-NEXT:       IsPipe:        true
+; CHECK-NEXT:       TypeName:      'int *'
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_type_qual(i32 addrspace(1)* %a,
+                                          i32 addrspace(1)* %b,
+                                          %opencl.pipe_t addrspace(1)* %c)
+    !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !51
+    !kernel_arg_base_type !51 !kernel_arg_type_qual !70 {
+  ret void
+}
+
+; CHECK:      - Name:            test_access_qual
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     Image
+; CHECK-NEXT:       ValueType:     Struct
+; CHECK-NEXT:       AccQual:       ReadOnly
+; CHECK-NEXT:       AddrSpaceQual: Global
+; CHECK-NEXT:       TypeName:      image1d_t
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     Image
+; CHECK-NEXT:       ValueType:     Struct
+; CHECK-NEXT:       AccQual:       WriteOnly
+; CHECK-NEXT:       AddrSpaceQual: Global
+; CHECK-NEXT:       TypeName:      image2d_t
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     Image
+; CHECK-NEXT:       ValueType:     Struct
+; CHECK-NEXT:       AccQual:       ReadWrite
+; CHECK-NEXT:       AddrSpaceQual: Global
+; CHECK-NEXT:       TypeName:      image3d_t
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_access_qual(%opencl.image1d_t addrspace(1)* %ro,
+                                            %opencl.image2d_t addrspace(1)* %wo,
+                                            %opencl.image3d_t addrspace(1)* %rw)
+    !kernel_arg_addr_space !60 !kernel_arg_access_qual !61 !kernel_arg_type !62
+    !kernel_arg_base_type !62 !kernel_arg_type_qual !25 {
+  ret void
+}
+
+; CHECK:      - Name:            test_vec_type_hint_half
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Attrs:
+; CHECK-NEXT:       VecTypeHint:   half
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     I32
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      int
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_vec_type_hint_half(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !26 {
+  ret void
+}
+
+; CHECK:      - Name:            test_vec_type_hint_float
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Attrs:
+; CHECK-NEXT:       VecTypeHint:   float
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     I32
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      int
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_vec_type_hint_float(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !27 {
+  ret void
+}
+
+; CHECK:      - Name:            test_vec_type_hint_double
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Attrs:
+; CHECK-NEXT:       VecTypeHint:   double
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     I32
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      int
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_vec_type_hint_double(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !28 {
+  ret void
+}
+
+; CHECK:      - Name:            test_vec_type_hint_char
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Attrs:
+; CHECK-NEXT:       VecTypeHint:   char
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     I32
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      int
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_vec_type_hint_char(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !29 {
+  ret void
+}
+
+; CHECK:      - Name:            test_vec_type_hint_short
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Attrs:
+; CHECK-NEXT:       VecTypeHint:   short
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     I32
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      int
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_vec_type_hint_short(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !30 {
+  ret void
+}
+
+; CHECK:      - Name:            test_vec_type_hint_long
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Attrs:
+; CHECK-NEXT:       VecTypeHint:   long
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     I32
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      int
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_vec_type_hint_long(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !31 {
+  ret void
+}
+
+; CHECK:      - Name:            test_vec_type_hint_unknown
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Attrs:
+; CHECK-NEXT:       VecTypeHint:   unknown
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     I32
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      int
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_vec_type_hint_unknown(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !32 {
+  ret void
+}
+
+; CHECK:      - Name:            test_reqd_wgs_vec_type_hint
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Attrs:
+; CHECK-NEXT:       ReqdWorkGroupSize: [ 1, 2, 4 ]
+; CHECK-NEXT:       VecTypeHint:       int
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:              4
+; CHECK-NEXT:       Align:             4
+; CHECK-NEXT:       ValueKind:         ByValue
+; CHECK-NEXT:       ValueType:         I32
+; CHECK-NEXT:       AccQual:           Default
+; CHECK-NEXT:       TypeName:          int
+; CHECK-NEXT:     - Size:              8
+; CHECK-NEXT:       Align:             8
+; CHECK-NEXT:       ValueKind:         HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:         I64
+; CHECK-NEXT:     - Size:              8
+; CHECK-NEXT:       Align:             8
+; CHECK-NEXT:       ValueKind:         HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:         I64
+; CHECK-NEXT:     - Size:              8
+; CHECK-NEXT:       Align:             8
+; CHECK-NEXT:       ValueKind:         HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:         I64
+; CHECK-NEXT:     - Size:              8
+; CHECK-NEXT:       Align:             8
+; CHECK-NEXT:       ValueKind:         HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:         I8
+; CHECK-NEXT:       AddrSpaceQual:     Global
+define amdgpu_kernel void @test_reqd_wgs_vec_type_hint(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !5
+    !reqd_work_group_size !6 {
+  ret void
+}
+
+; CHECK:      - Name:            test_wgs_hint_vec_type_hint
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Attrs:
+; CHECK-NEXT:       WorkGroupSizeHint: [ 8, 16, 32 ]
+; CHECK-NEXT:       VecTypeHint:       uint4
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:              4
+; CHECK-NEXT:       Align:             4
+; CHECK-NEXT:       ValueKind:         ByValue
+; CHECK-NEXT:       ValueType:         I32
+; CHECK-NEXT:       AccQual:           Default
+; CHECK-NEXT:       TypeName:          int
+; CHECK-NEXT:     - Size:              8
+; CHECK-NEXT:       Align:             8
+; CHECK-NEXT:       ValueKind:         HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:         I64
+; CHECK-NEXT:     - Size:              8
+; CHECK-NEXT:       Align:             8
+; CHECK-NEXT:       ValueKind:         HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:         I64
+; CHECK-NEXT:     - Size:              8
+; CHECK-NEXT:       Align:             8
+; CHECK-NEXT:       ValueKind:         HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:         I64
+; CHECK-NEXT:     - Size:              8
+; CHECK-NEXT:       Align:             8
+; CHECK-NEXT:       ValueKind:         HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:         I8
+; CHECK-NEXT:       AddrSpaceQual:     Global
+define amdgpu_kernel void @test_wgs_hint_vec_type_hint(i32 %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3
+    !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !7
+    !work_group_size_hint !8 {
+  ret void
+}
+
+; CHECK:      - Name:            test_arg_ptr_to_ptr
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     GlobalBuffer
+; CHECK-NEXT:       ValueType:     I32
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Global
+; CHECK-NEXT:       TypeName:      'int **'
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_arg_ptr_to_ptr(i32* addrspace(1)* %a)
+    !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !80
+    !kernel_arg_base_type !80 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:      - Name:            test_arg_struct_contains_ptr
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     GlobalBuffer
+; CHECK-NEXT:       ValueType:     Struct
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Private
+; CHECK-NEXT:       TypeName:      struct B
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_arg_struct_contains_ptr(%struct.B* byval %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !82
+    !kernel_arg_base_type !82 !kernel_arg_type_qual !4 {
+ ret void
+}
+
+; CHECK:      - Name:            test_arg_vector_of_ptr
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          16
+; CHECK-NEXT:       Align:         16
+; CHECK-NEXT:       ValueKind:     ByValue
+; CHECK-NEXT:       ValueType:     I32
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       TypeName:      'global int* __attribute__((ext_vector_type(2)))'
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_arg_vector_of_ptr(<2 x i32 addrspace(1)*> %a)
+    !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !83
+    !kernel_arg_base_type !83 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:      - Name:            test_arg_unknown_builtin_type
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     GlobalBuffer
+; CHECK-NEXT:       ValueType:     Struct
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Global
+; CHECK-NEXT:       TypeName:      clk_event_t
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_arg_unknown_builtin_type(
+    %opencl.clk_event_t addrspace(1)* %a)
+    !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !84
+    !kernel_arg_base_type !84 !kernel_arg_type_qual !4 {
+  ret void
+}
+
+; CHECK:      - Name:            test_pointee_align
+; CHECK-NEXT:   Language:        OpenCL C
+; CHECK-NEXT:   LanguageVersion: [ 2, 0 ]
+; CHECK-NEXT:   Args:
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     GlobalBuffer
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Global
+; CHECK-NEXT:       TypeName:      'long *'
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     DynamicSharedPointer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       PointeeAlign:  1
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Local
+; CHECK-NEXT:       TypeName:      'char *'
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     DynamicSharedPointer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       PointeeAlign:  2
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Local
+; CHECK-NEXT:       TypeName:      'char2 *'
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     DynamicSharedPointer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       PointeeAlign:  4
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Local
+; CHECK-NEXT:       TypeName:      'char3 *'
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     DynamicSharedPointer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       PointeeAlign:  4
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Local
+; CHECK-NEXT:       TypeName:      'char4 *'
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     DynamicSharedPointer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       PointeeAlign:  8
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Local
+; CHECK-NEXT:       TypeName:      'char8 *'
+; CHECK-NEXT:     - Size:          4
+; CHECK-NEXT:       Align:         4
+; CHECK-NEXT:       ValueKind:     DynamicSharedPointer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       PointeeAlign:  16
+; CHECK-NEXT:       AccQual:       Default
+; CHECK-NEXT:       AddrSpaceQual: Local
+; CHECK-NEXT:       TypeName:      'char16 *'
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetX
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetY
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenGlobalOffsetZ
+; CHECK-NEXT:       ValueType:     I64
+; CHECK-NEXT:     - Size:          8
+; CHECK-NEXT:       Align:         8
+; CHECK-NEXT:       ValueKind:     HiddenPrintfBuffer
+; CHECK-NEXT:       ValueType:     I8
+; CHECK-NEXT:       AddrSpaceQual: Global
+define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a,
+                                              i8 addrspace(3)* %b,
+                                              <2 x i8> addrspace(3)* %c,
+                                              <3 x i8> addrspace(3)* %d,
+                                              <4 x i8> addrspace(3)* %e,
+                                              <8 x i8> addrspace(3)* %f,
+                                              <16 x i8> addrspace(3)* %g)
+    !kernel_arg_addr_space !91 !kernel_arg_access_qual !92 !kernel_arg_type !93
+    !kernel_arg_base_type !93 !kernel_arg_type_qual !94 {
+  ret void
+}
+
+!llvm.printf.fmts = !{!100, !101}
+
+!1 = !{i32 0}
+!2 = !{!"none"}
+!3 = !{!"int"}
+!4 = !{!""}
+!5 = !{i32 undef, i32 1}
+!6 = !{i32 1, i32 2, i32 4}
+!7 = !{<4 x i32> undef, i32 0}
+!8 = !{i32 8, i32 16, i32 32}
+!9 = !{!"char"}
+!10 = !{!"ushort2"}
+!11 = !{!"int3"}
+!12 = !{!"ulong4"}
+!13 = !{!"half8"}
+!14 = !{!"float16"}
+!15 = !{!"double16"}
+!16 = !{!"int *"}
+!17 = !{!"image2d_t"}
+!18 = !{!"sampler_t"}
+!19 = !{!"queue_t"}
+!20 = !{!"struct A"}
+!21 = !{!"i128"}
+!22 = !{i32 0, i32 0, i32 0}
+!23 = !{!"none", !"none", !"none"}
+!24 = !{!"int", !"short2", !"char3"}
+!25 = !{!"", !"", !""}
+!26 = !{half undef, i32 1}
+!27 = !{float undef, i32 1}
+!28 = !{double undef, i32 1}
+!29 = !{i8 undef, i32 1}
+!30 = !{i16 undef, i32 1}
+!31 = !{i64 undef, i32 1}
+!32 = !{i32 *undef, i32 1}
+!50 = !{i32 1, i32 2, i32 3}
+!51 = !{!"int *", !"int *", !"int *"}
+!60 = !{i32 1, i32 1, i32 1}
+!61 = !{!"read_only", !"write_only", !"read_write"}
+!62 = !{!"image1d_t", !"image2d_t", !"image3d_t"}
+!70 = !{!"volatile", !"const restrict", !"pipe"}
+!80 = !{!"int **"}
+!81 = !{i32 1}
+!82 = !{!"struct B"}
+!83 = !{!"global int* __attribute__((ext_vector_type(2)))"}
+!84 = !{!"clk_event_t"}
+!opencl.ocl.version = !{!90}
+!90 = !{i32 2, i32 0}
+!91 = !{i32 0, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3}
+!92 = !{!"none", !"none", !"none", !"none", !"none", !"none", !"none"}
+!93 = !{!"long *", !"char *", !"char2 *", !"char3 *", !"char4 *", !"char8 *", !"char16 *"}
+!94 = !{!"", !"", !"", !"", !"", !"", !""}
+!100 = !{!"1:1:4:%d\5Cn"}
+!101 = !{!"2:1:8:%g\5Cn"}
+
+; NOTES: Displaying notes found at file offset 0x{{[0-9]+}}
+; NOTES-NEXT: Owner    Data size    Description
+; NOTES-NEXT: AMD      0x00000008   Unknown note type: (0x00000001)
+; NOTES-NEXT: AMD      0x0000001b   Unknown note type: (0x00000003)
+; GFX700:     AMD      0x00009171   Unknown note type: (0x0000000a)
+; GFX800:     AMD      0x00009190   Unknown note type: (0x0000000a)
+; GFX900:     AMD      0x00009171   Unknown note type: (0x0000000a)
+
+; PARSER: AMDGPU Code Object Metadata Parser Test: PASS
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-1.ll b/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-1.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f41da9f921361c398671b8a10cf15090e080f47a
--- /dev/null
+++ b/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-1.ll
@@ -0,0 +1,9 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata | FileCheck %s
+
+; Make sure llc does not crash on invalid OpenCL version metadata (no operands).
+
+; CHECK: ---
+; CHECK: Version: [ 1, 0 ]
+; CHECK: ...
+
+!opencl.ocl.version = !{}
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-2.ll b/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0509663d9849a044b28b7c0a38103518a83ddec5
--- /dev/null
+++ b/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-2.ll
@@ -0,0 +1,10 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata | FileCheck %s
+
+; Make sure llc does not crash on invalid OpenCL version metadata (empty version node).
+
+; CHECK: ---
+; CHECK: Version: [ 1, 0 ]
+; CHECK: ...
+
+!opencl.ocl.version = !{!0}
+!0 = !{}
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-3.ll b/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-3.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7404cec5d78acd61f32aa5b680289d1862ed4088
--- /dev/null
+++ b/test/CodeGen/AMDGPU/code-object-metadata-invalid-ocl-version-3.ll
@@ -0,0 +1,10 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata | FileCheck %s
+
+; Make sure llc does not crash on invalid OpenCL version metadata (version node with a single operand).
+
+; CHECK: ---
+; CHECK: Version: [ 1, 0 ]
+; CHECK: ...
+
+!opencl.ocl.version = !{!0}
+!0 = !{i32 1}
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-kernel-code-props.ll b/test/CodeGen/AMDGPU/code-object-metadata-kernel-code-props.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3b232e40cf25c0ad1f4413c1105107d2f39bc789
--- /dev/null
+++ b/test/CodeGen/AMDGPU/code-object-metadata-kernel-code-props.ll
@@ -0,0 +1,32 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX800 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+
+; CHECK: ---
+; CHECK:  Version: [ 1, 0 ]
+
+; CHECK:  Kernels:
+; CHECK:    - Name: test
+; CHECK:      CodeProps:
+; CHECK:        KernargSegmentSize:  24
+; GFX700:       WavefrontNumSGPRs:   6
+; GFX800:       WavefrontNumSGPRs:   96
+; GFX900:       WavefrontNumSGPRs:   6
+; GFX700:       WorkitemNumVGPRs:    4
+; GFX800:       WorkitemNumVGPRs:    6
+; GFX900:       WorkitemNumVGPRs:    6
+; CHECK:        KernargSegmentAlign: 4
+; CHECK:        GroupSegmentAlign:   4
+; CHECK:        PrivateSegmentAlign: 4
+; CHECK:        WavefrontSize:       6
+define amdgpu_kernel void @test(
+    half addrspace(1)* %r,
+    half addrspace(1)* %a,
+    half addrspace(1)* %b) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %b.val = load half, half addrspace(1)* %b
+  %r.val = fadd half %a.val, %b.val
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll b/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll
new file mode 100644
index 0000000000000000000000000000000000000000..801029be8cb9fa86c1832d20b0280a8ef53f43e8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/code-object-metadata-kernel-debug-props.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX700 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX800 --check-prefix=NOTES %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj -o - < %s | llvm-readobj -amdgpu-code-object-metadata -elf-output-style=GNU -notes | FileCheck --check-prefix=CHECK --check-prefix=GFX900 --check-prefix=NOTES %s
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+; CHECK: ---
+; CHECK:  Version: [ 1, 0 ]
+
+; CHECK:  Kernels:
+; CHECK:    - Name: test
+; CHECK:      DebugProps:
+; CHECK:        DebuggerABIVersion:                [ 1, 0 ]
+; CHECK:        ReservedNumVGPRs:                  4
+; CHECK:        ReservedFirstVGPR:                 11
+; CHECK:        PrivateSegmentBufferSGPR:          0
+; CHECK:        WavefrontPrivateSegmentOffsetSGPR: 11
+define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !7 !kernel_arg_addr_space !12 !kernel_arg_access_qual !13 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !15 {
+entry:
+  %A.addr = alloca i32 addrspace(1)*, align 4
+  store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !16, metadata !17), !dbg !18
+  %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !19
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 0, !dbg !19
+  store i32 777, i32 addrspace(1)* %arrayidx, align 4, !dbg !20
+  %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !21
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i64 1, !dbg !21
+  store i32 888, i32 addrspace(1)* %arrayidx1, align 4, !dbg !22
+  %2 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !23
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %2, i64 2, !dbg !23
+  store i32 999, i32 addrspace(1)* %arrayidx2, align 4, !dbg !24
+  ret void, !dbg !25
+}
+
+attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx800" "target-features"="+16-bit-insts,+amdgpu-debugger-emit-prologue,+amdgpu-debugger-insert-nops,+amdgpu-debugger-reserve-regs,+dpp,+fp64-fp16-denormals,+s-memrealtime,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.dbg.cu = !{!0}
+!opencl.ocl.version = !{!3}
+!llvm.module.flags = !{!4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "code-object-metadata-kernel-debug-props.cl", directory: "/some/random/directory")
+!2 = !{}
+!3 = !{i32 1, i32 0}
+!4 = !{i32 2, !"Dwarf Version", i32 2}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = !{!"clang version 5.0.0"}
+!7 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10}
+!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64)
+!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!12 = !{i32 1}
+!13 = !{!"none"}
+!14 = !{!"int*"}
+!15 = !{!""}
+!16 = !DILocalVariable(name: "A", arg: 1, scope: !7, file: !1, line: 1, type: !10)
+!17 = !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef)
+!18 = !DILocation(line: 1, column: 30, scope: !7)
+!19 = !DILocation(line: 2, column: 3, scope: !7)
+!20 = !DILocation(line: 2, column: 8, scope: !7)
+!21 = !DILocation(line: 3, column: 3, scope: !7)
+!22 = !DILocation(line: 3, column: 8, scope: !7)
+!23 = !DILocation(line: 4, column: 3, scope: !7)
+!24 = !DILocation(line: 4, column: 8, scope: !7)
+!25 = !DILocation(line: 5, column: 1, scope: !7)
diff --git a/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll b/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll
index 5851720926762beb15995edd0a9c66139d2770cd..155de5353bcb4031233f72653603dbfe46477e4e 100644
--- a/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll
+++ b/test/CodeGen/AMDGPU/codegen-prepare-addrmode-sext.ll
@@ -8,7 +8,7 @@
 ; SI-LLC-LABEL: {{^}}test:
 ; SI-LLC: s_mul_i32
 ; SI-LLC-NOT: mul
-define void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) {
+define amdgpu_kernel void @test(i8 addrspace(1)* nocapture readonly %in, i32 %a, i8 %b) {
 entry:
   %0 = mul nsw i32 %a, 3
   %1 = sext i32 %0 to i64
diff --git a/test/CodeGen/AMDGPU/combine_vloads.ll b/test/CodeGen/AMDGPU/combine_vloads.ll
index 01572afa620530199f7bb09dcb845fb5b83daf96..f8d4e01085c265f3754aa8c7635a991cf012d504 100644
--- a/test/CodeGen/AMDGPU/combine_vloads.ll
+++ b/test/CodeGen/AMDGPU/combine_vloads.ll
@@ -12,7 +12,7 @@
 ; EG-LABEL: {{^}}combine_vloads:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind {
+define amdgpu_kernel void @combine_vloads(<8 x i8> addrspace(1)* nocapture %src, <8 x i8> addrspace(1)* nocapture %result) nounwind {
 entry:
   br label %for.body
 
diff --git a/test/CodeGen/AMDGPU/commute-compares.ll b/test/CodeGen/AMDGPU/commute-compares.ll
index a4c51b233f4160e730ea681df9fe444c16874d18..973c4544d97a704bb113393db0829e4f84831173 100644
--- a/test/CodeGen/AMDGPU/commute-compares.ll
+++ b/test/CodeGen/AMDGPU/commute-compares.ll
@@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 ; GCN-LABEL: {{^}}commute_eq_64_i32:
 ; GCN: v_cmp_eq_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -21,7 +21,7 @@ define void @commute_eq_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1
 
 ; GCN-LABEL: {{^}}commute_ne_64_i32:
 ; GCN: v_cmp_ne_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -36,7 +36,7 @@ define void @commute_ne_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1
 ; GCN-LABEL: {{^}}commute_ne_litk_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x3039
 ; GCN: v_cmp_ne_u32_e32 vcc, [[K]], v{{[0-9]+}}
-define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -49,7 +49,7 @@ define void @commute_ne_litk_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 
 ; GCN-LABEL: {{^}}commute_ugt_64_i32:
 ; GCN: v_cmp_lt_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -62,7 +62,7 @@ define void @commute_ugt_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
 
 ; GCN-LABEL: {{^}}commute_uge_64_i32:
 ; GCN: v_cmp_lt_u32_e32 vcc, 63, v{{[0-9]+}}
-define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -75,7 +75,7 @@ define void @commute_uge_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
 
 ; GCN-LABEL: {{^}}commute_ult_64_i32:
 ; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -88,7 +88,7 @@ define void @commute_ult_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
 
 ; GCN-LABEL: {{^}}commute_ule_63_i32:
 ; GCN: v_cmp_gt_u32_e32 vcc, 64, v{{[0-9]+}}
-define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -104,7 +104,7 @@ define void @commute_ule_63_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
 ; GCN-LABEL: {{^}}commute_ule_64_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x41{{$}}
 ; GCN: v_cmp_gt_u32_e32 vcc, [[K]], v{{[0-9]+}}
-define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -117,7 +117,7 @@ define void @commute_ule_64_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #
 
 ; GCN-LABEL: {{^}}commute_sgt_neg1_i32:
 ; GCN: v_cmp_lt_i32_e32 vcc, -1, v{{[0-9]+}}
-define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -130,7 +130,7 @@ define void @commute_sgt_neg1_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 
 ; GCN-LABEL: {{^}}commute_sge_neg2_i32:
 ; GCN: v_cmp_lt_i32_e32 vcc, -3, v{{[0-9]+}}
-define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -143,7 +143,7 @@ define void @commute_sge_neg2_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 
 ; GCN-LABEL: {{^}}commute_slt_neg16_i32:
 ; GCN: v_cmp_gt_i32_e32 vcc, -16, v{{[0-9]+}}
-define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -156,7 +156,7 @@ define void @commute_slt_neg16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in
 
 ; GCN-LABEL: {{^}}commute_sle_5_i32:
 ; GCN: v_cmp_gt_i32_e32 vcc, 6, v{{[0-9]+}}
-define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -173,7 +173,7 @@ define void @commute_sle_5_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #1
 
 ; GCN-LABEL: {{^}}commute_eq_64_i64:
 ; GCN: v_cmp_eq_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -186,7 +186,7 @@ define void @commute_eq_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1
 
 ; GCN-LABEL: {{^}}commute_ne_64_i64:
 ; GCN: v_cmp_ne_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -199,7 +199,7 @@ define void @commute_ne_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1
 
 ; GCN-LABEL: {{^}}commute_ugt_64_i64:
 ; GCN: v_cmp_lt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -212,7 +212,7 @@ define void @commute_ugt_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
 
 ; GCN-LABEL: {{^}}commute_uge_64_i64:
 ; GCN: v_cmp_lt_u64_e32 vcc, 63, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -225,7 +225,7 @@ define void @commute_uge_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
 
 ; GCN-LABEL: {{^}}commute_ult_64_i64:
 ; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -238,7 +238,7 @@ define void @commute_ult_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
 
 ; GCN-LABEL: {{^}}commute_ule_63_i64:
 ; GCN: v_cmp_gt_u64_e32 vcc, 64, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -254,7 +254,7 @@ define void @commute_ule_63_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
 ; GCN-LABEL: {{^}}commute_ule_64_i64:
 ; GCN-DAG: s_movk_i32 s[[KLO:[0-9]+]], 0x41{{$}}
 ; GCN: v_cmp_gt_u64_e32 vcc, s{{\[}}[[KLO]]:{{[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -267,7 +267,7 @@ define void @commute_ule_64_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #
 
 ; GCN-LABEL: {{^}}commute_sgt_neg1_i64:
 ; GCN: v_cmp_lt_i64_e32 vcc, -1, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -280,7 +280,7 @@ define void @commute_sgt_neg1_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in)
 
 ; GCN-LABEL: {{^}}commute_sge_neg2_i64:
 ; GCN: v_cmp_lt_i64_e32 vcc, -3, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -293,7 +293,7 @@ define void @commute_sge_neg2_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in)
 
 ; GCN-LABEL: {{^}}commute_slt_neg16_i64:
 ; GCN: v_cmp_gt_i64_e32 vcc, -16, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -306,7 +306,7 @@ define void @commute_slt_neg16_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in
 
 ; GCN-LABEL: {{^}}commute_sle_5_i64:
 ; GCN: v_cmp_gt_i64_e32 vcc, 6, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -324,7 +324,7 @@ define void @commute_sle_5_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1
 
 ; GCN-LABEL: {{^}}commute_oeq_2.0_f32:
 ; GCN: v_cmp_eq_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -338,7 +338,7 @@ define void @commute_oeq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
 
 ; GCN-LABEL: {{^}}commute_ogt_2.0_f32:
 ; GCN: v_cmp_lt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -351,7 +351,7 @@ define void @commute_ogt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
 
 ; GCN-LABEL: {{^}}commute_oge_2.0_f32:
 ; GCN: v_cmp_le_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -364,7 +364,7 @@ define void @commute_oge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
 
 ; GCN-LABEL: {{^}}commute_olt_2.0_f32:
 ; GCN: v_cmp_gt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -377,7 +377,7 @@ define void @commute_olt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
 
 ; GCN-LABEL: {{^}}commute_ole_2.0_f32:
 ; GCN: v_cmp_ge_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -390,7 +390,7 @@ define void @commute_ole_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
 
 ; GCN-LABEL: {{^}}commute_one_2.0_f32:
 ; GCN: v_cmp_lg_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -403,7 +403,7 @@ define void @commute_one_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
 
 ; GCN-LABEL: {{^}}commute_ord_2.0_f32:
 ; GCN: v_cmp_o_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
-define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -416,7 +416,7 @@ define void @commute_ord_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
 
 ; GCN-LABEL: {{^}}commute_ueq_2.0_f32:
 ; GCN: v_cmp_nlg_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -429,7 +429,7 @@ define void @commute_ueq_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
 
 ; GCN-LABEL: {{^}}commute_ugt_2.0_f32:
 ; GCN: v_cmp_nge_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -442,7 +442,7 @@ define void @commute_ugt_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
 
 ; GCN-LABEL: {{^}}commute_uge_2.0_f32:
 ; GCN: v_cmp_ngt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -455,7 +455,7 @@ define void @commute_uge_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
 
 ; GCN-LABEL: {{^}}commute_ult_2.0_f32:
 ; GCN: v_cmp_nle_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -468,7 +468,7 @@ define void @commute_ult_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
 
 ; GCN-LABEL: {{^}}commute_ule_2.0_f32:
 ; GCN: v_cmp_nlt_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -481,7 +481,7 @@ define void @commute_ule_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
 
 ; GCN-LABEL: {{^}}commute_une_2.0_f32:
 ; GCN: v_cmp_neq_f32_e32 vcc, 2.0, v{{[0-9]+}}
-define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -494,7 +494,7 @@ define void @commute_une_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
 
 ; GCN-LABEL: {{^}}commute_uno_2.0_f32:
 ; GCN: v_cmp_u_f32_e32 vcc, [[REG:v[0-9]+]], [[REG]]
-define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -512,7 +512,7 @@ define void @commute_uno_2.0_f32(i32 addrspace(1)* %out, float addrspace(1)* %in
 
 ; GCN-LABEL: {{^}}commute_oeq_2.0_f64:
 ; GCN: v_cmp_eq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -526,7 +526,7 @@ define void @commute_oeq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
 
 ; GCN-LABEL: {{^}}commute_ogt_2.0_f64:
 ; GCN: v_cmp_lt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -539,7 +539,7 @@ define void @commute_ogt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
 
 ; GCN-LABEL: {{^}}commute_oge_2.0_f64:
 ; GCN: v_cmp_le_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -552,7 +552,7 @@ define void @commute_oge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
 
 ; GCN-LABEL: {{^}}commute_olt_2.0_f64:
 ; GCN: v_cmp_gt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -565,7 +565,7 @@ define void @commute_olt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
 
 ; GCN-LABEL: {{^}}commute_ole_2.0_f64:
 ; GCN: v_cmp_ge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -578,7 +578,7 @@ define void @commute_ole_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
 
 ; GCN-LABEL: {{^}}commute_one_2.0_f64:
 ; GCN: v_cmp_lg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -591,7 +591,7 @@ define void @commute_one_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
 
 ; GCN-LABEL: {{^}}commute_ord_2.0_f64:
 ; GCN: v_cmp_o_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
-define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -604,7 +604,7 @@ define void @commute_ord_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
 
 ; GCN-LABEL: {{^}}commute_ueq_2.0_f64:
 ; GCN: v_cmp_nlg_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -617,7 +617,7 @@ define void @commute_ueq_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
 
 ; GCN-LABEL: {{^}}commute_ugt_2.0_f64:
 ; GCN: v_cmp_nge_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -630,7 +630,7 @@ define void @commute_ugt_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
 
 ; GCN-LABEL: {{^}}commute_uge_2.0_f64:
 ; GCN: v_cmp_ngt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -643,7 +643,7 @@ define void @commute_uge_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
 
 ; GCN-LABEL: {{^}}commute_ult_2.0_f64:
 ; GCN: v_cmp_nle_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -656,7 +656,7 @@ define void @commute_ult_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
 
 ; GCN-LABEL: {{^}}commute_ule_2.0_f64:
 ; GCN: v_cmp_nlt_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -669,7 +669,7 @@ define void @commute_ule_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
 
 ; GCN-LABEL: {{^}}commute_une_2.0_f64:
 ; GCN: v_cmp_neq_f64_e32 vcc, 2.0, v{{\[[0-9]+:[0-9]+\]}}
-define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -682,7 +682,7 @@ define void @commute_une_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
 
 ; GCN-LABEL: {{^}}commute_uno_2.0_f64:
 ; GCN: v_cmp_u_f64_e32 vcc, [[REG:v\[[0-9]+:[0-9]+\]]], [[REG]]
-define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -701,9 +701,9 @@ define void @commute_uno_2.0_f64(i32 addrspace(1)* %out, double addrspace(1)* %i
 ; GCN-LABEL: {{^}}commute_frameindex:
 ; XGCN: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
 
-; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 0{{$}}
+; GCN: v_mov_b32_e32 [[FI:v[0-9]+]], 4{{$}}
 ; GCN: v_cmp_eq_u32_e32 vcc, [[FI]], v{{[0-9]+}}
-define void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @commute_frameindex(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %stack0 = alloca i32
   %ptr0 = load volatile i32*, i32* addrspace(1)* undef
diff --git a/test/CodeGen/AMDGPU/commute-shifts.ll b/test/CodeGen/AMDGPU/commute-shifts.ll
index 862f236514cac493ef386b56c3a6e64fc77349b0..84d8bf2bd706a36cbe4b574be4bbe5596f7203ab 100644
--- a/test/CodeGen/AMDGPU/commute-shifts.ll
+++ b/test/CodeGen/AMDGPU/commute-shifts.ll
@@ -4,10 +4,10 @@
 ; GCN-LABEL: {{^}}main:
 ; SI: v_lshl_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
 ; VI: v_lshlrev_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, 1
-define amdgpu_ps void @main(float %arg0, float %arg1) #0 {
+define amdgpu_ps float @main(float %arg0, float %arg1) #0 {
 bb:
   %tmp = fptosi float %arg0 to i32
-  %tmp1 = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp1 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> undef, <8 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false)
   %tmp2.f = extractelement <4 x float> %tmp1, i32 0
   %tmp2 = bitcast float %tmp2.f to i32
   %tmp3 = and i32 %tmp, 7
@@ -15,15 +15,14 @@ bb:
   %tmp5 = and i32 %tmp2, %tmp4
   %tmp6 = icmp eq i32 %tmp5, 0
   %tmp7 = select i1 %tmp6, float 0.000000e+00, float %arg1
-  %tmp8 = call i32 @llvm.SI.packf16(float undef, float %tmp7)
-  %tmp9 = bitcast i32 %tmp8 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %tmp9, float undef, float %tmp9)
-  ret void
+  %tmp8 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp7)
+  %tmp9 = bitcast <2 x half> %tmp8 to float
+  ret float %tmp9
 }
 
-declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare i32 @llvm.SI.packf16(float, float) #1
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
+declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/commute_modifiers.ll b/test/CodeGen/AMDGPU/commute_modifiers.ll
index ed4ec82eb3e3d5cc7a136bc78fd5e4e25ff8fba2..8820e4fd80e56d6df48c251d50c9d104e7c74f55 100644
--- a/test/CodeGen/AMDGPU/commute_modifiers.ll
+++ b/test/CodeGen/AMDGPU/commute_modifiers.ll
@@ -8,7 +8,7 @@ declare float @llvm.fma.f32(float, float, float) nounwind readnone
 ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], |[[X]]|, 2.0
 ; SI: buffer_store_dword [[REG]]
-define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %x = load float, float addrspace(1)* %gep.0
@@ -22,7 +22,7 @@ define void @commute_add_imm_fabs_f32(float addrspace(1)* %out, float addrspace(
 ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -4.0
 ; SI: buffer_store_dword [[REG]]
-define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %x = load float, float addrspace(1)* %gep.0
@@ -37,7 +37,7 @@ define void @commute_mul_imm_fneg_fabs_f32(float addrspace(1)* %out, float addrs
 ; SI: buffer_load_dword [[X:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: v_mul_f32_e32 [[REG:v[0-9]+]], -4.0, [[X]]
 ; SI: buffer_store_dword [[REG]]
-define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %x = load float, float addrspace(1)* %gep.0
@@ -53,7 +53,7 @@ define void @commute_mul_imm_fneg_f32(float addrspace(1)* %out, float addrspace(
 ; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x44800000
 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[K]], |[[X]]|
 ; SI: buffer_store_dword [[REG]]
-define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %x = load float, float addrspace(1)* %gep.0
@@ -68,7 +68,7 @@ define void @commute_add_lit_fabs_f32(float addrspace(1)* %out, float addrspace(
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_add_f32_e64 [[REG:v[0-9]+]], [[X]], |[[Y]]|
 ; SI: buffer_store_dword [[REG]]
-define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -85,7 +85,7 @@ define void @commute_add_fabs_f32(float addrspace(1)* %out, float addrspace(1)*
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -[[Y]]
 ; SI: buffer_store_dword [[REG]]
-define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -102,7 +102,7 @@ define void @commute_mul_fneg_f32(float addrspace(1)* %out, float addrspace(1)*
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], [[X]], -|[[Y]]|
 ; SI: buffer_store_dword [[REG]]
-define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -121,7 +121,7 @@ define void @commute_mul_fabs_fneg_f32(float addrspace(1)* %out, float addrspace
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, |[[Y]]|
 ; SI: buffer_store_dword [[REG]]
-define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -139,7 +139,7 @@ define void @commute_mul_fabs_x_fabs_y_f32(float addrspace(1)* %out, float addrs
 ; SI-DAG: buffer_load_dword [[Y:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_mul_f32_e64 [[REG:v[0-9]+]], |[[X]]|, -|[[Y]]|
 ; SI: buffer_store_dword [[REG]]
-define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -161,7 +161,7 @@ define void @commute_mul_fabs_x_fneg_fabs_y_f32(float addrspace(1)* %out, float
 ; SI-DAG: buffer_load_dword [[R2:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI: v_fma_f32 [[RESULT:v[0-9]+]], [[R1]], 2.0, |[[R2]]|
 ; SI: buffer_store_dword [[RESULT]]
-define void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @fma_a_2.0_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/concat_vectors.ll b/test/CodeGen/AMDGPU/concat_vectors.ll
index 2e6be5d10f09010c6638d71730c5810b8cb82a6d..7394842d156f7ada1cad4fecde2e902604d9079b 100644
--- a/test/CodeGen/AMDGPU/concat_vectors.ll
+++ b/test/CodeGen/AMDGPU/concat_vectors.ll
@@ -8,7 +8,7 @@
 ; value if we want to ensure scratch memory is not being used.
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
   %concat = shufflevector <1 x i32> %a, <1 x i32> %b, <2 x i32> <i32 0, i32 1>
   store <2 x i32> %concat, <2 x i32> addrspace(1)* %out, align 8
   ret void
@@ -17,7 +17,7 @@ define void @test_concat_v1i32(<2 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x
 ; FUNC-LABEL: {{^}}test_concat_v2i32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
   %concat = shufflevector <2 x i32> %a, <2 x i32> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x i32> %concat, <4 x i32> addrspace(1)* %out, align 16
   ret void
@@ -26,7 +26,7 @@ define void @test_concat_v2i32(<4 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x
 ; FUNC-LABEL: {{^}}test_concat_v4i32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
   %concat = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x i32> %concat, <8 x i32> addrspace(1)* %out, align 32
   ret void
@@ -35,7 +35,7 @@ define void @test_concat_v4i32(<8 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x
 ; FUNC-LABEL: {{^}}test_concat_v8i32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
   %concat = shufflevector <8 x i32> %a, <8 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x i32> %concat, <16 x i32> addrspace(1)* %out, align 64
   ret void
@@ -44,7 +44,7 @@ define void @test_concat_v8i32(<16 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x
 ; FUNC-LABEL: {{^}}test_concat_v16i32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <16 x i32> %b) nounwind {
   %concat = shufflevector <16 x i32> %a, <16 x i32> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x i32> %concat, <32 x i32> addrspace(1)* %out, align 128
   ret void
@@ -53,7 +53,7 @@ define void @test_concat_v16i32(<32 x i32> addrspace(1)* %out, <16 x i32> %a, <1
 ; FUNC-LABEL: {{^}}test_concat_v1f32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <1 x float> %b) nounwind {
   %concat = shufflevector <1 x float> %a, <1 x float> %b, <2 x i32> <i32 0, i32 1>
   store <2 x float> %concat, <2 x float> addrspace(1)* %out, align 8
   ret void
@@ -62,7 +62,7 @@ define void @test_concat_v1f32(<2 x float> addrspace(1)* %out, <1 x float> %a, <
 ; FUNC-LABEL: {{^}}test_concat_v2f32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
   %concat = shufflevector <2 x float> %a, <2 x float> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x float> %concat, <4 x float> addrspace(1)* %out, align 16
   ret void
@@ -71,7 +71,7 @@ define void @test_concat_v2f32(<4 x float> addrspace(1)* %out, <2 x float> %a, <
 ; FUNC-LABEL: {{^}}test_concat_v4f32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
   %concat = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x float> %concat, <8 x float> addrspace(1)* %out, align 32
   ret void
@@ -80,7 +80,7 @@ define void @test_concat_v4f32(<8 x float> addrspace(1)* %out, <4 x float> %a, <
 ; FUNC-LABEL: {{^}}test_concat_v8f32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
   %concat = shufflevector <8 x float> %a, <8 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x float> %concat, <16 x float> addrspace(1)* %out, align 64
   ret void
@@ -89,7 +89,7 @@ define void @test_concat_v8f32(<16 x float> addrspace(1)* %out, <8 x float> %a,
 ; FUNC-LABEL: {{^}}test_concat_v16f32:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
   %concat = shufflevector <16 x float> %a, <16 x float> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x float> %concat, <32 x float> addrspace(1)* %out, align 128
   ret void
@@ -98,7 +98,7 @@ define void @test_concat_v16f32(<32 x float> addrspace(1)* %out, <16 x float> %a
 ; FUNC-LABEL: {{^}}test_concat_v1i64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
   %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1>
   store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16
   ret void
@@ -107,7 +107,7 @@ define void @test_concat_v1i64(<2 x double> addrspace(1)* %out, <1 x double> %a,
 ; FUNC-LABEL: {{^}}test_concat_v2i64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
   %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32
   ret void
@@ -116,7 +116,7 @@ define void @test_concat_v2i64(<4 x double> addrspace(1)* %out, <2 x double> %a,
 ; FUNC-LABEL: {{^}}test_concat_v4i64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
   %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64
   ret void
@@ -125,7 +125,7 @@ define void @test_concat_v4i64(<8 x double> addrspace(1)* %out, <4 x double> %a,
 ; FUNC-LABEL: {{^}}test_concat_v8i64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
   %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128
   ret void
@@ -134,7 +134,7 @@ define void @test_concat_v8i64(<16 x double> addrspace(1)* %out, <8 x double> %a
 ; FUNC-LABEL: {{^}}test_concat_v16i64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
   %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256
   ret void
@@ -143,7 +143,7 @@ define void @test_concat_v16i64(<32 x double> addrspace(1)* %out, <16 x double>
 ; FUNC-LABEL: {{^}}test_concat_v1f64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a, <1 x double> %b) nounwind {
   %concat = shufflevector <1 x double> %a, <1 x double> %b, <2 x i32> <i32 0, i32 1>
   store <2 x double> %concat, <2 x double> addrspace(1)* %out, align 16
   ret void
@@ -152,7 +152,7 @@ define void @test_concat_v1f64(<2 x double> addrspace(1)* %out, <1 x double> %a,
 ; FUNC-LABEL: {{^}}test_concat_v2f64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
   %concat = shufflevector <2 x double> %a, <2 x double> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x double> %concat, <4 x double> addrspace(1)* %out, align 32
   ret void
@@ -161,7 +161,7 @@ define void @test_concat_v2f64(<4 x double> addrspace(1)* %out, <2 x double> %a,
 ; FUNC-LABEL: {{^}}test_concat_v4f64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
   %concat = shufflevector <4 x double> %a, <4 x double> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x double> %concat, <8 x double> addrspace(1)* %out, align 64
   ret void
@@ -170,7 +170,7 @@ define void @test_concat_v4f64(<8 x double> addrspace(1)* %out, <4 x double> %a,
 ; FUNC-LABEL: {{^}}test_concat_v8f64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
   %concat = shufflevector <8 x double> %a, <8 x double> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x double> %concat, <16 x double> addrspace(1)* %out, align 128
   ret void
@@ -179,7 +179,7 @@ define void @test_concat_v8f64(<16 x double> addrspace(1)* %out, <8 x double> %a
 ; FUNC-LABEL: {{^}}test_concat_v16f64:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
   %concat = shufflevector <16 x double> %a, <16 x double> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x double> %concat, <32 x double> addrspace(1)* %out, align 256
   ret void
@@ -188,7 +188,7 @@ define void @test_concat_v16f64(<32 x double> addrspace(1)* %out, <16 x double>
 ; FUNC-LABEL: {{^}}test_concat_v1i1:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1> %b) nounwind {
   %concat = shufflevector <1 x i1> %a, <1 x i1> %b, <2 x i32> <i32 0, i32 1>
   store <2 x i1> %concat, <2 x i1> addrspace(1)* %out
   ret void
@@ -197,7 +197,7 @@ define void @test_concat_v1i1(<2 x i1> addrspace(1)* %out, <1 x i1> %a, <1 x i1>
 ; FUNC-LABEL: {{^}}test_concat_v2i1:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1> %b) nounwind {
   %concat = shufflevector <2 x i1> %a, <2 x i1> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x i1> %concat, <4 x i1> addrspace(1)* %out
   ret void
@@ -206,7 +206,7 @@ define void @test_concat_v2i1(<4 x i1> addrspace(1)* %out, <2 x i1> %a, <2 x i1>
 ; FUNC-LABEL: {{^}}test_concat_v4i1:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1> %b) nounwind {
   %concat = shufflevector <4 x i1> %a, <4 x i1> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x i1> %concat, <8 x i1> addrspace(1)* %out
   ret void
@@ -215,7 +215,7 @@ define void @test_concat_v4i1(<8 x i1> addrspace(1)* %out, <4 x i1> %a, <4 x i1>
 ; FUNC-LABEL: {{^}}test_concat_v8i1:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1> %b) nounwind {
   %concat = shufflevector <8 x i1> %a, <8 x i1> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x i1> %concat, <16 x i1> addrspace(1)* %out
   ret void
@@ -224,7 +224,7 @@ define void @test_concat_v8i1(<16 x i1> addrspace(1)* %out, <8 x i1> %a, <8 x i1
 ; FUNC-LABEL: {{^}}test_concat_v16i1:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x i1> %b) nounwind {
   %concat = shufflevector <16 x i1> %a, <16 x i1> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x i1> %concat, <32 x i1> addrspace(1)* %out
   ret void
@@ -233,7 +233,7 @@ define void @test_concat_v16i1(<32 x i1> addrspace(1)* %out, <16 x i1> %a, <16 x
 ; FUNC-LABEL: {{^}}test_concat_v32i1:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind {
+define amdgpu_kernel void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x i1> %b) nounwind {
   %concat = shufflevector <32 x i1> %a, <32 x i1> %b, <64 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63>
   store <64 x i1> %concat, <64 x i1> addrspace(1)* %out
   ret void
@@ -242,7 +242,7 @@ define void @test_concat_v32i1(<64 x i1> addrspace(1)* %out, <32 x i1> %a, <32 x
 ; FUNC-LABEL: {{^}}test_concat_v1i16:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x i16> %b) nounwind {
   %concat = shufflevector <1 x i16> %a, <1 x i16> %b, <2 x i32> <i32 0, i32 1>
   store <2 x i16> %concat, <2 x i16> addrspace(1)* %out, align 4
   ret void
@@ -251,7 +251,7 @@ define void @test_concat_v1i16(<2 x i16> addrspace(1)* %out, <1 x i16> %a, <1 x
 ; FUNC-LABEL: {{^}}test_concat_v2i16:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) nounwind {
   %concat = shufflevector <2 x i16> %a, <2 x i16> %b, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   store <4 x i16> %concat, <4 x i16> addrspace(1)* %out, align 8
   ret void
@@ -260,7 +260,7 @@ define void @test_concat_v2i16(<4 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x
 ; FUNC-LABEL: {{^}}test_concat_v4i16:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
   %concat = shufflevector <4 x i16> %a, <4 x i16> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   store <8 x i16> %concat, <8 x i16> addrspace(1)* %out, align 16
   ret void
@@ -269,7 +269,7 @@ define void @test_concat_v4i16(<8 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x
 ; FUNC-LABEL: {{^}}test_concat_v8i16:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
   %concat = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   store <16 x i16> %concat, <16 x i16> addrspace(1)* %out, align 32
   ret void
@@ -278,7 +278,7 @@ define void @test_concat_v8i16(<16 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x
 ; FUNC-LABEL: {{^}}test_concat_v16i16:
 ; SI-NOT: s_mov_b32 s{{[0-9]}}, 0x80f000
 ; SI-NOT: movrel
-define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind {
+define amdgpu_kernel void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <16 x i16> %b) nounwind {
   %concat = shufflevector <16 x i16> %a, <16 x i16> %b, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   store <32 x i16> %concat, <32 x i16> addrspace(1)* %out, align 64
   ret void
@@ -286,7 +286,7 @@ define void @test_concat_v16i16(<32 x i16> addrspace(1)* %out, <16 x i16> %a, <1
 
 ; FUNC-LABEL: {{^}}concat_vector_crash:
 ; SI: s_endpgm
-define void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @concat_vector_crash(<8 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) {
 bb:
   %tmp = load <2 x float>, <2 x float> addrspace(1)* %in, align 4
   %tmp1 = shufflevector <2 x float> %tmp, <2 x float> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
diff --git a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
index 34bb2588ad6224c274ea4242b55a26d41dd9bc69..62b47beb125186c2eeb9cfdc3df218f2c4579845 100644
--- a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
+++ b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir
@@ -1,12 +1,12 @@
 # RUN: llc -mtriple=amdgcn--amdhsa -mcpu=hawaii -verify-machineinstrs -run-pass si-fold-operands,dead-mi-elimination -o - %s | FileCheck -check-prefix=GCN %s
 --- |
-  define void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+  define amdgpu_kernel void @s_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
     %and = and i32 %a, 1234567
     store volatile i32 %and, i32 addrspace(1)* %out
     ret void
   }
 
-  define void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+  define amdgpu_kernel void @v_fold_and_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %idxprom = sext i32 %tid to i64
     %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
@@ -17,13 +17,13 @@
     ret void
   }
 
-  define void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+  define amdgpu_kernel void @s_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
     %shl = shl i32 %a, 12
     store volatile i32 %shl, i32 addrspace(1)* %out
     ret void
   }
 
-  define void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+  define amdgpu_kernel void @v_fold_shl_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %idxprom = sext i32 %tid to i64
     %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
@@ -34,13 +34,13 @@
     ret void
   }
 
-  define void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+  define amdgpu_kernel void @s_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
     %ashr = ashr i32 %a, 12
     store volatile i32 %ashr, i32 addrspace(1)* %out
     ret void
   }
 
-  define void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+  define amdgpu_kernel void @v_fold_ashr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %idxprom = sext i32 %tid to i64
     %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
@@ -51,13 +51,13 @@
     ret void
   }
 
-   define void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+   define amdgpu_kernel void @s_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
     %lshr = lshr i32 %a, 12
     store volatile i32 %lshr, i32 addrspace(1)* %out
     ret void
   }
 
-  define void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
+  define amdgpu_kernel void @v_fold_lshr_imm_regimm_32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %idxprom = sext i32 %tid to i64
     %gep.a = getelementptr i32, i32 addrspace(1)* %aptr, i64 %idxprom
diff --git a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
index 0ff75ab580034ff6462f4706f3a0b98f84aa8d3a..0831d250b9e7dd2c3f0d0a3fa7287ef4683da914 100644
--- a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
+++ b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll
@@ -5,7 +5,7 @@
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_v_and_0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_v_and_0(i32 addrspace(1)* %out) {
   %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %and = and i32 %size, %x
@@ -17,7 +17,7 @@ define void @fold_mi_v_and_0(i32 addrspace(1)* %out) {
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %and = and i32 %size, %x
   store i32 %and, i32 addrspace(1)* %out
@@ -28,7 +28,7 @@ define void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 {
 ; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]]
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
   %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %or = or i32 %size, %x
@@ -42,7 +42,7 @@ define void @fold_mi_v_or_0(i32 addrspace(1)* %out) {
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
 ; GCN-NOT: [[VVAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %or = or i32 %size, %x
   store i32 %or, i32 addrspace(1)* %out
@@ -53,7 +53,7 @@ define void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 {
 ; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]]
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
   %x = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %xor = xor i32 %size, %x
@@ -67,7 +67,7 @@ define void @fold_mi_v_xor_0(i32 addrspace(1)* %out) {
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
 ; GCN-NOT: [[VVAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %xor = xor i32 %size, %x
   store i32 %xor, i32 addrspace(1)* %out
@@ -78,7 +78,7 @@ define void @fold_mi_s_xor_0(i32 addrspace(1)* %out, i32 %x) #0 {
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], -1{{$}}
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
+define amdgpu_kernel void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %xor = xor i32 %size, -1
   store i32 %xor, i32 addrspace(1)* %out
@@ -91,7 +91,7 @@ define void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 {
 ; GCN-NEXT: v_not_b32_e32 v[[RESULT_LO]]
 ; GCN-NEXT: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], -1{{$}}
 ; GCN-NEXT: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-define void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
   %vreg = load volatile i64, i64 addrspace(1)* undef
   %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg)
   %xor = xor i64 %ctpop, -1
@@ -110,7 +110,7 @@ define void @fold_mi_v_not_0(i64 addrspace(1)* %out) {
 ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]]
 ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-define void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
   %vreg0 = load volatile i64, i64 addrspace(1)* undef
   %vreg1 = load volatile i64, i64 addrspace(1)* undef
   %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0)
@@ -126,7 +126,7 @@ define void @fold_mi_or_neg1(i64 addrspace(1)* %out) {
 ; GCN: v_not_b32
 ; GCN: v_and_b32
 ; GCN-NOT: v_and_b32
-define void @fold_mi_and_neg1(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @fold_mi_and_neg1(i64 addrspace(1)* %out) {
   %vreg0 = load volatile i64, i64 addrspace(1)* undef
   %vreg1 = load volatile i64, i64 addrspace(1)* undef
   %ctpop = call i64 @llvm.ctpop.i64(i64 %vreg0)
diff --git a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
index 13383cbc1741dbd545aa18cb4dbfd58d3402856f..d3e6c11ef908460fbda2c95901a776c8d9fefa02 100644
--- a/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
+++ b/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll
@@ -10,6 +10,8 @@
 
 
 ; GCN-LABEL: {{^}}divergent_if_endif:
+; VGPR: workitem_private_segment_byte_size = 12{{$}}
+
 
 ; GCN: {{^}}; BB#0:
 ; GCN: s_mov_b32 m0, -1
@@ -26,12 +28,13 @@
 
 
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:8 ; 4-byte Folded Spill
 
 ; Spill load
 ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill
+
 ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
 
 ; GCN: s_waitcnt vmcnt(0) expcnt(0)
@@ -55,11 +58,11 @@
 
 
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:8 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
 
@@ -69,7 +72,7 @@
 ; GCN: buffer_load_dword [[RELOAD_VAL:v[0-9]+]], off, s[0:3], s7 offset:[[VAL_OFFSET]] ; 4-byte Folded Reload
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RELOAD_VAL]]
-define void @divergent_if_endif(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @divergent_if_endif(i32 addrspace(1)* %out) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %load0 = load volatile i32, i32 addrspace(3)* undef
@@ -88,6 +91,8 @@ endif:
 }
 
 ; GCN-LABEL: {{^}}divergent_loop:
+; VGPR: workitem_private_segment_byte_size = 16{{$}}
+
 ; GCN: {{^}}; BB#0:
 
 ; GCN: s_mov_b32 m0, -1
@@ -100,7 +105,7 @@ endif:
 ; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
 
 ; Spill load
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 ; 4-byte Folded Spill
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
 
 ; Spill saved exec
 ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
@@ -108,9 +113,9 @@ endif:
 
 
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_LO:[0-9]+]], s[[SAVEEXEC_LO]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:16 ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_LO]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill
 ; VMEM: v_mov_b32_e32 v[[V_SAVEEXEC_HI:[0-9]+]], s[[SAVEEXEC_HI]]
-; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:20 ; 4-byte Folded Spill
+; VMEM: buffer_store_dword v[[V_SAVEEXEC_HI]], off, s[0:3], s7 offset:24 ; 4-byte Folded Spill
 
 ; GCN: s_mov_b64 exec, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}
 
@@ -120,7 +125,7 @@ endif:
 
 
 ; GCN: [[LOOP:BB[0-9]+_[0-9]+]]:
-; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[VAL_LOOP_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
 ; GCN: v_subrev_i32_e32 [[VAL_LOOP:v[0-9]+]], vcc, v{{[0-9]+}}, v[[VAL_LOOP_RELOAD]]
 ; GCN: v_cmp_ne_u32_e32 vcc,
 ; GCN: s_and_b64 vcc, exec, vcc
@@ -133,11 +138,11 @@ endif:
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]]
 ; VGPR: v_readlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:16 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_LO:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_LO:[0-9]+]], v[[V_RELOAD_SAVEEXEC_LO]]
 
-; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:20 ; 4-byte Folded Reload
+; VMEM: buffer_load_dword v[[V_RELOAD_SAVEEXEC_HI:[0-9]+]], off, s[0:3], s7 offset:24 ; 4-byte Folded Reload
 ; VMEM: s_waitcnt vmcnt(0)
 ; VMEM: v_readfirstlane_b32 s[[S_RELOAD_SAVEEXEC_HI:[0-9]+]], v[[V_RELOAD_SAVEEXEC_HI]]
 
@@ -145,7 +150,7 @@ endif:
 ; GCN: buffer_load_dword v[[VAL_END:[0-9]+]], off, s[0:3], s7 offset:[[VAL_SUB_OFFSET]] ; 4-byte Folded Reload
 
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[VAL_END]]
-define void @divergent_loop(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @divergent_loop(i32 addrspace(1)* %out) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %load0 = load volatile i32, i32 addrspace(3)* undef
@@ -180,7 +185,7 @@ end:
 ; GCN: s_xor_b64 s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}, s{{\[}}[[ANDEXEC_LO]]:[[ANDEXEC_HI]]{{\]}}, s{{\[}}[[SAVEEXEC_LO]]:[[SAVEEXEC_HI]]{{\]}}
 
 ; Spill load
-; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 ; 4-byte Folded Spill
+; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], s7 offset:4 ; 4-byte Folded Spill
 
 ; Spill saved exec
 ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]]
@@ -237,14 +242,14 @@ end:
 
 ; GCN: BB{{[0-9]+}}_2: ; %if
 ; GCN: ds_read_b32
-; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
 ; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Spill
 ; GCN: s_waitcnt vmcnt(0) expcnt(0)
 ; GCN-NEXT: s_branch [[ENDIF:BB[0-9]+_[0-9]+]]
 
 ; GCN: [[ELSE]]: ; %else
-; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 ; 4-byte Folded Reload
+; GCN: buffer_load_dword v[[LOAD0_RELOAD:[0-9]+]], off, s[0:3], s7 offset:4 ; 4-byte Folded Reload
 ; GCN: v_subrev_i32_e32 [[SUB:v[0-9]+]], vcc, v{{[0-9]+}}, v[[LOAD0_RELOAD]]
 ; GCN: buffer_store_dword [[ADD]], off, s[0:3], s7 offset:[[FLOW_RESULT_OFFSET:[0-9]+]] ; 4-byte Folded Spill
 ; GCN: s_waitcnt vmcnt(0) expcnt(0)
@@ -267,7 +272,7 @@ end:
 
 ; GCN: buffer_load_dword v[[RESULT:[0-9]+]], off, s[0:3], s7 offset:[[RESULT_OFFSET]] ; 4-byte Folded Reload
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RESULT]]
-define void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @divergent_if_else_endif(i32 addrspace(1)* %out) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %load0 = load volatile i32, i32 addrspace(3)* undef
diff --git a/test/CodeGen/AMDGPU/convergent-inlineasm.ll b/test/CodeGen/AMDGPU/convergent-inlineasm.ll
index 755f439c68635d2cf25c10b0dff9ccab9d105ed2..0074a41e44cf8e6c62d9679c9144a484207c25ac 100644
--- a/test/CodeGen/AMDGPU/convergent-inlineasm.ll
+++ b/test/CodeGen/AMDGPU/convergent-inlineasm.ll
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 ; GCN: v_cmp_ne_u32_e64
 ; GCN: ; mask branch
 ; GCN: BB{{[0-9]+_[0-9]+}}:
-define void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @convergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
 bb:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1) #1
@@ -29,7 +29,8 @@ bb5:                                              ; preds = %bb3, %bb
 ; GCN: v_cmp_ne_u32_e64
 
 ; GCN: BB{{[0-9]+_[0-9]+}}:
-define void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
+
+define amdgpu_kernel void @nonconvergent_inlineasm(i64 addrspace(1)* nocapture %arg) {
 bb:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 1)
diff --git a/test/CodeGen/AMDGPU/copy-illegal-type.ll b/test/CodeGen/AMDGPU/copy-illegal-type.ll
index 7434d745b259158a59ab0cce7894636dcfa4bbf5..026dd7ca6c870dc532be57b23ec15923ca200d97 100644
--- a/test/CodeGen/AMDGPU/copy-illegal-type.ll
+++ b/test/CodeGen/AMDGPU/copy-illegal-type.ll
@@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
 ; GCN: buffer_load_dword [[REG:v[0-9]+]]
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: s_endpgm
-define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
   ret void
@@ -19,7 +19,7 @@ define void @test_copy_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)*
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: s_endpgm
-define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
@@ -32,7 +32,7 @@ define void @test_copy_v4i8_x2(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: s_endpgm
-define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
@@ -47,7 +47,7 @@ define void @test_copy_v4i8_x3(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: buffer_store_dword [[REG]]
 ; GCN: s_endpgm
-define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %out3, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out1, align 4
@@ -65,7 +65,7 @@ define void @test_copy_v4i8_x4(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(
 ; GCN-DAG: buffer_store_dword
 
 ; GCN: s_endpgm
-define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   %add = add <4 x i8> %val, <i8 9, i8 9, i8 9, i8 9>
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out0, align 4
@@ -85,7 +85,7 @@ define void @test_copy_v4i8_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> add
 ; GCN: {{buffer|flat}}_store_dword
 ; GCN: {{buffer|flat}}_store_dword
 ; GCN: s_endpgm
-define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8> addrspace(1)* %out1, <4 x i8> addrspace(1)* %out2, <4 x i8> addrspace(1)* %in) nounwind {
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
@@ -101,7 +101,7 @@ define void @test_copy_v4i8_x2_extra_use(<4 x i8> addrspace(1)* %out0, <4 x i8>
 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
 ; GCN: s_endpgm
-define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
   %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
   store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 4
   ret void
@@ -113,7 +113,7 @@ define void @test_copy_v3i8_align4(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa
 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:2{{$}}
 ; GCN: s_endpgm
-define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
   %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 2
   store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 2
   ret void
@@ -128,7 +128,7 @@ define void @test_copy_v3i8_align2(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
 ; GCN: s_endpgm
-define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) nounwind {
   %val = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 1
   store <3 x i8> %val, <3 x i8> addrspace(1)* %out, align 1
   ret void
@@ -141,7 +141,7 @@ define void @test_copy_v3i8_align1(<3 x i8> addrspace(1)* %out, <3 x i8> addrspa
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
-define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load volatile <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
   ret void
@@ -157,7 +157,7 @@ define void @test_copy_v4i8_volatile_load(<4 x i8> addrspace(1)* %out, <4 x i8>
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
 ; GCN: s_endpgm
-define void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_copy_v4i8_volatile_store(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
   %val = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   store volatile <4 x i8> %val, <4 x i8> addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/copy-to-reg.ll b/test/CodeGen/AMDGPU/copy-to-reg.ll
index 3422a889a52002e35f0b60a87dee94ba96fdc883..f35b0706f3d3073fb840021cb2e05dbd5f4e67a3 100644
--- a/test/CodeGen/AMDGPU/copy-to-reg.ll
+++ b/test/CodeGen/AMDGPU/copy-to-reg.ll
@@ -6,7 +6,7 @@
 
 ; Make sure this doesn't crash
 ; CHECK-LABEL: {{^}}copy_to_reg_frameindex:
-define void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @copy_to_reg_frameindex(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 entry:
   %alloca = alloca [16 x i32]
   br label %loop
diff --git a/test/CodeGen/AMDGPU/ctlz.ll b/test/CodeGen/AMDGPU/ctlz.ll
index 1a0027dd4a3c47463bc4743c01b491833a2135b2..e252971e3f427e47acac077bfc7813d12a95e2c8 100644
--- a/test/CodeGen/AMDGPU/ctlz.ll
+++ b/test/CodeGen/AMDGPU/ctlz.ll
@@ -27,7 +27,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
 ; EG: FFBH_UINT
 ; EG: CNDE_INT
-define void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
   store i32 %ctlz, i32 addrspace(1)* %out, align 4
   ret void
@@ -43,7 +43,7 @@ define void @s_ctlz_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
 
 ; EG: FFBH_UINT
 ; EG: CNDE_INT
-define void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr, align 4
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
   store i32 %ctlz, i32 addrspace(1)* %out, align 4
@@ -61,7 +61,7 @@ define void @v_ctlz_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalia
 ; EG: CNDE_INT
 ; EG: FFBH_UINT
 ; EG: CNDE_INT
-define void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
   %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 false) nounwind readnone
   store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
@@ -89,7 +89,7 @@ define void @v_ctlz_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrsp
 
 ; EG-DAG: FFBH_UINT
 ; EG-DAG: CNDE_INT
-define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
   %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 false) nounwind readnone
   store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
@@ -98,10 +98,11 @@ define void @v_ctlz_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrsp
 
 ; FUNC-LABEL: {{^}}v_ctlz_i8:
 ; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
-; GCN-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; SI-DAG: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
+; VI-DAG: v_ffbh_u32_sdwa [[RESULT:v[0-9]+]], [[VAL]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
 ; GCN: buffer_store_byte [[RESULT]],
 ; GCN: s_endpgm
-define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
   %val = load i8, i8 addrspace(1)* %valptr
   %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 false) nounwind readnone
   store i8 %ctlz, i8 addrspace(1)* %out
@@ -119,14 +120,14 @@ define void @v_ctlz_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %
 ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
 ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
-define void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
   store i64 %ctlz, i64 addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}s_ctlz_i64_trunc:
-define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 false)
   %trunc = trunc i64 %ctlz to i32
   store i32 %trunc, i32 addrspace(1)* %out
@@ -145,7 +146,7 @@ define void @s_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind
 ; GCN-DAG: v_cmp_ne_u32_e32 vcc, 0, [[OR]]
 ; GCN-DAG: v_cndmask_b32_e32 v[[CLTZ_LO:[0-9]+]], 64, v[[CTLZ:[0-9]+]], vcc
 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CLTZ_LO]]:[[CTLZ_HI]]{{\]}}
-define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
@@ -156,7 +157,7 @@ define void @v_ctlz_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalia
 }
 
 ; FUNC-LABEL: {{^}}v_ctlz_i64_trunc:
-define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -172,7 +173,7 @@ define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)*
 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
- define void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ define amdgpu_kernel void @v_ctlz_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
   %cmp = icmp eq i32 %val, 0
@@ -186,7 +187,7 @@ define void @v_ctlz_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)*
 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
   %cmp = icmp ne i32 %val, 0
@@ -202,7 +203,7 @@ define void @v_ctlz_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspac
 ; GCN: v_cmp
 ; GCN: v_cndmask
 ; GCN: s_endpgm
-define void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
   %cmp = icmp eq i32 %ctlz, 32
@@ -217,7 +218,7 @@ define void @v_ctlz_i32_sel_eq_bitwidth(i32 addrspace(1)* noalias %out, i32 addr
 ; GCN: v_cmp
 ; GCN: v_cndmask
 ; GCN: s_endpgm
-define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 false) nounwind readnone
   %cmp = icmp ne i32 %ctlz, 32
@@ -230,7 +231,7 @@ define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addr
 ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
 ; GCN: {{buffer|flat}}_store_byte [[FFBH]],
- define void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+ define amdgpu_kernel void @v_ctlz_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
   %val = load i8, i8 addrspace(1)* %valptr.gep
@@ -245,7 +246,7 @@ define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addr
 ; SI: buffer_load_ushort [[VAL:v[0-9]+]],
 ; SI: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
 ; SI: buffer_store_short [[FFBH]],
- define void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
+ define amdgpu_kernel void @v_ctlz_i16_sel_eq_neg1(i16 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %valptr) nounwind {
   %val = load i16, i16 addrspace(1)* %valptr
   %ctlz = call i16 @llvm.ctlz.i16(i16 %val, i1 false) nounwind readnone
   %cmp = icmp eq i16 %val, 0
@@ -260,7 +261,7 @@ define void @v_ctlz_i32_sel_ne_bitwidth(i32 addrspace(1)* noalias %out, i32 addr
 ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
 ; GCN: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7f, [[FFBH]]
 ; GCN: {{buffer|flat}}_store_byte [[TRUNC]],
-define void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_i7_sel_eq_neg1(i7 addrspace(1)* noalias %out, i7 addrspace(1)* noalias %valptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %valptr.gep = getelementptr i7, i7 addrspace(1)* %valptr, i32 %tid
   %val = load i7, i7 addrspace(1)* %valptr.gep
diff --git a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
index d390f64deeab09f12222a568a398d9f204bf44e1..87ba563a740f8539a75af5322e86bbd095bad0c5 100644
--- a/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
+++ b/test/CodeGen/AMDGPU/ctlz_zero_undef.ll
@@ -22,7 +22,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 ; GCN: s_endpgm
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
-define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %ctlz, i32 addrspace(1)* %out, align 4
   ret void
@@ -35,7 +35,7 @@ define void @s_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nou
 ; GCN: s_endpgm
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
-define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr, align 4
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %ctlz, i32 addrspace(1)* %out, align 4
@@ -51,7 +51,7 @@ define void @v_ctlz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
-define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
   %ctlz = call <2 x i32> @llvm.ctlz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
   store <2 x i32> %ctlz, <2 x i32> addrspace(1)* %out, align 8
@@ -71,7 +71,7 @@ define void @v_ctlz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
 ; EG: FFBH_UINT {{\*? *}}[[RESULT]]
-define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
   %ctlz = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
   store <4 x i32> %ctlz, <4 x i32> addrspace(1)* %out, align 16
@@ -82,7 +82,7 @@ define void @v_ctlz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x
 ; GCN: buffer_load_ubyte [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_byte [[RESULT]],
-define void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
   %val = load i8, i8 addrspace(1)* %valptr
   %ctlz = call i8 @llvm.ctlz.i8(i8 %val, i1 true) nounwind readnone
   store i8 %ctlz, i8 addrspace(1)* %out
@@ -100,14 +100,14 @@ define void @v_ctlz_zero_undef_i8(i8 addrspace(1)* noalias %out, i8 addrspace(1)
 ; GCN-DAG: v_cndmask_b32_e32 v[[CTLZ:[0-9]+]], [[VFFBH_HI]], [[VFFBH_LO]]
 ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
 ; GCN: {{buffer|flat}}_store_dwordx2 v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
-define void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
   store i64 %ctlz, i64 addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}s_ctlz_zero_undef_i64_trunc:
-define void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctlz = call i64 @llvm.ctlz.i64(i64 %val, i1 true)
   %trunc = trunc i64 %ctlz to i32
   store i32 %trunc, i32 addrspace(1)* %out
@@ -123,7 +123,7 @@ define void @s_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 %va
 ; GCN-DAG: v_cndmask_b32_e64 v[[CTLZ:[0-9]+]], [[FFBH_HI]], [[FFBH_LO]]
 ; GCN-DAG: v_mov_b32_e32 v[[CTLZ_HI:[0-9]+]], 0{{$}}
 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[CTLZ]]:[[CTLZ_HI]]{{\]}}
-define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
@@ -134,7 +134,7 @@ define void @v_ctlz_zero_undef_i64(i64 addrspace(1)* noalias %out, i64 addrspace
 }
 
 ; FUNC-LABEL: {{^}}v_ctlz_zero_undef_i64_trunc:
-define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -149,7 +149,7 @@ define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 add
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[RESULT]],
- define void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp eq i32 %val, 0
@@ -162,7 +162,7 @@ define void @v_ctlz_zero_undef_i64_trunc(i32 addrspace(1)* noalias %out, i64 add
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_u32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[RESULT]],
-define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp ne i32 %val, 0
@@ -175,7 +175,7 @@ define void @v_ctlz_zero_undef_i32_sel_ne_neg1(i32 addrspace(1)* noalias %out, i
 ; GCN: {{buffer|flat}}_load_ubyte [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_u32_e32 [[FFBH:v[0-9]+]], [[VAL]]
 ; GCN: {{buffer|flat}}_store_byte [[FFBH]],
-define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %valptr) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %valptr.gep = getelementptr i8, i8 addrspace(1)* %valptr, i32 %tid
   %val = load i8, i8 addrspace(1)* %valptr.gep
@@ -194,7 +194,7 @@ define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8
 ; GCN-DAG: buffer_store_dword [[RESULT0]]
 ; GCN-DAG: buffer_store_byte [[RESULT1]]
 ; GCN: s_endpgm
- define void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_neg1_two_use(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp eq i32 %val, 0
@@ -211,7 +211,7 @@ define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8
 ; GCN: v_cmp
 ; GCN: v_cndmask
 ; GCN: buffer_store_dword
- define void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp eq i32 %val, 0
@@ -227,7 +227,7 @@ define void @v_ctlz_zero_undef_i8_sel_eq_neg1(i8 addrspace(1)* noalias %out, i8
 ; GCN: v_cmp
 ; GCN: v_cndmask
 ; GCN: buffer_store_dword
-define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp ne i32 %val, 0
@@ -243,7 +243,7 @@ define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32
 ; GCN: v_cmp
 ; GCN: v_cndmask
 ; GCN: buffer_store_dword
- define void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+ define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_eq_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp eq i32 %val, 1
@@ -259,7 +259,7 @@ define void @v_ctlz_zero_undef_i32_sel_ne_0(i32 addrspace(1)* noalias %out, i32
 ; GCN: v_cmp
 ; GCN: v_cndmask
 ; GCN: buffer_store_dword
-define void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_ctlz_zero_undef_i32_sel_ne_cmp_non0(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr
   %ctlz = call i32 @llvm.ctlz.i32(i32 %val, i1 true) nounwind readnone
   %cmp = icmp ne i32 %val, 1
diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll
index 9692236bb3639f3a5303c331744ca6754b625383..a29e72ea57cb37321875d52bb1efa1a30b3c3c60 100644
--- a/test/CodeGen/AMDGPU/ctpop.ll
+++ b/test/CodeGen/AMDGPU/ctpop.ll
@@ -16,7 +16,7 @@ declare <16 x i32> @llvm.ctpop.v16i32(<16 x i32>) nounwind readnone
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   store i32 %ctpop, i32 addrspace(1)* %out, align 4
   ret void
@@ -30,7 +30,7 @@ define void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   store i32 %ctpop, i32 addrspace(1)* %out, align 4
@@ -48,7 +48,7 @@ define void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noali
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
+define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1) nounwind {
   %val0 = load i32, i32 addrspace(1)* %in0, align 4
   %val1 = load i32, i32 addrspace(1)* %in1, align 4
   %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
@@ -64,7 +64,7 @@ define void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, i32 addrspace
 ; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}}
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
+define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind {
   %val0 = load i32, i32 addrspace(1)* %in0, align 4
   %ctpop0 = call i32 @llvm.ctpop.i32(i32 %val0) nounwind readnone
   %add = add i32 %ctpop0, %sval
@@ -79,7 +79,7 @@ define void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(
 
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) nounwind {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
   %ctpop = call <2 x i32> @llvm.ctpop.v2i32(<2 x i32> %val) nounwind readnone
   store <2 x i32> %ctpop, <2 x i32> addrspace(1)* %out, align 8
@@ -97,7 +97,7 @@ define void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrs
 ; EG: BCNT_INT
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %in) nounwind {
   %val = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
   %ctpop = call <4 x i32> @llvm.ctpop.v4i32(<4 x i32> %val) nounwind readnone
   store <4 x i32> %ctpop, <4 x i32> addrspace(1)* %out, align 16
@@ -123,7 +123,7 @@ define void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrs
 ; EG: BCNT_INT
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrspace(1)* noalias %in) nounwind {
   %val = load <8 x i32>, <8 x i32> addrspace(1)* %in, align 32
   %ctpop = call <8 x i32> @llvm.ctpop.v8i32(<8 x i32> %val) nounwind readnone
   store <8 x i32> %ctpop, <8 x i32> addrspace(1)* %out, align 32
@@ -165,7 +165,7 @@ define void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, <8 x i32> addrs
 ; EG: BCNT_INT
 ; EG: BCNT_INT
 ; EG: BCNT_INT
-define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> addrspace(1)* noalias %in) nounwind {
   %val = load <16 x i32>, <16 x i32> addrspace(1)* %in, align 32
   %ctpop = call <16 x i32> @llvm.ctpop.v16i32(<16 x i32> %val) nounwind readnone
   store <16 x i32> %ctpop, <16 x i32> addrspace(1)* %out, align 32
@@ -179,7 +179,7 @@ define void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, <16 x i32> ad
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %ctpop, 4
@@ -194,7 +194,7 @@ define void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noalias %out, i32
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 4, %ctpop
@@ -209,7 +209,7 @@ define void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* noalias %out,
 ; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %ctpop, 99999
@@ -225,7 +225,7 @@ define void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspa
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %ctpop, %const
@@ -241,7 +241,7 @@ define void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i32 addrspace(1
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 %const) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %add = add i32 %const, %ctpop
@@ -258,7 +258,7 @@ define void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %out, i32 addrspa
 ; GCN: s_endpgm
 
 ; EG: BCNT_INT
-define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
+define amdgpu_kernel void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in, i32 addrspace(1)* noalias %constptr) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ctpop = call i32 @llvm.ctpop.i32(i32 %val) nounwind readnone
   %gep = getelementptr i32, i32 addrspace(1)* %constptr, i32 4
@@ -279,7 +279,7 @@ define void @v_ctpop_i32_add_vvar_inv(i32 addrspace(1)* noalias %out, i32 addrsp
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
 ; EG: BCNT_INT
-define void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
+define amdgpu_kernel void @ctpop_i32_in_br(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %ctpop_arg, i32 %cond) {
 entry:
   %tmp0 = icmp eq i32 %cond, 0
   br i1 %tmp0, label %if, label %else
diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll
index cd5d805e5db38a64cdfa8aab2f339d451d6636de..2610684ad9ee7c1009ee777ac773793004ae4a57 100644
--- a/test/CodeGen/AMDGPU/ctpop64.ll
+++ b/test/CodeGen/AMDGPU/ctpop64.ll
@@ -17,7 +17,7 @@ declare i128 @llvm.ctpop.i128(i128) nounwind readnone
 ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; GCN: buffer_store_dword [[VRESULT]],
 ; GCN: s_endpgm
-define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %truncctpop = trunc i64 %ctpop to i32
   store i32 %truncctpop, i32 addrspace(1)* %out, align 4
@@ -31,7 +31,7 @@ define void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) nounwind {
 ; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %val = load i64, i64 addrspace(1)* %in, align 8
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %truncctpop = trunc i64 %ctpop to i32
@@ -48,7 +48,7 @@ define void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noali
 ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
 ; GCN: s_endpgm
-define void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind {
+define amdgpu_kernel void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i64 %s.val) nounwind {
   %val = load i64, i64 addrspace(1)* %in, align 8
   %ctpop = call i64 @llvm.ctpop.i64(i64 %val) nounwind readnone
   %or = or i64 %ctpop, %s.val
@@ -60,7 +60,7 @@ define void @v_ctpop_i64_user(i64 addrspace(1)* noalias %out, i64 addrspace(1)*
 ; GCN: s_bcnt1_i32_b64
 ; GCN: s_bcnt1_i32_b64
 ; GCN: s_endpgm
-define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
+define amdgpu_kernel void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val) nounwind {
   %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
   %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
   store <2 x i32> %truncctpop, <2 x i32> addrspace(1)* %out, align 8
@@ -73,7 +73,7 @@ define void @s_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> %val)
 ; GCN: s_bcnt1_i32_b64
 ; GCN: s_bcnt1_i32_b64
 ; GCN: s_endpgm
-define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
+define amdgpu_kernel void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val) nounwind {
   %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
   %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
   store <4 x i32> %truncctpop, <4 x i32> addrspace(1)* %out, align 16
@@ -86,7 +86,7 @@ define void @s_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> %val)
 ; GCN: v_bcnt_u32_b32
 ; GCN: v_bcnt_u32_b32
 ; GCN: s_endpgm
-define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in) nounwind {
   %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
   %ctpop = call <2 x i64> @llvm.ctpop.v2i64(<2 x i64> %val) nounwind readnone
   %truncctpop = trunc <2 x i64> %ctpop to <2 x i32>
@@ -104,7 +104,7 @@ define void @v_ctpop_v2i64(<2 x i32> addrspace(1)* noalias %out, <2 x i64> addrs
 ; GCN: v_bcnt_u32_b32
 ; GCN: v_bcnt_u32_b32
 ; GCN: s_endpgm
-define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrspace(1)* noalias %in) nounwind {
   %val = load <4 x i64>, <4 x i64> addrspace(1)* %in, align 32
   %ctpop = call <4 x i64> @llvm.ctpop.v4i64(<4 x i64> %val) nounwind readnone
   %truncctpop = trunc <4 x i64> %ctpop to <4 x i32>
@@ -121,7 +121,7 @@ define void @v_ctpop_v4i64(<4 x i32> addrspace(1)* noalias %out, <4 x i64> addrs
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[ZERO]]
 ; GCN: buffer_store_dwordx2 {{v\[}}[[VLO]]:[[VHI]]{{\]}}
 ; GCN: s_endpgm
-define void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
+define amdgpu_kernel void @ctpop_i64_in_br(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %ctpop_arg, i32 %cond) {
 entry:
   %tmp0 = icmp eq i32 %cond, 0
   br i1 %tmp0, label %if, label %else
@@ -146,7 +146,7 @@ endif:
 ; GCN: s_bcnt1_i32_b64 [[SRESULT1:s[0-9]+]],
 ; GCN: s_add_i32 s{{[0-9]+}}, [[SRESULT1]], [[SRESULT0]]
 ; GCN: s_endpgm
-define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind {
   %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
   %truncctpop = trunc i128 %ctpop to i32
   store i32 %truncctpop, i32 addrspace(1)* %out, align 4
@@ -159,7 +159,7 @@ define void @s_ctpop_i128(i32 addrspace(1)* noalias %out, i128 %val) nounwind {
 ; GCN: s_bcnt1_i32_b64 [[REG1:s[0-9]+]],
 ; GCN: s_add_i32 {{s[0-9]+}}, [[REG0]], [[REG1]]
 ; GCN: s_endpgm
-define void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind {
+define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind {
   %ctpop = call i65 @llvm.ctpop.i65(i65 %val) nounwind readnone
   %truncctpop = trunc i65 %ctpop to i32
   store i32 %truncctpop, i32 addrspace(1)* %out, align 4
@@ -181,7 +181,7 @@ define void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) nounwind {
 
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v_ctpop_i128(i32 addrspace(1)* noalias %out, i128 addrspace(1)* noalias %in) nounwind {
   %val = load i128, i128 addrspace(1)* %in, align 8
   %ctpop = call i128 @llvm.ctpop.i128(i128 %val) nounwind readnone
   %truncctpop = trunc i128 %ctpop to i32
diff --git a/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/test/CodeGen/AMDGPU/cttz_zero_undef.ll
index e33cc18eb05f0e1dc7bf5721d2029790b7d6cb51..1fa6407647eb8be0b8a1e692c44844e3f96e8255 100644
--- a/test/CodeGen/AMDGPU/cttz_zero_undef.ll
+++ b/test/CodeGen/AMDGPU/cttz_zero_undef.ll
@@ -14,7 +14,7 @@ declare <4 x i32> @llvm.cttz.v4i32(<4 x i32>, i1) nounwind readnone
 ; SI: s_endpgm
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
-define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
   %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %cttz, i32 addrspace(1)* %out, align 4
   ret void
@@ -27,7 +27,7 @@ define void @s_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 %val) nou
 ; SI: s_endpgm
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
-define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
   %val = load i32, i32 addrspace(1)* %valptr, align 4
   %cttz = call i32 @llvm.cttz.i32(i32 %val, i1 true) nounwind readnone
   store i32 %cttz, i32 addrspace(1)* %out, align 4
@@ -43,7 +43,7 @@ define void @v_cttz_zero_undef_i32(i32 addrspace(1)* noalias %out, i32 addrspace
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
-define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %valptr, align 8
   %cttz = call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %val, i1 true) nounwind readnone
   store <2 x i32> %cttz, <2 x i32> addrspace(1)* %out, align 8
@@ -63,7 +63,7 @@ define void @v_cttz_zero_undef_v2i32(<2 x i32> addrspace(1)* noalias %out, <2 x
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
 ; EG: FFBL_INT {{\*? *}}[[RESULT]]
-define void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
+define amdgpu_kernel void @v_cttz_zero_undef_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %valptr) nounwind {
   %val = load <4 x i32>, <4 x i32> addrspace(1)* %valptr, align 16
   %cttz = call <4 x i32> @llvm.cttz.v4i32(<4 x i32> %val, i1 true) nounwind readnone
   store <4 x i32> %cttz, <4 x i32> addrspace(1)* %out, align 16
diff --git a/test/CodeGen/AMDGPU/cube.ll b/test/CodeGen/AMDGPU/cube.ll
index 9b512c439b0e5cc52dcb2adebe0d4108c67514c1..7b5f1aff7ea61ad44158afe6b8917d68e0d7548d 100644
--- a/test/CodeGen/AMDGPU/cube.ll
+++ b/test/CodeGen/AMDGPU/cube.ll
@@ -6,16 +6,13 @@ declare float @llvm.amdgcn.cubesc(float, float, float) #0
 declare float @llvm.amdgcn.cubetc(float, float, float) #0
 declare float @llvm.amdgcn.cubema(float, float, float) #0
 
-declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0
-
-
 ; GCN-LABEL: {{^}}cube:
 ; GCN-DAG: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: _store_dwordx4
-define void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c) #1 {
   %cubeid = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
   %cubesc = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c)
   %cubetc = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c)
@@ -29,18 +26,5 @@ define void @cube(<4 x float> addrspace(1)* %out, float %a, float %b, float %c)
   ret void
 }
 
-; GCN-LABEL: {{^}}legacy_cube:
-; GCN-DAG: v_cubeid_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: v_cubesc_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: v_cubetc_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-; GCN-DAG: v_cubema_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-; GCN: _store_dwordx4
-define void @legacy_cube(<4 x float> addrspace(1)* %out, <4 x float> %abcx) #1 {
-  %cube = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %abcx)
-  store <4 x float> %cube, <4 x float> addrspace(1)* %out
-  ret void
-}
-
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
-
diff --git a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
index 7baaa81fba59f413ce7a7bfce57bdb4a3c88e5db..e16daa6fad9d09129b90913709c2839e6430b022 100644
--- a/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
+++ b/test/CodeGen/AMDGPU/cvt_f32_ubyte.ll
@@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
 ; GCN-NOT: lshr
 ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[LOADREG]]
 ; GCN: buffer_store_dword [[CONV]],
-define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
   %load = load i8, i8 addrspace(1)* %in, align 1
   %cvt = uitofp i8 %load to float
   store float %cvt, float addrspace(1)* %out, align 4
@@ -22,7 +22,7 @@ define void @load_i8_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* n
 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[LD]]
 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LD]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
-define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <2 x i8>, <2 x i8> addrspace(1)* %in, align 2
   %cvt = uitofp <2 x i8> %load to <2 x float>
   store <2 x float> %cvt, <2 x float> addrspace(1)* %out, align 16
@@ -36,7 +36,7 @@ define void @load_v2i8_to_v2f32(<2 x float> addrspace(1)* noalias %out, <2 x i8>
 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v[[HIRESULT:[0-9]+]], [[VAL]]
 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
-define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <3 x i8>, <3 x i8> addrspace(1)* %in, align 4
   %cvt = uitofp <3 x i8> %load to <3 x float>
   store <3 x float> %cvt, <3 x float> addrspace(1)* %out, align 16
@@ -52,7 +52,7 @@ define void @load_v3i8_to_v3f32(<3 x float> addrspace(1)* noalias %out, <3 x i8>
 ; GCN-DAG: v_cvt_f32_ubyte1_e32 v{{[0-9]+}}, [[LOADREG]]
 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[LORESULT:[0-9]+]], [[LOADREG]]
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LORESULT]]:[[HIRESULT]]{{\]}},
-define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   %cvt = uitofp <4 x i8> %load to <4 x float>
   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
@@ -76,7 +76,7 @@ define void @load_v4i8_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8>
 ; GCN-DAG: v_cvt_f32_ubyte0_e32 v[[HIRESULT:[0-9]+]]
 
 ; GCN: buffer_store_dwordx4
-define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <4 x i8> %load to <4 x float>
   store <4 x float> %cvt, <4 x float> addrspace(1)* %out, align 16
@@ -110,7 +110,7 @@ define void @load_v4i8_to_v4f32_unaligned(<4 x float> addrspace(1)* noalias %out
 ; GCN: {{buffer|flat}}_store_dword
 
 ; GCN: s_endpgm
-define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %out2, <4 x i8> addrspace(1)* noalias %in) nounwind {
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %in.ptr = getelementptr <4 x i8>, <4 x i8> addrspace(1)* %in, i32 %tid.x
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in.ptr, align 4
@@ -124,7 +124,7 @@ define void @load_v4i8_to_v4f32_2_uses(<4 x float> addrspace(1)* noalias %out, <
 ; Make sure this doesn't crash.
 ; GCN-LABEL: {{^}}load_v7i8_to_v7f32:
 ; GCN: s_endpgm
-define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <7 x i8>, <7 x i8> addrspace(1)* %in, align 1
   %cvt = uitofp <7 x i8> %load to <7 x float>
   store <7 x float> %cvt, <7 x float> addrspace(1)* %out, align 16
@@ -147,7 +147,7 @@ define void @load_v7i8_to_v7f32(<7 x float> addrspace(1)* noalias %out, <7 x i8>
 ; GCN-NOT: lshr
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <8 x i8>, <8 x i8> addrspace(1)* %in, align 8
   %cvt = uitofp <8 x i8> %load to <8 x float>
   store <8 x float> %cvt, <8 x float> addrspace(1)* %out, align 16
@@ -159,7 +159,7 @@ define void @load_v8i8_to_v8f32(<8 x float> addrspace(1)* noalias %out, <8 x i8>
 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, 2, [[LOADREG]]
 ; GCN-NEXT: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[ADD]]
 ; GCN: buffer_store_dword [[CONV]],
-define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %add = add i32 %load, 2
   %inreg = and i32 %add, 255
@@ -169,7 +169,7 @@ define void @i8_zext_inreg_i32_to_f32(float addrspace(1)* noalias %out, i32 addr
 }
 
 ; GCN-LABEL: {{^}}i8_zext_inreg_hi1_to_f32:
-define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %inreg = and i32 %load, 65280
   %shr = lshr i32 %inreg, 8
@@ -181,7 +181,7 @@ define void @i8_zext_inreg_hi1_to_f32(float addrspace(1)* noalias %out, i32 addr
 ; We don't get these ones because of the zext, but instcombine removes
 ; them so it shouldn't really matter.
 ; GCN-LABEL: {{^}}i8_zext_i32_to_f32:
-define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1)* noalias %in) nounwind {
   %load = load i8, i8 addrspace(1)* %in, align 1
   %ext = zext i8 %load to i32
   %cvt = uitofp i32 %ext to float
@@ -190,7 +190,7 @@ define void @i8_zext_i32_to_f32(float addrspace(1)* noalias %out, i8 addrspace(1
 }
 
 ; GCN-LABEL: {{^}}v4i8_zext_v4i32_to_v4f32:
-define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4 x i8> addrspace(1)* noalias %in) nounwind {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 1
   %ext = zext <4 x i8> %load to <4 x i32>
   %cvt = uitofp <4 x i32> %ext to <4 x float>
@@ -203,7 +203,7 @@ define void @v4i8_zext_v4i32_to_v4f32(<4 x float> addrspace(1)* noalias %out, <4
 ; GCN-NOT: [[VAL]]
 ; GCN: v_cvt_f32_ubyte0_e32 [[CONV:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[CONV]]
-define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in
   %and = and i32 %val, 255
   %cvt = uitofp i32 %and to float
@@ -216,7 +216,7 @@ define void @extract_byte0_to_f32(float addrspace(1)* noalias %out, i32 addrspac
 ; GCN-NOT: [[VAL]]
 ; GCN: v_cvt_f32_ubyte1_e32 [[CONV:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[CONV]]
-define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in
   %srl = lshr i32 %val, 8
   %and = and i32 %srl, 255
@@ -230,7 +230,7 @@ define void @extract_byte1_to_f32(float addrspace(1)* noalias %out, i32 addrspac
 ; GCN-NOT: [[VAL]]
 ; GCN: v_cvt_f32_ubyte2_e32 [[CONV:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[CONV]]
-define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in
   %srl = lshr i32 %val, 16
   %and = and i32 %srl, 255
@@ -244,7 +244,7 @@ define void @extract_byte2_to_f32(float addrspace(1)* noalias %out, i32 addrspac
 ; GCN-NOT: [[VAL]]
 ; GCN: v_cvt_f32_ubyte3_e32 [[CONV:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[CONV]]
-define void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @extract_byte3_to_f32(float addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in
   %srl = lshr i32 %val, 24
   %and = and i32 %srl, 255
diff --git a/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll b/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
index e7773c6e2a4f6e1cdbc76dca97be5bd1e4adced0..c10cf1a8a6f24ac195daad70cfd5915e40161ad4 100644
--- a/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
+++ b/test/CodeGen/AMDGPU/cvt_flr_i32_f32.ll
@@ -10,7 +10,7 @@ declare float @llvm.floor.f32(float) #1
 ; SI-NOT: add
 ; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; SI: s_endpgm
-define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
   %floor = call float @llvm.floor.f32(float %x) #1
   %cvt = fptosi float %floor to i32
   store i32 %cvt, i32 addrspace(1)* %out
@@ -22,7 +22,7 @@ define void @cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e32 v{{[0-9]+}}, [[TMP]]
 ; SI: s_endpgm
-define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 {
   %fadd = fadd float %x, 1.0
   %floor = call float @llvm.floor.f32(float %fadd) #1
   %cvt = fptosi float %floor to i32
@@ -35,7 +35,7 @@ define void @cvt_flr_i32_f32_1(i32 addrspace(1)* %out, float %x) #0 {
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|
 ; SI: s_endpgm
-define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %floor = call float @llvm.floor.f32(float %x.fabs) #1
   %cvt = fptosi float %floor to i32
@@ -48,7 +48,7 @@ define void @cvt_flr_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -s{{[0-9]+}}
 ; SI: s_endpgm
-define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
   %x.fneg = fsub float -0.000000e+00, %x
   %floor = call float @llvm.floor.f32(float %x.fneg) #1
   %cvt = fptosi float %floor to i32
@@ -61,7 +61,7 @@ define void @cvt_flr_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e64 v{{[0-9]+}}, -|s{{[0-9]+}}|
 ; SI: s_endpgm
-define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs
   %floor = call float @llvm.floor.f32(float %x.fabs.fneg) #1
@@ -75,7 +75,7 @@ define void @cvt_flr_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
 ; SI: v_floor_f32
 ; SI: v_cvt_u32_f32_e32
 ; SI: s_endpgm
-define void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @no_cvt_flr_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
   %floor = call float @llvm.floor.f32(float %x) #1
   %cvt = fptoui float %floor to i32
   store i32 %cvt, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll b/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
index d38411dcca6170715b34c7ac1a90ffbea7be6ae9..9b771ebdf7b3638a4726355e6018b2a7de3f8263 100644
--- a/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
+++ b/test/CodeGen/AMDGPU/cvt_rpi_i32_f32.ll
@@ -9,7 +9,7 @@ declare float @llvm.floor.f32(float) #1
 ; SI-SAFE-NOT: v_cvt_rpi_i32_f32
 ; SI-NONAN: v_cvt_rpi_i32_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}
 ; SI: s_endpgm
-define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 {
   %fadd = fadd float %x, 0.5
   %floor = call float @llvm.floor.f32(float %fadd) #1
   %cvt = fptosi float %floor to i32
@@ -21,7 +21,7 @@ define void @cvt_rpi_i32_f32(i32 addrspace(1)* %out, float %x) #0 {
 ; SI-SAFE-NOT: v_cvt_rpi_i32_f32
 ; SI-NONAN: v_cvt_rpi_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}}
 ; SI: s_endpgm
-define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %fadd = fadd float %x.fabs, 0.5
   %floor = call float @llvm.floor.f32(float %fadd) #1
@@ -37,7 +37,7 @@ define void @cvt_rpi_i32_f32_fabs(i32 addrspace(1)* %out, float %x) #0 {
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]]
 ; SI: s_endpgm
-define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
   %x.fneg = fsub float -0.000000e+00, %x
   %fadd = fadd float %x.fneg, 0.5
   %floor = call float @llvm.floor.f32(float %fadd) #1
@@ -55,7 +55,7 @@ define void @cvt_rpi_i32_f32_fneg(i32 addrspace(1)* %out, float %x) #0 {
 ; SI-SAFE-NOT: v_cvt_flr_i32_f32
 ; SI-NONAN: v_cvt_flr_i32_f32_e32 {{v[0-9]+}}, [[TMP]]
 ; SI: s_endpgm
-define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
   %x.fabs = call float @llvm.fabs.f32(float %x) #1
   %x.fabs.fneg = fsub float -0.000000e+00, %x.fabs
   %fadd = fadd float %x.fabs.fneg, 0.5
@@ -71,7 +71,7 @@ define void @cvt_rpi_i32_f32_fabs_fneg(i32 addrspace(1)* %out, float %x) #0 {
 ; SI: v_floor_f32
 ; SI: v_cvt_u32_f32
 ; SI: s_endpgm
-define void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @no_cvt_rpi_i32_f32_0(i32 addrspace(1)* %out, float %x) #0 {
   %fadd = fadd float %x, 0.5
   %floor = call float @llvm.floor.f32(float %fadd) #1
   %cvt = fptoui float %floor to i32
diff --git a/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll b/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
index a32c16dfac385ec7621ce39bd5e7fbca9e7efe0d..11acbc274eb5f965cc87e18ef464e6359b5a1d6c 100644
--- a/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
+++ b/test/CodeGen/AMDGPU/dagcombine-reassociate-bug.ll
@@ -9,7 +9,7 @@
 ; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]]
 ; CHECK: buffer_store_dword v{{[0-9]+}}, [[VADDR]], [[SADDR]]
 
-define void @store_same_base_ptr(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @store_same_base_ptr(i32 addrspace(1)* %out) {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #0
   %offset = sext i32 %id to i64
diff --git a/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll b/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll
index fb43ff4fbddde63b264b9a6ab09939dd43993f5c..ceff889b3a7e97c6f545f8f4b5589f5fc82c486e 100644
--- a/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll
+++ b/test/CodeGen/AMDGPU/dagcombiner-bug-illegal-vec4-int-to-fp.ll
@@ -10,7 +10,7 @@
 ; CHECK: {{^}}sint:
 ; CHECK: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %sint = load i32, i32 addrspace(1) * %in
@@ -24,7 +24,7 @@ entry:
 ;CHECK: {{^}}uint:
 ;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @uint(<4 x float> addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %uint = load i32, i32 addrspace(1) * %in
diff --git a/test/CodeGen/AMDGPU/debug.ll b/test/CodeGen/AMDGPU/debug.ll
index a2e0e878b7404d4b1d520fb9511bcea6107a0712..f149aaddb8eff99b0784ff4db5e88fffd681bcf6 100644
--- a/test/CodeGen/AMDGPU/debug.ll
+++ b/test/CodeGen/AMDGPU/debug.ll
@@ -4,7 +4,7 @@
 ; Test for a crash in the custom assembly dump code.
 
 ; SI: s_endpgm
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
   store i32 0, i32 addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/debugger-emit-prologue.ll b/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
index 49a7e722f29cbe8ecfa9f5d9e6ab0700699ada8a..734905ba2b0829c94bdeac2712242383af3682e5 100644
--- a/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
+++ b/test/CodeGen/AMDGPU/debugger-emit-prologue.ll
@@ -23,7 +23,7 @@
 ; NOATTR-NOT: DebuggerPrivateSegmentBufferSGPR
 
 ; Function Attrs: nounwind
-define void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
+define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
 entry:
   %A.addr = alloca i32 addrspace(1)*, align 4
   store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
diff --git a/test/CodeGen/AMDGPU/debugger-insert-nops.ll b/test/CodeGen/AMDGPU/debugger-insert-nops.ll
index 6638f4e25821f09e7332f3e07c615d12b0a5b20e..fcdbfb10a8ca4a57061fcfe0fc20efbdb366dc8a 100644
--- a/test/CodeGen/AMDGPU/debugger-insert-nops.ll
+++ b/test/CodeGen/AMDGPU/debugger-insert-nops.ll
@@ -1,27 +1,35 @@
-; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECK
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-insert-nops -verify-machineinstrs < %s | FileCheck %s --check-prefix=CHECKNOP
 
-; CHECK: test01.cl:2:{{[0-9]+}}
-; CHECK-NEXT: s_nop 0
+; This test expects that we have one instance for each line in some order with "s_nop 0" instances after each.
 
-; CHECK: test01.cl:3:{{[0-9]+}}
-; CHECK-NEXT: s_nop 0
+; Check that each line appears at least once
+; CHECK-DAG: test01.cl:2:3
+; CHECK-DAG: test01.cl:3:3
+; CHECK-DAG: test01.cl:4:3
 
-; CHECK: test01.cl:4:{{[0-9]+}}
-; CHECK-NEXT: s_nop 0
+
+; Check that each of the lines consists of the line output, followed by "s_nop 0"
+; CHECKNOP: test01.cl:{{[234]}}:3
+; CHECKNOP-NEXT: s_nop 0
+; CHECKNOP: test01.cl:{{[234]}}:3
+; CHECKNOP-NEXT: s_nop 0
+; CHECKNOP: test01.cl:{{[234]}}:3
+; CHECKNOP-NEXT: s_nop 0
 
 ; CHECK: test01.cl:5:{{[0-9]+}}
 ; CHECK-NEXT: s_nop 0
 ; CHECK-NEXT: s_endpgm
 
 ; Function Attrs: nounwind
-define void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
+define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
 entry:
   %A.addr = alloca i32 addrspace(1)*, align 4
   store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
   call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !17, metadata !18), !dbg !19
   %0 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !20
   %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i32 0, !dbg !20
-  store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !21
+  store i32 1, i32 addrspace(1)* %arrayidx, align 4, !dbg !20
   %1 = load i32 addrspace(1)*, i32 addrspace(1)** %A.addr, align 4, !dbg !22
   %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %1, i32 1, !dbg !22
   store i32 2, i32 addrspace(1)* %arrayidx1, align 4, !dbg !23
diff --git a/test/CodeGen/AMDGPU/debugger-reserve-regs.ll b/test/CodeGen/AMDGPU/debugger-reserve-regs.ll
index d30bb20bb03a97e5c435eb24a991ef8bdbe8aa34..764c60b12bf9fc4c3935bf827ea893cbb4829e8e 100644
--- a/test/CodeGen/AMDGPU/debugger-reserve-regs.ll
+++ b/test/CodeGen/AMDGPU/debugger-reserve-regs.ll
@@ -1,11 +1,12 @@
 ; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+amdgpu-debugger-reserve-regs -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=+amdgpu-debugger-reserve-regs -verify-machineinstrs < %s | FileCheck %s
 ; CHECK: reserved_vgpr_first = {{[0-9]+}}
 ; CHECK-NEXT: reserved_vgpr_count = 4
 ; CHECK: ReservedVGPRFirst: {{[0-9]+}}
 ; CHECK-NEXT: ReservedVGPRCount: 4
 
 ; Function Attrs: nounwind
-define void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
+define amdgpu_kernel void @test(i32 addrspace(1)* %A) #0 !dbg !12 {
 entry:
   %A.addr = alloca i32 addrspace(1)*, align 4
   store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
diff --git a/test/CodeGen/AMDGPU/default-fp-mode.ll b/test/CodeGen/AMDGPU/default-fp-mode.ll
index 0969fc1caffed278013f3c54afa87517b73127aa..ad9111a28654ed921bb74972fc555ab05249d4d4 100644
--- a/test/CodeGen/AMDGPU/default-fp-mode.ll
+++ b/test/CodeGen/AMDGPU/default-fp-mode.ll
@@ -3,7 +3,7 @@
 ; GCN-LABEL: {{^}}test_default_si:
 ; GCN: FloatMode: 192
 ; GCN: IeeeMode: 1
-define void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
+define amdgpu_kernel void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -12,7 +12,7 @@ define void @test_default_si(float addrspace(1)* %out0, double addrspace(1)* %ou
 ; GCN-LABEL: {{^}}test_default_vi:
 ; GCN: FloatMode: 192
 ; GCN: IeeeMode: 1
-define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
+define amdgpu_kernel void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -21,7 +21,7 @@ define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %ou
 ; GCN-LABEL: {{^}}test_f64_denormals:
 ; GCN: FloatMode: 192
 ; GCN: IeeeMode: 1
-define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
+define amdgpu_kernel void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -30,7 +30,7 @@ define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)*
 ; GCN-LABEL: {{^}}test_f32_denormals:
 ; GCNL: FloatMode: 48
 ; GCN: IeeeMode: 1
-define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
+define amdgpu_kernel void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -39,7 +39,7 @@ define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)*
 ; GCN-LABEL: {{^}}test_f32_f64_denormals:
 ; GCN: FloatMode: 240
 ; GCN: IeeeMode: 1
-define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
+define amdgpu_kernel void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -48,7 +48,7 @@ define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(
 ; GCN-LABEL: {{^}}test_no_denormals
 ; GCN: FloatMode: 0
 ; GCN: IeeeMode: 1
-define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
+define amdgpu_kernel void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -57,7 +57,7 @@ define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %
 ; GCN-LABEL: {{^}}test_f16_f64_denormals:
 ; GCN: FloatMode: 192
 ; GCN: IeeeMode: 1
-define void @test_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #6 {
+define amdgpu_kernel void @test_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #6 {
   store half 0.0, half addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -66,7 +66,7 @@ define void @test_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1
 ; GCN-LABEL: {{^}}test_no_f16_f64_denormals:
 ; GCN: FloatMode: 0
 ; GCN: IeeeMode: 1
-define void @test_no_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #7 {
+define amdgpu_kernel void @test_no_f16_f64_denormals(half addrspace(1)* %out0, double addrspace(1)* %out1) #7 {
   store half 0.0, half addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -75,7 +75,7 @@ define void @test_no_f16_f64_denormals(half addrspace(1)* %out0, double addrspac
 ; GCN-LABEL: {{^}}test_f32_f16_f64_denormals:
 ; GCN: FloatMode: 240
 ; GCN: IeeeMode: 1
-define void @test_f32_f16_f64_denormals(half addrspace(1)* %out0, float addrspace(1)* %out1, double addrspace(1)* %out2) #8 {
+define amdgpu_kernel void @test_f32_f16_f64_denormals(half addrspace(1)* %out0, float addrspace(1)* %out1, double addrspace(1)* %out2) #8 {
   store half 0.0, half addrspace(1)* %out0
   store float 0.0, float addrspace(1)* %out1
   store double 0.0, double addrspace(1)* %out2
@@ -97,18 +97,15 @@ main_body:
 
 ; GCN-LABEL: {{^}}kill_vcc_implicit_def:
 ; GCN: IeeeMode: 0
-define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) {
+define amdgpu_ps float @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) {
 entry:
   %tmp0 = fcmp olt float %13, 0.0
   call void @llvm.AMDGPU.kill(float %14)
   %tmp1 = select i1 %tmp0, float 1.0, float 0.0
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 1, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1)
-  ret void
+  ret float %tmp1
 }
 
-
 declare void @llvm.AMDGPU.kill(float)
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
 
 attributes #0 = { nounwind "target-cpu"="tahiti" }
 attributes #1 = { nounwind "target-cpu"="fiji" }
diff --git a/test/CodeGen/AMDGPU/detect-dead-lanes.mir b/test/CodeGen/AMDGPU/detect-dead-lanes.mir
index 9d70f67ef4913285bdee8a4ba60938d6c592a763..32e6f7cc0cdc71135f5952aadfbaad5a28c0d28b 100644
--- a/test/CodeGen/AMDGPU/detect-dead-lanes.mir
+++ b/test/CodeGen/AMDGPU/detect-dead-lanes.mir
@@ -1,14 +1,14 @@
 # RUN: llc -march=amdgcn -run-pass detect-dead-lanes -o - %s | FileCheck %s
 --- |
-  define void @test0() { ret void }
-  define void @test1() { ret void }
-  define void @test2() { ret void }
-  define void @test3() { ret void }
-  define void @test4() { ret void }
-  define void @test5() { ret void }
-  define void @loop0() { ret void }
-  define void @loop1() { ret void }
-  define void @loop2() { ret void }
+  define amdgpu_kernel void @test0() { ret void }
+  define amdgpu_kernel void @test1() { ret void }
+  define amdgpu_kernel void @test2() { ret void }
+  define amdgpu_kernel void @test3() { ret void }
+  define amdgpu_kernel void @test4() { ret void }
+  define amdgpu_kernel void @test5() { ret void }
+  define amdgpu_kernel void @loop0() { ret void }
+  define amdgpu_kernel void @loop1() { ret void }
+  define amdgpu_kernel void @loop2() { ret void }
 ...
 ---
 # Combined use/def transfer check, the basics.
diff --git a/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll b/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll
index cdd2c0cd4f43deaf0bebc980ca19ac1e2ae19602..6dfe1294bb4791d72a0a836491bcfbc01b3e095b 100644
--- a/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll
+++ b/test/CodeGen/AMDGPU/disconnected-predset-break-bug.ll
@@ -9,7 +9,7 @@
 ; CHECK: ALU_PUSH_BEFORE
 ; CHECK-NEXT: JUMP
 ; CHECK-NEXT: LOOP_BREAK
-define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind {
+define amdgpu_kernel void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) nounwind {
 entry:
   %cmp5 = icmp sgt i32 %iterations, 0
   br i1 %cmp5, label %for.body, label %for.end
diff --git a/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll b/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
index 5e1ebfde3e10aa17bcadf291751427ab3d422ba0..878b5ebe9409158dc6d43e9770364dc62a176e41 100644
--- a/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
+++ b/test/CodeGen/AMDGPU/drop-mem-operand-move-smrd.ll
@@ -9,7 +9,7 @@
 ; GCN: buffer_load_dword
 ; GCN: ds_write2_b32
 ; GCN: s_endpgm
-define void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 {
+define amdgpu_kernel void @reschedule_global_load_lds_store(i32 addrspace(1)* noalias %gptr0, i32 addrspace(1)* noalias %gptr1, i32 addrspace(3)* noalias %lptr, i32 %c) #0 {
 entry:
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx = shl i32 %tid, 2
diff --git a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
index f461d6978f135758b1a247409d75228ad935f1d5..5997e27fd815e462d7a47529690e925bbd398765 100644
--- a/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
+++ b/test/CodeGen/AMDGPU/ds-negative-offset-addressing-mode-loop.ll
@@ -23,7 +23,7 @@ declare void @llvm.amdgcn.s.barrier() #1
 ; CI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[VADDR]] offset0:32 offset1:34
 ; CI-DAG: ds_read_b32 v{{[0-9]+}}, [[VADDR]] offset:256
 ; CHECK: s_endpgm
-define void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 {
+define amdgpu_kernel void @signed_ds_offset_addressing_loop(float addrspace(1)* noalias nocapture %out, float addrspace(3)* noalias nocapture readonly %lptr, i32 %n) #2 {
 entry:
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %mul = shl nsw i32 %x.i, 1
diff --git a/test/CodeGen/AMDGPU/ds-sub-offset.ll b/test/CodeGen/AMDGPU/ds-sub-offset.ll
index 16fb019ae0f35ee0bac5084489f3392612762e14..d74bd5aa15ac1354ae818dfa840f94a0bd1334ab 100644
--- a/test/CodeGen/AMDGPU/ds-sub-offset.ll
+++ b/test/CodeGen/AMDGPU/ds-sub-offset.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 ; GCN: v_sub_i32_e32 [[BASEPTR:v[0-9]+]], vcc, 0, [[SHL]]
 ; GCN: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b
 ; GCN: ds_write_b32 [[BASEPTR]], [[VAL]] offset:12
-define void @write_ds_sub0_offset0_global() #0 {
+define amdgpu_kernel void @write_ds_sub0_offset0_global() #0 {
 entry:
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #1
   %sub1 = sub i32 0, %x.i
@@ -24,7 +24,7 @@ entry:
 ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]]
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13
 ; GCN: ds_write_b8 [[NEG]], [[K]] offset:65535
-define void @add_x_shl_neg_to_sub_max_offset() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset() #1 {
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
@@ -39,7 +39,7 @@ define void @add_x_shl_neg_to_sub_max_offset() #1 {
 ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x10000, [[SCALED]]
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 13
 ; GCN: ds_write_b8 [[NEG]], [[K]]{{$}}
-define void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
@@ -58,7 +58,7 @@ define void @add_x_shl_neg_to_sub_max_offset_p1() #1 {
 ; GCN-NOT: v_sub
 ; GCN: ds_write_b32 [[NEG]], [[K]] offset:456{{$}}
 ; GCN: s_endpgm
-define void @add_x_shl_neg_to_sub_multi_use() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use() #1 {
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
@@ -80,7 +80,7 @@ define void @add_x_shl_neg_to_sub_multi_use() #1 {
 ; GCN-NOT: v_sub
 ; GCN: ds_write_b32 [[NEG]], [[K]] offset:123{{$}}
 ; GCN: s_endpgm
-define void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
@@ -95,7 +95,7 @@ define void @add_x_shl_neg_to_sub_multi_use_same_offset() #1 {
 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0
 ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SCALED]]
 ; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset0:254 offset1:255
-define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
@@ -109,7 +109,7 @@ define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset() #1 {
 ; GCN-DAG: v_lshlrev_b32_e32 [[SCALED:v[0-9]+]], 2, v0
 ; GCN-DAG: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0x3fc, [[SCALED]]
 ; GCN: ds_write2_b32 [[NEG]], {{v[0-9]+}}, {{v[0-9]+}} offset1:1{{$}}
-define void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
+define amdgpu_kernel void @add_x_shl_neg_to_sub_misaligned_i64_max_offset_p1() #1 {
   %x.i = call i32 @llvm.amdgcn.workitem.id.x() #0
   %neg = sub i32 0, %x.i
   %shl = shl i32 %neg, 2
diff --git a/test/CodeGen/AMDGPU/ds_read2.ll b/test/CodeGen/AMDGPU/ds_read2.ll
index 9a313230e3035bda4c1717ccefd6775b8d06b988..2c474dbe7b086153f85def370f5489097230231d 100644
--- a/test/CodeGen/AMDGPU/ds_read2.ll
+++ b/test/CodeGen/AMDGPU/ds_read2.ll
@@ -12,7 +12,7 @@
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -31,7 +31,7 @@ define void @simple_read2_f32(float addrspace(1)* %out) #0 {
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -49,7 +49,7 @@ define void @simple_read2_f32_max_offset(float addrspace(1)* %out) #0 {
 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
 ; SI: s_endpgm
-define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -66,7 +66,7 @@ define void @simple_read2_f32_too_far(float addrspace(1)* %out) #0 {
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset1:8
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
 ; SI: s_endpgm
-define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 0
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
@@ -98,7 +98,7 @@ define void @simple_read2_f32_x2(float addrspace(1)* %out) #0 {
 ; SI: s_barrier
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
 ; SI: s_endpgm
-define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 0
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
@@ -133,7 +133,7 @@ define void @simple_read2_f32_x2_barrier(float addrspace(1)* %out) #0 {
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR:v[0-9]+]] offset0:2 offset1:8
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASEADDR]] offset0:11 offset1:27
 ; SI: s_endpgm
-define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %idx.0
@@ -170,7 +170,7 @@ define void @simple_read2_f32_x2_nonzero_base(float addrspace(1)* %out) #0 {
 ; SI: ds_read_b32
 ; SI: ds_read_b32
 ; SI: s_endpgm
-define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
+define amdgpu_kernel void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
   %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
@@ -196,7 +196,7 @@ define void @read2_ptr_is_subreg_arg_f32(float addrspace(1)* %out, <2 x float ad
 ; SI: ds_read_b32
 ; SI: ds_read_b32
 ; SI: s_endpgm
-define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
+define amdgpu_kernel void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x float addrspace(3)*> %lds.ptr) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %index.0 = insertelement <2 x i32> undef, i32 %x.i, i32 0
   %index.1 = insertelement <2 x i32> %index.0, i32 8, i32 0
@@ -219,7 +219,7 @@ define void @read2_ptr_is_subreg_arg_offset_f32(float addrspace(1)* %out, <2 x f
 ; SI-LABEL: {{^}}read2_ptr_is_subreg_f32:
 ; SI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:8{{$}}
 ; SI: s_endpgm
-define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %ptr.0 = insertelement <2 x [512 x float] addrspace(3)*> undef, [512 x float] addrspace(3)* @lds, i32 0
   %ptr.1 = insertelement <2 x [512 x float] addrspace(3)*> %ptr.0, [512 x float] addrspace(3)* @lds, i32 1
@@ -243,7 +243,7 @@ define void @read2_ptr_is_subreg_f32(float addrspace(1)* %out) #0 {
 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
 ; SI: s_endpgm
-define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load volatile float, float addrspace(3)* %arrayidx0, align 4
@@ -261,7 +261,7 @@ define void @simple_read2_f32_volatile_0(float addrspace(1)* %out) #0 {
 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:32
 ; SI: s_endpgm
-define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -280,7 +280,7 @@ define void @simple_read2_f32_volatile_1(float addrspace(1)* %out) #0 {
 ; SI-LABEL: @unaligned_read2_f32
 ; SI-NOT: ds_read2_b32
 ; SI: s_endpgm
-define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 1
@@ -296,7 +296,7 @@ define void @unaligned_read2_f32(float addrspace(1)* %out, float addrspace(3)* %
 ; SI-LABEL: @misaligned_2_simple_read2_f32
 ; SI-NOT: ds_read2_b32
 ; SI: s_endpgm
-define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 2
@@ -315,7 +315,7 @@ define void @misaligned_2_simple_read2_f32(float addrspace(1)* %out, float addrs
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2_f64(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f64(double addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -331,7 +331,7 @@ define void @simple_read2_f64(double addrspace(1)* %out) #0 {
 ; SI-LABEL: @simple_read2_f64_max_offset
 ; SI: ds_read2_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:255
 ; SI: s_endpgm
-define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -349,7 +349,7 @@ define void @simple_read2_f64_max_offset(double addrspace(1)* %out) #0 {
 ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
 ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset:2056
 ; SI: s_endpgm
-define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -367,7 +367,7 @@ define void @simple_read2_f64_too_far(double addrspace(1)* %out) #0 {
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:14 offset1:15
 ; SI: s_endpgm
-define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 4
@@ -385,7 +385,7 @@ define void @misaligned_read2_f64(double addrspace(1)* %out, double addrspace(3)
 ; SI-LABEL: @load_constant_adjacent_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
-define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
   %sum = add i32 %val0, %val1
@@ -396,7 +396,7 @@ define void @load_constant_adjacent_offsets(i32 addrspace(1)* %out) {
 ; SI-LABEL: @load_constant_disjoint_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:2
-define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
   %val0 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   %val1 = load i32, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
   %sum = add i32 %val0, %val1
@@ -410,7 +410,7 @@ define void @load_constant_disjoint_offsets(i32 addrspace(1)* %out) {
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset1:1
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[ZERO]] offset0:2 offset1:3
-define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
   %sum = add i64 %val0, %val1
@@ -426,7 +426,7 @@ define void @load_misaligned64_constant_offsets(i64 addrspace(1)* %out) {
 ; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE0]] offset1:1
 ; SI-DAG: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, [[BASE1]] offset1:1
 ; SI: s_endpgm
-define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
   %val0 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
   %val1 = load i64, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
   %sum = add i64 %val0, %val1
@@ -437,7 +437,7 @@ define void @load_misaligned64_constant_large_offsets(i64 addrspace(1)* %out) {
 @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
 
-define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
+define amdgpu_kernel void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
   %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
   %arrayidx44 = getelementptr inbounds [264 x float], [264 x float] addrspace(3)* @sgemm.lA, i32 0, i32 %x.i
@@ -481,13 +481,13 @@ define void @sgemm_inner_loop_read2_sequence(float addrspace(1)* %C, i32 %lda, i
   ret void
 }
 
-define void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @misaligned_read2_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(3)* %in) #0 {
   %load = load <2 x i32>, <2 x i32> addrspace(3)* %in, align 4
   store <2 x i32> %load, <2 x i32> addrspace(1)* %out, align 8
   ret void
 }
 
-define void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @misaligned_read2_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %in) #0 {
   %load = load i64, i64 addrspace(3)* %in, align 4
   store i64 %load, i64 addrspace(1)* %out, align 8
   ret void
diff --git a/test/CodeGen/AMDGPU/ds_read2_offset_order.ll b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
index 4a3f3fb99700d6f6edc14ca32011d47d6dbf149b..9668743cf128f23e89d81769923e5a75ba512c88 100644
--- a/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
+++ b/test/CodeGen/AMDGPU/ds_read2_offset_order.ll
@@ -10,7 +10,7 @@
 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:2 offset1:3
 ; SI-DAG: ds_read_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:56
 ; SI-DAG: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} offset0:11 offset1:12
-define void @offset_order(float addrspace(1)* %out) {
+define amdgpu_kernel void @offset_order(float addrspace(1)* %out) {
 entry:
   %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 0
   %val0 = load float, float addrspace(3)* %ptr0
diff --git a/test/CodeGen/AMDGPU/ds_read2_superreg.ll b/test/CodeGen/AMDGPU/ds_read2_superreg.ll
index 9d8375d6403710c2b095f8af6792f3b27f0a1f51..fc85ec06f58df6e9f68baebc38a865e4bb752d32 100644
--- a/test/CodeGen/AMDGPU/ds_read2_superreg.ll
+++ b/test/CodeGen/AMDGPU/ds_read2_superreg.ll
@@ -12,7 +12,7 @@
 ; CI: s_waitcnt lgkmcnt(0)
 ; CI: buffer_store_dwordx2 [[RESULT]]
 ; CI: s_endpgm
-define void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds  [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i
   %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0, align 4
@@ -26,7 +26,7 @@ define void @simple_read2_v2f32_superreg_align4(<2 x float> addrspace(1)* %out)
 ; CI: s_waitcnt lgkmcnt(0)
 ; CI: buffer_store_dwordx2 [[RESULT]]
 ; CI: s_endpgm
-define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <2 x float>], [512 x <2 x float>] addrspace(3)* @lds.v2, i32 0, i32 %x.i
   %val0 = load <2 x float>, <2 x float> addrspace(3)* %arrayidx0
@@ -43,7 +43,7 @@ define void @simple_read2_v2f32_superreg(<2 x float> addrspace(1)* %out) #0 {
 ; CI: v_add_f32_e32 v[[ADD2:[0-9]+]], v[[ADD1]], v[[ADD0]]
 ; CI: buffer_store_dword v[[ADD2]]
 ; CI: s_endpgm
-define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
   %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 4
@@ -68,7 +68,7 @@ define void @simple_read2_v4f32_superreg_align4(float addrspace(1)* %out) #0 {
 ; CI-DAG: v_add_f32_e32 v[[ADD1:[0-9]+]], v[[REG_Y]], v[[ADD0]]
 ; CI: buffer_store_dword v[[ADD1]]
 ; CI: s_endpgm
-define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <3 x float>], [512 x <3 x float>] addrspace(3)* @lds.v3, i32 0, i32 %x.i
   %val0 = load <3 x float>, <3 x float> addrspace(3)* %arrayidx0, align 4
@@ -88,7 +88,7 @@ define void @simple_read2_v3f32_superreg_align4(float addrspace(1)* %out) #0 {
 ; CI: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
 ; CI: buffer_store_dwordx4 [[REG_ZW]]
 ; CI: s_endpgm
-define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
   %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0, align 8
@@ -101,7 +101,7 @@ define void @simple_read2_v4f32_superreg_align8(<4 x float> addrspace(1)* %out)
 ; CI-DAG: ds_read2_b64 [[REG_ZW:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}} offset1:1{{$}}
 ; CI: buffer_store_dwordx4 [[REG_ZW]]
 ; CI: s_endpgm
-define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <4 x float>], [512 x <4 x float>] addrspace(3)* @lds.v4, i32 0, i32 %x.i
   %val0 = load <4 x float>, <4 x float> addrspace(3)* %arrayidx0
@@ -117,7 +117,7 @@ define void @simple_read2_v4f32_superreg(<4 x float> addrspace(1)* %out) #0 {
 ; CI-DAG: buffer_store_dwordx4 [[VEC_HI]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16
 ; CI-DAG: buffer_store_dwordx4 [[VEC_LO]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64{{$}}
 ; CI: s_endpgm
-define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <8 x float>], [512 x <8 x float>] addrspace(3)* @lds.v8, i32 0, i32 %x.i
   %val0 = load <8 x float>, <8 x float> addrspace(3)* %arrayidx0
@@ -138,7 +138,7 @@ define void @simple_read2_v8f32_superreg(<8 x float> addrspace(1)* %out) #0 {
 ; CI-DAG: buffer_store_dwordx4 [[VEC8_11]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:32
 ; CI-DAG: buffer_store_dwordx4 [[VEC12_15]], v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:48
 ; CI: s_endpgm
-define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x <16 x float>], [512 x <16 x float>] addrspace(3)* @lds.v16, i32 0, i32 %x.i
   %val0 = load <16 x float>, <16 x float> addrspace(3)* %arrayidx0
@@ -153,7 +153,7 @@ define void @simple_read2_v16f32_superreg(<16 x float> addrspace(1)* %out) #0 {
 ; CI-NOT: v_mov
 ; CI: buffer_store_dwordx2 v{{\[}}[[REG_ELT0]]:[[REG_ELT1]]{{\]}}
 ; CI: s_endpgm
-define void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1
@@ -176,7 +176,7 @@ define void @simple_read2_v2f32_superreg_scalar_loads_align4(<2 x float> addrspa
 ; CI-NOT: v_mov
 ; CI: buffer_store_dwordx4 v{{\[}}[[REG_ELT0]]:[[REG_ELT3]]{{\]}}
 ; CI: s_endpgm
-define void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2_v4f32_superreg_scalar_loads_align4(<4 x float> addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %arrayidx1 = getelementptr inbounds float, float addrspace(3)* %arrayidx0, i32 1
diff --git a/test/CodeGen/AMDGPU/ds_read2st64.ll b/test/CodeGen/AMDGPU/ds_read2st64.ll
index 99f01b4f26220dfdf29ac5422acb05e3c2d7e872..81b35a46aa1889906692ff03410af7ad9f221d9f 100644
--- a/test/CodeGen/AMDGPU/ds_read2st64.ll
+++ b/test/CodeGen/AMDGPU/ds_read2st64.ll
@@ -10,7 +10,7 @@
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -29,7 +29,7 @@ define void @simple_read2st64_f32_0_1(float addrspace(1)* %out) #0 {
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
@@ -49,7 +49,7 @@ define void @simple_read2st64_f32_1_2(float addrspace(1)* %out, float addrspace(
 ; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], v[[HI_VREG]], v[[LO_VREG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
@@ -69,7 +69,7 @@ define void @simple_read2st64_f32_max_offset(float addrspace(1)* %out, float add
 ; SI-DAG: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:256
 ; SI-DAG: ds_read_b32 {{v[0-9]+}}, [[BIGADD]]{{$}}
 ; SI: s_endpgm
-define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds float, float addrspace(3)* %lds, i32 %add.x.0
@@ -86,7 +86,7 @@ define void @simple_read2st64_f32_over_max_offset(float addrspace(1)* %out, floa
 ; SI-LABEL: @odd_invalid_read2st64_f32_0
 ; SI-NOT: ds_read2st64_b32
 ; SI: s_endpgm
-define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %x.i
   %val0 = load float, float addrspace(3)* %arrayidx0, align 4
@@ -102,7 +102,7 @@ define void @odd_invalid_read2st64_f32_0(float addrspace(1)* %out) #0 {
 ; SI-LABEL: @odd_invalid_read2st64_f32_1
 ; SI-NOT: ds_read2st64_b32
 ; SI: s_endpgm
-define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 %add.x.0
@@ -122,7 +122,7 @@ define void @odd_invalid_read2st64_f32_1(float addrspace(1)* %out) #0 {
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds [512 x double], [512 x double] addrspace(3)* @lds.f64, i32 0, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
@@ -141,7 +141,7 @@ define void @simple_read2st64_f64_0_1(double addrspace(1)* %out) #0 {
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
@@ -161,7 +161,7 @@ define void @simple_read2st64_f64_1_2(double addrspace(1)* %out, double addrspac
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset1:1
 ; SI: ds_read2_b32 v{{\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:128 offset1:129
 ; SI: s_endpgm
-define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 4
@@ -181,7 +181,7 @@ define void @misaligned_read2st64_f64(double addrspace(1)* %out, double addrspac
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[LO_VREG]]:{{[0-9]+\]}}, v{{\[[0-9]+}}:[[HI_VREG]]{{\]}}
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
-define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 256
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
@@ -197,11 +197,11 @@ define void @simple_read2st64_f64_max_offset(double addrspace(1)* %out, double a
 
 ; SI-LABEL: @simple_read2st64_f64_over_max_offset
 ; SI-NOT: ds_read2st64_b64
-; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
-; SI: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
+; SI-DAG: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset:512
+; SI-DAG: v_add_i32_e32 [[BIGADD:v[0-9]+]], vcc, 0x10000, {{v[0-9]+}}
 ; SI: ds_read_b64 {{v\[[0-9]+:[0-9]+\]}}, [[BIGADD]]
 ; SI: s_endpgm
-define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
@@ -218,7 +218,7 @@ define void @simple_read2st64_f64_over_max_offset(double addrspace(1)* %out, dou
 ; SI-LABEL: @invalid_read2st64_f64_odd_offset
 ; SI-NOT: ds_read2st64_b64
 ; SI: s_endpgm
-define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %add.x.0 = add nsw i32 %x.i, 64
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %add.x.0
@@ -239,7 +239,7 @@ define void @invalid_read2st64_f64_odd_offset(double addrspace(1)* %out, double
 ; SI-NOT: ds_read2st_b64
 ; SI: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:8
 ; SI: s_endpgm
-define void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @byte_size_only_divisible_64_read2_f64(double addrspace(1)* %out, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx0 = getelementptr inbounds double, double addrspace(3)* %lds, i32 %x.i
   %val0 = load double, double addrspace(3)* %arrayidx0, align 8
diff --git a/test/CodeGen/AMDGPU/ds_write2.ll b/test/CodeGen/AMDGPU/ds_write2.ll
index ae230dac9378bbbe5387aec18c4bf03f9c70bfce..ab1cf0ba25b5fdc12688e19000a7f142983269bc 100644
--- a/test/CodeGen/AMDGPU/ds_write2.ll
+++ b/test/CodeGen/AMDGPU/ds_write2.ll
@@ -9,7 +9,7 @@
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
   %val = load float, float addrspace(1)* %in.gep, align 4
@@ -27,7 +27,7 @@ define void @simple_write2_one_val_f32(float addrspace(1)* %C, float addrspace(1
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -46,7 +46,7 @@ define void @simple_write2_two_val_f32(float addrspace(1)* %C, float addrspace(1
 ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
 ; SI: s_endpgm
-define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
   %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
@@ -65,7 +65,7 @@ define void @simple_write2_two_val_f32_volatile_0(float addrspace(1)* %C, float
 ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:32
 ; SI: s_endpgm
-define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
   %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
@@ -86,7 +86,7 @@ define void @simple_write2_two_val_f32_volatile_1(float addrspace(1)* %C, float
 ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in.gep.0, i32 1
@@ -107,7 +107,7 @@ define void @simple_write2_two_val_subreg2_mixed_f32(float addrspace(1)* %C, <2
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i
   %val = load <2 x float>, <2 x float> addrspace(1)* %in.gep, align 8
@@ -126,7 +126,7 @@ define void @simple_write2_two_val_subreg2_f32(float addrspace(1)* %C, <2 x floa
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 %x.i
   %val = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 16
@@ -146,7 +146,7 @@ define void @simple_write2_two_val_subreg4_f32(float addrspace(1)* %C, <4 x floa
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
 ; SI: s_endpgm
-define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -164,7 +164,7 @@ define void @simple_write2_two_val_max_offset_f32(float addrspace(1)* %C, float
 ; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:1028
 ; SI: s_endpgm
-define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
   %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
@@ -182,7 +182,7 @@ define void @simple_write2_two_val_too_far_f32(float addrspace(1)* %C, float add
 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset1:8
 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
 ; SI: s_endpgm
-define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
   %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
@@ -212,7 +212,7 @@ define void @simple_write2_two_val_f32_x2(float addrspace(1)* %C, float addrspac
 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0:v[0-9]+]], [[VAL1:v[0-9]+]] offset0:3 offset1:8
 ; SI: ds_write2_b32 [[BASEADDR:v[0-9]+]], [[VAL0]], [[VAL1]] offset0:11 offset1:27
 ; SI: s_endpgm
-define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %tid.x
   %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %tid.x
@@ -243,7 +243,7 @@ define void @simple_write2_two_val_f32_x2_nonzero_base(float addrspace(1)* %C, f
 ; SI: ds_write_b32
 ; SI: ds_write_b32
 ; SI: s_endpgm
-define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
+define amdgpu_kernel void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float addrspace(1)* %in0, float addrspace(1)* %in1, <2 x float addrspace(3)*> %lds.ptr) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in0.gep = getelementptr float, float addrspace(1)* %in0, i32 %x.i
   %in1.gep = getelementptr float, float addrspace(1)* %in1, i32 %x.i
@@ -270,7 +270,7 @@ define void @write2_ptr_subreg_arg_two_val_f32(float addrspace(1)* %C, float add
 ; SI: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
 ; SI: ds_write2_b64 [[VPTR]], [[VAL]], [[VAL]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
   %val = load double, double addrspace(1)* %in.gep, align 8
@@ -288,7 +288,7 @@ define void @simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace
 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset1:1
 ; SI: ds_write2_b32 [[VPTR]], v[[VAL0]], v[[VAL1]] offset0:14 offset1:15
 ; SI: s_endpgm
-define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
   %val = load double, double addrspace(1)* %in.gep, align 8
@@ -306,7 +306,7 @@ define void @misaligned_simple_write2_one_val_f64(double addrspace(1)* %C, doubl
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 3, v{{[0-9]+}}
 ; SI: ds_write2_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset1:8
 ; SI: s_endpgm
-define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
@@ -325,7 +325,7 @@ define void @simple_write2_two_val_f64(double addrspace(1)* %C, double addrspace
 ; SI-LABEL: @store_constant_adjacent_offsets
 ; SI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; SI: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
-define void @store_constant_adjacent_offsets() {
+define amdgpu_kernel void @store_constant_adjacent_offsets() {
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 1), align 4
   ret void
@@ -335,7 +335,7 @@ define void @store_constant_adjacent_offsets() {
 ; SI-DAG: v_mov_b32_e32 [[VAL:v[0-9]+]], 0x7b{{$}}
 ; SI-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; SI: ds_write2_b32 [[ZERO]], [[VAL]], [[VAL]] offset1:2
-define void @store_constant_disjoint_offsets() {
+define amdgpu_kernel void @store_constant_disjoint_offsets() {
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 0), align 4
   store i32 123, i32 addrspace(3)* getelementptr inbounds ([4 x i32], [4 x i32] addrspace(3)* @foo, i32 0, i32 2), align 4
   ret void
@@ -348,7 +348,7 @@ define void @store_constant_disjoint_offsets() {
 ; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; SI-DAG: ds_write2_b32 [[ZERO]], v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
 ; SI: s_endpgm
-define void @store_misaligned64_constant_offsets() {
+define amdgpu_kernel void @store_misaligned64_constant_offsets() {
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 0), align 4
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4 x i64], [4 x i64] addrspace(3)* @bar, i32 0, i32 1), align 4
   ret void
@@ -362,7 +362,7 @@ define void @store_misaligned64_constant_offsets() {
 ; SI-DAG: ds_write2_b32 [[BASE0]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; SI-DAG: ds_write2_b32 [[BASE1]], v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; SI: s_endpgm
-define void @store_misaligned64_constant_large_offsets() {
+define amdgpu_kernel void @store_misaligned64_constant_large_offsets() {
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 2048), align 4
   store i64 123, i64 addrspace(3)* getelementptr inbounds ([4096 x i64], [4096 x i64] addrspace(3)* @bar.large, i32 0, i32 4095), align 4
   ret void
@@ -371,7 +371,7 @@ define void @store_misaligned64_constant_large_offsets() {
 @sgemm.lA = internal unnamed_addr addrspace(3) global [264 x float] undef, align 4
 @sgemm.lB = internal unnamed_addr addrspace(3) global [776 x float] undef, align 4
 
-define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workgroup.id.x() #1
   %y.i = tail call i32 @llvm.amdgcn.workitem.id.y() #1
   %val = load float, float addrspace(1)* %in
@@ -410,7 +410,7 @@ define void @write2_sgemm_sequence(float addrspace(1)* %C, i32 %lda, i32 %ldb, f
 ; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:2{{$}}
 ; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:1{{$}}
 ; CI: s_endpgm
-define void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2_v4f32_superreg_align4(<4 x float> addrspace(3)* %out, <4 x float> addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in
   %val0 = load <4 x float>, <4 x float> addrspace(1)* %in.gep, align 4
diff --git a/test/CodeGen/AMDGPU/ds_write2st64.ll b/test/CodeGen/AMDGPU/ds_write2st64.ll
index 872e7736140661348d8a4a8144285ce6f0041ce4..a395af34b67b9321f87f85f09af28fae32c93708 100644
--- a/test/CodeGen/AMDGPU/ds_write2st64.ll
+++ b/test/CodeGen/AMDGPU/ds_write2st64.ll
@@ -7,7 +7,7 @@
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2st64_b32 [[VPTR]], [[VAL]], [[VAL]] offset1:1
 ; SI: s_endpgm
-define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr float, float addrspace(1)* %in, i32 %x.i
   %val = load float, float addrspace(1)* %in.gep, align 4
@@ -25,7 +25,7 @@ define void @simple_write2st64_one_val_f32_0_1(float addrspace(1)* %C, float add
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset0:2 offset1:5
 ; SI: s_endpgm
-define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float addrspace(1)* %in) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -46,7 +46,7 @@ define void @simple_write2st64_two_val_f32_2_5(float addrspace(1)* %C, float add
 ; SI-DAG: v_lshlrev_b32_e32 [[VPTR:v[0-9]+]], 2, v{{[0-9]+}}
 ; SI: ds_write2st64_b32 [[VPTR]], [[VAL0]], [[VAL1]] offset1:255
 ; SI: s_endpgm
-define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, float addrspace(1)* %in, float addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr float, float addrspace(1)* %in.gep.0, i32 1
@@ -66,7 +66,7 @@ define void @simple_write2st64_two_val_max_offset_f32(float addrspace(1)* %C, fl
 ; SI-DAG: v_add_i32_e32 [[VPTR:v[0-9]+]],
 ; SI: ds_write2st64_b64 [[VPTR]], [[VAL0]], [[VAL1]] offset0:4 offset1:127
 ; SI: s_endpgm
-define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep.0 = getelementptr double, double addrspace(1)* %in, i32 %x.i
   %in.gep.1 = getelementptr double, double addrspace(1)* %in.gep.0, i32 1
@@ -85,7 +85,7 @@ define void @simple_write2st64_two_val_max_offset_f64(double addrspace(1)* %C, d
 ; SI-NOT: ds_write2st64_b64
 ; SI: ds_write2_b64 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:8
 ; SI: s_endpgm
-define void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
+define amdgpu_kernel void @byte_size_only_divisible_64_write2st64_f64(double addrspace(1)* %C, double addrspace(1)* %in, double addrspace(3)* %lds) #0 {
   %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr double, double addrspace(1)* %in, i32 %x.i
   %val = load double, double addrspace(1)* %in.gep, align 8
diff --git a/test/CodeGen/AMDGPU/dynamic_stackalloc.ll b/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
index 580dc00f935e7618b055c42b22eee7d881b85b5f..b1107ea7fbcbf5d8a4abdd2a42a90b5580917f21 100644
--- a/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
+++ b/test/CodeGen/AMDGPU/dynamic_stackalloc.ll
@@ -4,7 +4,7 @@
 
 ; CHECK: in function test_dynamic_stackalloc{{.*}}: unsupported dynamic alloca
 
-define void @test_dynamic_stackalloc(i32 addrspace(1)* %out, i32 %n) {
+define amdgpu_kernel void @test_dynamic_stackalloc(i32 addrspace(1)* %out, i32 %n) {
   %alloca = alloca i32, i32 %n
   store volatile i32 0, i32* %alloca
   ret void
diff --git a/test/CodeGen/AMDGPU/early-if-convert-cost.ll b/test/CodeGen/AMDGPU/early-if-convert-cost.ll
index d1624f8676517b262f26d5f86bb33e08130664e8..ace01593808b7eef238357c9eef4437f0d44ee5a 100644
--- a/test/CodeGen/AMDGPU/early-if-convert-cost.ll
+++ b/test/CodeGen/AMDGPU/early-if-convert-cost.ll
@@ -10,7 +10,7 @@
 ; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_LO:[0-9]+]], v[[ADD_LO]], v[[VAL_LO]], vcc
 ; GCN-DAG: v_cndmask_b32_e32 v[[RESULT_HI:[0-9]+]], v[[ADD_HI]], v[[VAL_HI]], vcc
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-define void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
 entry:
   %v = load double, double addrspace(1)* %in
   %cc = fcmp oeq double %v, 1.000000e+00
@@ -32,7 +32,7 @@ endif:
 ; GCN: v_add_f64
 ; GCN: v_cndmask_b32_e32
 ; GCN: v_cndmask_b32_e32
-define void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
 entry:
   %v = load double, double addrspace(2)* %in
   %cc = fcmp oeq double %v, 1.000000e+00
@@ -62,7 +62,7 @@ endif:
 
 ; GCN-DAG: buffer_store_dword v
 ; GCN-DAG: buffer_store_dwordx2
-define void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle96(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in, float %cnd) #0 {
 entry:
   %v = load <3 x i32>, <3 x i32> addrspace(1)* %in
   %cc = fcmp oeq float %cnd, 1.000000e+00
@@ -93,7 +93,7 @@ endif:
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
 
 ; GCN: buffer_store_dwordx4
-define void @test_vccnz_ifcvt_triangle128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, float %cnd) #0 {
 entry:
   %v = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %cc = fcmp oeq float %cnd, 1.000000e+00
diff --git a/test/CodeGen/AMDGPU/early-if-convert.ll b/test/CodeGen/AMDGPU/early-if-convert.ll
index 5ae1db8c686ff43b56569cd848cee3593482306f..9439130deb9ef79499a300b6826cfa83be163201 100644
--- a/test/CodeGen/AMDGPU/early-if-convert.ll
+++ b/test/CodeGen/AMDGPU/early-if-convert.ll
@@ -9,7 +9,7 @@
 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[VAL]], [[VAL]]
 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[VAL]], vcc
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %v = load float, float addrspace(1)* %in
   %cc = fcmp oeq float %v, 1.000000e+00
@@ -32,7 +32,7 @@ endif:
 ; GCN-DAG: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VAL]], [[VAL]]
 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], [[ADD]], [[MUL]], vcc
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_diamond(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %v = load float, float addrspace(1)* %in
   %cc = fcmp oeq float %v, 1.000000e+00
@@ -58,7 +58,7 @@ endif:
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc
 ; GCN: s_mov_b64 vcc, [[CMP]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, vcc
-define void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_vcc_clobber(i32 addrspace(1)* %out, i32 addrspace(1)* %in, float %k) #0 {
 entry:
   %v = load i32, i32 addrspace(1)* %in
   %cc = fcmp oeq float %k, 1.000000e+00
@@ -87,7 +87,7 @@ endif:
 ; GCN: v_mul_f32
 ; GCN: v_mul_f32
 ; GCN: v_cndmask_b32_e32
-define void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_max_cheap(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %v = load float, float addrspace(1)* %in
   %cc = fcmp oeq float %v, 1.000000e+00
@@ -128,7 +128,7 @@ endif:
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_min_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %v = load float, float addrspace(1)* %in
   %cc = fcmp oeq float %v, 1.000000e+00
@@ -162,7 +162,7 @@ endif:
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_expensive(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %v = load float, float addrspace(1)* %in
   %cc = fcmp oeq float %v, 1.000000e+00
@@ -187,7 +187,7 @@ endif:
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(2)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_sgpr_ifcvt_triangle(i32 addrspace(1)* %out, i32 addrspace(2)* %in, float %cnd) #0 {
 entry:
   %v = load i32, i32 addrspace(2)* %in
   %cc = fcmp oeq float %cnd, 1.000000e+00
@@ -206,7 +206,7 @@ endif:
 
 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_constant_load:
 ; GCN: v_cndmask_b32
-define void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(2)* %in) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_constant_load(float addrspace(1)* %out, float addrspace(2)* %in) #0 {
 entry:
   %v = load float, float addrspace(2)* %in
   %cc = fcmp oeq float %v, 1.000000e+00
@@ -227,7 +227,7 @@ endif:
 
 ; GCN-LABEL: {{^}}test_vccnz_ifcvt_triangle_argload:
 ; GCN: v_cndmask_b32
-define void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle_argload(float addrspace(1)* %out, float %v) #0 {
 entry:
   %cc = fcmp oeq float %v, 1.000000e+00
   br i1 %cc, label %if, label %endif
@@ -248,7 +248,7 @@ endif:
 ; GCN: s_add_i32 [[ADD:s[0-9]+]], [[VAL]], [[VAL]]
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 ; GCN-NEXT: s_cselect_b32 [[SELECT:s[0-9]+]], [[ADD]], [[VAL]]
-define void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(2)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle(i32 addrspace(2)* %in, i32 %cond) #0 {
 entry:
   %v = load i32, i32 addrspace(2)* %in
   %cc = icmp eq i32 %cond, 1
@@ -274,7 +274,7 @@ endif:
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_vgpr_ifcvt_triangle(float addrspace(1)* %out, float addrspace(1)* %in, i32 %cond) #0 {
 entry:
   %v = load float, float addrspace(1)* %in
   %cc = icmp eq i32 %cond, 1
@@ -295,7 +295,7 @@ endif:
 ; GCN: s_addc_u32
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
-define void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(2)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle64(i64 addrspace(2)* %in, i32 %cond) #0 {
 entry:
   %v = load i64, i64 addrspace(2)* %in
   %cc = icmp eq i32 %cond, 1
@@ -320,7 +320,7 @@ endif:
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
-define void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(2)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle96(<3 x i32> addrspace(2)* %in, i32 %cond) #0 {
 entry:
   %v = load <3 x i32>, <3 x i32> addrspace(2)* %in
   %cc = icmp eq i32 %cond, 1
@@ -345,7 +345,7 @@ endif:
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 1
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
 ; GCN-NEXT: s_cselect_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}
-define void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(2)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @test_scc1_sgpr_ifcvt_triangle128(<4 x i32> addrspace(2)* %in, i32 %cond) #0 {
 entry:
   %v = load <4 x i32>, <4 x i32> addrspace(2)* %in
   %cc = icmp eq i32 %cond, 1
@@ -364,7 +364,7 @@ endif:
 ; GCN-LABEL: {{^}}uniform_if_swap_br_targets_scc_constant_select:
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
 ; GCN: s_cselect_b32 s{{[0-9]+}}, 1, 0{{$}}
-define void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_swap_br_targets_scc_constant_select(i32 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %else, label %if
@@ -385,7 +385,7 @@ done:
 ; GCN: {{^}}; BB#0:
 ; GCN-NEXT: s_load_dwordx2
 ; GCN-NEXT: s_cselect_b32 s{{[0-9]+}}, 1, 0
-define void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @ifcvt_undef_scc(i32 %cond, i32 addrspace(1)* %out) {
 entry:
   br i1 undef, label %else, label %if
 
@@ -410,7 +410,7 @@ done:
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle256(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in, float %cnd) #0 {
 entry:
   %v = load <8 x i32>, <8 x i32> addrspace(1)* %in
   %cc = fcmp oeq float %cnd, 1.000000e+00
@@ -435,7 +435,7 @@ endif:
 
 ; GCN: [[ENDIF]]:
 ; GCN: buffer_store_dword
-define void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 {
+define amdgpu_kernel void @test_vccnz_ifcvt_triangle512(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in, float %cnd) #0 {
 entry:
   %v = load <16 x i32>, <16 x i32> addrspace(1)* %in
   %cc = fcmp oeq float %cnd, 1.000000e+00
diff --git a/test/CodeGen/AMDGPU/early-inline-alias.ll b/test/CodeGen/AMDGPU/early-inline-alias.ll
new file mode 100644
index 0000000000000000000000000000000000000000..42dfa4e7ab4f2560eff0668f4a9446ceb7cfaa33
--- /dev/null
+++ b/test/CodeGen/AMDGPU/early-inline-alias.ll
@@ -0,0 +1,12 @@
+; RUN: opt -mtriple=amdgcn-- -O1 -S -inline-threshold=1 %s | FileCheck %s
+
+; CHECK: @add1alias = alias i32 (i32), i32 (i32)* @add1
+; CHECK: @add1alias2 = alias i32 (i32), i32 (i32)* @add1
+
+@add1alias = alias i32 (i32), i32 (i32)* @add1
+@add1alias2 = alias i32 (i32), i32 (i32)* @add1
+
+define i32 @add1(i32) {
+  %2 = add nsw i32 %0, 1
+  ret i32 %2
+}
diff --git a/test/CodeGen/AMDGPU/early-inline.ll b/test/CodeGen/AMDGPU/early-inline.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c871d54bec7ed623c1c6ab3730da85c5d9c8bc72
--- /dev/null
+++ b/test/CodeGen/AMDGPU/early-inline.ll
@@ -0,0 +1,25 @@
+; RUN: opt -mtriple=amdgcn-- -O1 -S -inline-threshold=1 -amdgpu-early-inline-all %s | FileCheck %s
+
+; CHECK: @c_alias
+@c_alias = alias i32 (i32), i32 (i32)* @callee
+
+define i32 @callee(i32 %x) {
+entry:
+  %mul1 = mul i32 %x, %x
+  %mul2 = mul i32 %mul1, %x
+  %mul3 = mul i32 %mul1, %mul2
+  %mul4 = mul i32 %mul3, %mul2
+  %mul5 = mul i32 %mul4, %mul3
+  ret i32 %mul5
+}
+
+; CHECK-LABEL: @caller
+; CHECK: mul i32
+; CHECK-NOT: call i32
+
+define amdgpu_kernel void @caller(i32 %x) {
+entry:
+  %res = call i32 @callee(i32 %x)
+  store volatile i32 %res, i32 addrspace(1)* undef
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/elf.ll b/test/CodeGen/AMDGPU/elf.ll
index 628dd5ec839ec516a6e58611646e19fbf624a373..b22f8608d7e33c5fc82a8263b37496c20dcf311c 100644
--- a/test/CodeGen/AMDGPU/elf.ll
+++ b/test/CodeGen/AMDGPU/elf.ll
@@ -24,11 +24,13 @@
 ; TONGA-NEXT: .long   704
 ; CONFIG: .p2align 8
 ; CONFIG: test:
-define amdgpu_ps void @test(i32 %p) {
+define amdgpu_ps void @test(i32 %p) #0 {
    %i = add i32 %p, 2
    %r = bitcast i32 %i to float
-   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
+   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r, float %r, float %r, float %r, i1 true, i1 false)
    ret void
 }
 
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/elf.r600.ll b/test/CodeGen/AMDGPU/elf.r600.ll
index 51cd0850093286aa0ecf9c2ab15120bd46280251..93c5e55750336d207cfe5dfde48bd2335ca8326a 100644
--- a/test/CodeGen/AMDGPU/elf.r600.ll
+++ b/test/CodeGen/AMDGPU/elf.r600.ll
@@ -9,7 +9,7 @@
 ; CONFIG-NEXT: .long   2
 ; CONFIG-NEXT: .long   165900
 ; CONFIG-NEXT: .long   0
-define void @test(float addrspace(1)* %out, i32 %p) {
+define amdgpu_kernel void @test(float addrspace(1)* %out, i32 %p) {
    %i = add i32 %p, 2
    %r = bitcast i32 %i to float
    store float %r, float addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/else.ll b/test/CodeGen/AMDGPU/else.ll
index ef1e64763d4a5cc4ad695c3aac5e8446336cff79..22338e4f50e568c6e21dcd9b1e0470a8c2c08104 100644
--- a/test/CodeGen/AMDGPU/else.ll
+++ b/test/CodeGen/AMDGPU/else.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}else_no_execfix:
 ; CHECK: ; %Flow
 ; CHECK-NEXT: s_or_saveexec_b64 [[DST:s\[[0-9]+:[0-9]+\]]],
 ; CHECK-NEXT: s_xor_b64 exec, exec, [[DST]]
 ; CHECK-NEXT: ; mask branch
-define amdgpu_ps float @else_no_execfix(i32 %z, float %v) {
+define amdgpu_ps float @else_no_execfix(i32 %z, float %v) #0 {
 main_body:
   %cc = icmp sgt i32 %z, 5
   br i1 %cc, label %if, label %else
@@ -33,7 +33,7 @@ end:
 ; CHECK-NEXT: s_and_b64 [[AND_INIT:s\[[0-9]+:[0-9]+\]]], exec, [[DST]]
 ; CHECK-NEXT: s_xor_b64 exec, exec, [[AND_INIT]]
 ; CHECK-NEXT: ; mask branch
-define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) {
+define amdgpu_ps void @else_execfix_leave_wqm(i32 %z, float %v) #0 {
 main_body:
   %cc = icmp sgt i32 %z, 5
   br i1 %cc, label %if, label %else
@@ -44,8 +44,7 @@ if:
 
 else:
   %c = fmul float %v, 3.0
-  %c.i = bitcast float %c to i32
-  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %v.else = extractelement <4 x float> %tex, i32 0
   br label %end
 
@@ -55,6 +54,9 @@ end:
   ret void
 }
 
-declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) nounwind
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
 
-declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) nounwind readnone
+attributes #0 = { nounwind }
+attributes #1 = { nounwind writeonly }
+attributes #2 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/empty-function.ll b/test/CodeGen/AMDGPU/empty-function.ll
index a060900811eaea2ecfa6f3d8377fa856247bb1c4..1231cb4d1de2b482e91f86447181f0896eb7b056 100644
--- a/test/CodeGen/AMDGPU/empty-function.ll
+++ b/test/CodeGen/AMDGPU/empty-function.ll
@@ -7,14 +7,14 @@
 ; SI-LABEL: {{^}}empty_function_ret:
 ; SI: s_endpgm
 ; SI: codeLenInByte = 4
-define void @empty_function_ret() #0 {
+define amdgpu_kernel void @empty_function_ret() #0 {
   ret void
 }
 
 ; SI: .text
 ; SI-LABEL: {{^}}empty_function_unreachable:
 ; SI: codeLenInByte = 0
-define void @empty_function_unreachable() #0 {
+define amdgpu_kernel void @empty_function_unreachable() #0 {
   unreachable
 }
 
diff --git a/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll b/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
index 76b50b563013f1647bdd2d363b1eaeac43ee21b1..6eb1fc1d0cc29a3d12fa279699a0cfe88090498e 100644
--- a/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
+++ b/test/CodeGen/AMDGPU/enable-no-signed-zeros-fp-math.ll
@@ -9,7 +9,7 @@
 ; GCN-SAFE: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
 
 ; GCN-UNSAFE-NOT: xor
-define void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %a = load float, float addrspace(1)* %in, align 4
   %b = load float, float addrspace(1)* %b_ptr, align 4
diff --git a/test/CodeGen/AMDGPU/endcf-loop-header.ll b/test/CodeGen/AMDGPU/endcf-loop-header.ll
index c67095438ee590a6a304ae76e5da6bf590acef68..bd861e0c663edbfe1323dccb66084658f777be3d 100644
--- a/test/CodeGen/AMDGPU/endcf-loop-header.ll
+++ b/test/CodeGen/AMDGPU/endcf-loop-header.ll
@@ -12,7 +12,7 @@
 ; CHECK: [[LOOP_LABEL:[0-9A-Za-z_]+]]: ; %loop{{$}}
 ; CHECK-NOT: s_or_b64 exec, exec
 ; CHECK: s_cbranch_execnz [[LOOP_LABEL]]
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
 entry:
   %cond = call i32 @llvm.r600.read.tidig.x() #0
   %tmp0 = icmp eq i32 %cond, 0
diff --git a/test/CodeGen/AMDGPU/env-amdgiz.ll b/test/CodeGen/AMDGPU/env-amdgiz.ll
new file mode 100644
index 0000000000000000000000000000000000000000..70e4fb30d3aa57ece462e26f2b5cae232c0066ea
--- /dev/null
+++ b/test/CodeGen/AMDGPU/env-amdgiz.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa-amdgiz -verify-machineinstrs < %s
+; Just check the target feature and data layout is accepted without error.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"
+target triple = "amdgcn-amd-amdhsa-amdgiz"
+
+define void @foo() {
+entry:
+  ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/env-amdgizcl.ll b/test/CodeGen/AMDGPU/env-amdgizcl.ll
new file mode 100644
index 0000000000000000000000000000000000000000..feb213562c8021ff239e8cda5ca2dcfd5d3656ef
--- /dev/null
+++ b/test/CodeGen/AMDGPU/env-amdgizcl.ll
@@ -0,0 +1,11 @@
+; RUN: llc -march=amdgcn -mtriple=amdgcn-amd-amdhsa-amdgizcl -verify-machineinstrs < %s
+; Just check the target feature and data layout is accepted without error.
+
+target datalayout = "e-p:64:64-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-A5"
+target triple = "amdgcn-amd-amdhsa-amdgizcl"
+
+define void @foo() {
+entry:
+  ret void
+}
+
diff --git a/test/CodeGen/AMDGPU/exceed-max-sgprs.ll b/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
index 8ef54b9e95d32228e2d72fcc84202f7b8cb4a3f2..40d115bfc06065b86be3c1cd61fdeb1ec6958712 100644
--- a/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
+++ b/test/CodeGen/AMDGPU/exceed-max-sgprs.ll
@@ -1,7 +1,7 @@
 ; RUN: not llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
 ; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_tahiti
-define void @use_too_many_sgprs_tahiti() #0 {
+define amdgpu_kernel void @use_too_many_sgprs_tahiti() #0 {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
   call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
@@ -20,7 +20,7 @@ define void @use_too_many_sgprs_tahiti() #0 {
 }
 
 ; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire
-define void @use_too_many_sgprs_bonaire() #1 {
+define amdgpu_kernel void @use_too_many_sgprs_bonaire() #1 {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
   call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
@@ -39,7 +39,7 @@ define void @use_too_many_sgprs_bonaire() #1 {
 }
 
 ; ERROR: error: scalar registers limit of 104 exceeded (106) in use_too_many_sgprs_bonaire_flat_scr
-define void @use_too_many_sgprs_bonaire_flat_scr() #1 {
+define amdgpu_kernel void @use_too_many_sgprs_bonaire_flat_scr() #1 {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
   call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
@@ -59,7 +59,7 @@ define void @use_too_many_sgprs_bonaire_flat_scr() #1 {
 }
 
 ; ERROR: error: scalar registers limit of 96 exceeded (98) in use_too_many_sgprs_iceland
-define void @use_too_many_sgprs_iceland() #2 {
+define amdgpu_kernel void @use_too_many_sgprs_iceland() #2 {
   call void asm sideeffect "", "~{VCC}" ()
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
@@ -77,7 +77,7 @@ define void @use_too_many_sgprs_iceland() #2 {
 }
 
 ; ERROR: error: addressable scalar registers limit of 102 exceeded (103) in use_too_many_sgprs_fiji
-define void @use_too_many_sgprs_fiji() #3 {
+define amdgpu_kernel void @use_too_many_sgprs_fiji() #3 {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
   call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
diff --git a/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll b/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll
index cf384da2c5beb893c32805aa4b10d9af6430ed44..0fa06b87eba272114304516f3585e3c57064e037 100644
--- a/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll
+++ b/test/CodeGen/AMDGPU/extend-bit-ops-i16.ll
@@ -3,7 +3,7 @@
 ; GCN-LABEL: and_zext:
 ; GCN: v_and_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]]
-define void @and_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @and_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id
   %a = load i16, i16 addrspace(1)* %in
@@ -18,7 +18,7 @@ define void @and_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
 ; GCN-LABEL: or_zext:
 ; GCN: v_or_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]]
-define void @or_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @or_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id
   %a = load i16, i16 addrspace(1)* %in
@@ -33,7 +33,7 @@ define void @or_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
 ; GCN-LABEL: xor_zext:
 ; GCN: v_xor_b32_e32 [[VAL16:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0xffff, [[VAL16]]
-define void @xor_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @xor_zext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %ptr = getelementptr i16, i16 addrspace(1)* %in, i32 %id
   %a = load i16, i16 addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/extload-align.ll b/test/CodeGen/AMDGPU/extload-align.ll
index 9d2eb74c7ba9d1a61a6869d9a1466bbc0db778a3..4644800421d86e0ef21230417194c69ee481a2e7 100644
--- a/test/CodeGen/AMDGPU/extload-align.ll
+++ b/test/CodeGen/AMDGPU/extload-align.ll
@@ -9,7 +9,7 @@
 ; DEBUG: mem:LD2[<unknown>]{{[^(]}}
 ; DEBUG: {{^}}# End machine code for function extload_align.
 
-define void @extload_align(i32* %out, i32 %index) #0 {
+define amdgpu_kernel void @extload_align(i32* %out, i32 %index) #0 {
   %v0 = alloca [4 x i16]
   %a1 = getelementptr inbounds [4 x i16], [4 x i16]* %v0, i32 0, i32 0
   %a2 = getelementptr inbounds [4 x i16], [4 x i16]* %v0, i32 0, i32 1
diff --git a/test/CodeGen/AMDGPU/extload-private.ll b/test/CodeGen/AMDGPU/extload-private.ll
index 6cebe5f495c586383fcca693816ac4272cc8b47c..fd298b361d03ce22e5791f48bd562409c792f15b 100644
--- a/test/CodeGen/AMDGPU/extload-private.ll
+++ b/test/CodeGen/AMDGPU/extload-private.ll
@@ -2,8 +2,8 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-promote-alloca -amdgpu-sroa=0 -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}load_i8_sext_private:
-; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+$}}
-define void @load_i8_sext_private(i32 addrspace(1)* %out) {
+; SI: buffer_load_sbyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
+define amdgpu_kernel void @load_i8_sext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i8
   %tmp1 = load i8, i8* %tmp0
@@ -13,8 +13,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_i8_zext_private:
-; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+$}}
-define void @load_i8_zext_private(i32 addrspace(1)* %out) {
+; SI: buffer_load_ubyte v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
+define amdgpu_kernel void @load_i8_zext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i8
   %tmp1 = load i8, i8* %tmp0
@@ -24,8 +24,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_i16_sext_private:
-; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+$}}
-define void @load_i16_sext_private(i32 addrspace(1)* %out) {
+; SI: buffer_load_sshort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
+define amdgpu_kernel void @load_i16_sext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i16
   %tmp1 = load i16, i16* %tmp0
@@ -35,8 +35,8 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}load_i16_zext_private:
-; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+$}}
-define void @load_i16_zext_private(i32 addrspace(1)* %out) {
+; SI: buffer_load_ushort v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
+define amdgpu_kernel void @load_i16_zext_private(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca i16
   %tmp1 = load volatile i16, i16* %tmp0
diff --git a/test/CodeGen/AMDGPU/extload.ll b/test/CodeGen/AMDGPU/extload.ll
index 8b3e087d1f45381ed7fc6b7673cbd98b6e5c0304..a7b8e86220aae9bb9a05b6aa6c9365619385e046 100644
--- a/test/CodeGen/AMDGPU/extload.ll
+++ b/test/CodeGen/AMDGPU/extload.ll
@@ -10,7 +10,7 @@
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
 ; EG: VTX_READ_32 [[VAL]]
-define void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
+define amdgpu_kernel void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 addrspace(1)* nocapture noalias %src) nounwind {
   %cast = bitcast i8 addrspace(1)* %src to i32 addrspace(1)*
   %load = load i32, i32 addrspace(1)* %cast
   %x = bitcast i32 %load to <4 x i8>
@@ -25,7 +25,7 @@ define void @global_anyext_load_i8(i8 addrspace(1)* nocapture noalias %out, i8 a
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+.[XYZW]]],
 ; EG: VTX_READ_32 [[VAL]]
-define void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
+define amdgpu_kernel void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i16 addrspace(1)* nocapture noalias %src) nounwind {
   %cast = bitcast i16 addrspace(1)* %src to i32 addrspace(1)*
   %load = load i32, i32 addrspace(1)* %cast
   %x = bitcast i32 %load to <2 x i16>
@@ -40,7 +40,7 @@ define void @global_anyext_load_i16(i16 addrspace(1)* nocapture noalias %out, i1
 
 ; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
 ; EG: LDS_WRITE * [[VAL]]
-define void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
+define amdgpu_kernel void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 addrspace(3)* nocapture noalias %src) nounwind {
   %cast = bitcast i8 addrspace(3)* %src to i32 addrspace(3)*
   %load = load i32, i32 addrspace(3)* %cast
   %x = bitcast i32 %load to <4 x i8>
@@ -55,7 +55,7 @@ define void @local_anyext_load_i8(i8 addrspace(3)* nocapture noalias %out, i8 ad
 
 ; EG: LDS_READ_RET {{.*}}, [[VAL:T[0-9]+.[XYZW]]]
 ; EG: LDS_WRITE * [[VAL]]
-define void @local_anyext_load_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
+define amdgpu_kernel void @local_anyext_load_i16(i16 addrspace(3)* nocapture noalias %out, i16 addrspace(3)* nocapture noalias %src) nounwind {
   %cast = bitcast i16 addrspace(3)* %src to i32 addrspace(3)*
   %load = load i32, i32 addrspace(3)* %cast
   %x = bitcast i32 %load to <2 x i16>
diff --git a/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll b/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
index 4edff152e66e07bf57d4223cf909e39e01a6df5e..be85ca933c33e8b8c984a65522c85dd08162b370 100644
--- a/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
+++ b/test/CodeGen/AMDGPU/extract-vector-elt-build-vector-combine.ll
@@ -13,7 +13,7 @@
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
-define void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
+define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
                                                     <4 x i32> addrspace(1)* noalias %out1,
                                                     i32 addrspace(1)* noalias %out2,
                                                     i32 addrspace(1)* %in) {
@@ -55,7 +55,7 @@ define void @store_build_vector_multiple_uses_v4i32(<4 x i32> addrspace(1)* noal
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
-define void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
+define amdgpu_kernel void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(1)* noalias %out0,
                                                             <4 x i32> addrspace(1)* noalias %out1,
                                                             i32 addrspace(1)* noalias %out2,
                                                             i32 addrspace(1)* %in) {
@@ -99,7 +99,7 @@ define void @store_build_vector_multiple_extract_uses_v4i32(<4 x i32> addrspace(
 
 ; GCN: buffer_store_dwordx2
 ; GCN: buffer_store_dwordx2
-define void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0,
+define amdgpu_kernel void @store_build_vector_multiple_uses_v4i32_bitcast_to_v2i64(<2 x i64> addrspace(1)* noalias %out0,
                                                                      <4 x i32> addrspace(1)* noalias %out1,
                                                                      i64 addrspace(1)* noalias %out2,
                                                                      i32 addrspace(1)* %in) {
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll b/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
index 84c7955c1384f7ab122a07a4f29aebe8a63afae7..1f567ae0508174ecf8dae9671af9ff95c542eea7 100644
--- a/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-f16.ll
@@ -8,7 +8,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN-DAG: buffer_store_short [[VELT0]]
 ; GCN-DAG: buffer_store_short [[VELT1]]
-define void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
   %p0 = extractelement <2 x half> %vec, i32 0
   %p1 = extractelement <2 x half> %vec, i32 1
@@ -26,7 +26,7 @@ define void @extract_vector_elt_v2f16(half addrspace(1)* %out, <2 x half> addrsp
 ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN: buffer_store_short [[VELT1]]
 ; GCN: ScratchSize: 0
-define void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 %idx) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 %idx) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
   %elt = extractelement <2 x half> %vec, i32 %idx
   store half %elt, half addrspace(1)* %out, align 2
@@ -45,7 +45,7 @@ define void @extract_vector_elt_v2f16_dynamic_sgpr(half addrspace(1)* %out, <2 x
 ; SI: buffer_store_short [[ELT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]]
 ; GCN: ScratchSize: 0{{$}}
-define void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
@@ -61,7 +61,7 @@ define void @extract_vector_elt_v2f16_dynamic_vgpr(half addrspace(1)* %out, <2 x
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_short
 ; GCN: buffer_store_short
-define void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo) #0 {
   %p0 = extractelement <3 x half> %foo, i32 0
   %p1 = extractelement <3 x half> %foo, i32 2
   %out1 = getelementptr half, half addrspace(1)* %out, i32 1
@@ -75,7 +75,7 @@ define void @extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo)
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_short
 ; GCN: buffer_store_short
-define void @extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo) #0 {
   %p0 = extractelement <4 x half> %foo, i32 0
   %p1 = extractelement <4 x half> %foo, i32 2
   %out1 = getelementptr half, half addrspace(1)* %out, i32 10
@@ -95,7 +95,7 @@ define void @extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo)
 
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_short
-define void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half> %foo, i32 %idx) #0 {
   %p0 = extractelement <3 x half> %foo, i32 %idx
   %out1 = getelementptr half, half addrspace(1)* %out, i32 1
   store half %p0, half addrspace(1)* %out
@@ -115,7 +115,7 @@ define void @dynamic_extract_vector_elt_v3f16(half addrspace(1)* %out, <3 x half
 
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_short
-define void @dynamic_extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v4f16(half addrspace(1)* %out, <4 x half> %foo, i32 %idx) #0 {
   %p0 = extractelement <4 x half> %foo, i32 %idx
   %out1 = getelementptr half, half addrspace(1)* %out, i32 1
   store half %p0, half addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll b/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
index 4594379dae03eb985ce04d5af17e0201c442a010..db5bf0b4e8089fd2b532ef9ed06027284c62715b 100644
--- a/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-f64.ll
@@ -5,7 +5,7 @@
 ; GCN: buffer_load_dwordx4
 ; GCN: buffer_load_dwordx2
 ; GCN: buffer_store_dwordx2
-define void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
   %ld = load volatile <3 x double>, <3 x double> addrspace(1)* %in
   %elt = extractelement <3 x double> %ld, i32 2
   store volatile double %elt, double addrspace(1)* %out
@@ -13,14 +13,14 @@ define void @extract_vector_elt_v3f64_2(double addrspace(1)* %out, <3 x double>
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3f64:
-define void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v3f64(double addrspace(1)* %out, <3 x double> %foo, i32 %elt) #0 {
   %dynelt = extractelement <3 x double> %foo, i32 %elt
   store volatile double %dynelt, double addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4f64:
-define void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v4f64(double addrspace(1)* %out, <4 x double> %foo, i32 %elt) #0 {
   %dynelt = extractelement <4 x double> %foo, i32 %elt
   store volatile double %dynelt, double addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
index c1dbd424ac31213c83a62c056428d0a018462fe5..9b117d48a9804667d3fd6994a6baa709011108a2 100644
--- a/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-i16.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SICIVI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SICIVI %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
 
 ; GCN-LABEL: {{^}}extract_vector_elt_v2i16:
 ; GCN: s_load_dword [[VEC:s[0-9]+]]
@@ -8,7 +9,7 @@
 ; GCN-DAG: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN-DAG: buffer_store_short [[VELT0]]
 ; GCN-DAG: buffer_store_short [[VELT1]]
-define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %p0 = extractelement <2 x i16> %vec, i32 0
   %p1 = extractelement <2 x i16> %vec, i32 1
@@ -26,7 +27,7 @@ define void @extract_vector_elt_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspac
 ; GCN: v_mov_b32_e32 [[VELT1:v[0-9]+]], [[ELT1]]
 ; GCN: buffer_store_short [[VELT1]]
 ; GCN: ScratchSize: 0
-define void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %idx) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %idx) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %elt = extractelement <2 x i16> %vec, i32 %idx
   store i16 %elt, i16 addrspace(1)* %out, align 2
@@ -44,7 +45,7 @@ define void @extract_vector_elt_v2i16_dynamic_sgpr(i16 addrspace(1)* %out, <2 x
 ; SI: buffer_store_short [[ELT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[ELT]]
 ; GCN: ScratchSize: 0{{$}}
-define void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(1)* %idx.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %idx.ptr, i64 %tid.ext
@@ -60,7 +61,7 @@ define void @extract_vector_elt_v2i16_dynamic_vgpr(i16 addrspace(1)* %out, <2 x
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_short
 ; GCN: buffer_store_short
-define void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0 {
   %p0 = extractelement <3 x i16> %foo, i32 0
   %p1 = extractelement <3 x i16> %foo, i32 2
   %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
@@ -70,16 +71,23 @@ define void @extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo) #0
 }
 
 ; GCN-LABEL: {{^}}extract_vector_elt_v4i16:
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 {
+; SICIVI: buffer_load_ushort
+; SICIVI: buffer_load_ushort
+; SICIVI: buffer_store_short
+; SICIVI: buffer_store_short
+
+; GFX9-DAG: s_load_dword [[LOAD0:s[0-9]+]], s[0:1], 0x2c
+; GFX9-DAG: s_load_dword [[LOAD1:s[0-9]+]], s[0:1], 0x30
+; GFX9-DAG: v_mov_b32_e32 [[VLOAD0:v[0-9]+]], [[LOAD0]]
+; GFX9-DAG: buffer_store_short [[VLOAD0]], off
+; GFX9-DAG: v_mov_b32_e32 [[VLOAD1:v[0-9]+]], [[LOAD1]]
+; GFX9-DAG: buffer_store_short [[VLOAD1]], off
+define amdgpu_kernel void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0 {
   %p0 = extractelement <4 x i16> %foo, i32 0
   %p1 = extractelement <4 x i16> %foo, i32 2
   %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 10
-  store i16 %p1, i16 addrspace(1)* %out, align 2
-  store i16 %p0, i16 addrspace(1)* %out1, align 2
+  store volatile i16 %p1, i16 addrspace(1)* %out, align 2
+  store volatile i16 %p0, i16 addrspace(1)* %out1, align 2
   ret void
 }
 
@@ -88,13 +96,16 @@ define void @extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo) #0
 ; GCN: buffer_load_ushort
 ; GCN: buffer_load_ushort
 
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
+; SICIVI: buffer_store_short
+; SICIVI: buffer_store_short
+; SICIVI: buffer_store_short
+
+; GFX9: buffer_store_dword
+; GFX9: buffer_store_dword
 
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_short
-define void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16> %foo, i32 %idx) #0 {
   %p0 = extractelement <3 x i16> %foo, i32 %idx
   %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
   store i16 %p0, i16 addrspace(1)* %out
@@ -102,19 +113,25 @@ define void @dynamic_extract_vector_elt_v3i16(i16 addrspace(1)* %out, <3 x i16>
 }
 
 ; GCN-LABEL: {{^}}dynamic_extract_vector_elt_v4i16:
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
+; SICIVI: buffer_load_ushort
+; SICIVI: buffer_load_ushort
+; SICIVI: buffer_load_ushort
+; SICIVI: buffer_load_ushort
 
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
-; GCN: buffer_store_short
+; SICIVI: buffer_store_short
+; SICIVI: buffer_store_short
+; SICIVI: buffer_store_short
+; SICIVI: buffer_store_short
 
-; GCN: buffer_load_ushort
-; GCN: buffer_store_short
-define void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 {
+; SICIVI: buffer_load_ushort
+; SICIVI: buffer_store_short
+
+; GFX9: s_load_dword
+; GFX9: buffer_store_dword
+; GFX9: buffer_store_dword
+; GFX9: buffer_load_ushort
+; GFX9: buffer_store_short
+define amdgpu_kernel void @dynamic_extract_vector_elt_v4i16(i16 addrspace(1)* %out, <4 x i16> %foo, i32 %idx) #0 {
   %p0 = extractelement <4 x i16> %foo, i32 %idx
   %out1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
   store i16 %p0, i16 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
index 1df91c93329aea052ebfb7e911472b7579fc3612..a8d127879a32de5ad1c538682c5c77a515c4d63c 100644
--- a/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-i64.ll
@@ -8,7 +8,7 @@
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dwordx2
-define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) #0 {
+define amdgpu_kernel void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %val) #0 {
   %vec = bitcast i64 %val to <2 x i32>
   %elt0 = extractelement <2 x i32> %vec, i32 0
   %elt1 = extractelement <2 x i32> %vec, i32 1
@@ -20,7 +20,7 @@ define void @extract_vector_elt_select_error(i32 addrspace(1)* %out, i64 addrspa
 }
 
 ; GCN-LABEL: {{^}}extract_vector_elt_v2i64:
-define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0 {
   %p0 = extractelement <2 x i64> %foo, i32 0
   %p1 = extractelement <2 x i64> %foo, i32 1
   %out1 = getelementptr i64, i64 addrspace(1)* %out, i32 1
@@ -30,14 +30,14 @@ define void @extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo) #0
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64:
-define void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v2i64(i64 addrspace(1)* %out, <2 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <2 x i64> %foo, i32 %elt
   store volatile i64 %dynelt, i64 addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v2i64_2:
-define void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %foo, i32 %elt, <2 x i64> %arst) #0 {
   %load = load volatile <2 x i64>, <2 x i64> addrspace(1)* %foo
   %or = or <2 x i64> %load, %arst
   %dynelt = extractelement <2 x i64> %or, i32 %elt
@@ -46,14 +46,14 @@ define void @dyn_extract_vector_elt_v2i64_2(i64 addrspace(1)* %out, <2 x i64> ad
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v3i64:
-define void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v3i64(i64 addrspace(1)* %out, <3 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <3 x i64> %foo, i32 %elt
   store volatile i64 %dynelt, i64 addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}dyn_extract_vector_elt_v4i64:
-define void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 {
+define amdgpu_kernel void @dyn_extract_vector_elt_v4i64(i64 addrspace(1)* %out, <4 x i64> %foo, i32 %elt) #0 {
   %dynelt = extractelement <4 x i64> %foo, i32 %elt
   store volatile i64 %dynelt, i64 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll b/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
index 6f4ae827f432bd10a695bd99f1d5bb69e78b5f23..b7d768fd55254ebe7cea4a7efc0f807f9ef6e562 100644
--- a/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
+++ b/test/CodeGen/AMDGPU/extract_vector_elt-i8.ll
@@ -4,7 +4,7 @@
 ; FUNC-LABEL: {{^}}extract_vector_elt_v1i8:
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 {
   %p0 = extractelement <1 x i8> %foo, i32 0
   store i8 %p0, i8 addrspace(1)* %out
   ret void
@@ -15,7 +15,7 @@ define void @extract_vector_elt_v1i8(i8 addrspace(1)* %out, <1 x i8> %foo) #0 {
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 {
   %p0 = extractelement <2 x i8> %foo, i32 0
   %p1 = extractelement <2 x i8> %foo, i32 1
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -29,7 +29,7 @@ define void @extract_vector_elt_v2i8(i8 addrspace(1)* %out, <2 x i8> %foo) #0 {
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 {
   %p0 = extractelement <3 x i8> %foo, i32 0
   %p1 = extractelement <3 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -43,7 +43,7 @@ define void @extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo) #0 {
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 {
   %p0 = extractelement <4 x i8> %foo, i32 0
   %p1 = extractelement <4 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -57,7 +57,7 @@ define void @extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo) #0 {
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 {
   %p0 = extractelement <8 x i8> %foo, i32 0
   %p1 = extractelement <8 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -71,7 +71,7 @@ define void @extract_vector_elt_v8i8(i8 addrspace(1)* %out, <8 x i8> %foo) #0 {
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0 {
   %p0 = extractelement <16 x i8> %foo, i32 0
   %p1 = extractelement <16 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -85,7 +85,7 @@ define void @extract_vector_elt_v16i8(i8 addrspace(1)* %out, <16 x i8> %foo) #0
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0 {
   %p0 = extractelement <32 x i8> %foo, i32 0
   %p1 = extractelement <32 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -99,7 +99,7 @@ define void @extract_vector_elt_v32i8(i8 addrspace(1)* %out, <32 x i8> %foo) #0
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 {
+define amdgpu_kernel void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0 {
   %p0 = extractelement <64 x i8> %foo, i32 0
   %p1 = extractelement <64 x i8> %foo, i32 2
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
@@ -120,7 +120,7 @@ define void @extract_vector_elt_v64i8(i8 addrspace(1)* %out, <64 x i8> %foo) #0
 ; GCN: buffer_store_byte
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
-define void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %foo, i32 %idx) #0 {
   %p0 = extractelement <3 x i8> %foo, i32 %idx
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
   store i8 %p0, i8 addrspace(1)* %out
@@ -141,7 +141,7 @@ define void @dynamic_extract_vector_elt_v3i8(i8 addrspace(1)* %out, <3 x i8> %fo
 ; GCN: buffer_store_byte
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_byte
-define void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 {
+define amdgpu_kernel void @dynamic_extract_vector_elt_v4i8(i8 addrspace(1)* %out, <4 x i8> %foo, i32 %idx) #0 {
   %p0 = extractelement <4 x i8> %foo, i32 %idx
   %out1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
   store i8 %p0, i8 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
index e160c20a03a0c2b6a5adcb5ede4aa0ac08178844..34999fa3aea43584e11aa6a55ae39f1a09cb1e93 100644
--- a/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
+++ b/test/CodeGen/AMDGPU/extractelt-to-trunc.ll
@@ -7,7 +7,7 @@
 ; GCN-DAG: buffer_load_dword [[A:v[0-9]+]]
 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, [[B]], [[A]]
 ; GCN: buffer_store_dword [[ADD]]
-define void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
+define amdgpu_kernel void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
    %a = load i64, i64 addrspace(1)* %in
    %add = add i64 %a, %b
    %val.bc = bitcast i64 %add to <2 x i32>
@@ -20,7 +20,7 @@ define void @bitcast_int_to_vector_extract_0(i32 addrspace(1)* %out, i64 addrspa
 ; GCN: buffer_load_dwordx2
 ; GCN: v_add_f64
 ; GCN: buffer_store_dword v
-define void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) {
+define amdgpu_kernel void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrspace(1)* %in, double %b) {
    %a = load double, double addrspace(1)* %in
    %add = fadd double %a, %b
    %val.bc = bitcast double %add to <2 x i32>
@@ -33,7 +33,7 @@ define void @bitcast_fp_to_vector_extract_0(i32 addrspace(1)* %out, double addrs
 ; GCN: buffer_load_dwordx2
 ; GCN: v_add_i32
 ; GCN: buffer_store_dword
-define void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
+define amdgpu_kernel void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 addrspace(1)* %in, i64 %b) {
    %a = load i64, i64 addrspace(1)* %in
    %add = add i64 %a, %b
    %val.bc = bitcast i64 %add to <2 x float>
@@ -45,7 +45,7 @@ define void @bitcast_int_to_fpvector_extract_0(float addrspace(1)* %out, i64 add
 ; GCN-LABEL: {{^}}no_extract_volatile_load_extract0:
 ; GCN: buffer_load_dwordx4
 ; GCN: buffer_store_dword v
-define void @no_extract_volatile_load_extract0(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @no_extract_volatile_load_extract0(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
 entry:
   %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
   %elt0 = extractelement <4 x i32> %vec, i32 0
@@ -57,7 +57,7 @@ entry:
 ; GCN: buffer_load_dwordx4
 ; GCN: buffer_store_dword v
 
-define void @no_extract_volatile_load_extract2(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @no_extract_volatile_load_extract2(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
 entry:
   %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
   %elt2 = extractelement <4 x i32> %vec, i32 2
@@ -68,7 +68,7 @@ entry:
 ; GCN-LABEL: {{^}}no_extract_volatile_load_dynextract:
 ; GCN: buffer_load_dwordx4
 ; GCN: buffer_store_dword v
-define void @no_extract_volatile_load_dynextract(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @no_extract_volatile_load_dynextract(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
 entry:
   %vec = load volatile <4 x i32>, <4 x i32> addrspace(1)* %in
   %eltN = extractelement <4 x i32> %vec, i32 %idx
diff --git a/test/CodeGen/AMDGPU/fabs.f16.ll b/test/CodeGen/AMDGPU/fabs.f16.ll
index c64aa6228c71801743eedad6be339b7c18ea79c9..d4ef7124a334c77ceaadf518f98c57de18129f25 100644
--- a/test/CodeGen/AMDGPU/fabs.f16.ll
+++ b/test/CodeGen/AMDGPU/fabs.f16.ll
@@ -1,69 +1,74 @@
 ; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
 ; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
 
 ; DAGCombiner will transform:
 ; (fabs (f16 bitcast (i16 a))) => (f16 bitcast (and (i16 a), 0x7FFFFFFF))
 ; unless isFabsFree returns true
 
-; GCN-LABEL: {{^}}fabs_free_f16:
+; GCN-LABEL: {{^}}s_fabs_free_f16:
 ; GCN: flat_load_ushort [[VAL:v[0-9]+]],
 ; GCN: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]]
 ; GCN: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
 
-define void @fabs_free_f16(half addrspace(1)* %out, i16 %in) {
+define amdgpu_kernel void @s_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
   %bc= bitcast i16 %in to half
   %fabs = call half @llvm.fabs.f16(half %bc)
   store half %fabs, half addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}fabs_f16:
+; GCN-LABEL: {{^}}s_fabs_f16:
 ; CI: flat_load_ushort [[VAL:v[0-9]+]],
-; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[VAL]]
-; CI: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], |[[CVT0]]|
+; CI: v_and_b32_e32 [[RESULT:v[0-9]+]], 0x7fff, [[VAL]]
 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fabs_f16(half addrspace(1)* %out, half %in) {
+define amdgpu_kernel void @s_fabs_f16(half addrspace(1)* %out, half %in) {
   %fabs = call half @llvm.fabs.f16(half %in)
   store half %fabs, half addrspace(1)* %out
   ret void
 }
 
 ; FIXME: Should be able to use single and
-; GCN-LABEL: {{^}}fabs_v2f16:
-; CI: v_cvt_f32_f16_e32 v{{[0-9]+}},
-; CI: v_cvt_f32_f16_e32 v{{[0-9]+}},
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
+; GCN-LABEL: {{^}}s_fabs_v2f16:
+; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
+; CI: v_or_b32_e32
 
-; VI: flat_load_ushort [[LO:v[0-9]+]]
 ; VI: flat_load_ushort [[HI:v[0-9]+]]
+; VI: flat_load_ushort [[LO:v[0-9]+]]
 ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
-; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[LO]]
 ; VI-DAG: v_and_b32_e32 [[FABS_LO:v[0-9]+]], [[MASK]], [[HI]]
-; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
-; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
-; VI: v_or_b32
+; VI-DAG: v_and_b32_sdwa [[FABS_HI:v[0-9]+]], [[MASK]], [[LO]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, [[FABS_HI]], [[FABS_LO]]
 ; VI: flat_store_dword
-define void @fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
+
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
+define amdgpu_kernel void @s_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
   store <2 x half> %fabs, <2 x half> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}fabs_v4f16:
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|
+; GCN-LABEL: {{^}}s_fabs_v4f16:
+; CI: s_movk_i32 [[MASK:s[0-9]+]], 0x7fff
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]]
 
 ; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x7fff{{$}}
-; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
-; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
-; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
-; VI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-DAG: v_and_b32_sdwa v{{[0-9]+}}, [[MASK]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; VI-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI:     v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 ; GCN: flat_store_dwordx2
-define void @fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
+define amdgpu_kernel void @s_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
   %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
   store <4 x half> %fabs, <4 x half> addrspace(1)* %out
   ret void
@@ -72,22 +77,74 @@ define void @fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
 ; GCN-LABEL: {{^}}fabs_fold_f16:
 ; GCN: flat_load_ushort [[IN0:v[0-9]+]]
 ; GCN: flat_load_ushort [[IN1:v[0-9]+]]
+
 ; CI-DAG: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[IN0]]
-; CI-DAG: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], [[IN1]]
-; CI: v_mul_f32_e64 [[RESULT:v[0-9]+]],  |[[CVT1]]|, [[CVT0]]
+; CI-DAG: v_cvt_f32_f16_e64 [[ABS_CVT1:v[0-9]+]], |[[IN1]]|
+; CI: v_mul_f32_e32 [[RESULT:v[0-9]+]],  [[CVT0]], [[ABS_CVT1]]
 ; CI: v_cvt_f16_f32_e32 [[CVTRESULT:v[0-9]+]], [[RESULT]]
 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVTRESULT]]
 
 ; VI-NOT: and
 ; VI: v_mul_f16_e64 [[RESULT:v[0-9]+]], |[[IN1]]|, [[IN0]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) {
+define amdgpu_kernel void @fabs_fold_f16(half addrspace(1)* %out, half %in0, half %in1) {
   %fabs = call half @llvm.fabs.f16(half %in0)
   %fmul = fmul half %fabs, %in1
   store half %fmul, half addrspace(1)* %out
   ret void
 }
 
-declare half @llvm.fabs.f16(half) readnone
-declare <2 x half> @llvm.fabs.v2f16(<2 x half>) readnone
-declare <4 x half> @llvm.fabs.v4f16(<4 x half>) readnone
+; GCN-LABEL: {{^}}v_fabs_v2f16:
+; GCN: flat_load_dword [[VAL:v[0-9]+]]
+; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, [[VAL]]
+define amdgpu_kernel void @v_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
+  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
+  store <2 x half> %fabs, <2 x half> addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_free_v2f16:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; GCN: s_and_b32 s{{[0-9]+}}, [[VAL]], 0x7fff7fff
+define amdgpu_kernel void @fabs_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
+  %bc = bitcast i32 %in to <2 x half>
+  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %bc)
+  store <2 x half> %fabs, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fabs_fold_v2f16:
+; GCN: flat_load_dword [[VAL:v[0-9]+]]
+
+; CI: v_cvt_f32_f16_e32
+; CI: v_cvt_f32_f16_e32
+; CI: v_mul_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
+; CI: v_cvt_f16_f32
+; CI: v_mul_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
+; CI: v_cvt_f16_f32
+
+; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
+; VI: v_mul_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, v{{[0-9]+}}
+
+; GFX9: v_and_b32_e32 [[FABS:v[0-9]+]], 0x7fff7fff, [[VAL]]
+; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[FABS]], v{{[0-9]+$}}
+define amdgpu_kernel void @v_fabs_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+  %val = load <2 x half>, <2 x half> addrspace(1)* %in
+  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
+  %fmul = fmul <2 x half> %fabs, %val
+  store <2 x half> %fmul, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+declare half @llvm.fabs.f16(half) #1
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
+declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fabs.f64.ll b/test/CodeGen/AMDGPU/fabs.f64.ll
index f7780b875ff59be18a61db1ffe01f397d291b36a..998e02f7bdf84f5ccb6cf7619f7b5ee69baf16e8 100644
--- a/test/CodeGen/AMDGPU/fabs.f64.ll
+++ b/test/CodeGen/AMDGPU/fabs.f64.ll
@@ -10,7 +10,7 @@ declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone
 ; FUNC-LABEL: {{^}}v_fabs_f64:
 ; SI: v_and_b32
 ; SI: s_endpgm
-define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %tidext = sext i32 %tid to i64
   %gep = getelementptr double, double addrspace(1)* %in, i64 %tidext
@@ -24,7 +24,7 @@ define void @v_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
 ; SI: v_and_b32
 ; SI-NOT: v_and_b32
 ; SI: s_endpgm
-define void @fabs_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fabs_f64(double addrspace(1)* %out, double %in) {
   %fabs = call double @llvm.fabs.f64(double %in)
   store double %fabs, double addrspace(1)* %out
   ret void
@@ -34,7 +34,7 @@ define void @fabs_f64(double addrspace(1)* %out, double %in) {
 ; SI: v_and_b32
 ; SI: v_and_b32
 ; SI: s_endpgm
-define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
   %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
   store <2 x double> %fabs, <2 x double> addrspace(1)* %out
   ret void
@@ -46,7 +46,7 @@ define void @fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
 ; SI: v_and_b32
 ; SI: v_and_b32
 ; SI: s_endpgm
-define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
   %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
   store <4 x double> %fabs, <4 x double> addrspace(1)* %out
   ret void
@@ -57,7 +57,7 @@ define void @fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
 ; SI-NOT: and
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|
 ; SI: s_endpgm
-define void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
+define amdgpu_kernel void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
   %fabs = call double @llvm.fabs.f64(double %in0)
   %fmul = fmul double %fabs, %in1
   store double %fmul, double addrspace(1)* %out
@@ -69,7 +69,7 @@ define void @fabs_fold_f64(double addrspace(1)* %out, double %in0, double %in1)
 ; SI-NOT: and
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|
 ; SI: s_endpgm
-define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
+define amdgpu_kernel void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in1) {
   %fabs = call double @fabs(double %in0)
   %fmul = fmul double %fabs, %in1
   store double %fmul, double addrspace(1)* %out
@@ -79,7 +79,7 @@ define void @fabs_fn_fold_f64(double addrspace(1)* %out, double %in0, double %in
 ; FUNC-LABEL: {{^}}fabs_free_f64:
 ; SI: v_and_b32
 ; SI: s_endpgm
-define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @fabs_free_f64(double addrspace(1)* %out, i64 %in) {
   %bc= bitcast i64 %in to double
   %fabs = call double @llvm.fabs.f64(double %bc)
   store double %fabs, double addrspace(1)* %out
@@ -89,7 +89,7 @@ define void @fabs_free_f64(double addrspace(1)* %out, i64 %in) {
 ; FUNC-LABEL: {{^}}fabs_fn_free_f64:
 ; SI: v_and_b32
 ; SI: s_endpgm
-define void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
   %bc= bitcast i64 %in to double
   %fabs = call double @fabs(double %bc)
   store double %fabs, double addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fabs.ll b/test/CodeGen/AMDGPU/fabs.ll
index 98e7f9e3e9ad1254c55b56b9849dddbf25d7c5ed..ac8fa3e45ef51e14211f5af8fc7680382ac357e5 100644
--- a/test/CodeGen/AMDGPU/fabs.ll
+++ b/test/CodeGen/AMDGPU/fabs.ll
@@ -13,7 +13,7 @@
 
 ; GCN: v_and_b32
 
-define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fabs_fn_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
   %fabs = call float @fabs(float %bc)
   store float %fabs, float addrspace(1)* %out
@@ -26,7 +26,7 @@ define void @fabs_fn_free(float addrspace(1)* %out, i32 %in) {
 
 ; GCN: v_and_b32
 
-define void @fabs_free(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fabs_free(float addrspace(1)* %out, i32 %in) {
   %bc= bitcast i32 %in to float
   %fabs = call float @llvm.fabs.f32(float %bc)
   store float %fabs, float addrspace(1)* %out
@@ -37,7 +37,7 @@ define void @fabs_free(float addrspace(1)* %out, i32 %in) {
 ; R600: |{{(PV|T[0-9])\.[XYZW]}}|
 
 ; GCN: v_and_b32
-define void @fabs_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fabs_f32(float addrspace(1)* %out, float %in) {
   %fabs = call float @llvm.fabs.f32(float %in)
   store float %fabs, float addrspace(1)* %out
   ret void
@@ -49,7 +49,7 @@ define void @fabs_f32(float addrspace(1)* %out, float %in) {
 
 ; GCN: v_and_b32
 ; GCN: v_and_b32
-define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
   store <2 x float> %fabs, <2 x float> addrspace(1)* %out
   ret void
@@ -65,7 +65,7 @@ define void @fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 ; GCN: v_and_b32
 ; GCN: v_and_b32
 ; GCN: v_and_b32
-define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
   %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
   store <4 x float> %fabs, <4 x float> addrspace(1)* %out
   ret void
@@ -76,7 +76,7 @@ define void @fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
 ; GCN-NOT: and
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]|
-define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
+define amdgpu_kernel void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
   %fabs = call float @fabs(float %in0)
   %fmul = fmul float %fabs, %in1
   store float %fmul, float addrspace(1)* %out
@@ -88,7 +88,7 @@ define void @fabs_fn_fold(float addrspace(1)* %out, float %in0, float %in1) {
 ; VI: s_load_dword [[ABS_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
 ; GCN-NOT: and
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |[[ABS_VALUE]]|
-define void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
+define amdgpu_kernel void @fabs_fold(float addrspace(1)* %out, float %in0, float %in1) {
   %fabs = call float @llvm.fabs.f32(float %in0)
   %fmul = fmul float %fabs, %in1
   store float %fmul, float addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
index b74bce76f79c443d0a0663d3dd8ff685fde7f5e8..9edf55cbc69fe0ce0b8d4afc0723470451d572b5 100644
--- a/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
+++ b/test/CodeGen/AMDGPU/fadd-fma-fmul-combine.ll
@@ -28,7 +28,7 @@
 ; GCN-SLOWFMA: v_mul_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -55,7 +55,7 @@ define void @fast_add_fmuladd_fmul() #0 {
 ; GCN-FASTFMA: v_fma_f32 [[FMA0:v[0-9]+]], [[U]], [[V]], -[[Z]]
 ; GCN-FASTFMA: v_fma_f32 [[FMA1:v[0-9]+]], [[X]], [[Y]], [[FMA0]]
 ; GCN-FASTFMA: buffer_store_dword [[FMA1]]
-define void @fast_sub_fmuladd_fmul() #0 {
+define amdgpu_kernel void @fast_sub_fmuladd_fmul() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -87,7 +87,7 @@ define void @fast_sub_fmuladd_fmul() #0 {
 ; GCN-SLOWFMA: v_mul_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -120,7 +120,7 @@ define void @fast_add_fmuladd_fmul_multi_use_mul() #0 {
 ; GCN-SLOWFMA: v_mul_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -145,7 +145,7 @@ define void @fast_add_fmuladd_fmul_multi_use_mul_commute() #0 {
 ; GCN-SLOWFMA: v_mul_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -170,7 +170,7 @@ define void @fast_add_fmuladd_fmul_multi_use_fmuladd() #0 {
 ; GCN-SLOWFMA: v_mul_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
 ; GCN-SLOWFMA: v_add_f32_e32
-define void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 {
+define amdgpu_kernel void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -205,7 +205,7 @@ define void @fast_add_fmuladd_fmul_multi_use_fmuladd_commute() #0 {
 
 ; GCN: buffer_store_dword [[MUL]]
 ; GCN: buffer_store_dword [[MAD]]
-define void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
+define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -241,7 +241,7 @@ define void @fast_sub_fmuladd_fmul_multi_use_mul() #0 {
 ; GCN-SLOWFMA-DAG: v_mul_f32_e32 v{{[0-9]+}}, [[Y]], [[X]]
 ; GCN-SLOWFMA: v_add_f32_e32
 ; GCN-SLOWFMA: v_subrev_f32_e32
-define void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 {
+define amdgpu_kernel void @fast_sub_fmuladd_fmul_multi_use_fmuladd() #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
diff --git a/test/CodeGen/AMDGPU/fadd.f16.ll b/test/CodeGen/AMDGPU/fadd.f16.ll
index d8cfce276d35820e3d071e7acdf062320087c433..f76ecf58d9052fa5dd99b5c6db5ecca2c79e44ef 100644
--- a/test/CodeGen/AMDGPU/fadd.f16.ll
+++ b/test/CodeGen/AMDGPU/fadd.f16.ll
@@ -11,7 +11,7 @@
 ; VI:  v_add_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fadd_f16(
+define amdgpu_kernel void @fadd_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -31,7 +31,7 @@ entry:
 ; VI:  v_add_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fadd_f16_imm_a(
+define amdgpu_kernel void @fadd_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -49,7 +49,7 @@ entry:
 ; VI:  v_add_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fadd_f16_imm_b(
+define amdgpu_kernel void @fadd_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -59,27 +59,31 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}fadd_v2f16
+; GCN-LABEL: {{^}}fadd_v2f16:
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
 ; SI:  v_add_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_add_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_add_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI:  v_add_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_add_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_add_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fadd_v2f16(
+define amdgpu_kernel void @fadd_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -91,7 +95,7 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}fadd_v2f16_imm_a
+; GCN-LABEL: {{^}}fadd_v2f16_imm_a:
 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
@@ -100,15 +104,17 @@ entry:
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_add_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
+; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
+; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; VI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 
-; VI:  v_add_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
-; VI:  v_add_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fadd_v2f16_imm_a(
+define amdgpu_kernel void @fadd_v2f16_imm_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b) {
 entry:
@@ -118,7 +124,7 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}fadd_v2f16_imm_b
+; GCN-LABEL: {{^}}fadd_v2f16_imm_b:
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
@@ -127,14 +133,17 @@ entry:
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_add_f32_e32 v[[R_F32_1:[0-9]+]], 1.0, v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_add_f16_e32 v[[R_F16_0:[0-9]+]], 2.0, v[[A_V2_F16]]
-; VI:  v_add_f16_e32 v[[R_F16_1:[0-9]+]], 1.0, v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], 1.0, v[[A_F16_1]]
+; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], 2.0, v[[A_V2_F16]]
+; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; VI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fadd_v2f16_imm_b(
+define amdgpu_kernel void @fadd_v2f16_imm_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/fadd.ll b/test/CodeGen/AMDGPU/fadd.ll
index 0f683f7bfa23bfe5cfc7a7ab19f833e0370908b5..621a0de281db541e3f1d5c7bfc9a2d998d89a57a 100644
--- a/test/CodeGen/AMDGPU/fadd.ll
+++ b/test/CodeGen/AMDGPU/fadd.ll
@@ -5,7 +5,7 @@
 ; FUNC-LABEL: {{^}}fadd_f32:
 ; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
 ; SI: v_add_f32
-define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) {
+define amdgpu_kernel void @fadd_f32(float addrspace(1)* %out, float %a, float %b) #0 {
    %add = fadd float %a, %b
    store float %add, float addrspace(1)* %out, align 4
    ret void
@@ -16,7 +16,7 @@ define void @fadd_f32(float addrspace(1)* %out, float %a, float %b) {
 ; R600-DAG: ADD {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
 ; SI: v_add_f32
 ; SI: v_add_f32
-define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+define amdgpu_kernel void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
   %add = fadd <2 x float> %a, %b
   store <2 x float> %add, <2 x float> addrspace(1)* %out, align 8
   ret void
@@ -31,7 +31,7 @@ define void @fadd_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x flo
 ; SI: v_add_f32
 ; SI: v_add_f32
 ; SI: v_add_f32
-define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16
   %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16
@@ -57,8 +57,19 @@ define void @fadd_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)
 ; SI: v_add_f32
 ; SI: v_add_f32
 ; SI: v_add_f32
-define void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) {
+define amdgpu_kernel void @fadd_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) #0 {
   %add = fadd <8 x float> %a, %b
   store <8 x float> %add, <8 x float> addrspace(1)* %out, align 32
   ret void
 }
+
+; FUNC-LABEL: {{^}}fadd_0_nsz_attr_f32:
+; SI-NOT: v_add_f32
+define amdgpu_kernel void @fadd_0_nsz_attr_f32(float addrspace(1)* %out, float %a) #1 { ; attribute group #1 sets "no-signed-zeros-fp-math"="true"
+   %add = fadd float %a, 0.0 ; x + 0.0: the negative check above requires that no f32 add is emitted for this
+   store float %add, float addrspace(1)* %out, align 4 ; the sum is stored to %out
+   ret void
+}
+
+attributes #0 = { nounwind } ; shared by the fadd_f32 / v2f32 / v4f32 / v8f32 kernels
+attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" } ; used by fadd_0_nsz_attr_f32
diff --git a/test/CodeGen/AMDGPU/fadd64.ll b/test/CodeGen/AMDGPU/fadd64.ll
index 6f0c9de8ebaf29c3eab3ce8aafbd9cd87ecff050..7eb7747de215cea1601b4e29fb3f3fb4391f4bbf 100644
--- a/test/CodeGen/AMDGPU/fadd64.ll
+++ b/test/CodeGen/AMDGPU/fadd64.ll
@@ -3,7 +3,7 @@
 
 ; CHECK-LABEL: {{^}}v_fadd_f64:
 ; CHECK: v_add_f64 {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}, {{v[[0-9]+:[0-9]+]}}
-define void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                         double addrspace(1)* %in2) {
   %r0 = load double, double addrspace(1)* %in1
   %r1 = load double, double addrspace(1)* %in2
@@ -14,7 +14,7 @@ define void @v_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 
 ; CHECK-LABEL: {{^}}s_fadd_f64:
 ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) {
+define amdgpu_kernel void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) {
   %r2 = fadd double %r0, %r1
   store double %r2, double addrspace(1)* %out
   ret void
@@ -24,7 +24,7 @@ define void @s_fadd_f64(double addrspace(1)* %out, double %r0, double %r1) {
 ; CHECK: v_add_f64
 ; CHECK: v_add_f64
 ; CHECK: _store_dwordx4
-define void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
                           <2 x double> addrspace(1)* %in2) {
   %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1
   %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2
@@ -37,7 +37,7 @@ define void @v_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspac
 ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
 ; CHECK: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}
 ; CHECK: _store_dwordx4
-define void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %r0, <2 x double> %r1) {
+define amdgpu_kernel void @s_fadd_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %r0, <2 x double> %r1) {
   %r2 = fadd <2 x double> %r0, %r1
   store <2 x double> %r2, <2 x double> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index c76d792800a3c8de5c89f77cdac8fe5271568664..f2686a5582dc6f052979eed2ca1bfb60aaede6d5 100644
--- a/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -1,5 +1,7 @@
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
 
+declare half @llvm.fabs.f16(half) #0
 declare half @llvm.canonicalize.f16(half) #0
 declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #0
 declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
@@ -7,7 +9,7 @@ declare <2 x half> @llvm.canonicalize.v2f16(<2 x half>) #0
 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f16:
 ; GCN: v_mul_f16_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
 ; GCN: buffer_store_short [[REG]]
-define void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 {
   %val = load half, half addrspace(1)* %out
   %canonicalized = call half @llvm.canonicalize.f16(half %val)
   store half %canonicalized, half addrspace(1)* %out
@@ -17,17 +19,51 @@ define void @v_test_canonicalize_var_f16(half addrspace(1)* %out) #1 {
 ; GCN-LABEL: {{^}}s_test_canonicalize_var_f16:
 ; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}}
 ; GCN: buffer_store_short [[REG]]
-define void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 {
+define amdgpu_kernel void @s_test_canonicalize_var_f16(half addrspace(1)* %out, i16 zeroext %val.arg) #1 {
   %val = bitcast i16 %val.arg to half
   %canonicalized = call half @llvm.canonicalize.f16(half %val)
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
+; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f16:
+; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}|
+; GCN: buffer_store_short [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(half addrspace(1)* %out) #1 {
+  %val = load half, half addrspace(1)* %out ; the input is loaded from the same pointer the result is stored to
+  %val.fabs = call half @llvm.fabs.f16(half %val) ; fabs feeds canonicalize directly
+  %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs) ; expected: v_mul_f16_e64 by 1.0 with the |x| modifier on its source
+  store half %canonicalized, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f16:
+; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, -|{{v[0-9]+}}|
+; GCN: buffer_store_short [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(half addrspace(1)* %out) #1 {
+  %val = load half, half addrspace(1)* %out ; the input is loaded from the same pointer the result is stored to
+  %val.fabs = call half @llvm.fabs.f16(half %val)
+  %val.fabs.fneg = fsub half -0.0, %val.fabs ; negation written as (-0.0 - x)
+  %canonicalized = call half @llvm.canonicalize.f16(half %val.fabs.fneg) ; expected: v_mul_f16_e64 by 1.0 with the -|x| modifier on its source
+  store half %canonicalized, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f16:
+; GCN: v_mul_f16_e64 [[REG:v[0-9]+]], 1.0, -{{v[0-9]+}}
+; GCN: buffer_store_short [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(half addrspace(1)* %out) #1 {
+  %val = load half, half addrspace(1)* %out ; the input is loaded from the same pointer the result is stored to
+  %val.fneg = fsub half -0.0, %val ; negation written as (-0.0 - x)
+  %canonicalized = call half @llvm.canonicalize.f16(half %val.fneg) ; expected: v_mul_f16_e64 by 1.0 with the -x modifier on its source
+  store half %canonicalized, half addrspace(1)* %out
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -36,7 +72,7 @@ define void @test_fold_canonicalize_p0_f16(half addrspace(1)* %out) #1 {
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half -0.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -45,7 +81,7 @@ define void @test_fold_canonicalize_n0_f16(half addrspace(1)* %out) #1 {
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 1.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -54,7 +90,7 @@ define void @test_fold_canonicalize_p1_f16(half addrspace(1)* %out) #1 {
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half -1.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -63,7 +99,7 @@ define void @test_fold_canonicalize_n1_f16(half addrspace(1)* %out) #1 {
 ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 16.0)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -72,7 +108,7 @@ define void @test_fold_canonicalize_literal_f16(half addrspace(1)* %out) #1 {
 ; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -81,7 +117,7 @@ define void @test_default_denormals_fold_canonicalize_denormal0_f16(half addrspa
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %out) #3 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH03FF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -90,7 +126,7 @@ define void @test_denormals_fold_canonicalize_denormal0_f16(half addrspace(1)* %
 ; GCN-LABEL: {{^}}test_default_denormals_fold_canonicalize_denormal1_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -99,7 +135,7 @@ define void @test_default_denormals_fold_canonicalize_denormal1_f16(half addrspa
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff83ff{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %out) #3 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH83FF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -108,7 +144,7 @@ define void @test_denormals_fold_canonicalize_denormal1_f16(half addrspace(1)* %
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C00)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -117,7 +153,7 @@ define void @test_fold_canonicalize_qnan_f16(half addrspace(1)* %out) #1 {
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -1 to half))
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -126,7 +162,7 @@ define void @test_fold_canonicalize_qnan_value_neg1_f16(half addrspace(1)* %out)
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half bitcast (i16 -2 to half))
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -135,7 +171,7 @@ define void @test_fold_canonicalize_qnan_value_neg2_f16(half addrspace(1)* %out)
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH7C01)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -144,7 +180,7 @@ define void @test_fold_canonicalize_snan0_value_f16(half addrspace(1)* %out) #1
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xH7DFF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -153,7 +189,7 @@ define void @test_fold_canonicalize_snan1_value_f16(half addrspace(1)* %out) #1
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xHFDFF)
   store half %canonicalized, half addrspace(1)* %out
   ret void
@@ -162,17 +198,21 @@ define void @test_fold_canonicalize_snan2_value_f16(half addrspace(1)* %out) #1
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f16(half addrspace(1)* %out) #1 {
   %canonicalized = call half @llvm.canonicalize.f16(half 0xHFC01)
   store half %canonicalized, half addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_var_v2f16:
-; GCN: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, {{v[0-9]+}}
-; GCN: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, {{v[0-9]+}}
-; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
-define void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, {{v[0-9]+}}
+; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, {{v[0-9]+}}
+; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI-NOT: v_and_b32
+
+; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+$}}
+; GFX9: buffer_store_dword [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %out
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -181,13 +221,17 @@ define void @v_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
 
 ; FIXME: Fold modifier
 ; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_v2f16:
-; GCN: v_bfe_u32
-; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}}
-; GCN: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}}
-; GCN: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
-; GCN: v_or_b32
+; VI-DAG: v_bfe_u32
+; VI-DAG: v_and_b32_e32 v{{[0-9]+}}, 0x7fff7fff, v{{[0-9]+}}
+; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}}
+; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
+; VI-NOT: 0xffff
+; VI: v_or_b32
+
+; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
+; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]]{{$}}
 ; GCN: buffer_store_dword
-define void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %out
   %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val.fabs)
@@ -196,12 +240,15 @@ define void @v_test_canonicalize_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #
 }
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_v2f16:
-; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
-; GCN: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}}
-; GCN: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
-; GCN: v_or_b32
+; VI: v_or_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
+; VI: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, v{{[0-9]+}}
+; VI: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, v{{[0-9]+}}
+; VI: v_or_b32
+
+; GFX9: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fff7fff, v{{[0-9]+}}
+; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, [[ABS]] neg_lo:[0,1] neg_hi:[0,1]{{$}}
 ; GCN: buffer_store_dword
-define void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %out
   %val.fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %val)
   %val.fabs.fneg = fsub <2 x half> <half -0.0, half -0.0>, %val.fabs
@@ -212,12 +259,16 @@ define void @v_test_canonicalize_fneg_fabs_var_v2f16(<2 x half> addrspace(1)* %o
 
 ; FIXME: Fold modifier
 ; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_v2f16:
-; GCN: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
-; GCN: v_lshrrev_b32_e32 [[FNEG_HI:v[0-9]+]], 16, [[FNEG]]
-; GCN: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]]
-; GCN: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, [[FNEG_HI]]
-; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
-define void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
+; VI: v_xor_b32_e32 [[FNEG:v[0-9]+]], 0x80008000, v{{[0-9]+}}
+; VI-DAG: v_lshrrev_b32_e32 [[FNEG_HI:v[0-9]+]], 16, [[FNEG]]
+; VI-DAG: v_mul_f16_e32 [[REG1:v[0-9]+]], 1.0, [[FNEG_HI]]
+; VI-DAG: v_mul_f16_e32 [[REG0:v[0-9]+]], 1.0, [[FNEG]]
+; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI-NOT: 0xffff
+
+; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}} neg_lo:[0,1] neg_hi:[0,1]{{$}}
+; GFX9: buffer_store_dword [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %out
   %fneg.val = fsub <2 x half> <half -0.0, half -0.0>, %val
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %fneg.val)
@@ -226,10 +277,14 @@ define void @v_test_canonicalize_fneg_var_v2f16(<2 x half> addrspace(1)* %out) #
 }
 
 ; GCN-LABEL: {{^}}s_test_canonicalize_var_v2f16:
-; GCN: v_mul_f16_e64 [[REG0:v[0-9]+]], 1.0, {{s[0-9]+}}
-; GCN: v_mul_f16_e64 [[REG1:v[0-9]+]], 1.0, {{s[0-9]+}}
-; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
-define void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 {
+; VI: v_mul_f16_e64 [[REG0:v[0-9]+]], 1.0, {{s[0-9]+}}
+; VI-DAG: v_mul_f16_e64 [[REG1:v[0-9]+]], 1.0, {{s[0-9]+}}
+; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI-NOT: v_and_b32
+
+; GFX9: v_pk_mul_f16 [[REG:v[0-9]+]], 1.0, {{s[0-9]+$}}
+; GFX9: buffer_store_dword [[REG]]
+define amdgpu_kernel void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 zeroext %val.arg) #1 {
   %val = bitcast i32 %val.arg to <2 x half>
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %val)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
@@ -239,7 +294,7 @@ define void @s_test_canonicalize_var_v2f16(<2 x half> addrspace(1)* %out, i32 ze
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> zeroinitializer)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -248,7 +303,7 @@ define void @test_fold_canonicalize_p0_v2f16(<2 x half> addrspace(1)* %out) #1 {
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -0.0, half -0.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -257,7 +312,7 @@ define void @test_fold_canonicalize_n0_v2f16(<2 x half> addrspace(1)* %out) #1 {
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 1.0, half 1.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -266,7 +321,7 @@ define void @test_fold_canonicalize_p1_v2f16(<2 x half> addrspace(1)* %out) #1 {
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half -1.0, half -1.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -275,7 +330,7 @@ define void @test_fold_canonicalize_n1_v2f16(<2 x half> addrspace(1)* %out) #1 {
 ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4c004c00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 16.0, half 16.0>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -284,7 +339,7 @@ define void @test_fold_canonicalize_literal_v2f16(<2 x half> addrspace(1)* %out)
 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -293,7 +348,7 @@ define void @test_no_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addr
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3ff03ff{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspace(1)* %out) #3 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH03FF, half 0xH03FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -302,7 +357,7 @@ define void @test_denormals_fold_canonicalize_denormal0_v2f16(<2 x half> addrspa
 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -311,7 +366,7 @@ define void @test_no_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addr
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x83ff83ff{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspace(1)* %out) #3 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH83FF, half 0xH83FF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -320,7 +375,7 @@ define void @test_denormals_fold_canonicalize_denormal1_v2f16(<2 x half> addrspa
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7c007c00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C00, half 0xH7C00>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -329,7 +384,7 @@ define void @test_fold_canonicalize_qnan_v2f16(<2 x half> addrspace(1)* %out) #1
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> bitcast (i32 -1 to <2 x half>))
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -338,7 +393,7 @@ define void @test_fold_canonicalize_qnan_value_neg1_v2f16(<2 x half> addrspace(1
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half bitcast (i16 -2 to half), half bitcast (i16 -2 to half)>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -347,7 +402,7 @@ define void @test_fold_canonicalize_qnan_value_neg2_v2f16(<2 x half> addrspace(1
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7C01, half 0xH7C01>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -356,7 +411,7 @@ define void @test_fold_canonicalize_snan0_value_v2f16(<2 x half> addrspace(1)* %
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xH7DFF, half 0xH7DFF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -365,7 +420,7 @@ define void @test_fold_canonicalize_snan1_value_v2f16(<2 x half> addrspace(1)* %
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFDFF, half 0xHFDFF>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
@@ -374,7 +429,7 @@ define void @test_fold_canonicalize_snan2_value_v2f16(<2 x half> addrspace(1)* %
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_v2f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7e007e00{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan3_value_v2f16(<2 x half> addrspace(1)* %out) #1 {
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> <half 0xHFC01, half 0xHFC01>)
   store <2 x half> %canonicalized, <2 x half> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/fcanonicalize.ll b/test/CodeGen/AMDGPU/fcanonicalize.ll
index bbe8a1e014379ce37c76afdde6eebbbddfbd3416..8c385f40b1c5f9676732d75c46d29196aae102e1 100644
--- a/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -1,12 +1,14 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
+declare float @llvm.fabs.f32(float) #0
 declare float @llvm.canonicalize.f32(float) #0
+declare double @llvm.fabs.f64(double) #0
 declare double @llvm.canonicalize.f64(double) #0
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f32:
 ; GCN: v_mul_f32_e32 [[REG:v[0-9]+]], 1.0, {{v[0-9]+}}
 ; GCN: buffer_store_dword [[REG]]
-define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 {
   %val = load float, float addrspace(1)* %out
   %canonicalized = call float @llvm.canonicalize.f32(float %val)
   store float %canonicalized, float addrspace(1)* %out
@@ -16,16 +18,50 @@ define void @v_test_canonicalize_var_f32(float addrspace(1)* %out) #1 {
 ; GCN-LABEL: {{^}}s_test_canonicalize_var_f32:
 ; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, {{s[0-9]+}}
 ; GCN: buffer_store_dword [[REG]]
-define void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 {
+define amdgpu_kernel void @s_test_canonicalize_var_f32(float addrspace(1)* %out, float %val) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float %val)
   store float %canonicalized, float addrspace(1)* %out
   ret void
 }
 
+; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f32:
+; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, |{{v[0-9]+}}|
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fabs_var_f32(float addrspace(1)* %out) #1 {
+  %val = load float, float addrspace(1)* %out
+  %val.fabs = call float @llvm.fabs.f32(float %val)
+  %canonicalized = call float @llvm.canonicalize.f32(float %val.fabs)
+  store float %canonicalized, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f32:
+; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, -|{{v[0-9]+}}|
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f32(float addrspace(1)* %out) #1 {
+  %val = load float, float addrspace(1)* %out
+  %val.fabs = call float @llvm.fabs.f32(float %val)
+  %val.fabs.fneg = fsub float -0.0, %val.fabs
+  %canonicalized = call float @llvm.canonicalize.f32(float %val.fabs.fneg)
+  store float %canonicalized, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f32:
+; GCN: v_mul_f32_e64 [[REG:v[0-9]+]], 1.0, -{{v[0-9]+}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fneg_var_f32(float addrspace(1)* %out) #1 {
+  %val = load float, float addrspace(1)* %out
+  %val.fneg = fsub float -0.0, %val
+  %canonicalized = call float @llvm.canonicalize.f32(float %val.fneg)
+  store float %canonicalized, float addrspace(1)* %out
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float 0.0)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -34,7 +70,7 @@ define void @test_fold_canonicalize_p0_f32(float addrspace(1)* %out) #1 {
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n0_f32:
 ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float -0.0)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -43,7 +79,7 @@ define void @test_fold_canonicalize_n0_f32(float addrspace(1)* %out) #1 {
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float 1.0)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -52,7 +88,7 @@ define void @test_fold_canonicalize_p1_f32(float addrspace(1)* %out) #1 {
 ; GCN-LABEL: {{^}}test_fold_canonicalize_n1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float -1.0)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -61,7 +97,7 @@ define void @test_fold_canonicalize_n1_f32(float addrspace(1)* %out) #1 {
 ; GCN-LABEL: {{^}}test_fold_canonicalize_literal_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x41800000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float 16.0)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -70,7 +106,7 @@ define void @test_fold_canonicalize_literal_f32(float addrspace(1)* %out) #1 {
 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -79,7 +115,7 @@ define void @test_no_denormals_fold_canonicalize_denormal0_f32(float addrspace(1
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fffff{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)* %out) #3 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 8388607 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -88,7 +124,7 @@ define void @test_denormals_fold_canonicalize_denormal0_f32(float addrspace(1)*
 ; GCN-LABEL: {{^}}test_no_denormals_fold_canonicalize_denormal1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -97,7 +133,7 @@ define void @test_no_denormals_fold_canonicalize_denormal1_f32(float addrspace(1
 ; GCN-LABEL: {{^}}test_denormals_fold_canonicalize_denormal1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x807fffff{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)* %out) #3 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2155872255 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -106,7 +142,7 @@ define void @test_denormals_fold_canonicalize_denormal1_f32(float addrspace(1)*
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float 0x7FF8000000000000)
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -115,7 +151,7 @@ define void @test_fold_canonicalize_qnan_f32(float addrspace(1)* %out) #1 {
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg1_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -1 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -124,7 +160,7 @@ define void @test_fold_canonicalize_qnan_value_neg1_f32(float addrspace(1)* %out
 ; GCN-LABEL: {{^}}test_fold_canonicalize_qnan_value_neg2_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 -2 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -133,7 +169,7 @@ define void @test_fold_canonicalize_qnan_value_neg2_f32(float addrspace(1)* %out
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan0_value_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2139095041 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -142,7 +178,7 @@ define void @test_fold_canonicalize_snan0_value_f32(float addrspace(1)* %out) #1
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan1_value_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 2143289343 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -151,7 +187,7 @@ define void @test_fold_canonicalize_snan1_value_f32(float addrspace(1)* %out) #1
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan2_value_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4286578689 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -160,7 +196,7 @@ define void @test_fold_canonicalize_snan2_value_f32(float addrspace(1)* %out) #1
 ; GCN-LABEL: {{^}}test_fold_canonicalize_snan3_value_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x7fc00000{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1 {
   %canonicalized = call float @llvm.canonicalize.f32(float bitcast (i32 4290772991 to float))
   store float %canonicalized, float addrspace(1)* %out
   ret void
@@ -169,7 +205,7 @@ define void @test_fold_canonicalize_snan3_value_f32(float addrspace(1)* %out) #1
 ; GCN-LABEL: {{^}}v_test_canonicalize_var_f64:
 ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{v\[[0-9]+:[0-9]+\]}}
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
   %val = load double, double addrspace(1)* %out
   %canonicalized = call double @llvm.canonicalize.f64(double %val)
   store double %canonicalized, double addrspace(1)* %out
@@ -179,17 +215,51 @@ define void @v_test_canonicalize_var_f64(double addrspace(1)* %out) #1 {
 ; GCN-LABEL: {{^}}s_test_canonicalize_var_f64:
 ; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, {{s\[[0-9]+:[0-9]+\]}}
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 {
+define amdgpu_kernel void @s_test_canonicalize_var_f64(double addrspace(1)* %out, double %val) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double %val)
   store double %canonicalized, double addrspace(1)* %out
   ret void
 }
 
+; GCN-LABEL: {{^}}v_test_canonicalize_fabs_var_f64:
+; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, |{{v\[[0-9]+:[0-9]+\]}}|
+; GCN: buffer_store_dwordx2 [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fabs_var_f64(double addrspace(1)* %out) #1 {
+  %val = load double, double addrspace(1)* %out
+  %val.fabs = call double @llvm.fabs.f64(double %val)
+  %canonicalized = call double @llvm.canonicalize.f64(double %val.fabs)
+  store double %canonicalized, double addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_fneg_fabs_var_f64:
+; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, -|{{v\[[0-9]+:[0-9]+\]}}|
+; GCN: buffer_store_dwordx2 [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f64(double addrspace(1)* %out) #1 {
+  %val = load double, double addrspace(1)* %out
+  %val.fabs = call double @llvm.fabs.f64(double %val)
+  %val.fabs.fneg = fsub double -0.0, %val.fabs
+  %canonicalized = call double @llvm.canonicalize.f64(double %val.fabs.fneg)
+  store double %canonicalized, double addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_fneg_var_f64:
+; GCN: v_mul_f64 [[REG:v\[[0-9]+:[0-9]+\]]], 1.0, -{{v\[[0-9]+:[0-9]+\]}}
+; GCN: buffer_store_dwordx2 [[REG]]
+define amdgpu_kernel void @v_test_canonicalize_fneg_var_f64(double addrspace(1)* %out) #1 {
+  %val = load double, double addrspace(1)* %out
+  %val.fneg = fsub double -0.0, %val
+  %canonicalized = call double @llvm.canonicalize.f64(double %val.fneg)
+  store double %canonicalized, double addrspace(1)* %out
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_fold_canonicalize_p0_f64:
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double 0.0)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -199,7 +269,7 @@ define void @test_fold_canonicalize_p0_f64(double addrspace(1)* %out) #1 {
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HI:[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double -0.0)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -209,7 +279,7 @@ define void @test_fold_canonicalize_n0_f64(double addrspace(1)* %out) #1 {
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x3ff00000{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double 1.0)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -219,7 +289,7 @@ define void @test_fold_canonicalize_p1_f64(double addrspace(1)* %out) #1 {
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xbff00000{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double -1.0)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -229,7 +299,7 @@ define void @test_fold_canonicalize_n1_f64(double addrspace(1)* %out) #1 {
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x40300000{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double 16.0)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -239,7 +309,7 @@ define void @test_fold_canonicalize_literal_f64(double addrspace(1)* %out) #1 {
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #2 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -249,7 +319,7 @@ define void @test_no_denormals_fold_canonicalize_denormal0_f64(double addrspace(
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0xfffff{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)* %out) #3 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 4503599627370495 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -259,7 +329,7 @@ define void @test_denormals_fold_canonicalize_denormal0_f64(double addrspace(1)*
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], v[[LO]]{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 {
+define amdgpu_kernel void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #2 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -269,7 +339,7 @@ define void @test_no_denormals_fold_canonicalize_denormal1_f64(double addrspace(
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], -1{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x800fffff{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 {
+define amdgpu_kernel void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)* %out) #3 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9227875636482146303 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -279,7 +349,7 @@ define void @test_denormals_fold_canonicalize_denormal1_f64(double addrspace(1)*
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double 0x7FF8000000000000)
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -289,7 +359,7 @@ define void @test_fold_canonicalize_qnan_f64(double addrspace(1)* %out) #1 {
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -1 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -299,7 +369,7 @@ define void @test_fold_canonicalize_qnan_value_neg1_f64(double addrspace(1)* %ou
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 -2 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -309,7 +379,7 @@ define void @test_fold_canonicalize_qnan_value_neg2_f64(double addrspace(1)* %ou
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9218868437227405313 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -319,7 +389,7 @@ define void @test_fold_canonicalize_snan0_value_f64(double addrspace(1)* %out) #
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 9223372036854775807 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -329,7 +399,7 @@ define void @test_fold_canonicalize_snan1_value_f64(double addrspace(1)* %out) #
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18442240474082181121 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
@@ -339,7 +409,7 @@ define void @test_fold_canonicalize_snan2_value_f64(double addrspace(1)* %out) #
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7ff80000{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_fold_canonicalize_snan3_value_f64(double addrspace(1)* %out) #1 {
   %canonicalized = call double @llvm.canonicalize.f64(double bitcast (i64 18446744073709551615 to double))
   store double %canonicalized, double addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/fceil.ll b/test/CodeGen/AMDGPU/fceil.ll
index efdda78f852b6e6d09402bb0584774c19f79b2ec..0b913fda858005fee95b0cef084aeb90ca11e4fa 100644
--- a/test/CodeGen/AMDGPU/fceil.ll
+++ b/test/CodeGen/AMDGPU/fceil.ll
@@ -13,7 +13,7 @@ declare <16 x float> @llvm.ceil.v16f32(<16 x float>) nounwind readnone
 ; SI: v_ceil_f32_e32
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+\.[XYZW]]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
-define void @fceil_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @fceil_f32(float addrspace(1)* %out, float %x) {
   %y = call float @llvm.ceil.f32(float %x) nounwind readnone
   store float %y, float addrspace(1)* %out
   ret void
@@ -25,7 +25,7 @@ define void @fceil_f32(float addrspace(1)* %out, float %x) {
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[RESULT:T[0-9]+]]{{\.[XYZW]}}
 ; EG: CEIL {{\*? *}}[[RESULT]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
-define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
+define amdgpu_kernel void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
   %y = call <2 x float> @llvm.ceil.v2f32(<2 x float> %x) nounwind readnone
   store <2 x float> %y, <2 x float> addrspace(1)* %out
   ret void
@@ -41,7 +41,7 @@ define void @fceil_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
 ; EG-DAG: CEIL {{\*? *}}[[RESULT1]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
-define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
+define amdgpu_kernel void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
   %y = call <3 x float> @llvm.ceil.v3f32(<3 x float> %x) nounwind readnone
   store <3 x float> %y, <3 x float> addrspace(1)* %out
   ret void
@@ -57,7 +57,7 @@ define void @fceil_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
 ; EG: CEIL {{\*? *}}[[RESULT]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
 ; EG: CEIL {{\*? *}}[[RESULT]]
-define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
+define amdgpu_kernel void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
   %y = call <4 x float> @llvm.ceil.v4f32(<4 x float> %x) nounwind readnone
   store <4 x float> %y, <4 x float> addrspace(1)* %out
   ret void
@@ -82,7 +82,7 @@ define void @fceil_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT2]]
-define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
+define amdgpu_kernel void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
   %y = call <8 x float> @llvm.ceil.v8f32(<8 x float> %x) nounwind readnone
   store <8 x float> %y, <8 x float> addrspace(1)* %out
   ret void
@@ -125,7 +125,7 @@ define void @fceil_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
 ; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
 ; EG-DAG: CEIL {{\*? *}}[[RESULT4]]
-define void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
+define amdgpu_kernel void @fceil_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
   %y = call <16 x float> @llvm.ceil.v16f32(<16 x float> %x) nounwind readnone
   store <16 x float> %y, <16 x float> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/fceil64.ll b/test/CodeGen/AMDGPU/fceil64.ll
index 98448db5dd2487138250219b35ac7fde79810c10..61572a855620675c42f37868d040aac7f6890af7 100644
--- a/test/CodeGen/AMDGPU/fceil64.ll
+++ b/test/CodeGen/AMDGPU/fceil64.ll
@@ -31,7 +31,7 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone
 ; SI: v_cndmask_b32
 ; SI: v_add_f64
 ; SI: s_endpgm
-define void @fceil_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @fceil_f64(double addrspace(1)* %out, double %x) {
   %y = call double @llvm.ceil.f64(double %x) nounwind readnone
   store double %y, double addrspace(1)* %out
   ret void
@@ -40,7 +40,7 @@ define void @fceil_f64(double addrspace(1)* %out, double %x) {
 ; FUNC-LABEL: {{^}}fceil_v2f64:
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
-define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+define amdgpu_kernel void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
   %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone
   store <2 x double> %y, <2 x double> addrspace(1)* %out
   ret void
@@ -50,7 +50,7 @@ define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
 ; FIXME-CI: v_ceil_f64_e32
 ; FIXME-CI: v_ceil_f64_e32
 ; FIXME-CI: v_ceil_f64_e32
-; define void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+; define amdgpu_kernel void @fceil_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
 ;   %y = call <3 x double> @llvm.ceil.v3f64(<3 x double> %x) nounwind readnone
 ;   store <3 x double> %y, <3 x double> addrspace(1)* %out
 ;   ret void
@@ -61,7 +61,7 @@ define void @fceil_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
-define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+define amdgpu_kernel void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
   %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone
   store <4 x double> %y, <4 x double> addrspace(1)* %out
   ret void
@@ -76,7 +76,7 @@ define void @fceil_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
-define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+define amdgpu_kernel void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
   %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone
   store <8 x double> %y, <8 x double> addrspace(1)* %out
   ret void
@@ -99,7 +99,7 @@ define void @fceil_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
 ; CI: v_ceil_f64_e32
-define void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+define amdgpu_kernel void @fceil_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
   %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone
   store <16 x double> %y, <16 x double> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/fcmp-cnd.ll b/test/CodeGen/AMDGPU/fcmp-cnd.ll
index 530274f920f052626eff1f9710627e1ba0e26bac..7f8be804309ee198675d3b0163f14120794a82a2 100644
--- a/test/CodeGen/AMDGPU/fcmp-cnd.ll
+++ b/test/CodeGen/AMDGPU/fcmp-cnd.ll
@@ -4,7 +4,7 @@
 ;registers and literal.x depending on what the optimizer does.
 ;CHECK: CNDE  T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
   %0 = load float, float addrspace(1)* %in
   %cmp = fcmp oeq float %0, 0.000000e+00
diff --git a/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll b/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll
index c402805feb39b4adb0e3fcf4db1d3e0c2582fad5..2a848e80b81bf2c66a3c70fc74cfaaf1f472697a 100644
--- a/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll
+++ b/test/CodeGen/AMDGPU/fcmp-cnde-int-args.ll
@@ -6,7 +6,7 @@
 
 ; CHECK: SET{{[A-Z]+}}_DX10
 
-define void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
   %0 = load float, float addrspace(1)* %in
   %cmp = fcmp oeq float %0, 0.000000e+00
diff --git a/test/CodeGen/AMDGPU/fcmp.f16.ll b/test/CodeGen/AMDGPU/fcmp.f16.ll
index a62726f7f0683b1324e8048f5245732a3e59850d..7916226462f774312cb98b03ae5b440f17850ea5 100644
--- a/test/CodeGen/AMDGPU/fcmp.f16.ll
+++ b/test/CodeGen/AMDGPU/fcmp.f16.ll
@@ -11,7 +11,7 @@
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_lt(
+define amdgpu_kernel void @fcmp_f16_lt(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -28,16 +28,16 @@ entry:
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
 ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]]
 
-; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
+; SI:  v_cvt_f32_f16_e64 v[[A_F32:[0-9]+]], |v[[A_F16]]|
+; SI:  v_cvt_f32_f16_e64 v[[B_F32:[0-9]+]], |v[[B_F16]]|
 
-; SI:  v_cmp_lt_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, |v[[A_F32]]|, |v[[B_F32]]|
+; SI:  v_cmp_lt_f32_e32 vcc, v[[A_F32]], v[[B_F32]]
 ; VI:  v_cmp_lt_f16_e64 s{{\[[0-9]+:[0-9]+\]}}, |v[[A_F16]]|, |v[[B_F16]]|
 
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_lt_abs(
+define amdgpu_kernel void @fcmp_f16_lt_abs(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -62,7 +62,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_eq(
+define amdgpu_kernel void @fcmp_f16_eq(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -85,7 +85,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_le(
+define amdgpu_kernel void @fcmp_f16_le(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -108,7 +108,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_gt(
+define amdgpu_kernel void @fcmp_f16_gt(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -131,7 +131,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_lg(
+define amdgpu_kernel void @fcmp_f16_lg(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -154,7 +154,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_ge(
+define amdgpu_kernel void @fcmp_f16_ge(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -177,7 +177,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_o(
+define amdgpu_kernel void @fcmp_f16_o(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -200,7 +200,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_u(
+define amdgpu_kernel void @fcmp_f16_u(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -223,7 +223,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_nge(
+define amdgpu_kernel void @fcmp_f16_nge(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -246,7 +246,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_nlg(
+define amdgpu_kernel void @fcmp_f16_nlg(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -269,7 +269,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_ngt(
+define amdgpu_kernel void @fcmp_f16_ngt(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -292,7 +292,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_nle(
+define amdgpu_kernel void @fcmp_f16_nle(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -315,7 +315,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_neq(
+define amdgpu_kernel void @fcmp_f16_neq(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -338,7 +338,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fcmp_f16_nlt(
+define amdgpu_kernel void @fcmp_f16_nlt(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -368,7 +368,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_lt(
+define amdgpu_kernel void @fcmp_v2f16_lt(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -398,7 +398,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_eq(
+define amdgpu_kernel void @fcmp_v2f16_eq(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -428,7 +428,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_le(
+define amdgpu_kernel void @fcmp_v2f16_le(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -458,7 +458,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_gt(
+define amdgpu_kernel void @fcmp_v2f16_gt(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -488,7 +488,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_lg(
+define amdgpu_kernel void @fcmp_v2f16_lg(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -518,7 +518,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_ge(
+define amdgpu_kernel void @fcmp_v2f16_ge(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -548,7 +548,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_o(
+define amdgpu_kernel void @fcmp_v2f16_o(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -578,7 +578,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_u(
+define amdgpu_kernel void @fcmp_v2f16_u(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -608,7 +608,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_nge(
+define amdgpu_kernel void @fcmp_v2f16_nge(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -638,7 +638,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_nlg(
+define amdgpu_kernel void @fcmp_v2f16_nlg(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -668,7 +668,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_ngt(
+define amdgpu_kernel void @fcmp_v2f16_ngt(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -698,7 +698,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_nle(
+define amdgpu_kernel void @fcmp_v2f16_nle(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -728,7 +728,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_neq(
+define amdgpu_kernel void @fcmp_v2f16_neq(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -758,7 +758,7 @@ entry:
 ; GCN: v_cndmask_b32_e64 v[[R_I32_1:[0-9]+]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_I32_0]]:[[R_I32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fcmp_v2f16_nlt(
+define amdgpu_kernel void @fcmp_v2f16_nlt(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
diff --git a/test/CodeGen/AMDGPU/fcmp.ll b/test/CodeGen/AMDGPU/fcmp.ll
index 97d954fcc3c2794dbe0e12e98c478d5638198a3e..b548670edb0663fc6d1d26293187e5562c2afb7e 100644
--- a/test/CodeGen/AMDGPU/fcmp.ll
+++ b/test/CodeGen/AMDGPU/fcmp.ll
@@ -3,7 +3,7 @@
 ; CHECK: {{^}}fcmp_sext:
 ; CHECK: SETE_DX10  T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @fcmp_sext(i32 addrspace(1)* %out, float addrspace(1)* %in) {
 entry:
   %0 = load float, float addrspace(1)* %in
   %arrayidx1 = getelementptr inbounds float, float addrspace(1)* %in, i32 1
@@ -22,7 +22,7 @@ entry:
 ; CHECK: SET{{[N]*}}E_DX10 * T{{[0-9]+\.[XYZW],}}
 ; CHECK-NEXT: {{[0-9]+\(5.0}}
 
-define void @fcmp_br(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_br(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp oeq float %in, 5.0
   br i1 %0, label %IF, label %ENDIF
diff --git a/test/CodeGen/AMDGPU/fcmp64.ll b/test/CodeGen/AMDGPU/fcmp64.ll
index acce82fdfe5361ef00cbd55d1cdf897c22abe3fe..b9e1921d4c45589454a62c324576084883725e56 100644
--- a/test/CodeGen/AMDGPU/fcmp64.ll
+++ b/test/CodeGen/AMDGPU/fcmp64.ll
@@ -3,7 +3,7 @@
 
 ; CHECK-LABEL: {{^}}flt_f64:
 ; CHECK: v_cmp_nge_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -15,7 +15,7 @@ define void @flt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
 
 ; CHECK-LABEL: {{^}}fle_f64:
 ; CHECK: v_cmp_ngt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -27,7 +27,7 @@ define void @fle_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
 
 ; CHECK-LABEL: {{^}}fgt_f64:
 ; CHECK: v_cmp_nle_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -39,7 +39,7 @@ define void @fgt_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
 
 ; CHECK-LABEL: {{^}}fge_f64:
 ; CHECK: v_cmp_nlt_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -51,7 +51,7 @@ define void @fge_f64(i32 addrspace(1)* %out, double addrspace(1)* %in1,
 
 ; CHECK-LABEL: {{^}}fne_f64:
 ; CHECK: v_cmp_neq_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -63,7 +63,7 @@ define void @fne_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 
 ; CHECK-LABEL: {{^}}feq_f64:
 ; CHECK: v_cmp_nlg_f64_e32 vcc, {{v[[0-9]+:[0-9]+], v[[0-9]+:[0-9]+]}}
-define void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @feq_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
diff --git a/test/CodeGen/AMDGPU/fconst64.ll b/test/CodeGen/AMDGPU/fconst64.ll
index 89af37545c99c59ab4e45105813b96bfc4e634cd..1255977962454b17aa05417140b4ca4b72efc3c4 100644
--- a/test/CodeGen/AMDGPU/fconst64.ll
+++ b/test/CodeGen/AMDGPU/fconst64.ll
@@ -5,7 +5,7 @@
 ; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0x40140000
 ; CHECK-DAG: s_mov_b32 {{s[0-9]+}}, 0
 
-define void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @fconst_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
    %r1 = load double, double addrspace(1)* %in
    %r2 = fadd double %r1, 5.000000e+00
    store double %r2, double addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fcopysign.f16.ll b/test/CodeGen/AMDGPU/fcopysign.f16.ll
index 27cea2f82455a4ca2dbd0f1ae26dc1f4658d7fa5..4e2bf765cd95f193230d5cac05a5f4fc971388ea 100644
--- a/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ b/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=FUNC -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
 
 declare half @llvm.copysign.f16(half, half)
 declare float @llvm.copysign.f32(float, float)
@@ -8,21 +9,21 @@ declare <2 x half> @llvm.copysign.v2f16(<2 x half>, <2 x half>)
 declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>)
 declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>)
 
-; FUNC-LABEL: {{^}}test_copysign_f16:
-; SI: buffer_load_ushort v[[MAG:[0-9]+]]
+; GCN-LABEL: {{^}}test_copysign_f16:
 ; SI: buffer_load_ushort v[[SIGN:[0-9]+]]
+; SI: buffer_load_ushort v[[MAG:[0-9]+]]
 ; SI: s_brev_b32 s[[CONST:[0-9]+]], -2
-; SI: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
-; SI: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
 ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]]
 ; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
-; VI: buffer_load_ushort v[[SIGN:[0-9]+]]
-; VI: buffer_load_ushort v[[MAG:[0-9]+]]
-; VI: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
-; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]]
+; GFX89: buffer_load_ushort v[[SIGN:[0-9]+]]
+; GFX89: buffer_load_ushort v[[MAG:[0-9]+]]
+; GFX89: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
+; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN]]
 ; GCN: buffer_store_short v[[OUT]]
 ; GCN: s_endpgm
-define void @test_copysign_f16(
+define amdgpu_kernel void @test_copysign_f16(
   half addrspace(1)* %arg_out,
   half addrspace(1)* %arg_mag,
   half addrspace(1)* %arg_sign) {
@@ -34,7 +35,7 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_copysign_out_f32_mag_f16_sign_f32:
+; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f16_sign_f32:
 ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
 ; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]]
 ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
@@ -42,7 +43,7 @@ entry:
 ; GCN: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_EXT]], v[[SIGN]]
 ; GCN: buffer_store_dword v[[OUT]]
 ; GCN: s_endpgm
-define void @test_copysign_out_f32_mag_f16_sign_f32(
+define amdgpu_kernel void @test_copysign_out_f32_mag_f16_sign_f32(
   float addrspace(1)* %arg_out,
   half addrspace(1)* %arg_mag,
   float addrspace(1)* %arg_sign) {
@@ -55,7 +56,7 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_copysign_out_f64_mag_f16_sign_f64:
+; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f16_sign_f64:
 ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
 ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
 ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
@@ -64,7 +65,7 @@ entry:
 ; GCN: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_EXT_HI]], v[[SIGN_HI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[MAG_EXT_LO]]:[[OUT_HI]]{{\]}}
 ; GCN: s_endpgm
-define void @test_copysign_out_f64_mag_f16_sign_f64(
+define amdgpu_kernel void @test_copysign_out_f64_mag_f16_sign_f64(
   double addrspace(1)* %arg_out,
   half addrspace(1)* %arg_mag,
   double addrspace(1)* %arg_sign) {
@@ -77,17 +78,17 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_copysign_out_f32_mag_f32_sign_f16:
+; GCN-LABEL: {{^}}test_copysign_out_f32_mag_f32_sign_f16:
 ; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]]
 ; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
 ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
 ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
 ; SI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_F32]]
-; VI-DAG: v_lshlrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN]]
-; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]]
+; GFX89-DAG: v_lshlrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN]]
+; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]]
 ; GCN: buffer_store_dword v[[OUT]]
 ; GCN: s_endpgm
-define void @test_copysign_out_f32_mag_f32_sign_f16(
+define amdgpu_kernel void @test_copysign_out_f32_mag_f32_sign_f16(
   float addrspace(1)* %arg_out,
   float addrspace(1)* %arg_mag,
   half addrspace(1)* %arg_sign) {
@@ -100,17 +101,17 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_copysign_out_f64_mag_f64_sign_f16:
+; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f64_sign_f16:
 ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[MAG_LO:[0-9]+]]:[[MAG_HI:[0-9]+]]{{\]}}
 ; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
 ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
 ; SI-DAG: v_cvt_f32_f16_e32 v[[SIGN_F32:[0-9]+]], v[[SIGN]]
 ; SI: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_F32]]
-; VI-DAG: v_lshlrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN]]
-; VI: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_SHIFT]]
+; GFX89-DAG: v_lshlrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN]]
+; GFX89: v_bfi_b32 v[[OUT_HI:[0-9]+]], s[[CONST]], v[[MAG_HI]], v[[SIGN_SHIFT]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[MAG_LO]]:[[OUT_HI]]{{\]}}
 ; GCN: s_endpgm
-define void @test_copysign_out_f64_mag_f64_sign_f16(
+define amdgpu_kernel void @test_copysign_out_f64_mag_f64_sign_f16(
   double addrspace(1)* %arg_out,
   double addrspace(1)* %arg_mag,
   half addrspace(1)* %arg_sign) {
@@ -123,19 +124,19 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f32:
+; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f32:
 ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
 ; GCN-DAG: buffer_load_dword v[[SIGN:[0-9]+]]
 ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
 ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
 ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN]]
 ; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
-; VI-DAG: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
-; VI-DAG: v_lshrrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN]]
-; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]]
+; GFX89-DAG: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
+; GFX89-DAG: v_lshrrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN]]
+; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]]
 ; GCN: buffer_store_short v[[OUT]]
 ; GCN: s_endpgm
-define void @test_copysign_out_f16_mag_f16_sign_f32(
+define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f32(
   half addrspace(1)* %arg_out,
   half addrspace(1)* %arg_mag,
   float addrspace(1)* %arg_sign) {
@@ -148,19 +149,19 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f64:
+; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f64:
 ; GCN-DAG: buffer_load_ushort v[[MAG:[0-9]+]]
 ; GCN-DAG: buffer_load_dwordx2 v{{\[}}[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]{{\]}}
 ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
 ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
 ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_HI]]
 ; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
-; VI-DAG: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
-; VI-DAG: v_lshrrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN_HI]]
-; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]]
+; GFX89-DAG: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
+; GFX89-DAG: v_lshrrev_b32_e32 v[[SIGN_SHIFT:[0-9]+]], 16, v[[SIGN_HI]]
+; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG]], v[[SIGN_SHIFT]]
 ; GCN: buffer_store_short v[[OUT]]
 ; GCN: s_endpgm
-define void @test_copysign_out_f16_mag_f16_sign_f64(
+define amdgpu_kernel void @test_copysign_out_f16_mag_f16_sign_f64(
   half addrspace(1)* %arg_out,
   half addrspace(1)* %arg_mag,
   double addrspace(1)* %arg_sign) {
@@ -173,7 +174,7 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_copysign_out_f16_mag_f32_sign_f16:
+; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f32_sign_f16:
 ; GCN-DAG: buffer_load_dword v[[MAG:[0-9]+]]
 ; GCN-DAG: buffer_load_ushort v[[SIGN:[0-9]+]]
 ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
@@ -182,12 +183,12 @@ entry:
 ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG_TRUNC]]
 ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_F32]]
 ; SI: v_cvt_f16_f32_e32 v[[OUT:[0-9]+]], v[[OUT_F32]]
-; VI-DAG: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
-; VI-DAG: v_cvt_f16_f32_e32 v[[MAG_TRUNC:[0-9]+]], v[[MAG]]
-; VI: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_TRUNC]], v[[SIGN]]
+; GFX89-DAG: s_movk_i32 s[[CONST:[0-9]+]], 0x7fff
+; GFX89-DAG: v_cvt_f16_f32_e32 v[[MAG_TRUNC:[0-9]+]], v[[MAG]]
+; GFX89: v_bfi_b32 v[[OUT:[0-9]+]], s[[CONST]], v[[MAG_TRUNC]], v[[SIGN]]
 ; GCN: buffer_store_short v[[OUT]]
 ; GCN: s_endpgm
-define void @test_copysign_out_f16_mag_f32_sign_f16(
+define amdgpu_kernel void @test_copysign_out_f16_mag_f32_sign_f16(
   half addrspace(1)* %arg_out,
   float addrspace(1)* %arg_mag,
   half addrspace(1)* %arg_sign) {
@@ -200,10 +201,10 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_copysign_out_f16_mag_f64_sign_f16:
+; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f64_sign_f16:
 ; GCN: v_bfi_b32
 ; GCN: s_endpgm
-define void @test_copysign_out_f16_mag_f64_sign_f16(
+define amdgpu_kernel void @test_copysign_out_f16_mag_f64_sign_f16(
   half addrspace(1)* %arg_out,
   double addrspace(1)* %arg_mag,
   half addrspace(1)* %arg_sign) {
@@ -216,11 +217,12 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_copysign_v2f16:
+; GCN-LABEL: {{^}}test_copysign_v2f16:
 ; GCN: v_bfi_b32
 ; GCN: v_bfi_b32
+; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
 ; GCN: s_endpgm
-define void @test_copysign_v2f16(
+define amdgpu_kernel void @test_copysign_v2f16(
   <2 x half> addrspace(1)* %arg_out,
   <2 x half> %arg_mag,
   <2 x half> %arg_sign) {
@@ -230,12 +232,12 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_copysign_v3f16:
+; GCN-LABEL: {{^}}test_copysign_v3f16:
 ; GCN: v_bfi_b32
 ; GCN: v_bfi_b32
 ; GCN: v_bfi_b32
 ; GCN: s_endpgm
-define void @test_copysign_v3f16(
+define amdgpu_kernel void @test_copysign_v3f16(
   <3 x half> addrspace(1)* %arg_out,
   <3 x half> %arg_mag,
   <3 x half> %arg_sign) {
@@ -245,13 +247,13 @@ entry:
   ret void
 }
 
-; FUNC-LABEL: {{^}}test_copysign_v4f16:
+; GCN-LABEL: {{^}}test_copysign_v4f16:
 ; GCN: v_bfi_b32
 ; GCN: v_bfi_b32
 ; GCN: v_bfi_b32
 ; GCN: v_bfi_b32
 ; GCN: s_endpgm
-define void @test_copysign_v4f16(
+define amdgpu_kernel void @test_copysign_v4f16(
   <4 x half> addrspace(1)* %arg_out,
   <4 x half> %arg_mag,
   <4 x half> %arg_sign) {
diff --git a/test/CodeGen/AMDGPU/fcopysign.f32.ll b/test/CodeGen/AMDGPU/fcopysign.f32.ll
index 632de18dafcb3c8595a22ef52b69b4feadd6576b..e5893e5995a32710439f0e42a80f5331af594385 100644
--- a/test/CodeGen/AMDGPU/fcopysign.f32.ll
+++ b/test/CodeGen/AMDGPU/fcopysign.f32.ll
@@ -20,7 +20,7 @@ declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) nounwind read
 ; GCN: s_endpgm
 
 ; EG: BFI_INT
-define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind {
+define amdgpu_kernel void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign) nounwind {
   %result = call float @llvm.copysign.f32(float %mag, float %sign)
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -31,7 +31,7 @@ define void @test_copysign_f32(float addrspace(1)* %out, float %mag, float %sign
 
 ; EG: BFI_INT
 ; EG: BFI_INT
-define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %mag, <2 x float> %sign) nounwind {
   %result = call <2 x float> @llvm.copysign.v2f32(<2 x float> %mag, <2 x float> %sign)
   store <2 x float> %result, <2 x float> addrspace(1)* %out, align 8
   ret void
@@ -44,7 +44,7 @@ define void @test_copysign_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %ma
 ; EG: BFI_INT
 ; EG: BFI_INT
 ; EG: BFI_INT
-define void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %mag, <4 x float> %sign) nounwind {
   %result = call <4 x float> @llvm.copysign.v4f32(<4 x float> %mag, <4 x float> %sign)
   store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
   ret void
diff --git a/test/CodeGen/AMDGPU/fcopysign.f64.ll b/test/CodeGen/AMDGPU/fcopysign.f64.ll
index 12c942beee6c9b25f0214e25e67079b9aaadc460..67779a8ff3b9b7767460bf4b6f386e2d8c15aa39 100644
--- a/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ b/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -17,7 +17,7 @@ declare <4 x double> @llvm.copysign.v4f64(<4 x double>, <4 x double>) nounwind r
 ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
 ; GCN: s_endpgm
-define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
+define amdgpu_kernel void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %sign) nounwind {
   %result = call double @llvm.copysign.f64(double %mag, double %sign)
   store double %result, double addrspace(1)* %out, align 8
   ret void
@@ -32,7 +32,7 @@ define void @test_copysign_f64(double addrspace(1)* %out, double %mag, double %s
 ; GCN-DAG: v_bfi_b32 v[[VRESULT_HI:[0-9]+]], [[SCONST]], v[[VMAG_HI]], v[[VSIGN]]
 ; GCN-DAG: v_mov_b32_e32 v[[VMAG_LO:[0-9]+]], s[[SMAG_LO]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VMAG_LO]]:[[VRESULT_HI]]{{\]}}
-define void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind {
+define amdgpu_kernel void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float %sign) nounwind {
   %c = fpext float %sign to double
   %result = call double @llvm.copysign.f64(double %mag, double %c)
   store double %result, double addrspace(1)* %out, align 8
@@ -41,7 +41,7 @@ define void @test_copysign_f64_f32(double addrspace(1)* %out, double %mag, float
 
 ; FUNC-LABEL: {{^}}test_copysign_v2f64:
 ; GCN: s_endpgm
-define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %mag, <2 x double> %sign) nounwind {
   %result = call <2 x double> @llvm.copysign.v2f64(<2 x double> %mag, <2 x double> %sign)
   store <2 x double> %result, <2 x double> addrspace(1)* %out, align 8
   ret void
@@ -49,7 +49,7 @@ define void @test_copysign_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %
 
 ; FUNC-LABEL: {{^}}test_copysign_v4f64:
 ; GCN: s_endpgm
-define void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind {
+define amdgpu_kernel void @test_copysign_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %mag, <4 x double> %sign) nounwind {
   %result = call <4 x double> @llvm.copysign.v4f64(<4 x double> %mag, <4 x double> %sign)
   store <4 x double> %result, <4 x double> addrspace(1)* %out, align 8
   ret void
diff --git a/test/CodeGen/AMDGPU/fdiv.f16.ll b/test/CodeGen/AMDGPU/fdiv.f16.ll
index 70b70bdaaaa7b6f8240add3d7d352852a105a2ba..7f84e973c95824800cb6f5f588ef7aba80a57945 100644
--- a/test/CodeGen/AMDGPU/fdiv.f16.ll
+++ b/test/CodeGen/AMDGPU/fdiv.f16.ll
@@ -31,7 +31,7 @@
 ; VI: v_cvt_f16_f32_e32 [[CVT_BACK:v[0-9]+]], [[MUL]]
 ; VI: v_div_fixup_f16 [[RESULT:v[0-9]+]], [[CVT_BACK]], [[RHS]], [[LHS]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_fdiv_f16(
+define amdgpu_kernel void @v_fdiv_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) #0 {
@@ -54,7 +54,7 @@ entry:
 ; VI: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; VI-NOT: [[RESULT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -72,7 +72,7 @@ entry:
 ; VI: v_rcp_f16_e64 [[RESULT:v[0-9]+]], |[[VAL]]|
-; VI-NOT: [RESULT]]
+; VI-NOT: [[RESULT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16_abs(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -91,7 +91,7 @@ entry:
 ; VI: v_rcp_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; VI-NOT: [[RESULT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -109,7 +109,7 @@ entry:
 ; VI: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[VAL]]
-; VI-NOT: [RESULT]]
+; VI-NOT: [[RESULT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rcp_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -127,7 +127,7 @@ entry:
 ; VI: v_rsq_f16_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; VI-NOT: [RESULT]]
+; VI-NOT: [[RESULT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rsq_f16(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -147,7 +147,7 @@ entry:
 ; VI-NEXT: v_rcp_f16_e64 [[RESULT:v[0-9]+]], -[[SQRT]]
-; VI-NOT: [RESULT]]
+; VI-NOT: [[RESULT]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_rsq_f16_neg(half addrspace(1)* %r, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -168,7 +168,7 @@ entry:
 ; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
+define amdgpu_kernel void @v_fdiv_f16_arcp(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -190,7 +190,7 @@ entry:
 ; VI: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[RCP]], [[LHS]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
+define amdgpu_kernel void @v_fdiv_f16_unsafe(half addrspace(1)* %r, half addrspace(1)* %a, half addrspace(1)* %b) #2 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -204,6 +204,42 @@ entry:
   ret void
 }
 
+; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f16:
+; SI: v_mul_f32_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
+
+; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
+; VI: buffer_store_short [[MUL]]
+define amdgpu_kernel void @div_arcp_2_x_pat_f16(half addrspace(1)* %out) #0 {
+  %num = load half, half addrspace(1)* undef ; dividend is loaded through an undef pointer
+  %quot = fdiv arcp half %num, 2.0 ; checks above expect a multiply by 0.5, not a divide
+  store half %quot, half addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f16:
+; SI: v_mul_f32_e32 v{{[0-9]+}}, 0x3dcccccd, v{{[0-9]+}}
+
+; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0x2e66, v{{[0-9]+}}
+; VI: buffer_store_short [[MUL]]
+define amdgpu_kernel void @div_arcp_k_x_pat_f16(half addrspace(1)* %out) #0 {
+  %num = load half, half addrspace(1)* undef ; dividend is loaded through an undef pointer
+  %quot = fdiv arcp half %num, 10.0 ; checks above expect a multiply by a literal constant
+  store half %quot, half addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f16:
+; SI: v_mul_f32_e32 v{{[0-9]+}}, 0xbdcccccd, v{{[0-9]+}}
+
+; VI: v_mul_f16_e32 [[MUL:v[0-9]+]], 0xae66, v{{[0-9]+}}
+; VI: buffer_store_short [[MUL]]
+define amdgpu_kernel void @div_arcp_neg_k_x_pat_f16(half addrspace(1)* %out) #0 {
+  %num = load half, half addrspace(1)* undef ; dividend is loaded through an undef pointer
+  %quot = fdiv arcp half %num, -10.0 ; checks above expect a multiply by a negative literal constant
+  store half %quot, half addrspace(1)* %out, align 4
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare half @llvm.sqrt.f16(half) #1
 declare half @llvm.fabs.f16(half) #1
diff --git a/test/CodeGen/AMDGPU/fdiv.f64.ll b/test/CodeGen/AMDGPU/fdiv.f64.ll
index 20f9e4df07fd030a4931103190bd7c7585c30452..d16bdf43ee2656a28cc6f236234afed049e92deb 100644
--- a/test/CodeGen/AMDGPU/fdiv.f64.ll
+++ b/test/CodeGen/AMDGPU/fdiv.f64.ll
@@ -1,11 +1,11 @@
-; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=COMMON %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=COMMON %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s
 
 
-; COMMON-LABEL: {{^}}fdiv_f64:
-; COMMON-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0
-; COMMON-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8
+; GCN-LABEL: {{^}}fdiv_f64:
+; GCN-DAG: buffer_load_dwordx2 [[NUM:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0
+; GCN-DAG: buffer_load_dwordx2 [[DEN:v\[[0-9]+:[0-9]+\]]], off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:8
 ; CI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]]
 ; CI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], vcc, [[NUM]], [[DEN]], [[NUM]]
 
@@ -13,23 +13,23 @@
 ; SI-DAG: v_div_scale_f64 [[SCALE0:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[DEN]], [[DEN]], [[NUM]]
 ; SI-DAG: v_div_scale_f64 [[SCALE1:v\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, [[NUM]], [[DEN]], [[NUM]]
 
-; COMMON-DAG: v_rcp_f64_e32 [[RCP_SCALE0:v\[[0-9]+:[0-9]+\]]], [[SCALE0]]
+; GCN-DAG: v_rcp_f64_e32 [[RCP_SCALE0:v\[[0-9]+:[0-9]+\]]], [[SCALE0]]
 
 ; SI-DAG: v_cmp_eq_u32_e32 vcc, {{v[0-9]+}}, {{v[0-9]+}}
 ; SI-DAG: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], {{v[0-9]+}}, {{v[0-9]+}}
 ; SI-DAG: s_xor_b64 vcc, [[CMP0]], vcc
 
-; COMMON-DAG: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[RCP_SCALE0]], 1.0
-; COMMON-DAG: v_fma_f64 [[FMA1:v\[[0-9]+:[0-9]+\]]], [[RCP_SCALE0]], [[FMA0]], [[RCP_SCALE0]]
-; COMMON-DAG: v_fma_f64 [[FMA2:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[FMA1]], 1.0
-; COMMON-DAG: v_fma_f64 [[FMA3:v\[[0-9]+:[0-9]+\]]], [[FMA1]], [[FMA2]], [[FMA1]]
-; COMMON-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[SCALE1]], [[FMA3]]
-; COMMON-DAG: v_fma_f64 [[FMA4:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[MUL]], [[SCALE1]]
-; COMMON: v_div_fmas_f64 [[FMAS:v\[[0-9]+:[0-9]+\]]], [[FMA4]], [[FMA3]], [[MUL]]
-; COMMON: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]]
-; COMMON: buffer_store_dwordx2 [[RESULT]]
-; COMMON: s_endpgm
-define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) nounwind {
+; GCN-DAG: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[RCP_SCALE0]], 1.0
+; GCN-DAG: v_fma_f64 [[FMA1:v\[[0-9]+:[0-9]+\]]], [[RCP_SCALE0]], [[FMA0]], [[RCP_SCALE0]]
+; GCN-DAG: v_fma_f64 [[FMA2:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[FMA1]], 1.0
+; GCN-DAG: v_fma_f64 [[FMA3:v\[[0-9]+:[0-9]+\]]], [[FMA1]], [[FMA2]], [[FMA1]]
+; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], [[SCALE1]], [[FMA3]]
+; GCN-DAG: v_fma_f64 [[FMA4:v\[[0-9]+:[0-9]+\]]], -[[SCALE0]], [[MUL]], [[SCALE1]]
+; GCN: v_div_fmas_f64 [[FMAS:v\[[0-9]+:[0-9]+\]]], [[FMA4]], [[FMA3]], [[MUL]]
+; GCN: v_div_fixup_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[FMAS]], [[DEN]], [[NUM]]
+; GCN: buffer_store_dwordx2 [[RESULT]]
+; GCN: s_endpgm
+define amdgpu_kernel void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %gep.1 = getelementptr double, double addrspace(1)* %in, i32 1
   %num = load volatile double, double addrspace(1)* %in
   %den = load volatile double, double addrspace(1)* %gep.1
@@ -38,31 +38,31 @@ define void @fdiv_f64(double addrspace(1)* %out, double addrspace(1)* %in) nounw
   ret void
 }
 
-; COMMON-LABEL: {{^}}fdiv_f64_s_v:
-define void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) nounwind {
+; GCN-LABEL: {{^}}fdiv_f64_s_v:
+define amdgpu_kernel void @fdiv_f64_s_v(double addrspace(1)* %out, double addrspace(1)* %in, double %num) #0 {
   %den = load double, double addrspace(1)* %in
   %result = fdiv double %num, %den
   store double %result, double addrspace(1)* %out
   ret void
 }
 
-; COMMON-LABEL: {{^}}fdiv_f64_v_s:
-define void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) nounwind {
+; GCN-LABEL: {{^}}fdiv_f64_v_s:
+define amdgpu_kernel void @fdiv_f64_v_s(double addrspace(1)* %out, double addrspace(1)* %in, double %den) #0 {
   %num = load double, double addrspace(1)* %in
   %result = fdiv double %num, %den
   store double %result, double addrspace(1)* %out
   ret void
 }
 
-; COMMON-LABEL: {{^}}fdiv_f64_s_s:
-define void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) nounwind {
+; GCN-LABEL: {{^}}fdiv_f64_s_s:
+define amdgpu_kernel void @fdiv_f64_s_s(double addrspace(1)* %out, double %num, double %den) #0 {
   %result = fdiv double %num, %den
   store double %result, double addrspace(1)* %out
   ret void
 }
 
-; COMMON-LABEL: {{^}}v_fdiv_v2f64:
-define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) nounwind {
+; GCN-LABEL: {{^}}v_fdiv_v2f64:
+define amdgpu_kernel void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
   %gep.1 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in, i32 1
   %num = load <2 x double>, <2 x double> addrspace(1)* %in
   %den = load <2 x double>, <2 x double> addrspace(1)* %gep.1
@@ -71,15 +71,15 @@ define void @v_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspac
   ret void
 }
 
-; COMMON-LABEL: {{^}}s_fdiv_v2f64:
-define void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) {
+; GCN-LABEL: {{^}}s_fdiv_v2f64:
+define amdgpu_kernel void @s_fdiv_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %num, <2 x double> %den) #0 {
   %result = fdiv <2 x double> %num, %den
   store <2 x double> %result, <2 x double> addrspace(1)* %out
   ret void
 }
 
-; COMMON-LABEL: {{^}}v_fdiv_v4f64:
-define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) nounwind {
+; GCN-LABEL: {{^}}v_fdiv_v4f64:
+define amdgpu_kernel void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
   %gep.1 = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1
   %num = load <4 x double>, <4 x double> addrspace(1)* %in
   %den = load <4 x double>, <4 x double> addrspace(1)* %gep.1
@@ -88,9 +88,46 @@ define void @v_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspac
   ret void
 }
 
-; COMMON-LABEL: {{^}}s_fdiv_v4f64:
-define void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) {
+; GCN-LABEL: {{^}}s_fdiv_v4f64:
+define amdgpu_kernel void @s_fdiv_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %num, <4 x double> %den) #0 {
   %result = fdiv <4 x double> %num, %den
   store <4 x double> %result, <4 x double> addrspace(1)* %out
   ret void
 }
+
+; GCN-LABEL: {{^}}div_fast_2_x_pat_f64:
+; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, 0.5
+; GCN: buffer_store_dwordx2 [[MUL]]
+define amdgpu_kernel void @div_fast_2_x_pat_f64(double addrspace(1)* %out) #1 {
+  %num = load double, double addrspace(1)* undef ; dividend is loaded through an undef pointer
+  %quot = fdiv fast double %num, 2.0 ; checks above expect v_mul_f64 by the inline constant 0.5
+  store double %quot, double addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}div_fast_k_x_pat_f64:
+; GCN-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x9999999a
+; GCN-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0x3fb99999
+; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; GCN: buffer_store_dwordx2 [[MUL]]
+define amdgpu_kernel void @div_fast_k_x_pat_f64(double addrspace(1)* %out) #1 {
+  %num = load double, double addrspace(1)* undef ; dividend is loaded through an undef pointer
+  %quot = fdiv fast double %num, 10.0 ; checks above expect v_mul_f64 by a constant held in an SGPR pair
+  store double %quot, double addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}div_fast_neg_k_x_pat_f64:
+; GCN-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0x9999999a
+; GCN-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0xbfb99999
+; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
+; GCN: buffer_store_dwordx2 [[MUL]]
+define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(double addrspace(1)* %out) #1 {
+  %num = load double, double addrspace(1)* undef ; dividend is loaded through an undef pointer
+  %quot = fdiv fast double %num, -10.0 ; checks above expect v_mul_f64 by a negative constant held in an SGPR pair
+  store double %quot, double addrspace(1)* %out, align 4
+  ret void
+}
+
+attributes #0 = { nounwind } ; attribute group referenced by the plain fdiv kernels in this file
+attributes #1 = { nounwind "unsafe-fp-math"="true" } ; attribute group referenced by the div_fast_* kernels
diff --git a/test/CodeGen/AMDGPU/fdiv.ll b/test/CodeGen/AMDGPU/fdiv.ll
index 0e95de9c555cdc102e44403e68202e93bfe537d3..b3a2b664372071a3b1301900104a2965283a9d0c 100644
--- a/test/CodeGen/AMDGPU/fdiv.ll
+++ b/test/CodeGen/AMDGPU/fdiv.ll
@@ -27,7 +27,7 @@
 ; GCN: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0
 ; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
 ; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
-define void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fdiv_f32(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv float %a, %b
   store float %fdiv, float addrspace(1)* %out
@@ -52,7 +52,7 @@ entry:
 ; GCN-NOT: s_setreg
 ; GCN: v_div_fmas_f32 [[FMAS:v[0-9]+]], [[F]], [[B]], [[E]]
 ; GCN: v_div_fixup_f32 v{{[0-9]+}}, [[FMAS]],
-define void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
+define amdgpu_kernel void @fdiv_f32_denormals(float addrspace(1)* %out, float %a, float %b) #2 {
 entry:
   %fdiv = fdiv float %a, %b
   store float %fdiv, float addrspace(1)* %out
@@ -65,7 +65,7 @@ entry:
 ; GCN: v_rcp_f32
 ; GCN: v_mul_f32
 ; GCN: v_mul_f32
-define void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fdiv_25ulp_f32(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv float %a, %b, !fpmath !0
   store float %fdiv, float addrspace(1)* %out
@@ -77,7 +77,7 @@ entry:
 ; GCN: v_fma_f32
 ; GCN: v_div_fmas_f32
 ; GCN: v_div_fixup_f32
-define void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
+define amdgpu_kernel void @fdiv_25ulp_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
 entry:
   %fdiv = fdiv float %a, %b, !fpmath !0
   store float %fdiv, float addrspace(1)* %out
@@ -89,7 +89,7 @@ entry:
 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
+define amdgpu_kernel void @fdiv_fast_denormals_f32(float addrspace(1)* %out, float %a, float %b) #2 {
 entry:
   %fdiv = fdiv fast float %a, %b
   store float %fdiv, float addrspace(1)* %out
@@ -104,7 +104,7 @@ entry:
 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fdiv_f32_fast_math(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv fast float %a, %b
   store float %fdiv, float addrspace(1)* %out
@@ -119,7 +119,7 @@ entry:
 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}, [[RCP]]
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @fdiv_f32_arcp_math(float addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %fdiv = fdiv arcp float %a, %b
   store float %fdiv, float addrspace(1)* %out
@@ -136,7 +136,7 @@ entry:
 ; GCN: v_div_scale_f32
 ; GCN: v_div_scale_f32
 ; GCN: v_div_scale_f32
-define void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv <2 x float> %a, %b
   store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
@@ -146,7 +146,7 @@ entry:
 ; FUNC-LABEL: {{^}}fdiv_ulp25_v2f32:
 ; GCN: v_cmp_gt_f32
 ; GCN: v_cmp_gt_f32
-define void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_ulp25_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv arcp <2 x float> %a, %b, !fpmath !0
   store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
@@ -161,7 +161,7 @@ entry:
 
 ; GCN: v_rcp_f32
 ; GCN: v_rcp_f32
-define void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f32_fast_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv fast <2 x float> %a, %b
   store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
@@ -176,7 +176,7 @@ entry:
 
 ; GCN: v_rcp_f32
 ; GCN: v_rcp_f32
-define void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
+define amdgpu_kernel void @fdiv_v2f32_arcp_math(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) #0 {
 entry:
   %fdiv = fdiv arcp <2 x float> %a, %b
   store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
@@ -197,7 +197,7 @@ entry:
 ; GCN: v_div_fixup_f32
 ; GCN: v_div_fixup_f32
 ; GCN: v_div_fixup_f32
-define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -220,7 +220,7 @@ define void @fdiv_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)
 ; GCN: v_rcp_f32
 ; GCN: v_rcp_f32
 ; GCN: v_rcp_f32
-define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -243,7 +243,7 @@ define void @fdiv_v4f32_fast_math(<4 x float> addrspace(1)* %out, <4 x float> ad
 ; GCN: v_rcp_f32
 ; GCN: v_rcp_f32
 ; GCN: v_rcp_f32
-define void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fdiv_v4f32_arcp_math(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
diff --git a/test/CodeGen/AMDGPU/ffloor.f64.ll b/test/CodeGen/AMDGPU/ffloor.f64.ll
index 83ffbdfa23a554f42c07bfc9ec7e3973400da8db..407cccb8443e2244ed2636288e51dbad0dc1dc25 100644
--- a/test/CodeGen/AMDGPU/ffloor.f64.ll
+++ b/test/CodeGen/AMDGPU/ffloor.f64.ll
@@ -19,7 +19,7 @@ declare <16 x double> @llvm.floor.v16f64(<16 x double>) nounwind readnone
 ; SI: v_cndmask_b32_e32
 ; SI: v_add_f64
 ; SI: s_endpgm
-define void @ffloor_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @ffloor_f64(double addrspace(1)* %out, double %x) {
   %y = call double @llvm.floor.f64(double %x) nounwind readnone
   store double %y, double addrspace(1)* %out
   ret void
@@ -34,7 +34,7 @@ define void @ffloor_f64(double addrspace(1)* %out, double %x) {
 ; SI: v_cndmask_b32_e32
 ; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -[[INPUT]]
 ; SI: s_endpgm
-define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @ffloor_f64_neg(double addrspace(1)* %out, double %x) {
   %neg = fsub double 0.0, %x
   %y = call double @llvm.floor.f64(double %neg) nounwind readnone
   store double %y, double addrspace(1)* %out
@@ -50,7 +50,7 @@ define void @ffloor_f64_neg(double addrspace(1)* %out, double %x) {
 ; SI: v_cndmask_b32_e32
 ; SI: v_add_f64 {{v[[0-9]+:[0-9]+]}}, -|[[INPUT]]|
 ; SI: s_endpgm
-define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) {
   %abs = call double @llvm.fabs.f64(double %x)
   %neg = fsub double 0.0, %abs
   %y = call double @llvm.floor.f64(double %neg) nounwind readnone
@@ -61,7 +61,7 @@ define void @ffloor_f64_neg_abs(double addrspace(1)* %out, double %x) {
 ; FUNC-LABEL: {{^}}ffloor_v2f64:
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
-define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+define amdgpu_kernel void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
   %y = call <2 x double> @llvm.floor.v2f64(<2 x double> %x) nounwind readnone
   store <2 x double> %y, <2 x double> addrspace(1)* %out
   ret void
@@ -72,7 +72,7 @@ define void @ffloor_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
 ; CI-NOT: v_floor_f64_e32
-define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+define amdgpu_kernel void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
   %y = call <3 x double> @llvm.floor.v3f64(<3 x double> %x) nounwind readnone
   store <3 x double> %y, <3 x double> addrspace(1)* %out
   ret void
@@ -83,7 +83,7 @@ define void @ffloor_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
-define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+define amdgpu_kernel void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
   %y = call <4 x double> @llvm.floor.v4f64(<4 x double> %x) nounwind readnone
   store <4 x double> %y, <4 x double> addrspace(1)* %out
   ret void
@@ -98,7 +98,7 @@ define void @ffloor_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
-define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+define amdgpu_kernel void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
   %y = call <8 x double> @llvm.floor.v8f64(<8 x double> %x) nounwind readnone
   store <8 x double> %y, <8 x double> addrspace(1)* %out
   ret void
@@ -121,7 +121,7 @@ define void @ffloor_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
 ; CI: v_floor_f64_e32
-define void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+define amdgpu_kernel void @ffloor_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
   %y = call <16 x double> @llvm.floor.v16f64(<16 x double> %x) nounwind readnone
   store <16 x double> %y, <16 x double> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/ffloor.ll b/test/CodeGen/AMDGPU/ffloor.ll
index d7f35a45075c4f1782a97495ce1fc7c46812903e..720fe7a45e3dfd8478ff93b99dd488534df2d4b1 100644
--- a/test/CodeGen/AMDGPU/ffloor.ll
+++ b/test/CodeGen/AMDGPU/ffloor.ll
@@ -5,7 +5,7 @@
 ; FUNC-LABEL: {{^}}floor_f32:
 ; SI: v_floor_f32_e32
 ; R600: FLOOR
-define void @floor_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @floor_f32(float addrspace(1)* %out, float %in) {
   %tmp = call float @llvm.floor.f32(float %in) #0
   store float %tmp, float addrspace(1)* %out
   ret void
@@ -15,7 +15,7 @@ define void @floor_f32(float addrspace(1)* %out, float %in) {
 ; SI: v_floor_f32_e32
 ; SI: v_floor_f32_e32
 
-define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
   %tmp = call <2 x float> @llvm.floor.v2f32(<2 x float> %in) #0
   store <2 x float> %tmp, <2 x float> addrspace(1)* %out
   ret void
@@ -31,7 +31,7 @@ define void @floor_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 ; R600: FLOOR
 ; R600: FLOOR
 ; R600: FLOOR
-define void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @floor_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
   %tmp = call <4 x float> @llvm.floor.v4f32(<4 x float> %in) #0
   store <4 x float> %tmp, <4 x float> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/flat-address-space.ll b/test/CodeGen/AMDGPU/flat-address-space.ll
index 55b5482d031fbefc7478e2792fb3e3a7bd06d440..c867e4fca229503488a2f58ff6669554fd525178 100644
--- a/test/CodeGen/AMDGPU/flat-address-space.ll
+++ b/test/CodeGen/AMDGPU/flat-address-space.ll
@@ -17,43 +17,43 @@
 ; CHECK-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG]]
 ; CHECK-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], s[[HI_SREG]]
 ; CHECK: flat_store_dword v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}, v[[DATA]]
-define void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
+define amdgpu_kernel void @store_flat_i32(i32 addrspace(1)* %gptr, i32 %x) #0 {
   %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
-  store i32 %x, i32 addrspace(4)* %fptr, align 4
+  store volatile i32 %x, i32 addrspace(4)* %fptr, align 4
   ret void
 }
 
 ; CHECK-LABEL: {{^}}store_flat_i64:
 ; CHECK: flat_store_dwordx2
-define void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
+define amdgpu_kernel void @store_flat_i64(i64 addrspace(1)* %gptr, i64 %x) #0 {
   %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
-  store i64 %x, i64 addrspace(4)* %fptr, align 8
+  store volatile i64 %x, i64 addrspace(4)* %fptr, align 8
   ret void
 }
 
 ; CHECK-LABEL: {{^}}store_flat_v4i32:
 ; CHECK: flat_store_dwordx4
-define void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
+define amdgpu_kernel void @store_flat_v4i32(<4 x i32> addrspace(1)* %gptr, <4 x i32> %x) #0 {
   %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
-  store <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16
+  store volatile <4 x i32> %x, <4 x i32> addrspace(4)* %fptr, align 16
   ret void
 }
 
 ; CHECK-LABEL: {{^}}store_flat_trunc_i16:
 ; CHECK: flat_store_short
-define void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
+define amdgpu_kernel void @store_flat_trunc_i16(i16 addrspace(1)* %gptr, i32 %x) #0 {
   %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
   %y = trunc i32 %x to i16
-  store i16 %y, i16 addrspace(4)* %fptr, align 2
+  store volatile i16 %y, i16 addrspace(4)* %fptr, align 2
   ret void
 }
 
 ; CHECK-LABEL: {{^}}store_flat_trunc_i8:
 ; CHECK: flat_store_byte
-define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
+define amdgpu_kernel void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
   %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
   %y = trunc i32 %x to i8
-  store i8 %y, i8 addrspace(4)* %fptr, align 2
+  store volatile i8 %y, i8 addrspace(4)* %fptr, align 2
   ret void
 }
 
@@ -61,36 +61,36 @@ define void @store_flat_trunc_i8(i8 addrspace(1)* %gptr, i32 %x) #0 {
 
 ; CHECK-LABEL: load_flat_i32:
 ; CHECK: flat_load_dword
-define void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @load_flat_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i32 addrspace(1)* %gptr to i32 addrspace(4)*
-  %fload = load i32, i32 addrspace(4)* %fptr, align 4
+  %fload = load volatile i32, i32 addrspace(4)* %fptr, align 4
   store i32 %fload, i32 addrspace(1)* %out, align 4
   ret void
 }
 
 ; CHECK-LABEL: load_flat_i64:
 ; CHECK: flat_load_dwordx2
-define void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @load_flat_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i64 addrspace(1)* %gptr to i64 addrspace(4)*
-  %fload = load i64, i64 addrspace(4)* %fptr, align 8
+  %fload = load volatile i64, i64 addrspace(4)* %fptr, align 8
   store i64 %fload, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; CHECK-LABEL: load_flat_v4i32:
 ; CHECK: flat_load_dwordx4
-define void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @load_flat_v4i32(<4 x i32> addrspace(1)* noalias %out, <4 x i32> addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast <4 x i32> addrspace(1)* %gptr to <4 x i32> addrspace(4)*
-  %fload = load <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 32
+  %fload = load volatile <4 x i32>, <4 x i32> addrspace(4)* %fptr, align 32
   store <4 x i32> %fload, <4 x i32> addrspace(1)* %out, align 8
   ret void
 }
 
 ; CHECK-LABEL: sextload_flat_i8:
 ; CHECK: flat_load_sbyte
-define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
-  %fload = load i8, i8 addrspace(4)* %fptr, align 4
+  %fload = load volatile i8, i8 addrspace(4)* %fptr, align 4
   %ext = sext i8 %fload to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
@@ -98,9 +98,9 @@ define void @sextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* n
 
 ; CHECK-LABEL: zextload_flat_i8:
 ; CHECK: flat_load_ubyte
-define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i8 addrspace(1)* %gptr to i8 addrspace(4)*
-  %fload = load i8, i8 addrspace(4)* %fptr, align 4
+  %fload = load volatile i8, i8 addrspace(4)* %fptr, align 4
   %ext = zext i8 %fload to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
@@ -108,9 +108,9 @@ define void @zextload_flat_i8(i32 addrspace(1)* noalias %out, i8 addrspace(1)* n
 
 ; CHECK-LABEL: sextload_flat_i16:
 ; CHECK: flat_load_sshort
-define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
-  %fload = load i16, i16 addrspace(4)* %fptr, align 4
+  %fload = load volatile i16, i16 addrspace(4)* %fptr, align 4
   %ext = sext i16 %fload to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
@@ -118,9 +118,9 @@ define void @sextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)*
 
 ; CHECK-LABEL: zextload_flat_i16:
 ; CHECK: flat_load_ushort
-define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
+define amdgpu_kernel void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)* noalias %gptr) #0 {
   %fptr = addrspacecast i16 addrspace(1)* %gptr to i16 addrspace(4)*
-  %fload = load i16, i16 addrspace(4)* %fptr, align 4
+  %fload = load volatile i16, i16 addrspace(4)* %fptr, align 4
   %ext = zext i16 %fload to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
@@ -131,7 +131,7 @@ define void @zextload_flat_i16(i32 addrspace(1)* noalias %out, i16 addrspace(1)*
 ; CHECK: flat_load_ubyte
 ; CHECK: flat_load_ubyte
 ; CHECK: flat_load_ubyte
-define void @flat_scratch_unaligned_load() {
+define amdgpu_kernel void @flat_scratch_unaligned_load() {
   %scratch = alloca i32
   %fptr = addrspacecast i32* %scratch to i32 addrspace(4)*
   %ld = load volatile i32, i32 addrspace(4)* %fptr, align 1
@@ -143,7 +143,7 @@ define void @flat_scratch_unaligned_load() {
 ; CHECK: flat_store_byte
 ; CHECK: flat_store_byte
 ; CHECK: flat_store_byte
-define void @flat_scratch_unaligned_store() {
+define amdgpu_kernel void @flat_scratch_unaligned_store() {
   %scratch = alloca i32
   %fptr = addrspacecast i32* %scratch to i32 addrspace(4)*
   store volatile i32 0, i32 addrspace(4)* %fptr, align 1
@@ -154,7 +154,7 @@ define void @flat_scratch_unaligned_store() {
 ; HSA: flat_load_dword
 ; HSA: flat_load_dword
 ; FIXME: These tests are broken for os = mesa3d, becasue it doesn't initialize flat_scr
-define void @flat_scratch_multidword_load() {
+define amdgpu_kernel void @flat_scratch_multidword_load() {
   %scratch = alloca <2 x i32>
   %fptr = addrspacecast <2 x i32>* %scratch to <2 x i32> addrspace(4)*
   %ld = load volatile <2 x i32>, <2 x i32> addrspace(4)* %fptr
@@ -165,7 +165,7 @@ define void @flat_scratch_multidword_load() {
 ; HSA: flat_store_dword
 ; HSA: flat_store_dword
 ; FIXME: These tests are broken for os = mesa3d, becasue it doesn't initialize flat_scr
-define void @flat_scratch_multidword_store() {
+define amdgpu_kernel void @flat_scratch_multidword_store() {
   %scratch = alloca <2 x i32>
   %fptr = addrspacecast <2 x i32>* %scratch to <2 x i32> addrspace(4)*
   store volatile <2 x i32> zeroinitializer, <2 x i32> addrspace(4)* %fptr
diff --git a/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll b/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
index df9ba00c69749caae4a0d82e852c6944410be20d..dac1500cd46cbba2162167506f1ac078aeb8fd5a 100644
--- a/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
+++ b/test/CodeGen/AMDGPU/flat-for-global-subtarget-feature.ll
@@ -23,7 +23,7 @@
 ; NOHSA-DEFAULT: buffer_store_dword
 ; NOHSA-NODEFAULT: flat_store_dword
 ; NOHSA-NOADDR64: flat_store_dword
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
 entry:
   store i32 0, i32 addrspace(1)* %out
   ret void
@@ -36,7 +36,7 @@ entry:
 ; NOHSA-DEFAULT: buffer_store_dword
 ; NOHSA-NODEFAULT: flat_store_dword
 ; NOHSA-NOADDR64: flat_store_dword
-define void @test_addr64(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_addr64(i32 addrspace(1)* %out) {
 entry:
   %out.addr = alloca i32 addrspace(1)*, align 4
 
diff --git a/test/CodeGen/AMDGPU/flat-scratch-reg.ll b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
index b71c8bcb76c7a38c4c7a33de14fef454f2e8eb0f..23f40daf3d237b7123c482f04af3f9a239614820 100644
--- a/test/CodeGen/AMDGPU/flat-scratch-reg.ll
+++ b/test/CodeGen/AMDGPU/flat-scratch-reg.ll
@@ -19,7 +19,7 @@
 ; CI: ; NumSgprs: 8
 ; VI-NOXNACK: ; NumSgprs: 8
 ; VI-XNACK: ; NumSgprs: 12
-define void @no_vcc_no_flat() {
+define amdgpu_kernel void @no_vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{SGPR7}"()
   ret void
@@ -33,7 +33,7 @@ entry:
 ; CI: ; NumSgprs: 10
 ; VI-NOXNACK: ; NumSgprs: 10
 ; VI-XNACK: ; NumSgprs: 12
-define void @vcc_no_flat() {
+define amdgpu_kernel void @vcc_no_flat() {
 entry:
   call void asm sideeffect "", "~{SGPR7},~{VCC}"()
   ret void
@@ -50,7 +50,7 @@ entry:
 ; HSA-CI: ; NumSgprs: 8
 ; HSA-VI-NOXNACK: ; NumSgprs: 8
 ; HSA-VI-XNACK: ; NumSgprs: 12
-define void @no_vcc_flat() {
+define amdgpu_kernel void @no_vcc_flat() {
 entry:
   call void asm sideeffect "", "~{SGPR7},~{FLAT_SCR}"()
   ret void
@@ -66,7 +66,7 @@ entry:
 ; HSA-CI: ; NumSgprs: 10
 ; HSA-VI-NOXNACK: ; NumSgprs: 10
 ; HSA-VI-XNACK: ; NumSgprs: 12
-define void @vcc_flat() {
+define amdgpu_kernel void @vcc_flat() {
 entry:
   call void asm sideeffect "", "~{SGPR7},~{VCC},~{FLAT_SCR}"()
   ret void
diff --git a/test/CodeGen/AMDGPU/flat_atomics.ll b/test/CodeGen/AMDGPU/flat_atomics.ll
index 7400dbcf89090e8df8a294cf3008b4f890e5f2a3..cc95d80570e09f1e0bfef109987a2f6c2e8267f9 100644
--- a/test/CodeGen/AMDGPU/flat_atomics.ll
+++ b/test/CodeGen/AMDGPU/flat_atomics.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}atomic_add_i32_offset:
 ; GCN: flat_atomic_add v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_add_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -13,7 +13,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i32_ret_offset:
 ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile add i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -23,7 +23,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_add_i32_addr64_offset:
 ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_add_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -34,7 +34,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64_offset:
 ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -45,7 +45,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_add_i32:
 ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_add_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -54,7 +54,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i32_ret:
 ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile add i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -63,7 +63,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_add_i32_addr64:
 ; GCN: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_add_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -73,7 +73,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i32_ret_addr64:
 ; GCN: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile add i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -83,7 +83,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i32_offset:
 ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -93,7 +93,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i32_ret_offset:
 ; GCN: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile and i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -103,7 +103,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i32_addr64_offset:
 ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -114,7 +114,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64_offset:
 ; GCN: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -125,7 +125,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i32:
 ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -134,7 +134,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i32_ret:
 ; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile and i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -143,7 +143,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i32_addr64:
 ; GCN: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -153,7 +153,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i32_ret_addr64:
 ; GCN: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile and i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -163,7 +163,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i32_offset:
 ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -173,7 +173,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i32_ret_offset:
 ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile sub i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -183,7 +183,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i32_addr64_offset:
 ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -194,7 +194,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64_offset:
 ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -205,7 +205,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i32:
 ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -214,7 +214,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i32_ret:
 ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile sub i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -223,7 +223,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i32_addr64:
 ; GCN: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -233,7 +233,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i32_ret_addr64:
 ; GCN: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile sub i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -243,7 +243,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i32_offset:
 ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -253,7 +253,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i32_ret_offset:
 ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile max i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -263,7 +263,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i32_addr64_offset:
 ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -274,7 +274,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64_offset:
 ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -285,7 +285,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i32:
 ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -294,7 +294,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i32_ret:
 ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile max i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -303,7 +303,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i32_addr64:
 ; GCN: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -313,7 +313,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i32_ret_addr64:
 ; GCN: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile max i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -323,7 +323,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i32_offset:
 ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -333,7 +333,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i32_ret_offset:
 ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile umax i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -343,7 +343,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i32_addr64_offset:
 ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -354,7 +354,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64_offset:
 ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -365,7 +365,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i32:
 ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -374,7 +374,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i32_ret:
 ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile umax i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -383,7 +383,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i32_addr64:
 ; GCN: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -393,7 +393,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i32_ret_addr64:
 ; GCN: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile umax i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -403,7 +403,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i32_offset:
 ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -413,7 +413,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i32_ret_offset:
 ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile min i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -423,7 +423,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i32_addr64_offset:
 ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -434,7 +434,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i32_ret_addr64_offset:
 ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -445,7 +445,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i32:
 ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -454,7 +454,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i32_ret:
 ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile min i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -463,7 +463,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i32_addr64:
 ; GCN: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -473,7 +473,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i32_ret_addr64:
 ; GCN: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile min i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -483,7 +483,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i32_offset:
 ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -493,7 +493,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i32_ret_offset:
 ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile umin i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -503,7 +503,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i32_addr64_offset:
 ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -514,7 +514,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64_offset:
 ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -525,16 +525,16 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i32:
 ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
 }
 
 ; GCN-LABEL: {{^}}atomic_umin_i32_ret:
-; GCN: flat_atomic_umin v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
+; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile umin i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -543,7 +543,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i32_addr64:
 ; GCN: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -553,7 +553,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i32_ret_addr64:
 ; GCN: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]{{$}}
-  define void @atomic_umin_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+  define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile umin i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -563,7 +563,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i32_offset:
 ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_or_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -573,7 +573,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i32_ret_offset:
 ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile or i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -583,7 +583,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i32_addr64_offset:
 ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_or_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -594,7 +594,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64_offset:
 ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -605,7 +605,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i32:
 ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_or_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -614,7 +614,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i32_ret:
 ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile or i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -623,7 +623,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i32_addr64:
 ; GCN: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_or_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -633,7 +633,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i32_ret_addr64:
 ; GCN: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile or i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -643,7 +643,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i32_offset:
 ; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xchg_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -653,7 +653,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_offset:
 ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile xchg i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -663,7 +663,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i32_addr64_offset:
 ; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -674,7 +674,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64_offset:
 ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -685,7 +685,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i32:
 ; GCN: flat_atomic_swap v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xchg_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -694,7 +694,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i32_ret:
 ; GCN: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -703,7 +703,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i32_addr64:
 ; GCN: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -713,7 +713,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i32_ret_addr64:
 ; GCN: flat_atomic_swap [[RET:v[0-9]+]],  v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile xchg i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -725,7 +725,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_offset:
 ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32_offset(i32 addrspace(4)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(4)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -735,7 +735,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset:
 ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
-define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = cmpxchg volatile i32 addrspace(4)* %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -746,7 +746,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset:
 ; GCN: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -757,7 +757,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64_offset:
 ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
-define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -769,7 +769,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32:
 ; GCN: flat_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32(i32 addrspace(4)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(4)* %out, i32 %in, i32 %old) {
 entry:
   %val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst
   ret void
@@ -778,7 +778,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret:
 ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}] glc
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
-define void @atomic_cmpxchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i32 %old) {
 entry:
   %val = cmpxchg volatile i32 addrspace(4)* %out, i32 %old, i32 %in seq_cst seq_cst
   %flag = extractvalue { i32, i1 } %val, 0
@@ -788,7 +788,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_addr64:
 ; GCN: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst
@@ -798,7 +798,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i32_ret_addr64:
 ; GCN: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v[[RET]]
-define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = cmpxchg volatile i32 addrspace(4)* %ptr, i32 %old, i32 %in seq_cst seq_cst
@@ -809,7 +809,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i32_offset:
 ; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xor_i32_offset(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -819,7 +819,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i32_ret_offset:
 ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   %val = atomicrmw volatile xor i32 addrspace(4)* %gep, i32 %in seq_cst
@@ -829,7 +829,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i32_addr64_offset:
 ; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xor_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -840,7 +840,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64_offset:
 ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -851,7 +851,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i32:
 ; GCN: flat_atomic_xor v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xor_i32(i32 addrspace(4)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(4)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst
   ret void
@@ -860,7 +860,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i32_ret:
 ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_ret(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile xor i32 addrspace(4)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(4)* %out2
@@ -869,7 +869,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i32_addr64:
 ; GCN: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xor_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(4)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -879,7 +879,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i32_ret_addr64:
 ; GCN: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %val = atomicrmw volatile xor i32 addrspace(4)* %ptr, i32 %in seq_cst
@@ -890,7 +890,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_load_i32_offset:
 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i32_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %in, i32 4
   %val = load atomic i32, i32 addrspace(4)* %gep  seq_cst, align 4
@@ -901,7 +901,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_load_i32:
 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i32(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_load_i32(i32 addrspace(4)* %in, i32 addrspace(4)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(4)* %in seq_cst, align 4
   store i32 %val, i32 addrspace(4)* %out
@@ -911,7 +911,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_load_i32_addr64_offset:
 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i32_addr64_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -923,7 +923,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_load_i32_addr64:
 ; GCN: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i32_addr64(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i32_addr64(i32 addrspace(4)* %in, i32 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %in, i64 %index
   %val = load atomic i32, i32 addrspace(4)* %ptr seq_cst, align 4
@@ -933,7 +933,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_store_i32_offset:
 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_offset(i32 %in, i32 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(4)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(4)* %out, i32 4
   store atomic i32 %in, i32 addrspace(4)* %gep  seq_cst, align 4
@@ -942,7 +942,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_store_i32:
 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32(i32 %in, i32 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(4)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(4)* %out seq_cst, align 4
   ret void
@@ -950,7 +950,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_store_i32_addr64_offset:
 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
@@ -960,7 +960,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_store_i32_addr64:
 ; GCN: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(4)* %out, i64 %index
   store atomic i32 %in, i32 addrspace(4)* %ptr seq_cst, align 4
diff --git a/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/test/CodeGen/AMDGPU/flat_atomics_i64.ll
index 0bd6c2dd5b860a8ed48c3c124e2fa54fc0bc23c6..723dde9ab68fd57a26699a6012853f8acf843f17 100644
--- a/test/CodeGen/AMDGPU/flat_atomics_i64.ll
+++ b/test/CodeGen/AMDGPU/flat_atomics_i64.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}atomic_add_i64_offset:
 ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
-define void @atomic_add_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -13,7 +13,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i64_ret_offset:
 ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile add i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -23,7 +23,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset:
 ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
-define void @atomic_add_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -34,7 +34,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64_offset:
 ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -45,7 +45,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_add_i64:
 ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_add_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -54,7 +54,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i64_ret:
 ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile add i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -63,7 +63,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_add_i64_addr64:
 ; GCN: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_add_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -73,7 +73,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i64_ret_addr64:
 ; GCN: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_add_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile add i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -83,7 +83,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i64_offset:
 ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -93,7 +93,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i64_ret_offset:
 ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile and i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -103,7 +103,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset:
 ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -114,7 +114,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64_offset:
 ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -125,7 +125,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i64:
 ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -134,7 +134,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i64_ret:
 ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile and i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -143,7 +143,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i64_addr64:
 ; GCN: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -153,7 +153,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i64_ret_addr64:
 ; GCN: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_and_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile and i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -163,7 +163,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i64_offset:
 ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -173,7 +173,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i64_ret_offset:
 ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -183,7 +183,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset:
 ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -194,7 +194,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64_offset:
 ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -205,7 +205,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i64:
 ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -214,7 +214,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i64_ret:
 ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -223,7 +223,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i64_addr64:
 ; GCN: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -233,7 +233,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i64_ret_addr64:
 ; GCN: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_sub_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile sub i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -243,7 +243,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i64_offset:
 ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -253,7 +253,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i64_ret_offset:
 ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile max i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -263,7 +263,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset:
 ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -274,7 +274,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64_offset:
 ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -285,7 +285,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i64:
 ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -294,7 +294,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i64_ret:
 ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile max i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -303,7 +303,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i64_addr64:
 ; GCN: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -313,7 +313,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i64_ret_addr64:
 ; GCN: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_max_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile max i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -323,7 +323,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i64_offset:
 ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -333,7 +333,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i64_ret_offset:
 ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -343,7 +343,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset:
 ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -354,7 +354,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64_offset:
 ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -365,7 +365,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i64:
 ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -374,7 +374,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i64_ret:
 ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -383,7 +383,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i64_addr64:
 ; GCN: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -393,7 +393,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i64_ret_addr64:
 ; GCN: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umax_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile umax i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -403,7 +403,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i64_offset:
 ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -413,7 +413,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i64_ret_offset:
 ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile min i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -423,7 +423,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset:
 ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -434,7 +434,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64_offset:
 ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -445,7 +445,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i64:
 ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -454,7 +454,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i64_ret:
 ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile min i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -463,7 +463,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i64_addr64:
 ; GCN: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -473,7 +473,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i64_ret_addr64:
 ; GCN: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_min_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile min i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -483,7 +483,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i64_offset:
 ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -493,7 +493,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i64_ret_offset:
 ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -503,7 +503,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i64_addr64_offset:
 ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -514,7 +514,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64_offset:
 ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -525,7 +525,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i64:
 ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -534,7 +534,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i64_ret:
 ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -543,7 +543,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i64_addr64:
 ; GCN: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -553,7 +553,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i64_ret_addr64:
 ; GCN: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_umin_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile umin i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -563,7 +563,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i64_offset:
 ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -573,7 +573,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i64_ret_offset:
 ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile or i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -583,7 +583,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset:
 ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -594,7 +594,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64_offset:
 ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -605,7 +605,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i64:
 ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -614,7 +614,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i64_ret:
 ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile or i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -623,7 +623,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i64_addr64:
 ; GCN: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -633,7 +633,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i64_ret_addr64:
 ; GCN: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_or_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile or i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -643,7 +643,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i64_offset:
 ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xchg_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -653,7 +653,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset:
 ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -663,7 +663,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset:
 ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -674,7 +674,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64_offset:
 ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -685,7 +685,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i64:
 ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xchg_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -694,7 +694,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret:
 ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -703,7 +703,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64:
 ; GCN: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -713,7 +713,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_addr64:
 ; GCN: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]],  v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile xchg i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -723,7 +723,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i64_offset:
 ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64_offset(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -733,7 +733,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i64_ret_offset:
 ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %gep, i64 %in seq_cst
@@ -743,7 +743,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset:
 ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -754,7 +754,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64_offset:
 ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -765,7 +765,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i64:
 ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64(i64 addrspace(4)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64(i64 addrspace(4)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst
   ret void
@@ -774,7 +774,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i64_ret:
 ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(4)* %out2
@@ -783,7 +783,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i64_addr64:
 ; GCN: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -793,7 +793,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i64_ret_addr64:
 ; GCN: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_xor_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %tmp0 = atomicrmw volatile xor i64 addrspace(4)* %ptr, i64 %in seq_cst
@@ -804,7 +804,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_load_i64_offset:
 ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_load_i64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %in, i64 4
   %val = load atomic i64, i64 addrspace(4)* %gep  seq_cst, align 8
@@ -815,7 +815,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_load_i64:
 ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i64(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_load_i64(i64 addrspace(4)* %in, i64 addrspace(4)* %out) {
 entry:
   %val = load atomic i64, i64 addrspace(4)* %in seq_cst, align 8
   store i64 %val, i64 addrspace(4)* %out
@@ -825,7 +825,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_load_i64_addr64_offset:
 ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i64_addr64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %in, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -837,7 +837,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_load_i64_addr64:
 ; GCN: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RET]]
-define void @atomic_load_i64_addr64(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i64_addr64(i64 addrspace(4)* %in, i64 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %in, i64 %index
   %val = load atomic i64, i64 addrspace(4)* %ptr seq_cst, align 8
@@ -847,7 +847,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_store_i64_offset:
 ; GCN: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_offset(i64 %in, i64 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(4)* %out) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   store atomic i64 %in, i64 addrspace(4)* %gep  seq_cst, align 8
@@ -856,7 +856,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_store_i64:
 ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] glc
-define void @atomic_store_i64(i64 %in, i64 addrspace(4)* %out) {
+define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(4)* %out) {
 entry:
   store atomic i64 %in, i64 addrspace(4)* %out seq_cst, align 8
   ret void
@@ -864,7 +864,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_store_i64_addr64_offset:
 ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -874,7 +874,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_store_i64_addr64:
 ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_addr64(i64 %in, i64 addrspace(4)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(4)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   store atomic i64 %in, i64 addrspace(4)* %ptr seq_cst, align 8
@@ -883,7 +883,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_offset:
 ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_offset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -892,7 +892,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_soffset:
 ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_soffset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64 addrspace(4)* %out, i64 %in, i64 %old) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 9000
   %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -902,7 +902,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_offset:
 ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
 entry:
   %gep = getelementptr i64, i64 addrspace(4)* %out, i64 4
   %val = cmpxchg volatile i64 addrspace(4)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -913,7 +913,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64_offset:
 ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -924,7 +924,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64_offset:
 ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i64 4
@@ -936,7 +936,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64:
 ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64(i64 addrspace(4)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64(i64 addrspace(4)* %out, i64 %in, i64 %old) {
 entry:
   %val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst
   ret void
@@ -945,7 +945,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret:
 ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %old) {
 entry:
   %val = cmpxchg volatile i64 addrspace(4)* %out, i64 %old, i64 %in seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
@@ -955,7 +955,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_addr64:
 ; GCN: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64 addrspace(4)* %out, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst
@@ -965,7 +965,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_cmpxchg_i64_ret_addr64:
 ; GCN: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %out2, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(4)* %out, i64 %index
   %val = cmpxchg volatile i64 addrspace(4)* %ptr, i64 %old, i64 %in seq_cst seq_cst
diff --git a/test/CodeGen/AMDGPU/fma-combine.ll b/test/CodeGen/AMDGPU/fma-combine.ll
index ab1b6d96a8d8f77f83e84ccc0a5e1afa2befbc25..4113ba8dc1f0798ff1c61f8355bb0fc46c654fc1 100644
--- a/test/CodeGen/AMDGPU/fma-combine.ll
+++ b/test/CodeGen/AMDGPU/fma-combine.ll
@@ -1,6 +1,6 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI-NOFMA -check-prefix=SI-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast -enable-no-infs-fp-math -enable-unsafe-fp-math -mattr=+fp32-denormals < %s | FileCheck -check-prefix=SI-FMA -check-prefix=SI-UNSAFE -check-prefix=SI -check-prefix=FUNC %s
 
 ; Note: The SI-FMA conversions of type x * (y + 1) --> x * y + x would be
 ; beneficial even without fp32 denormals, but they do require no-infs-fp-math
@@ -18,7 +18,7 @@ declare float @llvm.fma.f32(float, float, float) #0
 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -46,7 +46,7 @@ define void @combine_to_fma_f64_0(double addrspace(1)* noalias %out, double addr
 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 ; SI: s_endpgm
-define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -75,7 +75,7 @@ define void @combine_to_fma_f64_0_2use(double addrspace(1)* noalias %out, double
 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[C]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -99,7 +99,7 @@ define void @combine_to_fma_f64_1(double addrspace(1)* noalias %out, double addr
 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], -[[C]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -127,7 +127,7 @@ define void @combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double
 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 ; SI: s_endpgm
-define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -156,7 +156,7 @@ define void @combine_to_fma_fsub_f64_0_2use(double addrspace(1)* noalias %out, d
 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], [[C]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -184,7 +184,7 @@ define void @combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double
 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 ; SI: s_endpgm
-define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -213,7 +213,7 @@ define void @combine_to_fma_fsub_1_f64_2use(double addrspace(1)* noalias %out, d
 ; SI-DAG: buffer_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 ; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]], [[B]], -[[C]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -242,7 +242,7 @@ define void @combine_to_fma_fsub_2_f64(double addrspace(1)* noalias %out, double
 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 ; SI: s_endpgm
-define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -276,7 +276,7 @@ define void @combine_to_fma_fsub_2_f64_2uses_neg(double addrspace(1)* noalias %o
 ; SI-DAG: buffer_store_dwordx2 [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dwordx2 [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 ; SI: s_endpgm
-define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -308,10 +308,16 @@ define void @combine_to_fma_fsub_2_f64_2uses_mul(double addrspace(1)* noalias %o
 ; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 ; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
 ; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
-; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
-; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
+
+; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
+; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[TMP0]]
+; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[TMP1]], -[[Z]]
+
+; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]], -[[Z]]
+; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], [[Y]], [[FMA0]]
+
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -343,10 +349,16 @@ define void @aggressive_combine_to_fma_fsub_0_f64(double addrspace(1)* noalias %
 ; SI-DAG: buffer_load_dwordx2 [[Z:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 ; SI-DAG: buffer_load_dwordx2 [[U:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24{{$}}
 ; SI-DAG: buffer_load_dwordx2 [[V:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:32{{$}}
-; SI: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
-; SI: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
+
+; SI-SAFE: v_mul_f64 [[TMP0:v\[[0-9]+:[0-9]+\]]], [[U]], [[V]]
+; SI-SAFE: v_fma_f64 [[TMP1:v\[[0-9]+:[0-9]+\]]], [[Y]], [[Z]], [[TMP0]]
+; SI-SAFE: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[X]], -[[TMP1]]
+
+; SI-UNSAFE: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], -[[U]], [[V]], [[X]]
+; SI-UNSAFE: v_fma_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[Y]], [[Z]], [[FMA0]]
+
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -378,7 +390,7 @@ define void @aggressive_combine_to_fma_fsub_1_f64(double addrspace(1)* noalias %
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
   %x = load volatile float, float addrspace(1)* %in1
@@ -394,7 +406,7 @@ define void @test_f32_mul_add_x_one_y(float addrspace(1)* %out,
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
   %x = load volatile float, float addrspace(1)* %in1
@@ -410,7 +422,7 @@ define void @test_f32_mul_y_add_x_one(float addrspace(1)* %out,
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
                                            float addrspace(1)* %in1,
                                            float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -426,7 +438,7 @@ define void @test_f32_mul_add_x_negone_y(float addrspace(1)* %out,
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
                                            float addrspace(1)* %in1,
                                            float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -442,7 +454,7 @@ define void @test_f32_mul_y_add_x_negone(float addrspace(1)* %out,
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -458,7 +470,7 @@ define void @test_f32_mul_sub_one_x_y(float addrspace(1)* %out,
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -474,7 +486,7 @@ define void @test_f32_mul_y_sub_one_x(float addrspace(1)* %out,
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
                                            float addrspace(1)* %in1,
                                            float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -490,7 +502,7 @@ define void @test_f32_mul_sub_negone_x_y(float addrspace(1)* %out,
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, -[[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -506,7 +518,7 @@ define void @test_f32_mul_y_sub_negone_x(float addrspace(1)* %out,
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
                                         float addrspace(1)* %in1,
                                         float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -522,7 +534,7 @@ define void @test_f32_mul_sub_x_one_y(float addrspace(1)* %out,
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], -[[VY:v[0-9]]]
-define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
                                       float addrspace(1)* %in1,
                                       float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -538,7 +550,7 @@ define void @test_f32_mul_y_sub_x_one(float addrspace(1)* %out,
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VY:v[0-9]]], [[VS]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -554,7 +566,7 @@ define void @test_f32_mul_sub_x_negone_y(float addrspace(1)* %out,
 ; SI-NOFMA: v_mul_f32_e32 {{v[0-9]}}, [[VS]], [[VY:v[0-9]]]
 ;
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VY:v[0-9]]], [[VY:v[0-9]]]
-define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
                                          float addrspace(1)* %in1,
                                          float addrspace(1)* %in2) {
   %x = load float, float addrspace(1)* %in1
@@ -576,7 +588,7 @@ define void @test_f32_mul_y_sub_x_negone(float addrspace(1)* %out,
 ;
 ; SI-FMA: v_fma_f32 [[VR:v[0-9]]], -[[VT:v[0-9]]], [[VY:v[0-9]]], [[VY]]
 ; SI-FMA: v_fma_f32 {{v[0-9]}}, [[VX:v[0-9]]], [[VT]], [[VR]]
-define void @test_f32_interp(float addrspace(1)* %out,
+define amdgpu_kernel void @test_f32_interp(float addrspace(1)* %out,
                              float addrspace(1)* %in1,
                              float addrspace(1)* %in2,
                              float addrspace(1)* %in3) {
@@ -598,7 +610,7 @@ define void @test_f32_interp(float addrspace(1)* %out,
 ;
 ; SI-FMA: v_fma_f64 [[VR:v\[[0-9]+:[0-9]+\]]], -[[VT:v\[[0-9]+:[0-9]+\]]], [[VY:v\[[0-9]+:[0-9]+\]]], [[VY]]
 ; SI-FMA: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, [[VX:v\[[0-9]+:[0-9]+\]]], [[VT]], [[VR]]
-define void @test_f64_interp(double addrspace(1)* %out,
+define amdgpu_kernel void @test_f64_interp(double addrspace(1)* %out,
                              double addrspace(1)* %in1,
                              double addrspace(1)* %in2,
                              double addrspace(1)* %in3) {
diff --git a/test/CodeGen/AMDGPU/fma.f64.ll b/test/CodeGen/AMDGPU/fma.f64.ll
index cf6d7d8249926de07d7dc67b46aaab3e5f7150f0..4d3f3712621ef86461c10a06f99bfa7c8bb8bdd8 100644
--- a/test/CodeGen/AMDGPU/fma.f64.ll
+++ b/test/CodeGen/AMDGPU/fma.f64.ll
@@ -8,7 +8,7 @@ declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) n
 
 ; FUNC-LABEL: {{^}}fma_f64:
 ; SI: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                      double addrspace(1)* %in2, double addrspace(1)* %in3) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -21,7 +21,7 @@ define void @fma_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 ; FUNC-LABEL: {{^}}fma_v2f64:
 ; SI: v_fma_f64
 ; SI: v_fma_f64
-define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
                        <2 x double> addrspace(1)* %in2, <2 x double> addrspace(1)* %in3) {
    %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1
    %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2
@@ -36,7 +36,7 @@ define void @fma_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1
 ; SI: v_fma_f64
 ; SI: v_fma_f64
 ; SI: v_fma_f64
-define void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @fma_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
                        <4 x double> addrspace(1)* %in2, <4 x double> addrspace(1)* %in3) {
    %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1
    %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2
diff --git a/test/CodeGen/AMDGPU/fma.ll b/test/CodeGen/AMDGPU/fma.ll
index d04a5946b98cd09d1525b05bb169c3bfb4bb976e..659cecb59ebf7c1a1fd70c07cdc22268479ffb34 100644
--- a/test/CodeGen/AMDGPU/fma.ll
+++ b/test/CodeGen/AMDGPU/fma.ll
@@ -12,7 +12,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
 ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]\.[XYZW]]], {{T[0-9]\.[XYZW]}},
 ; EG: FMA {{\*? *}}[[RES]]
-define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                      float addrspace(1)* %in2, float addrspace(1)* %in3) {
   %r0 = load float, float addrspace(1)* %in1
   %r1 = load float, float addrspace(1)* %in2
@@ -29,7 +29,7 @@ define void @fma_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
 ; EG: MEM_RAT_{{.*}} STORE_{{.*}} [[RES:T[0-9]]].[[CHLO:[XYZW]]][[CHHI:[XYZW]]], {{T[0-9]\.[XYZW]}},
 ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHLO]]
 ; EG-DAG: FMA {{\*? *}}[[RES]].[[CHHI]]
-define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
+define amdgpu_kernel void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
                        <2 x float> addrspace(1)* %in2, <2 x float> addrspace(1)* %in3) {
   %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1
   %r1 = load <2 x float>, <2 x float> addrspace(1)* %in2
@@ -50,7 +50,7 @@ define void @fma_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)*
 ; EG-DAG: FMA {{\*? *}}[[RES]].Y
 ; EG-DAG: FMA {{\*? *}}[[RES]].Z
 ; EG-DAG: FMA {{\*? *}}[[RES]].W
-define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
+define amdgpu_kernel void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
                        <4 x float> addrspace(1)* %in2, <4 x float> addrspace(1)* %in3) {
   %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1
   %r1 = load <4 x float>, <4 x float> addrspace(1)* %in2
@@ -62,7 +62,7 @@ define void @fma_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)*
 
 ; FUNC-LABEL: @fma_commute_mul_inline_imm_f32
 ; SI: v_fma_f32 {{v[0-9]+}}, {{v[0-9]+}}, 2.0, {{v[0-9]+}}
-define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+define amdgpu_kernel void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
   %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
   %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
   %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@@ -77,7 +77,7 @@ define void @fma_commute_mul_inline_imm_f32(float addrspace(1)* noalias %out, fl
 }
 
 ; FUNC-LABEL: @fma_commute_mul_s_f32
-define void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind {
+define amdgpu_kernel void @fma_commute_mul_s_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b, float %b) nounwind {
   %tid = tail call i32 @llvm.r600.read.tidig.x() nounwind readnone
   %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
   %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
diff --git a/test/CodeGen/AMDGPU/fmax3.f64.ll b/test/CodeGen/AMDGPU/fmax3.f64.ll
index 4d42a4630e22698676d913ee9bb37bd5bc7ec61c..8b9104b79e7f10336d923e2414b40da0d92a0647 100644
--- a/test/CodeGen/AMDGPU/fmax3.f64.ll
+++ b/test/CodeGen/AMDGPU/fmax3.f64.ll
@@ -11,7 +11,7 @@ declare double @llvm.maxnum.f64(double, double) nounwind readnone
 ; SI: v_max_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[REGA]], [[REGC]]
 ; SI: buffer_store_dwordx2 [[RESULT]],
 ; SI: s_endpgm
-define void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
+define amdgpu_kernel void @test_fmax3_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
   %bptr = getelementptr double, double addrspace(1)* %aptr, i32 1
   %cptr = getelementptr double, double addrspace(1)* %aptr, i32 2
   %a = load volatile double, double addrspace(1)* %aptr, align 8
diff --git a/test/CodeGen/AMDGPU/fmax3.ll b/test/CodeGen/AMDGPU/fmax3.ll
index 7c01ca85f6b9d8fe4d0497eec80fb8bc90d25083..a96eb5db9e2a25dd0aeb80049191e319d6486c6b 100644
--- a/test/CodeGen/AMDGPU/fmax3.ll
+++ b/test/CodeGen/AMDGPU/fmax3.ll
@@ -10,7 +10,7 @@ declare float @llvm.maxnum.f32(float, float) nounwind readnone
 ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
   %a = load volatile  float, float addrspace(1)* %aptr, align 4
   %b = load volatile float, float addrspace(1)* %bptr, align 4
   %c = load volatile float, float addrspace(1)* %cptr, align 4
@@ -28,7 +28,7 @@ define void @test_fmax3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt
 ; SI: v_max3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @test_fmax3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
   %a = load volatile float, float addrspace(1)* %aptr, align 4
   %b = load volatile float, float addrspace(1)* %bptr, align 4
   %c = load volatile float, float addrspace(1)* %cptr, align 4
diff --git a/test/CodeGen/AMDGPU/fmax_legacy.f64.ll b/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
index da498caa6b54ed0d58f5f9f69c42203364da8529..083346e9d1cbd356925706ab87d95f5ecbe5ddf7 100644
--- a/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
+++ b/test/CodeGen/AMDGPU/fmax_legacy.f64.ll
@@ -4,7 +4,7 @@
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 ; FUNC-LABEL: @test_fmax_legacy_uge_f64
-define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -19,7 +19,7 @@ define void @test_fmax_legacy_uge_f64(double addrspace(1)* %out, double addrspac
 }
 
 ; FUNC-LABEL: @test_fmax_legacy_oge_f64
-define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -34,7 +34,7 @@ define void @test_fmax_legacy_oge_f64(double addrspace(1)* %out, double addrspac
 }
 
 ; FUNC-LABEL: @test_fmax_legacy_ugt_f64
-define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -49,7 +49,7 @@ define void @test_fmax_legacy_ugt_f64(double addrspace(1)* %out, double addrspac
 }
 
 ; FUNC-LABEL: @test_fmax_legacy_ogt_f64
-define void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/fmax_legacy.ll b/test/CodeGen/AMDGPU/fmax_legacy.ll
index 4a4c92a38a35be1042c9b9447fafc658b318f5ec..7643c3ea533ce59e1ee32984772b605787d4dfb3 100644
--- a/test/CodeGen/AMDGPU/fmax_legacy.ll
+++ b/test/CodeGen/AMDGPU/fmax_legacy.ll
@@ -13,7 +13,7 @@ declare i32 @llvm.r600.read.tidig.x() #1
 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 
 ; EG: MAX
-define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -33,7 +33,7 @@ define void @test_fmax_legacy_uge_f32(float addrspace(1)* %out, float addrspace(
 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; EG: MAX
-define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -53,7 +53,7 @@ define void @test_fmax_legacy_oge_f32(float addrspace(1)* %out, float addrspace(
 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; EG: MAX
-define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -73,7 +73,7 @@ define void @test_fmax_legacy_ugt_f32(float addrspace(1)* %out, float addrspace(
 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; EG: MAX
-define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -93,7 +93,7 @@ define void @test_fmax_legacy_ogt_f32(float addrspace(1)* %out, float addrspace(
 ; SI-SAFE: v_max_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; SI-NONAN: v_max_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; EG: MAX
-define void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1
@@ -114,7 +114,7 @@ define void @test_fmax_legacy_ogt_v1f32(<1 x float> addrspace(1)* %out, <1 x flo
 ; SI-NONAN: v_max_f32_e32
 ; SI-NONAN: v_max_f32_e32
 ; SI-NONAN: v_max_f32_e32
-define void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1
@@ -137,7 +137,7 @@ define void @test_fmax_legacy_ogt_v3f32(<3 x float> addrspace(1)* %out, <3 x flo
 ; SI-NOT: v_max_
 
 ; EG: MAX
-define void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmax_legacy_ogt_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/fmaxnum.f64.ll b/test/CodeGen/AMDGPU/fmaxnum.f64.ll
index fec3a358a4fae64c082310059349ed9c66c63e8a..20af278bf98c3409d5851bddcf7f350fa2f819e8 100644
--- a/test/CodeGen/AMDGPU/fmaxnum.f64.ll
+++ b/test/CodeGen/AMDGPU/fmaxnum.f64.ll
@@ -9,7 +9,7 @@ declare <16 x double> @llvm.maxnum.v16f64(<16 x double>, <16 x double>) #0
 
 ; FUNC-LABEL: @test_fmax_f64
 ; SI: v_max_f64
-define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
+define amdgpu_kernel void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
   %val = call double @llvm.maxnum.f64(double %a, double %b) #0
   store double %val, double addrspace(1)* %out, align 8
   ret void
@@ -18,7 +18,7 @@ define void @test_fmax_f64(double addrspace(1)* %out, double %a, double %b) noun
 ; FUNC-LABEL: @test_fmax_v2f64
 ; SI: v_max_f64
 ; SI: v_max_f64
-define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
   %val = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %a, <2 x double> %b) #0
   store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16
   ret void
@@ -29,7 +29,7 @@ define void @test_fmax_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <
 ; SI: v_max_f64
 ; SI: v_max_f64
 ; SI: v_max_f64
-define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
   %val = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %a, <4 x double> %b) #0
   store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32
   ret void
@@ -44,7 +44,7 @@ define void @test_fmax_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <
 ; SI: v_max_f64
 ; SI: v_max_f64
 ; SI: v_max_f64
-define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
   %val = call <8 x double> @llvm.maxnum.v8f64(<8 x double> %a, <8 x double> %b) #0
   store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64
   ret void
@@ -67,7 +67,7 @@ define void @test_fmax_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <
 ; SI: v_max_f64
 ; SI: v_max_f64
 ; SI: v_max_f64
-define void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
   %val = call <16 x double> @llvm.maxnum.v16f64(<16 x double> %a, <16 x double> %b) #0
   store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128
   ret void
diff --git a/test/CodeGen/AMDGPU/fmaxnum.ll b/test/CodeGen/AMDGPU/fmaxnum.ll
index 4058247a6da98832bdfe8743f170f6761fdc1a6f..277b8ce04c4ea8d80feb6d587588340814ae667c 100644
--- a/test/CodeGen/AMDGPU/fmaxnum.ll
+++ b/test/CodeGen/AMDGPU/fmaxnum.ll
@@ -14,7 +14,7 @@ declare double @llvm.maxnum.f64(double, double)
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MAX_DX10 {{.*}}[[OUT]]
-define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
   %val = call float @llvm.maxnum.f32(float %a, float %b) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -27,7 +27,7 @@ define void @test_fmax_f32(float addrspace(1)* %out, float %a, float %b) nounwin
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
 ; EG: MAX_DX10 {{.*}}[[OUT]]
 ; EG: MAX_DX10 {{.*}}[[OUT]]
-define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
   %val = call <2 x float> @llvm.maxnum.v2f32(<2 x float> %a, <2 x float> %b) #0
   store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
   ret void
@@ -44,7 +44,7 @@ define void @test_fmax_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2
 ; EG: MAX_DX10 {{.*}}[[OUT]]
 ; EG: MAX_DX10 {{.*}}[[OUT]]
 ; EG: MAX_DX10 {{.*}}[[OUT]]
-define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
   %val = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %a, <4 x float> %b) #0
   store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
   ret void
@@ -70,7 +70,7 @@ define void @test_fmax_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4
 ; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Y
 ; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].Z
 ; EG-DAG: MAX_DX10 {{.*}}[[OUT2]].W
-define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
   %val = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %a, <8 x float> %b) #0
   store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
   ret void
@@ -114,7 +114,7 @@ define void @test_fmax_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8
 ; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Y
 ; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].Z
 ; EG-DAG: MAX_DX10 {{.*}}[[OUT4]].W
-define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
   %val = call <16 x float> @llvm.maxnum.v16f32(<16 x float> %a, <16 x float> %b) #0
   store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
   ret void
@@ -128,7 +128,7 @@ define void @test_fmax_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a,
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 1.0, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -143,7 +143,7 @@ define void @constant_fold_fmax_f32(float addrspace(1)* %out) nounwind {
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 ; EG: 2143289344(nan)
-define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -157,7 +157,7 @@ define void @constant_fold_fmax_f32_nan_nan(float addrspace(1)* %out) nounwind {
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 1.0, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -171,7 +171,7 @@ define void @constant_fold_fmax_f32_val_nan(float addrspace(1)* %out) nounwind {
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0x7FF8000000000000, float 1.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -185,7 +185,7 @@ define void @constant_fold_fmax_f32_nan_val(float addrspace(1)* %out) nounwind {
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -199,7 +199,7 @@ define void @constant_fold_fmax_f32_p0_p0(float addrspace(1)* %out) nounwind {
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float 0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -213,7 +213,7 @@ define void @constant_fold_fmax_f32_p0_n0(float addrspace(1)* %out) nounwind {
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float -0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -227,7 +227,7 @@ define void @constant_fold_fmax_f32_n0_p0(float addrspace(1)* %out) nounwind {
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.maxnum.f32(float -0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -239,7 +239,7 @@ define void @constant_fold_fmax_f32_n0_n0(float addrspace(1)* %out) nounwind {
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MAX_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float %a, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -250,7 +250,7 @@ define void @fmax_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float 2.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -262,7 +262,7 @@ define void @fmax_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float %a, float 99.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -274,7 +274,7 @@ define void @fmax_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MAX_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmax_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.maxnum.f32(float 99.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/fmed3.ll b/test/CodeGen/AMDGPU/fmed3.ll
index 44889c9c472199d3c22b447064a8b4fc21a33cf1..d2cfc713ed37c6dd5004c372e08734235c01c34f 100644
--- a/test/CodeGen/AMDGPU/fmed3.ll
+++ b/test/CodeGen/AMDGPU/fmed3.ll
@@ -1,16 +1,15 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=NOSNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp-exceptions -verify-machineinstrs < %s | FileCheck -check-prefix=SNAN -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
 
-declare i32 @llvm.amdgcn.workitem.id.x() #0
-declare float @llvm.minnum.f32(float, float) #0
-declare float @llvm.maxnum.f32(float, float) #0
-declare double @llvm.minnum.f64(double, double) #0
-declare double @llvm.maxnum.f64(double, double) #0
 
 ; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f32:
 ; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v{{[0-9]+}}
 ; GCN: v_med3_f32 v{{[0-9]+}}, [[ADD]], 2.0, 4.0
-define void @v_test_nnan_input_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -28,7 +27,7 @@ define void @v_test_nnan_input_fmed3_r_i_i_f32(float addrspace(1)* %out, float a
 
 ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -46,7 +45,7 @@ define void @v_test_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)
 
 ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -64,7 +63,7 @@ define void @v_test_fmed3_r_i_i_commute0_f32(float addrspace(1)* %out, float add
 
 ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -80,7 +79,7 @@ define void @v_test_fmed3_r_i_i_commute1_f32(float addrspace(1)* %out, float add
 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_constant_order_f32:
 ; GCN: v_max_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
 ; GCN: v_min_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -97,7 +96,7 @@ define void @v_test_fmed3_r_i_i_constant_order_f32(float addrspace(1)* %out, flo
 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_multi_use_f32:
 ; GCN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 ; GCN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -114,7 +113,7 @@ define void @v_test_fmed3_r_i_i_multi_use_f32(float addrspace(1)* %out, float ad
 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_f64:
 ; GCN: v_max_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 2.0
 ; GCN: v_min_f64 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, 4.0
-define void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr double, double addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr double, double addrspace(1)* %out, i32 %tid
@@ -129,7 +128,7 @@ define void @v_test_fmed3_r_i_i_f64(double addrspace(1)* %out, double addrspace(
 
 ; GCN-LABEL: {{^}}v_test_fmed3_r_i_i_no_nans_f32:
 ; GCN: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
-define void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
+define amdgpu_kernel void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #2 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -147,7 +146,7 @@ define void @v_test_fmed3_r_i_i_no_nans_f32(float addrspace(1)* %out, float addr
 
 ; SNAN: v_max_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
 ; SNAN: v_min_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
-define void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
+define amdgpu_kernel void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrspace(1)* %aptr) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -165,6 +164,812 @@ define void @v_test_legacy_fmed3_r_i_i_f32(float addrspace(1)* %out, float addrs
   ret void
 }
 
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod0:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], [[B]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %a.fneg = fsub float -0.0, %a ; negated a; the check expects it as a neg modifier on the first med3 operand
+  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b)
+  %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b)
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; max(min(-a, b), min(max(-a, b), c))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod1:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], -[[B]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %b.fneg = fsub float -0.0, %b ; negated b; the check expects it as a neg modifier on the second med3 operand
+  %tmp0 = call float @llvm.minnum.f32(float %a, float %b.fneg)
+  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b.fneg)
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; max(min(a, -b), min(max(a, -b), c))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod2:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], -[[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %c.fneg = fsub float -0.0, %c ; negated c; the check expects it as a neg modifier on the third med3 operand
+  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fneg)
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; max(min(a, b), min(max(a, b), -c))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod012:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, -[[A]], |[[B]]|, -|[[C]]|
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+
+  %a.fneg = fsub float -0.0, %a ; operand 0: neg only
+  %b.fabs = call float @llvm.fabs.f32(float %b) ; operand 1: abs only
+  %c.fabs = call float @llvm.fabs.f32(float %c)
+  %c.fabs.fneg = fsub float -0.0, %c.fabs ; operand 2: neg of abs
+
+  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b.fabs)
+  %tmp1 = call float @llvm.maxnum.f32(float %a.fneg, float %b.fabs)
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; max(min(-a, |b|), min(max(-a, |b|), -|c|))
+
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_negabs012:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, -|[[A]]|, -|[[B]]|, -|[[C]]|
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_negabs012(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+
+  %a.fabs = call float @llvm.fabs.f32(float %a)
+  %a.fabs.fneg = fsub float -0.0, %a.fabs ; every operand is neg of abs; the check expects both modifiers on all three sources
+  %b.fabs = call float @llvm.fabs.f32(float %b)
+  %b.fabs.fneg = fsub float -0.0, %b.fabs
+  %c.fabs = call float @llvm.fabs.f32(float %c)
+  %c.fabs.fneg = fsub float -0.0, %c.fabs
+
+  %tmp0 = call float @llvm.minnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
+  %tmp1 = call float @llvm.maxnum.f32(float %a.fabs.fneg, float %b.fabs.fneg)
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.fabs.fneg)
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; max(min(-|a|, -|b|), min(max(-|a|, -|b|), -|c|))
+
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_nnan_inputs_med3_f32_pat0:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN-DAG: v_add_f32_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]]
+; GCN-DAG: v_add_f32_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]]
+; GCN-DAG: v_add_f32_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]]
+define amdgpu_kernel void @v_nnan_inputs_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+
+  %a.nnan = fadd nnan float %a, 1.0 ; nnan is on the producers only; this function uses #1, yet the check still expects v_med3_f32
+  %b.nnan = fadd nnan float %b, 2.0
+  %c.nnan = fadd nnan float %c, 4.0
+
+  %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan)
+  %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan)
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan)
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; max(min(a', b'), min(max(a', b'), c')) on the fadd results
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; 16 combinations of operand order for the med3 idiom (x = %a, y = %b, z = %c)
+
+; 0: max(min(x, y), min(max(x, y), z))
+; 1: max(min(x, y), min(max(y, x), z))
+; 2: max(min(x, y), min(z, max(x, y)))
+; 3: max(min(x, y), min(z, max(y, x)))
+; 4: max(min(y, x), min(max(x, y), z))
+; 5: max(min(y, x), min(max(y, x), z))
+; 6: max(min(y, x), min(z, max(x, y)))
+; 7: max(min(y, x), min(z, max(y, x)))
+;
+; 8-15: forms 0-7 with the operands of the outermost max commuted
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; form 0: max(min(x, y), min(max(x, y), z))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat1:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; form 1: max(min(x, y), min(max(y, x), z))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat2:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; form 2: max(min(x, y), min(z, max(x, y)))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat3:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat3(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
+  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; form 3: max(min(x, y), min(z, max(y, x)))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat4:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat4(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %tmp0 = call float @llvm.minnum.f32(float %b, float %a) ; min(y, x)
+  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) ; max(x, y): operand order deliberately differs from the min
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) ; max result first, z second
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; form 4: max(min(y, x), min(max(x, y), z))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat5:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat5(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
+  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; form 5: max(min(y, x), min(max(y, x), z))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat6:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat6(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
+  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; form 6: max(min(y, x), min(z, max(x, y)))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat7:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat7(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
+  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
+  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; form 7: max(min(y, x), min(z, max(y, x)))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat8:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat8(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) ; form 0 with the outermost max commuted: max(min(max(x, y), z), min(x, y))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat9:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat9(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) ; form 1 with the outermost max commuted: max(min(max(y, x), z), min(x, y))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat10:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat10(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
+  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) ; form 2 with the outermost max commuted: max(min(z, max(x, y)), min(x, y))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat11:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat11(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
+  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
+  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) ; form 3 with the outermost max commuted: max(min(z, max(y, x)), min(x, y))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat12:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat12(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %tmp0 = call float @llvm.minnum.f32(float %b, float %a) ; min(y, x)
+  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) ; max(x, y): operand order deliberately differs from the min
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) ; max result first, z second
+  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) ; form 4 with the outermost max commuted: max(min(max(x, y), z), min(y, x))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat13:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat13(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
+  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) ; form 5 with the outermost max commuted: max(min(max(y, x), z), min(y, x))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat14:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[A]], [[B]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat14(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
+  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
+  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) ; form 6 with the outermost max commuted: max(min(z, max(x, y)), min(y, x))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat15:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_med3_f32 v{{[0-9]+}}, [[B]], [[A]], [[C]]
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat15(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %tmp0 = call float @llvm.minnum.f32(float %b, float %a)
+  %tmp1 = call float @llvm.maxnum.f32(float %b, float %a)
+  %tmp2 = call float @llvm.minnum.f32(float %c, float %tmp1)
+  %med3 = call float @llvm.maxnum.f32(float %tmp2, float %tmp0) ; form 7 with the outermost max commuted: max(min(z, max(y, x)), min(y, x))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; ---------------------------------------------------------------------
+; Negative patterns
+; ---------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use0:
+; GCN-DAG: v_min_f32
+; GCN-DAG: v_max_f32
+; GCN: v_min_f32
+; GCN: v_max_f32
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+  store volatile float %tmp0, float addrspace(1)* undef ; second use of %tmp0; the checks above look for two v_min_f32/v_max_f32 pairs
+  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; same shape as form 0: max(min(a, b), min(max(a, b), c))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use1:
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %tmp0 = call float @llvm.minnum.f32(float %a, float %b)
+  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b)
+  store volatile float %tmp1, float addrspace(1)* undef ; second use of %tmp1. NOTE(review): only the label is checked for this function, no instruction checks
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c)
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; same shape as form 0: max(min(a, b), min(max(a, b), c))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0_multi_use2:
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0_multi_use2(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { ; median-of-3 pattern where %tmp2 has an extra use; only the label is checked
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0 ; a = aptr[tid]
+  %b = load volatile float, float addrspace(1)* %gep1 ; b = bptr[tid]
+  %c = load volatile float, float addrspace(1)* %gep2 ; c = cptr[tid]
+  %tmp0 = call float @llvm.minnum.f32(float %a, float %b) ; min(a, b)
+  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) ; max(a, b)
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) ; min(max(a, b), c)
+  store volatile float %tmp2, float addrspace(1)* undef ; second use of %tmp2, outside the median pattern
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; max(min(a, b), min(max(a, b), c))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}v_test_safe_med3_f32_pat0:
+define amdgpu_kernel void @v_test_safe_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { ; plain median-of-3 pattern under attribute set #1 (no-nans-fp-math is false); only the label is checked
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0 ; a = aptr[tid]
+  %b = load volatile float, float addrspace(1)* %gep1 ; b = bptr[tid]
+  %c = load volatile float, float addrspace(1)* %gep2 ; c = cptr[tid]
+  %tmp0 = call float @llvm.minnum.f32(float %a, float %b) ; min(a, b)
+  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) ; max(a, b)
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) ; min(max(a, b), c)
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; max(min(a, b), min(max(a, b), c))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_nnan_inputs_missing0_med3_f32_pat0:
+define amdgpu_kernel void @v_nnan_inputs_missing0_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { ; median-of-3 pattern where the first input lacks nnan; only the label is checked
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+
+  %a.nnan = fadd float %a, 1.0 ; despite the name, this fadd carries no nnan flag
+  %b.nnan = fadd nnan float %b, 2.0
+  %c.nnan = fadd nnan float %c, 4.0
+
+  %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan) ; min(a', b')
+  %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan) ; max(a', b')
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan) ; min(max(a', b'), c')
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; max(min(a', b'), min(max(a', b'), c'))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_nnan_inputs_missing1_med3_f32_pat0:
+define amdgpu_kernel void @v_nnan_inputs_missing1_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { ; median-of-3 pattern where the second input lacks nnan; only the label is checked
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+
+  %a.nnan = fadd nnan float %a, 1.0
+  %b.nnan = fadd float %b, 2.0 ; despite the name, this fadd carries no nnan flag
+  %c.nnan = fadd nnan float %c, 4.0
+
+  %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan) ; min(a', b')
+  %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan) ; max(a', b')
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan) ; min(max(a', b'), c')
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; max(min(a', b'), min(max(a', b'), c'))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_nnan_inputs_missing2_med3_f32_pat0:
+define amdgpu_kernel void @v_nnan_inputs_missing2_med3_f32_pat0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #1 { ; median-of-3 pattern where the third input lacks nnan; only the label is checked
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+
+  %a.nnan = fadd nnan float %a, 1.0
+  %b.nnan = fadd nnan float %b, 2.0
+  %c.nnan = fadd float %c, 4.0 ; despite the name, this fadd carries no nnan flag
+
+  %tmp0 = call float @llvm.minnum.f32(float %a.nnan, float %b.nnan) ; min(a', b')
+  %tmp1 = call float @llvm.maxnum.f32(float %a.nnan, float %b.nnan) ; max(a', b')
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c.nnan) ; min(max(a', b'), c')
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; max(min(a', b'), min(max(a', b'), c'))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_min_f32
+; GCN: v_max_f32
+; GCN: v_min_f32
+; GCN: v_max_f32
+define amdgpu_kernel void @v_test_global_nnans_med3_f32_pat0_srcmod0_mismatch(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { ; attribute set #2 has no-nans-fp-math true; inner min and max use different first operands
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0
+  %b = load volatile float, float addrspace(1)* %gep1
+  %c = load volatile float, float addrspace(1)* %gep2
+  %a.fneg = fsub float -0.0, %a ; -a
+  %tmp0 = call float @llvm.minnum.f32(float %a.fneg, float %b) ; min(-a, b), uses the negated value
+  %tmp1 = call float @llvm.maxnum.f32(float %a, float %b) ; max(a, b), uses the plain value, so operands do not match %tmp0
+  %tmp2 = call float @llvm.minnum.f32(float %tmp1, float %c) ; min(max(a, b), c)
+  %med3 = call float @llvm.maxnum.f32(float %tmp0, float %tmp2) ; max(min(-a, b), min(max(a, b), c))
+  store float %med3, float addrspace(1)* %outgep
+  ret void
+}
+
+; A lone max followed by a min (without the full median-of-3 pattern) is not sufficient; separate max and min are expected.
+; GCN-LABEL: {{^}}v_test_global_nnans_min_max_f32:
+; GCN: {{buffer_|flat_}}load_dword [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_dword [[C:v[0-9]+]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], [[B]], [[A]]
+; GCN: v_min_f32_e32 v{{[0-9]+}}, [[C]], [[MAX]]
+define amdgpu_kernel void @v_test_global_nnans_min_max_f32(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) #2 { ; attribute set #2 has no-nans-fp-math true
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr float, float addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr float, float addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load volatile float, float addrspace(1)* %gep0 ; a = aptr[tid]
+  %b = load volatile float, float addrspace(1)* %gep1 ; b = bptr[tid]
+  %c = load volatile float, float addrspace(1)* %gep2 ; c = cptr[tid]
+  %max = call float @llvm.maxnum.f32(float %a, float %b) ; max(a, b)
+  %minmax = call float @llvm.minnum.f32(float %max, float %c) ; min(max(a, b), c)
+  store float %minmax, float addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_nnan_input_fmed3_r_i_i_f16:
+; SI: v_cvt_f32_f16
+; SI: v_add_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
+; SI: v_med3_f32 v{{[0-9]+}}, v{{[0-9]+}}, 2.0, 4.0
+; SI: v_cvt_f16_f32
+
+; VI: v_add_f16_e32 v{{[0-9]+}}, 1.0
+; VI: v_max_f16_e32 v{{[0-9]+}}, 2.0
+; VI: v_min_f16_e32 v{{[0-9]+}}, 4.0
+
+; GFX9: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0
+; GFX9: v_med3_f16 v{{[0-9]+}}, [[ADD]], 2.0, 4.0
+define amdgpu_kernel void @v_test_nnan_input_fmed3_r_i_i_f16(half addrspace(1)* %out, half addrspace(1)* %aptr) #1 { ; clamp of a nnan-flagged half value between the constants 2.0 and 4.0
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+  %outgep = getelementptr half, half addrspace(1)* %out, i32 %tid
+  %a = load half, half addrspace(1)* %gep0 ; a = aptr[tid]
+  %a.add = fadd nnan half %a, 1.0 ; the nnan flag is on the input, not on the min/max calls
+  %max = call half @llvm.maxnum.f16(half %a.add, half 2.0) ; max(a + 1.0, 2.0)
+  %med = call half @llvm.minnum.f16(half %max, half 4.0) ; min(max(a + 1.0, 2.0), 4.0)
+
+  store half %med, half addrspace(1)* %outgep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_nnan_inputs_med3_f16_pat0:
+; GCN: {{buffer_|flat_}}load_ushort [[A:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_ushort [[B:v[0-9]+]]
+; GCN: {{buffer_|flat_}}load_ushort [[C:v[0-9]+]]
+
+; SI: v_cvt_f32_f16
+; SI: v_cvt_f32_f16
+; SI: v_add_f32_e32
+; SI: v_add_f32_e32
+; SI: v_add_f32_e32
+; SI: v_med3_f32
+; SI: v_cvt_f16_f32_e32
+
+
+; GFX89-DAG: v_add_f16_e32 [[A_ADD:v[0-9]+]], 1.0, [[A]]
+; GFX89-DAG: v_add_f16_e32 [[B_ADD:v[0-9]+]], 2.0, [[B]]
+; GFX89-DAG: v_add_f16_e32 [[C_ADD:v[0-9]+]], 4.0, [[C]]
+
+; VI-DAG: v_min_f16
+; VI-DAG: v_max_f16
+; VI: v_min_f16
+; VI: v_max_f16
+
+; GFX9: v_med3_f16 v{{[0-9]+}}, [[A_ADD]], [[B_ADD]], [[C_ADD]]
+define amdgpu_kernel void @v_nnan_inputs_med3_f16_pat0(half addrspace(1)* %out, half addrspace(1)* %aptr, half addrspace(1)* %bptr, half addrspace(1)* %cptr) #1 { ; half-precision median-of-3 pattern where all three inputs carry nnan
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr half, half addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr half, half addrspace(1)* %bptr, i32 %tid
+  %gep2 = getelementptr half, half addrspace(1)* %cptr, i32 %tid
+  %outgep = getelementptr half, half addrspace(1)* %out, i32 %tid
+  %a = load volatile half, half addrspace(1)* %gep0
+  %b = load volatile half, half addrspace(1)* %gep1
+  %c = load volatile half, half addrspace(1)* %gep2
+
+  %a.nnan = fadd nnan half %a, 1.0 ; a' = a + 1.0 with nnan
+  %b.nnan = fadd nnan half %b, 2.0 ; b' = b + 2.0 with nnan
+  %c.nnan = fadd nnan half %c, 4.0 ; c' = c + 4.0 with nnan
+
+  %tmp0 = call half @llvm.minnum.f16(half %a.nnan, half %b.nnan) ; min(a', b')
+  %tmp1 = call half @llvm.maxnum.f16(half %a.nnan, half %b.nnan) ; max(a', b')
+  %tmp2 = call half @llvm.minnum.f16(half %tmp1, half %c.nnan) ; min(max(a', b'), c')
+  %med3 = call half @llvm.maxnum.f16(half %tmp0, half %tmp2) ; max(min(a', b'), min(max(a', b'), c'))
+  store half %med3, half addrspace(1)* %outgep
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+declare float @llvm.fabs.f32(float) #0
+declare float @llvm.minnum.f32(float, float) #0
+declare float @llvm.maxnum.f32(float, float) #0
+declare double @llvm.minnum.f64(double, double) #0
+declare double @llvm.maxnum.f64(double, double) #0
+declare half @llvm.fabs.f16(half) #0
+declare half @llvm.minnum.f16(half, half) #0
+declare half @llvm.maxnum.f16(half, half) #0
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="false" }
 attributes #2 = { nounwind "unsafe-fp-math"="false" "no-nans-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/fmin3.ll b/test/CodeGen/AMDGPU/fmin3.ll
index 3102ffdbdd28604c10b2b3ffdf76fe703515b7c0..3183f77f090bd4bba7778b0d5b6b3047a5ccfb72 100644
--- a/test/CodeGen/AMDGPU/fmin3.ll
+++ b/test/CodeGen/AMDGPU/fmin3.ll
@@ -11,7 +11,7 @@ declare float @llvm.minnum.f32(float, float) nounwind readnone
 ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
   %a = load volatile float, float addrspace(1)* %aptr, align 4
   %b = load volatile float, float addrspace(1)* %bptr, align 4
   %c = load volatile float, float addrspace(1)* %cptr, align 4
@@ -29,7 +29,7 @@ define void @test_fmin3_olt_0(float addrspace(1)* %out, float addrspace(1)* %apt
 ; SI: v_min3_f32 [[RESULT:v[0-9]+]], [[REGC]], [[REGB]], [[REGA]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @test_fmin3_olt_1(float addrspace(1)* %out, float addrspace(1)* %aptr, float addrspace(1)* %bptr, float addrspace(1)* %cptr) nounwind {
   %a = load volatile float, float addrspace(1)* %aptr, align 4
   %b = load volatile float, float addrspace(1)* %bptr, align 4
   %c = load volatile float, float addrspace(1)* %cptr, align 4
diff --git a/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll b/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fdfe533b3d0c8bcd9a0726978633270deca0efbd
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmin_fmax_legacy.amdgcn.ll
@@ -0,0 +1,47 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SAFE -check-prefix=GCN %s
+; RUN: llc -enable-no-nans-fp-math -enable-unsafe-fp-math -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN-NONAN -check-prefix=GCN %s
+
+; FIXME: The no-NaNs run should use a no-signed-zeros option instead of -enable-unsafe-fp-math.
+
+; GCN-LABEL: {{^}}min_fneg_select_regression_0:
+; GCN-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, -1.0
+define amdgpu_ps float @min_fneg_select_regression_0(float %a, float %b) #0 { ; %b is unused
+  %fneg.a = fsub float -0.0, %a ; -a
+  %cmp.a = fcmp ult float %a, 1.0 ; true when a < 1.0 or the operands are unordered
+  %min.a = select i1 %cmp.a, float %fneg.a, float -1.0 ; picks -a when the compare holds, otherwise -1.0
+  ret float %min.a
+}
+
+; GCN-LABEL: {{^}}min_fneg_select_regression_posk_0:
+; GCN-SAFE: v_max_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+; GCN-NONAN: v_max_f32_e64 v{{[0-9]+}}, -v0, 1.0
+define amdgpu_ps float @min_fneg_select_regression_posk_0(float %a, float %b) #0 { ; %b is unused
+  %fneg.a = fsub float -0.0, %a ; -a
+  %cmp.a = fcmp ult float %a, -1.0 ; true when a < -1.0 or the operands are unordered
+  %min.a = select i1 %cmp.a, float %fneg.a, float 1.0 ; picks -a when the compare holds, otherwise 1.0
+  ret float %min.a
+}
+
+; GCN-LABEL: {{^}}max_fneg_select_regression_0:
+; GCN-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], -1.0, -v0
+; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, -1.0
+define amdgpu_ps float @max_fneg_select_regression_0(float %a, float %b) #0 { ; %b is unused
+  %fneg.a = fsub float -0.0, %a ; -a
+  %cmp.a = fcmp ugt float %a, 1.0 ; true when a > 1.0 or the operands are unordered
+  %min.a = select i1 %cmp.a, float %fneg.a, float -1.0 ; picks -a when the compare holds, otherwise -1.0
+  ret float %min.a
+}
+
+; GCN-LABEL: {{^}}max_fneg_select_regression_posk_0:
+; GCN-SAFE: v_min_legacy_f32_e64 [[MIN:v[0-9]+]], 1.0, -v0
+; GCN-NONAN: v_min_f32_e64 [[MIN:v[0-9]+]], -v0, 1.0
+define amdgpu_ps float @max_fneg_select_regression_posk_0(float %a, float %b) #0 { ; %b is unused
+  %fneg.a = fsub float -0.0, %a ; -a
+  %cmp.a = fcmp ugt float %a, -1.0 ; true when a > -1.0 or the operands are unordered
+  %min.a = select i1 %cmp.a, float %fneg.a, float 1.0 ; picks -a when the compare holds, otherwise 1.0
+  ret float %min.a
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fmin_legacy.f64.ll b/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
index 6982ee0c0cb3fb1a74f6740e36980e40b46dcac8..99bc114831ca056d2859d5b56b3c8e260b6aded7 100644
--- a/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
+++ b/test/CodeGen/AMDGPU/fmin_legacy.f64.ll
@@ -3,7 +3,7 @@
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 ; FUNC-LABEL: @test_fmin_legacy_f64
-define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 {
+define amdgpu_kernel void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double> inreg %reg0) #0 {
    %r0 = extractelement <4 x double> %reg0, i32 0
    %r1 = extractelement <4 x double> %reg0, i32 1
    %r2 = fcmp uge double %r0, %r1
@@ -14,7 +14,7 @@ define void @test_fmin_legacy_f64(<4 x double> addrspace(1)* %out, <4 x double>
 }
 
 ; FUNC-LABEL: @test_fmin_legacy_ule_f64
-define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -29,7 +29,7 @@ define void @test_fmin_legacy_ule_f64(double addrspace(1)* %out, double addrspac
 }
 
 ; FUNC-LABEL: @test_fmin_legacy_ole_f64
-define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -44,7 +44,7 @@ define void @test_fmin_legacy_ole_f64(double addrspace(1)* %out, double addrspac
 }
 
 ; FUNC-LABEL: @test_fmin_legacy_olt_f64
-define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -59,7 +59,7 @@ define void @test_fmin_legacy_olt_f64(double addrspace(1)* %out, double addrspac
 }
 
 ; FUNC-LABEL: @test_fmin_legacy_ult_f64
-define void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/fmin_legacy.ll b/test/CodeGen/AMDGPU/fmin_legacy.ll
index 79acd02e6d1f394ea7931cea3a09b774f3af821a..52336f95a90962797caf7a6d23c07fac8a970af5 100644
--- a/test/CodeGen/AMDGPU/fmin_legacy.ll
+++ b/test/CodeGen/AMDGPU/fmin_legacy.ll
@@ -14,7 +14,7 @@ declare i32 @llvm.r600.read.tidig.x() #1
 ; EG: MIN *
 ; SI-SAFE: v_min_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI-NONAN: v_min_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
+define amdgpu_kernel void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out, <4 x float> inreg %reg0) #0 {
    %r0 = extractelement <4 x float> %reg0, i32 0
    %r1 = extractelement <4 x float> %reg0, i32 1
    %r2 = fcmp uge float %r0, %r1
@@ -34,7 +34,7 @@ define void @s_test_fmin_legacy_subreg_inputs_f32(<4 x float> addrspace(1)* %out
 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[VA]]
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[A]], [[VB]]
 
-define void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, float %b) #0 {
   %cmp = fcmp ule float %a, %b
   %val = select i1 %cmp, float %a, float %b
   store float %val, float addrspace(1)* %out, align 4
@@ -46,7 +46,7 @@ define void @s_test_fmin_legacy_ule_f32(float addrspace(1)* %out, float %a, floa
 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -65,7 +65,7 @@ define void @test_fmin_legacy_ule_f32(float addrspace(1)* %out, float addrspace(
 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -84,7 +84,7 @@ define void @test_fmin_legacy_ole_f32(float addrspace(1)* %out, float addrspace(
 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[A]], [[B]]
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -103,7 +103,7 @@ define void @test_fmin_legacy_olt_f32(float addrspace(1)* %out, float addrspace(
 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -122,7 +122,7 @@ define void @test_fmin_legacy_ult_f32(float addrspace(1)* %out, float addrspace(
 ; SI: buffer_load_dword [[B:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; SI-SAFE: v_min_legacy_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
 ; SI-NONAN: v_min_f32_e32 {{v[0-9]+}}, [[B]], [[A]]
-define void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <1 x float>, <1 x float> addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr <1 x float>, <1 x float> addrspace(1)* %gep.0, i32 1
@@ -144,7 +144,7 @@ define void @test_fmin_legacy_ult_v1f32(<1 x float> addrspace(1)* %out, <1 x flo
 
 ; SI-NONAN: v_min_f32_e32
 ; SI-NONAN: v_min_f32_e32
-define void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr <2 x float>, <2 x float> addrspace(1)* %gep.0, i32 1
@@ -166,7 +166,7 @@ define void @test_fmin_legacy_ult_v2f32(<2 x float> addrspace(1)* %out, <2 x flo
 ; SI-NONAN: v_min_f32_e32
 ; SI-NONAN: v_min_f32_e32
 ; SI-NONAN: v_min_f32_e32
-define void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr <3 x float>, <3 x float> addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr <3 x float>, <3 x float> addrspace(1)* %gep.0, i32 1
@@ -188,7 +188,7 @@ define void @test_fmin_legacy_ult_v3f32(<3 x float> addrspace(1)* %out, <3 x flo
 ; SI-NEXT: v_cndmask_b32
 ; SI-NOT: v_min
 ; SI: s_endpgm
-define void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fmin_legacy_ole_f32_multi_use(float addrspace(1)* %out0, i1 addrspace(1)* %out1, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() #1
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/fminnum.f64.ll b/test/CodeGen/AMDGPU/fminnum.f64.ll
index 0f929d6a81f0b1e2ff1072ef1263375cab945f1d..01b267411212cb8a50acf3ea7f9cfbde04b06186 100644
--- a/test/CodeGen/AMDGPU/fminnum.f64.ll
+++ b/test/CodeGen/AMDGPU/fminnum.f64.ll
@@ -9,7 +9,7 @@ declare <16 x double> @llvm.minnum.v16f64(<16 x double>, <16 x double>) #0
 
 ; FUNC-LABEL: @test_fmin_f64
 ; SI: v_min_f64
-define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
+define amdgpu_kernel void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) nounwind {
   %val = call double @llvm.minnum.f64(double %a, double %b) #0
   store double %val, double addrspace(1)* %out, align 8
   ret void
@@ -18,7 +18,7 @@ define void @test_fmin_f64(double addrspace(1)* %out, double %a, double %b) noun
 ; FUNC-LABEL: @test_fmin_v2f64
 ; SI: v_min_f64
 ; SI: v_min_f64
-define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) nounwind {
   %val = call <2 x double> @llvm.minnum.v2f64(<2 x double> %a, <2 x double> %b) #0
   store <2 x double> %val, <2 x double> addrspace(1)* %out, align 16
   ret void
@@ -29,7 +29,7 @@ define void @test_fmin_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <
 ; SI: v_min_f64
 ; SI: v_min_f64
 ; SI: v_min_f64
-define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) nounwind {
   %val = call <4 x double> @llvm.minnum.v4f64(<4 x double> %a, <4 x double> %b) #0
   store <4 x double> %val, <4 x double> addrspace(1)* %out, align 32
   ret void
@@ -44,7 +44,7 @@ define void @test_fmin_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <
 ; SI: v_min_f64
 ; SI: v_min_f64
 ; SI: v_min_f64
-define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b) nounwind {
   %val = call <8 x double> @llvm.minnum.v8f64(<8 x double> %a, <8 x double> %b) #0
   store <8 x double> %val, <8 x double> addrspace(1)* %out, align 64
   ret void
@@ -67,7 +67,7 @@ define void @test_fmin_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <
 ; SI: v_min_f64
 ; SI: v_min_f64
 ; SI: v_min_f64
-define void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %a, <16 x double> %b) nounwind {
   %val = call <16 x double> @llvm.minnum.v16f64(<16 x double> %a, <16 x double> %b) #0
   store <16 x double> %val, <16 x double> addrspace(1)* %out, align 128
   ret void
diff --git a/test/CodeGen/AMDGPU/fminnum.ll b/test/CodeGen/AMDGPU/fminnum.ll
index abd2b9d3e4d18fae4742659310f647cb7e973dd1..9e997c7a10458a55ff74a6618ada0f8875ba7e51 100644
--- a/test/CodeGen/AMDGPU/fminnum.ll
+++ b/test/CodeGen/AMDGPU/fminnum.ll
@@ -13,7 +13,7 @@ declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MIN_DX10 {{.*}}[[OUT]]
-define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
   %val = call float @llvm.minnum.f32(float %a, float %b) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -26,7 +26,7 @@ define void @test_fmin_f32(float addrspace(1)* %out, float %a, float %b) nounwin
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+]]
 ; EG: MIN_DX10 {{.*}}[[OUT]]
 ; EG: MIN_DX10 {{.*}}[[OUT]]
-define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) nounwind {
   %val = call <2 x float> @llvm.minnum.v2f32(<2 x float> %a, <2 x float> %b) #0
   store <2 x float> %val, <2 x float> addrspace(1)* %out, align 8
   ret void
@@ -43,7 +43,7 @@ define void @test_fmin_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2
 ; EG: MIN_DX10 {{.*}}[[OUT]]
 ; EG: MIN_DX10 {{.*}}[[OUT]]
 ; EG: MIN_DX10 {{.*}}[[OUT]]
-define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) nounwind {
   %val = call <4 x float> @llvm.minnum.v4f32(<4 x float> %a, <4 x float> %b) #0
   store <4 x float> %val, <4 x float> addrspace(1)* %out, align 16
   ret void
@@ -69,7 +69,7 @@ define void @test_fmin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4
 ; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Y
 ; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].Z
 ; EG-DAG: MIN_DX10 {{.*}}[[OUT2]].W
-define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b) nounwind {
   %val = call <8 x float> @llvm.minnum.v8f32(<8 x float> %a, <8 x float> %b) #0
   store <8 x float> %val, <8 x float> addrspace(1)* %out, align 32
   ret void
@@ -113,7 +113,7 @@ define void @test_fmin_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8
 ; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Y
 ; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].Z
 ; EG-DAG: MIN_DX10 {{.*}}[[OUT4]].W
-define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
+define amdgpu_kernel void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, <16 x float> %b) nounwind {
   %val = call <16 x float> @llvm.minnum.v16f32(<16 x float> %a, <16 x float> %b) #0
   store <16 x float> %val, <16 x float> addrspace(1)* %out, align 64
   ret void
@@ -127,7 +127,7 @@ define void @test_fmin_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a,
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 1.0, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -142,7 +142,7 @@ define void @constant_fold_fmin_f32(float addrspace(1)* %out) nounwind {
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
 ; EG: 2143289344({{nan|1\.#QNAN0e\+00}})
-define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -156,7 +156,7 @@ define void @constant_fold_fmin_f32_nan_nan(float addrspace(1)* %out) nounwind {
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 1.0, float 0x7FF8000000000000) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -170,7 +170,7 @@ define void @constant_fold_fmin_f32_val_nan(float addrspace(1)* %out) nounwind {
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0x7FF8000000000000, float 1.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -184,7 +184,7 @@ define void @constant_fold_fmin_f32_nan_val(float addrspace(1)* %out) nounwind {
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -198,7 +198,7 @@ define void @constant_fold_fmin_f32_p0_p0(float addrspace(1)* %out) nounwind {
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float 0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -212,7 +212,7 @@ define void @constant_fold_fmin_f32_p0_n0(float addrspace(1)* %out) nounwind {
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float -0.0, float 0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -226,7 +226,7 @@ define void @constant_fold_fmin_f32_n0_p0(float addrspace(1)* %out) nounwind {
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG-NOT: MIN_DX10
 ; EG: MOV {{.*}}[[OUT]], literal.{{[xy]}}
-define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind {
   %val = call float @llvm.minnum.f32(float -0.0, float -0.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -237,7 +237,7 @@ define void @constant_fold_fmin_f32_n0_n0(float addrspace(1)* %out) nounwind {
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float %a, float 2.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -248,7 +248,7 @@ define void @fmin_var_immediate_f32(float addrspace(1)* %out, float %a) nounwind
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float 2.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -260,7 +260,7 @@ define void @fmin_immediate_var_f32(float addrspace(1)* %out, float %a) nounwind
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float %a, float 99.0) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -272,7 +272,7 @@ define void @fmin_var_literal_f32(float addrspace(1)* %out, float %a) nounwind {
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[OUT:T[0-9]+\.[XYZW]]]
 ; EG: MIN_DX10 {{.*}}[[OUT]], {{KC0\[[0-9]\].[XYZW]}}, literal.{{[xy]}}
-define void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
+define amdgpu_kernel void @fmin_literal_var_f32(float addrspace(1)* %out, float %a) nounwind {
   %val = call float @llvm.minnum.f32(float 99.0, float %a) #0
   store float %val, float addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
index c1f42d2beae0bfd2ed75287edff832d87255e015..4002712ab1693ad145abdb04b04115b0431a4cbc 100644
--- a/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
+++ b/test/CodeGen/AMDGPU/fmul-2-combine-multi-use.ll
@@ -21,9 +21,9 @@ declare float @llvm.fabs.f32(float) #1
 ; VI: v_cmp_gt_f32_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}|
 ; VI: v_cndmask_b32_e32
 ; VI: v_add_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
-; VI: v_mul_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
-; VI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0
-define void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
+; VI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
+define amdgpu_kernel void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
   %a11 = fadd fast float %y, -1.0
   %a12 = call float @llvm.fabs.f32(float %a11)
   %a13 = fadd fast float %x, -1.0
@@ -44,7 +44,7 @@ define void @multiple_fadd_use_test_f32(float addrspace(1)* %out, float %x, floa
 ; GCN-DAG: buffer_store_dword [[MUL2]]
 ; GCN-DAG: buffer_store_dword [[MAD]]
 ; GCN: s_endpgm
-define void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, float %y) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %mul2 = fmul fast float %x, 2.0
   %mad = fadd fast float %mul2, %y
@@ -59,7 +59,7 @@ define void @multiple_use_fadd_fmac_f32(float addrspace(1)* %out, float %x, floa
 ; GCN-DAG: buffer_store_dword [[MUL2]]
 ; GCN-DAG: buffer_store_dword [[MAD]]
 ; GCN: s_endpgm
-define void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, float %y) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %x.abs = call float @llvm.fabs.f32(float %x)
   %mul2 = fmul fast float %x.abs, 2.0
@@ -72,7 +72,7 @@ define void @multiple_use_fadd_fmad_f32(float addrspace(1)* %out, float %x, floa
 ; GCN-LABEL: {{^}}multiple_use_fadd_multi_fmad_f32:
 ; GCN: v_mad_f32 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}}
 ; GCN: v_mad_f32 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}}
-define void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
+define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x, float %y, float %z) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %x.abs = call float @llvm.fabs.f32(float %x)
   %mul2 = fmul fast float %x.abs, 2.0
@@ -87,7 +87,7 @@ define void @multiple_use_fadd_multi_fmad_f32(float addrspace(1)* %out, float %x
 ; GCN: v_mul_f32_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0
 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %mul2 = fmul fast float %x, 2.0
   %muln2 = fmul fast float %x, -2.0
@@ -101,7 +101,7 @@ define void @fmul_x2_xn2_f32(float addrspace(1)* %out, float %x, float %y) #0 {
 ; GCN: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]]
 ; GCN: v_mul_f32_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %mul2 = fmul fast float %x, 2.0
   %muln2 = fmul fast float %x, -3.0
@@ -116,10 +116,10 @@ define void @fmul_x2_xn3_f32(float addrspace(1)* %out, float %x, float %y) #0 {
 ; VI: v_cmp_gt_f16_e64 vcc, |v{{[0-9]+}}|, |v{{[0-9]+}}|
 ; VI: v_cndmask_b32_e32
 ; VI: v_add_f16_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |v{{[0-9]+}}|
-; VI: v_mul_f16_e64 v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
-; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0
-; VI-DENORM: v_fma_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0
-define void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
+; VI: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
+; VI-DENORM: v_fma_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, 1.0
+define amdgpu_kernel void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %z = bitcast i16 %z.arg to half
@@ -146,7 +146,7 @@ define void @multiple_fadd_use_test_f16(half addrspace(1)* %out, i16 zeroext %x.
 ; GCN-DAG: buffer_store_short [[MUL2]]
 ; GCN-DAG: buffer_store_short [[MAD]]
 ; GCN: s_endpgm
-define void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
+define amdgpu_kernel void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
@@ -166,7 +166,7 @@ define void @multiple_use_fadd_fmac_f16(half addrspace(1)* %out, i16 zeroext %x.
 ; GCN-DAG: buffer_store_short [[MUL2]]
 ; GCN-DAG: buffer_store_short [[MAD]]
 ; GCN: s_endpgm
-define void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
+define amdgpu_kernel void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
@@ -185,7 +185,7 @@ define void @multiple_use_fadd_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.
 ; VI-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X:s[0-9]+]]|, 2.0, v{{[0-9]+}}
 ; VI-DENORM: v_fma_f16 {{v[0-9]+}}, |[[X]]|, 2.0, v{{[0-9]+}}
 
-define void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
+define amdgpu_kernel void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg, i16 zeroext %z.arg) #0 {
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %z = bitcast i16 %z.arg to half
@@ -203,7 +203,7 @@ define void @multiple_use_fadd_multi_fmad_f16(half addrspace(1)* %out, i16 zeroe
 ; GCN: v_mul_f16_e64 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], -4.0
 ; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
 ; GCN: buffer_store_short [[RESULT]]
-define void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
+define amdgpu_kernel void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
@@ -219,7 +219,7 @@ define void @fmul_x2_xn2_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 ze
 ; GCN: v_mul_f16_e32 [[TMP0:v[0-9]+]], [[X:s[0-9]+]], [[K]]
 ; GCN: v_mul_f16_e32 [[RESULT:v[0-9]+]], [[X]], [[TMP0]]
 ; GCN: buffer_store_short [[RESULT]]
-define void @fmul_x2_xn3_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
+define amdgpu_kernel void @fmul_x2_xn3_f16(half addrspace(1)* %out, i16 zeroext %x.arg, i16 zeroext %y.arg) #0 {
   %x = bitcast i16 %x.arg to half
   %y = bitcast i16 %y.arg to half
   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
diff --git a/test/CodeGen/AMDGPU/fmul.f16.ll b/test/CodeGen/AMDGPU/fmul.f16.ll
index fa907b09d8079dff878a37e53b2402ec176f5bc1..4e96091ae25639e0285794892ca9eda32494493d 100644
--- a/test/CodeGen/AMDGPU/fmul.f16.ll
+++ b/test/CodeGen/AMDGPU/fmul.f16.ll
@@ -11,7 +11,7 @@
 ; VI:  v_mul_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fmul_f16(
+define amdgpu_kernel void @fmul_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -31,7 +31,7 @@ entry:
 ; VI:  v_mul_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fmul_f16_imm_a(
+define amdgpu_kernel void @fmul_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -50,7 +50,7 @@ entry:
 ; VI:  v_mul_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fmul_f16_imm_b(
+define amdgpu_kernel void @fmul_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -60,27 +60,30 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}fmul_v2f16
+; GCN-LABEL: {{^}}fmul_v2f16:
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
 ; SI:  v_mul_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_mul_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_mul_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI:  v_mul_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_mul_f16_e32 v[[R_F16_LO:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_mul_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fmul_v2f16(
+define amdgpu_kernel void @fmul_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -92,7 +95,7 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}fmul_v2f16_imm_a
+; GCN-LABEL: {{^}}fmul_v2f16_imm_a:
 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
 ; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
@@ -101,14 +104,13 @@ entry:
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
-; VI:  v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; VI-DAG:  v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
+; VI-DAG:  v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fmul_v2f16_imm_a(
+define amdgpu_kernel void @fmul_v2f16_imm_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b) {
 entry:
@@ -118,7 +120,7 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}fmul_v2f16_imm_b
+; GCN-LABEL: {{^}}fmul_v2f16_imm_b:
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
 ; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
@@ -127,14 +129,13 @@ entry:
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_mul_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
-; VI:  v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; VI-DAG:  v_mul_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
+; VI-DAG:  v_mul_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fmul_v2f16_imm_b(
+define amdgpu_kernel void @fmul_v2f16_imm_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/fmul.ll b/test/CodeGen/AMDGPU/fmul.ll
index d0c39b5394566578d5e716e37fb99be57736915d..125de7aabfd4c01b2d79dd857e713bea790ab44f 100644
--- a/test/CodeGen/AMDGPU/fmul.ll
+++ b/test/CodeGen/AMDGPU/fmul.ll
@@ -6,24 +6,20 @@
 ; GCN: v_mul_f32
 
 ; R600: MUL_IEEE {{\** *}}{{T[0-9]+\.[XYZW]}}, KC0[2].Z, KC0[2].W
-define void @fmul_f32(float addrspace(1)* %out, float %a, float %b) {
+define amdgpu_kernel void @fmul_f32(float addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = fmul float %a, %b
   store float %0, float addrspace(1)* %out
   ret void
 }
 
-declare float @llvm.r600.load.input(i32) readnone
-
-declare void @llvm.AMDGPU.store.output(float, i32)
-
 ; FUNC-LABEL: {{^}}fmul_v2f32:
 ; GCN: v_mul_f32
 ; GCN: v_mul_f32
 
 ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
 ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW]}}
-define void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+define amdgpu_kernel void @fmul_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
 entry:
   %0 = fmul <2 x float> %a, %b
   store <2 x float> %0, <2 x float> addrspace(1)* %out
@@ -40,7 +36,7 @@ entry:
 ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: MUL_IEEE {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1) * %in
   %b = load <4 x float>, <4 x float> addrspace(1) * %b_ptr
@@ -53,7 +49,7 @@ define void @fmul_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)
 ; GCN: v_mul_f32
 ; GCN-NOT: v_mul_f32
 ; GCN: s_endpgm
-define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 {
   %y = fmul float %x, 2.0
   %z = fmul float %y, 3.0
   store float %z, float addrspace(1)* %out
@@ -65,7 +61,7 @@ define void @test_mul_2_k(float addrspace(1)* %out, float %x) #0 {
 ; GCN-NOT: v_mul_f32
 ; GCN-NOT: v_mad_f32
 ; GCN: s_endpgm
-define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 {
   %y = fmul float %x, 3.0
   %z = fmul float %y, 2.0
   store float %z, float addrspace(1)* %out
@@ -79,7 +75,7 @@ define void @test_mul_2_k_inv(float addrspace(1)* %out, float %x) #0 {
 ; GCN: v_mul_f32
 ; GCN: v_mul_f32
 ; GCN-NOT: v_mul_f32
-define void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 {
+define amdgpu_kernel void @test_mul_twouse(float addrspace(1)* %out, float %x, float %y) #0 {
   %a = fmul float %x, 5.0
   %b = fsub float -0.0, %a
   %c = fmul float %b, %y
diff --git a/test/CodeGen/AMDGPU/fmul64.ll b/test/CodeGen/AMDGPU/fmul64.ll
index 3c222eaba89d1f2c740fb7bef4009b18a5cc42e1..f14233f267b2be6f2ddf03658e0683f4551701b1 100644
--- a/test/CodeGen/AMDGPU/fmul64.ll
+++ b/test/CodeGen/AMDGPU/fmul64.ll
@@ -3,7 +3,7 @@
 
 ; FUNC-LABEL: {{^}}fmul_f64:
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
    %r0 = load double, double addrspace(1)* %in1
    %r1 = load double, double addrspace(1)* %in2
@@ -15,7 +15,7 @@ define void @fmul_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 ; FUNC-LABEL: {{^}}fmul_v2f64:
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
                         <2 x double> addrspace(1)* %in2) {
    %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1
    %r1 = load <2 x double>, <2 x double> addrspace(1)* %in2
@@ -29,7 +29,7 @@ define void @fmul_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 ; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in1,
                         <4 x double> addrspace(1)* %in2) {
    %r0 = load <4 x double>, <4 x double> addrspace(1)* %in1
    %r1 = load <4 x double>, <4 x double> addrspace(1)* %in2
diff --git a/test/CodeGen/AMDGPU/fmuladd.f16.ll b/test/CodeGen/AMDGPU/fmuladd.f16.ll
index 477ae3da5074fdab2d32b4d35a0ec0b7c65b180e..9b713419e7471b0ca41dee68cd39b4be1fb64bf0 100644
--- a/test/CodeGen/AMDGPU/fmuladd.f16.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.f16.ll
@@ -16,7 +16,7 @@ declare half @llvm.fabs.f16(half) #1
 ; VI-FLUSH: v_mac_f16_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 
 ; VI-DENORM: v_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-define void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
+define amdgpu_kernel void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
                          half addrspace(1)* %in2, half addrspace(1)* %in3) #0 {
   %r0 = load half, half addrspace(1)* %in1
   %r1 = load half, half addrspace(1)* %in2
@@ -34,7 +34,7 @@ define void @fmuladd_f16(half addrspace(1)* %out, half addrspace(1)* %in1,
 
 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -56,7 +56,7 @@ define void @fmuladd_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in
 
 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -82,7 +82,7 @@ define void @fmuladd_a_2.0_b_f16(half addrspace(1)* %out, half addrspace(1)* %in
 ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
 
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_a_a_b_f16(half addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f16(half addrspace(1)* %out,
                             half addrspace(1)* %in1,
                             half addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -111,7 +111,7 @@ define void @fadd_a_a_b_f16(half addrspace(1)* %out,
 ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
 
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_b_a_a_f16(half addrspace(1)* %out,
+define amdgpu_kernel void @fadd_b_a_a_f16(half addrspace(1)* %out,
                             half addrspace(1)* %in1,
                             half addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -134,7 +134,7 @@ define void @fadd_b_a_a_f16(half addrspace(1)* %out,
 ; VI-FLUSH: v_mac_f16_e32 [[R2]], -2.0, [[R1]]
 ; VI-DENORM: v_fma_f16 [[R2:v[0-9]+]], [[R1]], -2.0, [[R2]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[R2]]
-define void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -156,7 +156,7 @@ define void @fmuladd_neg_2.0_a_b_f16(half addrspace(1)* %out, half addrspace(1)*
 
 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], -2.0, [[R2]]
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -180,7 +180,7 @@ define void @fmuladd_neg_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace
 
 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], -[[R1]], 2.0, [[R2]]
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -202,7 +202,7 @@ define void @fmuladd_2.0_neg_a_b_f16(half addrspace(1)* %out, half addrspace(1)*
 ; VI-FLUSH: v_mad_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
 ; VI-DENORM: v_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, -[[R2]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -231,7 +231,7 @@ define void @fmuladd_2.0_a_neg_b_f16(half addrspace(1)* %out, half addrspace(1)*
 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -261,7 +261,7 @@ define void @mad_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspa
 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -291,7 +291,7 @@ define void @mad_sub_inv_f16(half addrspace(1)* noalias nocapture %out, half add
 ; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], [[TMP]], |[[REGC]]|
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -323,7 +323,7 @@ define void @mad_sub_fabs_f16(half addrspace(1)* noalias nocapture %out, half ad
 ; VI-DENORM-STRICT: v_sub_f16_e64 [[RESULT:v[0-9]+]], |[[REGC]]|, [[TMP]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -355,7 +355,7 @@ define void @mad_sub_fabs_inv_f16(half addrspace(1)* noalias nocapture %out, hal
 ; VI-DENORM-STRICT: v_mul_f16_e32 [[TMP:v[0-9]+]], [[REGB]], [[REGA]]
 ; VI-DENORM-STRICT: v_add_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[REGC]]
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -388,7 +388,7 @@ define void @neg_neg_mad_f16(half addrspace(1)* noalias nocapture %out, half add
 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[REGC]], [[TMP]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr half, half addrspace(1)* %ptr, i64 %tid.ext
@@ -419,7 +419,7 @@ define void @mad_fabs_sub_f16(half addrspace(1)* noalias nocapture %out, half ad
 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[TMP]], [[R2]]
 
 ; VI-DENORM: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
@@ -447,7 +447,7 @@ define void @fsub_c_fadd_a_a_f16(half addrspace(1)* %out, half addrspace(1)* %in
 ; VI-DENORM-STRICT: v_subrev_f16_e32 [[RESULT:v[0-9]+]], [[R2]], [[TMP]]
 
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @fsub_fadd_a_a_c_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr half, half addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr half, half addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/fmuladd.f32.ll b/test/CodeGen/AMDGPU/fmuladd.f32.ll
index e4b1053ff25c74a0e748570fb7203bfe009bc4d5..fb605dd2e4bd480a40c59b6229f33ae42d687374 100644
--- a/test/CodeGen/AMDGPU/fmuladd.f32.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.f32.ll
@@ -25,7 +25,7 @@ declare float @llvm.fabs.f32(float) #1
 
 ; GCN-DENORM-SLOWFMA: v_mul_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 ; GCN-DENORM-SLOWFMA: v_add_f32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
-define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                          float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
   %r0 = load float, float addrspace(1)* %in1
   %r1 = load float, float addrspace(1)* %in2
@@ -45,7 +45,7 @@ define void @fmuladd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
 
 ; GCN-DENORM-STRICT: v_mul_f32_e32
 ; GCN-DENORM-STRICT: v_add_f32_e32
-define void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                            float addrspace(1)* %in2, float addrspace(1)* %in3) #0 {
   %r0 = load volatile float, float addrspace(1)* %in1
   %r1 = load volatile float, float addrspace(1)* %in2
@@ -71,7 +71,7 @@ define void @fmul_fadd_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
 
 ; SI-DENORM buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -100,7 +100,7 @@ define void @fmuladd_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -132,7 +132,7 @@ define void @fmuladd_a_2.0_b_f32(float addrspace(1)* %out, float addrspace(1)* %
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_a_a_b_f32(float addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f32(float addrspace(1)* %out,
                             float addrspace(1)* %in1,
                             float addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -167,7 +167,7 @@ define void @fadd_a_a_b_f32(float addrspace(1)* %out,
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_b_a_a_f32(float addrspace(1)* %out,
+define amdgpu_kernel void @fadd_b_a_a_f32(float addrspace(1)* %out,
                             float addrspace(1)* %in1,
                             float addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -196,7 +196,7 @@ define void @fadd_b_a_a_f32(float addrspace(1)* %out,
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -225,7 +225,7 @@ define void @fmuladd_neg_2.0_a_b_f32(float addrspace(1)* %out, float addrspace(1
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -256,7 +256,7 @@ define void @fmuladd_neg_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspa
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -286,7 +286,7 @@ define void @fmuladd_2.0_neg_a_b_f32(float addrspace(1)* %out, float addrspace(1
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -318,7 +318,7 @@ define void @fmuladd_2.0_a_neg_b_f32(float addrspace(1)* %out, float addrspace(1
 
 ; SI: buffer_store_dword [[RESULT]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -353,7 +353,7 @@ define void @mad_sub_f32(float addrspace(1)* noalias nocapture %out, float addrs
 
 ; SI: buffer_store_dword [[RESULT]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -387,7 +387,7 @@ define void @mad_sub_inv_f32(float addrspace(1)* noalias nocapture %out, float a
 
 ; SI: buffer_store_dword [[RESULT]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -422,7 +422,7 @@ define void @mad_sub_fabs_f32(float addrspace(1)* noalias nocapture %out, float
 
 ; SI: buffer_store_dword [[RESULT]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -460,7 +460,7 @@ define void @mad_sub_fabs_inv_f32(float addrspace(1)* noalias nocapture %out, fl
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -496,7 +496,7 @@ define void @neg_neg_mad_f32(float addrspace(1)* noalias nocapture %out, float a
 
 ; SI: buffer_store_dword [[RESULT]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
+define amdgpu_kernel void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float addrspace(1)* noalias nocapture readonly %ptr) #0 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr float, float addrspace(1)* %ptr, i64 %tid.ext
@@ -532,7 +532,7 @@ define void @mad_fabs_sub_f32(float addrspace(1)* noalias nocapture %out, float
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; VI-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -563,7 +563,7 @@ define void @fsub_c_fadd_a_a_f32(float addrspace(1)* %out, float addrspace(1)* %
 
 ; SI: buffer_store_dword [[RESULT]]
 ; VI: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fsub_fadd_a_a_c_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/fmuladd.f64.ll b/test/CodeGen/AMDGPU/fmuladd.f64.ll
index f5e64b3c594161945fcdfbbda62b467668e3fd31..86e91e04b0fc3bc5c82afc4550e301fed26f5601 100644
--- a/test/CodeGen/AMDGPU/fmuladd.f64.ll
+++ b/test/CodeGen/AMDGPU/fmuladd.f64.ll
@@ -7,7 +7,7 @@
 
 ; GCN-LABEL: {{^}}fmuladd_f64:
 ; GCN: v_fma_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                          double addrspace(1)* %in2, double addrspace(1)* %in3) #0 {
   %r0 = load double, double addrspace(1)* %in1
   %r1 = load double, double addrspace(1)* %in2
@@ -22,7 +22,7 @@ define void @fmuladd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 
 ; GCN-STRICT: v_mul_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
 ; GCN-STRICT: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                            double addrspace(1)* %in2, double addrspace(1)* %in3) #0 {
   %r0 = load double, double addrspace(1)* %in1
   %r1 = load double, double addrspace(1)* %in2
@@ -44,7 +44,7 @@ define void @fmul_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_a_a_b_f64(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f64(double addrspace(1)* %out,
                             double addrspace(1)* %in1,
                             double addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -72,7 +72,7 @@ define void @fadd_a_a_b_f64(double addrspace(1)* %out,
 
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @fadd_b_a_a_f64(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_b_a_a_f64(double addrspace(1)* %out,
                             double addrspace(1)* %in1,
                             double addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -94,7 +94,7 @@ define void @fadd_b_a_a_f64(double addrspace(1)* %out,
 ; GCN-STRICT: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}
 
 ; GCN-CONTRACT: v_fma_f64 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, -v{{\[[0-9]+:[0-9]+\]}}
-define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 {
+define amdgpu_kernel void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double addrspace(1)* noalias nocapture readonly %ptr) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid.ext = sext i32 %tid to i64
   %gep0 = getelementptr double, double addrspace(1)* %ptr, i64 %tid.ext
@@ -117,7 +117,7 @@ define void @mad_sub_f64(double addrspace(1)* noalias nocapture %out, double add
 ; GCN-STRICT: v_add_f64
 
 ; GCN-CONTRACT: v_fma_f64
-define void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out,
                                       double addrspace(1)* %in1,
                                       double addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -139,7 +139,7 @@ define void @fadd_a_a_b_f64_fast_add0(double addrspace(1)* %out,
 ; GCN-STRICT: v_add_f64
 
 ; GCN-CONTRACT: v_fma_f64
-define void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out,
                                       double addrspace(1)* %in1,
                                       double addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
@@ -158,7 +158,7 @@ define void @fadd_a_a_b_f64_fast_add1(double addrspace(1)* %out,
 
 ; GCN-LABEL: {{^}}fadd_a_a_b_f64_fast:
 ; GCN: v_fma_f64
-define void @fadd_a_a_b_f64_fast(double addrspace(1)* %out,
+define amdgpu_kernel void @fadd_a_a_b_f64_fast(double addrspace(1)* %out,
                                  double addrspace(1)* %in1,
                                 double addrspace(1)* %in2) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/fmuladd.v2f16.ll b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
new file mode 100644
index 0000000000000000000000000000000000000000..bdd3c04fd31894ecee4288752f2afe160989594e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fmuladd.v2f16.ll
@@ -0,0 +1,107 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-FLUSH,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-FLUSH,GFX9 %s
+
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=on -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-STRICT,GFX9-DENORM-STRICT,GFX9-DENORM,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=+fp64-fp16-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GCN-CONTRACT,GFX9-DENORM-CONTRACT,GFX9-DENORM,GFX9 %s
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare <2 x half> @llvm.fmuladd.v2f16(<2 x half>, <2 x half>, <2 x half>) #1
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
+
+; GCN-LABEL: {{^}}fmuladd_v2f16:
+; GFX9-FLUSH: v_pk_mul_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+; GFX9-FLUSH: v_pk_add_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+
+; GFX9-DENORM: v_pk_fma_f16 {{v[0-9]+, v[0-9]+, v[0-9]+}}
+define amdgpu_kernel void @fmuladd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in1,
+                         <2 x half> addrspace(1)* %in2, <2 x half> addrspace(1)* %in3) #0 { ; basic case: llvm.fmuladd on three loaded <2 x half> values
+  %r0 = load <2 x half>, <2 x half> addrspace(1)* %in1 ; first multiplicand
+  %r1 = load <2 x half>, <2 x half> addrspace(1)* %in2 ; second multiplicand
+  %r2 = load <2 x half>, <2 x half> addrspace(1)* %in3 ; addend
+  %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %r0, <2 x half> %r1, <2 x half> %r2)
+  store <2 x half> %r3, <2 x half> addrspace(1)* %out ; result written to %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmuladd_2.0_a_b_v2f16:
+; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
+; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
+; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]
+
+; GFX9-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
+; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+; GFX9-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define amdgpu_kernel void @fmuladd_2.0_a_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { ; fmuladd(2.0, a, b); %in is not referenced in the body
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid ; element %tid of %out
+  %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1 ; the element after %gep.0
+  %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid ; same address as %gep.0
+
+  %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0 ; a
+  %r2 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1 ; b
+
+  %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> <half 2.0, half 2.0>, <2 x half> %r1, <2 x half> %r2) ; constant <2.0, 2.0> is the first multiplicand
+  store <2 x half> %r3, <2 x half> addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fmuladd_a_2.0_b_v2f16:
+; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
+; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
+; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]
+
+; GFX9-FLUSH: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+
+; GFX9-DENORM: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+; GFX9-DENORM: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define amdgpu_kernel void @fmuladd_a_2.0_b_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 { ; fmuladd(a, 2.0, b); %in is not referenced in the body
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid ; element %tid of %out
+  %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1 ; the element after %gep.0
+  %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid ; same address as %gep.0
+
+  %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0 ; a
+  %r2 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1 ; b
+
+  %r3 = tail call <2 x half> @llvm.fmuladd.v2f16(<2 x half> %r1, <2 x half> <half 2.0, half 2.0>, <2 x half> %r2) ; constant <2.0, 2.0> is the second multiplicand
+  store <2 x half> %r3, <2 x half> addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fadd_a_a_b_v2f16:
+; GCN: {{buffer|flat}}_load_dword [[R1:v[0-9]+]],
+; GCN: {{buffer|flat}}_load_dword [[R2:v[0-9]+]],
+; GFX9-FLUSH: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
+; GFX9-FLUSH: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]
+
+; GFX9-DENORM-STRICT: v_pk_add_f16 [[ADD0:v[0-9]+]], [[R1]], [[R1]]
+; GFX9-DENORM-STRICT: v_pk_add_f16 [[RESULT:v[0-9]+]], [[ADD0]], [[R2]]
+
+; GFX9-DENORM-CONTRACT: v_pk_fma_f16 [[RESULT:v[0-9]+]], [[R1]], 2.0, [[R2]]
+; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
+define amdgpu_kernel void @fadd_a_a_b_v2f16(<2 x half> addrspace(1)* %out,
+                            <2 x half> addrspace(1)* %in1,
+                            <2 x half> addrspace(1)* %in2) #0 { ; (a + a) + b with plain fadds, no fmuladd call; %in1 and %in2 are not referenced in the body
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid ; element %tid of %out
+  %gep.1 = getelementptr <2 x half>, <2 x half> addrspace(1)* %gep.0, i32 1 ; the element after %gep.0
+  %gep.out = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid ; same address as %gep.0
+
+  %r0 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.0 ; a
+  %r1 = load volatile <2 x half>, <2 x half> addrspace(1)* %gep.1 ; b
+
+  %add.0 = fadd <2 x half> %r0, %r0 ; a + a
+  %add.1 = fadd <2 x half> %add.0, %r1 ; (a + a) + b
+  store <2 x half> %add.1, <2 x half> addrspace(1)* %gep.out
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fnearbyint.ll b/test/CodeGen/AMDGPU/fnearbyint.ll
index 5423fadf81e2826b92d0e7bc64fdc80479ed34d5..4ff3bbbcbc3ef8ff126fd47b7af485418bdb80cf 100644
--- a/test/CodeGen/AMDGPU/fnearbyint.ll
+++ b/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -13,41 +13,41 @@ declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) #0
 declare <4 x double> @llvm.nearbyint.v4f64(<4 x double>) #0
 
 
-define void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 {
+define amdgpu_kernel void @fnearbyint_f32(float addrspace(1)* %out, float %in) #1 {
 entry:
   %0 = call float @llvm.nearbyint.f32(float %in)
   store float %0, float addrspace(1)* %out
   ret void
 }
 
-define void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+define amdgpu_kernel void @fnearbyint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
 entry:
   %0 = call <2 x float> @llvm.nearbyint.v2f32(<2 x float> %in)
   store <2 x float> %0, <2 x float> addrspace(1)* %out
   ret void
 }
 
-define void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
+define amdgpu_kernel void @fnearbyint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
 entry:
   %0 = call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %in)
   store <4 x float> %0, <4 x float> addrspace(1)* %out
   ret void
 }
 
-define void @nearbyint_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @nearbyint_f64(double addrspace(1)* %out, double %in) {
 entry:
   %0 = call double @llvm.nearbyint.f64(double %in)
   store double %0, double addrspace(1)* %out
   ret void
 }
-define void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @nearbyint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
 entry:
   %0 = call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %in)
   store <2 x double> %0, <2 x double> addrspace(1)* %out
   ret void
 }
 
-define void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @nearbyint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
 entry:
   %0 = call <4 x double> @llvm.nearbyint.v4f64(<4 x double> %in)
   store <4 x double> %0, <4 x double> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll
index 37115e795e8d3d3bd8f955de088da190c12100e8..1c0e9a2f13ceaade255c4508fbc513e6f29f8651 100644
--- a/test/CodeGen/AMDGPU/fneg-combines.ll
+++ b/test/CodeGen/AMDGPU/fneg-combines.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -start-after=sink -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-SAFE -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -enable-no-signed-zeros-fp-math -march=amdgcn -mcpu=tahiti -start-after=sink -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCN-NSZ -check-prefix=SI -check-prefix=FUNC %s
 
 ; --------------------------------------------------------------------------------
 ; fadd tests
@@ -14,7 +14,7 @@
 
 ; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -35,7 +35,7 @@ define void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr
 ; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
 ; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -53,12 +53,16 @@ define void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrsp
 ; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
-; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
-; GCN-NEXT: buffer_store_dword [[NEG_ADD]]
+
+; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
+; GCN-SAFE: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]]
+; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
+
+; GCN-NSZ: v_sub_f32_e64 [[NEG_ADD:v[0-9]+]], -[[A]], [[B]]
+; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_ADD]]
+; GCN: buffer_store_dword [[NEG_ADD]]
 ; GCN-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -83,7 +87,7 @@ define void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrsp
 
 ; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -107,7 +111,7 @@ define void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)*
 
 ; GCN-NSZ: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -131,7 +135,7 @@ define void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)*
 
 ; GCN-NSZ: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -160,7 +164,7 @@ define void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(
 ; GCN-NSZ-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
-define void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -188,7 +192,7 @@ define void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float add
 ; GCN-NSZ-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_ADD]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -214,7 +218,7 @@ define void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float add
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
 ; GCN-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -235,7 +239,7 @@ define void @v_fneg_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr
 ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
 ; GCN: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -253,12 +257,11 @@ define void @v_fneg_mul_store_use_mul_f32(float addrspace(1)* %out, float addrsp
 ; GCN-LABEL: {{^}}v_fneg_mul_multi_use_mul_f32:
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
-; GCN-DAG: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
-; GCN-DAG: v_xor_b32_e32 [[NEG_MUL:v[0-9]+]], 0x80000000, [[ADD]]
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
-; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
-; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], [[A]], -[[B]]
+; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MUL0]]
+; GCN-NEXT: buffer_store_dword [[MUL0]]
+; GCN-NEXT: buffer_store_dword [[MUL1]]
+define amdgpu_kernel void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -279,7 +282,7 @@ define void @v_fneg_mul_multi_use_mul_f32(float addrspace(1)* %out, float addrsp
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -299,7 +302,7 @@ define void @v_fneg_mul_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)*
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -319,7 +322,7 @@ define void @v_fneg_mul_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)*
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -342,7 +345,7 @@ define void @v_fneg_mul_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(
 ; GCN-DAG: v_mul_f32_e32 [[NEG_MUL:v[0-9]+]], [[B]], [[A]]
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
 ; GCN: buffer_store_dword [[NEG_A]]
-define void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -365,7 +368,7 @@ define void @v_fneg_mul_store_use_fneg_x_f32(float addrspace(1)* %out, float add
 ; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -382,6 +385,300 @@ define void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float add
   ret void
 }
 
+; --------------------------------------------------------------------------------
+; fminnum tests
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_fneg_minnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %lane.i64 = sext i32 %lane to i64
+  %a.addr = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %lane.i64
+  %b.addr = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %lane.i64
+  %out.addr = getelementptr inbounds float, float addrspace(1)* %out, i64 %lane.i64
+  %a.val = load volatile float, float addrspace(1)* %a.addr
+  %b.val = load volatile float, float addrspace(1)* %b.addr
+  %minnum = call float @llvm.minnum.f32(float %a.val, float %b.val)
+  %neg = fsub float -0.000000e+00, %minnum ; fneg(minnum(a, b)), expected as one v_max_f32 of -a and -b
+  store float %neg, float addrspace(1)* %out.addr
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_self_minnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_self_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %lane.i64 = sext i32 %lane to i64
+  %a.addr = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %lane.i64
+  %out.addr = getelementptr inbounds float, float addrspace(1)* %out, i64 %lane.i64
+  %a.val = load volatile float, float addrspace(1)* %a.addr
+  %minnum = call float @llvm.minnum.f32(float %a.val, float %a.val) ; both operands are the same loaded value
+  %neg = fsub float -0.0, %minnum ; fneg of the minnum, expected as v_max_f32 of -a and -a
+  store float %neg, float addrspace(1)* %out.addr
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_posk_minnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_posk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %lane.i64 = sext i32 %lane to i64
+  %a.addr = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %lane.i64
+  %out.addr = getelementptr inbounds float, float addrspace(1)* %out, i64 %lane.i64
+  %a.val = load volatile float, float addrspace(1)* %a.addr
+  %minnum = call float @llvm.minnum.f32(float 4.0, float %a.val) ; positive constant operand
+  %neg = fsub float -0.000000e+00, %minnum ; fneg of the minnum, expected as v_max_f32 of -a and -4.0
+  store float %neg, float addrspace(1)* %out.addr
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_negk_minnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_negk_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %lane.i64 = sext i32 %lane to i64
+  %a.addr = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %lane.i64
+  %out.addr = getelementptr inbounds float, float addrspace(1)* %out, i64 %lane.i64
+  %a.val = load volatile float, float addrspace(1)* %a.addr
+  %minnum = call float @llvm.minnum.f32(float -4.0, float %a.val) ; negative constant operand
+  %neg = fsub float -0.000000e+00, %minnum ; fneg of the minnum, expected as v_max_f32 of -a and 4.0
+  store float %neg, float addrspace(1)* %out.addr
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_0_minnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %lane.i64 = sext i32 %lane to i64
+  %a.addr = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %lane.i64
+  %out.addr = getelementptr inbounds float, float addrspace(1)* %out, i64 %lane.i64
+  %a.val = load volatile float, float addrspace(1)* %a.addr
+  %minnum = call float @llvm.minnum.f32(float 0.0, float %a.val) ; +0.0 operand, the checks expect a plain v_min_f32 against 0
+  %neg = fsub float -0.000000e+00, %minnum ; fneg of the minnum
+  store float %neg, float addrspace(1)* %out.addr
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_neg0_minnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_neg0_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %lane.i64 = sext i32 %lane to i64
+  %a.addr = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %lane.i64
+  %out.addr = getelementptr inbounds float, float addrspace(1)* %out, i64 %lane.i64
+  %a.val = load volatile float, float addrspace(1)* %a.addr
+  %minnum = call float @llvm.minnum.f32(float -0.0, float %a.val) ; -0.0 operand
+  %neg = fsub float -0.000000e+00, %minnum ; fneg of the minnum, expected as v_max_f32 of -a and 0
+  store float %neg, float addrspace(1)* %out.addr
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_0_minnum_foldable_use_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_min_f32_e32 [[MIN:v[0-9]+]], 0, [[A]]
+; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MIN]], [[B]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_0_minnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %lane.i64 = sext i32 %lane to i64
+  %a.addr = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %lane.i64
+  %b.addr = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %lane.i64
+  %out.addr = getelementptr inbounds float, float addrspace(1)* %out, i64 %lane.i64
+  %a.val = load volatile float, float addrspace(1)* %a.addr
+  %b.val = load volatile float, float addrspace(1)* %b.addr
+  %minnum = call float @llvm.minnum.f32(float 0.0, float %a.val) ; +0.0 operand, the checks expect a plain v_min_f32 against 0
+  %neg = fsub float -0.000000e+00, %minnum ; fneg whose only user is the fmul below
+  %prod = fmul float %neg, %b.val ; the checks expect the negation as a source modifier on v_mul_f32
+  store float %prod, float addrspace(1)* %out.addr
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_minnum_multi_use_minnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_max_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]]
+; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
+; GCN-NEXT: buffer_store_dword [[MAX0]]
+; GCN-NEXT: buffer_store_dword [[MUL1]]
+define amdgpu_kernel void @v_fneg_minnum_multi_use_minnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %min = call float @llvm.minnum.f32(float %a, float %b) ; has two users, the fneg and the fmul below
+  %fneg = fsub float -0.000000e+00, %min ; fneg(minnum(a, b)), expected as v_max_f32 of -a and -b
+  %use1 = fmul float %min, 4.0 ; un-negated use, expected as the negated max times -4.0
+  store volatile float %fneg, float addrspace(1)* %out
+  store volatile float %use1, float addrspace(1)* %out
+  ret void
+}
+
+; --------------------------------------------------------------------------------
+; fmaxnum tests
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_fneg_maxnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[B]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %max = call float @llvm.maxnum.f32(float %a, float %b)
+  %fneg = fsub float -0.000000e+00, %max ; fneg(maxnum(a, b)), expected as one v_min_f32 of -a and -b
+  store float %fneg, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_self_maxnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -[[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_self_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %max = call float @llvm.maxnum.f32(float %a, float %a) ; both operands are the same loaded value
+  %max.fneg = fsub float -0.0, %max ; fneg of the maxnum, expected as v_min_f32 of -a and -a
+  store float %max.fneg, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_posk_maxnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], -4.0
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_posk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %max = call float @llvm.maxnum.f32(float 4.0, float %a) ; positive constant operand
+  %fneg = fsub float -0.000000e+00, %max ; fneg of the maxnum, expected as v_min_f32 of -a and -4.0
+  store float %fneg, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_negk_maxnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 4.0
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_negk_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %max = call float @llvm.maxnum.f32(float -4.0, float %a) ; negative constant operand
+  %fneg = fsub float -0.000000e+00, %max ; fneg of the maxnum, expected as v_min_f32 of -a and 4.0
+  store float %fneg, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_0_maxnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_max_f32_e32 [[RESULT:v[0-9]+]], 0, [[A]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %lane.i64 = sext i32 %lane to i64
+  %a.addr = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %lane.i64
+  %out.addr = getelementptr inbounds float, float addrspace(1)* %out, i64 %lane.i64
+  %a.val = load volatile float, float addrspace(1)* %a.addr
+  %maxnum = call float @llvm.maxnum.f32(float 0.0, float %a.val) ; +0.0 operand, the checks expect a plain v_max_f32 against 0
+  %neg = fsub float -0.000000e+00, %maxnum ; fneg of the maxnum
+  store float %neg, float addrspace(1)* %out.addr
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_neg0_maxnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_min_f32_e64 [[RESULT:v[0-9]+]], -[[A]], 0
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_neg0_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %lane.i64 = sext i32 %lane to i64
+  %a.addr = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %lane.i64
+  %out.addr = getelementptr inbounds float, float addrspace(1)* %out, i64 %lane.i64
+  %a.val = load volatile float, float addrspace(1)* %a.addr
+  %maxnum = call float @llvm.maxnum.f32(float -0.0, float %a.val) ; -0.0 operand
+  %neg = fsub float -0.000000e+00, %maxnum ; fneg of the maxnum, expected as v_min_f32 of -a and 0
+  store float %neg, float addrspace(1)* %out.addr
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_0_maxnum_foldable_use_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_max_f32_e32 [[MAX:v[0-9]+]], 0, [[A]]
+; GCN: v_mul_f32_e64 [[RESULT:v[0-9]+]], -[[MAX]], [[B]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @v_fneg_0_maxnum_foldable_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %lane = call i32 @llvm.amdgcn.workitem.id.x()
+  %lane.i64 = sext i32 %lane to i64
+  %a.addr = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %lane.i64
+  %b.addr = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %lane.i64
+  %out.addr = getelementptr inbounds float, float addrspace(1)* %out, i64 %lane.i64
+  %a.val = load volatile float, float addrspace(1)* %a.addr
+  %b.val = load volatile float, float addrspace(1)* %b.addr
+  %maxnum = call float @llvm.maxnum.f32(float 0.0, float %a.val) ; +0.0 operand, the checks expect a plain v_max_f32 against 0
+  %neg = fsub float -0.000000e+00, %maxnum ; fneg whose only user is the fmul below
+  %prod = fmul float %neg, %b.val ; the checks expect the negation as a source modifier on v_mul_f32
+  store float %prod, float addrspace(1)* %out.addr
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_maxnum_multi_use_maxnum_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_min_f32_e64 [[MAX0:v[0-9]+]], -[[A]], -[[B]]
+; GCN-NEXT: v_mul_f32_e32 [[MUL1:v[0-9]+]], -4.0, [[MAX0]]
+; GCN-NEXT: buffer_store_dword [[MAX0]]
+; GCN-NEXT: buffer_store_dword [[MUL1]]
+define amdgpu_kernel void @v_fneg_maxnum_multi_use_maxnum_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %max = call float @llvm.maxnum.f32(float %a, float %b) ; has two users, the fneg and the fmul below
+  %fneg = fsub float -0.000000e+00, %max ; fneg(maxnum(a, b)), expected as v_min_f32 of -a and -b
+  %use1 = fmul float %max, 4.0 ; un-negated use, expected as the negated result times -4.0
+  store volatile float %fneg, float addrspace(1)* %out
+  store volatile float %use1, float addrspace(1)* %out
+  ret void
+}
+
 ; --------------------------------------------------------------------------------
 ; fma tests
 ; --------------------------------------------------------------------------------
@@ -396,7 +693,7 @@ define void @v_fneg_mul_multi_use_fneg_x_f32(float addrspace(1)* %out, float add
 
 ; GCN-NSZ: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -420,7 +717,7 @@ define void @v_fneg_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr
 ; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
 ; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
 ; GCN-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -441,12 +738,17 @@ define void @v_fneg_fma_store_use_fma_f32(float addrspace(1)* %out, float addrsp
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
-; GCN-DAG: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]]
+
+; GCN-SAFE: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
+; GCN-SAFE: v_xor_b32_e32 [[NEG_FMA:v[0-9]+]], 0x80000000, [[FMA]]
+; GCN-SAFE: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[FMA]]
+
+; GCN-NSZ: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
+; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_FMA]]
+
 ; GCN-NEXT: buffer_store_dword [[NEG_FMA]]
 ; GCN-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -474,7 +776,7 @@ define void @v_fneg_fma_multi_use_fma_f32(float addrspace(1)* %out, float addrsp
 
 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -501,7 +803,7 @@ define void @v_fneg_fma_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1
 
 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -528,7 +830,7 @@ define void @v_fneg_fma_x_fneg_y_f32(float addrspace(1)* %out, float addrspace(1
 
 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -556,7 +858,7 @@ define void @v_fneg_fma_fneg_fneg_y_f32(float addrspace(1)* %out, float addrspac
 
 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], [[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -584,7 +886,7 @@ define void @v_fneg_fma_fneg_x_fneg_f32(float addrspace(1)* %out, float addrspac
 
 ; GCN-NSZ: v_fma_f32 [[FMA:v[0-9]+]], [[A]], -[[B]], [[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
-define void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -614,7 +916,7 @@ define void @v_fneg_fma_x_y_fneg_f32(float addrspace(1)* %out, float addrspace(1
 ; GCN-NSZ-DAG: v_fma_f32 [[FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[FMA]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_A]]
-define void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -644,7 +946,7 @@ define void @v_fneg_fma_store_use_fneg_x_y_f32(float addrspace(1)* %out, float a
 ; GCN-NSZ-DAG: v_fma_f32 [[NEG_FMA:v[0-9]+]], [[A]], [[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[NEG_FMA]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
+define amdgpu_kernel void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float %d) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -677,7 +979,7 @@ define void @v_fneg_fma_multi_use_fneg_x_y_f32(float addrspace(1)* %out, float a
 
 ; GCN-NSZ: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
 ; GCN-NSZ-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -697,12 +999,17 @@ define void @v_fneg_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.pt
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
-; GCN-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]]
-; GCN-DAG: v_xor_b32_e32 [[NEG_C:v[0-9]+]], 0x80000000, [[C]]
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]
-; GCN-NEXT: buffer_store_dword [[NEG_C]]
+
+; GCN-SAFE: v_mac_f32_e32 [[C]], [[B]], [[A]]
+; GCN-SAFE: v_xor_b32_e32 [[NEG_MAD:v[0-9]+]], 0x80000000, [[C]]
+; GCN-SAFE-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[C]]
+
+; GCN-NSZ: v_mad_f32 [[NEG_MAD:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; GCN-NSZ-NEXT: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[NEG_MAD]]
+
+; GCN: buffer_store_dword [[NEG_MAD]]
 ; GCN-NEXT: buffer_store_dword [[MUL]]
-define void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -728,7 +1035,7 @@ define void @v_fneg_fmad_multi_use_fmad_f32(float addrspace(1)* %out, float addr
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_cvt_f64_f32_e64 [[RESULT:v\[[0-9]+:[0-9]+\]]], -[[A]]
 ; GCN: buffer_store_dwordx2 [[RESULT]]
-define void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -744,7 +1051,7 @@ define void @v_fneg_fp_extend_f32_to_f64(double addrspace(1)* %out, float addrsp
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[A]]
 ; GCN: buffer_store_dwordx2 [[RESULT]]
-define void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -763,7 +1070,7 @@ define void @v_fneg_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float a
 ; GCN-DAG: v_xor_b32_e32 [[FNEG_A:v[0-9]+]], 0x80000000, [[A]]
 ; GCN: buffer_store_dwordx2 [[RESULT]]
 ; GCN: buffer_store_dword [[FNEG_A]]
-define void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -783,7 +1090,7 @@ define void @v_fneg_fp_extend_store_use_fneg_f32_to_f64(double addrspace(1)* %ou
 ; GCN-DAG: v_xor_b32_e32 v[[FNEG_A:[0-9]+]], 0x80000000, v[[CVT_HI]]
 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}
-define void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -803,7 +1110,7 @@ define void @v_fneg_multi_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %ou
 ; GCN-DAG: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], v{{\[}}[[CVT_LO]]:[[CVT_HI]]{{\]}}, 4.0
 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+}}:[[FNEG_A]]{{\]}}
 ; GCN: buffer_store_dwordx2 [[MUL]]
-define void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -819,7 +1126,7 @@ define void @v_fneg_multi_foldable_use_fp_extend_fneg_f32_to_f64(double addrspac
 
 ; FIXME: Source modifiers not folded for f16->f32
 ; GCN-LABEL: {{^}}v_fneg_multi_use_fp_extend_fneg_f16_to_f32:
-define void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
@@ -833,7 +1140,7 @@ define void @v_fneg_multi_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out
 }
 
 ; GCN-LABEL: {{^}}v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32:
-define void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds half, half addrspace(1)* %a.ptr, i64 %tid.ext
@@ -855,7 +1162,7 @@ define void @v_fneg_multi_foldable_use_fp_extend_fneg_f16_to_f32(float addrspace
 ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
 ; GCN: v_cvt_f32_f64_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -871,7 +1178,7 @@ define void @v_fneg_fp_round_f64_to_f32(float addrspace(1)* %out, double addrspa
 ; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
 ; GCN: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], [[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -888,10 +1195,9 @@ define void @v_fneg_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double ad
 ; GCN: {{buffer|flat}}_load_dwordx2 v{{\[}}[[A_LO:[0-9]+]]:[[A_HI:[0-9]+]]{{\]}}
 ; GCN-DAG: v_cvt_f32_f64_e32 [[RESULT:v[0-9]+]], v{{\[}}[[A_LO]]:[[A_HI]]{{\]}}
 ; GCN-DAG: v_xor_b32_e32 v[[NEG_A_HI:[0-9]+]], 0x80000000, v[[A_HI]]
-; GCN-DAG: v_mov_b32_e32 v[[NEG_A_LO:[0-9]+]], v[[A_LO]]
 ; GCN: buffer_store_dword [[RESULT]]
-; GCN: buffer_store_dwordx2 v{{\[}}[[NEG_A_LO]]:[[NEG_A_HI]]{{\]}}
-define void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+; GCN: buffer_store_dwordx2 v{{\[}}[[A_LO]]:[[NEG_A_HI]]{{\]}}
+define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -911,7 +1217,7 @@ define void @v_fneg_fp_round_store_use_fneg_f64_to_f32(float addrspace(1)* %out,
 ; GCN-DAG: v_mul_f64 [[USE1:v\[[0-9]+:[0-9]+\]]], -[[A]], s{{\[}}
 ; GCN: buffer_store_dword [[RESULT]]
 ; GCN: buffer_store_dwordx2 [[USE1]]
-define void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr, double %c) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -930,7 +1236,7 @@ define void @v_fneg_fp_round_multi_use_fneg_f64_to_f32(float addrspace(1)* %out,
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_cvt_f16_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_short [[RESULT]]
-define void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -946,7 +1252,7 @@ define void @v_fneg_fp_round_f32_to_f16(half addrspace(1)* %out, float addrspace
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_cvt_f16_f32_e32 [[RESULT:v[0-9]+]], [[A]]
 ; GCN: buffer_store_short [[RESULT]]
-define void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -965,7 +1271,7 @@ define void @v_fneg_fp_round_fneg_f32_to_f16(half addrspace(1)* %out, float addr
 ; GCN-DAG: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[CVT]]
 ; GCN: buffer_store_dword [[NEG]]
 ; GCN: buffer_store_dword [[CVT]]
-define void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out, double addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
@@ -984,7 +1290,7 @@ define void @v_fneg_multi_use_fp_round_fneg_f64_to_f32(float addrspace(1)* %out,
 ; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
 ; GCN: buffer_store_short [[RESULT]]
 ; GCN: buffer_store_dword [[NEG_A]]
-define void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1004,7 +1310,7 @@ define void @v_fneg_fp_round_store_use_fneg_f32_to_f16(half addrspace(1)* %out,
 ; GCN-DAG: v_mul_f32_e64 [[USE1:v[0-9]+]], -[[A]], s
 ; GCN: buffer_store_short [[RESULT]]
 ; GCN: buffer_store_dword [[USE1]]
-define void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1027,7 +1333,7 @@ define void @v_fneg_fp_round_multi_use_fneg_f32_to_f16(half addrspace(1)* %out,
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_rcp_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1043,7 +1349,7 @@ define void @v_fneg_rcp_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_rcp_f32_e32 [[RESULT:v[0-9]+]], [[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1062,7 +1368,7 @@ define void @v_fneg_rcp_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %
 ; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]]
 ; GCN: buffer_store_dword [[RESULT]]
 ; GCN: buffer_store_dword [[NEG_A]]
-define void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1082,7 +1388,7 @@ define void @v_fneg_rcp_store_use_fneg_f32(float addrspace(1)* %out, float addrs
 ; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
 ; GCN: buffer_store_dword [[RESULT]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float %c) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1105,7 +1411,7 @@ define void @v_fneg_rcp_multi_use_fneg_f32(float addrspace(1)* %out, float addrs
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_rcp_legacy_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1126,7 +1432,7 @@ define void @v_fneg_rcp_legacy_f32(float addrspace(1)* %out, float addrspace(1)*
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_legacy_f32_e64 [[RESULT:v[0-9]+]], [[A]], -[[B]]
 ; GCN-NEXT: buffer_store_dword [[RESULT]]
-define void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1147,7 +1453,7 @@ define void @v_fneg_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)*
 ; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]]
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
 ; GCN: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1170,7 +1476,7 @@ define void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addrspace(1)* %out
 ; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]]
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1191,7 +1497,7 @@ define void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1211,7 +1517,7 @@ define void @v_fneg_mul_legacy_fneg_x_f32(float addrspace(1)* %out, float addrsp
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1231,7 +1537,7 @@ define void @v_fneg_mul_legacy_x_fneg_f32(float addrspace(1)* %out, float addrsp
 ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
 ; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]]
 ; GCN-NEXT: buffer_store_dword [[ADD]]
-define void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1254,7 +1560,7 @@ define void @v_fneg_mul_legacy_fneg_fneg_f32(float addrspace(1)* %out, float add
 ; GCN-DAG: v_mul_legacy_f32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], [[B]], [[A]]
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
 ; GCN: buffer_store_dword [[NEG_A]]
-define void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1277,7 +1583,7 @@ define void @v_fneg_mul_legacy_store_use_fneg_x_f32(float addrspace(1)* %out, fl
 ; GCN-DAG: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}}
 ; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
+define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1304,7 +1610,7 @@ define void @v_fneg_mul_legacy_multi_use_fneg_x_f32(float addrspace(1)* %out, fl
 ; GCN: v_fract_f32_e32 [[FRACT:v[0-9]+]], [[MUL]]
 ; GCN: v_sin_f32_e32 [[RESULT:v[0-9]+]], [[FRACT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1320,7 +1626,7 @@ define void @v_fneg_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_sin_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1340,7 +1646,7 @@ define void @v_fneg_amdgcn_sin_f32(float addrspace(1)* %out, float addrspace(1)*
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_trunc_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1361,10 +1667,13 @@ define void @v_fneg_trunc_f32(float addrspace(1)* %out, float addrspace(1)* %a.p
 ; GCN: v_trunc_f32_e32
 ; GCN: v_subrev_f32_e32
 ; GCN: v_cndmask_b32
+
+; GCN-SAFE: v_add_f32_e32 [[ADD:v[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-SAFE: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, [[ADD]]
+
 ; GCN-NSZ: v_sub_f32_e64 [[RESULT:v[0-9]+]], -v{{[0-9]+}}, v{{[0-9]+}}
-; GCN-SAFE: v_xor_b32_e32 [[RESULT:v[0-9]+]], 0x80000000, v{{[0-9]+}}
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1384,7 +1693,7 @@ define void @v_fneg_round_f32(float addrspace(1)* %out, float addrspace(1)* %a.p
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1404,7 +1713,7 @@ define void @v_fneg_rint_f32(float addrspace(1)* %out, float addrspace(1)* %a.pt
 ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
 ; GCN: v_rndne_f32_e64 [[RESULT:v[0-9]+]], -[[A]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+define amdgpu_kernel void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
@@ -1416,6 +1725,391 @@ define void @v_fneg_nearbyint_f32(float addrspace(1)* %out, float addrspace(1)*
   ret void
 }
 
+; --------------------------------------------------------------------------------
+; vintrp tests
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_fneg_interp_p1_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
+; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]]
+; GCN: v_interp_p1_f32 v{{[0-9]+}}, [[MUL]]
+define amdgpu_kernel void @v_fneg_interp_p1_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %mul = fmul float %a, %b
+  %fneg = fsub float -0.0, %mul
+  %intrp0 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 0, i32 0, i32 0)
+  %intrp1 = call float @llvm.amdgcn.interp.p1(float %fneg, i32 1, i32 0, i32 0)
+  store volatile float %intrp0, float addrspace(1)* %out.gep
+  store volatile float %intrp1, float addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_interp_p2_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
+; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]]
+; GCN: v_interp_p2_f32 v{{[0-9]+}}, [[MUL]]
+define amdgpu_kernel void @v_fneg_interp_p2_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %mul = fmul float %a, %b
+  %fneg = fsub float -0.0, %mul
+  %intrp0 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 0, i32 0, i32 0)
+  %intrp1 = call float @llvm.amdgcn.interp.p2(float 4.0, float %fneg, i32 1, i32 0, i32 0)
+  store volatile float %intrp0, float addrspace(1)* %out.gep
+  store volatile float %intrp1, float addrspace(1)* %out.gep
+  ret void
+}
+
+; --------------------------------------------------------------------------------
+; CopyToReg tests
+; --------------------------------------------------------------------------------
+
+; GCN-LABEL: {{^}}v_fneg_copytoreg_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: v_mul_f32_e32 [[MUL0:v[0-9]+]], [[B]], [[A]]
+; GCN: s_cbranch_scc1
+
+; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x80000000, [[MUL0]]
+; GCN: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[XOR]]
+; GCN: buffer_store_dword [[MUL1]]
+
+; GCN: buffer_store_dword [[MUL0]]
+define amdgpu_kernel void @v_fneg_copytoreg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %c = load volatile float, float addrspace(1)* %c.gep
+  %mul = fmul float %a, %b
+  %fneg = fsub float -0.0, %mul
+  %cmp0 = icmp eq i32 %d, 0
+  br i1 %cmp0, label %if, label %endif
+
+if:
+  %mul1 = fmul float %fneg, %c
+  store volatile float %mul1, float addrspace(1)* %out.gep
+  br label %endif
+
+endif:
+  store volatile float %mul, float addrspace(1)* %out.gep
+  ret void
+}
+
+; --------------------------------------------------------------------------------
+; inlineasm tests
+; --------------------------------------------------------------------------------
+
+; Can't fold into use, so should fold into source
+; GCN-LABEL: {{^}}v_fneg_inlineasm_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
+; GCN: ; use [[MUL]]
+; GCN: buffer_store_dword [[MUL]]
+define amdgpu_kernel void @v_fneg_inlineasm_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %c = load volatile float, float addrspace(1)* %c.gep
+  %mul = fmul float %a, %b
+  %fneg = fsub float -0.0, %mul
+  call void asm sideeffect "; use $0", "v"(float %fneg) #0
+  store volatile float %fneg, float addrspace(1)* %out.gep
+  ret void
+}
+
+; --------------------------------------------------------------------------------
+; inlineasm tests (fneg source has multiple uses)
+; --------------------------------------------------------------------------------
+
+; Can't fold into use, and the source has another use, so the fneg is kept
+; GCN-LABEL: {{^}}v_fneg_inlineasm_multi_use_src_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
+; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[MUL]]
+; GCN: ; use [[NEG]]
+; GCN: buffer_store_dword [[MUL]]
+define amdgpu_kernel void @v_fneg_inlineasm_multi_use_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, i32 %d) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %c = load volatile float, float addrspace(1)* %c.gep
+  %mul = fmul float %a, %b
+  %fneg = fsub float -0.0, %mul
+  call void asm sideeffect "; use $0", "v"(float %fneg) #0
+  store volatile float %mul, float addrspace(1)* %out.gep
+  ret void
+}
+
+; --------------------------------------------------------------------------------
+; code size regression tests
+; --------------------------------------------------------------------------------
+
+; There are multiple users of the fneg that must use a VOP3
+; instruction, so there is no penalty
+; GCN-LABEL: {{^}}multiuse_fneg_2_vop3_users_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+
+; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], [[C]]
+; GCN-NEXT: v_fma_f32 [[FMA1:v[0-9]+]], -[[A]], [[C]], 2.0
+; GCN-NEXT:	buffer_store_dword [[FMA0]]
+; GCN-NEXT:	buffer_store_dword [[FMA1]]
+define amdgpu_kernel void @multiuse_fneg_2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %c = load volatile float, float addrspace(1)* %c.gep
+
+  %fneg.a = fsub float -0.0, %a
+  %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float %c)
+  %fma1 = call float @llvm.fma.f32(float %fneg.a, float %c, float 2.0)
+
+  store volatile float %fma0, float addrspace(1)* %out
+  store volatile float %fma1, float addrspace(1)* %out
+  ret void
+}
+
+; There are multiple users, but both require using a larger encoding
+; for the modifier.
+
+; GCN-LABEL: {{^}}multiuse_fneg_2_vop2_users_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+
+; GCN: v_mul_f32_e64 [[MUL0:v[0-9]+]], -[[A]], [[B]]
+; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
+; GCN-NEXT:	buffer_store_dword [[MUL0]]
+; GCN-NEXT:	buffer_store_dword [[MUL1]]
+define amdgpu_kernel void @multiuse_fneg_2_vop2_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %c = load volatile float, float addrspace(1)* %c.gep
+
+  %fneg.a = fsub float -0.0, %a
+  %mul0 = fmul float %fneg.a, %b
+  %mul1 = fmul float %fneg.a, %c
+
+  store volatile float %mul0, float addrspace(1)* %out
+  store volatile float %mul1, float addrspace(1)* %out
+  ret void
+}
+
+; One user is VOP3, so folding the modifier into it is free; the other is VOP2, so it is not.
+; GCN-LABEL: {{^}}multiuse_fneg_vop2_vop3_users_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+
+; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[A]], [[B]], 2.0
+; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[A]], [[C]]
+
+; GCN:	buffer_store_dword [[FMA0]]
+; GCN-NEXT:	buffer_store_dword [[MUL1]]
+define amdgpu_kernel void @multiuse_fneg_vop2_vop3_users_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %c = load volatile float, float addrspace(1)* %c.gep
+
+  %fneg.a = fsub float -0.0, %a
+  %fma0 = call float @llvm.fma.f32(float %fneg.a, float %b, float 2.0)
+  %mul1 = fmul float %fneg.a, %c
+
+  store volatile float %fma0, float addrspace(1)* %out
+  store volatile float %mul1, float addrspace(1)* %out
+  ret void
+}
+
+; The use of the fneg requires a code size increase, but folding into
+; the source does not
+
+; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
+
+; GCN-SAFE: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], [[B]], 2.0
+; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[FMA0]], [[C]]
+; GCN-SAFE-DAG: v_mul_f32_e64 [[MUL2:v[0-9]+]], -[[FMA0]], [[D]]
+
+; GCN-NSZ: v_fma_f32 [[FMA0:v[0-9]+]], [[A]], -[[B]], -2.0
+; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[C]], [[FMA0]]
+; GCN-NSZ-DAG: v_mul_f32_e32 [[MUL2:v[0-9]+]], [[D]], [[FMA0]]
+
+; GCN: buffer_store_dword [[MUL1]]
+; GCN-NEXT:	buffer_store_dword [[MUL2]]
+define amdgpu_kernel void @free_fold_src_code_size_cost_use_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %c = load volatile float, float addrspace(1)* %c.gep
+  %d = load volatile float, float addrspace(1)* %d.gep
+
+  %fma0 = call float @llvm.fma.f32(float %a, float %b, float 2.0)
+  %fneg.fma0 = fsub float -0.0, %fma0
+  %mul1 = fmul float %fneg.fma0, %c
+  %mul2 = fmul float %fneg.fma0, %d
+
+  store volatile float %mul1, float addrspace(1)* %out
+  store volatile float %mul2, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}free_fold_src_code_size_cost_use_f64:
+; GCN: {{buffer|flat}}_load_dwordx2 [[A:v\[[0-9]+:[0-9]+\]]]
+; GCN: {{buffer|flat}}_load_dwordx2 [[B:v\[[0-9]+:[0-9]+\]]]
+; GCN: {{buffer|flat}}_load_dwordx2 [[C:v\[[0-9]+:[0-9]+\]]]
+; GCN: {{buffer|flat}}_load_dwordx2 [[D:v\[[0-9]+:[0-9]+\]]]
+
+; GCN: v_fma_f64 [[FMA0:v\[[0-9]+:[0-9]+\]]], [[A]], [[B]], 2.0
+; GCN-DAG: v_mul_f64 [[MUL0:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[C]]
+; GCN-DAG: v_mul_f64 [[MUL1:v\[[0-9]+:[0-9]+\]]], -[[FMA0]], [[D]]
+
+; GCN: buffer_store_dwordx2 [[MUL0]]
+; GCN: buffer_store_dwordx2 [[MUL1]]
+define amdgpu_kernel void @free_fold_src_code_size_cost_use_f64(double addrspace(1)* %out, double addrspace(1)* %a.ptr, double addrspace(1)* %b.ptr, double addrspace(1)* %c.ptr, double addrspace(1)* %d.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds double, double addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds double, double addrspace(1)* %b.ptr, i64 %tid.ext
+  %c.gep = getelementptr inbounds double, double addrspace(1)* %c.ptr, i64 %tid.ext
+  %d.gep = getelementptr inbounds double, double addrspace(1)* %d.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds double, double addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile double, double addrspace(1)* %a.gep
+  %b = load volatile double, double addrspace(1)* %b.gep
+  %c = load volatile double, double addrspace(1)* %c.gep
+  %d = load volatile double, double addrspace(1)* %d.gep
+
+  %fma0 = call double @llvm.fma.f64(double %a, double %b, double 2.0)
+  %fneg.fma0 = fsub double -0.0, %fma0
+  %mul1 = fmul double %fneg.fma0, %c
+  %mul2 = fmul double %fneg.fma0, %d
+
+  store volatile double %mul1, double addrspace(1)* %out
+  store volatile double %mul2, double addrspace(1)* %out
+  ret void
+}
+
+; %trunc.a has one fneg use, but folding the fneg into it requires a
+; code size increase; the fneg can instead be folded for free into the fma.
+
+; GCN-LABEL: {{^}}one_use_cost_to_fold_into_src_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
+; GCN: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
+; GCN: buffer_store_dword [[FMA0]]
+define amdgpu_kernel void @one_use_cost_to_fold_into_src_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %c = load volatile float, float addrspace(1)* %c.gep
+  %d = load volatile float, float addrspace(1)* %d.gep
+
+  %trunc.a = call float @llvm.trunc.f32(float %a)
+  %trunc.fneg.a = fsub float -0.0, %trunc.a
+  %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
+  store volatile float %fma0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}multi_use_cost_to_fold_into_src:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[C:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[D:v[0-9]+]]
+; GCN: v_trunc_f32_e32 [[TRUNC_A:v[0-9]+]], [[A]]
+; GCN-DAG: v_fma_f32 [[FMA0:v[0-9]+]], -[[TRUNC_A]], [[B]], [[C]]
+; GCN-DAG: v_mul_f32_e32 [[MUL1:v[0-9]+]], [[D]], [[TRUNC_A]]
+; GCN: buffer_store_dword [[FMA0]]
+; GCN: buffer_store_dword [[MUL1]]
+define amdgpu_kernel void @multi_use_cost_to_fold_into_src(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float addrspace(1)* %c.ptr, float addrspace(1)* %d.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %c.gep = getelementptr inbounds float, float addrspace(1)* %c.ptr, i64 %tid.ext
+  %d.gep = getelementptr inbounds float, float addrspace(1)* %d.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %c = load volatile float, float addrspace(1)* %c.gep
+  %d = load volatile float, float addrspace(1)* %d.gep
+
+  %trunc.a = call float @llvm.trunc.f32(float %a)
+  %trunc.fneg.a = fsub float -0.0, %trunc.a
+  %fma0 = call float @llvm.fma.f32(float %trunc.fneg.a, float %b, float %c)
+  %mul1 = fmul float %trunc.a, %d
+  store volatile float %fma0, float addrspace(1)* %out
+  store volatile float %mul1, float addrspace(1)* %out
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 declare float @llvm.fma.f32(float, float, float) #1
 declare float @llvm.fmuladd.f32(float, float, float) #1
@@ -1424,11 +2118,17 @@ declare float @llvm.trunc.f32(float) #1
 declare float @llvm.round.f32(float) #1
 declare float @llvm.rint.f32(float) #1
 declare float @llvm.nearbyint.f32(float) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+
+declare double @llvm.fma.f64(double, double, double) #1
 
 declare float @llvm.amdgcn.sin.f32(float) #1
 declare float @llvm.amdgcn.rcp.f32(float) #1
 declare float @llvm.amdgcn.rcp.legacy(float) #1
 declare float @llvm.amdgcn.fmul.legacy(float, float) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
index d7d21311c1b9f409a316b81eb6f22983293f3ef5..555764c15519ead17f7b053554796af59b80055d 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.f16.ll
@@ -1,33 +1,35 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=FUNC %s
-; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=CIVI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GFX89 -check-prefix=GCN -check-prefix=CIVI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX89 -check-prefix=GFX9 -check-prefix=GCN %s
 
 ; GCN-LABEL: {{^}}fneg_fabs_fadd_f16:
 ; CI: v_cvt_f32_f16_e32
-; CI: v_cvt_f32_f16_e32
-; CI: v_sub_f32_e64 v{{[0-9]+}}, v{{[0-9]+}}, |v{{[0-9]+}}|
+; CI: v_cvt_f32_f16_e64 [[CVT_ABS_X:v[0-9]+]], |v{{[0-9]+}}|
+; CI: v_subrev_f32_e32 v{{[0-9]+}}, [[CVT_ABS_X]], v{{[0-9]+}}
 
-; VI-NOT: and
-; VI: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|
-define void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) {
+; GFX89-NOT: _and
+; GFX89: v_sub_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|
+define amdgpu_kernel void @fneg_fabs_fadd_f16(half addrspace(1)* %out, half %x, half %y) {
   %fabs = call half @llvm.fabs.f16(half %x)
-  %fsub = fsub half -0.000000e+00, %fabs
+  %fsub = fsub half -0.0, %fabs
   %fadd = fadd half %y, %fsub
   store half %fadd, half addrspace(1)* %out, align 2
   ret void
 }
 
 ; GCN-LABEL: {{^}}fneg_fabs_fmul_f16:
-; CI: v_cvt_f32_f16_e32
-; CI: v_cvt_f32_f16_e32
-; CI: v_mul_f32_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{v[0-9]+}}|
+; CI-DAG: v_cvt_f32_f16_e32
+; CI-DAG: v_cvt_f32_f16_e64 [[CVT_NEG_ABS_X:v[0-9]+]], -|{{v[0-9]+}}|
+; CI: v_mul_f32_e32 {{v[0-9]+}}, [[CVT_NEG_ABS_X]], {{v[0-9]+}}
 ; CI: v_cvt_f16_f32_e32
 
-; VI-NOT: and
-; VI: v_mul_f16_e64 {{v[0-9]+}}, {{v[0-9]+}}, -|{{v[0-9]+}}|
-; VI-NOT: and
-define void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) {
+; GFX89-NOT: _and
+; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], {{v[0-9]+}}, -|{{v[0-9]+}}|
+; GFX89-NOT: [[MUL]]
+; GFX89: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[MUL]]
+define amdgpu_kernel void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) {
   %fabs = call half @llvm.fabs.f16(half %x)
-  %fsub = fsub half -0.000000e+00, %fabs
+  %fsub = fsub half -0.0, %fabs
   %fmul = fmul half %y, %fsub
   store half %fmul, half addrspace(1)* %out, align 2
   ret void
@@ -39,75 +41,113 @@ define void @fneg_fabs_fmul_f16(half addrspace(1)* %out, half %x, half %y) {
 
 ; GCN-LABEL: {{^}}fneg_fabs_free_f16:
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
-define void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
+define amdgpu_kernel void @fneg_fabs_free_f16(half addrspace(1)* %out, i16 %in) {
   %bc = bitcast i16 %in to half
   %fabs = call half @llvm.fabs.f16(half %bc)
-  %fsub = fsub half -0.000000e+00, %fabs
+  %fsub = fsub half -0.0, %fabs
   store half %fsub, half addrspace(1)* %out
   ret void
 }
 
-; FIXME: Should use or
 ; GCN-LABEL: {{^}}fneg_fabs_f16:
-; CI: v_cvt_f32_f16_e32 v{{[0-9]+}},
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-
-; VI: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
-define void @fneg_fabs_f16(half addrspace(1)* %out, half %in) {
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
+define amdgpu_kernel void @fneg_fabs_f16(half addrspace(1)* %out, half %in) {
   %fabs = call half @llvm.fabs.f16(half %in)
-  %fsub = fsub half -0.000000e+00, %fabs
+  %fsub = fsub half -0.0, %fabs
   store half %fsub, half addrspace(1)* %out, align 2
   ret void
 }
 
 ; GCN-LABEL: {{^}}v_fneg_fabs_f16:
-; CI: v_cvt_f32_f16_e32 v{{[0-9]+}},
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-
-; VI: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
-define void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+; GCN: v_or_b32_e32 v{{[0-9]+}}, 0x8000, v{{[0-9]+}}
+define amdgpu_kernel void @v_fneg_fabs_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
   %val = load half, half addrspace(1)* %in, align 2
   %fabs = call half @llvm.fabs.f16(half %val)
-  %fsub = fsub half -0.000000e+00, %fabs
+  %fsub = fsub half -0.0, %fabs
   store half %fsub, half addrspace(1)* %out, align 2
   ret void
 }
 
 ; FIXME: single bit op
-; GCN-LABEL: {{^}}fneg_fabs_v2f16:
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-
-; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: flat_store_dword
-define void @fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
+; GCN-LABEL: {{^}}s_fneg_fabs_v2f16:
+; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
+; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; CIVI: flat_store_dword
+
+; GFX9: s_or_b32 s{{[0-9]+}}, 0x80008000, s{{[0-9]+}}
+define amdgpu_kernel void @s_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) {
   %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
-  %fsub = fsub <2 x half> <half -0.000000e+00, half -0.000000e+00>, %fabs
-  store <2 x half> %fsub, <2 x half> addrspace(1)* %out
+  %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
+  store <2 x half> %fneg.fabs, <2 x half> addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}fneg_fabs_v4f16:
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-; CI: v_cvt_f16_f32_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
-
-; VI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
-; VI: flat_store_dwordx2
-define void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
+; CIVI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
+; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+; CIVI: v_or_b32_e32 v{{[0-9]+}}, [[MASK]],
+
+; GFX9: s_mov_b32 [[MASK:s[0-9]+]], 0x80008000
+; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
+; GFX9: s_or_b32 s{{[0-9]+}}, [[MASK]], s{{[0-9]+}}
+
+; GCN: flat_store_dwordx2
+define amdgpu_kernel void @fneg_fabs_v4f16(<4 x half> addrspace(1)* %out, <4 x half> %in) {
   %fabs = call <4 x half> @llvm.fabs.v4f16(<4 x half> %in)
-  %fsub = fsub <4 x half> <half -0.000000e+00, half -0.000000e+00, half -0.000000e+00, half -0.000000e+00>, %fabs
+  %fsub = fsub <4 x half> <half -0.0, half -0.0, half -0.0, half -0.0>, %fabs
   store <4 x half> %fsub, <4 x half> addrspace(1)* %out
   ret void
 }
 
-declare half @llvm.fabs.f16(half) readnone
-declare <2 x half> @llvm.fabs.v2f16(<2 x half>) readnone
-declare <4 x half> @llvm.fabs.v4f16(<4 x half>) readnone
+; GCN-LABEL: {{^}}fold_user_fneg_fabs_v2f16:
+; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
+; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|
+; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+; CI: v_mul_f32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}
+
+; VI: v_mul_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|, 4.0
+; VI: v_mul_f16_e64 v{{[0-9]+}}, -|v{{[0-9]+}}|, 4.0
+
+; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
+; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], 4.0 neg_lo:[1,0] neg_hi:[1,0]
+define amdgpu_kernel void @fold_user_fneg_fabs_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
+  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
+  %fneg.fabs = fsub <2 x half> <half -0.0, half -0.0>, %fabs
+  %mul = fmul <2 x half> %fneg.fabs, <half 4.0, half 4.0>
+  store <2 x half> %mul, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_fneg_multi_use_fabs_v2f16:
+; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
+; GFX9: v_mov_b32_e32 [[VABS:v[0-9]+]], [[ABS]]
+; GFX9: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VABS]]
+define amdgpu_kernel void @s_fneg_multi_use_fabs_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) {
+  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
+  %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs
+  store <2 x half> %fabs, <2 x half> addrspace(1)* %out0
+  store <2 x half> %fneg, <2 x half> addrspace(1)* %out1
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_fneg_multi_use_fabs_foldable_neg_v2f16:
+; GFX9: s_and_b32 [[ABS:s[0-9]+]], s{{[0-9]+}}, 0x7fff7fff
+; GFX9: v_pk_mul_f16 v{{[0-9]+}}, [[ABS]], 4.0 neg_lo:[1,0] neg_hi:[1,0]
+define amdgpu_kernel void @s_fneg_multi_use_fabs_foldable_neg_v2f16(<2 x half> addrspace(1)* %out0, <2 x half> addrspace(1)* %out1, <2 x half> %in) {
+  %fabs = call <2 x half> @llvm.fabs.v2f16(<2 x half> %in)
+  %fneg = fsub <2 x half> <half -0.0, half -0.0>, %fabs
+  %mul = fmul <2 x half> %fneg, <half 4.0, half 4.0>
+  store <2 x half> %fabs, <2 x half> addrspace(1)* %out0
+  store <2 x half> %mul, <2 x half> addrspace(1)* %out1
+  ret void
+}
+
+declare half @llvm.fabs.f16(half) #1
+declare <2 x half> @llvm.fabs.v2f16(<2 x half>) #1
+declare <4 x half> @llvm.fabs.v4f16(<4 x half>) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.f64.ll b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
index d16e83fd4d5bf6ba98e3e2ca6b3eb762585ffd55..85f544032171c7275095bb413218b2d57b692aea 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.f64.ll
@@ -6,7 +6,7 @@
 
 ; GCN-LABEL: {{^}}fneg_fabs_fadd_f64:
 ; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, -|v{{\[[0-9]+:[0-9]+\]}}|, {{s\[[0-9]+:[0-9]+\]}}
-define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
+define amdgpu_kernel void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y) {
   %fabs = call double @llvm.fabs.f64(double %x)
   %fsub = fsub double -0.000000e+00, %fabs
   %fadd = fadd double %y, %fsub
@@ -14,7 +14,7 @@ define void @fneg_fabs_fadd_f64(double addrspace(1)* %out, double %x, double %y)
   ret void
 }
 
-define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) {
+define amdgpu_kernel void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)* %xptr, double addrspace(1)* %yptr) {
   %x = load double, double addrspace(1)* %xptr, align 8
   %y = load double, double addrspace(1)* %xptr, align 8
   %fabs = call double @llvm.fabs.f64(double %x)
@@ -26,7 +26,7 @@ define void @v_fneg_fabs_fadd_f64(double addrspace(1)* %out, double addrspace(1)
 
 ; GCN-LABEL: {{^}}fneg_fabs_fmul_f64:
 ; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -|{{v\[[0-9]+:[0-9]+\]}}|, {{s\[[0-9]+:[0-9]+\]}}
-define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) {
+define amdgpu_kernel void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y) {
   %fabs = call double @llvm.fabs.f64(double %x)
   %fsub = fsub double -0.000000e+00, %fabs
   %fmul = fmul double %y, %fsub
@@ -35,7 +35,7 @@ define void @fneg_fabs_fmul_f64(double addrspace(1)* %out, double %x, double %y)
 }
 
 ; GCN-LABEL: {{^}}fneg_fabs_free_f64:
-define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
   %bc = bitcast i64 %in to double
   %fabs = call double @llvm.fabs.f64(double %bc)
   %fsub = fsub double -0.000000e+00, %fabs
@@ -46,7 +46,7 @@ define void @fneg_fabs_free_f64(double addrspace(1)* %out, i64 %in) {
 ; GCN-LABEL: {{^}}fneg_fabs_fn_free_f64:
 ; GCN: v_bfrev_b32_e32 [[IMMREG:v[0-9]+]], 1{{$}}
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
   %bc = bitcast i64 %in to double
   %fabs = call double @fabs(double %bc)
   %fsub = fsub double -0.000000e+00, %fabs
@@ -62,7 +62,7 @@ define void @fneg_fabs_fn_free_f64(double addrspace(1)* %out, i64 %in) {
 ; GCN-DAG: v_or_b32_e32 v[[HI_V:[0-9]+]], s[[HI_X]], [[IMMREG]]
 ; GCN-DAG: v_mov_b32_e32 v[[LO_V:[0-9]+]], s[[LO_X]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_V]]:[[HI_V]]{{\]}}
-define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
   %fabs = call double @llvm.fabs.f64(double %in)
   %fsub = fsub double -0.000000e+00, %fabs
   store double %fsub, double addrspace(1)* %out, align 8
@@ -74,7 +74,7 @@ define void @fneg_fabs_f64(double addrspace(1)* %out, double %in) {
 ; GCN-NOT: 0x80000000
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
   %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in)
   %fsub = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %fabs
   store <2 x double> %fsub, <2 x double> addrspace(1)* %out
@@ -88,7 +88,7 @@ define void @fneg_fabs_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in)
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
 ; GCN: v_or_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, [[IMMREG]]
-define void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fneg_fabs_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
   %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in)
   %fsub = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %fabs
   store <4 x double> %fsub, <4 x double> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fneg-fabs.ll b/test/CodeGen/AMDGPU/fneg-fabs.ll
index 9ee1171306c724c70b8487ebbe805c38bdae6790..a0cf37b159dbbaedb0a543cd77c41d45f21d68e6 100644
--- a/test/CodeGen/AMDGPU/fneg-fabs.ll
+++ b/test/CodeGen/AMDGPU/fneg-fabs.ll
@@ -5,7 +5,7 @@
 ; FUNC-LABEL: {{^}}fneg_fabs_fadd_f32:
 ; SI-NOT: and
 ; SI: v_subrev_f32_e64 {{v[0-9]+}}, |{{v[0-9]+}}|, {{s[0-9]+}}
-define void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
+define amdgpu_kernel void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
   %fabs = call float @llvm.fabs.f32(float %x)
   %fsub = fsub float -0.000000e+00, %fabs
   %fadd = fadd float %y, %fsub
@@ -17,7 +17,7 @@ define void @fneg_fabs_fadd_f32(float addrspace(1)* %out, float %x, float %y) {
 ; SI-NOT: and
 ; SI: v_mul_f32_e64 {{v[0-9]+}}, -|{{v[0-9]+}}|, {{s[0-9]+}}
 ; SI-NOT: and
-define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
+define amdgpu_kernel void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
   %fabs = call float @llvm.fabs.f32(float %x)
   %fsub = fsub float -0.000000e+00, %fabs
   %fmul = fmul float %y, %fsub
@@ -35,7 +35,7 @@ define void @fneg_fabs_fmul_f32(float addrspace(1)* %out, float %x, float %y) {
 ; R600: -PV
 
 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
   %bc = bitcast i32 %in to float
   %fabs = call float @llvm.fabs.f32(float %bc)
   %fsub = fsub float -0.000000e+00, %fabs
@@ -49,7 +49,7 @@ define void @fneg_fabs_free_f32(float addrspace(1)* %out, i32 %in) {
 ; R600: -PV
 
 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
   %bc = bitcast i32 %in to float
   %fabs = call float @fabs(float %bc)
   %fsub = fsub float -0.000000e+00, %fabs
@@ -59,7 +59,7 @@ define void @fneg_fabs_fn_free_f32(float addrspace(1)* %out, i32 %in) {
 
 ; FUNC-LABEL: {{^}}fneg_fabs_f32:
 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x80000000
-define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
   %fabs = call float @llvm.fabs.f32(float %in)
   %fsub = fsub float -0.000000e+00, %fabs
   store float %fsub, float addrspace(1)* %out, align 4
@@ -68,7 +68,7 @@ define void @fneg_fabs_f32(float addrspace(1)* %out, float %in) {
 
 ; FUNC-LABEL: {{^}}v_fneg_fabs_f32:
 ; SI: v_or_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
-define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %val = load float, float addrspace(1)* %in, align 4
   %fabs = call float @llvm.fabs.f32(float %val)
   %fsub = fsub float -0.000000e+00, %fabs
@@ -86,7 +86,7 @@ define void @v_fneg_fabs_f32(float addrspace(1)* %out, float addrspace(1)* %in)
 ; SI: s_brev_b32 [[SIGNBITK:s[0-9]+]], 1{{$}}
 ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
-define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
   %fabs = call <2 x float> @llvm.fabs.v2f32(<2 x float> %in)
   %fsub = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %fabs
   store <2 x float> %fsub, <2 x float> addrspace(1)* %out
@@ -99,7 +99,7 @@ define void @fneg_fabs_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+}}, [[SIGNBITK]], v{{[0-9]+}}
-define void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @fneg_fabs_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
   %fabs = call <4 x float> @llvm.fabs.v4f32(<4 x float> %in)
   %fsub = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %fabs
   store <4 x float> %fsub, <4 x float> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fneg.f16.ll b/test/CodeGen/AMDGPU/fneg.f16.ll
index e3dfd9201a249279b977f61ecf205e6256794e17..626a0b50cce8ad58292c2cfc827f2062ff415c2d 100644
--- a/test/CodeGen/AMDGPU/fneg.f16.ll
+++ b/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -1,11 +1,11 @@
-; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
 
 ; FIXME: Should be able to do scalar op
-; FUNC-LABEL: {{^}}s_fneg_f16:
-
-define void @s_fneg_f16(half addrspace(1)* %out, half %in) {
-  %fneg = fsub half -0.000000e+00, %in
+; GCN-LABEL: {{^}}s_fneg_f16:
+define amdgpu_kernel void @s_fneg_f16(half addrspace(1)* %out, half %in) #0 {
+  %fneg = fsub half -0.0, %in
   store half %fneg, half addrspace(1)* %out
   ret void
 }
@@ -13,49 +13,123 @@ define void @s_fneg_f16(half addrspace(1)* %out, half %in) {
 ; FIXME: Should be able to use bit operations when illegal type as
 ; well.
 
-; FUNC-LABEL: {{^}}v_fneg_f16:
+; GCN-LABEL: {{^}}v_fneg_f16:
 ; GCN: flat_load_ushort [[VAL:v[0-9]+]],
-
-; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[VAL]]
-; CI: v_cvt_f16_f32_e64 [[CVT1:v[0-9]+]], -[[CVT0]]
-; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]]
-
-; VI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]]
+; GCN: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[VAL]]
 ; VI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
-define void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
-  %val = load half, half addrspace(1)* %in, align 2
-  %fneg = fsub half -0.000000e+00, %val
-  store half %fneg, half addrspace(1)* %out
+; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
+define amdgpu_kernel void @v_fneg_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.in = getelementptr inbounds half, half addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr inbounds half, half addrspace(1)* %out, i32 %tid
+  %val = load half, half addrspace(1)* %gep.in, align 2
+  %fneg = fsub half -0.0, %val
+  store half %fneg, half addrspace(1)* %gep.out
   ret void
 }
 
-; FUNC-LABEL: {{^}}fneg_free_f16:
+; GCN-LABEL: {{^}}fneg_free_f16:
 ; GCN: flat_load_ushort [[NEG_VALUE:v[0-9]+]],
 
 ; XCI: s_xor_b32 [[XOR:s[0-9]+]], [[NEG_VALUE]], 0x8000{{$}}
 ; CI: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[NEG_VALUE]]
 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[XOR]]
-define void @fneg_free_f16(half addrspace(1)* %out, i16 %in) {
+define amdgpu_kernel void @fneg_free_f16(half addrspace(1)* %out, i16 %in) #0 {
   %bc = bitcast i16 %in to half
   %fsub = fsub half -0.0, %bc
   store half %fsub, half addrspace(1)* %out
   ret void
 }
 
-; FUNC-LABEL: {{^}}v_fneg_fold_f16:
+; GCN-LABEL: {{^}}v_fneg_fold_f16:
 ; GCN: flat_load_ushort [[NEG_VALUE:v[0-9]+]]
 
-; CI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[CVT0]]
-; CI: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[CVT0]], [[CVT0]]
+; CI-DAG: v_cvt_f32_f16_e32 [[CVT_VAL:v[0-9]+]], [[NEG_VALUE]]
+; CI-DAG: v_cvt_f32_f16_e64 [[NEG_CVT0:v[0-9]+]], -[[NEG_VALUE]]
+; CI: v_mul_f32_e32 [[MUL:v[0-9]+]], [[CVT_VAL]], [[NEG_CVT0]]
 ; CI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[MUL]]
 ; CI: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, [[CVT1]]
 
 ; VI-NOT: [[NEG_VALUE]]
 ; VI: v_mul_f16_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
-define void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @v_fneg_fold_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %val = load half, half addrspace(1)* %in
   %fsub = fsub half -0.0, %val
   %fmul = fmul half %fsub, %val
   store half %fmul, half addrspace(1)* %out
   ret void
 }
+
+; FIXME: Terrible code with VI and even worse with SI/CI
+; GCN-LABEL: {{^}}s_fneg_v2f16:
+; CI: s_mov_b32 [[MASK:s[0-9]+]], 0x8000{{$}}
+; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_xor_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; CI: v_or_b32_e32
+
+; VI: v_mov_b32_e32 [[MASK:v[0-9]+]], 0x8000{{$}}
+; VI-DAG: v_xor_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-DAG: v_xor_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[MASK]]
+
+; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, v{{[0-9]+}}
+
+define amdgpu_kernel void @s_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in) #0 {
+  %fneg = fsub <2 x half> <half -0.0, half -0.0>, %in
+  store <2 x half> %fneg, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_v2f16:
+; GCN: flat_load_dword [[VAL:v[0-9]+]]
+; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VAL]]
+define amdgpu_kernel void @v_fneg_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.in = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i32 %tid
+  %gep.out = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+  %val = load <2 x half>, <2 x half> addrspace(1)* %gep.in, align 2
+  %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
+  store <2 x half> %fneg, <2 x half> addrspace(1)* %gep.out
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_free_v2f16:
+; GCN: s_load_dword [[VAL:s[0-9]+]]
+; CIVI: s_xor_b32 s{{[0-9]+}}, [[VAL]], 0x80008000
+
+; GFX9: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
+; GFX9: v_xor_b32_e32 v{{[0-9]+}}, 0x80008000, [[VVAL]]
+define amdgpu_kernel void @fneg_free_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
+  %bc = bitcast i32 %in to <2 x half>
+  %fsub = fsub <2 x half> <half -0.0, half -0.0>, %bc
+  store <2 x half> %fsub, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_fneg_fold_v2f16:
+; GCN: flat_load_dword [[VAL:v[0-9]+]]
+
+; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}
+; CI: v_cvt_f32_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}
+; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_cvt_f16_f32
+; CI: v_mul_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_cvt_f16_f32
+
+; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_mul_f16_e64 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}
+
+; GFX9: v_pk_mul_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,0] neg_hi:[1,0]{{$}}
+define amdgpu_kernel void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+  %val = load <2 x half>, <2 x half> addrspace(1)* %in
+  %fsub = fsub <2 x half> <half -0.0, half -0.0>, %val
+  %fmul = fmul <2 x half> %fsub, %val
+  store <2 x half> %fmul, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fneg.f64.ll b/test/CodeGen/AMDGPU/fneg.f64.ll
index b7080f4622a3b9127d1a0470f3c5ec8f9f0e40fc..9b4b4d6e942aac602a3c9c1da040299629a903ff 100644
--- a/test/CodeGen/AMDGPU/fneg.f64.ll
+++ b/test/CodeGen/AMDGPU/fneg.f64.ll
@@ -3,7 +3,7 @@
 
 ; FUNC-LABEL: {{^}}fneg_f64:
 ; GCN: v_xor_b32
-define void @fneg_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fneg_f64(double addrspace(1)* %out, double %in) {
   %fneg = fsub double -0.000000e+00, %in
   store double %fneg, double addrspace(1)* %out
   ret void
@@ -12,7 +12,7 @@ define void @fneg_f64(double addrspace(1)* %out, double %in) {
 ; FUNC-LABEL: {{^}}fneg_v2f64:
 ; GCN: v_xor_b32
 ; GCN: v_xor_b32
-define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) {
+define amdgpu_kernel void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double> %in) {
   %fneg = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %in
   store <2 x double> %fneg, <2 x double> addrspace(1)* %out
   ret void
@@ -28,7 +28,7 @@ define void @fneg_v2f64(<2 x double> addrspace(1)* nocapture %out, <2 x double>
 ; GCN: v_xor_b32
 ; GCN: v_xor_b32
 ; GCN: v_xor_b32
-define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) {
+define amdgpu_kernel void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double> %in) {
   %fneg = fsub <4 x double> <double -0.000000e+00, double -0.000000e+00, double -0.000000e+00, double -0.000000e+00>, %in
   store <4 x double> %fneg, <4 x double> addrspace(1)* %out
   ret void
@@ -40,7 +40,7 @@ define void @fneg_v4f64(<4 x double> addrspace(1)* nocapture %out, <4 x double>
 
 ; FUNC-LABEL: {{^}}fneg_free_f64:
 ; GCN: v_add_f64 {{v\[[0-9]+:[0-9]+\]}}, -{{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
   %bc = bitcast i64 %in to double
   %fsub = fsub double 0.0, %bc
   store double %fsub, double addrspace(1)* %out
@@ -52,7 +52,7 @@ define void @fneg_free_f64(double addrspace(1)* %out, i64 %in) {
 ; VI: s_load_dwordx2 [[NEG_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN-NOT: xor
 ; GCN: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, -[[NEG_VALUE]], [[NEG_VALUE]]
-define void @fneg_fold_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fneg_fold_f64(double addrspace(1)* %out, double %in) {
   %fsub = fsub double -0.0, %in
   %fmul = fmul double %fsub, %in
   store double %fmul, double addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fneg.ll b/test/CodeGen/AMDGPU/fneg.ll
index 007c6dcadd9e6a4d22eea023f7e918c830743c89..d1eabfb13c9af733d2edf9cd01a2d15da40b7d04 100644
--- a/test/CodeGen/AMDGPU/fneg.ll
+++ b/test/CodeGen/AMDGPU/fneg.ll
@@ -6,7 +6,7 @@
 ; R600: -PV
 
 ; GCN: v_xor_b32
-define void @s_fneg_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @s_fneg_f32(float addrspace(1)* %out, float %in) {
   %fneg = fsub float -0.000000e+00, %in
   store float %fneg, float addrspace(1)* %out
   ret void
@@ -18,7 +18,7 @@ define void @s_fneg_f32(float addrspace(1)* %out, float %in) {
 
 ; GCN: v_xor_b32
 ; GCN: v_xor_b32
-define void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
+define amdgpu_kernel void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) {
   %fneg = fsub <2 x float> <float -0.000000e+00, float -0.000000e+00>, %in
   store <2 x float> %fneg, <2 x float> addrspace(1)* %out
   ret void
@@ -34,7 +34,7 @@ define void @s_fneg_v2f32(<2 x float> addrspace(1)* nocapture %out, <2 x float>
 ; GCN: v_xor_b32
 ; GCN: v_xor_b32
 ; GCN: v_xor_b32
-define void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
+define amdgpu_kernel void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) {
   %fneg = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %in
   store <4 x float> %fneg, <4 x float> addrspace(1)* %out
   ret void
@@ -50,7 +50,7 @@ define void @s_fneg_v4f32(<4 x float> addrspace(1)* nocapture %out, <4 x float>
 
 ; R600-NOT: XOR
 ; R600: -KC0[2].Z
-define void @fsub0_f32(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fsub0_f32(float addrspace(1)* %out, i32 %in) {
   %bc = bitcast i32 %in to float
   %fsub = fsub float 0.0, %bc
   store float %fsub, float addrspace(1)* %out
@@ -66,7 +66,7 @@ define void @fsub0_f32(float addrspace(1)* %out, i32 %in) {
 
 ; R600-NOT: XOR
 ; R600: -PV.W
-define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @fneg_free_f32(float addrspace(1)* %out, i32 %in) {
   %bc = bitcast i32 %in to float
   %fsub = fsub float -0.0, %bc
   store float %fsub, float addrspace(1)* %out
@@ -78,7 +78,7 @@ define void @fneg_free_f32(float addrspace(1)* %out, i32 %in) {
 ; VI: s_load_dword [[NEG_VALUE:s[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0x2c
 ; GCN-NOT: xor
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[NEG_VALUE]], [[NEG_VALUE]]
-define void @fneg_fold_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fneg_fold_f32(float addrspace(1)* %out, float %in) {
   %fsub = fsub float -0.0, %in
   %fmul = fmul float %fsub, %in
   store float %fmul, float addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fold-cndmask.mir b/test/CodeGen/AMDGPU/fold-cndmask.mir
new file mode 100644
index 0000000000000000000000000000000000000000..8dfec91663038b3eafb44994cf754c5f6b625bf3
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fold-cndmask.mir
@@ -0,0 +1,34 @@
+# RUN: llc -march=amdgcn -run-pass si-fold-operands -verify-machineinstrs -o - %s | FileCheck %s
+
+# CHECK: %1 = V_MOV_B32_e32 0, implicit %exec
+# CHECK: %2 = V_MOV_B32_e32 0, implicit %exec
+# CHECK: %4 = COPY %3
+# CHECK: %5 = V_MOV_B32_e32 0, implicit %exec
+# CHECK: %6 = V_MOV_B32_e32 0, implicit %exec
+# CHECK: %7 = COPY %3
+
+---
+name:            fold_cndmask
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_64 }
+  - { id: 1, class: vgpr_32 }
+  - { id: 2, class: vgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: vgpr_32 }
+  - { id: 6, class: vgpr_32 }
+  - { id: 7, class: vgpr_32 }
+body:             |
+  bb.0.entry:
+    %0 = IMPLICIT_DEF
+    %1 = V_CNDMASK_B32_e64 0, 0, %0, implicit %exec
+    %2 = V_CNDMASK_B32_e64 %1, %1, %0, implicit %exec
+    %3 = IMPLICIT_DEF
+    %4 = V_CNDMASK_B32_e64 %3, %3, %0, implicit %exec
+    %5 = COPY %1
+    %6 = V_CNDMASK_B32_e64 %5, 0, %0, implicit %exec
+    %vcc = IMPLICIT_DEF
+    %7 = V_CNDMASK_B32_e32 %3, %3, implicit %exec, implicit %vcc
+
+...
diff --git a/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir b/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
new file mode 100644
index 0000000000000000000000000000000000000000..986c6b296c962e945498bc6e09c28f2cbd4f8bc2
--- /dev/null
+++ b/test/CodeGen/AMDGPU/fold-immediate-output-mods.mir
@@ -0,0 +1,306 @@
+# RUN: llc -march=amdgcn -run-pass peephole-opt -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s
+
+--- |
+  define amdgpu_kernel void @no_fold_imm_madak_mac_clamp_f32() #0 {
+    ret void
+  }
+
+  define amdgpu_kernel void @no_fold_imm_madak_mac_omod_f32() #0 {
+    ret void
+  }
+
+  define amdgpu_kernel void @no_fold_imm_madak_mad_clamp_f32() #0 {
+    ret void
+  }
+
+  define amdgpu_kernel void @no_fold_imm_madak_mad_omod_f32() #0 {
+    ret void
+  }
+
+  attributes #0 = { nounwind }
+
+...
+---
+# GCN-LABEL: name: no_fold_imm_madak_mac_clamp_f32
+# GCN: %23 = V_MOV_B32_e32 1090519040, implicit %exec
+# GCN-NEXT: %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit %exec
+
+name:            no_fold_imm_madak_mac_clamp_f32
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_64 }
+  - { id: 1, class: sreg_32_xm0 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64_xexec }
+  - { id: 5, class: sreg_64_xexec }
+  - { id: 6, class: sreg_64_xexec }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32_xm0 }
+  - { id: 10, class: sreg_64 }
+  - { id: 11, class: sreg_32_xm0 }
+  - { id: 12, class: sreg_32_xm0 }
+  - { id: 13, class: sgpr_64 }
+  - { id: 14, class: sgpr_128 }
+  - { id: 15, class: sreg_32_xm0 }
+  - { id: 16, class: sreg_64 }
+  - { id: 17, class: sgpr_128 }
+  - { id: 18, class: sgpr_128 }
+  - { id: 19, class: vgpr_32 }
+  - { id: 20, class: vreg_64 }
+  - { id: 21, class: vgpr_32 }
+  - { id: 22, class: vreg_64 }
+  - { id: 23, class: vgpr_32 }
+  - { id: 24, class: vgpr_32 }
+  - { id: 25, class: vgpr_32 }
+  - { id: 26, class: vreg_64 }
+  - { id: 27, class: vgpr_32 }
+  - { id: 28, class: vreg_64 }
+  - { id: 29, class: vreg_64 }
+liveins:
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+  - { reg: '%vgpr0', virtual-reg: '%3' }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: %sgpr0_sgpr1, %vgpr0
+
+    %3 = COPY %vgpr0
+    %0 = COPY %sgpr0_sgpr1
+    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+    %28 = REG_SEQUENCE %3, 1, %27, 2
+    %11 = S_MOV_B32 61440
+    %12 = S_MOV_B32 0
+    %13 = REG_SEQUENCE killed %12, 1, killed %11, 2
+    %14 = REG_SEQUENCE killed %5, 17, %13, 18
+    %15 = S_MOV_B32 2
+    %29 = V_LSHL_B64 killed %28, killed %15, implicit %exec
+    %17 = REG_SEQUENCE killed %6, 17, %13, 18
+    %18 = REG_SEQUENCE killed %4, 17, %13, 18
+    %20 = COPY %29
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, implicit %exec
+    %22 = COPY %29
+    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, implicit %exec
+    %23 = V_MOV_B32_e32 1090519040, implicit %exec
+    %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit %exec
+    %26 = COPY %29
+    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, implicit %exec
+    S_ENDPGM
+
+...
+---
+# GCN-LABEL: name: no_fold_imm_madak_mac_omod_f32
+# GCN: %23 = V_MOV_B32_e32 1090519040, implicit %exec
+# GCN: %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 0, 2, implicit %exec
+
+name:            no_fold_imm_madak_mac_omod_f32
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_64 }
+  - { id: 1, class: sreg_32_xm0 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64_xexec }
+  - { id: 5, class: sreg_64_xexec }
+  - { id: 6, class: sreg_64_xexec }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32_xm0 }
+  - { id: 10, class: sreg_64 }
+  - { id: 11, class: sreg_32_xm0 }
+  - { id: 12, class: sreg_32_xm0 }
+  - { id: 13, class: sgpr_64 }
+  - { id: 14, class: sgpr_128 }
+  - { id: 15, class: sreg_32_xm0 }
+  - { id: 16, class: sreg_64 }
+  - { id: 17, class: sgpr_128 }
+  - { id: 18, class: sgpr_128 }
+  - { id: 19, class: vgpr_32 }
+  - { id: 20, class: vreg_64 }
+  - { id: 21, class: vgpr_32 }
+  - { id: 22, class: vreg_64 }
+  - { id: 23, class: vgpr_32 }
+  - { id: 24, class: vgpr_32 }
+  - { id: 25, class: vgpr_32 }
+  - { id: 26, class: vreg_64 }
+  - { id: 27, class: vgpr_32 }
+  - { id: 28, class: vreg_64 }
+  - { id: 29, class: vreg_64 }
+liveins:
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+  - { reg: '%vgpr0', virtual-reg: '%3' }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: %sgpr0_sgpr1, %vgpr0
+
+    %3 = COPY %vgpr0
+    %0 = COPY %sgpr0_sgpr1
+    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+    %28 = REG_SEQUENCE %3, 1, %27, 2
+    %11 = S_MOV_B32 61440
+    %12 = S_MOV_B32 0
+    %13 = REG_SEQUENCE killed %12, 1, killed %11, 2
+    %14 = REG_SEQUENCE killed %5, 17, %13, 18
+    %15 = S_MOV_B32 2
+    %29 = V_LSHL_B64 killed %28, killed %15, implicit %exec
+    %17 = REG_SEQUENCE killed %6, 17, %13, 18
+    %18 = REG_SEQUENCE killed %4, 17, %13, 18
+    %20 = COPY %29
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, implicit %exec
+    %22 = COPY %29
+    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, implicit %exec
+    %23 = V_MOV_B32_e32 1090519040, implicit %exec
+    %24 = V_MAC_F32_e64 0, killed %19, 0, killed %21, 0, %23, 0, 2, implicit %exec
+    %26 = COPY %29
+    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, implicit %exec
+    S_ENDPGM
+
+...
+---
+# GCN-LABEL: name: no_fold_imm_madak_mad_clamp_f32
+# GCN: %23 = V_MOV_B32_e32 1090519040, implicit %exec
+# GCN: %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit %exec
+
+name:            no_fold_imm_madak_mad_clamp_f32
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_64 }
+  - { id: 1, class: sreg_32_xm0 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64_xexec }
+  - { id: 5, class: sreg_64_xexec }
+  - { id: 6, class: sreg_64_xexec }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32_xm0 }
+  - { id: 10, class: sreg_64 }
+  - { id: 11, class: sreg_32_xm0 }
+  - { id: 12, class: sreg_32_xm0 }
+  - { id: 13, class: sgpr_64 }
+  - { id: 14, class: sgpr_128 }
+  - { id: 15, class: sreg_32_xm0 }
+  - { id: 16, class: sreg_64 }
+  - { id: 17, class: sgpr_128 }
+  - { id: 18, class: sgpr_128 }
+  - { id: 19, class: vgpr_32 }
+  - { id: 20, class: vreg_64 }
+  - { id: 21, class: vgpr_32 }
+  - { id: 22, class: vreg_64 }
+  - { id: 23, class: vgpr_32 }
+  - { id: 24, class: vgpr_32 }
+  - { id: 25, class: vgpr_32 }
+  - { id: 26, class: vreg_64 }
+  - { id: 27, class: vgpr_32 }
+  - { id: 28, class: vreg_64 }
+  - { id: 29, class: vreg_64 }
+liveins:
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+  - { reg: '%vgpr0', virtual-reg: '%3' }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: %sgpr0_sgpr1, %vgpr0
+
+    %3 = COPY %vgpr0
+    %0 = COPY %sgpr0_sgpr1
+    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+    %28 = REG_SEQUENCE %3, 1, %27, 2
+    %11 = S_MOV_B32 61440
+    %12 = S_MOV_B32 0
+    %13 = REG_SEQUENCE killed %12, 1, killed %11, 2
+    %14 = REG_SEQUENCE killed %5, 17, %13, 18
+    %15 = S_MOV_B32 2
+    %29 = V_LSHL_B64 killed %28, killed %15, implicit %exec
+    %17 = REG_SEQUENCE killed %6, 17, %13, 18
+    %18 = REG_SEQUENCE killed %4, 17, %13, 18
+    %20 = COPY %29
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, implicit %exec
+    %22 = COPY %29
+    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, implicit %exec
+    %23 = V_MOV_B32_e32 1090519040, implicit %exec
+    %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 1, 0, implicit %exec
+    %26 = COPY %29
+    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, implicit %exec
+    S_ENDPGM
+
+...
+---
+# GCN-LABEL: name: no_fold_imm_madak_mad_omod_f32
+# GCN: %23 = V_MOV_B32_e32 1090519040, implicit %exec
+# GCN: %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 0, 1, implicit %exec
+
+name:            no_fold_imm_madak_mad_omod_f32
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sgpr_64 }
+  - { id: 1, class: sreg_32_xm0 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64_xexec }
+  - { id: 5, class: sreg_64_xexec }
+  - { id: 6, class: sreg_64_xexec }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32 }
+  - { id: 9, class: sreg_32_xm0 }
+  - { id: 10, class: sreg_64 }
+  - { id: 11, class: sreg_32_xm0 }
+  - { id: 12, class: sreg_32_xm0 }
+  - { id: 13, class: sgpr_64 }
+  - { id: 14, class: sgpr_128 }
+  - { id: 15, class: sreg_32_xm0 }
+  - { id: 16, class: sreg_64 }
+  - { id: 17, class: sgpr_128 }
+  - { id: 18, class: sgpr_128 }
+  - { id: 19, class: vgpr_32 }
+  - { id: 20, class: vreg_64 }
+  - { id: 21, class: vgpr_32 }
+  - { id: 22, class: vreg_64 }
+  - { id: 23, class: vgpr_32 }
+  - { id: 24, class: vgpr_32 }
+  - { id: 25, class: vgpr_32 }
+  - { id: 26, class: vreg_64 }
+  - { id: 27, class: vgpr_32 }
+  - { id: 28, class: vreg_64 }
+  - { id: 29, class: vreg_64 }
+liveins:
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+  - { reg: '%vgpr0', virtual-reg: '%3' }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: %sgpr0_sgpr1, %vgpr0
+
+    %3 = COPY %vgpr0
+    %0 = COPY %sgpr0_sgpr1
+    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %5 = S_LOAD_DWORDX2_IMM %0, 11, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %6 = S_LOAD_DWORDX2_IMM %0, 13, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %27 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+    %28 = REG_SEQUENCE %3, 1, %27, 2
+    %11 = S_MOV_B32 61440
+    %12 = S_MOV_B32 0
+    %13 = REG_SEQUENCE killed %12, 1, killed %11, 2
+    %14 = REG_SEQUENCE killed %5, 17, %13, 18
+    %15 = S_MOV_B32 2
+    %29 = V_LSHL_B64 killed %28, killed %15, implicit %exec
+    %17 = REG_SEQUENCE killed %6, 17, %13, 18
+    %18 = REG_SEQUENCE killed %4, 17, %13, 18
+    %20 = COPY %29
+    %19 = BUFFER_LOAD_DWORD_ADDR64 %20, killed %14, 0, 0, 0, 0, 0, implicit %exec
+    %22 = COPY %29
+    %21 = BUFFER_LOAD_DWORD_ADDR64 %22, killed %17, 0, 0, 0, 0, 0, implicit %exec
+    %23 = V_MOV_B32_e32 1090519040, implicit %exec
+    %24 = V_MAD_F32 0, killed %19, 0, killed %21, 0, %23, 0, 1, implicit %exec
+    %26 = COPY %29
+    BUFFER_STORE_DWORD_ADDR64 killed %24, %26, killed %18, 0, 0, 0, 0, 0, implicit %exec
+    S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/fp-classify.ll b/test/CodeGen/AMDGPU/fp-classify.ll
index b7ffaed70c5a758ed05b1db656a0e6d25d5ddd18..cbc42979f2ee54d2db1602e593519da7be4de83e 100644
--- a/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/test/CodeGen/AMDGPU/fp-classify.ll
@@ -9,7 +9,7 @@ declare double @llvm.fabs.f64(double) #1
 ; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]]
 ; SI-NOT: v_cmp
 ; SI: s_endpgm
-define void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 {
   %fabs = tail call float @llvm.fabs.f32(float %x) #1
   %cmp = fcmp oeq float %fabs, 0x7FF0000000000000
   %ext = zext i1 %cmp to i32
@@ -20,7 +20,7 @@ define void @test_isinf_pattern(i32 addrspace(1)* nocapture %out, float %x) #0 {
 ; SI-LABEL: {{^}}test_not_isinf_pattern_0:
 ; SI-NOT: v_cmp_class
 ; SI: s_endpgm
-define void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
   %fabs = tail call float @llvm.fabs.f32(float %x) #1
   %cmp = fcmp ueq float %fabs, 0x7FF0000000000000
   %ext = zext i1 %cmp to i32
@@ -31,7 +31,7 @@ define void @test_not_isinf_pattern_0(i32 addrspace(1)* nocapture %out, float %x
 ; SI-LABEL: {{^}}test_not_isinf_pattern_1:
 ; SI-NOT: v_cmp_class
 ; SI: s_endpgm
-define void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 {
   %fabs = tail call float @llvm.fabs.f32(float %x) #1
   %cmp = fcmp oeq float %fabs, 0xFFF0000000000000
   %ext = zext i1 %cmp to i32
@@ -45,7 +45,7 @@ define void @test_not_isinf_pattern_1(i32 addrspace(1)* nocapture %out, float %x
 ; SI: v_cmp_class_f32_e32 vcc, s{{[0-9]+}}, [[MASK]]
 ; SI-NOT: v_cmp
 ; SI: s_endpgm
-define void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
   %ord = fcmp ord float %x, 0.000000e+00
   %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
   %ninf = fcmp une float %x.fabs, 0x7FF0000000000000
@@ -59,7 +59,7 @@ define void @test_isfinite_pattern_0(i32 addrspace(1)* nocapture %out, float %x)
 ; SI-LABEL: {{^}}test_isfinite_not_pattern_0:
 ; SI-NOT: v_cmp_class_f32
 ; SI: s_endpgm
-define void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float %x) #0 {
   %ord = fcmp ord float %x, 0.000000e+00
   %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
   %ninf = fcmp une float %x.fabs, 0xFFF0000000000000
@@ -73,7 +73,7 @@ define void @test_isfinite_not_pattern_0(i32 addrspace(1)* nocapture %out, float
 ; SI-LABEL: {{^}}test_isfinite_not_pattern_1:
 ; SI-NOT: v_cmp_class_f32
 ; SI: s_endpgm
-define void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float %x) #0 {
   %ord = fcmp ord float %x, 0.000000e+00
   %ninf = fcmp une float %x, 0x7FF0000000000000
   %and = and i1 %ord, %ninf
@@ -86,7 +86,7 @@ define void @test_isfinite_not_pattern_1(i32 addrspace(1)* nocapture %out, float
 ; SI-LABEL: {{^}}test_isfinite_not_pattern_2:
 ; SI-NOT: v_cmp_class_f32
 ; SI: s_endpgm
-define void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float %x, float %y) #0 {
+define amdgpu_kernel void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float %x, float %y) #0 {
   %ord = fcmp ord float %x, 0.000000e+00
   %x.fabs = tail call float @llvm.fabs.f32(float %y) #1
   %ninf = fcmp une float %x.fabs, 0x7FF0000000000000
@@ -100,7 +100,7 @@ define void @test_isfinite_not_pattern_2(i32 addrspace(1)* nocapture %out, float
 ; SI-LABEL: {{^}}test_isfinite_not_pattern_3:
 ; SI-NOT: v_cmp_class_f32
 ; SI: s_endpgm
-define void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float %x) #0 {
   %ord = fcmp uno float %x, 0.000000e+00
   %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
   %ninf = fcmp une float %x.fabs, 0x7FF0000000000000
@@ -114,7 +114,7 @@ define void @test_isfinite_not_pattern_3(i32 addrspace(1)* nocapture %out, float
 ; SI-LABEL: {{^}}test_isfinite_not_pattern_4:
 ; SI-NOT: v_cmp_class_f32
 ; SI: s_endpgm
-define void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 {
+define amdgpu_kernel void @test_isfinite_not_pattern_4(i32 addrspace(1)* nocapture %out, float %x) #0 {
   %ord = fcmp ord float %x, 0.000000e+00
   %x.fabs = tail call float @llvm.fabs.f32(float %x) #1
   %ninf = fcmp one float %x.fabs, 0x7FF0000000000000
diff --git a/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/test/CodeGen/AMDGPU/fp16_to_fp32.ll
index 01bc53ff35a557fe3bd63d351d6aa4d0013ca3f5..ce041364b76db1baf63954d5b1c3e8071a4366c0 100644
--- a/test/CodeGen/AMDGPU/fp16_to_fp32.ll
+++ b/test/CodeGen/AMDGPU/fp16_to_fp32.ll
@@ -14,7 +14,7 @@ declare float @llvm.convert.from.fp16.f32(i16) nounwind readnone
 ; CM: MEM_RAT_CACHELESS STORE_DWORD [[RES:T[0-9]+\.[XYZW]]]
 ; EGCM: VTX_READ_16 [[VAL:T[0-9]+\.[XYZW]]]
 ; EGCM: FLT16_TO_FLT32{{[ *]*}}[[RES]], [[VAL]]
-define void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_convert_fp16_to_fp32(float addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
   %val = load i16, i16 addrspace(1)* %in, align 2
   %cvt = call float @llvm.convert.from.fp16.f32(i16 %val) nounwind readnone
   store float %cvt, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/test/CodeGen/AMDGPU/fp16_to_fp64.ll
index a9f493bf0ccd6fa4190271786ec56a571c5e6fff..70f0c0c1afdb7dd0389580a1cd57d2c4a263cf78 100644
--- a/test/CodeGen/AMDGPU/fp16_to_fp64.ll
+++ b/test/CodeGen/AMDGPU/fp16_to_fp64.ll
@@ -8,7 +8,7 @@ declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone
 ; GCN: v_cvt_f32_f16_e32 [[RESULT32:v[0-9]+]], [[VAL]]
 ; GCN: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[RESULT32]]
 ; GCN: buffer_store_dwordx2 [[RESULT]]
-define void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_convert_fp16_to_fp64(double addrspace(1)* noalias %out, i16 addrspace(1)* noalias %in) nounwind {
   %val = load i16, i16 addrspace(1)* %in, align 2
   %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone
   store double %cvt, double addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/test/CodeGen/AMDGPU/fp32_to_fp16.ll
index 3e426e3e94b157ae5cebe0910b41ac377f4604c9..2c6b1cb18f7e64e414e82c9509e5ec0d38acc92b 100644
--- a/test/CodeGen/AMDGPU/fp32_to_fp16.ll
+++ b/test/CodeGen/AMDGPU/fp32_to_fp16.ll
@@ -12,7 +12,7 @@ declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone
 ; EG: MEM_RAT MSKOR
 ; EG: VTX_READ_32
 ; EG: FLT32_TO_FLT16
-define void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_convert_fp32_to_fp16(i16 addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %val = load float, float addrspace(1)* %in, align 4
   %cvt = call i16 @llvm.convert.to.fp16.f32(float %val) nounwind readnone
   store i16 %cvt, i16 addrspace(1)* %out, align 2
diff --git a/test/CodeGen/AMDGPU/fp_to_sint.f64.ll b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
index 1537d67cadcc685fabf6e2910f7f54ce5cd4cf4f..a7cddd09b762864325690b1055ae7efe3b3484eb 100644
--- a/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
+++ b/test/CodeGen/AMDGPU/fp_to_sint.f64.ll
@@ -6,7 +6,7 @@ declare double @llvm.fabs.f64(double) #1
 
 ; FUNC-LABEL: @fp_to_sint_f64_i32
 ; SI: v_cvt_i32_f64_e32
-define void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) {
   %result = fptosi double %in to i32
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -15,7 +15,7 @@ define void @fp_to_sint_f64_i32(i32 addrspace(1)* %out, double %in) {
 ; FUNC-LABEL: @fp_to_sint_v2f64_v2i32
 ; SI: v_cvt_i32_f64_e32
 ; SI: v_cvt_i32_f64_e32
-define void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %in) {
   %result = fptosi <2 x double> %in to <2 x i32>
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
@@ -26,7 +26,7 @@ define void @fp_to_sint_v2f64_v2i32(<2 x i32> addrspace(1)* %out, <2 x double> %
 ; SI: v_cvt_i32_f64_e32
 ; SI: v_cvt_i32_f64_e32
 ; SI: v_cvt_i32_f64_e32
-define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %in) {
   %result = fptosi <4 x double> %in to <4 x i32>
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
@@ -47,7 +47,7 @@ define void @fp_to_sint_v4f64_v4i32(<4 x i32> addrspace(1)* %out, <4 x double> %
 ; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]]
 ; CI-DAG: v_cvt_i32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]]
 ; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
   %val = load double, double addrspace(1)* %gep, align 8
@@ -58,7 +58,7 @@ define void @fp_to_sint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in
 
 ; FUNC-LABEL: {{^}}fp_to_sint_f64_to_i1:
 ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{\[[0-9]+:[0-9]+\]}}
-define void @fp_to_sint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
+define amdgpu_kernel void @fp_to_sint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
   %conv = fptosi double %in to i1
   store i1 %conv, i1 addrspace(1)* %out
   ret void
@@ -66,7 +66,7 @@ define void @fp_to_sint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
 
 ; FUNC-LABEL: {{^}}fp_to_sint_fabs_f64_to_i1:
 ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, |s{{\[[0-9]+:[0-9]+\]}}|
-define void @fp_to_sint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
+define amdgpu_kernel void @fp_to_sint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
   %in.fabs = call double @llvm.fabs.f64(double %in)
   %conv = fptosi double %in.fabs to i1
   store i1 %conv, i1 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fp_to_sint.ll b/test/CodeGen/AMDGPU/fp_to_sint.ll
index a2fa7a1907450478810013014c9332049630211b..630a7186e10116c8eb3b63e62e889406d9a0d8ff 100644
--- a/test/CodeGen/AMDGPU/fp_to_sint.ll
+++ b/test/CodeGen/AMDGPU/fp_to_sint.ll
@@ -8,7 +8,7 @@ declare float @llvm.fabs.f32(float) #1
 ; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 ; SI: v_cvt_i32_f32_e32
 ; SI: s_endpgm
-define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) {
   %conv = fptosi float %in to i32
   store i32 %conv, i32 addrspace(1)* %out
   ret void
@@ -16,7 +16,7 @@ define void @fp_to_sint_i32(i32 addrspace(1)* %out, float %in) {
 
 ; FUNC-LABEL: {{^}}fp_to_sint_i32_fabs:
 ; SI: v_cvt_i32_f32_e64 v{{[0-9]+}}, |s{{[0-9]+}}|{{$}}
-define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) {
   %in.fabs = call float @llvm.fabs.f32(float %in)
   %conv = fptosi float %in.fabs to i32
   store i32 %conv, i32 addrspace(1)* %out
@@ -28,7 +28,7 @@ define void @fp_to_sint_i32_fabs(i32 addrspace(1)* %out, float %in) {
 ; EG: FLT_TO_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW]}}
 ; SI: v_cvt_i32_f32_e32
 ; SI: v_cvt_i32_f32_e32
-define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
   %result = fptosi <2 x float> %in to <2 x i32>
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
@@ -43,7 +43,7 @@ define void @fp_to_sint_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
 ; SI: v_cvt_i32_f32_e32
 ; SI: v_cvt_i32_f32_e32
 ; SI: v_cvt_i32_f32_e32
-define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %value = load <4 x float>, <4 x float> addrspace(1) * %in
   %result = fptosi <4 x float> %value to <4 x i32>
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
@@ -76,7 +76,7 @@ define void @fp_to_sint_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspac
 
 ; Check that the compiler doesn't crash with a "cannot select" error
 ; SI: s_endpgm
-define void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fp_to_sint_i64 (i64 addrspace(1)* %out, float %in) {
 entry:
   %0 = fptosi float %in to i64
   store i64 %0, i64 addrspace(1)* %out
@@ -128,7 +128,7 @@ entry:
 ; EG-DAG: CNDE_INT
 
 ; SI: s_endpgm
-define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
+define amdgpu_kernel void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
   %conv = fptosi <2 x float> %x to <2 x i64>
   store <2 x i64> %conv, <2 x i64> addrspace(1)* %out
   ret void
@@ -221,7 +221,7 @@ define void @fp_to_sint_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
 ; EG-DAG: CNDE_INT
 
 ; SI: s_endpgm
-define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
+define amdgpu_kernel void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
   %conv = fptosi <4 x float> %x to <4 x i64>
   store <4 x i64> %conv, <4 x i64> addrspace(1)* %out
   ret void
@@ -233,7 +233,7 @@ define void @fp_to_sint_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
 ; EG: AND_INT
 ; EG: SETE_DX10 {{[*]?}} T{{[0-9]+}}.{{[XYZW]}}, KC0[2].Z, literal.y,
 ; EG-NEXT: -1082130432(-1.000000e+00)
-define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
   %conv = fptosi float %in to i1
   store i1 %conv, i1 addrspace(1)* %out
   ret void
@@ -241,7 +241,7 @@ define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
 
 ; FUNC-LABEL: {{^}}fp_to_uint_fabs_f32_to_i1:
 ; SI: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, |s{{[0-9]+}}|
-define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
   %in.fabs = call float @llvm.fabs.f32(float %in)
   %conv = fptosi float %in.fabs to i1
   store i1 %conv, i1 addrspace(1)* %out
@@ -251,7 +251,7 @@ define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
 ; FUNC-LABEL: {{^}}fp_to_sint_f32_i16:
 ; GCN: v_cvt_i32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
 ; GCN: buffer_store_short [[VAL]]
-define void @fp_to_sint_f32_i16(i16 addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @fp_to_sint_f32_i16(i16 addrspace(1)* %out, float %in) #0 {
   %sint = fptosi float %in to i16
   store i16 %sint, i16 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/fp_to_uint.f64.ll b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
index d5bc416434df89cca28be77e2dbc305b254d6e3b..4f597eb3f32c36aad3bb2ef65995c9bd21126075 100644
--- a/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
+++ b/test/CodeGen/AMDGPU/fp_to_uint.f64.ll
@@ -6,7 +6,7 @@ declare double @llvm.fabs.f64(double) #1
 
 ; SI-LABEL: {{^}}fp_to_uint_i32_f64:
 ; SI: v_cvt_u32_f64_e32
-define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) {
   %cast = fptoui double %in to i32
   store i32 %cast, i32 addrspace(1)* %out, align 4
   ret void
@@ -15,7 +15,7 @@ define void @fp_to_uint_i32_f64(i32 addrspace(1)* %out, double %in) {
 ; SI-LABEL: @fp_to_uint_v2i32_v2f64
 ; SI: v_cvt_u32_f64_e32
 ; SI: v_cvt_u32_f64_e32
-define void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %in) {
   %cast = fptoui <2 x double> %in to <2 x i32>
   store <2 x i32> %cast, <2 x i32> addrspace(1)* %out, align 8
   ret void
@@ -26,7 +26,7 @@ define void @fp_to_uint_v2i32_v2f64(<2 x i32> addrspace(1)* %out, <2 x double> %
 ; SI: v_cvt_u32_f64_e32
 ; SI: v_cvt_u32_f64_e32
 ; SI: v_cvt_u32_f64_e32
-define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %in) {
   %cast = fptoui <4 x double> %in to <4 x i32>
   store <4 x i32> %cast, <4 x i32> addrspace(1)* %out, align 8
   ret void
@@ -47,7 +47,7 @@ define void @fp_to_uint_v4i32_v4f64(<4 x i32> addrspace(1)* %out, <4 x double> %
 ; CI-DAG: v_cvt_u32_f64_e32 v[[LO:[0-9]+]], [[FMA]]
 ; CI-DAG: v_cvt_u32_f64_e32 v[[HI:[0-9]+]], [[FLOOR]]
 ; CI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
   %val = load double, double addrspace(1)* %gep, align 8
@@ -57,14 +57,14 @@ define void @fp_to_uint_i64_f64(i64 addrspace(1)* %out, double addrspace(1)* %in
 }
 
 ; SI-LABEL: @fp_to_uint_v2i64_v2f64
-define void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fp_to_uint_v2i64_v2f64(<2 x i64> addrspace(1)* %out, <2 x double> %in) {
   %cast = fptoui <2 x double> %in to <2 x i64>
   store <2 x i64> %cast, <2 x i64> addrspace(1)* %out, align 16
   ret void
 }
 
 ; SI-LABEL: @fp_to_uint_v4i64_v4f64
-define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %in) {
   %cast = fptoui <4 x double> %in to <4 x i64>
   store <4 x i64> %cast, <4 x i64> addrspace(1)* %out, align 32
   ret void
@@ -72,7 +72,7 @@ define void @fp_to_uint_v4i64_v4f64(<4 x i64> addrspace(1)* %out, <4 x double> %
 
 ; FUNC-LABEL: {{^}}fp_to_uint_f64_to_i1:
 ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{\[[0-9]+:[0-9]+\]}}
-define void @fp_to_uint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
+define amdgpu_kernel void @fp_to_uint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
   %conv = fptoui double %in to i1
   store i1 %conv, i1 addrspace(1)* %out
   ret void
@@ -80,7 +80,7 @@ define void @fp_to_uint_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
 
 ; FUNC-LABEL: {{^}}fp_to_uint_fabs_f64_to_i1:
 ; SI: v_cmp_eq_f64_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, |s{{\[[0-9]+:[0-9]+\]}}|
-define void @fp_to_uint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
+define amdgpu_kernel void @fp_to_uint_fabs_f64_to_i1(i1 addrspace(1)* %out, double %in) #0 {
   %in.fabs = call double @llvm.fabs.f64(double %in)
   %conv = fptoui double %in.fabs to i1
   store i1 %conv, i1 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fp_to_uint.ll b/test/CodeGen/AMDGPU/fp_to_uint.ll
index cbff9f22b0735fff010a8ae8a008bc0dba8ec771..fdb15801dc4e3e1dcee6524df983e1b7ef9cd67d 100644
--- a/test/CodeGen/AMDGPU/fp_to_uint.ll
+++ b/test/CodeGen/AMDGPU/fp_to_uint.ll
@@ -9,7 +9,7 @@ declare float @llvm.fabs.f32(float) #1
 
 ; GCN: v_cvt_u32_f32_e32
 ; GCN: s_endpgm
-define void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) {
   %conv = fptoui float %in to i32
   store i32 %conv, i32 addrspace(1)* %out
   ret void
@@ -21,7 +21,7 @@ define void @fp_to_uint_f32_to_i32 (i32 addrspace(1)* %out, float %in) {
 
 ; GCN: v_cvt_u32_f32_e32
 ; GCN: v_cvt_u32_f32_e32
-define void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float> %in) {
   %result = fptoui <2 x float> %in to <2 x i32>
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
@@ -37,7 +37,7 @@ define void @fp_to_uint_v2f32_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x float>
 ; GCN: v_cvt_u32_f32_e32
 ; GCN: v_cvt_u32_f32_e32
 
-define void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %value = load <4 x float>, <4 x float> addrspace(1) * %in
   %result = fptoui <4 x float> %value to <4 x i32>
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
@@ -68,7 +68,7 @@ define void @fp_to_uint_v4f32_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x float>
 ; EG-DAG: CNDE_INT
 
 ; GCN: s_endpgm
-define void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) {
   %conv = fptoui float %x to i64
   store i64 %conv, i64 addrspace(1)* %out
   ret void
@@ -119,7 +119,7 @@ define void @fp_to_uint_f32_to_i64(i64 addrspace(1)* %out, float %x) {
 ; EG-DAG: CNDE_INT
 
 ; GCN: s_endpgm
-define void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
+define amdgpu_kernel void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float> %x) {
   %conv = fptoui <2 x float> %x to <2 x i64>
   store <2 x i64> %conv, <2 x i64> addrspace(1)* %out
   ret void
@@ -212,7 +212,7 @@ define void @fp_to_uint_v2f32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x float>
 ; EG-DAG: CNDE_INT
 
 ; GCN: s_endpgm
-define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
+define amdgpu_kernel void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float> %x) {
   %conv = fptoui <4 x float> %x to <4 x i64>
   store <4 x i64> %conv, <4 x i64> addrspace(1)* %out
   ret void
@@ -224,7 +224,7 @@ define void @fp_to_uint_v4f32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x float>
 
 ; EG: AND_INT
 ; EG: SETE_DX10 {{[*]?}} T{{[0-9]+}}.{{[XYZW]}}, KC0[2].Z, 1.0,
-define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
   %conv = fptoui float %in to i1
   store i1 %conv, i1 addrspace(1)* %out
   ret void
@@ -232,7 +232,7 @@ define void @fp_to_uint_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
 
 ; FUNC-LABEL: {{^}}fp_to_uint_fabs_f32_to_i1:
 ; GCN: v_cmp_eq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, |s{{[0-9]+}}|
-define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
   %in.fabs = call float @llvm.fabs.f32(float %in)
   %conv = fptoui float %in.fabs to i1
   store i1 %conv, i1 addrspace(1)* %out
@@ -246,7 +246,7 @@ define void @fp_to_uint_fabs_f32_to_i1(i1 addrspace(1)* %out, float %in) #0 {
 ; SI: v_cvt_u32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
 ; VI: v_cvt_i32_f32_e32 [[VAL:v[0-9]+]], s{{[0-9]+}}
 ; GCN: buffer_store_short [[VAL]]
-define void @fp_to_uint_f32_to_i16(i16 addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @fp_to_uint_f32_to_i16(i16 addrspace(1)* %out, float %in) #0 {
   %uint = fptoui float %in to i16
   store i16 %uint, i16 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/fpext.f16.ll b/test/CodeGen/AMDGPU/fpext.f16.ll
index c4f5d7cdfb5d5a47f96a6d3577f7dd3180d154d4..03657176c383bdd1c134b0276964f9729b5fdd3c 100644
--- a/test/CodeGen/AMDGPU/fpext.f16.ll
+++ b/test/CodeGen/AMDGPU/fpext.f16.ll
@@ -1,14 +1,15 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI -check-prefix=SIGFX9 %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=SIGFX9 %s
 
 ; GCN-LABEL: {{^}}fpext_f16_to_f32
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
 ; GCN: v_cvt_f32_f16_e32 v[[R_F32:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_dword v[[R_F32]]
 ; GCN: s_endpgm
-define void @fpext_f16_to_f32(
+define amdgpu_kernel void @fpext_f16_to_f32(
     float addrspace(1)* %r,
-    half addrspace(1)* %a) {
+    half addrspace(1)* %a) #0 {
 entry:
   %a.val = load half, half addrspace(1)* %a
   %r.val = fpext half %a.val to float
@@ -22,9 +23,9 @@ entry:
 ; GCN: v_cvt_f64_f32_e32 v{{\[}}[[R_F64_0:[0-9]+]]:[[R_F64_1:[0-9]+]]{{\]}}, v[[A_F32]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_F64_0]]:[[R_F64_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fpext_f16_to_f64(
+define amdgpu_kernel void @fpext_f16_to_f64(
     double addrspace(1)* %r,
-    half addrspace(1)* %a) {
+    half addrspace(1)* %a) #0 {
 entry:
   %a.val = load half, half addrspace(1)* %a
   %r.val = fpext half %a.val to double
@@ -34,15 +35,17 @@ entry:
 
 ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f32
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; VI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]]
+; GFX9-DAG:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; GCN-DAG: v_cvt_f32_f16_e32 v[[R_F32_0:[0-9]+]], v[[A_V2_F16]]
 ; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]]
+; SIGFX9: v_cvt_f32_f16_e32 v[[R_F32_1:[0-9]+]], v[[A_F16_1]]
+; VI: v_cvt_f32_f16_sdwa v[[R_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GCN: buffer_store_dwordx2 v{{\[}}[[R_F32_0]]:[[R_F32_1]]{{\]}}
 ; GCN: s_endpgm
-define void @fpext_v2f16_to_v2f32(
+
+define amdgpu_kernel void @fpext_v2f16_to_v2f32(
     <2 x float> addrspace(1)* %r,
-    <2 x half> addrspace(1)* %a) {
+    <2 x half> addrspace(1)* %a) #0 {
 entry:
   %a.val = load <2 x half>, <2 x half> addrspace(1)* %a
   %r.val = fpext <2 x half> %a.val to <2 x float>
@@ -51,15 +54,18 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}fpext_v2f16_to_v2f64
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_cvt_f64_f32_e32 v{{\[}}{{[0-9]+}}:[[R_F64_3:[0-9]+]]{{\]}}, v[[A_F32_1]]
-; GCN: v_cvt_f64_f32_e32 v{{\[}}[[R_F64_0:[0-9]+]]:{{[0-9]+}}{{\]}}, v[[A_F32_0]]
-; GCN: buffer_store_dwordx4 v{{\[}}[[R_F64_0]]:[[R_F64_3]]{{\]}}
+; GCN: buffer_load_dword
+; SIGFX9-DAG: v_lshrrev_b32_e32
+; SIGFX9-DAG: v_cvt_f32_f16_e32
+; VI: v_cvt_f32_f16_sdwa
+; GCN: v_cvt_f32_f16_e32
+
+; GCN: v_cvt_f64_f32_e32
+; GCN: v_cvt_f64_f32_e32
+; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-define void @fpext_v2f16_to_v2f64(
+
+define amdgpu_kernel void @fpext_v2f16_to_v2f64(
     <2 x double> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
@@ -68,3 +74,202 @@ entry:
   store <2 x double> %r.val, <2 x double> addrspace(1)* %r
   ret void
 }
+
+; GCN-LABEL: {{^}}s_fneg_fpext_f16_to_f32:
+; GCN: v_cvt_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(float addrspace(1)* %r, i32 %a) {
+entry:
+  %a.trunc = trunc i32 %a to i16
+  %a.val = bitcast i16 %a.trunc to half
+  %r.val = fpext half %a.val to float
+  store float %r.val, float addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -[[A]]
+define amdgpu_kernel void @fneg_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.neg = fsub half -0.0, %a.val
+  %r.val = fpext half %a.neg to float
+  store float %r.val, float addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, |[[A]]|
+define amdgpu_kernel void @fabs_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.fabs = call half @llvm.fabs.f16(half %a.val)
+  %r.val = fpext half %a.fabs to float
+  store float %r.val, float addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_fabs_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN: v_cvt_f32_f16_e64 v{{[0-9]+}}, -|[[A]]|
+define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.fabs = call half @llvm.fabs.f16(half %a.val)
+  %a.fneg.fabs = fsub half -0.0, %a.fabs
+  %r.val = fpext half %a.fneg.fabs to float
+  store float %r.val, float addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_multi_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN-DAG: v_xor_b32_e32 [[XOR:v[0-9]+]], 0x8000, [[A]]
+
+; FIXME: Using the source modifier here only wastes code size
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
+; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -[[A]]
+
+; GCN: store_dword [[CVT]]
+; GCN: store_short [[XOR]]
+define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.neg = fsub half -0.0, %a.val
+  %r.val = fpext half %a.neg to float
+  store volatile float %r.val, float addrspace(1)* %r
+  store volatile half %a.neg, half addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_multi_foldable_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN-DAG: v_cvt_f32_f16_e64 [[CVTA_NEG:v[0-9]+]], -[[A]]
+; SI-DAG: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]]
+; SI: v_mul_f32_e32 [[MUL_F32:v[0-9]+]], [[CVTA]], [[CVTA_NEG]]
+; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
+
+; GFX89-DAG: v_cvt_f32_f16_e64 [[CVT_NEGA:v[0-9]+]], -[[A]]
+; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], -[[A]], [[A]]
+
+; GCN: buffer_store_dword [[CVTA_NEG]]
+; GCN: buffer_store_short [[MUL]]
+define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.neg = fsub half -0.0, %a.val
+  %r.val = fpext half %a.neg to float
+  %mul = fmul half %a.neg, %a.val
+  store volatile float %r.val, float addrspace(1)* %r
+  store volatile half %mul, half addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_multi_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN-DAG: v_and_b32_e32 [[XOR:v[0-9]+]], 0x7fff, [[A]]
+
+; SI-DAG: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[A]]
+; VI-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], |[[A]]|
+
+; GCN: store_dword [[CVT]]
+; GCN: store_short [[XOR]]
+define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.fabs = call half @llvm.fabs.f16(half %a.val)
+  %r.val = fpext half %a.fabs to float
+  store volatile float %r.val, float addrspace(1)* %r
+  store volatile half %a.fabs, half addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_multi_foldable_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; SI: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]]
+; SI: v_mul_f32_e64 [[MUL_F32:v[0-9]+]], |[[CVTA]]|, [[CVTA]]
+; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
+; SI: v_and_b32_e32 [[ABS_A:v[0-9]+]], 0x7fffffff, [[CVTA]]
+
+; GFX89-DAG: v_cvt_f32_f16_e64 [[ABS_A:v[0-9]+]], |[[A]]|
+; GFX89: v_mul_f16_e64 [[MUL:v[0-9]+]], |[[A]]|, [[A]]
+
+; GCN: buffer_store_dword [[ABS_A]]
+; GCN: buffer_store_short [[MUL]]
+define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.fabs = call half @llvm.fabs.f16(half %a.val)
+  %r.val = fpext half %a.fabs to float
+  %mul = fmul half %a.fabs, %a.val
+  store volatile float %r.val, float addrspace(1)* %r
+  store volatile half %mul, half addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_fneg_multi_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; GCN-DAG: v_or_b32_e32 [[OR:v[0-9]+]], 0x8000, [[A]]
+
+; SI: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[OR]]
+; VI-DAG: v_cvt_f32_f16_e64 [[CVT:v[0-9]+]], -|[[OR]]|
+
+; GCN: buffer_store_dword [[CVT]]
+; GCN: buffer_store_short [[OR]]
+define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.fabs = call half @llvm.fabs.f16(half %a.val)
+  %a.fneg.fabs = fsub half -0.0, %a.fabs
+  %r.val = fpext half %a.fneg.fabs to float
+  store volatile float %r.val, float addrspace(1)* %r
+  store volatile half %a.fneg.fabs, half addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_fneg_multi_foldable_use_fpext_f16_to_f32:
+; GCN: {{buffer|flat}}_load_ushort [[A:v[0-9]+]]
+; SI: v_cvt_f32_f16_e32 [[CVTA:v[0-9]+]], [[A]]
+; SI: v_mul_f32_e64 [[MUL_F32:v[0-9]+]], -|[[CVTA]]|, [[CVTA]]
+; SI: v_cvt_f16_f32_e32 [[MUL:v[0-9]+]], [[MUL_F32]]
+; SI: v_or_b32_e32 [[FABS_FNEG:v[0-9]+]], 0x80000000, [[CVTA]]
+
+; GFX89-DAG: v_cvt_f32_f16_e64 [[FABS_FNEG:v[0-9]+]], -|[[A]]|
+; GFX89-DAG: v_mul_f16_e64 [[MUL:v[0-9]+]], -|[[A]]|, [[A]]
+
+; GCN: buffer_store_dword [[FABS_FNEG]]
+; GCN: buffer_store_short [[MUL]]
+define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32(
+    float addrspace(1)* %r,
+    half addrspace(1)* %a) {
+entry:
+  %a.val = load half, half addrspace(1)* %a
+  %a.fabs = call half @llvm.fabs.f16(half %a.val)
+  %a.fneg.fabs = fsub half -0.0, %a.fabs
+  %r.val = fpext half %a.fneg.fabs to float
+  %mul = fmul half %a.fneg.fabs, %a.val
+  store volatile float %r.val, float addrspace(1)* %r
+  store volatile half %mul, half addrspace(1)* undef
+  ret void
+}
+
+declare half @llvm.fabs.f16(half) #1
+
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fpext.ll b/test/CodeGen/AMDGPU/fpext.ll
index 6dc84b01d734d2f20b9ccebcd91967cbad176e86..b11e2ea056c3349022d7f76bff4ac66c580e2e9e 100644
--- a/test/CodeGen/AMDGPU/fpext.ll
+++ b/test/CodeGen/AMDGPU/fpext.ll
@@ -3,7 +3,7 @@
 
 ; FUNC-LABEL: {{^}}fpext_f32_to_f64:
 ; SI: v_cvt_f64_f32_e32 {{v\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-define void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) {
   %result = fpext float %in to double
   store double %result, double addrspace(1)* %out
   ret void
@@ -12,7 +12,7 @@ define void @fpext_f32_to_f64(double addrspace(1)* %out, float %in) {
 ; FUNC-LABEL: {{^}}fpext_v2f32_to_v2f64:
 ; SI: v_cvt_f64_f32_e32
 ; SI: v_cvt_f64_f32_e32
-define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %in) {
   %result = fpext <2 x float> %in to <2 x double>
   store <2 x double> %result, <2 x double> addrspace(1)* %out
   ret void
@@ -22,7 +22,7 @@ define void @fpext_v2f32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x float> %
 ; SI: v_cvt_f64_f32_e32
 ; SI: v_cvt_f64_f32_e32
 ; SI: v_cvt_f64_f32_e32
-define void @fpext_v3f32_to_v3f64(<3 x double> addrspace(1)* %out, <3 x float> %in) {
+define amdgpu_kernel void @fpext_v3f32_to_v3f64(<3 x double> addrspace(1)* %out, <3 x float> %in) {
   %result = fpext <3 x float> %in to <3 x double>
   store <3 x double> %result, <3 x double> addrspace(1)* %out
   ret void
@@ -33,7 +33,7 @@ define void @fpext_v3f32_to_v3f64(<3 x double> addrspace(1)* %out, <3 x float> %
 ; SI: v_cvt_f64_f32_e32
 ; SI: v_cvt_f64_f32_e32
 ; SI: v_cvt_f64_f32_e32
-define void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %in) {
   %result = fpext <4 x float> %in to <4 x double>
   store <4 x double> %result, <4 x double> addrspace(1)* %out
   ret void
@@ -48,7 +48,7 @@ define void @fpext_v4f32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x float> %
 ; SI: v_cvt_f64_f32_e32
 ; SI: v_cvt_f64_f32_e32
 ; SI: v_cvt_f64_f32_e32
-define void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) {
+define amdgpu_kernel void @fpext_v8f32_to_v8f64(<8 x double> addrspace(1)* %out, <8 x float> %in) {
   %result = fpext <8 x float> %in to <8 x double>
   store <8 x double> %result, <8 x double> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/fptosi.f16.ll b/test/CodeGen/AMDGPU/fptosi.f16.ll
index 71f56d730e967eafa14a49eda4c3005588f7865d..50e56e08416ad4813a18d893e8a641b34dc2e16e 100644
--- a/test/CodeGen/AMDGPU/fptosi.f16.ll
+++ b/test/CodeGen/AMDGPU/fptosi.f16.ll
@@ -7,7 +7,7 @@
 ; GCN: v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_short v[[R_I16]]
 ; GCN: s_endpgm
-define void @fptosi_f16_to_i16(
+define amdgpu_kernel void @fptosi_f16_to_i16(
     i16 addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -23,7 +23,7 @@ entry:
 ; GCN: v_cvt_i32_f32_e32 v[[R_I32:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fptosi_f16_to_i32(
+define amdgpu_kernel void @fptosi_f16_to_i32(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -40,7 +40,7 @@ entry:
 ; GCN: buffer_load_ushort
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: s_endpgm
-define void @fptosi_f16_to_i64(
+define amdgpu_kernel void @fptosi_f16_to_i64(
     i64 addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -52,17 +52,26 @@ entry:
 
 ; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i16
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; GCN: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
-; GCN: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
-; GCN: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]]
+
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
+; SI: v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
+; SI: v_and_b32_e32 v[[R_I16_LO:[0-9]+]], 0xffff, v[[R_I16_0]]
+; SI: v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
+; SI: v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_LO]]
+
+; VI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; VI: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI: v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
+; VI: v_cvt_i32_f32_sdwa v[[R_I16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI: v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_1]], v[[R_I16_0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
 ; GCN: buffer_store_dword v[[R_V2_I16]]
 ; GCN: s_endpgm
-define void @fptosi_v2f16_to_v2i16(
+
+define amdgpu_kernel void @fptosi_v2f16_to_v2i16(
     <2 x i16> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
@@ -75,12 +84,13 @@ entry:
 ; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i32
 ; GCN: buffer_load_dword
 ; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; VI: v_cvt_f32_f16_sdwa
 ; GCN: v_cvt_i32_f32_e32
 ; GCN: v_cvt_i32_f32_e32
 ; GCN: buffer_store_dwordx2
 ; GCN: s_endpgm
-define void @fptosi_v2f16_to_v2i32(
+define amdgpu_kernel void @fptosi_v2f16_to_v2i32(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
@@ -96,9 +106,10 @@ entry:
 ; GCN-LABEL: {{^}}fptosi_v2f16_to_v2i64
 ; GCN: buffer_load_dword
 ; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; VI: v_cvt_f32_f16_sdwa
 ; GCN: s_endpgm
-define void @fptosi_v2f16_to_v2i64(
+define amdgpu_kernel void @fptosi_v2f16_to_v2i64(
     <2 x i64> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/fptoui.f16.ll b/test/CodeGen/AMDGPU/fptoui.f16.ll
index a6876624a0c6b3e82a5782efd390de087e97be80..2afa6111cf17477aad722a974a632cda17c5052d 100644
--- a/test/CodeGen/AMDGPU/fptoui.f16.ll
+++ b/test/CodeGen/AMDGPU/fptoui.f16.ll
@@ -8,7 +8,7 @@
 ; VI:  v_cvt_i32_f32_e32 v[[R_I16:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_short v[[R_I16]]
 ; GCN: s_endpgm
-define void @fptoui_f16_to_i16(
+define amdgpu_kernel void @fptoui_f16_to_i16(
     i16 addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -24,7 +24,7 @@ entry:
 ; GCN: v_cvt_u32_f32_e32 v[[R_I32:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @fptoui_f16_to_i32(
+define amdgpu_kernel void @fptoui_f16_to_i32(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -41,7 +41,7 @@ entry:
 ; GCN: buffer_load_ushort
 ; GCN: v_cvt_f32_f16_e32
 ; GCN: s_endpgm
-define void @fptoui_f16_to_i64(
+define amdgpu_kernel void @fptoui_f16_to_i64(
     i64 addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -53,18 +53,25 @@ entry:
 
 ; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i16
 ; GCN:     buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN:     v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; GCN-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+
+; SI:     v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
 ; SI:      v_cvt_u32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
 ; SI:      v_cvt_u32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
-; VI:      v_cvt_i32_f32_e32 v[[R_I16_0:[0-9]+]], v[[A_F32_0]]
+; SI:     v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
+; SI:     v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]]
+
+; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_0:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; VI:      v_cvt_i32_f32_e32 v[[R_I16_1:[0-9]+]], v[[A_F32_1]]
-; GCN:     v_lshlrev_b32_e32 v[[R_I16_HI:[0-9]+]], 16, v[[R_I16_1]]
-; GCN:     v_or_b32_e32 v[[R_V2_I16:[0-9]+]], v[[R_I16_HI]], v[[R_I16_0]]
+; VI:      v_cvt_i32_f32_sdwa v[[R_I16_0:[0-9]+]], v[[A_F32_0]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI:     v_or_b32_sdwa v[[R_V2_I16:[0-9]+]], v[[R_I16_0]], v[[R_I16_1]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
 ; GCN:     buffer_store_dword v[[R_V2_I16]]
 ; GCN:     s_endpgm
-define void @fptoui_v2f16_to_v2i16(
+
+define amdgpu_kernel void @fptoui_v2f16_to_v2i16(
     <2 x i16> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
@@ -77,12 +84,13 @@ entry:
 ; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i32
 ; GCN: buffer_load_dword
 ; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; VI: v_cvt_f32_f16_sdwa
 ; GCN: v_cvt_u32_f32_e32
 ; GCN: v_cvt_u32_f32_e32
 ; GCN: buffer_store_dwordx2
 ; GCN: s_endpgm
-define void @fptoui_v2f16_to_v2i32(
+define amdgpu_kernel void @fptoui_v2f16_to_v2i32(
     <2 x i32> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
@@ -98,9 +106,10 @@ entry:
 ; GCN-LABEL: {{^}}fptoui_v2f16_to_v2i64
 ; GCN: buffer_load_dword
 ; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; VI: v_cvt_f32_f16_sdwa
 ; GCN: s_endpgm
-define void @fptoui_v2f16_to_v2i64(
+define amdgpu_kernel void @fptoui_v2f16_to_v2i64(
     <2 x i64> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/fptrunc.f16.ll b/test/CodeGen/AMDGPU/fptrunc.f16.ll
index 284fc53c8240e0e5a20718e2b5f61dd9cc78a92e..cdcc7be8f2f8d3072475b19dad4c43ee0dda0896 100644
--- a/test/CodeGen/AMDGPU/fptrunc.f16.ll
+++ b/test/CodeGen/AMDGPU/fptrunc.f16.ll
@@ -1,12 +1,14 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,+fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-DENORM %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-FLUSH %s
 
-; GCN-LABEL: {{^}}fptrunc_f32_to_f16
+; GCN-LABEL: {{^}}fptrunc_f32_to_f16:
 ; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fptrunc_f32_to_f16(
+define amdgpu_kernel void @fptrunc_f32_to_f16(
     half addrspace(1)* %r,
     float addrspace(1)* %a) {
 entry:
@@ -16,13 +18,13 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}fptrunc_f64_to_f16
+; GCN-LABEL: {{^}}fptrunc_f64_to_f16:
 ; GCN: buffer_load_dwordx2 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_1:[0-9]+]]{{\]}}
 ; GCN: v_cvt_f32_f64_e32 v[[A_F32:[0-9]+]], v{{\[}}[[A_F64_0]]:[[A_F64_1]]{{\]}}
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fptrunc_f64_to_f16(
+define amdgpu_kernel void @fptrunc_f64_to_f16(
     half addrspace(1)* %r,
     double addrspace(1)* %a) {
 entry:
@@ -32,16 +34,26 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}fptrunc_v2f32_to_v2f16
+; GCN-LABEL: {{^}}fptrunc_v2f32_to_v2f16:
 ; GCN:     buffer_load_dwordx2 v{{\[}}[[A_F32_0:[0-9]+]]:[[A_F32_1:[0-9]+]]{{\]}}
 ; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
-; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
-; GCN-DAG: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
+; SI-DAG:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI:      v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
+; GFX9-DAG:   v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
+; GFX9-FLUSH: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
+; GFX9-FLUSH: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]
+
+; GFX9-DENORM: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
+
 ; GCN:     buffer_store_dword v[[R_V2_F16]]
 ; GCN:     s_endpgm
-define void @fptrunc_v2f32_to_v2f16(
+
+define amdgpu_kernel void @fptrunc_v2f32_to_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x float> addrspace(1)* %a) {
 entry:
@@ -51,17 +63,25 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}fptrunc_v2f64_to_v2f16
+; GCN-LABEL: {{^}}fptrunc_v2f64_to_v2f16:
 ; GCN: buffer_load_dwordx4 v{{\[}}[[A_F64_0:[0-9]+]]:[[A_F64_3:[0-9]+]]{{\]}}
-; GCN: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v{{\[}}[[A_F64_0]]:{{[0-9]+}}{{\]}}
-; GCN: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}}
-; GCN: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
-; GCN: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_0:[0-9]+]], v{{\[}}[[A_F64_0]]:{{[0-9]+}}{{\]}}
+; GCN-DAG: v_cvt_f32_f64_e32 v[[A_F32_1:[0-9]+]], v{{\[}}{{[0-9]+}}:[[A_F64_3]]{{\]}}
+; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[A_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
+; VI: v_cvt_f16_f32_sdwa v[[R_F16_HI:[0-9]+]], v[[A_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SIVI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; GFX9-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[A_F32_1]]
+; GFX9-FLUSH: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
+; GFX9-FLUSH: v_lshl_or_b32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], 16, v[[R_F16_LO]]
+
+; GFX9-DENORM: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
+
 ; GCN: buffer_store_dword v[[R_V2_F16]]
-define void @fptrunc_v2f64_to_v2f16(
+
+define amdgpu_kernel void @fptrunc_v2f64_to_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x double> addrspace(1)* %a) {
 entry:
@@ -70,3 +90,109 @@ entry:
   store <2 x half> %r.val, <2 x half> addrspace(1)* %r
   ret void
 }
+
+; GCN-LABEL: {{^}}fneg_fptrunc_f32_to_f16:
+; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
+; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -v[[A_F32]]
+; GCN: buffer_store_short v[[R_F16]]
+; GCN: s_endpgm
+define amdgpu_kernel void @fneg_fptrunc_f32_to_f16(
+    half addrspace(1)* %r,
+    float addrspace(1)* %a) {
+entry:
+  %a.val = load float, float addrspace(1)* %a
+  %a.fneg = fsub float -0.0, %a.val
+  %r.val = fptrunc float %a.fneg to half
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fabs_fptrunc_f32_to_f16:
+; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
+; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]|
+; GCN: buffer_store_short v[[R_F16]]
+; GCN: s_endpgm
+define amdgpu_kernel void @fabs_fptrunc_f32_to_f16(
+    half addrspace(1)* %r,
+    float addrspace(1)* %a) {
+entry:
+  %a.val = load float, float addrspace(1)* %a
+  %a.fabs = call float @llvm.fabs.f32(float %a.val)
+  %r.val = fptrunc float %a.fabs to half
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fneg_fabs_fptrunc_f32_to_f16:
+; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
+; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], -|v[[A_F32]]|
+; GCN: buffer_store_short v[[R_F16]]
+; GCN: s_endpgm
+define amdgpu_kernel void @fneg_fabs_fptrunc_f32_to_f16(
+    half addrspace(1)* %r,
+    float addrspace(1)* %a) #0 {
+entry:
+  %a.val = load float, float addrspace(1)* %a
+  %a.fabs = call float @llvm.fabs.f32(float %a.val)
+  %a.fneg.fabs = fsub float -0.0, %a.fabs
+  %r.val = fptrunc float %a.fneg.fabs to half
+  store half %r.val, half addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fptrunc_f32_to_f16_zext_i32:
+; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
+; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
+; GCN-NOT: v[[R_F16]]
+; GCN: buffer_store_dword v[[R_F16]]
+define amdgpu_kernel void @fptrunc_f32_to_f16_zext_i32(
+    i32 addrspace(1)* %r,
+    float addrspace(1)* %a) #0 {
+entry:
+  %a.val = load float, float addrspace(1)* %a
+  %r.val = fptrunc float %a.val to half
+  %r.i16 = bitcast half %r.val to i16
+  %zext = zext i16 %r.i16 to i32
+  store i32 %zext, i32 addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fptrunc_fabs_f32_to_f16_zext_i32:
+; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
+; GCN: v_cvt_f16_f32_e64 v[[R_F16:[0-9]+]], |v[[A_F32]]|
+; GCN-NOT: v[[R_F16]]
+; GCN: buffer_store_dword v[[R_F16]]
+define amdgpu_kernel void @fptrunc_fabs_f32_to_f16_zext_i32(
+    i32 addrspace(1)* %r,
+    float addrspace(1)* %a) #0 {
+entry:
+  %a.val = load float, float addrspace(1)* %a
+  %a.fabs = call float @llvm.fabs.f32(float %a.val)
+  %r.val = fptrunc float %a.fabs to half
+  %r.i16 = bitcast half %r.val to i16
+  %zext = zext i16 %r.i16 to i32
+  store i32 %zext, i32 addrspace(1)* %r
+  ret void
+}
+
+; GCN-LABEL: {{^}}fptrunc_f32_to_f16_sext_i32:
+; GCN: buffer_load_dword v[[A_F32:[0-9]+]]
+; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
+; GCN: v_bfe_i32 v[[R_F16_SEXT:[0-9]+]], v[[R_F16]], 0, 16
+; GCN: buffer_store_dword v[[R_F16_SEXT]]
+define amdgpu_kernel void @fptrunc_f32_to_f16_sext_i32(
+    i32 addrspace(1)* %r,
+    float addrspace(1)* %a) #0 {
+entry:
+  %a.val = load float, float addrspace(1)* %a
+  %r.val = fptrunc float %a.val to half
+  %r.i16 = bitcast half %r.val to i16
+  %sext = sext i16 %r.i16 to i32
+  store i32 %sext, i32 addrspace(1)* %r
+  ret void
+}
+
+declare float @llvm.fabs.f32(float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/fptrunc.ll b/test/CodeGen/AMDGPU/fptrunc.ll
index 0c7b67406a89973420e836b5ba3eadf046b11290..d9c5b7e6f359425ae8bd397549b2a777024ef254 100644
--- a/test/CodeGen/AMDGPU/fptrunc.ll
+++ b/test/CodeGen/AMDGPU/fptrunc.ll
@@ -4,7 +4,7 @@
 
 ; FUNC-LABEL: {{^}}fptrunc_f64_to_f32:
 ; GCN: v_cvt_f32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) {
   %result = fptrunc double %in to float
   store float %result, float addrspace(1)* %out
   ret void
@@ -14,7 +14,7 @@ define void @fptrunc_f64_to_f32(float addrspace(1)* %out, double %in) {
 ; GCN-NOT: v_cvt
 ; GCN-UNSAFE: v_cvt_f32_f64_e32 [[F32:v[0-9]+]]
 ; GCN-UNSAFE: v_cvt_f16_f32_e32 v{{[0-9]+}}, [[F32]]
-define void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) {
   %result = fptrunc double %in to half
   %result_i16 = bitcast half %result to i16
   store i16 %result_i16, i16 addrspace(1)* %out
@@ -24,7 +24,7 @@ define void @fptrunc_f64_to_f16(i16 addrspace(1)* %out, double %in) {
 ; FUNC-LABEL: {{^}}fptrunc_v2f64_to_v2f32:
 ; GCN: v_cvt_f32_f64_e32
 ; GCN: v_cvt_f32_f64_e32
-define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double> %in) {
   %result = fptrunc <2 x double> %in to <2 x float>
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
@@ -35,7 +35,7 @@ define void @fptrunc_v2f64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x double>
 ; GCN: v_cvt_f32_f64_e32
 ; GCN: v_cvt_f32_f64_e32
 ; GCN: v_cvt_f32_f64_e32
-define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double> %in) {
   %result = fptrunc <4 x double> %in to <4 x float>
   store <4 x float> %result, <4 x float> addrspace(1)* %out
   ret void
@@ -50,7 +50,7 @@ define void @fptrunc_v4f64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x double>
 ; GCN: v_cvt_f32_f64_e32
 ; GCN: v_cvt_f32_f64_e32
 ; GCN: v_cvt_f32_f64_e32
-define void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) {
+define amdgpu_kernel void @fptrunc_v8f64_to_v8f32(<8 x float> addrspace(1)* %out, <8 x double> %in) {
   %result = fptrunc <8 x double> %in to <8 x float>
   store <8 x float> %result, <8 x float> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/fract.f64.ll b/test/CodeGen/AMDGPU/fract.f64.ll
index 0651dce8d95c0dbd3c17d1392e375d1fbecf9160..7a5bcfffa3f3b2eb73d7c04051bc9dd5e8d841f4 100644
--- a/test/CodeGen/AMDGPU/fract.f64.ll
+++ b/test/CodeGen/AMDGPU/fract.f64.ll
@@ -27,7 +27,7 @@ declare double @llvm.floor.f64(double) #0
 ; GCN-UNSAFE: v_fract_f64_e32 [[FRACT:v\[[0-9]+:[0-9]+\]]], [[X]]
 
 ; GCN: buffer_store_dwordx2 [[FRACT]]
-define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+define amdgpu_kernel void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
   %x = load double, double addrspace(1)* %src
   %floor.x = call double @llvm.floor.f64(double %x)
   %fract = fsub double %x, %floor.x
@@ -54,7 +54,7 @@ define void @fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1
 ; GCN-UNSAFE: v_fract_f64_e64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -[[X]]
 
 ; GCN: buffer_store_dwordx2 [[FRACT]]
-define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+define amdgpu_kernel void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
   %x = load double, double addrspace(1)* %src
   %neg.x = fsub double -0.0, %x
   %floor.neg.x = call double @llvm.floor.f64(double %neg.x)
@@ -82,7 +82,7 @@ define void @fract_f64_neg(double addrspace(1)* %out, double addrspace(1)* %src)
 ; GCN-UNSAFE: v_fract_f64_e64 [[FRACT:v\[[0-9]+:[0-9]+\]]], -|[[X]]|
 
 ; GCN: buffer_store_dwordx2 [[FRACT]]
-define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+define amdgpu_kernel void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
   %x = load double, double addrspace(1)* %src
   %abs.x = call double @llvm.fabs.f64(double %x)
   %neg.abs.x = fsub double -0.0, %abs.x
@@ -98,7 +98,7 @@ define void @fract_f64_neg_abs(double addrspace(1)* %out, double addrspace(1)* %
 ; VI-UNSAFE-DAG: v_fract_f64_e32 [[FRACT:v\[[0-9]+:[0-9]+\]]], [[X]]
 ; VI-UNSAFE: buffer_store_dwordx2 [[FLOOR]]
 ; VI-UNSAFE: buffer_store_dwordx2 [[FRACT]]
-define void @multi_use_floor_fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
+define amdgpu_kernel void @multi_use_floor_fract_f64(double addrspace(1)* %out, double addrspace(1)* %src) #1 {
   %x = load double, double addrspace(1)* %src
   %floor.x = call double @llvm.floor.f64(double %x)
   %fract = fsub double %x, %floor.x
diff --git a/test/CodeGen/AMDGPU/fract.ll b/test/CodeGen/AMDGPU/fract.ll
index 4e1a503b12987932433c3270697f91518dcf76a1..207fe280c9a69d8e407920f78e1c686bcae6b9e6 100644
--- a/test/CodeGen/AMDGPU/fract.ll
+++ b/test/CodeGen/AMDGPU/fract.ll
@@ -14,7 +14,7 @@ declare float @llvm.floor.f32(float) #0
 ; GCN-UNSAFE: v_fract_f32_e32 [[RESULT:v[0-9]+]], [[INPUT:v[0-9]+]]
 
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+define amdgpu_kernel void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
   %x = load float, float addrspace(1)* %src
   %floor.x = call float @llvm.floor.f32(float %x)
   %fract = fsub float %x, %floor.x
@@ -29,7 +29,7 @@ define void @fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
 ; GCN-UNSAFE: v_fract_f32_e64 [[RESULT:v[0-9]+]], -[[INPUT:v[0-9]+]]
 
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+define amdgpu_kernel void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
   %x = load float, float addrspace(1)* %src
   %x.neg = fsub float -0.0, %x
   %floor.x.neg = call float @llvm.floor.f32(float %x.neg)
@@ -45,7 +45,7 @@ define void @fract_f32_neg(float addrspace(1)* %out, float addrspace(1)* %src) #
 ; GCN-UNSAFE: v_fract_f32_e64 [[RESULT:v[0-9]+]], -|[[INPUT:v[0-9]+]]|
 
 ; GCN: buffer_store_dword [[RESULT]]
-define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+define amdgpu_kernel void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
   %x = load float, float addrspace(1)* %src
   %abs.x = call float @llvm.fabs.f32(float %x)
   %neg.abs.x = fsub float -0.0, %abs.x
@@ -61,7 +61,7 @@ define void @fract_f32_neg_abs(float addrspace(1)* %out, float addrspace(1)* %sr
 
 ; GCN-UNSAFE: buffer_store_dword [[FLOOR]]
 ; GCN-UNSAFE: buffer_store_dword [[FRACT]]
-define void @multi_use_floor_fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
+define amdgpu_kernel void @multi_use_floor_fract_f32(float addrspace(1)* %out, float addrspace(1)* %src) #1 {
   %x = load float, float addrspace(1)* %src
   %floor.x = call float @llvm.floor.f32(float %x)
   %fract = fsub float %x, %floor.x
diff --git a/test/CodeGen/AMDGPU/frem.ll b/test/CodeGen/AMDGPU/frem.ll
index 97533c418c94f862494564a977c39b29191bf13d..9778069d0477be7597b199a6d53724b96146fb10 100644
--- a/test/CodeGen/AMDGPU/frem.ll
+++ b/test/CodeGen/AMDGPU/frem.ll
@@ -12,10 +12,10 @@
 ; GCN: v_mul_f32_e32
 ; GCN: v_div_fmas_f32
 ; GCN: v_div_fixup_f32
-; GCN: v_trunc_f32_e64 v{{[0-9]+}}, -v{{[0-9]+}}
-; GCN: v_mac_f32_e32
+; GCN: v_trunc_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+define amdgpu_kernel void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                       float addrspace(1)* %in2) #0 {
    %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
    %r0 = load float, float addrspace(1)* %in1, align 4
@@ -28,13 +28,12 @@ define void @frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
 ; FUNC-LABEL: {{^}}unsafe_frem_f32:
 ; GCN: buffer_load_dword [[Y:v[0-9]+]], {{.*}} offset:16
 ; GCN: buffer_load_dword [[X:v[0-9]+]], {{.*}}
-; GCN: v_rcp_f32_e64 [[INVY:v[0-9]+]], -[[Y]]
+; GCN: v_rcp_f32_e32 [[INVY:v[0-9]+]], [[Y]]
 ; GCN: v_mul_f32_e32 [[DIV:v[0-9]+]], [[INVY]], [[X]]
 ; GCN: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[DIV]]
-; GCN: v_mac_f32_e32 [[X]], [[Y]], [[TRUNC]]
-; GCN: buffer_store_dword [[X]]
-; GCN: s_endpgm
-define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
+; GCN: v_mad_f32 [[RESULT:v[0-9]+]], -[[TRUNC]], [[Y]], [[X]]
+; GCN: buffer_store_dword [[RESULT]]
+define amdgpu_kernel void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
                              float addrspace(1)* %in2) #1 {
    %gep2 = getelementptr float, float addrspace(1)* %in2, i32 4
    %r0 = load float, float addrspace(1)* %in1, align 4
@@ -55,7 +54,7 @@ define void @unsafe_frem_f32(float addrspace(1)* %out, float addrspace(1)* %in1,
 ; GCN: v_add_f64
 ; GCN: buffer_store_dwordx2
 ; GCN: s_endpgm
-define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) #0 {
    %r0 = load double, double addrspace(1)* %in1, align 8
    %r1 = load double, double addrspace(1)* %in2, align 8
@@ -71,7 +70,7 @@ define void @frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 ; CI: v_trunc_f64_e32
 ; GCN: v_fma_f64
 ; GCN: s_endpgm
-define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                              double addrspace(1)* %in2) #1 {
    %r0 = load double, double addrspace(1)* %in1, align 8
    %r1 = load double, double addrspace(1)* %in2, align 8
@@ -80,7 +79,7 @@ define void @unsafe_frem_f64(double addrspace(1)* %out, double addrspace(1)* %in
    ret void
 }
 
-define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
+define amdgpu_kernel void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in1,
                         <2 x float> addrspace(1)* %in2) #0 {
    %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4
    %r0 = load <2 x float>, <2 x float> addrspace(1)* %in1, align 8
@@ -90,7 +89,7 @@ define void @frem_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)
    ret void
 }
 
-define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
+define amdgpu_kernel void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in1,
                         <4 x float> addrspace(1)* %in2) #0 {
    %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4
    %r0 = load <4 x float>, <4 x float> addrspace(1)* %in1, align 16
@@ -100,7 +99,7 @@ define void @frem_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)
    ret void
 }
 
-define void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
+define amdgpu_kernel void @frem_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in1,
                         <2 x double> addrspace(1)* %in2) #0 {
    %gep2 = getelementptr <2 x double>, <2 x double> addrspace(1)* %in2, i32 4
    %r0 = load <2 x double>, <2 x double> addrspace(1)* %in1, align 16
diff --git a/test/CodeGen/AMDGPU/fsqrt.f64.ll b/test/CodeGen/AMDGPU/fsqrt.f64.ll
index ed040436a61a21751e5f5ae2b1f3eec605208b6c..453d8fb37f2f480d781112e4320c46a72b8225b2 100644
--- a/test/CodeGen/AMDGPU/fsqrt.f64.ll
+++ b/test/CodeGen/AMDGPU/fsqrt.f64.ll
@@ -3,7 +3,7 @@
 
 ; FUNC-LABEL: {{^}}v_safe_fsqrt_f64:
 ; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @v_safe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_safe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #1 {
   %r0 = load double, double addrspace(1)* %in
   %r1 = call double @llvm.sqrt.f64(double %r0)
   store double %r1, double addrspace(1)* %out
@@ -12,7 +12,7 @@ define void @v_safe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %i
 
 ; FUNC-LABEL: {{^}}v_unsafe_fsqrt_f64:
 ; GCN: v_sqrt_f64_e32 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\]}}
-define void @v_unsafe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #2 {
+define amdgpu_kernel void @v_unsafe_fsqrt_f64(double addrspace(1)* %out, double addrspace(1)* %in) #2 {
   %r0 = load double, double addrspace(1)* %in
   %r1 = call double @llvm.sqrt.f64(double %r0)
   store double %r1, double addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/fsqrt.ll b/test/CodeGen/AMDGPU/fsqrt.ll
index b6526b8e0787c8ad2d0b60c4fad0a770bce79519..a0fd3411ca05c03974e90e94ce734bda68345094 100644
--- a/test/CodeGen/AMDGPU/fsqrt.ll
+++ b/test/CodeGen/AMDGPU/fsqrt.ll
@@ -7,7 +7,7 @@
 
 ; FUNC-LABEL: {{^}}v_safe_fsqrt_f32:
 ; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
-define void @v_safe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_safe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %r0 = load float, float addrspace(1)* %in
   %r1 = call float @llvm.sqrt.f32(float %r0)
   store float %r1, float addrspace(1)* %out
@@ -16,7 +16,7 @@ define void @v_safe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in)
 
 ; FUNC-LABEL: {{^}}v_unsafe_fsqrt_f32:
 ; GCN: v_sqrt_f32_e32 {{v[0-9]+, v[0-9]+}}
-define void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #2 {
+define amdgpu_kernel void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %in) #2 {
   %r0 = load float, float addrspace(1)* %in
   %r1 = call float @llvm.sqrt.f32(float %r0)
   store float %r1, float addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @v_unsafe_fsqrt_f32(float addrspace(1)* %out, float addrspace(1)* %i
 
 ; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z
 ; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS
-define void @s_sqrt_f32(float addrspace(1)* %out, float %in) #1 {
+define amdgpu_kernel void @s_sqrt_f32(float addrspace(1)* %out, float %in) #1 {
 entry:
   %fdiv = call float @llvm.sqrt.f32(float %in)
   store float %fdiv, float addrspace(1)* %out
@@ -44,7 +44,7 @@ entry:
 ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS
 ; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X
 ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS
-define void @s_sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+define amdgpu_kernel void @s_sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
 entry:
   %fdiv = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
   store <2 x float> %fdiv, <2 x float> addrspace(1)* %out
@@ -65,7 +65,7 @@ entry:
 ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS
 ; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X
 ; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS
-define void @s_sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
+define amdgpu_kernel void @s_sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 {
 entry:
   %fdiv = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in)
   store <4 x float> %fdiv, <4 x float> addrspace(1)* %out
@@ -75,7 +75,7 @@ entry:
 ; FUNC-LABEL: {{^}}elim_redun_check_neg0:
 ; GCN: v_sqrt_f32_e32
 ; GCN-NOT: v_cndmask
-define void @elim_redun_check_neg0(float addrspace(1)* %out, float %in) #1 {
+define amdgpu_kernel void @elim_redun_check_neg0(float addrspace(1)* %out, float %in) #1 {
 entry:
   %sqrt = call float @llvm.sqrt.f32(float %in)
   %cmp = fcmp olt float %in, -0.000000e+00
@@ -87,7 +87,7 @@ entry:
 ; FUNC-LABEL: {{^}}elim_redun_check_pos0:
 ; GCN: v_sqrt_f32_e32
 ; GCN-NOT: v_cndmask
-define void @elim_redun_check_pos0(float addrspace(1)* %out, float %in) #1 {
+define amdgpu_kernel void @elim_redun_check_pos0(float addrspace(1)* %out, float %in) #1 {
 entry:
   %sqrt = call float @llvm.sqrt.f32(float %in)
   %cmp = fcmp olt float %in, 0.000000e+00
@@ -99,7 +99,7 @@ entry:
 ; FUNC-LABEL: {{^}}elim_redun_check_ult:
 ; GCN: v_sqrt_f32_e32
 ; GCN-NOT: v_cndmask
-define void @elim_redun_check_ult(float addrspace(1)* %out, float %in) #1 {
+define amdgpu_kernel void @elim_redun_check_ult(float addrspace(1)* %out, float %in) #1 {
 entry:
   %sqrt = call float @llvm.sqrt.f32(float %in)
   %cmp = fcmp ult float %in, -0.000000e+00
@@ -112,7 +112,7 @@ entry:
 ; GCN: v_sqrt_f32_e32
 ; GCN: v_sqrt_f32_e32
 ; GCN-NOT: v_cndmask
-define void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+define amdgpu_kernel void @elim_redun_check_v2(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
 entry:
   %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
   %cmp = fcmp olt <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
@@ -125,7 +125,7 @@ entry:
 ; GCN: v_sqrt_f32_e32
 ; GCN: v_sqrt_f32_e32
 ; GCN-NOT: v_cndmask
-define void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
+define amdgpu_kernel void @elim_redun_check_v2_ult(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 {
 entry:
   %sqrt = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in)
   %cmp = fcmp ult <2 x float> %in, <float -0.000000e+00, float -0.000000e+00>
diff --git a/test/CodeGen/AMDGPU/fsub.f16.ll b/test/CodeGen/AMDGPU/fsub.f16.ll
index 1565569523990af8a23f4d61a83deee8b3273618..d3c5df3177713369bb780f7dbfdc62c0c3485b31 100644
--- a/test/CodeGen/AMDGPU/fsub.f16.ll
+++ b/test/CodeGen/AMDGPU/fsub.f16.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
 
 ; GCN-LABEL: {{^}}fsub_f16:
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
@@ -8,10 +9,10 @@
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
 ; SI:  v_subrev_f32_e32 v[[R_F32:[0-9]+]], v[[B_F32]], v[[A_F32]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI:  v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
+; GFX89:  v_subrev_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fsub_f16(
+define amdgpu_kernel void @fsub_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -28,10 +29,10 @@ entry:
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32:[0-9]+]], v[[B_F16]]
 ; SI:  v_sub_f32_e32 v[[R_F32:[0-9]+]], 1.0, v[[B_F32]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI:  v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
+; GFX89:  v_sub_f16_e32 v[[R_F16:[0-9]+]], 1.0, v[[B_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fsub_f16_imm_a(
+define amdgpu_kernel void @fsub_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -46,10 +47,10 @@ entry:
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
 ; SI:  v_add_f32_e32 v[[R_F32:[0-9]+]], -2.0, v[[A_F32]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI:  v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]]
+; GFX89:  v_add_f16_e32 v[[R_F16:[0-9]+]], -2.0, v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fsub_f16_imm_b(
+define amdgpu_kernel void @fsub_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -62,24 +63,30 @@ entry:
 ; GCN-LABEL: {{^}}fsub_v2f16:
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
 ; SI:  v_subrev_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_subrev_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI:  v_subrev_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_subrev_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_subrev_f16_sdwa v[[R_F16_HI:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
+
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fsub_v2f16(
+
+define amdgpu_kernel void @fsub_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -93,21 +100,30 @@ entry:
 
 ; GCN-LABEL: {{^}}fsub_v2f16_imm_a:
 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
-; SI:  v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
-; VI:  v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
+; SI: v_sub_f32_e32 v[[R_F32_0:[0-9]+]], 1.0, v[[B_F32_0]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI: v_sub_f32_e32 v[[R_F32_1:[0-9]+]], 2.0, v[[B_F32_1]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; VI-DAG: v_sub_f16_e32 v[[R_F16_1:[0-9]+]], 2.0, v[[B_F16_1]]
+; VI-DAG: v_sub_f16_e32 v[[R_F16_0:[0-9]+]], 1.0, v[[B_V2_F16]]
+; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x40003c00
+; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[B_V2_F16]] neg_lo:[0,1] neg_hi:[0,1]
+
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fsub_v2f16_imm_a(
+
+define amdgpu_kernel void @fsub_v2f16_imm_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b) {
 entry:
@@ -119,21 +135,30 @@ entry:
 
 ; GCN-LABEL: {{^}}fsub_v2f16_imm_b:
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
-; VI:  v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI: v_add_f32_e32 v[[R_F32_0:[0-9]+]], -2.0, v[[A_F32_0]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI: v_add_f32_e32 v[[R_F32_1:[0-9]+]], -1.0, v[[A_F32_1]]
+; SI: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI-DAG: v_add_f16_e32 v[[R_F16_1:[0-9]+]], -1.0, v[[A_F16_1]]
+; VI-DAG: v_add_f16_e32 v[[R_F16_0:[0-9]+]], -2.0, v[[A_V2_F16]]
+; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; GFX9: s_mov_b32 [[K:s[0-9]+]], 0xbc00c000
+; GFX9: v_pk_add_f16 v[[R_V2_F16:[0-9]+]], [[K]], v[[A_V2_F16]]{{$}}
+
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fsub_v2f16_imm_b(
+
+define amdgpu_kernel void @fsub_v2f16_imm_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/fsub.ll b/test/CodeGen/AMDGPU/fsub.ll
index a92035f2235809a84bcfe85505771cdd61cc15a0..e7a92d95d48593e879bb2ece4c95835057153bcf 100644
--- a/test/CodeGen/AMDGPU/fsub.ll
+++ b/test/CodeGen/AMDGPU/fsub.ll
@@ -4,7 +4,7 @@
 
 ; FUNC-LABEL: {{^}}v_fsub_f32:
 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %a = load float, float addrspace(1)* %in, align 4
   %b = load float, float addrspace(1)* %b_ptr, align 4
@@ -17,23 +17,19 @@ define void @v_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
 ; R600: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, -KC0[2].W
 
 ; SI: v_sub_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-define void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) {
+define amdgpu_kernel void @s_fsub_f32(float addrspace(1)* %out, float %a, float %b) {
   %sub = fsub float %a, %b
   store float %sub, float addrspace(1)* %out, align 4
   ret void
 }
 
-declare float @llvm.r600.load.input(i32) readnone
-
-declare void @llvm.AMDGPU.store.output(float, i32)
-
 ; FUNC-LABEL: {{^}}fsub_v2f32:
 ; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[3].X, -KC0[3].Z
 ; R600-DAG: ADD {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].W, -KC0[3].Y
 
 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
+define amdgpu_kernel void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b) {
   %sub = fsub <2 x float> %a, %b
   store <2 x float> %sub, <2 x float> addrspace(1)* %out, align 8
   ret void
@@ -49,7 +45,7 @@ define void @fsub_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x flo
 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x float>, <4 x float> addrspace(1)* %in, i32 1
   %a = load <4 x float>, <4 x float> addrspace(1)* %in, align 16
   %b = load <4 x float>, <4 x float> addrspace(1)* %b_ptr, align 16
@@ -64,7 +60,7 @@ define void @v_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(
 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; SI: v_subrev_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; SI: s_endpgm
-define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) {
+define amdgpu_kernel void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b) {
   %result = fsub <4 x float> %a, %b
   store <4 x float> %result, <4 x float> addrspace(1)* %out, align 16
   ret void
@@ -73,7 +69,7 @@ define void @s_fsub_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x f
 ; FUNC-LABEL: {{^}}v_fneg_fsub_f32:
 ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
-define void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %a = load float, float addrspace(1)* %in, align 4
   %b = load float, float addrspace(1)* %b_ptr, align 4
@@ -86,7 +82,7 @@ define void @v_fneg_fsub_f32(float addrspace(1)* %out, float addrspace(1)* %in)
 ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_f32:
 ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
 ; SI-NOT: xor
-define void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %a = load float, float addrspace(1)* %in, align 4
   %b = load float, float addrspace(1)* %b_ptr, align 4
@@ -99,7 +95,7 @@ define void @v_fneg_fsub_nsz_f32(float addrspace(1)* %out, float addrspace(1)* %
 ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_attribute_f32:
 ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
 ; SI-NOT: xor
-define void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %a = load float, float addrspace(1)* %in, align 4
   %b = load float, float addrspace(1)* %b_ptr, align 4
@@ -115,7 +111,7 @@ define void @v_fneg_fsub_nsz_attribute_f32(float addrspace(1)* %out, float addrs
 ; FUNC-LABEL: {{^}}v_fneg_fsub_nsz_false_attribute_f32:
 ; SI: v_subrev_f32_e32 [[SUB:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, [[SUB]]
-define void @v_fneg_fsub_nsz_false_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_fneg_fsub_nsz_false_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %a = load float, float addrspace(1)* %in, align 4
   %b = load float, float addrspace(1)* %b_ptr, align 4
@@ -125,5 +121,14 @@ define void @v_fneg_fsub_nsz_false_attribute_f32(float addrspace(1)* %out, float
   ret void
 }
 
+; FUNC-LABEL: {{^}}v_fsub_0_nsz_attribute_f32:
+; SI-NOT: v_sub
+define amdgpu_kernel void @v_fsub_0_nsz_attribute_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+  %a = load float, float addrspace(1)* %in, align 4
+  %result = fsub float %a, 0.0
+  store float %result, float addrspace(1)* %out, align 4
+  ret void
+}
+
 attributes #0 = { nounwind "no-signed-zeros-fp-math"="true" }
 attributes #1 = { nounwind "no-signed-zeros-fp-math"="false" }
diff --git a/test/CodeGen/AMDGPU/fsub64.ll b/test/CodeGen/AMDGPU/fsub64.ll
index 4c9c5ddd4c6ed9d8e7ce8ce8a0800c5e11e291cc..1b0879d098ee01fa572fc1666aa614dc95a5f2f0 100644
--- a/test/CodeGen/AMDGPU/fsub64.ll
+++ b/test/CodeGen/AMDGPU/fsub64.ll
@@ -5,7 +5,7 @@ declare double @llvm.fabs.f64(double) #0
 
 ; SI-LABEL: {{^}}fsub_f64:
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
-define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                       double addrspace(1)* %in2) {
   %r0 = load double, double addrspace(1)* %in1
   %r1 = load double, double addrspace(1)* %in2
@@ -16,7 +16,7 @@ define void @fsub_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 
 ; SI-LABEL: {{^}}fsub_fabs_f64:
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -\|v\[[0-9]+:[0-9]+\]\|}}
-define void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                            double addrspace(1)* %in2) {
   %r0 = load double, double addrspace(1)* %in1
   %r1 = load double, double addrspace(1)* %in2
@@ -28,7 +28,7 @@ define void @fsub_fabs_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
 
 ; SI-LABEL: {{^}}fsub_fabs_inv_f64:
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], |v\[[0-9]+:[0-9]+\]|, -v\[[0-9]+:[0-9]+\]}}
-define void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
+define amdgpu_kernel void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %in1,
                                double addrspace(1)* %in2) {
   %r0 = load double, double addrspace(1)* %in1
   %r1 = load double, double addrspace(1)* %in2
@@ -40,7 +40,7 @@ define void @fsub_fabs_inv_f64(double addrspace(1)* %out, double addrspace(1)* %
 
 ; SI-LABEL: {{^}}s_fsub_f64:
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\]}}
-define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) {
+define amdgpu_kernel void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) {
   %sub = fsub double %a, %b
   store double %sub, double addrspace(1)* %out
   ret void
@@ -48,7 +48,7 @@ define void @s_fsub_f64(double addrspace(1)* %out, double %a, double %b) {
 
 ; SI-LABEL: {{^}}s_fsub_imm_f64:
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}}, 4.0
-define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) {
+define amdgpu_kernel void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) {
   %sub = fsub double 4.0, %a
   store double %sub, double addrspace(1)* %out
   ret void
@@ -56,7 +56,7 @@ define void @s_fsub_imm_f64(double addrspace(1)* %out, double %a, double %b) {
 
 ; SI-LABEL: {{^}}s_fsub_imm_inv_f64:
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\]}}, -4.0
-define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) {
+define amdgpu_kernel void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b) {
   %sub = fsub double %a, 4.0
   store double %sub, double addrspace(1)* %out
   ret void
@@ -64,7 +64,7 @@ define void @s_fsub_imm_inv_f64(double addrspace(1)* %out, double %a, double %b)
 
 ; SI-LABEL: {{^}}s_fsub_self_f64:
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -s\[[0-9]+:[0-9]+\]}}
-define void @s_fsub_self_f64(double addrspace(1)* %out, double %a) {
+define amdgpu_kernel void @s_fsub_self_f64(double addrspace(1)* %out, double %a) {
   %sub = fsub double %a, %a
   store double %sub, double addrspace(1)* %out
   ret void
@@ -73,7 +73,7 @@ define void @s_fsub_self_f64(double addrspace(1)* %out, double %a) {
 ; SI-LABEL: {{^}}fsub_v2f64:
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
-define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) {
+define amdgpu_kernel void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b) {
   %sub = fsub <2 x double> %a, %b
   store <2 x double> %sub, <2 x double> addrspace(1)* %out
   ret void
@@ -84,7 +84,7 @@ define void @fsub_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x d
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
-define void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) {
+define amdgpu_kernel void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x double>, <4 x double> addrspace(1)* %in, i32 1
   %a = load <4 x double>, <4 x double> addrspace(1)* %in
   %b = load <4 x double>, <4 x double> addrspace(1)* %b_ptr
@@ -98,7 +98,7 @@ define void @fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
 ; SI: v_add_f64 {{v\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], -v\[[0-9]+:[0-9]+\]}}
-define void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) {
+define amdgpu_kernel void @s_fsub_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b) {
   %result = fsub <4 x double> %a, %b
   store <4 x double> %result, <4 x double> addrspace(1)* %out, align 16
   ret void
diff --git a/test/CodeGen/AMDGPU/ftrunc.f64.ll b/test/CodeGen/AMDGPU/ftrunc.f64.ll
index c4138ad79c28bcfced4cd1b49919c8c37dbe4732..1f72ec65588ea0faf04ece9e6912edc98da722d8 100644
--- a/test/CodeGen/AMDGPU/ftrunc.f64.ll
+++ b/test/CodeGen/AMDGPU/ftrunc.f64.ll
@@ -13,7 +13,7 @@ declare <16 x double> @llvm.trunc.v16f64(<16 x double>) nounwind readnone
 ; CI: v_trunc_f64
 ; SI: v_bfe_u32 {{v[0-9]+}}, {{v[0-9]+}}, 20, 11
 ; SI: s_endpgm
-define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
   %x = load double, double addrspace(1)* %in, align 8
   %y = call double @llvm.trunc.f64(double %x) nounwind readnone
   store double %y, double addrspace(1)* %out, align 8
@@ -36,7 +36,7 @@ define void @v_ftrunc_f64(double addrspace(1)* %out, double addrspace(1)* %in) {
 ; SI-DAG: cndmask_b32
 ; SI-DAG: cndmask_b32
 ; SI: s_endpgm
-define void @ftrunc_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @ftrunc_f64(double addrspace(1)* %out, double %x) {
   %y = call double @llvm.trunc.f64(double %x) nounwind readnone
   store double %y, double addrspace(1)* %out
   ret void
@@ -45,7 +45,7 @@ define void @ftrunc_f64(double addrspace(1)* %out, double %x) {
 ; FUNC-LABEL: {{^}}ftrunc_v2f64:
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
-define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
+define amdgpu_kernel void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
   %y = call <2 x double> @llvm.trunc.v2f64(<2 x double> %x) nounwind readnone
   store <2 x double> %y, <2 x double> addrspace(1)* %out
   ret void
@@ -55,7 +55,7 @@ define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
 ; FIXME-CI: v_trunc_f64_e32
 ; FIXME-CI: v_trunc_f64_e32
 ; FIXME-CI: v_trunc_f64_e32
-; define void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
+; define amdgpu_kernel void @ftrunc_v3f64(<3 x double> addrspace(1)* %out, <3 x double> %x) {
 ;   %y = call <3 x double> @llvm.trunc.v3f64(<3 x double> %x) nounwind readnone
 ;   store <3 x double> %y, <3 x double> addrspace(1)* %out
 ;   ret void
@@ -66,7 +66,7 @@ define void @ftrunc_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %x) {
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
-define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
+define amdgpu_kernel void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
   %y = call <4 x double> @llvm.trunc.v4f64(<4 x double> %x) nounwind readnone
   store <4 x double> %y, <4 x double> addrspace(1)* %out
   ret void
@@ -81,7 +81,7 @@ define void @ftrunc_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %x) {
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
-define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
+define amdgpu_kernel void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
   %y = call <8 x double> @llvm.trunc.v8f64(<8 x double> %x) nounwind readnone
   store <8 x double> %y, <8 x double> addrspace(1)* %out
   ret void
@@ -104,7 +104,7 @@ define void @ftrunc_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %x) {
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
 ; CI: v_trunc_f64_e32
-define void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
+define amdgpu_kernel void @ftrunc_v16f64(<16 x double> addrspace(1)* %out, <16 x double> %x) {
   %y = call <16 x double> @llvm.trunc.v16f64(<16 x double> %x) nounwind readnone
   store <16 x double> %y, <16 x double> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/ftrunc.ll b/test/CodeGen/AMDGPU/ftrunc.ll
index d0718394e7f13937eb45446aab828ed0e2e34619..b5ad01eaeaf0aac18db650b3d0c40ba075791162 100644
--- a/test/CodeGen/AMDGPU/ftrunc.ll
+++ b/test/CodeGen/AMDGPU/ftrunc.ll
@@ -12,7 +12,7 @@ declare <16 x float> @llvm.trunc.v16f32(<16 x float>) nounwind readnone
 ; FUNC-LABEL: {{^}}ftrunc_f32:
 ; EG: TRUNC
 ; SI: v_trunc_f32_e32
-define void @ftrunc_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @ftrunc_f32(float addrspace(1)* %out, float %x) {
   %y = call float @llvm.trunc.f32(float %x) nounwind readnone
   store float %y, float addrspace(1)* %out
   ret void
@@ -23,7 +23,7 @@ define void @ftrunc_f32(float addrspace(1)* %out, float %x) {
 ; EG: TRUNC
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
-define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
+define amdgpu_kernel void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
   %y = call <2 x float> @llvm.trunc.v2f32(<2 x float> %x) nounwind readnone
   store <2 x float> %y, <2 x float> addrspace(1)* %out
   ret void
@@ -36,7 +36,7 @@ define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
 ; FIXME-SI: v_trunc_f32_e32
 ; FIXME-SI: v_trunc_f32_e32
 ; FIXME-SI: v_trunc_f32_e32
-; define void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
+; define amdgpu_kernel void @ftrunc_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %x) {
 ;   %y = call <3 x float> @llvm.trunc.v3f32(<3 x float> %x) nounwind readnone
 ;   store <3 x float> %y, <3 x float> addrspace(1)* %out
 ;   ret void
@@ -51,7 +51,7 @@ define void @ftrunc_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %x) {
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
-define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
+define amdgpu_kernel void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
   %y = call <4 x float> @llvm.trunc.v4f32(<4 x float> %x) nounwind readnone
   store <4 x float> %y, <4 x float> addrspace(1)* %out
   ret void
@@ -74,7 +74,7 @@ define void @ftrunc_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %x) {
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
-define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
+define amdgpu_kernel void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
   %y = call <8 x float> @llvm.trunc.v8f32(<8 x float> %x) nounwind readnone
   store <8 x float> %y, <8 x float> addrspace(1)* %out
   ret void
@@ -113,7 +113,7 @@ define void @ftrunc_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %x) {
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
 ; SI: v_trunc_f32_e32
-define void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
+define amdgpu_kernel void @ftrunc_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %x) {
   %y = call <16 x float> @llvm.trunc.v16f32(<16 x float> %x) nounwind readnone
   store <16 x float> %y, <16 x float> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/gep-address-space.ll b/test/CodeGen/AMDGPU/gep-address-space.ll
index f96463613e8ef1a297121796090dc8d05dd80212..7fb47e08ea58c51b6e2d36c7e59b8b4952033a96 100644
--- a/test/CodeGen/AMDGPU/gep-address-space.ll
+++ b/test/CodeGen/AMDGPU/gep-address-space.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs< %s | FileCheck --check-prefix=CI --check-prefix=CHECK %s
 
-define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
+define amdgpu_kernel void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
 ; CHECK-LABEL: {{^}}use_gep_address_space:
 ; CHECK: v_mov_b32_e32 [[PTR:v[0-9]+]], s{{[0-9]+}}
 ; CHECK: ds_write_b32 [[PTR]], v{{[0-9]+}} offset:64
@@ -17,7 +17,7 @@ define void @use_gep_address_space([1024 x i32] addrspace(3)* %array) nounwind {
 ; SI: s_or_b32
 ; CI: s_add_i32
 ; CHECK: ds_write_b32
-define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
+define amdgpu_kernel void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %array) nounwind {
   %p = getelementptr [1024 x i32], [1024 x i32] addrspace(3)* %array, i16 0, i16 16384
   store i32 99, i32 addrspace(3)* %p
   ret void
@@ -39,7 +39,7 @@ define void @use_gep_address_space_large_offset([1024 x i32] addrspace(3)* %arra
 ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
 ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
 ; CHECK: s_endpgm
-define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind {
+define amdgpu_kernel void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind {
   %p = getelementptr [1024 x i32], <4 x [1024 x i32] addrspace(3)*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
   %p0 = extractelement <4 x i32 addrspace(3)*> %p, i32 0
   %p1 = extractelement <4 x i32 addrspace(3)*> %p, i32 1
@@ -60,7 +60,7 @@ define void @gep_as_vector_v4(<4 x [1024 x i32] addrspace(3)*> %array) nounwind
 ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
 ; CI-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:64
 ; CHECK: s_endpgm
-define void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind {
+define amdgpu_kernel void @gep_as_vector_v2(<2 x [1024 x i32] addrspace(3)*> %array) nounwind {
   %p = getelementptr [1024 x i32], <2 x [1024 x i32] addrspace(3)*> %array, <2 x i16> zeroinitializer, <2 x i16> <i16 16, i16 16>
   %p0 = extractelement <2 x i32 addrspace(3)*> %p, i32 0
   %p1 = extractelement <2 x i32 addrspace(3)*> %p, i32 1
diff --git a/test/CodeGen/AMDGPU/global-constant.ll b/test/CodeGen/AMDGPU/global-constant.ll
index 5a18d425d506623c2b406c2b3385f23316467434..80acfcca70822ce1ad3680b75a60eb7f1608c703 100644
--- a/test/CodeGen/AMDGPU/global-constant.ll
+++ b/test/CodeGen/AMDGPU/global-constant.ll
@@ -26,7 +26,7 @@
 ; HSA: s_add_u32 s{{[0-9]+}}, s[[PC1_LO]], private2@rel32@lo+4
 ; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC1_HI]], private2@rel32@hi+4
 
-define void @private_test(i32 %index, float addrspace(1)* %out) {
+define amdgpu_kernel void @private_test(i32 %index, float addrspace(1)* %out) {
   %ptr = getelementptr [4 x float], [4 x float] addrspace(2) * @private1, i32 0, i32 %index
   %val = load float, float addrspace(2)* %ptr
   store float %val, float addrspace(1)* %out
@@ -40,7 +40,7 @@ define void @private_test(i32 %index, float addrspace(1)* %out) {
 ; HSA: s_getpc_b64 s{{\[}}[[PC0_LO:[0-9]+]]:[[PC0_HI:[0-9]+]]{{\]}}
 ; HSA: s_add_u32 s{{[0-9]+}}, s[[PC0_LO]], available_externally@gotpcrel32@lo+4
 ; HSA: s_addc_u32 s{{[0-9]+}}, s[[PC0_HI]], available_externally@gotpcrel32@hi+4
-define void @available_externally_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(2)* @available_externally, i32 0, i32 1
   %val = load i32, i32 addrspace(2)* %ptr
   store i32 %val, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/global-directive.ll b/test/CodeGen/AMDGPU/global-directive.ll
index 450b7d3674296cc3e05ea4e3cd0ce26e69b19541..ce89e390eac1f901a0500e312a86e4033665e76a 100644
--- a/test/CodeGen/AMDGPU/global-directive.ll
+++ b/test/CodeGen/AMDGPU/global-directive.ll
@@ -5,7 +5,7 @@
 
 ; SI:	.globl	foo
 ; SI: {{^}}foo:
-define void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @foo(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
diff --git a/test/CodeGen/AMDGPU/global-extload-i16.ll b/test/CodeGen/AMDGPU/global-extload-i16.ll
index 2c7c02de167325fea5f21aeb05d25b622dd65394..19e592f50beaf0d69b6342cf10ab98f30cd81d63 100644
--- a/test/CodeGen/AMDGPU/global-extload-i16.ll
+++ b/test/CodeGen/AMDGPU/global-extload-i16.ll
@@ -7,7 +7,7 @@
 ; SI: buffer_load_ushort
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
   %a = load i16, i16 addrspace(1)* %in
   %ext = zext i16 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -18,7 +18,7 @@ define void @zextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
 ; SI: buffer_load_sshort
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
   %a = load i16, i16 addrspace(1)* %in
   %ext = sext i16 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -28,7 +28,7 @@ define void @sextload_global_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
 ; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i32:
 ; SI: buffer_load_ushort
 ; SI: s_endpgm
-define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = zext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -38,7 +38,7 @@ define void @zextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i
 ; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i32:
 ; SI: buffer_load_sshort
 ; SI: s_endpgm
-define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = sext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @sextload_global_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i
 
 ; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i32:
 ; SI: s_endpgm
-define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = zext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -56,7 +56,7 @@ define void @zextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i
 
 ; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i32:
 ; SI: s_endpgm
-define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = sext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -65,7 +65,7 @@ define void @sextload_global_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i
 
 ; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i32:
 ; SI: s_endpgm
-define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = zext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -74,7 +74,7 @@ define void @zextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i
 
 ; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i32:
 ; SI: s_endpgm
-define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = sext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -83,7 +83,7 @@ define void @sextload_global_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i
 
 ; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i32:
 ; SI: s_endpgm
-define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = zext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -92,7 +92,7 @@ define void @zextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i
 
 ; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i32:
 ; SI: s_endpgm
-define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = sext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -101,7 +101,7 @@ define void @sextload_global_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i
 
 ; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i32:
 ; SI: s_endpgm
-define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = zext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -110,7 +110,7 @@ define void @zextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 
 ; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i32:
 ; SI: s_endpgm
-define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = sext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -119,7 +119,7 @@ define void @sextload_global_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 
 ; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i32:
 ; SI: s_endpgm
-define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = zext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -128,7 +128,7 @@ define void @zextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32
 
 ; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i32:
 ; SI: s_endpgm
-define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = sext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -137,7 +137,7 @@ define void @sextload_global_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32
 
 ; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i32:
 ; SI: s_endpgm
-define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
   %ext = zext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -146,7 +146,7 @@ define void @zextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64
 
 ; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i32:
 ; SI: s_endpgm
-define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
   %ext = sext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -157,7 +157,7 @@ define void @sextload_global_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64
 ; SI-DAG: buffer_load_ushort v[[LO:[0-9]+]],
 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]]
-define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
   %a = load i16, i16 addrspace(1)* %in
   %ext = zext i16 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -168,7 +168,7 @@ define void @zextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
 ; VI: buffer_load_ushort [[LOAD:v[0-9]+]], s[{{[0-9]+:[0-9]+}}], 0
 ; VI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[LOAD]]
 ; VI: buffer_store_dwordx2 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0
-define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
   %a = load i16, i16 addrspace(1)* %in
   %ext = sext i16 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -177,7 +177,7 @@ define void @sextload_global_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
 
 ; FUNC-LABEL: {{^}}zextload_global_v1i16_to_v1i64:
 ; SI: s_endpgm
-define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = zext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -186,7 +186,7 @@ define void @zextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i
 
 ; FUNC-LABEL: {{^}}sextload_global_v1i16_to_v1i64:
 ; SI: s_endpgm
-define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = sext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -195,7 +195,7 @@ define void @sextload_global_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i
 
 ; FUNC-LABEL: {{^}}zextload_global_v2i16_to_v2i64:
 ; SI: s_endpgm
-define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = zext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -204,7 +204,7 @@ define void @zextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
 
 ; FUNC-LABEL: {{^}}sextload_global_v2i16_to_v2i64:
 ; SI: s_endpgm
-define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = sext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -213,7 +213,7 @@ define void @sextload_global_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
 
 ; FUNC-LABEL: {{^}}zextload_global_v4i16_to_v4i64:
 ; SI: s_endpgm
-define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = zext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -222,7 +222,7 @@ define void @zextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
 
 ; FUNC-LABEL: {{^}}sextload_global_v4i16_to_v4i64:
 ; SI: s_endpgm
-define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = sext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -231,7 +231,7 @@ define void @sextload_global_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
 
 ; FUNC-LABEL: {{^}}zextload_global_v8i16_to_v8i64:
 ; SI: s_endpgm
-define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = zext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -240,7 +240,7 @@ define void @zextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
 
 ; FUNC-LABEL: {{^}}sextload_global_v8i16_to_v8i64:
 ; SI: s_endpgm
-define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = sext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -249,7 +249,7 @@ define void @sextload_global_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
 
 ; FUNC-LABEL: {{^}}zextload_global_v16i16_to_v16i64:
 ; SI: s_endpgm
-define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = zext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -258,7 +258,7 @@ define void @zextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 
 ; FUNC-LABEL: {{^}}sextload_global_v16i16_to_v16i64:
 ; SI: s_endpgm
-define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = sext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -267,7 +267,7 @@ define void @sextload_global_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 
 ; FUNC-LABEL: {{^}}zextload_global_v32i16_to_v32i64:
 ; SI: s_endpgm
-define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = zext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -276,7 +276,7 @@ define void @zextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32
 
 ; FUNC-LABEL: {{^}}sextload_global_v32i16_to_v32i64:
 ; SI: s_endpgm
-define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = sext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -285,7 +285,7 @@ define void @sextload_global_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32
 
 ; FUNC-LABEL: {{^}}zextload_global_v64i16_to_v64i64:
 ; SI: s_endpgm
-define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
   %ext = zext <64 x i16> %load to <64 x i64>
   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -294,7 +294,7 @@ define void @zextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64
 
 ; FUNC-LABEL: {{^}}sextload_global_v64i16_to_v64i64:
 ; SI: s_endpgm
-define void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
+define amdgpu_kernel void @sextload_global_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* nocapture %in) nounwind {
   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
   %ext = sext <64 x i16> %load to <64 x i64>
   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/global-variable-relocs.ll b/test/CodeGen/AMDGPU/global-variable-relocs.ll
index 00be6e4d5c152214cbff34e0519938b2e2b16dae..ae6dd54fec6c38f78b26e31f02e2c87d3838ac27 100644
--- a/test/CodeGen/AMDGPU/global-variable-relocs.ll
+++ b/test/CodeGen/AMDGPU/global-variable-relocs.ll
@@ -19,7 +19,7 @@
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @private_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @private_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @private, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -33,7 +33,7 @@ define void @private_test(i32 addrspace(1)* %out) {
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[ADDR_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[ADDR_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @internal_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @internal_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @internal, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -50,7 +50,7 @@ define void @internal_test(i32 addrspace(1)* %out) {
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @available_externally_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @available_externally_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @available_externally, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -67,7 +67,7 @@ define void @available_externally_test(i32 addrspace(1)* %out) {
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @linkonce_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @linkonce_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -84,7 +84,7 @@ define void @linkonce_test(i32 addrspace(1)* %out) {
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @weak_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @weak_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -101,7 +101,7 @@ define void @weak_test(i32 addrspace(1)* %out) {
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @common_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @common_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @common, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -118,7 +118,7 @@ define void @common_test(i32 addrspace(1)* %out) {
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @extern_weak_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @extern_weak_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @extern_weak, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -135,7 +135,7 @@ define void @extern_weak_test(i32 addrspace(1)* %out) {
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @linkonce_odr_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @linkonce_odr_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @linkonce_odr, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -152,7 +152,7 @@ define void @linkonce_odr_test(i32 addrspace(1)* %out) {
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @weak_odr_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @weak_odr_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @weak_odr, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -169,7 +169,7 @@ define void @weak_odr_test(i32 addrspace(1)* %out) {
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @external_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @external_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
@@ -186,7 +186,7 @@ define void @external_test(i32 addrspace(1)* %out) {
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[GEP_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[GEP_HI]]
 ; CHECK: flat_load_dword v{{[0-9]+}}, v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @external_w_init_test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @external_w_init_test(i32 addrspace(1)* %out) {
   %ptr = getelementptr [256 x i32], [256 x i32] addrspace(1)* @external_w_init, i32 0, i32 1
   %val = load i32, i32 addrspace(1)* %ptr
   store i32 %val, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/global_atomics.ll b/test/CodeGen/AMDGPU/global_atomics.ll
index 909ceb5546c6b974fcc52ed2a5f087e921cbc7da..6928bede547efa8311e5cd2c1cdd1550e1842ecf 100644
--- a/test/CodeGen/AMDGPU/global_atomics.ll
+++ b/test/CodeGen/AMDGPU/global_atomics.ll
@@ -3,7 +3,7 @@
 
 ; FUNC-LABEL: {{^}}atomic_add_i32_offset:
 ; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -13,7 +13,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_add_i32_soffset:
 ; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x8ca0
 ; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}}
-define void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000
   %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -25,7 +25,7 @@ entry:
 ; SI-DAG: v_mov_b32_e32 v[[PTRHI:[0-9]+]], 0xabcd
 ; SI: buffer_atomic_add v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}}, s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_add
-define void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595
 
@@ -36,7 +36,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_add_i32_ret_offset:
 ; GCN: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -47,7 +47,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset:
 ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -59,7 +59,7 @@ entry:
 ; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -70,7 +70,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_add_i32:
 ; GCN: buffer_atomic_add v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -79,7 +79,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_add_i32_ret:
 ; GCN: buffer_atomic_add [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -89,7 +89,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_add_i32_addr64:
 ; SI: buffer_atomic_add v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_add v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -100,7 +100,7 @@ entry:
 ; SI: buffer_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_add [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -110,7 +110,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_and_i32_offset:
 ; GCN: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -120,7 +120,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_and_i32_ret_offset:
 ; GCN: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -131,7 +131,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset:
 ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -143,7 +143,7 @@ entry:
 ; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_and [[RET:v[0-9]]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -154,7 +154,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_and_i32:
 ; GCN: buffer_atomic_and v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -163,7 +163,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_and_i32_ret:
 ; GCN: buffer_atomic_and [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -173,7 +173,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_and_i32_addr64:
 ; SI: buffer_atomic_and v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_and v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -184,7 +184,7 @@ entry:
 ; SI: buffer_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_and [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -194,7 +194,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_sub_i32_offset:
 ; GCN: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -204,7 +204,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_sub_i32_ret_offset:
 ; GCN: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -215,7 +215,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset:
 ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 ; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -227,7 +227,7 @@ entry:
 ; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -238,7 +238,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_sub_i32:
 ; GCN: buffer_atomic_sub v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -247,7 +247,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_sub_i32_ret:
 ; GCN: buffer_atomic_sub [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -257,7 +257,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64:
 ; SI: buffer_atomic_sub v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_sub v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -268,7 +268,7 @@ entry:
 ; SI: buffer_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_sub [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -278,7 +278,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_max_i32_offset:
 ; GCN: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -288,7 +288,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_max_i32_ret_offset:
 ; GCN: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -299,7 +299,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset:
 ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -311,7 +311,7 @@ entry:
 ; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -322,7 +322,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_max_i32:
 ; GCN: buffer_atomic_smax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -331,7 +331,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_max_i32_ret:
 ; GCN: buffer_atomic_smax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -341,7 +341,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_max_i32_addr64:
 ; SI: buffer_atomic_smax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_smax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -352,7 +352,7 @@ entry:
 ; SI: buffer_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_smax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -362,7 +362,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_umax_i32_offset:
 ; GCN: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -372,7 +372,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_umax_i32_ret_offset:
 ; GCN: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -383,7 +383,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset:
 ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -395,7 +395,7 @@ entry:
 ; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -406,7 +406,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_umax_i32:
 ; GCN: buffer_atomic_umax v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -415,7 +415,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_umax_i32_ret:
 ; GCN: buffer_atomic_umax [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -425,7 +425,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64:
 ; SI: buffer_atomic_umax v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_umax v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -436,7 +436,7 @@ entry:
 ; SI: buffer_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_umax [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -446,7 +446,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_min_i32_offset:
 ; GCN: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -456,7 +456,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_min_i32_ret_offset:
 ; GCN: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -467,7 +467,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset:
 ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -479,7 +479,7 @@ entry:
 ; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -490,7 +490,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_min_i32:
 ; GCN: buffer_atomic_smin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -499,7 +499,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_min_i32_ret:
 ; GCN: buffer_atomic_smin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -509,7 +509,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_min_i32_addr64:
 ; SI: buffer_atomic_smin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_smin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -520,7 +520,7 @@ entry:
 ; SI: buffer_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_smin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -530,7 +530,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_umin_i32_offset:
 ; GCN: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -540,7 +540,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_umin_i32_ret_offset:
 ; GCN: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -551,7 +551,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset:
 ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -563,7 +563,7 @@ entry:
 ; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -574,7 +574,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_umin_i32:
 ; GCN: buffer_atomic_umin v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -583,7 +583,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_umin_i32_ret:
 ; SI: buffer_atomic_umin [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -593,7 +593,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64:
 ; SI: buffer_atomic_umin v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_umin v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -604,7 +604,7 @@ entry:
 ; SI: buffer_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_umin [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -614,7 +614,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_or_i32_offset:
 ; GCN: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -624,7 +624,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_or_i32_ret_offset:
 ; GCN: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -635,7 +635,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset:
 ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -647,7 +647,7 @@ entry:
 ; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -658,7 +658,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_or_i32:
 ; GCN: buffer_atomic_or v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -667,7 +667,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_or_i32_ret:
 ; GCN: buffer_atomic_or [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -677,7 +677,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_or_i32_addr64:
 ; SI: buffer_atomic_or v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_or v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -688,7 +688,7 @@ entry:
 ; SI: buffer_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_or [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -698,7 +698,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_offset:
 ; GCN: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -708,7 +708,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret_offset:
 ; GCN: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -720,7 +720,7 @@ entry:
 ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 
 ; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}{{$}}
-define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -733,7 +733,7 @@ entry:
 
 ; VI: flat_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -744,7 +744,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_xchg_i32:
 ; GCN: buffer_atomic_swap v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -753,7 +753,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_ret:
 ; GCN: buffer_atomic_swap [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -763,7 +763,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64:
 ; SI: buffer_atomic_swap v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_swap v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -774,7 +774,7 @@ entry:
 ; SI: buffer_atomic_swap [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_swap [[RET:v[0-9]+]],  v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -784,7 +784,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_offset:
 ; GCN: buffer_atomic_cmpswap v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -794,7 +794,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret_offset:
 ; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword v[[RET]]
-define void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -807,7 +807,7 @@ entry:
 ; SI: buffer_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 
 ; VI: flat_atomic_cmpswap v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -819,7 +819,7 @@ entry:
 ; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dword v[[RET]]
-define void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -831,7 +831,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32:
 ; GCN: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
   ret void
@@ -840,7 +840,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_ret:
 ; GCN: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword v[[RET]]
-define void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i32 %old) {
 entry:
   %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
   %extract0 = extractvalue { i32, i1 } %val, 0
@@ -851,7 +851,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64:
 ; SI: buffer_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_cmpswap v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
@@ -862,7 +862,7 @@ entry:
 ; SI: buffer_atomic_cmpswap v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_cmpswap v[[RET:[0-9]+]], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dword v[[RET]]
-define void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
@@ -873,7 +873,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_xor_i32_offset:
 ; GCN: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16{{$}}
-define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -883,7 +883,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_xor_i32_ret_offset:
 ; GCN: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_ret_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -894,7 +894,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset:
 ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16{{$}}
 ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -906,7 +906,7 @@ entry:
 ; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_ret_addr64_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -917,7 +917,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_xor_i32:
 ; GCN: buffer_atomic_xor v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -926,7 +926,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_xor_i32_ret:
 ; GCN: buffer_atomic_xor [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_ret(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in) {
 entry:
   %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
   store i32 %val, i32 addrspace(1)* %out2
@@ -936,7 +936,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64:
 ; SI: buffer_atomic_xor v{{[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_xor v[{{[0-9]+:[0-9]+}}], v{{[0-9]+$}}
-define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -947,7 +947,7 @@ entry:
 ; SI: buffer_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_xor [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_ret_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %out2, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -959,7 +959,7 @@ entry:
 ; SI: buffer_load_dword [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_load_i32_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i32_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %in, i64 4
   %val = load atomic i32, i32 addrspace(1)* %gep  seq_cst, align 4
@@ -971,7 +971,7 @@ entry:
 ; SI: buffer_load_dword [[RET:v[0-9]+]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}] glc
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_load_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i32(i32 addrspace(1)* %in, i32 addrspace(1)* %out) {
 entry:
   %val = load atomic i32, i32 addrspace(1)* %in seq_cst, align 4
   store i32 %val, i32 addrspace(1)* %out
@@ -982,7 +982,7 @@ entry:
 ; SI: buffer_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_load_i32_addr64_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i32_addr64_offset(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -995,7 +995,7 @@ entry:
 ; SI: buffer_load_dword [[RET:v[0-9]+]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_load_dword [[RET:v[0-9]+]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dword [[RET]]
-define void @atomic_load_i32_addr64(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i32_addr64(i32 addrspace(1)* %in, i32 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %in, i64 %index
   %val = load atomic i32, i32 addrspace(1)* %ptr seq_cst, align 4
@@ -1006,7 +1006,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_store_i32_offset:
 ; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:16 glc{{$}}
 ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   store atomic i32 %in, i32 addrspace(1)* %gep  seq_cst, align 4
@@ -1016,7 +1016,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_store_i32:
 ; SI: buffer_store_dword {{v[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc{{$}}
 ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4
   ret void
@@ -1025,7 +1025,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_store_i32_addr64_offset:
 ; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:16 glc{{$}}
 ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -1036,7 +1036,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_store_i32_addr64:
 ; SI: buffer_store_dword {{v[0-9]+}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_store_dword v[{{[0-9]+}}:{{[0-9]+}}], {{v[0-9]+}} glc{{$}}
-define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   store atomic i32 %in, i32 addrspace(1)* %ptr seq_cst, align 4
diff --git a/test/CodeGen/AMDGPU/global_atomics_i64.ll b/test/CodeGen/AMDGPU/global_atomics_i64.ll
index f66c6c7b531a6dbe0c2a19477fa84071db427e35..56520b787ead7971a2a5f11ffc50b7e678749cf8 100644
--- a/test/CodeGen/AMDGPU/global_atomics_i64.ll
+++ b/test/CodeGen/AMDGPU/global_atomics_i64.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}atomic_add_i64_offset:
 ; GCN: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -13,7 +13,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i64_ret_offset:
 ; GCN: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_add_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile add i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -24,7 +24,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i64_addr64_offset:
 ; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
-define void @atomic_add_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -36,7 +36,7 @@ entry:
 ; CI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_add_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -47,7 +47,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_add_i64:
 ; GCN: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_add_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -56,7 +56,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i64_ret:
 ; GCN: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_add_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_add_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile add i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -66,7 +66,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_add_i64_addr64:
 ; CI: buffer_atomic_add_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_add_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_add_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile add i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -77,7 +77,7 @@ entry:
 ; CI: buffer_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_add_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_add_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile add i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -87,7 +87,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i64_offset:
 ; GCN: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_and_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -97,7 +97,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i64_ret_offset:
 ; GCN: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_and_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile and i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -108,7 +108,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i64_addr64_offset:
 ; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -120,7 +120,7 @@ entry:
 ; CI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_and_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -131,7 +131,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_and_i64:
 ; GCN: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_and_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -140,7 +140,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i64_ret:
 ; GCN: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_and_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_and_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile and i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -150,7 +150,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_and_i64_addr64:
 ; CI: buffer_atomic_and_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_and_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_and_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile and i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -161,7 +161,7 @@ entry:
 ; CI: buffer_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_and_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_and_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile and i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -171,7 +171,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i64_offset:
 ; GCN: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_sub_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -181,7 +181,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i64_ret_offset:
 ; GCN: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_sub_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -192,7 +192,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i64_addr64_offset:
 ; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -204,7 +204,7 @@ entry:
 ; CI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -215,7 +215,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_sub_i64:
 ; GCN: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_sub_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -224,7 +224,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i64_ret:
 ; GCN: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_sub_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_sub_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -234,7 +234,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_sub_i64_addr64:
 ; CI: buffer_atomic_sub_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_sub_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_sub_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -245,7 +245,7 @@ entry:
 ; CI: buffer_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_sub_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_sub_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile sub i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -255,7 +255,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i64_offset:
 ; GCN: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -265,7 +265,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i64_ret_offset:
 ; GCN: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile max i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -276,7 +276,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i64_addr64_offset:
 ; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -288,7 +288,7 @@ entry:
 ; CI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -299,7 +299,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_max_i64:
 ; GCN: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -308,7 +308,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i64_ret:
 ; GCN: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_max_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile max i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -318,7 +318,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_max_i64_addr64:
 ; CI: buffer_atomic_smax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_smax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_max_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -329,7 +329,7 @@ entry:
 ; CI: buffer_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_smax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile max i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -339,7 +339,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i64_offset:
 ; GCN: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -349,7 +349,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i64_ret_offset:
 ; GCN: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -360,7 +360,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i64_addr64_offset:
 ; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -372,7 +372,7 @@ entry:
 ; CI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -383,7 +383,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umax_i64:
 ; GCN: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -392,7 +392,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i64_ret:
 ; GCN: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umax_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -402,7 +402,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umax_i64_addr64:
 ; CI: buffer_atomic_umax_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_umax_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -413,7 +413,7 @@ entry:
 ; CI: buffer_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_umax_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile umax i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -423,7 +423,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i64_offset:
 ; GCN: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -433,7 +433,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i64_ret_offset:
 ; GCN: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile min i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -444,7 +444,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i64_addr64_offset:
 ; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -456,7 +456,7 @@ entry:
 ; CI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -467,7 +467,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_min_i64:
 ; GCN: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -476,7 +476,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i64_ret:
 ; GCN: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_min_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile min i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -486,7 +486,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_min_i64_addr64:
 ; CI: buffer_atomic_smin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_smin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -497,7 +497,7 @@ entry:
 ; CI: buffer_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_smin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile min i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -507,7 +507,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i64_offset:
 ; GCN: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -517,7 +517,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i64_ret_offset:
 ; GCN: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -528,7 +528,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i64_addr64_offset:
 ; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -540,7 +540,7 @@ entry:
 ; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -551,7 +551,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_umin_i64:
 ; GCN: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -560,7 +560,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i64_ret:
 ; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_umin_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -570,7 +570,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_umin_i64_addr64:
 ; CI: buffer_atomic_umin_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_umin_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -581,7 +581,7 @@ entry:
 ; CI: buffer_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_umin_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile umin i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -591,7 +591,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i64_offset:
 ; GCN: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_or_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -601,7 +601,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i64_ret_offset:
 ; GCN: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_or_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile or i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -612,7 +612,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i64_addr64_offset:
 ; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -624,7 +624,7 @@ entry:
 ; CI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_or_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -635,7 +635,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_or_i64:
 ; GCN: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_or_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -644,7 +644,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i64_ret:
 ; GCN: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_or_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_or_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile or i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -654,7 +654,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_or_i64_addr64:
 ; CI: buffer_atomic_or_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_or_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_or_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile or i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -665,7 +665,7 @@ entry:
 ; CI: buffer_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_or_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_or_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile or i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -675,7 +675,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i64_offset:
 ; GCN: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_xchg_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -685,7 +685,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret_offset:
 ; GCN: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -696,7 +696,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64_offset:
 ; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}}{{$}}
-define void @atomic_xchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -708,7 +708,7 @@ entry:
 ; CI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -719,7 +719,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xchg_i64:
 ; GCN: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_xchg_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -728,7 +728,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i64_ret:
 ; GCN: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -738,7 +738,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xchg_i64_addr64:
 ; CI: buffer_atomic_swap_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_swap_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -749,7 +749,7 @@ entry:
 ; CI: buffer_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_swap_x2 [[RET:v\[[0-9]+:[0-9]+\]]],  v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile xchg i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -759,7 +759,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i64_offset:
 ; GCN: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_xor_i64_offset(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_offset(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -769,7 +769,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i64_ret_offset:
 ; GCN: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xor_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %gep, i64 %in seq_cst
@@ -780,7 +780,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i64_addr64_offset:
 ; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 ; VI: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -792,7 +792,7 @@ entry:
 ; CI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -803,7 +803,7 @@ entry:
 
 ; GCN-LABEL: {{^}}atomic_xor_i64:
 ; GCN: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_xor_i64(i64 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64(i64 addrspace(1)* %out, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst
   ret void
@@ -812,7 +812,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i64_ret:
 ; GCN: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xor_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
+define amdgpu_kernel void @atomic_xor_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in) {
 entry:
   %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %out, i64 %in seq_cst
   store i64 %tmp0, i64 addrspace(1)* %out2
@@ -822,7 +822,7 @@ entry:
 ; GCN-LABEL: {{^}}atomic_xor_i64_addr64:
 ; CI: buffer_atomic_xor_x2 v{{\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_xor_x2 v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]$}}
-define void @atomic_xor_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -833,7 +833,7 @@ entry:
 ; CI: buffer_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_xor_x2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}], v{{\[[0-9]+:[0-9]+\]}} glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_xor_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %tmp0 = atomicrmw volatile xor i64 addrspace(1)* %ptr, i64 %in seq_cst
@@ -851,7 +851,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_offset:
 ; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32{{$}}
-define void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_offset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -861,7 +861,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_soffset:
 ; GCN: s_mov_b32 [[SREG:s[0-9]+]], 0x11940
 ; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+}}:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], [[SREG]]{{$}}
-define void @atomic_cmpxchg_i64_soffset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(i64 addrspace(1)* %out, i64 %in, i64 %old) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 9000
   %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -871,7 +871,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_ret_offset:
 ; GCN: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]{{:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   %val = cmpxchg volatile i64 addrspace(1)* %gep, i64 %old, i64 %in seq_cst seq_cst
@@ -884,7 +884,7 @@ entry:
 ; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32{{$}}
 
 ; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+\:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -896,7 +896,7 @@ entry:
 ; CI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -908,7 +908,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64:
 ; GCN: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0{{$}}
-define void @atomic_cmpxchg_i64(i64 addrspace(1)* %out, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64(i64 addrspace(1)* %out, i64 %in, i64 %old) {
 entry:
   %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst
   ret void
@@ -917,7 +917,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_ret:
 ; GCN: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %old) {
 entry:
   %val = cmpxchg volatile i64 addrspace(1)* %out, i64 %old, i64 %in seq_cst seq_cst
   %extract0 = extractvalue { i64, i1 } %val, 0
@@ -928,7 +928,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i64_addr64:
 ; CI: buffer_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64{{$}}
 ; VI: flat_atomic_cmpswap_x2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]{{$}}
-define void @atomic_cmpxchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(i64 addrspace(1)* %out, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst
@@ -939,7 +939,7 @@ entry:
 ; CI: buffer_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+}}], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_atomic_cmpswap_x2 v{{\[}}[[RET:[0-9]+]]:{{[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RET]]:
-define void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %out2, i64 %in, i64 %index, i64 %old) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %val = cmpxchg volatile i64 addrspace(1)* %ptr, i64 %old, i64 %in seq_cst seq_cst
@@ -952,7 +952,7 @@ entry:
 ; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_load_i64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %in, i64 4
   %val = load atomic i64, i64 addrspace(1)* %gep  seq_cst, align 8
@@ -964,7 +964,7 @@ entry:
 ; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_load_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_load_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %out) {
 entry:
   %val = load atomic i64, i64 addrspace(1)* %in seq_cst, align 8
   store i64 %val, i64 addrspace(1)* %out
@@ -975,7 +975,7 @@ entry:
 ; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_load_i64_addr64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i64_addr64_offset(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -988,7 +988,7 @@ entry:
 ; CI: buffer_load_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_load_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], v[{{[0-9]+:[0-9]+}}] glc{{$}}
 ; GCN: buffer_store_dwordx2 [[RET]]
-define void @atomic_load_i64_addr64(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_load_i64_addr64(i64 addrspace(1)* %in, i64 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %in, i64 %index
   %val = load atomic i64, i64 addrspace(1)* %ptr seq_cst, align 8
@@ -999,7 +999,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_store_i64_offset:
 ; CI: buffer_store_dwordx2 [[RET:v\[[0-9]+:[0-9]+\]]], off, s[{{[0-9]+}}:{{[0-9]+}}], 0 offset:32 glc{{$}}
 ; VI: flat_store_dwordx2 [[RET:v\[[0-9]+:[0-9]\]]], v[{{[0-9]+}}:{{[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, i64 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i64, i64 addrspace(1)* %out, i64 4
   store atomic i64 %in, i64 addrspace(1)* %gep  seq_cst, align 8
@@ -1009,7 +1009,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_store_i64:
 ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, off, s[{{[0-9]+}}:{{[0-9]+}}], 0 glc
 ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}] glc
-define void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i64(i64 %in, i64 addrspace(1)* %out) {
 entry:
   store atomic i64 %in, i64 addrspace(1)* %out seq_cst, align 8
   ret void
@@ -1018,7 +1018,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_store_i64_addr64_offset:
 ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 offset:32 glc{{$}}
 ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, i64 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i64 4
@@ -1029,7 +1029,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_store_i64_addr64:
 ; CI: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]\]}}, v[{{[0-9]+}}:{{[0-9]+}}], s[{{[0-9]+}}:{{[0-9]+}}], 0 addr64 glc{{$}}
 ; VI: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v[{{[0-9]+:[0-9]+}}] glc{{$}}
-define void @atomic_store_i64_addr64(i64 %in, i64 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, i64 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i64, i64 addrspace(1)* %out, i64 %index
   store atomic i64 %in, i64 addrspace(1)* %ptr seq_cst, align 8
diff --git a/test/CodeGen/AMDGPU/gv-const-addrspace.ll b/test/CodeGen/AMDGPU/gv-const-addrspace.ll
index d07843e9dd276ddb596b7ab3e754fad86d7b6a60..0903542bac4f09b94adb386118d7915097ecd460 100644
--- a/test/CodeGen/AMDGPU/gv-const-addrspace.ll
+++ b/test/CodeGen/AMDGPU/gv-const-addrspace.ll
@@ -15,7 +15,7 @@
 ; EG: @float_gv
 ; EG-NOT: MOVA_INT
 ; EG-NOT: MOV
-define void @float(float addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) {
 entry:
   %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
   %1 = load float, float addrspace(2)* %0
@@ -33,7 +33,7 @@ entry:
 ; EG: @i32_gv
 ; EG-NOT: MOVA_INT
 ; EG-NOT: MOV
-define void @i32(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @i32(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %0 = getelementptr inbounds [5 x i32], [5 x i32] addrspace(2)* @i32_gv, i32 0, i32 %index
   %1 = load i32, i32 addrspace(2)* %0
@@ -53,7 +53,7 @@ entry:
 ; EG: @struct_foo_gv
 ; EG-NOT: MOVA_INT
 ; EG-NOT: MOV
-define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
   %gep = getelementptr inbounds [1 x %struct.foo], [1 x %struct.foo] addrspace(2)* @struct_foo_gv, i32 0, i32 0, i32 1, i32 %index
   %load = load i32, i32 addrspace(2)* %gep, align 4
   store i32 %load, i32 addrspace(1)* %out, align 4
@@ -72,7 +72,7 @@ define void @struct_foo_gv_load(i32 addrspace(1)* %out, i32 %index) {
 ; EG: @array_v1_gv
 ; EG-NOT: MOVA_INT
 ; EG-NOT: MOV
-define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
   %gep = getelementptr inbounds [4 x <1 x i32>], [4 x <1 x i32>] addrspace(2)* @array_v1_gv, i32 0, i32 %index
   %load = load <1 x i32>, <1 x i32> addrspace(2)* %gep, align 4
   store <1 x i32> %load, <1 x i32> addrspace(1)* %out, align 4
@@ -84,7 +84,7 @@ define void @array_v1_gv_load(<1 x i32> addrspace(1)* %out, i32 %index) {
 ; EG: VTX_READ_32
 ; EG: @float_gv
 ; EG-NOT: MOVA_INT
-define void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) {
+define amdgpu_kernel void @gv_addressing_in_branch(float addrspace(1)* %out, i32 %index, i32 %a) {
 entry:
   %0 = icmp eq i32 0, %a
   br i1 %0, label %if, label %else
diff --git a/test/CodeGen/AMDGPU/gv-offset-folding.ll b/test/CodeGen/AMDGPU/gv-offset-folding.ll
index 2b5af75936f669a50438bd8f2ebc281fd94d51c5..e641d7266a793ab3c688dd42f63df9c01c3a9f08 100644
--- a/test/CodeGen/AMDGPU/gv-offset-folding.ll
+++ b/test/CodeGen/AMDGPU/gv-offset-folding.ll
@@ -13,7 +13,7 @@
 
 ; CHECK-LABEL: lds_no_offset:
 ; CHECK: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:4
-define void @lds_no_offset() {
+define amdgpu_kernel void @lds_no_offset() {
 entry:
   %ptr = getelementptr [4 x i32], [4 x i32] addrspace(3)* @lds, i32 0, i32 1
   store i32 0, i32 addrspace(3)* %ptr
diff --git a/test/CodeGen/AMDGPU/half.ll b/test/CodeGen/AMDGPU/half.ll
index 4b4895028c915bdf428092dfb7cf5a38a5eafd5d..41ae5a4a0b00b79460ab88d62dece4842ded6cc4 100644
--- a/test/CodeGen/AMDGPU/half.ll
+++ b/test/CodeGen/AMDGPU/half.ll
@@ -8,7 +8,7 @@
 ; SI: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[ARG]]
 ; VI: v_trunc_f16_e32 [[CVT:v[0-9]+]], [[ARG]]
 ; GCN: buffer_store_short [[CVT]]
-define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
+define amdgpu_kernel void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
   store half %arg, half addrspace(1)* %out
   ret void
 }
@@ -20,7 +20,7 @@ define void @load_f16_arg(half addrspace(1)* %out, half %arg) #0 {
 ; GCN: v_or_b32_e32 [[PACKED:v[0-9]+]], [[HI]], [[V0]]
 ; GCN: buffer_store_dword [[PACKED]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN: s_endpgm
-define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
+define amdgpu_kernel void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
   store <2 x half> %arg, <2 x half> addrspace(1)* %out
   ret void
 }
@@ -34,7 +34,7 @@ define void @load_v2f16_arg(<2 x half> addrspace(1)* %out, <2 x half> %arg) #0 {
 ; GCN-DAG: buffer_store_short
 ; GCN-NOT: buffer_store
 ; GCN: s_endpgm
-define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
+define amdgpu_kernel void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
   store <3 x half> %arg, <3 x half> addrspace(1)* %out
   ret void
 }
@@ -46,33 +46,33 @@ define void @load_v3f16_arg(<3 x half> addrspace(1)* %out, <3 x half> %arg) #0 {
 ; GCN: buffer_load_ushort
 ; GCN: buffer_store_dwordx2
 ; GCN: s_endpgm
-define void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
+define amdgpu_kernel void @load_v4f16_arg(<4 x half> addrspace(1)* %out, <4 x half> %arg) #0 {
   store <4 x half> %arg, <4 x half> addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}load_v8f16_arg:
-define void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
+define amdgpu_kernel void @load_v8f16_arg(<8 x half> addrspace(1)* %out, <8 x half> %arg) #0 {
   store <8 x half> %arg, <8 x half> addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}extload_v2f16_arg:
-define void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
+define amdgpu_kernel void @extload_v2f16_arg(<2 x float> addrspace(1)* %out, <2 x half> %in) #0 {
   %fpext = fpext <2 x half> %in to <2 x float>
   store <2 x float> %fpext, <2 x float> addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}extload_f16_to_f32_arg:
-define void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
+define amdgpu_kernel void @extload_f16_to_f32_arg(float addrspace(1)* %out, half %arg) #0 {
   %ext = fpext half %arg to float
   store float %ext, float addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}extload_v2f16_to_v2f32_arg:
-define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x half> %arg) #0 {
   %ext = fpext <2 x half> %arg to <2 x float>
   store <2 x float> %ext, <2 x float> addrspace(1)* %out
   ret void
@@ -90,14 +90,14 @@ define void @extload_v2f16_to_v2f32_arg(<2 x float> addrspace(1)* %out, <2 x hal
 ; GCN-DAG: buffer_store_dword
 ; GCN-DAG: buffer_store_dwordx2
 ; GCN: s_endpgm
-define void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v3f16_to_v3f32_arg(<3 x float> addrspace(1)* %out, <3 x half> %arg) #0 {
   %ext = fpext <3 x half> %arg to <3 x float>
   store <3 x float> %ext, <3 x float> addrspace(1)* %out
   ret void
 }
 
 ; GCN-LABEL: {{^}}extload_v4f16_to_v4f32_arg:
-define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x half> %arg) #0 {
   %ext = fpext <4 x half> %arg to <4 x float>
   store <4 x float> %ext, <4 x float> addrspace(1)* %out
   ret void
@@ -124,7 +124,7 @@ define void @extload_v4f16_to_v4f32_arg(<4 x float> addrspace(1)* %out, <4 x hal
 
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x half> %arg) #0 {
   %ext = fpext <8 x half> %arg to <8 x float>
   store <8 x float> %ext, <8 x float> addrspace(1)* %out
   ret void
@@ -138,7 +138,7 @@ define void @extload_v8f16_to_v8f32_arg(<8 x float> addrspace(1)* %out, <8 x hal
 ; VI: v_cvt_f32_f16_e32 v[[VARG_F32:[0-9]+]], v[[VARG]]
 ; VI: v_cvt_f64_f32_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], v[[VARG_F32]]
 ; GCN: buffer_store_dwordx2 [[RESULT]]
-define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
+define amdgpu_kernel void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
   %ext = fpext half %arg to double
   store double %ext, double addrspace(1)* %out
   ret void
@@ -152,7 +152,7 @@ define void @extload_f16_to_f64_arg(double addrspace(1)* %out, half %arg) #0 {
 ; GCN-DAG: v_cvt_f64_f32_e32
 ; GCN-DAG: v_cvt_f64_f32_e32
 ; GCN: s_endpgm
-define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x half> %arg) #0 {
   %ext = fpext <2 x half> %arg to <2 x double>
   store <2 x double> %ext, <2 x double> addrspace(1)* %out
   ret void
@@ -169,7 +169,7 @@ define void @extload_v2f16_to_v2f64_arg(<2 x double> addrspace(1)* %out, <2 x ha
 ; GCN-DAG: v_cvt_f64_f32_e32
 ; GCN-DAG: v_cvt_f64_f32_e32
 ; GCN: s_endpgm
-define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x half> %arg) #0 {
   %ext = fpext <3 x half> %arg to <3 x double>
   store <3 x double> %ext, <3 x double> addrspace(1)* %out
   ret void
@@ -189,7 +189,7 @@ define void @extload_v3f16_to_v3f64_arg(<3 x double> addrspace(1)* %out, <3 x ha
 ; GCN-DAG: v_cvt_f64_f32_e32
 ; GCN-DAG: v_cvt_f64_f32_e32
 ; GCN: s_endpgm
-define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x half> %arg) #0 {
   %ext = fpext <4 x half> %arg to <4 x double>
   store <4 x double> %ext, <4 x double> addrspace(1)* %out
   ret void
@@ -227,7 +227,7 @@ define void @extload_v4f16_to_v4f64_arg(<4 x double> addrspace(1)* %out, <4 x ha
 ; GCN-DAG: v_cvt_f64_f32_e32
 
 ; GCN: s_endpgm
-define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
+define amdgpu_kernel void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x half> %arg) #0 {
   %ext = fpext <8 x half> %arg to <8 x double>
   store <8 x double> %ext, <8 x double> addrspace(1)* %out
   ret void
@@ -236,7 +236,7 @@ define void @extload_v8f16_to_v8f64_arg(<8 x double> addrspace(1)* %out, <8 x ha
 ; GCN-LABEL: {{^}}global_load_store_f16:
 ; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
 ; GCN: buffer_store_short [[TMP]]
-define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %val = load half, half addrspace(1)* %in
   store half %val, half addrspace(1)* %out
   ret void
@@ -245,7 +245,7 @@ define void @global_load_store_f16(half addrspace(1)* %out, half addrspace(1)* %
 ; GCN-LABEL: {{^}}global_load_store_v2f16:
 ; GCN: buffer_load_dword [[TMP:v[0-9]+]]
 ; GCN: buffer_store_dword [[TMP]]
-define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %in
   store <2 x half> %val, <2 x half> addrspace(1)* %out
   ret void
@@ -254,7 +254,7 @@ define void @global_load_store_v2f16(<2 x half> addrspace(1)* %out, <2 x half> a
 ; GCN-LABEL: {{^}}global_load_store_v4f16:
 ; GCN: buffer_load_dwordx2 [[TMP:v\[[0-9]+:[0-9]+\]]]
 ; GCN: buffer_store_dwordx2 [[TMP]]
-define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> addrspace(1)* %out) #0 {
   %val = load <4 x half>, <4 x half> addrspace(1)* %in
   store <4 x half> %val, <4 x half> addrspace(1)* %out
   ret void
@@ -264,7 +264,7 @@ define void @global_load_store_v4f16(<4 x half> addrspace(1)* %in, <4 x half> ad
 ; GCN: buffer_load_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]]
 ; GCN: buffer_store_dwordx4 [[TMP:v\[[0-9]+:[0-9]+\]]]
 ; GCN: s_endpgm
-define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
   %val = load <8 x half>, <8 x half> addrspace(1)* %in
   store <8 x half> %val, <8 x half> addrspace(1)* %out
   ret void
@@ -274,7 +274,7 @@ define void @global_load_store_v8f16(<8 x half> addrspace(1)* %out, <8 x half> a
 ; GCN: buffer_load_ushort [[LOAD:v[0-9]+]]
 ; GCN: v_cvt_f32_f16_e32 [[CVT:v[0-9]+]], [[LOAD]]
 ; GCN: buffer_store_dword [[CVT]]
-define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %val = load half, half addrspace(1)* %in
   %cvt = fpext half %val to float
   store float %cvt, float addrspace(1)* %out
@@ -283,13 +283,13 @@ define void @global_extload_f16_to_f32(float addrspace(1)* %out, half addrspace(
 
 ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f32:
 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; VI:  v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
 ; GCN: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
-; SI:  v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
-; GCN: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
+; SI: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
+; SI: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
+; VI: v_cvt_f32_f16_sdwa v[[CVT1:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 ; GCN: buffer_store_dwordx2 v{{\[}}[[CVT0]]:[[CVT1]]{{\]}}
 ; GCN: s_endpgm
-define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %in
   %cvt = fpext <2 x half> %val to <2 x float>
   store <2 x float> %cvt, <2 x float> addrspace(1)* %out
@@ -297,7 +297,7 @@ define void @global_extload_v2f16_to_v2f32(<2 x float> addrspace(1)* %out, <2 x
 }
 
 ; GCN-LABEL: {{^}}global_extload_v3f16_to_v3f32:
-define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
   %val = load <3 x half>, <3 x half> addrspace(1)* %in
   %cvt = fpext <3 x half> %val to <3 x float>
   store <3 x float> %cvt, <3 x float> addrspace(1)* %out
@@ -305,7 +305,7 @@ define void @global_extload_v3f16_to_v3f32(<3 x float> addrspace(1)* %out, <3 x
 }
 
 ; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f32:
-define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
   %val = load <4 x half>, <4 x half> addrspace(1)* %in
   %cvt = fpext <4 x half> %val to <4 x float>
   store <4 x float> %cvt, <4 x float> addrspace(1)* %out
@@ -313,7 +313,7 @@ define void @global_extload_v4f16_to_v4f32(<4 x float> addrspace(1)* %out, <4 x
 }
 
 ; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f32:
-define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
   %val = load <8 x half>, <8 x half> addrspace(1)* %in
   %cvt = fpext <8 x half> %val to <8 x float>
   store <8 x float> %cvt, <8 x float> addrspace(1)* %out
@@ -324,22 +324,26 @@ define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x
 ; GCN: buffer_load_dwordx4
 ; GCN: buffer_load_dwordx4
 
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
-; GCN: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+
+; VI: v_cvt_f32_f16_e32
+; VI: v_cvt_f32_f16_sdwa
+; ...
 
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
@@ -347,7 +351,7 @@ define void @global_extload_v8f16_to_v8f32(<8 x float> addrspace(1)* %out, <8 x
 ; GCN: buffer_store_dwordx4
 
 ; GCN: s_endpgm
-define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
   %val = load <16 x half>, <16 x half> addrspace(1)* %in
   %cvt = fpext <16 x half> %val to <16 x float>
   store <16 x float> %cvt, <16 x float> addrspace(1)* %out
@@ -359,7 +363,7 @@ define void @global_extload_v16f16_to_v16f32(<16 x float> addrspace(1)* %out, <1
 ; GCN: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], [[LOAD]]
 ; GCN: v_cvt_f64_f32_e32 [[CVT1:v\[[0-9]+:[0-9]+\]]], [[CVT0]]
 ; GCN: buffer_store_dwordx2 [[CVT1]]
-define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace(1)* %in) #0 {
   %val = load half, half addrspace(1)* %in
   %cvt = fpext half %val to double
   store double %cvt, double addrspace(1)* %out
@@ -368,14 +372,21 @@ define void @global_extload_f16_to_f64(double addrspace(1)* %out, half addrspace
 
 ; GCN-LABEL: {{^}}global_extload_v2f16_to_v2f64:
 ; GCN-DAG: buffer_load_dword [[LOAD:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-; GCN-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
-; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
-; GCN-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
-; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]]
-; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]]
+
+; SI-DAG: v_lshrrev_b32_e32 [[HI:v[0-9]+]], 16, [[LOAD]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[CVT0:[0-9]+]], [[LOAD]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[HI]]
+; SI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT0]]
+; SI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT1]]
+
+; VI-DAG: v_cvt_f32_f16_sdwa v[[CVT0:[0-9]+]], [[LOAD]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-DAG: v_cvt_f32_f16_e32 v[[CVT1:[0-9]+]], [[LOAD]]
+; VI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT3_LO:[0-9]+]]:[[CVT3_HI:[0-9]+]]{{\]}}, v[[CVT0]]
+; VI-DAG: v_cvt_f64_f32_e32 v{{\[}}[[CVT2_LO:[0-9]+]]:[[CVT2_HI:[0-9]+]]{{\]}}, v[[CVT1]]
+
 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[CVT2_LO]]:[[CVT3_HI]]{{\]}}
 ; GCN: s_endpgm
-define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %val = load <2 x half>, <2 x half> addrspace(1)* %in
   %cvt = fpext <2 x half> %val to <2 x double>
   store <2 x double> %cvt, <2 x double> addrspace(1)* %out
@@ -392,18 +403,17 @@ define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x
 ; XSI-NOT: v_cvt_f32_f16
 
 ; XVI: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
-; XVI-DAG: v_lshrrev_b32_e32 {{v[0-9]+}}, 16, {{v[0-9]+}}
-; XVI: v_cvt_f32_f16_e32
 ; XVI: v_cvt_f32_f16_e32
 ; XVI: v_cvt_f32_f16_e32
+; XVI: v_cvt_f32_f16_sdwa
 ; XVI-NOT: v_cvt_f32_f16
 
 ; GCN: buffer_load_dwordx2 v{{\[}}[[IN_LO:[0-9]+]]:[[IN_HI:[0-9]+]]
-; VI-DAG:  v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
 ; GCN-DAG: v_cvt_f32_f16_e32 [[Z32:v[0-9]+]], v[[IN_HI]]
 ; GCN-DAG: v_cvt_f32_f16_e32 [[X32:v[0-9]+]], v[[IN_LO]]
 ; SI:      v_lshrrev_b32_e32 [[Y16:v[0-9]+]], 16, v[[IN_LO]]
-; GCN-DAG: v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
+; SI-DAG:  v_cvt_f32_f16_e32 [[Y32:v[0-9]+]], [[Y16]]
+; VI-DAG:  v_cvt_f32_f16_sdwa [[Y32:v[0-9]+]], v[[IN_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
 
 ; GCN-DAG: v_cvt_f64_f32_e32 [[Z:v\[[0-9]+:[0-9]+\]]], [[Z32]]
 ; GCN-DAG: v_cvt_f64_f32_e32 v{{\[}}[[XLO:[0-9]+]]:{{[0-9]+}}], [[X32]]
@@ -413,7 +423,7 @@ define void @global_extload_v2f16_to_v2f64(<2 x double> addrspace(1)* %out, <2 x
 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[XLO]]:[[YHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: buffer_store_dwordx2 [[Z]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
 ; GCN: s_endpgm
-define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x half> addrspace(1)* %in) #0 {
   %val = load <3 x half>, <3 x half> addrspace(1)* %in
   %cvt = fpext <3 x half> %val to <3 x double>
   store <3 x double> %cvt, <3 x double> addrspace(1)* %out
@@ -421,7 +431,7 @@ define void @global_extload_v3f16_to_v3f64(<3 x double> addrspace(1)* %out, <3 x
 }
 
 ; GCN-LABEL: {{^}}global_extload_v4f16_to_v4f64:
-define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
   %val = load <4 x half>, <4 x half> addrspace(1)* %in
   %cvt = fpext <4 x half> %val to <4 x double>
   store <4 x double> %cvt, <4 x double> addrspace(1)* %out
@@ -429,7 +439,7 @@ define void @global_extload_v4f16_to_v4f64(<4 x double> addrspace(1)* %out, <4 x
 }
 
 ; GCN-LABEL: {{^}}global_extload_v8f16_to_v8f64:
-define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x half> addrspace(1)* %in) #0 {
   %val = load <8 x half>, <8 x half> addrspace(1)* %in
   %cvt = fpext <8 x half> %val to <8 x double>
   store <8 x double> %cvt, <8 x double> addrspace(1)* %out
@@ -437,7 +447,7 @@ define void @global_extload_v8f16_to_v8f64(<8 x double> addrspace(1)* %out, <8 x
 }
 
 ; GCN-LABEL: {{^}}global_extload_v16f16_to_v16f64:
-define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <16 x half> addrspace(1)* %in) #0 {
   %val = load <16 x half>, <16 x half> addrspace(1)* %in
   %cvt = fpext <16 x half> %val to <16 x double>
   store <16 x double> %cvt, <16 x double> addrspace(1)* %out
@@ -448,7 +458,7 @@ define void @global_extload_v16f16_to_v16f64(<16 x double> addrspace(1)* %out, <
 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
 ; GCN: v_cvt_f16_f32_e32 [[CVT:v[0-9]+]], [[LOAD]]
 ; GCN: buffer_store_short [[CVT]]
-define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %val = load float, float addrspace(1)* %in
   %cvt = fptrunc float %val to half
   store half %cvt, half addrspace(1)* %out
@@ -458,12 +468,17 @@ define void @global_truncstore_f32_to_f16(half addrspace(1)* %out, float addrspa
 ; GCN-LABEL: {{^}}global_truncstore_v2f32_to_v2f16:
 ; GCN: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
 ; GCN-DAG: v_cvt_f16_f32_e32 [[CVT0:v[0-9]+]], v[[LO]]
-; GCN-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
-; GCN-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
-; GCN-DAG: v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]]
+
+; SI-DAG: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], v[[HI]]
+; SI-DAG: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[CVT1]]
+; SI:     v_or_b32_e32 [[PACKED:v[0-9]+]], [[SHL]], [[CVT0]]
+
+; VI-DAG: v_cvt_f16_f32_sdwa [[CVT1:v[0-9]+]], v[[HI]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI:     v_or_b32_e32 [[PACKED:v[0-9]+]], [[CVT1]], [[CVT0]]
+
 ; GCN-DAG: buffer_store_dword [[PACKED]]
 ; GCN: s_endpgm
-define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
   %val = load <2 x float>, <2 x float> addrspace(1)* %in
   %cvt = fptrunc <2 x float> %val to <2 x half>
   store <2 x half> %cvt, <2 x half> addrspace(1)* %out
@@ -472,14 +487,14 @@ define void @global_truncstore_v2f32_to_v2f16(<2 x half> addrspace(1)* %out, <2
 
 ; GCN-LABEL: {{^}}global_truncstore_v3f32_to_v3f16:
 ; GCN: buffer_load_dwordx4
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN-NOT: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; SI-DAG:  v_cvt_f16_f32_e32
+; VI-DAG:  v_cvt_f16_f32_sdwa
+; GCN-DAG: v_cvt_f16_f32_e32
 ; GCN: buffer_store_short
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
-define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
   %val = load <3 x float>, <3 x float> addrspace(1)* %in
   %cvt = fptrunc <3 x float> %val to <3 x half>
   store <3 x half> %cvt, <3 x half> addrspace(1)* %out
@@ -488,13 +503,15 @@ define void @global_truncstore_v3f32_to_v3f16(<3 x half> addrspace(1)* %out, <3
 
 ; GCN-LABEL: {{^}}global_truncstore_v4f32_to_v4f16:
 ; GCN: buffer_load_dwordx4
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
+; GCN-DAG: v_cvt_f16_f32_e32
+; SI-DAG:  v_cvt_f16_f32_e32
+; SI-DAG:  v_cvt_f16_f32_e32
+; VI-DAG:  v_cvt_f16_f32_sdwa
+; VI-DAG:  v_cvt_f16_f32_sdwa
+; GCN-DAG: v_cvt_f16_f32_e32
 ; GCN: buffer_store_dwordx2
 ; GCN: s_endpgm
-define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
   %val = load <4 x float>, <4 x float> addrspace(1)* %in
   %cvt = fptrunc <4 x float> %val to <4 x half>
   store <4 x half> %cvt, <4 x half> addrspace(1)* %out
@@ -504,17 +521,25 @@ define void @global_truncstore_v4f32_to_v4f16(<4 x half> addrspace(1)* %out, <4
 ; GCN-LABEL: {{^}}global_truncstore_v8f32_to_v8f16:
 ; GCN: buffer_load_dwordx4
 ; GCN: buffer_load_dwordx4
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
-; GCN: v_cvt_f16_f32_e32
+; SI:  v_cvt_f16_f32_e32
+; SI:  v_cvt_f16_f32_e32
+; SI:  v_cvt_f16_f32_e32
+; SI:  v_cvt_f16_f32_e32
+; SI:  v_cvt_f16_f32_e32
+; SI:  v_cvt_f16_f32_e32
+; SI:  v_cvt_f16_f32_e32
+; SI:  v_cvt_f16_f32_e32
+; VI-DAG:  v_cvt_f16_f32_e32
+; VI-DAG:  v_cvt_f16_f32_e32
+; VI-DAG:  v_cvt_f16_f32_e32
+; VI-DAG:  v_cvt_f16_f32_e32
+; VI-DAG:  v_cvt_f16_f32_sdwa
+; VI-DAG:  v_cvt_f16_f32_sdwa
+; VI-DAG:  v_cvt_f16_f32_sdwa
+; VI-DAG:  v_cvt_f16_f32_sdwa
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
   %val = load <8 x float>, <8 x float> addrspace(1)* %in
   %cvt = fptrunc <8 x float> %val to <8 x half>
   store <8 x half> %cvt, <8 x half> addrspace(1)* %out
@@ -545,7 +570,7 @@ define void @global_truncstore_v8f32_to_v8f16(<8 x half> addrspace(1)* %out, <8
 ; GCN-DAG: buffer_store_dwordx4
 ; GCN-DAG: buffer_store_dwordx4
 ; GCN: s_endpgm
-define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
   %val = load <16 x float>, <16 x float> addrspace(1)* %in
   %cvt = fptrunc <16 x float> %val to <16 x half>
   store <16 x half> %cvt, <16 x half> addrspace(1)* %out
@@ -560,7 +585,7 @@ define void @global_truncstore_v16f32_to_v16f16(<16 x half> addrspace(1)* %out,
 ; SI-DAG: v_cvt_f32_f16_e32 v{{[0-9]+}},
 ; SI: v_add_f32
 ; GCN: s_endpgm
-define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
+define amdgpu_kernel void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
    %add = fadd half %a, %b
    store half %add, half addrspace(1)* %out, align 4
    ret void
@@ -570,7 +595,7 @@ define void @fadd_f16(half addrspace(1)* %out, half %a, half %b) #0 {
 ; SI: v_add_f32
 ; SI: v_add_f32
 ; GCN: s_endpgm
-define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
+define amdgpu_kernel void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half> %b) #0 {
   %add = fadd <2 x half> %a, %b
   store <2 x half> %add, <2 x half> addrspace(1)* %out, align 8
   ret void
@@ -582,7 +607,7 @@ define void @fadd_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %a, <2 x half>
 ; SI: v_add_f32
 ; SI: v_add_f32
 ; GCN: s_endpgm
-define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x half>, <4 x half> addrspace(1)* %in, i32 1
   %a = load <4 x half>, <4 x half> addrspace(1)* %in, align 16
   %b = load <4 x half>, <4 x half> addrspace(1)* %b_ptr, align 16
@@ -601,7 +626,7 @@ define void @fadd_v4f16(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)*
 ; SI: v_add_f32
 ; SI: v_add_f32
 ; GCN: s_endpgm
-define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
+define amdgpu_kernel void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half> %b) #0 {
   %add = fadd <8 x half> %a, %b
   store <8 x half> %add, <8 x half> addrspace(1)* %out, align 32
   ret void
@@ -610,7 +635,7 @@ define void @fadd_v8f16(<8 x half> addrspace(1)* %out, <8 x half> %a, <8 x half>
 ; GCN-LABEL: {{^}}test_bitcast_from_half:
 ; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
 ; GCN: buffer_store_short [[TMP]]
-define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %out) #0 {
   %val = load half, half addrspace(1)* %in
   %val_int = bitcast half %val to i16
   store i16 %val_int, i16 addrspace(1)* %out
@@ -620,7 +645,7 @@ define void @test_bitcast_from_half(half addrspace(1)* %in, i16 addrspace(1)* %o
 ; GCN-LABEL: {{^}}test_bitcast_to_half:
 ; GCN: buffer_load_ushort [[TMP:v[0-9]+]]
 ; GCN: buffer_store_short [[TMP]]
-define void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_bitcast_to_half(half addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
   %val = load i16, i16 addrspace(1)* %in
   %val_fp = bitcast i16 %val to half
   store half %val_fp, half addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/hsa-default-device.ll b/test/CodeGen/AMDGPU/hsa-default-device.ll
index 631d6def444263899dd61e0e7f2993bec8a76a32..45efe9b86557a0d72ef5baa258e0b547c7db494c 100644
--- a/test/CodeGen/AMDGPU/hsa-default-device.ll
+++ b/test/CodeGen/AMDGPU/hsa-default-device.ll
@@ -4,7 +4,7 @@
 ; unsupported device.
 
 ; CHECK: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
-define void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind {
+define amdgpu_kernel void @test_kernel(float addrspace(1)* %out0, double addrspace(1)* %out1) nounwind {
   store float 0.0, float addrspace(1)* %out0
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/hsa-fp-mode.ll b/test/CodeGen/AMDGPU/hsa-fp-mode.ll
index 977667cbe6c3148046bee7d2aac711420e66856a..b1901cf894b08ef4f02b02ea800b83c50cd9bae1 100644
--- a/test/CodeGen/AMDGPU/hsa-fp-mode.ll
+++ b/test/CodeGen/AMDGPU/hsa-fp-mode.ll
@@ -4,7 +4,7 @@
 ; GCN: float_mode = 192
 ; GCN: enable_dx10_clamp = 1
 ; GCN: enable_ieee_mode = 1
-define void @test_default_ci(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
+define amdgpu_kernel void @test_default_ci(float addrspace(1)* %out0, double addrspace(1)* %out1) #0 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -14,7 +14,7 @@ define void @test_default_ci(float addrspace(1)* %out0, double addrspace(1)* %ou
 ; GCN: float_mode = 192
 ; GCN: enable_dx10_clamp = 1
 ; GCN: enable_ieee_mode = 1
-define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
+define amdgpu_kernel void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #1 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -24,7 +24,7 @@ define void @test_default_vi(float addrspace(1)* %out0, double addrspace(1)* %ou
 ; GCN: float_mode = 192
 ; GCN: enable_dx10_clamp = 1
 ; GCN: enable_ieee_mode = 1
-define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
+define amdgpu_kernel void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #2 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -34,7 +34,7 @@ define void @test_f64_denormals(float addrspace(1)* %out0, double addrspace(1)*
 ; GCN: float_mode = 48
 ; GCN: enable_dx10_clamp = 1
 ; GCN: enable_ieee_mode = 1
-define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
+define amdgpu_kernel void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #3 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -44,7 +44,7 @@ define void @test_f32_denormals(float addrspace(1)* %out0, double addrspace(1)*
 ; GCN: float_mode = 240
 ; GCN: enable_dx10_clamp = 1
 ; GCN: enable_ieee_mode = 1
-define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
+define amdgpu_kernel void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #4 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -54,7 +54,17 @@ define void @test_f32_f64_denormals(float addrspace(1)* %out0, double addrspace(
 ; GCN: float_mode = 0
 ; GCN: enable_dx10_clamp = 1
 ; GCN: enable_ieee_mode = 1
-define void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
+define amdgpu_kernel void @test_no_denormals(float addrspace(1)* %out0, double addrspace(1)* %out1) #5 {
+  store float 0.0, float addrspace(1)* %out0
+  store double 0.0, double addrspace(1)* %out1
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_no_dx10_clamp_vi:
+; GCN: float_mode = 192
+; GCN: enable_dx10_clamp = 0
+; GCN: enable_ieee_mode = 1
+define amdgpu_kernel void @test_no_dx10_clamp_vi(float addrspace(1)* %out0, double addrspace(1)* %out1) #6 {
   store float 0.0, float addrspace(1)* %out0
   store double 0.0, double addrspace(1)* %out1
   ret void
@@ -66,3 +76,4 @@ attributes #2 = { nounwind "target-features"="-fp32-denormals,+fp64-fp16-denorma
 attributes #3 = { nounwind "target-features"="+fp32-denormals,-fp64-fp16-denormals" }
 attributes #4 = { nounwind "target-features"="+fp32-denormals,+fp64-fp16-denormals" }
 attributes #5 = { nounwind "target-features"="-fp32-denormals,-fp64-fp16-denormals" }
+attributes #6 = { nounwind "target-cpu"="fiji" "target-features"="-dx10-clamp" }
diff --git a/test/CodeGen/AMDGPU/hsa-func.ll b/test/CodeGen/AMDGPU/hsa-func.ll
index d9662b69b1260db5e4ad23965941af4f931d0b02..b4cdd4030d86a79d8d91747f6b13bbd0ca28ff08 100644
--- a/test/CodeGen/AMDGPU/hsa-func.ll
+++ b/test/CodeGen/AMDGPU/hsa-func.ll
@@ -26,7 +26,7 @@
 
 ; ELF: Symbol {
 ; ELF: Name: simple
-; ELF: Size: 288
+; ELF: Size: 292
 ; ELF: Type: Function (0x2)
 ; ELF: }
 
diff --git a/test/CodeGen/AMDGPU/hsa-globals.ll b/test/CodeGen/AMDGPU/hsa-globals.ll
index 2820b308edb8bcde49e052e8de74f4c587b0031a..2ec57a40f0a25026b964b66f8fd98b776c098b2c 100644
--- a/test/CodeGen/AMDGPU/hsa-globals.ll
+++ b/test/CodeGen/AMDGPU/hsa-globals.ll
@@ -9,7 +9,7 @@
 @internal_readonly = internal unnamed_addr addrspace(2) constant i32 0
 @external_readonly = unnamed_addr addrspace(2) constant i32 0
 
-define void @test() {
+define amdgpu_kernel void @test() {
   ret void
 }
 
diff --git a/test/CodeGen/AMDGPU/hsa-group-segment.ll b/test/CodeGen/AMDGPU/hsa-group-segment.ll
index 1999dc38a6b0f9915bb889cd8353004c90ec93de..600793810e598a7258defa207d30a643de1ba7da 100644
--- a/test/CodeGen/AMDGPU/hsa-group-segment.ll
+++ b/test/CodeGen/AMDGPU/hsa-group-segment.ll
@@ -3,7 +3,7 @@
 @internal_group = internal addrspace(3) global i32 undef
 @external_group = addrspace(3) global i32 undef
 
-define void @test() {
+define amdgpu_kernel void @test() {
 entry:
   store i32 0, i32 addrspace(3)* @internal_group
   store i32 0, i32 addrspace(3)* @external_group
diff --git a/test/CodeGen/AMDGPU/hsa-note-no-func.ll b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
index a4e599230b74b4f3e2857e5da28ea5ad29edbfda..af63a4f8df760f7b0ce32e2f2af85c81378b4ba4 100644
--- a/test/CodeGen/AMDGPU/hsa-note-no-func.ll
+++ b/test/CodeGen/AMDGPU/hsa-note-no-func.ll
@@ -13,6 +13,8 @@
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx803 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI803 %s
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx804 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI804 %s
 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx810 | FileCheck --check-prefix=HSA --check-prefix=HSA-VI810 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx900 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX900 %s
+; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=gfx901 | FileCheck --check-prefix=HSA --check-prefix=HSA-GFX901 %s
 
 ; HSA: .hsa_code_object_version 2,1
 ; HSA-CI700: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
@@ -24,3 +26,5 @@
 ; HSA-VI803: .hsa_code_object_isa 8,0,3,"AMD","AMDGPU"
 ; HSA-VI804: .hsa_code_object_isa 8,0,4,"AMD","AMDGPU"
 ; HSA-VI810: .hsa_code_object_isa 8,1,0,"AMD","AMDGPU"
+; HSA-GFX900: .hsa_code_object_isa 9,0,0,"AMD","AMDGPU"
+; HSA-GFX901: .hsa_code_object_isa 9,0,1,"AMD","AMDGPU"
diff --git a/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll b/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
index e85db65e7429ab661f036fbf50ba4d9d1fa68a30..f6bf0b09486e876dcd696cc0a63f3c9a7b731658 100644
--- a/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
+++ b/test/CodeGen/AMDGPU/i1-copy-implicit-def.ll
@@ -5,7 +5,7 @@
 ; SI-LABEL: {{^}}br_implicit_def:
 ; SI: BB#0:
 ; SI-NEXT: s_cbranch_scc1
-define void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 {
+define amdgpu_kernel void @br_implicit_def(i32 addrspace(1)* %out, i32 %arg) #0 {
 bb:
   br i1 undef, label %bb1, label %bb2
 
diff --git a/test/CodeGen/AMDGPU/i1-copy-phi.ll b/test/CodeGen/AMDGPU/i1-copy-phi.ll
index d4912776debdaa9d841b078f71f95266ef3244ba..b160af86a2b6dbebb8798a8adc9b091125799481 100644
--- a/test/CodeGen/AMDGPU/i1-copy-phi.ll
+++ b/test/CodeGen/AMDGPU/i1-copy-phi.ll
@@ -10,7 +10,7 @@
 ; SI: s_and_saveexec_b64
 ; SI: s_xor_b64
 ; SI: s_endpgm
-define void @br_i1_phi(i32 %arg) {
+define amdgpu_kernel void @br_i1_phi(i32 %arg) {
 bb:
   %tidig = call i32 @llvm.r600.read.tidig.x() #0
   %cmp = trunc i32 %tidig to i1
diff --git a/test/CodeGen/AMDGPU/i8-to-double-to-float.ll b/test/CodeGen/AMDGPU/i8-to-double-to-float.ll
index c218e1918bb01133162478f11ae4499a0ed62975..d501be5c8bf000b577f084d4c6e1341fc5807579 100644
--- a/test/CodeGen/AMDGPU/i8-to-double-to-float.ll
+++ b/test/CodeGen/AMDGPU/i8-to-double-to-float.ll
@@ -2,7 +2,7 @@
 
 ;CHECK: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @test(float addrspace(1)* %out, i8 addrspace(1)* %in) {
   %1 = load i8, i8 addrspace(1)* %in
   %2 = uitofp i8 %1 to double
   %3 = fptrunc double %2 to float
diff --git a/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll b/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll
index 60e59a5a52863b8eec4fd1c5ef13bbb748d53d19..12cc440e48d9d9db7d218a8d05bd2d7e7366d40c 100644
--- a/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll
+++ b/test/CodeGen/AMDGPU/icmp-select-sete-reverse-args.ll
@@ -6,7 +6,7 @@
 ;CHECK: SETNE_INT T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;CHECK-NOT: SETNE_INT
 
-define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = load i32, i32 addrspace(1)* %in
   %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %in, i32 1
diff --git a/test/CodeGen/AMDGPU/icmp.i16.ll b/test/CodeGen/AMDGPU/icmp.i16.ll
index c3dad2d3203315c2afb2abd7fee52d53cd96fca0..99c2138bbe64e4d890e85f2002b5d242afc1ae69 100644
--- a/test/CodeGen/AMDGPU/icmp.i16.ll
+++ b/test/CodeGen/AMDGPU/icmp.i16.ll
@@ -8,7 +8,7 @@
 ; GCN-LABEL: {{^}}i16_eq:
 ; VI: v_cmp_eq_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_eq_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_eq(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_eq(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -26,7 +26,7 @@ entry:
 ; GCN-LABEL: {{^}}i16_ne:
 ; VI: v_cmp_ne_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_ne_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ne(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_ne(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -44,7 +44,7 @@ entry:
 ; GCN-LABEL: {{^}}i16_ugt:
 ; VI: v_cmp_gt_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_gt_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ugt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_ugt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -62,7 +62,7 @@ entry:
 ; GCN-LABEL: {{^}}i16_uge:
 ; VI: v_cmp_ge_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_ge_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_uge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_uge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -80,7 +80,7 @@ entry:
 ; GCN-LABEL: {{^}}i16_ult:
 ; VI: v_cmp_lt_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_lt_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ult(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_ult(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -98,7 +98,7 @@ entry:
 ; GCN-LABEL: {{^}}i16_ule:
 ; VI: v_cmp_le_u16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_le_u32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ule(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_ule(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -117,7 +117,7 @@ entry:
 ; GCN-LABEL: {{^}}i16_sgt:
 ; VI: v_cmp_gt_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_gt_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_sgt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_sgt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -135,7 +135,7 @@ entry:
 ; GCN-LABEL: {{^}}i16_sge:
 ; VI: v_cmp_ge_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_ge_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_sge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_sge(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -153,7 +153,7 @@ entry:
 ; GCN-LABEL: {{^}}i16_slt:
 ; VI: v_cmp_lt_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_lt_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_slt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_slt(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -171,7 +171,7 @@ entry:
 ; GCN-LABEL: {{^}}i16_sle:
 ; VI: v_cmp_le_i16_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_le_i32_e32 vcc, v{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_sle(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @i16_sle(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -190,7 +190,7 @@ entry:
 ; GCN-LABEL: {{^}}i16_eq_v_s:
 ; VI: v_cmp_eq_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_eq_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_eq_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_eq_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -206,7 +206,7 @@ entry:
 ; GCN-LABEL: {{^}}i16_ne_v_s:
 ; VI: v_cmp_ne_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_ne_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ne_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_ne_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -222,7 +222,7 @@ entry:
 ; GCN-LABEL: {{^}}i16_ugt_v_s:
 ; VI: v_cmp_lt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_lt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ugt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_ugt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -238,7 +238,7 @@ entry:
 ; GCN-LABEL: {{^}}i16_uge_v_s:
 ; VI: v_cmp_le_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_le_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_uge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_uge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -254,7 +254,7 @@ entry:
 ; GCN-LABEL: {{^}}i16_ult_v_s:
 ; VI: v_cmp_gt_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ult_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_ult_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -270,7 +270,7 @@ entry:
 ; GCN-LABEL: {{^}}i16_ule_v_s:
 ; VI: v_cmp_ge_u16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_ge_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_ule_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_ule_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -286,7 +286,7 @@ entry:
 ; GCN-LABEL: {{^}}i16_sgt_v_s:
 ; VI: v_cmp_lt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_lt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_sgt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_sgt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -302,7 +302,7 @@ entry:
 ; GCN-LABEL: {{^}}i16_sge_v_s:
 ; VI: v_cmp_le_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_le_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_sge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_sge_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -318,7 +318,7 @@ entry:
 ; GCN-LABEL: {{^}}i16_slt_v_s:
 ; VI: v_cmp_gt_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_gt_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_slt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_slt_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -334,7 +334,7 @@ entry:
 ; GCN-LABEL: {{^}}i16_sle_v_s:
 ; VI: v_cmp_ge_i16_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; SI: v_cmp_ge_i32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @i16_sle_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
+define amdgpu_kernel void @i16_sle_v_s(i32 addrspace(1)* %out, i16 addrspace(1)* %a.ptr, i16 %b) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
diff --git a/test/CodeGen/AMDGPU/icmp64.ll b/test/CodeGen/AMDGPU/icmp64.ll
index 33ad0c9199b93061698a6f1d8c1a8600868771b7..3af74277df128bd9fd139c7a84610aab6045f85b 100644
--- a/test/CodeGen/AMDGPU/icmp64.ll
+++ b/test/CodeGen/AMDGPU/icmp64.ll
@@ -3,7 +3,7 @@
 
 ; SI-LABEL: {{^}}test_i64_eq:
 ; SI: v_cmp_eq_u64
-define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp eq i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -12,7 +12,7 @@ define void @test_i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 
 ; SI-LABEL: {{^}}test_i64_ne:
 ; SI: v_cmp_ne_u64
-define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ne i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -21,7 +21,7 @@ define void @test_i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 
 ; SI-LABEL: {{^}}test_i64_slt:
 ; SI: v_cmp_lt_i64
-define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp slt i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -30,7 +30,7 @@ define void @test_i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 
 ; SI-LABEL: {{^}}test_i64_ult:
 ; SI: v_cmp_lt_u64
-define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ult i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -39,7 +39,7 @@ define void @test_i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 
 ; SI-LABEL: {{^}}test_i64_sle:
 ; SI: v_cmp_le_i64
-define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp sle i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -48,7 +48,7 @@ define void @test_i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 
 ; SI-LABEL: {{^}}test_i64_ule:
 ; SI: v_cmp_le_u64
-define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ule i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -57,7 +57,7 @@ define void @test_i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 
 ; SI-LABEL: {{^}}test_i64_sgt:
 ; SI: v_cmp_gt_i64
-define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp sgt i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -66,7 +66,7 @@ define void @test_i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 
 ; SI-LABEL: {{^}}test_i64_ugt:
 ; SI: v_cmp_gt_u64
-define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp ugt i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -75,7 +75,7 @@ define void @test_i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 
 ; SI-LABEL: {{^}}test_i64_sge:
 ; SI: v_cmp_ge_i64
-define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp sge i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -84,7 +84,7 @@ define void @test_i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 
 ; SI-LABEL: {{^}}test_i64_uge:
 ; SI: v_cmp_ge_u64
-define void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %cmp = icmp uge i64 %a, %b
   %result = sext i1 %cmp to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll b/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6e411ce5e017019684039a7dec1b59d059fdadbc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/illegal-sgpr-to-vgpr-copy.ll
@@ -0,0 +1,45 @@
+; RUN: not llc -march=amdgcn < %s 2>&1 | FileCheck -check-prefix=ERR %s
+; RUN: not llc -march=amdgcn < %s | FileCheck -check-prefix=GCN %s
+
+; ERR: error: <unknown>:0:0: in function illegal_vgpr_to_sgpr_copy_i32 void (): illegal SGPR to VGPR copy
+; GCN: ; illegal copy v1 to s9
+
+define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_i32() #0 { ; 32-bit case, a value pinned to v1 has to be consumed from s9
+  %vgpr = call i32 asm sideeffect "; def $0", "=${VGPR1}"() ; output constraint pins the asm result to VGPR1
+  call void asm sideeffect "; use $0", "${SGPR9}"(i32 %vgpr) ; input constraint demands SGPR9, so a v1 -> s9 copy is required (note the diagnostic text above words it as SGPR to VGPR)
+  ret void
+}
+
+; ERR: error: <unknown>:0:0: in function illegal_vgpr_to_sgpr_copy_v2i32 void (): illegal SGPR to VGPR copy
+; GCN: ; illegal copy v[0:1] to s[10:11]
+define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v2i32() #0 { ; 64-bit (2 x i32) variant of the same illegal copy
+  %vgpr = call <2 x i32> asm sideeffect "; def $0", "=${VGPR0_VGPR1}"() ; result pinned to the VGPR pair v0-v1
+  call void asm sideeffect "; use $0", "${SGPR10_SGPR11}"(<2 x i32> %vgpr) ; operand must live in the SGPR pair s10-s11
+  ret void
+}
+
+; ERR: error: <unknown>:0:0: in function illegal_vgpr_to_sgpr_copy_v4i32 void (): illegal SGPR to VGPR copy
+; GCN: ; illegal copy v[0:3] to s[8:11]
+define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v4i32() #0 { ; 128-bit (4 x i32) variant of the same illegal copy
+  %vgpr = call <4 x i32> asm sideeffect "; def $0", "=${VGPR0_VGPR1_VGPR2_VGPR3}"() ; result pinned to VGPRs v0-v3
+  call void asm sideeffect "; use $0", "${SGPR8_SGPR9_SGPR10_SGPR11}"(<4 x i32> %vgpr) ; operand must live in SGPRs s8-s11
+  ret void
+}
+
+; ERR: error: <unknown>:0:0: in function illegal_vgpr_to_sgpr_copy_v8i32 void (): illegal SGPR to VGPR copy
+; GCN: ; illegal copy v[0:7] to s[8:15]
+define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v8i32() #0 { ; 256-bit (8 x i32) variant of the same illegal copy
+  %vgpr = call <8 x i32> asm sideeffect "; def $0", "=${VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7}"() ; result pinned to VGPRs v0-v7
+  call void asm sideeffect "; use $0", "${SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}"(<8 x i32> %vgpr) ; operand must live in SGPRs s8-s15
+  ret void
+}
+
+; ERR: error: <unknown>:0:0: in function illegal_vgpr_to_sgpr_copy_v16i32 void (): illegal SGPR to VGPR copy
+; GCN: ; illegal copy v[0:15] to s[16:31]
+define amdgpu_kernel void @illegal_vgpr_to_sgpr_copy_v16i32() #0 { ; 512-bit (16 x i32) variant of the same illegal copy
+  %vgpr = call <16 x i32> asm sideeffect "; def $0", "=${VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7_VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15}"() ; result pinned to VGPRs v0-v15
+  call void asm sideeffect "; use $0", "${SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23_SGPR24_SGPR25_SGPR26_SGPR27_SGPR28_SGPR29_SGPR30_SGPR31}"(<16 x i32> %vgpr) ; operand must live in SGPRs s16-s31
+  ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/image-attributes.ll b/test/CodeGen/AMDGPU/image-attributes.ll
index 5906b2f1570961abbfafda94e9ec6254fd0dc912..53d61e66c6ba8a317ef6b9be52258236e2ff2840 100644
--- a/test/CodeGen/AMDGPU/image-attributes.ll
+++ b/test/CodeGen/AMDGPU/image-attributes.ll
@@ -7,7 +7,7 @@
 ; FUNC-LABEL: {{^}}width_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[2].Z
-define void @width_2d (%opencl.image2d_t addrspace(1)* %in,
+define amdgpu_kernel void @width_2d (%opencl.image2d_t addrspace(1)* %in,
                        i32 addrspace(1)* %out) {
 entry:
   %0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d(
@@ -20,7 +20,7 @@ entry:
 ; FUNC-LABEL: {{^}}width_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[2].Z
-define void @width_3d (%opencl.image3d_t addrspace(1)* %in,
+define amdgpu_kernel void @width_3d (%opencl.image3d_t addrspace(1)* %in,
                        i32 addrspace(1)* %out) {
 entry:
   %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d(
@@ -37,7 +37,7 @@ entry:
 ; FUNC-LABEL: {{^}}height_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[2].W
-define void @height_2d (%opencl.image2d_t addrspace(1)* %in,
+define amdgpu_kernel void @height_2d (%opencl.image2d_t addrspace(1)* %in,
                         i32 addrspace(1)* %out) {
 entry:
   %0 = call [3 x i32] @llvm.OpenCL.image.get.size.2d(
@@ -50,7 +50,7 @@ entry:
 ; FUNC-LABEL: {{^}}height_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[2].W
-define void @height_3d (%opencl.image3d_t addrspace(1)* %in,
+define amdgpu_kernel void @height_3d (%opencl.image3d_t addrspace(1)* %in,
                         i32 addrspace(1)* %out) {
 entry:
   %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d(
@@ -67,7 +67,7 @@ entry:
 ; FUNC-LABEL: {{^}}depth_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[3].X
-define void @depth_3d (%opencl.image3d_t addrspace(1)* %in,
+define amdgpu_kernel void @depth_3d (%opencl.image3d_t addrspace(1)* %in,
                        i32 addrspace(1)* %out) {
 entry:
   %0 = call [3 x i32] @llvm.OpenCL.image.get.size.3d(
@@ -84,7 +84,7 @@ entry:
 ; FUNC-LABEL: {{^}}data_type_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[3].Y
-define void @data_type_2d (%opencl.image2d_t addrspace(1)* %in,
+define amdgpu_kernel void @data_type_2d (%opencl.image2d_t addrspace(1)* %in,
                            i32 addrspace(1)* %out) {
 entry:
   %0 = call [2 x i32] @llvm.OpenCL.image.get.format.2d(
@@ -97,7 +97,7 @@ entry:
 ; FUNC-LABEL: {{^}}data_type_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[3].Y
-define void @data_type_3d (%opencl.image3d_t addrspace(1)* %in,
+define amdgpu_kernel void @data_type_3d (%opencl.image3d_t addrspace(1)* %in,
                                      i32 addrspace(1)* %out) {
 entry:
   %0 = call [2 x i32] @llvm.OpenCL.image.get.format.3d(
@@ -114,7 +114,7 @@ entry:
 ; FUNC-LABEL: {{^}}channel_order_2d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[3].Z
-define void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in,
+define amdgpu_kernel void @channel_order_2d (%opencl.image2d_t addrspace(1)* %in,
                                i32 addrspace(1)* %out) {
 entry:
   %0 = call [2 x i32] @llvm.OpenCL.image.get.format.2d(
@@ -127,7 +127,7 @@ entry:
 ; FUNC-LABEL: {{^}}channel_order_3d:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[3].Z
-define void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in,
+define amdgpu_kernel void @channel_order_3d (%opencl.image3d_t addrspace(1)* %in,
                                          i32 addrspace(1)* %out) {
 entry:
   %0 = call [2 x i32] @llvm.OpenCL.image.get.format.3d(
@@ -146,7 +146,7 @@ entry:
 ; FUNC-LABEL: {{^}}image_arg_2nd:
 ; EG: MEM_RAT_CACHELESS STORE_RAW [[VAL:T[0-9]+\.X]]
 ; EG: MOV * [[VAL]], KC0[4].Z
-define void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1,
+define amdgpu_kernel void @image_arg_2nd (%opencl.image3d_t addrspace(1)* %in1,
                             i32 %x,
                             %opencl.image2d_t addrspace(1)* %in2,
                             i32 addrspace(1)* %out) {
diff --git a/test/CodeGen/AMDGPU/image-resource-id.ll b/test/CodeGen/AMDGPU/image-resource-id.ll
index d4cf349442409ccc2862ee2f338775b3446d62f6..dac7c7ddaeac9a2858896ed91e9f14d9599b1674 100644
--- a/test/CodeGen/AMDGPU/image-resource-id.ll
+++ b/test/CodeGen/AMDGPU/image-resource-id.ll
@@ -7,7 +7,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define void @test_2d_rd_1_0(%opencl.image2d_t addrspace(1)* %in, ; read_only
+define amdgpu_kernel void @test_2d_rd_1_0(%opencl.image2d_t addrspace(1)* %in, ; read_only
                             i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d(
@@ -21,7 +21,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define void @test_3d_rd_1_0(%opencl.image3d_t addrspace(1)* %in, ; read_only
+define amdgpu_kernel void @test_3d_rd_1_0(%opencl.image3d_t addrspace(1)* %in, ; read_only
                             i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d(
@@ -37,7 +37,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define void @test_2d_wr_1_0(%opencl.image2d_t addrspace(1)* %in, ; write_only
+define amdgpu_kernel void @test_2d_wr_1_0(%opencl.image2d_t addrspace(1)* %in, ; write_only
                             i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.2d(
@@ -51,7 +51,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define void @test_3d_wr_1_0(%opencl.image3d_t addrspace(1)* %in, ; write_only
+define amdgpu_kernel void @test_3d_wr_1_0(%opencl.image3d_t addrspace(1)* %in, ; write_only
                             i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.image.get.resource.id.3d(
@@ -67,7 +67,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define void @test_2d_rd_2_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only
+define amdgpu_kernel void @test_2d_rd_2_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only
                             %opencl.image2d_t addrspace(1)* %in2, ; read_only
                             i32 addrspace(1)* %out) {
 entry:
@@ -82,7 +82,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define void @test_2d_rd_2_1(%opencl.image2d_t addrspace(1)* %in1, ; read_only
+define amdgpu_kernel void @test_2d_rd_2_1(%opencl.image2d_t addrspace(1)* %in1, ; read_only
                             %opencl.image2d_t addrspace(1)* %in2, ; read_only
                             i32 addrspace(1)* %out) {
 entry:
@@ -97,7 +97,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define void @test_3d_rd_2_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only
+define amdgpu_kernel void @test_3d_rd_2_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only
                             %opencl.image3d_t addrspace(1)* %in2, ; read_only
                             i32 addrspace(1)* %out) {
 entry:
@@ -112,7 +112,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define void @test_3d_rd_2_1(%opencl.image3d_t addrspace(1)* %in1, ; read_only
+define amdgpu_kernel void @test_3d_rd_2_1(%opencl.image3d_t addrspace(1)* %in1, ; read_only
                             %opencl.image3d_t addrspace(1)* %in2, ; read_only
                             i32 addrspace(1)* %out) {
 entry:
@@ -129,7 +129,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define void @test_2d_wr_2_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_2d_wr_2_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
                             %opencl.image2d_t addrspace(1)* %in2, ; write_only
                             i32 addrspace(1)* %out) {
 entry:
@@ -144,7 +144,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define void @test_2d_wr_2_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_2d_wr_2_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only
                             %opencl.image2d_t addrspace(1)* %in2, ; write_only
                             i32 addrspace(1)* %out) {
 entry:
@@ -159,7 +159,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define void @test_3d_wr_2_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_3d_wr_2_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
                             %opencl.image3d_t addrspace(1)* %in2, ; write_only
                             i32 addrspace(1)* %out) {
 entry:
@@ -174,7 +174,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define void @test_3d_wr_2_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_3d_wr_2_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only
                             %opencl.image3d_t addrspace(1)* %in2, ; write_only
                             i32 addrspace(1)* %out) {
 entry:
@@ -191,7 +191,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 2(
-define void @test_2d_rd_3_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only
+define amdgpu_kernel void @test_2d_rd_3_0(%opencl.image2d_t addrspace(1)* %in1, ; read_only
                             %opencl.image3d_t addrspace(1)* %in2, ; read_only
                             %opencl.image2d_t addrspace(1)* %in3, ; read_only
                             i32 addrspace(1)* %out) {
@@ -208,7 +208,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 2(
-define void @test_3d_rd_3_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only
+define amdgpu_kernel void @test_3d_rd_3_0(%opencl.image3d_t addrspace(1)* %in1, ; read_only
                             %opencl.image2d_t addrspace(1)* %in2, ; read_only
                             %opencl.image3d_t addrspace(1)* %in3, ; read_only
                             i32 addrspace(1)* %out) {
@@ -226,7 +226,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 2(
-define void @test_2d_wr_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_2d_wr_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
                             %opencl.image3d_t addrspace(1)* %in2, ; write_only
                             %opencl.image2d_t addrspace(1)* %in3, ; write_only
                             i32 addrspace(1)* %out) {
@@ -243,7 +243,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 2(
-define void @test_3d_wr_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_3d_wr_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
                             %opencl.image2d_t addrspace(1)* %in2, ; write_only
                             %opencl.image3d_t addrspace(1)* %in3, ; write_only
                             i32 addrspace(1)* %out) {
@@ -261,7 +261,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define void @test_2d_mix_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_2d_mix_3_0(%opencl.image2d_t addrspace(1)* %in1, ; write_only
                              %opencl.image3d_t addrspace(1)* %in2, ; read_only
                              %opencl.image2d_t addrspace(1)* %in3, ; read_only
                              i32 addrspace(1)* %out) {
@@ -277,7 +277,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define void @test_3d_mix_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_3d_mix_3_0(%opencl.image3d_t addrspace(1)* %in1, ; write_only
                              %opencl.image2d_t addrspace(1)* %in2, ; read_only
                              %opencl.image3d_t addrspace(1)* %in3, ; read_only
                              i32 addrspace(1)* %out) {
@@ -293,7 +293,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define void @test_2d_mix_3_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_2d_mix_3_1(%opencl.image2d_t addrspace(1)* %in1, ; write_only
                              %opencl.image3d_t addrspace(1)* %in2, ; read_only
                              %opencl.image2d_t addrspace(1)* %in3, ; write_only
                              i32 addrspace(1)* %out) {
@@ -309,7 +309,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define void @test_3d_mix_3_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only
+define amdgpu_kernel void @test_3d_mix_3_1(%opencl.image3d_t addrspace(1)* %in1, ; write_only
                              %opencl.image2d_t addrspace(1)* %in2, ; read_only
                              %opencl.image3d_t addrspace(1)* %in3, ; write_only
                              i32 addrspace(1)* %out) {
diff --git a/test/CodeGen/AMDGPU/imm.ll b/test/CodeGen/AMDGPU/imm.ll
index ef6008aa5fdedf5465cb970105354a82944825c9..c2668a077b0987fa90e4a5ceb6f67ae9b1ad8968 100644
--- a/test/CodeGen/AMDGPU/imm.ll
+++ b/test/CodeGen/AMDGPU/imm.ll
@@ -5,7 +5,7 @@
 ; GCN-LABEL: {{^}}i64_imm_inline_lo:
 ; GCN: v_mov_b32_e32 v[[LO_VGPR:[0-9]+]], 5
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VGPR]]:
-define void @i64_imm_inline_lo(i64 addrspace(1) *%out) {
+define amdgpu_kernel void @i64_imm_inline_lo(i64 addrspace(1) *%out) {
 entry:
   store i64 1311768464867721221, i64 addrspace(1) *%out ; 0x1234567800000005
   ret void
@@ -15,7 +15,7 @@ entry:
 ; GCN-LABEL: {{^}}i64_imm_inline_hi:
 ; GCN: v_mov_b32_e32 v[[HI_VGPR:[0-9]+]], 5
 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:}}[[HI_VGPR]]
-define void @i64_imm_inline_hi(i64 addrspace(1) *%out) {
+define amdgpu_kernel void @i64_imm_inline_hi(i64 addrspace(1) *%out) {
 entry:
   store i64 21780256376, i64 addrspace(1) *%out ; 0x0000000512345678
   ret void
@@ -25,7 +25,7 @@ entry:
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HI_VREG:[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
+define amdgpu_kernel void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
   store i64 -9223372036854775808, i64 addrspace(1) *%out
   ret void
 }
@@ -33,7 +33,7 @@ define void @store_imm_neg_0.0_i64(i64 addrspace(1) *%out) {
 ; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_i32:
 ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) {
   store i32 -2147483648, i32 addrspace(1)* %out
   ret void
 }
@@ -41,7 +41,7 @@ define void @store_inline_imm_neg_0.0_i32(i32 addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}store_inline_imm_0.0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.0_f32(float addrspace(1)* %out) {
   store float 0.0, float addrspace(1)* %out
   ret void
 }
@@ -49,7 +49,7 @@ define void @store_inline_imm_0.0_f32(float addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}store_imm_neg_0.0_f32:
 ; GCN: v_bfrev_b32_e32 [[REG:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_imm_neg_0.0_f32(float addrspace(1)* %out) {
   store float -0.0, float addrspace(1)* %out
   ret void
 }
@@ -57,7 +57,7 @@ define void @store_imm_neg_0.0_f32(float addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}store_inline_imm_0.5_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0.5{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.5_f32(float addrspace(1)* %out) {
   store float 0.5, float addrspace(1)* %out
   ret void
 }
@@ -65,7 +65,7 @@ define void @store_inline_imm_0.5_f32(float addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}store_inline_imm_m_0.5_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -0.5{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) {
   store float -0.5, float addrspace(1)* %out
   ret void
 }
@@ -73,7 +73,7 @@ define void @store_inline_imm_m_0.5_f32(float addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}store_inline_imm_1.0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 1.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_1.0_f32(float addrspace(1)* %out) {
   store float 1.0, float addrspace(1)* %out
   ret void
 }
@@ -81,7 +81,7 @@ define void @store_inline_imm_1.0_f32(float addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}store_inline_imm_m_1.0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -1.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) {
   store float -1.0, float addrspace(1)* %out
   ret void
 }
@@ -89,7 +89,7 @@ define void @store_inline_imm_m_1.0_f32(float addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}store_inline_imm_2.0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 2.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_2.0_f32(float addrspace(1)* %out) {
   store float 2.0, float addrspace(1)* %out
   ret void
 }
@@ -97,7 +97,7 @@ define void @store_inline_imm_2.0_f32(float addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}store_inline_imm_m_2.0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -2.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) {
   store float -2.0, float addrspace(1)* %out
   ret void
 }
@@ -105,7 +105,7 @@ define void @store_inline_imm_m_2.0_f32(float addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}store_inline_imm_4.0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 4.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_4.0_f32(float addrspace(1)* %out) {
   store float 4.0, float addrspace(1)* %out
   ret void
 }
@@ -113,7 +113,7 @@ define void @store_inline_imm_4.0_f32(float addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}store_inline_imm_m_4.0_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], -4.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) {
   store float -4.0, float addrspace(1)* %out
   ret void
 }
@@ -123,7 +123,7 @@ define void @store_inline_imm_m_4.0_f32(float addrspace(1)* %out) {
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3e22f983{{$}}
 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0.15915494{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out) {
   store float 0x3FC45F3060000000, float addrspace(1)* %out
   ret void
 }
@@ -131,7 +131,7 @@ define void @store_inline_imm_inv_2pi_f32(float addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_f32:
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbe22f983{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %out) {
   store float 0xBFC45F3060000000, float addrspace(1)* %out
   ret void
 }
@@ -139,7 +139,7 @@ define void @store_inline_imm_m_inv_2pi_f32(float addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}store_literal_imm_f32:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x45800000
 ; GCN: buffer_store_dword [[REG]]
-define void @store_literal_imm_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_f32(float addrspace(1)* %out) {
   store float 4096.0, float addrspace(1)* %out
   ret void
 }
@@ -148,7 +148,7 @@ define void @store_literal_imm_f32(float addrspace(1)* %out) {
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0.0
   store float %y, float addrspace(1)* %out
   ret void
@@ -158,7 +158,7 @@ define void @add_inline_imm_0.0_f32(float addrspace(1)* %out, float %x) {
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 0.5{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0.5
   store float %y, float addrspace(1)* %out
   ret void
@@ -168,7 +168,7 @@ define void @add_inline_imm_0.5_f32(float addrspace(1)* %out, float %x) {
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -0.5{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, -0.5
   store float %y, float addrspace(1)* %out
   ret void
@@ -178,7 +178,7 @@ define void @add_inline_imm_neg_0.5_f32(float addrspace(1)* %out, float %x) {
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 1.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 1.0
   store float %y, float addrspace(1)* %out
   ret void
@@ -188,7 +188,7 @@ define void @add_inline_imm_1.0_f32(float addrspace(1)* %out, float %x) {
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -1.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, -1.0
   store float %y, float addrspace(1)* %out
   ret void
@@ -198,7 +198,7 @@ define void @add_inline_imm_neg_1.0_f32(float addrspace(1)* %out, float %x) {
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 2.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 2.0
   store float %y, float addrspace(1)* %out
   ret void
@@ -208,7 +208,7 @@ define void @add_inline_imm_2.0_f32(float addrspace(1)* %out, float %x) {
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -2.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, -2.0
   store float %y, float addrspace(1)* %out
   ret void
@@ -218,7 +218,7 @@ define void @add_inline_imm_neg_2.0_f32(float addrspace(1)* %out, float %x) {
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 4.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 4.0
   store float %y, float addrspace(1)* %out
   ret void
@@ -228,7 +228,7 @@ define void @add_inline_imm_4.0_f32(float addrspace(1)* %out, float %x) {
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -4.0{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, -4.0
   store float %y, float addrspace(1)* %out
   ret void
@@ -238,7 +238,7 @@ define void @add_inline_imm_neg_4.0_f32(float addrspace(1)* %out, float %x) {
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_add_f32_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]
 ; GCN: buffer_store_dword [[REG]]
-define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %x = load float, float addrspace(1)* %in
   %y = fadd float %x, 0.5
   store float %y, float addrspace(1)* %out
@@ -249,7 +249,7 @@ define void @commute_add_inline_imm_0.5_f32(float addrspace(1)* %out, float addr
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_add_f32_e32 [[REG:v[0-9]+]], 0x44800000, [[VAL]]
 ; GCN: buffer_store_dword [[REG]]
-define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1)* %in) {
   %x = load float, float addrspace(1)* %in
   %y = fadd float %x, 1024.0
   store float %y, float addrspace(1)* %out
@@ -260,7 +260,7 @@ define void @commute_add_literal_f32(float addrspace(1)* %out, float addrspace(1
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 1{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0x36a0000000000000
   store float %y, float addrspace(1)* %out
   ret void
@@ -270,7 +270,7 @@ define void @add_inline_imm_1_f32(float addrspace(1)* %out, float %x) {
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 2{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0x36b0000000000000
   store float %y, float addrspace(1)* %out
   ret void
@@ -280,7 +280,7 @@ define void @add_inline_imm_2_f32(float addrspace(1)* %out, float %x) {
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 16
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0x36e0000000000000
   store float %y, float addrspace(1)* %out
   ret void
@@ -290,7 +290,7 @@ define void @add_inline_imm_16_f32(float addrspace(1)* %out, float %x) {
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -1{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0xffffffffe0000000
   store float %y, float addrspace(1)* %out
   ret void
@@ -300,7 +300,7 @@ define void @add_inline_imm_neg_1_f32(float addrspace(1)* %out, float %x) {
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -2{{$}}
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0xffffffffc0000000
   store float %y, float addrspace(1)* %out
   ret void
@@ -310,7 +310,7 @@ define void @add_inline_imm_neg_2_f32(float addrspace(1)* %out, float %x) {
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], -16
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0xfffffffe00000000
   store float %y, float addrspace(1)* %out
   ret void
@@ -320,7 +320,7 @@ define void @add_inline_imm_neg_16_f32(float addrspace(1)* %out, float %x) {
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 63
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0x36ff800000000000
   store float %y, float addrspace(1)* %out
   ret void
@@ -330,7 +330,7 @@ define void @add_inline_imm_63_f32(float addrspace(1)* %out, float %x) {
 ; GCN: s_load_dword [[VAL:s[0-9]+]]
 ; GCN: v_add_f32_e64 [[REG:v[0-9]+]], [[VAL]], 64
 ; GCN: buffer_store_dword [[REG]]
-define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) {
+define amdgpu_kernel void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) {
   %y = fadd float %x, 0x3700000000000000
   store float %y, float addrspace(1)* %out
   ret void
@@ -342,7 +342,7 @@ define void @add_inline_imm_64_f32(float addrspace(1)* %out, float %x) {
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0{{$}}
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0.0
   store double %y, double addrspace(1)* %out
   ret void
@@ -353,7 +353,7 @@ define void @add_inline_imm_0.0_f64(double addrspace(1)* %out, double %x) {
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.5
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0.5
   store double %y, double addrspace(1)* %out
   ret void
@@ -364,7 +364,7 @@ define void @add_inline_imm_0.5_f64(double addrspace(1)* %out, double %x) {
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -0.5
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, -0.5
   store double %y, double addrspace(1)* %out
   ret void
@@ -375,7 +375,7 @@ define void @add_inline_imm_neg_0.5_f64(double addrspace(1)* %out, double %x) {
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1.0
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 1.0
   store double %y, double addrspace(1)* %out
   ret void
@@ -386,7 +386,7 @@ define void @add_inline_imm_1.0_f64(double addrspace(1)* %out, double %x) {
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1.0
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, -1.0
   store double %y, double addrspace(1)* %out
   ret void
@@ -397,7 +397,7 @@ define void @add_inline_imm_neg_1.0_f64(double addrspace(1)* %out, double %x) {
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2.0
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 2.0
   store double %y, double addrspace(1)* %out
   ret void
@@ -408,7 +408,7 @@ define void @add_inline_imm_2.0_f64(double addrspace(1)* %out, double %x) {
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2.0
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, -2.0
   store double %y, double addrspace(1)* %out
   ret void
@@ -419,7 +419,7 @@ define void @add_inline_imm_neg_2.0_f64(double addrspace(1)* %out, double %x) {
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 4.0
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 4.0
   store double %y, double addrspace(1)* %out
   ret void
@@ -430,7 +430,7 @@ define void @add_inline_imm_4.0_f64(double addrspace(1)* %out, double %x) {
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -4.0
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, -4.0
   store double %y, double addrspace(1)* %out
   ret void
@@ -445,7 +445,7 @@ define void @add_inline_imm_neg_4.0_f64(double addrspace(1)* %out, double %x) {
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; VI: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 0.15915494{{$}}
 ; VI: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x3fc45f306dc9c882
   store double %y, double addrspace(1)* %out
   ret void
@@ -455,7 +455,7 @@ define void @add_inline_imm_inv_2pi_f64(double addrspace(1)* %out, double %x) {
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfc45f30
 ; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0xbfc45f306dc9c882
   store double %y, double addrspace(1)* %out
   ret void
@@ -466,7 +466,7 @@ define void @add_m_inv_2pi_f64(double addrspace(1)* %out, double %x) {
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 1{{$}}
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x0000000000000001
   store double %y, double addrspace(1)* %out
   ret void
@@ -477,7 +477,7 @@ define void @add_inline_imm_1_f64(double addrspace(1)* %out, double %x) {
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 2{{$}}
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x0000000000000002
   store double %y, double addrspace(1)* %out
   ret void
@@ -488,7 +488,7 @@ define void @add_inline_imm_2_f64(double addrspace(1)* %out, double %x) {
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 16
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x0000000000000010
   store double %y, double addrspace(1)* %out
   ret void
@@ -499,7 +499,7 @@ define void @add_inline_imm_16_f64(double addrspace(1)* %out, double %x) {
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -1
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0xffffffffffffffff
   store double %y, double addrspace(1)* %out
   ret void
@@ -510,7 +510,7 @@ define void @add_inline_imm_neg_1_f64(double addrspace(1)* %out, double %x) {
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -2
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0xfffffffffffffffe
   store double %y, double addrspace(1)* %out
   ret void
@@ -521,7 +521,7 @@ define void @add_inline_imm_neg_2_f64(double addrspace(1)* %out, double %x) {
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], -16
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0xfffffffffffffff0
   store double %y, double addrspace(1)* %out
   ret void
@@ -532,7 +532,7 @@ define void @add_inline_imm_neg_16_f64(double addrspace(1)* %out, double %x) {
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 63
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x000000000000003F
   store double %y, double addrspace(1)* %out
   ret void
@@ -543,7 +543,7 @@ define void @add_inline_imm_63_f64(double addrspace(1)* %out, double %x) {
 ; VI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x2c
 ; GCN: v_add_f64 [[REG:v\[[0-9]+:[0-9]+\]]], [[VAL]], 64
 ; GCN: buffer_store_dwordx2 [[REG]]
-define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
+define amdgpu_kernel void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
   %y = fadd double %x, 0x0000000000000040
   store double %y, double addrspace(1)* %out
   ret void
@@ -554,7 +554,7 @@ define void @add_inline_imm_64_f64(double addrspace(1)* %out, double %x) {
 ; GCN: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0
 ; GCN: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], v[[LO_VREG]]{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
   store double 0.0, double addrspace(1)* %out
   ret void
 }
@@ -564,7 +564,7 @@ define void @store_inline_imm_0.0_f64(double addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_bfrev_b32_e32 v[[HI_VREG:[0-9]+]], 1{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) {
   store double -0.0, double addrspace(1)* %out
   ret void
 }
@@ -573,7 +573,7 @@ define void @store_literal_imm_neg_0.0_f64(double addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fe00000
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_0.5_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.5_f64(double addrspace(1)* %out) {
   store double 0.5, double addrspace(1)* %out
   ret void
 }
@@ -582,7 +582,7 @@ define void @store_inline_imm_0.5_f64(double addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfe00000
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) {
   store double -0.5, double addrspace(1)* %out
   ret void
 }
@@ -591,7 +591,7 @@ define void @store_inline_imm_m_0.5_f64(double addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3ff00000
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_1.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_1.0_f64(double addrspace(1)* %out) {
   store double 1.0, double addrspace(1)* %out
   ret void
 }
@@ -600,7 +600,7 @@ define void @store_inline_imm_1.0_f64(double addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbff00000
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) {
   store double -1.0, double addrspace(1)* %out
   ret void
 }
@@ -609,7 +609,7 @@ define void @store_inline_imm_m_1.0_f64(double addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 2.0
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_2.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_2.0_f64(double addrspace(1)* %out) {
   store double 2.0, double addrspace(1)* %out
   ret void
 }
@@ -618,7 +618,7 @@ define void @store_inline_imm_2.0_f64(double addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], -2.0
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) {
   store double -2.0, double addrspace(1)* %out
   ret void
 }
@@ -627,7 +627,7 @@ define void @store_inline_imm_m_2.0_f64(double addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40100000
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_4.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_4.0_f64(double addrspace(1)* %out) {
   store double 4.0, double addrspace(1)* %out
   ret void
 }
@@ -636,7 +636,7 @@ define void @store_inline_imm_4.0_f64(double addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xc0100000
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) {
   store double -4.0, double addrspace(1)* %out
   ret void
 }
@@ -645,7 +645,7 @@ define void @store_inline_imm_m_4.0_f64(double addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x3fc45f30
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inv_2pi_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inv_2pi_f64(double addrspace(1)* %out) {
   store double 0x3fc45f306dc9c882, double addrspace(1)* %out
   ret void
 }
@@ -654,7 +654,7 @@ define void @store_inv_2pi_f64(double addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0x6dc9c882
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0xbfc45f30
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %out) {
   store double 0xbfc45f306dc9c882, double addrspace(1)* %out
   ret void
 }
@@ -663,7 +663,22 @@ define void @store_inline_imm_m_inv_2pi_f64(double addrspace(1)* %out) {
 ; GCN-DAG: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[HI_VREG:[0-9]+]], 0x40b00000
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO_VREG]]:[[HI_VREG]]{{\]}}
-define void @store_literal_imm_f64(double addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_f64(double addrspace(1)* %out) {
   store double 4096.0, double addrspace(1)* %out
   ret void
 }
+
+; GCN-LABEL: {{^}}literal_folding:
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f4353f8, v{{[0-9]+}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0xbf4353f8, v{{[0-9]+}}
+define amdgpu_vs void @literal_folding(float %arg) {
+main_body:
+  %tmp = fmul float %arg, 0x3FE86A7F00000000
+  %tmp1 = fmul float %arg, 0xBFE86A7F00000000
+  call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp, float %tmp, float %tmp1, float %tmp1, i1 true, i1 false) #0
+  ret void
+}
+
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/imm16.ll b/test/CodeGen/AMDGPU/imm16.ll
index 2e73eb06502f53e85df1dd4dce3c6388633dbbc5..e42d587918906ca17ecdaac4553275757e5dd8ee 100644
--- a/test/CodeGen/AMDGPU/imm16.ll
+++ b/test/CodeGen/AMDGPU/imm16.ll
@@ -7,7 +7,7 @@
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}}
 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) {
   store volatile i16 -32768, i16 addrspace(1)* %out
   ret void
 }
@@ -15,7 +15,7 @@ define void @store_inline_imm_neg_0.0_i16(i16 addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}store_inline_imm_0.0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_0.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.0_f16(half addrspace(1)* %out) {
   store half 0.0, half addrspace(1)* %out
   ret void
 }
@@ -24,7 +24,7 @@ define void @store_inline_imm_0.0_f16(half addrspace(1)* %out) {
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0x8000{{$}}
 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffff8000{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_imm_neg_0.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_imm_neg_0.0_f16(half addrspace(1)* %out) {
   store half -0.0, half addrspace(1)* %out
   ret void
 }
@@ -32,7 +32,7 @@ define void @store_imm_neg_0.0_f16(half addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}store_inline_imm_0.5_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3800{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_0.5_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_0.5_f16(half addrspace(1)* %out) {
   store half 0.5, half addrspace(1)* %out
   ret void
 }
@@ -41,7 +41,7 @@ define void @store_inline_imm_0.5_f16(half addrspace(1)* %out) {
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800{{$}}
 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb800{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) {
   store half -0.5, half addrspace(1)* %out
   ret void
 }
@@ -49,7 +49,7 @@ define void @store_inline_imm_m_0.5_f16(half addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}store_inline_imm_1.0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_1.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_1.0_f16(half addrspace(1)* %out) {
   store half 1.0, half addrspace(1)* %out
   ret void
 }
@@ -58,7 +58,7 @@ define void @store_inline_imm_1.0_f16(half addrspace(1)* %out) {
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00{{$}}
 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffbc00{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) {
   store half -1.0, half addrspace(1)* %out
   ret void
 }
@@ -66,7 +66,7 @@ define void @store_inline_imm_m_1.0_f16(half addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}store_inline_imm_2.0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_2.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_2.0_f16(half addrspace(1)* %out) {
   store half 2.0, half addrspace(1)* %out
   ret void
 }
@@ -75,7 +75,7 @@ define void @store_inline_imm_2.0_f16(half addrspace(1)* %out) {
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000{{$}}
 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc000{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) {
   store half -2.0, half addrspace(1)* %out
   ret void
 }
@@ -83,7 +83,7 @@ define void @store_inline_imm_m_2.0_f16(half addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}store_inline_imm_4.0_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4400{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_4.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_4.0_f16(half addrspace(1)* %out) {
   store half 4.0, half addrspace(1)* %out
   ret void
 }
@@ -92,7 +92,7 @@ define void @store_inline_imm_4.0_f16(half addrspace(1)* %out) {
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400{{$}}
 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffc400{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) {
   store half -4.0, half addrspace(1)* %out
   ret void
 }
@@ -101,7 +101,7 @@ define void @store_inline_imm_m_4.0_f16(half addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3118{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) {
   store half 0xH3118, half addrspace(1)* %out
   ret void
 }
@@ -110,7 +110,7 @@ define void @store_inline_imm_inv_2pi_f16(half addrspace(1)* %out) {
 ; SI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118{{$}}
 ; VI: v_mov_b32_e32 [[REG:v[0-9]+]], 0xffffb118{{$}}
 ; GCN: buffer_store_short [[REG]]
-define void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) {
   store half 0xHB118, half addrspace(1)* %out
   ret void
 }
@@ -118,7 +118,7 @@ define void @store_inline_imm_m_inv_2pi_f16(half addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}store_literal_imm_f16:
 ; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c00
 ; GCN: buffer_store_short [[REG]]
-define void @store_literal_imm_f16(half addrspace(1)* %out) {
+define amdgpu_kernel void @store_literal_imm_f16(half addrspace(1)* %out) {
   store half 4096.0, half addrspace(1)* %out
   ret void
 }
@@ -127,7 +127,7 @@ define void @store_literal_imm_f16(half addrspace(1)* %out) {
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0.0
   store half %y, half addrspace(1)* %out
   ret void
@@ -137,7 +137,7 @@ define void @add_inline_imm_0.0_f16(half addrspace(1)* %out, half %x) {
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0.5
   store half %y, half addrspace(1)* %out
   ret void
@@ -147,7 +147,7 @@ define void @add_inline_imm_0.5_f16(half addrspace(1)* %out, half %x) {
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -0.5, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, -0.5
   store half %y, half addrspace(1)* %out
   ret void
@@ -157,7 +157,7 @@ define void @add_inline_imm_neg_0.5_f16(half addrspace(1)* %out, half %x) {
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1.0, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 1.0
   store half %y, half addrspace(1)* %out
   ret void
@@ -167,7 +167,7 @@ define void @add_inline_imm_1.0_f16(half addrspace(1)* %out, half %x) {
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1.0, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, -1.0
   store half %y, half addrspace(1)* %out
   ret void
@@ -177,7 +177,7 @@ define void @add_inline_imm_neg_1.0_f16(half addrspace(1)* %out, half %x) {
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2.0, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 2.0
   store half %y, half addrspace(1)* %out
   ret void
@@ -187,7 +187,7 @@ define void @add_inline_imm_2.0_f16(half addrspace(1)* %out, half %x) {
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2.0, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, -2.0
   store half %y, half addrspace(1)* %out
   ret void
@@ -197,7 +197,7 @@ define void @add_inline_imm_neg_2.0_f16(half addrspace(1)* %out, half %x) {
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 4.0, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 4.0
   store half %y, half addrspace(1)* %out
   ret void
@@ -207,7 +207,7 @@ define void @add_inline_imm_4.0_f16(half addrspace(1)* %out, half %x) {
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -4.0, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, -4.0
   store half %y, half addrspace(1)* %out
   ret void
@@ -217,7 +217,7 @@ define void @add_inline_imm_neg_4.0_f16(half addrspace(1)* %out, half %x) {
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0.5, [[VAL]]
 ; VI: buffer_store_short [[REG]]
-define void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
   %x = load half, half addrspace(1)* %in
   %y = fadd half %x, 0.5
   store half %y, half addrspace(1)* %out
@@ -228,7 +228,7 @@ define void @commute_add_inline_imm_0.5_f16(half addrspace(1)* %out, half addrsp
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 0x6400, [[VAL]]
 ; VI: buffer_store_short [[REG]]
-define void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
+define amdgpu_kernel void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)* %in) {
   %x = load half, half addrspace(1)* %in
   %y = fadd half %x, 1024.0
   store half %y, half addrspace(1)* %out
@@ -239,7 +239,7 @@ define void @commute_add_literal_f16(half addrspace(1)* %out, half addrspace(1)*
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 1, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0xH0001
   store half %y, half addrspace(1)* %out
   ret void
@@ -249,7 +249,7 @@ define void @add_inline_imm_1_f16(half addrspace(1)* %out, half %x) {
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 2, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0xH0002
   store half %y, half addrspace(1)* %out
   ret void
@@ -259,7 +259,7 @@ define void @add_inline_imm_2_f16(half addrspace(1)* %out, half %x) {
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 16, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0xH0010
   store half %y, half addrspace(1)* %out
   ret void
@@ -269,7 +269,7 @@ define void @add_inline_imm_16_f16(half addrspace(1)* %out, half %x) {
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -1, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0xHFFFF
   store half %y, half addrspace(1)* %out
   ret void
@@ -279,7 +279,7 @@ define void @add_inline_imm_neg_1_f16(half addrspace(1)* %out, half %x) {
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -2, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0xHFFFE
   store half %y, half addrspace(1)* %out
   ret void
@@ -289,7 +289,7 @@ define void @add_inline_imm_neg_2_f16(half addrspace(1)* %out, half %x) {
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], -16, [[VAL]]{{$}}
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0xHFFF0
   store half %y, half addrspace(1)* %out
   ret void
@@ -299,7 +299,7 @@ define void @add_inline_imm_neg_16_f16(half addrspace(1)* %out, half %x) {
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 63, [[VAL]]
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0xH003F
   store half %y, half addrspace(1)* %out
   ret void
@@ -309,7 +309,7 @@ define void @add_inline_imm_63_f16(half addrspace(1)* %out, half %x) {
 ; VI: buffer_load_ushort [[VAL:v[0-9]+]]
 ; VI: v_add_f16_e32 [[REG:v[0-9]+]], 64, [[VAL]]
 ; VI: buffer_store_short [[REG]]
-define void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) {
+define amdgpu_kernel void @add_inline_imm_64_f16(half addrspace(1)* %out, half %x) {
   %y = fadd half %x, 0xH0040
   store half %y, half addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/immv216.ll b/test/CodeGen/AMDGPU/immv216.ll
new file mode 100644
index 0000000000000000000000000000000000000000..85ad365d02a8975aa6ae6c0680e22c14b74f7e62
--- /dev/null
+++ b/test/CodeGen/AMDGPU/immv216.ll
@@ -0,0 +1,446 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+; FIXME: Merge into imm.ll
+
+; GCN-LABEL: {{^}}store_inline_imm_neg_0.0_v2i16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_neg_0.0_v2i16(<2 x i16> addrspace(1)* %out) #0 {
+  store <2 x i16> <i16 -32768, i16 -32768>, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_0.0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+  store <2 x half> <half 0.0, half 0.0>, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_imm_neg_0.0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x80008000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_imm_neg_0.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+  store <2 x half> <half -0.0, half -0.0>, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_0.5_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x38003800{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
+  store <2 x half> <half 0.5, half 0.5>, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_m_0.5_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb800b800{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_m_0.5_v2f16(<2 x half> addrspace(1)* %out) #0 {
+  store <2 x half> <half -0.5, half -0.5>, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_1.0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x3c003c00{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+  store <2 x half> <half 1.0, half 1.0>, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_m_1.0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xbc00bc00{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_m_1.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+  store <2 x half> <half -1.0, half -1.0>, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_2.0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x40004000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+  store <2 x half> <half 2.0, half 2.0>, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_m_2.0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc000c000{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_m_2.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+  store <2 x half> <half -2.0, half -2.0>, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_4.0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x44004400{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+  store <2 x half> <half 4.0, half 4.0>, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_m_4.0_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xc400c400{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_m_4.0_v2f16(<2 x half> addrspace(1)* %out) #0 {
+  store <2 x half> <half -4.0, half -4.0>, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_inv_2pi_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x31183118{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
+  store <2 x half> <half 0xH3118, half 0xH3118>, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_inline_imm_m_inv_2pi_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0xb118b118{{$}}
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_inline_imm_m_inv_2pi_v2f16(<2 x half> addrspace(1)* %out) #0 {
+  store <2 x half> <half 0xHB118, half 0xHB118>, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}store_literal_imm_v2f16:
+; GCN: v_mov_b32_e32 [[REG:v[0-9]+]], 0x6c006c00
+; GCN: buffer_store_dword [[REG]]
+define amdgpu_kernel void @store_literal_imm_v2f16(<2 x half> addrspace(1)* %out) #0 {
+  store <2 x half> <half 4096.0, half 4096.0>, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_0.0_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_0.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+  %y = fadd <2 x half> %x, <half 0.0, half 0.0>
+  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_0.5_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+  %y = fadd <2 x half> %x, <half 0.5, half 0.5>
+  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_0.5_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -0.5{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -0.5, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_neg_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+  %y = fadd <2 x half> %x, <half -0.5, half -0.5>
+  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_1.0_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1.0{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1.0, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+  %y = fadd <2 x half> %x, <half 1.0, half 1.0>
+  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_1.0_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1.0{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1.0, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_neg_1.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+  %y = fadd <2 x half> %x, <half -1.0, half -1.0>
+  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_2.0_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2.0{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+  %y = fadd <2 x half> %x, <half 2.0, half 2.0>
+  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_2.0_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2.0{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2.0, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_neg_2.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+  %y = fadd <2 x half> %x, <half -2.0, half -2.0>
+  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_4.0_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 4.0{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 4.0, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+  %y = fadd <2 x half> %x, <half 4.0, half 4.0>
+  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_4.0_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -4.0{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -4.0, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_neg_4.0_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+  %y = fadd <2 x half> %x, <half -4.0, half -4.0>
+  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_add_inline_imm_0.5_v2f16:
+; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 0.5
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_dword
+; VI-NOT: and
+; VI: v_lshrrev_b32_e32 {{v[0-9]+}}, 16,
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 0.5, v{{[0-9]+}}
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @commute_add_inline_imm_0.5_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+  %x = load <2 x half>, <2 x half> addrspace(1)* %in
+  %y = fadd <2 x half> %x, <half 0.5, half 0.5>
+  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}commute_add_literal_v2f16:
+; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
+; GFX9: s_mov_b32 [[K:s[0-9]+]], 0x64006400
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[K]], [[VAL]]
+; GFX9: buffer_store_dword [[REG]]
+
+; VI-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x6400{{$}}
+; VI-DAG: buffer_load_dword
+; VI-NOT: and
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}
+; VI-DAG: v_add_f16_sdwa v{{[0-9]+}}, [[K]], v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: buffer_store_dword
+define amdgpu_kernel void @commute_add_literal_v2f16(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+  %x = load <2 x half>, <2 x half> addrspace(1)* %in
+  %y = fadd <2 x half> %x, <half 1024.0, half 1024.0>
+  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_1_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 1{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 1, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+  %y = fadd <2 x half> %x, <half 0xH0001, half 0xH0001>
+  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_2_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 2{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 2, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+  %y = fadd <2 x half> %x, <half 0xH0002, half 0xH0002>
+  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_16_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 16{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 16, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+  %y = fadd <2 x half> %x, <half 0xH0010, half 0xH0010>
+  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_1_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -1{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -1, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_neg_1_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+  %y = fadd <2 x half> %x, <half 0xHFFFF, half 0xHFFFF>
+  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_2_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -2{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -2, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_neg_2_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+  %y = fadd <2 x half> %x, <half 0xHFFFE, half 0xHFFFE>
+  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_neg_16_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], -16{{$}}
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, -16, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_neg_16_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+  %y = fadd <2 x half> %x, <half 0xHFFF0, half 0xHFFF0>
+  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_63_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 63
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 63, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_63_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+  %y = fadd <2 x half> %x, <half 0xH003F, half 0xH003F>
+  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_inline_imm_64_v2f16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_add_f16 [[REG:v[0-9]+]], [[VAL]], 64
+; GFX9: buffer_store_dword [[REG]]
+
+; VI: buffer_load_ushort [[VAL0:v[0-9]+]]
+; VI: buffer_load_ushort [[VAL1:v[0-9]+]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL0]]
+; VI-DAG: v_add_f16_e32 v{{[0-9]+}}, 64, [[VAL1]]
+; VI: v_or_b32
+; VI: buffer_store_dword
+define amdgpu_kernel void @add_inline_imm_64_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %x) #0 {
+  %y = fadd <2 x half> %x, <half 0xH0040, half 0xH0040>
+  store <2 x half> %y, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll b/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
index 877956be3088f58804968f53e1407eab4a2bed60..8e207a38c84746e17f7c251b19bb7dc27f37a137 100644
--- a/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si-noopt.ll
@@ -10,7 +10,7 @@
 ; CHECK: s_mov_b32 m0, [[IN]]
 ; CHECK: v_movreld_b32_e32 v[[ELT0:[0-9]+]]
 ; CHECK-NEXT: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
-define void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
 entry:
   %ins = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
   store <4 x float> %ins, <4 x float> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/indirect-addressing-si.ll b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
index 208e55c143ac88af845cf75b7fb3880f820a0c1e..b18ae353ca4cb15d434f7db30abd155324addc8d 100644
--- a/test/CodeGen/AMDGPU/indirect-addressing-si.ll
+++ b/test/CodeGen/AMDGPU/indirect-addressing-si.ll
@@ -1,6 +1,7 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=MOVREL %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -amdgpu-vgpr-index-mode -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=IDXMODE %s
 
 ; Tests for indirect addressing on SI, which is implemented using dynamic
 ; indexing of vectors.
@@ -18,7 +19,7 @@
 ; IDXMODE: s_set_gpr_idx_on [[IN]], src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @extract_w_offset(float addrspace(1)* %out, i32 %in) {
 entry:
   %idx = add i32 %in, 1
   %elt = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %idx
@@ -43,7 +44,7 @@ entry:
 ; IDXMODE: s_set_gpr_idx_on s{{[0-9]+}}, src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) {
+define amdgpu_kernel void @extract_w_offset_salu_use_vector(i32 addrspace(1)* %out, i32 %in, <4 x i32> %or.val) {
 entry:
   %idx = add i32 %in, 1
   %vec = or <4 x i32> %or.val, <i32 1, i32 2, i32 3, i32 4>
@@ -65,7 +66,7 @@ entry:
 ; IDXMODE: s_set_gpr_idx_on [[IN]], src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, [[BASEREG]]
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @extract_wo_offset(float addrspace(1)* %out, i32 %in) {
 entry:
   %elt = extractelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, i32 %in
   store float %elt, float addrspace(1)* %out
@@ -83,7 +84,7 @@ entry:
 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) {
+define amdgpu_kernel void @extract_neg_offset_sgpr(i32 addrspace(1)* %out, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
   %value = extractelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 %index
@@ -104,7 +105,7 @@ entry:
 ; IDXMODE-NEXT: s_set_gpr_idx_on [[ADD_IDX]], src0{{$}}
 ; IDXMODE-NEXT: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) {
+define amdgpu_kernel void @extract_neg_offset_sgpr_loaded(i32 addrspace(1)* %out, <4 x i32> %vec0, <4 x i32> %vec1, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
   %or = or <4 x i32> %vec0, %vec1
@@ -136,7 +137,7 @@ entry:
 
 ; IDXMODE: s_set_gpr_idx_off
 ; GCN: buffer_store_dword [[RESULT]]
-define void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @extract_neg_offset_vgpr(i32 addrspace(1)* %out) {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %index = add i32 %id, -512
@@ -146,7 +147,7 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}extract_undef_offset_sgpr:
-define void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @extract_undef_offset_sgpr(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
 entry:
   %ld = load volatile <4 x i32>, <4  x i32> addrspace(1)* %in
   %value = extractelement <4 x i32> %ld, i32 undef
@@ -158,7 +159,7 @@ entry:
 ; GCN-DAG: buffer_load_dwordx4
 ; MOVREL-DAG: s_mov_b32 m0,
 ; MOVREL: v_movreld_b32
-define void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @insert_undef_offset_sgpr_vector_src(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
 entry:
   %ld = load <4 x i32>, <4  x i32> addrspace(1)* %in
   %value = insertelement <4 x i32> %ld, i32 5, i32 undef
@@ -177,7 +178,7 @@ entry:
 
 ; MOVREL: v_movreld_b32_e32 v[[ELT1]], v[[INS]]
 ; MOVREL: buffer_store_dwordx4 v{{\[}}[[ELT0]]:[[ELT3]]{{\]}}
-define void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @insert_w_offset(<4 x float> addrspace(1)* %out, i32 %in) {
 entry:
   %0 = add i32 %in, 1
   %1 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %0
@@ -196,7 +197,7 @@ entry:
 ; IDXMODE-NEXT: s_set_gpr_idx_off
 
 ; GCN: buffer_store_dwordx4 v{{\[}}[[ELT0]]:
-define void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @insert_wo_offset(<4 x float> addrspace(1)* %out, i32 %in) {
 entry:
   %0 = insertelement <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, float 5.0, i32 %in
   store <4 x float> %0, <4 x float> addrspace(1)* %out
@@ -212,7 +213,7 @@ entry:
 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) {
+define amdgpu_kernel void @insert_neg_offset_sgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
   %value = insertelement <4 x i32> <i32 0, i32 1, i32 2, i32 3>, i32 5, i32 %index
@@ -232,7 +233,7 @@ entry:
 ; IDXMODE: s_set_gpr_idx_on [[ADD_IDX]], dst
 ; IDXMODE-NEXT: v_mov_b32_e32 v0, 5
 ; IDXMODE-NEXT: s_set_gpr_idx_off
-define void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) {
+define amdgpu_kernel void @insert_neg_offset_sgpr_loadreg(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out, <4 x i32> %vec, i32 %offset) {
 entry:
   %index = add i32 %offset, -512
   %value = insertelement <4 x i32> %vec, i32 5, i32 %index
@@ -269,7 +270,7 @@ entry:
 ; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: buffer_store_dword
-define void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @insert_neg_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %index = add i32 %id, -512
@@ -304,7 +305,7 @@ entry:
 ; GCN: s_cbranch_execnz
 
 ; IDXMODE: s_set_gpr_idx_off
-define void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @insert_neg_inline_offset_vgpr(i32 addrspace(1)* %in, <4 x i32> addrspace(1)* %out) {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %index = add i32 %id, -16
@@ -374,7 +375,7 @@ entry:
 
 ; GCN: buffer_store_dword [[MOVREL0]]
 ; GCN: buffer_store_dword [[MOVREL1]]
-define void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @extract_vgpr_offset_multiple_in_block(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %id.ext = zext i32 %id to i64
@@ -449,7 +450,7 @@ bb2:
 ; GCN: buffer_store_dwordx4 v{{\[}}[[VEC_ELT0]]:
 
 ; GCN: buffer_store_dword [[INS0]]
-define void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
+define amdgpu_kernel void @insert_vgpr_offset_multiple_in_block(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, i32 addrspace(1)* %in, <4 x i32> %vec0) #0 {
 entry:
   %id = call i32 @llvm.amdgcn.workitem.id.x() #1
   %id.ext = zext i32 %id to i64
@@ -498,7 +499,7 @@ bb2:
 ; GCN: [[ENDBB]]:
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
-define void @extract_adjacent_blocks(i32 %arg) #0 {
+define amdgpu_kernel void @extract_adjacent_blocks(i32 %arg) #0 {
 bb:
   %tmp = icmp eq i32 %arg, 0
   br i1 %tmp, label %bb1, label %bb4
@@ -548,7 +549,7 @@ bb7:
 ; GCN: [[ENDBB]]:
 ; GCN: buffer_store_dword
 ; GCN: s_endpgm
-define void @insert_adjacent_blocks(i32 %arg, float %val0) #0 {
+define amdgpu_kernel void @insert_adjacent_blocks(i32 %arg, float %val0) #0 {
 bb:
   %tmp = icmp eq i32 %arg, 0
   br i1 %tmp, label %bb1, label %bb4
@@ -609,7 +610,7 @@ bb7:                                              ; preds = %bb4, %bb1
 ; GCN: ds_write_b32
 ; GCN: ds_write_b32
 ; GCN: s_endpgm
-define void @multi_same_block(i32 %arg) #0 {
+define amdgpu_kernel void @multi_same_block(i32 %arg) #0 {
 bb:
   %tmp1 = add i32 %arg, -16
   %tmp2 = insertelement <6 x float> <float 1.700000e+01, float 1.800000e+01, float 1.900000e+01, float 2.000000e+01, float 2.100000e+01, float 2.200000e+01>, float 4.000000e+00, i32 %tmp1
@@ -636,7 +637,7 @@ bb:
 ; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: buffer_store_dword [[EXTRACT]]
-define void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @extract_largest_inbounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
 entry:
   %ld = load volatile <4 x i32>, <4  x i32> addrspace(1)* %in
   %offset = add i32 %idx, 3
@@ -657,7 +658,7 @@ entry:
 ; IDXMODE: s_set_gpr_idx_off
 
 ; GCN: buffer_store_dword [[EXTRACT]]
-define void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
+define amdgpu_kernel void @extract_out_of_bounds_offset(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx) {
 entry:
   %ld = load volatile <4 x i32>, <4  x i32> addrspace(1)* %in
   %offset = add i32 %idx, 4
@@ -680,7 +681,7 @@ entry:
 ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], src0
 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE: s_set_gpr_idx_off
-define void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) {
+define amdgpu_kernel void @extractelement_v4i32_or_index(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %idx.in) {
 entry:
   %ld = load volatile <4 x i32>, <4  x i32> addrspace(1)* %in
   %idx.shl = shl i32 %idx.in, 2
@@ -701,7 +702,7 @@ entry:
 ; IDXMODE: s_set_gpr_idx_on [[IDX_SHL]], dst
 ; IDXMODE: v_mov_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; IDXMODE: s_set_gpr_idx_off
-define void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %idx.in) nounwind {
   %idx.shl = shl i32 %idx.in, 2
   %idx = or i32 %idx.shl, 1
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %idx
@@ -728,7 +729,7 @@ define void @insertelement_v4f32_or_index(<4 x float> addrspace(1)* %out, <4 x f
 ; IDXMODE: s_set_gpr_idx_idx
 ; IDXMODE: v_mov_b32_e32
 ; GCN: s_cbranch_execnz [[REGLOOP]]
-define void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
+define amdgpu_kernel void @broken_phi_bb(i32 %arg, i32 %arg1) #0 {
 bb:
   br label %bb2
 
diff --git a/test/CodeGen/AMDGPU/indirect-private-64.ll b/test/CodeGen/AMDGPU/indirect-private-64.ll
index 4db87c3c1b64a5ca3cbd1cdc648436f019ddb44b..7f08a89d149e97c260561c0261e911a27b2de1b7 100644
--- a/test/CodeGen/AMDGPU/indirect-private-64.ll
+++ b/test/CodeGen/AMDGPU/indirect-private-64.ll
@@ -20,10 +20,10 @@ declare void @llvm.amdgcn.s.barrier() #0
 ; SI-PROMOTE: ds_read_b64
 ; CI-PROMOTE: ds_write_b64
 ; CI-PROMOTE: ds_read_b64
-define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 {
+define amdgpu_kernel void @private_access_f64_alloca(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in, i32 %b) #1 {
   %val = load double, double addrspace(1)* %in, align 8
-  %array = alloca [16 x double], align 8
-  %ptr = getelementptr inbounds [16 x double], [16 x double]* %array, i32 0, i32 %b
+  %array = alloca [8 x double], align 8
+  %ptr = getelementptr inbounds [8 x double], [8 x double]* %array, i32 0, i32 %b
   store double %val, double* %ptr, align 8
   call void @llvm.amdgcn.s.barrier()
   %result = load double, double* %ptr, align 8
@@ -51,10 +51,10 @@ define void @private_access_f64_alloca(double addrspace(1)* noalias %out, double
 ; SI-PROMOTE: ds_read_b64
 ; CI-PROMOTE: ds_write2_b64
 ; CI-PROMOTE: ds_read2_b64
-define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 {
+define amdgpu_kernel void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out, <2 x double> addrspace(1)* noalias %in, i32 %b) #1 {
   %val = load <2 x double>, <2 x double> addrspace(1)* %in, align 16
-  %array = alloca [8 x <2 x double>], align 16
-  %ptr = getelementptr inbounds [8 x <2 x double>], [8 x <2 x double>]* %array, i32 0, i32 %b
+  %array = alloca [4 x <2 x double>], align 16
+  %ptr = getelementptr inbounds [4 x <2 x double>], [4 x <2 x double>]* %array, i32 0, i32 %b
   store <2 x double> %val, <2 x double>* %ptr, align 16
   call void @llvm.amdgcn.s.barrier()
   %result = load <2 x double>, <2 x double>* %ptr, align 16
@@ -77,7 +77,7 @@ define void @private_access_v2f64_alloca(<2 x double> addrspace(1)* noalias %out
 ; SI-PROMOTE: ds_read_b64
 ; CI-PROMOTE: ds_write_b64
 ; CI-PROMOTE: ds_read_b64
-define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 {
+define amdgpu_kernel void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in, i32 %b) #1 {
   %val = load i64, i64 addrspace(1)* %in, align 8
   %array = alloca [8 x i64], align 8
   %ptr = getelementptr inbounds [8 x i64], [8 x i64]* %array, i32 0, i32 %b
@@ -109,10 +109,10 @@ define void @private_access_i64_alloca(i64 addrspace(1)* noalias %out, i64 addrs
 ; SI-PROMOTE: ds_read_b64
 ; CI-PROMOTE: ds_write2_b64
 ; CI-PROMOTE: ds_read2_b64
-define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 {
+define amdgpu_kernel void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <2 x i64> addrspace(1)* noalias %in, i32 %b) #1 {
   %val = load <2 x i64>, <2 x i64> addrspace(1)* %in, align 16
-  %array = alloca [8 x <2 x i64>], align 16
-  %ptr = getelementptr inbounds [8 x <2 x i64>], [8 x <2 x i64>]* %array, i32 0, i32 %b
+  %array = alloca [4 x <2 x i64>], align 16
+  %ptr = getelementptr inbounds [4 x <2 x i64>], [4 x <2 x i64>]* %array, i32 0, i32 %b
   store <2 x i64> %val, <2 x i64>* %ptr, align 16
   call void @llvm.amdgcn.s.barrier()
   %result = load <2 x i64>, <2 x i64>* %ptr, align 16
@@ -121,4 +121,4 @@ define void @private_access_v2i64_alloca(<2 x i64> addrspace(1)* noalias %out, <
 }
 
 attributes #0 = { convergent nounwind }
-attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="64,64" }
+attributes #1 = { nounwind "amdgpu-waves-per-eu"="1,2" "amdgpu-flat-work-group-size"="64,128" }
diff --git a/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll b/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll
index 990f33518ab915eb8619e99e9917e8709cedd778..7cee8a41c1206c7b47999306b5c6b0eb27048a86 100644
--- a/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll
+++ b/test/CodeGen/AMDGPU/infinite-loop-evergreen.ll
@@ -2,7 +2,7 @@
 ; REQUIRES: asserts
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s
 
-define void @inf_loop_irreducible_cfg() nounwind {
+define amdgpu_kernel void @inf_loop_irreducible_cfg() nounwind {
 entry:
   br label %block
 
diff --git a/test/CodeGen/AMDGPU/infinite-loop.ll b/test/CodeGen/AMDGPU/infinite-loop.ll
index 3e0b695934c7fcea95c750bc63200fce7698a678..73482756b8c804ed7be0ae7a151e92bfb96d400d 100644
--- a/test/CodeGen/AMDGPU/infinite-loop.ll
+++ b/test/CodeGen/AMDGPU/infinite-loop.ll
@@ -7,7 +7,7 @@
 ; SI: buffer_store_dword [[REG]]
 ; SI: s_waitcnt vmcnt(0) expcnt(0)
 ; SI: s_branch BB0_1
-define void @infinite_loop(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @infinite_loop(i32 addrspace(1)* %out) {
 entry:
   br label %for.body
 
diff --git a/test/CodeGen/AMDGPU/inline-asm.ll b/test/CodeGen/AMDGPU/inline-asm.ll
index db1a0c67436d825b4409f3c7debafce6de02e04d..5d49b11f0d416b39374e75c8e158bfbdae20c608 100644
--- a/test/CodeGen/AMDGPU/inline-asm.ll
+++ b/test/CodeGen/AMDGPU/inline-asm.ll
@@ -4,7 +4,7 @@
 ; CHECK-LABEL: {{^}}inline_asm:
 ; CHECK: s_endpgm
 ; CHECK: s_endpgm
-define void @inline_asm(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @inline_asm(i32 addrspace(1)* %out) {
 entry:
   store i32 5, i32 addrspace(1)* %out
   call void asm sideeffect "s_endpgm", ""()
@@ -25,7 +25,7 @@ entry:
 ; Make sure inline assembly is treted as divergent.
 ; CHECK: s_mov_b32 s{{[0-9]+}}, 0
 ; CHECK: s_and_saveexec_b64
-define void @branch_on_asm(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @branch_on_asm(i32 addrspace(1)* %out) {
 	%zero = call i32 asm "s_mov_b32 $0, 0", "=s"()
 	%cmp = icmp eq i32 %zero, 0
 	br i1 %cmp, label %if, label %endif
@@ -44,7 +44,7 @@ endif:
 ; CHECK-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], s[[MASK_LO]]
 ; CHECK-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], s[[MASK_HI]]
 ; CHECK: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) {
   %sgpr = tail call i64 asm "v_cmp_ne_u32_e64 $0, 0, $1", "=s,v"(i32 %in)
   store i64 %sgpr, i64 addrspace(1)* %out
   ret void
@@ -52,7 +52,7 @@ define void @v_cmp_asm(i64 addrspace(1)* %out, i32 %in) {
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm:
 ; CHECK: codeLenInByte = 12
-define void @code_size_inline_asm(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "v_nop_e64", ""()
   ret void
@@ -61,7 +61,7 @@ entry:
 ; All inlineasm instructions are assumed to be the maximum size
 ; CHECK-LABEL: {{^}}code_size_inline_asm_small_inst:
 ; CHECK: codeLenInByte = 12
-define void @code_size_inline_asm_small_inst(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_small_inst(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "v_nop_e32", ""()
   ret void
@@ -69,7 +69,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst:
 ; CHECK: codeLenInByte = 20
-define void @code_size_inline_asm_2_inst(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_2_inst(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "
     v_nop_e64
@@ -80,7 +80,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_2_inst_extra_newline:
 ; CHECK: codeLenInByte = 20
-define void @code_size_inline_asm_2_inst_extra_newline(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_2_inst_extra_newline(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "
     v_nop_e64
@@ -92,7 +92,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_0_inst:
 ; CHECK: codeLenInByte = 4
-define void @code_size_inline_asm_0_inst(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_0_inst(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "", ""()
   ret void
@@ -100,7 +100,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment:
 ; CHECK: codeLenInByte = 4
-define void @code_size_inline_asm_1_comment(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_1_comment(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "; comment", ""()
   ret void
@@ -108,7 +108,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_newline_1_comment:
 ; CHECK: codeLenInByte = 4
-define void @code_size_inline_asm_newline_1_comment(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_newline_1_comment(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "
 ; comment", ""()
@@ -117,7 +117,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_1_comment_newline:
 ; CHECK: codeLenInByte = 4
-define void @code_size_inline_asm_1_comment_newline(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_1_comment_newline(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "; comment
 ", ""()
@@ -126,7 +126,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line:
 ; CHECK: codeLenInByte = 4
-define void @code_size_inline_asm_2_comments_line(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_2_comments_line(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "; first comment ; second comment", ""()
   ret void
@@ -134,7 +134,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_2_comments_line_nospace:
 ; CHECK: codeLenInByte = 4
-define void @code_size_inline_asm_2_comments_line_nospace(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_2_comments_line_nospace(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "; first comment;second comment", ""()
   ret void
@@ -142,7 +142,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments0:
 ; CHECK: codeLenInByte = 20
-define void @code_size_inline_asm_mixed_comments0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_mixed_comments0(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "; comment
     v_nop_e64 ; inline comment
@@ -157,7 +157,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments1:
 ; CHECK: codeLenInByte = 20
-define void @code_size_inline_asm_mixed_comments1(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_mixed_comments1(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "v_nop_e64 ; inline comment
 ; separate comment
@@ -171,7 +171,7 @@ entry:
 
 ; CHECK-LABEL: {{^}}code_size_inline_asm_mixed_comments_operands:
 ; CHECK: codeLenInByte = 20
-define void @code_size_inline_asm_mixed_comments_operands(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @code_size_inline_asm_mixed_comments_operands(i32 addrspace(1)* %out) {
 entry:
   call void asm sideeffect "; comment
     v_add_i32_e32 v0, vcc, v1, v2 ; inline comment
@@ -183,3 +183,16 @@ entry:
   ", ""()
   ret void
 }
+
+; FIXME: Should not have intermediate sgprs
+; CHECK-LABEL: {{^}}i64_imm_input_phys_vgpr:
+; CHECK: s_mov_b32 s1, 0
+; CHECK: s_mov_b32 s0, 0x1e240
+; CHECK: v_mov_b32_e32 v0, s0
+; CHECK: v_mov_b32_e32 v1, s1
+; CHECK: use v[0:1]
+define void @i64_imm_input_phys_vgpr() {
+entry:
+  call void asm sideeffect "; use $0 ", "{VGPR0_VGPR1}"(i64 123456)
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/inline-calls.ll b/test/CodeGen/AMDGPU/inline-calls.ll
index 4541a902c1b8e69768d967013719e52740b608cb..f8821f319893c72a0210b88ebe81b87c28010698 100644
--- a/test/CodeGen/AMDGPU/inline-calls.ll
+++ b/test/CodeGen/AMDGPU/inline-calls.ll
@@ -11,7 +11,7 @@ entry:
 
 ; CHECK: {{^}}kernel:
 ; CHECK-NOT: call
-define void @kernel(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @kernel(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = call i32 @func(i32 1)
   store i32 %tmp0, i32 addrspace(1)* %out
@@ -20,7 +20,7 @@ entry:
 
 ; CHECK: {{^}}kernel2:
 ; CHECK-NOT: call
-define void @kernel2(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @kernel2(i32 addrspace(1)* %out) {
 entry:
   call void @kernel(i32 addrspace(1)* %out)
   ret void
@@ -31,7 +31,7 @@ entry:
 
 ; CHECK: {{^}}kernel3:
 ; CHECK-NOT: call
-define void @kernel3(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @kernel3(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = call i32 @func_alias(i32 1)
   store i32 %tmp0, i32 addrspace(1)* %out
@@ -43,7 +43,7 @@ entry:
 
 ; CHECK: {{^}}kernel4:
 ; CHECK-NOT: call
-define void @kernel4(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @kernel4(i32 addrspace(1)* %out) {
 entry:
   call void @kernel_alias(i32 addrspace(1)* %out)
   ret void
diff --git a/test/CodeGen/AMDGPU/inline-constraints.ll b/test/CodeGen/AMDGPU/inline-constraints.ll
index 1bcbd14009ce3c40f97553a38bd9b4e64ee7d6c4..941a1b90dcc119680149cabc3a1fb097a66a92b9 100644
--- a/test/CodeGen/AMDGPU/inline-constraints.ll
+++ b/test/CodeGen/AMDGPU/inline-constraints.ll
@@ -10,7 +10,7 @@
 ; GCN: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
 ; GCN: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}]
 
-define void @inline_reg_constraints(i32 addrspace(1)* %ptr) {
+define amdgpu_kernel void @inline_reg_constraints(i32 addrspace(1)* %ptr) {
 entry:
   %v32 = tail call i32 asm sideeffect "flat_load_dword   $0, $1", "=v,v"(i32 addrspace(1)* %ptr)
   %v64 = tail call <2 x i32> asm sideeffect "flat_load_dwordx2 $0, $1", "=v,v"(i32 addrspace(1)* %ptr)
@@ -27,7 +27,7 @@ entry:
 ; GCN: s_mov_b32 m0, -1
 ; GCN: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
 ; GCN: ; use [[COPY_M0]]
-define void @inline_sreg_constraint_m0() {
+define amdgpu_kernel void @inline_sreg_constraint_m0() {
   %m0 = tail call i32 asm sideeffect "s_mov_b32 m0, -1", "={M0}"()
   tail call void asm sideeffect "; use $0", "s"(i32 %m0)
   ret void
@@ -36,7 +36,7 @@ define void @inline_sreg_constraint_m0() {
 ; GCN-LABEL: {{^}}inline_sreg_constraint_imm_i32:
 ; GCN: s_mov_b32 [[REG:s[0-9]+]], 32
 ; GCN: ; use [[REG]]
-define void @inline_sreg_constraint_imm_i32() {
+define amdgpu_kernel void @inline_sreg_constraint_imm_i32() {
   tail call void asm sideeffect "; use $0", "s"(i32 32)
   ret void
 }
@@ -44,7 +44,7 @@ define void @inline_sreg_constraint_imm_i32() {
 ; GCN-LABEL: {{^}}inline_sreg_constraint_imm_f32:
 ; GCN: s_mov_b32 [[REG:s[0-9]+]], 1.0
 ; GCN: ; use [[REG]]
-define void @inline_sreg_constraint_imm_f32() {
+define amdgpu_kernel void @inline_sreg_constraint_imm_f32() {
   tail call void asm sideeffect "; use $0", "s"(float 1.0)
   ret void
 }
@@ -54,7 +54,7 @@ define void @inline_sreg_constraint_imm_f32() {
 ; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], -4{{$}}
 ; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], -1{{$}}
 ; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
-define void @inline_sreg_constraint_imm_i64() {
+define amdgpu_kernel void @inline_sreg_constraint_imm_i64() {
   tail call void asm sideeffect "; use $0", "s"(i64 -4)
   ret void
 }
@@ -63,7 +63,7 @@ define void @inline_sreg_constraint_imm_i64() {
 ; GCN-DAG: s_mov_b32 s[[REG_LO:[0-9]+]], 0{{$}}
 ; GCN-DAG: s_mov_b32 s[[REG_HI:[0-9]+]], 0x3ff00000{{$}}
 ; GCN: ; use s{{\[}}[[REG_LO]]:[[REG_HI]]{{\]}}
-define void @inline_sreg_constraint_imm_f64() {
+define amdgpu_kernel void @inline_sreg_constraint_imm_f64() {
   tail call void asm sideeffect "; use $0", "s"(double 1.0)
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/inlineasm-16.ll b/test/CodeGen/AMDGPU/inlineasm-16.ll
index 75f3158937dc80f0e06cd30544f9afe5109ac4a8..15e57fe6bffbc5e7115fd79730d5b90dbdb8fe72 100644
--- a/test/CodeGen/AMDGPU/inlineasm-16.ll
+++ b/test/CodeGen/AMDGPU/inlineasm-16.ll
@@ -5,7 +5,7 @@
 ; GCN-LABEL: {{^}}s_input_output_i16:
 ; SICI: error: couldn't allocate output register for constraint 's'
 ; SICI: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_i16() #0 {
+define amdgpu_kernel void @s_input_output_i16() #0 {
   %v = tail call i16 asm sideeffect "s_mov_b32 $0, -1", "=s"()
   tail call void asm sideeffect "; use $0", "s"(i16 %v) #0
   ret void
@@ -14,7 +14,7 @@ define void @s_input_output_i16() #0 {
 ; GCN-LABEL: {{^}}v_input_output_i16:
 ; SICI: error: couldn't allocate output register for constraint 'v'
 ; SICI: error: couldn't allocate input reg for constraint 'v'
-define void @v_input_output_i16() #0 {
+define amdgpu_kernel void @v_input_output_i16() #0 {
   %v = tail call i16 asm sideeffect "v_mov_b32 $0, -1", "=v"() #0
   tail call void asm sideeffect "; use $0", "v"(i16 %v)
   ret void
@@ -23,7 +23,7 @@ define void @v_input_output_i16() #0 {
 ; GCN-LABEL: {{^}}s_input_output_f16:
 ; SICI: error: couldn't allocate output register for constraint 's'
 ; SICI: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_f16() #0 {
+define amdgpu_kernel void @s_input_output_f16() #0 {
   %v = tail call half asm sideeffect "s_mov_b32 $0, -1", "=s"() #0
   tail call void asm sideeffect "; use $0", "s"(half %v)
   ret void
@@ -32,7 +32,7 @@ define void @s_input_output_f16() #0 {
 ; GCN-LABEL: {{^}}v_input_output_f16:
 ; SICI: error: couldn't allocate output register for constraint 'v'
 ; SICI: error: couldn't allocate input reg for constraint 'v'
-define void @v_input_output_f16() #0 {
+define amdgpu_kernel void @v_input_output_f16() #0 {
   %v = tail call half asm sideeffect "v_mov_b32 $0, -1", "=v"() #0
   tail call void asm sideeffect "; use $0", "v"(half %v)
   ret void
diff --git a/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll b/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll
index 2eb21f07e0ec174484ac28889002dd463fce54ec..c1d67ba614c6dda0f89d2f892a03c219b4380024 100644
--- a/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll
+++ b/test/CodeGen/AMDGPU/inlineasm-illegal-type.ll
@@ -3,7 +3,7 @@
 
 ; GCN: error: couldn't allocate output register for constraint 's'
 ; GCN: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_i8() {
+define amdgpu_kernel void @s_input_output_i8() {
   %v = tail call i8 asm sideeffect "s_mov_b32 $0, -1", "=s"()
   tail call void asm sideeffect "; use $0", "s"(i8 %v)
   ret void
@@ -11,7 +11,7 @@ define void @s_input_output_i8() {
 
 ; GCN: error: couldn't allocate output register for constraint 'v'
 ; GCN: error: couldn't allocate input reg for constraint 'v'
-define void @v_input_output_i8() {
+define amdgpu_kernel void @v_input_output_i8() {
   %v = tail call i8 asm sideeffect "v_mov_b32 $0, -1", "=v"()
   tail call void asm sideeffect "; use $0", "v"(i8 %v)
   ret void
@@ -19,7 +19,7 @@ define void @v_input_output_i8() {
 
 ; GCN: error: couldn't allocate output register for constraint 's'
 ; GCN: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_i128() {
+define amdgpu_kernel void @s_input_output_i128() {
   %v = tail call i128 asm sideeffect "s_mov_b32 $0, -1", "=s"()
   tail call void asm sideeffect "; use $0", "s"(i128 %v)
   ret void
@@ -27,7 +27,7 @@ define void @s_input_output_i128() {
 
 ; GCN: error: couldn't allocate output register for constraint 's'
 ; GCN: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_v8f16() {
+define amdgpu_kernel void @s_input_output_v8f16() {
   %v = tail call <8 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"()
   tail call void asm sideeffect "; use $0", "s"(<8 x half> %v)
   ret void
@@ -36,7 +36,7 @@ define void @s_input_output_v8f16() {
 ; CI: error: couldn't allocate output register for constraint 's'
 ; CI: error: couldn't allocate input reg for constraint 's'
 ; VI-NOT: error
-define void @s_input_output_f16() {
+define amdgpu_kernel void @s_input_output_f16() {
   %v = tail call half asm sideeffect "s_mov_b32 $0, -1", "=s"()
   tail call void asm sideeffect "; use $0", "s"(half %v)
   ret void
@@ -44,7 +44,7 @@ define void @s_input_output_f16() {
 
 ; GCN: error: couldn't allocate output register for constraint 's'
 ; GCN: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_v2f16() {
+define amdgpu_kernel void @s_input_output_v2f16() {
   %v = tail call <2 x half> asm sideeffect "s_mov_b32 $0, -1", "=s"()
   tail call void asm sideeffect "; use $0", "s"(<2 x half> %v)
   ret void
@@ -52,7 +52,7 @@ define void @s_input_output_v2f16() {
 
 ; GCN: error: couldn't allocate output register for constraint 'v'
 ; GCN: error: couldn't allocate input reg for constraint 'v'
-define void @v_input_output_v2f16() {
+define amdgpu_kernel void @v_input_output_v2f16() {
   %v = tail call <2 x half> asm sideeffect "v_mov_b32 $0, -1", "=v"()
   tail call void asm sideeffect "; use $0", "v"(<2 x half> %v)
   ret void
@@ -61,7 +61,7 @@ define void @v_input_output_v2f16() {
 ; CI: error: couldn't allocate output register for constraint 's'
 ; CI: error: couldn't allocate input reg for constraint 's'
 ; VI-NOT: error
-define void @s_input_output_i16() {
+define amdgpu_kernel void @s_input_output_i16() {
   %v = tail call i16 asm sideeffect "s_mov_b32 $0, -1", "=s"()
   tail call void asm sideeffect "; use $0", "s"(i16 %v)
   ret void
@@ -69,14 +69,14 @@ define void @s_input_output_i16() {
 
 ; GCN: error: couldn't allocate output register for constraint 's'
 ; GCN: error: couldn't allocate input reg for constraint 's'
-define void @s_input_output_v2i16() {
+define amdgpu_kernel void @s_input_output_v2i16() {
   %v = tail call <2 x i16> asm sideeffect "s_mov_b32 $0, -1", "=s"()
   tail call void asm sideeffect "; use $0", "s"(<2 x i16> %v)
   ret void
 }
 
 ; FIXME: Crash in codegen prepare
-; define void @s_input_output_i3() {
+; define amdgpu_kernel void @s_input_output_i3() {
 ;   %v = tail call i3 asm sideeffect "s_mov_b32 $0, -1", "=s"()
 ;   tail call void asm sideeffect "; use $0", "s"(i3 %v)
 ;   ret void
diff --git a/test/CodeGen/AMDGPU/inlineasm-packed.ll b/test/CodeGen/AMDGPU/inlineasm-packed.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3c6c7e1d1b42641a013d3dc1abb360ad865a4ee1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/inlineasm-packed.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}inline_asm_input_v2i16:
+; GCN: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @inline_asm_input_v2i16(i32 addrspace(1)* %out, <2 x i16> %in) #0 {
+entry:
+  %val = call i32 asm "s_mov_b32 $0, $1", "=r,r"(<2 x i16> %in) #0
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}inline_asm_input_v2f16:
+; GCN: s_mov_b32 s0, s{{[0-9]+}}
+define amdgpu_kernel void @inline_asm_input_v2f16(i32 addrspace(1)* %out, <2 x half> %in) #0 {
+entry:
+  %val = call i32 asm "s_mov_b32 $0, $1", "=r,r"(<2 x half> %in) #0
+  store i32 %val, i32 addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}inline_asm_output_v2i16:
+; GCN: s_mov_b32 s{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @inline_asm_output_v2i16(<2 x i16> addrspace(1)* %out, i32 %in) #0 {
+entry:
+  %val = call <2 x i16> asm "s_mov_b32 $0, $1", "=r,r"(i32 %in) #0
+  store <2 x i16> %val, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}inline_asm_output_v2f16:
+; GCN: v_mov_b32 v{{[0-9]+}}, s{{[0-9]+}}
+define amdgpu_kernel void @inline_asm_output_v2f16(<2 x half> addrspace(1)* %out, i32 %in) #0 {
+entry:
+  %val = call <2 x half> asm "v_mov_b32 $0, $1", "=v,r"(i32 %in) #0
+  store <2 x half> %val, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}inline_asm_packed_v2i16:
+; GCN: v_pk_add_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @inline_asm_packed_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %in0, <2 x i16> %in1) #0 {
+entry:
+  %val = call <2 x i16> asm "v_pk_add_u16 $0, $1, $2", "=v,r,v"(<2 x i16> %in0, <2 x i16> %in1) #0
+  store <2 x i16> %val, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}inline_asm_packed_v2f16:
+; GCN: v_pk_add_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @inline_asm_packed_v2f16(<2 x half> addrspace(1)* %out, <2 x half> %in0, <2 x half> %in1) #0 {
+entry:
+  %val = call <2 x half> asm "v_pk_add_f16 $0, $1, $2", "=v,r,v"(<2 x half> %in0, <2 x half> %in1) #0
+  store <2 x half> %val, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/insert-waits-callee.mir b/test/CodeGen/AMDGPU/insert-waits-callee.mir
new file mode 100644
index 0000000000000000000000000000000000000000..ad7cd0cc8abf7705097d4dbce502ba46ae95a8dd
--- /dev/null
+++ b/test/CodeGen/AMDGPU/insert-waits-callee.mir
@@ -0,0 +1,25 @@
+# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s
+--- |
+  define float @entry_callee_wait(float %arg) #0 {
+    ret float %arg
+  }
+
+  attributes #0 = { nounwind }
+...
+---
+# CHECK-LABEL: name: entry_callee_wait{{$}}
+# CHECK: bb.0:
+# CHECK-NEXT: S_WAITCNT 0{{$}}
+# CHECK-NEXT: V_ADD_F32
+# CHECK-NEXT: S_SETPC_B64
+liveins:
+  - { reg: '%sgpr0_sgpr1' }
+  - { reg: '%vgpr0' }
+
+name: entry_callee_wait
+body:             |
+  bb.0:
+    %vgpr0 = V_ADD_F32_e32 %vgpr0, %vgpr0, implicit %exec
+    S_SETPC_B64 killed %sgpr0_sgpr1
+
+...
diff --git a/test/CodeGen/AMDGPU/insert-waits-exp.mir b/test/CodeGen/AMDGPU/insert-waits-exp.mir
index 9aaa374ed28e67ac28cf29353a9c8efaea558de8..1055201ce3dd5dfaaa0b23ddf05d05bd47b0e328 100644
--- a/test/CodeGen/AMDGPU/insert-waits-exp.mir
+++ b/test/CodeGen/AMDGPU/insert-waits-exp.mir
@@ -1,18 +1,18 @@
 # RUN: llc -march=amdgcn -verify-machineinstrs -run-pass si-insert-waits -o - %s | FileCheck %s
 --- |
-  define amdgpu_ps <4 x float> @exp_done_waitcnt(<4 x i32> inreg, <4 x i32> inreg, i32 inreg %w, float %v) {
+  define amdgpu_ps <4 x float> @exp_done_waitcnt(<4 x i32> inreg, <4 x
+  i32> inreg, i32 inreg %w, float %v) #0 {
     %a = load volatile float, float addrspace(1)* undef
     %b = load volatile float, float addrspace(1)* undef
     %c = load volatile float, float addrspace(1)* undef
     %d = load volatile float, float addrspace(1)* undef
-    call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %a, float %b, float %c, float %d)
+    call void @llvm.amdgcn.exp.f32(i32 15, i32 1, float %a, float %b, float %c, float %d, i1 true, i1 false)
     ret <4 x float> <float 5.000000e-01, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>
   }
 
-  declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+  declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 
-  attributes #0 = { readnone }
-  attributes #1 = { nounwind }
+  attributes #0 = { nounwind }
 
 ...
 ---
@@ -58,6 +58,6 @@ body:             |
     %vgpr1 = V_MOV_B32_e32 1065353216, implicit %exec
     %vgpr2 = V_MOV_B32_e32 1073741824, implicit %exec
     %vgpr3 = V_MOV_B32_e32 1082130432, implicit %exec
-    SI_RETURN killed %vgpr0, killed %vgpr1, killed %vgpr2, killed %vgpr3
+    SI_RETURN_TO_EPILOG killed %vgpr0, killed %vgpr1, killed %vgpr2, killed %vgpr3
 
 ...
diff --git a/test/CodeGen/AMDGPU/insert_subreg.ll b/test/CodeGen/AMDGPU/insert_subreg.ll
index 4a5e8869c2df1bbb7f45f64dbd88d1b3b915e4b0..e895f27c886dac46f638221d293c742af8998888 100644
--- a/test/CodeGen/AMDGPU/insert_subreg.ll
+++ b/test/CodeGen/AMDGPU/insert_subreg.ll
@@ -6,7 +6,7 @@
 
 ; Make sure this doesn't crash
 ; CHECK-LABEL: test:
-define void @test(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i64 addrspace(1)* %out) {
 entry:
   %tmp0 = alloca [16 x i32]
   %tmp1 = ptrtoint [16 x i32]* %tmp0 to i32
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.ll b/test/CodeGen/AMDGPU/insert_vector_elt.ll
index 65ac693a4f44405097d8fc308619d7455f0f4aed..6391b6b5407b0bb8d9b0da7c58a97245e14df477 100644
--- a/test/CodeGen/AMDGPU/insert_vector_elt.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=GCN-NO-TONGA %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+max-private-element-size-16 < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=GCN-TONGA %s
 
 ; FIXME: Broken on evergreen
 ; FIXME: For some reason the 8 and 16 vectors are being stored as
@@ -18,56 +18,56 @@
 ; GCN-DAG: s_mov_b32 [[CONSTREG:s[0-9]+]], 0x40a00000
 ; GCN-DAG: v_mov_b32_e32 v[[LOW_REG:[0-9]+]], [[CONSTREG]]
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LOW_REG]]:
-define void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_0(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 0
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}insertelement_v4f32_1:
-define void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_1(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 1
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}insertelement_v4f32_2:
-define void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_2(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 2
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}insertelement_v4f32_3:
-define void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4f32_3(<4 x float> addrspace(1)* %out, <4 x float> %a) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}insertelement_v4i32_0:
-define void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
+define amdgpu_kernel void @insertelement_v4i32_0(<4 x i32> addrspace(1)* %out, <4 x i32> %a) nounwind {
   %vecins = insertelement <4 x i32> %a, i32 999, i32 0
   store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}insertelement_v3f32_1:
-define void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v3f32_1(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
   %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 1
   store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}insertelement_v3f32_2:
-define void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v3f32_2(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
   %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 2
   store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
   ret void
 }
 
 ; GCN-LABEL: {{^}}insertelement_v3f32_3:
-define void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
+define amdgpu_kernel void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %a) nounwind {
   %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 3
   store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
   ret void
@@ -78,7 +78,7 @@ define void @insertelement_v3f32_3(<3 x float> addrspace(1)* %out, <3 x float> %
 define amdgpu_ps <4 x float> @insertelement_to_sgpr() nounwind {
   %tmp = load <4 x i32>, <4 x i32> addrspace(2)* undef
   %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
-  %tmp2 = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> %tmp1, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp2 = call <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 false, i1 false, i1 false, i1 false, i1 true)
   ret <4 x float> %tmp2
 }
 
@@ -86,7 +86,7 @@ define amdgpu_ps <4 x float> @insertelement_to_sgpr() nounwind {
 ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
 ; GCN: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
-define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x float> %a, float 5.000000e+00, i32 %b
   store <2 x float> %vecins, <2 x float> addrspace(1)* %out, align 8
   ret void
@@ -97,7 +97,7 @@ define void @dynamic_insertelement_v2f32(<2 x float> addrspace(1)* %out, <2 x fl
 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
 ; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
 ; GCN-DAG: buffer_store_dword v
-define void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <3 x float> %a, float 5.000000e+00, i32 %b
   store <3 x float> %vecins, <3 x float> addrspace(1)* %out, align 16
   ret void
@@ -107,7 +107,7 @@ define void @dynamic_insertelement_v3f32(<3 x float> addrspace(1)* %out, <3 x fl
 ; GCN: v_mov_b32_e32 [[CONST:v[0-9]+]], 0x40a00000
 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], [[CONST]]
 ; GCN: buffer_store_dwordx4 {{v\[}}[[LOW_RESULT_REG]]:
-define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 %b
   store <4 x float> %vecins, <4 x float> addrspace(1)* %out, align 16
   ret void
@@ -117,7 +117,7 @@ define void @dynamic_insertelement_v4f32(<4 x float> addrspace(1)* %out, <4 x fl
 ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <8 x float> %a, float 5.000000e+00, i32 %b
   store <8 x float> %vecins, <8 x float> addrspace(1)* %out, align 32
   ret void
@@ -129,7 +129,7 @@ define void @dynamic_insertelement_v8f32(<8 x float> addrspace(1)* %out, <8 x fl
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x float> %a, i32 %b) nounwind {
   %vecins = insertelement <16 x float> %a, float 5.000000e+00, i32 %b
   store <16 x float> %vecins, <16 x float> addrspace(1)* %out, align 64
   ret void
@@ -138,7 +138,7 @@ define void @dynamic_insertelement_v16f32(<16 x float> addrspace(1)* %out, <16 x
 ; GCN-LABEL: {{^}}dynamic_insertelement_v2i32:
 ; GCN: v_movreld_b32
 ; GCN: buffer_store_dwordx2
-define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x i32> %a, i32 5, i32 %b
   store <2 x i32> %vecins, <2 x i32> addrspace(1)* %out, align 8
   ret void
@@ -148,7 +148,7 @@ define void @dynamic_insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32>
 ; GCN: v_movreld_b32_e32 v[[LOW_RESULT_REG:[0-9]+]], 5
 ; GCN-DAG: buffer_store_dwordx2 {{v\[}}[[LOW_RESULT_REG]]:
 ; GCN-DAG: buffer_store_dword v
-define void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <3 x i32> %a, i32 5, i32 %b
   store <3 x i32> %vecins, <3 x i32> addrspace(1)* %out, align 16
   ret void
@@ -159,7 +159,7 @@ define void @dynamic_insertelement_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32>
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[SVAL]]
 ; GCN: v_movreld_b32_e32 v{{[0-9]+}}, [[VVAL]]
 ; GCN: buffer_store_dwordx4
-define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, i32 %val) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, i32 %b, i32 %val) nounwind {
   %vecins = insertelement <4 x i32> %a, i32 %val, i32 %b
   store <4 x i32> %vecins, <4 x i32> addrspace(1)* %out, align 16
   ret void
@@ -169,7 +169,7 @@ define void @dynamic_insertelement_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32>
 ; GCN: v_movreld_b32
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <8 x i32> %a, i32 5, i32 %b
   store <8 x i32> %vecins, <8 x i32> addrspace(1)* %out, align 32
   ret void
@@ -181,21 +181,21 @@ define void @dynamic_insertelement_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32>
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-define void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> %a, i32 %b) nounwind {
   %vecins = insertelement <16 x i32> %a, i32 5, i32 %b
   store <16 x i32> %vecins, <16 x i32> addrspace(1)* %out, align 64
   ret void
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v2i16:
-define void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x i16> %a, i16 5, i32 %b
   store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v3i16:
-define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, i32 %b) nounwind {
   %vecins = insertelement <3 x i16> %a, i16 5, i32 %b
   store <3 x i16> %vecins, <3 x i16> addrspace(1)* %out, align 8
   ret void
@@ -207,25 +207,22 @@ define void @dynamic_insertelement_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16>
 ; GCN: buffer_load_ushort v{{[0-9]+}}, off
 ; GCN: buffer_load_ushort v{{[0-9]+}}, off
 
-; GCN-DAG: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 0{{$}}
+; GCN-DAG: v_mov_b32_e32 [[BASE_FI:v[0-9]+]], 8{{$}}
 ; GCN-DAG: s_and_b32 [[MASK_IDX:s[0-9]+]], s{{[0-9]+}}, 3{{$}}
 ; GCN-DAG: v_or_b32_e32 [[IDX:v[0-9]+]], [[MASK_IDX]], [[BASE_FI]]{{$}}
 
-; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6
-; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
-; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2
-; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+$}}
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:14
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:12
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:10
+; GCN-DAG: buffer_store_short v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:8
 ; GCN: buffer_store_short v{{[0-9]+}}, [[IDX]], s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
 
 ; GCN: s_waitcnt
 
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
-; GCN: buffer_load_ushort
+; GCN: buffer_load_dwordx2
 
 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off
-define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x i16> %a, i16 5, i32 %b
   store <4 x i16> %vecins, <4 x i16> addrspace(1)* %out, align 8
   ret void
@@ -235,16 +232,17 @@ define void @dynamic_insertelement_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16>
 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
 
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:1
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
 
 ; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
 
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN-NO-TONGA: buffer_load_ubyte
+; GCN-NO-TONGA: buffer_load_ubyte
+; GCN-TONGA: buffer_load_ushort
 
 ; GCN: buffer_store_short v{{[0-9]+}}, off
-define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x i8> %a, i8 5, i32 %b
   store <2 x i8> %vecins, <2 x i8> addrspace(1)* %out, align 8
   ret void
@@ -255,19 +253,19 @@ define void @dynamic_insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> %a
 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
 
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:1
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6
 
-; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
-
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN-NO-TONGA: buffer_load_ubyte
+; GCN-NO-TONGA: buffer_load_ubyte
+; GCN-NO-TONGA: buffer_load_ubyte
+; GCN-TONGA: buffer_load_ushort
+; GCN-TONGA: buffer_load_ubyte
 
 ; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off
 ; GCN-DAG: buffer_store_short v{{[0-9]+}}, off
-define void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <3 x i8> %a, i8 5, i32 %b
   store <3 x i8> %vecins, <3 x i8> addrspace(1)* %out, align 4
   ret void
@@ -279,34 +277,35 @@ define void @dynamic_insertelement_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> %a
 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
 ; GCN: buffer_load_ubyte v{{[0-9]+}}, off
 
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:3
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:2
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:1
-; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}{{$}}
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:7
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:6
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:5
+; GCN-DAG: buffer_store_byte v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4
 
 ; GCN: buffer_store_byte v{{[0-9]+}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offen{{$}}
 
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
+; GCN-NO-TONGA: buffer_load_ubyte
+; GCN-NO-TONGA: buffer_load_ubyte
+; GCN-NO-TONGA: buffer_load_ubyte
+; GCN-NO-TONGA: buffer_load_ubyte
+; GCN-TONGA: buffer_load_dword
 
 ; GCN: buffer_store_dword v{{[0-9]+}}, off
-define void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x i8> %a, i8 5, i32 %b
   store <4 x i8> %vecins, <4 x i8> addrspace(1)* %out, align 4
   ret void
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v8i8:
-define void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <8 x i8> %a, i8 5, i32 %b
   store <8 x i8> %vecins, <8 x i8> addrspace(1)* %out, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v16i8:
-define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> %a, i32 %b) nounwind {
   %vecins = insertelement <16 x i8> %a, i8 5, i32 %b
   store <16 x i8> %vecins, <16 x i8> addrspace(1)* %out, align 16
   ret void
@@ -315,7 +314,7 @@ define void @dynamic_insertelement_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8>
 ; This test requires handling INSERT_SUBREG in SIFixSGPRCopies.  Check that
 ; the compiler doesn't crash.
 ; GCN-LABEL: {{^}}insert_split_bb:
-define void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
+define amdgpu_kernel void @insert_split_bb(<2 x i32> addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b) {
 entry:
   %0 = insertelement <2 x i32> undef, i32 %a, i32 0
   %1 = icmp eq i32 %a, 0
@@ -362,7 +361,7 @@ endif:
 
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x double> %a, double 8.0, i32 %b
   store <2 x double> %vecins, <2 x double> addrspace(1)* %out, align 16
   ret void
@@ -375,14 +374,14 @@ define void @dynamic_insertelement_v2f64(<2 x double> addrspace(1)* %out, <2 x d
 
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-define void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> %a, i32 %b) nounwind {
   %vecins = insertelement <2 x i64> %a, i64 5, i32 %b
   store <2 x i64> %vecins, <2 x i64> addrspace(1)* %out, align 8
   ret void
 }
 
 ; GCN-LABEL: {{^}}dynamic_insertelement_v3i64:
-define void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %a, i32 %b) nounwind {
   %vecins = insertelement <3 x i64> %a, i64 5, i32 %b
   store <3 x i64> %vecins, <3 x i64> addrspace(1)* %out, align 32
   ret void
@@ -396,15 +395,15 @@ define void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64>
 
 ; Stack store
 
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}}
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}}
 
 ; Write element
 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
 
 ; Stack reload
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}}
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}}
 
 ; Store result
 ; GCN: buffer_store_dwordx4
@@ -412,7 +411,7 @@ define void @dynamic_insertelement_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64>
 ; GCN: s_endpgm
 ; GCN: ScratchSize: 64
 
-define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, i32 %b) nounwind {
   %vecins = insertelement <4 x double> %a, double 8.0, i32 %b
   store <4 x double> %vecins, <4 x double> addrspace(1)* %out, align 16
   ret void
@@ -421,17 +420,17 @@ define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x d
 ; GCN-LABEL: {{^}}dynamic_insertelement_v8f64:
 ; GCN-DAG: SCRATCH_RSRC_DWORD
 
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}}
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}}
-; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}}
-; GCN: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}}
+; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}}
 
 ; GCN: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen{{$}}
 
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:48{{$}}
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:32{{$}}
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:16{{$}}
-; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}}{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:64{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:80{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:96{{$}}
+; GCN-DAG: buffer_load_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:112{{$}}
 
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
@@ -439,10 +438,13 @@ define void @dynamic_insertelement_v4f64(<4 x double> addrspace(1)* %out, <4 x d
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
 ; GCN: ScratchSize: 128
-define void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) nounwind {
+define amdgpu_kernel void @dynamic_insertelement_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, i32 %b) #0 {
   %vecins = insertelement <8 x double> %a, double 8.0, i32 %b
   store <8 x double> %vecins, <8 x double> addrspace(1)* %out, align 16
   ret void
 }
 
-declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) nounwind readnone
+declare <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
index 2cd4d0c6be91ac811f3910fb9d9472848bd8ea7d..a3f82b8a011746f60a187a2ec9cdaaa445098a7b 100644
--- a/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
+++ b/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll
@@ -1,12 +1,16 @@
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=fiji -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI %s
-; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=hawaii -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=gfx901 -enable-amdgpu-aa=0 -mattr=+flat-for-global,-fp64-fp16-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=fiji -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -verify-machineinstrs -march=amdgcn -mcpu=hawaii -enable-amdgpu-aa=0 -mattr=+flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=CIVI -check-prefix=CI %s
 
 ; GCN-LABEL: {{^}}s_insertelement_v2i16_0:
 ; GCN: s_load_dword [[VEC:s[0-9]+]]
 
 ; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x3e7{{$}}
-define void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
+
+; GFX9-NOT: lshr
+; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, 0x3e7, [[VEC]]
+define amdgpu_kernel void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 999, i32 0
   store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
@@ -20,25 +24,106 @@ define void @s_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> add
 ; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
 ; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
-define void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
+
+; GFX9-NOT: [[ELT0]]
+; GFX9-NOT: [[VEC]]
+; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT0]], [[VEC]]
+define amdgpu_kernel void @s_insertelement_v2i16_0_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
   store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
   ret void
 }
 
-; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi:
+; GCN-LABEL: {{^}}s_insertelement_v2i16_0_multi_use_hi_reg:
 ; GCN: s_load_dword [[ELT0:s[0-9]+]]
 ; GCN: s_load_dword [[VEC:s[0-9]+]]
 
+; CIVI-DAG: s_and_b32 [[ELT0]], [[ELT0]], 0xffff{{$}}
+; CIVI: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
+; CIVI: s_lshl_b32 [[ELT1:s[0-9]+]], [[SHR]], 16
+; CIVI-DAG: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
+; CIVI-DAG: ; use [[SHR]]
+
+; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
+; GFX9-DAG: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
+; GFX9-DAG: ; use [[ELT1]]
+define amdgpu_kernel void @s_insertelement_v2i16_0_multi_use_hi_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
+  %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+  %elt1 = extractelement <2 x i16> %vec, i32 1
+  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
+  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+  %use1 = zext i16 %elt1 to i32
+  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi:
+; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
+; GCN: s_load_dword [[VEC:s[0-9]+]]
+
+; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
+; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_ARG]], [[ELT1]]
+
+; GFX9-NOT: [[ELT0]]
+; GFX9-NOT: [[VEC]]
+; GFX9: s_pack_hh_b32_b16 s{{[0-9]+}}, [[ELT_ARG]], [[VEC]]
+define amdgpu_kernel void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
+  %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+  %elt.hi = lshr i32 %elt.arg, 16
+  %elt = trunc i32 %elt.hi to i16
+  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
+  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_multi_use_1:
+; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
+; GCN: s_load_dword [[VEC:s[0-9]+]],
+
 ; CIVI-DAG: s_and_b32 [[ELT1:s[0-9]+]], [[VEC]], 0xffff0000{{$}}
 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
-define void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
+
+; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[ELT_ARG]], 16
+; GFX9: s_pack_lh_b32_b16 s{{[0-9]+}}, [[ELT1]], [[VEC]]
+; GFX9: ; use [[ELT1]]
+define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
+  %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
+  %elt.hi = lshr i32 %elt.arg, 16
+  %elt = trunc i32 %elt.hi to i16
+  %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
+  store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+  %use1 = zext i16 %elt to i32
+  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_insertelement_v2i16_0_reghi_both_multi_use_1:
+; GCN: s_load_dword [[ELT_ARG:s[0-9]+]], s[0:1]
+; GCN: s_load_dword [[VEC:s[0-9]+]],
+
+; CIVI-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
+; CIVI-DAG: s_lshr_b32 [[SHR:s[0-9]+]], [[VEC]], 16
+; CIVI-DAG: s_lshl_b32 [[VEC_HI:s[0-9]+]], [[SHR]], 16
+; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]]
+
+; GFX9-DAG: s_lshr_b32 [[ELT_HI:s[0-9]+]], [[ELT_ARG]], 16
+; GFX9-DAG: s_lshr_b32 [[VEC_HI:s[0-9]+]], [[VEC]], 16
+; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[ELT_HI]], [[VEC_HI]]
+; GFX9: ; use [[ELT_HI]]
+; GFX9: ; use [[VEC_HI]]
+define amdgpu_kernel void @s_insertelement_v2i16_0_reghi_both_multi_use_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 %elt.arg) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %elt.hi = lshr i32 %elt.arg, 16
   %elt = trunc i32 %elt.hi to i16
+  %vec.hi = extractelement <2 x i16> %vec, i32 1
   %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 0
   store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
+  %use1 = zext i16 %elt to i32
+  %vec.hi.use1 = zext i16 %vec.hi to i32
+
+  call void asm sideeffect "; use $0", "s"(i32 %use1) #0
+  call void asm sideeffect "; use $0", "s"(i32 %vec.hi.use1) #0
   ret void
 }
 
@@ -46,9 +131,12 @@ define void @s_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i1
 ; GCN: s_load_dword [[VEC:s[0-9]+]]
 
 ; GCN-NOT: s_lshr
-; GCN: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}}
-; GCN: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x3e70000
-define void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
+
+; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}}
+; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x3e70000
+
+; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x3e7
+define amdgpu_kernel void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 999, i32 1
   store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
@@ -63,7 +151,8 @@ define void @s_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> add
 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT0]], [[ELT1]]
 
 ; GCN-NOT: shlr
-define void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
+; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], [[ELT1]]
+define amdgpu_kernel void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i16 %elt) #0 {
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 %elt, i32 1
   store <2 x i16> %vecins, <2 x i16> addrspace(1)* %out
@@ -74,7 +163,10 @@ define void @s_insertelement_v2i16_1_reg(<2 x i16> addrspace(1)* %out, <2 x i16>
 ; GCN: s_load_dword [[VEC:s[0-9]+]]
 ; CIVI: s_and_b32 [[ELT1:s[0-9]+]], [[VEC:s[0-9]+]], 0xffff0000
 ; CIVI: s_or_b32 s{{[0-9]+}}, [[ELT1]], 0x4500
-define void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
+
+; GFX9: s_lshr_b32 [[ELT1:s[0-9]+]], [[VEC]], 16
+; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, 0x4500, [[ELT1]]
+define amdgpu_kernel void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
   %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 0
   store <2 x half> %vecins, <2 x half> addrspace(1)* %out
@@ -82,11 +174,14 @@ define void @s_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> a
 }
 
 ; GCN-LABEL: {{^}}s_insertelement_v2f16_1:
-; GCN: s_load_dword [[VEC:s[0-9]+]]
+; GFX9: s_load_dword [[VEC:s[0-9]+]]
 ; GCN-NOT: s_lshr
-; GCN: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}}
-; GCN: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x45000000
-define void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
+
+; CIVI: s_and_b32 [[ELT0:s[0-9]+]], [[VEC]], 0xffff{{$}}
+; CIVI: s_or_b32 [[INS:s[0-9]+]], [[ELT0]], 0x45000000
+
+; GFX9: s_pack_ll_b32_b16 s{{[0-9]+}}, [[VEC]], 0x4500
+define amdgpu_kernel void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(2)* %vec.ptr) #0 {
   %vec = load <2 x half>, <2 x half> addrspace(2)* %vec.ptr
   %vecins = insertelement <2 x half> %vec, half 5.000000e+00, i32 1
   store <2 x half> %vecins, <2 x half> addrspace(1)* %out
@@ -94,11 +189,15 @@ define void @s_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> a
 }
 
 ; GCN-LABEL: {{^}}v_insertelement_v2i16_0:
-; GCN: flat_load_dword [[VEC:v[0-9]+]]
+; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
 ; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e7, [[ELT1]]
+
+; GFX9-DAG: s_movk_i32 [[ELT0:s[0-9]+]], 0x3e7{{$}}
+; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
+; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], [[ELT0]], [[VEC]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
-define void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -117,8 +216,12 @@ define void @v_insertelement_v2i16_0(<2 x i16> addrspace(1)* %out, <2 x i16> add
 ; CIVI-DAG: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], [[ELT0_SHIFT]], [[ELT1]]
 
+; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
+; GFX9-DAG: v_lshrrev_b32_e64 [[ELT0_SHIFT:v[0-9]+]], 16, [[ELT0]]
+; GFX9: v_and_or_b32 [[RES:v[0-9]+]], [[VEC]], [[MASK]], [[ELT0_SHIFT]]
+
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
-define void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %elt.arg) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -137,8 +240,11 @@ define void @v_insertelement_v2i16_0_reghi(<2 x i16> addrspace(1)* %out, <2 x i1
 ; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 53, [[ELT1]]
 
+; GFX9-DAG: v_mov_b32_e32 [[MASK:v[0-9]+]], 0xffff{{$}}
+; GFX9: v_bfi_b32 [[RES:v[0-9]+]], [[MASK]], 53, [[VEC]]
+
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
-define void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -152,11 +258,15 @@ define void @v_insertelement_v2i16_0_inlineimm(<2 x i16> addrspace(1)* %out, <2
 ; FIXME: fold lshl_or c0, c1, v0 -> or (c0 << c1), v0
 
 ; GCN-LABEL: {{^}}v_insertelement_v2i16_1:
-; GCN: flat_load_dword [[VEC:v[0-9]+]]
+; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x3e70000, [[VEC]]
 
+; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3e7
+; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
+; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
+
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
-define void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -171,9 +281,9 @@ define void @v_insertelement_v2i16_1(<2 x i16> addrspace(1)* %out, <2 x i16> add
 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
 ; GCN: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0xfff10000, [[ELT0]]
-
+; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], -15, 16, [[ELT0]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
-define void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -185,13 +295,17 @@ define void @v_insertelement_v2i16_1_inlineimm(<2 x i16> addrspace(1)* %out, <2
 }
 
 ; GCN-LABEL: {{^}}v_insertelement_v2f16_0:
-; GCN: flat_load_dword [[VEC:v[0-9]+]]
+; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
 
 ; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x4500, [[ELT1]]
 
+; GFX9-DAG: v_mov_b32_e32 [[ELT0:v[0-9]+]], 0x4500{{$}}
+; GFX9-DAG: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VEC]]
+; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[ELT1]], 16, [[ELT0]]
+
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
-define void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
@@ -208,8 +322,10 @@ define void @v_insertelement_v2f16_0(<2 x half> addrspace(1)* %out, <2 x half> a
 ; CIVI: v_and_b32_e32 [[ELT1:v[0-9]+]], 0xffff0000, [[VEC]]
 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 53, [[ELT1]]
 
+; GFX9: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VEC]]
+; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[ELT1]], 16, 53
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
-define void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
@@ -221,11 +337,15 @@ define void @v_insertelement_v2f16_0_inlineimm(<2 x half> addrspace(1)* %out, <2
 }
 
 ; GCN-LABEL: {{^}}v_insertelement_v2f16_1:
-; GCN: flat_load_dword [[VEC:v[0-9]+]]
+; GCN-DAG: flat_load_dword [[VEC:v[0-9]+]]
 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x45000000, [[VEC]]
 
+; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x4500
+; GFX9-DAG: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
+; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], [[K]], 16, [[ELT0]]
+
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
-define void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
@@ -240,9 +360,9 @@ define void @v_insertelement_v2f16_1(<2 x half> addrspace(1)* %out, <2 x half> a
 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
 ; GCN: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VEC]]
 ; CIVI: v_or_b32_e32 [[RES:v[0-9]+]], 0x230000, [[ELT0]]
-
+; GFX9: v_lshl_or_b32 [[RES:v[0-9]+]], 35, 16, [[ELT0]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RES]]
-define void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
@@ -263,7 +383,7 @@ define void @v_insertelement_v2f16_1_inlineimm(<2 x half> addrspace(1)* %out, <2
 ; GCN-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VVEC]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(2)* %idx.ptr) #0 {
+define amdgpu_kernel void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %vec.ptr, i32 addrspace(2)* %idx.ptr) #0 {
   %idx = load volatile i32, i32 addrspace(2)* %idx.ptr
   %vec = load <2 x i16>, <2 x i16> addrspace(2)* %vec.ptr
   %vecins = insertelement <2 x i16> %vec, i16 999, i32 %idx
@@ -279,7 +399,7 @@ define void @s_insertelement_v2i16_dynamic(<2 x i16> addrspace(1)* %out, <2 x i1
 ; GCN-DAG: s_lshl_b32 [[MASK:s[0-9]+]], 0xffff, [[SCALED_IDX]]
 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 %idx) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -295,16 +415,16 @@ define void @v_insertelement_v2i16_dynamic_sgpr(<2 x i16> addrspace(1)* %out, <2
 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e7
 
-; VI-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; VI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
-; VI: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
+; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
+; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
 
-; CI: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
-; CI: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]]
+; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
+; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]]
 
 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
+define amdgpu_kernel void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
@@ -322,16 +442,16 @@ define void @v_insertelement_v2i16_dynamic_vgpr(<2 x i16> addrspace(1)* %out, <2
 ; GCN: flat_load_dword [[VEC:v[0-9]+]]
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234
 
-; VI-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
-; VI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
-; VI: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
+; GFX89-DAG: s_mov_b32 [[MASKK:s[0-9]+]], 0xffff{{$}}
+; GFX89-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
+; GFX89-DAG: v_lshlrev_b32_e64 [[MASK:v[0-9]+]], [[SCALED_IDX]], [[MASKK]]
 
-; CI: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
-; CI: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]]
+; CI-DAG: v_lshlrev_b32_e32 [[SCALED_IDX:v[0-9]+]], 16, [[IDX]]
+; CI-DAG: v_lshl_b32_e32 [[MASK:v[0-9]+]], 0xffff, [[SCALED_IDX]]
 
 ; GCN: v_bfi_b32 [[RESULT:v[0-9]+]], [[MASK]], [[K]], [[VEC]]
 ; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[RESULT]]
-define void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
+define amdgpu_kernel void @v_insertelement_v2f16_dynamic_vgpr(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %in, i32 addrspace(1)* %idx.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %in.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %in, i64 %tid.ext
diff --git a/test/CodeGen/AMDGPU/inserted-wait-states.mir b/test/CodeGen/AMDGPU/inserted-wait-states.mir
index 85cd903a405daf4c8ff3b41151afaae0f3a5da79..1479303712d0f0299e03df5b34a06f2b52658650 100644
--- a/test/CodeGen/AMDGPU/inserted-wait-states.mir
+++ b/test/CodeGen/AMDGPU/inserted-wait-states.mir
@@ -1,14 +1,46 @@
 # RUN: llc -march=amdgcn -mcpu=tahiti -run-pass post-RA-hazard-rec  %s -o - | FileCheck %s -check-prefixes=GCN
 # RUN: llc -march=amdgcn -mcpu=hawaii -run-pass post-RA-hazard-rec  %s -o - | FileCheck %s -check-prefixes=GCN,CIVI
 # RUN: llc -march=amdgcn -mcpu=fiji -run-pass post-RA-hazard-rec  %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI
+# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass post-RA-hazard-rec  %s -o - | FileCheck %s -check-prefixes=GCN,CIVI,VI,GFX9
 
 --- |
-  define void @div_fmas() { ret void }
-  define void @s_getreg() { ret void }
-  define void @s_setreg() { ret void }
-  define void @vmem_gt_8dw_store() { ret void }
-  define void @readwrite_lane() { ret void }
-  define void @rfe() { ret void }
+  define amdgpu_kernel void @div_fmas() { ret void }
+  define amdgpu_kernel void @s_getreg() { ret void }
+  define amdgpu_kernel void @s_setreg() { ret void }
+  define amdgpu_kernel void @vmem_gt_8dw_store() { ret void }
+  define amdgpu_kernel void @readwrite_lane() { ret void }
+  define amdgpu_kernel void @rfe() { ret void }
+  define amdgpu_kernel void @s_mov_fed_b32() { ret void }
+  define amdgpu_kernel void @s_movrel() { ret void }
+  define amdgpu_kernel void @v_interp() { ret void }
+
+  define amdgpu_kernel void @mov_fed_hazard_crash_on_dbg_value(i32 addrspace(1)* %A) {
+  entry:
+    %A.addr = alloca i32 addrspace(1)*, align 4
+    store i32 addrspace(1)* %A, i32 addrspace(1)** %A.addr, align 4
+    call void @llvm.dbg.declare(metadata i32 addrspace(1)** %A.addr, metadata !5, metadata !11), !dbg !12
+    ret void
+  }
+
+  declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3, !4}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 3.9.0 (trunk 268929)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+  !1 = !DIFile(filename: "test01.cl", directory: "/dev/null")
+  !2 = !{}
+  !3 = !{i32 2, !"Dwarf Version", i32 2}
+  !4 = !{i32 2, !"Debug Info Version", i32 3}
+  !5 = !DILocalVariable(name: "A", arg: 1, scope: !6, file: !1, line: 1, type: !9)
+  !6 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+  !7 = !DISubroutineType(types: !8)
+  !8 = !{null, !9}
+  !9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64, align: 32)
+  !10 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+  !11 = !DIExpression()
+  !12 = !DILocation(line: 1, column: 30, scope: !6)
+
 ...
 ---
 # GCN-LABEL: name: div_fmas
@@ -331,3 +363,185 @@ body: |
     S_ENDPGM
 
 ...
+
+...
+---
+
+# GCN-LABEL: name: s_mov_fed_b32
+
+# GCN-LABEL: bb.0:
+# GCN: S_MOV_FED_B32
+# GFX9: S_NOP
+# GCN-NEXT: S_MOV_B32
+
+# GCN-LABEL: bb.1:
+# GCN: S_MOV_FED_B32
+# GFX9: S_NOP
+# GCN-NEXT: V_MOV_B32
+name: s_mov_fed_b32
+
+body: |
+  bb.0:
+    successors: %bb.1
+    %sgpr0 = S_MOV_FED_B32 %sgpr0
+    %sgpr0 = S_MOV_B32 %sgpr0
+    S_BRANCH %bb.1
+
+  bb.1:
+    %sgpr0 = S_MOV_FED_B32 %sgpr0
+    %vgpr0 = V_MOV_B32_e32 %sgpr0, implicit %exec
+    S_ENDPGM
+
+...
+
+...
+---
+
+# GCN-LABEL: name: s_movrel
+
+# GCN-LABEL: bb.0:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: S_MOVRELS_B32
+
+# GCN-LABEL: bb.1:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: S_MOVRELS_B64
+
+# GCN-LABEL: bb.2:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: S_MOVRELD_B32
+
+# GCN-LABEL: bb.3:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: S_MOVRELD_B64
+
+name: s_movrel
+
+body: |
+  bb.0:
+    successors: %bb.1
+    %m0 = S_MOV_B32 0
+    %sgpr0 = S_MOVRELS_B32 %sgpr0, implicit %m0
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    %m0 = S_MOV_B32 0
+    %sgpr0_sgpr1 = S_MOVRELS_B64 %sgpr0_sgpr1, implicit %m0
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3
+    %m0 = S_MOV_B32 0
+    %sgpr0 = S_MOVRELD_B32 %sgpr0, implicit %m0
+    S_BRANCH %bb.3
+
+  bb.3:
+    %m0 = S_MOV_B32 0
+    %sgpr0_sgpr1 = S_MOVRELD_B64 %sgpr0_sgpr1, implicit %m0
+    S_ENDPGM
+...
+
+...
+---
+
+# GCN-LABEL: name: v_interp
+
+# GCN-LABEL: bb.0:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: V_INTERP_P1_F32
+
+# GCN-LABEL: bb.1:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: V_INTERP_P2_F32
+
+# GCN-LABEL: bb.2:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: V_INTERP_P1_F32_16bank
+
+# GCN-LABEL: bb.3:
+# GCN: S_MOV_B32
+# GFX9: S_NOP
+# GCN-NEXT: V_INTERP_MOV_F32
+
+name: v_interp
+
+body: |
+  bb.0:
+    successors: %bb.1
+    %m0 = S_MOV_B32 0
+    %vgpr0 = V_INTERP_P1_F32 %vgpr0, 0, 0, implicit %m0, implicit %exec
+    S_BRANCH %bb.1
+
+  bb.1:
+    successors: %bb.2
+    %m0 = S_MOV_B32 0
+    %vgpr0 = V_INTERP_P2_F32 %vgpr0, %vgpr1, 0, 0, implicit %m0, implicit %exec
+    S_BRANCH %bb.2
+
+  bb.2:
+    successors: %bb.3
+    %m0 = S_MOV_B32 0
+    %vgpr0 = V_INTERP_P1_F32_16bank %vgpr0, 0, 0, implicit %m0, implicit %exec
+    S_BRANCH %bb.3
+
+  bb.3:
+    %m0 = S_MOV_B32 0
+    %vgpr0 = V_INTERP_MOV_F32 0, 0, 0, implicit %m0, implicit %exec
+    S_ENDPGM
+...
+---
+name:            mov_fed_hazard_crash_on_dbg_value
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+liveins:
+  - { reg: '%sgpr4_sgpr5' }
+  - { reg: '%sgpr6_sgpr7' }
+  - { reg: '%sgpr9' }
+  - { reg: '%sgpr0_sgpr1_sgpr2_sgpr3' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       16
+  offsetAdjustment: 0
+  maxAlignment:    8
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+stack:
+  - { id: 0, name: A.addr, offset: 0, size: 8, alignment: 8, local-offset: 0 }
+  - { id: 1, offset: 8, size: 4, alignment: 4 }
+body:             |
+  bb.0.entry:
+    liveins: %sgpr4_sgpr5, %sgpr6_sgpr7, %sgpr9, %sgpr0_sgpr1_sgpr2_sgpr3
+
+    %flat_scr_lo = S_ADD_U32 %sgpr6, %sgpr9, implicit-def %scc
+    %flat_scr_hi = S_ADDC_U32 %sgpr7, 0, implicit-def %scc, implicit %scc
+    DBG_VALUE _, 2, !5, !11, debug-location !12
+    %sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed %sgpr4_sgpr5, 0, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    dead %sgpr6_sgpr7 = KILL %sgpr4_sgpr5
+    %sgpr8 = S_MOV_B32 %sgpr5
+    %vgpr0 = V_MOV_B32_e32 killed %sgpr8, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr9, 4, 0, 0, 0, implicit %exec :: (store 4 into %ir.A.addr + 4)
+    %sgpr8 = S_MOV_B32 %sgpr4, implicit killed %sgpr4_sgpr5
+    %vgpr0 = V_MOV_B32_e32 killed %sgpr8, implicit %exec
+    BUFFER_STORE_DWORD_OFFSET %vgpr0, %sgpr0_sgpr1_sgpr2_sgpr3, %sgpr9, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.A.addr)
+    S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/internalize.ll b/test/CodeGen/AMDGPU/internalize.ll
index 5a6669977b81ad2f1ca1ad704851d7fc3b39803d..968b1d326a76db3ec90335ba73a20629fd4c32b6 100644
--- a/test/CodeGen/AMDGPU/internalize.ll
+++ b/test/CodeGen/AMDGPU/internalize.ll
@@ -8,14 +8,14 @@
 @gvar_used = addrspace(1) global i32 undef, align 4
 
 ; Function Attrs: alwaysinline nounwind
-define void @foo_unused(i32 addrspace(1)* %out) local_unnamed_addr #1 {
+define amdgpu_kernel void @foo_unused(i32 addrspace(1)* %out) local_unnamed_addr #1 {
 entry:
   store i32 1, i32 addrspace(1)* %out
   ret void
 }
 
 ; Function Attrs: alwaysinline nounwind
-define void @foo_used(i32 addrspace(1)* %out, i32 %tid) local_unnamed_addr #1 {
+define amdgpu_kernel void @foo_used(i32 addrspace(1)* %out, i32 %tid) local_unnamed_addr #1 {
 entry:
   store i32 %tid, i32 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/invalid-addrspacecast.ll b/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
index c29434f5eca2f2bf3070aecb5fc30185971eaa33..31f2fbc919aa31eeb4f6de4e27eafd5a17f800bc 100644
--- a/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
+++ b/test/CodeGen/AMDGPU/invalid-addrspacecast.ll
@@ -1,7 +1,7 @@
 ; RUN: not llc -march=amdgcn -mcpu=bonaire -mattr=-promote-alloca < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
 ; ERROR: error: <unknown>:0:0: in function use_group_to_global_addrspacecast void (i32 addrspace(3)*): invalid addrspacecast
-define void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) {
+define amdgpu_kernel void @use_group_to_global_addrspacecast(i32 addrspace(3)* %ptr) {
   %stof = addrspacecast i32 addrspace(3)* %ptr to i32 addrspace(1)*
   store volatile i32 0, i32 addrspace(1)* %stof
   ret void
diff --git a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll
deleted file mode 100644
index 49c314fbc5d04d9fadd69acf0319f02bcc6c45f2..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata1.ll
+++ /dev/null
@@ -1,6 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-runtime-metadata | FileCheck %s
-; check llc does not crash for invalid opencl version metadata
-
-; CHECK: { amd.MDVersion: [ 2, 0 ] }
-
-!opencl.ocl.version = !{}
diff --git a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll
deleted file mode 100644
index 1f5e8be531dc0c346d495aa6fc402b84fc4f0ddf..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata2.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-runtime-metadata | FileCheck %s
-; check llc does not crash for invalid opencl version metadata
-
-; CHECK: { amd.MDVersion: [ 2, 0 ] }
-
-!opencl.ocl.version = !{!0}
-!0 = !{}
diff --git a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll b/test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll
deleted file mode 100644
index b77551e268a0635f01d6ca319cf6dd5788407e9d..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/invalid-opencl-version-metadata3.ll
+++ /dev/null
@@ -1,7 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-runtime-metadata | FileCheck %s
-; check llc does not crash for invalid opencl version metadata
-
-; CHECK: { amd.MDVersion: [ 2, 0 ] }
-
-!opencl.ocl.version = !{!0}
-!0 = !{i32 1}
diff --git a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
index 45a061067cfc5da2ab9dc93093246c6cfd590d7a..5cd965d2fa9c3bae6be93525569c5cf37be0223c 100644
--- a/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
+++ b/test/CodeGen/AMDGPU/invariant-load-no-alias-store.ll
@@ -10,7 +10,7 @@
 ; GCN-DAG: buffer_load_dwordx2 [[PTR:v\[[0-9]+:[0-9]+\]]],
 ; GCN-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
 ; GCN: buffer_store_dword [[K]], [[PTR]]
-define void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 {
+define amdgpu_kernel void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 addrspace(1)* addrspace(1)* dereferenceable(4096) nonnull %in) #0 {
   %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(1)* %in, !invariant.load !0
   %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1
   store i16 123, i16 addrspace(1)* %ptr, align 4
@@ -22,7 +22,7 @@ define void @test_merge_store_constant_i16_invariant_global_pointer_load(i16 add
 ; GCN: s_load_dwordx2 s{{\[}}[[SPTR_LO:[0-9]+]]:[[SPTR_HI:[0-9]+]]{{\]}}
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0x1c8007b
 ; GCN: buffer_store_dword [[K]], off, s{{\[}}[[SPTR_LO]]:
-define void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 {
+define amdgpu_kernel void @test_merge_store_constant_i16_invariant_constant_pointer_load(i16 addrspace(1)* addrspace(2)* dereferenceable(4096) nonnull %in) #0 {
   %ptr = load i16 addrspace(1)*, i16 addrspace(1)* addrspace(2)* %in, !invariant.load !0
   %ptr.1 = getelementptr i16, i16 addrspace(1)* %ptr, i64 1
   store i16 123, i16 addrspace(1)* %ptr, align 4
diff --git a/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir b/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir
index 66182d092895116d7532803bb2b09422cd215142..bc1dafe0ea1e22be99e3b925f8a656be3c3a4759 100644
--- a/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir
+++ b/test/CodeGen/AMDGPU/invert-br-undef-vcc.mir
@@ -1,7 +1,7 @@
 # RUN: llc -run-pass block-placement -march=amdgcn -verify-machineinstrs -o - %s | FileCheck %s
 --- |
 
-  define void @invert_br_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
+  define amdgpu_kernel void @invert_br_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
   entry:
     br i1 undef, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
 
diff --git a/test/CodeGen/AMDGPU/kcache-fold.ll b/test/CodeGen/AMDGPU/kcache-fold.ll
index 43448fbd7b33193b31d597bbd962bab3a21c9a95..37dd977ae216e4346cb92ee06767fc1c920de4f0 100644
--- a/test/CodeGen/AMDGPU/kcache-fold.ll
+++ b/test/CodeGen/AMDGPU/kcache-fold.ll
@@ -1,100 +1,112 @@
-;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s
 
 ; CHECK: {{^}}main1:
 ; CHECK: MOV * T{{[0-9]+\.[XYZW], KC0}}
-define void @main1() {
+define amdgpu_kernel void @main1() #0 {
 main_body:
-  %0 = load <4 x float>, <4 x float> addrspace(8)* null
-  %1 = extractelement <4 x float> %0, i32 0
-  %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %3 = extractelement <4 x float> %2, i32 0
-  %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %5 = extractelement <4 x float> %4, i32 0
-  %6 = fcmp ogt float %1, 0.000000e+00
-  %7 = select i1 %6, float %3, float %5
-  %8 = load <4 x float>, <4 x float> addrspace(8)* null
-  %9 = extractelement <4 x float> %8, i32 1
-  %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %11 = extractelement <4 x float> %10, i32 1
-  %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %13 = extractelement <4 x float> %12, i32 1
-  %14 = fcmp ogt float %9, 0.000000e+00
-  %15 = select i1 %14, float %11, float %13
-  %16 = load <4 x float>, <4 x float> addrspace(8)* null
-  %17 = extractelement <4 x float> %16, i32 2
-  %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %19 = extractelement <4 x float> %18, i32 2
-  %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %21 = extractelement <4 x float> %20, i32 2
-  %22 = fcmp ogt float %17, 0.000000e+00
-  %23 = select i1 %22, float %19, float %21
-  %24 = load <4 x float>, <4 x float> addrspace(8)* null
-  %25 = extractelement <4 x float> %24, i32 3
-  %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %27 = extractelement <4 x float> %26, i32 3
-  %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %29 = extractelement <4 x float> %28, i32 3
-  %30 = fcmp ogt float %25, 0.000000e+00
-  %31 = select i1 %30, float %27, float %29
-  %32 = call float @llvm.AMDGPU.clamp.f32(float %7, float 0.000000e+00, float 1.000000e+00)
-  %33 = call float @llvm.AMDGPU.clamp.f32(float %15, float 0.000000e+00, float 1.000000e+00)
-  %34 = call float @llvm.AMDGPU.clamp.f32(float %23, float 0.000000e+00, float 1.000000e+00)
-  %35 = call float @llvm.AMDGPU.clamp.f32(float %31, float 0.000000e+00, float 1.000000e+00)
-  %36 = insertelement <4 x float> undef, float %32, i32 0
-  %37 = insertelement <4 x float> %36, float %33, i32 1
-  %38 = insertelement <4 x float> %37, float %34, i32 2
-  %39 = insertelement <4 x float> %38, float %35, i32 3
-  call void @llvm.r600.store.swizzle(<4 x float> %39, i32 0, i32 0)
+  %tmp = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp7 = extractelement <4 x float> %tmp, i32 0
+  %tmp8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp9 = extractelement <4 x float> %tmp8, i32 0
+  %tmp10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp11 = extractelement <4 x float> %tmp10, i32 0
+  %tmp12 = fcmp ogt float %tmp7, 0.000000e+00
+  %tmp13 = select i1 %tmp12, float %tmp9, float %tmp11
+  %tmp14 = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp15 = extractelement <4 x float> %tmp14, i32 1
+  %tmp16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp17 = extractelement <4 x float> %tmp16, i32 1
+  %tmp18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp19 = extractelement <4 x float> %tmp18, i32 1
+  %tmp20 = fcmp ogt float %tmp15, 0.000000e+00
+  %tmp21 = select i1 %tmp20, float %tmp17, float %tmp19
+  %tmp22 = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp23 = extractelement <4 x float> %tmp22, i32 2
+  %tmp24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp25 = extractelement <4 x float> %tmp24, i32 2
+  %tmp26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp27 = extractelement <4 x float> %tmp26, i32 2
+  %tmp28 = fcmp ogt float %tmp23, 0.000000e+00
+  %tmp29 = select i1 %tmp28, float %tmp25, float %tmp27
+  %tmp30 = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp31 = extractelement <4 x float> %tmp30, i32 3
+  %tmp32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp33 = extractelement <4 x float> %tmp32, i32 3
+  %tmp34 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp35 = extractelement <4 x float> %tmp34, i32 3
+  %tmp36 = fcmp ogt float %tmp31, 0.000000e+00
+  %tmp37 = select i1 %tmp36, float %tmp33, float %tmp35
+  %max.0.i = call float @llvm.maxnum.f32(float %tmp13, float 0.000000e+00)
+  %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00)
+  %max.0.i5 = call float @llvm.maxnum.f32(float %tmp21, float 0.000000e+00)
+  %clamp.i6 = call float @llvm.minnum.f32(float %max.0.i5, float 1.000000e+00)
+  %max.0.i3 = call float @llvm.maxnum.f32(float %tmp29, float 0.000000e+00)
+  %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00)
+  %max.0.i1 = call float @llvm.maxnum.f32(float %tmp37, float 0.000000e+00)
+  %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00)
+  %tmp38 = insertelement <4 x float> undef, float %clamp.i, i32 0
+  %tmp39 = insertelement <4 x float> %tmp38, float %clamp.i6, i32 1
+  %tmp40 = insertelement <4 x float> %tmp39, float %clamp.i4, i32 2
+  %tmp41 = insertelement <4 x float> %tmp40, float %clamp.i2, i32 3
+  call void @llvm.r600.store.swizzle(<4 x float> %tmp41, i32 0, i32 0)
   ret void
 }
 
 ; CHECK: {{^}}main2:
 ; CHECK-NOT: MOV
-define void @main2() {
+define amdgpu_kernel void @main2() #0 {
 main_body:
-  %0 = load <4 x float>, <4 x float> addrspace(8)* null
-  %1 = extractelement <4 x float> %0, i32 0
-  %2 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %3 = extractelement <4 x float> %2, i32 0
-  %4 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %5 = extractelement <4 x float> %4, i32 1
-  %6 = fcmp ogt float %1, 0.000000e+00
-  %7 = select i1 %6, float %3, float %5
-  %8 = load <4 x float>, <4 x float> addrspace(8)* null
-  %9 = extractelement <4 x float> %8, i32 1
-  %10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %11 = extractelement <4 x float> %10, i32 0
-  %12 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %13 = extractelement <4 x float> %12, i32 1
-  %14 = fcmp ogt float %9, 0.000000e+00
-  %15 = select i1 %14, float %11, float %13
-  %16 = load <4 x float>, <4 x float> addrspace(8)* null
-  %17 = extractelement <4 x float> %16, i32 2
-  %18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %19 = extractelement <4 x float> %18, i32 3
-  %20 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %21 = extractelement <4 x float> %20, i32 2
-  %22 = fcmp ogt float %17, 0.000000e+00
-  %23 = select i1 %22, float %19, float %21
-  %24 = load <4 x float>, <4 x float> addrspace(8)* null
-  %25 = extractelement <4 x float> %24, i32 3
-  %26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %27 = extractelement <4 x float> %26, i32 3
-  %28 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %29 = extractelement <4 x float> %28, i32 2
-  %30 = fcmp ogt float %25, 0.000000e+00
-  %31 = select i1 %30, float %27, float %29
-  %32 = call float @llvm.AMDGPU.clamp.f32(float %7, float 0.000000e+00, float 1.000000e+00)
-  %33 = call float @llvm.AMDGPU.clamp.f32(float %15, float 0.000000e+00, float 1.000000e+00)
-  %34 = call float @llvm.AMDGPU.clamp.f32(float %23, float 0.000000e+00, float 1.000000e+00)
-  %35 = call float @llvm.AMDGPU.clamp.f32(float %31, float 0.000000e+00, float 1.000000e+00)
-  %36 = insertelement <4 x float> undef, float %32, i32 0
-  %37 = insertelement <4 x float> %36, float %33, i32 1
-  %38 = insertelement <4 x float> %37, float %34, i32 2
-  %39 = insertelement <4 x float> %38, float %35, i32 3
-  call void @llvm.r600.store.swizzle(<4 x float> %39, i32 0, i32 0)
+  %tmp = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp7 = extractelement <4 x float> %tmp, i32 0
+  %tmp8 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp9 = extractelement <4 x float> %tmp8, i32 0
+  %tmp10 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp11 = extractelement <4 x float> %tmp10, i32 1
+  %tmp12 = fcmp ogt float %tmp7, 0.000000e+00
+  %tmp13 = select i1 %tmp12, float %tmp9, float %tmp11
+  %tmp14 = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp15 = extractelement <4 x float> %tmp14, i32 1
+  %tmp16 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp17 = extractelement <4 x float> %tmp16, i32 0
+  %tmp18 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp19 = extractelement <4 x float> %tmp18, i32 1
+  %tmp20 = fcmp ogt float %tmp15, 0.000000e+00
+  %tmp21 = select i1 %tmp20, float %tmp17, float %tmp19
+  %tmp22 = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp23 = extractelement <4 x float> %tmp22, i32 2
+  %tmp24 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp25 = extractelement <4 x float> %tmp24, i32 3
+  %tmp26 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp27 = extractelement <4 x float> %tmp26, i32 2
+  %tmp28 = fcmp ogt float %tmp23, 0.000000e+00
+  %tmp29 = select i1 %tmp28, float %tmp25, float %tmp27
+  %tmp30 = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp31 = extractelement <4 x float> %tmp30, i32 3
+  %tmp32 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp33 = extractelement <4 x float> %tmp32, i32 3
+  %tmp34 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp35 = extractelement <4 x float> %tmp34, i32 2
+  %tmp36 = fcmp ogt float %tmp31, 0.000000e+00
+  %tmp37 = select i1 %tmp36, float %tmp33, float %tmp35
+  %max.0.i = call float @llvm.maxnum.f32(float %tmp13, float 0.000000e+00)
+  %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00)
+  %max.0.i5 = call float @llvm.maxnum.f32(float %tmp21, float 0.000000e+00)
+  %clamp.i6 = call float @llvm.minnum.f32(float %max.0.i5, float 1.000000e+00)
+  %max.0.i3 = call float @llvm.maxnum.f32(float %tmp29, float 0.000000e+00)
+  %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00)
+  %max.0.i1 = call float @llvm.maxnum.f32(float %tmp37, float 0.000000e+00)
+  %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00)
+  %tmp38 = insertelement <4 x float> undef, float %clamp.i, i32 0
+  %tmp39 = insertelement <4 x float> %tmp38, float %clamp.i6, i32 1
+  %tmp40 = insertelement <4 x float> %tmp39, float %clamp.i4, i32 2
+  %tmp41 = insertelement <4 x float> %tmp40, float %clamp.i2, i32 3
+  call void @llvm.r600.store.swizzle(<4 x float> %tmp41, i32 0, i32 0)
   ret void
 }
 
-declare float @llvm.AMDGPU.clamp.f32(float, float, float) readnone
-declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) #0
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll b/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
index 21c92dbc9098b5bbfd85dd7f813298a48d0b3085..8e358ef2804ff3f925c728595e55f9ca241cbfb8 100644
--- a/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
+++ b/test/CodeGen/AMDGPU/kernarg-stack-alignment.ll
@@ -4,40 +4,40 @@
 ; alignment of the stack
 
 ; CHECK-LABEL: {{^}}no_args:
-; CHECK: ScratchSize: 8{{$}}
-define void @no_args() {
+; CHECK: ScratchSize: 5{{$}}
+define amdgpu_kernel void @no_args() {
   %alloca = alloca i8
   store volatile i8 0, i8* %alloca
   ret void
 }
 
 ; CHECK-LABEL: {{^}}force_align32:
-; CHECK: ScratchSize: 8{{$}}
-define void @force_align32(<8 x i32>) {
+; CHECK: ScratchSize: 5{{$}}
+define amdgpu_kernel void @force_align32(<8 x i32>) {
   %alloca = alloca i8
   store volatile i8 0, i8* %alloca
   ret void
 }
 
 ; CHECK-LABEL: {{^}}force_align64:
-; CHECK: ScratchSize: 8{{$}}
-define void @force_align64(<16 x i32>) {
+; CHECK: ScratchSize: 5{{$}}
+define amdgpu_kernel void @force_align64(<16 x i32>) {
   %alloca = alloca i8
   store volatile i8 0, i8* %alloca
   ret void
 }
 
 ; CHECK-LABEL: {{^}}force_align128:
-; CHECK: ScratchSize: 8{{$}}
-define void @force_align128(<32 x i32>) {
+; CHECK: ScratchSize: 5{{$}}
+define amdgpu_kernel void @force_align128(<32 x i32>) {
   %alloca = alloca i8
   store volatile i8 0, i8* %alloca
   ret void
 }
 
 ; CHECK-LABEL: {{^}}force_align256:
-; CHECK: ScratchSize: 8{{$}}
-define void @force_align256(<64 x i32>) {
+; CHECK: ScratchSize: 5{{$}}
+define amdgpu_kernel void @force_align256(<64 x i32>) {
   %alloca = alloca i8
   store volatile i8 0, i8* %alloca
   ret void
diff --git a/test/CodeGen/AMDGPU/kernel-args.ll b/test/CodeGen/AMDGPU/kernel-args.ll
index 95a68319f8af38e8dbb8d5effd10483442a08341..6fa26cb3879357bdd09e25e2c45d6b129fb3b0aa 100644
--- a/test/CodeGen/AMDGPU/kernel-args.ll
+++ b/test/CodeGen/AMDGPU/kernel-args.ll
@@ -17,7 +17,7 @@
 ; FIXME: Should be using s_load_dword
 ; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
 
-define void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
+define amdgpu_kernel void @i8_arg(i32 addrspace(1)* nocapture %out, i8 %in) nounwind {
 entry:
   %0 = zext i8 %in to i32
   store i32 %0, i32 addrspace(1)* %out, align 4
@@ -36,7 +36,7 @@ entry:
 ; FIXME: Should be using s_load_dword
 ; HSA-VI: flat_load_ubyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
 
-define void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
+define amdgpu_kernel void @i8_zext_arg(i32 addrspace(1)* nocapture %out, i8 zeroext %in) nounwind {
 entry:
   %0 = zext i8 %in to i32
   store i32 %0, i32 addrspace(1)* %out, align 4
@@ -55,7 +55,7 @@ entry:
 ; FIXME: Should be using s_load_dword
 ; HSA-VI: flat_load_sbyte v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
 
-define void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
+define amdgpu_kernel void @i8_sext_arg(i32 addrspace(1)* nocapture %out, i8 signext %in) nounwind {
 entry:
   %0 = sext i8 %in to i32
   store i32 %0, i32 addrspace(1)* %out, align 4
@@ -75,7 +75,7 @@ entry:
 ; FIXME: Should be using s_load_dword
 ; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
 
-define void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
+define amdgpu_kernel void @i16_arg(i32 addrspace(1)* nocapture %out, i16 %in) nounwind {
 entry:
   %0 = zext i16 %in to i32
   store i32 %0, i32 addrspace(1)* %out, align 4
@@ -94,7 +94,7 @@ entry:
 ; FIXME: Should be using s_load_dword
 ; HSA-VI: flat_load_ushort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
 
-define void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
+define amdgpu_kernel void @i16_zext_arg(i32 addrspace(1)* nocapture %out, i16 zeroext %in) nounwind {
 entry:
   %0 = zext i16 %in to i32
   store i32 %0, i32 addrspace(1)* %out, align 4
@@ -113,7 +113,7 @@ entry:
 ; FIXME: Should be using s_load_dword
 ; HSA-VI: flat_load_sshort v{{[0-9]+}}, v{{\[}}[[VPTR_LO]]:[[VPTR_HI]]]
 
-define void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
+define amdgpu_kernel void @i16_sext_arg(i32 addrspace(1)* nocapture %out, i16 signext %in) nounwind {
 entry:
   %0 = sext i16 %in to i32
   store i32 %0, i32 addrspace(1)* %out, align 4
@@ -126,7 +126,7 @@ entry:
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 ; HSA-VI: s_load_dword s{{[0-9]}}, s[4:5], 0x8
-define void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
+define amdgpu_kernel void @i32_arg(i32 addrspace(1)* nocapture %out, i32 %in) nounwind {
 entry:
   store i32 %in, i32 addrspace(1)* %out, align 4
   ret void
@@ -138,7 +138,7 @@ entry:
 ; SI: s_load_dword s{{[0-9]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dword s{{[0-9]}}, s[0:1], 0x2c
 ; HSA-VI: s_load_dword s{{[0-9]+}}, s[4:5], 0x8
-define void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
+define amdgpu_kernel void @f32_arg(float addrspace(1)* nocapture %out, float %in) nounwind {
 entry:
   store float %in, float addrspace(1)* %out, align 4
   ret void
@@ -152,7 +152,7 @@ entry:
 ; MESA-GCN: buffer_load_ubyte
 ; HSA-VI: flat_load_ubyte
 ; HSA-VI: flat_load_ubyte
-define void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
+define amdgpu_kernel void @v2i8_arg(<2 x i8> addrspace(1)* %out, <2 x i8> %in) {
 entry:
   store <2 x i8> %in, <2 x i8> addrspace(1)* %out
   ret void
@@ -166,7 +166,7 @@ entry:
 ; MESA-GCN: buffer_load_ushort
 ; HSA-VI: flat_load_ushort
 ; HSA-VI: flat_load_ushort
-define void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
+define amdgpu_kernel void @v2i16_arg(<2 x i16> addrspace(1)* %out, <2 x i16> %in) {
 entry:
   store <2 x i16> %in, <2 x i16> addrspace(1)* %out
   ret void
@@ -179,7 +179,7 @@ entry:
 ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
 ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
-define void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
+define amdgpu_kernel void @v2i32_arg(<2 x i32> addrspace(1)* nocapture %out, <2 x i32> %in) nounwind {
 entry:
   store <2 x i32> %in, <2 x i32> addrspace(1)* %out, align 4
   ret void
@@ -192,7 +192,7 @@ entry:
 ; SI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xb
 ; MESA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x2c
 ; HSA-VI: s_load_dwordx2 s{{\[[0-9]:[0-9]\]}}, s[4:5], 0x8
-define void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
+define amdgpu_kernel void @v2f32_arg(<2 x float> addrspace(1)* nocapture %out, <2 x float> %in) nounwind {
 entry:
   store <2 x float> %in, <2 x float> addrspace(1)* %out, align 4
   ret void
@@ -209,7 +209,7 @@ entry:
 ; HSA-VI: flat_load_ubyte
 ; HSA-VI: flat_load_ubyte
 ; HSA-VI: flat_load_ubyte
-define void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
+define amdgpu_kernel void @v3i8_arg(<3 x i8> addrspace(1)* nocapture %out, <3 x i8> %in) nounwind {
 entry:
   store <3 x i8> %in, <3 x i8> addrspace(1)* %out, align 4
   ret void
@@ -226,7 +226,7 @@ entry:
 ; HSA-VI: flat_load_ushort
 ; HSA-VI: flat_load_ushort
 ; HSA-VI: flat_load_ushort
-define void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
+define amdgpu_kernel void @v3i16_arg(<3 x i16> addrspace(1)* nocapture %out, <3 x i16> %in) nounwind {
 entry:
   store <3 x i16> %in, <3 x i16> addrspace(1)* %out, align 4
   ret void
@@ -239,7 +239,7 @@ entry:
 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
-define void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
+define amdgpu_kernel void @v3i32_arg(<3 x i32> addrspace(1)* nocapture %out, <3 x i32> %in) nounwind {
 entry:
   store <3 x i32> %in, <3 x i32> addrspace(1)* %out, align 4
   ret void
@@ -253,7 +253,7 @@ entry:
 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0xd
 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]+\]}}, s[0:1], 0x34
 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
-define void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
+define amdgpu_kernel void @v3f32_arg(<3 x float> addrspace(1)* nocapture %out, <3 x float> %in) nounwind {
 entry:
   store <3 x float> %in, <3 x float> addrspace(1)* %out, align 4
   ret void
@@ -273,7 +273,7 @@ entry:
 ; HSA-VI: flat_load_ubyte
 ; HSA-VI: flat_load_ubyte
 ; HSA-VI: flat_load_ubyte
-define void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
+define amdgpu_kernel void @v4i8_arg(<4 x i8> addrspace(1)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(1)* %out
   ret void
@@ -293,7 +293,7 @@ entry:
 ; HSA-GCN: flat_load_ushort
 ; HSA-GCN: flat_load_ushort
 ; HSA-GCN: flat_load_ushort
-define void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
+define amdgpu_kernel void @v4i16_arg(<4 x i16> addrspace(1)* %out, <4 x i16> %in) {
 entry:
   store <4 x i16> %in, <4 x i16> addrspace(1)* %out
   ret void
@@ -308,7 +308,7 @@ entry:
 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
-define void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
+define amdgpu_kernel void @v4i32_arg(<4 x i32> addrspace(1)* nocapture %out, <4 x i32> %in) nounwind {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
   ret void
@@ -323,7 +323,7 @@ entry:
 ; SI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0xd
 ; MESA-VI: s_load_dwordx4 s{{\[[0-9]:[0-9]\]}}, s[0:1], 0x34
 ; HSA-VI: s_load_dwordx4 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x10
-define void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
+define amdgpu_kernel void @v4f32_arg(<4 x float> addrspace(1)* nocapture %out, <4 x float> %in) nounwind {
 entry:
   store <4 x float> %in, <4 x float> addrspace(1)* %out, align 4
   ret void
@@ -354,7 +354,7 @@ entry:
 ; HSA-GCN: float_load_ubyte
 ; HSA-GCN: float_load_ubyte
 ; HSA-GCN: float_load_ubyte
-define void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
+define amdgpu_kernel void @v8i8_arg(<8 x i8> addrspace(1)* %out, <8 x i8> %in) {
 entry:
   store <8 x i8> %in, <8 x i8> addrspace(1)* %out
   ret void
@@ -386,7 +386,7 @@ entry:
 ; HSA-VI: flat_load_ushort
 ; HSA-VI: flat_load_ushort
 ; HSA-VI: flat_load_ushort
-define void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
+define amdgpu_kernel void @v8i16_arg(<8 x i16> addrspace(1)* %out, <8 x i16> %in) {
 entry:
   store <8 x i16> %in, <8 x i16> addrspace(1)* %out
   ret void
@@ -405,7 +405,7 @@ entry:
 ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
 ; MESA-VI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x44
 ; HSA-VI: s_load_dwordx8 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x20
-define void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
+define amdgpu_kernel void @v8i32_arg(<8 x i32> addrspace(1)* nocapture %out, <8 x i32> %in) nounwind {
 entry:
   store <8 x i32> %in, <8 x i32> addrspace(1)* %out, align 4
   ret void
@@ -422,7 +422,7 @@ entry:
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[5].W
 ; EG-DAG: T{{[0-9]\.[XYZW]}}, KC0[6].X
 ; SI: s_load_dwordx8 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x11
-define void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
+define amdgpu_kernel void @v8f32_arg(<8 x float> addrspace(1)* nocapture %out, <8 x float> %in) nounwind {
 entry:
   store <8 x float> %in, <8 x float> addrspace(1)* %out, align 4
   ret void
@@ -478,7 +478,7 @@ entry:
 ; HSA-VI: flat_load_ubyte
 ; HSA-VI: flat_load_ubyte
 ; HSA-VI: flat_load_ubyte
-define void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
+define amdgpu_kernel void @v16i8_arg(<16 x i8> addrspace(1)* %out, <16 x i8> %in) {
 entry:
   store <16 x i8> %in, <16 x i8> addrspace(1)* %out
   ret void
@@ -534,7 +534,7 @@ entry:
 ; HSA-VI: flat_load_ushort
 ; HSA-VI: flat_load_ushort
 ; HSA-VI: flat_load_ushort
-define void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
+define amdgpu_kernel void @v16i16_arg(<16 x i16> addrspace(1)* %out, <16 x i16> %in) {
 entry:
   store <16 x i16> %in, <16 x i16> addrspace(1)* %out
   ret void
@@ -561,7 +561,7 @@ entry:
 ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
 ; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
-define void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
+define amdgpu_kernel void @v16i32_arg(<16 x i32> addrspace(1)* nocapture %out, <16 x i32> %in) nounwind {
 entry:
   store <16 x i32> %in, <16 x i32> addrspace(1)* %out, align 4
   ret void
@@ -588,7 +588,7 @@ entry:
 ; SI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x19
 ; MESA-VI: s_load_dwordx16 s{{\[[0-9]+:[0-9]+\]}}, s[0:1], 0x64
 ; HSA-VI: s_load_dwordx16 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x40
-define void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
+define amdgpu_kernel void @v16f32_arg(<16 x float> addrspace(1)* nocapture %out, <16 x float> %in) nounwind {
 entry:
   store <16 x float> %in, <16 x float> addrspace(1)* %out, align 4
   ret void
@@ -599,7 +599,7 @@ entry:
 ; MESA-GCN: s_load_dwordx2
 ; MESA-GCN: buffer_store_dwordx2
 ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
-define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
+define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
   store i64 %a, i64 addrspace(1)* %out, align 8
   ret void
 }
@@ -611,7 +611,7 @@ define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
 ; MESA-VI-DAG: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[0:1], 0x2c
 ; MESA-GCN: buffer_store_dwordx2
 ; HSA-VI: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}], s[4:5], 0x8
-define void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
+define amdgpu_kernel void @f64_kernel_arg(double addrspace(1)* %out, double  %in) {
 entry:
   store double %in, double addrspace(1)* %out
   ret void
@@ -621,7 +621,7 @@ entry:
 ; XGCN: s_load_dwordx2
 ; XGCN: s_load_dwordx2
 ; XGCN: buffer_store_dwordx2
-; define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
+; define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
 ;   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
 ;   ret void
 ; }
@@ -631,7 +631,7 @@ entry:
 ; SI: v_and_b32_e32
 ; SI: buffer_store_byte
 ; SI: s_endpgm
-define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
   store i1 %x, i1 addrspace(1)* %out, align 1
   ret void
 }
@@ -640,7 +640,7 @@ define void @i1_arg(i1 addrspace(1)* %out, i1 %x) nounwind {
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
   %ext = zext i1 %x to i32
   store i32 %ext, i32 addrspace(1)* %out, align 4
   ret void
@@ -650,7 +650,7 @@ define void @i1_arg_zext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
-define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
   %ext = zext i1 %x to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
   ret void
@@ -660,7 +660,7 @@ define void @i1_arg_zext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
   %ext = sext i1 %x to i32
   store i32 %ext, i32addrspace(1)* %out, align 4
   ret void
@@ -672,7 +672,7 @@ define void @i1_arg_sext_i32(i32 addrspace(1)* %out, i1 %x) nounwind {
 ; SI: v_ashrrev_i32
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
-define void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
+define amdgpu_kernel void @i1_arg_sext_i64(i64 addrspace(1)* %out, i1 %x) nounwind {
   %ext = sext i1 %x to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
   ret void
diff --git a/test/CodeGen/AMDGPU/large-alloca-compute.ll b/test/CodeGen/AMDGPU/large-alloca-compute.ll
index 4f6dbf9dc2bf7db0e340fc61a129b0d0cba67054..4af37d8da966232298b5cc2984e98ebf3a64674c 100644
--- a/test/CodeGen/AMDGPU/large-alloca-compute.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-compute.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
 ; RUN: llc -march=amdgcn -mcpu=carrizo --show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 --show-mc-encoding < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=ALL %s
 ; RUN: llc -march=amdgcn -mcpu=bonaire -mtriple=amdgcn-unknown-amdhsa < %s -mattr=-flat-for-global | FileCheck -check-prefix=GCNHSA -check-prefix=CIHSA -check-prefix=ALL %s
 ; RUN: llc -march=amdgcn -mcpu=carrizo -mtriple=amdgcn-unknown-amdhsa -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCNHSA -check-prefix=VIHSA -check-prefix=ALL %s
 
@@ -14,6 +15,7 @@
 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, -1
 ; CI-DAG: s_mov_b32 s{{[0-9]+}}, 0xe8f000
 ; VI-DAG: s_mov_b32 s{{[0-9]+}}, 0xe80000
+; GFX9-DAG: s_mov_b32 s{{[0-9]+}}, 0xe00000
 
 
 ; GCNHSA: .amd_kernel_code_t
@@ -46,7 +48,7 @@
 
 ; Scratch size = alloca size + emergency stack slot
 ; ALL: ; ScratchSize: 32772
-define void @large_alloca_compute_shader(i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @large_alloca_compute_shader(i32 %x, i32 %y) #0 {
   %large = alloca [8192 x i32], align 4
   %gep = getelementptr [8192 x i32], [8192 x i32]* %large, i32 0, i32 8191
   store volatile i32 %x, i32* %gep
diff --git a/test/CodeGen/AMDGPU/large-alloca-graphics.ll b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
index ea9754a390b6defcadb8c53da4d1b5fa00d07a2a..28b819a6374b2f38eda086044c5a0523b8b6b71b 100644
--- a/test/CodeGen/AMDGPU/large-alloca-graphics.ll
+++ b/test/CodeGen/AMDGPU/large-alloca-graphics.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=ALL %s
 ; RUN: llc -march=amdgcn -mcpu=carrizo -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=ALL %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=ALL %s
 
 ; ALL-LABEL: {{^}}large_alloca_pixel_shader:
 ; GCN-DAG: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
@@ -7,6 +8,7 @@
 ; GCN-DAG: s_mov_b32 s10, -1
 ; CI-DAG: s_mov_b32 s11, 0xe8f000
 ; VI-DAG: s_mov_b32 s11, 0xe80000
+; GFX9-DAG: s_mov_b32 s11, 0xe00000
 
 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s0 offen
@@ -28,6 +30,7 @@ define amdgpu_ps void @large_alloca_pixel_shader(i32 %x, i32 %y) #0 {
 ; GCN-DAG: s_mov_b32 s10, -1
 ; CI-DAG: s_mov_b32 s11, 0xe8f000
 ; VI-DAG: s_mov_b32 s11, 0xe80000
+; GFX9-DAG: s_mov_b32 s11, 0xe00000
 
 ; GCN: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
 ; GCN: buffer_load_dword {{v[0-9]+}}, {{v[0-9]+}}, s[8:11], s2 offen
diff --git a/test/CodeGen/AMDGPU/large-constant-initializer.ll b/test/CodeGen/AMDGPU/large-constant-initializer.ll
index 9975b1b7f5cc30eda161320e628a37fa77970bf0..c46d68e38ade82f240c96faa120cbb262c5a723a 100644
--- a/test/CodeGen/AMDGPU/large-constant-initializer.ll
+++ b/test/CodeGen/AMDGPU/large-constant-initializer.ll
@@ -4,7 +4,7 @@
 
 @gv = external unnamed_addr addrspace(2) constant [239 x i32], align 4
 
-define void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind {
+define amdgpu_kernel void @opencv_cvtfloat_crash(i32 addrspace(1)* %out, i32 %x) nounwind {
   %val = load i32, i32 addrspace(2)* getelementptr ([239 x i32], [239 x i32] addrspace(2)* @gv, i64 0, i64 239), align 4
   %mul12 = mul nsw i32 %val, 7
   br i1 undef, label %exit, label %bb
diff --git a/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll b/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
index 906a688febd240da16e64033b92846745b4c5290..13dd7058c50a5966c02f068d24c2ab4d280db4e7 100644
--- a/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
+++ b/test/CodeGen/AMDGPU/large-work-group-promote-alloca.ll
@@ -1,8 +1,10 @@
-; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck --check-prefix=SI --check-prefix=ALL %s
+; RUN: opt -S -mcpu=tonga -mtriple=amdgcn-unknown-unknown -amdgpu-promote-alloca < %s | FileCheck --check-prefix=CI --check-prefix=ALL %s
 
-; CHECK: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
+; SI-NOT: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
+; CI: @promote_alloca_size_63.stack = internal unnamed_addr addrspace(3) global [63 x [5 x i32]] undef, align 4
 
-define void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @promote_alloca_size_63(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -22,9 +24,9 @@ entry:
   ret void
 }
 
-; CHECK: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
+; ALL: @promote_alloca_size_256.stack = internal unnamed_addr addrspace(3) global [256 x [5 x i32]] undef, align 4
 
-define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
+define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #1 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -44,9 +46,9 @@ entry:
   ret void
 }
 
-; CHECK: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
+; ALL: @promote_alloca_size_1600.stack = internal unnamed_addr addrspace(3) global [1600 x [5 x i32]] undef, align 4
 
-define void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
+define amdgpu_kernel void @promote_alloca_size_1600(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #2 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -66,9 +68,10 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @occupancy_0(
-; CHECK: alloca [5 x i32]
-define void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
+; ALL-LABEL: @occupancy_0(
+; CI-NOT: alloca [5 x i32]
+; SI: alloca [5 x i32]
+define amdgpu_kernel void @occupancy_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #3 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -88,9 +91,10 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @occupancy_max(
-; CHECK: alloca [5 x i32]
-define void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
+; ALL-LABEL: @occupancy_max(
+; CI-NOT: alloca [5 x i32]
+; SI: alloca [5 x i32]
+define amdgpu_kernel void @occupancy_max(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #4 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -110,9 +114,11 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @occupancy_6(
-; CHECK-NOT: alloca
-define void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
+; SI-LABEL: @occupancy_6(
+; CI-LABEL: @occupancy_6(
+; SI: alloca
+; CI-NOT: alloca
+define amdgpu_kernel void @occupancy_6(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
 entry:
   %stack = alloca [42 x i8], align 4
   %tmp = load i8, i8 addrspace(1)* %in, align 1
@@ -134,9 +140,9 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @occupancy_6_over(
-; CHECK: alloca [43 x i8]
-define void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
+; ALL-LABEL: @occupancy_6_over(
+; ALL: alloca [43 x i8]
+define amdgpu_kernel void @occupancy_6_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #5 {
 entry:
   %stack = alloca [43 x i8], align 4
   %tmp = load i8, i8 addrspace(1)* %in, align 1
@@ -158,9 +164,11 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @occupancy_8(
-; CHECK-NOT: alloca
-define void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
+; SI-LABEL: @occupancy_8(
+; CI-LABEL: @occupancy_8(
+; SI: alloca
+; CI-NOT: alloca
+define amdgpu_kernel void @occupancy_8(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
 entry:
   %stack = alloca [32 x i8], align 4
   %tmp = load i8, i8 addrspace(1)* %in, align 1
@@ -182,9 +190,9 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @occupancy_8_over(
-; CHECK: alloca [33 x i8]
-define void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
+; ALL-LABEL: @occupancy_8_over(
+; ALL: alloca [33 x i8]
+define amdgpu_kernel void @occupancy_8_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #6 {
 entry:
   %stack = alloca [33 x i8], align 4
   %tmp = load i8, i8 addrspace(1)* %in, align 1
@@ -206,9 +214,11 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @occupancy_9(
-; CHECK-NOT: alloca
-define void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
+; SI-LABEL: @occupancy_9(
+; CI-LABEL: @occupancy_9(
+; SI: alloca
+; CI-NOT: alloca
+define amdgpu_kernel void @occupancy_9(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
 entry:
   %stack = alloca [28 x i8], align 4
   %tmp = load i8, i8 addrspace(1)* %in, align 1
@@ -230,9 +240,9 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @occupancy_9_over(
-; CHECK: alloca [29 x i8]
-define void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
+; ALL-LABEL: @occupancy_9_over(
+; ALL: alloca [29 x i8]
+define amdgpu_kernel void @occupancy_9_over(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %in) #7 {
 entry:
   %stack = alloca [29 x i8], align 4
   %tmp = load i8, i8 addrspace(1)* %in, align 1
diff --git a/test/CodeGen/AMDGPU/lds-alignment.ll b/test/CodeGen/AMDGPU/lds-alignment.ll
index 99334585e5896e8c94bbf2371516510f889d2d57..c23dea2b6b763bdc7cb14deb5bccb6c44997675e 100644
--- a/test/CodeGen/AMDGPU/lds-alignment.ll
+++ b/test/CodeGen/AMDGPU/lds-alignment.ll
@@ -15,7 +15,7 @@ declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace
 
 ; HSA-LABEL: {{^}}test_no_round_size_1:
 ; HSA: workgroup_group_segment_byte_size = 38
-define void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)
@@ -34,7 +34,7 @@ define void @test_no_round_size_1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #
 ; HSA-LABEL: {{^}}test_round_size_2:
 ; HSA: workgroup_group_segment_byte_size = 86
 ; HSA: group_segment_alignment = 4
-define void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 4, i1 false)
@@ -50,7 +50,7 @@ define void @test_round_size_2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
 ; HSA-LABEL: {{^}}test_round_size_2_align_8:
 ; HSA: workgroup_group_segment_byte_size = 86
 ; HSA: group_segment_alignment = 4
-define void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
@@ -65,7 +65,7 @@ define void @test_round_size_2_align_8(i8 addrspace(1)* %out, i8 addrspace(1)* %
 ; HSA-LABEL: {{^}}test_round_local_lds_and_arg:
 ; HSA: workgroup_group_segment_byte_size = 38
 ; HSA: group_segment_alignment = 4
-define void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
+define amdgpu_kernel void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
 
@@ -78,7 +78,7 @@ define void @test_round_local_lds_and_arg(i8 addrspace(1)* %out, i8 addrspace(1)
 ; HSA-LABEL: {{^}}test_round_lds_arg:
 ; HSA: workgroup_group_segment_byte_size = 0
 ; HSA: group_segment_alignment = 4
-define void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
+define amdgpu_kernel void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* %lds.arg) #1 {
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 4, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 4, i1 false)
   ret void
@@ -88,7 +88,7 @@ define void @test_round_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8
 ; HSA-LABEL: {{^}}test_high_align_lds_arg:
 ; HSA: workgroup_group_segment_byte_size = 0
 ; HSA: group_segment_alignment = 4
-define void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 {
+define amdgpu_kernel void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in, i8 addrspace(3)* align 64 %lds.arg) #1 {
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.arg, i8 addrspace(1)* %in, i32 38, i32 64, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.arg, i32 38, i32 64, i1 false)
   ret void
@@ -98,7 +98,7 @@ define void @test_high_align_lds_arg(i8 addrspace(1)* %out, i8 addrspace(1)* %in
 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order0:
 ; HSA: workgroup_group_segment_byte_size = 212
 ; HSA: group_segment_alignment = 4
-define void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.missing.align.0.bc = bitcast [39 x i32] addrspace(3)* @lds.missing.align.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.0.bc, i8 addrspace(1)* %in, i32 160, i32 4, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.0.bc, i32 160, i32 4, i1 false)
@@ -114,7 +114,7 @@ define void @test_missing_alignment_size_2_order0(i8 addrspace(1)* %out, i8 addr
 ; HSA-LABEL: {{^}}test_missing_alignment_size_2_order1:
 ; HSA: workgroup_group_segment_byte_size = 216
 ; HSA: group_segment_alignment = 4
-define void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.missing.align.1.bc = bitcast [7 x i64] addrspace(3)* @lds.missing.align.1 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.missing.align.1.bc, i8 addrspace(1)* %in, i32 56, i32 8, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.missing.align.1.bc, i32 56, i32 8, i1 false)
@@ -142,7 +142,7 @@ define void @test_missing_alignment_size_2_order1(i8 addrspace(1)* %out, i8 addr
 ; HSA-LABEL: {{^}}test_round_size_3_order0:
 ; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
-define void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
@@ -163,7 +163,7 @@ define void @test_round_size_3_order0(i8 addrspace(1)* %out, i8 addrspace(1)* %i
 ; HSA-LABEL: {{^}}test_round_size_3_order1:
 ; HSA: workgroup_group_segment_byte_size = 134
 ; HSA: group_segment_alignment = 4
-define void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align32.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align32.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align32.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align32.0.bc, i32 38, i32 8, i1 false)
@@ -184,7 +184,7 @@ define void @test_round_size_3_order1(i8 addrspace(1)* %out, i8 addrspace(1)* %i
 ; HSA-LABEL: {{^}}test_round_size_3_order2:
 ; HSA: workgroup_group_segment_byte_size = 150
 ; HSA: group_segment_alignment = 4
-define void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
@@ -205,7 +205,7 @@ define void @test_round_size_3_order2(i8 addrspace(1)* %out, i8 addrspace(1)* %i
 ; HSA-LABEL: {{^}}test_round_size_3_order3:
 ; HSA: workgroup_group_segment_byte_size = 118
 ; HSA: group_segment_alignment = 4
-define void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align16.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align16.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align16.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align16.0.bc, i32 38, i32 8, i1 false)
@@ -226,7 +226,7 @@ define void @test_round_size_3_order3(i8 addrspace(1)* %out, i8 addrspace(1)* %i
 ; HSA-LABEL: {{^}}test_round_size_3_order4:
 ; HSA: workgroup_group_segment_byte_size = 142
 ; HSA: group_segment_alignment = 4
-define void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
@@ -247,7 +247,7 @@ define void @test_round_size_3_order4(i8 addrspace(1)* %out, i8 addrspace(1)* %i
 ; HSA-LABEL: {{^}}test_round_size_3_order5:
 ; HSA: workgroup_group_segment_byte_size = 126
 ; HSA: group_segment_alignment = 4
-define void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @test_round_size_3_order5(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
   %lds.align8.0.bc = bitcast [38 x i8] addrspace(3)* @lds.align8.0 to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %lds.align8.0.bc, i8 addrspace(1)* %in, i32 38, i32 8, i1 false)
   call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out, i8 addrspace(3)* %lds.align8.0.bc, i32 38, i32 8, i1 false)
diff --git a/test/CodeGen/AMDGPU/lds-initializer.ll b/test/CodeGen/AMDGPU/lds-initializer.ll
index 9875814b03d36d4c00fe83853986480c42139df7..254673d8a1e4cba0d91d0a998faa2ac2873450ba 100644
--- a/test/CodeGen/AMDGPU/lds-initializer.ll
+++ b/test/CodeGen/AMDGPU/lds-initializer.ll
@@ -5,7 +5,7 @@
 
 @lds = addrspace(3) global [8 x i32] [i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8]
 
-define void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) {
+define amdgpu_kernel void @load_init_lds_global(i32 addrspace(1)* %out, i1 %p) {
  %gep = getelementptr [8 x i32], [8 x i32] addrspace(3)* @lds, i32 0, i32 10
   %ld = load i32, i32 addrspace(3)* %gep
   store i32 %ld, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll b/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
index 078d6330ce04e25d9931d9c769c88070a054d43e..1b3eeed3005c6da98f8a441b54b1a095088715bb 100644
--- a/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
+++ b/test/CodeGen/AMDGPU/lds-m0-init-in-loop.ll
@@ -18,7 +18,7 @@
 
 ; GCN: BB0_3:
 ; GCN-NEXT: s_endpgm
-define void @copy_local_to_global_loop_m0_init(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(3)* noalias nocapture readonly %in, i32 %n) #0 {
+define amdgpu_kernel void @copy_local_to_global_loop_m0_init(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(3)* noalias nocapture readonly %in, i32 %n) #0 {
 bb:
   %tmp = icmp sgt i32 %n, 0
   br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
diff --git a/test/CodeGen/AMDGPU/lds-oqap-crash.ll b/test/CodeGen/AMDGPU/lds-oqap-crash.ll
index 6ff6fc3d7afcd6cecef5458925a5bca45fb02432..fff2a92007293f13bed759b405fac550e44d5cc7 100644
--- a/test/CodeGen/AMDGPU/lds-oqap-crash.ll
+++ b/test/CodeGen/AMDGPU/lds-oqap-crash.ll
@@ -10,7 +10,7 @@
 ; reads and writes are bundled together in the same instruction.
 
 ; CHECK: {{^}}lds_crash:
-define void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @lds_crash(i32 addrspace(1)* %out, i32 addrspace(3)* %in, i32 %a, i32 %b, i32 %c) {
 entry:
   %0 = load i32, i32 addrspace(3)* %in
   ; This block needs to be > 115 ISA instructions to hit the bug,
diff --git a/test/CodeGen/AMDGPU/lds-output-queue.ll b/test/CodeGen/AMDGPU/lds-output-queue.ll
index abe472e423fca58464380b76cf93e3771070f91f..8b7e9e6d6aa8b9e563f5ef3eef705758a259958b 100644
--- a/test/CodeGen/AMDGPU/lds-output-queue.ll
+++ b/test/CodeGen/AMDGPU/lds-output-queue.ll
@@ -10,7 +10,7 @@
 
 @local_mem = internal unnamed_addr addrspace(3) global [2 x i32] undef, align 4
 
-define void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
+define amdgpu_kernel void @lds_input_queue(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %index) {
 entry:
   %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 %index
   %1 = load i32, i32 addrspace(3)* %0
@@ -88,7 +88,7 @@ declare void @llvm.r600.group.barrier() nounwind convergent
 ; CHECK: LDS_READ_RET
 ; CHECK-NOT: ALU clause
 ; CHECK: MOV * T{{[0-9]\.[XYZW]}}, OQAP
-define void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @local_global_alias(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = getelementptr inbounds [2 x i32], [2 x i32] addrspace(3)* @local_mem, i32 0, i32 0
   %1 = load i32, i32 addrspace(3)* %0
diff --git a/test/CodeGen/AMDGPU/lds-size.ll b/test/CodeGen/AMDGPU/lds-size.ll
index 1607713090e3c93671404a3d4376e555ba73eb19..c65817abd489da4cdb5ec6e7aa13cb6dcdc48d70 100644
--- a/test/CodeGen/AMDGPU/lds-size.ll
+++ b/test/CodeGen/AMDGPU/lds-size.ll
@@ -14,7 +14,7 @@
 ; GCN: ; LDSByteSize: 4 bytes/workgroup (compile time only)
 @lds = internal unnamed_addr addrspace(3) global i32 undef, align 4
 
-define void @test(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %0 = icmp eq i32 %cond, 0
   br i1 %0, label %if, label %else
diff --git a/test/CodeGen/AMDGPU/lds-zero-initializer.ll b/test/CodeGen/AMDGPU/lds-zero-initializer.ll
index cb5d73fb0d8b979dd8bbbd1d884c6b0940881374..53c1c727a19d2c28a5223dbe1b977ca9da6b4eb9 100644
--- a/test/CodeGen/AMDGPU/lds-zero-initializer.ll
+++ b/test/CodeGen/AMDGPU/lds-zero-initializer.ll
@@ -5,7 +5,7 @@
 
 @lds = addrspace(3) global [256 x i32] zeroinitializer
 
-define void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) {
+define amdgpu_kernel void @load_zeroinit_lds_global(i32 addrspace(1)* %out, i1 %p) {
  %gep = getelementptr [256 x i32], [256 x i32] addrspace(3)* @lds, i32 0, i32 10
   %ld = load i32, i32 addrspace(3)* %gep
   store i32 %ld, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll b/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll
index 4244c48d240e69b0f2d7da2fb5b8e3293d292a5e..e85a1b690af60e198b49c6ea89ba098d65d36ebe 100644
--- a/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll
+++ b/test/CodeGen/AMDGPU/legalizedag-bug-expand-setcc.ll
@@ -11,7 +11,7 @@
 ; CHECK: {{^}}setcc_expand:
 ; CHECK: SET
 ; CHECK-NOT: CND
-define void @setcc_expand(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @setcc_expand(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp eq i32 %in, 5
   br i1 %0, label %IF, label %ENDIF
diff --git a/test/CodeGen/AMDGPU/literals.ll b/test/CodeGen/AMDGPU/literals.ll
index 82fbb7f46186fed98b9539f04428b943bc6e525d..1c546ba9f74ba5c2a0cde55f8a3af8430cb792fc 100644
--- a/test/CodeGen/AMDGPU/literals.ll
+++ b/test/CodeGen/AMDGPU/literals.ll
@@ -10,7 +10,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: ADD_INT * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y
 ; CHECK-NEXT: 5
-define void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @i32_literal(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = add i32 5, %in
   store i32 %0, i32 addrspace(1)* %out
@@ -27,7 +27,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: ADD * {{\** *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, literal.y
 ; CHECK-NEXT: 1084227584(5.0
-define void @float_literal(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @float_literal(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fadd float 5.0, %in
   store float %0, float addrspace(1)* %out
@@ -41,7 +41,7 @@ entry:
 ; CHECK-NEXT: MOV {{\** *}}T[[GPR]].Z, 0.0
 ; CHECK-NEXT: MOV {{\** *}}T[[GPR]].W, 0.0
 
-define void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @inline_literal_reg_sequence(<4 x i32> addrspace(1)* %out) {
 entry:
   store <4 x i32> <i32 0, i32 0, i32 0, i32 0>, <4 x i32> addrspace(1)* %out
   ret void
@@ -52,7 +52,7 @@ entry:
 ; CHECK-NEXT: DOT4 T[[GPR]].Y (MASKED), 1.0
 ; CHECK-NEXT: DOT4 T[[GPR]].Z (MASKED), 1.0
 ; CHECK-NEXT: DOT4 * T[[GPR]].W (MASKED), 1.0
-define void @inline_literal_dot4(float addrspace(1)* %out) {
+define amdgpu_kernel void @inline_literal_dot4(float addrspace(1)* %out) {
 entry:
   %0 = call float @llvm.r600.dot4(<4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
   store float %0, float addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/liveness.mir b/test/CodeGen/AMDGPU/liveness.mir
index 112c3f8e69a622c45a23b6adfdb0dcdbc02505c5..48762e3f2ab4255ee353ffb37633516bf695a60d 100644
--- a/test/CodeGen/AMDGPU/liveness.mir
+++ b/test/CodeGen/AMDGPU/liveness.mir
@@ -8,7 +8,7 @@
 # Should see three distinct value numbers:
 # CHECK: %vreg0 [{{.*}}:0)[{{.*}}:1)[{{.*}}:2) 0@{{[0-9]+[Berd]}} 1@{{[0-9]+[Berd]}} 2@{{[0-9]+B-phi}}
 --- |
-  define void @test0() { ret void }
+  define amdgpu_kernel void @test0() { ret void }
 ...
 ---
 name: test0
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll
deleted file mode 100644
index 77dd4b1349825e266ec4b935c241fc6cbb600b9c..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.i32.ll
+++ /dev/null
@@ -1,437 +0,0 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=redwood -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone
-
-; FUNC-LABEL: {{^}}bfe_i32_arg_arg_arg:
-; SI: v_bfe_i32
-; EG: BFE_INT
-; EG: encoding: [{{[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+,[x0-9a-f]+}},0xac
-define void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_arg_arg_imm:
-; SI: v_bfe_i32
-; EG: BFE_INT
-define void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 123) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_arg_imm_arg:
-; SI: v_bfe_i32
-; EG: BFE_INT
-define void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 123, i32 %src2) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_imm_arg_arg:
-; SI: v_bfe_i32
-; EG: BFE_INT
-define void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 123, i32 %src1, i32 %src2) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}v_bfe_print_arg:
-; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8
-define void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) nounwind {
-  %load = load i32, i32 addrspace(1)* %src0, align 4
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 2, i32 8) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_reg_offset:
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 %src1, i32 0) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_arg_0_width_imm_offset:
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.i32(i32 %src0, i32 8, i32 0) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_test_6:
-; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; SI: s_endpgm
-define void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %shl = shl i32 %x, 31
-  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 1, i32 31)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_test_7:
-; SI-NOT: shl
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-define void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %shl = shl i32 %x, 31
-  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 0, i32 31)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_test_8:
-; SI: buffer_load_dword
-; SI: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
-; SI: s_endpgm
-define void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %shl = shl i32 %x, 31
-  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_test_9:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 31, i32 1)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_test_10:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 1, i32 31)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_test_11:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 8, i32 24)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_test_12:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 24, i32 8)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_test_13:
-; SI: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %shl = ashr i32 %x, 31
-  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_test_14:
-; SI-NOT: lshr
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %shl = lshr i32 %x, 31
-  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shl, i32 31, i32 1)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_0:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 0) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_1:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 12334, i32 0, i32 0) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_2:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 0, i32 0, i32 1) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_3:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 1, i32 0, i32 1) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_4:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 0, i32 1) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_5:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 7, i32 1) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_6:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0xffffff80
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 128, i32 0, i32 8) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_7:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 0, i32 8) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_8:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 127, i32 6, i32 8) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_9:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65536, i32 16, i32 8) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_10:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 65535, i32 16, i32 16) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_11:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -6
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 4) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_12:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 31, i32 1) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_13:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 131070, i32 16, i32 16) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_14:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 2, i32 30) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_15:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 160, i32 4, i32 28) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_16:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 4294967295, i32 1, i32 7) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_17:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 1, i32 31) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_i32_constant_fold_test_18:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
-  %bfe_i32 = call i32 @llvm.AMDGPU.bfe.i32(i32 255, i32 31, i32 1) nounwind readnone
-  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_sext_in_reg_i24:
-; SI: buffer_load_dword [[LOAD:v[0-9]+]],
-; SI-NOT: v_lshl
-; SI-NOT: v_ashr
-; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24
-; SI: buffer_store_dword [[BFE]],
-define void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %x, i32 0, i32 24)
-  %shl = shl i32 %bfe, 8
-  %ashr = ashr i32 %shl, 8
-  store i32 %ashr, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: @simplify_demanded_bfe_sdiv
-; SI: buffer_load_dword [[LOAD:v[0-9]+]]
-; SI: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 1, 16
-; SI: v_lshrrev_b32_e32 [[TMP0:v[0-9]+]], 31, [[BFE]]
-; SI: v_add_i32_e32 [[TMP1:v[0-9]+]], vcc, [[TMP0]], [[BFE]]
-; SI: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]]
-; SI: buffer_store_dword [[TMP2]]
-define void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %src = load i32, i32 addrspace(1)* %in, align 4
-  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %src, i32 1, i32 16) nounwind readnone
-  %div = sdiv i32 %bfe, 2
-  store i32 %div, i32 addrspace(1)* %out, align 4
-  ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll
deleted file mode 100644
index ee47b14c496d090797255d275b5a67252bfb4605..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.bfe.u32.ll
+++ /dev/null
@@ -1,631 +0,0 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=FUNC -check-prefix=GCN %s
-; RUN: llc -march=r600 -mcpu=redwood -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare i32 @llvm.AMDGPU.bfe.u32(i32, i32, i32) nounwind readnone
-
-; FUNC-LABEL: {{^}}bfe_u32_arg_arg_arg:
-; SI: v_bfe_u32
-; EG: BFE_UINT
-define void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 %src1) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_arg_arg_imm:
-; SI: v_bfe_u32
-; EG: BFE_UINT
-define void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 123) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_arg_imm_arg:
-; SI: v_bfe_u32
-; EG: BFE_UINT
-define void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 123, i32 %src2) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_imm_arg_arg:
-; SI: v_bfe_u32
-; EG: BFE_UINT
-define void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 123, i32 %src1, i32 %src2) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_reg_offset:
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 %src1, i32 0) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_arg_0_width_imm_offset:
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %src0, i32 8, i32 0) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_zextload_i8:
-; SI: buffer_load_ubyte
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
-  %load = load i8, i8 addrspace(1)* %in
-  %ext = zext i8 %load to i32
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8:
-; GCN: buffer_load_dword
-; SI: v_add_i32
-; SI-NEXT: v_and_b32_e32
-; FIXME: Should be using s_add_i32
-; VI: v_add_i32
-; VI-NEXT: v_and_b32_e32
-; SI-NOT: {{[^@]}}bfe
-; GCN: s_endpgm
-define void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %load = load i32, i32 addrspace(1)* %in, align 4
-  %add = add i32 %load, 1
-  %ext = and i32 %add, 255
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 8)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16:
-; SI: buffer_load_dword
-; SI: v_add_i32
-; SI-NEXT: v_and_b32_e32
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %load = load i32, i32 addrspace(1)* %in, align 4
-  %add = add i32 %load, 1
-  %ext = and i32 %add, 65535
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 0, i32 16)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_1:
-; SI: buffer_load_dword
-; SI: v_add_i32
-; SI: bfe
-; SI: s_endpgm
-define void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %load = load i32, i32 addrspace(1)* %in, align 4
-  %add = add i32 %load, 1
-  %ext = and i32 %add, 255
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 1, i32 8)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_3:
-; SI: buffer_load_dword
-; SI: v_add_i32
-; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8
-; SI-NEXT: bfe
-; SI: s_endpgm
-define void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %load = load i32, i32 addrspace(1)* %in, align 4
-  %add = add i32 %load, 1
-  %ext = and i32 %add, 255
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 3, i32 8)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_7:
-; SI: buffer_load_dword
-; SI: v_add_i32
-; SI-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80
-; SI-NEXT: bfe
-; SI: s_endpgm
-define void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %load = load i32, i32 addrspace(1)* %in, align 4
-  %add = add i32 %load, 1
-  %ext = and i32 %add, 255
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 7, i32 8)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_zext_in_reg_i16_offset_8:
-; SI: buffer_load_dword
-; SI: v_add_i32
-; SI-NEXT: bfe
-; SI: s_endpgm
-define void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %load = load i32, i32 addrspace(1)* %in, align 4
-  %add = add i32 %load, 1
-  %ext = and i32 %add, 65535
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %ext, i32 8, i32 8)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_1:
-; SI: buffer_load_dword
-; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
-; SI: s_endpgm
-; EG: AND_INT T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}}, 1,
-define void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 0, i32 1)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-define void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %shl = shl i32 %x, 31
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 8)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-define void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %shl = shl i32 %x, 31
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 1)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_4:
-; SI-NOT: lshl
-; SI-NOT: shr
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-define void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %shl = shl i32 %x, 31
-  %shr = lshr i32 %shl, 31
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 31, i32 1)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_5:
-; SI: buffer_load_dword
-; SI-NOT: lshl
-; SI-NOT: shr
-; SI: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1
-; SI: s_endpgm
-define void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %shl = shl i32 %x, 31
-  %shr = ashr i32 %shl, 31
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shr, i32 0, i32 1)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_6:
-; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; SI: s_endpgm
-define void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %shl = shl i32 %x, 31
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 1, i32 31)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_7:
-; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %shl = shl i32 %x, 31
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 0, i32 31)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_8:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %shl = shl i32 %x, 31
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_9:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 31, i32 1)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_10:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 1, i32 31)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_11:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 8, i32 24)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_12:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %x, i32 24, i32 8)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_13:
-; V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %shl = ashr i32 %x, 31
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_test_14:
-; SI-NOT: lshr
-; SI-NOT: {{[^@]}}bfe
-; SI: s_endpgm
-define void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %shl = lshr i32 %x, 31
-  %bfe = call i32 @llvm.AMDGPU.bfe.u32(i32 %shl, i32 31, i32 1)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_0:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 0) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_1:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 12334, i32 0, i32 0) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_2:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 0, i32 0, i32 1) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_3:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 1, i32 0, i32 1) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_4:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 0, i32 1) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_5:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 7, i32 1) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_6:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x80
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 128, i32 0, i32 8) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_7:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 0, i32 8) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_8:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 127, i32 6, i32 8) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_9:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65536, i32 16, i32 8) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_10:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 65535, i32 16, i32 16) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_11:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 4) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_12:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 31, i32 1) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_13:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 131070, i32 16, i32 16) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_14:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 40
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 2, i32 30) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_15:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 160, i32 4, i32 28) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_16:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 4294967295, i32 1, i32 7) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_17:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 1, i32 31) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_u32_constant_fold_test_18:
-; SI-NOT: {{[^@]}}bfe
-; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
-; SI: buffer_store_dword [[VREG]],
-; SI: s_endpgm
-; EG-NOT: BFE
-define void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) nounwind {
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 255, i32 31, i32 1) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; Make sure that SimplifyDemandedBits doesn't cause the and to be
-; reduced to the bits demanded by the bfe.
-
-; XXX: The operand to v_bfe_u32 could also just directly be the load register.
-; FUNC-LABEL: {{^}}simplify_bfe_u32_multi_use_arg:
-; SI: buffer_load_dword [[ARG:v[0-9]+]]
-; SI: v_and_b32_e32 [[AND:v[0-9]+]], 63, [[ARG]]
-; SI: v_bfe_u32 [[BFE:v[0-9]+]], [[AND]], 2, 2
-; SI-DAG: buffer_store_dword [[AND]]
-; SI-DAG: buffer_store_dword [[BFE]]
-; SI: s_endpgm
-define void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
-                                            i32 addrspace(1)* %out1,
-                                            i32 addrspace(1)* %in) nounwind {
-  %src = load i32, i32 addrspace(1)* %in, align 4
-  %and = and i32 %src, 63
-  %bfe_u32 = call i32 @llvm.AMDGPU.bfe.u32(i32 %and, i32 2, i32 2) nounwind readnone
-  store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4
-  store i32 %and, i32 addrspace(1)* %out1, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}lshr_and:
-; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
-; SI: buffer_store_dword
-define void @lshr_and(i32 addrspace(1)* %out, i32 %a) nounwind {
-  %b = lshr i32 %a, 6
-  %c = and i32 %b, 7
-  store i32 %c, i32 addrspace(1)* %out, align 8
-  ret void
-}
-
-; FUNC-LABEL: {{^}}v_lshr_and:
-; SI: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3
-; SI: buffer_store_dword
-define void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
-  %c = lshr i32 %a, %b
-  %d = and i32 %c, 7
-  store i32 %d, i32 addrspace(1)* %out, align 8
-  ret void
-}
-
-; FUNC-LABEL: {{^}}and_lshr:
-; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
-; SI: buffer_store_dword
-define void @and_lshr(i32 addrspace(1)* %out, i32 %a) nounwind {
-  %b = and i32 %a, 448
-  %c = lshr i32 %b, 6
-  store i32 %c, i32 addrspace(1)* %out, align 8
-  ret void
-}
-
-; FUNC-LABEL: {{^}}and_lshr2:
-; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
-; SI: buffer_store_dword
-define void @and_lshr2(i32 addrspace(1)* %out, i32 %a) nounwind {
-  %b = and i32 %a, 511
-  %c = lshr i32 %b, 6
-  store i32 %c, i32 addrspace(1)* %out, align 8
-  ret void
-}
-
-; FUNC-LABEL: {{^}}shl_lshr:
-; SI: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002
-; SI: buffer_store_dword
-define void @shl_lshr(i32 addrspace(1)* %out, i32 %a) nounwind {
-  %b = shl i32 %a, 9
-  %c = lshr i32 %b, 11
-  store i32 %c, i32 addrspace(1)* %out, align 8
-  ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
deleted file mode 100644
index 2336109f4dadf1ea9a2783845c42b8083b4dba93..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.clamp.ll
+++ /dev/null
@@ -1,56 +0,0 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
-
-declare float @llvm.fabs.f32(float) nounwind readnone
-declare float @llvm.AMDGPU.clamp.f32(float, float, float) nounwind readnone
-
-; FUNC-LABEL: {{^}}clamp_0_1_f32:
-; SI: s_load_dword [[ARG:s[0-9]+]],
-; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], [[ARG]], 0 clamp{{$}}
-; SI: buffer_store_dword [[RESULT]]
-; SI: s_endpgm
-
-; EG: MOV_SAT
-define void @clamp_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
-  %clamp = call float @llvm.AMDGPU.clamp.f32(float %src, float 0.0, float 1.0) nounwind readnone
-  store float %clamp, float addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}clamp_fabs_0_1_f32:
-; SI: s_load_dword [[ARG:s[0-9]+]],
-; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], |[[ARG]]|, 0 clamp{{$}}
-; SI: buffer_store_dword [[RESULT]]
-; SI: s_endpgm
-define void @clamp_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
-  %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone
-  %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fabs, float 0.0, float 1.0) nounwind readnone
-  store float %clamp, float addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}clamp_fneg_0_1_f32:
-; SI: s_load_dword [[ARG:s[0-9]+]],
-; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], -[[ARG]], 0 clamp{{$}}
-; SI: buffer_store_dword [[RESULT]]
-; SI: s_endpgm
-define void @clamp_fneg_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
-  %src.fneg = fsub float -0.0, %src
-  %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg, float 0.0, float 1.0) nounwind readnone
-  store float %clamp, float addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}clamp_fneg_fabs_0_1_f32:
-; SI: s_load_dword [[ARG:s[0-9]+]],
-; SI: v_add_f32_e64 [[RESULT:v[0-9]+]], -|[[ARG]]|, 0 clamp{{$}}
-; SI: buffer_store_dword [[RESULT]]
-; SI: s_endpgm
-define void @clamp_fneg_fabs_0_1_f32(float addrspace(1)* %out, float %src) nounwind {
-  %src.fabs = call float @llvm.fabs.f32(float %src) nounwind readnone
-  %src.fneg.fabs = fsub float -0.0, %src.fabs
-  %clamp = call float @llvm.AMDGPU.clamp.f32(float %src.fneg.fabs, float 0.0, float 1.0) nounwind readnone
-  store float %clamp, float addrspace(1)* %out, align 4
-  ret void
-}
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll b/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
index 59997d27683d98eda76162e562c9d4e695061e4b..595f632b493dc76bdbdf872c7068b0acd4306ccf 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
+++ b/test/CodeGen/AMDGPU/llvm.AMDGPU.kill.ll
@@ -4,15 +4,14 @@
 ; SI-LABEL: {{^}}kill_gs_const:
 ; SI-NOT: v_cmpx_le_f32
 ; SI: s_mov_b64 exec, 0
-
 define amdgpu_gs void @kill_gs_const() {
 main_body:
-  %0 = icmp ule i32 0, 3
-  %1 = select i1 %0, float 1.000000e+00, float -1.000000e+00
-  call void @llvm.AMDGPU.kill(float %1)
-  %2 = icmp ule i32 3, 0
-  %3 = select i1 %2, float 1.000000e+00, float -1.000000e+00
-  call void @llvm.AMDGPU.kill(float %3)
+  %tmp = icmp ule i32 0, 3
+  %tmp1 = select i1 %tmp, float 1.000000e+00, float -1.000000e+00
+  call void @llvm.AMDGPU.kill(float %tmp1)
+  %tmp2 = icmp ule i32 3, 0
+  %tmp3 = select i1 %tmp2, float 1.000000e+00, float -1.000000e+00
+  call void @llvm.AMDGPU.kill(float %tmp3)
   ret void
 }
 
@@ -21,16 +20,16 @@ main_body:
 ; SI: v_cmp_gt_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], 0, v{{[0-9]+}}
 ; SI: v_cmpx_le_f32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1.0, [[CMP]]
-define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) {
+define amdgpu_ps void @kill_vcc_implicit_def([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) {
 entry:
-  %tmp0 = fcmp olt float %13, 0.0
-  call void @llvm.AMDGPU.kill(float %14)
-  %tmp1 = select i1 %tmp0, float 1.0, float 0.0
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 1, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1)
+  %tmp0 = fcmp olt float %arg13, 0.000000e+00
+  call void @llvm.AMDGPU.kill(float %arg14)
+  %tmp1 = select i1 %tmp0, float 1.000000e+00, float 0.000000e+00
+  call void @llvm.amdgcn.exp.f32(i32 1, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
   ret void
 }
 
-declare void @llvm.AMDGPU.kill(float)
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare void @llvm.AMDGPU.kill(float) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 
-!0 = !{!"const", null, i32 1}
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.export.ll b/test/CodeGen/AMDGPU/llvm.SI.export.ll
deleted file mode 100644
index 23a32dcfd9439726c90658fc1fd0c52ba0e35e7d..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.export.ll
+++ /dev/null
@@ -1,237 +0,0 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #0
-
-; GCN-LABEL: {{^}}test_export_zeroes:
-; GCN: exp mrt0 off, off, off, off{{$}}
-; GCN: exp mrt0 off, off, off, off done{{$}}
-define void @test_export_zeroes() #0 {
-
-  call void @llvm.SI.export(i32 0, i32 0, i32 0, i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0)
-  call void @llvm.SI.export(i32 0, i32 0, i32 1, i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0)
-  ret void
-}
-
-; FIXME: Should not set up registers for the unused source registers.
-
-; GCN-LABEL: {{^}}test_export_en_src0:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}}
-define void @test_export_en_src0() #0 {
-  call void @llvm.SI.export(i32 1, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_export_en_src1:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}}
-define void @test_export_en_src1() #0 {
-  call void @llvm.SI.export(i32 2, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_export_en_src2:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}}
-define void @test_export_en_src2() #0 {
-  call void @llvm.SI.export(i32 4, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_export_en_src3:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}}
-define void @test_export_en_src3() #0 {
-  call void @llvm.SI.export(i32 8, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_export_en_src0_src1:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}}
-define void @test_export_en_src0_src1() #0 {
-  call void @llvm.SI.export(i32 3, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_export_en_src0_src2:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}}
-define void @test_export_en_src0_src2() #0 {
-  call void @llvm.SI.export(i32 5, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_export_en_src0_src3:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}}
-; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}}
-define void @test_export_en_src0_src3() #0 {
-  call void @llvm.SI.export(i32 9, i32 0, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  call void @llvm.SI.export(i32 9, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_export_en_src0_src1_src2_src3:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_en_src0_src1_src2_src3() #0 {
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_export_mrt7:
-; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0.5
-; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}}
-; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}}
-define void @test_export_mrt7() #0 {
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 7, i32 0, float 0.5, float 0.5, float 0.5, float 0.5)
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 7, i32 0, float 0.5, float 0.5, float 0.5, float 0.5)
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_export_z:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_z() #0 {
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 8, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 8, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_export_null:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_null() #0 {
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 9, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 9, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_export_reserved10:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_reserved10() #0 {
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 10, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 10, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_export_reserved11:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_reserved11() #0 {
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 11, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 11, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_export_pos0:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_pos0() #0 {
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 12, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_export_pos3:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_pos3() #0 {
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 15, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 15, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_export_param0:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_param0() #0 {
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 32, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_export_param31:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
-; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_param31() #0 {
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 63, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 63, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  ret void
-}
-
-; GCN-LABEL: {{^}}test_export_vm:
-; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 1.0
-; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 2.0
-; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
-; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}}
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}}
-define void @test_export_vm() #0 {
-  call void @llvm.SI.export(i32 15, i32 1, i32 0, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float 1.0, float 2.0, float 0.5, float 4.0)
-  ret void
-}
-
-attributes #0 = { nounwind "ShaderType"="0" }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll b/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll
deleted file mode 100644
index 9e7c3c2e620167f07841df9ed23b373c87e1b6a3..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.fs.interp.ll
+++ /dev/null
@@ -1,59 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s
-;RUN: llc < %s -march=amdgcn -mcpu=kabini -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s
-;RUN: llc < %s -march=amdgcn -mcpu=stoney -verify-machineinstrs | FileCheck --check-prefix=GCN --check-prefix=16BANK %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefix=GCN %s
-
-;GCN-LABEL: {{^}}main:
-;GCN-NOT: s_wqm
-;GCN: s_mov_b32 m0
-;GCN-DAG: v_interp_mov_f32
-;GCN-DAG: v_interp_p1_f32
-;GCN-DAG: v_interp_p2_f32
-
-define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>) {
-main_body:
-  %5 = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %3)
-  %6 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %3, <2 x i32> %4)
-  %7 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %3, <2 x i32> %4)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %5, float %6, float %7, float %7)
-  ret void
-}
-
-; Thest that v_interp_p1 uses different source and destination registers
-; on 16 bank LDS chips.
-
-; 16BANK-LABEL: {{^}}v_interp_p1_bank16_bug:
-; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]]
-
-define amdgpu_ps void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, i32, float, float) {
-main_body:
-  %22 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %5, <2 x i32> %7)
-  %23 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7)
-  %24 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %5, <2 x i32> %7)
-  %25 = call float @fabs(float %22)
-  %26 = call float @fabs(float %23)
-  %27 = call float @fabs(float %24)
-  %28 = call i32 @llvm.SI.packf16(float %25, float %26)
-  %29 = bitcast i32 %28 to float
-  %30 = call i32 @llvm.SI.packf16(float %27, float 1.000000e+00)
-  %31 = bitcast i32 %30 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %29, float %31, float %29, float %31)
-  ret void
-}
-
-; Function Attrs: readnone
-declare float @fabs(float) #1
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #0
-
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.constant(i32, i32, i32) #0
-
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #0
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind readnone }
-attributes #1 = { readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.gather4.ll b/test/CodeGen/AMDGPU/llvm.SI.gather4.ll
deleted file mode 100644
index aef9f660436e2d17c5d1a42480c36a5f76285f23..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.gather4.ll
+++ /dev/null
@@ -1,525 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-LABEL: {{^}}gather4_v2:
-;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_v2() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4:
-;CHECK: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_cl:
-;CHECK: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_l:
-;CHECK: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_l() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_b:
-;CHECK: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_b() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_b_cl:
-;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_b_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_b_cl_v8:
-;CHECK: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_b_cl_v8() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_lz_v2:
-;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_lz_v2() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_lz:
-;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_lz() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-
-
-;CHECK-LABEL: {{^}}gather4_o:
-;CHECK: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_o() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_cl_o:
-;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_cl_o() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_cl_o_v8:
-;CHECK: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_cl_o_v8() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_l_o:
-;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_l_o() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_l_o_v8:
-;CHECK: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_l_o_v8() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_b_o:
-;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_b_o() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_b_o_v8:
-;CHECK: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_b_o_v8() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_b_cl_o:
-;CHECK: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_b_cl_o() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_lz_o:
-;CHECK: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_lz_o() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-
-
-;CHECK-LABEL: {{^}}gather4_c:
-;CHECK: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_cl:
-;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_cl_v8:
-;CHECK: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_cl_v8() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_l:
-;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_l() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_l_v8:
-;CHECK: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_l_v8() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_b:
-;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_b() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_b_v8:
-;CHECK: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_b_v8() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_b_cl:
-;CHECK: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_b_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_lz:
-;CHECK: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_lz() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-
-
-;CHECK-LABEL: {{^}}gather4_c_o:
-;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_o() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_o_v8:
-;CHECK: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_o_v8() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_cl_o:
-;CHECK: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_cl_o() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_l_o:
-;CHECK: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_l_o() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_b_o:
-;CHECK: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_b_o() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_b_cl_o:
-;CHECK: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_b_cl_o() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_lz_o:
-;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_lz_o() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_c_lz_o_v8:
-;CHECK: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define amdgpu_ps void @gather4_c_lz_o_v8() {
-main_body:
-  %r = call <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}gather4_sgpr_bug:
-;
-; This crashed at some point due to a bug in FixSGPRCopies. Derived from the
-; report in https://bugs.freedesktop.org/show_bug.cgi?id=96877
-;
-;CHECK: s_load_dwordx4 s{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
-;CHECK: s_waitcnt lgkmcnt(0)
-;CHECK: s_mov_b32 s[[LO]], 0
-;CHECK: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LO]]:[[HI]]] dmask:0x8
-define amdgpu_ps float @gather4_sgpr_bug() {
-main_body:
-  %tmp = load <4 x i32>, <4 x i32> addrspace(2)* undef, align 16
-  %tmp1 = insertelement <4 x i32> %tmp, i32 0, i32 0
-  %tmp2 = call <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> %tmp1, i32 8, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tmp4 = extractelement <4 x float> %tmp2, i32 1
-  %tmp9 = fadd float undef, %tmp4
-  ret float %tmp9
-}
-
-declare <4 x float> @llvm.SI.gather4.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.b.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.lz.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare <4 x float> @llvm.SI.gather4.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.l.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.b.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.b.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare <4 x float> @llvm.SI.gather4.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.l.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.b.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.b.cl.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare <4 x float> @llvm.SI.gather4.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.l.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.b.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.b.cl.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.gather4.c.lz.o.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.getlod.ll b/test/CodeGen/AMDGPU/llvm.SI.getlod.ll
deleted file mode 100644
index ac34d31b97c1b65efad2b515f3c9f5473ba9b4a0..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.getlod.ll
+++ /dev/null
@@ -1,44 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-LABEL: {{^}}getlod:
-;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da
-define amdgpu_ps void @getlod() {
-main_body:
-  %r = call <4 x float> @llvm.SI.getlod.i32(i32 undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}getlod_v2:
-;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da
-define amdgpu_ps void @getlod_v2() {
-main_body:
-  %r = call <4 x float> @llvm.SI.getlod.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}getlod_v4:
-;CHECK: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da
-define amdgpu_ps void @getlod_v4() {
-main_body:
-  %r = call <4 x float> @llvm.SI.getlod.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 1, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r0, float %r1)
-  ret void
-}
-
-
-declare <4 x float> @llvm.SI.getlod.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.getlod.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.getlod.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.ll b/test/CodeGen/AMDGPU/llvm.SI.image.ll
deleted file mode 100644
index 50341e3e207f8150a90d3f474760e6ec5db86b1d..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.image.ll
+++ /dev/null
@@ -1,49 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-LABEL: {{^}}image_load:
-;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @image_load() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.load.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}image_load_mip:
-;CHECK: image_load_mip {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @image_load_mip() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32> undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}getresinfo:
-;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @getresinfo() {
-main_body:
-  %r = call <4 x float> @llvm.SI.getresinfo.i32(i32 undef, <8 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-declare <4 x float> @llvm.SI.image.load.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.load.mip.v4i32(<4 x i32>, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.getresinfo.i32(i32, <8 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll
deleted file mode 100644
index 7cdd9559994e8ddb8c1e69a359b90f600daf8a81..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.image.sample-masked.ll
+++ /dev/null
@@ -1,94 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s
-
-; CHECK-LABEL: {{^}}v1:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xd
-define amdgpu_ps void @v1(i32 %a1) {
-entry:
-  %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
-  %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %2 = extractelement <4 x float> %1, i32 0
-  %3 = extractelement <4 x float> %1, i32 2
-  %4 = extractelement <4 x float> %1, i32 3
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
-  ret void
-}
-
-; CHECK-LABEL: {{^}}v2:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xb
-define amdgpu_ps void @v2(i32 %a1) {
-entry:
-  %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
-  %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %2 = extractelement <4 x float> %1, i32 0
-  %3 = extractelement <4 x float> %1, i32 1
-  %4 = extractelement <4 x float> %1, i32 3
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
-  ret void
-}
-
-; CHECK-LABEL: {{^}}v3:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xe
-define amdgpu_ps void @v3(i32 %a1) {
-entry:
-  %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
-  %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %2 = extractelement <4 x float> %1, i32 1
-  %3 = extractelement <4 x float> %1, i32 2
-  %4 = extractelement <4 x float> %1, i32 3
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
-  ret void
-}
-
-; CHECK-LABEL: {{^}}v4:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x7
-define amdgpu_ps void @v4(i32 %a1) {
-entry:
-  %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
-  %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %2 = extractelement <4 x float> %1, i32 0
-  %3 = extractelement <4 x float> %1, i32 1
-  %4 = extractelement <4 x float> %1, i32 2
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %4, float %4)
-  ret void
-}
-
-; CHECK-LABEL: {{^}}v5:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xa
-define amdgpu_ps void @v5(i32 %a1) {
-entry:
-  %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
-  %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %2 = extractelement <4 x float> %1, i32 1
-  %3 = extractelement <4 x float> %1, i32 3
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
-  ret void
-}
-
-; CHECK-LABEL: {{^}}v6:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x6
-define amdgpu_ps void @v6(i32 %a1) {
-entry:
-  %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
-  %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %2 = extractelement <4 x float> %1, i32 1
-  %3 = extractelement <4 x float> %1, i32 2
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
-  ret void
-}
-
-; CHECK-LABEL: {{^}}v7:
-; CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x9
-define amdgpu_ps void @v7(i32 %a1) {
-entry:
-  %0 = insertelement <1 x i32> undef, i32 %a1, i32 0
-  %1 = call <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32> %0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %2 = extractelement <4 x float> %1, i32 0
-  %3 = extractelement <4 x float> %1, i32 3
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %2, float %3, float %3, float %3)
-  ret void
-}
-
-declare <4 x float> @llvm.SI.image.sample.v1i32(<1 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll
deleted file mode 100644
index 60077dc218fd3cdee6f9c4c27fabaec8f2bcf07f..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.image.sample.ll
+++ /dev/null
@@ -1,309 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-LABEL: {{^}}sample:
-;CHECK: s_wqm
-;CHECK: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_cl:
-;CHECK: s_wqm
-;CHECK: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_d:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_d() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_d_cl:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_d_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_l:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_l() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_b:
-;CHECK: s_wqm
-;CHECK: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_b() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_b_cl:
-;CHECK: s_wqm
-;CHECK: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_b_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_lz:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_lz() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_cd:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_cd() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_cd_cl:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_cd_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c:
-;CHECK: s_wqm
-;CHECK: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_cl:
-;CHECK: s_wqm
-;CHECK: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_d:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_d() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_d_cl:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_d_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_l:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_l() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_b:
-;CHECK: s_wqm
-;CHECK: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_b() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_b_cl:
-;CHECK: s_wqm
-;CHECK: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_b_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_lz:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_lz() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_cd:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_cd() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_cd_cl:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_cd_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-
-declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.d.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.d.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.b.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.lz.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.cd.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.cd.cl.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll b/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll
deleted file mode 100644
index 34d4f6825690591e7954d666c3e3033c4fa6e3c8..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.image.sample.o.ll
+++ /dev/null
@@ -1,309 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK-LABEL: {{^}}sample:
-;CHECK: s_wqm
-;CHECK: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_cl:
-;CHECK: s_wqm
-;CHECK: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_d:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_d() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_d_cl:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_d_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_l:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_l() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_b:
-;CHECK: s_wqm
-;CHECK: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_b() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_b_cl:
-;CHECK: s_wqm
-;CHECK: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_b_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_lz:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_lz() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_cd:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_cd() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_cd_cl:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_cd_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c:
-;CHECK: s_wqm
-;CHECK: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_cl:
-;CHECK: s_wqm
-;CHECK: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_d:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_d() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_d_cl:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_d_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_l:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_l() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_b:
-;CHECK: s_wqm
-;CHECK: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_b() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_b_cl:
-;CHECK: s_wqm
-;CHECK: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_b_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_lz:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_lz() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_cd:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_cd() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-;CHECK-LABEL: {{^}}sample_c_cd_cl:
-;CHECK-NOT: s_wqm
-;CHECK: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @sample_c_cd_cl() {
-main_body:
-  %r = call <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %r0 = extractelement <4 x float> %r, i32 0
-  %r1 = extractelement <4 x float> %r, i32 1
-  %r2 = extractelement <4 x float> %r, i32 2
-  %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
-  ret void
-}
-
-
-declare <4 x float> @llvm.SI.image.sample.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare <4 x float> @llvm.SI.image.sample.c.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.d.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.d.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.l.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.b.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.b.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.lz.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.cd.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-declare <4 x float> @llvm.SI.image.sample.c.cd.cl.o.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
index ee0a41f2210f18c89bb940037b3cc83663eb35e0..51f564d9690950216c29251cc0bfcf061b0ba798 100644
--- a/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
+++ b/test/CodeGen/AMDGPU/llvm.SI.load.dword.ll
@@ -34,8 +34,8 @@ main_body:
   %tmp22 = call i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8> %tmp10, <2 x i32> zeroinitializer, i32 1234, i32 65535, i32 1, i32 1, i32 1, i32 1, i32 0)
   %tmp23 = bitcast i32 %tmp22 to float
 
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp13, float %tmp15, float %tmp17, float %tmp19)
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp21, float %tmp23, float %tmp23, float %tmp23)
+  call void @llvm.amdgcn.exp.f32(i32 15, i32 12, float %tmp13, float %tmp15, float %tmp17, float %tmp19, i1 false, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 15, i32 12, float %tmp21, float %tmp23, float %tmp23, float %tmp23, i1 true, i1 false)
   ret void
 }
 
@@ -45,9 +45,10 @@ declare i32 @llvm.SI.buffer.load.dword.i32.i32(<16 x i8>, i32, i32, i32, i32, i3
 ; Function Attrs: nounwind readonly
 declare i32 @llvm.SI.buffer.load.dword.i32.v2i32(<16 x i8>, <2 x i32>, i32, i32, i32, i32, i32, i32, i32) #0
 
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
 
 attributes #0 = { nounwind readonly }
+attributes #1 = { nounwind inaccessiblememonly }
 
 !0 = !{!"const", !1, i32 1}
 !1 = !{!"tbaa root"}
diff --git a/test/CodeGen/AMDGPU/llvm.SI.packf16.ll b/test/CodeGen/AMDGPU/llvm.SI.packf16.ll
deleted file mode 100644
index 6984b4cf488adb9214c6e3b5f0ab46a16d14310d..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/llvm.SI.packf16.ll
+++ /dev/null
@@ -1,28 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-
-; GCN-LABEL: {{^}}main:
-; GCN: v_cvt_pkrtz_f16_f32
-; GCN: v_cvt_pkrtz_f16_f32
-; GCN-NOT: v_cvt_pkrtz_f16_f32
-
-define amdgpu_ps void @main(float %src) {
-main_body:
-  %p1 = call i32 @llvm.SI.packf16(float undef, float %src)
-  %p2 = call i32 @llvm.SI.packf16(float %src, float undef)
-  %p3 = call i32 @llvm.SI.packf16(float undef, float undef)
-  %f1 = bitcast i32 %p1 to float
-  %f2 = bitcast i32 %p2 to float
-  %f3 = bitcast i32 %p3 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 0, i32 0, i32 1, float undef, float %f1, float undef, float %f1)
-  call void @llvm.SI.export(i32 15, i32 1, i32 0, i32 0, i32 1, float undef, float %f2, float undef, float %f2)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %f3, float undef, float %f2)
-  ret void
-}
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #0
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
index 9c845e84bc12fa37bfd69f5451c12252c5b3f3f7..405534ea4b5736ea914602c0a8360d2cb84e5266 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll
@@ -1,21 +1,45 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
-declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32) #2
-declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32) #2
-declare i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* nocapture, i32) #2
+declare i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* nocapture, i32, i32, i32, i1) #2
 
-declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64) #2
-declare i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* nocapture, i64) #2
-declare i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* nocapture, i64) #2
+declare i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #2
+declare i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* nocapture, i64, i32, i32, i1) #2
+declare i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* nocapture, i64, i32, i32, i1) #2
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
+; Make sure an invalid non-constant order operand (%order.var) does not cause a crash.
+; GCN-LABEL: {{^}}invalid_variable_order_lds_atomic_dec_ret_i32:
+define amdgpu_kernel void @invalid_variable_order_lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %order.var) #0 {
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 %order.var, i32 0, i1 false)
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; Make sure an invalid non-constant scope operand (%scope.var) does not cause a crash.
+; GCN-LABEL: {{^}}invalid_variable_scope_lds_atomic_dec_ret_i32:
+define amdgpu_kernel void @invalid_variable_scope_lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %scope.var) #0 {
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 %scope.var, i1 false)
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
+; Make sure an invalid non-constant volatile operand (%volatile.var) does not cause a crash.
+; GCN-LABEL: {{^}}invalid_variable_volatile_lds_atomic_dec_ret_i32:
+define amdgpu_kernel void @invalid_variable_volatile_lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i1 %volatile.var) #0 {
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 %volatile.var)
+  store i32 %result, i32 addrspace(1)* %out
+  ret void
+}
+
 ; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
-define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
-  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+define amdgpu_kernel void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
@@ -23,9 +47,9 @@ define void @lds_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 ; GCN-LABEL: {{^}}lds_atomic_dec_ret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: ds_dec_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
-define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
@@ -35,25 +59,25 @@ define void @lds_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 ; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; GCN: ds_dec_u32 [[VPTR]], [[DATA]]
-define void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
-  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+define amdgpu_kernel void @lds_atomic_dec_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_dec_noret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: ds_dec_u32 v{{[0-9]+}}, [[K]] offset:16
-define void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_dec_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
 
 ; GCN-LABEL: {{^}}global_atomic_dec_ret_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-define void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
-  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+define amdgpu_kernel void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
@@ -61,26 +85,26 @@ define void @global_atomic_dec_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; GCN-LABEL: {{^}}global_atomic_dec_ret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}}
-define void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
-  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_atomic_dec_noret_i32:
 ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind {
-  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+define amdgpu_kernel void @global_atomic_dec_noret_i32(i32 addrspace(1)* %ptr) nounwind {
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_atomic_dec_noret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: buffer_atomic_dec [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
-define void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
-  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -88,12 +112,12 @@ define void @global_atomic_dec_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}}
 ; VI: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id
   %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
-  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32 addrspace(1)* %out.gep
   ret void
 }
@@ -102,19 +126,19 @@ define void @global_atomic_dec_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; CI: buffer_atomic_dec [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
 ; VI: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
   %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
-  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
 
 ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @flat_atomic_dec_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
-  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
+define amdgpu_kernel void @flat_atomic_dec_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32 addrspace(4)* %out
   ret void
 }
@@ -122,38 +146,38 @@ define void @flat_atomic_dec_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %
 ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @flat_atomic_dec_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
-  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32 addrspace(4)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}flat_atomic_dec_noret_i32:
 ; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @flat_atomic_dec_noret_i32(i32 addrspace(4)* %ptr) nounwind {
-  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
+define amdgpu_kernel void @flat_atomic_dec_noret_i32(i32 addrspace(4)* %ptr) nounwind {
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %ptr, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
 
 ; FUNC-LABEL: {{^}}flat_atomic_dec_noret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @flat_atomic_dec_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind {
+define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
-  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
 
 ; GCN-LABEL: {{^}}flat_atomic_dec_ret_i32_offset_addr64:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_dec v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @flat_atomic_dec_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
   %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 %id
   %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5
-  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32 addrspace(4)* %out.gep
   ret void
 }
@@ -161,11 +185,11 @@ define void @flat_atomic_dec_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 a
 ; GCN-LABEL: {{^}}flat_atomic_dec_noret_i32_offset_addr64:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_dec v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @flat_atomic_dec_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
   %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5
-  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -173,8 +197,8 @@ define void @flat_atomic_dec_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @flat_atomic_dec_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
-  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
+define amdgpu_kernel void @flat_atomic_dec_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(4)* %out
   ret void
 }
@@ -183,9 +207,9 @@ define void @flat_atomic_dec_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @flat_atomic_dec_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
-  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(4)* %out
   ret void
 }
@@ -194,8 +218,8 @@ define void @flat_atomic_dec_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspac
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_dec_noret_i64(i64 addrspace(4)* %ptr) nounwind {
-  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
+define amdgpu_kernel void @flat_atomic_dec_noret_i64(i64 addrspace(4)* %ptr) nounwind {
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %ptr, i64 42, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -203,9 +227,9 @@ define void @flat_atomic_dec_noret_i64(i64 addrspace(4)* %ptr) nounwind {
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
+define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
-  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -213,12 +237,12 @@ define void @flat_atomic_dec_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
   %out.gep = getelementptr i64, i64 addrspace(4)* %out, i32 %id
   %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5
-  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(4)* %out.gep
   ret void
 }
@@ -227,11 +251,11 @@ define void @flat_atomic_dec_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 a
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
   %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5
-  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -240,11 +264,11 @@ define void @flat_atomic_dec_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0
 ; SI-LABEL: {{^}}atomic_dec_shl_base_lds_0:
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_dec_rtn_u32 {{v[0-9]+}}, [[PTR]] offset:8
-define void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
-  %val0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9)
+  %val0 = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false)
   store i32 %idx.0, i32 addrspace(1)* %add_use
   store i32 %val0, i32 addrspace(1)* %out
   ret void
@@ -254,8 +278,8 @@ define void @atomic_dec_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
-define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
-  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+define amdgpu_kernel void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
@@ -264,9 +288,9 @@ define void @lds_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32
-define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
@@ -275,8 +299,8 @@ define void @lds_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
-define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
-  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+define amdgpu_kernel void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -284,9 +308,9 @@ define void @lds_atomic_dec_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: ds_dec_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}}
-define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -294,8 +318,8 @@ define void @lds_atomic_dec_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-define void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
-  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+define amdgpu_kernel void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
@@ -304,9 +328,9 @@ define void @global_atomic_dec_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
-define void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
-  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
@@ -315,8 +339,8 @@ define void @global_atomic_dec_ret_i64_offset(i64 addrspace(1)* %out, i64 addrsp
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind {
-  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+define amdgpu_kernel void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind {
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -324,9 +348,9 @@ define void @global_atomic_dec_noret_i64(i64 addrspace(1)* %ptr) nounwind {
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
-define void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
-  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -335,12 +359,12 @@ define void @global_atomic_dec_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
 ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id
   %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
-  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(1)* %out.gep
   ret void
 }
@@ -350,11 +374,11 @@ define void @global_atomic_dec_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
 ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
-define void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
   %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
-  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -363,11 +387,11 @@ define void @global_atomic_dec_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #
 ; GCN-LABEL: {{^}}atomic_dec_shl_base_lds_0_i64:
 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
 ; GCN: ds_dec_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
-define void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_dec_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
-  %val0 = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9)
+  %val0 = call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false)
   store i32 %idx.0, i32 addrspace(1)* %add_use
   store i64 %val0, i64 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
index 22097418eec47c5bb35348b356e13e478a1eb95f..8334c0c357befb62765595131bc6b2a19c1b8d78 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll
@@ -1,21 +1,21 @@
 ; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
-declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32) #2
-declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32) #2
-declare i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* nocapture, i32) #2
+declare i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* nocapture, i32, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* nocapture, i32, i32, i32, i1) #2
 
-declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64) #2
-declare i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* nocapture, i64) #2
-declare i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* nocapture, i64) #2
+declare i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* nocapture, i64, i32, i32, i1) #2
+declare i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* nocapture, i64, i32, i32, i1) #2
+declare i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* nocapture, i64, i32, i32, i1) #2
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 ; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]]
-define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
-  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+define amdgpu_kernel void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
@@ -23,9 +23,9 @@ define void @lds_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 ; GCN-LABEL: {{^}}lds_atomic_inc_ret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: ds_inc_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[K]] offset:16
-define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) #0 {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
@@ -35,25 +35,25 @@ define void @lds_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 ; GCN: v_mov_b32_e32 [[DATA:v[0-9]+]], 4
 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; GCN: ds_inc_u32 [[VPTR]], [[DATA]]
-define void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
-  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42)
+define amdgpu_kernel void @lds_atomic_inc_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %ptr, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
 
 ; FUNC-LABEL: {{^}}lds_atomic_inc_noret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: ds_inc_u32 v{{[0-9]+}}, [[K]] offset:16
-define void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_inc_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
-  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %gep, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
 
 ; GCN-LABEL: {{^}}global_atomic_inc_ret_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-define void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
-  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+define amdgpu_kernel void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
@@ -61,26 +61,26 @@ define void @global_atomic_inc_ret_i32(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; GCN-LABEL: {{^}}global_atomic_inc_ret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16 glc{{$}}
-define void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
-  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_atomic_inc_noret_i32:
 ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind {
-  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42)
+define amdgpu_kernel void @global_atomic_inc_noret_i32(i32 addrspace(1)* %ptr) nounwind {
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %ptr, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_atomic_inc_noret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: buffer_atomic_inc [[K]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
-define void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 4
-  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -88,12 +88,12 @@ define void @global_atomic_inc_noret_i32_offset(i32 addrspace(1)* %ptr) nounwind
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20 glc{{$}}
 ; VI: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id
   %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
-  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32 addrspace(1)* %out.gep
   ret void
 }
@@ -102,11 +102,11 @@ define void @global_atomic_inc_ret_i32_offset_addr64(i32 addrspace(1)* %out, i32
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; CI: buffer_atomic_inc [[K]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:20{{$}}
 ; VI: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(1)* %ptr, i32 %id
   %gep = getelementptr i32, i32 addrspace(1)* %gep.tid, i32 5
-  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %gep, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -115,11 +115,11 @@ define void @global_atomic_inc_noret_i32_offset_addr64(i32 addrspace(1)* %ptr) #
 ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i32:
 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; GCN: ds_inc_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
-define void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds0, i32 0, i32 %idx.0
-  %val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9)
+  %val0 = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %arrayidx0, i32 9, i32 0, i32 0, i1 false)
   store i32 %idx.0, i32 addrspace(1)* %add_use
   store i32 %val0, i32 addrspace(1)* %out
   ret void
@@ -129,8 +129,8 @@ define void @atomic_inc_shl_base_lds_0_i32(i32 addrspace(1)* %out, i32 addrspace
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
-define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
-  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+define amdgpu_kernel void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
@@ -139,9 +139,9 @@ define void @lds_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32
-define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
+define amdgpu_kernel void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) #0 {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
@@ -150,8 +150,8 @@ define void @lds_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: ds_inc_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
-define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
-  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42)
+define amdgpu_kernel void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %ptr, i64 42, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -159,9 +159,9 @@ define void @lds_atomic_inc_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: ds_inc_u64 v{{[0-9]+}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} offset:32{{$}}
-define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
-  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %gep, i64 42, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -169,8 +169,8 @@ define void @lds_atomic_inc_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 glc{{$}}
-define void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
-  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+define amdgpu_kernel void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
@@ -179,9 +179,9 @@ define void @global_atomic_inc_ret_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32 glc{{$}}
-define void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
-  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
@@ -190,8 +190,8 @@ define void @global_atomic_inc_ret_i64_offset(i64 addrspace(1)* %out, i64 addrsp
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind {
-  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42)
+define amdgpu_kernel void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind {
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %ptr, i64 42, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -199,9 +199,9 @@ define void @global_atomic_inc_noret_i64(i64 addrspace(1)* %ptr) nounwind {
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:32{{$}}
-define void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
+define amdgpu_kernel void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(1)* %ptr, i32 4
-  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -210,12 +210,12 @@ define void @global_atomic_inc_noret_i64_offset(i64 addrspace(1)* %ptr) nounwind
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}}
 ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64 addrspace(1)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id
   %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
-  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(1)* %out.gep
   ret void
 }
@@ -225,19 +225,19 @@ define void @global_atomic_inc_ret_i64_offset_addr64(i64 addrspace(1)* %out, i64
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}}
 ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}}
-define void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @global_atomic_inc_noret_i64_offset_addr64(i64 addrspace(1)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(1)* %ptr, i32 %id
   %gep = getelementptr i64, i64 addrspace(1)* %gep.tid, i32 5
-  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %gep, i64 42, i32 0, i32 0, i1 false)
   ret void
 }
 
 ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @flat_atomic_inc_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
-  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
+define amdgpu_kernel void @flat_atomic_inc_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32 addrspace(4)* %out
   ret void
 }
@@ -245,38 +245,38 @@ define void @flat_atomic_inc_ret_i32(i32 addrspace(4)* %out, i32 addrspace(4)* %
 ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @flat_atomic_inc_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
-  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32 addrspace(4)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}flat_atomic_inc_noret_i32:
 ; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @flat_atomic_inc_noret_i32(i32 addrspace(4)* %ptr) nounwind {
-  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42)
+define amdgpu_kernel void @flat_atomic_inc_noret_i32(i32 addrspace(4)* %ptr) nounwind {
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %ptr, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
 
 ; FUNC-LABEL: {{^}}flat_atomic_inc_noret_i32_offset:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @flat_atomic_inc_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind {
+define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset(i32 addrspace(4)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(4)* %ptr, i32 4
-  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
 
 ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i32_offset_addr64:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_inc v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, [[K]] glc{{$}}
-define void @flat_atomic_inc_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
   %out.gep = getelementptr i32, i32 addrspace(4)* %out, i32 %id
   %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5
-  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false)
   store i32 %result, i32 addrspace(4)* %out.gep
   ret void
 }
@@ -284,11 +284,11 @@ define void @flat_atomic_inc_ret_i32_offset_addr64(i32 addrspace(4)* %out, i32 a
 ; GCN-LABEL: {{^}}flat_atomic_inc_noret_i32_offset_addr64:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 42
 ; GCN: flat_atomic_inc v{{\[[0-9]+:[0-9]+\]}}, [[K]]{{$}}
-define void @flat_atomic_inc_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i32, i32 addrspace(4)* %ptr, i32 %id
   %gep = getelementptr i32, i32 addrspace(4)* %gep.tid, i32 5
-  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42)
+  %result = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %gep, i32 42, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -297,31 +297,22 @@ define void @flat_atomic_inc_noret_i32_offset_addr64(i32 addrspace(4)* %ptr) #0
 ; GCN-LABEL: {{^}}atomic_inc_shl_base_lds_0_i64:
 ; GCN: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 3, {{v[0-9]+}}
 ; GCN: ds_inc_rtn_u64 v{{\[[0-9]+:[0-9]+\]}}, [[PTR]], v{{\[[0-9]+:[0-9]+\]}} offset:16
-define void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_inc_shl_base_lds_0_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i64], [512 x i64] addrspace(3)* @lds1, i32 0, i32 %idx.0
-  %val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9)
+  %val0 = call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %arrayidx0, i64 9, i32 0, i32 0, i1 false)
   store i32 %idx.0, i32 addrspace(1)* %add_use
   store i64 %val0, i64 addrspace(1)* %out
   ret void
 }
 
-attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind argmemonly }
-
-
-
-
-
-
 ; GCN-LABEL: {{^}}flat_atomic_inc_ret_i64:
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @flat_atomic_inc_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
-  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
+define amdgpu_kernel void @flat_atomic_inc_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(4)* %out
   ret void
 }
@@ -330,9 +321,9 @@ define void @flat_atomic_inc_ret_i64(i64 addrspace(4)* %out, i64 addrspace(4)* %
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @flat_atomic_inc_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
-  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(4)* %out
   ret void
 }
@@ -341,8 +332,8 @@ define void @flat_atomic_inc_ret_i64_offset(i64 addrspace(4)* %out, i64 addrspac
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_inc_noret_i64(i64 addrspace(4)* %ptr) nounwind {
-  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42)
+define amdgpu_kernel void @flat_atomic_inc_noret_i64(i64 addrspace(4)* %ptr) nounwind {
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %ptr, i64 42, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -350,9 +341,9 @@ define void @flat_atomic_inc_noret_i64(i64 addrspace(4)* %ptr) nounwind {
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
+define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(4)* %ptr, i32 4
-  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false)
   ret void
 }
 
@@ -360,12 +351,12 @@ define void @flat_atomic_inc_noret_i64_offset(i64 addrspace(4)* %ptr) nounwind {
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}}
-define void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
   %out.gep = getelementptr i64, i64 addrspace(4)* %out, i32 %id
   %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5
-  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false)
   store i64 %result, i64 addrspace(4)* %out.gep
   ret void
 }
@@ -374,10 +365,14 @@ define void @flat_atomic_inc_ret_i64_offset_addr64(i64 addrspace(4)* %out, i64 a
 ; GCN-DAG: v_mov_b32_e32 v[[KLO:[0-9]+]], 42
 ; GCN-DAG: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}}
 ; GCN: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]$}}
-define void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
+define amdgpu_kernel void @flat_atomic_inc_noret_i64_offset_addr64(i64 addrspace(4)* %ptr) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.tid = getelementptr i64, i64 addrspace(4)* %ptr, i32 %id
   %gep = getelementptr i64, i64 addrspace(4)* %gep.tid, i32 5
-  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42)
+  %result = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %gep, i64 42, i32 0, i32 0, i1 false)
   ret void
 }
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind argmemonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll
index 6d9db65e7d93a2a0c42d92fa9ca361c9db92f0e9..10bea8ea63b02a5c6e4888fddefa5d5aba3bbee6 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.ll
@@ -8,7 +8,7 @@ declare void @llvm.amdgcn.buffer.wbinvl1() #0
 ; SI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xc4,0xe1,0x00,0x00,0x00,0x00]
 ; VI-NEXT: buffer_wbinvl1 ; encoding: [0x00,0x00,0xf8,0xe0,0x00,0x00,0x00,0x00]
 ; GCN-NEXT: s_endpgm
-define void @test_buffer_wbinvl1() #0 {
+define amdgpu_kernel void @test_buffer_wbinvl1() #0 {
   call void @llvm.amdgcn.buffer.wbinvl1()
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
index 746298465e5801c33ccde4db63ab7fffb108c788..fe60d16d90f7aba7a8c61db900b801352fe81b57 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.sc.ll
@@ -6,7 +6,7 @@ declare void @llvm.amdgcn.buffer.wbinvl1.sc() #0
 ; SI-NEXT: ; BB#0:
 ; SI-NEXT: buffer_wbinvl1_sc ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00]
 ; SI-NEXT: s_endpgm
-define void @test_buffer_wbinvl1_sc() #0 {
+define amdgpu_kernel void @test_buffer_wbinvl1_sc() #0 {
   call void @llvm.amdgcn.buffer.wbinvl1.sc()
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
index 4e0f3c37f2146142eaccab2ba63d9d7d168b8953..061c1469ed4d88f0841a08f3dcbe8de378f268d7 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.buffer.wbinvl1.vol.ll
@@ -8,7 +8,7 @@ declare void @llvm.amdgcn.buffer.wbinvl1.vol() #0
 ; CI-NEXT: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00]
 ; VI-NEXT: buffer_wbinvl1_vol ; encoding: [0x00,0x00,0xfc,0xe0,0x00,0x00,0x00,0x00]
 ; GCN: s_endpgm
-define void @test_buffer_wbinvl1_vol() #0 {
+define amdgpu_kernel void @test_buffer_wbinvl1_vol() #0 {
   call void @llvm.amdgcn.buffer.wbinvl1.vol()
 ; This used to crash in hazard recognizer
   store i8 0, i8 addrspace(1)* undef, align 1
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
index 3798c46677f06a8ec0eec0fbc3820132b4cbb428..f08d4b6c791567e19e31245a5e8c376a187944c8 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.f16.ll
@@ -10,7 +10,7 @@ declare i1 @llvm.amdgcn.class.f16(half %a, i32 %b)
 ; GCN: v_cndmask_b32_e64 v[[R_I32:[0-9]+]]
 ; GCN: buffer_store_dword v[[R_I32]]
 ; GCN: s_endpgm
-define void @class_f16(
+define amdgpu_kernel void @class_f16(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a,
     i32 addrspace(1)* %b) {
@@ -31,7 +31,7 @@ entry:
 ; VI:  v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
 ; GCN: buffer_store_dword v[[VR_I32]]
 ; GCN: s_endpgm
-define void @class_f16_fabs(
+define amdgpu_kernel void @class_f16_fabs(
   i32 addrspace(1)* %r,
   half %a.val,
   i32 %b.val) {
@@ -51,7 +51,7 @@ entry:
 ; VI:  v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
 ; GCN: buffer_store_dword v[[VR_I32]]
 ; GCN: s_endpgm
-define void @class_f16_fneg(
+define amdgpu_kernel void @class_f16_fneg(
   i32 addrspace(1)* %r,
   half %a.val,
   i32 %b.val) {
@@ -71,7 +71,7 @@ entry:
 ; VI:  v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
 ; GCN: buffer_store_dword v[[VR_I32]]
 ; GCN: s_endpgm
-define void @class_f16_fabs_fneg(
+define amdgpu_kernel void @class_f16_fabs_fneg(
   i32 addrspace(1)* %r,
   half %a.val,
   i32 %b.val) {
@@ -91,7 +91,7 @@ entry:
 ; VI:  v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
 ; GCN: buffer_store_dword v[[VR_I32]]
 ; GCN: s_endpgm
-define void @class_f16_1(
+define amdgpu_kernel void @class_f16_1(
   i32 addrspace(1)* %r,
   half %a.val) {
 entry:
@@ -108,7 +108,7 @@ entry:
 ; VI:  v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, [[CMP]]
 ; GCN: buffer_store_dword v[[VR_I32]]
 ; GCN: s_endpgm
-define void @class_f16_64(
+define amdgpu_kernel void @class_f16_64(
   i32 addrspace(1)* %r,
   half %a.val) {
 entry:
@@ -126,7 +126,7 @@ entry:
 ; VI:  v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc
 ; GCN: buffer_store_dword v[[VR_I32]]
 ; GCN: s_endpgm
-define void @class_f16_full_mask(
+define amdgpu_kernel void @class_f16_full_mask(
   i32 addrspace(1)* %r,
   half %a.val) {
 entry:
@@ -144,7 +144,7 @@ entry:
 ; VI:  v_cndmask_b32_e64 v[[VR_I32:[0-9]+]], 0, -1, vcc
 ; GCN: buffer_store_dword v[[VR_I32]]
 ; GCN: s_endpgm
-define void @class_f16_nine_bit_mask(
+define amdgpu_kernel void @class_f16_nine_bit_mask(
   i32 addrspace(1)* %r,
   half %a.val) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
index 668c669e41e89dc836d3490243ac353bcde4c49b..1fcdac537fba6ff3a6b0f9932ab992bfb7fb0003 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll
@@ -14,7 +14,7 @@ declare double @llvm.fabs.f64(double) #1
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
   %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -29,7 +29,7 @@ define void @test_class_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
   %a.fabs = call float @llvm.fabs.f32(float %a) #1
   %result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1
   %sext = sext i1 %result to i32
@@ -45,7 +45,7 @@ define void @test_class_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
   %a.fneg = fsub float -0.0, %a
   %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1
   %sext = sext i1 %result to i32
@@ -61,7 +61,7 @@ define void @test_class_fneg_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
   %a.fabs = call float @llvm.fabs.f32(float %a) #1
   %a.fneg.fabs = fsub float -0.0, %a.fabs
   %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1
@@ -76,7 +76,7 @@ define void @test_class_fneg_fabs_f32(i32 addrspace(1)* %out, float %a, i32 %b)
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 {
   %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -89,7 +89,7 @@ define void @test_class_1_f32(i32 addrspace(1)* %out, float %a) #0 {
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[COND]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 {
   %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -104,7 +104,7 @@ define void @test_class_64_f32(i32 addrspace(1)* %out, float %a) #0 {
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
   %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -118,7 +118,7 @@ define void @test_class_full_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
   %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -132,7 +132,7 @@ define void @test_class_9bit_mask_f32(i32 addrspace(1)* %out, float %a) #0 {
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -150,7 +150,7 @@ define void @v_test_class_full_mask_f32(i32 addrspace(1)* %out, float addrspace(
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -170,7 +170,7 @@ define void @test_class_inline_imm_constant_dynamic_mask_f32(i32 addrspace(1)* %
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -190,7 +190,7 @@ define void @test_class_lit_constant_dynamic_mask_f32(i32 addrspace(1)* %out, i3
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
   %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -205,7 +205,7 @@ define void @test_class_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
   %a.fabs = call double @llvm.fabs.f64(double %a) #1
   %result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1
   %sext = sext i1 %result to i32
@@ -221,7 +221,7 @@ define void @test_class_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
   %a.fneg = fsub double -0.0, %a
   %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1
   %sext = sext i1 %result to i32
@@ -237,7 +237,7 @@ define void @test_class_fneg_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP]]
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b) #0 {
   %a.fabs = call double @llvm.fabs.f64(double %a) #1
   %a.fneg.fabs = fsub double -0.0, %a.fabs
   %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1
@@ -249,7 +249,7 @@ define void @test_class_fneg_fabs_f64(i32 addrspace(1)* %out, double %a, i32 %b)
 ; SI-LABEL: {{^}}test_class_1_f64:
 ; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}}
 ; SI: s_endpgm
-define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 {
+define amdgpu_kernel void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 {
   %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 1) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -259,7 +259,7 @@ define void @test_class_1_f64(i32 addrspace(1)* %out, double %a) #0 {
 ; SI-LABEL: {{^}}test_class_64_f64:
 ; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}}
 ; SI: s_endpgm
-define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 {
+define amdgpu_kernel void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 {
   %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 64) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -275,7 +275,7 @@ define void @test_class_64_f64(i32 addrspace(1)* %out, double %a) #0 {
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI-NEXT: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
+define amdgpu_kernel void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
   %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -290,7 +290,7 @@ define void @test_class_full_mask_f64(i32 addrspace(1)* %out, double %a) #0 {
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, vcc
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -306,7 +306,7 @@ define void @v_test_class_full_mask_f64(i32 addrspace(1)* %out, double addrspace
 ; XSI: v_cmp_class_f64_e32 vcc, 1.0,
 ; SI: v_cmp_class_f64_e32 vcc,
 ; SI: s_endpgm
-define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -321,7 +321,7 @@ define void @test_class_inline_imm_constant_dynamic_mask_f64(i32 addrspace(1)* %
 ; SI-LABEL: {{^}}test_class_lit_constant_dynamic_mask_f64:
 ; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}
 ; SI: s_endpgm
-define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -338,7 +338,7 @@ define void @test_class_lit_constant_dynamic_mask_f64(i32 addrspace(1)* %out, i3
 ; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 3{{$}}
 ; SI-NOT: v_cmp_class
 ; SI: s_endpgm
-define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -358,7 +358,7 @@ define void @test_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)
 ; SI: v_cmp_class_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
 ; SI-NOT: v_cmp_class
 ; SI: s_endpgm
-define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -381,7 +381,7 @@ define void @test_fold_or3_class_f32_0(i32 addrspace(1)* %out, float addrspace(1
 ; SI: v_cmp_class_f32_e32 vcc, v{{[0-9]+}}, [[MASK]]{{$}}
 ; SI-NOT: v_cmp_class
 ; SI: s_endpgm
-define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -416,7 +416,7 @@ define void @test_fold_or_all_tests_class_f32_0(i32 addrspace(1)* %out, float ad
 ; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 12{{$}}
 ; SI-NOT: v_cmp_class
 ; SI: s_endpgm
-define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -436,7 +436,7 @@ define void @test_fold_or_class_f32_1(i32 addrspace(1)* %out, float addrspace(1)
 ; SI: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, 7{{$}}
 ; SI-NOT: v_cmp_class
 ; SI: s_endpgm
-define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -456,7 +456,7 @@ define void @test_fold_or_class_f32_2(i32 addrspace(1)* %out, float addrspace(1)
 ; SI-DAG: v_cmp_class_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 8{{$}}
 ; SI: s_or_b64
 ; SI: s_endpgm
-define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
+define amdgpu_kernel void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace(1)* %in, float %b) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep.in = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -476,7 +476,7 @@ define void @test_no_fold_or_class_f32_0(i32 addrspace(1)* %out, float addrspace
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
   %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 0) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -488,7 +488,7 @@ define void @test_class_0_f32(i32 addrspace(1)* %out, float %a) #0 {
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
+define amdgpu_kernel void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
   %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 0) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -500,7 +500,7 @@ define void @test_class_0_f64(i32 addrspace(1)* %out, double %a) #0 {
 ; SI-NOT: v_cmp_class
 ; SI: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1,
 ; SI: buffer_store_dword
-define void @test_class_undef_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
+define amdgpu_kernel void @test_class_undef_f32(i32 addrspace(1)* %out, float %a, i32 %b) #0 {
   %result = call i1 @llvm.amdgcn.class.f32(float undef, i32 %b) #1
   %sext = sext i1 %result to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
index 410ac59279a5aa5854ea1f64e8a332a785374de2..054388607293adefb418c269fc8f48d0edc321b8 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cos.f16.ll
@@ -7,7 +7,7 @@ declare half @llvm.amdgcn.cos.f16(half %a)
 ; VI:  v_cos_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @cos_f16(
+define amdgpu_kernel void @cos_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
index f6495d8155f7f33276ce05abeebed2efc3220ad1..5b9c83c11cf491e6465534b0db590ebdd3d27a03 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cos.ll
@@ -5,7 +5,7 @@ declare float @llvm.amdgcn.cos.f32(float) #0
 
 ; GCN-LABEL: {{^}}v_cos_f32:
 ; GCN: v_cos_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @v_cos_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @v_cos_f32(float addrspace(1)* %out, float %src) #1 {
   %cos = call float @llvm.amdgcn.cos.f32(float %src) #0
   store float %cos, float addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
index 22bed45ee30f99c63fea95acef46ab9d156c94e6..dadb070bdcf8c0eb19c6c41753ab486219b9a7bb 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubeid.ll
@@ -5,7 +5,7 @@ declare float @llvm.amdgcn.cubeid(float, float, float) #0
 
 ; GCN-LABEL: {{^}}test_cubeid:
 ; GCN: v_cubeid_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @test_cubeid(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubeid(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
   %result = call float @llvm.amdgcn.cubeid(float %a, float %b, float %c)
   store float %result, float addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
index 565f22c5d5b62aa8d69e432966fc2649a941b5a5..60c4618a011b37f51a370214c3bf1971f9fdaee7 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubema.ll
@@ -5,7 +5,7 @@ declare float @llvm.amdgcn.cubema(float, float, float) #0
 
 ; GCN-LABEL: {{^}}test_cubema:
 ; GCN: v_cubema_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @test_cubema(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubema(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
   %result = call float @llvm.amdgcn.cubema(float %a, float %b, float %c)
   store float %result, float addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
index a3ba3274581429bbc0e6f9d71810d2d8eb5abf11..10669cf991380e31d0180c93b00cba37ed0b9611 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubesc.ll
@@ -5,7 +5,7 @@ declare float @llvm.amdgcn.cubesc(float, float, float) #0
 
 ; GCN-LABEL: {{^}}test_cubesc:
 ; GCN: v_cubesc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @test_cubesc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubesc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
   %result = call float @llvm.amdgcn.cubesc(float %a, float %b, float %c)
   store float %result, float addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
index d3c0f2851ead3c29ab07ef8a4d6f4b5ca9a9eeb1..b2770308c17022236a2fc599f775e016f636ac19 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cubetc.ll
@@ -5,7 +5,7 @@ declare float @llvm.amdgcn.cubetc(float, float, float) #0
 
 ; GCN-LABEL: {{^}}test_cubetc:
 ; GCN: v_cubetc_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @test_cubetc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
+define amdgpu_kernel void @test_cubetc(float addrspace(1)* %out, float %a, float %b, float %c) #1 {
   %result = call float @llvm.amdgcn.cubetc(float %a, float %b, float %c)
   store float %result, float addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b92eb34750d9a258e0a0270089b6fbd449168130
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll
@@ -0,0 +1,166 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
+
+; GCN-LABEL: {{^}}s_cvt_pkrtz_v2f16_f32:
+; GCN-DAG: s_load_dword [[X:s[0-9]+]], s[0:1], 0x{{b|2c}}
+; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}}
+; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]]
+; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]]
+; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[VY]]
+define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 {
+  %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_cvt_pkrtz_samereg_v2f16_f32:
+; GCN: s_load_dword [[X:s[0-9]+]]
+; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[X]]
+define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 {
+  %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Folds to 0 on gfx9
+; GCN-LABEL: {{^}}s_cvt_pkrtz_undef_undef:
+; GCN-NEXT: ; BB#0
+; SI-NEXT: s_endpgm
+; VI-NEXT: s_endpgm
+; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out) #0 {
+  %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[A]], [[B]]
+; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], [[B]]
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
+  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_reg_imm:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], 1.0
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float 1.0)
+  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_imm_reg:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, 1.0, [[A]]
+; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, 1.0, [[A]]
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 1.0, float %a)
+  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], [[B]]
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %neg.a = fsub float -0.0, %a
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %b)
+  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_hi:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], -[[B]]
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %neg.b = fsub float -0.0, %b
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %neg.b)
+  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo_hi:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], -[[B]]
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %neg.a = fsub float -0.0, %a
+  %neg.b = fsub float -0.0, %b
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.a, float %neg.b)
+  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]]
+; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -|[[A]]|, -[[B]]
+define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext
+  %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x half>, <2 x half> addrspace(1)* %out, i64 %tid.ext
+  %a = load volatile float, float addrspace(1)* %a.gep
+  %b = load volatile float, float addrspace(1)* %b.gep
+  %fabs.a = call float @llvm.fabs.f32(float %a)
+  %neg.fabs.a = fsub float -0.0, %fabs.a
+  %neg.b = fsub float -0.0, %b
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %neg.fabs.a, float %neg.b)
+  store <2 x half> %cvt, <2 x half> addrspace(1)* %out.gep
+  ret void
+}
+
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
+declare float @llvm.fabs.f32(float) #1
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll
index 6c09aa5924473103d3014d5116b958c81664ff68..58250de2f891d668599f38d8a86011e01be551d6 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.id.ll
@@ -9,7 +9,7 @@ declare i64 @llvm.amdgcn.dispatch.id() #1
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], s6
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], s7
 ; GCN: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @dispatch_id(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @dispatch_id(i64 addrspace(1)* %out) #0 {
   %tmp0 = call i64 @llvm.amdgcn.dispatch.id()
   store i64 %tmp0, i64 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
index 2e8625256f137ceebf95633c518140478ce198b2..92208e7fe17c94d95080223bb0eeaeafd7df125b 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.dispatch.ptr.ll
@@ -6,7 +6,7 @@
 ; GCN-LABEL: {{^}}test:
 ; GCN: enable_sgpr_dispatch_ptr = 1
 ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
   %dispatch_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
   %header_ptr = bitcast i8 addrspace(2)* %dispatch_ptr to i32 addrspace(2)*
   %value = load i32, i32 addrspace(2)* %header_ptr
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
index 6d262cf497ac7455cfd00368cd403d485ebded20..e04d9e662cea315d43a70b70f98f5fe1bed07d27 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.f16.ll
@@ -9,7 +9,7 @@ declare half @llvm.amdgcn.div.fixup.f16(half %a, half %b, half %c)
 ; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @div_fixup_f16(
+define amdgpu_kernel void @div_fixup_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -30,7 +30,7 @@ entry:
 ; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @div_fixup_f16_imm_a(
+define amdgpu_kernel void @div_fixup_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b,
     half addrspace(1)* %c) {
@@ -49,7 +49,7 @@ entry:
 ; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @div_fixup_f16_imm_b(
+define amdgpu_kernel void @div_fixup_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %c) {
@@ -68,7 +68,7 @@ entry:
 ; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @div_fixup_f16_imm_c(
+define amdgpu_kernel void @div_fixup_f16_imm_c(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -86,7 +86,7 @@ entry:
 ; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AB_F16]], v[[AB_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @div_fixup_f16_imm_a_imm_b(
+define amdgpu_kernel void @div_fixup_f16_imm_a_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %c) {
 entry:
@@ -102,7 +102,7 @@ entry:
 ; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[BC_F16]], v[[BC_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @div_fixup_f16_imm_b_imm_c(
+define amdgpu_kernel void @div_fixup_f16_imm_b_imm_c(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -118,7 +118,7 @@ entry:
 ; VI:  v_div_fixup_f16 v[[R_F16:[0-9]+]], v[[AC_F16]], v[[B_F16]], v[[AC_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @div_fixup_f16_imm_a_imm_c(
+define amdgpu_kernel void @div_fixup_f16_imm_a_imm_c(
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
index cc1504f2bc8dfce63f408e66052c498ca6f2fa5e..b8fcacf46bba0ec9ac54d6e720c880ffb32d4e60 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fixup.ll
@@ -16,7 +16,7 @@ declare double @llvm.amdgcn.div.fixup.f64(double, double, double) nounwind readn
 ; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], [[SA]], [[VB]], [[VC]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+define amdgpu_kernel void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
   %result = call float @llvm.amdgcn.div.fixup.f32(float %a, float %b, float %c) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -24,7 +24,7 @@ define void @test_div_fixup_f32(float addrspace(1)* %out, float %a, float %b, fl
 
 ; GCN-LABEL: {{^}}test_div_fixup_f64:
 ; GCN: v_div_fixup_f64
-define void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
+define amdgpu_kernel void @test_div_fixup_f64(double addrspace(1)* %out, double %a, double %b, double %c) nounwind {
   %result = call double @llvm.amdgcn.div.fixup.f64(double %a, double %b, double %c) nounwind readnone
   store double %result, double addrspace(1)* %out, align 8
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
index d408fe9f87f684040bb11de5e783158004b813d3..a86468b07a272f801e6ccc3ef9a6320f47a95647 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll
@@ -20,7 +20,7 @@ declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind re
 ; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VB]], [[VA]], [[VC]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %d) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -34,7 +34,7 @@ define void @test_div_fmas_f32(float addrspace(1)* %out, float %a, float %b, flo
 ; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -48,7 +48,7 @@ define void @test_div_fmas_f32_inline_imm_0(float addrspace(1)* %out, float %a,
 ; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -62,7 +62,7 @@ define void @test_div_fmas_f32_inline_imm_1(float addrspace(1)* %out, float %a,
 ; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a, float %b, float %c, i1 %d) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -70,7 +70,7 @@ define void @test_div_fmas_f32_inline_imm_2(float addrspace(1)* %out, float %a,
 
 ; GCN-LABEL: {{^}}test_div_fmas_f64:
 ; GCN: v_div_fmas_f64
-define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b, double %c, i1 %d) nounwind {
   %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone
   store double %result, double addrspace(1)* %out, align 8
   ret void
@@ -79,7 +79,7 @@ define void @test_div_fmas_f64(double addrspace(1)* %out, double %a, double %b,
 ; GCN-LABEL: {{^}}test_div_fmas_f32_cond_to_vcc:
 ; SI: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0{{$}}
 ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c, i32 %i) nounwind {
   %cmp = icmp eq i32 %i, 0
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
@@ -89,7 +89,7 @@ define void @test_div_fmas_f32_cond_to_vcc(float addrspace(1)* %out, float %a, f
 ; GCN-LABEL: {{^}}test_div_fmas_f32_imm_false_cond_to_vcc:
 ; SI: s_mov_b64 vcc, 0
 ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -98,7 +98,7 @@ define void @test_div_fmas_f32_imm_false_cond_to_vcc(float addrspace(1)* %out, f
 ; GCN-LABEL: {{^}}test_div_fmas_f32_imm_true_cond_to_vcc:
 ; SI: s_mov_b64 vcc, -1
 ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
-define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, float %a, float %b, float %c) nounwind {
   %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -114,7 +114,7 @@ define void @test_div_fmas_f32_imm_true_cond_to_vcc(float addrspace(1)* %out, fl
 ; SI: s_and_b64 vcc, [[CMP0]], [[CMP1]]
 ; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]]
 ; SI: s_endpgm
-define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 %d) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.a = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.b = getelementptr float, float addrspace(1)* %gep.a, i32 1
@@ -150,7 +150,7 @@ define void @test_div_fmas_f32_logical_cond_to_vcc(float addrspace(1)* %out, flo
 ; SI: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
+define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(float addrspace(1)* %out, float addrspace(1)* %in, i32 addrspace(1)* %dummy) nounwind {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.out = getelementptr float, float addrspace(1)* %out, i32 2
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
index 8e5c62c31db54cba3bf3da6ece725f0f369eaacf..0b4f09ac65178fb5cb6ebbf261b94049e7dcaa73 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.div.scale.ll
@@ -11,7 +11,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -31,7 +31,7 @@ define void @test_div_scale_f32_1(float addrspace(1)* %out, float addrspace(1)*
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -51,7 +51,7 @@ define void @test_div_scale_f32_2(float addrspace(1)* %out, float addrspace(1)*
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -71,7 +71,7 @@ define void @test_div_scale_f64_1(double addrspace(1)* %out, double addrspace(1)
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)* %aptr, double addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr double, double addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr double, double addrspace(1)* %gep.0, i32 1
@@ -91,7 +91,7 @@ define void @test_div_scale_f64_2(double addrspace(1)* %out, double addrspace(1)
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
 
@@ -109,7 +109,7 @@ define void @test_div_scale_f32_scalar_num_1(float addrspace(1)* %out, float add
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float addrspace(1)* %in, float %a) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
 
@@ -127,7 +127,7 @@ define void @test_div_scale_f32_scalar_num_2(float addrspace(1)* %out, float add
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
 
@@ -145,7 +145,7 @@ define void @test_div_scale_f32_scalar_den_1(float addrspace(1)* %out, float add
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float addrspace(1)* %in, float %b) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr float, float addrspace(1)* %in, i32 %tid
 
@@ -163,7 +163,7 @@ define void @test_div_scale_f32_scalar_den_2(float addrspace(1)* %out, float add
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
 
@@ -181,7 +181,7 @@ define void @test_div_scale_f64_scalar_num_1(double addrspace(1)* %out, double a
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double addrspace(1)* %in, double %a) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
 
@@ -199,7 +199,7 @@ define void @test_div_scale_f64_scalar_num_2(double addrspace(1)* %out, double a
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[A]]
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
 
@@ -217,7 +217,7 @@ define void @test_div_scale_f64_scalar_den_1(double addrspace(1)* %out, double a
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[B]], [[A]]
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double addrspace(1)* %in, double %b) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
 
@@ -236,7 +236,7 @@ define void @test_div_scale_f64_scalar_den_2(double addrspace(1)* %out, double a
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[VA]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a, float %b) nounwind {
   %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 false) nounwind readnone
   %result0 = extractvalue { float, i1 } %result, 0
   store float %result0, float addrspace(1)* %out, align 4
@@ -250,7 +250,7 @@ define void @test_div_scale_f32_all_scalar_1(float addrspace(1)* %out, float %a,
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[VB]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a, float %b) nounwind {
   %result = call { float, i1 } @llvm.amdgcn.div.scale.f32(float %a, float %b, i1 true) nounwind readnone
   %result0 = extractvalue { float, i1 } %result, 0
   store float %result0, float addrspace(1)* %out, align 4
@@ -265,7 +265,7 @@ define void @test_div_scale_f32_all_scalar_2(float addrspace(1)* %out, float %a,
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], v{{\[}}[[VA_LO]]:[[VA_HI]]{{\]}}
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %a, double %b) nounwind {
   %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 false) nounwind readnone
   %result0 = extractvalue { double, i1 } %result, 0
   store double %result0, double addrspace(1)* %out, align 8
@@ -280,7 +280,7 @@ define void @test_div_scale_f64_all_scalar_1(double addrspace(1)* %out, double %
 ; SI: v_div_scale_f64 [[RESULT0:v\[[0-9]+:[0-9]+\]]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], v{{\[}}[[VB_LO]]:[[VB_HI]]{{\]}}, [[A]]
 ; SI: buffer_store_dwordx2 [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind {
+define amdgpu_kernel void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %a, double %b) nounwind {
   %result = call { double, i1 } @llvm.amdgcn.div.scale.f64(double %a, double %b, i1 true) nounwind readnone
   %result0 = extractvalue { double, i1 } %result, 0
   store double %result0, double addrspace(1)* %out, align 8
@@ -292,7 +292,7 @@ define void @test_div_scale_f64_all_scalar_2(double addrspace(1)* %out, double %
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[A]], [[A]], 1.0
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %a = load float, float addrspace(1)* %gep.0, align 4
@@ -308,7 +308,7 @@ define void @test_div_scale_f32_inline_imm_num(float addrspace(1)* %out, float a
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], 2.0, 2.0, [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %a = load float, float addrspace(1)* %gep.0, align 4
@@ -326,7 +326,7 @@ define void @test_div_scale_f32_inline_imm_den(float addrspace(1)* %out, float a
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[B]], [[B]], [[ABS_A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -349,7 +349,7 @@ define void @test_div_scale_f32_fabs_num(float addrspace(1)* %out, float addrspa
 ; SI: v_div_scale_f32 [[RESULT0:v[0-9]+]], [[RESULT1:s\[[0-9]+:[0-9]+\]]], [[ABS_B]], [[ABS_B]], [[A]]
 ; SI: buffer_store_dword [[RESULT0]]
 ; SI: s_endpgm
-define void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @test_div_scale_f32_fabs_den(float addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
index 92d3fc8b107eb8dc4f3f2b2cad223f5fc6d47bea..08f286a7f510b925c75aa682dedd727586d9f17f 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.bpermute.ll
@@ -4,8 +4,7 @@ declare i32 @llvm.amdgcn.ds.bpermute(i32, i32) #0
 
 ; FUNC-LABEL: {{^}}ds_bpermute:
 ; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CHECK: s_waitcnt lgkmcnt
-define void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
   %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0
   store i32 %bpermute, i32 addrspace(1)* %out, align 4
   ret void
@@ -13,8 +12,7 @@ define void @ds_bpermute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind
 
 ; CHECK-LABEL: {{^}}ds_bpermute_imm_offset:
 ; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
-; CHECK: s_waitcnt lgkmcnt
-define void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
   %index = add i32 %base_index, 4
   %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 %index, i32 %src) #0
   store i32 %bpermute, i32 addrspace(1)* %out, align 4
@@ -23,8 +21,7 @@ define void @ds_bpermute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32
 
 ; CHECK-LABEL: {{^}}ds_bpermute_imm_index:
 ; CHECK: ds_bpermute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:64
-; CHECK: s_waitcnt lgkmcnt
-define void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_bpermute_imm_index(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
   %bpermute = call i32 @llvm.amdgcn.ds.bpermute(i32 64, i32 %src) #0
   store i32 %bpermute, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
index 6d9c94191535451d28b3751dec7da52e273d1d3e..63618c3aed7759e24866ea3adc96a7c29b31c387 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.permute.ll
@@ -4,8 +4,7 @@ declare i32 @llvm.amdgcn.ds.permute(i32, i32) #0
 
 ; CHECK-LABEL: {{^}}ds_permute:
 ; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; CHECK: s_waitcnt lgkmcnt
-define void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
   %bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0
   store i32 %bpermute, i32 addrspace(1)* %out, align 4
   ret void
@@ -13,8 +12,7 @@ define void @ds_permute(i32 addrspace(1)* %out, i32 %index, i32 %src) nounwind {
 
 ; CHECK-LABEL: {{^}}ds_permute_imm_offset:
 ; CHECK: ds_permute_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:4
-; CHECK: s_waitcnt lgkmcnt
-define void @ds_permute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
+define amdgpu_kernel void @ds_permute_imm_offset(i32 addrspace(1)* %out, i32 %base_index, i32 %src) nounwind {
   %index = add i32 %base_index, 4
   %bpermute = call i32 @llvm.amdgcn.ds.permute(i32 %index, i32 %src) #0
   store i32 %bpermute, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
index ef3cb00024bb0a262f4cd3274627911b63669a9e..a3a78d326a628a5790d50cb83608e2f8b0e62713 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ds.swizzle.ll
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #0
 ; FUNC-LABEL: {{^}}ds_swizzle:
 ; CHECK: ds_swizzle_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:100
 ; CHECK: s_waitcnt lgkmcnt
-define void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind {
+define amdgpu_kernel void @ds_swizzle(i32 addrspace(1)* %out, i32 %src) nounwind {
   %swizzle = call i32 @llvm.amdgcn.ds.swizzle(i32 %src, i32 100) #0
   store i32 %swizzle, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll
index 0b5b20be334a86ac1907b9e14fb75ef5c49afc1c..b972ddb8cb7704e1476077cbbb32b2baabdddcda 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.exp.compr.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefix=GCN %s
 
 declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
 declare void @llvm.amdgcn.exp.compr.v2i16(i32, i32, <2 x i16>, <2 x i16>, i1, i1) #0
@@ -7,7 +8,7 @@ declare void @llvm.amdgcn.exp.compr.v2i16(i32, i32, <2 x i16>, <2 x i16>, i1, i1
 ; GCN-LABEL: {{^}}test_export_compr_zeroes_v2f16:
 ; GCN: exp mrt0 off, off, off, off compr{{$}}
 ; GCN: exp mrt0 off, off, off, off done compr{{$}}
-define void @test_export_compr_zeroes_v2f16() #0 {
+define amdgpu_kernel void @test_export_compr_zeroes_v2f16() #0 {
   call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> zeroinitializer, <2 x half> zeroinitializer, i1 false, i1 false)
   call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> zeroinitializer, <2 x half> zeroinitializer, i1 true, i1 false)
   ret void
@@ -16,27 +17,45 @@ define void @test_export_compr_zeroes_v2f16() #0 {
 ; GCN-LABEL: {{^}}test_export_compr_en_src0_v2f16:
 ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
-; GCN: exp mrt0 [[SRC0]], off, off, off done compr{{$}}
-define void @test_export_compr_en_src0_v2f16() #0 {
-  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 1, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], off, off done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_src0_v2f16() #0 {
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_export_compr_en_src1_v2f16:
 ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
-; GCN: exp mrt0 off, [[SRC1]], off, off done compr{{$}}
-define void @test_export_compr_en_src1_v2f16() #0 {
-  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+; GCN: exp mrt0 off, off, [[SRC1]], [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_src1_v2f16() #0 {
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 12, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_export_compr_en_src0_src1_v2f16:
 ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done compr{{$}}
-define void @test_export_compr_en_src0_src1_v2f16() #0 {
-  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_src0_src1_v2f16() #0 {
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_en_invalid2_v2f16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
+; GCN: exp mrt0 off, [[SRC0]], off, off done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_invalid2_v2f16() #0 {
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_export_compr_en_invalid10_v2f16:
+; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
+; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
+; GCN: exp mrt0 off, [[SRC0]], off, [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_invalid10_v2f16() #0 {
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 10, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
   ret void
 }
 
@@ -44,7 +63,7 @@ define void @test_export_compr_en_src0_src1_v2f16() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0x38003800
 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] compr{{$}}
 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done compr{{$}}
-define void @test_export_compr_mrt7_v2f16() #0 {
+define amdgpu_kernel void @test_export_compr_mrt7_v2f16() #0 {
   call void @llvm.amdgcn.exp.compr.v2f16(i32 7, i32 15, <2 x half> <half 0.5, half 0.5>, <2 x half> <half 0.5, half 0.5>, i1 false, i1 false)
   call void @llvm.amdgcn.exp.compr.v2f16(i32 7, i32 15, <2 x half> <half 0.5, half 0.5>, <2 x half> <half 0.5, half 0.5>, i1 true, i1 false)
   ret void
@@ -53,29 +72,29 @@ define void @test_export_compr_mrt7_v2f16() #0 {
 ; GCN-LABEL: {{^}}test_export_compr_z_v2f16:
 ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
-; GCN: exp mrtz [[SRC0]], [[SRC1]], off, off compr{{$}}
-; GCN: exp mrtz [[SRC0]], [[SRC1]], off, off done compr{{$}}
-define void @test_export_compr_z_v2f16() #0 {
-  call void @llvm.amdgcn.exp.compr.v2f16(i32 8, i32 3, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 false, i1 false)
-  call void @llvm.amdgcn.exp.compr.v2f16(i32 8, i32 3, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr{{$}}
+; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_z_v2f16() #0 {
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 8, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 false, i1 false)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 8, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_export_compr_vm_v2f16:
 ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x40003c00
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x44003800
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off compr vm{{$}}
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done compr vm{{$}}
-define void @test_export_compr_vm_v2f16() #0 {
-  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 false, i1 true)
-  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 true)
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr vm{{$}}
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr vm{{$}}
+define amdgpu_kernel void @test_export_compr_vm_v2f16() #0 {
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 false, i1 true)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 true)
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_export_compr_zeroes_v2i16:
 ; GCN: exp mrt0 off, off, off, off compr{{$}}
 ; GCN: exp mrt0 off, off, off, off done compr{{$}}
-define void @test_export_compr_zeroes_v2i16() #0 {
+define amdgpu_kernel void @test_export_compr_zeroes_v2i16() #0 {
   call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 0, <2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i1 false, i1 false)
   call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 0, <2 x i16> zeroinitializer, <2 x i16> zeroinitializer, i1 true, i1 false)
   ret void
@@ -85,7 +104,7 @@ define void @test_export_compr_zeroes_v2i16() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
 ; GCN: exp mrt0 [[SRC0]], off, off, off done compr{{$}}
-define void @test_export_compr_en_src0_v2i16() #0 {
+define amdgpu_kernel void @test_export_compr_en_src0_v2i16() #0 {
   call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 1, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false)
   ret void
 }
@@ -93,18 +112,18 @@ define void @test_export_compr_en_src0_v2i16() #0 {
 ; GCN-LABEL: {{^}}test_export_compr_en_src1_v2i16:
 ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
-; GCN: exp mrt0 off, [[SRC1]], off, off done compr{{$}}
-define void @test_export_compr_en_src1_v2i16() #0 {
-  call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 2, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false)
+; GCN: exp mrt0 off, off, [[SRC1]], [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_src1_v2i16() #0 {
+  call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 12, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false)
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_export_compr_en_src0_src1_v2i16:
 ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done compr{{$}}
-define void @test_export_compr_en_src0_src1_v2i16() #0 {
-  call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 3, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false)
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_en_src0_src1_v2i16() #0 {
+  call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false)
   ret void
 }
 
@@ -112,7 +131,7 @@ define void @test_export_compr_en_src0_src1_v2i16() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[VI16:v[0-9]+]], 0x50005
 ; GCN: exp mrt7 [[VI16]], [[VI16]], [[VI16]], [[VI16]] compr{{$}}
 ; GCN: exp mrt7 [[VI16]], [[VI16]], [[VI16]], [[VI16]] done compr{{$}}
-define void @test_export_compr_mrt7_v2i16() #0 {
+define amdgpu_kernel void @test_export_compr_mrt7_v2i16() #0 {
   call void @llvm.amdgcn.exp.compr.v2i16(i32 7, i32 15, <2 x i16> <i16 5, i16 5>, <2 x i16> <i16 5, i16 5>, i1 false, i1 false)
   call void @llvm.amdgcn.exp.compr.v2i16(i32 7, i32 15, <2 x i16> <i16 5, i16 5>, <2 x i16> <i16 5, i16 5>, i1 true, i1 false)
   ret void
@@ -121,22 +140,22 @@ define void @test_export_compr_mrt7_v2i16() #0 {
 ; GCN-LABEL: {{^}}test_export_compr_z_v2i16:
 ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
-; GCN: exp mrtz [[SRC0]], [[SRC1]], off, off compr{{$}}
-; GCN: exp mrtz [[SRC0]], [[SRC1]], off, off done compr{{$}}
-define void @test_export_compr_z_v2i16() #0 {
-  call void @llvm.amdgcn.exp.compr.v2i16(i32 8, i32 3, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 false, i1 false)
-  call void @llvm.amdgcn.exp.compr.v2i16(i32 8, i32 3, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false)
+; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr{{$}}
+; GCN: exp mrtz [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr{{$}}
+define amdgpu_kernel void @test_export_compr_z_v2i16() #0 {
+  call void @llvm.amdgcn.exp.compr.v2i16(i32 8, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 false, i1 false)
+  call void @llvm.amdgcn.exp.compr.v2i16(i32 8, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 false)
   ret void
 }
 
 ; GCN-LABEL: {{^}}test_export_compr_vm_v2i16:
 ; GCN-DAG: v_mov_b32_e32 [[SRC0:v[0-9]+]], 0x20001
 ; GCN-DAG: v_mov_b32_e32 [[SRC1:v[0-9]+]], 0x40005
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off compr vm{{$}}
-; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done compr vm{{$}}
-define void @test_export_compr_vm_v2i16() #0 {
-  call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 3, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 false, i1 true)
-  call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 3, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 true)
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] compr vm{{$}}
+; GCN: exp mrt0 [[SRC0]], [[SRC0]], [[SRC1]], [[SRC1]] done compr vm{{$}}
+define amdgpu_kernel void @test_export_compr_vm_v2i16() #0 {
+  call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 false, i1 true)
+  call void @llvm.amdgcn.exp.compr.v2i16(i32 0, i32 15, <2 x i16> <i16 1, i16 2>, <2 x i16> <i16 5, i16 4>, i1 true, i1 true)
   ret void
 }
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
index 9b5836fa56a6f242ff47b977170a426fea1f3661..6d2de108829d3386af86f2f3016a218b78d6ac81 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll
@@ -7,7 +7,7 @@ declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1) #1
 ; GCN-LABEL: {{^}}test_export_zeroes_f32:
 ; GCN: exp mrt0 off, off, off, off{{$}}
 ; GCN: exp mrt0 off, off, off, off done{{$}}
-define void @test_export_zeroes_f32() #0 {
+define amdgpu_kernel void @test_export_zeroes_f32() #0 {
 
   call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float 0.0, float 0.0, float 0.0, float 0.0, i1 true, i1 false)
@@ -22,7 +22,7 @@ define void @test_export_zeroes_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}}
-define void @test_export_en_src0_f32() #0 {
+define amdgpu_kernel void @test_export_en_src0_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
 }
@@ -33,7 +33,7 @@ define void @test_export_en_src0_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}}
-define void @test_export_en_src1_f32() #0 {
+define amdgpu_kernel void @test_export_en_src1_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
 }
@@ -44,7 +44,7 @@ define void @test_export_en_src1_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}}
-define void @test_export_en_src2_f32() #0 {
+define amdgpu_kernel void @test_export_en_src2_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
 }
@@ -55,7 +55,7 @@ define void @test_export_en_src2_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}}
-define void @test_export_en_src3_f32() #0 {
+define amdgpu_kernel void @test_export_en_src3_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
 }
@@ -66,7 +66,7 @@ define void @test_export_en_src3_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}}
-define void @test_export_en_src0_src1_f32() #0 {
+define amdgpu_kernel void @test_export_en_src0_src1_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
 }
@@ -77,7 +77,7 @@ define void @test_export_en_src0_src1_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 0.5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}}
-define void @test_export_en_src0_src2_f32() #0 {
+define amdgpu_kernel void @test_export_en_src0_src2_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 5, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
 }
@@ -89,7 +89,7 @@ define void @test_export_en_src0_src2_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}}
 ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}}
-define void @test_export_en_src0_src3_f32() #0 {
+define amdgpu_kernel void @test_export_en_src0_src3_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -102,7 +102,7 @@ define void @test_export_en_src0_src3_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_en_src0_src1_src2_src3_f32() #0 {
+define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -112,7 +112,7 @@ define void @test_export_en_src0_src1_src2_src3_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 0.5
 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}}
 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}}
-define void @test_export_mrt7_f32() #0 {
+define amdgpu_kernel void @test_export_mrt7_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 7, i32 15, float 0.5, float 0.5, float 0.5, float 0.5, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 7, i32 15, float 0.5, float 0.5, float 0.5, float 0.5, i1 true, i1 false)
   ret void
@@ -125,7 +125,7 @@ define void @test_export_mrt7_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_z_f32() #0 {
+define amdgpu_kernel void @test_export_z_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 8, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 8, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -138,7 +138,7 @@ define void @test_export_z_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_null_f32() #0 {
+define amdgpu_kernel void @test_export_null_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 9, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 9, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -151,7 +151,7 @@ define void @test_export_null_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_reserved10_f32() #0 {
+define amdgpu_kernel void @test_export_reserved10_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 10, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 10, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -164,7 +164,7 @@ define void @test_export_reserved10_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_reserved11_f32() #0 {
+define amdgpu_kernel void @test_export_reserved11_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 11, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 11, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -177,7 +177,7 @@ define void @test_export_reserved11_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_pos0_f32() #0 {
+define amdgpu_kernel void @test_export_pos0_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -190,7 +190,7 @@ define void @test_export_pos0_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_pos3_f32() #0 {
+define amdgpu_kernel void @test_export_pos3_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 15, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 15, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -203,7 +203,7 @@ define void @test_export_pos3_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_param0_f32() #0 {
+define amdgpu_kernel void @test_export_param0_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -216,7 +216,7 @@ define void @test_export_param0_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_param31_f32() #0 {
+define amdgpu_kernel void @test_export_param31_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
   ret void
@@ -229,7 +229,7 @@ define void @test_export_param31_f32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4.0
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}}
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}}
-define void @test_export_vm_f32() #0 {
+define amdgpu_kernel void @test_export_vm_f32() #0 {
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 true)
   call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 true)
   ret void
@@ -252,7 +252,7 @@ define void @test_export_vm_f32() #0 {
 ; GCN-LABEL: {{^}}test_export_zeroes_i32:
 ; GCN: exp mrt0 off, off, off, off{{$}}
 ; GCN: exp mrt0 off, off, off, off done{{$}}
-define void @test_export_zeroes_i32() #0 {
+define amdgpu_kernel void @test_export_zeroes_i32() #0 {
 
   call void @llvm.amdgcn.exp.i32(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i1 true, i1 false)
@@ -267,7 +267,7 @@ define void @test_export_zeroes_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrt0 [[SRC0]], off, off, off done{{$}}
-define void @test_export_en_src0_i32() #0 {
+define amdgpu_kernel void @test_export_en_src0_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 1, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
 }
@@ -278,7 +278,7 @@ define void @test_export_en_src0_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrt0 off, [[SRC1]], off, off done{{$}}
-define void @test_export_en_src1_i32() #0 {
+define amdgpu_kernel void @test_export_en_src1_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 2, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
 }
@@ -289,7 +289,7 @@ define void @test_export_en_src1_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrt0 off, off, [[SRC2]], off done{{$}}
-define void @test_export_en_src2_i32() #0 {
+define amdgpu_kernel void @test_export_en_src2_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 4, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
 }
@@ -300,7 +300,7 @@ define void @test_export_en_src2_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrt0 off, off, off, [[SRC3]] done{{$}}
-define void @test_export_en_src3_i32() #0 {
+define amdgpu_kernel void @test_export_en_src3_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 8, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
 }
@@ -311,7 +311,7 @@ define void @test_export_en_src3_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], off, off done{{$}}
-define void @test_export_en_src0_src1_i32() #0 {
+define amdgpu_kernel void @test_export_en_src0_src1_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 3, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
 }
@@ -322,7 +322,7 @@ define void @test_export_en_src0_src1_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC2:v[0-9]+]], 5
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrt0 [[SRC0]], off, [[SRC2]], off done{{$}}
-define void @test_export_en_src0_src2_i32() #0 {
+define amdgpu_kernel void @test_export_en_src0_src2_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 5, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
 }
@@ -334,7 +334,7 @@ define void @test_export_en_src0_src2_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]]{{$}}
 ; GCN: exp mrt0 [[SRC0]], off, off, [[SRC3]] done{{$}}
-define void @test_export_en_src0_src3_i32() #0 {
+define amdgpu_kernel void @test_export_en_src0_src3_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 9, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 0, i32 9, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -347,7 +347,7 @@ define void @test_export_en_src0_src3_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_en_src0_src1_src2_src3_i32() #0 {
+define amdgpu_kernel void @test_export_en_src0_src1_src2_src3_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -357,7 +357,7 @@ define void @test_export_en_src0_src1_src2_src3_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[VHALF:v[0-9]+]], 5
 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]]{{$}}
 ; GCN: exp mrt7 [[VHALF]], [[VHALF]], [[VHALF]], [[VHALF]] done{{$}}
-define void @test_export_mrt7_i32() #0 {
+define amdgpu_kernel void @test_export_mrt7_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 7, i32 15, i32 5, i32 5, i32 5, i32 5, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 7, i32 15, i32 5, i32 5, i32 5, i32 5, i1 true, i1 false)
   ret void
@@ -370,7 +370,7 @@ define void @test_export_mrt7_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp mrtz [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_z_i32() #0 {
+define amdgpu_kernel void @test_export_z_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 8, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 8, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -383,7 +383,7 @@ define void @test_export_z_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp null [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_null_i32() #0 {
+define amdgpu_kernel void @test_export_null_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 9, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 9, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -396,7 +396,7 @@ define void @test_export_null_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp invalid_target_10 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_reserved10_i32() #0 {
+define amdgpu_kernel void @test_export_reserved10_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 10, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 10, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -409,7 +409,7 @@ define void @test_export_reserved10_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp invalid_target_11 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_reserved11_i32() #0 {
+define amdgpu_kernel void @test_export_reserved11_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 11, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 11, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -422,7 +422,7 @@ define void @test_export_reserved11_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp pos0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_pos0_i32() #0 {
+define amdgpu_kernel void @test_export_pos0_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 12, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 12, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -435,7 +435,7 @@ define void @test_export_pos0_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp pos3 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_pos3_i32() #0 {
+define amdgpu_kernel void @test_export_pos3_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 15, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 15, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -448,7 +448,7 @@ define void @test_export_pos3_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp param0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_param0_i32() #0 {
+define amdgpu_kernel void @test_export_param0_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 32, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 32, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -461,7 +461,7 @@ define void @test_export_param0_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]]{{$}}
 ; GCN: exp param31 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done{{$}}
-define void @test_export_param31_i32() #0 {
+define amdgpu_kernel void @test_export_param31_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 63, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 false)
   call void @llvm.amdgcn.exp.i32(i32 63, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 false)
   ret void
@@ -474,7 +474,7 @@ define void @test_export_param31_i32() #0 {
 ; GCN-DAG: v_mov_b32_e32 [[SRC3:v[0-9]+]], 4
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] vm{{$}}
 ; GCN: exp mrt0 [[SRC0]], [[SRC1]], [[SRC2]], [[SRC3]] done vm{{$}}
-define void @test_export_vm_i32() #0 {
+define amdgpu_kernel void @test_export_vm_i32() #0 {
   call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 false, i1 true)
   call void @llvm.amdgcn.exp.i32(i32 0, i32 15, i32 1, i32 2, i32 5, i32 4, i1 true, i1 true)
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll
index 427ad5ef553d6946dff4a5a7d572243e5f711f81..c9993ee88369c4399f4aad057e7bed3d2d601bab 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fcmp.ll
@@ -5,9 +5,17 @@ declare i64 @llvm.amdgcn.fcmp.f32(float, float, i32) #0
 declare i64 @llvm.amdgcn.fcmp.f64(double, double, i32) #0
 declare float @llvm.fabs.f32(float) #0
 
+; GCN-LABEL: {{^}}v_fcmp_f32_dynamic_cc:
+; GCN: s_endpgm
+define amdgpu_kernel void @v_fcmp_f32_dynamic_cc(i64 addrspace(1)* %out, float %src0, float %src1, i32 %cc) {
+  %result = call i64 @llvm.amdgcn.fcmp.f32(float %src0, float %src1, i32 %cc)
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
 ; GCN-LABEL: {{^}}v_fcmp_f32_oeq_with_fabs:
 ; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{v[0-9]+}}|, {{s[0-9]+}}
-define void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float %a) {
+define amdgpu_kernel void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float %a) {
   %temp = call float @llvm.fabs.f32(float %a)
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float %temp, i32 1)
   store i64 %result, i64 addrspace(1)* %out
@@ -16,7 +24,7 @@ define void @v_fcmp_f32_oeq_with_fabs(i64 addrspace(1)* %out, float %src, float
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_oeq_both_operands_with_fabs:
 ; GCN: v_cmp_eq_f32_e64 {{s\[[0-9]+:[0-9]+\]}}, |{{v[0-9]+}}|, |{{s[0-9]+}}|
-define void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, float %src, float %a) {
+define amdgpu_kernel void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, float %src, float %a) {
   %temp = call float @llvm.fabs.f32(float %a)
   %src_input = call float @llvm.fabs.f32(float %src)
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src_input, float %temp, i32 1)
@@ -26,7 +34,7 @@ define void @v_fcmp_f32_oeq_both_operands_with_fabs(i64 addrspace(1)* %out, floa
 
 ; GCN-LABEL: {{^}}v_fcmp:
 ; GCN-NOT: v_cmp_eq_f32_e64
-define void @v_fcmp(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 -1)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -34,7 +42,7 @@ define void @v_fcmp(i64 addrspace(1)* %out, float %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_oeq:
 ; GCN: v_cmp_eq_f32_e64
-define void @v_fcmp_f32_oeq(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_oeq(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 1)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -42,7 +50,7 @@ define void @v_fcmp_f32_oeq(i64 addrspace(1)* %out, float %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_one:
 ; GCN: v_cmp_neq_f32_e64
-define void @v_fcmp_f32_one(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_one(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 6)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -50,7 +58,7 @@ define void @v_fcmp_f32_one(i64 addrspace(1)* %out, float %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_ogt:
 ; GCN: v_cmp_gt_f32_e64
-define void @v_fcmp_f32_ogt(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ogt(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 2)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -58,7 +66,7 @@ define void @v_fcmp_f32_ogt(i64 addrspace(1)* %out, float %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_oge:
 ; GCN: v_cmp_ge_f32_e64
-define void @v_fcmp_f32_oge(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_oge(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 3)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -66,7 +74,7 @@ define void @v_fcmp_f32_oge(i64 addrspace(1)* %out, float %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_olt:
 ; GCN: v_cmp_lt_f32_e64
-define void @v_fcmp_f32_olt(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_olt(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 4)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -74,7 +82,7 @@ define void @v_fcmp_f32_olt(i64 addrspace(1)* %out, float %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_ole:
 ; GCN: v_cmp_le_f32_e64
-define void @v_fcmp_f32_ole(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ole(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 5)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -83,7 +91,7 @@ define void @v_fcmp_f32_ole(i64 addrspace(1)* %out, float %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_ueq:
 ; GCN: v_cmp_nlg_f32_e64
-define void @v_fcmp_f32_ueq(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ueq(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 9)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -91,7 +99,7 @@ define void @v_fcmp_f32_ueq(i64 addrspace(1)* %out, float %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_une:
 ; GCN: v_cmp_neq_f32_e64
-define void @v_fcmp_f32_une(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_une(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 14)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -99,7 +107,7 @@ define void @v_fcmp_f32_une(i64 addrspace(1)* %out, float %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_ugt:
 ; GCN: v_cmp_nle_f32_e64
-define void @v_fcmp_f32_ugt(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ugt(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 10)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -107,7 +115,7 @@ define void @v_fcmp_f32_ugt(i64 addrspace(1)* %out, float %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_uge:
 ; GCN: v_cmp_nlt_f32_e64
-define void @v_fcmp_f32_uge(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_uge(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 11)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -115,7 +123,7 @@ define void @v_fcmp_f32_uge(i64 addrspace(1)* %out, float %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_ult:
 ; GCN: v_cmp_nge_f32_e64
-define void @v_fcmp_f32_ult(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ult(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 12)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -123,7 +131,7 @@ define void @v_fcmp_f32_ult(i64 addrspace(1)* %out, float %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f32_ule:
 ; GCN: v_cmp_ngt_f32_e64
-define void @v_fcmp_f32_ule(i64 addrspace(1)* %out, float %src) {
+define amdgpu_kernel void @v_fcmp_f32_ule(i64 addrspace(1)* %out, float %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f32(float %src, float 100.00, i32 13)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -131,7 +139,7 @@ define void @v_fcmp_f32_ule(i64 addrspace(1)* %out, float %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_oeq:
 ; GCN: v_cmp_eq_f64_e64
-define void @v_fcmp_f64_oeq(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_oeq(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 1)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -139,7 +147,7 @@ define void @v_fcmp_f64_oeq(i64 addrspace(1)* %out, double %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_one:
 ; GCN: v_cmp_neq_f64_e64
-define void @v_fcmp_f64_one(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_one(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 6)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -147,7 +155,7 @@ define void @v_fcmp_f64_one(i64 addrspace(1)* %out, double %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_ogt:
 ; GCN: v_cmp_gt_f64_e64
-define void @v_fcmp_f64_ogt(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ogt(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 2)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -155,7 +163,7 @@ define void @v_fcmp_f64_ogt(i64 addrspace(1)* %out, double %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_oge:
 ; GCN: v_cmp_ge_f64_e64
-define void @v_fcmp_f64_oge(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_oge(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 3)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -163,7 +171,7 @@ define void @v_fcmp_f64_oge(i64 addrspace(1)* %out, double %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_olt:
 ; GCN: v_cmp_lt_f64_e64
-define void @v_fcmp_f64_olt(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_olt(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 4)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -171,7 +179,7 @@ define void @v_fcmp_f64_olt(i64 addrspace(1)* %out, double %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_ole:
 ; GCN: v_cmp_le_f64_e64
-define void @v_fcmp_f64_ole(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ole(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 5)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -179,7 +187,7 @@ define void @v_fcmp_f64_ole(i64 addrspace(1)* %out, double %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_ueq:
 ; GCN: v_cmp_nlg_f64_e64
-define void @v_fcmp_f64_ueq(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ueq(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 9)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -187,7 +195,7 @@ define void @v_fcmp_f64_ueq(i64 addrspace(1)* %out, double %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_une:
 ; GCN: v_cmp_neq_f64_e64
-define void @v_fcmp_f64_une(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_une(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 14)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -195,7 +203,7 @@ define void @v_fcmp_f64_une(i64 addrspace(1)* %out, double %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_ugt:
 ; GCN: v_cmp_nle_f64_e64
-define void @v_fcmp_f64_ugt(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ugt(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 10)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -203,7 +211,7 @@ define void @v_fcmp_f64_ugt(i64 addrspace(1)* %out, double %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_uge:
 ; GCN: v_cmp_nlt_f64_e64
-define void @v_fcmp_f64_uge(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_uge(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 11)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -211,7 +219,7 @@ define void @v_fcmp_f64_uge(i64 addrspace(1)* %out, double %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_ult:
 ; GCN: v_cmp_nge_f64_e64
-define void @v_fcmp_f64_ult(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ult(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 12)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -219,7 +227,7 @@ define void @v_fcmp_f64_ult(i64 addrspace(1)* %out, double %src) {
 
 ; GCN-LABEL: {{^}}v_fcmp_f64_ule:
 ; GCN: v_cmp_ngt_f64_e64
-define void @v_fcmp_f64_ule(i64 addrspace(1)* %out, double %src) {
+define amdgpu_kernel void @v_fcmp_f64_ule(i64 addrspace(1)* %out, double %src) {
   %result = call i64 @llvm.amdgcn.fcmp.f64(double %src, double 100.00, i32 13)
   store i64 %result, i64 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
index 54d7848da3bfbc241dbf79cff0fb5423a211c88f..248ee9904da030215ae99c42254de164f34aa960 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fdiv.fast.ll
@@ -8,7 +8,7 @@ declare float @llvm.amdgcn.fdiv.fast(float, float) #0
 ; CHECK: v_rcp_f32_e32
 ; CHECK: v_mul_f32_e32
 ; CHECK: v_mul_f32_e32
-define void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 {
+define amdgpu_kernel void @test_fdiv_fast(float addrspace(1)* %out, float %a, float %b) #1 {
   %fdiv = call float @llvm.amdgcn.fdiv.fast(float %a, float %b)
   store float %fdiv, float addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a4ae37b23c5f407e295734f0d7b5bbcaeef073f6
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.f16.ll
@@ -0,0 +1,39 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_fmed3_f16:
+; GCN: v_med3_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @test_fmed3_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 {
+  %src0.f16 = trunc i32 %src0.arg to i16
+  %src0 = bitcast i16 %src0.f16 to half
+  %src1.f16 = trunc i32 %src1.arg to i16
+  %src1 = bitcast i16 %src1.f16 to half
+  %src2.f16 = trunc i32 %src2.arg to i16
+  %src2 = bitcast i16 %src2.f16 to half
+  %mad = call half @llvm.amdgcn.fmed3.f16(half %src0, half %src1, half %src2)
+  store half %mad, half addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fmed3_srcmods_f16:
+; GCN: v_med3_f16 v{{[0-9]+}}, -s{{[0-9]+}}, |v{{[0-9]+}}|, -|v{{[0-9]+}}|
+define amdgpu_kernel void @test_fmed3_srcmods_f16(half addrspace(1)* %out, i32 %src0.arg, i32 %src1.arg, i32 %src2.arg) #1 {
+  %src0.f16 = trunc i32 %src0.arg to i16
+  %src0 = bitcast i16 %src0.f16 to half
+  %src1.f16 = trunc i32 %src1.arg to i16
+  %src1 = bitcast i16 %src1.f16 to half
+  %src2.f16 = trunc i32 %src2.arg to i16
+  %src2 = bitcast i16 %src2.f16 to half
+  %src0.fneg = fsub half -0.0, %src0
+  %src1.fabs = call half @llvm.fabs.f16(half %src1)
+  %src2.fabs = call half @llvm.fabs.f16(half %src2)
+  %src2.fneg.fabs = fsub half -0.0, %src2.fabs
+  %mad = call half @llvm.amdgcn.fmed3.f16(half %src0.fneg, half %src1.fabs, half %src2.fneg.fabs)
+  store half %mad, half addrspace(1)* %out
+  ret void
+}
+
+declare half @llvm.amdgcn.fmed3.f16(half, half, half) #0
+declare half @llvm.fabs.f16(half) #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
new file mode 100644
index 0000000000000000000000000000000000000000..230e625ad45bb533c1c8a12c44d5a85b78cfd7dc
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll
@@ -0,0 +1,28 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}test_fmed3:
+; GCN: v_med3_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @test_fmed3(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 {
+  %mad = call float @llvm.amdgcn.fmed3.f32(float %src0, float %src1, float %src2)
+  store float %mad, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_fmed3_srcmods:
+; GCN: v_med3_f32 v{{[0-9]+}}, -s{{[0-9]+}}, |v{{[0-9]+}}|, -|v{{[0-9]+}}|
+define amdgpu_kernel void @test_fmed3_srcmods(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 {
+  %src0.fneg = fsub float -0.0, %src0
+  %src1.fabs = call float @llvm.fabs.f32(float %src1)
+  %src2.fabs = call float @llvm.fabs.f32(float %src2)
+  %src2.fneg.fabs = fsub float -0.0, %src2.fabs
+  %mad = call float @llvm.amdgcn.fmed3.f32(float %src0.fneg, float %src1.fabs, float %src2.fneg.fabs)
+  store float %mad, float addrspace(1)* %out
+  ret void
+}
+
+declare float @llvm.amdgcn.fmed3.f32(float, float, float) #0
+declare float @llvm.fabs.f32(float) #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
index d5c1c0a0969b1c0f3dfbc796aa21892e839f71fe..b47d2dbc744d447068c5301ba786a12e96def586 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fmul.legacy.ll
@@ -4,7 +4,7 @@
 
 ; GCN-LABEL: {{^}}test_mul_legacy_f32:
 ; GCN: v_mul_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
-define void @test_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b) #0 {
   %result = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -12,7 +12,7 @@ define void @test_mul_legacy_f32(float addrspace(1)* %out, float %a, float %b) #
 
 ; GCN-LABEL: {{^}}test_mul_legacy_undef0_f32:
 ; GCN: v_mul_legacy_f32_e32
-define void @test_mul_legacy_undef0_f32(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_mul_legacy_undef0_f32(float addrspace(1)* %out, float %a) #0 {
   %result = call float @llvm.amdgcn.fmul.legacy(float undef, float %a)
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -20,7 +20,7 @@ define void @test_mul_legacy_undef0_f32(float addrspace(1)* %out, float %a) #0 {
 
 ; GCN-LABEL: {{^}}test_mul_legacy_undef1_f32:
 ; GCN: v_mul_legacy_f32_e32
-define void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, float %a) #0 {
   %result = call float @llvm.amdgcn.fmul.legacy(float %a, float undef)
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -28,7 +28,7 @@ define void @test_mul_legacy_undef1_f32(float addrspace(1)* %out, float %a) #0 {
 
 ; GCN-LABEL: {{^}}test_mul_legacy_fabs_f32:
 ; GCN: v_mul_legacy_f32_e64 v{{[0-9]+}}, |v{{[0-9]+}}|, |s{{[0-9]+}}|
-define void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float %b) #0 {
   %a.fabs = call float @llvm.fabs.f32(float %a)
   %b.fabs = call float @llvm.fabs.f32(float %b)
   %result = call float @llvm.amdgcn.fmul.legacy(float %a.fabs, float %b.fabs)
@@ -40,7 +40,7 @@ define void @test_mul_legacy_fabs_f32(float addrspace(1)* %out, float %a, float
 ; GCN-LABEL: {{^}}test_mad_legacy_f32:
 ; GCN: v_mul_legacy_f32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_add_f32_e32
-define void @test_mad_legacy_f32(float addrspace(1)* %out, float %a, float %b, float %c) #0 {
+define amdgpu_kernel void @test_mad_legacy_f32(float addrspace(1)* %out, float %a, float %b, float %c) #0 {
   %mul = call float @llvm.amdgcn.fmul.legacy(float %a, float %b)
   %add = fadd float %mul, %c
   store float %add, float addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll
index d8c1af036a34c46ddc635f6a387cfde6aa5c67e4..026f6901fc7f20f678f18eea47116da3dcc2f790 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.f16.ll
@@ -7,7 +7,7 @@ declare half @llvm.amdgcn.fract.f16(half %a)
 ; VI:  v_fract_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fract_f16(
+define amdgpu_kernel void @fract_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
index a75267b8d6933a1787b201c504df25d1e5fb177f..d4f1c5fd9be76b6fdb8513ba8c6c1f265258917d 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.fract.ll
@@ -6,7 +6,7 @@ declare double @llvm.amdgcn.fract.f64(double) #0
 
 ; GCN-LABEL: {{^}}v_fract_f32:
 ; GCN: v_fract_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @v_fract_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @v_fract_f32(float addrspace(1)* %out, float %src) #1 {
   %fract = call float @llvm.amdgcn.fract.f32(float %src)
   store float %fract, float addrspace(1)* %out
   ret void
@@ -14,7 +14,7 @@ define void @v_fract_f32(float addrspace(1)* %out, float %src) #1 {
 
 ; GCN-LABEL: {{^}}v_fract_f64:
 ; GCN: v_fract_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @v_fract_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @v_fract_f64(double addrspace(1)* %out, double %src) #1 {
   %fract = call double @llvm.amdgcn.fract.f64(double %src)
   store double %fract, double addrspace(1)* %out
   ret void
@@ -22,9 +22,8 @@ define void @v_fract_f64(double addrspace(1)* %out, double %src) #1 {
 
 ; GCN-LABEL: {{^}}v_fract_undef_f32:
 ; GCN-NOT: v_fract_f32
-; GCN-NOT: v0
-; GCN: buffer_store_dword v0
-define void @v_fract_undef_f32(float addrspace(1)* %out) #1 {
+; GCN-NOT: store_dword
+define amdgpu_kernel void @v_fract_undef_f32(float addrspace(1)* %out) #1 {
   %fract = call float @llvm.amdgcn.fract.f32(float undef)
   store float %fract, float addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll
index 7521224058f3c9f6ababf51710799b38b09cd188..dc3eb4ce191e258e619aec07407a5446ecf48117 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.f16.ll
@@ -6,7 +6,7 @@ declare i16 @llvm.amdgcn.frexp.exp.i16.f16(half %a)
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
 ; VI:  v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_I16]]
-define void @frexp_exp_f16(
+define amdgpu_kernel void @frexp_exp_f16(
     i16 addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -21,7 +21,7 @@ entry:
 ; VI:  v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]]
 ; VI:  v_bfe_i32 v[[R_I32:[0-9]+]], v[[R_I16]], 0, 16{{$}}
 ; GCN: buffer_store_dword v[[R_I32]]
-define void @frexp_exp_f16_sext(
+define amdgpu_kernel void @frexp_exp_f16_sext(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -37,7 +37,7 @@ entry:
 ; VI:  v_frexp_exp_i16_f16_e32 v[[R_I16:[0-9]+]], v[[A_F16]]
 ; VI:  v_and_b32_e32 v[[R_I32:[0-9]+]], 0xffff, v[[R_I16]]
 ; GCN: buffer_store_dword v[[R_I32]]
-define void @frexp_exp_f16_zext(
+define amdgpu_kernel void @frexp_exp_f16_zext(
     i32 addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
index 9c49f175f2b5b5de4573a1f9145cfcc67690af77..0d686147caf81f6129f8fbb1a5ec69bc93dc8bec 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.exp.ll
@@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.frexp.exp.i32.f64(double) #0
 
 ; GCN-LABEL: {{^}}s_test_frexp_exp_f32:
 ; GCN: v_frexp_exp_i32_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @s_test_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
   %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %src)
   store i32 %frexp.exp, i32 addrspace(1)* %out
   ret void
@@ -16,7 +16,7 @@ define void @s_test_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
 
 ; GCN-LABEL: {{^}}s_test_fabs_frexp_exp_f32:
 ; GCN: v_frexp_exp_i32_f32_e64 {{v[0-9]+}}, |{{s[0-9]+}}|
-define void @s_test_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
   %fabs.src = call float @llvm.fabs.f32(float %src)
   %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %fabs.src)
   store i32 %frexp.exp, i32 addrspace(1)* %out
@@ -25,7 +25,7 @@ define void @s_test_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
 
 ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_exp_f32:
 ; GCN: v_frexp_exp_i32_f32_e64 {{v[0-9]+}}, -|{{s[0-9]+}}|
-define void @s_test_fneg_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_fneg_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src) #1 {
   %fabs.src = call float @llvm.fabs.f32(float %src)
   %fneg.fabs.src = fsub float -0.0, %fabs.src
   %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f32(float %fneg.fabs.src)
@@ -35,7 +35,7 @@ define void @s_test_fneg_fabs_frexp_exp_f32(i32 addrspace(1)* %out, float %src)
 
 ; GCN-LABEL: {{^}}s_test_frexp_exp_f64:
 ; GCN: v_frexp_exp_i32_f64_e32 {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @s_test_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
   %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %src)
   store i32 %frexp.exp, i32 addrspace(1)* %out
   ret void
@@ -43,7 +43,7 @@ define void @s_test_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
 
 ; GCN-LABEL: {{^}}s_test_fabs_frexp_exp_f64:
 ; GCN: v_frexp_exp_i32_f64_e64 {{v[0-9]+}}, |{{s\[[0-9]+:[0-9]+\]}}|
-define void @s_test_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
   %fabs.src = call double @llvm.fabs.f64(double %src)
   %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %fabs.src)
   store i32 %frexp.exp, i32 addrspace(1)* %out
@@ -52,7 +52,7 @@ define void @s_test_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
 
 ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_exp_f64:
 ; GCN: v_frexp_exp_i32_f64_e64 {{v[0-9]+}}, -|{{s\[[0-9]+:[0-9]+\]}}|
-define void @s_test_fneg_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_fneg_fabs_frexp_exp_f64(i32 addrspace(1)* %out, double %src) #1 {
   %fabs.src = call double @llvm.fabs.f64(double %src)
   %fneg.fabs.src = fsub double -0.0, %fabs.src
   %frexp.exp = call i32 @llvm.amdgcn.frexp.exp.i32.f64(double %fneg.fabs.src)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll
index 706537d7e21c05ce8785b429e46ce07edf9b2f32..722cd44e99fbd28e2b1dcc909bb2ae7ad1f1089f 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.f16.ll
@@ -7,7 +7,7 @@ declare half @llvm.amdgcn.frexp.mant.f16(half %a)
 ; VI:  v_frexp_mant_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @frexp_mant_f16(
+define amdgpu_kernel void @frexp_mant_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
index b8d63defffedc7bef76b4037deb9eb165bbc2da4..605dc3db2b989ab3bbd9a63b72054a8fe3b86a47 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.frexp.mant.ll
@@ -8,7 +8,7 @@ declare double @llvm.amdgcn.frexp.mant.f64(double) #0
 
 ; GCN-LABEL: {{^}}s_test_frexp_mant_f32:
 ; GCN: v_frexp_mant_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @s_test_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
   %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %src)
   store float %frexp.mant, float addrspace(1)* %out
   ret void
@@ -16,7 +16,7 @@ define void @s_test_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
 
 ; GCN-LABEL: {{^}}s_test_fabs_frexp_mant_f32:
 ; GCN: v_frexp_mant_f32_e64 {{v[0-9]+}}, |{{s[0-9]+}}|
-define void @s_test_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
   %fabs.src = call float @llvm.fabs.f32(float %src)
   %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %fabs.src)
   store float %frexp.mant, float addrspace(1)* %out
@@ -25,7 +25,7 @@ define void @s_test_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1
 
 ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_mant_f32:
 ; GCN: v_frexp_mant_f32_e64 {{v[0-9]+}}, -|{{s[0-9]+}}|
-define void @s_test_fneg_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @s_test_fneg_fabs_frexp_mant_f32(float addrspace(1)* %out, float %src) #1 {
   %fabs.src = call float @llvm.fabs.f32(float %src)
   %fneg.fabs.src = fsub float -0.0, %fabs.src
   %frexp.mant = call float @llvm.amdgcn.frexp.mant.f32(float %fneg.fabs.src)
@@ -35,7 +35,7 @@ define void @s_test_fneg_fabs_frexp_mant_f32(float addrspace(1)* %out, float %sr
 
 ; GCN-LABEL: {{^}}s_test_frexp_mant_f64:
 ; GCN: v_frexp_mant_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @s_test_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
   %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %src)
   store double %frexp.mant, double addrspace(1)* %out
   ret void
@@ -43,7 +43,7 @@ define void @s_test_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
 
 ; GCN-LABEL: {{^}}s_test_fabs_frexp_mant_f64:
 ; GCN: v_frexp_mant_f64_e64 {{v\[[0-9]+:[0-9]+\]}}, |{{s\[[0-9]+:[0-9]+\]}}|
-define void @s_test_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
   %fabs.src = call double @llvm.fabs.f64(double %src)
   %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %fabs.src)
   store double %frexp.mant, double addrspace(1)* %out
@@ -52,7 +52,7 @@ define void @s_test_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src)
 
 ; GCN-LABEL: {{^}}s_test_fneg_fabs_frexp_mant_f64:
 ; GCN: v_frexp_mant_f64_e64 {{v\[[0-9]+:[0-9]+\]}}, -|{{s\[[0-9]+:[0-9]+\]}}|
-define void @s_test_fneg_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @s_test_fneg_fabs_frexp_mant_f64(double addrspace(1)* %out, double %src) #1 {
   %fabs.src = call double @llvm.fabs.f64(double %src)
   %fneg.fabs.src = fsub double -0.0, %fabs.src
   %frexp.mant = call double @llvm.amdgcn.frexp.mant.f64(double %fneg.fabs.src)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
index 6014e2ed85f8a2d0992aca5af4725a89d8206a4d..d26fab4cebe1843d221a199909a1a6935cd1c90a 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.groupstaticsize.ll
@@ -9,7 +9,7 @@
 
 ; CHECK-LABEL: {{^}}groupstaticsize_test0:
 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0x800{{$}}
-define void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {
+define amdgpu_kernel void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %lds_size) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 64
   %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
@@ -23,7 +23,7 @@ define void @groupstaticsize_test0(float addrspace(1)* %out, i32 addrspace(1)* %
 
 ; CHECK-LABEL: {{^}}groupstaticsize_test1:
 ; CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xc00{{$}}
-define void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) {
+define amdgpu_kernel void @groupstaticsize_test1(float addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %lds_size) {
 entry:
   %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize() #1
   store i32 %static_lds_size, i32 addrspace(1)* %lds_size, align 4
@@ -51,7 +51,7 @@ endif:                                            ; preds = %else, %if
 ; Exceeds 16-bit simm limit of s_movk_i32
 ; CHECK-LABEL: {{^}}large_groupstaticsize:
 ; CHECK: v_mov_b32_e32 [[REG:v[0-9]+]], 0x4000{{$}}
-define void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 {
+define amdgpu_kernel void @large_groupstaticsize(i32 addrspace(1)* %size, i32 %idx) #0 {
   %gep = getelementptr inbounds [4096 x i32], [4096 x i32] addrspace(3)* @large, i32 0, i32 %idx
   store volatile i32 0, i32 addrspace(3)* %gep
   %static_lds_size = call i32 @llvm.amdgcn.groupstaticsize()
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll
index 6d0457bc6489ca0396285b75ccd604a2e83df1d1..aa04af7a64a900f293776aedbc1ce0df123fb280 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.ll
@@ -4,9 +4,18 @@
 declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) #0
 declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) #0
 
+; No crash on invalid input
+; GCN-LABEL: {{^}}v_icmp_i32_dynamic_cc:
+; GCN: s_endpgm
+define amdgpu_kernel void @v_icmp_i32_dynamic_cc(i64 addrspace(1)* %out, i32 %src, i32 %cc) {
+  %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 %cc)
+  store i64 %result, i64 addrspace(1)* %out
+  ret void
+}
+
 ; GCN-LABEL: {{^}}v_icmp_i32_eq:
 ; GCN: v_cmp_eq_u32_e64
-define void @v_icmp_i32_eq(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_eq(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 32)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -14,14 +23,14 @@ define void @v_icmp_i32_eq(i64 addrspace(1)* %out, i32 %src) {
 
 ; GCN-LABEL: {{^}}v_icmp:
 ; GCN-NOT: v_cmp_eq_u32_e64
-define void @v_icmp(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 30)
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
 ; GCN-LABEL: {{^}}v_icmp_i32_ne:
 ; GCN: v_cmp_ne_u32_e64
-define void @v_icmp_i32_ne(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_ne(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 33)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -29,7 +38,7 @@ define void @v_icmp_i32_ne(i64 addrspace(1)* %out, i32 %src) {
 
 ; GCN-LABEL: {{^}}v_icmp_u32_ugt:
 ; GCN: v_cmp_gt_u32_e64
-define void @v_icmp_u32_ugt(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_u32_ugt(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 34)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -37,7 +46,7 @@ define void @v_icmp_u32_ugt(i64 addrspace(1)* %out, i32 %src) {
 
 ; GCN-LABEL: {{^}}v_icmp_u32_uge:
 ; GCN: v_cmp_ge_u32_e64
-define void @v_icmp_u32_uge(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_u32_uge(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 35)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -45,7 +54,7 @@ define void @v_icmp_u32_uge(i64 addrspace(1)* %out, i32 %src) {
 
 ; GCN-LABEL: {{^}}v_icmp_u32_ult:
 ; GCN: v_cmp_lt_u32_e64
-define void @v_icmp_u32_ult(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_u32_ult(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 36)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -53,7 +62,7 @@ define void @v_icmp_u32_ult(i64 addrspace(1)* %out, i32 %src) {
 
 ; GCN-LABEL: {{^}}v_icmp_u32_ule:
 ; GCN: v_cmp_le_u32_e64
-define void @v_icmp_u32_ule(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_u32_ule(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 37)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -61,7 +70,7 @@ define void @v_icmp_u32_ule(i64 addrspace(1)* %out, i32 %src) {
 
 ; GCN-LABEL: {{^}}v_icmp_i32_sgt:
 ; GCN: v_cmp_gt_i32_e64
-define void @v_icmp_i32_sgt(i64 addrspace(1)* %out, i32 %src) #1 {
+define amdgpu_kernel void @v_icmp_i32_sgt(i64 addrspace(1)* %out, i32 %src) #1 {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 38)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -69,7 +78,7 @@ define void @v_icmp_i32_sgt(i64 addrspace(1)* %out, i32 %src) #1 {
 
 ; GCN-LABEL: {{^}}v_icmp_i32_sge:
 ; GCN: v_cmp_ge_i32_e64
-define void @v_icmp_i32_sge(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_sge(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 39)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -77,14 +86,14 @@ define void @v_icmp_i32_sge(i64 addrspace(1)* %out, i32 %src) {
 
 ; GCN-LABEL: {{^}}v_icmp_i32_slt:
 ; GCN: v_cmp_lt_i32_e64
-define void @v_icmp_i32_slt(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_slt(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 40)
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
 ; GCN-LABEL: {{^}}v_icmp_i32_sle:
 ; GCN: v_cmp_le_i32_e64
-define void @v_icmp_i32_sle(i64 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_icmp_i32_sle(i64 addrspace(1)* %out, i32 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i32(i32 %src, i32 100, i32 41)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -92,7 +101,7 @@ define void @v_icmp_i32_sle(i64 addrspace(1)* %out, i32 %src) {
 
 ; GCN-LABEL: {{^}}v_icmp_i64_eq:
 ; GCN: v_cmp_eq_u64_e64
-define void @v_icmp_i64_eq(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_eq(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 32)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -100,7 +109,7 @@ define void @v_icmp_i64_eq(i64 addrspace(1)* %out, i64 %src) {
 
 ; GCN-LABEL: {{^}}v_icmp_i64_ne:
 ; GCN: v_cmp_ne_u64_e64
-define void @v_icmp_i64_ne(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_ne(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 33)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -108,7 +117,7 @@ define void @v_icmp_i64_ne(i64 addrspace(1)* %out, i64 %src) {
 
 ; GCN-LABEL: {{^}}v_icmp_u64_ugt:
 ; GCN: v_cmp_gt_u64_e64
-define void @v_icmp_u64_ugt(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_ugt(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 34)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -116,7 +125,7 @@ define void @v_icmp_u64_ugt(i64 addrspace(1)* %out, i64 %src) {
 
 ; GCN-LABEL: {{^}}v_icmp_u64_uge:
 ; GCN: v_cmp_ge_u64_e64
-define void @v_icmp_u64_uge(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_uge(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 35)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -124,7 +133,7 @@ define void @v_icmp_u64_uge(i64 addrspace(1)* %out, i64 %src) {
 
 ; GCN-LABEL: {{^}}v_icmp_u64_ult:
 ; GCN: v_cmp_lt_u64_e64
-define void @v_icmp_u64_ult(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_ult(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 36)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -132,7 +141,7 @@ define void @v_icmp_u64_ult(i64 addrspace(1)* %out, i64 %src) {
 
 ; GCN-LABEL: {{^}}v_icmp_u64_ule:
 ; GCN: v_cmp_le_u64_e64
-define void @v_icmp_u64_ule(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_u64_ule(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 37)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -140,7 +149,7 @@ define void @v_icmp_u64_ule(i64 addrspace(1)* %out, i64 %src) {
 
 ; GCN-LABEL: {{^}}v_icmp_i64_sgt:
 ; GCN: v_cmp_gt_i64_e64
-define void @v_icmp_i64_sgt(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_sgt(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 38)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -148,7 +157,7 @@ define void @v_icmp_i64_sgt(i64 addrspace(1)* %out, i64 %src) {
 
 ; GCN-LABEL: {{^}}v_icmp_i64_sge:
 ; GCN: v_cmp_ge_i64_e64
-define void @v_icmp_i64_sge(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_sge(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 39)
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -156,14 +165,14 @@ define void @v_icmp_i64_sge(i64 addrspace(1)* %out, i64 %src) {
 
 ; GCN-LABEL: {{^}}v_icmp_i64_slt:
 ; GCN: v_cmp_lt_i64_e64
-define void @v_icmp_i64_slt(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_slt(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 40)
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
 ; GCN-LABEL: {{^}}v_icmp_i64_sle:
 ; GCN: v_cmp_le_i64_e64
-define void @v_icmp_i64_sle(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_icmp_i64_sle(i64 addrspace(1)* %out, i64 %src) {
   %result = call i64 @llvm.amdgcn.icmp.i64(i64 %src, i64 100, i32 41)
   store i64 %result, i64 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll
index a65f422742c9343177bf93e9b0cc522b87b6c95c..a9351dbb27d2a7ec46b77ea9e1b2da80f6f59ff4 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.gather4.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}gather4_v2:
 ; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_v2(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_v2(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -12,7 +12,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4:
 ; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_cl:
 ; GCN: image_gather4_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -30,7 +30,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_l:
 ; GCN: image_gather4_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_l(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_l(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -39,7 +39,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_b:
 ; GCN: image_gather4_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_b(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_b(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -48,7 +48,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_b_cl:
 ; GCN: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_b_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_b_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -57,7 +57,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_b_cl_v8:
 ; GCN: image_gather4_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_b_cl_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_b_cl_v8(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -66,7 +66,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_lz_v2:
 ; GCN: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_lz_v2(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_lz_v2(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -75,7 +75,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_lz:
 ; GCN: image_gather4_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_lz(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_lz(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -86,7 +86,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_o:
 ; GCN: image_gather4_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -95,7 +95,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_cl_o:
 ; GCN: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_cl_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_cl_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -104,7 +104,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_cl_o_v8:
 ; GCN: image_gather4_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_cl_o_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_cl_o_v8(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -113,7 +113,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_l_o:
 ; GCN: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_l_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_l_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -122,7 +122,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_l_o_v8:
 ; GCN: image_gather4_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_l_o_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_l_o_v8(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.l.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -131,7 +131,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_b_o:
 ; GCN: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_b_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_b_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -140,7 +140,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_b_o_v8:
 ; GCN: image_gather4_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_b_o_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_b_o_v8(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.b.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -149,7 +149,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_b_cl_o:
 ; GCN: image_gather4_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_b_cl_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_b_cl_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.b.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -158,7 +158,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_lz_o:
 ; GCN: image_gather4_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_lz_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_lz_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -168,7 +168,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_c:
 ; GCN: image_gather4_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -177,7 +177,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_c_cl:
 ; GCN: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -186,7 +186,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_c_cl_v8:
 ; GCN: image_gather4_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_cl_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_cl_v8(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -195,7 +195,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_c_l:
 ; GCN: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_l(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_l(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -204,7 +204,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_c_l_v8:
 ; GCN: image_gather4_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_l_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_l_v8(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.l.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -213,7 +213,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_c_b:
 ; GCN: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_b(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_b(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -222,7 +222,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_c_b_v8:
 ; GCN: image_gather4_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_b_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_b_v8(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -231,7 +231,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_c_b_cl:
 ; GCN: image_gather4_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_b_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_b_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -240,7 +240,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_c_lz:
 ; GCN: image_gather4_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_lz(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_lz(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -250,7 +250,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_c_o:
 ; GCN: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -259,7 +259,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_c_o_v8:
 ; GCN: image_gather4_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_o_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_o_v8(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -268,7 +268,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_c_cl_o:
 ; GCN: image_gather4_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_cl_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_cl_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -277,7 +277,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_c_l_o:
 ; GCN: image_gather4_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_l_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_l_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.l.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -286,7 +286,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_c_b_o:
 ; GCN: image_gather4_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_b_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_b_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -295,7 +295,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_c_b_cl_o:
 ; GCN: image_gather4_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_b_cl_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_b_cl_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.b.cl.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -304,7 +304,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_c_lz_o:
 ; GCN: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_lz_o(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_lz_o(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -313,7 +313,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_c_lz_o_v8:
 ; GCN: image_gather4_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_c_lz_o_v8(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_c_lz_o_v8(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.gather4.c.lz.o.v4f32.v8f32.v8i32(<8 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -322,7 +322,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_f32:
 ; GCN: image_gather4 {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 da
-define void @gather4_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_f32(float addrspace(1)* %out) {
 main_body:
   %r = call float @llvm.amdgcn.image.gather4.f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 1)
   store float %r, float addrspace(1)* %out
@@ -331,7 +331,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}gather4_v2f32:
 ; GCN: image_gather4 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 da
-define void @gather4_v2f32(<2 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @gather4_v2f32(<2 x float> addrspace(1)* %out) {
 main_body:
   %r = call <2 x float> @llvm.amdgcn.image.gather4.v2f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <2 x float> %r, <2 x float> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll
index ef810a330017f18c6c4f1eb9f42350bdd1c491f9..2e78e2a4c6f57dac8eaee1e9fa88f06e430a6947 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.getlod.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}getlod:
 ; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da
-define void @getlod(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @getlod(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.f32.v8i32(float undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -12,7 +12,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}getlod_v2:
 ; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da
-define void @getlod_v2(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @getlod_v2(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -21,13 +21,23 @@ main_body:
 
 ; GCN-LABEL: {{^}}getlod_v4:
 ; GCN: image_get_lod {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf da
-define void @getlod_v4(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @getlod_v4(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 1)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
   ret void
 }
 
+; GCN-LABEL: {{^}}adjust_writemask_getlod_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_getlod_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.getlod.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
 
 declare <4 x float> @llvm.amdgcn.image.getlod.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.image.getlod.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
index 69c43ca3070ae56440d74c0fcfab1f10ada0f6f5..c74c0fa15855db2fa5ffb0a06d374dab52b104f3 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.ll
@@ -1,146 +1,144 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=CHECK,VI %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
 
-;CHECK-LABEL: {{^}}image_load_v4i32:
-;CHECK: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+; GCN-LABEL: {{^}}image_load_v4i32:
+; GCN: image_load v[0:3], v[0:3], s[0:7] dmask:0xf unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
 main_body:
-  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
   ret <4 x float> %tex
 }
 
-;CHECK-LABEL: {{^}}image_load_v2i32:
-;CHECK: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) {
+; GCN-LABEL: {{^}}image_load_v2i32:
+; GCN: image_load v[0:3], v[0:1], s[0:7] dmask:0xf unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 {
 main_body:
-  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
   ret <4 x float> %tex
 }
 
-;CHECK-LABEL: {{^}}image_load_i32:
-;CHECK: image_load v[0:3], v0, s[0:7] dmask:0xf unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) {
+; GCN-LABEL: {{^}}image_load_i32:
+; GCN: image_load v[0:3], v0, s[0:7] dmask:0xf unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_i32(<8 x i32> inreg %rsrc, i32 %c) #0 {
 main_body:
-  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
   ret <4 x float> %tex
 }
 
-;CHECK-LABEL: {{^}}image_load_mip:
-;CHECK: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+; GCN-LABEL: {{^}}image_load_mip:
+; GCN: image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps <4 x float> @image_load_mip(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
 main_body:
-  %tex = call <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+  %tex = call <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
   ret <4 x float> %tex
 }
 
-;CHECK-LABEL: {{^}}image_load_1:
-;CHECK: image_load v0, v[0:3], s[0:7] dmask:0x1 unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+; GCN-LABEL: {{^}}image_load_1:
+; GCN: image_load v0, v[0:3], s[0:7] dmask:0x1 unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_load_1(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
 main_body:
-  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+  %tex = call <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
   %elt = extractelement <4 x float> %tex, i32 0
-; Only first component used, test that dmask etc. is changed accordingly
   ret float %elt
 }
 
-;CHECK-LABEL: {{^}}image_load_f32_v2i32:
-;CHECK: image_load {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps float @image_load_f32_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) {
+; GCN-LABEL: {{^}}image_load_f32_v2i32:
+; GCN: image_load {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps float @image_load_f32_v2i32(<8 x i32> inreg %rsrc, <2 x i32> %c) #0 {
 main_body:
-  %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 1, i1 0, i1 0, i1 0, i1 0)
+  %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false)
   ret float %tex
 }
 
-;CHECK-LABEL: {{^}}image_load_v2f32_v4i32:
-;CHECK: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
-;CHECK: s_waitcnt vmcnt(0)
-define amdgpu_ps <2 x float> @image_load_v2f32_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) {
+; GCN-LABEL: {{^}}image_load_v2f32_v4i32:
+; GCN: image_load {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
+; GCN: s_waitcnt vmcnt(0)
+define amdgpu_ps <2 x float> @image_load_v2f32_v4i32(<8 x i32> inreg %rsrc, <4 x i32> %c) #0 {
 main_body:
-  %tex = call <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 3, i1 0, i1 0, i1 0, i1 0)
+  %tex = call <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32> %c, <8 x i32> %rsrc, i32 3, i1 false, i1 false, i1 false, i1 false)
   ret <2 x float> %tex
 }
 
-
-;CHECK-LABEL: {{^}}image_store_v4i32:
-;CHECK: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm
-define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) {
+; GCN-LABEL: {{^}}image_store_v4i32:
+; GCN: image_store v[0:3], v[4:7], s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_v4i32(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 {
 main_body:
-  call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+  call void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
   ret void
 }
 
-;CHECK-LABEL: {{^}}image_store_v2i32:
-;CHECK: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm
-define amdgpu_ps void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) {
+; GCN-LABEL: {{^}}image_store_v2i32:
+; GCN: image_store v[0:3], v[4:5], s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_v2i32(<8 x i32> inreg %rsrc, <4 x float> %data, <2 x i32> %coords) #0 {
 main_body:
-  call void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float> %data, <2 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+  call void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float> %data, <2 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
   ret void
 }
 
-;CHECK-LABEL: {{^}}image_store_i32:
-;CHECK: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
-define amdgpu_ps void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) {
+; GCN-LABEL: {{^}}image_store_i32:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_i32(<8 x i32> inreg %rsrc, <4 x float> %data, i32 %coords) #0 {
 main_body:
-  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
   ret void
 }
 
-;CHECK-LABEL: {{^}}image_store_f32_i32:
-;CHECK: image_store {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
-define amdgpu_ps void @image_store_f32_i32(<8 x i32> inreg %rsrc, float %data, i32 %coords) {
+; GCN-LABEL: {{^}}image_store_f32_i32:
+; GCN: image_store {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1 unorm
+define amdgpu_ps void @image_store_f32_i32(<8 x i32> inreg %rsrc, float %data, i32 %coords) #0 {
 main_body:
-  call void @llvm.amdgcn.image.store.f32.i32.v8i32(float %data, i32 %coords, <8 x i32> %rsrc, i32 1, i1 0, i1 0, i1 0, i1 0)
+  call void @llvm.amdgcn.image.store.f32.i32.v8i32(float %data, i32 %coords, <8 x i32> %rsrc, i32 1, i1 false, i1 false, i1 false, i1 false)
   ret void
 }
 
-;CHECK-LABEL: {{^}}image_store_v2f32_v4i32:
-;CHECK: image_store {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
-define amdgpu_ps void @image_store_v2f32_v4i32(<8 x i32> inreg %rsrc, <2 x float> %data, <4 x i32> %coords) {
+; GCN-LABEL: {{^}}image_store_v2f32_v4i32:
+; GCN: image_store {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3 unorm
+define amdgpu_ps void @image_store_v2f32_v4i32(<8 x i32> inreg %rsrc, <2 x float> %data, <4 x i32> %coords) #0 {
 main_body:
-  call void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 0, i1 0, i1 0, i1 0)
+  call void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 3, i1 false, i1 false, i1 false, i1 false)
   ret void
 }
 
-;CHECK-LABEL: {{^}}image_store_mip:
-;CHECK: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm
-define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) {
+; GCN-LABEL: {{^}}image_store_mip:
+; GCN: image_store_mip v[0:3], v[4:7], s[0:7] dmask:0xf unorm
+define amdgpu_ps void @image_store_mip(<8 x i32> inreg %rsrc, <4 x float> %data, <4 x i32> %coords) #0 {
 main_body:
-  call void @llvm.amdgcn.image.store.mip.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+  call void @llvm.amdgcn.image.store.mip.v4f32.v4i32.v8i32(<4 x float> %data, <4 x i32> %coords, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
   ret void
 }
 
-;CHECK-LABEL: {{^}}getresinfo:
-;CHECK: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define amdgpu_ps void @getresinfo() {
+; GCN-LABEL: {{^}}getresinfo:
+; GCN: image_get_resinfo {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
+define amdgpu_ps void @getresinfo() #0 {
 main_body:
-  %r = call <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32 undef, <8 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0)
+  %r = call <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32 undef, <8 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false)
   %r0 = extractelement <4 x float> %r, i32 0
   %r1 = extractelement <4 x float> %r, i32 1
   %r2 = extractelement <4 x float> %r, i32 2
   %r3 = extractelement <4 x float> %r, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %r0, float %r1, float %r2, float %r3)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %r0, float %r1, float %r2, float %r3, i1 true, i1 true) #0
   ret void
 }
 
 ; Ideally, the register allocator would avoid the wait here
 ;
-;CHECK-LABEL: {{^}}image_store_wait:
-;CHECK: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
-;CHECK: s_waitcnt vmcnt(0) expcnt(0)
-;CHECK: image_load v[0:3], v4, s[8:15] dmask:0xf unorm
-;CHECK: s_waitcnt vmcnt(0)
-;CHECK: image_store v[0:3], v4, s[16:23] dmask:0xf unorm
-define amdgpu_ps void @image_store_wait(<8 x i32> inreg, <8 x i32> inreg, <8 x i32> inreg, <4 x float>, i32) {
+; GCN-LABEL: {{^}}image_store_wait:
+; GCN: image_store v[0:3], v4, s[0:7] dmask:0xf unorm
+; GCN: s_waitcnt vmcnt(0) expcnt(0)
+; GCN: image_load v[0:3], v4, s[8:15] dmask:0xf unorm
+; GCN: s_waitcnt vmcnt(0)
+; GCN: image_store v[0:3], v4, s[16:23] dmask:0xf unorm
+define amdgpu_ps void @image_store_wait(<8 x i32> inreg %arg, <8 x i32> inreg %arg1, <8 x i32> inreg %arg2, <4 x float> %arg3, i32 %arg4) #0 {
 main_body:
-  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %3, i32 %4, <8 x i32> %0, i32 15, i1 0, i1 0, i1 0, i1 0)
-  %data = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %4, <8 x i32> %1, i32 15, i1 0, i1 0, i1 0, i1 0)
-  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %4, <8 x i32> %2, i32 15, i1 0, i1 0, i1 0, i1 0)
+  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %arg3, i32 %arg4, <8 x i32> %arg, i32 15, i1 false, i1 false, i1 false, i1 false)
+  %data = call <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32 %arg4, <8 x i32> %arg1, i32 15, i1 false, i1 false, i1 false, i1 false)
+  call void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float> %data, i32 %arg4, <8 x i32> %arg2, i32 15, i1 false, i1 false, i1 false, i1 false)
   ret void
 }
 
@@ -149,21 +147,22 @@ main_body:
 ; VI-LABEL: image_load_mmo
 ; VI: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0
 ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
-define amdgpu_ps void @image_load_mmo(float addrspace(3)* %lds, <2 x i32> %c, <8 x i32> inreg %rsrc) {
-  store float 0.0, float addrspace(3)* %lds
-  %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 0, i1 0, i1 0, i1 0)
+define amdgpu_ps void @image_load_mmo(float addrspace(3)* %lds, <2 x i32> %c, <8 x i32> inreg %rsrc) #0 {
+bb:
+  store float 0.000000e+00, float addrspace(3)* %lds
+  %tex = call float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32> %c, <8 x i32> %rsrc, i32 15, i1 false, i1 false, i1 false, i1 false)
   %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
-  store float 0.0, float addrspace(3)* %tmp2
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tex, float %tex, float %tex, float %tex)
+  store float 0.000000e+00, float addrspace(3)* %tmp2
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tex, float %tex, float %tex, float %tex, i1 true, i1 true) #0
   ret void
 }
 
 declare float @llvm.amdgcn.image.load.f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 declare <2 x float> @llvm.amdgcn.image.load.v2f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 declare void @llvm.amdgcn.image.store.f32.i32.v8i32(float, i32, <8 x i32>, i32, i1, i1, i1, i1) #0
-declare void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
 
 
+declare void @llvm.amdgcn.image.store.v2f32.v4i32.v8i32(<2 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
 declare void @llvm.amdgcn.image.store.v4f32.i32.v8i32(<4 x float>, i32, <8 x i32>, i32, i1, i1, i1, i1) #0
 declare void @llvm.amdgcn.image.store.v4f32.v2i32.v8i32(<4 x float>, <2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
 declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #0
@@ -173,10 +172,9 @@ declare <4 x float> @llvm.amdgcn.image.load.v4f32.i32.v8i32(i32, <8 x i32>, i32,
 declare <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
 declare <4 x float> @llvm.amdgcn.image.load.mip.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #1
 
-declare <4 x float> @llvm.amdgcn.image.getresinfo.v4f32.i32.v8i32(i32, <8 x i32>, i32, i1, i1, i1, i1) #0
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll
index 752ec2d42fac0024d10c1fb479dfaf2eace6619a..4f90b0a25eaa6ac519d8585b61cb9f4f71b9ee5a 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}sample:
 ; GCN: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -12,7 +12,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_cl:
 ; GCN: image_sample_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_d:
 ; GCN: image_sample_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_d(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_d(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -30,7 +30,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_d_cl:
 ; GCN: image_sample_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_d_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_d_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -39,7 +39,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_l:
 ; GCN: image_sample_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_l(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_l(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -48,7 +48,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_b:
 ; GCN: image_sample_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_b(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_b(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -57,7 +57,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_b_cl:
 ; GCN: image_sample_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_b_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_b_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -66,7 +66,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_lz:
 ; GCN: image_sample_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_lz(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_lz(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -75,7 +75,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_cd:
 ; GCN: image_sample_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_cd(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_cd(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.cd.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -84,7 +84,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_cd_cl:
 ; GCN: image_sample_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_cd_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_cd_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -93,7 +93,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c:
 ; GCN: image_sample_c {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -102,7 +102,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c_cl:
 ; GCN: image_sample_c_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -111,7 +111,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c_d:
 ; GCN: image_sample_c_d {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_d(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_d(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -120,7 +120,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c_d_cl:
 ; GCN: image_sample_c_d_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_d_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_d_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -129,7 +129,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c_l:
 ; GCN: image_sample_c_l {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_l(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_l(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -138,7 +138,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c_b:
 ; GCN: image_sample_c_b {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_b(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_b(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -147,7 +147,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c_b_cl:
 ; GCN: image_sample_c_b_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_b_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_b_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -156,7 +156,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c_lz:
 ; GCN: image_sample_c_lz {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_lz(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_lz(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -165,7 +165,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c_cd:
 ; GCN: image_sample_c_cd {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_cd(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_cd(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -174,7 +174,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c_cd_cl:
 ; GCN: image_sample_c_cd_cl {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -183,7 +183,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_f32:
 ; GCN: image_sample {{v[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x1
-define void @sample_f32(float addrspace(1)* %out) {
+define amdgpu_kernel void @sample_f32(float addrspace(1)* %out) {
 main_body:
   %r = call float @llvm.amdgcn.image.sample.f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 1, i1 0, i1 0, i1 0, i1 0, i1 0)
   store float %r, float addrspace(1)* %out
@@ -192,13 +192,221 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_v2f32:
 ; GCN: image_sample {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0x3
-define void @sample_v2f32(<2 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_v2f32(<2 x float> addrspace(1)* %out) {
 main_body:
   %r = call <2 x float> @llvm.amdgcn.image.sample.v2f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 3, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <2 x float> %r, <2 x float> addrspace(1)* %out
   ret void
 }
 
+; GCN-LABEL: {{^}}adjust_writemask_sample_0:
+; GCN: image_sample v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x1{{$}}
+define amdgpu_kernel void @adjust_writemask_sample_0(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_01:
+; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x3{{$}}
+define amdgpu_kernel void @adjust_writemask_sample_01(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  %elt1 = extractelement <4 x float> %r, i32 1
+  store volatile float %elt0, float addrspace(1)* %out
+  store volatile float %elt1, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_012:
+; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x7{{$}}
+define amdgpu_kernel void @adjust_writemask_sample_012(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  %elt1 = extractelement <4 x float> %r, i32 1
+  %elt2 = extractelement <4 x float> %r, i32 2
+  store volatile float %elt0, float addrspace(1)* %out
+  store volatile float %elt1, float addrspace(1)* %out
+  store volatile float %elt2, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_12:
+; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x6{{$}}
+define amdgpu_kernel void @adjust_writemask_sample_12(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt1 = extractelement <4 x float> %r, i32 1
+  %elt2 = extractelement <4 x float> %r, i32 2
+  store volatile float %elt1, float addrspace(1)* %out
+  store volatile float %elt2, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_03:
+; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0x9{{$}}
+define amdgpu_kernel void @adjust_writemask_sample_03(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  %elt3 = extractelement <4 x float> %r, i32 3
+  store volatile float %elt0, float addrspace(1)* %out
+  store volatile float %elt3, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_13:
+; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xa{{$}}
+define amdgpu_kernel void @adjust_writemask_sample_13(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt1 = extractelement <4 x float> %r, i32 1
+  %elt3 = extractelement <4 x float> %r, i32 3
+  store volatile float %elt1, float addrspace(1)* %out
+  store volatile float %elt3, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_123:
+; GCN: image_sample v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} dmask:0xe{{$}}
+define amdgpu_kernel void @adjust_writemask_sample_123(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
+  %elt1 = extractelement <4 x float> %r, i32 1
+  %elt2 = extractelement <4 x float> %r, i32 2
+  %elt3 = extractelement <4 x float> %r, i32 3
+  store volatile float %elt1, float addrspace(1)* %out
+  store volatile float %elt2, float addrspace(1)* %out
+  store volatile float %elt3, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_variable_dmask_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_variable_dmask_enabled(float addrspace(1)* %out, i32 %dmask) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 %dmask, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_cl_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_cl_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_d_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_d_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_d_cl_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_d_cl_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_l_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_l_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_b_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_b_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_b_cl_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_b_cl_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_lz_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_lz_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.lz.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_cd_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_cd_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.cd.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_cd_cl_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_cd_cl_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
 declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.image.sample.cl.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll
index d10fd08246928dea3430e62f245a52ec1933a4ea..42d7bc0e7778efa1ecbfdc48c2da938708bd9a76 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.o.ll
@@ -3,7 +3,7 @@
 
 ; GCN-LABEL: {{^}}sample:
 ; GCN: image_sample_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -12,7 +12,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_cl:
 ; GCN: image_sample_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -21,7 +21,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_d:
 ; GCN: image_sample_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_d(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_d(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -30,7 +30,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_d_cl:
 ; GCN: image_sample_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_d_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_d_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -39,7 +39,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_l:
 ; GCN: image_sample_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_l(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_l(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -48,7 +48,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_b:
 ; GCN: image_sample_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_b(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_b(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -57,7 +57,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_b_cl:
 ; GCN: image_sample_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_b_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_b_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -66,7 +66,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_lz:
 ; GCN: image_sample_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_lz(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_lz(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -75,7 +75,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_cd:
 ; GCN: image_sample_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_cd(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_cd(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -84,7 +84,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_cd_cl:
 ; GCN: image_sample_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_cd_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_cd_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -93,7 +93,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c:
 ; GCN: image_sample_c_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -102,7 +102,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c_cl:
 ; GCN: image_sample_c_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -111,7 +111,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c_d:
 ; GCN: image_sample_c_d_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_d(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_d(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -120,7 +120,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c_d_cl:
 ; GCN: image_sample_c_d_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_d_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_d_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -129,7 +129,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c_l:
 ; GCN: image_sample_c_l_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_l(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_l(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -138,7 +138,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c_b:
 ; GCN: image_sample_c_b_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_b(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_b(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -147,7 +147,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c_b_cl:
 ; GCN: image_sample_c_b_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_b_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_b_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -156,7 +156,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c_lz:
 ; GCN: image_sample_c_lz_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_lz(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_lz(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -165,7 +165,7 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c_cd:
 ; GCN: image_sample_c_cd_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_cd(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_cd(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
@@ -174,13 +174,232 @@ main_body:
 
 ; GCN-LABEL: {{^}}sample_c_cd_cl:
 ; GCN: image_sample_c_cd_cl_o {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}} dmask:0xf
-define void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) {
+define amdgpu_kernel void @sample_c_cd_cl(<4 x float> addrspace(1)* %out) {
 main_body:
   %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 0, i1 0, i1 0, i1 0, i1 0)
   store <4 x float> %r, <4 x float> addrspace(1)* %out
   ret void
 }
 
+; GCN-LABEL: {{^}}adjust_writemask_sample_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_cl_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_cl_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_d_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_d_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_d_cl_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_d_cl_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_l_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_l_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_b_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_b_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_b_cl_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_b_cl_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_lz_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_lz_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_cd_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_cd_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_cd_cl_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_cd_cl_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.c.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_cl_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_cl_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.c.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_d_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_d_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_d_cl_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_d_cl_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.c.d.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_l_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_l_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.c.l.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_b_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_b_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_b_cl_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_b_cl_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.c.b.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_lz_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_lz_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.c.lz.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_cd_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_cd_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}adjust_writemask_sample_c_cd_cl_o_none_enabled:
+; GCN-NOT: image
+; GCN-NOT: store
+define amdgpu_kernel void @adjust_writemask_sample_c_cd_cl_o_none_enabled(float addrspace(1)* %out) {
+main_body:
+  %r = call <4 x float> @llvm.amdgcn.image.sample.c.cd.cl.o.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 0, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %r, i32 0
+  store float %elt0, float addrspace(1)* %out
+  ret void
+}
 
 declare <4 x float> @llvm.amdgcn.image.sample.o.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
 declare <4 x float> @llvm.amdgcn.image.sample.cl.o.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #0
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
index 9ba5c69a9a245b43d527516adebe7fc47b8f7aeb..c4795a23cd5b5e54b94dffd38cfee30f504aab0d 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.interp.ll
@@ -1,5 +1,7 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefix=GCN %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: llc -march=amdgcn -mcpu=kabini -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,16BANK %s
+; RUN: llc -march=amdgcn -mcpu=stoney -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,16BANK %s
 
 ; GCN-LABEL: {{^}}v_interp:
 ; GCN-NOT: s_wqm
@@ -8,17 +10,17 @@
 ; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr0.y{{$}}
 ; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr0.y{{$}}
 ; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}}
-define amdgpu_ps void @v_interp(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x float>) {
+define amdgpu_ps void @v_interp(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x float> %arg4) #0 {
 main_body:
-  %i = extractelement <2 x float> %4, i32 0
-  %j = extractelement <2 x float> %4, i32 1
-  %p0_0 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 %3)
-  %p1_0 = call float @llvm.amdgcn.interp.p2(float %p0_0, float %j, i32 0, i32 0, i32 %3)
-  %p0_1 = call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 %3)
-  %p1_1 = call float @llvm.amdgcn.interp.p2(float %p0_1, float %j, i32 1, i32 0, i32 %3)
-  %const = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %3)
+  %i = extractelement <2 x float> %arg4, i32 0
+  %j = extractelement <2 x float> %arg4, i32 1
+  %p0_0 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 %arg3)
+  %p1_0 = call float @llvm.amdgcn.interp.p2(float %p0_0, float %j, i32 0, i32 0, i32 %arg3)
+  %p0_1 = call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 %arg3)
+  %p1_1 = call float @llvm.amdgcn.interp.p2(float %p0_1, float %j, i32 1, i32 0, i32 %arg3)
+  %const = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %arg3)
   %w = fadd float %p1_1, %const
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %p0_0, float %p0_0, float %p1_1, float %w)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %p0_0, float %p0_0, float %p1_1, float %w, i1 true, i1 true) #0
   ret void
 }
 
@@ -37,7 +39,8 @@ main_body:
 ; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr63.w{{$}}
 ; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.w{{$}}
 ; GCN-DAG: v_interp_p1_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.x{{$}}
-define amdgpu_ps void @v_interp_p1(float %i) {
+define amdgpu_ps void @v_interp_p1(float %i) #0 {
+bb:
   %p0_0 = call float @llvm.amdgcn.interp.p1(float %i, i32 0, i32 0, i32 256)
   %p0_1 = call float @llvm.amdgcn.interp.p1(float %i, i32 1, i32 0, i32 256)
   %p0_2 = call float @llvm.amdgcn.interp.p1(float %i, i32 2, i32 0, i32 256)
@@ -77,7 +80,8 @@ define amdgpu_ps void @v_interp_p1(float %i) {
 ; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr63.x{{$}}
 ; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.x{{$}}
 ; GCN-DAG: v_interp_p2_f32 v{{[0-9]+}}, v{{[0-9]+}}, attr64.x{{$}}
-define amdgpu_ps void @v_interp_p2(float %x, float %j) {
+define amdgpu_ps void @v_interp_p2(float %x, float %j) #0 {
+bb:
   %p2_0 = call float @llvm.amdgcn.interp.p2(float %x, float %j, i32 0, i32 0, i32 256)
   %p2_1 = call float @llvm.amdgcn.interp.p2(float %x, float %j, i32 1, i32 0, i32 256)
   %p2_2 = call float @llvm.amdgcn.interp.p2(float %x, float %j, i32 2, i32 0, i32 256)
@@ -118,7 +122,8 @@ define amdgpu_ps void @v_interp_p2(float %x, float %j) {
 ; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, p10, attr64.y{{$}}
 ; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, invalid_param_3, attr64.y{{$}}
 ; GCN-DAG: v_interp_mov_f32 v{{[0-9]+}}, invalid_param_10, attr64.x{{$}}
-define amdgpu_ps void @v_interp_mov(float %x, float %j) {
+define amdgpu_ps void @v_interp_mov(float %x, float %j) #0 {
+bb:
   %mov_0 = call float @llvm.amdgcn.interp.mov(i32 0, i32 0, i32 0, i32 256)
   %mov_1 = call float @llvm.amdgcn.interp.mov(i32 1, i32 0, i32 0, i32 256)
   %mov_2 = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 256)
@@ -161,23 +166,57 @@ define amdgpu_ps void @v_interp_mov(float %x, float %j) {
 ; VI-DAG: v_interp_mov_f32 v{{[0-9]+}}, p0, attr0.x{{$}}
 ; VI: s_mov_b32 m0, -1{{$}}
 ; VI: ds_write2_b32 v{{[0-9]+}}, [[ZERO]], [[ZERO]] offset1:4
-define amdgpu_ps void @v_interp_readnone(float addrspace(3)* %lds) {
-  store float 0.0, float addrspace(3)* %lds
+define amdgpu_ps void @v_interp_readnone(float addrspace(3)* %lds) #0 {
+bb:
+  store float 0.000000e+00, float addrspace(3)* %lds
   %tmp1 = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 0)
   %tmp2 = getelementptr float, float addrspace(3)* %lds, i32 4
-  store float 0.0, float addrspace(3)* %tmp2
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp1, float %tmp1, float %tmp1, float %tmp1)
+  store float 0.000000e+00, float addrspace(3)* %tmp2
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp1, float %tmp1, float %tmp1, float %tmp1, i1 true, i1 true) #0
   ret void
 }
 
-; Function Attrs: nounwind readnone
-declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #0
+; Test that v_interp_p1 uses different source and destination registers
+; on 16 bank LDS chips.
 
-; Function Attrs: nounwind readnone
-declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #0
-
-declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #0
+; GCN-LABEL: {{^}}v_interp_p1_bank16_bug:
+; 16BANK-NOT: v_interp_p1_f32 [[DST:v[0-9]+]], [[DST]]
+define amdgpu_ps void @v_interp_p1_bank16_bug([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg13, [17 x <4 x i32>] addrspace(2)* byval %arg14, [34 x <8 x i32>] addrspace(2)* byval %arg15, float inreg %arg16, i32 inreg %arg17, <2 x i32> %arg18, <2 x i32> %arg19, <2 x i32> %arg20, <3 x i32> %arg21, <2 x i32> %arg22, <2 x i32> %arg23, <2 x i32> %arg24, float %arg25, float %arg26, float %arg27, float %arg28, float %arg29, float %arg30, i32 %arg31, float %arg32, float %arg33) #0 {
+main_body:
+  %i.i = extractelement <2 x i32> %arg19, i32 0
+  %j.i = extractelement <2 x i32> %arg19, i32 1
+  %i.f.i = bitcast i32 %i.i to float
+  %j.f.i = bitcast i32 %j.i to float
+  %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg17) #0
+  %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg17) #0
+  %i.i7 = extractelement <2 x i32> %arg19, i32 0
+  %j.i8 = extractelement <2 x i32> %arg19, i32 1
+  %i.f.i9 = bitcast i32 %i.i7 to float
+  %j.f.i10 = bitcast i32 %j.i8 to float
+  %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 1, i32 0, i32 %arg17) #0
+  %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 1, i32 0, i32 %arg17) #0
+  %i.i1 = extractelement <2 x i32> %arg19, i32 0
+  %j.i2 = extractelement <2 x i32> %arg19, i32 1
+  %i.f.i3 = bitcast i32 %i.i1 to float
+  %j.f.i4 = bitcast i32 %j.i2 to float
+  %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 0, i32 %arg17) #0
+  %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 0, i32 %arg17) #0
+  %tmp = call float @llvm.fabs.f32(float %p2.i)
+  %tmp34 = call float @llvm.fabs.f32(float %p2.i12)
+  %tmp35 = call float @llvm.fabs.f32(float %p2.i6)
+  %tmp36 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp, float %tmp34)
+  %tmp38 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp35, float 1.000000e+00)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp36, <2 x half> %tmp38, i1 true, i1 true) #0
+  ret void
+}
 
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
 
-attributes #0 = { nounwind readnone }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
index 5d4d4cd7ee46b82951629d81dd5811fc65dc4517..055dddbfa8af2944ce0ba26e127aaeff026f6d4f 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.kernarg.segment.ptr.ll
@@ -8,7 +8,7 @@
 ; CO-V2: s_load_dword s{{[0-9]+}}, s[4:5], 0xa
 
 ; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0xa
-define void @test(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) #1 {
   %kernarg.segment.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
   %header.ptr = bitcast i8 addrspace(2)* %kernarg.segment.ptr to i32 addrspace(2)*
   %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10
@@ -20,7 +20,7 @@ define void @test(i32 addrspace(1)* %out) #1 {
 ; ALL-LABEL: {{^}}test_implicit:
 ; 10 + 9 (36 prepended implicit bytes) + 2(out pointer) = 21 = 0x15
 ; OS-UNKNOWN: s_load_dword s{{[0-9]+}}, s[0:1], 0x15
-define void @test_implicit(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
   %implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
   %header.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
   %gep = getelementptr i32, i32 addrspace(2)* %header.ptr, i64 10
@@ -39,7 +39,7 @@ define void @test_implicit(i32 addrspace(1)* %out) #1 {
 ; ALL: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[VAL]]
 ; MESA: buffer_store_dword [[V_VAL]]
 ; HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[V_VAL]]
-define void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #1 {
+define amdgpu_kernel void @test_implicit_alignment(i32 addrspace(1)* %out, <2 x i8> %in) #1 {
   %implicitarg.ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr()
   %arg.ptr = bitcast i8 addrspace(2)* %implicitarg.ptr to i32 addrspace(2)*
   %val = load i32, i32 addrspace(2)* %arg.ptr
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
index 6720cbe9d8dacd6e24d8c7a90054e8a5b86ee1b5..fe211d356070c9a2b8b7f3ddc124c80298f180e9 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.f16.ll
@@ -7,7 +7,7 @@ declare half @llvm.amdgcn.ldexp.f16(half %a, i32 %b)
 ; GCN: buffer_load_dword v[[B_I32:[0-9]+]]
 ; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_I32]]
 ; GCN: buffer_store_short v[[R_F16]]
-define void @ldexp_f16(
+define amdgpu_kernel void @ldexp_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     i32 addrspace(1)* %b) {
@@ -22,7 +22,7 @@ define void @ldexp_f16(
 ; GCN: buffer_load_dword v[[B_I32:[0-9]+]]
 ; VI: v_ldexp_f16_e32 v[[R_F16:[0-9]+]], 2.0, v[[B_I32]]
 ; GCN: buffer_store_short v[[R_F16]]
-define void @ldexp_f16_imm_a(
+define amdgpu_kernel void @ldexp_f16_imm_a(
     half addrspace(1)* %r,
     i32 addrspace(1)* %b) {
   %b.val = load i32, i32 addrspace(1)* %b
@@ -35,7 +35,7 @@ define void @ldexp_f16_imm_a(
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
 ; VI: v_ldexp_f16_e64 v[[R_F16:[0-9]+]], v[[A_F16]], 2{{$}}
 ; GCN: buffer_store_short v[[R_F16]]
-define void @ldexp_f16_imm_b(
+define amdgpu_kernel void @ldexp_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
   %a.val = load half, half addrspace(1)* %a
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll
index a23defd742a8a0b3175af8047fee85ade7161bc6..1ab4e8b8063003d445c786f1409d1065bb70b895 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ldexp.ll
@@ -7,7 +7,7 @@ declare double @llvm.amdgcn.ldexp.f64(double, i32) nounwind readnone
 ; SI-LABEL: {{^}}test_ldexp_f32:
 ; SI: v_ldexp_f32
 ; SI: s_endpgm
-define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind {
+define amdgpu_kernel void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind {
   %result = call float @llvm.amdgcn.ldexp.f32(float %a, i32 %b) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
@@ -16,7 +16,7 @@ define void @test_ldexp_f32(float addrspace(1)* %out, float %a, i32 %b) nounwind
 ; SI-LABEL: {{^}}test_ldexp_f64:
 ; SI: v_ldexp_f64
 ; SI: s_endpgm
-define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind {
+define amdgpu_kernel void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwind {
   %result = call double @llvm.amdgcn.ldexp.f64(double %a, i32 %b) nounwind readnone
   store double %result, double addrspace(1)* %out, align 8
   ret void
@@ -24,7 +24,7 @@ define void @test_ldexp_f64(double addrspace(1)* %out, double %a, i32 %b) nounwi
 
 ; SI-LABEL: {{^}}test_ldexp_undef_f32:
 ; SI-NOT: v_ldexp_f32
-define void @test_ldexp_undef_f32(float addrspace(1)* %out, i32 %b) nounwind {
+define amdgpu_kernel void @test_ldexp_undef_f32(float addrspace(1)* %out, i32 %b) nounwind {
   %result = call float @llvm.amdgcn.ldexp.f32(float undef, i32 %b) nounwind readnone
   store float %result, float addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
index 014369b450157a93f623e45c3ea2f0cc421ebf47..bc599897f82aedb3239a01fc9c28f57001a9a85f 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.lerp.ll
@@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.lerp(i32, i32, i32) #0
 
 ; GCN-LABEL: {{^}}v_lerp:
 ; GCN: v_lerp_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_lerp(i32 addrspace(1)* %out, i32 %src) nounwind {
+define amdgpu_kernel void @v_lerp(i32 addrspace(1)* %out, i32 %src) nounwind {
   %result= call i32 @llvm.amdgcn.lerp(i32 %src, i32 100, i32 100) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
index f78257f1d226b1828f9e951c1b8ab3cf9e58ad8d..feecd6c0e35dde7b3e57427d5a458eba626008b2 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.log.clamp.ll
@@ -7,7 +7,7 @@ declare float @llvm.amdgcn.log.clamp.f32(float) #0
 
 ; GCN-LABEL: {{^}}v_log_clamp_f32:
 ; GCN: v_log_clamp_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @v_log_clamp_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @v_log_clamp_f32(float addrspace(1)* %out, float %src) #1 {
   %log.clamp = call float @llvm.amdgcn.log.clamp.f32(float %src) #0
   store float %log.clamp, float addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
index 303446b63315bf13d20c6c04d8aefbbb7fad6335..ab76c870796be87187821527b82c29ec1173242b 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll
@@ -1,24 +1,22 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI  %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; GCN-LABEL: {{^}}mbcnt_intrinsics:
 ; GCN: v_mbcnt_lo_u32_b32_e64 [[LO:v[0-9]+]], -1, 0
 ; SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]]
 ; VI: v_mbcnt_hi_u32_b32_e64 {{v[0-9]+}}, -1, [[LO]]
-
-define amdgpu_ps void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg) {
+define amdgpu_ps void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3) {
 main_body:
-  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #1
-  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) #1
-  %4 = bitcast i32 %hi to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %4, float %4, float %4, float %4)
+  %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
+  %hi = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo) #0
+  %tmp = bitcast i32 %hi to float
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp, float %tmp, float %tmp, float %tmp, i1 true, i1 true) #1
   ret void
 }
 
-declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
-
-declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #0
+declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
 
-attributes #1 = { nounwind readnone }
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
index 35fdba8f34a3f2196a8fd0660ece572b17c6c7e9..8baaad19040679f39218f409a497cdf9527a8b0a 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp.ll
@@ -7,7 +7,7 @@
 ; VI: v_mov_b32_e32 v0, s{{[0-9]+}}
 ; VI: s_nop 1
 ; VI: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0 ; encoding: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x08,0x11]
-define void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
   store i32 %tmp0, i32 addrspace(1)* %out
   ret void
@@ -19,7 +19,7 @@ define void @dpp_test(i32 addrspace(1)* %out, i32 %in) {
 ; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
 ; VI: s_nop 1
 ; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
-define void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) {
   %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %in, i32 1, i32 1, i32 1, i1 1) #0
   %tmp1 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %tmp0, i32 1, i32 1, i32 1, i1 1) #0
   store i32 %tmp1, i32 addrspace(1)* %out
@@ -36,7 +36,7 @@ define void @dpp_wait_states(i32 addrspace(1)* %out, i32 %in) {
 ; VI: v_mov_b32_dpp [[VGPR1:v[0-9]+]], [[VGPR0]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
 ; VI: s_nop 1
 ; VI: v_mov_b32_dpp v{{[0-9]+}}, [[VGPR1]] quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 bound_ctrl:0
-define void @dpp_first_in_bb(float addrspace(1)* %out, float addrspace(1)* %in, float %cond, float %a, float %b) {
+define amdgpu_kernel void @dpp_first_in_bb(float addrspace(1)* %out, float addrspace(1)* %in, float %cond, float %a, float %b) {
   %cmp = fcmp oeq float %cond, 0.0
   br i1 %cmp, label %if, label %else
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
index 7c2495e096ec67198e8f10f90249a8d4a8c13135..3a2b87cd87f30586ca409902583b67e974a68cef 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.pk.u16.u8.ll
@@ -5,7 +5,7 @@ declare i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64, i32, i64) #0
 
 ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8:
 ; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
   %result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0
   store i64 %result, i64 addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@ define void @v_mqsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
 
 ; GCN-LABEL: {{^}}v_mqsad_pk_u16_u8_non_immediate:
 ; GCN: v_mqsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
+define amdgpu_kernel void @v_mqsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
   %result= call i64 @llvm.amdgcn.mqsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0
   store i64 %result, i64 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
index 04bb97a9eb57dfeec1b136086182d8bda351b7e5..a8d03bf6bbac5087c7f1ee6bdaaa3dfc3ef02ebf 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.mqsad.u32.u8.ll
@@ -5,7 +5,7 @@ declare <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64, i32, <4 x i32>) #0
 
 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_non_inline_constant:
 ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out, i64 %src) {
   %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 100, <4 x i32> <i32 100, i32 100, i32 100, i32 100>) #0
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@ define void @v_mqsad_u32_u8_use_non_inline_constant(<4 x i32> addrspace(1)* %out
 
 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_non_immediate:
 ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) {
+define amdgpu_kernel void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> %b) {
   %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %b) #0
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
   ret void
@@ -21,7 +21,7 @@ define void @v_mqsad_u32_u8_non_immediate(<4 x i32> addrspace(1)* %out, i64 %src
 
 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_integer_immediate:
 ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
+define amdgpu_kernel void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
   %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> <i32 10, i32 20, i32 30, i32 40>) #0
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
   ret void
@@ -29,7 +29,7 @@ define void @v_mqsad_u32_u8_inline_integer_immediate(<4 x i32> addrspace(1)* %ou
 
 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_inline_fp_immediate:
 ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
+define amdgpu_kernel void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a) {
   %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> <i32 1065353216, i32 0, i32 0, i32 0>) #0
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out, align 4
   ret void
@@ -37,7 +37,7 @@ define void @v_mqsad_u32_u8_inline_fp_immediate(<4 x i32> addrspace(1)* %out, i6
 
 ; GCN-LABEL: {{^}}v_mqsad_u32_u8_use_sgpr_vgpr:
 ; GCN: v_mqsad_u32_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> addrspace(1)* %input) {
+define amdgpu_kernel void @v_mqsad_u32_u8_use_sgpr_vgpr(<4 x i32> addrspace(1)* %out, i64 %src, i32 %a, <4 x i32> addrspace(1)* %input) {
   %in = load <4 x i32>, <4 x i32> addrspace(1) * %input
 
   %result = call <4 x i32> @llvm.amdgcn.mqsad.u32.u8(i64 %src, i32 %a, <4 x i32> %in) #0
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll
index 83d13ab268463f8bc0d9645b529e438ef76fdf64..dfaac042227c9557f1f149322ab4b4c96fad7dfd 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.msad.u8.ll
@@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.msad.u8(i32, i32, i32) #0
 
 ; GCN-LABEL: {{^}}v_msad_u8:
 ; GCN: v_msad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_msad_u8(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_msad_u8(i32 addrspace(1)* %out, i32 %src) {
   %result= call i32 @llvm.amdgcn.msad.u8(i32 %src, i32 100, i32 100) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@ define void @v_msad_u8(i32 addrspace(1)* %out, i32 %src) {
 
 ; GCN-LABEL: {{^}}v_msad_u8_non_immediate:
 ; GCN: v_msad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_msad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
+define amdgpu_kernel void @v_msad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
   %result= call i32 @llvm.amdgcn.msad.u8(i32 %src, i32 %a, i32 %b) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
index fd1a463fd3e999a9f67fb4bcd60a5b1997b4114e..f0af876567b493a63530161ff8e22c39fc966c15 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK %s
 
 ; CHECK-LABEL: {{^}}test1:
 ; CHECK: v_cndmask_b32_e64 v0, 0, 1, exec
@@ -7,7 +7,7 @@
 ; there is no WQM use and therefore llvm.amdgcn.ps.live is constant. However,
 ; the expectation is that the intrinsic will be used in non-trivial shaders,
 ; so such an optimization doesn't seem worth the effort.
-define amdgpu_ps float @test1() {
+define amdgpu_ps float @test1() #0 {
   %live = call i1 @llvm.amdgcn.ps.live()
   %live.32 = zext i1 %live to i32
   %r = bitcast i32 %live.32 to float
@@ -19,12 +19,11 @@ define amdgpu_ps float @test1() {
 ; CHECK-DAG: s_wqm_b64 exec, exec
 ; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[LIVE]]
 ; CHECK: image_sample v0, [[VAR]],
-define amdgpu_ps float @test2() {
+define amdgpu_ps float @test2() #0 {
   %live = call i1 @llvm.amdgcn.ps.live()
   %live.32 = zext i1 %live to i32
-
-  %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %live.32, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-
+  %live.32.bc = bitcast i32 %live.32 to float
+  %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %live.32.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %r = extractelement <4 x float> %t, i32 0
   ret float %r
 }
@@ -35,7 +34,7 @@ define amdgpu_ps float @test2() {
 ; CHECK-DAG: s_xor_b64 [[HELPER:s\[[0-9]+:[0-9]+\]]], [[LIVE]], -1
 ; CHECK_DAG: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[HELPER]]
 ; CHECK: ; %dead
-define amdgpu_ps float @test3(i32 %in) {
+define amdgpu_ps float @test3(i32 %in) #0 {
 entry:
   %live = call i1 @llvm.amdgcn.ps.live()
   br i1 %live, label %end, label %dead
@@ -46,14 +45,15 @@ dead:
 
 end:
   %tc = phi i32 [ %in, %entry ], [ %tc.dead, %dead ]
-  %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %tc, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-
+  %tc.bc = bitcast i32 %tc to float
+  %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %tc.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
   %r = extractelement <4 x float> %t, i32 0
   ret float %r
 }
 
-declare i1 @llvm.amdgcn.ps.live() #0
-
-declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #0
+declare i1 @llvm.amdgcn.ps.live() #1
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
 
-attributes #0 = { nounwind readnone }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
index ece4224f6e67f14737708be54bcc00ff6e81044a..be71225c5e06f3b232687af8ae961d3c843bf89e 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.qsad.pk.u16.u8.ll
@@ -5,7 +5,7 @@ declare i64 @llvm.amdgcn.qsad.pk.u16.u8(i64, i32, i64) #0
 
 ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8:
 ; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
+define amdgpu_kernel void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
   %result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 100, i64 100) #0
   store i64 %result, i64 addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@ define void @v_qsad_pk_u16_u8(i64 addrspace(1)* %out, i64 %src) {
 
 ; GCN-LABEL: {{^}}v_qsad_pk_u16_u8_non_immediate:
 ; GCN: v_qsad_pk_u16_u8 v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @v_qsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
+define amdgpu_kernel void @v_qsad_pk_u16_u8_non_immediate(i64 addrspace(1)* %out, i64 %src, i32 %a, i64 %b) {
   %result= call i64 @llvm.amdgcn.qsad.pk.u16.u8(i64 %src, i32 %a, i64 %b) #0
   store i64 %result, i64 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
index 6bf871543ca20f613e0d26fc25a01045bc2322e3..9200fe7c67b185c457af39430f5a0e74bb4797ab 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.queue.ptr.ll
@@ -6,7 +6,7 @@
 ; GCN-LABEL: {{^}}test:
 ; GCN: enable_sgpr_queue_ptr = 1
 ; GCN: s_load_dword s{{[0-9]+}}, s[4:5], 0x0
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out) {
   %queue_ptr = call noalias i8 addrspace(2)* @llvm.amdgcn.queue.ptr() #0
   %header_ptr = bitcast i8 addrspace(2)* %queue_ptr to i32 addrspace(2)*
   %value = load i32, i32 addrspace(2)* %header_ptr
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
index f0b8e2a0293fe5d0db4cdca202923ec72298d01c..0f1fa15f47cca04f9f28d1331a897a67d89c4393 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.f16.ll
@@ -7,7 +7,7 @@ declare half @llvm.amdgcn.rcp.f16(half %a)
 ; VI:  v_rcp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @rcp_f16(
+define amdgpu_kernel void @rcp_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll
index d53861456c783a4a384b42923962811abe3813de..71db76d902b79d2ab2592fe14be4af610e34f7db 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.legacy.ll
@@ -7,7 +7,7 @@ declare float @llvm.amdgcn.rcp.legacy(float) #0
 
 ; GCN-LABEL: {{^}}rcp_legacy_f32:
 ; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @rcp_legacy_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @rcp_legacy_f32(float addrspace(1)* %out, float %src) #1 {
   %rcp = call float @llvm.amdgcn.rcp.legacy(float %src) #0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -16,7 +16,7 @@ define void @rcp_legacy_f32(float addrspace(1)* %out, float %src) #1 {
 ; TODO: Really these should be constant folded
 ; GCN-LABEL: {{^}}rcp_legacy_f32_constant_4.0
 ; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, 4.0
-define void @rcp_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
   %rcp = call float @llvm.amdgcn.rcp.legacy(float 4.0) #0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -24,7 +24,7 @@ define void @rcp_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
 
 ; GCN-LABEL: {{^}}rcp_legacy_f32_constant_100.0
 ; GCN: v_rcp_legacy_f32_e32 {{v[0-9]+}}, 0x42c80000
-define void @rcp_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
   %rcp = call float @llvm.amdgcn.rcp.legacy(float 100.0) #0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -32,7 +32,7 @@ define void @rcp_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
 
 ; GCN-LABEL: {{^}}rcp_legacy_undef_f32:
 ; GCN-NOT: v_rcp_legacy_f32
-define void @rcp_legacy_undef_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_legacy_undef_f32(float addrspace(1)* %out) #1 {
   %rcp = call float @llvm.amdgcn.rcp.legacy(float undef)
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
index 825231bf8680ccf94c6e315aeb474e4100b93968..ad2d84b7911b48025d0a38829ef90939d399bd92 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rcp.ll
@@ -8,17 +8,35 @@ declare float @llvm.sqrt.f32(float) #0
 
 ; FUNC-LABEL: {{^}}rcp_undef_f32:
 ; SI-NOT: v_rcp_f32
-define void @rcp_undef_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rcp_undef_f32(float addrspace(1)* %out) #1 {
   %rcp = call float @llvm.amdgcn.rcp.f32(float undef)
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
 }
 
+; FUNC-LABEL: {{^}}rcp_2_f32:
+; SI-NOT: v_rcp_f32
+; SI: v_mov_b32_e32 v{{[0-9]+}}, 0.5
+define amdgpu_kernel void @rcp_2_f32(float addrspace(1)* %out) #1 {
+  %rcp = call float @llvm.amdgcn.rcp.f32(float 2.0)
+  store float %rcp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}rcp_10_f32:
+; SI-NOT: v_rcp_f32
+; SI: v_mov_b32_e32 v{{[0-9]+}}, 0x3dcccccd
+define amdgpu_kernel void @rcp_10_f32(float addrspace(1)* %out) #1 {
+  %rcp = call float @llvm.amdgcn.rcp.f32(float 10.0)
+  store float %rcp, float addrspace(1)* %out, align 4
+  ret void
+}
+
 ; FUNC-LABEL: {{^}}safe_no_fp32_denormals_rcp_f32:
 ; SI: v_rcp_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}
 ; SI-NOT: [[RESULT]]
 ; SI: buffer_store_dword [[RESULT]]
-define void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src) #1 {
   %rcp = fdiv float 1.0, %src
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -28,7 +46,7 @@ define void @safe_no_fp32_denormals_rcp_f32(float addrspace(1)* %out, float %src
 ; SI: v_rcp_f32_e32 [[RESULT:v[0-9]+]], s{{[0-9]+}}
 ; SI-NOT: [[RESULT]]
 ; SI: buffer_store_dword [[RESULT]]
-define void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #4 {
+define amdgpu_kernel void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #4 {
   %rcp = fdiv float 1.0, %src
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -36,7 +54,7 @@ define void @safe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src
 
 ; FUNC-LABEL: {{^}}unsafe_f32_denormals_rcp_pat_f32:
 ; SI: v_div_scale_f32
-define void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #3 {
+define amdgpu_kernel void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %src) #3 {
   %rcp = fdiv float 1.0, %src
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -45,7 +63,7 @@ define void @unsafe_f32_denormals_rcp_pat_f32(float addrspace(1)* %out, float %s
 ; FUNC-LABEL: {{^}}safe_rsq_rcp_pat_f32:
 ; SI: v_sqrt_f32_e32
 ; SI: v_rcp_f32_e32
-define void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 {
   %sqrt = call float @llvm.sqrt.f32(float %src)
   %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt)
   store float %rcp, float addrspace(1)* %out, align 4
@@ -54,7 +72,7 @@ define void @safe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #1 {
 
 ; FUNC-LABEL: {{^}}unsafe_rsq_rcp_pat_f32:
 ; SI: v_rsq_f32_e32
-define void @unsafe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #2 {
+define amdgpu_kernel void @unsafe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #2 {
   %sqrt = call float @llvm.sqrt.f32(float %src)
   %rcp = call float @llvm.amdgcn.rcp.f32(float %sqrt)
   store float %rcp, float addrspace(1)* %out, align 4
@@ -65,7 +83,7 @@ define void @unsafe_rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) #2 {
 ; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
 ; SI-NOT: [[RESULT]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @rcp_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @rcp_f64(double addrspace(1)* %out, double %src) #1 {
   %rcp = call double @llvm.amdgcn.rcp.f64(double %src)
   store double %rcp, double addrspace(1)* %out, align 8
   ret void
@@ -75,7 +93,7 @@ define void @rcp_f64(double addrspace(1)* %out, double %src) #1 {
 ; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
 ; SI-NOT: [[RESULT]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @unsafe_rcp_f64(double addrspace(1)* %out, double %src) #2 {
+define amdgpu_kernel void @unsafe_rcp_f64(double addrspace(1)* %out, double %src) #2 {
   %rcp = call double @llvm.amdgcn.rcp.f64(double %src)
   store double %rcp, double addrspace(1)* %out, align 8
   ret void
@@ -83,7 +101,7 @@ define void @unsafe_rcp_f64(double addrspace(1)* %out, double %src) #2 {
 
 ; FUNC-LABEL: {{^}}rcp_pat_f64:
 ; SI: v_div_scale_f64
-define void @rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
   %rcp = fdiv double 1.0, %src
   store double %rcp, double addrspace(1)* %out, align 8
   ret void
@@ -93,7 +111,7 @@ define void @rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
 ; SI: v_rcp_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
 ; SI-NOT: [[RESULT]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
+define amdgpu_kernel void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
   %rcp = fdiv double 1.0, %src
   store double %rcp, double addrspace(1)* %out, align 8
   ret void
@@ -103,7 +121,7 @@ define void @unsafe_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
 ; SI-NOT: v_rsq_f64_e32
 ; SI: v_sqrt_f64
 ; SI: v_rcp_f64
-define void @safe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @safe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
   %sqrt = call double @llvm.sqrt.f64(double %src)
   %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
   store double %rcp, double addrspace(1)* %out, align 8
@@ -114,7 +132,7 @@ define void @safe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #1 {
 ; SI: v_rsq_f64_e32 [[RESULT:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}
 ; SI-NOT: [[RESULT]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
+define amdgpu_kernel void @unsafe_rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) #2 {
   %sqrt = call double @llvm.sqrt.f64(double %src)
   %rcp = call double @llvm.amdgcn.rcp.f64(double %sqrt)
   store double %rcp, double addrspace(1)* %out, align 8
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
index 2569108e7b185c0dc998ba61b6593159ed0685b5..9f5c809455ea35d63cb79db59d8c1b8fe1667c93 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.readfirstlane.ll
@@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.readfirstlane(i32) #0
 
 ; CHECK-LABEL: {{^}}test_readfirstlane:
 ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, v{{[0-9]+}}
-define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 {
+define amdgpu_kernel void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 {
   %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %src)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@ define void @test_readfirstlane(i32 addrspace(1)* %out, i32 %src) #1 {
 ; CHECK-LABEL: {{^}}test_readfirstlane_imm:
 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32
 ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[VVAL]]
-define void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 {
   %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 32)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
   ret void
@@ -25,7 +25,7 @@ define void @test_readfirstlane_imm(i32 addrspace(1)* %out) #1 {
 ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]]
 ; CHECK: v_readfirstlane_b32 s{{[0-9]+}}, [[VVAL]]
-define void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_readfirstlane_m0(i32 addrspace(1)* %out) #1 {
   %m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"()
   %readfirstlane = call i32 @llvm.amdgcn.readfirstlane(i32 %m0)
   store i32 %readfirstlane, i32 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
index 436ffff692c60986b167eb0bc5b59363e048e413..5e892fad3741bf10863e65890e7757ade16af6be 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.readlane.ll
@@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.readlane(i32, i32) #0
 
 ; CHECK-LABEL: {{^}}test_readlane_sreg:
 ; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-define void @test_readlane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
+define amdgpu_kernel void @test_readlane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1 {
   %readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 %src1)
   store i32 %readlane, i32 addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@ define void @test_readlane_sreg(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #1
 ; CHECK-LABEL: {{^}}test_readlane_imm_sreg:
 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], 32
 ; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}}
-define void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
+define amdgpu_kernel void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
   %readlane = call i32 @llvm.amdgcn.readlane(i32 32, i32 %src1)
   store i32 %readlane, i32 addrspace(1)* %out, align 4
   ret void
@@ -25,7 +25,7 @@ define void @test_readlane_imm_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
 ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
 ; CHECK: v_mov_b32_e32 [[VVAL:v[0-9]]], [[COPY_M0]]
 ; CHECK: v_readlane_b32 s{{[0-9]+}}, [[VVAL]], s{{[0-9]+}}
-define void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
+define amdgpu_kernel void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
   %m0 = call i32 asm "s_mov_b32 m0, -1", "={M0}"()
   %readlane = call i32 @llvm.amdgcn.readlane(i32 %m0, i32 %src1)
   store i32 %readlane, i32 addrspace(1)* %out, align 4
@@ -34,7 +34,7 @@ define void @test_readlane_m0_sreg(i32 addrspace(1)* %out, i32 %src1) #1 {
 
 ; CHECK-LABEL: {{^}}test_readlane_imm:
 ; CHECK: v_readlane_b32 s{{[0-9]+}}, v{{[0-9]+}}, 32
-define void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0) #1 {
+define amdgpu_kernel void @test_readlane_imm(i32 addrspace(1)* %out, i32 %src0) #1 {
   %readlane = call i32 @llvm.amdgcn.readlane(i32 %src0, i32 32) #0
   store i32 %readlane, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
index 5f40e0d0986fcb96ad16ee0243788f2f2c2eb876..3611047f127745f955e927d650d19cd9a997815d 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.clamp.ll
@@ -12,7 +12,7 @@ declare double @llvm.amdgcn.rsq.clamp.f64(double) #1
 ; VI-DAG: v_min_f32_e32 [[MIN:v[0-9]+]], 0x7f7fffff, [[RSQ]]
 ; VI: v_max_f32_e32 [[RESULT:v[0-9]+]], 0xff7fffff, [[MIN]]
 ; VI: buffer_store_dword [[RESULT]]
-define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
   %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float %src)
   store float %rsq_clamp, float addrspace(1)* %out
   ret void
@@ -30,7 +30,7 @@ define void @rsq_clamp_f32(float addrspace(1)* %out, float %src) #0 {
 ; VI-DAG: v_rsq_f64_e32 [[RSQ:v\[[0-9]+:[0-9]+\]]], s[{{[0-9]+:[0-9]+}}
 ; VI-DAG: v_min_f64 v[0:1], [[RSQ]], s{{\[}}[[LOW1]]:[[HIGH1]]]
 ; VI-DAG: v_max_f64 v[0:1], v[0:1], s{{\[}}[[LOW1]]:[[HIGH2]]]
-define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 {
+define amdgpu_kernel void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 {
   %rsq_clamp = call double @llvm.amdgcn.rsq.clamp.f64(double %src)
   store double %rsq_clamp, double addrspace(1)* %out
   ret void
@@ -38,7 +38,7 @@ define void @rsq_clamp_f64(double addrspace(1)* %out, double %src) #0 {
 
 ; FUNC-LABEL: {{^}}rsq_clamp_undef_f32:
 ; SI-NOT: v_rsq_clamp_f32
-define void @rsq_clamp_undef_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @rsq_clamp_undef_f32(float addrspace(1)* %out) #0 {
   %rsq_clamp = call float @llvm.amdgcn.rsq.clamp.f32(float undef)
   store float %rsq_clamp, float addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
index 2022d028986200e8ee5f1acf52e01cc6a6e21674..fd48021408100c553531b1b81b7894d579145631 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.f16.ll
@@ -7,7 +7,7 @@ declare half @llvm.amdgcn.rsq.f16(half %a)
 ; VI:  v_rsq_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @rsq_f16(
+define amdgpu_kernel void @rsq_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
index 47bd0d82b8340872c63855be61cd9ac73e86ac52..7f4c2cb19a3245ff593f3587bdf5ce6b5c852164 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.legacy.ll
@@ -4,7 +4,7 @@ declare float @llvm.amdgcn.rsq.legacy(float) #0
 
 ; FUNC-LABEL: {{^}}rsq_legacy_f32:
 ; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
   %rsq = call float @llvm.amdgcn.rsq.legacy(float %src) #0
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@ define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
 ; TODO: Really these should be constant folded
 ; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_4.0
 ; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 4.0
-define void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
   %rsq = call float @llvm.amdgcn.rsq.legacy(float 4.0) #0
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
@@ -21,7 +21,7 @@ define void @rsq_legacy_f32_constant_4.0(float addrspace(1)* %out) #1 {
 
 ; FUNC-LABEL: {{^}}rsq_legacy_f32_constant_100.0
 ; SI: v_rsq_legacy_f32_e32 {{v[0-9]+}}, 0x42c80000
-define void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
   %rsq = call float @llvm.amdgcn.rsq.legacy(float 100.0) #0
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
@@ -29,7 +29,7 @@ define void @rsq_legacy_f32_constant_100.0(float addrspace(1)* %out) #1 {
 
 ; FUNC-LABEL: {{^}}rsq_legacy_undef_f32:
 ; SI-NOT: v_rsq_legacy_f32
-define void @rsq_legacy_undef_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_legacy_undef_f32(float addrspace(1)* %out) #1 {
   %rsq = call float @llvm.amdgcn.rsq.legacy(float undef)
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
index c644288977a3846e3dd4b222632021a0b0ffcacd..0ce26d0fe8762db07680d051c2a0d7655f9b10ad 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.rsq.ll
@@ -6,7 +6,7 @@ declare double @llvm.amdgcn.rsq.f64(double) #0
 
 ; FUNC-LABEL: {{^}}rsq_f32:
 ; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @rsq_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @rsq_f32(float addrspace(1)* %out, float %src) #1 {
   %rsq = call float @llvm.amdgcn.rsq.f32(float %src) #0
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
@@ -15,7 +15,7 @@ define void @rsq_f32(float addrspace(1)* %out, float %src) #1 {
 ; TODO: Really these should be constant folded
 ; FUNC-LABEL: {{^}}rsq_f32_constant_4.0
 ; SI: v_rsq_f32_e32 {{v[0-9]+}}, 4.0
-define void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 {
   %rsq = call float @llvm.amdgcn.rsq.f32(float 4.0) #0
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
@@ -23,7 +23,7 @@ define void @rsq_f32_constant_4.0(float addrspace(1)* %out) #1 {
 
 ; FUNC-LABEL: {{^}}rsq_f32_constant_100.0
 ; SI: v_rsq_f32_e32 {{v[0-9]+}}, 0x42c80000
-define void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 {
   %rsq = call float @llvm.amdgcn.rsq.f32(float 100.0) #0
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
@@ -31,7 +31,7 @@ define void @rsq_f32_constant_100.0(float addrspace(1)* %out) #1 {
 
 ; FUNC-LABEL: {{^}}rsq_f64:
 ; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @rsq_f64(double addrspace(1)* %out, double %src) #1 {
+define amdgpu_kernel void @rsq_f64(double addrspace(1)* %out, double %src) #1 {
   %rsq = call double @llvm.amdgcn.rsq.f64(double %src) #0
   store double %rsq, double addrspace(1)* %out, align 4
   ret void
@@ -40,7 +40,7 @@ define void @rsq_f64(double addrspace(1)* %out, double %src) #1 {
 ; TODO: Really these should be constant folded
 ; FUNC-LABEL: {{^}}rsq_f64_constant_4.0
 ; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, 4.0
-define void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 {
   %rsq = call double @llvm.amdgcn.rsq.f64(double 4.0) #0
   store double %rsq, double addrspace(1)* %out, align 4
   ret void
@@ -50,7 +50,7 @@ define void @rsq_f64_constant_4.0(double addrspace(1)* %out) #1 {
 ; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0x40590000
 ; SI-DAG: s_mov_b32 s{{[0-9]+}}, 0{{$}}
 ; SI: v_rsq_f64_e32 {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}
-define void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 {
   %rsq = call double @llvm.amdgcn.rsq.f64(double 100.0) #0
   store double %rsq, double addrspace(1)* %out, align 4
   ret void
@@ -58,7 +58,7 @@ define void @rsq_f64_constant_100.0(double addrspace(1)* %out) #1 {
 
 ; FUNC-LABEL: {{^}}rsq_undef_f32:
 ; SI-NOT: v_rsq_f32
-define void @rsq_undef_f32(float addrspace(1)* %out) #1 {
+define amdgpu_kernel void @rsq_undef_f32(float addrspace(1)* %out) #1 {
   %rsq = call float @llvm.amdgcn.rsq.f32(float undef)
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
index 132e476d5e295ceec5cc53561894d633077af06e..5f8ca28ec5f05d2d7af708996c3575a9a0d47e8e 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.ll
@@ -1,10 +1,13 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX8 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
 
 ; GCN-LABEL: {{^}}test_barrier:
-; GCN: buffer_store_dword
-; GCN: s_waitcnt
+; GFX8: buffer_store_dword
+; GFX8: s_waitcnt
+; GFX9: flat_store_dword
+; GFX9-NOT: s_waitcnt
 ; GCN: s_barrier
-define void @test_barrier(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_barrier(i32 addrspace(1)* %out) #0 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
index ecd4ac6824cc6ac1bb40407636dc9c6b0f469225..b488565c6b3ad5ddb2d3a06f5421da5f288a065b 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.ll
@@ -9,7 +9,7 @@ declare void @llvm.amdgcn.s.waitcnt(i32) #0
 ; SI-NEXT: s_dcache_inv ; encoding: [0x00,0x00,0xc0,0xc7]
 ; VI-NEXT: s_dcache_inv ; encoding: [0x00,0x00,0x80,0xc0,0x00,0x00,0x00,0x00]
 ; GCN-NEXT: s_endpgm
-define void @test_s_dcache_inv() #0 {
+define amdgpu_kernel void @test_s_dcache_inv() #0 {
   call void @llvm.amdgcn.s.dcache.inv()
   ret void
 }
@@ -18,7 +18,7 @@ define void @test_s_dcache_inv() #0 {
 ; GCN-NEXT: ; BB#0:
 ; GCN: s_dcache_inv
 ; GCN: s_waitcnt lgkmcnt(0) ; encoding
-define void @test_s_dcache_inv_insert_wait() #0 {
+define amdgpu_kernel void @test_s_dcache_inv_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.inv()
   call void @llvm.amdgcn.s.waitcnt(i32 0)
   br label %end
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
index 097f35d42c4f7e1b937c30313c7f43b63fd9472a..a3a5c329f41115a43d37f2e8cc8bda103ed05423 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.inv.vol.ll
@@ -9,7 +9,7 @@ declare void @llvm.amdgcn.s.waitcnt(i32) #0
 ; CI-NEXT: s_dcache_inv_vol ; encoding: [0x00,0x00,0x40,0xc7]
 ; VI-NEXT: s_dcache_inv_vol ; encoding: [0x00,0x00,0x88,0xc0,0x00,0x00,0x00,0x00]
 ; GCN-NEXT: s_endpgm
-define void @test_s_dcache_inv_vol() #0 {
+define amdgpu_kernel void @test_s_dcache_inv_vol() #0 {
   call void @llvm.amdgcn.s.dcache.inv.vol()
   ret void
 }
@@ -18,7 +18,7 @@ define void @test_s_dcache_inv_vol() #0 {
 ; GCN-NEXT: ; BB#0:
 ; GCN-NEXT: s_dcache_inv_vol
 ; GCN: s_waitcnt lgkmcnt(0) ; encoding
-define void @test_s_dcache_inv_vol_insert_wait() #0 {
+define amdgpu_kernel void @test_s_dcache_inv_vol_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.inv.vol()
   call void @llvm.amdgcn.s.waitcnt(i32 0)
   br label %end
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
index 9ecce7463f6be5ea9f9127f92f4aca781bed4e3b..909a85dda3e8a884db1fc1b578811b54c1edae70 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.ll
@@ -7,7 +7,7 @@ declare void @llvm.amdgcn.s.waitcnt(i32) #0
 ; VI-NEXT: ; BB#0:
 ; VI-NEXT: s_dcache_wb ; encoding: [0x00,0x00,0x84,0xc0,0x00,0x00,0x00,0x00]
 ; VI-NEXT: s_endpgm
-define void @test_s_dcache_wb() #0 {
+define amdgpu_kernel void @test_s_dcache_wb() #0 {
   call void @llvm.amdgcn.s.dcache.wb()
   ret void
 }
@@ -16,7 +16,7 @@ define void @test_s_dcache_wb() #0 {
 ; VI-NEXT: ; BB#0:
 ; VI-NEXT: s_dcache_wb
 ; VI: s_waitcnt lgkmcnt(0) ; encoding
-define void @test_s_dcache_wb_insert_wait() #0 {
+define amdgpu_kernel void @test_s_dcache_wb_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.wb()
   call void @llvm.amdgcn.s.waitcnt(i32 0)
   br label %end
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
index 943f8c67a2e304329cb61d7176f3f54b82b93d5b..217bf97c41a4a6b4f909c740ad5bbc3344d230dd 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.dcache.wb.vol.ll
@@ -7,7 +7,7 @@ declare void @llvm.amdgcn.s.waitcnt(i32) #0
 ; VI-NEXT: ; BB#0:
 ; VI-NEXT: s_dcache_wb_vol ; encoding: [0x00,0x00,0x8c,0xc0,0x00,0x00,0x00,0x00]
 ; VI-NEXT: s_endpgm
-define void @test_s_dcache_wb_vol() #0 {
+define amdgpu_kernel void @test_s_dcache_wb_vol() #0 {
   call void @llvm.amdgcn.s.dcache.wb.vol()
   ret void
 }
@@ -16,7 +16,7 @@ define void @test_s_dcache_wb_vol() #0 {
 ; VI-NEXT: ; BB#0:
 ; VI-NEXT: s_dcache_wb_vol
 ; VI: s_waitcnt lgkmcnt(0) ; encoding
-define void @test_s_dcache_wb_vol_insert_wait() #0 {
+define amdgpu_kernel void @test_s_dcache_wb_vol_insert_wait() #0 {
   call void @llvm.amdgcn.s.dcache.wb.vol()
   call void @llvm.amdgcn.s.waitcnt(i32 0)
   br label %end
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll
index 72513fc86f4912c3bb7e8adcc1c2530c0c761b92..8f64c50b9c601e28562000ccc17f0d4a11dd6c7b 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.decperflevel.ll
@@ -20,7 +20,7 @@ declare void @llvm.amdgcn.s.decperflevel(i32) #0
 ; GCN: s_decperflevel 13{{$}}
 ; GCN: s_decperflevel 14{{$}}
 ; GCN: s_decperflevel 15{{$}}
-define void @test_s_decperflevel(i32 %x) #0 {
+define amdgpu_kernel void @test_s_decperflevel(i32 %x) #0 {
   call void @llvm.amdgcn.s.decperflevel(i32 0)
   call void @llvm.amdgcn.s.decperflevel(i32 1)
   call void @llvm.amdgcn.s.decperflevel(i32 2)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
index 4304398182a674c9307a7812ec554948f4c83187..906a8a3e05f44f18422e2dd9b8fd2c94084d549e 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.getreg.ll
@@ -4,7 +4,7 @@
 
 ; GCN-LABEL: {{^}}s_getreg_test:
 ; GCN: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_LDS_ALLOC, 8, 23)
-define void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size.
+define amdgpu_kernel void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size.
   %lds_size_64dwords = call i32 @llvm.amdgcn.s.getreg(i32 45574)
   %lds_size_bytes = shl i32 %lds_size_64dwords, 8
   store i32 %lds_size_bytes, i32 addrspace(1)* %out
@@ -14,7 +14,7 @@ define void @s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size
 ; Call site has additional readnone knowledge.
 ; GCN-LABEL: {{^}}readnone_s_getreg_test:
 ; GCN: s_getreg_b32 s{{[0-9]+}}, hwreg(HW_REG_LDS_ALLOC, 8, 23)
-define void @readnone_s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size.
+define amdgpu_kernel void @readnone_s_getreg_test(i32 addrspace(1)* %out) { ; simm16=45574 for lds size.
   %lds_size_64dwords = call i32 @llvm.amdgcn.s.getreg(i32 45574) #1
   %lds_size_bytes = shl i32 %lds_size_64dwords, 8
   store i32 %lds_size_bytes, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll
index 2ae4fc473eaa95cbad09c929195d80cab6f6426b..49e6e42579068a2601d2004aada09a5c025d6895 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.incperflevel.ll
@@ -20,7 +20,7 @@ declare void @llvm.amdgcn.s.incperflevel(i32) #0
 ; GCN: s_incperflevel 13{{$}}
 ; GCN: s_incperflevel 14{{$}}
 ; GCN: s_incperflevel 15{{$}}
-define void @test_s_incperflevel(i32 %x) #0 {
+define amdgpu_kernel void @test_s_incperflevel(i32 %x) #0 {
   call void @llvm.amdgcn.s.incperflevel(i32 0)
   call void @llvm.amdgcn.s.incperflevel(i32 1)
   call void @llvm.amdgcn.s.incperflevel(i32 2)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
index d8eda10fdfd829b6708d98d4fb4b11ff374abc9a..66041037168a7cfdb5e8156ec03ab9c33b14c358 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memrealtime.ll
@@ -10,7 +10,7 @@ declare i64 @llvm.amdgcn.s.memrealtime() #0
 ; GCN-NOT: lgkmcnt
 ; GCN: s_memrealtime s{{\[[0-9]+:[0-9]+\]}}
 ; GCN: _store_dwordx2
-define void @test_s_memrealtime(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_s_memrealtime(i64 addrspace(1)* %out) #0 {
   %cycle0 = call i64 @llvm.amdgcn.s.memrealtime()
   store volatile i64 %cycle0, i64 addrspace(1)* %out
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
index ff9d74619788b31fdc699042f6cc9eb8abdfc2a5..6aef769bafad8a5ad1a5e246ab6ea28efd610b9a 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.memtime.ll
@@ -11,7 +11,7 @@ declare i64 @llvm.amdgcn.s.memtime() #0
 ; GCN-NOT: lgkmcnt
 ; GCN: s_memtime s{{\[[0-9]+:[0-9]+\]}}
 ; GCN: buffer_store_dwordx2
-define void @test_s_memtime(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_s_memtime(i64 addrspace(1)* %out) #0 {
   %cycle0 = call i64 @llvm.amdgcn.s.memtime()
   store volatile i64 %cycle0, i64 addrspace(1)* %out
 
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll
index 870aa48a3417250f5317f9cc9f0bade73359cdf1..59c910c71c5aaf8b4d519c29293f3ec1e381723a 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.s.sleep.ll
@@ -20,7 +20,7 @@ declare void @llvm.amdgcn.s.sleep(i32) #0
 ; GCN: s_sleep 13{{$}}
 ; GCN: s_sleep 14{{$}}
 ; GCN: s_sleep 15{{$}}
-define void @test_s_sleep(i32 %x) #0 {
+define amdgpu_kernel void @test_s_sleep(i32 %x) #0 {
   call void @llvm.amdgcn.s.sleep(i32 0)
   call void @llvm.amdgcn.s.sleep(i32 1)
   call void @llvm.amdgcn.s.sleep(i32 2)
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll
index 3aaed9d53772417ad9f26981e291785a14df3aaa..2a3705de2b44d2280d19c7e99f6453796c5c380c 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sad.hi.u8.ll
@@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.sad.hi.u8(i32, i32, i32) #0
 
 ; GCN-LABEL: {{^}}v_sad_hi_u8:
 ; GCN: v_sad_hi_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_hi_u8(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_sad_hi_u8(i32 addrspace(1)* %out, i32 %src) {
   %result= call i32 @llvm.amdgcn.sad.hi.u8(i32 %src, i32 100, i32 100) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@ define void @v_sad_hi_u8(i32 addrspace(1)* %out, i32 %src) {
 
 ; GCN-LABEL: {{^}}v_sad_hi_u8_non_immediate:
 ; GCN: v_sad_hi_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_hi_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
+define amdgpu_kernel void @v_sad_hi_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
   %result= call i32 @llvm.amdgcn.sad.hi.u8(i32 %src, i32 %a, i32 %b) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll
index 5438571c58219dd3bf5da73e40b65afb50c36fd7..c404531513e7941f1f7c4dc55d3acb2ada0c6251 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u16.ll
@@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.sad.u16(i32, i32, i32) #0
 
 ; GCN-LABEL: {{^}}v_sad_u16:
 ; GCN: v_sad_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u16(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_sad_u16(i32 addrspace(1)* %out, i32 %src) {
   %result= call i32 @llvm.amdgcn.sad.u16(i32 %src, i32 100, i32 100) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@ define void @v_sad_u16(i32 addrspace(1)* %out, i32 %src) {
 
 ; GCN-LABEL: {{^}}v_sad_u16_non_immediate:
 ; GCN: v_sad_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u16_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
+define amdgpu_kernel void @v_sad_u16_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
   %result= call i32 @llvm.amdgcn.sad.u16(i32 %src, i32 %a, i32 %b) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll
index 9422d7620ca6cd4eff24ac1ead6991a671beb232..1ee876aa724efa6bd7d700b619651c4c37987684 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sad.u8.ll
@@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.sad.u8(i32, i32, i32) #0
 
 ; GCN-LABEL: {{^}}v_sad_u8:
 ; GCN: v_sad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u8(i32 addrspace(1)* %out, i32 %src) {
+define amdgpu_kernel void @v_sad_u8(i32 addrspace(1)* %out, i32 %src) {
   %result= call i32 @llvm.amdgcn.sad.u8(i32 %src, i32 100, i32 100) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@ define void @v_sad_u8(i32 addrspace(1)* %out, i32 %src) {
 
 ; GCN-LABEL: {{^}}v_sad_u8_non_immediate:
 ; GCN: v_sad_u8 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
+define amdgpu_kernel void @v_sad_u8_non_immediate(i32 addrspace(1)* %out, i32 %src, i32 %a, i32 %b) {
   %result= call i32 @llvm.amdgcn.sad.u8(i32 %src, i32 %a, i32 %b) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
new file mode 100644
index 0000000000000000000000000000000000000000..593c95856811eb07d0ecb11c7d46f80f2ecc1901
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sbfe.ll
@@ -0,0 +1,556 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}bfe_i32_arg_arg_arg:
+; GCN: v_bfe_i32
+define amdgpu_kernel void @bfe_i32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 %src2) ; source, offset and width each come from a distinct kernel argument
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_arg_arg_imm:
+; GCN: v_bfe_i32
+define amdgpu_kernel void @bfe_i32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 123)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_arg_imm_arg:
+; GCN: v_bfe_i32
+define amdgpu_kernel void @bfe_i32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 123, i32 %src2)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_imm_arg_arg:
+; GCN: v_bfe_i32
+define amdgpu_kernel void @bfe_i32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 123, i32 %src1, i32 %src2)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_bfe_print_arg:
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 2, 8
+define amdgpu_kernel void @v_bfe_print_arg(i32 addrspace(1)* %out, i32 addrspace(1)* %src0) #0 {
+  %load = load i32, i32 addrspace(1)* %src0, align 4
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 2, i32 8)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_arg_0_width_reg_offset:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 %src1, i32 0)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_arg_0_width_imm_offset:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.sbfe.i32(i32 %src0, i32 8, i32 0)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_test_6:
+; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %shl = shl i32 %x, 31
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 1, i32 31)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_test_7:
+; GCN-NOT: shl
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %shl = shl i32 %x, 31
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 0, i32 31)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_test_8:
+; GCN: buffer_load_dword
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %shl = shl i32 %x, 31
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_test_9:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_test_10:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 1, i32 31)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_test_11:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 8, i32 24)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_test_12:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 24, i32 8)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_test_13:
+; GCN: v_ashrrev_i32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %shl = ashr i32 %x, 31
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_test_14:
+; GCN-NOT: lshr
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %shl = lshr i32 %x, 31
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shl, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_0:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_0(i32 addrspace(1)* %out) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 0)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_1:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_1(i32 addrspace(1)* %out) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 12334, i32 0, i32 0)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_2:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_2(i32 addrspace(1)* %out) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 0, i32 0, i32 1)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_3:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_3(i32 addrspace(1)* %out) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 1, i32 0, i32 1)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_4:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_4(i32 addrspace(1)* %out) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 0, i32 1)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_5:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_5(i32 addrspace(1)* %out) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 7, i32 1)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_6:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0xffffff80
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_6(i32 addrspace(1)* %out) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 128, i32 0, i32 8)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_7:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_7(i32 addrspace(1)* %out) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 0, i32 8)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_8:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_8(i32 addrspace(1)* %out) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 127, i32 6, i32 8)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_9:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_9(i32 addrspace(1)* %out) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 65536, i32 16, i32 8)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_10:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_10(i32 addrspace(1)* %out) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 65535, i32 16, i32 16)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_11:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -6
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_11(i32 addrspace(1)* %out) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 4)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_12:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_12(i32 addrspace(1)* %out) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 31, i32 1)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_13:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_13(i32 addrspace(1)* %out) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 131070, i32 16, i32 16)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_14:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 40
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_14(i32 addrspace(1)* %out) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 2, i32 30)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_15:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_15(i32 addrspace(1)* %out) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 160, i32 4, i32 28)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_16:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_16(i32 addrspace(1)* %out) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 4294967295, i32 1, i32 7)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_17:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_17(i32 addrspace(1)* %out) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 1, i32 31)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_i32_constant_fold_test_18:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_i32_constant_fold_test_18(i32 addrspace(1)* %out) #0 {
+  %bfe_i32 = call i32 @llvm.amdgcn.sbfe.i32(i32 255, i32 31, i32 1)
+  store i32 %bfe_i32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_sext_in_reg_i24:
+; GCN: buffer_load_dword [[LOAD:v[0-9]+]],
+; GCN-NOT: v_lshl
+; GCN-NOT: v_ashr
+; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 0, 24
+; GCN: buffer_store_dword [[BFE]],
+define amdgpu_kernel void @bfe_sext_in_reg_i24(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %x, i32 0, i32 24)
+  %shl = shl i32 %bfe, 8
+  %ashr = ashr i32 %shl, 8
+  store i32 %ashr, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}simplify_demanded_bfe_sdiv:
+; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
+; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[LOAD]], 1, 16
+; GCN: v_lshrrev_b32_e32 [[TMP0:v[0-9]+]], 31, [[BFE]]
+; GCN: v_add_i32_e32 [[TMP1:v[0-9]+]], vcc, [[TMP0]], [[BFE]]
+; GCN: v_ashrrev_i32_e32 [[TMP2:v[0-9]+]], 1, [[TMP1]]
+; GCN: buffer_store_dword [[TMP2]]
+define amdgpu_kernel void @simplify_demanded_bfe_sdiv(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %src = load i32, i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 1, i32 16) ; extract a signed 16-bit field starting at bit 1
+  %div = sdiv i32 %bfe, 2 ; signed divide by 2, checked above as shift/add/arithmetic-shift
+  store i32 %div, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_0_width:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+  %load = load i32, i32 addrspace(1)* %ptr, align 4
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 8, i32 0)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_8_bfe_8:
+; GCN: v_bfe_i32
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+  %load = load i32, i32 addrspace(1)* %ptr, align 4
+  %bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 8)
+  %bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 8)
+  store i32 %bfe1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_8_bfe_16:
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+  %load = load i32, i32 addrspace(1)* %ptr, align 4
+  %bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 8)
+  %bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 16)
+  store i32 %bfe1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; This really should be folded into 1
+; GCN-LABEL: {{^}}bfe_16_bfe_8:
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
+  %load = load i32, i32 addrspace(1)* %ptr, align 4
+  %bfe0 = call i32 @llvm.amdgcn.sbfe.i32(i32 %load, i32 0, i32 16)
+  %bfe1 = call i32 @llvm.amdgcn.sbfe.i32(i32 %bfe0, i32 0, i32 8)
+  store i32 %bfe1, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; Make sure there isn't a redundant BFE
+; GCN-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe:
+; GCN: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+  %c = add i32 %a, %b ; add to prevent folding into extload
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %c, i32 0, i32 8)
+  %shl = shl i32 %bfe, 24
+  %ashr = ashr i32 %shl, 24
+  store i32 %ashr, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong:
+define amdgpu_kernel void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+  %c = add i32 %a, %b ; add to prevent folding into extload
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %c, i32 8, i32 0)
+  %shl = shl i32 %bfe, 24
+  %ashr = ashr i32 %shl, 24
+  store i32 %ashr, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sextload_i8_to_i32_bfe:
+; GCN: buffer_load_sbyte
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 {
+  %load = load i8, i8 addrspace(1)* %ptr, align 1
+  %sext = sext i8 %load to i32
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %sext, i32 0, i32 8)
+  %shl = shl i32 %bfe, 24
+  %ashr = ashr i32 %shl, 24
+  store i32 %ashr, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN: .text
+; GCN-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 {
+  %load = load i8, i8 addrspace(1)* %ptr, align 1
+  %sext = sext i8 %load to i32
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %sext, i32 8, i32 0)
+  %shl = shl i32 %bfe, 24
+  %ashr = ashr i32 %shl, 24
+  store i32 %ashr, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sext_in_reg_i1_bfe_offset_0:
+; GCN-NOT: shr
+; GCN-NOT: shl
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
+; GCN: s_endpgm
+define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %shl = shl i32 %x, 31
+  %shr = ashr i32 %shl, 31
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shr, i32 0, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sext_in_reg_i1_bfe_offset_1:
+; GCN: buffer_load_dword
+; GCN-NOT: shl
+; GCN-NOT: shr
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1
+; GCN: s_endpgm
+define amdgpu_kernel void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %shl = shl i32 %x, 30
+  %shr = ashr i32 %shl, 30
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shr, i32 1, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sext_in_reg_i2_bfe_offset_1:
+; GCN: buffer_load_dword
+; GCN-NOT: v_lshl
+; GCN-NOT: v_ashr
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2
+; GCN: s_endpgm
+define amdgpu_kernel void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %shl = shl i32 %x, 30
+  %shr = ashr i32 %shl, 30
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %shr, i32 1, i32 2)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.sbfe.i32(i32, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll
new file mode 100644
index 0000000000000000000000000000000000000000..94aeb077ebefc5b85b702d57a1f3427f8b27ef29
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.ll
@@ -0,0 +1,127 @@
+;RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+;RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}test_interrupt:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsg sendmsg(MSG_INTERRUPT)
+define amdgpu_kernel void @test_interrupt() {
+body:
+  call void @llvm.amdgcn.s.sendmsg(i32 1, i32 0);
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_gs_emit:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT, 0)
+define amdgpu_kernel void @test_gs_emit() {
+body:
+  call void @llvm.amdgcn.s.sendmsg(i32 34, i32 0);
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_gs_cut:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsg sendmsg(MSG_GS, GS_OP_CUT, 1)
+define amdgpu_kernel void @test_gs_cut() {
+body:
+  call void @llvm.amdgcn.s.sendmsg(i32 274, i32 0);
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_gs_emit_cut:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsg sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
+define amdgpu_kernel void @test_gs_emit_cut() {
+body:
+  call void @llvm.amdgcn.s.sendmsg(i32 562, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_gs_done:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+define amdgpu_kernel void @test_gs_done() {
+body:
+  call void @llvm.amdgcn.s.sendmsg(i32 3, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}sendmsg:
+; GCN: s_mov_b32 m0, s0
+; VI-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsg sendmsg(MSG_GS_DONE, GS_OP_NOP)
+; GCN-NEXT: s_endpgm
+define amdgpu_gs void @sendmsg(i32 inreg %a) #0 {
+  call void @llvm.amdgcn.s.sendmsg(i32 3, i32 %a) ; message id 3 with a non-constant second operand, checked above as a copy from s0 into m0
+  ret void
+}
+
+; GCN-LABEL: {{^}}sendmsghalt:
+; GCN: s_mov_b32 m0, s0
+; VI-NEXT: s_nop 0
+; GCN-NEXT: s_sendmsghalt sendmsg(MSG_INTERRUPT)
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @sendmsghalt(i32 inreg %a) #0 {
+  call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 %a)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_interrupt_halt:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsghalt sendmsg(MSG_INTERRUPT)
+define amdgpu_kernel void @test_interrupt_halt() {
+body:
+  call void @llvm.amdgcn.s.sendmsghalt(i32 1, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_gs_emit_halt:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT, 0)
+define amdgpu_kernel void @test_gs_emit_halt() {
+body:
+  call void @llvm.amdgcn.s.sendmsghalt(i32 34, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_gs_cut_halt:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsghalt sendmsg(MSG_GS, GS_OP_CUT, 1)
+define amdgpu_kernel void @test_gs_cut_halt() {
+body:
+  call void @llvm.amdgcn.s.sendmsghalt(i32 274, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_gs_emit_cut_halt:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsghalt sendmsg(MSG_GS, GS_OP_EMIT_CUT, 2)
+define amdgpu_kernel void @test_gs_emit_cut_halt() {
+body:
+  call void @llvm.amdgcn.s.sendmsghalt(i32 562, i32 0)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_gs_done_halt:
+; GCN: s_mov_b32 m0, 0
+; GCN-NOT: s_mov_b32 m0
+; GCN: s_sendmsghalt sendmsg(MSG_GS_DONE, GS_OP_NOP)
+define amdgpu_kernel void @test_gs_done_halt() {
+body:
+  call void @llvm.amdgcn.s.sendmsghalt(i32 3, i32 0)
+  ret void
+}
+
+declare void @llvm.amdgcn.s.sendmsg(i32, i32) #0
+declare void @llvm.amdgcn.s.sendmsghalt(i32, i32) #0
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
index d453d03cded893824ca1a0c56ebec6c17591fb96..495e36b09f8faee6300f37a0a4e253c05785400a 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sffbh.ll
@@ -2,14 +2,13 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 declare i32 @llvm.amdgcn.sffbh.i32(i32) #1
-declare i32 @llvm.AMDGPU.flbit.i32(i32) #1
 
 ; FUNC-LABEL: {{^}}s_flbit:
 ; GCN: s_load_dword [[VAL:s[0-9]+]],
 ; GCN: s_flbit_i32 [[SRESULT:s[0-9]+]], [[VAL]]
 ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; GCN: buffer_store_dword [[VRESULT]],
-define void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 {
+define amdgpu_kernel void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 {
   %r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val)
   store i32 %r, i32 addrspace(1)* %out, align 4
   ret void
@@ -19,36 +18,12 @@ define void @s_flbit(i32 addrspace(1)* noalias %out, i32 %val) #0 {
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]],
 ; GCN: v_ffbh_i32_e32 [[RESULT:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[RESULT]],
-define void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
+define amdgpu_kernel void @v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) #0 {
   %val = load i32, i32 addrspace(1)* %valptr, align 4
   %r = call i32 @llvm.amdgcn.sffbh.i32(i32 %val)
   store i32 %r, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; FUNC-LABEL: {{^}}legacy_s_flbit:
-; GCN: s_load_dword [[VAL:s[0-9]+]],
-; GCN: s_flbit_i32 [[SRESULT:s[0-9]+]], [[VAL]]
-; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
-; GCN: buffer_store_dword [[VRESULT]],
-; GCN: s_endpgm
-define void @legacy_s_flbit(i32 addrspace(1)* noalias %out, i32 %val) nounwind {
-  %r = call i32 @llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone
-  store i32 %r, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}legacy_v_flbit:
-; GCN: buffer_load_dword [[VAL:v[0-9]+]],
-; GCN: v_ffbh_i32_e32 [[RESULT:v[0-9]+]], [[VAL]]
-; GCN: buffer_store_dword [[RESULT]],
-; GCN: s_endpgm
-define void @legacy_v_flbit(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %valptr) nounwind {
-  %val = load i32, i32 addrspace(1)* %valptr, align 4
-  %r = call i32 @llvm.AMDGPU.flbit.i32(i32 %val) nounwind readnone
-  store i32 %r, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll
index fac0e352614c67fead9c8ee50f0440e58299f8ee..4b930bfa210cd2516f4bbbff5ea0a5d8b738f7c6 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.f16.ll
@@ -7,7 +7,7 @@ declare half @llvm.amdgcn.sin.f16(half %a)
 ; VI:  v_sin_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @sin_f16(
+define amdgpu_kernel void @sin_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
index e3692fc5906c29e14fc1e306d2cf4ae1187b30c5..0b7064da23f9ee075be49f68b32305e1cafffef9 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.sin.ll
@@ -5,7 +5,7 @@ declare float @llvm.amdgcn.sin.f32(float) #0
 
 ; GCN-LABEL: {{^}}v_sin_f32:
 ; GCN: v_sin_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
-define void @v_sin_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @v_sin_f32(float addrspace(1)* %out, float %src) #1 {
   %sin = call float @llvm.amdgcn.sin.f32(float %src) #0
   store float %sin, float addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
index caac6ddbeb80661871bacb38f2fe780feef6e00a..e0cec2134e70c49856cb7149488fdf7f17f7b73a 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.trig.preop.ll
@@ -9,7 +9,7 @@ declare double @llvm.amdgcn.trig.preop.f64(double, i32) nounwind readnone
 ; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], [[SEG]]
 ; SI: buffer_store_dwordx2 [[RESULT]],
 ; SI: s_endpgm
-define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load double, double addrspace(1)* %aptr, align 8
   %b = load i32, i32 addrspace(1)* %bptr, align 4
   %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 %b) nounwind readnone
@@ -22,7 +22,7 @@ define void @test_trig_preop_f64(double addrspace(1)* %out, double addrspace(1)*
 ; SI: v_trig_preop_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[SRC]], 7
 ; SI: buffer_store_dwordx2 [[RESULT]],
 ; SI: s_endpgm
-define void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
+define amdgpu_kernel void @test_trig_preop_f64_imm_segment(double addrspace(1)* %out, double addrspace(1)* %aptr) nounwind {
   %a = load double, double addrspace(1)* %aptr, align 8
   %result = call double @llvm.amdgcn.trig.preop.f64(double %a, i32 7) nounwind readnone
   store double %result, double addrspace(1)* %out, align 8
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
new file mode 100644
index 0000000000000000000000000000000000000000..92e3a1099da0a92881a866d73ec4d2c305b2d01c
--- /dev/null
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.ubfe.ll
@@ -0,0 +1,623 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; GCN-LABEL: {{^}}bfe_u32_arg_arg_arg:
+; GCN: v_bfe_u32
+define amdgpu_kernel void @bfe_u32_arg_arg_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src1, i32 %src2) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 %src2) ; source, offset and width are three distinct kernel arguments
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_arg_arg_imm:
+; GCN: v_bfe_u32
+define amdgpu_kernel void @bfe_u32_arg_arg_imm(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 123)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_arg_imm_arg:
+; GCN: v_bfe_u32
+define amdgpu_kernel void @bfe_u32_arg_imm_arg(i32 addrspace(1)* %out, i32 %src0, i32 %src2) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 123, i32 %src2)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_imm_arg_arg:
+; GCN: v_bfe_u32
+define amdgpu_kernel void @bfe_u32_imm_arg_arg(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 123, i32 %src1, i32 %src2)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_arg_0_width_reg_offset:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_arg_0_width_reg_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 %src1, i32 0)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_arg_0_width_imm_offset:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_arg_0_width_imm_offset(i32 addrspace(1)* %out, i32 %src0, i32 %src1) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %src0, i32 8, i32 0)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_zextload_i8:
+; GCN: buffer_load_ubyte
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_zextload_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+  %load = load i8, i8 addrspace(1)* %in
+  %ext = zext i8 %load to i32
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i8:
+; GCN: buffer_load_dword
+; GCN: v_add_i32
+; GCN-NEXT: v_and_b32_e32
+; FIXME: Should be using s_add_i32
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %load = load i32, i32 addrspace(1)* %in, align 4
+  %add = add i32 %load, 1
+  %ext = and i32 %add, 255
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 8)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i16:
+; GCN: buffer_load_dword
+; GCN: v_add_i32
+; GCN-NEXT: v_and_b32_e32
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %load = load i32, i32 addrspace(1)* %in, align 4
+  %add = add i32 %load, 1
+  %ext = and i32 %add, 65535
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 0, i32 16)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_1:
+; GCN: buffer_load_dword
+; GCN: v_add_i32
+; GCN: bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %load = load i32, i32 addrspace(1)* %in, align 4
+  %add = add i32 %load, 1
+  %ext = and i32 %add, 255
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 1, i32 8)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_3:
+; GCN: buffer_load_dword
+; GCN: v_add_i32
+; GCN-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0xf8
+; GCN-NEXT: bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %load = load i32, i32 addrspace(1)* %in, align 4
+  %add = add i32 %load, 1
+  %ext = and i32 %add, 255
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 3, i32 8)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i8_offset_7:
+; GCN: buffer_load_dword
+; GCN: v_add_i32
+; GCN-NEXT: v_and_b32_e32 {{v[0-9]+}}, 0x80
+; GCN-NEXT: bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i8_offset_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %load = load i32, i32 addrspace(1)* %in, align 4
+  %add = add i32 %load, 1
+  %ext = and i32 %add, 255
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 7, i32 8)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_zext_in_reg_i16_offset_8:
+; GCN: buffer_load_dword
+; GCN: v_add_i32
+; GCN-NEXT: bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_zext_in_reg_i16_offset_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %load = load i32, i32 addrspace(1)* %in, align 4
+  %add = add i32 %load, 1
+  %ext = and i32 %add, 65535
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %ext, i32 8, i32 8)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_1:
+; GCN: buffer_load_dword
+; GCN: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 0, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @bfe_u32_test_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %shl = shl i32 %x, 31
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 8)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+define amdgpu_kernel void @bfe_u32_test_3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %shl = shl i32 %x, 31
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_4:
+; GCN-NOT: lshl
+; GCN-NOT: shr
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %shl = shl i32 %x, 31
+  %shr = lshr i32 %shl, 31
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shr, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_5:
+; GCN: buffer_load_dword
+; GCN-NOT: lshl
+; GCN-NOT: shr
+; GCN: v_bfe_i32 {{v[0-9]+}}, {{v[0-9]+}}, 0, 1
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_5(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %shl = shl i32 %x, 31
+  %shr = ashr i32 %shl, 31
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shr, i32 0, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_6:
+; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_6(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %shl = shl i32 %x, 31
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 1, i32 31)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_7:
+; GCN: v_lshlrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %shl = shl i32 %x, 31
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 0, i32 31)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_8:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_and_b32_e32 {{v[0-9]+}}, 1, {{v[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_8(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %shl = shl i32 %x, 31
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_9:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_9(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_10:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_10(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 1, i32 31)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_11:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_11(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 8, i32 24)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_12:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 24, v{{[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_12(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %x, i32 24, i32 8)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_13:
+; FIXME: No FileCheck prefix, so this line is never matched: V_ASHRREV_U32_e32 {{v[0-9]+}}, 31, {{v[0-9]+}}
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_13(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %shl = ashr i32 %x, 31
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_test_14:
+; GCN-NOT: lshr
+; GCN-NOT: {{[^@]}}bfe
+; GCN: s_endpgm
+define amdgpu_kernel void @bfe_u32_test_14(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  %x = load i32, i32 addrspace(1)* %in, align 4
+  %shl = lshr i32 %x, 31
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %shl, i32 31, i32 1)
+  store i32 %bfe, i32 addrspace(1)* %out, align 4 ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_0:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_0(i32 addrspace(1)* %out) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_1:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_1(i32 addrspace(1)* %out) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 12334, i32 0, i32 0)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_2:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_2(i32 addrspace(1)* %out) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 1)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_3:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_3(i32 addrspace(1)* %out) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 1, i32 0, i32 1)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_4:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], -1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_4(i32 addrspace(1)* %out) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 0, i32 1)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_5:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_5(i32 addrspace(1)* %out) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 7, i32 1)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_6:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x80
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_6(i32 addrspace(1)* %out) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 128, i32 0, i32 8)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_7:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_7(i32 addrspace(1)* %out) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 0, i32 8)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_8:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_8(i32 addrspace(1)* %out) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 127, i32 6, i32 8)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_9:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_9(i32 addrspace(1)* %out) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65536, i32 16, i32 8)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_10:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_10(i32 addrspace(1)* %out) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 65535, i32 16, i32 16)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_11:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_11(i32 addrspace(1)* %out) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 4)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_12:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_12(i32 addrspace(1)* %out) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 31, i32 1)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_13:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 1
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_13(i32 addrspace(1)* %out) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 131070, i32 16, i32 16)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_14:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 40
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_14(i32 addrspace(1)* %out) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 2, i32 30)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_15:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 10
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_15(i32 addrspace(1)* %out) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 160, i32 4, i32 28)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_16:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_16(i32 addrspace(1)* %out) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 4294967295, i32 1, i32 7)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_17:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0x7f
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_17(i32 addrspace(1)* %out) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 1, i32 31)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}bfe_u32_constant_fold_test_18:
+; GCN-NOT: {{[^@]}}bfe
+; GCN: v_mov_b32_e32 [[VREG:v[0-9]+]], 0
+; GCN: buffer_store_dword [[VREG]],
+; GCN: s_endpgm
+; EG-NOT: BFE
+define amdgpu_kernel void @bfe_u32_constant_fold_test_18(i32 addrspace(1)* %out) #0 {
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 255, i32 31, i32 1)
+  store i32 %bfe_u32, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; Make sure that SimplifyDemandedBits doesn't cause the 'and' mask to be
+; reduced to the bits demanded by the bfe when the 'and' has another use.
+
+; XXX: The operand to v_bfe_u32 could also just directly be the load register.
+; GCN-LABEL: {{^}}simplify_bfe_u32_multi_use_arg:
+; GCN: buffer_load_dword [[ARG:v[0-9]+]]
+; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 63, [[ARG]]
+; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[AND]], 2, 2
+; GCN-DAG: buffer_store_dword [[AND]]
+; GCN-DAG: buffer_store_dword [[BFE]]
+; GCN: s_endpgm
+define amdgpu_kernel void @simplify_bfe_u32_multi_use_arg(i32 addrspace(1)* %out0,
+                                            i32 addrspace(1)* %out1,
+                                            i32 addrspace(1)* %in) #0 {
+  %src = load i32, i32 addrspace(1)* %in, align 4
+  %and = and i32 %src, 63
+  %bfe_u32 = call i32 @llvm.amdgcn.ubfe.i32(i32 %and, i32 2, i32 2)
+  store i32 %bfe_u32, i32 addrspace(1)* %out0, align 4
+  store i32 %and, i32 addrspace(1)* %out1, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_and:
+; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
+; GCN: buffer_store_dword
+define amdgpu_kernel void @lshr_and(i32 addrspace(1)* %out, i32 %a) #0 {
+  %b = lshr i32 %a, 6
+  %c = and i32 %b, 7
+  store i32 %c, i32 addrspace(1)* %out, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_lshr_and:
+; GCN: v_bfe_u32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}, 3
+; GCN: buffer_store_dword
+define amdgpu_kernel void @v_lshr_and(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+  %c = lshr i32 %a, %b
+  %d = and i32 %c, 7
+  store i32 %d, i32 addrspace(1)* %out, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}and_lshr:
+; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
+; GCN: buffer_store_dword
+define amdgpu_kernel void @and_lshr(i32 addrspace(1)* %out, i32 %a) #0 {
+  %b = and i32 %a, 448
+  %c = lshr i32 %b, 6
+  store i32 %c, i32 addrspace(1)* %out, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}and_lshr2:
+; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x30006
+; GCN: buffer_store_dword
+define amdgpu_kernel void @and_lshr2(i32 addrspace(1)* %out, i32 %a) #0 {
+  %b = and i32 %a, 511
+  %c = lshr i32 %b, 6
+  store i32 %c, i32 addrspace(1)* %out, align 8
+  ret void
+}
+
+; GCN-LABEL: {{^}}shl_lshr:
+; GCN: s_bfe_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x150002
+; GCN: buffer_store_dword
+define amdgpu_kernel void @shl_lshr(i32 addrspace(1)* %out, i32 %a) #0 {
+  %b = shl i32 %a, 9
+  %c = lshr i32 %b, 11
+  store i32 %c, i32 addrspace(1)* %out, align 8
+  ret void
+}
+
+declare i32 @llvm.amdgcn.ubfe.i32(i32, i32, i32) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll
index e85179755371e6cb0254d7ad938dbe59aa02f130..e305f8eff58725ef837d10224c4c1acd0fdf7a37 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.wave.barrier.ll
@@ -4,7 +4,7 @@
 ; GCN-DAG: ; wave barrier
 ; GCN-NOT: s_barrier
 
-define void @test_wave_barrier() #0 {
+define amdgpu_kernel void @test_wave_barrier() #0 {
 entry:
   call void @llvm.amdgcn.wave.barrier() #1
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
index 58529b874442ae78ea67798b91d13a7e7bf70cc0..349e7f0f0e8da3c9fbb82cc39277bc7632f5ea00 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.workgroup.id.ll
@@ -34,7 +34,7 @@ declare i32 @llvm.amdgcn.workgroup.id.z() #0
 ; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
 ; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
 ; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @test_workgroup_id_x(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workgroup_id_x(i32 addrspace(1)* %out) #1 {
   %id = call i32 @llvm.amdgcn.workgroup.id.x()
   store i32 %id, i32 addrspace(1)* %out
   ret void
@@ -61,7 +61,7 @@ define void @test_workgroup_id_x(i32 addrspace(1)* %out) #1 {
 ; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
 ; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 0
 ; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @test_workgroup_id_y(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workgroup_id_y(i32 addrspace(1)* %out) #1 {
   %id = call i32 @llvm.amdgcn.workgroup.id.y()
   store i32 %id, i32 addrspace(1)* %out
   ret void
@@ -96,7 +96,7 @@ define void @test_workgroup_id_y(i32 addrspace(1)* %out) #1 {
 ; ALL: COMPUTE_PGM_RSRC2:TGID_Y_EN: 0
 ; ALL: COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
 ; ALL: COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
-define void @test_workgroup_id_z(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workgroup_id_z(i32 addrspace(1)* %out) #1 {
   %id = call i32 @llvm.amdgcn.workgroup.id.z()
   store i32 %id, i32 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
index 1f18173f40a4b143b20a2796f696c38fd95481d3..8b80998cab6fe90f1bae91f2ba69864072555db0 100644
--- a/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
+++ b/test/CodeGen/AMDGPU/llvm.amdgcn.workitem.id.ll
@@ -18,7 +18,7 @@ declare i32 @llvm.amdgcn.workitem.id.z() #0
 
 ; ALL-NOT: v0
 ; ALL: {{buffer|flat}}_store_dword {{.*}}v0
-define void @test_workitem_id_x(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workitem_id_x(i32 addrspace(1)* %out) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   store i32 %id, i32 addrspace(1)* %out
   ret void
@@ -33,7 +33,7 @@ define void @test_workitem_id_x(i32 addrspace(1)* %out) #1 {
 
 ; ALL-NOT: v1
 ; ALL: {{buffer|flat}}_store_dword {{.*}}v1
-define void @test_workitem_id_y(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workitem_id_y(i32 addrspace(1)* %out) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.y()
   store i32 %id, i32 addrspace(1)* %out
   ret void
@@ -48,7 +48,7 @@ define void @test_workitem_id_y(i32 addrspace(1)* %out) #1 {
 
 ; ALL-NOT: v2
 ; ALL: {{buffer|flat}}_store_dword {{.*}}v2
-define void @test_workitem_id_z(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_workitem_id_z(i32 addrspace(1)* %out) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.z()
   store i32 %id, i32 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.ceil.f16.ll b/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
index 112e29ed22a7c88ebc69970f60e69aaa77597d9e..0604a49372a2b6585a06c732a85f06bcb3968d1e 100644
--- a/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.ceil.f16.ll
@@ -4,7 +4,7 @@
 declare half @llvm.ceil.f16(half %a)
 declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a)
 
-; GCN-LABEL: {{^}}ceil_f16
+; GCN-LABEL: {{^}}ceil_f16:
 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
 ; SI:  v_ceil_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]]
@@ -12,7 +12,7 @@ declare <2 x half> @llvm.ceil.v2f16(<2 x half> %a)
 ; VI:  v_ceil_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @ceil_f16(
+define amdgpu_kernel void @ceil_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -22,23 +22,27 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}ceil_v2f16
+; GCN-LABEL: {{^}}ceil_v2f16:
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI:  v_ceil_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_ceil_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_ceil_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI:  v_ceil_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: and
+; SI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG:  v_ceil_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG:  v_ceil_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT: and
+; VI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @ceil_v2f16(
+define amdgpu_kernel void @ceil_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.cos.f16.ll b/test/CodeGen/AMDGPU/llvm.cos.f16.ll
index ba354ed0b124d34cbc2466d8be9f8644c59bbcce..d836ea36ef6320e94e67d8e202652f88614e33b1 100644
--- a/test/CodeGen/AMDGPU/llvm.cos.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.cos.f16.ll
@@ -13,7 +13,7 @@ declare <2 x half> @llvm.cos.v2f16(<2 x half> %a)
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @cos_f16(
+define amdgpu_kernel void @cos_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -25,26 +25,34 @@ entry:
 
 ; GCN-LABEL: {{^}}cos_v2f16
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; SI:  v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}}
-; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]]
-; VI:  v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]]
-; GCN: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
-; SI:  v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]]
-; VI:  v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
-; GCN: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
-; GCN: v_cos_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
-; GCN: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
-; GCN: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; GCN: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG:  v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}}
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG:  v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]]
+; SI-DAG:  v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]]
+
+; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]]
+; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
+
+; GCN-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
+; GCN-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
+; GCN-DAG: v_cos_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
+; GCN-DAG: v_cos_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
+
+; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; VI-DAG:  v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; GCN-NOT: and
+
+; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+; VI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @cos_v2f16(
+define amdgpu_kernel void @cos_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.cos.ll b/test/CodeGen/AMDGPU/llvm.cos.ll
index eb7dcbbf234609a6b9de4c27b6102d292762065e..bd89502d7b829e408ea55b527bec1054a84cc37d 100644
--- a/test/CodeGen/AMDGPU/llvm.cos.ll
+++ b/test/CodeGen/AMDGPU/llvm.cos.ll
@@ -11,7 +11,7 @@
 ;SI: v_cos_f32
 ;SI-NOT: v_cos_f32
 
-define void @test(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @test(float addrspace(1)* %out, float %x) #1 {
    %cos = call float @llvm.cos.f32(float %x)
    store float %cos, float addrspace(1)* %out
    ret void
@@ -29,7 +29,7 @@ define void @test(float addrspace(1)* %out, float %x) #1 {
 ;SI: v_cos_f32
 ;SI-NOT: v_cos_f32
 
-define void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 {
+define amdgpu_kernel void @testv(<4 x float> addrspace(1)* %out, <4 x float> inreg %vx) #1 {
    %cos = call <4 x float> @llvm.cos.v4f32(<4 x float> %vx)
    store <4 x float> %cos, <4 x float> addrspace(1)* %out
    ret void
diff --git a/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/test/CodeGen/AMDGPU/llvm.dbg.value.ll
index 8b0854c2c2b555d80202791ea8aa29867814bf00..c4a76de5989c21533da25c591ded2a723d6879a6 100644
--- a/test/CodeGen/AMDGPU/llvm.dbg.value.ll
+++ b/test/CodeGen/AMDGPU/llvm.dbg.value.ll
@@ -9,7 +9,7 @@
 
 ; CHECK: buffer_store_dword
 ; CHECK: s_endpgm
-define void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 {
+define amdgpu_kernel void @test_debug_value(i32 addrspace(1)* nocapture %globalptr_arg) #0 !dbg !4 {
 entry:
   tail call void @llvm.dbg.value(metadata i32 addrspace(1)* %globalptr_arg, i64 0, metadata !10, metadata !13), !dbg !14
   store i32 123, i32 addrspace(1)* %globalptr_arg, align 4
diff --git a/test/CodeGen/AMDGPU/llvm.exp2.f16.ll b/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
index 7fa56911efdce67f357819c05d0b54dcea7dc3ed..5757142b9e9544c78aee6bc61449c6db505c4add 100644
--- a/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.exp2.f16.ll
@@ -12,7 +12,7 @@ declare <2 x half> @llvm.exp2.v2f16(<2 x half> %a)
 ; VI:  v_exp_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @exp2_f16(
+define amdgpu_kernel void @exp2_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -25,20 +25,24 @@ entry:
 ; GCN-LABEL: {{^}}exp2_v2f16
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI:  v_exp_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_exp_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_exp_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI:  v_exp_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: and
+; SI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_exp_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_exp_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT: and
+; VI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @exp2_v2f16(
+define amdgpu_kernel void @exp2_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.exp2.ll b/test/CodeGen/AMDGPU/llvm.exp2.ll
index 42698925aae422d05a64745ae646104fa608056c..387dc3b8566aa74f24ce70c61d866455bc571ced 100644
--- a/test/CodeGen/AMDGPU/llvm.exp2.ll
+++ b/test/CodeGen/AMDGPU/llvm.exp2.ll
@@ -1,6 +1,6 @@
 ;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s --check-prefix=EG --check-prefix=FUNC
 ;RUN: llc < %s -march=r600 -mcpu=cayman | FileCheck %s --check-prefix=CM --check-prefix=FUNC
-;RUN: llc < %s -march=amdgcn -mcpu=SI | FileCheck %s --check-prefix=SI --check-prefix=FUNC
+;RUN: llc < %s -march=amdgcn | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 ;RUN: llc < %s -march=amdgcn -mcpu=tonga | FileCheck %s --check-prefix=SI --check-prefix=FUNC
 
 ;FUNC-LABEL: {{^}}test:
@@ -11,7 +11,7 @@
 ;CM-DAG: EXP_IEEE T{{[0-9]+\.[XYZW]}}
 ;SI: v_exp_f32
 
-define void @test(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @test(float addrspace(1)* %out, float %in) {
 entry:
    %0 = call float @llvm.exp2.f32(float %in)
    store float %0, float addrspace(1)* %out
@@ -34,7 +34,7 @@ entry:
 ;SI: v_exp_f32
 ;SI: v_exp_f32
 
-define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
   %0 = call <2 x float> @llvm.exp2.v2f32(<2 x float> %in)
   store <2 x float> %0, <2 x float> addrspace(1)* %out
@@ -68,7 +68,7 @@ entry:
 ;SI: v_exp_f32
 ;SI: v_exp_f32
 ;SI: v_exp_f32
-define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %0 = call <4 x float> @llvm.exp2.v4f32(<4 x float> %in)
   store <4 x float> %0, <4 x float> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/llvm.floor.f16.ll b/test/CodeGen/AMDGPU/llvm.floor.f16.ll
index 60dfd734ee73d1222456cf9c736552c63515a9f0..6a18141d8035e23aace6901a4381dcb11591fe2e 100644
--- a/test/CodeGen/AMDGPU/llvm.floor.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.floor.f16.ll
@@ -12,7 +12,7 @@ declare <2 x half> @llvm.floor.v2f16(<2 x half> %a)
 ; VI:  v_floor_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @floor_f16(
+define amdgpu_kernel void @floor_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -25,20 +25,24 @@ entry:
 ; GCN-LABEL: {{^}}floor_v2f16
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI:  v_floor_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_floor_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI:  v_floor_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: and
+; SI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_floor_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_floor_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT: and
+; VI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @floor_v2f16(
+define amdgpu_kernel void @floor_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.fma.f16.ll b/test/CodeGen/AMDGPU/llvm.fma.f16.ll
index f818ac38ee56b35454e0f2baf3ec4fce9c655eee..518fe8baaa7a1cfe84d8c5072141cb5ef02c687e 100644
--- a/test/CodeGen/AMDGPU/llvm.fma.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.fma.f16.ll
@@ -16,7 +16,7 @@ declare <2 x half> @llvm.fma.v2f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
 ; VI:  v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fma_f16(
+define amdgpu_kernel void @fma_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -42,7 +42,7 @@ define void @fma_f16(
 ; VI:  v_fma_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fma_f16_imm_a(
+define amdgpu_kernel void @fma_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b,
     half addrspace(1)* %c) {
@@ -65,7 +65,7 @@ define void @fma_f16_imm_a(
 ; VI:  v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fma_f16_imm_b(
+define amdgpu_kernel void @fma_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %c) {
@@ -88,7 +88,7 @@ define void @fma_f16_imm_b(
 ; VI:  v_fma_f16 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]], v[[C_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @fma_f16_imm_c(
+define amdgpu_kernel void @fma_f16_imm_c(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -103,27 +103,35 @@ define void @fma_f16_imm_c(
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 ; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+
+; SI: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI: v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
+; SI: v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
 ; SI:  v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
-; SI:  v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
-; VI:  v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG:  v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32_0]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32_1]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+
+; VI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; VI: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; VI-DAG: v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
+; VI-DAG: v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
+
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN-NOT: and
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fma_v2f16(
+define amdgpu_kernel void @fma_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -136,29 +144,33 @@ define void @fma_v2f16(
   ret void
 }
 
-; GCN-LABEL: {{^}}fma_v2f16_imm_a
+; GCN-LABEL: {{^}}fma_v2f16_imm_a:
 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 ; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+
 ; SI:  v_mov_b32_e32 v[[A_F32:[0-9]+]], 0x40400000{{$}}
 ; VI:  v_mov_b32_e32 v[[A_F16:[0-9]+]], 0x4200{{$}}
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+; GCN-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; GCN-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+
+; SI-DAG:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
 ; SI:  v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
 ; SI:  v_fma_f32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32]], v[[C_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_fma_f16 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_F16]], v[[C_V2_F16]]
-; VI:  v_fma_f16 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16]], v[[C_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32]], v[[C_F32_1]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+
+; VI-DAG:  v_fma_f16 v[[R_F16_1:[0-9]+]], v[[C_F16_1]], v[[A_F16]], v[[B_F16_1]]
+; VI-DAG:  v_fma_f16 v[[R_F16_0:[0-9]+]], v[[C_V2_F16]], v[[A_F16]], v[[B_V2_F16]]
+
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN-NOT: and
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fma_v2f16_imm_a(
+define amdgpu_kernel void @fma_v2f16_imm_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b,
     <2 x half> addrspace(1)* %c) {
@@ -169,29 +181,39 @@ define void @fma_v2f16_imm_a(
   ret void
 }
 
-; GCN-LABEL: {{^}}fma_v2f16_imm_b
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; GCN-LABEL: {{^}}fma_v2f16_imm_b:
+; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; SI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+
+; VI: buffer_load_dword v[[C_V2_F16:[0-9]+]]
+; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+
 ; SI:  v_mov_b32_e32 v[[B_F32:[0-9]+]], 0x40400000{{$}}
 ; VI:  v_mov_b32_e32 v[[B_F16:[0-9]+]], 0x4200{{$}}
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
-; SI:  v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32]], v[[C_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32]], v[[C_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_F16]], v[[C_V2_F16]]
-; VI:  v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16]], v[[C_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+; SI:  v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
+; SI-DAG:  v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32]], v[[C_F32_0]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32]], v[[C_F32_1]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+
+; VI-DAG:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI-DAG:  v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; VI-DAG:  v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_F16]], v[[C_V2_F16]]
+; VI-DAG:  v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16]], v[[C_F16_1]]
+
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN-NOT: and
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fma_v2f16_imm_b(
+define amdgpu_kernel void @fma_v2f16_imm_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %c) {
@@ -202,29 +224,39 @@ define void @fma_v2f16_imm_b(
   ret void
 }
 
-; GCN-LABEL: {{^}}fma_v2f16_imm_c
-; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; GCN-LABEL: {{^}}fma_v2f16_imm_c:
+; SI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+; SI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+
+; VI: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+; VI: buffer_load_dword v[[A_V2_F16:[0-9]+]]
+
 ; SI:  v_mov_b32_e32 v[[C_F32:[0-9]+]], 0x40400000{{$}}
 ; VI:  v_mov_b32_e32 v[[C_F16:[0-9]+]], 0x4200{{$}}
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
 ; SI:  v_fma_f32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]], v[[B_F32_0]], v[[C_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]]
-; VI:  v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG:  v_fma_f32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]], v[[B_F32_1]], v[[C_F32]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; R_F16_HI is captured under the common prefix so the shared v_or check resolves on VI as well as SI.
+; VI-DAG:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI-DAG:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; VI-DAG:  v_fma_f16 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_F16]]
+; VI-DAG:  v_fma_f16 v[[R_F16_1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16]]
+
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN-NOT: and
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fma_v2f16_imm_c(
+define amdgpu_kernel void @fma_v2f16_imm_c(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
diff --git a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
index 35cb092528c0ac4b101bc78520f7b77c952af87d..f30fd1d5820436f8d64a8b4d49d42433778d54ab 100644
--- a/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.fmuladd.f16.ll
@@ -24,7 +24,7 @@ declare <2 x half> @llvm.fmuladd.v2f16(<2 x half> %a, <2 x half> %b, <2 x half>
 ; VI-DENORM: buffer_store_short [[RESULT]]
 
 ; GCN: s_endpgm
-define void @fmuladd_f16(
+define amdgpu_kernel void @fmuladd_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -54,7 +54,7 @@ define void @fmuladd_f16(
 ; VI-DENORM: buffer_store_short [[RESULT]]
 
 ; GCN: s_endpgm
-define void @fmuladd_f16_imm_a(
+define amdgpu_kernel void @fmuladd_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b,
     half addrspace(1)* %c) {
@@ -83,7 +83,7 @@ define void @fmuladd_f16_imm_a(
 
 
 ; GCN: s_endpgm
-define void @fmuladd_f16_imm_b(
+define amdgpu_kernel void @fmuladd_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %c) {
@@ -98,38 +98,45 @@ define void @fmuladd_f16_imm_b(
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
 ; GCN: buffer_load_dword v[[C_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
 ; SI:  v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
 ; SI:  v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[C_F32_0]]
 ; SI:  v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
-; SI:  v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
+; SI:  v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
 ; SI:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-
-
-; FIXME: and should be unnecessary
-; VI-FLUSH: v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI-FLUSH: v_mac_f16_e32 v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]]
-; VI-FLUSH: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[C_V2_F16]]
-; VI-FLUSH: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
-
+; SI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+
+; VI-FLUSH:     v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI-FLUSH-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[C_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-FLUSH-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[C_V2_F16]], v[[B_V2_F16]]
+; VI-FLUSH-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]]
+; VI-FLUSH-NOT: v_and_b32
+; VI-FLUSH:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[A_V2_F16]]
+
+; VI-DENORM: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI-DENORM: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; VI-DENORM: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
 ; VI-DENORM-DAG: v_fma_f16 v[[RES0:[0-9]+]], v[[A_V2_F16]], v[[B_V2_F16]], v[[C_V2_F16]]
 ; VI-DENORM-DAG: v_fma_f16 v[[RES1:[0-9]+]], v[[A_F16_1]], v[[B_F16_1]], v[[C_F16_1]]
-; VI-DENORM: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[RES0]]
-; VI-DENORM: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
+; VI-DENORM-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[RES1]]
+; VI-DENORM-NOT: v_and_b32
+; VI-DENORM: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[RES0]]
 
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @fmuladd_v2f16(
+
+define amdgpu_kernel void @fmuladd_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
diff --git a/test/CodeGen/AMDGPU/llvm.log2.f16.ll b/test/CodeGen/AMDGPU/llvm.log2.f16.ll
index 8d1a8973cb4e36bcae7cd223fb022cc7e7753ba7..773eb55283e44d46f46c54566f4ef13b1fb2b932 100644
--- a/test/CodeGen/AMDGPU/llvm.log2.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.log2.f16.ll
@@ -12,7 +12,7 @@ declare <2 x half> @llvm.log2.v2f16(<2 x half> %a)
 ; VI:  v_log_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @log2_f16(
+define amdgpu_kernel void @log2_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -24,21 +24,25 @@ entry:
 
 ; GCN-LABEL: {{^}}log2_v2f16
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
-; SI:  v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI:  v_log_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG:  v_log_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
+; SI-DAG:  v_log_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT:  and
+; SI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG:  v_log_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG:  v_log_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT:  and
+; VI:      v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @log2_v2f16(
+define amdgpu_kernel void @log2_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.log2.ll b/test/CodeGen/AMDGPU/llvm.log2.ll
index c75e7850b353e73bad6f55912ba1db37959edc4a..b9d593e43f32c6d793053533d4aac4c46bd3b33f 100644
--- a/test/CodeGen/AMDGPU/llvm.log2.ll
+++ b/test/CodeGen/AMDGPU/llvm.log2.ll
@@ -11,7 +11,7 @@
 ;CM-DAG: LOG_IEEE T{{[0-9]+\.[XYZW]}}
 ;SI: v_log_f32
 
-define void @test(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @test(float addrspace(1)* %out, float %in) {
 entry:
    %0 = call float @llvm.log2.f32(float %in)
    store float %0, float addrspace(1)* %out
@@ -34,7 +34,7 @@ entry:
 ;SI: v_log_f32
 ;SI: v_log_f32
 
-define void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @testv2(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
   %0 = call <2 x float> @llvm.log2.v2f32(<2 x float> %in)
   store <2 x float> %0, <2 x float> addrspace(1)* %out
@@ -68,7 +68,7 @@ entry:
 ;SI: v_log_f32
 ;SI: v_log_f32
 ;SI: v_log_f32
-define void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @testv4(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %0 = call <4 x float> @llvm.log2.v4f32(<4 x float> %in)
   store <4 x float> %0, <4 x float> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index 668aaf1178bd0d711b2b52e7811d7cab189d8509..4c8dff52509a203de69035c0450d967986a7d928 100644
--- a/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -14,7 +14,7 @@ declare <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> %b)
 ; VI:  v_max_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @maxnum_f16(
+define amdgpu_kernel void @maxnum_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -34,7 +34,7 @@ entry:
 ; VI:  v_max_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @maxnum_f16_imm_a(
+define amdgpu_kernel void @maxnum_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -52,7 +52,7 @@ entry:
 ; VI:  v_max_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @maxnum_f16_imm_b(
+define amdgpu_kernel void @maxnum_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -65,24 +65,30 @@ entry:
 ; GCN-LABEL: {{^}}maxnum_v2f16:
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
 ; SI:  v_max_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI:  v_max_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_max_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI:     v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: and
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_max_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_max_f16_sdwa v[[R_F16_1:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NOT: and
+; VI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @maxnum_v2f16(
+define amdgpu_kernel void @maxnum_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -103,14 +109,15 @@ entry:
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_max_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
-; VI:  v_max_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; VI-DAG:  v_max_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
+; VI-DAG:  v_max_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
+
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN-NOT: and
+; GCN:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @maxnum_v2f16_imm_a(
+define amdgpu_kernel void @maxnum_v2f16_imm_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b) {
 entry:
@@ -129,14 +136,15 @@ entry:
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_max_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
-; VI:  v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; VI-DAG:  v_max_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
+; VI-DAG:  v_max_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
+
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN-NOT: and
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @maxnum_v2f16_imm_b(
+define amdgpu_kernel void @maxnum_v2f16_imm_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.memcpy.ll b/test/CodeGen/AMDGPU/llvm.memcpy.ll
index 009338d273f541005b8563c3c8144389ebfa9c65..7b4db55155eb38b204a180aa59fed84cb09cfc4d 100644
--- a/test/CodeGen/AMDGPU/llvm.memcpy.ll
+++ b/test/CodeGen/AMDGPU/llvm.memcpy.ll
@@ -80,7 +80,7 @@ declare void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* nocapture, i8 addrspace
 ; SI-DAG: ds_write_b8
 
 ; SI: s_endpgm
-define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
   %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 1, i1 false) nounwind
@@ -125,7 +125,7 @@ define void @test_small_memcpy_i64_lds_to_lds_align1(i64 addrspace(3)* noalias %
 ; SI-DAG: ds_write_b16
 
 ; SI: s_endpgm
-define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
   %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 2, i1 false) nounwind
@@ -144,7 +144,7 @@ define void @test_small_memcpy_i64_lds_to_lds_align2(i64 addrspace(3)* noalias %
 ; SI: ds_write2_b32
 
 ; SI: s_endpgm
-define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
   %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 4, i1 false) nounwind
@@ -161,7 +161,7 @@ define void @test_small_memcpy_i64_lds_to_lds_align4(i64 addrspace(3)* noalias %
 ; SI: ds_write2_b64
 
 ; SI-DAG: s_endpgm
-define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %out, i64 addrspace(3)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(3)* %in to i8 addrspace(3)*
   %bcout = bitcast i64 addrspace(3)* %out to i8 addrspace(3)*
   call void @llvm.memcpy.p3i8.p3i8.i32(i8 addrspace(3)* %bcout, i8 addrspace(3)* %bcin, i32 32, i32 8, i1 false) nounwind
@@ -238,7 +238,7 @@ define void @test_small_memcpy_i64_lds_to_lds_align8(i64 addrspace(3)* noalias %
 ; SI-DAG: buffer_store_byte
 
 ; SI: s_endpgm
-define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
   %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 1, i1 false) nounwind
@@ -281,7 +281,7 @@ define void @test_small_memcpy_i64_global_to_global_align1(i64 addrspace(1)* noa
 ; SI-DAG: buffer_store_short
 
 ; SI: s_endpgm
-define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
   %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 2, i1 false) nounwind
@@ -294,7 +294,7 @@ define void @test_small_memcpy_i64_global_to_global_align2(i64 addrspace(1)* noa
 ; SI: buffer_store_dwordx4
 ; SI: buffer_store_dwordx4
 ; SI: s_endpgm
-define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
   %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 4, i1 false) nounwind
@@ -307,7 +307,7 @@ define void @test_small_memcpy_i64_global_to_global_align4(i64 addrspace(1)* noa
 ; SI: buffer_store_dwordx4
 ; SI: buffer_store_dwordx4
 ; SI: s_endpgm
-define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
   %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 8, i1 false) nounwind
@@ -320,7 +320,7 @@ define void @test_small_memcpy_i64_global_to_global_align8(i64 addrspace(1)* noa
 ; SI: buffer_store_dwordx4
 ; SI: buffer_store_dwordx4
 ; SI: s_endpgm
-define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind {
   %bcin = bitcast i64 addrspace(1)* %in to i8 addrspace(1)*
   %bcout = bitcast i64 addrspace(1)* %out to i8 addrspace(1)*
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %bcout, i8 addrspace(1)* %bcin, i64 32, i32 16, i1 false) nounwind
@@ -340,7 +340,7 @@ define void @test_small_memcpy_i64_global_to_global_align16(i64 addrspace(1)* no
 ; SI-DAG: s_load_dwordx2
 ; SI-DAG: buffer_store_dwordx4
 ; SI-DAG: buffer_store_dwordx4
-define void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) nounwind {
+define amdgpu_kernel void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) nounwind {
   %str = bitcast [16 x i8] addrspace(2)* @hello.align4 to i8 addrspace(2)*
   call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* %out, i8 addrspace(2)* %str, i64 32, i32 4, i1 false)
   ret void
@@ -365,7 +365,7 @@ define void @test_memcpy_const_string_align4(i8 addrspace(1)* noalias %out) noun
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
-define void @test_memcpy_const_string_align1(i8 addrspace(1)* noalias %out) nounwind {
+define amdgpu_kernel void @test_memcpy_const_string_align1(i8 addrspace(1)* noalias %out) nounwind {
   %str = bitcast [16 x i8] addrspace(2)* @hello.align1 to i8 addrspace(2)*
   call void @llvm.memcpy.p1i8.p2i8.i64(i8 addrspace(1)* %out, i8 addrspace(2)* %str, i64 32, i32 1, i1 false)
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index 2ed8b941b457e6275777298e755c75d66f9081cb..b8221356b664197b02b2869caa33edc9549b3456 100644
--- a/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -14,7 +14,7 @@ declare <2 x half> @llvm.minnum.v2f16(<2 x half> %a, <2 x half> %b)
 ; VI:  v_min_f16_e32 v[[R_F16:[0-9]+]], v[[B_F16]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @minnum_f16(
+define amdgpu_kernel void @minnum_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -34,7 +34,7 @@ entry:
 ; VI:  v_min_f16_e32 v[[R_F16:[0-9]+]], 0x4200, v[[B_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @minnum_f16_imm_a(
+define amdgpu_kernel void @minnum_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b) {
 entry:
@@ -52,7 +52,7 @@ entry:
 ; VI:  v_min_f16_e32 v[[R_F16:[0-9]+]], 4.0, v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @minnum_f16_imm_b(
+define amdgpu_kernel void @minnum_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -65,24 +65,29 @@ entry:
 ; GCN-LABEL: {{^}}minnum_v2f16:
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
+; SI:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
 ; SI:  v_min_f32_e32 v[[R_F32_0:[0-9]+]], v[[B_F32_0]], v[[A_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI:  v_min_f16_e32 v[[R_F16_1:[0-9]+]], v[[B_F16_1]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_min_f32_e32 v[[R_F32_1:[0-9]+]], v[[B_F32_1]], v[[A_F32_1]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: and
+; SI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_min_f16_e32 v[[R_F16_0:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]]
+; VI-DAG: v_min_f16_sdwa v[[R_F16_1:[0-9]+]], v[[B_V2_F16]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-NOT: and
+; VI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @minnum_v2f16(
+define amdgpu_kernel void @minnum_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b) {
@@ -96,21 +101,25 @@ entry:
 
 ; GCN-LABEL: {{^}}minnum_v2f16_imm_a:
 ; GCN: buffer_load_dword v[[B_V2_F16:[0-9]+]]
+
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
 ; SI:  v_min_f32_e32 v[[R_F32_0:[0-9]+]], 0x40400000, v[[B_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; SI:  v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
-; VI:  v_min_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG:  v_min_f32_e32 v[[R_F32_1:[0-9]+]], 4.0, v[[B_F32_1]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+
+; VI:  v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; VI-DAG:  v_min_f16_e32 v[[R_F16_1:[0-9]+]], 4.0, v[[B_F16_1]]
+; VI-DAG:  v_min_f16_e32 v[[R_F16_0:[0-9]+]], 0x4200, v[[B_V2_F16]]
+
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN-NOT: and
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @minnum_v2f16_imm_a(
+define amdgpu_kernel void @minnum_v2f16_imm_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b) {
 entry:
@@ -129,14 +138,15 @@ entry:
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_min_f32_e32 v[[R_F32_1:[0-9]+]], 0x40400000, v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
-; VI:  v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; VI-DAG:  v_min_f16_e32 v[[R_F16_1:[0-9]+]], 0x4200, v[[A_F16_1]]
+; VI-DAG:  v_min_f16_e32 v[[R_F16_0:[0-9]+]], 4.0, v[[A_V2_F16]]
+
+; GCN-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; GCN-NOT: and
+; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @minnum_v2f16_imm_b(
+define amdgpu_kernel void @minnum_v2f16_imm_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll b/test/CodeGen/AMDGPU/llvm.r600.cube.ll
similarity index 95%
rename from test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll
rename to test/CodeGen/AMDGPU/llvm.r600.cube.ll
index 78b88122229bb96d88d4d96f65df22636f15bafe..b5a0de95acf50daacbb516372c7280256c32965b 100644
--- a/test/CodeGen/AMDGPU/llvm.AMDGPU.cube.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.cube.ll
@@ -22,7 +22,7 @@ main_body:
   %tmp12 = insertelement <4 x float> %tmp11, float %tmp7, i32 1
   %tmp13 = insertelement <4 x float> %tmp12, float %tmp10, i32 2
   %tmp14 = insertelement <4 x float> %tmp13, float 1.000000e+00, i32 3
-  %tmp15 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %tmp14)
+  %tmp15 = call <4 x float> @llvm.r600.cube(<4 x float> %tmp14)
   %tmp16 = extractelement <4 x float> %tmp15, i32 0
   %tmp17 = extractelement <4 x float> %tmp15, i32 1
   %tmp18 = extractelement <4 x float> %tmp15, i32 2
@@ -44,7 +44,7 @@ main_body:
 }
 
 ; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0
+declare <4 x float> @llvm.r600.cube(<4 x float>) #0
 
 ; Function Attrs: nounwind readnone
 declare float @llvm.fabs.f32(float) #0
diff --git a/test/CodeGen/AMDGPU/llvm.r600.dot4.ll b/test/CodeGen/AMDGPU/llvm.r600.dot4.ll
index 4db29c58385eb6c3f5b88ee9d3ef230742d4e1bc..de8a47741c9470ed6fe0b0919ef7526a4388dc5a 100644
--- a/test/CodeGen/AMDGPU/llvm.r600.dot4.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.dot4.ll
@@ -2,7 +2,7 @@
 
 declare float @llvm.r600.dot4(<4 x float>, <4 x float>) nounwind readnone
 
-define void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind {
+define amdgpu_kernel void @test_dp4(float addrspace(1)* %out, <4 x float> addrspace(1)* %a, <4 x float> addrspace(1)* %b) nounwind {
   %src0 = load <4 x float>, <4 x float> addrspace(1)* %a, align 16
   %src1 = load <4 x float>, <4 x float> addrspace(1)* %b, align 16
   %dp4 = call float @llvm.r600.dot4(<4 x float> %src0, <4 x float> %src1) nounwind readnone
diff --git a/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll b/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
index e4e6dd8e1069d5eaa956b4259ad907fb5b144494..93caafbb9524024062fd9a4cd131cff587b70e27 100644
--- a/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.group.barrier.ll
@@ -2,7 +2,7 @@
 
 ; EG-LABEL: {{^}}test_group_barrier:
 ; EG: GROUP_BARRIER
-define void @test_group_barrier(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_group_barrier(i32 addrspace(1)* %out) #0 {
 entry:
   %tmp = call i32 @llvm.r600.read.tidig.x()
   %tmp1 = getelementptr i32, i32 addrspace(1)* %out, i32 %tmp
diff --git a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
index a5b07e072fa589efc75b96adca8b7bb2a775d316..82c42601ef1e77368acb1462c70775d4433d88ec 100644
--- a/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.read.local.size.ll
@@ -14,7 +14,7 @@
 
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @local_size_x(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_x(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -29,7 +29,7 @@ entry:
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x1c
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @local_size_y(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_y(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -44,7 +44,7 @@ entry:
 ; VI-NOHSA: s_load_dword [[VAL:s[0-9]+]], s[0:1], 0x20
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @local_size_z(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_z(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.local.size.z() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -59,7 +59,7 @@ entry:
 ; GCN-DAG: v_mov_b32_e32 [[VY:v[0-9]+]], [[Y]]
 ; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VY]]
 ; GCN: buffer_store_dword [[VAL]]
-define void @local_size_xy(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_xy(i32 addrspace(1)* %out) {
 entry:
   %x = call i32 @llvm.r600.read.local.size.x() #0
   %y = call i32 @llvm.r600.read.local.size.y() #0
@@ -78,7 +78,7 @@ entry:
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[X]], [[VZ]]
 ; GCN: buffer_store_dword [[VAL]]
-define void @local_size_xz(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_xz(i32 addrspace(1)* %out) {
 entry:
   %x = call i32 @llvm.r600.read.local.size.x() #0
   %z = call i32 @llvm.r600.read.local.size.z() #0
@@ -98,7 +98,7 @@ entry:
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_mul_u32_u24_e32 [[VAL:v[0-9]+]], [[Y]], [[VZ]]
 ; GCN: buffer_store_dword [[VAL]]
-define void @local_size_yz(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_yz(i32 addrspace(1)* %out) {
 entry:
   %y = call i32 @llvm.r600.read.local.size.y() #0
   %z = call i32 @llvm.r600.read.local.size.z() #0
@@ -121,7 +121,7 @@ entry:
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_mad_u32_u24 [[VAL:v[0-9]+]], [[X]], [[VY]], [[VZ]]
 ; GCN: buffer_store_dword [[VAL]]
-define void @local_size_xyz(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_xyz(i32 addrspace(1)* %out) {
 entry:
   %x = call i32 @llvm.r600.read.local.size.x() #0
   %y = call i32 @llvm.r600.read.local.size.y() #0
@@ -138,7 +138,7 @@ entry:
 ; GCN-NOT: 0xffff
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN-NEXT: buffer_store_dword [[VVAL]]
-define void @local_size_x_known_bits(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_x_known_bits(i32 addrspace(1)* %out) {
 entry:
   %size = call i32 @llvm.r600.read.local.size.x() #0
   %shl = shl i32 %size, 16
@@ -153,7 +153,7 @@ entry:
 ; GCN-NOT: 0xffff
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN-NEXT: buffer_store_dword [[VVAL]]
-define void @local_size_y_known_bits(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_y_known_bits(i32 addrspace(1)* %out) {
 entry:
   %size = call i32 @llvm.r600.read.local.size.y() #0
   %shl = shl i32 %size, 16
@@ -168,7 +168,7 @@ entry:
 ; GCN-NOT: 0xffff
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN-NEXT: buffer_store_dword [[VVAL]]
-define void @local_size_z_known_bits(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @local_size_z_known_bits(i32 addrspace(1)* %out) {
 entry:
   %size = call i32 @llvm.r600.read.local.size.z() #0
   %shl = shl i32 %size, 16
diff --git a/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll
index 1c6e7950e9b712db2b241c75ed990951ac78261d..90d076d4fb4d8718bc70fd72dc835e01d979ed09 100644
--- a/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.clamped.ll
@@ -4,7 +4,7 @@ declare float @llvm.r600.recipsqrt.clamped.f32(float) nounwind readnone
 
 ; EG-LABEL: {{^}}rsq_clamped_f32:
 ; EG: RECIPSQRT_CLAMPED
-define void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind {
+define amdgpu_kernel void @rsq_clamped_f32(float addrspace(1)* %out, float %src) nounwind {
   %rsq_clamped = call float @llvm.r600.recipsqrt.clamped.f32(float %src)
   store float %rsq_clamped, float addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll
index 1d6bff01e6622cd8f8dfd7dad1e284ef2c66f837..d9177b39b8ac0b1db712dbfa26596ee8eb51c662 100644
--- a/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.recipsqrt.ieee.ll
@@ -4,7 +4,7 @@ declare float @llvm.r600.recipsqrt.ieee.f32(float) nounwind readnone
 
 ; EG-LABEL: {{^}}recipsqrt.ieee_f32:
 ; EG: RECIPSQRT_IEEE
-define void @recipsqrt.ieee_f32(float addrspace(1)* %out, float %src) nounwind {
+define amdgpu_kernel void @recipsqrt.ieee_f32(float addrspace(1)* %out, float %src) nounwind {
   %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float %src) nounwind readnone
   store float %recipsqrt.ieee, float addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@ define void @recipsqrt.ieee_f32(float addrspace(1)* %out, float %src) nounwind {
 ; TODO: Really these should be constant folded
 ; EG-LABEL: {{^}}recipsqrt.ieee_f32_constant_4.0
 ; EG: RECIPSQRT_IEEE
-define void @recipsqrt.ieee_f32_constant_4.0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @recipsqrt.ieee_f32_constant_4.0(float addrspace(1)* %out) nounwind {
   %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float 4.0) nounwind readnone
   store float %recipsqrt.ieee, float addrspace(1)* %out, align 4
   ret void
@@ -21,7 +21,7 @@ define void @recipsqrt.ieee_f32_constant_4.0(float addrspace(1)* %out) nounwind
 
 ; EG-LABEL: {{^}}recipsqrt.ieee_f32_constant_100.0
 ; EG: RECIPSQRT_IEEE
-define void @recipsqrt.ieee_f32_constant_100.0(float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @recipsqrt.ieee_f32_constant_100.0(float addrspace(1)* %out) nounwind {
   %recipsqrt.ieee = call float @llvm.r600.recipsqrt.ieee.f32(float 100.0) nounwind readnone
   store float %recipsqrt.ieee, float addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.r600.tex.ll b/test/CodeGen/AMDGPU/llvm.r600.tex.ll
index 409037f3e976cac4aa8ea0513cb630e7fcc6ebd3..98044917e2b0949fd36e98b5b6199e97d1c1fda2 100644
--- a/test/CodeGen/AMDGPU/llvm.r600.tex.ll
+++ b/test/CodeGen/AMDGPU/llvm.r600.tex.ll
@@ -17,7 +17,7 @@
 ;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNNN
 ;CHECK: TEX_SAMPLE T{{[0-9]+\.XYZW, T[0-9]+\.XYZW}} RID:0 SID:0 CT:NNUN
 
-define void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @test(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
 bb:
   %addr = load <4 x float>, <4 x float> addrspace(1)* %in
   %tmp = shufflevector <4 x float> %addr, <4 x float> %addr, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
diff --git a/test/CodeGen/AMDGPU/llvm.rint.f16.ll b/test/CodeGen/AMDGPU/llvm.rint.f16.ll
index 3657940f36fd09942ceb0be79f76ee048e4deeec..f56655630bebf61d638646e143ce282b8dfb173f 100644
--- a/test/CodeGen/AMDGPU/llvm.rint.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.rint.f16.ll
@@ -1,5 +1,6 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SIVI -check-prefix=VI -check-prefix=GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 %s
 
 declare half @llvm.rint.f16(half %a)
 declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)
@@ -9,10 +10,10 @@ declare <2 x half> @llvm.rint.v2f16(<2 x half> %a)
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32:[0-9]+]], v[[A_F16]]
 ; SI:  v_rndne_f32_e32 v[[R_F32:[0-9]+]], v[[A_F32]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
-; VI:  v_rndne_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
+; GFX89: v_rndne_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @rint_f16(
+define amdgpu_kernel void @rint_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -25,20 +26,29 @@ entry:
 ; GCN-LABEL: {{^}}rint_v2f16
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI:  v_rndne_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_rndne_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI:  v_rndne_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: v_and_b32
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_rndne_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT: v_and_b32
+; VI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
+; GFX9: v_rndne_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; GFX9: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; GFX9: v_rndne_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
+; GFX9: v_pack_b32_f16 v[[R_V2_F16:[0-9]+]], v[[R_F16_0]], v[[R_F16_1]]
+
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @rint_v2f16(
+define amdgpu_kernel void @rint_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.rint.f64.ll b/test/CodeGen/AMDGPU/llvm.rint.f64.ll
index c63fb172794030dddf15168896fe923d5f738c91..30ce8ed83ff183c20cfb8b800d914da99c2bd822 100644
--- a/test/CodeGen/AMDGPU/llvm.rint.f64.ll
+++ b/test/CodeGen/AMDGPU/llvm.rint.f64.ll
@@ -11,7 +11,7 @@
 ; SI: v_cndmask_b32
 ; SI: v_cndmask_b32
 ; SI: s_endpgm
-define void @rint_f64(double addrspace(1)* %out, double %in) {
+define amdgpu_kernel void @rint_f64(double addrspace(1)* %out, double %in) {
 entry:
   %0 = call double @llvm.rint.f64(double %in)
   store double %0, double addrspace(1)* %out
@@ -21,7 +21,7 @@ entry:
 ; FUNC-LABEL: {{^}}rint_v2f64:
 ; CI: v_rndne_f64_e32
 ; CI: v_rndne_f64_e32
-define void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
+define amdgpu_kernel void @rint_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) {
 entry:
   %0 = call <2 x double> @llvm.rint.v2f64(<2 x double> %in)
   store <2 x double> %0, <2 x double> addrspace(1)* %out
@@ -33,7 +33,7 @@ entry:
 ; CI: v_rndne_f64_e32
 ; CI: v_rndne_f64_e32
 ; CI: v_rndne_f64_e32
-define void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
+define amdgpu_kernel void @rint_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) {
 entry:
   %0 = call <4 x double> @llvm.rint.v4f64(<4 x double> %in)
   store <4 x double> %0, <4 x double> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/llvm.rint.ll b/test/CodeGen/AMDGPU/llvm.rint.ll
index cf7c0e4c6fb6ee318ae0d5ef8669b4261c3a3570..4056bc39448d5f5cae79b5080542ee03886939b8 100644
--- a/test/CodeGen/AMDGPU/llvm.rint.ll
+++ b/test/CodeGen/AMDGPU/llvm.rint.ll
@@ -6,7 +6,7 @@
 ; R600: RNDNE
 
 ; SI: v_rndne_f32_e32
-define void @rint_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @rint_f32(float addrspace(1)* %out, float %in) {
 entry:
   %0 = call float @llvm.rint.f32(float %in) #0
   store float %0, float addrspace(1)* %out
@@ -19,7 +19,7 @@ entry:
 
 ; SI: v_rndne_f32_e32
 ; SI: v_rndne_f32_e32
-define void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
+define amdgpu_kernel void @rint_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) {
 entry:
   %0 = call <2 x float> @llvm.rint.v2f32(<2 x float> %in) #0
   store <2 x float> %0, <2 x float> addrspace(1)* %out
@@ -36,7 +36,7 @@ entry:
 ; SI: v_rndne_f32_e32
 ; SI: v_rndne_f32_e32
 ; SI: v_rndne_f32_e32
-define void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
+define amdgpu_kernel void @rint_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) {
 entry:
   %0 = call <4 x float> @llvm.rint.v4f32(<4 x float> %in) #0
   store <4 x float> %0, <4 x float> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/llvm.round.f64.ll b/test/CodeGen/AMDGPU/llvm.round.f64.ll
index 3ea4551f0ee730ebd7451437b40056357053394f..c58b9b4d9e9449bd5eab0d5553e2ee2960352618 100644
--- a/test/CodeGen/AMDGPU/llvm.round.f64.ll
+++ b/test/CodeGen/AMDGPU/llvm.round.f64.ll
@@ -2,7 +2,7 @@
 
 ; FUNC-LABEL: {{^}}round_f64:
 ; SI: s_endpgm
-define void @round_f64(double addrspace(1)* %out, double %x) #0 {
+define amdgpu_kernel void @round_f64(double addrspace(1)* %out, double %x) #0 {
   %result = call double @llvm.round.f64(double %x) #1
   store double %result, double addrspace(1)* %out
   ret void
@@ -26,7 +26,7 @@ define void @round_f64(double addrspace(1)* %out, double %x) #0 {
 
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
-define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %gep = getelementptr double, double addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr double, double addrspace(1)* %out, i32 %tid
@@ -38,7 +38,7 @@ define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0
 
 ; FUNC-LABEL: {{^}}round_v2f64:
 ; SI: s_endpgm
-define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
+define amdgpu_kernel void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
   %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1
   store <2 x double> %result, <2 x double> addrspace(1)* %out
   ret void
@@ -46,7 +46,7 @@ define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 {
 
 ; FUNC-LABEL: {{^}}round_v4f64:
 ; SI: s_endpgm
-define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
+define amdgpu_kernel void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
   %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1
   store <4 x double> %result, <4 x double> addrspace(1)* %out
   ret void
@@ -54,7 +54,7 @@ define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 {
 
 ; FUNC-LABEL: {{^}}round_v8f64:
 ; SI: s_endpgm
-define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
+define amdgpu_kernel void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 {
   %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1
   store <8 x double> %result, <8 x double> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/llvm.round.ll b/test/CodeGen/AMDGPU/llvm.round.ll
index 7e8f8ff172e8fcc2b8e3c723f90a2ca9802f1ee7..d211ad8ec9f4aa67650e415866d3247e0e3c253a 100644
--- a/test/CodeGen/AMDGPU/llvm.round.ll
+++ b/test/CodeGen/AMDGPU/llvm.round.ll
@@ -1,18 +1,19 @@
-; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=-flat-for-global < %s | FileCheck -check-prefix=GCN -check-prefix=GFX89 -check-prefix=GFX9 -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}round_f32:
-; SI-DAG: s_load_dword [[SX:s[0-9]+]]
-; SI-DAG: s_brev_b32 [[K:s[0-9]+]], -2{{$}}
-; SI-DAG: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]]
-; SI-DAG: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
-; SI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]]
-; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]]
-; SI: v_cmp_ge_f32_e64 vcc, |[[SUB]]|, 0.5
-; SI: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[VX]]
-; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
-; SI: buffer_store_dword [[RESULT]]
+; GCN-DAG: s_load_dword [[SX:s[0-9]+]]
+; GCN-DAG: s_brev_b32 [[K:s[0-9]+]], -2{{$}}
+; GCN-DAG: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]]
+; GCN-DAG: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
+; GCN-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]]
+; GCN: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]]
+; GCN: v_cmp_ge_f32_e64 vcc, |[[SUB]]|, 0.5
+; GCN: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[VX]]
+; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
+; GCN: buffer_store_dword [[RESULT]]
 
 ; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]]
 ; R600-DAG: ADD  {{.*}},
@@ -20,7 +21,7 @@
 ; R600-DAG: SETGE
 ; R600-DAG: CNDE
 ; R600-DAG: ADD
-define void @round_f32(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @round_f32(float addrspace(1)* %out, float %x) #0 {
   %result = call float @llvm.round.f32(float %x) #1
   store float %result, float addrspace(1)* %out
   ret void
@@ -32,36 +33,77 @@ define void @round_f32(float addrspace(1)* %out, float %x) #0 {
 ; compiler doesn't crash.
 
 ; FUNC-LABEL: {{^}}round_v2f32:
-; SI: s_endpgm
+; GCN: s_endpgm
 ; R600: CF_END
-define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 {
+define amdgpu_kernel void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 {
   %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}round_v4f32:
-; SI: s_endpgm
+; GCN: s_endpgm
 ; R600: CF_END
-define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 {
+define amdgpu_kernel void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 {
   %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1
   store <4 x float> %result, <4 x float> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}round_v8f32:
-; SI: s_endpgm
+; GCN: s_endpgm
 ; R600: CF_END
-define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 {
+define amdgpu_kernel void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 {
   %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1
   store <8 x float> %result, <8 x float> addrspace(1)* %out
   ret void
 }
 
+; FUNC-LABEL: {{^}}round_f16:
+; GFX89-DAG: s_load_dword [[SX:s[0-9]+]]
+; GFX89-DAG: s_movk_i32 [[K:s[0-9]+]], 0x7fff{{$}}
+; GFX89-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]]
+; GFX89-DAG: v_mov_b32_e32 [[BFI_K:v[0-9]+]], 0x3c00
+; GFX89: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], [[BFI_K]], [[VX]]
+
+; GFX89: v_trunc_f16_e32 [[TRUNC:v[0-9]+]], [[SX]]
+; GFX89: v_sub_f16_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]]
+; GFX89: v_cmp_ge_f16_e64 vcc, |[[SUB]]|, 0.5
+; GFX89: v_cndmask_b32_e32 [[SEL:v[0-9]+]], 0, [[COPYSIGN]]
+; GFX89: v_add_f16_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]]
+; GFX89: buffer_store_short [[RESULT]]
+define amdgpu_kernel void @round_f16(half addrspace(1)* %out, i32 %x.arg) #0 {
+  %x.arg.trunc = trunc i32 %x.arg to i16
+  %x = bitcast i16 %x.arg.trunc to half
+  %result = call half @llvm.round.f16(half %x) #1
+  store half %result, half addrspace(1)* %out
+  ret void
+}
+
+; Should be scalarized
+; FUNC-LABEL: {{^}}round_v2f16:
+; GFX89-DAG: s_movk_i32 [[K:s[0-9]+]], 0x7fff{{$}}
+; GFX89-DAG: v_mov_b32_e32 [[BFI_K:v[0-9]+]], 0x3c00
+; GFX89: v_bfi_b32 [[COPYSIGN0:v[0-9]+]], [[K]], [[BFI_K]],
+; GFX89: v_bfi_b32 [[COPYSIGN1:v[0-9]+]], [[K]], [[BFI_K]],
+
+; GFX9: v_pack_b32_f16
+define amdgpu_kernel void @round_v2f16(<2 x half> addrspace(1)* %out, i32 %in.arg) #0 {
+  %in = bitcast i32 %in.arg to <2 x half>
+  %result = call <2 x half> @llvm.round.v2f16(<2 x half> %in)
+  store <2 x half> %result, <2 x half> addrspace(1)* %out
+  ret void
+}
+
 declare float @llvm.round.f32(float) #1
 declare <2 x float> @llvm.round.v2f32(<2 x float>) #1
 declare <4 x float> @llvm.round.v4f32(<4 x float>) #1
 declare <8 x float> @llvm.round.v8f32(<8 x float>) #1
 
+declare half @llvm.round.f16(half) #1
+declare <2 x half> @llvm.round.v2f16(<2 x half>) #1
+declare <4 x half> @llvm.round.v4f16(<4 x half>) #1
+declare <8 x half> @llvm.round.v8f16(<8 x half>) #1
+
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/llvm.sin.f16.ll b/test/CodeGen/AMDGPU/llvm.sin.f16.ll
index b01932f69b06c8aa47bc9bacd89297a692bc69da..eb1f32c981f88eae786d7a5c203e382812310f4c 100644
--- a/test/CodeGen/AMDGPU/llvm.sin.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.sin.f16.ll
@@ -13,7 +13,7 @@ declare <2 x half> @llvm.sin.v2f16(<2 x half> %a)
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[R_F32]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @sin_f16(
+define amdgpu_kernel void @sin_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -26,25 +26,35 @@ entry:
 ; GCN-LABEL: {{^}}sin_v2f16
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; SI:  v_mov_b32_e32 v[[HALF_PIE:[0-9]+]], 0x3e22f983{{$}}
-; GCN: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
-; SI:  v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]]
-; VI:  v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]]
-; GCN: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
-; SI:  v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]]
-; VI:  v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
-; GCN: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
-; GCN: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
-; GCN: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
-; GCN: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
-; GCN: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], v[[HALF_PIE]], v[[A_F32_0]]
+; SI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
+; SI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], v[[HALF_PIE]], v[[A_F32_1]]
+; SI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
+
+; VI-DAG: v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_cvt_f32_f16_sdwa v[[A_F32_1:[0-9]+]], v[[A_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-DAG: v_mul_f32_e32 v[[M_F32_0:[0-9]+]], 0.15915494, v[[A_F32_0]]
+; VI-DAG: v_mul_f32_e32 v[[M_F32_1:[0-9]+]], 0.15915494, v[[A_F32_1]]
+; VI-DAG: v_fract_f32_e32 v[[F_F32_0:[0-9]+]], v[[M_F32_0]]
+; VI-DAG: v_fract_f32_e32 v[[F_F32_1:[0-9]+]], v[[M_F32_1]]
+
+; GCN-DAG: v_sin_f32_e32 v[[R_F32_0:[0-9]+]], v[[F_F32_0]]
+; GCN-DAG: v_sin_f32_e32 v[[R_F32_1:[0-9]+]], v[[F_F32_1]]
+; GCN-DAG: v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
+
+; SI-DAG: v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
+; SI: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_cvt_f16_f32_sdwa v[[R_F16_1:[0-9]+]], v[[R_F32_1]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+; VI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @sin_v2f16(
+define amdgpu_kernel void @sin_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.sin.ll b/test/CodeGen/AMDGPU/llvm.sin.ll
index 04754396a0f76cfbbc8c0991176357e893ad22bb..2a17303267ba28cc04ab1e326f8dfc009865b41e 100644
--- a/test/CodeGen/AMDGPU/llvm.sin.ll
+++ b/test/CodeGen/AMDGPU/llvm.sin.ll
@@ -12,7 +12,7 @@
 ; SI: v_fract_f32
 ; SI: v_sin_f32
 ; SI-NOT: v_sin_f32
-define void @sin_f32(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @sin_f32(float addrspace(1)* %out, float %x) #1 {
    %sin = call float @llvm.sin.f32(float %x)
    store float %sin, float addrspace(1)* %out
    ret void
@@ -24,7 +24,7 @@ define void @sin_f32(float addrspace(1)* %out, float %x) #1 {
 ; SI: v_fract_f32
 ; SI: v_sin_f32
 ; SI-NOT: v_sin_f32
-define void @safe_sin_3x_f32(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @safe_sin_3x_f32(float addrspace(1)* %out, float %x) #1 {
   %y = fmul float 3.0, %x
   %sin = call float @llvm.sin.f32(float %y)
   store float %sin, float addrspace(1)* %out
@@ -38,7 +38,7 @@ define void @safe_sin_3x_f32(float addrspace(1)* %out, float %x) #1 {
 ; SI: v_fract_f32
 ; SI: v_sin_f32
 ; SI-NOT: v_sin_f32
-define void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x) #2 {
+define amdgpu_kernel void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x) #2 {
   %y = fmul float 3.0, %x
   %sin = call float @llvm.sin.f32(float %y)
   store float %sin, float addrspace(1)* %out
@@ -51,7 +51,7 @@ define void @unsafe_sin_3x_f32(float addrspace(1)* %out, float %x) #2 {
 ; SI: v_fract_f32
 ; SI: v_sin_f32
 ; SI-NOT: v_sin_f32
-define void @safe_sin_2x_f32(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @safe_sin_2x_f32(float addrspace(1)* %out, float %x) #1 {
   %y = fmul float 2.0, %x
   %sin = call float @llvm.sin.f32(float %y)
   store float %sin, float addrspace(1)* %out
@@ -65,7 +65,7 @@ define void @safe_sin_2x_f32(float addrspace(1)* %out, float %x) #1 {
 ; SI: v_fract_f32
 ; SI: v_sin_f32
 ; SI-NOT: v_sin_f32
-define void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x) #2 {
+define amdgpu_kernel void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x) #2 {
   %y = fmul float 2.0, %x
   %sin = call float @llvm.sin.f32(float %y)
   store float %sin, float addrspace(1)* %out
@@ -78,7 +78,7 @@ define void @unsafe_sin_2x_f32(float addrspace(1)* %out, float %x) #2 {
 ; SI: v_fract_f32
 ; SI: v_sin_f32
 ; SI-NOT: v_sin_f32
-define void @test_safe_2sin_f32(float addrspace(1)* %out, float %x) #1 {
+define amdgpu_kernel void @test_safe_2sin_f32(float addrspace(1)* %out, float %x) #1 {
    %y = fmul float 2.0, %x
    %sin = call float @llvm.sin.f32(float %y)
    store float %sin, float addrspace(1)* %out
@@ -91,7 +91,7 @@ define void @test_safe_2sin_f32(float addrspace(1)* %out, float %x) #1 {
 ; SI: v_fract_f32
 ; SI: v_sin_f32
 ; SI-NOT: v_sin_f32
-define void @test_unsafe_2sin_f32(float addrspace(1)* %out, float %x) #2 {
+define amdgpu_kernel void @test_unsafe_2sin_f32(float addrspace(1)* %out, float %x) #2 {
    %y = fmul float 2.0, %x
    %sin = call float @llvm.sin.f32(float %y)
    store float %sin, float addrspace(1)* %out
@@ -110,7 +110,7 @@ define void @test_unsafe_2sin_f32(float addrspace(1)* %out, float %x) #2 {
 ; SI: v_sin_f32
 ; SI: v_sin_f32
 ; SI-NOT: v_sin_f32
-define void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 {
+define amdgpu_kernel void @sin_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %vx) #1 {
    %sin = call <4 x float> @llvm.sin.v4f32( <4 x float> %vx)
    store <4 x float> %sin, <4 x float> addrspace(1)* %out
    ret void
diff --git a/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll b/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
index 69125b0bcfdc114c368eb570185ff205859675dd..46ee6526aca2f09505eacf02f17be5981a39536b 100644
--- a/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.sqrt.f16.ll
@@ -12,7 +12,7 @@ declare <2 x half> @llvm.sqrt.v2f16(<2 x half> %a)
 ; VI:  v_sqrt_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @sqrt_f16(
+define amdgpu_kernel void @sqrt_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -25,20 +25,24 @@ entry:
 ; GCN-LABEL: {{^}}sqrt_v2f16
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI:  v_sqrt_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_sqrt_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_sqrt_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI:  v_sqrt_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: v_and_b32
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_sqrt_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_sqrt_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT: v_and_b32
+; VI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @sqrt_v2f16(
+define amdgpu_kernel void @sqrt_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/llvm.trunc.f16.ll b/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
index 9f84b432209d50c2dd2cd3543b6290096db2920c..dc7182aa0d89a3b15d4bb24b35df01eb49c8602e 100644
--- a/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
+++ b/test/CodeGen/AMDGPU/llvm.trunc.f16.ll
@@ -12,7 +12,7 @@ declare <2 x half> @llvm.trunc.v2f16(<2 x half> %a)
 ; VI:  v_trunc_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @trunc_f16(
+define amdgpu_kernel void @trunc_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a) {
 entry:
@@ -25,20 +25,24 @@ entry:
 ; GCN-LABEL: {{^}}trunc_v2f16
 ; GCN: buffer_load_dword v[[A_V2_F16:[0-9]+]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; SI:  v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI:  v_trunc_f32_e32 v[[R_F32_0:[0-9]+]], v[[A_F32_0]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[R_F32_0]]
 ; SI:  v_trunc_f32_e32 v[[R_F32_1:[0-9]+]], v[[A_F32_1]]
 ; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[R_F32_1]]
-; VI:  v_trunc_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
-; VI:  v_trunc_f16_e32 v[[R_F16_1:[0-9]+]], v[[A_F16_1]]
-; GCN: v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
-; GCN: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; SI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
+; SI-NOT: v_and_b32
+; SI: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_0]]
+
+; VI-DAG: v_trunc_f16_e32 v[[R_F16_0:[0-9]+]], v[[A_V2_F16]]
+; VI-DAG: v_trunc_f16_sdwa v[[R_F16_1:[0-9]+]], v[[A_V2_F16]] dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1
+; VI-NOT: v_and_b32
+; VI:     v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_1]], v[[R_F16_0]]
+
 ; GCN: buffer_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @trunc_v2f16(
+define amdgpu_kernel void @trunc_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/load-constant-f64.ll b/test/CodeGen/AMDGPU/load-constant-f64.ll
index 1b42a9e96e010d76c44d3e67cfd5997155154c35..0050d1a4f87431def2da26b26ccfc73ad549bab8 100644
--- a/test/CodeGen/AMDGPU/load-constant-f64.ll
+++ b/test/CodeGen/AMDGPU/load-constant-f64.ll
@@ -6,7 +6,7 @@
 ; GCN: s_load_dwordx2 s[{{[0-9]+:[0-9]+}}]
 ; GCN-NOHSA: buffer_store_dwordx2
 ; GCN-HSA: flat_store_dwordx2
-define void @constant_load_f64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_f64(double addrspace(1)* %out, double addrspace(2)* %in) #0 {
   %ld = load double, double addrspace(2)* %in
   store double %ld, double addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/load-constant-i1.ll b/test/CodeGen/AMDGPU/load-constant-i1.ll
index 104af10036c15da26d70de8da6440d135fd57bd0..c8abe5c77ee5eac02bd0085f8f9c1da86d5ab7fe 100644
--- a/test/CodeGen/AMDGPU/load-constant-i1.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i1.ll
@@ -9,56 +9,56 @@
 
 ; EG: VTX_READ_8
 ; EG: AND_INT
-define void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_i1(i1 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
   %load = load i1, i1 addrspace(2)* %in
   store i1 %load, i1 addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v2i1:
-define void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
   store <2 x i1> %load, <2 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v3i1:
-define void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
   store <3 x i1> %load, <3 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v4i1:
-define void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
   store <4 x i1> %load, <4 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v8i1:
-define void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
   store <8 x i1> %load, <8 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v16i1:
-define void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
   store <16 x i1> %load, <16 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v32i1:
-define void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
   store <32 x i1> %load, <32 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}constant_load_v64i1:
-define void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
   store <64 x i1> %load, <64 x i1> addrspace(1)* %out
   ret void
@@ -67,7 +67,7 @@ define void @constant_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspa
 ; FUNC-LABEL: {{^}}constant_zextload_i1_to_i32:
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_dword
-define void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
   %a = load i1, i1 addrspace(2)* %in
   %ext = zext i1 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -81,7 +81,7 @@ define void @constant_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)
 
 ; EG: VTX_READ_8
 ; EG: BFE_INT
-define void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
   %a = load i1, i1 addrspace(2)* %in
   %ext = sext i1 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -89,7 +89,7 @@ define void @constant_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(2)
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i32:
-define void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
   %ext = zext <1 x i1> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -97,7 +97,7 @@ define void @constant_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i32:
-define void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
   %ext = sext <1 x i1> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -105,7 +105,7 @@ define void @constant_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i32:
-define void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
   %ext = zext <2 x i1> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -113,7 +113,7 @@ define void @constant_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i32:
-define void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
   %ext = sext <2 x i1> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -121,7 +121,7 @@ define void @constant_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i32:
-define void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
   %ext = zext <3 x i1> %load to <3 x i32>
   store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
@@ -129,7 +129,7 @@ define void @constant_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i32:
-define void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
   %ext = sext <3 x i1> %load to <3 x i32>
   store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
@@ -137,7 +137,7 @@ define void @constant_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i32:
-define void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
   %ext = zext <4 x i1> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -145,7 +145,7 @@ define void @constant_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i32:
-define void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
   %ext = sext <4 x i1> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -153,7 +153,7 @@ define void @constant_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i32:
-define void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
   %ext = zext <8 x i1> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -161,7 +161,7 @@ define void @constant_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i32:
-define void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
   %ext = sext <8 x i1> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -169,7 +169,7 @@ define void @constant_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i32:
-define void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
   %ext = zext <16 x i1> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -177,7 +177,7 @@ define void @constant_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <1
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i32:
-define void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
   %ext = sext <16 x i1> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -185,7 +185,7 @@ define void @constant_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <1
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i32:
-define void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
   %ext = zext <32 x i1> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -193,7 +193,7 @@ define void @constant_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <3
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i32:
-define void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
   %ext = sext <32 x i1> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -201,7 +201,7 @@ define void @constant_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <3
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i32:
-define void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
   %ext = zext <64 x i1> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -209,7 +209,7 @@ define void @constant_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <6
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i32:
-define void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
   %ext = sext <64 x i1> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -221,7 +221,7 @@ define void @constant_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <6
 ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
 ; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]]
 ; GCN: buffer_store_dwordx2
-define void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
   %a = load i1, i1 addrspace(2)* %in
   %ext = zext i1 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -233,7 +233,7 @@ define void @constant_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)
 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
 ; GCN: buffer_store_dwordx2
-define void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)* nocapture %in) #0 {
   %a = load i1, i1 addrspace(2)* %in
   %ext = sext i1 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -241,7 +241,7 @@ define void @constant_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(2)
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v1i1_to_v1i64:
-define void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
   %ext = zext <1 x i1> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -249,7 +249,7 @@ define void @constant_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v1i1_to_v1i64:
-define void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(2)* %in
   %ext = sext <1 x i1> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -257,7 +257,7 @@ define void @constant_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v2i1_to_v2i64:
-define void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
   %ext = zext <2 x i1> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -265,7 +265,7 @@ define void @constant_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v2i1_to_v2i64:
-define void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(2)* %in
   %ext = sext <2 x i1> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -273,7 +273,7 @@ define void @constant_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v3i1_to_v3i64:
-define void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
   %ext = zext <3 x i1> %load to <3 x i64>
   store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
@@ -281,7 +281,7 @@ define void @constant_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v3i1_to_v3i64:
-define void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(2)* %in
   %ext = sext <3 x i1> %load to <3 x i64>
   store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
@@ -289,7 +289,7 @@ define void @constant_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v4i1_to_v4i64:
-define void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
   %ext = zext <4 x i1> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -297,7 +297,7 @@ define void @constant_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v4i1_to_v4i64:
-define void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(2)* %in
   %ext = sext <4 x i1> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -305,7 +305,7 @@ define void @constant_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v8i1_to_v8i64:
-define void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
   %ext = zext <8 x i1> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -313,7 +313,7 @@ define void @constant_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v8i1_to_v8i64:
-define void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(2)* %in
   %ext = sext <8 x i1> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -321,7 +321,7 @@ define void @constant_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v16i1_to_v16i64:
-define void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
   %ext = zext <16 x i1> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -329,7 +329,7 @@ define void @constant_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <1
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v16i1_to_v16i64:
-define void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(2)* %in
   %ext = sext <16 x i1> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -337,7 +337,7 @@ define void @constant_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <1
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v32i1_to_v32i64:
-define void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
   %ext = zext <32 x i1> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -345,7 +345,7 @@ define void @constant_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <3
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v32i1_to_v32i64:
-define void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(2)* %in
   %ext = sext <32 x i1> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -353,7 +353,7 @@ define void @constant_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <3
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v64i1_to_v64i64:
-define void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
   %ext = zext <64 x i1> %load to <64 x i64>
   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -361,7 +361,7 @@ define void @constant_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <6
 }
 
 ; FUNC-LABEL: {{^}}constant_sextload_v64i1_to_v64i64:
-define void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
+define amdgpu_kernel void @constant_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(2)* nocapture %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(2)* %in
   %ext = sext <64 x i1> %load to <64 x i64>
   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-constant-i16.ll b/test/CodeGen/AMDGPU/load-constant-i16.ll
index f7be1291040f401e35dc1e1acfd02ced5fd01ed8..5dd2efdf638210757f0d9d7063e2b3f3142f7349 100644
--- a/test/CodeGen/AMDGPU/load-constant-i16.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i16.ll
@@ -8,7 +8,7 @@
 ; GCN-HSA: flat_load_ushort
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_i16(i16 addrspace(1)* %out, i16 addrspace(2)* %in) {
 entry:
   %ld = load i16, i16 addrspace(2)* %in
   store i16 %ld, i16 addrspace(1)* %out
@@ -19,7 +19,7 @@ entry:
 ; GCN: s_load_dword s
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) {
 entry:
   %ld = load <2 x i16>, <2 x i16> addrspace(2)* %in
   store <2 x i16> %ld, <2 x i16> addrspace(1)* %out
@@ -31,7 +31,7 @@ entry:
 
 ; EG-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1
-define void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
   store <3 x i16> %ld, <3 x i16> addrspace(1)* %out
@@ -42,7 +42,7 @@ entry:
 ; GCN: s_load_dwordx2
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) {
 entry:
   %ld = load <4 x i16>, <4 x i16> addrspace(2)* %in
   store <4 x i16> %ld, <4 x i16> addrspace(1)* %out
@@ -53,7 +53,7 @@ entry:
 ; GCN: s_load_dwordx4
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) {
 entry:
   %ld = load <8 x i16>, <8 x i16> addrspace(2)* %in
   store <8 x i16> %ld, <8 x i16> addrspace(1)* %out
@@ -65,7 +65,7 @@ entry:
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) {
 entry:
   %ld = load <16 x i16>, <16 x i16> addrspace(2)* %in
   store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
@@ -80,7 +80,7 @@ entry:
 ; GCN-HSA: flat_store_dword
 
 ; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
-define void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
   %a = load i16, i16 addrspace(2)* %in
   %ext = zext i16 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -97,7 +97,7 @@ define void @constant_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(
 ; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 16
-define void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
   %a = load i16, i16 addrspace(2)* %in
   %ext = sext i16 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -109,7 +109,7 @@ define void @constant_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(
 ; GCN-HSA: flat_load_ushort
 
 ; EG: VTX_READ_16 T{{[0-9]+\.X, T[0-9]+\.X}}, 0, #1
-define void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
   %ext = zext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -123,7 +123,7 @@ define void @constant_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x
 ; EG: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 16
-define void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
   %ext = sext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -140,7 +140,7 @@ define void @constant_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x
 ; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal
 ; EG: 16
 ; EG: 16
-define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
   %ext = zext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -160,7 +160,7 @@ define void @constant_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x
 ; EG-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV\.[XYZW]}}, 0.0, literal
 ; EG-DAG: 16
 ; EG-DAG: 16
-define void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
   %ext = sext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -183,7 +183,7 @@ define void @constant_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x
 ; EG-DAG: AND_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, literal
 ; EG-DAG: 65535
 ; EG-DAG: 65535
-define void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
   %ext = zext <3 x i16> %ld to <3 x i32>
@@ -204,7 +204,7 @@ entry:
 ; EG-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, {{T[0-9]\.[XYZW]}}, 0.0, literal
 ; EG-DAG: 16
 ; EG-DAG: 16
-define void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
+define amdgpu_kernel void @constant_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(2)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(2)* %in
   %ext = sext <3 x i16> %ld to <3 x i32>
@@ -229,7 +229,7 @@ entry:
 ; EG-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{T[0-9]\.[XYZW]}}, literal
 ; EG-DAG: 65535
 ; EG-DAG: 65535
-define void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
   %ext = zext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -254,7 +254,7 @@ define void @constant_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x
 ; EG-DAG: 16
 ; EG-DAG: 16
 ; EG-DAG: 16
-define void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
   %ext = sext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -288,7 +288,7 @@ define void @constant_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x
 ; EG-DAG: 65535
 ; EG-DAG: 65535
 ; EG-DAG: 65535
-define void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
   %ext = zext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -322,7 +322,7 @@ define void @constant_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x
 ; EG-DAG: 16
 ; EG-DAG: 16
 ; EG-DAG: 16
-define void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
   %ext = sext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -337,7 +337,7 @@ define void @constant_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x
 ; v16i16 is naturally 32 byte aligned
 ; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 0, #1
 ; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+.[XYZW]}}, 16, #1
-define void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
   %ext = zext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -352,7 +352,7 @@ define void @constant_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <
 ; v16i16 is naturally 32 byte aligned
 ; EG-DAG: VTX_READ_128 [[DST_HI:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 0, #1
 ; EG-DAG: VTX_READ_128 [[DST_LO:T[0-9]+\.XYZW]], {{T[0-9]+\.[XYZW]}}, 16, #1
-define void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
   %ext = sext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -369,7 +369,7 @@ define void @constant_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
-define void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
   %ext = zext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -385,7 +385,7 @@ define void @constant_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 48, #1
-define void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
   %ext = sext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -404,7 +404,7 @@ define void @constant_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1
-define void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
   %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
   %ext = zext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -421,7 +421,7 @@ define void @constant_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 80, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 96, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+\.[XYZW]}}, 112, #1
-define void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
   %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
   %ext = sext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -438,7 +438,7 @@ define void @constant_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
   %a = load i16, i16 addrspace(2)* %in
   %ext = zext i16 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -464,7 +464,7 @@ define void @constant_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: These could be expanded earlier using ASHR 15
 ; EG: 31
-define void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(2)* %in) #0 {
   %a = load i16, i16 addrspace(2)* %in
   %ext = sext i16 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -475,7 +475,7 @@ define void @constant_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
   %ext = zext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -488,7 +488,7 @@ define void @constant_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: These could be expanded earlier using ASHR 15
 ; EG: 31
-define void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(2)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(2)* %in
   %ext = sext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -498,7 +498,7 @@ define void @constant_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x
 ; FUNC-LABEL: {{^}}constant_zextload_v2i16_to_v2i64:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
   %ext = zext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -508,7 +508,7 @@ define void @constant_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x
 ; FUNC-LABEL: {{^}}constant_sextload_v2i16_to_v2i64:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(2)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(2)* %in
   %ext = sext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -518,7 +518,7 @@ define void @constant_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x
 ; FUNC-LABEL: {{^}}constant_zextload_v4i16_to_v4i64:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
   %ext = zext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -528,7 +528,7 @@ define void @constant_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x
 ; FUNC-LABEL: {{^}}constant_sextload_v4i16_to_v4i64:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(2)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(2)* %in
   %ext = sext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -538,7 +538,7 @@ define void @constant_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x
 ; FUNC-LABEL: {{^}}constant_zextload_v8i16_to_v8i64:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
   %ext = zext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -548,7 +548,7 @@ define void @constant_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x
 ; FUNC-LABEL: {{^}}constant_sextload_v8i16_to_v8i64:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(2)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(2)* %in
   %ext = sext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -559,7 +559,7 @@ define void @constant_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
   %ext = zext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -570,7 +570,7 @@ define void @constant_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(2)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(2)* %in
   %ext = sext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -583,7 +583,7 @@ define void @constant_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
-define void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
   %ext = zext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -596,7 +596,7 @@ define void @constant_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
-define void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(2)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(2)* %in
   %ext = sext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -606,7 +606,7 @@ define void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <
 ; These trigger undefined register machine verifier errors
 
 ; ; XFUNC-LABEL: {{^}}constant_zextload_v64i16_to_v64i64:
-; define void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+; define amdgpu_kernel void @constant_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
 ;   %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
 ;   %ext = zext <64 x i16> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -614,7 +614,7 @@ define void @constant_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <
 ; }
 
 ; ; XFUNC-LABEL: {{^}}constant_sextload_v64i16_to_v64i64:
-; define void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
+; define amdgpu_kernel void @constant_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(2)* %in) #0 {
 ;   %load = load <64 x i16>, <64 x i16> addrspace(2)* %in
 ;   %ext = sext <64 x i16> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-constant-i32.ll b/test/CodeGen/AMDGPU/load-constant-i32.ll
index d1ff1c706c40cd9e7459a8640adeaa775a3d10c1..7370d45ca6b91e0270b1847540e67ac5515508b6 100644
--- a/test/CodeGen/AMDGPU/load-constant-i32.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i32.ll
@@ -7,7 +7,7 @@
 ; GCN: s_load_dword s{{[0-9]+}}
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-define void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_i32(i32 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
 entry:
   %ld = load i32, i32 addrspace(2)* %in
   store i32 %ld, i32 addrspace(1)* %out
@@ -18,7 +18,7 @@ entry:
 ; GCN: s_load_dwordx2
 
 ; EG: VTX_READ_64
-define void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
 entry:
   %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in
   store <2 x i32> %ld, <2 x i32> addrspace(1)* %out
@@ -29,7 +29,7 @@ entry:
 ; GCN: s_load_dwordx4
 
 ; EG: VTX_READ_128
-define void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(2)* %in) #0 {
 entry:
   %ld = load <3 x i32>, <3 x i32> addrspace(2)* %in
   store <3 x i32> %ld, <3 x i32> addrspace(1)* %out
@@ -40,7 +40,7 @@ entry:
 ; GCN: s_load_dwordx4
 
 ; EG: VTX_READ_128
-define void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
 entry:
   %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in
   store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
@@ -52,7 +52,7 @@ entry:
 
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
 entry:
   %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in
   store <8 x i32> %ld, <8 x i32> addrspace(1)* %out
@@ -66,7 +66,7 @@ entry:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
 entry:
   %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in
   store <16 x i32> %ld, <16 x i32> addrspace(1)* %out
@@ -81,7 +81,7 @@ entry:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
 ; EG: CF_END
 ; EG: VTX_READ_32
-define void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
   %ld = load i32, i32 addrspace(2)* %in
   %ext = zext i32 %ld to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -98,7 +98,7 @@ define void @constant_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(
 ; EG: VTX_READ_32
 ; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}},  literal.
 ; EG: 31
-define void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(2)* %in) #0 {
   %ld = load i32, i32 addrspace(2)* %in
   %ext = sext i32 %ld to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -108,7 +108,7 @@ define void @constant_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(
 ; FUNC-LABEL: {{^}}constant_zextload_v1i32_to_v1i64:
 ; GCN: s_load_dword
 ; GCN: store_dwordx2
-define void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 {
   %ld = load <1 x i32>, <1 x i32> addrspace(2)* %in
   %ext = zext <1 x i32> %ld to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -119,7 +119,7 @@ define void @constant_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x
 ; GCN: s_load_dword s[[LO:[0-9]+]]
 ; GCN: s_ashr_i32 s[[HI:[0-9]+]], s[[LO]], 31
 ; GCN: store_dwordx2
-define void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(2)* %in) #0 {
   %ld = load <1 x i32>, <1 x i32> addrspace(2)* %in
   %ext = sext <1 x i32> %ld to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -129,7 +129,7 @@ define void @constant_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x
 ; FUNC-LABEL: {{^}}constant_zextload_v2i32_to_v2i64:
 ; GCN: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; GCN: store_dwordx4
-define void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
   %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in
   %ext = zext <2 x i32> %ld to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -143,7 +143,7 @@ define void @constant_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x
 ; GCN-DAG: s_ashr_i32
 
 ; GCN: store_dwordx4
-define void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(2)* %in) #0 {
   %ld = load <2 x i32>, <2 x i32> addrspace(2)* %in
   %ext = sext <2 x i32> %ld to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -155,7 +155,7 @@ define void @constant_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x
 
 ; GCN: store_dwordx4
 ; GCN: store_dwordx4
-define void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
   %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in
   %ext = zext <4 x i32> %ld to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -172,7 +172,7 @@ define void @constant_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x
 
 ; GCN: store_dwordx4
 ; GCN: store_dwordx4
-define void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(2)* %in) #0 {
   %ld = load <4 x i32>, <4 x i32> addrspace(2)* %in
   %ext = sext <4 x i32> %ld to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -191,7 +191,7 @@ define void @constant_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-SA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
   %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in
   %ext = zext <8 x i32> %ld to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -219,7 +219,7 @@ define void @constant_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(2)* %in) #0 {
   %ld = load <8 x i32>, <8 x i32> addrspace(2)* %in
   %ext = sext <8 x i32> %ld to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -240,7 +240,7 @@ define void @constant_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x
 ; GCN: store_dwordx4
 ; GCN: store_dwordx4
 ; GCN: store_dwordx4
-define void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
   %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in
   %ext = sext <16 x i32> %ld to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -267,7 +267,7 @@ define void @constant_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <
 ; GCN-HSA: flat_store_dwordx4
 ; GCN-HSA: flat_store_dwordx4
 ; GCN-HSA: flat_store_dwordx4
-define void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(2)* %in) #0 {
   %ld = load <16 x i32>, <16 x i32> addrspace(2)* %in
   %ext = zext <16 x i32> %ld to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -319,7 +319,7 @@ define void @constant_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
 
-define void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 {
   %ld = load <32 x i32>, <32 x i32> addrspace(2)* %in
   %ext = sext <32 x i32> %ld to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -370,7 +370,7 @@ define void @constant_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(2)* %in) #0 {
   %ld = load <32 x i32>, <32 x i32> addrspace(2)* %in
   %ext = zext <32 x i32> %ld to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-constant-i64.ll b/test/CodeGen/AMDGPU/load-constant-i64.ll
index 0d071a10b49a9cd08a8b0cb4fe9e431870e1df05..14e50ea4c3cade596a7f92a3bbdb9eec3a2117f0 100644
--- a/test/CodeGen/AMDGPU/load-constant-i64.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i64.ll
@@ -7,7 +7,7 @@
 ; FUNC-LABEL: {{^}}constant_load_i64:
 ; GCN: s_load_dwordx2 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x0{{$}}
 ; EG: VTX_READ_64
-define void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0 {
   %ld = load i64, i64 addrspace(2)* %in
   store i64 %ld, i64 addrspace(1)* %out
   ret void
@@ -17,7 +17,7 @@ define void @constant_load_i64(i64 addrspace(1)* %out, i64 addrspace(2)* %in) #0
 ; GCN: s_load_dwordx4
 
 ; EG: VTX_READ_128
-define void @constant_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(2)* %in) #0 {
 entry:
   %ld = load <2 x i64>, <2 x i64> addrspace(2)* %in
   store <2 x i64> %ld, <2 x i64> addrspace(1)* %out
@@ -29,7 +29,7 @@ entry:
 
 ; EG-DAG: VTX_READ_128
 ; EG-DAG: VTX_READ_128
-define void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(2)* %in) #0 {
 entry:
   %ld = load <3 x i64>, <3 x i64> addrspace(2)* %in
   store <3 x i64> %ld, <3 x i64> addrspace(1)* %out
@@ -41,7 +41,7 @@ entry:
 
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @constant_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(2)* %in) #0 {
 entry:
   %ld = load <4 x i64>, <4 x i64> addrspace(2)* %in
   store <4 x i64> %ld, <4 x i64> addrspace(1)* %out
@@ -55,7 +55,7 @@ entry:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @constant_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(2)* %in) #0 {
 entry:
   %ld = load <8 x i64>, <8 x i64> addrspace(2)* %in
   store <8 x i64> %ld, <8 x i64> addrspace(1)* %out
@@ -74,7 +74,7 @@ entry:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @constant_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(2)* %in) #0 {
 entry:
   %ld = load <16 x i64>, <16 x i64> addrspace(2)* %in
   store <16 x i64> %ld, <16 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-constant-i8.ll b/test/CodeGen/AMDGPU/load-constant-i8.ll
index 9fdc4ebfd8542cddcd9ef8fe9e9610d07e6d059c..6e56b9f9b6d62acbfc1327a8897bf289f858783f 100644
--- a/test/CodeGen/AMDGPU/load-constant-i8.ll
+++ b/test/CodeGen/AMDGPU/load-constant-i8.ll
@@ -10,7 +10,7 @@
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; TODO: NOT AND
-define void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_i8(i8 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
 entry:
   %ld = load i8, i8 addrspace(2)* %in
   store i8 %ld, i8 addrspace(1)* %out
@@ -22,7 +22,7 @@ entry:
 ; GCN-HSA: flat_load_ushort v
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
 entry:
   %ld = load <2 x i8>, <2 x i8> addrspace(2)* %in
   store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
@@ -33,7 +33,7 @@ entry:
 ; GCN: s_load_dword s
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
 entry:
   %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
   store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
@@ -44,7 +44,7 @@ entry:
 ; GCN: s_load_dword s
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
 entry:
   %ld = load <4 x i8>, <4 x i8> addrspace(2)* %in
   store <4 x i8> %ld, <4 x i8> addrspace(1)* %out
@@ -55,7 +55,7 @@ entry:
 ; GCN: s_load_dwordx2
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
 entry:
   %ld = load <8 x i8>, <8 x i8> addrspace(2)* %in
   store <8 x i8> %ld, <8 x i8> addrspace(1)* %out
@@ -66,7 +66,7 @@ entry:
 ; GCN: s_load_dwordx4
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
 entry:
   %ld = load <16 x i8>, <16 x i8> addrspace(2)* %in
   store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
@@ -78,7 +78,7 @@ entry:
 ; GCN-HSA: flat_load_ubyte
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
   %a = load i8, i8 addrspace(2)* %in
   %ext = zext i8 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -92,7 +92,7 @@ define void @constant_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)
 ; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 8
-define void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
   %ld = load i8, i8 addrspace(2)* %in
   %ext = sext i8 %ld to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -102,7 +102,7 @@ define void @constant_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(2)
 ; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i32:
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
   %ext = zext <1 x i8> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -114,7 +114,7 @@ define void @constant_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x
 ; EG: VTX_READ_8 [[DST:T[0-9]+\.X]], T{{[0-9]+}}.X, 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 8
-define void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
   %ext = sext <1 x i8> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -129,7 +129,7 @@ define void @constant_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x
 ; TODO: This should use DST, but for some there are redundant MOVs
 ; EG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal
 ; EG: 8
-define void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
   %ext = zext <2 x i8> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -150,7 +150,7 @@ define void @constant_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
   %ext = sext <2 x i8> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -170,7 +170,7 @@ define void @constant_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x
 ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
 entry:
   %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
   %ext = zext <3 x i8> %ld to <3 x i32>
@@ -193,7 +193,7 @@ entry:
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(2)* %in) #0 {
 entry:
   %ld = load <3 x i8>, <3 x i8> addrspace(2)* %in
   %ext = sext <3 x i8> %ld to <3 x i32>
@@ -214,7 +214,7 @@ entry:
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
   %ext = zext <4 x i8> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -236,7 +236,7 @@ define void @constant_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
   %ext = sext <4 x i8> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -264,7 +264,7 @@ define void @constant_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
   %ext = zext <8 x i8> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -294,7 +294,7 @@ define void @constant_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
   %ext = sext <8 x i8> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -335,7 +335,7 @@ define void @constant_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
   %ext = zext <16 x i8> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -378,7 +378,7 @@ define void @constant_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <1
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
   %ext = sext <16 x i8> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -450,7 +450,7 @@ define void @constant_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <1
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
   %ext = zext <32 x i8> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -526,7 +526,7 @@ define void @constant_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <3
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
   %ext = sext <32 x i8> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -539,7 +539,7 @@ define void @constant_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <3
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1
-define void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
   %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
   %ext = zext <64 x i8> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -552,7 +552,7 @@ define void @constant_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <6
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1
-define void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
   %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
   %ext = sext <64 x i8> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -570,7 +570,7 @@ define void @constant_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <6
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
   %a = load i8, i8 addrspace(2)* %in
   %ext = zext i8 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -589,7 +589,7 @@ define void @constant_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: Why not 7 ?
 ; EG: 31
-define void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
   %a = load i8, i8 addrspace(2)* %in
   %ext = sext i8 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -600,7 +600,7 @@ define void @constant_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(2)
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
   %ext = zext <1 x i8> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -613,7 +613,7 @@ define void @constant_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: Why not 7 ?
 ; EG: 31
-define void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
   %ext = sext <1 x i8> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -623,7 +623,7 @@ define void @constant_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x
 ; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i64:
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
   %ext = zext <2 x i8> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -633,7 +633,7 @@ define void @constant_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x
 ; FUNC-LABEL: {{^}}constant_sextload_v2i8_to_v2i64:
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
   %ext = sext <2 x i8> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -643,7 +643,7 @@ define void @constant_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x
 ; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i64:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
   %ext = zext <4 x i8> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -653,7 +653,7 @@ define void @constant_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x
 ; FUNC-LABEL: {{^}}constant_sextload_v4i8_to_v4i64:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
   %ext = sext <4 x i8> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -663,7 +663,7 @@ define void @constant_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x
 ; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i64:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
   %ext = zext <8 x i8> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -673,7 +673,7 @@ define void @constant_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x
 ; FUNC-LABEL: {{^}}constant_sextload_v8i8_to_v8i64:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
   %ext = sext <8 x i8> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -683,7 +683,7 @@ define void @constant_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x
 ; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i64:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
   %ext = zext <16 x i8> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -693,7 +693,7 @@ define void @constant_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <1
 ; FUNC-LABEL: {{^}}constant_sextload_v16i8_to_v16i64:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
   %ext = sext <16 x i8> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -704,7 +704,7 @@ define void @constant_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <1
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
   %ext = zext <32 x i8> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -715,7 +715,7 @@ define void @constant_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <3
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
   %ext = sext <32 x i8> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -723,7 +723,7 @@ define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <3
 }
 
 ; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i64:
-; define void @constant_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
 ;   %ext = zext <64 x i8> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -731,7 +731,7 @@ define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <3
 ; }
 
 ; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i64:
-; define void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
 ;   %ext = sext <64 x i8> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -744,7 +744,7 @@ define void @constant_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <3
 
 ; GCN-HSA: flat_load_ubyte v[[VAL:[0-9]+]],
 ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
-define void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
   %a = load i8, i8 addrspace(2)* %in
   %ext = zext i8 %a to i16
   store i16 %ext, i16 addrspace(1)* %out
@@ -759,7 +759,7 @@ define void @constant_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)
 ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)* %in) #0 {
   %a = load i8, i8 addrspace(2)* %in
   %ext = sext i8 %a to i16
   store i16 %ext, i16 addrspace(1)* %out
@@ -767,7 +767,7 @@ define void @constant_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(2)
 }
 
 ; FUNC-LABEL: {{^}}constant_zextload_v1i8_to_v1i16:
-define void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
   %ext = zext <1 x i8> %load to <1 x i16>
   store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
@@ -778,7 +778,7 @@ define void @constant_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(2)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(2)* %in
   %ext = sext <1 x i8> %load to <1 x i16>
   store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
@@ -788,7 +788,7 @@ define void @constant_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x
 ; FUNC-LABEL: {{^}}constant_zextload_v2i8_to_v2i16:
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
   %ext = zext <2 x i8> %load to <2 x i16>
   store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
@@ -800,7 +800,7 @@ define void @constant_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(2)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(2)* %in
   %ext = sext <2 x i8> %load to <2 x i16>
   store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
@@ -810,7 +810,7 @@ define void @constant_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x
 ; FUNC-LABEL: {{^}}constant_zextload_v4i8_to_v4i16:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
   %ext = zext <4 x i8> %load to <4 x i16>
   store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
@@ -824,7 +824,7 @@ define void @constant_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(2)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(2)* %in
   %ext = sext <4 x i8> %load to <4 x i16>
   store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
@@ -834,7 +834,7 @@ define void @constant_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x
 ; FUNC-LABEL: {{^}}constant_zextload_v8i8_to_v8i16:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
   %ext = zext <8 x i8> %load to <8 x i16>
   store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
@@ -853,7 +853,7 @@ define void @constant_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 
-define void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(2)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(2)* %in
   %ext = sext <8 x i8> %load to <8 x i16>
   store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
@@ -863,7 +863,7 @@ define void @constant_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x
 ; FUNC-LABEL: {{^}}constant_zextload_v16i8_to_v16i16:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
   %ext = zext <16 x i8> %load to <16 x i16>
   store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
@@ -889,7 +889,7 @@ define void @constant_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(2)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(2)* %in
   %ext = sext <16 x i8> %load to <16 x i16>
   store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
@@ -900,7 +900,7 @@ define void @constant_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <1
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
   %ext = zext <32 x i8> %load to <32 x i16>
   store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
@@ -943,7 +943,7 @@ define void @constant_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <3
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
+define amdgpu_kernel void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(2)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(2)* %in
   %ext = sext <32 x i8> %load to <32 x i16>
   store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
@@ -951,7 +951,7 @@ define void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <3
 }
 
 ; XFUNC-LABEL: {{^}}constant_zextload_v64i8_to_v64i16:
-; define void @constant_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; define amdgpu_kernel void @constant_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
 ;   %ext = zext <64 x i8> %load to <64 x i16>
 ;   store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
@@ -959,7 +959,7 @@ define void @constant_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <3
 ; }
 
 ; XFUNC-LABEL: {{^}}constant_sextload_v64i8_to_v64i16:
-; define void @constant_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
+; define amdgpu_kernel void @constant_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(2)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(2)* %in
 ;   %ext = sext <64 x i8> %load to <64 x i16>
 ;   store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-global-f32.ll b/test/CodeGen/AMDGPU/load-global-f32.ll
index 805c0a7a39c7c71034afa930734f13d345eda3e2..bd6fea587b42fdb5b24a3d77f7c26530e7718da0 100644
--- a/test/CodeGen/AMDGPU/load-global-f32.ll
+++ b/test/CodeGen/AMDGPU/load-global-f32.ll
@@ -10,7 +10,7 @@
 ; GCN-HSA: flat_load_dword
 
 ; R600: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-define void @global_load_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %tmp0 = load float, float addrspace(1)* %in
   store float %tmp0, float addrspace(1)* %out
@@ -22,7 +22,7 @@ entry:
 ; GCN-HSA: flat_load_dwordx2
 
 ; R600: VTX_READ_64
-define void @global_load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in) #0 {
 entry:
   %tmp0 = load <2 x float>, <2 x float> addrspace(1)* %in
   store <2 x float> %tmp0, <2 x float> addrspace(1)* %out
@@ -34,7 +34,7 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 
 ; R600: VTX_READ_128
-define void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3f32(<3 x float> addrspace(1)* %out, <3 x float> addrspace(1)* %in) #0 {
 entry:
   %tmp0 = load <3 x float>, <3 x float> addrspace(1)* %in
   store <3 x float> %tmp0, <3 x float> addrspace(1)* %out
@@ -46,7 +46,7 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 
 ; R600: VTX_READ_128
-define void @global_load_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) #0 {
 entry:
   %tmp0 = load <4 x float>, <4 x float> addrspace(1)* %in
   store <4 x float> %tmp0, <4 x float> addrspace(1)* %out
@@ -61,7 +61,7 @@ entry:
 
 ; R600: VTX_READ_128
 ; R600: VTX_READ_128
-define void @global_load_v8f32(<8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8f32(<8 x float> addrspace(1)* %out, <8 x float> addrspace(1)* %in) #0 {
 entry:
   %tmp0 = load <8 x float>, <8 x float> addrspace(1)* %in
   store <8 x float> %tmp0, <8 x float> addrspace(1)* %out
@@ -83,7 +83,7 @@ entry:
 ; R600: VTX_READ_128
 ; R600: VTX_READ_128
 ; R600: VTX_READ_128
-define void @global_load_v16f32(<16 x float> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16f32(<16 x float> addrspace(1)* %out, <16 x float> addrspace(1)* %in) #0 {
 entry:
   %tmp0 = load <16 x float>, <16 x float> addrspace(1)* %in
   store <16 x float> %tmp0, <16 x float> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-global-f64.ll b/test/CodeGen/AMDGPU/load-global-f64.ll
index dc1a9432283e4619eb08e4ebf27863904182b682..5b772e1fe5ee30da92933ca977a75816d92a1faa 100644
--- a/test/CodeGen/AMDGPU/load-global-f64.ll
+++ b/test/CodeGen/AMDGPU/load-global-f64.ll
@@ -8,7 +8,7 @@
 
 ; GCN-HSA: flat_load_dwordx2 [[VAL:v\[[0-9]+:[0-9]+\]]]
 ; GCN-HSA: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]]
-define void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %ld = load double, double addrspace(1)* %in
   store double %ld, double addrspace(1)* %out
   ret void
@@ -17,7 +17,7 @@ define void @global_load_f64(double addrspace(1)* %out, double addrspace(1)* %in
 ; FUNC-LABEL: {{^}}global_load_v2f64:
 ; GCN-NOHSA: buffer_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @global_load_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2f64(<2 x double> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
 entry:
   %ld = load <2 x double>, <2 x double> addrspace(1)* %in
   store <2 x double> %ld, <2 x double> addrspace(1)* %out
@@ -29,7 +29,7 @@ entry:
 ; GCN-NOHSA: buffer_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3f64(<3 x double> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
 entry:
   %ld = load <3 x double>, <3 x double> addrspace(1)* %in
   store <3 x double> %ld, <3 x double> addrspace(1)* %out
@@ -42,7 +42,7 @@ entry:
 
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @global_load_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4f64(<4 x double> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
 entry:
   %ld = load <4 x double>, <4 x double> addrspace(1)* %in
   store <4 x double> %ld, <4 x double> addrspace(1)* %out
@@ -59,7 +59,7 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @global_load_v8f64(<8 x double> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8f64(<8 x double> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 {
 entry:
   %ld = load <8 x double>, <8 x double> addrspace(1)* %in
   store <8 x double> %ld, <8 x double> addrspace(1)* %out
@@ -84,7 +84,7 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @global_load_v16f64(<16 x double> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16f64(<16 x double> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 {
 entry:
   %ld = load <16 x double>, <16 x double> addrspace(1)* %in
   store <16 x double> %ld, <16 x double> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-global-i1.ll b/test/CodeGen/AMDGPU/load-global-i1.ll
index e2e90cac8cc1ca8fc554ac50e1e376f26a7b893b..cb3536a0c12847174890e084f553635c95c82b0f 100644
--- a/test/CodeGen/AMDGPU/load-global-i1.ll
+++ b/test/CodeGen/AMDGPU/load-global-i1.ll
@@ -9,56 +9,56 @@
 
 ; EG: VTX_READ_8
 ; EG: AND_INT
-define void @global_load_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
   %load = load i1, i1 addrspace(1)* %in
   store i1 %load, i1 addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v2i1:
-define void @global_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2i1(<2 x i1> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
   store <2 x i1> %load, <2 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v3i1:
-define void @global_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3i1(<3 x i1> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
   store <3 x i1> %load, <3 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v4i1:
-define void @global_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4i1(<4 x i1> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
   store <4 x i1> %load, <4 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v8i1:
-define void @global_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8i1(<8 x i1> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
   store <8 x i1> %load, <8 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v16i1:
-define void @global_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16i1(<16 x i1> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
   store <16 x i1> %load, <16 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v32i1:
-define void @global_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v32i1(<32 x i1> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
   store <32 x i1> %load, <32 x i1> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}global_load_v64i1:
-define void @global_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
   store <64 x i1> %load, <64 x i1> addrspace(1)* %out
   ret void
@@ -67,7 +67,7 @@ define void @global_load_v64i1(<64 x i1> addrspace(1)* %out, <64 x i1> addrspace
 ; FUNC-LABEL: {{^}}global_zextload_i1_to_i32:
 ; GCN: buffer_load_ubyte
 ; GCN: buffer_store_dword
-define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
   %a = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -81,7 +81,7 @@ define void @global_zextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
 
 ; EG: VTX_READ_8
 ; EG: BFE_INT
-define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
   %a = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -89,7 +89,7 @@ define void @global_sextload_i1_to_i32(i32 addrspace(1)* %out, i1 addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v1i1_to_v1i32:
-define void @global_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
   %ext = zext <1 x i1> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -97,7 +97,7 @@ define void @global_zextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v1i1_to_v1i32:
-define void @global_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
   %ext = sext <1 x i1> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -105,7 +105,7 @@ define void @global_sextload_v1i1_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v2i1_to_v2i32:
-define void @global_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
   %ext = zext <2 x i1> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -113,7 +113,7 @@ define void @global_zextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v2i1_to_v2i32:
-define void @global_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
   %ext = sext <2 x i1> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -121,7 +121,7 @@ define void @global_sextload_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v3i1_to_v3i32:
-define void @global_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
   %ext = zext <3 x i1> %load to <3 x i32>
   store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
@@ -129,7 +129,7 @@ define void @global_zextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v3i1_to_v3i32:
-define void @global_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
   %ext = sext <3 x i1> %load to <3 x i32>
   store <3 x i32> %ext, <3 x i32> addrspace(1)* %out
@@ -137,7 +137,7 @@ define void @global_sextload_v3i1_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v4i1_to_v4i32:
-define void @global_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
   %ext = zext <4 x i1> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -145,7 +145,7 @@ define void @global_zextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v4i1_to_v4i32:
-define void @global_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
   %ext = sext <4 x i1> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -153,7 +153,7 @@ define void @global_sextload_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v8i1_to_v8i32:
-define void @global_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
   %ext = zext <8 x i1> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -161,7 +161,7 @@ define void @global_zextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v8i1_to_v8i32:
-define void @global_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
   %ext = sext <8 x i1> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -169,7 +169,7 @@ define void @global_sextload_v8i1_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v16i1_to_v16i32:
-define void @global_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
   %ext = zext <16 x i1> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -177,7 +177,7 @@ define void @global_zextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v16i1_to_v16i32:
-define void @global_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
   %ext = sext <16 x i1> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -185,7 +185,7 @@ define void @global_sextload_v16i1_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v32i1_to_v32i32:
-define void @global_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
   %ext = zext <32 x i1> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -193,7 +193,7 @@ define void @global_zextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v32i1_to_v32i32:
-define void @global_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
   %ext = sext <32 x i1> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -201,7 +201,7 @@ define void @global_sextload_v32i1_to_v32i32(<32 x i32> addrspace(1)* %out, <32
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v64i1_to_v64i32:
-define void @global_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
   %ext = zext <64 x i1> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -209,7 +209,7 @@ define void @global_zextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v64i1_to_v64i32:
-define void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
   %ext = sext <64 x i1> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -221,7 +221,7 @@ define void @global_sextload_v64i1_to_v64i32(<64 x i32> addrspace(1)* %out, <64
 ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
 ; GCN-DAG: v_and_b32_e32 {{v[0-9]+}}, 1, [[LOAD]]{{$}}
 ; GCN: buffer_store_dwordx2
-define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
   %a = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -233,7 +233,7 @@ define void @global_zextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
 ; GCN: buffer_store_dwordx2
-define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
   %a = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -241,7 +241,7 @@ define void @global_sextload_i1_to_i64(i64 addrspace(1)* %out, i1 addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v1i1_to_v1i64:
-define void @global_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
   %ext = zext <1 x i1> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -249,7 +249,7 @@ define void @global_zextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v1i1_to_v1i64:
-define void @global_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1> addrspace(1)* %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(1)* %in
   %ext = sext <1 x i1> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -257,7 +257,7 @@ define void @global_sextload_v1i1_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v2i1_to_v2i64:
-define void @global_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
   %ext = zext <2 x i1> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -265,7 +265,7 @@ define void @global_zextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v2i1_to_v2i64:
-define void @global_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1> addrspace(1)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(1)* %in
   %ext = sext <2 x i1> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -273,7 +273,7 @@ define void @global_sextload_v2i1_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v3i1_to_v3i64:
-define void @global_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
   %ext = zext <3 x i1> %load to <3 x i64>
   store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
@@ -281,7 +281,7 @@ define void @global_zextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v3i1_to_v3i64:
-define void @global_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1> addrspace(1)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(1)* %in
   %ext = sext <3 x i1> %load to <3 x i64>
   store <3 x i64> %ext, <3 x i64> addrspace(1)* %out
@@ -289,7 +289,7 @@ define void @global_sextload_v3i1_to_v3i64(<3 x i64> addrspace(1)* %out, <3 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v4i1_to_v4i64:
-define void @global_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
   %ext = zext <4 x i1> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -297,7 +297,7 @@ define void @global_zextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v4i1_to_v4i64:
-define void @global_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1> addrspace(1)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(1)* %in
   %ext = sext <4 x i1> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -305,7 +305,7 @@ define void @global_sextload_v4i1_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v8i1_to_v8i64:
-define void @global_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
   %ext = zext <8 x i1> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -313,7 +313,7 @@ define void @global_zextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v8i1_to_v8i64:
-define void @global_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1> addrspace(1)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(1)* %in
   %ext = sext <8 x i1> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -321,7 +321,7 @@ define void @global_sextload_v8i1_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i1
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v16i1_to_v16i64:
-define void @global_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
   %ext = zext <16 x i1> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -329,7 +329,7 @@ define void @global_zextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v16i1_to_v16i64:
-define void @global_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i1> addrspace(1)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(1)* %in
   %ext = sext <16 x i1> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -337,7 +337,7 @@ define void @global_sextload_v16i1_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v32i1_to_v32i64:
-define void @global_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
   %ext = zext <32 x i1> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -345,7 +345,7 @@ define void @global_zextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v32i1_to_v32i64:
-define void @global_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i1> addrspace(1)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(1)* %in
   %ext = sext <32 x i1> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -353,7 +353,7 @@ define void @global_sextload_v32i1_to_v32i64(<32 x i64> addrspace(1)* %out, <32
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v64i1_to_v64i64:
-define void @global_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
   %ext = zext <64 x i1> %load to <64 x i64>
   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -361,7 +361,7 @@ define void @global_zextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64
 }
 
 ; FUNC-LABEL: {{^}}global_sextload_v64i1_to_v64i64:
-define void @global_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v64i1_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i1> addrspace(1)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(1)* %in
   %ext = sext <64 x i1> %load to <64 x i64>
   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-global-i16.ll b/test/CodeGen/AMDGPU/load-global-i16.ll
index 88d6b7b99d306ad7f3ae55d274ac9b8a6574eb1c..dcdd1a947cd4dc799cfd8c46090e4b660b69130d 100644
--- a/test/CodeGen/AMDGPU/load-global-i16.ll
+++ b/test/CodeGen/AMDGPU/load-global-i16.ll
@@ -11,7 +11,7 @@
 ; GCN-HSA: flat_load_ushort
 
 ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
 entry:
   %ld = load i16, i16 addrspace(1)* %in
   store i16 %ld, i16 addrspace(1)* %out
@@ -23,7 +23,7 @@ entry:
 ; GCN-HSA: flat_load_dword v
 
 ; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
 entry:
   %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
   store <2 x i16> %ld, <2 x i16> addrspace(1)* %out
@@ -36,7 +36,7 @@ entry:
 
 ; EGCM-DAG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EGCM-DAG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 4, #1
-define void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
   store <3 x i16> %ld, <3 x i16> addrspace(1)* %out
@@ -48,7 +48,7 @@ entry:
 ; GCN-HSA: flat_load_dwordx2
 
 ; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
 entry:
   %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
   store <4 x i16> %ld, <4 x i16> addrspace(1)* %out
@@ -60,7 +60,7 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 
 ; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) {
 entry:
   %ld = load <8 x i16>, <8 x i16> addrspace(1)* %in
   store <8 x i16> %ld, <8 x i16> addrspace(1)* %out
@@ -76,7 +76,7 @@ entry:
 
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_load_v16i16(<16 x i16> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) {
 entry:
   %ld = load <16 x i16>, <16 x i16> addrspace(1)* %in
   store <16 x i16> %ld, <16 x i16> addrspace(1)* %out
@@ -91,7 +91,7 @@ entry:
 ; GCN-HSA: flat_store_dword
 
 ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
   %a = load i16, i16 addrspace(1)* %in
   %ext = zext i16 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -108,7 +108,7 @@ define void @global_zextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
 ; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1
 ; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EGCM: 16
-define void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
   %a = load i16, i16 addrspace(1)* %in
   %ext = sext i16 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -120,7 +120,7 @@ define void @global_sextload_i16_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
 ; GCN-HSA: flat_load_ushort
 
 ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = zext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -134,7 +134,7 @@ define void @global_zextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i
 ; EGCM: VTX_READ_16 [[DST:T[0-9]\.[XYZW]]], T{{[0-9]+}}.X, 0, #1
 ; EGCM: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EGCM: 16
-define void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = sext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -148,7 +148,7 @@ define void @global_sextload_v1i16_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i
 ; EGCM: VTX_READ_32 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
 ; EGCM: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], literal
 ; EGCM: 16
-define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = zext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -168,7 +168,7 @@ define void @global_zextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i
 ; EGCM-DAG: BFE_INT {{[* ]*}}[[ST]].Y, {{PV.[XYZW]}}, 0.0, literal
 ; EGCM-DAG: 16
 ; EGCM-DAG: 16
-define void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = sext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -190,7 +190,7 @@ define void @global_sextload_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i
 ; EGCM: 16
 ; EGCM: AND_INT {{[* ]*}}[[ST_LO]].X, {{T[0-9]\.[XYZW]}}, literal
 ; EGCM: AND_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], literal
-define void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_zextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
   %ext = zext <3 x i16> %ld to <3 x i32>
@@ -214,7 +214,7 @@ entry:
 ; EGCM-DAG: BFE_INT {{[* ]*}}[[ST_HI]].X, [[DST_HI]], 0.0, literal
 ; EGCM-DAG: 16
 ; EGCM-DAG: 16
-define void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @global_sextload_v3i16_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i16> addrspace(1)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(1)* %in
   %ext = sext <3 x i16> %ld to <3 x i32>
@@ -237,7 +237,7 @@ entry:
 ; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].X, {{.*}}, literal
 ; EGCM-DAG: AND_INT {{[* ]*}}[[ST]].Z, {{.*}}, literal
 ; EGCM-DAG: 16
-define void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = zext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -262,7 +262,7 @@ define void @global_zextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i
 ; EGCM-DAG: 16
 ; EGCM-DAG: 16
 ; EGCM-DAG: 16
-define void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = sext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -296,7 +296,7 @@ define void @global_sextload_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i
 ; EGCM-DAG: 16
 ; EGCM-DAG: 16
 ; EGCM-DAG: 16
-define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = zext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -330,7 +330,7 @@ define void @global_zextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i
 ; EGCM-DAG: 16
 ; EGCM-DAG: 16
 ; EGCM-DAG: 16
-define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = sext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -346,7 +346,7 @@ define void @global_sextload_v8i16_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i
 
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
-define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = zext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -357,7 +357,7 @@ define void @global_zextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 0, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
-define void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = sext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -379,7 +379,7 @@ define void @global_sextload_v16i16_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1
-define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = zext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -401,7 +401,7 @@ define void @global_zextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 16, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 32, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 48, #1
-define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = sext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -435,7 +435,7 @@ define void @global_sextload_v32i16_to_v32i32(<32 x i32> addrspace(1)* %out, <32
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1
-define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
   %ext = zext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -452,7 +452,7 @@ define void @global_zextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 80, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 96, #1
 ; EGCM-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, {{T[0-9]+.[XYZW]}}, 112, #1
-define void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
   %ext = sext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -469,7 +469,7 @@ define void @global_sextload_v64i16_to_v64i32(<64 x i32> addrspace(1)* %out, <64
 
 ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EGCM: MOV {{.*}}, 0.0
-define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
   %a = load i16, i16 addrspace(1)* %in
   %ext = zext i16 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -495,7 +495,7 @@ define void @global_zextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
 ; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: These could be expanded earlier using ASHR 15
 ; EGCM: 31
-define void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
   %a = load i16, i16 addrspace(1)* %in
   %ext = sext i16 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -506,7 +506,7 @@ define void @global_sextload_i16_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
 
 ; EGCM: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EGCM: MOV {{.*}}, 0.0
-define void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = zext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -519,7 +519,7 @@ define void @global_zextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i
 ; EGCM: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: These could be expanded earlier using ASHR 15
 ; EGCM: 31
-define void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i16> addrspace(1)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(1)* %in
   %ext = sext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -527,7 +527,7 @@ define void @global_sextload_v1i16_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i
 }
 
 ; FUNC-LABEL: {{^}}global_zextload_v2i16_to_v2i64:
-define void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = zext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -537,7 +537,7 @@ define void @global_zextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
 ; FUNC-LABEL: {{^}}global_sextload_v2i16_to_v2i64:
 
 ; EGCM: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %ext = sext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -547,7 +547,7 @@ define void @global_sextload_v2i16_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
 ; FUNC-LABEL: {{^}}global_zextload_v4i16_to_v4i64:
 
 ; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = zext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -557,7 +557,7 @@ define void @global_zextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
 ; FUNC-LABEL: {{^}}global_sextload_v4i16_to_v4i64:
 
 ; EGCM: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %ext = sext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -567,7 +567,7 @@ define void @global_sextload_v4i16_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
 ; FUNC-LABEL: {{^}}global_zextload_v8i16_to_v8i64:
 
 ; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = zext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -577,7 +577,7 @@ define void @global_zextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
 ; FUNC-LABEL: {{^}}global_sextload_v8i16_to_v8i64:
 
 ; EGCM: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i16> addrspace(1)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(1)* %in
   %ext = sext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -588,7 +588,7 @@ define void @global_sextload_v8i16_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
 
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = zext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -599,7 +599,7 @@ define void @global_zextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i16> addrspace(1)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(1)* %in
   %ext = sext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -612,7 +612,7 @@ define void @global_sextload_v16i16_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
-define void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = zext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -625,7 +625,7 @@ define void @global_zextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 32, #1
 ; EGCM-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 48, #1
-define void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i16> addrspace(1)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(1)* %in
   %ext = sext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -633,7 +633,7 @@ define void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32
 }
 
 ; ; XFUNC-LABEL: {{^}}global_zextload_v64i16_to_v64i64:
-; define void @global_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+; define amdgpu_kernel void @global_zextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
 ;   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
 ;   %ext = zext <64 x i16> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -641,7 +641,7 @@ define void @global_sextload_v32i16_to_v32i64(<32 x i64> addrspace(1)* %out, <32
 ; }
 
 ; ; XFUNC-LABEL: {{^}}global_sextload_v64i16_to_v64i64:
-; define void @global_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
+; define amdgpu_kernel void @global_sextload_v64i16_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i16> addrspace(1)* %in) #0 {
 ;   %load = load <64 x i16>, <64 x i16> addrspace(1)* %in
 ;   %ext = sext <64 x i16> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-global-i32.ll b/test/CodeGen/AMDGPU/load-global-i32.ll
index e3335347a63f1c4e7de563f54da98be8df74d058..5df32c1e3120a60518a8f76f1bd215a446f82bd0 100644
--- a/test/CodeGen/AMDGPU/load-global-i32.ll
+++ b/test/CodeGen/AMDGPU/load-global-i32.ll
@@ -9,7 +9,7 @@
 ; GCN-HSA: flat_load_dword
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0
-define void @global_load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 entry:
   %ld = load i32, i32 addrspace(1)* %in
   store i32 %ld, i32 addrspace(1)* %out
@@ -21,7 +21,7 @@ entry:
 ; GCN-HSA: flat_load_dwordx2
 
 ; EG: VTX_READ_64
-define void @global_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
 entry:
   %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
   store <2 x i32> %ld, <2 x i32> addrspace(1)* %out
@@ -33,7 +33,7 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 
 ; EG: VTX_READ_128
-define void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %in) #0 {
 entry:
   %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
   store <3 x i32> %ld, <3 x i32> addrspace(1)* %out
@@ -45,7 +45,7 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 
 ; EG: VTX_READ_128
-define void @global_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
 entry:
   %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
   store <4 x i32> %ld, <4 x i32> addrspace(1)* %out
@@ -60,7 +60,7 @@ entry:
 
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @global_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
 entry:
   %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
   store <8 x i32> %ld, <8 x i32> addrspace(1)* %out
@@ -82,7 +82,7 @@ entry:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @global_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16i32(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
 entry:
   %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
   store <16 x i32> %ld, <16 x i32> addrspace(1)* %out
@@ -98,7 +98,7 @@ entry:
 ; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]]
 
 ; EG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
-define void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %ld = load i32, i32 addrspace(1)* %in
   %ext = zext i32 %ld to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -117,7 +117,7 @@ define void @global_zextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)
 ; EG: VTX_READ_32
 ; EG: ASHR {{[* ]*}}T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}},  literal.
 ; EG: 31
-define void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %ld = load i32, i32 addrspace(1)* %in
   %ext = sext i32 %ld to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -130,7 +130,7 @@ define void @global_sextload_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)
 
 ; GCN-HSA: flat_load_dword
 ; GCN-HSA: flat_store_dwordx2
-define void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
   %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
   %ext = zext <1 x i32> %ld to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -143,7 +143,7 @@ define void @global_zextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i
 ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; GCN-NOHSA: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
 ; GCN-HSA: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i32> addrspace(1)* %in) #0 {
   %ld = load <1 x i32>, <1 x i32> addrspace(1)* %in
   %ext = sext <1 x i32> %ld to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -156,7 +156,7 @@ define void @global_sextload_v1i32_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i
 
 ; GCN-HSA: flat_load_dwordx2
 ; GCN-HSA: flat_store_dwordx4
-define void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
   %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %ext = zext <2 x i32> %ld to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -172,7 +172,7 @@ define void @global_zextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
 
 ; GCN-NOHSA-DAG: buffer_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) #0 {
   %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %ext = sext <2 x i32> %ld to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -187,7 +187,7 @@ define void @global_sextload_v2i32_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_store_dwordx4
 ; GCN-HSA: flat_store_dwordx4
-define void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %ext = zext <4 x i32> %ld to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -208,7 +208,7 @@ define void @global_zextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
 
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %ext = sext <4 x i32> %ld to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -231,7 +231,7 @@ define void @global_sextload_v4i32_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-SA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
   %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
   %ext = zext <8 x i32> %ld to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -263,7 +263,7 @@ define void @global_zextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i32> addrspace(1)* %in) #0 {
   %ld = load <8 x i32>, <8 x i32> addrspace(1)* %in
   %ext = sext <8 x i32> %ld to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -309,7 +309,7 @@ define void @global_sextload_v8i32_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i
 ; GCN-DAG: v_ashrrev_i32
 ; GCN-NOHSA-DAG: buffer_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
   %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
   %ext = sext <16 x i32> %ld to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -344,7 +344,7 @@ define void @global_sextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 ; GCN-HSA: flat_store_dwordx4
 ; GCN-HSA: flat_store_dwordx4
 ; GCN-HSA: flat_store_dwordx4
-define void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i32> addrspace(1)* %in) #0 {
   %ld = load <16 x i32>, <16 x i32> addrspace(1)* %in
   %ext = zext <16 x i32> %ld to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -444,7 +444,7 @@ define void @global_zextload_v16i32_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 ; GCN-HSA: flat_store_dwordx4
 ; GCN-HSA: flat_store_dwordx4
 
-define void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
   %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
   %ext = sext <32 x i32> %ld to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -511,7 +511,7 @@ define void @global_sextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
 ; GCN-HSA-DAG: flat_store_dwordx4
-define void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i32_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i32> addrspace(1)* %in) #0 {
   %ld = load <32 x i32>, <32 x i32> addrspace(1)* %in
   %ext = zext <32 x i32> %ld to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-global-i64.ll b/test/CodeGen/AMDGPU/load-global-i64.ll
index dd4ce2c10ebd18a6161acb9a048338736649efa2..de16b6c8997ef3850a2f7d2cae4e5e041a8169d0 100644
--- a/test/CodeGen/AMDGPU/load-global-i64.ll
+++ b/test/CodeGen/AMDGPU/load-global-i64.ll
@@ -13,7 +13,7 @@
 ; GCN-HSA: flat_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, [[VAL]]
 
 ; EG: VTX_READ_64
-define void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %ld = load i64, i64 addrspace(1)* %in
   store i64 %ld, i64 addrspace(1)* %out
   ret void
@@ -24,7 +24,7 @@ define void @global_load_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
 ; GCN-HSA: flat_load_dwordx4
 
 ; EG: VTX_READ_128
-define void @global_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) #0 {
 entry:
   %ld = load <2 x i64>, <2 x i64> addrspace(1)* %in
   store <2 x i64> %ld, <2 x i64> addrspace(1)* %out
@@ -40,7 +40,7 @@ entry:
 
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @global_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> addrspace(1)* %in) #0 {
 entry:
   %ld = load <3 x i64>, <3 x i64> addrspace(1)* %in
   store <3 x i64> %ld, <3 x i64> addrspace(1)* %out
@@ -56,7 +56,7 @@ entry:
 
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @global_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
 entry:
   %ld = load <4 x i64>, <4 x i64> addrspace(1)* %in
   store <4 x i64> %ld, <4 x i64> addrspace(1)* %out
@@ -78,7 +78,7 @@ entry:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @global_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8i64(<8 x i64> addrspace(1)* %out, <8 x i64> addrspace(1)* %in) #0 {
 entry:
   %ld = load <8 x i64>, <8 x i64> addrspace(1)* %in
   store <8 x i64> %ld, <8 x i64> addrspace(1)* %out
@@ -112,7 +112,7 @@ entry:
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
 ; EG: VTX_READ_128
-define void @global_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16i64(<16 x i64> addrspace(1)* %out, <16 x i64> addrspace(1)* %in) #0 {
 entry:
   %ld = load <16 x i64>, <16 x i64> addrspace(1)* %in
   store <16 x i64> %ld, <16 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-global-i8.ll b/test/CodeGen/AMDGPU/load-global-i8.ll
index c880700f347b9634fce16d6488eb262b84ccb3bf..71adf090532fc961dd4f819576eb94702391915e 100644
--- a/test/CodeGen/AMDGPU/load-global-i8.ll
+++ b/test/CodeGen/AMDGPU/load-global-i8.ll
@@ -11,7 +11,7 @@
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; TODO: NOT AND
-define void @global_load_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
 entry:
   %ld = load i8, i8 addrspace(1)* %in
   store i8 %ld, i8 addrspace(1)* %out
@@ -23,7 +23,7 @@ entry:
 ; GCN-HSA: flat_load_ushort v
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
 entry:
   %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
   store <2 x i8> %ld, <2 x i8> addrspace(1)* %out
@@ -35,7 +35,7 @@ entry:
 ; GCN-HSA: flat_load_dword v
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v3i8(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
 entry:
   %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
   store <3 x i8> %ld, <3 x i8> addrspace(1)* %out
@@ -47,7 +47,7 @@ entry:
 ; GCN-HSA: flat_load_dword v
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
 entry:
   %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in
   store <4 x i8> %ld, <4 x i8> addrspace(1)* %out
@@ -59,7 +59,7 @@ entry:
 ; GCN-HSA: flat_load_dwordx2
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
 entry:
   %ld = load <8 x i8>, <8 x i8> addrspace(1)* %in
   store <8 x i8> %ld, <8 x i8> addrspace(1)* %out
@@ -72,7 +72,7 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_load_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
 entry:
   %ld = load <16 x i8>, <16 x i8> addrspace(1)* %in
   store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
@@ -84,7 +84,7 @@ entry:
 ; GCN-HSA: flat_load_ubyte
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %a = load i8, i8 addrspace(1)* %in
   %ext = zext i8 %a to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -98,7 +98,7 @@ define void @global_zextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)*
 ; EG: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 8
-define void @global_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %ld = load i8, i8 addrspace(1)* %in
   %ext = sext i8 %ld to i32
   store i32 %ext, i32 addrspace(1)* %out
@@ -108,7 +108,7 @@ define void @global_sextload_i8_to_i32(i32 addrspace(1)* %out, i8 addrspace(1)*
 ; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i32:
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
   %ext = zext <1 x i8> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -120,7 +120,7 @@ define void @global_zextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8
 ; EG: VTX_READ_8 [[DST:T[0-9]\.[XYZW]]], [[DST]], 0, #1
 ; EG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, [[DST]], 0.0, literal
 ; EG: 8
-define void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
   %ext = sext <1 x i8> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(1)* %out
@@ -135,7 +135,7 @@ define void @global_sextload_v1i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i8
 ; TODO: These should use DST, but for some there are redundant MOVs
 ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, literal
 ; EG-DAG: 8
-define void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %ext = zext <2 x i8> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -152,7 +152,7 @@ define void @global_zextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %ext = sext <2 x i8> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
@@ -174,7 +174,7 @@ define void @global_sextload_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i8
 ; EG-DAG: BFE_UINT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, {{.*}}literal
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
 entry:
   %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
   %ext = zext <3 x i8> %ld to <3 x i32>
@@ -207,7 +207,7 @@ entry:
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v3i8_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i8> addrspace(1)* %in) #0 {
 entry:
   %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
   %ext = sext <3 x i8> %ld to <3 x i32>
@@ -227,7 +227,7 @@ entry:
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %ext = zext <4 x i8> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -248,7 +248,7 @@ define void @global_zextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %ext = sext <4 x i8> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(1)* %out
@@ -273,7 +273,7 @@ define void @global_sextload_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i8
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
   %ext = zext <8 x i8> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -300,7 +300,7 @@ define void @global_zextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
   %ext = sext <8 x i8> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(1)* %out
@@ -341,7 +341,7 @@ define void @global_sextload_v8i8_to_v8i32(<8 x i32> addrspace(1)* %out, <8 x i8
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
   %ext = zext <16 x i8> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -384,7 +384,7 @@ define void @global_zextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
   %ext = sext <16 x i8> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(1)* %out
@@ -456,7 +456,7 @@ define void @global_sextload_v16i8_to_v16i32(<16 x i32> addrspace(1)* %out, <16
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
   %ext = zext <32 x i8> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -532,7 +532,7 @@ define void @global_zextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32
 ; EG-DAG: 8
 ; EG-DAG: 8
 ; EG-DAG: 8
-define void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
   %ext = sext <32 x i8> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(1)* %out
@@ -545,7 +545,7 @@ define void @global_sextload_v32i8_to_v32i32(<32 x i32> addrspace(1)* %out, <32
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1
-define void @global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
   %ext = zext <64 x i8> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -558,7 +558,7 @@ define void @global_zextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 16, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 32, #1
 ; EG-DAG: VTX_READ_128 {{T[0-9]+\.XYZW}}, T{{[0-9]+}}.X, 48, #1
-define void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
   %ext = sext <64 x i8> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(1)* %out
@@ -576,7 +576,7 @@ define void @global_sextload_v64i8_to_v64i32(<64 x i32> addrspace(1)* %out, <64
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %a = load i8, i8 addrspace(1)* %in
   %ext = zext i8 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -595,7 +595,7 @@ define void @global_zextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)*
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: Why not 7 ?
 ; EG: 31
-define void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %a = load i8, i8 addrspace(1)* %in
   %ext = sext i8 %a to i64
   store i64 %ext, i64 addrspace(1)* %out
@@ -606,7 +606,7 @@ define void @global_sextload_i8_to_i64(i64 addrspace(1)* %out, i8 addrspace(1)*
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG: MOV {{.*}}, 0.0
-define void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
   %ext = zext <1 x i8> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -619,7 +619,7 @@ define void @global_zextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8
 ; EG: ASHR {{\**}} {{T[0-9]\.[XYZW]}}, {{.*}}, literal
 ; TODO: Why not 7 ?
 ; EG: 31
-define void @global_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
   %ext = sext <1 x i8> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(1)* %out
@@ -629,7 +629,7 @@ define void @global_sextload_v1i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i8
 ; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i64:
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %ext = zext <2 x i8> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -639,7 +639,7 @@ define void @global_zextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8
 ; FUNC-LABEL: {{^}}global_sextload_v2i8_to_v2i64:
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %ext = sext <2 x i8> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
@@ -649,7 +649,7 @@ define void @global_sextload_v2i8_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i8
 ; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i64:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %ext = zext <4 x i8> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -659,7 +659,7 @@ define void @global_zextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8
 ; FUNC-LABEL: {{^}}global_sextload_v4i8_to_v4i64:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %ext = sext <4 x i8> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(1)* %out
@@ -669,7 +669,7 @@ define void @global_sextload_v4i8_to_v4i64(<4 x i64> addrspace(1)* %out, <4 x i8
 ; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i64:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
   %ext = zext <8 x i8> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -679,7 +679,7 @@ define void @global_zextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8
 ; FUNC-LABEL: {{^}}global_sextload_v8i8_to_v8i64:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
   %ext = sext <8 x i8> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(1)* %out
@@ -689,7 +689,7 @@ define void @global_sextload_v8i8_to_v8i64(<8 x i64> addrspace(1)* %out, <8 x i8
 ; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i64:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
   %ext = zext <16 x i8> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -699,7 +699,7 @@ define void @global_zextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 ; FUNC-LABEL: {{^}}global_sextload_v16i8_to_v16i64:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
   %ext = sext <16 x i8> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(1)* %out
@@ -710,7 +710,7 @@ define void @global_sextload_v16i8_to_v16i64(<16 x i64> addrspace(1)* %out, <16
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
   %ext = zext <32 x i8> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -721,7 +721,7 @@ define void @global_zextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
   %ext = sext <32 x i8> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(1)* %out
@@ -729,7 +729,7 @@ define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32
 }
 
 ; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i64:
-; define void @global_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; define amdgpu_kernel void @global_zextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
 ;   %ext = zext <64 x i8> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -737,7 +737,7 @@ define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32
 ; }
 
 ; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i64:
-; define void @global_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; define amdgpu_kernel void @global_sextload_v64i8_to_v64i64(<64 x i64> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
 ;   %ext = sext <64 x i8> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(1)* %out
@@ -752,7 +752,7 @@ define void @global_sextload_v32i8_to_v32i64(<32 x i64> addrspace(1)* %out, <32
 ; GCN-HSA: flat_store_short v{{\[[0-9]+:[0-9]+\]}}, v[[VAL]]
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %a = load i8, i8 addrspace(1)* %in
   %ext = zext i8 %a to i16
   store i16 %ext, i16 addrspace(1)* %out
@@ -768,7 +768,7 @@ define void @global_zextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)*
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %a = load i8, i8 addrspace(1)* %in
   %ext = sext i8 %a to i16
   store i16 %ext, i16 addrspace(1)* %out
@@ -778,7 +778,7 @@ define void @global_sextload_i8_to_i16(i16 addrspace(1)* %out, i8 addrspace(1)*
 ; FUNC-LABEL: {{^}}global_zextload_v1i8_to_v1i16:
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
   %ext = zext <1 x i8> %load to <1 x i16>
   store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
@@ -789,7 +789,7 @@ define void @global_zextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8
 
 ; EG: VTX_READ_8 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8> addrspace(1)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(1)* %in
   %ext = sext <1 x i8> %load to <1 x i16>
   store <1 x i16> %ext, <1 x i16> addrspace(1)* %out
@@ -799,7 +799,7 @@ define void @global_sextload_v1i8_to_v1i16(<1 x i16> addrspace(1)* %out, <1 x i8
 ; FUNC-LABEL: {{^}}global_zextload_v2i8_to_v2i16:
 
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %ext = zext <2 x i8> %load to <2 x i16>
   store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
@@ -811,7 +811,7 @@ define void @global_zextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8
 ; EG: VTX_READ_16 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8> addrspace(1)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %ext = sext <2 x i8> %load to <2 x i16>
   store <2 x i16> %ext, <2 x i16> addrspace(1)* %out
@@ -821,7 +821,7 @@ define void @global_sextload_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i8
 ; FUNC-LABEL: {{^}}global_zextload_v4i8_to_v4i16:
 
 ; EG: VTX_READ_32 T{{[0-9]+}}.X, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %ext = zext <4 x i8> %load to <4 x i16>
   store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
@@ -835,7 +835,7 @@ define void @global_zextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8> addrspace(1)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %ext = sext <4 x i8> %load to <4 x i16>
   store <4 x i16> %ext, <4 x i16> addrspace(1)* %out
@@ -845,7 +845,7 @@ define void @global_sextload_v4i8_to_v4i16(<4 x i16> addrspace(1)* %out, <4 x i8
 ; FUNC-LABEL: {{^}}global_zextload_v8i8_to_v8i16:
 
 ; EG: VTX_READ_64 T{{[0-9]+}}.XY, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
   %ext = zext <8 x i8> %load to <8 x i16>
   store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
@@ -863,7 +863,7 @@ define void @global_zextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8> addrspace(1)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(1)* %in
   %ext = sext <8 x i8> %load to <8 x i16>
   store <8 x i16> %ext, <8 x i16> addrspace(1)* %out
@@ -873,7 +873,7 @@ define void @global_sextload_v8i8_to_v8i16(<8 x i16> addrspace(1)* %out, <8 x i8
 ; FUNC-LABEL: {{^}}global_zextload_v16i8_to_v16i16:
 
 ; EG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
-define void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
   %ext = zext <16 x i8> %load to <16 x i16>
   store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
@@ -899,7 +899,7 @@ define void @global_zextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16 x i8> addrspace(1)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(1)* %in
   %ext = sext <16 x i8> %load to <16 x i16>
   store <16 x i16> %ext, <16 x i16> addrspace(1)* %out
@@ -910,7 +910,7 @@ define void @global_sextload_v16i8_to_v16i16(<16 x i16> addrspace(1)* %out, <16
 
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 0, #1
 ; EG-DAG: VTX_READ_128 T{{[0-9]+}}.XYZW, T{{[0-9]+}}.X, 16, #1
-define void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
   %ext = zext <32 x i8> %load to <32 x i16>
   store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
@@ -953,7 +953,7 @@ define void @global_zextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
 ; EG-DAG: BFE_INT {{[* ]*}}T{{[0-9].[XYZW]}}, {{.*}}, 0.0, literal
-define void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32 x i8> addrspace(1)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(1)* %in
   %ext = sext <32 x i8> %load to <32 x i16>
   store <32 x i16> %ext, <32 x i16> addrspace(1)* %out
@@ -961,7 +961,7 @@ define void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32
 }
 
 ; XFUNC-LABEL: {{^}}global_zextload_v64i8_to_v64i16:
-; define void @global_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; define amdgpu_kernel void @global_zextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
 ;   %ext = zext <64 x i8> %load to <64 x i16>
 ;   store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
@@ -969,7 +969,7 @@ define void @global_sextload_v32i8_to_v32i16(<32 x i16> addrspace(1)* %out, <32
 ; }
 
 ; XFUNC-LABEL: {{^}}global_sextload_v64i8_to_v64i16:
-; define void @global_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
+; define amdgpu_kernel void @global_sextload_v64i8_to_v64i16(<64 x i16> addrspace(1)* %out, <64 x i8> addrspace(1)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(1)* %in
 ;   %ext = sext <64 x i8> %load to <64 x i16>
 ;   store <64 x i16> %ext, <64 x i16> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/load-input-fold.ll b/test/CodeGen/AMDGPU/load-input-fold.ll
index b1899a45bf5693b2fc2b560d50a5bf06175aff7c..0724e09d7ad09189ae1c004a524dec530deb979c 100644
--- a/test/CodeGen/AMDGPU/load-input-fold.ll
+++ b/test/CodeGen/AMDGPU/load-input-fold.ll
@@ -97,15 +97,6 @@ main_body:
 ; Function Attrs: readnone
 declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
 
-; Function Attrs: readonly
-declare float @fabs(float) #2
-
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.rsq(float) #1
-
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.clamp.f32(float, float, float) #1
-
 ; Function Attrs: nounwind readonly
 declare float @llvm.pow.f32(float, float) #3
 
diff --git a/test/CodeGen/AMDGPU/load-local-f32.ll b/test/CodeGen/AMDGPU/load-local-f32.ll
index 77b5e3cf3aed8785d6a4a3d65d4fdd3fe83a4b26..09d7145424de8710d435e3a11ba036ad62c92cd9 100644
--- a/test/CodeGen/AMDGPU/load-local-f32.ll
+++ b/test/CodeGen/AMDGPU/load-local-f32.ll
@@ -7,7 +7,7 @@
 ; GCN: ds_read_b32
 
 ; EG: LDS_READ_RET
-define void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) #0 {
+define amdgpu_kernel void @load_f32_local(float addrspace(1)* %out, float addrspace(3)* %in) #0 {
 entry:
   %tmp0 = load float, float addrspace(3)* %in
   store float %tmp0, float addrspace(1)* %out
@@ -20,7 +20,7 @@ entry:
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @load_v2f32_local(<2 x float> addrspace(1)* %out, <2 x float> addrspace(3)* %in) #0 {
 entry:
   %tmp0 = load <2 x float>, <2 x float> addrspace(3)* %in
   store <2 x float> %tmp0, <2 x float> addrspace(1)* %out
@@ -38,7 +38,7 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v3f32(<3 x float> addrspace(3)* %out, <3 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3f32(<3 x float> addrspace(3)* %out, <3 x float> addrspace(3)* %in) #0 {
 entry:
   %tmp0 = load <3 x float>, <3 x float> addrspace(3)* %in
   store <3 x float> %tmp0, <3 x float> addrspace(3)* %out
@@ -52,7 +52,7 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v4f32(<4 x float> addrspace(3)* %out, <4 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4f32(<4 x float> addrspace(3)* %out, <4 x float> addrspace(3)* %in) #0 {
 entry:
   %tmp0 = load <4 x float>, <4 x float> addrspace(3)* %in
   store <4 x float> %tmp0, <4 x float> addrspace(3)* %out
@@ -71,7 +71,7 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v8f32(<8 x float> addrspace(3)* %out, <8 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8f32(<8 x float> addrspace(3)* %out, <8 x float> addrspace(3)* %in) #0 {
 entry:
   %tmp0 = load <8 x float>, <8 x float> addrspace(3)* %in
   store <8 x float> %tmp0, <8 x float> addrspace(3)* %out
@@ -100,7 +100,7 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v16f32(<16 x float> addrspace(3)* %out, <16 x float> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16f32(<16 x float> addrspace(3)* %out, <16 x float> addrspace(3)* %in) #0 {
 entry:
   %tmp0 = load <16 x float>, <16 x float> addrspace(3)* %in
   store <16 x float> %tmp0, <16 x float> addrspace(3)* %out
diff --git a/test/CodeGen/AMDGPU/load-local-f64.ll b/test/CodeGen/AMDGPU/load-local-f64.ll
index 27d39b7e9d7d8b6a4031bbcfa675760086735cb4..9ad6c087bf2effff3d5061a001f57113c7a5de08 100644
--- a/test/CodeGen/AMDGPU/load-local-f64.ll
+++ b/test/CodeGen/AMDGPU/load-local-f64.ll
@@ -9,7 +9,7 @@
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_f64(double addrspace(3)* %out, double addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_f64(double addrspace(3)* %out, double addrspace(3)* %in) #0 {
   %ld = load double, double addrspace(3)* %in
   store double %ld, double addrspace(3)* %out
   ret void
@@ -22,7 +22,7 @@ define void @local_load_f64(double addrspace(3)* %out, double addrspace(3)* %in)
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v2f64(<2 x double> addrspace(3)* %out, <2 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v2f64(<2 x double> addrspace(3)* %out, <2 x double> addrspace(3)* %in) #0 {
 entry:
   %ld = load <2 x double>, <2 x double> addrspace(3)* %in
   store <2 x double> %ld, <2 x double> addrspace(3)* %out
@@ -39,7 +39,7 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v3f64(<3 x double> addrspace(3)* %out, <3 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3f64(<3 x double> addrspace(3)* %out, <3 x double> addrspace(3)* %in) #0 {
 entry:
   %ld = load <3 x double>, <3 x double> addrspace(3)* %in
   store <3 x double> %ld, <3 x double> addrspace(3)* %out
@@ -59,7 +59,7 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v4f64(<4 x double> addrspace(3)* %out, <4 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4f64(<4 x double> addrspace(3)* %out, <4 x double> addrspace(3)* %in) #0 {
 entry:
   %ld = load <4 x double>, <4 x double> addrspace(3)* %in
   store <4 x double> %ld, <4 x double> addrspace(3)* %out
@@ -88,7 +88,7 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v8f64(<8 x double> addrspace(3)* %out, <8 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8f64(<8 x double> addrspace(3)* %out, <8 x double> addrspace(3)* %in) #0 {
 entry:
   %ld = load <8 x double>, <8 x double> addrspace(3)* %in
   store <8 x double> %ld, <8 x double> addrspace(3)* %out
@@ -144,7 +144,7 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v16f64(<16 x double> addrspace(3)* %out, <16 x double> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16f64(<16 x double> addrspace(3)* %out, <16 x double> addrspace(3)* %in) #0 {
 entry:
   %ld = load <16 x double>, <16 x double> addrspace(3)* %in
   store <16 x double> %ld, <16 x double> addrspace(3)* %out
diff --git a/test/CodeGen/AMDGPU/load-local-i1.ll b/test/CodeGen/AMDGPU/load-local-i1.ll
index 2eed9917b5e5abf0edf9f964b1a7a9f465efb496..e8f134b1fb2ef5c2e6edab17987a6d43091caac2 100644
--- a/test/CodeGen/AMDGPU/load-local-i1.ll
+++ b/test/CodeGen/AMDGPU/load-local-i1.ll
@@ -10,56 +10,56 @@
 ; EG: LDS_UBYTE_READ_RET
 ; EG: AND_INT
 ; EG: LDS_BYTE_WRITE
-define void @local_load_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i1(i1 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
   %load = load i1, i1 addrspace(3)* %in
   store i1 %load, i1 addrspace(3)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v2i1:
-define void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v2i1(<2 x i1> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
   store <2 x i1> %load, <2 x i1> addrspace(3)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v3i1:
-define void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3i1(<3 x i1> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
   store <3 x i1> %load, <3 x i1> addrspace(3)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v4i1:
-define void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4i1(<4 x i1> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
   store <4 x i1> %load, <4 x i1> addrspace(3)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v8i1:
-define void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8i1(<8 x i1> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
   store <8 x i1> %load, <8 x i1> addrspace(3)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v16i1:
-define void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16i1(<16 x i1> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
   store <16 x i1> %load, <16 x i1> addrspace(3)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v32i1:
-define void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v32i1(<32 x i1> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
   store <32 x i1> %load, <32 x i1> addrspace(3)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}local_load_v64i1:
-define void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
   store <64 x i1> %load, <64 x i1> addrspace(3)* %out
   ret void
@@ -68,7 +68,7 @@ define void @local_load_v64i1(<64 x i1> addrspace(3)* %out, <64 x i1> addrspace(
 ; FUNC-LABEL: {{^}}local_zextload_i1_to_i32:
 ; GCN: ds_read_u8
 ; GCN: ds_write_b32
-define void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
   %a = load i1, i1 addrspace(3)* %in
   %ext = zext i1 %a to i32
   store i32 %ext, i32 addrspace(3)* %out
@@ -82,7 +82,7 @@ define void @local_zextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %
 
 ; EG: LDS_UBYTE_READ_RET
 ; EG: BFE_INT
-define void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
   %a = load i1, i1 addrspace(3)* %in
   %ext = sext i1 %a to i32
   store i32 %ext, i32 addrspace(3)* %out
@@ -90,7 +90,7 @@ define void @local_sextload_i1_to_i32(i32 addrspace(3)* %out, i1 addrspace(3)* %
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i32:
-define void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
   %ext = zext <1 x i1> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
@@ -98,7 +98,7 @@ define void @local_zextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i32:
-define void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
   %ext = sext <1 x i1> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
@@ -106,7 +106,7 @@ define void @local_sextload_v1i1_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i32:
-define void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
   %ext = zext <2 x i1> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
@@ -114,7 +114,7 @@ define void @local_zextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i32:
-define void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
   %ext = sext <2 x i1> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
@@ -122,7 +122,7 @@ define void @local_sextload_v2i1_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i32:
-define void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
   %ext = zext <3 x i1> %load to <3 x i32>
   store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
@@ -130,7 +130,7 @@ define void @local_zextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i32:
-define void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
   %ext = sext <3 x i1> %load to <3 x i32>
   store <3 x i32> %ext, <3 x i32> addrspace(3)* %out
@@ -138,7 +138,7 @@ define void @local_sextload_v3i1_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i32:
-define void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
   %ext = zext <4 x i1> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
@@ -146,7 +146,7 @@ define void @local_zextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i32:
-define void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
   %ext = sext <4 x i1> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
@@ -154,7 +154,7 @@ define void @local_sextload_v4i1_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i32:
-define void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
   %ext = zext <8 x i1> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
@@ -162,7 +162,7 @@ define void @local_zextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i32:
-define void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
   %ext = sext <8 x i1> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
@@ -170,7 +170,7 @@ define void @local_sextload_v8i1_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i32:
-define void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
   %ext = zext <16 x i1> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
@@ -178,7 +178,7 @@ define void @local_zextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i32:
-define void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
   %ext = sext <16 x i1> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
@@ -186,7 +186,7 @@ define void @local_sextload_v16i1_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i32:
-define void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
   %ext = zext <32 x i1> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
@@ -194,7 +194,7 @@ define void @local_zextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i32:
-define void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
   %ext = sext <32 x i1> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
@@ -202,7 +202,7 @@ define void @local_sextload_v32i1_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i32:
-define void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
   %ext = zext <64 x i1> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
@@ -210,7 +210,7 @@ define void @local_zextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i32:
-define void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
   %ext = sext <64 x i1> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
@@ -221,7 +221,7 @@ define void @local_sextload_v64i1_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x
 ; GCN-DAG: ds_read_u8 [[LOAD:v[0-9]+]],
 ; GCN-DAG: v_mov_b32_e32 {{v[0-9]+}}, 0{{$}}
 ; GCN: ds_write_b64
-define void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
   %a = load i1, i1 addrspace(3)* %in
   %ext = zext i1 %a to i64
   store i64 %ext, i64 addrspace(3)* %out
@@ -233,7 +233,7 @@ define void @local_zextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %
 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], {{v[0-9]+}}, 0, 1{{$}}
 ; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, [[BFE]]
 ; GCN: ds_write_b64
-define void @local_sextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %in) #0 {
   %a = load i1, i1 addrspace(3)* %in
   %ext = sext i1 %a to i64
   store i64 %ext, i64 addrspace(3)* %out
@@ -241,7 +241,7 @@ define void @local_sextload_i1_to_i64(i64 addrspace(3)* %out, i1 addrspace(3)* %
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v1i1_to_v1i64:
-define void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
   %ext = zext <1 x i1> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -249,7 +249,7 @@ define void @local_zextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v1i1_to_v1i64:
-define void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1> addrspace(3)* %in) #0 {
   %load = load <1 x i1>, <1 x i1> addrspace(3)* %in
   %ext = sext <1 x i1> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -257,7 +257,7 @@ define void @local_sextload_v1i1_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v2i1_to_v2i64:
-define void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
   %ext = zext <2 x i1> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -265,7 +265,7 @@ define void @local_zextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v2i1_to_v2i64:
-define void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1> addrspace(3)* %in) #0 {
   %load = load <2 x i1>, <2 x i1> addrspace(3)* %in
   %ext = sext <2 x i1> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -273,7 +273,7 @@ define void @local_sextload_v2i1_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v3i1_to_v3i64:
-define void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
   %ext = zext <3 x i1> %load to <3 x i64>
   store <3 x i64> %ext, <3 x i64> addrspace(3)* %out
@@ -281,7 +281,7 @@ define void @local_zextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v3i1_to_v3i64:
-define void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1> addrspace(3)* %in) #0 {
   %load = load <3 x i1>, <3 x i1> addrspace(3)* %in
   %ext = sext <3 x i1> %load to <3 x i64>
   store <3 x i64> %ext, <3 x i64> addrspace(3)* %out
@@ -289,7 +289,7 @@ define void @local_sextload_v3i1_to_v3i64(<3 x i64> addrspace(3)* %out, <3 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v4i1_to_v4i64:
-define void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
   %ext = zext <4 x i1> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -297,7 +297,7 @@ define void @local_zextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v4i1_to_v4i64:
-define void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1> addrspace(3)* %in) #0 {
   %load = load <4 x i1>, <4 x i1> addrspace(3)* %in
   %ext = sext <4 x i1> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -305,7 +305,7 @@ define void @local_sextload_v4i1_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v8i1_to_v8i64:
-define void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
   %ext = zext <8 x i1> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -313,7 +313,7 @@ define void @local_zextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v8i1_to_v8i64:
-define void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1> addrspace(3)* %in) #0 {
   %load = load <8 x i1>, <8 x i1> addrspace(3)* %in
   %ext = sext <8 x i1> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -321,7 +321,7 @@ define void @local_sextload_v8i1_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1>
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v16i1_to_v16i64:
-define void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
   %ext = zext <16 x i1> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -329,7 +329,7 @@ define void @local_zextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v16i1_to_v16i64:
-define void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i1> addrspace(3)* %in) #0 {
   %load = load <16 x i1>, <16 x i1> addrspace(3)* %in
   %ext = sext <16 x i1> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -337,7 +337,7 @@ define void @local_sextload_v16i1_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v32i1_to_v32i64:
-define void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
   %ext = zext <32 x i1> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -345,7 +345,7 @@ define void @local_zextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v32i1_to_v32i64:
-define void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i1> addrspace(3)* %in) #0 {
   %load = load <32 x i1>, <32 x i1> addrspace(3)* %in
   %ext = sext <32 x i1> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -353,7 +353,7 @@ define void @local_sextload_v32i1_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v64i1_to_v64i64:
-define void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
   %ext = zext <64 x i1> %load to <64 x i64>
   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
@@ -361,7 +361,7 @@ define void @local_zextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v64i1_to_v64i64:
-define void @local_sextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v64i1_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i1> addrspace(3)* %in) #0 {
   %load = load <64 x i1>, <64 x i1> addrspace(3)* %in
   %ext = sext <64 x i1> %load to <64 x i64>
   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
diff --git a/test/CodeGen/AMDGPU/load-local-i16.ll b/test/CodeGen/AMDGPU/load-local-i16.ll
index d4e86de66aff8fe471d944ac8204405b505c8118..bbbb34e8d3331abfa9f5adfc714ec799ed9b93fe 100644
--- a/test/CodeGen/AMDGPU/load-local-i16.ll
+++ b/test/CodeGen/AMDGPU/load-local-i16.ll
@@ -10,7 +10,7 @@
 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
 ; EG: LDS_SHORT_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_load_i16(i16 addrspace(3)* %out, i16 addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_i16(i16 addrspace(3)* %out, i16 addrspace(3)* %in) {
 entry:
   %ld = load i16, i16 addrspace(3)* %in
   store i16 %ld, i16 addrspace(3)* %out
@@ -25,7 +25,7 @@ entry:
 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_load_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) {
 entry:
   %ld = load <2 x i16>, <2 x i16> addrspace(3)* %in
   store <2 x i16> %ld, <2 x i16> addrspace(3)* %out
@@ -39,7 +39,7 @@ entry:
 
 ; EG-DAG: LDS_USHORT_READ_RET
 ; EG-DAG: LDS_READ_RET
-define void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v3i16(<3 x i16> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
   store <3 x i16> %ld, <3 x i16> addrspace(3)* %out
@@ -51,7 +51,7 @@ entry:
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v4i16(<4 x i16> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v4i16(<4 x i16> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) {
 entry:
   %ld = load <4 x i16>, <4 x i16> addrspace(3)* %in
   store <4 x i16> %ld, <4 x i16> addrspace(3)* %out
@@ -65,7 +65,7 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v8i16(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v8i16(<8 x i16> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) {
 entry:
   %ld = load <8 x i16>, <8 x i16> addrspace(3)* %in
   store <8 x i16> %ld, <8 x i16> addrspace(3)* %out
@@ -86,7 +86,7 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v16i16(<16 x i16> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_load_v16i16(<16 x i16> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) {
 entry:
   %ld = load <16 x i16>, <16 x i16> addrspace(3)* %in
   store <16 x i16> %ld, <16 x i16> addrspace(3)* %out
@@ -102,7 +102,7 @@ entry:
 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
   %a = load i16, i16 addrspace(3)* %in
   %ext = zext i16 %a to i32
   store i32 %ext, i32 addrspace(3)* %out
@@ -121,7 +121,7 @@ define void @local_zextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)*
 ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
 ; EG: 16
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
   %a = load i16, i16 addrspace(3)* %in
   %ext = sext i16 %a to i32
   store i32 %ext, i32 addrspace(3)* %out
@@ -136,7 +136,7 @@ define void @local_sextload_i16_to_i32(i32 addrspace(3)* %out, i16 addrspace(3)*
 ; EG-DAG: MOV {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], OQAP
 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
   %ext = zext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
@@ -153,7 +153,7 @@ define void @local_zextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1
 ; EG-DAG: BFE_INT {{[* ]*}}[[DATA:T[0-9]+\.[XYZW]]], {{.*}}, 0.0, literal
 ; EG: 16
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
   %ext = sext <1 x i16> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
@@ -166,7 +166,7 @@ define void @local_sextload_v1i16_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i1
 ; GCN: ds_read_b32
 
 ; EG: LDS_READ_RET
-define void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
   %ext = zext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
@@ -181,7 +181,7 @@ define void @local_zextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1
 ; EG: LDS_READ_RET
 ; EG: BFE_INT
 ; EG: BFE_INT
-define void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
   %ext = sext <2 x i16> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
@@ -194,7 +194,7 @@ define void @local_sextload_v2i16_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i1
 ; GCN-DAG: ds_write_b64
 
 ; EG: LDS_READ_RET
-define void @local_local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_local_zextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
   %ext = zext <3 x i16> %ld to <3 x i32>
@@ -211,7 +211,7 @@ entry:
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_local_sextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
+define amdgpu_kernel void @local_local_sextload_v3i16_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i16> addrspace(3)* %in) {
 entry:
   %ld = load <3 x i16>, <3 x i16> addrspace(3)* %in
   %ext = sext <3 x i16> %ld to <3 x i32>
@@ -226,7 +226,7 @@ entry:
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
   %ext = zext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
@@ -244,7 +244,7 @@ define void @local_local_zextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
   %ext = sext <4 x i16> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
@@ -258,7 +258,7 @@ define void @local_sextload_v4i16_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i1
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
   %ext = zext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
@@ -280,7 +280,7 @@ define void @local_zextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
   %ext = sext <8 x i16> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
@@ -304,7 +304,7 @@ define void @local_sextload_v8i16_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i1
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
   %ext = zext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
@@ -340,7 +340,7 @@ define void @local_zextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
   %ext = sext <16 x i16> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
@@ -369,7 +369,7 @@ define void @local_sextload_v16i16_to_v16i32(<16 x i32> addrspace(3)* %out, <16
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
   %ext = zext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
@@ -406,7 +406,7 @@ define void @local_zextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
   %ext = sext <32 x i16> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
@@ -471,7 +471,7 @@ define void @local_sextload_v32i16_to_v32i32(<32 x i32> addrspace(3)* %out, <32
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
   %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
   %ext = zext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
@@ -512,7 +512,7 @@ define void @local_zextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
   %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
   %ext = sext <64 x i16> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
@@ -531,7 +531,7 @@ define void @local_sextload_v64i16_to_v64i32(<64 x i32> addrspace(3)* %out, <64
 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
 ; EG-DAG: LDS_WRITE
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
   %a = load i16, i16 addrspace(3)* %in
   %ext = zext i16 %a to i64
   store i64 %ext, i64 addrspace(3)* %out
@@ -558,7 +558,7 @@ define void @local_zextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)*
 ; EG-DAG: LDS_WRITE
 ; EG-DAG: 16
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)* %in) #0 {
   %a = load i16, i16 addrspace(3)* %in
   %ext = sext i16 %a to i64
   store i64 %ext, i64 addrspace(3)* %out
@@ -573,7 +573,7 @@ define void @local_sextload_i16_to_i64(i64 addrspace(3)* %out, i16 addrspace(3)*
 ; EG-DAG: MOV {{[* ]*}}[[TO:T[0-9]+\.[XYZW]]], KC0[2].Y
 ; EG-DAG: LDS_WRITE
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
   %ext = zext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -590,7 +590,7 @@ define void @local_zextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1
 ; EG-DAG: LDS_WRITE
 ; EG-DAG: 16
 ; EG: LDS_WRITE {{\*?}} [[TO]], [[DATA]]
-define void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i16> addrspace(3)* %in) #0 {
   %load = load <1 x i16>, <1 x i16> addrspace(3)* %in
   %ext = sext <1 x i16> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -600,7 +600,7 @@ define void @local_sextload_v1i16_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i1
 ; FUNC-LABEL: {{^}}local_zextload_v2i16_to_v2i64:
 
 ; EG: LDS_READ_RET
-define void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
   %ext = zext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -612,7 +612,7 @@ define void @local_zextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1
 ; EG: LDS_READ_RET
 ; EG-DAG: BFE_INT
 ; EG-DAG: ASHR
-define void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i16> addrspace(3)* %in) #0 {
   %load = load <2 x i16>, <2 x i16> addrspace(3)* %in
   %ext = sext <2 x i16> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -623,7 +623,7 @@ define void @local_sextload_v2i16_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i1
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
   %ext = zext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -638,7 +638,7 @@ define void @local_zextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1
 ; EG-DAG: BFE_INT
 ; EG-DAG: ASHR
 ; EG-DAG: ASHR
-define void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i16> addrspace(3)* %in) #0 {
   %load = load <4 x i16>, <4 x i16> addrspace(3)* %in
   %ext = sext <4 x i16> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -651,7 +651,7 @@ define void @local_sextload_v4i16_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i1
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
   %ext = zext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -672,7 +672,7 @@ define void @local_zextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1
 ; EG-DAG: BFE_INT
 ; EG-DAG: ASHR
 ; EG-DAG: ASHR
-define void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i16> addrspace(3)* %in) #0 {
   %load = load <8 x i16>, <8 x i16> addrspace(3)* %in
   %ext = sext <8 x i16> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -689,7 +689,7 @@ define void @local_sextload_v8i16_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i1
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
   %ext = zext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -722,7 +722,7 @@ define void @local_zextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16
 ; EG-DAG: BFE_INT
 ; EG-DAG: ASHR
 ; EG-DAG: ASHR
-define void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i16> addrspace(3)* %in) #0 {
   %load = load <16 x i16>, <16 x i16> addrspace(3)* %in
   %ext = sext <16 x i16> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -747,7 +747,7 @@ define void @local_sextload_v16i16_to_v16i64(<16 x i64> addrspace(3)* %out, <16
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
   %ext = zext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -804,7 +804,7 @@ define void @local_zextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32
 ; EG-DAG: BFE_INT
 ; EG-DAG: ASHR
 ; EG-DAG: ASHR
-define void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i16> addrspace(3)* %in) #0 {
   %load = load <32 x i16>, <32 x i16> addrspace(3)* %in
   %ext = sext <32 x i16> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -812,7 +812,7 @@ define void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32
 }
 
 ; ; XFUNC-LABEL: {{^}}local_zextload_v64i16_to_v64i64:
-; define void @local_zextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+; define amdgpu_kernel void @local_zextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
 ;   %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
 ;   %ext = zext <64 x i16> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
@@ -820,7 +820,7 @@ define void @local_sextload_v32i16_to_v32i64(<32 x i64> addrspace(3)* %out, <32
 ; }
 
 ; ; XFUNC-LABEL: {{^}}local_sextload_v64i16_to_v64i64:
-; define void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
+; define amdgpu_kernel void @local_sextload_v64i16_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i16> addrspace(3)* %in) #0 {
 ;   %load = load <64 x i16>, <64 x i16> addrspace(3)* %in
 ;   %ext = sext <64 x i16> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
diff --git a/test/CodeGen/AMDGPU/load-local-i32.ll b/test/CodeGen/AMDGPU/load-local-i32.ll
index 280f9658ef8de9717f0657a380ffeb549f9e3e68..86055413d2cf64899e52a14975e230ff25d18f30 100644
--- a/test/CodeGen/AMDGPU/load-local-i32.ll
+++ b/test/CodeGen/AMDGPU/load-local-i32.ll
@@ -9,7 +9,7 @@
 ; GCN: ds_read_b32
 
 ; EG: LDS_READ_RET
-define void @local_load_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i32(i32 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
 entry:
   %ld = load i32, i32 addrspace(3)* %in
   store i32 %ld, i32 addrspace(3)* %out
@@ -18,7 +18,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}local_load_v2i32:
 ; GCN: ds_read_b64
-define void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
 entry:
   %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
   store <2 x i32> %ld, <2 x i32> addrspace(3)* %out
@@ -28,7 +28,7 @@ entry:
 ; FUNC-LABEL: {{^}}local_load_v3i32:
 ; GCN-DAG: ds_read_b64
 ; GCN-DAG: ds_read_b32
-define void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3i32(<3 x i32> addrspace(3)* %out, <3 x i32> addrspace(3)* %in) #0 {
 entry:
   %ld = load <3 x i32>, <3 x i32> addrspace(3)* %in
   store <3 x i32> %ld, <3 x i32> addrspace(3)* %out
@@ -38,7 +38,7 @@ entry:
 ; FUNC-LABEL: {{^}}local_load_v4i32:
 ; GCN: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
 
-define void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
 entry:
   %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
   store <4 x i32> %ld, <4 x i32> addrspace(3)* %out
@@ -48,7 +48,7 @@ entry:
 ; FUNC-LABEL: {{^}}local_load_v8i32:
 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset0:2 offset1:3{{$}}
 ; GCN-DAG: ds_read2_b64 v{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} offset1:1{{$}}
-define void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8i32(<8 x i32> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
 entry:
   %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
   store <8 x i32> %ld, <8 x i32> addrspace(3)* %out
@@ -64,7 +64,7 @@ entry:
 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:4 offset1:5
 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
 ; GCN-DAG: ds_write2_b64 v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}} offset1:1
-define void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16i32(<16 x i32> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
 entry:
   %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
   store <16 x i32> %ld, <16 x i32> addrspace(3)* %out
@@ -72,7 +72,7 @@ entry:
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_i32_to_i64:
-define void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
   %ld = load i32, i32 addrspace(3)* %in
   %ext = zext i32 %ld to i64
   store i64 %ext, i64 addrspace(3)* %out
@@ -80,7 +80,7 @@ define void @local_zextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)*
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_i32_to_i64:
-define void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)* %in) #0 {
   %ld = load i32, i32 addrspace(3)* %in
   %ext = sext i32 %ld to i64
   store i64 %ext, i64 addrspace(3)* %out
@@ -88,7 +88,7 @@ define void @local_sextload_i32_to_i64(i64 addrspace(3)* %out, i32 addrspace(3)*
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v1i32_to_v1i64:
-define void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
   %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
   %ext = zext <1 x i32> %ld to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -96,7 +96,7 @@ define void @local_zextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i3
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v1i32_to_v1i64:
-define void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i32> addrspace(3)* %in) #0 {
   %ld = load <1 x i32>, <1 x i32> addrspace(3)* %in
   %ext = sext <1 x i32> %ld to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -104,7 +104,7 @@ define void @local_sextload_v1i32_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i3
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v2i32_to_v2i64:
-define void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
   %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
   %ext = zext <2 x i32> %ld to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -112,7 +112,7 @@ define void @local_zextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i3
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v2i32_to_v2i64:
-define void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i32> addrspace(3)* %in) #0 {
   %ld = load <2 x i32>, <2 x i32> addrspace(3)* %in
   %ext = sext <2 x i32> %ld to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -120,7 +120,7 @@ define void @local_sextload_v2i32_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i3
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v4i32_to_v4i64:
-define void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
   %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
   %ext = zext <4 x i32> %ld to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -128,7 +128,7 @@ define void @local_zextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i3
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v4i32_to_v4i64:
-define void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i32> addrspace(3)* %in) #0 {
   %ld = load <4 x i32>, <4 x i32> addrspace(3)* %in
   %ext = sext <4 x i32> %ld to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -136,7 +136,7 @@ define void @local_sextload_v4i32_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i3
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v8i32_to_v8i64:
-define void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
   %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
   %ext = zext <8 x i32> %ld to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -144,7 +144,7 @@ define void @local_zextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i3
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v8i32_to_v8i64:
-define void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i32> addrspace(3)* %in) #0 {
   %ld = load <8 x i32>, <8 x i32> addrspace(3)* %in
   %ext = sext <8 x i32> %ld to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -152,7 +152,7 @@ define void @local_sextload_v8i32_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i3
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v16i32_to_v16i64:
-define void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
   %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
   %ext = sext <16 x i32> %ld to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -160,7 +160,7 @@ define void @local_sextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v16i32_to_v16i64
-define void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i32> addrspace(3)* %in) #0 {
   %ld = load <16 x i32>, <16 x i32> addrspace(3)* %in
   %ext = zext <16 x i32> %ld to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -168,7 +168,7 @@ define void @local_zextload_v16i32_to_v16i64(<16 x i64> addrspace(3)* %out, <16
 }
 
 ; FUNC-LABEL: {{^}}local_sextload_v32i32_to_v32i64:
-define void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
   %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
   %ext = sext <32 x i32> %ld to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -176,7 +176,7 @@ define void @local_sextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32
 }
 
 ; FUNC-LABEL: {{^}}local_zextload_v32i32_to_v32i64:
-define void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i32_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i32> addrspace(3)* %in) #0 {
   %ld = load <32 x i32>, <32 x i32> addrspace(3)* %in
   %ext = zext <32 x i32> %ld to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
diff --git a/test/CodeGen/AMDGPU/load-local-i64.ll b/test/CodeGen/AMDGPU/load-local-i64.ll
index 180807df7b9add3186f2e985e5f466004e836622..0c719a9e0bf90599b225fb7e645e3d23c9abc172 100644
--- a/test/CodeGen/AMDGPU/load-local-i64.ll
+++ b/test/CodeGen/AMDGPU/load-local-i64.ll
@@ -9,7 +9,7 @@
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(3)* %in) #0 {
   %ld = load i64, i64 addrspace(3)* %in
   store i64 %ld, i64 addrspace(3)* %out
   ret void
@@ -22,7 +22,7 @@ define void @local_load_i64(i64 addrspace(3)* %out, i64 addrspace(3)* %in) #0 {
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v2i64(<2 x i64> addrspace(3)* %out, <2 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v2i64(<2 x i64> addrspace(3)* %out, <2 x i64> addrspace(3)* %in) #0 {
 entry:
   %ld = load <2 x i64>, <2 x i64> addrspace(3)* %in
   store <2 x i64> %ld, <2 x i64> addrspace(3)* %out
@@ -39,7 +39,7 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> addrspace(3)* %in) #0 {
 entry:
   %ld = load <3 x i64>, <3 x i64> addrspace(3)* %in
   store <3 x i64> %ld, <3 x i64> addrspace(3)* %out
@@ -59,7 +59,7 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v4i64(<4 x i64> addrspace(3)* %out, <4 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4i64(<4 x i64> addrspace(3)* %out, <4 x i64> addrspace(3)* %in) #0 {
 entry:
   %ld = load <4 x i64>, <4 x i64> addrspace(3)* %in
   store <4 x i64> %ld, <4 x i64> addrspace(3)* %out
@@ -88,7 +88,7 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v8i64(<8 x i64> addrspace(3)* %out, <8 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8i64(<8 x i64> addrspace(3)* %out, <8 x i64> addrspace(3)* %in) #0 {
 entry:
   %ld = load <8 x i64>, <8 x i64> addrspace(3)* %in
   store <8 x i64> %ld, <8 x i64> addrspace(3)* %out
@@ -144,7 +144,7 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v16i64(<16 x i64> addrspace(3)* %out, <16 x i64> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16i64(<16 x i64> addrspace(3)* %out, <16 x i64> addrspace(3)* %in) #0 {
 entry:
   %ld = load <16 x i64>, <16 x i64> addrspace(3)* %in
   store <16 x i64> %ld, <16 x i64> addrspace(3)* %out
diff --git a/test/CodeGen/AMDGPU/load-local-i8.ll b/test/CodeGen/AMDGPU/load-local-i8.ll
index 9ffc74213dd513ed055f31d19662257372427b95..731996ec6c4595f8e03a2262888e29f525f3c99f 100644
--- a/test/CodeGen/AMDGPU/load-local-i8.ll
+++ b/test/CodeGen/AMDGPU/load-local-i8.ll
@@ -9,7 +9,7 @@
 ; GCN: ds_read_u8
 
 ; EG: LDS_UBYTE_READ_RET
-define void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i8(i8 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
 entry:
   %ld = load i8, i8 addrspace(3)* %in
   store i8 %ld, i8 addrspace(3)* %out
@@ -22,7 +22,7 @@ entry:
 ; GCN: ds_read_u16
 
 ; EG: LDS_USHORT_READ_RET
-define void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v2i8(<2 x i8> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
 entry:
   %ld = load <2 x i8>, <2 x i8> addrspace(3)* %in
   store <2 x i8> %ld, <2 x i8> addrspace(3)* %out
@@ -33,7 +33,7 @@ entry:
 ; GCN: ds_read_b32
 
 ; EG: DS_READ_RET
-define void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v3i8(<3 x i8> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
 entry:
   %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
   store <3 x i8> %ld, <3 x i8> addrspace(3)* %out
@@ -44,7 +44,7 @@ entry:
 ; GCN: ds_read_b32
 
 ; EG: LDS_READ_RET
-define void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
 entry:
   %ld = load <4 x i8>, <4 x i8> addrspace(3)* %in
   store <4 x i8> %ld, <4 x i8> addrspace(3)* %out
@@ -56,7 +56,7 @@ entry:
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v8i8(<8 x i8> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
 entry:
   %ld = load <8 x i8>, <8 x i8> addrspace(3)* %in
   store <8 x i8> %ld, <8 x i8> addrspace(3)* %out
@@ -71,7 +71,7 @@ entry:
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_v16i8(<16 x i8> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
 entry:
   %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in
   store <16 x i8> %ld, <16 x i8> addrspace(3)* %out
@@ -84,7 +84,7 @@ entry:
 ; GCN: ds_read_u8
 
 ; EG: LDS_UBYTE_READ_RET
-define void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
   %a = load i8, i8 addrspace(3)* %in
   %ext = zext i8 %a to i32
   store i32 %ext, i32 addrspace(3)* %out
@@ -98,7 +98,7 @@ define void @local_zextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %
 
 ; EG: LDS_UBYTE_READ_RET
 ; EG: BFE_INT
-define void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
   %ld = load i8, i8 addrspace(3)* %in
   %ext = sext i8 %ld to i32
   store i32 %ext, i32 addrspace(3)* %out
@@ -108,7 +108,7 @@ define void @local_sextload_i8_to_i32(i32 addrspace(3)* %out, i8 addrspace(3)* %
 ; FUNC-LABEL: {{^}}local_zextload_v1i8_to_v1i32:
 
 ; EG: LDS_UBYTE_READ_RET
-define void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
   %ext = zext <1 x i8> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
@@ -119,7 +119,7 @@ define void @local_zextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8>
 
 ; EG: LDS_UBYTE_READ_RET
 ; EG: BFE_INT
-define void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
   %ext = sext <1 x i8> %load to <1 x i32>
   store <1 x i32> %ext, <1 x i32> addrspace(3)* %out
@@ -130,7 +130,7 @@ define void @local_sextload_v1i8_to_v1i32(<1 x i32> addrspace(3)* %out, <1 x i8>
 ; GCN: ds_read_u16
 
 ; EG: LDS_USHORT_READ_RET
-define void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
   %ext = zext <2 x i8> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
@@ -156,7 +156,7 @@ define void @local_zextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8>
 ; EG: LDS_USHORT_READ_RET
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
   %ext = sext <2 x i8> %load to <2 x i32>
   store <2 x i32> %ext, <2 x i32> addrspace(3)* %out
@@ -172,7 +172,7 @@ define void @local_sextload_v2i8_to_v2i32(<2 x i32> addrspace(3)* %out, <2 x i8>
 ; GCN-DAG: v_and_b32_e32 v{{[0-9]+}}, 0xff,
 
 ; EG: LDS_READ_RET
-define void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
 entry:
   %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
   %ext = zext <3 x i8> %ld to <3 x i32>
@@ -197,7 +197,7 @@ entry:
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v3i8_to_v3i32(<3 x i32> addrspace(3)* %out, <3 x i8> addrspace(3)* %in) #0 {
 entry:
   %ld = load <3 x i8>, <3 x i8> addrspace(3)* %in
   %ext = sext <3 x i8> %ld to <3 x i32>
@@ -214,7 +214,7 @@ entry:
 ; EG-DAG: BFE_UINT
 ; EG-DAG: BFE_UINT
 ; EG-DAG: BFE_UINT
-define void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
   %ext = zext <4 x i8> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
@@ -231,7 +231,7 @@ define void @local_zextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8>
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
   %ext = sext <4 x i8> %load to <4 x i32>
   store <4 x i32> %ext, <4 x i32> addrspace(3)* %out
@@ -248,7 +248,7 @@ define void @local_sextload_v4i8_to_v4i32(<4 x i32> addrspace(3)* %out, <4 x i8>
 ; EG-DAG: BFE_UINT
 ; EG-DAG: BFE_UINT
 ; EG-DAG: BFE_UINT
-define void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
   %ext = zext <8 x i8> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
@@ -267,7 +267,7 @@ define void @local_zextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8>
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
   %ext = sext <8 x i8> %load to <8 x i32>
   store <8 x i32> %ext, <8 x i32> addrspace(3)* %out
@@ -292,7 +292,7 @@ define void @local_sextload_v8i8_to_v8i32(<8 x i32> addrspace(3)* %out, <8 x i8>
 ; EG-DAG: BFE_UINT
 ; EG-DAG: BFE_UINT
 ; EG-DAG: BFE_UINT
-define void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
   %ext = zext <16 x i8> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
@@ -321,7 +321,7 @@ define void @local_zextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
   %ext = sext <16 x i8> %load to <16 x i32>
   store <16 x i32> %ext, <16 x i32> addrspace(3)* %out
@@ -338,7 +338,7 @@ define void @local_sextload_v16i8_to_v16i32(<16 x i32> addrspace(3)* %out, <16 x
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
-define void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
   %ext = zext <32 x i8> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
@@ -355,7 +355,7 @@ define void @local_zextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
-define void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
   %ext = sext <32 x i8> %load to <32 x i32>
   store <32 x i32> %ext, <32 x i32> addrspace(3)* %out
@@ -380,7 +380,7 @@ define void @local_sextload_v32i8_to_v32i32(<32 x i32> addrspace(3)* %out, <32 x
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
-define void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
   %ext = zext <64 x i8> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
@@ -405,7 +405,7 @@ define void @local_zextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
 ; EG-DAG: LDS_READ_RET
-define void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
   %ext = sext <64 x i8> %load to <64 x i32>
   store <64 x i32> %ext, <64 x i32> addrspace(3)* %out
@@ -420,7 +420,7 @@ define void @local_sextload_v64i8_to_v64i32(<64 x i32> addrspace(3)* %out, <64 x
 ; EG: LDS_UBYTE_READ_RET
 ; EG: MOV {{.*}}, literal
 ; EG: 0.0
-define void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
   %a = load i8, i8 addrspace(3)* %in
   %ext = zext i8 %a to i64
   store i64 %ext, i64 addrspace(3)* %out
@@ -437,7 +437,7 @@ define void @local_zextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %
 ; EG: ASHR
 ; TODO: why not 7?
 ; EG: 31
-define void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
   %a = load i8, i8 addrspace(3)* %in
   %ext = sext i8 %a to i64
   store i64 %ext, i64 addrspace(3)* %out
@@ -450,7 +450,7 @@ define void @local_sextload_i8_to_i64(i64 addrspace(3)* %out, i8 addrspace(3)* %
 ; EG: MOV {{.*}}, literal
 ; TODO: merge?
 ; EG: 0.0
-define void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
   %ext = zext <1 x i8> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -463,7 +463,7 @@ define void @local_zextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8>
 ; EG: ASHR
 ; TODO: why not 7?
 ; EG: 31
-define void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
   %ext = sext <1 x i8> %load to <1 x i64>
   store <1 x i64> %ext, <1 x i64> addrspace(3)* %out
@@ -473,7 +473,7 @@ define void @local_sextload_v1i8_to_v1i64(<1 x i64> addrspace(3)* %out, <1 x i8>
 ; FUNC-LABEL: {{^}}local_zextload_v2i8_to_v2i64:
 
 ; EG: LDS_USHORT_READ_RET
-define void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
   %ext = zext <2 x i8> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -485,7 +485,7 @@ define void @local_zextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8>
 ; EG: LDS_USHORT_READ_RET
 ; EG: BFE_INT
 ; EG: BFE_INT
-define void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
   %ext = sext <2 x i8> %load to <2 x i64>
   store <2 x i64> %ext, <2 x i64> addrspace(3)* %out
@@ -495,7 +495,7 @@ define void @local_sextload_v2i8_to_v2i64(<2 x i64> addrspace(3)* %out, <2 x i8>
 ; FUNC-LABEL: {{^}}local_zextload_v4i8_to_v4i64:
 
 ; EG: LDS_READ_RET
-define void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
   %ext = zext <4 x i8> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -505,7 +505,7 @@ define void @local_zextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8>
 ; FUNC-LABEL: {{^}}local_sextload_v4i8_to_v4i64:
 
 ; EG: LDS_READ_RET
-define void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
   %ext = sext <4 x i8> %load to <4 x i64>
   store <4 x i64> %ext, <4 x i64> addrspace(3)* %out
@@ -516,7 +516,7 @@ define void @local_sextload_v4i8_to_v4i64(<4 x i64> addrspace(3)* %out, <4 x i8>
 
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
   %ext = zext <8 x i8> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -536,7 +536,7 @@ define void @local_zextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8>
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
 ; EG-DAG: BFE_INT
-define void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
   %ext = sext <8 x i8> %load to <8 x i64>
   store <8 x i64> %ext, <8 x i64> addrspace(3)* %out
@@ -549,7 +549,7 @@ define void @local_sextload_v8i8_to_v8i64(<8 x i64> addrspace(3)* %out, <8 x i8>
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
   %ext = zext <16 x i8> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -562,7 +562,7 @@ define void @local_zextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
   %ext = sext <16 x i8> %load to <16 x i64>
   store <16 x i64> %ext, <16 x i64> addrspace(3)* %out
@@ -579,7 +579,7 @@ define void @local_sextload_v16i8_to_v16i64(<16 x i64> addrspace(3)* %out, <16 x
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
   %ext = zext <32 x i8> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -596,7 +596,7 @@ define void @local_zextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
 ; EG: LDS_READ_RET
-define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
   %ext = sext <32 x i8> %load to <32 x i64>
   store <32 x i64> %ext, <32 x i64> addrspace(3)* %out
@@ -604,7 +604,7 @@ define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x
 }
 
 ; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i64:
-; define void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; define amdgpu_kernel void @local_zextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
 ;   %ext = zext <64 x i8> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
@@ -612,7 +612,7 @@ define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x
 ; }
 
 ; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i64:
-; define void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; define amdgpu_kernel void @local_sextload_v64i8_to_v64i64(<64 x i64> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
 ;   %ext = sext <64 x i8> %load to <64 x i64>
 ;   store <64 x i64> %ext, <64 x i64> addrspace(3)* %out
@@ -625,7 +625,7 @@ define void @local_sextload_v32i8_to_v32i64(<32 x i64> addrspace(3)* %out, <32 x
 
 ; EG: LDS_UBYTE_READ_RET
 ; EG: LDS_SHORT_WRITE
-define void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
   %a = load i8, i8 addrspace(3)* %in
   %ext = zext i8 %a to i16
   store i16 %ext, i16 addrspace(3)* %out
@@ -639,7 +639,7 @@ define void @local_zextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %
 ; EG: LDS_UBYTE_READ_RET
 ; EG: BFE_INT
 ; EG: LDS_SHORT_WRITE
-define void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %in) #0 {
   %a = load i8, i8 addrspace(3)* %in
   %ext = sext i8 %a to i16
   store i16 %ext, i16 addrspace(3)* %out
@@ -650,7 +650,7 @@ define void @local_sextload_i8_to_i16(i16 addrspace(3)* %out, i8 addrspace(3)* %
 
 ; EG: LDS_UBYTE_READ_RET
 ; EG: LDS_SHORT_WRITE
-define void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
   %ext = zext <1 x i8> %load to <1 x i16>
   store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
@@ -662,7 +662,7 @@ define void @local_zextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8>
 ; EG: LDS_UBYTE_READ_RET
 ; EG: BFE_INT
 ; EG: LDS_SHORT_WRITE
-define void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8> addrspace(3)* %in) #0 {
   %load = load <1 x i8>, <1 x i8> addrspace(3)* %in
   %ext = sext <1 x i8> %load to <1 x i16>
   store <1 x i16> %ext, <1 x i16> addrspace(3)* %out
@@ -673,7 +673,7 @@ define void @local_sextload_v1i8_to_v1i16(<1 x i16> addrspace(3)* %out, <1 x i8>
 
 ; EG: LDS_USHORT_READ_RET
 ; EG: LDS_WRITE
-define void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
   %ext = zext <2 x i8> %load to <2 x i16>
   store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
@@ -686,7 +686,7 @@ define void @local_zextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8>
 ; EG: BFE_INT
 ; EG: BFE_INT
 ; EG: LDS_WRITE
-define void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8> addrspace(3)* %in) #0 {
   %load = load <2 x i8>, <2 x i8> addrspace(3)* %in
   %ext = sext <2 x i8> %load to <2 x i16>
   store <2 x i16> %ext, <2 x i16> addrspace(3)* %out
@@ -698,7 +698,7 @@ define void @local_sextload_v2i8_to_v2i16(<2 x i16> addrspace(3)* %out, <2 x i8>
 ; EG: LDS_READ_RET
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
   %ext = zext <4 x i8> %load to <4 x i16>
   store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
@@ -715,7 +715,7 @@ define void @local_zextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8>
 ; EG-DAG: BFE_INT
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8> addrspace(3)* %in) #0 {
   %load = load <4 x i8>, <4 x i8> addrspace(3)* %in
   %ext = sext <4 x i8> %load to <4 x i16>
   store <4 x i16> %ext, <4 x i16> addrspace(3)* %out
@@ -730,7 +730,7 @@ define void @local_sextload_v4i8_to_v4i16(<4 x i16> addrspace(3)* %out, <4 x i8>
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
   %ext = zext <8 x i8> %load to <8 x i16>
   store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
@@ -754,7 +754,7 @@ define void @local_zextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8>
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8> addrspace(3)* %in) #0 {
   %load = load <8 x i8>, <8 x i8> addrspace(3)* %in
   %ext = sext <8 x i8> %load to <8 x i16>
   store <8 x i16> %ext, <8 x i16> addrspace(3)* %out
@@ -775,7 +775,7 @@ define void @local_sextload_v8i8_to_v8i16(<8 x i16> addrspace(3)* %out, <8 x i8>
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
   %ext = zext <16 x i8> %load to <16 x i16>
   store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
@@ -813,7 +813,7 @@ define void @local_zextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x i8> addrspace(3)* %in) #0 {
   %load = load <16 x i8>, <16 x i8> addrspace(3)* %in
   %ext = sext <16 x i8> %load to <16 x i16>
   store <16 x i16> %ext, <16 x i16> addrspace(3)* %out
@@ -846,7 +846,7 @@ define void @local_sextload_v16i8_to_v16i16(<16 x i16> addrspace(3)* %out, <16 x
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
   %ext = zext <32 x i8> %load to <32 x i16>
   store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
@@ -908,7 +908,7 @@ define void @local_zextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
 ; EG: LDS_WRITE
-define void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x i8> addrspace(3)* %in) #0 {
   %load = load <32 x i8>, <32 x i8> addrspace(3)* %in
   %ext = sext <32 x i8> %load to <32 x i16>
   store <32 x i16> %ext, <32 x i16> addrspace(3)* %out
@@ -916,7 +916,7 @@ define void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x
 }
 
 ; XFUNC-LABEL: {{^}}local_zextload_v64i8_to_v64i16:
-; define void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; define amdgpu_kernel void @local_zextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
 ;   %ext = zext <64 x i8> %load to <64 x i16>
 ;   store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
@@ -924,7 +924,7 @@ define void @local_sextload_v32i8_to_v32i16(<32 x i16> addrspace(3)* %out, <32 x
 ; }
 
 ; XFUNC-LABEL: {{^}}local_sextload_v64i8_to_v64i16:
-; define void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
+; define amdgpu_kernel void @local_sextload_v64i8_to_v64i16(<64 x i16> addrspace(3)* %out, <64 x i8> addrspace(3)* %in) #0 {
 ;   %load = load <64 x i8>, <64 x i8> addrspace(3)* %in
 ;   %ext = sext <64 x i8> %load to <64 x i16>
 ;   store <64 x i16> %ext, <64 x i16> addrspace(3)* %out
diff --git a/test/CodeGen/AMDGPU/load-weird-sizes.ll b/test/CodeGen/AMDGPU/load-weird-sizes.ll
index bc5e4945fb04057c995fe07791ebeb777493c62c..d6162c388b5b103517229c17fb0303eee8f8fa95 100644
--- a/test/CodeGen/AMDGPU/load-weird-sizes.ll
+++ b/test/CodeGen/AMDGPU/load-weird-sizes.ll
@@ -8,7 +8,7 @@
 ; SI: {{flat|buffer}}_load_ubyte
 ; SI: {{flat|buffer}}_load_ushort
 ; SI: {{flat|buffer}}_store_dword
-define void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 {
   %1 = load i24, i24 addrspace(1)* %in
   %2 = zext i24 %1 to i32
   store i32 %2, i32 addrspace(1)* %out
@@ -21,7 +21,7 @@ define void @load_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) #0 {
 
 ; CI-HSA: flat_load_dword [[VAL:v[0-9]+]]
 ; CI-HSA: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, [[VAL]]
-define void @load_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @load_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) #0 {
   %1 = load i25, i25 addrspace(1)* %in
   %2 = zext i25 %1 to i32
   store i32 %2, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/local-64.ll b/test/CodeGen/AMDGPU/local-64.ll
index a7cee43187c18446104bef8b28b1c4ac46043128..bf4a93237bd4cef73f937ee835cf84aaad71a30d 100644
--- a/test/CodeGen/AMDGPU/local-64.ll
+++ b/test/CodeGen/AMDGPU/local-64.ll
@@ -5,7 +5,7 @@
 ; BOTH-LABEL: {{^}}local_i32_load
 ; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}} offset:28
 ; BOTH: buffer_store_dword [[REG]],
-define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %in, i32 7
   %val = load i32, i32 addrspace(3)* %gep, align 4
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -15,7 +15,7 @@ define void @local_i32_load(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounw
 ; BOTH-LABEL: {{^}}local_i32_load_0_offset
 ; BOTH: ds_read_b32 [[REG:v[0-9]+]], v{{[0-9]+}}
 ; BOTH: buffer_store_dword [[REG]],
-define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %in) nounwind {
   %val = load i32, i32 addrspace(3)* %in, align 4
   store i32 %val, i32 addrspace(1)* %out, align 4
   ret void
@@ -25,7 +25,7 @@ define void @local_i32_load_0_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %
 ; BOTH-NOT: ADD
 ; BOTH: ds_read_u8 [[REG:v[0-9]+]], {{v[0-9]+}} offset:65535
 ; BOTH: buffer_store_byte [[REG]],
-define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
   %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65535
   %val = load i8, i8 addrspace(3)* %gep, align 4
   store i8 %val, i8 addrspace(1)* %out, align 4
@@ -40,7 +40,7 @@ define void @local_i8_load_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)
 ; BOTH: v_mov_b32_e32 [[VREGADDR:v[0-9]+]], [[ADDR]]
 ; BOTH: ds_read_u8 [[REG:v[0-9]+]], [[VREGADDR]]
 ; BOTH: buffer_store_byte [[REG]],
-define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %in) nounwind {
   %gep = getelementptr i8, i8 addrspace(3)* %in, i32 65536
   %val = load i8, i8 addrspace(3)* %gep, align 4
   store i8 %val, i8 addrspace(1)* %out, align 4
@@ -51,7 +51,7 @@ define void @local_i8_load_over_i16_max_offset(i8 addrspace(1)* %out, i8 addrspa
 ; BOTH-NOT: ADD
 ; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56
 ; BOTH: buffer_store_dwordx2 [[REG]],
-define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %in, i32 7
   %val = load i64, i64 addrspace(3)* %gep, align 8
   store i64 %val, i64 addrspace(1)* %out, align 8
@@ -61,7 +61,7 @@ define void @local_i64_load(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounw
 ; BOTH-LABEL: {{^}}local_i64_load_0_offset
 ; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
 ; BOTH: buffer_store_dwordx2 [[REG]],
-define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %in) nounwind {
   %val = load i64, i64 addrspace(3)* %in, align 8
   store i64 %val, i64 addrspace(1)* %out, align 8
   ret void
@@ -71,7 +71,7 @@ define void @local_i64_load_0_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %
 ; BOTH-NOT: ADD
 ; BOTH: ds_read_b64 [[REG:v[[0-9]+:[0-9]+]]], v{{[0-9]+}} offset:56
 ; BOTH: buffer_store_dwordx2 [[REG]],
-define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
   %gep = getelementptr double, double addrspace(3)* %in, i32 7
   %val = load double, double addrspace(3)* %gep, align 8
   store double %val, double addrspace(1)* %out, align 8
@@ -81,7 +81,7 @@ define void @local_f64_load(double addrspace(1)* %out, double addrspace(3)* %in)
 ; BOTH-LABEL: {{^}}local_f64_load_0_offset
 ; BOTH: ds_read_b64 [[REG:v\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}
 ; BOTH: buffer_store_dwordx2 [[REG]],
-define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
+define amdgpu_kernel void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace(3)* %in) nounwind {
   %val = load double, double addrspace(3)* %in, align 8
   store double %val, double addrspace(1)* %out, align 8
   ret void
@@ -90,7 +90,7 @@ define void @local_f64_load_0_offset(double addrspace(1)* %out, double addrspace
 ; BOTH-LABEL: {{^}}local_i64_store:
 ; BOTH-NOT: ADD
 ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
-define void @local_i64_store(i64 addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_i64_store(i64 addrspace(3)* %out) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %out, i32 7
   store i64 5678, i64 addrspace(3)* %gep, align 8
   ret void
@@ -99,7 +99,7 @@ define void @local_i64_store(i64 addrspace(3)* %out) nounwind {
 ; BOTH-LABEL: {{^}}local_i64_store_0_offset:
 ; BOTH-NOT: ADD
 ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
   store i64 1234, i64 addrspace(3)* %out, align 8
   ret void
 }
@@ -107,7 +107,7 @@ define void @local_i64_store_0_offset(i64 addrspace(3)* %out) nounwind {
 ; BOTH-LABEL: {{^}}local_f64_store:
 ; BOTH-NOT: ADD
 ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}} offset:56
-define void @local_f64_store(double addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_f64_store(double addrspace(3)* %out) nounwind {
   %gep = getelementptr double, double addrspace(3)* %out, i32 7
   store double 16.0, double addrspace(3)* %gep, align 8
   ret void
@@ -115,7 +115,7 @@ define void @local_f64_store(double addrspace(3)* %out) nounwind {
 
 ; BOTH-LABEL: {{^}}local_f64_store_0_offset
 ; BOTH: ds_write_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}
-define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
   store double 20.0, double addrspace(3)* %out, align 8
   ret void
 }
@@ -124,7 +124,7 @@ define void @local_f64_store_0_offset(double addrspace(3)* %out) nounwind {
 ; BOTH-NOT: ADD
 ; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:14 offset1:15
 ; BOTH: s_endpgm
-define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
   %gep = getelementptr <2 x i64>, <2 x i64> addrspace(3)* %out, i32 7
   store <2 x i64> <i64 5678, i64 5678>, <2 x i64> addrspace(3)* %gep, align 16
   ret void
@@ -134,7 +134,7 @@ define void @local_v2i64_store(<2 x i64> addrspace(3)* %out) nounwind {
 ; BOTH-NOT: ADD
 ; BOTH: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
 ; BOTH: s_endpgm
-define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
   store <2 x i64> <i64 1234, i64 1234>, <2 x i64> addrspace(3)* %out, align 16
   ret void
 }
@@ -144,7 +144,7 @@ define void @local_v2i64_store_0_offset(<2 x i64> addrspace(3)* %out) nounwind {
 ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:30 offset1:31
 ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:28 offset1:29
 ; BOTH: s_endpgm
-define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
   %gep = getelementptr <4 x i64>, <4 x i64> addrspace(3)* %out, i32 7
   store <4 x i64> <i64 5678, i64 5678, i64 5678, i64 5678>, <4 x i64> addrspace(3)* %gep, align 16
   ret void
@@ -155,7 +155,7 @@ define void @local_v4i64_store(<4 x i64> addrspace(3)* %out) nounwind {
 ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset0:2 offset1:3
 ; BOTH-DAG: ds_write2_b64 v{{[0-9]+}}, {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}} offset1:1
 ; BOTH: s_endpgm
-define void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
+define amdgpu_kernel void @local_v4i64_store_0_offset(<4 x i64> addrspace(3)* %out) nounwind {
   store <4 x i64> <i64 1234, i64 1234, i64 1234, i64 1234>, <4 x i64> addrspace(3)* %out, align 16
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/local-atomics.ll b/test/CodeGen/AMDGPU/local-atomics.ll
index 6714a28aa43a4af7066747bfdb2c9c4a19c98749..de029d964b0d57feb2ff62fe0952e0851bc3f3ea 100644
--- a/test/CodeGen/AMDGPU/local-atomics.ll
+++ b/test/CodeGen/AMDGPU/local-atomics.ll
@@ -11,7 +11,7 @@
 ; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -21,7 +21,7 @@ define void @lds_atomic_xchg_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %
 ; EG: LDS_WRXCHG_RET *
 ; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -37,7 +37,7 @@ define void @lds_atomic_xchg_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
 ; GCN: ds_add_rtn_u32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
 ; GCN: buffer_store_dword [[RESULT]],
 ; GCN: s_endpgm
-define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -47,7 +47,7 @@ define void @lds_atomic_add_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 ; EG: LDS_ADD_RET *
 ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -59,7 +59,7 @@ define void @lds_atomic_add_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -73,7 +73,7 @@ define void @lds_atomic_add_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 ad
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]]
 ; GCN: s_endpgm
-define void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -84,7 +84,7 @@ define void @lds_atomic_add1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -96,7 +96,7 @@ define void @lds_atomic_add1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
 ; SI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; CIVI: ds_add_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -109,7 +109,7 @@ define void @lds_atomic_add1_ret_i32_bad_si_offset(i32 addrspace(1)* %out, i32 a
 ; EG: LDS_SUB_RET *
 ; GCN: ds_sub_rtn_u32
 ; GCN: s_endpgm
-define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -119,7 +119,7 @@ define void @lds_atomic_sub_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 ; EG: LDS_SUB_RET *
 ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -131,7 +131,7 @@ define void @lds_atomic_sub_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_sub_rtn_u32  v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]]
 ; GCN: s_endpgm
-define void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -142,7 +142,7 @@ define void @lds_atomic_sub1_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_sub_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, [[ONE]] offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -153,7 +153,7 @@ define void @lds_atomic_sub1_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
 ; EG: LDS_AND_RET *
 ; GCN: ds_and_rtn_b32
 ; GCN: s_endpgm
-define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -163,7 +163,7 @@ define void @lds_atomic_and_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 ; EG: LDS_AND_RET *
 ; GCN: ds_and_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -174,7 +174,7 @@ define void @lds_atomic_and_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 ; EG: LDS_OR_RET *
 ; GCN: ds_or_rtn_b32
 ; GCN: s_endpgm
-define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -184,7 +184,7 @@ define void @lds_atomic_or_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %pt
 ; EG: LDS_OR_RET *
 ; GCN: ds_or_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -195,7 +195,7 @@ define void @lds_atomic_or_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(
 ; EG: LDS_XOR_RET *
 ; GCN: ds_xor_rtn_b32
 ; GCN: s_endpgm
-define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -205,7 +205,7 @@ define void @lds_atomic_xor_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 ; EG: LDS_XOR_RET *
 ; GCN: ds_xor_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -214,7 +214,7 @@ define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 
 ; FIXME: There is no atomic nand instr
 ; XFUNC-LABEL: {{^}}lds_atomic_nand_ret_i32:uction, so we somehow need to expand this.
-; define void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+; define amdgpu_kernel void @lds_atomic_nand_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
 ;   %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
 ;   store i32 %result, i32 addrspace(1)* %out, align 4
 ;   ret void
@@ -224,7 +224,7 @@ define void @lds_atomic_xor_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 ; EG: LDS_MIN_INT_RET *
 ; GCN: ds_min_rtn_i32
 ; GCN: s_endpgm
-define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -234,7 +234,7 @@ define void @lds_atomic_min_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 ; EG: LDS_MIN_INT_RET *
 ; GCN: ds_min_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -245,7 +245,7 @@ define void @lds_atomic_min_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 ; EG: LDS_MAX_INT_RET *
 ; GCN: ds_max_rtn_i32
 ; GCN: s_endpgm
-define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -255,7 +255,7 @@ define void @lds_atomic_max_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %p
 ; EG: LDS_MAX_INT_RET *
 ; GCN: ds_max_rtn_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -266,7 +266,7 @@ define void @lds_atomic_max_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace
 ; EG: LDS_MIN_UINT_RET *
 ; GCN: ds_min_rtn_u32
 ; GCN: s_endpgm
-define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -276,7 +276,7 @@ define void @lds_atomic_umin_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %
 ; EG: LDS_MIN_UINT_RET *
 ; GCN: ds_min_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -287,7 +287,7 @@ define void @lds_atomic_umin_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
 ; EG: LDS_MAX_UINT_RET *
 ; GCN: ds_max_rtn_u32
 ; GCN: s_endpgm
-define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -297,7 +297,7 @@ define void @lds_atomic_umax_ret_i32(i32 addrspace(1)* %out, i32 addrspace(3)* %
 ; EG: LDS_MAX_UINT_RET *
 ; GCN: ds_max_rtn_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
   store i32 %result, i32 addrspace(1)* %out, align 4
@@ -310,7 +310,7 @@ define void @lds_atomic_umax_ret_i32_offset(i32 addrspace(1)* %out, i32 addrspac
 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; GCN: ds_wrxchg_rtn_b32 [[RESULT:v[0-9]+]], [[VPTR]], [[DATA]]
 ; GCN: s_endpgm
-define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -318,7 +318,7 @@ define void @lds_atomic_xchg_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_xchg_noret_i32_offset:
 ; GCN: ds_wrxchg_rtn_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -330,7 +330,7 @@ define void @lds_atomic_xchg_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[SPTR]]
 ; GCN: ds_add_u32 [[VPTR]], [[DATA]]
 ; GCN: s_endpgm
-define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -338,7 +338,7 @@ define void @lds_atomic_add_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_add_noret_i32_offset:
 ; GCN: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -348,7 +348,7 @@ define void @lds_atomic_add_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}}
 ; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -360,7 +360,7 @@ define void @lds_atomic_add_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]]
 ; GCN: s_endpgm
-define void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i32 addrspace(3)* %ptr, i32 1 seq_cst
   ret void
 }
@@ -369,7 +369,7 @@ define void @lds_atomic_add1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_add_u32 v{{[0-9]+}}, [[ONE]] offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i32 addrspace(3)* %gep, i32 1 seq_cst
   ret void
@@ -379,7 +379,7 @@ define void @lds_atomic_add1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 ; SI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}}
 ; CIVI: ds_add_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32 %a, i32 %b) nounwind {
   %sub = sub i32 %a, %b
   %add = add i32 %sub, 4
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 %add
@@ -390,7 +390,7 @@ define void @lds_atomic_add1_noret_i32_bad_si_offset(i32 addrspace(3)* %ptr, i32
 ; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32:
 ; GCN: ds_sub_u32
 ; GCN: s_endpgm
-define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -398,7 +398,7 @@ define void @lds_atomic_sub_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_sub_noret_i32_offset:
 ; GCN: ds_sub_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -408,7 +408,7 @@ define void @lds_atomic_sub_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]]
 ; GCN: s_endpgm
-define void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i32 addrspace(3)* %ptr, i32 1 seq_cst
   ret void
 }
@@ -417,7 +417,7 @@ define void @lds_atomic_sub1_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1{{$}}
 ; GCN: ds_sub_u32 v{{[0-9]+}}, [[ONE]] offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i32 addrspace(3)* %gep, i32 1 seq_cst
   ret void
@@ -426,7 +426,7 @@ define void @lds_atomic_sub1_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32:
 ; GCN: ds_and_b32
 ; GCN: s_endpgm
-define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -434,7 +434,7 @@ define void @lds_atomic_and_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_and_noret_i32_offset:
 ; GCN: ds_and_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -443,7 +443,7 @@ define void @lds_atomic_and_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32:
 ; GCN: ds_or_b32
 ; GCN: s_endpgm
-define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -451,7 +451,7 @@ define void @lds_atomic_or_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_or_noret_i32_offset:
 ; GCN: ds_or_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -460,7 +460,7 @@ define void @lds_atomic_or_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32:
 ; GCN: ds_xor_b32
 ; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -468,7 +468,7 @@ define void @lds_atomic_xor_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_xor_noret_i32_offset:
 ; GCN: ds_xor_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -476,7 +476,7 @@ define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 
 ; FIXME: There is no atomic nand instr
 ; XFUNC-LABEL: {{^}}lds_atomic_nand_noret_i32:uction, so we somehow need to expand this.
-; define void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+; define amdgpu_kernel void @lds_atomic_nand_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 ;   %result = atomicrmw nand i32 addrspace(3)* %ptr, i32 4 seq_cst
 ;   ret void
 ; }
@@ -484,7 +484,7 @@ define void @lds_atomic_xor_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32:
 ; GCN: ds_min_i32
 ; GCN: s_endpgm
-define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -492,7 +492,7 @@ define void @lds_atomic_min_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_min_noret_i32_offset:
 ; GCN: ds_min_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -501,7 +501,7 @@ define void @lds_atomic_min_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32:
 ; GCN: ds_max_i32
 ; GCN: s_endpgm
-define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -509,7 +509,7 @@ define void @lds_atomic_max_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_max_noret_i32_offset:
 ; GCN: ds_max_i32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -518,7 +518,7 @@ define void @lds_atomic_max_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32:
 ; GCN: ds_min_u32
 ; GCN: s_endpgm
-define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -526,7 +526,7 @@ define void @lds_atomic_umin_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_umin_noret_i32_offset:
 ; GCN: ds_min_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
@@ -535,7 +535,7 @@ define void @lds_atomic_umin_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32:
 ; GCN: ds_max_u32
 ; GCN: s_endpgm
-define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i32 addrspace(3)* %ptr, i32 4 seq_cst
   ret void
 }
@@ -543,7 +543,7 @@ define void @lds_atomic_umax_noret_i32(i32 addrspace(3)* %ptr) nounwind {
 ; FUNC-LABEL: {{^}}lds_atomic_umax_noret_i32_offset:
 ; GCN: ds_max_u32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
 ; GCN: s_endpgm
-define void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_noret_i32_offset(i32 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i32, i32 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i32 addrspace(3)* %gep, i32 4 seq_cst
   ret void
diff --git a/test/CodeGen/AMDGPU/local-atomics64.ll b/test/CodeGen/AMDGPU/local-atomics64.ll
index c88917812eda83f63527948b6ddead0cecee15ba..6572a7bcd4fe2396bad62fb7965e33c81be9ec3a 100644
--- a/test/CodeGen/AMDGPU/local-atomics64.ll
+++ b/test/CodeGen/AMDGPU/local-atomics64.ll
@@ -4,7 +4,7 @@
 ; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64:
 ; GCN: ds_wrxchg_rtn_b64
 ; GCN: s_endpgm
-define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -13,7 +13,7 @@ define void @lds_atomic_xchg_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
 ; GCN-LABEL: {{^}}lds_atomic_xchg_ret_i64_offset:
 ; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -23,7 +23,7 @@ define void @lds_atomic_xchg_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
 ; GCN-LABEL: {{^}}lds_atomic_add_ret_i64:
 ; GCN: ds_add_rtn_u64
 ; GCN: s_endpgm
-define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -38,7 +38,7 @@ define void @lds_atomic_add_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 ; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
 ; GCN: buffer_store_dwordx2 [[RESULT]],
 ; GCN: s_endpgm
-define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -51,7 +51,7 @@ define void @lds_atomic_add_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 ; GCN: ds_add_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
 ; GCN: buffer_store_dwordx2 [[RESULT]],
 ; GCN: s_endpgm
-define void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -60,7 +60,7 @@ define void @lds_atomic_add1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
 ; GCN-LABEL: {{^}}lds_atomic_add1_ret_i64_offset:
 ; GCN: ds_add_rtn_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -70,7 +70,7 @@ define void @lds_atomic_add1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
 ; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64:
 ; GCN: ds_sub_rtn_u64
 ; GCN: s_endpgm
-define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -79,7 +79,7 @@ define void @lds_atomic_sub_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 ; GCN-LABEL: {{^}}lds_atomic_sub_ret_i64_offset:
 ; GCN: ds_sub_rtn_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -92,7 +92,7 @@ define void @lds_atomic_sub_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 ; GCN: ds_sub_rtn_u64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
 ; GCN: buffer_store_dwordx2 [[RESULT]],
 ; GCN: s_endpgm
-define void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -101,7 +101,7 @@ define void @lds_atomic_sub1_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
 ; GCN-LABEL: {{^}}lds_atomic_sub1_ret_i64_offset:
 ; GCN: ds_sub_rtn_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -111,7 +111,7 @@ define void @lds_atomic_sub1_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
 ; GCN-LABEL: {{^}}lds_atomic_and_ret_i64:
 ; GCN: ds_and_rtn_b64
 ; GCN: s_endpgm
-define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -120,7 +120,7 @@ define void @lds_atomic_and_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 ; GCN-LABEL: {{^}}lds_atomic_and_ret_i64_offset:
 ; GCN: ds_and_rtn_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -130,7 +130,7 @@ define void @lds_atomic_and_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 ; GCN-LABEL: {{^}}lds_atomic_or_ret_i64:
 ; GCN: ds_or_rtn_b64
 ; GCN: s_endpgm
-define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -139,7 +139,7 @@ define void @lds_atomic_or_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %pt
 ; GCN-LABEL: {{^}}lds_atomic_or_ret_i64_offset:
 ; GCN: ds_or_rtn_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -149,7 +149,7 @@ define void @lds_atomic_or_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(
 ; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64:
 ; GCN: ds_xor_rtn_b64
 ; GCN: s_endpgm
-define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -158,7 +158,7 @@ define void @lds_atomic_xor_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 ; GCN-LABEL: {{^}}lds_atomic_xor_ret_i64_offset:
 ; GCN: ds_xor_rtn_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -167,7 +167,7 @@ define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 
 ; FIXME: There is no atomic nand instr
 ; XGCN-LABEL: {{^}}lds_atomic_nand_ret_i64:uction, so we somehow need to expand this.
-; define void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+; define amdgpu_kernel void @lds_atomic_nand_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
 ;   %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst
 ;   store i64 %result, i64 addrspace(1)* %out, align 8
 ;   ret void
@@ -176,7 +176,7 @@ define void @lds_atomic_xor_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 ; GCN-LABEL: {{^}}lds_atomic_min_ret_i64:
 ; GCN: ds_min_rtn_i64
 ; GCN: s_endpgm
-define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -185,7 +185,7 @@ define void @lds_atomic_min_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 ; GCN-LABEL: {{^}}lds_atomic_min_ret_i64_offset:
 ; GCN: ds_min_rtn_i64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -195,7 +195,7 @@ define void @lds_atomic_min_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 ; GCN-LABEL: {{^}}lds_atomic_max_ret_i64:
 ; GCN: ds_max_rtn_i64
 ; GCN: s_endpgm
-define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -204,7 +204,7 @@ define void @lds_atomic_max_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %p
 ; GCN-LABEL: {{^}}lds_atomic_max_ret_i64_offset:
 ; GCN: ds_max_rtn_i64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -214,7 +214,7 @@ define void @lds_atomic_max_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace
 ; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64:
 ; GCN: ds_min_rtn_u64
 ; GCN: s_endpgm
-define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -223,7 +223,7 @@ define void @lds_atomic_umin_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
 ; GCN-LABEL: {{^}}lds_atomic_umin_ret_i64_offset:
 ; GCN: ds_min_rtn_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -233,7 +233,7 @@ define void @lds_atomic_umin_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
 ; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64:
 ; GCN: ds_max_rtn_u64
 ; GCN: s_endpgm
-define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -242,7 +242,7 @@ define void @lds_atomic_umax_ret_i64(i64 addrspace(1)* %out, i64 addrspace(3)* %
 ; GCN-LABEL: {{^}}lds_atomic_umax_ret_i64_offset:
 ; GCN: ds_max_rtn_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
   store i64 %result, i64 addrspace(1)* %out, align 8
@@ -252,7 +252,7 @@ define void @lds_atomic_umax_ret_i64_offset(i64 addrspace(1)* %out, i64 addrspac
 ; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64:
 ; GCN: ds_wrxchg_rtn_b64
 ; GCN: s_endpgm
-define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xchg i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -260,7 +260,7 @@ define void @lds_atomic_xchg_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_xchg_noret_i64_offset:
 ; GCN: ds_wrxchg_rtn_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xchg i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
@@ -269,7 +269,7 @@ define void @lds_atomic_xchg_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_add_noret_i64:
 ; GCN: ds_add_u64
 ; GCN: s_endpgm
-define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -282,7 +282,7 @@ define void @lds_atomic_add_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ; GCN: v_mov_b32_e32 [[VPTR:v[0-9]+]], [[PTR]]
 ; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i64 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 9 seq_cst
   ret void
@@ -293,7 +293,7 @@ define void @lds_atomic_add_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-DAG: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
 ; GCN: ds_add_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
 ; GCN: s_endpgm
-define void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw add i64 addrspace(3)* %ptr, i64 1 seq_cst
   ret void
 }
@@ -301,7 +301,7 @@ define void @lds_atomic_add1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_add1_noret_i64_offset:
 ; GCN: ds_add_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw add i64 addrspace(3)* %gep, i64 1 seq_cst
   ret void
@@ -310,7 +310,7 @@ define void @lds_atomic_add1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64:
 ; GCN: ds_sub_u64
 ; GCN: s_endpgm
-define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -318,7 +318,7 @@ define void @lds_atomic_sub_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_sub_noret_i64_offset:
 ; GCN: ds_sub_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
@@ -329,7 +329,7 @@ define void @lds_atomic_sub_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 ; GCN: v_mov_b32_e32 v[[HIVDATA:[0-9]+]], 0{{$}}
 ; GCN: ds_sub_u64 [[VPTR]], v{{\[}}[[LOVDATA]]:[[HIVDATA]]{{\]}}
 ; GCN: s_endpgm
-define void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw sub i64 addrspace(3)* %ptr, i64 1 seq_cst
   ret void
 }
@@ -337,7 +337,7 @@ define void @lds_atomic_sub1_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_sub1_noret_i64_offset:
 ; GCN: ds_sub_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw sub i64 addrspace(3)* %gep, i64 1 seq_cst
   ret void
@@ -346,7 +346,7 @@ define void @lds_atomic_sub1_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_and_noret_i64:
 ; GCN: ds_and_b64
 ; GCN: s_endpgm
-define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw and i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -354,7 +354,7 @@ define void @lds_atomic_and_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_and_noret_i64_offset:
 ; GCN: ds_and_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw and i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
@@ -363,7 +363,7 @@ define void @lds_atomic_and_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_or_noret_i64:
 ; GCN: ds_or_b64
 ; GCN: s_endpgm
-define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw or i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -371,7 +371,7 @@ define void @lds_atomic_or_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_or_noret_i64_offset:
 ; GCN: ds_or_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw or i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
@@ -380,7 +380,7 @@ define void @lds_atomic_or_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64:
 ; GCN: ds_xor_b64
 ; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw xor i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -388,7 +388,7 @@ define void @lds_atomic_xor_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_xor_noret_i64_offset:
 ; GCN: ds_xor_b64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw xor i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
@@ -396,7 +396,7 @@ define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 
 ; FIXME: There is no atomic nand instr
 ; XGCN-LABEL: {{^}}lds_atomic_nand_noret_i64:uction, so we somehow need to expand this.
-; define void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+; define amdgpu_kernel void @lds_atomic_nand_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ;   %result = atomicrmw nand i64 addrspace(3)* %ptr, i32 4 seq_cst
 ;   ret void
 ; }
@@ -404,7 +404,7 @@ define void @lds_atomic_xor_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_min_noret_i64:
 ; GCN: ds_min_i64
 ; GCN: s_endpgm
-define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw min i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -412,7 +412,7 @@ define void @lds_atomic_min_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_min_noret_i64_offset:
 ; GCN: ds_min_i64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw min i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
@@ -421,7 +421,7 @@ define void @lds_atomic_min_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_max_noret_i64:
 ; GCN: ds_max_i64
 ; GCN: s_endpgm
-define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw max i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -429,7 +429,7 @@ define void @lds_atomic_max_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_max_noret_i64_offset:
 ; GCN: ds_max_i64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw max i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
@@ -438,7 +438,7 @@ define void @lds_atomic_max_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64:
 ; GCN: ds_min_u64
 ; GCN: s_endpgm
-define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umin i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -446,7 +446,7 @@ define void @lds_atomic_umin_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_umin_noret_i64_offset:
 ; GCN: ds_min_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umin i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
@@ -455,7 +455,7 @@ define void @lds_atomic_umin_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64:
 ; GCN: ds_max_u64
 ; GCN: s_endpgm
-define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
   %result = atomicrmw umax i64 addrspace(3)* %ptr, i64 4 seq_cst
   ret void
 }
@@ -463,7 +463,7 @@ define void @lds_atomic_umax_noret_i64(i64 addrspace(3)* %ptr) nounwind {
 ; GCN-LABEL: {{^}}lds_atomic_umax_noret_i64_offset:
 ; GCN: ds_max_u64 {{.*}} offset:32
 ; GCN: s_endpgm
-define void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
+define amdgpu_kernel void @lds_atomic_umax_noret_i64_offset(i64 addrspace(3)* %ptr) nounwind {
   %gep = getelementptr i64, i64 addrspace(3)* %ptr, i32 4
   %result = atomicrmw umax i64 addrspace(3)* %gep, i64 4 seq_cst
   ret void
diff --git a/test/CodeGen/AMDGPU/local-memory.amdgcn.ll b/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
index a57e4f5953226dadcb0bfad32426dc3e7fca9da3..4ce9208eaddcf8ca5a38535064d7e404862e6b9e 100644
--- a/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
+++ b/test/CodeGen/AMDGPU/local-memory.amdgcn.ll
@@ -17,7 +17,7 @@
 ; GCN: s_barrier
 
 ; GCN: ds_read_b32 {{v[0-9]+}},
-define void @local_memory(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out) #0 {
 entry:
   %y.i = call i32 @llvm.amdgcn.workitem.id.x() #1
   %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
@@ -61,7 +61,7 @@ entry:
 
 ; CI: v_sub_i32_e32 [[SUB:v[0-9]+]], vcc, 0, [[ADDRW]]
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, [[SUB]] offset0:3 offset1:7
-define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
 entry:
   %x.i = call i32 @llvm.amdgcn.workitem.id.x()
   %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
diff --git a/test/CodeGen/AMDGPU/local-memory.ll b/test/CodeGen/AMDGPU/local-memory.ll
index 1a11332f865d0acaaf6fb0e7132d88e56f5b9538..6124237d76383524adf75e6946e20533378d89fb 100644
--- a/test/CodeGen/AMDGPU/local-memory.ll
+++ b/test/CodeGen/AMDGPU/local-memory.ll
@@ -14,7 +14,7 @@
 ; GCN: ds_read_b32 v{{[0-9]+}}, v[[ZERO]] offset:4
 
 ; R600: LDS_READ_RET
-define void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @load_i32_local_const_ptr(i32 addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
 entry:
   %tmp0 = getelementptr [512 x i32], [512 x i32] addrspace(3)* @lds, i32 0, i32 1
   %tmp1 = load i32, i32 addrspace(3)* %tmp0
@@ -30,7 +30,7 @@ entry:
 ; R600: LDS_READ_RET
 ; GCN-DAG: ds_read_b32
 ; GCN-DAG: ds_read2_b32
-define void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @load_i32_v2i32_local(<2 x i32> addrspace(1)* %out, i32 addrspace(3)* %in) #0 {
   %scalar = load i32, i32 addrspace(3)* %in
   %tmp0 = bitcast i32 addrspace(3)* %in to <2 x i32> addrspace(3)*
   %vec_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(3)* %tmp0, i32 2
diff --git a/test/CodeGen/AMDGPU/local-memory.r600.ll b/test/CodeGen/AMDGPU/local-memory.r600.ll
index 9841b8882b397b6d7b4930213122c12a3ec329a1..c8f4e4c986a76c0acb8f6cfe59304f1037796a12 100644
--- a/test/CodeGen/AMDGPU/local-memory.r600.ll
+++ b/test/CodeGen/AMDGPU/local-memory.r600.ll
@@ -15,7 +15,7 @@
 ; EG-NEXT: ALU clause
 
 ; EG: LDS_READ_RET
-define void @local_memory(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out) #0 {
 entry:
   %y.i = call i32 @llvm.r600.read.tidig.x() #1
   %arrayidx = getelementptr inbounds [128 x i32], [128 x i32] addrspace(3)* @local_memory.local_mem, i32 0, i32 %y.i
@@ -57,7 +57,7 @@ entry:
 ; EG: LDS_READ_RET {{[*]*}} OQAP, {{PV|T}}[[ADDRR:[0-9]*\.[XYZW]]]
 ; EG-NOT: LDS_READ_RET {{[*]*}} OQAP, T[[ADDRR]]
 
-define void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @local_memory_two_objects(i32 addrspace(1)* %out) #0 {
 entry:
   %x.i = call i32 @llvm.r600.read.tidig.x() #1
   %arrayidx = getelementptr inbounds [4 x i32], [4 x i32] addrspace(3)* @local_memory_two_objects.local_mem0, i32 0, i32 %x.i
diff --git a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
index dc43e8613ddf76d207b99736ee955bc4e213ad5d..d3e0f0be4b5f32e45bf8d03655ad6bda24864aa2 100644
--- a/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
+++ b/test/CodeGen/AMDGPU/local-stack-slot-bug.ll
@@ -8,13 +8,12 @@
 ; CHECK-LABEL: {{^}}main:
 
 ; CHECK-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x200
-; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
+; CHECK-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0x400{{$}}
 ; CHECK-DAG: v_lshlrev_b32_e32 [[BYTES:v[0-9]+]], 2, v0
 ; CHECK-DAG: v_and_b32_e32 [[CLAMP_IDX:v[0-9]+]], 0x1fc, [[BYTES]]
 
-; TODO: add 0?
-; CHECK-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]]
-; CHECK-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]]
+; CHECK-DAG: v_or_b32_e32 [[LO_OFF:v[0-9]+]], [[CLAMP_IDX]], [[K]]
+; CHECK-DAG: v_or_b32_e32 [[HI_OFF:v[0-9]+]], [[CLAMP_IDX]], [[ZERO]]
 
 ; CHECK: buffer_load_dword {{v[0-9]+}}, [[LO_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
 ; CHECK: buffer_load_dword {{v[0-9]+}}, [[HI_OFF]], {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offen
diff --git a/test/CodeGen/AMDGPU/loop-address.ll b/test/CodeGen/AMDGPU/loop-address.ll
index f60d574497deee8ea2875cdda78ae9f1b9ed60c3..e25d4f4b4f5f23af77464c8d08d64111bd900bba 100644
--- a/test/CodeGen/AMDGPU/loop-address.ll
+++ b/test/CodeGen/AMDGPU/loop-address.ll
@@ -5,7 +5,7 @@
 ;CHECK: LOOP_BREAK @10
 ;CHECK: POP @10
 
-define void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 {
+define amdgpu_kernel void @loop_ge(i32 addrspace(1)* nocapture %out, i32 %iterations) #0 {
 entry:
   %cmp5 = icmp sgt i32 %iterations, 0
   br i1 %cmp5, label %for.body, label %for.end
diff --git a/test/CodeGen/AMDGPU/loop-idiom.ll b/test/CodeGen/AMDGPU/loop-idiom.ll
index 5fd9806813cd6207188c07db5d4d744439157c0e..23ddd6488af99c766f777d11ebb3539a37be0a79 100644
--- a/test/CodeGen/AMDGPU/loop-idiom.ll
+++ b/test/CodeGen/AMDGPU/loop-idiom.ll
@@ -9,7 +9,7 @@
 ; FUNC: @no_memcpy
 ; R600-NOT: {{^}}llvm.memcpy
 ; SI-NOT: {{^}}llvm.memcpy
-define void @no_memcpy(i8 addrspace(3)* %in, i32 %size) {
+define amdgpu_kernel void @no_memcpy(i8 addrspace(3)* %in, i32 %size) {
 entry:
   %dest = alloca i8, i32 32
   br label %for.body
@@ -33,7 +33,7 @@ for.end:
 ; R600-NOT: {{^}}memset_pattern16:
 ; SI-NOT: {{^}}llvm.memset
 ; SI-NOT: {{^}}memset_pattern16:
-define void @no_memset(i32 %size) {
+define amdgpu_kernel void @no_memset(i32 %size) {
 entry:
   %dest = alloca i8, i32 32
   br label %for.body
diff --git a/test/CodeGen/AMDGPU/loop_break.ll b/test/CodeGen/AMDGPU/loop_break.ll
index 82564b8bb28d0c5dcb0ad217ca59612bf9ec4ad5..492472155ee6bce776ab86718c782076e9abbb94 100644
--- a/test/CodeGen/AMDGPU/loop_break.ll
+++ b/test/CodeGen/AMDGPU/loop_break.ll
@@ -43,7 +43,7 @@
 ; GCN: ; BB#4: ; %bb9
 ; GCN-NEXT: s_or_b64 exec, exec, [[MASK]]
 ; GCN-NEXT: s_endpgm
-define void @break_loop(i32 %arg) #0 {
+define amdgpu_kernel void @break_loop(i32 %arg) #0 {
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp = sub i32 %id, %arg
@@ -64,6 +64,264 @@ bb9:
   ret void
 }
 
+; OPT-LABEL: @undef_phi_cond_break_loop(
+; OPT: bb1:
+; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ]
+; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+; OPT: %0 = call i64 @llvm.amdgcn.if.break(i1 undef, i64 %phi.broken)
+; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow
+
+; OPT: bb4:
+; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load
+; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken)
+; OPT-NEXT: br label %Flow
+
+; OPT: Flow:
+; OPT-NEXT: %loop.phi = phi i64 [ %1, %bb4 ], [ %0, %bb1 ]
+; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
+; OPT-NEXT: br i1 %2, label %bb9, label %bb1
+
+; OPT: bb9:                                              ; preds = %Flow
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; OPT-NEXT: store volatile i32 7
+; OPT-NEXT: ret void
+define amdgpu_kernel void @undef_phi_cond_break_loop(i32 %arg) #0 {
+bb:
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp = sub i32 %id, %arg
+  br label %bb1
+
+bb1:                                              ; preds = %Flow, %bb
+  %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+  %lsr.iv.next = add i32 %lsr.iv, 1
+  %cmp0 = icmp slt i32 %lsr.iv.next, 0
+  br i1 %cmp0, label %bb4, label %Flow
+
+bb4:                                              ; preds = %bb1
+  %load = load volatile i32, i32 addrspace(1)* undef, align 4
+  %cmp1 = icmp sge i32 %tmp, %load
+  br label %Flow
+
+Flow:                                             ; preds = %bb4, %bb1
+  %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+  %tmp3 = phi i1 [ %cmp1, %bb4 ], [ undef, %bb1 ]
+  br i1 %tmp3, label %bb9, label %bb1
+
+bb9:                                              ; preds = %Flow
+  store volatile i32 7, i32 addrspace(3)* undef
+  ret void
+}
+
+; FIXME: ConstantExpr compare of address to null folds away
+@lds = addrspace(3) global i32 undef
+
+; OPT-LABEL: @constexpr_phi_cond_break_loop(
+; OPT: bb1:
+; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ]
+; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+; OPT: %0 = call i64 @llvm.amdgcn.if.break(i1 icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds), i64 %phi.broken)
+; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow
+
+; OPT: bb4:
+; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load
+; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken)
+; OPT-NEXT: br label %Flow
+
+; OPT: Flow:
+; OPT-NEXT: %loop.phi = phi i64 [ %1, %bb4 ], [ %0, %bb1 ]
+; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
+; OPT-NEXT: br i1 %2, label %bb9, label %bb1
+
+; OPT: bb9:                                              ; preds = %Flow
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; OPT-NEXT: store volatile i32 7
+; OPT-NEXT: ret void
+define amdgpu_kernel void @constexpr_phi_cond_break_loop(i32 %arg) #0 {
+bb:
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp = sub i32 %id, %arg
+  br label %bb1
+
+bb1:                                              ; preds = %Flow, %bb
+  %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+  %lsr.iv.next = add i32 %lsr.iv, 1
+  %cmp0 = icmp slt i32 %lsr.iv.next, 0
+  br i1 %cmp0, label %bb4, label %Flow
+
+bb4:                                              ; preds = %bb1
+  %load = load volatile i32, i32 addrspace(1)* undef, align 4
+  %cmp1 = icmp sge i32 %tmp, %load
+  br label %Flow
+
+Flow:                                             ; preds = %bb4, %bb1
+  %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+  %tmp3 = phi i1 [ %cmp1, %bb4 ], [ icmp ne (i32 addrspace(3)* inttoptr (i32 4 to i32 addrspace(3)*), i32 addrspace(3)* @lds), %bb1 ]
+  br i1 %tmp3, label %bb9, label %bb1
+
+bb9:                                              ; preds = %Flow
+  store volatile i32 7, i32 addrspace(3)* undef
+  ret void
+}
+
+; OPT-LABEL: @true_phi_cond_break_loop(
+; OPT: bb1:
+; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ]
+; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+; OPT: %0 = call i64 @llvm.amdgcn.break(i64 %phi.broken)
+; OPT: br i1 %cmp0, label %bb4, label %Flow
+
+; OPT: bb4:
+; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load
+; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken)
+; OPT-NEXT: br label %Flow
+
+; OPT: Flow:
+; OPT-NEXT: %loop.phi = phi i64 [ %1, %bb4 ], [ %0, %bb1 ]
+; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
+; OPT-NEXT: br i1 %2, label %bb9, label %bb1
+
+; OPT: bb9:                                              ; preds = %Flow
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; OPT-NEXT: store volatile i32 7
+; OPT-NEXT: ret void
+define amdgpu_kernel void @true_phi_cond_break_loop(i32 %arg) #0 {
+bb:
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp = sub i32 %id, %arg
+  br label %bb1
+
+bb1:                                              ; preds = %Flow, %bb
+  %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+  %lsr.iv.next = add i32 %lsr.iv, 1
+  %cmp0 = icmp slt i32 %lsr.iv.next, 0
+  br i1 %cmp0, label %bb4, label %Flow
+
+bb4:                                              ; preds = %bb1
+  %load = load volatile i32, i32 addrspace(1)* undef, align 4
+  %cmp1 = icmp sge i32 %tmp, %load
+  br label %Flow
+
+Flow:                                             ; preds = %bb4, %bb1
+  %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+  %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ]
+  br i1 %tmp3, label %bb9, label %bb1
+
+bb9:                                              ; preds = %Flow
+  store volatile i32 7, i32 addrspace(3)* undef
+  ret void
+}
+
+; OPT-LABEL: @false_phi_cond_break_loop(
+; OPT: bb1:
+; OPT-NEXT: %phi.broken = phi i64 [ %loop.phi, %Flow ], [ 0, %bb ]
+; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+; OPT-NOT: call
+; OPT: br i1 %cmp0, label %bb4, label %Flow
+
+; OPT: bb4:
+; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load
+; OPT-NEXT: %0 = call i64 @llvm.amdgcn.if.break(i1 %cmp1, i64 %phi.broken)
+; OPT-NEXT: br label %Flow
+
+; OPT: Flow:
+; OPT-NEXT: %loop.phi = phi i64 [ %0, %bb4 ], [ %phi.broken, %bb1 ]
+; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+; OPT-NEXT: %1 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
+; OPT-NEXT: br i1 %1, label %bb9, label %bb1
+
+; OPT: bb9:                                              ; preds = %Flow
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; OPT-NEXT: store volatile i32 7
+; OPT-NEXT: ret void
+define amdgpu_kernel void @false_phi_cond_break_loop(i32 %arg) #0 {
+bb:
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp = sub i32 %id, %arg
+  br label %bb1
+
+bb1:                                              ; preds = %Flow, %bb
+  %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+  %lsr.iv.next = add i32 %lsr.iv, 1
+  %cmp0 = icmp slt i32 %lsr.iv.next, 0
+  br i1 %cmp0, label %bb4, label %Flow
+
+bb4:                                              ; preds = %bb1
+  %load = load volatile i32, i32 addrspace(1)* undef, align 4
+  %cmp1 = icmp sge i32 %tmp, %load
+  br label %Flow
+
+Flow:                                             ; preds = %bb4, %bb1
+  %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+  %tmp3 = phi i1 [ %cmp1, %bb4 ], [ false, %bb1 ]
+  br i1 %tmp3, label %bb9, label %bb1
+
+bb9:                                              ; preds = %Flow
+  store volatile i32 7, i32 addrspace(3)* undef
+  ret void
+}
+
+; Swap order of branches in flow block so that the true phi is
+; continue.
+
+; OPT-LABEL: @invert_true_phi_cond_break_loop(
+; OPT: bb1:
+; OPT-NEXT: %phi.broken = phi i64 [ %1, %Flow ], [ 0, %bb ]
+; OPT-NEXT: %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+; OPT-NEXT: %lsr.iv.next = add i32 %lsr.iv, 1
+; OPT-NEXT: %cmp0 = icmp slt i32 %lsr.iv.next, 0
+; OPT-NEXT: br i1 %cmp0, label %bb4, label %Flow
+
+; OPT: bb4:
+; OPT-NEXT: %load = load volatile i32, i32 addrspace(1)* undef, align 4
+; OPT-NEXT: %cmp1 = icmp sge i32 %tmp, %load
+; OPT-NEXT: br label %Flow
+
+; OPT: Flow:
+; OPT-NEXT: %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+; OPT-NEXT: %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ]
+; OPT-NEXT: %0 = xor i1 %tmp3, true
+; OPT-NEXT: %1 = call i64 @llvm.amdgcn.if.break(i1 %0, i64 %phi.broken)
+; OPT-NEXT: %2 = call i1 @llvm.amdgcn.loop(i64 %1)
+; OPT-NEXT: br i1 %2, label %bb9, label %bb1
+
+; OPT: bb9:
+; OPT-NEXT: call void @llvm.amdgcn.end.cf(i64 %1)
+; OPT-NEXT: store volatile i32 7, i32 addrspace(3)* undef
+; OPT-NEXT: ret void
+define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 {
+bb:
+  %id = call i32 @llvm.amdgcn.workitem.id.x()
+  %tmp = sub i32 %id, %arg
+  br label %bb1
+
+bb1:                                              ; preds = %Flow, %bb
+  %lsr.iv = phi i32 [ undef, %bb ], [ %tmp2, %Flow ]
+  %lsr.iv.next = add i32 %lsr.iv, 1
+  %cmp0 = icmp slt i32 %lsr.iv.next, 0
+  br i1 %cmp0, label %bb4, label %Flow
+
+bb4:                                              ; preds = %bb1
+  %load = load volatile i32, i32 addrspace(1)* undef, align 4
+  %cmp1 = icmp sge i32 %tmp, %load
+  br label %Flow
+
+Flow:                                             ; preds = %bb4, %bb1
+  %tmp2 = phi i32 [ %lsr.iv.next, %bb4 ], [ undef, %bb1 ]
+  %tmp3 = phi i1 [ %cmp1, %bb4 ], [ true, %bb1 ]
+  br i1 %tmp3, label %bb1, label %bb9
+
+bb9:                                              ; preds = %Flow
+  store volatile i32 7, i32 addrspace(3)* undef
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll b/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
new file mode 100644
index 0000000000000000000000000000000000000000..74564f387edebf6fc7103dff2cff5ba23deb5c4f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -0,0 +1,117 @@
+; RUN: opt -S -amdgpu-lower-intrinsics %s | FileCheck -check-prefix=OPT %s
+
+declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1
+declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1
+
+declare void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1
+declare void @llvm.memset.p1i8.i64(i8 addrspace(1)* nocapture, i8, i64, i32, i1) #1
+
+; Test the upper bound for sizes to leave
+; OPT-LABEL: @max_size_small_static_memcpy_caller0(
+; OPT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
+define amdgpu_kernel void @max_size_small_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
+  ret void
+}
+
+; Smallest static size which will be expanded
+; OPT-LABEL: @min_size_large_static_memcpy_caller0(
+; OPT-NOT: call
+; OPT: getelementptr
+; OPT-NEXT: load i8
+; OPT: getelementptr
+; OPT-NEXT: store i8
+define amdgpu_kernel void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
+  ret void
+}
+
+; OPT-LABEL: @max_size_small_static_memmove_caller0(
+; OPT: call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
+define amdgpu_kernel void @max_size_small_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
+  call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1024, i32 1, i1 false)
+  ret void
+}
+
+; OPT-LABEL: @min_size_large_static_memmove_caller0(
+; OPT-NOT: call
+; OPT: getelementptr
+; OPT-NEXT: load i8
+; OPT: getelementptr
+; OPT-NEXT: store i8
+define amdgpu_kernel void @min_size_large_static_memmove_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
+  call void @llvm.memmove.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
+  ret void
+}
+
+; OPT-LABEL: @max_size_small_static_memset_caller0(
+; OPT: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i32 1, i1 false)
+define amdgpu_kernel void @max_size_small_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
+  call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1024, i32 1, i1 false)
+  ret void
+}
+
+; OPT-LABEL: @min_size_large_static_memset_caller0(
+; OPT-NOT: call
+; OPT: getelementptr
+; OPT: store i8
+define amdgpu_kernel void @min_size_large_static_memset_caller0(i8 addrspace(1)* %dst, i8 %val) #0 {
+  call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %dst, i8 %val, i64 1025, i32 1, i1 false)
+  ret void
+}
+
+; OPT-LABEL: @variable_memcpy_caller0(
+; OPT-NOT: call
+; OPT: phi
+define amdgpu_kernel void @variable_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
+  ret void
+}
+
+; OPT-LABEL: @variable_memcpy_caller1(
+; OPT-NOT: call
+; OPT: phi
+define amdgpu_kernel void @variable_memcpy_caller1(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n) #0 {
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
+  ret void
+}
+
+; OPT-LABEL: @memcpy_multi_use_one_function(
+; OPT-NOT: call
+; OPT: phi
+; OPT-NOT: call
+; OPT: phi
+; OPT-NOT: call
+define amdgpu_kernel void @memcpy_multi_use_one_function(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n, i64 %m) #0 {
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %m, i32 1, i1 false)
+  ret void
+}
+
+; OPT-LABEL: @memcpy_alt_type(
+; OPT: phi
+; OPT: getelementptr inbounds i8, i8 addrspace(3)*
+; OPT: load i8, i8 addrspace(3)*
+; OPT: getelementptr inbounds i8, i8 addrspace(1)*
+; OPT: store i8
+define amdgpu_kernel void @memcpy_alt_type(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n) #0 {
+  call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dst, i8 addrspace(3)* %src, i32 %n, i32 1, i1 false)
+  ret void
+}
+
+; One of the uses in the function should be expanded, the other left alone.
+; OPT-LABEL: @memcpy_multi_use_one_function_keep_small(
+; OPT: getelementptr inbounds i8, i8 addrspace(1)*
+; OPT: load i8, i8 addrspace(1)*
+; OPT: getelementptr inbounds i8, i8 addrspace(1)*
+; OPT: store i8
+
+; OPT: call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i32 1, i1 false)
+define amdgpu_kernel void @memcpy_multi_use_one_function_keep_small(i8 addrspace(1)* %dst0, i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 %n) #0 {
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst0, i8 addrspace(1)* %src, i64 %n, i32 1, i1 false)
+  call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst1, i8 addrspace(1)* %src, i64 102, i32 1, i1 false)
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
diff --git a/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll b/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
index e1fad13e0b51910878c1c7db6c0e01e776d24aed..4e0ecc0565e02370fa32b784bc0979025cf6b0dc 100644
--- a/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
+++ b/test/CodeGen/AMDGPU/lower-range-metadata-intrinsic-call.ll
@@ -5,7 +5,7 @@
 ; CHECK-LABEL: {{^}}test_workitem_id_x_known_max_range:
 ; CHECK-NOT: v0
 ; CHECK: {{flat|buffer}}_store_dword {{.*}}v0
-define void @test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @test_workitem_id_x_known_max_range(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
   %and = and i32 %id, 1023
@@ -16,7 +16,7 @@ entry:
 ; CHECK-LABEL: {{^}}test_workitem_id_x_known_trunc_1_bit_range:
 ; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0x1ff, v0
 ; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]]
-define void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @test_workitem_id_x_known_trunc_1_bit_range(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !0
   %and = and i32 %id, 511
@@ -28,7 +28,7 @@ entry:
 ; CHECK-NOT: v0
 ; CHECK: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xff, v0
 ; CHECK: {{flat|buffer}}_store_dword {{.*}}[[MASKED]]
-define void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @test_workitem_id_x_known_max_range_m1(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %id = tail call i32 @llvm.amdgcn.workitem.id.x(), !range !1
   %and = and i32 %id, 255
diff --git a/test/CodeGen/AMDGPU/lshl.ll b/test/CodeGen/AMDGPU/lshl.ll
deleted file mode 100644
index 8468437c2c1f1d18bab1f3173764c265ea8da382..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/lshl.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
-
-;CHECK: s_lshl_b32 s{{[0-9]}}, s{{[0-9]}}, 1
-
-define void @test(i32 %p) {
-   %i = mul i32 %p, 2
-   %r = bitcast i32 %i to float
-   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
-   ret void
-}
-
-declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/lshr.ll b/test/CodeGen/AMDGPU/lshr.ll
deleted file mode 100644
index c8ab7871434e09cf6cb9f7be939f77a908d9ee00..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/lshr.ll
+++ /dev/null
@@ -1,15 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
-
-;CHECK: s_lshr_b32 s{{[0-9]}}, s{{[0-9]}}, 1
-
-define void @test(i32 %p) {
-   %i = udiv i32 %p, 2
-   %r = bitcast i32 %i to float
-   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
-   ret void
-}
-
-declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/lshr.v2i16.ll b/test/CodeGen/AMDGPU/lshr.v2i16.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e21d0d09bb415849fb106f6e956c87673972f9c1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/lshr.v2i16.ll
@@ -0,0 +1,149 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
+
+; GCN-LABEL: {{^}}s_lshr_v2i16:
+; GFX9: s_load_dword [[LHS:s[0-9]+]]
+; GFX9: s_load_dword [[RHS:s[0-9]+]]
+; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
+; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
+
+; VI: v_lshrrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CIVI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
+; CIVI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @s_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 {
+  %result = lshr <2 x i16> %lhs, %rhs
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_lshr_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+
+; VI: v_lshrrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_lshrrev_b16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
+; CI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[LHS]]
+; CI-DAG: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[RHS]]
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; CI: v_bfe_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 16
+; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_lshr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
+  %result = lshr <2 x i16> %a, %b
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_v_s_v2i16:
+; GFX9: s_load_dword [[RHS:s[0-9]+]]
+; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+define amdgpu_kernel void @lshr_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %result = lshr <2 x i16> %vgpr, %sgpr
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_s_v_v2i16:
+; GFX9: s_load_dword [[LHS:s[0-9]+]]
+; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+define amdgpu_kernel void @lshr_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %result = lshr <2 x i16> %sgpr, %vgpr
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_imm_v_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], [[RHS]], 8
+define amdgpu_kernel void @lshr_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %result = lshr <2 x i16> <i16 8, i16 8>, %vgpr
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_v_imm_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GFX9: v_pk_lshrrev_b16 [[RESULT:v[0-9]+]], 8, [[LHS]]
+define amdgpu_kernel void @lshr_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext
+  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %result = lshr <2 x i16> %vgpr, <i16 8, i16 8>
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_lshr_v4i16:
+; GCN: {{buffer|flat}}_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
+; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: {{buffer|flat}}_store_dwordx2
+define amdgpu_kernel void @v_lshr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
+  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1
+  %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
+  %result = lshr <4 x i16> %a, %b
+  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}lshr_v_imm_v4i16:
+; GCN: {{buffer|flat}}_load_dwordx2
+; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GFX9: v_pk_lshrrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GCN: {{buffer|flat}}_store_dwordx2
+define amdgpu_kernel void @lshr_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext
+  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext
+  %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+  %result = lshr <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8>
+  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/mad-combine.ll b/test/CodeGen/AMDGPU/mad-combine.ll
index d141281f36b87d47b7c1f184cb3b98863911f3dc..b855fc500c6b4e9e4bc154bf546c8156f5ff1d89 100644
--- a/test/CodeGen/AMDGPU/mad-combine.ll
+++ b/test/CodeGen/AMDGPU/mad-combine.ll
@@ -1,12 +1,12 @@
 ; Make sure we still form mad even when unsafe math or fp-contract is allowed instead of fma.
 
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -fp-contract=fast < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-SAFE -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefix=SI -check-prefix=SI-STD -check-prefix=SI-STD-UNSAFE -check-prefix=FUNC %s
 
 ; Make sure we don't form mad with denormals
-; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-FASTFMAF -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=verde -mattr=+fp32-denormals -fp-contract=fast -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=SI-DENORM -check-prefix=SI-DENORM-SLOWFMAF -check-prefix=FUNC %s
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 declare float @llvm.fabs.f32(float) #0
@@ -21,7 +21,7 @@ declare float @llvm.fmuladd.f32(float, float, float) #0
 
 ; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]
 
-; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
 
 ; SI-DENORM-SLOWFMAF-NOT: v_fma
 ; SI-DENORM-SLOWFMAF-NOT: v_mad
@@ -31,7 +31,7 @@ declare float @llvm.fmuladd.f32(float, float, float) #0
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; SI-STD: buffer_store_dword [[C]]
-define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -58,8 +58,8 @@ define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrsp
 ; SI-STD-DAG: v_mac_f32_e32 [[C]], [[B]], [[A]]
 ; SI-STD-DAG: v_mac_f32_e32 [[D]], [[B]], [[A]]
 
-; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
-; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]
+; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], [[D]]
 
 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
 ; SI-DENORM-SLOWFMAF-DAG: v_add_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
@@ -70,7 +70,7 @@ define void @combine_to_mad_f32_0(float addrspace(1)* noalias %out, float addrsp
 ; SI-STD-DAG: buffer_store_dword [[C]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-STD-DAG: buffer_store_dword [[D]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; SI: s_endpgm
-define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -100,14 +100,14 @@ define void @combine_to_mad_f32_0_2use(float addrspace(1)* noalias %out, float a
 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 
 ; SI-STD: v_mac_f32_e32 [[C]], [[B]], [[A]]
-; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
+; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[C]]
 
 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
 ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
 
 ; SI-DENORM: buffer_store_dword [[RESULT]]
 ; SI-STD: buffer_store_dword [[C]]
-define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -131,13 +131,13 @@ define void @combine_to_mad_f32_1(float addrspace(1)* noalias %out, float addrsp
 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 
 ; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
-; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
+; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], -[[C]]
 
 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
 ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
 
 ; SI: buffer_store_dword [[RESULT]]
-define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -164,8 +164,8 @@ define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float a
 ; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
 ; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
 
-; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
-; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], [[A]], [[B]], -[[C]]
+; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
 
 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
 ; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
@@ -174,7 +174,7 @@ define void @combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float a
 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; SI: s_endpgm
-define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -203,13 +203,13 @@ define void @combine_to_mad_fsub_0_f32_2use(float addrspace(1)* noalias %out, fl
 ; SI-DAG: buffer_load_dword [[C:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:8{{$}}
 
 ; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
-; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
+; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], [[C]]
 
 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
 ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP]], [[C]]
 
 ; SI: buffer_store_dword [[RESULT]]
-define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -235,8 +235,8 @@ define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float a
 ; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
 ; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
 
-; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
-; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
+; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], [[C]]
+; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], [[D]]
 
 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
 ; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[TMP]], [[C]]
@@ -245,7 +245,7 @@ define void @combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float a
 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; SI: s_endpgm
-define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -275,13 +275,13 @@ define void @combine_to_mad_fsub_1_f32_2use(float addrspace(1)* noalias %out, fl
 
 ; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], [[A]], -[[B]], -[[C]]
 
-; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-DENORM-FASTFMAF: v_fma_f32 [[RESULT:v[0-9]+]], -[[A]], [[B]], -[[C]]
 
 ; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
 ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP]]
 
 ; SI: buffer_store_dword [[RESULT]]
-define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -309,8 +309,8 @@ define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float a
 ; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], [[A]], -[[B]], -[[C]]
 ; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], -[[B]], -[[D]]
 
-; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
-; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
+; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], -[[A]], [[B]], -[[D]]
 
 ; SI-DENORM-SLOWFMAF: v_mul_f32_e64 [[TMP:v[0-9]+]], [[A]], -[[B]]
 ; SI-DENORM-SLOWFMAF-DAG: v_subrev_f32_e32 [[RESULT0:v[0-9]+]], [[C]], [[TMP]]
@@ -319,7 +319,7 @@ define void @combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float a
 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; SI: s_endpgm
-define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -352,8 +352,8 @@ define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %ou
 ; SI-STD-DAG: v_mad_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
 ; SI-STD-DAG: v_mad_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
 
-; SI-DENORM-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
-; SI-DENORM-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
+; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT0:v[0-9]+]], -[[A]], [[B]], -[[C]]
+; SI-DENORM-FASTFMAF-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[A]], [[B]], -[[D]]
 
 ; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP:v[0-9]+]], [[B]], [[A]]
 ; SI-DENORM-SLOWFMAF-DAG: v_sub_f32_e64 [[RESULT0:v[0-9]+]], -[[TMP]], [[C]]
@@ -362,7 +362,7 @@ define void @combine_to_mad_fsub_2_f32_2uses_neg(float addrspace(1)* noalias %ou
 ; SI-DAG: buffer_store_dword [[RESULT0]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI-DAG: buffer_store_dword [[RESULT1]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; SI: s_endpgm
-define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -399,15 +399,12 @@ define void @combine_to_mad_fsub_2_f32_2uses_mul(float addrspace(1)* noalias %ou
 ; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
 ; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
 
-; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], [[D]], [[E]], -[[C]]
-; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP0]]
-
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
-; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
-; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT1:v[0-9]+]], [[C]], [[TMP1]]
+; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
+; SI-DENORM: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
 
 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -444,16 +441,13 @@ define void @aggressive_combine_to_mad_fsub_0_f32(float addrspace(1)* noalias %o
 ; SI-STD: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
 ; SI-STD: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
 
-; SI-DENORM: v_fma_f32 [[TMP0:v[0-9]+]], -[[D]], [[E]], [[A]]
-; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP0]]
-
-; SI-DENORM-SLOWFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
-; SI-DENORM-SLOWFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
-; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
+; SI-DENORM: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
+; SI-DENORM: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
 
 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: s_endpgm
-define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -485,21 +479,25 @@ define void @aggressive_combine_to_mad_fsub_1_f32(float addrspace(1)* noalias %o
 ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
 ; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 
-; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
-; SI-STD: v_mac_f32_e32 [[TMP]], [[B]], [[A]]
+; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[B]], [[A]]
+; SI-STD-SAFE: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP0]]
+
+; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], [[D]], [[E]], -[[C]]
+; SI-STD-UNSAFE: v_mac_f32_e32 [[RESULT]], [[B]], [[A]]
 
-; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], [[D]], [[E]], -[[C]]
-; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], [[A]], [[B]], [[TMP]]
+; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[A]], [[B]], [[TMP0]]
+; SI-DENORM-FASTFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP1]]
 
 ; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
 ; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[B]], [[A]]
 ; SI-DENORM-SLOWFMAF: v_add_f32_e32 [[TMP2:v[0-9]+]], [[TMP0]], [[TMP1]]
 ; SI-DENORM-SLOWFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[C]], [[TMP2]]
 
-; SI-DENORM: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
-; SI-STD: buffer_store_dword [[TMP]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
+; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: s_endpgm
-define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -532,11 +530,16 @@ define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %o
 ; SI-DAG: buffer_load_dword [[D:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:12{{$}}
 ; SI-DAG: buffer_load_dword [[E:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:16{{$}}
 
-; SI-STD: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
-; SI-STD: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]
+; SI-STD-SAFE: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-STD-SAFE: v_mac_f32_e32 [[TMP0]], [[C]], [[B]]
+; SI-STD-SAFE: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP0]], [[A]]
+
+; SI-STD-UNSAFE: v_mad_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
+; SI-STD-UNSAFE: v_mad_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]
 
-; SI-DENORM: v_fma_f32 [[TMP:v[0-9]+]], -[[D]], [[E]], [[A]]
-; SI-DENORM: v_fma_f32 [[RESULT:v[0-9]+]], -[[B]], [[C]], [[TMP]]
+; SI-DENORM-FASTFMAF: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
+; SI-DENORM-FASTFMAF: v_fma_f32 [[TMP1:v[0-9]+]], [[B]], [[C]], [[TMP0]]
+; SI-DENORM-FASTFMAF: v_subrev_f32_e32 [[RESULT:v[0-9]+]], [[TMP1]], [[A]]
 
 ; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP0:v[0-9]+]], [[E]], [[D]]
 ; SI-DENORM-SLOWFMAF-DAG: v_mul_f32_e32 [[TMP1:v[0-9]+]], [[C]], [[B]]
@@ -545,7 +548,7 @@ define void @aggressive_combine_to_mad_fsub_2_f32(float addrspace(1)* noalias %o
 
 ; SI: buffer_store_dword [[RESULT]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; SI: s_endpgm
-define void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
+define amdgpu_kernel void @aggressive_combine_to_mad_fsub_3_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) #1 {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
diff --git a/test/CodeGen/AMDGPU/mad24-get-global-id.ll b/test/CodeGen/AMDGPU/mad24-get-global-id.ll
index 9183ae0972dcb0bd3233b53cff9081994e642e06..1e78c4ebcc9f1a2e8b364970317ee1a1c0eb1a12 100644
--- a/test/CodeGen/AMDGPU/mad24-get-global-id.ll
+++ b/test/CodeGen/AMDGPU/mad24-get-global-id.ll
@@ -11,7 +11,7 @@ declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #0
 ; GCN: s_and_b32 [[WGSIZEX:s[0-9]+]], {{s[0-9]+}}, 0xffff
 ; GCN: v_mov_b32_e32 [[VWGSIZEX:v[0-9]+]], [[WGSIZEX]]
 ; GCN: v_mad_u32_u24 v{{[0-9]+}}, [[VWGSIZEX]], s8, v0
-define void @get_global_id_0(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @get_global_id_0(i32 addrspace(1)* %out) #1 {
   %dispatch.ptr = call i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr()
   %cast.dispatch.ptr = bitcast i8 addrspace(2)* %dispatch.ptr to i32 addrspace(2)*
   %gep = getelementptr inbounds i32, i32 addrspace(2)* %cast.dispatch.ptr, i64 1
diff --git a/test/CodeGen/AMDGPU/mad_int24.ll b/test/CodeGen/AMDGPU/mad_int24.ll
index f149ea0a6a0e127ea6d2461ec48097b32bcb07b1..af0159aa9b10db8b1658313ab60459062a612b79 100644
--- a/test/CodeGen/AMDGPU/mad_int24.ll
+++ b/test/CodeGen/AMDGPU/mad_int24.ll
@@ -11,7 +11,7 @@
 ; CM: MULADD_INT24
 ; SI-NOT: and
 ; SI: v_mad_i32_i24
-define void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @i32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 entry:
   %0 = shl i32 %a, 8
   %a_24 = ashr i32 %0, 8
diff --git a/test/CodeGen/AMDGPU/mad_uint24.ll b/test/CodeGen/AMDGPU/mad_uint24.ll
index 9fde950f822c7d1d890c0064e523776c306c3662..2c4f7d324a9652bc9ca3b8fc02eb38e532d6fbd7 100644
--- a/test/CodeGen/AMDGPU/mad_uint24.ll
+++ b/test/CodeGen/AMDGPU/mad_uint24.ll
@@ -11,7 +11,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 ; SI: v_mad_u32_u24
 ; VI: v_mad_u32_u24
 
-define void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @u32_mad24(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 entry:
   %0 = shl i32 %a, 8
   %a_24 = lshr i32 %0, 8
@@ -32,7 +32,7 @@ entry:
 ; FIXME: Should be using scalar instructions here.
 ; GCN: v_mad_u32_u24 [[MAD:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
 ; GCN: v_bfe_i32 v{{[0-9]}}, [[MAD]], 0, 16
-define void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
+define amdgpu_kernel void @i16_mad24(i32 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
 entry:
   %0 = mul i16 %a, %b
   %1 = add i16 %0, %c
@@ -49,7 +49,7 @@ entry:
 ; EG: 8
 ; GCN: v_mad_u32_u24 [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
 ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
-define void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
+define amdgpu_kernel void @i8_mad24(i32 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
 entry:
   %0 = mul i8 %a, %b
   %1 = add i8 %0, %c
@@ -68,7 +68,7 @@ entry:
 ; FUNC-LABEL: {{^}}i24_i32_i32_mad:
 ; EG: CNDE_INT
 ; SI: v_cndmask
-define void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @i24_i32_i32_mad(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
 entry:
   %0 = ashr i32 %a, 8
   %1 = icmp ne i32 %c, 0
diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll
index 6722aa79dd5d32477ab7eef21f575d2c5de904be..eb4066a2a0a80ca07d2189741650ef236a8f51ea 100644
--- a/test/CodeGen/AMDGPU/madak.ll
+++ b/test/CodeGen/AMDGPU/madak.ll
@@ -10,7 +10,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone
 ; GCN: buffer_load_dword [[VA:v[0-9]+]]
 ; GCN: buffer_load_dword [[VB:v[0-9]+]]
 ; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000
-define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
   %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@@ -37,7 +37,7 @@ define void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noa
 ; GCN-DAG: v_mad_f32 {{v[0-9]+}}, [[VB]], [[VA]], [[VK]]
 ; GCN-DAG: v_mac_f32_e32 [[VK]], [[VC]], [[VA]]
 ; GCN: s_endpgm
-define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
   %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -64,7 +64,7 @@ define void @madak_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1
 ; GCN-LABEL: {{^}}madak_m_inline_imm_f32:
 ; GCN: buffer_load_dword [[VA:v[0-9]+]]
 ; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000
-define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind {
+define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -84,7 +84,7 @@ define void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addr
 ; GCN: buffer_load_dword [[VA:v[0-9]+]]
 ; GCN: buffer_load_dword [[VB:v[0-9]+]]
 ; GCN: v_mad_f32 {{v[0-9]+}}, [[VA]], [[VB]], 4.0
-define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+define amdgpu_kernel void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
   %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@@ -106,7 +106,7 @@ define void @madak_inline_imm_f32(float addrspace(1)* noalias %out, float addrsp
 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
 ; GCN-NOT: v_madak_f32
 ; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
-define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
+define amdgpu_kernel void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float %b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -125,7 +125,7 @@ define void @s_v_madak_f32(float addrspace(1)* noalias %out, float addrspace(1)*
 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]]
 ; GCN-NOT: v_madak_f32
 ; GCN: v_mac_f32_e32 [[VK]], [[SB]], [[VA]]
-define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind {
+define amdgpu_kernel void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float addrspace(1)* noalias %in.b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -141,7 +141,7 @@ define void @v_s_madak_f32(float addrspace(1)* noalias %out, float %a, float add
 ; GCN-LABEL: {{^}}s_s_madak_f32:
 ; GCN-NOT: v_madak_f32
 ; GCN: v_mac_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
-define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwind {
   %mul = fmul float %a, %b
   %madak = fadd float %mul, 10.0
   store float %madak, float addrspace(1)* %out, align 4
@@ -153,7 +153,7 @@ define void @s_s_madak_f32(float addrspace(1)* %out, float %a, float %b) nounwin
 ; GCN: buffer_load_dword [[VB:v[0-9]+]]
 ; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, |{{v[0-9]+}}|, {{[sv][0-9]+}}
 ; GCN: s_endpgm
-define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+define amdgpu_kernel void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
   %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@@ -175,7 +175,7 @@ define void @no_madak_src0_modifier_f32(float addrspace(1)* noalias %out, float
 ; GCN: buffer_load_dword [[VB:v[0-9]+]]
 ; GCN: v_mad_f32 {{v[0-9]+}}, |{{v[0-9]+}}|, {{v[0-9]+}}, {{[sv][0-9]+}}
 ; GCN: s_endpgm
-define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
+define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid
   %in.b.gep = getelementptr float, float addrspace(1)* %in.b, i32 %tid
@@ -201,7 +201,7 @@ define void @no_madak_src1_modifier_f32(float addrspace(1)* noalias %out, float
 ; GCN: v_madak_f32_e32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000
 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VGPR]], [[MADAK]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 {
+define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 {
 bb:
   %tmp = icmp eq i32 %arg1, 0
   br i1 %tmp, label %bb3, label %bb4
diff --git a/test/CodeGen/AMDGPU/madmk.ll b/test/CodeGen/AMDGPU/madmk.ll
index 27fbf58d26c6c2162a5f80e87e6fd909541fe7bf..6e70e95383c97eeb8e00114ccf92c00f908f4bec 100644
--- a/test/CodeGen/AMDGPU/madmk.ll
+++ b/test/CodeGen/AMDGPU/madmk.ll
@@ -12,7 +12,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone
 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; GCN: v_mac_f32_e32 [[VB]], 0x41200000, [[VA]]
-define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -35,7 +35,7 @@ define void @madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noa
 ; GCN-DAG: v_mac_f32_e32 [[VB]], [[VK]], [[VA]]
 ; GCN-DAG: v_mac_f32_e32 [[VC]], [[VK]], [[VA]]
 ; GCN: s_endpgm
-define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
   %in.gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -64,7 +64,7 @@ define void @madmk_2_use_f32(float addrspace(1)* noalias %out, float addrspace(1
 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; GCN: v_mac_f32_e32 [[VB]], 4.0, [[VA]]
-define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -83,7 +83,7 @@ define void @madmk_inline_imm_f32(float addrspace(1)* noalias %out, float addrsp
 ; GCN-NOT: v_madmk_f32
 ; GCN: v_mac_f32_e32
 ; GCN: s_endpgm
-define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind {
+define amdgpu_kernel void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
 
@@ -97,7 +97,7 @@ define void @s_s_madmk_f32(float addrspace(1)* noalias %out, float %a, float %b)
 ; GCN-NOT: v_madmk_f32
 ; GCN: v_mad_f32
 ; GCN: s_endpgm
-define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind {
+define amdgpu_kernel void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %b) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -113,7 +113,7 @@ define void @v_s_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)*
 ; GCN-NOT: v_madmk_f32
 ; GCN: v_mac_f32_e32
 ; GCN: s_endpgm
-define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind {
+define amdgpu_kernel void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in, float %a) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -130,7 +130,7 @@ define void @scalar_vector_madmk_f32(float addrspace(1)* noalias %out, float add
 ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
 ; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], |[[VA]]|, [[VB]]
-define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -151,7 +151,7 @@ define void @no_madmk_src0_modifier_f32(float addrspace(1)* noalias %out, float
 ; GCN-DAG: buffer_load_dword [[VA:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN-DAG: buffer_load_dword [[VB:v[0-9]+]], {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; GCN: v_mad_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, |{{[sv][0-9]+}}|
-define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %gep.1 = getelementptr float, float addrspace(1)* %gep.0, i32 1
@@ -172,7 +172,7 @@ define void @no_madmk_src2_modifier_f32(float addrspace(1)* noalias %out, float
 ; GCN: buffer_load_dword [[A:v[0-9]+]]
 ; GCN: v_mov_b32_e32 [[VK:v[0-9]+]], 0x41200000
 ; GCN: v_mad_f32 {{v[0-9]+}}, [[VK]], [[A]], 2.0
-define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -189,7 +189,7 @@ define void @madmk_add_inline_imm_f32(float addrspace(1)* noalias %out, float ad
 ; SI: s_xor_b64
 ; SI: v_mac_f32_e32 {{v[0-9]+}}, 0x472aee8c, {{v[0-9]+}}
 ; SI: s_or_b64
-define void @kill_madmk_verifier_error() nounwind {
+define amdgpu_kernel void @kill_madmk_verifier_error() nounwind {
 bb:
   br label %bb2
 
diff --git a/test/CodeGen/AMDGPU/max.i16.ll b/test/CodeGen/AMDGPU/max.i16.ll
index 3f2a87f206916f9587dd84b9d1778150595d0cca..abd75258c4d4ff7493311b8182ab11fc6649c288 100644
--- a/test/CodeGen/AMDGPU/max.i16.ll
+++ b/test/CodeGen/AMDGPU/max.i16.ll
@@ -1,12 +1,10 @@
-; RUN: llc < %s -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s
-
-
-declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+; RUN: llc -march=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=VIPLUS %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=VIPLUS %s
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_imax_sge_i16:
-; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+; VIPLUS: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
@@ -20,12 +18,56 @@ define void @v_test_imax_sge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr
 }
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
-; GCN-LABEL: {{^}}v_test_imax_sge_v4i16:
+; GCN-LABEL: {{^}}v_test_imax_sge_v2i16:
 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+
+; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_imax_sge_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+  %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep0, align 4
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep1, align 4
+  %cmp = icmp sge <2 x i16> %a, %b
+  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
+  store <2 x i16> %val, <2 x i16> addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_imax_sge_v3i16:
+; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-NOT: v_max_i16
+
+; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_imax_sge_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %aptr, <3 x i16> addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+  %gep0 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr <3 x i16>, <3 x i16> addrspace(1)* %out, i32 %tid
+  %a = load <3 x i16>, <3 x i16> addrspace(1)* %gep0, align 4
+  %b = load <3 x i16>, <3 x i16> addrspace(1)* %gep1, align 4
+  %cmp = icmp sge <3 x i16> %a, %b
+  %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
+  store <3 x i16> %val, <3 x i16> addrspace(1)* %outgep, align 4
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_imax_sge_v4i16:
 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind {
+; VI: v_max_i16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+
+; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9: v_pk_max_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %aptr, <4 x i16> addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %bptr, i32 %tid
@@ -40,8 +82,8 @@ define void @v_test_imax_sge_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrs
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_imax_sgt_i16:
-; VI: v_max_i16_e32
-define void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+; VIPLUS: v_max_i16_e32
+define amdgpu_kernel void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
@@ -56,8 +98,8 @@ define void @v_test_imax_sgt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_umax_uge_i16:
-; VI: v_max_u16_e32
-define void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+; VIPLUS: v_max_u16_e32
+define amdgpu_kernel void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
@@ -72,8 +114,8 @@ define void @v_test_umax_uge_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr
 
 ; FIXME: Need to handle non-uniform case for function below (load without gep).
 ; GCN-LABEL: {{^}}v_test_umax_ugt_i16:
-; VI: v_max_u16_e32
-define void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+; VIPLUS: v_max_u16_e32
+define amdgpu_kernel void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
@@ -85,3 +127,23 @@ define void @v_test_umax_ugt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr
   store i16 %val, i16 addrspace(1)* %outgep, align 4
   ret void
 }
+
+; GCN-LABEL: {{^}}v_test_umax_ugt_v2i16:
+; VI: v_max_u16_e32
+; VI: v_max_u16_sdwa
+
+; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_umax_ugt_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %aptr, <2 x i16> addrspace(1)* %bptr) nounwind {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
+  %gep0 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %aptr, i32 %tid
+  %gep1 = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %bptr, i32 %tid
+  %outgep = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep0, align 4
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep1, align 4
+  %cmp = icmp ugt <2 x i16> %a, %b
+  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
+  store <2 x i16> %val, <2 x i16> addrspace(1)* %outgep, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
diff --git a/test/CodeGen/AMDGPU/max.ll b/test/CodeGen/AMDGPU/max.ll
index 5fa307be0fd5cbeb70c92c4ad43e7aca8dccdc2d..ffcdac03bc74ca070cf49c75b0f7bb1fcbb32da0 100644
--- a/test/CodeGen/AMDGPU/max.ll
+++ b/test/CodeGen/AMDGPU/max.ll
@@ -6,7 +6,7 @@
 ; SI: v_max_i32_e32
 
 ; EG: MAX_INT
-define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %b = load i32, i32 addrspace(1)* %bptr, align 4
   %cmp = icmp sge i32 %a, %b
@@ -26,7 +26,7 @@ define void @v_test_imax_sge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
 ; EG: MAX_INT
 ; EG: MAX_INT
 ; EG: MAX_INT
-define void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %aptr, <4 x i32> addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %aptr, <4 x i32> addrspace(1)* %bptr) nounwind {
   %a = load <4 x i32>, <4 x i32> addrspace(1)* %aptr, align 4
   %b = load <4 x i32>, <4 x i32> addrspace(1)* %bptr, align 4
   %cmp = icmp sge <4 x i32> %a, %b
@@ -39,7 +39,7 @@ define void @v_test_imax_sge_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrs
 ; SI: s_max_i32
 
 ; EG: MAX_INT
-define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp sge i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -50,7 +50,7 @@ define void @s_test_imax_sge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
 ; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
 
 ; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
-define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
   %cmp = icmp sge i32 %a, 9
   %val = select i1 %cmp, i32 %a, i32 9
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -63,7 +63,7 @@ define void @s_test_imax_sge_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
 ; SI: v_max_i32_e32
 
 ; EG: MAX_INT
-define void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
   %a = load i8, i8 addrspace(1)* %aptr, align 1
   %b = load i8, i8 addrspace(1)* %bptr, align 1
   %cmp = icmp sge i8 %a, %b
@@ -76,7 +76,7 @@ define void @v_test_imax_sge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i
 ; SI: s_max_i32 {{s[0-9]+}}, {{s[0-9]+}}, 9
 
 ; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
-define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
   %cmp = icmp sgt i32 %a, 9
   %val = select i1 %cmp, i32 %a, i32 9
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -89,7 +89,7 @@ define void @s_test_imax_sgt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
 
 ; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
 ; EG: MAX_INT {{.*}}literal.{{[xyzw]}}
-define void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
+define amdgpu_kernel void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
   %cmp = icmp sgt <2 x i32> %a, <i32 9, i32 9>
   %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 9, i32 9>
   store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
@@ -100,7 +100,7 @@ define void @s_test_imax_sgt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
 ; SI: v_max_i32_e32
 
 ; EG: MAX_INT
-define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %b = load i32, i32 addrspace(1)* %bptr, align 4
   %cmp = icmp sgt i32 %a, %b
@@ -113,7 +113,7 @@ define void @v_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
 ; SI: s_max_i32
 
 ; EG: MAX_INT
-define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp sgt i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -124,7 +124,7 @@ define void @s_test_imax_sgt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
 ; SI: v_max_u32_e32
 
 ; EG: MAX_UINT
-define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %b = load i32, i32 addrspace(1)* %bptr, align 4
   %cmp = icmp uge i32 %a, %b
@@ -137,7 +137,7 @@ define void @v_test_umax_uge_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
 ; SI: s_max_u32
 
 ; EG: MAX_UINT
-define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp uge i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -155,7 +155,7 @@ define void @s_test_umax_uge_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
 ; EG: MAX_UINT
 ; EG: MAX_UINT
 ; EG-NOT: MAX_UINT
-define void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <3 x i32> %b) nounwind {
+define amdgpu_kernel void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <3 x i32> %b) nounwind {
   %cmp = icmp uge <3 x i32> %a, %b
   %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
   store <3 x i32> %val, <3 x i32> addrspace(1)* %out, align 4
@@ -168,7 +168,7 @@ define void @s_test_umax_uge_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a, <
 ; SI: v_max_u32_e32
 
 ; EG: MAX_UINT
-define void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
   %a = load i8, i8 addrspace(1)* %aptr, align 1
   %b = load i8, i8 addrspace(1)* %bptr, align 1
   %cmp = icmp uge i8 %a, %b
@@ -181,7 +181,7 @@ define void @v_test_umax_uge_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i
 ; SI: v_max_u32_e32
 
 ; EG: MAX_UINT
-define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %b = load i32, i32 addrspace(1)* %bptr, align 4
   %cmp = icmp ugt i32 %a, %b
@@ -194,7 +194,7 @@ define void @v_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
 ; SI: s_max_u32
 
 ; EG: MAX_UINT
-define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp ugt i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -207,7 +207,7 @@ define void @s_test_umax_ugt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
 
 ; EG: MAX_UINT {{.*}}literal.{{[xyzw]}}
 ; EG: MAX_UINT {{.*}}literal.{{[xyzw]}}
-define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
+define amdgpu_kernel void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a) nounwind {
   %cmp = icmp ugt <2 x i32> %a, <i32 15, i32 23>
   %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> <i32 15, i32 23>
   store <2 x i32> %val, <2 x i32> addrspace(1)* %out, align 4
@@ -223,7 +223,7 @@ define void @s_test_umax_ugt_imm_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
 ; SI: buffer_store_dword [[VMAX]]
 
 ; EG: MAX_UINT
-define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
+define amdgpu_kernel void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
   %a.ext = zext i16 %a to i32
   %b.ext = zext i16 %b to i32
   %cmp = icmp ugt i32 %a.ext, %b.ext
@@ -243,7 +243,7 @@ define void @simplify_demanded_bits_test_umax_ugt_i16(i32 addrspace(1)* %out, i1
 ; SI: buffer_store_dword [[VMAX]]
 
 ; EG: MAX_INT
-define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
+define amdgpu_kernel void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
   %a.ext = sext i16 %a to i32
   %b.ext = sext i16 %b to i32
   %cmp = icmp sgt i32 %a.ext, %b.ext
@@ -262,7 +262,7 @@ define void @simplify_demanded_bits_test_max_slt_i16(i32 addrspace(1)* %out, i16
 ; SI: s_max_i32
 
 ; EG: MAX_INT
-define void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
+define amdgpu_kernel void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
   %cmp = icmp sge i16 %a, %b
   %val = select i1 %cmp, i16 %a, i16 %b
   store i16 %val, i16 addrspace(1)* %out
@@ -275,7 +275,7 @@ define void @s_test_imax_sge_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwin
 
 ; EG: MAX_UINT
 ; EG: MAX_UINT
-define void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %tmp = icmp ugt i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
   store i64 %val, i64 addrspace(1)* %out, align 8
@@ -287,7 +287,7 @@ define void @test_umax_ugt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind
 
 ; EG: MAX_UINT
 ; EG: MAX_UINT
-define void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %tmp = icmp uge i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
   store i64 %val, i64 addrspace(1)* %out, align 8
@@ -299,7 +299,7 @@ define void @test_umax_uge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind
 
 ; EG-DAG: MAX_UINT
 ; EG-DAG: MAX_INT
-define void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %tmp = icmp sgt i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
   store i64 %val, i64 addrspace(1)* %out, align 8
@@ -311,7 +311,7 @@ define void @test_imax_sgt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind
 
 ; EG-DAG: MAX_UINT
 ; EG-DAG: MAX_INT
-define void @test_imax_sge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_imax_sge_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %tmp = icmp sge i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
   store i64 %val, i64 addrspace(1)* %out, align 8
diff --git a/test/CodeGen/AMDGPU/max3.ll b/test/CodeGen/AMDGPU/max3.ll
index a12dba2eb6e9a4e4fed2a2d0c2989d25a32887dc..4bb4fd46becde1fd51c049ad2f27d7563c9ffc7f 100644
--- a/test/CodeGen/AMDGPU/max3.ll
+++ b/test/CodeGen/AMDGPU/max3.ll
@@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
 ; FUNC-LABEL: @v_test_imax3_sgt_i32
 ; SI: v_max3_i32
-define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -23,7 +23,7 @@ define void @v_test_imax3_sgt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt
 
 ; FUNC-LABEL: @v_test_umax3_ugt_i32
 ; SI: v_max3_u32
-define void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @v_test_umax3_ugt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
diff --git a/test/CodeGen/AMDGPU/mem-builtins.ll b/test/CodeGen/AMDGPU/mem-builtins.ll
index 97512670f59e412991b2d067467be04d75bbbd1a..1cbd0c3275106fa90286fc65ad5879faf8b708f0 100644
--- a/test/CodeGen/AMDGPU/mem-builtins.ll
+++ b/test/CodeGen/AMDGPU/mem-builtins.ll
@@ -9,7 +9,7 @@ declare i32 @strcmp(i8* nocapture, i8* nocapture) #1
 
 
 ; ERROR: error: <unknown>:0:0: in function test_memcmp void (i8 addrspace(1)*, i8 addrspace(1)*, i32*): unsupported call to function memcmp
-define void @test_memcmp(i8 addrspace(1)* %x, i8 addrspace(1)* %y, i32* nocapture %p) #0 {
+define amdgpu_kernel void @test_memcmp(i8 addrspace(1)* %x, i8 addrspace(1)* %y, i32* nocapture %p) #0 {
 entry:
   %cmp = tail call i32 @memcmp(i8 addrspace(1)* %x, i8 addrspace(1)* %y, i64 2)
   store volatile i32 %cmp, i32 addrspace(1)* undef
@@ -17,35 +17,35 @@ entry:
 }
 
 ; ERROR: error: <unknown>:0:0: in function test_memchr void (i8 addrspace(1)*, i32, i64): unsupported call to function memchr
-define void @test_memchr(i8 addrspace(1)* %src, i32 %char, i64 %len) #0 {
+define amdgpu_kernel void @test_memchr(i8 addrspace(1)* %src, i32 %char, i64 %len) #0 {
   %res = call i8 addrspace(1)* @memchr(i8 addrspace(1)* %src, i32 %char, i64 %len)
   store volatile i8 addrspace(1)* %res, i8 addrspace(1)* addrspace(1)* undef
   ret void
 }
 
 ; ERROR: error: <unknown>:0:0: in function test_strcpy void (i8*, i8*): unsupported call to function strcpy
-define void @test_strcpy(i8* %dst, i8* %src) #0 {
+define amdgpu_kernel void @test_strcpy(i8* %dst, i8* %src) #0 {
   %res = call i8* @strcpy(i8* %dst, i8* %src)
   store volatile i8* %res, i8* addrspace(1)* undef
   ret void
 }
 
 ; ERROR: error: <unknown>:0:0: in function test_strcmp void (i8*, i8*): unsupported call to function strcmp
-define void @test_strcmp(i8* %src0, i8* %src1) #0 {
+define amdgpu_kernel void @test_strcmp(i8* %src0, i8* %src1) #0 {
   %res = call i32 @strcmp(i8* %src0, i8* %src1)
   store volatile i32 %res, i32 addrspace(1)* undef
   ret void
 }
 
 ; ERROR: error: <unknown>:0:0: in function test_strlen void (i8*): unsupported call to function strlen
-define void @test_strlen(i8* %src) #0 {
+define amdgpu_kernel void @test_strlen(i8* %src) #0 {
   %res = call i32 @strlen(i8* %src)
   store volatile i32 %res, i32 addrspace(1)* undef
   ret void
 }
 
 ; ERROR: error: <unknown>:0:0: in function test_strnlen void (i8*, i32): unsupported call to function strnlen
-define void @test_strnlen(i8* %src, i32 %size) #0 {
+define amdgpu_kernel void @test_strnlen(i8* %src, i32 %size) #0 {
   %res = call i32 @strnlen(i8* %src, i32 %size)
   store volatile i32 %res, i32 addrspace(1)* undef
   ret void
diff --git a/test/CodeGen/AMDGPU/merge-stores.ll b/test/CodeGen/AMDGPU/merge-stores.ll
index 07104ebc8c97969d73bc9e19b7a99fb78b3f28b1..dfd5b97fcc865c63f0a72c67ed23bc54616dd15e 100644
--- a/test/CodeGen/AMDGPU/merge-stores.ll
+++ b/test/CodeGen/AMDGPU/merge-stores.ll
@@ -1,8 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-NOAA %s
-
-; RUN: llc -march=amdgcn -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -combiner-alias-analysis -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -amdgpu-load-store-vectorizer=0 < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=GCN-AA %s
 
 ; This test is mostly to test DAG store merging, so disable the vectorizer.
 ; Run with devices with different unaligned load restrictions.
@@ -16,7 +13,7 @@
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
 ; GCN: s_endpgm
-define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
 
   store i8 123, i8 addrspace(1)* %out.gep.1
@@ -28,7 +25,7 @@ define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
 ; GCN: s_endpgm
-define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
 
   store i8 123, i8 addrspace(1)* %out.gep.1
@@ -38,7 +35,7 @@ define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %o
 
 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i16:
 ; GCN: buffer_store_dword v
-define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
 
   store i16 123, i16 addrspace(1)* %out.gep.1
@@ -48,7 +45,7 @@ define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
 
 ; GCN-LABEL: {{^}}merge_global_store_2_constants_0_i16:
 ; GCN: buffer_store_dword v
-define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
 
   store i16 0, i16 addrspace(1)* %out.gep.1
@@ -60,7 +57,7 @@ define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
 ; GCN: buffer_store_short
 ; GCN: buffer_store_short
 ; GCN: s_endpgm
-define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
 
   store i16 123, i16 addrspace(1)* %out.gep.1
@@ -72,7 +69,7 @@ define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)*
 ; SI-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
 ; SI-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 
   store i32 123, i32 addrspace(1)* %out.gep.1
@@ -82,7 +79,7 @@ define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
 
 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i32_f32:
 ; GCN: buffer_store_dwordx2
-define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
   store float 1.0, float addrspace(1)* %out.gep.1.bc
@@ -94,7 +91,7 @@ define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
 ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 4.0
 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0x7b
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
   store i32 123, i32 addrspace(1)* %out.gep.1.bc
@@ -108,7 +105,7 @@ define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0
 ; GCN-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0x7b{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x4d2{{$}}
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -122,7 +119,7 @@ define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
 
 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32_order:
 ; GCN: buffer_store_dwordx4
-define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -137,7 +134,7 @@ define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out)
 ; First store is out of order.
 ; GCN-LABEL: {{^}}merge_global_store_4_constants_f32:
 ; GCN: buffer_store_dwordx4
-define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -150,14 +147,9 @@ define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
 }
 
 ; GCN-LABEL: {{^}}merge_global_store_4_constants_mixed_i32_f32:
-; GCN-NOAA: buffer_store_dwordx4 v
-
-; GCN-AA: buffer_store_dwordx2
-; GCN-AA: buffer_store_dword v
-; GCN-AA: buffer_store_dword v
-
+; GCN-AA: buffer_store_dwordx4 v
 ; GCN: s_endpgm
-define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -177,7 +169,7 @@ define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %o
 ; SI-DAG: buffer_store_dword
 ; SI-NOT: buffer_store_dword
 ; GCN: s_endpgm
-define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 
@@ -189,7 +181,7 @@ define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
 
 ; GCN-LABEL: {{^}}merge_global_store_2_constants_i64:
 ; GCN: buffer_store_dwordx4
-define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
 
   store i64 123, i64 addrspace(1)* %out.gep.1
@@ -200,7 +192,7 @@ define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
 ; GCN-LABEL: {{^}}merge_global_store_4_constants_i64:
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
-define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
   %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
   %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
@@ -215,7 +207,7 @@ define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32:
 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 ; GCN: buffer_store_dwordx2 [[LOAD]]
-define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 
@@ -230,7 +222,7 @@ define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32
 ; GCN-LABEL: {{^}}merge_global_store_2_adjacent_loads_i32_nonzero_base:
 ; GCN: buffer_load_dwordx2 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 ; GCN: buffer_store_dwordx2 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
-define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 
@@ -249,7 +241,7 @@ define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(
 ; GCN: buffer_load_dword v
 ; GCN: buffer_store_dword v
 ; GCN: buffer_store_dword v
-define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 
@@ -264,7 +256,7 @@ define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %
 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32:
 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 ; GCN: buffer_store_dwordx4 [[LOAD]]
-define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -291,7 +283,7 @@ define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32
 ; SI-DAG: buffer_store_dword v
 ; SI-DAG: buffer_store_dwordx2 v
 ; GCN: s_endpgm
-define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
@@ -310,7 +302,7 @@ define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32
 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_f32:
 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 ; GCN: buffer_store_dwordx4 [[LOAD]]
-define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -333,7 +325,7 @@ define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, f
 ; GCN-LABEL: {{^}}merge_global_store_4_adjacent_loads_i32_nonzero_base:
 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:44
 ; GCN: buffer_store_dwordx4 [[LOAD]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:28
-define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
@@ -359,7 +351,7 @@ define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(
 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
 ; GCN: s_barrier
 ; GCN: buffer_store_dwordx4 [[LOAD]]
-define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -396,7 +388,7 @@ define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %
 ; GCN: buffer_store_dword v
 ; GCN: buffer_store_dword v
 ; GCN: buffer_store_dword v
-define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -424,7 +416,7 @@ define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %
 ; GCN: buffer_load_dword [[LOAD:v[0-9]+]]
 ; GCN: buffer_store_dword [[LOAD]]
 ; GCN: s_endpgm
-define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
@@ -454,7 +446,7 @@ define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 ad
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
 ; GCN: s_endpgm
-define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
@@ -474,19 +466,11 @@ define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1
   ret void
 }
 
-; This works once AA is enabled on the subtarget
 ; GCN-LABEL: {{^}}merge_global_store_4_vector_elts_loads_v4i32:
 ; GCN: buffer_load_dwordx4 [[LOAD:v\[[0-9]+:[0-9]+\]]]
-
-; GCN-NOAA: buffer_store_dword v
-; GCN-NOAA: buffer_store_dword v
-; GCN-NOAA: buffer_store_dword v
-; GCN-NOAA: buffer_store_dword v
-
-; GCN-AA: buffer_store_dwordx4 [[LOAD]]
-
+; GCN: buffer_store_dwordx4 [[LOAD]]
 ; GCN: s_endpgm
-define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -508,7 +492,7 @@ define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out
 ; GCN: ds_write_b8
 ; GCN: ds_write_b8
 ; GCN: s_endpgm
-define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
 
   store i8 123, i8 addrspace(3)* %out.gep.1
@@ -520,7 +504,7 @@ define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0x1c8
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0x7b
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v[[LO]], v[[HI]] offset1:1{{$}}
-define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 
   store i32 123, i32 addrspace(3)* %out.gep.1
@@ -538,7 +522,7 @@ define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, [[K0]], [[K1]] offset1:1
 
 ; GCN: s_endpgm
-define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
@@ -556,7 +540,7 @@ define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
 ; GCN: buffer_store_dwordx4 v{{\[}}[[LO]]:[[HI4]]{{\]}}
 ; GCN: v_mov_b32_e32 v[[HI:[0-9]+]], 11{{$}}
 ; GCN: buffer_store_dword v[[HI]]
-define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
   store i32 9, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 12, i32 addrspace(1)* %idx1, align 4
@@ -572,7 +556,7 @@ define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
 ; GCN-LABEL: {{^}}merge_global_store_6_constants_i32:
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx2
-define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
   store i32 13, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 15, i32 addrspace(1)* %idx1, align 4
@@ -591,7 +575,7 @@ define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx2
 ; GCN: buffer_store_dword v
-define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
   store i32 34, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 999, i32 addrspace(1)* %idx1, align 4
@@ -612,7 +596,7 @@ define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
 ; GCN: buffer_store_dwordx4
 ; GCN: buffer_store_dwordx4
 ; GCN: s_endpgm
-define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
   store i32 34, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 999, i32 addrspace(1)* %idx1, align 4
@@ -646,7 +630,7 @@ define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 
 ; GCN: ScratchSize: 0{{$}}
-define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
   store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
   ret void
@@ -662,7 +646,7 @@ define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> a
 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
 ; GCN: ScratchSize: 0{{$}}
-define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
   store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
   ret void
@@ -678,7 +662,7 @@ define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> a
 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 ; GCN: ScratchSize: 0{{$}}
-define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
   %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
   store <3 x float> %fadd, <3 x float> addrspace(1)* %out
@@ -695,7 +679,7 @@ define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x floa
 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16{{$}}
 ; GCN: ScratchSize: 0{{$}}
-define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
   %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
   store <3 x double> %fadd, <3 x double> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/min.ll b/test/CodeGen/AMDGPU/min.ll
index 19d0117d64a98beeec56e708555915d073f4508d..e85a724c1567c98d788ee4b5feb953df4a852f4a 100644
--- a/test/CodeGen/AMDGPU/min.ll
+++ b/test/CodeGen/AMDGPU/min.ll
@@ -1,17 +1,22 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}v_test_imin_sle_i32:
 ; GCN: v_min_i32_e32
 
 ; EG: MIN_INT
-define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
-  %a = load i32, i32 addrspace(1)* %aptr, align 4
-  %b = load i32, i32 addrspace(1)* %bptr, align 4
+define amdgpu_kernel void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
+  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
+  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %a.gep, align 4
+  %b = load i32, i32 addrspace(1)* %b.gep, align 4
   %cmp = icmp sle i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, i32 addrspace(1)* %out.gep, align 4
   ret void
 }
 
@@ -19,7 +24,7 @@ define void @v_test_imin_sle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
 ; GCN: s_min_i32
 
 ; EG: MIN_INT
-define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %cmp = icmp sle i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -30,7 +35,7 @@ define void @s_test_imin_sle_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
 ; GCN: s_min_i32
 
 ; EG: MIN_INT
-define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
   %cmp = icmp sle <1 x i32> %a, %b
   %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
   store <1 x i32> %val, <1 x i32> addrspace(1)* %out
@@ -47,7 +52,7 @@ define void @s_test_imin_sle_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <
 ; EG: MIN_INT
 ; EG: MIN_INT
 ; EG: MIN_INT
-define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
   %cmp = icmp sle <4 x i32> %a, %b
   %val = select <4 x i1> %cmp, <4 x i32> %a, <4 x i32> %b
   store <4 x i32> %val, <4 x i32> addrspace(1)* %out
@@ -60,7 +65,7 @@ define void @s_test_imin_sle_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <
 ; GCN: s_sext_i32_i8
 ; GCN: s_sext_i32_i8
 ; GCN: s_min_i32
-define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) #0 {
   %cmp = icmp sle i8 %a, %b
   %val = select i1 %cmp, i8 %a, i8 %b
   store i8 %val, i8 addrspace(1)* %out
@@ -90,30 +95,62 @@ define void @s_test_imin_sle_i8(i8 addrspace(1)* %out, i8 %a, i8 %b) nounwind {
 ; VI: v_min_i32
 ; VI: v_min_i32
 
+; GFX9: v_min_i16
+; GFX9: v_min_i16
+; GFX9: v_min_i16
+; GFX9: v_min_i16
+
 ; GCN: s_endpgm
 
 ; EG: MIN_INT
 ; EG: MIN_INT
 ; EG: MIN_INT
 ; EG: MIN_INT
-define void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b) #0 {
   %cmp = icmp sle <4 x i8> %a, %b
   %val = select <4 x i1> %cmp, <4 x i8> %a, <4 x i8> %b
   store <4 x i8> %val, <4 x i8> addrspace(1)* %out
   ret void
 }
 
+; FUNC-LABEL: {{^}}s_test_imin_sle_v2i16:
+; SI: v_min_i32
+; SI: v_min_i32
+
+; VI: v_min_i32
+; VI: v_min_i32
+
+; GFX9: v_pk_min_i16
+
+; EG: MIN_INT
+; EG: MIN_INT
+define amdgpu_kernel void @s_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
+  %cmp = icmp sle <2 x i16> %a, %b
+  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
+  store <2 x i16> %val, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: VI use s_min_i32
 ; FUNC-LABEL: {{^}}s_test_imin_sle_v4i16:
 ; SI: v_min_i32
 ; SI: v_min_i32
 ; SI: v_min_i32
 ; SI: v_min_i32
 
+; VI: v_min_i32
+; VI: v_min_i32
+; VI: v_min_i32
+; VI: v_min_i32
+
+; GFX9: v_pk_min_i16
+; GFX9: v_pk_min_i16
+
 ; EG: MIN_INT
 ; EG: MIN_INT
 ; EG: MIN_INT
 ; EG: MIN_INT
-define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b) #0 {
   %cmp = icmp sle <4 x i16> %a, %b
   %val = select <4 x i1> %cmp, <4 x i16> %a, <4 x i16> %b
   store <4 x i16> %val, <4 x i16> addrspace(1)* %out
@@ -124,12 +161,36 @@ define void @s_test_imin_sle_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <
 ; GCN: v_min_i32_e32
 
 ; EG: MIN_INT
-define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
-  %a = load i32, i32 addrspace(1)* %aptr, align 4
-  %b = load i32, i32 addrspace(1)* %bptr, align 4
+define amdgpu_kernel void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %aptr, i32 %tid
+  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %bptr, i32 %tid
+  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %a.gep, align 4
+  %b = load i32, i32 addrspace(1)* %b.gep, align 4
   %cmp = icmp slt i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, i32 addrspace(1)* %out.gep, align 4
+  ret void
+}
+
+; FUNC-LABEL: @v_test_imin_slt_i16
+; SI: v_min_i32_e32
+
+; GFX89: v_min_i16_e32
+
+; EG: MIN_INT
+define amdgpu_kernel void @v_test_imin_slt_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %aptr, i32 %tid
+  %b.gep = getelementptr inbounds i16, i16 addrspace(1)* %bptr, i32 %tid
+  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
+
+  %a = load i16, i16 addrspace(1)* %a.gep
+  %b = load i16, i16 addrspace(1)* %b.gep
+  %cmp = icmp slt i16 %a, %b
+  %val = select i1 %cmp, i16 %a, i16 %b
+  store i16 %val, i16 addrspace(1)* %out.gep
   ret void
 }
 
@@ -137,7 +198,7 @@ define void @v_test_imin_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
 ; GCN: s_min_i32
 
 ; EG: MIN_INT
-define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %cmp = icmp slt i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -150,7 +211,7 @@ define void @s_test_imin_slt_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
 
 ; EG: MIN_INT
 ; EG: MIN_INT
-define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+define amdgpu_kernel void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
   %cmp = icmp slt <2 x i32> %a, %b
   %val = select <2 x i1> %cmp, <2 x i32> %a, <2 x i32> %b
   store <2 x i32> %val, <2 x i32> addrspace(1)* %out
@@ -161,7 +222,7 @@ define void @s_test_imin_slt_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <
 ; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
 
 ; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
-define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 {
   %cmp = icmp slt i32 %a, 8
   %val = select i1 %cmp, i32 %a, i32 8
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -172,7 +233,7 @@ define void @s_test_imin_slt_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
 ; GCN: s_min_i32 {{s[0-9]+}}, {{s[0-9]+}}, 8
 
 ; EG: MIN_INT {{.*}}literal.{{[xyzw]}}
-define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) #0 {
   %cmp = icmp sle i32 %a, 8
   %val = select i1 %cmp, i32 %a, i32 8
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -183,12 +244,16 @@ define void @s_test_imin_sle_imm_i32(i32 addrspace(1)* %out, i32 %a) nounwind {
 ; GCN: v_min_u32_e32
 
 ; EG: MIN_UINT
-define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
-  %a = load i32, i32 addrspace(1)* %aptr, align 4
-  %b = load i32, i32 addrspace(1)* %bptr, align 4
+define amdgpu_kernel void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
+  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
+  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %a.gep, align 4
+  %b = load i32, i32 addrspace(1)* %b.gep, align 4
   %cmp = icmp ule i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, i32 addrspace(1)* %out.gep, align 4
   ret void
 }
 
@@ -196,25 +261,65 @@ define void @v_test_umin_ule_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr
 ; GCN: v_min_u32_e32
 ; GCN: v_min_u32_e32
 ; GCN: v_min_u32_e32
-; SI-NOT: v_min_u32_e32
+; GCN-NOT: v_min_u32_e32
 ; GCN: s_endpgm
 
 ; EG: MIN_UINT
 ; EG: MIN_UINT
 ; EG: MIN_UINT
-define void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %aptr, <3 x i32> addrspace(1)* %bptr) nounwind {
-  %a = load <3 x i32>, <3 x i32> addrspace(1)* %aptr
-  %b = load <3 x i32>, <3 x i32> addrspace(1)* %bptr
+define amdgpu_kernel void @v_test_umin_ule_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %a.ptr, <3 x i32> addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %a.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a.ptr, i32 %tid
+  %b.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b.ptr, i32 %tid
+  %out.gep = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %out, i32 %tid
+
+  %a = load <3 x i32>, <3 x i32> addrspace(1)* %a.gep
+  %b = load <3 x i32>, <3 x i32> addrspace(1)* %b.gep
   %cmp = icmp ule <3 x i32> %a, %b
   %val = select <3 x i1> %cmp, <3 x i32> %a, <3 x i32> %b
-  store <3 x i32> %val, <3 x i32> addrspace(1)* %out
+  store <3 x i32> %val, <3 x i32> addrspace(1)* %out.gep
+  ret void
+}
+
+; FIXME: Reduce unused packed component to scalar
+; FUNC-LABEL: @v_test_umin_ule_v3i16{{$}}
+; SI: v_min_u32_e32
+; SI: v_min_u32_e32
+; SI: v_min_u32_e32
+; SI-NOT: v_min_u32_e32
+
+; VI: v_min_u16_e32
+; VI: v_min_u16_sdwa
+; VI: v_min_u16_e32
+; VI-NOT: v_min_u16_e32
+
+; GFX9: v_pk_min_u16
+; GFX9: v_pk_min_u16
+
+; GCN: s_endpgm
+
+; EG: MIN_UINT
+; EG: MIN_UINT
+; EG: MIN_UINT
+define amdgpu_kernel void @v_test_umin_ule_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> addrspace(1)* %a.ptr, <3 x i16> addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %a.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %a.ptr, i32 %tid
+  %b.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %b.ptr, i32 %tid
+  %out.gep = getelementptr inbounds <3 x i16>, <3 x i16> addrspace(1)* %out, i32 %tid
+
+  %a = load <3 x i16>, <3 x i16> addrspace(1)* %a.gep
+  %b = load <3 x i16>, <3 x i16> addrspace(1)* %b.gep
+  %cmp = icmp ule <3 x i16> %a, %b
+  %val = select <3 x i1> %cmp, <3 x i16> %a, <3 x i16> %b
+  store <3 x i16> %val, <3 x i16> addrspace(1)* %out.gep
   ret void
 }
+
 ; FUNC-LABEL: @s_test_umin_ule_i32
 ; GCN: s_min_u32
 
 ; EG: MIN_UINT
-define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %cmp = icmp ule i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -225,27 +330,40 @@ define void @s_test_umin_ule_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
 ; GCN: v_min_u32_e32
 
 ; EG: MIN_UINT
-define void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
-  %a = load i32, i32 addrspace(1)* %aptr, align 4
-  %b = load i32, i32 addrspace(1)* %bptr, align 4
+define amdgpu_kernel void @v_test_umin_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr, i32 %tid
+  %b.gep = getelementptr inbounds i32, i32 addrspace(1)* %b.ptr, i32 %tid
+  %out.gep = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
+  %a = load i32, i32 addrspace(1)* %a.gep, align 4
+  %b = load i32, i32 addrspace(1)* %b.gep, align 4
   %cmp = icmp ult i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
-  store i32 %val, i32 addrspace(1)* %out, align 4
+  store i32 %val, i32 addrspace(1)* %out.gep, align 4
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_test_umin_ult_i8:
-; GCN: buffer_load_ubyte
-; GCN: buffer_load_ubyte
-; GCN: v_min_u32_e32
+; SI: buffer_load_ubyte
+; SI: buffer_load_ubyte
+; SI: v_min_u32_e32
+
+; GFX89: flat_load_ubyte
+; GFX89: flat_load_ubyte
+; GFX89: v_min_u16_e32
 
 ; EG: MIN_UINT
-define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i8 addrspace(1)* %bptr) nounwind {
-  %a = load i8, i8 addrspace(1)* %aptr, align 1
-  %b = load i8, i8 addrspace(1)* %bptr, align 1
+define amdgpu_kernel void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %a.ptr, i8 addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %a.gep = getelementptr inbounds i8, i8 addrspace(1)* %a.ptr, i32 %tid
+  %b.gep = getelementptr inbounds i8, i8 addrspace(1)* %b.ptr, i32 %tid
+  %out.gep = getelementptr inbounds i8, i8 addrspace(1)* %out, i32 %tid
+
+  %a = load i8, i8 addrspace(1)* %a.gep, align 1
+  %b = load i8, i8 addrspace(1)* %b.gep, align 1
   %cmp = icmp ult i8 %a, %b
   %val = select i1 %cmp, i8 %a, i8 %b
-  store i8 %val, i8 addrspace(1)* %out, align 1
+  store i8 %val, i8 addrspace(1)* %out.gep, align 1
   ret void
 }
 
@@ -253,7 +371,7 @@ define void @v_test_umin_ult_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %aptr, i
 ; GCN: s_min_u32
 
 ; EG: MIN_UINT
-define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %cmp = icmp ult i32 %a, %b
   %val = select i1 %cmp, i32 %a, i32 %b
   store i32 %val, i32 addrspace(1)* %out, align 4
@@ -268,7 +386,7 @@ define void @s_test_umin_ult_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwin
 ; GCN: s_endpgm
 
 ; EG-NOT: MIN_UINT
-define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace(1)* %out1, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) #0 {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %b = load i32, i32 addrspace(1)* %bptr, align 4
   %cmp = icmp ult i32 %a, %b
@@ -286,7 +404,7 @@ define void @v_test_umin_ult_i32_multi_use(i32 addrspace(1)* %out0, i1 addrspace
 ; GCN: s_endpgm
 
 ; EG-NOT: MIN_UINT
-define void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace(1)* %out1, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr) #0 {
   %a = load i16, i16 addrspace(1)* %aptr, align 2
   %b = load i16, i16 addrspace(1)* %bptr, align 2
   %cmp = icmp ult i16 %a, %b
@@ -301,7 +419,7 @@ define void @v_test_umin_ult_i16_multi_use(i16 addrspace(1)* %out0, i1 addrspace
 ; GCN: s_min_u32
 
 ; EG: MIN_UINT
-define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) nounwind {
+define amdgpu_kernel void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
   %cmp = icmp ult <1 x i32> %a, %b
   %val = select <1 x i1> %cmp, <1 x i32> %a, <1 x i32> %b
   store <1 x i32> %val, <1 x i32> addrspace(1)* %out
@@ -326,7 +444,7 @@ define void @s_test_umin_ult_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <
 ; EG: MIN_UINT
 ; EG: MIN_UINT
 ; EG: MIN_UINT
-define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) nounwind {
+define amdgpu_kernel void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b) #0 {
   %cmp = icmp ult <8 x i32> %a, %b
   %val = select <8 x i1> %cmp, <8 x i32> %a, <8 x i32> %b
   store <8 x i32> %val, <8 x i32> addrspace(1)* %out
@@ -334,14 +452,23 @@ define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <
 }
 
 ; FUNC-LABEL: {{^}}s_test_umin_ult_v8i16:
-; GCN: v_min_u32
-; GCN: v_min_u32
-; GCN: v_min_u32
-; GCN: v_min_u32
-; GCN: v_min_u32
-; GCN: v_min_u32
-; GCN: v_min_u32
-; GCN: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+; SI: v_min_u32
+
+; VI: v_min_u32
+; VI: v_min_u32
+; VI: v_min_u32
+; VI: v_min_u32
+; VI: v_min_u32
+; VI: v_min_u32
+; VI: v_min_u32
+; VI: v_min_u32
 
 ; EG: MIN_UINT
 ; EG: MIN_UINT
@@ -351,7 +478,7 @@ define void @s_test_umin_ult_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <
 ; EG: MIN_UINT
 ; EG: MIN_UINT
 ; EG: MIN_UINT
-define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) nounwind {
+define amdgpu_kernel void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <8 x i16> %b) #0 {
   %cmp = icmp ult <8 x i16> %a, %b
   %val = select <8 x i1> %cmp, <8 x i16> %a, <8 x i16> %b
   store <8 x i16> %val, <8 x i16> addrspace(1)* %out
@@ -367,7 +494,7 @@ define void @s_test_umin_ult_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> %a, <
 ; GCN: buffer_store_dword [[VMIN]]
 
 ; EG: MIN_UINT
-define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) nounwind {
+define amdgpu_kernel void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
   %a.ext = zext i16 %a to i32
   %b.ext = zext i16 %b to i32
   %cmp = icmp ult i32 %a.ext, %b.ext
@@ -387,7 +514,7 @@ define void @simplify_demanded_bits_test_umin_ult_i16(i32 addrspace(1)* %out, i1
 ; GCN: buffer_store_dword [[VMIN]]
 
 ; EG: MIN_INT
-define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) nounwind {
+define amdgpu_kernel void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16 signext %a, i16 signext %b) #0 {
   %a.ext = sext i16 %a to i32
   %b.ext = sext i16 %b to i32
   %cmp = icmp slt i32 %a.ext, %b.ext
@@ -402,7 +529,7 @@ define void @simplify_demanded_bits_test_min_slt_i16(i32 addrspace(1)* %out, i16
 ; GCN: s_min_i32
 
 ; EG: MIN_INT
-define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwind {
+define amdgpu_kernel void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) #0 {
   %cmp = icmp sle i16 %a, %b
   %val = select i1 %cmp, i16 %a, i16 %b
   store i16 %val, i16 addrspace(1)* %out
@@ -415,7 +542,7 @@ define void @s_test_imin_sle_i16(i16 addrspace(1)* %out, i16 %a, i16 %b) nounwin
 
 ; EG: MIN_UINT
 ; EG: MIN_UINT
-define void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %tmp = icmp ult i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
   store i64 %val, i64 addrspace(1)* %out, align 8
@@ -427,7 +554,7 @@ define void @test_umin_ult_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind
 
 ; EG: MIN_UINT
 ; EG: MIN_UINT
-define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %tmp = icmp ule i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
   store i64 %val, i64 addrspace(1)* %out, align 8
@@ -439,7 +566,7 @@ define void @test_umin_ule_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind
 
 ; EG-DAG: MIN_UINT
 ; EG-DAG: MIN_INT
-define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %tmp = icmp slt i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
   store i64 %val, i64 addrspace(1)* %out, align 8
@@ -451,9 +578,63 @@ define void @test_imin_slt_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind
 
 ; EG-DAG: MIN_UINT
 ; EG-DAG: MIN_INT
-define void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @test_imin_sle_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %tmp = icmp sle i64 %a, %b
   %val = select i1 %tmp, i64 %a, i64 %b
   store i64 %val, i64 addrspace(1)* %out, align 8
   ret void
 }
+
+; FUNC-LABEL: {{^}}v_test_imin_sle_v2i16:
+; SI: v_min_i32
+; SI: v_min_i32
+
+; VI: v_min_i16
+; VI: v_min_i16
+
+; GFX9: v_pk_min_i16
+
+; EG: MIN_INT
+; EG: MIN_INT
+define amdgpu_kernel void @v_test_imin_sle_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid
+  %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.gep
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.gep
+  %cmp = icmp sle <2 x i16> %a, %b
+  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
+  store <2 x i16> %val, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; FIXME: i16 min
+; FUNC-LABEL: {{^}}v_test_imin_ule_v2i16:
+; SI: v_min_u32
+; SI: v_min_u32
+
+; VI: v_min_u16
+; VI: v_min_u16
+
+; GFX9: v_pk_min_u16
+
+; EG: MIN_UINT
+; EG: MIN_UINT
+define amdgpu_kernel void @v_test_imin_ule_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %a.ptr, <2 x i16> addrspace(1)* %b.ptr) #0 {
+  %tid = call i32 @llvm.r600.read.tidig.x()
+  %a.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a.ptr, i32 %tid
+  %b.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b.ptr, i32 %tid
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %a.gep
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b.gep
+  %cmp = icmp ule <2 x i16> %a, %b
+  %val = select <2 x i1> %cmp, <2 x i16> %a, <2 x i16> %b
+  store <2 x i16> %val, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+declare i32 @llvm.r600.read.tidig.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/min3.ll b/test/CodeGen/AMDGPU/min3.ll
index 728479ad9f62fd03cb50106705f03fbf969e234d..59d5d2cdb1aa36422c1bedb9db106dc1676c2c3a 100644
--- a/test/CodeGen/AMDGPU/min3.ll
+++ b/test/CodeGen/AMDGPU/min3.ll
@@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
 ; FUNC-LABEL: @v_test_imin3_slt_i32
 ; SI: v_min3_i32
-define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -23,7 +23,7 @@ define void @v_test_imin3_slt_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt
 
 ; FUNC-LABEL: @v_test_umin3_ult_i32
 ; SI: v_min3_u32
-define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %gep1 = getelementptr i32, i32 addrspace(1)* %bptr, i32 %tid
@@ -43,7 +43,7 @@ define void @v_test_umin3_ult_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %apt
 ; FUNC-LABEL: @v_test_umin_umin_umin
 ; SI: v_min_i32
 ; SI: v_min3_i32
-define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %tid2 = mul i32 %tid, 2
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
@@ -77,7 +77,7 @@ define void @v_test_umin_umin_umin(i32 addrspace(1)* %out, i32 addrspace(1)* %ap
 
 ; FUNC-LABEL: @v_test_umin3_2_uses
 ; SI-NOT: v_min3
-define void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
+define amdgpu_kernel void @v_test_umin3_2_uses(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr, i32 addrspace(1)* %cptr) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %tid2 = mul i32 %tid, 2
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
diff --git a/test/CodeGen/AMDGPU/missing-store.ll b/test/CodeGen/AMDGPU/missing-store.ll
index 8e1b0036a1afcc149cde4cbf34962d811021861d..83c2a911a5ce3396733d3dbc896eb86b0ebdf972 100644
--- a/test/CodeGen/AMDGPU/missing-store.ll
+++ b/test/CodeGen/AMDGPU/missing-store.ll
@@ -15,7 +15,7 @@
 ; SI: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define amdgpu_kernel void @missing_store_reduced(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @ptr_load, align 8
   %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
 
diff --git a/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll b/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
index 85dfbe6b8a33a72de1a7dda0284165cc3a60f186..e1fb00a1de307c3b8b6227bb7bac1a294700aa1c 100644
--- a/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
+++ b/test/CodeGen/AMDGPU/move-addr64-rsrc-dead-subreg-writes.ll
@@ -19,7 +19,7 @@
 ; GCN: v_addc_u32_e32 v[[PTRHI:[0-9]+]], vcc, v[[LDPTRHI]], v[[VARG1HI]]
 ; GCN: buffer_load_ubyte v{{[0-9]+}}, v{{\[}}[[PTRLO]]:[[PTRHI]]{{\]}},
 
-define void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 {
+define amdgpu_kernel void @clobber_vgpr_pair_pointer_add(i64 %arg1, i8 addrspace(1)* addrspace(1)* %ptrarg, i32 %arg3) #0 {
 bb:
   %tmp = icmp sgt i32 %arg3, 0
   br i1 %tmp, label %bb4, label %bb17
diff --git a/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll b/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
index 1a0a39027853bc8148eb79b2ed7e2ffdee35a144..417b4ba802e1bee97108f8edea7bb0be61419a86 100644
--- a/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
+++ b/test/CodeGen/AMDGPU/move-to-valu-atomicrmw.ll
@@ -11,7 +11,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 ; GCN-LABEL: {{^}}atomic_max_i32:
 ; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400 glc{{$}}
-define void @atomic_max_i32(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 {
+define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid
   %ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep
@@ -31,7 +31,7 @@ exit:
 
 ; GCN-LABEL: {{^}}atomic_max_i32_noret:
 ; GCN: buffer_atomic_smax v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:400{{$}}
-define void @atomic_max_i32_noret(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 {
+define amdgpu_kernel void @atomic_max_i32_noret(i32 addrspace(1)* %out, i32 addrspace(1)* addrspace(1)* %in, i32 addrspace(1)* %x, i32 %y) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.gep = getelementptr i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in, i32 %tid
   %ptr = load volatile i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %tid.gep
diff --git a/test/CodeGen/AMDGPU/mubuf.ll b/test/CodeGen/AMDGPU/mubuf.ll
index a574365da98627b6b6a9281415fe591f962ac5d0..9e1d2e0490c70830718a015e99339c007e31840c 100644
--- a/test/CodeGen/AMDGPU/mubuf.ll
+++ b/test/CodeGen/AMDGPU/mubuf.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() readnone
 ; MUBUF load with an immediate byte offset that fits into 12-bits
 ; CHECK-LABEL: {{^}}mubuf_load0:
 ; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; encoding: [0x04,0x00,0x30,0xe0
-define void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @mubuf_load0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1
   %1 = load i32, i32 addrspace(1)* %0
@@ -20,7 +20,7 @@ entry:
 ; MUBUF load with the largest possible immediate offset
 ; CHECK-LABEL: {{^}}mubuf_load1:
 ; CHECK: buffer_load_ubyte v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x20,0xe0
-define void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @mubuf_load1(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %0 = getelementptr i8, i8 addrspace(1)* %in, i64 4095
   %1 = load i8, i8 addrspace(1)* %0
@@ -32,7 +32,7 @@ entry:
 ; CHECK-LABEL: {{^}}mubuf_load2:
 ; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
 ; CHECK: buffer_load_dword v{{[0-9]}}, off, s[{{[0-9]+:[0-9]+}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x30,0xe0
-define void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @mubuf_load2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = getelementptr i32, i32 addrspace(1)* %in, i64 1024
   %1 = load i32, i32 addrspace(1)* %0
@@ -44,7 +44,7 @@ entry:
 ; CHECK-LABEL: {{^}}mubuf_load3:
 ; CHECK-NOT: ADD
 ; CHECK: buffer_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x30,0xe0
-define void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) {
+define amdgpu_kernel void @mubuf_load3(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i64 %offset) {
 entry:
   %0 = getelementptr i32, i32 addrspace(1)* %in, i64 %offset
   %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1
@@ -91,7 +91,7 @@ main_body:
 ; MUBUF store with an immediate byte offset that fits into 12-bits
 ; CHECK-LABEL: {{^}}mubuf_store0:
 ; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], 0 offset:4 ; encoding: [0x04,0x00,0x70,0xe0
-define void @mubuf_store0(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @mubuf_store0(i32 addrspace(1)* %out) {
 entry:
   %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1
   store i32 0, i32 addrspace(1)* %0
@@ -102,7 +102,7 @@ entry:
 ; CHECK-LABEL: {{^}}mubuf_store1:
 ; CHECK: buffer_store_byte v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], 0 offset:4095 ; encoding: [0xff,0x0f,0x60,0xe0
 
-define void @mubuf_store1(i8 addrspace(1)* %out) {
+define amdgpu_kernel void @mubuf_store1(i8 addrspace(1)* %out) {
 entry:
   %0 = getelementptr i8, i8 addrspace(1)* %out, i64 4095
   store i8 0, i8 addrspace(1)* %0
@@ -113,7 +113,7 @@ entry:
 ; CHECK-LABEL: {{^}}mubuf_store2:
 ; CHECK: s_movk_i32 [[SOFFSET:s[0-9]+]], 0x1000
 ; CHECK: buffer_store_dword v{{[0-9]}}, off, s[{{[0-9]:[0-9]}}], [[SOFFSET]] ; encoding: [0x00,0x00,0x70,0xe0
-define void @mubuf_store2(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @mubuf_store2(i32 addrspace(1)* %out) {
 entry:
   %0 = getelementptr i32, i32 addrspace(1)* %out, i64 1024
   store i32 0, i32 addrspace(1)* %0
@@ -124,7 +124,7 @@ entry:
 ; CHECK-LABEL: {{^}}mubuf_store3:
 ; CHECK-NOT: ADD
 ; CHECK: buffer_store_dword v{{[0-9]}}, v[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0 addr64 offset:4 ; encoding: [0x04,0x80,0x70,0xe0
-define void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) {
+define amdgpu_kernel void @mubuf_store3(i32 addrspace(1)* %out, i64 %offset) {
 entry:
   %0 = getelementptr i32, i32 addrspace(1)* %out, i64 %offset
   %1 = getelementptr i32, i32 addrspace(1)* %0, i64 1
@@ -134,14 +134,14 @@ entry:
 
 ; CHECK-LABEL: {{^}}store_sgpr_ptr:
 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0
-define void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr(i32 addrspace(1)* %out) #0 {
   store i32 99, i32 addrspace(1)* %out, align 4
   ret void
 }
 
 ; CHECK-LABEL: {{^}}store_sgpr_ptr_offset:
 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:40
-define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 10
   store i32 99, i32 addrspace(1)* %out.gep, align 4
   ret void
@@ -150,7 +150,7 @@ define void @store_sgpr_ptr_offset(i32 addrspace(1)* %out) #0 {
 ; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset:
 ; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
 ; CHECK: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
-define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 {
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
   store i32 99, i32 addrspace(1)* %out.gep, align 4
   ret void
@@ -159,7 +159,7 @@ define void @store_sgpr_ptr_large_offset(i32 addrspace(1)* %out) #0 {
 ; CHECK-LABEL: {{^}}store_sgpr_ptr_large_offset_atomic:
 ; CHECK: s_mov_b32 [[SOFFSET:s[0-9]+]], 0x20000
 ; CHECK: buffer_atomic_add v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, [[SOFFSET]]
-define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 {
   %gep = getelementptr i32, i32 addrspace(1)* %out, i32 32768
   %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 5 seq_cst
   ret void
@@ -167,7 +167,7 @@ define void @store_sgpr_ptr_large_offset_atomic(i32 addrspace(1)* %out) #0 {
 
 ; CHECK-LABEL: {{^}}store_vgpr_ptr:
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
-define void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @store_vgpr_ptr(i32 addrspace(1)* %out) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
   store i32 99, i32 addrspace(1)* %out.gep, align 4
diff --git a/test/CodeGen/AMDGPU/mul.ll b/test/CodeGen/AMDGPU/mul.ll
index 7910b70d8cf2cb3878b0e6a0ea456898c667981a..a72a6efb071198646e2a7ab6b959de48b725f7a6 100644
--- a/test/CodeGen/AMDGPU/mul.ll
+++ b/test/CodeGen/AMDGPU/mul.ll
@@ -11,7 +11,7 @@
 ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
@@ -31,7 +31,7 @@ define void @test_mul_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)
 ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_mul_lo_i32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
@@ -45,7 +45,7 @@ define void @v_mul_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %
 ; SI: s_load_dword
 ; SI: s_mul_i32
 ; SI: buffer_store_dword
-define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
   %mul = mul i64 %b, %a
   %trunc = trunc i64 %mul to i32
   store i32 %trunc, i32 addrspace(1)* %out, align 8
@@ -57,7 +57,7 @@ define void @s_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 ; SI: s_load_dword
 ; SI: v_mul_lo_i32
 ; SI: buffer_store_dword
-define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %b = load i64, i64 addrspace(1)* %bptr, align 8
   %mul = mul i64 %b, %a
@@ -73,7 +73,7 @@ define void @v_trunc_i64_mul_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %a
 ; EG-DAG: MULHI_INT
 ; SI-DAG: s_mul_i32
 ; SI-DAG: v_mul_hi_i32
-define void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @mul64_sext_c(i64 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = sext i32 %in to i64
   %1 = mul i64 %0, 80
@@ -87,7 +87,7 @@ entry:
 ; SI-DAG: v_mul_lo_i32
 ; SI-DAG: v_mul_hi_i32
 ; SI: s_endpgm
-define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ext = sext i32 %val to i64
   %mul = mul i64 %ext, 80
@@ -99,7 +99,7 @@ define void @v_mul64_sext_c(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; SI-DAG: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
 ; SI-DAG: v_mul_hi_i32 v{{[0-9]+}}, v{{[0-9]+}}, 9
 ; SI: s_endpgm
-define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %ext = sext i32 %val to i64
   %mul = mul i64 %ext, 9
@@ -114,7 +114,7 @@ define void @v_mul64_sext_inline_imm(i64 addrspace(1)* %out, i32 addrspace(1)* %
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[SRESULT]]
 ; SI: buffer_store_dword [[VRESULT]],
 ; SI: s_endpgm
-define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %mul = mul i32 %a, %b
   store i32 %mul, i32 addrspace(1)* %out, align 4
   ret void
@@ -122,7 +122,7 @@ define void @s_mul_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
 
 ; FUNC-LABEL: {{^}}v_mul_i32:
 ; SI: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -139,7 +139,7 @@ define void @v_mul_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; crash with a 'failed to select' error.
 
 ; FUNC-LABEL: {{^}}s_mul_i64:
-define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %mul = mul i64 %a, %b
   store i64 %mul, i64 addrspace(1)* %out, align 8
   ret void
@@ -147,7 +147,7 @@ define void @s_mul_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 
 ; FUNC-LABEL: {{^}}v_mul_i64:
 ; SI: v_mul_lo_i32
-define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
+define amdgpu_kernel void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %b = load i64, i64 addrspace(1)* %bptr, align 8
   %mul = mul i64 %a, %b
@@ -157,7 +157,7 @@ define void @v_mul_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addr
 
 ; FUNC-LABEL: {{^}}mul32_in_branch:
 ; SI: s_mul_i32
-define void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @mul32_in_branch(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %a, i32 %b, i32 %c) {
 entry:
   %0 = icmp eq i32 %a, 0
   br i1 %0, label %if, label %else
@@ -180,7 +180,7 @@ endif:
 ; SI-DAG: s_mul_i32
 ; SI-DAG: v_mul_hi_u32
 ; SI: s_endpgm
-define void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
+define amdgpu_kernel void @mul64_in_branch(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b, i64 %c) {
 entry:
   %0 = icmp eq i64 %a, 0
   br i1 %0, label %if, label %else
@@ -224,7 +224,7 @@ endif:
 ; SI: s_mul_i32
 
 ; SI: buffer_store_dwordx4
-define void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
+define amdgpu_kernel void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
   %mul = mul i128 %a, %b
   store i128 %mul, i128 addrspace(1)* %out
   ret void
@@ -234,26 +234,26 @@ define void @s_mul_i128(i128 addrspace(1)* %out, i128 %a, i128 %b) nounwind #0 {
 ; SI: {{buffer|flat}}_load_dwordx4
 ; SI: {{buffer|flat}}_load_dwordx4
 
-; SI: v_mul_lo_i32
-; SI: v_mul_hi_u32
-; SI: v_mul_hi_u32
-; SI: v_mul_lo_i32
-; SI: v_mul_hi_u32
-; SI: v_mul_hi_u32
-; SI: v_mul_lo_i32
-; SI: v_mul_lo_i32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_lo_i32
 ; SI: v_add_i32_e32
-; SI: v_mul_hi_u32
-; SI: v_mul_lo_i32
-; SI: v_mul_hi_u32
-; SI: v_mul_lo_i32
-; SI: v_mul_lo_i32
-; SI: v_mul_lo_i32
-; SI: v_mul_lo_i32
-; SI: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_hi_u32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_lo_i32
+; SI-DAG: v_mul_lo_i32
 
 ; SI: {{buffer|flat}}_store_dwordx4
-define void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 {
+define amdgpu_kernel void @v_mul_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %aptr, i128 addrspace(1)* %bptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %gep.a = getelementptr inbounds i128, i128 addrspace(1)* %aptr, i32 %tid
   %gep.b = getelementptr inbounds i128, i128 addrspace(1)* %bptr, i32 %tid
diff --git a/test/CodeGen/AMDGPU/mul_int24.ll b/test/CodeGen/AMDGPU/mul_int24.ll
index 6f7dfe2e13ebb566fae1bdd1c1880e62fa086f8e..3137569e9ca7fb233f268236b8db457a9a6d19dc 100644
--- a/test/CodeGen/AMDGPU/mul_int24.ll
+++ b/test/CodeGen/AMDGPU/mul_int24.ll
@@ -13,7 +13,7 @@
 ; Make sure we are not masking the inputs
 ; CM-NOT: AND
 ; CM: MUL_INT24
-define void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @test_smul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %a.shl = shl i32 %a, 8
   %a.24 = ashr i32 %a.shl, 8
@@ -39,7 +39,7 @@ entry:
 ; CM: MULHI_INT24
 ; CM: MULHI_INT24
 ; CM: MULHI_INT24
-define void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @test_smulhi24_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %a.shl = shl i32 %a, 8
   %a.24 = ashr i32 %a.shl, 8
@@ -70,7 +70,7 @@ entry:
 ; GCN-DAG: v_mul_i32_i24_e32
 
 ; GCN: buffer_store_dwordx2
-define void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %shl.i = shl i32 %a, 8
   %shr.i = ashr i32 %shl.i, 8
   %conv.i = sext i32 %shr.i to i64
@@ -87,7 +87,7 @@ define void @test_smul24_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 ; GCN-DAG: v_mul_hi_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]]
 ; GCN-DAG: v_mul_i32_i24_e64 v{{[0-9]+}}, [[A]], [[A]]
 ; GCN: buffer_store_dwordx2
-define void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %shl.i = shl i32 %a, 8
   %shr.i = ashr i32 %shl.i, 8
   %conv.i = sext i32 %shr.i to i64
@@ -112,7 +112,7 @@ define void @test_smul24_i64_square(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 ; VI: v_ashrrev_i64 v{{\[[0-9]+:[0-9]+\]}}, 31, v{{\[[0-9]+:[0-9]+\]}}
 
 ; GCN: buffer_store_dwordx2
-define void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 {
+define amdgpu_kernel void @test_smul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) #0 {
 entry:
   %a.shl = shl i33 %a, 9
   %a.24 = ashr i33 %a.shl, 9
@@ -133,7 +133,7 @@ entry:
 ; SI: v_mul_hi_i32_i24_e32 v[[MUL_HI:[0-9]+]],
 ; SI-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
 ; SI-NEXT: buffer_store_dword v[[HI]]
-define void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
+define amdgpu_kernel void @test_smulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
 entry:
   %tmp0 = shl i33 %a, 9
   %a_24 = ashr i33 %tmp0, 9
@@ -151,7 +151,7 @@ entry:
 ; GCN: v_mul_i32_i24_e32 v[[VAL_LO:[0-9]+]]
 ; GCN: v_mov_b32_e32 v[[VAL_HI:[0-9]+]], v[[VAL_LO]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VAL_LO]]:[[VAL_HI]]{{\]}}
-define void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {
+define amdgpu_kernel void @simplify_i24_crash(<2 x i32> addrspace(1)* %out, i32 %arg0, <2 x i32> %arg1, <2 x i32> %arg2) {
 bb:
   %cmp = icmp eq i32 %arg0, 0
   br i1 %cmp, label %bb11, label %bb7
diff --git a/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll b/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
index 004d36f00e51c3a910e6be2d2fdbabf5253d36d1..59fdc8be5cea2db51660bf3be4cea891cf45b41c 100644
--- a/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
+++ b/test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll
@@ -6,7 +6,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() nounwind readnone
 
 ; FUNC-LABEL: {{^}}test_umul24_i32:
 ; GCN: v_mul_u32_u24
-define void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = shl i32 %a, 8
   %a_24 = lshr i32 %0, 8
@@ -22,7 +22,7 @@ entry:
 ; SI: v_bfe_i32 v{{[0-9]}}, [[VI_MUL]], 0, 16
 ; VI: s_mul_i32 [[SI_MUL:s[0-9]]], s{{[0-9]}}, s{{[0-9]}}
 ; VI: s_sext_i32_i16 s{{[0-9]}}, [[SI_MUL]]
-define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
+define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
 entry:
   %mul = mul i16 %a, %b
   %ext = sext i16 %mul to i32
@@ -34,7 +34,7 @@ entry:
 ; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
 ; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
 ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 16
-define void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
   %ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x
@@ -54,7 +54,7 @@ define void @test_umul24_i16_vgpr_sext(i32 addrspace(1)* %out, i16 addrspace(1)*
 ; VI: s_mul_i32
 ; VI: s_and_b32
 ; VI: v_mov_b32_e32
-define void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) {
+define amdgpu_kernel void @test_umul24_i16(i32 addrspace(1)* %out, i16 %a, i16 %b) {
 entry:
   %mul = mul i16 %a, %b
   %ext = zext i16 %mul to i32
@@ -66,7 +66,7 @@ entry:
 ; SI: v_mul_u32_u24_e32
 ; SI: v_and_b32_e32
 ; VI: v_mul_lo_u16
-define void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
   %ptr_a = getelementptr i16, i16 addrspace(1)* %in, i32 %tid.x
@@ -83,7 +83,7 @@ define void @test_umul24_i16_vgpr(i32 addrspace(1)* %out, i16 addrspace(1)* %in)
 ; SI: v_mul_u32_u24_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
 ; VI: v_mul_lo_u16_e{{(32|64)}} [[MUL:v[0-9]]], {{[sv][0-9], [sv][0-9]}}
 ; GCN: v_bfe_i32 v{{[0-9]}}, [[MUL]], 0, 8
-define void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) {
+define amdgpu_kernel void @test_umul24_i8_vgpr(i32 addrspace(1)* %out, i8 addrspace(1)* %a, i8 addrspace(1)* %b) {
 entry:
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.y = call i32 @llvm.amdgcn.workitem.id.y()
@@ -101,7 +101,7 @@ entry:
 ; GCN-NOT: and
 ; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]],
 ; GCN-NEXT: buffer_store_dword [[RESULT]]
-define void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %a.24 = and i32 %a, 16777215
   %b.24 = and i32 %b, 16777215
@@ -118,7 +118,7 @@ entry:
 ; GCN-NOT: and
 ; GCN: v_mul_hi_u32_u24_e32 [[RESULT:v[0-9]+]],
 ; GCN-NEXT: buffer_store_dword [[RESULT]]
-define void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %a.24 = and i64 %a, 16777215
   %b.24 = and i64 %b, 16777215
@@ -136,7 +136,7 @@ entry:
 ; GCN-DAG: v_mul_u32_u24_e32
 ; GCN-DAG: v_mul_hi_u32_u24_e32
 ; GCN: buffer_store_dwordx2
-define void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %tmp0 = shl i64 %a, 40
   %a_24 = lshr i64 %tmp0, 40
@@ -152,7 +152,7 @@ entry:
 ; GCN-NOT: s_and_b32
 ; GCN-DAG: v_mul_hi_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
 ; GCN-DAG: v_mul_u32_u24_e64 v{{[0-9]+}}, [[A]], [[A]]
-define void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @test_umul24_i64_square(i64 addrspace(1)* %out, i64 %a) {
 entry:
   %tmp0 = shl i64 %a, 40
   %a.24 = lshr i64 %tmp0, 40
@@ -166,7 +166,7 @@ entry:
 ; GCN: s_and_b32
 ; GCN: v_mul_u32_u24_e32 [[MUL24:v[0-9]+]]
 ; GCN: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[MUL24]]
-define void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test_umulhi16_i32(i16 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %a.16 = and i32 %a, 65535
   %b.16 = and i32 %b, 65535
@@ -186,7 +186,7 @@ entry:
 ; GCN-DAG: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]],
 ; GCN-DAG: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[MUL_LO]]:[[HI]]{{\]}}
-define void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) {
+define amdgpu_kernel void @test_umul24_i33(i64 addrspace(1)* %out, i33 %a, i33 %b) {
 entry:
   %tmp0 = shl i33 %a, 9
   %a_24 = lshr i33 %tmp0, 9
@@ -206,7 +206,7 @@ entry:
 ; GCN: v_mul_hi_u32_u24_e32 v[[MUL_HI:[0-9]+]],
 ; GCN-NEXT: v_and_b32_e32 v[[HI:[0-9]+]], 1, v[[MUL_HI]]
 ; GCN-NEXT: buffer_store_dword v[[HI]]
-define void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
+define amdgpu_kernel void @test_umulhi24_i33(i32 addrspace(1)* %out, i33 %a, i33 %b) {
 entry:
   %tmp0 = shl i33 %a, 9
   %a_24 = lshr i33 %tmp0, 9
diff --git a/test/CodeGen/AMDGPU/mul_uint24-r600.ll b/test/CodeGen/AMDGPU/mul_uint24-r600.ll
index da1c111fa5c0826939043d7a74c882c6b14e36c3..0a646b7126d083ebf4afdac5ce3508d91c28a719 100644
--- a/test/CodeGen/AMDGPU/mul_uint24-r600.ll
+++ b/test/CodeGen/AMDGPU/mul_uint24-r600.ll
@@ -3,7 +3,7 @@
 
 ; FUNC-LABEL: {{^}}test_umul24_i32:
 ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
-define void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test_umul24_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %0 = shl i32 %a, 8
   %a_24 = lshr i32 %0, 8
@@ -19,7 +19,7 @@ entry:
 ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
 ; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
 ; EG: 16
-define void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
+define amdgpu_kernel void @test_umul24_i16_sext(i32 addrspace(1)* %out, i16 %a, i16 %b) {
 entry:
   %mul = mul i16 %a, %b
   %ext = sext i16 %mul to i32
@@ -31,7 +31,7 @@ entry:
 ; FUNC-LABEL: {{^}}test_umul24_i8:
 ; EG: MUL_UINT24 {{[* ]*}}T{{[0-9]}}.[[MUL_CHAN:[XYZW]]]
 ; EG: BFE_INT {{[* ]*}}T{{[0-9]}}.{{[XYZW]}}, PV.[[MUL_CHAN]], 0.0, literal.x
-define void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) {
+define amdgpu_kernel void @test_umul24_i8(i32 addrspace(1)* %out, i8 %a, i8 %b) {
 entry:
   %mul = mul i8 %a, %b
   %ext = sext i8 %mul to i32
@@ -41,7 +41,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}test_umulhi24_i32_i64:
 ; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, KC0[2].W
-define void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test_umulhi24_i32_i64(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   %a.24 = and i32 %a, 16777215
   %b.24 = and i32 %b, 16777215
@@ -56,7 +56,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}test_umulhi24:
 ; EG: MULHI_UINT24 {{[* ]*}}T{{[0-9]\.[XYZW]}}, KC0[2].W, KC0[3].Y
-define void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @test_umulhi24(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %a.24 = and i64 %a, 16777215
   %b.24 = and i64 %b, 16777215
@@ -71,7 +71,7 @@ entry:
 ; FUNC-LABEL: {{^}}test_umul24_i64:
 ; EG; MUL_UINT24
 ; EG: MULHI
-define void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @test_umul24_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 entry:
   %tmp0 = shl i64 %a, 40
   %a_24 = lshr i64 %tmp0, 40
diff --git a/test/CodeGen/AMDGPU/mulhu.ll b/test/CodeGen/AMDGPU/mulhu.ll
deleted file mode 100644
index 29b0944a553321130138499258e8e43e83af6cc0..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/mulhu.ll
+++ /dev/null
@@ -1,17 +0,0 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s
-
-;CHECK: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab
-;CHECK: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}}
-;CHECK-NEXT: v_lshrrev_b32_e32 v0, 1, v0
-
-define void @test(i32 %p) {
-   %i = udiv i32 %p, 3
-   %r = bitcast i32 %i to float
-   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
-   ret void
-}
-
-declare <4 x float> @llvm.SI.sample.(i32, <4 x i32>, <8 x i32>, <4 x i32>, i32) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9d0b6b395996b56ddff7dd92c2971878627e689f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/multi-divergent-exit-region.ll
@@ -0,0 +1,710 @@
+; RUN: opt -mtriple=amdgcn-- -S -amdgpu-unify-divergent-exit-nodes -verify -structurizecfg -verify -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; Add extra verifier runs. There were some cases where invalid IR
+; was produced but happened to be fixed by the later passes.
+
+; Make sure divergent control flow with multiple exits from a region
+; is properly handled. UnifyFunctionExitNodes should be run before
+; StructurizeCFG.
+
+; IR-LABEL: @multi_divergent_region_exit_ret_ret(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+; IR: %2 = extractvalue { i1, i64 } %1, 0
+; IR: %3 = extractvalue { i1, i64 } %1, 1
+; IR: br i1 %2, label %LeafBlock1, label %Flow
+
+; IR: Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+; IR: %7 = extractvalue { i1, i64 } %6, 0
+; IR: %8 = extractvalue { i1, i64 } %6, 1
+; IR: br i1 %7, label %LeafBlock, label %Flow1
+
+; IR: LeafBlock:
+; IR: br label %Flow1
+
+; IR: LeafBlock1:
+; IR: br label %Flow{{$}}
+
+; IR:  Flow2:
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: %13 = extractvalue { i1, i64 } %12, 0
+; IR: %14 = extractvalue { i1, i64 } %12, 1
+; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+
+; IR: exit0:
+; IR: store volatile i32 9, i32 addrspace(1)* undef
+; IR: br label %UnifiedReturnBlock
+
+; IR: Flow1:
+; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
+; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %8)
+; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
+; IR: %18 = extractvalue { i1, i64 } %17, 0
+; IR: %19 = extractvalue { i1, i64 } %17, 1
+; IR: br i1 %18, label %exit1, label %Flow2
+
+; IR: exit1:
+; IR: store volatile i32 17, i32 addrspace(3)* undef
+; IR:  br label %Flow2
+
+; IR: UnifiedReturnBlock:
+; IR: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR: ret void
+
+
+; GCN-LABEL: {{^}}multi_divergent_region_exit_ret_ret:
+; GCN: v_cmp_lt_i32_e32 vcc, 1
+; GCN: s_and_saveexec_b64
+; GCN: s_xor_b64
+
+
+; FIXME: Why is this compare essentially repeated?
+; GCN: v_cmp_eq_u32_e32 vcc, 1, [[REG:v[0-9]+]]
+; GCN-NEXT: v_cmp_ne_u32_e64 s{{\[[0-9]+:[0-9]+\]}}, 1, [[REG]]
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1, vcc
+; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, -1
+
+; GCN: ; %Flow1
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN: v_cmp_ne_u32_e32 vcc, 0
+
+; GCN: ; %exit1
+; GCN: ds_write_b32
+
+; GCN: %Flow2
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN: v_cmp_ne_u32_e32 vcc, 0
+; GCN-NEXT: s_and_saveexec_b64
+; GCN-NEXT: s_xor_b64
+
+; GCN: ; %exit0
+; GCN: buffer_store_dword
+
+; GCN: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @multi_divergent_region_exit_ret_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %tmp1 = add i32 0, %tmp
+  %tmp2 = zext i32 %tmp1 to i64
+  %tmp3 = add i64 0, %tmp2
+  %tmp4 = shl i64 %tmp3, 32
+  %tmp5 = ashr exact i64 %tmp4, 32
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp8 = sext i32 %tmp7 to i64
+  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+  %tmp13 = zext i32 %tmp10 to i64
+  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+  %Pivot = icmp slt i32 %tmp16, 2
+  br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock:                                        ; preds = %entry
+  %SwitchLeaf = icmp eq i32 %tmp16, 1
+  br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1:                                       ; preds = %entry
+  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+  br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0:                                     ; preds = %LeafBlock, %LeafBlock1
+  store volatile i32 9, i32 addrspace(1)* undef
+  ret void
+
+exit1:                                     ; preds = %LeafBlock, %LeafBlock1
+  store volatile i32 17, i32 addrspace(3)* undef
+  ret void
+}
+
+; IR-LABEL: @multi_divergent_region_exit_unreachable_unreachable(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: br i1 %13, label %exit0, label %UnifiedUnreachableBlock
+
+
+; IR: UnifiedUnreachableBlock:
+; IR-NEXT: unreachable
+
+
+; FIXME: Probably should insert an s_endpgm anyway.
+; GCN-LABEL: {{^}}multi_divergent_region_exit_unreachable_unreachable:
+; GCN: ; %UnifiedUnreachableBlock
+; GCN-NEXT: .Lfunc_end
+define amdgpu_kernel void @multi_divergent_region_exit_unreachable_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %tmp1 = add i32 0, %tmp
+  %tmp2 = zext i32 %tmp1 to i64
+  %tmp3 = add i64 0, %tmp2
+  %tmp4 = shl i64 %tmp3, 32
+  %tmp5 = ashr exact i64 %tmp4, 32
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp8 = sext i32 %tmp7 to i64
+  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+  %tmp13 = zext i32 %tmp10 to i64
+  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+  %Pivot = icmp slt i32 %tmp16, 2
+  br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock:                                        ; preds = %entry
+  %SwitchLeaf = icmp eq i32 %tmp16, 1
+  br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1:                                       ; preds = %entry
+  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+  br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0:                                     ; preds = %LeafBlock, %LeafBlock1
+  store volatile i32 9, i32 addrspace(1)* undef
+  unreachable
+
+exit1:                                     ; preds = %LeafBlock, %LeafBlock1
+  store volatile i32 17, i32 addrspace(3)* undef
+  unreachable
+}
+
+; IR-LABEL: @multi_exit_region_divergent_ret_uniform_ret(
+; IR: %divergent.cond0 = icmp slt i32 %tmp16, 2
+; IR: llvm.amdgcn.if
+; IR: br i1
+
+; IR: {{^}}Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+; IR: br i1 %7, label %LeafBlock, label %Flow1
+
+; IR: {{^}}LeafBlock:
+; IR: %divergent.cond1 = icmp eq i32 %tmp16, 1
+; IR: %9 = xor i1 %divergent.cond1, true
+; IR: br label %Flow1
+
+; IR: LeafBlock1:
+; IR: %uniform.cond0 = icmp eq i32 %arg3, 2
+; IR: %10 = xor i1 %uniform.cond0, true
+; IR: br label %Flow
+
+; IR: Flow2:
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+
+; IR: exit0:
+; IR: store volatile i32 9, i32 addrspace(1)* undef
+; IR: br label %UnifiedReturnBlock
+
+; IR: {{^}}Flow1:
+; IR: %15 = phi i1 [ %divergent.cond1, %LeafBlock ], [ %4, %Flow ]
+; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %8)
+; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
+; IR: %18 = extractvalue { i1, i64 } %17, 0
+; IR: %19 = extractvalue { i1, i64 } %17, 1
+; IR: br i1 %18, label %exit1, label %Flow2
+
+; IR: exit1:
+; IR: store volatile i32 17, i32 addrspace(3)* undef
+; IR: br label %Flow2
+
+; IR: UnifiedReturnBlock:
+; IR: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR: ret void
+define amdgpu_kernel void @multi_exit_region_divergent_ret_uniform_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
+entry:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %tmp1 = add i32 0, %tmp
+  %tmp2 = zext i32 %tmp1 to i64
+  %tmp3 = add i64 0, %tmp2
+  %tmp4 = shl i64 %tmp3, 32
+  %tmp5 = ashr exact i64 %tmp4, 32
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp8 = sext i32 %tmp7 to i64
+  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+  %tmp13 = zext i32 %tmp10 to i64
+  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+  %divergent.cond0 = icmp slt i32 %tmp16, 2
+  br i1 %divergent.cond0, label %LeafBlock, label %LeafBlock1
+
+LeafBlock:                                        ; preds = %entry
+  %divergent.cond1 = icmp eq i32 %tmp16, 1
+  br i1 %divergent.cond1, label %exit0, label %exit1
+
+LeafBlock1:                                       ; preds = %entry
+  %uniform.cond0 = icmp eq i32 %arg3, 2
+  br i1 %uniform.cond0, label %exit0, label %exit1
+
+exit0:                                     ; preds = %LeafBlock, %LeafBlock1
+  store volatile i32 9, i32 addrspace(1)* undef
+  ret void
+
+exit1:                                     ; preds = %LeafBlock, %LeafBlock1
+  store volatile i32 17, i32 addrspace(3)* undef
+  ret void
+}
+
+; IR-LABEL: @multi_exit_region_uniform_ret_divergent_ret(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+; IR: br i1 %2, label %LeafBlock1, label %Flow
+
+; IR: Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+
+define amdgpu_kernel void @multi_exit_region_uniform_ret_divergent_ret(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2, i32 %arg3) #0 {
+entry:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %tmp1 = add i32 0, %tmp
+  %tmp2 = zext i32 %tmp1 to i64
+  %tmp3 = add i64 0, %tmp2
+  %tmp4 = shl i64 %tmp3, 32
+  %tmp5 = ashr exact i64 %tmp4, 32
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp8 = sext i32 %tmp7 to i64
+  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+  %tmp13 = zext i32 %tmp10 to i64
+  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+  %Pivot = icmp slt i32 %tmp16, 2
+  br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock:                                        ; preds = %entry
+  %SwitchLeaf = icmp eq i32 %arg3, 1
+  br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1:                                       ; preds = %entry
+  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+  br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0:                                     ; preds = %LeafBlock, %LeafBlock1
+  store volatile i32 9, i32 addrspace(1)* undef
+  ret void
+
+exit1:                                     ; preds = %LeafBlock, %LeafBlock1
+  store volatile i32 17, i32 addrspace(3)* undef
+  ret void
+}
+
+; IR-LABEL: @multi_divergent_region_exit_ret_ret_return_value(
+; IR: Flow2:
+; IR: %11 = phi float [ 2.000000e+00, %exit1 ], [ undef, %Flow1 ]
+; IR: %12 = phi i1 [ false, %exit1 ], [ %16, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %20)
+
+; IR: UnifiedReturnBlock:
+; IR: %UnifiedRetVal = phi float [ %11, %Flow2 ], [ 1.000000e+00, %exit0 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %15)
+; IR: ret float %UnifiedRetVal
+define amdgpu_ps float @multi_divergent_region_exit_ret_ret_return_value(i32 %vgpr) #0 {
+entry:
+  %Pivot = icmp slt i32 %vgpr, 2
+  br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock:                                        ; preds = %entry
+  %SwitchLeaf = icmp eq i32 %vgpr, 1
+  br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1:                                       ; preds = %entry
+  %SwitchLeaf2 = icmp eq i32 %vgpr, 2
+  br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0:                                     ; preds = %LeafBlock, %LeafBlock1
+  store i32 9, i32 addrspace(1)* undef
+  ret float 1.0
+
+exit1:                                     ; preds = %LeafBlock, %LeafBlock1
+  store i32 17, i32 addrspace(3)* undef
+  ret float 2.0
+}
+
+; IR-LABEL: @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(
+
+; GCN-LABEL: {{^}}uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value:
+; GCN: s_cmp_gt_i32 s0, 1
+; GCN: s_cbranch_scc0 [[FLOW:BB[0-9]+_[0-9]+]]
+
+; GCN: v_cmp_ne_u32_e32 vcc, 7, v0
+
+; GCN: {{^}}[[FLOW]]:
+; GCN: s_cbranch_vccnz [[FLOW1:BB[0-9]+]]
+
+; GCN: v_mov_b32_e32 v0, 2.0
+; GCN: s_or_b64 exec, exec
+; GCN: s_and_b64 exec, exec
+; GCN: v_mov_b32_e32 v0, 1.0
+
+; GCN: {{^BB[0-9]+_[0-9]+}}: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: ; return
+
+define amdgpu_ps float @uniform_branch_to_multi_divergent_region_exit_ret_ret_return_value(i32 inreg %sgpr, i32 %vgpr) #0 {
+entry:
+  %uniform.cond = icmp slt i32 %sgpr, 2
+  br i1 %uniform.cond, label %LeafBlock, label %LeafBlock1
+
+LeafBlock:                                        ; preds = %entry
+  %divergent.cond0 = icmp eq i32 %vgpr, 3
+  br i1 %divergent.cond0, label %exit0, label %exit1
+
+LeafBlock1:                                       ; preds = %entry
+  %divergent.cond1 = icmp eq i32 %vgpr, 7
+  br i1 %divergent.cond1, label %exit0, label %exit1
+
+exit0:                                     ; preds = %LeafBlock, %LeafBlock1
+  store i32 9, i32 addrspace(1)* undef
+  ret float 1.0
+
+exit1:                                     ; preds = %LeafBlock, %LeafBlock1
+  store i32 17, i32 addrspace(3)* undef
+  ret float 2.0
+}
+
+; IR-LABEL: @multi_divergent_region_exit_ret_unreachable(
+; IR: %1 = call { i1, i64 } @llvm.amdgcn.if(i1 %0)
+
+; IR: Flow:
+; IR: %4 = phi i1 [ true, %LeafBlock1 ], [ false, %entry ]
+; IR: %5 = phi i1 [ %10, %LeafBlock1 ], [ false, %entry ]
+; IR: %6 = call { i1, i64 } @llvm.amdgcn.else(i64 %3)
+
+; IR: Flow2:
+; IR: %11 = phi i1 [ false, %exit1 ], [ %15, %Flow1 ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %19)
+; IR: %12 = call { i1, i64 } @llvm.amdgcn.if(i1 %11)
+; IR: br i1 %13, label %exit0, label %UnifiedReturnBlock
+
+; IR: exit0:
+; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
+; IR-NEXT: br label %UnifiedReturnBlock
+
+; IR: Flow1:
+; IR: %15 = phi i1 [ %SwitchLeaf, %LeafBlock ], [ %4, %Flow ]
+; IR: %16 = phi i1 [ %9, %LeafBlock ], [ %5, %Flow ]
+; IR: call void @llvm.amdgcn.end.cf(i64 %8)
+; IR: %17 = call { i1, i64 } @llvm.amdgcn.if(i1 %16)
+; IR: %18 = extractvalue { i1, i64 } %17, 0
+; IR: %19 = extractvalue { i1, i64 } %17, 1
+; IR: br i1 %18, label %exit1, label %Flow2
+
+; IR: exit1:
+; IR-NEXT: store volatile i32 9, i32 addrspace(1)* undef
+; IR-NEXT: call void @llvm.amdgcn.unreachable()
+; IR-NEXT: br label %Flow2
+
+; IR: UnifiedReturnBlock:
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR-NEXT: ret void
+define amdgpu_kernel void @multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %tmp1 = add i32 0, %tmp
+  %tmp2 = zext i32 %tmp1 to i64
+  %tmp3 = add i64 0, %tmp2
+  %tmp4 = shl i64 %tmp3, 32
+  %tmp5 = ashr exact i64 %tmp4, 32
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp8 = sext i32 %tmp7 to i64
+  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+  %tmp13 = zext i32 %tmp10 to i64
+  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+  %Pivot = icmp slt i32 %tmp16, 2
+  br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock:                                        ; preds = %entry
+  %SwitchLeaf = icmp eq i32 %tmp16, 1
+  br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1:                                       ; preds = %entry
+  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+  br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0:                                     ; preds = %LeafBlock, %LeafBlock1
+  store volatile i32 17, i32 addrspace(3)* undef
+  ret void
+
+exit1:                                     ; preds = %LeafBlock, %LeafBlock1
+  store volatile i32 9, i32 addrspace(1)* undef
+  unreachable
+}
+
+; The non-uniformity of the branch to the exiting blocks requires
+; looking at transitive predecessors.
+
+; IR-LABEL: @indirect_multi_divergent_region_exit_ret_unreachable(
+
+; IR: exit0:                                            ; preds = %Flow2
+; IR-NEXT: store volatile i32 17, i32 addrspace(3)* undef
+; IR-NEXT: br label %UnifiedReturnBlock
+
+
+; IR: indirect.exit1:
+; IR: %load = load volatile i32, i32 addrspace(1)* undef
+; IR: store volatile i32 %load, i32 addrspace(1)* undef
+; IR: store volatile i32 9, i32 addrspace(1)* undef
+; IR: call void @llvm.amdgcn.unreachable()
+; IR-NEXT: br label %Flow2
+
+; IR: UnifiedReturnBlock:                               ; preds = %exit0, %Flow2
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %14)
+; IR-NEXT: ret void
+define amdgpu_kernel void @indirect_multi_divergent_region_exit_ret_unreachable(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %tmp1 = add i32 0, %tmp
+  %tmp2 = zext i32 %tmp1 to i64
+  %tmp3 = add i64 0, %tmp2
+  %tmp4 = shl i64 %tmp3, 32
+  %tmp5 = ashr exact i64 %tmp4, 32
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp8 = sext i32 %tmp7 to i64
+  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+  %tmp13 = zext i32 %tmp10 to i64
+  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+  %Pivot = icmp slt i32 %tmp16, 2
+  br i1 %Pivot, label %LeafBlock, label %LeafBlock1
+
+LeafBlock:                                        ; preds = %entry
+  %SwitchLeaf = icmp eq i32 %tmp16, 1
+  br i1 %SwitchLeaf, label %exit0, label %indirect.exit1
+
+LeafBlock1:                                       ; preds = %entry
+  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+  br i1 %SwitchLeaf2, label %exit0, label %indirect.exit1
+
+exit0:                                     ; preds = %LeafBlock, %LeafBlock1
+  store volatile i32 17, i32 addrspace(3)* undef
+  ret void
+
+indirect.exit1:                                   ; preds = %LeafBlock, %LeafBlock1
+  %load = load volatile i32, i32 addrspace(1)* undef
+  store volatile i32 %load, i32 addrspace(1)* undef
+  br label %exit1
+
+exit1:                                     ; preds = %indirect.exit1
+  store volatile i32 9, i32 addrspace(1)* undef
+  unreachable
+}
+
+; IR-LABEL: @multi_divergent_region_exit_ret_switch(
+define amdgpu_kernel void @multi_divergent_region_exit_ret_switch(i32 addrspace(1)* nocapture %arg0, i32 addrspace(1)* nocapture %arg1, i32 addrspace(1)* nocapture %arg2) #0 {
+entry:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %tmp1 = add i32 0, %tmp
+  %tmp2 = zext i32 %tmp1 to i64
+  %tmp3 = add i64 0, %tmp2
+  %tmp4 = shl i64 %tmp3, 32
+  %tmp5 = ashr exact i64 %tmp4, 32
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(1)* %arg0, i64 %tmp5
+  %tmp7 = load i32, i32 addrspace(1)* %tmp6, align 4
+  %tmp8 = sext i32 %tmp7 to i64
+  %tmp9 = getelementptr inbounds i32, i32 addrspace(1)* %arg1, i64 %tmp8
+  %tmp10 = load i32, i32 addrspace(1)* %tmp9, align 4
+  %tmp13 = zext i32 %tmp10 to i64
+  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp13
+  %tmp16 = load i32, i32 addrspace(1)* %tmp14, align 16
+  switch i32 %tmp16, label %exit1
+    [ i32 1, label %LeafBlock
+      i32 2, label %LeafBlock1
+      i32 3, label %exit0 ]
+
+LeafBlock:                                        ; preds = %entry
+  %SwitchLeaf = icmp eq i32 %tmp16, 1
+  br i1 %SwitchLeaf, label %exit0, label %exit1
+
+LeafBlock1:                                       ; preds = %entry
+  %SwitchLeaf2 = icmp eq i32 %tmp16, 2
+  br i1 %SwitchLeaf2, label %exit0, label %exit1
+
+exit0:                                     ; preds = %entry, %LeafBlock, %LeafBlock1
+  store volatile i32 17, i32 addrspace(3)* undef
+  ret void
+
+exit1:                                     ; preds = %entry, %LeafBlock, %LeafBlock1
+  store volatile i32 9, i32 addrspace(1)* undef
+  unreachable
+}
+
+; IR-LABEL: @divergent_multi_ret_nest_in_uniform_triangle(
+define amdgpu_kernel void @divergent_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
+entry:
+  %uniform.cond0 = icmp eq i32 %arg0, 4
+  br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret
+
+divergent.multi.exit.region:                      ; preds = %entry
+  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %divergent.cond0 = icmp eq i32 %id.x, 0
+  br i1 %divergent.cond0, label %divergent.ret0, label %divergent.ret1
+
+divergent.ret0:                                   ; preds = %divergent.multi.exit.region
+  store volatile i32 11, i32 addrspace(3)* undef
+  ret void
+
+divergent.ret1:                                   ; preds = %divergent.multi.exit.region
+  store volatile i32 42, i32 addrspace(3)* undef
+  ret void
+
+uniform.ret:                                      ; preds = %entry
+  store volatile i32 9, i32 addrspace(1)* undef
+  ret void
+}
+
+; IR-LABEL: @divergent_complex_multi_ret_nest_in_uniform_triangle(
+define amdgpu_kernel void @divergent_complex_multi_ret_nest_in_uniform_triangle(i32 %arg0) #0 {
+entry:
+  %uniform.cond0 = icmp eq i32 %arg0, 4
+  br i1 %uniform.cond0, label %divergent.multi.exit.region, label %uniform.ret
+
+divergent.multi.exit.region:                      ; preds = %entry
+  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %divergent.cond0 = icmp eq i32 %id.x, 0
+  br i1 %divergent.cond0, label %divergent.if, label %divergent.ret1
+
+divergent.if:                                     ; preds = %divergent.multi.exit.region
+  %vgpr0 = load volatile float, float addrspace(1)* undef
+  %divergent.cond1 = fcmp ogt float %vgpr0, 1.0
+  br i1 %divergent.cond1, label %divergent.then, label %divergent.endif
+
+divergent.then:                                   ; preds = %divergent.if
+  %vgpr1 = load volatile float, float addrspace(1)* undef
+  %divergent.cond2 = fcmp olt float %vgpr1, 4.0
+  store volatile i32 33, i32 addrspace(1)* undef
+  br i1 %divergent.cond2, label %divergent.ret0, label %divergent.endif
+
+divergent.endif:                                  ; preds = %divergent.then, %divergent.if
+  store volatile i32 38, i32 addrspace(1)* undef
+  br label %divergent.ret0
+
+divergent.ret0:                                   ; preds = %divergent.endif, %divergent.then
+  store volatile i32 11, i32 addrspace(3)* undef
+  ret void
+
+divergent.ret1:                                   ; preds = %divergent.multi.exit.region
+  store volatile i32 42, i32 addrspace(3)* undef
+  ret void
+
+uniform.ret:                                      ; preds = %entry
+  store volatile i32 9, i32 addrspace(1)* undef
+  ret void
+}
+
+; IR-LABEL: @uniform_complex_multi_ret_nest_in_divergent_triangle(
+; IR: Flow1:                                            ; preds = %uniform.ret1, %uniform.multi.exit.region
+; IR: %8 = phi i1 [ false, %uniform.ret1 ], [ true, %uniform.multi.exit.region ]
+; IR: br i1 %8, label %uniform.if, label %Flow2
+
+; IR: Flow:                                             ; preds = %uniform.then, %uniform.if
+; IR: %11 = phi i1 [ %10, %uniform.then ], [ %9, %uniform.if ]
+; IR: br i1 %11, label %uniform.endif, label %uniform.ret0
+
+; IR: UnifiedReturnBlock:                               ; preds = %Flow3, %Flow2
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %6)
+; IR-NEXT: ret void
+define amdgpu_kernel void @uniform_complex_multi_ret_nest_in_divergent_triangle(i32 %arg0) #0 {
+entry:
+  %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %divergent.cond0 = icmp eq i32 %id.x, 0
+  br i1 %divergent.cond0, label %uniform.multi.exit.region, label %divergent.ret
+
+uniform.multi.exit.region:                        ; preds = %entry
+  %uniform.cond0 = icmp eq i32 %arg0, 4
+  br i1 %uniform.cond0, label %uniform.if, label %uniform.ret1
+
+uniform.if:                                       ; preds = %uniform.multi.exit.region
+  %sgpr0 = load volatile i32, i32 addrspace(2)* undef
+  %uniform.cond1 = icmp slt i32 %sgpr0, 1
+  br i1 %uniform.cond1, label %uniform.then, label %uniform.endif
+
+uniform.then:                                     ; preds = %uniform.if
+  %sgpr1 = load volatile i32, i32 addrspace(2)* undef
+  %uniform.cond2 = icmp sge i32 %sgpr1, 4
+  store volatile i32 33, i32 addrspace(1)* undef
+  br i1 %uniform.cond2, label %uniform.ret0, label %uniform.endif
+
+uniform.endif:                                    ; preds = %uniform.then, %uniform.if
+  store volatile i32 38, i32 addrspace(1)* undef
+  br label %uniform.ret0
+
+uniform.ret0:                                     ; preds = %uniform.endif, %uniform.then
+  store volatile i32 11, i32 addrspace(3)* undef
+  ret void
+
+uniform.ret1:                                     ; preds = %uniform.multi.exit.region
+  store volatile i32 42, i32 addrspace(3)* undef
+  ret void
+
+divergent.ret:                                    ; preds = %entry
+  store volatile i32 9, i32 addrspace(1)* undef
+  ret void
+}
+
+; IR-LABEL: @multi_divergent_unreachable_exit(
+; IR: UnifiedUnreachableBlock:
+; IR-NEXT: call void @llvm.amdgcn.unreachable()
+; IR-NEXT: br label %UnifiedReturnBlock
+
+; IR: UnifiedReturnBlock:
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64
+; IR-NEXT: ret void
+define amdgpu_kernel void @multi_divergent_unreachable_exit() #0 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  switch i32 %tmp, label %bb3 [
+    i32 2, label %bb1
+    i32 0, label %bb2
+  ]
+
+bb1:                                              ; preds = %bb
+  unreachable
+
+bb2:                                              ; preds = %bb
+  unreachable
+
+bb3:                                              ; preds = %bb
+  switch i32 undef, label %bb5 [
+    i32 2, label %bb4
+  ]
+
+bb4:                                              ; preds = %bb3
+  ret void
+
+bb5:                                              ; preds = %bb3
+  unreachable
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/multilevel-break.ll b/test/CodeGen/AMDGPU/multilevel-break.ll
index 95c7ce86232945013bf55b568effe0bb9e31a0d4..15de689b953e6bcbac0c6aabf2c5d16280c1e1ba 100644
--- a/test/CodeGen/AMDGPU/multilevel-break.ll
+++ b/test/CodeGen/AMDGPU/multilevel-break.ll
@@ -64,7 +64,7 @@ ENDIF:                                            ; preds = %LOOP
   br i1 %tmp51, label %LOOP, label %LOOP.outer
 }
 
-; OPT-LABEL: define void @multi_if_break_loop(
+; OPT-LABEL: define amdgpu_kernel void @multi_if_break_loop(
 ; OPT: llvm.amdgcn.break
 ; OPT: llvm.amdgcn.loop
 ; OPT: llvm.amdgcn.if.break
@@ -79,7 +79,7 @@ ENDIF:                                            ; preds = %LOOP
 ; Uses a copy intsead of an or
 ; GCN: s_mov_b64 [[COPY:s\[[0-9]+:[0-9]+\]]], [[BREAK_REG]]
 ; GCN: s_or_b64 [[BREAK_REG]], exec, [[COPY]]
-define void @multi_if_break_loop(i32 %arg) #0 {
+define amdgpu_kernel void @multi_if_break_loop(i32 %arg) #0 {
 bb:
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp = sub i32 %id, %arg
diff --git a/test/CodeGen/AMDGPU/nested-loop-conditions.ll b/test/CodeGen/AMDGPU/nested-loop-conditions.ll
new file mode 100644
index 0000000000000000000000000000000000000000..672549c8ea636160bd31e998a09164ac5f925d33
--- /dev/null
+++ b/test/CodeGen/AMDGPU/nested-loop-conditions.ll
@@ -0,0 +1,269 @@
+; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=IR %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; After structurizing, there are 3 levels of loops. The i1 phi
+; conditions mutually depend on each other, so it isn't safe to delete
+; the condition that appears to have no uses until the loop is
+; completely processed.
+
+
+; IR-LABEL: @reduced_nested_loop_conditions(
+
+; IR: bb5:
+; IR-NEXT: %phi.broken = phi i64 [ %loop.phi, %bb10 ], [ 0, %bb ]
+; IR-NEXT: %tmp6 = phi i32 [ 0, %bb ], [ %tmp11, %bb10 ]
+; IR-NEXT: %tmp7 = icmp eq i32 %tmp6, 1
+; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %tmp7)
+; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0
+; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1
+; IR-NEXT: br i1 %1, label %bb8, label %Flow
+
+; IR: bb8:
+; IR-NEXT: %3 = call i64 @llvm.amdgcn.break(i64 %phi.broken)
+; IR-NEXT: br label %bb13
+
+; IR: bb10:
+; IR-NEXT: %loop.phi = phi i64 [ %6, %Flow ]
+; IR-NEXT: %tmp11 = phi i32 [ %5, %Flow ]
+; IR-NEXT: %4 = call i1 @llvm.amdgcn.loop(i64 %loop.phi)
+; IR-NEXT: br i1 %4, label %bb23, label %bb5
+
+; IR: Flow:
+; IR-NEXT: %loop.phi1 = phi i64 [ %loop.phi2, %bb4 ], [ %phi.broken, %bb5 ]
+; IR-NEXT: %5 = phi i32 [ %tmp21, %bb4 ], [ undef, %bb5 ]
+; IR-NEXT: %6 = call i64 @llvm.amdgcn.else.break(i64 %2, i64 %loop.phi1)
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %2)
+; IR-NEXT: br label %bb10
+
+; IR: bb13:
+; IR-NEXT: %loop.phi3 = phi i64 [ %loop.phi4, %bb3 ], [ %3, %bb8 ]
+; IR-NEXT: %tmp14 = phi i1 [ false, %bb3 ], [ true, %bb8 ]
+; IR-NEXT: %tmp15 = bitcast i64 %tmp2 to <2 x i32>
+; IR-NEXT: br i1 %tmp14, label %bb16, label %bb20
+
+; IR: bb16:
+; IR-NEXT: %tmp17 = extractelement <2 x i32> %tmp15, i64 1
+; IR-NEXT: %tmp18 = getelementptr inbounds i32, i32 addrspace(3)* undef, i32 %tmp17
+; IR-NEXT: %tmp19 = load volatile i32, i32 addrspace(3)* %tmp18
+; IR-NEXT: br label %bb20
+
+; IR: bb20:
+; IR-NEXT: %loop.phi4 = phi i64 [ %phi.broken, %bb16 ], [ %phi.broken, %bb13 ]
+; IR-NEXT: %loop.phi2 = phi i64 [ %phi.broken, %bb16 ], [ %loop.phi3, %bb13 ]
+; IR-NEXT: %tmp21 = phi i32 [ %tmp19, %bb16 ], [ 0, %bb13 ]
+; IR-NEXT: br label %bb9
+
+; IR: bb23:
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %loop.phi)
+; IR-NEXT: ret void
+
+; GCN-LABEL: {{^}}reduced_nested_loop_conditions:
+
+; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 1
+; GCN-NEXT: s_cbranch_scc1
+
+; FIXME: Should fold to unconditional branch?
+; GCN: s_mov_b64 vcc, -1
+; GCN-NEXT: ; implicit-def
+; GCN: s_cbranch_vccz
+
+; GCN: ds_read_b32
+
+; GCN: [[BB9:BB[0-9]+_[0-9]+]]: ; %bb9
+; GCN-NEXT: ; =>This Inner Loop Header: Depth=1
+; GCN-NEXT: s_branch [[BB9]]
+define amdgpu_kernel void @reduced_nested_loop_conditions(i64 addrspace(3)* nocapture %arg) #0 {
+bb:                                               ; entry: volatile-loads an i64 from %arg indexed by the workitem id
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %tmp1 = getelementptr inbounds i64, i64 addrspace(3)* %arg, i32 %tmp
+  %tmp2 = load volatile i64, i64 addrspace(3)* %tmp1
+  br label %bb5
+
+bb3:                                              ; preds = %bb9
+  br i1 true, label %bb4, label %bb13 ; constant-true condition, so %bb4 is the taken successor
+
+bb4:                                              ; preds = %bb3
+  br label %bb10
+
+bb5:                                              ; preds = %bb10, %bb
+  %tmp6 = phi i32 [ 0, %bb ], [ %tmp11, %bb10 ]
+  %tmp7 = icmp eq i32 %tmp6, 1
+  br i1 %tmp7, label %bb8, label %bb10
+
+bb8:                                              ; preds = %bb5
+  br label %bb13
+
+bb9:                                              ; preds = %bb20, %bb9
+  br i1 false, label %bb3, label %bb9 ; constant-false condition, so %bb9 branches back to itself
+
+bb10:                                             ; preds = %bb5, %bb4
+  %tmp11 = phi i32 [ %tmp21, %bb4 ], [ undef, %bb5 ]
+  %tmp12 = phi i1 [ %tmp22, %bb4 ], [ true, %bb5 ] ; i1 exit condition fed by %tmp22 from %bb20
+  br i1 %tmp12, label %bb23, label %bb5
+
+bb13:                                             ; preds = %bb8, %bb3
+  %tmp14 = phi i1 [ %tmp22, %bb3 ], [ true, %bb8 ] ; reads %tmp22, which in turn reads %tmp14
+  %tmp15 = bitcast i64 %tmp2 to <2 x i32>
+  br i1 %tmp14, label %bb16, label %bb20
+
+bb16:                                             ; preds = %bb13
+  %tmp17 = extractelement <2 x i32> %tmp15, i64 1
+  %tmp18 = getelementptr inbounds i32, i32 addrspace(3)* undef, i32 %tmp17
+  %tmp19 = load volatile i32, i32 addrspace(3)* %tmp18
+  br label %bb20
+
+bb20:                                             ; preds = %bb16, %bb13
+  %tmp21 = phi i32 [ %tmp19, %bb16 ], [ 0, %bb13 ]
+  %tmp22 = phi i1 [ false, %bb16 ], [ %tmp14, %bb13 ] ; reads %tmp14, closing the cycle between the i1 phis
+  br label %bb9
+
+bb23:                                             ; preds = %bb10
+  ret void
+}
+
+; Earlier version of above, before a run of the structurizer.
+; IR-LABEL: @nested_loop_conditions(
+
+; IR: Flow7:
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %17)
+; IR-NEXT: %0 = call { i1, i64 } @llvm.amdgcn.if(i1 %15)
+; IR-NEXT: %1 = extractvalue { i1, i64 } %0, 0
+; IR-NEXT: %2 = extractvalue { i1, i64 } %0, 1
+; IR-NEXT: br i1 %1, label %bb4.bb13_crit_edge, label %Flow8
+
+; IR: Flow1:
+; IR-NEXT: %loop.phi = phi i64 [ %loop.phi9, %Flow6 ], [ %phi.broken, %bb14 ]
+; IR-NEXT: %13 = phi <4 x i32> [ %29, %Flow6 ], [ undef, %bb14 ]
+; IR-NEXT: %14 = phi i32 [ %30, %Flow6 ], [ undef, %bb14 ]
+; IR-NEXT: %15 = phi i1 [ %31, %Flow6 ], [ false, %bb14 ]
+; IR-NEXT: %16 = phi i1 [ false, %Flow6 ], [ %8, %bb14 ]
+; IR-NEXT: %17 = call i64 @llvm.amdgcn.else.break(i64 %11, i64 %loop.phi)
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %11)
+; IR-NEXT: %18 = call i1 @llvm.amdgcn.loop(i64 %17)
+; IR-NEXT: br i1 %18, label %Flow7, label %bb14
+
+; IR: Flow2:
+; IR-NEXT: %loop.phi10 = phi i64 [ %loop.phi11, %Flow5 ], [ %12, %bb16 ]
+; IR-NEXT: %19 = phi <4 x i32> [ %29, %Flow5 ], [ undef, %bb16 ]
+; IR-NEXT: %20 = phi i32 [ %30, %Flow5 ], [ undef, %bb16 ]
+; IR-NEXT: %21 = phi i1 [ %31, %Flow5 ], [ false, %bb16 ]
+; IR-NEXT: %22 = phi i1 [ false, %Flow5 ], [ false, %bb16 ]
+; IR-NEXT: %23 = phi i1 [ false, %Flow5 ], [ %8, %bb16 ]
+; IR-NEXT: %24 = call { i1, i64 } @llvm.amdgcn.if(i1 %23)
+; IR-NEXT: %25 = extractvalue { i1, i64 } %24, 0
+; IR-NEXT: %26 = extractvalue { i1, i64 } %24, 1
+; IR-NEXT: br i1 %25, label %bb21, label %Flow3
+
+; IR: bb21:
+; IR: %tmp12 = icmp slt i32 %tmp11, 9
+; IR-NEXT: %27 = xor i1 %tmp12, true
+; IR-NEXT: %28 = call i64 @llvm.amdgcn.if.break(i1 %27, i64 %phi.broken)
+; IR-NEXT: br label %Flow3
+
+; IR: Flow3:
+; IR-NEXT: %loop.phi11 = phi i64 [ %phi.broken, %bb21 ], [ %phi.broken, %Flow2 ]
+; IR-NEXT: %loop.phi9 = phi i64 [ %28, %bb21 ], [ %loop.phi10, %Flow2 ]
+; IR-NEXT: %29 = phi <4 x i32> [ %tmp9, %bb21 ], [ %19, %Flow2 ]
+; IR-NEXT: %30 = phi i32 [ %tmp10, %bb21 ], [ %20, %Flow2 ]
+; IR-NEXT: %31 = phi i1 [ %27, %bb21 ], [ %21, %Flow2 ]
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %26)
+; IR-NEXT: br i1 %22, label %bb31.loopexit, label %Flow4
+
+; IR: bb31:
+; IR-NEXT: call void @llvm.amdgcn.end.cf(i64 %7)
+; IR-NEXT: store volatile i32 0, i32 addrspace(1)* undef
+; IR-NEXT: ret void
+
+
+; GCN-LABEL: {{^}}nested_loop_conditions:
+
+; GCN: v_cmp_lt_i32_e32 vcc, 8, v
+; GCN: s_and_b64 vcc, exec, vcc
+; GCN: s_cbranch_vccnz [[BB31:BB[0-9]+_[0-9]+]]
+
+; GCN: [[BB14:BB[0-9]+_[0-9]+]]: ; %bb14
+; GCN: v_cmp_ne_u32_e32 vcc, 1, v
+; GCN-NEXT: s_and_b64 vcc, exec, vcc
+; GCN-NEXT: s_cbranch_vccnz [[BB31]]
+
+; GCN: [[BB18:BB[0-9]+_[0-9]+]]: ; %bb18
+; GCN: buffer_load_dword
+; GCN: v_cmp_lt_i32_e32 vcc, 8, v
+; GCN-NEXT: s_and_b64 vcc, exec, vcc
+; GCN-NEXT: s_cbranch_vccnz [[BB18]]
+
+; GCN: buffer_load_dword
+; GCN: buffer_load_dword
+; GCN: v_cmp_gt_i32_e32 vcc, 9
+; GCN-NEXT: s_and_b64 vcc, exec, vcc
+; GCN-NEXT: s_cbranch_vccnz [[BB14]]
+
+; GCN: [[BB31]]:
+; GCN: buffer_store_dword
+; GCN: s_endpgm
+define amdgpu_kernel void @nested_loop_conditions(i64 addrspace(1)* nocapture %arg) #0 {
+bb:                                               ; entry: loads the inputs, then enters the loop only if the volatile i32 is < 9
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  %tmp1 = zext i32 %tmp to i64
+  %tmp2 = getelementptr inbounds i64, i64 addrspace(1)* %arg, i64 %tmp1
+  %tmp3 = load i64, i64 addrspace(1)* %tmp2, align 16
+  %tmp932 = load <4 x i32>, <4 x i32> addrspace(1)* undef, align 16
+  %tmp1033 = extractelement <4 x i32> %tmp932, i64 0
+  %tmp1134 = load volatile i32, i32 addrspace(1)* undef
+  %tmp1235 = icmp slt i32 %tmp1134, 9
+  br i1 %tmp1235, label %bb14.lr.ph, label %bb13
+
+bb14.lr.ph:                                       ; preds = %bb
+  br label %bb14
+
+bb4.bb13_crit_edge:                               ; preds = %bb21
+  br label %bb13
+
+bb13:                                             ; preds = %bb4.bb13_crit_edge, %bb
+  br label %bb31
+
+bb14:                                             ; preds = %bb21, %bb14.lr.ph
+  %tmp1037 = phi i32 [ %tmp1033, %bb14.lr.ph ], [ %tmp10, %bb21 ]
+  %tmp936 = phi <4 x i32> [ %tmp932, %bb14.lr.ph ], [ %tmp9, %bb21 ]
+  %tmp15 = icmp eq i32 %tmp1037, 1
+  br i1 %tmp15, label %bb16, label %bb31.loopexit ; leaves the outer loop unless %tmp1037 == 1
+
+bb16:                                             ; preds = %bb14
+  %tmp17 = bitcast i64 %tmp3 to <2 x i32>
+  br label %bb18
+
+bb18:                                             ; preds = %bb18, %bb16
+  %tmp19 = load volatile i32, i32 addrspace(1)* undef
+  %tmp20 = icmp slt i32 %tmp19, 9
+  br i1 %tmp20, label %bb21, label %bb18 ; inner self-loop: repeats until the volatile load is < 9
+
+bb21:                                             ; preds = %bb18
+  %tmp22 = extractelement <2 x i32> %tmp17, i64 1
+  %tmp23 = lshr i32 %tmp22, 16
+  %tmp24 = select i1 undef, i32 undef, i32 %tmp23 ; NOTE(review): undef operands look like test-reduction leftovers - confirm
+  %tmp25 = uitofp i32 %tmp24 to float
+  %tmp26 = fmul float %tmp25, 0x3EF0001000000000
+  %tmp27 = fsub float %tmp26, undef
+  %tmp28 = fcmp olt float %tmp27, 5.000000e-01
+  %tmp29 = select i1 %tmp28, i64 1, i64 2
+  %tmp30 = extractelement <4 x i32> %tmp936, i64 %tmp29
+  %tmp7 = zext i32 %tmp30 to i64
+  %tmp8 = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* undef, i64 %tmp7
+  %tmp9 = load <4 x i32>, <4 x i32> addrspace(1)* %tmp8, align 16
+  %tmp10 = extractelement <4 x i32> %tmp9, i64 0
+  %tmp11 = load volatile i32, i32 addrspace(1)* undef
+  %tmp12 = icmp slt i32 %tmp11, 9
+  br i1 %tmp12, label %bb14, label %bb4.bb13_crit_edge ; back edge to %bb14 while the volatile load is < 9
+
+bb31.loopexit:                                    ; preds = %bb14
+  br label %bb31
+
+bb31:                                             ; preds = %bb31.loopexit, %bb13
+  store volatile i32 0, i32 addrspace(1)* undef
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll b/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
index 9dd99efd997c59b4b715a20fde3289338f1fe294..97dc67f82607babc372c119a6c1f7d2068d175b9 100644
--- a/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
+++ b/test/CodeGen/AMDGPU/no-initializer-constant-addrspace.ll
@@ -9,7 +9,7 @@
 @extern_const_addrspace = external unnamed_addr addrspace(2) constant [5 x i32], align 4
 
 ; CHECK-DAG: Name: load_extern_const_init
-define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
   %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @extern_const_addrspace, i64 0, i64 3), align 4
   store i32 %val, i32 addrspace(1)* %out, align 4
   ret void
@@ -19,7 +19,7 @@ define void @load_extern_const_init(i32 addrspace(1)* %out) nounwind {
 @undef_const_addrspace = unnamed_addr addrspace(2) constant [5 x i32] undef, align 4
 
 ; CHECK-DAG: Name: undef_const_addrspace
-define void @load_undef_const_init(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @load_undef_const_init(i32 addrspace(1)* %out) nounwind {
   %val = load i32, i32 addrspace(2)* getelementptr ([5 x i32], [5 x i32] addrspace(2)* @undef_const_addrspace, i64 0, i64 3), align 4
   store i32 %val, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/no-shrink-extloads.ll b/test/CodeGen/AMDGPU/no-shrink-extloads.ll
index fd66b0b5d1f62e45ce0d8441ae74538400f9c207..8a7bf6db5b8d4c73345ac1368fd339be0b840bdd 100644
--- a/test/CodeGen/AMDGPU/no-shrink-extloads.ll
+++ b/test/CodeGen/AMDGPU/no-shrink-extloads.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i16:
 ; SI: s_load_dword s
 ; SI: buffer_store_short v
-define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounwind {
   %trunc = trunc i32 %arg to i16
   store i16 %trunc, i16 addrspace(1)* %out
   ret void
@@ -21,7 +21,7 @@ define void @truncate_kernarg_i32_to_i16(i16 addrspace(1)* %out, i32 %arg) nounw
 ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i16:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_short v
-define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
@@ -34,7 +34,7 @@ define void @truncate_buffer_load_i32_to_i16(i16 addrspace(1)* %out, i32 addrspa
 ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i8:
 ; SI: s_load_dword s
 ; SI: buffer_store_byte v
-define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwind {
   %trunc = trunc i32 %arg to i8
   store i8 %trunc, i8 addrspace(1)* %out
   ret void
@@ -43,7 +43,7 @@ define void @truncate_kernarg_i32_to_i8(i8 addrspace(1)* %out, i32 %arg) nounwin
 ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i8:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_byte v
-define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
@@ -56,7 +56,7 @@ define void @truncate_buffer_load_i32_to_i8(i8 addrspace(1)* %out, i32 addrspace
 ; FUNC-LABEL: {{^}}truncate_kernarg_i32_to_i1:
 ; SI: s_load_dword s
 ; SI: buffer_store_byte v
-define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwind {
   %trunc = trunc i32 %arg to i1
   store i1 %trunc, i1 addrspace(1)* %out
   ret void
@@ -65,7 +65,7 @@ define void @truncate_kernarg_i32_to_i1(i1 addrspace(1)* %out, i32 %arg) nounwin
 ; FUNC-LABEL: {{^}}truncate_buffer_load_i32_to_i1:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_byte v
-define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.in = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i1, i1 addrspace(1)* %out, i32 %tid
@@ -78,7 +78,7 @@ define void @truncate_buffer_load_i32_to_i1(i1 addrspace(1)* %out, i32 addrspace
 ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i32:
 ; SI: s_load_dword s
 ; SI: buffer_store_dword v
-define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
   %trunc = trunc i64 %arg to i32
   store i32 %trunc, i32 addrspace(1)* %out
   ret void
@@ -87,7 +87,7 @@ define void @truncate_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounw
 ; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i32:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_dword v
-define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -100,7 +100,7 @@ define void @truncate_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspa
 ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i32:
 ; SI: s_load_dword s
 ; SI: buffer_store_dword v
-define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
+define amdgpu_kernel void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
   %srl = lshr i64 %arg, 32
   %trunc = trunc i64 %srl to i32
   store i32 %trunc, i32 addrspace(1)* %out
@@ -110,7 +110,7 @@ define void @srl_kernarg_i64_to_i32(i32 addrspace(1)* %out, i64 %arg) nounwind {
 ; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i32:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_dword v
-define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
@@ -125,7 +125,7 @@ define void @srl_buffer_load_i64_to_i32(i32 addrspace(1)* %out, i64 addrspace(1)
 ; FUNC-LABEL: {{^}}truncate_kernarg_i16_to_i8:
 ; SI: s_load_dword s
 ; SI: buffer_store_byte v
-define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwind {
   %trunc = trunc i16 %arg to i8
   store i8 %trunc, i8 addrspace(1)* %out
   ret void
@@ -134,7 +134,7 @@ define void @truncate_kernarg_i16_to_i8(i8 addrspace(1)* %out, i16 %arg) nounwin
 ; FUNC-LABEL: {{^}}truncate_buffer_load_i16_to_i8:
 ; SI: buffer_load_ubyte v
 ; SI: buffer_store_byte v
-define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.in = getelementptr i16, i16 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
@@ -147,7 +147,7 @@ define void @truncate_buffer_load_i16_to_i8(i8 addrspace(1)* %out, i16 addrspace
 ; FUNC-LABEL: {{^}}srl_kernarg_i64_to_i8:
 ; SI: s_load_dword s
 ; SI: buffer_store_byte v
-define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
+define amdgpu_kernel void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
   %srl = lshr i64 %arg, 32
   %trunc = trunc i64 %srl to i8
   store i8 %trunc, i8 addrspace(1)* %out
@@ -157,7 +157,7 @@ define void @srl_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
 ; FUNC-LABEL: {{^}}srl_buffer_load_i64_to_i8:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_byte v
-define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
@@ -171,7 +171,7 @@ define void @srl_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)*
 ; FUNC-LABEL: {{^}}truncate_kernarg_i64_to_i8:
 ; SI: s_load_dword s
 ; SI: buffer_store_byte v
-define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
+define amdgpu_kernel void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwind {
   %trunc = trunc i64 %arg to i8
   store i8 %trunc, i8 addrspace(1)* %out
   ret void
@@ -180,7 +180,7 @@ define void @truncate_kernarg_i64_to_i8(i8 addrspace(1)* %out, i64 %arg) nounwin
 ; FUNC-LABEL: {{^}}truncate_buffer_load_i64_to_i8:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_byte v
-define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i8, i8 addrspace(1)* %out, i32 %tid
@@ -194,7 +194,7 @@ define void @truncate_buffer_load_i64_to_i8(i8 addrspace(1)* %out, i64 addrspace
 ; SI: s_load_dword [[LOAD:s[0-9]+]], s[{{[0-9]+}}:{{[0-9]+}}], 0x0
 ; SI: s_waitcnt lgkmcnt(0)
 ; SI: s_and_b32 s{{[0-9]+}}, [[LOAD]], 0xffff
-define void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
+define amdgpu_kernel void @smrd_mask_i32_to_i16(i32 addrspace(1)* %out, i32 addrspace(2)* %in) {
 entry:
   %val = load i32, i32 addrspace(2)* %in
   %mask = and i32 %val, 65535
@@ -205,7 +205,7 @@ entry:
 ; FUNC-LABEL: {{^}}extract_hi_i64_bitcast_v2i32:
 ; SI: buffer_load_dword v
 ; SI: buffer_store_dword v
-define void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @extract_hi_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) nounwind {
   %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %bc = bitcast <2 x i32> %ld to i64
   %hi = lshr i64 %bc, 32
diff --git a/test/CodeGen/AMDGPU/nop-data.ll b/test/CodeGen/AMDGPU/nop-data.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b68f343097e5f78d3353c02aff1144305362ad32
--- /dev/null
+++ b/test/CodeGen/AMDGPU/nop-data.ll
@@ -0,0 +1,87 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -filetype=obj < %s | llvm-objdump -d - -mcpu=fiji | FileCheck %s
+
+; CHECK: kernel0:
+; CHECK-NEXT: s_endpgm
+define amdgpu_kernel void @kernel0() align 256 { ; empty kernel; the checks that follow expect s_nop 0 filling the gap up to kernel1
+entry:
+  ret void
+}
+
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_nop 0  // 0000000001FC: BF800000
+
+; CHECK-NEXT: {{^$}}
+; CHECK-NEXT: kernel1:
+; CHECK-NEXT: s_endpgm
+define amdgpu_kernel void @kernel1(i32 addrspace(1)* addrspace(2)* %ptr.out) align 256 { ; second empty kernel, also align 256; %ptr.out is never used
+entry:
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/nullptr.ll b/test/CodeGen/AMDGPU/nullptr.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0df16da1356226147fd848bcaa2b1e2f29e9c374
--- /dev/null
+++ b/test/CodeGen/AMDGPU/nullptr.ll
@@ -0,0 +1,113 @@
+;RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s
+
+%struct.S = type { i32*, i32 addrspace(1)*, i32 addrspace(2)*, i32 addrspace(3)*, i32 addrspace(4)*, i32 addrspace(5)*}
+
+; CHECK-LABEL: nullptr_priv:
+; CHECK-NEXT: .long 0
+@nullptr_priv = global i32* addrspacecast (i32 addrspace(4)* null to i32*)
+
+; CHECK-LABEL: nullptr_glob:
+; CHECK-NEXT: .quad 0
+@nullptr_glob = global i32 addrspace(1)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(1)*)
+
+; CHECK-LABEL: nullptr_const:
+; CHECK-NEXT: .quad 0
+@nullptr_const = global i32 addrspace(2)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(2)*)
+
+; CHECK-LABEL: nullptr_local:
+; CHECK-NEXT: .long -1
+@nullptr_local = global i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*)
+
+; CHECK-LABEL: nullptr_region:
+; CHECK-NEXT: .long -1
+@nullptr_region = global i32 addrspace(5)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(5)*)
+
+; CHECK-LABEL: nullptr6:
+; CHECK-NEXT: .long 0
+@nullptr6 = global i32 addrspace(6)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(6)*)
+
+; CHECK-LABEL: nullptr7:
+; CHECK-NEXT: .long 0
+@nullptr7 = global i32 addrspace(7)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(7)*)
+
+; CHECK-LABEL: nullptr8:
+; CHECK-NEXT: .long 0
+@nullptr8 = global i32 addrspace(8)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(8)*)
+
+; CHECK-LABEL: nullptr9:
+; CHECK-NEXT: .long 0
+@nullptr9 = global i32 addrspace(9)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(9)*)
+
+; CHECK-LABEL: nullptr10:
+; CHECK-NEXT: .long 0
+@nullptr10 = global i32 addrspace(10)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(10)*)
+
+; CHECK-LABEL: nullptr11:
+; CHECK-NEXT: .long 0
+@nullptr11 = global i32 addrspace(11)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(11)*)
+
+; CHECK-LABEL: nullptr12:
+; CHECK-NEXT: .long 0
+@nullptr12 = global i32 addrspace(12)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(12)*)
+
+; CHECK-LABEL: nullptr13:
+; CHECK-NEXT: .long 0
+@nullptr13 = global i32 addrspace(13)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(13)*)
+
+; CHECK-LABEL: nullptr14:
+; CHECK-NEXT: .long 0
+@nullptr14 = global i32 addrspace(14)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(14)*)
+
+; CHECK-LABEL: nullptr15:
+; CHECK-NEXT: .long 0
+@nullptr15 = global i32 addrspace(15)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(15)*)
+
+; CHECK-LABEL: nullptr16:
+; CHECK-NEXT: .long 0
+@nullptr16 = global i32 addrspace(16)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(16)*)
+
+; CHECK-LABEL: nullptr17:
+; CHECK-NEXT: .long 0
+@nullptr17 = global i32 addrspace(17)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(17)*)
+
+; CHECK-LABEL: nullptr18:
+; CHECK-NEXT: .long 0
+@nullptr18 = global i32 addrspace(18)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(18)*)
+
+; CHECK-LABEL: nullptr19:
+; CHECK-NEXT: .long 0
+@nullptr19 = global i32 addrspace(19)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(19)*)
+
+; CHECK-LABEL: nullptr20:
+; CHECK-NEXT: .long 0
+@nullptr20 = global i32 addrspace(20)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(20)*)
+
+; CHECK-LABEL: nullptr21:
+; CHECK-NEXT: .long 0
+@nullptr21 = global i32 addrspace(21)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(21)*)
+
+; CHECK-LABEL: nullptr22:
+; CHECK-NEXT: .long 0
+@nullptr22 = global i32 addrspace(22)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(22)*)
+
+; CHECK-LABEL: nullptr23:
+; CHECK-NEXT: .long 0
+@nullptr23 = global i32 addrspace(23)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(23)*)
+
+; CHECK-LABEL: structWithPointers:
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .zero 4
+; CHECK-NEXT: .quad 0
+; CHECK-NEXT: .quad 0
+; CHECK-NEXT: .long -1
+; CHECK-NEXT: .zero 4
+; CHECK-NEXT: .quad 0
+; CHECK-NEXT: .long -1
+; CHECK-NEXT: .zero 4
+@structWithPointers = addrspace(1) global %struct.S { ; one null pointer per field of %struct.S; expected directives noted per field
+  i32* addrspacecast (i32 addrspace(4)* null to i32*), ; default address space: expected .long 0 followed by .zero 4
+  i32 addrspace(1)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(1)*), ; addrspace(1): expected .quad 0
+  i32 addrspace(2)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(2)*), ; addrspace(2): expected .quad 0
+  i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*), ; addrspace(3): expected .long -1 followed by .zero 4
+  i32 addrspace(4)* null, ; addrspace(4), no cast: expected .quad 0
+  i32 addrspace(5)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(5)*)}, align 4
diff --git a/test/CodeGen/AMDGPU/omod.ll b/test/CodeGen/AMDGPU/omod.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3fd7b13fcc586ccb307d27f3490123da7e899074
--- /dev/null
+++ b/test/CodeGen/AMDGPU/omod.ll
@@ -0,0 +1,297 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
+; IEEE bit enabled for compute kernel, so shouldn't use omod.
+; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_signed_zeros:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
+define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_signed_zeros(float addrspace(1)* %out, float addrspace(1)* %aptr) #4 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %add = fadd float %a, 1.0
+  %div2 = fmul float %add, 0.5
+  store float %div2, float addrspace(1)* %out.gep
+  ret void
+}
+
+; IEEE bit enabled for compute kernel, so shouldn't use omod even though nsz is allowed.
+; GCN-LABEL: {{^}}v_omod_div2_f32_enable_ieee_nsz:
+; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]]
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, [[A]]{{$}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
+define amdgpu_kernel void @v_omod_div2_f32_enable_ieee_nsz(float addrspace(1)* %out, float addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr float, float addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
+  %a = load float, float addrspace(1)* %gep0
+  %add = fadd float %a, 1.0
+  %div2 = fmul float %add, 0.5
+  store float %div2, float addrspace(1)* %out.gep
+  ret void
+}
+
+; Without the IEEE bit, only allow omod if signed zeros are not significant.
+; GCN-LABEL: {{^}}v_omod_div2_f32_signed_zeros:
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
+define amdgpu_ps void @v_omod_div2_f32_signed_zeros(float %a) #4 {
+  %add = fadd float %a, 1.0
+  %div2 = fmul float %add, 0.5
+  store float %div2, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_div2_f32:
+; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 div:2{{$}}
+define amdgpu_ps void @v_omod_div2_f32(float %a) #0 {
+  %add = fadd float %a, 1.0
+  %div2 = fmul float %add, 0.5
+  store float %div2, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_mul2_f32:
+; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:2{{$}}
+define amdgpu_ps void @v_omod_mul2_f32(float %a) #0 {
+  %add = fadd float %a, 1.0
+  %div2 = fmul float %add, 2.0
+  store float %div2, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_mul4_f32:
+; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:4{{$}}
+define amdgpu_ps void @v_omod_mul4_f32(float %a) #0 {
+  %add = fadd float %a, 1.0
+  %div2 = fmul float %add, 4.0
+  store float %div2, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_mul4_multi_use_f32:
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 4.0, [[ADD]]{{$}}
+define amdgpu_ps void @v_omod_mul4_multi_use_f32(float %a) #0 {
+  %add = fadd float %a, 1.0
+  %div2 = fmul float %add, 4.0
+  store float %div2, float addrspace(1)* undef
+  store volatile float %add, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_mul4_dbg_use_f32:
+; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 mul:4{{$}}
+define amdgpu_ps void @v_omod_mul4_dbg_use_f32(float %a) #0 {
+  %add = fadd float %a, 1.0
+  call void @llvm.dbg.value(metadata float %add, i64 0, metadata !4, metadata !9), !dbg !10
+  %div2 = fmul float %add, 4.0
+  store float %div2, float addrspace(1)* undef
+  ret void
+}
+
+; Clamp is applied after omod, folding both into instruction is OK.
+; GCN-LABEL: {{^}}v_clamp_omod_div2_f32:
+; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, 1.0 clamp div:2{{$}}
+define amdgpu_ps void @v_clamp_omod_div2_f32(float %a) #0 {
+  %add = fadd float %a, 1.0
+  %div2 = fmul float %add, 0.5
+
+  %max = call float @llvm.maxnum.f32(float %div2, float 0.0)
+  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+  store float %clamp, float addrspace(1)* undef
+  ret void
+}
+
+; Cannot fold omod into clamp
+; GCN-LABEL: {{^}}v_omod_div2_clamp_f32:
+; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], v0, 1.0 clamp{{$}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
+define amdgpu_ps void @v_omod_div2_clamp_f32(float %a) #0 {
+  %add = fadd float %a, 1.0
+  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+  %div2 = fmul float %clamp, 0.5
+  store float %div2, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_div2_abs_src_f32:
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
+; GCN: v_mul_f32_e64 v{{[0-9]+}}, |[[ADD]]|, 0.5{{$}}
+define amdgpu_ps void @v_omod_div2_abs_src_f32(float %a) #0 {
+  %add = fadd float %a, 1.0
+  %abs.add = call float @llvm.fabs.f32(float %add)
+  %div2 = fmul float %abs.add, 0.5
+  store float %div2, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_add_self_clamp_f32:
+; GCN: v_add_f32_e64 v{{[0-9]+}}, v0, v0 clamp{{$}}
+define amdgpu_ps void @v_omod_add_self_clamp_f32(float %a) #0 {
+  %add = fadd float %a, %a
+  %max = call float @llvm.maxnum.f32(float %add, float 0.0)
+  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+  store float %clamp, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_add_clamp_self_f32:
+; GCN: v_max_f32_e64 [[CLAMP:v[0-9]+]], v0, v0 clamp{{$}}
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[CLAMP]], [[CLAMP]]{{$}}
+define amdgpu_ps void @v_omod_add_clamp_self_f32(float %a) #0 {
+  %max = call float @llvm.maxnum.f32(float %a, float 0.0)
+  %clamp = call float @llvm.minnum.f32(float %max, float 1.0)
+  %add = fadd float %clamp, %clamp
+  store float %add, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_add_abs_self_f32:
+; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0
+; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, |[[X]]|{{$}}
+define amdgpu_ps void @v_omod_add_abs_self_f32(float %a) #0 {
+  %x = fadd float %a, 1.0
+  %abs.x = call float @llvm.fabs.f32(float %x)
+  %add = fadd float %abs.x, %abs.x
+  store float %add, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_add_abs_x_x_f32:
+
+; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0
+; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, [[X]]{{$}}
+define amdgpu_ps void @v_omod_add_abs_x_x_f32(float %a) #0 {
+  %x = fadd float %a, 1.0
+  %abs.x = call float @llvm.fabs.f32(float %x)
+  %add = fadd float %abs.x, %x
+  store float %add, float addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_add_x_abs_x_f32:
+; GCN: v_add_f32_e32 [[X:v[0-9]+]], 1.0, v0
+; GCN: v_add_f32_e64 v{{[0-9]+}}, [[X]], |[[X]]|{{$}}
+define amdgpu_ps void @v_omod_add_x_abs_x_f32(float %a) #0 {
+  %x = fadd float %a, 1.0
+  %abs.x = call float @llvm.fabs.f32(float %x)
+  %add = fadd float %x, %abs.x
+  store float %add, float addrspace(1)* undef
+  ret void
+}
+
+; Don't fold omod into another omod.
+; GCN-LABEL: {{^}}v_omod_div2_omod_div2_f32:
+; GCN: v_add_f32_e64 [[ADD:v[0-9]+]], v0, 1.0 div:2{{$}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
+define amdgpu_ps void @v_omod_div2_omod_div2_f32(float %a) #0 {
+  %add = fadd float %a, 1.0
+  %div2.0 = fmul float %add, 0.5
+  %div2.1 = fmul float %div2.0, 0.5
+  store float %div2.1, float addrspace(1)* undef
+  ret void
+}
+
+; Don't fold omod if denorms enabled
+; GCN-LABEL: {{^}}v_omod_div2_f32_denormals:
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
+; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
+define amdgpu_ps void @v_omod_div2_f32_denormals(float %a) #2 {
+  %add = fadd float %a, 1.0
+  %div2 = fmul float %add, 0.5
+  store float %div2, float addrspace(1)* undef
+  ret void
+}
+
+; Don't fold omod if denorms enabled for add form.
+; GCN-LABEL: {{^}}v_omod_mul2_f32_denormals:
+; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
+; GCN: v_add_f32_e32 v{{[0-9]+}}, [[ADD]], [[ADD]]{{$}}
+define amdgpu_ps void @v_omod_mul2_f32_denormals(float %a) #2 {
+  %add = fadd float %a, 1.0
+  %mul2 = fadd float %add, %add
+  store float %mul2, float addrspace(1)* undef
+  ret void
+}
+
+; Don't fold omod if denorms enabled
+; GCN-LABEL: {{^}}v_omod_div2_f16_denormals:
+; VI: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
+; VI: v_mul_f16_e32 v{{[0-9]+}}, 0.5, [[ADD]]{{$}}
+define amdgpu_ps void @v_omod_div2_f16_denormals(half %a) #0 {
+  %add = fadd half %a, 1.0
+  %div2 = fmul half %add, 0.5
+  store half %div2, half addrspace(1)* undef
+  ret void
+}
+
+; Don't fold omod if denorms enabled for add form.
+; GCN-LABEL: {{^}}v_omod_mul2_f16_denormals:
+; VI: v_add_f16_e32 [[ADD:v[0-9]+]], 1.0, v0{{$}}
+; VI: v_add_f16_e32 v{{[0-9]+}}, [[ADD]], [[ADD]]{{$}}
+define amdgpu_ps void @v_omod_mul2_f16_denormals(half %a) #0 {
+  %add = fadd half %a, 1.0
+  %mul2 = fadd half %add, %add
+  store half %mul2, half addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_div2_f16_no_denormals:
+; VI-NOT: v0
+; VI: v_add_f16_e64 [[ADD:v[0-9]+]], v0, 1.0 div:2{{$}}
+define amdgpu_ps void @v_omod_div2_f16_no_denormals(half %a) #3 {
+  %add = fadd half %a, 1.0
+  %div2 = fmul half %add, 0.5
+  store half %div2, half addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_omod_mac_to_mad:
+; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}} mul:2{{$}}
+define amdgpu_ps void @v_omod_mac_to_mad(float %b, float %a) #0 {
+  %mul = fmul float %a, %a
+  %add = fadd float %mul, %b
+  %mad = fmul float %add, 2.0
+  %res = fmul float %mad, %b
+  store float %res, float addrspace(1)* undef
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.floor.f32(float) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare float @llvm.amdgcn.fmed3.f32(float, float, float) #1
+declare double @llvm.fabs.f64(double) #1
+declare double @llvm.minnum.f64(double, double) #1
+declare double @llvm.maxnum.f64(double, double) #1
+declare half @llvm.fabs.f16(half) #1
+declare half @llvm.minnum.f16(half, half) #1
+declare half @llvm.maxnum.f16(half, half) #1
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind "no-signed-zeros-fp-math"="true" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind "target-features"="+fp32-denormals" "no-signed-zeros-fp-math"="true" }
+attributes #3 = { nounwind "target-features"="-fp64-fp16-denormals" "no-signed-zeros-fp-math"="true" }
+attributes #4 = { nounwind "no-signed-zeros-fp-math"="false" }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug)
+!1 = !DIFile(filename: "/tmp/foo.cl", directory: "/dev/null")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !DILocalVariable(name: "add", arg: 1, scope: !5, file: !1, line: 1)
+!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !6, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!6 = !DISubroutineType(types: !7)
+!7 = !{null, !8}
+!8 = !DIBasicType(name: "float", size: 32, align: 32)
+!9 = !DIExpression()
+!10 = !DILocation(line: 1, column: 42, scope: !5)
diff --git a/test/CodeGen/AMDGPU/opencl-image-metadata.ll b/test/CodeGen/AMDGPU/opencl-image-metadata.ll
index 0242f6d6145a1851d161b30b4d111124cf37ff53..c974471c65738b8fa5a2276ff3094fa3e68b8509 100644
--- a/test/CodeGen/AMDGPU/opencl-image-metadata.ll
+++ b/test/CodeGen/AMDGPU/opencl-image-metadata.ll
@@ -6,7 +6,7 @@
 
 ; EG: CF_END
 ; SI: s_endpgm
-define void @kernel(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @kernel(i32 addrspace(1)* %out) {
 entry:
   store i32 0, i32 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/operand-folding.ll b/test/CodeGen/AMDGPU/operand-folding.ll
index 4e5ea4b86b773f2b7adc9b5a83dea96ec5424ca1..3836a2b7e599f59b4e1924deac934aff9e967d1c 100644
--- a/test/CodeGen/AMDGPU/operand-folding.ll
+++ b/test/CodeGen/AMDGPU/operand-folding.ll
@@ -2,7 +2,7 @@
 
 ; CHECK-LABEL: {{^}}fold_sgpr:
 ; CHECK: v_add_i32_e32 v{{[0-9]+}}, vcc, s
-define void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) {
+define amdgpu_kernel void @fold_sgpr(i32 addrspace(1)* %out, i32 %fold) {
 entry:
   %tmp0 = icmp ne i32 %fold, 0
   br i1 %tmp0, label %if, label %endif
@@ -20,7 +20,7 @@ endif:
 
 ; CHECK-LABEL: {{^}}fold_imm:
 ; CHECK: v_or_b32_e32 v{{[0-9]+}}, 5
-define void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) {
+define amdgpu_kernel void @fold_imm(i32 addrspace(1)* %out, i32 %cmp) {
 entry:
   %fold = add i32 3, 2
   %tmp0 = icmp ne i32 %cmp, 0
@@ -46,7 +46,7 @@ endif:
 ; CHECK-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[HI]]
 ; CHECK: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}},
 
-define void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) {
+define amdgpu_kernel void @fold_64bit_constant_add(i64 addrspace(1)* %out, i32 %cmp, i64 %val) {
 entry:
   %tmp0 = add i64 %val, 1
   store i64 %tmp0, i64 addrspace(1)* %out
@@ -61,7 +61,7 @@ entry:
 ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
 ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 5, v{{[0-9]+}}
 
-define void @vector_inline(<4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @vector_inline(<4 x i32> addrspace(1)* %out) {
 entry:
   %tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = add i32 %tmp0, 1
@@ -80,7 +80,7 @@ entry:
 ; CHECK-LABEL: {{^}}imm_one_use:
 ; CHECK: v_xor_b32_e32 v{{[0-9]+}}, 0x64, v{{[0-9]+}}
 
-define void @imm_one_use(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @imm_one_use(i32 addrspace(1)* %out) {
 entry:
   %tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = xor i32 %tmp0, 100
@@ -94,7 +94,7 @@ entry:
 ; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
 ; CHECK: v_xor_b32_e32 v{{[0-9]}}, [[IMM]], v{{[0-9]}}
 
-define void @vector_imm(<4 x i32> addrspace(1)* %out) {
+define amdgpu_kernel void @vector_imm(<4 x i32> addrspace(1)* %out) {
 entry:
   %tmp0 = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = add i32 %tmp0, 1
@@ -114,7 +114,7 @@ entry:
 ; CHECK: buffer_load_dwordx2 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}}
 ; CHECK: v_mac_f32_e32 v[[LO]], 0x41200000, v[[HI]]
 ; CHECK: buffer_store_dword v[[LO]]
-define void @no_fold_tied_subregister() {
+define amdgpu_kernel void @no_fold_tied_subregister() {
   %tmp1 = load volatile <2 x float>, <2 x float> addrspace(1)* undef
   %tmp2 = extractelement <2 x float> %tmp1, i32 0
   %tmp3 = extractelement <2 x float> %tmp1, i32 1
diff --git a/test/CodeGen/AMDGPU/operand-spacing.ll b/test/CodeGen/AMDGPU/operand-spacing.ll
index 127f3da220e71de251bded54cbdbdd7e4e382e39..fc6f070b737a68a596d95d7878116e6bfc8833d9 100644
--- a/test/CodeGen/AMDGPU/operand-spacing.ll
+++ b/test/CodeGen/AMDGPU/operand-spacing.ll
@@ -11,7 +11,7 @@
 ; GCN: v_mov_b32_e32 [[VREGB:v[0-9]+]], [[SREGB]]
 ; GCN: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SREGA]], [[VREGB]]
 ; GCN: buffer_store_dword [[RESULT]],
-define void @add_f32(float addrspace(1)* %out, float %a, float %b) {
+define amdgpu_kernel void @add_f32(float addrspace(1)* %out, float %a, float %b) {
   %result = fadd float %a, %b
   store float %result, float addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir b/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
index 4584802ad5a7a390b2fc4a6367848f72715e9b71..2de6b59e59e9669eea85850a06b50881684a9b3c 100644
--- a/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
+++ b/test/CodeGen/AMDGPU/optimize-if-exec-masking.mir
@@ -3,7 +3,7 @@
 --- |
   target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 
-  define void @optimize_if_and_saveexec_xor(i32 %z, i32 %v) #0 {
+  define amdgpu_kernel void @optimize_if_and_saveexec_xor(i32 %z, i32 %v) #0 {
   main_body:
     %id = call i32 @llvm.amdgcn.workitem.id.x()
     %cc = icmp eq i32 %id, 0
@@ -23,7 +23,7 @@
     ret void
   }
 
-  define void @optimize_if_and_saveexec(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_and_saveexec(i32 %z, i32 %v)  #0 {
   main_body:
       br i1 undef, label %if, label %end
 
@@ -34,7 +34,7 @@
     ret void
   }
 
-  define void @optimize_if_or_saveexec(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_or_saveexec(i32 %z, i32 %v)  #0 {
   main_body:
       br i1 undef, label %if, label %end
 
@@ -46,7 +46,7 @@
   }
 
 
-  define void @optimize_if_and_saveexec_xor_valu_middle(i32 %z, i32 %v) #0 {
+  define amdgpu_kernel void @optimize_if_and_saveexec_xor_valu_middle(i32 %z, i32 %v) #0 {
   main_body:
     %id = call i32 @llvm.amdgcn.workitem.id.x()
     %cc = icmp eq i32 %id, 0
@@ -67,7 +67,7 @@
     ret void
   }
 
-  define void @optimize_if_and_saveexec_xor_wrong_reg(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_and_saveexec_xor_wrong_reg(i32 %z, i32 %v)  #0 {
   main_body:
       br i1 undef, label %if, label %end
 
@@ -78,7 +78,7 @@
     ret void
   }
 
-  define void @optimize_if_and_saveexec_xor_modify_copy_to_exec(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_and_saveexec_xor_modify_copy_to_exec(i32 %z, i32 %v)  #0 {
   main_body:
       br i1 undef, label %if, label %end
 
@@ -89,7 +89,7 @@
     ret void
   }
 
-  define void @optimize_if_and_saveexec_xor_live_out_setexec(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_and_saveexec_xor_live_out_setexec(i32 %z, i32 %v)  #0 {
   main_body:
       br i1 undef, label %if, label %end
 
@@ -100,7 +100,7 @@
     ret void
   }
 
-  define void @optimize_if_unknown_saveexec(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_unknown_saveexec(i32 %z, i32 %v)  #0 {
   main_body:
       br i1 undef, label %if, label %end
 
@@ -111,7 +111,7 @@
     ret void
   }
 
-  define void @optimize_if_andn2_saveexec(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_andn2_saveexec(i32 %z, i32 %v)  #0 {
   main_body:
       br i1 undef, label %if, label %end
 
@@ -122,7 +122,7 @@
     ret void
   }
 
-  define void @optimize_if_andn2_saveexec_no_commute(i32 %z, i32 %v)  #0 {
+  define amdgpu_kernel void @optimize_if_andn2_saveexec_no_commute(i32 %z, i32 %v)  #0 {
   main_body:
       br i1 undef, label %if, label %end
 
diff --git a/test/CodeGen/AMDGPU/or.ll b/test/CodeGen/AMDGPU/or.ll
index eca6909d4eb944915108a349e7fcddff5ca7b4f0..eb082843fb829a4b50352565385094823520a778 100644
--- a/test/CodeGen/AMDGPU/or.ll
+++ b/test/CodeGen/AMDGPU/or.ll
@@ -9,7 +9,7 @@
 
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
@@ -28,7 +28,7 @@ define void @or_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in)
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_or_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
@@ -39,7 +39,7 @@ define void @or_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in)
 
 ; FUNC-LABEL: {{^}}scalar_or_i32:
 ; SI: s_or_b32
-define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %or = or i32 %a, %b
   store i32 %or, i32 addrspace(1)* %out
   ret void
@@ -47,7 +47,7 @@ define void @scalar_or_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 
 ; FUNC-LABEL: {{^}}vector_or_i32:
 ; SI: v_or_b32_e32 v{{[0-9]}}
-define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) {
+define amdgpu_kernel void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b) {
   %loada = load i32, i32 addrspace(1)* %a
   %or = or i32 %loada, %b
   store i32 %or, i32 addrspace(1)* %out
@@ -56,7 +56,7 @@ define void @vector_or_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 %b)
 
 ; FUNC-LABEL: {{^}}scalar_or_literal_i32:
 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x1869f
-define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) {
   %or = or i32 %a, 99999
   store i32 %or, i32 addrspace(1)* %out, align 4
   ret void
@@ -68,7 +68,7 @@ define void @scalar_or_literal_i32(i32 addrspace(1)* %out, i32 %a) {
 ; SI-DAG: s_or_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039
 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]]
 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]]
-define void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) {
   %or = or i64 %a, 4261135838621753
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -82,7 +82,7 @@ define void @scalar_or_literal_i64(i64 addrspace(1)* %out, i64 %a) {
 
 ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]]
 ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]]
-define void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %or = or i64 %a, 4261135838621753
   store i64 %or, i64 addrspace(1)* %out
 
@@ -101,7 +101,7 @@ define void @scalar_or_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64
 ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]]
 ; SI-NOT: or_b32
 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
   %or = or i64 %a, 63
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -111,7 +111,7 @@ define void @scalar_or_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
 ; SI-NOT: or_b32
 ; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 63
 ; SI-NOT: or_b32
-define void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %or = or i64 %a, 63
   store i64 %or, i64 addrspace(1)* %out
   %foo = add i64 %b, 63
@@ -125,7 +125,7 @@ define void @scalar_or_inline_imm_multi_use_i64(i64 addrspace(1)* %out, i64 %a,
 ; SI-DAG: v_mov_b32_e32 v[[V_HI:[0-9]+]], -1{{$}}
 ; SI-DAG: v_mov_b32_e32 v[[V_LO:[0-9]+]], [[VAL]]
 ; SI: buffer_store_dwordx2 v{{\[}}[[V_LO]]:[[V_HI]]{{\]}}
-define void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
   %or = or i64 %a, -8
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -133,7 +133,7 @@ define void @scalar_or_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
 
 ; FUNC-LABEL: {{^}}vector_or_literal_i32:
 ; SI: v_or_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
-define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
   %loada = load i32, i32 addrspace(1)* %a, align 4
   %or = or i32 %loada, 65535
   store i32 %or, i32 addrspace(1)* %out, align 4
@@ -142,7 +142,7 @@ define void @vector_or_literal_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a,
 
 ; FUNC-LABEL: {{^}}vector_or_inline_immediate_i32:
 ; SI: v_or_b32_e32 v{{[0-9]+}}, 4, v{{[0-9]+}}
-define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
   %loada = load i32, i32 addrspace(1)* %a, align 4
   %or = or i32 %loada, 4
   store i32 %or, i32 addrspace(1)* %out, align 4
@@ -154,7 +154,7 @@ define void @vector_or_inline_immediate_i32(i32 addrspace(1)* %out, i32 addrspac
 ; EG-DAG: OR_INT * T{{[0-9]\.[XYZW]}}, KC0[3].X, KC0[3].Z
 
 ; SI: s_or_b64
-define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %or = or i64 %a, %b
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -163,7 +163,7 @@ define void @scalar_or_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 ; FUNC-LABEL: {{^}}vector_or_i64:
 ; SI: v_or_b32_e32 v{{[0-9]}}
 ; SI: v_or_b32_e32 v{{[0-9]}}
-define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 8
   %loadb = load i64, i64 addrspace(1)* %b, align 8
   %or = or i64 %loada, %loadb
@@ -174,7 +174,7 @@ define void @vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; FUNC-LABEL: {{^}}scalar_vector_or_i64:
 ; SI: v_or_b32_e32 v{{[0-9]}}
 ; SI: v_or_b32_e32 v{{[0-9]}}
-define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) {
+define amdgpu_kernel void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 %b) {
   %loada = load i64, i64 addrspace(1)* %a
   %or = or i64 %loada, %b
   store i64 %or, i64 addrspace(1)* %out
@@ -186,7 +186,7 @@ define void @scalar_vector_or_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a,
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]]
 ; SI: s_endpgm
-define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 8
   %or = or i64 %loada, 22470723082367
   store i64 %or, i64 addrspace(1)* %out
@@ -200,7 +200,7 @@ define void @vector_or_i64_loadimm(i64 addrspace(1)* %out, i64 addrspace(1)* %a,
 ; SI-NOT: v_or_b32_e32 {{v[0-9]+}}, 0
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO_RESULT]]:[[HI_VREG]]{{\]}}
 ; SI: s_endpgm
-define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 8
   %or = or i64 %loada, 8
   store i64 %or, i64 addrspace(1)* %out
@@ -213,7 +213,7 @@ define void @vector_or_i64_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64
 ; SI-DAG: v_mov_b32_e32 v[[RES_HI:[0-9]+]], -1{{$}}
 ; SI: buffer_store_dwordx2 v{{\[}}[[RES_LO]]:[[RES_HI]]{{\]}}
 ; SI: s_endpgm
-define void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 8
   %or = or i64 %loada, -8
   store i64 %or, i64 addrspace(1)* %out
@@ -226,7 +226,7 @@ define void @vector_or_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 0xffffff38, v[[LO_VREG]]
 ; SI: buffer_store_dwordx2
 ; SI: s_endpgm
-define void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 8
   %or = or i64 %loada, -200
   store i64 %or, i64 addrspace(1)* %out
@@ -239,7 +239,7 @@ define void @vector_or_i64_neg_literal(i64 addrspace(1)* %out, i64 addrspace(1)*
 ; SI: s_or_b32 s[[SRESULT:[0-9]+]], s[[SREG1]], s[[SREG0]]
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], s[[SRESULT]]
 ; SI: buffer_store_dword [[VRESULT]],
-define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
   %add = or i64 %b, %a
   %trunc = trunc i64 %add to i32
   store i32 %trunc, i32 addrspace(1)* %out, align 8
@@ -250,7 +250,7 @@ define void @trunc_i64_or_to_i32(i32 addrspace(1)* %out, i64 %a, i64 %b) {
 ; EG: OR_INT * {{\** *}}T{{[0-9]+\.[XYZW], PS, PV\.[XYZW]}}
 
 ; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
-define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
+define amdgpu_kernel void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
   %a = load float, float addrspace(1)* %in0
   %b = load float, float addrspace(1)* %in1
   %acmp = fcmp oge float %a, 0.000000e+00
@@ -263,7 +263,7 @@ define void @or_i1(i32 addrspace(1)* %out, float addrspace(1)* %in0, float addrs
 
 ; FUNC-LABEL: {{^}}s_or_i1:
 ; SI: s_or_b64 s[{{[0-9]+:[0-9]+}}], vcc, s[{{[0-9]+:[0-9]+}}]
-define void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @s_or_i1(i1 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
   %cmp0 = icmp eq i32 %a, %b
   %cmp1 = icmp eq i32 %c, %d
   %or = or i1 %cmp0, %cmp1
diff --git a/test/CodeGen/AMDGPU/over-max-lds-size.ll b/test/CodeGen/AMDGPU/over-max-lds-size.ll
index 32ad9aba04edc43dc3023a8282418e7d082a9ff7..57777e783c5618587232a017ec6071d2d3ac2bf0 100644
--- a/test/CodeGen/AMDGPU/over-max-lds-size.ll
+++ b/test/CodeGen/AMDGPU/over-max-lds-size.ll
@@ -6,7 +6,7 @@
 
 @huge = internal unnamed_addr addrspace(3) global [100000 x i32] undef, align 4
 
-define void @use_huge_lds() {
+define amdgpu_kernel void @use_huge_lds() {
 entry:
   %v0 = getelementptr inbounds [100000 x i32], [100000 x i32] addrspace(3)* @huge, i32 0, i32 0
   store i32 0, i32 addrspace(3)* %v0
diff --git a/test/CodeGen/AMDGPU/pack.v2f16.ll b/test/CodeGen/AMDGPU/pack.v2f16.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b86215627131d22dab67400946253bcbd5fdc04e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/pack.v2f16.ll
@@ -0,0 +1,229 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global,+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-DENORM %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-FLUSH %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+
+
+; GCN-LABEL: {{^}}s_pack_v2f16:
+; GFX9: s_load_dword [[VAL0:s[0-9]+]]
+; GFX9: s_load_dword [[VAL1:s[0-9]+]]
+; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]]
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @s_pack_v2f16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 {
+  %word.lo = load volatile i32, i32 addrspace(2)* %in0
+  %word.hi = load volatile i32, i32 addrspace(2)* %in1
+  %bits.lo = trunc i32 %word.lo to i16
+  %bits.hi = trunc i32 %word.hi to i16
+  %elt.lo = bitcast i16 %bits.lo to half
+  %elt.hi = bitcast i16 %bits.hi to half
+  %partial = insertelement <2 x half> undef, half %elt.lo, i32 0
+  %pair = insertelement <2 x half> %partial, half %elt.hi, i32 1
+  %packed = bitcast <2 x half> %pair to i32
+  ; The asm use below keeps the packed dword live; its constraint is "s".
+  call void asm sideeffect "; use $0", "s"(i32 %packed) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_pack_v2f16_imm_lo:
+; GFX9: s_load_dword [[VAL1:s[0-9]+]]
+; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1234, [[VAL1]]
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @s_pack_v2f16_imm_lo(i32 addrspace(2)* %in1) #0 {
+  %word.hi = load i32, i32 addrspace(2)* %in1
+  %bits.hi = trunc i32 %word.hi to i16
+  %elt.hi = bitcast i16 %bits.hi to half
+  %partial = insertelement <2 x half> undef, half 0xH1234, i32 0
+  %pair = insertelement <2 x half> %partial, half %elt.hi, i32 1
+  %packed = bitcast <2 x half> %pair to i32
+  ; Element 0 is the constant 0xH1234; only element 1 comes from memory.
+  call void asm sideeffect "; use $0", "s"(i32 %packed) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_pack_v2f16_imm_hi:
+; GFX9: s_load_dword [[VAL0:s[0-9]+]]
+; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1234
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @s_pack_v2f16_imm_hi(i32 addrspace(2)* %in0) #0 {
+  %word.lo = load i32, i32 addrspace(2)* %in0
+  %bits.lo = trunc i32 %word.lo to i16
+  %elt.lo = bitcast i16 %bits.lo to half
+  %partial = insertelement <2 x half> undef, half %elt.lo, i32 0
+  %pair = insertelement <2 x half> %partial, half 0xH1234, i32 1
+  %packed = bitcast <2 x half> %pair to i32
+  ; Element 1 is the constant 0xH1234; only element 0 comes from memory.
+  call void asm sideeffect "; use $0", "s"(i32 %packed) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2f16:
+; GFX9: flat_load_dword [[VAL0:v[0-9]+]]
+; GFX9: flat_load_dword [[VAL1:v[0-9]+]]
+; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL0]], [[VAL1]]
+
+; GFX9-FLUSH: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]]
+; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]]
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2f16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+  %wi = call i32 @llvm.amdgcn.workitem.id.x() ; workitem id indexes both inputs
+  %wi.i64 = sext i32 %wi to i64
+  %slot.lo = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %wi.i64
+  %slot.hi = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %wi.i64
+  %word.lo = load volatile i32, i32 addrspace(1)* %slot.lo
+  %word.hi = load volatile i32, i32 addrspace(1)* %slot.hi
+  %bits.lo = trunc i32 %word.lo to i16
+  %bits.hi = trunc i32 %word.hi to i16
+  %elt.lo = bitcast i16 %bits.lo to half
+  %elt.hi = bitcast i16 %bits.hi to half
+  %partial = insertelement <2 x half> undef, half %elt.lo, i32 0
+  %pair = insertelement <2 x half> %partial, half %elt.hi, i32 1
+  %packed = bitcast <2 x half> %pair to i32
+  call void asm sideeffect "; use $0", "v"(i32 %packed) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2f16_user:
+; GFX9: flat_load_dword [[VAL0:v[0-9]+]]
+; GFX9: flat_load_dword [[VAL1:v[0-9]+]]
+; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL0]], [[VAL1]]
+
+; GFX9-FLUSH: v_and_b32_e32 [[ELT0:v[0-9]+]], 0xffff, [[VAL0]]
+; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[ELT0]]
+
+; GFX9: v_add_i32_e32 v{{[0-9]+}}, vcc, 9, [[PACKED]]
+define amdgpu_kernel void @v_pack_v2f16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+  %wi = call i32 @llvm.amdgcn.workitem.id.x()
+  %wi.i64 = sext i32 %wi to i64
+  %slot.lo = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %wi.i64
+  %slot.hi = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %wi.i64
+  %word.lo = load volatile i32, i32 addrspace(1)* %slot.lo
+  %word.hi = load volatile i32, i32 addrspace(1)* %slot.hi
+  %bits.lo = trunc i32 %word.lo to i16
+  %bits.hi = trunc i32 %word.hi to i16
+  %elt.lo = bitcast i16 %bits.lo to half
+  %elt.hi = bitcast i16 %bits.hi to half
+  %partial = insertelement <2 x half> undef, half %elt.lo, i32 0
+  %pair = insertelement <2 x half> %partial, half %elt.hi, i32 1
+  %packed = bitcast <2 x half> %pair to i32
+  %sum = add i32 %packed, 9 ; ordinary arithmetic user of the packed value instead of inline asm
+  store volatile i32 %sum, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2f16_imm_lo:
+; GFX9-DAG: flat_load_dword [[VAL1:v[0-9]+]]
+; GFX9-DENORM-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234{{$}}
+; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[K]], [[VAL1]]
+
+; GFX9-FLUSH-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x1234{{$}}
+; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]]
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2f16_imm_lo(i32 addrspace(1)* %in1) #0 {
+  %wi = call i32 @llvm.amdgcn.workitem.id.x()
+  %wi.i64 = sext i32 %wi to i64
+  %slot.hi = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %wi.i64
+  %word.hi = load volatile i32, i32 addrspace(1)* %slot.hi
+  %bits.hi = trunc i32 %word.hi to i16
+  %elt.hi = bitcast i16 %bits.hi to half
+  %partial = insertelement <2 x half> undef, half 0xH1234, i32 0 ; constant low element
+  %pair = insertelement <2 x half> %partial, half %elt.hi, i32 1
+  %packed = bitcast <2 x half> %pair to i32
+  call void asm sideeffect "; use $0", "v"(i32 %packed) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2f16_inline_imm_lo:
+; GFX9-DAG: flat_load_dword [[VAL1:v[0-9]+]]
+; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], 4.0, [[VAL1]]
+
+; GFX9-FLUSH-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x4400{{$}}
+; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]]
+
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2f16_inline_imm_lo(i32 addrspace(1)* %in1) #0 {
+  %wi = call i32 @llvm.amdgcn.workitem.id.x()
+  %wi.i64 = sext i32 %wi to i64
+  %slot.hi = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %wi.i64
+  %word.hi = load volatile i32, i32 addrspace(1)* %slot.hi
+  %bits.hi = trunc i32 %word.hi to i16
+  %elt.hi = bitcast i16 %bits.hi to half
+  %partial = insertelement <2 x half> undef, half 4.0, i32 0 ; low element is the constant 4.0
+  %pair = insertelement <2 x half> %partial, half %elt.hi, i32 1
+  %packed = bitcast <2 x half> %pair to i32
+  call void asm sideeffect "; use $0", "v"(i32 %packed) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2f16_imm_hi:
+; GFX9-DAG: flat_load_dword [[VAL0:v[0-9]+]]
+; GFX9-DENORM-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234
+; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL0]], [[K]]
+
+; GFX9-FLUSH-DAG: s_movk_i32 [[K:s[0-9]+]], 0x1234
+; GFX9-FLUSH: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL0]]
+; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[MASKED]]
+
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2f16_imm_hi(i32 addrspace(1)* %in0) #0 {
+  %wi = call i32 @llvm.amdgcn.workitem.id.x()
+  %wi.i64 = sext i32 %wi to i64
+  %slot.lo = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %wi.i64
+  %word.lo = load volatile i32, i32 addrspace(1)* %slot.lo
+  %bits.lo = trunc i32 %word.lo to i16
+  %elt.lo = bitcast i16 %bits.lo to half
+  %partial = insertelement <2 x half> undef, half %elt.lo, i32 0
+  %pair = insertelement <2 x half> %partial, half 0xH1234, i32 1 ; constant high element
+  %packed = bitcast <2 x half> %pair to i32
+  call void asm sideeffect "; use $0", "v"(i32 %packed) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2f16_inline_f16imm_hi:
+; GFX9-DAG: flat_load_dword [[VAL:v[0-9]+]]
+; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL]], 1.0
+
+; GFX9-FLUSH-DAG: s_movk_i32 [[K:s[0-9]+]], 0x3c00
+; GFX9-FLUSH: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL]]
+; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[MASKED]]
+
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2f16_inline_f16imm_hi(i32 addrspace(1)* %in0) #0 {
+  %wi = call i32 @llvm.amdgcn.workitem.id.x()
+  %wi.i64 = sext i32 %wi to i64
+  %slot.lo = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %wi.i64
+  %word.lo = load volatile i32, i32 addrspace(1)* %slot.lo
+  %bits.lo = trunc i32 %word.lo to i16
+  %elt.lo = bitcast i16 %bits.lo to half
+  %partial = insertelement <2 x half> undef, half %elt.lo, i32 0
+  %pair = insertelement <2 x half> %partial, half 1.0, i32 1 ; high element is the constant 1.0
+  %packed = bitcast <2 x half> %pair to i32
+  call void asm sideeffect "; use $0", "v"(i32 %packed) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2f16_inline_imm_hi:
+; GFX9: flat_load_dword [[VAL:v[0-9]+]]
+; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL]], 64
+
+; GFX9-FLUSH: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL]]
+; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], 64, 16, [[MASKED]]
+
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2f16_inline_imm_hi(i32 addrspace(1)* %in0) #0 {
+  %wi = call i32 @llvm.amdgcn.workitem.id.x()
+  %wi.i64 = sext i32 %wi to i64
+  %slot.lo = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %wi.i64
+  %word.lo = load volatile i32, i32 addrspace(1)* %slot.lo
+  %bits.lo = trunc i32 %word.lo to i16
+  %elt.lo = bitcast i16 %bits.lo to half
+  %partial = insertelement <2 x half> undef, half %elt.lo, i32 0
+  %pair = insertelement <2 x half> %partial, half 0xH0040, i32 1 ; bit pattern 0x0040 (integer 64)
+  %packed = bitcast <2 x half> %pair to i32
+  call void asm sideeffect "; use $0", "v"(i32 %packed) #0
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/pack.v2i16.ll b/test/CodeGen/AMDGPU/pack.v2i16.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9ffd16754a1c43f719465957a213a5e559aad812
--- /dev/null
+++ b/test/CodeGen/AMDGPU/pack.v2i16.ll
@@ -0,0 +1,188 @@
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global,+fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-DENORM %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx901 -mattr=-flat-for-global,-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX9-FLUSH %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=kaveri -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI %s
+
+
+; GCN-LABEL: {{^}}s_pack_v2i16:
+; GFX9: s_load_dword [[VAL0:s[0-9]+]]
+; GFX9: s_load_dword [[VAL1:s[0-9]+]]
+; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], [[VAL1]]
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @s_pack_v2i16(i32 addrspace(2)* %in0, i32 addrspace(2)* %in1) #0 {
+  %word.lo = load volatile i32, i32 addrspace(2)* %in0
+  %word.hi = load volatile i32, i32 addrspace(2)* %in1
+  %elt.lo = trunc i32 %word.lo to i16
+  %elt.hi = trunc i32 %word.hi to i16
+  %partial = insertelement <2 x i16> undef, i16 %elt.lo, i32 0
+  %pair = insertelement <2 x i16> %partial, i16 %elt.hi, i32 1
+  %packed = bitcast <2 x i16> %pair to i32
+  ; The asm use below keeps the packed dword live; its constraint is "s".
+  call void asm sideeffect "; use $0", "s"(i32 %packed) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_pack_v2i16_imm_lo:
+; GFX9: s_load_dword [[VAL1:s[0-9]+]]
+; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], 0x1c8, [[VAL1]]
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @s_pack_v2i16_imm_lo(i32 addrspace(2)* %in1) #0 {
+  %word.hi = load i32, i32 addrspace(2)* %in1
+  %elt.hi = trunc i32 %word.hi to i16
+  %partial = insertelement <2 x i16> undef, i16 456, i32 0
+  %pair = insertelement <2 x i16> %partial, i16 %elt.hi, i32 1
+  %packed = bitcast <2 x i16> %pair to i32
+  ; Element 0 is the constant 456 (0x1c8); only element 1 comes from memory.
+  call void asm sideeffect "; use $0", "s"(i32 %packed) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_pack_v2i16_imm_hi:
+; GFX9: s_load_dword [[VAL0:s[0-9]+]]
+; GFX9: s_pack_ll_b32_b16 [[PACKED:s[0-9]+]], [[VAL0]], 0x1c8
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @s_pack_v2i16_imm_hi(i32 addrspace(2)* %in0) #0 {
+  %word.lo = load i32, i32 addrspace(2)* %in0
+  %elt.lo = trunc i32 %word.lo to i16
+  %partial = insertelement <2 x i16> undef, i16 %elt.lo, i32 0
+  %pair = insertelement <2 x i16> %partial, i16 456, i32 1
+  %packed = bitcast <2 x i16> %pair to i32
+  ; Element 1 is the constant 456 (0x1c8); only element 0 comes from memory.
+  call void asm sideeffect "; use $0", "s"(i32 %packed) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2i16:
+; GFX9: flat_load_dword [[VAL0:v[0-9]+]]
+; GFX9: flat_load_dword [[VAL1:v[0-9]+]]
+; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL0]], [[VAL1]]
+
+; GFX9-FLUSH: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL0]]
+; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[MASKED]]
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2i16(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+  %wi = call i32 @llvm.amdgcn.workitem.id.x() ; workitem id indexes both inputs
+  %wi.i64 = sext i32 %wi to i64
+  %slot.lo = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %wi.i64
+  %slot.hi = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %wi.i64
+  %word.lo = load volatile i32, i32 addrspace(1)* %slot.lo
+  %word.hi = load volatile i32, i32 addrspace(1)* %slot.hi
+  %elt.lo = trunc i32 %word.lo to i16
+  %elt.hi = trunc i32 %word.hi to i16
+  %partial = insertelement <2 x i16> undef, i16 %elt.lo, i32 0
+  %pair = insertelement <2 x i16> %partial, i16 %elt.hi, i32 1
+  %packed = bitcast <2 x i16> %pair to i32
+  call void asm sideeffect "; use $0", "v"(i32 %packed) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2i16_user:
+; GFX9: flat_load_dword [[VAL0:v[0-9]+]]
+; GFX9: flat_load_dword [[VAL1:v[0-9]+]]
+; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL0]], [[VAL1]]
+
+; GFX9-FLUSH: v_and_b32_e32 [[MASKED:v[0-9]+]], 0xffff, [[VAL0]]
+; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[MASKED]]
+
+; GFX9: v_add_i32_e32 v{{[0-9]+}}, vcc, 9, [[PACKED]]
+define amdgpu_kernel void @v_pack_v2i16_user(i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) #0 {
+  %wi = call i32 @llvm.amdgcn.workitem.id.x()
+  %wi.i64 = sext i32 %wi to i64
+  %slot.lo = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %wi.i64
+  %slot.hi = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %wi.i64
+  %word.lo = load volatile i32, i32 addrspace(1)* %slot.lo
+  %word.hi = load volatile i32, i32 addrspace(1)* %slot.hi
+  %elt.lo = trunc i32 %word.lo to i16
+  %elt.hi = trunc i32 %word.hi to i16
+  %partial = insertelement <2 x i16> undef, i16 %elt.lo, i32 0
+  %pair = insertelement <2 x i16> %partial, i16 %elt.hi, i32 1
+  %packed = bitcast <2 x i16> %pair to i32
+  %sum = add i32 %packed, 9 ; ordinary arithmetic user of the packed value instead of inline asm
+  store volatile i32 %sum, i32 addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2i16_imm_lo:
+; GFX9-DAG: flat_load_dword [[VAL1:v[0-9]+]]
+; GFX9-DENORM-DAG: s_movk_i32 [[K:s[0-9]+]], 0x7b{{$}}
+; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[K]], [[VAL1]]
+
+; GFX9-FLUSH-DAG: v_mov_b32_e32 [[K:v[0-9]+]], 0x7b{{$}}
+; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, [[K]]
+
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2i16_imm_lo(i32 addrspace(1)* %in1) #0 {
+  %wi = call i32 @llvm.amdgcn.workitem.id.x()
+  %wi.i64 = sext i32 %wi to i64
+  %slot.hi = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %wi.i64
+  %word.hi = load volatile i32, i32 addrspace(1)* %slot.hi
+  %elt.hi = trunc i32 %word.hi to i16
+  %partial = insertelement <2 x i16> undef, i16 123, i32 0 ; constant low element (0x7b)
+  %pair = insertelement <2 x i16> %partial, i16 %elt.hi, i32 1
+  %packed = bitcast <2 x i16> %pair to i32
+  call void asm sideeffect "; use $0", "v"(i32 %packed) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2i16_inline_imm_lo:
+; GFX9: flat_load_dword [[VAL1:v[0-9]+]]
+; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], 64, [[VAL1]]
+
+; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[VAL1]], 16, 64
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2i16_inline_imm_lo(i32 addrspace(1)* %in1) #0 {
+  %wi = call i32 @llvm.amdgcn.workitem.id.x()
+  %wi.i64 = sext i32 %wi to i64
+  %slot.hi = getelementptr inbounds i32, i32 addrspace(1)* %in1, i64 %wi.i64
+  %word.hi = load volatile i32, i32 addrspace(1)* %slot.hi
+  %elt.hi = trunc i32 %word.hi to i16
+  %partial = insertelement <2 x i16> undef, i16 64, i32 0 ; low element is the constant 64
+  %pair = insertelement <2 x i16> %partial, i16 %elt.hi, i32 1
+  %packed = bitcast <2 x i16> %pair to i32
+  call void asm sideeffect "; use $0", "v"(i32 %packed) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2i16_imm_hi:
+; GFX9-DAG: flat_load_dword [[VAL0:v[0-9]+]]
+; GFX9-DAG: s_movk_i32 [[K:s[0-9]+]], 0x7b{{$}}
+; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL0]], [[K]]
+; K is bound by the shared s_movk_i32 check so neither mode relies on a stale capture.
+; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], [[K]], 16, [[VAL0]]
+
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2i16_imm_hi(i32 addrspace(1)* %in0) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
+  %val0 = load volatile i32, i32 addrspace(1)* %in0.gep
+  %lo = trunc i32 %val0 to i16
+  %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
+  %vec.1 = insertelement <2 x i16> %vec.0, i16 123, i32 1 ; constant high element (0x7b)
+  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
+  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_pack_v2i16_inline_imm_hi:
+; GFX9: flat_load_dword [[VAL:v[0-9]+]]
+; GFX9-DENORM: v_pack_b32_f16 [[PACKED:v[0-9]+]], [[VAL]], 7
+; GFX9-FLUSH: v_lshl_or_b32 [[PACKED:v[0-9]+]], 7, 16, [[VAL]]
+; GFX9: ; use [[PACKED]]
+define amdgpu_kernel void @v_pack_v2i16_inline_imm_hi(i32 addrspace(1)* %in0) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in0.gep = getelementptr inbounds i32, i32 addrspace(1)* %in0, i64 %tid.ext
+  %val0 = load volatile i32, i32 addrspace(1)* %in0.gep ; the checks capture this load as VAL
+  %lo = trunc i32 %val0 to i16
+  %vec.0 = insertelement <2 x i16> undef, i16 %lo, i32 0
+  %vec.1 = insertelement <2 x i16> %vec.0, i16 7, i32 1 ; both check variants expect 7 as a literal operand
+  %vec.i32 = bitcast <2 x i16> %vec.1 to i32
+  call void asm sideeffect "; use $0", "v"(i32 %vec.i32) #0
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/packetizer.ll b/test/CodeGen/AMDGPU/packetizer.ll
index 49a7c0df748f58cd52cc0d5dd3d35b6fd7ce95ed..1764d64c367f81c8595c12fdad499248deb5310b 100644
--- a/test/CodeGen/AMDGPU/packetizer.ll
+++ b/test/CodeGen/AMDGPU/packetizer.ll
@@ -7,7 +7,7 @@
 ; CHECK: BIT_ALIGN_INT T{{[0-9]}}.Z
 ; CHECK: BIT_ALIGN_INT * T{{[0-9]}}.W
 
-define void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %x_arg, i32 %y_arg, i32 %z_arg, i32 %w_arg, i32 %e) {
 entry:
   %shl = sub i32 32, %e
   %x = add i32 %x_arg, 1
diff --git a/test/CodeGen/AMDGPU/parallelandifcollapse.ll b/test/CodeGen/AMDGPU/parallelandifcollapse.ll
index ea943a533c81fd9c2b542eba9bef912a35427dbd..a90f200f79e3bdc2145b81b54351e5e2aea57af8 100644
--- a/test/CodeGen/AMDGPU/parallelandifcollapse.ll
+++ b/test/CodeGen/AMDGPU/parallelandifcollapse.ll
@@ -11,7 +11,7 @@
 ; to do its transfomation, however now that we are using local memory for
 ; allocas, the transformation isn't happening.
 
-define void @_Z9chk1D_512v() #0 {
+define amdgpu_kernel void @_Z9chk1D_512v() #0 {
 entry:
   %a0 = alloca i32, align 4
   %b0 = alloca i32, align 4
diff --git a/test/CodeGen/AMDGPU/parallelorifcollapse.ll b/test/CodeGen/AMDGPU/parallelorifcollapse.ll
index 1da1e91b8ab8aed671f0cf0299bceef6673cbaa4..91116b0f65ea423b8eb00130d2bdd0c3e9476e87 100644
--- a/test/CodeGen/AMDGPU/parallelorifcollapse.ll
+++ b/test/CodeGen/AMDGPU/parallelorifcollapse.ll
@@ -12,7 +12,7 @@
 ; CHECK: OR_INT
 ; CHECK-NEXT: OR_INT
 ; CHECK-NEXT: OR_INT
-define void @_Z9chk1D_512v() #0 {
+define amdgpu_kernel void @_Z9chk1D_512v() #0 {
 entry:
   %a0 = alloca i32, align 4
   %b0 = alloca i32, align 4
diff --git a/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
new file mode 100644
index 0000000000000000000000000000000000000000..77d793201adc7679b8a09505d4676049e3d50c32
--- /dev/null
+++ b/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll
@@ -0,0 +1,638 @@
+; RUN: llc -O0 -march=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VGPR -check-prefix=GCN %s
+
+; FIXME: we should disable sdwa peephole because dead-code elimination, that
+; runs after peephole, ruins this test (different register numbers)
+
+; Spill all SGPRs so multiple VGPRs are required for spilling all of them.
+
+; Ideally we only need 2 VGPRs for all spilling. The VGPRs are
+; allocated per-frame index, so it's possible to end up with more.
+
+; GCN-LABEL: {{^}}spill_sgprs_to_multiple_vgprs:
+
+; GCN: def s[8:15]
+; GCN: def s[16:23]
+; GCN: def s[24:31]
+; GCN: def s[32:39]
+; GCN: def s[40:47]
+; GCN: def s[48:55]
+; GCN: def s[56:63]
+; GCN: def s[64:71]
+; GCN: def s[72:79]
+; GCN: def s[80:87]
+; GCN: def s[88:95]
+
+; GCN: v_writelane_b32 v0, s8, 0
+; GCN-NEXT: v_writelane_b32 v0, s9, 1
+; GCN-NEXT: v_writelane_b32 v0, s10, 2
+; GCN-NEXT: v_writelane_b32 v0, s11, 3
+; GCN-NEXT: v_writelane_b32 v0, s12, 4
+; GCN-NEXT: v_writelane_b32 v0, s13, 5
+; GCN-NEXT: v_writelane_b32 v0, s14, 6
+; GCN-NEXT: v_writelane_b32 v0, s15, 7
+
+; GCN: def s{{\[}}[[TMP_LO:[0-9]+]]:[[TMP_HI:[0-9]+]]{{\]}}
+; GCN: v_writelane_b32 v0, s[[TMP_LO]], 8
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 9
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 10
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 11
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 12
+; GCN-NEXT: v_writelane_b32 v0, s13, 13
+; GCN-NEXT: v_writelane_b32 v0, s14, 14
+; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 15
+
+; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
+; GCN: v_writelane_b32 v0, s[[TMP_LO]], 16
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 17
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 18
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 19
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 20
+; GCN-NEXT: v_writelane_b32 v0, s13, 21
+; GCN-NEXT: v_writelane_b32 v0, s14, 22
+; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 23
+
+; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
+; GCN: v_writelane_b32 v0, s[[TMP_LO]], 24
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 25
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 26
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 27
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 28
+; GCN-NEXT: v_writelane_b32 v0, s13, 29
+; GCN-NEXT: v_writelane_b32 v0, s14, 30
+; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 31
+
+; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
+; GCN: v_writelane_b32 v0, s[[TMP_LO]], 32
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 33
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 34
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 35
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 36
+; GCN-NEXT: v_writelane_b32 v0, s13, 37
+; GCN-NEXT: v_writelane_b32 v0, s14, 38
+; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 39
+
+; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
+; GCN: v_writelane_b32 v0, s[[TMP_LO]], 40
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 41
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 42
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 43
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 44
+; GCN-NEXT: v_writelane_b32 v0, s13, 45
+; GCN-NEXT: v_writelane_b32 v0, s14, 46
+; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 47
+
+; GCN: def s{{\[}}[[TMP_LO]]:[[TMP_HI]]{{\]}}
+; GCN: v_writelane_b32 v0, s[[TMP_LO]], 48
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 49
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 50
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 51
+; GCN-NEXT: v_writelane_b32 v0, s{{[0-9]+}}, 52
+; GCN-NEXT: v_writelane_b32 v0, s13, 53
+; GCN-NEXT: v_writelane_b32 v0, s14, 54
+; GCN-NEXT: v_writelane_b32 v0, s[[TMP_HI]], 55
+
+; GCN-NEXT: v_writelane_b32 v0, s88, 56
+; GCN-NEXT: v_writelane_b32 v0, s89, 57
+; GCN-NEXT: v_writelane_b32 v0, s90, 58
+; GCN-NEXT: v_writelane_b32 v0, s91, 59
+; GCN-NEXT: v_writelane_b32 v0, s92, 60
+; GCN-NEXT: v_writelane_b32 v0, s93, 61
+; GCN-NEXT: v_writelane_b32 v0, s94, 62
+; GCN-NEXT: v_writelane_b32 v0, s95, 63
+; GCN-NEXT: v_writelane_b32 v1, s16, 0
+; GCN-NEXT: v_writelane_b32 v1, s17, 1
+; GCN-NEXT: v_writelane_b32 v1, s18, 2
+; GCN-NEXT: v_writelane_b32 v1, s19, 3
+; GCN-NEXT: v_writelane_b32 v1, s20, 4
+; GCN-NEXT: v_writelane_b32 v1, s21, 5
+; GCN-NEXT: v_writelane_b32 v1, s22, 6
+; GCN-NEXT: v_writelane_b32 v1, s23, 7
+; GCN-NEXT: v_writelane_b32 v1, s24, 8
+; GCN-NEXT: v_writelane_b32 v1, s25, 9
+; GCN-NEXT: v_writelane_b32 v1, s26, 10
+; GCN-NEXT: v_writelane_b32 v1, s27, 11
+; GCN-NEXT: v_writelane_b32 v1, s28, 12
+; GCN-NEXT: v_writelane_b32 v1, s29, 13
+; GCN-NEXT: v_writelane_b32 v1, s30, 14
+; GCN-NEXT: v_writelane_b32 v1, s31, 15
+; GCN-NEXT: v_writelane_b32 v1, s32, 16
+; GCN-NEXT: v_writelane_b32 v1, s33, 17
+; GCN-NEXT: v_writelane_b32 v1, s34, 18
+; GCN-NEXT: v_writelane_b32 v1, s35, 19
+; GCN-NEXT: v_writelane_b32 v1, s36, 20
+; GCN-NEXT: v_writelane_b32 v1, s37, 21
+; GCN-NEXT: v_writelane_b32 v1, s38, 22
+; GCN-NEXT: v_writelane_b32 v1, s39, 23
+; GCN-NEXT: v_writelane_b32 v1, s40, 24
+; GCN-NEXT: v_writelane_b32 v1, s41, 25
+; GCN-NEXT: v_writelane_b32 v1, s42, 26
+; GCN-NEXT: v_writelane_b32 v1, s43, 27
+; GCN-NEXT: v_writelane_b32 v1, s44, 28
+; GCN-NEXT: v_writelane_b32 v1, s45, 29
+; GCN-NEXT: v_writelane_b32 v1, s46, 30
+; GCN-NEXT: v_writelane_b32 v1, s47, 31
+; GCN-NEXT: v_writelane_b32 v1, s48, 32
+; GCN-NEXT: v_writelane_b32 v1, s49, 33
+; GCN-NEXT: v_writelane_b32 v1, s50, 34
+; GCN-NEXT: v_writelane_b32 v1, s51, 35
+; GCN-NEXT: v_writelane_b32 v1, s52, 36
+; GCN-NEXT: v_writelane_b32 v1, s53, 37
+; GCN-NEXT: v_writelane_b32 v1, s54, 38
+; GCN-NEXT: v_writelane_b32 v1, s55, 39
+; GCN-NEXT: v_writelane_b32 v1, s56, 40
+; GCN-NEXT: v_writelane_b32 v1, s57, 41
+; GCN-NEXT: v_writelane_b32 v1, s58, 42
+; GCN-NEXT: v_writelane_b32 v1, s59, 43
+; GCN-NEXT: v_writelane_b32 v1, s60, 44
+; GCN-NEXT: v_writelane_b32 v1, s61, 45
+; GCN-NEXT: v_writelane_b32 v1, s62, 46
+; GCN-NEXT: v_writelane_b32 v1, s63, 47
+; GCN-NEXT: v_writelane_b32 v1, s64, 48
+; GCN-NEXT: v_writelane_b32 v1, s65, 49
+; GCN-NEXT: v_writelane_b32 v1, s66, 50
+; GCN-NEXT: v_writelane_b32 v1, s67, 51
+; GCN-NEXT: v_writelane_b32 v1, s68, 52
+; GCN-NEXT: v_writelane_b32 v1, s69, 53
+; GCN-NEXT: v_writelane_b32 v1, s70, 54
+; GCN-NEXT: v_writelane_b32 v1, s71, 55
+; GCN-NEXT: v_writelane_b32 v1, s72, 56
+; GCN-NEXT: v_writelane_b32 v1, s73, 57
+; GCN-NEXT: v_writelane_b32 v1, s74, 58
+; GCN-NEXT: v_writelane_b32 v1, s75, 59
+; GCN-NEXT: v_writelane_b32 v1, s76, 60
+; GCN-NEXT: v_writelane_b32 v1, s77, 61
+; GCN-NEXT: v_writelane_b32 v1, s78, 62
+; GCN-NEXT: v_writelane_b32 v1, s79, 63
+; GCN-NEXT: v_writelane_b32 v2, s80, 0
+; GCN-NEXT: v_writelane_b32 v2, s81, 1
+; GCN-NEXT: v_writelane_b32 v2, s82, 2
+; GCN-NEXT: v_writelane_b32 v2, s83, 3
+; GCN-NEXT: v_writelane_b32 v2, s84, 4
+; GCN-NEXT: v_writelane_b32 v2, s85, 5
+; GCN-NEXT: v_writelane_b32 v2, s86, 6
+; GCN-NEXT: v_writelane_b32 v2, s87, 7
+; GCN: s_cbranch_scc1
+
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v0, 0
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 1
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 2
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 3
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 4
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 5
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 6
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v0, 7
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 0
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 1
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 2
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 3
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 4
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 5
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 6
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 7
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 8
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 9
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 10
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 11
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 12
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 13
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 14
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 15
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 16
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 17
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 18
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 19
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 20
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 21
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 22
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 23
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 24
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 25
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 26
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 27
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 28
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 29
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 30
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 31
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 32
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 33
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 34
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 35
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 36
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 37
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 38
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 39
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 40
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 41
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 42
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 43
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 44
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 45
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 46
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 47
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 48
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 49
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 50
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 51
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 52
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 53
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 54
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 55
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO]], v1, 56
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 57
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 58
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 59
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 60
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 61
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v1, 62
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI]], v1, 63
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s{{[0-9]+}}, v2, 0
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 1
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 2
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 3
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 4
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 5
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 6
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v2, 7
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 56
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 57
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 58
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 59
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 60
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 61
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 62
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 63
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 8
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 9
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 10
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 11
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 12
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 13
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 14
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 15
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 16
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 17
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 18
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 19
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 20
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 21
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 22
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 23
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 24
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 25
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 26
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 27
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 28
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 29
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 30
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 31
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 32
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 33
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 34
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 35
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 36
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 37
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 38
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 39
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 40
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 41
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 42
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 43
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 44
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 45
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 46
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 47
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s{{[0-9]+}}, v0, 48
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 49
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 50
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 51
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 52
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 53
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 54
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v0, 55
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(i32 addrspace(1)* %out, i32 %in) #0 { ; keeps 17 eight-element scalar tuples alive across a branch, 136 values in total
+  %wide.sgpr0 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 ; "=s" output operand, the asm text echoes the register range chosen for $0
+  %wide.sgpr1 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr2 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr3 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr4 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr5 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr6 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr7 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr8 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr9 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr10 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr11 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr12 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr13 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr14 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr15 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr16 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 ; 17th tuple, 17 x 8 exceeds the 64 lanes (0-63) of a single vector register
+  %cmp = icmp eq i32 %in, 0
+  br i1 %cmp, label %bb0, label %ret ; the tuples are consumed only on the %in == 0 path
+
+bb0:
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr0) #0 ; uses follow the definition order, the expectations above read lanes back from v0, v1 and v2
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr1) #0
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr2) #0
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr3) #0
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr4) #0
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr5) #0
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr6) #0
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr7) #0
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr8) #0
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr9) #0
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr10) #0
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr11) #0
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr12) #0
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr13) #0
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr14) #0
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr15) #0
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr16) #0
+  br label %ret
+
+ret:
+  ret void
+}
+
+; Some of the lanes of an SGPR spill are in one VGPR and some forced
+; into the next available VGPR.
+
+; GCN-LABEL: {{^}}split_sgpr_spill_2_vgprs:
+; GCN: def s[24:39]
+
+; GCN: v_writelane_b32 v0, s24, 50
+; GCN-NEXT: v_writelane_b32 v0, s25, 51
+; GCN-NEXT: v_writelane_b32 v0, s26, 52
+; GCN-NEXT: v_writelane_b32 v0, s27, 53
+; GCN-NEXT: v_writelane_b32 v0, s28, 54
+; GCN-NEXT: v_writelane_b32 v0, s29, 55
+; GCN-NEXT: v_writelane_b32 v0, s30, 56
+; GCN-NEXT: v_writelane_b32 v0, s31, 57
+; GCN-NEXT: v_writelane_b32 v0, s32, 58
+; GCN-NEXT: v_writelane_b32 v0, s33, 59
+; GCN-NEXT: v_writelane_b32 v0, s34, 60
+; GCN-NEXT: v_writelane_b32 v0, s35, 61
+; GCN-NEXT: v_writelane_b32 v0, s36, 62
+; GCN-NEXT: v_writelane_b32 v0, s37, 63
+; GCN-NEXT: v_writelane_b32 v1, s38, 0
+; GCN-NEXT: v_writelane_b32 v1, s39, 1
+
+; GCN: v_readlane_b32 s4, v0, 50
+; GCN-NEXT: v_readlane_b32 s5, v0, 51
+; GCN-NEXT: v_readlane_b32 s6, v0, 52
+; GCN-NEXT: v_readlane_b32 s7, v0, 53
+; GCN-NEXT: v_readlane_b32 s8, v0, 54
+; GCN-NEXT: v_readlane_b32 s9, v0, 55
+; GCN-NEXT: v_readlane_b32 s10, v0, 56
+; GCN-NEXT: v_readlane_b32 s11, v0, 57
+; GCN-NEXT: v_readlane_b32 s12, v0, 58
+; GCN-NEXT: v_readlane_b32 s13, v0, 59
+; GCN-NEXT: v_readlane_b32 s14, v0, 60
+; GCN-NEXT: v_readlane_b32 s15, v0, 61
+; GCN-NEXT: v_readlane_b32 s16, v0, 62
+; GCN-NEXT: v_readlane_b32 s17, v0, 63
+; GCN-NEXT: v_readlane_b32 s18, v1, 0
+; GCN-NEXT: v_readlane_b32 s19, v1, 1
+define amdgpu_kernel void @split_sgpr_spill_2_vgprs(i32 addrspace(1)* %out, i32 %in) #1 { ; the expectations above want one 16-wide tuple split as lanes 50-63 of v0 plus lanes 0-1 of v1
+  %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr5 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 ; defined fourth but used last in %bb0
+  %wide.sgpr3 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 ; 8-element tuple
+  %wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0 ; 2-element tuple
+
+  %cmp = icmp eq i32 %in, 0
+  br i1 %cmp, label %bb0, label %ret ; every tuple defined above is consumed only on the %in == 0 path
+
+bb0:
+  call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr0) #0
+  call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0
+  call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0
+  call void asm sideeffect "; use $0", "s"(<8 x i32> %wide.sgpr3) #0
+  call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr4) #0
+  call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr5) #0 ; use order differs from definition order for this tuple
+  br label %ret
+
+ret:
+  ret void
+}
+
+; The first 64 SGPR spills can go to a VGPR, but there isn't a second
+; so some spills must be to memory. The last 16 element spill runs out of lanes at the 15th element.
+
+; GCN-LABEL: {{^}}no_vgprs_last_sgpr_spill:
+
+; GCN: v_writelane_b32 v23, s{{[0-9]+}}, 0
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 1
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 2
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 3
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 4
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 5
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 6
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 7
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 8
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 9
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 10
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 11
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 12
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 13
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 14
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 15
+
+; GCN: v_writelane_b32 v23, s{{[0-9]+}}, 16
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 17
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 18
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 19
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 20
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 21
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 22
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 23
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 24
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 25
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 26
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 27
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 28
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 29
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 30
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 31
+
+; GCN: def s[0:1]
+; GCN:      v_writelane_b32 v23, s0, 32
+; GCN-NEXT: v_writelane_b32 v23, s1, 33
+
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 34
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 35
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 36
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 37
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 38
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 39
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 40
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 41
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 42
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 43
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 44
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 45
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 46
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 47
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 48
+; GCN-NEXT: v_writelane_b32 v23, s{{[0-9]+}}, 49
+
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: s_cbranch_scc1
+
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 0
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 1
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 2
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 3
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 4
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 5
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 6
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 7
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 8
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 9
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 10
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 11
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 12
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 13
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 14
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 15
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 34
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 35
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 36
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 37
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 38
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 39
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 40
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 41
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 42
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 43
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 44
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 45
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 46
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 47
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 48
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 49
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: v_readlane_b32 s[[USE_TMP_LO:[0-9]+]], v23, 16
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 17
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 18
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 19
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 20
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 21
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 22
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 23
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 24
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 25
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 26
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 27
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 28
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 29
+; GCN-NEXT: v_readlane_b32 s{{[0-9]+}}, v23, 30
+; GCN-NEXT: v_readlane_b32 s[[USE_TMP_HI:[0-9]+]], v23, 31
+; GCN: ; use s{{\[}}[[USE_TMP_LO]]:[[USE_TMP_HI]]{{\]}}
+
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
+
+; GCN: v_readlane_b32 s0, v23, 32
+; GCN: v_readlane_b32 s1, v23, 33
+; GCN: ;;#ASMSTART
+; GCN: ; use s[0:1]
+define amdgpu_kernel void @no_vgprs_last_sgpr_spill(i32 addrspace(1)* %out, i32 %in) #1 { ; v0 through v22 are clobbered below, the expectations above place every lane spill in v23 and use buffer stores and loads for the remainder
+  call void asm sideeffect "", "~{VGPR0_VGPR1_VGPR2_VGPR3_VGPR4_VGPR5_VGPR6_VGPR7}" () #0 ; clobbers v0 through v7
+  call void asm sideeffect "", "~{VGPR8_VGPR9_VGPR10_VGPR11_VGPR12_VGPR13_VGPR14_VGPR15}" () #0 ; clobbers v8 through v15
+  call void asm sideeffect "", "~{VGPR16_VGPR17_VGPR18_VGPR19}"() #0 ; clobbers v16 through v19
+  call void asm sideeffect "", "~{VGPR20_VGPR21}"() #0 ; clobbers v20 and v21
+  call void asm sideeffect "", "~{VGPR22}"() #0 ; clobbers v22
+
+  %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr2 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0
+  %wide.sgpr3 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 ; four 16-wide tuples plus the 2-wide one below hold 66 values, two more than lanes 0-63
+  %wide.sgpr4 = call <2 x i32> asm sideeffect "; def $0", "=s" () #0 ; 2-element tuple, expected in s[0:1] and lanes 32-33 of v23
+  %cmp = icmp eq i32 %in, 0
+  br i1 %cmp, label %bb0, label %ret ; the tuples are consumed only on the %in == 0 path
+
+bb0:
+  call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr0) #0
+  call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr1) #0
+  call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr2) #0
+  call void asm sideeffect "; use $0", "s"(<16 x i32> %wide.sgpr3) #0
+  call void asm sideeffect "; use $0", "s"(<2 x i32> %wide.sgpr4) #0 ; the 2-wide tuple is used last, after the buffer loads expected above
+  br label %ret
+
+ret:
+  ret void
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "amdgpu-waves-per-eu"="10,10" }
diff --git a/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll b/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
index 3e0d36978ad4155ec181df2d06061bdbfda5ca2c..4bcfe5f3d28cca0b272801b02cb2346472b3f85d 100644
--- a/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
+++ b/test/CodeGen/AMDGPU/partially-dead-super-register-immediate.ll
@@ -10,7 +10,7 @@
 
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
-define void @dead_def_subregister(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @dead_def_subregister(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %val = load i64, i64 addrspace(1)* %in.gep
diff --git a/test/CodeGen/AMDGPU/predicates.ll b/test/CodeGen/AMDGPU/predicates.ll
index c1af815c7b1e95d08d6a735ea9c6c060849e4e26..566b48eb88642d911343a01ba26433a0007fce09 100644
--- a/test/CodeGen/AMDGPU/predicates.ll
+++ b/test/CodeGen/AMDGPU/predicates.ll
@@ -6,7 +6,7 @@
 ; CHECK-LABEL: {{^}}simple_if:
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
 ; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define void @simple_if(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @simple_if(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %cmp0 = icmp sgt i32 %in, 0
   br i1 %cmp0, label %IF, label %ENDIF
@@ -25,7 +25,7 @@ ENDIF:
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
 ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
 ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define void @simple_if_else(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @simple_if_else(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp sgt i32 %in, 0
   br i1 %0, label %IF, label %ELSE
@@ -51,7 +51,7 @@ ENDIF:
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Exec
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
 ; CHECK: LSHL * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define void @nested_if(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @nested_if(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp sgt i32 %in, 0
   br i1 %0, label %IF0, label %ENDIF
@@ -79,7 +79,7 @@ ENDIF:
 ; CHECK: PRED_SET{{[EGN][ET]*}}_INT * Pred,
 ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
 ; CHECK: LSH{{[LR] \* T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, Pred_sel
-define void @nested_if_else(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @nested_if_else(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp sgt i32 %in, 0
   br i1 %0, label %IF0, label %ENDIF
diff --git a/test/CodeGen/AMDGPU/private-access-no-objects.ll b/test/CodeGen/AMDGPU/private-access-no-objects.ll
index 2894730eccb1a12776b1601ba7ee55cfbfd31b78..af268351029389f8547583e7c36d344550222877 100644
--- a/test/CodeGen/AMDGPU/private-access-no-objects.ll
+++ b/test/CodeGen/AMDGPU/private-access-no-objects.ll
@@ -18,7 +18,7 @@
 
 ; OPTNONE-NOT: s_mov_b32
 ; OPTNONE: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[0:3], s7 offen{{$}}
-define void @store_to_undef() #0 {
+define amdgpu_kernel void @store_to_undef() #0 {
   store volatile i32 0, i32* undef
   ret void
 }
@@ -28,7 +28,7 @@ define void @store_to_undef() #0 {
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
 ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
 ; OPT: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
-define void @store_to_inttoptr() #0 {
+define amdgpu_kernel void @store_to_inttoptr() #0 {
  store volatile i32 0, i32* inttoptr (i32 123 to i32*)
  ret void
 }
@@ -38,7 +38,7 @@ define void @store_to_inttoptr() #0 {
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
 ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
 ; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
-define void @load_from_undef() #0 {
+define amdgpu_kernel void @load_from_undef() #0 {
   %ld = load volatile i32, i32* undef
   ret void
 }
@@ -48,7 +48,7 @@ define void @load_from_undef() #0 {
 ; OPT-DAG: s_mov_b64 s{{\[[0-9]+}}:[[RSRC_HI:[0-9]+]]{{\]}}, s[2:3]
 ; OPT-DAG: s_mov_b32 [[SOFFSET:s[0-9]+]], s7{{$}}
 ; OPT: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s{{\[}}[[RSRC_LO]]:[[RSRC_HI]]{{\]}}, [[SOFFSET]] offen{{$}}
-define void @load_from_inttoptr() #0 {
+define amdgpu_kernel void @load_from_inttoptr() #0 {
   %ld = load volatile i32, i32* inttoptr (i32 123 to i32*)
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/private-element-size.ll b/test/CodeGen/AMDGPU/private-element-size.ll
index de9a8f75512201af41f341793c8c1f36081ab49b..f805430797016944a4cfa3d20c53468fb4cee7cb 100644
--- a/test/CodeGen/AMDGPU/private-element-size.ll
+++ b/test/CodeGen/AMDGPU/private-element-size.ll
@@ -10,33 +10,33 @@
 ; HSA-ELT4: private_element_size = 1
 
 
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
 ; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
 ; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
 
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24{{$}}
 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40
 
 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
 
 
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:44{{$}}
 
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
-; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
-define void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
+; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
+define amdgpu_kernel void @private_elt_size_v4i32(<4 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %idxprom = sext i32 %tid to i64
@@ -59,36 +59,28 @@ entry:
 ; HSA-ELT8: private_element_size = 2
 ; HSA-ELT4: private_element_size = 1
 
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
 ; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
 ; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:48
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:64
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:80
 
 ; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
 ; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
 
 
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24
 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40
 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:48
 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:56
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:88
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:80
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:72
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:64
 
 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
 
 
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}}
@@ -97,6 +89,14 @@ entry:
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:52{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:56{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:60{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:64{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:68{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:72{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:76{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:80{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:84{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:88{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:92{{$}}
 
 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
@@ -106,7 +106,7 @@ entry:
 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:20{{$}}
 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:24{{$}}
 ; HSA-ELT4-DAG: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:28{{$}}
-define void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+define amdgpu_kernel void @private_elt_size_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %idxprom = sext i32 %tid to i64
@@ -130,20 +130,20 @@ entry:
 ; HSA-ELT8: private_element_size = 2
 ; HSA-ELT4: private_element_size = 1
 
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], s9 offset:1
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{off|v[0-9]}}, s[0:3], s9 offset:2
 
 ; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
 
 
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}}
 
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
-define void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+define amdgpu_kernel void @private_elt_size_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %idxprom = sext i32 %tid to i64
@@ -166,20 +166,20 @@ entry:
 ; HSA-ELT8: private_element_size = 2
 ; HSA-ELT4: private_element_size = 1
 
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
-; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
+; HSA-ELTGE8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24
 
 ; HSA-ELTGE8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
 
 
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}}
 
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
-define void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+define amdgpu_kernel void @private_elt_size_f64(double addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %idxprom = sext i32 %tid to i64
@@ -202,33 +202,33 @@ entry:
 ; HSA-ELT8: private_element_size = 2
 ; HSA-ELT4: private_element_size = 1
 
-; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
 ; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
+; HSA-ELT16-DAG: buffer_store_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
 ; HSA-ELT16-DAG: buffer_load_dwordx4 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
 
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9{{$}}
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:8
-; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:16{{$}}
 ; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:24
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:40
+; HSA-ELT8-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, off, s[0:3], s9 offset:32
 
 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
 ; HSA-ELT8: buffer_load_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}}, s[0:3], s9 offen
 
 
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:4{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:8{{$}}
-; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:12{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:16{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:20{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:24{{$}}
 ; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:28{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:32{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:36{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:40{{$}}
+; HSA-ELT4-DAG: buffer_store_dword {{v[0-9]+}}, off, s[0:3], s9 offset:44{{$}}
 
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen{{$}}
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:4{{$}}
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:8{{$}}
 ; HSA-ELT4: buffer_load_dword {{v[0-9]+}}, v{{[0-9]+}}, s[0:3], s9 offen offset:12{{$}}
-define void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
+define amdgpu_kernel void @private_elt_size_v2i64(<2 x i64> addrspace(1)* %out, i32 addrspace(1)* %index.array) #0 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %idxprom = sext i32 %tid to i64
diff --git a/test/CodeGen/AMDGPU/private-memory-atomics.ll b/test/CodeGen/AMDGPU/private-memory-atomics.ll
index eea10c86223896fa4d4dcd00ecddefd4346b4bc6..9fa3051928a051d0207b325ccb87d4feb003cd2e 100644
--- a/test/CodeGen/AMDGPU/private-memory-atomics.ll
+++ b/test/CodeGen/AMDGPU/private-memory-atomics.ll
@@ -4,7 +4,7 @@
 ; This works because promote allocas pass replaces these with LDS atomics.
 
 ; Private atomics have no real use, but at least shouldn't crash on it.
-define void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+define amdgpu_kernel void @atomicrmw_private(i32 addrspace(1)* %out, i32 %in) nounwind {
 entry:
   %tmp = alloca [2 x i32]
   %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
@@ -17,7 +17,7 @@ entry:
   ret void
 }
 
-define void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+define amdgpu_kernel void @cmpxchg_private(i32 addrspace(1)* %out, i32 %in) nounwind {
 entry:
   %tmp = alloca [2 x i32]
   %tmp1 = getelementptr inbounds [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
diff --git a/test/CodeGen/AMDGPU/private-memory-broken.ll b/test/CodeGen/AMDGPU/private-memory-broken.ll
index 8ba0b70dbdbb59642ca1a93bc0ec5c57b72f9574..9b5f655f1b5233f8279cffdd4341454cfd7201d4 100644
--- a/test/CodeGen/AMDGPU/private-memory-broken.ll
+++ b/test/CodeGen/AMDGPU/private-memory-broken.ll
@@ -7,7 +7,7 @@
 
 declare i32 @foo(i32*) nounwind
 
-define void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+define amdgpu_kernel void @call_private(i32 addrspace(1)* %out, i32 %in) nounwind {
 entry:
   %tmp = alloca [2 x i32]
   %tmp1 = getelementptr [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
diff --git a/test/CodeGen/AMDGPU/private-memory-r600.ll b/test/CodeGen/AMDGPU/private-memory-r600.ll
index 3e1796959aa68d825538dbef075ceebd1c69256e..d07a0a02cbae4d620494a4953df407540d867cb2 100644
--- a/test/CodeGen/AMDGPU/private-memory-r600.ll
+++ b/test/CodeGen/AMDGPU/private-memory-r600.ll
@@ -16,7 +16,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 ; OPT: call i32 @llvm.r600.read.tidig.y(), !range !0
 ; OPT: call i32 @llvm.r600.read.tidig.z(), !range !0
 
-define void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @mova_same_clause(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %0 = load i32, i32 addrspace(1)* %in, align 4
@@ -47,7 +47,7 @@ entry:
 ; R600-NOT: MOVA_INT
 %struct.point = type { i32, i32 }
 
-define void @multiple_structs(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @multiple_structs(i32 addrspace(1)* %out) #0 {
 entry:
   %a = alloca %struct.point
   %b = alloca %struct.point
@@ -75,7 +75,7 @@ entry:
 ; FUNC-LABEL: {{^}}direct_loop:
 ; R600-NOT: MOVA_INT
 
-define void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @direct_loop(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 entry:
   %prv_array_const = alloca [2 x i32]
   %prv_array = alloca [2 x i32]
@@ -110,7 +110,7 @@ for.end:
 ; FUNC-LABEL: {{^}}short_array:
 
 ; R600: MOVA_INT
-define void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @short_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i16]
   %1 = getelementptr inbounds [2 x i16], [2 x i16]* %0, i32 0, i32 0
@@ -127,7 +127,7 @@ entry:
 ; FUNC-LABEL: {{^}}char_array:
 
 ; R600: MOVA_INT
-define void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @char_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %0 = alloca [2 x i8]
   %1 = getelementptr inbounds [2 x i8], [2 x i8]* %0, i32 0, i32 0
@@ -148,7 +148,7 @@ entry:
 ; R600-NOT: MOV T0.X
 ; Additional check in case the move ends up in the last slot
 ; R600-NOT: MOV * TO.X
-define void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %0 = alloca [2 x i32]
   %1 = getelementptr inbounds [2 x i32], [2 x i32]* %0, i32 0, i32 0
@@ -169,7 +169,7 @@ entry:
 ; R600_CHECK: MOV
 ; R600_CHECK: [[CHAN:[XYZW]]]+
 ; R600-NOT: [[CHAN]]+
-define void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @no_overlap(i32 addrspace(1)* %out, i32 %in) #0 {
 entry:
   %0 = alloca [3 x i8], align 1
   %1 = alloca [2 x i8], align 1
@@ -193,7 +193,7 @@ entry:
   ret void
 }
 
-define void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @char_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i8]]
   %gep0 = getelementptr inbounds [2 x [2 x i8]], [2 x [2 x i8]]* %alloca, i32 0, i32 0, i32 0
@@ -207,7 +207,7 @@ entry:
   ret void
 }
 
-define void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i32]]
   %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
@@ -220,7 +220,7 @@ entry:
   ret void
 }
 
-define void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @i64_array_array(i64 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i64]]
   %gep0 = getelementptr inbounds [2 x [2 x i64]], [2 x [2 x i64]]* %alloca, i32 0, i32 0, i32 0
@@ -235,7 +235,7 @@ entry:
 
 %struct.pair32 = type { i32, i32 }
 
-define void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @struct_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x %struct.pair32]]
   %gep0 = getelementptr inbounds [2 x [2 x %struct.pair32]], [2 x [2 x %struct.pair32]]* %alloca, i32 0, i32 0, i32 0, i32 1
@@ -248,7 +248,7 @@ entry:
   ret void
 }
 
-define void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @struct_pair32_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x %struct.pair32]
   %gep0 = getelementptr inbounds [2 x %struct.pair32], [2 x %struct.pair32]* %alloca, i32 0, i32 0, i32 1
@@ -261,7 +261,7 @@ entry:
   ret void
 }
 
-define void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
+define amdgpu_kernel void @select_private(i32 addrspace(1)* %out, i32 %in) nounwind {
 entry:
   %tmp = alloca [2 x i32]
   %tmp1 = getelementptr inbounds  [2 x i32], [2 x i32]* %tmp, i32 0, i32 0
@@ -282,7 +282,7 @@ entry:
 ; SI-NOT: ds_write
 ; SI: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen
 ; SI: buffer_load_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen ;
-define void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @ptrtoint(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32]
   %tmp0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   store i32 5, i32* %tmp0
diff --git a/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll b/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll
index 3bd0aecf7aa9e43d8b1a8aad604ae98a81460638..41a68b18b0a7eaf70cbe0e03ad62b36f70e8baf2 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-array-allocation.ll
@@ -5,7 +5,7 @@
 
 ; CHECK-LABEL: @array_alloca(
 ; CHECK: %stack = alloca i32, i32 5, align 4
-define void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
+define amdgpu_kernel void @array_alloca(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #0 {
 entry:
   %stack = alloca i32, i32 5, align 4
   %ld0 = load i32, i32 addrspace(1)* %in, align 4
@@ -27,7 +27,7 @@ entry:
 
 ; CHECK-LABEL: @array_alloca_dynamic(
 ; CHECK: %stack = alloca i32, i32 %size, align 4
-define void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 {
+define amdgpu_kernel void @array_alloca_dynamic(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %size) #0 {
 entry:
   %stack = alloca i32, i32 %size, align 4
   %ld0 = load i32, i32 addrspace(1)* %in, align 4
diff --git a/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll b/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
index 82030f377d9f8384e93634f9ef308c5b94ee9623..a5eb92de9e4b526e9ed574c4915810494fc004b7 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-bitcast-function.ll
@@ -7,14 +7,14 @@ declare void @foo(float*) #0
 declare void @foo.varargs(...) #0
 
 ; CHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo
-define void @crash_call_constexpr_cast() #0 {
+define amdgpu_kernel void @crash_call_constexpr_cast() #0 {
   %alloca = alloca i32
   call void bitcast (void (float*)* @foo to void (i32*)*)(i32* %alloca) #0
   ret void
 }
 
 ; CHECK: in function crash_call_constexpr_cast{{.*}}: unsupported call to function foo.varargs
-define void @crash_call_constexpr_cast_varargs() #0 {
+define amdgpu_kernel void @crash_call_constexpr_cast_varargs() #0 {
   %alloca = alloca i32
   call void bitcast (void (...)* @foo.varargs to void (i32*)*)(i32* %alloca) #0
   ret void
diff --git a/test/CodeGen/AMDGPU/promote-alloca-globals.ll b/test/CodeGen/AMDGPU/promote-alloca-globals.ll
index eb0d0cc62697887a5a2ce68eac6afe7dbc156aac..38db51d4c8c6a49a6e3b109cf6aacd40f452b97b 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-globals.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-globals.ll
@@ -5,12 +5,12 @@
 @global_array0 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
 @global_array1 = internal unnamed_addr addrspace(3) global [750 x [10 x i32]] undef, align 4
 
-; IR-LABEL: define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+; IR-LABEL: define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 ; IR: alloca [10 x i32]
 ; ASM-LABEL: {{^}}promote_alloca_size_256:
 ; ASM: ; LDSByteSize: 60000 bytes/workgroup (compile time only)
 
-define void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+define amdgpu_kernel void @promote_alloca_size_256(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 entry:
   %stack = alloca [10 x i32], align 4
   %tmp = load i32, i32 addrspace(1)* %in, align 4
diff --git a/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
index 4c3c15dac0d102b34be15a48c6a77633b40fe444..f83eb56dc6edf4cb5e38d8c87f8265d32db5834d 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-invariant-markers.ll
@@ -7,7 +7,7 @@ declare i8* @llvm.invariant.group.barrier(i8*) #1
 ; GCN-LABEL: {{^}}use_invariant_promotable_lds:
 ; GCN: buffer_load_dword
 ; GCN: ds_write_b32
-define void @use_invariant_promotable_lds(i32 addrspace(1)* %arg) #2 {
+define amdgpu_kernel void @use_invariant_promotable_lds(i32 addrspace(1)* %arg) #2 {
 bb:
   %tmp = alloca i32, align 4
   %tmp1 = bitcast i32* %tmp to i8*
diff --git a/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll b/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll
index eeda19fa27accf834f6b9da5a5a1f863622531a9..bd4571a9616bea693c94bb0fca0809ba0d8f8e5c 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-lifetime.ll
@@ -1,21 +1,21 @@
 ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-promote-alloca %s | FileCheck -check-prefix=OPT %s
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) #0
-declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
 
 ; OPT-LABEL: @use_lifetime_promotable_lds(
 ; OPT-NOT: alloca i32
 ; OPT-NOT: llvm.lifetime
 ; OPT: store i32 %tmp3, i32 addrspace(3)*
-define void @use_lifetime_promotable_lds(i32 addrspace(1)* %arg) #2 {
+define amdgpu_kernel void @use_lifetime_promotable_lds(i32 addrspace(1)* %arg) #2 {
 bb:
   %tmp = alloca i32, align 4
   %tmp1 = bitcast i32* %tmp to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %tmp1)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %tmp1)
   %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
   %tmp3 = load i32, i32 addrspace(1)* %tmp2
   store i32 %tmp3, i32* %tmp
-  call void @llvm.lifetime.end(i64 4, i8* %tmp1)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %tmp1)
   ret void
 }
 
diff --git a/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll b/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
index 9cea1a23ea980ad92484db297d1d6db034138341..7a4a451ff360323e1dfb542c3ed1c99de7b53de3 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-mem-intrinsics.ll
@@ -8,13 +8,13 @@ declare void @llvm.memmove.p1i8.p0i8.i32(i8 addrspace(1)* nocapture, i8* nocaptu
 
 declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) #0
 
-declare i32 @llvm.objectsize.i32.p0i8(i8*, i1) #1
+declare i32 @llvm.objectsize.i32.p0i8(i8*, i1, i1) #1
 
 ; CHECK-LABEL: @promote_with_memcpy(
 ; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memcpy.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: call void @llvm.memcpy.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false)
 ; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false)
-define void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %alloca = alloca [17 x i32], align 4
   %alloca.bc = bitcast [17 x i32]* %alloca to i8*
   %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
@@ -28,7 +28,7 @@ define void @promote_with_memcpy(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 ; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memmove.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: call void @llvm.memmove.p3i8.p1i8.i32(i8 addrspace(3)* %alloca.bc, i8 addrspace(1)* %in.bc, i32 68, i32 4, i1 false)
 ; CHECK: call void @llvm.memmove.p1i8.p3i8.i32(i8 addrspace(1)* %out.bc, i8 addrspace(3)* %alloca.bc, i32 68, i32 4, i1 false)
-define void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %alloca = alloca [17 x i32], align 4
   %alloca.bc = bitcast [17 x i32]* %alloca to i8*
   %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
@@ -41,7 +41,7 @@ define void @promote_with_memmove(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 ; CHECK-LABEL: @promote_with_memset(
 ; CHECK: getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_memset.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: call void @llvm.memset.p3i8.i32(i8 addrspace(3)* %alloca.bc, i8 7, i32 68, i32 4, i1 false)
-define void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %alloca = alloca [17 x i32], align 4
   %alloca.bc = bitcast [17 x i32]* %alloca to i8*
   %in.bc = bitcast i32 addrspace(1)* %in to i8 addrspace(1)*
@@ -52,11 +52,11 @@ define void @promote_with_memset(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 
 ; CHECK-LABEL: @promote_with_objectsize(
 ; CHECK: [[PTR:%[0-9]+]] = getelementptr inbounds [64 x [17 x i32]], [64 x [17 x i32]] addrspace(3)* @promote_with_objectsize.alloca, i32 0, i32 %{{[0-9]+}}
-; CHECK: call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %alloca.bc, i1 false)
-define void @promote_with_objectsize(i32 addrspace(1)* %out) #0 {
+; CHECK: call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %alloca.bc, i1 false, i1 false)
+define amdgpu_kernel void @promote_with_objectsize(i32 addrspace(1)* %out) #0 {
   %alloca = alloca [17 x i32], align 4
   %alloca.bc = bitcast [17 x i32]* %alloca to i8*
-  %size = call i32 @llvm.objectsize.i32.p0i8(i8* %alloca.bc, i1 false)
+  %size = call i32 @llvm.objectsize.i32.p0i8(i8* %alloca.bc, i1 false, i1 false)
   store i32 %size, i32 addrspace(1)* %out
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll b/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
index 8ba849e5f88414bee60191b289eeab438403681d..9f22f2071797e042530595b2acca61a35e4929d0 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-no-opts.ll
@@ -5,7 +5,7 @@
 ; NOOPTS: workgroup_group_segment_byte_size = 0{{$}}
 ; NOOPTS-NOT ds_write
 ; OPTS: ds_write
-define void @promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
+define amdgpu_kernel void @promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #0 {
 entry:
   %alloca = alloca [2 x [2 x i32]]
   %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
@@ -21,7 +21,7 @@ entry:
 ; ALL-LABEL: {{^}}optnone_promote_alloca_i32_array_array:
 ; ALL: workgroup_group_segment_byte_size = 0{{$}}
 ; ALL-NOT ds_write
-define void @optnone_promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #1 {
+define amdgpu_kernel void @optnone_promote_alloca_i32_array_array(i32 addrspace(1)* %out, i32 %index) #1 {
 entry:
   %alloca = alloca [2 x [2 x i32]]
   %gep0 = getelementptr inbounds [2 x [2 x i32]], [2 x [2 x i32]]* %alloca, i32 0, i32 0, i32 0
diff --git a/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll b/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
index 468a789e4a6788188097b9a3321362d78627e3f9..bf3bc493a4b86e48cf93886c3c091db59fc6c49a 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-padding-size-estimate.ll
@@ -30,7 +30,7 @@
 
 ; GCN-LABEL: {{^}}promote_alloca_size_order_0:
 ; GCN: workgroup_group_segment_byte_size = 2340
-define void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+define amdgpu_kernel void @promote_alloca_size_order_0(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %tmp0 = load i32, i32 addrspace(1)* %in, align 4
@@ -62,7 +62,7 @@ entry:
 
 ; GCN-LABEL: {{^}}promote_alloca_size_order_1:
 ; GCN: workgroup_group_segment_byte_size = 2352
-define void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+define amdgpu_kernel void @promote_alloca_size_order_1(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %tmp0 = load i32, i32 addrspace(1)* %in, align 4
@@ -100,7 +100,7 @@ entry:
 
 ; GCN-LABEL: {{^}}promote_alloca_align_pad_guess_over_limit:
 ; GCN: workgroup_group_segment_byte_size = 1060
-define void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
+define amdgpu_kernel void @promote_alloca_align_pad_guess_over_limit(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in, i32 %idx) #0 {
 entry:
   %stack = alloca [5 x i32], align 4
   %tmp0 = load i32, i32 addrspace(1)* %in, align 4
diff --git a/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll b/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
index 3bcbb4f986b7ba898a56f32b28caef40b2176bf3..03ce116cfcad97c61ae16b51aea3bc9c4c5f4938 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-stored-pointer-value.ll
@@ -5,7 +5,7 @@
 
 ; GCN-LABEL: {{^}}stored_lds_pointer_value:
 ; GCN: buffer_store_dword v
-define void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 {
   %tmp = alloca float
   store float 0.0, float *%tmp
   store float* %tmp, float* addrspace(1)* %ptr
@@ -14,7 +14,7 @@ define void @stored_lds_pointer_value(float* addrspace(1)* %ptr) #0 {
 
 ; GCN-LABEL: {{^}}stored_lds_pointer_value_offset:
 ; GCN: buffer_store_dword v
-define void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 {
   %tmp0 = alloca float
   %tmp1 = alloca float
   store float 0.0, float *%tmp0
@@ -29,7 +29,7 @@ define void @stored_lds_pointer_value_offset(float* addrspace(1)* %ptr) #0 {
 ; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
 ; GCN: buffer_store_dword v
 ; GCN: buffer_store_dword v
-define void @stored_lds_pointer_value_gep(float* addrspace(1)* %ptr, i32 %idx) #0 {
+define amdgpu_kernel void @stored_lds_pointer_value_gep(float* addrspace(1)* %ptr, i32 %idx) #0 {
 bb:
   %tmp = alloca float, i32 16
   store float 0.0, float* %tmp
@@ -46,7 +46,7 @@ bb:
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
 ; GCN: buffer_store_dword
-define void @stored_vector_pointer_value(i32* addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @stored_vector_pointer_value(i32* addrspace(1)* %out, i32 %index) {
 entry:
   %tmp0 = alloca [4 x i32]
   %x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp0, i32 0, i32 0
@@ -64,7 +64,7 @@ entry:
 
 ; GCN-LABEL: {{^}}stored_fi_to_self:
 ; GCN-NOT: ds_
-define void @stored_fi_to_self() #0 {
+define amdgpu_kernel void @stored_fi_to_self() #0 {
   %tmp = alloca i32*
   store volatile i32* inttoptr (i32 1234 to i32*), i32** %tmp
   %bitcast = bitcast i32** %tmp to i32*
diff --git a/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll b/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
index 2e7527dbdbc493e3cac140ad40d96a30b0757df4..ebef612299054b487bbfb5432ce3204ee563e274 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-to-lds-icmp.ll
@@ -8,7 +8,7 @@
 ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
 ; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b
 ; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, %ptr1
-define void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32], align 4
   %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
@@ -22,7 +22,7 @@ define void @lds_promoted_alloca_icmp_same_derived_pointer(i32 addrspace(1)* %ou
 ; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_null_rhs.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
 ; CHECK: %cmp = icmp eq i32 addrspace(3)* %ptr0, null
-define void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32], align 4
   %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   %cmp = icmp eq i32* %ptr0, null
@@ -35,7 +35,7 @@ define void @lds_promoted_alloca_icmp_null_rhs(i32 addrspace(1)* %out, i32 %a, i
 ; CHECK: [[ARRAYGEP:%[0-9]+]] = getelementptr inbounds [256 x [16 x i32]], [256 x [16 x i32]] addrspace(3)* @lds_promoted_alloca_icmp_null_lhs.alloca, i32 0, i32 %{{[0-9]+}}
 ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %a
 ; CHECK: %cmp = icmp eq i32 addrspace(3)* null, %ptr0
-define void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32], align 4
   %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   %cmp = icmp eq i32* null, %ptr0
@@ -49,7 +49,7 @@ define void @lds_promoted_alloca_icmp_null_lhs(i32 addrspace(1)* %out, i32 %a, i
 ; CHECK: %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
 ; CHECK: %ptr1 = call i32* @get_unknown_pointer()
 ; CHECK: %cmp = icmp eq i32* %ptr0, %ptr1
-define void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_icmp_unknown_ptr(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32], align 4
   %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   %ptr1 = call i32* @get_unknown_pointer()
diff --git a/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll b/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
index 0462a351c39bfc2183e76bca165008234ff0c6c8..d196897d67dc95693d02e12a64f2e4f708a9d33e 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-to-lds-phi.ll
@@ -13,7 +13,7 @@
 ; CHECK: endif:
 ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
 ; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
-define void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @branch_ptr_var_same_alloca(i32 %a, i32 %b) #0 {
 entry:
   %alloca = alloca [64 x i32], align 4
   br i1 undef, label %if, label %else
@@ -34,7 +34,7 @@ endif:
 
 ; CHECK-LABEL: @branch_ptr_phi_alloca_null_0(
 ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %if ], [ null, %entry ]
-define void @branch_ptr_phi_alloca_null_0(i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @branch_ptr_phi_alloca_null_0(i32 %a, i32 %b) #0 {
 entry:
   %alloca = alloca [64 x i32], align 4
   br i1 undef, label %if, label %endif
@@ -51,7 +51,7 @@ endif:
 
 ; CHECK-LABEL: @branch_ptr_phi_alloca_null_1(
 ; CHECK: %phi.ptr = phi i32 addrspace(3)*  [ null, %entry ], [ %arrayidx0, %if ]
-define void @branch_ptr_phi_alloca_null_1(i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @branch_ptr_phi_alloca_null_1(i32 %a, i32 %b) #0 {
 entry:
   %alloca = alloca [64 x i32], align 4
   br i1 undef, label %if, label %endif
@@ -73,7 +73,7 @@ endif:
 ; CHECK: br label %exit
 ; CHECK: %phi.ptr = phi i32 addrspace(3)* [ %arrayidx0, %entry ]
 ; CHECK: store i32 0, i32 addrspace(3)* %phi.ptr, align 4
-define void @one_phi_value(i32 %a) #0 {
+define amdgpu_kernel void @one_phi_value(i32 %a) #0 {
 entry:
   %alloca = alloca [64 x i32], align 4
   %arrayidx0 = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 %a
@@ -97,7 +97,7 @@ exit:
 ; CHECK: endif:
 ; CHECK: %phi.ptr = phi i32* [ %arrayidx0, %if ], [ %arrayidx1, %else ]
 ; CHECK: store i32 0, i32* %phi.ptr, align 4
-define void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @branch_ptr_alloca_unknown_obj(i32 %a, i32 %b) #0 {
 entry:
   %alloca = alloca [64 x i32], align 4
   br i1 undef, label %if, label %else
@@ -134,7 +134,7 @@ endif:
 ; CHECK-LABEL: @ptr_induction_var_same_alloca(
 ; CHECK: %alloca = alloca [64 x i32], align 4
 ; CHECK: phi i32* [ %arrayidx, %entry ], [ %incdec.ptr, %for.body ]
-define void @ptr_induction_var_same_alloca() #0 {
+define amdgpu_kernel void @ptr_induction_var_same_alloca() #0 {
 entry:
   %alloca = alloca [64 x i32], align 4
   %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2
@@ -172,7 +172,7 @@ for.body:                                         ; preds = %for.body, %entry
 ; CHECK: %alloca = alloca [64 x i32], align 4
 ; CHECK: %p.08 = phi i32* [ %incdec.ptr, %for.body ], [ %arrayidx, %for.body.preheader ]
 ; CHECK: %cmp = icmp eq i32* %incdec.ptr, %call
-define void @ptr_induction_var_alloca_unknown() #0 {
+define amdgpu_kernel void @ptr_induction_var_alloca_unknown() #0 {
 entry:
   %alloca = alloca [64 x i32], align 4
   %arrayidx = getelementptr inbounds [64 x i32], [64 x i32]* %alloca, i32 0, i32 2
diff --git a/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll b/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
index 34d274df738795654ffaf91a8bce0380fbd20c9b..55c2229fb6bdb1424c9c49fbecfc83838cae96dd 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-to-lds-select.ll
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: @lds_promoted_alloca_select_invalid_pointer_operand(
 ; CHECK: %alloca = alloca i32
 ; CHECK: select i1 undef, i32* undef, i32* %alloca
-define void @lds_promoted_alloca_select_invalid_pointer_operand() #0 {
+define amdgpu_kernel void @lds_promoted_alloca_select_invalid_pointer_operand() #0 {
   %alloca = alloca i32, align 4
   %select = select i1 undef, i32* undef, i32* %alloca
   store i32 0, i32* %select, align 4
@@ -16,7 +16,7 @@ define void @lds_promoted_alloca_select_invalid_pointer_operand() #0 {
 ; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 %b
 ; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
 ; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
-define void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 {
   %alloca = alloca [16 x i32], align 4
   %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
@@ -33,7 +33,7 @@ define void @lds_promote_alloca_select_two_derived_pointers(i32 %a, i32 %b) #0 {
 ; CHECK: %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a
 ; CHECK: %ptr1 = getelementptr inbounds i32, i32* %alloca1, i32 %b
 ; CHECK: %select = select i1 undef, i32* %ptr0, i32* %ptr1
-define void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 {
   %alloca0 = alloca i32, i32 16, align 4
   %alloca1 = alloca i32, i32 16, align 4
   %ptr0 = getelementptr inbounds i32, i32* %alloca0, i32 %a
@@ -50,7 +50,7 @@ define void @lds_promote_alloca_select_two_allocas(i32 %a, i32 %b) #0 {
 ; CHECK: %ptr1 = getelementptr inbounds [16 x i32], [16 x i32] addrspace(3)* [[ARRAYGEP]], i32 0, i32 3
 ; CHECK: %select = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
 ; CHECK: store i32 0, i32 addrspace(3)* %select, align 4
-define void @lds_promote_alloca_select_two_derived_constant_pointers() #0 {
+define amdgpu_kernel void @lds_promote_alloca_select_two_derived_constant_pointers() #0 {
   %alloca = alloca [16 x i32], align 4
   %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 1
   %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 3
@@ -67,7 +67,7 @@ define void @lds_promote_alloca_select_two_derived_constant_pointers() #0 {
 ; CHECK: %select0 = select i1 undef, i32 addrspace(3)* %ptr0, i32 addrspace(3)* %ptr1
 ; CHECK: %select1 = select i1 undef, i32 addrspace(3)* %select0, i32 addrspace(3)* %ptr2
 ; CHECK: store i32 0, i32 addrspace(3)* %select1, align 4
-define void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0 {
   %alloca = alloca [16 x i32], align 4
   %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
   %ptr1 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %b
@@ -78,7 +78,7 @@ define void @lds_promoted_alloca_select_input_select(i32 %a, i32 %b, i32 %c) #0
   ret void
 }
 
-define void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 {
+define amdgpu_kernel void @lds_promoted_alloca_select_input_phi(i32 %a, i32 %b, i32 %c) #0 {
 entry:
   %alloca = alloca [16 x i32], align 4
   %ptr0 = getelementptr inbounds [16 x i32], [16 x i32]* %alloca, i32 0, i32 %a
@@ -102,7 +102,7 @@ bb2:
 ; CHECK-LABEL: @select_null_rhs(
 ; CHECK-NOT: alloca
 ; CHECK: select i1 %tmp2, double addrspace(3)* %{{[0-9]+}}, double addrspace(3)* null
-define void @select_null_rhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
+define amdgpu_kernel void @select_null_rhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
 bb:
   %tmp = alloca double, align 8
   store double 0.000000e+00, double* %tmp, align 8
@@ -117,7 +117,7 @@ bb:
 ; CHECK-LABEL: @select_null_lhs(
 ; CHECK-NOT: alloca
 ; CHECK: select i1 %tmp2, double addrspace(3)* null, double addrspace(3)* %{{[0-9]+}}
-define void @select_null_lhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
+define amdgpu_kernel void @select_null_lhs(double addrspace(1)* nocapture %arg, i32 %arg1) #1 {
 bb:
   %tmp = alloca double, align 8
   store double 0.000000e+00, double* %tmp, align 8
diff --git a/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll b/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll
index e331731f90f69a491b2f66b8886e633d7bed2336..88c0e911662d6d561c2444577e6bc972eb4dee27 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-unhandled-intrinsic.ll
@@ -8,7 +8,7 @@ declare void @llvm.stackrestore(i8*) #2
 ; CHECK-LABEL: @try_promote_unhandled_intrinsic(
 ; CHECK: alloca
 ; CHECK: call void @llvm.stackrestore(i8* %tmp1)
-define void @try_promote_unhandled_intrinsic(i32 addrspace(1)* %arg) #2 {
+define amdgpu_kernel void @try_promote_unhandled_intrinsic(i32 addrspace(1)* %arg) #2 {
 bb:
   %tmp = alloca i32, align 4
   %tmp1 = bitcast i32* %tmp to i8*
diff --git a/test/CodeGen/AMDGPU/promote-alloca-volatile.ll b/test/CodeGen/AMDGPU/promote-alloca-volatile.ll
index 626ff1e1e139223cc4c1c98bb193fdd2f7be1d4c..9c43a6dc60f41e0aa2a639efe83c3a9a07f58d02 100644
--- a/test/CodeGen/AMDGPU/promote-alloca-volatile.ll
+++ b/test/CodeGen/AMDGPU/promote-alloca-volatile.ll
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: @volatile_load(
 ; CHECK: alloca [5 x i32]
 ; CHECK: load volatile i32, i32*
-define void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+define amdgpu_kernel void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 entry:
   %stack = alloca [5 x i32], align 4
   %tmp = load i32, i32 addrspace(1)* %in, align 4
@@ -16,7 +16,7 @@ entry:
 ; CHECK-LABEL: @volatile_store(
 ; CHECK: alloca [5 x i32]
 ; CHECK: store volatile i32 %tmp, i32*
-define void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
+define amdgpu_kernel void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) {
 entry:
   %stack = alloca [5 x i32], align 4
   %tmp = load i32, i32 addrspace(1)* %in, align 4
@@ -30,7 +30,7 @@ entry:
 ; CHECK: alloca double
 ; CHECK: load double
 ; CHECK: load volatile double
-define void @volatile_and_non_volatile_load(double addrspace(1)* nocapture %arg, i32 %arg1) #0 {
+define amdgpu_kernel void @volatile_and_non_volatile_load(double addrspace(1)* nocapture %arg, i32 %arg1) #0 {
 bb:
   %tmp = alloca double, align 8
   store double 0.000000e+00, double* %tmp, align 8
diff --git a/test/CodeGen/AMDGPU/pv.ll b/test/CodeGen/AMDGPU/pv.ll
index d5f9833d6ad09d5dc9ab9867ae08380df17785b6..1474dbabba69fa015fffcfddf95fd6ccf84354cc 100644
--- a/test/CodeGen/AMDGPU/pv.ll
+++ b/test/CodeGen/AMDGPU/pv.ll
@@ -1,240 +1,236 @@
-; RUN: llc < %s -march=r600 | FileCheck %s
+; RUN: llc -march=r600 < %s | FileCheck %s
 
 ; CHECK: DOT4 * T{{[0-9]\.W}} (MASKED)
 ; CHECK: MAX T{{[0-9].[XYZW]}}, 0.0, PV.X
-
 define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1, <4 x float> inreg %reg2, <4 x float> inreg %reg3, <4 x float> inreg %reg4, <4 x float> inreg %reg5, <4 x float> inreg %reg6, <4 x float> inreg %reg7) {
 main_body:
-  %0 = extractelement <4 x float> %reg1, i32 0
-  %1 = extractelement <4 x float> %reg1, i32 1
-  %2 = extractelement <4 x float> %reg1, i32 2
-  %3 = extractelement <4 x float> %reg1, i32 3
-  %4 = extractelement <4 x float> %reg2, i32 0
-  %5 = extractelement <4 x float> %reg2, i32 1
-  %6 = extractelement <4 x float> %reg2, i32 2
-  %7 = extractelement <4 x float> %reg2, i32 3
-  %8 = extractelement <4 x float> %reg3, i32 0
-  %9 = extractelement <4 x float> %reg3, i32 1
-  %10 = extractelement <4 x float> %reg3, i32 2
-  %11 = extractelement <4 x float> %reg3, i32 3
-  %12 = extractelement <4 x float> %reg4, i32 0
-  %13 = extractelement <4 x float> %reg4, i32 1
-  %14 = extractelement <4 x float> %reg4, i32 2
-  %15 = extractelement <4 x float> %reg4, i32 3
-  %16 = extractelement <4 x float> %reg5, i32 0
-  %17 = extractelement <4 x float> %reg5, i32 1
-  %18 = extractelement <4 x float> %reg5, i32 2
-  %19 = extractelement <4 x float> %reg5, i32 3
-  %20 = extractelement <4 x float> %reg6, i32 0
-  %21 = extractelement <4 x float> %reg6, i32 1
-  %22 = extractelement <4 x float> %reg6, i32 2
-  %23 = extractelement <4 x float> %reg6, i32 3
-  %24 = extractelement <4 x float> %reg7, i32 0
-  %25 = extractelement <4 x float> %reg7, i32 1
-  %26 = extractelement <4 x float> %reg7, i32 2
-  %27 = extractelement <4 x float> %reg7, i32 3
-  %28 = load <4 x float>, <4 x float> addrspace(8)* null
-  %29 = extractelement <4 x float> %28, i32 0
-  %30 = fmul float %0, %29
-  %31 = load <4 x float>, <4 x float> addrspace(8)* null
-  %32 = extractelement <4 x float> %31, i32 1
-  %33 = fmul float %0, %32
-  %34 = load <4 x float>, <4 x float> addrspace(8)* null
-  %35 = extractelement <4 x float> %34, i32 2
-  %36 = fmul float %0, %35
-  %37 = load <4 x float>, <4 x float> addrspace(8)* null
-  %38 = extractelement <4 x float> %37, i32 3
-  %39 = fmul float %0, %38
-  %40 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %41 = extractelement <4 x float> %40, i32 0
-  %42 = fmul float %1, %41
-  %43 = fadd float %42, %30
-  %44 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %45 = extractelement <4 x float> %44, i32 1
-  %46 = fmul float %1, %45
-  %47 = fadd float %46, %33
-  %48 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %49 = extractelement <4 x float> %48, i32 2
-  %50 = fmul float %1, %49
-  %51 = fadd float %50, %36
-  %52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
-  %53 = extractelement <4 x float> %52, i32 3
-  %54 = fmul float %1, %53
-  %55 = fadd float %54, %39
-  %56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %57 = extractelement <4 x float> %56, i32 0
-  %58 = fmul float %2, %57
-  %59 = fadd float %58, %43
-  %60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %61 = extractelement <4 x float> %60, i32 1
-  %62 = fmul float %2, %61
-  %63 = fadd float %62, %47
-  %64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %65 = extractelement <4 x float> %64, i32 2
-  %66 = fmul float %2, %65
-  %67 = fadd float %66, %51
-  %68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
-  %69 = extractelement <4 x float> %68, i32 3
-  %70 = fmul float %2, %69
-  %71 = fadd float %70, %55
-  %72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
-  %73 = extractelement <4 x float> %72, i32 0
-  %74 = fmul float %3, %73
-  %75 = fadd float %74, %59
-  %76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
-  %77 = extractelement <4 x float> %76, i32 1
-  %78 = fmul float %3, %77
-  %79 = fadd float %78, %63
-  %80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
-  %81 = extractelement <4 x float> %80, i32 2
-  %82 = fmul float %3, %81
-  %83 = fadd float %82, %67
-  %84 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
-  %85 = extractelement <4 x float> %84, i32 3
-  %86 = fmul float %3, %85
-  %87 = fadd float %86, %71
-  %88 = insertelement <4 x float> undef, float %4, i32 0
-  %89 = insertelement <4 x float> %88, float %5, i32 1
-  %90 = insertelement <4 x float> %89, float %6, i32 2
-  %91 = insertelement <4 x float> %90, float 0.000000e+00, i32 3
-  %92 = insertelement <4 x float> undef, float %4, i32 0
-  %93 = insertelement <4 x float> %92, float %5, i32 1
-  %94 = insertelement <4 x float> %93, float %6, i32 2
-  %95 = insertelement <4 x float> %94, float 0.000000e+00, i32 3
-  %96 = call float @llvm.r600.dot4(<4 x float> %91, <4 x float> %95)
-  %97 = call float @llvm.fabs.f32(float %96)
-  %98 = call float @llvm.r600.recipsqrt.clamped.f32(float %97)
-  %99 = fmul float %4, %98
-  %100 = fmul float %5, %98
-  %101 = fmul float %6, %98
-  %102 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
-  %103 = extractelement <4 x float> %102, i32 0
-  %104 = fmul float %103, %8
-  %105 = fadd float %104, %20
-  %106 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
-  %107 = extractelement <4 x float> %106, i32 1
-  %108 = fmul float %107, %9
-  %109 = fadd float %108, %21
-  %110 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
-  %111 = extractelement <4 x float> %110, i32 2
-  %112 = fmul float %111, %10
-  %113 = fadd float %112, %22
-  %114 = call float @llvm.AMDGPU.clamp.f32(float %105, float 0.000000e+00, float 1.000000e+00)
-  %115 = call float @llvm.AMDGPU.clamp.f32(float %109, float 0.000000e+00, float 1.000000e+00)
-  %116 = call float @llvm.AMDGPU.clamp.f32(float %113, float 0.000000e+00, float 1.000000e+00)
-  %117 = call float @llvm.AMDGPU.clamp.f32(float %15, float 0.000000e+00, float 1.000000e+00)
-  %118 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
-  %119 = extractelement <4 x float> %118, i32 0
-  %120 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
-  %121 = extractelement <4 x float> %120, i32 1
-  %122 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
-  %123 = extractelement <4 x float> %122, i32 2
-  %124 = insertelement <4 x float> undef, float %99, i32 0
-  %125 = insertelement <4 x float> %124, float %100, i32 1
-  %126 = insertelement <4 x float> %125, float %101, i32 2
-  %127 = insertelement <4 x float> %126, float 0.000000e+00, i32 3
-  %128 = insertelement <4 x float> undef, float %119, i32 0
-  %129 = insertelement <4 x float> %128, float %121, i32 1
-  %130 = insertelement <4 x float> %129, float %123, i32 2
-  %131 = insertelement <4 x float> %130, float 0.000000e+00, i32 3
-  %132 = call float @llvm.r600.dot4(<4 x float> %127, <4 x float> %131)
-  %133 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
-  %134 = extractelement <4 x float> %133, i32 0
-  %135 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
-  %136 = extractelement <4 x float> %135, i32 1
-  %137 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
-  %138 = extractelement <4 x float> %137, i32 2
-  %139 = insertelement <4 x float> undef, float %99, i32 0
-  %140 = insertelement <4 x float> %139, float %100, i32 1
-  %141 = insertelement <4 x float> %140, float %101, i32 2
-  %142 = insertelement <4 x float> %141, float 0.000000e+00, i32 3
-  %143 = insertelement <4 x float> undef, float %134, i32 0
-  %144 = insertelement <4 x float> %143, float %136, i32 1
-  %145 = insertelement <4 x float> %144, float %138, i32 2
-  %146 = insertelement <4 x float> %145, float 0.000000e+00, i32 3
-  %147 = call float @llvm.r600.dot4(<4 x float> %142, <4 x float> %146)
-  %148 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
-  %149 = extractelement <4 x float> %148, i32 0
-  %150 = fmul float %149, %8
-  %151 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
-  %152 = extractelement <4 x float> %151, i32 1
-  %153 = fmul float %152, %9
-  %154 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
-  %155 = extractelement <4 x float> %154, i32 2
-  %156 = fmul float %155, %10
-  %157 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
-  %158 = extractelement <4 x float> %157, i32 0
-  %159 = fmul float %158, %12
-  %160 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
-  %161 = extractelement <4 x float> %160, i32 1
-  %162 = fmul float %161, %13
-  %163 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
-  %164 = extractelement <4 x float> %163, i32 2
-  %165 = fmul float %164, %14
-  %166 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
-  %167 = extractelement <4 x float> %166, i32 0
-  %168 = fmul float %167, %16
-  %169 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
-  %170 = extractelement <4 x float> %169, i32 1
-  %171 = fmul float %170, %17
-  %172 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
-  %173 = extractelement <4 x float> %172, i32 2
-  %174 = fmul float %173, %18
-  %175 = fcmp uge float %132, 0.000000e+00
-  %176 = select i1 %175, float %132, float 0.000000e+00
-  %177 = fcmp uge float %147, 0.000000e+00
-  %178 = select i1 %177, float %147, float 0.000000e+00
-  %179 = call float @llvm.pow.f32(float %178, float %24)
-  %180 = fcmp ult float %132, 0.000000e+00
-  %181 = select i1 %180, float 0.000000e+00, float %179
-  %182 = fadd float %150, %105
-  %183 = fadd float %153, %109
-  %184 = fadd float %156, %113
-  %185 = fmul float %176, %159
-  %186 = fadd float %185, %182
-  %187 = fmul float %176, %162
-  %188 = fadd float %187, %183
-  %189 = fmul float %176, %165
-  %190 = fadd float %189, %184
-  %191 = fmul float %181, %168
-  %192 = fadd float %191, %186
-  %193 = fmul float %181, %171
-  %194 = fadd float %193, %188
-  %195 = fmul float %181, %174
-  %196 = fadd float %195, %190
-  %197 = call float @llvm.AMDGPU.clamp.f32(float %192, float 0.000000e+00, float 1.000000e+00)
-  %198 = call float @llvm.AMDGPU.clamp.f32(float %194, float 0.000000e+00, float 1.000000e+00)
-  %199 = call float @llvm.AMDGPU.clamp.f32(float %196, float 0.000000e+00, float 1.000000e+00)
-  %200 = insertelement <4 x float> undef, float %75, i32 0
-  %201 = insertelement <4 x float> %200, float %79, i32 1
-  %202 = insertelement <4 x float> %201, float %83, i32 2
-  %203 = insertelement <4 x float> %202, float %87, i32 3
-  call void @llvm.r600.store.swizzle(<4 x float> %203, i32 60, i32 1)
-  %204 = insertelement <4 x float> undef, float %197, i32 0
-  %205 = insertelement <4 x float> %204, float %198, i32 1
-  %206 = insertelement <4 x float> %205, float %199, i32 2
-  %207 = insertelement <4 x float> %206, float %117, i32 3
-  call void @llvm.r600.store.swizzle(<4 x float> %207, i32 0, i32 2)
+  %tmp = extractelement <4 x float> %reg1, i32 0
+  %tmp13 = extractelement <4 x float> %reg1, i32 1
+  %tmp14 = extractelement <4 x float> %reg1, i32 2
+  %tmp15 = extractelement <4 x float> %reg1, i32 3
+  %tmp16 = extractelement <4 x float> %reg2, i32 0
+  %tmp17 = extractelement <4 x float> %reg2, i32 1
+  %tmp18 = extractelement <4 x float> %reg2, i32 2
+  %tmp19 = extractelement <4 x float> %reg2, i32 3
+  %tmp20 = extractelement <4 x float> %reg3, i32 0
+  %tmp21 = extractelement <4 x float> %reg3, i32 1
+  %tmp22 = extractelement <4 x float> %reg3, i32 2
+  %tmp23 = extractelement <4 x float> %reg3, i32 3
+  %tmp24 = extractelement <4 x float> %reg4, i32 0
+  %tmp25 = extractelement <4 x float> %reg4, i32 1
+  %tmp26 = extractelement <4 x float> %reg4, i32 2
+  %tmp27 = extractelement <4 x float> %reg4, i32 3
+  %tmp28 = extractelement <4 x float> %reg5, i32 0
+  %tmp29 = extractelement <4 x float> %reg5, i32 1
+  %tmp30 = extractelement <4 x float> %reg5, i32 2
+  %tmp31 = extractelement <4 x float> %reg5, i32 3
+  %tmp32 = extractelement <4 x float> %reg6, i32 0
+  %tmp33 = extractelement <4 x float> %reg6, i32 1
+  %tmp34 = extractelement <4 x float> %reg6, i32 2
+  %tmp35 = extractelement <4 x float> %reg6, i32 3
+  %tmp36 = extractelement <4 x float> %reg7, i32 0
+  %tmp37 = extractelement <4 x float> %reg7, i32 1
+  %tmp38 = extractelement <4 x float> %reg7, i32 2
+  %tmp39 = extractelement <4 x float> %reg7, i32 3
+  %tmp40 = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp41 = extractelement <4 x float> %tmp40, i32 0
+  %tmp42 = fmul float %tmp, %tmp41
+  %tmp43 = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp44 = extractelement <4 x float> %tmp43, i32 1
+  %tmp45 = fmul float %tmp, %tmp44
+  %tmp46 = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp47 = extractelement <4 x float> %tmp46, i32 2
+  %tmp48 = fmul float %tmp, %tmp47
+  %tmp49 = load <4 x float>, <4 x float> addrspace(8)* null
+  %tmp50 = extractelement <4 x float> %tmp49, i32 3
+  %tmp51 = fmul float %tmp, %tmp50
+  %tmp52 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp53 = extractelement <4 x float> %tmp52, i32 0
+  %tmp54 = fmul float %tmp13, %tmp53
+  %tmp55 = fadd float %tmp54, %tmp42
+  %tmp56 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp57 = extractelement <4 x float> %tmp56, i32 1
+  %tmp58 = fmul float %tmp13, %tmp57
+  %tmp59 = fadd float %tmp58, %tmp45
+  %tmp60 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp61 = extractelement <4 x float> %tmp60, i32 2
+  %tmp62 = fmul float %tmp13, %tmp61
+  %tmp63 = fadd float %tmp62, %tmp48
+  %tmp64 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
+  %tmp65 = extractelement <4 x float> %tmp64, i32 3
+  %tmp66 = fmul float %tmp13, %tmp65
+  %tmp67 = fadd float %tmp66, %tmp51
+  %tmp68 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp69 = extractelement <4 x float> %tmp68, i32 0
+  %tmp70 = fmul float %tmp14, %tmp69
+  %tmp71 = fadd float %tmp70, %tmp55
+  %tmp72 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp73 = extractelement <4 x float> %tmp72, i32 1
+  %tmp74 = fmul float %tmp14, %tmp73
+  %tmp75 = fadd float %tmp74, %tmp59
+  %tmp76 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp77 = extractelement <4 x float> %tmp76, i32 2
+  %tmp78 = fmul float %tmp14, %tmp77
+  %tmp79 = fadd float %tmp78, %tmp63
+  %tmp80 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
+  %tmp81 = extractelement <4 x float> %tmp80, i32 3
+  %tmp82 = fmul float %tmp14, %tmp81
+  %tmp83 = fadd float %tmp82, %tmp67
+  %tmp84 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %tmp85 = extractelement <4 x float> %tmp84, i32 0
+  %tmp86 = fmul float %tmp15, %tmp85
+  %tmp87 = fadd float %tmp86, %tmp71
+  %tmp88 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %tmp89 = extractelement <4 x float> %tmp88, i32 1
+  %tmp90 = fmul float %tmp15, %tmp89
+  %tmp91 = fadd float %tmp90, %tmp75
+  %tmp92 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %tmp93 = extractelement <4 x float> %tmp92, i32 2
+  %tmp94 = fmul float %tmp15, %tmp93
+  %tmp95 = fadd float %tmp94, %tmp79
+  %tmp96 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 3)
+  %tmp97 = extractelement <4 x float> %tmp96, i32 3
+  %tmp98 = fmul float %tmp15, %tmp97
+  %tmp99 = fadd float %tmp98, %tmp83
+  %tmp100 = insertelement <4 x float> undef, float %tmp16, i32 0
+  %tmp101 = insertelement <4 x float> %tmp100, float %tmp17, i32 1
+  %tmp102 = insertelement <4 x float> %tmp101, float %tmp18, i32 2
+  %tmp103 = insertelement <4 x float> %tmp102, float 0.000000e+00, i32 3
+  %tmp104 = insertelement <4 x float> undef, float %tmp16, i32 0
+  %tmp105 = insertelement <4 x float> %tmp104, float %tmp17, i32 1
+  %tmp106 = insertelement <4 x float> %tmp105, float %tmp18, i32 2
+  %tmp107 = insertelement <4 x float> %tmp106, float 0.000000e+00, i32 3
+  %tmp108 = call float @llvm.r600.dot4(<4 x float> %tmp103, <4 x float> %tmp107)
+  %tmp109 = call float @llvm.fabs.f32(float %tmp108)
+  %tmp110 = call float @llvm.r600.recipsqrt.clamped.f32(float %tmp109)
+  %tmp111 = fmul float %tmp16, %tmp110
+  %tmp112 = fmul float %tmp17, %tmp110
+  %tmp113 = fmul float %tmp18, %tmp110
+  %tmp114 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %tmp115 = extractelement <4 x float> %tmp114, i32 0
+  %tmp116 = fmul float %tmp115, %tmp20
+  %tmp117 = fadd float %tmp116, %tmp32
+  %tmp118 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %tmp119 = extractelement <4 x float> %tmp118, i32 1
+  %tmp120 = fmul float %tmp119, %tmp21
+  %tmp121 = fadd float %tmp120, %tmp33
+  %tmp122 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 4)
+  %tmp123 = extractelement <4 x float> %tmp122, i32 2
+  %tmp124 = fmul float %tmp123, %tmp22
+  %tmp125 = fadd float %tmp124, %tmp34
+  %max.0.i = call float @llvm.maxnum.f32(float %tmp117, float 0.000000e+00)
+  %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00)
+  %max.0.i11 = call float @llvm.maxnum.f32(float %tmp121, float 0.000000e+00)
+  %clamp.i12 = call float @llvm.minnum.f32(float %max.0.i11, float 1.000000e+00)
+  %max.0.i9 = call float @llvm.maxnum.f32(float %tmp125, float 0.000000e+00)
+  %clamp.i10 = call float @llvm.minnum.f32(float %max.0.i9, float 1.000000e+00)
+  %max.0.i7 = call float @llvm.maxnum.f32(float %tmp27, float 0.000000e+00)
+  %clamp.i8 = call float @llvm.minnum.f32(float %max.0.i7, float 1.000000e+00)
+  %tmp126 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %tmp127 = extractelement <4 x float> %tmp126, i32 0
+  %tmp128 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %tmp129 = extractelement <4 x float> %tmp128, i32 1
+  %tmp130 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 5)
+  %tmp131 = extractelement <4 x float> %tmp130, i32 2
+  %tmp132 = insertelement <4 x float> undef, float %tmp111, i32 0
+  %tmp133 = insertelement <4 x float> %tmp132, float %tmp112, i32 1
+  %tmp134 = insertelement <4 x float> %tmp133, float %tmp113, i32 2
+  %tmp135 = insertelement <4 x float> %tmp134, float 0.000000e+00, i32 3
+  %tmp136 = insertelement <4 x float> undef, float %tmp127, i32 0
+  %tmp137 = insertelement <4 x float> %tmp136, float %tmp129, i32 1
+  %tmp138 = insertelement <4 x float> %tmp137, float %tmp131, i32 2
+  %tmp139 = insertelement <4 x float> %tmp138, float 0.000000e+00, i32 3
+  %tmp140 = call float @llvm.r600.dot4(<4 x float> %tmp135, <4 x float> %tmp139)
+  %tmp141 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %tmp142 = extractelement <4 x float> %tmp141, i32 0
+  %tmp143 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %tmp144 = extractelement <4 x float> %tmp143, i32 1
+  %tmp145 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 7)
+  %tmp146 = extractelement <4 x float> %tmp145, i32 2
+  %tmp147 = insertelement <4 x float> undef, float %tmp111, i32 0
+  %tmp148 = insertelement <4 x float> %tmp147, float %tmp112, i32 1
+  %tmp149 = insertelement <4 x float> %tmp148, float %tmp113, i32 2
+  %tmp150 = insertelement <4 x float> %tmp149, float 0.000000e+00, i32 3
+  %tmp151 = insertelement <4 x float> undef, float %tmp142, i32 0
+  %tmp152 = insertelement <4 x float> %tmp151, float %tmp144, i32 1
+  %tmp153 = insertelement <4 x float> %tmp152, float %tmp146, i32 2
+  %tmp154 = insertelement <4 x float> %tmp153, float 0.000000e+00, i32 3
+  %tmp155 = call float @llvm.r600.dot4(<4 x float> %tmp150, <4 x float> %tmp154)
+  %tmp156 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+  %tmp157 = extractelement <4 x float> %tmp156, i32 0
+  %tmp158 = fmul float %tmp157, %tmp20
+  %tmp159 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+  %tmp160 = extractelement <4 x float> %tmp159, i32 1
+  %tmp161 = fmul float %tmp160, %tmp21
+  %tmp162 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 8)
+  %tmp163 = extractelement <4 x float> %tmp162, i32 2
+  %tmp164 = fmul float %tmp163, %tmp22
+  %tmp165 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %tmp166 = extractelement <4 x float> %tmp165, i32 0
+  %tmp167 = fmul float %tmp166, %tmp24
+  %tmp168 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %tmp169 = extractelement <4 x float> %tmp168, i32 1
+  %tmp170 = fmul float %tmp169, %tmp25
+  %tmp171 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 9)
+  %tmp172 = extractelement <4 x float> %tmp171, i32 2
+  %tmp173 = fmul float %tmp172, %tmp26
+  %tmp174 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+  %tmp175 = extractelement <4 x float> %tmp174, i32 0
+  %tmp176 = fmul float %tmp175, %tmp28
+  %tmp177 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+  %tmp178 = extractelement <4 x float> %tmp177, i32 1
+  %tmp179 = fmul float %tmp178, %tmp29
+  %tmp180 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 10)
+  %tmp181 = extractelement <4 x float> %tmp180, i32 2
+  %tmp182 = fmul float %tmp181, %tmp30
+  %tmp183 = fcmp uge float %tmp140, 0.000000e+00
+  %tmp184 = select i1 %tmp183, float %tmp140, float 0.000000e+00
+  %tmp185 = fcmp uge float %tmp155, 0.000000e+00
+  %tmp186 = select i1 %tmp185, float %tmp155, float 0.000000e+00
+  %tmp187 = call float @llvm.pow.f32(float %tmp186, float %tmp36)
+  %tmp188 = fcmp ult float %tmp140, 0.000000e+00
+  %tmp189 = select i1 %tmp188, float 0.000000e+00, float %tmp187
+  %tmp190 = fadd float %tmp158, %tmp117
+  %tmp191 = fadd float %tmp161, %tmp121
+  %tmp192 = fadd float %tmp164, %tmp125
+  %tmp193 = fmul float %tmp184, %tmp167
+  %tmp194 = fadd float %tmp193, %tmp190
+  %tmp195 = fmul float %tmp184, %tmp170
+  %tmp196 = fadd float %tmp195, %tmp191
+  %tmp197 = fmul float %tmp184, %tmp173
+  %tmp198 = fadd float %tmp197, %tmp192
+  %tmp199 = fmul float %tmp189, %tmp176
+  %tmp200 = fadd float %tmp199, %tmp194
+  %tmp201 = fmul float %tmp189, %tmp179
+  %tmp202 = fadd float %tmp201, %tmp196
+  %tmp203 = fmul float %tmp189, %tmp182
+  %tmp204 = fadd float %tmp203, %tmp198
+  %max.0.i5 = call float @llvm.maxnum.f32(float %tmp200, float 0.000000e+00)
+  %clamp.i6 = call float @llvm.minnum.f32(float %max.0.i5, float 1.000000e+00)
+  %max.0.i3 = call float @llvm.maxnum.f32(float %tmp202, float 0.000000e+00)
+  %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00)
+  %max.0.i1 = call float @llvm.maxnum.f32(float %tmp204, float 0.000000e+00)
+  %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00)
+  %tmp205 = insertelement <4 x float> undef, float %tmp87, i32 0
+  %tmp206 = insertelement <4 x float> %tmp205, float %tmp91, i32 1
+  %tmp207 = insertelement <4 x float> %tmp206, float %tmp95, i32 2
+  %tmp208 = insertelement <4 x float> %tmp207, float %tmp99, i32 3
+  call void @llvm.r600.store.swizzle(<4 x float> %tmp208, i32 60, i32 1)
+  %tmp209 = insertelement <4 x float> undef, float %clamp.i6, i32 0
+  %tmp210 = insertelement <4 x float> %tmp209, float %clamp.i4, i32 1
+  %tmp211 = insertelement <4 x float> %tmp210, float %clamp.i2, i32 2
+  %tmp212 = insertelement <4 x float> %tmp211, float %clamp.i8, i32 3
+  call void @llvm.r600.store.swizzle(<4 x float> %tmp212, i32 0, i32 2)
   ret void
 }
 
-; Function Attrs: readnone
-declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #1
-
-; Function Attrs: readonly
-declare float @llvm.fabs.f32(float) #1
-
-; Function Attrs: readnone
-declare float @llvm.r600.recipsqrt.clamped.f32(float) #1
-
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.clamp.f32(float, float, float) #1
-
-; Function Attrs: nounwind readonly
-declare float @llvm.pow.f32(float, float) #2
-
-declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) #3
+declare float @llvm.minnum.f32(float, float) #0
+declare float @llvm.maxnum.f32(float, float) #0
+declare float @llvm.r600.dot4(<4 x float>, <4 x float>) #0
+declare float @llvm.fabs.f32(float) #0
+declare float @llvm.r600.recipsqrt.clamped.f32(float) #0
+declare float @llvm.pow.f32(float, float) #0
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) #1
 
-attributes #1 = { nounwind readnone }
-attributes #2 = { nounwind readonly }
-attributes #3 = { nounwind }
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll b/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
index 461caf5b5d204573b05ec8c9c741f9696e271b0a..e2143ff85b72e70c81458f4d71c926fa17599e04 100644
--- a/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
+++ b/test/CodeGen/AMDGPU/r600-infinite-loop-bug-while-reorganizing-vector.ll
@@ -10,7 +10,7 @@ main_body:
   %tmp6 = insertelement <4 x float> %tmp5, float %tmp2, i32 1
   %tmp7 = insertelement <4 x float> %tmp6, float %tmp3, i32 2
   %tmp8 = insertelement <4 x float> %tmp7, float %tmp4, i32 3
-  %tmp9 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %tmp8)
+  %tmp9 = call <4 x float> @llvm.r600.cube(<4 x float> %tmp8)
   %tmp10 = extractelement <4 x float> %tmp9, i32 0
   %tmp11 = extractelement <4 x float> %tmp9, i32 1
   %tmp12 = extractelement <4 x float> %tmp9, i32 2
@@ -45,7 +45,7 @@ main_body:
 }
 
 ; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #0
+declare <4 x float> @llvm.r600.cube(<4 x float>) #0
 
 ; Function Attrs: readnone
 declare float @fabs(float) #0
diff --git a/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll b/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll
index 866a4a9191e27a9dfc9c06bd142bad0857b21c67..b7ed34bbf09b77262d66d0321421d2c24558d4d0 100644
--- a/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll
+++ b/test/CodeGen/AMDGPU/r600-legalize-umax-bug.ll
@@ -2,7 +2,7 @@
 ; Don't crash
 
 ; CHECK: MAX_UINT
-define void @test(i64 addrspace(1)* %out) {
+define amdgpu_kernel void @test(i64 addrspace(1)* %out) {
 bb:
   store i64 2, i64 addrspace(1)* %out
   %tmp = load i64, i64 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/r600.alu-limits.ll b/test/CodeGen/AMDGPU/r600.alu-limits.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2604ed4e574c043e43df370a2ab1395495a2b1e1
--- /dev/null
+++ b/test/CodeGen/AMDGPU/r600.alu-limits.ll
@@ -0,0 +1,29 @@
+; RUN: opt -loop-unroll -unroll-threshold=2000 -S < %s | llc -march=r600 -mcpu=cypress | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK: {{^}}alu_limits:
+; CHECK: CF_END
+
+%struct.foo = type {i32, i32, i32}
+
+define amdgpu_kernel void @alu_limits(i32 addrspace(1)* %out, %struct.foo* %in, i32 %offset) {
+entry:
+  %ptr = getelementptr inbounds %struct.foo, %struct.foo* %in, i32 1, i32 2
+  %x = load i32, i32 *%ptr, align 4
+  br label %loop
+loop:
+  %i = phi i32 [ 100, %entry ], [ %nexti, %loop ]
+  %val = phi i32 [ 1, %entry ], [ %nextval, %loop ]
+
+  %nexti = sub i32 %i, 1
+
+  %y = xor i32 %x, %i
+  %nextval = mul i32 %val, %y
+
+  %cond = icmp ne i32 %nexti, 0
+  br i1 %cond, label %loop, label %end
+end:
+  %out_val = add i32 %nextval, 4
+  store i32 %out_val, i32 addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/r600.amdgpu-alias-analysis.ll b/test/CodeGen/AMDGPU/r600.amdgpu-alias-analysis.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8956d113e8b50fdf4201daddddadb581aa88b04f
--- /dev/null
+++ b/test/CodeGen/AMDGPU/r600.amdgpu-alias-analysis.ll
@@ -0,0 +1,7 @@
+; RUN: opt -mtriple=r600-- -O3 -aa-eval -print-all-alias-modref-info -disable-output < %s 2>&1 | FileCheck %s
+
+; CHECK: NoAlias:      i8 addrspace(7)* %p1, i8* %p
+
+define amdgpu_kernel void @test(i8* %p, i8 addrspace(7)* %p1) {
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/r600.bitcast.ll b/test/CodeGen/AMDGPU/r600.bitcast.ll
index 49441ee8d1865104c6888ba7f7ac8642d2720746..acf7a66a357fc03c96909c5f3d6013fb4928f190 100644
--- a/test/CodeGen/AMDGPU/r600.bitcast.ll
+++ b/test/CodeGen/AMDGPU/r600.bitcast.ll
@@ -8,7 +8,7 @@
 ; EG: VTX_READ_128 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
 ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
 ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @i8ptr_v16i8ptr(<16 x i8> addrspace(1)* %out, i8 addrspace(1)* %in) {
 entry:
   %0 = bitcast i8 addrspace(1)* %in to <16 x i8> addrspace(1)*
   %1 = load <16 x i8>, <16 x i8> addrspace(1)* %0
@@ -21,7 +21,7 @@ entry:
 ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
 ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
 ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %load = load float, float addrspace(1)* %in, align 4
   %bc = bitcast float %load to <2 x i16>
   store <2 x i16> %bc, <2 x i16> addrspace(1)* %out, align 4
@@ -33,7 +33,7 @@ define void @f32_to_v2i16(<2 x i16> addrspace(1)* %out, float addrspace(1)* %in)
 ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
 ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
 ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
   %load = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
   %bc = bitcast <2 x i16> %load to float
   store float %bc, float addrspace(1)* %out, align 4
@@ -45,7 +45,7 @@ define void @v2i16_to_f32(float addrspace(1)* %out, <2 x i16> addrspace(1)* %in)
 ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
 ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
 ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nounwind {
   %load = load <4 x i8>, <4 x i8> addrspace(1)* %in, align 4
   %bc = bitcast <4 x i8> %load to i32
   store i32 %bc, i32 addrspace(1)* %out, align 4
@@ -57,7 +57,7 @@ define void @v4i8_to_i32(i32 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) nou
 ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
 ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
 ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %load = load i32, i32 addrspace(1)* %in, align 4
   %bc = bitcast i32 %load to <4 x i8>
   store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4
@@ -69,7 +69,7 @@ define void @i32_to_v4i8(<4 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nou
 ; EG: VTX_READ_32 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
 ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
 ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @v2i16_to_v4i8(<4 x i8> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v2i16_to_v4i8(<4 x i8> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) nounwind {
   %load = load <2 x i16>, <2 x i16>  addrspace(1)* %in, align 4
   %bc = bitcast <2 x i16> %load to <4 x i8>
   store <4 x i8> %bc, <4 x i8> addrspace(1)* %out, align 4
@@ -85,7 +85,7 @@ define void @v2i16_to_v4i8(<4 x i8> addrspace(1)* %out, <2 x i16> addrspace(1)*
 ; EG: VTX_READ_16
 ; EG-DAG: BFE_UINT
 ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @v4i16_extract_i8(i8 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v4i16_extract_i8(i8 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) nounwind {
   %load = load <4 x i16>, <4 x i16>  addrspace(1)* %in, align 2
   %bc = bitcast <4 x i16> %load to <8 x i8>
   %element = extractelement <8 x i8> %bc, i32 5
@@ -98,7 +98,7 @@ define void @v4i16_extract_i8(i8 addrspace(1)* %out, <4 x i16> addrspace(1)* %in
 ; EG: VTX_READ_64 [[DATA]], [[LD_PTR:T[0-9]+\.[XYZW]]]
 ; EG-DAG: MOV {{[\* ]*}}[[LD_PTR]], KC0[2].Z
 ; EG-DAG: LSHR {{[\* ]*}}[[ST_PTR]], KC0[2].Y, literal
-define void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @bitcast_v2i32_to_f64(double addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %val = load <2 x i32>, <2 x i32> addrspace(1)* %in, align 8
   %bc = bitcast <2 x i32> %val to double
   store double %bc, double addrspace(1)* %out, align 8
diff --git a/test/CodeGen/AMDGPU/r600.global_atomics.ll b/test/CodeGen/AMDGPU/r600.global_atomics.ll
index 7047c635dff3c55dd6239f56e7973c3f4f2fc179..1ddc41feb00692ab58de4145772e4ce19b778f84 100644
--- a/test/CodeGen/AMDGPU/r600.global_atomics.ll
+++ b/test/CodeGen/AMDGPU/r600.global_atomics.ll
@@ -6,7 +6,7 @@
 ; FUNC-LABEL: {{^}}atomic_add_i32_offset:
 ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -16,7 +16,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_add_i32_soffset:
 ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_soffset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 9000
   %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -27,7 +27,7 @@ entry:
 ; FIXME: looks like the offset is wrong
 ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32_huge_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 47224239175595
 
@@ -38,7 +38,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_add_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -49,7 +49,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_add_i32:
 ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_add_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile add i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -58,7 +58,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_add_i32_addr64:
 ; EG: MEM_RAT ATOMIC_ADD [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_add_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile add i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -68,7 +68,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_and_i32_offset:
 ; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile and i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -78,7 +78,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_and_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -89,7 +89,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_and_i32:
 ; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_and_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile and i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -98,7 +98,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_and_i32_addr64:
 ; EG: MEM_RAT ATOMIC_AND [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_and_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile and i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -108,7 +108,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_sub_i32_offset:
 ; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -118,7 +118,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -129,7 +129,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_sub_i32:
 ; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_sub_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile sub i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -138,7 +138,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_sub_i32_addr64:
 ; EG: MEM_RAT ATOMIC_SUB [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_sub_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile sub i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -148,7 +148,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_max_i32_offset:
 ; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile max i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -158,7 +158,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_max_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -169,7 +169,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_max_i32:
 ; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_max_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile max i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -178,7 +178,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_max_i32_addr64:
 ; EG: MEM_RAT ATOMIC_MAX_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_max_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile max i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -188,7 +188,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_umax_i32_offset:
 ; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile umax i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -198,7 +198,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -209,7 +209,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_umax_i32:
 ; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umax_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile umax i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -218,7 +218,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_umax_i32_addr64:
 ; EG: MEM_RAT ATOMIC_MAX_UINT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umax_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile umax i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -228,7 +228,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_min_i32_offset:
 ; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile min i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -238,7 +238,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_min_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -249,7 +249,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_min_i32:
 ; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_min_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile min i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -258,7 +258,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_min_i32_addr64:
 ; EG: MEM_RAT ATOMIC_MIN_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_min_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile min i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -268,7 +268,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_umin_i32_offset:
 ; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile umin i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -278,7 +278,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -289,7 +289,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_umin_i32:
 ; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_umin_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile umin i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -298,7 +298,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_umin_i32_addr64:
 ; EG: MEM_RAT ATOMIC_MIN_UINT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_umin_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile umin i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -308,7 +308,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_or_i32_offset:
 ; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile or i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -318,7 +318,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_or_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -329,7 +329,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_or_i32:
 ; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_or_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile or i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -338,7 +338,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_or_i32_addr64:
 ; EG: MEM_RAT ATOMIC_OR [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_or_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile or i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -348,7 +348,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_offset:
 ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile xchg i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -358,7 +358,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -369,7 +369,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_xchg_i32:
 ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xchg_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xchg i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -378,7 +378,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_xchg_i32_addr64:
 ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile xchg i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -388,7 +388,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_offset:
 ; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_offset(i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = cmpxchg volatile i32 addrspace(1)* %gep, i32 %old, i32 %in seq_cst seq_cst
@@ -398,7 +398,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -409,7 +409,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32:
 ; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32(i32 addrspace(1)* %out, i32 %in, i32 %old) {
 entry:
   %val = cmpxchg volatile i32 addrspace(1)* %out, i32 %old, i32 %in seq_cst seq_cst
   ret void
@@ -418,7 +418,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_cmpxchg_i32_addr64:
 ; EG: MEM_RAT ATOMIC_CMPXCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
+define amdgpu_kernel void @atomic_cmpxchg_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index, i32 %old) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = cmpxchg volatile i32 addrspace(1)* %ptr, i32 %old, i32 %in seq_cst seq_cst
@@ -428,7 +428,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_xor_i32_offset:
 ; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32_offset(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile xor i32 addrspace(1)* %gep, i32 %in seq_cst
@@ -438,7 +438,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64_offset(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -449,7 +449,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_xor_i32:
 ; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @atomic_xor_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %val = atomicrmw volatile xor i32 addrspace(1)* %out, i32 %in seq_cst
   ret void
@@ -458,7 +458,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_xor_i32_addr64:
 ; EG: MEM_RAT ATOMIC_XOR [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Z
-define void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
+define amdgpu_kernel void @atomic_xor_i32_addr64(i32 addrspace(1)* %out, i32 %in, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %val = atomicrmw volatile xor i32 addrspace(1)* %ptr, i32 %in seq_cst
@@ -468,7 +468,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_store_i32_offset:
 ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y
-define void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i32_offset(i32 %in, i32 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   store atomic i32 %in, i32 addrspace(1)* %gep  seq_cst, align 4
@@ -478,7 +478,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_store_i32:
 ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y
-define void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_store_i32(i32 %in, i32 addrspace(1)* %out) {
 entry:
   store atomic i32 %in, i32 addrspace(1)* %out seq_cst, align 4
   ret void
@@ -487,7 +487,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_store_i32_addr64_offset:
 ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y
-define void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64_offset(i32 %in, i32 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i64 4
@@ -498,7 +498,7 @@ entry:
 ; FUNC-LABEL: {{^}}atomic_store_i32_addr64:
 ; EG: MEM_RAT ATOMIC_XCHG_INT [[REG:T[0-9]+]]
 ; EG: MOV{{[ *]*}}[[REG]].X, KC0[2].Y
-define void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) {
+define amdgpu_kernel void @atomic_store_i32_addr64(i32 %in, i32 addrspace(1)* %out, i64 %index) {
 entry:
   %ptr = getelementptr i32, i32 addrspace(1)* %out, i64 %index
   store atomic i32 %in, i32 addrspace(1)* %ptr seq_cst, align 4
@@ -507,7 +507,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_inc_add
 ; EG: MEM_RAT ATOMIC_INC_UINT
-define void @atomic_inc_add(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_inc_add(i32 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 1 seq_cst
@@ -516,7 +516,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_dec_add
 ; EG: MEM_RAT ATOMIC_DEC_UINT
-define void @atomic_dec_add(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_dec_add(i32 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile add i32 addrspace(1)* %gep, i32 -1 seq_cst
@@ -525,7 +525,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_inc_sub
 ; EG: MEM_RAT ATOMIC_INC_UINT
-define void @atomic_inc_sub(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_inc_sub(i32 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 -1 seq_cst
@@ -534,7 +534,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}atomic_dec_sub
 ; EG: MEM_RAT ATOMIC_DEC_UINT
-define void @atomic_dec_sub(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @atomic_dec_sub(i32 addrspace(1)* %out) {
 entry:
   %gep = getelementptr i32, i32 addrspace(1)* %out, i64 4
   %val = atomicrmw volatile sub i32 addrspace(1)* %gep, i32 1 seq_cst
diff --git a/test/CodeGen/AMDGPU/r600.private-memory.ll b/test/CodeGen/AMDGPU/r600.private-memory.ll
index f406c160cbbeffd24131bd63e47f4f694fce9e52..53ee214f07ece1b97137aef830e214c7dbabb0d7 100644
--- a/test/CodeGen/AMDGPU/r600.private-memory.ll
+++ b/test/CodeGen/AMDGPU/r600.private-memory.ll
@@ -10,7 +10,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 ; Additional check in case the move ends up in the last slot
 ; R600-NOT: MOV * TO.X
 
-define void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @work_item_info(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = alloca [2 x i32]
   %1 = getelementptr [2 x i32], [2 x i32]* %0, i32 0, i32 0
diff --git a/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll b/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
index a34a48e3b7baa710bac5fcb9ad163a83782f6ac8..9eee9a6effc92cb3ea7b7cf3a9869ff0867ebe9d 100644
--- a/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
+++ b/test/CodeGen/AMDGPU/r600.work-item-intrinsics.ll
@@ -2,7 +2,7 @@
 
 ; FUNC-LABEL: {{^}}tgid_x:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T1.X
-define void @tgid_x(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_x(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -11,7 +11,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}tgid_y:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T1.Y
-define void @tgid_y(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_y(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -20,7 +20,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}tgid_z:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T1.Z
-define void @tgid_z(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tgid_z(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tgid.z() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -29,7 +29,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}tidig_x:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T0.X
-define void @tidig_x(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_x(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.x() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -38,7 +38,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}tidig_y:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T0.Y
-define void @tidig_y(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_y(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.y() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -47,7 +47,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}tidig_z:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T0.Z
-define void @tidig_z(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @tidig_z(i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.r600.read.tidig.z() #0
   store i32 %0, i32 addrspace(1)* %out
@@ -57,7 +57,7 @@ entry:
 ; FUNC-LABEL: {{^}}test_implicit:
 ; 36 prepended implicit bytes + 4(out pointer) + 4*4 = 56
 ; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 56
-define void @test_implicit(i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @test_implicit(i32 addrspace(1)* %out) #1 {
   %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr()
   %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)*
   %gep = getelementptr i32, i32 addrspace(7)* %header.ptr, i32 4
@@ -69,7 +69,7 @@ define void @test_implicit(i32 addrspace(1)* %out) #1 {
 ; FUNC-LABEL: {{^}}test_implicit_dyn:
 ; 36 prepended implicit bytes + 8(out pointer + in) = 44
 ; EG: VTX_READ_32 {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}}, 44
-define void @test_implicit_dyn(i32 addrspace(1)* %out, i32 %in) #1 {
+define amdgpu_kernel void @test_implicit_dyn(i32 addrspace(1)* %out, i32 %in) #1 {
   %implicitarg.ptr = call noalias i8 addrspace(7)* @llvm.r600.implicitarg.ptr()
   %header.ptr = bitcast i8 addrspace(7)* %implicitarg.ptr to i32 addrspace(7)*
   %gep = getelementptr i32, i32 addrspace(7)* %header.ptr, i32 %in
diff --git a/test/CodeGen/AMDGPU/rcp-pattern.ll b/test/CodeGen/AMDGPU/rcp-pattern.ll
index b7cc6d47cd87972134bc58e010ca5a1dee0d8368..fbdaeb82929742c7a3c9319b2906b65d7dd52b82 100644
--- a/test/CodeGen/AMDGPU/rcp-pattern.ll
+++ b/test/CodeGen/AMDGPU/rcp-pattern.ll
@@ -9,7 +9,7 @@
 ; GCN: buffer_store_dword [[RCP]]
 
 ; EG: RECIP_IEEE
-define void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
   %rcp = fdiv float 1.0, %src
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -21,7 +21,7 @@ define void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
 ; GCN: buffer_store_dword [[RCP]]
 
 ; EG: RECIP_IEEE
-define void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
   %rcp = fdiv float 1.0, %src, !fpmath !0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -33,7 +33,7 @@ define void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
 ; GCN: buffer_store_dword [[RCP]]
 
 ; EG: RECIP_IEEE
-define void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
   %rcp = fdiv fast float 1.0, %src, !fpmath !0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -45,7 +45,7 @@ define void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
 ; GCN: buffer_store_dword [[RCP]]
 
 ; EG: RECIP_IEEE
-define void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
   %rcp = fdiv arcp float 1.0, %src, !fpmath !0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -57,7 +57,7 @@ define void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 {
 ; GCN: buffer_store_dword [[RCP]]
 
 ; EG: RECIP_IEEE
-define void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #2 {
+define amdgpu_kernel void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #2 {
   %rcp = fdiv float 1.0, %src, !fpmath !0
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -69,7 +69,7 @@ define void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src)
 ; GCN: buffer_store_dword [[RCP]]
 
 ; EG: RECIP_IEEE
-define void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 {
   %src.fabs = call float @llvm.fabs.f32(float %src)
   %rcp = fdiv float 1.0, %src.fabs
   store float %rcp, float addrspace(1)* %out, align 4
@@ -82,7 +82,7 @@ define void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 {
 ; GCN: buffer_store_dword [[RCP]]
 
 ; EG: RECIP_IEEE
-define void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
   %rcp = fdiv float -1.0, %src
   store float %rcp, float addrspace(1)* %out, align 4
   ret void
@@ -92,7 +92,7 @@ define void @neg_rcp_pat_f32(float addrspace(1)* %out, float %src) #0 {
 ; GCN: s_load_dword [[SRC:s[0-9]+]]
 ; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], -|[[SRC]]|
 ; GCN: buffer_store_dword [[RCP]]
-define void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 {
   %src.fabs = call float @llvm.fabs.f32(float %src)
   %src.fabs.fneg = fsub float -0.0, %src.fabs
   %rcp = fdiv float 1.0, %src.fabs.fneg
@@ -106,7 +106,7 @@ define void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 {
 ; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[SRC]], -|[[SRC]]|
 ; GCN: buffer_store_dword [[RCP]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %src) #0 {
+define amdgpu_kernel void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %src) #0 {
   %src.fabs = call float @llvm.fabs.f32(float %src)
   %src.fabs.fneg = fsub float -0.0, %src.fabs
   %rcp = fdiv float 1.0, %src.fabs.fneg
@@ -117,6 +117,35 @@ define void @rcp_fabs_fneg_pat_multi_use_f32(float addrspace(1)* %out, float %sr
   ret void
 }
 
+; FUNC-LABEL: {{^}}div_arcp_2_x_pat_f32:
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0.5, v{{[0-9]+}}
+; GCN: buffer_store_dword [[MUL]]
+define amdgpu_kernel void @div_arcp_2_x_pat_f32(float addrspace(1)* %out) #0 {
+  %x = load float, float addrspace(1)* undef
+  %rcp = fdiv arcp float %x, 2.0
+  store float %rcp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}div_arcp_k_x_pat_f32:
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0x3dcccccd, v{{[0-9]+}}
+; GCN: buffer_store_dword [[MUL]]
+define amdgpu_kernel void @div_arcp_k_x_pat_f32(float addrspace(1)* %out) #0 {
+  %x = load float, float addrspace(1)* undef
+  %rcp = fdiv arcp float %x, 10.0
+  store float %rcp, float addrspace(1)* %out, align 4
+  ret void
+}
+
+; FUNC-LABEL: {{^}}div_arcp_neg_k_x_pat_f32:
+; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 0xbdcccccd, v{{[0-9]+}}
+; GCN: buffer_store_dword [[MUL]]
+define amdgpu_kernel void @div_arcp_neg_k_x_pat_f32(float addrspace(1)* %out) #0 {
+  %x = load float, float addrspace(1)* undef
+  %rcp = fdiv arcp float %x, -10.0
+  store float %rcp, float addrspace(1)* %out, align 4
+  ret void
+}
 
 declare float @llvm.fabs.f32(float) #1
 declare float @llvm.sqrt.f32(float) #1
diff --git a/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll b/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
index a5581d73cb2596e35234cf2bb8369ac757bdb21f..34cbe3963361effaece948dabed5eac8fe816432 100644
--- a/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
+++ b/test/CodeGen/AMDGPU/read-register-invalid-subtarget.ll
@@ -4,7 +4,7 @@
 
 declare i32 @llvm.read_register.i32(metadata) #0
 
-define void @test_invalid_read_flat_scratch_lo(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @test_invalid_read_flat_scratch_lo(i32 addrspace(1)* %out) nounwind {
   store volatile i32 0, i32 addrspace(3)* undef
   %m0 = call i32 @llvm.read_register.i32(metadata !0)
   store i32 %m0, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll b/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
index 2617ad7402ff2f5523075270f500ab46ec75c87a..6417d28e7aad6e0ef92467fd3c91d5fd5c5c7ff2 100644
--- a/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
+++ b/test/CodeGen/AMDGPU/read-register-invalid-type-i32.ll
@@ -4,7 +4,7 @@
 
 declare i32 @llvm.read_register.i32(metadata) #0
 
-define void @test_invalid_read_exec(i32 addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @test_invalid_read_exec(i32 addrspace(1)* %out) nounwind {
   store volatile i32 0, i32 addrspace(3)* undef
   %m0 = call i32 @llvm.read_register.i32(metadata !0)
   store i32 %m0, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll b/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
index dcde8a1894fcde98a37d47f554815dfcd2f89b4a..8e248fdfea4ce81a741410e176c7aff5f0736436 100644
--- a/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
+++ b/test/CodeGen/AMDGPU/read-register-invalid-type-i64.ll
@@ -4,7 +4,7 @@
 
 declare i64 @llvm.read_register.i64(metadata) #0
 
-define void @test_invalid_read_m0(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_invalid_read_m0(i64 addrspace(1)* %out) #0 {
   %exec = call i64 @llvm.read_register.i64(metadata !0)
   store i64 %exec, i64 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/read_register.ll b/test/CodeGen/AMDGPU/read_register.ll
index 601a0adb812297518c4b3ef30e33fade18b86395..8fe9e7f3f111d18edae9c0e527847efdcb492eee 100644
--- a/test/CodeGen/AMDGPU/read_register.ll
+++ b/test/CodeGen/AMDGPU/read_register.ll
@@ -9,7 +9,7 @@ declare i64 @llvm.read_register.i64(metadata) #0
 ; CHECK: s_mov_b32 [[COPY_M0:s[0-9]+]], m0
 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], [[COPY_M0]]
 ; CHECK: buffer_store_dword [[COPY]]
-define void @test_read_m0(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_m0(i32 addrspace(1)* %out) #0 {
   store volatile i32 0, i32 addrspace(3)* undef
   %m0 = call i32 @llvm.read_register.i32(metadata !0)
   store i32 %m0, i32 addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @test_read_m0(i32 addrspace(1)* %out) #0 {
 ; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], exec_lo
 ; CHECK: v_mov_b32_e32 v[[HI:[0-9]+]], exec_hi
 ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_read_exec(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_exec(i64 addrspace(1)* %out) #0 {
   %exec = call i64 @llvm.read_register.i64(metadata !1)
   store i64 %exec, i64 addrspace(1)* %out
   ret void
@@ -30,7 +30,7 @@ define void @test_read_exec(i64 addrspace(1)* %out) #0 {
 ; CHECK: v_mov_b32_e32 v[[LO:[0-9]+]], flat_scratch_lo
 ; CHECK: v_mov_b32_e32 v[[HI:[0-9]+]], flat_scratch_hi
 ; CHECK: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 {
   %flat_scratch = call i64 @llvm.read_register.i64(metadata !2)
   store i64 %flat_scratch, i64 addrspace(1)* %out
   ret void
@@ -39,7 +39,7 @@ define void @test_read_flat_scratch(i64 addrspace(1)* %out) #0 {
 ; CHECK-LABEL: {{^}}test_read_flat_scratch_lo:
 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], flat_scratch_lo
 ; CHECK: buffer_store_dword [[COPY]]
-define void @test_read_flat_scratch_lo(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_flat_scratch_lo(i32 addrspace(1)* %out) #0 {
   %flat_scratch_lo = call i32 @llvm.read_register.i32(metadata !3)
   store i32 %flat_scratch_lo, i32 addrspace(1)* %out
   ret void
@@ -48,7 +48,7 @@ define void @test_read_flat_scratch_lo(i32 addrspace(1)* %out) #0 {
 ; CHECK-LABEL: {{^}}test_read_flat_scratch_hi:
 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], flat_scratch_hi
 ; CHECK: buffer_store_dword [[COPY]]
-define void @test_read_flat_scratch_hi(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_flat_scratch_hi(i32 addrspace(1)* %out) #0 {
   %flat_scratch_hi = call i32 @llvm.read_register.i32(metadata !4)
   store i32 %flat_scratch_hi, i32 addrspace(1)* %out
   ret void
@@ -57,7 +57,7 @@ define void @test_read_flat_scratch_hi(i32 addrspace(1)* %out) #0 {
 ; CHECK-LABEL: {{^}}test_read_exec_lo:
 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], exec_lo
 ; CHECK: buffer_store_dword [[COPY]]
-define void @test_read_exec_lo(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_exec_lo(i32 addrspace(1)* %out) #0 {
   %exec_lo = call i32 @llvm.read_register.i32(metadata !5)
   store i32 %exec_lo, i32 addrspace(1)* %out
   ret void
@@ -66,7 +66,7 @@ define void @test_read_exec_lo(i32 addrspace(1)* %out) #0 {
 ; CHECK-LABEL: {{^}}test_read_exec_hi:
 ; CHECK: v_mov_b32_e32 [[COPY:v[0-9]+]], exec_hi
 ; CHECK: buffer_store_dword [[COPY]]
-define void @test_read_exec_hi(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_read_exec_hi(i32 addrspace(1)* %out) #0 {
   %exec_hi = call i32 @llvm.read_register.i32(metadata !6)
   store i32 %exec_hi, i32 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/readcyclecounter.ll b/test/CodeGen/AMDGPU/readcyclecounter.ll
index 7965b061fe5b92c70fa3682d86a69da37ea7add6..5c698c839fa686692831ab75352e5ae312e8efd6 100644
--- a/test/CodeGen/AMDGPU/readcyclecounter.ll
+++ b/test/CodeGen/AMDGPU/readcyclecounter.ll
@@ -13,7 +13,7 @@ declare i64 @llvm.readcyclecounter() #0
 ; SI: s_memtime s{{\[[0-9]+:[0-9]+\]}}
 ; VI: s_memrealtime s{{\[[0-9]+:[0-9]+\]}}
 ; GCN: store_dwordx2
-define void @test_readcyclecounter(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @test_readcyclecounter(i64 addrspace(1)* %out) #0 {
   %cycle0 = call i64 @llvm.readcyclecounter()
   store volatile i64 %cycle0, i64 addrspace(1)* %out
 
diff --git a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
index dd67dc488dbf431c359f5ab8cb0d05fc827cfec6..ecb513cd80b6e483bfd779186b79e3b2f182cb8e 100644
--- a/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
+++ b/test/CodeGen/AMDGPU/reduce-load-width-alignment.ll
@@ -6,7 +6,7 @@
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_and_b32_e32 v{{[0-9]+}}, 0x12d687, [[VAL]]
 ; GCN: buffer_store_dwordx2
-define void @reduce_i64_load_align_4_width_to_i32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @reduce_i64_load_align_4_width_to_i32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %a = load i64, i64 addrspace(1)* %in, align 4
   %and = and i64 %a, 1234567
   store i64 %and, i64 addrspace(1)* %out, align 8
@@ -16,7 +16,7 @@ define void @reduce_i64_load_align_4_width_to_i32(i64 addrspace(1)* %out, i64 ad
 ; GCN-LABEL: {{^}}reduce_i64_align_4_bitcast_v2i32_elt0:
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: buffer_store_dword [[VAL]]
-define void @reduce_i64_align_4_bitcast_v2i32_elt0(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @reduce_i64_align_4_bitcast_v2i32_elt0(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %a = load i64, i64 addrspace(1)* %in, align 4
   %vec = bitcast i64 %a to <2 x i32>
   %elt0 = extractelement <2 x i32> %vec, i32 0
@@ -27,7 +27,7 @@ define void @reduce_i64_align_4_bitcast_v2i32_elt0(i32 addrspace(1)* %out, i64 a
 ; GCN-LABEL: {{^}}reduce_i64_align_4_bitcast_v2i32_elt1:
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:4
 ; GCN: buffer_store_dword [[VAL]]
-define void @reduce_i64_align_4_bitcast_v2i32_elt1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @reduce_i64_align_4_bitcast_v2i32_elt1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %a = load i64, i64 addrspace(1)* %in, align 4
   %vec = bitcast i64 %a to <2 x i32>
   %elt0 = extractelement <2 x i32> %vec, i32 1
diff --git a/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll b/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
index 281e49f804c61a7dcfef4e6ea0c7733c634d6e6d..601aca48e1e2673ae7c3fbe6b88e8038cee20224 100644
--- a/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
+++ b/test/CodeGen/AMDGPU/reduce-store-width-alignment.ll
@@ -3,7 +3,7 @@
 ; GCN-LABEL: {{^}}store_v2i32_as_v4i16_align_4:
 ; GCN: s_load_dwordx2
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
-define void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
+define amdgpu_kernel void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
   %x.bc = bitcast <2 x i32> %x to <4 x i16>
   store <4 x i16> %x.bc, <4 x i16> addrspace(3)* %out, align 4
   ret void
@@ -13,7 +13,7 @@ define void @store_v2i32_as_v4i16_align_4(<4 x i16> addrspace(3)* align 4 %out,
 ; GCN: s_load_dwordx4
 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
-define void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
+define amdgpu_kernel void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
   %x.bc = bitcast <4 x i32> %x to <8 x i16>
   store <8 x i16> %x.bc, <8 x i16> addrspace(3)* %out, align 4
   ret void
@@ -22,7 +22,7 @@ define void @store_v4i32_as_v8i16_align_4(<8 x i16> addrspace(3)* align 4 %out,
 ; GCN-LABEL: {{^}}store_v2i32_as_i64_align_4:
 ; GCN: s_load_dwordx2
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
-define void @store_v2i32_as_i64_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
+define amdgpu_kernel void @store_v2i32_as_i64_align_4(<4 x i16> addrspace(3)* align 4 %out, <2 x i32> %x) #0 {
   %x.bc = bitcast <2 x i32> %x to <4 x i16>
   store <4 x i16> %x.bc, <4 x i16> addrspace(3)* %out, align 4
   ret void
@@ -32,7 +32,7 @@ define void @store_v2i32_as_i64_align_4(<4 x i16> addrspace(3)* align 4 %out, <2
 ; GCN: s_load_dwordx4
 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:2 offset1:3
 ; GCN-DAG: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
-define void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
+define amdgpu_kernel void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out, <4 x i32> %x) #0 {
   %x.bc = bitcast <4 x i32> %x to <2 x i64>
   store <2 x i64> %x.bc, <2 x i64> addrspace(3)* %out, align 4
   ret void
@@ -44,7 +44,7 @@ define void @store_v4i32_as_v2i64_align_4(<2 x i64> addrspace(3)* align 4 %out,
 ; GCN: buffer_load_ushort
 ; GCN: buffer_load_ushort
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1{{$}}
-define void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 {
+define amdgpu_kernel void @store_v4i16_as_v2i32_align_4(<2 x i32> addrspace(3)* align 4 %out, <4 x i16> %x) #0 {
   %x.bc = bitcast <4 x i16> %x to <2 x i32>
   store <2 x i32> %x.bc, <2 x i32> addrspace(3)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll b/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
index 909644850750710fb86a482b8fb167c05e5e8a70..9f8667d3599325a44640c2f29dcfe46d475bf11d 100644
--- a/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
+++ b/test/CodeGen/AMDGPU/reg-coalescer-sched-crash.ll
@@ -6,7 +6,7 @@
 
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
-define void @reg_coalescer_breaks_dead(<2 x i32> addrspace(1)* nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3) #1 {
+define amdgpu_kernel void @reg_coalescer_breaks_dead(<2 x i32> addrspace(1)* nocapture readonly %arg, i32 %arg1, i32 %arg2, i32 %arg3) #1 {
 bb:
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %cmp0 = icmp eq i32 %id.x, 0
diff --git a/test/CodeGen/AMDGPU/regcoalesce-dbg.mir b/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
new file mode 100644
index 0000000000000000000000000000000000000000..ecf94b5772ffc286dd510c93cbebd021c7b1d141
--- /dev/null
+++ b/test/CodeGen/AMDGPU/regcoalesce-dbg.mir
@@ -0,0 +1,76 @@
+# RUN: llc -march=amdgcn -run-pass simple-register-coalescing -o - %s | FileCheck %s
+
+# Test that register coalescing does not allow a call to
+# LIS->getInstructionIndex with a DBG_VALUE instruction, which does not have
+# a slot index.
+
+# CHECK: %13.sub2 = S_MOV_B32 0
+# CHECK: DBG_VALUE{{.*}}debug-use %13.sub2
+
+--- |
+  define amdgpu_kernel void @test(i32 addrspace(1)* %out) { ret void }
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !4, producer: "llvm", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4)
+  !1 = !DILocalVariable(name: "a", scope: !2, file: !4, line: 126, type: !6)
+  !2 = distinct !DISubprogram(name: "test", scope: !4, file: !4, line: 1, type: !3, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !5)
+  !3 = !DISubroutineType(types: !4)
+  !4 = !{null}
+  !5 = !{!1}
+  !6 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64, align: 32)
+  !7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+  !8 = !DIExpression()
+  !9 = !DILocation(line: 126, column: 9, scope: !2)
+
+...
+---
+name:            test
+tracksRegLiveness: true
+registers:       
+  - { id: 0, class: sgpr_64 }
+  - { id: 1, class: sreg_32_xm0 }
+  - { id: 2, class: sgpr_32 }
+  - { id: 3, class: vgpr_32 }
+  - { id: 4, class: sreg_64_xexec }
+  - { id: 5, class: sreg_32_xm0_xexec }
+  - { id: 6, class: sreg_32 }
+  - { id: 7, class: sreg_32 }
+  - { id: 8, class: sreg_32_xm0 }
+  - { id: 9, class: sreg_64 }
+  - { id: 10, class: sreg_32_xm0 }
+  - { id: 11, class: sreg_32_xm0 }
+  - { id: 12, class: sgpr_64 }
+  - { id: 13, class: sgpr_128 }
+  - { id: 14, class: sreg_32_xm0 }
+  - { id: 15, class: sreg_64 }
+  - { id: 16, class: vgpr_32 }
+  - { id: 17, class: vreg_64 }
+  - { id: 18, class: vgpr_32 }
+  - { id: 19, class: vreg_64 }
+  - { id: 20, class: vreg_64 }
+liveins:         
+  - { reg: '%sgpr0_sgpr1', virtual-reg: '%0' }
+  - { reg: '%vgpr0', virtual-reg: '%3' }
+body:             |
+  bb.0:
+    liveins: %sgpr0_sgpr1, %vgpr0
+  
+    %3 = COPY killed %vgpr0
+    %0 = COPY killed %sgpr0_sgpr1
+    %4 = S_LOAD_DWORDX2_IMM %0, 9, 0 :: (non-temporal dereferenceable invariant load 8 from `i64 addrspace(2)* undef`)
+    %5 = S_LOAD_DWORD_IMM killed %0, 13, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+    %18 = V_ASHRREV_I32_e32 31, %3, implicit %exec
+    undef %19.sub0 = COPY killed %3
+    %19.sub1 = COPY killed %18
+    %10 = S_MOV_B32 61440
+    %11 = S_MOV_B32 0
+    DBG_VALUE debug-use %11, debug-use _, !1, !8, debug-location !9
+    undef %12.sub0 = COPY killed %11
+    %12.sub1 = COPY killed %10
+    undef %13.sub0_sub1 = COPY killed %4
+    %13.sub2_sub3 = COPY killed %12
+    %20 = V_LSHL_B64 killed %19, 2, implicit %exec
+    %16 = COPY killed %5
+    BUFFER_STORE_DWORD_ADDR64 killed %16, killed %20, killed %13, 0, 0, 0, 0, 0, implicit %exec :: (store 4 into %ir.out)
+    S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/register-count-comments.ll b/test/CodeGen/AMDGPU/register-count-comments.ll
index bff3a9f5d2b0e8bda680146c942a37aaa0a8874b..26a76cf2041e11563e477aeb61151fe73c83c208 100644
--- a/test/CodeGen/AMDGPU/register-count-comments.ll
+++ b/test/CodeGen/AMDGPU/register-count-comments.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #0
 ; SI: ; Kernel info:
 ; SI: ; NumSgprs: {{[0-9]+}}
 ; SI: ; NumVgprs: {{[0-9]+}}
-define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind {
+define amdgpu_kernel void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 addrspace(1)* %bbase) nounwind {
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0);
   %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
   %aptr = getelementptr i32, i32 addrspace(1)* %abase, i32 %tid
@@ -24,7 +24,7 @@ define void @foo(i32 addrspace(1)* noalias %out, i32 addrspace(1)* %abase, i32 a
 
 ; SI-LABEL: {{^}}one_vgpr_used:
 ; SI: NumVgprs: 1
-define void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind {
+define amdgpu_kernel void @one_vgpr_used(i32 addrspace(1)* %out, i32 %x) nounwind {
   store i32 %x, i32 addrspace(1)* %out, align 4
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/rename-disconnected-bug.ll b/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
index 47bdfba965307a03a12fdef66699ff9533c27cef..5d4955aa1ce2fe2f24b56caa363b1ee2a928596c 100644
--- a/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
+++ b/test/CodeGen/AMDGPU/rename-disconnected-bug.ll
@@ -3,7 +3,7 @@
 ; definition on every path (there should at least be IMPLICIT_DEF instructions).
 target triple = "amdgcn--"
 
-define void @func() {
+define amdgpu_kernel void @func() {
 B0:
   br i1 undef, label %B1, label %B2
 
diff --git a/test/CodeGen/AMDGPU/rename-independent-subregs.mir b/test/CodeGen/AMDGPU/rename-independent-subregs.mir
index b928bc7086bb75f7e9f2200c7579bfbe7edb5513..fc2e4426ba48ff41669e5fe812e31070fc1ce58b 100644
--- a/test/CodeGen/AMDGPU/rename-independent-subregs.mir
+++ b/test/CodeGen/AMDGPU/rename-independent-subregs.mir
@@ -1,7 +1,7 @@
 # RUN: llc -march=amdgcn -verify-machineinstrs -run-pass simple-register-coalescing,rename-independent-subregs -o - %s | FileCheck %s
 --- |
-  define void @test0() { ret void }
-  define void @test1() { ret void }
+  define amdgpu_kernel void @test0() { ret void }
+  define amdgpu_kernel void @test1() { ret void }
 ...
 ---
 # In the test below we have two independent def+use pairs of subregister1 which
diff --git a/test/CodeGen/AMDGPU/reorder-stores.ll b/test/CodeGen/AMDGPU/reorder-stores.ll
index 412202fa5d513ffd81e835a6807d0162fb8e398c..ff4069226a62bc5286991812330635bf02e11648 100644
--- a/test/CodeGen/AMDGPU/reorder-stores.ll
+++ b/test/CodeGen/AMDGPU/reorder-stores.ll
@@ -7,7 +7,7 @@
 ; SI: buffer_store_dwordx4
 ; SI: buffer_store_dwordx4
 ; SI: s_endpgm
-define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind {
+define amdgpu_kernel void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocapture %x, <2 x double> addrspace(1)* nocapture %y) nounwind {
   %tmp1 = load <2 x double>, <2 x double> addrspace(1)* %x, align 16
   %tmp4 = load <2 x double>, <2 x double> addrspace(1)* %y, align 16
   store <2 x double> %tmp4, <2 x double> addrspace(1)* %x, align 16
@@ -19,7 +19,7 @@ define void @no_reorder_v2f64_global_load_store(<2 x double> addrspace(1)* nocap
 ; SI: ds_read2_b64
 ; SI: ds_write2_b64
 ; SI: s_endpgm
-define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
+define amdgpu_kernel void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace(3)* nocapture %x, <2 x double> addrspace(3)* nocapture %y) nounwind {
   %tmp1 = load <2 x double>, <2 x double> addrspace(3)* %x, align 16
   %tmp4 = load <2 x double>, <2 x double> addrspace(3)* %y, align 16
   store <2 x double> %tmp4, <2 x double> addrspace(3)* %x, align 16
@@ -39,7 +39,7 @@ define void @no_reorder_scalarized_v2f64_local_load_store(<2 x double> addrspace
 ; SI: buffer_store_dwordx4
 ; SI: buffer_store_dwordx4
 ; SI: s_endpgm
-define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
+define amdgpu_kernel void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* nocapture %x, <8 x i32> addrspace(1)* nocapture %y) nounwind {
   %tmp1 = load <8 x i32>, <8 x i32> addrspace(1)* %x, align 32
   %tmp4 = load <8 x i32>, <8 x i32> addrspace(1)* %y, align 32
   store <8 x i32> %tmp4, <8 x i32> addrspace(1)* %x, align 32
@@ -54,7 +54,7 @@ define void @no_reorder_split_v8i32_global_load_store(<8 x i32> addrspace(1)* no
 ; SI-NOT: ds_read
 ; SI: ds_write_b64
 ; SI: s_endpgm
-define void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind {
+define amdgpu_kernel void @no_reorder_extload_64(<2 x i32> addrspace(3)* nocapture %x, <2 x i32> addrspace(3)* nocapture %y) nounwind {
   %tmp1 = load <2 x i32>, <2 x i32> addrspace(3)* %x, align 8
   %tmp4 = load <2 x i32>, <2 x i32> addrspace(3)* %y, align 8
   %tmp1ext = zext <2 x i32> %tmp1 to <2 x i64>
diff --git a/test/CodeGen/AMDGPU/ret.ll b/test/CodeGen/AMDGPU/ret.ll
index 515203fad4cb12fccca4311a7eacad016987999c..831c71dff79de38fe867a6ac2bdf2a3d8e0e63a5 100644
--- a/test/CodeGen/AMDGPU/ret.ll
+++ b/test/CodeGen/AMDGPU/ret.ll
@@ -1,25 +1,24 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
 ; GCN-LABEL: {{^}}vgpr:
 ; GCN: v_mov_b32_e32 v1, v0
 ; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
-; GCN-DAG: exp mrt0 v1, v1, v1, v1 done compr vm
+; GCN-DAG: exp mrt0 v1, v1, v1, v1 done vm
 ; GCN: s_waitcnt expcnt(0)
 ; GCN-NOT: s_endpgm
-define amdgpu_vs {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
-  %x = fadd float %3, 1.0
-  %a = insertvalue {float, float} undef, float %x, 0
-  %b = insertvalue {float, float} %a, float %3, 1
-  ret {float, float} %b
+define amdgpu_vs { float, float } @vgpr([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+bb:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
+  %x = fadd float %arg3, 1.000000e+00
+  %a = insertvalue { float, float } undef, float %x, 0
+  %b = insertvalue { float, float } %a, float %arg3, 1
+  ret { float, float } %b
 }
 
 ; GCN-LABEL: {{^}}vgpr_literal:
 ; GCN: v_mov_b32_e32 v4, v0
-; GCN: exp mrt0 v4, v4, v4, v4 done compr vm
+; GCN: exp mrt0 v4, v4, v4, v4 done vm
 
 ; GCN-DAG: v_mov_b32_e32 v0, 1.0
 ; GCN-DAG: v_mov_b32_e32 v1, 2.0
@@ -27,12 +26,12 @@ define amdgpu_vs {float, float} @vgpr([9 x <16 x i8>] addrspace(2)* byval, i32 i
 ; GCN-DAG: v_mov_b32_e32 v3, -1.0
 ; GCN: s_waitcnt expcnt(0)
 ; GCN-NOT: s_endpgm
-define amdgpu_vs {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
-  ret {float, float, float, float} {float 1.0, float 2.0, float 4.0, float -1.0}
+define amdgpu_vs { float, float, float, float } @vgpr_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+bb:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
+  ret { float, float, float, float } { float 1.000000e+00, float 2.000000e+00, float 4.000000e+00, float -1.000000e+00 }
 }
 
-
 ; GCN: .long 165580
 ; GCN-NEXT: .long 562
 ; GCN-NEXT: .long 165584
@@ -44,24 +43,24 @@ define amdgpu_vs {float, float, float, float} @vgpr_literal([9 x <16 x i8>] addr
 ; GCN: v_mov_b32_e32 v3, v4
 ; GCN: v_mov_b32_e32 v4, v6
 ; GCN-NOT: s_endpgm
-define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
-  %i0 = extractelement <2 x i32> %4, i32 0
-  %i1 = extractelement <2 x i32> %4, i32 1
-  %i2 = extractelement <2 x i32> %7, i32 0
-  %i3 = extractelement <2 x i32> %8, i32 0
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr0([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
+bb:
+  %i0 = extractelement <2 x i32> %arg4, i32 0
+  %i1 = extractelement <2 x i32> %arg4, i32 1
+  %i2 = extractelement <2 x i32> %arg7, i32 0
+  %i3 = extractelement <2 x i32> %arg8, i32 0
   %f0 = bitcast i32 %i0 to float
   %f1 = bitcast i32 %i1 to float
   %f2 = bitcast i32 %i2 to float
   %f3 = bitcast i32 %i3 to float
-  %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
-  %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
-  %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
-  %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
-  %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
-  ret {float, float, float, float, float} %r4
+  %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0
+  %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1
+  %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2
+  %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3
+  %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4
+  ret { float, float, float, float, float } %r4
 }
 
-
 ; GCN: .long 165580
 ; GCN-NEXT: .long 1
 ; GCN-NEXT: .long 165584
@@ -69,11 +68,11 @@ define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr0([9 x <16 x i
 ; GCN-LABEL: {{^}}ps_input_ena_no_inputs:
 ; GCN: v_mov_b32_e32 v0, 1.0
 ; GCN-NOT: s_endpgm
-define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
-  ret float 1.0
+define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
+bb:
+  ret float 1.000000e+00
 }
 
-
 ; GCN: .long 165580
 ; GCN-NEXT: .long 2081
 ; GCN-NEXT: .long 165584
@@ -83,14 +82,14 @@ define amdgpu_ps float @ps_input_ena_no_inputs([9 x <16 x i8>] addrspace(2)* byv
 ; GCN-DAG: v_mov_b32_e32 v1, v2
 ; GCN: v_mov_b32_e32 v2, v3
 ; GCN-NOT: s_endpgm
-define amdgpu_ps {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #0 {
-  %f = bitcast <2 x i32> %8 to <2 x float>
-  %s = insertvalue {float, <2 x float>} undef, float %14, 0
-  %s1 = insertvalue {float, <2 x float>} %s, <2 x float> %f, 1
-  ret {float, <2 x float>} %s1
+define amdgpu_ps { float, <2 x float> } @ps_input_ena_pos_w([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #1 {
+bb:
+  %f = bitcast <2 x i32> %arg8 to <2 x float>
+  %s = insertvalue { float, <2 x float> } undef, float %arg14, 0
+  %s1 = insertvalue { float, <2 x float> } %s, <2 x float> %f, 1
+  ret { float, <2 x float> } %s1
 }
 
-
 ; GCN: .long 165580
 ; GCN-NEXT: .long 562
 ; GCN-NEXT: .long 165584
@@ -102,25 +101,24 @@ define amdgpu_ps {float, <2 x float>} @ps_input_ena_pos_w([9 x <16 x i8>] addrsp
 ; GCN-DAG: v_mov_b32_e32 v3, v6
 ; GCN-DAG: v_mov_b32_e32 v4, v8
 ; GCN-NOT: s_endpgm
-attributes #1 = { "InitialPSInputAddr"="1" }
-define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #1 {
-  %i0 = extractelement <2 x i32> %4, i32 0
-  %i1 = extractelement <2 x i32> %4, i32 1
-  %i2 = extractelement <2 x i32> %7, i32 0
-  %i3 = extractelement <2 x i32> %8, i32 0
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr1([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #2 {
+bb:
+  %i0 = extractelement <2 x i32> %arg4, i32 0
+  %i1 = extractelement <2 x i32> %arg4, i32 1
+  %i2 = extractelement <2 x i32> %arg7, i32 0
+  %i3 = extractelement <2 x i32> %arg8, i32 0
   %f0 = bitcast i32 %i0 to float
   %f1 = bitcast i32 %i1 to float
   %f2 = bitcast i32 %i2 to float
   %f3 = bitcast i32 %i3 to float
-  %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
-  %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
-  %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
-  %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
-  %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
-  ret {float, float, float, float, float} %r4
+  %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0
+  %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1
+  %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2
+  %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3
+  %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4
+  ret { float, float, float, float, float } %r4
 }
 
-
 ; GCN: .long 165580
 ; GCN-NEXT: .long 562
 ; GCN-NEXT: .long 165584
@@ -132,25 +130,24 @@ define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr1([9 x <16 x i
 ; GCN: v_mov_b32_e32 v3, v8
 ; GCN: v_mov_b32_e32 v4, v12
 ; GCN-NOT: s_endpgm
-attributes #2 = { "InitialPSInputAddr"="119" }
-define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #2 {
-  %i0 = extractelement <2 x i32> %4, i32 0
-  %i1 = extractelement <2 x i32> %4, i32 1
-  %i2 = extractelement <2 x i32> %7, i32 0
-  %i3 = extractelement <2 x i32> %8, i32 0
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr119([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #3 {
+bb:
+  %i0 = extractelement <2 x i32> %arg4, i32 0
+  %i1 = extractelement <2 x i32> %arg4, i32 1
+  %i2 = extractelement <2 x i32> %arg7, i32 0
+  %i3 = extractelement <2 x i32> %arg8, i32 0
   %f0 = bitcast i32 %i0 to float
   %f1 = bitcast i32 %i1 to float
   %f2 = bitcast i32 %i2 to float
   %f3 = bitcast i32 %i3 to float
-  %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
-  %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
-  %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
-  %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
-  %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
-  ret {float, float, float, float, float} %r4
+  %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0
+  %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1
+  %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2
+  %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3
+  %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4
+  ret { float, float, float, float, float } %r4
 }
 
-
 ; GCN: .long 165580
 ; GCN-NEXT: .long 562
 ; GCN-NEXT: .long 165584
@@ -162,38 +159,37 @@ define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr119([9 x <16 x
 ; GCN: v_mov_b32_e32 v3, v4
 ; GCN: v_mov_b32_e32 v4, v8
 ; GCN-NOT: s_endpgm
-attributes #3 = { "InitialPSInputAddr"="418" }
-define amdgpu_ps {float, float, float, float, float} @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) #3 {
-  %i0 = extractelement <2 x i32> %4, i32 0
-  %i1 = extractelement <2 x i32> %4, i32 1
-  %i2 = extractelement <2 x i32> %7, i32 0
-  %i3 = extractelement <2 x i32> %8, i32 0
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr418([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #4 {
+bb:
+  %i0 = extractelement <2 x i32> %arg4, i32 0
+  %i1 = extractelement <2 x i32> %arg4, i32 1
+  %i2 = extractelement <2 x i32> %arg7, i32 0
+  %i3 = extractelement <2 x i32> %arg8, i32 0
   %f0 = bitcast i32 %i0 to float
   %f1 = bitcast i32 %i1 to float
   %f2 = bitcast i32 %i2 to float
   %f3 = bitcast i32 %i3 to float
-  %r0 = insertvalue {float, float, float, float, float} undef, float %f0, 0
-  %r1 = insertvalue {float, float, float, float, float} %r0, float %f1, 1
-  %r2 = insertvalue {float, float, float, float, float} %r1, float %f2, 2
-  %r3 = insertvalue {float, float, float, float, float} %r2, float %f3, 3
-  %r4 = insertvalue {float, float, float, float, float} %r3, float %12, 4
-  ret {float, float, float, float, float} %r4
+  %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0
+  %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1
+  %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2
+  %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3
+  %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4
+  ret { float, float, float, float, float } %r4
 }
 
-
 ; GCN-LABEL: {{^}}sgpr:
 ; GCN: s_add_i32 s0, s3, 2
 ; GCN: s_mov_b32 s2, s3
 ; GCN-NOT: s_endpgm
-define amdgpu_vs {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
-  %x = add i32 %2, 2
-  %a = insertvalue {i32, i32, i32} undef, i32 %x, 0
-  %b = insertvalue {i32, i32, i32} %a, i32 %1, 1
-  %c = insertvalue {i32, i32, i32} %a, i32 %2, 2
-  ret {i32, i32, i32} %c
+define amdgpu_vs { i32, i32, i32 } @sgpr([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+bb:
+  %x = add i32 %arg2, 2
+  %a = insertvalue { i32, i32, i32 } undef, i32 %x, 0
+  %b = insertvalue { i32, i32, i32 } %a, i32 %arg1, 1
+  %c = insertvalue { i32, i32, i32 } %a, i32 %arg2, 2
+  ret { i32, i32, i32 } %c
 }
 
-
 ; GCN-LABEL: {{^}}sgpr_literal:
 ; GCN: s_mov_b32 s0, 5
 ; GCN-NOT: s_mov_b32 s0, s0
@@ -201,37 +197,37 @@ define amdgpu_vs {i32, i32, i32} @sgpr([9 x <16 x i8>] addrspace(2)* byval, i32
 ; GCN-DAG: s_mov_b32 s2, 7
 ; GCN-DAG: s_mov_b32 s3, 8
 ; GCN-NOT: s_endpgm
-define amdgpu_vs {i32, i32, i32, i32} @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
-  %x = add i32 %2, 2
-  ret {i32, i32, i32, i32} {i32 5, i32 6, i32 7, i32 8}
+define amdgpu_vs { i32, i32, i32, i32 } @sgpr_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+bb:
+  %x = add i32 %arg2, 2
+  ret { i32, i32, i32, i32 } { i32 5, i32 6, i32 7, i32 8 }
 }
 
-
 ; GCN-LABEL: {{^}}both:
 ; GCN: v_mov_b32_e32 v1, v0
-; GCN-DAG: exp mrt0 v1, v1, v1, v1 done compr vm
+; GCN-DAG: exp mrt0 v1, v1, v1, v1 done vm
 ; GCN-DAG: v_add_f32_e32 v0, 1.0, v1
 ; GCN-DAG: s_add_i32 s0, s3, 2
 ; GCN-DAG: s_mov_b32 s1, s2
 ; GCN: s_mov_b32 s2, s3
 ; GCN: s_waitcnt expcnt(0)
 ; GCN-NOT: s_endpgm
-define amdgpu_vs {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
-  %v = fadd float %3, 1.0
-  %s = add i32 %2, 2
-  %a0 = insertvalue {float, i32, float, i32, i32} undef, float %v, 0
-  %a1 = insertvalue {float, i32, float, i32, i32} %a0, i32 %s, 1
-  %a2 = insertvalue {float, i32, float, i32, i32} %a1, float %3, 2
-  %a3 = insertvalue {float, i32, float, i32, i32} %a2, i32 %1, 3
-  %a4 = insertvalue {float, i32, float, i32, i32} %a3, i32 %2, 4
-  ret {float, i32, float, i32, i32} %a4
+define amdgpu_vs { float, i32, float, i32, i32 } @both([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+bb:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
+  %v = fadd float %arg3, 1.000000e+00
+  %s = add i32 %arg2, 2
+  %a0 = insertvalue { float, i32, float, i32, i32 } undef, float %v, 0
+  %a1 = insertvalue { float, i32, float, i32, i32 } %a0, i32 %s, 1
+  %a2 = insertvalue { float, i32, float, i32, i32 } %a1, float %arg3, 2
+  %a3 = insertvalue { float, i32, float, i32, i32 } %a2, i32 %arg1, 3
+  %a4 = insertvalue { float, i32, float, i32, i32 } %a3, i32 %arg2, 4
+  ret { float, i32, float, i32, i32 } %a4
 }
 
-
 ; GCN-LABEL: {{^}}structure_literal:
 ; GCN: v_mov_b32_e32 v3, v0
-; GCN: exp mrt0 v3, v3, v3, v3 done compr vm
+; GCN: exp mrt0 v3, v3, v3, v3 done vm
 
 ; GCN-DAG: v_mov_b32_e32 v0, 1.0
 ; GCN-DAG: s_mov_b32 s0, 2
@@ -239,9 +235,16 @@ define amdgpu_vs {float, i32, float, i32, i32} @both([9 x <16 x i8>] addrspace(2
 ; GCN-DAG: v_mov_b32_e32 v1, 2.0
 ; GCN-DAG: v_mov_b32_e32 v2, 4.0
 ; GCN: s_waitcnt expcnt(0)
-define amdgpu_vs {{float, i32}, {i32, <2 x float>}} @structure_literal([9 x <16 x i8>] addrspace(2)* byval, i32 inreg, i32 inreg, float) {
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %3, float %3, float %3, float %3)
-  ret {{float, i32}, {i32, <2 x float>}} {{float, i32} {float 1.0, i32 2}, {i32, <2 x float>} {i32 3, <2 x float> <float 2.0, float 4.0>}}
+define amdgpu_vs { { float, i32 }, { i32, <2 x float> } } @structure_literal([9 x <16 x i8>] addrspace(2)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, float %arg3) #0 {
+bb:
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %arg3, float %arg3, float %arg3, float %arg3, i1 true, i1 true) #0
+  ret { { float, i32 }, { i32, <2 x float> } } { { float, i32 } { float 1.000000e+00, i32 2 }, { i32, <2 x float> } { i32 3, <2 x float> <float 2.000000e+00, float 4.000000e+00> } }
 }
 
-attributes #0 = { nounwind "InitialPSInputAddr"="0" }
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "InitialPSInputAddr"="0" }
+attributes #2 = { nounwind "InitialPSInputAddr"="1" }
+attributes #3 = { nounwind "InitialPSInputAddr"="119" }
+attributes #4 = { nounwind "InitialPSInputAddr"="418" }
diff --git a/test/CodeGen/AMDGPU/ret_jump.ll b/test/CodeGen/AMDGPU/ret_jump.ll
index 51ca60492414b96202675135f9368174c237c30f..f2fbacbab82e7dd1b58c0bb700ef89d7013aec60 100644
--- a/test/CodeGen/AMDGPU/ret_jump.ll
+++ b/test/CodeGen/AMDGPU/ret_jump.ll
@@ -4,24 +4,86 @@
 ; This should end with an no-op sequence of exec mask manipulations
 ; Mask should be in original state after executed unreachable block
 
-; GCN-LABEL: {{^}}main:
+
+; GCN-LABEL: {{^}}uniform_br_trivial_ret_divergent_br_trivial_unreachable:
 ; GCN: s_cbranch_scc1 [[RET_BB:BB[0-9]+_[0-9]+]]
 
+; GCN-NEXT: ; %else
+
 ; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
 ; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
-; GCN-NEXT: ; mask branch [[UNREACHABLE_BB:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
 
-; GCN: [[RET_BB]]:
-; GCN-NEXT: s_branch [[FINAL_BB:BB[0-9]+_[0-9]+]]
+; GCN: BB{{[0-9]+_[0-9]+}}: ; %unreachable.bb
+; GCN-NEXT: ; divergent unreachable
 
-; GCN-NEXT: [[UNREACHABLE_BB]]:
-; GCN-NEXT: s_or_b64 exec, exec, [[XOR_EXEC]]
-; GCN-NEXT: [[FINAL_BB]]:
+; GCN-NEXT: {{^}}[[FLOW]]: ; %Flow
+; GCN-NEXT: s_or_b64 exec, exec
+
+; GCN-NEXT: [[RET_BB]]:
+; GCN-NEXT: ; return
 ; GCN-NEXT: .Lfunc_end0
-define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([9 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <8 x i32>] addrspace(2)* byval, i32 addrspace(2)* byval, float inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, i32, i32, float, i32) #0 {
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_trivial_ret_divergent_br_trivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, i32 inreg %arg17, i32 %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
+entry:
+  %i.i = extractelement <2 x i32> %arg7, i32 0
+  %j.i = extractelement <2 x i32> %arg7, i32 1
+  %i.f.i = bitcast i32 %i.i to float
+  %j.f.i = bitcast i32 %j.i to float
+  %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 1, i32 0, i32 %arg5) #2
+  %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 1, i32 0, i32 %arg5) #2
+  %p87 = fmul float undef, %p2.i
+  %p88 = fadd float %p87, undef
+  %p93 = fadd float %p88, undef
+  %p97 = fmul float %p93, undef
+  %p102 = fsub float %p97, undef
+  %p104 = fmul float %p102, undef
+  %p106 = fadd float 0.000000e+00, %p104
+  %p108 = fadd float undef, %p106
+  %uniform.cond = icmp slt i32 %arg17, 0
+  br i1 %uniform.cond, label %ret.bb, label %else
+
+else:                                             ; preds = %entry
+  %p124 = fmul float %p108, %p108
+  %p125 = fsub float %p124, undef
+  %divergent.cond = fcmp olt float %p125, 0.000000e+00
+  br i1 %divergent.cond, label %ret.bb, label %unreachable.bb
+
+unreachable.bb:                                           ; preds = %else
+  unreachable
+
+ret.bb:                                          ; preds = %else, %entry
+  ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef
+}
+
+; GCN-LABEL: {{^}}uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable:
+; GCN: s_cbranch_vccnz [[RET_BB:BB[0-9]+_[0-9]+]]
+
+; GCN: ; BB#{{[0-9]+}}: ; %else
+; GCN: s_and_saveexec_b64 [[SAVE_EXEC:s\[[0-9]+:[0-9]+\]]], vcc
+; GCN-NEXT: s_xor_b64 [[XOR_EXEC:s\[[0-9]+:[0-9]+\]]], exec, [[SAVE_EXEC]]
+; GCN-NEXT: ; mask branch [[FLOW1:BB[0-9]+_[0-9]+]]
+
+; GCN-NEXT:  ; %unreachable.bb
+; GCN: ds_write_b32
+; GCN: s_waitcnt
+; GCN: ; divergent unreachable
+
+; GCN: ; %ret.bb
+; GCN: store_dword
+
+; GCN: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: ; return
+; GCN-NEXT: .Lfunc_end
+define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @uniform_br_nontrivial_ret_divergent_br_nontrivial_unreachable([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <8 x i32>] addrspace(2)* byval %arg2, i32 addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, i32 inreg %arg18, i32 %arg19, float %arg20, i32 %arg21) #0 {
 main_body:
-  %p83 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %5, <2 x i32> %7)
-  %p87 = fmul float undef, %p83
+  %i.i = extractelement <2 x i32> %arg7, i32 0
+  %j.i = extractelement <2 x i32> %arg7, i32 1
+  %i.f.i = bitcast i32 %i.i to float
+  %j.f.i = bitcast i32 %j.i to float
+  %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 1, i32 0, i32 %arg5) #2
+  %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 1, i32 0, i32 %arg5) #2
+  %p87 = fmul float undef, %p2.i
   %p88 = fadd float %p87, undef
   %p93 = fadd float %p88, undef
   %p97 = fmul float %p93, undef
@@ -29,26 +91,35 @@ main_body:
   %p104 = fmul float %p102, undef
   %p106 = fadd float 0.000000e+00, %p104
   %p108 = fadd float undef, %p106
-  br i1 undef, label %ENDIF69, label %ELSE
+  %uniform.cond = icmp slt i32 %arg18, 0
+  br i1 %uniform.cond, label %ret.bb, label %else
 
-ELSE:                                             ; preds = %main_body
+else:                                             ; preds = %main_body
   %p124 = fmul float %p108, %p108
   %p125 = fsub float %p124, undef
-  %p126 = fcmp olt float %p125, 0.000000e+00
-  br i1 %p126, label %ENDIF69, label %ELSE41
+  %divergent.cond = fcmp olt float %p125, 0.000000e+00
+  br i1 %divergent.cond, label %ret.bb, label %unreachable.bb
 
-ELSE41:                                           ; preds = %ELSE
+unreachable.bb:                                           ; preds = %else
+  store volatile i32 8, i32 addrspace(3)* undef
   unreachable
 
-ENDIF69:                                          ; preds = %ELSE, %main_body
+ret.bb:                                          ; preds = %else, %main_body
+  store volatile i32 11, i32 addrspace(1)* undef
   ret <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef
 }
 
 ; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
 
 ; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1
+
+; Function Attrs: nounwind readnone
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
 
 ; Function Attrs: nounwind readnone
 declare float @llvm.fabs.f32(float) #1
@@ -61,3 +132,4 @@ declare float @llvm.floor.f32(float) #1
 
 attributes #0 = { "InitialPSInputAddr"="36983" }
 attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/rotl.i64.ll b/test/CodeGen/AMDGPU/rotl.i64.ll
index b60c470de97c350890fb7f1da2860d11abe93e9c..266490718dd182e6678b17af0c68f4a9a144cbc5 100644
--- a/test/CodeGen/AMDGPU/rotl.i64.ll
+++ b/test/CodeGen/AMDGPU/rotl.i64.ll
@@ -7,7 +7,7 @@
 ; BOTH-DAG: s_lshr_b64
 ; BOTH: s_or_b64
 ; BOTH: s_endpgm
-define void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_rotl_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
 entry:
   %0 = shl i64 %x, %y
   %1 = sub i64 64, %y
@@ -26,7 +26,7 @@ entry:
 ; BOTH: v_or_b32
 ; BOTH: v_or_b32
 ; BOTH: s_endpgm
-define void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
+define amdgpu_kernel void @v_rotl_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
 entry:
   %x = load i64, i64 addrspace(1)* %xptr, align 8
   %y = load i64, i64 addrspace(1)* %yptr, align 8
diff --git a/test/CodeGen/AMDGPU/rotl.ll b/test/CodeGen/AMDGPU/rotl.ll
index 7d2b5538ca333c220290b013681585aaa49e8d9e..c4bc8cdaabf5ba58e9b82d2227a1ce15ee9effc0 100644
--- a/test/CodeGen/AMDGPU/rotl.ll
+++ b/test/CodeGen/AMDGPU/rotl.ll
@@ -10,7 +10,7 @@
 ; SI: s_sub_i32 [[SDST:s[0-9]+]], 32, {{[s][0-9]+}}
 ; SI: v_mov_b32_e32 [[VDST:v[0-9]+]], [[SDST]]
 ; SI: v_alignbit_b32 {{v[0-9]+, [s][0-9]+, s[0-9]+}}, [[VDST]]
-define void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
+define amdgpu_kernel void @rotl_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
 entry:
   %0 = shl i32 %x, %y
   %1 = sub i32 32, %y
@@ -26,7 +26,7 @@ entry:
 ; SI-DAG: v_alignbit_b32
 ; SI-DAG: v_alignbit_b32
 ; SI: s_endpgm
-define void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @rotl_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
 entry:
   %0 = shl <2 x i32> %x, %y
   %1 = sub <2 x i32> <i32 32, i32 32>, %y
@@ -46,7 +46,7 @@ entry:
 ; SI-DAG: s_sub_i32
 ; SI-DAG: v_alignbit_b32
 ; SI: s_endpgm
-define void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @rotl_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
 entry:
   %0 = shl <4 x i32> %x, %y
   %1 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
diff --git a/test/CodeGen/AMDGPU/rotr.i64.ll b/test/CodeGen/AMDGPU/rotr.i64.ll
index 58a1efe080792b38e10532e392c234714cc2c124..9eda479cd25c2d92da6ac1eb95f8661a59c1c49d 100644
--- a/test/CodeGen/AMDGPU/rotr.i64.ll
+++ b/test/CodeGen/AMDGPU/rotr.i64.ll
@@ -6,7 +6,7 @@
 ; BOTH-DAG: s_lshr_b64
 ; BOTH-DAG: s_lshl_b64
 ; BOTH: s_or_b64
-define void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_rotr_i64(i64 addrspace(1)* %in, i64 %x, i64 %y) {
 entry:
   %tmp0 = sub i64 64, %y
   %tmp1 = shl i64 %x, %tmp0
@@ -24,7 +24,7 @@ entry:
 ; VI-DAG: v_lshlrev_b64
 ; BOTH: v_or_b32
 ; BOTH: v_or_b32
-define void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
+define amdgpu_kernel void @v_rotr_i64(i64 addrspace(1)* %in, i64 addrspace(1)* %xptr, i64 addrspace(1)* %yptr) {
 entry:
   %x = load i64, i64 addrspace(1)* %xptr, align 8
   %y = load i64, i64 addrspace(1)* %yptr, align 8
@@ -37,7 +37,7 @@ entry:
 }
 
 ; BOTH-LABEL: {{^}}s_rotr_v2i64:
-define void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) {
+define amdgpu_kernel void @s_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> %x, <2 x i64> %y) {
 entry:
   %tmp0 = sub <2 x i64> <i64 64, i64 64>, %y
   %tmp1 = shl <2 x i64> %x, %tmp0
@@ -48,7 +48,7 @@ entry:
 }
 
 ; BOTH-LABEL: {{^}}v_rotr_v2i64:
-define void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) {
+define amdgpu_kernel void @v_rotr_v2i64(<2 x i64> addrspace(1)* %in, <2 x i64> addrspace(1)* %xptr, <2 x i64> addrspace(1)* %yptr) {
 entry:
   %x = load <2 x i64>, <2 x i64> addrspace(1)* %xptr, align 8
   %y = load <2 x i64>, <2 x i64> addrspace(1)* %yptr, align 8
diff --git a/test/CodeGen/AMDGPU/rotr.ll b/test/CodeGen/AMDGPU/rotr.ll
index 55d180077cc7570af7f075c4ede21680a2e07838..b4e2c2b67ce1442170c96cc1efcad72146e98366 100644
--- a/test/CodeGen/AMDGPU/rotr.ll
+++ b/test/CodeGen/AMDGPU/rotr.ll
@@ -6,7 +6,7 @@
 ; R600: BIT_ALIGN_INT
 
 ; SI: v_alignbit_b32
-define void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
+define amdgpu_kernel void @rotr_i32(i32 addrspace(1)* %in, i32 %x, i32 %y) {
 entry:
   %tmp0 = sub i32 32, %y
   %tmp1 = shl i32 %x, %tmp0
@@ -22,7 +22,7 @@ entry:
 
 ; SI: v_alignbit_b32
 ; SI: v_alignbit_b32
-define void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @rotr_v2i32(<2 x i32> addrspace(1)* %in, <2 x i32> %x, <2 x i32> %y) {
 entry:
   %tmp0 = sub <2 x i32> <i32 32, i32 32>, %y
   %tmp1 = shl <2 x i32> %x, %tmp0
@@ -42,7 +42,7 @@ entry:
 ; SI: v_alignbit_b32
 ; SI: v_alignbit_b32
 ; SI: v_alignbit_b32
-define void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @rotr_v4i32(<4 x i32> addrspace(1)* %in, <4 x i32> %x, <4 x i32> %y) {
 entry:
   %tmp0 = sub <4 x i32> <i32 32, i32 32, i32 32, i32 32>, %y
   %tmp1 = shl <4 x i32> %x, %tmp0
diff --git a/test/CodeGen/AMDGPU/rsq.ll b/test/CodeGen/AMDGPU/rsq.ll
index 699440c3efbf91eb78a67328db79c6ef746836a9..9462683efe0e83ee9137e81ab2c3a7cb90f6ebef 100644
--- a/test/CodeGen/AMDGPU/rsq.ll
+++ b/test/CodeGen/AMDGPU/rsq.ll
@@ -8,7 +8,7 @@ declare double @llvm.sqrt.f64(double) nounwind readnone
 ; SI-LABEL: {{^}}rsq_f32:
 ; SI: v_rsq_f32_e32
 ; SI: s_endpgm
-define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %val = load float, float addrspace(1)* %in, align 4
   %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone
   %div = fdiv float 1.0, %sqrt
@@ -20,7 +20,7 @@ define void @rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noali
 ; SI-UNSAFE: v_rsq_f64_e32
 ; SI-SAFE: v_sqrt_f64_e32
 ; SI: s_endpgm
-define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
   %val = load double, double addrspace(1)* %in, align 4
   %sqrt = call double @llvm.sqrt.f64(double %val) nounwind readnone
   %div = fdiv double 1.0, %sqrt
@@ -31,7 +31,7 @@ define void @rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noa
 ; SI-LABEL: {{^}}rsq_f32_sgpr:
 ; SI: v_rsq_f32_e32 {{v[0-9]+}}, {{s[0-9]+}}
 ; SI: s_endpgm
-define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind {
+define amdgpu_kernel void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind {
   %sqrt = call float @llvm.sqrt.f32(float %val) nounwind readnone
   %div = fdiv float 1.0, %sqrt
   store float %div, float addrspace(1)* %out, align 4
@@ -55,7 +55,7 @@ define void @rsq_f32_sgpr(float addrspace(1)* noalias %out, float %val) nounwind
 ; SI-SAFE-NOT: v_rsq_f32
 
 ; SI: s_endpgm
-define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
   %gep.0 = getelementptr float, float addrspace(1)* %in, i32 %tid
@@ -81,7 +81,7 @@ define void @rsqrt_fmul(float addrspace(1)* %out, float addrspace(1)* %in) {
 ; SI-UNSAFE: v_rsq_f32_e32 [[RSQ:v[0-9]+]], v{{[0-9]+}}
 ; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]]
 ; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]]
-define void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %val = load float, float addrspace(1)* %in, align 4
   %sqrt = call float @llvm.sqrt.f32(float %val)
   %div = fdiv float -1.0, %sqrt
@@ -96,7 +96,7 @@ define void @neg_rsq_f32(float addrspace(1)* noalias %out, float addrspace(1)* n
 ; SI-UNSAFE: v_sqrt_f64_e32 [[SQRT:v\[[0-9]+:[0-9]+\]]], v{{\[[0-9]+:[0-9]+\]}}
 ; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]]
 ; SI-UNSAFE: buffer_store_dwordx2 [[RCP]]
-define void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
   %val = load double, double addrspace(1)* %in, align 4
   %sqrt = call double @llvm.sqrt.f64(double %val)
   %div = fdiv double -1.0, %sqrt
@@ -112,7 +112,7 @@ define void @neg_rsq_f64(double addrspace(1)* noalias %out, double addrspace(1)*
 ; SI-UNSAFE: v_rsq_f32_e64 [[RSQ:v[0-9]+]], -v{{[0-9]+}}
 ; SI-UNSAFE: v_xor_b32_e32 [[NEG_RSQ:v[0-9]+]], 0x80000000, [[RSQ]]
 ; SI-UNSAFE: buffer_store_dword [[NEG_RSQ]]
-define void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in) nounwind {
   %val = load float, float addrspace(1)* %in, align 4
   %val.fneg = fsub float -0.0, %val
   %sqrt = call float @llvm.sqrt.f32(float %val.fneg)
@@ -128,7 +128,7 @@ define void @neg_rsq_neg_f32(float addrspace(1)* noalias %out, float addrspace(1
 ; SI-UNSAFE: v_sqrt_f64_e64 [[SQRT:v\[[0-9]+:[0-9]+\]]], -v{{\[[0-9]+:[0-9]+\]}}
 ; SI-UNSAFE: v_rcp_f64_e64 [[RCP:v\[[0-9]+:[0-9]+\]]], -[[SQRT]]
 ; SI-UNSAFE: buffer_store_dwordx2 [[RCP]]
-define void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
+define amdgpu_kernel void @neg_rsq_neg_f64(double addrspace(1)* noalias %out, double addrspace(1)* noalias %in) nounwind {
   %val = load double, double addrspace(1)* %in, align 4
   %val.fneg = fsub double -0.0, %val
   %sqrt = call double @llvm.sqrt.f64(double %val.fneg)
diff --git a/test/CodeGen/AMDGPU/runtime-metadata.ll b/test/CodeGen/AMDGPU/runtime-metadata.ll
deleted file mode 100644
index abdbc325fd4d811f5269ca0ebbed7b8cd3dea63c..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/runtime-metadata.ll
+++ /dev/null
@@ -1,396 +0,0 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -o - < %s | llvm-readobj -amdgpu-runtime-metadata | FileCheck %s
-; RUN: llc -mtriple=amdgcn--amdhsa -filetype=obj -amdgpu-dump-rtmd -amdgpu-check-rtmd-parser %s -o - 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=PARSER %s
-
-%struct.A = type { i8, float }
-%opencl.image1d_t = type opaque
-%opencl.image2d_t = type opaque
-%opencl.image3d_t = type opaque
-%opencl.queue_t = type opaque
-%opencl.pipe_t = type opaque
-%struct.B = type { i32 addrspace(1)*}
-%opencl.clk_event_t = type opaque
-
-; CHECK: ---
-; CHECK-NEXT: { amd.MDVersion: [ 2, 0 ], amd.PrintfInfo: [ '1:1:4:%d\n', '2:1:8:%g\n' ], amd.Kernels: 
-
-; CHECK-NEXT:   - { amd.KernelName: test_char, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 1, amd.ArgAlign: 1, amd.ArgKind: 0, amd.ArgValueType: 1, amd.ArgTypeName: char, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_char(i8 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !9 !kernel_arg_base_type !9 !kernel_arg_type_qual !4 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_ushort2, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 4, amd.ArgTypeName: ushort2, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_ushort2(<2 x i16> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !10 !kernel_arg_base_type !10 !kernel_arg_type_qual !4 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_int3, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 16, amd.ArgAlign: 16, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int3, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_int3(<3 x i32> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !11 !kernel_arg_base_type !11 !kernel_arg_type_qual !4 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_ulong4, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 32, amd.ArgAlign: 32, amd.ArgKind: 0, amd.ArgValueType: 10, amd.ArgTypeName: ulong4, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_ulong4(<4 x i64> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !12 !kernel_arg_base_type !12 !kernel_arg_type_qual !4 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_half8, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 16, amd.ArgAlign: 16, amd.ArgKind: 0, amd.ArgValueType: 5, amd.ArgTypeName: half8, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_half8(<8 x half> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !13 !kernel_arg_base_type !13 !kernel_arg_type_qual !4 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_float16, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 64, amd.ArgAlign: 64, amd.ArgKind: 0, amd.ArgValueType: 8, amd.ArgTypeName: float16, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_float16(<16 x float> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !4 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_double16, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 128, amd.ArgAlign: 128, amd.ArgKind: 0, amd.ArgValueType: 11, amd.ArgTypeName: double16, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_double16(<16 x double> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !15 !kernel_arg_base_type !15 !kernel_arg_type_qual !4 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_pointer, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 6, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 1, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_pointer(i32 addrspace(1)* %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !16 !kernel_arg_base_type !16 !kernel_arg_type_qual !4 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_image, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 4, amd.ArgValueType: 0, amd.ArgTypeName: image2d_t, amd.ArgAddrQual: 1, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_image(%opencl.image2d_t addrspace(1)* %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !17 !kernel_arg_base_type !17 !kernel_arg_type_qual !4 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_sampler, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 3, amd.ArgValueType: 6, amd.ArgTypeName: sampler_t, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_sampler(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !18 !kernel_arg_base_type !18 !kernel_arg_type_qual !4 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_queue, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 6, amd.ArgValueType: 0, amd.ArgTypeName: queue_t, amd.ArgAddrQual: 1, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_queue(%opencl.queue_t addrspace(1)* %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !19 !kernel_arg_base_type !19 !kernel_arg_type_qual !4 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_struct, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 1, amd.ArgValueType: 0, amd.ArgTypeName: struct A, amd.ArgAddrQual: 0, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_struct(%struct.A* byval %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !20 !kernel_arg_base_type !20 !kernel_arg_type_qual !4 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_i128, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 16, amd.ArgAlign: 8, amd.ArgKind: 0, amd.ArgValueType: 0, amd.ArgTypeName: i128, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_i128(i128 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !21 !kernel_arg_base_type !21 !kernel_arg_type_qual !4 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_multi_arg, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 3, amd.ArgTypeName: short2, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 1, amd.ArgTypeName: char3, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_multi_arg(i32 %a, <2 x i16> %b, <3 x i8> %c) !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !24 !kernel_arg_base_type !24 !kernel_arg_type_qual !25 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_addr_space, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 6, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 1, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 6, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 2, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 4, amd.ArgKind: 2, amd.ArgValueType: 6, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_addr_space(i32 addrspace(1)* %g, i32 addrspace(2)* %c, i32 addrspace(3)* %l) !kernel_arg_addr_space !50 !kernel_arg_access_qual !23 !kernel_arg_type !51 !kernel_arg_base_type !51 !kernel_arg_type_qual !25 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_type_qual, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 6, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 1, amd.ArgAccQual: 0, amd.ArgIsVolatile: 1 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 6, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 1, amd.ArgAccQual: 0, amd.ArgIsConst: 1, amd.ArgIsRestrict: 1 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 5, amd.ArgValueType: 0, amd.ArgTypeName: 'int *', amd.ArgAddrQual: 1, amd.ArgAccQual: 0, amd.ArgIsPipe: 1 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_type_qual(i32 addrspace(1)* %a, i32 addrspace(1)* %b, %opencl.pipe_t addrspace(1)* %c) !kernel_arg_addr_space !22 !kernel_arg_access_qual !23 !kernel_arg_type !51 !kernel_arg_base_type !51 !kernel_arg_type_qual !70 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_access_qual, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 4, amd.ArgValueType: 0, amd.ArgTypeName: image1d_t, amd.ArgAddrQual: 1, amd.ArgAccQual: 1 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 4, amd.ArgValueType: 0, amd.ArgTypeName: image2d_t, amd.ArgAddrQual: 1, amd.ArgAccQual: 2 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 4, amd.ArgValueType: 0, amd.ArgTypeName: image3d_t, amd.ArgAddrQual: 1, amd.ArgAccQual: 3 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_access_qual(%opencl.image1d_t addrspace(1)* %ro, %opencl.image2d_t addrspace(1)* %wo, %opencl.image3d_t addrspace(1)* %rw) !kernel_arg_addr_space !60 !kernel_arg_access_qual !61 !kernel_arg_type !62 !kernel_arg_base_type !62 !kernel_arg_type_qual !25 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_vec_type_hint_half, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: half, amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_vec_type_hint_half(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !26 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_vec_type_hint_float, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: float, amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_vec_type_hint_float(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !27 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_vec_type_hint_double, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: double, amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_vec_type_hint_double(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !28 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_vec_type_hint_char, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: char, amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_vec_type_hint_char(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !29 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_vec_type_hint_short, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: short, amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_vec_type_hint_short(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !30 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_vec_type_hint_long, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: long, amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_vec_type_hint_long(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !31 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_vec_type_hint_unknown, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.VecTypeHint: unknown, amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_vec_type_hint_unknown(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !32 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_reqd_wgs_vec_type_hint, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.ReqdWorkGroupSize: [ 1, 2, 4 ], amd.VecTypeHint: int, amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_reqd_wgs_vec_type_hint(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !5 !reqd_work_group_size !6 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_wgs_hint_vec_type_hint, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.WorkGroupSizeHint: [ 8, 16, 32 ], amd.VecTypeHint: uint4, amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: int, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_wgs_hint_vec_type_hint(i32 %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !3 !kernel_arg_base_type !3 !kernel_arg_type_qual !4 !vec_type_hint !7 !work_group_size_hint !8 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_arg_ptr_to_ptr, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 6, amd.ArgTypeName: 'int **', amd.ArgAddrQual: 1, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_arg_ptr_to_ptr(i32 * addrspace(1)* %a) !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !80 !kernel_arg_base_type !80 !kernel_arg_type_qual !4 {
-  ret void
-}
-; CHECK-NEXT:   - { amd.KernelName: test_arg_struct_contains_ptr, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 1, amd.ArgValueType: 0, amd.ArgTypeName: struct B, amd.ArgAddrQual: 0, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_arg_struct_contains_ptr(%struct.B * byval %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !82 !kernel_arg_base_type !82 !kernel_arg_type_qual !4 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_arg_vector_of_ptr, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 16, amd.ArgAlign: 16, amd.ArgKind: 0, amd.ArgValueType: 6, amd.ArgTypeName: 'global int* __attribute__((ext_vector_type(2)))', amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_arg_vector_of_ptr(<2 x i32 addrspace(1)*> %a) !kernel_arg_addr_space !1 !kernel_arg_access_qual !2 !kernel_arg_type !83 !kernel_arg_base_type !83 !kernel_arg_type_qual !4 {
-  ret void
-}
-
-
-; CHECK-NEXT:   - { amd.KernelName: test_arg_unknown_builtin_type, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 0, amd.ArgTypeName: clk_event_t, amd.ArgAddrQual: 1, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-define amdgpu_kernel void @test_arg_unknown_builtin_type(%opencl.clk_event_t addrspace(1)* %a) !kernel_arg_addr_space !81 !kernel_arg_access_qual !2 !kernel_arg_type !84 !kernel_arg_base_type !84 !kernel_arg_type_qual !4 {
-  ret void
-}
-
-; CHECK-NEXT:   - { amd.KernelName: test_pointee_align, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args: 
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 1, amd.ArgValueType: 9, amd.ArgTypeName: 'long *', amd.ArgAddrQual: 1, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 1, amd.ArgKind: 2, amd.ArgValueType: 1, amd.ArgTypeName: 'char *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 2, amd.ArgKind: 2, amd.ArgValueType: 1, amd.ArgTypeName: 'char2 *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 4, amd.ArgKind: 2, amd.ArgValueType: 1, amd.ArgTypeName: 'char3 *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 4, amd.ArgKind: 2, amd.ArgValueType: 1, amd.ArgTypeName: 'char4 *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 8, amd.ArgKind: 2, amd.ArgValueType: 1, amd.ArgTypeName: 'char8 *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgPointeeAlign: 16, amd.ArgKind: 2, amd.ArgValueType: 1, amd.ArgTypeName: 'char16 *', amd.ArgAddrQual: 3, amd.ArgAccQual: 0 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-; CHECK-NEXT:       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } } }
-define amdgpu_kernel void @test_pointee_align(i64 addrspace(1)* %a, i8 addrspace(3)* %b, <2 x i8> addrspace(3)* %c, <3 x i8> addrspace(3)* %d, <4 x i8> addrspace(3)* %e, <8 x i8> addrspace(3)* %f, <16 x i8> addrspace(3)* %g) !kernel_arg_addr_space !91 !kernel_arg_access_qual !92 !kernel_arg_type !93 !kernel_arg_base_type !93 !kernel_arg_type_qual !94 {
-  ret void
-}
-
-; CHECK-NEXT:...
-
-; PARSER: AMDGPU runtime metadata parser test passes.
-
-!llvm.printf.fmts = !{!100, !101}
-
-!1 = !{i32 0}
-!2 = !{!"none"}
-!3 = !{!"int"}
-!4 = !{!""}
-!5 = !{i32 undef, i32 1}
-!6 = !{i32 1, i32 2, i32 4}
-!7 = !{<4 x i32> undef, i32 0}
-!8 = !{i32 8, i32 16, i32 32}
-!9 = !{!"char"}
-!10 = !{!"ushort2"}
-!11 = !{!"int3"}
-!12 = !{!"ulong4"}
-!13 = !{!"half8"}
-!14 = !{!"float16"}
-!15 = !{!"double16"}
-!16 = !{!"int *"}
-!17 = !{!"image2d_t"}
-!18 = !{!"sampler_t"}
-!19 = !{!"queue_t"}
-!20 = !{!"struct A"}
-!21 = !{!"i128"}
-!22 = !{i32 0, i32 0, i32 0}
-!23 = !{!"none", !"none", !"none"}
-!24 = !{!"int", !"short2", !"char3"}
-!25 = !{!"", !"", !""}
-!26 = !{half undef, i32 1}
-!27 = !{float undef, i32 1}
-!28 = !{double undef, i32 1}
-!29 = !{i8 undef, i32 1}
-!30 = !{i16 undef, i32 1}
-!31 = !{i64 undef, i32 1}
-!32 = !{i32 *undef, i32 1}
-!50 = !{i32 1, i32 2, i32 3}
-!51 = !{!"int *", !"int *", !"int *"}
-!60 = !{i32 1, i32 1, i32 1}
-!61 = !{!"read_only", !"write_only", !"read_write"}
-!62 = !{!"image1d_t", !"image2d_t", !"image3d_t"}
-!70 = !{!"volatile", !"const restrict", !"pipe"}
-!80 = !{!"int **"}
-!81 = !{i32 1}
-!82 = !{!"struct B"}
-!83 = !{!"global int* __attribute__((ext_vector_type(2)))"}
-!84 = !{!"clk_event_t"}
-!opencl.ocl.version = !{!90}
-!90 = !{i32 2, i32 0}
-!91 = !{i32 0, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3}
-!92 = !{!"none", !"none", !"none", !"none", !"none", !"none", !"none"}
-!93 = !{!"long *", !"char *", !"char2 *", !"char3 *", !"char4 *", !"char8 *", !"char16 *"}
-!94 = !{!"", !"", !"", !"", !"", !"", !""}
-!100 = !{!"1:1:4:%d\5Cn"}
-!101 = !{!"2:1:8:%g\5Cn"}
diff --git a/test/CodeGen/AMDGPU/s_addk_i32.ll b/test/CodeGen/AMDGPU/s_addk_i32.ll
index acceb3272fc3d5c85aecb6d55c1d38214acf1136..deef24cea3775b85fddf04e359dd1e26330234a4 100644
--- a/test/CodeGen/AMDGPU/s_addk_i32.ll
+++ b/test/CodeGen/AMDGPU/s_addk_i32.ll
@@ -7,7 +7,7 @@
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]]
 ; SI: buffer_store_dword [[VRESULT]]
 ; SI: s_endpgm
-define void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
   %add = add i32 %b, 65
   store i32 %add, i32 addrspace(1)* %out
   ret void
@@ -19,7 +19,7 @@ define void @s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
 ; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]]
 ; SI-DAG: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, [[K]]
 ; SI: s_endpgm
-define void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) {
+define amdgpu_kernel void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %a, i32 %b) {
   %add0 = add i32 %a, 65
   %add1 = add i32 %b, 65
   store i32 %add0, i32 addrspace(1)* %out0
@@ -30,7 +30,7 @@ define void @s_addk_i32_k0_x2(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1,
 ; SI-LABEL: {{^}}s_addk_i32_k1:
 ; SI: s_addk_i32 {{s[0-9]+}}, 0x7fff{{$}}
 ; SI: s_endpgm
-define void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
   %add = add i32 %b, 32767 ; (1 << 15) - 1
   store i32 %add, i32 addrspace(1)* %out
   ret void
@@ -39,7 +39,7 @@ define void @s_addk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
 ; SI-LABEL: {{^}}s_addk_i32_k2:
 ; SI: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, 17
 ; SI: s_endpgm
-define void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
   %add = add i32 %b, -17
   store i32 %add, i32 addrspace(1)* %out
   ret void
@@ -48,7 +48,7 @@ define void @s_addk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
 ; SI-LABEL: {{^}}s_addk_i32_k3:
 ; SI: s_addk_i32 {{s[0-9]+}}, 0xffbf{{$}}
 ; SI: s_endpgm
-define void @s_addk_i32_k3(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_addk_i32_k3(i32 addrspace(1)* %out, i32 %b) {
   %add = add i32 %b, -65
   store i32 %add, i32 addrspace(1)* %out
   ret void
@@ -58,7 +58,7 @@ define void @s_addk_i32_k3(i32 addrspace(1)* %out, i32 %b) {
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x41
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x42
 ; SI: s_endpgm
-define void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) {
+define amdgpu_kernel void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) {
   %add = add <2 x i32> %b, <i32 65, i32 66>
   store <2 x i32> %add, <2 x i32> addrspace(1)* %out
   ret void
@@ -70,7 +70,7 @@ define void @s_addk_v2i32_k0(<2 x i32> addrspace(1)* %out, <2 x i32> %b) {
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x43
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x44
 ; SI: s_endpgm
-define void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) {
+define amdgpu_kernel void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) {
   %add = add <4 x i32> %b, <i32 65, i32 66, i32 67, i32 68>
   store <4 x i32> %add, <4 x i32> addrspace(1)* %out
   ret void
@@ -86,7 +86,7 @@ define void @s_addk_v4i32_k0(<4 x i32> addrspace(1)* %out, <4 x i32> %b) {
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x47
 ; SI-DAG: s_addk_i32 {{s[0-9]+}}, 0x48
 ; SI: s_endpgm
-define void @s_addk_v8i32_k0(<8 x i32> addrspace(1)* %out, <8 x i32> %b) {
+define amdgpu_kernel void @s_addk_v8i32_k0(<8 x i32> addrspace(1)* %out, <8 x i32> %b) {
   %add = add <8 x i32> %b, <i32 65, i32 66, i32 67, i32 68, i32 69, i32 70, i32 71, i32 72>
   store <8 x i32> %add, <8 x i32> addrspace(1)* %out
   ret void
@@ -95,7 +95,7 @@ define void @s_addk_v8i32_k0(<8 x i32> addrspace(1)* %out, <8 x i32> %b) {
 ; SI-LABEL: {{^}}no_s_addk_i32_k0:
 ; SI: s_add_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8000{{$}}
 ; SI: s_endpgm
-define void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
   %add = add i32 %b, 32768 ; 1 << 15
   store i32 %add, i32 addrspace(1)* %out
   ret void
@@ -105,7 +105,7 @@ define void @no_s_addk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
 
 ; SI-LABEL: {{^}}commute_s_addk_i32:
 ; SI: s_addk_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_s_addk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
+define amdgpu_kernel void @commute_s_addk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %add = add i32 %size, %b
   call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add)
diff --git a/test/CodeGen/AMDGPU/s_movk_i32.ll b/test/CodeGen/AMDGPU/s_movk_i32.ll
index 0164c45083a2276b54c9d45cf2c6e11b8744490c..a131aaa3dfb4f8c1e46cc728c64d09aa986968fd 100644
--- a/test/CodeGen/AMDGPU/s_movk_i32.ll
+++ b/test/CodeGen/AMDGPU/s_movk_i32.ll
@@ -7,7 +7,7 @@
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295032831 ; ((1 << 16) - 1) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
@@ -21,7 +21,7 @@ define void @s_movk_i32_k0(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295000063 ; ((1 << 15) - 1) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
@@ -35,7 +35,7 @@ define void @s_movk_i32_k1(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 64, v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 274877939711 ; ((1 << 15) - 1) | (64 << 32)
   store i64 %or, i64 addrspace(1)* %out
@@ -49,7 +49,7 @@ define void @s_movk_i32_k2(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295000064 ; (1 << 15) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
@@ -63,7 +63,7 @@ define void @s_movk_i32_k3(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 1, v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 4295098368 ; (1 << 17) | (1 << 32)
   store i64 %or, i64 addrspace(1)* %out
@@ -78,7 +78,7 @@ define void @s_movk_i32_k4(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 18374967954648334319 ; -17 & 0xff00ffffffffffff
   store i64 %or, i64 addrspace(1)* %out
@@ -92,7 +92,7 @@ define void @s_movk_i32_k5(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, 63, v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 270582939713 ; 65 | (63 << 32)
   store i64 %or, i64 addrspace(1)* %out
@@ -107,7 +107,7 @@ define void @s_movk_i32_k6(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 70368744185856; ((1 << 13)) | ((1 << 14) << 32)
   store i64 %or, i64 addrspace(1)* %out
@@ -122,7 +122,7 @@ define void @s_movk_i32_k7(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255906816 ; 0x11111111ffff8000
   store i64 %or, i64 addrspace(1)* %out
@@ -137,7 +137,7 @@ define void @s_movk_i32_k8(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255906817 ; 0x11111111ffff8001
   store i64 %or, i64 addrspace(1)* %out
@@ -152,7 +152,7 @@ define void @s_movk_i32_k9(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 add
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255909000 ; 0x11111111ffff8888
   store i64 %or, i64 addrspace(1)* %out
@@ -167,7 +167,7 @@ define void @s_movk_i32_k10(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 ad
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255910911 ; 0x11111111ffff8fff
   store i64 %or, i64 addrspace(1)* %out
@@ -182,7 +182,7 @@ define void @s_movk_i32_k11(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 ad
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[LO_S_IMM]], v[[LO_VREG]]
 ; SI-DAG: v_or_b32_e32 {{v[0-9]+}}, [[HI_S_IMM]], v[[HI_VREG]]
 ; SI: s_endpgm
-define void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @s_movk_i32_k12(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 4
   %or = or i64 %loada, 1229782942255902721 ; 0x11111111ffff7001
   store i64 %or, i64 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/s_mulk_i32.ll b/test/CodeGen/AMDGPU/s_mulk_i32.ll
index e83b368cc1cbd0c658d800011e2d24db68abc8df..f6ed5408ba45332fd23a877f7b702c68e542bc7e 100644
--- a/test/CodeGen/AMDGPU/s_mulk_i32.ll
+++ b/test/CodeGen/AMDGPU/s_mulk_i32.ll
@@ -7,7 +7,7 @@
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]]
 ; SI: buffer_store_dword [[VRESULT]]
 ; SI: s_endpgm
-define void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
   %mul = mul i32 %b, 65
   store i32 %mul, i32 addrspace(1)* %out
   ret void
@@ -16,7 +16,7 @@ define void @s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
 ; SI-LABEL: {{^}}s_mulk_i32_k1:
 ; SI: s_mulk_i32 {{s[0-9]+}}, 0x7fff{{$}}
 ; SI: s_endpgm
-define void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
   %mul = mul i32 %b, 32767 ; (1 << 15) - 1
   store i32 %mul, i32 addrspace(1)* %out
   ret void
@@ -25,7 +25,7 @@ define void @s_mulk_i32_k1(i32 addrspace(1)* %out, i32 %b) {
 ; SI-LABEL: {{^}}s_mulk_i32_k2:
 ; SI: s_mulk_i32 {{s[0-9]+}}, 0xffef{{$}}
 ; SI: s_endpgm
-define void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
   %mul = mul i32 %b, -17
   store i32 %mul, i32 addrspace(1)* %out
   ret void
@@ -34,7 +34,7 @@ define void @s_mulk_i32_k2(i32 addrspace(1)* %out, i32 %b) {
 ; SI-LABEL: {{^}}no_s_mulk_i32_k0:
 ; SI: s_mul_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8001{{$}}
 ; SI: s_endpgm
-define void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
+define amdgpu_kernel void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
   %mul = mul i32 %b, 32769 ; 1 << 15 + 1
   store i32 %mul, i32 addrspace(1)* %out
   ret void
@@ -44,7 +44,7 @@ define void @no_s_mulk_i32_k0(i32 addrspace(1)* %out, i32 %b) {
 
 ; SI-LABEL: {{^}}commute_s_mulk_i32:
 ; SI: s_mulk_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_s_mulk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
+define amdgpu_kernel void @commute_s_mulk_i32(i32 addrspace(1)* %out, i32 %b) #0 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %add = mul i32 %size, %b
   call void asm sideeffect "; foo $0, $1", "v,s"([512 x i32] addrspace(3)* @lds, i32 %add)
diff --git a/test/CodeGen/AMDGPU/sad.ll b/test/CodeGen/AMDGPU/sad.ll
index 534483401638293a269520c4e415341d69ee50ec..f7a1c65881d02868b99f190e4eec880d3a588ac2 100644
--- a/test/CodeGen/AMDGPU/sad.ll
+++ b/test/CodeGen/AMDGPU/sad.ll
@@ -2,7 +2,7 @@
 
 ; GCN-LABEL: {{^}}v_sad_u32_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
 
@@ -18,7 +18,7 @@ define void @v_sad_u32_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 
 ; GCN-LABEL: {{^}}v_sad_u32_constant_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 20
-define void @v_sad_u32_constant_pat1(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @v_sad_u32_constant_pat1(i32 addrspace(1)* %out, i32 %a) {
   %icmp0 = icmp ugt i32 %a, 90
   %t0 = select i1 %icmp0, i32 %a, i32 90
 
@@ -34,7 +34,7 @@ define void @v_sad_u32_constant_pat1(i32 addrspace(1)* %out, i32 %a) {
 
 ; GCN-LABEL: {{^}}v_sad_u32_pat2:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %sub0 = sub i32 %a, %b
   %sub1 = sub i32 %b, %a
@@ -51,7 +51,7 @@ define void @v_sad_u32_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 ; GCN: s_min_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: s_add_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define void @v_sad_u32_multi_use_sub_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
 
@@ -68,7 +68,7 @@ define void @v_sad_u32_multi_use_sub_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b
 
 ; GCN-LABEL: {{^}}v_sad_u32_multi_use_add_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_multi_use_add_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_add_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
 
@@ -84,7 +84,7 @@ define void @v_sad_u32_multi_use_add_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b
 
 ; GCN-LABEL: {{^}}v_sad_u32_multi_use_max_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
   store volatile i32 %t0, i32 *undef
@@ -101,7 +101,7 @@ define void @v_sad_u32_multi_use_max_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b
 
 ; GCN-LABEL: {{^}}v_sad_u32_multi_use_min_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_multi_use_min_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_min_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
 
@@ -119,7 +119,7 @@ define void @v_sad_u32_multi_use_min_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b
 
 ; GCN-LABEL: {{^}}v_sad_u32_multi_use_sub_pat2:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %sub0 = sub i32 %a, %b
   store volatile i32 %sub0, i32 *undef
@@ -136,7 +136,7 @@ define void @v_sad_u32_multi_use_sub_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: v_cmp_gt_u32_e32 vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
   %icmp0 = icmp ugt i32 %a, %b
   %sub0 = sub i32 %a, %b
   %sub1 = sub i32 %b, %a
@@ -154,7 +154,7 @@ define void @v_sad_u32_multi_use_select_pat2(i32 addrspace(1)* %out, i32 %a, i32
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_vector_pat1(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+define amdgpu_kernel void @v_sad_u32_vector_pat1(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
   %icmp0 = icmp ugt <4 x i32> %a, %b
   %t0 = select <4 x i1> %icmp0, <4 x i32> %a, <4 x i32> %b
 
@@ -173,7 +173,7 @@ define void @v_sad_u32_vector_pat1(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_vector_pat2(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+define amdgpu_kernel void @v_sad_u32_vector_pat2(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
   %icmp0 = icmp ugt <4 x i32> %a, %b
   %sub0 = sub <4 x i32> %a, %b
   %sub1 = sub <4 x i32> %b, %a
@@ -187,7 +187,7 @@ define void @v_sad_u32_vector_pat2(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <
 
 ; GCN-LABEL: {{^}}v_sad_u32_i16_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
+define amdgpu_kernel void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i16 %b, i16 %c) {
 
   %icmp0 = icmp ugt i16 %a, %b
   %t0 = select i1 %icmp0, i16 %a, i16 %b
@@ -204,7 +204,7 @@ define void @v_sad_u32_i16_pat1(i16 addrspace(1)* %out, i16 %a, i16 %b, i16 %c)
 
 ; GCN-LABEL: {{^}}v_sad_u32_i16_pat2:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) {
+define amdgpu_kernel void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b, i16 zeroext %c) {
   %icmp0 = icmp ugt i16 %a, %b
   %sub0 = sub i16 %a, %b
   %sub1 = sub i16 %b, %a
@@ -218,7 +218,7 @@ define void @v_sad_u32_i16_pat2(i16 addrspace(1)* %out, i16 zeroext %a, i16 zero
 
 ; GCN-LABEL: {{^}}v_sad_u32_i8_pat1:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
+define amdgpu_kernel void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
   %icmp0 = icmp ugt i8 %a, %b
   %t0 = select i1 %icmp0, i8 %a, i8 %b
 
@@ -234,7 +234,7 @@ define void @v_sad_u32_i8_pat1(i8 addrspace(1)* %out, i8 %a, i8 %b, i8 %c) {
 
 ; GCN-LABEL: {{^}}v_sad_u32_i8_pat2:
 ; GCN: v_sad_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
+define amdgpu_kernel void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext %b, i8 zeroext %c) {
   %icmp0 = icmp ugt i8 %a, %b
   %sub0 = sub i8 %a, %b
   %sub1 = sub i8 %b, %a
@@ -251,7 +251,7 @@ define void @v_sad_u32_i8_pat2(i8 addrspace(1)* %out, i8 zeroext %a, i8 zeroext
 ; GCN: s_max_u32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
   %icmp0 = icmp ugt i32 %a, %b
   %t0 = select i1 %icmp0, i32 %a, i32 %b
 
@@ -269,7 +269,7 @@ define void @v_sad_u32_mismatched_operands_pat1(i32 addrspace(1)* %out, i32 %a,
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: s_sub_i32 s{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, s{{[0-9]+}}, v{{[0-9]+}}
-define void @v_sad_u32_mismatched_operands_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @v_sad_u32_mismatched_operands_pat2(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
   %icmp0 = icmp ugt i32 %a, %b
   %sub0 = sub i32 %a, %d
   %sub1 = sub i32 %b, %a
diff --git a/test/CodeGen/AMDGPU/saddo.ll b/test/CodeGen/AMDGPU/saddo.ll
index f8ced7942a60bbb3f72206a35d0ab601d62dde11..586a455b2b91ed918552d65de3d514e4baf98d47 100644
--- a/test/CodeGen/AMDGPU/saddo.ll
+++ b/test/CodeGen/AMDGPU/saddo.ll
@@ -6,7 +6,7 @@ declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32) nounwind readnone
 declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
 
 ; FUNC-LABEL: {{^}}saddo_i64_zext:
-define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %sadd, 0
   %carry = extractvalue { i64, i1 } %sadd, 1
@@ -17,7 +17,7 @@ define void @saddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}s_saddo_i32:
-define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
   %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %sadd, 0
   %carry = extractvalue { i32, i1 } %sadd, 1
@@ -27,7 +27,7 @@ define void @s_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
 }
 
 ; FUNC-LABEL: {{^}}v_saddo_i32:
-define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %b = load i32, i32 addrspace(1)* %bptr, align 4
   %sadd = call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %a, i32 %b) nounwind
@@ -39,7 +39,7 @@ define void @v_saddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
 }
 
 ; FUNC-LABEL: {{^}}s_saddo_i64:
-define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %sadd, 0
   %carry = extractvalue { i64, i1 } %sadd, 1
@@ -51,7 +51,7 @@ define void @s_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64
 ; FUNC-LABEL: {{^}}v_saddo_i64:
 ; SI: v_add_i32
 ; SI: v_addc_u32
-define void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_saddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %a = load i64, i64 addrspace(1)* %aptr, align 4
   %b = load i64, i64 addrspace(1)* %bptr, align 4
   %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind
diff --git a/test/CodeGen/AMDGPU/salu-to-valu.ll b/test/CodeGen/AMDGPU/salu-to-valu.ll
index 37083fbbd3c5e23d2ff8a78556e45a24e7e95d66..6e1dd163833378628ccad6e8d3d9e21ef3487fe0 100644
--- a/test/CodeGen/AMDGPU/salu-to-valu.ll
+++ b/test/CodeGen/AMDGPU/salu-to-valu.ll
@@ -24,7 +24,7 @@ declare i32 @llvm.amdgcn.workitem.id.y() #0
 ; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}
 ; GCN-HSA: flat_load_ubyte v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}
 
-define void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @mubuf(i32 addrspace(1)* %out, i8 addrspace(1)* %in) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = call i32 @llvm.amdgcn.workitem.id.y()
@@ -55,17 +55,17 @@ done:                                             ; preds = %loop
 
 ; GCN-LABEL: {{^}}smrd_valu:
 ; SI: s_movk_i32 [[OFFSET:s[0-9]+]], 0x2ee0
+; SI: s_mov_b32
 ; GCN: v_readfirstlane_b32 s[[PTR_LO:[0-9]+]], v{{[0-9]+}}
 ; GCN: v_readfirstlane_b32 s[[PTR_HI:[0-9]+]], v{{[0-9]+}}
 ; SI: s_nop 3
 ; SI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, [[OFFSET]]
-; SI: s_mov_b32
 
 ; CI: s_load_dword [[OUT:s[0-9]+]], s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0xbb8
 ; GCN: v_mov_b32_e32 [[V_OUT:v[0-9]+]], [[OUT]]
 ; GCN-NOHSA: buffer_store_dword [[V_OUT]]
 ; GCN-HSA: flat_store_dword {{.*}}, [[V_OUT]]
-define void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
+define amdgpu_kernel void @smrd_valu(i32 addrspace(2)* addrspace(1)* %in, i32 %a, i32 %b, i32 addrspace(1)* %out) #1 {
 entry:
   %tmp = icmp ne i32 %a, 0
   br i1 %tmp, label %if, label %else
@@ -93,7 +93,7 @@ endif:                                            ; preds = %else, %if
 ; GCN-NOHSA-NOT: v_add
 ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s[{{[0-9]+:[0-9]+}}], 0 addr64 offset:16{{$}}
 ; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
-define void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 {
+define amdgpu_kernel void @smrd_valu2(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = add i32 %tmp, 4
@@ -113,7 +113,7 @@ entry:
 ; GCN-NOHSA: buffer_store_dword
 ; GCN-HSA: flat_load_dword v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}]
 ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
-define void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset(i32 addrspace(1)* %out, i32 addrspace(2)* %in, i32 %c) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = getelementptr i32, i32 addrspace(2)* %in, i32 %tmp
@@ -133,7 +133,7 @@ entry:
 ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; GCN-NOHSA: buffer_store_dwordx2
 ; GCN-HSA: flat_load_dwordx2 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x2(i64 addrspace(1)* %out, i64 addrspace(2)* %in, i64 %c) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = getelementptr i64, i64 addrspace(2)* %in, i32 %tmp
@@ -155,7 +155,7 @@ entry:
 ; GCN-NOHSA: v_or_b32_e32 {{v[0-9]+}}, {{s[0-9]+}}, {{v[0-9]+}}
 ; GCN-NOHSA: buffer_store_dwordx4
 ; GCN-HSA: flat_load_dwordx4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}]
-define void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(2)* %in, <4 x i32> %c) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = getelementptr <4 x i32>, <4 x i32> addrspace(2)* %in, i32 %tmp
@@ -189,7 +189,7 @@ entry:
 ; GCN-NOHSA: buffer_store_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x8(<8 x i32> addrspace(1)* %out, <8 x i32> addrspace(2)* %in, <8 x i32> %c) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = getelementptr <8 x i32>, <8 x i32> addrspace(2)* %in, i32 %tmp
@@ -230,7 +230,7 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 
 ; GCN: s_endpgm
-define void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 {
+define amdgpu_kernel void @smrd_valu_ci_offset_x16(<16 x i32> addrspace(1)* %out, <16 x i32> addrspace(2)* %in, <16 x i32> %c) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = getelementptr <16 x i32>, <16 x i32> addrspace(2)* %in, i32 %tmp
@@ -247,7 +247,7 @@ entry:
 ; GCN: v_add_i32_e32 [[ADD:v[0-9]+]], vcc, s{{[0-9]+}}, [[MOVED]]
 ; GCN-NOHSA: buffer_store_dword [[ADD]]
 ; GCN-HSA: flat_store_dword {{.*}}, [[ADD]]
-define void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 {
+define amdgpu_kernel void @smrd_valu2_salu_user(i32 addrspace(1)* %out, [8 x i32] addrspace(2)* %in, i32 %a) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = add i32 %tmp, 4
@@ -261,7 +261,7 @@ entry:
 ; GCN-LABEL: {{^}}smrd_valu2_max_smrd_offset:
 ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1020{{$}}
 ; GCN-HSA flat_load_dword v{{[0-9]}}, v{{[0-9]+:[0-9]+}}
-define void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
+define amdgpu_kernel void @smrd_valu2_max_smrd_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = add i32 %tmp, 4
@@ -275,7 +275,7 @@ entry:
 ; GCN-NOHSA-NOT: v_add
 ; GCN-NOHSA: buffer_load_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:1024{{$}}
 ; GCN-HSA: flat_load_dword v{{[0-9]}}, v[{{[0-9]+:[0-9]+}}]
-define void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
+define amdgpu_kernel void @smrd_valu2_mubuf_offset(i32 addrspace(1)* %out, [1024 x i32] addrspace(2)* %in) #1 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = add i32 %tmp, 4
@@ -290,7 +290,7 @@ entry:
 ; GCN-NOHSA: buffer_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v8i32(<8 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
 entry:
   %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
@@ -313,7 +313,7 @@ entry:
 ; GCN-NOHSA: buffer_store_dword
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v8i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
 entry:
   %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
@@ -350,7 +350,7 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v16i32(<16 x i32> addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
 entry:
   %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
@@ -385,7 +385,7 @@ entry:
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
 ; GCN-HSA: flat_load_dwordx4
-define void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
+define amdgpu_kernel void @s_load_imm_v16i32_salu_user(i32 addrspace(1)* %out, i32 addrspace(2)* nocapture readonly %in) #1 {
 entry:
   %tmp0 = tail call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = getelementptr inbounds i32, i32 addrspace(2)* %in, i32 %tmp0
@@ -439,9 +439,9 @@ entry:
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN-NOHSA: buffer_store_dword [[ONE]]
 ; GCN-HSA: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[ONE]]
-; GCN; {{^}}[[EXIT]]:
+; GCN: {{^}}[[EXIT]]:
 ; GCN: s_endpgm
-define void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sopc_vopc_legalize_bug(i32 %cond, i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 bb3:                                              ; preds = %bb2
   %tmp0 = bitcast i32 %cond to float
   %tmp1 = fadd float %tmp0, 2.500000e-01
@@ -459,7 +459,7 @@ bb7:                                              ; preds = %bb3
 
 ; GCN-LABEL: {{^}}phi_visit_order:
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 1, v{{[0-9]+}}
-define void @phi_visit_order() {
+define amdgpu_kernel void @phi_visit_order() {
 bb:
   br label %bb1
 
@@ -484,7 +484,7 @@ bb4:
 ; GCN: [[LOOP_LABEL:[0-9a-zA-Z_]+]]:
 ; GCN: s_xor_b32 [[B]], [[B]], [[A]]
 ; GCN: s_cbranch_scc{{[01]}} [[LOOP_LABEL]]
-define void @phi_imm_in_sgprs(i32 addrspace(3)* %out, i32 %cond) {
+define amdgpu_kernel void @phi_imm_in_sgprs(i32 addrspace(3)* %out, i32 %cond) {
 entry:
   br label %loop
 
diff --git a/test/CodeGen/AMDGPU/sampler-resource-id.ll b/test/CodeGen/AMDGPU/sampler-resource-id.ll
index c41d345369bf62fb9ec25a5bafb03b59c7a6abe0..4ea503bf6098ed434719929de6ca412012684229 100644
--- a/test/CodeGen/AMDGPU/sampler-resource-id.ll
+++ b/test/CodeGen/AMDGPU/sampler-resource-id.ll
@@ -5,7 +5,7 @@
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 0(
-define void @test_0(i32 %in0, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_0(i32 %in0, i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in0) #0
   store i32 %0, i32 addrspace(1)* %out
@@ -17,7 +17,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 1(
-define void @test_1(i32 %in0, i32 %in1, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_1(i32 %in0, i32 %in1, i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in1) #0
   store i32 %0, i32 addrspace(1)* %out
@@ -29,7 +29,7 @@ entry:
 ; EG: MOV [[VAL]], literal.x
 ; EG-NEXT: LSHR
 ; EG-NEXT: 2(
-define void @test_2(i32 %in0, i32 %in1, i32 %in2, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @test_2(i32 %in0, i32 %in1, i32 %in2, i32 addrspace(1)* %out) {
 entry:
   %0 = call i32 @llvm.OpenCL.sampler.get.resource.id(i32 %in2) #0
   store i32 %0, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir b/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir
index af71086e542fde588741bada85eae27262a721dd..5bee36d878ebc285beac28b41aad9f8a34f31e4c 100644
--- a/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir
+++ b/test/CodeGen/AMDGPU/scalar-store-cache-flush.mir
@@ -1,23 +1,23 @@
 # RUN: llc -march=amdgcn -run-pass si-insert-waits %s -o - | FileCheck %s
 
 --- |
-  define void @basic_insert_dcache_wb() {
+  define amdgpu_kernel void @basic_insert_dcache_wb() {
     ret void
   }
 
-  define void @explicit_flush_after() {
+  define amdgpu_kernel void @explicit_flush_after() {
     ret void
   }
 
-  define void @explicit_flush_before() {
+  define amdgpu_kernel void @explicit_flush_before() {
     ret void
   }
 
-  define void @no_scalar_store() {
+  define amdgpu_kernel void @no_scalar_store() {
     ret void
   }
 
-  define void @multi_block_store() {
+  define amdgpu_kernel void @multi_block_store() {
   bb0:
     br i1 undef, label %bb1, label %bb2
 
@@ -28,7 +28,7 @@
     ret void
   }
 
-  define void @one_block_store() {
+  define amdgpu_kernel void @one_block_store() {
   bb0:
     br i1 undef, label %bb1, label %bb2
 
@@ -169,5 +169,5 @@ tracksRegLiveness: false
 body: |
   bb.0:
     S_STORE_DWORD_SGPR undef %sgpr2, undef %sgpr0_sgpr1, undef %m0, 0
-    SI_RETURN undef %vgpr0
+    SI_RETURN_TO_EPILOG undef %vgpr0
 ...
diff --git a/test/CodeGen/AMDGPU/scalar_to_vector.ll b/test/CodeGen/AMDGPU/scalar_to_vector.ll
index 32df16778a9190d8686629bbca03376538256dd1..62d0d936788580b9f99af8f7d7e81da0c6868b00 100644
--- a/test/CodeGen/AMDGPU/scalar_to_vector.ll
+++ b/test/CodeGen/AMDGPU/scalar_to_vector.ll
@@ -1,15 +1,15 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
 
 ; XXX - Why the packing?
-; FUNC-LABEL: {{^}}scalar_to_vector_v2i32:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, [[VAL]]
-; SI: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[SHR]]
-; SI: v_or_b32_e32 v[[OR:[0-9]+]], [[SHL]], [[SHR]]
-; SI: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]]
-; SI: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}}
-define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+; GCN-LABEL: {{^}}scalar_to_vector_v2i32:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: v_lshrrev_b32_e32 [[SHR:v[0-9]+]], 16, [[VAL]]
+; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[SHR]]
+; GCN: v_or_b32_e32 v[[OR:[0-9]+]], [[SHL]], [[SHR]]
+; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[OR]]
+; GCN: buffer_store_dwordx2 v{{\[}}[[OR]]:[[COPY]]{{\]}}
+define amdgpu_kernel void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %tmp1 = load i32, i32 addrspace(1)* %in, align 4
   %bc = bitcast i32 %tmp1 to <2 x i16>
   %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -17,11 +17,11 @@ define void @scalar_to_vector_v2i32(<4 x i16> addrspace(1)* %out, i32 addrspace(
   ret void
 }
 
-; FUNC-LABEL: {{^}}scalar_to_vector_v2f32:
-; SI: buffer_load_dword [[VAL:v[0-9]+]],
-; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
-; SI: buffer_store_dwordx2
-define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
+; GCN-LABEL: {{^}}scalar_to_vector_v2f32:
+; GCN: buffer_load_dword [[VAL:v[0-9]+]],
+; GCN: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 16, [[VAL]]
+; GCN: buffer_store_dwordx2
+define amdgpu_kernel void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspace(1)* %in) nounwind {
   %tmp1 = load float, float addrspace(1)* %in, align 4
   %bc = bitcast float %tmp1 to <2 x i16>
   %tmp2 = shufflevector <2 x i16> %bc, <2 x i16> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
@@ -33,7 +33,7 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac
 ; to produce one, but for some reason never made it to selection.
 
 
-; define void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+; define amdgpu_kernel void @scalar_to_vector_test2(<8 x i8> addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
 ;   %tmp1 = load i32, i32 addrspace(1)* %in, align 4
 ;   %bc = bitcast i32 %tmp1 to <4 x i8>
 
@@ -42,7 +42,7 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac
 ;   ret void
 ; }
 
-; define void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind {
+; define amdgpu_kernel void @scalar_to_vector_test3(<4 x i32> addrspace(1)* %out) nounwind {
 ;   %newvec0 = insertelement <2 x i64> undef, i64 12345, i32 0
 ;   %newvec1 = insertelement <2 x i64> %newvec0, i64 undef, i32 1
 ;   %bc = bitcast <2 x i64> %newvec1 to <4 x i32>
@@ -51,7 +51,7 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac
 ;   ret void
 ; }
 
-; define void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind {
+; define amdgpu_kernel void @scalar_to_vector_test4(<8 x i16> addrspace(1)* %out) nounwind {
 ;   %newvec0 = insertelement <4 x i32> undef, i32 12345, i32 0
 ;   %bc = bitcast <4 x i32> %newvec0 to <8 x i16>
 ;   %add = add <8 x i16> %bc, <i16 1, i16 2, i16 3, i16 4, i16 1, i16 2, i16 3, i16 4>
@@ -59,7 +59,7 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac
 ;   ret void
 ; }
 
-; define void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind {
+; define amdgpu_kernel void @scalar_to_vector_test5(<4 x i16> addrspace(1)* %out) nounwind {
 ;   %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0
 ;   %bc = bitcast <2 x i32> %newvec0 to <4 x i16>
 ;   %add = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
@@ -67,10 +67,9 @@ define void @scalar_to_vector_v2f32(<4 x i16> addrspace(1)* %out, float addrspac
 ;   ret void
 ; }
 
-; define void @scalar_to_vector_test6(<4 x i16> addrspace(1)* %out) nounwind {
-;   %newvec0 = insertelement <2 x i32> undef, i32 12345, i32 0
-;   %bc = bitcast <2 x i32> %newvec0 to <4 x i16>
-;   %add = add <4 x i16> %bc, <i16 1, i16 2, i16 3, i16 4>
-;   store <4 x i16> %add, <4 x i16> addrspace(1)* %out, align 16
-;   ret void
-; }
+define amdgpu_kernel void @scalar_to_vector_test6(<2 x half> addrspace(1)* %out, i8 zeroext %val) nounwind {
+  %newvec0 = insertelement <4 x i8> undef, i8 %val, i32 0
+  %bc = bitcast <4 x i8> %newvec0 to <2 x half>
+  store <2 x half> %bc, <2 x half> addrspace(1)* %out
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll b/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
index e040639a2d945818907532342b08d19851714196..60abd83546d3ce9f6a67ad12f4d8ed0a73c7be32 100644
--- a/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
+++ b/test/CodeGen/AMDGPU/schedule-fs-loop-nested-if.ll
@@ -1,81 +1,85 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
-;REQUIRES: asserts
+; RUN: llc -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs < %s
+; REQUIRES: asserts
 
-define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:
-  %0 = extractelement <4 x float> %reg1, i32 0
-  %1 = extractelement <4 x float> %reg1, i32 1
-  %2 = extractelement <4 x float> %reg1, i32 2
-  %3 = extractelement <4 x float> %reg1, i32 3
-  %4 = fcmp ult float %1, 0.000000e+00
-  %5 = select i1 %4, float 1.000000e+00, float 0.000000e+00
-  %6 = fsub float -0.000000e+00, %5
-  %7 = fptosi float %6 to i32
-  %8 = bitcast i32 %7 to float
-  %9 = fcmp ult float %0, 5.700000e+01
-  %10 = select i1 %9, float 1.000000e+00, float 0.000000e+00
-  %11 = fsub float -0.000000e+00, %10
-  %12 = fptosi float %11 to i32
-  %13 = bitcast i32 %12 to float
-  %14 = bitcast float %8 to i32
-  %15 = bitcast float %13 to i32
-  %16 = and i32 %14, %15
-  %17 = bitcast i32 %16 to float
-  %18 = bitcast float %17 to i32
-  %19 = icmp ne i32 %18, 0
-  %20 = fcmp ult float %0, 0.000000e+00
-  %21 = select i1 %20, float 1.000000e+00, float 0.000000e+00
-  %22 = fsub float -0.000000e+00, %21
-  %23 = fptosi float %22 to i32
-  %24 = bitcast i32 %23 to float
-  %25 = bitcast float %24 to i32
-  %26 = icmp ne i32 %25, 0
-  br i1 %19, label %IF, label %ELSE
+  %tmp = extractelement <4 x float> %reg1, i32 0
+  %tmp5 = extractelement <4 x float> %reg1, i32 1
+  %tmp6 = extractelement <4 x float> %reg1, i32 2
+  %tmp7 = extractelement <4 x float> %reg1, i32 3
+  %tmp8 = fcmp ult float %tmp5, 0.000000e+00
+  %tmp9 = select i1 %tmp8, float 1.000000e+00, float 0.000000e+00
+  %tmp10 = fsub float -0.000000e+00, %tmp9
+  %tmp11 = fptosi float %tmp10 to i32
+  %tmp12 = bitcast i32 %tmp11 to float
+  %tmp13 = fcmp ult float %tmp, 5.700000e+01
+  %tmp14 = select i1 %tmp13, float 1.000000e+00, float 0.000000e+00
+  %tmp15 = fsub float -0.000000e+00, %tmp14
+  %tmp16 = fptosi float %tmp15 to i32
+  %tmp17 = bitcast i32 %tmp16 to float
+  %tmp18 = bitcast float %tmp12 to i32
+  %tmp19 = bitcast float %tmp17 to i32
+  %tmp20 = and i32 %tmp18, %tmp19
+  %tmp21 = bitcast i32 %tmp20 to float
+  %tmp22 = bitcast float %tmp21 to i32
+  %tmp23 = icmp ne i32 %tmp22, 0
+  %tmp24 = fcmp ult float %tmp, 0.000000e+00
+  %tmp25 = select i1 %tmp24, float 1.000000e+00, float 0.000000e+00
+  %tmp26 = fsub float -0.000000e+00, %tmp25
+  %tmp27 = fptosi float %tmp26 to i32
+  %tmp28 = bitcast i32 %tmp27 to float
+  %tmp29 = bitcast float %tmp28 to i32
+  %tmp30 = icmp ne i32 %tmp29, 0
+  br i1 %tmp23, label %IF, label %ELSE
 
 IF:                                               ; preds = %main_body
-  %. = select i1 %26, float 0.000000e+00, float 1.000000e+00
-  %.18 = select i1 %26, float 1.000000e+00, float 0.000000e+00
+  %. = select i1 %tmp30, float 0.000000e+00, float 1.000000e+00
+  %.18 = select i1 %tmp30, float 1.000000e+00, float 0.000000e+00
   br label %ENDIF
 
 ELSE:                                             ; preds = %main_body
-  br i1 %26, label %ENDIF, label %ELSE17
+  br i1 %tmp30, label %ENDIF, label %ELSE17
 
 ENDIF:                                            ; preds = %ELSE17, %ELSE, %IF
-  %temp1.0 = phi float [ %., %IF ], [ %48, %ELSE17 ], [ 0.000000e+00, %ELSE ]
-  %temp2.0 = phi float [ 0.000000e+00, %IF ], [ %49, %ELSE17 ], [ 1.000000e+00, %ELSE ]
-  %temp.0 = phi float [ %.18, %IF ], [ %47, %ELSE17 ], [ 0.000000e+00, %ELSE ]
-  %27 = call float @llvm.AMDGPU.clamp.f32(float %temp.0, float 0.000000e+00, float 1.000000e+00)
-  %28 = call float @llvm.AMDGPU.clamp.f32(float %temp1.0, float 0.000000e+00, float 1.000000e+00)
-  %29 = call float @llvm.AMDGPU.clamp.f32(float %temp2.0, float 0.000000e+00, float 1.000000e+00)
-  %30 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
-  %31 = insertelement <4 x float> undef, float %27, i32 0
-  %32 = insertelement <4 x float> %31, float %28, i32 1
-  %33 = insertelement <4 x float> %32, float %29, i32 2
-  %34 = insertelement <4 x float> %33, float %30, i32 3
-  call void @llvm.r600.store.swizzle(<4 x float> %34, i32 0, i32 0)
+  %temp1.0 = phi float [ %., %IF ], [ %tmp48, %ELSE17 ], [ 0.000000e+00, %ELSE ]
+  %temp2.0 = phi float [ 0.000000e+00, %IF ], [ %tmp49, %ELSE17 ], [ 1.000000e+00, %ELSE ]
+  %temp.0 = phi float [ %.18, %IF ], [ %tmp47, %ELSE17 ], [ 0.000000e+00, %ELSE ]
+  %max.0.i = call float @llvm.maxnum.f32(float %temp.0, float 0.000000e+00)
+  %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00)
+  %max.0.i3 = call float @llvm.maxnum.f32(float %temp1.0, float 0.000000e+00)
+  %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00)
+  %max.0.i1 = call float @llvm.maxnum.f32(float %temp2.0, float 0.000000e+00)
+  %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00)
+  %tmp31 = insertelement <4 x float> undef, float %clamp.i, i32 0
+  %tmp32 = insertelement <4 x float> %tmp31, float %clamp.i4, i32 1
+  %tmp33 = insertelement <4 x float> %tmp32, float %clamp.i2, i32 2
+  %tmp34 = insertelement <4 x float> %tmp33, float 1.000000e+00, i32 3
+  call void @llvm.r600.store.swizzle(<4 x float> %tmp34, i32 0, i32 0)
   ret void
 
 ELSE17:                                           ; preds = %ELSE
-  %35 = fadd float 0.000000e+00, 0x3FC99999A0000000
-  %36 = fadd float 0.000000e+00, 0x3FC99999A0000000
-  %37 = fadd float 0.000000e+00, 0x3FC99999A0000000
-  %38 = fadd float %35, 0x3FC99999A0000000
-  %39 = fadd float %36, 0x3FC99999A0000000
-  %40 = fadd float %37, 0x3FC99999A0000000
-  %41 = fadd float %38, 0x3FC99999A0000000
-  %42 = fadd float %39, 0x3FC99999A0000000
-  %43 = fadd float %40, 0x3FC99999A0000000
-  %44 = fadd float %41, 0x3FC99999A0000000
-  %45 = fadd float %42, 0x3FC99999A0000000
-  %46 = fadd float %43, 0x3FC99999A0000000
-  %47 = fadd float %44, 0x3FC99999A0000000
-  %48 = fadd float %45, 0x3FC99999A0000000
-  %49 = fadd float %46, 0x3FC99999A0000000
+  %tmp35 = fadd float 0.000000e+00, 0x3FC99999A0000000
+  %tmp36 = fadd float 0.000000e+00, 0x3FC99999A0000000
+  %tmp37 = fadd float 0.000000e+00, 0x3FC99999A0000000
+  %tmp38 = fadd float %tmp35, 0x3FC99999A0000000
+  %tmp39 = fadd float %tmp36, 0x3FC99999A0000000
+  %tmp40 = fadd float %tmp37, 0x3FC99999A0000000
+  %tmp41 = fadd float %tmp38, 0x3FC99999A0000000
+  %tmp42 = fadd float %tmp39, 0x3FC99999A0000000
+  %tmp43 = fadd float %tmp40, 0x3FC99999A0000000
+  %tmp44 = fadd float %tmp41, 0x3FC99999A0000000
+  %tmp45 = fadd float %tmp42, 0x3FC99999A0000000
+  %tmp46 = fadd float %tmp43, 0x3FC99999A0000000
+  %tmp47 = fadd float %tmp44, 0x3FC99999A0000000
+  %tmp48 = fadd float %tmp45, 0x3FC99999A0000000
+  %tmp49 = fadd float %tmp46, 0x3FC99999A0000000
   br label %ENDIF
 }
 
-declare float @llvm.AMDGPU.clamp.f32(float, float, float) #0
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
 
 declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
 
-attributes #0 = { readnone }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll b/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
index f907e154f9625ad5e8d2deb6cc0e0df66860285e..177957c0b35b8f55570648253104a2f7ef95db78 100644
--- a/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
+++ b/test/CodeGen/AMDGPU/schedule-fs-loop-nested.ll
@@ -1,88 +1,91 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
-;REQUIRES: asserts
+; RUN: llc -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs < %s
+; REQUIRES: asserts
 
-define void @main() {
+define amdgpu_kernel void @main() #0 {
 main_body:
-  %0 = load <4 x float>, <4 x float> addrspace(9)* null
-  %1 = extractelement <4 x float> %0, i32 3
-  %2 = fptosi float %1 to i32
-  %3 = bitcast i32 %2 to float
-  %4 = bitcast float %3 to i32
-  %5 = sdiv i32 %4, 4
-  %6 = bitcast i32 %5 to float
-  %7 = bitcast float %6 to i32
-  %8 = mul i32 %7, 4
-  %9 = bitcast i32 %8 to float
-  %10 = bitcast float %9 to i32
-  %11 = sub i32 0, %10
-  %12 = bitcast i32 %11 to float
-  %13 = bitcast float %3 to i32
-  %14 = bitcast float %12 to i32
-  %15 = add i32 %13, %14
-  %16 = bitcast i32 %15 to float
-  %17 = load <4 x float>, <4 x float> addrspace(9)* null
-  %18 = extractelement <4 x float> %17, i32 0
-  %19 = load <4 x float>, <4 x float> addrspace(9)* null
-  %20 = extractelement <4 x float> %19, i32 1
-  %21 = load <4 x float>, <4 x float> addrspace(9)* null
-  %22 = extractelement <4 x float> %21, i32 2
+  %tmp = load <4 x float>, <4 x float> addrspace(9)* null
+  %tmp5 = extractelement <4 x float> %tmp, i32 3
+  %tmp6 = fptosi float %tmp5 to i32
+  %tmp7 = bitcast i32 %tmp6 to float
+  %tmp8 = bitcast float %tmp7 to i32
+  %tmp9 = sdiv i32 %tmp8, 4
+  %tmp10 = bitcast i32 %tmp9 to float
+  %tmp11 = bitcast float %tmp10 to i32
+  %tmp12 = mul i32 %tmp11, 4
+  %tmp13 = bitcast i32 %tmp12 to float
+  %tmp14 = bitcast float %tmp13 to i32
+  %tmp15 = sub i32 0, %tmp14
+  %tmp16 = bitcast i32 %tmp15 to float
+  %tmp17 = bitcast float %tmp7 to i32
+  %tmp18 = bitcast float %tmp16 to i32
+  %tmp19 = add i32 %tmp17, %tmp18
+  %tmp20 = bitcast i32 %tmp19 to float
+  %tmp21 = load <4 x float>, <4 x float> addrspace(9)* null
+  %tmp22 = extractelement <4 x float> %tmp21, i32 0
+  %tmp23 = load <4 x float>, <4 x float> addrspace(9)* null
+  %tmp24 = extractelement <4 x float> %tmp23, i32 1
+  %tmp25 = load <4 x float>, <4 x float> addrspace(9)* null
+  %tmp26 = extractelement <4 x float> %tmp25, i32 2
   br label %LOOP
 
 LOOP:                                             ; preds = %IF31, %main_body
-  %temp12.0 = phi float [ 0.000000e+00, %main_body ], [ %47, %IF31 ]
-  %temp6.0 = phi float [ %22, %main_body ], [ %temp6.1, %IF31 ]
-  %temp5.0 = phi float [ %20, %main_body ], [ %temp5.1, %IF31 ]
-  %temp4.0 = phi float [ %18, %main_body ], [ %temp4.1, %IF31 ]
-  %23 = bitcast float %temp12.0 to i32
-  %24 = bitcast float %6 to i32
-  %25 = icmp sge i32 %23, %24
-  %26 = sext i1 %25 to i32
-  %27 = bitcast i32 %26 to float
-  %28 = bitcast float %27 to i32
-  %29 = icmp ne i32 %28, 0
-  br i1 %29, label %IF, label %LOOP29
+  %temp12.0 = phi float [ 0.000000e+00, %main_body ], [ %tmp47, %IF31 ]
+  %temp6.0 = phi float [ %tmp26, %main_body ], [ %temp6.1, %IF31 ]
+  %temp5.0 = phi float [ %tmp24, %main_body ], [ %temp5.1, %IF31 ]
+  %temp4.0 = phi float [ %tmp22, %main_body ], [ %temp4.1, %IF31 ]
+  %tmp27 = bitcast float %temp12.0 to i32
+  %tmp28 = bitcast float %tmp10 to i32
+  %tmp29 = icmp sge i32 %tmp27, %tmp28
+  %tmp30 = sext i1 %tmp29 to i32
+  %tmp31 = bitcast i32 %tmp30 to float
+  %tmp32 = bitcast float %tmp31 to i32
+  %tmp33 = icmp ne i32 %tmp32, 0
+  br i1 %tmp33, label %IF, label %LOOP29
 
 IF:                                               ; preds = %LOOP
-  %30 = call float @llvm.AMDGPU.clamp.f32(float %temp4.0, float 0.000000e+00, float 1.000000e+00)
-  %31 = call float @llvm.AMDGPU.clamp.f32(float %temp5.0, float 0.000000e+00, float 1.000000e+00)
-  %32 = call float @llvm.AMDGPU.clamp.f32(float %temp6.0, float 0.000000e+00, float 1.000000e+00)
-  %33 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
-  %34 = insertelement <4 x float> undef, float %30, i32 0
-  %35 = insertelement <4 x float> %34, float %31, i32 1
-  %36 = insertelement <4 x float> %35, float %32, i32 2
-  %37 = insertelement <4 x float> %36, float %33, i32 3
-  call void @llvm.r600.store.swizzle(<4 x float> %37, i32 0, i32 0)
+  %max.0.i = call float @llvm.maxnum.f32(float %temp4.0, float 0.000000e+00)
+  %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00)
+  %max.0.i3 = call float @llvm.maxnum.f32(float %temp5.0, float 0.000000e+00)
+  %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00)
+  %max.0.i1 = call float @llvm.maxnum.f32(float %temp6.0, float 0.000000e+00)
+  %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00)
+  %tmp34 = insertelement <4 x float> undef, float %clamp.i, i32 0
+  %tmp35 = insertelement <4 x float> %tmp34, float %clamp.i4, i32 1
+  %tmp36 = insertelement <4 x float> %tmp35, float %clamp.i2, i32 2
+  %tmp37 = insertelement <4 x float> %tmp36, float 1.000000e+00, i32 3
+  call void @llvm.r600.store.swizzle(<4 x float> %tmp37, i32 0, i32 0)
   ret void
 
-LOOP29:                                           ; preds = %LOOP, %ENDIF30
+LOOP29:                                           ; preds = %ENDIF30, %LOOP
   %temp6.1 = phi float [ %temp4.1, %ENDIF30 ], [ %temp6.0, %LOOP ]
   %temp5.1 = phi float [ %temp6.1, %ENDIF30 ], [ %temp5.0, %LOOP ]
   %temp4.1 = phi float [ %temp5.1, %ENDIF30 ], [ %temp4.0, %LOOP ]
-  %temp20.0 = phi float [ %50, %ENDIF30 ], [ 0.000000e+00, %LOOP ]
-  %38 = bitcast float %temp20.0 to i32
-  %39 = bitcast float %16 to i32
-  %40 = icmp sge i32 %38, %39
-  %41 = sext i1 %40 to i32
-  %42 = bitcast i32 %41 to float
-  %43 = bitcast float %42 to i32
-  %44 = icmp ne i32 %43, 0
-  br i1 %44, label %IF31, label %ENDIF30
+  %temp20.0 = phi float [ %tmp50, %ENDIF30 ], [ 0.000000e+00, %LOOP ]
+  %tmp38 = bitcast float %temp20.0 to i32
+  %tmp39 = bitcast float %tmp20 to i32
+  %tmp40 = icmp sge i32 %tmp38, %tmp39
+  %tmp41 = sext i1 %tmp40 to i32
+  %tmp42 = bitcast i32 %tmp41 to float
+  %tmp43 = bitcast float %tmp42 to i32
+  %tmp44 = icmp ne i32 %tmp43, 0
+  br i1 %tmp44, label %IF31, label %ENDIF30
 
 IF31:                                             ; preds = %LOOP29
-  %45 = bitcast float %temp12.0 to i32
-  %46 = add i32 %45, 1
-  %47 = bitcast i32 %46 to float
+  %tmp45 = bitcast float %temp12.0 to i32
+  %tmp46 = add i32 %tmp45, 1
+  %tmp47 = bitcast i32 %tmp46 to float
   br label %LOOP
 
 ENDIF30:                                          ; preds = %LOOP29
-  %48 = bitcast float %temp20.0 to i32
-  %49 = add i32 %48, 1
-  %50 = bitcast i32 %49 to float
+  %tmp48 = bitcast float %temp20.0 to i32
+  %tmp49 = add i32 %tmp48, 1
+  %tmp50 = bitcast i32 %tmp49 to float
   br label %LOOP29
 }
 
-declare float @llvm.AMDGPU.clamp.f32(float, float, float) #0
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) #0
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
 
-declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
-
-attributes #0 = { readnone }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/schedule-fs-loop.ll b/test/CodeGen/AMDGPU/schedule-fs-loop.ll
index 5839785f00d562f8259f59996f6806ea00a3cf98..6cd419f6cfc45b0403f88d476770717b47addf71 100644
--- a/test/CodeGen/AMDGPU/schedule-fs-loop.ll
+++ b/test/CodeGen/AMDGPU/schedule-fs-loop.ll
@@ -1,55 +1,84 @@
-;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
-;REQUIRES: asserts
+; RUN: llc -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs < %s
+; REQUIRES: asserts
 
-define void @main() {
+define amdgpu_vs void @main(<4 x float> inreg %reg0, <4 x float> inreg %reg1) #0 {
 main_body:
-  %0 = load <4 x float>, <4 x float> addrspace(9)* null
-  %1 = extractelement <4 x float> %0, i32 3
-  %2 = fptosi float %1 to i32
-  %3 = bitcast i32 %2 to float
-  %4 = load <4 x float>, <4 x float> addrspace(9)* null
-  %5 = extractelement <4 x float> %4, i32 0
-  %6 = load <4 x float>, <4 x float> addrspace(9)* null
-  %7 = extractelement <4 x float> %6, i32 1
-  %8 = load <4 x float>, <4 x float> addrspace(9)* null
-  %9 = extractelement <4 x float> %8, i32 2
-  br label %LOOP
+  %tmp = extractelement <4 x float> %reg1, i32 0
+  %tmp5 = extractelement <4 x float> %reg1, i32 1
+  %tmp6 = extractelement <4 x float> %reg1, i32 2
+  %tmp7 = extractelement <4 x float> %reg1, i32 3
+  %tmp8 = fcmp ult float %tmp5, 0.000000e+00
+  %tmp9 = select i1 %tmp8, float 1.000000e+00, float 0.000000e+00
+  %tmp10 = fsub float -0.000000e+00, %tmp9
+  %tmp11 = fptosi float %tmp10 to i32
+  %tmp12 = bitcast i32 %tmp11 to float
+  %tmp13 = fcmp ult float %tmp, 5.700000e+01
+  %tmp14 = select i1 %tmp13, float 1.000000e+00, float 0.000000e+00
+  %tmp15 = fsub float -0.000000e+00, %tmp14
+  %tmp16 = fptosi float %tmp15 to i32
+  %tmp17 = bitcast i32 %tmp16 to float
+  %tmp18 = bitcast float %tmp12 to i32
+  %tmp19 = bitcast float %tmp17 to i32
+  %tmp20 = and i32 %tmp18, %tmp19
+  %tmp21 = bitcast i32 %tmp20 to float
+  %tmp22 = bitcast float %tmp21 to i32
+  %tmp23 = icmp ne i32 %tmp22, 0
+  %tmp24 = fcmp ult float %tmp, 0.000000e+00
+  %tmp25 = select i1 %tmp24, float 1.000000e+00, float 0.000000e+00
+  %tmp26 = fsub float -0.000000e+00, %tmp25
+  %tmp27 = fptosi float %tmp26 to i32
+  %tmp28 = bitcast i32 %tmp27 to float
+  %tmp29 = bitcast float %tmp28 to i32
+  %tmp30 = icmp ne i32 %tmp29, 0
+  br i1 %tmp23, label %IF, label %ELSE
 
-LOOP:                                             ; preds = %ENDIF, %main_body
-  %temp4.0 = phi float [ %5, %main_body ], [ %temp5.0, %ENDIF ]
-  %temp5.0 = phi float [ %7, %main_body ], [ %temp6.0, %ENDIF ]
-  %temp6.0 = phi float [ %9, %main_body ], [ %temp4.0, %ENDIF ]
-  %temp8.0 = phi float [ 0.000000e+00, %main_body ], [ %27, %ENDIF ]
-  %10 = bitcast float %temp8.0 to i32
-  %11 = bitcast float %3 to i32
-  %12 = icmp sge i32 %10, %11
-  %13 = sext i1 %12 to i32
-  %14 = bitcast i32 %13 to float
-  %15 = bitcast float %14 to i32
-  %16 = icmp ne i32 %15, 0
-  br i1 %16, label %IF, label %ENDIF
+IF:                                               ; preds = %main_body
+  %. = select i1 %tmp30, float 0.000000e+00, float 1.000000e+00
+  %.18 = select i1 %tmp30, float 1.000000e+00, float 0.000000e+00
+  br label %ENDIF
 
-IF:                                               ; preds = %LOOP
-  %17 = call float @llvm.AMDGPU.clamp.f32(float %temp4.0, float 0.000000e+00, float 1.000000e+00)
-  %18 = call float @llvm.AMDGPU.clamp.f32(float %temp5.0, float 0.000000e+00, float 1.000000e+00)
-  %19 = call float @llvm.AMDGPU.clamp.f32(float %temp6.0, float 0.000000e+00, float 1.000000e+00)
-  %20 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
-  %21 = insertelement <4 x float> undef, float %17, i32 0
-  %22 = insertelement <4 x float> %21, float %18, i32 1
-  %23 = insertelement <4 x float> %22, float %19, i32 2
-  %24 = insertelement <4 x float> %23, float %20, i32 3
-  call void @llvm.r600.store.swizzle(<4 x float> %24, i32 0, i32 0)
+ELSE:                                             ; preds = %main_body
+  br i1 %tmp30, label %ENDIF, label %ELSE17
+
+ENDIF:                                            ; preds = %ELSE17, %ELSE, %IF
+  %temp1.0 = phi float [ %., %IF ], [ %tmp48, %ELSE17 ], [ 0.000000e+00, %ELSE ]
+  %temp2.0 = phi float [ 0.000000e+00, %IF ], [ %tmp49, %ELSE17 ], [ 1.000000e+00, %ELSE ]
+  %temp.0 = phi float [ %.18, %IF ], [ %tmp47, %ELSE17 ], [ 0.000000e+00, %ELSE ]
+  %max.0.i = call float @llvm.maxnum.f32(float %temp.0, float 0.000000e+00)
+  %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00)
+  %max.0.i3 = call float @llvm.maxnum.f32(float %temp1.0, float 0.000000e+00)
+  %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00)
+  %max.0.i1 = call float @llvm.maxnum.f32(float %temp2.0, float 0.000000e+00)
+  %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00)
+  %tmp31 = insertelement <4 x float> undef, float %clamp.i, i32 0
+  %tmp32 = insertelement <4 x float> %tmp31, float %clamp.i4, i32 1
+  %tmp33 = insertelement <4 x float> %tmp32, float %clamp.i2, i32 2
+  %tmp34 = insertelement <4 x float> %tmp33, float 1.000000e+00, i32 3
+  call void @llvm.r600.store.swizzle(<4 x float> %tmp34, i32 0, i32 0)
   ret void
 
-ENDIF:                                            ; preds = %LOOP
-  %25 = bitcast float %temp8.0 to i32
-  %26 = add i32 %25, 1
-  %27 = bitcast i32 %26 to float
-  br label %LOOP
+ELSE17:                                           ; preds = %ELSE
+  %tmp35 = fadd float 0.000000e+00, 0x3FC99999A0000000
+  %tmp36 = fadd float 0.000000e+00, 0x3FC99999A0000000
+  %tmp37 = fadd float 0.000000e+00, 0x3FC99999A0000000
+  %tmp38 = fadd float %tmp35, 0x3FC99999A0000000
+  %tmp39 = fadd float %tmp36, 0x3FC99999A0000000
+  %tmp40 = fadd float %tmp37, 0x3FC99999A0000000
+  %tmp41 = fadd float %tmp38, 0x3FC99999A0000000
+  %tmp42 = fadd float %tmp39, 0x3FC99999A0000000
+  %tmp43 = fadd float %tmp40, 0x3FC99999A0000000
+  %tmp44 = fadd float %tmp41, 0x3FC99999A0000000
+  %tmp45 = fadd float %tmp42, 0x3FC99999A0000000
+  %tmp46 = fadd float %tmp43, 0x3FC99999A0000000
+  %tmp47 = fadd float %tmp44, 0x3FC99999A0000000
+  %tmp48 = fadd float %tmp45, 0x3FC99999A0000000
+  %tmp49 = fadd float %tmp46, 0x3FC99999A0000000
+  br label %ENDIF
 }
 
-declare float @llvm.AMDGPU.clamp.f32(float, float, float) #0
-
-declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32) #0
 
-attributes #0 = { readnone }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/schedule-global-loads.ll b/test/CodeGen/AMDGPU/schedule-global-loads.ll
index 32c456bd2ceb5f473b7e81536ffabdbabd66a4ff..44d46086f02af4b59bfe27b3b1558ff48aec1f86 100644
--- a/test/CodeGen/AMDGPU/schedule-global-loads.ll
+++ b/test/CodeGen/AMDGPU/schedule-global-loads.ll
@@ -10,7 +10,7 @@
 ; SI-DAG: buffer_load_dword [[REG1:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:8
 ; SI: buffer_store_dword [[REG0]]
 ; SI: buffer_store_dword [[REG1]]
-define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr) #0 {
   %load0 = load i32, i32 addrspace(1)* %ptr, align 4
   %gep = getelementptr i32, i32 addrspace(1)* %ptr, i32 2
   %load1 = load i32, i32 addrspace(1)* %gep, align 4
@@ -24,7 +24,7 @@ define void @cluster_global_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)*
 ; FUNC-LABEL: {{^}}same_base_ptr_crash:
 ; SI: buffer_load_dword
 ; SI: buffer_load_dword
-define void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
+define amdgpu_kernel void @same_base_ptr_crash(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %offset) {
 entry:
   %out1 = getelementptr i32, i32 addrspace(1)* %out, i32 %offset
   %tmp0 = load i32, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/schedule-if-2.ll b/test/CodeGen/AMDGPU/schedule-if-2.ll
index aa67b2e0f7dbb87b09cecdb607f1352057265cea..964298a553187786a163c6a36c2b232ba398e6af 100644
--- a/test/CodeGen/AMDGPU/schedule-if-2.ll
+++ b/test/CodeGen/AMDGPU/schedule-if-2.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
 ;REQUIRES: asserts
 
-define void @main() {
+define amdgpu_kernel void @main() {
 main_body:
   %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 2)
   %1 = extractelement <4 x float> %0, i32 0
diff --git a/test/CodeGen/AMDGPU/schedule-if.ll b/test/CodeGen/AMDGPU/schedule-if.ll
index 6637b3897717e9d1f90ccf90f5d5482da31cabfb..feac5d918f63b58d134d5e2db2403c8c19b72dff 100644
--- a/test/CodeGen/AMDGPU/schedule-if.ll
+++ b/test/CodeGen/AMDGPU/schedule-if.ll
@@ -1,7 +1,7 @@
 ;RUN: llc < %s -march=r600 -mcpu=cayman -stress-sched -verify-misched -verify-machineinstrs
 ;REQUIRES: asserts
 
-define void @main() {
+define amdgpu_kernel void @main() {
 main_body:
   %0 = load <4 x float>, <4 x float> addrspace(8)* getelementptr ([1024 x <4 x float>], [1024 x <4 x float>] addrspace(8)* null, i64 0, i32 1)
   %1 = extractelement <4 x float> %0, i32 0
diff --git a/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
index ccfde7b9adc51ead84020d8b1b9b8d7812e69267..5c47c163dcce087e50dd54f14a79a9e9ac64ed6a 100644
--- a/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
+++ b/test/CodeGen/AMDGPU/schedule-kernel-arg-loads.ll
@@ -12,7 +12,7 @@
 ; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x24
 ; VI: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x2c
 ; VI: s_load_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, 0x38
-define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
+define amdgpu_kernel void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) nounwind {
   store i32 %x, i32 addrspace(1)* %out0, align 4
   store i32 %y, i32 addrspace(1)* %out1, align 4
   ret void
@@ -26,7 +26,7 @@ define void @cluster_arg_loads(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1,
 ; GCN: s_load_dwordx2
 ; GCN: s_load_dwordx2
 ; GCN: s_endpgm
-define void @same_base_ptr_crash(i64 addrspace(1)* %out,
+define amdgpu_kernel void @same_base_ptr_crash(i64 addrspace(1)* %out,
     i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %arg5, i64 %arg6, i64 %arg7,
     i64 %arg8, i64 %arg9, i64 %arg10, i64 %arg11, i64 %arg12, i64 %arg13, i64 %arg14, i64 %arg15,
     i64 %arg16, i64 %arg17, i64 %arg18, i64 %arg19, i64 %arg20, i64 %arg21, i64 %arg22, i64 %arg23,
diff --git a/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll b/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4520fe86136f318a2d436f88eb65235c0d1f6068
--- /dev/null
+++ b/test/CodeGen/AMDGPU/schedule-regpressure-limit.ll
@@ -0,0 +1,591 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=tonga -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck %s
+
+; We expect a two digit VGPR usage here, not a three digit.
+; CHECK: NumVgprs: {{[0-9][0-9]$}}
+
+define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %arg, float addrspace(1)* nocapture %arg1) {
+bb:
+  %tmp = getelementptr inbounds float, float addrspace(3)* %arg, i32 1
+  %tmp2 = load float, float addrspace(3)* %tmp, align 4
+  %tmp3 = getelementptr inbounds float, float addrspace(3)* %arg, i32 2
+  %tmp4 = load float, float addrspace(3)* %tmp3, align 4
+  %tmp5 = getelementptr inbounds float, float addrspace(3)* %arg, i32 3
+  %tmp6 = load float, float addrspace(3)* %tmp5, align 4
+  %tmp7 = tail call float @llvm.fmuladd.f32(float %tmp2, float %tmp4, float %tmp6)
+  %tmp8 = getelementptr inbounds float, float addrspace(3)* %arg, i32 5
+  %tmp9 = load float, float addrspace(3)* %tmp8, align 4
+  %tmp10 = getelementptr inbounds float, float addrspace(3)* %arg, i32 6
+  %tmp11 = load float, float addrspace(3)* %tmp10, align 4
+  %tmp12 = getelementptr inbounds float, float addrspace(3)* %arg, i32 7
+  %tmp13 = load float, float addrspace(3)* %tmp12, align 4
+  %tmp14 = tail call float @llvm.fmuladd.f32(float %tmp9, float %tmp11, float %tmp13)
+  %tmp15 = getelementptr inbounds float, float addrspace(3)* %arg, i32 9
+  %tmp16 = load float, float addrspace(3)* %tmp15, align 4
+  %tmp17 = getelementptr inbounds float, float addrspace(3)* %arg, i32 10
+  %tmp18 = load float, float addrspace(3)* %tmp17, align 4
+  %tmp19 = getelementptr inbounds float, float addrspace(3)* %arg, i32 11
+  %tmp20 = load float, float addrspace(3)* %tmp19, align 4
+  %tmp21 = tail call float @llvm.fmuladd.f32(float %tmp16, float %tmp18, float %tmp20)
+  %tmp22 = getelementptr inbounds float, float addrspace(3)* %arg, i32 13
+  %tmp23 = load float, float addrspace(3)* %tmp22, align 4
+  %tmp24 = getelementptr inbounds float, float addrspace(3)* %arg, i32 14
+  %tmp25 = load float, float addrspace(3)* %tmp24, align 4
+  %tmp26 = getelementptr inbounds float, float addrspace(3)* %arg, i32 15
+  %tmp27 = load float, float addrspace(3)* %tmp26, align 4
+  %tmp28 = tail call float @llvm.fmuladd.f32(float %tmp23, float %tmp25, float %tmp27)
+  %tmp29 = getelementptr inbounds float, float addrspace(3)* %arg, i32 17
+  %tmp30 = load float, float addrspace(3)* %tmp29, align 4
+  %tmp31 = getelementptr inbounds float, float addrspace(3)* %arg, i32 18
+  %tmp32 = load float, float addrspace(3)* %tmp31, align 4
+  %tmp33 = getelementptr inbounds float, float addrspace(3)* %arg, i32 19
+  %tmp34 = load float, float addrspace(3)* %tmp33, align 4
+  %tmp35 = tail call float @llvm.fmuladd.f32(float %tmp30, float %tmp32, float %tmp34)
+  %tmp36 = getelementptr inbounds float, float addrspace(3)* %arg, i32 21
+  %tmp37 = load float, float addrspace(3)* %tmp36, align 4
+  %tmp38 = getelementptr inbounds float, float addrspace(3)* %arg, i32 22
+  %tmp39 = load float, float addrspace(3)* %tmp38, align 4
+  %tmp40 = getelementptr inbounds float, float addrspace(3)* %arg, i32 23
+  %tmp41 = load float, float addrspace(3)* %tmp40, align 4
+  %tmp42 = tail call float @llvm.fmuladd.f32(float %tmp37, float %tmp39, float %tmp41)
+  %tmp43 = getelementptr inbounds float, float addrspace(3)* %arg, i32 25
+  %tmp44 = load float, float addrspace(3)* %tmp43, align 4
+  %tmp45 = getelementptr inbounds float, float addrspace(3)* %arg, i32 26
+  %tmp46 = load float, float addrspace(3)* %tmp45, align 4
+  %tmp47 = getelementptr inbounds float, float addrspace(3)* %arg, i32 27
+  %tmp48 = load float, float addrspace(3)* %tmp47, align 4
+  %tmp49 = tail call float @llvm.fmuladd.f32(float %tmp44, float %tmp46, float %tmp48)
+  %tmp50 = getelementptr inbounds float, float addrspace(3)* %arg, i32 29
+  %tmp51 = load float, float addrspace(3)* %tmp50, align 4
+  %tmp52 = getelementptr inbounds float, float addrspace(3)* %arg, i32 30
+  %tmp53 = load float, float addrspace(3)* %tmp52, align 4
+  %tmp54 = getelementptr inbounds float, float addrspace(3)* %arg, i32 31
+  %tmp55 = load float, float addrspace(3)* %tmp54, align 4
+  %tmp56 = tail call float @llvm.fmuladd.f32(float %tmp51, float %tmp53, float %tmp55)
+  %tmp57 = getelementptr inbounds float, float addrspace(3)* %arg, i32 33
+  %tmp58 = load float, float addrspace(3)* %tmp57, align 4
+  %tmp59 = getelementptr inbounds float, float addrspace(3)* %arg, i32 34
+  %tmp60 = load float, float addrspace(3)* %tmp59, align 4
+  %tmp61 = getelementptr inbounds float, float addrspace(3)* %arg, i32 35
+  %tmp62 = load float, float addrspace(3)* %tmp61, align 4
+  %tmp63 = tail call float @llvm.fmuladd.f32(float %tmp58, float %tmp60, float %tmp62)
+  %tmp64 = getelementptr inbounds float, float addrspace(3)* %arg, i32 37
+  %tmp65 = load float, float addrspace(3)* %tmp64, align 4
+  %tmp66 = getelementptr inbounds float, float addrspace(3)* %arg, i32 38
+  %tmp67 = load float, float addrspace(3)* %tmp66, align 4
+  %tmp68 = getelementptr inbounds float, float addrspace(3)* %arg, i32 39
+  %tmp69 = load float, float addrspace(3)* %tmp68, align 4
+  %tmp70 = tail call float @llvm.fmuladd.f32(float %tmp65, float %tmp67, float %tmp69)
+  %tmp71 = getelementptr inbounds float, float addrspace(3)* %arg, i32 41
+  %tmp72 = load float, float addrspace(3)* %tmp71, align 4
+  %tmp73 = getelementptr inbounds float, float addrspace(3)* %arg, i32 42
+  %tmp74 = load float, float addrspace(3)* %tmp73, align 4
+  %tmp75 = getelementptr inbounds float, float addrspace(3)* %arg, i32 43
+  %tmp76 = load float, float addrspace(3)* %tmp75, align 4
+  %tmp77 = tail call float @llvm.fmuladd.f32(float %tmp72, float %tmp74, float %tmp76)
+  %tmp78 = getelementptr inbounds float, float addrspace(3)* %arg, i32 45
+  %tmp79 = load float, float addrspace(3)* %tmp78, align 4
+  %tmp80 = getelementptr inbounds float, float addrspace(3)* %arg, i32 46
+  %tmp81 = load float, float addrspace(3)* %tmp80, align 4
+  %tmp82 = getelementptr inbounds float, float addrspace(3)* %arg, i32 47
+  %tmp83 = load float, float addrspace(3)* %tmp82, align 4
+  %tmp84 = tail call float @llvm.fmuladd.f32(float %tmp79, float %tmp81, float %tmp83)
+  %tmp85 = getelementptr inbounds float, float addrspace(3)* %arg, i32 49
+  %tmp86 = load float, float addrspace(3)* %tmp85, align 4
+  %tmp87 = getelementptr inbounds float, float addrspace(3)* %arg, i32 50
+  %tmp88 = load float, float addrspace(3)* %tmp87, align 4
+  %tmp89 = getelementptr inbounds float, float addrspace(3)* %arg, i32 51
+  %tmp90 = load float, float addrspace(3)* %tmp89, align 4
+  %tmp91 = tail call float @llvm.fmuladd.f32(float %tmp86, float %tmp88, float %tmp90)
+  %tmp92 = getelementptr inbounds float, float addrspace(3)* %arg, i32 53
+  %tmp93 = load float, float addrspace(3)* %tmp92, align 4
+  %tmp94 = getelementptr inbounds float, float addrspace(3)* %arg, i32 54
+  %tmp95 = load float, float addrspace(3)* %tmp94, align 4
+  %tmp96 = getelementptr inbounds float, float addrspace(3)* %arg, i32 55
+  %tmp97 = load float, float addrspace(3)* %tmp96, align 4
+  %tmp98 = tail call float @llvm.fmuladd.f32(float %tmp93, float %tmp95, float %tmp97)
+  %tmp99 = getelementptr inbounds float, float addrspace(3)* %arg, i32 57
+  %tmp100 = load float, float addrspace(3)* %tmp99, align 4
+  %tmp101 = getelementptr inbounds float, float addrspace(3)* %arg, i32 58
+  %tmp102 = load float, float addrspace(3)* %tmp101, align 4
+  %tmp103 = getelementptr inbounds float, float addrspace(3)* %arg, i32 59
+  %tmp104 = load float, float addrspace(3)* %tmp103, align 4
+  %tmp105 = tail call float @llvm.fmuladd.f32(float %tmp100, float %tmp102, float %tmp104)
+  %tmp106 = getelementptr inbounds float, float addrspace(3)* %arg, i32 61
+  %tmp107 = load float, float addrspace(3)* %tmp106, align 4
+  %tmp108 = getelementptr inbounds float, float addrspace(3)* %arg, i32 62
+  %tmp109 = load float, float addrspace(3)* %tmp108, align 4
+  %tmp110 = getelementptr inbounds float, float addrspace(3)* %arg, i32 63
+  %tmp111 = load float, float addrspace(3)* %tmp110, align 4
+  %tmp112 = tail call float @llvm.fmuladd.f32(float %tmp107, float %tmp109, float %tmp111)
+  %tmp113 = getelementptr inbounds float, float addrspace(3)* %arg, i32 65
+  %tmp114 = load float, float addrspace(3)* %tmp113, align 4
+  %tmp115 = getelementptr inbounds float, float addrspace(3)* %arg, i32 66
+  %tmp116 = load float, float addrspace(3)* %tmp115, align 4
+  %tmp117 = getelementptr inbounds float, float addrspace(3)* %arg, i32 67
+  %tmp118 = load float, float addrspace(3)* %tmp117, align 4
+  %tmp119 = tail call float @llvm.fmuladd.f32(float %tmp114, float %tmp116, float %tmp118)
+  %tmp120 = getelementptr inbounds float, float addrspace(3)* %arg, i32 69
+  %tmp121 = load float, float addrspace(3)* %tmp120, align 4
+  %tmp122 = getelementptr inbounds float, float addrspace(3)* %arg, i32 70
+  %tmp123 = load float, float addrspace(3)* %tmp122, align 4
+  %tmp124 = getelementptr inbounds float, float addrspace(3)* %arg, i32 71
+  %tmp125 = load float, float addrspace(3)* %tmp124, align 4
+  %tmp126 = tail call float @llvm.fmuladd.f32(float %tmp121, float %tmp123, float %tmp125)
+  %tmp127 = getelementptr inbounds float, float addrspace(3)* %arg, i32 73
+  %tmp128 = load float, float addrspace(3)* %tmp127, align 4
+  %tmp129 = getelementptr inbounds float, float addrspace(3)* %arg, i32 74
+  %tmp130 = load float, float addrspace(3)* %tmp129, align 4
+  %tmp131 = getelementptr inbounds float, float addrspace(3)* %arg, i32 75
+  %tmp132 = load float, float addrspace(3)* %tmp131, align 4
+  %tmp133 = tail call float @llvm.fmuladd.f32(float %tmp128, float %tmp130, float %tmp132)
+  %tmp134 = getelementptr inbounds float, float addrspace(3)* %arg, i32 77
+  %tmp135 = load float, float addrspace(3)* %tmp134, align 4
+  %tmp136 = getelementptr inbounds float, float addrspace(3)* %arg, i32 78
+  %tmp137 = load float, float addrspace(3)* %tmp136, align 4
+  %tmp138 = getelementptr inbounds float, float addrspace(3)* %arg, i32 79
+  %tmp139 = load float, float addrspace(3)* %tmp138, align 4
+  %tmp140 = tail call float @llvm.fmuladd.f32(float %tmp135, float %tmp137, float %tmp139)
+  %tmp141 = getelementptr inbounds float, float addrspace(3)* %arg, i32 81
+  %tmp142 = load float, float addrspace(3)* %tmp141, align 4
+  %tmp143 = getelementptr inbounds float, float addrspace(3)* %arg, i32 82
+  %tmp144 = load float, float addrspace(3)* %tmp143, align 4
+  %tmp145 = getelementptr inbounds float, float addrspace(3)* %arg, i32 83
+  %tmp146 = load float, float addrspace(3)* %tmp145, align 4
+  %tmp147 = tail call float @llvm.fmuladd.f32(float %tmp142, float %tmp144, float %tmp146)
+  %tmp148 = getelementptr inbounds float, float addrspace(3)* %arg, i32 85
+  %tmp149 = load float, float addrspace(3)* %tmp148, align 4
+  %tmp150 = getelementptr inbounds float, float addrspace(3)* %arg, i32 86
+  %tmp151 = load float, float addrspace(3)* %tmp150, align 4
+  %tmp152 = getelementptr inbounds float, float addrspace(3)* %arg, i32 87
+  %tmp153 = load float, float addrspace(3)* %tmp152, align 4
+  %tmp154 = tail call float @llvm.fmuladd.f32(float %tmp149, float %tmp151, float %tmp153)
+  %tmp155 = getelementptr inbounds float, float addrspace(3)* %arg, i32 89
+  %tmp156 = load float, float addrspace(3)* %tmp155, align 4
+  %tmp157 = getelementptr inbounds float, float addrspace(3)* %arg, i32 90
+  %tmp158 = load float, float addrspace(3)* %tmp157, align 4
+  %tmp159 = getelementptr inbounds float, float addrspace(3)* %arg, i32 91
+  %tmp160 = load float, float addrspace(3)* %tmp159, align 4
+  %tmp161 = tail call float @llvm.fmuladd.f32(float %tmp156, float %tmp158, float %tmp160)
+  %tmp162 = getelementptr inbounds float, float addrspace(3)* %arg, i32 93
+  %tmp163 = load float, float addrspace(3)* %tmp162, align 4
+  %tmp164 = getelementptr inbounds float, float addrspace(3)* %arg, i32 94
+  %tmp165 = load float, float addrspace(3)* %tmp164, align 4
+  %tmp166 = getelementptr inbounds float, float addrspace(3)* %arg, i32 95
+  %tmp167 = load float, float addrspace(3)* %tmp166, align 4
+  %tmp168 = tail call float @llvm.fmuladd.f32(float %tmp163, float %tmp165, float %tmp167)
+  %tmp169 = getelementptr inbounds float, float addrspace(3)* %arg, i32 97
+  %tmp170 = load float, float addrspace(3)* %tmp169, align 4
+  %tmp171 = getelementptr inbounds float, float addrspace(3)* %arg, i32 98
+  %tmp172 = load float, float addrspace(3)* %tmp171, align 4
+  %tmp173 = getelementptr inbounds float, float addrspace(3)* %arg, i32 99
+  %tmp174 = load float, float addrspace(3)* %tmp173, align 4
+  %tmp175 = tail call float @llvm.fmuladd.f32(float %tmp170, float %tmp172, float %tmp174)
+  %tmp176 = getelementptr inbounds float, float addrspace(3)* %arg, i32 101
+  %tmp177 = load float, float addrspace(3)* %tmp176, align 4
+  %tmp178 = getelementptr inbounds float, float addrspace(3)* %arg, i32 102
+  %tmp179 = load float, float addrspace(3)* %tmp178, align 4
+  %tmp180 = getelementptr inbounds float, float addrspace(3)* %arg, i32 103
+  %tmp181 = load float, float addrspace(3)* %tmp180, align 4
+  %tmp182 = tail call float @llvm.fmuladd.f32(float %tmp177, float %tmp179, float %tmp181)
+  %tmp183 = getelementptr inbounds float, float addrspace(3)* %arg, i32 105
+  %tmp184 = load float, float addrspace(3)* %tmp183, align 4
+  %tmp185 = getelementptr inbounds float, float addrspace(3)* %arg, i32 106
+  %tmp186 = load float, float addrspace(3)* %tmp185, align 4
+  %tmp187 = getelementptr inbounds float, float addrspace(3)* %arg, i32 107
+  %tmp188 = load float, float addrspace(3)* %tmp187, align 4
+  %tmp189 = tail call float @llvm.fmuladd.f32(float %tmp184, float %tmp186, float %tmp188)
+  %tmp190 = getelementptr inbounds float, float addrspace(3)* %arg, i32 109
+  %tmp191 = load float, float addrspace(3)* %tmp190, align 4
+  %tmp192 = getelementptr inbounds float, float addrspace(3)* %arg, i32 110
+  %tmp193 = load float, float addrspace(3)* %tmp192, align 4
+  %tmp194 = getelementptr inbounds float, float addrspace(3)* %arg, i32 111
+  %tmp195 = load float, float addrspace(3)* %tmp194, align 4
+  %tmp196 = tail call float @llvm.fmuladd.f32(float %tmp191, float %tmp193, float %tmp195)
+  %tmp197 = getelementptr inbounds float, float addrspace(3)* %arg, i32 113
+  %tmp198 = load float, float addrspace(3)* %tmp197, align 4
+  %tmp199 = getelementptr inbounds float, float addrspace(3)* %arg, i32 114
+  %tmp200 = load float, float addrspace(3)* %tmp199, align 4
+  %tmp201 = getelementptr inbounds float, float addrspace(3)* %arg, i32 115
+  %tmp202 = load float, float addrspace(3)* %tmp201, align 4
+  %tmp203 = tail call float @llvm.fmuladd.f32(float %tmp198, float %tmp200, float %tmp202)
+  %tmp204 = getelementptr inbounds float, float addrspace(3)* %arg, i32 117
+  %tmp205 = load float, float addrspace(3)* %tmp204, align 4
+  %tmp206 = getelementptr inbounds float, float addrspace(3)* %arg, i32 118
+  %tmp207 = load float, float addrspace(3)* %tmp206, align 4
+  %tmp208 = getelementptr inbounds float, float addrspace(3)* %arg, i32 119
+  %tmp209 = load float, float addrspace(3)* %tmp208, align 4
+  %tmp210 = tail call float @llvm.fmuladd.f32(float %tmp205, float %tmp207, float %tmp209)
+  %tmp211 = getelementptr inbounds float, float addrspace(3)* %arg, i32 121
+  %tmp212 = load float, float addrspace(3)* %tmp211, align 4
+  %tmp213 = getelementptr inbounds float, float addrspace(3)* %arg, i32 122
+  %tmp214 = load float, float addrspace(3)* %tmp213, align 4
+  %tmp215 = getelementptr inbounds float, float addrspace(3)* %arg, i32 123
+  %tmp216 = load float, float addrspace(3)* %tmp215, align 4
+  %tmp217 = tail call float @llvm.fmuladd.f32(float %tmp212, float %tmp214, float %tmp216)
+  %tmp218 = getelementptr inbounds float, float addrspace(3)* %arg, i32 125
+  %tmp219 = load float, float addrspace(3)* %tmp218, align 4
+  %tmp220 = getelementptr inbounds float, float addrspace(3)* %arg, i32 126
+  %tmp221 = load float, float addrspace(3)* %tmp220, align 4
+  %tmp222 = getelementptr inbounds float, float addrspace(3)* %arg, i32 127
+  %tmp223 = load float, float addrspace(3)* %tmp222, align 4
+  %tmp224 = tail call float @llvm.fmuladd.f32(float %tmp219, float %tmp221, float %tmp223)
+  %tmp225 = getelementptr inbounds float, float addrspace(3)* %arg, i32 129
+  %tmp226 = load float, float addrspace(3)* %tmp225, align 4
+  %tmp227 = getelementptr inbounds float, float addrspace(3)* %arg, i32 130
+  %tmp228 = load float, float addrspace(3)* %tmp227, align 4
+  %tmp229 = getelementptr inbounds float, float addrspace(3)* %arg, i32 131
+  %tmp230 = load float, float addrspace(3)* %tmp229, align 4
+  %tmp231 = tail call float @llvm.fmuladd.f32(float %tmp226, float %tmp228, float %tmp230)
+  %tmp232 = getelementptr inbounds float, float addrspace(3)* %arg, i32 133
+  %tmp233 = load float, float addrspace(3)* %tmp232, align 4
+  %tmp234 = getelementptr inbounds float, float addrspace(3)* %arg, i32 134
+  %tmp235 = load float, float addrspace(3)* %tmp234, align 4
+  %tmp236 = getelementptr inbounds float, float addrspace(3)* %arg, i32 135
+  %tmp237 = load float, float addrspace(3)* %tmp236, align 4
+  %tmp238 = tail call float @llvm.fmuladd.f32(float %tmp233, float %tmp235, float %tmp237)
+  %tmp239 = getelementptr inbounds float, float addrspace(3)* %arg, i32 137
+  %tmp240 = load float, float addrspace(3)* %tmp239, align 4
+  %tmp241 = getelementptr inbounds float, float addrspace(3)* %arg, i32 138
+  %tmp242 = load float, float addrspace(3)* %tmp241, align 4
+  %tmp243 = getelementptr inbounds float, float addrspace(3)* %arg, i32 139
+  %tmp244 = load float, float addrspace(3)* %tmp243, align 4
+  %tmp245 = tail call float @llvm.fmuladd.f32(float %tmp240, float %tmp242, float %tmp244)
+  %tmp246 = getelementptr inbounds float, float addrspace(3)* %arg, i32 141
+  %tmp247 = load float, float addrspace(3)* %tmp246, align 4
+  %tmp248 = getelementptr inbounds float, float addrspace(3)* %arg, i32 142
+  %tmp249 = load float, float addrspace(3)* %tmp248, align 4
+  %tmp250 = getelementptr inbounds float, float addrspace(3)* %arg, i32 143
+  %tmp251 = load float, float addrspace(3)* %tmp250, align 4
+  %tmp252 = tail call float @llvm.fmuladd.f32(float %tmp247, float %tmp249, float %tmp251)
+  %tmp253 = getelementptr inbounds float, float addrspace(3)* %arg, i32 145
+  %tmp254 = load float, float addrspace(3)* %tmp253, align 4
+  %tmp255 = getelementptr inbounds float, float addrspace(3)* %arg, i32 146
+  %tmp256 = load float, float addrspace(3)* %tmp255, align 4
+  %tmp257 = getelementptr inbounds float, float addrspace(3)* %arg, i32 147
+  %tmp258 = load float, float addrspace(3)* %tmp257, align 4
+  %tmp259 = tail call float @llvm.fmuladd.f32(float %tmp254, float %tmp256, float %tmp258)
+  %tmp260 = getelementptr inbounds float, float addrspace(3)* %arg, i32 149
+  %tmp261 = load float, float addrspace(3)* %tmp260, align 4
+  %tmp262 = getelementptr inbounds float, float addrspace(3)* %arg, i32 150
+  %tmp263 = load float, float addrspace(3)* %tmp262, align 4
+  %tmp264 = getelementptr inbounds float, float addrspace(3)* %arg, i32 151
+  %tmp265 = load float, float addrspace(3)* %tmp264, align 4
+  %tmp266 = tail call float @llvm.fmuladd.f32(float %tmp261, float %tmp263, float %tmp265)
+  %tmp267 = getelementptr inbounds float, float addrspace(3)* %arg, i32 153
+  %tmp268 = load float, float addrspace(3)* %tmp267, align 4
+  %tmp269 = getelementptr inbounds float, float addrspace(3)* %arg, i32 154
+  %tmp270 = load float, float addrspace(3)* %tmp269, align 4
+  %tmp271 = getelementptr inbounds float, float addrspace(3)* %arg, i32 155
+  %tmp272 = load float, float addrspace(3)* %tmp271, align 4
+  %tmp273 = tail call float @llvm.fmuladd.f32(float %tmp268, float %tmp270, float %tmp272)
+  %tmp274 = getelementptr inbounds float, float addrspace(3)* %arg, i32 157
+  %tmp275 = load float, float addrspace(3)* %tmp274, align 4
+  %tmp276 = getelementptr inbounds float, float addrspace(3)* %arg, i32 158
+  %tmp277 = load float, float addrspace(3)* %tmp276, align 4
+  %tmp278 = getelementptr inbounds float, float addrspace(3)* %arg, i32 159
+  %tmp279 = load float, float addrspace(3)* %tmp278, align 4
+  %tmp280 = tail call float @llvm.fmuladd.f32(float %tmp275, float %tmp277, float %tmp279)
+  %tmp281 = getelementptr inbounds float, float addrspace(3)* %arg, i32 161
+  %tmp282 = load float, float addrspace(3)* %tmp281, align 4
+  %tmp283 = getelementptr inbounds float, float addrspace(3)* %arg, i32 162
+  %tmp284 = load float, float addrspace(3)* %tmp283, align 4
+  %tmp285 = getelementptr inbounds float, float addrspace(3)* %arg, i32 163
+  %tmp286 = load float, float addrspace(3)* %tmp285, align 4
+  %tmp287 = tail call float @llvm.fmuladd.f32(float %tmp282, float %tmp284, float %tmp286)
+  %tmp288 = getelementptr inbounds float, float addrspace(3)* %arg, i32 165
+  %tmp289 = load float, float addrspace(3)* %tmp288, align 4
+  %tmp290 = getelementptr inbounds float, float addrspace(3)* %arg, i32 166
+  %tmp291 = load float, float addrspace(3)* %tmp290, align 4
+  %tmp292 = getelementptr inbounds float, float addrspace(3)* %arg, i32 167
+  %tmp293 = load float, float addrspace(3)* %tmp292, align 4
+  %tmp294 = tail call float @llvm.fmuladd.f32(float %tmp289, float %tmp291, float %tmp293)
+  %tmp295 = getelementptr inbounds float, float addrspace(3)* %arg, i32 169
+  %tmp296 = load float, float addrspace(3)* %tmp295, align 4
+  %tmp297 = getelementptr inbounds float, float addrspace(3)* %arg, i32 170
+  %tmp298 = load float, float addrspace(3)* %tmp297, align 4
+  %tmp299 = getelementptr inbounds float, float addrspace(3)* %arg, i32 171
+  %tmp300 = load float, float addrspace(3)* %tmp299, align 4
+  %tmp301 = tail call float @llvm.fmuladd.f32(float %tmp296, float %tmp298, float %tmp300)
+  %tmp302 = getelementptr inbounds float, float addrspace(3)* %arg, i32 173
+  %tmp303 = load float, float addrspace(3)* %tmp302, align 4
+  %tmp304 = getelementptr inbounds float, float addrspace(3)* %arg, i32 174
+  %tmp305 = load float, float addrspace(3)* %tmp304, align 4
+  %tmp306 = getelementptr inbounds float, float addrspace(3)* %arg, i32 175
+  %tmp307 = load float, float addrspace(3)* %tmp306, align 4
+  %tmp308 = tail call float @llvm.fmuladd.f32(float %tmp303, float %tmp305, float %tmp307)
+  %tmp309 = getelementptr inbounds float, float addrspace(3)* %arg, i32 177
+  %tmp310 = load float, float addrspace(3)* %tmp309, align 4
+  %tmp311 = getelementptr inbounds float, float addrspace(3)* %arg, i32 178
+  %tmp312 = load float, float addrspace(3)* %tmp311, align 4
+  %tmp313 = getelementptr inbounds float, float addrspace(3)* %arg, i32 179
+  %tmp314 = load float, float addrspace(3)* %tmp313, align 4
+  %tmp315 = tail call float @llvm.fmuladd.f32(float %tmp310, float %tmp312, float %tmp314)
+  %tmp316 = getelementptr inbounds float, float addrspace(3)* %arg, i32 181
+  %tmp317 = load float, float addrspace(3)* %tmp316, align 4
+  %tmp318 = getelementptr inbounds float, float addrspace(3)* %arg, i32 182
+  %tmp319 = load float, float addrspace(3)* %tmp318, align 4
+  %tmp320 = getelementptr inbounds float, float addrspace(3)* %arg, i32 183
+  %tmp321 = load float, float addrspace(3)* %tmp320, align 4
+  %tmp322 = tail call float @llvm.fmuladd.f32(float %tmp317, float %tmp319, float %tmp321)
+  %tmp323 = getelementptr inbounds float, float addrspace(3)* %arg, i32 185
+  %tmp324 = load float, float addrspace(3)* %tmp323, align 4
+  %tmp325 = getelementptr inbounds float, float addrspace(3)* %arg, i32 186
+  %tmp326 = load float, float addrspace(3)* %tmp325, align 4
+  %tmp327 = getelementptr inbounds float, float addrspace(3)* %arg, i32 187
+  %tmp328 = load float, float addrspace(3)* %tmp327, align 4
+  %tmp329 = tail call float @llvm.fmuladd.f32(float %tmp324, float %tmp326, float %tmp328)
+  %tmp330 = getelementptr inbounds float, float addrspace(3)* %arg, i32 189
+  %tmp331 = load float, float addrspace(3)* %tmp330, align 4
+  %tmp332 = getelementptr inbounds float, float addrspace(3)* %arg, i32 190
+  %tmp333 = load float, float addrspace(3)* %tmp332, align 4
+  %tmp334 = getelementptr inbounds float, float addrspace(3)* %arg, i32 191
+  %tmp335 = load float, float addrspace(3)* %tmp334, align 4
+  %tmp336 = tail call float @llvm.fmuladd.f32(float %tmp331, float %tmp333, float %tmp335)
+  %tmp337 = getelementptr inbounds float, float addrspace(3)* %arg, i32 193
+  %tmp338 = load float, float addrspace(3)* %tmp337, align 4
+  %tmp339 = getelementptr inbounds float, float addrspace(3)* %arg, i32 194
+  %tmp340 = load float, float addrspace(3)* %tmp339, align 4
+  %tmp341 = getelementptr inbounds float, float addrspace(3)* %arg, i32 195
+  %tmp342 = load float, float addrspace(3)* %tmp341, align 4
+  %tmp343 = tail call float @llvm.fmuladd.f32(float %tmp338, float %tmp340, float %tmp342)
+  %tmp344 = getelementptr inbounds float, float addrspace(3)* %arg, i32 197
+  %tmp345 = load float, float addrspace(3)* %tmp344, align 4
+  %tmp346 = getelementptr inbounds float, float addrspace(3)* %arg, i32 198
+  %tmp347 = load float, float addrspace(3)* %tmp346, align 4
+  %tmp348 = getelementptr inbounds float, float addrspace(3)* %arg, i32 199
+  %tmp349 = load float, float addrspace(3)* %tmp348, align 4
+  %tmp350 = tail call float @llvm.fmuladd.f32(float %tmp345, float %tmp347, float %tmp349)
+  %tmp351 = getelementptr inbounds float, float addrspace(3)* %arg, i32 201
+  %tmp352 = load float, float addrspace(3)* %tmp351, align 4
+  %tmp353 = getelementptr inbounds float, float addrspace(3)* %arg, i32 202
+  %tmp354 = load float, float addrspace(3)* %tmp353, align 4
+  %tmp355 = getelementptr inbounds float, float addrspace(3)* %arg, i32 203
+  %tmp356 = load float, float addrspace(3)* %tmp355, align 4
+  %tmp357 = tail call float @llvm.fmuladd.f32(float %tmp352, float %tmp354, float %tmp356)
+  %tmp358 = getelementptr inbounds float, float addrspace(3)* %arg, i32 205
+  %tmp359 = load float, float addrspace(3)* %tmp358, align 4
+  %tmp360 = getelementptr inbounds float, float addrspace(3)* %arg, i32 206
+  %tmp361 = load float, float addrspace(3)* %tmp360, align 4
+  %tmp362 = getelementptr inbounds float, float addrspace(3)* %arg, i32 207
+  %tmp363 = load float, float addrspace(3)* %tmp362, align 4
+  %tmp364 = tail call float @llvm.fmuladd.f32(float %tmp359, float %tmp361, float %tmp363)
+  %tmp365 = getelementptr inbounds float, float addrspace(3)* %arg, i32 209
+  %tmp366 = load float, float addrspace(3)* %tmp365, align 4
+  %tmp367 = getelementptr inbounds float, float addrspace(3)* %arg, i32 210
+  %tmp368 = load float, float addrspace(3)* %tmp367, align 4
+  %tmp369 = getelementptr inbounds float, float addrspace(3)* %arg, i32 211
+  %tmp370 = load float, float addrspace(3)* %tmp369, align 4
+  %tmp371 = tail call float @llvm.fmuladd.f32(float %tmp366, float %tmp368, float %tmp370)
+  %tmp372 = getelementptr inbounds float, float addrspace(3)* %arg, i32 213
+  %tmp373 = load float, float addrspace(3)* %tmp372, align 4
+  %tmp374 = getelementptr inbounds float, float addrspace(3)* %arg, i32 214
+  %tmp375 = load float, float addrspace(3)* %tmp374, align 4
+  %tmp376 = getelementptr inbounds float, float addrspace(3)* %arg, i32 215
+  %tmp377 = load float, float addrspace(3)* %tmp376, align 4
+  %tmp378 = tail call float @llvm.fmuladd.f32(float %tmp373, float %tmp375, float %tmp377)
+  %tmp379 = getelementptr inbounds float, float addrspace(3)* %arg, i32 217
+  %tmp380 = load float, float addrspace(3)* %tmp379, align 4
+  %tmp381 = getelementptr inbounds float, float addrspace(3)* %arg, i32 218
+  %tmp382 = load float, float addrspace(3)* %tmp381, align 4
+  %tmp383 = getelementptr inbounds float, float addrspace(3)* %arg, i32 219
+  %tmp384 = load float, float addrspace(3)* %tmp383, align 4
+  %tmp385 = tail call float @llvm.fmuladd.f32(float %tmp380, float %tmp382, float %tmp384)
+  %tmp386 = getelementptr inbounds float, float addrspace(3)* %arg, i32 221
+  %tmp387 = load float, float addrspace(3)* %tmp386, align 4
+  %tmp388 = getelementptr inbounds float, float addrspace(3)* %arg, i32 222
+  %tmp389 = load float, float addrspace(3)* %tmp388, align 4
+  %tmp390 = getelementptr inbounds float, float addrspace(3)* %arg, i32 223
+  %tmp391 = load float, float addrspace(3)* %tmp390, align 4
+  %tmp392 = tail call float @llvm.fmuladd.f32(float %tmp387, float %tmp389, float %tmp391)
+  %tmp393 = getelementptr inbounds float, float addrspace(3)* %arg, i32 225
+  %tmp394 = load float, float addrspace(3)* %tmp393, align 4
+  %tmp395 = getelementptr inbounds float, float addrspace(3)* %arg, i32 226
+  %tmp396 = load float, float addrspace(3)* %tmp395, align 4
+  %tmp397 = getelementptr inbounds float, float addrspace(3)* %arg, i32 227
+  %tmp398 = load float, float addrspace(3)* %tmp397, align 4
+  %tmp399 = tail call float @llvm.fmuladd.f32(float %tmp394, float %tmp396, float %tmp398)
+  %tmp400 = getelementptr inbounds float, float addrspace(3)* %arg, i32 229
+  %tmp401 = load float, float addrspace(3)* %tmp400, align 4
+  %tmp402 = getelementptr inbounds float, float addrspace(3)* %arg, i32 230
+  %tmp403 = load float, float addrspace(3)* %tmp402, align 4
+  %tmp404 = getelementptr inbounds float, float addrspace(3)* %arg, i32 231
+  %tmp405 = load float, float addrspace(3)* %tmp404, align 4
+  %tmp406 = tail call float @llvm.fmuladd.f32(float %tmp401, float %tmp403, float %tmp405)
+  %tmp407 = getelementptr inbounds float, float addrspace(3)* %arg, i32 233
+  %tmp408 = load float, float addrspace(3)* %tmp407, align 4
+  %tmp409 = getelementptr inbounds float, float addrspace(3)* %arg, i32 234
+  %tmp410 = load float, float addrspace(3)* %tmp409, align 4
+  %tmp411 = getelementptr inbounds float, float addrspace(3)* %arg, i32 235
+  %tmp412 = load float, float addrspace(3)* %tmp411, align 4
+  %tmp413 = tail call float @llvm.fmuladd.f32(float %tmp408, float %tmp410, float %tmp412)
+  %tmp414 = getelementptr inbounds float, float addrspace(3)* %arg, i32 237
+  %tmp415 = load float, float addrspace(3)* %tmp414, align 4
+  %tmp416 = getelementptr inbounds float, float addrspace(3)* %arg, i32 238
+  %tmp417 = load float, float addrspace(3)* %tmp416, align 4
+  %tmp418 = getelementptr inbounds float, float addrspace(3)* %arg, i32 239
+  %tmp419 = load float, float addrspace(3)* %tmp418, align 4
+  %tmp420 = tail call float @llvm.fmuladd.f32(float %tmp415, float %tmp417, float %tmp419)
+  %tmp421 = getelementptr inbounds float, float addrspace(3)* %arg, i32 241
+  %tmp422 = load float, float addrspace(3)* %tmp421, align 4
+  %tmp423 = getelementptr inbounds float, float addrspace(3)* %arg, i32 242
+  %tmp424 = load float, float addrspace(3)* %tmp423, align 4
+  %tmp425 = getelementptr inbounds float, float addrspace(3)* %arg, i32 243
+  %tmp426 = load float, float addrspace(3)* %tmp425, align 4
+  %tmp427 = tail call float @llvm.fmuladd.f32(float %tmp422, float %tmp424, float %tmp426)
+  %tmp428 = getelementptr inbounds float, float addrspace(3)* %arg, i32 245
+  %tmp429 = load float, float addrspace(3)* %tmp428, align 4
+  %tmp430 = getelementptr inbounds float, float addrspace(3)* %arg, i32 246
+  %tmp431 = load float, float addrspace(3)* %tmp430, align 4
+  %tmp432 = getelementptr inbounds float, float addrspace(3)* %arg, i32 247
+  %tmp433 = load float, float addrspace(3)* %tmp432, align 4
+  %tmp434 = tail call float @llvm.fmuladd.f32(float %tmp429, float %tmp431, float %tmp433)
+  %tmp435 = getelementptr inbounds float, float addrspace(3)* %arg, i32 249
+  %tmp436 = load float, float addrspace(3)* %tmp435, align 4
+  %tmp437 = getelementptr inbounds float, float addrspace(3)* %arg, i32 250
+  %tmp438 = load float, float addrspace(3)* %tmp437, align 4
+  %tmp439 = getelementptr inbounds float, float addrspace(3)* %arg, i32 251
+  %tmp440 = load float, float addrspace(3)* %tmp439, align 4
+  %tmp441 = tail call float @llvm.fmuladd.f32(float %tmp436, float %tmp438, float %tmp440)
+  %tmp442 = getelementptr inbounds float, float addrspace(3)* %arg, i32 253
+  %tmp443 = load float, float addrspace(3)* %tmp442, align 4
+  %tmp444 = getelementptr inbounds float, float addrspace(3)* %arg, i32 254
+  %tmp445 = load float, float addrspace(3)* %tmp444, align 4
+  %tmp446 = getelementptr inbounds float, float addrspace(3)* %arg, i32 255
+  %tmp447 = load float, float addrspace(3)* %tmp446, align 4
+  %tmp448 = tail call float @llvm.fmuladd.f32(float %tmp443, float %tmp445, float %tmp447)
+  store float %tmp7, float addrspace(1)* %arg1, align 4
+  %tmp449 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 1
+  store float %tmp14, float addrspace(1)* %tmp449, align 4
+  %tmp450 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 2
+  store float %tmp21, float addrspace(1)* %tmp450, align 4
+  %tmp451 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 3
+  store float %tmp28, float addrspace(1)* %tmp451, align 4
+  %tmp452 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 4
+  store float %tmp35, float addrspace(1)* %tmp452, align 4
+  %tmp453 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 5
+  store float %tmp42, float addrspace(1)* %tmp453, align 4
+  %tmp454 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 6
+  store float %tmp49, float addrspace(1)* %tmp454, align 4
+  %tmp455 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 7
+  store float %tmp56, float addrspace(1)* %tmp455, align 4
+  %tmp456 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 8
+  store float %tmp63, float addrspace(1)* %tmp456, align 4
+  %tmp457 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 9
+  store float %tmp70, float addrspace(1)* %tmp457, align 4
+  %tmp458 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 10
+  store float %tmp77, float addrspace(1)* %tmp458, align 4
+  %tmp459 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 11
+  store float %tmp84, float addrspace(1)* %tmp459, align 4
+  %tmp460 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 12
+  store float %tmp91, float addrspace(1)* %tmp460, align 4
+  %tmp461 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 13
+  store float %tmp98, float addrspace(1)* %tmp461, align 4
+  %tmp462 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 14
+  store float %tmp105, float addrspace(1)* %tmp462, align 4
+  %tmp463 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 15
+  store float %tmp112, float addrspace(1)* %tmp463, align 4
+  %tmp464 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 16
+  store float %tmp119, float addrspace(1)* %tmp464, align 4
+  %tmp465 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 17
+  store float %tmp126, float addrspace(1)* %tmp465, align 4
+  %tmp466 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 18
+  store float %tmp133, float addrspace(1)* %tmp466, align 4
+  %tmp467 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 19
+  store float %tmp140, float addrspace(1)* %tmp467, align 4
+  %tmp468 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 20
+  store float %tmp147, float addrspace(1)* %tmp468, align 4
+  %tmp469 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 21
+  store float %tmp154, float addrspace(1)* %tmp469, align 4
+  %tmp470 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 22
+  store float %tmp161, float addrspace(1)* %tmp470, align 4
+  %tmp471 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 23
+  store float %tmp168, float addrspace(1)* %tmp471, align 4
+  %tmp472 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 24
+  store float %tmp175, float addrspace(1)* %tmp472, align 4
+  %tmp473 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 25
+  store float %tmp182, float addrspace(1)* %tmp473, align 4
+  %tmp474 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 26
+  store float %tmp189, float addrspace(1)* %tmp474, align 4
+  %tmp475 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 27
+  store float %tmp196, float addrspace(1)* %tmp475, align 4
+  %tmp476 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 28
+  store float %tmp203, float addrspace(1)* %tmp476, align 4
+  %tmp477 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 29
+  store float %tmp210, float addrspace(1)* %tmp477, align 4
+  %tmp478 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 30
+  store float %tmp217, float addrspace(1)* %tmp478, align 4
+  %tmp479 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 31
+  store float %tmp224, float addrspace(1)* %tmp479, align 4
+  %tmp480 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 32
+  store float %tmp231, float addrspace(1)* %tmp480, align 4
+  %tmp481 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 33
+  store float %tmp238, float addrspace(1)* %tmp481, align 4
+  %tmp482 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 34
+  store float %tmp245, float addrspace(1)* %tmp482, align 4
+  %tmp483 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 35
+  store float %tmp252, float addrspace(1)* %tmp483, align 4
+  %tmp484 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 36
+  store float %tmp259, float addrspace(1)* %tmp484, align 4
+  %tmp485 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 37
+  store float %tmp266, float addrspace(1)* %tmp485, align 4
+  %tmp486 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 38
+  store float %tmp273, float addrspace(1)* %tmp486, align 4
+  %tmp487 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 39
+  store float %tmp280, float addrspace(1)* %tmp487, align 4
+  %tmp488 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 40
+  store float %tmp287, float addrspace(1)* %tmp488, align 4
+  %tmp489 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 41
+  store float %tmp294, float addrspace(1)* %tmp489, align 4
+  %tmp490 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 42
+  store float %tmp301, float addrspace(1)* %tmp490, align 4
+  %tmp491 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 43
+  store float %tmp308, float addrspace(1)* %tmp491, align 4
+  %tmp492 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 44
+  store float %tmp315, float addrspace(1)* %tmp492, align 4
+  %tmp493 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 45
+  store float %tmp322, float addrspace(1)* %tmp493, align 4
+  %tmp494 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 46
+  store float %tmp329, float addrspace(1)* %tmp494, align 4
+  %tmp495 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 47
+  store float %tmp336, float addrspace(1)* %tmp495, align 4
+  %tmp496 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 48
+  store float %tmp343, float addrspace(1)* %tmp496, align 4
+  %tmp497 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 49
+  store float %tmp350, float addrspace(1)* %tmp497, align 4
+  %tmp498 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 50
+  store float %tmp357, float addrspace(1)* %tmp498, align 4
+  %tmp499 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 51
+  store float %tmp364, float addrspace(1)* %tmp499, align 4
+  %tmp500 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 52
+  store float %tmp371, float addrspace(1)* %tmp500, align 4
+  %tmp501 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 53
+  store float %tmp378, float addrspace(1)* %tmp501, align 4
+  %tmp502 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 54
+  store float %tmp385, float addrspace(1)* %tmp502, align 4
+  %tmp503 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 55
+  store float %tmp392, float addrspace(1)* %tmp503, align 4
+  %tmp504 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 56
+  store float %tmp399, float addrspace(1)* %tmp504, align 4
+  %tmp505 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 57
+  store float %tmp406, float addrspace(1)* %tmp505, align 4
+  %tmp506 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 58
+  store float %tmp413, float addrspace(1)* %tmp506, align 4
+  %tmp507 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 59
+  store float %tmp420, float addrspace(1)* %tmp507, align 4
+  %tmp508 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 60
+  store float %tmp427, float addrspace(1)* %tmp508, align 4
+  %tmp509 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 61
+  store float %tmp434, float addrspace(1)* %tmp509, align 4
+  %tmp510 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 62
+  store float %tmp441, float addrspace(1)* %tmp510, align 4
+  %tmp511 = getelementptr inbounds float, float addrspace(1)* %arg1, i64 63
+  store float %tmp448, float addrspace(1)* %tmp511, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readnone -- fused multiply-add intrinsic called by the function above
+declare float @llvm.fmuladd.f32(float, float, float) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll b/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0d19c1e6a8f34ead75d44ccc7e4fd1bdca4acf4b
--- /dev/null
+++ b/test/CodeGen/AMDGPU/schedule-regpressure-limit2.ll
@@ -0,0 +1,288 @@
+; RUN: llc -march=amdgcn -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-minreg -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -misched=gcn-max-occupancy-experimental -verify-machineinstrs < %s | FileCheck -check-prefix=VI %s
+
+; SI: NumSgprs: {{[1-9]$}}
+; SI: NumVgprs: {{[1-9]$}}
+
+; Stores may alias loads, hence the looser register limits checked below.
+; VI: NumSgprs: {{[1-5][0-9]$}}
+; VI: NumVgprs: {{[1-3][0-9]$}}
+
+define amdgpu_kernel void @load_fma_store(float addrspace(3)* nocapture readonly %in_arg, float addrspace(1)* nocapture %out_arg) {
+bb:
+  %adr.a.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20004
+  %adr.b.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20252
+  %adr.c.0 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20508
+  %adr.a.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 20772
+  %adr.b.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21020
+  %adr.c.1 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21276
+  %adr.a.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21540
+  %adr.b.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 21788
+  %adr.c.2 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22044
+  %adr.a.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22308
+  %adr.b.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22556
+  %adr.c.3 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 22812
+  %adr.a.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23076
+  %adr.b.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23324
+  %adr.c.4 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23580
+  %adr.a.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 23844
+  %adr.b.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24092
+  %adr.c.5 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24348
+  %adr.a.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24612
+  %adr.b.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 24860
+  %adr.c.6 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25116
+  %adr.a.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25380
+  %adr.b.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25628
+  %adr.c.7 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 25884
+  %adr.a.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26148
+  %adr.b.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26396
+  %adr.c.8 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26652
+  %adr.a.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 26916
+  %adr.b.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27164
+  %adr.c.9 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27420
+  %adr.a.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27684
+  %adr.b.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 27932
+  %adr.c.10 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28188
+  %adr.a.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28452
+  %adr.b.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28700
+  %adr.c.11 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 28956
+  %adr.a.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29220
+  %adr.b.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29468
+  %adr.c.12 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29724
+  %adr.a.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 29988
+  %adr.b.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30236
+  %adr.c.13 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30492
+  %adr.a.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 30756
+  %adr.b.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31004
+  %adr.c.14 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31260
+  %adr.a.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31524
+  %adr.b.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 31772
+  %adr.c.15 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32028
+  %adr.a.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32292
+  %adr.b.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32540
+  %adr.c.16 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 32796
+  %adr.a.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33060
+  %adr.b.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33308
+  %adr.c.17 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33564
+  %adr.a.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 33828
+  %adr.b.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34076
+  %adr.c.18 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34332
+  %adr.a.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34596
+  %adr.b.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 34844
+  %adr.c.19 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35100
+  %adr.a.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35364
+  %adr.b.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35612
+  %adr.c.20 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 35868
+  %adr.a.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36132
+  %adr.b.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36380
+  %adr.c.21 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36636
+  %adr.a.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 36900
+  %adr.b.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37148
+  %adr.c.22 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37404
+  %adr.a.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37668
+  %adr.b.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 37916
+  %adr.c.23 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38172
+  %adr.a.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38436
+  %adr.b.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38684
+  %adr.c.24 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 38940
+  %adr.a.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39204
+  %adr.b.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39452
+  %adr.c.25 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39708
+  %adr.a.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 39972
+  %adr.b.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40220
+  %adr.c.26 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40476
+  %adr.a.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40740
+  %adr.b.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 40988
+  %adr.c.27 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41244
+  %adr.a.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41508
+  %adr.b.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 41756
+  %adr.c.28 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42012
+  %adr.a.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42276
+  %adr.b.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42524
+  %adr.c.29 = getelementptr inbounds float, float addrspace(3)* %in_arg, i32 42780
+  %a.0 = load float, float addrspace(3)* %adr.a.0, align 4
+  %b.0 = load float, float addrspace(3)* %adr.b.0, align 4
+  %c.0 = load float, float addrspace(3)* %adr.c.0, align 4
+  %a.1 = load float, float addrspace(3)* %adr.a.1, align 4
+  %b.1 = load float, float addrspace(3)* %adr.b.1, align 4
+  %c.1 = load float, float addrspace(3)* %adr.c.1, align 4
+  %a.2 = load float, float addrspace(3)* %adr.a.2, align 4
+  %b.2 = load float, float addrspace(3)* %adr.b.2, align 4
+  %c.2 = load float, float addrspace(3)* %adr.c.2, align 4
+  %a.3 = load float, float addrspace(3)* %adr.a.3, align 4
+  %b.3 = load float, float addrspace(3)* %adr.b.3, align 4
+  %c.3 = load float, float addrspace(3)* %adr.c.3, align 4
+  %a.4 = load float, float addrspace(3)* %adr.a.4, align 4
+  %b.4 = load float, float addrspace(3)* %adr.b.4, align 4
+  %c.4 = load float, float addrspace(3)* %adr.c.4, align 4
+  %a.5 = load float, float addrspace(3)* %adr.a.5, align 4
+  %b.5 = load float, float addrspace(3)* %adr.b.5, align 4
+  %c.5 = load float, float addrspace(3)* %adr.c.5, align 4
+  %a.6 = load float, float addrspace(3)* %adr.a.6, align 4
+  %b.6 = load float, float addrspace(3)* %adr.b.6, align 4
+  %c.6 = load float, float addrspace(3)* %adr.c.6, align 4
+  %a.7 = load float, float addrspace(3)* %adr.a.7, align 4
+  %b.7 = load float, float addrspace(3)* %adr.b.7, align 4
+  %c.7 = load float, float addrspace(3)* %adr.c.7, align 4
+  %a.8 = load float, float addrspace(3)* %adr.a.8, align 4
+  %b.8 = load float, float addrspace(3)* %adr.b.8, align 4
+  %c.8 = load float, float addrspace(3)* %adr.c.8, align 4
+  %a.9 = load float, float addrspace(3)* %adr.a.9, align 4
+  %b.9 = load float, float addrspace(3)* %adr.b.9, align 4
+  %c.9 = load float, float addrspace(3)* %adr.c.9, align 4
+  %a.10 = load float, float addrspace(3)* %adr.a.10, align 4
+  %b.10 = load float, float addrspace(3)* %adr.b.10, align 4
+  %c.10 = load float, float addrspace(3)* %adr.c.10, align 4
+  %a.11 = load float, float addrspace(3)* %adr.a.11, align 4
+  %b.11 = load float, float addrspace(3)* %adr.b.11, align 4
+  %c.11 = load float, float addrspace(3)* %adr.c.11, align 4
+  %a.12 = load float, float addrspace(3)* %adr.a.12, align 4
+  %b.12 = load float, float addrspace(3)* %adr.b.12, align 4
+  %c.12 = load float, float addrspace(3)* %adr.c.12, align 4
+  %a.13 = load float, float addrspace(3)* %adr.a.13, align 4
+  %b.13 = load float, float addrspace(3)* %adr.b.13, align 4
+  %c.13 = load float, float addrspace(3)* %adr.c.13, align 4
+  %a.14 = load float, float addrspace(3)* %adr.a.14, align 4
+  %b.14 = load float, float addrspace(3)* %adr.b.14, align 4
+  %c.14 = load float, float addrspace(3)* %adr.c.14, align 4
+  %a.15 = load float, float addrspace(3)* %adr.a.15, align 4
+  %b.15 = load float, float addrspace(3)* %adr.b.15, align 4
+  %c.15 = load float, float addrspace(3)* %adr.c.15, align 4
+  %a.16 = load float, float addrspace(3)* %adr.a.16, align 4
+  %b.16 = load float, float addrspace(3)* %adr.b.16, align 4
+  %c.16 = load float, float addrspace(3)* %adr.c.16, align 4
+  %a.17 = load float, float addrspace(3)* %adr.a.17, align 4
+  %b.17 = load float, float addrspace(3)* %adr.b.17, align 4
+  %c.17 = load float, float addrspace(3)* %adr.c.17, align 4
+  %a.18 = load float, float addrspace(3)* %adr.a.18, align 4
+  %b.18 = load float, float addrspace(3)* %adr.b.18, align 4
+  %c.18 = load float, float addrspace(3)* %adr.c.18, align 4
+  %a.19 = load float, float addrspace(3)* %adr.a.19, align 4
+  %b.19 = load float, float addrspace(3)* %adr.b.19, align 4
+  %c.19 = load float, float addrspace(3)* %adr.c.19, align 4
+  %a.20 = load float, float addrspace(3)* %adr.a.20, align 4
+  %b.20 = load float, float addrspace(3)* %adr.b.20, align 4
+  %c.20 = load float, float addrspace(3)* %adr.c.20, align 4
+  %a.21 = load float, float addrspace(3)* %adr.a.21, align 4
+  %b.21 = load float, float addrspace(3)* %adr.b.21, align 4
+  %c.21 = load float, float addrspace(3)* %adr.c.21, align 4
+  %a.22 = load float, float addrspace(3)* %adr.a.22, align 4
+  %b.22 = load float, float addrspace(3)* %adr.b.22, align 4
+  %c.22 = load float, float addrspace(3)* %adr.c.22, align 4
+  %a.23 = load float, float addrspace(3)* %adr.a.23, align 4
+  %b.23 = load float, float addrspace(3)* %adr.b.23, align 4
+  %c.23 = load float, float addrspace(3)* %adr.c.23, align 4
+  %a.24 = load float, float addrspace(3)* %adr.a.24, align 4
+  %b.24 = load float, float addrspace(3)* %adr.b.24, align 4
+  %c.24 = load float, float addrspace(3)* %adr.c.24, align 4
+  %a.25 = load float, float addrspace(3)* %adr.a.25, align 4
+  %b.25 = load float, float addrspace(3)* %adr.b.25, align 4
+  %c.25 = load float, float addrspace(3)* %adr.c.25, align 4
+  %a.26 = load float, float addrspace(3)* %adr.a.26, align 4
+  %b.26 = load float, float addrspace(3)* %adr.b.26, align 4
+  %c.26 = load float, float addrspace(3)* %adr.c.26, align 4
+  %a.27 = load float, float addrspace(3)* %adr.a.27, align 4
+  %b.27 = load float, float addrspace(3)* %adr.b.27, align 4
+  %c.27 = load float, float addrspace(3)* %adr.c.27, align 4
+  %a.28 = load float, float addrspace(3)* %adr.a.28, align 4
+  %b.28 = load float, float addrspace(3)* %adr.b.28, align 4
+  %c.28 = load float, float addrspace(3)* %adr.c.28, align 4
+  %a.29 = load float, float addrspace(3)* %adr.a.29, align 4
+  %b.29 = load float, float addrspace(3)* %adr.b.29, align 4
+  %c.29 = load float, float addrspace(3)* %adr.c.29, align 4
+  %res.0 = tail call float @llvm.fmuladd.f32(float %a.0, float %b.0, float %c.0)
+  %res.1 = tail call float @llvm.fmuladd.f32(float %a.1, float %b.1, float %c.1)
+  %res.2 = tail call float @llvm.fmuladd.f32(float %a.2, float %b.2, float %c.2)
+  %res.3 = tail call float @llvm.fmuladd.f32(float %a.3, float %b.3, float %c.3)
+  %res.4 = tail call float @llvm.fmuladd.f32(float %a.4, float %b.4, float %c.4)
+  %res.5 = tail call float @llvm.fmuladd.f32(float %a.5, float %b.5, float %c.5)
+  %res.6 = tail call float @llvm.fmuladd.f32(float %a.6, float %b.6, float %c.6)
+  %res.7 = tail call float @llvm.fmuladd.f32(float %a.7, float %b.7, float %c.7)
+  %res.8 = tail call float @llvm.fmuladd.f32(float %a.8, float %b.8, float %c.8)
+  %res.9 = tail call float @llvm.fmuladd.f32(float %a.9, float %b.9, float %c.9)
+  %res.10 = tail call float @llvm.fmuladd.f32(float %a.10, float %b.10, float %c.10)
+  %res.11 = tail call float @llvm.fmuladd.f32(float %a.11, float %b.11, float %c.11)
+  %res.12 = tail call float @llvm.fmuladd.f32(float %a.12, float %b.12, float %c.12)
+  %res.13 = tail call float @llvm.fmuladd.f32(float %a.13, float %b.13, float %c.13)
+  %res.14 = tail call float @llvm.fmuladd.f32(float %a.14, float %b.14, float %c.14)
+  %res.15 = tail call float @llvm.fmuladd.f32(float %a.15, float %b.15, float %c.15)
+  %res.16 = tail call float @llvm.fmuladd.f32(float %a.16, float %b.16, float %c.16)
+  %res.17 = tail call float @llvm.fmuladd.f32(float %a.17, float %b.17, float %c.17)
+  %res.18 = tail call float @llvm.fmuladd.f32(float %a.18, float %b.18, float %c.18)
+  %res.19 = tail call float @llvm.fmuladd.f32(float %a.19, float %b.19, float %c.19)
+  %res.20 = tail call float @llvm.fmuladd.f32(float %a.20, float %b.20, float %c.20)
+  %res.21 = tail call float @llvm.fmuladd.f32(float %a.21, float %b.21, float %c.21)
+  %res.22 = tail call float @llvm.fmuladd.f32(float %a.22, float %b.22, float %c.22)
+  %res.23 = tail call float @llvm.fmuladd.f32(float %a.23, float %b.23, float %c.23)
+  %res.24 = tail call float @llvm.fmuladd.f32(float %a.24, float %b.24, float %c.24)
+  %res.25 = tail call float @llvm.fmuladd.f32(float %a.25, float %b.25, float %c.25)
+  %res.26 = tail call float @llvm.fmuladd.f32(float %a.26, float %b.26, float %c.26)
+  %res.27 = tail call float @llvm.fmuladd.f32(float %a.27, float %b.27, float %c.27)
+  %res.28 = tail call float @llvm.fmuladd.f32(float %a.28, float %b.28, float %c.28)
+  %res.29 = tail call float @llvm.fmuladd.f32(float %a.29, float %b.29, float %c.29)
+  %adr.res.0 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 0
+  %adr.res.1 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 2
+  %adr.res.2 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 4
+  %adr.res.3 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 6
+  %adr.res.4 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 8
+  %adr.res.5 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 10
+  %adr.res.6 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 12
+  %adr.res.7 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 14
+  %adr.res.8 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 16
+  %adr.res.9 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 18
+  %adr.res.10 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 20
+  %adr.res.11 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 22
+  %adr.res.12 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 24
+  %adr.res.13 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 26
+  %adr.res.14 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 28
+  %adr.res.15 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 30
+  %adr.res.16 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 32
+  %adr.res.17 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 34
+  %adr.res.18 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 36
+  %adr.res.19 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 38
+  %adr.res.20 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 40
+  %adr.res.21 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 42
+  %adr.res.22 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 44
+  %adr.res.23 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 46
+  %adr.res.24 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 48
+  %adr.res.25 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 50
+  %adr.res.26 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 52
+  %adr.res.27 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 54
+  %adr.res.28 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 56
+  %adr.res.29 = getelementptr inbounds float, float addrspace(1)* %out_arg, i64 58
+  store float %res.0, float addrspace(1)* %adr.res.0, align 4
+  store float %res.1, float addrspace(1)* %adr.res.1, align 4
+  store float %res.2, float addrspace(1)* %adr.res.2, align 4
+  store float %res.3, float addrspace(1)* %adr.res.3, align 4
+  store float %res.4, float addrspace(1)* %adr.res.4, align 4
+  store float %res.5, float addrspace(1)* %adr.res.5, align 4
+  store float %res.6, float addrspace(1)* %adr.res.6, align 4
+  store float %res.7, float addrspace(1)* %adr.res.7, align 4
+  store float %res.8, float addrspace(1)* %adr.res.8, align 4
+  store float %res.9, float addrspace(1)* %adr.res.9, align 4
+  store float %res.10, float addrspace(1)* %adr.res.10, align 4
+  store float %res.11, float addrspace(1)* %adr.res.11, align 4
+  store float %res.12, float addrspace(1)* %adr.res.12, align 4
+  store float %res.13, float addrspace(1)* %adr.res.13, align 4
+  store float %res.14, float addrspace(1)* %adr.res.14, align 4
+  store float %res.15, float addrspace(1)* %adr.res.15, align 4
+  store float %res.16, float addrspace(1)* %adr.res.16, align 4
+  store float %res.17, float addrspace(1)* %adr.res.17, align 4
+  store float %res.18, float addrspace(1)* %adr.res.18, align 4
+  store float %res.19, float addrspace(1)* %adr.res.19, align 4
+  store float %res.20, float addrspace(1)* %adr.res.20, align 4
+  store float %res.21, float addrspace(1)* %adr.res.21, align 4
+  store float %res.22, float addrspace(1)* %adr.res.22, align 4
+  store float %res.23, float addrspace(1)* %adr.res.23, align 4
+  store float %res.24, float addrspace(1)* %adr.res.24, align 4
+  store float %res.25, float addrspace(1)* %adr.res.25, align 4
+  store float %res.26, float addrspace(1)* %adr.res.26, align 4
+  store float %res.27, float addrspace(1)* %adr.res.27, align 4
+  store float %res.28, float addrspace(1)* %adr.res.28, align 4
+  store float %res.29, float addrspace(1)* %adr.res.29, align 4
+  ret void
+}
+declare float @llvm.fmuladd.f32(float, float, float) #0
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/schedule-regpressure.mir b/test/CodeGen/AMDGPU/schedule-regpressure.mir
new file mode 100644
index 0000000000000000000000000000000000000000..c71de87eeecee686bc27bb8ef29608605e729372
--- /dev/null
+++ b/test/CodeGen/AMDGPU/schedule-regpressure.mir
@@ -0,0 +1,57 @@
+# RUN: llc -march=amdgcn -misched=converge -run-pass machine-scheduler %s -o - -debug-only=misched 2>&1 | FileCheck %s
+# REQUIRES: asserts
+
+# Check that the implicit M0 use on DS_* instructions does not create any SReg_32 pressure.
+
+# CHECK: ScheduleDAGMILive::schedule starting
+# CHECK: SU({{.*}} = DS_READ_B32 {{.*}} %M0<imp-use>, %EXEC<imp-use>
+# CHECK: Pressure Diff : {{$}}
+# CHECK: SU({{.*}} DS_WRITE_B32
+
+---
+name:            mo_pset
+alignment:       0
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: sreg_128 }
+  - { id: 1, class: sgpr_64 }
+  - { id: 2, class: sreg_32_xm0 }
+  - { id: 3, class: sgpr_32 }
+  - { id: 4, class: vgpr_32 }
+  - { id: 5, class: sreg_32_xm0_xexec }
+  - { id: 6, class: vgpr_32 }
+  - { id: 7, class: vgpr_32 }
+  - { id: 8, class: vgpr_32 }
+liveins:
+  - { reg: '%sgpr4_sgpr5', virtual-reg: '%1' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0:
+    liveins: %sgpr4_sgpr5
+
+    %1 = COPY %sgpr4_sgpr5
+    %5 = S_LOAD_DWORD_IMM %1, 0, 0 :: (non-temporal dereferenceable invariant load 4 from `i32 addrspace(2)* undef`)
+    %m0 = S_MOV_B32 -1
+    %7 = COPY %5
+    %6 = DS_READ_B32 %7, 0, 0, implicit %m0, implicit %exec
+    DS_WRITE_B32 %7, %6, 4, 0, implicit killed %m0, implicit %exec
+    S_ENDPGM
+
+...
diff --git a/test/CodeGen/AMDGPU/scratch-buffer.ll b/test/CodeGen/AMDGPU/scratch-buffer.ll
index 94101f0b92b6b843c6338f952385703f4b2c9a45..6b1e85915a110f163cf5c46d8d150c005b6410b8 100644
--- a/test/CodeGen/AMDGPU/scratch-buffer.ll
+++ b/test/CodeGen/AMDGPU/scratch-buffer.ll
@@ -9,11 +9,11 @@
 ; should be able to reuse the same regiser for each scratch buffer access.
 
 ; GCN-LABEL: {{^}}legal_offset_fi:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+$}}
-; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8000
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offset:4{{$}}
+; GCN: v_mov_b32_e32 [[OFFSET:v[0-9]+]], 0x8004
 ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
 
-define void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) {
+define amdgpu_kernel void @legal_offset_fi(i32 addrspace(1)* %out, i32 %cond, i32 %if_offset, i32 %else_offset) {
 entry:
   %scratch0 = alloca [8192 x i32]
   %scratch1 = alloca [8192 x i32]
@@ -49,11 +49,11 @@ done:
 ; GCN-LABEL: {{^}}legal_offset_fi_offset:
 ; GCN-DAG: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
 ; This constant isn't folded, because it has multiple uses.
-; GCN-DAG: v_mov_b32_e32 [[K8000:v[0-9]+]], 0x8000
+; GCN-DAG: v_mov_b32_e32 [[K8000:v[0-9]+]], 0x8004
 ; GCN-DAG: v_add_i32_e32 [[OFFSET:v[0-9]+]], vcc, [[K8000]]
 ; GCN: buffer_store_dword v{{[0-9]+}}, [[OFFSET]], s[{{[0-9]+}}:{{[0-9]+}}], s{{[0-9]+}} offen{{$}}
 
-define void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) {
+define amdgpu_kernel void @legal_offset_fi_offset(i32 addrspace(1)* %out, i32 %cond, i32 addrspace(1)* %offsets, i32 %if_offset, i32 %else_offset) {
 entry:
   %scratch0 = alloca [8192 x i32]
   %scratch1 = alloca [8192 x i32]
@@ -88,7 +88,7 @@ done:
 
 ; GCN-LABEL: {{^}}neg_vaddr_offset:
 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{[0-9]+}}, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offen offset:16{{$}}
-define void @neg_vaddr_offset(i32 %offset) {
+define amdgpu_kernel void @neg_vaddr_offset(i32 %offset) {
 entry:
   %array = alloca [8192 x i32]
   %ptr_offset = add i32 %offset, 4
@@ -98,8 +98,8 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}pos_vaddr_offset:
-; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:16
-define void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) {
+; GCN: buffer_store_dword v{{[0-9]+}}, off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:20
+define amdgpu_kernel void @pos_vaddr_offset(i32 addrspace(1)* %out, i32 %offset) {
 entry:
   %array = alloca [8192 x i32]
   %ptr = getelementptr [8192 x i32], [8192 x i32]* %array, i32 0, i32 4
diff --git a/test/CodeGen/AMDGPU/sdiv.ll b/test/CodeGen/AMDGPU/sdiv.ll
index bafd6a50ccfe9aee277a7e0083a78627983ddc5f..f9ac425be79428cc56f20e250fa045950e1a0842 100644
--- a/test/CodeGen/AMDGPU/sdiv.ll
+++ b/test/CodeGen/AMDGPU/sdiv.ll
@@ -13,7 +13,7 @@
 
 ; FUNC-LABEL: {{^}}sdiv_i32:
 ; EG: CF_END
-define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in
   %den = load i32, i32 addrspace(1) * %den_ptr
@@ -23,7 +23,7 @@ define void @sdiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 }
 
 ; FUNC-LABEL: {{^}}sdiv_i32_4:
-define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %num = load i32, i32 addrspace(1) * %in
   %result = sdiv i32 %num, 4
   store i32 %result, i32 addrspace(1)* %out
@@ -43,14 +43,14 @@ define void @sdiv_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; SI: v_add_i32
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @slow_sdiv_i32_3435(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %num = load i32, i32 addrspace(1) * %in
   %result = sdiv i32 %num, 3435
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
 
-define void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
@@ -59,14 +59,14 @@ define void @sdiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
   ret void
 }
 
-define void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %result = sdiv <2 x i32> %num, <i32 4, i32 4>
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
 }
 
-define void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr
@@ -75,7 +75,7 @@ define void @sdiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
   ret void
 }
 
-define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %result = sdiv <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
@@ -86,7 +86,7 @@ define void @sdiv_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)*
 ; SI: v_rcp_f32
 ; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 8
 ; SI: buffer_store_dword [[BFE]]
-define void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
   %num = load i8, i8 addrspace(1) * %in
   %den = load i8, i8 addrspace(1) * %den_ptr
@@ -100,7 +100,7 @@ define void @v_sdiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 ; SI: v_rcp_f32
 ; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 23
 ; SI: buffer_store_dword [[BFE]]
-define void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
   %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
   %num = load i23, i23 addrspace(1) * %in
   %den = load i23, i23 addrspace(1) * %den_ptr
@@ -114,7 +114,7 @@ define void @v_sdiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
 ; SI: v_rcp_f32
 ; SI: v_bfe_i32 [[BFE:v[0-9]+]], v{{[0-9]+}}, 0, 24
 ; SI: buffer_store_dword [[BFE]]
-define void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
   %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
   %num = load i24, i24 addrspace(1) * %in
   %den = load i24, i24 addrspace(1) * %den_ptr
@@ -126,7 +126,7 @@ define void @v_sdiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
 
 ; FUNC-LABEL: {{^}}v_sdiv_i25:
 ; SI-NOT: v_rcp_f32
-define void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
   %den_ptr = getelementptr i25, i25 addrspace(1)* %in, i25 1
   %num = load i25, i25 addrspace(1) * %in
   %den = load i25, i25 addrspace(1) * %den_ptr
@@ -137,19 +137,19 @@ define void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
 }
 
 ; Tests for 64-bit divide bypass.
-; define void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+; define amdgpu_kernel void @test_get_quotient(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 ;   %result = sdiv i64 %a, %b
 ;   store i64 %result, i64 addrspace(1)* %out, align 8
 ;   ret void
 ; }
 
-; define void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+; define amdgpu_kernel void @test_get_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 ;   %result = srem i64 %a, %b
 ;   store i64 %result, i64 addrspace(1)* %out, align 8
 ;   ret void
 ; }
 
-; define void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+; define amdgpu_kernel void @test_get_quotient_and_remainder(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 ;   %resultdiv = sdiv i64 %a, %b
 ;   %resultrem = srem i64 %a, %b
 ;   %result = add i64 %resultdiv, %resultrem
@@ -163,7 +163,7 @@ define void @v_sdiv_i25(i32 addrspace(1)* %out, i25 addrspace(1)* %in) {
 ; SI: v_mul_hi_i32
 ; SI: v_mul_hi_i32
 
-define void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
+define amdgpu_kernel void @scalarize_mulhs_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
   %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
   %2 = sdiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
   store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
diff --git a/test/CodeGen/AMDGPU/sdivrem24.ll b/test/CodeGen/AMDGPU/sdivrem24.ll
index 349a7821da17e341b840a5ca6d63f9907f3a24ca..257e6be96b658bf8bbe4ed313e961b26150b60f0 100644
--- a/test/CodeGen/AMDGPU/sdivrem24.ll
+++ b/test/CodeGen/AMDGPU/sdivrem24.ll
@@ -12,7 +12,7 @@
 ; EG-DAG: INT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
-define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
   %num = load i8, i8 addrspace(1) * %in
   %den = load i8, i8 addrspace(1) * %den_ptr
@@ -31,7 +31,7 @@ define void @sdiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
 ; EG-DAG: INT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
-define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
   %num = load i16, i16 addrspace(1) * %in, align 2
   %den = load i16, i16 addrspace(1) * %den_ptr, align 2
@@ -50,7 +50,7 @@ define void @sdiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
 ; EG-DAG: INT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
-define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -69,7 +69,7 @@ define void @sdiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -88,7 +88,7 @@ define void @sdiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -107,7 +107,7 @@ define void @test_no_sdiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -130,7 +130,7 @@ define void @test_no_sdiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 ; EG-DAG: INT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
-define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
   %num = load i8, i8 addrspace(1) * %in
   %den = load i8, i8 addrspace(1) * %den_ptr
@@ -149,7 +149,7 @@ define void @srem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
 ; EG-DAG: INT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
-define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
   %num = load i16, i16 addrspace(1) * %in, align 2
   %den = load i16, i16 addrspace(1) * %den_ptr, align 2
@@ -168,7 +168,7 @@ define void @srem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
 ; EG-DAG: INT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_INT
-define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -187,7 +187,7 @@ define void @srem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -206,7 +206,7 @@ define void @no_srem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -225,7 +225,7 @@ define void @no_sdiv25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -244,7 +244,7 @@ define void @no_sdiv25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -263,7 +263,7 @@ define void @no_srem25_i24_i25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in
 
 ; EG-NOT: INT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -283,7 +283,7 @@ define void @no_srem25_i25_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in
 
 ; EG: INT_TO_FLT
 ; EG: RECIP_IEEE
-define void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -303,7 +303,7 @@ define void @srem25_i24_i11_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 
 ; EG: INT_TO_FLT
 ; EG: RECIP_IEEE
-define void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -323,7 +323,7 @@ define void @srem25_i11_i24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 
 ; EG: INT_TO_FLT
 ; EG: RECIP_IEEE
-define void @srem25_i17_i12_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem25_i17_i12_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
diff --git a/test/CodeGen/AMDGPU/sdivrem64.ll b/test/CodeGen/AMDGPU/sdivrem64.ll
index 28fdb69e1ada824ae4937b5e30612b1411e597e5..5ad0d8efaed3f2c5cc83ff932237873376928474 100644
--- a/test/CodeGen/AMDGPU/sdivrem64.ll
+++ b/test/CodeGen/AMDGPU/sdivrem64.ll
@@ -70,7 +70,7 @@
 ; SI-NOT: v_lshr_b64
 ; VI-NOT: v_lshrrev_b64
 ; GCN: s_endpgm
-define void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = sdiv i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -144,7 +144,7 @@ define void @s_test_sdiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = urem i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -159,7 +159,7 @@ define void @s_test_srem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = ashr i64 %x, 33
   %2 = ashr i64 %y, 33
   %result = sdiv i64 %1, %2
@@ -176,7 +176,7 @@ define void @test_sdiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = ashr i64 %x, 33
   %2 = ashr i64 %y, 33
   %result = srem i64 %1, %2
@@ -196,7 +196,7 @@ define void @test_srem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = ashr i64 %x, 40
   %2 = ashr i64 %y, 40
   %result = sdiv i64 %1, %2
@@ -216,7 +216,7 @@ define void @test_sdiv2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_srem2464(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = ashr i64 %x, 40
   %2 = ashr i64 %y, 40
   %result = srem i64 %1, %2
diff --git a/test/CodeGen/AMDGPU/sdwa-peephole.ll b/test/CodeGen/AMDGPU/sdwa-peephole.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1e0ac3807528000e7ea1134770ef31d34746bbbb
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sdwa-peephole.ll
@@ -0,0 +1,395 @@
+; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole=0 -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=NOSDWA -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=fiji -amdgpu-sdwa-peephole -mattr=-fp64-fp16-denormals -verify-machineinstrs < %s | FileCheck -check-prefix=SDWA -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}add_shr_i32:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_add_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
+; NOSDWA-NOT: v_add_i32_sdwa
+
+; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+define amdgpu_kernel void @add_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %a = load i32, i32 addrspace(1)* %in, align 4
+  %shr = lshr i32 %a, 16
+  %add = add i32 %a, %shr
+  store i32 %add, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sub_shr_i32:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_subrev_i32_e32 v{{[0-9]+}}, vcc, v{{[0-9]+}}, v[[DST]]
+; NOSDWA-NOT: v_subrev_i32_sdwa
+
+; SDWA: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+define amdgpu_kernel void @sub_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+  %a = load i32, i32 addrspace(1)* %in, align 4
+  %shr = lshr i32 %a, 16
+  %sub = sub i32 %shr, %a
+  store i32 %sub, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_shr_i32:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v[[DST1]], v[[DST0]]
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+
+define amdgpu_kernel void @mul_shr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in1, i32 addrspace(1)* %in2) {
+  %a = load i32, i32 addrspace(1)* %in1, align 4
+  %b = load i32, i32 addrspace(1)* %in2, align 4
+  %shra = lshr i32 %a, 16
+  %shrb = lshr i32 %b, 16
+  %mul = mul i32 %shra, %shrb
+  store i32 %mul, i32 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_i16:
+; NOSDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-NOT: v_mul_u32_u24_sdwa
+
+define amdgpu_kernel void @mul_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %ina, i16 addrspace(1)* %inb) {
+entry:
+  %a = load i16, i16 addrspace(1)* %ina, align 4
+  %b = load i16, i16 addrspace(1)* %inb, align 4
+  %mul = mul i16 %a, %b
+  store i16 %mul, i16 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v2i16:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
+define amdgpu_kernel void @mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
+  %mul = mul <2 x i16> %a, %b
+  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v4i16:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
+define amdgpu_kernel void @mul_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %ina, <4 x i16> addrspace(1)* %inb) {
+entry:
+  %a = load <4 x i16>, <4 x i16> addrspace(1)* %ina, align 4
+  %b = load <4 x i16>, <4 x i16> addrspace(1)* %inb, align 4
+  %mul = mul <4 x i16> %a, %b
+  store <4 x i16> %mul, <4 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v8i16:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL0:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL1:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL2:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL3:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL4:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL5:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL6:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; SDWA-DAG: v_mul_u32_u24_sdwa v[[DST_MUL7:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL7]], v[[DST_MUL6]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL5]], v[[DST_MUL4]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL3]], v[[DST_MUL2]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+; SDWA-DAG: v_or_b32_sdwa v{{[0-9]+}}, v[[DST_MUL1]], v[[DST_MUL0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
+define amdgpu_kernel void @mul_v8i16(<8 x i16> addrspace(1)* %out, <8 x i16> addrspace(1)* %ina, <8 x i16> addrspace(1)* %inb) {
+entry:
+  %a = load <8 x i16>, <8 x i16> addrspace(1)* %ina, align 4
+  %b = load <8 x i16>, <8 x i16> addrspace(1)* %inb, align 4
+  %mul = mul <8 x i16> %a, %b
+  store <8 x i16> %mul, <8 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_half:
+; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+; SDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-NOT: v_mul_f16_sdwa
+
+define amdgpu_kernel void @mul_half(half addrspace(1)* %out, half addrspace(1)* %ina, half addrspace(1)* %inb) {
+entry:
+  %a = load half, half addrspace(1)* %ina, align 4
+  %b = load half, half addrspace(1)* %inb, align 4
+  %mul = fmul half %a, %b
+  store half %mul, half addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v2half:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mul_f16_e32 v[[DST_MUL:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MUL]]
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+
+; SDWA-DAG: v_mul_f16_sdwa v[[DST_MUL_HI:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_f16_e32 v[[DST_MUL_LO:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_MUL_HI]], v[[DST_MUL_LO]]
+define amdgpu_kernel void @mul_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
+  %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
+  %mul = fmul <2 x half> %a, %b
+  store <2 x half> %mul, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v4half:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+
+; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+define amdgpu_kernel void @mul_v4half(<4 x half> addrspace(1)* %out, <4 x half> addrspace(1)* %ina, <4 x half> addrspace(1)* %inb) {
+entry:
+  %a = load <4 x half>, <4 x half> addrspace(1)* %ina, align 4
+  %b = load <4 x half>, <4 x half> addrspace(1)* %inb, align 4
+  %mul = fmul <4 x half> %a, %b
+  store <4 x half> %mul, <4 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v8half:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_f16_sdwa
+
+; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_mul_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-DAG: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+define amdgpu_kernel void @mul_v8half(<8 x half> addrspace(1)* %out, <8 x half> addrspace(1)* %ina, <8 x half> addrspace(1)* %inb) {
+entry:
+  %a = load <8 x half>, <8 x half> addrspace(1)* %ina, align 4
+  %b = load <8 x half>, <8 x half> addrspace(1)* %inb, align 4
+  %mul = fmul <8 x half> %a, %b
+  store <8 x half> %mul, <8 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_i8:
+; NOSDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+; SDWA: v_mul_lo_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SDWA-NOT: v_mul_u32_u24_sdwa
+
+define amdgpu_kernel void @mul_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %ina, i8 addrspace(1)* %inb) {
+entry:
+  %a = load i8, i8 addrspace(1)* %ina, align 4
+  %b = load i8, i8 addrspace(1)* %inb, align 4
+  %mul = mul i8 %a, %b
+  store i8 %mul, i8 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v2i8:
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:BYTE_1
+
+define amdgpu_kernel void @mul_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %ina, <2 x i8> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x i8>, <2 x i8> addrspace(1)* %ina, align 4
+  %b = load <2 x i8>, <2 x i8> addrspace(1)* %inb, align 4
+  %mul = mul <2 x i8> %a, %b
+  store <2 x i8> %mul, <2 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v4i8:
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+
+define amdgpu_kernel void @mul_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> addrspace(1)* %ina, <4 x i8> addrspace(1)* %inb) {
+entry:
+  %a = load <4 x i8>, <4 x i8> addrspace(1)* %ina, align 4
+  %b = load <4 x i8>, <4 x i8> addrspace(1)* %inb, align 4
+  %mul = mul <4 x i8> %a, %b
+  store <4 x i8> %mul, <4 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}mul_v8i8:
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b16_e32 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+; SDWA-DAG: v_mul_u32_u24_sdwa
+
+define amdgpu_kernel void @mul_v8i8(<8 x i8> addrspace(1)* %out, <8 x i8> addrspace(1)* %ina, <8 x i8> addrspace(1)* %inb) {
+entry:
+  %a = load <8 x i8>, <8 x i8> addrspace(1)* %ina, align 4
+  %b = load <8 x i8>, <8 x i8> addrspace(1)* %inb, align 4
+  %mul = mul <8 x i8> %a, %b
+  store <8 x i8> %mul, <8 x i8> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sitofp_v2i16_to_v2f16:
+; NOSDWA-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; NOSDWA-DAG: v_ashrrev_i32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA-DAG: v_cvt_f32_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-DAG: v_cvt_f32_i32_e32 v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_cvt_f32_i32_sdwa
+
+; SDWA-DAG: v_cvt_f32_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+; SDWA-DAG: v_cvt_f32_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+
+define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
+    <2 x half> addrspace(1)* %r,
+    <2 x i16> addrspace(1)* %a) {
+entry:
+  %a.val = load <2 x i16>, <2 x i16> addrspace(1)* %a
+  %r.val = sitofp <2 x i16> %a.val to <2 x half>
+  store <2 x half> %r.val, <2 x half> addrspace(1)* %r
+  ret void
+}
+
+
+; GCN-LABEL: {{^}}mac_v2half:
+; NOSDWA: v_lshrrev_b32_e32 v[[DST0:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v[[DST1:[0-9]+]], 16, v{{[0-9]+}}
+; NOSDWA: v_mac_f16_e32 v[[DST_MAC:[0-9]+]], v[[DST1]], v[[DST0]]
+; NOSDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v[[DST_SHL]], v{{[0-9]+}}
+; NOSDWA-NOT: v_mac_f16_sdwa
+
+; SDWA: v_mac_f16_sdwa v[[DST_MAC:[0-9]+]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; SDWA: v_lshlrev_b32_e32 v[[DST_SHL:[0-9]+]], 16, v[[DST_MAC]]
+
+define amdgpu_kernel void @mac_v2half(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %ina, <2 x half> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x half>, <2 x half> addrspace(1)* %ina, align 4
+  %b = load <2 x half>, <2 x half> addrspace(1)* %inb, align 4
+  %mul = fmul <2 x half> %a, %b
+  %mac = fadd <2 x half> %mul, %b
+  store <2 x half> %mac, <2 x half> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}immediate_mul_v2i16:
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+; SDWA-NOT: v_mul_u32_u24_sdwa
+
+define amdgpu_kernel void @immediate_mul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+entry:
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in, align 4
+  %mul = mul <2 x i16> %a, <i16 123, i16 321>
+  store <2 x i16> %mul, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; Double use of same src - should not convert it
+; GCN-LABEL: {{^}}mulmul_v2i16:
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_mul_u32_u24_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; NOSDWA: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; NOSDWA-NOT: v_mul_u32_u24_sdwa
+
+; SDWA: v_mul_u32_u24_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+
+define amdgpu_kernel void @mulmul_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
+  %mul = mul <2 x i16> %a, %b
+  %mul2 = mul <2 x i16> %mul, %b
+  store <2 x i16> %mul2, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}add_bb_v2i16:
+; NOSDWA-NOT: v_add_i32_sdwa
+
+; SDWA: v_add_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+
+define amdgpu_kernel void @add_bb_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %ina, <2 x i16> addrspace(1)* %inb) {
+entry:
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %ina, align 4
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %inb, align 4
+  br label %add_label
+add_label:
+  %add = add <2 x i16> %a, %b
+  br label %store_label
+store_label:
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll b/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
index 559d464f36a5bb6aaa25df641f5e0a2bce34c6db..c8839c17365e81c254a844a45deedac56e84bffd 100644
--- a/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
+++ b/test/CodeGen/AMDGPU/select-fabs-fneg-extract-legacy.ll
@@ -11,7 +11,7 @@
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc
 ; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
 ; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
-define void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 {
+define amdgpu_kernel void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -29,7 +29,7 @@ define void @select_fneg_posk_src_rcp_legacy_f32(i32 %c) #2 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[MUL]], vcc
 ; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
 ; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
-define void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 {
+define amdgpu_kernel void @select_fneg_posk_src_mul_legacy_f32(i32 %c) #2 {
   %x = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
   %mul = call float @llvm.amdgcn.fmul.legacy(float %x, float 4.0)
diff --git a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
index 15cfd8bfe8d0b05ae8be28fa805ef69ca65d8eb6..3417eb02b3614c819c394ddf7c35974fe34122a4 100644
--- a/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
+++ b/test/CodeGen/AMDGPU/select-fabs-fneg-extract.ll
@@ -8,7 +8,7 @@
 
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
 ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]]
-define void @add_select_fabs_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_fabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -30,7 +30,7 @@ define void @add_select_fabs_fabs_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
 ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]]
 ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[X]]|, [[W]]
-define void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -57,7 +57,7 @@ define void @add_select_multi_use_lhs_fabs_fabs_f32(i32 %c) #0 {
 
 ; GCN: buffer_store_dword [[ADD]]
 ; GCN: buffer_store_dword [[X_ABS]]
-define void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -80,7 +80,7 @@ define void @add_select_multi_store_use_lhs_fabs_fabs_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
 ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Z]]
 ; GCN-DAG: v_add_f32_e64 v{{[0-9]+}}, |[[Y]]|, [[W]]
-define void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -104,7 +104,7 @@ define void @add_select_multi_use_rhs_fabs_fabs_f32(i32 %c) #0 {
 ; GCN: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_ABS]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
-define void @add_select_fabs_var_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_var_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -123,7 +123,7 @@ define void @add_select_fabs_var_f32(i32 %c) #0 {
 ; GCN: v_and_b32_e32 [[FABS_X:v[0-9]+]], 0x7fffffff, [[X]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
-define void @add_select_fabs_negk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_negk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -140,7 +140,7 @@ define void @add_select_fabs_negk_f32(i32 %c) #0 {
 
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
 ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[X]]
-define void @add_select_fabs_negk_negk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_negk_negk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, float -2.0, float -1.0
@@ -155,7 +155,7 @@ define void @add_select_fabs_negk_negk_f32(i32 %c) #0 {
 
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], 1.0, 2.0, s
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
-define void @add_select_posk_posk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_posk_posk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, float 2.0, float 1.0
@@ -172,7 +172,7 @@ define void @add_select_posk_posk_f32(i32 %c) #0 {
 ; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[FABS_X]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
-define void @add_select_negk_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negk_fabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -192,7 +192,7 @@ define void @add_select_negk_fabs_f32(i32 %c) #0 {
 ; GCN-DAG: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[FABS_X]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Y]], [[SELECT]]
-define void @add_select_negliteralk_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negliteralk_fabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -209,7 +209,7 @@ define void @add_select_negliteralk_fabs_f32(i32 %c) #0 {
 
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
 ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]]
-define void @add_select_fabs_posk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_posk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
 
@@ -228,7 +228,7 @@ define void @add_select_fabs_posk_f32(i32 %c) #0 {
 ; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
 ; GCN: v_add_f32_e64 v{{[0-9]+}}, |[[SELECT]]|, [[Y]]
-define void @add_select_posk_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_posk_fabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -246,7 +246,7 @@ define void @add_select_posk_fabs_f32(i32 %c) #0 {
 
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
-define void @add_select_fneg_fneg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_fneg_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -268,7 +268,7 @@ define void @add_select_fneg_fneg_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
 ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
 ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[X]], [[W]]
-define void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -295,7 +295,7 @@ define void @add_select_multi_use_lhs_fneg_fneg_f32(i32 %c) #0 {
 
 ; GCN: buffer_store_dword [[ADD]]
 ; GCN: buffer_store_dword [[NEG_X]]
-define void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -318,7 +318,7 @@ define void @add_select_multi_store_use_lhs_fneg_fneg_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X]], vcc
 ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
 ; GCN-DAG: v_subrev_f32_e32 v{{[0-9]+}}, [[Y]], [[W]]
-define void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -342,7 +342,7 @@ define void @add_select_multi_use_rhs_fneg_fneg_f32(i32 %c) #0 {
 ; GCN: v_xor_b32_e32 [[X_NEG:v[0-9]+]], 0x80000000, [[X]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y]], [[X_NEG]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
-define void @add_select_fneg_var_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_var_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -360,7 +360,7 @@ define void @add_select_fneg_var_f32(i32 %c) #0 {
 
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
-define void @add_select_fneg_negk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_negk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -372,13 +372,13 @@ define void @add_select_fneg_negk_f32(i32 %c) #0 {
 }
 
 ; GCN-LABEL: {{^}}add_select_fneg_inv2pi_f32:
-; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
 ; GCN: buffer_load_dword [[X:v[0-9]+]]
 ; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0xbe22f983
 
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
-define void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -390,15 +390,15 @@ define void @add_select_fneg_inv2pi_f32(i32 %c) #0 {
 }
 
 ; GCN-LABEL: {{^}}add_select_fneg_neginv2pi_f32:
-; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983
 ; GCN: buffer_load_dword [[X:v[0-9]+]]
 ; GCN: buffer_load_dword [[Y:v[0-9]+]]
+; SI: v_mov_b32_e32 [[K:v[0-9]+]], 0x3e22f983
 
 ; SI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K]], [[X]], vcc
 ; VI: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 0.15915494, [[X]], vcc
 
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
-define void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -415,7 +415,7 @@ define void @add_select_fneg_neginv2pi_f32(i32 %c) #0 {
 ; GCN: v_cmp_eq_u32_e64
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
-define void @add_select_negk_negk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negk_negk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, float -2.0, float -1.0
@@ -432,7 +432,7 @@ define void @add_select_negk_negk_f32(i32 %c) #0 {
 ; GCN: v_cmp_eq_u32_e64
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[K1]], [[K0]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[X]], [[SELECT]]
-define void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, float -2048.0, float -4096.0
@@ -446,7 +446,7 @@ define void @add_select_negliteralk_negliteralk_f32(i32 %c) #0 {
 
 ; GCN: v_cndmask_b32_e64 [[SELECT:v[0-9]+]], -1.0, -2.0, s
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[X]]
-define void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, float -2.0, float -1.0
@@ -463,7 +463,7 @@ define void @add_select_fneg_negk_negk_f32(i32 %c) #0 {
 ; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 1.0, [[X]], vcc
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
-define void @add_select_negk_fneg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negk_fneg_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -480,7 +480,7 @@ define void @add_select_negk_fneg_f32(i32 %c) #0 {
 
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
-define void @add_select_fneg_posk_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fneg_posk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -498,7 +498,7 @@ define void @add_select_fneg_posk_f32(i32 %c) #0 {
 ; GCN: v_cmp_ne_u32_e64 vcc, s{{[0-9]+}}, 0
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -1.0, [[X]], vcc
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Y]]
-define void @add_select_posk_fneg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_posk_fneg_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -518,7 +518,7 @@ define void @add_select_posk_fneg_f32(i32 %c) #0 {
 ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG_ABS]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
-define void @add_select_negfabs_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negfabs_fabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -541,7 +541,7 @@ define void @add_select_negfabs_fabs_f32(i32 %c) #0 {
 ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG_ABS]], [[X_ABS]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
-define void @add_select_fabs_negfabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_negfabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -564,7 +564,7 @@ define void @add_select_fabs_negfabs_f32(i32 %c) #0 {
 ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X_NEG]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
-define void @add_select_neg_fabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_neg_fabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -586,7 +586,7 @@ define void @add_select_neg_fabs_f32(i32 %c) #0 {
 ; GCN-DAG: v_xor_b32_e32 [[Y_NEG:v[0-9]+]], 0x80000000, [[Y]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_NEG]], [[X_ABS]], vcc
 ; GCN: v_add_f32_e32 v{{[0-9]+}}, [[Z]], [[SELECT]]
-define void @add_select_fabs_neg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_fabs_neg_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -607,7 +607,7 @@ define void @add_select_fabs_neg_f32(i32 %c) #0 {
 ; GCN-DAG: v_and_b32_e32 [[Y_ABS:v[0-9]+]], 0x7fffffff, [[Y]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[Y_ABS]], [[X]], vcc
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
-define void @add_select_neg_negfabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_neg_negfabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -629,7 +629,7 @@ define void @add_select_neg_negfabs_f32(i32 %c) #0 {
 ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], [[X_ABS]], [[Y]], vcc
 ; GCN: v_subrev_f32_e32 v{{[0-9]+}}, [[SELECT]], [[Z]]
-define void @add_select_negfabs_neg_f32(i32 %c) #0 {
+define amdgpu_kernel void @add_select_negfabs_neg_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
@@ -651,7 +651,7 @@ define void @add_select_negfabs_neg_f32(i32 %c) #0 {
 ; GCN-DAG: v_and_b32_e32 [[X_ABS:v[0-9]+]], 0x7fffffff, [[X]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -4.0, [[X_ABS]], vcc
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
-define void @mul_select_negfabs_posk_f32(i32 %c) #0 {
+define amdgpu_kernel void @mul_select_negfabs_posk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -672,7 +672,7 @@ define void @mul_select_negfabs_posk_f32(i32 %c) #0 {
 
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -4.0, [[X_ABS]], vcc
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[SELECT]], [[Y]]
-define void @mul_select_posk_negfabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @mul_select_posk_negfabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -690,7 +690,7 @@ define void @mul_select_posk_negfabs_f32(i32 %c) #0 {
 
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
-define void @mul_select_negfabs_negk_f32(i32 %c) #0 {
+define amdgpu_kernel void @mul_select_negfabs_negk_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -709,7 +709,7 @@ define void @mul_select_negfabs_negk_f32(i32 %c) #0 {
 ; GCN: v_cmp_ne_u32_e64 vcc
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 4.0, [[X]], vcc
 ; GCN: v_mul_f32_e64 v{{[0-9]+}}, -|[[SELECT]]|, [[Y]]
-define void @mul_select_negk_negfabs_f32(i32 %c) #0 {
+define amdgpu_kernel void @mul_select_negk_negfabs_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -732,7 +732,7 @@ define void @mul_select_negk_negfabs_f32(i32 %c) #0 {
 ; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], -4.0, [[X]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc
 ; GCN-NEXT: buffer_store_dword [[SELECT]]
-define void @select_fneg_posk_src_add_f32(i32 %c) #0 {
+define amdgpu_kernel void @select_fneg_posk_src_add_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -749,7 +749,7 @@ define void @select_fneg_posk_src_add_f32(i32 %c) #0 {
 ; GCN: v_sub_f32_e32 [[ADD:v[0-9]+]], 4.0, [[X]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[ADD]], vcc
 ; GCN-NEXT: buffer_store_dword [[SELECT]]
-define void @select_fneg_posk_src_sub_f32(i32 %c) #0 {
+define amdgpu_kernel void @select_fneg_posk_src_sub_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
   %add = fsub float %x, 4.0
@@ -765,7 +765,7 @@ define void @select_fneg_posk_src_sub_f32(i32 %c) #0 {
 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], -4.0, [[X]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[MUL]], vcc
 ; GCN-NEXT: buffer_store_dword [[SELECT]]
-define void @select_fneg_posk_src_mul_f32(i32 %c) #0 {
+define amdgpu_kernel void @select_fneg_posk_src_mul_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
   %mul = fmul float %x, 4.0
@@ -782,7 +782,7 @@ define void @select_fneg_posk_src_mul_f32(i32 %c) #0 {
 ; GCN: v_fma_f32 [[FMA:v[0-9]+]], [[X]], -4.0, -[[Z]]
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[FMA]], vcc
 ; GCN-NEXT: buffer_store_dword [[SELECT]]
-define void @select_fneg_posk_src_fma_f32(i32 %c) #0 {
+define amdgpu_kernel void @select_fneg_posk_src_fma_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -799,7 +799,7 @@ define void @select_fneg_posk_src_fma_f32(i32 %c) #0 {
 
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], 2.0, [[X]], vcc
 ; GCN-NEXT: buffer_store_dword [[SELECT]]
-define void @select_fneg_posk_src_fmad_f32(i32 %c) #0 {
+define amdgpu_kernel void @select_fneg_posk_src_fmad_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %z = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
@@ -818,7 +818,7 @@ define void @select_fneg_posk_src_fmad_f32(i32 %c) #0 {
 ; GCN: v_cndmask_b32_e32 [[SELECT:v[0-9]+]], -2.0, [[RCP]], vcc
 ; GCN: v_xor_b32_e32 [[NEG_SELECT:v[0-9]+]], 0x80000000, [[SELECT]]
 ; GCN-NEXT: buffer_store_dword [[NEG_SELECT]]
-define void @select_fneg_posk_src_rcp_f32(i32 %c) #0 {
+define amdgpu_kernel void @select_fneg_posk_src_rcp_f32(i32 %c) #0 {
   %x = load volatile float, float addrspace(1)* undef
   %y = load volatile float, float addrspace(1)* undef
   %cmp = icmp eq i32 %c, 0
diff --git a/test/CodeGen/AMDGPU/select-i1.ll b/test/CodeGen/AMDGPU/select-i1.ll
index 07dcb2153384a8154f05fc0da2a938081acf1cbd..5eaad1f363f91a526742e60be97f7e917920479d 100644
--- a/test/CodeGen/AMDGPU/select-i1.ll
+++ b/test/CodeGen/AMDGPU/select-i1.ll
@@ -6,7 +6,7 @@
 ; FUNC-LABEL: {{^}}select_i1:
 ; SI: v_cndmask_b32
 ; SI-NOT: v_cndmask_b32
-define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind {
+define amdgpu_kernel void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %sel = select i1 %cmp, i1 %a, i1 %b
   store i1 %sel, i1 addrspace(1)* %out, align 4
@@ -19,7 +19,7 @@ define void @select_i1(i1 addrspace(1)* %out, i32 %cond, i1 %a, i1 %b) nounwind
 ; SI-DAG: buffer_load_ubyte [[B:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:46
 ; SI: v_cmp_eq_u32_e32 vcc, 1, [[COND]]
 ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, [[B]], [[A]]
-define void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
+define amdgpu_kernel void @s_minmax_i1(i1 addrspace(1)* %out, i1 zeroext %cond, i1 zeroext %a, i1 zeroext %b) nounwind {
   %cmp = icmp slt i1 %cond, false
   %sel = select i1 %cmp, i1 %a, i1 %b
   store i1 %sel, i1 addrspace(1)* %out, align 4
diff --git a/test/CodeGen/AMDGPU/select-opt.ll b/test/CodeGen/AMDGPU/select-opt.ll
index ad358d33c4052c70dd324b23cc3c1a59ce121ae4..d56b952118b580ed316da68bd50b5f7628273616 100644
--- a/test/CodeGen/AMDGPU/select-opt.ll
+++ b/test/CodeGen/AMDGPU/select-opt.ll
@@ -11,7 +11,7 @@
 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
   %icmp0 = icmp ne i32 %a, %b
   %icmp1 = icmp ne i32 %a, %c
   %and = and i1 %icmp0, %icmp1
@@ -27,7 +27,7 @@ define void @opt_select_i32_and_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b,
 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
   %fcmp0 = fcmp one float %a, %b
   %fcmp1 = fcmp one float %a, %c
   %and = and i1 %fcmp0, %fcmp1
@@ -43,7 +43,7 @@ define void @opt_select_i32_and_cmp_f32(i32 addrspace(1)* %out, float %a, float
 ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
-define void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
+define amdgpu_kernel void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
   %icmp0 = icmp ne i32 %a, %b
   %icmp1 = icmp ne i32 %a, %c
   %and = and i1 %icmp0, %icmp1
@@ -59,7 +59,7 @@ define void @opt_select_i64_and_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b,
 ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
-define void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
+define amdgpu_kernel void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
   %fcmp0 = fcmp one float %a, %b
   %fcmp1 = fcmp one float %a, %c
   %and = and i1 %fcmp0, %fcmp1
@@ -76,7 +76,7 @@ define void @opt_select_i64_and_cmp_f32(i64 addrspace(1)* %out, float %a, float
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
 ; GCN: s_endpgm
-define void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 {
   %icmp0 = icmp ne i32 %a, %b
   %icmp1 = icmp ne i32 %a, %c
   %or = or i1 %icmp0, %icmp1
@@ -92,7 +92,7 @@ define void @opt_select_i32_or_cmp_i32(i32 addrspace(1)* %out, i32 %a, i32 %b, i
 ; GCN: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN-NOT: [[RESULT]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 {
   %fcmp0 = fcmp one float %a, %b
   %fcmp1 = fcmp one float %a, %c
   %or = or i1 %fcmp0, %fcmp1
@@ -108,7 +108,7 @@ define void @opt_select_i32_or_cmp_f32(i32 addrspace(1)* %out, float %a, float %
 ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
-define void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
+define amdgpu_kernel void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 {
   %icmp0 = icmp ne i32 %a, %b
   %icmp1 = icmp ne i32 %a, %c
   %or = or i1 %icmp0, %icmp1
@@ -124,7 +124,7 @@ define void @opt_select_i64_or_cmp_i32(i64 addrspace(1)* %out, i32 %a, i32 %b, i
 ; GCN: v_cndmask_b32_e32 v[[RESULT1:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: v_cndmask_b32_e32 v[[RESULT0:[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}, vcc
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT0]]:[[RESULT1]]{{\]}}
-define void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
+define amdgpu_kernel void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 {
   %fcmp0 = fcmp one float %a, %b
   %fcmp1 = fcmp one float %a, %c
   %or = or i1 %fcmp0, %fcmp1
@@ -138,7 +138,7 @@ define void @opt_select_i64_or_cmp_f32(i64 addrspace(1)* %out, float %a, float %
 ; GCN: v_cmp_neq_f32_e64 vcc, s{{[0-9]+}}, 0
 ; GCN: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 
-define void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
+define amdgpu_kernel void @regression(float addrspace(1)* %out, float %c0, float %c1) #0 {
 entry:
   %cmp0 = fcmp oeq float %c0, 1.0
   br i1 %cmp0, label %if0, label %endif
diff --git a/test/CodeGen/AMDGPU/select-vectors.ll b/test/CodeGen/AMDGPU/select-vectors.ll
index 759abe2f2e9aaa5c4c2db1b0f7515a01ef7e0002..8710fc8c7307bf28c32d832a308ff54702f4376d 100644
--- a/test/CodeGen/AMDGPU/select-vectors.ll
+++ b/test/CodeGen/AMDGPU/select-vectors.ll
@@ -10,7 +10,7 @@
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
-define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind {
+define amdgpu_kernel void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b, i8 %c) nounwind {
   %cmp = icmp eq i8 %c, 0
   %select = select i1 %cmp, <4 x i8> %a, <4 x i8> %b
   store <4 x i8> %select, <4 x i8> addrspace(1)* %out, align 4
@@ -22,7 +22,7 @@ define void @select_v4i8(<4 x i8> addrspace(1)* %out, <4 x i8> %a, <4 x i8> %b,
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
-define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind {
+define amdgpu_kernel void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x i16> %a, <4 x i16> %b
   store <4 x i16> %select, <4 x i16> addrspace(1)* %out, align 4
@@ -36,7 +36,7 @@ define void @select_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %a, <4 x i16>
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: buffer_store_dwordx2
-define void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind {
+define amdgpu_kernel void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x i32> %a, <2 x i32> %b
   store <2 x i32> %select, <2 x i32> addrspace(1)* %out, align 8
@@ -49,7 +49,7 @@ define void @s_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: buffer_store_dwordx4
-define void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind {
+define amdgpu_kernel void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x i32> %a, <4 x i32> %b
   store <4 x i32> %select, <4 x i32> addrspace(1)* %out, align 16
@@ -64,7 +64,7 @@ define void @s_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32
 ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
 ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
 ; SI: buffer_store_dwordx4
-define void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @v_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %cond) #0 {
 bb:
   %tmp2 = icmp ult i32 %cond, 32
   %val = load <4 x i32>, <4 x i32> addrspace(1)* %in
@@ -82,7 +82,7 @@ bb:
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
-define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind {
+define amdgpu_kernel void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x i32> %a, <8 x i32> %b
   store <8 x i32> %select, <8 x i32> addrspace(1)* %out, align 16
@@ -102,7 +102,7 @@ define void @select_v8i32(<8 x i32> addrspace(1)* %out, <8 x i32> %a, <8 x i32>
 ; SI: v_mov_b32_e32 v{{[0-9]+}}, s[[BLO]]
 ; SI: v_cndmask_b32_e32
 ; SI: buffer_store_dwordx2
-define void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind {
+define amdgpu_kernel void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x float> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x float> %a, <2 x float> %b
   store <2 x float> %select, <2 x float> addrspace(1)* %out, align 16
@@ -120,7 +120,7 @@ define void @s_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %a, <2 x
 ; SI: v_cndmask_b32_e32
 
 ; SI: buffer_store_dwordx4
-define void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind {
+define amdgpu_kernel void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x float> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x float> %a, <4 x float> %b
   store <4 x float> %select, <4 x float> addrspace(1)* %out, align 16
@@ -135,7 +135,7 @@ define void @s_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %a, <4 x
 ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
 ; SI: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
 ; SI: buffer_store_dwordx4
-define void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
+define amdgpu_kernel void @v_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in, i32 %cond) #0 {
 bb:
   %tmp2 = icmp ult i32 %cond, 32
   %val = load <4 x float>, <4 x float> addrspace(1)* %in
@@ -153,7 +153,7 @@ bb:
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
-define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind {
+define amdgpu_kernel void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x float> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x float> %a, <8 x float> %b
   store <8 x float> %select, <8 x float> addrspace(1)* %out, align 16
@@ -165,7 +165,7 @@ define void @select_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %a, <8 x f
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
-define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind {
+define amdgpu_kernel void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x double> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <2 x double> %a, <2 x double> %b
   store <2 x double> %select, <2 x double> addrspace(1)* %out, align 16
@@ -181,7 +181,7 @@ define void @select_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %a, <2 x
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
-define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind {
+define amdgpu_kernel void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x double> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <4 x double> %a, <4 x double> %b
   store <4 x double> %select, <4 x double> addrspace(1)* %out, align 16
@@ -205,7 +205,7 @@ define void @select_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %a, <4 x
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
 ; SI: v_cndmask_b32_e32
-define void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind {
+define amdgpu_kernel void @select_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %a, <8 x double> %b, i32 %c) nounwind {
   %cmp = icmp eq i32 %c, 0
   %select = select i1 %cmp, <8 x double> %a, <8 x double> %b
   store <8 x double> %select, <8 x double> addrspace(1)* %out, align 16
diff --git a/test/CodeGen/AMDGPU/select.f16.ll b/test/CodeGen/AMDGPU/select.f16.ll
index a69e21fd944f06d4bbccacbefc97a09b9c427551..2a7a9c9e0638f8c430011419099047580926dfd5 100644
--- a/test/CodeGen/AMDGPU/select.f16.ll
+++ b/test/CodeGen/AMDGPU/select.f16.ll
@@ -17,7 +17,7 @@
 ; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @select_f16(
+define amdgpu_kernel void @select_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -48,7 +48,7 @@ entry:
 ; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @select_f16_imm_a(
+define amdgpu_kernel void @select_f16_imm_a(
     half addrspace(1)* %r,
     half addrspace(1)* %b,
     half addrspace(1)* %c,
@@ -78,7 +78,7 @@ entry:
 ; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @select_f16_imm_b(
+define amdgpu_kernel void @select_f16_imm_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %c,
@@ -109,7 +109,7 @@ entry:
 ; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[C_F16]], v[[D_F16]], vcc
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @select_f16_imm_c(
+define amdgpu_kernel void @select_f16_imm_c(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -139,7 +139,7 @@ entry:
 ; VI:  v_cndmask_b32_e32 v[[R_F16:[0-9]+]], v[[D_F16]], v[[C_F16]], vcc
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @select_f16_imm_d(
+define amdgpu_kernel void @select_f16_imm_d(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -155,20 +155,24 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}select_v2f16:
-; SI:  v_cvt_f32_f16_e32
-; SI:  v_cvt_f32_f16_e32
-; SI:  v_cvt_f32_f16_e32
-; SI:  v_cvt_f32_f16_e32
-; SI:  v_cmp_lt_f32_e64
-; SI:  v_cmp_lt_f32_e32
-; VI:  v_cmp_lt_f16_e32
-; VI:  v_cmp_lt_f16_e64
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e64
-; SI:  v_cvt_f16_f32_e32
-; SI:  v_cvt_f16_f32_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cmp_lt_f32_e64
+; SI: v_cmp_lt_f32_e32
+; SI: v_cndmask_b32_e32
+; SI: v_cndmask_b32_e64
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+
+; VI: v_cmp_lt_f16_e64
+; VI: v_cmp_lt_f16_e32
+; VI: v_cndmask_b32_e64
+; VI: v_cndmask_b32_e32
+
 ; GCN: s_endpgm
-define void @select_v2f16(
+define amdgpu_kernel void @select_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -188,21 +192,21 @@ entry:
 ; GCN-LABEL: {{^}}select_v2f16_imm_a:
 ; SI:  v_cvt_f32_f16_e32
 ; SI:  v_cvt_f32_f16_e32
-; SI:  v_cmp_lt_f32_e32 vcc, 0.5
 ; SI:  v_cvt_f32_f16_e32
 ; SI:  v_cvt_f32_f16_e32
 ; SI:  v_cvt_f32_f16_e32
 ; SI:  v_cvt_f32_f16_e32
 ; SI:  v_cmp_lt_f32_e64
+; SI:  v_cmp_lt_f32_e32 vcc, 0.5
 
 ; VI:  v_cmp_lt_f16_e32
 ; VI:  v_cmp_lt_f16_e64
 ; GCN: v_cndmask_b32_e32
-; SI:  v_cvt_f16_f32_e32
 ; GCN: v_cndmask_b32_e64
 ; SI:  v_cvt_f16_f32_e32
+; SI:  v_cvt_f16_f32_e32
 ; GCN: s_endpgm
-define void @select_v2f16_imm_a(
+define amdgpu_kernel void @select_v2f16_imm_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %b,
     <2 x half> addrspace(1)* %c,
@@ -220,21 +224,22 @@ entry:
 ; GCN-LABEL: {{^}}select_v2f16_imm_b:
 ; SI:  v_cvt_f32_f16_e32
 ; SI:  v_cvt_f32_f16_e32
-; SI:  v_cmp_gt_f32_e32 vcc, 0.5
 ; SI:  v_cvt_f32_f16_e32
 ; SI:  v_cvt_f32_f16_e32
 ; SI:  v_cvt_f32_f16_e32
 ; SI:  v_cvt_f32_f16_e32
 ; SI:  v_cmp_gt_f32_e64
+; SI:  v_cmp_gt_f32_e32 vcc, 0.5
 
 ; VI:  v_cmp_gt_f16_e32
 ; VI:  v_cmp_gt_f16_e64
 ; GCN: v_cndmask_b32_e32
-; SI:  v_cvt_f16_f32_e32
 ; GCN: v_cndmask_b32_e64
+
+; SI:  v_cvt_f16_f32_e32
 ; SI:  v_cvt_f16_f32_e32
 ; GCN: s_endpgm
-define void @select_v2f16_imm_b(
+define amdgpu_kernel void @select_v2f16_imm_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %c,
@@ -258,8 +263,8 @@ entry:
 ; SI:  v_cvt_f32_f16_e32
 
 ; SI: v_cmp_nlt_f32_e32
-; SI: v_cndmask_b32_e32
-; SI: v_cmp_nlt_f32_e32
+; SI: v_cmp_nlt_f32_e64
+; SI: v_cndmask_b32_e64
 ; SI: v_cndmask_b32_e32
 
 ; VI: v_cmp_nlt_f16_e32
@@ -271,7 +276,7 @@ entry:
 ; SI:  v_cvt_f16_f32_e32
 ; SI:  v_cvt_f16_f32_e32
 ; GCN: s_endpgm
-define void @select_v2f16_imm_c(
+define amdgpu_kernel void @select_v2f16_imm_c(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -293,16 +298,17 @@ entry:
 ; SI:  v_cvt_f32_f16_e32
 ; SI:  v_cvt_f32_f16_e32
 ; SI:  v_cvt_f32_f16_e32
-; SI:  v_cmp_lt_f32_e32
 ; SI:  v_cmp_lt_f32_e64
+; SI:  v_cmp_lt_f32_e32
+
 ; VI:  v_cmp_lt_f16_e32
 ; VI:  v_cmp_lt_f16_e64
-; GCN: v_cndmask_b32_e32
-; GCN: v_cndmask_b32_e64
+; GCN: v_cndmask_b32
+; GCN: v_cndmask_b32
 ; SI:  v_cvt_f16_f32_e32
 ; SI:  v_cvt_f16_f32_e32
 ; GCN: s_endpgm
-define void @select_v2f16_imm_d(
+define amdgpu_kernel void @select_v2f16_imm_d(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
diff --git a/test/CodeGen/AMDGPU/select.ll b/test/CodeGen/AMDGPU/select.ll
index 45f3cd5a7ac5da5129043beb5b4d1d969809d6b5..e53c159a2f712efdda658e27ab6ba25dfaa120fa 100644
--- a/test/CodeGen/AMDGPU/select.ll
+++ b/test/CodeGen/AMDGPU/select.ll
@@ -14,7 +14,7 @@
 ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XY
 ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
 ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW T{{[0-9]+}}.XYZW
-define void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out,
+define amdgpu_kernel void @select (i32 addrspace(1)* %i32out, float addrspace(1)* %f32out,
                      <2 x i32> addrspace(1)* %v2i32out, <2 x float> addrspace(1)* %v2f32out,
                      <4 x i32> addrspace(1)* %v4i32out, <4 x float> addrspace(1)* %v4f32out,
                      i32 %cond) {
diff --git a/test/CodeGen/AMDGPU/select64.ll b/test/CodeGen/AMDGPU/select64.ll
index a68fdecb00af7cd74a243f211ab656770d228d5f..3b4c925a87a090e40290b8f4b05596c78dd5479b 100644
--- a/test/CodeGen/AMDGPU/select64.ll
+++ b/test/CodeGen/AMDGPU/select64.ll
@@ -7,7 +7,7 @@
 ; CHECK-NOT: s_lshr_b64
 ; CHECK: v_cndmask
 ; CHECK: v_cndmask
-define void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) {
+define amdgpu_kernel void @select0(i64 addrspace(1)* %out, i32 %cond, i64 %in) {
 entry:
   %0 = icmp ugt i32 %cond, 5
   %1 = select i1 %0, i64 0, i64 %in
@@ -18,7 +18,7 @@ entry:
 ; CHECK-LABEL: {{^}}select_trunc_i64:
 ; CHECK: v_cndmask_b32
 ; CHECK-NOT: v_cndmask_b32
-define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind {
+define amdgpu_kernel void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %sel = select i1 %cmp, i64 0, i64 %in
   %trunc = trunc i64 %sel to i32
@@ -29,7 +29,7 @@ define void @select_trunc_i64(i32 addrspace(1)* %out, i32 %cond, i64 %in) nounwi
 ; CHECK-LABEL: {{^}}select_trunc_i64_2:
 ; CHECK: v_cndmask_b32
 ; CHECK-NOT: v_cndmask_b32
-define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %b) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %sel = select i1 %cmp, i64 %a, i64 %b
   %trunc = trunc i64 %sel to i32
@@ -40,7 +40,7 @@ define void @select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 %a, i64 %
 ; CHECK-LABEL: {{^}}v_select_trunc_i64_2:
 ; CHECK: v_cndmask_b32
 ; CHECK-NOT: v_cndmask_b32
-define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %b = load i64, i64 addrspace(1)* %bptr, align 8
@@ -54,7 +54,7 @@ define void @v_select_trunc_i64_2(i32 addrspace(1)* %out, i32 %cond, i64 addrspa
 ; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 0, {{v[0-9]+}}
 ; CHECK-DAG: v_cndmask_b32_e32 {{v[0-9]+}}, 63, {{v[0-9]+}}
 ; CHECK: s_endpgm
-define void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_select_i64_split_imm(i64 addrspace(1)* %out, i32 %cond, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %cmp = icmp ugt i32 %cond, 5
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %b = load i64, i64 addrspace(1)* %bptr, align 8
diff --git a/test/CodeGen/AMDGPU/selectcc-cnd.ll b/test/CodeGen/AMDGPU/selectcc-cnd.ll
index 94d0ace75697c30b2d657259ba661354d88b7a94..18616851c9c24047ba997198d2abaff8f76afcbf 100644
--- a/test/CodeGen/AMDGPU/selectcc-cnd.ll
+++ b/test/CodeGen/AMDGPU/selectcc-cnd.ll
@@ -3,7 +3,7 @@
 ;CHECK-NOT: SETE
 ;CHECK: CNDE {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1.0, literal.x,
 ;CHECK: 1073741824
-define void @test(float addrspace(1)* %out, float addrspace(1)* %in) {
+define amdgpu_kernel void @test(float addrspace(1)* %out, float addrspace(1)* %in) {
   %1 = load float, float addrspace(1)* %in
   %2 = fcmp oeq float %1, 0.0
   %3 = select i1 %2, float 1.0, float 2.0
diff --git a/test/CodeGen/AMDGPU/selectcc-cnde-int.ll b/test/CodeGen/AMDGPU/selectcc-cnde-int.ll
index 58a4ee7d62b2e05772e27522ac2e113a830f4756..1504165d3d2bcf4b3f0ffb2134cb129ab09c7241 100644
--- a/test/CodeGen/AMDGPU/selectcc-cnde-int.ll
+++ b/test/CodeGen/AMDGPU/selectcc-cnde-int.ll
@@ -3,7 +3,7 @@
 ;CHECK-NOT: SETE_INT
 ;CHECK: CNDE_INT {{\*?}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}, 1, literal.x,
 ;CHECK-NEXT: 2
-define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %1 = load i32, i32 addrspace(1)* %in
   %2 = icmp eq i32 %1, 0
   %3 = select i1 %2, i32 1, i32 2
diff --git a/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll b/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll
index e870ee891e664171eecab670acf7091f64b70423..7af5478600bb99af6f07eb2a807f8d7970bfb831 100644
--- a/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll
+++ b/test/CodeGen/AMDGPU/selectcc-icmp-select-float.ll
@@ -6,7 +6,7 @@
 ; CHECK-NEXT: -1
 ; Test a selectcc with i32 LHS/RHS and float True/False
 
-define void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test(float addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %0 = load i32, i32 addrspace(1)* %in
   %1 = icmp sge i32 %0, 0
diff --git a/test/CodeGen/AMDGPU/selectcc-opt.ll b/test/CodeGen/AMDGPU/selectcc-opt.ll
index 0f46d4c7ea0628091150662489d1fc3d287a2aeb..8fef3f8b38084879db0e3235f65fc355dac211e1 100644
--- a/test/CodeGen/AMDGPU/selectcc-opt.ll
+++ b/test/CodeGen/AMDGPU/selectcc-opt.ll
@@ -7,7 +7,7 @@
 ; EG-NOT: CND
 ; EG: SET{{[NEQGTL]+}}_DX10
 
-define void @test_a(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @test_a(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp olt float %in, 0.000000e+00
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -35,7 +35,7 @@ ENDIF:
 ; EG: SET{{[GTEQN]+}}_DX10
 ; EG-NEXT: PRED_
 ; EG-NEXT: ALU clause starting
-define void @test_b(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @test_b(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp olt float %in, 0.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -59,7 +59,7 @@ ENDIF:
 ; Test a CND*_INT instruction with float true/false values
 ; EG-LABEL: {{^}}test_c:
 ; EG: CND{{[GTE]+}}_INT
-define void @test_c(float addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @test_c(float addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp sgt i32 %in, 0
   %1 = select i1 %0, float 2.0, float 3.0
@@ -72,7 +72,7 @@ entry:
 ; SI-NEXT: v_cndmask_b32_e64
 ; SI-NOT: cmp
 ; SI-NOT: cndmask
-define void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @selectcc_bool(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = select i1 %icmp0, i32 -1, i32 0
   store i32 %ext, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/selectcc.ll b/test/CodeGen/AMDGPU/selectcc.ll
index 446d4ab344b249da2a79a9db16ac8a2e7b6e2f62..7eca22913987e65993ddc756754200de99d972d5 100644
--- a/test/CodeGen/AMDGPU/selectcc.ll
+++ b/test/CodeGen/AMDGPU/selectcc.ll
@@ -11,7 +11,7 @@
 ; SI: v_cmp_eq_u64
 ; SI: v_cndmask
 ; SI: v_cndmask
-define void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) {
+define amdgpu_kernel void @selectcc_i64(i64 addrspace(1) * %out, i64 %lhs, i64 %rhs, i64 %true, i64 %false) {
 entry:
   %0 = icmp eq i64 %lhs, %rhs
   %1 = select i1 %0, i64 %true, i64 %false
diff --git a/test/CodeGen/AMDGPU/set-dx10.ll b/test/CodeGen/AMDGPU/set-dx10.ll
index 57365a6e1fc373b5a5223c97b95fd020d2e2fded..6867c6394937ca3ea399b44f15b1255a006cf195 100644
--- a/test/CodeGen/AMDGPU/set-dx10.ll
+++ b/test/CodeGen/AMDGPU/set-dx10.ll
@@ -8,7 +8,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_une_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp une float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -22,7 +22,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETNE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_une_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp une float %in, 5.0
   %1 = select i1 %0, i32 -1, i32 0
@@ -34,7 +34,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_oeq_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp oeq float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -48,7 +48,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_oeq_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp oeq float %in, 5.0
   %1 = select i1 %0, i32 -1, i32 0
@@ -60,7 +60,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_ogt_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ogt float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -74,7 +74,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_ogt_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ogt float %in, 5.0
   %1 = select i1 %0, i32 -1, i32 0
@@ -86,7 +86,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_oge_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp oge float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -100,7 +100,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, KC0[2].Z, literal.y,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_oge_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp oge float %in, 5.0
   %1 = select i1 %0, i32 -1, i32 0
@@ -112,7 +112,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_ole_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ole float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -126,7 +126,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGE_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_ole_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ole float %in, 5.0
   %1 = select i1 %0, i32 -1, i32 0
@@ -138,7 +138,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_olt_select_fptosi(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp olt float %in, 5.0
   %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00
@@ -152,7 +152,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_DX10 * {{\** *}}T{{[0-9]+\.[XYZW]}}, literal.y, KC0[2].Z,
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @fcmp_olt_select_i32(i32 addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp olt float %in, 5.0
   %1 = select i1 %0, i32 -1, i32 0
diff --git a/test/CodeGen/AMDGPU/setcc-equivalent.ll b/test/CodeGen/AMDGPU/setcc-equivalent.ll
index 11ea793650c4062987fae39f90d47a6b6d853825..853afa8772ea647fede23f3c40e756ea99c367e9 100644
--- a/test/CodeGen/AMDGPU/setcc-equivalent.ll
+++ b/test/CodeGen/AMDGPU/setcc-equivalent.ll
@@ -3,7 +3,7 @@
 ; EG-LABEL: {{^}}and_setcc_setcc_i32:
 ; EG: AND_INT
 ; EG-NEXT: SETE_INT
-define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %cmp1 = icmp eq i32 %a, -1
   %cmp2 = icmp eq i32 %b, -1
   %and = and i1 %cmp1, %cmp2
@@ -20,7 +20,7 @@ define void @and_setcc_setcc_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 ; EG: SETE_INT
 ; EG: AND_INT
 ; EG: SETE_INT
-define void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
+define amdgpu_kernel void @and_setcc_setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) {
   %cmp1 = icmp eq <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
   %cmp2 = icmp eq <4 x i32> %b, <i32 -1, i32 -1, i32 -1, i32 -1>
   %and = and <4 x i1> %cmp1, %cmp2
diff --git a/test/CodeGen/AMDGPU/setcc-fneg-constant.ll b/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
index eb29b89018d114d1d975a192dadec99bd41d1c69..8d455d84bf9e7b685984b1e7135c20a1e761714b 100644
--- a/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
+++ b/test/CodeGen/AMDGPU/setcc-fneg-constant.ll
@@ -10,7 +10,7 @@
 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
 ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL]]
 ; GCN: buffer_store_dword [[MUL]]
-define void @multi_use_fneg_src() #0 {
+define amdgpu_kernel void @multi_use_fneg_src() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %b = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
@@ -33,7 +33,7 @@ define void @multi_use_fneg_src() #0 {
 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
 ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[A]]
 ; GCN: v_mul_f32_e64 [[USE1:v[0-9]+]], [[MUL]], -[[MUL]]
-define void @multi_foldable_use_fneg_src() #0 {
+define amdgpu_kernel void @multi_foldable_use_fneg_src() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %b = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
@@ -55,10 +55,11 @@ define void @multi_foldable_use_fneg_src() #0 {
 ; GCN: buffer_load_dword [[B:v[0-9]+]]
 ; GCN: buffer_load_dword [[C:v[0-9]+]]
 
-; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[B]], [[A]]
-; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL]]
+; GCN: v_mul_f32_e64 [[MUL:v[0-9]+]], [[A]], -[[B]]
+; GCN-NEXT: v_cmp_eq_f32_e32 vcc, 4.0, [[MUL]]
+; GCN-NOT: xor
 ; GCN: buffer_store_dword [[MUL]]
-define void @multi_use_fneg() #0 {
+define amdgpu_kernel void @multi_use_fneg() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %b = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
@@ -81,7 +82,7 @@ define void @multi_use_fneg() #0 {
 ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, [[MUL0]]
 ; GCN: v_mul_f32_e64 [[MUL1:v[0-9]+]], -[[MUL0]], [[MUL0]]
 ; GCN: buffer_store_dword [[MUL1]]
-define void @multi_foldable_use_fneg() #0 {
+define amdgpu_kernel void @multi_foldable_use_fneg() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %b = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
@@ -100,7 +101,7 @@ define void @multi_foldable_use_fneg() #0 {
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_oeq_posk_f32:
 ; GCN: v_cmp_eq_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_oeq_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_oeq_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -113,7 +114,7 @@ define void @test_setcc_fneg_oeq_posk_f32() #0 {
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_ogt_posk_f32:
 ; GCN: v_cmp_gt_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_ogt_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_ogt_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -126,7 +127,7 @@ define void @test_setcc_fneg_ogt_posk_f32() #0 {
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_oge_posk_f32:
 ; GCN: v_cmp_ge_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_oge_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_oge_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -139,7 +140,7 @@ define void @test_setcc_fneg_oge_posk_f32() #0 {
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_olt_posk_f32:
 ; GCN: v_cmp_lt_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_olt_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_olt_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -152,7 +153,7 @@ define void @test_setcc_fneg_olt_posk_f32() #0 {
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_ole_posk_f32:
 ; GCN: v_cmp_le_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_ole_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_ole_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -165,7 +166,7 @@ define void @test_setcc_fneg_ole_posk_f32() #0 {
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_one_posk_f32:
 ; GCN: v_cmp_lg_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_one_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_one_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -178,7 +179,7 @@ define void @test_setcc_fneg_one_posk_f32() #0 {
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_ueq_posk_f32:
 ; GCN: v_cmp_nlg_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_ueq_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_ueq_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -191,7 +192,7 @@ define void @test_setcc_fneg_ueq_posk_f32() #0 {
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_ugt_posk_f32:
 ; GCN: v_cmp_nle_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_ugt_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_ugt_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -204,7 +205,7 @@ define void @test_setcc_fneg_ugt_posk_f32() #0 {
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_uge_posk_f32:
 ; GCN: v_cmp_nlt_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_uge_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_uge_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -217,7 +218,7 @@ define void @test_setcc_fneg_uge_posk_f32() #0 {
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_ult_posk_f32:
 ; GCN: v_cmp_nge_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_ult_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_ult_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -230,7 +231,7 @@ define void @test_setcc_fneg_ult_posk_f32() #0 {
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_ule_posk_f32:
 ; GCN: v_cmp_ngt_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_ule_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_ule_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
@@ -243,7 +244,7 @@ define void @test_setcc_fneg_ule_posk_f32() #0 {
 
 ; GCN-LABEL: {{^}}test_setcc_fneg_une_posk_f32:
 ; GCN: v_cmp_neq_f32_e32 vcc, -4.0, v{{[0-9]+}}
-define void @test_setcc_fneg_une_posk_f32() #0 {
+define amdgpu_kernel void @test_setcc_fneg_une_posk_f32() #0 {
   %a = load volatile float, float addrspace(1)* undef
   %x = load volatile i32, i32 addrspace(1)* undef
   %y = load volatile i32, i32 addrspace(1)* undef
diff --git a/test/CodeGen/AMDGPU/setcc-opt.ll b/test/CodeGen/AMDGPU/setcc-opt.ll
index 4ab6da085634e304637d0f5d0156a761f6984c86..caddb6f682187959ad496377a144615a875496a1 100644
--- a/test/CodeGen/AMDGPU/setcc-opt.ll
+++ b/test/CodeGen/AMDGPU/setcc-opt.ll
@@ -11,7 +11,7 @@
 
 ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W
 ; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1
-define void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, 0
@@ -28,7 +28,7 @@ define void @sext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 
 ; EG: SETNE_INT * [[CMP:T[0-9]+]].[[CMPCHAN:[XYZW]]], KC0[2].Z, KC0[2].W
 ; EG: AND_INT T{{[0-9]+.[XYZW]}}, PS, 1
-define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, 0
@@ -42,7 +42,7 @@ define void @sext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, -1
@@ -56,7 +56,7 @@ define void @sext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounw
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, -1
@@ -70,7 +70,7 @@ define void @sext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounw
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, 0
@@ -84,7 +84,7 @@ define void @zext_bool_icmp_eq_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, 0
@@ -98,7 +98,7 @@ define void @zext_bool_icmp_ne_0(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, 1
@@ -111,7 +111,7 @@ define void @zext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 ; GCN: v_cmp_eq_u32_e32 vcc,
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
-define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, 1
@@ -124,7 +124,7 @@ define void @zext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 ; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_byte [[TMP]]
 ; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, -1
@@ -137,7 +137,7 @@ define void @zext_bool_icmp_eq_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounw
 ; GCN: v_mov_b32_e32 [[TMP:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_byte [[TMP]]
 ; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, -1
@@ -159,7 +159,7 @@ define void @zext_bool_icmp_ne_neg1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounw
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN: s_endpgm
-define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
+define amdgpu_kernel void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
   %b.ext = zext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, 255
   store i1 %icmp0, i1 addrspace(1)* %out
@@ -172,7 +172,7 @@ define void @cmp_zext_k_i8max(i1 addrspace(1)* %out, i8 %b) nounwind {
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN: s_endpgm
-define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind {
+define amdgpu_kernel void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nounwind {
   %b = load i8, i8 addrspace(1)* %b.ptr
   %b.ext = sext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, -1
@@ -186,7 +186,7 @@ define void @cmp_sext_k_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %b.ptr) nou
 ; GCN-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP]]
 ; GCN-NEXT: buffer_store_byte [[RESULT]]
 ; GCN: s_endpgm
-define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind {
+define amdgpu_kernel void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) nounwind {
   %b.ext = sext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, -1
   store i1 %icmp0, i1 addrspace(1)* %out
@@ -207,7 +207,7 @@ define void @cmp_sext_k_neg1_i8_sext_arg(i1 addrspace(1)* %out, i8 signext %b) n
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN: s_endpgm
-define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind {
+define amdgpu_kernel void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind {
   %b.ext = sext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, -1
   store i1 %icmp0, i1 addrspace(1)* %out
@@ -218,7 +218,7 @@ define void @cmp_sext_k_neg1_i8_arg(i1 addrspace(1)* %out, i8 %b) nounwind {
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN: s_endpgm
-define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind {
+define amdgpu_kernel void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind {
   %b.ext = zext i8 %b to i32
   %icmp0 = icmp ne i32 %b.ext, -1
   store i1 %icmp0, i1 addrspace(1)* %out
@@ -229,7 +229,7 @@ define void @cmp_zext_k_neg1(i1 addrspace(1)* %out, i8 %b) nounwind {
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, 2
@@ -241,7 +241,7 @@ define void @zext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 ; GCN: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_byte [[RESULT]]
 ; GCN-NEXT: s_endpgm
-define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = zext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, 2
@@ -256,7 +256,7 @@ define void @zext_bool_icmp_eq_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 ; FUNC-LABEL: {{^}}sext_bool_icmp_eq_1:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_byte [[K]]
-define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp eq i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp eq i32 %ext, 1
@@ -267,7 +267,7 @@ define void @sext_bool_icmp_eq_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_1:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_byte [[K]]
-define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, 1
@@ -278,7 +278,7 @@ define void @sext_bool_icmp_ne_1(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind
 ; FUNC-LABEL: {{^}}sext_bool_icmp_ne_k:
 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 1{{$}}
 ; GCN: buffer_store_byte [[K]]
-define void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @sext_bool_icmp_ne_k(i1 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %icmp0 = icmp ne i32 %a, %b
   %ext = sext i1 %icmp0 to i32
   %icmp1 = icmp ne i32 %ext, 2
diff --git a/test/CodeGen/AMDGPU/setcc.ll b/test/CodeGen/AMDGPU/setcc.ll
index 10d04bab9f6b991199c0d68ac4c062948e0a598d..add90e9c2f3a98994abd568e95708e8321153367 100644
--- a/test/CodeGen/AMDGPU/setcc.ll
+++ b/test/CodeGen/AMDGPU/setcc.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
 ; GCN-DAG: v_cmp_eq_u32_e32
 ; GCN-DAG: v_cmp_eq_u32_e64
-define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
+define amdgpu_kernel void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
   %result = icmp eq <2 x i32> %a, %b
   %sext = sext <2 x i1> %result to <2 x i32>
   store <2 x i32> %sext, <2 x i32> addrspace(1)* %out
@@ -26,7 +26,7 @@ define void @setcc_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %
 ; GCN: v_cmp_eq_u32_e64
 ; GCN: v_cmp_eq_u32_e64
 ; GCN: v_cmp_eq_u32_e64
-define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -43,7 +43,7 @@ define void @setcc_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %
 ; FUNC-LABEL: {{^}}f32_oeq:
 ; R600: SETE_DX10
 ; GCN: v_cmp_eq_f32
-define void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_oeq(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp oeq float %a, %b
   %1 = sext i1 %0 to i32
@@ -54,7 +54,7 @@ entry:
 ; FUNC-LABEL: {{^}}f32_ogt:
 ; R600: SETGT_DX10
 ; GCN: v_cmp_gt_f32
-define void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ogt(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ogt float %a, %b
   %1 = sext i1 %0 to i32
@@ -65,7 +65,7 @@ entry:
 ; FUNC-LABEL: {{^}}f32_oge:
 ; R600: SETGE_DX10
 ; GCN: v_cmp_ge_f32
-define void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_oge(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp oge float %a, %b
   %1 = sext i1 %0 to i32
@@ -76,7 +76,7 @@ entry:
 ; FUNC-LABEL: {{^}}f32_olt:
 ; R600: SETGT_DX10
 ; GCN: v_cmp_lt_f32
-define void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_olt(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp olt float %a, %b
   %1 = sext i1 %0 to i32
@@ -87,7 +87,7 @@ entry:
 ; FUNC-LABEL: {{^}}f32_ole:
 ; R600: SETGE_DX10
 ; GCN: v_cmp_le_f32
-define void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ole(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ole float %a, %b
   %1 = sext i1 %0 to i32
@@ -105,7 +105,7 @@ entry:
 
 ; GCN: v_cmp_lg_f32_e32 vcc
 ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f32_one(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_one(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp one float %a, %b
   %1 = sext i1 %0 to i32
@@ -119,7 +119,7 @@ entry:
 ; R600-DAG: AND_INT
 ; R600-DAG: SETNE_INT
 ; GCN: v_cmp_o_f32
-define void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ord(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ord float %a, %b
   %1 = sext i1 %0 to i32
@@ -137,7 +137,7 @@ entry:
 
 ; GCN: v_cmp_nlg_f32_e32 vcc
 ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ueq(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ueq float %a, %b
   %1 = sext i1 %0 to i32
@@ -150,7 +150,7 @@ entry:
 ; R600: SETE_DX10
 ; GCN: v_cmp_nle_f32_e32 vcc
 ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ugt(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ugt float %a, %b
   %1 = sext i1 %0 to i32
@@ -164,7 +164,7 @@ entry:
 
 ; GCN: v_cmp_nlt_f32_e32 vcc
 ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_uge(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp uge float %a, %b
   %1 = sext i1 %0 to i32
@@ -178,7 +178,7 @@ entry:
 
 ; GCN: v_cmp_nge_f32_e32 vcc
 ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ult(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ult float %a, %b
   %1 = sext i1 %0 to i32
@@ -192,7 +192,7 @@ entry:
 
 ; GCN: v_cmp_ngt_f32_e32 vcc
 ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_ule(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp ule float %a, %b
   %1 = sext i1 %0 to i32
@@ -203,7 +203,7 @@ entry:
 ; FUNC-LABEL: {{^}}f32_une:
 ; R600: SETNE_DX10
 ; GCN: v_cmp_neq_f32
-define void @f32_une(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_une(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp une float %a, %b
   %1 = sext i1 %0 to i32
@@ -217,7 +217,7 @@ entry:
 ; R600: OR_INT
 ; R600: SETNE_INT
 ; GCN: v_cmp_u_f32
-define void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @f32_uno(i32 addrspace(1)* %out, float %a, float %b) #0 {
 entry:
   %0 = fcmp uno float %a, %b
   %1 = sext i1 %0 to i32
@@ -232,7 +232,7 @@ entry:
 ; FUNC-LABEL: {{^}}i32_eq:
 ; R600: SETE_INT
 ; GCN: v_cmp_eq_u32
-define void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_eq(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp eq i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -243,7 +243,7 @@ entry:
 ; FUNC-LABEL: {{^}}i32_ne:
 ; R600: SETNE_INT
 ; GCN: v_cmp_ne_u32
-define void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_ne(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp ne i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -254,7 +254,7 @@ entry:
 ; FUNC-LABEL: {{^}}i32_ugt:
 ; R600: SETGT_UINT
 ; GCN: v_cmp_gt_u32
-define void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_ugt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp ugt i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -265,7 +265,7 @@ entry:
 ; FUNC-LABEL: {{^}}i32_uge:
 ; R600: SETGE_UINT
 ; GCN: v_cmp_ge_u32
-define void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_uge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp uge i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -276,7 +276,7 @@ entry:
 ; FUNC-LABEL: {{^}}i32_ult:
 ; R600: SETGT_UINT
 ; GCN: v_cmp_lt_u32
-define void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_ult(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp ult i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -287,7 +287,7 @@ entry:
 ; FUNC-LABEL: {{^}}i32_ule:
 ; R600: SETGE_UINT
 ; GCN: v_cmp_le_u32
-define void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_ule(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp ule i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -298,7 +298,7 @@ entry:
 ; FUNC-LABEL: {{^}}i32_sgt:
 ; R600: SETGT_INT
 ; GCN: v_cmp_gt_i32
-define void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_sgt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp sgt i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -309,7 +309,7 @@ entry:
 ; FUNC-LABEL: {{^}}i32_sge:
 ; R600: SETGE_INT
 ; GCN: v_cmp_ge_i32
-define void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_sge(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp sge i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -320,7 +320,7 @@ entry:
 ; FUNC-LABEL: {{^}}i32_slt:
 ; R600: SETGT_INT
 ; GCN: v_cmp_lt_i32
-define void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_slt(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp slt i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -331,7 +331,7 @@ entry:
 ; FUNC-LABEL: {{^}}i32_sle:
 ; R600: SETGE_INT
 ; GCN: v_cmp_le_i32
-define void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @i32_sle(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %0 = icmp sle i32 %a, %b
   %1 = sext i1 %0 to i32
@@ -348,7 +348,7 @@ entry:
 ; GCN-DAG: v_cmp_eq_u32
 ; GCN-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
 ; GCN: s_endpgm
-define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) #0 {
+define amdgpu_kernel void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptra, <3 x i32> addrspace(1)* %ptrb) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
   %gep.a = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptra, i32 %tid
   %gep.b = getelementptr <3 x i32>, <3 x i32> addrspace(1)* %ptrb, i32 %tid
@@ -369,7 +369,7 @@ define void @v3i32_eq(<3 x i32> addrspace(1)* %out, <3 x i32> addrspace(1)* %ptr
 ; GCN-DAG: v_cmp_eq_u32
 ; GCN-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1,
 ; GCN: s_endpgm
-define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) #0 {
+define amdgpu_kernel void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra, <3 x i8> addrspace(1)* %ptrb) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
   %gep.a = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptra, i32 %tid
   %gep.b = getelementptr <3 x i8>, <3 x i8> addrspace(1)* %ptrb, i32 %tid
@@ -386,7 +386,7 @@ define void @v3i8_eq(<3 x i8> addrspace(1)* %out, <3 x i8> addrspace(1)* %ptra,
 ; FUNC-LABEL: setcc-i1
 ; GCN: s_and_b32 [[AND:s[0-9]+]], s{{[0-9]+}}, 1
 ; GCN: s_cmp_eq_u32 [[AND]], 0
-define void @setcc-i1(i32 %in) #0 {
+define amdgpu_kernel void @setcc-i1(i32 %in) #0 {
   %and = and i32 %in, 1
   %cmp = icmp eq i32 %and, 0
   br i1 %cmp, label %endif, label %if
@@ -400,7 +400,7 @@ endif:
 ; GCN-DAG: v_cmp_ge_f32_e64 [[A:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0{{$}}
 ; GCN-DAG: v_cmp_le_f32_e64 [[B:s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 1.0
 ; GCN: s_and_b64 s[2:3], [[A]], [[B]]
-define void @setcc-i1-and-xor(i32 addrspace(1)* %out, float %cond) #0 {
+define amdgpu_kernel void @setcc-i1-and-xor(i32 addrspace(1)* %out, float %cond) #0 {
 bb0:
   %tmp5 = fcmp oge float %cond, 0.000000e+00
   %tmp7 = fcmp ole float %cond, 1.000000e+00
diff --git a/test/CodeGen/AMDGPU/setcc64.ll b/test/CodeGen/AMDGPU/setcc64.ll
index 1f86277e0bc68aebce5ef7f2d28dd8a3043994c9..1f1bdb055302c365f904ffbac6dd4022da7fde39 100644
--- a/test/CodeGen/AMDGPU/setcc64.ll
+++ b/test/CodeGen/AMDGPU/setcc64.ll
@@ -9,7 +9,7 @@
 
 ; GCN-LABEL: {{^}}f64_oeq:
 ; GCN: v_cmp_eq_f64
-define void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_oeq(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp oeq double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -19,7 +19,7 @@ entry:
 
 ; GCN-LABEL: {{^}}f64_ogt:
 ; GCN: v_cmp_gt_f64
-define void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ogt(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ogt double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -29,7 +29,7 @@ entry:
 
 ; GCN-LABEL: {{^}}f64_oge:
 ; GCN: v_cmp_ge_f64
-define void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_oge(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp oge double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -39,7 +39,7 @@ entry:
 
 ; GCN-LABEL: {{^}}f64_olt:
 ; GCN: v_cmp_lt_f64
-define void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_olt(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp olt double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -49,7 +49,7 @@ entry:
 
 ; GCN-LABEL: {{^}}f64_ole:
 ; GCN: v_cmp_le_f64
-define void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ole(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ole double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -60,7 +60,7 @@ entry:
 ; GCN-LABEL: {{^}}f64_one:
 ; GCN: v_cmp_lg_f64_e32 vcc
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f64_one(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_one(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp one double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -70,7 +70,7 @@ entry:
 
 ; GCN-LABEL: {{^}}f64_ord:
 ; GCN: v_cmp_o_f64
-define void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ord(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ord double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -81,7 +81,7 @@ entry:
 ; GCN-LABEL: {{^}}f64_ueq:
 ; GCN: v_cmp_nlg_f64_e32 vcc
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ueq(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ueq double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -93,7 +93,7 @@ entry:
 
 ; GCN: v_cmp_nle_f64_e32 vcc
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ugt(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ugt double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -104,7 +104,7 @@ entry:
 ; GCN-LABEL: {{^}}f64_uge:
 ; GCN: v_cmp_nlt_f64_e32 vcc
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_uge(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp uge double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -115,7 +115,7 @@ entry:
 ; GCN-LABEL: {{^}}f64_ult:
 ; GCN: v_cmp_nge_f64_e32 vcc
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ult(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ult double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -126,7 +126,7 @@ entry:
 ; GCN-LABEL: {{^}}f64_ule:
 ; GCN: v_cmp_ngt_f64_e32 vcc
 ; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc
-define void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_ule(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp ule double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -136,7 +136,7 @@ entry:
 
 ; GCN-LABEL: {{^}}f64_une:
 ; GCN: v_cmp_neq_f64
-define void @f64_une(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_une(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp une double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -146,7 +146,7 @@ entry:
 
 ; GCN-LABEL: {{^}}f64_uno:
 ; GCN: v_cmp_u_f64
-define void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @f64_uno(i32 addrspace(1)* %out, double %a, double %b) #0 {
 entry:
   %tmp0 = fcmp uno double %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -160,7 +160,7 @@ entry:
 
 ; GCN-LABEL: {{^}}i64_eq:
 ; GCN: v_cmp_eq_u64
-define void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_eq(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp eq i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -170,7 +170,7 @@ entry:
 
 ; GCN-LABEL: {{^}}i64_ne:
 ; GCN: v_cmp_ne_u64
-define void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_ne(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp ne i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -180,7 +180,7 @@ entry:
 
 ; GCN-LABEL: {{^}}i64_ugt:
 ; GCN: v_cmp_gt_u64
-define void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_ugt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp ugt i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -190,7 +190,7 @@ entry:
 
 ; GCN-LABEL: {{^}}i64_uge:
 ; GCN: v_cmp_ge_u64
-define void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_uge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp uge i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -200,7 +200,7 @@ entry:
 
 ; GCN-LABEL: {{^}}i64_ult:
 ; GCN: v_cmp_lt_u64
-define void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_ult(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp ult i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -210,7 +210,7 @@ entry:
 
 ; GCN-LABEL: {{^}}i64_ule:
 ; GCN: v_cmp_le_u64
-define void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_ule(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp ule i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -220,7 +220,7 @@ entry:
 
 ; GCN-LABEL: {{^}}i64_sgt:
 ; GCN: v_cmp_gt_i64
-define void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_sgt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp sgt i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -230,7 +230,7 @@ entry:
 
 ; GCN-LABEL: {{^}}i64_sge:
 ; GCN: v_cmp_ge_i64
-define void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_sge(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp sge i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -240,7 +240,7 @@ entry:
 
 ; GCN-LABEL: {{^}}i64_slt:
 ; GCN: v_cmp_lt_i64
-define void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_slt(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp slt i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
@@ -250,7 +250,7 @@ entry:
 
 ; GCN-LABEL: {{^}}i64_sle:
 ; GCN: v_cmp_le_i64
-define void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @i64_sle(i32 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 entry:
   %tmp0 = icmp sle i64 %a, %b
   %tmp1 = sext i1 %tmp0 to i32
diff --git a/test/CodeGen/AMDGPU/seto.ll b/test/CodeGen/AMDGPU/seto.ll
index 01e4a7fda5d27e0019f96f05e0c6a84ed330d75a..b4385aa0cccaaa94742552c707c5c84ac2e14a70 100644
--- a/test/CodeGen/AMDGPU/seto.ll
+++ b/test/CodeGen/AMDGPU/seto.ll
@@ -4,12 +4,9 @@
 ; CHECK-LABEL: {{^}}main:
 ; CHECK: v_cmp_o_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]]
 ; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]]
-define void @main(float %p) {
+define amdgpu_ps float @main(float inreg %p) {
 main_body:
   %c = fcmp oeq float %p, %p
   %r = select i1 %c, float 1.000000e+00, float 0.000000e+00
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r)
-  ret void
+  ret float %r
 }
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/setuo.ll b/test/CodeGen/AMDGPU/setuo.ll
index 76346c4f624abf90ceaddea9ada32b77242faab1..f6821b675e225ee8a44979ed00edab039682d88d 100644
--- a/test/CodeGen/AMDGPU/setuo.ll
+++ b/test/CodeGen/AMDGPU/setuo.ll
@@ -4,12 +4,9 @@
 ; CHECK-LABEL: {{^}}main:
 ; CHECK: v_cmp_u_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], [[SREG:s[0-9]+]], [[SREG]]
 ; CHECK-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, 1.0, [[CMP]]
-define void @main(float %p) {
+define amdgpu_ps float @main(float inreg %p) {
 main_body:
   %c = fcmp une float %p, %p
   %r = select i1 %c, float 1.000000e+00, float 0.000000e+00
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %r, float %r, float %r, float %r)
-  ret void
+  ret float %r
 }
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/sext-eliminate.ll b/test/CodeGen/AMDGPU/sext-eliminate.ll
index 7dc6eb87f6b5a19dd863af43dab005a17cf242ed..0b780af17bca99a0125e2fe9275ab8a95413ee78 100644
--- a/test/CodeGen/AMDGPU/sext-eliminate.ll
+++ b/test/CodeGen/AMDGPU/sext-eliminate.ll
@@ -6,7 +6,7 @@
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: SUB_INT {{[* ]*}}[[RES]]
 ; EG-NOT: BFE
-define void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) {
+define amdgpu_kernel void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) {
   %sext = sext i1 %a to i32
   %res = add i32 %b, %sext
   store i32 %res, i32 addrspace(1)* %out
@@ -18,7 +18,7 @@ define void @sext_in_reg_i1_i32_add(i32 addrspace(1)* %out, i1 %a, i32 %b) {
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: ADD_INT {{[* ]*}}[[RES]]
 ; EG-NOT: BFE
-define void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) {
+define amdgpu_kernel void @sext_in_reg_i1_i32_sub(i32 addrspace(1)* %out, i1 %a, i32 %b) {
   %sext = sext i1 %a to i32
   %res = sub i32 %b, %sext
   store i32 %res, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll b/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
index adba6bbb51d4eaff3e8f5f5ea3b09a98856fb5d0..7ac4e1d9fe4b086b8a59f51a95eb9b7d668a9b63 100644
--- a/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
+++ b/test/CodeGen/AMDGPU/sext-in-reg-failure-r600.ll
@@ -11,7 +11,7 @@
 ; EG: LSHR {{\*?}} [[ADDR]]
 
 ; Works with the align 2 removed
-define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
+define amdgpu_kernel void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) nounwind {
   %c = add <2 x i32> %a, %b
   %x = shl <2 x i32> %c, <i32 6, i32 6>
   %y = ashr <2 x i32> %x, <i32 7, i32 7>
diff --git a/test/CodeGen/AMDGPU/sext-in-reg.ll b/test/CodeGen/AMDGPU/sext-in-reg.ll
index 4c58261709c45015982787fe611d49db92a970dd..b702e1c07200d6ec3f3e970b4aa04bb39a9472c8 100644
--- a/test/CodeGen/AMDGPU/sext-in-reg.ll
+++ b/test/CodeGen/AMDGPU/sext-in-reg.ll
@@ -1,8 +1,10 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
-; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 -check-prefix=GFX89 -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FIXME: i16 promotion pass ruins the scalar cases when legal.
+; FIXME: r600 fails verifier
 
 ; FUNC-LABEL: {{^}}sext_in_reg_i1_i32:
 ; GCN: s_load_dword [[ARG:s[0-9]+]],
@@ -13,7 +15,7 @@
 ; EG: MEM_{{.*}} STORE_{{.*}} [[RES:T[0-9]+\.[XYZW]]], [[ADDR:T[0-9]+.[XYZW]]]
 ; EG: LSHR * [[ADDR]]
 ; EG: BFE_INT * [[RES]], {{.*}}, 0.0, 1
-define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) #0 {
   %shl = shl i32 %in, 31
   %sext = ashr i32 %shl, 31
   store i32 %sext, i32 addrspace(1)* %out
@@ -30,7 +32,7 @@ define void @sext_in_reg_i1_i32(i32 addrspace(1)* %out, i32 %in) #0 {
 ; EG: ADD_INT
 ; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
 ; EG-NEXT: LSHR * [[ADDR]]
-define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %c = add i32 %a, %b ; add to prevent folding into extload
   %shl = shl i32 %c, 24
   %ashr = ashr i32 %shl, 24
@@ -48,7 +50,7 @@ define void @sext_in_reg_i8_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 ; EG: ADD_INT
 ; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
 ; EG-NEXT: LSHR * [[ADDR]]
-define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %c = add i32 %a, %b ; add to prevent folding into extload
   %shl = shl i32 %c, 16
   %ashr = ashr i32 %shl, 16
@@ -66,7 +68,7 @@ define void @sext_in_reg_i16_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 ; EG: ADD_INT
 ; EG-NEXT: BFE_INT [[RES]], {{.*}}, 0.0, literal
 ; EG-NEXT: LSHR * [[ADDR]]
-define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a, <1 x i32> %b) #0 {
   %c = add <1 x i32> %a, %b ; add to prevent folding into extload
   %shl = shl <1 x i32> %c, <i32 24>
   %ashr = ashr <1 x i32> %shl, <i32 24>
@@ -80,7 +82,7 @@ define void @sext_in_reg_i8_to_v1i32(<1 x i32> addrspace(1)* %out, <1 x i32> %a,
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %c = shl i64 %a, %b
   %shl = shl i64 %c, 63
   %ashr = ashr i64 %shl, 63
@@ -94,7 +96,7 @@ define void @sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %c = shl i64 %a, %b
   %shl = shl i64 %c, 56
   %ashr = ashr i64 %shl, 56
@@ -109,7 +111,7 @@ define void @sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
 
-define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %c = shl i64 %a, %b
   %shl = shl i64 %c, 48
   %ashr = ashr i64 %shl, 48
@@ -123,7 +125,7 @@ define void @sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[SLO]]
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[SHI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %c = shl i64 %a, %b
   %shl = shl i64 %c, 32
   %ashr = ashr i64 %shl, 32
@@ -138,7 +140,7 @@ define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 ; XGCN: buffer_store_dword
 ; XEG: BFE_INT
 ; XEG: ASHR
-; define void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) #0 {
+; define amdgpu_kernel void @sext_in_reg_i8_to_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a, <1 x i64> %b) #0 {
 ;   %c = add <1 x i64> %a, %b
 ;   %shl = shl <1 x i64> %c, <i64 56>
 ;   %ashr = ashr <1 x i64> %shl, <i64 56>
@@ -150,15 +152,15 @@ define void @sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 ; SI: buffer_load_dwordx2
 ; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
 
-; VI: flat_load_dwordx2
-; VI: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; GFX89: flat_load_dwordx2
+; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
 
 ; GCN: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1
 ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
+; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define amdgpu_kernel void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
   %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
@@ -177,15 +179,15 @@ define void @v_sext_in_reg_i1_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
 ; SI: buffer_load_dwordx2
 ; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
 
-; VI: flat_load_dwordx2
-; VI: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; GFX89: flat_load_dwordx2
+; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
 
 ; GCN: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 8
 ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
+; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define amdgpu_kernel void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
   %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
@@ -204,15 +206,15 @@ define void @v_sext_in_reg_i8_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
 ; SI: buffer_load_dwordx2
 ; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
 
-; VI: flat_load_dwordx2
-; VI: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; GFX89: flat_load_dwordx2
+; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
 
 ; GCN: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 16
 ; GCN: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 
 ; SI: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
+; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[HI]]{{\]}}
+define amdgpu_kernel void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
   %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
@@ -231,12 +233,12 @@ define void @v_sext_in_reg_i16_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 ; SI: buffer_load_dwordx2
 ; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
 
-; VI: flat_load_dwordx2
-; VI: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
+; GFX89: flat_load_dwordx2
+; GFX89: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
 
 ; GCN: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]]
-; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[SHR]]{{\]}}
-define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
+; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[LO]]:[[SHR]]{{\]}}
+define amdgpu_kernel void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
   %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
@@ -262,7 +264,7 @@ define void @v_sext_in_reg_i32_to_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 ; EG: LSHL
 ; EG: ASHR [[RES]]
 ; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %c = add i32 %a, %b
   %x = shl i32 %c, 6
   %y = ashr i32 %x, 7
@@ -285,7 +287,7 @@ define void @sext_in_reg_i1_in_i32_other_amount(i32 addrspace(1)* %out, i32 %a,
 ; EG: LSHL
 ; EG: ASHR [[RES]]
 ; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
   %c = add <2 x i32> %a, %b
   %x = shl <2 x i32> %c, <i32 6, i32 6>
   %y = ashr <2 x i32> %x, <i32 7, i32 7>
@@ -303,7 +305,7 @@ define void @sext_in_reg_v2i1_in_v2i32_other_amount(<2 x i32> addrspace(1)* %out
 ; EG: BFE_INT [[RES]]
 ; EG: BFE_INT [[RES]]
 ; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
   %c = add <2 x i32> %a, %b ; add to prevent folding into extload
   %shl = shl <2 x i32> %c, <i32 31, i32 31>
   %ashr = ashr <2 x i32> %shl, <i32 31, i32 31>
@@ -324,7 +326,7 @@ define void @sext_in_reg_v2i1_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
 ; EG: BFE_INT [[RES]]
 ; EG: BFE_INT [[RES]]
 ; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
   %c = add <4 x i32> %a, %b ; add to prevent folding into extload
   %shl = shl <4 x i32> %c, <i32 31, i32 31, i32 31, i32 31>
   %ashr = ashr <4 x i32> %shl, <i32 31, i32 31, i32 31, i32 31>
@@ -341,7 +343,7 @@ define void @sext_in_reg_v4i1_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %
 ; EG: BFE_INT [[RES]]
 ; EG: BFE_INT [[RES]]
 ; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
   %c = add <2 x i32> %a, %b ; add to prevent folding into extload
   %shl = shl <2 x i32> %c, <i32 24, i32 24>
   %ashr = ashr <2 x i32> %shl, <i32 24, i32 24>
@@ -362,7 +364,7 @@ define void @sext_in_reg_v2i8_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %
 ; EG: BFE_INT [[RES]]
 ; EG: BFE_INT [[RES]]
 ; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %a, <4 x i32> %b) #0 {
   %c = add <4 x i32> %a, %b ; add to prevent folding into extload
   %shl = shl <4 x i32> %c, <i32 24, i32 24, i32 24, i32 24>
   %ashr = ashr <4 x i32> %shl, <i32 24, i32 24, i32 24, i32 24>
@@ -379,7 +381,7 @@ define void @sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %
 ; EG: BFE_INT [[RES]]
 ; EG: BFE_INT [[RES]]
 ; EG: LSHR {{\*?}} [[ADDR]]
-define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
+define amdgpu_kernel void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %a, <2 x i32> %b) #0 {
   %c = add <2 x i32> %a, %b ; add to prevent folding into extload
   %shl = shl <2 x i32> %c, <i32 16, i32 16>
   %ashr = ashr <2 x i32> %shl, <i32 16, i32 16>
@@ -388,7 +390,7 @@ define void @sext_in_reg_v2i16_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32>
 }
 
 ; FUNC-LABEL: {{^}}testcase:
-define void @testcase(i8 addrspace(1)* %out, i8 %a) #0 {
+define amdgpu_kernel void @testcase(i8 addrspace(1)* %out, i8 %a) #0 {
   %and_a_1 = and i8 %a, 1
   %cmp_eq = icmp eq i8 %and_a_1, 0
   %cmp_slt = icmp slt i8 %a, 0
@@ -400,7 +402,7 @@ define void @testcase(i8 addrspace(1)* %out, i8 %a) #0 {
 }
 
 ; FUNC-LABEL: {{^}}testcase_3:
-define void @testcase_3(i8 addrspace(1)* %out, i8 %a) #0 {
+define amdgpu_kernel void @testcase_3(i8 addrspace(1)* %out, i8 %a) #0 {
   %and_a_1 = and i8 %a, 1
   %cmp_eq = icmp eq i8 %and_a_1, 0
   %cmp_slt = icmp slt i8 %a, 0
@@ -416,7 +418,7 @@ define void @testcase_3(i8 addrspace(1)* %out, i8 %a) #0 {
 ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
 ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
 ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 8
-define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 {
+define amdgpu_kernel void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 {
   %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
   %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
   %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
@@ -429,7 +431,7 @@ define void @vgpr_sext_in_reg_v4i8_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i
 ; FUNC-LABEL: {{^}}vgpr_sext_in_reg_v4i16_to_v4i32:
 ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
 ; GCN: v_bfe_i32 [[EXTRACT:v[0-9]+]], {{v[0-9]+}}, 0, 16
-define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 {
+define amdgpu_kernel void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %a, <4 x i32> addrspace(1)* %b) #0 {
   %loada = load <4 x i32>, <4 x i32> addrspace(1)* %a, align 16
   %loadb = load <4 x i32>, <4 x i32> addrspace(1)* %b, align 16
   %c = add <4 x i32> %loada, %loadb ; add to prevent folding into extload
@@ -444,7 +446,7 @@ define void @vgpr_sext_in_reg_v4i16_to_v4i32(<4 x i32> addrspace(1)* %out, <4 x
 ; GCN: v_max_i32
 ; GCN-NOT: bfe
 ; GCN: buffer_store_short
-define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) #0 {
+define amdgpu_kernel void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture %src) #0 {
   %tmp5 = load i8, i8 addrspace(1)* %src, align 1
   %tmp2 = sext i8 %tmp5 to i32
   %tmp2.5 = icmp sgt i32 %tmp2, 0
@@ -455,167 +457,22 @@ define void @sext_in_reg_to_illegal_type(i16 addrspace(1)* nocapture %out, i8 ad
   ret void
 }
 
-declare i32 @llvm.AMDGPU.bfe.i32(i32, i32, i32) nounwind readnone
-
-; FUNC-LABEL: {{^}}bfe_0_width:
-; GCN-NOT: {{[^@]}}bfe
-; GCN: s_endpgm
-define void @bfe_0_width(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
-  %load = load i32, i32 addrspace(1)* %ptr, align 4
-  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 8, i32 0) nounwind readnone
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_8_bfe_8:
-; GCN: v_bfe_i32
-; GCN-NOT: {{[^@]}}bfe
-; GCN: s_endpgm
-define void @bfe_8_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
-  %load = load i32, i32 addrspace(1)* %ptr, align 4
-  %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
-  %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
-  store i32 %bfe1, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}bfe_8_bfe_16:
-; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
-; GCN: s_endpgm
-define void @bfe_8_bfe_16(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
-  %load = load i32, i32 addrspace(1)* %ptr, align 4
-  %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 8) nounwind readnone
-  %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 16) nounwind readnone
-  store i32 %bfe1, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; This really should be folded into 1
-; FUNC-LABEL: {{^}}bfe_16_bfe_8:
-; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 8
-; GCN-NOT: {{[^@]}}bfe
-; GCN: s_endpgm
-define void @bfe_16_bfe_8(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) #0 {
-  %load = load i32, i32 addrspace(1)* %ptr, align 4
-  %bfe0 = call i32 @llvm.AMDGPU.bfe.i32(i32 %load, i32 0, i32 16) nounwind readnone
-  %bfe1 = call i32 @llvm.AMDGPU.bfe.i32(i32 %bfe0, i32 0, i32 8) nounwind readnone
-  store i32 %bfe1, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; Make sure there isn't a redundant BFE
-; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe:
-; GCN: s_sext_i32_i8 s{{[0-9]+}}, s{{[0-9]+}}
-; GCN-NOT: {{[^@]}}bfe
-; GCN: s_endpgm
-define void @sext_in_reg_i8_to_i32_bfe(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
-  %c = add i32 %a, %b ; add to prevent folding into extload
-  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 0, i32 8) nounwind readnone
-  %shl = shl i32 %bfe, 24
-  %ashr = ashr i32 %shl, 24
-  store i32 %ashr, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}sext_in_reg_i8_to_i32_bfe_wrong:
-define void @sext_in_reg_i8_to_i32_bfe_wrong(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
-  %c = add i32 %a, %b ; add to prevent folding into extload
-  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %c, i32 8, i32 0) nounwind readnone
-  %shl = shl i32 %bfe, 24
-  %ashr = ashr i32 %shl, 24
-  store i32 %ashr, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe:
-; GCN: buffer_load_sbyte
-; GCN-NOT: {{[^@]}}bfe
-; GCN: s_endpgm
-define void @sextload_i8_to_i32_bfe(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 {
-  %load = load i8, i8 addrspace(1)* %ptr, align 1
-  %sext = sext i8 %load to i32
-  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 0, i32 8) nounwind readnone
-  %shl = shl i32 %bfe, 24
-  %ashr = ashr i32 %shl, 24
-  store i32 %ashr, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; GCN: .text
-; FUNC-LABEL: {{^}}sextload_i8_to_i32_bfe_0:{{.*$}}
-; GCN-NOT: {{[^@]}}bfe
-; GCN: s_endpgm
-define void @sextload_i8_to_i32_bfe_0(i32 addrspace(1)* %out, i8 addrspace(1)* %ptr) #0 {
-  %load = load i8, i8 addrspace(1)* %ptr, align 1
-  %sext = sext i8 %load to i32
-  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %sext, i32 8, i32 0) nounwind readnone
-  %shl = shl i32 %bfe, 24
-  %ashr = ashr i32 %shl, 24
-  store i32 %ashr, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_0:
-; GCN-NOT: shr
-; GCN-NOT: shl
-; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 1
-; GCN: s_endpgm
-define void @sext_in_reg_i1_bfe_offset_0(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %shl = shl i32 %x, 31
-  %shr = ashr i32 %shl, 31
-  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 0, i32 1)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}sext_in_reg_i1_bfe_offset_1:
-; GCN: buffer_load_dword
-; GCN-NOT: shl
-; GCN-NOT: shr
-; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 1
-; GCN: s_endpgm
-define void @sext_in_reg_i1_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %shl = shl i32 %x, 30
-  %shr = ashr i32 %shl, 30
-  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 1)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
-; FUNC-LABEL: {{^}}sext_in_reg_i2_bfe_offset_1:
-; GCN: buffer_load_dword
-; GCN-NOT: v_lshl
-; GCN-NOT: v_ashr
-; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 2
-; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 1, 2
-; GCN: s_endpgm
-define void @sext_in_reg_i2_bfe_offset_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
-  %x = load i32, i32 addrspace(1)* %in, align 4
-  %shl = shl i32 %x, 30
-  %shr = ashr i32 %shl, 30
-  %bfe = call i32 @llvm.AMDGPU.bfe.i32(i32 %shr, i32 1, i32 2)
-  store i32 %bfe, i32 addrspace(1)* %out, align 4
-  ret void
-}
-
 ; Make sure we propagate the VALUness to users of a moved scalar BFE.
 
 ; FUNC-LABEL: {{^}}v_sext_in_reg_i1_to_i64_move_use:
 ; SI: buffer_load_dwordx2
 ; SI: v_lshl_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
 
-; VI: flat_load_dwordx2
-; VI: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
+; GFX89: flat_load_dwordx2
+; GFX89: v_lshlrev_b64 v{{\[}}[[VAL_LO:[0-9]+]]:[[VAL_HI:[0-9]+]]{{\]}}
 
 ; GCN-DAG: v_bfe_i32 v[[LO:[0-9]+]], v[[VAL_LO]], 0, 1
 ; GCN-DAG: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; GCN-DAG: v_and_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, v[[LO]]
 ; GCN-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[HI]]
 ; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-define void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 {
+; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
+define amdgpu_kernel void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
   %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
@@ -636,16 +493,16 @@ define void @v_sext_in_reg_i1_to_i64_move_use(i64 addrspace(1)* %out, i64 addrsp
 ; SI: buffer_load_dwordx2
 ; SI: v_lshl_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
 
-; VI: flat_load_dwordx2
-; VI: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
+; GFX89: flat_load_dwordx2
+; GFX89: v_lshlrev_b64 v{{\[}}[[LO:[0-9]+]]:[[HI:[0-9]+]]{{\]}},
 
 ; GCN-DAG: v_ashrrev_i32_e32 v[[SHR:[0-9]+]], 31, v[[LO]]
 ; GCN-DAG: v_and_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, v[[LO]]
 ; GCN-DAG: v_and_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}}, v[[SHR]]
 
 ; SI: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-; VI: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
-define void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 {
+; GFX89: flat_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}}
+define amdgpu_kernel void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr, i64 %s.val) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
   %b.gep = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
@@ -668,10 +525,10 @@ define void @v_sext_in_reg_i32_to_i64_move_use(i64 addrspace(1)* %out, i64 addrs
 ; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
 ; SI: buffer_store_short [[VBFE]]
 
-; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
-; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
-define void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
+; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 15
+define amdgpu_kernel void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
   %ld = load i32, i32 addrspace(2)* %ptr
   %in = trunc i32 %ld to i16
   %shl = shl i16 %in, 15
@@ -687,10 +544,10 @@ define void @s_sext_in_reg_i1_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr
 ; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
 ; SI: buffer_store_short [[VBFE]]
 
-; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14
-; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14
-define void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
+; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14
+; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14
+define amdgpu_kernel void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
   %ld = load i32, i32 addrspace(2)* %ptr
   %in = trunc i32 %ld to i16
   %shl = shl i16 %in, 14
@@ -704,7 +561,7 @@ define void @s_sext_in_reg_i2_i16(i16 addrspace(1)* %out, i32 addrspace(2)* %ptr
 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[VAL]], 0, 1{{$}}
 
 ; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]]
-define void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr) #0 {
+define amdgpu_kernel void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %gep = getelementptr i16, i16 addrspace(1)* %ptr, i32 %tid
   %out.gep = getelementptr i16, i16 addrspace(3)* %out, i32 %tid
@@ -721,11 +578,11 @@ define void @v_sext_in_reg_i1_i16(i16 addrspace(3)* %out, i16 addrspace(1)* %ptr
 ; GCN: {{buffer|flat}}_load_ushort [[VAL1:v[0-9]+]]
 
 ; SI: v_lshlrev_b32_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]]
-; VI: v_lshlrev_b16_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]]
+; GFX89: v_lshlrev_b16_e32 [[REG:v[0-9]+]], [[VAL1]], [[VAL0]]
 
 ; GCN: v_bfe_i32 [[BFE:v[0-9]+]], [[REG]], 0, 1{{$}}
 ; GCN: ds_write_b16 v{{[0-9]+}}, [[BFE]]
-define void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 %s.val) nounwind {
+define amdgpu_kernel void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace(1)* %aptr, i16 addrspace(1)* %bptr, i16 %s.val) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %a.gep = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
   %b.gep = getelementptr i16, i16 addrspace(1)* %bptr, i32 %tid
@@ -748,10 +605,10 @@ define void @v_sext_in_reg_i1_i16_nonload(i16 addrspace(3)* %out, i16 addrspace(
 ; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
 ; SI: buffer_store_short [[VBFE]]
 
-; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
-; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
-define void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
+; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
+; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 14{{$}}
+define amdgpu_kernel void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
   %shl = shl i16 %in, 14
   %sext = ashr i16 %shl, 14
   store i16 %sext, i16 addrspace(1)* %out
@@ -765,10 +622,10 @@ define void @s_sext_in_reg_i2_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
 ; SI: v_mov_b32_e32 [[VSEXT:v[0-9]+]], [[SSEXT]]
 ; SI: buffer_store_short [[VBFE]]
 
-; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
-; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
-define void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
+; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
+; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 8{{$}}
+define amdgpu_kernel void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
   %shl = shl i16 %in, 8
   %sext = ashr i16 %shl, 8
   store i16 %sext, i16 addrspace(1)* %out
@@ -782,16 +639,82 @@ define void @s_sext_in_reg_i8_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
 ; SI: v_mov_b32_e32 [[VBFE:v[0-9]+]], [[BFE]]
 ; SI: buffer_store_short [[VBFE]]
 
-; VI: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
-; VI: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
-; VI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
-define void @s_sext_in_reg_i15_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
+; GFX89: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
+; GFX89: s_sext_i32_i16 s{{[0-9]+}}, s{{[0-9]+}}
+; GFX89: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1{{$}}
+define amdgpu_kernel void @s_sext_in_reg_i15_i16_arg(i16 addrspace(1)* %out, i16 %in) #0 {
   %shl = shl i16 %in, 1
   %sext = ashr i16 %shl, 1
   store i16 %sext, i16 addrspace(1)* %out
   ret void
 }
 
+; FUNC-LABEL: {{^}}sext_in_reg_v2i1_to_v2i16:
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 15, [[ADD]]
+; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 15, [[SHL]]
+define amdgpu_kernel void @sext_in_reg_v2i1_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
+  %c = add <2 x i16> %a, %b ; add to prevent folding into extload
+  %shl = shl <2 x i16> %c, <i16 15, i16 15>
+  %ashr = ashr <2 x i16> %shl, <i16 15, i16 15>
+  store <2 x i16> %ashr, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v3i1_to_v3i16:
+; GFX9: v_pk_add_u16
+; GFX9: v_pk_add_u16
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}}
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 15, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 15, v{{[0-9]+}}
+define amdgpu_kernel void @sext_in_reg_v3i1_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 {
+  %c = add <3 x i16> %a, %b ; add to prevent folding into extload
+  %shl = shl <3 x i16> %c, <i16 15, i16 15, i16 15>
+  %ashr = ashr <3 x i16> %shl, <i16 15, i16 15, i16 15>
+  store <3 x i16> %ashr, <3 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v2i2_to_v2i16:
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 14, [[ADD]]
+; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 14, [[SHL]]
+define amdgpu_kernel void @sext_in_reg_v2i2_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
+  %c = add <2 x i16> %a, %b ; add to prevent folding into extload
+  %shl = shl <2 x i16> %c, <i16 14, i16 14>
+  %ashr = ashr <2 x i16> %shl, <i16 14, i16 14>
+  store <2 x i16> %ashr, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v2i8_to_v2i16:
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[SHL:v[0-9]+]], 8, [[ADD]]
+; GFX9: v_pk_ashrrev_i16 [[SRA:v[0-9]+]], 8, [[SHL]]
+define amdgpu_kernel void @sext_in_reg_v2i8_to_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #0 {
+  %c = add <2 x i16> %a, %b ; add to prevent folding into extload
+  %shl = shl <2 x i16> %c, <i16 8, i16 8>
+  %ashr = ashr <2 x i16> %shl, <i16 8, i16 8>
+  store <2 x i16> %ashr, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FUNC-LABEL: {{^}}sext_in_reg_v3i8_to_v3i16:
+; GFX9: v_pk_add_u16
+; GFX9: v_pk_add_u16
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GFX9: v_pk_ashrrev_i16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+define amdgpu_kernel void @sext_in_reg_v3i8_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i16> %a, <3 x i16> %b) #0 {
+  %c = add <3 x i16> %a, %b ; add to prevent folding into extload
+  %shl = shl <3 x i16> %c, <i16 8, i16 8, i16 8>
+  %ashr = ashr <3 x i16> %shl, <i16 8, i16 8, i16 8>
+  store <3 x i16> %ashr, <3 x i16> addrspace(1)* %out
+  ret void
+}
+
 declare i32 @llvm.r600.read.tidig.x() #1
 
 attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/sgpr-control-flow.ll b/test/CodeGen/AMDGPU/sgpr-control-flow.ll
index d5d2f6b717f9600ae643469d2ba3d40c18fa037e..8e18ab5554e458dfe8942a9cb16635bbe02ccbe4 100644
--- a/test/CodeGen/AMDGPU/sgpr-control-flow.ll
+++ b/test/CodeGen/AMDGPU/sgpr-control-flow.ll
@@ -13,7 +13,7 @@
 
 ; SI: s_sub
 
-define void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+define amdgpu_kernel void @sgpr_if_else_salu_br(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
 entry:
   %0 = icmp eq i32 %a, 0
   br i1 %0, label %if, label %else
@@ -52,7 +52,7 @@ endif:
 ; SI: s_add_i32 s{{[0-9]+}}, [[LOAD0]], [[LOAD1]]
 ; SI: buffer_store_dword
 ; SI-NEXT: s_endpgm
-define void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+define amdgpu_kernel void @sgpr_if_else_salu_br_opt(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d, i32 %e) {
 entry:
   %0 = icmp eq i32 %a, 0
   br i1 %0, label %if, label %else
@@ -79,7 +79,7 @@ endif:
 ; SI: s_add_i32 [[SGPR:s[0-9]+]]
 ; SI-NOT: s_add_i32 [[SGPR]]
 
-define void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) {
+define amdgpu_kernel void @sgpr_if_else_valu_br(i32 addrspace(1)* %out, float %a, i32 %b, i32 %c, i32 %d, i32 %e) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %tid_f = uitofp i32 %tid to float
@@ -116,7 +116,7 @@ endif:
 ; SI: v_cmp_ne_u32_e32 [[CMP_CMP:vcc]], 0, [[V_CMP]]
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1, [[CMP_CMP]]
 ; SI: buffer_store_dword [[RESULT]]
-define void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
+define amdgpu_kernel void @sgpr_if_else_valu_cmp_phi_br(i32 addrspace(1)* %out, i32 addrspace(1)* %a, i32 addrspace(1)* %b) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %tmp1 = icmp eq i32 %tid, 0
diff --git a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
index f44ae6e09e9fe841ecbe097cc73a62842740be3d..fb0bbaa9cbf272f852dbe23d221c5ee45a088bfc 100644
--- a/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
+++ b/test/CodeGen/AMDGPU/sgpr-copy-duplicate-operand.ll
@@ -6,7 +6,7 @@
 
 ; SI-LABEL: {{^}}test_dup_operands:
 ; SI: v_add_i32_e32
-define void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) {
+define amdgpu_kernel void @test_dup_operands(<2 x i32> addrspace(1)* noalias %out, <2 x i32> addrspace(1)* noalias %in) {
   %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %lo = extractelement <2 x i32> %a, i32 0
   %hi = extractelement <2 x i32> %a, i32 1
diff --git a/test/CodeGen/AMDGPU/sgpr-copy.ll b/test/CodeGen/AMDGPU/sgpr-copy.ll
index 013f5253b369dc46359eda9cd0465897305e5160..5c20e9a8d5859da3c39fd849dc9c63cba8f30074 100644
--- a/test/CodeGen/AMDGPU/sgpr-copy.ll
+++ b/test/CodeGen/AMDGPU/sgpr-copy.ll
@@ -1,13 +1,6 @@
 ; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
 ; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck %s
 
-; This test checks that no VGPR to SGPR copies are created by the register
-; allocator.
-
-
-declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-
-
 ; CHECK-LABEL: {{^}}phi1:
 ; CHECK: s_buffer_load_dword [[DST:s[0-9]]], {{s\[[0-9]+:[0-9]+\]}}, 0x0
 ; CHECK: v_mov_b32_e32 v{{[0-9]}}, [[DST]]
@@ -29,13 +22,13 @@ ELSE:                                             ; preds = %main_body
 ENDIF:                                            ; preds = %ELSE, %main_body
   %temp.0 = phi float [ %tmp26, %ELSE ], [ %tmp21, %main_body ]
   %tmp27 = fadd float %temp.0, %tmp23
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %tmp27, float %tmp27, float 0.000000e+00, float 1.000000e+00)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp27, float %tmp27, float 0.000000e+00, float 1.000000e+00, i1 true, i1 true) #0
   ret void
 }
 
 ; Make sure this program doesn't crash
 ; CHECK-LABEL: {{^}}phi2:
-define amdgpu_ps void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
+define amdgpu_ps void @phi2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #1 {
 main_body:
   %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
   %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
@@ -58,28 +51,54 @@ main_body:
   %tmp37 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp36, !tbaa !0
   %tmp38 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg1, i32 0
   %tmp39 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp38, !tbaa !0
-  %tmp40 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg3, <2 x i32> %arg5)
-  %tmp41 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg3, <2 x i32> %arg5)
-  %tmp42 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %arg3, <2 x i32> %arg5)
-  %tmp43 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %arg3, <2 x i32> %arg5)
-  %tmp44 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %arg3, <2 x i32> %arg5)
-  %tmp45 = bitcast float %tmp40 to i32
-  %tmp46 = bitcast float %tmp41 to i32
+  %i.i = extractelement <2 x i32> %arg5, i32 0
+  %j.i = extractelement <2 x i32> %arg5, i32 1
+  %i.f.i = bitcast i32 %i.i to float
+  %j.f.i = bitcast i32 %j.i to float
+  %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg3) #1
+  %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg3) #1
+  %i.i19 = extractelement <2 x i32> %arg5, i32 0
+  %j.i20 = extractelement <2 x i32> %arg5, i32 1
+  %i.f.i21 = bitcast i32 %i.i19 to float
+  %j.f.i22 = bitcast i32 %j.i20 to float
+  %p1.i23 = call float @llvm.amdgcn.interp.p1(float %i.f.i21, i32 1, i32 0, i32 %arg3) #1
+  %p2.i24 = call float @llvm.amdgcn.interp.p2(float %p1.i23, float %j.f.i22, i32 1, i32 0, i32 %arg3) #1
+  %i.i13 = extractelement <2 x i32> %arg5, i32 0
+  %j.i14 = extractelement <2 x i32> %arg5, i32 1
+  %i.f.i15 = bitcast i32 %i.i13 to float
+  %j.f.i16 = bitcast i32 %j.i14 to float
+  %p1.i17 = call float @llvm.amdgcn.interp.p1(float %i.f.i15, i32 0, i32 1, i32 %arg3) #1
+  %p2.i18 = call float @llvm.amdgcn.interp.p2(float %p1.i17, float %j.f.i16, i32 0, i32 1, i32 %arg3) #1
+  %i.i7 = extractelement <2 x i32> %arg5, i32 0
+  %j.i8 = extractelement <2 x i32> %arg5, i32 1
+  %i.f.i9 = bitcast i32 %i.i7 to float
+  %j.f.i10 = bitcast i32 %j.i8 to float
+  %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 1, i32 1, i32 %arg3) #1
+  %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 1, i32 1, i32 %arg3) #1
+  %i.i1 = extractelement <2 x i32> %arg5, i32 0
+  %j.i2 = extractelement <2 x i32> %arg5, i32 1
+  %i.f.i3 = bitcast i32 %i.i1 to float
+  %j.f.i4 = bitcast i32 %j.i2 to float
+  %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 1, i32 %arg3) #1
+  %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 1, i32 %arg3) #1
+  %tmp45 = bitcast float %p2.i to i32
+  %tmp46 = bitcast float %p2.i24 to i32
   %tmp47 = insertelement <2 x i32> undef, i32 %tmp45, i32 0
   %tmp48 = insertelement <2 x i32> %tmp47, i32 %tmp46, i32 1
   %tmp39.bc = bitcast <16 x i8> %tmp39 to <4 x i32>
-  %tmp49 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp48, <8 x i32> %tmp37, <4 x i32> %tmp39.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tmp50 = extractelement <4 x float> %tmp49, i32 2
-  %tmp51 = call float @fabs(float %tmp50)
-  %tmp52 = fmul float %tmp42, %tmp42
-  %tmp53 = fmul float %tmp43, %tmp43
+  %a.bc.i = bitcast <2 x i32> %tmp48 to <2 x float>
+  %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp37, <4 x i32> %tmp39.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %tmp50 = extractelement <4 x float> %tmp1, i32 2
+  %tmp51 = call float @llvm.fabs.f32(float %tmp50)
+  %tmp52 = fmul float %p2.i18, %p2.i18
+  %tmp53 = fmul float %p2.i12, %p2.i12
   %tmp54 = fadd float %tmp53, %tmp52
-  %tmp55 = fmul float %tmp44, %tmp44
+  %tmp55 = fmul float %p2.i6, %p2.i6
   %tmp56 = fadd float %tmp54, %tmp55
   %tmp57 = call float @llvm.amdgcn.rsq.f32(float %tmp56)
-  %tmp58 = fmul float %tmp42, %tmp57
-  %tmp59 = fmul float %tmp43, %tmp57
-  %tmp60 = fmul float %tmp44, %tmp57
+  %tmp58 = fmul float %p2.i18, %tmp57
+  %tmp59 = fmul float %p2.i12, %tmp57
+  %tmp60 = fmul float %p2.i6, %tmp57
   %tmp61 = fmul float %tmp58, %tmp22
   %tmp62 = fmul float %tmp59, %tmp23
   %tmp63 = fadd float %tmp62, %tmp61
@@ -90,7 +109,7 @@ main_body:
   %tmp68 = fadd float %tmp67, %tmp66
   %tmp69 = fmul float %tmp26, %tmp68
   %tmp70 = fmul float %tmp27, %tmp68
-  %tmp71 = call float @fabs(float %tmp69)
+  %tmp71 = call float @llvm.fabs.f32(float %tmp69)
   %tmp72 = fcmp olt float 0x3EE4F8B580000000, %tmp71
   %tmp73 = sext i1 %tmp72 to i32
   %tmp74 = bitcast i32 %tmp73 to float
@@ -110,7 +129,7 @@ IF:                                               ; preds = %main_body
 
 ENDIF:                                            ; preds = %IF, %main_body
   %temp4.0 = phi float [ %tmp83, %IF ], [ %tmp31, %main_body ]
-  %tmp84 = call float @fabs(float %tmp70)
+  %tmp84 = call float @llvm.fabs.f32(float %tmp70)
   %tmp85 = fcmp olt float 0x3EE4F8B580000000, %tmp84
   %tmp86 = sext i1 %tmp85 to i32
   %tmp87 = bitcast i32 %tmp86 to float
@@ -146,11 +165,9 @@ ENDIF24:                                          ; preds = %IF25, %ENDIF
   %tmp110 = fmul float %tmp109, %tmp106
   %tmp111 = fsub float -0.000000e+00, %tmp105
   %tmp112 = fmul float %tmp111, %tmp106
-  %tmp113 = call i32 @llvm.SI.packf16(float %tmp108, float %tmp110)
-  %tmp114 = bitcast i32 %tmp113 to float
-  %tmp115 = call i32 @llvm.SI.packf16(float %tmp112, float 1.000000e+00)
-  %tmp116 = bitcast i32 %tmp115 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp114, float %tmp116, float %tmp114, float %tmp116)
+  %tmp113 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp108, float %tmp110)
+  %tmp115 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp112, float 1.000000e+00)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp113, <2 x half> %tmp115, i1 true, i1 true) #0
   ret void
 }
 
@@ -183,7 +200,7 @@ LOOP:                                             ; preds = %ENDIF, %main_body
   br i1 %tmp33, label %IF, label %ENDIF
 
 IF:                                               ; preds = %LOOP
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %temp4.0, float %temp5.0, float %temp6.0, float 1.000000e+00, i1 true, i1 true) #0
   ret void
 
 ENDIF:                                            ; preds = %LOOP
@@ -193,31 +210,6 @@ ENDIF:                                            ; preds = %LOOP
   br label %LOOP
 }
 
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-
-; Function Attrs: readonly
-declare float @fabs(float) #2
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.sample.v2i32(<2 x i32>, <8 x i32>, <16 x i8>, i32) #1
-
-; Function Attrs: readnone
-declare float @llvm.amdgcn.rsq.f32(float) #1
-
-declare float @llvm.exp2.f32(float) #1
-
-; Function Attrs: nounwind readnone
-declare float @llvm.pow.f32(float, float) #1
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #1
-
 ; This checks for a bug in the FixSGPRCopies pass where VReg96
 ; registers were being identified as an SGPR regclass which was causing
 ; an assertion failure.
@@ -248,24 +240,24 @@ entry:
   br i1 %tmp27, label %if, label %else
 
 if:                                               ; preds = %entry
-  %val.if = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 11, i32 13>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %val.if.0 = extractelement <4 x float> %val.if, i32 0
-  %val.if.1 = extractelement <4 x float> %val.if, i32 1
-  %val.if.2 = extractelement <4 x float> %val.if, i32 2
+  %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 0x36D6000000000000, float 0x36DA000000000000>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %val.if.0 = extractelement <4 x float> %tmp1, i32 0
+  %val.if.1 = extractelement <4 x float> %tmp1, i32 1
+  %val.if.2 = extractelement <4 x float> %tmp1, i32 2
   br label %endif
 
 else:                                             ; preds = %entry
-  %val.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 5, i32 7>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %val.else.0 = extractelement <4 x float> %val.else, i32 0
-  %val.else.1 = extractelement <4 x float> %val.else, i32 1
-  %val.else.2 = extractelement <4 x float> %val.else, i32 2
+  %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 0x36C4000000000000, float 0x36CC000000000000>, <8 x i32> %tmp24, <4 x i32> %tmp26.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %val.else.0 = extractelement <4 x float> %tmp2, i32 0
+  %val.else.1 = extractelement <4 x float> %tmp2, i32 1
+  %val.else.2 = extractelement <4 x float> %tmp2, i32 2
   br label %endif
 
 endif:                                            ; preds = %else, %if
   %val.0 = phi float [ %val.if.0, %if ], [ %val.else.0, %else ]
   %val.1 = phi float [ %val.if.1, %if ], [ %val.else.1, %else ]
   %val.2 = phi float [ %val.if.2, %if ], [ %val.else.2, %else ]
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %val.0, float %val.1, float %val.2, float 0.000000e+00)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %val.0, float %val.1, float %val.2, float 0.000000e+00, i1 true, i1 true) #0
   ret void
 }
 
@@ -273,7 +265,7 @@ endif:                                            ; preds = %else, %if
 ; CHECK: buffer_load_dword
 ; CHECK: v_add
 ; CHECK: s_endpgm
-define void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) {
+define amdgpu_kernel void @copy1(float addrspace(1)* %out, float addrspace(1)* %in0) {
 entry:
   %tmp = load float, float addrspace(1)* %in0
   %tmp1 = fcmp oeq float %tmp, 0.000000e+00
@@ -312,7 +304,7 @@ LOOP68:                                           ; preds = %ENDIF69, %entry
 IF70:                                             ; preds = %LOOP68
   %q = icmp ne i32 %l, 13
   %temp.8 = select i1 %q, float 1.000000e+00, float 0.000000e+00
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %temp.8, float 0.000000e+00, float 0.000000e+00, float 1.000000e+00, i1 true, i1 true) #0
   ret void
 
 ENDIF69:                                          ; preds = %LOOP68
@@ -337,41 +329,53 @@ ENDIF69:                                          ; preds = %LOOP68
 define amdgpu_ps void @sample_rsrc([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <4 x i32>] addrspace(2)* byval %arg2, [32 x <8 x i32>] addrspace(2)* byval %arg3, float inreg %arg4, i32 inreg %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <3 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, float %arg20, float %arg21) #0 {
 bb:
   %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg1, i32 0, i32 0
-  %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !2
+  %tmp22 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !3
   %tmp23 = call float @llvm.SI.load.const(<16 x i8> %tmp22, i32 16)
   %tmp25 = getelementptr [32 x <8 x i32>], [32 x <8 x i32>] addrspace(2)* %arg3, i32 0, i32 0
-  %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !2
+  %tmp26 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp25, !tbaa !3
   %tmp27 = getelementptr [16 x <4 x i32>], [16 x <4 x i32>] addrspace(2)* %arg2, i32 0, i32 0
-  %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !2
-  %tmp29 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg7)
-  %tmp30 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg7)
+  %tmp28 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp27, !tbaa !3
+  %i.i = extractelement <2 x i32> %arg7, i32 0
+  %j.i = extractelement <2 x i32> %arg7, i32 1
+  %i.f.i = bitcast i32 %i.i to float
+  %j.f.i = bitcast i32 %j.i to float
+  %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg5) #0
+  %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg5) #0
+  %i.i1 = extractelement <2 x i32> %arg7, i32 0
+  %j.i2 = extractelement <2 x i32> %arg7, i32 1
+  %i.f.i3 = bitcast i32 %i.i1 to float
+  %j.f.i4 = bitcast i32 %j.i2 to float
+  %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 1, i32 0, i32 %arg5) #0
+  %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 1, i32 0, i32 %arg5) #0
   %tmp31 = bitcast float %tmp23 to i32
   %tmp36 = icmp ne i32 %tmp31, 0
   br i1 %tmp36, label %bb38, label %bb80
 
 bb38:                                             ; preds = %bb
-  %tmp52 = bitcast float %tmp29 to i32
-  %tmp53 = bitcast float %tmp30 to i32
+  %tmp52 = bitcast float %p2.i to i32
+  %tmp53 = bitcast float %p2.i6 to i32
   %tmp54 = insertelement <2 x i32> undef, i32 %tmp52, i32 0
   %tmp55 = insertelement <2 x i32> %tmp54, i32 %tmp53, i32 1
   %tmp56 = bitcast <8 x i32> %tmp26 to <8 x i32>
-  %tmp58 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp55, <8 x i32> %tmp56, <4 x i32> %tmp28, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %a.bc.i = bitcast <2 x i32> %tmp55 to <2 x float>
+  %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp56, <4 x i32> %tmp28, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   br label %bb71
 
 bb80:                                             ; preds = %bb
-  %tmp81 = bitcast float %tmp29 to i32
-  %tmp82 = bitcast float %tmp30 to i32
+  %tmp81 = bitcast float %p2.i to i32
+  %tmp82 = bitcast float %p2.i6 to i32
   %tmp82.2 = add i32 %tmp82, 1
   %tmp83 = insertelement <2 x i32> undef, i32 %tmp81, i32 0
   %tmp84 = insertelement <2 x i32> %tmp83, i32 %tmp82.2, i32 1
   %tmp85 = bitcast <8 x i32> %tmp26 to <8 x i32>
-  %tmp87 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp84, <8 x i32> %tmp85, <4 x i32> %tmp28, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %a.bc.i1 = bitcast <2 x i32> %tmp84 to <2 x float>
+  %tmp3 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i1, <8 x i32> %tmp85, <4 x i32> %tmp28, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   br label %bb71
 
 bb71:                                             ; preds = %bb80, %bb38
-  %tmp72 = phi <4 x float> [ %tmp58, %bb38 ], [ %tmp87, %bb80 ]
+  %tmp72 = phi <4 x float> [ %tmp2, %bb38 ], [ %tmp3, %bb80 ]
   %tmp88 = extractelement <4 x float> %tmp72, i32 0
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp88, float %tmp88, float %tmp88, float %tmp88)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp88, float %tmp88, float %tmp88, float %tmp88, i1 true, i1 true) #0
   ret void
 }
 
@@ -379,14 +383,14 @@ bb71:                                             ; preds = %bb80, %bb38
 ; CHECK-LABEL: {{^}}mimg_srsrc_sgpr:
 ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
 define amdgpu_ps void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(2)* byval %arg) #0 {
+bb:
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i32 0, i32 %tid
   %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0
-  %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tmp10 = extractelement <4 x float> %tmp9, i32 0
-  %tmp12 = call i32 @llvm.SI.packf16(float undef, float %tmp10)
-  %tmp13 = bitcast i32 %tmp12 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp13, float undef, float undef, float undef)
+  %tmp = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 7.500000e-01, float 2.500000e-01>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %tmp10 = extractelement <4 x float> %tmp, i32 0
+  %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %tmp10)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0
   ret void
 }
 
@@ -394,24 +398,35 @@ define amdgpu_ps void @mimg_srsrc_sgpr([34 x <8 x i32>] addrspace(2)* byval %arg
 ; CHECK-LABEL: {{^}}mimg_ssamp_sgpr:
 ; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
 define amdgpu_ps void @mimg_ssamp_sgpr([17 x <4 x i32>] addrspace(2)* byval %arg) #0 {
+bb:
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %tmp7 = getelementptr [17 x <4 x i32>], [17 x <4 x i32>] addrspace(2)* %arg, i32 0, i32 %tid
   %tmp8 = load <4 x i32>, <4 x i32> addrspace(2)* %tmp7, align 16, !tbaa !0
-  %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> undef, <4 x i32> %tmp8, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tmp10 = extractelement <4 x float> %tmp9, i32 0
-  %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef)
-  %tmp13 = bitcast i32 %tmp12 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp13, float undef, float undef, float undef)
+  %tmp = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 7.500000e-01, float 2.500000e-01>, <8 x i32> undef, <4 x i32> %tmp8, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %tmp10 = extractelement <4 x float> %tmp, i32 0
+  %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0
   ret void
 }
 
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.amdgcn.rsq.f32(float) #1
+declare float @llvm.exp2.f32(float) #1
+declare float @llvm.pow.f32(float, float) #1
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
 attributes #2 = { nounwind readonly }
 
 !0 = !{!1, !1, i64 0, i32 1}
-!1 = !{!"const", !3}
-!2 = !{!1, !1, i64 0}
-!3 = !{!"tbaa root"}
+!1 = !{!"const", !2}
+!2 = !{!"tbaa root"}
+!3 = !{!1, !1, i64 0}
diff --git a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
index 48bbc32abcbb42e31921bd9828e3178515c5b782..b3cb19ad05e2992622ec249a36573019e47443da 100644
--- a/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
+++ b/test/CodeGen/AMDGPU/shift-and-i128-ubfe.ll
@@ -11,7 +11,7 @@
 
 ; GCN: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
-define void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
@@ -33,7 +33,7 @@ define void @v_uextract_bit_31_i128(i128 addrspace(1)* %out, i128 addrspace(1)*
 
 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
-define void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
@@ -55,7 +55,7 @@ define void @v_uextract_bit_63_i128(i128 addrspace(1)* %out, i128 addrspace(1)*
 
 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
-define void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
@@ -77,7 +77,7 @@ define void @v_uextract_bit_95_i128(i128 addrspace(1)* %out, i128 addrspace(1)*
 
 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[SHIFT]]:[[ZERO2]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
-define void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
@@ -100,7 +100,7 @@ define void @v_uextract_bit_127_i128(i128 addrspace(1)* %out, i128 addrspace(1)*
 
 ; GCN-DAG: buffer_store_dwordx4 v{{\[}}[[OR0]]:[[ZERO]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: s_endpgm
-define void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_34_100_i128(i128 addrspace(1)* %out, i128 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i128, i128 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i128, i128 addrspace(1)* %out, i32 %id.x
diff --git a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
index b85714ea54c1a5b80b2ec81cb3db74997c894b4c..744c1c2b682be983b53cbeb2d1a21ebdbeaae786 100644
--- a/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
+++ b/test/CodeGen/AMDGPU/shift-and-i64-ubfe.ll
@@ -8,7 +8,7 @@
 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -25,7 +25,7 @@ define void @v_uextract_bit_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in
 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -41,7 +41,7 @@ define void @v_uextract_bit_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -57,7 +57,7 @@ define void @v_uextract_bit_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in)
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 1
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -73,7 +73,7 @@ define void @v_uextract_bit_20_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in
 ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 1, [[VAL]]
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -89,7 +89,7 @@ define void @v_uextract_bit_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 1{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -105,7 +105,7 @@ define void @v_uextract_bit_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 20, 2
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -121,7 +121,7 @@ define void @v_uextract_bit_20_21_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -137,7 +137,7 @@ define void @v_uextract_bit_1_30_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 1, [[VAL]]
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -155,7 +155,7 @@ define void @v_uextract_bit_1_31_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
 ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 3, v[[SHRLO]]{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -171,7 +171,7 @@ define void @v_uextract_bit_31_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 2
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -188,7 +188,7 @@ define void @v_uextract_bit_32_33_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 ; GCN-DAG: v_and_b32_e32 v[[AND:[0-9]+]], 0x3fffffff, v[[SHRLO]]{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -204,7 +204,7 @@ define void @v_uextract_bit_30_60_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 1, 30
 ; GCN-DAG: v_mov_b32_e32 v[[BFE:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHIFT]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -220,7 +220,7 @@ define void @v_uextract_bit_33_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 ; GCN: v_lshr_b64 v{{\[}}[[SHRLO:[0-9]+]]:[[SHRHI:[0-9]+]]{{\]}}, [[VAL]], 31
 ; GCN-NEXT: v_mov_b32_e32 v[[SHRHI]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
-define void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -236,7 +236,7 @@ define void @v_uextract_bit_31_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN-DAG: v_lshrrev_b32_e32 v[[SHIFT:[0-9]+]], 31, [[VAL]]
 ; GCN: buffer_store_dword v[[SHIFT]]
-define void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
@@ -252,7 +252,7 @@ define void @v_uextract_bit_31_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspa
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64{{$}}
 ; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 3, 1{{$}}
 ; GCN: buffer_store_dword [[BFE]]
-define void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
@@ -268,7 +268,7 @@ define void @v_uextract_bit_3_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspac
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4{{$}}
 ; GCN: v_bfe_u32 [[BFE:v[0-9]+]], [[VAL]], 1, 1{{$}}
 ; GCN: buffer_store_dword [[BFE]]
-define void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
@@ -286,7 +286,7 @@ define void @v_uextract_bit_33_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspa
 ; GCN-NEXT: v_and_b32_e32 v[[SHRLO]], 3, v[[SHRLO]]
 ; GCN-NOT: v[[SHRLO]]
 ; GCN: buffer_store_dword v[[SHRLO]]
-define void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %id.x
@@ -306,7 +306,7 @@ define void @v_uextract_bit_31_32_i64_trunc_i32(i32 addrspace(1)* %out, i64 addr
 ; GCN-NOT: v[[SHRLO]]
 ; GCN-NOT: v[[SHRHI]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
-define void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -327,7 +327,7 @@ define void @and_not_mask_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[SHRLO]]:[[SHRHI]]{{\]}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[AND]]:[[ZERO]]{{\]}}
-define void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -347,7 +347,7 @@ define void @v_uextract_bit_27_29_multi_use_shift_i64(i64 addrspace(1)* %out, i6
 ; GCN-DAG: v_bfe_u32 v[[BFE:[0-9]+]], [[VAL]], 2, 3
 ; GCN-DAG: buffer_store_dwordx2 v{{\[}}[[SHR]]:[[ZERO_SHR]]{{\]}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO_BFE]]{{\]}}
-define void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out.gep = getelementptr i64, i64 addrspace(1)* %out, i32 %id.x
@@ -365,7 +365,7 @@ define void @v_uextract_bit_34_37_multi_use_shift_i64(i64 addrspace(1)* %out, i6
 ; GCN-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
 ; GCN: buffer_store_dword v[[ZERO]]
-define void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 {
+define amdgpu_kernel void @v_uextract_bit_33_36_use_upper_half_shift_i64(i64 addrspace(1)* %out0, i32 addrspace(1)* %out1, i64 addrspace(1)* %in) #1 {
   %id.x = tail call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %id.x
   %out0.gep = getelementptr i64, i64 addrspace(1)* %out0, i32 %id.x
diff --git a/test/CodeGen/AMDGPU/shift-i64-opts.ll b/test/CodeGen/AMDGPU/shift-i64-opts.ll
index 28a7b924904daf7bf4b63e1adbe81afc31044326..a803849be02c45c64dfc52d3b291817b75fae54d 100644
--- a/test/CodeGen/AMDGPU/shift-i64-opts.ll
+++ b/test/CodeGen/AMDGPU/shift-i64-opts.ll
@@ -8,7 +8,7 @@
 ; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 3, [[VAL]]
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = lshr i64 %val, 35
   store i64 %shl, i64 addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @lshr_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 31, [[VAL]]
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = lshr i64 %val, 63
   store i64 %shl, i64 addrspace(1)* %out
@@ -32,7 +32,7 @@ define void @lshr_i64_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ; GCN-DAG: v_lshrrev_b32_e32 v[[LO:[0-9]+]], 1, [[VAL]]
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = lshr i64 %val, 33
   store i64 %shl, i64 addrspace(1)* %out
@@ -43,7 +43,7 @@ define void @lshr_i64_33(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ; GCN-DAG: buffer_load_dword v[[LO:[0-9]+]]
 ; GCN-DAG: v_mov_b32_e32 v[[HI:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = lshr i64 %val, 32
   store i64 %shl, i64 addrspace(1)* %out
@@ -58,7 +58,7 @@ define void @lshr_i64_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ; GCN: v_bfe_u32 v[[BFE:[0-9]+]], v[[HI]], 8, 23
 ; GCN: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[BFE]]:[[ZERO]]{{\]}}
-define void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %and = and i64 %val, 9223372036854775807 ; 0x7fffffffffffffff
   %shl = lshr i64 %and, 40
@@ -73,7 +73,7 @@ define void @lshr_and_i64_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 3, [[VAL]]
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 35
   store i64 %shl, i64 addrspace(1)* %out
@@ -84,7 +84,7 @@ define void @shl_i64_const_35(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ; GCN-DAG: buffer_load_dword v[[HI:[0-9]+]]
 ; GCN-DAG: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 32
   store i64 %shl, i64 addrspace(1)* %out
@@ -96,7 +96,7 @@ define void @shl_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ; GCN: v_lshlrev_b32_e32 v[[HI:[0-9]+]], 31, [[VAL]]
 ; GCN: v_mov_b32_e32 v[[LO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 63
   store i64 %shl, i64 addrspace(1)* %out
@@ -106,7 +106,7 @@ define void @shl_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ; ashr (i64 x), 63 => (ashr lo(x), 31), lo(x)
 
 ; GCN-LABEL: {{^}}ashr_i64_const_32:
-define void @ashr_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = ashr i64 %val, 32
   store i64 %shl, i64 addrspace(1)* %out
@@ -114,7 +114,7 @@ define void @ashr_i64_const_32(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 }
 
 ; GCN-LABEL: {{^}}ashr_i64_const_63:
-define void @ashr_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = ashr i64 %val, 63
   store i64 %shl, i64 addrspace(1)* %out
@@ -125,7 +125,7 @@ define void @ashr_i64_const_63(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 31, [[VAL]]
 ; GCN: buffer_store_dword [[SHL]]
-define void @trunc_shl_31_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_31_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 31
   %trunc = trunc i64 %shl to i32
@@ -137,7 +137,7 @@ define void @trunc_shl_31_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in)
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]]
 ; GCN: buffer_store_short [[SHL]]
-define void @trunc_shl_15_i16_i64(i16 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_15_i16_i64(i16 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 15
   %trunc = trunc i64 %shl to i16
@@ -149,7 +149,7 @@ define void @trunc_shl_15_i16_i64(i16 addrspace(1)* %out, i64 addrspace(1)* %in)
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 15, [[VAL]]
 ; GCN: buffer_store_short [[SHL]]
-define void @trunc_shl_15_i16_i32(i16 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_15_i16_i32(i16 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %val = load i32, i32 addrspace(1)* %in
   %shl = shl i32 %val, 15
   %trunc = trunc i32 %shl to i16
@@ -161,7 +161,7 @@ define void @trunc_shl_15_i16_i32(i16 addrspace(1)* %out, i32 addrspace(1)* %in)
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 7, [[VAL]]
 ; GCN: buffer_store_byte [[SHL]]
-define void @trunc_shl_7_i8_i64(i8 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_7_i8_i64(i8 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 7
   %trunc = trunc i64 %shl to i8
@@ -174,7 +174,7 @@ define void @trunc_shl_7_i8_i64(i8 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 1, [[VAL]]
 ; GCN: v_and_b32_e32 [[AND:v[0-9]+]], 2, [[SHL]]
 ; GCN: buffer_store_byte [[AND]]
-define void @trunc_shl_1_i2_i64(i2 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_1_i2_i64(i2 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 1
   %trunc = trunc i64 %shl to i2
@@ -186,7 +186,7 @@ define void @trunc_shl_1_i2_i64(i2 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 1, [[VAL]]
 ; GCN: buffer_store_dword [[SHL]]
-define void @trunc_shl_1_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_1_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 1
   %trunc = trunc i64 %shl to i32
@@ -198,7 +198,7 @@ define void @trunc_shl_1_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in)
 ; GCN: buffer_load_dword [[VAL:v[0-9]+]]
 ; GCN: v_lshlrev_b32_e32 [[SHL:v[0-9]+]], 16, [[VAL]]
 ; GCN: buffer_store_dword [[SHL]]
-define void @trunc_shl_16_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_16_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 16
   %trunc = trunc i64 %shl to i32
@@ -209,7 +209,7 @@ define void @trunc_shl_16_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in)
 ; GCN-LABEL: {{^}}trunc_shl_33_i32_i64:
 ; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dword [[ZERO]]
-define void @trunc_shl_33_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_33_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 33
   %trunc = trunc i64 %shl to i32
@@ -222,7 +222,7 @@ define void @trunc_shl_33_i32_i64(i32 addrspace(1)* %out, i64 addrspace(1)* %in)
 ; GCN-DAG: v_lshlrev_b32_e32 v[[RESHI:[0-9]+]], 16, v{{[0-9]+}}
 ; GCN-DAG: v_lshlrev_b32_e32 v[[RESLO:[0-9]+]], 16, v[[LO]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]{{\]}}
-define void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %val = load <2 x i64>, <2 x i64> addrspace(1)* %in
   %shl = shl <2 x i64> %val, <i64 16, i64 16>
   %trunc = trunc <2 x i64> %shl to <2 x i32>
@@ -235,7 +235,7 @@ define void @trunc_shl_16_v2i32_v2i64(<2 x i32> addrspace(1)* %out, <2 x i64> ad
 ; GCN: v_lshl_b64 v{{\[}}[[RESLO:[0-9]+]]:[[RESHI:[0-9]+]]{{\]}}, [[VAL]], 31
 ; GCN: buffer_store_dword v[[RESLO]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[RESLO]]:[[RESHI]]{{\]}}
-define void @trunc_shl_31_i32_i64_multi_use(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_shl_31_i32_i64_multi_use(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %val = load i64, i64 addrspace(1)* %in
   %shl = shl i64 %val, 31
   %trunc = trunc i64 %shl to i32
diff --git a/test/CodeGen/AMDGPU/shl.ll b/test/CodeGen/AMDGPU/shl.ll
index 972349c24453d75d8a66285d9e239f1edb07e454..f6520eeb4fd6913f927e861b192881791e95a167 100644
--- a/test/CodeGen/AMDGPU/shl.ll
+++ b/test/CodeGen/AMDGPU/shl.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; XUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck -check-prefix=GCN -check-prefix=VI %s
-; RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck --check-prefix=EG %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s
+; XUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=FUNC %s
+; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 declare i32 @llvm.r600.read.tidig.x() #0
 
@@ -17,7 +17,7 @@ declare i32 @llvm.r600.read.tidig.x() #0
 ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
@@ -44,7 +44,7 @@ define void @shl_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in
 ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ;VI: v_lshlrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -57,7 +57,7 @@ define void @shl_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
 ; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 
 ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
   %a = load i16, i16 addrspace(1)* %in
   %b = load i16, i16 addrspace(1)* %b_ptr
@@ -70,7 +70,7 @@ define void @shl_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
 ; VI: v_lshlrev_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
 
 ; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-define void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
+define amdgpu_kernel void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
   %a = load i16, i16 addrspace(1)* %in
   %result = shl i16 %a, %b
   store i16 %result, i16 addrspace(1)* %out
@@ -81,7 +81,7 @@ define void @shl_i16_v_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b)
 ; SI: v_lshlrev_b32_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
 
 ; VI: v_lshlrev_b16_e64 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-define void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
+define amdgpu_kernel void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in, i16 %b) {
   %a = load i16, i16 addrspace(1)* %in
   %b.add = add i16 %b, 3
   %result = shl i16 %a, %b.add
@@ -92,7 +92,7 @@ define void @shl_i16_v_compute_s(i16 addrspace(1)* %out, i16 addrspace(1)* %in,
 ; GCN-LABEL: {{^}}shl_i16_computed_amount:
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 3, v{{[0-9]+}}
 ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, [[ADD]], v{{[0-9]+}}
-define void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
@@ -107,7 +107,7 @@ define void @shl_i16_computed_amount(i16 addrspace(1)* %out, i16 addrspace(1)* %
 
 ; GCN-LABEL: {{^}}shl_i16_i_s:
 ; GCN: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 12
-define void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) {
+define amdgpu_kernel void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) {
   %result = shl i16 %a, 12
   store i16 %result, i16 addrspace(1)* %out
   ret void
@@ -116,7 +116,7 @@ define void @shl_i16_i_s(i16 addrspace(1)* %out, i16 zeroext %a) {
 ; GCN-LABEL: {{^}}shl_v2i16:
 ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
@@ -133,7 +133,7 @@ define void @shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in
 ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i32 %tid
@@ -160,7 +160,7 @@ define void @shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in
 ; GCN-LABEL: {{^}}shl_i64:
 ; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\], v\[[0-9]+:[0-9]+\], v[0-9]+}}
 ; VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
-define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
   %a = load i64, i64 addrspace(1)* %in
   %b = load i64, i64 addrspace(1)* %b_ptr
@@ -199,7 +199,7 @@ define void @shl_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
 ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
 
-define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
   %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
   %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
@@ -262,7 +262,7 @@ define void @shl_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in
 ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
 ;VI: v_lshlrev_b64 {{v\[[0-9]+:[0-9]+\], v[0-9]+, v\[[0-9]+:[0-9]+\]}}
 
-define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
   %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
   %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
@@ -277,7 +277,7 @@ define void @shl_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], [[LO_A]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) {
   %result = shl i64 %a, 32
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -287,7 +287,7 @@ define void @s_shl_32_i64(i64 addrspace(1)* %out, i64 %a) {
 ; GCN-DAG: buffer_load_dword v[[LO_A:[0-9]+]],
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[LO_A]]{{\]}}
-define void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
@@ -299,7 +299,7 @@ define void @v_shl_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 
 ; FUNC-LABEL: {{^}}s_shl_constant_i64
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}
-define void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) {
   %shl = shl i64 281474976710655, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -311,7 +311,7 @@ define void @s_shl_constant_i64(i64 addrspace(1)* %out, i64 %a) {
 ; SI-DAG: s_movk_i32 s[[KHI:[0-9]+]], 0x11e{{$}}
 ; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}, [[VAL]]
 ; SI: buffer_store_dwordx2
-define void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %shl = shl i64 1231231234567, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
@@ -323,7 +323,7 @@ define void @v_shl_constant_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr)
 ; SI-DAG: s_mov_b32 s[[KLO:[0-9]+]], 0x12d687{{$}}
 ; SI-DAG: s_mov_b32 s[[KHI:[0-9]+]], 0{{$}}
 ; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, s{{\[}}[[KLO]]:[[KHI]]{{\]}}, [[VAL]]
-define void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %shl = shl i64 1234567, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
@@ -332,7 +332,7 @@ define void @v_shl_i64_32_bit_constant(i64 addrspace(1)* %out, i64 addrspace(1)*
 
 ; FUNC-LABEL: {{^}}v_shl_inline_imm_64_i64:
 ; SI: v_lshl_b64 {{v\[[0-9]+:[0-9]+\]}}, 64, {{v[0-9]+}}
-define void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
+define amdgpu_kernel void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) {
   %a = load i64, i64 addrspace(1)* %aptr, align 8
   %shl = shl i64 64, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
@@ -341,7 +341,7 @@ define void @v_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_64_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 64, s{{[0-9]+}}
-define void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 64, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -349,7 +349,7 @@ define void @s_shl_inline_imm_64_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_1_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 1, s{{[0-9]+}}
-define void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 1, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -357,7 +357,7 @@ define void @s_shl_inline_imm_1_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_1.0_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 1.0, s{{[0-9]+}}
-define void @s_shl_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 4607182418800017408, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -365,7 +365,7 @@ define void @s_shl_inline_imm_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_1.0_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -1.0, s{{[0-9]+}}
-define void @s_shl_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 13830554455654793216, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -373,7 +373,7 @@ define void @s_shl_inline_imm_neg_1.0_i64(i64 addrspace(1)* %out, i64 addrspace(
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_0.5_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 0.5, s{{[0-9]+}}
-define void @s_shl_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 4602678819172646912, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -381,7 +381,7 @@ define void @s_shl_inline_imm_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_0.5_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -0.5, s{{[0-9]+}}
-define void @s_shl_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 13826050856027422720, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -389,7 +389,7 @@ define void @s_shl_inline_imm_neg_0.5_i64(i64 addrspace(1)* %out, i64 addrspace(
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_2.0_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 2.0, s{{[0-9]+}}
-define void @s_shl_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 4611686018427387904, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -397,7 +397,7 @@ define void @s_shl_inline_imm_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_2.0_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -2.0, s{{[0-9]+}}
-define void @s_shl_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 13835058055282163712, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -405,7 +405,7 @@ define void @s_shl_inline_imm_neg_2.0_i64(i64 addrspace(1)* %out, i64 addrspace(
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_4.0_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, 4.0, s{{[0-9]+}}
-define void @s_shl_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 4616189618054758400, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -413,7 +413,7 @@ define void @s_shl_inline_imm_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)*
 
 ; FUNC-LABEL: {{^}}s_shl_inline_imm_neg_4.0_i64:
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, -4.0, s{{[0-9]+}}
-define void @s_shl_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 13839561654909534208, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -427,7 +427,7 @@ define void @s_shl_inline_imm_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(
 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 4.0
 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 0{{$}}
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}}
-define void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 1082130432, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -439,7 +439,7 @@ define void @s_shl_inline_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(
 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -1{{$}}
 ; SI-DAG: s_mov_b32 s[[K_HI_COPY:[0-9]+]], s[[K_HI]]
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI_COPY]]{{\]}}, s{{[0-9]+}}
-define void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 -1065353216, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -450,7 +450,7 @@ define void @s_shl_inline_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrsp
 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], 4.0
 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}}
-define void @s_shl_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 4647714815446351872, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
@@ -460,10 +460,18 @@ define void @s_shl_inline_high_imm_f32_4.0_i64(i64 addrspace(1)* %out, i64 addrs
 ; SI-DAG: s_mov_b32 s[[K_HI:[0-9]+]], -4.0
 ; SI-DAG: s_mov_b32 s[[K_LO:[0-9]+]], 0{{$}}
 ; SI: s_lshl_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}, s{{[0-9]+}}
-define void @s_shl_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
+define amdgpu_kernel void @s_shl_inline_high_imm_f32_neg_4.0_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr, i64 %a) {
   %shl = shl i64 13871086852301127680, %a
   store i64 %shl, i64 addrspace(1)* %out, align 8
   ret void
 }
 
+; FUNC-LABEL: {{^}}test_mul2:
+; GCN: s_lshl_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+define amdgpu_kernel void @test_mul2(i32 %p) { ; multiply by 2 is expected to select a scalar shift left by 1
+  %i = mul i32 %p, 2
+  store volatile i32 %i, i32 addrspace(1)* undef ; volatile store gives %i a use
+  ret void
+}
+
 attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/shl.v2i16.ll b/test/CodeGen/AMDGPU/shl.v2i16.ll
new file mode 100644
index 0000000000000000000000000000000000000000..eac29bad7cf23347f307c1ef7eda250a67fe11cf
--- /dev/null
+++ b/test/CodeGen/AMDGPU/shl.v2i16.ll
@@ -0,0 +1,152 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI -check-prefix=CIVI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CI -check-prefix=CIVI %s
+
+; GCN-LABEL: {{^}}s_shl_v2i16:
+; GFX9: s_load_dword [[LHS:s[0-9]+]]
+; GFX9: s_load_dword [[RHS:s[0-9]+]]
+; GFX9: v_mov_b32_e32 [[VLHS:v[0-9]+]], [[LHS]]
+; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[VLHS]]
+
+; VI: v_lshlrev_b32_e32
+; VI: v_lshlrev_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
+; CI: v_lshlrev_b32_e32
+; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_or_b32_e32
+define amdgpu_kernel void @s_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %lhs, <2 x i16> %rhs) #0 { ; both shl operands are kernel arguments
+  %result = shl <2 x i16> %lhs, %rhs ; %lhs is the shifted value, %rhs the per-element shift amount
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_shl_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+
+; VI: v_lshlrev_b16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_lshlrev_b16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; CI: s_mov_b32 [[MASK:s[0-9]+]], 0xffff{{$}}
+; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, [[LHS]]
+; CI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshl_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; CI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+; CI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]], v{{[0-9]+}}
+; CI: v_or_b32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_shl_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { ; both shl operands are loaded from %in
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext ; input slot indexed by work-item id
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext ; output slot indexed by work-item id
+  %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in.gep, i32 1 ; shift amounts come from the element after the value
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
+  %result = shl <2 x i16> %a, %b
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}shl_v_s_v2i16:
+; GFX9: s_load_dword [[RHS:s[0-9]+]]
+; GFX9: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+define amdgpu_kernel void @shl_v_s_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { ; loaded value shifted by a kernel argument
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext ; input slot indexed by work-item id
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext ; output slot indexed by work-item id
+  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %result = shl <2 x i16> %vgpr, %sgpr ; value from memory, shift amount from the argument
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}shl_s_v_v2i16:
+; GFX9: s_load_dword [[LHS:s[0-9]+]]
+; GFX9: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], [[LHS]]
+define amdgpu_kernel void @shl_s_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in, <2 x i16> %sgpr) #0 { ; kernel argument shifted by a loaded amount
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext ; input slot indexed by work-item id
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext ; output slot indexed by work-item id
+  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %result = shl <2 x i16> %sgpr, %vgpr ; value from the argument, shift amount from memory
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}shl_imm_v_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[RHS:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], [[RHS]], 8
+define amdgpu_kernel void @shl_imm_v_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { ; constant 8 in each element shifted by a loaded amount
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext ; input slot indexed by work-item id
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext ; output slot indexed by work-item id
+  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %result = shl <2 x i16> <i16 8, i16 8>, %vgpr ; the constant is the shifted value, not the shift amount
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}shl_v_imm_v2i16:
+; GCN: {{buffer|flat}}_load_dword [[LHS:v[0-9]+]]
+; GFX9: v_pk_lshlrev_b16 [[RESULT:v[0-9]+]], 8, [[LHS]]
+define amdgpu_kernel void @shl_v_imm_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) #0 { ; loaded value shifted left by constant 8 in each element
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in, i64 %tid.ext ; input slot indexed by work-item id
+  %out.gep = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i64 %tid.ext ; output slot indexed by work-item id
+  %vgpr = load <2 x i16>, <2 x i16> addrspace(1)* %in.gep
+  %result = shl <2 x i16> %vgpr, <i16 8, i16 8> ; the constant is the shift amount here
+  store <2 x i16> %result, <2 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_shl_v4i16:
+; GCN: {{buffer|flat}}_load_dwordx2
+; GCN: {{buffer|flat}}_load_dwordx2
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN: {{buffer|flat}}_store_dwordx2
+define amdgpu_kernel void @v_shl_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { ; four-element variant with both shl operands loaded from %in
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext ; input slot indexed by work-item id
+  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext ; output slot indexed by work-item id
+  %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in.gep, i32 1 ; shift amounts come from the element after the value
+  %a = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+  %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
+  %result = shl <4 x i16> %a, %b
+  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}shl_v_imm_v4i16:
+; GCN: {{buffer|flat}}_load_dwordx2
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GFX9: v_pk_lshlrev_b16 v{{[0-9]+}}, 8, v{{[0-9]+}}
+; GCN: {{buffer|flat}}_store_dwordx2
+define amdgpu_kernel void @shl_v_imm_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) #0 { ; four-element loaded value shifted left by constant 8
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %tid.ext = sext i32 %tid to i64
+  %in.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %in, i64 %tid.ext ; input slot indexed by work-item id
+  %out.gep = getelementptr inbounds <4 x i16>, <4 x i16> addrspace(1)* %out, i64 %tid.ext ; output slot indexed by work-item id
+  %vgpr = load <4 x i16>, <4 x i16> addrspace(1)* %in.gep
+  %result = shl <4 x i16> %vgpr, <i16 8, i16 8, i16 8, i16 8> ; the constant is the shift amount for every element
+  store <4 x i16> %result, <4 x i16> addrspace(1)* %out.gep
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/shl_add_constant.ll b/test/CodeGen/AMDGPU/shl_add_constant.ll
index 9b5f9fed4d7939bb4319d3893e290a7bb22e2f4e..9da4bc0280163917217cc380b8e0ff762c1b94b8 100644
--- a/test/CodeGen/AMDGPU/shl_add_constant.ll
+++ b/test/CodeGen/AMDGPU/shl_add_constant.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 ; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 36, [[REG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
   %val = load i32, i32 addrspace(1)* %ptr, align 4
@@ -25,7 +25,7 @@ define void @shl_2_add_9_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
 ; SI-DAG: buffer_store_dword [[ADDREG]]
 ; SI-DAG: buffer_store_dword [[SHLREG]]
 ; SI: s_endpgm
-define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %in) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
   %val = load i32, i32 addrspace(1)* %ptr, align 4
@@ -43,7 +43,7 @@ define void @shl_2_add_9_i32_2_add_uses(i32 addrspace(1)* %out0, i32 addrspace(1
 ; SI: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, 0xf9c, [[REG]]
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %ptr = getelementptr i32, i32 addrspace(1)* %in, i32 %tid.x
   %val = load i32, i32 addrspace(1)* %ptr, align 4
@@ -61,7 +61,7 @@ define void @shl_2_add_999_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0
 ; SI: s_addk_i32 [[RESULT]], 0x3d8
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]]
 ; SI: buffer_store_dword [[VRESULT]]
-define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
   %add.0 = add i32 %x, 123
   %shl = shl i32 %add.0, 3
   %add.1 = add i32 %shl, %y
@@ -78,7 +78,7 @@ define void @test_add_shl_add_constant(i32 addrspace(1)* %out, i32 %x, i32 %y) #
 ; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[TMP]]
 ; SI: buffer_store_dword [[VRESULT]]
 
-define void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @test_add_shl_add_constant_inv(i32 addrspace(1)* %out, i32 %x, i32 %y) #0 {
   %add.0 = add i32 %x, 123
   %shl = shl i32 %add.0, 3
   %add.1 = add i32 %y, %shl
diff --git a/test/CodeGen/AMDGPU/shl_add_ptr.ll b/test/CodeGen/AMDGPU/shl_add_ptr.ll
index 6e45759fa058f3ceef96092330ac280bc6f2f76f..9147eb58c6ad259fb8f5713a3e9ee5df0aa79422 100644
--- a/test/CodeGen/AMDGPU/shl_add_ptr.ll
+++ b/test/CodeGen/AMDGPU/shl_add_ptr.ll
@@ -19,7 +19,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_read_b32 {{v[0-9]+}}, [[PTR]] offset:8
 ; SI: s_endpgm
-define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
@@ -39,7 +39,7 @@ define void @load_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %ad
 ; SI-DAG: buffer_store_dword [[RESULT]]
 ; SI-DAG: buffer_store_dword [[ADDUSE]]
 ; SI: s_endpgm
-define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
@@ -55,7 +55,7 @@ define void @load_shl_base_lds_1(float addrspace(1)* %out, i32 addrspace(1)* %ad
 ; SI-LABEL: {{^}}load_shl_base_lds_max_offset
 ; SI: ds_read_u8 v{{[0-9]+}}, v{{[0-9]+}} offset:65535
 ; SI: s_endpgm
-define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)* %lds, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 65535
   %arrayidx0 = getelementptr inbounds [65536 x i8], [65536 x i8] addrspace(3)* @maxlds, i32 0, i32 %idx.0
@@ -73,7 +73,7 @@ define void @load_shl_base_lds_max_offset(i8 addrspace(1)* %out, i8 addrspace(3)
 ; SI: s_mov_b32 m0, -1
 ; SI-NEXT: ds_read2st64_b32 {{v\[[0-9]+:[0-9]+\]}}, [[PTR]] offset0:1 offset1:9
 ; SI: s_endpgm
-define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 64
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
@@ -89,7 +89,7 @@ define void @load_shl_base_lds_2(float addrspace(1)* %out) #0 {
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_write_b32 [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds0, i32 0, i32 %idx.0
@@ -104,7 +104,7 @@ define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %a
 
 @lds2 = addrspace(3) global [512 x i32] undef, align 4
 
-; define void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+; define amdgpu_kernel void @atomic_load_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
 ;   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 ;   %idx.0 = add nsw i32 %tid.x, 2
 ;   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -119,7 +119,7 @@ define void @store_shl_base_lds_0(float addrspace(1)* %out, i32 addrspace(1)* %a
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_cmpst_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}}, {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 {
+define amdgpu_kernel void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use, i32 %swap) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -134,7 +134,7 @@ define void @atomic_cmpxchg_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_wrxchg_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -148,7 +148,7 @@ define void @atomic_swap_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_add_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -162,7 +162,7 @@ define void @atomic_add_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_sub_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -176,7 +176,7 @@ define void @atomic_sub_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_and_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -190,7 +190,7 @@ define void @atomic_and_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_or_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -204,7 +204,7 @@ define void @atomic_or_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_xor_rtn_b32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -214,7 +214,7 @@ define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
   ret void
 }
 
-; define void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+; define amdgpu_kernel void @atomic_nand_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
 ;   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
 ;   %idx.0 = add nsw i32 %tid.x, 2
 ;   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -228,7 +228,7 @@ define void @atomic_xor_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_min_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -242,7 +242,7 @@ define void @atomic_min_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_max_rtn_i32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -256,7 +256,7 @@ define void @atomic_max_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)*
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_min_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
@@ -270,7 +270,7 @@ define void @atomic_umin_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)
 ; SI: v_lshlrev_b32_e32 [[PTR:v[0-9]+]], 2, {{v[0-9]+}}
 ; SI: ds_max_rtn_u32 {{v[0-9]+}}, [[PTR]], {{v[0-9]+}} offset:8
 ; SI: s_endpgm
-define void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
+define amdgpu_kernel void @atomic_umax_shl_base_lds_0(i32 addrspace(1)* %out, i32 addrspace(1)* %add_use) #0 {
   %tid.x = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %idx.0 = add nsw i32 %tid.x, 2
   %arrayidx0 = getelementptr inbounds [512 x i32], [512 x i32] addrspace(3)* @lds2, i32 0, i32 %idx.0
diff --git a/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll b/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
index 08d44cc1e1d6dad8e5c2395b4322df2fa4510389..14ca635c6dadd3c31f614fbd616ec9098477eb38 100644
--- a/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
+++ b/test/CodeGen/AMDGPU/shrink-add-sub-constant.ll
@@ -7,7 +7,7 @@
 ; GCN-LABEL: {{^}}v_test_i32_x_sub_64:
 ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
 ; GCN: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
-define void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -23,7 +23,7 @@ define void @v_test_i32_x_sub_64(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 ; GCN: {{buffer|flat}}_load_dword [[Y:v[0-9]+]]
 ; GCN-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
 ; GCN-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[Y]]
-define void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -40,7 +40,7 @@ define void @v_test_i32_x_sub_64_multi_use(i32 addrspace(1)* %out, i32 addrspace
 ; GCN-LABEL: {{^}}v_test_i32_64_sub_x:
 ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
 ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
-define void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -54,7 +54,7 @@ define void @v_test_i32_64_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 ; GCN-LABEL: {{^}}v_test_i32_x_sub_65:
 ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 0xffffffbf, [[X]]
-define void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -68,7 +68,7 @@ define void @v_test_i32_x_sub_65(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 ; GCN-LABEL: {{^}}v_test_i32_65_sub_x:
 ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
 ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0x41, [[X]]
-define void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -82,7 +82,7 @@ define void @v_test_i32_65_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 ; GCN-LABEL: {{^}}v_test_i32_x_sub_neg16:
 ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 16, [[X]]
-define void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -96,7 +96,7 @@ define void @v_test_i32_x_sub_neg16(i32 addrspace(1)* %out, i32 addrspace(1)* %i
 ; GCN-LABEL: {{^}}v_test_i32_neg16_sub_x:
 ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
 ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, -16, [[X]]
-define void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -110,7 +110,7 @@ define void @v_test_i32_neg16_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %i
 ; GCN-LABEL: {{^}}v_test_i32_x_sub_neg17:
 ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
 ; GCN: v_add_i32_e32 v{{[0-9]+}}, vcc, 17, [[X]]
-define void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -124,7 +124,7 @@ define void @v_test_i32_x_sub_neg17(i32 addrspace(1)* %out, i32 addrspace(1)* %i
 ; GCN-LABEL: {{^}}v_test_i32_neg17_sub_x:
 ; GCN: {{buffer|flat}}_load_dword [[X:v[0-9]+]]
 ; GCN: v_sub_i32_e32 v{{[0-9]+}}, vcc, 0xffffffef, [[X]]
-define void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -138,7 +138,7 @@ define void @v_test_i32_neg17_sub_x(i32 addrspace(1)* %out, i32 addrspace(1)* %i
 ; GCN-LABEL: {{^}}s_test_i32_x_sub_64:
 ; GCN: s_load_dword [[X:s[0-9]+]]
 ; GCN: s_sub_i32 s{{[0-9]+}}, [[X]], 64
-define void @s_test_i32_x_sub_64(i32 %x) #0 {
+define amdgpu_kernel void @s_test_i32_x_sub_64(i32 %x) #0 {
   %result = sub i32 %x, 64
   call void asm sideeffect "; use $0", "s"(i32 %result)
   ret void
@@ -147,7 +147,7 @@ define void @s_test_i32_x_sub_64(i32 %x) #0 {
 ; GCN-LABEL: {{^}}v_test_i16_x_sub_64:
 ; VI: {{buffer|flat}}_load_ushort [[X:v[0-9]+]]
 ; VI: v_subrev_u16_e32 v{{[0-9]+}}, 64, [[X]]
-define void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
@@ -166,7 +166,7 @@ define void @v_test_i16_x_sub_64(i16 addrspace(1)* %out, i16 addrspace(1)* %in)
 
 ; SI-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[X]]
 ; SI-DAG: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 64, [[Y]]
-define void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_test_i16_x_sub_64_multi_use(i16 addrspace(1)* %out, i16 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %gep = getelementptr inbounds i16, i16 addrspace(1)* %in, i64 %tid.ext
diff --git a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
index 1988a14b5845320ef04f3943864cfaa9a8df8bfa..6248d8a46daf60fab3cc714988974c43d47073ca 100644
--- a/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
+++ b/test/CodeGen/AMDGPU/shrink-vop3-carry-out.mir
@@ -7,7 +7,7 @@
 # resume crashes
 
 --- |
-  define void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  define amdgpu_kernel void @shrink_add_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %tid.ext = sext i32 %tid to i64
     %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -20,7 +20,7 @@
     ret void
   }
 
-  define void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  define amdgpu_kernel void @shrink_sub_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %tid.ext = sext i32 %tid to i64
     %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -33,7 +33,7 @@
     ret void
   }
 
-  define void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  define amdgpu_kernel void @shrink_subrev_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %tid.ext = sext i32 %tid to i64
     %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -46,7 +46,7 @@
     ret void
   }
 
-  define void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  define amdgpu_kernel void @check_addc_src2_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %tid.ext = sext i32 %tid to i64
     %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -59,7 +59,7 @@
     ret void
   }
 
-  define void @shrink_addc_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  define amdgpu_kernel void @shrink_addc_vop3(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %tid.ext = sext i32 %tid to i64
     %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
@@ -72,7 +72,7 @@
     ret void
   }
 
-  define void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+  define amdgpu_kernel void @shrink_addc_undef_vcc(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
     %tid = call i32 @llvm.amdgcn.workitem.id.x()
     %tid.ext = sext i32 %tid to i64
     %a.ptr = getelementptr inbounds i32, i32 addrspace(1)* %in, i64 %tid.ext
diff --git a/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll b/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
index ef616eb63801da6c57498d57e152d70a9d8bb598..5c6663dbbdab94ac10606bc3abf99f4ae35bd45c 100644
--- a/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
+++ b/test/CodeGen/AMDGPU/si-annotate-cf-noloop.ll
@@ -6,10 +6,10 @@
 ; OPT-NOT: call i1 @llvm.amdgcn.loop
 
 ; GCN-LABEL: {{^}}annotate_unreachable_noloop:
-; GCN: s_cbranch_vccnz
+; GCN: s_cbranch_scc1
 ; GCN-NOT: s_endpgm
 ; GCN: .Lfunc_end0
-define void @annotate_unreachable_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
+define amdgpu_kernel void @annotate_unreachable_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
@@ -37,12 +37,49 @@ bb5:                                              ; preds = %bb3, %bb1
 ; OPT-NOT: call i1 @llvm.amdgcn.loop
 
 ; GCN-LABEL: {{^}}annotate_ret_noloop:
+; GCN: load_dwordx4
+; GCN: v_cmp_nlt_f32
+; GCN: s_and_saveexec_b64
+; GCN: ; mask branch [[UNIFIED_RET:BB[0-9]+_[0-9]+]]
+; GCN-NEXT: [[UNIFIED_RET]]:
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: s_endpgm
+; GCN: .Lfunc_end
+define amdgpu_kernel void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+
+bb1:                                              ; preds = %bb
+  %tmp2 = sext i32 %tmp to i64
+  %tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2
+  %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16
+  %tmp5 = extractelement <4 x float> %tmp4, i32 1
+  store volatile <4 x float> %tmp4, <4 x float> addrspace(1)* undef
+  %cmp = fcmp ogt float %tmp5, 1.0
+  br i1 %cmp, label %bb5, label %bb3
+
+bb3:                                              ; preds = %bb1
+  %tmp6 = extractelement <4 x float> %tmp4, i32 2
+  %tmp7 = fcmp olt float %tmp6, 0.000000e+00
+  br i1 %tmp7, label %bb4, label %bb5 ; crash goes away if these are swapped
+
+bb4:                                              ; preds = %bb3
+  ret void
+
+bb5:                                              ; preds = %bb3, %bb1
+  ret void
+}
+
+; OPT-LABEL: @uniform_annotate_ret_noloop(
+; OPT-NOT: call i1 @llvm.amdgcn.loop
+
+; GCN-LABEL: {{^}}uniform_annotate_ret_noloop:
 ; GCN: s_cbranch_scc1
 ; GCN: s_endpgm
-; GCN: .Lfunc_end1
-define void @annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
+; GCN: .Lfunc_end
+define amdgpu_kernel void @uniform_annotate_ret_noloop(<4 x float> addrspace(1)* noalias nocapture readonly %arg, i32 %tmp) #0 {
 bb:
-  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
   br label %bb1
 
 bb1:                                              ; preds = %bb
diff --git a/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll b/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e50c595bc6c35f70a6cceb5e2858bfced07e8d3e
--- /dev/null
+++ b/test/CodeGen/AMDGPU/si-annotate-cf-unreachable.ll
@@ -0,0 +1,40 @@
+; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=OPT %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+
+; OPT-LABEL: @annotate_unreachable(
+; OPT: call { i1, i64 } @llvm.amdgcn.if(
+; OPT-NOT: call void @llvm.amdgcn.end.cf(
+
+
+; GCN-LABEL: {{^}}annotate_unreachable:
+; GCN: s_and_saveexec_b64
+; GCN-NOT: s_endpgm
+; GCN: .Lfunc_end0
+define amdgpu_kernel void @annotate_unreachable(<4 x float> addrspace(1)* noalias nocapture readonly %arg) #0 {
+bb:
+  %tmp = tail call i32 @llvm.amdgcn.workitem.id.x()
+  br label %bb1
+
+bb1:                                              ; preds = %bb
+  %tmp2 = sext i32 %tmp to i64
+  %tmp3 = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %arg, i64 %tmp2
+  %tmp4 = load <4 x float>, <4 x float> addrspace(1)* %tmp3, align 16
+  br i1 undef, label %bb3, label %bb5  ; label order reversed
+
+bb3:                                              ; preds = %bb1
+  %tmp6 = extractelement <4 x float> %tmp4, i32 2
+  %tmp7 = fcmp olt float %tmp6, 0.000000e+00
+  br i1 %tmp7, label %bb4, label %bb5
+
+bb4:                                              ; preds = %bb3
+  unreachable
+
+bb5:                                              ; preds = %bb3, %bb1
+  unreachable
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/si-annotate-cf.ll b/test/CodeGen/AMDGPU/si-annotate-cf.ll
index d658b229fd3780f00877b035d39f35ad6396f339..a4b6d1fd069ded9be744cd7d2ab760e2b64336fe 100644
--- a/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -10,7 +10,7 @@
 ; SI: s_andn2_b64
 ; s_cbranch_execnz [[LOOP_LABEL]]
 ; SI: s_endpgm
-define void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @break_inserted_outside_of_loop(i32 addrspace(1)* %out, i32 %a) {
 main_body:
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %0 = and i32 %a, %tid
@@ -40,7 +40,7 @@ ENDIF:
 ; SI: s_cbranch_execnz [[LOOP_LABEL]]
 ; SI: s_endpgm
 
-define void @phi_cond_outside_loop(i32 %b) {
+define amdgpu_kernel void @phi_cond_outside_loop(i32 %b) {
 entry:
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
   %0 = icmp eq i32 %tid , 0
@@ -68,7 +68,7 @@ exit:
 ; CHECK-LABEL: {{^}}switch_unreachable:
 ; CHECK-NOT: s_endpgm
 ; CHECK: .Lfunc_end2
-define void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
+define amdgpu_kernel void @switch_unreachable(i32 addrspace(1)* %g, i8 addrspace(3)* %l, i32 %x) nounwind {
 centry:
   switch i32 %x, label %sw.default [
     i32 0, label %sw.bb
@@ -100,7 +100,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone
 
 ; SI: [[ENDPGM]]:
 ; SI: s_endpgm
-define void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind {
+define amdgpu_kernel void @loop_land_info_assert(i32 %c0, i32 %c1, i32 %c2, i32 %c3, i32 %x, i32 %y, i1 %arg) nounwind {
 entry:
   %cmp = icmp sgt i32 %c0, 0
   br label %while.cond.outer
diff --git a/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll b/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
index 025a3d8fca2e8f81555be424b5af133cc94116d7..b0473f3b5bdafad03c2d00fdb772f24926f777f3 100644
--- a/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
+++ b/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll
@@ -6,7 +6,7 @@
 ; CHECK s_or_b64 exec, exec
 ; CHECK s_andn2_b64 exec, exec
 ; CHECK s_cbranch_execnz
-define void @test(i32 %arg, i32 %arg1) {
+define amdgpu_kernel void @test(i32 %arg, i32 %arg1) {
 bb:
   %tmp = icmp ne i32 %arg, 0
   %tmp7 = icmp ne i32 %arg1, 0
diff --git a/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir b/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir
index 0c08deb13a8e7c2f995f09b4e719ede60819930c..20052e865a54ea21a34d0dcca382ebe3a17e811c 100644
--- a/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir
+++ b/test/CodeGen/AMDGPU/si-fix-sgpr-copies.mir
@@ -1,7 +1,7 @@
 # RUN: llc -march=amdgcn -run-pass si-fix-sgpr-copies %s -o - | FileCheck %s -check-prefixes=GCN
 
 --- |
-  define void @phi_visit_order() { ret void }
+  define amdgpu_kernel void @phi_visit_order() { ret void }
 
 name: phi_visit_order
 tracksRegLiveness: true
diff --git a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
index 0d1de6662f25f3b7b805780c652707739089a8db..580268deb85d136e2b8f609d499b85a8fdb85b38 100644
--- a/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
+++ b/test/CodeGen/AMDGPU/si-instr-info-correct-implicit-operands.ll
@@ -5,7 +5,7 @@
 
 ; CHECK: %{{[0-9]+}} = V_ADD_I32_e32 %{{[0-9]+}}, %{{[0-9]+}}, implicit-def %vcc, implicit %exec
 
-define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 entry:
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load volatile i32, i32 addrspace(1)* %in
diff --git a/test/CodeGen/AMDGPU/si-literal-folding.ll b/test/CodeGen/AMDGPU/si-literal-folding.ll
deleted file mode 100644
index b3f000c8ccd22cee5a42849f36e96b1a7368fce0..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/si-literal-folding.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-
-; GCN-LABEL: {{^}}main:
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0x3f4353f8, v{{[0-9]+}}
-; GCN: v_mul_f32_e32 v{{[0-9]+}}, 0xbf4353f8, v{{[0-9]+}}
-define amdgpu_vs void @main(float) {
-main_body:
-  %1 = fmul float %0, 0x3FE86A7F00000000
-  %2 = fmul float %0, 0xBFE86A7F00000000
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %1, float %1, float %2, float %2)
-  ret void
-}
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/si-lod-bias.ll b/test/CodeGen/AMDGPU/si-lod-bias.ll
index 8e846d7a238e1ed56db765da07999ea7c00ab1a0..3a7359ea4ffaf196971ebe8d1bda57bf49d3dfc4 100644
--- a/test/CodeGen/AMDGPU/si-lod-bias.ll
+++ b/test/CodeGen/AMDGPU/si-lod-bias.ll
@@ -1,12 +1,12 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; This shader has the potential to generated illegal VGPR to SGPR copies if
 ; the wrong register class is used for the REG_SEQUENCE instructions.
 
-; CHECK: {{^}}main:
-; CHECK: image_sample_b v{{\[[0-9]:[0-9]\]}}, v{{\[[0-9]:[0-9]\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf
-define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) {
+; GCN-LABEL: {{^}}main:
+; GCN: image_sample_b v{{\[[0-9]:[0-9]\]}}, v{{\[[0-9]:[0-9]\]}}, s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0xf
+define amdgpu_ps void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <8 x i32> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
   %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
   %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
@@ -15,38 +15,45 @@ main_body:
   %tmp23 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp22, !tbaa !0
   %tmp24 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg1, i32 0
   %tmp25 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp24, !tbaa !0
-  %tmp26 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg3, <2 x i32> %arg5)
-  %tmp27 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg3, <2 x i32> %arg5)
+  %i.i = extractelement <2 x i32> %arg5, i32 0
+  %j.i = extractelement <2 x i32> %arg5, i32 1
+  %i.f.i = bitcast i32 %i.i to float
+  %j.f.i = bitcast i32 %j.i to float
+  %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg3) #0
+  %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg3) #0
+  %i.i1 = extractelement <2 x i32> %arg5, i32 0
+  %j.i2 = extractelement <2 x i32> %arg5, i32 1
+  %i.f.i3 = bitcast i32 %i.i1 to float
+  %j.f.i4 = bitcast i32 %j.i2 to float
+  %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 1, i32 0, i32 %arg3) #0
+  %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 1, i32 0, i32 %arg3) #0
   %tmp28 = bitcast float %tmp21 to i32
-  %tmp29 = bitcast float %tmp26 to i32
-  %tmp30 = bitcast float %tmp27 to i32
+  %tmp29 = bitcast float %p2.i to i32
+  %tmp30 = bitcast float %p2.i6 to i32
   %tmp31 = insertelement <4 x i32> undef, i32 %tmp28, i32 0
   %tmp32 = insertelement <4 x i32> %tmp31, i32 %tmp29, i32 1
   %tmp33 = insertelement <4 x i32> %tmp32, i32 %tmp30, i32 2
   %tmp34 = insertelement <4 x i32> %tmp33, i32 undef, i32 3
   %tmp25.bc = bitcast <16 x i8> %tmp25 to <4 x i32>
-  %tmp35 = call <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32> %tmp34, <8 x i32> %tmp23, <4 x i32> %tmp25.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp34.bc = bitcast <4 x i32> %tmp34 to <4 x float>
+  %tmp35 = call <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float> %tmp34.bc, <8 x i32> %tmp23, <4 x i32> %tmp25.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp36 = extractelement <4 x float> %tmp35, i32 0
   %tmp37 = extractelement <4 x float> %tmp35, i32 1
   %tmp38 = extractelement <4 x float> %tmp35, i32 2
   %tmp39 = extractelement <4 x float> %tmp35, i32 3
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %tmp36, float %tmp37, float %tmp38, float %tmp39)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp36, float %tmp37, float %tmp38, float %tmp39, i1 true, i1 true) #0
   ret void
 }
 
-; Function Attrs: nounwind readnone
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.image.sample.b.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
 declare float @llvm.SI.load.const(<16 x i8>, i32) #1
 
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
-
-declare <4 x float> @llvm.SI.image.sample.b.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-
+attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
 
 !0 = !{!1, !1, i64 0, i32 1}
 !1 = !{!"const", !2}
diff --git a/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll b/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
index 8d66df258e43e20ce8cfe93e4a77c28b83f6681c..cb010cf153001668fc07f29c0ab551d16b1ee04d 100644
--- a/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
+++ b/test/CodeGen/AMDGPU/si-lower-control-flow-unreachable-block.ll
@@ -4,17 +4,18 @@
 ; GCN: v_cmp_eq_u32
 ; GCN: s_and_saveexec_b64
 ; GCN: s_xor_b64
-; GCN: ; mask branch [[RET:BB[0-9]+]]
-; GCN: s_branch [[UNREACHABLE:BB[0-9]+_[0-9]+]]
+; GCN: ; mask branch [[RET:BB[0-9]+_[0-9]+]]
 
-; GCN: [[RET]]
-; GCN: s_or_b64 exec, exec
-; GCN: s_endpgm
-
-; GCN: [[UNREACHABLE]]:
+; GCN-NEXT: BB{{[0-9]+_[0-9]+}}: ; %unreachable
 ; GCN: ds_write_b32
+; GCN: ; divergent unreachable
 ; GCN: s_waitcnt
-define void @lower_control_flow_unreachable_terminator() #0 {
+
+; GCN-NEXT: [[RET]]: ; %UnifiedReturnBlock
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN: s_endpgm
+
+define amdgpu_kernel void @lower_control_flow_unreachable_terminator() #0 {
 bb:
   %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y()
   %tmp63 = icmp eq i32 %tmp15, 32
@@ -29,19 +30,20 @@ ret:
 }
 
 ; GCN-LABEL: {{^}}lower_control_flow_unreachable_terminator_swap_block_order:
-; GCN: v_cmp_eq_u32
+; GCN: v_cmp_ne_u32
 ; GCN: s_and_saveexec_b64
 ; GCN: s_xor_b64
-; GCN: ; mask branch [[UNREACHABLE:BB[0-9]+_[0-9]+]]
+; GCN: ; mask branch [[RETURN:BB[0-9]+_[0-9]+]]
 
-; GCN-NEXT: ; %ret
-; GCN-NEXT: s_endpgm
-
-; GCN-NEXT: [[UNREACHABLE]]:
-; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %unreachable
 ; GCN: ds_write_b32
+; GCN: ; divergent unreachable
 ; GCN: s_waitcnt
-define void @lower_control_flow_unreachable_terminator_swap_block_order() #0 {
+
+; GCN: [[RETURN]]:
+; GCN-NEXT: s_or_b64 exec, exec
+; GCN-NEXT: s_endpgm
+define amdgpu_kernel void @lower_control_flow_unreachable_terminator_swap_block_order() #0 {
 bb:
   %tmp15 = tail call i32 @llvm.amdgcn.workitem.id.y()
   %tmp63 = icmp eq i32 %tmp15, 32
@@ -55,7 +57,29 @@ unreachable:
   unreachable
 }
 
-; Function Attrs: nounwind readnone
+; GCN-LABEL: {{^}}uniform_lower_control_flow_unreachable_terminator:
+; GCN: s_cmp_lg_u32
+; GCN: s_cbranch_scc0 [[UNREACHABLE:BB[0-9]+_[0-9]+]]
+
+; GCN-NEXT: BB#{{[0-9]+}}: ; %ret
+; GCN-NEXT: s_endpgm
+
+; GCN: [[UNREACHABLE]]:
+; GCN: ds_write_b32
+; GCN: s_waitcnt
+define amdgpu_kernel void @uniform_lower_control_flow_unreachable_terminator(i32 %arg0) #0 {
+bb:
+  %tmp63 = icmp eq i32 %arg0, 32
+  br i1 %tmp63, label %unreachable, label %ret
+
+unreachable:
+  store volatile i32 0, i32 addrspace(3)* undef, align 4
+  unreachable
+
+ret:
+  ret void
+}
+
 declare i32 @llvm.amdgcn.workitem.id.y() #1
 
 attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/si-scheduler.ll b/test/CodeGen/AMDGPU/si-scheduler.ll
index 9374ef3cd9070d100cabf1b59fd03c3aca33df27..462528c4ff1a8ad556c37382f3d40e7e58986c9f 100644
--- a/test/CodeGen/AMDGPU/si-scheduler.ll
+++ b/test/CodeGen/AMDGPU/si-scheduler.ll
@@ -3,7 +3,7 @@
 ; The only way the subtarget knows that the si machine scheduler is being used
 ; is to specify -mattr=si-scheduler.  If we just pass --misched=si, the backend
 ; won't know what scheduler we are using.
-; RUN: llc -march=amdgcn -mcpu=SI --misched=si -mattr=si-scheduler < %s | FileCheck %s
+; RUN: llc -march=amdgcn --misched=si -mattr=si-scheduler < %s | FileCheck %s
 
 ; The test checks the "si" machine scheduler pass works correctly.
 
@@ -22,39 +22,46 @@ main_body:
   %tmp22 = load <32 x i8>, <32 x i8> addrspace(2)* %tmp, align 32, !tbaa !0
   %tmp23 = bitcast [17 x <4 x i32>] addrspace(2)* %arg2 to <16 x i8> addrspace(2)*
   %tmp24 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp23, align 16, !tbaa !0
-  %tmp25 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg5, <2 x i32> %arg11)
-  %tmp26 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg5, <2 x i32> %arg11)
-  %tmp27 = bitcast float %tmp25 to i32
-  %tmp28 = bitcast float %tmp26 to i32
+  %i.i = extractelement <2 x i32> %arg11, i32 0
+  %j.i = extractelement <2 x i32> %arg11, i32 1
+  %i.f.i = bitcast i32 %i.i to float
+  %j.f.i = bitcast i32 %j.i to float
+  %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg5) #1
+  %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg5) #1
+  %i.i1 = extractelement <2 x i32> %arg11, i32 0
+  %j.i2 = extractelement <2 x i32> %arg11, i32 1
+  %i.f.i3 = bitcast i32 %i.i1 to float
+  %j.f.i4 = bitcast i32 %j.i2 to float
+  %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 1, i32 0, i32 %arg5) #1
+  %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 1, i32 0, i32 %arg5) #1
+  %tmp27 = bitcast float %p2.i to i32
+  %tmp28 = bitcast float %p2.i6 to i32
   %tmp29 = insertelement <2 x i32> undef, i32 %tmp27, i32 0
   %tmp30 = insertelement <2 x i32> %tmp29, i32 %tmp28, i32 1
   %tmp22.bc = bitcast <32 x i8> %tmp22 to <8 x i32>
   %tmp24.bc = bitcast <16 x i8> %tmp24 to <4 x i32>
-  %tmp31 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp30, <8 x i32> %tmp22.bc, <4 x i32> %tmp24.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp30.bc = bitcast <2 x i32> %tmp30 to <2 x float>
+  %tmp31 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp30.bc, <8 x i32> %tmp22.bc, <4 x i32> %tmp24.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+
   %tmp32 = extractelement <4 x float> %tmp31, i32 0
   %tmp33 = extractelement <4 x float> %tmp31, i32 1
   %tmp34 = extractelement <4 x float> %tmp31, i32 2
   %tmp35 = extractelement <4 x float> %tmp31, i32 3
-  %tmp36 = call i32 @llvm.SI.packf16(float %tmp32, float %tmp33)
-  %tmp37 = bitcast i32 %tmp36 to float
-  %tmp38 = call i32 @llvm.SI.packf16(float %tmp34, float %tmp35)
-  %tmp39 = bitcast i32 %tmp38 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp37, float %tmp39, float %tmp37, float %tmp39)
+  %tmp36 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp32, float %tmp33)
+  %tmp38 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp34, float %tmp35)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp36, <2 x half> %tmp38, i1 true, i1 false) #0
   ret void
 }
 
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
-
-declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
 
+attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
 
 !0 = !{!1, !1, i64 0, i32 1}
 !1 = !{!"const", !2}
diff --git a/test/CodeGen/AMDGPU/si-sgpr-spill.ll b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
index e61b4051124a607264bcad3be5b5164f12a4389f..8731e74d63a057d4c6b9b0473c3d70e0fdffcbff 100644
--- a/test/CodeGen/AMDGPU/si-sgpr-spill.ll
+++ b/test/CodeGen/AMDGPU/si-sgpr-spill.ll
@@ -1,27 +1,29 @@
-; RUN: llc -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=TOVGPR %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=+vgpr-spilling,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; These tests check that the compiler won't crash when it needs to spill
 ; SGPRs.
 
-
 @ddxy_lds = external addrspace(3) global [64 x i32]
 
-; CHECK-LABEL: {{^}}main:
-; CHECK: s_wqm
+; GCN-LABEL: {{^}}main:
+; GCN: s_wqm
 
 ; Make sure not emitting unused scratch resource descriptor setup
-; CHECK-NOT: s_mov_b32
-; CHECK-NOT: s_mov_b32
-; CHECK-NOT: s_mov_b32
-; CHECK-NOT: s_mov_b32
+; GCN-NOT: s_mov_b32
+; GCN-NOT: s_mov_b32
+; GCN-NOT: s_mov_b32
+; GCN-NOT: s_mov_b32
 
-; CHECK: s_mov_b32 m0
+; GCN: s_mov_b32 m0
 
+; Make sure scratch space isn't being used for SGPR->VGPR spills
 
 ; Writing to M0 from an SMRD instruction will hang the GPU.
-; CHECK-NOT: s_buffer_load_dword m0
-; CHECK: s_endpgm
+; GCN-NOT: s_buffer_load_dword m0
+; GCN: s_endpgm
+
+; TOVGPR: ScratchSize: 0{{$}}
 define amdgpu_ps void @main([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
 main_body:
   %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0
@@ -97,29 +99,114 @@ main_body:
   %tmp89 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp88, !tbaa !0
   %tmp90 = getelementptr [32 x <16 x i8>], [32 x <16 x i8>] addrspace(2)* %arg1, i64 0, i32 7
   %tmp91 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp90, !tbaa !0
-  %tmp92 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg4, <2 x i32> %arg6)
-  %tmp93 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg4, <2 x i32> %arg6)
-  %tmp94 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %arg4, <2 x i32> %arg6)
-  %tmp95 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %arg4, <2 x i32> %arg6)
-  %tmp96 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %arg4, <2 x i32> %arg6)
-  %tmp97 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %arg4, <2 x i32> %arg6)
-  %tmp98 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %arg4, <2 x i32> %arg6)
-  %tmp99 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %arg4, <2 x i32> %arg6)
-  %tmp100 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %arg4, <2 x i32> %arg6)
-  %tmp101 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %arg4, <2 x i32> %arg6)
-  %tmp102 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %arg4, <2 x i32> %arg6)
-  %tmp103 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %arg4, <2 x i32> %arg6)
-  %tmp104 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %arg4, <2 x i32> %arg6)
-  %tmp105 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %arg4, <2 x i32> %arg6)
-  %tmp106 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %arg4, <2 x i32> %arg6)
-  %tmp107 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %arg4, <2 x i32> %arg6)
-  %tmp108 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %arg4, <2 x i32> %arg6)
+  %i.i = extractelement <2 x i32> %arg6, i32 0
+  %j.i = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i = bitcast i32 %i.i to float
+  %j.f.i = bitcast i32 %j.i to float
+  %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg4) #0
+  %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg4) #0
+  %i.i91 = extractelement <2 x i32> %arg6, i32 0
+  %j.i92 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i93 = bitcast i32 %i.i91 to float
+  %j.f.i94 = bitcast i32 %j.i92 to float
+  %p1.i95 = call float @llvm.amdgcn.interp.p1(float %i.f.i93, i32 1, i32 0, i32 %arg4) #0
+  %p2.i96 = call float @llvm.amdgcn.interp.p2(float %p1.i95, float %j.f.i94, i32 1, i32 0, i32 %arg4) #0
+  %i.i85 = extractelement <2 x i32> %arg6, i32 0
+  %j.i86 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i87 = bitcast i32 %i.i85 to float
+  %j.f.i88 = bitcast i32 %j.i86 to float
+  %p1.i89 = call float @llvm.amdgcn.interp.p1(float %i.f.i87, i32 0, i32 1, i32 %arg4) #0
+  %p2.i90 = call float @llvm.amdgcn.interp.p2(float %p1.i89, float %j.f.i88, i32 0, i32 1, i32 %arg4) #0
+  %i.i79 = extractelement <2 x i32> %arg6, i32 0
+  %j.i80 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i81 = bitcast i32 %i.i79 to float
+  %j.f.i82 = bitcast i32 %j.i80 to float
+  %p1.i83 = call float @llvm.amdgcn.interp.p1(float %i.f.i81, i32 1, i32 1, i32 %arg4) #0
+  %p2.i84 = call float @llvm.amdgcn.interp.p2(float %p1.i83, float %j.f.i82, i32 1, i32 1, i32 %arg4) #0
+  %i.i73 = extractelement <2 x i32> %arg6, i32 0
+  %j.i74 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i75 = bitcast i32 %i.i73 to float
+  %j.f.i76 = bitcast i32 %j.i74 to float
+  %p1.i77 = call float @llvm.amdgcn.interp.p1(float %i.f.i75, i32 2, i32 1, i32 %arg4) #0
+  %p2.i78 = call float @llvm.amdgcn.interp.p2(float %p1.i77, float %j.f.i76, i32 2, i32 1, i32 %arg4) #0
+  %i.i67 = extractelement <2 x i32> %arg6, i32 0
+  %j.i68 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i69 = bitcast i32 %i.i67 to float
+  %j.f.i70 = bitcast i32 %j.i68 to float
+  %p1.i71 = call float @llvm.amdgcn.interp.p1(float %i.f.i69, i32 0, i32 2, i32 %arg4) #0
+  %p2.i72 = call float @llvm.amdgcn.interp.p2(float %p1.i71, float %j.f.i70, i32 0, i32 2, i32 %arg4) #0
+  %i.i61 = extractelement <2 x i32> %arg6, i32 0
+  %j.i62 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i63 = bitcast i32 %i.i61 to float
+  %j.f.i64 = bitcast i32 %j.i62 to float
+  %p1.i65 = call float @llvm.amdgcn.interp.p1(float %i.f.i63, i32 1, i32 2, i32 %arg4) #0
+  %p2.i66 = call float @llvm.amdgcn.interp.p2(float %p1.i65, float %j.f.i64, i32 1, i32 2, i32 %arg4) #0
+  %i.i55 = extractelement <2 x i32> %arg6, i32 0
+  %j.i56 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i57 = bitcast i32 %i.i55 to float
+  %j.f.i58 = bitcast i32 %j.i56 to float
+  %p1.i59 = call float @llvm.amdgcn.interp.p1(float %i.f.i57, i32 2, i32 2, i32 %arg4) #0
+  %p2.i60 = call float @llvm.amdgcn.interp.p2(float %p1.i59, float %j.f.i58, i32 2, i32 2, i32 %arg4) #0
+  %i.i49 = extractelement <2 x i32> %arg6, i32 0
+  %j.i50 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i51 = bitcast i32 %i.i49 to float
+  %j.f.i52 = bitcast i32 %j.i50 to float
+  %p1.i53 = call float @llvm.amdgcn.interp.p1(float %i.f.i51, i32 0, i32 3, i32 %arg4) #0
+  %p2.i54 = call float @llvm.amdgcn.interp.p2(float %p1.i53, float %j.f.i52, i32 0, i32 3, i32 %arg4) #0
+  %i.i43 = extractelement <2 x i32> %arg6, i32 0
+  %j.i44 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i45 = bitcast i32 %i.i43 to float
+  %j.f.i46 = bitcast i32 %j.i44 to float
+  %p1.i47 = call float @llvm.amdgcn.interp.p1(float %i.f.i45, i32 1, i32 3, i32 %arg4) #0
+  %p2.i48 = call float @llvm.amdgcn.interp.p2(float %p1.i47, float %j.f.i46, i32 1, i32 3, i32 %arg4) #0
+  %i.i37 = extractelement <2 x i32> %arg6, i32 0
+  %j.i38 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i39 = bitcast i32 %i.i37 to float
+  %j.f.i40 = bitcast i32 %j.i38 to float
+  %p1.i41 = call float @llvm.amdgcn.interp.p1(float %i.f.i39, i32 2, i32 3, i32 %arg4) #0
+  %p2.i42 = call float @llvm.amdgcn.interp.p2(float %p1.i41, float %j.f.i40, i32 2, i32 3, i32 %arg4) #0
+  %i.i31 = extractelement <2 x i32> %arg6, i32 0
+  %j.i32 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i33 = bitcast i32 %i.i31 to float
+  %j.f.i34 = bitcast i32 %j.i32 to float
+  %p1.i35 = call float @llvm.amdgcn.interp.p1(float %i.f.i33, i32 0, i32 4, i32 %arg4) #0
+  %p2.i36 = call float @llvm.amdgcn.interp.p2(float %p1.i35, float %j.f.i34, i32 0, i32 4, i32 %arg4) #0
+  %i.i25 = extractelement <2 x i32> %arg6, i32 0
+  %j.i26 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i27 = bitcast i32 %i.i25 to float
+  %j.f.i28 = bitcast i32 %j.i26 to float
+  %p1.i29 = call float @llvm.amdgcn.interp.p1(float %i.f.i27, i32 1, i32 4, i32 %arg4) #0
+  %p2.i30 = call float @llvm.amdgcn.interp.p2(float %p1.i29, float %j.f.i28, i32 1, i32 4, i32 %arg4) #0
+  %i.i19 = extractelement <2 x i32> %arg6, i32 0
+  %j.i20 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i21 = bitcast i32 %i.i19 to float
+  %j.f.i22 = bitcast i32 %j.i20 to float
+  %p1.i23 = call float @llvm.amdgcn.interp.p1(float %i.f.i21, i32 2, i32 4, i32 %arg4) #0
+  %p2.i24 = call float @llvm.amdgcn.interp.p2(float %p1.i23, float %j.f.i22, i32 2, i32 4, i32 %arg4) #0
+  %i.i13 = extractelement <2 x i32> %arg6, i32 0
+  %j.i14 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i15 = bitcast i32 %i.i13 to float
+  %j.f.i16 = bitcast i32 %j.i14 to float
+  %p1.i17 = call float @llvm.amdgcn.interp.p1(float %i.f.i15, i32 0, i32 5, i32 %arg4) #0
+  %p2.i18 = call float @llvm.amdgcn.interp.p2(float %p1.i17, float %j.f.i16, i32 0, i32 5, i32 %arg4) #0
+  %i.i7 = extractelement <2 x i32> %arg6, i32 0
+  %j.i8 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i9 = bitcast i32 %i.i7 to float
+  %j.f.i10 = bitcast i32 %j.i8 to float
+  %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 1, i32 5, i32 %arg4) #0
+  %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 1, i32 5, i32 %arg4) #0
+  %i.i1 = extractelement <2 x i32> %arg6, i32 0
+  %j.i2 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i3 = bitcast i32 %i.i1 to float
+  %j.f.i4 = bitcast i32 %j.i2 to float
+  %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 2, i32 5, i32 %arg4) #0
+  %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 2, i32 5, i32 %arg4) #0
   %mbcnt.lo.0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tmp109 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.0)
   %tmp110 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp109
-  %tmp111 = bitcast float %tmp92 to i32
+  %tmp111 = bitcast float %p2.i to i32
   store i32 %tmp111, i32 addrspace(3)* %tmp110
-  %tmp112 = bitcast float %tmp93 to i32
+  %tmp112 = bitcast float %p2.i96 to i32
   store i32 %tmp112, i32 addrspace(3)* %tmp110
   %mbcnt.lo.1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tmp113 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.1)
@@ -128,14 +215,14 @@ main_body:
   %tmp116 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp115
   %tmp117 = add i32 %tmp115, 1
   %tmp118 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp117
-  %tmp119 = bitcast float %tmp92 to i32
+  %tmp119 = bitcast float %p2.i to i32
   store i32 %tmp119, i32 addrspace(3)* %tmp114
   %tmp120 = load i32, i32 addrspace(3)* %tmp116
   %tmp121 = bitcast i32 %tmp120 to float
   %tmp122 = load i32, i32 addrspace(3)* %tmp118
   %tmp123 = bitcast i32 %tmp122 to float
   %tmp124 = fsub float %tmp123, %tmp121
-  %tmp125 = bitcast float %tmp93 to i32
+  %tmp125 = bitcast float %p2.i96 to i32
   store i32 %tmp125, i32 addrspace(3)* %tmp114
   %tmp126 = load i32, i32 addrspace(3)* %tmp116
   %tmp127 = bitcast i32 %tmp126 to float
@@ -148,10 +235,10 @@ main_body:
   %tmp134 = insertelement <4 x float> %tmp133, float %tmp130, i32 3
   %tmp135 = extractelement <4 x float> %tmp134, i32 0
   %tmp136 = extractelement <4 x float> %tmp134, i32 1
-  %tmp137 = fmul float %tmp59, %tmp92
-  %tmp138 = fmul float %tmp59, %tmp93
-  %tmp139 = fmul float %tmp59, %tmp93
-  %tmp140 = fmul float %tmp59, %tmp93
+  %tmp137 = fmul float %tmp59, %p2.i
+  %tmp138 = fmul float %tmp59, %p2.i96
+  %tmp139 = fmul float %tmp59, %p2.i96
+  %tmp140 = fmul float %tmp59, %p2.i96
   %mbcnt.lo.2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tmp141 = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo.2)
   %tmp142 = getelementptr [64 x i32], [64 x i32] addrspace(3)* @ddxy_lds, i32 0, i32 %tmp141
@@ -204,26 +291,26 @@ main_body:
   %tmp180 = insertelement <4 x float> %tmp179, float %tmp176, i32 3
   %tmp181 = extractelement <4 x float> %tmp180, i32 0
   %tmp182 = extractelement <4 x float> %tmp180, i32 1
-  %tmp183 = fdiv float 1.000000e+00, %tmp96
+  %tmp183 = fdiv float 1.000000e+00, %p2.i78
   %tmp184 = fmul float %tmp32, %tmp183
   %tmp185 = fcmp uge float 1.000000e+00, %tmp184
   %tmp186 = select i1 %tmp185, float %tmp184, float 1.000000e+00
   %tmp187 = fmul float %tmp186, %tmp29
-  %tmp188 = call float @ceil(float %tmp187)
+  %tmp188 = call float @llvm.ceil.f32(float %tmp187)
   %tmp189 = fcmp uge float 3.000000e+00, %tmp188
   %tmp190 = select i1 %tmp189, float 3.000000e+00, float %tmp188
   %tmp191 = fdiv float 1.000000e+00, %tmp190
   %tmp192 = fdiv float 1.000000e+00, %tmp29
   %tmp193 = fmul float %tmp190, %tmp192
   %tmp194 = fmul float %tmp30, %tmp193
-  %tmp195 = fmul float %tmp94, %tmp94
-  %tmp196 = fmul float %tmp95, %tmp95
+  %tmp195 = fmul float %p2.i90, %p2.i90
+  %tmp196 = fmul float %p2.i84, %p2.i84
   %tmp197 = fadd float %tmp196, %tmp195
-  %tmp198 = fmul float %tmp96, %tmp96
+  %tmp198 = fmul float %p2.i78, %p2.i78
   %tmp199 = fadd float %tmp197, %tmp198
   %tmp200 = call float @llvm.amdgcn.rsq.f32(float %tmp199)
-  %tmp201 = fmul float %tmp94, %tmp200
-  %tmp202 = fmul float %tmp95, %tmp200
+  %tmp201 = fmul float %p2.i90, %tmp200
+  %tmp202 = fmul float %p2.i84, %tmp200
   %tmp203 = fmul float %tmp201, %tmp28
   %tmp204 = fmul float %tmp202, %tmp28
   %tmp205 = fmul float %tmp203, -1.000000e+00
@@ -231,9 +318,9 @@ main_body:
   %tmp207 = fmul float %tmp205, %tmp31
   %tmp208 = fmul float %tmp206, %tmp31
   %tmp209 = fsub float -0.000000e+00, %tmp207
-  %tmp210 = fadd float %tmp92, %tmp209
+  %tmp210 = fadd float %p2.i, %tmp209
   %tmp211 = fsub float -0.000000e+00, %tmp208
-  %tmp212 = fadd float %tmp93, %tmp211
+  %tmp212 = fadd float %p2.i96, %tmp211
   %tmp213 = fmul float %tmp205, %tmp191
   %tmp214 = fmul float %tmp206, %tmp191
   %tmp215 = fmul float -1.000000e+00, %tmp191
@@ -277,7 +364,8 @@ ENDIF:                                            ; preds = %LOOP
   %tmp240 = insertelement <8 x i32> %tmp239, i32 %tmp238, i32 5
   %tmp241 = insertelement <8 x i32> %tmp240, i32 undef, i32 6
   %tmp242 = insertelement <8 x i32> %tmp241, i32 undef, i32 7
-  %tmp243 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp242, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp242.bc = bitcast <8 x i32> %tmp242 to <8 x float>
+  %tmp243 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp242.bc, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp244 = extractelement <4 x float> %tmp243, i32 3
   %tmp245 = fcmp oge float %temp30.0, %tmp244
   %tmp246 = sext i1 %tmp245 to i32
@@ -323,7 +411,8 @@ IF67:                                             ; preds = %LOOP65
   %tmp275 = insertelement <8 x i32> %tmp274, i32 undef, i32 6
   %tmp276 = insertelement <8 x i32> %tmp275, i32 undef, i32 7
   %tmp67.bc = bitcast <16 x i8> %tmp67 to <4 x i32>
-  %tmp277 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp276, <8 x i32> %tmp65, <4 x i32> %tmp67.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp276.bc = bitcast <8 x i32> %tmp276 to <8 x float>
+  %tmp277 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp276.bc, <8 x i32> %tmp65, <4 x i32> %tmp67.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp278 = extractelement <4 x float> %tmp277, i32 0
   %tmp279 = extractelement <4 x float> %tmp277, i32 1
   %tmp280 = extractelement <4 x float> %tmp277, i32 2
@@ -344,7 +433,8 @@ IF67:                                             ; preds = %LOOP65
   %tmp295 = insertelement <8 x i32> %tmp294, i32 undef, i32 6
   %tmp296 = insertelement <8 x i32> %tmp295, i32 undef, i32 7
   %tmp83.bc = bitcast <16 x i8> %tmp83 to <4 x i32>
-  %tmp297 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp296, <8 x i32> %tmp81, <4 x i32> %tmp83.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp296.bc = bitcast <8 x i32> %tmp296 to <8 x float>
+  %tmp297 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp296.bc, <8 x i32> %tmp81, <4 x i32> %tmp83.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp298 = extractelement <4 x float> %tmp297, i32 0
   %tmp299 = extractelement <4 x float> %tmp297, i32 1
   %tmp300 = extractelement <4 x float> %tmp297, i32 2
@@ -363,7 +453,8 @@ IF67:                                             ; preds = %LOOP65
   %tmp313 = insertelement <8 x i32> %tmp312, i32 undef, i32 6
   %tmp314 = insertelement <8 x i32> %tmp313, i32 undef, i32 7
   %tmp79.bc = bitcast <16 x i8> %tmp79 to <4 x i32>
-  %tmp315 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp314, <8 x i32> %tmp77, <4 x i32> %tmp79.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp314.bc = bitcast <8 x i32> %tmp314 to <8 x float>
+  %tmp315 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp314.bc, <8 x i32> %tmp77, <4 x i32> %tmp79.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp316 = extractelement <4 x float> %tmp315, i32 0
   %tmp317 = extractelement <4 x float> %tmp315, i32 1
   %tmp318 = extractelement <4 x float> %tmp315, i32 2
@@ -393,7 +484,8 @@ IF67:                                             ; preds = %LOOP65
   %tmp342 = insertelement <8 x i32> %tmp341, i32 %tmp336, i32 5
   %tmp343 = insertelement <8 x i32> %tmp342, i32 undef, i32 6
   %tmp344 = insertelement <8 x i32> %tmp343, i32 undef, i32 7
-  %tmp345 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp344, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp344.bc = bitcast <8 x i32> %tmp344 to <8 x float>
+  %tmp345 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp344.bc, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp346 = extractelement <4 x float> %tmp345, i32 0
   %tmp347 = extractelement <4 x float> %tmp345, i32 1
   %tmp348 = extractelement <4 x float> %tmp345, i32 2
@@ -424,14 +516,15 @@ IF67:                                             ; preds = %LOOP65
   %tmp373 = insertelement <8 x i32> %tmp372, i32 undef, i32 6
   %tmp374 = insertelement <8 x i32> %tmp373, i32 undef, i32 7
   %tmp71.bc = bitcast <16 x i8> %tmp71 to <4 x i32>
-  %tmp375 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp374, <8 x i32> %tmp69, <4 x i32> %tmp71.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp374.bc = bitcast <8 x i32> %tmp374 to <8 x float>
+  %tmp375 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp374.bc, <8 x i32> %tmp69, <4 x i32> %tmp71.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp376 = extractelement <4 x float> %tmp375, i32 0
   %tmp377 = extractelement <4 x float> %tmp375, i32 1
   %tmp378 = extractelement <4 x float> %tmp375, i32 2
   %tmp379 = extractelement <4 x float> %tmp375, i32 3
-  %tmp380 = fsub float -0.000000e+00, %tmp94
-  %tmp381 = fsub float -0.000000e+00, %tmp95
-  %tmp382 = fsub float -0.000000e+00, %tmp96
+  %tmp380 = fsub float -0.000000e+00, %p2.i90
+  %tmp381 = fsub float -0.000000e+00, %p2.i84
+  %tmp382 = fsub float -0.000000e+00, %p2.i78
   %tmp383 = fmul float %tmp358, %tmp380
   %tmp384 = fmul float %tmp359, %tmp381
   %tmp385 = fadd float %tmp384, %tmp383
@@ -449,20 +542,20 @@ IF67:                                             ; preds = %LOOP65
   %tmp397 = fadd float %tmp381, %tmp396
   %tmp398 = fsub float -0.000000e+00, %tmp393
   %tmp399 = fadd float %tmp382, %tmp398
-  %tmp400 = fmul float %tmp395, %tmp97
-  %tmp401 = fmul float %tmp395, %tmp98
-  %tmp402 = fmul float %tmp395, %tmp99
-  %tmp403 = fmul float %tmp397, %tmp100
+  %tmp400 = fmul float %tmp395, %p2.i72
+  %tmp401 = fmul float %tmp395, %p2.i66
+  %tmp402 = fmul float %tmp395, %p2.i60
+  %tmp403 = fmul float %tmp397, %p2.i54
   %tmp404 = fadd float %tmp403, %tmp400
-  %tmp405 = fmul float %tmp397, %tmp101
+  %tmp405 = fmul float %tmp397, %p2.i48
   %tmp406 = fadd float %tmp405, %tmp401
-  %tmp407 = fmul float %tmp397, %tmp102
+  %tmp407 = fmul float %tmp397, %p2.i42
   %tmp408 = fadd float %tmp407, %tmp402
-  %tmp409 = fmul float %tmp399, %tmp103
+  %tmp409 = fmul float %tmp399, %p2.i36
   %tmp410 = fadd float %tmp409, %tmp404
-  %tmp411 = fmul float %tmp399, %tmp104
+  %tmp411 = fmul float %tmp399, %p2.i30
   %tmp412 = fadd float %tmp411, %tmp406
-  %tmp413 = fmul float %tmp399, %tmp105
+  %tmp413 = fmul float %tmp399, %p2.i24
   %tmp414 = fadd float %tmp413, %tmp408
   %tmp415 = bitcast float %tmp135 to i32
   %tmp416 = bitcast float %tmp181 to i32
@@ -479,7 +572,8 @@ IF67:                                             ; preds = %LOOP65
   %tmp427 = insertelement <8 x i32> %tmp426, i32 undef, i32 6
   %tmp428 = insertelement <8 x i32> %tmp427, i32 undef, i32 7
   %tmp87.bc = bitcast <16 x i8> %tmp87 to <4 x i32>
-  %tmp429 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp428, <8 x i32> %tmp85, <4 x i32> %tmp87.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp428.bc = bitcast <8 x i32> %tmp428 to <8 x float>
+  %tmp429 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp428.bc, <8 x i32> %tmp85, <4 x i32> %tmp87.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp430 = extractelement <4 x float> %tmp429, i32 0
   %tmp431 = extractelement <4 x float> %tmp429, i32 1
   %tmp432 = extractelement <4 x float> %tmp429, i32 2
@@ -502,12 +596,22 @@ IF67:                                             ; preds = %LOOP65
   %tmp449 = insertelement <4 x float> %tmp448, float %tmp445, i32 1
   %tmp450 = insertelement <4 x float> %tmp449, float %tmp447, i32 2
   %tmp451 = insertelement <4 x float> %tmp450, float %tmp194, i32 3
-  %tmp452 = call <4 x float> @llvm.AMDGPU.cube(<4 x float> %tmp451)
+  %tmp451.x = extractelement <4 x float> %tmp451, i32 0
+  %tmp451.y = extractelement <4 x float> %tmp451, i32 1
+  %tmp451.z = extractelement <4 x float> %tmp451, i32 2
+  %cubetc = call float @llvm.amdgcn.cubetc(float %tmp451.x, float %tmp451.y, float %tmp451.z)
+  %cubesc = call float @llvm.amdgcn.cubesc(float %tmp451.x, float %tmp451.y, float %tmp451.z)
+  %cubema = call float @llvm.amdgcn.cubema(float %tmp451.x, float %tmp451.y, float %tmp451.z)
+  %cubeid = call float @llvm.amdgcn.cubeid(float %tmp451.x, float %tmp451.y, float %tmp451.z)
+  %tmp452.0 = insertelement <4 x float> undef, float %cubetc, i32 0
+  %tmp452.1 = insertelement <4 x float> %tmp452.0, float %cubesc, i32 1
+  %tmp452.2 = insertelement <4 x float> %tmp452.1, float %cubema, i32 2
+  %tmp452 = insertelement <4 x float> %tmp452.2, float %cubeid, i32 3
   %tmp453 = extractelement <4 x float> %tmp452, i32 0
   %tmp454 = extractelement <4 x float> %tmp452, i32 1
   %tmp455 = extractelement <4 x float> %tmp452, i32 2
   %tmp456 = extractelement <4 x float> %tmp452, i32 3
-  %tmp457 = call float @fabs(float %tmp455)
+  %tmp457 = call float @llvm.fabs.f32(float %tmp455)
   %tmp458 = fdiv float 1.000000e+00, %tmp457
   %tmp459 = fmul float %tmp453, %tmp458
   %tmp460 = fadd float %tmp459, 1.500000e+00
@@ -521,7 +625,8 @@ IF67:                                             ; preds = %LOOP65
   %tmp468 = insertelement <4 x i32> %tmp467, i32 %tmp465, i32 2
   %tmp469 = insertelement <4 x i32> %tmp468, i32 undef, i32 3
   %tmp91.bc = bitcast <16 x i8> %tmp91 to <4 x i32>
-  %tmp470 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tmp469, <8 x i32> %tmp89, <4 x i32> %tmp91.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp469.bc = bitcast <4 x i32> %tmp469 to <4 x float>
+  %tmp470 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tmp469.bc, <8 x i32> %tmp89, <4 x i32> %tmp91.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
   %tmp471 = extractelement <4 x float> %tmp470, i32 0
   %tmp472 = extractelement <4 x float> %tmp470, i32 1
   %tmp473 = extractelement <4 x float> %tmp470, i32 2
@@ -531,15 +636,15 @@ IF67:                                             ; preds = %LOOP65
   %tmp477 = fadd float %tmp476, %tmp329
   %tmp478 = fmul float %tmp432, %tmp473
   %tmp479 = fadd float %tmp478, %tmp330
-  %tmp480 = fmul float %tmp106, %tmp106
-  %tmp481 = fmul float %tmp107, %tmp107
+  %tmp480 = fmul float %p2.i18, %p2.i18
+  %tmp481 = fmul float %p2.i12, %p2.i12
   %tmp482 = fadd float %tmp481, %tmp480
-  %tmp483 = fmul float %tmp108, %tmp108
+  %tmp483 = fmul float %p2.i6, %p2.i6
   %tmp484 = fadd float %tmp482, %tmp483
   %tmp485 = call float @llvm.amdgcn.rsq.f32(float %tmp484)
-  %tmp486 = fmul float %tmp106, %tmp485
-  %tmp487 = fmul float %tmp107, %tmp485
-  %tmp488 = fmul float %tmp108, %tmp485
+  %tmp486 = fmul float %p2.i18, %tmp485
+  %tmp487 = fmul float %p2.i12, %tmp485
+  %tmp488 = fmul float %p2.i6, %tmp485
   %tmp489 = fmul float %tmp376, %tmp39
   %tmp490 = fmul float %tmp377, %tmp40
   %tmp491 = fmul float %tmp378, %tmp41
@@ -560,15 +665,15 @@ IF67:                                             ; preds = %LOOP65
   %tmp506 = fadd float %tmp487, %tmp505
   %tmp507 = fsub float -0.000000e+00, %tmp502
   %tmp508 = fadd float %tmp488, %tmp507
-  %tmp509 = fmul float %tmp94, %tmp94
-  %tmp510 = fmul float %tmp95, %tmp95
+  %tmp509 = fmul float %p2.i90, %p2.i90
+  %tmp510 = fmul float %p2.i84, %p2.i84
   %tmp511 = fadd float %tmp510, %tmp509
-  %tmp512 = fmul float %tmp96, %tmp96
+  %tmp512 = fmul float %p2.i78, %p2.i78
   %tmp513 = fadd float %tmp511, %tmp512
   %tmp514 = call float @llvm.amdgcn.rsq.f32(float %tmp513)
-  %tmp515 = fmul float %tmp94, %tmp514
-  %tmp516 = fmul float %tmp95, %tmp514
-  %tmp517 = fmul float %tmp96, %tmp514
+  %tmp515 = fmul float %p2.i90, %tmp514
+  %tmp516 = fmul float %p2.i84, %tmp514
+  %tmp517 = fmul float %p2.i78, %tmp514
   %tmp518 = fmul float %tmp504, %tmp515
   %tmp519 = fmul float %tmp506, %tmp516
   %tmp520 = fadd float %tmp519, %tmp518
@@ -623,7 +728,8 @@ IF67:                                             ; preds = %LOOP65
   %tmp569 = insertelement <8 x i32> %tmp568, i32 undef, i32 6
   %tmp570 = insertelement <8 x i32> %tmp569, i32 undef, i32 7
   %tmp75.bc = bitcast <16 x i8> %tmp75 to <4 x i32>
-  %tmp571 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp570, <8 x i32> %tmp73, <4 x i32> %tmp75.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp570.bc = bitcast <8 x i32> %tmp570 to <8 x float>
+  %tmp571 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp570.bc, <8 x i32> %tmp73, <4 x i32> %tmp75.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp572 = extractelement <4 x float> %tmp571, i32 0
   %tmp573 = extractelement <4 x float> %tmp571, i32 1
   %tmp574 = extractelement <4 x float> %tmp571, i32 2
@@ -633,11 +739,9 @@ IF67:                                             ; preds = %LOOP65
   %tmp578 = fadd float %tmp577, %tmp554
   %tmp579 = fmul float %tmp574, %tmp45
   %tmp580 = fadd float %tmp579, %tmp556
-  %tmp581 = call i32 @llvm.SI.packf16(float %tmp576, float %tmp578)
-  %tmp582 = bitcast i32 %tmp581 to float
-  %tmp583 = call i32 @llvm.SI.packf16(float %tmp580, float %tmp282)
-  %tmp584 = bitcast i32 %tmp583 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp582, float %tmp584, float %tmp582, float %tmp584)
+  %tmp581 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp576, float %tmp578)
+  %tmp583 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp580, float %tmp282)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp581, <2 x half> %tmp583, i1 true, i1 true) #0
   ret void
 
 ENDIF66:                                          ; preds = %LOOP65
@@ -647,7 +751,8 @@ ENDIF66:                                          ; preds = %LOOP65
   %tmp588 = insertelement <8 x i32> %tmp587, i32 %tmp586, i32 5
   %tmp589 = insertelement <8 x i32> %tmp588, i32 undef, i32 6
   %tmp590 = insertelement <8 x i32> %tmp589, i32 undef, i32 7
-  %tmp591 = call <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32> %tmp590, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp590.bc = bitcast <8 x i32> %tmp590 to <8 x float>
+  %tmp591 = call <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float> %tmp590.bc, <8 x i32> %tmp61, <4 x i32> %tmp63.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp592 = extractelement <4 x float> %tmp591, i32 3
   %tmp593 = fcmp oge float %temp30.1, %tmp592
   %tmp594 = sext i1 %tmp593 to i32
@@ -670,9 +775,10 @@ ENDIF66:                                          ; preds = %LOOP65
   br label %LOOP65
 }
 
-; CHECK-LABEL: {{^}}main1:
-; CHECK: s_endpgm
-define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) {
+; GCN-LABEL: {{^}}main1:
+; GCN: s_endpgm
+; TOVGPR: ScratchSize: 0{{$}}
+define amdgpu_ps void @main1([17 x <16 x i8>] addrspace(2)* byval %arg, [32 x <16 x i8>] addrspace(2)* byval %arg1, [16 x <8 x i32>] addrspace(2)* byval %arg2, float inreg %arg3, i32 inreg %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <2 x i32> %arg7, <3 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, <2 x i32> %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19, float %arg20) #0 {
 main_body:
   %tmp = getelementptr [17 x <16 x i8>], [17 x <16 x i8>] addrspace(2)* %arg, i64 0, i32 0
   %tmp21 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
@@ -817,52 +923,210 @@ main_body:
   %tmp160 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp159, !tbaa !0
   %tmp161 = fcmp ugt float %arg17, 0.000000e+00
   %tmp162 = select i1 %tmp161, float 1.000000e+00, float 0.000000e+00
-  %tmp163 = call float @llvm.SI.fs.interp(i32 0, i32 0, i32 %arg4, <2 x i32> %arg6)
-  %tmp164 = call float @llvm.SI.fs.interp(i32 1, i32 0, i32 %arg4, <2 x i32> %arg6)
-  %tmp165 = call float @llvm.SI.fs.interp(i32 2, i32 0, i32 %arg4, <2 x i32> %arg6)
-  %tmp166 = call float @llvm.SI.fs.interp(i32 3, i32 0, i32 %arg4, <2 x i32> %arg6)
-  %tmp167 = call float @llvm.SI.fs.interp(i32 0, i32 1, i32 %arg4, <2 x i32> %arg6)
-  %tmp168 = call float @llvm.SI.fs.interp(i32 1, i32 1, i32 %arg4, <2 x i32> %arg6)
-  %tmp169 = call float @llvm.SI.fs.interp(i32 2, i32 1, i32 %arg4, <2 x i32> %arg6)
-  %tmp170 = call float @llvm.SI.fs.interp(i32 3, i32 1, i32 %arg4, <2 x i32> %arg6)
-  %tmp171 = call float @llvm.SI.fs.interp(i32 0, i32 2, i32 %arg4, <2 x i32> %arg6)
-  %tmp172 = call float @llvm.SI.fs.interp(i32 1, i32 2, i32 %arg4, <2 x i32> %arg6)
-  %tmp173 = call float @llvm.SI.fs.interp(i32 2, i32 2, i32 %arg4, <2 x i32> %arg6)
-  %tmp174 = call float @llvm.SI.fs.interp(i32 3, i32 2, i32 %arg4, <2 x i32> %arg6)
-  %tmp175 = call float @llvm.SI.fs.interp(i32 0, i32 3, i32 %arg4, <2 x i32> %arg6)
-  %tmp176 = call float @llvm.SI.fs.interp(i32 1, i32 3, i32 %arg4, <2 x i32> %arg6)
-  %tmp177 = call float @llvm.SI.fs.interp(i32 2, i32 3, i32 %arg4, <2 x i32> %arg6)
-  %tmp178 = call float @llvm.SI.fs.interp(i32 3, i32 3, i32 %arg4, <2 x i32> %arg6)
-  %tmp179 = call float @llvm.SI.fs.interp(i32 0, i32 4, i32 %arg4, <2 x i32> %arg6)
-  %tmp180 = call float @llvm.SI.fs.interp(i32 1, i32 4, i32 %arg4, <2 x i32> %arg6)
-  %tmp181 = call float @llvm.SI.fs.interp(i32 2, i32 4, i32 %arg4, <2 x i32> %arg6)
-  %tmp182 = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %arg4, <2 x i32> %arg6)
-  %tmp183 = call float @llvm.SI.fs.interp(i32 0, i32 5, i32 %arg4, <2 x i32> %arg6)
-  %tmp184 = call float @llvm.SI.fs.interp(i32 1, i32 5, i32 %arg4, <2 x i32> %arg6)
-  %tmp185 = call float @llvm.SI.fs.interp(i32 2, i32 5, i32 %arg4, <2 x i32> %arg6)
-  %tmp186 = call float @llvm.SI.fs.interp(i32 3, i32 5, i32 %arg4, <2 x i32> %arg6)
-  %tmp187 = call float @llvm.SI.fs.interp(i32 0, i32 6, i32 %arg4, <2 x i32> %arg6)
-  %tmp188 = call float @llvm.SI.fs.interp(i32 1, i32 6, i32 %arg4, <2 x i32> %arg6)
-  %tmp189 = call float @llvm.SI.fs.interp(i32 2, i32 6, i32 %arg4, <2 x i32> %arg6)
-  %tmp190 = call float @llvm.SI.fs.interp(i32 3, i32 6, i32 %arg4, <2 x i32> %arg6)
-  %tmp191 = call float @llvm.SI.fs.interp(i32 0, i32 7, i32 %arg4, <2 x i32> %arg6)
-  %tmp192 = call float @llvm.SI.fs.interp(i32 1, i32 7, i32 %arg4, <2 x i32> %arg6)
-  %tmp193 = call float @llvm.SI.fs.interp(i32 2, i32 7, i32 %arg4, <2 x i32> %arg6)
-  %tmp194 = call float @llvm.SI.fs.interp(i32 3, i32 7, i32 %arg4, <2 x i32> %arg6)
+  %i.i = extractelement <2 x i32> %arg6, i32 0
+  %j.i = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i = bitcast i32 %i.i to float
+  %j.f.i = bitcast i32 %j.i to float
+  %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 0, i32 0, i32 %arg4) #0
+  %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 0, i32 0, i32 %arg4) #0
+  %i.i181 = extractelement <2 x i32> %arg6, i32 0
+  %j.i182 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i183 = bitcast i32 %i.i181 to float
+  %j.f.i184 = bitcast i32 %j.i182 to float
+  %p1.i185 = call float @llvm.amdgcn.interp.p1(float %i.f.i183, i32 1, i32 0, i32 %arg4) #0
+  %p2.i186 = call float @llvm.amdgcn.interp.p2(float %p1.i185, float %j.f.i184, i32 1, i32 0, i32 %arg4) #0
+  %i.i175 = extractelement <2 x i32> %arg6, i32 0
+  %j.i176 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i177 = bitcast i32 %i.i175 to float
+  %j.f.i178 = bitcast i32 %j.i176 to float
+  %p1.i179 = call float @llvm.amdgcn.interp.p1(float %i.f.i177, i32 2, i32 0, i32 %arg4) #0
+  %p2.i180 = call float @llvm.amdgcn.interp.p2(float %p1.i179, float %j.f.i178, i32 2, i32 0, i32 %arg4) #0
+  %i.i169 = extractelement <2 x i32> %arg6, i32 0
+  %j.i170 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i171 = bitcast i32 %i.i169 to float
+  %j.f.i172 = bitcast i32 %j.i170 to float
+  %p1.i173 = call float @llvm.amdgcn.interp.p1(float %i.f.i171, i32 3, i32 0, i32 %arg4) #0
+  %p2.i174 = call float @llvm.amdgcn.interp.p2(float %p1.i173, float %j.f.i172, i32 3, i32 0, i32 %arg4) #0
+  %i.i163 = extractelement <2 x i32> %arg6, i32 0
+  %j.i164 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i165 = bitcast i32 %i.i163 to float
+  %j.f.i166 = bitcast i32 %j.i164 to float
+  %p1.i167 = call float @llvm.amdgcn.interp.p1(float %i.f.i165, i32 0, i32 1, i32 %arg4) #0
+  %p2.i168 = call float @llvm.amdgcn.interp.p2(float %p1.i167, float %j.f.i166, i32 0, i32 1, i32 %arg4) #0
+  %i.i157 = extractelement <2 x i32> %arg6, i32 0
+  %j.i158 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i159 = bitcast i32 %i.i157 to float
+  %j.f.i160 = bitcast i32 %j.i158 to float
+  %p1.i161 = call float @llvm.amdgcn.interp.p1(float %i.f.i159, i32 1, i32 1, i32 %arg4) #0
+  %p2.i162 = call float @llvm.amdgcn.interp.p2(float %p1.i161, float %j.f.i160, i32 1, i32 1, i32 %arg4) #0
+  %i.i151 = extractelement <2 x i32> %arg6, i32 0
+  %j.i152 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i153 = bitcast i32 %i.i151 to float
+  %j.f.i154 = bitcast i32 %j.i152 to float
+  %p1.i155 = call float @llvm.amdgcn.interp.p1(float %i.f.i153, i32 2, i32 1, i32 %arg4) #0
+  %p2.i156 = call float @llvm.amdgcn.interp.p2(float %p1.i155, float %j.f.i154, i32 2, i32 1, i32 %arg4) #0
+  %i.i145 = extractelement <2 x i32> %arg6, i32 0
+  %j.i146 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i147 = bitcast i32 %i.i145 to float
+  %j.f.i148 = bitcast i32 %j.i146 to float
+  %p1.i149 = call float @llvm.amdgcn.interp.p1(float %i.f.i147, i32 3, i32 1, i32 %arg4) #0
+  %p2.i150 = call float @llvm.amdgcn.interp.p2(float %p1.i149, float %j.f.i148, i32 3, i32 1, i32 %arg4) #0
+  %i.i139 = extractelement <2 x i32> %arg6, i32 0
+  %j.i140 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i141 = bitcast i32 %i.i139 to float
+  %j.f.i142 = bitcast i32 %j.i140 to float
+  %p1.i143 = call float @llvm.amdgcn.interp.p1(float %i.f.i141, i32 0, i32 2, i32 %arg4) #0
+  %p2.i144 = call float @llvm.amdgcn.interp.p2(float %p1.i143, float %j.f.i142, i32 0, i32 2, i32 %arg4) #0
+  %i.i133 = extractelement <2 x i32> %arg6, i32 0
+  %j.i134 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i135 = bitcast i32 %i.i133 to float
+  %j.f.i136 = bitcast i32 %j.i134 to float
+  %p1.i137 = call float @llvm.amdgcn.interp.p1(float %i.f.i135, i32 1, i32 2, i32 %arg4) #0
+  %p2.i138 = call float @llvm.amdgcn.interp.p2(float %p1.i137, float %j.f.i136, i32 1, i32 2, i32 %arg4) #0
+  %i.i127 = extractelement <2 x i32> %arg6, i32 0
+  %j.i128 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i129 = bitcast i32 %i.i127 to float
+  %j.f.i130 = bitcast i32 %j.i128 to float
+  %p1.i131 = call float @llvm.amdgcn.interp.p1(float %i.f.i129, i32 2, i32 2, i32 %arg4) #0
+  %p2.i132 = call float @llvm.amdgcn.interp.p2(float %p1.i131, float %j.f.i130, i32 2, i32 2, i32 %arg4) #0
+  %i.i121 = extractelement <2 x i32> %arg6, i32 0
+  %j.i122 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i123 = bitcast i32 %i.i121 to float
+  %j.f.i124 = bitcast i32 %j.i122 to float
+  %p1.i125 = call float @llvm.amdgcn.interp.p1(float %i.f.i123, i32 3, i32 2, i32 %arg4) #0
+  %p2.i126 = call float @llvm.amdgcn.interp.p2(float %p1.i125, float %j.f.i124, i32 3, i32 2, i32 %arg4) #0
+  %i.i115 = extractelement <2 x i32> %arg6, i32 0
+  %j.i116 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i117 = bitcast i32 %i.i115 to float
+  %j.f.i118 = bitcast i32 %j.i116 to float
+  %p1.i119 = call float @llvm.amdgcn.interp.p1(float %i.f.i117, i32 0, i32 3, i32 %arg4) #0
+  %p2.i120 = call float @llvm.amdgcn.interp.p2(float %p1.i119, float %j.f.i118, i32 0, i32 3, i32 %arg4) #0
+  %i.i109 = extractelement <2 x i32> %arg6, i32 0
+  %j.i110 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i111 = bitcast i32 %i.i109 to float
+  %j.f.i112 = bitcast i32 %j.i110 to float
+  %p1.i113 = call float @llvm.amdgcn.interp.p1(float %i.f.i111, i32 1, i32 3, i32 %arg4) #0
+  %p2.i114 = call float @llvm.amdgcn.interp.p2(float %p1.i113, float %j.f.i112, i32 1, i32 3, i32 %arg4) #0
+  %i.i103 = extractelement <2 x i32> %arg6, i32 0
+  %j.i104 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i105 = bitcast i32 %i.i103 to float
+  %j.f.i106 = bitcast i32 %j.i104 to float
+  %p1.i107 = call float @llvm.amdgcn.interp.p1(float %i.f.i105, i32 2, i32 3, i32 %arg4) #0
+  %p2.i108 = call float @llvm.amdgcn.interp.p2(float %p1.i107, float %j.f.i106, i32 2, i32 3, i32 %arg4) #0
+  %i.i97 = extractelement <2 x i32> %arg6, i32 0
+  %j.i98 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i99 = bitcast i32 %i.i97 to float
+  %j.f.i100 = bitcast i32 %j.i98 to float
+  %p1.i101 = call float @llvm.amdgcn.interp.p1(float %i.f.i99, i32 3, i32 3, i32 %arg4) #0
+  %p2.i102 = call float @llvm.amdgcn.interp.p2(float %p1.i101, float %j.f.i100, i32 3, i32 3, i32 %arg4) #0
+  %i.i91 = extractelement <2 x i32> %arg6, i32 0
+  %j.i92 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i93 = bitcast i32 %i.i91 to float
+  %j.f.i94 = bitcast i32 %j.i92 to float
+  %p1.i95 = call float @llvm.amdgcn.interp.p1(float %i.f.i93, i32 0, i32 4, i32 %arg4) #0
+  %p2.i96 = call float @llvm.amdgcn.interp.p2(float %p1.i95, float %j.f.i94, i32 0, i32 4, i32 %arg4) #0
+  %i.i85 = extractelement <2 x i32> %arg6, i32 0
+  %j.i86 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i87 = bitcast i32 %i.i85 to float
+  %j.f.i88 = bitcast i32 %j.i86 to float
+  %p1.i89 = call float @llvm.amdgcn.interp.p1(float %i.f.i87, i32 1, i32 4, i32 %arg4) #0
+  %p2.i90 = call float @llvm.amdgcn.interp.p2(float %p1.i89, float %j.f.i88, i32 1, i32 4, i32 %arg4) #0
+  %i.i79 = extractelement <2 x i32> %arg6, i32 0
+  %j.i80 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i81 = bitcast i32 %i.i79 to float
+  %j.f.i82 = bitcast i32 %j.i80 to float
+  %p1.i83 = call float @llvm.amdgcn.interp.p1(float %i.f.i81, i32 2, i32 4, i32 %arg4) #0
+  %p2.i84 = call float @llvm.amdgcn.interp.p2(float %p1.i83, float %j.f.i82, i32 2, i32 4, i32 %arg4) #0
+  %i.i73 = extractelement <2 x i32> %arg6, i32 0
+  %j.i74 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i75 = bitcast i32 %i.i73 to float
+  %j.f.i76 = bitcast i32 %j.i74 to float
+  %p1.i77 = call float @llvm.amdgcn.interp.p1(float %i.f.i75, i32 3, i32 4, i32 %arg4) #0
+  %p2.i78 = call float @llvm.amdgcn.interp.p2(float %p1.i77, float %j.f.i76, i32 3, i32 4, i32 %arg4) #0
+  %i.i67 = extractelement <2 x i32> %arg6, i32 0
+  %j.i68 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i69 = bitcast i32 %i.i67 to float
+  %j.f.i70 = bitcast i32 %j.i68 to float
+  %p1.i71 = call float @llvm.amdgcn.interp.p1(float %i.f.i69, i32 0, i32 5, i32 %arg4) #0
+  %p2.i72 = call float @llvm.amdgcn.interp.p2(float %p1.i71, float %j.f.i70, i32 0, i32 5, i32 %arg4) #0
+  %i.i61 = extractelement <2 x i32> %arg6, i32 0
+  %j.i62 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i63 = bitcast i32 %i.i61 to float
+  %j.f.i64 = bitcast i32 %j.i62 to float
+  %p1.i65 = call float @llvm.amdgcn.interp.p1(float %i.f.i63, i32 1, i32 5, i32 %arg4) #0
+  %p2.i66 = call float @llvm.amdgcn.interp.p2(float %p1.i65, float %j.f.i64, i32 1, i32 5, i32 %arg4) #0
+  %i.i55 = extractelement <2 x i32> %arg6, i32 0
+  %j.i56 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i57 = bitcast i32 %i.i55 to float
+  %j.f.i58 = bitcast i32 %j.i56 to float
+  %p1.i59 = call float @llvm.amdgcn.interp.p1(float %i.f.i57, i32 2, i32 5, i32 %arg4) #0
+  %p2.i60 = call float @llvm.amdgcn.interp.p2(float %p1.i59, float %j.f.i58, i32 2, i32 5, i32 %arg4) #0
+  %i.i49 = extractelement <2 x i32> %arg6, i32 0
+  %j.i50 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i51 = bitcast i32 %i.i49 to float
+  %j.f.i52 = bitcast i32 %j.i50 to float
+  %p1.i53 = call float @llvm.amdgcn.interp.p1(float %i.f.i51, i32 3, i32 5, i32 %arg4) #0
+  %p2.i54 = call float @llvm.amdgcn.interp.p2(float %p1.i53, float %j.f.i52, i32 3, i32 5, i32 %arg4) #0
+  %i.i43 = extractelement <2 x i32> %arg6, i32 0
+  %j.i44 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i45 = bitcast i32 %i.i43 to float
+  %j.f.i46 = bitcast i32 %j.i44 to float
+  %p1.i47 = call float @llvm.amdgcn.interp.p1(float %i.f.i45, i32 0, i32 6, i32 %arg4) #0
+  %p2.i48 = call float @llvm.amdgcn.interp.p2(float %p1.i47, float %j.f.i46, i32 0, i32 6, i32 %arg4) #0
+  %i.i37 = extractelement <2 x i32> %arg6, i32 0
+  %j.i38 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i39 = bitcast i32 %i.i37 to float
+  %j.f.i40 = bitcast i32 %j.i38 to float
+  %p1.i41 = call float @llvm.amdgcn.interp.p1(float %i.f.i39, i32 1, i32 6, i32 %arg4) #0
+  %p2.i42 = call float @llvm.amdgcn.interp.p2(float %p1.i41, float %j.f.i40, i32 1, i32 6, i32 %arg4) #0
+  %i.i31 = extractelement <2 x i32> %arg6, i32 0
+  %j.i32 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i33 = bitcast i32 %i.i31 to float
+  %j.f.i34 = bitcast i32 %j.i32 to float
+  %p1.i35 = call float @llvm.amdgcn.interp.p1(float %i.f.i33, i32 2, i32 6, i32 %arg4) #0
+  %p2.i36 = call float @llvm.amdgcn.interp.p2(float %p1.i35, float %j.f.i34, i32 2, i32 6, i32 %arg4) #0
+  %i.i25 = extractelement <2 x i32> %arg6, i32 0
+  %j.i26 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i27 = bitcast i32 %i.i25 to float
+  %j.f.i28 = bitcast i32 %j.i26 to float
+  %p1.i29 = call float @llvm.amdgcn.interp.p1(float %i.f.i27, i32 3, i32 6, i32 %arg4) #0
+  %p2.i30 = call float @llvm.amdgcn.interp.p2(float %p1.i29, float %j.f.i28, i32 3, i32 6, i32 %arg4) #0
+  %i.i19 = extractelement <2 x i32> %arg6, i32 0
+  %j.i20 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i21 = bitcast i32 %i.i19 to float
+  %j.f.i22 = bitcast i32 %j.i20 to float
+  %p1.i23 = call float @llvm.amdgcn.interp.p1(float %i.f.i21, i32 0, i32 7, i32 %arg4) #0
+  %p2.i24 = call float @llvm.amdgcn.interp.p2(float %p1.i23, float %j.f.i22, i32 0, i32 7, i32 %arg4) #0
+  %i.i13 = extractelement <2 x i32> %arg6, i32 0
+  %j.i14 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i15 = bitcast i32 %i.i13 to float
+  %j.f.i16 = bitcast i32 %j.i14 to float
+  %p1.i17 = call float @llvm.amdgcn.interp.p1(float %i.f.i15, i32 1, i32 7, i32 %arg4) #0
+  %p2.i18 = call float @llvm.amdgcn.interp.p2(float %p1.i17, float %j.f.i16, i32 1, i32 7, i32 %arg4) #0
+  %i.i7 = extractelement <2 x i32> %arg6, i32 0
+  %j.i8 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i9 = bitcast i32 %i.i7 to float
+  %j.f.i10 = bitcast i32 %j.i8 to float
+  %p1.i11 = call float @llvm.amdgcn.interp.p1(float %i.f.i9, i32 2, i32 7, i32 %arg4) #0
+  %p2.i12 = call float @llvm.amdgcn.interp.p2(float %p1.i11, float %j.f.i10, i32 2, i32 7, i32 %arg4) #0
+  %i.i1 = extractelement <2 x i32> %arg6, i32 0
+  %j.i2 = extractelement <2 x i32> %arg6, i32 1
+  %i.f.i3 = bitcast i32 %i.i1 to float
+  %j.f.i4 = bitcast i32 %j.i2 to float
+  %p1.i5 = call float @llvm.amdgcn.interp.p1(float %i.f.i3, i32 3, i32 7, i32 %arg4) #0
+  %p2.i6 = call float @llvm.amdgcn.interp.p2(float %p1.i5, float %j.f.i4, i32 3, i32 7, i32 %arg4) #0
   %tmp195 = fmul float %arg14, %tmp123
   %tmp196 = fadd float %tmp195, %tmp124
-  %tmp197 = call float @llvm.AMDGPU.clamp.f32(float %tmp162, float 0.000000e+00, float 1.000000e+00)
-  %tmp198 = call float @llvm.AMDGPU.clamp.f32(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
-  %tmp199 = call float @llvm.AMDGPU.clamp.f32(float 0.000000e+00, float 0.000000e+00, float 1.000000e+00)
-  %tmp200 = call float @llvm.AMDGPU.clamp.f32(float 1.000000e+00, float 0.000000e+00, float 1.000000e+00)
-  %tmp201 = bitcast float %tmp197 to i32
+  %max.0.i = call float @llvm.maxnum.f32(float %tmp162, float 0.000000e+00)
+  %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00)
+  %tmp201 = bitcast float %clamp.i to i32
   %tmp202 = icmp ne i32 %tmp201, 0
   %. = select i1 %tmp202, float -1.000000e+00, float 1.000000e+00
-  %tmp203 = fsub float -0.000000e+00, %tmp163
+  %tmp203 = fsub float -0.000000e+00, %p2.i
   %tmp204 = fadd float %tmp43, %tmp203
-  %tmp205 = fsub float -0.000000e+00, %tmp164
+  %tmp205 = fsub float -0.000000e+00, %p2.i186
   %tmp206 = fadd float %tmp44, %tmp205
-  %tmp207 = fsub float -0.000000e+00, %tmp165
+  %tmp207 = fsub float -0.000000e+00, %p2.i180
   %tmp208 = fadd float %tmp45, %tmp207
   %tmp209 = fmul float %tmp204, %tmp204
   %tmp210 = fmul float %tmp206, %tmp206
@@ -876,12 +1140,13 @@ main_body:
   %tmp218 = fmul float %., %tmp53
   %tmp219 = fmul float %arg13, %tmp46
   %tmp220 = fmul float %tmp196, %tmp47
-  %tmp221 = bitcast float %tmp173 to i32
-  %tmp222 = bitcast float %tmp174 to i32
+  %tmp221 = bitcast float %p2.i132 to i32
+  %tmp222 = bitcast float %p2.i126 to i32
   %tmp223 = insertelement <2 x i32> undef, i32 %tmp221, i32 0
   %tmp224 = insertelement <2 x i32> %tmp223, i32 %tmp222, i32 1
   %tmp132.bc = bitcast <16 x i8> %tmp132 to <4 x i32>
-  %tmp225 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp224, <8 x i32> %tmp130, <4 x i32> %tmp132.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp224.bc = bitcast <2 x i32> %tmp224 to <2 x float>
+  %tmp225 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp224.bc, <8 x i32> %tmp130, <4 x i32> %tmp132.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp226 = extractelement <4 x float> %tmp225, i32 0
   %tmp227 = extractelement <4 x float> %tmp225, i32 1
   %tmp228 = extractelement <4 x float> %tmp225, i32 2
@@ -895,34 +1160,36 @@ main_body:
   %result.i44 = fadd float %tmp231, %one.sub.a.i43
   %one.sub.a.i41 = fsub float 1.000000e+00, %tmp26
   %result.i42 = fadd float %tmp232, %one.sub.a.i41
-  %tmp233 = fmul float %tmp215, %tmp183
-  %tmp234 = fmul float %tmp216, %tmp184
+  %tmp233 = fmul float %tmp215, %p2.i72
+  %tmp234 = fmul float %tmp216, %p2.i66
   %tmp235 = fadd float %tmp234, %tmp233
-  %tmp236 = fmul float %tmp217, %tmp185
+  %tmp236 = fmul float %tmp217, %p2.i60
   %tmp237 = fadd float %tmp235, %tmp236
-  %tmp238 = fmul float %tmp215, %tmp186
-  %tmp239 = fmul float %tmp216, %tmp187
+  %tmp238 = fmul float %tmp215, %p2.i54
+  %tmp239 = fmul float %tmp216, %p2.i48
   %tmp240 = fadd float %tmp239, %tmp238
-  %tmp241 = fmul float %tmp217, %tmp188
+  %tmp241 = fmul float %tmp217, %p2.i42
   %tmp242 = fadd float %tmp240, %tmp241
-  %tmp243 = fmul float %tmp215, %tmp189
-  %tmp244 = fmul float %tmp216, %tmp190
+  %tmp243 = fmul float %tmp215, %p2.i36
+  %tmp244 = fmul float %tmp216, %p2.i30
   %tmp245 = fadd float %tmp244, %tmp243
-  %tmp246 = fmul float %tmp217, %tmp191
+  %tmp246 = fmul float %tmp217, %p2.i24
   %tmp247 = fadd float %tmp245, %tmp246
-  %tmp248 = call float @llvm.AMDGPU.clamp.f32(float %tmp247, float 0.000000e+00, float 1.000000e+00)
+  %max.0.i19 = call float @llvm.maxnum.f32(float %tmp247, float 0.000000e+00)
+  %clamp.i20 = call float @llvm.minnum.f32(float %max.0.i19, float 1.000000e+00)
   %tmp249 = fmul float %tmp213, 0x3F5A36E2E0000000
-  %tmp250 = call float @llvm.AMDGPU.clamp.f32(float %tmp249, float 0.000000e+00, float 1.000000e+00)
-  %tmp251 = fsub float -0.000000e+00, %tmp250
+  %max.0.i17 = call float @llvm.maxnum.f32(float %tmp249, float 0.000000e+00)
+  %clamp.i18 = call float @llvm.minnum.f32(float %max.0.i17, float 1.000000e+00)
+  %tmp251 = fsub float -0.000000e+00, %clamp.i18
   %tmp252 = fadd float 1.000000e+00, %tmp251
-  %tmp253 = call float @llvm.pow.f32(float %tmp248, float 2.500000e-01)
+  %tmp253 = call float @llvm.pow.f32(float %clamp.i20, float 2.500000e-01)
   %tmp254 = fmul float %tmp38, %tmp253
   %tmp255 = fmul float %tmp237, %tmp254
   %tmp256 = fmul float %tmp242, %tmp254
   %tmp257 = fmul float %tmp255, %tmp229
   %tmp258 = fmul float %tmp256, %tmp229
-  %tmp259 = fadd float %tmp248, 0x3EE4F8B580000000
-  %tmp260 = fsub float -0.000000e+00, %tmp248
+  %tmp259 = fadd float %clamp.i20, 0x3EE4F8B580000000
+  %tmp260 = fsub float -0.000000e+00, %clamp.i20
   %tmp261 = fadd float 1.000000e+00, %tmp260
   %tmp262 = fmul float 1.200000e+01, %tmp261
   %tmp263 = fadd float %tmp262, 4.000000e+00
@@ -942,8 +1209,8 @@ main_body:
 
 LOOP:                                             ; preds = %LOOP, %main_body
   %temp144.0 = phi float [ 1.000000e+00, %main_body ], [ %tmp288, %LOOP ]
-  %temp168.0 = phi float [ %tmp175, %main_body ], [ %tmp284, %LOOP ]
-  %temp169.0 = phi float [ %tmp176, %main_body ], [ %tmp285, %LOOP ]
+  %temp168.0 = phi float [ %p2.i120, %main_body ], [ %tmp284, %LOOP ]
+  %temp169.0 = phi float [ %p2.i114, %main_body ], [ %tmp285, %LOOP ]
   %temp170.0 = phi float [ %tmp252, %main_body ], [ %tmp286, %LOOP ]
   %tmp276 = bitcast float %temp168.0 to i32
   %tmp277 = bitcast float %temp169.0 to i32
@@ -952,7 +1219,8 @@ LOOP:                                             ; preds = %LOOP, %main_body
   %tmp280 = insertelement <4 x i32> %tmp279, i32 0, i32 2
   %tmp281 = insertelement <4 x i32> %tmp280, i32 undef, i32 3
   %tmp148.bc = bitcast <16 x i8> %tmp148 to <4 x i32>
-  %tmp282 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp281, <8 x i32> %tmp146, <4 x i32> %tmp148.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp281.bc = bitcast <4 x i32> %tmp281 to <4 x float>
+  %tmp282 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp281.bc, <8 x i32> %tmp146, <4 x i32> %tmp148.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp283 = extractelement <4 x float> %tmp282, i32 3
   %tmp284 = fadd float %temp168.0, %tmp273
   %tmp285 = fadd float %temp169.0, %tmp274
@@ -979,12 +1247,12 @@ IF189:                                            ; preds = %LOOP
   %tmp303 = fadd float %tmp302, %tmp284
   %tmp304 = fmul float %tmp301, %tmp274
   %tmp305 = fadd float %tmp304, %tmp285
-  %tmp306 = fsub float -0.000000e+00, %tmp175
+  %tmp306 = fsub float -0.000000e+00, %p2.i120
   %tmp307 = fadd float %tmp303, %tmp306
-  %tmp308 = fsub float -0.000000e+00, %tmp176
+  %tmp308 = fsub float -0.000000e+00, %p2.i114
   %tmp309 = fadd float %tmp305, %tmp308
-  %tmp310 = fadd float %tmp175, %tmp307
-  %tmp311 = fadd float %tmp176, %tmp309
+  %tmp310 = fadd float %p2.i120, %tmp307
+  %tmp311 = fadd float %p2.i114, %tmp309
   %tmp312 = fmul float %tmp307, %tmp66
   %tmp313 = fmul float %tmp309, %tmp67
   %tmp314 = fmul float %tmp312, %tmp54
@@ -993,8 +1261,8 @@ IF189:                                            ; preds = %LOOP
   %tmp317 = fadd float %tmp316, %tmp314
   %tmp318 = fmul float %tmp313, %tmp57
   %tmp319 = fadd float %tmp318, %tmp315
-  %tmp320 = fadd float %tmp177, %tmp317
-  %tmp321 = fadd float %tmp178, %tmp319
+  %tmp320 = fadd float %p2.i108, %tmp317
+  %tmp321 = fadd float %p2.i102, %tmp319
   %tmp322 = fmul float %tmp312, %tmp58
   %tmp323 = fmul float %tmp312, %tmp59
   %tmp324 = fmul float %tmp312, %tmp60
@@ -1007,28 +1275,29 @@ IF189:                                            ; preds = %LOOP
   %tmp331 = fadd float %tmp330, %tmp324
   %tmp332 = fmul float %tmp313, %tmp65
   %tmp333 = fadd float %tmp332, %tmp325
-  %tmp334 = fadd float %tmp167, %tmp327
-  %tmp335 = fadd float %tmp168, %tmp329
-  %tmp336 = fadd float %tmp169, %tmp331
-  %tmp337 = fadd float %tmp170, %tmp333
+  %tmp334 = fadd float %p2.i168, %tmp327
+  %tmp335 = fadd float %p2.i162, %tmp329
+  %tmp336 = fadd float %p2.i156, %tmp331
+  %tmp337 = fadd float %p2.i150, %tmp333
   %tmp338 = bitcast float %tmp334 to i32
   %tmp339 = bitcast float %tmp335 to i32
   %tmp340 = insertelement <2 x i32> undef, i32 %tmp338, i32 0
   %tmp341 = insertelement <2 x i32> %tmp340, i32 %tmp339, i32 1
   %tmp136.bc = bitcast <16 x i8> %tmp136 to <4 x i32>
-  %tmp342 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp341, <8 x i32> %tmp134, <4 x i32> %tmp136.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tmp343 = extractelement <4 x float> %tmp342, i32 0
-  %tmp344 = extractelement <4 x float> %tmp342, i32 1
-  %tmp345 = extractelement <4 x float> %tmp342, i32 2
-  %tmp346 = extractelement <4 x float> %tmp342, i32 3
+  %a.bc.i = bitcast <2 x i32> %tmp341 to <2 x float>
+  %tmp0 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i, <8 x i32> %tmp134, <4 x i32> %tmp136.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %tmp343 = extractelement <4 x float> %tmp0, i32 0
+  %tmp344 = extractelement <4 x float> %tmp0, i32 1
+  %tmp345 = extractelement <4 x float> %tmp0, i32 2
+  %tmp346 = extractelement <4 x float> %tmp0, i32 3
   %tmp347 = fmul float %tmp343, %tmp22
   %tmp348 = fmul float %tmp344, %tmp23
   %tmp349 = fmul float %tmp345, %tmp24
   %tmp350 = fmul float %tmp346, %tmp25
-  %tmp351 = fmul float %tmp347, %tmp179
-  %tmp352 = fmul float %tmp348, %tmp180
-  %tmp353 = fmul float %tmp349, %tmp181
-  %tmp354 = fmul float %tmp350, %tmp182
+  %tmp351 = fmul float %tmp347, %p2.i96
+  %tmp352 = fmul float %tmp348, %p2.i90
+  %tmp353 = fmul float %tmp349, %p2.i84
+  %tmp354 = fmul float %tmp350, %p2.i78
   %tmp355 = fsub float -0.000000e+00, %tmp346
   %tmp356 = fadd float 1.000000e+00, %tmp355
   %tmp357 = fmul float %tmp356, %tmp48
@@ -1049,8 +1318,9 @@ IF189:                                            ; preds = %LOOP
   %tmp360 = insertelement <2 x i32> undef, i32 %tmp358, i32 0
   %tmp361 = insertelement <2 x i32> %tmp360, i32 %tmp359, i32 1
   %tmp152.bc = bitcast <16 x i8> %tmp152 to <4 x i32>
-  %tmp362 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp361, <8 x i32> %tmp150, <4 x i32> %tmp152.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tmp363 = extractelement <4 x float> %tmp362, i32 2
+  %a.bc.i3 = bitcast <2 x i32> %tmp361 to <2 x float>
+  %tmp1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i3, <8 x i32> %tmp150, <4 x i32> %tmp152.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %tmp363 = extractelement <4 x float> %tmp1, i32 2
   %tmp364 = fmul float %result.i40, %result.i
   %tmp365 = fmul float %result.i36, %result.i44
   %tmp366 = fmul float %result.i32, %result.i42
@@ -1060,11 +1330,12 @@ IF189:                                            ; preds = %LOOP
   %tmp370 = insertelement <2 x i32> undef, i32 %tmp368, i32 0
   %tmp371 = insertelement <2 x i32> %tmp370, i32 %tmp369, i32 1
   %tmp140.bc = bitcast <16 x i8> %tmp140 to <4 x i32>
-  %tmp372 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp371, <8 x i32> %tmp138, <4 x i32> %tmp140.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tmp373 = extractelement <4 x float> %tmp372, i32 0
-  %tmp374 = extractelement <4 x float> %tmp372, i32 1
-  %tmp375 = extractelement <4 x float> %tmp372, i32 2
-  %tmp376 = extractelement <4 x float> %tmp372, i32 3
+  %a.bc.i2 = bitcast <2 x i32> %tmp371 to <2 x float>
+  %tmp2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i2, <8 x i32> %tmp138, <4 x i32> %tmp140.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %tmp373 = extractelement <4 x float> %tmp2, i32 0
+  %tmp374 = extractelement <4 x float> %tmp2, i32 1
+  %tmp375 = extractelement <4 x float> %tmp2, i32 2
+  %tmp376 = extractelement <4 x float> %tmp2, i32 3
   %tmp377 = fcmp olt float 0.000000e+00, %tmp375
   %tmp378 = sext i1 %tmp377 to i32
   %tmp379 = bitcast i32 %tmp378 to float
@@ -1077,11 +1348,12 @@ IF189:                                            ; preds = %LOOP
   %tmp384 = insertelement <2 x i32> undef, i32 %tmp382, i32 0
   %tmp385 = insertelement <2 x i32> %tmp384, i32 %tmp383, i32 1
   %tmp144.bc = bitcast <16 x i8> %tmp144 to <4 x i32>
-  %tmp386 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp385, <8 x i32> %tmp142, <4 x i32> %tmp144.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tmp387 = extractelement <4 x float> %tmp386, i32 0
-  %tmp388 = extractelement <4 x float> %tmp386, i32 1
-  %tmp389 = extractelement <4 x float> %tmp386, i32 2
-  %tmp390 = extractelement <4 x float> %tmp386, i32 3
+  %a.bc.i1 = bitcast <2 x i32> %tmp385 to <2 x float>
+  %tmp3 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %a.bc.i1, <8 x i32> %tmp142, <4 x i32> %tmp144.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+  %tmp387 = extractelement <4 x float> %tmp3, i32 0
+  %tmp388 = extractelement <4 x float> %tmp3, i32 1
+  %tmp389 = extractelement <4 x float> %tmp3, i32 2
+  %tmp390 = extractelement <4 x float> %tmp3, i32 3
   %tmp391 = fcmp olt float 0.000000e+00, %tmp389
   %tmp392 = sext i1 %tmp391 to i32
   %tmp393 = bitcast i32 %tmp392 to float
@@ -1107,8 +1379,8 @@ IF189:                                            ; preds = %LOOP
   %tmp411 = fmul float %tmp410, %tmp35
   %tmp412 = fmul float %tmp409, %tmp363
   %tmp413 = fmul float %tmp411, %tmp363
-  %tmp414 = call float @fabs(float %tmp405)
-  %tmp415 = call float @fabs(float %tmp407)
+  %tmp414 = call float @llvm.fabs.f32(float %tmp405)
+  %tmp415 = call float @llvm.fabs.f32(float %tmp407)
   %tmp416 = fsub float -0.000000e+00, %tmp414
   %tmp417 = fadd float 1.000000e+00, %tmp416
   %tmp418 = fsub float -0.000000e+00, %tmp415
@@ -1122,26 +1394,27 @@ IF189:                                            ; preds = %LOOP
   %tmp426 = fadd float %tmp424, %tmp425
   %tmp427 = fsub float -0.000000e+00, %tmp426
   %tmp428 = fadd float 0x3FF00068E0000000, %tmp427
-  %tmp429 = call float @llvm.AMDGPU.clamp.f32(float %tmp428, float 0.000000e+00, float 1.000000e+00)
-  %tmp430 = call float @llvm.amdgcn.rsq.f32(float %tmp429)
-  %tmp431 = fmul float %tmp430, %tmp429
-  %tmp432 = fsub float -0.000000e+00, %tmp429
+  %max.0.i15 = call float @llvm.maxnum.f32(float %tmp428, float 0.000000e+00)
+  %clamp.i16 = call float @llvm.minnum.f32(float %max.0.i15, float 1.000000e+00)
+  %tmp430 = call float @llvm.amdgcn.rsq.f32(float %clamp.i16)
+  %tmp431 = fmul float %tmp430, %clamp.i16
+  %tmp432 = fsub float -0.000000e+00, %clamp.i16
   %cmp = fcmp ogt float 0.000000e+00, %tmp432
   %tmp433 = select i1 %cmp, float %tmp431, float 0.000000e+00
-  %tmp434 = fmul float %tmp183, %tmp421
-  %tmp435 = fmul float %tmp184, %tmp421
-  %tmp436 = fmul float %tmp185, %tmp421
-  %tmp437 = fmul float %tmp186, %tmp423
+  %tmp434 = fmul float %p2.i72, %tmp421
+  %tmp435 = fmul float %p2.i66, %tmp421
+  %tmp436 = fmul float %p2.i60, %tmp421
+  %tmp437 = fmul float %p2.i54, %tmp423
   %tmp438 = fadd float %tmp437, %tmp434
-  %tmp439 = fmul float %tmp187, %tmp423
+  %tmp439 = fmul float %p2.i48, %tmp423
   %tmp440 = fadd float %tmp439, %tmp435
-  %tmp441 = fmul float %tmp188, %tmp423
+  %tmp441 = fmul float %p2.i42, %tmp423
   %tmp442 = fadd float %tmp441, %tmp436
-  %tmp443 = fmul float %tmp189, %tmp433
+  %tmp443 = fmul float %p2.i36, %tmp433
   %tmp444 = fadd float %tmp443, %tmp438
-  %tmp445 = fmul float %tmp190, %tmp433
+  %tmp445 = fmul float %p2.i30, %tmp433
   %tmp446 = fadd float %tmp445, %tmp440
-  %tmp447 = fmul float %tmp191, %tmp433
+  %tmp447 = fmul float %p2.i24, %tmp433
   %tmp448 = fadd float %tmp447, %tmp442
   %tmp449 = fmul float %tmp444, %tmp444
   %tmp450 = fmul float %tmp446, %tmp446
@@ -1174,7 +1447,8 @@ ENDIF197:                                         ; preds = %IF198, %IF189
   %tmp468 = insertelement <2 x i32> undef, i32 %tmp466, i32 0
   %tmp469 = insertelement <2 x i32> %tmp468, i32 %tmp467, i32 1
   %tmp160.bc = bitcast <16 x i8> %tmp160 to <4 x i32>
-  %tmp470 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp469, <8 x i32> %tmp158, <4 x i32> %tmp160.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp469.bc = bitcast <2 x i32> %tmp469 to <2 x float>
+  %tmp470 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp469.bc, <8 x i32> %tmp158, <4 x i32> %tmp160.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp471 = extractelement <4 x float> %tmp470, i32 0
   %tmp472 = extractelement <4 x float> %tmp470, i32 1
   %tmp473 = extractelement <4 x float> %tmp470, i32 2
@@ -1187,12 +1461,13 @@ ENDIF197:                                         ; preds = %IF198, %IF189
   %tmp480 = fadd float %tmp479, %tmp40
   %tmp481 = fmul float %tmp474, %tmp41
   %tmp482 = fadd float %tmp481, %tmp42
-  %tmp483 = bitcast float %tmp171 to i32
-  %tmp484 = bitcast float %tmp172 to i32
+  %tmp483 = bitcast float %p2.i144 to i32
+  %tmp484 = bitcast float %p2.i138 to i32
   %tmp485 = insertelement <2 x i32> undef, i32 %tmp483, i32 0
   %tmp486 = insertelement <2 x i32> %tmp485, i32 %tmp484, i32 1
   %tmp156.bc = bitcast <16 x i8> %tmp156 to <4 x i32>
-  %tmp487 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> %tmp486, <8 x i32> %tmp154, <4 x i32> %tmp156.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp486.bc = bitcast <2 x i32> %tmp486 to <2 x float>
+  %tmp487 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> %tmp486.bc, <8 x i32> %tmp154, <4 x i32> %tmp156.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp488 = extractelement <4 x float> %tmp487, i32 0
   %tmp489 = extractelement <4 x float> %tmp487, i32 1
   %tmp490 = extractelement <4 x float> %tmp487, i32 2
@@ -1204,11 +1479,11 @@ ENDIF197:                                         ; preds = %IF198, %IF189
   %tmp496 = fmul float %tmp489, %tmp494
   %tmp497 = fmul float %tmp490, %tmp494
   %tmp498 = fmul float %tmp27, %tmp495
-  %tmp499 = fadd float %tmp498, %tmp192
+  %tmp499 = fadd float %tmp498, %p2.i18
   %tmp500 = fmul float %tmp28, %tmp496
-  %tmp501 = fadd float %tmp500, %tmp193
+  %tmp501 = fadd float %tmp500, %p2.i12
   %tmp502 = fmul float %tmp29, %tmp497
-  %tmp503 = fadd float %tmp502, %tmp194
+  %tmp503 = fadd float %tmp502, %p2.i6
   %tmp504 = fmul float %tmp499, %tmp482
   %tmp505 = fmul float %tmp501, %tmp482
   %tmp506 = fmul float %tmp503, %tmp482
@@ -1242,18 +1517,19 @@ ENDIF197:                                         ; preds = %IF198, %IF189
   %tmp534 = fadd float %tmp533, %tmp532
   %tmp535 = fmul float %temp14.0, %tmp531
   %tmp536 = fadd float %tmp534, %tmp535
-  %tmp537 = call float @llvm.AMDGPU.clamp.f32(float %tmp536, float 0.000000e+00, float 1.000000e+00)
-  %tmp538 = fmul float %tmp364, %tmp537
-  %tmp539 = fmul float %tmp365, %tmp537
-  %tmp540 = fmul float %tmp366, %tmp537
+  %max.0.i13 = call float @llvm.maxnum.f32(float %tmp536, float 0.000000e+00)
+  %clamp.i14 = call float @llvm.minnum.f32(float %max.0.i13, float 1.000000e+00)
+  %tmp538 = fmul float %tmp364, %clamp.i14
+  %tmp539 = fmul float %tmp365, %clamp.i14
+  %tmp540 = fmul float %tmp366, %clamp.i14
   %tmp541 = fmul float %tmp538, %tmp68
   %tmp542 = fmul float %tmp539, %tmp69
   %tmp543 = fmul float %tmp540, %tmp70
-  %tmp544 = fsub float -0.000000e+00, %tmp163
+  %tmp544 = fsub float -0.000000e+00, %p2.i
   %tmp545 = fadd float %tmp96, %tmp544
-  %tmp546 = fsub float -0.000000e+00, %tmp164
+  %tmp546 = fsub float -0.000000e+00, %p2.i186
   %tmp547 = fadd float %tmp97, %tmp546
-  %tmp548 = fsub float -0.000000e+00, %tmp165
+  %tmp548 = fsub float -0.000000e+00, %p2.i180
   %tmp549 = fadd float %tmp98, %tmp548
   %tmp550 = fmul float %tmp545, %tmp545
   %tmp551 = fmul float %tmp547, %tmp547
@@ -1339,31 +1615,31 @@ ENDIF209:                                         ; preds = %ELSE214, %ELSE211,
   %temp69.0 = phi float [ %tmp112, %ENDIF200 ], [ %.231, %ELSE214 ], [ %tmp108, %ELSE211 ]
   %temp70.0 = phi float [ %tmp113, %ENDIF200 ], [ %.232, %ELSE214 ], [ %tmp109, %ELSE211 ]
   %temp71.0 = phi float [ %tmp114, %ENDIF200 ], [ %.233, %ELSE214 ], [ %tmp110, %ELSE211 ]
-  %tmp602 = fmul float %tmp163, %tmp84
-  %tmp603 = fmul float %tmp164, %tmp85
+  %tmp602 = fmul float %p2.i, %tmp84
+  %tmp603 = fmul float %p2.i186, %tmp85
   %tmp604 = fadd float %tmp602, %tmp603
-  %tmp605 = fmul float %tmp165, %tmp86
+  %tmp605 = fmul float %p2.i180, %tmp86
   %tmp606 = fadd float %tmp604, %tmp605
-  %tmp607 = fmul float %tmp166, %tmp87
+  %tmp607 = fmul float %p2.i174, %tmp87
   %tmp608 = fadd float %tmp606, %tmp607
-  %tmp609 = fmul float %tmp163, %tmp88
-  %tmp610 = fmul float %tmp164, %tmp89
+  %tmp609 = fmul float %p2.i, %tmp88
+  %tmp610 = fmul float %p2.i186, %tmp89
   %tmp611 = fadd float %tmp609, %tmp610
-  %tmp612 = fmul float %tmp165, %tmp90
+  %tmp612 = fmul float %p2.i180, %tmp90
   %tmp613 = fadd float %tmp611, %tmp612
-  %tmp614 = fmul float %tmp166, %tmp91
+  %tmp614 = fmul float %p2.i174, %tmp91
   %tmp615 = fadd float %tmp613, %tmp614
-  %tmp616 = fmul float %tmp163, %tmp92
-  %tmp617 = fmul float %tmp164, %tmp93
+  %tmp616 = fmul float %p2.i, %tmp92
+  %tmp617 = fmul float %p2.i186, %tmp93
   %tmp618 = fadd float %tmp616, %tmp617
-  %tmp619 = fmul float %tmp165, %tmp94
+  %tmp619 = fmul float %p2.i180, %tmp94
   %tmp620 = fadd float %tmp618, %tmp619
-  %tmp621 = fmul float %tmp166, %tmp95
+  %tmp621 = fmul float %p2.i174, %tmp95
   %tmp622 = fadd float %tmp620, %tmp621
   %tmp623 = fsub float -0.000000e+00, %tmp77
   %tmp624 = fadd float 1.000000e+00, %tmp623
-  %tmp625 = call float @fabs(float %tmp608)
-  %tmp626 = call float @fabs(float %tmp615)
+  %tmp625 = call float @llvm.fabs.f32(float %tmp608)
+  %tmp626 = call float @llvm.fabs.f32(float %tmp615)
   %tmp627 = fcmp oge float %tmp624, %tmp625
   %tmp628 = sext i1 %tmp627 to i32
   %tmp629 = bitcast i32 %tmp628 to float
@@ -1389,7 +1665,8 @@ ENDIF209:                                         ; preds = %ELSE214, %ELSE211,
   %tmp649 = fadd float %temp80.0, -1.000000e+00
   %tmp650 = fmul float %tmp649, %tmp76
   %tmp651 = fadd float %tmp650, 1.000000e+00
-  %tmp652 = call float @llvm.AMDGPU.clamp.f32(float %tmp651, float 0.000000e+00, float 1.000000e+00)
+  %max.0.i11 = call float @llvm.maxnum.f32(float %tmp651, float 0.000000e+00)
+  %clamp.i12 = call float @llvm.minnum.f32(float %max.0.i11, float 1.000000e+00)
   %tmp653 = bitcast float %tmp642 to i32
   %tmp654 = bitcast float %tmp644 to i32
   %tmp655 = bitcast float 0.000000e+00 to i32
@@ -1398,7 +1675,8 @@ ENDIF209:                                         ; preds = %ELSE214, %ELSE211,
   %tmp658 = insertelement <4 x i32> %tmp657, i32 %tmp655, i32 2
   %tmp659 = insertelement <4 x i32> %tmp658, i32 undef, i32 3
   %tmp128.bc = bitcast <16 x i8> %tmp128 to <4 x i32>
-  %tmp660 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp659, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp659.bc = bitcast <4 x i32> %tmp659 to <4 x float>
+  %tmp660 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp659.bc, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp661 = extractelement <4 x float> %tmp660, i32 0
   %tmp662 = extractelement <4 x float> %tmp660, i32 1
   %tmp663 = bitcast float %tmp646 to i32
@@ -1408,7 +1686,8 @@ ENDIF209:                                         ; preds = %ELSE214, %ELSE211,
   %tmp667 = insertelement <4 x i32> %tmp666, i32 %tmp664, i32 1
   %tmp668 = insertelement <4 x i32> %tmp667, i32 %tmp665, i32 2
   %tmp669 = insertelement <4 x i32> %tmp668, i32 undef, i32 3
-  %tmp670 = call <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32> %tmp669, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp669.bc = bitcast <4 x i32> %tmp669 to <4 x float>
+  %tmp670 = call <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float> %tmp669.bc, <8 x i32> %tmp126, <4 x i32> %tmp128.bc, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp671 = extractelement <4 x float> %tmp670, i32 0
   %tmp672 = extractelement <4 x float> %tmp670, i32 1
   %tmp673 = fsub float -0.000000e+00, %tmp662
@@ -1425,11 +1704,13 @@ ENDIF209:                                         ; preds = %ELSE214, %ELSE211,
   %tmp684 = fadd float %tmp683, %temp89.0
   %tmp685 = fmul float %tmp640, %temp90.0
   %tmp686 = fadd float %tmp685, %temp91.0
-  %tmp687 = call float @llvm.AMDGPU.clamp.f32(float %tmp684, float 0.000000e+00, float 1.000000e+00)
-  %tmp688 = call float @llvm.AMDGPU.clamp.f32(float %tmp686, float 0.000000e+00, float 1.000000e+00)
-  %tmp689 = fsub float -0.000000e+00, %tmp687
+  %max.0.i9 = call float @llvm.maxnum.f32(float %tmp684, float 0.000000e+00)
+  %clamp.i10 = call float @llvm.minnum.f32(float %max.0.i9, float 1.000000e+00)
+  %max.0.i7 = call float @llvm.maxnum.f32(float %tmp686, float 0.000000e+00)
+  %clamp.i8 = call float @llvm.minnum.f32(float %max.0.i7, float 1.000000e+00)
+  %tmp689 = fsub float -0.000000e+00, %clamp.i10
   %tmp690 = fadd float %tmp661, %tmp689
-  %tmp691 = fsub float -0.000000e+00, %tmp688
+  %tmp691 = fsub float -0.000000e+00, %clamp.i8
   %tmp692 = fadd float %tmp671, %tmp691
   %tmp693 = fmul float %tmp661, %tmp661
   %tmp694 = fmul float %tmp671, %tmp671
@@ -1461,16 +1742,17 @@ ENDIF209:                                         ; preds = %ELSE214, %ELSE211,
   %tmp719 = bitcast float %tmp718 to i32
   %tmp720 = icmp ne i32 %tmp719, 0
   %temp28.0 = select i1 %tmp720, float 1.000000e+00, float %tmp710
-  %one.sub.a.i25 = fsub float 1.000000e+00, %tmp652
+  %one.sub.a.i25 = fsub float 1.000000e+00, %clamp.i12
   %one.sub.ac.i26 = fmul float %one.sub.a.i25, %.229
   %mul.i27 = fmul float %temp28.0, %.229
   %result.i28 = fadd float %mul.i27, %one.sub.ac.i26
   %tmp721 = call float @llvm.pow.f32(float %result.i28, float %tmp75)
   %tmp722 = fmul float %tmp721, %tmp78
   %tmp723 = fadd float %tmp722, %tmp79
-  %tmp724 = call float @llvm.AMDGPU.clamp.f32(float %tmp723, float 0.000000e+00, float 1.000000e+00)
-  %tmp725 = fmul float %tmp724, %tmp724
-  %tmp726 = fmul float 2.000000e+00, %tmp724
+  %max.0.i5 = call float @llvm.maxnum.f32(float %tmp723, float 0.000000e+00)
+  %clamp.i6 = call float @llvm.minnum.f32(float %max.0.i5, float 1.000000e+00)
+  %tmp725 = fmul float %clamp.i6, %clamp.i6
+  %tmp726 = fmul float 2.000000e+00, %clamp.i6
   %tmp727 = fsub float -0.000000e+00, %tmp726
   %tmp728 = fadd float 3.000000e+00, %tmp727
   %tmp729 = fmul float %tmp725, %tmp728
@@ -1504,12 +1786,13 @@ ENDIF209:                                         ; preds = %ELSE214, %ELSE211,
   %tmp747 = fadd float %tmp746, %tmp745
   %tmp748 = fmul float %temp14.0, %tmp217
   %tmp749 = fadd float %tmp747, %tmp748
-  %tmp750 = call float @fabs(float %tmp749)
+  %tmp750 = call float @llvm.fabs.f32(float %tmp749)
   %tmp751 = fmul float %tmp750, %tmp750
   %tmp752 = fmul float %tmp751, %tmp49
   %tmp753 = fadd float %tmp752, %tmp50
-  %tmp754 = call float @llvm.AMDGPU.clamp.f32(float %tmp753, float 0.000000e+00, float 1.000000e+00)
-  %tmp755 = fsub float -0.000000e+00, %tmp754
+  %max.0.i3 = call float @llvm.maxnum.f32(float %tmp753, float 0.000000e+00)
+  %clamp.i4 = call float @llvm.minnum.f32(float %max.0.i3, float 1.000000e+00)
+  %tmp755 = fsub float -0.000000e+00, %clamp.i4
   %tmp756 = fadd float 1.000000e+00, %tmp755
   %tmp757 = fmul float %tmp32, %tmp756
   %tmp758 = fmul float %tmp32, %tmp756
@@ -1545,12 +1828,11 @@ ENDIF209:                                         ; preds = %ELSE214, %ELSE211,
   %tmp772 = select i1 %tmp771, float 6.550400e+04, float %tmp766
   %tmp773 = fmul float %result.i2, %tmp51
   %tmp774 = fadd float %tmp773, %tmp52
-  %tmp775 = call float @llvm.AMDGPU.clamp.f32(float %tmp774, float 0.000000e+00, float 1.000000e+00)
-  %tmp776 = call i32 @llvm.SI.packf16(float %tmp768, float %tmp770)
-  %tmp777 = bitcast i32 %tmp776 to float
-  %tmp778 = call i32 @llvm.SI.packf16(float %tmp772, float %tmp775)
-  %tmp779 = bitcast i32 %tmp778 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp777, float %tmp779, float %tmp777, float %tmp779)
+  %max.0.i1 = call float @llvm.maxnum.f32(float %tmp774, float 0.000000e+00)
+  %clamp.i2 = call float @llvm.minnum.f32(float %max.0.i1, float 1.000000e+00)
+  %tmp776 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp768, float %tmp770)
+  %tmp778 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp772, float %clamp.i2)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp776, <2 x half> %tmp778, i1 true, i1 true) #0
   ret void
 
 ELSE214:                                          ; preds = %ELSE211
@@ -1566,57 +1848,32 @@ ELSE214:                                          ; preds = %ELSE211
   br label %ENDIF209
 }
 
-; Function Attrs: readnone
-declare float @llvm.AMDGPU.clamp.f32(float, float, float) #1
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.image.sample.l.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2
-
-
-declare float @llvm.exp2.f32(float) #2
-
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #2
-
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #2
-
+declare float @llvm.exp2.f32(float) #1
+declare float @llvm.ceil.f32(float) #1
+declare float @llvm.fabs.f32(float) #1
+declare float @llvm.pow.f32(float, float) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare float @llvm.amdgcn.rsq.f32(float) #1
+declare float @llvm.amdgcn.cubeid(float, float, float) #1
+declare float @llvm.amdgcn.cubesc(float, float, float) #1
+declare float @llvm.amdgcn.cubetc(float, float, float) #1
+declare float @llvm.amdgcn.cubema(float, float, float) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
 declare i32 @llvm.amdgcn.mbcnt.hi(i32, i32) #1
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
+declare <4 x float> @llvm.amdgcn.image.sample.d.v4f32.v8f32.v8i32(<8 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
+declare <4 x float> @llvm.amdgcn.image.sample.l.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
 
-; Function Attrs: nounwind readonly
-declare float @ceil(float) #3
-
-; Function Attrs: nounwind readnone
-declare float @llvm.amdgcn.rsq.f32(float) #2
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.image.sample.d.v8i32(<8 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2
-
-; Function Attrs: readnone
-declare <4 x float> @llvm.AMDGPU.cube(<4 x float>) #1
-
-; Function Attrs: readnone
-declare float @fabs(float) #1
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #2
-
-
-; Function Attrs: nounwind readnone
-declare float @llvm.pow.f32(float, float) #2
-
-; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #2
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-attributes #1 = { readnone }
-attributes #2 = { nounwind readnone }
-attributes #3 = { nounwind readonly }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
 
 !0 = !{!1, !1, i64 0, i32 1}
 !1 = !{!"const", !2}
diff --git a/test/CodeGen/AMDGPU/si-spill-cf.ll b/test/CodeGen/AMDGPU/si-spill-cf.ll
index 06f9277080a8595418a4815ec5c7f965adc9a527..926702645d9e42306ea30f30e22d9bc285f47e20 100644
--- a/test/CodeGen/AMDGPU/si-spill-cf.ll
+++ b/test/CodeGen/AMDGPU/si-spill-cf.ll
@@ -6,270 +6,271 @@
 
 ; SI: s_or_b64 exec, exec, [[SAVED:s\[[0-9]+:[0-9]+\]|[a-z]+]]
 ; SI-NOT: v_readlane_b32 [[SAVED]]
+
 define amdgpu_ps void @main() #0 {
 main_body:
-  %0 = call float @llvm.SI.load.const(<16 x i8> undef, i32 16)
-  %1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32)
-  %2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80)
-  %3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84)
-  %4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88)
-  %5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96)
-  %6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100)
-  %7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104)
-  %8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112)
-  %9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116)
-  %10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120)
-  %11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128)
-  %12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132)
-  %13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136)
-  %14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144)
-  %15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148)
-  %16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152)
-  %17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160)
-  %18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164)
-  %19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168)
-  %20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176)
-  %21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180)
-  %22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184)
-  %23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192)
-  %24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196)
-  %25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200)
-  %26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208)
-  %27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212)
-  %28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216)
-  %29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224)
-  %30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228)
-  %31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232)
-  %32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240)
-  %33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244)
-  %34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248)
-  %35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256)
-  %36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260)
-  %37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264)
-  %38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272)
-  %39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276)
-  %40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280)
-  %41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288)
-  %42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292)
-  %43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296)
-  %44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304)
-  %45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308)
-  %46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312)
-  %47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320)
-  %48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324)
-  %49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328)
-  %50 = call float @llvm.SI.load.const(<16 x i8> undef, i32 336)
-  %51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340)
-  %52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344)
-  %53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352)
-  %54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356)
-  %55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360)
-  %56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368)
-  %57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372)
-  %58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376)
-  %59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384)
-  %60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388)
-  %61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392)
-  %62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400)
-  %63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404)
-  %64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408)
-  %65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416)
-  %66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420)
+  %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 16)
+  %tmp1 = call float @llvm.SI.load.const(<16 x i8> undef, i32 32)
+  %tmp2 = call float @llvm.SI.load.const(<16 x i8> undef, i32 80)
+  %tmp3 = call float @llvm.SI.load.const(<16 x i8> undef, i32 84)
+  %tmp4 = call float @llvm.SI.load.const(<16 x i8> undef, i32 88)
+  %tmp5 = call float @llvm.SI.load.const(<16 x i8> undef, i32 96)
+  %tmp6 = call float @llvm.SI.load.const(<16 x i8> undef, i32 100)
+  %tmp7 = call float @llvm.SI.load.const(<16 x i8> undef, i32 104)
+  %tmp8 = call float @llvm.SI.load.const(<16 x i8> undef, i32 112)
+  %tmp9 = call float @llvm.SI.load.const(<16 x i8> undef, i32 116)
+  %tmp10 = call float @llvm.SI.load.const(<16 x i8> undef, i32 120)
+  %tmp11 = call float @llvm.SI.load.const(<16 x i8> undef, i32 128)
+  %tmp12 = call float @llvm.SI.load.const(<16 x i8> undef, i32 132)
+  %tmp13 = call float @llvm.SI.load.const(<16 x i8> undef, i32 136)
+  %tmp14 = call float @llvm.SI.load.const(<16 x i8> undef, i32 144)
+  %tmp15 = call float @llvm.SI.load.const(<16 x i8> undef, i32 148)
+  %tmp16 = call float @llvm.SI.load.const(<16 x i8> undef, i32 152)
+  %tmp17 = call float @llvm.SI.load.const(<16 x i8> undef, i32 160)
+  %tmp18 = call float @llvm.SI.load.const(<16 x i8> undef, i32 164)
+  %tmp19 = call float @llvm.SI.load.const(<16 x i8> undef, i32 168)
+  %tmp20 = call float @llvm.SI.load.const(<16 x i8> undef, i32 176)
+  %tmp21 = call float @llvm.SI.load.const(<16 x i8> undef, i32 180)
+  %tmp22 = call float @llvm.SI.load.const(<16 x i8> undef, i32 184)
+  %tmp23 = call float @llvm.SI.load.const(<16 x i8> undef, i32 192)
+  %tmp24 = call float @llvm.SI.load.const(<16 x i8> undef, i32 196)
+  %tmp25 = call float @llvm.SI.load.const(<16 x i8> undef, i32 200)
+  %tmp26 = call float @llvm.SI.load.const(<16 x i8> undef, i32 208)
+  %tmp27 = call float @llvm.SI.load.const(<16 x i8> undef, i32 212)
+  %tmp28 = call float @llvm.SI.load.const(<16 x i8> undef, i32 216)
+  %tmp29 = call float @llvm.SI.load.const(<16 x i8> undef, i32 224)
+  %tmp30 = call float @llvm.SI.load.const(<16 x i8> undef, i32 228)
+  %tmp31 = call float @llvm.SI.load.const(<16 x i8> undef, i32 232)
+  %tmp32 = call float @llvm.SI.load.const(<16 x i8> undef, i32 240)
+  %tmp33 = call float @llvm.SI.load.const(<16 x i8> undef, i32 244)
+  %tmp34 = call float @llvm.SI.load.const(<16 x i8> undef, i32 248)
+  %tmp35 = call float @llvm.SI.load.const(<16 x i8> undef, i32 256)
+  %tmp36 = call float @llvm.SI.load.const(<16 x i8> undef, i32 260)
+  %tmp37 = call float @llvm.SI.load.const(<16 x i8> undef, i32 264)
+  %tmp38 = call float @llvm.SI.load.const(<16 x i8> undef, i32 272)
+  %tmp39 = call float @llvm.SI.load.const(<16 x i8> undef, i32 276)
+  %tmp40 = call float @llvm.SI.load.const(<16 x i8> undef, i32 280)
+  %tmp41 = call float @llvm.SI.load.const(<16 x i8> undef, i32 288)
+  %tmp42 = call float @llvm.SI.load.const(<16 x i8> undef, i32 292)
+  %tmp43 = call float @llvm.SI.load.const(<16 x i8> undef, i32 296)
+  %tmp44 = call float @llvm.SI.load.const(<16 x i8> undef, i32 304)
+  %tmp45 = call float @llvm.SI.load.const(<16 x i8> undef, i32 308)
+  %tmp46 = call float @llvm.SI.load.const(<16 x i8> undef, i32 312)
+  %tmp47 = call float @llvm.SI.load.const(<16 x i8> undef, i32 320)
+  %tmp48 = call float @llvm.SI.load.const(<16 x i8> undef, i32 324)
+  %tmp49 = call float @llvm.SI.load.const(<16 x i8> undef, i32 328)
+  %tmp50 = call float @llvm.SI.load.const(<16 x i8> undef, i32 336)
+  %tmp51 = call float @llvm.SI.load.const(<16 x i8> undef, i32 340)
+  %tmp52 = call float @llvm.SI.load.const(<16 x i8> undef, i32 344)
+  %tmp53 = call float @llvm.SI.load.const(<16 x i8> undef, i32 352)
+  %tmp54 = call float @llvm.SI.load.const(<16 x i8> undef, i32 356)
+  %tmp55 = call float @llvm.SI.load.const(<16 x i8> undef, i32 360)
+  %tmp56 = call float @llvm.SI.load.const(<16 x i8> undef, i32 368)
+  %tmp57 = call float @llvm.SI.load.const(<16 x i8> undef, i32 372)
+  %tmp58 = call float @llvm.SI.load.const(<16 x i8> undef, i32 376)
+  %tmp59 = call float @llvm.SI.load.const(<16 x i8> undef, i32 384)
+  %tmp60 = call float @llvm.SI.load.const(<16 x i8> undef, i32 388)
+  %tmp61 = call float @llvm.SI.load.const(<16 x i8> undef, i32 392)
+  %tmp62 = call float @llvm.SI.load.const(<16 x i8> undef, i32 400)
+  %tmp63 = call float @llvm.SI.load.const(<16 x i8> undef, i32 404)
+  %tmp64 = call float @llvm.SI.load.const(<16 x i8> undef, i32 408)
+  %tmp65 = call float @llvm.SI.load.const(<16 x i8> undef, i32 416)
+  %tmp66 = call float @llvm.SI.load.const(<16 x i8> undef, i32 420)
   br label %LOOP
 
 LOOP:                                             ; preds = %ENDIF2795, %main_body
   %temp894.0 = phi float [ 0.000000e+00, %main_body ], [ %temp894.1, %ENDIF2795 ]
   %temp18.0 = phi float [ undef, %main_body ], [ %temp18.1, %ENDIF2795 ]
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
-  %67 = icmp sgt i32 %tid, 4
-  br i1 %67, label %ENDLOOP, label %ENDIF
+  %tmp67 = icmp sgt i32 %tid, 4
+  br i1 %tmp67, label %ENDLOOP, label %ENDIF
 
 ENDLOOP:                                          ; preds = %ELSE2566, %LOOP
-  %one.sub.a.i = fsub float 1.000000e+00, %0
+  %one.sub.a.i = fsub float 1.000000e+00, %tmp
   %one.sub.ac.i = fmul float %one.sub.a.i, undef
   %result.i = fadd float fmul (float undef, float undef), %one.sub.ac.i
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float undef, float %result.i, float undef, float 1.000000e+00)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float undef, float %result.i, float undef, float 1.000000e+00, i1 true, i1 true) #0
   ret void
 
 ENDIF:                                            ; preds = %LOOP
-  %68 = fsub float %2, undef
-  %69 = fsub float %3, undef
-  %70 = fsub float %4, undef
-  %71 = fmul float %68, 0.000000e+00
-  %72 = fmul float %69, undef
-  %73 = fmul float %70, undef
-  %74 = fsub float %6, undef
-  %75 = fsub float %7, undef
-  %76 = fmul float %74, undef
-  %77 = fmul float %75, 0.000000e+00
-  %78 = call float @llvm.minnum.f32(float %73, float %77)
-  %79 = call float @llvm.maxnum.f32(float %71, float 0.000000e+00)
-  %80 = call float @llvm.maxnum.f32(float %72, float %76)
-  %81 = call float @llvm.maxnum.f32(float undef, float %78)
-  %82 = call float @llvm.minnum.f32(float %79, float %80)
-  %83 = call float @llvm.minnum.f32(float %82, float undef)
-  %84 = fsub float %14, undef
-  %85 = fsub float %15, undef
-  %86 = fsub float %16, undef
-  %87 = fmul float %84, undef
-  %88 = fmul float %85, undef
-  %89 = fmul float %86, undef
-  %90 = fsub float %17, undef
-  %91 = fsub float %18, undef
-  %92 = fsub float %19, undef
-  %93 = fmul float %90, 0.000000e+00
-  %94 = fmul float %91, undef
-  %95 = fmul float %92, undef
-  %96 = call float @llvm.minnum.f32(float %88, float %94)
-  %97 = call float @llvm.maxnum.f32(float %87, float %93)
-  %98 = call float @llvm.maxnum.f32(float %89, float %95)
-  %99 = call float @llvm.maxnum.f32(float undef, float %96)
-  %100 = call float @llvm.maxnum.f32(float %99, float undef)
-  %101 = call float @llvm.minnum.f32(float %97, float undef)
-  %102 = call float @llvm.minnum.f32(float %101, float %98)
-  %103 = fsub float %30, undef
-  %104 = fsub float %31, undef
-  %105 = fmul float %103, 0.000000e+00
-  %106 = fmul float %104, 0.000000e+00
-  %107 = call float @llvm.minnum.f32(float undef, float %105)
-  %108 = call float @llvm.maxnum.f32(float undef, float %106)
-  %109 = call float @llvm.maxnum.f32(float undef, float %107)
-  %110 = call float @llvm.maxnum.f32(float %109, float undef)
-  %111 = call float @llvm.minnum.f32(float undef, float %108)
-  %112 = fsub float %32, undef
-  %113 = fsub float %33, undef
-  %114 = fsub float %34, undef
-  %115 = fmul float %112, 0.000000e+00
-  %116 = fmul float %113, undef
-  %117 = fmul float %114, undef
-  %118 = fsub float %35, undef
-  %119 = fsub float %36, undef
-  %120 = fsub float %37, undef
-  %121 = fmul float %118, undef
-  %122 = fmul float %119, undef
-  %123 = fmul float %120, undef
-  %124 = call float @llvm.minnum.f32(float %115, float %121)
-  %125 = call float @llvm.minnum.f32(float %116, float %122)
-  %126 = call float @llvm.minnum.f32(float %117, float %123)
-  %127 = call float @llvm.maxnum.f32(float %124, float %125)
-  %128 = call float @llvm.maxnum.f32(float %127, float %126)
-  %129 = fsub float %38, undef
-  %130 = fsub float %39, undef
-  %131 = fsub float %40, undef
-  %132 = fmul float %129, 0.000000e+00
-  %133 = fmul float %130, undef
-  %134 = fmul float %131, undef
-  %135 = fsub float %41, undef
-  %136 = fsub float %42, undef
-  %137 = fsub float %43, undef
-  %138 = fmul float %135, undef
-  %139 = fmul float %136, undef
-  %140 = fmul float %137, undef
-  %141 = call float @llvm.minnum.f32(float %132, float %138)
-  %142 = call float @llvm.minnum.f32(float %133, float %139)
-  %143 = call float @llvm.minnum.f32(float %134, float %140)
-  %144 = call float @llvm.maxnum.f32(float %141, float %142)
-  %145 = call float @llvm.maxnum.f32(float %144, float %143)
-  %146 = fsub float %44, undef
-  %147 = fsub float %45, undef
-  %148 = fsub float %46, undef
-  %149 = fmul float %146, 0.000000e+00
-  %150 = fmul float %147, 0.000000e+00
-  %151 = fmul float %148, undef
-  %152 = fsub float %47, undef
-  %153 = fsub float %48, undef
-  %154 = fsub float %49, undef
-  %155 = fmul float %152, undef
-  %156 = fmul float %153, 0.000000e+00
-  %157 = fmul float %154, undef
-  %158 = call float @llvm.minnum.f32(float %149, float %155)
-  %159 = call float @llvm.minnum.f32(float %150, float %156)
-  %160 = call float @llvm.minnum.f32(float %151, float %157)
-  %161 = call float @llvm.maxnum.f32(float %158, float %159)
-  %162 = call float @llvm.maxnum.f32(float %161, float %160)
-  %163 = fsub float %50, undef
-  %164 = fsub float %51, undef
-  %165 = fsub float %52, undef
-  %166 = fmul float %163, undef
-  %167 = fmul float %164, 0.000000e+00
-  %168 = fmul float %165, 0.000000e+00
-  %169 = fsub float %53, undef
-  %170 = fsub float %54, undef
-  %171 = fsub float %55, undef
-  %172 = fdiv float 1.000000e+00, %temp18.0
-  %173 = fmul float %169, undef
-  %174 = fmul float %170, undef
-  %175 = fmul float %171, %172
-  %176 = call float @llvm.minnum.f32(float %166, float %173)
-  %177 = call float @llvm.minnum.f32(float %167, float %174)
-  %178 = call float @llvm.minnum.f32(float %168, float %175)
-  %179 = call float @llvm.maxnum.f32(float %176, float %177)
-  %180 = call float @llvm.maxnum.f32(float %179, float %178)
-  %181 = fsub float %62, undef
-  %182 = fsub float %63, undef
-  %183 = fsub float %64, undef
-  %184 = fmul float %181, 0.000000e+00
-  %185 = fmul float %182, undef
-  %186 = fmul float %183, undef
-  %187 = fsub float %65, undef
-  %188 = fsub float %66, undef
-  %189 = fmul float %187, undef
-  %190 = fmul float %188, undef
-  %191 = call float @llvm.maxnum.f32(float %184, float %189)
-  %192 = call float @llvm.maxnum.f32(float %185, float %190)
-  %193 = call float @llvm.maxnum.f32(float %186, float undef)
-  %194 = call float @llvm.minnum.f32(float %191, float %192)
-  %195 = call float @llvm.minnum.f32(float %194, float %193)
-  %.temp292.7 = select i1 undef, float %162, float undef
-  %temp292.9 = select i1 false, float %180, float %.temp292.7
+  %tmp68 = fsub float %tmp2, undef
+  %tmp69 = fsub float %tmp3, undef
+  %tmp70 = fsub float %tmp4, undef
+  %tmp71 = fmul float %tmp68, 0.000000e+00
+  %tmp72 = fmul float %tmp69, undef
+  %tmp73 = fmul float %tmp70, undef
+  %tmp74 = fsub float %tmp6, undef
+  %tmp75 = fsub float %tmp7, undef
+  %tmp76 = fmul float %tmp74, undef
+  %tmp77 = fmul float %tmp75, 0.000000e+00
+  %tmp78 = call float @llvm.minnum.f32(float %tmp73, float %tmp77)
+  %tmp79 = call float @llvm.maxnum.f32(float %tmp71, float 0.000000e+00)
+  %tmp80 = call float @llvm.maxnum.f32(float %tmp72, float %tmp76)
+  %tmp81 = call float @llvm.maxnum.f32(float undef, float %tmp78)
+  %tmp82 = call float @llvm.minnum.f32(float %tmp79, float %tmp80)
+  %tmp83 = call float @llvm.minnum.f32(float %tmp82, float undef)
+  %tmp84 = fsub float %tmp14, undef
+  %tmp85 = fsub float %tmp15, undef
+  %tmp86 = fsub float %tmp16, undef
+  %tmp87 = fmul float %tmp84, undef
+  %tmp88 = fmul float %tmp85, undef
+  %tmp89 = fmul float %tmp86, undef
+  %tmp90 = fsub float %tmp17, undef
+  %tmp91 = fsub float %tmp18, undef
+  %tmp92 = fsub float %tmp19, undef
+  %tmp93 = fmul float %tmp90, 0.000000e+00
+  %tmp94 = fmul float %tmp91, undef
+  %tmp95 = fmul float %tmp92, undef
+  %tmp96 = call float @llvm.minnum.f32(float %tmp88, float %tmp94)
+  %tmp97 = call float @llvm.maxnum.f32(float %tmp87, float %tmp93)
+  %tmp98 = call float @llvm.maxnum.f32(float %tmp89, float %tmp95)
+  %tmp99 = call float @llvm.maxnum.f32(float undef, float %tmp96)
+  %tmp100 = call float @llvm.maxnum.f32(float %tmp99, float undef)
+  %tmp101 = call float @llvm.minnum.f32(float %tmp97, float undef)
+  %tmp102 = call float @llvm.minnum.f32(float %tmp101, float %tmp98)
+  %tmp103 = fsub float %tmp30, undef
+  %tmp104 = fsub float %tmp31, undef
+  %tmp105 = fmul float %tmp103, 0.000000e+00
+  %tmp106 = fmul float %tmp104, 0.000000e+00
+  %tmp107 = call float @llvm.minnum.f32(float undef, float %tmp105)
+  %tmp108 = call float @llvm.maxnum.f32(float undef, float %tmp106)
+  %tmp109 = call float @llvm.maxnum.f32(float undef, float %tmp107)
+  %tmp110 = call float @llvm.maxnum.f32(float %tmp109, float undef)
+  %tmp111 = call float @llvm.minnum.f32(float undef, float %tmp108)
+  %tmp112 = fsub float %tmp32, undef
+  %tmp113 = fsub float %tmp33, undef
+  %tmp114 = fsub float %tmp34, undef
+  %tmp115 = fmul float %tmp112, 0.000000e+00
+  %tmp116 = fmul float %tmp113, undef
+  %tmp117 = fmul float %tmp114, undef
+  %tmp118 = fsub float %tmp35, undef
+  %tmp119 = fsub float %tmp36, undef
+  %tmp120 = fsub float %tmp37, undef
+  %tmp121 = fmul float %tmp118, undef
+  %tmp122 = fmul float %tmp119, undef
+  %tmp123 = fmul float %tmp120, undef
+  %tmp124 = call float @llvm.minnum.f32(float %tmp115, float %tmp121)
+  %tmp125 = call float @llvm.minnum.f32(float %tmp116, float %tmp122)
+  %tmp126 = call float @llvm.minnum.f32(float %tmp117, float %tmp123)
+  %tmp127 = call float @llvm.maxnum.f32(float %tmp124, float %tmp125)
+  %tmp128 = call float @llvm.maxnum.f32(float %tmp127, float %tmp126)
+  %tmp129 = fsub float %tmp38, undef
+  %tmp130 = fsub float %tmp39, undef
+  %tmp131 = fsub float %tmp40, undef
+  %tmp132 = fmul float %tmp129, 0.000000e+00
+  %tmp133 = fmul float %tmp130, undef
+  %tmp134 = fmul float %tmp131, undef
+  %tmp135 = fsub float %tmp41, undef
+  %tmp136 = fsub float %tmp42, undef
+  %tmp137 = fsub float %tmp43, undef
+  %tmp138 = fmul float %tmp135, undef
+  %tmp139 = fmul float %tmp136, undef
+  %tmp140 = fmul float %tmp137, undef
+  %tmp141 = call float @llvm.minnum.f32(float %tmp132, float %tmp138)
+  %tmp142 = call float @llvm.minnum.f32(float %tmp133, float %tmp139)
+  %tmp143 = call float @llvm.minnum.f32(float %tmp134, float %tmp140)
+  %tmp144 = call float @llvm.maxnum.f32(float %tmp141, float %tmp142)
+  %tmp145 = call float @llvm.maxnum.f32(float %tmp144, float %tmp143)
+  %tmp146 = fsub float %tmp44, undef
+  %tmp147 = fsub float %tmp45, undef
+  %tmp148 = fsub float %tmp46, undef
+  %tmp149 = fmul float %tmp146, 0.000000e+00
+  %tmp150 = fmul float %tmp147, 0.000000e+00
+  %tmp151 = fmul float %tmp148, undef
+  %tmp152 = fsub float %tmp47, undef
+  %tmp153 = fsub float %tmp48, undef
+  %tmp154 = fsub float %tmp49, undef
+  %tmp155 = fmul float %tmp152, undef
+  %tmp156 = fmul float %tmp153, 0.000000e+00
+  %tmp157 = fmul float %tmp154, undef
+  %tmp158 = call float @llvm.minnum.f32(float %tmp149, float %tmp155)
+  %tmp159 = call float @llvm.minnum.f32(float %tmp150, float %tmp156)
+  %tmp160 = call float @llvm.minnum.f32(float %tmp151, float %tmp157)
+  %tmp161 = call float @llvm.maxnum.f32(float %tmp158, float %tmp159)
+  %tmp162 = call float @llvm.maxnum.f32(float %tmp161, float %tmp160)
+  %tmp163 = fsub float %tmp50, undef
+  %tmp164 = fsub float %tmp51, undef
+  %tmp165 = fsub float %tmp52, undef
+  %tmp166 = fmul float %tmp163, undef
+  %tmp167 = fmul float %tmp164, 0.000000e+00
+  %tmp168 = fmul float %tmp165, 0.000000e+00
+  %tmp169 = fsub float %tmp53, undef
+  %tmp170 = fsub float %tmp54, undef
+  %tmp171 = fsub float %tmp55, undef
+  %tmp172 = fdiv float 1.000000e+00, %temp18.0
+  %tmp173 = fmul float %tmp169, undef
+  %tmp174 = fmul float %tmp170, undef
+  %tmp175 = fmul float %tmp171, %tmp172
+  %tmp176 = call float @llvm.minnum.f32(float %tmp166, float %tmp173)
+  %tmp177 = call float @llvm.minnum.f32(float %tmp167, float %tmp174)
+  %tmp178 = call float @llvm.minnum.f32(float %tmp168, float %tmp175)
+  %tmp179 = call float @llvm.maxnum.f32(float %tmp176, float %tmp177)
+  %tmp180 = call float @llvm.maxnum.f32(float %tmp179, float %tmp178)
+  %tmp181 = fsub float %tmp62, undef
+  %tmp182 = fsub float %tmp63, undef
+  %tmp183 = fsub float %tmp64, undef
+  %tmp184 = fmul float %tmp181, 0.000000e+00
+  %tmp185 = fmul float %tmp182, undef
+  %tmp186 = fmul float %tmp183, undef
+  %tmp187 = fsub float %tmp65, undef
+  %tmp188 = fsub float %tmp66, undef
+  %tmp189 = fmul float %tmp187, undef
+  %tmp190 = fmul float %tmp188, undef
+  %tmp191 = call float @llvm.maxnum.f32(float %tmp184, float %tmp189)
+  %tmp192 = call float @llvm.maxnum.f32(float %tmp185, float %tmp190)
+  %tmp193 = call float @llvm.maxnum.f32(float %tmp186, float undef)
+  %tmp194 = call float @llvm.minnum.f32(float %tmp191, float %tmp192)
+  %tmp195 = call float @llvm.minnum.f32(float %tmp194, float %tmp193)
+  %.temp292.7 = select i1 undef, float %tmp162, float undef
+  %temp292.9 = select i1 false, float %tmp180, float %.temp292.7
   %.temp292.9 = select i1 undef, float undef, float %temp292.9
-  %196 = fcmp ogt float undef, 0.000000e+00
-  %197 = fcmp olt float undef, %195
-  %198 = and i1 %196, %197
-  %199 = fcmp olt float undef, %.temp292.9
-  %200 = and i1 %198, %199
-  %temp292.11 = select i1 %200, float undef, float %.temp292.9
-  %tid0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+  %tmp196 = fcmp ogt float undef, 0.000000e+00
+  %tmp197 = fcmp olt float undef, %tmp195
+  %tmp198 = and i1 %tmp196, %tmp197
+  %tmp199 = fcmp olt float undef, %.temp292.9
+  %tmp200 = and i1 %tmp198, %tmp199
+  %temp292.11 = select i1 %tmp200, float undef, float %.temp292.9
+  %tid0 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %cmp0 = icmp eq i32 %tid0, 0
   br i1 %cmp0, label %IF2565, label %ELSE2566
 
 IF2565:                                           ; preds = %ENDIF
-  %tid1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+  %tid1 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %cmp1 = icmp eq i32 %tid1, 0
   br i1 %cmp1, label %ENDIF2582, label %ELSE2584
 
 ELSE2566:                                         ; preds = %ENDIF
-  %tid2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+  %tid2 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tidf = bitcast i32 %tid2 to float
-  %201 = fcmp oeq float %temp292.11, %tidf
-  br i1 %201, label %ENDLOOP, label %ELSE2593
+  %tmp201 = fcmp oeq float %temp292.11, %tidf
+  br i1 %tmp201, label %ENDLOOP, label %ELSE2593
 
 ENDIF2564:                                        ; preds = %ENDIF2594, %ENDIF2588
   %temp894.1 = phi float [ undef, %ENDIF2588 ], [ %temp894.2, %ENDIF2594 ]
-  %temp18.1 = phi float [ %218, %ENDIF2588 ], [ undef, %ENDIF2594 ]
-  %202 = fsub float %5, undef
-  %203 = fmul float %202, undef
-  %204 = call float @llvm.maxnum.f32(float undef, float %203)
-  %205 = call float @llvm.minnum.f32(float %204, float undef)
-  %206 = call float @llvm.minnum.f32(float %205, float undef)
-  %207 = fcmp ogt float undef, 0.000000e+00
-  %208 = fcmp olt float undef, 1.000000e+00
-  %209 = and i1 %207, %208
-  %tid3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+  %temp18.1 = phi float [ %tmp218, %ENDIF2588 ], [ undef, %ENDIF2594 ]
+  %tmp202 = fsub float %tmp5, undef
+  %tmp203 = fmul float %tmp202, undef
+  %tmp204 = call float @llvm.maxnum.f32(float undef, float %tmp203)
+  %tmp205 = call float @llvm.minnum.f32(float %tmp204, float undef)
+  %tmp206 = call float @llvm.minnum.f32(float %tmp205, float undef)
+  %tmp207 = fcmp ogt float undef, 0.000000e+00
+  %tmp208 = fcmp olt float undef, 1.000000e+00
+  %tmp209 = and i1 %tmp207, %tmp208
+  %tid3 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tidf3 = bitcast i32 %tid3 to float
-  %210 = fcmp olt float %tidf3, %206
-  %211 = and i1 %209, %210
-  br i1 %211, label %ENDIF2795, label %ELSE2797
+  %tmp210 = fcmp olt float %tidf3, %tmp206
+  %tmp211 = and i1 %tmp209, %tmp210
+  br i1 %tmp211, label %ENDIF2795, label %ELSE2797
 
 ELSE2584:                                         ; preds = %IF2565
   br label %ENDIF2582
 
 ENDIF2582:                                        ; preds = %ELSE2584, %IF2565
-  %212 = fadd float %1, undef
-  %213 = fadd float 0.000000e+00, %212
-  %floor = call float @llvm.floor.f32(float %213)
-  %214 = fsub float %213, %floor
-  %tid4 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #2
+  %tmp212 = fadd float %tmp1, undef
+  %tmp213 = fadd float 0.000000e+00, %tmp212
+  %floor = call float @llvm.floor.f32(float %tmp213)
+  %tmp214 = fsub float %tmp213, %floor
+  %tid4 = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %cmp4 = icmp eq i32 %tid4, 0
   br i1 %cmp4, label %IF2589, label %ELSE2590
 
@@ -280,61 +281,61 @@ ELSE2590:                                         ; preds = %ENDIF2582
   br label %ENDIF2588
 
 ENDIF2588:                                        ; preds = %ELSE2590, %IF2589
-  %215 = fsub float 1.000000e+00, %214
-  %216 = call float @llvm.sqrt.f32(float %215)
-  %217 = fmul float %216, undef
-  %218 = fadd float %217, undef
+  %tmp215 = fsub float 1.000000e+00, %tmp214
+  %tmp216 = call float @llvm.sqrt.f32(float %tmp215)
+  %tmp217 = fmul float %tmp216, undef
+  %tmp218 = fadd float %tmp217, undef
   br label %ENDIF2564
 
 ELSE2593:                                         ; preds = %ELSE2566
-  %219 = fcmp oeq float %temp292.11, %81
-  %220 = fcmp olt float %81, %83
-  %221 = and i1 %219, %220
-  br i1 %221, label %ENDIF2594, label %ELSE2596
+  %tmp219 = fcmp oeq float %temp292.11, %tmp81
+  %tmp220 = fcmp olt float %tmp81, %tmp83
+  %tmp221 = and i1 %tmp219, %tmp220
+  br i1 %tmp221, label %ENDIF2594, label %ELSE2596
 
 ELSE2596:                                         ; preds = %ELSE2593
-  %222 = fcmp oeq float %temp292.11, %100
-  %223 = fcmp olt float %100, %102
-  %224 = and i1 %222, %223
-  br i1 %224, label %ENDIF2594, label %ELSE2632
+  %tmp222 = fcmp oeq float %temp292.11, %tmp100
+  %tmp223 = fcmp olt float %tmp100, %tmp102
+  %tmp224 = and i1 %tmp222, %tmp223
+  br i1 %tmp224, label %ENDIF2594, label %ELSE2632
 
 ENDIF2594:                                        ; preds = %ELSE2788, %ELSE2785, %ELSE2782, %ELSE2779, %IF2775, %ELSE2761, %ELSE2758, %IF2757, %ELSE2704, %ELSE2686, %ELSE2671, %ELSE2668, %IF2667, %ELSE2632, %ELSE2596, %ELSE2593
   %temp894.2 = phi float [ 0.000000e+00, %IF2667 ], [ 0.000000e+00, %ELSE2671 ], [ 0.000000e+00, %IF2757 ], [ 0.000000e+00, %ELSE2761 ], [ %temp894.0, %ELSE2758 ], [ 0.000000e+00, %IF2775 ], [ 0.000000e+00, %ELSE2779 ], [ 0.000000e+00, %ELSE2782 ], [ %.2848, %ELSE2788 ], [ 0.000000e+00, %ELSE2785 ], [ 0.000000e+00, %ELSE2593 ], [ 0.000000e+00, %ELSE2632 ], [ 0.000000e+00, %ELSE2704 ], [ 0.000000e+00, %ELSE2686 ], [ 0.000000e+00, %ELSE2668 ], [ 0.000000e+00, %ELSE2596 ]
-  %225 = fmul float %temp894.2, undef
+  %tmp225 = fmul float %temp894.2, undef
   br label %ENDIF2564
 
 ELSE2632:                                         ; preds = %ELSE2596
   br i1 undef, label %ENDIF2594, label %ELSE2650
 
 ELSE2650:                                         ; preds = %ELSE2632
-  %226 = fcmp oeq float %temp292.11, %110
-  %227 = fcmp olt float %110, %111
-  %228 = and i1 %226, %227
-  br i1 %228, label %IF2667, label %ELSE2668
+  %tmp226 = fcmp oeq float %temp292.11, %tmp110
+  %tmp227 = fcmp olt float %tmp110, %tmp111
+  %tmp228 = and i1 %tmp226, %tmp227
+  br i1 %tmp228, label %IF2667, label %ELSE2668
 
 IF2667:                                           ; preds = %ELSE2650
   br i1 undef, label %ENDIF2594, label %ELSE2671
 
 ELSE2668:                                         ; preds = %ELSE2650
-  %229 = fcmp oeq float %temp292.11, %128
-  %230 = fcmp olt float %128, undef
-  %231 = and i1 %229, %230
-  br i1 %231, label %ENDIF2594, label %ELSE2686
+  %tmp229 = fcmp oeq float %temp292.11, %tmp128
+  %tmp230 = fcmp olt float %tmp128, undef
+  %tmp231 = and i1 %tmp229, %tmp230
+  br i1 %tmp231, label %ENDIF2594, label %ELSE2686
 
 ELSE2671:                                         ; preds = %IF2667
   br label %ENDIF2594
 
 ELSE2686:                                         ; preds = %ELSE2668
-  %232 = fcmp oeq float %temp292.11, %145
-  %233 = fcmp olt float %145, undef
-  %234 = and i1 %232, %233
-  br i1 %234, label %ENDIF2594, label %ELSE2704
+  %tmp232 = fcmp oeq float %temp292.11, %tmp145
+  %tmp233 = fcmp olt float %tmp145, undef
+  %tmp234 = and i1 %tmp232, %tmp233
+  br i1 %tmp234, label %ENDIF2594, label %ELSE2704
 
 ELSE2704:                                         ; preds = %ELSE2686
-  %235 = fcmp oeq float %temp292.11, %180
-  %236 = fcmp olt float %180, undef
-  %237 = and i1 %235, %236
-  br i1 %237, label %ENDIF2594, label %ELSE2740
+  %tmp235 = fcmp oeq float %temp292.11, %tmp180
+  %tmp236 = fcmp olt float %tmp180, undef
+  %tmp237 = and i1 %tmp235, %tmp236
+  br i1 %tmp237, label %ENDIF2594, label %ELSE2740
 
 ELSE2740:                                         ; preds = %ELSE2704
   br i1 undef, label %IF2757, label %ELSE2758
@@ -349,8 +350,8 @@ ELSE2761:                                         ; preds = %IF2757
   br label %ENDIF2594
 
 IF2775:                                           ; preds = %ELSE2758
-  %238 = fcmp olt float undef, undef
-  br i1 %238, label %ENDIF2594, label %ELSE2779
+  %tmp238 = fcmp olt float undef, undef
+  br i1 %tmp238, label %ENDIF2594, label %ELSE2779
 
 ELSE2779:                                         ; preds = %IF2775
   br i1 undef, label %ENDIF2594, label %ELSE2782
@@ -359,39 +360,39 @@ ELSE2782:                                         ; preds = %ELSE2779
   br i1 undef, label %ENDIF2594, label %ELSE2785
 
 ELSE2785:                                         ; preds = %ELSE2782
-  %239 = fcmp olt float undef, 0.000000e+00
-  br i1 %239, label %ENDIF2594, label %ELSE2788
+  %tmp239 = fcmp olt float undef, 0.000000e+00
+  br i1 %tmp239, label %ENDIF2594, label %ELSE2788
 
 ELSE2788:                                         ; preds = %ELSE2785
-  %240 = fcmp olt float 0.000000e+00, undef
-  %.2848 = select i1 %240, float -1.000000e+00, float 1.000000e+00
+  %tmp240 = fcmp olt float 0.000000e+00, undef
+  %.2848 = select i1 %tmp240, float -1.000000e+00, float 1.000000e+00
   br label %ENDIF2594
 
 ELSE2797:                                         ; preds = %ENDIF2564
-  %241 = fsub float %8, undef
-  %242 = fsub float %9, undef
-  %243 = fsub float %10, undef
-  %244 = fmul float %241, undef
-  %245 = fmul float %242, undef
-  %246 = fmul float %243, undef
-  %247 = fsub float %11, undef
-  %248 = fsub float %12, undef
-  %249 = fsub float %13, undef
-  %250 = fmul float %247, undef
-  %251 = fmul float %248, undef
-  %252 = fmul float %249, undef
-  %253 = call float @llvm.minnum.f32(float %244, float %250)
-  %254 = call float @llvm.minnum.f32(float %245, float %251)
-  %255 = call float @llvm.maxnum.f32(float %246, float %252)
-  %256 = call float @llvm.maxnum.f32(float %253, float %254)
-  %257 = call float @llvm.maxnum.f32(float %256, float undef)
-  %258 = call float @llvm.minnum.f32(float undef, float %255)
-  %259 = fcmp ogt float %257, 0.000000e+00
-  %260 = fcmp olt float %257, 1.000000e+00
-  %261 = and i1 %259, %260
-  %262 = fcmp olt float %257, %258
-  %263 = and i1 %261, %262
-  br i1 %263, label %ENDIF2795, label %ELSE2800
+  %tmp241 = fsub float %tmp8, undef
+  %tmp242 = fsub float %tmp9, undef
+  %tmp243 = fsub float %tmp10, undef
+  %tmp244 = fmul float %tmp241, undef
+  %tmp245 = fmul float %tmp242, undef
+  %tmp246 = fmul float %tmp243, undef
+  %tmp247 = fsub float %tmp11, undef
+  %tmp248 = fsub float %tmp12, undef
+  %tmp249 = fsub float %tmp13, undef
+  %tmp250 = fmul float %tmp247, undef
+  %tmp251 = fmul float %tmp248, undef
+  %tmp252 = fmul float %tmp249, undef
+  %tmp253 = call float @llvm.minnum.f32(float %tmp244, float %tmp250)
+  %tmp254 = call float @llvm.minnum.f32(float %tmp245, float %tmp251)
+  %tmp255 = call float @llvm.maxnum.f32(float %tmp246, float %tmp252)
+  %tmp256 = call float @llvm.maxnum.f32(float %tmp253, float %tmp254)
+  %tmp257 = call float @llvm.maxnum.f32(float %tmp256, float undef)
+  %tmp258 = call float @llvm.minnum.f32(float undef, float %tmp255)
+  %tmp259 = fcmp ogt float %tmp257, 0.000000e+00
+  %tmp260 = fcmp olt float %tmp257, 1.000000e+00
+  %tmp261 = and i1 %tmp259, %tmp260
+  %tmp262 = fcmp olt float %tmp257, %tmp258
+  %tmp263 = and i1 %tmp261, %tmp262
+  br i1 %tmp263, label %ENDIF2795, label %ELSE2800
 
 ENDIF2795:                                        ; preds = %ELSE2824, %ELSE2821, %ELSE2818, %ELSE2815, %ELSE2812, %ELSE2809, %ELSE2806, %ELSE2803, %ELSE2800, %ELSE2797, %ENDIF2564
   br label %LOOP
@@ -400,53 +401,53 @@ ELSE2800:                                         ; preds = %ELSE2797
   br i1 undef, label %ENDIF2795, label %ELSE2803
 
 ELSE2803:                                         ; preds = %ELSE2800
-  %264 = fsub float %20, undef
-  %265 = fsub float %21, undef
-  %266 = fsub float %22, undef
-  %267 = fmul float %264, undef
-  %268 = fmul float %265, undef
-  %269 = fmul float %266, 0.000000e+00
-  %270 = fsub float %23, undef
-  %271 = fsub float %24, undef
-  %272 = fsub float %25, undef
-  %273 = fmul float %270, undef
-  %274 = fmul float %271, undef
-  %275 = fmul float %272, undef
-  %276 = call float @llvm.minnum.f32(float %267, float %273)
-  %277 = call float @llvm.maxnum.f32(float %268, float %274)
-  %278 = call float @llvm.maxnum.f32(float %269, float %275)
-  %279 = call float @llvm.maxnum.f32(float %276, float undef)
-  %280 = call float @llvm.maxnum.f32(float %279, float undef)
-  %281 = call float @llvm.minnum.f32(float undef, float %277)
-  %282 = call float @llvm.minnum.f32(float %281, float %278)
-  %283 = fcmp ogt float %280, 0.000000e+00
-  %284 = fcmp olt float %280, 1.000000e+00
-  %285 = and i1 %283, %284
-  %286 = fcmp olt float %280, %282
-  %287 = and i1 %285, %286
-  br i1 %287, label %ENDIF2795, label %ELSE2806
+  %tmp264 = fsub float %tmp20, undef
+  %tmp265 = fsub float %tmp21, undef
+  %tmp266 = fsub float %tmp22, undef
+  %tmp267 = fmul float %tmp264, undef
+  %tmp268 = fmul float %tmp265, undef
+  %tmp269 = fmul float %tmp266, 0.000000e+00
+  %tmp270 = fsub float %tmp23, undef
+  %tmp271 = fsub float %tmp24, undef
+  %tmp272 = fsub float %tmp25, undef
+  %tmp273 = fmul float %tmp270, undef
+  %tmp274 = fmul float %tmp271, undef
+  %tmp275 = fmul float %tmp272, undef
+  %tmp276 = call float @llvm.minnum.f32(float %tmp267, float %tmp273)
+  %tmp277 = call float @llvm.maxnum.f32(float %tmp268, float %tmp274)
+  %tmp278 = call float @llvm.maxnum.f32(float %tmp269, float %tmp275)
+  %tmp279 = call float @llvm.maxnum.f32(float %tmp276, float undef)
+  %tmp280 = call float @llvm.maxnum.f32(float %tmp279, float undef)
+  %tmp281 = call float @llvm.minnum.f32(float undef, float %tmp277)
+  %tmp282 = call float @llvm.minnum.f32(float %tmp281, float %tmp278)
+  %tmp283 = fcmp ogt float %tmp280, 0.000000e+00
+  %tmp284 = fcmp olt float %tmp280, 1.000000e+00
+  %tmp285 = and i1 %tmp283, %tmp284
+  %tmp286 = fcmp olt float %tmp280, %tmp282
+  %tmp287 = and i1 %tmp285, %tmp286
+  br i1 %tmp287, label %ENDIF2795, label %ELSE2806
 
 ELSE2806:                                         ; preds = %ELSE2803
-  %288 = fsub float %26, undef
-  %289 = fsub float %27, undef
-  %290 = fsub float %28, undef
-  %291 = fmul float %288, undef
-  %292 = fmul float %289, 0.000000e+00
-  %293 = fmul float %290, undef
-  %294 = fsub float %29, undef
-  %295 = fmul float %294, undef
-  %296 = call float @llvm.minnum.f32(float %291, float %295)
-  %297 = call float @llvm.minnum.f32(float %292, float undef)
-  %298 = call float @llvm.maxnum.f32(float %293, float undef)
-  %299 = call float @llvm.maxnum.f32(float %296, float %297)
-  %300 = call float @llvm.maxnum.f32(float %299, float undef)
-  %301 = call float @llvm.minnum.f32(float undef, float %298)
-  %302 = fcmp ogt float %300, 0.000000e+00
-  %303 = fcmp olt float %300, 1.000000e+00
-  %304 = and i1 %302, %303
-  %305 = fcmp olt float %300, %301
-  %306 = and i1 %304, %305
-  br i1 %306, label %ENDIF2795, label %ELSE2809
+  %tmp288 = fsub float %tmp26, undef
+  %tmp289 = fsub float %tmp27, undef
+  %tmp290 = fsub float %tmp28, undef
+  %tmp291 = fmul float %tmp288, undef
+  %tmp292 = fmul float %tmp289, 0.000000e+00
+  %tmp293 = fmul float %tmp290, undef
+  %tmp294 = fsub float %tmp29, undef
+  %tmp295 = fmul float %tmp294, undef
+  %tmp296 = call float @llvm.minnum.f32(float %tmp291, float %tmp295)
+  %tmp297 = call float @llvm.minnum.f32(float %tmp292, float undef)
+  %tmp298 = call float @llvm.maxnum.f32(float %tmp293, float undef)
+  %tmp299 = call float @llvm.maxnum.f32(float %tmp296, float %tmp297)
+  %tmp300 = call float @llvm.maxnum.f32(float %tmp299, float undef)
+  %tmp301 = call float @llvm.minnum.f32(float undef, float %tmp298)
+  %tmp302 = fcmp ogt float %tmp300, 0.000000e+00
+  %tmp303 = fcmp olt float %tmp300, 1.000000e+00
+  %tmp304 = and i1 %tmp302, %tmp303
+  %tmp305 = fcmp olt float %tmp300, %tmp301
+  %tmp306 = and i1 %tmp304, %tmp305
+  br i1 %tmp306, label %ENDIF2795, label %ELSE2809
 
 ELSE2809:                                         ; preds = %ELSE2806
   br i1 undef, label %ENDIF2795, label %ELSE2812
@@ -461,53 +462,42 @@ ELSE2818:                                         ; preds = %ELSE2815
   br i1 undef, label %ENDIF2795, label %ELSE2821
 
 ELSE2821:                                         ; preds = %ELSE2818
-  %307 = fsub float %56, undef
-  %308 = fsub float %57, undef
-  %309 = fsub float %58, undef
-  %310 = fmul float %307, undef
-  %311 = fmul float %308, 0.000000e+00
-  %312 = fmul float %309, undef
-  %313 = fsub float %59, undef
-  %314 = fsub float %60, undef
-  %315 = fsub float %61, undef
-  %316 = fmul float %313, undef
-  %317 = fmul float %314, undef
-  %318 = fmul float %315, undef
-  %319 = call float @llvm.maxnum.f32(float %310, float %316)
-  %320 = call float @llvm.maxnum.f32(float %311, float %317)
-  %321 = call float @llvm.maxnum.f32(float %312, float %318)
-  %322 = call float @llvm.minnum.f32(float %319, float %320)
-  %323 = call float @llvm.minnum.f32(float %322, float %321)
-  %324 = fcmp ogt float undef, 0.000000e+00
-  %325 = fcmp olt float undef, 1.000000e+00
-  %326 = and i1 %324, %325
-  %327 = fcmp olt float undef, %323
-  %328 = and i1 %326, %327
-  br i1 %328, label %ENDIF2795, label %ELSE2824
+  %tmp307 = fsub float %tmp56, undef
+  %tmp308 = fsub float %tmp57, undef
+  %tmp309 = fsub float %tmp58, undef
+  %tmp310 = fmul float %tmp307, undef
+  %tmp311 = fmul float %tmp308, 0.000000e+00
+  %tmp312 = fmul float %tmp309, undef
+  %tmp313 = fsub float %tmp59, undef
+  %tmp314 = fsub float %tmp60, undef
+  %tmp315 = fsub float %tmp61, undef
+  %tmp316 = fmul float %tmp313, undef
+  %tmp317 = fmul float %tmp314, undef
+  %tmp318 = fmul float %tmp315, undef
+  %tmp319 = call float @llvm.maxnum.f32(float %tmp310, float %tmp316)
+  %tmp320 = call float @llvm.maxnum.f32(float %tmp311, float %tmp317)
+  %tmp321 = call float @llvm.maxnum.f32(float %tmp312, float %tmp318)
+  %tmp322 = call float @llvm.minnum.f32(float %tmp319, float %tmp320)
+  %tmp323 = call float @llvm.minnum.f32(float %tmp322, float %tmp321)
+  %tmp324 = fcmp ogt float undef, 0.000000e+00
+  %tmp325 = fcmp olt float undef, 1.000000e+00
+  %tmp326 = and i1 %tmp324, %tmp325
+  %tmp327 = fcmp olt float undef, %tmp323
+  %tmp328 = and i1 %tmp326, %tmp327
+  br i1 %tmp328, label %ENDIF2795, label %ELSE2824
 
 ELSE2824:                                         ; preds = %ELSE2821
   %.2849 = select i1 undef, float 0.000000e+00, float 1.000000e+00
   br label %ENDIF2795
 }
 
-declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
-
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-
-; Function Attrs: nounwind readnone
 declare float @llvm.floor.f32(float) #1
-
-; Function Attrs: nounwind readnone
 declare float @llvm.sqrt.f32(float) #1
-
-; Function Attrs: nounwind readnone
 declare float @llvm.minnum.f32(float, float) #1
-
-; Function Attrs: nounwind readnone
 declare float @llvm.maxnum.f32(float, float) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
index 062f5245af10c3cab1ca9a91fc437fb9e1d71ce1..114c97b61bd40bb343f76fd3fb6ab19c8367ddbe 100644
--- a/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
+++ b/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll
@@ -8,7 +8,7 @@
 ; ALL: s_mov_b32 s[[HI:[0-9]+]], 0xe80000
 
 ; Make sure we are handling hazards correctly.
-; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:12
+; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], s{{[0-9]+}} offset:16
 ; SGPR-NEXT: s_waitcnt vmcnt(0)
 ; SGPR-NEXT: v_readfirstlane_b32 s[[HI:[0-9]+]], [[VHI]]
 ; SGPR-NEXT: s_nop 4
@@ -16,15 +16,15 @@
 
 ; Make sure scratch wave offset register is correctly incremented and
 ; then restored.
-; SMEM: s_mov_b32 m0, s[[OFF]]{{$}}
+; SMEM: s_add_u32 m0, s[[OFF]], 0x100{{$}}
 ; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LO]]:[[HI]]], m0 ; 16-byte Folded Spill
 
-; SMEM: s_mov_b32 m0, s[[OFF]]{{$}}
+; SMEM: s_add_u32 m0, s[[OFF]], 0x100{{$}}
 ; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[LO]]:[[HI]]], m0 ; 16-byte Folded Reload
 
 ; SMEM: s_dcache_wb
 ; ALL: s_endpgm
-define void @test(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) {
   call void asm sideeffect "", "~{SGPR0_SGPR1_SGPR2_SGPR3_SGPR4_SGPR5_SGPR6_SGPR7}" ()
   call void asm sideeffect "", "~{SGPR8_SGPR9_SGPR10_SGPR11_SGPR12_SGPR13_SGPR14_SGPR15}" ()
   call void asm sideeffect "", "~{SGPR16_SGPR17_SGPR18_SGPR19_SGPR20_SGPR21_SGPR22_SGPR23}" ()
diff --git a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
index 4beefb047f2211c2310b00c075bbfded657337c6..8a4cee264fd80d4d3c1d671afad71abab98f631c 100644
--- a/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
+++ b/test/CodeGen/AMDGPU/si-triv-disjoint-mem-access.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -enable-amdgpu-aa=0 -verify-machineinstrs -enable-misched -enable-aa-sched-mi < %s | FileCheck -check-prefix=FUNC -check-prefix=CI %s
 
 declare void @llvm.SI.tbuffer.store.i32(<16 x i8>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
 declare void @llvm.SI.tbuffer.store.v4i32(<16 x i8>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)
@@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #2
 ; FUNC-LABEL: @reorder_local_load_global_store_local_load
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:1 offset1:3
 ; CI: buffer_store_dword
-define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define amdgpu_kernel void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
 
   %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
@@ -33,7 +33,7 @@ define void @reorder_local_load_global_store_local_load(i32 addrspace(1)* %out,
 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
 ; CI: buffer_store_dword
 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
-define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define amdgpu_kernel void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
 
   %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
@@ -53,7 +53,7 @@ define void @no_reorder_local_load_volatile_global_store_local_load(i32 addrspac
 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:4
 ; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
 ; CI: buffer_store_dword
-define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define amdgpu_kernel void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(3)*, i32 addrspace(3)* addrspace(3)* @stored_lds_ptr, align 4
 
   %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 1
@@ -77,7 +77,7 @@ define void @no_reorder_barrier_local_load_global_store_local_load(i32 addrspace
 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x1
 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3
 ; CI: buffer_store_dword
-define void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
+define amdgpu_kernel void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(1)* %gptr) #0 {
   %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
 
   %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
@@ -100,7 +100,7 @@ define void @reorder_constant_load_global_store_constant_load(i32 addrspace(1)*
 ; CI-DAG: s_load_dword s{{[0-9]+}}, s{{\[}}[[PTR_LO]]:[[PTR_HI]]{{\]}}, 0x3
 ; CI: ds_write_b32
 ; CI: buffer_store_dword
-define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
+define amdgpu_kernel void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr) #0 {
   %ptr0 = load i32 addrspace(2)*, i32 addrspace(2)* addrspace(3)* @stored_constant_ptr, align 8
 
   %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
@@ -122,7 +122,7 @@ define void @reorder_constant_load_local_store_constant_load(i32 addrspace(1)* %
 ; CI: s_load_dword
 ; CI: ds_write_b32
 ; CI: buffer_store_dword
-define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 {
+define amdgpu_kernel void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32 addrspace(3)* noalias %lptr, i32 addrspace(2)* %ptr0) #0 {
   %ptr1 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 1
   %ptr2 = getelementptr inbounds i32, i32 addrspace(2)* %ptr0, i64 2
 
@@ -141,7 +141,7 @@ define void @reorder_smrd_load_local_store_smrd_load(i32 addrspace(1)* %out, i32
 ; CI: buffer_load_dword
 ; CI: buffer_load_dword
 ; CI: buffer_store_dword
-define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
+define amdgpu_kernel void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out, i32 addrspace(3)* %lptr, i32 addrspace(1)* %ptr0) #0 {
   %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 1
   %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i64 3
 
@@ -157,12 +157,11 @@ define void @reorder_global_load_local_store_global_load(i32 addrspace(1)* %out,
 
 ; FUNC-LABEL: @reorder_local_offsets
 ; CI: ds_read2_b32 {{v\[[0-9]+:[0-9]+\]}}, {{v[0-9]+}} offset0:100 offset1:102
-; CI: ds_write2_b32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} offset0:3 offset1:100
-; CI: ds_read_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:12
-; CI: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
+; CI-DAG: ds_write2_b32 {{v[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:3 offset1:100
+; CI-DAG: ds_write_b32 {{v[0-9]+}}, {{v[0-9]+}} offset:408
 ; CI: buffer_store_dword
 ; CI: s_endpgm
-define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
+define amdgpu_kernel void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(3)* noalias nocapture %ptr0) #0 {
   %ptr1 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 3
   %ptr2 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 100
   %ptr3 = getelementptr inbounds i32, i32 addrspace(3)* %ptr0, i32 102
@@ -181,14 +180,14 @@ define void @reorder_local_offsets(i32 addrspace(1)* nocapture %out, i32 addrspa
 }
 
 ; FUNC-LABEL: @reorder_global_offsets
-; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
-; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
-; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
-; CI: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
-; CI: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
+; CI-DAG: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI-DAG: buffer_load_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
+; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:12
+; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:400
+; CI-DAG: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0 offset:408
+; CI: buffer_store_dword
 ; CI: s_endpgm
-define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
+define amdgpu_kernel void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* noalias nocapture readnone %gptr, i32 addrspace(1)* noalias nocapture %ptr0) #0 {
   %ptr1 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 3
   %ptr2 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 100
   %ptr3 = getelementptr inbounds i32, i32 addrspace(1)* %ptr0, i32 102
@@ -222,7 +221,7 @@ define void @reorder_global_offsets(i32 addrspace(1)* nocapture %out, i32 addrsp
 
 ; GCN: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:36{{$}}
 ; GCN-NEXT: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}} 0 addr64 offset:52{{$}}
-define void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 {
+define amdgpu_kernel void @reorder_global_offsets_addr64_soffset0(i32 addrspace(1)* noalias nocapture %ptr.base) #0 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
 
diff --git a/test/CodeGen/AMDGPU/si-vector-hang.ll b/test/CodeGen/AMDGPU/si-vector-hang.ll
index dd8783df5c3cbfd99dbbb8c68b4ad97a3d9923bf..7990990478af73cf79ea35aec7b16d9af56bc972 100644
--- a/test/CodeGen/AMDGPU/si-vector-hang.ll
+++ b/test/CodeGen/AMDGPU/si-vector-hang.ll
@@ -12,7 +12,7 @@
 ; CHECK: buffer_store_byte
 ; ModuleID = 'radeon'
 
-define void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 {
+define amdgpu_kernel void @test_8_min_char(i8 addrspace(1)* nocapture %out, i8 addrspace(1)* nocapture readonly %in0, i8 addrspace(1)* nocapture readonly %in1) #0 {
 entry:
   %0 = load i8, i8 addrspace(1)* %in0, align 1
   %1 = insertelement <8 x i8> undef, i8 %0, i32 0
diff --git a/test/CodeGen/AMDGPU/sign_extend.ll b/test/CodeGen/AMDGPU/sign_extend.ll
index 875351c59961e3f9a3fafca9a22c0320dfe9e3ad..3e452c214e983afb4e7e8f4fde5ba82228723ef8 100644
--- a/test/CodeGen/AMDGPU/sign_extend.ll
+++ b/test/CodeGen/AMDGPU/sign_extend.ll
@@ -4,7 +4,7 @@
 ; GCN-LABEL: {{^}}s_sext_i1_to_i32:
 ; GCN: v_cndmask_b32_e64
 ; GCN: s_endpgm
-define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp eq i32 %a, %b
   %sext = sext i1 %cmp to i32
   store i32 %sext, i32 addrspace(1)* %out, align 4
@@ -14,7 +14,7 @@ define void @s_sext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
 ; GCN-LABEL: {{^}}test_s_sext_i32_to_i64:
 ; GCN: s_ashr_i32
 ; GCN: s_endpg
-define void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
+define amdgpu_kernel void @test_s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
 entry:
   %mul = mul i32 %a, %b
   %add = add i32 %mul, %c
@@ -28,7 +28,7 @@ entry:
 ; GCN: v_mov_b32_e32 v[[HIREG:[0-9]+]], v[[LOREG]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LOREG]]:[[HIREG]]{{\]}}
 ; GCN: s_endpgm
-define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp eq i32 %a, %b
   %sext = sext i1 %cmp to i64
   store i64 %sext, i64 addrspace(1)* %out, align 8
@@ -38,7 +38,7 @@ define void @s_sext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
 ; GCN-LABEL: {{^}}s_sext_i32_to_i64:
 ; GCN: s_ashr_i32
 ; GCN: s_endpgm
-define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
   %sext = sext i32 %a to i64
   store i64 %sext, i64 addrspace(1)* %out, align 8
   ret void
@@ -47,7 +47,7 @@ define void @s_sext_i32_to_i64(i64 addrspace(1)* %out, i32 %a) nounwind {
 ; GCN-LABEL: {{^}}v_sext_i32_to_i64:
 ; GCN: v_ashr
 ; GCN: s_endpgm
-define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %val = load i32, i32 addrspace(1)* %in, align 4
   %sext = sext i32 %val to i64
   store i64 %sext, i64 addrspace(1)* %out, align 8
@@ -56,7 +56,7 @@ define void @v_sext_i32_to_i64(i64 addrspace(1)* %out, i32 addrspace(1)* %in) no
 
 ; GCN-LABEL: {{^}}s_sext_i16_to_i64:
 ; GCN: s_bfe_i64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100000
-define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
+define amdgpu_kernel void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
   %sext = sext i16 %a to i64
   store i64 %sext, i64 addrspace(1)* %out, align 8
   ret void
@@ -65,7 +65,7 @@ define void @s_sext_i16_to_i64(i64 addrspace(1)* %out, i16 %a) nounwind {
 ; GCN-LABEL: {{^}}s_sext_i1_to_i16:
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
 ; GCN-NEXT: buffer_store_short [[RESULT]]
-define void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
   %cmp = icmp eq i32 %a, %b
   %sext = sext i1 %cmp to i16
   store i16 %sext, i16 addrspace(1)* %out
@@ -79,7 +79,7 @@ define void @s_sext_i1_to_i16(i16 addrspace(1)* %out, i32 %a, i32 %b) nounwind {
 ; GCN-LABEL: {{^}}s_sext_i1_to_i16_with_and:
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
 ; GCN-NEXT: buffer_store_short [[RESULT]]
-define void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
+define amdgpu_kernel void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
   %cmp0 = icmp eq i32 %a, %b
   %cmp1 = icmp eq i32 %c, %d
   %cmp = and i1 %cmp0, %cmp1
@@ -91,7 +91,7 @@ define void @s_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i
 ; GCN-LABEL: {{^}}v_sext_i1_to_i16_with_and:
 ; GCN: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1
 ; GCN-NEXT: buffer_store_short [[RESULT]]
-define void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
+define amdgpu_kernel void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) nounwind {
   %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1
   %cmp0 = icmp eq i32 %a, %tid
   %cmp1 = icmp eq i32 %b, %c
@@ -130,7 +130,7 @@ define void @v_sext_i1_to_i16_with_and(i16 addrspace(1)* %out, i32 %a, i32 %b, i
 ; GCN-DAG: buffer_store_dword [[VEXT3]]
 
 ; GCN: s_endpgm
-define void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
+define amdgpu_kernel void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
   %cast = bitcast i32 %a to <4 x i8>
   %ext = sext <4 x i8> %cast to <4 x i32>
   %elt0 = extractelement <4 x i32> %ext, i32 0
@@ -162,7 +162,7 @@ define void @s_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 %a) nounwind {
 ; GCN: buffer_store_dword [[EXT1]]
 ; GCN: buffer_store_dword [[EXT2]]
 ; GCN: buffer_store_dword [[EXT3]]
-define void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) nounwind {
   %a = load i32, i32 addrspace(1)* %in
   %cast = bitcast i32 %a to <4 x i8>
   %ext = sext <4 x i8> %cast to <4 x i32>
@@ -184,7 +184,7 @@ define void @v_sext_v4i8_to_v4i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 ; GCN-DAG: s_sext_i32_i16
 ; GCN-DAG: s_sext_i32_i16
 ; GCN: s_endpgm
-define void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
+define amdgpu_kernel void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
   %cast = bitcast i64 %a to <4 x i16>
   %ext = sext <4 x i16> %cast to <4 x i32>
   %elt0 = extractelement <4 x i32> %ext, i32 0
@@ -206,7 +206,7 @@ define void @s_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 %a) nounwind {
 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
 ; GCN-DAG: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
 ; GCN: s_endpgm
-define void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @v_sext_v4i16_to_v4i32(i32 addrspace(1)* %out, i64 addrspace(1)* %in) nounwind {
   %a = load i64, i64 addrspace(1)* %in
   %cast = bitcast i64 %a to <4 x i16>
   %ext = sext <4 x i16> %cast to <4 x i32>
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
index 68dc3c6ccd246824b2a45ccfa9efbc524026a2f3..f98a716b4fd11322cf03b5daf52028021aa4e373 100644
--- a/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
+++ b/test/CodeGen/AMDGPU/sint_to_fp.f64.ll
@@ -4,7 +4,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 
 ; SI-LABEL: {{^}}sint_to_fp_i32_to_f64
 ; SI: v_cvt_f64_i32_e32
-define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
   %result = sitofp i32 %in to double
   store double %result, double addrspace(1)* %out
   ret void
@@ -19,7 +19,7 @@ define void @sint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
 ; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
 ; SI: s_endpgm
-define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0
   %fp = sitofp i1 %cmp to double
   store double %fp, double addrspace(1)* %out, align 4
@@ -31,14 +31,14 @@ define void @sint_to_fp_i1_f64(double addrspace(1)* %out, i32 %in) {
 ; SI-NEXT: v_cvt_f64_i32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
-define void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) {
+define amdgpu_kernel void @sint_to_fp_i1_f64_load(double addrspace(1)* %out, i1 %in) {
   %fp = sitofp i1 %in to double
   store double %fp, double addrspace(1)* %out, align 8
   ret void
 }
 
 ; SI-LABEL: @s_sint_to_fp_i64_to_f64
-define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
   %result = sitofp i64 %in to double
   store double %result, double addrspace(1)* %out
   ret void
@@ -51,7 +51,7 @@ define void @s_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
 ; SI-DAG: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_sint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %val = load i64, i64 addrspace(1)* %gep, align 8
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
index 5df8105116cc67b0d9a0d3c6ca092886a9019cba..04cd199b81ae5a8e2c8337d8922c5e451033e5cd 100644
--- a/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
+++ b/test/CodeGen/AMDGPU/sint_to_fp.i64.ll
@@ -4,7 +4,7 @@
 ; FIXME: This should be merged with sint_to_fp.ll, but s_sint_to_fp_v2i64 crashes on r600
 
 ; FUNC-LABEL: {{^}}s_sint_to_fp_i64_to_f16:
-define void @s_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 {
   %result = sitofp i64 %in to half
   store half %result, half addrspace(1)* %out
   ret void
@@ -28,7 +28,7 @@ define void @s_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 {
 ; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]],
 ; GCN: v_cvt_f16_f32_e32 [[SIGN_SEL_F16:v[0-9]+]], [[SIGN_SEL]]
 ; GCN: {{buffer|flat}}_store_short {{.*}}[[SIGN_SEL_F16]]
-define void @v_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
@@ -39,7 +39,7 @@ define void @v_sint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}s_sint_to_fp_i64_to_f32:
-define void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
   %result = sitofp i64 %in to float
   store float %result, float addrspace(1)* %out
   ret void
@@ -62,7 +62,7 @@ define void @s_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
 ; GCN: v_xor_b32_e32 v{{[0-9]+}}, 0x80000000, v{{[0-9]+}}
 ; GCN: v_cndmask_b32_e{{32|64}} [[SIGN_SEL:v[0-9]+]],
 ; GCN: {{buffer|flat}}_store_dword {{.*}}[[SIGN_SEL]]
-define void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -74,14 +74,14 @@ define void @v_sint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)*
 
 ; FUNC-LABEL: {{^}}s_sint_to_fp_v2i64_to_v2f32:
 ; GCN-NOT: v_and_b32_e32 v{{[0-9]+}}, -1,
-define void @s_sint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{
+define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{
   %result = sitofp <2 x i64> %in to <2 x float>
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_sint_to_fp_v4i64_to_v4f32:
-define void @v_sint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
@@ -93,14 +93,14 @@ define void @v_sint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i6
 
 ; FUNC-LABEL: {{^}}s_sint_to_fp_v2i64_to_v2f16:
 ; GCN-NOT: v_and_b32_e32 v{{[0-9]+}}, -1,
-define void @s_sint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{
+define amdgpu_kernel void @s_sint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{
   %result = sitofp <2 x i64> %in to <2 x half>
   store <2 x half> %result, <2 x half> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_sint_to_fp_v4i64_to_v4f16:
-define void @v_sint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr <4 x half>, <4 x half> addrspace(1)* %out, i32 %tid
diff --git a/test/CodeGen/AMDGPU/sint_to_fp.ll b/test/CodeGen/AMDGPU/sint_to_fp.ll
index 4c8fea12bada1b645d68ff672c2546bac13b72f3..8e85d9998597537d18913fd2c034baef160121e5 100644
--- a/test/CodeGen/AMDGPU/sint_to_fp.ll
+++ b/test/CodeGen/AMDGPU/sint_to_fp.ll
@@ -6,7 +6,7 @@
 ; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{s[0-9]+$}}
 
 ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
   %result = sitofp i32 %in to float
   store float %result, float addrspace(1)* %out
   ret void
@@ -16,7 +16,7 @@ define void @s_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
 ; SI: v_cvt_f32_i32_e32 {{v[0-9]+}}, {{v[0-9]+$}}
 
 ; R600: INT_TO_FLT
-define void @v_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -32,7 +32,7 @@ define void @v_sint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)*
 
 ; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: INT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
-define void @s_sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0{
+define amdgpu_kernel void @s_sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0{
   %result = sitofp <2 x i32> %in to <2 x float>
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
@@ -49,7 +49,7 @@ define void @s_sint_to_fp_v2i32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #
 ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @s_sint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %value = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %result = sitofp <4 x i32> %value to <4 x float>
   store <4 x float> %result, <4 x float> addrspace(1)* %out
@@ -66,7 +66,7 @@ define void @s_sint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i3
 ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: INT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @v_sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
@@ -81,7 +81,7 @@ define void @v_sint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrsp
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @s_sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) #0 {
   %cmp = icmp eq i32 %in, 0
   %fp = uitofp i1 %cmp to float
   store float %fp, float addrspace(1)* %out
@@ -92,7 +92,7 @@ define void @s_sint_to_fp_i1_f32(float addrspace(1)* %out, i32 %in) #0 {
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @s_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) #0 {
+define amdgpu_kernel void @s_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) #0 {
   %fp = sitofp i1 %in to float
   store float %fp, float addrspace(1)* %out
   ret void
@@ -105,7 +105,7 @@ define void @s_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 %in) #0 {
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, -1.0
 ; SI: {{buffer|flat}}_store_dword {{.*}}[[RESULT]]
 ; SI: s_endpgm
-define void @v_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_sint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i1, i1 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
diff --git a/test/CodeGen/AMDGPU/sitofp.f16.ll b/test/CodeGen/AMDGPU/sitofp.f16.ll
index 1395fa2bfea04c87a66d19317ef2363edf7b7de6..574d1c0b2c78ea230c94ed11c98fb2729e2ea9a5 100644
--- a/test/CodeGen/AMDGPU/sitofp.f16.ll
+++ b/test/CodeGen/AMDGPU/sitofp.f16.ll
@@ -7,7 +7,7 @@
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @sitofp_i16_to_f16(
+define amdgpu_kernel void @sitofp_i16_to_f16(
     half addrspace(1)* %r,
     i16 addrspace(1)* %a) {
 entry:
@@ -23,7 +23,7 @@ entry:
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_I16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @sitofp_i32_to_f16(
+define amdgpu_kernel void @sitofp_i32_to_f16(
     half addrspace(1)* %r,
     i32 addrspace(1)* %a) {
 entry:
@@ -37,15 +37,24 @@ entry:
 
 ; GCN-LABEL: {{^}}sitofp_v2i16_to_v2f16
 ; GCN:     buffer_load_dword
-; GCN:     v_cvt_f32_i32_e32
-; GCN:     v_cvt_f32_i32_e32
-; GCN:     v_cvt_f16_f32_e32
-; GCN:     v_cvt_f16_f32_e32
-; GCN-DAG: v_lshlrev_b32_e32
-; GCN-DAG: v_or_b32_e32
-; GCN:     buffer_store_dword
-; GCN:     s_endpgm
-define void @sitofp_v2i16_to_v2f16(
+
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI-DAG: v_lshlrev_b32_e32
+; SI: v_or_b32_e32
+
+; VI-DAG: v_cvt_f32_i32_sdwa
+; VI-DAG: v_cvt_f32_i32_sdwa
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI:     v_or_b32_e32
+
+; GCN: buffer_store_dword
+; GCN: s_endpgm
+
+define amdgpu_kernel void @sitofp_v2i16_to_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x i16> addrspace(1)* %a) {
 entry:
@@ -56,17 +65,24 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}sitofp_v2i32_to_v2f16
-; GCN:     buffer_load_dwordx2
-; GCN:     v_cvt_f32_i32_e32
-; GCN:     v_cvt_f32_i32_e32
-; GCN:     v_cvt_f16_f32_e32
-; GCN:     v_cvt_f16_f32_e32
-; GCN-DAG: v_and_b32_e32
-; GCN-DAG: v_lshlrev_b32_e32
-; GCN-DAG: v_or_b32_e32
-; GCN:     buffer_store_dword
-; GCN:     s_endpgm
-define void @sitofp_v2i32_to_v2f16(
+; GCN:    buffer_load_dwordx2
+
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f32_i32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI-DAG: v_lshlrev_b32_e32
+; SI: v_or_b32_e32
+
+; VI-DAG: v_cvt_f32_i32_e32
+; VI-DAG: v_cvt_f32_i32_e32
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI:     v_or_b32_e32
+
+; GCN: buffer_store_dword
+; GCN: s_endpgm
+define amdgpu_kernel void @sitofp_v2i32_to_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x i32> addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/skip-if-dead.ll b/test/CodeGen/AMDGPU/skip-if-dead.ll
index 60cee7a3499e3b914189c68658a2eacdee603d0b..3f53572ab44033373743f8be476e73566fbb53e1 100644
--- a/test/CodeGen/AMDGPU/skip-if-dead.ll
+++ b/test/CodeGen/AMDGPU/skip-if-dead.ll
@@ -357,7 +357,7 @@ bb7:                                              ; preds = %bb4
 ; CHECK: [[END]]:
 ; CHECK: s_or_b64 exec, exec
 ; CHECK: s_endpgm
-define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, <4 x i32> %arg2) #0 {
+define amdgpu_ps void @if_after_kill_block(float %arg, float %arg1, <4 x float> %arg2) #0 {
 bb:
   %tmp = fcmp ult float %arg1, 0.000000e+00
   br i1 %tmp, label %bb3, label %bb4
@@ -367,7 +367,7 @@ bb3:                                              ; preds = %bb
   br label %bb4
 
 bb4:                                              ; preds = %bb3, %bb
-  %tmp5 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %arg2, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp5 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %arg2, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp6 = extractelement <4 x float> %tmp5, i32 0
   %tmp7 = fcmp une float %tmp6, 0.000000e+00
   br i1 %tmp7, label %bb8, label %bb9
@@ -380,9 +380,8 @@ bb9:                                              ; preds = %bb4
   ret void
 }
 
+declare <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1
 declare void @llvm.AMDGPU.kill(float) #0
-declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) nounwind
 
 attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/smed3.ll b/test/CodeGen/AMDGPU/smed3.ll
index 985c73904f4351afc225db4d973394d988135109..8665ab697265e4f82913d22b075b71add00361b6 100644
--- a/test/CodeGen/AMDGPU/smed3.ll
+++ b/test/CodeGen/AMDGPU/smed3.ll
@@ -1,12 +1,13 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
 
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i32:
 ; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
-define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
   %a = load i32, i32 addrspace(1)* %gep0
@@ -24,8 +25,8 @@ define void @v_test_smed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a
 ; GCN-LABEL: {{^}}v_test_smed3_multi_use_r_i_i_i32:
 ; GCN: v_max_i32
 ; GCN: v_min_i32
-define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
   %a = load i32, i32 addrspace(1)* %gep0
@@ -44,8 +45,8 @@ define void @v_test_smed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrsp
 ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_constant_order_i32:
 ; GCN: v_max_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
 ; GCN: v_min_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
-define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
   %a = load i32, i32 addrspace(1)* %gep0
@@ -63,8 +64,8 @@ define void @v_test_smed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 a
 ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_sign_mismatch_i32:
 ; GCN: v_max_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
 ; GCN: v_min_i32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
-define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
   %a = load i32, i32 addrspace(1)* %gep0
@@ -82,8 +83,8 @@ define void @v_test_smed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 ad
 ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i64:
 ; GCN: v_cmp_lt_i64
 ; GCN: v_cmp_gt_i64
-define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
   %a = load i64, i64 addrspace(1)* %gep0
@@ -99,9 +100,10 @@ define void @v_test_smed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a
 }
 
 ; GCN-LABEL: {{^}}v_test_smed3_r_i_i_i16:
-; GCN: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
-define void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+; SICIVI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+define amdgpu_kernel void @v_test_smed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
   %a = load i16, i16 addrspace(1)* %gep0
@@ -172,7 +174,7 @@ define internal i8 @smax8(i8 %x, i8 %y) #2 {
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -184,7 +186,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_1:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -196,7 +198,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_2:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -208,7 +210,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_3:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -220,7 +222,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_4:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -232,7 +234,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_5:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -244,7 +246,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_6:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -256,7 +258,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_7:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -268,7 +270,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_8:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -280,7 +282,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_9:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -292,7 +294,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_10:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -304,7 +306,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_11:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -316,7 +318,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_12:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -328,7 +330,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_13:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -340,7 +342,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_14:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -352,7 +354,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_15:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %y, i32 %x)
   %tmp1 = call i32 @smax(i32 %y, i32 %x)
@@ -362,12 +364,13 @@ bb:
   ret void
 }
 
+; FIXME: Should either stay scalar or avoid promoting i16 to i32.
 ; GCN-LABEL: {{^}}s_test_smed3_i16_pat_0:
 ; GCN: s_sext_i32_i16
 ; GCN: s_sext_i32_i16
 ; GCN: s_sext_i32_i16
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
 bb:
   %tmp0 = call i16 @smin16(i16 %x, i16 %y)
   %tmp1 = call i16 @smax16(i16 %x, i16 %y)
@@ -382,7 +385,7 @@ bb:
 ; GCN: s_sext_i32_i8
 ; GCN: s_sext_i32_i8
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
 bb:
   %tmp0 = call i8 @smin8(i8 %x, i8 %y)
   %tmp1 = call i8 @smax8(i8 %x, i8 %y)
@@ -394,7 +397,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_0:
 ; GCN-NOT: v_med3_i32
-define void @s_test_smed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -407,7 +410,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_1:
 ; GCN-NOT: v_med3_i32
-define void @s_test_smed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -420,7 +423,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_2:
 ; GCN-NOT: v_med3_i32
-define void @s_test_smed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -433,7 +436,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_smed3_i32_pat_0_multi_use_result:
 ; GCN: v_med3_i32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_smed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_smed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @smin(i32 %x, i32 %y)
   %tmp1 = call i32 @smax(i32 %x, i32 %y)
@@ -444,6 +447,35 @@ bb:
   ret void
 }
 
+; GCN-LABEL: {{^}}v_test_smed3_i16_pat_0:
+; SI: v_med3_i32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; FIXME: VI not matching med3
+; VI: v_min_i16
+; VI: v_max_i16
+; VI: v_min_i16
+; VI: v_max_i16
+
+; GFX9: v_med3_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_smed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid
+  %gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3
+  %gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8
+  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
+  %x = load i16, i16 addrspace(1)* %gep0
+  %y = load i16, i16 addrspace(1)* %gep1
+  %z = load i16, i16 addrspace(1)* %gep2
+
+  %tmp0 = call i16 @smin16(i16 %x, i16 %y)
+  %tmp1 = call i16 @smax16(i16 %x, i16 %y)
+  %tmp2 = call i16 @smin16(i16 %tmp1, i16 %z)
+  %tmp3 = call i16 @smax16(i16 %tmp0, i16 %tmp2)
+  store i16 %tmp3, i16 addrspace(1)* %out.gep
+  ret void
+}
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readnone alwaysinline }
diff --git a/test/CodeGen/AMDGPU/sminmax.ll b/test/CodeGen/AMDGPU/sminmax.ll
index ce5d92451647258159262e81593769d6c7fb4b64..827d672022eba389e799b9b42794b1558398d0f4 100644
--- a/test/CodeGen/AMDGPU/sminmax.ll
+++ b/test/CodeGen/AMDGPU/sminmax.ll
@@ -7,7 +7,7 @@
 ; GCN: s_add_i32
 
 ; EG: MAX_INT
-define void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind {
+define amdgpu_kernel void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind {
   %neg = sub i32 0, %val
   %cond = icmp sgt i32 %val, %neg
   %res = select i1 %cond, i32 %val, i32 %neg
@@ -22,7 +22,7 @@ define void @s_abs_i32(i32 addrspace(1)* %out, i32 %val) nounwind {
 ; GCN: v_add_i32
 
 ; EG: MAX_INT
-define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
+define amdgpu_kernel void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
   %val = load i32, i32 addrspace(1)* %src, align 4
   %neg = sub i32 0, %val
   %cond = icmp sgt i32 %val, %neg
@@ -36,7 +36,7 @@ define void @v_abs_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind
 ; GCN: v_sub_i32_e32 [[NEG:v[0-9]+]], vcc, 0, [[SRC:v[0-9]+]]
 ; GCN: v_max_i32_e32 [[MAX:v[0-9]+]], [[NEG]], [[SRC]]
 ; GCN: v_mul_lo_i32 v{{[0-9]+}}, [[MAX]], [[MAX]]
-define void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
+define amdgpu_kernel void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %src) nounwind {
   %val = load i32, i32 addrspace(1)* %src, align 4
   %neg = sub i32 0, %val
   %cond = icmp sgt i32 %val, %neg
@@ -54,7 +54,7 @@ define void @v_abs_i32_repeat_user(i32 addrspace(1)* %out, i32 addrspace(1)* %sr
 
 ; EG: MAX_INT
 ; EG: MAX_INT
-define void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind {
+define amdgpu_kernel void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind {
   %z0 = insertelement <2 x i32> undef, i32 0, i32 0
   %z1 = insertelement <2 x i32> %z0, i32 0, i32 1
   %t0 = insertelement <2 x i32> undef, i32 2, i32 0
@@ -79,7 +79,7 @@ define void @s_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> %val) nounwind
 
 ; EG: MAX_INT
 ; EG: MAX_INT
-define void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind {
+define amdgpu_kernel void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %src) nounwind {
   %z0 = insertelement <2 x i32> undef, i32 0, i32 0
   %z1 = insertelement <2 x i32> %z0, i32 0, i32 1
   %t0 = insertelement <2 x i32> undef, i32 2, i32 0
@@ -109,7 +109,7 @@ define void @v_abs_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %
 ; EG: MAX_INT
 ; EG: MAX_INT
 ; EG: MAX_INT
-define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind {
+define amdgpu_kernel void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind {
   %z0 = insertelement <4 x i32> undef, i32 0, i32 0
   %z1 = insertelement <4 x i32> %z0, i32 0, i32 1
   %z2 = insertelement <4 x i32> %z1, i32 0, i32 2
@@ -146,7 +146,7 @@ define void @s_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %val) nounwind
 ; EG: MAX_INT
 ; EG: MAX_INT
 ; EG: MAX_INT
-define void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %src) nounwind {
+define amdgpu_kernel void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %src) nounwind {
   %z0 = insertelement <4 x i32> undef, i32 0, i32 0
   %z1 = insertelement <4 x i32> %z0, i32 0, i32 1
   %z2 = insertelement <4 x i32> %z1, i32 0, i32 2
@@ -170,7 +170,7 @@ define void @v_abs_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %
 
 ; GCN-DAG: s_min_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]]
 ; GCN-DAG: s_max_i32 s{{[0-9]+}}, [[VAL0]], [[VAL1]]
-define void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %val0, i32 %val1) nounwind {
+define amdgpu_kernel void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %val0, i32 %val1) nounwind {
   %cond0 = icmp sgt i32 %val0, %val1
   %sel0 = select i1 %cond0, i32 %val0, i32 %val1
   %sel1 = select i1 %cond0, i32 %val1, i32 %val0
@@ -186,7 +186,7 @@ define void @s_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32
 
 ; GCN-DAG: v_min_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
 ; GCN-DAG: v_max_i32_e32 v{{[0-9]+}}, [[VAL1]], [[VAL0]]
-define void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
+define amdgpu_kernel void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
   %val0 = load volatile i32, i32 addrspace(1)* %ptr0
   %val1 = load volatile i32, i32 addrspace(1)* %ptr1
 
@@ -208,7 +208,7 @@ define void @v_min_max_i32(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32
 ; GCN-DAG: s_max_i32
 ; GCN-DAG: s_max_i32
 ; GCN-DAG: s_max_i32
-define void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, <4 x i32> %val0, <4 x i32> %val1) nounwind {
+define amdgpu_kernel void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(1)* %out1, <4 x i32> %val0, <4 x i32> %val1) nounwind {
   %cond0 = icmp sgt <4 x i32> %val0, %val1
   %sel0 = select <4 x i1> %cond0, <4 x i32> %val0, <4 x i32> %val1
   %sel1 = select <4 x i1> %cond0, <4 x i32> %val1, <4 x i32> %val0
@@ -223,7 +223,7 @@ define void @s_min_max_v4i32(<4 x i32> addrspace(1)* %out0, <4 x i32> addrspace(
 ; GCN-DAG: v_cndmask_b32_e32
 ; GCN-DAG: v_cndmask_b32_e32
 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, vcc
-define void @v_min_max_i32_user(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
+define amdgpu_kernel void @v_min_max_i32_user(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 addrspace(1)* %ptr0, i32 addrspace(1)* %ptr1) nounwind {
   %val0 = load volatile i32, i32 addrspace(1)* %ptr0
   %val1 = load volatile i32, i32 addrspace(1)* %ptr1
 
diff --git a/test/CodeGen/AMDGPU/sminmax.v2i16.ll b/test/CodeGen/AMDGPU/sminmax.v2i16.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4e093cdece212b41a23ba44d38e673a701dd1ca9
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sminmax.v2i16.ll
@@ -0,0 +1,224 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
+
+; GCN-LABEL: {{^}}s_abs_v2i16:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
+; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+
+; VI: v_sub_i32_e32
+; VI-DAG: v_sub_i32_e32
+; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_max_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; VI: v_add_i32_e32
+; VI: v_add_i32_e32
+; VI: v_or_b32_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+
+; CI: v_sub_i32_e32
+; CI-DAG: v_sub_i32_e32
+; CI: v_bfe_i32
+; CI-DAG: v_bfe_i32
+; CI-DAG: v_add_i32_e32
+; CI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16
+; CI: v_add_i32_e32
+; CI: v_and_b32_e32 v{{[0-9]+}}, 0xffff,
+; CI: v_or_b32_e32
+define amdgpu_kernel void @s_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 {
+  %neg = sub <2 x i16> zeroinitializer, %val
+  %cond = icmp sgt <2 x i16> %val, %neg
+  %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg
+  %res2 = add <2 x i16> %res, <i16 2, i16 2>
+  store <2 x i16> %res2, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_abs_v2i16:
+; GFX9: flat_load_dword [[VAL:v[0-9]+]]
+; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
+; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+
+; VI: v_lshrrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; VI: v_sub_u16_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
+; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_max_i16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}
+; VI: v_add_u16_e32 v{{[0-9]+}}, 2, v{{[0-9]+}}
+; VI-NOT: v_and_b32
+; VI: v_or_b32_e32
+define amdgpu_kernel void @v_abs_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.in = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %src, i32 %tid
+  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %val = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in, align 4
+  %neg = sub <2 x i16> zeroinitializer, %val
+  %cond = icmp sgt <2 x i16> %val, %neg
+  %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg
+  %res2 = add <2 x i16> %res, <i16 2, i16 2>
+  store <2 x i16> %res2, <2 x i16> addrspace(1)* %gep.out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_abs_v2i16_2:
+; GFX9: s_load_dword [[VAL:s[0-9]+]]
+; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
+; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+define amdgpu_kernel void @s_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> %val) #0 {
+  %z0 = insertelement <2 x i16> undef, i16 0, i16 0
+  %z1 = insertelement <2 x i16> %z0, i16 0, i16 1
+  %t0 = insertelement <2 x i16> undef, i16 2, i16 0
+  %t1 = insertelement <2 x i16> %t0, i16 2, i16 1
+  %neg = sub <2 x i16> %z1, %val
+  %cond = icmp sgt <2 x i16> %val, %neg
+  %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg
+  %res2 = add <2 x i16> %res, %t1
+  store <2 x i16> %res2, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_abs_v2i16_2:
+; GFX9: buffer_load_dword [[VAL:v[0-9]+]]
+; GFX9: v_pk_sub_i16 [[SUB:v[0-9]+]], 0, [[VAL]]
+; GFX9: v_pk_max_i16 [[MAX:v[0-9]+]], [[VAL]], [[SUB]]
+; GFX9: v_pk_add_u16 [[ADD:v[0-9]+]], [[MAX]], 2
+define amdgpu_kernel void @v_abs_v2i16_2(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %src) #0 {
+  %z0 = insertelement <2 x i16> undef, i16 0, i16 0
+  %z1 = insertelement <2 x i16> %z0, i16 0, i16 1
+  %t0 = insertelement <2 x i16> undef, i16 2, i16 0
+  %t1 = insertelement <2 x i16> %t0, i16 2, i16 1
+  %val = load <2 x i16>, <2 x i16> addrspace(1)* %src, align 4
+  %neg = sub <2 x i16> %z1, %val
+  %cond = icmp sgt <2 x i16> %val, %neg
+  %res = select <2 x i1> %cond, <2 x i16> %val, <2 x i16> %neg
+  %res2 = add <2 x i16> %res, %t1
+  store <2 x i16> %res2, <2 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_abs_v4i16:
+; GFX9: s_load_dword [[VAL0:s[0-9]+]]
+; GFX9: s_load_dword [[VAL1:s[0-9]+]]
+; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, [[VAL0]]
+; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], [[VAL0]], [[SUB0]]
+; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2
+
+; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, [[VAL1]]
+; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], [[VAL1]], [[SUB1]]
+; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2
+define amdgpu_kernel void @s_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> %val) #0 {
+  %z0 = insertelement <4 x i16> undef, i16 0, i16 0
+  %z1 = insertelement <4 x i16> %z0, i16 0, i16 1
+  %z2 = insertelement <4 x i16> %z1, i16 0, i16 2
+  %z3 = insertelement <4 x i16> %z2, i16 0, i16 3
+  %t0 = insertelement <4 x i16> undef, i16 2, i16 0
+  %t1 = insertelement <4 x i16> %t0, i16 2, i16 1
+  %t2 = insertelement <4 x i16> %t1, i16 2, i16 2
+  %t3 = insertelement <4 x i16> %t2, i16 2, i16 3
+  %neg = sub <4 x i16> %z3, %val
+  %cond = icmp sgt <4 x i16> %val, %neg
+  %res = select <4 x i1> %cond, <4 x i16> %val, <4 x i16> %neg
+  %res2 = add <4 x i16> %res, %t3
+  store <4 x i16> %res2, <4 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_abs_v4i16:
+; GFX9: buffer_load_dwordx2 v{{\[}}[[VAL0:[0-9]+]]:[[VAL1:[0-9]+]]{{\]}}
+
+; GFX9-DAG: v_pk_sub_i16 [[SUB0:v[0-9]+]], 0, v[[VAL0]]
+; GFX9-DAG: v_pk_max_i16 [[MAX0:v[0-9]+]], v[[VAL0]], [[SUB0]]
+; GFX9-DAG: v_pk_add_u16 [[ADD0:v[0-9]+]], [[MAX0]], 2
+
+; GFX9-DAG: v_pk_sub_i16 [[SUB1:v[0-9]+]], 0, v[[VAL1]]
+; GFX9-DAG: v_pk_max_i16 [[MAX1:v[0-9]+]], v[[VAL1]], [[SUB1]]
+; GFX9-DAG: v_pk_add_u16 [[ADD1:v[0-9]+]], [[MAX1]], 2
+define amdgpu_kernel void @v_abs_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %src) #0 {
+  %z0 = insertelement <4 x i16> undef, i16 0, i16 0
+  %z1 = insertelement <4 x i16> %z0, i16 0, i16 1
+  %z2 = insertelement <4 x i16> %z1, i16 0, i16 2
+  %z3 = insertelement <4 x i16> %z2, i16 0, i16 3
+  %t0 = insertelement <4 x i16> undef, i16 2, i16 0
+  %t1 = insertelement <4 x i16> %t0, i16 2, i16 1
+  %t2 = insertelement <4 x i16> %t1, i16 2, i16 2
+  %t3 = insertelement <4 x i16> %t2, i16 2, i16 3
+  %val = load <4 x i16>, <4 x i16> addrspace(1)* %src, align 4
+  %neg = sub <4 x i16> %z3, %val
+  %cond = icmp sgt <4 x i16> %val, %neg
+  %res = select <4 x i1> %cond, <4 x i16> %val, <4 x i16> %neg
+  %res2 = add <4 x i16> %res, %t3
+  store <4 x i16> %res2, <4 x i16> addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_min_max_v2i16:
+define amdgpu_kernel void @s_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) #0 {
+  %cond0 = icmp sgt <2 x i16> %val0, %val1
+  %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
+  %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
+
+  store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4
+  store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_min_max_v2i16:
+define amdgpu_kernel void @v_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 {
+  %val0 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr0
+  %val1 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr1
+
+  %cond0 = icmp sgt <2 x i16> %val0, %val1
+  %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
+  %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
+
+  store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4
+  store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_min_max_v4i16:
+define amdgpu_kernel void @s_min_max_v4i16(<4 x i16> addrspace(1)* %out0, <4 x i16> addrspace(1)* %out1, <4 x i16> %val0, <4 x i16> %val1) #0 {
+  %cond0 = icmp sgt <4 x i16> %val0, %val1
+  %sel0 = select <4 x i1> %cond0, <4 x i16> %val0, <4 x i16> %val1 ; per-lane signed max
+  %sel1 = select <4 x i1> %cond0, <4 x i16> %val1, <4 x i16> %val0 ; per-lane signed min
+
+  store volatile <4 x i16> %sel0, <4 x i16> addrspace(1)* %out0, align 4
+  store volatile <4 x i16> %sel1, <4 x i16> addrspace(1)* %out1, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_min_max_v2i16_user:
+define amdgpu_kernel void @v_min_max_v2i16_user(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> addrspace(1)* %ptr0, <2 x i16> addrspace(1)* %ptr1) #0 {
+  %val0 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr0
+  %val1 = load volatile <2 x i16>, <2 x i16> addrspace(1)* %ptr1
+
+  %cond0 = icmp sgt <2 x i16> %val0, %val1
+  %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
+  %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
+
+  store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4
+  store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4
+  store volatile <2 x i1> %cond0, <2 x i1> addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}u_min_max_v2i16:
+; GFX9: v_pk_max_u16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
+; GFX9: v_pk_min_u16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @u_min_max_v2i16(<2 x i16> addrspace(1)* %out0, <2 x i16> addrspace(1)* %out1, <2 x i16> %val0, <2 x i16> %val1) nounwind {
+  %cond0 = icmp ugt <2 x i16> %val0, %val1
+  %sel0 = select <2 x i1> %cond0, <2 x i16> %val0, <2 x i16> %val1
+  %sel1 = select <2 x i1> %cond0, <2 x i16> %val1, <2 x i16> %val0
+
+  store volatile <2 x i16> %sel0, <2 x i16> addrspace(1)* %out0, align 4
+  store volatile <2 x i16> %sel1, <2 x i16> addrspace(1)* %out1, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/smrd-vccz-bug.ll b/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
index daac5b92b1ef11b68f9855fb8569badb8caef4b7..343211b0219cc8f235b679935d4cd38862b5ab07 100644
--- a/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
+++ b/test/CodeGen/AMDGPU/smrd-vccz-bug.ll
@@ -12,7 +12,7 @@
 ; GCN: buffer_store_dword
 ; GCN: [[EXIT]]:
 ; GCN: s_endpgm
-define void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) {
+define amdgpu_kernel void @vccz_workaround(i32 addrspace(2)* %in, i32 addrspace(1)* %out, float %cond) {
 entry:
   %cnd = fcmp oeq float 0.0, %cond
   %sgpr = load volatile i32, i32 addrspace(2)* %in
@@ -32,7 +32,7 @@ endif:
 ; GCN: buffer_store_dword
 ; GCN: [[EXIT]]:
 ; GCN: s_endpgm
-define void @vccz_noworkaround(float addrspace(1)* %in, float addrspace(1)* %out) {
+define amdgpu_kernel void @vccz_noworkaround(float addrspace(1)* %in, float addrspace(1)* %out) {
 entry:
   %vgpr = load volatile float, float addrspace(1)* %in
   %cnd = fcmp oeq float 0.0, %vgpr
diff --git a/test/CodeGen/AMDGPU/smrd.ll b/test/CodeGen/AMDGPU/smrd.ll
index 9b118425f9cb4bf8a666b798d2b4828830199b54..50f72c67059824eab24bd8a30e6cec8a48f4dc35 100644
--- a/test/CodeGen/AMDGPU/smrd.ll
+++ b/test/CodeGen/AMDGPU/smrd.ll
@@ -1,16 +1,16 @@
-; RUN: llc < %s -march=amdgcn -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=SI --check-prefix=GCN --check-prefix=SIVI %s
-; RUN: llc < %s -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=CI --check-prefix=GCN  %s
-; RUN: llc < %s -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs | FileCheck --check-prefix=VI --check-prefix=GCN --check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN -check-prefix=SIVI %s
+; RUN: llc -march=amdgcn -mcpu=bonaire -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN  %s
+; RUN: llc -march=amdgcn -mcpu=tonga -show-mc-encoding -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN -check-prefix=SIVI %s
 
 ; SMRD load with an immediate offset.
 ; GCN-LABEL: {{^}}smrd0:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x1 ; encoding: [0x01
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4
-define void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd0(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
-  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
-  %1 = load i32, i32 addrspace(2)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 1
+  %tmp1 = load i32, i32 addrspace(2)* %tmp
+  store i32 %tmp1, i32 addrspace(1)* %out
   ret void
 }
 
@@ -18,11 +18,11 @@ entry:
 ; GCN-LABEL: {{^}}smrd1:
 ; SICI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff,0x{{[0-9]+[137]}}
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd1(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
-  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
-  %1 = load i32, i32 addrspace(2)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 255
+  %tmp1 = load i32, i32 addrspace(2)* %tmp
+  store i32 %tmp1, i32 addrspace(1)* %out
   ret void
 }
 
@@ -33,11 +33,11 @@ entry:
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
 ; GCN: s_endpgm
-define void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd2(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
-  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
-  %1 = load i32, i32 addrspace(2)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 256
+  %tmp1 = load i32, i32 addrspace(2)* %tmp
+  store i32 %tmp1, i32 addrspace(1)* %out
   ret void
 }
 
@@ -48,11 +48,11 @@ entry:
 ; SI: s_load_dwordx2 s[{{[0-9]:[0-9]}}], s[{{[0-9]:[0-9]}}], 0xb ; encoding: [0x0b
 ; TODO: Add VI checks
 ; GCN: s_endpgm
-define void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd3(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
-  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296 ; 2 ^ 32
-  %1 = load i32, i32 addrspace(2)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 4294967296
+  %tmp1 = load i32, i32 addrspace(2)* %tmp
+  store i32 %tmp1, i32 addrspace(1)* %out
   ret void
 }
 
@@ -62,11 +62,11 @@ entry:
 ; SI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 ; VI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd4(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
-  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
-  %1 = load i32, i32 addrspace(2)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262143
+  %tmp1 = load i32, i32 addrspace(2)* %tmp
+  store i32 %tmp1, i32 addrspace(1)* %out
   ret void
 }
 
@@ -76,11 +76,11 @@ entry:
 ; SIVI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
-define void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) {
+define amdgpu_kernel void @smrd5(i32 addrspace(1)* %out, i32 addrspace(2)* %ptr) #0 {
 entry:
-  %0 = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144
-  %1 = load i32, i32 addrspace(2)* %0
-  store i32 %1, i32 addrspace(1)* %out
+  %tmp = getelementptr i32, i32 addrspace(2)* %ptr, i64 262144
+  %tmp1 = load i32, i32 addrspace(2)* %tmp
+  store i32 %tmp1, i32 addrspace(1)* %out
   ret void
 }
 
@@ -88,12 +88,12 @@ entry:
 ; GCN-LABEL: {{^}}smrd_load_const0:
 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x4 ; encoding: [0x04
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x10
-define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const0(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
-  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
-  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 16)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
+  %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp
+  %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 16)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
   ret void
 }
 
@@ -102,14 +102,15 @@ main_body:
 ; GCN-LABEL: {{^}}smrd_load_const1:
 ; SICI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xff ; encoding: [0xff
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3fc
-define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const1(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
-  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
-  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1020)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
+  %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp
+  %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1020)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
   ret void
 }
+
 ; SMRD load using the load.const intrinsic with an offset greater than the
 ; largets possible immediate.
 ; immediate offset.
@@ -118,12 +119,12 @@ main_body:
 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], s[[OFFSET]] ; encoding: [0x0[[OFFSET]]
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x100
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x400
-define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const2(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
-  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
-  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1024)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
+  %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp
+  %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1024)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
   ret void
 }
 
@@ -133,12 +134,12 @@ main_body:
 ; SI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x3ffff
 ; VI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0xffffc
-define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const3(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
-  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
-  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048572)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
+  %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp
+  %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1048572)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
   ret void
 }
 
@@ -148,18 +149,17 @@ main_body:
 ; SIVI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], [[OFFSET]]
 ; CI: s_buffer_load_dword s{{[0-9]}}, s[{{[0-9]:[0-9]}}], 0x40000
 ; GCN: s_endpgm
-define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg, <16 x i8> addrspace(2)* inreg, <32 x i8> addrspace(2)* inreg, i32 inreg, <2 x i32>, <2 x i32>, <2 x i32>, <3 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, float, float, float, float, float, float, float, float, float) {
+define amdgpu_ps void @smrd_load_const4(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <2 x i32> %arg6, <3 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <2 x i32> %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, float %arg19) #0 {
 main_body:
-  %20 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %0, i32 0
-  %21 = load <16 x i8>, <16 x i8> addrspace(2)* %20
-  %22 = call float @llvm.SI.load.const(<16 x i8> %21, i32 1048576)
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 0, float %22, float %22, float %22, float %22)
+  %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg, i32 0
+  %tmp20 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp
+  %tmp21 = call float @llvm.SI.load.const(<16 x i8> %tmp20, i32 1048576)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp21, float %tmp21, float %tmp21, float %tmp21, i1 true, i1 true) #0
   ret void
 }
 
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #0
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
 
-attributes #0 = { nounwind readnone }
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/sopk-compares.ll b/test/CodeGen/AMDGPU/sopk-compares.ll
index 74acc5bc961c13edae42f33874e5a101e93de654..c0f773ca70c254e9291c1de9d885561a3ba144e1 100644
--- a/test/CodeGen/AMDGPU/sopk-compares.ll
+++ b/test/CodeGen/AMDGPU/sopk-compares.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.groupstaticsize() #1
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_inline_imm:
 ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 4{{$}}
-define void @br_scc_eq_i32_inline_imm(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_inline_imm(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 4
   br i1 %cmp0, label %endif, label %if
@@ -25,7 +25,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_max:
 ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x7fff{{$}}
-define void @br_scc_eq_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 32767
   br i1 %cmp0, label %endif, label %if
@@ -41,7 +41,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_max_p1:
 ; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0x8000{{$}}
-define void @br_scc_eq_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 32768
   br i1 %cmp0, label %endif, label %if
@@ -57,7 +57,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_ne_i32_simm16_max_p1:
 ; GCN: s_cmpk_lg_u32 s{{[0-9]+}}, 0x8000{{$}}
-define void @br_scc_ne_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ne_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp ne i32 %cond, 32768
   br i1 %cmp0, label %endif, label %if
@@ -73,7 +73,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_min:
 ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x8000{{$}}
-define void @br_scc_eq_i32_simm16_min(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_simm16_min(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, -32768
   br i1 %cmp0, label %endif, label %if
@@ -89,7 +89,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_simm16_min_m1:
 ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0xffff7fff{{$}}
-define void @br_scc_eq_i32_simm16_min_m1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_simm16_min_m1(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, -32769
   br i1 %cmp0, label %endif, label %if
@@ -105,7 +105,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_uimm15_max:
 ; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0xffff{{$}}
-define void @br_scc_eq_i32_uimm15_max(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_uimm15_max(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 65535
   br i1 %cmp0, label %endif, label %if
@@ -121,7 +121,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_uimm16_max:
 ; GCN: s_cmpk_eq_u32 s{{[0-9]+}}, 0xffff{{$}}
-define void @br_scc_eq_i32_uimm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_uimm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 65535
   br i1 %cmp0, label %endif, label %if
@@ -137,7 +137,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32_uimm16_max_p1:
 ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0x10000{{$}}
-define void @br_scc_eq_i32_uimm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32_uimm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 65536
   br i1 %cmp0, label %endif, label %if
@@ -154,7 +154,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_eq_i32:
 ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x41{{$}}
-define void @br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i32 %cond, 65
   br i1 %cmp0, label %endif, label %if
@@ -170,7 +170,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_ne_i32:
 ; GCN: s_cmpk_lg_i32 s{{[0-9]+}}, 0x41{{$}}
-define void @br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp ne i32 %cond, 65
   br i1 %cmp0, label %endif, label %if
@@ -186,7 +186,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_sgt_i32:
 ; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x41{{$}}
-define void @br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp sgt i32 %cond, 65
   br i1 %cmp0, label %endif, label %if
@@ -202,7 +202,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_sgt_i32_simm16_max:
 ; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x7fff{{$}}
-define void @br_scc_sgt_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sgt_i32_simm16_max(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp sgt i32 %cond, 32767
   br i1 %cmp0, label %endif, label %if
@@ -218,7 +218,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_sgt_i32_simm16_max_p1:
 ; GCN: s_cmp_gt_i32 s{{[0-9]+}}, 0x8000{{$}}
-define void @br_scc_sgt_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sgt_i32_simm16_max_p1(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp sgt i32 %cond, 32768
   br i1 %cmp0, label %endif, label %if
@@ -234,7 +234,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_sge_i32:
 ; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp sge i32 %cond, %size
@@ -251,7 +251,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_slt_i32:
 ; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x41{{$}}
-define void @br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp slt i32 %cond, 65
   br i1 %cmp0, label %endif, label %if
@@ -267,7 +267,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_sle_i32:
 ; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp sle i32 %cond, %size
@@ -284,7 +284,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_ugt_i32:
 ; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp ugt i32 %cond, %size
@@ -301,7 +301,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_uge_i32:
 ; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp uge i32 %cond, %size
@@ -318,7 +318,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_ult_i32:
 ; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x41{{$}}
-define void @br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp ult i32 %cond, 65
   br i1 %cmp0, label %endif, label %if
@@ -334,7 +334,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_ult_i32_min_simm16:
 ; GCN: s_cmp_lt_u32 s2, 0xffff8000
-define void @br_scc_ult_i32_min_simm16(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ult_i32_min_simm16(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp ult i32 %cond, -32768
   br i1 %cmp0, label %endif, label %if
@@ -350,7 +350,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_ult_i32_min_simm16_m1:
 ; GCN: s_cmp_lt_u32 s{{[0-9]+}}, 0xffff7fff{{$}}
-define void @br_scc_ult_i32_min_simm16_m1(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ult_i32_min_simm16_m1(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp ult i32 %cond, -32769
   br i1 %cmp0, label %endif, label %if
@@ -366,7 +366,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_ule_i32:
 ; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp ule i32 %cond, %size
@@ -383,7 +383,7 @@ endif:
 
 ; GCN-LABEL: {{^}}commute_br_scc_eq_i32:
 ; GCN: s_cmpk_eq_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_eq_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp eq i32 %size, %cond
@@ -400,7 +400,7 @@ endif:
 
 ; GCN-LABEL: {{^}}commute_br_scc_ne_i32:
 ; GCN: s_cmpk_lg_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_ne_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp ne i32 %size, %cond
@@ -417,7 +417,7 @@ endif:
 
 ; GCN-LABEL: {{^}}commute_br_scc_sgt_i32:
 ; GCN: s_cmpk_lt_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_sgt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp sgt i32 %size, %cond
@@ -434,7 +434,7 @@ endif:
 
 ; GCN-LABEL: {{^}}commute_br_scc_sge_i32:
 ; GCN: s_cmpk_le_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_sge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp sge i32 %size, %cond
@@ -451,7 +451,7 @@ endif:
 
 ; GCN-LABEL: {{^}}commute_br_scc_slt_i32:
 ; GCN: s_cmpk_gt_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_slt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp slt i32 %size, %cond
@@ -468,7 +468,7 @@ endif:
 
 ; GCN-LABEL: {{^}}commute_br_scc_sle_i32:
 ; GCN: s_cmpk_ge_i32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_sle_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp sle i32 %size, %cond
@@ -485,7 +485,7 @@ endif:
 
 ; GCN-LABEL: {{^}}commute_br_scc_ugt_i32:
 ; GCN: s_cmpk_lt_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_ugt_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp ugt i32 %size, %cond
@@ -502,7 +502,7 @@ endif:
 
 ; GCN-LABEL: {{^}}commute_br_scc_uge_i32:
 ; GCN: s_cmpk_le_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_uge_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp uge i32 %size, %cond
@@ -519,7 +519,7 @@ endif:
 
 ; GCN-LABEL: {{^}}commute_br_scc_ult_i32:
 ; GCN: s_cmpk_gt_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_ult_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp ult i32 %size, %cond
@@ -536,7 +536,7 @@ endif:
 
 ; GCN-LABEL: {{^}}commute_br_scc_ule_i32:
 ; GCN: s_cmpk_ge_u32 s{{[0-9]+}}, 0x800{{$}}
-define void @commute_br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @commute_br_scc_ule_i32(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %cmp0 = icmp ule i32 %size, %cond
@@ -553,7 +553,7 @@ endif:
 
 ; GCN-LABEL: {{^}}br_scc_ult_i32_non_u16:
 ; GCN: s_cmp_lt_u32 s2, 0xfffff7ff
-define void @br_scc_ult_i32_non_u16(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ult_i32_non_u16(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %not.size = xor i32 %size, -1
@@ -573,7 +573,7 @@ endif:
 ; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, 4
 
 ; SI: v_cmp_eq_u64_e64
-define void @br_scc_eq_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i64 %cond, 4
   br i1 %cmp0, label %endif, label %if
@@ -593,7 +593,7 @@ endif:
 ; VI: s_cmp_eq_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
 
 ; SI: v_cmp_eq_u64_e32
-define void @br_scc_eq_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_eq_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp eq i64 %cond, 1234
   br i1 %cmp0, label %endif, label %if
@@ -611,7 +611,7 @@ endif:
 ; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, 4
 
 ; SI: v_cmp_ne_u64_e64
-define void @br_scc_ne_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ne_i64_inline_imm(i64 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp ne i64 %cond, 4
   br i1 %cmp0, label %endif, label %if
@@ -631,7 +631,7 @@ endif:
 ; VI: s_cmp_lg_u64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[}}[[K_LO]]:[[K_HI]]{{\]}}
 
 ; SI: v_cmp_ne_u64_e32
-define void @br_scc_ne_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @br_scc_ne_i64_simm16(i64 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %cmp0 = icmp ne i64 %cond, 1234
   br i1 %cmp0, label %endif, label %if
diff --git a/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll b/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
index ff9429843b22dfa7ae3f87755deb7983cff7577b..63ea21b05339c6470d6e729b65e5f2b3bcfe4c6a 100644
--- a/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
+++ b/test/CodeGen/AMDGPU/spill-alloc-sgpr-init-bug.ll
@@ -4,7 +4,7 @@
 ; allocate scratch registers correctly. Check that this test compiles without
 ; error.
 ; TONGA-LABEL: test
-define void @test(<256 x i32> addrspace(1)* %out, <256 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test(<256 x i32> addrspace(1)* %out, <256 x i32> addrspace(1)* %in) {
 entry:
   %mbcnt.lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %mbcnt.lo)
diff --git a/test/CodeGen/AMDGPU/spill-cfg-position.ll b/test/CodeGen/AMDGPU/spill-cfg-position.ll
index 686c83116fdc7e43254eb8e9a4052b6ef55bf72f..1ca0919258a8ef880c601702ad2c1a7b11de7f45 100644
--- a/test/CodeGen/AMDGPU/spill-cfg-position.ll
+++ b/test/CodeGen/AMDGPU/spill-cfg-position.ll
@@ -13,7 +13,7 @@
 ; CHECK-NEXT: s_or_b64 exec
 ; CHECK: buffer_
 
-define void @spill_cfg_position(i32 addrspace(1)* nocapture %arg) {
+define amdgpu_kernel void @spill_cfg_position(i32 addrspace(1)* nocapture %arg) {
 bb:
   %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tmp14 = load i32, i32 addrspace(1)* %arg, align 4
diff --git a/test/CodeGen/AMDGPU/spill-m0.ll b/test/CodeGen/AMDGPU/spill-m0.ll
index 8c16b9d1649c947c23c19d3726b55c32c4fca161..0e715c453209e7d5edc9c7ed7143f1110e4c4dd4 100644
--- a/test/CodeGen/AMDGPU/spill-m0.ll
+++ b/test/CodeGen/AMDGPU/spill-m0.ll
@@ -17,11 +17,11 @@
 
 ; TOVMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
 ; TOVMEM-DAG: v_mov_b32_e32 [[SPILL_VREG:v[0-9]+]], [[M0_COPY]]
-; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Spill
+; TOVMEM: buffer_store_dword [[SPILL_VREG]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; 4-byte Folded Spill
 ; TOVMEM: s_waitcnt vmcnt(0)
 
 ; TOSMEM-DAG: s_mov_b32 [[M0_COPY:s[0-9]+]], m0
-; TOSMEM: s_mov_b32 m0, s3{{$}}
+; TOSMEM: s_add_u32 m0, s3, 0x100{{$}}
 ; TOSMEM-NOT: [[M0_COPY]]
 ; TOSMEM: s_buffer_store_dword [[M0_COPY]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Spill
 ; TOSMEM: s_waitcnt lgkmcnt(0)
@@ -32,18 +32,18 @@
 ; TOVGPR: v_readlane_b32 [[M0_RESTORE:s[0-9]+]], [[SPILL_VREG]], 0
 ; TOVGPR: s_mov_b32 m0, [[M0_RESTORE]]
 
-; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} ; 4-byte Folded Reload
+; TOVMEM: buffer_load_dword [[RELOAD_VREG:v[0-9]+]], off, s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}} offset:4 ; 4-byte Folded Reload
 ; TOVMEM: s_waitcnt vmcnt(0)
 ; TOVMEM: v_readfirstlane_b32 [[M0_RESTORE:s[0-9]+]], [[RELOAD_VREG]]
 ; TOVMEM: s_mov_b32 m0, [[M0_RESTORE]]
 
-; TOSMEM: s_mov_b32 m0, s3{{$}}
+; TOSMEM: s_add_u32 m0, s3, 0x100{{$}}
 ; TOSMEM: s_buffer_load_dword [[M0_RESTORE:s[0-9]+]], s{{\[}}[[LO]]:[[HI]]], m0 ; 4-byte Folded Reload
 ; TOSMEM-NOT: [[M0_RESTORE]]
 ; TOSMEM: s_mov_b32 m0, [[M0_RESTORE]]
 
 ; GCN: s_add_i32 s{{[0-9]+}}, m0, 1
-define void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @spill_m0(i32 %cond, i32 addrspace(1)* %out) #0 {
 entry:
   %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
   %cmp0 = icmp eq i32 %cond, 0
@@ -67,12 +67,12 @@ endif:
 ; GCN: v_interp_mov_f32
 
 ; TOSMEM-NOT: s_m0
-; TOSMEM: s_mov_b32 m0, s7
+; TOSMEM: s_add_u32 m0, s7, 0x100
 ; TOSMEM-NEXT: s_buffer_store_dword s{{[0-9]+}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 4-byte Folded Spill
 ; TOSMEM-NOT: m0
 
 ; TOSMEM-NOT: m0
-; TOSMEM: s_add_u32 m0, s7, 0x100
+; TOSMEM: s_add_u32 m0, s7, 0x200
 ; TOSMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
 ; TOSMEM-NOT: m0
 
@@ -81,16 +81,16 @@ endif:
 ; TOSMEM: s_branch
 
 ; TOSMEM: BB{{[0-9]+_[0-9]+}}:
-; TOSMEM-NEXT: s_add_u32 m0, s7, 0x100
+; TOSMEM-NEXT: s_add_u32 m0, s7, 0x200
 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
 
 
 ; GCN-NOT: v_readlane_b32 m0
 ; GCN-NOT: s_buffer_store_dword m0
 ; GCN-NOT: s_buffer_load_dword m0
-define amdgpu_ps void @spill_kill_m0_lds(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3) #0 {
+define amdgpu_ps void @spill_kill_m0_lds(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %m0) #0 {
 main_body:
-  %tmp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %arg3)
+  %tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0)
   %cmp = fcmp ueq float 0.000000e+00, %tmp
   br i1 %cmp, label %if, label %else
 
@@ -100,14 +100,13 @@ if:                                               ; preds = %main_body
   br label %endif
 
 else:                                             ; preds = %main_body
-  %interp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %arg3)
+  %interp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0)
   br label %endif
 
 endif:                                            ; preds = %else, %if
   %export = phi float [ %lds_data, %if ], [ %interp, %else ]
-  %tmp4 = call i32 @llvm.SI.packf16(float %export, float %export)
-  %tmp5 = bitcast i32 %tmp4 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp5, float %tmp5, float %tmp5, float %tmp5)
+  %tmp4 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %export, float %export)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp4, <2 x half> %tmp4, i1 true, i1 true) #0
   ret void
 }
 
@@ -122,7 +121,7 @@ endif:                                            ; preds = %else, %if
 ; GCN: ; clobber m0
 
 ; TOSMEM: s_mov_b32 vcc_hi, m0
-; TOSMEM: s_mov_b32 m0, s3
+; TOSMEM: s_add_u32 m0, s3, 0x100
 ; TOSMEM-NEXT: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Spill
 ; TOSMEM: s_mov_b32 m0, vcc_hi
 
@@ -131,16 +130,16 @@ endif:                                            ; preds = %else, %if
 ; TOSMEM: s_branch
 
 ; TOSMEM: BB{{[0-9]+_[0-9]+}}:
-; TOSMEM-NEXT: s_mov_b32 m0, s3
+; TOSMEM-NEXT: s_add_u32 m0, s3, 0x100
 ; TOSMEM-NEXT: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, m0 ; 8-byte Folded Reload
 
 ; GCN-NOT: v_readlane_b32 m0
 ; GCN-NOT: s_buffer_store_dword m0
 ; GCN-NOT: s_buffer_load_dword m0
-define void @m0_unavailable_spill(i32 %arg3) #0 {
+define amdgpu_kernel void @m0_unavailable_spill(i32 %m0.arg) #0 {
 main_body:
   %m0 = call i32 asm sideeffect "; def $0, 1", "={M0}"() #0
-  %tmp = call float @llvm.SI.fs.constant(i32 0, i32 0, i32 %arg3)
+  %tmp = call float @llvm.amdgcn.interp.mov(i32 2, i32 0, i32 0, i32 %m0.arg)
   call void asm sideeffect "; clobber $0", "~{M0}"() #0
   %cmp = fcmp ueq float 0.000000e+00, %tmp
    br i1 %cmp, label %if, label %else
@@ -161,10 +160,10 @@ endif:
 ; TOSMEM: s_load_dwordx2 [[REG:s\[[0-9]+:[0-9]+\]]]
 ; TOSMEM: s_cmp_eq_u32
 ; TOSMEM-NOT: m0
-; TOSMEM: s_mov_b32 m0, s3
+; TOSMEM: s_add_u32 m0, s3, 0x100
 ; TOSMEM: s_buffer_store_dwordx2 [[REG]], s[88:91], m0 ; 8-byte Folded Spill
 ; TOSMEM-NOT: m0
-; TOSMEM: s_add_u32 m0, s3, 0x200
+; TOSMEM: s_add_u32 m0, s3, 0x300
 ; TOSMEM: s_buffer_store_dword s{{[0-9]+}}, s[88:91], m0 ; 4-byte Folded Spill
 ; TOSMEM-NOT: m0
 ; TOSMEM: s_cbranch_scc1
@@ -172,7 +171,7 @@ endif:
 ; TOSMEM: s_mov_b32 m0, -1
 
 ; TOSMEM: s_mov_b32 vcc_hi, m0
-; TOSMEM: s_mov_b32 m0, s3
+; TOSMEM: s_add_u32 m0, s3, 0x100
 ; TOSMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[88:91], m0 ; 8-byte Folded Reload
 ; TOSMEM: s_mov_b32 m0, vcc_hi
 ; TOSMEM: s_waitcnt lgkmcnt(0)
@@ -180,7 +179,7 @@ endif:
 ; TOSMEM: ds_write_b64
 
 ; TOSMEM-NOT: m0
-; TOSMEM: s_add_u32 m0, s3, 0x200
+; TOSMEM: s_add_u32 m0, s3, 0x300
 ; TOSMEM: s_buffer_load_dword s0, s[88:91], m0 ; 4-byte Folded Reload
 ; TOSMEM-NOT: m0
 ; TOSMEM: s_waitcnt lgkmcnt(0)
@@ -190,7 +189,7 @@ endif:
 
 ; TOSMEM: s_dcache_wb
 ; TOSMEM: s_endpgm
-define void @restore_m0_lds(i32 %arg) {
+define amdgpu_kernel void @restore_m0_lds(i32 %arg) {
   %m0 = call i32 asm sideeffect "s_mov_b32 m0, 0", "={M0}"() #0
   %sval = load volatile i64, i64 addrspace(2)* undef
   %cmp = icmp eq i32 %arg, 0
@@ -205,10 +204,10 @@ ret:
   ret void
 }
 
-declare float @llvm.SI.fs.constant(i32, i32, i32) readnone
-
-declare i32 @llvm.SI.packf16(float, float) readnone
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare float @llvm.amdgcn.interp.mov(i32, i32, i32, i32) #1
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
 
 attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/AMDGPU/spill-scavenge-offset.ll b/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
index 9b3dfab2be6a90ae010cb109946c1b51936a0f09..c05021a91ff059b6f099f10b4538b1305304d5a9 100644
--- a/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
+++ b/test/CodeGen/AMDGPU/spill-scavenge-offset.ll
@@ -11,7 +11,7 @@
 
 ; Just test that it compiles successfully.
 ; CHECK-LABEL: test
-define void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test(<1280 x i32> addrspace(1)* %out, <1280 x i32> addrspace(1)* %in) {
 entry:
   %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0)
   %tid = call i32 @llvm.amdgcn.mbcnt.hi(i32 -1, i32 %lo)
diff --git a/test/CodeGen/AMDGPU/spill-wide-sgpr.ll b/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
index cab45be8da5070aca09c54702ee6179501b286d0..ebba35a6689af2fc99a8c4da30b79f76ae77ec9e 100644
--- a/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
+++ b/test/CodeGen/AMDGPU/spill-wide-sgpr.ll
@@ -3,11 +3,11 @@
 ; RUN: llc -O0 -march=amdgcn -mcpu=fiji -amdgpu-spill-sgpr-to-smem=0 -amdgpu-spill-sgpr-to-vgpr=0 -verify-machineinstrs < %s | FileCheck -check-prefix=ALL -check-prefix=VMEM %s
 
 ; ALL-LABEL: {{^}}spill_sgpr_x2:
-; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_add_u32 m0, s3, 0x100{{$}}
 ; SMEM: s_buffer_store_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Spill
 ; SMEM: s_cbranch_scc1
 
-; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_add_u32 m0, s3, 0x100{{$}}
 ; SMEM: s_buffer_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s[8:11], m0 ; 8-byte Folded Reload
 
 ; SMEM: s_dcache_wb
@@ -44,11 +44,11 @@ ret:
 }
 
 ; ALL-LABEL: {{^}}spill_sgpr_x4:
-; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_add_u32 m0, s3, 0x100{{$}}
 ; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 16-byte Folded Spill
 ; SMEM: s_cbranch_scc1
 
-; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_add_u32 m0, s3, 0x100{{$}}
 ; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[12:15], m0 ; 16-byte Folded Reload
 ; SMEM: s_dcache_wb
 ; SMEM: s_endpgm
@@ -93,15 +93,15 @@ ret:
 
 ; ALL-LABEL: {{^}}spill_sgpr_x8:
 
-; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_add_u32 m0, s3, 0x100{{$}}
 ; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Spill
-; SMEM: s_add_u32 m0, s3, 16
+; SMEM: s_add_u32 m0, s3, 0x110{{$}}
 ; SMEM: s_buffer_store_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Spill
 ; SMEM: s_cbranch_scc1
 
-; SMEM: s_mov_b32 m0, s3{{$}}
+; SMEM: s_add_u32 m0, s3, 0x100{{$}}
 ; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Reload
-; SMEM: s_add_u32 m0, s3, 16
+; SMEM: s_add_u32 m0, s3, 0x110{{$}}
 ; SMEM: s_buffer_load_dwordx4 s{{\[[0-9]+:[0-9]+\]}}, s[16:19], m0 ; 16-byte Folded Reload
 
 ; SMEM: s_dcache_wb
diff --git a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
index d4e2dc81405085c22e7c869c147a0d1a07882abd..5d7d29db3a2f59718cfed7a532d06bc415558af4 100644
--- a/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
+++ b/test/CodeGen/AMDGPU/split-scalar-i64-add.ll
@@ -10,7 +10,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() readnone
 ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_0:
 ; SI: v_add_i32_e32 v{{[0-9]+}}, vcc, 0x18f, v{{[0-9]+}}
 ; SI: v_addc_u32_e32 v{{[0-9]+}}, vcc, 0, v{{[0-9]+}}, vcc
-define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) {
+define amdgpu_kernel void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %s.val) {
   %v.val = load volatile i32, i32 addrspace(1)* %in
   %vec.0 = insertelement <2 x i32> undef, i32 %s.val, i32 0
   %vec.1 = insertelement <2 x i32> %vec.0, i32 %v.val, i32 1
@@ -23,7 +23,7 @@ define void @imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 addrspace(1
 ; FUNC-LABEL: {{^}}s_imp_def_vcc_split_i64_add_0:
 ; SI: s_add_u32 {{s[0-9]+}}, {{s[0-9]+}}, 0x18f
 ; SI: s_addc_u32 {{s[0-9]+}}, 0xf423f, 0
-define void @s_imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) {
+define amdgpu_kernel void @s_imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) {
   %vec.0 = insertelement <2 x i32> undef, i32 %val, i32 0
   %vec.1 = insertelement <2 x i32> %vec.0, i32 999999, i32 1
   %bc = bitcast <2 x i32> %vec.1 to i64
@@ -35,7 +35,7 @@ define void @s_imp_def_vcc_split_i64_add_0(i64 addrspace(1)* %out, i32 %val) {
 ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_1:
 ; SI: v_add_i32
 ; SI: v_addc_u32
-define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
+define amdgpu_kernel void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
   %v.val = load volatile i32, i32 addrspace(1)* %in
   %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0
   %vec.1 = insertelement <2 x i32> %vec.0, i32 %v.val, i32 1
@@ -48,7 +48,7 @@ define void @imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 addrspace(1
 ; FUNC-LABEL: {{^}}s_imp_def_vcc_split_i64_add_1:
 ; SI: s_add_u32 {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
 ; SI: s_addc_u32 {{s[0-9]+}}, 0x1869f, {{s[0-9]+}}
-define void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) {
+define amdgpu_kernel void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i64 %val1) {
   %vec.0 = insertelement <2 x i32> undef, i32 %val0, i32 0
   %vec.1 = insertelement <2 x i32> %vec.0, i32 99999, i32 1
   %bc = bitcast <2 x i32> %vec.1 to i64
@@ -61,7 +61,7 @@ define void @s_imp_def_vcc_split_i64_add_1(i64 addrspace(1)* %out, i32 %val0, i6
 ; FUNC-LABEL: {{^}}imp_def_vcc_split_i64_add_2:
 ; SI: v_add_i32_e32 {{v[0-9]+}}, vcc, {{s[0-9]+}}, {{v[0-9]+}}
 ; SI: v_addc_u32_e32 {{v[0-9]+}}, vcc, {{v[0-9]+}}, {{v[0-9]+}}, vcc
-define void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
+define amdgpu_kernel void @imp_def_vcc_split_i64_add_2(i64 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %val0, i64 %val1) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() readnone
   %gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %load = load i32, i32 addrspace(1)* %gep
diff --git a/test/CodeGen/AMDGPU/split-smrd.ll b/test/CodeGen/AMDGPU/split-smrd.ll
index d07da103093671ddb257940b2a7a5d1044bf67ec..cdb1b1e3b5032de3a5b3b5c1adfdd6f35914bdff 100644
--- a/test/CodeGen/AMDGPU/split-smrd.ll
+++ b/test/CodeGen/AMDGPU/split-smrd.ll
@@ -1,11 +1,11 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
 
 ; FIXME: Move this to sgpr-copy.ll when this is fixed on VI.
 ; Make sure that when we split an smrd instruction in order to move it to
 ; the VALU, we are also moving its users to the VALU.
-; CHECK-LABEL: {{^}}split_smrd_add_worklist:
-; CHECK: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
 
+; GCN-LABEL: {{^}}split_smrd_add_worklist:
+; GCN: image_sample v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}], s[{{[0-9]+:[0-9]+}}] dmask:0x1
 define amdgpu_ps void @split_smrd_add_worklist([34 x <8 x i32>] addrspace(2)* byval %arg) #0 {
 bb:
   %tmp = call float @llvm.SI.load.const(<16 x i8> undef, i32 96)
@@ -21,27 +21,22 @@ bb3:                                              ; preds = %bb
   %tmp6 = sext i32 %tmp5 to i64
   %tmp7 = getelementptr [34 x <8 x i32>], [34 x <8 x i32>] addrspace(2)* %arg, i64 0, i64 %tmp6
   %tmp8 = load <8 x i32>, <8 x i32> addrspace(2)* %tmp7, align 32, !tbaa !0
-  %tmp9 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 1061158912, i32 1048576000>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp9 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float bitcast (i32 1061158912 to float), float bitcast (i32 1048576000 to float)>, <8 x i32> %tmp8, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp10 = extractelement <4 x float> %tmp9, i32 0
-  %tmp12 = call i32 @llvm.SI.packf16(float %tmp10, float undef)
-  %tmp13 = bitcast i32 %tmp12 to float
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float undef, float %tmp13, float undef, float undef)
+  %tmp12 = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %tmp10, float undef)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %tmp12, <2 x half> undef, i1 true, i1 true) #0
   ret void
 }
 
-; Function Attrs: nounwind readnone
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) #1
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
 declare float @llvm.SI.load.const(<16 x i8>, i32) #1
 
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
-declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-
-declare i32 @llvm.SI.packf16(float, float) #1
-
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
 
 !0 = !{!1, !1, i64 0, i32 1}
-!1 = !{!"const", !3}
-!2 = !{!1, !1, i64 0}
-!3 = !{!"tbaa root"}
+!1 = !{!"const", !2}
+!2 = !{!"tbaa root"}
diff --git a/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll b/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
index 37ec2b012896f7ae7cfe227234efca63581ad27e..c2426993bb3a17683a4e58bd4195ecf8ae3aa6f9 100644
--- a/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
+++ b/test/CodeGen/AMDGPU/split-vector-memoperand-offsets.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=amdgcn -mcpu=hawaii -verify-machineinstrs -mattr=-promote-alloca,-load-store-opt < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=hawaii -enable-amdgpu-aa=0 -verify-machineinstrs -mattr=-promote-alloca,-load-store-opt < %s | FileCheck -check-prefix=GCN %s
 
 @sPrivateStorage = internal addrspace(3) global [256 x [8 x <4 x i64>]] undef
 
@@ -29,7 +29,7 @@
 ; GCN-DAG: buffer_store_dwordx2 {{v\[[0-9]+:[0-9]+\]}}, {{v\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:24
 
 ; GCN: s_endpgm
-define void @ds_reorder_vector_split(<4 x i64> addrspace(1)* nocapture readonly %srcValues, i32 addrspace(1)* nocapture readonly %offsets, <4 x i64> addrspace(1)* nocapture %destBuffer, i32 %alignmentOffset) #0 {
+define amdgpu_kernel void @ds_reorder_vector_split(<4 x i64> addrspace(1)* nocapture readonly %srcValues, i32 addrspace(1)* nocapture readonly %offsets, <4 x i64> addrspace(1)* nocapture %destBuffer, i32 %alignmentOffset) #0 {
 entry:
   %tmp = tail call i32 @llvm.r600.read.local.size.y()
   %tmp1 = tail call i32 @llvm.r600.read.local.size.z()
diff --git a/test/CodeGen/AMDGPU/splitkit.mir b/test/CodeGen/AMDGPU/splitkit.mir
new file mode 100644
index 0000000000000000000000000000000000000000..41782af40e3cb76a0022688b6addbf101aa16ed8
--- /dev/null
+++ b/test/CodeGen/AMDGPU/splitkit.mir
@@ -0,0 +1,105 @@
+# RUN: llc -o - %s -mtriple=amdgcn-- -mcpu=fiji -verify-machineinstrs -run-pass=greedy,virtregrewriter | FileCheck %s
+--- |
+  define amdgpu_kernel void @func0() #0 { ret void }
+  define amdgpu_kernel void @func1() #0 { ret void }
+  define amdgpu_kernel void @splitHoist() #0 { ret void }
+
+  attributes #0 = { "amdgpu-num-sgpr"="12" }
+...
+---
+# Make sure we only get a single spill+reload even if liverange splitting
+# created a sequence of multiple copy instructions.
+# CHECK-LABEL: name: func0
+# CHECK: SI_SPILL_S128_SAVE
+# CHECK-NOT: SI_SPILL_S128_SAVE
+# CHECK: S_NOP 0
+# CHECK: SI_SPILL_S128_RESTORE
+# CHECK-NOT: SI_SPILL_S128_RESTORE
+name: func0
+body: |
+  bb.0:
+    S_NOP 0, implicit-def undef %0.sub0 : sreg_128
+    S_NOP 0, implicit-def %0.sub3 : sreg_128
+
+    ; Clobber registers
+    S_NOP 0, implicit-def dead %sgpr0, implicit-def dead %sgpr1, implicit-def dead %sgpr2, implicit-def dead %sgpr3, implicit-def dead %sgpr4, implicit-def dead %sgpr5, implicit-def dead %sgpr6, implicit-def dead %sgpr7, implicit-def dead %sgpr8, implicit-def dead %sgpr9, implicit-def dead %sgpr10, implicit-def dead %sgpr11
+
+    S_NOP 0, implicit %0.sub0
+    S_NOP 0, implicit %0.sub3
+    S_NOP 0, implicit %0.sub0
+    S_NOP 0, implicit %0.sub3
+...
+---
+# LiveRange splitting should split this into 2 intervals with the second getting
+# allocated to sgpr0_sgpr1 and the first to something else so we see two copies
+# in between for the two subregisters that are alive.
+# CHECK-LABEL: name: func1
+# CHECK: [[REG0:%sgpr[0-9]+]] = COPY %sgpr0
+# CHECK: [[REG1:%sgpr[0-9]+]] = COPY %sgpr2
+# CHECK: S_NOP 0
+# CHECK: S_NOP 0, implicit [[REG0]]
+# CHECK: S_NOP 0, implicit [[REG1]]
+# CHECK: %sgpr0 = COPY [[REG0]]
+# CHECK: %sgpr2 = COPY [[REG1]]
+# CHECK: S_NOP
+# CHECK: S_NOP 0, implicit %sgpr0
+# CHECK: S_NOP 0, implicit %sgpr2
+name: func1
+tracksRegLiveness: true
+body: |
+  bb.0:
+    liveins: %sgpr0, %sgpr1, %sgpr2
+    undef %0.sub0 : sreg_128 = COPY %sgpr0
+    %0.sub2 = COPY %sgpr2
+
+    S_NOP 0, implicit-def dead %sgpr0, implicit-def dead %sgpr1
+
+    S_NOP 0, implicit %0.sub0
+    S_NOP 0, implicit %0.sub2
+
+    ; Clobber everything but sgpr0-sgpr3
+    S_NOP 0, implicit-def dead %sgpr4, implicit-def dead %sgpr5, implicit-def dead %sgpr6, implicit-def dead %sgpr7, implicit-def dead %sgpr8, implicit-def dead %sgpr9, implicit-def dead %sgpr10, implicit-def dead %sgpr11, implicit-def dead %sgpr12, implicit-def dead %sgpr13, implicit-def dead %sgpr14, implicit-def dead %sgpr15, implicit-def dead %vcc_lo, implicit-def dead %vcc_hi
+
+    S_NOP 0, implicit %0.sub0
+    S_NOP 0, implicit %0.sub2
+...
+---
+# Check that copy hoisting out of loops works. This mainly should not crash the
+# compiler when it hoists a subreg copy sequence.
+# CHECK-LABEL: name: splitHoist
+# CHECK: S_NOP 0, implicit-def %sgpr0
+# CHECK: S_NOP 0, implicit-def %sgpr3
+# CHECK-NEXT: SI_SPILL_S128_SAVE
+name: splitHoist
+tracksRegLiveness: true
+body: |
+  bb.0:
+    successors: %bb.1, %bb.2
+    S_NOP 0, implicit-def undef %0.sub0 : sreg_128
+    S_NOP 0, implicit-def %0.sub3 : sreg_128
+
+    S_CBRANCH_VCCNZ %bb.1, implicit undef %vcc
+    S_BRANCH %bb.2
+
+  bb.1:
+    successors: %bb.1, %bb.3
+    S_NOP 0, implicit %0.sub0
+
+    ; Clobber registers
+    S_NOP 0, implicit-def dead %sgpr0, implicit-def dead %sgpr1, implicit-def dead %sgpr2, implicit-def dead %sgpr3, implicit-def dead %sgpr4, implicit-def dead %sgpr5, implicit-def dead %sgpr6, implicit-def dead %sgpr7, implicit-def dead %sgpr8, implicit-def dead %sgpr9, implicit-def dead %sgpr10, implicit-def dead %sgpr11
+
+    S_CBRANCH_VCCNZ %bb.1, implicit undef %vcc
+    S_BRANCH %bb.3
+
+  bb.2:
+    successors: %bb.3
+    ; Clobber registers
+    S_NOP 0, implicit-def dead %sgpr0, implicit-def dead %sgpr1, implicit-def dead %sgpr2, implicit-def dead %sgpr3, implicit-def dead %sgpr4, implicit-def dead %sgpr5, implicit-def dead %sgpr6, implicit-def dead %sgpr7, implicit-def dead %sgpr8, implicit-def dead %sgpr9, implicit-def dead %sgpr10, implicit-def dead %sgpr11
+    S_BRANCH %bb.3
+
+  bb.3:
+    S_NOP 0, implicit %0.sub0
+    S_NOP 0, implicit %0.sub3
+    S_NOP 0, implicit %0.sub0
+    S_NOP 0, implicit %0.sub3
+...
diff --git a/test/CodeGen/AMDGPU/sra.ll b/test/CodeGen/AMDGPU/sra.ll
index ad7c86fe7919c26293d03dfa99556d3c1cd23675..b4355b76016a1ba380c35f010e94ab904928ff81 100644
--- a/test/CodeGen/AMDGPU/sra.ll
+++ b/test/CodeGen/AMDGPU/sra.ll
@@ -13,7 +13,7 @@ declare i32 @llvm.r600.read.tidig.x() #0
 
 ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
@@ -37,7 +37,7 @@ define void @ashr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
 ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: ASHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -49,9 +49,9 @@ define void @ashr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
 ; FUNC-LABEL: {{^}}ashr_v2i16:
 ; FIXME: The ashr operation is uniform, but because its operands come from a
 ; global load we end up with the vector instructions rather than scalar.
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+define amdgpu_kernel void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
   %a = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %b = load <2 x i16>, <2 x i16> addrspace(1)* %b_ptr
@@ -63,11 +63,11 @@ define void @ashr_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %i
 ; FUNC-LABEL: {{^}}ashr_v4i16:
 ; FIXME: The ashr operation is uniform, but because its operands come from a
 ; global load we end up with the vector instructions rather than scalar.
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-; VI: v_ashrrev_i32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:WORD_0
+; VI: v_ashrrev_i32_sdwa v{{[0-9]+}}, sext(v{{[0-9]+}}), sext(v{{[0-9]+}}) dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+define amdgpu_kernel void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
   %a = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %b = load <4 x i16>, <4 x i16> addrspace(1)* %b_ptr
@@ -80,7 +80,7 @@ define void @ashr_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %i
 ; GCN: s_ashr_i64 s[{{[0-9]}}:{{[0-9]}}], s[{{[0-9]}}:{{[0-9]}}], 8
 
 ; EG: ASHR
-define void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @s_ashr_i64(i64 addrspace(1)* %out, i32 %in) {
 entry:
   %in.ext = sext i32 %in to i64
   %ashr = ashr i64 %in.ext, 8
@@ -105,7 +105,7 @@ entry:
 ; EG-DAG: SETGT_UINT {{\*? *}}[[RESC:T[0-9]+\.[XYZW]]], [[SHIFT]], literal
 ; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
 ; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]}}
-define void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_i64_2(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 entry:
   %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
   %a = load i64, i64 addrspace(1)* %in
@@ -143,7 +143,7 @@ entry:
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
-define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
   %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
   %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
@@ -156,7 +156,7 @@ define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i
 ; XFUNC-LABEL: {{^}}s_ashr_v2i64:
 ; XGCN: s_ashr_i64 {{s\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], s[0-9]+}}
 ; XGCN: s_ashr_i64 {{s\[[0-9]+:[0-9]+\], s\[[0-9]+:[0-9]+\], s[0-9]+}}
-; define void @s_ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in, <2 x i64> %a, <2 x i64> %b) {
+; define amdgpu_kernel void @s_ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in, <2 x i64> %a, <2 x i64> %b) {
 ;   %result = ashr <2 x i64> %a, %b
 ;   store <2 x i64> %result, <2 x i64> addrspace(1)* %out
 ;   ret void
@@ -221,7 +221,7 @@ define void @ashr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
-define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
   %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
   %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
@@ -235,7 +235,7 @@ define void @ashr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %i
 ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31
 ; GCN: s_add_u32 s{{[0-9]+}}, s[[HI]], s{{[0-9]+}}
 ; GCN: s_addc_u32 s{{[0-9]+}}, s[[SHIFT]], s{{[0-9]+}}
-define void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %result = ashr i64 %a, 32
   %add = add i64 %result, %b
   store i64 %add, i64 addrspace(1)* %out
@@ -247,7 +247,7 @@ define void @s_ashr_32_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 ; VI: flat_load_dword v[[HI:[0-9]+]]
 ; GCN: v_ashrrev_i32_e32 v[[SHIFT:[0-9]+]], 31, v[[HI]]
 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[HI]]:[[SHIFT]]{{\]}}
-define void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
@@ -262,7 +262,7 @@ define void @v_ashr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ; GCN: s_ashr_i32 s[[SHIFT:[0-9]+]], s[[HI]], 31
 ; GCN: s_add_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}}
 ; GCN: s_addc_u32 {{s[0-9]+}}, s[[SHIFT]], {{s[0-9]+}}
-define void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %result = ashr i64 %a, 63
   %add = add i64 %result, %b
   store i64 %add, i64 addrspace(1)* %out
@@ -275,7 +275,7 @@ define void @s_ashr_63_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 ; GCN: v_ashrrev_i32_e32 v[[SHIFT:[0-9]+]], 31, v[[HI]]
 ; GCN: v_mov_b32_e32 v[[COPY:[0-9]+]], v[[SHIFT]]
 ; GCN: {{buffer|flat}}_store_dwordx2 {{.*}}v{{\[}}[[SHIFT]]:[[COPY]]{{\]}}
-define void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_ashr_63_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
diff --git a/test/CodeGen/AMDGPU/srem.ll b/test/CodeGen/AMDGPU/srem.ll
index c78fd549b31659230a160ee9f851fa6dbb9dd79f..c89f798397ae60c7321ed9dd654b34b29eebc346 100644
--- a/test/CodeGen/AMDGPU/srem.ll
+++ b/test/CodeGen/AMDGPU/srem.ll
@@ -2,7 +2,7 @@
 ; RUN: llc -march=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
 ; RUN: llc -march=r600 -mcpu=redwood < %s
 
-define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in
   %den = load i32, i32 addrspace(1) * %den_ptr
@@ -11,7 +11,7 @@ define void @srem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   ret void
 }
 
-define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %num = load i32, i32 addrspace(1) * %in
   %result = srem i32 %num, 4
   store i32 %result, i32 addrspace(1)* %out
@@ -24,14 +24,14 @@ define void @srem_i32_4(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; SI: v_mul_lo_i32
 ; SI: v_sub_i32
 ; SI: s_endpgm
-define void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @srem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %num = load i32, i32 addrspace(1) * %in
   %result = srem i32 %num, 7
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
 
-define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %den_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %den = load <2 x i32>, <2 x i32> addrspace(1) * %den_ptr
@@ -40,14 +40,14 @@ define void @srem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
   ret void
 }
 
-define void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v2i32_4(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %num = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %result = srem <2 x i32> %num, <i32 4, i32 4>
   store <2 x i32> %result, <2 x i32> addrspace(1)* %out
   ret void
 }
 
-define void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %den_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %den = load <4 x i32>, <4 x i32> addrspace(1) * %den_ptr
@@ -56,14 +56,14 @@ define void @srem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
   ret void
 }
 
-define void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v4i32_4(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %num = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %result = srem <4 x i32> %num, <i32 4, i32 4, i32 4, i32 4>
   store <4 x i32> %result, <4 x i32> addrspace(1)* %out
   ret void
 }
 
-define void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %den_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
   %num = load i64, i64 addrspace(1) * %in
   %den = load i64, i64 addrspace(1) * %den_ptr
@@ -72,14 +72,14 @@ define void @srem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   ret void
 }
 
-define void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @srem_i64_4(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %num = load i64, i64 addrspace(1) * %in
   %result = srem i64 %num, 4
   store i64 %result, i64 addrspace(1)* %out
   ret void
 }
 
-define void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %den_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
   %num = load <2 x i64>, <2 x i64> addrspace(1) * %in
   %den = load <2 x i64>, <2 x i64> addrspace(1) * %den_ptr
@@ -88,14 +88,14 @@ define void @srem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i
   ret void
 }
 
-define void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v2i64_4(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %num = load <2 x i64>, <2 x i64> addrspace(1) * %in
   %result = srem <2 x i64> %num, <i64 4, i64 4>
   store <2 x i64> %result, <2 x i64> addrspace(1)* %out
   ret void
 }
 
-define void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %den_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
   %num = load <4 x i64>, <4 x i64> addrspace(1) * %in
   %den = load <4 x i64>, <4 x i64> addrspace(1) * %den_ptr
@@ -104,7 +104,7 @@ define void @srem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %i
   ret void
 }
 
-define void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @srem_v4i64_4(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %num = load <4 x i64>, <4 x i64> addrspace(1) * %in
   %result = srem <4 x i64> %num, <i64 4, i64 4, i64 4, i64 4>
   store <4 x i64> %result, <4 x i64> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/srl.ll b/test/CodeGen/AMDGPU/srl.ll
index 6b006fd936d741245507c341bfb25381f7d016fc..1daf4bb33e819e2791bb66c3de49a22605dde420 100644
--- a/test/CodeGen/AMDGPU/srl.ll
+++ b/test/CodeGen/AMDGPU/srl.ll
@@ -8,7 +8,7 @@ declare i32 @llvm.r600.read.tidig.x() #0
 ; SI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; VI: v_lshrrev_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -26,7 +26,7 @@ define void @lshr_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 
 ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
@@ -50,7 +50,7 @@ define void @lshr_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
 ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; EG: LSHR {{\*? *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -74,7 +74,7 @@ define void @lshr_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
 ; EG-DAG: CNDE_INT {{\*? *}}[[RESLO:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW]|PS}}
 ; EG-DAG: LSHR {{\*? *}}[[LOBIG:T[0-9]+\.[XYZW]]], [[OPHI]], [[SHIFT]]
 ; EG-DAG: CNDE_INT {{\*? *}}[[RESHI:T[0-9]+\.[XYZW]]], {{T[0-9]+\.[XYZW], .*}}, 0.0
-define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
   %a = load i64, i64 addrspace(1)* %in
   %b = load i64, i64 addrspace(1)* %b_ptr
@@ -112,7 +112,7 @@ define void @lshr_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ; EG-DAG: CNDE_INT {{.*}}, 0.0
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
-define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
   %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
   %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
@@ -178,7 +178,7 @@ define void @lshr_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %i
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
 ; EG-DAG: CNDE_INT
-define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
   %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
   %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
@@ -193,7 +193,7 @@ define void @lshr_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %i
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}}
 ; GCN-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], [[HI_A]]
 ; GCN: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {
   %result = lshr i64 %a, 32
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -203,7 +203,7 @@ define void @s_lshr_32_i64(i64 addrspace(1)* %out, i64 %a) {
 ; GCN-DAG: buffer_load_dword v[[HI_A:[0-9]+]], v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:4
 ; GCN-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], 0{{$}}
 ; GCN: buffer_store_dwordx2 v{{\[}}[[HI_A]]:[[VHI]]{{\]}}
-define void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_lshr_32_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() #0
   %gep.in = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %gep.out = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
diff --git a/test/CodeGen/AMDGPU/ssubo.ll b/test/CodeGen/AMDGPU/ssubo.ll
index 26884a1b776148105373e2b26debdcd623f2743a..135632343f9094054bee513c418526e785e5c086 100644
--- a/test/CodeGen/AMDGPU/ssubo.ll
+++ b/test/CodeGen/AMDGPU/ssubo.ll
@@ -6,7 +6,7 @@ declare { i32, i1 } @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
 declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
 
 ; FUNC-LABEL: {{^}}ssubo_i64_zext:
-define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
   %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %ssub, 0
   %carry = extractvalue { i64, i1 } %ssub, 1
@@ -17,7 +17,7 @@ define void @ssubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) nounwind {
 }
 
 ; FUNC-LABEL: {{^}}s_ssubo_i32:
-define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
+define amdgpu_kernel void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) nounwind {
   %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
   %val = extractvalue { i32, i1 } %ssub, 0
   %carry = extractvalue { i32, i1 } %ssub, 1
@@ -27,7 +27,7 @@ define void @s_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
 }
 
 ; FUNC-LABEL: {{^}}v_ssubo_i32:
-define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %aptr, i32 addrspace(1)* %bptr) nounwind {
   %a = load i32, i32 addrspace(1)* %aptr, align 4
   %b = load i32, i32 addrspace(1)* %bptr, align 4
   %ssub = call { i32, i1 } @llvm.ssub.with.overflow.i32(i32 %a, i32 %b) nounwind
@@ -41,7 +41,7 @@ define void @v_ssubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
 ; FUNC-LABEL: {{^}}s_ssubo_i64:
 ; SI: s_sub_u32
 ; SI: s_subb_u32
-define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) nounwind {
   %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
   %val = extractvalue { i64, i1 } %ssub, 0
   %carry = extractvalue { i64, i1 } %ssub, 1
@@ -53,7 +53,7 @@ define void @s_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64
 ; FUNC-LABEL: {{^}}v_ssubo_i64:
 ; SI: v_sub_i32_e32
 ; SI: v_subb_u32_e32
-define void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
+define amdgpu_kernel void @v_ssubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %aptr, i64 addrspace(1)* %bptr) nounwind {
   %a = load i64, i64 addrspace(1)* %aptr, align 4
   %b = load i64, i64 addrspace(1)* %bptr, align 4
   %ssub = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 %a, i64 %b) nounwind
diff --git a/test/CodeGen/AMDGPU/store-barrier.ll b/test/CodeGen/AMDGPU/store-barrier.ll
index 57a93ccd2505560bbaafb2e0e38f36d0e33f7964..afa4e94222cd9f6fb6f70b48c81318989744dbcb 100644
--- a/test/CodeGen/AMDGPU/store-barrier.ll
+++ b/test/CodeGen/AMDGPU/store-barrier.ll
@@ -12,7 +12,7 @@
 ; CHECK: s_barrier
 ; CHECK: s_endpgm
 ; Function Attrs: nounwind
-define void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) #0 {
+define amdgpu_kernel void @test(<2 x i8> addrspace(3)* nocapture %arg, <2 x i8> addrspace(1)* nocapture readonly %arg1, i32 addrspace(1)* nocapture readonly %arg2, <2 x i8> addrspace(1)* nocapture %arg3, i32 %arg4, i64 %tmp9) #0 {
 bb:
   %tmp10 = getelementptr inbounds i32, i32 addrspace(1)* %arg2, i64 %tmp9
   %tmp13 = load i32, i32 addrspace(1)* %tmp10, align 2
diff --git a/test/CodeGen/AMDGPU/store-global.ll b/test/CodeGen/AMDGPU/store-global.ll
index 5d49795a68eca84320fbdf273962939fcfa733e4..160e921fc075995ddad68eeb0ca8460864782600 100644
--- a/test/CodeGen/AMDGPU/store-global.ll
+++ b/test/CodeGen/AMDGPU/store-global.ll
@@ -11,7 +11,7 @@
 ; CM-NOT: MEM_RAT MSKOR
 
 ; GCN: buffer_store_byte
-define void @store_i1(i1 addrspace(1)* %out) {
+define amdgpu_kernel void @store_i1(i1 addrspace(1)* %out) {
 entry:
   store i1 true, i1 addrspace(1)* %out
   ret void
@@ -42,7 +42,7 @@ entry:
 
 ; GCN: buffer_store_byte
 
-define void @store_i8(i8 addrspace(1)* %out, i8 %in) {
+define amdgpu_kernel void @store_i8(i8 addrspace(1)* %out, i8 %in) {
 entry:
   store i8 %in, i8 addrspace(1)* %out
   ret void
@@ -75,7 +75,7 @@ entry:
 ; EG: MOV * T[[RW_GPR]].Z, 0.0
 
 ; GCN: buffer_store_short
-define void @store_i16(i16 addrspace(1)* %out, i16 %in) {
+define amdgpu_kernel void @store_i16(i16 addrspace(1)* %out, i16 %in) {
 entry:
   store i16 %in, i16 addrspace(1)* %out
   ret void
@@ -88,7 +88,7 @@ entry:
 
 ; EG: MEM_RAT MSKOR
 ; EG: MEM_RAT MSKOR
-define void @store_i24(i24 addrspace(1)* %out, i24 %in) {
+define amdgpu_kernel void @store_i24(i24 addrspace(1)* %out, i24 %in) {
 entry:
   store i24 %in, i24 addrspace(1)* %out
   ret void
@@ -104,7 +104,7 @@ entry:
 
 ; CM: MEM_RAT_CACHELESS STORE_DWORD
 ; CM-NOT: MEM_RAT
-define void @store_i25(i25 addrspace(1)* %out, i25 %in) {
+define amdgpu_kernel void @store_i25(i25 addrspace(1)* %out, i25 %in) {
 entry:
   store i25 %in, i25 addrspace(1)* %out
   ret void
@@ -119,7 +119,7 @@ entry:
 ; CM-NOT: MEM_RAT MSKOR
 
 ; GCN: buffer_store_short
-define void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i8>
   store <2 x i8> %0, <2 x i8> addrspace(1)* %out
@@ -136,7 +136,7 @@ entry:
 ; CM-NOT: MEM_RAT MSKOR
 
 ; SI: buffer_store_byte
-define void @store_v2i8_unaligned(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(1)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i8>
   store <2 x i8> %0, <2 x i8> addrspace(1)* %out, align 1
@@ -150,7 +150,7 @@ entry:
 ; CM: MEM_RAT_CACHELESS STORE_DWORD
 
 ; GCN: buffer_store_dword
-define void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i16>
   store <2 x i16> %0, <2 x i16> addrspace(1)* %out
@@ -170,7 +170,7 @@ entry:
 
 ; SI: buffer_store_short
 ; SI: buffer_store_short
-define void @store_v2i16_unaligned(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(1)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i16>
   store <2 x i16> %0, <2 x i16> addrspace(1)* %out, align 2
@@ -183,7 +183,7 @@ entry:
 ; CM: MEM_RAT_CACHELESS STORE_DWORD
 
 ; GCN: buffer_store_dword
-define void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
   store <4 x i8> %0, <4 x i8> addrspace(1)* %out
@@ -210,7 +210,7 @@ entry:
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
 ; SI-NOT: buffer_store_dword
-define void @store_v4i8_unaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
   store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 1
@@ -231,7 +231,7 @@ entry:
 ; SI: buffer_store_short
 ; SI: buffer_store_short
 ; SI-NOT: buffer_store_dword
-define void @store_v4i8_halfaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
   store <4 x i8> %0, <4 x i8> addrspace(1)* %out, align 2
@@ -246,7 +246,7 @@ entry:
 
 ; GCN: buffer_store_dword
 
-define void @store_f32(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @store_f32(float addrspace(1)* %out, float %in) {
   store float %in, float addrspace(1)* %out
   ret void
 }
@@ -257,7 +257,7 @@ define void @store_f32(float addrspace(1)* %out, float %in) {
 ; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}}
 
 ; GCN: buffer_store_dwordx2
-define void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i16>
   store <4 x i16> %0, <4 x i16> addrspace(1)* %out
@@ -272,7 +272,7 @@ entry:
 
 ; GCN: buffer_store_dwordx2
 
-define void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
+define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(1)* %out, float %a, float %b) {
 entry:
   %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
   %1 = insertelement <2 x float> %0, float %b, i32 1
@@ -286,7 +286,7 @@ entry:
 
 ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.[XYZW]}}, {{T[0-9]+\.[XYZW]}},
 ; EG-DAG: MEM_RAT_CACHELESS STORE_RAW {{T[0-9]+\.XY}}, {{T[0-9]+\.[XYZW]}},
-define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind {
+define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind {
   store <3 x i32> %a, <3 x i32> addrspace(1)* %out, align 16
   ret void
 }
@@ -299,7 +299,7 @@ define void @store_v3i32(<3 x i32> addrspace(1)* %out, <3 x i32> %a) nounwind {
 ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
 
 ; GCN: buffer_store_dwordx4
-define void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(1)* %out
   ret void
@@ -313,7 +313,7 @@ entry:
 ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
 
 ; SI: buffer_store_dwordx4
-define void @store_v4i32_unaligned(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(1)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(1)* %out, align 4
   ret void
@@ -328,7 +328,7 @@ entry:
 ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
 
 ; GCN: buffer_store_dwordx4
-define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
+define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in) {
   %1 = load <4 x float>, <4 x float> addrspace(1) * %in
   store <4 x float> %1, <4 x float> addrspace(1)* %out
   ret void
@@ -340,7 +340,7 @@ define void @store_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1
 ; CM: MEM_RAT MSKOR
 
 ; GCN: buffer_store_byte
-define void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @store_i64_i8(i8 addrspace(1)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i8
   store i8 %0, i8 addrspace(1)* %out
@@ -350,7 +350,7 @@ entry:
 ; FUNC-LABEL: {{^}}store_i64_i16:
 ; EG: MEM_RAT MSKOR
 ; GCN: buffer_store_short
-define void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @store_i64_i16(i16 addrspace(1)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i16
   store i16 %0, i16 addrspace(1)* %out
@@ -369,7 +369,7 @@ entry:
 ; CM-NOT: MEM_RAT_CACHELESS STORE_DWORD
 
 ; GCN: buffer_store_dwordx2
-define void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
+define amdgpu_kernel void @vecload2(i32 addrspace(1)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
 entry:
   %0 = load i32, i32 addrspace(2)* %mem, align 4
   %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1
@@ -388,7 +388,7 @@ entry:
 ; CM: MEM_RAT_CACHELESS STORE_DWORD T{{[0-9]+}}, T{{[0-9]+}}.X
 
 ; GCN: buffer_store_dwordx4
-define void @i128-const-store(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @i128-const-store(i32 addrspace(1)* %out) {
 entry:
   store i32 1, i32 addrspace(1)* %out, align 4
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
diff --git a/test/CodeGen/AMDGPU/store-local.ll b/test/CodeGen/AMDGPU/store-local.ll
index 03fd30ca9a2599020de8ad5fd673e579794e30ad..c144bf2aa878c9b04605e81957ee547cf0e35633 100644
--- a/test/CodeGen/AMDGPU/store-local.ll
+++ b/test/CodeGen/AMDGPU/store-local.ll
@@ -9,7 +9,7 @@
 ; CM: LDS_BYTE_WRITE
 
 ; GCN: ds_write_b8
-define void @store_local_i1(i1 addrspace(3)* %out) {
+define amdgpu_kernel void @store_local_i1(i1 addrspace(3)* %out) {
 entry:
   store i1 true, i1 addrspace(3)* %out
   ret void
@@ -21,7 +21,7 @@ entry:
 ; CM: LDS_BYTE_WRITE
 
 ; GCN: ds_write_b8
-define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
+define amdgpu_kernel void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
   store i8 %in, i8 addrspace(3)* %out
   ret void
 }
@@ -32,7 +32,7 @@ define void @store_local_i8(i8 addrspace(3)* %out, i8 %in) {
 ; CM: LDS_SHORT_WRITE
 
 ; GCN: ds_write_b16
-define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
+define amdgpu_kernel void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
   store i16 %in, i16 addrspace(3)* %out
   ret void
 }
@@ -43,7 +43,7 @@ define void @store_local_i16(i16 addrspace(3)* %out, i16 %in) {
 ; CM: LDS_WRITE
 
 ; GCN: ds_write_b32
-define void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
+define amdgpu_kernel void @store_local_v2i16(<2 x i16> addrspace(3)* %out, <2 x i16> %in) {
 entry:
   store <2 x i16> %in, <2 x i16> addrspace(3)* %out
   ret void
@@ -55,7 +55,7 @@ entry:
 ; CM: LDS_WRITE
 
 ; GCN: ds_write_b32
-define void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
+define amdgpu_kernel void @store_local_v4i8(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(3)* %out
   ret void
@@ -78,7 +78,7 @@ entry:
 ; GCN: ds_write_b8
 ; GCN: ds_write_b8
 ; GCN: ds_write_b8
-define void @store_local_v4i8_unaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
+define amdgpu_kernel void @store_local_v4i8_unaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(3)* %out, align 1
   ret void
@@ -95,7 +95,7 @@ entry:
 
 ; GCN: ds_write_b16
 ; GCN: ds_write_b16
-define void @store_local_v4i8_halfaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
+define amdgpu_kernel void @store_local_v4i8_halfaligned(<4 x i8> addrspace(3)* %out, <4 x i8> %in) {
 entry:
   store <4 x i8> %in, <4 x i8> addrspace(3)* %out, align 2
   ret void
@@ -111,7 +111,7 @@ entry:
 ; CM-NOT: LDS_WRITE
 
 ; GCN: ds_write_b64
-define void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_local_v2i32(<2 x i32> addrspace(3)* %out, <2 x i32> %in) {
 entry:
   store <2 x i32> %in, <2 x i32> addrspace(3)* %out
   ret void
@@ -129,7 +129,7 @@ entry:
 ; CM: LDS_WRITE
 
 ; GCN: ds_write2_b64
-define void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_local_v4i32(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(3)* %out
   ret void
@@ -148,7 +148,7 @@ entry:
 
 ; GCN: ds_write2_b32
 ; GCN: ds_write2_b32
-define void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_local_v4i32_align4(<4 x i32> addrspace(3)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(3)* %out, align 4
   ret void
@@ -157,7 +157,7 @@ entry:
 ; FUNC-LABEL: {{^}}store_local_i64_i8:
 ; EG: LDS_BYTE_WRITE
 ; GCN: ds_write_b8
-define void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) {
+define amdgpu_kernel void @store_local_i64_i8(i8 addrspace(3)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i8
   store i8 %0, i8 addrspace(3)* %out
@@ -167,7 +167,7 @@ entry:
 ; FUNC-LABEL: {{^}}store_local_i64_i16:
 ; EG: LDS_SHORT_WRITE
 ; GCN: ds_write_b16
-define void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) {
+define amdgpu_kernel void @store_local_i64_i16(i16 addrspace(3)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i16
   store i16 %0, i16 addrspace(3)* %out
diff --git a/test/CodeGen/AMDGPU/store-private.ll b/test/CodeGen/AMDGPU/store-private.ll
index 33d27f24e9cf640210ef6713d091f6d7fd5938af..ab73ada370ea0a6700150aef860dcff5939eea1f 100644
--- a/test/CodeGen/AMDGPU/store-private.ll
+++ b/test/CodeGen/AMDGPU/store-private.ll
@@ -15,7 +15,7 @@
 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
 
 ; SI: buffer_store_byte
-define void @store_i1(i1 addrspace(0)* %out) {
+define amdgpu_kernel void @store_i1(i1 addrspace(0)* %out) {
 entry:
   store i1 true, i1 addrspace(0)* %out
   ret void
@@ -44,7 +44,7 @@ entry:
 
 ; SI: buffer_store_byte
 
-define void @store_i8(i8 addrspace(0)* %out, i8 %in) {
+define amdgpu_kernel void @store_i8(i8 addrspace(0)* %out, i8 %in) {
 entry:
   store i8 %in, i8 addrspace(0)* %out
   ret void
@@ -72,7 +72,7 @@ entry:
 ; EG: MOV * T(0 + AR.x).X+, [[RES]]
 
 ; SI: buffer_store_short
-define void @store_i16(i16 addrspace(0)* %out, i16 %in) {
+define amdgpu_kernel void @store_i16(i16 addrspace(0)* %out, i16 %in) {
 entry:
   store i16 %in, i16 addrspace(0)* %out
   ret void
@@ -102,7 +102,7 @@ entry:
 ; CM: MOV {{[\* ]*}}{{T[0-9]+\.[XYZW]}}, T(0 + AR.x).X+,
 ; CM: MOVA_INT
 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
-define void @store_i24(i24 addrspace(0)* %out, i24 %in) {
+define amdgpu_kernel void @store_i24(i24 addrspace(0)* %out, i24 %in) {
 entry:
   store i24 %in, i24 addrspace(0)* %out
   ret void
@@ -120,7 +120,7 @@ entry:
 ; CM: MOVA_INT
 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
 ; CM-NOT: MOVA_INT
-define void @store_i25(i25 addrspace(0)* %out, i25 %in) {
+define amdgpu_kernel void @store_i25(i25 addrspace(0)* %out, i25 %in) {
 entry:
   store i25 %in, i25 addrspace(0)* %out
   ret void
@@ -141,7 +141,7 @@ entry:
 ; CM-NOT: MOVA_INT
 
 ; SI: buffer_store_short
-define void @store_v2i8(<2 x i8> addrspace(0)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i8(<2 x i8> addrspace(0)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i8>
   store <2 x i8> %0, <2 x i8> addrspace(0)* %out
@@ -172,7 +172,7 @@ entry:
 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
 
 ; SI: buffer_store_byte
-define void @store_v2i8_unaligned(<2 x i8> addrspace(0)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i8_unaligned(<2 x i8> addrspace(0)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i8>
   store <2 x i8> %0, <2 x i8> addrspace(0)* %out, align 1
@@ -191,7 +191,7 @@ entry:
 ; CM-NOT: MOVA_INT
 
 ; SI: buffer_store_dword
-define void @store_v2i16(<2 x i16> addrspace(0)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i16(<2 x i16> addrspace(0)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i16>
   store <2 x i16> %0, <2 x i16> addrspace(0)* %out
@@ -223,7 +223,7 @@ entry:
 
 ; SI: buffer_store_short
 ; SI: buffer_store_short
-define void @store_v2i16_unaligned(<2 x i16> addrspace(0)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @store_v2i16_unaligned(<2 x i16> addrspace(0)* %out, <2 x i32> %in) {
 entry:
   %0 = trunc <2 x i32> %in to <2 x i16>
   store <2 x i16> %0, <2 x i16> addrspace(0)* %out, align 2
@@ -240,7 +240,7 @@ entry:
 ; CM-NOT: MOVA_INT
 
 ; SI: buffer_store_dword
-define void @store_v4i8(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
   store <4 x i8> %0, <4 x i8> addrspace(0)* %out
@@ -299,7 +299,7 @@ entry:
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
 ; SI-NOT: buffer_store_dword
-define void @store_v4i8_unaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8_unaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
   store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 1
@@ -410,7 +410,7 @@ entry:
 ; SI: buffer_store_byte
 ; SI: buffer_store_byte
 ; SI-NOT: buffer_store_dword
-define void @store_v8i8_unaligned(<8 x i8> addrspace(0)* %out, <8 x i32> %in) {
+define amdgpu_kernel void @store_v8i8_unaligned(<8 x i8> addrspace(0)* %out, <8 x i32> %in) {
 entry:
   %0 = trunc <8 x i32> %in to <8 x i8>
   store <8 x i8> %0, <8 x i8> addrspace(0)* %out, align 1
@@ -443,7 +443,7 @@ entry:
 ; SI: buffer_store_short
 ; SI: buffer_store_short
 ; SI-NOT: buffer_store_dword
-define void @store_v4i8_halfaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i8_halfaligned(<4 x i8> addrspace(0)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i8>
   store <4 x i8> %0, <4 x i8> addrspace(0)* %out, align 2
@@ -460,7 +460,7 @@ entry:
 
 ; SI: buffer_store_dword
 
-define void @store_f32(float addrspace(0)* %out, float %in) {
+define amdgpu_kernel void @store_f32(float addrspace(0)* %out, float %in) {
   store float %in, float addrspace(0)* %out
   ret void
 }
@@ -480,7 +480,7 @@ define void @store_f32(float addrspace(0)* %out, float %in) {
 ; XSI: buffer_store_dwordx2
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
-define void @store_v4i16(<4 x i16> addrspace(0)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i16(<4 x i16> addrspace(0)* %out, <4 x i32> %in) {
 entry:
   %0 = trunc <4 x i32> %in to <4 x i16>
   store <4 x i16> %0, <4 x i16> addrspace(0)* %out
@@ -504,7 +504,7 @@ entry:
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 
-define void @store_v2f32(<2 x float> addrspace(0)* %out, float %a, float %b) {
+define amdgpu_kernel void @store_v2f32(<2 x float> addrspace(0)* %out, float %a, float %b) {
 entry:
   %0 = insertelement <2 x float> <float 0.0, float 0.0>, float %a, i32 0
   %1 = insertelement <2 x float> %0, float %b, i32 1
@@ -533,7 +533,7 @@ entry:
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 
-define void @store_v3i32(<3 x i32> addrspace(0)* %out, <3 x i32> %a) nounwind {
+define amdgpu_kernel void @store_v3i32(<3 x i32> addrspace(0)* %out, <3 x i32> %a) nounwind {
   store <3 x i32> %a, <3 x i32> addrspace(0)* %out, align 16
   ret void
 }
@@ -563,7 +563,7 @@ define void @store_v3i32(<3 x i32> addrspace(0)* %out, <3 x i32> %a) nounwind {
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
-define void @store_v4i32(<4 x i32> addrspace(0)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i32(<4 x i32> addrspace(0)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(0)* %out
   ret void
@@ -594,7 +594,7 @@ entry:
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
-define void @store_v4i32_unaligned(<4 x i32> addrspace(0)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @store_v4i32_unaligned(<4 x i32> addrspace(0)* %out, <4 x i32> %in) {
 entry:
   store <4 x i32> %in, <4 x i32> addrspace(0)* %out, align 4
   ret void
@@ -626,7 +626,7 @@ entry:
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
-define void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0)* %in) {
+define amdgpu_kernel void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0)* %in) {
   %1 = load <4 x float>, <4 x float> addrspace(0) * %in
   store <4 x float> %1, <4 x float> addrspace(0)* %out
   ret void
@@ -644,7 +644,7 @@ define void @store_v4f32(<4 x float> addrspace(0)* %out, <4 x float> addrspace(0
 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
 
 ; SI: buffer_store_byte
-define void @store_i64_i8(i8 addrspace(0)* %out, i64 %in) {
+define amdgpu_kernel void @store_i64_i8(i8 addrspace(0)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i8
   store i8 %0, i8 addrspace(0)* %out
@@ -663,7 +663,7 @@ entry:
 ; CM: MOV {{[\* ]*}}T(0 + AR.x).X+,
 
 ; SI: buffer_store_short
-define void @store_i64_i16(i16 addrspace(0)* %out, i64 %in) {
+define amdgpu_kernel void @store_i64_i16(i16 addrspace(0)* %out, i64 %in) {
 entry:
   %0 = trunc i64 %in to i16
   store i16 %0, i16 addrspace(0)* %out
@@ -689,7 +689,7 @@ entry:
 ; XSI: buffer_store_dwordx2
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
-define void @vecload2(i32 addrspace(0)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
+define amdgpu_kernel void @vecload2(i32 addrspace(0)* nocapture %out, i32 addrspace(2)* nocapture %mem) #0 {
 entry:
   %0 = load i32, i32 addrspace(2)* %mem, align 4
   %arrayidx1.i = getelementptr inbounds i32, i32 addrspace(2)* %mem, i64 1
@@ -727,7 +727,7 @@ entry:
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
 ; SI: buffer_store_dword
-define void @i128-const-store(i32 addrspace(0)* %out) {
+define amdgpu_kernel void @i128-const-store(i32 addrspace(0)* %out) {
 entry:
   store i32 1, i32 addrspace(0)* %out, align 4
   %arrayidx2 = getelementptr inbounds i32, i32 addrspace(0)* %out, i64 1
diff --git a/test/CodeGen/AMDGPU/store-v3i64.ll b/test/CodeGen/AMDGPU/store-v3i64.ll
index 78db2d37724bed8adb24d9a47aae7d811f2167ac..7518e887135c885177ad807730e134a7d0939d49 100644
--- a/test/CodeGen/AMDGPU/store-v3i64.ll
+++ b/test/CodeGen/AMDGPU/store-v3i64.ll
@@ -5,7 +5,7 @@
 ; GCN-LABEL: {{^}}global_store_v3i64:
 ; GCN-DAG: buffer_store_dwordx2 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0 offset:16
 ; GCN-DAG: buffer_store_dwordx4 v{{\[[0-9]+:[0-9]+\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
   store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 32
   ret void
 }
@@ -40,7 +40,7 @@ define void @global_store_v3i64(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
 ; GCN: buffer_store_byte
-define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64> %x) {
   store <3 x i64> %x, <3 x i64> addrspace(1)* %out, align 1
   ret void
 }
@@ -48,7 +48,7 @@ define void @global_store_v3i64_unaligned(<3 x i64> addrspace(1)* %out, <3 x i64
 ; GCN-LABEL: {{^}}local_store_v3i64:
 ; GCN: ds_write2_b64
 ; GCN: ds_write_b64
-define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
   store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 32
   ret void
 }
@@ -83,7 +83,7 @@ define void @local_store_v3i64(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
 ; GCN: ds_write_b8
 ; GCN: ds_write_b8
 ; GCN: ds_write_b8
-define void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64> %x) {
   store <3 x i64> %x, <3 x i64> addrspace(3)* %out, align 1
   ret void
 }
@@ -91,7 +91,7 @@ define void @local_store_v3i64_unaligned(<3 x i64> addrspace(3)* %out, <3 x i64>
 ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i32:
 ; GCN-DAG: buffer_store_dwordx2
 ; GCN-DAG: buffer_store_dword v
-define void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x i64> %x) {
   %trunc = trunc <3 x i64> %x to <3 x i32>
   store <3 x i32> %trunc, <3 x i32> addrspace(1)* %out
   ret void
@@ -100,7 +100,7 @@ define void @global_truncstore_v3i64_to_v3i32(<3 x i32> addrspace(1)* %out, <3 x
 ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i16:
 ; GCN-DAG: buffer_store_short
 ; GCN-DAG: buffer_store_dword v
-define void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x i64> %x) {
   %trunc = trunc <3 x i64> %x to <3 x i16>
   store <3 x i16> %trunc, <3 x i16> addrspace(1)* %out
   ret void
@@ -110,7 +110,7 @@ define void @global_truncstore_v3i64_to_v3i16(<3 x i16> addrspace(1)* %out, <3 x
 ; GCN-LABEL: {{^}}global_truncstore_v3i64_to_v3i8:
 ; GCN-DAG: buffer_store_short
 ; GCN-DAG: buffer_store_byte v
-define void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)* %out, <3 x i64> %x) {
   %trunc = trunc <3 x i64> %x to <3 x i8>
   store <3 x i8> %trunc, <3 x i8> addrspace(1)* %out
   ret void
@@ -120,7 +120,7 @@ define void @global_truncstore_v3i64_to_v3i8(<3 x i8> addrspace(1)* %out, <3 x i
 ; GCN-DAG: buffer_store_byte v
 ; GCN-DAG: buffer_store_byte v
 ; GCN-DAG: buffer_store_byte v
-define void @global_truncstore_v3i64_to_v3i1(<3 x i1> addrspace(1)* %out, <3 x i64> %x) {
+define amdgpu_kernel void @global_truncstore_v3i64_to_v3i1(<3 x i1> addrspace(1)* %out, <3 x i64> %x) {
   %trunc = trunc <3 x i64> %x to <3 x i1>
   store <3 x i1> %trunc, <3 x i1> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/store-vector-ptrs.ll b/test/CodeGen/AMDGPU/store-vector-ptrs.ll
index d5af3b29118a51042eb4539fab5f1ea0768b959b..507f07dee0524a1063730298a0a2af346d0b6215 100644
--- a/test/CodeGen/AMDGPU/store-vector-ptrs.ll
+++ b/test/CodeGen/AMDGPU/store-vector-ptrs.ll
@@ -5,7 +5,7 @@
 ; AMDGPUDAGToDAGISel::SelectMUBUFScratch() which is used for selecting
 ; scratch loads and stores.
 ; CHECK-LABEL: {{^}}store_vector_ptrs:
-define void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind {
+define amdgpu_kernel void @store_vector_ptrs(<4 x i32*>* %out, <4 x [1024 x i32]*> %array) nounwind {
   %p = getelementptr [1024 x i32], <4 x [1024 x i32]*> %array, <4 x i16> zeroinitializer, <4 x i16> <i16 16, i16 16, i16 16, i16 16>
   store <4 x i32*> %p, <4 x i32*>* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/store_typed.ll b/test/CodeGen/AMDGPU/store_typed.ll
index 515fcf04f4067ed58216bf4dc734a1972f0b35fa..eaa21617f9373e21d285f2304f67835f5da64ae7 100644
--- a/test/CodeGen/AMDGPU/store_typed.ll
+++ b/test/CodeGen/AMDGPU/store_typed.ll
@@ -6,7 +6,7 @@
 ; EG: MEM_RAT STORE_TYPED RAT(0) {{T[0-9]+, T[0-9]+}}, 1
 ; CM: MEM_RAT STORE_TYPED RAT(0) {{T[0-9]+, T[0-9]+}}
 
-define void @store_typed_rat0(<4 x i32> %data, <4 x i32> %index) {
+define amdgpu_kernel void @store_typed_rat0(<4 x i32> %data, <4 x i32> %index) {
   call void @llvm.r600.rat.store.typed(<4 x i32> %data, <4 x i32> %index, i32 0)
   ret void
 }
@@ -16,7 +16,7 @@ define void @store_typed_rat0(<4 x i32> %data, <4 x i32> %index) {
 ; EG: MEM_RAT STORE_TYPED RAT(11) {{T[0-9]+, T[0-9]+}}, 1
 ; CM: MEM_RAT STORE_TYPED RAT(11) {{T[0-9]+, T[0-9]+}}
 
-define void @store_typed_rat11(<4 x i32> %data, <4 x i32> %index) {
+define amdgpu_kernel void @store_typed_rat11(<4 x i32> %data, <4 x i32> %index) {
   call void @llvm.r600.rat.store.typed(<4 x i32> %data, <4 x i32> %index, i32 11)
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/structurize.ll b/test/CodeGen/AMDGPU/structurize.ll
index 174e64e2cf8b4d889af1ed79d59535c9a1d87fbe..3cceb2d45c93e79431b5066eaa800b0bb09c7a32 100644
--- a/test/CodeGen/AMDGPU/structurize.ll
+++ b/test/CodeGen/AMDGPU/structurize.ll
@@ -45,7 +45,7 @@
 ; CHECK: CF_END
 
 
-define void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @branch_into_diamond(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) {
 entry:
 %0 = icmp ne i32 %a, 0
   br i1 %0, label %diamond_head, label %branch_from
diff --git a/test/CodeGen/AMDGPU/structurize1.ll b/test/CodeGen/AMDGPU/structurize1.ll
index db0f50247e3883f3eacd2644e8642f6b076161ad..2e7d0e615e076de49071d85a65f1bc776c4e692f 100644
--- a/test/CodeGen/AMDGPU/structurize1.ll
+++ b/test/CodeGen/AMDGPU/structurize1.ll
@@ -19,7 +19,7 @@
 ; CHECK-LABEL: {{^}}if_inside_loop:
 ; CHECK: LOOP_START_DX10
 ; CHECK: END_LOOP
-define void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
+define amdgpu_kernel void @if_inside_loop(i32 addrspace(1)* %out, i32 %a, i32 %b, i32 %c, i32 %d) {
 entry:
   br label %for.body
 
diff --git a/test/CodeGen/AMDGPU/sub.i16.ll b/test/CodeGen/AMDGPU/sub.i16.ll
index b2797ceecf3d3432f24547fe24e6512d332f21d6..ada72140563392cdff5e4d4a78d313b95dfca957 100644
--- a/test/CodeGen/AMDGPU/sub.i16.ll
+++ b/test/CodeGen/AMDGPU/sub.i16.ll
@@ -7,7 +7,7 @@
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -24,7 +24,7 @@ define void @v_test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in0, i16
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0xffffff85, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -39,7 +39,7 @@ define void @v_test_sub_i16_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_add_u16_e32 [[ADD:v[0-9]+]], 0x34d, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -54,7 +54,7 @@ define void @v_test_sub_i16_neg_constant(i16 addrspace(1)* %out, i16 addrspace(1
 ; VI: flat_load_ushort [[A:v[0-9]+]]
 ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], 63, [[A]]
 ; VI-NEXT: buffer_store_short [[ADD]]
-define void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -70,7 +70,7 @@ define void @v_test_sub_i16_inline_63(i16 addrspace(1)* %out, i16 addrspace(1)*
 ; VI: flat_load_ushort [[B:v[0-9]+]]
 ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: buffer_store_dword [[ADD]]
-define void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -90,7 +90,7 @@ define void @v_test_sub_i16_zext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
 ; VI-DAG: v_subrev_u16_e32 v[[ADD:[0-9]+]], [[B]], [[A]]
 ; VI-DAG: v_mov_b32_e32 v[[VZERO:[0-9]+]], 0
 ; VI: buffer_store_dwordx2 v{{\[}}[[ADD]]:[[VZERO]]{{\]}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0{{$}}
-define void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -110,7 +110,7 @@ define void @v_test_sub_i16_zext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
 ; VI: v_subrev_u16_e32 [[ADD:v[0-9]+]], [[B]], [[A]]
 ; VI-NEXT: v_bfe_i32 [[SEXT:v[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: buffer_store_dword [[SEXT]]
-define void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i32, i32 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -131,7 +131,7 @@ define void @v_test_sub_i16_sext_to_i32(i32 addrspace(1)* %out, i16 addrspace(1)
 ; VI-NEXT: v_bfe_i32 v[[LO:[0-9]+]], [[ADD]], 0, 16
 ; VI-NEXT: v_ashrrev_i32_e32 v[[HI:[0-9]+]], 31, v[[LO]]
 ; VI-NEXT: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
+define amdgpu_kernel void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)* %in0, i16 addrspace(1)* %in1) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep.out = getelementptr inbounds i64, i64 addrspace(1)* %out, i32 %tid
   %gep.in0 = getelementptr inbounds i16, i16 addrspace(1)* %in0, i32 %tid
@@ -149,7 +149,7 @@ define void @v_test_sub_i16_sext_to_i64(i64 addrspace(1)* %out, i16 addrspace(1)
 ; GCN-LABEL: {{^}}v_test_sub_i16_constant_commute:
 ; VI: v_subrev_u16_e32 v{{[0-9]+}}, 0x800, v{{[0-9]+}}
 ; CI: v_subrev_i32_e32 v{{[0-9]+}}, vcc, 0x800, v{{[0-9]+}}
-define void @v_test_sub_i16_constant_commute(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
+define amdgpu_kernel void @v_test_sub_i16_constant_commute(i16 addrspace(1)* %out, i16 addrspace(1)* %in0) #1 {
   %size = call i32 @llvm.amdgcn.groupstaticsize()
   %size.trunc = trunc i32 %size to i16
   call void asm sideeffect "; $0", "v"([512 x i32] addrspace(3)* @lds)
diff --git a/test/CodeGen/AMDGPU/sub.ll b/test/CodeGen/AMDGPU/sub.ll
index 5816345098afa5669fadd6477bbb64d4d6a38e34..f366029fdea27bae180dbd6ab8e226e7d4e953ff 100644
--- a/test/CodeGen/AMDGPU/sub.ll
+++ b/test/CodeGen/AMDGPU/sub.ll
@@ -8,7 +8,7 @@ declare i32 @llvm.r600.read.tidig.x() readnone
 ; EG: SUB_INT {{\** *}}T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
 ; SI: v_subrev_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
-define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -25,7 +25,7 @@ define void @test_sub_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 
-define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
@@ -45,7 +45,7 @@ define void @test_sub_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)
 ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 ; SI: v_sub_i32_e32 v{{[0-9]+, vcc, v[0-9]+, v[0-9]+}}
 
-define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
@@ -55,7 +55,7 @@ define void @test_sub_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)
 }
 
 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
-define void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
     %b_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
     %a = load i16, i16 addrspace(1)* %in 
     %b = load i16, i16 addrspace(1)* %b_ptr
@@ -69,7 +69,7 @@ define void @test_sub_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
     %b_ptr = getelementptr <2 x i16>, <2 x i16> addrspace(1)* %in, i16 1
     %a = load <2 x i16>, <2 x i16> addrspace(1) * %in 
     %b = load <2 x i16>, <2 x i16> addrspace(1) * %b_ptr
@@ -85,7 +85,7 @@ define void @test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)
 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; VI: v_sub_i16_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
     %b_ptr = getelementptr <4 x i16>, <4 x i16> addrspace(1)* %in, i16 1
     %a = load <4 x i16>, <4 x i16> addrspace(1) * %in 
     %b = load <4 x i16>, <4 x i16> addrspace(1) * %b_ptr
@@ -103,7 +103,7 @@ define void @test_sub_v4i16(<4 x i16> addrspace(1)* %out, <4 x i16> addrspace(1)
 ; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
 ; EG-DAG: SUB_INT {{[* ]*}}
-define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind {
+define amdgpu_kernel void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind {
   %result = sub i64 %a, %b
   store i64 %result, i64 addrspace(1)* %out, align 8
   ret void
@@ -118,7 +118,7 @@ define void @s_sub_i64(i64 addrspace(1)* noalias %out, i64 %a, i64 %b) nounwind
 ; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
 ; EG-DAG: SUB_INT {{[* ]*}}
-define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind {
+define amdgpu_kernel void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %inA, i64 addrspace(1)* noalias %inB) nounwind {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
   %a_ptr = getelementptr i64, i64 addrspace(1)* %inA, i32 %tid
   %b_ptr = getelementptr i64, i64 addrspace(1)* %inB, i32 %tid
@@ -134,7 +134,7 @@ define void @v_sub_i64(i64 addrspace(1)* noalias %out, i64 addrspace(1)* noalias
 ; SI: v_subb_u32_e32
 ; SI: v_sub_i32_e32
 ; SI: v_subb_u32_e32
-define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* noalias %inA, <2 x i64> addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
   %a_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inA, i32 %tid
   %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %inB, i32 %tid
@@ -154,7 +154,7 @@ define void @v_test_sub_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(
 ; SI: v_subb_u32_e32
 ; SI: v_subrev_i32_e32
 ; SI: v_subb_u32_e32
-define void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
+define amdgpu_kernel void @v_test_sub_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* noalias %inA, <4 x i64> addrspace(1)* noalias %inB) {
   %tid = call i32 @llvm.r600.read.tidig.x() readnone
   %a_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inA, i32 %tid
   %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %inB, i32 %tid
diff --git a/test/CodeGen/AMDGPU/sub.v2i16.ll b/test/CodeGen/AMDGPU/sub.v2i16.ll
new file mode 100644
index 0000000000000000000000000000000000000000..69f0accef6282605e0542fe99f7165a603a89e02
--- /dev/null
+++ b/test/CodeGen/AMDGPU/sub.v2i16.ll
@@ -0,0 +1,278 @@
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=GCN %s
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16:
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; VI: v_subrev_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI: v_subrev_u16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %add = sub <2 x i16> %a, %b
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_sub_v2i16:
+; GFX9: s_load_dword [[VAL0:s[0-9]+]]
+; GFX9: s_load_dword [[VAL1:s[0-9]+]]
+; GFX9: v_mov_b32_e32 [[VVAL1:v[0-9]+]]
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[VVAL1]], [[VAL0]]
+
+; VI: s_sub_i32
+; VI: s_sub_i32
+define amdgpu_kernel void @s_test_sub_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0, <2 x i16> addrspace(2)* %in1) #1 {
+  %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
+  %b = load <2 x i16>, <2 x i16> addrspace(2)* %in1
+  %add = sub <2 x i16> %a, %b
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}s_test_sub_self_v2i16:
+; GCN: v_mov_b32_e32 [[ZERO:v[0-9]+]]
+; GCN: buffer_store_dword [[ZERO]]
+define amdgpu_kernel void @s_test_sub_self_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(2)* %in0) #1 {
+  %a = load <2 x i16>, <2 x i16> addrspace(2)* %in0
+  %add = sub <2 x i16> %a, %a
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: VI should not scalarize arg access.
+; GCN-LABEL: {{^}}s_test_sub_v2i16_kernarg:
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
+
+; VI: v_subrev_i32_e32
+; VI: v_subrev_i32_sdwa v{{[0-9]+}}, vcc, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+define amdgpu_kernel void @s_test_sub_v2i16_kernarg(<2 x i16> addrspace(1)* %out, <2 x i16> %a, <2 x i16> %b) #1 {
+  %add = sub <2 x i16> %a, %b
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_sub_v2i16_constant:
+; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0x1c8007b{{$}}
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
+
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xfffffe38, v{{[0-9]+}}
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0xffffff85, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_sub_v2i16_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %add = sub <2 x i16> %a, <i16 123, i16 456>
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16_neg_constant:
+; GFX9: s_mov_b32 [[CONST:s[0-9]+]], 0xfc21fcb3{{$}}
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[CONST]], v{{[0-9]+}}
+
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x3df, v{{[0-9]+}}
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 0x34d, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_sub_v2i16_neg_constant(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %add = sub <2 x i16> %a, <i16 -845, i16 -991>
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_neg1:
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, v{{[0-9]+}}, -1{{$}}
+
+; VI: flat_load_ushort [[LOAD0:v[0-9]+]]
+; VI: flat_load_ushort [[LOAD1:v[0-9]+]]
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD0]]
+; VI-DAG: v_add_u16_e32 v{{[0-9]+}}, 1, [[LOAD1]]
+; VI-DAG: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_e32
+define amdgpu_kernel void @v_test_sub_v2i16_inline_neg1(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %add = sub <2 x i16> %a, <i16 -1, i16 -1>
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_lo_zero_hi:
+; GFX9: s_mov_b32 [[K:s[0-9]+]], 32{{$}}
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
+
+; VI-NOT: v_subrev_i16
+; VI: v_add_u16_e32 v{{[0-9]+}}, 0xffffffe0, v{{[0-9]+}}
+; VI-NOT: v_subrev_i16
+; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_e32
+define amdgpu_kernel void @v_test_sub_v2i16_inline_lo_zero_hi(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %add = sub <2 x i16> %a, <i16 32, i16 0>
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; The high element (16256 = 0x3f80) makes the packed 32-bit constant 0x3f800000, the bit pattern of fp 1.0.
+; GCN-LABEL: {{^}}v_test_sub_v2i16_inline_fp_split:
+; GFX9: s_mov_b32 [[K:s[0-9]+]], 1.0
+; GFX9: v_pk_sub_i16 v{{[0-9]+}}, [[K]], v{{[0-9]+}}{{$}}
+
+; VI-NOT: v_subrev_i16
+; VI: v_add_u16_e32 v{{[0-9]+}}, 0xffffc080, v{{[0-9]+}}
+; VI-NOT: v_subrev_i16
+; VI: v_lshlrev_b32_e32 v{{[0-9]+}}, 16,
+; VI: v_or_b32_e32
+define amdgpu_kernel void @v_test_sub_v2i16_inline_fp_split(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %add = sub <2 x i16> %a, <i16 0, i16 16256>
+  store <2 x i16> %add, <2 x i16> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16_zext_to_v2i32:
+; GFX9: flat_load_dword [[A:v[0-9]+]]
+; GFX9: flat_load_dword [[B:v[0-9]+]]
+
+; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
+; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
+; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
+
+; VI: flat_load_ushort v[[A_HI:[0-9]+]]
+; VI: flat_load_ushort v[[A_LO:[0-9]+]]
+; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+; VI: flat_load_ushort v[[B_LO:[0-9]+]]
+
+; VI: v_subrev_u16_e32 v[[ADD_HI:[0-9]+]], v[[B_HI]], v[[A_HI]]
+; VI-NOT: and
+; VI-NOT: shl
+; VI: v_subrev_u16_e32 v[[ADD_LO:[0-9]+]], v[[B_LO]], v[[A_LO]]
+; VI-NOT: and
+; VI-NOT: shl
+; VI: buffer_store_dwordx2 v{{\[}}[[ADD_LO]]:[[ADD_HI]]{{\]}}
+define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %add = sub <2 x i16> %a, %b
+  %ext = zext <2 x i16> %add to <2 x i32>
+  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16_zext_to_v2i64:
+; GFX9: flat_load_dword [[A:v[0-9]+]]
+; GFX9: flat_load_dword [[B:v[0-9]+]]
+
+; GFX9: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GFX9-DAG: v_and_b32_e32 v[[ELT0:[0-9]+]], 0xffff, [[ADD]]
+; GFX9-DAG: v_lshrrev_b32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
+; GFX9: buffer_store_dwordx4
+
+; VI: flat_load_ushort v[[A_LO:[0-9]+]]
+; VI: flat_load_ushort v[[A_HI:[0-9]+]]
+; VI: flat_load_ushort v[[B_LO:[0-9]+]]
+; VI: flat_load_ushort v[[B_HI:[0-9]+]]
+
+; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; VI-DAG: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}}
+; VI-DAG: v_subrev_u16_e32
+; VI-DAG: v_subrev_u16_e32
+
+; VI: buffer_store_dwordx4
+define amdgpu_kernel void @v_test_sub_v2i16_zext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %add = sub <2 x i16> %a, %b
+  %ext = zext <2 x i16> %add to <2 x i64>
+  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16_sext_to_v2i32:
+; GFX9: flat_load_dword [[A:v[0-9]+]]
+; GFX9: flat_load_dword [[B:v[0-9]+]]
+
+; GFX9: v_pk_sub_i16 [[ADD:v[0-9]+]], [[A]], [[B]]
+; GFX9-DAG: v_bfe_i32 v[[ELT0:[0-9]+]], [[ADD]], 0, 16
+; GFX9-DAG: v_ashrrev_i32_e32 v[[ELT1:[0-9]+]], 16, [[ADD]]
+; GFX9: buffer_store_dwordx2 v{{\[}}[[ELT0]]:[[ELT1]]{{\]}}
+
+; VI: v_subrev_u16_e32
+; VI: v_subrev_u16_e32
+; VI: buffer_store_dwordx2
+define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i32(<2 x i32> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+  %a = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %b = load volatile <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %add = sub <2 x i16> %a, %b
+  %ext = sext <2 x i16> %add to <2 x i32>
+  store <2 x i32> %ext, <2 x i32> addrspace(1)* %out
+  ret void
+}
+
+; FIXME: Need to handle non-uniform case for function below (load without gep).
+; GCN-LABEL: {{^}}v_test_sub_v2i16_sext_to_v2i64:
+; GCN: flat_load_dword
+; GCN: flat_load_dword
+
+; GFX9: v_pk_sub_i16
+; GFX9: v_lshrrev_b32_e32 v{{[0-9]+}}, 16, v{{[0-9]+}}
+
+; VI: v_subrev_u16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI: v_subrev_u16_e32
+
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; GCN: v_bfe_i32 v{{[0-9]+}}, v{{[0-9]+}}, 0, 16
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+; GCN: v_ashrrev_i32_e32 v{{[0-9]+}}, 31, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_sub_v2i16_sext_to_v2i64(<2 x i64> addrspace(1)* %out, <2 x i16> addrspace(1)* %in0, <2 x i16> addrspace(1)* %in1) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep.out = getelementptr inbounds <2 x i64>, <2 x i64> addrspace(1)* %out, i32 %tid
+  %gep.in0 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in0, i32 %tid
+  %gep.in1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %in1, i32 %tid
+  %a = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in0
+  %b = load <2 x i16>, <2 x i16> addrspace(1)* %gep.in1
+  %add = sub <2 x i16> %a, %b
+  %ext = sext <2 x i16> %add to <2 x i64>
+  store <2 x i64> %ext, <2 x i64> addrspace(1)* %out
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
+attributes #1 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll b/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
index ec2ed78b495458474fee45331d691d4258b158cd..c2d04abf829f26ee8db5fd153dfd2d578882d29c 100644
--- a/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
+++ b/test/CodeGen/AMDGPU/subreg-coalescer-crash.ll
@@ -1,39 +1,37 @@
-; RUN: llc -march=amdgcn -mcpu=SI -verify-machineinstrs -o - %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s
+; RUN: llc -march=amdgcn -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s
 
-; SI-LABEL:{{^}}row_filter_C1_D0:
-; SI: s_endpgm
-; Function Attrs: nounwind
-define void @row_filter_C1_D0() {
+; GCN-LABEL:{{^}}row_filter_C1_D0:
+define amdgpu_kernel void @row_filter_C1_D0() #0 {
 entry:
   br i1 undef, label %for.inc.1, label %do.body.preheader
 
 do.body.preheader:                                ; preds = %entry
-  %0 = insertelement <4 x i32> zeroinitializer, i32 undef, i32 1
+  %tmp = insertelement <4 x i32> zeroinitializer, i32 undef, i32 1
   br i1 undef, label %do.body56.1, label %do.body90
 
 do.body90:                                        ; preds = %do.body56.2, %do.body56.1, %do.body.preheader
-  %1 = phi <4 x i32> [ %6, %do.body56.2 ], [ %5, %do.body56.1 ], [ %0, %do.body.preheader ]
-  %2 = insertelement <4 x i32> %1, i32 undef, i32 2
-  %3 = insertelement <4 x i32> %2, i32 undef, i32 3
+  %tmp1 = phi <4 x i32> [ %tmp6, %do.body56.2 ], [ %tmp5, %do.body56.1 ], [ %tmp, %do.body.preheader ]
+  %tmp2 = insertelement <4 x i32> %tmp1, i32 undef, i32 2
+  %tmp3 = insertelement <4 x i32> %tmp2, i32 undef, i32 3
   br i1 undef, label %do.body124.1, label %do.body.1562.preheader
 
 do.body.1562.preheader:                           ; preds = %do.body124.1, %do.body90
-  %storemerge = phi <4 x i32> [ %3, %do.body90 ], [ %7, %do.body124.1 ]
-  %4 = insertelement <4 x i32> undef, i32 undef, i32 1
+  %storemerge = phi <4 x i32> [ %tmp3, %do.body90 ], [ %tmp7, %do.body124.1 ]
+  %tmp4 = insertelement <4 x i32> undef, i32 undef, i32 1
   br label %for.inc.1
 
 do.body56.1:                                      ; preds = %do.body.preheader
-  %5 = insertelement <4 x i32> %0, i32 undef, i32 1
+  %tmp5 = insertelement <4 x i32> %tmp, i32 undef, i32 1
   %or.cond472.1 = or i1 undef, undef
   br i1 %or.cond472.1, label %do.body56.2, label %do.body90
 
 do.body56.2:                                      ; preds = %do.body56.1
-  %6 = insertelement <4 x i32> %5, i32 undef, i32 1
+  %tmp6 = insertelement <4 x i32> %tmp5, i32 undef, i32 1
   br label %do.body90
 
 do.body124.1:                                     ; preds = %do.body90
-  %7 = insertelement <4 x i32> %3, i32 undef, i32 3
+  %tmp7 = insertelement <4 x i32> %tmp3, i32 undef, i32 3
   br label %do.body.1562.preheader
 
 for.inc.1:                                        ; preds = %do.body.1562.preheader, %entry
@@ -42,8 +40,8 @@ for.inc.1:                                        ; preds = %do.body.1562.prehea
   unreachable
 }
 
-; SI-LABEL: {{^}}foo:
-; SI: s_endpgm
+; GCN-LABEL: {{^}}foo:
+; GCN: s_endpgm
 define amdgpu_ps void @foo() #0 {
 bb:
   br i1 undef, label %bb2, label %bb1
@@ -67,7 +65,7 @@ bb7:                                              ; preds = %bb6
   br label %bb4
 
 bb9:                                              ; preds = %bb2
-  %tmp10 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp10 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp11 = extractelement <4 x float> %tmp10, i32 1
   %tmp12 = extractelement <4 x float> %tmp10, i32 3
   br label %bb14
@@ -78,9 +76,9 @@ bb13:                                             ; preds = %bb2
 bb14:                                             ; preds = %bb27, %bb24, %bb9
   %tmp15 = phi float [ %tmp12, %bb9 ], [ undef, %bb27 ], [ 0.000000e+00, %bb24 ]
   %tmp16 = phi float [ %tmp11, %bb9 ], [ undef, %bb27 ], [ %tmp25, %bb24 ]
-  %tmp17 = fmul float 10.5, %tmp16
-  %tmp18 = fmul float 11.5, %tmp15
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %tmp18, float %tmp17, float %tmp17, float %tmp17)
+  %tmp17 = fmul float 1.050000e+01, %tmp16
+  %tmp18 = fmul float 1.150000e+01, %tmp15
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %tmp18, float %tmp17, float %tmp17, float %tmp17, i1 true, i1 true) #0
   ret void
 
 bb23:                                             ; preds = %bb13
@@ -97,13 +95,9 @@ bb27:                                             ; preds = %bb24
   br label %bb14
 }
 
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
 
-; Function Attrs: nounwind readnone
-declare i32 @llvm.SI.packf16(float, float) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1
 
 attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
index 72a1f1e25b30b0a5ca99162dd859dd161a4f8a64..35615c40d498d16a87c9a97c0e408f9ffc388e1e 100644
--- a/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
+++ b/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll
@@ -20,7 +20,7 @@ target triple="amdgcn--"
 ; CHECK-NEXT: s_mov_b32 s6, -1
 ; CHECK-NEXT: buffer_store_dword v1, off, s[4:7], 0
 ; CHECK-NEXT: s_endpgm
-define void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind {
+define amdgpu_kernel void @foobar(float %a0, float %a1, float addrspace(1)* %out) nounwind {
 entry:
   %v0 = insertelement <4 x float> undef, float %a0, i32 0
   %tid = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0
diff --git a/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll b/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll
index 8bd995a8ecbbb44c6d9367724d4421320e1b1d57..57c267e54a1466d9be7871895ebdd6ba2bbee5f9 100644
--- a/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll
+++ b/test/CodeGen/AMDGPU/subreg-eliminate-dead.ll
@@ -5,7 +5,7 @@
 ; Just make sure this test doesn't crash.
 ; CHECK-LABEL: foobar:
 ; CHECK: s_endpgm
-define void @foobar() {
+define amdgpu_kernel void @foobar() {
   %v0 = icmp eq <4 x i32> undef, <i32 0, i32 1, i32 2, i32 3>
   %v3 = sext <4 x i1> %v0 to <4 x i32>
   %v4 = extractelement <4 x i32> %v3, i32 1
diff --git a/test/CodeGen/AMDGPU/subreg-intervals.mir b/test/CodeGen/AMDGPU/subreg-intervals.mir
index c4e00215159bc93eedbee6de44ea2c6f8316a807..c477fe9bc6d348bf4afe080816844a655cb9d286 100644
--- a/test/CodeGen/AMDGPU/subreg-intervals.mir
+++ b/test/CodeGen/AMDGPU/subreg-intervals.mir
@@ -10,8 +10,8 @@
 # CHECK-LABEL: Machine code for function test1:
 
 --- |
-  define void @test0() { ret void }
-  define void @test1() { ret void }
+  define amdgpu_kernel void @test0() { ret void }
+  define amdgpu_kernel void @test1() { ret void }
 ...
 ---
 name: test0
diff --git a/test/CodeGen/AMDGPU/subreg_interference.mir b/test/CodeGen/AMDGPU/subreg_interference.mir
new file mode 100644
index 0000000000000000000000000000000000000000..24d06a576c2a4c6bc1bb1e7a0d4d939e2aa60c69
--- /dev/null
+++ b/test/CodeGen/AMDGPU/subreg_interference.mir
@@ -0,0 +1,24 @@
+# RUN: llc -o - %s -mtriple=amdgcn--amdhsa -verify-machineinstrs -run-pass=greedy,virtregrewriter | FileCheck %s
+---
+# We should not detect any interference between the virtual registers %0 and
+# %1 here, and should only allocate sgpr0-sgpr3.
+#
+# CHECK-LABEL: func0
+# CHECK: S_NOP 0, implicit-def %sgpr0
+# CHECK: S_NOP 0, implicit-def %sgpr3
+# CHECK: S_NOP 0, implicit-def %sgpr1
+# CHECK: S_NOP 0, implicit-def %sgpr2
+# CHECK: S_NOP 0, implicit %sgpr0, implicit %sgpr3
+# CHECK: S_NOP 0, implicit %sgpr1, implicit %sgpr2
+name: func0
+body: |
+  bb.0:
+    S_NOP 0, implicit-def undef %0.sub0 : sreg_128
+    S_NOP 0, implicit-def %0.sub3
+    S_NOP 0, implicit-def undef %1.sub1 : sreg_128
+    S_NOP 0, implicit-def %1.sub2
+
+
+    S_NOP 0, implicit %0.sub0, implicit %0.sub3
+    S_NOP 0, implicit %1.sub1, implicit %1.sub2
+...
diff --git a/test/CodeGen/AMDGPU/target-cpu.ll b/test/CodeGen/AMDGPU/target-cpu.ll
index cf80ff3f4c831b68b2bc3120e7e304a34408d96b..466e89ebee806db2b357176a743bdfa1454b22dd 100644
--- a/test/CodeGen/AMDGPU/target-cpu.ll
+++ b/test/CodeGen/AMDGPU/target-cpu.ll
@@ -14,7 +14,7 @@ declare void @llvm.amdgcn.s.dcache.wb() #0
 ; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400
 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
-define void @target_none() #0 {
+define amdgpu_kernel void @target_none() #0 {
   %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
   %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
   %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
@@ -30,7 +30,7 @@ define void @target_none() #0 {
 ; CHECK: s_movk_i32 [[OFFSETREG:s[0-9]+]], 0x400
 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, [[OFFSETREG]]
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
-define void @target_tahiti() #1 {
+define amdgpu_kernel void @target_tahiti() #1 {
   %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
   %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
   %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
@@ -46,7 +46,7 @@ define void @target_tahiti() #1 {
 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x100
 ; CHECK: buffer_store_dword v{{[0-9]+}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64
 ; CHECK: s_dcache_inv_vol
-define void @target_bonaire() #3 {
+define amdgpu_kernel void @target_bonaire() #3 {
   %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
   %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
   %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
@@ -63,7 +63,7 @@ define void @target_bonaire() #3 {
 ; CHECK: s_load_dwordx2 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x400
 ; CHECK: flat_store_dword
 ; CHECK: s_dcache_wb{{$}}
-define void @target_fiji() #4 {
+define amdgpu_kernel void @target_fiji() #4 {
   %kernargs = call i8 addrspace(2)* @llvm.amdgcn.kernarg.segment.ptr()
   %kernargs.gep = getelementptr inbounds i8, i8 addrspace(2)* %kernargs, i64 1024
   %kernargs.gep.cast = bitcast i8 addrspace(2)* %kernargs.gep to i32 addrspace(1)* addrspace(2)*
@@ -79,7 +79,7 @@ define void @target_fiji() #4 {
 ; CHECK-LABEL: {{^}}promote_alloca_enabled:
 ; CHECK: ds_read_b32
 ; CHECK: ; LDSByteSize: 5120
-define void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
+define amdgpu_kernel void @promote_alloca_enabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #5 {
 entry:
   %stack = alloca [5 x i32], align 4
   %tmp = load i32, i32 addrspace(1)* %in, align 4
@@ -93,7 +93,7 @@ entry:
 ; CHECK: SCRATCH_RSRC_DWORD0
 ; CHECK: SCRATCH_RSRC_DWORD1
 ; CHECK: ScratchSize: 24
-define void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 {
+define amdgpu_kernel void @promote_alloca_disabled(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) #6 {
 entry:
   %stack = alloca [5 x i32], align 4
   %tmp = load i32, i32 addrspace(1)* %in, align 4
diff --git a/test/CodeGen/AMDGPU/trap.ll b/test/CodeGen/AMDGPU/trap.ll
index 4271a499c9c5d424481cd3cb3e650a06cb2ef590..77ad895d0e86a18b03b0e6a1f674f258e963d299 100644
--- a/test/CodeGen/AMDGPU/trap.ll
+++ b/test/CodeGen/AMDGPU/trap.ll
@@ -1,12 +1,81 @@
-; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=HSA-TRAP %s
+
+; RUN: llc -mtriple=amdgcn--amdhsa -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=HSA-TRAP %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-HSA-TRAP %s
+; RUN: llc -mtriple=amdgcn--amdhsa -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
+
+; enable trap handler feature
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=TRAP-BIT -check-prefix=MESA-TRAP %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=+trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=TRAP-BIT %s
+
+; disable trap handler feature
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=NO-MESA-TRAP -check-prefix=NO-TRAP-BIT -check-prefix=NOMESA-TRAP %s
+; RUN: llc -mtriple=amdgcn-unknown-mesa3d -mattr=-trap-handler -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING -check-prefix=NO-TRAP-BIT %s
+
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GCN -check-prefix=GCN-WARNING %s
 
 declare void @llvm.trap() #0
+declare void @llvm.debugtrap() #0
+
+; MESA-TRAP: .section .AMDGPU.config
+; MESA-TRAP:  .long   47180
+; MESA-TRAP-NEXT: .long   208
+
+; NOMESA-TRAP: .section .AMDGPU.config
+; NOMESA-TRAP:  .long   47180
+; NOMESA-TRAP-NEXT: .long   144
+
+; GCN-LABEL: {{^}}hsa_trap:
+; HSA-TRAP: enable_trap_handler = 1
+; HSA-TRAP: s_mov_b64 s[0:1], s[4:5]
+; HSA-TRAP: s_trap 2
+
+; For llvm.trap in the HSA path without ABI, directly generate an s_endpgm instruction without any warning.
+; NO-HSA-TRAP: enable_trap_handler = 0
+; NO-HSA-TRAP: s_endpgm
+; NO-HSA-TRAP: COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
+
+; TRAP-BIT: enable_trap_handler = 1
+; NO-TRAP-BIT: enable_trap_handler = 0
+; NO-MESA-TRAP: s_endpgm
+define amdgpu_kernel void @hsa_trap() {
+  call void @llvm.trap()
+  ret void
+}
+
+; MESA-TRAP: .section .AMDGPU.config
+; MESA-TRAP:  .long   47180
+; MESA-TRAP-NEXT: .long   208
+
+; NOMESA-TRAP: .section .AMDGPU.config
+; NOMESA-TRAP:  .long   47180
+; NOMESA-TRAP-NEXT: .long   144
+
+; GCN-WARNING: warning: <unknown>:0:0: in function hsa_debugtrap void (): debugtrap handler not supported
+; GCN-LABEL: {{^}}hsa_debugtrap:
+; HSA-TRAP: enable_trap_handler = 1
+; HSA-TRAP: s_mov_b64 s[0:1], s[4:5]
+; HSA-TRAP: s_trap 3
+
+; For llvm.debugtrap in the non-HSA path without ABI, generate a warning and an s_endpgm instruction.
+; NO-HSA-TRAP: enable_trap_handler = 0
+; NO-HSA-TRAP: s_endpgm
+
+; TRAP-BIT: enable_trap_handler = 1
+; NO-TRAP-BIT: enable_trap_handler = 0
+; NO-MESA-TRAP: s_endpgm
+define amdgpu_kernel void @hsa_debugtrap() {
+  call void @llvm.debugtrap()
+  ret void
+}
 
+; For non-HSA path
 ; GCN-LABEL: {{^}}trap:
-; GCN: v_mov_b32_e32 v0, 1
-; GCN: s_mov_b64 s[0:1], s[4:5]
-; GCN: s_trap 1
-define void @trap() {
+; TRAP-BIT: enable_trap_handler = 1
+; NO-TRAP-BIT: enable_trap_handler = 0
+; NO-HSA-TRAP: s_endpgm
+; NO-MESA-TRAP: s_endpgm
+define amdgpu_kernel void @trap() {
   call void @llvm.trap()
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
index a331475820a0ebd808d23a7597beaf12dac6ce8f..f90040385f7532a43b710209fa819a5011b85769 100644
--- a/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
+++ b/test/CodeGen/AMDGPU/trunc-bitcast-vector.ll
@@ -4,7 +4,7 @@
 ; CHECK-LABEL: {{^}}trunc_i64_bitcast_v2i32:
 ; CHECK: buffer_load_dword v
 ; CHECK: buffer_store_dword v
-define void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %ld = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %bc = bitcast <2 x i32> %ld to i64
   %trunc = trunc i64 %bc to i32
@@ -15,7 +15,7 @@ define void @trunc_i64_bitcast_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace
 ; CHECK-LABEL: {{^}}trunc_i96_bitcast_v3i32:
 ; CHECK: buffer_load_dword v
 ; CHECK: buffer_store_dword v
-define void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace(1)* %in) {
   %ld = load <3 x i32>, <3 x i32> addrspace(1)* %in
   %bc = bitcast <3 x i32> %ld to i96
   %trunc = trunc i96 %bc to i32
@@ -26,7 +26,7 @@ define void @trunc_i96_bitcast_v3i32(i32 addrspace(1)* %out, <3 x i32> addrspace
 ; CHECK-LABEL: {{^}}trunc_i128_bitcast_v4i32:
 ; CHECK: buffer_load_dword v
 ; CHECK: buffer_store_dword v
-define void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %ld = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %bc = bitcast <4 x i32> %ld to i128
   %trunc = trunc i128 %bc to i32
@@ -38,7 +38,7 @@ define void @trunc_i128_bitcast_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspac
 ; CHECK-LABEL: {{^}}trunc_i16_bitcast_v2i16:
 ; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
 ; CHECK: buffer_store_short [[VAL]]
-define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %in) {
   %ld = load <2 x i16>, <2 x i16> addrspace(1)* %in
   %bc = bitcast <2 x i16> %ld to i32
   %trunc = trunc i32 %bc to i16
@@ -54,7 +54,7 @@ define void @trunc_i16_bitcast_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace
 ; SI: buffer_load_dword v[[VAL:[0-9]+]]
 ; VI: buffer_load_dwordx2 v{{\[}}[[VAL:[0-9]+]]
 ; CHECK: buffer_store_short [[VAL]]
-define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace(1)* %in) {
   %ld = load <4 x i16>, <4 x i16> addrspace(1)* %in
   %bc = bitcast <4 x i16> %ld to i64
   %trunc = trunc i64 %bc to i16
@@ -66,7 +66,7 @@ define void @trunc_i16_bitcast_v4i16(i16 addrspace(1)* %out, <4 x i16> addrspace
 ; CHECK-LABEL: {{^}}trunc_i8_bitcast_v2i8:
 ; CHECK: buffer_load_ubyte [[VAL:v[0-9]+]]
 ; CHECK: buffer_store_byte [[VAL]]
-define void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)* %in) {
   %ld = load <2 x i8>, <2 x i8> addrspace(1)* %in
   %bc = bitcast <2 x i8> %ld to i16
   %trunc = trunc i16 %bc to i8
@@ -77,7 +77,7 @@ define void @trunc_i8_bitcast_v2i8(i8 addrspace(1)* %out, <2 x i8> addrspace(1)*
 ; CHECK-LABEL: {{^}}trunc_i32_bitcast_v4i8:
 ; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
 ; CHECK: buffer_store_byte [[VAL]]
-define void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %in) {
   %ld = load <4 x i8>, <4 x i8> addrspace(1)* %in
   %bc = bitcast <4 x i8> %ld to i32
   %trunc = trunc i32 %bc to i8
@@ -88,7 +88,7 @@ define void @trunc_i32_bitcast_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)
 ; CHECK-LABEL: {{^}}trunc_i24_bitcast_v3i8:
 ; CHECK: buffer_load_dword [[VAL:v[0-9]+]]
 ; CHECK: buffer_store_byte [[VAL]]
-define void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) {
+define amdgpu_kernel void @trunc_i24_bitcast_v3i8(i8 addrspace(1)* %out, <3 x i8> addrspace(1)* %in) {
   %ld = load <3 x i8>, <3 x i8> addrspace(1)* %in
   %bc = bitcast <3 x i8> %ld to i24
   %trunc = trunc i24 %bc to i8
diff --git a/test/CodeGen/AMDGPU/trunc-cmp-constant.ll b/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
index 7a4bced9d4360994415e20d3441583a8983c9d8f..cb8d3655033101719f22ac52515446f02e635ea7 100644
--- a/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
+++ b/test/CodeGen/AMDGPU/trunc-cmp-constant.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 ; SI: v_cmp_eq_u32_e32 vcc, 0, [[TMP]]{{$}}
 ; SI: v_cndmask_b32_e64
 ; SI: buffer_store_byte
-define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp eq i32 %ext, 0
@@ -25,7 +25,7 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspa
 ; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp eq i32 %ext, 0
@@ -36,7 +36,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_0(i1 addrspace(1)* %out, i1 addrspa
 ; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_eq_1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; SI: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp eq i32 %ext, 1
@@ -48,7 +48,7 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspa
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp eq i32 %ext, 1
@@ -60,7 +60,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_1(i1 addrspace(1)* %out, i1 addrspa
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[LOAD]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp eq i32 %ext, -1
@@ -71,7 +71,7 @@ define void @sextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addr
 ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_eq_neg1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 0{{$}}
 ; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp eq i32 %ext, -1
@@ -84,7 +84,7 @@ define void @zextload_i1_to_i32_trunc_cmp_eq_neg1(i1 addrspace(1)* %out, i1 addr
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp ne i32 %ext, 0
@@ -96,7 +96,7 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspa
 ; SI: buffer_load_ubyte [[LOAD:v[0-9]+]]
 ; SI: v_and_b32_e32 [[TMP:v[0-9]+]], 1, [[LOAD]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp ne i32 %ext, 0
@@ -107,7 +107,7 @@ define void @zextload_i1_to_i32_trunc_cmp_ne_0(i1 addrspace(1)* %out, i1 addrspa
 ; FUNC-LABEL: {{^}}sextload_i1_to_i32_trunc_cmp_ne_1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
 ; SI: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp ne i32 %ext, 1
@@ -122,7 +122,7 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspa
 ; SI-NEXT: s_xor_b64 [[NEG:s\[[0-9]+:[0-9]+\]]], vcc, -1
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[NEG]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp ne i32 %ext, 1
@@ -137,7 +137,7 @@ define void @zextload_i1_to_i32_trunc_cmp_ne_1(i1 addrspace(1)* %out, i1 addrspa
 ; XSI: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], [[TMP]], 0{{$}}
 ; XSI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, [[CMP0]]
 ; XSI-NEXT: buffer_store_byte [[RESULT]]
-define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = sext i1 %load to i32
   %cmp = icmp ne i32 %ext, -1
@@ -148,7 +148,7 @@ define void @sextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addr
 ; FUNC-LABEL: {{^}}zextload_i1_to_i32_trunc_cmp_ne_neg1:
 ; SI: v_mov_b32_e32 [[RESULT:v[0-9]+]], 1{{$}}
 ; SI: buffer_store_byte [[RESULT]]
-define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addrspace(1)* %in) nounwind {
   %load = load i1, i1 addrspace(1)* %in
   %ext = zext i1 %load to i32
   %cmp = icmp ne i32 %ext, -1
@@ -162,7 +162,7 @@ define void @zextload_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i1 addr
 ; SI: v_cmp_ne_u32_e32 vcc, -1, [[LOAD]]{{$}}
 ; SI-NEXT: v_cndmask_b32_e64
 ; SI: {{buffer|flat}}_store_byte
-define void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
+define amdgpu_kernel void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(i1 addrspace(1)* %out, i8 addrspace(1)* %in) nounwind {
   %tid.x = call i32 @llvm.amdgcn.workitem.id.x()
   %in.ptr = getelementptr i8, i8 addrspace(1)* %in, i32 %tid.x
   %load = load i8, i8 addrspace(1)* %in.ptr
diff --git a/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll b/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
index 03b8af0610d7738bd932bc09923931fa3661ea93..d67b8f981b2810ff1f8906816ff94a5862df12ba 100644
--- a/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
+++ b/test/CodeGen/AMDGPU/trunc-store-f64-to-f16.ll
@@ -2,7 +2,7 @@
 
 ; GCN-LABEL: {{^}}global_truncstore_f64_to_f16:
 ; GCN: s_endpgm
-define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrspace(1)* %in) #0 {
   %val = load double, double addrspace(1)* %in
   %cvt = fptrunc double %val to half
   store half %cvt, half addrspace(1)* %out
@@ -11,7 +11,7 @@ define void @global_truncstore_f64_to_f16(half addrspace(1)* %out, double addrsp
 
 ; GCN-LABEL: {{^}}global_truncstore_v2f64_to_v2f16:
 ; GCN: s_endpgm
-define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x double> addrspace(1)* %in) #0 {
   %val = load <2 x double>, <2 x double> addrspace(1)* %in
   %cvt = fptrunc <2 x double> %val to <2 x half>
   store <2 x half> %cvt, <2 x half> addrspace(1)* %out
@@ -20,7 +20,7 @@ define void @global_truncstore_v2f64_to_v2f16(<2 x half> addrspace(1)* %out, <2
 
 ; GCN-LABEL: {{^}}global_truncstore_v3f64_to_v3f16:
 ; GCN: s_endpgm
-define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3 x double> addrspace(1)* %in) #0 {
   %val = load <3 x double>, <3 x double> addrspace(1)* %in
   %cvt = fptrunc <3 x double> %val to <3 x half>
   store <3 x half> %cvt, <3 x half> addrspace(1)* %out
@@ -29,7 +29,7 @@ define void @global_truncstore_v3f64_to_v3f16(<3 x half> addrspace(1)* %out, <3
 
 ; GCN-LABEL: {{^}}global_truncstore_v4f64_to_v4f16:
 ; GCN: s_endpgm
-define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x double> addrspace(1)* %in) #0 {
   %val = load <4 x double>, <4 x double> addrspace(1)* %in
   %cvt = fptrunc <4 x double> %val to <4 x half>
   store <4 x half> %cvt, <4 x half> addrspace(1)* %out
@@ -38,7 +38,7 @@ define void @global_truncstore_v4f64_to_v4f16(<4 x half> addrspace(1)* %out, <4
 
 ; GCN-LABEL: {{^}}global_truncstore_v8f64_to_v8f16:
 ; GCN: s_endpgm
-define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8 x double> addrspace(1)* %in) #0 {
   %val = load <8 x double>, <8 x double> addrspace(1)* %in
   %cvt = fptrunc <8 x double> %val to <8 x half>
   store <8 x half> %cvt, <8 x half> addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @global_truncstore_v8f64_to_v8f16(<8 x half> addrspace(1)* %out, <8
 
 ; GCN-LABEL: {{^}}global_truncstore_v16f64_to_v16f16:
 ; GCN: s_endpgm
-define void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @global_truncstore_v16f64_to_v16f16(<16 x half> addrspace(1)* %out, <16 x double> addrspace(1)* %in) #0 {
   %val = load <16 x double>, <16 x double> addrspace(1)* %in
   %cvt = fptrunc <16 x double> %val to <16 x half>
   store <16 x half> %cvt, <16 x half> addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/trunc-store-i1.ll b/test/CodeGen/AMDGPU/trunc-store-i1.ll
index da2a5b43dad5c095b8381753dfb9aacd704b3ddb..4ea2352f57f354776ef06555e67373fed80b4a3b 100644
--- a/test/CodeGen/AMDGPU/trunc-store-i1.ll
+++ b/test/CodeGen/AMDGPU/trunc-store-i1.ll
@@ -7,7 +7,7 @@
 ; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
 ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
 ; SI: buffer_store_byte [[VREG]],
-define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind {
+define amdgpu_kernel void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwind {
   %trunc = trunc i32 %val to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1
   ret void
@@ -15,7 +15,7 @@ define void @global_truncstore_i32_to_i1(i1 addrspace(1)* %out, i32 %val) nounwi
 
 ; SI-LABEL: {{^}}global_truncstore_i64_to_i1:
 ; SI: buffer_store_byte
-define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind {
+define amdgpu_kernel void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwind {
   %trunc = trunc i64 %val to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1
   ret void
@@ -26,13 +26,13 @@ define void @global_truncstore_i64_to_i1(i1 addrspace(1)* %out, i64 %val) nounwi
 ; SI: s_and_b32 [[SREG:s[0-9]+]], [[LOAD]], 1
 ; SI: v_mov_b32_e32 [[VREG:v[0-9]+]], [[SREG]]
 ; SI: buffer_store_byte [[VREG]],
-define void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
+define amdgpu_kernel void @s_arg_global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val) nounwind {
   %trunc = trunc i16 %val to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1
   ret void
 }
 ; SI-LABEL: {{^}}global_truncstore_i16_to_i1:
-define void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind {
+define amdgpu_kernel void @global_truncstore_i16_to_i1(i1 addrspace(1)* %out, i16 %val0, i16 %val1) nounwind {
   %add = add i16 %val0, %val1
   %trunc = trunc i16 %add to i1
   store i1 %trunc, i1 addrspace(1)* %out, align 1
diff --git a/test/CodeGen/AMDGPU/trunc-store.ll b/test/CodeGen/AMDGPU/trunc-store.ll
index c6727e1e1273d376feee7c341fb8c57df82d2a91..f45de679588f54caf00b4af6305a7b8eb3a71de1 100644
--- a/test/CodeGen/AMDGPU/trunc-store.ll
+++ b/test/CodeGen/AMDGPU/trunc-store.ll
@@ -3,7 +3,7 @@
 
 ; FUNC-LABEL: {{^}}truncstore_arg_v16i32_to_v16i8:
 ; SI: buffer_store_dwordx4
-define void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i32> %in) {
+define amdgpu_kernel void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i32> %in) {
   %trunc = trunc <16 x i32> %in to <16 x i8>
   store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out
   ret void
@@ -11,7 +11,7 @@ define void @truncstore_arg_v16i32_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x
 
 ; FUNC-LABEL: {{^}}truncstore_arg_v16i64_to_v16i8:
 ; SI: buffer_store_dwordx4
-define void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> %in) {
+define amdgpu_kernel void @truncstore_arg_v16i64_to_v16i8(<16 x i8> addrspace(1)* %out, <16 x i64> %in) {
   %trunc = trunc <16 x i64> %in to <16 x i8>
   store <16 x i8> %trunc, <16 x i8> addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll b/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
index 878ea3f48995a005769e1b1a2f267577cc96e0a5..3dbc10d2e9b5697afdc6b413b0a2590ae172c0f2 100644
--- a/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
+++ b/test/CodeGen/AMDGPU/trunc-vector-store-assertion-failure.ll
@@ -6,7 +6,7 @@
 
 ; CHECK-LABEL: {{^}}test:
 ; CHECK: MEM_RAT_CACHELESS STORE_RAW
-define void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) {
+define amdgpu_kernel void @test(<4 x i8> addrspace(1)* %out, i32 %cond, <4 x i8> %in) {
 entry:
   %0 = icmp eq i32 %cond, 0
   br i1 %0, label %if, label %done
diff --git a/test/CodeGen/AMDGPU/trunc.ll b/test/CodeGen/AMDGPU/trunc.ll
index 63bb447df2fde6d3d9f414cefb3c90cf83b3da8b..0c91d52df0c086d5380e54b0b01245d243de9254 100644
--- a/test/CodeGen/AMDGPU/trunc.ll
+++ b/test/CodeGen/AMDGPU/trunc.ll
@@ -1,13 +1,15 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=SI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=fiji -verify-machineinstrs< %s | FileCheck -check-prefix=GCN -check-prefix=VI  %s
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck -check-prefix=EG %s
 
 declare i32 @llvm.r600.read.tidig.x() nounwind readnone
 
-define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
-; SI-LABEL: {{^}}trunc_i64_to_i32_store:
-; SI: s_load_dword [[SLOAD:s[0-9]+]], s[0:1], 0xb
-; SI: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]]
+define amdgpu_kernel void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
+; GCN-LABEL: {{^}}trunc_i64_to_i32_store:
+; GCN: s_load_dword [[SLOAD:s[0-9]+]], s[0:1],
+; GCN: v_mov_b32_e32 [[VLOAD:v[0-9]+]], [[SLOAD]]
 ; SI: buffer_store_dword [[VLOAD]]
+; VI: flat_store_dword v[{{[0-9:]+}}], [[VLOAD]]
 
 ; EG-LABEL: {{^}}trunc_i64_to_i32_store:
 ; EG: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1
@@ -18,29 +20,33 @@ define void @trunc_i64_to_i32_store(i32 addrspace(1)* %out, i64 %in) {
   ret void
 }
 
-; SI-LABEL: {{^}}trunc_load_shl_i64:
-; SI-DAG: s_load_dwordx2
-; SI-DAG: s_load_dword [[SREG:s[0-9]+]],
-; SI: s_lshl_b32 [[SHL:s[0-9]+]], [[SREG]], 2
-; SI: v_mov_b32_e32 [[VSHL:v[0-9]+]], [[SHL]]
-; SI: buffer_store_dword [[VSHL]],
-define void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
+; GCN-LABEL: {{^}}trunc_load_shl_i64:
+; GCN-DAG: s_load_dwordx2
+; GCN-DAG: s_load_dword [[SREG:s[0-9]+]],
+; GCN: s_lshl_b32 [[SHL:s[0-9]+]], [[SREG]], 2
+; GCN: v_mov_b32_e32 [[VSHL:v[0-9]+]], [[SHL]]
+; SI: buffer_store_dword [[VSHL]]
+; VI: flat_store_dword v[{{[0-9:]+}}], [[VSHL]]
+
+define amdgpu_kernel void @trunc_load_shl_i64(i32 addrspace(1)* %out, i64 %a) {
   %b = shl i64 %a, 2
   %result = trunc i64 %b to i32
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: {{^}}trunc_shl_i64:
+; GCN-LABEL: {{^}}trunc_shl_i64:
 ; SI: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0xd
-; SI: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2
-; SI: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
-; SI: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
-; SI: s_addc_u32
+; VI: s_load_dwordx2 s{{\[}}[[LO_SREG:[0-9]+]]:{{[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0x34
+; GCN: s_lshl_b64 s{{\[}}[[LO_SHL:[0-9]+]]:{{[0-9]+\]}}, s{{\[}}[[LO_SREG]]:{{[0-9]+\]}}, 2
+; GCN: s_add_u32 s[[LO_SREG2:[0-9]+]], s[[LO_SHL]],
+; GCN: v_mov_b32_e32 v[[LO_VREG:[0-9]+]], s[[LO_SREG2]]
+; GCN: s_addc_u32
 ; SI: buffer_store_dword v[[LO_VREG]],
-; SI: v_mov_b32_e32
-; SI: v_mov_b32_e32
-define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
+; VI: flat_store_dword v[{{[0-9:]+}}], v[[LO_VREG]]
+; GCN: v_mov_b32_e32
+; GCN: v_mov_b32_e32
+define amdgpu_kernel void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64 %a) {
   %aa = add i64 %a, 234 ; Prevent shrinking store.
   %b = shl i64 %aa, 2
   %result = trunc i64 %b to i32
@@ -49,10 +55,9 @@ define void @trunc_shl_i64(i64 addrspace(1)* %out2, i32 addrspace(1)* %out, i64
   ret void
 }
 
-; SI-LABEL: {{^}}trunc_i32_to_i1:
-; SI: v_and_b32_e32 v{{[0-9]+}}, 1, v{{[0-9]+}}
-; SI: v_cmp_eq_u32
-define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) {
+; GCN-LABEL: {{^}}trunc_i32_to_i1:
+; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}}
+define amdgpu_kernel void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) {
   %a = load i32, i32 addrspace(1)* %ptr, align 4
   %trunc = trunc i32 %a to i1
   %result = select i1 %trunc, i32 1, i32 0
@@ -60,34 +65,54 @@ define void @trunc_i32_to_i1(i32 addrspace(1)* %out, i32 addrspace(1)* %ptr) {
   ret void
 }
 
-; SI-LABEL: {{^}}sgpr_trunc_i32_to_i1:
-; SI: s_and_b32 s{{[0-9]+}}, 1, s{{[0-9]+}}
-; SI: v_cmp_eq_u32
-define void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
+; GCN-LABEL: {{^}}trunc_i8_to_i1:
+; GCN: v_and_b32_e32 [[VREG:v[0-9]+]], 1, v{{[0-9]+}}
+define amdgpu_kernel void @trunc_i8_to_i1(i8 addrspace(1)* %out, i8 addrspace(1)* %ptr) {
+  %a = load i8, i8 addrspace(1)* %ptr, align 4
+  %trunc = trunc i8 %a to i1
+  %result = select i1 %trunc, i8 1, i8 0
+  store i8 %result, i8 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sgpr_trunc_i16_to_i1:
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+define amdgpu_kernel void @sgpr_trunc_i16_to_i1(i16 addrspace(1)* %out, i16 %a) {
+  %trunc = trunc i16 %a to i1
+  %result = select i1 %trunc, i16 1, i16 0
+  store i16 %result, i16 addrspace(1)* %out, align 4
+  ret void
+}
+
+; GCN-LABEL: {{^}}sgpr_trunc_i32_to_i1:
+; GCN: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+define amdgpu_kernel void @sgpr_trunc_i32_to_i1(i32 addrspace(1)* %out, i32 %a) {
   %trunc = trunc i32 %a to i1
   %result = select i1 %trunc, i32 1, i32 0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
 }
 
-; SI-LABEL: {{^}}s_trunc_i64_to_i1:
+; GCN-LABEL: {{^}}s_trunc_i64_to_i1:
 ; SI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0xb
-; SI: s_and_b32 [[MASKED:s[0-9]+]], 1, s[[SLO]]
-; SI: v_cmp_eq_u32_e64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], [[MASKED]], 1{{$}}
-; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s{{\[}}[[VLO]]:[[VHI]]]
-define void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) {
+; VI: s_load_dwordx2 s{{\[}}[[SLO:[0-9]+]]:{{[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 0x2c
+; GCN: s_and_b32 [[MASKED:s[0-9]+]], 1, s[[SLO]]
+; GCN: v_cmp_eq_u32_e64 s{{\[}}[[VLO:[0-9]+]]:[[VHI:[0-9]+]]], [[MASKED]], 1{{$}}
+; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, s{{\[}}[[VLO]]:[[VHI]]]
+define amdgpu_kernel void @s_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 %x) {
   %trunc = trunc i64 %x to i1
   %sel = select i1 %trunc, i32 63, i32 -12
   store i32 %sel, i32 addrspace(1)* %out
   ret void
 }
 
-; SI-LABEL: {{^}}v_trunc_i64_to_i1:
+; GCN-LABEL: {{^}}v_trunc_i64_to_i1:
 ; SI: buffer_load_dwordx2 v{{\[}}[[VLO:[0-9]+]]:{{[0-9]+\]}}
-; SI: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]]
-; SI: v_cmp_eq_u32_e32 vcc, 1, [[MASKED]]
-; SI: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc
-define void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
+; VI: flat_load_dwordx2 v{{\[}}[[VLO:[0-9]+]]:{{[0-9]+\]}}
+; GCN: v_and_b32_e32 [[MASKED:v[0-9]+]], 1, v[[VLO]]
+; GCN: v_cmp_eq_u32_e32 vcc, 1, [[MASKED]]
+; GCN: v_cndmask_b32_e64 {{v[0-9]+}}, -12, 63, vcc
+define amdgpu_kernel void @v_trunc_i64_to_i1(i32 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.r600.read.tidig.x() nounwind readnone
   %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
diff --git a/test/CodeGen/AMDGPU/tti-unroll-prefs.ll b/test/CodeGen/AMDGPU/tti-unroll-prefs.ll
index 76c32afc1f215957fd10327325228296813998a0..7c369a312761725f8f809febcb6b59c11f849603 100644
--- a/test/CodeGen/AMDGPU/tti-unroll-prefs.ll
+++ b/test/CodeGen/AMDGPU/tti-unroll-prefs.ll
@@ -19,7 +19,7 @@
 ; CHECK: store i8 0, i8 addrspace(1)*
 ; CHECK-NOT: store i8 0, i8 addrspace(1)*
 ; CHECK: ret void
-define void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) {
+define amdgpu_kernel void @test(i8 addrspace(1)* nocapture %dst, i32 %a, i32 %b, i32 %c) {
 entry:
   %add = add nsw i32 %b, 4
   %cmp = icmp sgt i32 %add, %a
diff --git a/test/CodeGen/AMDGPU/uaddo.ll b/test/CodeGen/AMDGPU/uaddo.ll
index d62e4e0b14093b289c0e5258441eb1439387c179..632ccaa7e612451dc4377bad9335ed5584d5ee2d 100644
--- a/test/CodeGen/AMDGPU/uaddo.ll
+++ b/test/CodeGen/AMDGPU/uaddo.ll
@@ -9,7 +9,7 @@
 
 ; EG: ADDC_UINT
 ; EG: ADDC_UINT
-define void @s_uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @s_uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %uadd, 0
   %carry = extractvalue { i64, i1 } %uadd, 1
@@ -27,7 +27,7 @@ define void @s_uaddo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
   %uadd = call { i32, i1 } @llvm.uadd.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %uadd, 0
   %carry = extractvalue { i32, i1 } %uadd, 1
@@ -42,7 +42,7 @@ define void @s_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
@@ -63,7 +63,7 @@ define void @v_uaddo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define void @v_uaddo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_uaddo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
@@ -85,7 +85,7 @@ define void @v_uaddo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryou
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
   %uadd = call { i64, i1 } @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %uadd, 0
   %carry = extractvalue { i64, i1 } %uadd, 1
@@ -100,7 +100,7 @@ define void @s_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64
 
 ; EG: ADDC_UINT
 ; EG: ADD_INT
-define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i64, i64 addrspace(1)* %a.ptr
@@ -118,7 +118,7 @@ define void @v_uaddo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64
 ; FUNC-LABEL: {{^}}v_uaddo_i16:
 ; VI: v_add_u16_e32
 ; VI: v_cmp_lt_u16_e32
-define void @v_uaddo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_uaddo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr
diff --git a/test/CodeGen/AMDGPU/udiv.ll b/test/CodeGen/AMDGPU/udiv.ll
index da88d2a8e8cbe3ebb3343bfae4e4cf925418d2b1..2874a0cdbc05f81e68d6620be5c694fd28b4b3f6 100644
--- a/test/CodeGen/AMDGPU/udiv.ll
+++ b/test/CodeGen/AMDGPU/udiv.ll
@@ -1,22 +1,27 @@
 ; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs -mattr=-fp32-denormals < %s | FileCheck -check-prefix=SI -check-prefix=FUNC -check-prefix=VI %s
+
+; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=+fp32-denormals < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+
 ; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s
 
 ; FUNC-LABEL: {{^}}udiv_i32:
 ; EG-NOT: SETGE_INT
 ; EG: CF_END
-define void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+
+; SI: v_rcp_iflag_f32_e32
+define amdgpu_kernel void @udiv_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
-  %a = load i32, i32 addrspace(1) * %in
-  %b = load i32, i32 addrspace(1) * %b_ptr
+  %a = load i32, i32 addrspace(1)* %in
+  %b = load i32, i32 addrspace(1)* %b_ptr
   %result = udiv i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}s_udiv_i32:
-
-define void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+; SI: v_rcp_iflag_f32_e32
+define amdgpu_kernel void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %result = udiv i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -30,8 +35,10 @@ define void @s_udiv_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 ; FUNC-LABEL: {{^}}udiv_v2i32:
 ; EG: CF_END
 
+; SI: v_rcp_iflag_f32_e32
+; SI: v_rcp_iflag_f32_e32
 ; SI: s_endpgm
-define void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1) * %in
   %b = load <2 x i32>, <2 x i32> addrspace(1) * %b_ptr
@@ -43,7 +50,7 @@ define void @udiv_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %i
 ; FUNC-LABEL: {{^}}udiv_v4i32:
 ; EG: CF_END
 ; SI: s_endpgm
-define void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %b = load <4 x i32>, <4 x i32> addrspace(1) * %b_ptr
@@ -56,7 +63,7 @@ define void @udiv_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %i
 ; SI: buffer_load_dword [[VAL:v[0-9]+]]
 ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 4, [[VAL]]
 ; SI: buffer_store_dword [[RESULT]]
-define void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %result = udiv i32 %a, 16
@@ -70,7 +77,7 @@ define void @udiv_i32_div_pow2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]]
 ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 25, [[MULHI]]
 ; SI: buffer_store_dword [[RESULT]]
-define void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %result = udiv i32 %a, 34259182
@@ -84,7 +91,7 @@ define void @udiv_i32_div_k_even(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 ; SI: v_mul_hi_u32 [[MULHI:v[0-9]+]], [[K]], [[VAL]]
 ; SI: v_lshrrev_b32_e32 [[RESULT:v[0-9]+]], 24, [[MULHI]]
 ; SI: buffer_store_dword [[RESULT]]
-define void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %result = udiv i32 %a, 34259183
@@ -96,7 +103,7 @@ define void @udiv_i32_div_k_odd(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; SI: v_rcp_f32
 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xff, v{{[0-9]+}}
 ; SI: buffer_store_dword [[TRUNC]]
-define void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
   %num = load i8, i8 addrspace(1) * %in
   %den = load i8, i8 addrspace(1) * %den_ptr
@@ -110,7 +117,7 @@ define void @v_udiv_i8(i32 addrspace(1)* %out, i8 addrspace(1)* %in) {
 ; SI: v_rcp_f32
 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0xffff, v{{[0-9]+}}
 ; SI: buffer_store_dword [[TRUNC]]
-define void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
   %num = load i16, i16 addrspace(1) * %in
   %den = load i16, i16 addrspace(1) * %den_ptr
@@ -124,7 +131,7 @@ define void @v_udiv_i16(i32 addrspace(1)* %out, i16 addrspace(1)* %in) {
 ; SI: v_rcp_f32
 ; SI: v_and_b32_e32 [[TRUNC:v[0-9]+]], 0x7fffff, v{{[0-9]+}}
 ; SI: buffer_store_dword [[TRUNC]]
-define void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
   %den_ptr = getelementptr i23, i23 addrspace(1)* %in, i23 1
   %num = load i23, i23 addrspace(1) * %in
   %den = load i23, i23 addrspace(1) * %den_ptr
@@ -136,7 +143,7 @@ define void @v_udiv_i23(i32 addrspace(1)* %out, i23 addrspace(1)* %in) {
 
 ; FUNC-LABEL: {{^}}v_udiv_i24:
 ; SI-NOT: v_rcp_f32
-define void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
+define amdgpu_kernel void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
   %den_ptr = getelementptr i24, i24 addrspace(1)* %in, i24 1
   %num = load i24, i24 addrspace(1) * %in
   %den = load i24, i24 addrspace(1) * %den_ptr
@@ -152,9 +159,42 @@ define void @v_udiv_i24(i32 addrspace(1)* %out, i24 addrspace(1)* %in) {
 ; SI: v_mul_hi_u32
 ; SI: v_mul_hi_u32
 
-define void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
+define amdgpu_kernel void @scalarize_mulhu_4xi32(<4 x i32> addrspace(1)* nocapture readonly %in, <4 x i32> addrspace(1)* nocapture %out) {
   %1 = load <4 x i32>, <4 x i32> addrspace(1)* %in, align 16
   %2 = udiv <4 x i32> %1, <i32 53668, i32 53668, i32 53668, i32 53668>
   store <4 x i32> %2, <4 x i32> addrspace(1)* %out, align 16
   ret void
 }
+
+; FUNC-LABEL: {{^}}test_udiv2:
+; SI: s_lshr_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1
+define amdgpu_kernel void @test_udiv2(i32 %p) {
+  %i = udiv i32 %p, 2
+  store volatile i32 %i, i32 addrspace(1)* undef
+  ret void
+}
+
+; FUNC-LABEL: {{^}}test_udiv_3_mulhu:
+; SI: v_mov_b32_e32 v{{[0-9]+}}, 0xaaaaaaab
+; SI: v_mul_hi_u32 v0, {{v[0-9]+}}, {{s[0-9]+}}
+; SI-NEXT: v_lshrrev_b32_e32 v0, 1, v0
+define amdgpu_kernel void @test_udiv_3_mulhu(i32 %p) {
+   %i = udiv i32 %p, 3
+   store volatile i32 %i, i32 addrspace(1)* undef
+   ret void
+}
+
+; GCN-LABEL: {{^}}fdiv_test_denormals:
+; VI: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @fdiv_test_denormals(i8 addrspace(1)* nocapture readonly %arg) {
+bb:
+  %tmp = load i8, i8 addrspace(1)* null, align 1
+  %tmp1 = sext i8 %tmp to i32
+  %tmp2 = getelementptr inbounds i8, i8 addrspace(1)* %arg, i64 undef
+  %tmp3 = load i8, i8 addrspace(1)* %tmp2, align 1
+  %tmp4 = sext i8 %tmp3 to i32
+  %tmp5 = sdiv i32 %tmp1, %tmp4
+  %tmp6 = trunc i32 %tmp5 to i8
+  store i8 %tmp6, i8 addrspace(1)* null, align 1
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/udivrem.ll b/test/CodeGen/AMDGPU/udivrem.ll
index 17f4ebf175d9be9c34cd797d045fd9b3ff62b858..9507a49cfc8b3f52d2dac3394eda9c4115ed21db 100644
--- a/test/CodeGen/AMDGPU/udivrem.ll
+++ b/test/CodeGen/AMDGPU/udivrem.ll
@@ -51,7 +51,7 @@
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
 ; SI: s_endpgm
-define void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) {
+define amdgpu_kernel void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32 %x, i32 %y) {
   %result0 = udiv i32 %x, %y
   store i32 %result0, i32 addrspace(1)* %out0
   %result1 = urem i32 %x, %y
@@ -158,7 +158,7 @@ define void @test_udivrem(i32 addrspace(1)* %out0, i32 addrspace(1)* %out1, i32
 ; SI-DAG: v_cndmask_b32_e64
 ; SI-DAG: v_cndmask_b32_e64
 ; SI: s_endpgm
-define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
+define amdgpu_kernel void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i32> %y) {
   %result0 = udiv <2 x i32> %x, %y
   store <2 x i32> %result0, <2 x i32> addrspace(1)* %out
   %result1 = urem <2 x i32> %x, %y
@@ -340,7 +340,7 @@ define void @test_udivrem_v2(<2 x i32> addrspace(1)* %out, <2 x i32> %x, <2 x i3
 ; SI-DAG: v_subrev_i32_e32
 ; SI-DAG: v_cndmask_b32_e64
 ; SI: s_endpgm
-define void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
+define amdgpu_kernel void @test_udivrem_v4(<4 x i32> addrspace(1)* %out, <4 x i32> %x, <4 x i32> %y) {
   %result0 = udiv <4 x i32> %x, %y
   store <4 x i32> %result0, <4 x i32> addrspace(1)* %out
   %result1 = urem <4 x i32> %x, %y
diff --git a/test/CodeGen/AMDGPU/udivrem24.ll b/test/CodeGen/AMDGPU/udivrem24.ll
index 6d145f1dbf09712171f7a47f9f7f06135fb82918..6f144dcc6fd218ae0fdec440c6be33b84107b351 100644
--- a/test/CodeGen/AMDGPU/udivrem24.ll
+++ b/test/CodeGen/AMDGPU/udivrem24.ll
@@ -12,7 +12,7 @@
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
   %num = load i8, i8 addrspace(1) * %in
   %den = load i8, i8 addrspace(1) * %den_ptr
@@ -31,7 +31,7 @@ define void @udiv24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
   %num = load i16, i16 addrspace(1) * %in, align 2
   %den = load i16, i16 addrspace(1) * %den_ptr, align 2
@@ -50,7 +50,7 @@ define void @udiv24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -67,7 +67,7 @@ define void @udiv23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; SI: v_rcp_iflag
 ; SI-NOT v_rcp_f32
 ; EG-NOT: RECIP_IEEE
-define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -84,7 +84,7 @@ define void @udiv24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; SI: v_rcp_iflag
 ; SI-NOT v_rcp_f32
 ; EG-NOT: RECIP_IEEE
-define void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -101,7 +101,7 @@ define void @no_udiv24_u23_u24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in
 ; SI: v_rcp_iflag
 ; SI-NOT v_rcp_f32
 ; EG-NOT: RECIP_IEEE
-define void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -121,7 +121,7 @@ define void @no_udiv24_u24_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -141,7 +141,7 @@ define void @udiv25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -161,7 +161,7 @@ define void @test_no_udiv24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -184,7 +184,7 @@ define void @test_no_udiv24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
+define amdgpu_kernel void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
   %den_ptr = getelementptr i8, i8 addrspace(1)* %in, i8 1
   %num = load i8, i8 addrspace(1) * %in
   %den = load i8, i8 addrspace(1) * %den_ptr
@@ -203,7 +203,7 @@ define void @urem24_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) {
 ; EG-DAG: UINT_TO_FLT
 ; EG-DAG: RECIP_IEEE
 ; EG: FLT_TO_UINT
-define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
+define amdgpu_kernel void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
   %den_ptr = getelementptr i16, i16 addrspace(1)* %in, i16 1
   %num = load i16, i16 addrspace(1) * %in, align 2
   %den = load i16, i16 addrspace(1) * %den_ptr, align 2
@@ -215,7 +215,7 @@ define void @urem24_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %in) {
 ; FUNC-LABEL: {{^}}urem24_i32:
 ; SI-NOT: v_rcp_f32
 ; EG-NOT: RECIP_IEEE
-define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -235,7 +235,7 @@ define void @urem24_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -255,7 +255,7 @@ define void @urem25_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -275,7 +275,7 @@ define void @test_no_urem24_i32_1(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 
 ; EG-NOT: UINT_TO_FLT
 ; EG-NOT: RECIP_IEEE
-define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -294,7 +294,7 @@ define void @test_no_urem24_i32_2(i32 addrspace(1)* %out, i32 addrspace(1)* %in)
 ; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
 
 ; EG: RECIP_IEEE
-define void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
@@ -313,7 +313,7 @@ define void @test_udiv24_u16_u23_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %
 ; SI: v_and_b32_e32 v{{[0-9]+}}, [[MASK]],
 
 ; EG: RECIP_IEEE
-define void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_udiv24_u23_u16_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %den_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %num = load i32, i32 addrspace(1) * %in, align 4
   %den = load i32, i32 addrspace(1) * %den_ptr, align 4
diff --git a/test/CodeGen/AMDGPU/udivrem64.ll b/test/CodeGen/AMDGPU/udivrem64.ll
index da61a841ff35db2c8596f11ad4b0c7f06b37afc7..bd297920d5634abf5a46dfa27aaf31308cf99b47 100644
--- a/test/CodeGen/AMDGPU/udivrem64.ll
+++ b/test/CodeGen/AMDGPU/udivrem64.ll
@@ -70,7 +70,7 @@
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = udiv i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -144,7 +144,7 @@ define void @test_udiv(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %result = urem i64 %x, %y
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -159,7 +159,7 @@ define void @test_urem(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = lshr i64 %x, 33
   %2 = lshr i64 %y, 33
   %result = udiv i64 %1, %2
@@ -176,7 +176,7 @@ define void @test_udiv3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ;SI-NOT: v_lshr_b64
 ;VI-NOT: v_lshrrev_b64
 ;GCN: s_endpgm
-define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = lshr i64 %x, 33
   %2 = lshr i64 %y, 33
   %result = urem i64 %1, %2
@@ -195,7 +195,7 @@ define void @test_urem3264(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ;VI-NOT: v_lshrrev_b64
 ;GCN: v_mad_f32
 ;GCN: s_endpgm
-define void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = lshr i64 %x, 41
   %2 = lshr i64 %y, 41
   %result = udiv i64 %1, %2
@@ -214,7 +214,7 @@ define void @test_udiv2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
 ;VI-NOT: v_lshrrev_b64
 ;GCN: v_mad_f32
 ;GCN: s_endpgm
-define void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
+define amdgpu_kernel void @test_urem2364(i64 addrspace(1)* %out, i64 %x, i64 %y) {
   %1 = lshr i64 %x, 41
   %2 = lshr i64 %y, 41
   %result = urem i64 %1, %2
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
index a4e18ebc9120a78f83b5da01f8b60c52773d6d69..62943aeefbd8af27b76e17d89fbf7bd79280973b 100644
--- a/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
+++ b/test/CodeGen/AMDGPU/uint_to_fp.f64.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 ; SI-DAG: v_ldexp_f64 [[LDEXP:v\[[0-9]+:[0-9]+\]]], [[HI_CONV]], 32
 ; SI: v_add_f64 [[RESULT:v\[[0-9]+:[0-9]+\]]], [[LDEXP]], [[LO_CONV]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
-define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)* %in) {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %val = load i64, i64 addrspace(1)* %gep, align 8
@@ -19,21 +19,21 @@ define void @v_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 addrspace(1)
 }
 
 ; SI-LABEL: {{^}}s_uint_to_fp_i64_to_f64
-define void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
+define amdgpu_kernel void @s_uint_to_fp_i64_to_f64(double addrspace(1)* %out, i64 %in) {
   %cast = uitofp i64 %in to double
   store double %cast, double addrspace(1)* %out, align 8
   ret void
 }
 
 ; SI-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f64
-define void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i64> %in) {
   %cast = uitofp <2 x i64> %in to <2 x double>
   store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16
   ret void
 }
 
 ; SI-LABEL: {{^}}s_uint_to_fp_v4i64_to_v4f64
-define void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i64> %in) {
   %cast = uitofp <4 x i64> %in to <4 x double>
   store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16
   ret void
@@ -42,7 +42,7 @@ define void @s_uint_to_fp_v4i64_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i
 ; SI-LABEL: {{^}}s_uint_to_fp_i32_to_f64
 ; SI: v_cvt_f64_u32_e32
 ; SI: s_endpgm
-define void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
   %cast = uitofp i32 %in to double
   store double %cast, double addrspace(1)* %out, align 8
   ret void
@@ -52,7 +52,7 @@ define void @s_uint_to_fp_i32_to_f64(double addrspace(1)* %out, i32 %in) {
 ; SI: v_cvt_f64_u32_e32
 ; SI: v_cvt_f64_u32_e32
 ; SI: s_endpgm
-define void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i32> %in) {
   %cast = uitofp <2 x i32> %in to <2 x double>
   store <2 x double> %cast, <2 x double> addrspace(1)* %out, align 16
   ret void
@@ -64,7 +64,7 @@ define void @s_uint_to_fp_v2i32_to_v2f64(<2 x double> addrspace(1)* %out, <2 x i
 ; SI: v_cvt_f64_u32_e32
 ; SI: v_cvt_f64_u32_e32
 ; SI: s_endpgm
-define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) {
+define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i32> %in) {
   %cast = uitofp <4 x i32> %in to <4 x double>
   store <4 x double> %cast, <4 x double> addrspace(1)* %out, align 16
   ret void
@@ -79,7 +79,7 @@ define void @s_uint_to_fp_v4i32_to_v4f64(<4 x double> addrspace(1)* %out, <4 x i
 ; SI-DAG: v_mov_b32_e32 v[[ZERO:[0-9]+]], 0{{$}}
 ; SI: buffer_store_dwordx2 v{{\[}}[[ZERO]]:[[SEL]]{{\]}}
 ; SI: s_endpgm
-define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
   %cmp = icmp eq i32 %in, 0
   %fp = uitofp i1 %cmp to double
   store double %fp, double addrspace(1)* %out, align 4
@@ -91,7 +91,7 @@ define void @uint_to_fp_i1_to_f64(double addrspace(1)* %out, i32 %in) {
 ; SI-NEXT: v_cvt_f64_u32_e32 [[RESULT:v\[[0-9]+:[0-9]\]]], [[IRESULT]]
 ; SI: buffer_store_dwordx2 [[RESULT]]
 ; SI: s_endpgm
-define void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) {
+define amdgpu_kernel void @uint_to_fp_i1_to_f64_load(double addrspace(1)* %out, i1 %in) {
   %fp = uitofp i1 %in to double
   store double %fp, double addrspace(1)* %out, align 8
   ret void
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
index cd816b27fce69c64f88ae4e50cad578a3866f54d..4168326e14c633d28f5f82893929490b9b1b4882 100644
--- a/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
+++ b/test/CodeGen/AMDGPU/uint_to_fp.i64.ll
@@ -4,7 +4,7 @@
 ; FIXME: This should be merged with uint_to_fp.ll, but s_uint_to_fp_v2i64 crashes on r600
 
 ; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f16:
-define void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 {
   %result = uitofp i64 %in to half
   store half %result, half addrspace(1)* %out
   ret void
@@ -24,7 +24,7 @@ define void @s_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 %in) #0 {
 ; GCN: v_add_i32_e32 [[VR:v[0-9]+]]
 ; GCN: v_cvt_f16_f32_e32 [[VR_F16:v[0-9]+]], [[VR]]
 ; GCN: {{buffer|flat}}_store_short {{.*}}[[VR_F16]]
-define void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr half, half addrspace(1)* %out, i32 %tid
@@ -35,7 +35,7 @@ define void @v_uint_to_fp_i64_to_f16(half addrspace(1)* %out, i64 addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}s_uint_to_fp_i64_to_f32:
-define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
   %result = uitofp i64 %in to float
   store float %result, float addrspace(1)* %out
   ret void
@@ -54,7 +54,7 @@ define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
 
 ; GCN: v_add_i32_e32 [[VR:v[0-9]+]]
 ; GCN: {{buffer|flat}}_store_dword {{.*}}[[VR]]
-define void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i64, i64 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -65,14 +65,14 @@ define void @v_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 addrspace(1)*
 }
 
 ; FUNC-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f32:
-define void @s_uint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{
+define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i64> %in) #0{
   %result = uitofp <2 x i64> %in to <2 x float>
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_uint_to_fp_v4i64_to_v4f32:
-define void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
@@ -83,14 +83,14 @@ define void @v_uint_to_fp_v4i64_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i6
 }
 
 ; FUNC-LABEL: {{^}}s_uint_to_fp_v2i64_to_v2f16:
-define void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{
+define amdgpu_kernel void @s_uint_to_fp_v2i64_to_v2f16(<2 x half> addrspace(1)* %out, <2 x i64> %in) #0{
   %result = uitofp <2 x i64> %in to <2 x half>
   store <2 x half> %result, <2 x half> addrspace(1)* %out
   ret void
 }
 
 ; FUNC-LABEL: {{^}}v_uint_to_fp_v4i64_to_v4f16:
-define void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_v4i64_to_v4f16(<4 x half> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr <4 x half>, <4 x half> addrspace(1)* %out, i32 %tid
diff --git a/test/CodeGen/AMDGPU/uint_to_fp.ll b/test/CodeGen/AMDGPU/uint_to_fp.ll
index 3003226ca1a45c1d9e7d1a385f32f4bde8d82d3b..2e9918717c3ac87d58f7b46665c54a3392264ef9 100644
--- a/test/CodeGen/AMDGPU/uint_to_fp.ll
+++ b/test/CodeGen/AMDGPU/uint_to_fp.ll
@@ -6,7 +6,7 @@
 ; SI: v_cvt_f32_u32_e32
 
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].Z
-define void @s_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
   %result = uitofp i32 %in to float
   store float %result, float addrspace(1)* %out
   ret void
@@ -16,7 +16,7 @@ define void @s_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 %in) #0 {
 ; SI: v_cvt_f32_u32_e32 {{v[0-9]+}}, {{v[0-9]+$}}
 
 ; R600: INT_TO_FLT
-define void @v_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i32, i32 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -32,7 +32,7 @@ define void @v_uint_to_fp_i32_to_f32(float addrspace(1)* %out, i32 addrspace(1)*
 
 ; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[2].W
 ; R600-DAG: UINT_TO_FLT * T{{[0-9]+\.[XYZW]}}, KC0[3].X
-define void @s_uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i32> %in) #0 {
   %result = uitofp <2 x i32> %in to <2 x float>
   store <2 x float> %result, <2 x float> addrspace(1)* %out
   ret void
@@ -49,7 +49,7 @@ define void @s_uint_to_fp_v2i32_to_v2f32(<2 x float> addrspace(1)* %out, <2 x i3
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @s_uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %value = load <4 x i32>, <4 x i32> addrspace(1) * %in
   %result = uitofp <4 x i32> %value to <4 x float>
   store <4 x float> %result, <4 x float> addrspace(1)* %out
@@ -66,7 +66,7 @@ define void @s_uint_to_fp_v4i32_to_v4f32(<4 x float> addrspace(1)* %out, <4 x i3
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ; R600: UINT_TO_FLT * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
-define void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr <4 x float>, <4 x float> addrspace(1)* %out, i32 %tid
@@ -81,7 +81,7 @@ define void @v_uint_to_fp_v4i32(<4 x float> addrspace(1)* %out, <4 x i32> addrsp
 ; SI-NEXT: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0, [[CMP]]
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @s_uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) #0 {
   %cmp = icmp eq i32 %in, 0
   %fp = uitofp i1 %cmp to float
   store float %fp, float addrspace(1)* %out
@@ -92,7 +92,7 @@ define void @s_uint_to_fp_i1_to_f32(float addrspace(1)* %out, i32 %in) #0 {
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0
 ; SI: buffer_store_dword [[RESULT]],
 ; SI: s_endpgm
-define void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 {
   %fp = uitofp i1 %in to float
   store float %fp, float addrspace(1)* %out
   ret void
@@ -105,7 +105,7 @@ define void @s_uint_to_fp_i1_to_f32_load(float addrspace(1)* %out, i1 %in) #0 {
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1.0
 ; SI: {{buffer|flat}}_store_dword {{.*}}[[RESULT]]
 ; SI: s_endpgm
-define void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)* %in) #0 {
   %tid = call i32 @llvm.r600.read.tidig.x()
   %in.gep = getelementptr i1, i1 addrspace(1)* %in, i32 %tid
   %out.gep = getelementptr float, float addrspace(1)* %out, i32 %tid
@@ -126,7 +126,7 @@ define void @v_uint_to_fp_i1_f32_load(float addrspace(1)* %out, i1 addrspace(1)*
 ; R600-DAG: SETGT_UINT
 ; R600-DAG: SETE_INT
 
-define void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
+define amdgpu_kernel void @s_uint_to_fp_i64_to_f32(float addrspace(1)* %out, i64 %in) #0 {
 entry:
   %cvt = uitofp i64 %in to float
   store float %cvt, float addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/uitofp.f16.ll b/test/CodeGen/AMDGPU/uitofp.f16.ll
index faab5ca5db73035fa4aa66a96e756ab721a031eb..0c3b0fcaf85492d1a9cebd5938d877c449bf76a3 100644
--- a/test/CodeGen/AMDGPU/uitofp.f16.ll
+++ b/test/CodeGen/AMDGPU/uitofp.f16.ll
@@ -8,7 +8,7 @@
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_F32]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @uitofp_i16_to_f16(
+define amdgpu_kernel void @uitofp_i16_to_f16(
     half addrspace(1)* %r,
     i16 addrspace(1)* %a) {
 entry:
@@ -24,7 +24,7 @@ entry:
 ; GCN: v_cvt_f16_f32_e32 v[[R_F16:[0-9]+]], v[[A_I16]]
 ; GCN: buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @uitofp_i32_to_f16(
+define amdgpu_kernel void @uitofp_i32_to_f16(
     half addrspace(1)* %r,
     i32 addrspace(1)* %a) {
 entry:
@@ -38,18 +38,23 @@ entry:
 
 ; GCN-LABEL: {{^}}uitofp_v2i16_to_v2f16
 ; GCN:     buffer_load_dword
-; SI:      v_cvt_f32_u32_e32
-; SI:      v_cvt_f32_u32_e32
-; VI:      v_cvt_f32_i32_e32
-; VI:      v_cvt_f32_i32_e32
-; GCN:     v_cvt_f16_f32_e32
-; GCN:     v_cvt_f16_f32_e32
-; GCN-DAG: v_and_b32_e32
-; GCN-DAG: v_lshlrev_b32_e32
-; GCN-DAG: v_or_b32_e32
-; GCN:     buffer_store_dword
-; GCN:     s_endpgm
-define void @uitofp_v2i16_to_v2f16(
+
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI-DAG: v_lshlrev_b32_e32
+; SI: v_or_b32_e32
+
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f32_i32_sdwa
+; VI-DAG: v_cvt_f32_i32_sdwa
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI:     v_or_b32_e32
+
+; GCN: buffer_store_dword
+; GCN: s_endpgm
+define amdgpu_kernel void @uitofp_v2i16_to_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x i16> addrspace(1)* %a) {
 entry:
@@ -61,16 +66,23 @@ entry:
 
 ; GCN-LABEL: {{^}}uitofp_v2i32_to_v2f16
 ; GCN:     buffer_load_dwordx2
-; GCN:     v_cvt_f32_u32_e32
-; GCN:     v_cvt_f32_u32_e32
-; GCN:     v_cvt_f16_f32_e32
-; GCN:     v_cvt_f16_f32_e32
-; GCN-DAG: v_and_b32_e32
-; GCN-DAG: v_lshlrev_b32_e32
-; GCN-DAG: v_or_b32_e32
+
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f32_u32_e32
+; SI: v_cvt_f16_f32_e32
+; SI: v_cvt_f16_f32_e32
+; SI-DAG: v_lshlrev_b32_e32
+; SI: v_or_b32_e32
+
+; VI-DAG: v_cvt_f32_u32_e32
+; VI-DAG: v_cvt_f32_u32_e32
+; VI-DAG: v_cvt_f16_f32_e32
+; VI-DAG: v_cvt_f16_f32_sdwa
+; VI:     v_or_b32_e32
+
 ; GCN:     buffer_store_dword
 ; GCN:     s_endpgm
-define void @uitofp_v2i32_to_v2f16(
+define amdgpu_kernel void @uitofp_v2i32_to_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x i32> addrspace(1)* %a) {
 entry:
diff --git a/test/CodeGen/AMDGPU/umed3.ll b/test/CodeGen/AMDGPU/umed3.ll
index a2e485d362254f2e43e8008b7636b61e2627e3f5..5a579f3575fd107e69422de64414a3e83427ab96 100644
--- a/test/CodeGen/AMDGPU/umed3.ll
+++ b/test/CodeGen/AMDGPU/umed3.ll
@@ -1,12 +1,13 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SICIVI -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
 
-declare i32 @llvm.r600.read.tidig.x() #0
+declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i32:
 ; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
-define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
   %a = load i32, i32 addrspace(1)* %gep0
@@ -24,8 +25,8 @@ define void @v_test_umed3_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %a
 ; GCN-LABEL: {{^}}v_test_umed3_multi_use_r_i_i_i32:
 ; GCN: v_max_u32
 ; GCN: v_min_u32
-define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
   %a = load i32, i32 addrspace(1)* %gep0
@@ -44,8 +45,8 @@ define void @v_test_umed3_multi_use_r_i_i_i32(i32 addrspace(1)* %out, i32 addrsp
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_constant_order_i32:
 ; GCN: v_max_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
 ; GCN: v_min_u32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
-define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
   %a = load i32, i32 addrspace(1)* %gep0
@@ -63,8 +64,8 @@ define void @v_test_umed3_r_i_i_constant_order_i32(i32 addrspace(1)* %out, i32 a
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_sign_mismatch_i32:
 ; GCN: v_max_i32_e32 v{{[0-9]+}}, 12, v{{[0-9]+}}
 ; GCN: v_min_u32_e32 v{{[0-9]+}}, 17, v{{[0-9]+}}
-define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i32, i32 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i32, i32 addrspace(1)* %out, i32 %tid
   %a = load i32, i32 addrspace(1)* %gep0
@@ -82,8 +83,8 @@ define void @v_test_umed3_r_i_i_sign_mismatch_i32(i32 addrspace(1)* %out, i32 ad
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i64:
 ; GCN: v_cmp_lt_u64
 ; GCN: v_cmp_gt_u64
-define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+define amdgpu_kernel void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i64, i64 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i64, i64 addrspace(1)* %out, i32 %tid
   %a = load i64, i64 addrspace(1)* %gep0
@@ -99,9 +100,10 @@ define void @v_test_umed3_r_i_i_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a
 }
 
 ; GCN-LABEL: {{^}}v_test_umed3_r_i_i_i16:
-; GCN: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
-define void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
-  %tid = call i32 @llvm.r600.read.tidig.x()
+; SICIVI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, 12, 17
+define amdgpu_kernel void @v_test_umed3_r_i_i_i16(i16 addrspace(1)* %out, i16 addrspace(1)* %aptr) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %gep0 = getelementptr i16, i16 addrspace(1)* %aptr, i32 %tid
   %outgep = getelementptr i16, i16 addrspace(1)* %out, i32 %tid
   %a = load i16, i16 addrspace(1)* %gep0
@@ -171,7 +173,7 @@ define internal i8 @umax8(i8 %x, i8 %y) #2 {
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -183,7 +185,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_1:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -195,7 +197,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_2:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -207,7 +209,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_3:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_3(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -219,7 +221,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_4:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_4(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -231,7 +233,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_5:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_5(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -243,7 +245,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_6:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_6(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -255,7 +257,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_7:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_7(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -267,7 +269,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_8:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_8(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -279,7 +281,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_9:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_9(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -291,7 +293,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_10:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_10(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -303,7 +305,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_11:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_11(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -315,7 +317,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_12:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_12(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -327,7 +329,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_13:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_13(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -339,7 +341,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_14:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_14(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -351,7 +353,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_15:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_15(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %y, i32 %x)
   %tmp1 = call i32 @umax(i32 %y, i32 %x)
@@ -366,7 +368,7 @@ bb:
 ; GCN: s_and_b32
 ; GCN: s_and_b32
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 %x, i16 %y, i16 %z) #1 {
 bb:
   %tmp0 = call i16 @umin16(i16 %x, i16 %y)
   %tmp1 = call i16 @umax16(i16 %x, i16 %y)
@@ -381,7 +383,7 @@ bb:
 ; GCN: s_and_b32
 ; GCN: s_and_b32
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i8_pat_0(i8 addrspace(1)* %arg, i8 %x, i8 %y, i8 %z) #1 {
 bb:
   %tmp0 = call i8 @umin8(i8 %x, i8 %y)
   %tmp1 = call i8 @umax8(i8 %x, i8 %y)
@@ -393,7 +395,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_0:
 ; GCN-NOT: v_med3_u32
-define void @s_test_umed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -406,7 +408,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_1:
 ; GCN-NOT: v_med3_u32
-define void @s_test_umed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -419,7 +421,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_2:
 ; GCN-NOT: v_med3_u32
-define void @s_test_umed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -432,7 +434,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_multi_use_result:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_multi_use_result(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -445,7 +447,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src0:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_0_imm_src0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src0(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 1, i32 %y)
   %tmp1 = call i32 @umax(i32 1, i32 %y)
@@ -457,7 +459,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src1:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}}
-define void @s_test_umed3_i32_pat_0_imm_src1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src1(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 2)
   %tmp1 = call i32 @umax(i32 %x, i32 2)
@@ -469,7 +471,7 @@ bb:
 
 ; GCN-LABEL: {{^}}s_test_umed3_i32_pat_0_imm_src2:
 ; GCN: v_med3_u32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, 9
-define void @s_test_umed3_i32_pat_0_imm_src2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
+define amdgpu_kernel void @s_test_umed3_i32_pat_0_imm_src2(i32 addrspace(1)* %arg, i32 %x, i32 %y, i32 %z) #1 {
 bb:
   %tmp0 = call i32 @umin(i32 %x, i32 %y)
   %tmp1 = call i32 @umax(i32 %x, i32 %y)
@@ -479,6 +481,35 @@ bb:
   ret void
 }
 
+; GCN-LABEL: {{^}}v_test_umed3_i16_pat_0:
+; SI: v_med3_u32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; FIXME: VI not matching med3
+; VI: v_min_u16
+; VI: v_max_u16
+; VI: v_min_u16
+; VI: v_max_u16
+
+; GFX9: v_med3_u16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+define amdgpu_kernel void @v_test_umed3_i16_pat_0(i16 addrspace(1)* %arg, i16 addrspace(1)* %out, i16 addrspace(1)* %a.ptr) #1 {
+bb:
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr, i32 %tid
+  %gep1 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 3
+  %gep2 = getelementptr inbounds i16, i16 addrspace(1)* %gep0, i32 8
+  %out.gep = getelementptr inbounds i16, i16 addrspace(1)* %out, i32 %tid
+  %x = load i16, i16 addrspace(1)* %gep0
+  %y = load i16, i16 addrspace(1)* %gep1
+  %z = load i16, i16 addrspace(1)* %gep2
+
+  %tmp0 = call i16 @umin16(i16 %x, i16 %y)
+  %tmp1 = call i16 @umax16(i16 %x, i16 %y)
+  %tmp2 = call i16 @umin16(i16 %tmp1, i16 %z)
+  %tmp3 = call i16 @umax16(i16 %tmp0, i16 %tmp2)
+  store i16 %tmp3, i16 addrspace(1)* %out.gep
+  ret void
+}
+
 attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readnone alwaysinline }
diff --git a/test/CodeGen/AMDGPU/unaligned-load-store.ll b/test/CodeGen/AMDGPU/unaligned-load-store.ll
index 0f76a54975e6a3082592a73bd2b381e373cd72b4..68aacd084bf9486994d905e6a633aa92718f3f0e 100644
--- a/test/CodeGen/AMDGPU/unaligned-load-store.ll
+++ b/test/CodeGen/AMDGPU/unaligned-load-store.ll
@@ -8,7 +8,7 @@
 ; SI: ds_write_b8
 ; SI: ds_write_b8
 ; SI: s_endpgm
-define void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(3)* %r) #0 {
   %v = load i16, i16 addrspace(3)* %p, align 1
   store i16 %v, i16 addrspace(3)* %r, align 1
   ret void
@@ -23,7 +23,7 @@ define void @local_unaligned_load_store_i16(i16 addrspace(3)* %p, i16 addrspace(
 ; UNALIGNED: buffer_load_ushort
 ; UNALIGNED: buffer_store_short
 ; SI: s_endpgm
-define void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace(1)* %r) #0 {
   %v = load i16, i16 addrspace(1)* %p, align 1
   store i16 %v, i16 addrspace(1)* %r, align 1
   ret void
@@ -42,7 +42,7 @@ define void @global_unaligned_load_store_i16(i16 addrspace(1)* %p, i16 addrspace
 ; SI: ds_write_b8
 ; SI: ds_write_b8
 ; SI: s_endpgm
-define void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
   %v = load i32, i32 addrspace(3)* %p, align 1
   store i32 %v, i32 addrspace(3)* %r, align 1
   ret void
@@ -60,7 +60,7 @@ define void @local_unaligned_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(
 
 ; UNALIGNED: buffer_load_dword
 ; UNALIGNED: buffer_store_dword
-define void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
   %v = load i32, i32 addrspace(1)* %p, align 1
   store i32 %v, i32 addrspace(1)* %r, align 1
   ret void
@@ -74,7 +74,7 @@ define void @global_unaligned_load_store_i32(i32 addrspace(1)* %p, i32 addrspace
 
 ; UNALIGNED: buffer_load_dword
 ; UNALIGNED: buffer_store_dword
-define void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)* %r) #0 {
   %v = load i32, i32 addrspace(1)* %p, align 2
   store i32 %v, i32 addrspace(1)* %r, align 2
   ret void
@@ -85,7 +85,7 @@ define void @global_align2_load_store_i32(i32 addrspace(1)* %p, i32 addrspace(1)
 ; GCN: ds_read_u16
 ; GCN: ds_write_b16
 ; GCN: ds_write_b16
-define void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)* %r) #0 {
   %v = load i32, i32 addrspace(3)* %p, align 2
   store i32 %v, i32 addrspace(3)* %r, align 2
   ret void
@@ -132,7 +132,7 @@ define void @local_align2_load_store_i32(i32 addrspace(3)* %p, i32 addrspace(3)*
 ; SI-NOT: v_lshl
 ; SI: ds_write_b8
 ; SI: s_endpgm
-define void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(3)* %r) #0 {
   %v = load i64, i64 addrspace(3)* %p, align 1
   store i64 %v, i64 addrspace(3)* %r, align 1
   ret void
@@ -179,7 +179,7 @@ define void @local_unaligned_load_store_i64(i64 addrspace(3)* %p, i64 addrspace(
 ; SI-NOT: v_lshl
 ; SI: ds_write_b8
 ; SI: s_endpgm
-define void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i32> addrspace(3)* %r) #0 {
   %v = load <2 x i32>, <2 x i32> addrspace(3)* %p, align 1
   store <2 x i32> %v, <2 x i32> addrspace(3)* %r, align 1
   ret void
@@ -209,7 +209,7 @@ define void @local_unaligned_load_store_v2i32(<2 x i32> addrspace(3)* %p, <2 x i
 
 ; UNALIGNED: buffer_load_dwordx2
 ; UNALIGNED: buffer_store_dwordx2
-define void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
   %v = load i64, i64 addrspace(1)* %p, align 2
   store i64 %v, i64 addrspace(1)* %r, align 2
   ret void
@@ -239,7 +239,7 @@ define void @global_align2_load_store_i64(i64 addrspace(1)* %p, i64 addrspace(1)
 
 ; UNALIGNED: buffer_load_dwordx2
 ; UNALIGNED: buffer_store_dwordx2
-define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace(1)* %r) #0 {
   %v = load i64, i64 addrspace(1)* %p, align 1
   store i64 %v, i64 addrspace(1)* %r, align 1
   ret void
@@ -286,7 +286,7 @@ define void @unaligned_load_store_i64_global(i64 addrspace(1)* %p, i64 addrspace
 ; GCN: ds_write_b8
 ; GCN: ds_write_b8
 ; GCN: s_endpgm
-define void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 {
+define amdgpu_kernel void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i32> addrspace(3)* %r) #0 {
   %v = load <4 x i32>, <4 x i32> addrspace(3)* %p, align 1
   store <4 x i32> %v, <4 x i32> addrspace(3)* %r, align 1
   ret void
@@ -329,7 +329,7 @@ define void @local_unaligned_load_store_v4i32(<4 x i32> addrspace(3)* %p, <4 x i
 
 ; UNALIGNED: buffer_load_dwordx4
 ; UNALIGNED: buffer_store_dwordx4
-define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 {
+define amdgpu_kernel void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x i32> addrspace(1)* %r) #0 {
   %v = load <4 x i32>, <4 x i32> addrspace(1)* %p, align 1
   store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 1
   ret void
@@ -337,7 +337,7 @@ define void @global_unaligned_load_store_v4i32(<4 x i32> addrspace(1)* %p, <4 x
 
 ; FUNC-LABEL: {{^}}local_load_i64_align_4:
 ; GCN: ds_read2_b32
-define void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
   %val = load i64, i64 addrspace(3)* %in, align 4
   store i64 %val, i64 addrspace(1)* %out, align 8
   ret void
@@ -345,7 +345,7 @@ define void @local_load_i64_align_4(i64 addrspace(1)* nocapture %out, i64 addrsp
 
 ; FUNC-LABEL: {{^}}local_load_i64_align_4_with_offset
 ; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset0:8 offset1:9
-define void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
   %ptr = getelementptr i64, i64 addrspace(3)* %in, i32 4
   %val = load i64, i64 addrspace(3)* %ptr, align 4
   store i64 %val, i64 addrspace(1)* %out, align 8
@@ -356,7 +356,7 @@ define void @local_load_i64_align_4_with_offset(i64 addrspace(1)* nocapture %out
 ; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits
 ; GCN: ds_read2_b32 v[{{[0-9]+}}:{{[0-9]+}}], v{{[0-9]}} offset1:1
 ; GCN: s_endpgm
-define void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
   %ptr = bitcast i64 addrspace(3)* %in to i32 addrspace(3)*
   %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
   %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
@@ -375,7 +375,7 @@ define void @local_load_i64_align_4_with_split_offset(i64 addrspace(1)* nocaptur
 ; GCN: ds_read_u8
 ; GCN: ds_read_u8
 ; GCN: store_dwordx2
-define void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrspace(3)* %in) #0 {
   %val = load i64, i64 addrspace(3)* %in, align 1
   store i64 %val, i64 addrspace(1)* %out, align 8
   ret void
@@ -383,7 +383,7 @@ define void @local_load_i64_align_1(i64 addrspace(1)* nocapture %out, i64 addrsp
 
 ; FUNC-LABEL: {{^}}local_store_i64_align_4:
 ; GCN: ds_write2_b32
-define void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
+define amdgpu_kernel void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
   store i64 %val, i64 addrspace(3)* %out, align 4
   ret void
 }
@@ -391,7 +391,7 @@ define void @local_store_i64_align_4(i64 addrspace(3)* %out, i64 %val) #0 {
 ; FUNC-LABEL: {{^}}local_store_i64_align_4_with_offset
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset0:8 offset1:9
 ; GCN: s_endpgm
-define void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
   %ptr = getelementptr i64, i64 addrspace(3)* %out, i32 4
   store i64 0, i64 addrspace(3)* %ptr, align 4
   ret void
@@ -401,7 +401,7 @@ define void @local_store_i64_align_4_with_offset(i64 addrspace(3)* %out) #0 {
 ; The tests for the case where the lo offset is 8-bits, but the hi offset is 9-bits
 ; GCN: ds_write2_b32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} offset1:1
 ; GCN: s_endpgm
-define void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #0 {
   %ptr = bitcast i64 addrspace(3)* %out to i32 addrspace(3)*
   %ptr255 = getelementptr i32, i32 addrspace(3)* %ptr, i32 255
   %ptri64 = bitcast i32 addrspace(3)* %ptr255 to i64 addrspace(3)*
@@ -418,7 +418,7 @@ define void @local_store_i64_align_4_with_split_offset(i64 addrspace(3)* %out) #
 ; UNALIGNED: s_load_dword
 
 ; SI: buffer_store_dword
-define void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
   %v = load i32, i32 addrspace(2)* %p, align 1
   store i32 %v, i32 addrspace(1)* %r, align 4
   ret void
@@ -430,7 +430,7 @@ define void @constant_unaligned_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)*
 
 ; UNALIGNED: s_load_dword
 ; UNALIGNED: buffer_store_dword
-define void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
   %v = load i32, i32 addrspace(2)* %p, align 2
   store i32 %v, i32 addrspace(1)* %r, align 4
   ret void
@@ -444,7 +444,7 @@ define void @constant_align2_load_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r
 
 ; UNALIGNED: s_load_dwordx2
 ; UNALIGNED: buffer_store_dwordx2
-define void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
   %v = load i64, i64 addrspace(2)* %p, align 2
   store i64 %v, i64 addrspace(1)* %r, align 4
   ret void
@@ -453,7 +453,7 @@ define void @constant_align2_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r
 ; SI-LABEL: {{^}}constant_align4_load_i64:
 ; SI: s_load_dwordx2
 ; SI: buffer_store_dwordx2
-define void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r) #0 {
   %v = load i64, i64 addrspace(2)* %p, align 4
   store i64 %v, i64 addrspace(1)* %r, align 4
   ret void
@@ -462,7 +462,7 @@ define void @constant_align4_load_i64(i64 addrspace(2)* %p, i64 addrspace(1)* %r
 ; SI-LABEL: {{^}}constant_align4_load_v4i32:
 ; SI: s_load_dwordx4
 ; SI: buffer_store_dwordx4
-define void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
   %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 4
   store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
   ret void
@@ -482,7 +482,7 @@ define void @constant_align4_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> ad
 ; UNALIGNED: buffer_load_dwordx2
 
 ; SI: buffer_store_dwordx2
-define void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32> addrspace(1)* %r) #0 {
   %v = load <2 x i32>, <2 x i32> addrspace(2)* %p, align 1
   store <2 x i32> %v, <2 x i32> addrspace(1)* %r, align 4
   ret void
@@ -512,7 +512,7 @@ define void @constant_unaligned_load_v2i32(<2 x i32> addrspace(2)* %p, <2 x i32>
 ; UNALIGNED: buffer_load_dwordx4
 
 ; SI: buffer_store_dwordx4
-define void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32> addrspace(1)* %r) #0 {
   %v = load <4 x i32>, <4 x i32> addrspace(2)* %p, align 1
   store <4 x i32> %v, <4 x i32> addrspace(1)* %r, align 4
   ret void
@@ -521,7 +521,7 @@ define void @constant_unaligned_load_v4i32(<4 x i32> addrspace(2)* %p, <4 x i32>
 ; SI-LABEL: {{^}}constant_align4_load_i8:
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_byte
-define void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
   %v = load i8, i8 addrspace(2)* %p, align 4
   store i8 %v, i8 addrspace(1)* %r, align 4
   ret void
@@ -530,7 +530,7 @@ define void @constant_align4_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #
 ; SI-LABEL: {{^}}constant_align2_load_i8:
 ; SI: buffer_load_ubyte
 ; SI: buffer_store_byte
-define void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #0 {
   %v = load i8, i8 addrspace(2)* %p, align 2
   store i8 %v, i8 addrspace(1)* %r, align 2
   ret void
@@ -541,7 +541,7 @@ define void @constant_align2_load_i8(i8 addrspace(2)* %p, i8 addrspace(1)* %r) #
 ; SI-DAG: v_mov_b32_e32 v[[VLO:[0-9]+]], s[[LO]]
 ; SI-DAG: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[HI]]
 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
+define amdgpu_kernel void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspace(1)* %r) #0 {
   %gep0 = getelementptr i32, i32 addrspace(2)* %p, i64 1
   %v0 = load i32, i32 addrspace(2)* %p, align 4
   %v1 = load i32, i32 addrspace(2)* %gep0, align 4
@@ -571,7 +571,7 @@ define void @constant_align4_merge_load_2_i32(i32 addrspace(2)* %p, i32 addrspac
 ; SI: ds_read_u8
 
 ; SI: ScratchSize: 0{{$}}
-define void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 {
+define amdgpu_kernel void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> addrspace(3)* %in) #0 {
   %ld = load <16 x i8>, <16 x i8> addrspace(3)* %in, align 1
   store <16 x i8> %ld, <16 x i8> addrspace(1)* %out
   ret void
@@ -596,7 +596,7 @@ define void @local_load_align1_v16i8(<16 x i8> addrspace(1)* %out, <16 x i8> add
 ; SI: ds_write_b8
 
 ; SI: ScratchSize: 0{{$}}
-define void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 {
+define amdgpu_kernel void @local_store_align1_v16i8(<16 x i8> addrspace(3)* %out) #0 {
   store <16 x i8> zeroinitializer, <16 x i8> addrspace(3)* %out, align 1
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
index 4902e9a3cafb2409d104c468d230c41853e92726..3e80fcf85b529b1dc88117468c3408e1a02a3cac 100644
--- a/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
+++ b/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll
@@ -4,7 +4,7 @@
 
 
 ; CHECK-LABEL: {{^}}func:
-define void @func() #0 {
+define amdgpu_kernel void @func() #0 {
 B0:
   br i1 undef, label %B1, label %B2
 
@@ -35,7 +35,8 @@ bb:
   %tmp1 = load volatile i32, i32 addrspace(1)* undef, align 4
   %tmp2 = insertelement <4 x i32> undef, i32 %tmp1, i32 0
   %tmp3 = insertelement <4 x i32> %tmp2, i32 %tmp1, i32 1
-  %tmp4 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tmp3, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp3.cast = bitcast <4 x i32> %tmp3 to <4 x float>
+  %tmp4 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tmp3.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp5 = extractelement <4 x float> %tmp4, i32 0
   %tmp6 = fmul float %tmp5, undef
   %tmp7 = fadd float %tmp6, %tmp6
@@ -71,7 +72,7 @@ bb11:                                             ; preds = %bb9
 ; CHECK: v_mov_b32_e32 v[[OUTPUT_LO]], v6
 
 ; CHECK: buffer_store_dwordx4 v{{\[}}[[OUTPUT_LO]]:[[OUTPUT_HI]]{{\]}}
-define void @partially_undef_copy() #0 {
+define amdgpu_kernel void @partially_undef_copy() #0 {
   %tmp0 = call i32 asm sideeffect "v_mov_b32_e32 v5, 5", "={VGPR5}"()
   %tmp1 = call i32 asm sideeffect "v_mov_b32_e32 v6, 6", "={VGPR6}"()
 
@@ -83,8 +84,7 @@ define void @partially_undef_copy() #0 {
   ret void
 }
 
-declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare float @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #1
 
 attributes #0 = { nounwind }
-attributes #1 = { nounwind readnone }
+attributes #1 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll b/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
index d96ee6d21ce819b945214ca3565be46fe55fff2a..60ab7631a1011881ffd00adc14afe369becaf741 100644
--- a/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
+++ b/test/CodeGen/AMDGPU/unhandled-loop-condition-assertion.ll
@@ -5,7 +5,7 @@
 ; SI hits an assertion at -O0, evergreen hits a not implemented unreachable.
 
 ; COMMON-LABEL: {{^}}branch_true:
-define void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+define amdgpu_kernel void @branch_true(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 true, label %for.end, label %for.body.lr.ph
 
@@ -42,7 +42,7 @@ for.end:                                          ; preds = %for.body, %entry
 ; SI: s_cbranch_vccnz
 ; SI: s_cbranch_scc1
 ; SI: s_endpgm
-define void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+define amdgpu_kernel void @branch_false(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 false, label %for.end, label %for.body.lr.ph
 
@@ -79,7 +79,7 @@ for.end:                                          ; preds = %for.body, %entry
 ; SI: s_cbranch_scc1
 ; SI: s_cbranch_scc1
 ; SI: s_endpgm
-define void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
+define amdgpu_kernel void @branch_undef(i8 addrspace(1)* nocapture %main, i32 %main_stride) #0 {
 entry:
   br i1 undef, label %for.end, label %for.body.lr.ph
 
diff --git a/test/CodeGen/AMDGPU/uniform-cfg.ll b/test/CodeGen/AMDGPU/uniform-cfg.ll
index d3e431d1e35e86e3e0ecf8fa8756353052b8f84d..a9d45d71fa2ea1121eb90dcbea4ca56c226bf666 100644
--- a/test/CodeGen/AMDGPU/uniform-cfg.ll
+++ b/test/CodeGen/AMDGPU/uniform-cfg.ll
@@ -12,7 +12,7 @@
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc(i32 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -40,7 +40,7 @@ done:
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_vcc(float %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = fcmp oeq float %cond, 0.0
   br i1 %cmp0, label %if, label %else
@@ -68,7 +68,7 @@ done:
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_swap_br_targets_scc(i32 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %else, label %if
@@ -96,7 +96,7 @@ done:
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_swap_br_targets_vcc(float %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = fcmp oeq float %cond, 0.0
   br i1 %cmp0, label %else, label %if
@@ -123,7 +123,7 @@ done:
 ; GCN: buffer_store_dword
 ; GCN: [[ENDIF_LABEL]]:
 ; GCN: s_endpgm
-define void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) {
+define amdgpu_kernel void @uniform_if_move_valu(i32 addrspace(1)* %out, float %a) {
 entry:
   %a.0 = fadd float %a, 10.0
   %cond = bitcast float %a.0 to i32
@@ -148,7 +148,7 @@ endif:
 ; GCN: buffer_store_dword
 ; GCN: [[ENDIF_LABEL]]:
 ; GCN: s_endpgm
-define void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) {
+define amdgpu_kernel void @uniform_if_move_valu_commute(i32 addrspace(1)* %out, float %a) {
 entry:
   %a.0 = fadd float %a, 10.0
   %cond = bitcast float %a.0 to i32
@@ -166,7 +166,7 @@ endif:
 
 ; GCN-LABEL: {{^}}uniform_if_else_ret:
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
-; GCN-NEXT: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]
+; GCN: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]
 
 ; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
 ; GCN: buffer_store_dword [[TWO]]
@@ -176,7 +176,7 @@ endif:
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
 ; GCN: s_endpgm
-define void @uniform_if_else_ret(i32 addrspace(1)* nocapture %out, i32 %a) {
+define amdgpu_kernel void @uniform_if_else_ret(i32 addrspace(1)* nocapture %out, i32 %a) {
 entry:
   %cmp = icmp eq i32 %a, 0
   br i1 %cmp, label %if.then, label %if.else
@@ -209,7 +209,7 @@ if.end:                                           ; preds = %if.else, %if.then
 ; GCN: v_mov_b32_e32 [[THREE:v[0-9]+]], 3
 ; GCN: buffer_store_dword [[THREE]]
 ; GCN: s_endpgm
-define void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) {
+define amdgpu_kernel void @uniform_if_else(i32 addrspace(1)* nocapture %out0, i32 addrspace(1)* nocapture %out1, i32 %a) {
 entry:
   %cmp = icmp eq i32 %a, 0
   br i1 %cmp, label %if.then, label %if.else
@@ -233,7 +233,7 @@ if.end:                                           ; preds = %if.else, %if.then
 ; GCN: buffer_store_dword
 ; GCN: [[LABEL]]:
 ; GCN: s_endpgm
-define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
 main_body:
   %0 = icmp sgt i32 %cond, 0
   %1 = sext i1 %0 to i32
@@ -252,11 +252,13 @@ ENDIF:                                            ; preds = %IF, %main_body
 ; GCN: s_cmp_lt_i32 [[COND]], 1
 ; GCN: s_cbranch_scc1 [[EXIT:[A-Za-z0-9_]+]]
 ; GCN: v_cmp_gt_i32_e64 vcc, [[COND]], 0{{$}}
-; GCN: s_cbranch_vccnz [[EXIT]]
-; GCN: buffer_store
+; GCN: s_cbranch_vccz [[BODY:[A-Za-z0-9_]+]]
 ; GCN: {{^}}[[EXIT]]:
 ; GCN: s_endpgm
-define void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) {
+; GCN: {{^}}[[BODY]]:
+; GCN: buffer_store
+; GCN: s_endpgm
+define amdgpu_kernel void @icmp_users_different_blocks(i32 %cond0, i32 %cond1, i32 addrspace(1)* %out) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %cmp0 = icmp sgt i32 %cond0, 0
@@ -282,7 +284,7 @@ bb9:                                              ; preds = %bb8, %bb4
 ; SI: s_cmp_lg_u32 [[I]], 0
 ; SI: s_cbranch_scc1 [[LOOP_LABEL]]
 ; SI: s_endpgm
-define void @uniform_loop(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @uniform_loop(i32 addrspace(1)* %out, i32 %a) {
 entry:
   br label %loop
 
@@ -302,12 +304,13 @@ done:
 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
 ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
 ; GCN: s_xor_b64  [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
-; GCN: s_cbranch_execz [[ENDIF_LABEL:[0-9_A-Za-z]+]]
 ; GCN: s_cmp_lg_u32 {{s[0-9]+}}, 0
-; GCN: s_cbranch_scc1 [[ENDIF_LABEL]]
+; GCN: s_cbranch_scc0 [[IF_UNIFORM_LABEL:[A-Z0-9_a-z]+]]
+; GCN: s_endpgm
+; GCN: {{^}}[[IF_UNIFORM_LABEL]]:
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
-define void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @uniform_inside_divergent(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %d_cmp = icmp ult i32 %tid, 16
@@ -328,15 +331,14 @@ endif:
 
 ; GCN-LABEL: {{^}}divergent_inside_uniform:
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
-; GCN: s_cbranch_scc1 [[ENDIF_LABEL:[0-9_A-Za-z]+]]
+; GCN: s_cbranch_scc0 [[IF_LABEL:[0-9_A-Za-z]+]]
+; GCN: [[IF_LABEL]]:
 ; GCN: v_cmp_gt_u32_e32 vcc, 16, v{{[0-9]+}}
 ; GCN: s_and_saveexec_b64 [[MASK:s\[[0-9]+:[0-9]+\]]], vcc
 ; GCN: s_xor_b64  [[MASK1:s\[[0-9]+:[0-9]+\]]], exec, [[MASK]]
 ; GCN: v_mov_b32_e32 [[ONE:v[0-9]+]], 1
 ; GCN: buffer_store_dword [[ONE]]
-; GCN: [[ENDIF_LABEL]]:
-; GCN: s_endpgm
-define void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @divergent_inside_uniform(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %u_cmp = icmp eq i32 %cond, 0
   br i1 %u_cmp, label %if, label %endif
@@ -363,12 +365,12 @@ endif:
 ; GCN: buffer_store_dword [[ONE]]
 ; GCN: s_or_b64 exec, exec, [[MASK]]
 ; GCN: s_cmp_lg_u32 s{{[0-9]+}}, 0
-; GCN: s_cbranch_scc1 [[EXIT:[A-Z0-9_]+]]
+; GCN: s_cbranch_scc0 [[IF_UNIFORM:[A-Z0-9_]+]]
+; GCN: s_endpgm
+; GCN: [[IF_UNIFORM]]:
 ; GCN: v_mov_b32_e32 [[TWO:v[0-9]+]], 2
 ; GCN: buffer_store_dword [[TWO]]
-; GCN: [[EXIT]]:
-; GCN: s_endpgm
-define void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @divergent_if_uniform_if(i32 addrspace(1)* %out, i32 %cond) {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #0
   %d_cmp = icmp eq i32 %tid, 0
@@ -408,7 +410,7 @@ exit:
 
 ; GCN: BB[[FNNUM]]_3:
 ; GCN: s_endpgm
-define void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @cse_uniform_condition_different_blocks(i32 %cond, i32 addrspace(1)* %out) {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tmp1 = icmp sgt i32 %cond, 0
@@ -443,7 +445,7 @@ bb9:                                              ; preds = %bb8, %bb4
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_scc_i64_eq(i64 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc_i64_eq(i64 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp eq i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -475,7 +477,7 @@ done:
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL:v[0-9]+]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_scc_i64_ne(i64 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc_i64_ne(i64 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp ne i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -503,7 +505,7 @@ done:
 ; GCN: [[IF_LABEL]]:
 ; GCN: v_mov_b32_e32 [[V_VAL]], [[S_VAL]]
 ; GCN: buffer_store_dword [[V_VAL]]
-define void @uniform_if_scc_i64_sgt(i64 %cond, i32 addrspace(1)* %out) {
+define amdgpu_kernel void @uniform_if_scc_i64_sgt(i64 %cond, i32 addrspace(1)* %out) {
 entry:
   %cmp0 = icmp sgt i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -522,7 +524,7 @@ done:
 
 ; GCN-LABEL: {{^}}move_to_valu_i64_eq:
 ; GCN: v_cmp_eq_u64_e32
-define void @move_to_valu_i64_eq(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @move_to_valu_i64_eq(i32 addrspace(1)* %out) {
   %cond = load volatile i64, i64 addrspace(3)* undef
   %cmp0 = icmp eq i64 %cond, 0
   br i1 %cmp0, label %if, label %else
@@ -541,7 +543,7 @@ done:
 
 ; GCN-LABEL: {{^}}move_to_valu_i64_ne:
 ; GCN: v_cmp_ne_u64_e32
-define void @move_to_valu_i64_ne(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @move_to_valu_i64_ne(i32 addrspace(1)* %out) {
   %cond = load volatile i64, i64 addrspace(3)* undef
   %cmp0 = icmp ne i64 %cond, 0
   br i1 %cmp0, label %if, label %else
diff --git a/test/CodeGen/AMDGPU/uniform-crash.ll b/test/CodeGen/AMDGPU/uniform-crash.ll
index cfbb2af58677d2c740f4c98d0c8a4864abe49bab..028199ef9de71e362b7553c6ae11a9468e5382b3 100644
--- a/test/CodeGen/AMDGPU/uniform-crash.ll
+++ b/test/CodeGen/AMDGPU/uniform-crash.ll
@@ -6,7 +6,7 @@
 ; GCN: s_cbranch_scc1 [[LABEL:BB[0-9_A-Z]+]]
 ; GCN: [[LABEL]]:
 ; GCN-NEXT: s_endpgm
-define void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
+define amdgpu_kernel void @icmp_2_users(i32 addrspace(1)* %out, i32 %cond) {
 main_body:
   %0 = icmp sgt i32 %cond, 0
   %1 = sext i1 %0 to i32
@@ -25,7 +25,7 @@ ENDIF:                                            ; preds = %IF, %main_body
 ; GCN: {{^}}[[LOOP:[A-Z0-9_]+]]:
 ; GCN: s_cbranch_scc1 [[LOOP]]
 ; GCN: {{^}}[[BB0]]:
-define void @fix_sgpr_live_ranges_crash(i32 %arg, i32 %arg1)  {
+define amdgpu_kernel void @fix_sgpr_live_ranges_crash(i32 %arg, i32 %arg1)  {
 bb:
   %cnd = trunc i32 %arg to i1
   br i1 %cnd, label %bb2, label %bb5
diff --git a/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll b/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
index 2c3a098188609d12eb729735f6312919bc322c7f..e0067f9f45acc5988de1b067085b2f41335e388c 100644
--- a/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
+++ b/test/CodeGen/AMDGPU/uniform-loop-inside-nonuniform.ll
@@ -38,7 +38,7 @@ out:
 ; CHECK-NEXT: s_xor_b64
 ; CHECK-NEXT: ; mask branch
 ; CHECK-NEXT: s_cbranch_execz
-define void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @test2(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 main_body:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %cc = icmp eq i32 %tid, 0
diff --git a/test/CodeGen/AMDGPU/unigine-liveness-crash.ll b/test/CodeGen/AMDGPU/unigine-liveness-crash.ll
index 732790ceb3356904223cd15c0de031ef1576038e..853131baed5e750e026abcfe4a7798512eb4dc68 100644
--- a/test/CodeGen/AMDGPU/unigine-liveness-crash.ll
+++ b/test/CodeGen/AMDGPU/unigine-liveness-crash.ll
@@ -1,5 +1,4 @@
-; RUN: llc -march=amdgcn < %s | FileCheck %s
-; REQUIRES: asserts
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck %s
 ;
 ; This test used to crash with the following assertion:
 ; llc: include/llvm/ADT/IntervalMap.h:632: unsigned int llvm::IntervalMapImpl::LeafNode<llvm::SlotIndex, llvm::LiveInterval *, 8, llvm::IntervalMapInfo<llvm::SlotIndex> >::insertFrom(unsigned int &, unsigned int, KeyT, KeyT, ValT) [KeyT = llvm::SlotIndex, ValT = llvm::LiveInterval *, N = 8, Traits = llvm::IntervalMapInfo<llvm::SlotIndex>]: Assertion `(i == Size || Traits::stopLess(b, start(i))) && "Overlapping insert"' failed.
@@ -10,31 +9,33 @@
 ;
 ; Check for a valid output.
 ; CHECK: image_sample_c
-
-target triple = "amdgcn--"
-
-@ddxy_lds = external addrspace(3) global [64 x i32]
-
 define amdgpu_ps <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> @main([17 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg, [16 x <16 x i8>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg1, [32 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg2, [16 x <8 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg3, [16 x <4 x i32>] addrspace(2)* byval dereferenceable(18446744073709551615) %arg4, float inreg %arg5, i32 inreg %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, <3 x i32> %arg10, <2 x i32> %arg11, <2 x i32> %arg12, <2 x i32> %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18, i32 %arg19, i32 %arg20, float %arg21, i32 %arg22) #0 {
 main_body:
-  %tmp = call float @llvm.SI.fs.interp(i32 3, i32 4, i32 %arg6, <2 x i32> %arg8)
-  %tmp23 = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %i.i = extractelement <2 x i32> %arg8, i32 0
+  %j.i = extractelement <2 x i32> %arg8, i32 1
+  %i.f.i = bitcast i32 %i.i to float
+  %j.f.i = bitcast i32 %j.i to float
+  %p1.i = call float @llvm.amdgcn.interp.p1(float %i.f.i, i32 3, i32 4, i32 %arg6) #2
+  %p2.i = call float @llvm.amdgcn.interp.p2(float %p1.i, float %j.f.i, i32 3, i32 4, i32 %arg6) #2
+  %tmp23 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
+
   %tmp24 = extractelement <4 x float> %tmp23, i32 3
   %tmp25 = fmul float %tmp24, undef
-  %tmp26 = fmul float undef, %tmp
+  %tmp26 = fmul float undef, %p2.i
   %tmp27 = fadd float %tmp26, undef
   %tmp28 = bitcast float %tmp27 to i32
   %tmp29 = insertelement <4 x i32> undef, i32 %tmp28, i32 0
   %tmp30 = insertelement <4 x i32> %tmp29, i32 0, i32 1
   %tmp31 = insertelement <4 x i32> %tmp30, i32 undef, i32 2
-  %tmp32 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp31, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp31.cast = bitcast <4 x i32> %tmp31 to <4 x float>
+  %tmp32 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp31.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp33 = extractelement <4 x float> %tmp32, i32 0
   %tmp34 = fadd float undef, %tmp33
   %tmp35 = fadd float %tmp34, undef
   %tmp36 = fadd float %tmp35, undef
   %tmp37 = fadd float %tmp36, undef
   %tmp38 = fadd float %tmp37, undef
-  %tmp39 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp39 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp40 = extractelement <4 x float> %tmp39, i32 0
   %tmp41 = extractelement <4 x float> %tmp39, i32 1
   %tmp42 = extractelement <4 x float> %tmp39, i32 2
@@ -51,7 +52,8 @@ main_body:
   %tmp53 = insertelement <4 x i32> undef, i32 %tmp50, i32 0
   %tmp54 = insertelement <4 x i32> %tmp53, i32 %tmp51, i32 1
   %tmp55 = insertelement <4 x i32> %tmp54, i32 %tmp52, i32 2
-  %tmp56 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp55, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp55.cast = bitcast <4 x i32> %tmp55 to <4 x float>
+  %tmp56 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp55.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp57 = extractelement <4 x float> %tmp56, i32 0
   %tmp58 = fadd float %tmp38, %tmp57
   %tmp59 = fadd float undef, %tmp46
@@ -60,7 +62,8 @@ main_body:
   %tmp62 = bitcast float %tmp60 to i32
   %tmp63 = insertelement <4 x i32> undef, i32 %tmp61, i32 1
   %tmp64 = insertelement <4 x i32> %tmp63, i32 %tmp62, i32 2
-  %tmp65 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp64, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp64.cast = bitcast <4 x i32> %tmp64 to <4 x float>
+  %tmp65 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp64.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp66 = extractelement <4 x float> %tmp65, i32 0
   %tmp67 = fadd float %tmp58, %tmp66
   %tmp68 = fmul float %tmp67, 1.250000e-01
@@ -76,8 +79,9 @@ IF26:                                             ; preds = %main_body
 ENDIF25:                                          ; preds = %IF29, %main_body
   %.4 = phi float [ %tmp84, %IF29 ], [ %tmp68, %main_body ]
   %tmp73 = fadd float %.4, undef
-  %tmp74 = call float @llvm.AMDGPU.clamp.(float %tmp73, float 0.000000e+00, float 1.000000e+00)
-  %tmp75 = fmul float undef, %tmp74
+  %max.0.i = call float @llvm.maxnum.f32(float %tmp73, float 0.000000e+00)
+  %clamp.i = call float @llvm.minnum.f32(float %max.0.i, float 1.000000e+00)
+  %tmp75 = fmul float undef, %clamp.i
   %tmp76 = fmul float %tmp75, undef
   %tmp77 = fadd float %tmp76, undef
   %tmp78 = insertvalue <{ i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, float, float, float, float, float, float, float, float, float, float, float, float, float }> undef, float %tmp77, 11
@@ -99,17 +103,22 @@ IF29:                                             ; preds = %LOOP
 ENDIF28:                                          ; preds = %LOOP
   %tmp85 = insertelement <4 x i32> %tmp72, i32 undef, i32 1
   %tmp86 = insertelement <4 x i32> %tmp85, i32 undef, i32 2
-  %tmp87 = call <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32> %tmp86, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tmp86.cast = bitcast <4 x i32> %tmp86 to <4 x float>
+  %tmp87 = call <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float> %tmp86.cast, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false)
   %tmp88 = extractelement <4 x float> %tmp87, i32 0
   %tmp89 = fadd float undef, %tmp88
   br label %LOOP
 }
 
-declare float @llvm.AMDGPU.clamp.(float, float, float) #1
-declare float @llvm.SI.fs.interp(i32, i32, i32, <2 x i32>) #1
-declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
-declare <4 x float> @llvm.SI.image.sample.c.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #1
+declare float @llvm.minnum.f32(float, float) #1
+declare float @llvm.maxnum.f32(float, float) #1
+declare float @llvm.amdgcn.interp.p1(float, i32, i32, i32) #1
+declare float @llvm.amdgcn.interp.p2(float, float, i32, i32, i32) #1
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
+declare <4 x float> @llvm.amdgcn.image.sample.c.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #2
 
-attributes #0 = { "InitialPSInputAddr"="36983" "target-cpu"="tonga" }
+attributes #0 = { nounwind "InitialPSInputAddr"="36983" "target-cpu"="tonga" }
 attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
+attributes #3 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/unknown-processor.ll b/test/CodeGen/AMDGPU/unknown-processor.ll
index 941f4c601e349fa797dfa692ac9a7cb4ec0419a9..25a700a943d2aa0b4a0bab9bc459ac483e2ff924 100644
--- a/test/CodeGen/AMDGPU/unknown-processor.ll
+++ b/test/CodeGen/AMDGPU/unknown-processor.ll
@@ -13,7 +13,7 @@
 ; GCN: ScratchSize: 8{{$}}
 
 ; R600: MOV
-define void @foo() {
+define amdgpu_kernel void @foo() {
   %alloca = alloca i32, align 4
   store volatile i32 0, i32* %alloca
   ret void
diff --git a/test/CodeGen/AMDGPU/unroll.ll b/test/CodeGen/AMDGPU/unroll.ll
index 411a15a4b839c7b3b0bd90be207231c140d029d9..2ce4de90a02dc57abdcb4662d4886eda7137180b 100644
--- a/test/CodeGen/AMDGPU/unroll.ll
+++ b/test/CodeGen/AMDGPU/unroll.ll
@@ -6,10 +6,10 @@
 ; private memory.  We want to make sure these kinds of loops are always
 ; unrolled, because private memory is slow.
 
-; CHECK-LABEL: @test
+; CHECK-LABEL: @private_memory
 ; CHECK-NOT: alloca
 ; CHECK: store i32 5, i32 addrspace(1)* %out
-define void @test(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @private_memory(i32 addrspace(1)* %out) {
 entry:
   %0 = alloca [32 x i32]
   br label %loop.header
@@ -34,3 +34,67 @@ exit:
   store i32 %3, i32 addrspace(1)* %out
   ret void
 }
+
+; Check that loop is unrolled for local memory references
+
+; CHECK-LABEL: @local_memory
+; CHECK: getelementptr i32, i32 addrspace(1)* %out, i32 128
+; CHECK-NEXT: store
+; CHECK-NEXT: ret
+define amdgpu_kernel void @local_memory(i32 addrspace(1)* %out, i32 addrspace(3)* %lds) {
+entry:
+  br label %loop.header
+
+loop.header:
+  %counter = phi i32 [0, %entry], [%inc, %loop.inc]
+  br label %loop.body
+
+loop.body:
+  %ptr_lds = getelementptr i32, i32 addrspace(3)* %lds, i32 %counter
+  %val = load i32, i32 addrspace(3)* %ptr_lds
+  %ptr_out = getelementptr i32, i32 addrspace(1)* %out, i32 %counter
+  store i32 %val, i32 addrspace(1)* %ptr_out
+  br label %loop.inc
+
+loop.inc:
+  %inc = add i32 %counter, 1
+  %cond = icmp sge i32 %counter, 128
+  br i1 %cond, label  %exit, label %loop.header
+
+exit:
+  ret void
+}
+
+; Check that a loop with if inside completely unrolled to eliminate phi and if
+
+; CHECK-LABEL: @unroll_for_if
+; CHECK: entry:
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: store
+; CHECK-NEXT: getelementptr
+; CHECK-NEXT: store
+; CHECK-NOT: br
+define amdgpu_kernel void @unroll_for_if(i32* %a) {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.inc
+  %i1 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %and = and i32 %i1, 1
+  %tobool = icmp eq i32 %and, 0
+  br i1 %tobool, label %for.inc, label %if.then
+
+if.then:                                          ; preds = %for.body
+  %0 = sext i32 %i1 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %0
+  store i32 0, i32* %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw nsw i32 %i1, 1
+  %cmp = icmp ult i32 %inc, 48
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.cond
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/unsupported-cc.ll b/test/CodeGen/AMDGPU/unsupported-cc.ll
index d120111a71fb3177b0b01ab254c96a181ff0227d..68e91e8c9c6b0229b779d42c8bf5311e0d72ffef 100644
--- a/test/CodeGen/AMDGPU/unsupported-cc.ll
+++ b/test/CodeGen/AMDGPU/unsupported-cc.ll
@@ -6,7 +6,7 @@
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 5(7.006492e-45)
-define void @slt(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @slt(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp slt i32 %in, 5
   %1 = select i1 %0, i32 -1, i32 0
@@ -18,7 +18,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 5(7.006492e-45)
-define void @ult_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @ult_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp ult i32 %in, 5
   %1 = select i1 %0, i32 -1, i32 0
@@ -31,7 +31,7 @@ entry:
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 ; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
 ; CHECK-NEXT: LSHR *
-define void @ult_float(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ult_float(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ult float %in, 5.0
   %1 = select i1 %0, float 1.0, float 0.0
@@ -43,7 +43,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}}
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @ult_float_native(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ult_float_native(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ult float %in, 5.0
   %1 = select i1 %0, float 0.0, float 1.0
@@ -55,7 +55,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @olt(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @olt(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp olt float %in, 5.0
   %1 = select i1 %0, float 1.0, float 0.0
@@ -67,7 +67,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_INT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 6(8.407791e-45)
-define void @sle(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @sle(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp sle i32 %in, 5
   %1 = select i1 %0, i32 -1, i32 0
@@ -79,7 +79,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT_UINT {{\** *}}T{{[0-9]+\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT: 6(8.407791e-45)
-define void @ule_i32(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @ule_i32(i32 addrspace(1)* %out, i32 %in) {
 entry:
   %0 = icmp ule i32 %in, 5
   %1 = select i1 %0, i32 -1, i32 0
@@ -92,7 +92,7 @@ entry:
 ; CHECK-NEXT: 1084227584(5.000000e+00)
 ; CHECK-NEXT: SETE T{{[0-9]\.[XYZW]}}, PV.[[CHAN]], 0.0
 ; CHECK-NEXT: LSHR *
-define void @ule_float(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ule_float(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ule float %in, 5.0
   %1 = select i1 %0, float 1.0, float 0.0
@@ -104,7 +104,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGT {{\*? *}}T{{[0-9]\.[XYZW]}}, KC0[2].Z, {{literal\.[xy]}}
 ; CHECK-NEXT: 1084227584(5.000000e+00)
-define void @ule_float_native(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ule_float_native(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ule float %in, 5.0
   %1 = select i1 %0, float 0.0, float 1.0
@@ -116,7 +116,7 @@ entry:
 ; CHECK: LSHR
 ; CHECK-NEXT: SETGE {{\*? *}}T{{[0-9]\.[XYZW]}}, {{literal\.[xy]}}, KC0[2].Z
 ; CHECK-NEXT:1084227584(5.000000e+00)
-define void @ole(float addrspace(1)* %out, float %in) {
+define amdgpu_kernel void @ole(float addrspace(1)* %out, float %in) {
 entry:
   %0 = fcmp ole float %in, 5.0
   %1 = select i1 %0, float 1.0, float 0.0
diff --git a/test/CodeGen/AMDGPU/urecip.ll b/test/CodeGen/AMDGPU/urecip.ll
deleted file mode 100644
index d58d2dc2d9637b79bd505a996815289e438f4997..0000000000000000000000000000000000000000
--- a/test/CodeGen/AMDGPU/urecip.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
-
-; CHECK: v_rcp_iflag_f32_e32
-
-define void @test(i32 %p, i32 %q) {
-   %i = udiv i32 %p, %q
-   %r = bitcast i32 %i to float
-   call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %r, float %r, float %r, float %r)
-   ret void
-}
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
diff --git a/test/CodeGen/AMDGPU/urem.ll b/test/CodeGen/AMDGPU/urem.ll
index 9e2cfa34e0b9efc647002fcf93b714a868e66000..fd7f8fa2efab573d47445461e607bf822afa6dc7 100644
--- a/test/CodeGen/AMDGPU/urem.ll
+++ b/test/CodeGen/AMDGPU/urem.ll
@@ -9,7 +9,7 @@
 ; FUNC-LABEL: {{^}}test_urem_i32:
 ; SI: s_endpgm
 ; EG: CF_END
-define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %b_ptr = getelementptr i32, i32 addrspace(1)* %in, i32 1
   %a = load i32, i32 addrspace(1)* %in
   %b = load i32, i32 addrspace(1)* %b_ptr
@@ -26,7 +26,7 @@ define void @test_urem_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; SI: v_sub_i32
 ; SI: buffer_store_dword
 ; SI: s_endpgm
-define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %num = load i32, i32 addrspace(1) * %in
   %result = urem i32 %num, 7
   store i32 %result, i32 addrspace(1)* %out
@@ -36,7 +36,7 @@ define void @test_urem_i32_7(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; FUNC-LABEL: {{^}}test_urem_v2i32:
 ; SI: s_endpgm
 ; EG: CF_END
-define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i32>, <2 x i32> addrspace(1)* %in, i32 1
   %a = load <2 x i32>, <2 x i32> addrspace(1)* %in
   %b = load <2 x i32>, <2 x i32> addrspace(1)* %b_ptr
@@ -48,7 +48,7 @@ define void @test_urem_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1
 ; FUNC-LABEL: {{^}}test_urem_v4i32:
 ; SI: s_endpgm
 ; EG: CF_END
-define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i32>, <4 x i32> addrspace(1)* %in, i32 1
   %a = load <4 x i32>, <4 x i32> addrspace(1)* %in
   %b = load <4 x i32>, <4 x i32> addrspace(1)* %b_ptr
@@ -60,7 +60,7 @@ define void @test_urem_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1
 ; FUNC-LABEL: {{^}}test_urem_i64:
 ; SI: s_endpgm
 ; EG: CF_END
-define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
   %b_ptr = getelementptr i64, i64 addrspace(1)* %in, i64 1
   %a = load i64, i64 addrspace(1)* %in
   %b = load i64, i64 addrspace(1)* %b_ptr
@@ -72,7 +72,7 @@ define void @test_urem_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in) {
 ; FUNC-LABEL: {{^}}test_urem_v2i64:
 ; SI: s_endpgm
 ; EG: CF_END
-define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <2 x i64>, <2 x i64> addrspace(1)* %in, i64 1
   %a = load <2 x i64>, <2 x i64> addrspace(1)* %in
   %b = load <2 x i64>, <2 x i64> addrspace(1)* %b_ptr
@@ -84,7 +84,7 @@ define void @test_urem_v2i64(<2 x i64> addrspace(1)* %out, <2 x i64> addrspace(1
 ; FUNC-LABEL: {{^}}test_urem_v4i64:
 ; SI: s_endpgm
 ; EG: CF_END
-define void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
+define amdgpu_kernel void @test_urem_v4i64(<4 x i64> addrspace(1)* %out, <4 x i64> addrspace(1)* %in) {
   %b_ptr = getelementptr <4 x i64>, <4 x i64> addrspace(1)* %in, i64 1
   %a = load <4 x i64>, <4 x i64> addrspace(1)* %in
   %b = load <4 x i64>, <4 x i64> addrspace(1)* %b_ptr
diff --git a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
index 82bdc261b1123df0c1a459545a2eafceb84a204f..f8e6b7edfe3583fc3d0d1bbc177a5143f9ca9485 100644
--- a/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
+++ b/test/CodeGen/AMDGPU/use-sgpr-multiple-times.ll
@@ -11,7 +11,7 @@ declare float @llvm.amdgcn.div.fixup.f32(float, float, float) #1
 ; GCN: s_load_dword [[SGPR:s[0-9]+]],
 ; GCN: v_add_f32_e64 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 {
   %dbl = fadd float %a, %a
   store float %dbl, float addrspace(1)* %out, align 4
   ret void
@@ -21,7 +21,7 @@ define void @test_sgpr_use_twice_binop(float addrspace(1)* %out, float %a) #0 {
 ; GCN: s_load_dword [[SGPR:s[0-9]+]],
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -35,7 +35,7 @@ define void @test_sgpr_use_three_ternary_op(float addrspace(1)* %out, float %a)
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[SGPR0]], [[VGPR1]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float %b) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -58,7 +58,7 @@ define void @test_sgpr_use_twice_ternary_op_a_a_b(float addrspace(1)* %out, floa
 ; GCN-DAG: v_fma_f32 [[RESULT1:v[0-9]+]], [[VA1]], [[SA]], [[VB]]
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
-define void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float addrspace(1)* %in) #0 {
   %va0 = load volatile float, float addrspace(1)* %in
   %va1 = load volatile float, float addrspace(1)* %in
   %fma0 = call float @llvm.fma.f32(float %a, float %va0, float %b) #1
@@ -76,7 +76,7 @@ define void @test_use_s_v_s(float addrspace(1)* %out, float %a, float %b, float
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[VGPR1]], [[SGPR0]], [[SGPR0]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %b, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -90,7 +90,7 @@ define void @test_sgpr_use_twice_ternary_op_a_b_a(float addrspace(1)* %out, floa
 ; GCN: v_mov_b32_e32 [[VGPR1:v[0-9]+]], [[SGPR1]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR0]], [[VGPR1]], [[SGPR0]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma = call float @llvm.fma.f32(float %b, float %a, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -100,7 +100,7 @@ define void @test_sgpr_use_twice_ternary_op_b_a_a(float addrspace(1)* %out, floa
 ; GCN: s_load_dword [[SGPR:s[0-9]+]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], 2.0
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float 2.0) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -110,7 +110,7 @@ define void @test_sgpr_use_twice_ternary_op_a_a_imm(float addrspace(1)* %out, fl
 ; GCN: s_load_dword [[SGPR:s[0-9]+]]
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], 2.0, [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float 2.0, float %a) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -121,7 +121,7 @@ define void @test_sgpr_use_twice_ternary_op_a_imm_a(float addrspace(1)* %out, fl
 ; GCN: s_load_dword [[SGPR:s[0-9]+]]
 ; GCN: v_div_fixup_f32 [[RESULT:v[0-9]+]], 2.0, [[SGPR]], [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, float %a) #0 {
   %val = call float @llvm.amdgcn.div.fixup.f32(float 2.0, float %a, float %a) #1
   store float %val, float addrspace(1)* %out, align 4
   ret void
@@ -132,7 +132,7 @@ define void @test_sgpr_use_twice_ternary_op_imm_a_a(float addrspace(1)* %out, fl
 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[SGPR]], [[VK]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float %a, float 1024.0) #1
   store float %fma, float addrspace(1)* %out, align 4
   ret void
@@ -143,7 +143,7 @@ define void @test_sgpr_use_twice_ternary_op_a_a_kimm(float addrspace(1)* %out, f
 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
 ; GCN: v_fma_f32 [[RESULT0:v[0-9]+]], [[VK]], [[VK]], [[SGPR]]
 ; GCN: buffer_store_dword [[RESULT0]]
-define void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1
   store float %fma, float addrspace(1)* %out
   ret void
@@ -158,7 +158,7 @@ define void @test_literal_use_twice_ternary_op_k_k_s(float addrspace(1)* %out, f
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 ; GCN: s_endpgm
-define void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma0 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %a) #1
   %fma1 = call float @llvm.fma.f32(float 1024.0, float 1024.0, float %b) #1
   store volatile float %fma0, float addrspace(1)* %out
@@ -171,7 +171,7 @@ define void @test_literal_use_twice_ternary_op_k_k_s_x2(float addrspace(1)* %out
 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1
   store float %fma, float addrspace(1)* %out
   ret void
@@ -186,7 +186,7 @@ define void @test_literal_use_twice_ternary_op_k_s_k(float addrspace(1)* %out, f
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 ; GCN: s_endpgm
-define void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma0 = call float @llvm.fma.f32(float 1024.0, float %a, float 1024.0) #1
   %fma1 = call float @llvm.fma.f32(float 1024.0, float %b, float 1024.0) #1
   store volatile float %fma0, float addrspace(1)* %out
@@ -199,7 +199,7 @@ define void @test_literal_use_twice_ternary_op_k_s_k_x2(float addrspace(1)* %out
 ; GCN-DAG: v_mov_b32_e32 [[VK:v[0-9]+]], 0x44800000
 ; GCN: v_fma_f32 [[RESULT:v[0-9]+]], [[SGPR]], [[VK]], [[VK]]
 ; GCN: buffer_store_dword [[RESULT]]
-define void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, float %a) #0 {
   %fma = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1
   store float %fma, float addrspace(1)* %out
   ret void
@@ -214,7 +214,7 @@ define void @test_literal_use_twice_ternary_op_s_k_k(float addrspace(1)* %out, f
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
 ; GCN: s_endpgm
-define void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma0 = call float @llvm.fma.f32(float %a, float 1024.0, float 1024.0) #1
   %fma1 = call float @llvm.fma.f32(float %b, float 1024.0, float 1024.0) #1
   store volatile float %fma0, float addrspace(1)* %out
@@ -234,7 +234,7 @@ define void @test_literal_use_twice_ternary_op_s_k_k_x2(float addrspace(1)* %out
 
 ; GCN: buffer_store_dword [[RESULT0]]
 ; GCN: buffer_store_dword [[RESULT1]]
-define void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 {
+define amdgpu_kernel void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 {
   %fma0 = call float @llvm.fma.f32(float %a, float %b, float 1024.0) #1
   %fma1 = call float @llvm.fma.f32(float %a, float %b, float 4096.0) #1
   store volatile float %fma0, float addrspace(1)* %out
@@ -259,7 +259,7 @@ define void @test_s0_s1_k_f32(float addrspace(1)* %out, float %a, float %b) #0 {
 
 ; GCN: buffer_store_dwordx2 [[RESULT0]]
 ; GCN: buffer_store_dwordx2 [[RESULT1]]
-define void @test_s0_s1_k_f64(double addrspace(1)* %out, double %a, double %b) #0 {
+define amdgpu_kernel void @test_s0_s1_k_f64(double addrspace(1)* %out, double %a, double %b) #0 {
   %fma0 = call double @llvm.fma.f64(double %a, double %b, double 1024.0) #1
   %fma1 = call double @llvm.fma.f64(double %a, double %b, double 4096.0) #1
   store volatile double %fma0, double addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/usubo.ll b/test/CodeGen/AMDGPU/usubo.ll
index 9391eda00e76e881c99d7bf15e0a8284b05fd5cd..d1f454f0bc65551b1d3bd4039a4001c5f68b62a0 100644
--- a/test/CodeGen/AMDGPU/usubo.ll
+++ b/test/CodeGen/AMDGPU/usubo.ll
@@ -9,7 +9,7 @@
 
 ; EG: SUBB_UINT
 ; EG: ADDC_UINT
-define void @s_usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @s_usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b) #0
   %val = extractvalue { i64, i1 } %usub, 0
   %carry = extractvalue { i64, i1 } %usub, 1
@@ -27,7 +27,7 @@ define void @s_usubo_i64_zext(i64 addrspace(1)* %out, i64 %a, i64 %b) #0 {
 
 ; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
-define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 %a, i32 %b) #0 {
   %usub = call { i32, i1 } @llvm.usub.with.overflow.i32(i32 %a, i32 %b)
   %val = extractvalue { i32, i1 } %usub, 0
   %carry = extractvalue { i32, i1 } %usub, 1
@@ -42,7 +42,7 @@ define void @s_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
 
 ; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
-define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
@@ -63,7 +63,7 @@ define void @v_usubo_i32(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32
 
 ; EG-DAG: SUBB_UINT
 ; EG-DAG: SUB_INT
-define void @v_usubo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_usubo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryout, i32 addrspace(1)* %a.ptr, i32 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i32, i32 addrspace(1)* %a.ptr
@@ -87,7 +87,7 @@ define void @v_usubo_i32_novcc(i32 addrspace(1)* %out, i1 addrspace(1)* %carryou
 ; EG-DAG: SUB_INT
 ; EG-DAG: SUB_INT
 ; EG: SUB_INT
-define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
+define amdgpu_kernel void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 %a, i64 %b) #0 {
   %usub = call { i64, i1 } @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
   %val = extractvalue { i64, i1 } %usub, 0
   %carry = extractvalue { i64, i1 } %usub, 1
@@ -104,7 +104,7 @@ define void @s_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64
 ; EG-DAG: SUB_INT
 ; EG-DAG: SUB_INT
 ; EG: SUB_INT
-define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64 addrspace(1)* %a.ptr, i64 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i64, i64 addrspace(1)* %a.ptr
@@ -122,7 +122,7 @@ define void @v_usubo_i64(i64 addrspace(1)* %out, i1 addrspace(1)* %carryout, i64
 ; FUNC-LABEL: {{^}}v_usubo_i16:
 ; VI: v_subrev_u16_e32
 ; VI: v_cmp_gt_u16_e32
-define void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
+define amdgpu_kernel void @v_usubo_i16(i16 addrspace(1)* %out, i1 addrspace(1)* %carryout, i16 addrspace(1)* %a.ptr, i16 addrspace(1)* %b.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
   %a.gep = getelementptr inbounds i16, i16 addrspace(1)* %a.ptr
diff --git a/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll b/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll
index a48e7acd4cf375e0e61e4cef9b7180c6daaa691b..b7d766aa395ef872ca7bd134089e096f99dddb10 100644
--- a/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll
+++ b/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll
@@ -1,14 +1,14 @@
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s
 
 ; CHECK-LABEL: {{^}}kernel_arg_i64:
-define void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
+define amdgpu_kernel void @kernel_arg_i64(i64 addrspace(1)* %out, i64 %a) nounwind {
   store i64 %a, i64 addrspace(1)* %out, align 8
   ret void
 }
 
 ; i64 arg works, v1i64 arg does not.
 ; CHECK-LABEL: {{^}}kernel_arg_v1i64:
-define void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
+define amdgpu_kernel void @kernel_arg_v1i64(<1 x i64> addrspace(1)* %out, <1 x i64> %a) nounwind {
   store <1 x i64> %a, <1 x i64> addrspace(1)* %out, align 8
   ret void
 }
diff --git a/test/CodeGen/AMDGPU/v_cndmask.ll b/test/CodeGen/AMDGPU/v_cndmask.ll
index 1cd49feb0d88328fd0a89311a5aa4ab60e563cff..d4a68a418ee41e2b2c080195ffe6bb1091fa4e30 100644
--- a/test/CodeGen/AMDGPU/v_cndmask.ll
+++ b/test/CodeGen/AMDGPU/v_cndmask.ll
@@ -4,12 +4,12 @@
 declare i32 @llvm.amdgcn.workitem.id.x() #1
 
 ; GCN-LABEL: {{^}}v_cnd_nan_nosgpr:
-; GCN: v_cmp_eq_u32_e64 vcc, s{{[0-9]+}}, 0
-; GCN: v_cndmask_b32_e32 v{{[0-9]}}, -1, v{{[0-9]+}}, vcc
+; GCN: v_cmp_eq_u32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], s{{[0-9]+}}, 0
+; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]}}, -1, v{{[0-9]+}}, [[COND]]
 ; GCN-DAG: v{{[0-9]}}
 ; All nan values are converted to 0xffffffff
 ; GCN: s_endpgm
-define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
+define amdgpu_kernel void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(1)* %fptr) #0 {
   %idx = call i32 @llvm.amdgcn.workitem.id.x() #1
   %f.gep = getelementptr float, float addrspace(1)* %fptr, i32 %idx
   %f = load float, float addrspace(1)* %f.gep
@@ -30,7 +30,7 @@ define void @v_cnd_nan_nosgpr(float addrspace(1)* %out, i32 %c, float addrspace(
 ; GCN-DAG: v{{[0-9]}}
 ; All nan values are converted to 0xffffffff
 ; GCN: s_endpgm
-define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
+define amdgpu_kernel void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
   %setcc = icmp ne i32 %c, 0
   %select = select i1 %setcc, float 0xFFFFFFFFE0000000, float %f
   store float %select, float addrspace(1)* %out
@@ -47,7 +47,7 @@ define void @v_cnd_nan(float addrspace(1)* %out, i32 %c, float %f) #0 {
 ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
-define void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -62,7 +62,7 @@ define void @fcmp_sgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float %
 ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
-define void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -77,7 +77,7 @@ define void @fcmp_sgprX_k0_select_k1_sgprX_f32(float addrspace(1)* %out, float %
 ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], vcc
-define void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %x, float %z) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -92,7 +92,7 @@ define void @fcmp_sgprX_k0_select_k0_sgprZ_f32(float addrspace(1)* %out, float %
 ; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[VZ]], vcc
-define void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %x) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext
@@ -105,9 +105,9 @@ define void @fcmp_sgprX_k0_select_k0_sgprX_f32(float addrspace(1)* %out, float %
 ; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k0_vgprZ_f32:
 ; GCN-DAG: s_load_dword [[X:s[0-9]+]]
 ; GCN-DAG: {{buffer|flat}}_load_dword [[Z:v[0-9]+]]
-; GCN-DAG: v_cmp_nlg_f32_e64 vcc, [[X]], 0
-; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 0, [[Z]], vcc
-define void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
+; GCN-DAG: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
+; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 0, [[Z]], [[COND]]
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
@@ -122,9 +122,9 @@ define void @fcmp_sgprX_k0_select_k0_vgprZ_f32(float addrspace(1)* %out, float %
 ; GCN-LABEL: {{^}}fcmp_sgprX_k0_select_k1_vgprZ_f32:
 ; GCN-DAG: {{buffer|flat}}_load_dword [[Z:v[0-9]+]]
 ; GCN-DAG: s_load_dword [[X:s[0-9]+]]
-; GCN: v_cmp_nlg_f32_e64 vcc, [[X]], 0
-; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc
-define void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
+; GCN: v_cmp_nlg_f32_e64 [[COND:vcc|s\[[0-9]+:[0-9]+\]]], [[X]], 0
+; GCN: v_cndmask_b32_e{{32|64}} v{{[0-9]+}}, 1.0, [[Z]], [[COND]]
+define amdgpu_kernel void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %x, float addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %z.gep = getelementptr inbounds float, float addrspace(1)* %z.ptr, i64 %tid.ext
@@ -142,7 +142,7 @@ define void @fcmp_sgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float %
 ; GCN-DAG: v_cmp_ngt_f32_e32 vcc, 0, [[X]]
 ; GCN-DAG: v_mov_b32_e32 [[VZ:v[0-9]+]], [[Z]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[VZ]], vcc
-define void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float %z) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -159,7 +159,7 @@ define void @fcmp_vgprX_k0_select_k1_sgprZ_f32(float addrspace(1)* %out, float a
 ; GCN: {{buffer|flat}}_load_dword [[Z:v[0-9]+]]
 ; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 1.0, [[Z]], vcc
-define void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -178,7 +178,7 @@ define void @fcmp_vgprX_k0_select_k1_vgprZ_f32(float addrspace(1)* %out, float a
 ; GCN: {{buffer|flat}}_load_dword [[Z:v[0-9]+]]
 ; GCN: v_cmp_lt_i32_e32 vcc, -1, [[X]]
 ; GCN: v_cndmask_b32_e32 v{{[0-9]+}}, 2, [[Z]], vcc
-define void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i32 addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
@@ -203,7 +203,7 @@ define void @icmp_vgprX_k0_select_k1_vgprZ_i32(i32 addrspace(1)* %out, i32 addrs
 ; VI-DAG: v_cmp_lt_i64_e64 s{{\[[0-9]+:[0-9]+\]}}, -1, v{{\[}}[[X_LO]]:[[X_HI]]{{\]}}
 ; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 0, v[[Z_HI]], s
 ; VI-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, 2, v[[Z_LO]], s
-define void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds i64, i64 addrspace(1)* %x.ptr, i64 %tid.ext
@@ -226,7 +226,7 @@ define void @icmp_vgprX_k0_select_k1_vgprZ_i64(i64 addrspace(1)* %out, i64 addrs
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
-define void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -249,7 +249,7 @@ define void @fcmp_vgprX_k0_select_vgprZ_k1_v4f32(<4 x float> addrspace(1)* %out,
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
-define void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -275,7 +275,7 @@ define void @fcmp_vgprX_k0_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out,
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, -0.5, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, v{{[0-9]+}}, vcc
-define void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out, float addrspace(1)* %x.ptr, <4 x float> addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -298,7 +298,7 @@ define void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(<4 x float> addrspace(1)* %out,
 ; GCN-DAG: s_or_b64 s{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, vcc
 ; GCN: v_cndmask_b32_e64 v{{[0-9]+}}, 0, 1, s
 ; GCN: store_byte
-define void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspace(1)* %x.ptr, i1 addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
@@ -321,7 +321,7 @@ define void @icmp_vgprX_k0_select_k1_vgprZ_i1(i1 addrspace(1)* %out, i32 addrspa
 ; GCN: v_cmp_le_f32_e32 vcc, 0, [[X]]
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, [[K]], v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
-define void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, float addrspace(1)* %x.ptr, double addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -343,7 +343,7 @@ define void @fcmp_vgprX_k0_selectf64_k1_vgprZ_f32(double addrspace(1)* %out, flo
 ; GCN: v_cmp_nlg_f32_e32 vcc, 0, [[X]]
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 3, v{{[0-9]+}}, vcc
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}, vcc
-define void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float addrspace(1)* %x.ptr, i64 addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
@@ -364,7 +364,7 @@ define void @fcmp_vgprX_k0_selecti64_k1_vgprZ_f32(i64 addrspace(1)* %out, float
 
 ; GCN: v_cmp_gt_u32_e32 vcc, 2, [[X]]
 ; GCN-DAG: v_cndmask_b32_e32 v{{[0-9]+}}, 4.0, [[Z]], vcc
-define void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32 addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds i32, i32 addrspace(1)* %x.ptr, i64 %tid.ext
@@ -386,7 +386,7 @@ define void @icmp_vgprX_k0_selectf32_k1_vgprZ_i32(float addrspace(1)* %out, i32
 ; GCN: v_cmp_nle_f32_e32 vcc, 4.0, [[X]]
 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -1.0, vcc
 ; GCN-DAG: v_cndmask_b32_e64 v{{[0-9]+}}, v{{[0-9]+}}, -2.0, vcc
-define void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
+define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_f32_cond_use_x2(float addrspace(1)* %out, float addrspace(1)* %x.ptr, float addrspace(1)* %z.ptr) #0 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tid.ext = sext i32 %tid to i64
   %x.gep = getelementptr inbounds float, float addrspace(1)* %x.ptr, i64 %tid.ext
diff --git a/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll b/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll
index 9246ce38dbedaa9d24da03a9f55c57994960a750..2cda52a8438af1ac9412bed9a8aa329075a242c6 100644
--- a/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll
+++ b/test/CodeGen/AMDGPU/v_cvt_pk_u8_f32.ll
@@ -5,7 +5,7 @@ declare i32 @llvm.amdgcn.cvt.pk.u8.f32(float, i32, i32) #0
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_0:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 0, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx_0(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_0(i32 addrspace(1)* %out, float %src, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 0, i32 %reg) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -13,7 +13,7 @@ define void @v_cvt_pk_u8_f32_idx_0(i32 addrspace(1)* %out, float %src, i32 %reg)
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_1:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx_1(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_1(i32 addrspace(1)* %out, float %src, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 1, i32 %reg) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -21,7 +21,7 @@ define void @v_cvt_pk_u8_f32_idx_1(i32 addrspace(1)* %out, float %src, i32 %reg)
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_2:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx_2(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_2(i32 addrspace(1)* %out, float %src, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 2, i32 %reg) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -29,7 +29,7 @@ define void @v_cvt_pk_u8_f32_idx_2(i32 addrspace(1)* %out, float %src, i32 %reg)
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx_3:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 3, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %src, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 3, i32 %reg) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
@@ -40,7 +40,7 @@ define void @v_cvt_pk_u8_f32_idx_3(i32 addrspace(1)* %out, float %src, i32 %reg)
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 1, v{{[0-9]+}}
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 2, v{{[0-9]+}}
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, 3, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_combine(i32 addrspace(1)* %out, float %src, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_combine(i32 addrspace(1)* %out, float %src, i32 %reg) {
   %result0 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 0, i32 %reg) #0
   %result1 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 1, i32 %result0) #0
   %result2 = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 2, i32 %result1) #0
@@ -51,7 +51,7 @@ define void @v_cvt_pk_u8_f32_combine(i32 addrspace(1)* %out, float %src, i32 %re
 
 ; GCN-LABEL: {{^}}v_cvt_pk_u8_f32_idx:
 ; GCN: v_cvt_pk_u8_f32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-define void @v_cvt_pk_u8_f32_idx(i32 addrspace(1)* %out, float %src, i32 %idx, i32 %reg) {
+define amdgpu_kernel void @v_cvt_pk_u8_f32_idx(i32 addrspace(1)* %out, float %src, i32 %idx, i32 %reg) {
   %result = call i32 @llvm.amdgcn.cvt.pk.u8.f32(float %src, i32 %idx, i32 %reg) #0
   store i32 %result, i32 addrspace(1)* %out, align 4
   ret void
diff --git a/test/CodeGen/AMDGPU/v_mac.ll b/test/CodeGen/AMDGPU/v_mac.ll
index 911207815e9a682fba367c5464d1a3a897875242..2b96f7d50076a396ccd9bf1470fa66366d3c7c43 100644
--- a/test/CodeGen/AMDGPU/v_mac.ll
+++ b/test/CodeGen/AMDGPU/v_mac.ll
@@ -8,7 +8,7 @@
 ; GCN: buffer_load_dword [[C:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:8
 ; GCN: v_mac_f32_e32 [[C]], [[B]], [[A]]
 ; GCN: buffer_store_dword [[C]]
-define void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mac_vvv(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -26,7 +26,7 @@ entry:
 ; GCN-LABEL: {{^}}mad_inline_sgpr_inline:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]}}, s{{[0-9]+}}, 0.5, 0.5
-define void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 {
+define amdgpu_kernel void @mad_inline_sgpr_inline(float addrspace(1)* %out, float %in) #0 {
 entry:
   %tmp0 = fmul float 0.5, %in
   %tmp1 = fadd float %tmp0, 0.5
@@ -37,7 +37,7 @@ entry:
 ; GCN-LABEL: {{^}}mad_vvs:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, s{{[0-9]+}}
-define void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 {
+define amdgpu_kernel void @mad_vvs(float addrspace(1)* %out, float addrspace(1)* %in, float %c) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
 
@@ -52,7 +52,7 @@ entry:
 
 ; GCN-LABEL: {{^}}mac_ssv:
 ; GCN: v_mac_f32_e64 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}
-define void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 {
+define amdgpu_kernel void @mac_ssv(float addrspace(1)* %out, float addrspace(1)* %in, float %a) #0 {
 entry:
   %c = load float, float addrspace(1)* %in
 
@@ -65,7 +65,7 @@ entry:
 ; GCN-LABEL: {{^}}mac_mad_same_add:
 ; GCN: v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
 ; GCN: v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
-define void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mac_mad_same_add(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -96,7 +96,7 @@ entry:
 ; GCN-LABEL: {{^}}mad_neg_src0:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-define void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mad_neg_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -113,10 +113,10 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}unsafe_mad_sub0_src0:
+; GCN-LABEL: {{^}}nsz_mad_sub0_src0:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-define void @unsafe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @nsz_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -136,7 +136,7 @@ entry:
 ; GCN-LABEL: {{^}}safe_mad_sub0_src0:
 ; GCN: v_sub_f32_e32 [[SUB0:v[0-9]+]], 0,
 ; GCN: v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, [[SUB0]]
-define void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @safe_mad_sub0_src0(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -156,7 +156,7 @@ entry:
 ; GCN-LABEL: {{^}}mad_neg_src1:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-define void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mad_neg_src1(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -173,10 +173,10 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}unsafe_mad_sub0_src1:
+; GCN-LABEL: {{^}}nsz_mad_sub0_src1:
 ; GCN-NOT: v_mac_f32
 ; GCN: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-define void @unsafe_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
+define amdgpu_kernel void @nsz_mad_sub0_src1(float addrspace(1)* %out, float addrspace(1)* %in) #1 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -196,7 +196,7 @@ entry:
 ; GCN-LABEL: {{^}}mad_neg_src2:
 ; GCN-NOT: v_mac
 ; GCN: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
-define void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @mad_neg_src2(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
 entry:
   %b_ptr = getelementptr float, float addrspace(1)* %in, i32 1
   %c_ptr = getelementptr float, float addrspace(1)* %in, i32 2
@@ -222,7 +222,7 @@ entry:
 
 ; GCN: v_add_f32_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
 ; GCN: v_mad_f32 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
-define void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 {
+define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f32(float addrspace(1)* %out, float addrspace(1)* %a, float addrspace(1)* %b) #3 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -257,7 +257,7 @@ bb:
 
 ; VI-FLUSH: v_add_f16_e32 [[TMP2:v[0-9]+]], [[A]], [[A]]
 ; VI-FLUSH: v_mad_f16 v{{[0-9]+}}, [[TMP2]], -4.0, 1.0
-define void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 {
+define amdgpu_kernel void @fold_inline_imm_into_mac_src2_f16(half addrspace(1)* %out, half addrspace(1)* %a, half addrspace(1)* %b) #3 {
 bb:
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   %tid.ext = sext i32 %tid to i64
@@ -281,7 +281,7 @@ bb:
 
 declare i32 @llvm.amdgcn.workitem.id.x() #2
 
-attributes #0 = { nounwind "unsafe-fp-math"="false" }
-attributes #1 = { nounwind "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
+attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" }
 attributes #2 = { nounwind readnone }
 attributes #3 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/v_mac_f16.ll b/test/CodeGen/AMDGPU/v_mac_f16.ll
index 20c1d2310d38f84173577775bbdd62e58303e16e..c45af522ec49bc8e80a1641555dc80adaecb45cb 100644
--- a/test/CodeGen/AMDGPU/v_mac_f16.ll
+++ b/test/CodeGen/AMDGPU/v_mac_f16.ll
@@ -14,7 +14,7 @@
 ; VI:  v_mac_f16_e32 v[[C_F16]], v[[B_F16]], v[[A_F16]]
 ; VI:  buffer_store_short v[[C_F16]]
 ; GCN: s_endpgm
-define void @mac_f16(
+define amdgpu_kernel void @mac_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -31,13 +31,14 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_same_add
+; GCN-LABEL: {{^}}mac_f16_same_add:
 ; SI:  v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
 ; SI:  v_mac_f32_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
+
 ; VI:  v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD:v[0-9]+]]
 ; VI:  v_mac_f16_e32 [[ADD]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @mac_f16_same_add(
+define amdgpu_kernel void @mac_f16_same_add(
     half addrspace(1)* %r0,
     half addrspace(1)* %r1,
     half addrspace(1)* %a,
@@ -63,13 +64,16 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_neg_a
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_f16_neg_a:
+; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN:    s_endpgm
-define void @mac_f16_neg_a(
+define amdgpu_kernel void @mac_f16_neg_a(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -87,13 +91,16 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_neg_b
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_f16_neg_b:
+; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN:    s_endpgm
-define void @mac_f16_neg_b(
+define amdgpu_kernel void @mac_f16_neg_b(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -111,13 +118,16 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_neg_c
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_f16_neg_c:
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_cvt_f32_f16_e32
+; SI: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
 ; GCN:    s_endpgm
-define void @mac_f16_neg_c(
+define amdgpu_kernel void @mac_f16_neg_c(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -141,7 +151,7 @@ entry:
 ; VI:  v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
 ; VI:  v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A]]
 ; GCN: s_endpgm
-define void @mac_f16_neg_a_safe_fp_math(
+define amdgpu_kernel void @mac_f16_neg_a_safe_fp_math(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -165,7 +175,7 @@ entry:
 ; VI:  v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
 ; VI:  v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A]], v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @mac_f16_neg_b_safe_fp_math(
+define amdgpu_kernel void @mac_f16_neg_b_safe_fp_math(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -189,7 +199,7 @@ entry:
 ; VI:  v_sub_f16_e32 v[[NEG_A:[0-9]+]], 0, v{{[0-9]+}}
 ; VI:  v_mac_f16_e32 v[[NEG_A]], v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @mac_f16_neg_c_safe_fp_math(
+define amdgpu_kernel void @mac_f16_neg_c_safe_fp_math(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -207,13 +217,16 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_neg_a_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
+; GCN-LABEL: {{^}}mac_f16_neg_a_nsz_fp_math:
+; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
 ; GCN:    s_endpgm
-define void @mac_f16_neg_a_unsafe_fp_math(
+define amdgpu_kernel void @mac_f16_neg_a_nsz_fp_math(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -231,13 +244,16 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_neg_b_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
+; GCN-LABEL: {{^}}mac_f16_neg_b_nsz_fp_math:
+; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, -[[CVT_A]], [[CVT_B]], [[CVT_C]]
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]}}
 ; GCN:    s_endpgm
-define void @mac_f16_neg_b_unsafe_fp_math(
+define amdgpu_kernel void @mac_f16_neg_b_nsz_fp_math(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -255,13 +271,16 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_f16_neg_c_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}}
+; GCN-LABEL: {{^}}mac_f16_neg_c_nsz_fp_math:
+; SI: v_cvt_f32_f16_e32 [[CVT_A:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_B:v[0-9]+]], v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT_C:v[0-9]+]], v{{[0-9]+}}
+; SI: v_mad_f32 v{{[0-9]+}}, [[CVT_A]], [[CVT_B]], -[[CVT_C]]
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]}}
 ; GCN:    s_endpgm
-define void @mac_f16_neg_c_unsafe_fp_math(
+define amdgpu_kernel void @mac_f16_neg_c_nsz_fp_math(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b,
@@ -279,33 +298,38 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16
+; GCN-LABEL: {{^}}mac_v2f16:
 ; GCN: {{buffer|flat}}_load_dword v[[A_V2_F16:[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword v[[B_V2_F16:[0-9]+]]
 ; GCN: {{buffer|flat}}_load_dword v[[C_V2_F16:[0-9]+]]
-; GCN: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
-; GCN: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+
 ; SI:  v_cvt_f32_f16_e32 v[[A_F32_0:[0-9]+]], v[[A_V2_F16]]
+; SI: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_0:[0-9]+]], v[[B_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
-; SI:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
+; SI-DAG: v_lshrrev_b32_e32 v[[B_F16_1:[0-9]+]], 16, v[[B_V2_F16]]
+; SI-DAG: v_lshrrev_b32_e32 v[[C_F16_1:[0-9]+]], 16, v[[C_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[C_F32_0:[0-9]+]], v[[C_V2_F16]]
+; SI-DAG:  v_cvt_f32_f16_e32 v[[A_F32_1:[0-9]+]], v[[A_F16_1]]
 ; SI:  v_cvt_f32_f16_e32 v[[B_F32_1:[0-9]+]], v[[B_F16_1]]
 ; SI:  v_cvt_f32_f16_e32 v[[C_F32_1:[0-9]+]], v[[C_F16_1]]
-; SI:  v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_0:[0-9]+]], v[[C_F32_0]]
-; SI:  v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]]
-; SI:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
-; SI:  v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[R_F16_0]]
+; SI-DAG:  v_mac_f32_e32 v[[C_F32_0]], v[[B_F32_0]], v[[A_F32_0]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_LO:[0-9]+]], v[[C_F32_0]]
+; SI-DAG:  v_mac_f32_e32 v[[C_F32_1]], v[[B_F32_1]], v[[A_F32_1]]
+; SI-DAG:  v_cvt_f16_f32_e32 v[[R_F16_1:[0-9]+]], v[[C_F32_1]]
 ; SI:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[R_F16_1]]
-; VI:  v_mac_f16_e32 v[[C_V2_F16]], v[[B_V2_F16]], v[[A_V2_F16]]
-; VI:  v_mac_f16_e32 v[[C_F16_1]], v[[B_F16_1]], v[[A_F16_1]]
-; VI:  v_and_b32_e32 v[[R_F16_LO:[0-9]+]], 0xffff, v[[C_V2_F16]]
-; VI:  v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[C_F16_1]]
-; GCN: v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+; VI-NOT: and
+; SI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[R_F16_LO]]
+
+; VI-DAG: v_lshrrev_b32_e32 v[[A_F16_1:[0-9]+]], 16, v[[A_V2_F16]]
+; VI-DAG: v_mac_f16_sdwa v[[A_F16_1]], v[[C_V2_F16]], v[[B_V2_F16]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG: v_mac_f16_e32 v[[A_V2_F16]], v[[C_V2_F16]], v[[B_V2_F16]]
+; VI-DAG: v_lshlrev_b32_e32 v[[R_F16_HI:[0-9]+]], 16, v[[A_F16_1]]
+; VI-NOT: and
+; VI:  v_or_b32_e32 v[[R_V2_F16:[0-9]+]], v[[R_F16_HI]], v[[A_V2_F16]]
+
 ; GCN: {{buffer|flat}}_store_dword v[[R_V2_F16]]
 ; GCN: s_endpgm
-define void @mac_v2f16(
+define amdgpu_kernel void @mac_v2f16(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -322,17 +346,19 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_same_add
-; SI:  v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD0:v[0-9]+]]
-; SI:  v_mad_f32 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD1:v[0-9]+]]
-; SI:  v_mac_f32_e32 [[ADD0]], v{{[0-9]+}}, v{{[0-9]+}}
-; SI:  v_mac_f32_e32 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}}
-; VI:  v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD0:v[0-9]+]]
-; VI:  v_mad_f16 v{{[0-9]}}, v{{[0-9]+}}, v{{[0-9]+}}, [[ADD1:v[0-9]+]]
-; VI:  v_mac_f16_e32 [[ADD0]], v{{[0-9]+}}, v{{[0-9]+}}
-; VI:  v_mac_f16_e32 [[ADD1]], v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_v2f16_same_add:
+; SI:  v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI:  v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI:  v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI:  v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
+; VI-DAG:  v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG:  v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; VI-DAG:  v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG:  v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
 ; GCN: s_endpgm
-define void @mac_v2f16_same_add(
+define amdgpu_kernel void @mac_v2f16_same_add(
     <2 x half> addrspace(1)* %r0,
     <2 x half> addrspace(1)* %r1,
     <2 x half> addrspace(1)* %a,
@@ -358,15 +384,18 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_a
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_a:
+; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
+
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, -[[CVT0]], v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, [[CVT1]], v{{[0-9]+}}
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN:    s_endpgm
-define void @mac_v2f16_neg_a(
+define amdgpu_kernel void @mac_v2f16_neg_a(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -385,14 +414,17 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}mac_v2f16_neg_b
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, -[[CVT0]], v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, [[CVT1]], v{{[0-9]+}}
+
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN:    s_endpgm
-define void @mac_v2f16_neg_b(
+define amdgpu_kernel void @mac_v2f16_neg_b(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -410,15 +442,22 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_c
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
-; SI:     v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_c:
+; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
+
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -[[CVT2]]
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -[[CVT5]]
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
 ; GCN:    s_endpgm
-define void @mac_v2f16_neg_c(
+define amdgpu_kernel void @mac_v2f16_neg_c(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -437,16 +476,19 @@ entry:
 }
 
 ; GCN-LABEL: {{^}}mac_v2f16_neg_a_safe_fp_math:
+
 ; SI:  v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
 ; SI:  v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
-; SI:  v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
-; SI:  v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
+; SI-DAG:  v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
+; SI-DAG:  v_mac_f32_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
+
 ; VI:  v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
 ; VI:  v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
-; VI:  v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]]
-; VI:  v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
+; VI-DAG:  v_mac_f16_sdwa v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A0]] dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-DAG:  v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v[[NEG_A1]]
+
 ; GCN: s_endpgm
-define void @mac_v2f16_neg_a_safe_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_a_safe_fp_math(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -464,17 +506,20 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math
+; GCN-LABEL: {{^}}mac_v2f16_neg_b_safe_fp_math:
+
 ; SI:  v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
 ; SI:  v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
-; SI:  v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
-; SI:  v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
+; SI-DAG:  v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
+; SI-DAG:  v_mac_f32_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
+
 ; VI:  v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
 ; VI:  v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
-; VI:  v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}}
-; VI:  v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
+; VI-DAG:  v_mac_f16_sdwa v{{[0-9]+}}, v[[NEG_A0]], v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI-DAG:  v_mac_f16_e32 v{{[0-9]+}}, v[[NEG_A1]], v{{[0-9]+}}
+
 ; GCN: s_endpgm
-define void @mac_v2f16_neg_b_safe_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_b_safe_fp_math(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -492,17 +537,20 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math
+; GCN-LABEL: {{^}}mac_v2f16_neg_c_safe_fp_math:
+
 ; SI:  v_sub_f32_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
 ; SI:  v_sub_f32_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
-; SI:  v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
-; SI:  v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG:  v_mac_f32_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG:  v_mac_f32_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
+
 ; VI:  v_sub_f16_e32 v[[NEG_A0:[0-9]+]], 0, v{{[0-9]+}}
 ; VI:  v_sub_f16_e32 v[[NEG_A1:[0-9]+]], 0, v{{[0-9]+}}
-; VI:  v_mac_f16_e32 v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}}
-; VI:  v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
+; VI-DAG:  v_mac_f16_sdwa v[[NEG_A0]], v{{[0-9]+}}, v{{[0-9]+}} dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
+; VI-DAG:  v_mac_f16_e32 v[[NEG_A1]], v{{[0-9]+}}, v{{[0-9]+}}
+
 ; GCN: s_endpgm
-define void @mac_v2f16_neg_c_safe_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_c_safe_fp_math(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -520,15 +568,22 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_a_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_a_nsz_fp_math:
+; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
+
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
 ; GCN:    s_endpgm
-define void @mac_v2f16_neg_a_unsafe_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_a_nsz_fp_math(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -546,15 +601,22 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_b_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
-; SI:     v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_b_nsz_fp_math:
+; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
+
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, -v{{[0-9]+}}, v{{[0-9]+}}, v{{[-0-9]}}
 ; GCN:    s_endpgm
-define void @mac_v2f16_neg_b_unsafe_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_b_nsz_fp_math(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -572,15 +634,22 @@ entry:
   ret void
 }
 
-; GCN-LABEL: {{^}}mac_v2f16_neg_c_unsafe_fp_math
-; SI-NOT: v_mac_f32
-; SI:     v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
-; SI:     v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
+; GCN-LABEL: {{^}}mac_v2f16_neg_c_nsz_fp_math:
+; SI: v_cvt_f32_f16_e32 [[CVT0:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT1:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT2:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT3:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT4:v[0-9]+]], {{v[0-9]+}}
+; SI: v_cvt_f32_f16_e32 [[CVT5:v[0-9]+]], {{v[0-9]+}}
+
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+; SI-DAG: v_mad_f32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[0-9]+}}
+
 ; VI-NOT: v_mac_f16
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
 ; VI:     v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, -v{{[-0-9]}}
 ; GCN:    s_endpgm
-define void @mac_v2f16_neg_c_unsafe_fp_math(
+define amdgpu_kernel void @mac_v2f16_neg_c_nsz_fp_math(
     <2 x half> addrspace(1)* %r,
     <2 x half> addrspace(1)* %a,
     <2 x half> addrspace(1)* %b,
@@ -598,5 +667,5 @@ entry:
   ret void
 }
 
-attributes #0 = { nounwind "unsafe-fp-math"="false" }
-attributes #1 = { nounwind "unsafe-fp-math"="true" }
+attributes #0 = { nounwind "no-signed-zeros-fp-math"="false" }
+attributes #1 = { nounwind "no-signed-zeros-fp-math"="true" }
diff --git a/test/CodeGen/AMDGPU/v_madak_f16.ll b/test/CodeGen/AMDGPU/v_madak_f16.ll
index df87a94ca0739d68143dc042e4e10dd27ebe90c2..bfb10503aaea211b7420b6a06c31df993753a92b 100644
--- a/test/CodeGen/AMDGPU/v_madak_f16.ll
+++ b/test/CodeGen/AMDGPU/v_madak_f16.ll
@@ -7,7 +7,7 @@
 ; VI:  v_madak_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], 0x4900{{$}}
 ; VI:  buffer_store_short v[[R_F16]]
 ; GCN: s_endpgm
-define void @madak_f16(
+define amdgpu_kernel void @madak_f16(
     half addrspace(1)* %r,
     half addrspace(1)* %a,
     half addrspace(1)* %b) {
@@ -28,7 +28,7 @@ entry:
 ; VI:  v_mad_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; VI:  v_mac_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}
 ; GCN: s_endpgm
-define void @madak_f16_use_2(
+define amdgpu_kernel void @madak_f16_use_2(
     half addrspace(1)* %r0,
     half addrspace(1)* %r1,
     half addrspace(1)* %a,
diff --git a/test/CodeGen/AMDGPU/valu-i1.ll b/test/CodeGen/AMDGPU/valu-i1.ll
index e64f8467240ae7bd03bed1347d8ec41b6451e642..aad260c3e3690a264914d59274105737343a07d5 100644
--- a/test/CodeGen/AMDGPU/valu-i1.ll
+++ b/test/CodeGen/AMDGPU/valu-i1.ll
@@ -29,7 +29,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
 ; SI-NEXT: s_xor_b64 exec, exec, [[SAVE3]]
 ; SI-NEXT: ; mask branch
 ;
-define void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
+define amdgpu_kernel void @test_if(i32 %b, i32 addrspace(1)* %src, i32 addrspace(1)* %dst) #1 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   switch i32 %tid, label %default [
@@ -64,29 +64,100 @@ end:
   ret void
 }
 
-; SI-LABEL: @simple_test_v_if
+; SI-LABEL: {{^}}simple_test_v_if:
 ; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
 ; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
 ; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
+; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
 
-; SI: BB{{[0-9]+_[0-9]+}}:
+; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
 ; SI: buffer_store_dword
-; SI: s_endpgm
+; SI-NEXT: s_waitcnt
 
-; SI: BB1_2:
+; SI-NEXT: {{^}}[[EXIT]]:
 ; SI: s_or_b64 exec, exec, [[BR_SREG]]
 ; SI: s_endpgm
-define void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+define amdgpu_kernel void @simple_test_v_if(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %is.0 = icmp ne i32 %tid, 0
-  br i1 %is.0, label %store, label %exit
+  br i1 %is.0, label %then, label %exit
+
+then:
+  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
+  store i32 999, i32 addrspace(1)* %gep
+  br label %exit
+
+exit:
+  ret void
+}
+
+; FIXME: It would be better to endpgm in the then block.
+
+; SI-LABEL: {{^}}simple_test_v_if_ret_else_ret:
+; SI: v_cmp_ne_u32_e32 vcc, 0, v{{[0-9]+}}
+; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
+; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
+; SI: ; mask branch [[EXIT:BB[0-9]+_[0-9]+]]
+
+; SI-NEXT: BB{{[0-9]+_[0-9]+}}:
+; SI: buffer_store_dword
+; SI-NEXT: s_waitcnt
+
+; SI-NEXT: {{^}}[[EXIT]]:
+; SI: s_or_b64 exec, exec, [[BR_SREG]]
+; SI: s_endpgm
+define amdgpu_kernel void @simple_test_v_if_ret_else_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %is.0 = icmp ne i32 %tid, 0
+  br i1 %is.0, label %then, label %exit
+
+then:
+  %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
+  store i32 999, i32 addrspace(1)* %gep
+  ret void
+
+exit:
+  ret void
+}
+
+; Final block has more than a ret to execute. This was miscompiled
+; before function exit blocks were unified, since the endpgm in the then
+; block would terminate the wavefront before reaching the exit-block store.
+
+; SI-LABEL: {{^}}simple_test_v_if_ret_else_code_ret:
+; SI: v_cmp_eq_u32_e32 vcc, 0, v{{[0-9]+}}
+; SI: s_and_saveexec_b64 [[BR_SREG:s\[[0-9]+:[0-9]+\]]], vcc
+; SI: s_xor_b64 [[BR_SREG]], exec, [[BR_SREG]]
+; SI: ; mask branch [[FLOW:BB[0-9]+_[0-9]+]]
+
+; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %exit
+; SI: ds_write_b32
+; SI: s_waitcnt
+
+; SI-NEXT: {{^}}[[FLOW]]:
+; SI-NEXT: s_or_saveexec_b64
+; SI-NEXT: s_xor_b64 exec, exec
+; SI-NEXT: ; mask branch [[UNIFIED_RETURN:BB[0-9]+_[0-9]+]]
+
+; SI-NEXT: {{^BB[0-9]+_[0-9]+}}: ; %then
+; SI: buffer_store_dword
+; SI-NEXT: s_waitcnt
+
+; SI-NEXT: {{^}}[[UNIFIED_RETURN]]: ; %UnifiedReturnBlock
+; SI: s_or_b64 exec, exec
+; SI: s_endpgm
+define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %is.0 = icmp ne i32 %tid, 0
+  br i1 %is.0, label %then, label %exit
 
-store:
+then:
   %gep = getelementptr i32, i32 addrspace(1)* %dst, i32 %tid
   store i32 999, i32 addrspace(1)* %gep
   ret void
 
 exit:
+  store volatile i32 7, i32 addrspace(3)* undef
   ret void
 }
 
@@ -106,7 +177,7 @@ exit:
 ; SI: [[LABEL_EXIT]]:
 ; SI: s_endpgm
 
-define void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
+define amdgpu_kernel void @simple_test_v_loop(i32 addrspace(1)* %dst, i32 addrspace(1)* %src) #1 {
 entry:
   %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone
   %is.0 = icmp ne i32 %tid, 0
@@ -156,7 +227,7 @@ exit:
 
 ; SI: BB{{[0-9]+_[0-9]+}}: ; %bb20
 ; SI: buffer_store_dword
-; SI: v_cmp_ge_i64_e32 [[CMP:s\[[0-9]+:[0-9]+\]|vcc]]
+; SI: v_cmp_ge_i64_e{{32|64}} [[CMP:s\[[0-9]+:[0-9]+\]|vcc]]
 ; SI: s_or_b64 [[TMP:s\[[0-9]+:[0-9]+\]]], [[CMP]], [[COND_STATE]]
 
 ; SI: [[LABEL_FLOW]]:
@@ -173,7 +244,7 @@ exit:
 ; SI-NOT: [[COND_STATE]]
 ; SI: s_endpgm
 
-define void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
+define amdgpu_kernel void @multi_vcond_loop(i32 addrspace(1)* noalias nocapture %arg, i32 addrspace(1)* noalias nocapture readonly %arg1, i32 addrspace(1)* noalias nocapture readonly %arg2, i32 addrspace(1)* noalias nocapture readonly %arg3) #1 {
 bb:
   %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0
   %tmp4 = sext i32 %tmp to i64
diff --git a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
index 03e473e3a0c016a2774a6a2c85ab3e28605c5916..5e5465800c3a3c68ab4ebc044b1b27561d57fa32 100644
--- a/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
+++ b/test/CodeGen/AMDGPU/vccz-corrupt-bug-workaround.mir
@@ -1,7 +1,7 @@
 # RUN: llc -run-pass si-insert-waits -march=amdgcn -mcpu=tahiti -o - %s | FileCheck %s
 --- |
 
-  define void @vccz_corrupt_workaround(float %cond, i32 addrspace(1)* %out) #0 {
+  define amdgpu_kernel void @vccz_corrupt_workaround(float %cond, i32 addrspace(1)* %out) #0 {
   entry:
     %cmp0 = fcmp oeq float %cond, 0.000000e+00
     br i1 %cmp0, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
@@ -20,7 +20,7 @@
     ret void
   }
 
-  define void @vccz_corrupt_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
+  define amdgpu_kernel void @vccz_corrupt_undef_vcc(float %cond, i32 addrspace(1)* %out) #0 {
   entry:
     br i1 undef, label %if, label %else, !structurizecfg.uniform !0, !amdgpu.uniform !0
 
diff --git a/test/CodeGen/AMDGPU/vector-alloca.ll b/test/CodeGen/AMDGPU/vector-alloca.ll
index 7dcf36f144aca5b1b4ab142156185c0ee1edc2bf..03cf725601b79dd2801439ef6364afae6a5742e2 100644
--- a/test/CodeGen/AMDGPU/vector-alloca.ll
+++ b/test/CodeGen/AMDGPU/vector-alloca.ll
@@ -15,7 +15,7 @@
 ; EG: MOV
 ; EG: MOV
 ; EG: MOVA_INT
-define void @vector_read(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %tmp = alloca [4 x i32]
   %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
@@ -44,7 +44,7 @@ entry:
 ; EG: MOV
 ; EG: MOVA_INT
 ; EG: MOVA_INT
-define void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+define amdgpu_kernel void @vector_write(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
 entry:
   %tmp = alloca [4 x i32]
   %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
@@ -71,7 +71,7 @@ entry:
 
 ; FUNC-LABEL: {{^}}bitcast_gep:
 ; EG: STORE_RAW
-define void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
+define amdgpu_kernel void @bitcast_gep(i32 addrspace(1)* %out, i32 %w_index, i32 %r_index) {
 entry:
   %tmp = alloca [4 x i32]
   %x = getelementptr [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
@@ -93,7 +93,7 @@ entry:
 ; OPT-LABEL: @vector_read_bitcast_gep(
 ; OPT: %0 = extractelement <4 x i32> <i32 1065353216, i32 1, i32 2, i32 3>, i32 %index
 ; OPT: store i32 %0, i32 addrspace(1)* %out, align 4
-define void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read_bitcast_gep(i32 addrspace(1)* %out, i32 %index) {
 entry:
   %tmp = alloca [4 x i32]
   %x = getelementptr inbounds [4 x i32], [4 x i32]* %tmp, i32 0, i32 0
@@ -121,7 +121,7 @@ entry:
 ; OPT: store float
 ; OPT: store float
 ; OPT: load float
-define void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
+define amdgpu_kernel void @vector_read_bitcast_alloca(float addrspace(1)* %out, i32 %index) {
 entry:
   %tmp = alloca [4 x i32]
   %tmp.bc = bitcast [4 x i32]* %tmp to [4 x float]*
diff --git a/test/CodeGen/AMDGPU/vector-extract-insert.ll b/test/CodeGen/AMDGPU/vector-extract-insert.ll
index 2d39f82e2499c61fee262229191bd37498b51604..ab2bfcfd1fb7188970147d09e82632dc5d409c65 100644
--- a/test/CodeGen/AMDGPU/vector-extract-insert.ll
+++ b/test/CodeGen/AMDGPU/vector-extract-insert.ll
@@ -13,7 +13,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #0
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN-NOT: [[VVAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
+define amdgpu_kernel void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
   %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
@@ -30,7 +30,7 @@ define void @extract_insert_same_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32>
 ; GCN: v_movreld_b32
 ; GCN: v_movrels_b32
 ; GCN: buffer_store_dword v
-define void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 {
+define amdgpu_kernel void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx0, i32 %idx1) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
   %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
@@ -49,7 +49,7 @@ define void @extract_insert_different_dynelt_v4i32(i32 addrspace(1)* %out, <4 x
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN-NOT: [[VVAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
+define amdgpu_kernel void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in, i32 %val, i32 %idx) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
   %gep.in = getelementptr inbounds <4 x i32>, <4 x i32> addrspace(1)* %in, i64 %id.ext
@@ -68,7 +68,7 @@ define void @extract_insert_same_elt2_v4i32(i32 addrspace(1)* %out, <4 x i32> ad
 ; GCN: v_mov_b32_e32 [[VVAL:v[0-9]+]], [[VAL]]
 ; GCN-NOT: [[VVAL]]
 ; GCN: buffer_store_dword [[VVAL]]
-define void @extract_insert_same_dynelt_v4f32(float addrspace(1)* %out, <4 x float> addrspace(1)* %in, float %val, i32 %idx) #1 {
+define amdgpu_kernel void @extract_insert_same_dynelt_v4f32(float addrspace(1)* %out, <4 x float> addrspace(1)* %in, float %val, i32 %idx) #1 {
   %id = call i32 @llvm.amdgcn.workitem.id.x()
   %id.ext = sext i32 %id to i64
   %gep.in = getelementptr inbounds <4 x float>, <4 x float> addrspace(1)* %in, i64 %id.ext
diff --git a/test/CodeGen/AMDGPU/vectorize-global-local.ll b/test/CodeGen/AMDGPU/vectorize-global-local.ll
new file mode 100644
index 0000000000000000000000000000000000000000..90cf34e609f6ec13def5e491769d687ad2081c18
--- /dev/null
+++ b/test/CodeGen/AMDGPU/vectorize-global-local.ll
@@ -0,0 +1,80 @@
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck %s
+; CHECK-DAG: flat_load_dwordx4
+; CHECK-DAG: flat_load_dwordx4
+; CHECK-DAG: flat_load_dwordx4
+; CHECK-DAG: flat_load_dwordx4
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+; CHECK-DAG: ds_write2_b32
+
+define amdgpu_kernel void @vectorize_global_local(i32 addrspace(1)* nocapture readonly %arg, i32 addrspace(3)* nocapture %arg1) {
+bb:
+  %tmp = load i32, i32 addrspace(1)* %arg, align 4
+  store i32 %tmp, i32 addrspace(3)* %arg1, align 4
+  %tmp2 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 1
+  %tmp3 = load i32, i32 addrspace(1)* %tmp2, align 4
+  %tmp4 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 1
+  store i32 %tmp3, i32 addrspace(3)* %tmp4, align 4
+  %tmp5 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 2
+  %tmp6 = load i32, i32 addrspace(1)* %tmp5, align 4
+  %tmp7 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 2
+  store i32 %tmp6, i32 addrspace(3)* %tmp7, align 4
+  %tmp8 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 3
+  %tmp9 = load i32, i32 addrspace(1)* %tmp8, align 4
+  %tmp10 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 3
+  store i32 %tmp9, i32 addrspace(3)* %tmp10, align 4
+  %tmp11 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 4
+  %tmp12 = load i32, i32 addrspace(1)* %tmp11, align 4
+  %tmp13 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 4
+  store i32 %tmp12, i32 addrspace(3)* %tmp13, align 4
+  %tmp14 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 5
+  %tmp15 = load i32, i32 addrspace(1)* %tmp14, align 4
+  %tmp16 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 5
+  store i32 %tmp15, i32 addrspace(3)* %tmp16, align 4
+  %tmp17 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 6
+  %tmp18 = load i32, i32 addrspace(1)* %tmp17, align 4
+  %tmp19 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 6
+  store i32 %tmp18, i32 addrspace(3)* %tmp19, align 4
+  %tmp20 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 7
+  %tmp21 = load i32, i32 addrspace(1)* %tmp20, align 4
+  %tmp22 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 7
+  store i32 %tmp21, i32 addrspace(3)* %tmp22, align 4
+  %tmp23 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 8
+  %tmp24 = load i32, i32 addrspace(1)* %tmp23, align 4
+  %tmp25 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 8
+  store i32 %tmp24, i32 addrspace(3)* %tmp25, align 4
+  %tmp26 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 9
+  %tmp27 = load i32, i32 addrspace(1)* %tmp26, align 4
+  %tmp28 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 9
+  store i32 %tmp27, i32 addrspace(3)* %tmp28, align 4
+  %tmp29 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 10
+  %tmp30 = load i32, i32 addrspace(1)* %tmp29, align 4
+  %tmp31 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 10
+  store i32 %tmp30, i32 addrspace(3)* %tmp31, align 4
+  %tmp32 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 11
+  %tmp33 = load i32, i32 addrspace(1)* %tmp32, align 4
+  %tmp34 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 11
+  store i32 %tmp33, i32 addrspace(3)* %tmp34, align 4
+  %tmp35 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 12
+  %tmp36 = load i32, i32 addrspace(1)* %tmp35, align 4
+  %tmp37 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 12
+  store i32 %tmp36, i32 addrspace(3)* %tmp37, align 4
+  %tmp38 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 13
+  %tmp39 = load i32, i32 addrspace(1)* %tmp38, align 4
+  %tmp40 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 13
+  store i32 %tmp39, i32 addrspace(3)* %tmp40, align 4
+  %tmp41 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 14
+  %tmp42 = load i32, i32 addrspace(1)* %tmp41, align 4
+  %tmp43 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 14
+  store i32 %tmp42, i32 addrspace(3)* %tmp43, align 4
+  %tmp44 = getelementptr inbounds i32, i32 addrspace(1)* %arg, i64 15
+  %tmp45 = load i32, i32 addrspace(1)* %tmp44, align 4
+  %tmp46 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 15
+  store i32 %tmp45, i32 addrspace(3)* %tmp46, align 4
+  ret void
+}
diff --git a/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll b/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
index 3d71062f1fbae27964a21652466d4c3881a66f14..46a1c87184d15609bcfe9c0d356784601574c2d5 100644
--- a/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
+++ b/test/CodeGen/AMDGPU/vertex-fetch-encoding.ll
@@ -6,7 +6,7 @@
 ; EG: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
 ; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00
 
-define void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
+define amdgpu_kernel void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
   %v = load i32, i32 addrspace(1)* %in
   store i32 %v, i32 addrspace(1)* %out
   ret void
@@ -16,7 +16,7 @@ define void @vtx_fetch32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) {
 ; EG: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[SRC]],0x40,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x08,0x00
 ; CM: VTX_READ_128 T[[DST:[0-9]]].XYZW, T[[SRC:[0-9]]].X, 0, #1 ; encoding: [0x40,0x01,0x0[[SRC]],0x00,0x0[[DST]],0x10,0x8d,0x18,0x00,0x00,0x00,0x00
 
-define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
+define amdgpu_kernel void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in) {
   %v = load <4 x i32>, <4 x i32> addrspace(1)* %in
   store <4 x i32> %v, <4 x i32> addrspace(1)* %out
   ret void
@@ -26,7 +26,7 @@ define void @vtx_fetch128(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)*
 ; EG: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #3 ; encoding: [0x40,0x03,0x0[[GPR]],0x10,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x08,0x00
 ; CM: VTX_READ_32 T[[GPR:[0-9]]].X, T[[GPR]].X, 0, #3 ; encoding: [0x40,0x03,0x0[[GPR]],0x00,0x0[[GPR]],0xf0,0x5f,0x13,0x00,0x00,0x00,0x00
 
-define void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace(7)* %in) {
+define amdgpu_kernel void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace(7)* %in) {
   %v = load i32, i32 addrspace(7)* %in
   store i32 %v, i32 addrspace(1)* %out
   ret void
@@ -38,7 +38,7 @@ define void @vtx_fetch32_id3(i32 addrspace(1)* %out, i32 addrspace(7)* %in) {
 
 @t = internal addrspace(2) constant [4 x i32] [i32 0, i32 1, i32 2, i32 3]
 
-define void @vtx_fetch32_id2(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @vtx_fetch32_id2(i32 addrspace(1)* %out, i32 %in) {
   %a = getelementptr inbounds [4 x i32], [4 x i32] addrspace(2)* @t, i32 0, i32 %in
   %v = load i32, i32 addrspace(2)* %a
   store i32 %v, i32 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
index a8908f87fbf6fc72b79862f292ad3eb85b727432..e82e548f23cda7a5f394fed5bccb6d379d6e77ef 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot-compute.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=SIMESA %s
 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=VIMESA %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+vgpr-spilling,-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GCNMESA -check-prefix=GFX9MESA %s
 ; RUN: llc -march=amdgcn -mcpu=hawaii -mtriple=amdgcn-unknown-amdhsa -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=CIHSA -check-prefix=HSA %s
 ; RUN: llc -march=amdgcn -mcpu=fiji -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VIHSA -check-prefix=HSA %s
 
@@ -15,16 +16,17 @@
 
 ; HSA: enable_sgpr_private_segment_buffer = 1
 ; HSA: enable_sgpr_flat_scratch_init = 0
-; HSA: workitem_private_segment_byte_size = 1024
+; HSA: workitem_private_segment_byte_size = 1536
 
 ; GCN-NOT: flat_scr
 
 ; GCNMESA-DAG: s_mov_b32 s16, s3
 ; GCNMESA-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCNMESA--DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GCNMESA-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GCNMESA-DAG: s_mov_b32 s14, -1
 ; SIMESA-DAG: s_mov_b32 s15, 0xe8f000
 ; VIMESA-DAG: s_mov_b32 s15, 0xe80000
+; GFX9MESA-DAG: s_mov_b32 s15, 0xe00000
 
 
 ; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}} ; 4-byte Folded Spill
@@ -40,10 +42,10 @@
 ; GCN: buffer_load_dword {{v[0-9]+}}, off, s[12:15], s16 offset:{{[0-9]+}}
 
 ; GCN: NumVgprs: 256
-; GCN: ScratchSize: 1024
+; GCN: ScratchSize: 1536
 
 ; s[0:3] input user SGPRs. s4,s5,s6 = workgroup IDs. s8 scratch offset.
-define void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 {
+define amdgpu_kernel void @spill_vgpr_compute(<4 x float> %arg6, float addrspace(1)* %arg, i32 %arg1, i32 %arg2, float %arg3, float %arg4, float %arg5) #0 {
 bb:
   %tmp = add i32 %arg1, %arg2
   %tmp7 = extractelement <4 x float> %arg6, i32 0
diff --git a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
index 4de35b97aeabec8da1f55237154f2ced30df1408..c9c8583d5e8799431f6cb36ac3bf521e491eeced 100644
--- a/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
+++ b/test/CodeGen/AMDGPU/vgpr-spill-emergency-stack-slot.ll
@@ -1,5 +1,6 @@
 ; RUN: llc -march=amdgcn -mcpu=tahiti -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
 ; RUN: llc -march=amdgcn -mcpu=fiji -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -mcpu=gfx900 -mattr=+vgpr-spilling -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=GFX9 %s
 
 ; This ends up using all 255 registers and requires register
 ; scavenging which will fail to find an unsued register.
@@ -12,19 +13,19 @@
 
 ; GCN-LABEL: {{^}}main:
 
-; GCN-DAG: s_mov_b32 s11, s12
-; GCN-DAG: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GCN-DAG: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
-; GCN-DAG: s_mov_b32 s14, -1
-; SI-DAG: s_mov_b32 s15, 0xe8f000
-; VI-DAG: s_mov_b32 s15, 0xe80000
-
-; s11 is offset system SGPR
-; GCN: buffer_store_dword {{v[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Spill
-; GCN: buffer_load_dword v{{[0-9]+}}, off, s[12:15], s11 offset:{{[0-9]+}} ; 4-byte Folded Reload
+; GCN-DAG: s_mov_b32 s[[OFFREG:[0-9]+]], s12
+; GCN-DAG: s_mov_b32 s[[DESC0:[0-9]+]], SCRATCH_RSRC_DWORD0
+; GCN-DAG: s_mov_b32 s{{[0-9]+}}, SCRATCH_RSRC_DWORD1
+; GCN-DAG: s_mov_b32 s{{[0-9]+}}, -1
+; SI-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe8f000
+; VI-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe80000
+; GFX9-DAG: s_mov_b32 s[[DESC3:[0-9]+]], 0xe00000
 
+; OFFREG is offset system SGPR
+; GCN: buffer_store_dword {{v[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s[[OFFREG]] offset:{{[0-9]+}} ; 4-byte Folded Spill
+; GCN: buffer_load_dword v{{[0-9]+}}, off, s{{\[}}[[DESC0]]:[[DESC3]]], s[[OFFREG]] offset:{{[0-9]+}} ; 4-byte Folded Reload
 ; GCN: NumVgprs: 256
-; GCN: ScratchSize: 1024
+; GCN: ScratchSize: 1536
 
 define amdgpu_vs void @main([9 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
 bb:
@@ -36,7 +37,8 @@ bb:
   %tmp15 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 0
   %tmp16 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp15, align 16, !tbaa !0
   %tmp17 = add i32 %arg5, %arg7
-  %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp16, i32 0, i32 %tmp17)
+  %tmp16.cast = bitcast <16 x i8> %tmp16 to <4 x i32>
+  %tmp18 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp16.cast, i32 %tmp17, i32 0, i1 false, i1 false)
   %tmp19 = extractelement <4 x float> %tmp18, i32 0
   %tmp20 = extractelement <4 x float> %tmp18, i32 1
   %tmp21 = extractelement <4 x float> %tmp18, i32 2
@@ -180,39 +182,39 @@ bb24:                                             ; preds = %bb157, %bb
   br i1 %tmp155, label %bb156, label %bb157
 
 bb156:                                            ; preds = %bb24
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp12, float %tmp103, float %tmp102, float %tmp101)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 33, i32 0, float %tmp99, float %tmp98, float %tmp97, float %tmp95)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 34, i32 0, float %tmp94, float %tmp93, float %tmp91, float %tmp90)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 35, i32 0, float %tmp89, float %tmp87, float %tmp86, float %tmp85)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 36, i32 0, float %tmp83, float %tmp82, float %tmp81, float %tmp79)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 37, i32 0, float %tmp78, float %tmp77, float %tmp75, float %tmp74)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 38, i32 0, float %tmp73, float %tmp71, float %tmp70, float %tmp69)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 39, i32 0, float %tmp67, float %tmp66, float %tmp65, float %tmp63)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 40, i32 0, float %tmp62, float %tmp61, float %tmp59, float %tmp58)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 41, i32 0, float %tmp57, float %tmp55, float %tmp54, float %tmp53)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 42, i32 0, float %tmp51, float %tmp50, float %tmp49, float %tmp47)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 43, i32 0, float %tmp46, float %tmp45, float %tmp43, float %tmp42)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 44, i32 0, float %tmp41, float %tmp39, float %tmp38, float %tmp37)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 45, i32 0, float %tmp35, float %tmp34, float %tmp33, float %tmp31)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 46, i32 0, float %tmp30, float %tmp29, float %tmp27, float %tmp26)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 47, i32 0, float %tmp25, float %tmp28, float %tmp32, float %tmp36)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 48, i32 0, float %tmp40, float %tmp44, float %tmp48, float %tmp52)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 49, i32 0, float %tmp56, float %tmp60, float %tmp64, float %tmp68)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 50, i32 0, float %tmp72, float %tmp76, float %tmp80, float %tmp84)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 51, i32 0, float %tmp88, float %tmp92, float %tmp96, float %tmp100)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 52, i32 0, float %tmp104, float %tmp105, float %tmp106, float %tmp108)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 53, i32 0, float %tmp109, float %tmp110, float %tmp111, float %tmp112)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 54, i32 0, float %tmp113, float %tmp114, float %tmp115, float %tmp116)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 55, i32 0, float %tmp117, float %tmp118, float %tmp119, float %tmp120)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 56, i32 0, float %tmp121, float %tmp122, float %tmp123, float %tmp124)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 57, i32 0, float %tmp125, float %tmp126, float %tmp127, float %tmp128)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 58, i32 0, float %tmp129, float %tmp130, float %tmp131, float %tmp132)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 59, i32 0, float %tmp133, float %tmp134, float %tmp135, float %tmp136)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 60, i32 0, float %tmp137, float %tmp138, float %tmp139, float %tmp140)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 61, i32 0, float %tmp141, float %tmp142, float %tmp143, float %tmp144)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 62, i32 0, float %tmp145, float %tmp146, float %tmp147, float %tmp148)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 63, i32 0, float %tmp149, float %tmp150, float %tmp151, float %tmp13)
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22)
+  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp12, float %tmp103, float %tmp102, float %tmp101, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 33, i32 15, float %tmp99, float %tmp98, float %tmp97, float %tmp95, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 34, i32 15, float %tmp94, float %tmp93, float %tmp91, float %tmp90, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 35, i32 15, float %tmp89, float %tmp87, float %tmp86, float %tmp85, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 36, i32 15, float %tmp83, float %tmp82, float %tmp81, float %tmp79, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 37, i32 15, float %tmp78, float %tmp77, float %tmp75, float %tmp74, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 38, i32 15, float %tmp73, float %tmp71, float %tmp70, float %tmp69, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 39, i32 15, float %tmp67, float %tmp66, float %tmp65, float %tmp63, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 40, i32 15, float %tmp62, float %tmp61, float %tmp59, float %tmp58, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 41, i32 15, float %tmp57, float %tmp55, float %tmp54, float %tmp53, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 42, i32 15, float %tmp51, float %tmp50, float %tmp49, float %tmp47, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 43, i32 15, float %tmp46, float %tmp45, float %tmp43, float %tmp42, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 44, i32 15, float %tmp41, float %tmp39, float %tmp38, float %tmp37, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 45, i32 15, float %tmp35, float %tmp34, float %tmp33, float %tmp31, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 46, i32 15, float %tmp30, float %tmp29, float %tmp27, float %tmp26, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 47, i32 15, float %tmp25, float %tmp28, float %tmp32, float %tmp36, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 48, i32 15, float %tmp40, float %tmp44, float %tmp48, float %tmp52, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 49, i32 15, float %tmp56, float %tmp60, float %tmp64, float %tmp68, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 50, i32 15, float %tmp72, float %tmp76, float %tmp80, float %tmp84, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 51, i32 15, float %tmp88, float %tmp92, float %tmp96, float %tmp100, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 52, i32 15, float %tmp104, float %tmp105, float %tmp106, float %tmp108, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 53, i32 15, float %tmp109, float %tmp110, float %tmp111, float %tmp112, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 54, i32 15, float %tmp113, float %tmp114, float %tmp115, float %tmp116, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 55, i32 15, float %tmp117, float %tmp118, float %tmp119, float %tmp120, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 56, i32 15, float %tmp121, float %tmp122, float %tmp123, float %tmp124, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 57, i32 15, float %tmp125, float %tmp126, float %tmp127, float %tmp128, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 58, i32 15, float %tmp129, float %tmp130, float %tmp131, float %tmp132, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 59, i32 15, float %tmp133, float %tmp134, float %tmp135, float %tmp136, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 60, i32 15, float %tmp137, float %tmp138, float %tmp139, float %tmp140, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 61, i32 15, float %tmp141, float %tmp142, float %tmp143, float %tmp144, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 62, i32 15, float %tmp145, float %tmp146, float %tmp147, float %tmp148, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 63, i32 15, float %tmp149, float %tmp150, float %tmp151, float %tmp13, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp19, float %tmp20, float %tmp21, float %tmp22, i1 true, i1 false) #0
   ret void
 
 bb157:                                            ; preds = %bb24
@@ -483,18 +485,15 @@ bb157:                                            ; preds = %bb24
   br label %bb24
 }
 
-; Function Attrs: nounwind readnone
-declare float @llvm.SI.load.const(<16 x i8>, i32) #1
-
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #1
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
 declare i32 @llvm.amdgcn.mbcnt.lo(i32, i32) #1
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+
+declare float @llvm.SI.load.const(<16 x i8>, i32) #1
+declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #2
 
 attributes #0 = { nounwind }
 attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
 
 !0 = !{!1, !1, i64 0, i32 1}
 !1 = !{!"const", !2}
diff --git a/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll b/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
index ad7521a3da9b09e939218fbbde7a9c07f78dbf78..8d66c346ed5b8314c64488ef5eb08f2817f77bfe 100644
--- a/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
+++ b/test/CodeGen/AMDGPU/vi-removed-intrinsics.ll
@@ -1,10 +1,10 @@
 ; RUN: not llc -march=amdgcn -mcpu=tonga < %s 2>&1 | FileCheck -check-prefix=ERROR %s
 
-; ERROR: error: :1:42: in function rsq_legacy_f32 void (float addrspace(1)*, float): intrinsic not supported on subtarget
+; ERROR: error: foo.cl:1:42: in function rsq_legacy_f32 void (float addrspace(1)*, float): intrinsic not supported on subtarget
 
 declare float @llvm.amdgcn.rsq.legacy(float) #0
 
-define void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
+define amdgpu_kernel void @rsq_legacy_f32(float addrspace(1)* %out, float %src) #1 {
   %rsq = call float @llvm.amdgcn.rsq.legacy(float %src), !dbg !4
   store float %rsq, float addrspace(1)* %out, align 4
   ret void
@@ -21,4 +21,4 @@ attributes #1 = { nounwind }
 !2 = !{i32 2, !"Dwarf Version", i32 4}
 !3 = !{i32 2, !"Debug Info Version", i32 3}
 !4 = !DILocation(line: 1, column: 42, scope: !5)
-!5 = distinct !DISubprogram(name: "rsq_legacy_f32", scope: null, line: 1, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0)
+!5 = distinct !DISubprogram(name: "rsq_legacy_f32", scope: null, file: !1, line: 1, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0)
diff --git a/test/CodeGen/AMDGPU/vop-shrink.ll b/test/CodeGen/AMDGPU/vop-shrink.ll
index ae8ec58270c1eeb67680684937bd728472a13649..d2708b068eb42f7a48c9f29fd0d518ba18a68874 100644
--- a/test/CodeGen/AMDGPU/vop-shrink.ll
+++ b/test/CodeGen/AMDGPU/vop-shrink.ll
@@ -8,7 +8,7 @@
 
 ; ModuleID = 'vop-shrink.ll'
 
-define void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) {
+define amdgpu_kernel void @sub_rev(i32 addrspace(1)* %out, <4 x i32> %sgpr, i32 %cond) {
 entry:
   %vgpr = call i32 @llvm.amdgcn.workitem.id.x() #1
   %tmp = icmp eq i32 %cond, 0
@@ -35,7 +35,7 @@ endif:                                            ; preds = %else, %if
 
 ; FUNC-LABEL: {{^}}add_fold:
 ; SI: v_add_f32_e32 v{{[0-9]+}}, 0x44800000
-define void @add_fold(float addrspace(1)* %out) {
+define amdgpu_kernel void @add_fold(float addrspace(1)* %out) {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = uitofp i32 %tmp to float
diff --git a/test/CodeGen/AMDGPU/vselect.ll b/test/CodeGen/AMDGPU/vselect.ll
index fe5be7526b196504394673c1ee3e7863be9eacf5..bb6234729f90bf0f9bd121c67de37c1411ec96be 100644
--- a/test/CodeGen/AMDGPU/vselect.ll
+++ b/test/CodeGen/AMDGPU/vselect.ll
@@ -10,7 +10,7 @@
 ; SI: v_cndmask_b32_e64
 ; SI: v_cndmask_b32_e32
 
-define void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) {
+define amdgpu_kernel void @test_select_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1, <2 x i32> %val) {
 entry:
   %load0 = load <2 x i32>, <2 x i32> addrspace(1)* %in0
   %load1 = load <2 x i32>, <2 x i32> addrspace(1)* %in1
@@ -28,7 +28,7 @@ entry:
 ;SI: v_cndmask_b32_e64
 ;SI: v_cndmask_b32_e32
 
-define void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
+define amdgpu_kernel void @test_select_v2f32(<2 x float> addrspace(1)* %out, <2 x float> addrspace(1)* %in0, <2 x float> addrspace(1)* %in1) {
 entry:
   %0 = load <2 x float>, <2 x float> addrspace(1)* %in0
   %1 = load <2 x float>, <2 x float> addrspace(1)* %in1
@@ -52,7 +52,7 @@ entry:
 ; SI: v_cndmask_b32
 ; SI: v_cndmask_b32
 
-define void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) {
+define amdgpu_kernel void @test_select_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1, <4 x i32> %val) {
 entry:
   %load0 = load <4 x i32>, <4 x i32> addrspace(1)* %in0
   %load1 = load <4 x i32>, <4 x i32> addrspace(1)* %in1
@@ -68,7 +68,7 @@ entry:
 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 ;EG: CNDE_INT {{\** *}}T{{[0-9]+\.[XYZW], PV\.[XYZW], T[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}}
 
-define void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
+define amdgpu_kernel void @test_select_v4f32(<4 x float> addrspace(1)* %out, <4 x float> addrspace(1)* %in0, <4 x float> addrspace(1)* %in1) {
 entry:
   %0 = load <4 x float>, <4 x float> addrspace(1)* %in0
   %1 = load <4 x float>, <4 x float> addrspace(1)* %in1
diff --git a/test/CodeGen/AMDGPU/vselect64.ll b/test/CodeGen/AMDGPU/vselect64.ll
index ef85ebe7899f11616616993ec020df56400e874c..4a043556516190fd4a415c6214077cb3145c9258 100644
--- a/test/CodeGen/AMDGPU/vselect64.ll
+++ b/test/CodeGen/AMDGPU/vselect64.ll
@@ -5,7 +5,7 @@
 ; Make sure the vectors aren't being stored on the stack.  We know they are
 ; being stored on the stack if the shaders uses at leat 10 registers.
 ; CHECK-NOT: {{\**}} MOV T{{[0-9][0-9]}}.X
-define void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) {
+define amdgpu_kernel void @test_select_v4i64(<4 x i64> addrspace(1)* %out, <4 x i32> %c) {
 entry:
        %cmp = icmp ne  <4 x i32> %c, <i32 0, i32 0, i32 0, i32 0>
        %result = select <4 x i1> %cmp, <4 x i64> <i64 0, i64 1, i64 2, i64 3>, <4 x i64> <i64 4, i64 5, i64 6, i64 7>
diff --git a/test/CodeGen/AMDGPU/vtx-fetch-branch.ll b/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
index 4584d6e25254dcd713dda206b2a87ae63b986b07..4c5eb3d3aa5d81c0d50f5950236685a049fd83d8 100644
--- a/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
+++ b/test/CodeGen/AMDGPU/vtx-fetch-branch.ll
@@ -10,7 +10,7 @@
 ; CHECK-NOT: ALU_POP_AFTER
 ; CHECK: TEX
 ; CHECK-NEXT: POP
-define void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 addrspace(1)* %in, i32 %cond) {
 entry:
   %0 = icmp eq i32 %cond, 0
   br i1 %0, label %endif, label %if
diff --git a/test/CodeGen/AMDGPU/vtx-schedule.ll b/test/CodeGen/AMDGPU/vtx-schedule.ll
index 912e258ebb8356bbba44d1d8c8aae1fc671f52c5..c4b619bf168f2b64b406162d444ac4d5b41d7360 100644
--- a/test/CodeGen/AMDGPU/vtx-schedule.ll
+++ b/test/CodeGen/AMDGPU/vtx-schedule.ll
@@ -9,7 +9,7 @@
 ; CHECK: VTX_READ_32 [[IN0:T[0-9]+\.X]], [[IN0]], 0
 ; CHECK: Fetch clause
 ; CHECK: VTX_READ_32 [[IN1:T[0-9]+\.X]], [[IN1]], 0
-define void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) {
+define amdgpu_kernel void @test(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* addrspace(1)* nocapture %in0) {
 entry:
   %0 = load i32 addrspace(1)*, i32 addrspace(1)* addrspace(1)* %in0
   %1 = load i32, i32 addrspace(1)* %0
diff --git a/test/CodeGen/AMDGPU/wait.ll b/test/CodeGen/AMDGPU/wait.ll
index 621c582fcefdf75d80461afa270d1631c8c0f84d..623cbeae8da9c98c34fc694dc87e32e5fb4494ce 100644
--- a/test/CodeGen/AMDGPU/wait.ll
+++ b/test/CodeGen/AMDGPU/wait.ll
@@ -11,26 +11,27 @@
 ; DEFAULT: exp
 ; DEFAULT: s_waitcnt lgkmcnt(0)
 ; DEFAULT: s_endpgm
-define amdgpu_vs void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) {
+define amdgpu_vs void @main(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, <16 x i8> addrspace(2)* inreg %arg3, <16 x i8> addrspace(2)* inreg %arg4, i32 inreg %arg5, i32 %arg6, i32 %arg7, i32 %arg8, i32 %arg9, float addrspace(2)* inreg %constptr) #0 {
 main_body:
   %tmp = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 0
   %tmp10 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, !tbaa !0
-  %tmp11 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp10, i32 0, i32 %arg6)
+  %tmp10.cast = bitcast <16 x i8> %tmp10 to <4 x i32>
+  %tmp11 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp10.cast, i32 %arg6, i32 0, i1 false, i1 false)
   %tmp12 = extractelement <4 x float> %tmp11, i32 0
   %tmp13 = extractelement <4 x float> %tmp11, i32 1
   call void @llvm.amdgcn.s.barrier() #1
   %tmp14 = extractelement <4 x float> %tmp11, i32 2
-;  %tmp15 = extractelement <4 x float> %tmp11, i32 3
-  %tmp15 = load float, float addrspace(2)* %constptr, align 4 ; Force waiting for expcnt and lgkmcnt
+  %tmp15 = load float, float addrspace(2)* %constptr, align 4
   %tmp16 = getelementptr <16 x i8>, <16 x i8> addrspace(2)* %arg3, i32 1
   %tmp17 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp16, !tbaa !0
-  %tmp18 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %tmp17, i32 0, i32 %arg6)
+  %tmp17.cast = bitcast <16 x i8> %tmp17 to <4 x i32>
+  %tmp18 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp17.cast, i32 %arg6, i32 0, i1 false, i1 false)
   %tmp19 = extractelement <4 x float> %tmp18, i32 0
   %tmp20 = extractelement <4 x float> %tmp18, i32 1
   %tmp21 = extractelement <4 x float> %tmp18, i32 2
   %tmp22 = extractelement <4 x float> %tmp18, i32 3
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %tmp19, float %tmp20, float %tmp21, float %tmp22)
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %tmp12, float %tmp13, float %tmp14, float %tmp15)
+  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp19, float %tmp20, float %tmp21, float %tmp22, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp12, float %tmp13, float %tmp14, float %tmp15, i1 true, i1 false) #0
   ret void
 }
 
@@ -41,45 +42,42 @@ main_body:
 ; ILPMAX: s_load_dwordx4
 ; ILPMAX: s_waitcnt lgkmcnt(0)
 ; ILPMAX: buffer_load
-; ILPMAX: s_waitcnt vmcnt(1)
 ; ILPMAX: s_waitcnt vmcnt(0)
+; ILPMAX: exp pos0
+; ILPMAX-NEXT: exp param0
 ; ILPMAX: s_endpgm
-
-define amdgpu_vs void @main2([6 x <16 x i8>] addrspace(2)* byval, [17 x <16 x i8>] addrspace(2)* byval, [17 x <4 x i32>] addrspace(2)* byval, [34 x <8 x i32>] addrspace(2)* byval, [16 x <16 x i8>] addrspace(2)*
-byval, i32 inreg, i32 inreg, i32, i32, i32, i32) {
+define amdgpu_vs void @main2([6 x <16 x i8>] addrspace(2)* byval %arg, [17 x <16 x i8>] addrspace(2)* byval %arg1, [17 x <4 x i32>] addrspace(2)* byval %arg2, [34 x <8 x i32>] addrspace(2)* byval %arg3, [16 x <16 x i8>] addrspace(2)* byval %arg4, i32 inreg %arg5, i32 inreg %arg6, i32 %arg7, i32 %arg8, i32 %arg9, i32 %arg10) #0 {
 main_body:
-  %11 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 0
-  %12 = load <16 x i8>, <16 x i8> addrspace(2)* %11, align 16, !tbaa !0
-  %13 = add i32 %5, %7
-  %14 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %12, i32 0, i32 %13)
-  %15 = extractelement <4 x float> %14, i32 0
-  %16 = extractelement <4 x float> %14, i32 1
-  %17 = extractelement <4 x float> %14, i32 2
-  %18 = extractelement <4 x float> %14, i32 3
-  %19 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %4, i64 0, i64 1
-  %20 = load <16 x i8>, <16 x i8> addrspace(2)* %19, align 16, !tbaa !0
-  %21 = add i32 %5, %7
-  %22 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> %20, i32 0, i32 %21)
-  %23 = extractelement <4 x float> %22, i32 0
-  %24 = extractelement <4 x float> %22, i32 1
-  %25 = extractelement <4 x float> %22, i32 2
-  %26 = extractelement <4 x float> %22, i32 3
-  call void @llvm.SI.export(i32 15, i32 0, i32 1, i32 12, i32 0, float %15, float %16, float %17, float %18)
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %23, float %24, float %25, float %26)
+  %tmp = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 0
+  %tmp11 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp, align 16, !tbaa !0
+  %tmp12 = add i32 %arg5, %arg7
+  %tmp11.cast = bitcast <16 x i8> %tmp11 to <4 x i32>
+  %tmp13 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp11.cast, i32 %tmp12, i32 0, i1 false, i1 false)
+  %tmp14 = extractelement <4 x float> %tmp13, i32 0
+  %tmp15 = extractelement <4 x float> %tmp13, i32 1
+  %tmp16 = extractelement <4 x float> %tmp13, i32 2
+  %tmp17 = extractelement <4 x float> %tmp13, i32 3
+  %tmp18 = getelementptr [16 x <16 x i8>], [16 x <16 x i8>] addrspace(2)* %arg4, i64 0, i64 1
+  %tmp19 = load <16 x i8>, <16 x i8> addrspace(2)* %tmp18, align 16, !tbaa !0
+  %tmp20 = add i32 %arg5, %arg7
+  %tmp19.cast = bitcast <16 x i8> %tmp19 to <4 x i32>
+  %tmp21 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %tmp19.cast, i32 %tmp20, i32 0, i1 false, i1 false)
+  %tmp22 = extractelement <4 x float> %tmp21, i32 0
+  %tmp23 = extractelement <4 x float> %tmp21, i32 1
+  %tmp24 = extractelement <4 x float> %tmp21, i32 2
+  %tmp25 = extractelement <4 x float> %tmp21, i32 3
+  call void @llvm.amdgcn.exp.f32(i32 12, i32 15, float %tmp14, float %tmp15, float %tmp16, float %tmp17, i1 false, i1 false) #0
+  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp22, float %tmp23, float %tmp24, float %tmp25, i1 true, i1 false) #0
   ret void
 }
 
-
-; Function Attrs: convergent nounwind
 declare void @llvm.amdgcn.s.barrier() #1
+declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #2
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
 
-; Function Attrs: nounwind readnone
-declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #2
-
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
-
+attributes #0 = { nounwind }
 attributes #1 = { convergent nounwind }
-attributes #2 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
 
 !0 = !{!1, !1, i64 0, i32 1}
 !1 = !{!"const", !2}
diff --git a/test/CodeGen/AMDGPU/waitcnt-flat.ll b/test/CodeGen/AMDGPU/waitcnt-flat.ll
index d29bae45d8c23185247359e29b312e9f42d0ea5d..5d86b12da95fd177cf2edea9a3134762fba325c3 100644
--- a/test/CodeGen/AMDGPU/waitcnt-flat.ll
+++ b/test/CodeGen/AMDGPU/waitcnt-flat.ll
@@ -9,7 +9,7 @@
 ; XGCN: flat_store_dword v[{{[0-9]+:[0-9]+}}], [[DATA:v[0-9]+]]
 ; XGCN: s_waitcnt vmcnt(0) lgkmcnt(0)
 ; XGCN: flat_load_dword [[DATA]], v[{{[0-9]+:[0-9]+}}]
-define void @test(i32 addrspace(1)* %out, i32 %in) {
+define amdgpu_kernel void @test(i32 addrspace(1)* %out, i32 %in) {
   store volatile i32 0, i32 addrspace(1)* %out
   %val = load volatile i32, i32 addrspace(1)* %out
   ret void
diff --git a/test/CodeGen/AMDGPU/waitcnt.mir b/test/CodeGen/AMDGPU/waitcnt.mir
index cb5de6a2419d434a4e5f78ef2229e1f8d6ab8ce1..38662e83b359dd79015a02da14122a3d6b4052c2 100644
--- a/test/CodeGen/AMDGPU/waitcnt.mir
+++ b/test/CodeGen/AMDGPU/waitcnt.mir
@@ -1,12 +1,21 @@
 # RUN: llc -march=amdgcn -mcpu=fiji -run-pass si-insert-waits  %s -o - | FileCheck %s
 
 --- |
-  define void @flat_zero_waitcnt(i32 addrspace(1)* %global4,
+  define amdgpu_kernel void @flat_zero_waitcnt(i32 addrspace(1)* %global4,
                                  <4 x i32> addrspace(1)* %global16,
                                  i32 addrspace(4)* %flat4,
                                  <4 x i32> addrspace(4)* %flat16) {
     ret void
   }
+
+  define amdgpu_kernel void @single_fallthrough_successor_no_end_block_wait() {
+    ret void
+  }
+
+  define amdgpu_kernel void @single_branch_successor_not_next_block() {
+    ret void
+  }
+
 ...
 ---
 
@@ -21,18 +30,21 @@
 
 # CHECK-LABEL: bb.1:
 # CHECK: FLAT_LOAD_DWORD
+# CHECK: S_WAITCNT 3952
 # CHECK: FLAT_LOAD_DWORDX4
 # The first load has no mem operand, so we should assume it accesses the flat
 # address space.
 # s_waitcnt vmcnt(0) lgkmcnt(0)
-# CHECK-NEXT: S_WAITCNT 112
+# CHECK-NEXT: S_WAITCNT 127
 
 # CHECK-LABEL: bb.2:
 # CHECK: FLAT_LOAD_DWORD
+# CHECK: S_WAITCNT 3952
 # CHECK: FLAT_LOAD_DWORDX4
+
 # One outstand loads access the flat address space.
 # s_waitcnt vmcnt(0) lgkmcnt(0)
-# CHECK-NEXT: S_WAITCNT 112
+# CHECK-NEXT: S_WAITCNT 127
 
 name: flat_zero_waitcnt
 
@@ -57,3 +69,60 @@ body: |
     %vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec
     S_ENDPGM
 ...
+---
+# There is only a single fallthrough successor block, so there's no
+# need to wait immediately.
+
+# CHECK-LABEL: name: single_fallthrough_successor_no_end_block_wait
+# CHECK:   %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2
+# CHECK-NOT: S_WAITCNT
+
+# CHECK: bb.1:
+# CHECK-NEXT: V_LSHLREV_B64
+# CHECK-NEXT: S_WAITCNT 112
+# CHECK-NEXT: FLAT_STORE_DWORD
+name: single_fallthrough_successor_no_end_block_wait
+
+body: |
+  bb.0:
+    successors: %bb.1
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr
+
+  bb.1:
+    %vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec
+    FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
+---
+# The block has a single predecessor with a single successor, but it
+# is not the next block so it's non-obvious that the wait is not needed.
+
+
+# CHECK-LABEL: name: single_branch_successor_not_next_block
+# CHECK:   %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2
+# CHECK-NEXT: S_WAITCNT 112
+
+# CHECK: bb.1
+# CHECK-NEXT: FLAT_STORE_DWORD
+# CHECK-NEXT: S_ENDPGM
+
+# CHECK: bb.2:
+# CHECK-NEXT: V_LSHLREV_B64
+# CHECK-NEXT: FLAT_STORE_DWORD
+name: single_branch_successor_not_next_block
+
+body: |
+  bb.0:
+    successors: %bb.2
+    %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr
+   S_BRANCH %bb.2
+
+  bb.1:
+    FLAT_STORE_DWORD %vgpr8_vgpr9, %vgpr10, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+
+  bb.2:
+     %vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec
+    FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr
+    S_ENDPGM
+...
diff --git a/test/CodeGen/AMDGPU/wqm.ll b/test/CodeGen/AMDGPU/wqm.ll
index 3f7b2b284c535e9805d5347e2893574d1af03231..9f277b2c9a59da7620e1ddfae46888e595528caf 100644
--- a/test/CodeGen/AMDGPU/wqm.ll
+++ b/test/CodeGen/AMDGPU/wqm.ll
@@ -1,5 +1,5 @@
-;RUN: llc < %s -march=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=SI
-;RUN: llc < %s -march=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s --check-prefix=CHECK --check-prefix=VI
+; RUN: llc -march=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=CHECK -check-prefix=VI %s
 
 ; Check that WQM isn't triggered by image load/store intrinsics.
 ;
@@ -18,16 +18,14 @@ main_body:
 ;CHECK-NEXT: ; %main_body
 ;CHECK-NEXT: s_wqm_b64 exec, exec
 ;CHECK-NOT: exec
-define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x i32> %c) {
+define amdgpu_ps void @test2(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <4 x float> %c) {
 main_body:
-  %c.1 = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %c.1 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
   %c.2 = bitcast <4 x float> %c.1 to <4 x i32>
   %c.3 = extractelement <4 x i32> %c.2, i32 0
   %gep = getelementptr float, float addrspace(1)* %ptr, i32 %c.3
   %data = load float, float addrspace(1)* %gep
-
-  call void @llvm.SI.export(i32 15, i32 1, i32 1, i32 0, i32 1, float %data, float undef, float undef, float undef)
-
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float %data, float undef, float undef, float undef, i1 true, i1 true) #1
   ret void
 }
 
@@ -42,9 +40,9 @@ main_body:
 ;CHECK: store
 ;CHECK-NOT: exec
 ;CHECK: .size test3
-define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x i32> %c) {
+define amdgpu_ps <4 x float> @test3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, <4 x float> %c) {
 main_body:
-  %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
   %tex.1 = bitcast <4 x float> %tex to <4 x i32>
   %tex.2 = extractelement <4 x i32> %tex.1, i32 0
 
@@ -70,10 +68,9 @@ main_body:
   %c.1 = mul i32 %c, %d
 
   call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> undef, <4 x i32> undef, i32 %c.1, i32 0, i1 0, i1 0)
-
-  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
-  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %c.1.bc = bitcast i32 %c.1 to float
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.1.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
   ret <4 x float> %dtex
 }
 
@@ -101,9 +98,9 @@ main_body:
   br i1 %cmp, label %IF, label %ELSE
 
 IF:
-  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
-  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %c.bc = bitcast i32 %c to float
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
   %data.if = extractelement <4 x float> %dtex, i32 0
   br label %END
 
@@ -143,9 +140,9 @@ main_body:
   br i1 %cmp, label %ELSE, label %IF
 
 IF:
-  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
-  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %c.bc = bitcast i32 %c to float
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
   %data.if = extractelement <4 x float> %dtex, i32 0
   br label %END
 
@@ -200,7 +197,8 @@ ELSE:
 
 END:
   %coord.END = phi i32 [ %coord.IF, %IF ], [ %coord.ELSE, %ELSE ]
-  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord.END, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %coord.END.bc = bitcast i32 %coord.END to float
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord.END.bc, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
   ret <4 x float> %tex
 }
 
@@ -215,13 +213,11 @@ END:
 ;CHECK: image_sample
 ;CHECK: v_cmp
 ;CHECK: store
-define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, i32 %coord) {
+define amdgpu_ps float @test_control_flow_3(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %coord) {
 main_body:
-  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
-  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
   %dtex.1 = extractelement <4 x float> %dtex, i32 0
-
   call void @llvm.amdgcn.buffer.store.f32(float %dtex.1, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
 
   %cc = fcmp ogt float %dtex.1, 0.0
@@ -254,7 +250,7 @@ END:
 ;CHECK: %END
 ;CHECK: image_sample
 ;CHECK: image_sample
-define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %coord, i32 %y, float %z) {
+define amdgpu_ps <4 x float> @test_control_flow_4(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float %coord, i32 %y, float %z) {
 main_body:
   %cond = icmp eq i32 %y, 0
   br i1 %cond, label %IF, label %END
@@ -265,9 +261,8 @@ IF:
   br label %END
 
 END:
-  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
-  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
   ret <4 x float> %dtex
 }
 
@@ -286,10 +281,9 @@ END:
 ;CHECK: buffer_store_dword
 ;CHECK: s_mov_b64 exec, [[SAVE]]
 ;CHECK: image_sample
-define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, i32 %coord, i32 %coord2, float %z) {
+define amdgpu_ps <4 x float> @test_kill_0(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, float addrspace(1)* inreg %ptr, <2 x i32> %idx, <2 x float> %data, float %coord, float %coord2, float %z) {
 main_body:
-  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
   %idx.0 = extractelement <2 x i32> %idx, i32 0
   %data.0 = extractelement <2 x float> %data, i32 0
   call void @llvm.amdgcn.buffer.store.f32(float %data.0, <4 x i32> undef, i32 %idx.0, i32 0, i1 0, i1 0)
@@ -299,10 +293,8 @@ main_body:
   %idx.1 = extractelement <2 x i32> %idx, i32 1
   %data.1 = extractelement <2 x float> %data, i32 1
   call void @llvm.amdgcn.buffer.store.f32(float %data.1, <4 x i32> undef, i32 %idx.1, i32 0, i1 0, i1 0)
-
-  %tex2 = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tex2.1 = bitcast <4 x float> %tex2 to <4 x i32>
-  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex2.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tex2 = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex2, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
   %out = fadd <4 x float> %tex, %dtex
 
   ret <4 x float> %out
@@ -320,11 +312,10 @@ main_body:
 ; CHECK: buffer_store_dword
 ; CHECK-NOT: wqm
 ; CHECK: v_cmpx_
-define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, i32 %coord, i32 %coord2, float %z) {
+define amdgpu_ps <4 x float> @test_kill_1(<8 x i32> inreg %rsrc, <4 x i32> inreg %sampler, i32 %idx, float %data, float %coord, float %coord2, float %z) {
 main_body:
-  %tex = call <4 x float> @llvm.SI.image.sample.i32(i32 %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tex.1 = bitcast <4 x float> %tex to <4 x i32>
-  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.1, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %coord, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> %rsrc, <4 x i32> %sampler, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
 
   call void @llvm.amdgcn.buffer.store.f32(float %data, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
 
@@ -375,8 +366,7 @@ loop:
   br i1 %cc, label %break, label %body
 
 body:
-  %c.i = bitcast <4 x float> %c.iv to <4 x i32>
-  %c.next = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %c.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %c.next = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %c.iv, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
   %ctr.next = fadd float %ctr.iv, 2.0
   br label %loop
 
@@ -394,7 +384,7 @@ break:
 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
 ; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, 0
 ; CHECK: s_wqm_b64 exec, exec
-; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+$}}
+; CHECK: buffer_store_dword {{v[0-9]+}}, off, {{s\[[0-9]+:[0-9]+\]}}, {{s[0-9]+}} offset:4{{$}}
 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
 ; CHECK: buffer_store_dword {{v[0-9]+}}, {{v[0-9]+}}, {{s\[[0-9]+:[0-9]+\]}}, 0 idxen
 ; CHECK: s_wqm_b64 exec, exec
@@ -416,9 +406,8 @@ entry:
 
   %c.gep = getelementptr [32 x i32], [32 x i32]* %array, i32 0, i32 %idx
   %c = load i32, i32* %c.gep, align 4
-
-  %t = call <4 x float> @llvm.SI.image.sample.i32(i32 %c, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-
+  %c.bc = bitcast i32 %c to float
+  %t = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float %c.bc, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
   call void @llvm.amdgcn.buffer.store.v4f32(<4 x float> %t, <4 x i32> undef, i32 0, i32 0, i1 0, i1 0)
 
   ret void
@@ -436,9 +425,8 @@ entry:
 ; CHECK: s_and_b64 exec, exec, [[LIVE]]
 ; CHECK-NOT: exec
 define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
-  %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tex.i = bitcast <4 x float> %tex to <4 x i32>
-  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
   ret <4 x float> %dtex
 }
 
@@ -450,10 +438,8 @@ define amdgpu_ps <4 x float> @test_nonvoid_return() nounwind {
 ; CHECK-NOT: exec
 define amdgpu_ps <4 x float> @test_nonvoid_return_unreachable(i32 inreg %c) nounwind {
 entry:
-  %tex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-  %tex.i = bitcast <4 x float> %tex to <4 x i32>
-  %dtex = call <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32> %tex.i, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
-
+  %tex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> undef, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
+  %dtex = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float> %tex, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
   %cc = icmp sgt i32 %c, 0
   br i1 %cc, label %if, label %else
 
@@ -485,35 +471,29 @@ main_body:
   br i1 %cc, label %if, label %else
 
 if:
-  %r.if = call <4 x float> @llvm.SI.image.sample.i32(i32 0, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r.if = call <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float 0.0, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
   br label %end
 
 else:
-  %r.else = call <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32> <i32 0, i32 1>, <8 x i32> undef, <4 x i32> undef, i32 15, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0)
+  %r.else = call <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float> <float 0.0, float bitcast (i32 1 to float)>, <8 x i32> undef, <4 x i32> undef, i32 15, i1 false, i1 false, i1 false, i1 false, i1 false) #0
   br label %end
 
 end:
   %r = phi <4 x float> [ %r.if, %if ], [ %r.else, %else ]
-
   call void @llvm.amdgcn.buffer.store.f32(float 1.0, <4 x i32> undef, i32 %idx, i32 0, i1 0, i1 0)
-
   ret <4 x float> %r
 }
 
-
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1
 declare void @llvm.amdgcn.image.store.v4f32.v4i32.v8i32(<4 x float>, <4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
-declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #1
-declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #1
-
-declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
-declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #2
-
-declare <4 x float> @llvm.SI.image.sample.i32(i32, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
-declare <4 x float> @llvm.SI.image.sample.v2i32(<2 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
-declare <4 x float> @llvm.SI.image.sample.v4i32(<4 x i32>, <8 x i32>, <4 x i32>, i32, i32, i32, i32, i32, i32, i32, i32) #3
-
-declare void @llvm.AMDGPU.kill(float)
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float)
+declare void @llvm.amdgcn.buffer.store.f32(float, <4 x i32>, i32, i32, i1, i1) #2
+declare void @llvm.amdgcn.buffer.store.v4f32(<4 x float>, <4 x i32>, i32, i32, i1, i1) #2
+declare <4 x float> @llvm.amdgcn.image.load.v4f32.v4i32.v8i32(<4 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #3
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #3
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.f32.v8i32(float, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v2f32.v8i32(<2 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
+declare <4 x float> @llvm.amdgcn.image.sample.v4f32.v4f32.v8i32(<4 x float>, <8 x i32>, <4 x i32>, i32, i1, i1, i1, i1, i1) #3
+declare void @llvm.AMDGPU.kill(float) #1
 
 attributes #1 = { nounwind }
 attributes #2 = { nounwind readonly }
diff --git a/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll b/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
index deac809f9b055670a2a0db8f0114add0b914eefc..b1ee016e99c9c6362930d746f0001bd062123aee 100644
--- a/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
+++ b/test/CodeGen/AMDGPU/write-register-vgpr-into-sgpr.ll
@@ -10,7 +10,7 @@ declare void @llvm.write_register.i32(metadata, i32) #0
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
 
-define void @write_vgpr_into_sgpr() {
+define amdgpu_kernel void @write_vgpr_into_sgpr() {
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
   call void @llvm.write_register.i32(metadata !0, i32 %tid)
   ret void
diff --git a/test/CodeGen/AMDGPU/write_register.ll b/test/CodeGen/AMDGPU/write_register.ll
index 88660ba6ec6a8a868b74b96d5612056ce1138579..9c62e003dde0f75d21b7144f7c8597d29e7b373f 100644
--- a/test/CodeGen/AMDGPU/write_register.ll
+++ b/test/CodeGen/AMDGPU/write_register.ll
@@ -4,7 +4,7 @@ declare void @llvm.write_register.i32(metadata, i32) #0
 declare void @llvm.write_register.i64(metadata, i64) #0
 
 ; CHECK-LABEL: {{^}}test_write_m0:
-define void @test_write_m0(i32 %val) #0 {
+define amdgpu_kernel void @test_write_m0(i32 %val) #0 {
   call void @llvm.write_register.i32(metadata !0, i32 0)
   call void @llvm.write_register.i32(metadata !0, i32 -1)
   call void @llvm.write_register.i32(metadata !0, i32 %val)
@@ -15,7 +15,7 @@ define void @test_write_m0(i32 %val) #0 {
 ; CHECK: s_mov_b64 exec, 0
 ; CHECK: s_mov_b64 exec, -1
 ; CHECK: s_mov_b64 exec, s{{\[[0-9]+:[0-9]+\]}}
-define void @test_write_exec(i64 %val) #0 {
+define amdgpu_kernel void @test_write_exec(i64 %val) #0 {
   call void @llvm.write_register.i64(metadata !1, i64 0)
   call void @llvm.write_register.i64(metadata !1, i64 -1)
   call void @llvm.write_register.i64(metadata !1, i64 %val)
@@ -26,7 +26,7 @@ define void @test_write_exec(i64 %val) #0 {
 ; CHECK: s_mov_b64 flat_scratch, 0
 ; CHECK: s_mov_b64 flat_scratch, -1
 ; CHECK: s_mov_b64 flat_scratch, s{{\[[0-9]+:[0-9]+\]}}
-define void @test_write_flat_scratch(i64 %val) #0 {
+define amdgpu_kernel void @test_write_flat_scratch(i64 %val) #0 {
   call void @llvm.write_register.i64(metadata !2, i64 0)
   call void @llvm.write_register.i64(metadata !2, i64 -1)
   call void @llvm.write_register.i64(metadata !2, i64 %val)
@@ -36,7 +36,7 @@ define void @test_write_flat_scratch(i64 %val) #0 {
 ; CHECK-LABEL: {{^}}test_write_flat_scratch_lo:
 ; CHECK: s_mov_b32 flat_scratch_lo, 0
 ; CHECK: s_mov_b32 flat_scratch_lo, s{{[0-9]+}}
-define void @test_write_flat_scratch_lo(i32 %val) #0 {
+define amdgpu_kernel void @test_write_flat_scratch_lo(i32 %val) #0 {
   call void @llvm.write_register.i32(metadata !3, i32 0)
   call void @llvm.write_register.i32(metadata !3, i32 %val)
   ret void
@@ -45,7 +45,7 @@ define void @test_write_flat_scratch_lo(i32 %val) #0 {
 ; CHECK-LABEL: {{^}}test_write_flat_scratch_hi:
 ; CHECK: s_mov_b32 flat_scratch_hi, 0
 ; CHECK: s_mov_b32 flat_scratch_hi, s{{[0-9]+}}
-define void @test_write_flat_scratch_hi(i32 %val) #0 {
+define amdgpu_kernel void @test_write_flat_scratch_hi(i32 %val) #0 {
   call void @llvm.write_register.i32(metadata !4, i32 0)
   call void @llvm.write_register.i32(metadata !4, i32 %val)
   ret void
@@ -54,7 +54,7 @@ define void @test_write_flat_scratch_hi(i32 %val) #0 {
 ; CHECK-LABEL: {{^}}test_write_exec_lo:
 ; CHECK: s_mov_b32 exec_lo, 0
 ; CHECK: s_mov_b32 exec_lo, s{{[0-9]+}}
-define void @test_write_exec_lo(i32 %val) #0 {
+define amdgpu_kernel void @test_write_exec_lo(i32 %val) #0 {
   call void @llvm.write_register.i32(metadata !5, i32 0)
   call void @llvm.write_register.i32(metadata !5, i32 %val)
   ret void
@@ -63,7 +63,7 @@ define void @test_write_exec_lo(i32 %val) #0 {
 ; CHECK-LABEL: {{^}}test_write_exec_hi:
 ; CHECK: s_mov_b32 exec_hi, 0
 ; CHECK: s_mov_b32 exec_hi, s{{[0-9]+}}
-define void @test_write_exec_hi(i32 %val) #0 {
+define amdgpu_kernel void @test_write_exec_hi(i32 %val) #0 {
   call void @llvm.write_register.i32(metadata !6, i32 0)
   call void @llvm.write_register.i32(metadata !6, i32 %val)
   ret void
diff --git a/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll b/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
index 7f6b8045904731763a1f06914c514ab7539cee61..36532365d87183667098c4d77b419c7557b9c93d 100644
--- a/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
+++ b/test/CodeGen/AMDGPU/wrong-transalu-pos-fix.ll
@@ -4,7 +4,7 @@
 ;CHECK: {{^}}fill3d:
 ;CHECK-NOT: MULLO_INT T[0-9]+
 
-define void @fill3d(i32 addrspace(1)* nocapture %out) #0 {
+define amdgpu_kernel void @fill3d(i32 addrspace(1)* nocapture %out) #0 {
 entry:
   %x.i = tail call i32 @llvm.r600.read.global.size.x() #1
   %y.i18 = tail call i32 @llvm.r600.read.global.size.y() #1
diff --git a/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll b/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll
index babae9ead27caf09759bba266e8af3ff74c8e0ba..88ef9fd93c8f1f00d6db770cf0f82f0ef4377421 100644
--- a/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll
+++ b/test/CodeGen/AMDGPU/xfail.r600.bitcast.ll
@@ -5,7 +5,7 @@
 
 ; TODO: enable doubles
 ; FUNC-LABEL: {{^}}bitcast_f64_to_v2i32:
-define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
+define amdgpu_kernel void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace(1)* %in) {
   %val = load double, double addrspace(1)* %in, align 8
   %add = fadd double %val, 4.0
   %bc = bitcast double %add to <2 x i32>
@@ -14,7 +14,7 @@ define void @bitcast_f64_to_v2i32(<2 x i32> addrspace(1)* %out, double addrspace
 }
 
 ; FUNC-LABEL: {{^}}bitcast_v2i64_to_v2f64:
-define void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
+define amdgpu_kernel void @bitcast_v2i64_to_v2f64(i32 %cond, <2 x double> addrspace(1)* %out, <2 x i64> %value) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %end
@@ -30,7 +30,7 @@ end:
 }
 
 ; FUNC-LABEL: {{^}}bitcast_v2f64_to_v2i64:
-define void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
+define amdgpu_kernel void @bitcast_v2f64_to_v2i64(i32 %cond, <2 x i64> addrspace(1)* %out, <2 x double> %value) {
 entry:
   %cmp0 = icmp eq i32 %cond, 0
   br i1 %cmp0, label %if, label %end
diff --git a/test/CodeGen/AMDGPU/xor.ll b/test/CodeGen/AMDGPU/xor.ll
index bf02d4c3b31188aafde7e2e32dd64b19ba18844f..57a082a0170c35861bb3b9f5d1c2149783481f3b 100644
--- a/test/CodeGen/AMDGPU/xor.ll
+++ b/test/CodeGen/AMDGPU/xor.ll
@@ -10,7 +10,7 @@
 ; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_xor_b32_e32 v{{[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
+define amdgpu_kernel void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in0, <2 x i32> addrspace(1)* %in1) {
   %a = load <2 x i32>, <2 x i32> addrspace(1) * %in0
   %b = load <2 x i32>, <2 x i32> addrspace(1) * %in1
   %result = xor <2 x i32> %a, %b
@@ -29,7 +29,7 @@ define void @xor_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %in
 ; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 ; SI: v_xor_b32_e32 {{v[0-9]+, v[0-9]+, v[0-9]+}}
 
-define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
+define amdgpu_kernel void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in0, <4 x i32> addrspace(1)* %in1) {
   %a = load <4 x i32>, <4 x i32> addrspace(1) * %in0
   %b = load <4 x i32>, <4 x i32> addrspace(1) * %in1
   %result = xor <4 x i32> %a, %b
@@ -46,7 +46,7 @@ define void @xor_v4i32(<4 x i32> addrspace(1)* %out, <4 x i32> addrspace(1)* %in
 ; SI: v_cndmask_b32_e32 [[RESULT:v[0-9]+]], {{v[0-9]+}}, {{v[0-9]+}}
 ; SI: buffer_store_dword [[RESULT]]
 ; SI: s_endpgm
-define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
+define amdgpu_kernel void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float addrspace(1)* %in1) {
   %a = load float, float addrspace(1) * %in0
   %b = load float, float addrspace(1) * %in1
   %acmp = fcmp oge float %a, 0.000000e+00
@@ -63,7 +63,7 @@ define void @xor_i1(float addrspace(1)* %out, float addrspace(1)* %in0, float ad
 ; SI: v_xor_b32_e32 [[XOR:v[0-9]+]], [[A]], [[B]]
 ; SI: v_and_b32_e32 [[RESULT:v[0-9]+]], 1, [[XOR]]
 ; SI: buffer_store_byte [[RESULT]]
-define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
+define amdgpu_kernel void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace(1)* %in1) {
   %a = load volatile i1, i1 addrspace(1)* %in0
   %b = load volatile i1, i1 addrspace(1)* %in1
   %xor = xor i1 %a, %b
@@ -73,7 +73,7 @@ define void @v_xor_i1(i1 addrspace(1)* %out, i1 addrspace(1)* %in0, i1 addrspace
 
 ; FUNC-LABEL: {{^}}vector_xor_i32:
 ; SI: v_xor_b32_e32
-define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+define amdgpu_kernel void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
   %a = load i32, i32 addrspace(1)* %in0
   %b = load i32, i32 addrspace(1)* %in1
   %result = xor i32 %a, %b
@@ -83,7 +83,7 @@ define void @vector_xor_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32
 
 ; FUNC-LABEL: {{^}}scalar_xor_i32:
 ; SI: s_xor_b32
-define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
   %result = xor i32 %a, %b
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -91,7 +91,7 @@ define void @scalar_xor_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 
 ; FUNC-LABEL: {{^}}scalar_not_i32:
 ; SI: s_not_b32
-define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) {
+define amdgpu_kernel void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) {
   %result = xor i32 %a, -1
   store i32 %result, i32 addrspace(1)* %out
   ret void
@@ -99,7 +99,7 @@ define void @scalar_not_i32(i32 addrspace(1)* %out, i32 %a) {
 
 ; FUNC-LABEL: {{^}}vector_not_i32:
 ; SI: v_not_b32
-define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+define amdgpu_kernel void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
   %a = load i32, i32 addrspace(1)* %in0
   %b = load i32, i32 addrspace(1)* %in1
   %result = xor i32 %a, -1
@@ -111,7 +111,7 @@ define void @vector_not_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in0, i32
 ; SI: v_xor_b32_e32
 ; SI: v_xor_b32_e32
 ; SI: s_endpgm
-define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
+define amdgpu_kernel void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
   %a = load i64, i64 addrspace(1)* %in0
   %b = load i64, i64 addrspace(1)* %in1
   %result = xor i64 %a, %b
@@ -122,7 +122,7 @@ define void @vector_xor_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64
 ; FUNC-LABEL: {{^}}scalar_xor_i64:
 ; SI: s_xor_b64
 ; SI: s_endpgm
-define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %result = xor i64 %a, %b
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -130,7 +130,7 @@ define void @scalar_xor_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
 
 ; FUNC-LABEL: {{^}}scalar_not_i64:
 ; SI: s_not_b64
-define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) {
   %result = xor i64 %a, -1
   store i64 %result, i64 addrspace(1)* %out
   ret void
@@ -139,7 +139,7 @@ define void @scalar_not_i64(i64 addrspace(1)* %out, i64 %a) {
 ; FUNC-LABEL: {{^}}vector_not_i64:
 ; SI: v_not_b32
 ; SI: v_not_b32
-define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
+define amdgpu_kernel void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64 addrspace(1)* %in1) {
   %a = load i64, i64 addrspace(1)* %in0
   %b = load i64, i64 addrspace(1)* %in1
   %result = xor i64 %a, -1
@@ -153,7 +153,7 @@ define void @vector_not_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i64
 
 ; FUNC-LABEL: {{^}}xor_cf:
 ; SI: s_xor_b64
-define void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) {
+define amdgpu_kernel void @xor_cf(i64 addrspace(1)* %out, i64 addrspace(1)* %in, i64 %a, i64 %b) {
 entry:
   %0 = icmp eq i64 %a, 0
   br i1 %0, label %if, label %else
@@ -178,7 +178,7 @@ endif:
 ; SI-DAG: s_xor_b32 s[[RES_LO:[0-9]+]], s[[LO]], 0x3039
 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_LO]]
 ; SI-DAG: v_mov_b32_e32 v{{[0-9]+}}, s[[RES_HI]]
-define void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) {
   %or = xor i64 %a, 4261135838621753
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -192,7 +192,7 @@ define void @scalar_xor_literal_i64(i64 addrspace(1)* %out, i64 %a) {
 
 ; SI: s_add_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_LO]]
 ; SI: s_addc_u32 s{{[0-9]+}}, s{{[0-9]+}}, s[[K_HI]]
-define void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
+define amdgpu_kernel void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i64 %b) {
   %or = xor i64 %a, 4261135838621753
   store i64 %or, i64 addrspace(1)* %out
 
@@ -211,7 +211,7 @@ define void @scalar_xor_literal_multi_use_i64(i64 addrspace(1)* %out, i64 %a, i6
 ; SI: v_mov_b32_e32 v[[VHI:[0-9]+]], s[[VAL_HI]]
 ; SI-NOT: xor_b32
 ; SI: buffer_store_dwordx2 v{{\[}}[[VLO]]:[[VHI]]{{\]}}
-define void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
   %or = xor i64 %a, 63
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -220,7 +220,7 @@ define void @scalar_xor_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
 ; FUNC-LABEL: {{^}}scalar_xor_neg_inline_imm_i64:
 ; SI: s_load_dwordx2 [[VAL:s\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, {{0xb|0x2c}}
 ; SI: s_xor_b64 [[VAL]], [[VAL]], -8
-define void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
+define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
   %or = xor i64 %a, -8
   store i64 %or, i64 addrspace(1)* %out
   ret void
@@ -231,7 +231,7 @@ define void @scalar_xor_neg_inline_imm_i64(i64 addrspace(1)* %out, i64 %a) {
 ; SI: v_xor_b32_e32 {{v[0-9]+}}, -8, v[[LO_VREG]]
 ; SI: v_xor_b32_e32 {{v[0-9]+}}, -1, {{.*}}
 ; SI: s_endpgm
-define void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 8
   %or = xor i64 %loada, -8
   store i64 %or, i64 addrspace(1)* %out
@@ -243,7 +243,7 @@ define void @vector_xor_i64_neg_inline_imm(i64 addrspace(1)* %out, i64 addrspace
 ; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0xdf77987f, v[[LO_VREG]]
 ; SI-DAG: v_xor_b32_e32 {{v[0-9]+}}, 0x146f, v[[HI_VREG]]
 ; SI: s_endpgm
-define void @vector_xor_literal_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
+define amdgpu_kernel void @vector_xor_literal_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %a, i64 addrspace(1)* %b) {
   %loada = load i64, i64 addrspace(1)* %a, align 8
   %or = xor i64 %loada, 22470723082367
   store i64 %or, i64 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/zero_extend.ll b/test/CodeGen/AMDGPU/zero_extend.ll
index 57209961760588c9a687d346600f2925890c1735..f256d89f0cb7263182b18802fcb7505b5944ac47 100644
--- a/test/CodeGen/AMDGPU/zero_extend.ll
+++ b/test/CodeGen/AMDGPU/zero_extend.ll
@@ -9,7 +9,7 @@
 ; SI: {{^}}s_mad_zext_i32_to_i64:
 ; SI: v_mov_b32_e32 v[[V_ZERO:[0-9]]], 0{{$}}
 ; SI: buffer_store_dwordx2 v[0:[[V_ZERO]]{{\]}}
-define void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 {
+define amdgpu_kernel void @s_mad_zext_i32_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b, i32 %c) #0 {
 entry:
   %tmp0 = mul i32 %a, %b
   %tmp1 = add i32 %tmp0, %c
@@ -20,7 +20,7 @@ entry:
 
 ; SI-LABEL: {{^}}s_cmp_zext_i1_to_i32
 ; SI: v_cndmask_b32
-define void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_cmp_zext_i1_to_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 entry:
   %tmp0 = icmp eq i32 %a, %b
   %tmp1 = zext i1 %tmp0 to i32
@@ -29,7 +29,7 @@ entry:
 }
 
 ; SI-LABEL: {{^}}s_arg_zext_i1_to_i64:
-define void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 {
+define amdgpu_kernel void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 {
   %ext = zext i1 %arg to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
   ret void
@@ -39,7 +39,7 @@ define void @s_arg_zext_i1_to_i64(i64 addrspace(1)* %out, i1 zeroext %arg) #0 {
 ; SI: s_mov_b32 s{{[0-9]+}}, 0
 ; SI: v_cmp_eq_u32
 ; SI: v_cndmask_b32
-define void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
+define amdgpu_kernel void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
   %cmp = icmp eq i32 %a, %b
   %ext = zext i1 %cmp to i64
   store i64 %ext, i64 addrspace(1)* %out, align 8
@@ -49,7 +49,7 @@ define void @s_cmp_zext_i1_to_i64(i64 addrspace(1)* %out, i32 %a, i32 %b) #0 {
 ; SI-LABEL: {{^}}s_cmp_zext_i1_to_i16
 ; SI: v_cndmask_b32_e64 [[RESULT:v[0-9]+]], 0, 1, vcc
 ; SI: buffer_store_short [[RESULT]]
-define void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
+define amdgpu_kernel void @s_cmp_zext_i1_to_i16(i16 addrspace(1)* %out, i16 zeroext %a, i16 zeroext %b) #0 {
   %tmp0 = icmp eq i16 %a, %b
   %tmp1 = zext i1 %tmp0 to i16
   store i16 %tmp1, i16 addrspace(1)* %out
diff --git a/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll b/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
index 842c30b40df2f08f8f2fad6f11920bd872beeb0c..a902234898cd039743202f0cd2982b8dbd119d3b 100644
--- a/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
+++ b/test/CodeGen/AMDGPU/zext-i64-bit-operand.ll
@@ -11,7 +11,7 @@
 ; GCN-NOT: v[[HI]]
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+define amdgpu_kernel void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
   %ld.64 = load volatile i64, i64 addrspace(1)* %in0
   %ld.32 = load volatile i32, i32 addrspace(1)* %in1
   %ext = zext i32 %ld.32 to i64
@@ -31,7 +31,7 @@ define void @zext_or_operand_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0,
 ; GCN-NOT: _or_
 ; GCN-NOT: v_mov_b32_e32 v{{[0-9]+}}, 0
 ; GCN: buffer_store_dwordx2 v{{\[}}[[LO]]:[[HI]]{{\]}}
-define void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
+define amdgpu_kernel void @zext_or_operand_commute_i64(i64 addrspace(1)* %out, i64 addrspace(1)* %in0, i32 addrspace(1)* %in1) {
   %ld.64 = load volatile i64, i64 addrspace(1)* %in0
   %ld.32 = load volatile i32, i32 addrspace(1)* %in1
   %ext = zext i32 %ld.32 to i64
diff --git a/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll b/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll
index 52cc37e2408453ee85d37deac39741613ccb20ef..b8f2980be7502bd2577885b78bdf66954e233c72 100644
--- a/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll
+++ b/test/CodeGen/ARM/2007-05-22-tailmerge-3.ll
@@ -12,11 +12,11 @@
 ; CHECK: bl _quux
 ; CHECK-NOT: bl _quux
 
-; NOMERGE: bl _baz
-; NOMERGE: bl _baz
+; NOMERGE-DAG: bl _baz
+; NOMERGE-DAG: bl _baz
 
-; NOMERGE: bl _quux
-; NOMERGE: bl _quux
+; NOMERGE-DAG: bl _quux
+; NOMERGE-DAG: bl _quux
 
 ; ModuleID = 'tail.c'
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
diff --git a/test/CodeGen/ARM/2009-05-18-InlineAsmMem.ll b/test/CodeGen/ARM/2009-05-18-InlineAsmMem.ll
index 5d59fc64d92220a4cf40fd1d7eccb712527da2dd..e5c2fb4d67a1b852cc338e9a2c17d0e8b63b2ff2 100644
--- a/test/CodeGen/ARM/2009-05-18-InlineAsmMem.ll
+++ b/test/CodeGen/ARM/2009-05-18-InlineAsmMem.ll
@@ -1,5 +1,4 @@
 ; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
-; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s
 ; PR4091
 
 define void @foo(i32 %i, i32* %p) nounwind {
diff --git a/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll b/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
index 4a1341c4d6e7106bce7c559cd40a29984892e1ec..2a5af6199a345cb8286bc67a00f3afa9e2a5060d 100644
--- a/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
+++ b/test/CodeGen/ARM/2012-10-04-AAPCS-byval-align8.ll
@@ -12,13 +12,14 @@ define void @test_byval_8_bytes_alignment(i32 %i, ...) {
 entry:
 ; CHECK: sub       sp, sp, #12
 ; CHECK: sub       sp, sp, #4
-; CHECK: stmib     sp, {r1, r2, r3}
+; CHECK: add       r0, sp, #4
+; CHECK: stm       sp, {r0, r1, r2, r3}
   %g = alloca i8*
   %g1 = bitcast i8** %g to i8*
   call void @llvm.va_start(i8* %g1)
 
 ; CHECK: add	[[REG:(r[0-9]+)|(lr)]], {{(r[0-9]+)|(lr)}}, #7
-; CHECK: bfc	[[REG]], #0, #3
+; CHECK: bic	[[REG]], [[REG]], #7
   %0 = va_arg i8** %g, double
   call void @llvm.va_end(i8* %g1)
 
diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
index ab2ddf88ff390b36b6a1ec5aae8a255b3e0e799e..66d9033a6d7cba987b1827ad3086534b51cf7f7e 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir
@@ -9,7 +9,21 @@
   define void @test_add_s16() { ret void }
   define void @test_add_s32() { ret void }
 
+  define void @test_fadd_s32() #0 { ret void }
+  define void @test_fadd_s64() #0 { ret void }
+
   define void @test_load_from_stack() { ret void }
+  define void @test_load_f32() #0 { ret void }
+  define void @test_load_f64() #0 { ret void }
+
+  define void @test_stores() #0 { ret void }
+
+  define void @test_gep() { ret void }
+  define void @test_constants() { ret void }
+
+  define void @test_soft_fp_double() #0 { ret void }
+
+  attributes #0 = { "target-features"="+vfp2,-neonfp" }
 ...
 ---
 name:            test_zext_s1
@@ -217,6 +231,72 @@ body:             |
     ; CHECK: BX_RET 14, _, implicit %r0
 ...
 ---
+name:            test_fadd_s32
+# CHECK-LABEL: name: test_fadd_s32
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: fprb }
+  - { id: 1, class: fprb }
+  - { id: 2, class: fprb }
+# CHECK: id: 0, class: spr
+# CHECK: id: 1, class: spr
+# CHECK: id: 2, class: spr
+body:             |
+  bb.0:
+    liveins: %s0, %s1
+
+    %0(s32) = COPY %s0
+    ; CHECK: [[VREGX:%[0-9]+]] = COPY %s0
+
+    %1(s32) = COPY %s1
+    ; CHECK: [[VREGY:%[0-9]+]] = COPY %s1
+
+    %2(s32) = G_FADD %0, %1
+    ; CHECK: [[VREGSUM:%[0-9]+]] = VADDS [[VREGX]], [[VREGY]], 14, _
+
+    %s0 = COPY %2(s32)
+    ; CHECK: %s0 = COPY [[VREGSUM]]
+
+    BX_RET 14, _, implicit %s0
+    ; CHECK: BX_RET 14, _, implicit %s0
+...
+---
+name:            test_fadd_s64
+# CHECK-LABEL: name: test_fadd_s64
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: fprb }
+  - { id: 1, class: fprb }
+  - { id: 2, class: fprb }
+# CHECK: id: 0, class: dpr
+# CHECK: id: 1, class: dpr
+# CHECK: id: 2, class: dpr
+body:             |
+  bb.0:
+    liveins: %d0, %d1
+
+    %0(s64) = COPY %d0
+    ; CHECK: [[VREGX:%[0-9]+]] = COPY %d0
+
+    %1(s64) = COPY %d1
+    ; CHECK: [[VREGY:%[0-9]+]] = COPY %d1
+
+    %2(s64) = G_FADD %0, %1
+    ; CHECK: [[VREGSUM:%[0-9]+]] = VADDD [[VREGX]], [[VREGY]], 14, _
+
+    %d0 = COPY %2(s64)
+    ; CHECK: %d0 = COPY [[VREGSUM]]
+
+    BX_RET 14, _, implicit %d0
+    ; CHECK: BX_RET 14, _, implicit %d0
+...
+---
 name:            test_load_from_stack
 # CHECK-LABEL: name: test_load_from_stack
 legalized:       true
@@ -245,15 +325,213 @@ body:             |
     %0(p0) = G_FRAME_INDEX %fixed-stack.2
     ; CHECK: [[FI32VREG:%[0-9]+]] = ADDri %fixed-stack.[[FI32]], 0, 14, _, _
 
-    %1(s32) = G_LOAD %0(p0)
-    ; CHECK: {{%[0-9]+}} = LDRi12 [[FI32VREG]], 0, 14, _
+    %1(s32) = G_LOAD %0(p0) :: (load 4)
+    ; CHECK: [[LD32VREG:%[0-9]+]] = LDRi12 [[FI32VREG]], 0, 14, _
+
+    %r0 = COPY %1
+    ; CHECK: %r0 = COPY [[LD32VREG]]
 
     %2(p0) = G_FRAME_INDEX %fixed-stack.0
     ; CHECK: [[FI1VREG:%[0-9]+]] = ADDri %fixed-stack.[[FI1]], 0, 14, _, _
 
-    %3(s1) = G_LOAD %2(p0)
-    ; CHECK: {{%[0-9]+}} = LDRBi12 [[FI1VREG]], 0, 14, _
+    %3(s1) = G_LOAD %2(p0) :: (load 1)
+    ; CHECK: [[LD1VREG:%[0-9]+]] = LDRBi12 [[FI1VREG]], 0, 14, _
+
+    %r0 = COPY %3
+    ; CHECK: %r0 = COPY [[LD1VREG]]
 
     BX_RET 14, _
     ; CHECK: BX_RET 14, _
 ...
+---
+name:            test_load_f32
+# CHECK-LABEL: name: test_load_f32
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: fprb }
+# CHECK-DAG: id: [[P:[0-9]+]], class: gpr
+# CHECK-DAG: id: [[V:[0-9]+]], class: spr
+body:             |
+  bb.0:
+    liveins: %r0, %r1, %r2, %r3
+
+    %0(p0) = COPY %r0
+
+    %1(s32) = G_LOAD %0(p0) :: (load 4)
+    ; CHECK: %[[V]] = VLDRS %[[P]], 0, 14, _
+
+    %s0 = COPY %1
+    ; CHECK: %s0 = COPY %[[V]]
+
+    BX_RET 14, _, implicit %s0
+    ; CHECK: BX_RET 14, _, implicit %s0
+...
+---
+name:            test_load_f64
+# CHECK-LABEL: name: test_load_f64
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: fprb }
+# CHECK-DAG: id: [[P:[0-9]+]], class: gpr
+# CHECK-DAG: id: [[V:[0-9]+]], class: dpr
+body:             |
+  bb.0:
+    liveins: %r0, %r1, %r2, %r3
+
+    %0(p0) = COPY %r0
+
+    %1(s64) = G_LOAD %0(p0) :: (load 8)
+    ; CHECK: %[[V]] = VLDRD %[[P]], 0, 14, _
+
+    %d0 = COPY %1
+    ; CHECK: %d0 = COPY %[[V]]
+
+    BX_RET 14, _, implicit %d0
+    ; CHECK: BX_RET 14, _, implicit %d0
+...
+---
+name:            test_stores
+# CHECK-LABEL: name: test_stores
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+  - { id: 3, class: gprb }
+  - { id: 4, class: fprb }
+  - { id: 5, class: fprb }
+# CHECK: id: [[P:[0-9]+]], class: gpr
+# CHECK: id: [[I8:[0-9]+]], class: gpr
+# CHECK: id: [[I16:[0-9]+]], class: gpr
+# CHECK: id: [[I32:[0-9]+]], class: gpr
+# CHECK: id: [[F32:[0-9]+]], class: spr
+# CHECK: id: [[F64:[0-9]+]], class: dpr
+body:             |
+  bb.0:
+    liveins: %r0, %r1, %r2, %r3
+
+    %0(p0) = COPY %r0
+    %1(s8) = COPY %r3
+    %2(s16) = COPY %r2
+    %3(s32) = COPY %r1
+    %4(s32) = COPY %s0
+    %5(s64) = COPY %d2
+
+    G_STORE %1(s8), %0(p0) :: (store 1)
+    ; CHECK: STRBi12 %[[I8]], %[[P]], 0, 14, _
+
+    G_STORE %2(s16), %0(p0) :: (store 2)
+    ; CHECK: STRH %[[I16]], %[[P]], _, 0, 14, _
+
+    G_STORE %3(s32), %0(p0) :: (store 4)
+    ; CHECK: STRi12 %[[I32]], %[[P]], 0, 14, _
+
+    G_STORE %4(s32), %0(p0) :: (store 4)
+    ; CHECK: VSTRS %[[F32]], %[[P]], 0, 14, _
+
+    G_STORE %5(s64), %0(p0) :: (store 8)
+    ; CHECK: VSTRD %[[F64]], %[[P]], 0, 14, _
+
+    BX_RET 14, _
+...
+---
+name:            test_gep
+# CHECK-LABEL: name: test_gep
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: gprb }
+# CHECK: id: [[PTR:[0-9]+]], class: gpr
+# CHECK: id: [[OFF:[0-9]+]], class: gpr
+# CHECK: id: [[GEP:[0-9]+]], class: gpr
+body:             |
+  bb.0:
+    liveins: %r0, %r1
+
+    %0(p0) = COPY %r0
+    %1(s32) = COPY %r1
+
+    %2(p0) = G_GEP %0, %1(s32)
+    ; CHECK: %[[GEP]] = ADDrr %[[PTR]], %[[OFF]], 14, _, _
+
+    %r0 = COPY %2(p0)
+    BX_RET 14, _, implicit %r0
+...
+---
+name:            test_constants
+# CHECK-LABEL: name: test_constants
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+# CHECK: id: [[C:[0-9]+]], class: gpr
+body:             |
+  bb.0:
+    %0(s32) = G_CONSTANT 42
+    ; CHECK: %[[C]] = MOVi 42, 14, _, _
+
+    %r0 = COPY %0(s32)
+    BX_RET 14, _, implicit %r0
+...
+---
+name:            test_soft_fp_double
+# CHECK-LABEL: name: test_soft_fp_double
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK: selected: true
+registers:
+  - { id: 0, class: gprb }
+  - { id: 1, class: gprb }
+  - { id: 2, class: fprb }
+  - { id: 3, class: gprb }
+  - { id: 4, class: gprb }
+# CHECK-DAG: id: {{[0-9]+}}, class: gpr
+# CHECK-DAG: id: {{[0-9]+}}, class: gpr
+# CHECK-DAG: id: {{[0-9]+}}, class: gpr
+# CHECK-DAG: id: {{[0-9]+}}, class: gpr
+# CHECK-DAG: id: [[DREG:[0-9]+]], class: dpr
+body:             |
+  bb.0:
+    liveins: %r0, %r1, %r2, %r3
+
+    %0(s32) = COPY %r2
+    ; CHECK: [[IN1:%[0-9]+]] = COPY %r2
+
+    %1(s32) = COPY %r3
+    ; CHECK: [[IN2:%[0-9]+]] = COPY %r3
+
+    %2(s64) = G_SEQUENCE %0(s32), 0, %1(s32), 1
+    ; CHECK: %[[DREG]] = VMOVDRR [[IN1]], [[IN2]]
+
+    %3(s32) = G_EXTRACT %2(s64), 0
+    %4(s32) = G_EXTRACT %2(s64), 32
+    ; CHECK: [[OUT1:%[0-9]+]] = VGETLNi32 %[[DREG]], 0
+    ; CHECK: [[OUT2:%[0-9]+]] = VGETLNi32 %[[DREG]], 1
+
+    %r0 = COPY %3
+    ; CHECK: %r0 = COPY [[OUT1]]
+
+    %r1 = COPY %4
+    ; CHECK: %r1 = COPY [[OUT2]]
+
+    BX_RET 14, _, implicit %r0, implicit %r1
+    ; CHECK: BX_RET 14, _, implicit %r0, implicit %r1
+...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
index a20a108d8a92b231236538380d081048acbc2272..a7f5ec33bee3cde982dfe20fc86097ae49a9f8a2 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
+++ b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll
@@ -1,4 +1,5 @@
-; RUN: llc -mtriple arm-unknown -global-isel -stop-after=irtranslator %s -o - | FileCheck %s
+; RUN: llc -mtriple arm-unknown -mattr=+vfp2 -global-isel -stop-after=irtranslator %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=LITTLE
+; RUN: llc -mtriple armeb-unknown -mattr=+vfp2 -global-isel -stop-after=irtranslator %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=BIG
 
 define void @test_void_return() {
 ; CHECK-LABEL: name: test_void_return
@@ -134,3 +135,491 @@ entry:
   %sum = add i8 %p2, %p4
   ret i8 %sum
 }
+
+define i16 @test_ptr_arg(i16* %p) { ; pointer argument; the checks expect it in %r0 as a p0 vreg
+; CHECK-LABEL: name: test_ptr_arg
+; CHECK: liveins: %r0
+; CHECK: [[VREGP:%[0-9]+]](p0) = COPY %r0
+; CHECK: [[VREGV:%[0-9]+]](s16) = G_LOAD [[VREGP]](p0)
+entry:
+  %v = load i16, i16* %p ; expected to become an s16 G_LOAD through the pointer vreg
+  ret i16 %v
+}
+
+define i32* @test_ptr_ret(i32** %p) {
+; Verifies pointer return values and pointer-to-pointer arguments (both appear as p0 in the checks).
+; CHECK-LABEL: name: test_ptr_ret
+; CHECK: liveins: %r0
+; CHECK: [[VREGP:%[0-9]+]](p0) = COPY %r0
+; CHECK: [[VREGV:%[0-9]+]](p0) = G_LOAD [[VREGP]](p0)
+; CHECK: %r0 = COPY [[VREGV]]
+; CHECK: BX_RET 14, _, implicit %r0
+entry:
+  %v = load i32*, i32** %p ; the loaded value is itself a pointer
+  ret i32* %v ; the checks expect the pointer result to be copied into %r0
+}
+
+define i32 @test_ptr_arg_on_stack(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32* %p) { ; %p is the fifth argument; the checks expect it in a 4-byte fixed stack object
+; CHECK-LABEL: name: test_ptr_arg_on_stack
+; CHECK: fixedStack:
+; CHECK: id: [[P:[0-9]+]]{{.*}}offset: 0{{.*}}size: 4
+; CHECK: liveins: %r0, %r1, %r2, %r3
+; CHECK: [[FIP:%[0-9]+]]{{.*}} = G_FRAME_INDEX %fixed-stack.[[P]]
+; CHECK: [[VREGP:%[0-9]+]](p0) = G_LOAD [[FIP]](p0)
+; CHECK: [[VREGV:%[0-9]+]](s32) = G_LOAD [[VREGP]](p0)
+; CHECK: %r0 = COPY [[VREGV]]
+; CHECK: BX_RET 14, _, implicit %r0
+entry:
+  %v = load i32, i32* %p ; the checks expect two G_LOADs, the pointer from the stack slot and then the value
+  ret i32 %v
+}
+
+define arm_aapcscc float @test_float_aapcscc(float %p0, float %p1, float %p2,
+                                             float %p3, float %p4, float %p5) { ; six floats; the checks expect %r0-%r3 live-in and %p4/%p5 in fixed stack objects
+; CHECK-LABEL: name: test_float_aapcscc
+; CHECK: fixedStack:
+; CHECK-DAG: id: [[P4:[0-9]+]]{{.*}}offset: 0{{.*}}size: 4
+; CHECK-DAG: id: [[P5:[0-9]+]]{{.*}}offset: 4{{.*}}size: 4
+; CHECK: liveins: %r0, %r1, %r2, %r3
+; CHECK: [[VREGP1:%[0-9]+]](s32) = COPY %r1
+; CHECK: [[FIP5:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[P5]]
+; CHECK: [[VREGP5:%[0-9]+]](s32) = G_LOAD [[FIP5]](p0)
+; CHECK: [[VREGV:%[0-9]+]](s32) = G_FADD [[VREGP1]], [[VREGP5]]
+; CHECK: %r0 = COPY [[VREGV]]
+; CHECK: BX_RET 14, _, implicit %r0
+entry:
+  %v = fadd float %p1, %p5 ; combines a register argument (%r1) with a stack argument
+  ret float %v ; the checks expect the float result in %r0
+}
+
+define arm_aapcs_vfpcc float @test_float_vfpcc(float %p0, float %p1, float %p2,
+                                               float %p3, float %p4, float %p5,
+                                               float %ridiculous,
+                                               float %number,
+                                               float %of,
+                                               float %parameters,
+                                               float %that,
+                                               float %should,
+                                               float %never,
+                                               float %exist,
+                                               float %in,
+                                               float %practice,
+                                               float %q0, float %q1) { ; the checks list %s0-%s15 as live-ins and place %q0/%q1 in fixed stack objects
+; CHECK-LABEL: name: test_float_vfpcc
+; CHECK: fixedStack:
+; CHECK-DAG: id: [[Q0:[0-9]+]]{{.*}}offset: 0{{.*}}size: 4
+; CHECK-DAG: id: [[Q1:[0-9]+]]{{.*}}offset: 4{{.*}}size: 4
+; CHECK: liveins: %s0, %s1, %s2, %s3, %s4, %s5, %s6, %s7, %s8, %s9, %s10, %s11, %s12, %s13, %s14, %s15
+; CHECK: [[VREGP1:%[0-9]+]](s32) = COPY %s1
+; CHECK: [[FIQ1:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Q1]]
+; CHECK: [[VREGQ1:%[0-9]+]](s32) = G_LOAD [[FIQ1]](p0)
+; CHECK: [[VREGV:%[0-9]+]](s32) = G_FADD [[VREGP1]], [[VREGQ1]]
+; CHECK: %s0 = COPY [[VREGV]]
+; CHECK: BX_RET 14, _, implicit %s0
+entry:
+  %v = fadd float %p1, %q1 ; combines a register argument (%s1) with a stack argument
+  ret float %v ; the checks expect the float result in %s0
+}
+
+define arm_aapcs_vfpcc double @test_double_vfpcc(double %p0, double %p1, double %p2,
+                                                 double %p3, double %p4, double %p5,
+                                                 double %reasonable,
+                                                 double %parameters,
+                                                 double %q0, double %q1) { ; the checks list %d0-%d7 as live-ins and place %q0/%q1 in 8-byte fixed stack objects
+; CHECK-LABEL: name: test_double_vfpcc
+; CHECK: fixedStack:
+; CHECK-DAG: id: [[Q0:[0-9]+]]{{.*}}offset: 0{{.*}}size: 8
+; CHECK-DAG: id: [[Q1:[0-9]+]]{{.*}}offset: 8{{.*}}size: 8
+; CHECK: liveins: %d0, %d1, %d2, %d3, %d4, %d5, %d6, %d7
+; CHECK: [[VREGP1:%[0-9]+]](s64) = COPY %d1
+; CHECK: [[FIQ1:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Q1]]
+; CHECK: [[VREGQ1:%[0-9]+]](s64) = G_LOAD [[FIQ1]](p0)
+; CHECK: [[VREGV:%[0-9]+]](s64) = G_FADD [[VREGP1]], [[VREGQ1]]
+; CHECK: %d0 = COPY [[VREGV]]
+; CHECK: BX_RET 14, _, implicit %d0
+entry:
+  %v = fadd double %p1, %q1 ; combines a register argument (%d1) with a stack argument
+  ret double %v ; the checks expect the double result in %d0
+}
+
+define arm_aapcscc double @test_double_aapcscc(double %p0, double %p1, double %p2,
+                                               double %p3, double %p4, double %p5) { ; the checks expect %p1 in %r2/%r3 and %p2-%p5 in 8-byte fixed stack objects
+; CHECK-LABEL: name: test_double_aapcscc
+; CHECK: fixedStack:
+; CHECK-DAG: id: [[P2:[0-9]+]]{{.*}}offset: 0{{.*}}size: 8
+; CHECK-DAG: id: [[P3:[0-9]+]]{{.*}}offset: 8{{.*}}size: 8
+; CHECK-DAG: id: [[P4:[0-9]+]]{{.*}}offset: 16{{.*}}size: 8
+; CHECK-DAG: id: [[P5:[0-9]+]]{{.*}}offset: 24{{.*}}size: 8
+; CHECK: liveins: %r0, %r1, %r2, %r3
+; CHECK-DAG: [[VREGP1LO:%[0-9]+]](s32) = COPY %r2
+; CHECK-DAG: [[VREGP1HI:%[0-9]+]](s32) = COPY %r3
+; LITTLE: [[VREGP1:%[0-9]+]](s64) = G_SEQUENCE [[VREGP1LO]](s32), 0, [[VREGP1HI]](s32), 32
+; BIG: [[VREGP1:%[0-9]+]](s64) = G_SEQUENCE [[VREGP1HI]](s32), 0, [[VREGP1LO]](s32), 32
+; CHECK: [[FIP5:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[P5]]
+; CHECK: [[VREGP5:%[0-9]+]](s64) = G_LOAD [[FIP5]](p0)
+; CHECK: [[VREGV:%[0-9]+]](s64) = G_FADD [[VREGP1]], [[VREGP5]]
+; LITTLE: [[VREGVLO:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 0
+; LITTLE: [[VREGVHI:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 32
+; BIG: [[VREGVHI:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 0
+; BIG: [[VREGVLO:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 32
+; CHECK-DAG: %r0 = COPY [[VREGVLO]]
+; CHECK-DAG: %r1 = COPY [[VREGVHI]]
+; CHECK: BX_RET 14, _, implicit %r0, implicit %r1
+entry:
+  %v = fadd double %p1, %p5 ; combines a register-pair argument with a stack argument
+  ret double %v ; the checks expect the result split by G_EXTRACT into %r0/%r1, with the half order depending on endianness
+}
+
+define arm_aapcs_vfpcc double @test_double_gap_vfpcc(double %p0, float %filler,
+                                                     double %p1, double %p2,
+                                                     double %p3, double %p4,
+                                                     double %reasonable,
+                                                     double %parameters,
+                                                     double %q0, double %q1) { ; the checks expect %s2 live-in, no %d1, and %p1 copied from %d2
+; CHECK-LABEL: name: test_double_gap_vfpcc
+; CHECK: fixedStack:
+; CHECK-DAG: id: [[Q0:[0-9]+]]{{.*}}offset: 0{{.*}}size: 8
+; CHECK-DAG: id: [[Q1:[0-9]+]]{{.*}}offset: 8{{.*}}size: 8
+; CHECK: liveins: %d0, %d2, %d3, %d4, %d5, %d6, %d7, %s2
+; CHECK: [[VREGP1:%[0-9]+]](s64) = COPY %d2
+; CHECK: [[FIQ1:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[Q1]]
+; CHECK: [[VREGQ1:%[0-9]+]](s64) = G_LOAD [[FIQ1]](p0)
+; CHECK: [[VREGV:%[0-9]+]](s64) = G_FADD [[VREGP1]], [[VREGQ1]]
+; CHECK: %d0 = COPY [[VREGV]]
+; CHECK: BX_RET 14, _, implicit %d0
+entry:
+  %v = fadd double %p1, %q1 ; combines a register argument (%d2) with a stack argument
+  ret double %v ; the checks expect the double result in %d0
+}
+
+define arm_aapcscc double @test_double_gap_aapcscc(float %filler, double %p0,
+                                                   double %p1) { ; the checks list %r0, %r2, %r3 as live-ins (no %r1), with %p0 in %r2/%r3 and %p1 on the stack
+; CHECK-LABEL: name: test_double_gap_aapcscc
+; CHECK: fixedStack:
+; CHECK-DAG: id: [[P1:[0-9]+]]{{.*}}offset: 0{{.*}}size: 8
+; CHECK: liveins: %r0, %r2, %r3
+; CHECK-DAG: [[VREGP0LO:%[0-9]+]](s32) = COPY %r2
+; CHECK-DAG: [[VREGP0HI:%[0-9]+]](s32) = COPY %r3
+; LITTLE: [[VREGP0:%[0-9]+]](s64) = G_SEQUENCE [[VREGP0LO]](s32), 0, [[VREGP0HI]](s32), 32
+; BIG: [[VREGP0:%[0-9]+]](s64) = G_SEQUENCE [[VREGP0HI]](s32), 0, [[VREGP0LO]](s32), 32
+; CHECK: [[FIP1:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[P1]]
+; CHECK: [[VREGP1:%[0-9]+]](s64) = G_LOAD [[FIP1]](p0)
+; CHECK: [[VREGV:%[0-9]+]](s64) = G_FADD [[VREGP0]], [[VREGP1]]
+; LITTLE: [[VREGVLO:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 0
+; LITTLE: [[VREGVHI:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 32
+; BIG: [[VREGVHI:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 0
+; BIG: [[VREGVLO:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 32
+; CHECK-DAG: %r0 = COPY [[VREGVLO]]
+; CHECK-DAG: %r1 = COPY [[VREGVHI]]
+; CHECK: BX_RET 14, _, implicit %r0, implicit %r1
+entry:
+  %v = fadd double %p0, %p1 ; combines a register-pair argument with a stack argument
+  ret double %v ; the checks expect the result split by G_EXTRACT into %r0/%r1
+}
+
+define arm_aapcscc double @test_double_gap2_aapcscc(double %p0, float %filler,
+                                                    double %p1) { ; the checks list %r0, %r1, %r2 as live-ins (no %r3), with %p0 in %r0/%r1 and %p1 on the stack
+; CHECK-LABEL: name: test_double_gap2_aapcscc
+; CHECK: fixedStack:
+; CHECK-DAG: id: [[P1:[0-9]+]]{{.*}}offset: 0{{.*}}size: 8
+; CHECK: liveins: %r0, %r1, %r2
+; CHECK-DAG: [[VREGP0LO:%[0-9]+]](s32) = COPY %r0
+; CHECK-DAG: [[VREGP0HI:%[0-9]+]](s32) = COPY %r1
+; LITTLE: [[VREGP0:%[0-9]+]](s64) = G_SEQUENCE [[VREGP0LO]](s32), 0, [[VREGP0HI]](s32), 32
+; BIG: [[VREGP0:%[0-9]+]](s64) = G_SEQUENCE [[VREGP0HI]](s32), 0, [[VREGP0LO]](s32), 32
+; CHECK: [[FIP1:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[P1]]
+; CHECK: [[VREGP1:%[0-9]+]](s64) = G_LOAD [[FIP1]](p0)
+; CHECK: [[VREGV:%[0-9]+]](s64) = G_FADD [[VREGP0]], [[VREGP1]]
+; LITTLE: [[VREGVLO:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 0
+; LITTLE: [[VREGVHI:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 32
+; BIG: [[VREGVHI:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 0
+; BIG: [[VREGVLO:%[0-9]+]](s32) = G_EXTRACT [[VREGV]](s64), 32
+; CHECK-DAG: %r0 = COPY [[VREGVLO]]
+; CHECK-DAG: %r1 = COPY [[VREGVHI]]
+; CHECK: BX_RET 14, _, implicit %r0, implicit %r1
+entry:
+  %v = fadd double %p0, %p1 ; combines a register-pair argument with a stack argument
+  ret double %v ; the checks expect the result split by G_EXTRACT into %r0/%r1
+}
+
+define arm_aapcscc void @test_indirect_call(void() *%fptr) { ; the function pointer is expected in %r0 as a p0 vreg
+; CHECK-LABEL: name: test_indirect_call
+; CHECK: [[FPTR:%[0-9]+]](p0) = COPY %r0
+; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: BLX [[FPTR]](p0), csr_aapcs, implicit-def %lr, implicit %sp
+; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp
+entry:
+  notail call arm_aapcscc void %fptr() ; call through the pointer; the checks expect a BLX on the vreg bracketed by call-stack adjustments of 0
+  ret void
+}
+
+declare arm_aapcscc void @call_target()
+
+define arm_aapcscc void @test_direct_call() { ; direct call with no arguments and no result
+; CHECK-LABEL: name: test_direct_call
+; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: BLX @call_target, csr_aapcs, implicit-def %lr, implicit %sp
+; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp
+entry:
+  notail call arm_aapcscc void @call_target() ; the checks expect a BLX on the symbol bracketed by call-stack adjustments of 0
+  ret void
+}
+
+declare arm_aapcscc i32* @simple_reg_params_target(i32, i32*)
+
+define arm_aapcscc i32* @test_call_simple_reg_params(i32 *%a, i32 %b) { ; %a arrives in %r0 and %b in %r1 according to the checks
+; CHECK-LABEL: name: test_call_simple_reg_params
+; CHECK-DAG: [[AVREG:%[0-9]+]](p0) = COPY %r0
+; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %r1
+; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK-DAG: %r0 = COPY [[BVREG]]
+; CHECK-DAG: %r1 = COPY [[AVREG]]
+; CHECK: BLX @simple_reg_params_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit-def %r0
+; CHECK: [[RVREG:%[0-9]+]](p0) = COPY %r0
+; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: %r0 = COPY [[RVREG]]
+; CHECK: BX_RET 14, _, implicit %r0
+entry:
+  %r = notail call arm_aapcscc i32 *@simple_reg_params_target(i32 %b, i32 *%a) ; arguments are passed in swapped order, so the checks expect %b in %r0 and %a in %r1
+  ret i32 *%r ; the checks expect the pointer result to be returned in %r0
+}
+
+declare arm_aapcscc i32* @simple_stack_params_target(i32, i32*, i32, i32*, i32, i32*)
+
+define arm_aapcscc i32* @test_call_simple_stack_params(i32 *%a, i32 %b) { ; %a arrives in %r0 and %b in %r1 according to the checks
+; CHECK-LABEL: name: test_call_simple_stack_params
+; CHECK-DAG: [[AVREG:%[0-9]+]](p0) = COPY %r0
+; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %r1
+; CHECK: ADJCALLSTACKDOWN 8, 14, _, implicit-def %sp, implicit %sp
+; CHECK-DAG: %r0 = COPY [[BVREG]]
+; CHECK-DAG: %r1 = COPY [[AVREG]]
+; CHECK-DAG: %r2 = COPY [[BVREG]]
+; CHECK-DAG: %r3 = COPY [[AVREG]]
+; CHECK: [[SP1:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[OFF1:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK: [[FI1:%[0-9]+]](p0) = G_GEP [[SP1]], [[OFF1]](s32)
+; CHECK: G_STORE [[BVREG]](s32), [[FI1]](p0){{.*}}store 4
+; CHECK: [[SP2:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[OFF2:%[0-9]+]](s32) = G_CONSTANT i32 4
+; CHECK: [[FI2:%[0-9]+]](p0) = G_GEP [[SP2]], [[OFF2]](s32)
+; CHECK: G_STORE [[AVREG]](p0), [[FI2]](p0){{.*}}store 4
+; CHECK: BLX @simple_stack_params_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+; CHECK: [[RVREG:%[0-9]+]](p0) = COPY %r0
+; CHECK: ADJCALLSTACKUP 8, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: %r0 = COPY [[RVREG]]
+; CHECK: BX_RET 14, _, implicit %r0
+entry:
+  %r = notail call arm_aapcscc i32 *@simple_stack_params_target(i32 %b, i32 *%a, i32 %b, i32 *%a, i32 %b, i32 *%a) ; six arguments; the checks expect four in %r0-%r3 and two stored at sp+0 and sp+4 (8 bytes of call stack)
+  ret i32 *%r ; the checks expect the pointer result to be returned in %r0
+}
+
+declare arm_aapcscc signext i16 @ext_target(i8 signext, i8 zeroext, i16 signext, i16 zeroext, i8 signext, i8 zeroext, i16 signext, i16 zeroext, i1 zeroext)
+
+define arm_aapcscc signext i16 @test_call_ext_params(i8 %a, i16 %b, i1 %c) { ; %a, %b, %c arrive in %r0, %r1, %r2 as s8, s16, s1 according to the checks
+; CHECK-LABEL: name: test_call_ext_params
+; CHECK-DAG: [[AVREG:%[0-9]+]](s8) = COPY %r0
+; CHECK-DAG: [[BVREG:%[0-9]+]](s16) = COPY %r1
+; CHECK-DAG: [[CVREG:%[0-9]+]](s1) = COPY %r2
+; CHECK: ADJCALLSTACKDOWN 20, 14, _, implicit-def %sp, implicit %sp
+; CHECK: [[SEXTA:%[0-9]+]](s32) = G_SEXT [[AVREG]](s8)
+; CHECK: %r0 = COPY [[SEXTA]]
+; CHECK: [[ZEXTA:%[0-9]+]](s32) = G_ZEXT [[AVREG]](s8)
+; CHECK: %r1 = COPY [[ZEXTA]]
+; CHECK: [[SEXTB:%[0-9]+]](s32) = G_SEXT [[BVREG]](s16)
+; CHECK: %r2 = COPY [[SEXTB]]
+; CHECK: [[ZEXTB:%[0-9]+]](s32) = G_ZEXT [[BVREG]](s16)
+; CHECK: %r3 = COPY [[ZEXTB]]
+; CHECK: [[SP1:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[OFF1:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK: [[FI1:%[0-9]+]](p0) = G_GEP [[SP1]], [[OFF1]](s32)
+; CHECK: [[SEXTA2:%[0-9]+]](s32) = G_SEXT [[AVREG]]
+; CHECK: G_STORE [[SEXTA2]](s32), [[FI1]](p0){{.*}}store 4
+; CHECK: [[SP2:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[OFF2:%[0-9]+]](s32) = G_CONSTANT i32 4
+; CHECK: [[FI2:%[0-9]+]](p0) = G_GEP [[SP2]], [[OFF2]](s32)
+; CHECK: [[ZEXTA2:%[0-9]+]](s32) = G_ZEXT [[AVREG]]
+; CHECK: G_STORE [[ZEXTA2]](s32), [[FI2]](p0){{.*}}store 4
+; CHECK: [[SP3:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[OFF3:%[0-9]+]](s32) = G_CONSTANT i32 8
+; CHECK: [[FI3:%[0-9]+]](p0) = G_GEP [[SP3]], [[OFF3]](s32)
+; CHECK: [[SEXTB2:%[0-9]+]](s32) = G_SEXT [[BVREG]]
+; CHECK: G_STORE [[SEXTB2]](s32), [[FI3]](p0){{.*}}store 4
+; CHECK: [[SP4:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[OFF4:%[0-9]+]](s32) = G_CONSTANT i32 12
+; CHECK: [[FI4:%[0-9]+]](p0) = G_GEP [[SP4]], [[OFF4]](s32)
+; CHECK: [[ZEXTB2:%[0-9]+]](s32) = G_ZEXT [[BVREG]]
+; CHECK: G_STORE [[ZEXTB2]](s32), [[FI4]](p0){{.*}}store 4
+; CHECK: [[SP5:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[OFF5:%[0-9]+]](s32) = G_CONSTANT i32 16
+; CHECK: [[FI5:%[0-9]+]](p0) = G_GEP [[SP5]], [[OFF5]](s32)
+; CHECK: [[ZEXTC:%[0-9]+]](s32) = G_ZEXT [[CVREG]]
+; CHECK: G_STORE [[ZEXTC]](s32), [[FI5]](p0){{.*}}store 4
+; CHECK: BLX @ext_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0
+; CHECK: [[RVREG:%[0-9]+]](s16) = COPY %r0
+; CHECK: ADJCALLSTACKUP 20, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: [[RExtVREG:%[0-9]+]](s32) = G_SEXT [[RVREG]]
+; CHECK: %r0 = COPY [[RExtVREG]]
+; CHECK: BX_RET 14, _, implicit %r0
+entry:
+  %r = notail call arm_aapcscc signext i16 @ext_target(i8 signext %a, i8 zeroext %a, i16 signext %b, i16 zeroext %b, i8 signext %a, i8 zeroext %a, i16 signext %b, i16 zeroext %b, i1 zeroext %c) ; nine arguments; the checks expect each extended to s32, four in %r0-%r3 and five stored at sp+0 through sp+16
+  ret i16 %r ; the checks expect the s16 result to be sign-extended before the copy to %r0
+}
+
+declare arm_aapcs_vfpcc double @vfpcc_fp_target(float, double)
+
+define arm_aapcs_vfpcc double @test_call_vfpcc_fp_params(double %a, float %b) { ; %a arrives in %d0 and %b in %s2 according to the checks
+; CHECK-LABEL: name: test_call_vfpcc_fp_params
+; CHECK-DAG: [[AVREG:%[0-9]+]](s64) = COPY %d0
+; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %s2
+; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK-DAG: %s0 = COPY [[BVREG]]
+; CHECK-DAG: %d1 = COPY [[AVREG]]
+; CHECK: BLX @vfpcc_fp_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %s0, implicit %d1, implicit-def %d0
+; CHECK: [[RVREG:%[0-9]+]](s64) = COPY %d0
+; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: %d0 = COPY [[RVREG]]
+; CHECK: BX_RET 14, _, implicit %d0
+entry:
+  %r = notail call arm_aapcs_vfpcc double @vfpcc_fp_target(float %b, double %a) ; arguments are swapped for the callee, so the checks expect %b in %s0 and %a in %d1
+  ret double %r ; the checks expect the double result in %d0
+}
+
+declare arm_aapcscc double @aapcscc_fp_target(float, double, float, double)
+
+define arm_aapcscc double @test_call_aapcs_fp_params(double %a, float %b) { ; %a arrives as two s32 halves in %r0/%r1 and %b in %r2 according to the checks
+; CHECK-LABEL: name: test_call_aapcs_fp_params
+; CHECK-DAG: [[A1:%[0-9]+]](s32) = COPY %r0
+; CHECK-DAG: [[A2:%[0-9]+]](s32) = COPY %r1
+; LITTLE-DAG: [[AVREG:%[0-9]+]](s64) = G_SEQUENCE [[A1]](s32), 0, [[A2]](s32), 32
+; BIG-DAG: [[AVREG:%[0-9]+]](s64) = G_SEQUENCE [[A2]](s32), 0, [[A1]](s32), 32
+; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %r2
+; CHECK: ADJCALLSTACKDOWN 16, 14, _, implicit-def %sp, implicit %sp
+; CHECK-DAG: %r0 = COPY [[BVREG]]
+; CHECK-DAG: [[A1:%[0-9]+]](s32) = G_EXTRACT [[AVREG]](s64), 0
+; CHECK-DAG: [[A2:%[0-9]+]](s32) = G_EXTRACT [[AVREG]](s64), 32
+; LITTLE-DAG: %r2 = COPY [[A1]]
+; LITTLE-DAG: %r3 = COPY [[A2]]
+; BIG-DAG: %r2 = COPY [[A2]]
+; BIG-DAG: %r3 = COPY [[A1]]
+; CHECK: [[SP1:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[OFF1:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK: [[FI1:%[0-9]+]](p0) = G_GEP [[SP1]], [[OFF1]](s32)
+; CHECK: G_STORE [[BVREG]](s32), [[FI1]](p0){{.*}}store 4
+; CHECK: [[SP2:%[0-9]+]](p0) = COPY %sp
+; CHECK: [[OFF2:%[0-9]+]](s32) = G_CONSTANT i32 8
+; CHECK: [[FI2:%[0-9]+]](p0) = G_GEP [[SP2]], [[OFF2]](s32)
+; CHECK: G_STORE [[AVREG]](s64), [[FI2]](p0){{.*}}store 8
+; CHECK: BLX @aapcscc_fp_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1
+; CHECK-DAG: [[R1:%[0-9]+]](s32) = COPY %r0
+; CHECK-DAG: [[R2:%[0-9]+]](s32) = COPY %r1
+; LITTLE: [[RVREG:%[0-9]+]](s64) = G_SEQUENCE [[R1]](s32), 0, [[R2]](s32), 32
+; BIG: [[RVREG:%[0-9]+]](s64) = G_SEQUENCE [[R2]](s32), 0, [[R1]](s32), 32
+; CHECK: ADJCALLSTACKUP 16, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: [[R1:%[0-9]+]](s32) = G_EXTRACT [[RVREG]](s64), 0
+; CHECK: [[R2:%[0-9]+]](s32) = G_EXTRACT [[RVREG]](s64), 32
+; LITTLE-DAG: %r0 = COPY [[R1]]
+; LITTLE-DAG: %r1 = COPY [[R2]]
+; BIG-DAG: %r0 = COPY [[R2]]
+; BIG-DAG: %r1 = COPY [[R1]]
+; CHECK: BX_RET 14, _, implicit %r0, implicit %r1
+entry:
+  %r = notail call arm_aapcscc double @aapcscc_fp_target(float %b, double %a, float %b, double %a) ; the checks expect %b in %r0, the halves of %a in %r2/%r3, then %b stored at sp+0 and %a at sp+8
+  ret double %r ; the checks expect the result rebuilt from %r0/%r1 after the call and split again for the return
+}
+
+declare arm_aapcscc float @different_call_conv_target(float)
+
+define arm_aapcs_vfpcc float @test_call_different_call_conv(float %x) { ; the caller uses arm_aapcs_vfpcc, so the checks expect %x in %s0
+; CHECK-LABEL: name: test_call_different_call_conv
+; CHECK: [[X:%[0-9]+]](s32) = COPY %s0
+; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: %r0 = COPY [[X]]
+; CHECK: BLX @different_call_conv_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit-def %r0
+; CHECK: [[R:%[0-9]+]](s32) = COPY %r0
+; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp
+; CHECK: %s0 = COPY [[R]]
+; CHECK: BX_RET 14, _, implicit %s0
+entry:
+  %r = notail call arm_aapcscc float @different_call_conv_target(float %x) ; the callee uses arm_aapcscc, so the checks expect the argument and result in %r0
+  ret float %r ; the checks expect the result to be copied back to %s0 for the return
+}
+
+define i32 @test_shufflevector_s32_v2s32(i32 %arg) { ; shuffles a one-element vector into a two-element vector
+; CHECK-LABEL: name: test_shufflevector_s32_v2s32
+; CHECK: [[ARG:%[0-9]+]](s32) = COPY %r0
+; CHECK-DAG: [[UNDEF:%[0-9]+]](s32) = IMPLICIT_DEF
+; CHECK-DAG: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK-DAG: [[MASK:%[0-9]+]](<2 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C0]](s32)
+; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](s32), [[UNDEF]], [[MASK]](<2 x s32>)
+; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<2 x s32>)
+  %vec = insertelement <1 x i32> undef, i32 %arg, i32 0 ; the checks expect the <1 x i32> source to appear as the plain s32 argument
+  %shuffle = shufflevector <1 x i32> %vec, <1 x i32> undef, <2 x i32> zeroinitializer ; all-zero mask, expected as a G_MERGE_VALUES of two zero constants
+  %res = extractelement <2 x i32> %shuffle, i32 0
+  ret i32 %res
+}
+
+define i32 @test_shufflevector_v2s32_v3s32(i32 %arg1, i32 %arg2) { ; shuffles a two-element vector into a three-element vector
+; CHECK-LABEL: name: test_shufflevector_v2s32_v3s32
+; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %r0
+; CHECK: [[ARG2:%[0-9]+]](s32) = COPY %r1
+; CHECK-DAG: [[UNDEF:%[0-9]+]](<2 x s32>) = IMPLICIT_DEF
+; CHECK-DAG: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK-DAG: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1
+; CHECK-DAG: [[MASK:%[0-9]+]](<3 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C0]](s32), [[C1]](s32)
+; CHECK-DAG: [[V1:%[0-9]+]](<2 x s32>) = G_INSERT_VECTOR_ELT [[UNDEF]], [[ARG1]](s32), [[C0]](s32)
+; CHECK-DAG: [[V2:%[0-9]+]](<2 x s32>) = G_INSERT_VECTOR_ELT [[V1]], [[ARG2]](s32), [[C1]](s32)
+; CHECK: [[VEC:%[0-9]+]](<3 x s32>) = G_SHUFFLE_VECTOR [[V2]](<2 x s32>), [[UNDEF]], [[MASK]](<3 x s32>)
+; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<3 x s32>)
+  %v1 = insertelement <2 x i32> undef, i32 %arg1, i32 0
+  %v2 = insertelement <2 x i32> %v1, i32 %arg2, i32 1 ; builds the <2 x i32> source from both arguments
+  %shuffle = shufflevector <2 x i32> %v2, <2 x i32> undef, <3 x i32> <i32 1, i32 0, i32 1> ; mask 1,0,1 is expected as a G_MERGE_VALUES of the matching constants
+  %res = extractelement <3 x i32> %shuffle, i32 0
+  ret i32 %res
+}
+
+
+define i32 @test_shufflevector_v2s32_v4s32(i32 %arg1, i32 %arg2) { ; shuffles a two-element vector into a four-element vector
+; CHECK-LABEL: name: test_shufflevector_v2s32_v4s32
+; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %r0
+; CHECK: [[ARG2:%[0-9]+]](s32) = COPY %r1
+; CHECK-DAG: [[UNDEF:%[0-9]+]](<2 x s32>) = IMPLICIT_DEF
+; CHECK-DAG: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK-DAG: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1
+; CHECK-DAG: [[MASK:%[0-9]+]](<4 x s32>) = G_MERGE_VALUES [[C0]](s32), [[C0]](s32), [[C0]](s32), [[C0]](s32)
+; CHECK-DAG: [[V1:%[0-9]+]](<2 x s32>) = G_INSERT_VECTOR_ELT [[UNDEF]], [[ARG1]](s32), [[C0]](s32)
+; CHECK-DAG: [[V2:%[0-9]+]](<2 x s32>) = G_INSERT_VECTOR_ELT [[V1]], [[ARG2]](s32), [[C1]](s32)
+; CHECK: [[VEC:%[0-9]+]](<4 x s32>) = G_SHUFFLE_VECTOR [[V2]](<2 x s32>), [[UNDEF]], [[MASK]](<4 x s32>)
+; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<4 x s32>)
+  %v1 = insertelement <2 x i32> undef, i32 %arg1, i32 0
+  %v2 = insertelement <2 x i32> %v1, i32 %arg2, i32 1 ; builds the <2 x i32> source from both arguments
+  %shuffle = shufflevector <2 x i32> %v2, <2 x i32> undef, <4 x i32> zeroinitializer ; all-zero mask, expected as a G_MERGE_VALUES of four zero constants
+  %res = extractelement <4 x i32> %shuffle, i32 0
+  ret i32 %res
+}
+
+define i32 @test_shufflevector_v4s32_v2s32(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4) { ; shuffles a four-element vector down to a two-element vector
+; CHECK-LABEL: name: test_shufflevector_v4s32_v2s32
+; CHECK: [[ARG1:%[0-9]+]](s32) = COPY %r0
+; CHECK: [[ARG2:%[0-9]+]](s32) = COPY %r1
+; CHECK: [[ARG3:%[0-9]+]](s32) = COPY %r2
+; CHECK: [[ARG4:%[0-9]+]](s32) = COPY %r3
+; CHECK-DAG: [[UNDEF:%[0-9]+]](<4 x s32>) = IMPLICIT_DEF
+; CHECK-DAG: [[C0:%[0-9]+]](s32) = G_CONSTANT i32 0
+; CHECK-DAG: [[C1:%[0-9]+]](s32) = G_CONSTANT i32 1
+; CHECK-DAG: [[C2:%[0-9]+]](s32) = G_CONSTANT i32 2
+; CHECK-DAG: [[C3:%[0-9]+]](s32) = G_CONSTANT i32 3
+; CHECK-DAG: [[MASK:%[0-9]+]](<2 x s32>) = G_MERGE_VALUES [[C1]](s32), [[C3]](s32)
+; CHECK-DAG: [[V1:%[0-9]+]](<4 x s32>) = G_INSERT_VECTOR_ELT [[UNDEF]], [[ARG1]](s32), [[C0]](s32)
+; CHECK-DAG: [[V2:%[0-9]+]](<4 x s32>) = G_INSERT_VECTOR_ELT [[V1]], [[ARG2]](s32), [[C1]](s32)
+; CHECK-DAG: [[V3:%[0-9]+]](<4 x s32>) = G_INSERT_VECTOR_ELT [[V2]], [[ARG3]](s32), [[C2]](s32)
+; CHECK-DAG: [[V4:%[0-9]+]](<4 x s32>) = G_INSERT_VECTOR_ELT [[V3]], [[ARG4]](s32), [[C3]](s32)
+; CHECK: [[VEC:%[0-9]+]](<2 x s32>) = G_SHUFFLE_VECTOR [[V4]](<4 x s32>), [[UNDEF]], [[MASK]](<2 x s32>)
+; CHECK: G_EXTRACT_VECTOR_ELT [[VEC]](<2 x s32>)
+  %v1 = insertelement <4 x i32> undef, i32 %arg1, i32 0
+  %v2 = insertelement <4 x i32> %v1, i32 %arg2, i32 1
+  %v3 = insertelement <4 x i32> %v2, i32 %arg3, i32 2
+  %v4 = insertelement <4 x i32> %v3, i32 %arg4, i32 3 ; builds the <4 x i32> source from all four arguments
+  %shuffle = shufflevector <4 x i32> %v4, <4 x i32> undef, <2 x i32> <i32 1, i32 3> ; mask 1,3 is expected as a G_MERGE_VALUES of the constants 1 and 3
+  %res = extractelement <2 x i32> %shuffle, i32 0
+  ret i32 %res
+}
diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll b/test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7d021fdb43dd94b1840b8ee57724dae625f3758c
--- /dev/null
+++ b/test/CodeGen/ARM/GlobalISel/arm-isel-fp.ll
@@ -0,0 +1,51 @@
+; RUN: llc -mtriple arm-linux-gnueabihf -mattr=+vfp2 -float-abi=hard -global-isel %s -o - | FileCheck %s -check-prefix CHECK -check-prefix HARD
+; RUN: llc -mtriple arm-linux-gnueabi -mattr=+vfp2,+soft-float -float-abi=soft -global-isel %s -o - | FileCheck %s -check-prefix CHECK -check-prefix SOFT-AEABI
+; RUN: llc -mtriple arm-linux-gnu -mattr=+vfp2,+soft-float -float-abi=soft -global-isel %s -o - | FileCheck %s -check-prefix CHECK -check-prefix SOFT-DEFAULT
+
+define arm_aapcscc float @test_frem_float(float %x, float %y) {
+; CHECK-LABEL: test_frem_float:
+; CHECK: blx fmodf
+  %r = frem float %x, %y ; expected to become a call to fmodf under every run configuration
+  ret float %r
+}
+
+define arm_aapcscc double @test_frem_double(double %x, double %y) {
+; CHECK-LABEL: test_frem_double:
+; CHECK: blx fmod
+  %r = frem double %x, %y ; expected to become a call to fmod under every run configuration
+  ret double %r
+}
+
+declare float @llvm.pow.f32(float %x, float %y)
+define arm_aapcscc float @test_fpow_float(float %x, float %y) {
+; CHECK-LABEL: test_fpow_float:
+; CHECK: blx powf
+  %r = call float @llvm.pow.f32(float %x, float %y) ; the intrinsic is expected to become a call to powf under every run configuration
+  ret float %r
+}
+
+declare double @llvm.pow.f64(double %x, double %y)
+define arm_aapcscc double @test_fpow_double(double %x, double %y) {
+; CHECK-LABEL: test_fpow_double:
+; CHECK: blx pow
+  %r = call double @llvm.pow.f64(double %x, double %y) ; the intrinsic is expected to become a call to pow under every run configuration
+  ret double %r
+}
+
+define arm_aapcscc float @test_add_float(float %x, float %y) {
+; CHECK-LABEL: test_add_float:
+; HARD: vadd.f32
+; SOFT-AEABI: blx __aeabi_fadd
+; SOFT-DEFAULT: blx __addsf3
+  %r = fadd float %x, %y ; the hard-float run expects vadd.f32; the two soft-float runs expect a call to __aeabi_fadd or __addsf3
+  ret float %r
+}
+
+define arm_aapcscc double @test_add_double(double %x, double %y) {
+; CHECK-LABEL: test_add_double:
+; HARD: vadd.f64
+; SOFT-AEABI: blx __aeabi_dadd
+; SOFT-DEFAULT: blx __adddf3
+  %r = fadd double %x, %y ; the hard-float run expects vadd.f64; the two soft-float runs expect a call to __aeabi_dadd or __adddf3
+  ret double %r
+}
diff --git a/test/CodeGen/ARM/GlobalISel/arm-isel.ll b/test/CodeGen/ARM/GlobalISel/arm-isel.ll
index f5b706e7d21ae1674057a5f95d36ecceff34fd0b..236dcbeb84c524652e911262cd6b28d07d80d10f 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-isel.ll
+++ b/test/CodeGen/ARM/GlobalISel/arm-isel.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple arm-unknown -global-isel %s -o - | FileCheck %s
+; RUN: llc -mtriple arm-unknown -mattr=+vfp2 -global-isel %s -o - | FileCheck %s
 
 define void @test_void_return() {
 ; CHECK-LABEL: test_void_return:
@@ -110,3 +110,75 @@ entry:
   %sum = add i8 %p2, %p4
   ret i8 %sum
 }
+
+define i32 @test_ptr_arg_in_reg(i32* %p) { ; pointer argument; the checks expect it in r0
+; CHECK-LABEL: test_ptr_arg_in_reg:
+; CHECK: ldr r0, [r0]
+; CHECK: bx lr
+entry:
+  %v = load i32, i32* %p ; expected to become a single ldr through r0
+  ret i32 %v
+}
+
+define i32 @test_ptr_arg_on_stack(i32 %f0, i32 %f1, i32 %f2, i32 %f3, i32* %p) { ; %p is the fifth argument; the checks expect it to be read via sp
+; CHECK-LABEL: test_ptr_arg_on_stack:
+; CHECK: mov r0, sp
+; CHECK: ldr r0, [r0]
+; CHECK: ldr r0, [r0]
+; CHECK: bx lr
+entry:
+  %v = load i32, i32* %p ; the checks expect two ldr instructions, the pointer from the stack and then the value
+  ret i32 %v
+}
+
+define i8* @test_ptr_ret(i8** %p) { ; pointer-to-pointer argument with a pointer return value
+; CHECK-LABEL: test_ptr_ret:
+; CHECK: ldr r0, [r0]
+; CHECK: bx lr
+entry:
+  %v = load i8*, i8** %p ; the loaded value is itself a pointer; the checks expect a single ldr into r0
+  ret i8* %v
+}
+
+define arm_aapcs_vfpcc float @test_float_hard(float %f0, float %f1) { ; arm_aapcs_vfpcc; the checks expect the operands in s0 and s1
+; CHECK-LABEL: test_float_hard:
+; CHECK: vadd.f32 s0, s0, s1
+; CHECK: bx lr
+entry:
+  %v = fadd float %f0, %f1 ; expected to become one vadd.f32 that writes s0
+  ret float %v
+}
+
+define arm_aapcscc float @test_float_softfp(float %f0, float %f1) { ; arm_aapcscc; the checks expect the operands to be moved from r0 and r1 into s registers
+; CHECK-LABEL: test_float_softfp:
+; CHECK-DAG: vmov [[F0:s[0-9]+]], r0
+; CHECK-DAG: vmov [[F1:s[0-9]+]], r1
+; CHECK: vadd.f32 [[FV:s[0-9]+]], [[F0]], [[F1]]
+; CHECK: vmov r0, [[FV]]
+; CHECK: bx lr
+entry:
+  %v = fadd float %f0, %f1 ; expected to become a vadd.f32 on the moved-in s registers
+  ret float %v ; the checks expect the result to be moved back into r0
+}
+
+define arm_aapcs_vfpcc double @test_double_hard(double %f0, double %f1) { ; arm_aapcs_vfpcc; the checks expect the operands in d0 and d1
+; CHECK-LABEL: test_double_hard:
+; CHECK: vadd.f64 d0, d0, d1
+; CHECK: bx lr
+entry:
+  %v = fadd double %f0, %f1 ; expected to become one vadd.f64 that writes d0
+  ret double %v
+}
+
+define arm_aapcscc double @test_double_softfp(double %f0, double %f1) { ; arm_aapcscc; the checks expect the operands to be moved from r0/r1 and r2/r3 into d registers
+; CHECK-LABEL: test_double_softfp:
+; CHECK-DAG: vmov [[F0:d[0-9]+]], r0, r1
+; CHECK-DAG: vmov [[F1:d[0-9]+]], r2, r3
+; CHECK: vadd.f64 [[FV:d[0-9]+]], [[F0]], [[F1]]
+; CHECK: vmov.32 r0, [[FV]][0]
+; CHECK: vmov.32 r1, [[FV]][1]
+; CHECK: bx lr
+entry:
+  %v = fadd double %f0, %f1 ; expected to become a vadd.f64 on the moved-in d registers
+  ret double %v ; the checks expect the two 32-bit lanes of the result to be moved into r0 and r1
+}
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir b/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir
new file mode 100644
index 0000000000000000000000000000000000000000..d154b4887c195ea74f3ef59105a984f6988e0f44
--- /dev/null
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalize-fp.mir
@@ -0,0 +1,282 @@
+# RUN: llc -mtriple arm-linux-gnueabihf -mattr=+vfp2 -float-abi=hard -global-isel -run-pass=legalizer %s -o - | FileCheck %s -check-prefix CHECK -check-prefix HARD
+# RUN: llc -mtriple arm-linux-gnueabi -mattr=+vfp2,+soft-float -float-abi=soft -global-isel -run-pass=legalizer %s -o - | FileCheck %s -check-prefix CHECK -check-prefix SOFT -check-prefix SOFT-AEABI
+# RUN: llc -mtriple arm-linux-gnu -mattr=+soft-float -float-abi=soft -global-isel -run-pass=legalizer %s -o - | FileCheck %s  -check-prefix CHECK -check-prefix SOFT -check-prefix SOFT-DEFAULT
+--- |
+  define void @test_frem_float() { ret void }
+  define void @test_frem_double() { ret void }
+
+  define void @test_fpow_float() { ret void }
+  define void @test_fpow_double() { ret void }
+
+  define void @test_fadd_float() { ret void }
+  define void @test_fadd_double() { ret void }
+...
+---
+name:            test_frem_float
+# CHECK-LABEL: name: test_frem_float
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: %r0, %r1
+
+    ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+    ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+    %0(s32) = COPY %r0
+    %1(s32) = COPY %r1
+    ; CHECK: ADJCALLSTACKDOWN
+    ; SOFT-DAG: %r0 = COPY [[X]]
+    ; SOFT-DAG: %r1 = COPY [[Y]]
+    ; HARD-DAG: %s0 = COPY [[X]]
+    ; HARD-DAG: %s1 = COPY [[Y]]
+    ; SOFT: BLX $fmodf, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+    ; HARD: BLX $fmodf, {{.*}}, implicit %s0, implicit %s1, implicit-def %s0
+    ; SOFT: [[R:%[0-9]+]](s32) = COPY %r0
+    ; HARD: [[R:%[0-9]+]](s32) = COPY %s0
+    ; CHECK: ADJCALLSTACKUP
+    %2(s32) = G_FREM %0, %1
+    ; CHECK: %r0 = COPY [[R]]
+    %r0 = COPY %2(s32)
+    BX_RET 14, _, implicit %r0
+...
+---
+name:            test_frem_double
+# CHECK-LABEL: name: test_frem_double
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+  - { id: 6, class: _ }
+  - { id: 7, class: _ }
+  - { id: 8, class: _ }
+body:             |
+  bb.0:
+    liveins: %r0, %r1, %r2, %r3
+
+    ; The inputs may be in the wrong order (depending on the target's
+    ; endianness), but that's orthogonal to what we're trying to test here.
+    ; For soft float, we only need to check that the first value, received
+    ; through R0-R1, ends up in R0-R1 or R1-R0, and the second value, received
+    ; through R2-R3, ends up in R2-R3 or R3-R2, when passed to fmod.
+    ; For hard float, the values need to end up in D0 and D1.
+    ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+    ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+    ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+    ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+    %0(s32) = COPY %r0
+    %1(s32) = COPY %r1
+    %2(s32) = COPY %r2
+    %3(s32) = COPY %r3
+    ; HARD-DAG: [[X:%[0-9]+]](s64) = G_SEQUENCE [[X0]]
+    ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_SEQUENCE [[Y0]]
+    %4(s64) = G_SEQUENCE %0(s32), 0, %1(s32), 32
+    %5(s64) = G_SEQUENCE %2(s32), 0, %3(s32), 32
+    ; CHECK: ADJCALLSTACKDOWN
+    ; SOFT-DAG: %r{{[0-1]}} = COPY [[X0]]
+    ; SOFT-DAG: %r{{[0-1]}} = COPY [[X1]]
+    ; SOFT-DAG: %r{{[2-3]}} = COPY [[Y0]]
+    ; SOFT-DAG: %r{{[2-3]}} = COPY [[Y1]]
+    ; HARD-DAG: %d0 = COPY [[X]]
+    ; HARD-DAG: %d1 = COPY [[Y]]
+    ; SOFT: BLX $fmod, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1
+    ; HARD: BLX $fmod, {{.*}}, implicit %d0, implicit %d1, implicit-def %d0
+    ; CHECK: ADJCALLSTACKUP
+    %6(s64) = G_FREM %4, %5
+    %7(s32) = G_EXTRACT %6(s64), 0
+    %8(s32) = G_EXTRACT %6(s64), 32
+    %r0 = COPY %7(s32)
+    %r1 = COPY %8(s32)
+    BX_RET 14, _, implicit %r0, implicit %r1
+...
+---
+name:            test_fpow_float
+# CHECK-LABEL: name: test_fpow_float
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: %r0, %r1
+
+    ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+    ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+    %0(s32) = COPY %r0
+    %1(s32) = COPY %r1
+    ; CHECK: ADJCALLSTACKDOWN
+    ; SOFT-DAG: %r0 = COPY [[X]]
+    ; SOFT-DAG: %r1 = COPY [[Y]]
+    ; HARD-DAG: %s0 = COPY [[X]]
+    ; HARD-DAG: %s1 = COPY [[Y]]
+    ; SOFT: BLX $powf, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+    ; HARD: BLX $powf, {{.*}}, implicit %s0, implicit %s1, implicit-def %s0
+    ; SOFT: [[R:%[0-9]+]](s32) = COPY %r0
+    ; HARD: [[R:%[0-9]+]](s32) = COPY %s0
+    ; CHECK: ADJCALLSTACKUP
+    %2(s32) = G_FPOW %0, %1
+    ; CHECK: %r0 = COPY [[R]]
+    %r0 = COPY %2(s32)
+    BX_RET 14, _, implicit %r0
+...
+---
+name:            test_fpow_double
+# CHECK-LABEL: name: test_fpow_double
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+  - { id: 6, class: _ }
+  - { id: 7, class: _ }
+  - { id: 8, class: _ }
+body:             |
+  bb.0:
+    liveins: %r0, %r1, %r2, %r3
+
+    ; The inputs may be in the wrong order (depending on the target's
+    ; endianness), but that's orthogonal to what we're trying to test here.
+    ; For soft float, we only need to check that the first value, received
+    ; through R0-R1, ends up in R0-R1 or R1-R0, and the second value, received
+    ; through R2-R3, ends up in R2-R3 or R3-R2, when passed to pow.
+    ; For hard float, the values need to end up in D0 and D1.
+    ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+    ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+    ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+    ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+    %0(s32) = COPY %r0
+    %1(s32) = COPY %r1
+    %2(s32) = COPY %r2
+    %3(s32) = COPY %r3
+    ; HARD-DAG: [[X:%[0-9]+]](s64) = G_SEQUENCE [[X0]]
+    ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_SEQUENCE [[Y0]]
+    %4(s64) = G_SEQUENCE %0(s32), 0, %1(s32), 32
+    %5(s64) = G_SEQUENCE %2(s32), 0, %3(s32), 32
+    ; CHECK: ADJCALLSTACKDOWN
+    ; SOFT-DAG: %r{{[0-1]}} = COPY [[X0]]
+    ; SOFT-DAG: %r{{[0-1]}} = COPY [[X1]]
+    ; SOFT-DAG: %r{{[2-3]}} = COPY [[Y0]]
+    ; SOFT-DAG: %r{{[2-3]}} = COPY [[Y1]]
+    ; HARD-DAG: %d0 = COPY [[X]]
+    ; HARD-DAG: %d1 = COPY [[Y]]
+    ; SOFT: BLX $pow, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1
+    ; HARD: BLX $pow, {{.*}}, implicit %d0, implicit %d1, implicit-def %d0
+    ; CHECK: ADJCALLSTACKUP
+    %6(s64) = G_FPOW %4, %5
+    %7(s32) = G_EXTRACT %6(s64), 0
+    %8(s32) = G_EXTRACT %6(s64), 32
+    %r0 = COPY %7(s32)
+    %r1 = COPY %8(s32)
+    BX_RET 14, _, implicit %r0, implicit %r1
+...
+---
+name:            test_fadd_float
+# CHECK-LABEL: name: test_fadd_float
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: %r0, %r1
+
+    ; CHECK-DAG: [[X:%[0-9]+]](s32) = COPY %r0
+    ; CHECK-DAG: [[Y:%[0-9]+]](s32) = COPY %r1
+    %0(s32) = COPY %r0
+    %1(s32) = COPY %r1
+    ; HARD: [[R:%[0-9]+]](s32) = G_FADD [[X]], [[Y]]
+    ; SOFT: ADJCALLSTACKDOWN
+    ; SOFT-DAG: %r0 = COPY [[X]]
+    ; SOFT-DAG: %r1 = COPY [[Y]]
+    ; SOFT-AEABI: BLX $__aeabi_fadd, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+    ; SOFT-DEFAULT: BLX $__addsf3, {{.*}}, implicit %r0, implicit %r1, implicit-def %r0
+    ; SOFT: [[R:%[0-9]+]](s32) = COPY %r0
+    ; SOFT: ADJCALLSTACKUP
+    %2(s32) = G_FADD %0, %1
+    ; CHECK: %r0 = COPY [[R]]
+    %r0 = COPY %2(s32)
+    BX_RET 14, _, implicit %r0
+...
+---
+name:            test_fadd_double
+# CHECK-LABEL: name: test_fadd_double
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+  - { id: 6, class: _ }
+  - { id: 7, class: _ }
+  - { id: 8, class: _ }
+body:             |
+  bb.0:
+    liveins: %r0, %r1, %r2, %r3
+
+    ; CHECK-DAG: [[X0:%[0-9]+]](s32) = COPY %r0
+    ; CHECK-DAG: [[X1:%[0-9]+]](s32) = COPY %r1
+    ; CHECK-DAG: [[Y0:%[0-9]+]](s32) = COPY %r2
+    ; CHECK-DAG: [[Y1:%[0-9]+]](s32) = COPY %r3
+    %0(s32) = COPY %r0
+    %1(s32) = COPY %r1
+    %2(s32) = COPY %r2
+    %3(s32) = COPY %r3
+    ; HARD-DAG: [[X:%[0-9]+]](s64) = G_SEQUENCE [[X0]]
+    ; HARD-DAG: [[Y:%[0-9]+]](s64) = G_SEQUENCE [[Y0]]
+    %4(s64) = G_SEQUENCE %0(s32), 0, %1(s32), 32
+    %5(s64) = G_SEQUENCE %2(s32), 0, %3(s32), 32
+    ; HARD: [[R:%[0-9]+]](s64) = G_FADD [[X]], [[Y]]
+    ; SOFT: ADJCALLSTACKDOWN
+    ; SOFT-DAG: %r{{[0-1]}} = COPY [[X0]]
+    ; SOFT-DAG: %r{{[0-1]}} = COPY [[X1]]
+    ; SOFT-DAG: %r{{[2-3]}} = COPY [[Y0]]
+    ; SOFT-DAG: %r{{[2-3]}} = COPY [[Y1]]
+    ; SOFT-AEABI: BLX $__aeabi_dadd, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1
+    ; SOFT-DEFAULT: BLX $__adddf3, {{.*}}, implicit %r0, implicit %r1, implicit %r2, implicit %r3, implicit-def %r0, implicit-def %r1
+    ; SOFT: ADJCALLSTACKUP
+    %6(s64) = G_FADD %4, %5
+    ; HARD-DAG: G_EXTRACT [[R]](s64), 0
+    ; HARD-DAG: G_EXTRACT [[R]](s64), 32
+    %7(s32) = G_EXTRACT %6(s64), 0
+    %8(s32) = G_EXTRACT %6(s64), 32
+    %r0 = COPY %7(s32)
+    %r1 = COPY %8(s32)
+    BX_RET 14, _, implicit %r0, implicit %r1
+...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
index d1e8ec3d18824d84c014fac81875cb335a5ad175..cbff7e12fb77c872bbb61c41720e8a7c622bbff0 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir
@@ -8,6 +8,17 @@
   define void @test_add_s32() { ret void }
 
   define void @test_load_from_stack() { ret void }
+  define void @test_legal_loads() #0 { ret void }
+  define void @test_legal_stores() #0 { ret void }
+
+  define void @test_gep() { ret void }
+
+  define void @test_constants() { ret void }
+
+  define void @test_fadd_s32() #0 { ret void }
+  define void @test_fadd_s64() #0 { ret void }
+
+  attributes #0 = { "target-features"="+vfp2" }
 ...
 ---
 name:            test_sext_s8
@@ -151,8 +162,179 @@ body:             |
 
     ; This is legal, so we should find it unchanged in the output
     ; CHECK: [[FIVREG:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[FRAME_INDEX]]
-    ; CHECK: {{%[0-9]+}}(s32) = G_LOAD [[FIVREG]](p0)
+    ; CHECK: {{%[0-9]+}}(s32) = G_LOAD [[FIVREG]](p0) :: (load 4)
     %0(p0) = G_FRAME_INDEX %fixed-stack.2
-    %1(s32) = G_LOAD %0(p0)
+    %1(s32) = G_LOAD %0(p0) :: (load 4)
+    BX_RET 14, _
+...
+---
+name:            test_legal_loads
+# CHECK-LABEL: name: test_legal_loads
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+  - { id: 6, class: _ }
+body:             |
+  bb.0:
+    liveins: %r0, %r1, %r2, %r3
+
+    ; These are all legal, so we should find them unchanged in the output
+    ; CHECK-DAG: {{%[0-9]+}}(s64) = G_LOAD %0
+    ; CHECK-DAG: {{%[0-9]+}}(s32) = G_LOAD %0
+    ; CHECK-DAG: {{%[0-9]+}}(s16) = G_LOAD %0
+    ; CHECK-DAG: {{%[0-9]+}}(s8) = G_LOAD %0
+    ; CHECK-DAG: {{%[0-9]+}}(s1) = G_LOAD %0
+    ; CHECK-DAG: {{%[0-9]+}}(p0) = G_LOAD %0
+    %0(p0) = COPY %r0
+    %1(s32) = G_LOAD %0(p0) :: (load 4)
+    %2(s16) = G_LOAD %0(p0) :: (load 2)
+    %3(s8)  = G_LOAD %0(p0) :: (load 1)
+    %4(s1)  = G_LOAD %0(p0) :: (load 1)
+    %5(p0)  = G_LOAD %0(p0) :: (load 4)
+    %6(s64) = G_LOAD %0(p0) :: (load 8)
+    BX_RET 14, _
+...
+---
+name:            test_legal_stores
+# CHECK-LABEL: name: test_legal_stores
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+  - { id: 6, class: _ }
+body:             |
+  bb.0:
+    liveins: %r0, %r1, %r2, %r3, %r4, %r5, %r6, %d1
+
+    ; These are all legal, so we should find them unchanged in the output
+    ; CHECK-DAG: G_STORE {{%[0-9]+}}(s64), %0(p0)
+    ; CHECK-DAG: G_STORE {{%[0-9]+}}(s32), %0(p0)
+    ; CHECK-DAG: G_STORE {{%[0-9]+}}(s16), %0(p0)
+    ; CHECK-DAG: G_STORE {{%[0-9]+}}(s8), %0(p0)
+    ; CHECK-DAG: G_STORE {{%[0-9]+}}(s1), %0(p0)
+    ; CHECK-DAG: G_STORE {{%[0-9]+}}(p0), %0(p0)
+    %0(p0) = COPY %r0
+    %1(s64) = COPY %d1
+    G_STORE %1(s64), %0(p0) :: (store 8)
+    %2(s32) = COPY %r2
+    G_STORE %2(s32), %0(p0) :: (store 4)
+    %3(s16) = COPY %r3
+    G_STORE %3(s16), %0(p0) :: (store 2)
+    %4(s8) = COPY %r4
+    G_STORE %4(s8), %0(p0) :: (store 1)
+    %5(s1) = COPY %r5
+    G_STORE %5(s1), %0(p0) :: (store 1)
+    %6(p0) = COPY %r6
+    G_STORE %6(p0), %0(p0) :: (store 4)
     BX_RET 14, _
 ...
+---
+name:            test_gep
+# CHECK-LABEL: name: test_gep
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: %r0, %r1
+
+    %0(p0) = COPY %r0
+    %1(s32) = COPY %r1
+
+    ; CHECK: {{%[0-9]+}}(p0) = G_GEP {{%[0-9]+}}, {{%[0-9]+}}(s32)
+    %2(p0) = G_GEP %0, %1(s32)
+
+    %r0 = COPY %2(p0)
+    BX_RET 14, _, implicit %r0
+...
+---
+name:            test_constants
+# CHECK-LABEL: name: test_constants
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+body:             |
+  bb.0:
+    %0(s32) = G_CONSTANT 42
+    ; CHECK: {{%[0-9]+}}(s32) = G_CONSTANT 42
+
+    %r0 = COPY %0(s32)
+    BX_RET 14, _, implicit %r0
+...
+---
+name:            test_fadd_s32
+# CHECK-LABEL: name: test_fadd_s32
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: %r0, %r1
+
+    %0(s32) = COPY %r0
+    %1(s32) = COPY %r1
+    %2(s32) = G_FADD %0, %1
+    ; G_FADD with s32 is legal, so we should find it unchanged in the output
+    ; CHECK: {{%[0-9]+}}(s32) = G_FADD {{%[0-9]+, %[0-9]+}}
+    %r0 = COPY %2(s32)
+    BX_RET 14, _, implicit %r0
+
+...
+---
+name:            test_fadd_s64
+# CHECK-LABEL: name: test_fadd_s64
+legalized:       false
+# CHECK: legalized: true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: %d0, %d1
+
+    %0(s64) = COPY %d0
+    %1(s64) = COPY %d1
+    %2(s64) = G_FADD %0, %1
+    ; G_FADD with s64 is legal, so we should find it unchanged in the output
+    ; CHECK: {{%[0-9]+}}(s64) = G_FADD {{%[0-9]+, %[0-9]+}}
+    %d0 = COPY %2(s64)
+    BX_RET 14, _, implicit %d0
+
+...
diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
index 64c65a41a31d35bda064ea969b68a15ea72c12c7..fbf8d81322f8f3767f4aaf98389aeb3c578a2810 100644
--- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
+++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir
@@ -4,6 +4,22 @@
   define void @test_add_s16() { ret void }
   define void @test_add_s8() { ret void }
   define void @test_add_s1() { ret void }
+
+  define void @test_loads() #0 { ret void }
+  define void @test_stores() #0 { ret void }
+
+  define void @test_stack() { ret void }
+
+  define void @test_gep() { ret void }
+
+  define void @test_constants() { ret void }
+
+  define void @test_fadd_s32() #0 { ret void }
+  define void @test_fadd_s64() #0 { ret void }
+
+  define void @test_soft_fp_s64() #0 { ret void }
+
+  attributes #0 = { "target-features"="+vfp2"}
 ...
 ---
 name:            test_add_s32
@@ -109,3 +125,240 @@ body:             |
     BX_RET 14, _, implicit %r0
 
 ...
+---
+name:            test_loads
+# CHECK-LABEL: name: test_loads
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK: registers:
+# CHECK: - { id: 0, class: gprb }
+# CHECK: - { id: 1, class: gprb }
+# CHECK: - { id: 2, class: gprb }
+# CHECK: - { id: 3, class: gprb }
+# CHECK: - { id: 4, class: gprb }
+# CHECK: - { id: 5, class: gprb }
+# CHECK: - { id: 6, class: fprb }
+
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+  - { id: 6, class: _ }
+body:             |
+  bb.0:
+    liveins: %r0
+    %0(p0) = COPY %r0
+    %6(s64) = G_LOAD %0 :: (load 8)
+    %1(s32) = G_LOAD %0 :: (load 4)
+    %2(s16) = G_LOAD %0 :: (load 2)
+    %3(s8)  = G_LOAD %0 :: (load 1)
+    %4(s1)  = G_LOAD %0 :: (load 1)
+    %5(p0)  = G_LOAD %0 :: (load 4)
+    BX_RET 14, _, implicit %r0
+
+...
+---
+name:            test_stores
+# CHECK-LABEL: name: test_stores
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK: registers:
+# CHECK: - { id: 0, class: gprb }
+# CHECK: - { id: 1, class: gprb }
+# CHECK: - { id: 2, class: gprb }
+# CHECK: - { id: 3, class: gprb }
+# CHECK: - { id: 4, class: gprb }
+# CHECK: - { id: 5, class: gprb }
+# CHECK: - { id: 6, class: fprb }
+
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+  - { id: 5, class: _ }
+  - { id: 6, class: _ }
+body:             |
+  bb.0:
+    liveins: %r0, %r1, %r2, %r3, %r4, %r5, %d6
+    %0(p0) = COPY %r0
+    %1(s32) = COPY %r1
+    G_STORE %1(s32), %0 :: (store 4)
+    %2(s16) = COPY %r2
+    G_STORE %2(s16), %0 :: (store 2)
+    %3(s8) = COPY %r3
+    G_STORE %3(s8), %0 :: (store 1)
+    %4(s1) = COPY %r4
+    G_STORE %4(s1), %0 :: (store 1)
+    %5(p0) = COPY %r5
+    G_STORE %5(p0), %0 :: (store 4)
+    %6(s64) = COPY %d6
+    G_STORE %6(s64), %0 :: (store 8)
+    BX_RET 14, _, implicit %r0
+
+...
+---
+name:            test_stack
+# CHECK-LABEL: name: test_stack
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK: registers:
+# CHECK: - { id: 0, class: gprb }
+# CHECK: - { id: 1, class: gprb }
+# CHECK: - { id: 2, class: gprb }
+# CHECK: - { id: 3, class: gprb }
+# CHECK: - { id: 4, class: gprb }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+fixedStack:
+  - { id: 0, offset: 0, size: 4, alignment: 4, isImmutable: true, isAliased: false }
+body:             |
+  bb.0:
+    %0(p0) = G_FRAME_INDEX %fixed-stack.0
+    %1(s32) = G_LOAD %0(p0) :: (load 4 from %fixed-stack.0, align 0)
+
+    %2(p0) = COPY %sp
+    %3(s32) = G_CONSTANT i32 8
+    %4(p0) = G_GEP %2, %3(s32)
+    G_STORE %1(s32), %4(p0) :: (store 4)
+
+    BX_RET 14, _
+
+...
+---
+name:            test_gep
+# CHECK-LABEL: name: test_gep
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK: registers:
+# CHECK: - { id: 0, class: gprb }
+# CHECK: - { id: 1, class: gprb }
+# CHECK: - { id: 2, class: gprb }
+
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: %r0, %r1
+
+    %0(p0) = COPY %r0
+    %1(s32) = COPY %r1
+    %2(p0) = G_GEP %0, %1(s32)
+    %r0 = COPY %2(p0)
+    BX_RET 14, _, implicit %r0
+...
+---
+name:            test_constants
+# CHECK-LABEL: name: test_constants
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK: registers:
+# CHECK: - { id: 0, class: gprb }
+registers:
+  - { id: 0, class: _ }
+body:             |
+  bb.0:
+    %0(s32) = G_CONSTANT 42
+    %r0 = COPY %0(s32)
+    BX_RET 14, _, implicit %r0
+...
+---
+name:            test_fadd_s32
+# CHECK-LABEL: name: test_fadd_s32
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK: registers:
+# CHECK: - { id: 0, class: fprb }
+# CHECK: - { id: 1, class: fprb }
+# CHECK: - { id: 2, class: fprb }
+
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: %s0, %s1
+
+    %0(s32) = COPY %s0
+    %1(s32) = COPY %s1
+    %2(s32) = G_FADD %0, %1
+    %s0 = COPY %2(s32)
+    BX_RET 14, _, implicit %s0
+
+...
+---
+name:            test_fadd_s64
+# CHECK-LABEL: name: test_fadd_s64
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK: registers:
+# CHECK: - { id: 0, class: fprb }
+# CHECK: - { id: 1, class: fprb }
+# CHECK: - { id: 2, class: fprb }
+
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.0:
+    liveins: %d0, %d1
+
+    %0(s64) = COPY %d0
+    %1(s64) = COPY %d1
+    %2(s64) = G_FADD %0, %1
+    %d0 = COPY %2(s64)
+    BX_RET 14, _, implicit %d0
+
+...
+---
+name:            test_soft_fp_s64
+# CHECK-LABEL: name: test_soft_fp_s64
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK: registers:
+# CHECK: - { id: 0, class: gprb }
+# CHECK: - { id: 1, class: gprb }
+# CHECK: - { id: 2, class: fprb }
+# CHECK: - { id: 3, class: gprb }
+# CHECK: - { id: 4, class: gprb }
+
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+body:             |
+  bb.0:
+    liveins: %r0, %r1
+
+    %0(s32) = COPY %r0
+    %1(s32) = COPY %r1
+    %2(s64) = G_SEQUENCE %0(s32), 0, %1(s32), 32
+    %3(s32) = G_EXTRACT %2(s64), 0
+    %4(s32) = G_EXTRACT %2(s64), 32
+    %r0 = COPY %3(s32)
+    %r1 = COPY %4(s32)
+    BX_RET 14, _, implicit %r0, implicit %r1
+
+...
diff --git a/test/CodeGen/ARM/alias_store.ll b/test/CodeGen/ARM/alias_store.ll
new file mode 100644
index 0000000000000000000000000000000000000000..48f21fc03eca103b5dcc32b64cbe1a76f95ee360
--- /dev/null
+++ b/test/CodeGen/ARM/alias_store.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s -mtriple=arm-linux-gnueabi | FileCheck %s
+
+@X = constant {i8, i8 } { i8 0, i8 0 }
+@XA = alias i8, getelementptr inbounds ({ i8, i8 }, {i8, i8}* @X, i32 0, i32 1)
+
+define void @f(i8** %p) align 2 {
+entry:
+  store i8* @XA, i8 **%p, align 4
+  ret void
+}
+
+; CHECK: f:
+; CHECK: ldr r{{.*}}, [[L:.*]]
+; CHECK: [[L]]:
+; CHECK-NEXT: .long XA
+; CHECK: XA = X+1
diff --git a/test/CodeGen/ARM/alloc-no-stack-realign.ll b/test/CodeGen/ARM/alloc-no-stack-realign.ll
index 7d37c83d74838f91436d3d367ed9b7a6ebb0fe6f..0e077b3aee5a10870818ea75026e7f2d711e1829 100644
--- a/test/CodeGen/ARM/alloc-no-stack-realign.ll
+++ b/test/CodeGen/ARM/alloc-no-stack-realign.ll
@@ -1,5 +1,4 @@
-; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s -check-prefix=NO-REALIGN
-; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s -check-prefix=REALIGN
+; RUN: llc < %s -mtriple=armv7-apple-ios -O0 | FileCheck %s
 
 ; rdar://12713765
 ; When realign-stack is set to false, make sure we are not creating stack
@@ -8,29 +7,31 @@
 
 define void @test1(<16 x float>* noalias sret %agg.result) nounwind ssp "no-realign-stack" {
 entry:
-; NO-REALIGN-LABEL: test1
-; NO-REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]]
-; NO-REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
-; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
-; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #48
-; NO-REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-
-; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1:[0-9]+]], #48
-; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; NO-REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
-; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; NO-REALIGN: mov	r[[R3:[0-9]+]], r[[R1]]
-; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R3]]:128]!
-; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R3]]:128]
-
-; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0:0]], #48
-; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; NO-REALIGN: add r[[R2:[0-9]+]], r[[R0]], #32
-; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; NO-REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]!
-; NO-REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]
+; CHECK-LABEL: test1
+; CHECK:	ldr	r[[R1:[0-9]+]], [pc, r1]
+; CHECK:	add	r[[R2:[0-9]+]], r1, #48
+; CHECK:	vld1.64	 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; CHECK:	mov	r[[R2:[0-9]+]], r[[R1]]
+; CHECK:	vld1.32	 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
+; CHECK:	vld1.64	 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; CHECK:	add	r[[R1:[0-9]+]], r[[R1]], #32
+; CHECK:	vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; CHECK:	mov	r[[R1:[0-9]+]], sp
+; CHECK:	vst1.64	{{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; CHECK:	add	r[[R2:[0-9]+]], r[[R1]], #32
+; CHECK:	vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; CHECK:	vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]!
+; CHECK:	vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; CHECK:	vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
+; CHECK:	vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; CHECK:	vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; CHECK:	vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; CHECK:	add	r[[R1:[0-9]+]], r0, #48
+; CHECK:	vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; CHECK:	add	r[[R1:[0-9]+]], r0, #32
+; CHECK:	vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; CHECK:	vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r0:128]!
+; CHECK:	vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r0:128]
  %retval = alloca <16 x float>, align 16
  %0 = load <16 x float>, <16 x float>* @T3_retval, align 16
  store <16 x float> %0, <16 x float>* %retval
@@ -41,32 +42,33 @@ entry:
 
 define void @test2(<16 x float>* noalias sret %agg.result) nounwind ssp {
 entry:
-; REALIGN-LABEL: test2
-; REALIGN: bfc sp, #0, #6
-; REALIGN: mov r[[R2:[0-9]+]], r[[R1:[0-9]+]]
-; REALIGN: vld1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
-; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #32
-; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; REALIGN: add r[[R2:[0-9]+]], r[[R1]], #48
-; REALIGN: vld1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; CHECK:	ldr	r[[R1:[0-9]+]], [pc, r1]
+; CHECK:	add	r[[R2:[0-9]+]], r[[R1]], #48
+; CHECK:	vld1.64	{{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; CHECK:	mov	r[[R2:[0-9]+]], r[[R1]]
+; CHECK:	vld1.32	{{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
+; CHECK:	vld1.64	{{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; CHECK:	add	r[[R1:[0-9]+]], r[[R1]], #32
+; CHECK:	vld1.64	{{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; CHECK:	mov	r[[R1:[0-9]+]], sp
+; CHECK:	vst1.64	{{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; CHECK:	orr	r[[R2:[0-9]+]], r[[R1]], #32
+; CHECK:	vst1.64	{{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; CHECK:	vld1.32	{{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]!
+; CHECK:	vst1.64	{{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; CHECK:	vld1.32	{{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]!
+; CHECK:	vst1.64	{{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; CHECK:	vld1.64	{{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; CHECK:	vld1.64	{{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
+; CHECK:	add	r[[R1:[0-9]+]], r0, #48
+; CHECK:	vst1.64	{{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; CHECK:	add	r[[R1:[0-9]+]], r0, #32
+; CHECK:	vst1.64	{{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
+; CHECK:	vst1.32	{{{d[0-9]+, d[0-9]+}}}, [r0:128]!
+; CHECK:	vst1.64	{{{d[0-9]+, d[0-9]+}}}, [r0:128]
 
 
-; REALIGN: orr r[[R2:[0-9]+]], r[[R1:[0-9]+]], #48
-; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; REALIGN: orr r[[R2:[0-9]+]], r[[R1]], #32
-; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; REALIGN: orr r[[R2:[0-9]+]], r[[R1]], #16
-; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R2]]:128]
-; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-
-; REALIGN: add r[[R1:[0-9]+]], r[[R0:0]], #48
-; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; REALIGN: add r[[R1:[0-9]+]], r[[R0]], #32
-; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R1]]:128]
-; REALIGN: vst1.32 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]!
-; REALIGN: vst1.64 {{{d[0-9]+, d[0-9]+}}}, [r[[R0]]:128]
- %retval = alloca <16 x float>, align 16
+%retval = alloca <16 x float>, align 16
  %0 = load <16 x float>, <16 x float>* @T3_retval, align 16
  store <16 x float> %0, <16 x float>* %retval
  %1 = load <16 x float>, <16 x float>* %retval
diff --git a/test/CodeGen/ARM/arg-copy-elide.ll b/test/CodeGen/ARM/arg-copy-elide.ll
new file mode 100644
index 0000000000000000000000000000000000000000..739b560b0833f890edb4e36d6a19280441ced9da
--- /dev/null
+++ b/test/CodeGen/ARM/arg-copy-elide.ll
@@ -0,0 +1,61 @@
+; RUN: llc -mtriple=armv7-linux < %s | FileCheck %s
+
+declare arm_aapcscc void @addrof_i32(i32*)
+declare arm_aapcscc void @addrof_i64(i64*)
+
+define arm_aapcscc void @simple(i32, i32, i32, i32, i32 %x) {
+entry:
+  %x.addr = alloca i32
+  store i32 %x, i32* %x.addr
+  call void @addrof_i32(i32* %x.addr)
+  ret void
+}
+
+; CHECK-LABEL: simple:
+; CHECK: push {r11, lr}
+; CHECK: add r0, sp, #8
+; CHECK: bl addrof_i32
+; CHECK: pop {r11, pc}
+
+
+; Because the copy of %x is elided, addrof_i32 receives the argument's own stack
+; slot and could mutate %x in place, so %x must be loaded before the call.
+
+define arm_aapcscc i32 @use_arg(i32, i32, i32, i32, i32 %x) {
+entry:
+  %x.addr = alloca i32
+  store i32 %x, i32* %x.addr
+  call void @addrof_i32(i32* %x.addr)
+  ret i32 %x
+}
+
+; CHECK-LABEL: use_arg:
+; CHECK: push {[[csr:[^ ]*]], lr}
+; CHECK: ldr [[csr]], [sp, #8]
+; CHECK: add r0, sp, #8
+; CHECK: bl addrof_i32
+; CHECK: mov r0, [[csr]]
+; CHECK: pop {[[csr]], pc}
+
+
+define arm_aapcscc i64 @split_i64(i32, i32, i32, i32, i64 %x) {
+entry:
+  %x.addr = alloca i64, align 4
+  store i64 %x, i64* %x.addr, align 4
+  call void @addrof_i64(i64* %x.addr)
+  ret i64 %x
+}
+
+; CHECK-LABEL: split_i64:
+; CHECK: push    {r4, r5, r11, lr}
+; CHECK: sub     sp, sp, #8
+; CHECK: ldr     r4, [sp, #28]
+; CHECK: ldr     r5, [sp, #24]
+; CHECK: mov     r0, sp
+; CHECK: str     r4, [sp, #4]
+; CHECK: str     r5, [sp]
+; CHECK: bl      addrof_i64
+; CHECK: mov     r0, r5
+; CHECK: mov     r1, r4
+; CHECK: add     sp, sp, #8
+; CHECK: pop     {r4, r5, r11, pc}
diff --git a/test/CodeGen/ARM/arm-and-tst-peephole.ll b/test/CodeGen/ARM/arm-and-tst-peephole.ll
index 9bd2077e4d0372dfde0ce233deae5c3b21b27e5a..31691e9468c9ece60851c62ff0f35951f2a8946d 100644
--- a/test/CodeGen/ARM/arm-and-tst-peephole.ll
+++ b/test/CodeGen/ARM/arm-and-tst-peephole.ll
@@ -1,7 +1,6 @@
 ; RUN: llc -mtriple=arm-eabi -arm-atomic-cfg-tidy=0 %s -o - | FileCheck -check-prefix=ARM %s
 ; RUN: llc -mtriple=thumb-eabi -arm-atomic-cfg-tidy=0 %s -o - | FileCheck -check-prefix=THUMB %s
-; RUN: llc -mtriple=thumb-eabi -arm-atomic-cfg-tidy=0 -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - \
-; RUN:   | FileCheck -check-prefix=T2 %s
+; RUN: llc -mtriple=thumb-eabi -arm-atomic-cfg-tidy=0 -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck -check-prefix=T2 %s
 ; RUN: llc -mtriple=thumbv8-eabi -arm-atomic-cfg-tidy=0 %s -o - | FileCheck -check-prefix=V8 %s
 
 ; FIXME: The -march=thumb test doesn't change if -disable-peephole is specified.
@@ -49,9 +48,9 @@ tailrecurse.switch:                               ; preds = %tailrecurse
 ; V8-NEXT: beq
 ; V8-NEXT: %tailrecurse.switch
 ; V8: cmp
-; V8-NEXT: bne
-; V8-NEXT: b	
-; The trailing space in the last line checks that the branch is unconditional
+; V8-NEXT: beq
+; V8-NEXT: %sw.epilog
+; V8-NEXT: bx lr
   switch i32 %and, label %sw.epilog [
     i32 1, label %sw.bb
     i32 3, label %sw.bb6
@@ -93,7 +92,7 @@ entry:
   %1 = load i8, i8* %0, align 1
   %2 = zext i8 %1 to i32
 ; ARM: ands
-; THUMB: ands 
+; THUMB: ands
 ; T2: ands
 ; V8: ands
 ; V8-NEXT: beq
@@ -141,19 +140,48 @@ return:                                           ; preds = %bb2, %bb, %entry
 ; folding of unrelated tests (in this case, a TST against r1 was eliminated in
 ; favour of an AND of r0).
 
+define i32 @test_tst_assessment(i32 %a, i32 %b) {
 ; ARM-LABEL: test_tst_assessment:
+; ARM:       @ BB#0:
+; ARM-NEXT:    and r0, r0, #1
+; ARM-NEXT:    tst r1, #1
+; ARM-NEXT:    subne r0, r0, #1
+; ARM-NEXT:    mov pc, lr
+;
 ; THUMB-LABEL: test_tst_assessment:
+; THUMB:       @ BB#0:
+; THUMB-NEXT:    movs r2, r0
+; THUMB-NEXT:    movs r0, #1
+; THUMB-NEXT:    ands r0, r2
+; THUMB-NEXT:    subs r2, r0, #1
+; THUMB-NEXT:    lsls r1, r1, #31
+; THUMB-NEXT:    beq .LBB2_2
+; THUMB-NEXT:  @ BB#1:
+; THUMB-NEXT:    movs r0, r2
+; THUMB-NEXT:  .LBB2_2:
+; THUMB-NEXT:    bx lr
+;
 ; T2-LABEL: test_tst_assessment:
+; T2:       @ BB#0:
+; T2-NEXT:    lsls r1, r1, #31
+; T2-NEXT:    and r0, r0, #1
+; T2-NEXT:    it ne
+; T2-NEXT:    subne r0, #1
+; T2-NEXT:    bx lr
+;
 ; V8-LABEL: test_tst_assessment:
-define i32 @test_tst_assessment(i1 %lhs, i1 %rhs) {
-  %lhs32 = zext i1 %lhs to i32
-  %rhs32 = zext i1 %rhs to i32
-  %diff = sub nsw i32 %lhs32, %rhs32
-; ARM: tst r1, #1
-; THUMB: lsls r1, r1, #31
-; T2: lsls r1, r1, #31
-; V8: lsls r1, r1, #31
-  ret i32 %diff
+; V8:       @ BB#0:
+; V8-NEXT:    lsls r1, r1, #31
+; V8-NEXT:    and r0, r0, #1
+; V8-NEXT:    it ne
+; V8-NEXT:    subne r0, #1
+; V8-NEXT:    bx lr
+  %and1 = and i32 %a, 1
+  %sub = sub i32 %and1, 1
+  %and2 = and i32 %b, 1
+  %cmp = icmp eq i32 %and2, 0
+  %sel = select i1 %cmp, i32 %and1, i32 %sub
+  ret i32 %sel
 }
 
 !1 = !{!"branch_weights", i32 1, i32 1, i32 3, i32 2 }
diff --git a/test/CodeGen/ARM/arm-position-independence.ll b/test/CodeGen/ARM/arm-position-independence.ll
index 02a63984ad6f443b690dccc17dbf07da63a4fad1..4aa817f7a481469022859d307a75dd3d05046710 100644
--- a/test/CodeGen/ARM/arm-position-independence.ll
+++ b/test/CodeGen/ARM/arm-position-independence.ll
@@ -13,6 +13,12 @@
 ; RUN: llc -relocation-model=rwpi      -mtriple=thumbv6m--none-eabi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB1_RO_ABS --check-prefix=THUMB1_RW_SB
 ; RUN: llc -relocation-model=ropi-rwpi -mtriple=thumbv6m--none-eabi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=THUMB1_RO_PC  --check-prefix=THUMB1_RW_SB
 
+; RUN: llc -relocation-model=rwpi      -mtriple=armv7a--none-eabi   -mattr=no-movt < %s | FileCheck %s --check-prefix=CHECK --check-prefix=NO_MOVT_ARM_RO_ABS --check-prefix=NO_MOVT_ARM_RW_SB
+; RUN: llc -relocation-model=ropi-rwpi -mtriple=armv7a--none-eabi   -mattr=no-movt < %s | FileCheck %s --check-prefix=CHECK --check-prefix=NO_MOVT_ARM_RO_PC  --check-prefix=NO_MOVT_ARM_RW_SB
+
+; RUN: llc -relocation-model=rwpi      -mtriple=thumbv7m--none-eabi -mattr=no-movt < %s | FileCheck %s --check-prefix=CHECK --check-prefix=NO_MOVT_THUMB2_RO_ABS  --check-prefix=NO_MOVT_THUMB2_RW_SB
+; RUN: llc -relocation-model=ropi-rwpi -mtriple=thumbv7m--none-eabi -mattr=no-movt < %s | FileCheck %s --check-prefix=CHECK --check-prefix=NO_MOVT_THUMB2_RO_PC  --check-prefix=NO_MOVT_THUMB2_RW_SB
+
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 
 @a = external global i32, align 4
@@ -28,16 +34,24 @@ entry:
 ; ARM_RW_ABS: movt    r[[REG]], :upper16:a
 ; ARM_RW_ABS: ldr     r0, [r[[REG]]]
 
-; ARM_RW_SB: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
+; ARM_RW_SB: movw    r[[REG:[0-9]]], :lower16:a(sbrel)
+; ARM_RW_SB: movt    r[[REG]], :upper16:a(sbrel)
 ; ARM_RW_SB: ldr     r0, [r9, r[[REG]]]
 
+; NO_MOVT_ARM_RW_SB: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
+; NO_MOVT_ARM_RW_SB: ldr     r0, [r9, r[[REG]]]
+
 ; THUMB2_RW_ABS: movw    r[[REG:[0-9]]], :lower16:a
 ; THUMB2_RW_ABS: movt    r[[REG]], :upper16:a
 ; THUMB2_RW_ABS: ldr     r0, [r[[REG]]]
 
-; THUMB2_RW_SB: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
+; THUMB2_RW_SB: movw    r[[REG:[0-9]]], :lower16:a(sbrel)
+; THUMB2_RW_SB: movt    r[[REG]], :upper16:a(sbrel)
 ; THUMB2_RW_SB: ldr.w   r0, [r9, r[[REG]]]
 
+; NO_MOVT_THUMB2_RW_SB: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
+; NO_MOVT_THUMB2_RW_SB: ldr.w   r0, [r9, r[[REG]]]
+
 ; THUMB1_RW_ABS: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
 ; THUMB1_RW_ABS: ldr     r0, [r[[REG]]]
 
@@ -47,11 +61,11 @@ entry:
 
 ; CHECK: {{(bx lr|pop)}}
 
-; ARM_RW_SB: [[LCPI]]
-; ARM_RW_SB: .long   a(sbrel)
+; NO_MOVT_ARM_RW_SB: [[LCPI]]
+; NO_MOVT_ARM_RW_SB: .long   a(sbrel)
 
-; THUMB2_RW_SB: [[LCPI]]
-; THUMB2_RW_SB: .long   a(sbrel)
+; NO_MOVT_THUMB2_RW_SB: [[LCPI]]
+; NO_MOVT_THUMB2_RW_SB: .long   a(sbrel)
 
 ; THUMB1_RW_ABS: [[LCPI]]
 ; THUMB1_RW_ABS-NEXT: .long a
@@ -70,16 +84,24 @@ entry:
 ; ARM_RW_ABS: movt    r[[REG]], :upper16:a
 ; ARM_RW_ABS: str     r0, [r[[REG:[0-9]]]]
 
-; ARM_RW_SB: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
-; ARM_RW_SB: str     r0, [r9, r[[REG]]]
+; ARM_RW_SB: movw    r[[REG:[0-9]]], :lower16:a(sbrel)
+; ARM_RW_SB: movt    r[[REG]], :upper16:a(sbrel)
+; ARM_RW_SB: str     r0, [r9, r[[REG]]]
+
+; NO_MOVT_ARM_RW_SB: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
+; NO_MOVT_ARM_RW_SB: str     r0, [r9, r[[REG]]]
 
 ; THUMB2_RW_ABS: movw    r[[REG:[0-9]]], :lower16:a
 ; THUMB2_RW_ABS: movt    r[[REG]], :upper16:a
 ; THUMB2_RW_ABS: str     r0, [r[[REG]]]
 
-; THUMB2_RW_SB: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
+; THUMB2_RW_SB: movw    r[[REG:[0-9]]], :lower16:a(sbrel)
+; THUMB2_RW_SB: movt    r[[REG]], :upper16:a(sbrel)
 ; THUMB2_RW_SB: str.w   r0, [r9, r[[REG]]]
 
+; NO_MOVT_THUMB2_RW_SB: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
+; NO_MOVT_THUMB2_RW_SB: str.w   r0, [r9, r[[REG]]]
+
 ; THUMB1_RW_ABS: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
 ; THUMB1_RW_ABS: str     r0, [r[[REG]]]
 
@@ -89,11 +111,11 @@ entry:
 
 ; CHECK: {{(bx lr|pop)}}
 
-; ARM_RW_SB: [[LCPI]]
-; ARM_RW_SB: .long   a(sbrel)
+; NO_MOVT_ARM_RW_SB: [[LCPI]]
+; NO_MOVT_ARM_RW_SB: .long   a(sbrel)
 
-; THUMB2_RW_SB: [[LCPI]]
-; THUMB2_RW_SB: .long   a(sbrel)
+; NO_MOVT_THUMB2_RW_SB: [[LCPI]]
+; NO_MOVT_THUMB2_RW_SB: .long   a(sbrel)
 
 ; THUMB1_RW_ABS: [[LCPI]]
 ; THUMB1_RW_ABS-NEXT: .long a
@@ -112,21 +134,37 @@ entry:
 ; ARM_RO_ABS: movt    r[[reg]], :upper16:b
 ; ARM_RO_ABS: ldr     r0, [r[[reg]]]
 
+; NO_MOVT_ARM_RO_ABS: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
+; NO_MOVT_ARM_RO_ABS: ldr     r0, [r[[REG]]]
+
 ; ARM_RO_PC: movw    r[[REG:[0-9]]], :lower16:(b-([[LPC:.LPC[0-9]+_[0-9]+]]+8))
 ; ARM_RO_PC: movt    r[[REG]], :upper16:(b-([[LPC]]+8))
 ; ARM_RO_PC: [[LPC]]:
 ; ARM_RO_PC-NEXT: ldr     r0, [pc, r[[REG]]]
 
+; NO_MOVT_ARM_RO_PC: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
+; NO_MOVT_ARM_RO_PC: [[LPC:.LPC[0-9]+_[0-9]+]]:
+; NO_MOVT_ARM_RO_PC: ldr     r0, [pc, r[[REG]]]
+
 ; THUMB2_RO_ABS: movw    r[[REG:[0-9]]], :lower16:b
 ; THUMB2_RO_ABS: movt    r[[REG]], :upper16:b
 ; THUMB2_RO_ABS: ldr     r0, [r[[REG]]]
 
+; NO_MOVT_THUMB2_RO_ABS: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
+; NO_MOVT_THUMB2_RO_ABS: ldr     r0, [r[[REG]]]
+
 ; THUMB2_RO_PC: movw    r[[REG:[0-9]]], :lower16:(b-([[LPC:.LPC[0-9]+_[0-9]+]]+4))
 ; THUMB2_RO_PC: movt    r[[REG]], :upper16:(b-([[LPC]]+4))
 ; THUMB2_RO_PC: [[LPC]]:
 ; THUMB2_RO_PC-NEXT: add     r[[REG]], pc
 ; THUMB2_RO_PC: ldr     r0, [r[[REG]]]
 
+; NO_MOVT_THUMB2_RO_PC: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
+; NO_MOVT_THUMB2_RO_PC: [[LPC:.LPC[0-9]+_[0-9]+]]:
+; NO_MOVT_THUMB2_RO_PC-NEXT: add     r[[REG]], pc
+; NO_MOVT_THUMB2_RO_PC: ldr     r0, [r[[REG]]]
+
+
 ; THUMB1_RO_ABS: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
 ; THUMB1_RO_ABS: ldr     r0, [r[[REG]]]
 
@@ -137,9 +175,21 @@ entry:
 
 ; CHECK: {{(bx lr|pop)}}
 
+; NO_MOVT_ARM_RO_ABS: [[LCPI]]
+; NO_MOVT_ARM_RO_ABS-NEXT: .long b
+
+; NO_MOVT_THUMB2_RO_ABS: [[LCPI]]
+; NO_MOVT_THUMB2_RO_ABS-NEXT: .long b
+
 ; THUMB1_RO_ABS: [[LCPI]]
 ; THUMB1_RO_ABS-NEXT: .long b
 
+; NO_MOVT_ARM_RO_PC: [[LCPI]]
+; NO_MOVT_ARM_RO_PC-NEXT: .long b-([[LPC]]+8)
+
+; NO_MOVT_THUMB2_RO_PC: [[LCPI]]
+; NO_MOVT_THUMB2_RO_PC-NEXT: .long b-([[LPC]]+4)
+
 ; THUMB1_RO_PC: [[LCPI]]
 ; THUMB1_RO_PC-NEXT: .long b-([[LPC]]+4)
 }
@@ -152,15 +202,23 @@ entry:
 ; ARM_RW_ABS: movw    r[[REG:[0-9]]], :lower16:a
 ; ARM_RW_ABS: movt    r[[REG]], :upper16:a
 
-; ARM_RW_SB: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
+; ARM_RW_SB: movw    r[[REG:[0-9]]], :lower16:a(sbrel)
+; ARM_RW_SB: movt    r[[REG]], :upper16:a(sbrel)
 ; ARM_RW_SB: add     r0, r9, r[[REG]]
 
+; NO_MOVT_ARM_RW_SB: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
+; NO_MOVT_ARM_RW_SB: add     r0, r9, r[[REG]]
+
 ; THUMB2_RW_ABS: movw    r[[REG:[0-9]]], :lower16:a
 ; THUMB2_RW_ABS: movt    r[[REG]], :upper16:a
 
-; THUMB2_RW_SB: ldr     r0, [[LCPI:.LCPI[0-9]+_[0-9]+]]
+; THUMB2_RW_SB: movw    r[[REG:[0-9]]], :lower16:a(sbrel)
+; THUMB2_RW_SB: movt    r[[REG]], :upper16:a(sbrel)
 ; THUMB2_RW_SB: add     r0, r9
 
+; NO_MOVT_THUMB2_RW_SB: ldr     r0, [[LCPI:.LCPI[0-9]+_[0-9]+]]
+; NO_MOVT_THUMB2_RW_SB: add     r0, r9
+
 ; THUMB1_RW_ABS: ldr     r0, [[LCPI:.LCPI[0-9]+_[0-9]+]]
 
 ; THUMB1_RW_SB: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
@@ -169,11 +227,11 @@ entry:
 
 ; CHECK: {{(bx lr|pop)}}
 
-; ARM_RW_SB: [[LCPI]]
-; ARM_RW_SB: .long   a(sbrel)
+; NO_MOVT_ARM_RW_SB: [[LCPI]]
+; NO_MOVT_ARM_RW_SB: .long   a(sbrel)
 
-; THUMB2_RW_SB: [[LCPI]]
-; THUMB2_RW_SB: .long   a(sbrel)
+; NO_MOVT_THUMB2_RW_SB: [[LCPI]]
+; NO_MOVT_THUMB2_RW_SB: .long   a(sbrel)
 
 ; THUMB1_RW_ABS: [[LCPI]]
 ; THUMB1_RW_ABS-NEXT: .long a
@@ -190,19 +248,31 @@ entry:
 ; ARM_RO_ABS: movw    r[[REG:[0-9]]], :lower16:b
 ; ARM_RO_ABS: movt    r[[REG]], :upper16:b
 
+; NO_MOVT_ARM_RO_ABS: ldr     r0, [[LCPI:.LCPI[0-9]+_[0-9]+]]
+
 ; ARM_RO_PC: movw    r[[REG:[0-9]]], :lower16:(b-([[LPC:.LPC[0-9]+_[0-9]+]]+8))
 ; ARM_RO_PC: movt    r[[REG]], :upper16:(b-([[LPC]]+8))
 ; ARM_RO_PC: [[LPC]]:
 ; ARM_RO_PC-NEXT: add     r0, pc, r[[REG:[0-9]]]
 
+; NO_MOVT_ARM_RO_PC: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
+; NO_MOVT_ARM_RO_PC: [[LPC:.LPC[0-9]+_[0-9]+]]:
+; NO_MOVT_ARM_RO_PC-NEXT: add     r0, pc, r[[REG]]
+
 ; THUMB2_RO_ABS: movw    r[[REG:[0-9]]], :lower16:b
 ; THUMB2_RO_ABS: movt    r[[REG]], :upper16:b
 
+; NO_MOVT_THUMB2_RO_ABS: ldr     r0, [[LCPI:.LCPI[0-9]+_[0-9]+]]
+
 ; THUMB2_RO_PC: movw    r0, :lower16:(b-([[LPC:.LPC[0-9]+_[0-9]+]]+4))
 ; THUMB2_RO_PC: movt    r0, :upper16:(b-([[LPC]]+4))
 ; THUMB2_RO_PC: [[LPC]]:
 ; THUMB2_RO_PC-NEXT: add     r0, pc
 
+; NO_MOVT_THUMB2_RO_PC: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
+; NO_MOVT_THUMB2_RO_PC: [[LPC:.LPC[0-9]+_[0-9]+]]:
+; NO_MOVT_THUMB2_RO_PC-NEXT: add     r[[REG]], pc
+
 ; THUMB1_RO_ABS: ldr     r0, [[LCPI:.LCPI[0-9]+_[0-9]+]]
 
 ; THUMB1_RO_PC: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
@@ -211,9 +281,21 @@ entry:
 
 ; CHECK: {{(bx lr|pop)}}
 
+; NO_MOVT_ARM_RO_ABS: [[LCPI]]
+; NO_MOVT_ARM_RO_ABS-NEXT: .long b
+
+; NO_MOVT_THUMB2_RO_ABS: [[LCPI]]
+; NO_MOVT_THUMB2_RO_ABS-NEXT: .long b
+
 ; THUMB1_RO_ABS: [[LCPI]]
 ; THUMB1_RO_ABS-NEXT: .long b
 
+; NO_MOVT_ARM_RO_PC: [[LCPI]]
+; NO_MOVT_ARM_RO_PC-NEXT: .long b-([[LPC]]+8)
+
+; NO_MOVT_THUMB2_RO_PC: [[LCPI]]
+; NO_MOVT_THUMB2_RO_PC-NEXT: .long b-([[LPC]]+4)
+
 ; THUMB1_RO_PC: [[LCPI]]
 ; THUMB1_RO_PC-NEXT: .long b-([[LPC]]+4)
 }
@@ -226,19 +308,31 @@ entry:
 ; ARM_RO_ABS: movw    r[[REG:[0-9]]], :lower16:take_addr_func
 ; ARM_RO_ABS: movt    r[[REG]], :upper16:take_addr_func
 
+; NO_MOVT_ARM_RO_ABS: ldr     r0, [[LCPI:.LCPI[0-9]+_[0-9]+]]
+
 ; ARM_RO_PC: movw    r[[REG:[0-9]]], :lower16:(take_addr_func-([[LPC:.LPC[0-9]+_[0-9]+]]+8))
 ; ARM_RO_PC: movt    r[[REG]], :upper16:(take_addr_func-([[LPC]]+8))
 ; ARM_RO_PC: [[LPC]]:
 ; ARM_RO_PC-NEXT: add     r0, pc, r[[REG:[0-9]]]
 
+; NO_MOVT_ARM_RO_PC: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
+; NO_MOVT_ARM_RO_PC: [[LPC:.LPC[0-9]+_[0-9]+]]:
+; NO_MOVT_ARM_RO_PC-NEXT: add     r0, pc, r[[REG]]
+
 ; THUMB2_RO_ABS: movw    r[[REG:[0-9]]], :lower16:take_addr_func
 ; THUMB2_RO_ABS: movt    r[[REG]], :upper16:take_addr_func
 
+; NO_MOVT_THUMB2_RO_ABS: ldr     r0, [[LCPI:.LCPI[0-9]+_[0-9]+]]
+
 ; THUMB2_RO_PC: movw    r0, :lower16:(take_addr_func-([[LPC:.LPC[0-9]+_[0-9]+]]+4))
 ; THUMB2_RO_PC: movt    r0, :upper16:(take_addr_func-([[LPC]]+4))
 ; THUMB2_RO_PC: [[LPC]]:
 ; THUMB2_RO_PC-NEXT: add     r0, pc
 
+; NO_MOVT_THUMB2_RO_PC: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
+; NO_MOVT_THUMB2_RO_PC: [[LPC:.LPC[0-9]+_[0-9]+]]:
+; NO_MOVT_THUMB2_RO_PC-NEXT: add     r[[REG]], pc
+
 ; THUMB1_RO_ABS: ldr     r0, [[LCPI:.LCPI[0-9]+_[0-9]+]]
 
 ; THUMB1_RO_PC: ldr     r[[REG:[0-9]]], [[LCPI:.LCPI[0-9]+_[0-9]+]]
@@ -247,9 +341,21 @@ entry:
 
 ; CHECK: {{(bx lr|pop)}}
 
+; NO_MOVT_ARM_RO_ABS: [[LCPI]]
+; NO_MOVT_ARM_RO_ABS-NEXT: .long take_addr_func
+
+; NO_MOVT_THUMB2_RO_ABS: [[LCPI]]
+; NO_MOVT_THUMB2_RO_ABS-NEXT: .long take_addr_func
+
 ; THUMB1_RO_ABS: [[LCPI]]
 ; THUMB1_RO_ABS-NEXT: .long take_addr_func
 
+; NO_MOVT_ARM_RO_PC: [[LCPI]]
+; NO_MOVT_ARM_RO_PC-NEXT: .long take_addr_func-([[LPC]]+8)
+
+; NO_MOVT_THUMB2_RO_PC: [[LCPI]]
+; NO_MOVT_THUMB2_RO_PC-NEXT: .long take_addr_func-([[LPC]]+4)
+
 ; THUMB1_RO_PC: [[LCPI]]
 ; THUMB1_RO_PC-NEXT: .long take_addr_func-([[LPC]]+4)
 }
diff --git a/test/CodeGen/ARM/atomic-cmpxchg.ll b/test/CodeGen/ARM/atomic-cmpxchg.ll
index 364bd5d13691e9b8d0038ef840043727bd8d35fe..e026bae361e19b32ffe502b85514f8bd74a52384 100644
--- a/test/CodeGen/ARM/atomic-cmpxchg.ll
+++ b/test/CodeGen/ARM/atomic-cmpxchg.ll
@@ -24,14 +24,12 @@ entry:
 ; CHECK-THUMB-LABEL: test_cmpxchg_res_i8
 ; CHECK-THUMB: bl __sync_val_compare_and_swap_1
 ; CHECK-THUMB-NOT: mov [[R1:r[0-7]]], r0
-; CHECK-THUMB: push  {r0}
-; CHECK-THUMB: pop {[[R1:r[0-7]]]}
+; CHECK-THUMB: movs [[R1:r[0-7]]], r0
 ; CHECK-THUMB: movs r0, #1
 ; CHECK-THUMB: movs [[R2:r[0-9]+]], #0
 ; CHECK-THUMB: cmp [[R1]], {{r[0-9]+}}
 ; CHECK-THUMB: beq
-; CHECK-THUMB: push  {[[R2]]}
-; CHECK-THUMB: pop {r0}
+; CHECK-THUMB: movs r0, [[R2]]
 
 ; CHECK-ARMV6-LABEL: test_cmpxchg_res_i8:
 ; CHECK-ARMV6-NEXT:  .fnstart
@@ -66,14 +64,14 @@ entry:
 ; CHECK-ARMV7-NEXT: [[HEAD:.LBB[0-9_]+]]:
 ; CHECK-ARMV7-NEXT: strexb [[SUCCESS:r[0-9]+]], r2, [r0]
 ; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], #0
-; CHECK-ARMV7-NEXT: moveq [[RES:r[0-9]+]], #1
+; CHECK-ARMV7-NEXT: moveq r0, #1
 ; CHECK-ARMV7-NEXT: bxeq lr
 ; CHECK-ARMV7-NEXT: [[TRY]]:
-; CHECK-ARMV7-NEXT: ldrexb [[LD:r[0-9]+]], [r0]
-; CHECK-ARMV7-NEXT: cmp [[LD]], [[DESIRED]]
+; CHECK-ARMV7-NEXT: ldrexb [[SUCCESS]], [r0]
+; CHECK-ARMV7-NEXT: cmp [[SUCCESS]], r1
 ; CHECK-ARMV7-NEXT: beq [[HEAD]]
 ; CHECK-ARMV7-NEXT: clrex
-; CHECK-ARMV7-NEXT: mov [[RES]], #0
+; CHECK-ARMV7-NEXT: mov r0, #0
 ; CHECK-ARMV7-NEXT: bx lr
 
 ; CHECK-THUMBV7-LABEL: test_cmpxchg_res_i8:
diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll
index e6a4949d53ce4c429941234127617b06eaac294e..23c4ccea460460c0302efc7b69964bd8e6b31b4d 100644
--- a/test/CodeGen/ARM/atomic-op.ll
+++ b/test/CodeGen/ARM/atomic-op.ll
@@ -320,10 +320,10 @@ define i32 @test_cmpxchg_fail_order1(i32 *%addr, i32 %desired, i32 %new) {
 ; CHECK:     strex   [[SUCCESS:r[0-9]+]], r2, [r[[ADDR]]]
 ; CHECK:     cmp     [[SUCCESS]], #0
 ; CHECK:     bne     [[LOOP_BB]]
-; CHECK:     b       [[END_BB:\.?LBB[0-9]+_[0-9]+]]
+; CHECK:     dmb     ish
+; CHECK:     bx      lr
 ; CHECK: [[FAIL_BB]]:
 ; CHECK-NEXT: clrex
-; CHECK-NEXT: [[END_BB]]:
 ; CHECK:     dmb     ish
 ; CHECK:     bx      lr
 
diff --git a/test/CodeGen/ARM/atomic-ops-v8.ll b/test/CodeGen/ARM/atomic-ops-v8.ll
index 77b850bd617b8e1e18d75c6ede9451b493e425e4..d1575ed12e4e18adb5944d72149161fedbd45a7a 100644
--- a/test/CodeGen/ARM/atomic-ops-v8.ll
+++ b/test/CodeGen/ARM/atomic-ops-v8.ll
@@ -1045,20 +1045,21 @@ define i8 @test_atomic_cmpxchg_i8(i8 zeroext %wanted, i8 zeroext %new) nounwind
   ;  function there.
 ; CHECK-ARM-NEXT:   cmp r[[OLD]], r0
 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]]
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_4
 ; CHECK-NEXT: BB#2:
   ; As above, r1 is a reasonable guess.
 ; CHECK: strexb [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
 ; CHECK-NEXT: cmp [[STATUS]], #0
 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NEXT: b .LBB{{[0-9]+}}_4
-; CHECK-NEXT: .LBB{{[0-9]+}}_3:
-; CHECK-NEXT: clrex
+; CHECK-ARM: mov r0, r[[OLD]]
+; CHECK: bx lr
 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
+; CHECK-NEXT: clrex
 ; CHECK-NOT: dmb
 ; CHECK-NOT: mcr
 
 ; CHECK-ARM: mov r0, r[[OLD]]
+; CHECK-ARM-NEXT: bx lr
    ret i8 %old
 }
 
@@ -1078,20 +1079,21 @@ define i16 @test_atomic_cmpxchg_i16(i16 zeroext %wanted, i16 zeroext %new) nounw
   ;  function there.
 ; CHECK-ARM-NEXT:   cmp r[[OLD]], r0
 ; CHECK-THUMB-NEXT: cmp r[[OLD]], r[[WANTED]]
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_4
 ; CHECK-NEXT: BB#2:
   ; As above, r1 is a reasonable guess.
 ; CHECK: stlexh [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
 ; CHECK-NEXT: cmp [[STATUS]], #0
 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NEXT: b .LBB{{[0-9]+}}_4
-; CHECK-NEXT: .LBB{{[0-9]+}}_3:
-; CHECK-NEXT: clrex
+; CHECK-ARM: mov r0, r[[OLD]]
+; CHECK: bx lr
 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
+; CHECK-NEXT: clrex
 ; CHECK-NOT: dmb
 ; CHECK-NOT: mcr
 
 ; CHECK-ARM: mov r0, r[[OLD]]
+; CHECK-ARM-NEXT: bx lr
    ret i16 %old
 }
 
@@ -1110,20 +1112,21 @@ define void @test_atomic_cmpxchg_i32(i32 %wanted, i32 %new) nounwind {
   ; r0 below is a reasonable guess but could change: it certainly comes into the
   ;  function there.
 ; CHECK-NEXT: cmp r[[OLD]], r0
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_4
 ; CHECK-NEXT: BB#2:
   ; As above, r1 is a reasonable guess.
 ; CHECK: stlex [[STATUS:r[0-9]+]], r1, [r[[ADDR]]]
 ; CHECK-NEXT: cmp [[STATUS]], #0
 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NEXT: b .LBB{{[0-9]+}}_4
-; CHECK-NEXT: .LBB{{[0-9]+}}_3:
-; CHECK-NEXT: clrex
+; CHECK: str{{(.w)?}} r[[OLD]],
+; CHECK-NEXT: bx lr
 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
+; CHECK-NEXT: clrex
 ; CHECK-NOT: dmb
 ; CHECK-NOT: mcr
 
 ; CHECK: str{{(.w)?}} r[[OLD]],
+; CHECK-ARM-NEXT: bx lr
    ret void
 }
 
@@ -1148,16 +1151,16 @@ define void @test_atomic_cmpxchg_i64(i64 %wanted, i64 %new) nounwind {
 ; CHECK-BE-DAG: eor{{(\.w)?}} [[MISMATCH_LO:r[0-9]+|lr]], [[OLD1]], r0
 ; CHECK-ARM-BE: orrs{{(\.w)?}} {{r[0-9]+}}, [[MISMATCH_HI]], [[MISMATCH_LO]]
 ; CHECK-THUMB-BE: orrs{{(\.w)?}} {{(r[0-9]+, )?}}[[MISMATCH_LO]], [[MISMATCH_HI]]
-; CHECK-NEXT: bne .LBB{{[0-9]+}}_3
+; CHECK-NEXT: bne .LBB{{[0-9]+}}_4
 ; CHECK-NEXT: BB#2:
   ; As above, r2, r3 is a reasonable guess.
 ; CHECK: strexd [[STATUS:r[0-9]+]], r2, r3, [r[[ADDR]]]
 ; CHECK-NEXT: cmp [[STATUS]], #0
 ; CHECK-NEXT: bne .LBB{{[0-9]+}}_1
-; CHECK-NEXT: b .LBB{{[0-9]+}}_4
-; CHECK-NEXT: .LBB{{[0-9]+}}_3:
-; CHECK-NEXT: clrex
+; CHECK: strd [[OLD1]], [[OLD2]], [r[[ADDR]]]
+; CHECK-NEXT: pop
 ; CHECK-NEXT: .LBB{{[0-9]+}}_4:
+; CHECK-NEXT: clrex
 ; CHECK-NOT: dmb
 ; CHECK-NOT: mcr
 
diff --git a/test/CodeGen/ARM/bfi.ll b/test/CodeGen/ARM/bfi.ll
index 893fef3add7e10389eb940c74ff465c4de59a74b..31eff16fcc3c4f34bbc5546b7997486b5c7aa2b3 100644
--- a/test/CodeGen/ARM/bfi.ll
+++ b/test/CodeGen/ARM/bfi.ll
@@ -77,7 +77,7 @@ entry:
 
 define i32 @f7(i32 %x, i32 %y) {
 ; CHECK-LABEL: f7:
-; CHECK: bfi r1, r0, #4, #1
+; CHECK: bfi r0, r2, #4, #1
   %y2 = and i32 %y, 4294967040 ; 0xFFFFFF00
   %and = and i32 %x, 4
   %or = or i32 %y2, 16
@@ -88,8 +88,8 @@ define i32 @f7(i32 %x, i32 %y) {
 
 define i32 @f8(i32 %x, i32 %y) {
 ; CHECK-LABEL: f8:
-; CHECK: bfi r1, r0, #4, #1
-; CHECK: bfi r1, r0, #5, #1
+; CHECK: bfi r0, r2, #4, #1
+; CHECK: bfi r0, r2, #5, #1
   %y2 = and i32 %y, 4294967040 ; 0xFFFFFF00
   %and = and i32 %x, 4
   %or = or i32 %y2, 48
@@ -111,7 +111,7 @@ define i32 @f9(i32 %x, i32 %y) {
 
 define i32 @f10(i32 %x, i32 %y) {
 ; CHECK-LABEL: f10:
-; CHECK: bfi r1, r0, #4, #2
+; CHECK: bfi r0, r2, #4, #2
   %y2 = and i32 %y, 4294967040 ; 0xFFFFFF00
   %and = and i32 %x, 4
   %or = or i32 %y2, 32
@@ -128,7 +128,7 @@ define i32 @f10(i32 %x, i32 %y) {
 
 define i32 @f11(i32 %x, i32 %y) {
 ; CHECK-LABEL: f11:
-; CHECK: bfi r1, r0, #4, #3
+; CHECK: bfi r0, r2, #4, #3
   %y2 = and i32 %y, 4294967040 ; 0xFFFFFF00
   %and = and i32 %x, 4
   %or = or i32 %y2, 32
@@ -150,7 +150,7 @@ define i32 @f11(i32 %x, i32 %y) {
 
 define i32 @f12(i32 %x, i32 %y) {
 ; CHECK-LABEL: f12:
-; CHECK: bfi r1, r0, #4, #1
+; CHECK: bfi r0, r2, #4, #1
   %y2 = and i32 %y, 4294967040 ; 0xFFFFFF00
   %and = and i32 %x, 4
   %or = or i32 %y2, 16
diff --git a/test/CodeGen/ARM/bic.ll b/test/CodeGen/ARM/bic.ll
index 691f8be4ab66b9200bbe9519dbea4577e8b2b1aa..8be59898bd0fe717c5321632af9edb8e0b8be59f 100644
--- a/test/CodeGen/ARM/bic.ll
+++ b/test/CodeGen/ARM/bic.ll
@@ -1,17 +1,24 @@
 ; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
 
 define i32 @f1(i32 %a, i32 %b) {
+; CHECK-LABEL: f1:
+; CHECK: bic	r0, r0, r1
     %tmp = xor i32 %b, 4294967295
     %tmp1 = and i32 %a, %tmp
     ret i32 %tmp1
 }
 
-; CHECK: bic	r0, r0, r1
-
 define i32 @f2(i32 %a, i32 %b) {
+; CHECK-LABEL: f2:
+; CHECK: bic	r0, r0, r1
     %tmp = xor i32 %b, 4294967295
     %tmp1 = and i32 %tmp, %a
     ret i32 %tmp1
 }
 
-; CHECK: bic	r0, r0, r1
+define i32 @f3(i32 %a) {
+; CHECK-LABEL: f3:
+; CHECK: bic r0, r0, #255
+    %tmp = and i32 %a, -256
+    ret i32 %tmp
+}
diff --git a/test/CodeGen/ARM/bool-ext-inc.ll b/test/CodeGen/ARM/bool-ext-inc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fe43f1b2ef93dc13333c5206665bef348e8e9612
--- /dev/null
+++ b/test/CodeGen/ARM/bool-ext-inc.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=arm-eabi -mattr=neon | FileCheck %s
+
+define i32 @sext_inc(i1 zeroext %x) {
+; CHECK-LABEL: sext_inc:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    rsb r0, r0, #1
+; CHECK-NEXT:    mov pc, lr
+  %ext = sext i1 %x to i32
+  %add = add i32 %ext, 1
+  ret i32 %add
+}
+
+define <4 x i32> @sext_inc_vec(<4 x i1> %x) {
+; CHECK-LABEL: sext_inc_vec:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vmov.i32 q9, #0x1f
+; CHECK-NEXT:    vmov.i32 q10, #0x1
+; CHECK-NEXT:    vmovl.u16 q8, d16
+; CHECK-NEXT:    vneg.s32 q9, q9
+; CHECK-NEXT:    vshl.i32 q8, q8, #31
+; CHECK-NEXT:    vshl.s32 q8, q8, q9
+; CHECK-NEXT:    vadd.i32 q8, q8, q10
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
+  %ext = sext <4 x i1> %x to <4 x i32>
+  %add = add <4 x i32> %ext, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %add
+}
+
diff --git a/test/CodeGen/ARM/build-attributes.ll b/test/CodeGen/ARM/build-attributes.ll
index b1b3b46dce24935f99317d4f25fe68ddf926efd2..fc85a3a2e6834d794d2840ac66786b45ec478163 100644
--- a/test/CodeGen/ARM/build-attributes.ll
+++ b/test/CodeGen/ARM/build-attributes.ll
@@ -102,6 +102,10 @@
 ; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -mattr=+fp-only-sp  -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M7-FAST
 ; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 | FileCheck %s --check-prefix=CORTEX-M7-DOUBLE
 ; RUN: llc < %s -mtriple=thumbv7em-linux-gnueabi -mcpu=cortex-m7 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
+; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m23 | FileCheck %s --check-prefix=CORTEX-M23
+; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m33 | FileCheck %s --check-prefix=CORTEX-M33
+; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m33 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-M33-FAST
+; RUN: llc < %s -mtriple=thumbv8-linux-gnueabi -mcpu=cortex-m33 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING
 ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r4 | FileCheck %s --check-prefix=CORTEX-R4
 ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r4f | FileCheck %s --check-prefix=CORTEX-R4F
 ; RUN: llc < %s -mtriple=armv7r-linux-gnueabi -mcpu=cortex-r5 | FileCheck %s --check-prefix=CORTEX-R5
@@ -182,6 +186,8 @@
 ; ARMv7a
 ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
 ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; ARMv7ve
+; RUN: llc < %s -mtriple=armv7ve-none-linux-gnueabi | FileCheck %s --check-prefix=V7VE
 ; ARMv7r
 ; RUN: llc < %s -mtriple=armv7r-none-linux-gnueabi -mcpu=cortex-r5 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
 ; RUN: llc < %s -mtriple=armv7r-none-linux-gnueabi -mcpu=cortex-r5 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
@@ -210,6 +216,12 @@
 ; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-r52 -mattr=-neon,+fp-only-sp,+d16 | FileCheck %s --check-prefix=ARMv8R --check-prefix=ARMv8R-SP
 ; RUN: llc < %s -mtriple=arm-none-none-eabi -mcpu=cortex-r52 | FileCheck %s --check-prefix=ARMv8R --check-prefix=ARMv8R-NEON
 
+; ARMv8-M
+; RUN: llc < %s -mtriple=thumbv8-none-none-eabi -mcpu=cortex-m23 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=thumbv8-none-none-eabi -mcpu=cortex-m23 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+; RUN: llc < %s -mtriple=thumbv8-none-none-eabi -mcpu=cortex-m33 | FileCheck %s --check-prefix=NO-STRICT-ALIGN
+; RUN: llc < %s -mtriple=thumbv8-none-none-eabi -mcpu=cortex-m33 -mattr=+strict-align | FileCheck %s --check-prefix=STRICT-ALIGN
+
 ; XSCALE:      .eabi_attribute 6, 5
 ; XSCALE:      .eabi_attribute 8, 1
 ; XSCALE:      .eabi_attribute 9, 1
@@ -369,6 +381,22 @@
 ; V7-FAST-NOT:   .eabi_attribute 22
 ; V7-FAST:   .eabi_attribute 23, 1
 
+; V7VE:      .syntax unified
+; V7VE: .eabi_attribute 6, 10   @ Tag_CPU_arch
+; V7VE: .eabi_attribute 7, 65   @ Tag_CPU_arch_profile
+; V7VE: .eabi_attribute 8, 1    @ Tag_ARM_ISA_use
+; V7VE: .eabi_attribute 9, 2    @ Tag_THUMB_ISA_use
+; V7VE: .eabi_attribute 17, 1   @ Tag_ABI_PCS_GOT_use
+; V7VE: .eabi_attribute 20, 1   @ Tag_ABI_FP_denormal
+; V7VE: .eabi_attribute 21, 1   @ Tag_ABI_FP_exceptions
+; V7VE: .eabi_attribute 23, 3   @ Tag_ABI_FP_number_model
+; V7VE: .eabi_attribute 24, 1   @ Tag_ABI_align_needed
+; V7VE: .eabi_attribute 25, 1   @ Tag_ABI_align_preserved
+; V7VE: .eabi_attribute 38, 1   @ Tag_ABI_FP_16bit_format
+; V7VE: .eabi_attribute 42, 1   @ Tag_MPextension_use
+; V7VE: .eabi_attribute 44, 2   @ Tag_DIV_use
+; V7VE: .eabi_attribute 68, 3   @ Tag_Virtualization_use
+
 ; V8:      .syntax unified
 ; V8: .eabi_attribute 67, "2.09"
 ; V8: .eabi_attribute 6, 14
@@ -1310,6 +1338,55 @@
 ; CORTEX-A32-FAST-NOT:  .eabi_attribute 22
 ; CORTEX-A32-FAST:  .eabi_attribute 23, 1
 
+; CORTEX-M23:  .cpu cortex-m23
+; CORTEX-M23:  .eabi_attribute 6, 16
+; CORTEX-M23:  .eabi_attribute 7, 77
+; CORTEX-M23:  .eabi_attribute 8, 0
+; CORTEX-M23:  .eabi_attribute 9, 3
+; CORTEX-M23:  .eabi_attribute 17, 1
+;; We default to IEEE 754 compliance
+; CORTEX-M23-NOT:   .eabi_attribute 19
+; CORTEX-M23:  .eabi_attribute 20, 1
+; CORTEX-M23:  .eabi_attribute 21, 1
+; CORTEX-M23:  .eabi_attribute 23, 3
+; CORTEX-M23:  .eabi_attribute 34, 1
+; CORTEX-M23:  .eabi_attribute 24, 1
+; CORTEX-M23-NOT:  .eabi_attribute 27
+; CORTEX-M23-NOT:  .eabi_attribute 28
+; CORTEX-M23:  .eabi_attribute 25, 1
+; CORTEX-M23:  .eabi_attribute 38, 1
+; CORTEX-M23:  .eabi_attribute 14, 0
+; CORTEX-M23-NOT:  .eabi_attribute 44
+
+; CORTEX-M33:  .cpu cortex-m33
+; CORTEX-M33:  .eabi_attribute 6, 17
+; CORTEX-M33:  .eabi_attribute 7, 77
+; CORTEX-M33:  .eabi_attribute 8, 0
+; CORTEX-M33:  .eabi_attribute 9, 3
+; CORTEX-M33:  .fpu fpv5-sp-d16
+; CORTEX-M33:  .eabi_attribute 17, 1
+;; We default to IEEE 754 compliance
+; CORTEX-M33-NOT:   .eabi_attribute 19
+; CORTEX-M33:  .eabi_attribute 20, 1
+; CORTEX-M33:  .eabi_attribute 21, 1
+; CORTEX-M33:  .eabi_attribute 23, 3
+; CORTEX-M33:  .eabi_attribute 34, 1
+; CORTEX-M33:  .eabi_attribute 24, 1
+; CORTEX-M33:  .eabi_attribute 25, 1
+; CORTEX-M33:  .eabi_attribute 27, 1
+; CORTEX-M33-NOT:  .eabi_attribute 28
+; CORTEX-M33:  .eabi_attribute 36, 1
+; CORTEX-M33:  .eabi_attribute 38, 1
+; CORTEX-M33:  .eabi_attribute 46, 1
+; CORTEX-M33-NOT:  .eabi_attribute 44
+; CORTEX-M33:  .eabi_attribute 14, 0
+
+; CORTEX-M33-FAST-NOT:   .eabi_attribute 19
+; CORTEX-M33-FAST:  .eabi_attribute 20, 2
+; CORTEX-M33-FAST-NOT:  .eabi_attribute 21
+; CORTEX-M33-FAST-NOT:  .eabi_attribute 22
+; CORTEX-M33-FAST:  .eabi_attribute 23, 1
+
 ; CORTEX-A35:  .cpu cortex-a35
 ; CORTEX-A35:  .eabi_attribute 6, 14
 ; CORTEX-A35:  .eabi_attribute 7, 65
diff --git a/test/CodeGen/ARM/cmpxchg-weak.ll b/test/CodeGen/ARM/cmpxchg-weak.ll
index 4038528c91bc847d00729ed43553f5476863b575..0d5681aafbcb099b8bcd9b9e25795f20c9f6cf31 100644
--- a/test/CodeGen/ARM/cmpxchg-weak.ll
+++ b/test/CodeGen/ARM/cmpxchg-weak.ll
@@ -13,14 +13,16 @@ define void @test_cmpxchg_weak(i32 *%addr, i32 %desired, i32 %new) {
 ; CHECK-NEXT:     dmb ish
 ; CHECK-NEXT:     strex   [[SUCCESS:r[0-9]+]], r2, [r0]
 ; CHECK-NEXT:     cmp     [[SUCCESS]], #0
-; CHECK-NEXT:     bne     [[FAILBB:LBB[0-9]+_[0-9]+]]
+; CHECK-NEXT:     beq     [[SUCCESSBB:LBB[0-9]+_[0-9]+]]
 ; CHECK-NEXT: BB#2:
-; CHECK-NEXT:     dmb     ish
 ; CHECK-NEXT:     str     r3, [r0]
 ; CHECK-NEXT:     bx      lr
 ; CHECK-NEXT: [[LDFAILBB]]:
 ; CHECK-NEXT:     clrex
-; CHECK-NEXT: [[FAILBB]]:
+; CHECK-NEXT:     str     r3, [r0]
+; CHECK-NEXT:     bx      lr
+; CHECK-NEXT: [[SUCCESSBB]]:
+; CHECK-NEXT:     dmb     ish
 ; CHECK-NEXT:     str     r3, [r0]
 ; CHECK-NEXT:     bx      lr
 
diff --git a/test/CodeGen/ARM/constantpool-promote.ll b/test/CodeGen/ARM/constantpool-promote.ll
index fb1bdfd62fb7c18c75d11c04e64da6033b76ef33..8df7e100c0514d2813b6be7a7f921d282dd4b8d8 100644
--- a/test/CodeGen/ARM/constantpool-promote.ll
+++ b/test/CodeGen/ARM/constantpool-promote.ll
@@ -1,10 +1,15 @@
-; RUN: llc -relocation-model=static < %s | FileCheck %s
-; RUN: llc -relocation-model=pic < %s | FileCheck %s
-; RUN: llc -relocation-model=ropi < %s | FileCheck %s
-; RUN: llc -relocation-model=rwpi < %s | FileCheck %s
-
-target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
-target triple = "armv7--linux-gnueabihf"
+; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=static < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM
+; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=pic < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM
+; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=ropi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM
+; RUN: llc -mtriple armv7--linux-gnueabihf -relocation-model=rwpi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7ARM
+; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=static < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB
+; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=pic < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB
+; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=ropi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB
+; RUN: llc -mtriple thumbv7--linux-gnueabihf -relocation-model=rwpi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V7,CHECK-V7THUMB
+; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=static < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M
+; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=pic < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M
+; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=ropi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M
+; RUN: llc -mtriple thumbv6m--linux-gnueabihf -relocation-model=rwpi < %s | FileCheck %s --check-prefixes=CHECK,CHECK-V6M
 
 @.str = private unnamed_addr constant [2 x i8] c"s\00", align 1
 @.str1 = private unnamed_addr constant [69 x i8] c"this string is far too long to fit in a literal pool by far and away\00", align 1
@@ -16,6 +21,7 @@ target triple = "armv7--linux-gnueabihf"
 @.arr3 = private unnamed_addr constant [2 x i16*] [i16* null, i16* null], align 4
 @.ptr = private unnamed_addr constant [2 x i16*] [i16* getelementptr inbounds ([2 x i16], [2 x i16]* @.arr2, i32 0, i32 0), i16* null], align 2
 @.arr4 = private unnamed_addr constant [2 x i16] [i16 3, i16 4], align 16
+@.zerosize = private unnamed_addr constant [0 x i16] zeroinitializer, align 4
 
 ; CHECK-LABEL: @test1
 ; CHECK: adr r0, [[x:.*]]
@@ -134,18 +140,56 @@ define void @test9() #0 {
   ret void
 }
 
+; Ensure that zero-sized values are supported and are not promoted.
+; CHECK-LABEL: @pr32130
+; CHECK-NOT: adr
+define void @pr32130() #0 {
+  tail call void @c(i16* getelementptr inbounds ([0 x i16], [0 x i16]* @.zerosize, i32 0, i32 0)) #2
+  ret void
+}
+
+; CHECK-LABEL: @test10
+; CHECK-V6M: adr r{{[0-9]*}}, [[x:.*]]
+; CHECK-V6M: [[x]]:
+; CHECK-V6M: .asciz "s\000\000"
+; CHECK-V7: ldrb{{(.w)?}} r{{[0-9]*}}, [[x:.*]]
+; CHECK-V7: [[x]]:
+; CHECK-V7: .asciz "s\000\000"
+define void @test10(i8* %a) local_unnamed_addr #0 {
+  call void @llvm.memmove.p0i8.p0i8.i32(i8* %a, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str, i32 0, i32 0), i32 1, i32 1, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @test11
+; CHECK-V6M: adr r{{[0-9]*}}, [[x:.*]]
+; CHECK-V6M: [[x]]:
+; CHECK-V6M: .short 3
+; CHECK-V6M: .short 4
+; CHECK-V7THUMB: ldrh{{(.w)?}} r{{[0-9]*}}, [[x:.*]]
+; CHECK-V7THUMB: [[x]]:
+; CHECK-V7THUMB: .short 3
+; CHECK-V7THUMB: .short 4
+; CHECK-V7ARM: adr r{{[0-9]*}}, [[x:.*]]
+; CHECK-V7ARM: [[x]]:
+; CHECK-V7ARM: .short 3
+; CHECK-V7ARM: .short 4
+define void @test11(i16* %a) local_unnamed_addr #0 {
+  call void @llvm.memmove.p0i16.p0i16.i32(i16* %a, i16* getelementptr inbounds ([2 x i16], [2 x i16]* @.arr1, i32 0, i32 0), i32 2, i32 2, i1 false)
+  ret void
+}
+
 
 declare void @b(i8*) #1
 declare void @c(i16*) #1
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32, i1)
+declare void @llvm.memmove.p0i8.p0i8.i32(i8*, i8*, i32, i32, i1) local_unnamed_addr
+declare void @llvm.memmove.p0i16.p0i16.i32(i16*, i16*, i32, i32, i1) local_unnamed_addr
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind }
 
 !llvm.module.flags = !{!0, !1}
-!llvm.ident = !{!2}
 
 !0 = !{i32 1, !"wchar_size", i32 4}
 !1 = !{i32 1, !"min_enum_size", i32 4}
-!2 = !{!"Apple LLVM version 6.1.0 (clang-602.0.53) (based on LLVM 3.6.0svn)"}
diff --git a/test/CodeGen/ARM/debug-info-s16-reg.ll b/test/CodeGen/ARM/debug-info-s16-reg.ll
index 2987b9a2105aaed34763b4fdd2602391f15c8988..197746c5f122b1b7d8b424b6f9d6f746f48af035 100644
--- a/test/CodeGen/ARM/debug-info-s16-reg.ll
+++ b/test/CodeGen/ARM/debug-info-s16-reg.ll
@@ -3,8 +3,6 @@
 ; Test dwarf reg no for s16
 ;CHECK: super-register DW_OP_regx
 ;CHECK-NEXT: 264
-;CHECK-NEXT: DW_OP_piece
-;CHECK-NEXT: 4
 
 target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32"
 target triple = "thumbv7-apple-macosx10.6.7"
diff --git a/test/CodeGen/ARM/debug-info-sreg2.ll b/test/CodeGen/ARM/debug-info-sreg2.ll
index b31d1b7bed4f86f18da0245322161fb2a699d76c..094b1049978881e07e449cf20b488acb5b32635d 100644
--- a/test/CodeGen/ARM/debug-info-sreg2.ll
+++ b/test/CodeGen/ARM/debug-info-sreg2.ll
@@ -10,7 +10,7 @@ target triple = "thumbv7-apple-macosx10.6.7"
 
 ; CHECK: 0x00000000: Beginning address offset:
 ; CHECK-NEXT:           Ending address offset:
-; CHECK-NEXT:            Location description: 90 {{.. .. .. .. $}}
+; CHECK-NEXT:            Location description: 90 {{.. .. $}}
 
 define void @_Z3foov() optsize ssp !dbg !1 {
 entry:
diff --git a/test/CodeGen/ARM/div.ll b/test/CodeGen/ARM/div.ll
index 997f50760f3a973798157631b4201d65fbc16b87..8837315197554f511814247dcd43d7bbc978f3bf 100644
--- a/test/CodeGen/ARM/div.ll
+++ b/test/CodeGen/ARM/div.ll
@@ -10,12 +10,18 @@
 ; RUN:     FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-HWDIV
 ; RUN: llc < %s -mtriple=arm-none-eabi -mcpu=cortex-a8    | \
 ; RUN:     FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-EABI
+; RUN: llc < %s -mtriple=armv7ve-none-linux-gnu           | \
+; RUN:     FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-HWDIV
+; RUN: llc < %s -mtriple=thumbv7ve-none-linux-gnu         | \
+; RUN:     FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-HWDIV \
+; RUN:                  -check-prefix=CHECK-THUMB
 
 define i32 @f1(i32 %a, i32 %b) {
 entry:
 ; CHECK-LABEL: f1
 ; CHECK-SWDIV: __divsi3
 
+; CHECK-THUMB: .thumb_func
 ; CHECK-HWDIV: sdiv
 
 ; CHECK-EABI: __aeabi_idiv
@@ -28,6 +34,7 @@ entry:
 ; CHECK-LABEL: f2
 ; CHECK-SWDIV: __udivsi3
 
+; CHECK-THUMB: .thumb_func
 ; CHECK-HWDIV: udiv
 
 ; CHECK-EABI: __aeabi_uidiv
@@ -40,6 +47,7 @@ entry:
 ; CHECK-LABEL: f3
 ; CHECK-SWDIV: __modsi3
 
+; CHECK-THUMB: .thumb_func
 ; CHECK-HWDIV: sdiv
 ; CHECK-HWDIV: mls
 
@@ -55,6 +63,7 @@ entry:
 ; CHECK-LABEL: f4
 ; CHECK-SWDIV: __umodsi3
 
+; CHECK-THUMB: .thumb_func
 ; CHECK-HWDIV: udiv
 ; CHECK-HWDIV: mls
 
diff --git a/test/CodeGen/ARM/fast-isel-align.ll b/test/CodeGen/ARM/fast-isel-align.ll
index 701884e926a89bccd932d8f6bfd1ce5355f05b0b..71cd73a4a25d10e00c5f09c57b96484ad282b5ef 100644
--- a/test/CodeGen/ARM/fast-isel-align.ll
+++ b/test/CodeGen/ARM/fast-isel-align.ll
@@ -72,10 +72,10 @@ entry:
   %4 = fcmp une float %3, 0.000000e+00
 ; ARM: ldr r[[R:[0-9]+]], [r0, #2]
 ; ARM: vmov s0, r[[R]]
-; ARM: vcmpe.f32 s0, #0
+; ARM: vcmp.f32 s0, #0
 ; THUMB: ldr.w r[[R:[0-9]+]], [r0, #2]
 ; THUMB: vmov s0, r[[R]]
-; THUMB: vcmpe.f32 s0, #0
+; THUMB: vcmp.f32 s0, #0
   ret i1 %4
 }
 
diff --git a/test/CodeGen/ARM/fast-isel-cmp-imm.ll b/test/CodeGen/ARM/fast-isel-cmp-imm.ll
index a9d7e4580638e73e69c8daf336ad36cdf67dd336..543b6c285f3f7c851783a5e0b452808b57b5e3d7 100644
--- a/test/CodeGen/ARM/fast-isel-cmp-imm.ll
+++ b/test/CodeGen/ARM/fast-isel-cmp-imm.ll
@@ -7,8 +7,8 @@ entry:
 ; ARM: t1a
 ; THUMB: t1a
   %cmp = fcmp oeq float %a, 0.000000e+00
-; ARM: vcmpe.f32 s{{[0-9]+}}, #0
-; THUMB: vcmpe.f32 s{{[0-9]+}}, #0
+; ARM: vcmp.f32 s{{[0-9]+}}, #0
+; THUMB: vcmp.f32 s{{[0-9]+}}, #0
   br i1 %cmp, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
@@ -28,9 +28,9 @@ entry:
 ; THUMB: t1b
   %cmp = fcmp oeq float %a, -0.000000e+00
 ; ARM: vldr
-; ARM: vcmpe.f32 s{{[0-9]+}}, s{{[0-9]+}}
+; ARM: vcmp.f32 s{{[0-9]+}}, s{{[0-9]+}}
 ; THUMB: vldr
-; THUMB: vcmpe.f32 s{{[0-9]+}}, s{{[0-9]+}}
+; THUMB: vcmp.f32 s{{[0-9]+}}, s{{[0-9]+}}
   br i1 %cmp, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
@@ -46,8 +46,8 @@ entry:
 ; ARM: t2a
 ; THUMB: t2a
   %cmp = fcmp oeq double %a, 0.000000e+00
-; ARM: vcmpe.f64 d{{[0-9]+}}, #0
-; THUMB: vcmpe.f64 d{{[0-9]+}}, #0
+; ARM: vcmp.f64 d{{[0-9]+}}, #0
+; THUMB: vcmp.f64 d{{[0-9]+}}, #0
   br i1 %cmp, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
@@ -65,9 +65,9 @@ entry:
 ; THUMB: t2b
   %cmp = fcmp oeq double %a, -0.000000e+00
 ; ARM: vldr
-; ARM: vcmpe.f64 d{{[0-9]+}}, d{{[0-9]+}}
+; ARM: vcmp.f64 d{{[0-9]+}}, d{{[0-9]+}}
 ; THUMB: vldr
-; THUMB: vcmpe.f64 d{{[0-9]+}}, d{{[0-9]+}}
+; THUMB: vcmp.f64 d{{[0-9]+}}, d{{[0-9]+}}
   br i1 %cmp, label %if.then, label %if.end
 
 if.then:                                          ; preds = %entry
diff --git a/test/CodeGen/ARM/fold-stack-adjust.ll b/test/CodeGen/ARM/fold-stack-adjust.ll
index 442459bc0582c984245bb58f8dbf15f62420f319..eb32ee54c09594c5059f1853adc64b880bbf8122 100644
--- a/test/CodeGen/ARM/fold-stack-adjust.ll
+++ b/test/CodeGen/ARM/fold-stack-adjust.ll
@@ -135,7 +135,7 @@ define void @test_fold_point(i1 %tst) minsize {
 
   ; Important to check for beginning of basic block, because if it gets
   ; if-converted the test is probably no longer checking what it should.
-; CHECK: {{LBB[0-9]+_2}}:
+; CHECK: %end
 ; CHECK-NEXT: vpop {d7, d8}
 ; CHECK-NEXT: pop {r4, pc}
 
diff --git a/test/CodeGen/ARM/fp-only-sp.ll b/test/CodeGen/ARM/fp-only-sp.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2c7b2acbde9c5a6e97d6b788eaf0ac612bbc71ab
--- /dev/null
+++ b/test/CodeGen/ARM/fp-only-sp.ll
@@ -0,0 +1,62 @@
+; RUN: llc -mtriple=thumbv7em-apple-macho -mcpu=cortex-m4 %s -o - -O0 | FileCheck %s
+; RUN: llc -mtriple=thumbv7em-apple-macho -mcpu=cortex-m4 %s -o - | FileCheck %s
+
+; Note: vldr and vstr really do have 64-bit variants even with fp-only-sp
+define void @test_load_store(double* %addr) {
+; CHECK-LABEL: test_load_store:
+; CHECK: vldr [[TMP:d[0-9]+]], [r0]
+; CHECK: vstr [[TMP]], [r0]
+  %val = load volatile double, double* %addr
+  store volatile double %val, double* %addr
+  ret void
+}
+
+define void @test_cmp(double %l, double %r, i1* %addr.dst) {
+; CHECK-LABEL: test_cmp:
+; CHECK: bl ___eqdf2
+  %res = fcmp oeq double %l, %r
+  store i1 %res, i1* %addr.dst
+  ret void
+}
+
+define void @test_ext(float %in, double* %addr) {
+; CHECK-LABEL: test_ext:
+; CHECK: bl ___extendsfdf2
+  %res = fpext float %in to double
+  store double %res, double* %addr
+  ret void
+}
+
+define void @test_trunc(double %in, float* %addr) {
+; CHECK-LABEL: test_trunc:
+; CHECK: bl ___truncdfsf2
+  %res = fptrunc double %in to float
+  store float %res, float* %addr
+  ret void
+}
+
+define void @test_itofp(i32 %in, double* %addr) {
+; CHECK-LABEL: test_itofp:
+; CHECK: bl ___floatsidf
+  %res = sitofp i32 %in to double
+  store double %res, double* %addr
+; The fp-to-int direction (fptoui) is exercised separately by test_fptoi.
+  ret void
+}
+
+define i32 @test_fptoi(double* %addr) {
+; CHECK-LABEL: test_fptoi:
+; CHECK: bl ___fixunsdfsi
+  %val = load double, double* %addr
+  %res = fptoui double %val to i32
+  ret i32 %res
+}
+
+define void @test_binop(double* %addr) {
+; CHECK-LABEL: test_binop:
+; CHECK: bl ___adddf3
+  %in = load double, double* %addr
+  %res = fadd double %in, %in
+  store double %res, double* %addr
+  ret void
+}
diff --git a/test/CodeGen/ARM/fp16-promote.ll b/test/CodeGen/ARM/fp16-promote.ll
index c9dafa8dffff020ce97589862997cc5826c3561b..9148ac109ae3883f19363dafaa0962d89a635a45 100644
--- a/test/CodeGen/ARM/fp16-promote.ll
+++ b/test/CodeGen/ARM/fp16-promote.ll
@@ -161,14 +161,14 @@ define void @test_select(half* %p, half* %q, i1 zeroext %c) #0 {
   ret void
 }
 
-; Test only two variants of fcmp.  These get translated to f32 vcmpe
+; Test only two variants of fcmp.  These get translated to f32 vcmp
 ; instructions anyway.
 ; CHECK-ALL-LABEL: test_fcmp_une:
 ; CHECK-FP16: vcvtb.f32.f16
 ; CHECK-FP16: vcvtb.f32.f16
 ; CHECK-LIBCALL: bl __aeabi_h2f
 ; CHECK-LIBCALL: bl __aeabi_h2f
-; CHECK-VFP: vcmpe.f32
+; CHECK-VFP: vcmp.f32
 ; CHECK-NOVFP: bl __aeabi_fcmpeq
 ; CHECK-FP16: vmrs APSR_nzcv, fpscr
 ; CHECK-ALL: movw{{ne|eq}}
@@ -184,7 +184,7 @@ define i1 @test_fcmp_une(half* %p, half* %q) #0 {
 ; CHECK-FP16: vcvtb.f32.f16
 ; CHECK-LIBCALL: bl __aeabi_h2f
 ; CHECK-LIBCALL: bl __aeabi_h2f
-; CHECK-VFP: vcmpe.f32
+; CHECK-VFP: vcmp.f32
 ; CHECK-NOVFP: bl __aeabi_fcmpeq
 ; CHECK-FP16: vmrs APSR_nzcv, fpscr
 ; CHECK-LIBCALL: movw{{ne|eq}}
@@ -597,7 +597,7 @@ define void @test_fma(half* %p, half* %q, half* %r) #0 {
 ; CHECK-FP16: vcvtb.f16.f32
 ; CHECK-LIBCALL-LABEL: test_fabs:
 ; CHECK-LIBCALL: bl __aeabi_h2f
-; CHECK-LIBCALL: bfc
+; CHECK-LIBCALL: bic
 ; CHECK-LIBCALL: bl __aeabi_f2h
 define void @test_fabs(half* %p) {
   %a = load half, half* %p, align 2
@@ -687,7 +687,7 @@ define void @test_maxnan(half* %p) #0 {
 ; CHECK-LIBCALL: bl __aeabi_h2f
 ; CHECK-LIBCALL: bl __aeabi_h2f
 ; CHECK-VFP-LIBCALL: vbsl
-; CHECK-NOVFP: bfc
+; CHECK-NOVFP: bic
 ; CHECK-NOVFP: and
 ; CHECK-NOVFP: orr
 ; CHECK-LIBCALL: bl __aeabi_f2h
@@ -847,21 +847,15 @@ define void @test_insertelement(half* %p, <4 x half>* %q, i32 %i) #0 {
 }
 
 ; CHECK-ALL-LABEL: test_extractelement:
+; CHECK-VFP: push {{{.*}}, lr}
 ; CHECK-VFP: sub sp, sp, #8
-; CHECK-VFP: ldrh
-; CHECK-VFP: ldrh
-; CHECK-VFP: orr
-; CHECK-VFP: str
-; CHECK-VFP: ldrh
-; CHECK-VFP: ldrh
-; CHECK-VFP: orr
-; CHECK-VFP: str
+; CHECK-VFP: ldrd
 ; CHECK-VFP: mov
 ; CHECK-VFP: orr
 ; CHECK-VFP: ldrh
 ; CHECK-VFP: strh
 ; CHECK-VFP: add sp, sp, #8
-; CHECK-VFP: bx lr
+; CHECK-VFP: pop {{{.*}}, pc}
 ; CHECK-NOVFP: ldrh
 ; CHECK-NOVFP: strh
 ; CHECK-NOVFP: ldrh
diff --git a/test/CodeGen/ARM/fpcmp-opt.ll b/test/CodeGen/ARM/fpcmp-opt.ll
index 45bb6d2f702d0f7c6da4414cb191e2cfd1490ed7..a8285410945071a9fe668d74056089fbe6cbbb5b 100644
--- a/test/CodeGen/ARM/fpcmp-opt.ll
+++ b/test/CodeGen/ARM/fpcmp-opt.ll
@@ -10,7 +10,7 @@ entry:
 ; CHECK-LABEL: t1:
 ; CHECK: vldr [[S0:s[0-9]+]],
 ; CHECK: vldr [[S1:s[0-9]+]],
-; CHECK: vcmpe.f32 [[S1]], [[S0]]
+; CHECK: vcmp.f32 [[S1]], [[S0]]
 ; CHECK: vmrs APSR_nzcv, fpscr
 ; CHECK: beq
   %0 = load float, float* %a
@@ -35,10 +35,10 @@ entry:
 ; CHECK-NOT: vldr
 ; CHECK: ldrd [[REG1:(r[0-9]+)]], [[REG2:(r[0-9]+)]], [r0]
 ; CHECK-NOT: b LBB
-; CHECK: bfc [[REG2]], #31, #1
+; CHECK: bic [[REG2]], [[REG2]], #-2147483648
 ; CHECK: cmp [[REG1]], #0
 ; CHECK: cmpeq [[REG2]], #0
-; CHECK-NOT: vcmpe.f32
+; CHECK-NOT: vcmp.f32
 ; CHECK-NOT: vmrs
 ; CHECK: bne
   %0 = load double, double* %a
@@ -61,7 +61,7 @@ entry:
 ; CHECK: ldr [[REG3:(r[0-9]+)]], [r0]
 ; CHECK: mvn [[REG4:(r[0-9]+)]], #-2147483648
 ; CHECK: tst [[REG3]], [[REG4]]
-; CHECK-NOT: vcmpe.f32
+; CHECK-NOT: vcmp.f32
 ; CHECK-NOT: vmrs
 ; CHECK: bne
   %0 = load float, float* %a
diff --git a/test/CodeGen/ARM/fpcmp.ll b/test/CodeGen/ARM/fpcmp.ll
index e3ffd45a396d868399b28b301f016df0e247d4a7..67326e0001697d81c0e46ea77f714216dd937a52 100644
--- a/test/CodeGen/ARM/fpcmp.ll
+++ b/test/CodeGen/ARM/fpcmp.ll
@@ -12,7 +12,7 @@ entry:
 
 define i32 @f2(float %a) {
 ;CHECK-LABEL: f2:
-;CHECK: vcmpe.f32
+;CHECK: vcmp.f32
 ;CHECK: moveq
 entry:
         %tmp = fcmp oeq float %a, 1.000000e+00          ; <i1> [#uses=1]
@@ -52,7 +52,7 @@ entry:
 
 define i32 @f6(float %a) {
 ;CHECK-LABEL: f6:
-;CHECK: vcmpe.f32
+;CHECK: vcmp.f32
 ;CHECK: movne
 entry:
         %tmp = fcmp une float %a, 1.000000e+00          ; <i1> [#uses=1]
diff --git a/test/CodeGen/ARM/fpcmp_ueq.ll b/test/CodeGen/ARM/fpcmp_ueq.ll
index c1696c9be1b7c4753ffe3bbb8ade02aa9304def5..698c7506cc5936864b5bc3a1f770d66910f08c1c 100644
--- a/test/CodeGen/ARM/fpcmp_ueq.ll
+++ b/test/CodeGen/ARM/fpcmp_ueq.ll
@@ -17,7 +17,7 @@ entry:
 ; CHECK-ARMv4: moveq r0, #42
 
 ; CHECK-ARMv7-LABEL: f7:
-; CHECK-ARMv7: vcmpe.f32
+; CHECK-ARMv7: vcmp.f32
 ; CHECK-ARMv7: vmrs APSR_nzcv, fpscr
 ; CHECK-ARMv7: movweq
 ; CHECK-ARMv7-NOT: vmrs
diff --git a/test/CodeGen/ARM/fpscr-intrinsics.ll b/test/CodeGen/ARM/fpscr-intrinsics.ll
new file mode 100644
index 0000000000000000000000000000000000000000..64b97525febfe58494905dcc1431232e88a61fb4
--- /dev/null
+++ b/test/CodeGen/ARM/fpscr-intrinsics.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -O0 -mtriple=armv7-eabi -mcpu=cortex-a8 -mattr=+neon,+fp-armv8 | FileCheck %s
+; RUN: llc < %s -O3 -mtriple=armv7-eabi -mcpu=cortex-a8 -mattr=+neon,+fp-armv8 | FileCheck %s
+
+@a = common global double 0.000000e+00, align 8
+
+; llvm.flt.rounds is expected to be lowered to a read of FPSCR (vmrs).
+define void @strtod() {
+entry:
+  ; CHECK: vmrs r{{[0-9]+}}, fpscr
+  %0 = call i32 @llvm.flt.rounds()
+  %tobool = icmp ne i32 %0, 0
+  br i1 %tobool, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  store double 5.000000e-01, double* @a, align 8
+  br label %if.end
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void
+}
+
+; llvm.arm.get.fpscr / llvm.arm.set.fpscr are expected to lower to vmrs / vmsr.
+define void @fn1(i32* nocapture %p) local_unnamed_addr {
+entry:
+  ; CHECK: vmrs r{{[0-9]+}}, fpscr
+  %0 = tail call i32 @llvm.arm.get.fpscr()
+  store i32 %0, i32* %p, align 4
+  ; CHECK: vmsr fpscr, r{{[0-9]+}}
+  tail call void @llvm.arm.set.fpscr(i32 1)
+  ; CHECK: vmrs r{{[0-9]+}}, fpscr
+  %1 = tail call i32 @llvm.arm.get.fpscr()
+  %arrayidx1 = getelementptr inbounds i32, i32* %p, i32 1
+  store i32 %1, i32* %arrayidx1, align 4
+  ret void
+}
+
+; Function Attrs: nounwind readonly
+declare i32 @llvm.arm.get.fpscr()
+
+; Function Attrs: nounwind writeonly
+declare void @llvm.arm.set.fpscr(i32)
+
+; Function Attrs: nounwind
+declare i32 @llvm.flt.rounds()
diff --git a/test/CodeGen/ARM/gpr-paired-spill.ll b/test/CodeGen/ARM/gpr-paired-spill.ll
index ef3e5a54a2dbed0c708e85f04dcd7f622c964a55..797b147d5d016cf1e2096bb4e51b017f41f58af5 100644
--- a/test/CodeGen/ARM/gpr-paired-spill.ll
+++ b/test/CodeGen/ARM/gpr-paired-spill.ll
@@ -16,22 +16,22 @@ define void @foo(i64* %addr) {
   ; an LDMIA was created with both a FrameIndex and an offset, which
   ; is not allowed.
 
-; CHECK-WITH-LDRD: strd {{r[0-9]+}}, {{r[0-9]+}}, [sp, #8]
-; CHECK-WITH-LDRD: strd {{r[0-9]+}}, {{r[0-9]+}}, [sp]
+; CHECK-WITH-LDRD-DAG: strd {{r[0-9]+}}, {{r[0-9]+}}, [sp, #8]
+; CHECK-WITH-LDRD-DAG: strd {{r[0-9]+}}, {{r[0-9]+}}, [sp]
 
-; CHECK-WITH-LDRD: ldrd {{r[0-9]+}}, {{r[0-9]+}}, [sp, #8]
-; CHECK-WITH-LDRD: ldrd {{r[0-9]+}}, {{r[0-9]+}}, [sp]
+; CHECK-WITH-LDRD-DAG: ldrd {{r[0-9]+}}, {{r[0-9]+}}, [sp, #8]
+; CHECK-WITH-LDRD-DAG: ldrd {{r[0-9]+}}, {{r[0-9]+}}, [sp]
 
   ; We also want to ensure the register scavenger is working (i.e. an
   ; offset from sp can be generated), so we need two spills.
-; CHECK-WITHOUT-LDRD: add [[ADDRREG:[a-z0-9]+]], sp, #{{[0-9]+}}
-; CHECK-WITHOUT-LDRD: stm [[ADDRREG]], {r{{[0-9]+}}, r{{[0-9]+}}}
-; CHECK-WITHOUT-LDRD: stm sp, {r{{[0-9]+}}, r{{[0-9]+}}}
+; CHECK-WITHOUT-LDRD-DAG: add [[ADDRREG:[a-z0-9]+]], sp, #{{[0-9]+}}
+; CHECK-WITHOUT-LDRD-DAG: stm [[ADDRREG]], {r{{[0-9]+}}, r{{[0-9]+}}}
+; CHECK-WITHOUT-LDRD-DAG: stm sp, {r{{[0-9]+}}, r{{[0-9]+}}}
 
   ; In principle LLVM may have to recalculate the offset. At the moment
   ; it reuses the original though.
-; CHECK-WITHOUT-LDRD: ldm [[ADDRREG]], {r{{[0-9]+}}, r{{[0-9]+}}}
-; CHECK-WITHOUT-LDRD: ldm sp, {r{{[0-9]+}}, r{{[0-9]+}}}
+; CHECK-WITHOUT-LDRD-DAG: ldm [[ADDRREG]], {r{{[0-9]+}}, r{{[0-9]+}}}
+; CHECK-WITHOUT-LDRD-DAG: ldm sp, {r{{[0-9]+}}, r{{[0-9]+}}}
 
   store volatile i64 %val1, i64* %addr
   store volatile i64 %val2, i64* %addr
diff --git a/test/CodeGen/ARM/ifcvt10.ll b/test/CodeGen/ARM/ifcvt10.ll
index 5725a404c3201ef69bd55553b0ac88878541d1c9..c7e18d35dbee13639220d85b9de698d43e94e263 100644
--- a/test/CodeGen/ARM/ifcvt10.ll
+++ b/test/CodeGen/ARM/ifcvt10.ll
@@ -9,8 +9,6 @@ entry:
 ; CHECK-LABEL: t:
 ; CHECK: vpop {d8}
 ; CHECK-NOT: vpopne
-; CHECK: pop {r7, pc}
-; CHECK: vpop {d8}
 ; CHECK: pop {r7, pc}
   br i1 undef, label %if.else, label %if.then
 
diff --git a/test/CodeGen/ARM/illegal-bitfield-loadstore.ll b/test/CodeGen/ARM/illegal-bitfield-loadstore.ll
new file mode 100644
index 0000000000000000000000000000000000000000..74117d3896bdcda5bc75afd1c54157258767debe
--- /dev/null
+++ b/test/CodeGen/ARM/illegal-bitfield-loadstore.ll
@@ -0,0 +1,184 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=arm-eabi | FileCheck %s -check-prefix=LE
+; RUN: llc < %s -mtriple=armeb-eabi | FileCheck %s -check-prefix=BE
+
+define void @i24_or(i24* %a) {
+; LE-LABEL: i24_or:
+; LE:       @ BB#0:
+; LE-NEXT:    ldrh r1, [r0]
+; LE-NEXT:    orr r1, r1, #384
+; LE-NEXT:    strh r1, [r0]
+; LE-NEXT:    mov pc, lr
+;
+; BE-LABEL: i24_or:
+; BE:       @ BB#0:
+; BE-NEXT:    ldrh r1, [r0]
+; BE-NEXT:    ldrb r2, [r0, #2]
+; BE-NEXT:    orr r1, r2, r1, lsl #8
+; BE-NEXT:    orr r1, r1, #384
+; BE-NEXT:    strb r1, [r0, #2]
+; BE-NEXT:    lsr r1, r1, #8
+; BE-NEXT:    strh r1, [r0]
+; BE-NEXT:    mov pc, lr
+  %aa = load i24, i24* %a, align 1
+  %b = or i24 %aa, 384
+  store i24 %b, i24* %a, align 1
+  ret void
+}
+
+define void @i24_and_or(i24* %a) {
+; LE-LABEL: i24_and_or:
+; LE:       @ BB#0:
+; LE-NEXT:    ldrh r1, [r0]
+; LE-NEXT:    mov r2, #16256
+; LE-NEXT:    orr r2, r2, #49152
+; LE-NEXT:    orr r1, r1, #384
+; LE-NEXT:    and r1, r1, r2
+; LE-NEXT:    strh r1, [r0]
+; LE-NEXT:    mov pc, lr
+;
+; BE-LABEL: i24_and_or:
+; BE:       @ BB#0:
+; BE-NEXT:    mov r1, #128
+; BE-NEXT:    strb r1, [r0, #2]
+; BE-NEXT:    ldrh r1, [r0]
+; BE-NEXT:    orr r1, r1, #1
+; BE-NEXT:    strh r1, [r0]
+; BE-NEXT:    mov pc, lr
+  %b = load i24, i24* %a, align 1
+  %c = and i24 %b, -128
+  %d = or i24 %c, 384
+  store i24 %d, i24* %a, align 1
+  ret void
+}
+
+define void @i24_insert_bit(i24* %a, i1 zeroext %bit) {
+; LE-LABEL: i24_insert_bit:
+; LE:       @ BB#0:
+; LE-NEXT:    ldrh r2, [r0]
+; LE-NEXT:    mov r3, #255
+; LE-NEXT:    orr r3, r3, #57088
+; LE-NEXT:    and r2, r2, r3
+; LE-NEXT:    orr r1, r2, r1, lsl #13
+; LE-NEXT:    strh r1, [r0]
+; LE-NEXT:    mov pc, lr
+;
+; BE-LABEL: i24_insert_bit:
+; BE:       @ BB#0:
+; BE-NEXT:    ldrh r2, [r0]
+; BE-NEXT:    mov r3, #57088
+; BE-NEXT:    orr r3, r3, #16711680
+; BE-NEXT:    and r2, r3, r2, lsl #8
+; BE-NEXT:    orr r1, r2, r1, lsl #13
+; BE-NEXT:    lsr r1, r1, #8
+; BE-NEXT:    strh r1, [r0]
+; BE-NEXT:    mov pc, lr
+  %extbit = zext i1 %bit to i24
+  %b = load i24, i24* %a, align 1
+  %extbit.shl = shl nuw nsw i24 %extbit, 13
+  %c = and i24 %b, -8193
+  %d = or i24 %c, %extbit.shl
+  store i24 %d, i24* %a, align 1
+  ret void
+}
+
+define void @i56_or(i56* %a) {
+; LE-LABEL: i56_or:
+; LE:       @ BB#0:
+; LE-NEXT:    ldr r1, [r0]
+; LE-NEXT:    orr r1, r1, #384
+; LE-NEXT:    str r1, [r0]
+; LE-NEXT:    mov pc, lr
+;
+; BE-LABEL: i56_or:
+; BE:       @ BB#0:
+; BE-NEXT:    mov r1, r0
+; BE-NEXT:    ldr r12, [r0]
+; BE-NEXT:    ldrh r2, [r1, #4]!
+; BE-NEXT:    ldrb r3, [r1, #2]
+; BE-NEXT:    orr r2, r3, r2, lsl #8
+; BE-NEXT:    orr r2, r2, r12, lsl #24
+; BE-NEXT:    orr r2, r2, #384
+; BE-NEXT:    lsr r3, r2, #8
+; BE-NEXT:    strb r2, [r1, #2]
+; BE-NEXT:    strh r3, [r1]
+; BE-NEXT:    bic r1, r12, #255
+; BE-NEXT:    orr r1, r1, r2, lsr #24
+; BE-NEXT:    str r1, [r0]
+; BE-NEXT:    mov pc, lr
+  %aa = load i56, i56* %a
+  %b = or i56 %aa, 384
+  store i56 %b, i56* %a
+  ret void
+}
+
+define void @i56_and_or(i56* %a) {
+; LE-LABEL: i56_and_or:
+; LE:       @ BB#0:
+; LE-NEXT:    ldr r1, [r0]
+; LE-NEXT:    orr r1, r1, #384
+; LE-NEXT:    bic r1, r1, #127
+; LE-NEXT:    str r1, [r0]
+; LE-NEXT:    mov pc, lr
+;
+; BE-LABEL: i56_and_or:
+; BE:       @ BB#0:
+; BE-NEXT:    mov r1, r0
+; BE-NEXT:    mov r3, #128
+; BE-NEXT:    ldrh r2, [r1, #4]!
+; BE-NEXT:    strb r3, [r1, #2]
+; BE-NEXT:    lsl r2, r2, #8
+; BE-NEXT:    ldr r12, [r0]
+; BE-NEXT:    orr r2, r2, r12, lsl #24
+; BE-NEXT:    orr r2, r2, #384
+; BE-NEXT:    lsr r3, r2, #8
+; BE-NEXT:    strh r3, [r1]
+; BE-NEXT:    bic r1, r12, #255
+; BE-NEXT:    orr r1, r1, r2, lsr #24
+; BE-NEXT:    str r1, [r0]
+; BE-NEXT:    mov pc, lr
+
+  %b = load i56, i56* %a, align 1
+  %c = and i56 %b, -128
+  %d = or i56 %c, 384
+  store i56 %d, i56* %a, align 1
+  ret void
+}
+
+define void @i56_insert_bit(i56* %a, i1 zeroext %bit) {
+; LE-LABEL: i56_insert_bit:
+; LE:       @ BB#0:
+; LE-NEXT:    ldr r2, [r0]
+; LE-NEXT:    bic r2, r2, #8192
+; LE-NEXT:    orr r1, r2, r1, lsl #13
+; LE-NEXT:    str r1, [r0]
+; LE-NEXT:    mov pc, lr
+;
+; BE-LABEL: i56_insert_bit:
+; BE:       @ BB#0:
+; BE-NEXT:    .save {r11, lr}
+; BE-NEXT:    push {r11, lr}
+; BE-NEXT:    mov r2, r0
+; BE-NEXT:    ldr lr, [r0]
+; BE-NEXT:    ldrh r12, [r2, #4]!
+; BE-NEXT:    ldrb r3, [r2, #2]
+; BE-NEXT:    orr r12, r3, r12, lsl #8
+; BE-NEXT:    orr r3, r12, lr, lsl #24
+; BE-NEXT:    bic r3, r3, #8192
+; BE-NEXT:    orr r1, r3, r1, lsl #13
+; BE-NEXT:    lsr r3, r1, #8
+; BE-NEXT:    strh r3, [r2]
+; BE-NEXT:    bic r2, lr, #255
+; BE-NEXT:    orr r1, r2, r1, lsr #24
+; BE-NEXT:    str r1, [r0]
+; BE-NEXT:    pop {r11, lr}
+; BE-NEXT:    mov pc, lr
+  %extbit = zext i1 %bit to i56
+  %b = load i56, i56* %a, align 1
+  %extbit.shl = shl nuw nsw i56 %extbit, 13
+  %c = and i56 %b, -8193
+  %d = or i56 %c, %extbit.shl
+  store i56 %d, i56* %a, align 1
+  ret void
+}
+
diff --git a/test/CodeGen/ARM/indirectbr.ll b/test/CodeGen/ARM/indirectbr.ll
index d15ef14b44932723942184fe9217bf18a0c430ee..90defad43a7d88a222b9462002e2691995d82dd6 100644
--- a/test/CodeGen/ARM/indirectbr.ll
+++ b/test/CodeGen/ARM/indirectbr.ll
@@ -47,6 +47,7 @@ L3:                                               ; preds = %L4, %bb2
   br label %L2
 
 L2:                                               ; preds = %L3, %bb2
+; THUMB-LABEL: %L1.clone
 ; THUMB: muls
   %res.2 = phi i32 [ %res.1, %L3 ], [ 1, %bb2 ]   ; <i32> [#uses=1]
   %phitmp = mul i32 %res.2, 6                     ; <i32> [#uses=1]
diff --git a/test/CodeGen/ARM/interval-update-remat.ll b/test/CodeGen/ARM/interval-update-remat.ll
index 6391d4c29604f439f66ddf42c57645950dffbf09..524e8a0aa491a0592050212e761c07439224dcea 100644
--- a/test/CodeGen/ARM/interval-update-remat.ll
+++ b/test/CodeGen/ARM/interval-update-remat.ll
@@ -109,7 +109,7 @@ _ZN7MessageD1Ev.exit:                             ; preds = %if.then.i.i.i.i, %i
 }
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0
 
 declare %class.StaticSocketDataProvider.6.231.281.1306.2331* @_ZN24StaticSocketDataProviderC1EP13MockReadWritejS1_j(%class.StaticSocketDataProvider.6.231.281.1306.2331* returned, %struct.MockReadWrite.7.232.282.1307.2332*, i32, %struct.MockReadWrite.7.232.282.1307.2332*, i32) unnamed_addr
 
@@ -130,7 +130,7 @@ declare %class.Message.13.238.288.1313.2338* @_ZN7MessageC1Ev(%class.Message.13.
 declare %class.AssertHelper.10.235.285.1310.2335* @_ZN12AssertHelperD1Ev(%class.AssertHelper.10.235.285.1310.2335* returned) unnamed_addr
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
 
 declare void @_ZN18ClientSocketHandle5m_fn3IPiEEvRK25Trans_NS___1_basic_stringIciiE13scoped_refptr15RequestPriorityN16ClientSocketPool13RespectLimitsERiT_11BoundNetLog(%class.ClientSocketHandle.14.239.289.1314.2339*, %class.Trans_NS___1_basic_string.18.243.293.1318.2343* dereferenceable(12), %class.scoped_refptr.19.244.294.1319.2344*, i32, i32, i32* dereferenceable(4), i32*, %class.BoundNetLog.20.245.295.1320.2345*)
 
diff --git a/test/CodeGen/ARM/intrinsics-coprocessor.ll b/test/CodeGen/ARM/intrinsics-coprocessor.ll
index 8fea49b39fb6074f4334713f96306f283c4a3bd7..5352471238f9c9e59fa5903751a4a891f58d280c 100644
--- a/test/CodeGen/ARM/intrinsics-coprocessor.ll
+++ b/test/CodeGen/ARM/intrinsics-coprocessor.ll
@@ -1,5 +1,4 @@
 ; RUN: llc < %s -mtriple=armv7-eabi -mcpu=cortex-a8 | FileCheck %s
-; RUN: llc < %s -march=thumb -mtriple=thumbv7-eabi -mcpu=cortex-a8 | FileCheck %s
 
 define void @coproc(i8* %i) nounwind {
 entry:
diff --git a/test/CodeGen/ARM/ldm-stm-i256.ll b/test/CodeGen/ARM/ldm-stm-i256.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7b4151dabf6dd2970f9333018c8b529b1e829a9f
--- /dev/null
+++ b/test/CodeGen/ARM/ldm-stm-i256.ll
@@ -0,0 +1,38 @@
+; RUN: llc -mtriple=armv7--eabi -verify-machineinstrs < %s | FileCheck %s
+
+; Check the way we schedule/merge a bunch of loads and stores.
+; Originally test/CodeGen/ARM/2011-07-07-ScheduleDAGCrash.ll; now
+; being used as a test of optimizations related to ldm/stm.
+
+; FIXME: We could merge more loads/stores with regalloc hints.
+; FIXME: Fix scheduling so we don't have 16 live registers.
+
+define void @f(i256* nocapture %a, i256* nocapture %b, i256* nocapture %cc, i256* nocapture %dd) nounwind uwtable noinline ssp {
+entry:
+  %c = load i256, i256* %cc
+  %d = load i256, i256* %dd
+  %add = add nsw i256 %c, %d
+  store i256 %add, i256* %a, align 8
+  %or = or i256 %c, 1606938044258990275541962092341162602522202993782792835301376
+  %add6 = add nsw i256 %or, %d
+  store i256 %add6, i256* %b, align 8
+  ret void
+  ; CHECK-DAG: ldm r3
+  ; CHECK-DAG: ldm r2
+  ; CHECK-DAG: ldr {{.*}}, [r3, #20]
+  ; CHECK-DAG: ldr {{.*}}, [r3, #16]
+  ; CHECK-DAG: ldr {{.*}}, [r3, #28]
+  ; CHECK-DAG: ldr {{.*}}, [r3, #24]
+  ; CHECK-DAG: ldr {{.*}}, [r2, #20]
+  ; CHECK-DAG: ldr {{.*}}, [r2, #16]
+  ; CHECK-DAG: ldr {{.*}}, [r2, #28]
+  ; CHECK-DAG: ldr {{.*}}, [r2, #24]
+  ; CHECK-DAG: stmib r0
+  ; CHECK-DAG: str {{.*}}, [r0]
+  ; CHECK-DAG: str {{.*}}, [r0, #24]
+  ; CHECK-DAG: str {{.*}}, [r0, #28]
+  ; CHECK-DAG: str {{.*}}, [r1]
+  ; CHECK-DAG: stmib r1
+  ; CHECK-DAG: str {{.*}}, [r1, #24]
+  ; CHECK-DAG: str {{.*}}, [r1, #28]
+}
diff --git a/test/CodeGen/ARM/ldrd.ll b/test/CodeGen/ARM/ldrd.ll
index 6a9e63f649c93255d2d0e74aa667b932f5211aac..6981cfcb08550c87d2f05d1df6984f0d45057a8a 100644
--- a/test/CodeGen/ARM/ldrd.ll
+++ b/test/CodeGen/ARM/ldrd.ll
@@ -80,7 +80,7 @@ return:                                           ; preds = %bb, %entry
 
 ; CHECK-LABEL: Func1:
 define void @Func1() nounwind ssp "no-frame-pointer-elim"="true" {
-entry: 
+entry:
 ; A8: movw [[BASE:r[0-9]+]], :lower16:{{.*}}TestVar{{.*}}
 ; A8: movt [[BASE]], :upper16:{{.*}}TestVar{{.*}}
 ; A8: ldrd [[FIELD1:r[0-9]+]], [[FIELD2:r[0-9]+]], {{\[}}[[BASE]], #4]
@@ -88,12 +88,12 @@ entry:
 ; A8-NEXT: str [[FIELD1]], {{\[}}[[BASE]]{{\]}}
 ; CONSERVATIVE-NOT: ldrd
   %orig_blocks = alloca [256 x i16], align 2
-  %0 = bitcast [256 x i16]* %orig_blocks to i8*call void @llvm.lifetime.start(i64 512, i8* %0) nounwind
+  %0 = bitcast [256 x i16]* %orig_blocks to i8*call void @llvm.lifetime.start.p0i8(i64 512, i8* %0) nounwind
   %tmp1 = load i32, i32* getelementptr inbounds (%struct.Test, %struct.Test* @TestVar, i32 0, i32 1), align 4
   %tmp2 = load i32, i32* getelementptr inbounds (%struct.Test, %struct.Test* @TestVar, i32 0, i32 2), align 4
   %add = add nsw i32 %tmp2, %tmp1
   store i32 %add, i32* getelementptr inbounds (%struct.Test, %struct.Test* @TestVar, i32 0, i32 0), align 4
-  call void @llvm.lifetime.end(i64 512, i8* %0) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 512, i8* %0) nounwind
   ret void
 }
 
@@ -189,5 +189,23 @@ define i32* @strd_postupdate_inc(i32* %p0, i32 %v0, i32 %v1) "no-frame-pointer-e
   ret i32* %p1
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+; CHECK-LABEL: ldrd_strd_aa:
+; NORMAL: ldrd [[TMP1:r[0-9]]], [[TMP2:r[0-9]]],
+; NORMAL: strd [[TMP1]], [[TMP2]],
+; CONSERVATIVE-NOT: ldrd
+; CONSERVATIVE-NOT: strd
+; CHECK: bx lr
+
+define void @ldrd_strd_aa(i32* noalias nocapture %x, i32* noalias nocapture readonly %y) {
+entry:
+  %0 = load i32, i32* %y, align 4
+  store i32 %0, i32* %x, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %y, i32 1
+  %1 = load i32, i32* %arrayidx2, align 4
+  %arrayidx3 = getelementptr inbounds i32, i32* %x, i32 1
+  store i32 %1, i32* %arrayidx3, align 4
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
diff --git a/test/CodeGen/ARM/load-combine-big-endian.ll b/test/CodeGen/ARM/load-combine-big-endian.ll
index 6f0e008a6557938983870ed77d39c32bd9f1117c..8d8a0136cf962222ee9b833048868cee10b3080d 100644
--- a/test/CodeGen/ARM/load-combine-big-endian.ll
+++ b/test/CodeGen/ARM/load-combine-big-endian.ll
@@ -269,3 +269,511 @@ define i64 @load_i64_by_i8(i64* %arg) {
   %tmp37 = or i64 %tmp33, %tmp36
   ret i64 %tmp37
 }
+
+; i8* p; // p[1] is 4 byte aligned
+; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
+define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
+; CHECK: ldr r0, [r0, #1]
+; CHECK-NEXT: mov r1, #65280
+; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r1, r1, r0, lsr #8
+; CHECK-NEXT: and r2, r2, r0, lsl #8
+; CHECK-NEXT: orr r1, r1, r0, lsr #24
+; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mov pc, lr
+
+; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset:
+; CHECK-ARMv6: ldr r0, [r0, #1]
+; CHECK-ARMv6-NEXT: rev r0, r0
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 4
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[-4] is 4 byte aligned
+; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24)
+define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_neg_offset:
+; CHECK: ldr r0, [r0, #-4]
+; CHECK-NEXT: mov r1, #65280
+; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r1, r1, r0, lsr #8
+; CHECK-NEXT: and r2, r2, r0, lsl #8
+; CHECK-NEXT: orr r1, r1, r0, lsr #24
+; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mov pc, lr
+
+; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset:
+; CHECK-ARMv6: ldr r0, [r0, #-4]
+; CHECK-ARMv6-NEXT: rev r0, r0
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp2 = load i8, i8* %tmp1, align 4
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[1] is 4 byte aligned
+; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24)
+define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap:
+; CHECK: ldr r0, [r0, #1]
+; CHECK-NEXT: mov pc, lr
+
+; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap:
+; CHECK-ARMv6: ldr r0, [r0, #1]
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp15 = load i8, i8* %tmp14, align 4
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[-4] is 4 byte aligned
+; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24)
+define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap:
+; CHECK: ldr r0, [r0, #-4]
+; CHECK-NEXT: mov pc, lr
+
+; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap:
+; CHECK-ARMv6: ldr r0, [r0, #-4]
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp15 = load i8, i8* %tmp14, align 4
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+declare i16 @llvm.bswap.i16(i16)
+
+; i16* p; // p is 4 byte aligned
+; (i32) bswap(p[0]) | ((i32) bswap(p[1]) << 16)
+define i32 @load_i32_by_bswap_i16(i32* %arg) {
+; CHECK-LABEL: load_i32_by_bswap_i16:
+; CHECK: ldr r0, [r0]
+; CHECK-NEXT: mov r1, #65280
+; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r1, r1, r0, lsr #8
+; CHECK-NEXT: and r2, r2, r0, lsl #8
+; CHECK-NEXT: orr r1, r1, r0, lsr #24
+; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mov pc, lr
+
+; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16:
+; CHECK-ARMv6: ldr  r0, [r0]
+; CHECK-ARMv6-NEXT: rev r0, r0
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i16*
+  %tmp1 = load i16, i16* %tmp, align 4
+  %tmp11 = call i16 @llvm.bswap.i16(i16 %tmp1)
+  %tmp2 = zext i16 %tmp11 to i32
+  %tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1
+  %tmp4 = load i16, i16* %tmp3, align 1
+  %tmp41 = call i16 @llvm.bswap.i16(i16 %tmp4)
+  %tmp5 = zext i16 %tmp41 to i32
+  %tmp6 = shl nuw nsw i32 %tmp5, 16
+  %tmp7 = or i32 %tmp6, %tmp2
+  ret i32 %tmp7
+}
+
+; i16* p; // p is 4 byte aligned
+; (i32) p[1] | ((sext p[0] to i32) << 16)
+define i32 @load_i32_by_sext_i16(i32* %arg) {
+; CHECK-LABEL: load_i32_by_sext_i16:
+; CHECK: ldr  r0, [r0]
+; CHECK-NEXT: mov pc, lr
+;
+; CHECK-ARMv6-LABEL: load_i32_by_sext_i16:
+; CHECK-ARMv6: ldr r0, [r0]
+; CHECK-ARMv6-NEXT: bx  lr
+  %tmp = bitcast i32* %arg to i16*
+  %tmp1 = load i16, i16* %tmp, align 4
+  %tmp2 = sext i16 %tmp1 to i32
+  %tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1
+  %tmp4 = load i16, i16* %tmp3, align 1
+  %tmp5 = zext i16 %tmp4 to i32
+  %tmp6 = shl nuw nsw i32 %tmp2, 16
+  %tmp7 = or i32 %tmp6, %tmp5
+  ret i32 %tmp7
+}
+
+; i8* arg; i32 i;
+; p = arg + 12;
+; (i32) p[i] | ((i32) p[i + 1] << 8) | ((i32) p[i + 2] << 16) | ((i32) p[i + 3] << 24)
+define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) {
+; CHECK-LABEL: load_i32_by_i8_base_offset_index:
+; CHECK: add r0, r0, r1
+; CHECK-NEXT: mov r1, #65280
+; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: ldr r0, [r0, #12]
+; CHECK-NEXT: and r1, r1, r0, lsr #8
+; CHECK-NEXT: and r2, r2, r0, lsl #8
+; CHECK-NEXT: orr r1, r1, r0, lsr #24
+; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mov pc, lr
+;
+; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index:
+; CHECK-ARMv6: add r0, r0, r1
+; CHECK-ARMv6-NEXT: ldr r0, [r0, #12]
+; CHECK-ARMv6-NEXT: rev r0, r0
+; CHECK-ARMv6-NEXT: bx  lr
+  %tmp = add nuw nsw i32 %i, 3
+  %tmp2 = add nuw nsw i32 %i, 2
+  %tmp3 = add nuw nsw i32 %i, 1
+  %tmp4 = getelementptr inbounds i8, i8* %arg, i64 12
+  %tmp5 = zext i32 %i to i64
+  %tmp6 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp5
+  %tmp7 = load i8, i8* %tmp6, align 4
+  %tmp8 = zext i8 %tmp7 to i32
+  %tmp9 = zext i32 %tmp3 to i64
+  %tmp10 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp9
+  %tmp11 = load i8, i8* %tmp10, align 1
+  %tmp12 = zext i8 %tmp11 to i32
+  %tmp13 = shl nuw nsw i32 %tmp12, 8
+  %tmp14 = or i32 %tmp13, %tmp8
+  %tmp15 = zext i32 %tmp2 to i64
+  %tmp16 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp15
+  %tmp17 = load i8, i8* %tmp16, align 1
+  %tmp18 = zext i8 %tmp17 to i32
+  %tmp19 = shl nuw nsw i32 %tmp18, 16
+  %tmp20 = or i32 %tmp14, %tmp19
+  %tmp21 = zext i32 %tmp to i64
+  %tmp22 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp21
+  %tmp23 = load i8, i8* %tmp22, align 1
+  %tmp24 = zext i8 %tmp23 to i32
+  %tmp25 = shl nuw i32 %tmp24, 24
+  %tmp26 = or i32 %tmp20, %tmp25
+  ret i32 %tmp26
+}
+
+; i8* arg; i32 i;
+; p = arg + 12;
+; (i32) p[i + 1] | ((i32) p[i + 2] << 8) | ((i32) p[i + 3] << 16) | ((i32) p[i + 4] << 24)
+define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) {
+; CHECK-LABEL: load_i32_by_i8_base_offset_index_2:
+; CHECK: add r0, r0, r1
+; CHECK-NEXT: mov r1, #65280
+; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: ldr r0, [r0, #13]
+; CHECK-NEXT: and r1, r1, r0, lsr #8
+; CHECK-NEXT: and r2, r2, r0, lsl #8
+; CHECK-NEXT: orr r1, r1, r0, lsr #24
+; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mov pc, lr
+;
+; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2:
+; CHECK-ARMv6: add r0, r0, r1
+; CHECK-ARMv6-NEXT: ldr r0, [r0, #13]
+; CHECK-ARMv6-NEXT: rev r0, r0
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = add nuw nsw i32 %i, 4
+  %tmp2 = add nuw nsw i32 %i, 3
+  %tmp3 = add nuw nsw i32 %i, 2
+  %tmp4 = getelementptr inbounds i8, i8* %arg, i64 12
+  %tmp5 = add nuw nsw i32 %i, 1
+  %tmp27 = zext i32 %tmp5 to i64
+  %tmp28 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp27
+  %tmp29 = load i8, i8* %tmp28, align 4
+  %tmp30 = zext i8 %tmp29 to i32
+  %tmp31 = zext i32 %tmp3 to i64
+  %tmp32 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp31
+  %tmp33 = load i8, i8* %tmp32, align 1
+  %tmp34 = zext i8 %tmp33 to i32
+  %tmp35 = shl nuw nsw i32 %tmp34, 8
+  %tmp36 = or i32 %tmp35, %tmp30
+  %tmp37 = zext i32 %tmp2 to i64
+  %tmp38 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp37
+  %tmp39 = load i8, i8* %tmp38, align 1
+  %tmp40 = zext i8 %tmp39 to i32
+  %tmp41 = shl nuw nsw i32 %tmp40, 16
+  %tmp42 = or i32 %tmp36, %tmp41
+  %tmp43 = zext i32 %tmp to i64
+  %tmp44 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp43
+  %tmp45 = load i8, i8* %tmp44, align 1
+  %tmp46 = zext i8 %tmp45 to i32
+  %tmp47 = shl nuw i32 %tmp46, 24
+  %tmp48 = or i32 %tmp42, %tmp47
+  ret i32 %tmp48
+}
+
+; i8* p; // p is 2 byte aligned
+; (i32) p[0] | ((i32) p[1] << 8)
+define i32 @zext_load_i32_by_i8(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8:
+; CHECK: ldrb  r1, [r0]
+; CHECK-NEXT: ldrb  r0, [r0, #1]
+; CHECK-NEXT: orr r0, r1, r0, lsl #8
+; CHECK-NEXT: mov pc, lr
+;
+; CHECK-ARMv6-LABEL: zext_load_i32_by_i8:
+; CHECK-ARMv6: ldrb  r1, [r0]
+; CHECK-ARMv6-NEXT: ldrb  r0, [r0, #1]
+; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #8
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp2 = load i8, i8* %tmp1, align 2
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  ret i32 %tmp8
+}
+
+; i8* p; // p is 2 byte aligned
+; ((i32) p[0] << 8) | ((i32) p[1] << 16)
+define i32 @zext_load_i32_by_i8_shl_8(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_shl_8:
+; CHECK: ldrb  r1, [r0]
+; CHECK-NEXT: ldrb  r0, [r0, #1]
+; CHECK-NEXT: lsl r0, r0, #16
+; CHECK-NEXT: orr r0, r0, r1, lsl #8
+; CHECK-NEXT: mov pc, lr
+;
+; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_shl_8:
+; CHECK-ARMv6: ldrb  r1, [r0]
+; CHECK-ARMv6-NEXT: ldrb  r0, [r0, #1]
+; CHECK-ARMv6-NEXT: lsl r0, r0, #16
+; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp2 = load i8, i8* %tmp1, align 2
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 8
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 16
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
+
+; i8* p; // p is 2 byte aligned
+; ((i32) p[0] << 16) | ((i32) p[1] << 24)
+define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_shl_16:
+; CHECK: ldrb  r1, [r0]
+; CHECK-NEXT: ldrb  r0, [r0, #1]
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r1, lsl #16
+; CHECK-NEXT: mov pc, lr
+;
+; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_shl_16:
+; CHECK-ARMv6: ldrb  r1, [r0]
+; CHECK-ARMv6-NEXT: ldrb  r0, [r0, #1]
+; CHECK-ARMv6-NEXT: lsl r0, r0, #24
+; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #16
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp2 = load i8, i8* %tmp1, align 2
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 16
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 24
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
+
+; i8* p; // p is 2 byte aligned
+; (i32) p[1] | ((i32) p[0] << 8)
+define i32 @zext_load_i32_by_i8_bswap(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_bswap:
+; CHECK: ldrb  r1, [r0]
+; CHECK-NEXT: ldrb  r0, [r0, #1]
+; CHECK-NEXT: orr r0, r0, r1, lsl #8
+; CHECK-NEXT: mov pc, lr
+;
+; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap:
+; CHECK-ARMv6: ldrb  r1, [r0]
+; CHECK-ARMv6-NEXT: ldrb  r0, [r0, #1]
+; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp5 = load i8, i8* %tmp4, align 2
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  ret i32 %tmp8
+}
+
+; i8* p; // p is 2 byte aligned
+; ((i32) p[1] << 8) | ((i32) p[0] << 16)
+define i32 @zext_load_i32_by_i8_bswap_shl_8(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_8:
+; CHECK: ldrb  r1, [r0]
+; CHECK-NEXT: ldrb  r0, [r0, #1]
+; CHECK-NEXT: lsl r1, r1, #16
+; CHECK-NEXT: orr r0, r1, r0, lsl #8
+; CHECK-NEXT: mov pc, lr
+;
+; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap_shl_8:
+; CHECK-ARMv6: ldrb  r1, [r0]
+; CHECK-ARMv6-NEXT: ldrb  r0, [r0, #1]
+; CHECK-ARMv6-NEXT: lsl r1, r1, #16
+; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #8
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 8
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp5 = load i8, i8* %tmp4, align 2
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 16
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
+
+; i8* p; // p is 2 byte aligned
+; ((i32) p[1] << 16) | ((i32) p[0] << 24)
+define i32 @zext_load_i32_by_i8_bswap_shl_16(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_16:
+; CHECK: ldrb  r1, [r0]
+; CHECK-NEXT: ldrb  r0, [r0, #1]
+; CHECK-NEXT: lsl r1, r1, #24
+; CHECK-NEXT: orr r0, r1, r0, lsl #16
+; CHECK-NEXT: mov pc, lr
+;
+; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap_shl_16:
+; CHECK-ARMv6: ldrb  r1, [r0]
+; CHECK-ARMv6-NEXT: ldrb  r0, [r0, #1]
+; CHECK-ARMv6-NEXT: lsl r1, r1, #24
+; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #16
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 16
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp5 = load i8, i8* %tmp4, align 2
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 24
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
+
+; i8* p;
+; i16* p1.i16 = (i16*) p;
+; (p1.i16[0] << 8) | ((i16) p[2])
+;
+; This is essentially an i16 load from p[1], but we don't fold the pattern now
+; because the address of p[1] is not available in the original DAG
+define i16 @load_i16_from_nonzero_offset(i8* %p) {
+; CHECK-LABEL: load_i16_from_nonzero_offset:
+; CHECK: ldrh  r1, [r0]
+; CHECK-NEXT: ldrb  r0, [r0, #2]
+; CHECK-NEXT: orr r0, r0, r1, lsl #8
+; CHECK-NEXT: mov pc, lr
+;
+; CHECK-ARMv6-LABEL: load_i16_from_nonzero_offset:
+; CHECK-ARMv6: ldrh  r1, [r0]
+; CHECK-ARMv6-NEXT: ldrb  r0, [r0, #2]
+; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %p1.i16 = bitcast i8* %p to i16*
+  %p2.i8 = getelementptr i8, i8* %p, i64 2
+  %v1 = load i16, i16* %p1.i16
+  %v2.i8 = load i8, i8* %p2.i8
+  %v2 = zext i8 %v2.i8 to i16
+  %v1.shl = shl i16 %v1, 8
+  %res = or i16 %v1.shl, %v2
+  ret i16 %res
+}
diff --git a/test/CodeGen/ARM/load-combine.ll b/test/CodeGen/ARM/load-combine.ll
index 4ee7780bff739c7d4690765d96cb3a86bdedf050..720bc7b88b32f646048466778bb3dae29b5f9820 100644
--- a/test/CodeGen/ARM/load-combine.ll
+++ b/test/CodeGen/ARM/load-combine.ll
@@ -227,3 +227,466 @@ define i64 @load_i64_by_i8_bswap(i64* %arg) {
   %tmp37 = or i64 %tmp33, %tmp36
   ret i64 %tmp37
 }
+
+; i8* p; // p[1] is 4 byte aligned
+; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
+define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
+; CHECK: ldr r0, [r0, #1]
+; CHECK-NEXT: mov pc, lr
+
+; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset:
+; CHECK-ARMv6: ldr r0, [r0, #1]
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 4
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 2
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 3
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 4
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[-4] is 4 byte aligned
+; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24)
+define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_neg_offset:
+; CHECK: ldr r0, [r0, #-4]
+; CHECK-NEXT: mov pc, lr
+
+; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset:
+; CHECK-ARMv6: ldr r0, [r0, #-4]
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp2 = load i8, i8* %tmp1, align 4
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[1] is 4 byte aligned
+; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24)
+define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_nonzero_offset_bswap:
+; CHECK: ldr r0, [r0, #1]
+; CHECK-NEXT: mov r1, #65280
+; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r1, r1, r0, lsr #8
+; CHECK-NEXT: and r2, r2, r0, lsl #8
+; CHECK-NEXT: orr r1, r1, r0, lsr #24
+; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mov pc, lr
+
+; CHECK-ARMv6-LABEL: load_i32_by_i8_nonzero_offset_bswap:
+; CHECK-ARMv6: ldr r0, [r0, #1]
+; CHECK-ARMv6-NEXT: rev r0, r0
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp15 = load i8, i8* %tmp14, align 4
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p; // p[-4] is 4 byte aligned
+; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24)
+define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_neg_offset_bswap:
+; CHECK: ldr r0, [r0, #-4]
+; CHECK-NEXT: mov r1, #65280
+; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r1, r1, r0, lsr #8
+; CHECK-NEXT: and r2, r2, r0, lsl #8
+; CHECK-NEXT: orr r1, r1, r0, lsr #24
+; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mov pc, lr
+
+; CHECK-ARMv6-LABEL: load_i32_by_i8_neg_offset_bswap:
+; CHECK-ARMv6: ldr r0, [r0, #-4]
+; CHECK-ARMv6-NEXT: rev r0, r0
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp15 = load i8, i8* %tmp14, align 4
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+declare i16 @llvm.bswap.i16(i16)
+
+; i16* p; // p is 4 byte aligned
+; (i32) bswap(p[1]) | ((i32) bswap(p[0]) << 16)
+define i32 @load_i32_by_bswap_i16(i32* %arg) {
+; CHECK-LABEL: load_i32_by_bswap_i16:
+; CHECK: ldr  r0, [r0]
+; CHECK-NEXT: mov r1, #65280
+; CHECK-NEXT: mov r2, #16711680
+; CHECK-NEXT: and r1, r1, r0, lsr #8
+; CHECK-NEXT: and r2, r2, r0, lsl #8
+; CHECK-NEXT: orr r1, r1, r0, lsr #24
+; CHECK-NEXT: orr r0, r2, r0, lsl #24
+; CHECK-NEXT: orr r0, r0, r1
+; CHECK-NEXT: mov pc, lr
+
+; CHECK-ARMv6-LABEL: load_i32_by_bswap_i16:
+; CHECK-ARMv6: ldr r0, [r0]
+; CHECK-ARMv6-NEXT: rev r0, r0
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i16*
+  %tmp1 = load i16, i16* %tmp, align 4
+  %tmp11 = call i16 @llvm.bswap.i16(i16 %tmp1)
+  %tmp2 = zext i16 %tmp11 to i32
+  %tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1
+  %tmp4 = load i16, i16* %tmp3, align 1
+  %tmp41 = call i16 @llvm.bswap.i16(i16 %tmp4)
+  %tmp5 = zext i16 %tmp41 to i32
+  %tmp6 = shl nuw nsw i32 %tmp2, 16
+  %tmp7 = or i32 %tmp6, %tmp5
+  ret i32 %tmp7
+}
+
+; i16* p; // p is 4 byte aligned
+; (i32) p[0] | ((sext p[1] to i32) << 16)
+define i32 @load_i32_by_sext_i16(i32* %arg) {
+; CHECK-LABEL: load_i32_by_sext_i16:
+; CHECK: ldr  r0, [r0]
+; CHECK-NEXT: mov pc, lr
+;
+; CHECK-ARMv6-LABEL: load_i32_by_sext_i16:
+; CHECK-ARMv6: ldr  r0, [r0]
+; CHECK-ARMv6-NEXT: bx lr
+  %tmp = bitcast i32* %arg to i16*
+  %tmp1 = load i16, i16* %tmp, align 4
+  %tmp2 = zext i16 %tmp1 to i32
+  %tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1
+  %tmp4 = load i16, i16* %tmp3, align 1
+  %tmp5 = sext i16 %tmp4 to i32
+  %tmp6 = shl nuw nsw i32 %tmp5, 16
+  %tmp7 = or i32 %tmp6, %tmp2
+  ret i32 %tmp7
+}
+
+; i8* arg; i32 i;
+; p = arg + 12;
+; (i32) p[i] | ((i32) p[i + 1] << 8) | ((i32) p[i + 2] << 16) | ((i32) p[i + 3] << 24)
+define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) {
+; CHECK-LABEL: load_i32_by_i8_base_offset_index:
+; CHECK: add r0, r0, r1
+; CHECK-NEXT: ldr r0, [r0, #12]
+; CHECK-NEXT: mov pc, lr
+;
+; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index:
+; CHECK-ARMv6: add r0, r0, r1
+; CHECK-ARMv6-NEXT: ldr r0, [r0, #12]
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = add nuw nsw i32 %i, 3
+  %tmp2 = add nuw nsw i32 %i, 2
+  %tmp3 = add nuw nsw i32 %i, 1
+  %tmp4 = getelementptr inbounds i8, i8* %arg, i64 12
+  %tmp5 = zext i32 %i to i64
+  %tmp6 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp5
+  %tmp7 = load i8, i8* %tmp6, align 4
+  %tmp8 = zext i8 %tmp7 to i32
+  %tmp9 = zext i32 %tmp3 to i64
+  %tmp10 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp9
+  %tmp11 = load i8, i8* %tmp10, align 1
+  %tmp12 = zext i8 %tmp11 to i32
+  %tmp13 = shl nuw nsw i32 %tmp12, 8
+  %tmp14 = or i32 %tmp13, %tmp8
+  %tmp15 = zext i32 %tmp2 to i64
+  %tmp16 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp15
+  %tmp17 = load i8, i8* %tmp16, align 1
+  %tmp18 = zext i8 %tmp17 to i32
+  %tmp19 = shl nuw nsw i32 %tmp18, 16
+  %tmp20 = or i32 %tmp14, %tmp19
+  %tmp21 = zext i32 %tmp to i64
+  %tmp22 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp21
+  %tmp23 = load i8, i8* %tmp22, align 1
+  %tmp24 = zext i8 %tmp23 to i32
+  %tmp25 = shl nuw i32 %tmp24, 24
+  %tmp26 = or i32 %tmp20, %tmp25
+  ret i32 %tmp26
+}
+
+; i8* arg; i32 i;
+; p = arg + 12;
+; (i32) p[i + 1] | ((i32) p[i + 2] << 8) | ((i32) p[i + 3] << 16) | ((i32) p[i + 4] << 24)
+define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) {
+; CHECK-LABEL: load_i32_by_i8_base_offset_index_2:
+; CHECK: add r0, r0, r1
+; CHECK-NEXT: ldr r0, [r0, #13]
+; CHECK-NEXT: mov pc, lr
+;
+; CHECK-ARMv6-LABEL: load_i32_by_i8_base_offset_index_2:
+; CHECK-ARMv6: add r0, r0, r1
+; CHECK-ARMv6-NEXT: ldr r0, [r0, #13]
+; CHECK-ARMv6-NEXT: bx  lr
+  %tmp = add nuw nsw i32 %i, 4
+  %tmp2 = add nuw nsw i32 %i, 3
+  %tmp3 = add nuw nsw i32 %i, 2
+  %tmp4 = getelementptr inbounds i8, i8* %arg, i64 12
+  %tmp5 = add nuw nsw i32 %i, 1
+  %tmp27 = zext i32 %tmp5 to i64
+  %tmp28 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp27
+  %tmp29 = load i8, i8* %tmp28, align 4
+  %tmp30 = zext i8 %tmp29 to i32
+  %tmp31 = zext i32 %tmp3 to i64
+  %tmp32 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp31
+  %tmp33 = load i8, i8* %tmp32, align 1
+  %tmp34 = zext i8 %tmp33 to i32
+  %tmp35 = shl nuw nsw i32 %tmp34, 8
+  %tmp36 = or i32 %tmp35, %tmp30
+  %tmp37 = zext i32 %tmp2 to i64
+  %tmp38 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp37
+  %tmp39 = load i8, i8* %tmp38, align 1
+  %tmp40 = zext i8 %tmp39 to i32
+  %tmp41 = shl nuw nsw i32 %tmp40, 16
+  %tmp42 = or i32 %tmp36, %tmp41
+  %tmp43 = zext i32 %tmp to i64
+  %tmp44 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp43
+  %tmp45 = load i8, i8* %tmp44, align 1
+  %tmp46 = zext i8 %tmp45 to i32
+  %tmp47 = shl nuw i32 %tmp46, 24
+  %tmp48 = or i32 %tmp42, %tmp47
+  ret i32 %tmp48
+}
+
+; i8* p; // p is 2 byte aligned
+; (i32) p[0] | ((i32) p[1] << 8)
+define i32 @zext_load_i32_by_i8(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8:
+; CHECK: ldrb  r1, [r0]
+; CHECK-NEXT: ldrb  r0, [r0, #1]
+; CHECK-NEXT: orr r0, r1, r0, lsl #8
+; CHECK-NEXT: mov pc, lr
+;
+; CHECK-ARMv6-LABEL: zext_load_i32_by_i8:
+; CHECK-ARMv6: ldrb  r1, [r0]
+; CHECK-ARMv6-NEXT: ldrb  r0, [r0, #1]
+; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #8
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp2 = load i8, i8* %tmp1, align 2
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  ret i32 %tmp8
+}
+
+; i8* p; // p is 2 byte aligned
+; ((i32) p[0] << 8) | ((i32) p[1] << 16)
+define i32 @zext_load_i32_by_i8_shl_8(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_shl_8:
+; CHECK: ldrb  r1, [r0]
+; CHECK-NEXT: ldrb  r0, [r0, #1]
+; CHECK-NEXT: lsl r0, r0, #16
+; CHECK-NEXT: orr r0, r0, r1, lsl #8
+; CHECK-NEXT: mov pc, lr
+;
+; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_shl_8:
+; CHECK-ARMv6: ldrb  r1, [r0]
+; CHECK-ARMv6-NEXT: ldrb  r0, [r0, #1]
+; CHECK-ARMv6-NEXT: lsl r0, r0, #16
+; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp2 = load i8, i8* %tmp1, align 2
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 8
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 16
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
+
+; i8* p; // p is 2 byte aligned
+; ((i32) p[0] << 16) | ((i32) p[1] << 24)
+define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_shl_16:
+; CHECK: ldrb  r1, [r0]
+; CHECK-NEXT: ldrb  r0, [r0, #1]
+; CHECK-NEXT: lsl r0, r0, #24
+; CHECK-NEXT: orr r0, r0, r1, lsl #16
+; CHECK-NEXT: mov pc, lr
+;
+; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_shl_16:
+; CHECK-ARMv6: ldrb  r1, [r0]
+; CHECK-ARMv6-NEXT: ldrb  r0, [r0, #1]
+; CHECK-ARMv6-NEXT: lsl r0, r0, #24
+; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #16
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp2 = load i8, i8* %tmp1, align 2
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 16
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 24
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
+
+; i8* p; // p is 2 byte aligned
+; (i32) p[1] | ((i32) p[0] << 8)
+define i32 @zext_load_i32_by_i8_bswap(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_bswap:
+; CHECK: ldrb  r1, [r0]
+; CHECK-NEXT: ldrb  r0, [r0, #1]
+; CHECK-NEXT: orr r0, r0, r1, lsl #8
+; CHECK-NEXT: mov pc, lr
+;
+; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap:
+; CHECK-ARMv6: ldrb  r1, [r0]
+; CHECK-ARMv6-NEXT: ldrb  r0, [r0, #1]
+; CHECK-ARMv6-NEXT: orr r0, r0, r1, lsl #8
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp5 = load i8, i8* %tmp4, align 2
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  ret i32 %tmp8
+}
+
+; i8* p; // p is 2 byte aligned
+; ((i32) p[1] << 8) | ((i32) p[0] << 16)
+define i32 @zext_load_i32_by_i8_bswap_shl_8(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_8:
+; CHECK: ldrb  r1, [r0]
+; CHECK-NEXT: ldrb  r0, [r0, #1]
+; CHECK-NEXT: lsl r1, r1, #16
+; CHECK-NEXT: orr r0, r1, r0, lsl #8
+; CHECK-NEXT: mov pc, lr
+;
+; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap_shl_8:
+; CHECK-ARMv6: ldrb  r1, [r0]
+; CHECK-ARMv6-NEXT: ldrb  r0, [r0, #1]
+; CHECK-ARMv6-NEXT: lsl r1, r1, #16
+; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #8
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 8
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp5 = load i8, i8* %tmp4, align 2
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 16
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
+
+; i8* p; // p is 2 byte aligned
+; ((i32) p[1] << 16) | ((i32) p[0] << 24)
+define i32 @zext_load_i32_by_i8_bswap_shl_16(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_16:
+; CHECK: ldrb  r1, [r0]
+; CHECK-NEXT: ldrb  r0, [r0, #1]
+; CHECK-NEXT: lsl r1, r1, #24
+; CHECK-NEXT: orr r0, r1, r0, lsl #16
+; CHECK-NEXT: mov pc, lr
+;
+; CHECK-ARMv6-LABEL: zext_load_i32_by_i8_bswap_shl_16:
+; CHECK-ARMv6: ldrb  r1, [r0]
+; CHECK-ARMv6-NEXT: ldrb  r0, [r0, #1]
+; CHECK-ARMv6-NEXT: lsl r1, r1, #24
+; CHECK-ARMv6-NEXT: orr r0, r1, r0, lsl #16
+; CHECK-ARMv6-NEXT: bx  lr
+
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 16
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp5 = load i8, i8* %tmp4, align 2
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 24
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
diff --git a/test/CodeGen/ARM/longMAC.ll b/test/CodeGen/ARM/longMAC.ll
index 80cb5096c03c5e597a3f034bc9e0dc50dfd29d54..9ecda8b06cbf28e256dd4e65b66c88a37a5e8bb6 100644
--- a/test/CodeGen/ARM/longMAC.ll
+++ b/test/CodeGen/ARM/longMAC.ll
@@ -1,14 +1,15 @@
 ; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s -check-prefix=CHECK --check-prefix=CHECK-LE
-; RUN: llc -mtriple=armv7-eabi %s -o - | FileCheck %s --check-prefix=CHECK-V7-LE
+; RUN: llc -mtriple=armv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK --check-prefix=CHECK-V7-LE
 ; RUN: llc -mtriple=armeb-eabi %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE
-; RUN: llc -mtriple=armebv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7-BE
-; RUN: llc -mtriple=thumbv6-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V6-THUMB
-; RUN: llc -mtriple=thumbv6t2-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V6-THUMB2
-; RUN: llc -mtriple=thumbv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7-THUMB
-; RUN: llc -mtriple=thumbebv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7-THUMB-BE
-; RUN: llc -mtriple=thumbv6m-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V6M-THUMB
-; RUN: llc -mtriple=thumbv7m-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7M-THUMB
-; RUN: llc -mtriple=thumbv7em-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7EM-THUMB
+; RUN: llc -mtriple=armebv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V7-BE
+; RUN: llc -mtriple=thumbv6-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V6-THUMB
+; RUN: llc -mtriple=thumbv6t2-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-T2-DSP
+; RUN: llc -mtriple=thumbv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-T2-DSP
+; RUN: llc -mtriple=thumbebv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V7-THUMB-BE
+; RUN: llc -mtriple=thumbv6m-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V6M-THUMB
+; RUN: llc -mtriple=thumbv7m-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V7M-THUMB
+; RUN: llc -mtriple=thumbv7em-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-T2-DSP
+; RUN: llc -mtriple=armv5te-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V5TE
 ; Check generated signed and unsigned multiply accumulate long.
 
 define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) {
@@ -20,12 +21,9 @@ define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) {
 ;CHECK-BE: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
 ;CHECK-BE: mov r0, [[RDHI]]
 ;CHECK-BE: mov r1, [[RDLO]]
-;CHECK-V6-THUMB2: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
-;CHECK-V6-THUMB2: mov r0, [[RDLO]]
-;CHECK-V6-THUMB2: mov r1, [[RDHI]]
-;CHECK-V7-THUMB: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
-;CHECK-V7-THUMB: mov r0, [[RDLO]]
-;CHECK-V7-THUMB: mov r1, [[RDHI]]
+;CHECK-T2-DSP: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
+;CHECK-T2-DSP-NEXT: mov r0, [[RDLO]]
+;CHECK-T2-DSP-NEXT: mov r1, [[RDHI]]
 ;CHECK-V7-THUMB-BE: umlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
 ;CHECK-V7-THUMB-BE: mov r0, [[RDHI]]
 ;CHECK-V7-THUMB-BE: mov r1, [[RDLO]]
@@ -44,12 +42,9 @@ define i64 @MACLongTest2(i32 %a, i32 %b, i64 %c)  {
 ;CHECK-BE: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
 ;CHECK-BE: mov r0, [[RDHI]]
 ;CHECK-BE: mov r1, [[RDLO]]
-;CHECK-V6-THUMB2: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
-;CHECK-V6-THUMB2: mov r0, [[RDLO]]
-;CHECK-V6-THUMB2: mov r1, [[RDHI]]
-;CHECK-V7-THUMB: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
-;CHECK-V7-THUMB: mov r0, [[RDLO]]
-;CHECK-V7-THUMB: mov r1, [[RDHI]]
+;CHECK-T2-DSP: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
+;CHECK-T2-DSP-NEXT: mov r0, [[RDLO]]
+;CHECK-T2-DSP-NEXT: mov r1, [[RDHI]]
 ;CHECK-V7-THUMB-BE: smlal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
 ;CHECK-V7-THUMB-BE: mov r0, [[RDHI]]
 ;CHECK-V7-THUMB-BE: mov r1, [[RDLO]]
@@ -78,8 +73,7 @@ define i64 @MACLongTest3(i32 %a, i32 %b, i32 %c) {
 ;CHECK-BE: umlal [[RDLO:r[0-9]+]], [[RDHI]], r1, r0
 ;CHECK-BE: mov r0, [[RDHI]]
 ;CHECK-BE: mov r1, [[RDLO]]
-;CHECK-V6-THUMB2: umlal
-;CHECK-V7-THUMB: umlal
+;CHECK-T2-DSP: umlal
 ;CHECK-V6-THUMB-NOT: umlal
   %conv = zext i32 %b to i64
   %conv1 = zext i32 %a to i64
@@ -92,8 +86,7 @@ define i64 @MACLongTest3(i32 %a, i32 %b, i32 %c) {
 define i64 @MACLongTest4(i32 %a, i32 %b, i32 %c) {
 ;CHECK-LABEL: MACLongTest4:
 ;CHECK-V6-THUMB-NOT: smlal
-;CHECK-V6-THUMB2: smlal
-;CHECK-V7-THUMB: smlal
+;CHECK-T2-DSP: smlal
 ;CHECK-LE: asr [[RDHI:r[0-9]+]], [[RDLO:r[0-9]+]], #31
 ;CHECK-LE: smlal [[RDLO]], [[RDHI]], r1, r0
 ;CHECK-LE: mov r0, [[RDLO]]
@@ -114,14 +107,12 @@ define i64 @MACLongTest6(i32 %a, i32 %b, i32 %c, i32 %d) {
 ;CHECK-LABEL: MACLongTest6:
 ;CHECK-V6-THUMB-NOT: smull
 ;CHECK-V6-THUMB-NOT: smlal
-;CHECK: smull   r12, lr, r1, r0
-;CHECK: smlal   r12, lr, r3, r2
+;CHECK-LE: smull   r12, lr, r1, r0
+;CHECK-LE: smlal   r12, lr, r3, r2
 ;CHECK-V7: smull   [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], r1, r0
 ;CHECK-V7: smlal   [[RDLO]], [[RDHI]], [[Rn:r[0-9]+]], [[Rm:r[0-9]+]]
-;CHECK-V7-THUMB: smull   [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], r1, r0
-;CHECK-V7-THUMB: smlal   [[RDLO]], [[RDHI]], [[Rn:r[0-9]+]], [[Rm:r[0-9]+]]
-;CHECK-V6-THUMB2: smull   [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], r1, r0
-;CHECK-V6-THUMB2: smlal   [[RDLO]], [[RDHI]], [[Rn:r[0-9]+]], [[Rm:r[0-9]+]]
+;CHECK-T2-DSP: smull   [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], r1, r0
+;CHECK-T2-DSP: smlal   [[RDLO]], [[RDHI]], [[Rn:r[0-9]+]], [[Rm:r[0-9]+]]
   %conv = sext i32 %a to i64
   %conv1 = sext i32 %b to i64
   %mul = mul nsw i64 %conv1, %conv
@@ -172,18 +163,12 @@ define i64 @MACLongTest9(i32 %lhs, i32 %rhs, i32 %lo, i32 %hi) {
 ;CHECK-V7-BE: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
 ;CHECK-V7-BE: mov r0, [[RDHI]]
 ;CHECK-V7-BE: mov r1, [[RDLO]]
-;CHECK-V6-THUMB2: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
-;CHECK-V6-THUMB2: mov r0, [[RDLO]]
-;CHECK-V6-THUMB2: mov r1, [[RDHI]]
-;CHECK-V7-THUMB: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
-;CHECK-V7-THUMB: mov r0, [[RDLO]]
-;CHECK-V7-THUMB: mov r1, [[RDHI]]
+;CHECK-T2-DSP: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
+;CHECK-T2-DSP-NEXT: mov r0, [[RDLO]]
+;CHECK-T2-DSP-NEXT: mov r1, [[RDHI]]
 ;CHECK-V7-THUMB-BE: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
 ;CHECK-V7-THUMB-BE: mov r0, [[RDHI]]
 ;CHECK-V7-THUMB-BE: mov r1, [[RDLO]]
-;CHECK-V7EM-THUMB: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
-;CHECK-V7EM-THUMB: mov r0, [[RDLO]]
-;CHECK-V7EM-THUMB: mov r1, [[RDHI]]
 ;CHECK-NOT:umaal
 ;CHECK-V6-THUMB-NOT: umaal
 ;CHECK-V6M-THUMB-NOT: umaal
@@ -206,18 +191,12 @@ define i64 @MACLongTest10(i32 %lhs, i32 %rhs, i32 %lo, i32 %hi) {
 ;CHECK-V7-BE: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
 ;CHECK-V7-BE: mov r0, [[RDHI]]
 ;CHECK-V7-BE: mov r1, [[RDLO]]
-;CHECK-V6-THUMB2: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
-;CHECK-V6-THUMB2: mov r0, [[RDLO]]
-;CHECK-V6-THUMB2: mov r1, [[RDHI]]
-;CHECK-V7-THUMB: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
-;CHECK-V7-THUMB: mov r0, [[RDLO]]
-;CHECK-V7-THUMB: mov r1, [[RDHI]]
+;CHECK-T2-DSP: umaal r2, r3, r1, r0
+;CHECK-T2-DSP-NEXT: mov r0, r2
+;CHECK-T2-DSP-NEXT: mov r1, r3
 ;CHECK-V7-THUMB-BE: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
 ;CHECK-V7-THUMB-BE: mov r0, [[RDHI]]
 ;CHECK-V7-THUMB-BE: mov r1, [[RDLO]]
-;CHECK-V7EM-THUMB: umaal [[RDLO:r[0-9]+]], [[RDHI:r[0-9]+]], [[LHS:r[0-9]+]], [[RHS:r[0-9]+]]
-;CHECK-V7EM-THUMB: mov r0, [[RDLO]]
-;CHECK-V7EM-THUMB: mov r1, [[RDHI]]
 ;CHECK-NOT:umaal
 ;CHECK-V6-THUMB-NOT:umaal
 ;CHECK-V6M-THUMB-NOT: umaal
@@ -231,3 +210,188 @@ define i64 @MACLongTest10(i32 %lhs, i32 %rhs, i32 %lo, i32 %hi) {
   %add2 = add i64 %add, %mul
   ret i64 %add2
 }
+
+define i64 @MACLongTest11(i16 %a, i16 %b, i64 %c)  {
+;CHECK-LABEL: MACLongTest11:
+;CHECK-T2-DSP-NOT: sxth
+;CHECK-T2-DSP: smlalbb r2, r3
+;CHECK-T2-DSP-NEXT: mov r0, r2
+;CHECK-T2-DSP-NEXT: mov r1, r3
+;CHECK-V5TE-NOT: sxth
+;CHECK-V5TE: smlalbb r2, r3
+;CHECK-V5TE-NEXT: mov r0, r2
+;CHECK-V5TE-NEXT: mov r1, r3
+;CHECK-V7-LE-NOT: sxth
+;CHECK-V7-LE: smlalbb r2, r3
+;CHECK-V7-LE-NEXT: mov r0, r2
+;CHECK-V7-LE-NEXT: mov r1, r3
+;CHECK-V7-THUMB-BE: smlalbb r3, r2
+;CHECK-V7-THUMB-BE-NEXT: mov r0, r2
+;CHECK-V7-THUMB-BE-NEXT: mov r1, r3
+;CHECK-LE-NOT: smlalbb
+;CHECK-BE-NOT: smlalbb
+;CHECK-V6M-THUMB-NOT: smlalbb
+;CHECK-V7M-THUMB-NOT: smlalbb
+  %conv = sext i16 %a to i32
+  %conv1 = sext i16 %b to i32
+  %mul = mul nsw i32 %conv1, %conv
+  %conv2 = sext i32 %mul to i64
+  %add = add nsw i64 %conv2, %c
+  ret i64 %add
+}
+
+define i64 @MACLongTest12(i16 %b, i32 %t, i64 %c)  {
+;CHECK-LABEL: MACLongTest12:
+;CHECK-T2-DSP-NOT: sxth
+;CHECK-T2-DSP-NOT: {{asr|lsr}}
+;CHECK-T2-DSP: smlalbt r2, r3, r0, r1
+;CHECK-T2-DSP-NEXT: mov r0, r2
+;CHECK-T2-DSP-NEXT: mov r1, r3
+;CHECK-T2-DSP-NOT: sxth
+;CHECK-V5TE-NOT: sxth
+;CHECK-V5TE-NOT: {{asr|lsr}}
+;CHECK-V5TE: smlalbt r2, r3, r0, r1
+;CHECK-V5TE-NEXT: mov r0, r2
+;CHECK-V5TE-NEXT: mov r1, r3
+;CHECK-V7-LE-NOT: sxth
+;CHECK-V7-LE-NOT: {{asr|lsr}}
+;CHECK-V7-LE: smlalbt r2, r3, r0, r1
+;CHECK-V7-LE-NEXT: mov r0, r2
+;CHECK-V7-LE-NEXT: mov r1, r3
+;CHECK-V7-THUMB-BE: smlalbt r3, r2,
+;CHECK-V7-THUMB-BE-NEXT: mov r0, r2
+;CHECK-V7-THUMB-BE-NEXT: mov r1, r3
+;CHECK-LE-NOT: smlalbt
+;CHECK-BE-NOT: smlalbt
+;CHECK-V6M-THUMB-NOT: smlalbt
+;CHECK-V7M-THUMB-NOT: smlalbt
+  %conv0 = sext i16 %b to i32
+  %conv1 = ashr i32 %t, 16
+  %mul = mul nsw i32 %conv0, %conv1
+  %conv2 = sext i32 %mul to i64
+  %add = add nsw i64 %conv2, %c
+  ret i64 %add
+}
+
+define i64 @MACLongTest13(i32 %t, i16 %b, i64 %c)  {
+;CHECK-LABEL: MACLongTest13:
+;CHECK-T2-DSP-NOT: sxth
+;CHECK-T2-DSP-NOT: {{asr|lsr}}
+;CHECK-T2-DSP: smlaltb r2, r3, r0, r1
+;CHECK-T2-DSP-NEXT: mov r0, r2
+;CHECK-T2-DSP-NEXT: mov r1, r3
+;CHECK-V5TE-NOT: sxth
+;CHECK-V5TE-NOT: {{asr|lsr}}
+;CHECK-V5TE: smlaltb r2, r3, r0, r1
+;CHECK-V5TE-NEXT: mov r0, r2
+;CHECK-V5TE-NEXT: mov r1, r3
+;CHECK-V7-LE-NOT: sxth
+;CHECK-V7-LE-NOT: {{asr|lsr}}
+;CHECK-V7-LE: smlaltb r2, r3, r0, r1
+;CHECK-V7-LE-NEXT: mov r0, r2
+;CHECK-V7-LE-NEXT: mov r1, r3
+;CHECK-V7-THUMB-BE: smlaltb r3, r2, r0, r1
+;CHECK-V7-THUMB-BE-NEXT: mov r0, r2
+;CHECK-V7-THUMB-BE-NEXT: mov r1, r3
+;CHECK-LE-NOT: smlaltb
+;CHECK-BE-NOT: smlaltb
+;CHECK-V6M-THUMB-NOT: smlaltb
+;CHECK-V7M-THUMB-NOT: smlaltb
+  %conv0 = ashr i32 %t, 16
+  %conv1= sext i16 %b to i32
+  %mul = mul nsw i32 %conv0, %conv1
+  %conv2 = sext i32 %mul to i64
+  %add = add nsw i64 %conv2, %c
+  ret i64 %add
+}
+
+define i64 @MACLongTest14(i32 %a, i32 %b, i64 %c)  {
+;CHECK-LABEL: MACLongTest14:
+;CHECK-T2-DSP-NOT: {{asr|lsr}}
+;CHECK-T2-DSP: smlaltt r2, r3,
+;CHECK-T2-DSP-NEXT: mov r0, r2
+;CHECK-T2-DSP-NEXT: mov r1, r3
+;CHECK-V5TE-NOT: {{asr|lsr}}
+;CHECK-V5TE: smlaltt r2, r3,
+;CHECK-V5TE-NEXT: mov r0, r2
+;CHECK-V5TE-NEXT: mov r1, r3
+;CHECK-V7-LE-NOT: {{asr|lsr}}
+;CHECK-V7-LE: smlaltt r2, r3,
+;CHECK-V7-LE-NEXT: mov r0, r2
+;CHECK-V7-LE-NEXT: mov r1, r3
+;CHECK-V7-THUMB-BE: smlaltt r3, r2,
+;CHECK-V7-THUMB-BE-NEXT: mov r0, r2
+;CHECK-V7-THUMB-BE-NEXT: mov r1, r3
+;CHECK-LE-NOT: smlaltt
+;CHECK-BE-NOT: smlaltt
+;CHECK-V6M-THUMB-NOT: smlaltt
+;CHECK-V7M-THUMB-NOT: smlaltt
+  %conv0 = ashr i32 %a, 16
+  %conv1 = ashr i32 %b, 16
+  %mul = mul nsw i32 %conv1, %conv0
+  %conv2 = sext i32 %mul to i64
+  %add = add nsw i64 %conv2, %c
+  ret i64 %add
+}
+
+@global_b = external global i16, align 2
+;CHECK-LABEL: MACLongTest15:
+;CHECK-T2-DSP-NOT: {{asr|lsr}}
+;CHECK-T2-DSP: smlaltb r2, r3, r0, r1
+;CHECK-T2-DSP-NEXT: mov r0, r2
+;CHECK-T2-DSP-NEXT: mov r1, r3
+;CHECK-V5TE-NOT: {{asr|lsr}}
+;CHECK-V5TE: smlaltb r2, r3, r0, r1
+;CHECK-V5TE-NEXT: mov r0, r2
+;CHECK-V5TE-NEXT: mov r1, r3
+;CHECK-V7-LE-NOT: {{asr|lsr}}
+;CHECK-V7-LE: smlaltb r2, r3, r0, r1
+;CHECK-V7-LE-NEXT: mov r0, r2
+;CHECK-V7-LE-NEXT: mov r1, r3
+;CHECK-V7-THUMB-BE: smlaltb r3, r2, r0, r1
+;CHECK-V7-THUMB-BE-NEXT: mov r0, r2
+;CHECK-V7-THUMB-BE-NEXT: mov r1, r3
+;CHECK-LE-NOT: smlaltb
+;CHECK-BE-NOT: smlaltb
+;CHECK-V6M-THUMB-NOT: smlaltb
+;CHECK-V7M-THUMB-NOT: smlaltb
+define i64 @MACLongTest15(i32 %t, i64 %acc) {
+entry:
+  %0 = load i16, i16* @global_b, align 2
+  %conv = sext i16 %0 to i32
+  %shr = ashr i32 %t, 16
+  %mul = mul nsw i32 %shr, %conv
+  %conv1 = sext i32 %mul to i64
+  %add = add nsw i64 %conv1, %acc
+  ret i64 %add
+}
+
+;CHECK-LABEL: MACLongTest16:
+;CHECK-T2-DSP-NOT: {{asr|lsr}}
+;CHECK-T2-DSP: smlalbt r2, r3, r1, r0
+;CHECK-T2-DSP-NEXT: mov r0, r2
+;CHECK-T2-DSP-NEXT: mov r1, r3
+;CHECK-V5TE-NOT: {{asr|lsr}}
+;CHECK-V5TE: smlalbt r2, r3, r1, r0
+;CHECK-V5TE-NEXT: mov r0, r2
+;CHECK-V5TE-NEXT: mov r1, r3
+;CHECK-V7-LE: smlalbt r2, r3, r1, r0
+;CHECK-V7-LE-NEXT: mov r0, r2
+;CHECK-V7-LE-NEXT: mov r1, r3
+;CHECK-V7-THUMB-BE: smlalbt r3, r2, r1, r0
+;CHECK-V7-THUMB-BE-NEXT: mov r0, r2
+;CHECK-V7-THUMB-BE-NEXT: mov r1, r3
+;CHECK-LE-NOT: smlalbt
+;CHECK-BE-NOT: smlalbt
+;CHECK-V6M-THUMB-NOT: smlalbt
+;CHECK-V7M-THUMB-NOT: smlalbt
+define i64 @MACLongTest16(i32 %t, i64 %acc) {
+entry:
+  %0 = load i16, i16* @global_b, align 2
+  %conv = sext i16 %0 to i32
+  %shr = ashr i32 %t, 16
+  %mul = mul nsw i32 %conv, %shr
+  %conv1 = sext i32 %mul to i64
+  %add = add nsw i64 %conv1, %acc
+  ret i64 %add
+}
diff --git a/test/CodeGen/ARM/lowerMUL-newload.ll b/test/CodeGen/ARM/lowerMUL-newload.ll
new file mode 100644
index 0000000000000000000000000000000000000000..93d765cba1168cef45bb940647dafbfdb09ab846
--- /dev/null
+++ b/test/CodeGen/ARM/lowerMUL-newload.ll
@@ -0,0 +1,115 @@
+; RUN: llc < %s -mtriple=arm-eabi -mcpu=krait | FileCheck %s
+
+define void @func1(i16* %a, i16* %b, i16* %c) {
+entry:
+; This test case is the vectorized form of the pseudo code below.
+; a[i] = b[i] + c[i];
+; b[i] = a[i] * c[i];
+; a[i] = b[i] + a[i] * c[i];
+;
+; Check that the vector load of a[i] for "a[i] = b[i] + a[i] * c[i]" is
+; scheduled before the first vector store, "a[i] = b[i] + c[i]".
+; Check that no vector load of a[i] is scheduled between the two vector
+; stores to a[i]; otherwise the loaded a[i] would be polluted by the first
+; vector store to a[i].
+;
+; This test case checks that the chain information is updated during
+; lowerMUL for the newly created Load SDNode.
+
+; CHECK: vldr {{.*}} [r0, #16]
+; CHECK: vstr {{.*}} [r0, #16]
+; CHECK-NOT: vldr {{.*}} [r0, #16]
+; CHECK: vstr {{.*}} [r0, #16]
+
+  %scevgep0 = getelementptr i16, i16* %a, i32 8
+  %vector_ptr0 = bitcast i16* %scevgep0 to <4 x i16>*
+  %vec0 = load <4 x i16>, <4 x i16>* %vector_ptr0, align 8
+  %scevgep1 = getelementptr i16, i16* %b, i32 8
+  %vector_ptr1 = bitcast i16* %scevgep1 to <4 x i16>*
+  %vec1 = load <4 x i16>, <4 x i16>* %vector_ptr1, align 8
+  %0 = zext <4 x i16> %vec1 to <4 x i32>
+  %scevgep2 = getelementptr i16, i16* %c, i32 8
+  %vector_ptr2 = bitcast i16* %scevgep2 to <4 x i16>*
+  %vec2 = load <4 x i16>, <4 x i16>* %vector_ptr2, align 8
+  %1 = sext <4 x i16> %vec2 to <4 x i32>
+  %vec3 = add <4 x i32> %1, %0
+  %2 = trunc <4 x i32> %vec3 to <4 x i16>
+  %scevgep3 = getelementptr i16, i16* %a, i32 8
+  %vector_ptr3 = bitcast i16* %scevgep3 to <4 x i16>*
+  store <4 x i16> %2, <4 x i16>* %vector_ptr3, align 8
+  %vector_ptr4 = bitcast i16* %scevgep2 to <4 x i16>*
+  %vec4 = load <4 x i16>, <4 x i16>* %vector_ptr4, align 8
+  %3 = sext <4 x i16> %vec4 to <4 x i32>
+  %vec5 = mul <4 x i32> %3, %vec3
+  %4 = trunc <4 x i32> %vec5 to <4 x i16>
+  %vector_ptr5 = bitcast i16* %scevgep1 to <4 x i16>*
+  store <4 x i16> %4, <4 x i16>* %vector_ptr5, align 8
+  %5 = sext <4 x i16> %vec0 to <4 x i32>
+  %vector_ptr6 = bitcast i16* %scevgep2 to <4 x i16>*
+  %vec6 = load <4 x i16>, <4 x i16>* %vector_ptr6, align 8
+  %6 = sext <4 x i16> %vec6 to <4 x i32>
+  %vec7 = mul <4 x i32> %6, %5
+  %vec8 = add <4 x i32> %vec7, %vec5
+  %7 = trunc <4 x i32> %vec8 to <4 x i16>
+  %vector_ptr7 = bitcast i16* %scevgep3 to <4 x i16>*
+  store <4 x i16> %7, <4 x i16>* %vector_ptr7, align 8
+  ret void
+}
+
+define void @func2(i16* %a, i16* %b, i16* %c) {
+entry:
+; This test case is the vectorized form of the pseudo code below.
+; a[i] = b[i] + c[i];
+; b[i] = a[i] * c[i];
+; a[i] = b[i] + a[i] * c[i] + a[i];
+;
+; Check that the vector load of a[i] for "a[i] = b[i] + a[i] * c[i] + a[i]"
+; is scheduled before the first vector store, "a[i] = b[i] + c[i]".
+; Check that no vector load of a[i] is scheduled between the first
+; vector store to a[i] and the vector add of a[i]; otherwise the loaded
+; a[i] would be polluted by the first vector store to a[i].
+;
+; This test case checks that both the chain and value of the newly created
+; Load SDNode are updated during lowerMUL.
+
+; CHECK: vldr {{.*}} [r0, #16]
+; CHECK: vstr {{.*}} [r0, #16]
+; CHECK-NOT: vldr {{.*}} [r0, #16]
+; CHECK: vaddw.s16
+; CHECK: vstr {{.*}} [r0, #16]
+
+  %scevgep0 = getelementptr i16, i16* %a, i32 8
+  %vector_ptr0 = bitcast i16* %scevgep0 to <4 x i16>*
+  %vec0 = load <4 x i16>, <4 x i16>* %vector_ptr0, align 8
+  %scevgep1 = getelementptr i16, i16* %b, i32 8
+  %vector_ptr1 = bitcast i16* %scevgep1 to <4 x i16>*
+  %vec1 = load <4 x i16>, <4 x i16>* %vector_ptr1, align 8
+  %0 = zext <4 x i16> %vec1 to <4 x i32>
+  %scevgep2 = getelementptr i16, i16* %c, i32 8
+  %vector_ptr2 = bitcast i16* %scevgep2 to <4 x i16>*
+  %vec2 = load <4 x i16>, <4 x i16>* %vector_ptr2, align 8
+  %1 = sext <4 x i16> %vec2 to <4 x i32>
+  %vec3 = add <4 x i32> %1, %0
+  %2 = trunc <4 x i32> %vec3 to <4 x i16>
+  %scevgep3 = getelementptr i16, i16* %a, i32 8
+  %vector_ptr3 = bitcast i16* %scevgep3 to <4 x i16>*
+  store <4 x i16> %2, <4 x i16>* %vector_ptr3, align 8
+  %vector_ptr4 = bitcast i16* %scevgep2 to <4 x i16>*
+  %vec4 = load <4 x i16>, <4 x i16>* %vector_ptr4, align 8
+  %3 = sext <4 x i16> %vec4 to <4 x i32>
+  %vec5 = mul <4 x i32> %3, %vec3
+  %4 = trunc <4 x i32> %vec5 to <4 x i16>
+  %vector_ptr5 = bitcast i16* %scevgep1 to <4 x i16>*
+  store <4 x i16> %4, <4 x i16>* %vector_ptr5, align 8
+  %5 = sext <4 x i16> %vec0 to <4 x i32>
+  %vector_ptr6 = bitcast i16* %scevgep2 to <4 x i16>*
+  %vec6 = load <4 x i16>, <4 x i16>* %vector_ptr6, align 8
+  %6 = sext <4 x i16> %vec6 to <4 x i32>
+  %vec7 = mul <4 x i32> %6, %5
+  %vec8 = add <4 x i32> %vec7, %vec5
+  %vec9 = add <4 x i32> %vec8, %5
+  %7 = trunc <4 x i32> %vec9 to <4 x i16>
+  %vector_ptr7 = bitcast i16* %scevgep3 to <4 x i16>*
+  store <4 x i16> %7, <4 x i16>* %vector_ptr7, align 8
+  ret void
+}
diff --git a/test/CodeGen/ARM/machine-copyprop.mir b/test/CodeGen/ARM/machine-copyprop.mir
new file mode 100644
index 0000000000000000000000000000000000000000..9be595f690dbcca0ce14824ff13403c69aaf4a27
--- /dev/null
+++ b/test/CodeGen/ARM/machine-copyprop.mir
@@ -0,0 +1,22 @@
+# RUN: llc -o - %s -mtriple=armv7s-- -run-pass=machine-cp | FileCheck %s
+---
+# Test that machine copy prop recognizes the implicit-def operands on a COPY
+# as clobbering the register.
+# CHECK-LABEL: name: func
+# CHECK: %d2 = VMOVv2i32 2, 14, _
+# CHECK: %s5 = COPY %s0, implicit %q1, implicit-def %q1
+# CHECK: VST1q32 %r0, 0, %q1, 14, _
+# The following two COPYs must not be removed
+# CHECK: %s4 = COPY %s20, implicit-def %q1
+# CHECK: %s5 = COPY %s0, implicit killed %d0, implicit %q1, implicit-def %q1
+# CHECK: VST1q32 %r2, 0, %q1, 14, _
+name: func
+body: |
+  bb.0:
+    %d2 = VMOVv2i32 2, 14, _
+    %s5 = COPY %s0, implicit %q1, implicit-def %q1
+    VST1q32 %r0, 0, %q1, 14, _
+    %s4 = COPY %s20, implicit-def %q1
+    %s5 = COPY %s0, implicit killed %d0, implicit %q1, implicit-def %q1
+    VST1q32 %r2, 0, %q1, 14, _
+...
diff --git a/test/CodeGen/ARM/mature-mc-support.ll b/test/CodeGen/ARM/mature-mc-support.ll
index 0a7e5b91adc5fa626900532efdeb97a8809fab04..f89657dd81ac399fede1be2f51d39cce19ea321c 100644
--- a/test/CodeGen/ARM/mature-mc-support.ll
+++ b/test/CodeGen/ARM/mature-mc-support.ll
@@ -9,4 +9,4 @@
 
 module asm "	.this_directive_is_very_unlikely_to_exist"
 
-; CHECK: LLVM ERROR: Error parsing inline asm
+; CHECK: error: unknown directive
diff --git a/test/CodeGen/ARM/misched-int-basic-thumb2.mir b/test/CodeGen/ARM/misched-int-basic-thumb2.mir
new file mode 100644
index 0000000000000000000000000000000000000000..86ef1e26f63688844440dc4d4326dfd927ca1854
--- /dev/null
+++ b/test/CodeGen/ARM/misched-int-basic-thumb2.mir
@@ -0,0 +1,175 @@
+# Basic machine sched model test for Thumb2 int instructions
+# RUN: llc -o /dev/null %s -mtriple=thumbv7-eabi -mcpu=swift -run-pass  machine-scheduler  -enable-misched -verify-misched \
+# RUN:  -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT
+# RUN: llc -o /dev/null %s -mtriple=thumbv7--eabi -mcpu=cortex-a9 -run-pass  machine-scheduler  -enable-misched -verify-misched \
+# RUN:  -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9
+# RUN: llc -o /dev/null %s -mtriple=thumbv8r-eabi -mcpu=cortex-r52 -run-pass  machine-scheduler  -enable-misched -verify-misched \
+# RUN:  -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52
+# REQUIRES: asserts
+--- |
+  ; ModuleID = 'foo.ll'
+  source_filename = "foo.ll"
+  target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "thumbv7---eabi"
+
+  @g1 = common global i32 0, align 4
+  @g2 = common global i32 0, align 4
+
+  define i64 @foo(i16 signext %a, i16 signext %b) {
+  entry:
+    %0 = load i32, i32* @g1, align 4
+    %1 = load i32, i32* @g2, align 4
+    %2 = add nuw nsw i32 %0, %0
+    %3 = sdiv i32 %2, %1
+    store i32 %3, i32* @g1, align 4
+    %d = mul nsw i16 %a, %a
+    %e = mul nsw i16 %b, %b
+    %f = add nuw nsw i16 %e, %d
+    %c = zext i16 %f to i32
+    %mul8 = mul nsw i32 %c, %3
+    %mul9 = mul nsw i32 %mul8, %mul8
+    %add10 = add nuw nsw i32 %mul9, %mul8
+    %conv1130 = zext i32 %add10 to i64
+    %mul12 = mul nuw nsw i64 %conv1130, %conv1130
+    %mul13 = mul nsw i64 %mul12, %mul12
+    %add14 = add nuw nsw i64 %mul13, %mul12
+    ret i64 %add14
+  }
+#
+# CHECK:       ********** MI Scheduling **********
+# CHECK:       SU(2):   %vreg2<def> = t2MOVi32imm <ga:@g1>; rGPR:%vreg2
+# CHECK_A9:    Latency    : 2
+# CHECK_SWIFT: Latency    : 2
+# CHECK_R52:   Latency    : 2
+#
+# CHECK:       SU(3):   %vreg3<def> = t2LDRi12 %vreg2, 0, pred:14, pred:%noreg; mem:LD4[@g1](dereferenceable) rGPR:%vreg3,%vreg2
+# CHECK_A9:    Latency    : 1
+# CHECK_SWIFT: Latency    : 3
+# CHECK_R52:   Latency    : 4
+#
+# CHECK:       SU(6):   %vreg6<def> = t2ADDrr %vreg3, %vreg3, pred:14, pred:%noreg, opt:%noreg; rGPR:%vreg6,%vreg3,%vreg3
+# CHECK_A9:    Latency    : 1
+# CHECK_SWIFT: Latency    : 1
+# CHECK_R52:   Latency    : 3
+
+# CHECK:       SU(7):   %vreg7<def> = t2SDIV %vreg6, %vreg5, pred:14, pred:%noreg; rGPR:%vreg7,%vreg6,%vreg5
+# CHECK_A9:    Latency    : 0
+# CHECK_SWIFT: Latency    : 14
+# CHECK_R52:   Latency    : 8
+
+# CHECK:       SU(8):   t2STRi12 %vreg7, %vreg2, 0, pred:14, pred:%noreg; mem:ST4[@g1] rGPR:%vreg7,%vreg2
+# CHECK_A9:    Latency    : 1
+# CHECK_SWIFT: Latency    : 0
+# CHECK_R52:   Latency    : 4
+#
+# CHECK:       SU(9):   %vreg8<def> = t2SMULBB %vreg1, %vreg1, pred:14, pred:%noreg; rGPR:%vreg8,%vreg1,%vreg1
+# CHECK_A9:    Latency    : 2
+# CHECK_SWIFT: Latency    : 4
+# CHECK_R52:   Latency    : 4
+#
+# CHECK:       SU(10):   %vreg9<def> = t2SMLABB %vreg0, %vreg0, %vreg8, pred:14, pred:%noreg; rGPR:%vreg9,%vreg0,%vreg0,%vreg8
+# CHECK_A9:    Latency    : 2
+# CHECK_SWIFT: Latency    : 4
+# CHECK_R52:   Latency    : 4
+#
+# CHECK:       SU(11):   %vreg10<def> = t2UXTH %vreg9, 0, pred:14, pred:%noreg; rGPR:%vreg10,%vreg9
+# CHECK_A9:    Latency    : 1
+# CHECK_SWIFT: Latency    : 1
+# CHECK_R52:   Latency    : 3
+#
+# CHECK:       SU(12):   %vreg11<def> = t2MUL %vreg10, %vreg7, pred:14, pred:%noreg; rGPR:%vreg11,%vreg10,%vreg7
+# CHECK_A9:    Latency    : 2
+# CHECK_SWIFT: Latency    : 4
+# CHECK_R52:   Latency    : 4
+#
+# CHECK:       SU(13):   %vreg12<def> = t2MLA %vreg11, %vreg11, %vreg11, pred:14, pred:%noreg; rGPR:%vreg12,%vreg11,%vreg11,%vreg11
+# CHECK_A9:    Latency    : 2
+# CHECK_SWIFT: Latency    : 4
+# CHECK_R52:   Latency    : 4
+#
+# CHECK:       SU(14):   %vreg13<def>, %vreg14<def> = t2UMULL %vreg12, %vreg12, pred:14, pred:%noreg; rGPR:%vreg13,%vreg14,%vreg12,%vreg12
+# CHECK_A9:    Latency    : 3
+# CHECK_SWIFT: Latency    : 5
+# CHECK_R52:   Latency    : 4
+#
+# CHECK:       SU(18):   %vreg19<def,tied4>, %vreg20<def,tied5> = t2UMLAL %vreg12, %vreg12, %vreg19<tied0>, %vreg20<tied1>, pred:14, pred:%noreg; rGPR:%vreg19,%vreg20,%vreg12,%vreg12,%vreg20
+# CHECK_A9:    Latency    : 3
+# CHECK_SWIFT: Latency    : 7
+# CHECK_R52:   Latency    : 4
+# CHECK:  ** ScheduleDAGMILive::schedule picking next node
+...
+---
+name:            foo
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: rgpr }
+  - { id: 1, class: rgpr }
+  - { id: 2, class: rgpr }
+  - { id: 3, class: rgpr }
+  - { id: 4, class: rgpr }
+  - { id: 5, class: rgpr }
+  - { id: 6, class: rgpr }
+  - { id: 7, class: rgpr }
+  - { id: 8, class: rgpr }
+  - { id: 9, class: rgpr }
+  - { id: 10, class: rgpr }
+  - { id: 11, class: rgpr }
+  - { id: 12, class: rgpr }
+  - { id: 13, class: rgpr }
+  - { id: 14, class: rgpr }
+  - { id: 15, class: rgpr }
+  - { id: 16, class: rgpr }
+  - { id: 17, class: rgpr }
+  - { id: 18, class: rgpr }
+  - { id: 19, class: rgpr }
+  - { id: 20, class: rgpr }
+liveins:
+  - { reg: '%r0', virtual-reg: '%0' }
+  - { reg: '%r1', virtual-reg: '%1' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0.entry:
+    liveins: %r0, %r1
+
+    %1 = COPY %r1
+    %0 = COPY %r0
+    %2 = t2MOVi32imm @g1
+    %3 = t2LDRi12 %2, 0, 14, _ :: (dereferenceable load 4 from @g1)
+    %4 = t2MOVi32imm @g2
+    %5 = t2LDRi12 %4, 0, 14, _ :: (dereferenceable load 4 from @g2)
+    %6 = t2ADDrr %3, %3, 14, _, _
+    %7 = t2SDIV %6, %5, 14, _
+    t2STRi12 %7, %2, 0, 14, _ :: (store 4 into @g1)
+    %8 = t2SMULBB %1, %1, 14, _
+    %9 = t2SMLABB %0, %0, %8, 14, _
+    %10 = t2UXTH %9, 0, 14, _
+    %11 = t2MUL %10, %7, 14, _
+    %12 = t2MLA %11, %11, %11, 14, _
+    %13, %14 = t2UMULL %12, %12, 14, _
+    %19, %16 = t2UMULL %13, %13, 14, _
+    %17 = t2MLA %13, %14, %16, 14, _
+    %20 = t2MLA %13, %14, %17, 14, _
+    %19, %20 = t2UMLAL %12, %12, %19, %20, 14, _
+    %r0 = COPY %19
+    %r1 = COPY %20
+    tBX_RET 14, _, implicit %r0, implicit %r1
+
+...
diff --git a/test/CodeGen/ARM/misched-int-basic.mir b/test/CodeGen/ARM/misched-int-basic.mir
new file mode 100644
index 0000000000000000000000000000000000000000..f237c0a07b2edb71cf48699b84c3d5a955f06499
--- /dev/null
+++ b/test/CodeGen/ARM/misched-int-basic.mir
@@ -0,0 +1,128 @@
+# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=swift -run-pass  machine-scheduler  -enable-misched -verify-misched \
+# RUN:  -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_SWIFT
+# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-a9 -run-pass  machine-scheduler  -enable-misched -verify-misched \
+# RUN:  -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_A9
+# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass  machine-scheduler  -enable-misched -verify-misched \
+# RUN:  -debug-only=misched 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK_R52
+# REQUIRES: asserts
+--- |
+  ; ModuleID = 'foo.ll'
+  source_filename = "foo.ll"
+  target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "arm---eabi"
+
+  define i64 @foo(i16 signext %a, i16 signext %b)  {
+  entry:
+    %d = mul nsw i16 %a, %a
+    %e = mul nsw i16 %b, %b
+    %f = add nuw nsw i16 %e, %d
+    %c = zext i16 %f to i32
+    %mul8 = mul nsw i32 %c, %c
+    %mul9 = mul nsw i32 %mul8, %mul8
+    %add10 = add nuw nsw i32 %mul9, %mul8
+    %conv1130 = zext i32 %add10 to i64
+    %mul12 = mul nuw nsw i64 %conv1130, %conv1130
+    %mul13 = mul nsw i64 %mul12, %mul12
+    %add14 = add nuw nsw i64 %mul13, %mul12
+    ret i64 %add14
+  }
+
+# CHECK:       ********** MI Scheduling **********
+# CHECK:       SU(2):   %vreg2<def> = SMULBB %vreg1, %vreg1, pred:14, pred:%noreg; GPR:%vreg2,%vreg1,%vreg1
+# CHECK_A9:    Latency    : 2
+# CHECK_SWIFT: Latency    : 4
+# CHECK_R52:   Latency    : 4
+#
+# CHECK:       SU(3):   %vreg3<def> = SMLABB %vreg0, %vreg0, %vreg2, pred:14, pred:%noreg; GPRnopc:%vreg3,%vreg0,%vreg0 GPR:%vreg2
+# CHECK_A9:    Latency    : 2
+# CHECK_SWIFT: Latency    : 4
+# CHECK_R52:   Latency    : 4
+#
+# CHECK:       SU(4):   %vreg4<def> = UXTH %vreg3, 0, pred:14, pred:%noreg; GPRnopc:%vreg4,%vreg3
+# CHECK_A9:    Latency    : 1
+# CHECK_SWIFT: Latency    : 1
+# CHECK_R52:   Latency    : 3
+#
+# CHECK:       SU(5):   %vreg5<def> = MUL %vreg4, %vreg4, pred:14, pred:%noreg, opt:%noreg; GPRnopc:%vreg5,%vreg4,%vreg4
+# CHECK_A9:    Latency    : 2
+# CHECK_SWIFT: Latency    : 4
+# CHECK_R52:   Latency    : 4
+#
+# CHECK:       SU(6):   %vreg6<def> = MLA %vreg5, %vreg5, %vreg5, pred:14, pred:%noreg, opt:%noreg; GPRnopc:%vreg6,%vreg5,%vreg5,%vreg5
+# CHECK_A9:    Latency    : 2
+# CHECK_SWIFT: Latency    : 4
+# CHECK_R52:   Latency    : 4
+#
+# CHECK:       SU(7):   %vreg7<def>, %vreg8<def> = UMULL %vreg6, %vreg6, pred:14, pred:%noreg, opt:%noreg; GPRnopc:%vreg7,%vreg8,%vreg6,%vreg6
+# CHECK_A9:    Latency    : 3
+# CHECK_SWIFT: Latency    : 5
+# CHECK_R52:   Latency    : 4
+#
+# CHECK:       SU(11):   %vreg13<def,tied4>, %vreg14<def,tied5> = UMLAL %vreg6, %vreg6, %vreg13<tied0>, %vreg14<tied1>, pred:14, pred:%noreg, opt:%noreg; GPR:%vreg13 GPRnopc:%vreg14,%vreg6,%vreg6
+# CHECK_SWIFT: Latency    : 7
+# CHECK_A9:    Latency    : 3
+# CHECK_R52:   Latency    : 4
+# CHECK:  ** ScheduleDAGMILive::schedule picking next node
+...
+---
+name:            foo
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gprnopc }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+  - { id: 3, class: gprnopc }
+  - { id: 4, class: gprnopc }
+  - { id: 5, class: gprnopc }
+  - { id: 6, class: gprnopc }
+  - { id: 7, class: gprnopc }
+  - { id: 8, class: gprnopc }
+  - { id: 9, class: gpr }
+  - { id: 10, class: gprnopc }
+  - { id: 11, class: gprnopc }
+  - { id: 12, class: gprnopc }
+  - { id: 13, class: gpr }
+  - { id: 14, class: gprnopc }
+liveins:
+  - { reg: '%r0', virtual-reg: '%0' }
+  - { reg: '%r1', virtual-reg: '%1' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0.entry:
+    liveins: %r0, %r1
+
+    %1 = COPY %r1
+    %0 = COPY %r0
+    %2 = SMULBB %1, %1, 14, _
+    %3 = SMLABB %0, %0, %2, 14, _
+    %4 = UXTH %3, 0, 14, _
+    %5 = MUL %4, %4, 14, _, _
+    %6 = MLA %5, %5, %5, 14, _, _
+    %7, %8 = UMULL %6, %6, 14, _, _
+    %13, %10 = UMULL %7, %7, 14, _, _
+    %11 = MLA %7, %8, %10, 14, _, _
+    %14 = MLA %7, %8, %11, 14, _, _
+    %13, %14 = UMLAL %6, %6, %13, %14, 14, _, _
+    %r0 = COPY %13
+    %r1 = COPY %14
+    BX_RET 14, _, implicit %r0, implicit %r1
+
+...
diff --git a/test/CodeGen/ARM/movt.ll b/test/CodeGen/ARM/movt.ll
index da9b698f2099671256badc86c3f3e0a039044d69..f51582031bd59c4b543a733fda992a2bca6925b5 100644
--- a/test/CodeGen/ARM/movt.ll
+++ b/test/CodeGen/ARM/movt.ll
@@ -2,10 +2,15 @@
 ; rdar://7317664
 
 ; RUN: llc -mtriple=thumbv8m.base %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv8m.base -mcpu=cortex-m23 %s -o - | FileCheck %s --check-prefix=NOMOVT
+; RUN: llc -mtriple=thumbv8m.base -mcpu=cortex-m33 %s -o - | FileCheck %s
 
 define i32 @t(i32 %X) nounwind {
 ; CHECK-LABEL: t:
 ; CHECK: movt r{{[0-9]}}, #65535
+; NOMOVT-LABEL: t:
+; NOMOVT-NOT: movt r{{[0-9]}}, #65535
+; NOMOVT: ldr r{{[0-9]}}, .LCP
 entry:
 	%0 = or i32 %X, -65536
 	ret i32 %0
@@ -14,6 +19,9 @@ entry:
 define i32 @t2(i32 %X) nounwind {
 ; CHECK-LABEL: t2:
 ; CHECK: movt r{{[0-9]}}, #65534
+; NOMOVT-LABEL: t2:
+; NOMOVT-NOT: movt r{{[0-9]}}, #65534
+; NOMOVT: ldr r{{[0-9]}}, .LCP
 entry:
 	%0 = or i32 %X, -131072
 	%1 = and i32 %0, -65537
diff --git a/test/CodeGen/ARM/msr-it-block.ll b/test/CodeGen/ARM/msr-it-block.ll
index 0f9ff6b29d795aa9dcce4321c733273134cbe4a8..8d4ddc3a498535d2eff225a90e38425512937727 100644
--- a/test/CodeGen/ARM/msr-it-block.ll
+++ b/test/CodeGen/ARM/msr-it-block.ll
@@ -20,8 +20,8 @@ write_reg:
 ; V6M: msr     apsr, {{r[0-9]+}}
 ; V7M: msr     apsr_nzcvq, {{r[0-9]+}}
 ; V7M: msr     apsr_nzcvq, {{r[0-9]+}}
-; V7A: msr     APSR_nzcvqg, {{r[0-9]+}}
-; V7A: msr     APSR_nzcvqg, {{r[0-9]+}}
+; V7A: msr     APSR_nzcvq, {{r[0-9]+}}
+; V7A: msr     APSR_nzcvq, {{r[0-9]+}}
   br label %exit
 
 exit:
@@ -41,8 +41,8 @@ write_reg:
 ; V6M: msr     apsr, {{r[0-9]+}}
 ; V7M: msr     apsr_nzcvq, {{r[0-9]+}}
 ; V7M: msr     apsr_nzcvq, {{r[0-9]+}}
-; V7A: msr     APSR_nzcvqg, {{r[0-9]+}}
-; V7A: msr     APSR_nzcvqg, {{r[0-9]+}}
+; V7A: msr     APSR_nzcvq, {{r[0-9]+}}
+; V7A: msr     APSR_nzcvq, {{r[0-9]+}}
   br label %exit
 
 exit:
diff --git a/test/CodeGen/ARM/neon_vabs.ll b/test/CodeGen/ARM/neon_vabs.ll
index d32e7b78879ba6400ea103b10319800225d0caef..109d09582afdc0043842a1fb0c331469e57d2e75 100644
--- a/test/CodeGen/ARM/neon_vabs.ll
+++ b/test/CodeGen/ARM/neon_vabs.ll
@@ -1,8 +1,15 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=arm-eabi -mattr=+neon %s -o - | FileCheck %s
 
 define <4 x i32> @test1(<4 x i32> %a) nounwind {
 ; CHECK-LABEL: test1:
-; CHECK: vabs.s32 q
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vabs.s32 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
         %tmp1neg = sub <4 x i32> zeroinitializer, %a
         %b = icmp sgt <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
         %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
@@ -11,7 +18,13 @@ define <4 x i32> @test1(<4 x i32> %a) nounwind {
 
 define <4 x i32> @test2(<4 x i32> %a) nounwind {
 ; CHECK-LABEL: test2:
-; CHECK: vabs.s32 q
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vabs.s32 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
         %tmp1neg = sub <4 x i32> zeroinitializer, %a
         %b = icmp sge <4 x i32> %a, zeroinitializer
         %abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
@@ -20,7 +33,13 @@ define <4 x i32> @test2(<4 x i32> %a) nounwind {
 
 define <8 x i16> @test3(<8 x i16> %a) nounwind {
 ; CHECK-LABEL: test3:
-; CHECK: vabs.s16 q
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vabs.s16 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
         %tmp1neg = sub <8 x i16> zeroinitializer, %a
         %b = icmp sgt <8 x i16> %a, zeroinitializer
         %abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
@@ -29,7 +48,13 @@ define <8 x i16> @test3(<8 x i16> %a) nounwind {
 
 define <16 x i8> @test4(<16 x i8> %a) nounwind {
 ; CHECK-LABEL: test4:
-; CHECK: vabs.s8 q
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vabs.s8 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
         %tmp1neg = sub <16 x i8> zeroinitializer, %a
         %b = icmp slt <16 x i8> %a, zeroinitializer
         %abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
@@ -38,7 +63,13 @@ define <16 x i8> @test4(<16 x i8> %a) nounwind {
 
 define <4 x i32> @test5(<4 x i32> %a) nounwind {
 ; CHECK-LABEL: test5:
-; CHECK: vabs.s32 q
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vmov d17, r2, r3
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vabs.s32 q8, q8
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
         %tmp1neg = sub <4 x i32> zeroinitializer, %a
         %b = icmp sle <4 x i32> %a, zeroinitializer
         %abs = select <4 x i1> %b, <4 x i32> %tmp1neg, <4 x i32> %a
@@ -47,7 +78,11 @@ define <4 x i32> @test5(<4 x i32> %a) nounwind {
 
 define <2 x i32> @test6(<2 x i32> %a) nounwind {
 ; CHECK-LABEL: test6:
-; CHECK: vabs.s32 d
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vabs.s32 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
         %tmp1neg = sub <2 x i32> zeroinitializer, %a
         %b = icmp sgt <2 x i32> %a, <i32 -1, i32 -1>
         %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
@@ -56,7 +91,11 @@ define <2 x i32> @test6(<2 x i32> %a) nounwind {
 
 define <2 x i32> @test7(<2 x i32> %a) nounwind {
 ; CHECK-LABEL: test7:
-; CHECK: vabs.s32 d
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vabs.s32 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
         %tmp1neg = sub <2 x i32> zeroinitializer, %a
         %b = icmp sge <2 x i32> %a, zeroinitializer
         %abs = select <2 x i1> %b, <2 x i32> %a, <2 x i32> %tmp1neg
@@ -65,7 +104,11 @@ define <2 x i32> @test7(<2 x i32> %a) nounwind {
 
 define <4 x i16> @test8(<4 x i16> %a) nounwind {
 ; CHECK-LABEL: test8:
-; CHECK: vabs.s16 d
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vabs.s16 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
         %tmp1neg = sub <4 x i16> zeroinitializer, %a
         %b = icmp sgt <4 x i16> %a, zeroinitializer
         %abs = select <4 x i1> %b, <4 x i16> %a, <4 x i16> %tmp1neg
@@ -74,7 +117,11 @@ define <4 x i16> @test8(<4 x i16> %a) nounwind {
 
 define <8 x i8> @test9(<8 x i8> %a) nounwind {
 ; CHECK-LABEL: test9:
-; CHECK: vabs.s8 d
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vabs.s8 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
         %tmp1neg = sub <8 x i8> zeroinitializer, %a
         %b = icmp slt <8 x i8> %a, zeroinitializer
         %abs = select <8 x i1> %b, <8 x i8> %tmp1neg, <8 x i8> %a
@@ -83,7 +130,11 @@ define <8 x i8> @test9(<8 x i8> %a) nounwind {
 
 define <2 x i32> @test10(<2 x i32> %a) nounwind {
 ; CHECK-LABEL: test10:
-; CHECK: vabs.s32 d
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vmov d16, r0, r1
+; CHECK-NEXT:    vabs.s32 d16, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    mov pc, lr
         %tmp1neg = sub <2 x i32> zeroinitializer, %a
         %b = icmp sle <2 x i32> %a, zeroinitializer
         %abs = select <2 x i1> %b, <2 x i32> %tmp1neg, <2 x i32> %a
@@ -95,7 +146,13 @@ define <2 x i32> @test10(<2 x i32> %a) nounwind {
 
 define <4 x i32> @test11(<4 x i16> %a, <4 x i16> %b) nounwind {
 ; CHECK-LABEL: test11:
-; CHECK: vabdl.u16 q
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vmov d16, r2, r3
+; CHECK-NEXT:    vmov d17, r0, r1
+; CHECK-NEXT:    vabdl.u16 q8, d17, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
         %zext1 = zext <4 x i16> %a to <4 x i32>
         %zext2 = zext <4 x i16> %b to <4 x i32>
         %diff = sub <4 x i32> %zext1, %zext2
@@ -106,7 +163,13 @@ define <4 x i32> @test11(<4 x i16> %a, <4 x i16> %b) nounwind {
 }
 define <8 x i16> @test12(<8 x i8> %a, <8 x i8> %b) nounwind {
 ; CHECK-LABEL: test12:
-; CHECK: vabdl.u8 q
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vmov d16, r2, r3
+; CHECK-NEXT:    vmov d17, r0, r1
+; CHECK-NEXT:    vabdl.u8 q8, d17, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
         %zext1 = zext <8 x i8> %a to <8 x i16>
         %zext2 = zext <8 x i8> %b to <8 x i16>
         %diff = sub <8 x i16> %zext1, %zext2
@@ -118,7 +181,13 @@ define <8 x i16> @test12(<8 x i8> %a, <8 x i8> %b) nounwind {
 
 define <2 x i64> @test13(<2 x i32> %a, <2 x i32> %b) nounwind {
 ; CHECK-LABEL: test13:
-; CHECK: vabdl.u32 q
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    vmov d16, r2, r3
+; CHECK-NEXT:    vmov d17, r0, r1
+; CHECK-NEXT:    vabdl.u32 q8, d17, d16
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    vmov r2, r3, d17
+; CHECK-NEXT:    mov pc, lr
         %zext1 = zext <2 x i32> %a to <2 x i64>
         %zext2 = zext <2 x i32> %b to <2 x i64>
         %diff = sub <2 x i64> %zext1, %zext2
diff --git a/test/CodeGen/ARM/no-cmov2bfi.ll b/test/CodeGen/ARM/no-cmov2bfi.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c8b512048905452c2fd5727786bb0e6d9c8c14bf
--- /dev/null
+++ b/test/CodeGen/ARM/no-cmov2bfi.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=thumbv7 | FileCheck --check-prefix=CHECK-NOBFI %s
+
+declare zeroext i1 @dummy()
+
+define i8 @test(i8 %a1, i1 %c) {
+; CHECK-NOBFI-NOT: bfi
+; CHECK-NOBFI: bl      dummy
+; CHECK-NOBFI: cmp     r0, #0
+; CHECK-NOBFI: it      ne
+; CHECK-NOBFI: orrne   [[REG:r[0-9]+]], [[REG]], #8
+; CHECK-NOBFI: mov     r0, [[REG]]
+
+  %1 = and i8 %a1, -9
+  %2 = select i1 %c, i8 %1, i8 %a1
+  %3 = tail call zeroext i1 @dummy()
+  %4 = or i8 %2, 8
+  %ret = select i1 %3, i8 %4, i8 %2
+  ret i8 %ret
+}
diff --git a/test/CodeGen/ARM/phi.ll b/test/CodeGen/ARM/phi.ll
index ff85052175c85ad53d87d77da1a49e3c8545782f..568f7572b32e9a65f5c5c01d20a31e5a17571cf8 100644
--- a/test/CodeGen/ARM/phi.ll
+++ b/test/CodeGen/ARM/phi.ll
@@ -1,5 +1,4 @@
 ; RUN: llc -mtriple=arm-eabi -mattr=+v4t %s -o - | FileCheck %s
-; RUN: llc -mtriple=arm-eabi -mattr=+v4t -addr-sink-using-gep=1 %s -o - | FileCheck %s
 
 ; <rdar://problem/8686347>
 
diff --git a/test/CodeGen/ARM/pr32545.ll b/test/CodeGen/ARM/pr32545.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5bfb01b45983b34bf9e1ec016796bd6d37408905
--- /dev/null
+++ b/test/CodeGen/ARM/pr32545.ll
@@ -0,0 +1,22 @@
+; RUN: llc %s -o - | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv7--linux-gnueabi"
+
+; CHECK: vld1.16	{[[DREG:d[0-9]+]][0]}, {{.*}}
+; CHECK: vmovl.u8	[[QREG:q[0-9]+]], [[DREG]]
+; CHECK: vmovl.u16	[[QREG]], [[DREG]]
+
+define void @f(i32 %dstStride, i8* %indvars.iv, <2 x i8>* %zz) {
+entry:
+  br label %for.body
+
+for.body:
+  %tmp = load <2 x i8>, <2 x i8>* %zz, align 1
+  %tmp1 = extractelement <2 x i8> %tmp, i32 0
+  %.lhs.rhs = zext i8 %tmp1 to i32
+  call void @g(i32 %.lhs.rhs)
+  br label %for.body
+}
+
+declare void @g(i32)
diff --git a/test/CodeGen/ARM/prera-ldst-aliasing.mir b/test/CodeGen/ARM/prera-ldst-aliasing.mir
new file mode 100644
index 0000000000000000000000000000000000000000..ce37106ed8d2fd64d70d6b3caca6774906445624
--- /dev/null
+++ b/test/CodeGen/ARM/prera-ldst-aliasing.mir
@@ -0,0 +1,40 @@
+# RUN: llc -run-pass arm-prera-ldst-opt %s -o - | FileCheck %s
+--- |
+  target triple = "thumbv7---eabi"
+
+  define void @ldrd_strd_aa(i32* noalias nocapture %x, i32* noalias nocapture readonly %y) {
+  entry:
+    %0 = load i32, i32* %y, align 4
+    store i32 %0, i32* %x, align 4
+    %arrayidx2 = getelementptr inbounds i32, i32* %y, i32 1
+    %1 = load i32, i32* %arrayidx2, align 4
+    %arrayidx3 = getelementptr inbounds i32, i32* %x, i32 1
+    store i32 %1, i32* %arrayidx3, align 4
+    ret void
+  }
+...
+---
+name:            ldrd_strd_aa
+alignment:       1
+tracksRegLiveness: true
+liveins:
+  - { reg: '%r0', virtual-reg: '%0' }
+  - { reg: '%r1', virtual-reg: '%1' }
+body:             |
+  bb.0.entry:
+    liveins: %r0, %r1
+
+    %1 : gpr = COPY %r1
+    %0 : gpr = COPY %r0
+    %2 : gpr = t2LDRi12 %1, 0, 14, _ :: (load 4 from %ir.y)
+    t2STRi12 killed %2, %0, 0, 14, _ :: (store 4 into %ir.x)
+    %3 : gpr = t2LDRi12 %1, 4, 14, _ :: (load 4 from %ir.arrayidx2)
+    t2STRi12 killed %3, %0, 4, 14, _ :: (store 4 into %ir.arrayidx3)
+    ; CHECK: t2LDRi12
+    ; CHECK-NEXT: t2LDRi12
+    ; CHECK-NEXT: t2STRi12
+    ; CHECK-NEXT: t2STRi12
+    tBX_RET 14, _
+
+...
+
diff --git a/test/CodeGen/ARM/prera-ldst-insertpt.mir b/test/CodeGen/ARM/prera-ldst-insertpt.mir
new file mode 100644
index 0000000000000000000000000000000000000000..eafcc7c36d334fe097ec3988aefd50a8169c4db0
--- /dev/null
+++ b/test/CodeGen/ARM/prera-ldst-insertpt.mir
@@ -0,0 +1,105 @@
+# RUN: llc -run-pass arm-prera-ldst-opt %s -o - | FileCheck %s
+--- |
+  target triple = "thumbv7---eabi"
+
+  define void @a(i32* nocapture %x, i32 %y, i32 %z) {
+  entry:
+    ret void
+  }
+
+  define void @b(i32* nocapture %x, i32 %y, i32 %z) {
+  entry:
+    ret void
+  }
+...
+---
+# CHECK-LABEL: name: a
+name:            a
+alignment:       1
+tracksRegLiveness: true
+liveins:
+  - { reg: '%r0', virtual-reg: '%0' }
+  - { reg: '%r1', virtual-reg: '%1' }
+  - { reg: '%r2', virtual-reg: '%2' }
+body:             |
+  bb.0.entry:
+    liveins: %r0, %r1, %r2
+
+    %2 : rgpr = COPY %r2
+    %1 : rgpr = COPY %r1
+    %0 : gpr = COPY %r0
+    %3 : rgpr = t2MUL %2, %2, 14, _
+    %4 : rgpr = t2MUL %1, %1, 14, _
+    %5 : rgpr = t2MOVi32imm -858993459
+    %6 : rgpr, %7 : rgpr  = t2UMULL killed %3, %5, 14, _
+    %8 : rgpr, %9 : rgpr  = t2UMULL killed %4, %5, 14, _
+    t2STRi12 %1, %0, 0, 14, _ :: (store 4)
+    %10 : rgpr = t2LSLri %2, 1, 14, _, _
+    t2STRi12 killed %10, %0, 4, 14, _ :: (store 4)
+
+    ; Make sure we move the paired stores next to each other, and
+    ; insert them in an appropriate location.
+    ; CHECK: t2STRi12 %1,
+    ; CHECK-NEXT: t2STRi12 killed %10,
+    ; CHECK-NEXT: t2MOVi
+    ; CHECK-NEXT: t2ADDrs
+
+    %11 : rgpr = t2MOVi 55, 14, _, _
+    %12 : gprnopc = t2ADDrs %11, killed %7, 19, 14, _, _
+    t2STRi12 killed %12, %0, 16, 14, _ :: (store 4)
+    %13 : gprnopc = t2ADDrs %11, killed %9, 19, 14, _, _
+    t2STRi12 killed %13, %0, 20, 14, _ :: (store 4)
+
+    ; Make sure we move the paired stores next to each other.
+    ; CHECK: t2STRi12 killed %12,
+    ; CHECK-NEXT: t2STRi12 killed %13,
+
+    tBX_RET 14, _
+---
+# CHECK-LABEL: name: b
+name:            b
+alignment:       1
+tracksRegLiveness: true
+liveins:
+  - { reg: '%r0', virtual-reg: '%0' }
+  - { reg: '%r1', virtual-reg: '%1' }
+  - { reg: '%r2', virtual-reg: '%2' }
+body:             |
+  bb.0.entry:
+    liveins: %r0, %r1, %r2
+
+    %2 : rgpr = COPY %r2
+    %1 : rgpr = COPY %r1
+    %0 : gpr = COPY %r0
+    t2STRi12 %1, %0, 0, 14, _ :: (store 4)
+    %10 : rgpr = t2LSLri %2, 1, 14, _, _
+    t2STRi12 killed %10, %0, 4, 14, _ :: (store 4)
+    %3 : rgpr = t2MUL %2, %2, 14, _
+    t2STRi12 %3, %0, 8, 14, _ :: (store 4)
+
+    ; Make sure we move the paired stores next to each other, and
+    ; insert them in an appropriate location.
+    ; CHECK: t2STRi12 {{.*}}, 0
+    ; CHECK-NEXT: t2STRi12 {{.*}}, 4
+    ; CHECK-NEXT: t2STRi12 {{.*}}, 8
+    ; CHECK-NEXT: t2MUL
+    ; CHECK-NEXT: t2MOVi32imm
+
+    %4 : rgpr = t2MUL %1, %1, 14, _
+    %5 : rgpr = t2MOVi32imm -858993459
+    %6 : rgpr, %7 : rgpr  = t2UMULL killed %3, %5, 14, _
+    %8 : rgpr, %9 : rgpr  = t2UMULL killed %4, %5, 14, _
+    %10 : rgpr = t2LSLri %2, 1, 14, _, _
+    %11 : rgpr = t2MOVi 55, 14, _, _
+    %12 : gprnopc = t2ADDrs %11, killed %7, 19, 14, _, _
+    t2STRi12 killed %12, %0, 16, 14, _ :: (store 4)
+    %13 : gprnopc = t2ADDrs %11, killed %9, 19, 14, _, _
+    t2STRi12 killed %13, %0, 20, 14, _ :: (store 4)
+
+    ; Make sure we move the paired stores next to each other.
+    ; CHECK: t2STRi12 {{.*}}, 16
+    ; CHECK-NEXT: t2STRi12 {{.*}}, 20
+
+    tBX_RET 14, _
+
+...
diff --git a/test/CodeGen/ARM/rev.ll b/test/CodeGen/ARM/rev.ll
index f95f97105b9fc1b368b719c45626f878ead43f92..a36526ff1fb030f1bb8887e4b36653c5e1408eab 100644
--- a/test/CodeGen/ARM/rev.ll
+++ b/test/CodeGen/ARM/rev.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=arm-eabi -mattr=+v6 %s -o - | FileCheck %s
 
 define i32 @test1(i32 %X) nounwind {
-; CHECK: test1
+; CHECK-LABEL: test1
 ; CHECK: rev16 r0, r0
         %tmp1 = lshr i32 %X, 8
         %X15 = bitcast i32 %X to i32
@@ -17,7 +17,7 @@ define i32 @test1(i32 %X) nounwind {
 }
 
 define i32 @test2(i32 %X) nounwind {
-; CHECK: test2
+; CHECK-LABEL: test2
 ; CHECK: revsh r0, r0
         %tmp1 = lshr i32 %X, 8
         %tmp1.upgrd.1 = trunc i32 %tmp1 to i16
@@ -58,7 +58,7 @@ entry:
 ; rdar://9609059
 define i32 @test5(i32 %i) nounwind readnone {
 entry:
-; CHECK: test5
+; CHECK-LABEL: test5
 ; CHECK: revsh r0, r0
   %shl = shl i32 %i, 24
   %shr = ashr exact i32 %shl, 16
@@ -71,7 +71,7 @@ entry:
 ; rdar://9609108
 define i32 @test6(i32 %x) nounwind readnone {
 entry:
-; CHECK: test6
+; CHECK-LABEL: test6
 ; CHECK: rev16 r0, r0
   %and = shl i32 %x, 8
   %shl = and i32 %and, 65280
@@ -88,7 +88,7 @@ entry:
 ; rdar://9164521
 define i32 @test7(i32 %a) nounwind readnone {
 entry:
-; CHECK: test7
+; CHECK-LABEL: test7
 ; CHECK: rev r0, r0
 ; CHECK: lsr r0, r0, #16
   %and = lshr i32 %a, 8
@@ -101,7 +101,7 @@ entry:
 
 define i32 @test8(i32 %a) nounwind readnone {
 entry:
-; CHECK: test8
+; CHECK-LABEL: test8
 ; CHECK: revsh r0, r0
   %and = lshr i32 %a, 8
   %shr4 = and i32 %and, 255
@@ -115,7 +115,7 @@ entry:
 ; rdar://10750814
 define zeroext i16 @test9(i16 zeroext %v) nounwind readnone {
 entry:
-; CHECK: test9
+; CHECK-LABEL: test9
 ; CHECK: rev16 r0, r0
   %conv = zext i16 %v to i32
   %shr4 = lshr i32 %conv, 8
diff --git a/test/CodeGen/ARM/select_const.ll b/test/CodeGen/ARM/select_const.ll
new file mode 100644
index 0000000000000000000000000000000000000000..48fe572bf8a7292118d7e687e6e2649a0d8bcf92
--- /dev/null
+++ b/test/CodeGen/ARM/select_const.ll
@@ -0,0 +1,326 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=arm-eabi-unknown-unknown | FileCheck %s
+
+; Select of constants: control flow / conditional moves can always be replaced by logic+math (but may not be worth it?).
+; Test the zeroext/signext variants of each pattern to see if that makes a difference.
+
+; select Cond, 0, 1 --> zext (!Cond)
+
+define i32 @select_0_or_1(i1 %cond) {
+; CHECK-LABEL: select_0_or_1:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    mov r1, #1
+; CHECK-NEXT:    bic r0, r1, r0
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 0, i32 1
+  ret i32 %sel
+}
+
+define i32 @select_0_or_1_zeroext(i1 zeroext %cond) {
+; CHECK-LABEL: select_0_or_1_zeroext:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    eor r0, r0, #1
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 0, i32 1
+  ret i32 %sel
+}
+
+define i32 @select_0_or_1_signext(i1 signext %cond) {
+; CHECK-LABEL: select_0_or_1_signext:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    mov r1, #1
+; CHECK-NEXT:    bic r0, r1, r0
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 0, i32 1
+  ret i32 %sel
+}
+
+; select Cond, 1, 0 --> zext (Cond)
+
+define i32 @select_1_or_0(i1 %cond) {
+; CHECK-LABEL: select_1_or_0:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 1, i32 0
+  ret i32 %sel
+}
+
+define i32 @select_1_or_0_zeroext(i1 zeroext %cond) {
+; CHECK-LABEL: select_1_or_0_zeroext:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 1, i32 0
+  ret i32 %sel
+}
+
+define i32 @select_1_or_0_signext(i1 signext %cond) {
+; CHECK-LABEL: select_1_or_0_signext:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 1, i32 0
+  ret i32 %sel
+}
+
+; select Cond, 0, -1 --> sext (!Cond)
+
+define i32 @select_0_or_neg1(i1 %cond) {
+; CHECK-LABEL: select_0_or_neg1:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    mov r1, #1
+; CHECK-NEXT:    bic r0, r1, r0
+; CHECK-NEXT:    rsb r0, r0, #0
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 0, i32 -1
+  ret i32 %sel
+}
+
+define i32 @select_0_or_neg1_zeroext(i1 zeroext %cond) {
+; CHECK-LABEL: select_0_or_neg1_zeroext:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    eor r0, r0, #1
+; CHECK-NEXT:    rsb r0, r0, #0
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 0, i32 -1
+  ret i32 %sel
+}
+
+define i32 @select_0_or_neg1_signext(i1 signext %cond) {
+; CHECK-LABEL: select_0_or_neg1_signext:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    mvn r0, r0
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 0, i32 -1
+  ret i32 %sel
+}
+
+define i32 @select_0_or_neg1_alt(i1 %cond) {
+; CHECK-LABEL: select_0_or_neg1_alt:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    sub r0, r0, #1
+; CHECK-NEXT:    mov pc, lr
+  %z = zext i1 %cond to i32
+  %add = add i32 %z, -1
+  ret i32 %add
+}
+
+define i32 @select_0_or_neg1_alt_zeroext(i1 zeroext %cond) {
+; CHECK-LABEL: select_0_or_neg1_alt_zeroext:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    sub r0, r0, #1
+; CHECK-NEXT:    mov pc, lr
+  %z = zext i1 %cond to i32
+  %add = add i32 %z, -1
+  ret i32 %add
+}
+
+define i32 @select_0_or_neg1_alt_signext(i1 signext %cond) {
+; CHECK-LABEL: select_0_or_neg1_alt_signext:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    mvn r0, r0
+; CHECK-NEXT:    mov pc, lr
+  %z = zext i1 %cond to i32
+  %add = add i32 %z, -1
+  ret i32 %add
+}
+
+; select Cond, -1, 0 --> sext (Cond)
+
+define i32 @select_neg1_or_0(i1 %cond) {
+; CHECK-LABEL: select_neg1_or_0:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    and r0, r0, #1
+; CHECK-NEXT:    rsb r0, r0, #0
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 -1, i32 0
+  ret i32 %sel
+}
+
+define i32 @select_neg1_or_0_zeroext(i1 zeroext %cond) {
+; CHECK-LABEL: select_neg1_or_0_zeroext:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    rsb r0, r0, #0
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 -1, i32 0
+  ret i32 %sel
+}
+
+define i32 @select_neg1_or_0_signext(i1 signext %cond) {
+; CHECK-LABEL: select_neg1_or_0_signext:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 -1, i32 0
+  ret i32 %sel
+}
+
+; select Cond, C+1, C --> add (zext Cond), C
+
+define i32 @select_Cplus1_C(i1 %cond) {
+; CHECK-LABEL: select_Cplus1_C:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    mov r1, #41
+; CHECK-NEXT:    tst r0, #1
+; CHECK-NEXT:    movne r1, #42
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 42, i32 41
+  ret i32 %sel
+}
+
+define i32 @select_Cplus1_C_zeroext(i1 zeroext %cond) {
+; CHECK-LABEL: select_Cplus1_C_zeroext:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    mov r1, #41
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    movne r1, #42
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 42, i32 41
+  ret i32 %sel
+}
+
+define i32 @select_Cplus1_C_signext(i1 signext %cond) {
+; CHECK-LABEL: select_Cplus1_C_signext:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    mov r1, #41
+; CHECK-NEXT:    tst r0, #1
+; CHECK-NEXT:    movne r1, #42
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 42, i32 41
+  ret i32 %sel
+}
+
+; select Cond, C, C+1 --> add (sext Cond), C+1
+
+define i32 @select_C_Cplus1(i1 %cond) {
+; CHECK-LABEL: select_C_Cplus1:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    mov r1, #42
+; CHECK-NEXT:    tst r0, #1
+; CHECK-NEXT:    movne r1, #41
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 41, i32 42
+  ret i32 %sel
+}
+
+define i32 @select_C_Cplus1_zeroext(i1 zeroext %cond) {
+; CHECK-LABEL: select_C_Cplus1_zeroext:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    mov r1, #42
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    movne r1, #41
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 41, i32 42
+  ret i32 %sel
+}
+
+define i32 @select_C_Cplus1_signext(i1 signext %cond) {
+; CHECK-LABEL: select_C_Cplus1_signext:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    mov r1, #42
+; CHECK-NEXT:    tst r0, #1
+; CHECK-NEXT:    movne r1, #41
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 41, i32 42
+  ret i32 %sel
+}
+
+; In general, select of 2 constants could be:
+; select Cond, C1, C2 --> add (mul (zext Cond), C1-C2), C2 --> add (and (sext Cond), C1-C2), C2
+
+define i32 @select_C1_C2(i1 %cond) {
+; CHECK-LABEL: select_C1_C2:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    mov r1, #165
+; CHECK-NEXT:    tst r0, #1
+; CHECK-NEXT:    orr r1, r1, #256
+; CHECK-NEXT:    moveq r1, #42
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 421, i32 42
+  ret i32 %sel
+}
+
+define i32 @select_C1_C2_zeroext(i1 zeroext %cond) {
+; CHECK-LABEL: select_C1_C2_zeroext:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    mov r1, #165
+; CHECK-NEXT:    cmp r0, #0
+; CHECK-NEXT:    orr r1, r1, #256
+; CHECK-NEXT:    moveq r1, #42
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 421, i32 42
+  ret i32 %sel
+}
+
+define i32 @select_C1_C2_signext(i1 signext %cond) {
+; CHECK-LABEL: select_C1_C2_signext:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    mov r1, #165
+; CHECK-NEXT:    tst r0, #1
+; CHECK-NEXT:    orr r1, r1, #256
+; CHECK-NEXT:    moveq r1, #42
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i32 421, i32 42
+  ret i32 %sel
+}
+
+; 4295032833 = 0x100010001.
+; This becomes an opaque constant via ConstantHoisting, so we don't fold it into the select.
+
+define i64 @opaque_constant1(i1 %cond, i64 %x) {
+; CHECK-LABEL: opaque_constant1:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    ands r12, r0, #1
+; CHECK-NEXT:    mov lr, #1
+; CHECK-NEXT:    mov r0, #23
+; CHECK-NEXT:    eor r3, r3, #1
+; CHECK-NEXT:    orr lr, lr, #65536
+; CHECK-NEXT:    mvnne r0, #3
+; CHECK-NEXT:    movne r12, #1
+; CHECK-NEXT:    and r4, r0, lr
+; CHECK-NEXT:    eor r2, r2, lr
+; CHECK-NEXT:    subs r0, r4, #1
+; CHECK-NEXT:    sbc r1, r12, #0
+; CHECK-NEXT:    orrs r2, r2, r3
+; CHECK-NEXT:    movne r0, r4
+; CHECK-NEXT:    movne r1, r12
+; CHECK-NEXT:    pop {r4, lr}
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i64 -4, i64 23
+  %bo = and i64 %sel, 4295032833  ; 0x100010001
+  %cmp = icmp eq i64 %x, 4295032833
+  %sext = sext i1 %cmp to i64
+  %add = add i64 %bo, %sext
+  ret i64 %add
+}
+
+; 65537 == 0x10001.
+; This becomes an opaque constant via ConstantHoisting, so we don't fold it into the select.
+
+define i64 @opaque_constant2(i1 %cond, i64 %x) {
+; CHECK-LABEL: opaque_constant2:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    mov r1, #1
+; CHECK-NEXT:    tst r0, #1
+; CHECK-NEXT:    orr r1, r1, #65536
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    moveq r0, #23
+; CHECK-NEXT:    and r0, r0, r1
+; CHECK-NEXT:    mov r1, #0
+; CHECK-NEXT:    mov pc, lr
+  %sel = select i1 %cond, i64 65537, i64 23
+  %bo = and i64 %sel, 65537
+  ret i64 %bo
+}
+
diff --git a/test/CodeGen/ARM/select_xform.ll b/test/CodeGen/ARM/select_xform.ll
index 8c1502e14655091b8f668a32f72a558454cffa12..09e8ed4bc096a94a55bef1f6fdfaf800a609936b 100644
--- a/test/CodeGen/ARM/select_xform.ll
+++ b/test/CodeGen/ARM/select_xform.ll
@@ -223,21 +223,19 @@ entry:
   ret i32 %add
 }
 
-; Do not fold the xor into the select
+; Fold the xor into the select.
 define i32 @t15(i32 %p) {
 entry:
 ; ARM-LABEL: t15:
-; ARM: mov     [[REG:r[0-9]+]], #2
+; ARM: mov     [[REG:r[0-9]+]], #3
 ; ARM: cmp     r0, #8
-; ARM: movwgt  [[REG:r[0-9]+]], #1
-; ARM: eor     r0, [[REG:r[0-9]+]], #1
+; ARM: movwgt  [[REG:r[0-9]+]], #0
 
 ; T2-LABEL: t15:
-; T2: movs    [[REG:r[0-9]+]], #2
+; T2: movs    [[REG:r[0-9]+]], #3
 ; T2: cmp     [[REG:r[0-9]+]], #8
 ; T2: it      gt
-; T2: movgt   [[REG:r[0-9]+]], #1
-; T2: eor     r0, [[REG:r[0-9]+]], #1
+; T2: movgt   [[REG:r[0-9]+]], #0
   %cmp = icmp sgt i32 %p, 8
   %a = select i1 %cmp, i32 1, i32 2
   %xor = xor i32 %a, 1
diff --git a/test/CodeGen/ARM/setcc-logic.ll b/test/CodeGen/ARM/setcc-logic.ll
new file mode 100644
index 0000000000000000000000000000000000000000..79bae1facb3e55c907563f7e1ee7b1ec003348ca
--- /dev/null
+++ b/test/CodeGen/ARM/setcc-logic.ll
@@ -0,0 +1,74 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=arm-eabi -mcpu=cortex-a8 | FileCheck %s
+
+define zeroext i1 @ne_neg1_and_ne_zero(i32 %x) nounwind {
+; CHECK-LABEL: ne_neg1_and_ne_zero:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    add r1, r0, #1
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    cmp r1, #1
+; CHECK-NEXT:    movwhi r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp1 = icmp ne i32 %x, -1
+  %cmp2 = icmp ne i32 %x, 0
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+; PR32401 - https://bugs.llvm.org/show_bug.cgi?id=32401
+
+define zeroext i1 @and_eq(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
+; CHECK-LABEL: and_eq:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    eor r2, r2, r3
+; CHECK-NEXT:    eor r0, r0, r1
+; CHECK-NEXT:    orrs r0, r0, r2
+; CHECK-NEXT:    mov r0, #0
+; CHECK-NEXT:    movweq r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp1 = icmp eq i32 %a, %b
+  %cmp2 = icmp eq i32 %c, %d
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define zeroext i1 @or_ne(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
+; CHECK-LABEL: or_ne:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    eor r2, r2, r3
+; CHECK-NEXT:    eor r0, r0, r1
+; CHECK-NEXT:    orrs r0, r0, r2
+; CHECK-NEXT:    movwne r0, #1
+; CHECK-NEXT:    bx lr
+  %cmp1 = icmp ne i32 %a, %b
+  %cmp2 = icmp ne i32 %c, %d
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+define <4 x i1> @and_eq_vec(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) nounwind {
+; CHECK-LABEL: and_eq_vec:
+; CHECK:       @ BB#0:
+; CHECK-NEXT:    .save {r11, lr}
+; CHECK-NEXT:    push {r11, lr}
+; CHECK-NEXT:    vmov d19, r2, r3
+; CHECK-NEXT:    add r12, sp, #40
+; CHECK-NEXT:    add lr, sp, #8
+; CHECK-NEXT:    vmov d18, r0, r1
+; CHECK-NEXT:    vld1.64 {d16, d17}, [lr]
+; CHECK-NEXT:    add r0, sp, #24
+; CHECK-NEXT:    vld1.64 {d20, d21}, [r12]
+; CHECK-NEXT:    vceq.i32 q8, q9, q8
+; CHECK-NEXT:    vld1.64 {d22, d23}, [r0]
+; CHECK-NEXT:    vceq.i32 q9, q11, q10
+; CHECK-NEXT:    vmovn.i32 d16, q8
+; CHECK-NEXT:    vmovn.i32 d17, q9
+; CHECK-NEXT:    vand d16, d16, d17
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    pop {r11, pc}
+  %cmp1 = icmp eq <4 x i32> %a, %b
+  %cmp2 = icmp eq <4 x i32> %c, %d
+  %and = and <4 x i1> %cmp1, %cmp2
+  ret <4 x i1> %and
+}
+
diff --git a/test/CodeGen/ARM/setcc-sentinals.ll b/test/CodeGen/ARM/setcc-sentinals.ll
deleted file mode 100644
index dc45e0e13881d69dcaea2e15d7144617ddf4ca0c..0000000000000000000000000000000000000000
--- a/test/CodeGen/ARM/setcc-sentinals.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc -mtriple=arm-eabi -mcpu=cortex-a8 -asm-verbose=false %s -o - | FileCheck %s
-
-define zeroext i1 @test0(i32 %x) nounwind {
-; CHECK-LABEL: test0:
-; CHECK: add [[REG:(r[0-9]+)|(lr)]], r0, #1
-; CHECK-NEXT: mov r0, #0
-; CHECK-NEXT: cmp [[REG]], #1
-; CHECK-NEXT: movwhi r0, #1
-; CHECK-NEXT: bx  lr
-  %cmp1 = icmp ne i32 %x, -1
-  %not.cmp = icmp ne i32 %x, 0
-  %.cmp1 = and i1 %cmp1, %not.cmp
-  ret i1 %.cmp1
-}
diff --git a/test/CodeGen/ARM/single-issue-r52.mir b/test/CodeGen/ARM/single-issue-r52.mir
new file mode 100644
index 0000000000000000000000000000000000000000..6c95f7603e6e0007c0e449bf384673e419d572d6
--- /dev/null
+++ b/test/CodeGen/ARM/single-issue-r52.mir
@@ -0,0 +1,86 @@
+# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass  machine-scheduler  -enable-misched -debug-only=misched -misched-topdown 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=TOPDOWN
+# RUN: llc -o /dev/null %s -mtriple=arm-eabi -mcpu=cortex-r52 -run-pass  machine-scheduler  -enable-misched -debug-only=misched -misched-bottomup 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=BOTTOMUP
+# REQUIRES: asserts
+--- |
+  ; ModuleID = 'foo.ll'
+  source_filename = "foo.ll"
+  target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "arm---eabi"
+
+  %struct.__neon_int8x8x4_t = type { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> }
+  ; Function Attrs: nounwind
+  define <8 x i8> @foo(i8* %A) {
+    %tmp1 = call %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8.p0i8(i8* %A, i32 8)
+    %tmp2 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 0
+    %tmp3 = extractvalue %struct.__neon_int8x8x4_t %tmp1, 1
+    %tmp4 = add <8 x i8> %tmp2, %tmp3
+    ret <8 x i8> %tmp4
+  }
+  declare %struct.__neon_int8x8x4_t @llvm.arm.neon.vld4.v8i8.p0i8(i8*, i32)
+
+# CHECK: ********** MI Scheduling **********
+# CHECK: ScheduleDAGMILive::schedule starting
+# CHECK: SU(1):   %vreg1<def> = VLD4d8Pseudo %vreg0, 8, pred:14, pred:%noreg; mem:LD32[%A](align=8) QQPR:%vreg1 GPR:%vreg0
+# CHECK: Latency            : 8
+# CHECK: Single Issue       : true;
+# CHECK: SU(2):   %vreg4<def> = VADDv8i8 %vreg1:dsub_0, %vreg1:dsub_1, pred:14, pred:%noreg; DPR:%vreg4 QQPR:%vreg1
+# CHECK: Latency            : 5
+# CHECK: Single Issue       : false;
+# CHECK: SU(3):   %vreg5<def>, %vreg6<def> = VMOVRRD %vreg4, pred:14, pred:%noreg; GPR:%vreg5,%vreg6 DPR:%vreg4
+# CHECK: Latency            : 4
+# CHECK: Single Issue       : false;
+
+# TOPDOWN: Scheduling SU(1) %vreg1<def> = VLD4d8Pseudo
+# TOPDOWN: Bump cycle to end group
+# TOPDOWN: Scheduling SU(2) %vreg4<def> = VADDv8i8
+
+# BOTTOMUP: Scheduling SU(2) %vreg4<def> = VADDv8i8
+# BOTTOMUP: Scheduling SU(1) %vreg1<def> = VLD4d8Pseudo
+# BOTTOMUP: Bump cycle to begin group
+
+...
+---
+name:            foo
+alignment:       2
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: qqpr }
+  - { id: 2, class: dpr }
+  - { id: 3, class: dpr }
+  - { id: 4, class: dpr }
+  - { id: 5, class: gpr }
+  - { id: 6, class: gpr }
+liveins:
+  - { reg: '%r0', virtual-reg: '%0' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: %r0
+
+    %0 = COPY %r0
+    %1 = VLD4d8Pseudo %0, 8, 14, _ :: (load 32 from %ir.A, align 8)
+    %4 = VADDv8i8 %1.dsub_0, %1.dsub_1, 14, _
+    %5, %6 = VMOVRRD %4, 14, _
+    %r0 = COPY %5
+    %r1 = COPY %6
+    BX_RET 14, _, implicit %r0, implicit killed %r1
+
+...
diff --git a/test/CodeGen/ARM/sjljeh-swifterror.ll b/test/CodeGen/ARM/sjljeh-swifterror.ll
new file mode 100644
index 0000000000000000000000000000000000000000..aae0e75c98afb086a92a1cb488b25dd81be6061e
--- /dev/null
+++ b/test/CodeGen/ARM/sjljeh-swifterror.ll
@@ -0,0 +1,27 @@
+; RUN: opt -sjljehprepare -verify < %s | FileCheck %s
+target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+target triple = "armv7s-apple-ios7.0"
+
+%swift.error = type opaque
+
+declare void @objc_msgSend() local_unnamed_addr
+
+declare i32 @__objc_personality_v0(...)
+
+; Make sure we don't leave a select on a swifterror argument.
+; CHECK-LABEL: @test
+; CHECK-NOT: select true, %0
+define swiftcc void @test(%swift.error** swifterror) local_unnamed_addr personality i32 (...)* @__objc_personality_v0 {
+entry:
+  %call28.i = invoke i32 bitcast (void ()* @objc_msgSend to i32 (i8*, i8*)*)(i8* undef, i8* undef)
+          to label %invoke.cont.i unwind label %lpad.i
+
+invoke.cont.i:
+  unreachable
+
+lpad.i:
+  %1 = landingpad { i8*, i32 }
+          cleanup
+  resume { i8*, i32 } undef
+}
+
diff --git a/test/CodeGen/ARM/smml.ll b/test/CodeGen/ARM/smml.ll
index aa093192f2b22df3420ccf6bd580013a4534ca23..4788644cf195880908eb3d9a2ae9030a0124269f 100644
--- a/test/CodeGen/ARM/smml.ll
+++ b/test/CodeGen/ARM/smml.ll
@@ -1,20 +1,15 @@
-; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s
-; RUN: llc -mtriple=armv6-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V6
-; RUN: llc -mtriple=armv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK-V7
-; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s -check-prefix=CHECK-THUMB
-; RUN: llc -mtriple=thumbv6-eabi %s -o - | FileCheck %s -check-prefix=CHECK-THUMB
-; RUN: llc -mtriple=thumbv6t2-eabi %s -o - | FileCheck %s -check-prefix=CHECK-THUMBV6T2
-; RUN: llc -mtriple=thumbv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK-THUMBV7
+; RUN: llc -mtriple=arm-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V4
+; RUN: llc -mtriple=armv6-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V6
+; RUN: llc -mtriple=armv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-V6
+; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-THUMB
+; RUN: llc -mtriple=thumbv6-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-THUMBV6
+; RUN: llc -mtriple=thumbv6t2-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-THUMBV6T2
+; RUN: llc -mtriple=thumbv7-eabi %s -o - | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-THUMBV6T2
 
 define i32 @Test0(i32 %a, i32 %b, i32 %c) nounwind readnone ssp {
 entry:
 ; CHECK-LABEL: Test0
 ; CHECK-NOT: smmls
-; CHECK-V6-NOT: smmls
-; CHECK-V7-NOT: smmls
-; CHECK_THUMB-NOT: smmls
-; CHECK-THUMBV6T2-NOT: smmls
-; CHECK-THUMBV7-NOT: smmls
   %conv4 = zext i32 %a to i64
   %conv1 = sext i32 %b to i64
   %conv2 = sext i32 %c to i64
@@ -27,12 +22,11 @@ entry:
 
 define i32 @Test1(i32 %a, i32 %b, i32 %c) {
 ;CHECK-LABEL: Test1
-;CHECK-NOT: smmls
+;CHECK-V4-NOT: smmls
 ;CHECK-THUMB-NOT: smmls
+;CHECK-THUMBV6-NOT: smmls
 ;CHECK-V6: smmls r0, [[Rn:r[1-2]]], [[Rm:r[1-2]]], r0
-;CHECK-V7: smmls r0, [[Rn:r[1-2]]], [[Rm:r[1-2]]], r0
 ;CHECK-THUMBV6T2: smmls r0, [[Rn:r[1-2]]], [[Rm:r[1-2]]], r0
-;CHECK-THUMBV7: smmls r0, [[Rn:r[1-2]]], [[Rm:r[1-2]]], r0
 entry:
   %conv = sext i32 %b to i64
   %conv1 = sext i32 %c to i64
@@ -47,10 +41,21 @@ entry:
 
 declare void @opaque(i32)
 define void @test_used_flags(i32 %in1, i32 %in2) {
-; CHECK-V7-LABEL: test_used_flags:
-; CHECK-V7: smull [[PROD_LO:r[0-9]+]], [[PROD_HI:r[0-9]+]], r0, r1
-; CHECK-V7: rsbs {{.*}}, [[PROD_LO]], #0
-; CHECK-V7: rscs {{.*}}, [[PROD_HI]], #0
+; CHECK-LABEL: test_used_flags:
+; CHECK-THUMB: cmp  r1, #0
+; CHECK-THUMB: push {r2}
+; CHECK-THUMB: pop  {r3}
+; CHECK-THUMB: ble
+; CHECK-THUMBV6: cmp r1, #0
+; CHECK-THUMBV6: mov r3, r2
+; CHECK-THUMBV6: ble
+; CHECK-V6: smull [[PROD_LO:r[0-9]+]], [[PROD_HI:r[0-9]+]], r0, r1
+; CHECK-V6: rsbs {{.*}}, [[PROD_LO]], #0
+; CHECK-V6: rscs {{.*}}, [[PROD_HI]], #0
+; CHECK-THUMBV6T2: smull [[PROD_LO:r[0-9]+]], [[PROD_HI:r[0-9]+]], r0, r1
+; CHECK-THUMBV6T2: movs	[[ZERO:r[0-9]+]], #0
+; CHECK-THUMBV6T2: rsbs	{{.*}}, [[PROD_LO]], #0
+; CHECK-THUMBV6T2: sbcs.w {{.*}}, [[ZERO]], [[PROD_HI]]
   %in1.64 = sext i32 %in1 to i64
   %in2.64 = sext i32 %in2 to i64
   %mul = mul nsw i64 %in1.64, %in2.64
diff --git a/test/CodeGen/ARM/smul.ll b/test/CodeGen/ARM/smul.ll
index 3c187aa846d54c08f77b96d9a0b4ab17b61f17c9..2b7be41ddb24e6df0ce7ebd3e40258fc456f731d 100644
--- a/test/CodeGen/ARM/smul.ll
+++ b/test/CodeGen/ARM/smul.ll
@@ -262,3 +262,32 @@ define i32 @f21(i32 %a, i32 %x, i16 %y) {
         %tmp5 = add i32 %a, %tmp4
         ret i32 %tmp5
 }
+
+@global_b = external global i16, align 2
+
+define i32 @f22(i32 %a) {
+; CHECK-LABEL: f22:
+; CHECK: smulwb r0, r0, r1
+; CHECK-THUMBV6-NOT: smulwb
+        %b = load i16, i16* @global_b, align 2
+        %sext = sext i16 %b to i64
+        %conv = sext i32 %a to i64
+        %mul = mul nsw i64 %sext, %conv
+        %shr37 = lshr i64 %mul, 16
+        %conv4 = trunc i64 %shr37 to i32
+        ret i32 %conv4
+}
+
+define i32 @f23(i32 %a, i32 %c) {
+; CHECK-LABEL: f23:
+; CHECK: smlawb r0, r0, r2, r1
+; CHECK-THUMBV6-NOT: smlawb
+        %b = load i16, i16* @global_b, align 2
+        %sext = sext i16 %b to i64
+        %conv = sext i32 %a to i64
+        %mul = mul nsw i64 %sext, %conv
+        %shr49 = lshr i64 %mul, 16
+        %conv5 = trunc i64 %shr49 to i32
+        %add = add nsw i32 %conv5, %c
+        ret i32 %add
+}
diff --git a/test/CodeGen/ARM/softfp-fabs-fneg.ll b/test/CodeGen/ARM/softfp-fabs-fneg.ll
index b608fb840218a287e3e0a7da25c767559e168d0b..b7c684d35b5719a8b58ff763dfca89cb1c4ca122 100644
--- a/test/CodeGen/ARM/softfp-fabs-fneg.ll
+++ b/test/CodeGen/ARM/softfp-fabs-fneg.ll
@@ -14,8 +14,7 @@ define double @f(double %a) {
 
 define float @g(float %a) {
   ; CHECK-LABEL: g:
-  ; CHECK-THUMB: bic r0, r0, #-2147483648
-  ; CHECK-ARM: bfc r0, #31, #1
+  ; CHECK: bic r0, r0, #-2147483648
   ; CHECK-NEXT: bx lr
   %x = call float @llvm.fabs.f32(float %a) readnone
   ret float %x
diff --git a/test/CodeGen/ARM/special-reg-mcore.ll b/test/CodeGen/ARM/special-reg-mcore.ll
index 45e6db9e78fe1198934250b80dbe3360f9581b83..1ecf8dc77a70156a17a126ec7374a22f0b52f170 100644
--- a/test/CodeGen/ARM/special-reg-mcore.ll
+++ b/test/CodeGen/ARM/special-reg-mcore.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=thumb-none-eabi -mcpu=cortex-m4 2>&1 | FileCheck %s --check-prefix=MCORE
+; RUN: llc < %s -mtriple=thumb-none-eabi -mcpu=cortex-m4 --show-mc-encoding 2>&1 | FileCheck %s --check-prefix=MCORE
 ; RUN: not llc < %s -mtriple=thumb-none-eabi -mcpu=cortex-m3 2>&1 | FileCheck %s --check-prefix=M3CORE
 ; RUN: not llc < %s -mtriple=arm-none-eabi -mcpu=cortex-a8 2>&1 | FileCheck %s --check-prefix=ACORE
 
@@ -8,20 +8,20 @@
 define i32 @read_mclass_registers() nounwind {
 entry:
   ; MCORE-LABEL: read_mclass_registers:
-  ; MCORE:   mrs r0, apsr
-  ; MCORE:   mrs r1, iapsr
-  ; MCORE:   mrs r1, eapsr
-  ; MCORE:   mrs r1, xpsr
-  ; MCORE:   mrs r1, ipsr
-  ; MCORE:   mrs r1, epsr
-  ; MCORE:   mrs r1, iepsr
-  ; MCORE:   mrs r1, msp
-  ; MCORE:   mrs r1, psp
-  ; MCORE:   mrs r1, primask
-  ; MCORE:   mrs r1, basepri
-  ; MCORE:   mrs r1, basepri_max
-  ; MCORE:   mrs r1, faultmask
-  ; MCORE:   mrs r1, control
+  ; MCORE:   mrs r0, apsr        @ encoding: [0xef,0xf3,0x00,0x80]
+  ; MCORE:   mrs r1, iapsr       @ encoding: [0xef,0xf3,0x01,0x81]
+  ; MCORE:   mrs r1, eapsr       @ encoding: [0xef,0xf3,0x02,0x81]
+  ; MCORE:   mrs r1, xpsr        @ encoding: [0xef,0xf3,0x03,0x81]
+  ; MCORE:   mrs r1, ipsr        @ encoding: [0xef,0xf3,0x05,0x81]
+  ; MCORE:   mrs r1, epsr        @ encoding: [0xef,0xf3,0x06,0x81]
+  ; MCORE:   mrs r1, iepsr       @ encoding: [0xef,0xf3,0x07,0x81]
+  ; MCORE:   mrs r1, msp         @ encoding: [0xef,0xf3,0x08,0x81]
+  ; MCORE:   mrs r1, psp         @ encoding: [0xef,0xf3,0x09,0x81]
+  ; MCORE:   mrs r1, primask     @ encoding: [0xef,0xf3,0x10,0x81]
+  ; MCORE:   mrs r1, basepri     @ encoding: [0xef,0xf3,0x11,0x81]
+  ; MCORE:   mrs r1, basepri_max @ encoding: [0xef,0xf3,0x12,0x81]
+  ; MCORE:   mrs r1, faultmask   @ encoding: [0xef,0xf3,0x13,0x81]
+  ; MCORE:   mrs r1, control     @ encoding: [0xef,0xf3,0x14,0x81]
 
   %0 = call i32 @llvm.read_register.i32(metadata !0)
   %1 = call i32 @llvm.read_register.i32(metadata !4)
@@ -56,32 +56,32 @@ entry:
 define void @write_mclass_registers(i32 %x) nounwind {
 entry:
   ; MCORE-LABEL: write_mclass_registers:
-  ; MCORE:   msr apsr_nzcvqg, r0
-  ; MCORE:   msr apsr_nzcvq, r0
-  ; MCORE:   msr apsr_g, r0
-  ; MCORE:   msr apsr_nzcvqg, r0
-  ; MCORE:   msr iapsr_nzcvqg, r0
-  ; MCORE:   msr iapsr_nzcvq, r0
-  ; MCORE:   msr iapsr_g, r0
-  ; MCORE:   msr iapsr_nzcvqg, r0
-  ; MCORE:   msr eapsr_nzcvqg, r0
-  ; MCORE:   msr eapsr_nzcvq, r0
-  ; MCORE:   msr eapsr_g, r0
-  ; MCORE:   msr eapsr_nzcvqg, r0
-  ; MCORE:   msr xpsr_nzcvqg, r0
-  ; MCORE:   msr xpsr_nzcvq, r0
-  ; MCORE:   msr xpsr_g, r0
-  ; MCORE:   msr xpsr_nzcvqg, r0
-  ; MCORE:   msr ipsr, r0
-  ; MCORE:   msr epsr, r0
-  ; MCORE:   msr iepsr, r0
-  ; MCORE:   msr msp, r0
-  ; MCORE:   msr psp, r0
-  ; MCORE:   msr primask, r0
-  ; MCORE:   msr basepri, r0
-  ; MCORE:   msr basepri_max, r0
-  ; MCORE:   msr faultmask, r0
-  ; MCORE:   msr control, r0
+  ; MCORE:   msr apsr_nzcvq, r0   @ encoding: [0x80,0xf3,0x00,0x88]
+  ; MCORE:   msr apsr_nzcvq, r0   @ encoding: [0x80,0xf3,0x00,0x88]
+  ; MCORE:   msr apsr_g, r0       @ encoding: [0x80,0xf3,0x00,0x84]
+  ; MCORE:   msr apsr_nzcvqg, r0  @ encoding: [0x80,0xf3,0x00,0x8c]
+  ; MCORE:   msr iapsr_nzcvq, r0  @ encoding: [0x80,0xf3,0x01,0x88]
+  ; MCORE:   msr iapsr_nzcvq, r0  @ encoding: [0x80,0xf3,0x01,0x88]
+  ; MCORE:   msr iapsr_g, r0      @ encoding: [0x80,0xf3,0x01,0x84]
+  ; MCORE:   msr iapsr_nzcvqg, r0 @ encoding: [0x80,0xf3,0x01,0x8c]
+  ; MCORE:   msr eapsr_nzcvq, r0  @ encoding: [0x80,0xf3,0x02,0x88]
+  ; MCORE:   msr eapsr_nzcvq, r0  @ encoding: [0x80,0xf3,0x02,0x88]
+  ; MCORE:   msr eapsr_g, r0      @ encoding: [0x80,0xf3,0x02,0x84]
+  ; MCORE:   msr eapsr_nzcvqg, r0 @ encoding: [0x80,0xf3,0x02,0x8c]
+  ; MCORE:   msr xpsr_nzcvq, r0   @ encoding: [0x80,0xf3,0x03,0x88]
+  ; MCORE:   msr xpsr_nzcvq, r0   @ encoding: [0x80,0xf3,0x03,0x88]
+  ; MCORE:   msr xpsr_g, r0       @ encoding: [0x80,0xf3,0x03,0x84]
+  ; MCORE:   msr xpsr_nzcvqg, r0  @ encoding: [0x80,0xf3,0x03,0x8c]
+  ; MCORE:   msr ipsr, r0         @ encoding: [0x80,0xf3,0x05,0x88]
+  ; MCORE:   msr epsr, r0         @ encoding: [0x80,0xf3,0x06,0x88]
+  ; MCORE:   msr iepsr, r0        @ encoding: [0x80,0xf3,0x07,0x88]
+  ; MCORE:   msr msp, r0          @ encoding: [0x80,0xf3,0x08,0x88]
+  ; MCORE:   msr psp, r0          @ encoding: [0x80,0xf3,0x09,0x88]
+  ; MCORE:   msr primask, r0      @ encoding: [0x80,0xf3,0x10,0x88]
+  ; MCORE:   msr basepri, r0      @ encoding: [0x80,0xf3,0x11,0x88]
+  ; MCORE:   msr basepri_max, r0  @ encoding: [0x80,0xf3,0x12,0x88]
+  ; MCORE:   msr faultmask, r0    @ encoding: [0x80,0xf3,0x13,0x88]
+  ; MCORE:   msr control, r0      @ encoding: [0x80,0xf3,0x14,0x88]
 
   call void @llvm.write_register.i32(metadata !0, i32 %x)
   call void @llvm.write_register.i32(metadata !1, i32 %x)
diff --git a/test/CodeGen/ARM/special-reg-v8m-main.ll b/test/CodeGen/ARM/special-reg-v8m-main.ll
index cde296c6b218f5e08ed655c19291477c0b9cf1c4..ea9c01487d8548ddfd06dfbcbf47dbfca38b000e 100644
--- a/test/CodeGen/ARM/special-reg-v8m-main.ll
+++ b/test/CodeGen/ARM/special-reg-v8m-main.ll
@@ -90,19 +90,19 @@ entry:
 define void @write_mclass_registers(i32 %x) nounwind {
 entry:
   ; MAINLINE-LABEL: write_mclass_registers:
-  ; MAINLINE:   msr apsr_nzcvqg, r0
+  ; MAINLINE:   msr apsr_nzcvq, r0
   ; MAINLINE:   msr apsr_nzcvq, r0
   ; MAINLINE:   msr apsr_g, r0
   ; MAINLINE:   msr apsr_nzcvqg, r0
-  ; MAINLINE:   msr iapsr_nzcvqg, r0
+  ; MAINLINE:   msr iapsr_nzcvq, r0
   ; MAINLINE:   msr iapsr_nzcvq, r0
   ; MAINLINE:   msr iapsr_g, r0
   ; MAINLINE:   msr iapsr_nzcvqg, r0
-  ; MAINLINE:   msr eapsr_nzcvqg, r0
+  ; MAINLINE:   msr eapsr_nzcvq, r0
   ; MAINLINE:   msr eapsr_nzcvq, r0
   ; MAINLINE:   msr eapsr_g, r0
   ; MAINLINE:   msr eapsr_nzcvqg, r0
-  ; MAINLINE:   msr xpsr_nzcvqg, r0
+  ; MAINLINE:   msr xpsr_nzcvq, r0
   ; MAINLINE:   msr xpsr_nzcvq, r0
   ; MAINLINE:   msr xpsr_g, r0
   ; MAINLINE:   msr xpsr_nzcvqg, r0
diff --git a/test/CodeGen/ARM/stack_guard_remat.ll b/test/CodeGen/ARM/stack_guard_remat.ll
index 99d49949845068299dafc0b1f581b83b05787952..9b5677608d26664a4ca852663d3fdb633fe9cc95 100644
--- a/test/CodeGen/ARM/stack_guard_remat.ll
+++ b/test/CodeGen/ARM/stack_guard_remat.ll
@@ -51,20 +51,20 @@
 define i32 @test_stack_guard_remat() #0 {
   %a1 = alloca [256 x i32], align 4
   %1 = bitcast [256 x i32]* %a1 to i8*
-  call void @llvm.lifetime.start(i64 1024, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 1024, i8* %1)
   %2 = getelementptr inbounds [256 x i32], [256 x i32]* %a1, i32 0, i32 0
   call void @foo3(i32* %2) #3
   call void asm sideeffect "foo2", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{sp},~{lr}"()
-  call void @llvm.lifetime.end(i64 1024, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 1024, i8* %1)
   ret i32 0
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
 
 declare void @foo3(i32*)
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 
 attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/ARM/static-addr-hoisting.ll b/test/CodeGen/ARM/static-addr-hoisting.ll
index 3d47e02f965e80d0d8113517976bc0bebe4594af..683d607936b85b38ddc1e284c20913f09c032e9b 100644
--- a/test/CodeGen/ARM/static-addr-hoisting.ll
+++ b/test/CodeGen/ARM/static-addr-hoisting.ll
@@ -6,9 +6,9 @@ define void @multiple_store() {
 ; CHECK: movs [[VAL:r[0-9]+]], #42
 ; CHECK: movt r[[BASE1]], #15
 
-; CHECK: str [[VAL]], [r[[BASE1]]]
-; CHECK: str [[VAL]], [r[[BASE1]], #24]
-; CHECK: str.w [[VAL]], [r[[BASE1]], #42]
+; CHECK-DAG: str [[VAL]], [r[[BASE1]]]
+; CHECK-DAG: str [[VAL]], [r[[BASE1]], #24]
+; CHECK-DAG: str.w [[VAL]], [r[[BASE1]], #42]
 
 ; CHECK: movw r[[BASE2:[0-9]+]], #20394
 ; CHECK: movt r[[BASE2]], #18
diff --git a/test/CodeGen/ARM/swifterror.ll b/test/CodeGen/ARM/swifterror.ll
index 7551291207ed450bf7e1f21e1330eaf8c0fbdf16..78764202f62730ad6699ea6ef74970e6f0819651 100644
--- a/test/CodeGen/ARM/swifterror.ll
+++ b/test/CodeGen/ARM/swifterror.ll
@@ -13,7 +13,7 @@ define float @foo(%swift_error** swifterror %error_ptr_ref) {
 ; CHECK-APPLE: mov r0, #16
 ; CHECK-APPLE: malloc
 ; CHECK-APPLE-DAG: mov [[ID:r[0-9]+]], #1
-; CHECK-APPLE-DAG: mov r6, r{{.*}}
+; CHECK-APPLE-DAG: mov r8, r{{.*}}
 ; CHECK-APPLE-DAG: strb [[ID]], [r{{.*}}, #8]
 
 ; CHECK-O0-LABEL: foo:
@@ -22,7 +22,7 @@ define float @foo(%swift_error** swifterror %error_ptr_ref) {
 ; CHECK-O0: mov [[ID2:r[0-9]+]], r0
 ; CHECK-O0: mov [[ID:r[0-9]+]], #1
 ; CHECK-O0: strb [[ID]], [r0, #8]
-; CHECK-O0: mov r6, [[ID2]]
+; CHECK-O0: mov r8, [[ID2]]
 entry:
   %call = call i8* @malloc(i64 16)
   %call.0 = bitcast i8* %call to %swift_error*
@@ -36,21 +36,21 @@ entry:
 define float @caller(i8* %error_ref) {
 ; CHECK-APPLE-LABEL: caller:
 ; CHECK-APPLE-DAG: mov [[ID:r[0-9]+]], r0
-; CHECK-APPLE-DAG: mov r6, #0
+; CHECK-APPLE-DAG: mov r8, #0
 ; CHECK-APPLE: bl {{.*}}foo
-; CHECK-APPLE: cmp r6, #0
+; CHECK-APPLE: cmp r8, #0
 ; Access part of the error object and save it to error_ref
-; CHECK-APPLE: ldrbeq [[CODE:r[0-9]+]], [r6, #8]
+; CHECK-APPLE: ldrbeq [[CODE:r[0-9]+]], [r8, #8]
 ; CHECK-APPLE: strbeq [[CODE]], [{{.*}}[[ID]]]
-; CHECK-APPLE: mov r0, r6
+; CHECK-APPLE: mov r0, r8
 ; CHECK-APPLE: bl {{.*}}free
 
 ; CHECK-O0-LABEL: caller:
 ; spill r0
-; CHECK-O0-DAG: mov r6, #0
+; CHECK-O0-DAG: mov r8, #0
 ; CHECK-O0-DAG: str r0, [sp, [[SLOT:#[0-9]+]]
 ; CHECK-O0: bl {{.*}}foo
-; CHECK-O0: mov [[TMP:r[0-9]+]], r6
+; CHECK-O0: mov [[TMP:r[0-9]+]], r8
 ; CHECK-O0: str [[TMP]], [sp]
 ; CHECK-O0: bne
 ; CHECK-O0: ldrb [[CODE:r[0-9]+]], [r0, #8]
@@ -81,22 +81,22 @@ handler:
 define float @caller2(i8* %error_ref) {
 ; CHECK-APPLE-LABEL: caller2:
 ; CHECK-APPLE-DAG: mov [[ID:r[0-9]+]], r0
-; CHECK-APPLE-DAG: mov r6, #0
+; CHECK-APPLE-DAG: mov r8, #0
 ; CHECK-APPLE: bl {{.*}}foo
-; CHECK-APPLE: cmp r6, #0
+; CHECK-APPLE: cmp r8, #0
 ; CHECK-APPLE: bne
 ; Access part of the error object and save it to error_ref
-; CHECK-APPLE: ldrb [[CODE:r[0-9]+]], [r6, #8]
+; CHECK-APPLE: ldrb [[CODE:r[0-9]+]], [r8, #8]
 ; CHECK-APPLE: strb [[CODE]], [{{.*}}[[ID]]]
-; CHECK-APPLE: mov r0, r6
+; CHECK-APPLE: mov r0, r8
 ; CHECK-APPLE: bl {{.*}}free
 
 ; CHECK-O0-LABEL: caller2:
 ; spill r0
 ; CHECK-O0-DAG: str r0,
-; CHECK-O0-DAG: mov r6, #0
+; CHECK-O0-DAG: mov r8, #0
 ; CHECK-O0: bl {{.*}}foo
-; CHECK-O0: mov r{{.*}}, r6
+; CHECK-O0: mov r{{.*}}, r8
 ; CHECK-O0: str r0, [sp]
 ; CHECK-O0: bne
 ; CHECK-O0: ble
@@ -138,22 +138,22 @@ define float @foo_if(%swift_error** swifterror %error_ptr_ref, i32 %cc) {
 ; CHECK-APPLE: mov r0, #16
 ; CHECK-APPLE: malloc
 ; CHECK-APPLE: mov [[ID:r[0-9]+]], #1
-; CHECK-APPLE-DAG: mov r6, r{{.*}}
+; CHECK-APPLE-DAG: mov r8, r{{.*}}
 ; CHECK-APPLE-DAG: strb [[ID]], [r{{.*}}, #8]
 
 ; CHECK-O0-LABEL: foo_if:
 ; CHECK-O0: cmp r0, #0
 ; spill to stack
-; CHECK-O0: str r6
+; CHECK-O0: str r8
 ; CHECK-O0: beq
 ; CHECK-O0: mov r0, #16
 ; CHECK-O0: malloc
 ; CHECK-O0: mov [[ID:r[0-9]+]], r0
 ; CHECK-O0: mov [[ID2:[a-z0-9]+]], #1
 ; CHECK-O0: strb [[ID2]], [r0, #8]
-; CHECK-O0: mov r6, [[ID]]
+; CHECK-O0: mov r8, [[ID]]
 ; reload from stack
-; CHECK-O0: ldr r6
+; CHECK-O0: ldr r8
 entry:
   %cond = icmp ne i32 %cc, 0
   br i1 %cond, label %gen_error, label %normal
@@ -176,17 +176,17 @@ define float @foo_loop(%swift_error** swifterror %error_ptr_ref, i32 %cc, float
 ; CHECK-APPLE-LABEL: foo_loop:
 ; CHECK-APPLE: mov [[CODE:r[0-9]+]], r0
 ; swifterror is kept in a register
-; CHECK-APPLE: mov [[ID:r[0-9]+]], r6
+; CHECK-APPLE: mov [[ID:r[0-9]+]], r8
 ; CHECK-APPLE: cmp [[CODE]], #0
 ; CHECK-APPLE: beq
 ; CHECK-APPLE: mov r0, #16
 ; CHECK-APPLE: malloc
 ; CHECK-APPLE: strb r{{.*}}, [{{.*}}[[ID]], #8]
 ; CHECK-APPLE: ble
-; CHECK-APPLE: mov r6, [[ID]]
+; CHECK-APPLE: mov r8, [[ID]]
 
 ; CHECK-O0-LABEL: foo_loop:
-; CHECK-O0: mov r{{.*}}, r6
+; CHECK-O0: mov r{{.*}}, r8
 ; CHECK-O0: cmp r{{.*}}, #0
 ; CHECK-O0: beq
 ; CHECK-O0-DAG: movw r{{.*}}, #1
@@ -200,7 +200,7 @@ define float @foo_loop(%swift_error** swifterror %error_ptr_ref, i32 %cc, float
 ; CHECK-O0: vcmpe
 ; CHECK-O0: ble
 ; reload from stack
-; CHECK-O0: ldr r6
+; CHECK-O0: ldr r8
 entry:
   br label %bb_loop
 
@@ -231,7 +231,7 @@ define void @foo_sret(%struct.S* sret %agg.result, i32 %val1, %swift_error** swi
 ; CHECK-APPLE: mov r0, #16
 ; CHECK-APPLE: malloc
 ; CHECK-APPLE: mov [[REG:r[0-9]+]], #1
-; CHECK-APPLE-DAG: mov r6, r0
+; CHECK-APPLE-DAG: mov r8, r0
 ; CHECK-APPLE-DAG: strb [[REG]], [r0, #8]
 ; CHECK-APPLE-DAG: str r{{.*}}, [{{.*}}[[SRET]], #4]
 
@@ -247,7 +247,7 @@ define void @foo_sret(%struct.S* sret %agg.result, i32 %val1, %swift_error** swi
 ; CHECK-O0: ldr
 ; CHECK-O0: ldr
 ; CHECK-O0: str r{{.*}}, [{{.*}}, #4]
-; CHECK-O0: mov r6
+; CHECK-O0: mov r8
 entry:
   %call = call i8* @malloc(i64 16)
   %call.0 = bitcast i8* %call to %swift_error*
@@ -263,22 +263,22 @@ entry:
 define float @caller3(i8* %error_ref) {
 ; CHECK-APPLE-LABEL: caller3:
 ; CHECK-APPLE: mov [[ID:r[0-9]+]], r0
-; CHECK-APPLE: mov r6, #0
+; CHECK-APPLE: mov r8, #0
 ; CHECK-APPLE: bl {{.*}}foo_sret
-; CHECK-APPLE: cmp r6, #0
+; CHECK-APPLE: cmp r8, #0
 ; Access part of the error object and save it to error_ref
-; CHECK-APPLE: ldrbeq [[CODE:r[0-9]+]], [r6, #8]
+; CHECK-APPLE: ldrbeq [[CODE:r[0-9]+]], [r8, #8]
 ; CHECK-APPLE: strbeq [[CODE]], [{{.*}}[[ID]]]
-; CHECK-APPLE: mov r0, r6
+; CHECK-APPLE: mov r0, r8
 ; CHECK-APPLE: bl {{.*}}free
 
 ; CHECK-O0-LABEL: caller3:
-; CHECK-O0-DAG: mov r6, #0
+; CHECK-O0-DAG: mov r8, #0
 ; CHECK-O0-DAG: mov r0
 ; CHECK-O0-DAG: mov r1
 ; CHECK-O0: bl {{.*}}foo_sret
-; CHECK-O0: mov [[ID2:r[0-9]+]], r6
-; CHECK-O0: cmp r6
+; CHECK-O0: mov [[ID2:r[0-9]+]], r8
+; CHECK-O0: cmp r8
 ; CHECK-O0: str [[ID2]], [sp[[SLOT:.*]]]
 ; CHECK-O0: bne
 ; Access part of the error object and save it to error_ref
@@ -316,7 +316,7 @@ define float @foo_vararg(%swift_error** swifterror %error_ptr_ref, ...) {
 ; CHECK-APPLE: mov [[REG:r[0-9]+]], r0
 ; CHECK-APPLE: mov [[ID:r[0-9]+]], #1
 ; CHECK-APPLE-DAG: strb [[ID]], [{{.*}}[[REG]], #8]
-; CHECK-APPLE-DAG: mov r6, [[REG]]
+; CHECK-APPLE-DAG: mov r8, [[REG]]
 
 entry:
   %call = call i8* @malloc(i64 16)
@@ -345,13 +345,13 @@ entry:
 define float @caller4(i8* %error_ref) {
 ; CHECK-APPLE-LABEL: caller4:
 ; CHECK-APPLE: mov [[ID:r[0-9]+]], r0
-; CHECK-APPLE: mov r6, #0
+; CHECK-APPLE: mov r8, #0
 ; CHECK-APPLE: bl {{.*}}foo_vararg
-; CHECK-APPLE: cmp r6, #0
+; CHECK-APPLE: cmp r8, #0
 ; Access part of the error object and save it to error_ref
-; CHECK-APPLE: ldrbeq [[CODE:r[0-9]+]], [r6, #8]
+; CHECK-APPLE: ldrbeq [[CODE:r[0-9]+]], [r8, #8]
 ; CHECK-APPLE: strbeq [[CODE]], [{{.*}}[[ID]]]
-; CHECK-APPLE: mov r0, r6
+; CHECK-APPLE: mov r0, r8
 ; CHECK-APPLE: bl {{.*}}free
 entry:
   %error_ptr_ref = alloca swifterror %swift_error*
@@ -396,51 +396,51 @@ entry:
 }
 
 ; CHECK-APPLE-LABEL: swifterror_clobber
-; CHECK-APPLE: mov [[REG:r[0-9]+]], r6
+; CHECK-APPLE: mov [[REG:r[0-9]+]], r8
 ; CHECK-APPLE: nop
-; CHECK-APPLE: mov r6, [[REG]]
+; CHECK-APPLE: mov r8, [[REG]]
 define swiftcc void @swifterror_clobber(%swift_error** nocapture swifterror %err) {
-  call void asm sideeffect "nop", "~{r6}"()
+  call void asm sideeffect "nop", "~{r8}"()
   ret void
 }
 
 ; CHECK-APPLE-LABEL: swifterror_reg_clobber
-; CHECK-APPLE: push {{.*}}r6
+; CHECK-APPLE: push {{.*}}r8
 ; CHECK-APPLE: nop
-; CHECK-APPLE: pop  {{.*}}r6
+; CHECK-APPLE: pop  {{.*}}r8
 define swiftcc void @swifterror_reg_clobber(%swift_error** nocapture %err) {
-  call void asm sideeffect "nop", "~{r6}"()
+  call void asm sideeffect "nop", "~{r8}"()
   ret void
 }
 
 ; CHECK-ARMV7-LABEL: _params_in_reg
 ; Store callee saved registers excluding swifterror.
-; CHECK-ARMV7:  push    {r4, r5, r7, r8, r10, r11, lr}
-; Store swiftself (r10) and swifterror (r6).
-; CHECK-ARMV7-DAG:  str     r6, [s[[STK1:.*]]]
+; CHECK-ARMV7:  push   {r4, r5, r6, r7, r10, r11, lr}
+; Store swiftself (r10) and swifterror (r8).
+; CHECK-ARMV7-DAG:  str     r8, [s[[STK1:.*]]]
 ; CHECK-ARMV7-DAG:  str     r10, [s[[STK2:.*]]]
 ; Store arguments.
-; CHECK-ARMV7:  mov     r4, r3
-; CHECK-ARMV7:  mov     r5, r2
-; CHECK-ARMV7:  mov     r8, r1
-; CHECK-ARMV7:  mov     r11, r0
+; CHECK-ARMV7:  mov     r6, r3
+; CHECK-ARMV7:  mov     r4, r2
+; CHECK-ARMV7:  mov     r11, r1
+; CHECK-ARMV7:  mov     r5, r0
 ; Setup call.
 ; CHECK-ARMV7:  mov     r0, #1
 ; CHECK-ARMV7:  mov     r1, #2
 ; CHECK-ARMV7:  mov     r2, #3
 ; CHECK-ARMV7:  mov     r3, #4
 ; CHECK-ARMV7:  mov     r10, #0
-; CHECK-ARMV7:  mov     r6, #0
+; CHECK-ARMV7:  mov     r8, #0
 ; CHECK-ARMV7:  bl      _params_in_reg2
 ; Restore original arguments.
 ; CHECK-ARMV7-DAG:  ldr     r10, [s[[STK2]]]
-; CHECK-ARMV7-DAG:  ldr     r6, [s[[STK1]]]
-; CHECK-ARMV7:  mov     r0, r11
-; CHECK-ARMV7:  mov     r1, r8
-; CHECK-ARMV7:  mov     r2, r5
-; CHECK-ARMV7:  mov     r3, r4
+; CHECK-ARMV7-DAG:  ldr     r8, [s[[STK1]]]
+; CHECK-ARMV7:  mov     r0, r5
+; CHECK-ARMV7:  mov     r1, r11
+; CHECK-ARMV7:  mov     r2, r4
+; CHECK-ARMV7:  mov     r3, r6
 ; CHECK-ARMV7:  bl      _params_in_reg2
-; CHECK-ARMV7:  pop     {r4, r5, r7,  r8, r10, r11, pc}
+; CHECK-ARMV7:  pop     {r4, r5, r6, r7, r10, r11, pc}
 define swiftcc void @params_in_reg(i32, i32, i32, i32, i8* swiftself, %swift_error** nocapture swifterror %err) {
   %error_ptr_ref = alloca swifterror %swift_error*, align 8
   store %swift_error* null, %swift_error** %error_ptr_ref
@@ -451,42 +451,42 @@ define swiftcc void @params_in_reg(i32, i32, i32, i32, i8* swiftself, %swift_err
 declare swiftcc void @params_in_reg2(i32, i32, i32, i32, i8* swiftself, %swift_error** nocapture swifterror %err)
 
 ; CHECK-ARMV7-LABEL: params_and_return_in_reg
-; CHECK-ARMV7:  push    {r4, r5, r7, r8, r10, r11, lr}
+; CHECK-ARMV7:  push    {r4, r5, r6, r7, r10, r11, lr}
 ; Store swifterror and swiftself
-; CHECK-ARMV7:  mov     r4, r6
+; CHECK-ARMV7:  mov     r6, r8
 ; CHECK-ARMV7:  str     r10, [s[[STK1:.*]]]
 ; Store arguments.
 ; CHECK-ARMV7:  str     r3, [s[[STK2:.*]]]
-; CHECK-ARMV7:  mov     r5, r2
-; CHECK-ARMV7:  mov     r8, r1
-; CHECK-ARMV7:  mov     r11, r0
+; CHECK-ARMV7:  mov     r4, r2
+; CHECK-ARMV7:  mov     r11, r1
+; CHECK-ARMV7:  mov     r5, r0
 ; Setup call.
 ; CHECK-ARMV7:  mov     r0, #1
 ; CHECK-ARMV7:  mov     r1, #2
 ; CHECK-ARMV7:  mov     r2, #3
 ; CHECK-ARMV7:  mov     r3, #4
 ; CHECK-ARMV7:  mov     r10, #0
-; CHECK-ARMV7:  mov     r6, #0
+; CHECK-ARMV7:  mov     r8, #0
 ; CHECK-ARMV7:  bl      _params_in_reg2
 ; Restore original arguments.
 ; CHECK-ARMV7:  ldr     r3, [s[[STK2]]]
 ; CHECK-ARMV7:  ldr     r10, [s[[STK1]]]
 ; Store %error_ptr_ref;
-; CHECK-ARMV7:  str     r6, [s[[STK3:.*]]]
+; CHECK-ARMV7:  str     r8, [s[[STK3:.*]]]
 ; Restore original arguments.
-; CHECK-ARMV7:  mov     r0, r11
-; CHECK-ARMV7:  mov     r1, r8
-; CHECK-ARMV7:  mov     r2, r5
-; CHECK-ARMV7:  mov     r6, r4
+; CHECK-ARMV7:  mov     r0, r5
+; CHECK-ARMV7:  mov     r1, r11
+; CHECK-ARMV7:  mov     r2, r4
+; CHECK-ARMV7:  mov     r8, r6
 ; CHECK-ARMV7:  bl      _params_and_return_in_reg2
 ; Store swifterror return %err;
-; CHECK-ARMV7:  str     r6, [s[[STK1]]]
+; CHECK-ARMV7:  str     r8, [s[[STK1]]]
 ; Load swifterror value %error_ptr_ref.
-; CHECK-ARMV7:  ldr     r6, [s[[STK3]]]
+; CHECK-ARMV7:  ldr     r8, [s[[STK3]]]
 ; Save return values.
-; CHECK-ARMV7:  mov     r5, r0
-; CHECK-ARMV7:  mov     r4, r1
-; CHECK-ARMV7:  mov     r8, r2
+; CHECK-ARMV7:  mov     r4, r0
+; CHECK-ARMV7:  mov     r5, r1
+; CHECK-ARMV7:  mov     r6, r2
 ; CHECK-ARMV7:  mov     r11, r3
 ; Setup call.
 ; CHECK-ARMV7:  mov     r0, #1
@@ -496,13 +496,13 @@ declare swiftcc void @params_in_reg2(i32, i32, i32, i32, i8* swiftself, %swift_e
 ; CHECK-ARMV7:  mov     r10, #0
 ; CHECK-ARMV7:  bl      _params_in_reg2
 ; Load swifterror %err;
-; CHECK-ARMV7:  ldr     r6, [s[[STK1]]]
+; CHECK-ARMV7:  ldr     r8, [s[[STK1]]]
 ; Restore return values for returning.
-; CHECK-ARMV7:  mov     r0, r5
-; CHECK-ARMV7:  mov     r1, r4
-; CHECK-ARMV7:  mov     r2, r8
+; CHECK-ARMV7:  mov     r0, r4
+; CHECK-ARMV7:  mov     r1, r5
+; CHECK-ARMV7:  mov     r2, r6
 ; CHECK-ARMV7:  mov     r3, r11
-; CHECK-ARMV7:  pop     {r4, r5, r7, r8, r10, r11, pc}
+; CHECK-ARMV7:  pop     {r4, r5, r6, r7, r10, r11, pc}
 define swiftcc { i32, i32, i32, i32} @params_and_return_in_reg(i32, i32, i32, i32, i8* swiftself, %swift_error** nocapture swifterror %err) {
   %error_ptr_ref = alloca swifterror %swift_error*, align 8
   store %swift_error* null, %swift_error** %error_ptr_ref
@@ -513,3 +513,18 @@ define swiftcc { i32, i32, i32, i32} @params_and_return_in_reg(i32, i32, i32, i3
 }
 
 declare swiftcc { i32, i32, i32, i32 } @params_and_return_in_reg2(i32, i32, i32, i32, i8* swiftself, %swift_error** nocapture swifterror %err)
+
+
+declare void @acallee(i8*)
+
+; Make sure we don't tail call if the caller returns a swifterror value. We
+; would have to move into the swifterror register before the tail call.
+; CHECK-APPLE: tailcall_from_swifterror:
+; CHECK-APPLE-NOT: b _acallee
+; CHECK-APPLE: bl _acallee
+
+define swiftcc void @tailcall_from_swifterror(%swift_error** swifterror %error_ptr_ref) {
+entry:
+  tail call void @acallee(i8* null)
+  ret void
+}
diff --git a/test/CodeGen/ARM/swiftself.ll b/test/CodeGen/ARM/swiftself.ll
index b7a04ca4060ea300969457c864b37ce8231301e3..1e06b34c7052905695bdcae233c1fb9f7d59e1e9 100644
--- a/test/CodeGen/ARM/swiftself.ll
+++ b/test/CodeGen/ARM/swiftself.ll
@@ -63,3 +63,20 @@ define i8* @swiftself_notail(i8* swiftself %addr0, i8* %addr1) nounwind "no-fram
   %res = tail call i8* @swiftself_param(i8* swiftself %addr1)
   ret i8* %res
 }
+
+; We cannot pretend that 'r0' is alive across the thisreturn_attribute call as
+; we normally would. We marked the first parameter with swiftself which means it
+; will no longer be passed in r0.
+declare swiftcc i8* @thisreturn_attribute(i8* returned swiftself)
+; OPT-LABEL: swiftself_nothisreturn:
+; OPT-DAG: mov [[CSREG:r[1-9].*]], r0
+; OPT-DAG: ldr r10, [r10]
+; OPT: bl  {{_?}}thisreturn_attribute
+; OPT: str r0, {{\[}}[[CSREG]]
+define hidden swiftcc void @swiftself_nothisreturn(i8** noalias nocapture sret, i8** noalias nocapture readonly swiftself) {
+entry:
+  %2 = load i8*, i8** %1, align 8
+  %3 = tail call swiftcc i8* @thisreturn_attribute(i8* swiftself %2)
+  store i8* %3, i8** %0, align 8
+  ret void
+}
diff --git a/test/CodeGen/ARM/tail-opts.ll b/test/CodeGen/ARM/tail-opts.ll
index 37e9a4af3be5969d9fec79bd220d36b7297033f4..475b80b3bb070eec6d84366483e94d68e5c08df5 100644
--- a/test/CodeGen/ARM/tail-opts.ll
+++ b/test/CodeGen/ARM/tail-opts.ll
@@ -65,3 +65,55 @@ altret:
   call void @far(i32 1001)
   ret void
 }
+
+; Use alternating abort functions so that the blocks we wish to merge are not
+; layout successors during branch folding.
+
+; CHECK-LABEL: merge_alternating_aborts:
+; CHECK-NOT: _abort
+; CHECK-NOT: _alt_abort
+; CHECK: bxne lr
+; CHECK-NOT: _abort
+; CHECK-NOT: _alt_abort
+; CHECK: LBB{{.*}}:
+; CHECK: mov lr, pc
+; CHECK: b _alt_abort
+; CHECK-NOT: _abort
+; CHECK-NOT: _alt_abort
+; CHECK: LBB{{.*}}:
+; CHECK: mov lr, pc
+; CHECK: b _abort
+; CHECK-NOT: _abort
+; CHECK-NOT: _alt_abort
+
+declare void @abort()
+declare void @alt_abort()
+
+define void @merge_alternating_aborts() {
+entry:
+  %c1 = call i1 @qux()
+  br i1 %c1, label %cont1, label %abort1
+abort1:
+  call void @abort()
+  unreachable
+cont1:
+  %c2 = call i1 @qux()
+  br i1 %c2, label %cont2, label %abort2
+abort2:
+  call void @alt_abort()
+  unreachable
+cont2:
+  %c3 = call i1 @qux()
+  br i1 %c3, label %cont3, label %abort3
+abort3:
+  call void @abort()
+  unreachable
+cont3:
+  %c4 = call i1 @qux()
+  br i1 %c4, label %cont4, label %abort4
+abort4:
+  call void @alt_abort()
+  unreachable
+cont4:
+  ret void
+}
diff --git a/test/CodeGen/ARM/thumb1-div.ll b/test/CodeGen/ARM/thumb1-div.ll
new file mode 100644
index 0000000000000000000000000000000000000000..844dfe6f963c1793159413f87067f11d7cd62237
--- /dev/null
+++ b/test/CodeGen/ARM/thumb1-div.ll
@@ -0,0 +1,67 @@
+; RUN: llc < %s -mtriple=arm-none-eabi -mcpu=cortex-m23 -march=thumb | \
+; RUN:     FileCheck %s -check-prefix=CHECK
+
+define i32 @f1(i32 %a, i32 %b) {
+entry:
+; CHECK-LABEL: f1
+
+; CHECK: sdiv
+        %tmp1 = sdiv i32 %a, %b         ; <i32> [#uses=1]
+        ret i32 %tmp1
+}
+
+define i32 @f2(i32 %a, i32 %b) {
+entry:
+; CHECK-LABEL: f2
+; CHECK: udiv
+        %tmp1 = udiv i32 %a, %b         ; <i32> [#uses=1]
+        ret i32 %tmp1
+}
+
+define i32 @f3(i32 %a, i32 %b) {
+entry:
+; CHECK-LABEL: f3
+
+
+        %tmp1 = srem i32 %a, %b         ; <i32> [#uses=1]
+        ret i32 %tmp1
+; CHECK: sdiv
+; CHECK-NEXT: muls
+; CHECK-NEXT: subs
+}
+
+define i32 @f4(i32 %a, i32 %b) {
+entry:
+; CHECK-LABEL: f4
+
+; CHECK: udiv
+; CHECK-NEXT: muls
+; CHECK-NEXT: subs
+        %tmp1 = urem i32 %a, %b         ; <i32> [#uses=1]
+        ret i32 %tmp1
+}
+
+
+define i64 @f5(i64 %a, i64 %b) {
+entry:
+; CHECK-LABEL: f5
+
+; EABI MODE = Remainder in R2-R3, quotient in R0-R1
+; CHECK: __aeabi_ldivmod
+; CHECK-NEXT: mov r0, r2
+; CHECK-NEXT: mov r1, r3
+        %tmp1 = srem i64 %a, %b         ; <i64> [#uses=1]
+        ret i64 %tmp1
+}
+
+define i64 @f6(i64 %a, i64 %b) {
+entry:
+; CHECK-LABEL: f6
+
+; EABI MODE = Remainder in R2-R3, quotient in R0-R1
+; CHECK: __aeabi_uldivmod
+; CHECK: mov r0, r2
+; CHECK: mov r1, r3
+        %tmp1 = urem i64 %a, %b         ; <i64> [#uses=1]
+        ret i64 %tmp1
+}
diff --git a/test/CodeGen/ARM/unschedule-first-call.ll b/test/CodeGen/ARM/unschedule-first-call.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4a218afcc5e13bd66a55c064555510269f180264
--- /dev/null
+++ b/test/CodeGen/ARM/unschedule-first-call.ll
@@ -0,0 +1,136 @@
+; RUN: llc < %s
+; PR30911
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv6kz--linux-gnueabihf"
+
+; Function Attrs: nounwind
+define void @dradbg(i32, i32, float*, float*, float*, float*, float*) #0 {
+  br i1 undef, label %.critedge, label %8
+
+.critedge:                                        ; preds = %7
+  %.mux2 = select i1 undef, i1 undef, i1 true
+  br label %8
+
+; <label>:8:                                      ; preds = %.critedge, %7
+  %9 = getelementptr float, float* %3, i64 undef
+  %10 = ptrtoint float* %9 to i32
+  %11 = icmp ule i32 %10, undef
+  %12 = getelementptr float, float* %5, i64 undef
+  %13 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 undef, i64 undef)
+  %14 = extractvalue { i64, i1 } %13, 0
+  %15 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %14, i64 1)
+  %16 = extractvalue { i64, i1 } %15, 0
+  %17 = icmp slt i64 1, %16
+  %18 = select i1 %17, i64 1, i64 %16
+  %19 = sext i32 %1 to i64
+  %20 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %18, i64 %19)
+  %21 = extractvalue { i64, i1 } %20, 0
+  %22 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %21, i64 0)
+  %23 = extractvalue { i64, i1 } %22, 0
+  %24 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %23, i64 undef)
+  %25 = extractvalue { i64, i1 } %24, 0
+  %26 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %25, i64 0)
+  %27 = extractvalue { i64, i1 } %26, 0
+  %28 = getelementptr float, float* %3, i64 %27
+  %29 = ptrtoint float* %12 to i32
+  %30 = ptrtoint float* %28 to i32
+  %31 = icmp ule i32 %29, %30
+  %32 = or i1 %11, %31
+  %33 = and i1 false, %32
+  %34 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 0, i64 undef)
+  %35 = extractvalue { i64, i1 } %34, 0
+  %36 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %35, i64 1)
+  %37 = extractvalue { i64, i1 } %36, 0
+  %38 = icmp slt i64 1, %37
+  %39 = select i1 %38, i64 1, i64 %37
+  %40 = sext i32 %1 to i64
+  %41 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %39, i64 %40)
+  %42 = extractvalue { i64, i1 } %41, 0
+  %43 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %42, i64 0)
+  %44 = extractvalue { i64, i1 } %43, 0
+  %45 = sext i32 %0 to i64
+  %46 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %44, i64 %45)
+  %47 = extractvalue { i64, i1 } %46, 0
+  %48 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %47, i64 0)
+  %49 = extractvalue { i64, i1 } %48, 0
+  %50 = getelementptr float, float* %5, i64 %49
+  %51 = ptrtoint float* %50 to i32
+  %52 = icmp ule i32 undef, %51
+  %53 = getelementptr float, float* %4, i64 undef
+  %54 = ptrtoint float* %53 to i32
+  %55 = icmp ule i32 undef, %54
+  %56 = or i1 %52, %55
+  %57 = and i1 %33, %56
+  %58 = getelementptr float, float* %2, i64 undef
+  %59 = ptrtoint float* %58 to i32
+  %60 = icmp ule i32 %59, undef
+  %61 = select i1 undef, i64 undef, i64 0
+  %62 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %61, i64 undef)
+  %63 = extractvalue { i64, i1 } %62, 0
+  %64 = call { i64, i1 } @llvm.ssub.with.overflow.i64(i64 undef, i64 1)
+  %65 = extractvalue { i64, i1 } %64, 0
+  %66 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %63, i64 %65)
+  %67 = extractvalue { i64, i1 } %66, 0
+  %68 = sext i32 %0 to i64
+  %69 = call { i64, i1 } @llvm.smul.with.overflow.i64(i64 %67, i64 %68)
+  %70 = extractvalue { i64, i1 } %69, 0
+  %71 = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %70, i64 0)
+  %72 = extractvalue { i64, i1 } %71, 0
+  %73 = getelementptr float, float* %5, i64 %72
+  %74 = ptrtoint float* %73 to i32
+  %75 = icmp ule i32 %74, undef
+  %76 = or i1 %60, %75
+  %77 = and i1 %57, %76
+  %78 = getelementptr float, float* %6, i64 undef
+  %79 = ptrtoint float* %78 to i32
+  %80 = icmp ule i32 %79, undef
+  %81 = getelementptr float, float* %5, i64 undef
+  %82 = ptrtoint float* %81 to i32
+  %83 = icmp ule i32 %82, undef
+  %84 = or i1 %80, %83
+  %85 = and i1 %77, %84
+  %86 = and i1 %85, undef
+  %87 = and i1 %86, undef
+  %88 = and i1 %87, undef
+  %89 = and i1 %88, undef
+  %90 = and i1 %89, undef
+  %91 = and i1 %90, undef
+  %92 = and i1 %91, undef
+  %93 = and i1 %92, undef
+  %94 = and i1 %93, undef
+  %95 = and i1 %94, undef
+  br i1 %95, label %97, label %96
+
+; <label>:96:                                     ; preds = %8
+  br i1 undef, label %.critedge122, label %.critedge110
+
+.critedge122:                                     ; preds = %.critedge122, %96
+  br i1 false, label %.critedge122, label %.critedge110
+
+.critedge110:                                     ; preds = %.critedge219, %97, %.critedge122, %96
+  ret void
+
+; <label>:97:                                     ; preds = %8
+  br i1 undef, label %.critedge219, label %.critedge110
+
+.critedge219:                                     ; preds = %.critedge219, %97
+  %.pr287 = phi i1 [ undef, %.critedge219 ], [ true, %97 ]
+  br i1 %.pr287, label %.critedge219, label %.critedge110
+}
+
+; Function Attrs: nounwind readnone
+declare { i64, i1 } @llvm.smul.with.overflow.i64(i64, i64) #1
+
+; Function Attrs: nounwind readnone
+declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64) #1
+
+; Function Attrs: nounwind readnone
+declare { i64, i1 } @llvm.ssub.with.overflow.i64(i64, i64) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "polly-optimized" "stack-protector-buffer-size"="8" "target-cpu"="arm1176jzf-s" "target-features"="+dsp,+strict-align,+vfp2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 4.0.0 (trunk 285923) (llvm/trunk 285921)"}
diff --git a/test/CodeGen/ARM/v6-jumptable-clobber.mir b/test/CodeGen/ARM/v6-jumptable-clobber.mir
new file mode 100644
index 0000000000000000000000000000000000000000..0e9bc42565f3bbdeb9636d86034ae3217e193fc9
--- /dev/null
+++ b/test/CodeGen/ARM/v6-jumptable-clobber.mir
@@ -0,0 +1,384 @@
+# RUN: llc -run-pass=arm-cp-islands -o - %s | FileCheck %s
+
+# Test created by tweaking the register allocation after stopping the IR below
+# just before constant islands. We were forwarding the table index to the end of
+# the block, even though the LEA clobbered it.
+
+# CHECK-LABEL: name: foo
+# CHECK:     tBR_JT
+  # This order is important. If the jump-table comes first then the
+  # transformation is valid because the LEA can be removed, see second test.
+# CHECK:     CONSTPOOL_ENTRY
+# CHECK:     JUMPTABLE_ADDRS
+
+# CHECK-LABEL: name: bar
+# CHECK: tTBB_JT %pc, killed %r1
+
+--- |
+  ; ModuleID = 'simple.ll'
+  source_filename = "simple.ll"
+  target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "thumbv6m-none--eabi"
+  
+  define void @foo(i8 %in, i32* %addr) {
+    store i32 12345678, i32* %addr
+    %1 = call i32 @llvm.arm.space(i32 980, i32 undef)
+    %2 = zext i8 %in to i32
+    switch i32 %2, label %default [
+      i32 0, label %d1
+      i32 1, label %d2
+      i32 3, label %d3
+      i32 4, label %d4
+      i32 5, label %d5
+      i32 6, label %d6
+      i32 7, label %d7
+      i32 2, label %d8
+      i32 8, label %d9
+      i32 9, label %d10
+      i32 19, label %d11
+      i32 20, label %d12
+      i32 21, label %d13
+      i32 22, label %d14
+      i32 24, label %d15
+      i32 25, label %d16
+      i32 26, label %d17
+    ]
+  
+  default:                                          ; preds = %0
+    unreachable
+  
+  d1:                                               ; preds = %0
+    unreachable
+  
+  d2:                                               ; preds = %0
+    unreachable
+  
+  d3:                                               ; preds = %0
+    unreachable
+  
+  d4:                                               ; preds = %0
+    unreachable
+  
+  d5:                                               ; preds = %0
+    unreachable
+  
+  d6:                                               ; preds = %0
+    unreachable
+  
+  d7:                                               ; preds = %0
+    unreachable
+  
+  d8:                                               ; preds = %0
+    unreachable
+  
+  d9:                                               ; preds = %0
+    unreachable
+  
+  d10:                                              ; preds = %0
+    unreachable
+  
+  d11:                                              ; preds = %0
+    unreachable
+  
+  d12:                                              ; preds = %0
+    unreachable
+  
+  d13:                                              ; preds = %0
+    unreachable
+  
+  d14:                                              ; preds = %0
+    unreachable
+  
+  d15:                                              ; preds = %0
+    unreachable
+  
+  d16:                                              ; preds = %0
+    unreachable
+  
+  d17:                                              ; preds = %0
+    unreachable
+  }
+
+  define void @bar(i8 %in, i32* %addr) {
+      store i32 12345678, i32* %addr
+    %1 = zext i8 %in to i32
+    switch i32 %1, label %default [
+      i32 0, label %d1
+      i32 1, label %d2
+      i32 3, label %d3
+      i32 4, label %d4
+      i32 5, label %d5
+      i32 6, label %d6
+      i32 7, label %d7
+      i32 2, label %d8
+      i32 8, label %d9
+      i32 9, label %d10
+      i32 19, label %d11
+      i32 20, label %d12
+      i32 21, label %d13
+      i32 22, label %d14
+      i32 24, label %d15
+      i32 25, label %d16
+      i32 26, label %d17
+    ]
+  
+  default:                                          ; preds = %0
+    unreachable
+  
+  d1:                                               ; preds = %0
+    unreachable
+  
+  d2:                                               ; preds = %0
+    unreachable
+  
+  d3:                                               ; preds = %0
+    unreachable
+  
+  d4:                                               ; preds = %0
+    unreachable
+  
+  d5:                                               ; preds = %0
+    unreachable
+  
+  d6:                                               ; preds = %0
+    unreachable
+  
+  d7:                                               ; preds = %0
+    unreachable
+  
+  d8:                                               ; preds = %0
+    unreachable
+  
+  d9:                                               ; preds = %0
+    unreachable
+  
+  d10:                                              ; preds = %0
+    unreachable
+  
+  d11:                                              ; preds = %0
+    unreachable
+  
+  d12:                                              ; preds = %0
+    unreachable
+  
+  d13:                                              ; preds = %0
+    unreachable
+  
+  d14:                                              ; preds = %0
+    unreachable
+  
+  d15:                                              ; preds = %0
+    unreachable
+  
+  d16:                                              ; preds = %0
+    unreachable
+  
+  d17:                                              ; preds = %0
+    unreachable
+  }
+  
+  ; Function Attrs: nounwind
+  declare i32 @llvm.arm.space(i32, i32) #0
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #0
+  
+  attributes #0 = { nounwind }
+
+...
+---
+name:            foo
+alignment:       1
+exposesReturnsTwice: false
+noVRegs:         true
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+liveins:         
+  - { reg: '%r0' }
+  - { reg: '%r1' }
+frameInfo:       
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+constants:       
+  - id:              0
+    value:           i32 12345678
+    alignment:       4
+jumpTable:       
+  kind:            inline
+  entries:         
+    - id:              0
+      blocks:          [ '%bb.3.d2', '%bb.9.d8', '%bb.4.d3', '%bb.5.d4', 
+                         '%bb.6.d5', '%bb.7.d6', '%bb.8.d7', '%bb.10.d9', 
+                         '%bb.11.d10', '%bb.2.d1', '%bb.2.d1', '%bb.2.d1', 
+                         '%bb.2.d1', '%bb.2.d1', '%bb.2.d1', '%bb.2.d1', 
+                         '%bb.2.d1', '%bb.2.d1', '%bb.12.d11', '%bb.13.d12', 
+                         '%bb.14.d13', '%bb.15.d14', '%bb.2.d1', '%bb.16.d15', 
+                         '%bb.17.d16', '%bb.18.d17' ]
+body:             |
+  bb.0 (%ir-block.0):
+    successors: %bb.2.d1(0x03c3c3c4), %bb.1(0x7c3c3c3c)
+    liveins: %r0, %r1
+  
+    %r2 = tLDRpci %const.0, 14, _
+    tSTRi killed %r2, killed %r1, 0, 14, _ :: (store 4 into %ir.addr)
+    dead %r1 = SPACE 980, undef %r0
+    %r0 = tUXTB killed %r0, 14, _
+    %r1, dead %cpsr = tSUBi3 killed %r0, 1, 14, _
+    tCMPi8 %r1, 25, 14, _, implicit-def %cpsr
+    tBcc %bb.2.d1, 8, killed %cpsr
+  
+  bb.1 (%ir-block.0):
+    successors: %bb.3.d2(0x07c549d2), %bb.9.d8(0x07c549d2), %bb.4.d3(0x07c549d2), %bb.5.d4(0x07c549d2), %bb.6.d5(0x07c549d2), %bb.7.d6(0x07c549d2), %bb.8.d7(0x07c549d2), %bb.10.d9(0x07c549d2), %bb.11.d10(0x07c549d2), %bb.2.d1(0x03ab62db), %bb.12.d11(0x07c549d2), %bb.13.d12(0x07c549d2), %bb.14.d13(0x07c549d2), %bb.15.d14(0x07c549d2), %bb.16.d15(0x07c549d2), %bb.17.d16(0x07c549d2), %bb.18.d17(0x07c549d2)
+    liveins: %r1
+  
+    %r0, dead %cpsr = tLSLri killed %r1, 2, 14, _
+    %r1 = tLEApcrelJT %jump-table.0, 14, _
+    %r0 = tLDRr killed %r0, killed %r1, 14, _ :: (load 4 from jump-table)
+    tBR_JTr killed %r0, %jump-table.0
+  
+  bb.3.d2:
+  
+  bb.9.d8:
+  
+  bb.4.d3:
+  
+  bb.5.d4:
+  
+  bb.6.d5:
+  
+  bb.7.d6:
+  
+  bb.8.d7:
+  
+  bb.10.d9:
+  
+  bb.11.d10:
+  
+  bb.2.d1:
+  
+  bb.12.d11:
+  
+  bb.13.d12:
+  
+  bb.14.d13:
+  
+  bb.15.d14:
+  
+  bb.16.d15:
+  
+  bb.17.d16:
+  
+  bb.18.d17:
+
+...
+
+---
+name:            bar
+alignment:       1
+exposesReturnsTwice: false
+noVRegs:         true
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+liveins:         
+  - { reg: '%r0' }
+  - { reg: '%r1' }
+frameInfo:       
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+constants:       
+  - id:              0
+    value:           i32 12345678
+    alignment:       4
+jumpTable:       
+  kind:            inline
+  entries:         
+    - id:              0
+      blocks:          [ '%bb.3.d2', '%bb.9.d8', '%bb.4.d3', '%bb.5.d4', 
+                         '%bb.6.d5', '%bb.7.d6', '%bb.8.d7', '%bb.10.d9', 
+                         '%bb.11.d10', '%bb.2.d1', '%bb.2.d1', '%bb.2.d1', 
+                         '%bb.2.d1', '%bb.2.d1', '%bb.2.d1', '%bb.2.d1', 
+                         '%bb.2.d1', '%bb.2.d1', '%bb.12.d11', '%bb.13.d12', 
+                         '%bb.14.d13', '%bb.15.d14', '%bb.2.d1', '%bb.16.d15', 
+                         '%bb.17.d16', '%bb.18.d17' ]
+body:             |
+  bb.0 (%ir-block.0):
+    successors: %bb.2.d1(0x03c3c3c4), %bb.1(0x7c3c3c3c)
+    liveins: %r0, %r1
+  
+    %r2 = tLDRpci %const.0, 14, _
+    tSTRi killed %r2, killed %r1, 0, 14, _ :: (store 4 into %ir.addr)
+    %r0 = tUXTB killed %r0, 14, _
+    %r1, dead %cpsr = tSUBi3 killed %r0, 1, 14, _
+    tCMPi8 %r1, 25, 14, _, implicit-def %cpsr
+    tBcc %bb.2.d1, 8, killed %cpsr
+  
+  bb.1 (%ir-block.0):
+    successors: %bb.3.d2(0x07c549d2), %bb.9.d8(0x07c549d2), %bb.4.d3(0x07c549d2), %bb.5.d4(0x07c549d2), %bb.6.d5(0x07c549d2), %bb.7.d6(0x07c549d2), %bb.8.d7(0x07c549d2), %bb.10.d9(0x07c549d2), %bb.11.d10(0x07c549d2), %bb.2.d1(0x03ab62db), %bb.12.d11(0x07c549d2), %bb.13.d12(0x07c549d2), %bb.14.d13(0x07c549d2), %bb.15.d14(0x07c549d2), %bb.16.d15(0x07c549d2), %bb.17.d16(0x07c549d2), %bb.18.d17(0x07c549d2)
+    liveins: %r1
+  
+    %r0, dead %cpsr = tLSLri killed %r1, 2, 14, _
+    %r1 = tLEApcrelJT %jump-table.0, 14, _
+    %r0 = tLDRr killed %r0, killed %r1, 14, _ :: (load 4 from jump-table)
+    tBR_JTr killed %r0, %jump-table.0
+  
+  bb.3.d2:
+  
+  bb.9.d8:
+  
+  bb.4.d3:
+  
+  bb.5.d4:
+  
+  bb.6.d5:
+  
+  bb.7.d6:
+  
+  bb.8.d7:
+  
+  bb.10.d9:
+  
+  bb.11.d10:
+  
+  bb.2.d1:
+  
+  bb.12.d11:
+  
+  bb.13.d12:
+  
+  bb.14.d13:
+  
+  bb.15.d14:
+  
+  bb.16.d15:
+  
+  bb.17.d16:
+  
+  bb.18.d17:
+
+...
diff --git a/test/CodeGen/ARM/v8m-tail-call.ll b/test/CodeGen/ARM/v8m-tail-call.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2c2c795838ff4ce5188f07107ddfbacff315962e
--- /dev/null
+++ b/test/CodeGen/ARM/v8m-tail-call.ll
@@ -0,0 +1,23 @@
+; RUN: llc %s -o - -mtriple=thumbv8m.base | FileCheck %s
+
+define void @test() {
+; CHECK-LABEL: test:
+entry:
+  %call = tail call i32 @foo()
+  %tail = tail call i32 @foo()
+  ret void
+; CHECK: bl foo
+; CHECK: bl foo
+; CHECK-NOT: b foo
+}
+
+define void @test2() {
+; CHECK-LABEL: test2:
+entry:
+  %tail = tail call i32 @foo()
+  ret void
+; CHECK: b foo
+; CHECK-NOT: bl foo
+}
+
+declare i32 @foo()
diff --git a/test/CodeGen/ARM/v8m.base-jumptable_alignment.ll b/test/CodeGen/ARM/v8m.base-jumptable_alignment.ll
new file mode 100644
index 0000000000000000000000000000000000000000..673e04687a10ede1d2db27310967813e7ed22da9
--- /dev/null
+++ b/test/CodeGen/ARM/v8m.base-jumptable_alignment.ll
@@ -0,0 +1,51 @@
+; RUN: llc -filetype=obj -o /dev/null < %s
+; RUN: llc -filetype=asm < %s | FileCheck %s
+
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+source_filename = "bugpoint-output-39ed676.bc"
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv8m.base-arm-none-eabi"
+
+@crc32_tab = external unnamed_addr global [256 x i32], align 4
+@g_566 = external global i32**, align 4
+
+define void @main() {
+entry:
+  %0 = load volatile i32**, i32*** @g_566, align 4
+  br label %func_16.exit.i.i.i
+
+lbl_1394.i.i.i.loopexit:                          ; preds = %for.cond14.preheader.us.i.i.i
+  unreachable
+
+func_16.exit.i.i.i:                               ; preds = %entry
+  br i1 undef, label %for.cond7.preheader.i.lr.ph.i.i, label %for.end476.i.i.i.loopexit
+
+for.cond7.preheader.i.lr.ph.i.i:                  ; preds = %func_16.exit.i.i.i
+  br i1 undef, label %for.end476.i.i.i.loopexit, label %for.cond7.preheader.i.i.preheader.i
+
+for.cond7.preheader.i.i.preheader.i:              ; preds = %for.cond7.preheader.i.lr.ph.i.i
+  br label %for.cond14.preheader.us.i.i.i
+
+for.cond7.preheader.i.us.i.i:                     ; preds = %for.cond7.preheader.i.lr.ph.i.i
+  unreachable
+
+for.cond14.preheader.us.i.i.i:                    ; preds = %for.inc459.us.i.i.i, %for.cond7.preheader.i.i.preheader.i
+; CHECK: @ BB#4
+; CHECK-NEXT: .p2align 2
+  switch i4 undef, label %func_1.exit.loopexit [
+    i4 0, label %for.inc459.us.i.i.i
+    i4 -5, label %for.inc459.us.i.i.i
+    i4 2, label %lbl_1394.i.i.i.loopexit
+    i4 3, label %for.end476.i.i.i.loopexit
+  ]
+
+for.inc459.us.i.i.i:                              ; preds = %for.cond14.preheader.us.i.i.i, %for.cond14.preheader.us.i.i.i
+  br label %for.cond14.preheader.us.i.i.i
+
+for.end476.i.i.i.loopexit:                        ; preds = %for.cond14.preheader.us.i.i.i
+  unreachable
+
+func_1.exit.loopexit:                             ; preds = %for.cond14.preheader.us.i.i.i
+  %arrayidx.i63.i.i5252 = getelementptr inbounds [256 x i32], [256 x i32]* @crc32_tab, i32 0, i32 undef
+  unreachable
+}
diff --git a/test/CodeGen/ARM/va_arg.ll b/test/CodeGen/ARM/va_arg.ll
index d901a7461fc86fa735c7cbb231324b81d88a767b..57470694b124b9e8e6d3d30f5bf6323476ca7a28 100644
--- a/test/CodeGen/ARM/va_arg.ll
+++ b/test/CodeGen/ARM/va_arg.ll
@@ -4,8 +4,8 @@
 ; CHECK-LABEL: test1:
 ; CHECK-NOT: bfc
 ; CHECK: add	[[REG:(r[0-9]+)|(lr)]], {{(r[0-9]+)|(lr)}}, #7
-; CHECK: bfc	[[REG]], #0, #3
-; CHECK-NOT: bfc
+; CHECK: bic	{{(r[0-9]+)|(lr)}}, [[REG]], #7
+; CHECK-NOT: bic
 
 define i64 @test1(i32 %i, ...) nounwind optsize {
 entry:
@@ -20,8 +20,8 @@ entry:
 ; CHECK-LABEL: test2:
 ; CHECK-NOT: bfc
 ; CHECK: add	[[REG:(r[0-9]+)|(lr)]], {{(r[0-9]+)|(lr)}}, #7
-; CHECK: bfc	[[REG]], #0, #3
-; CHECK-NOT:	bfc
+; CHECK: bic	{{(r[0-9]+)|(lr)}}, [[REG]], #7
+; CHECK-NOT:	bic
 ; CHECK: bx	lr
 
 define double @test2(i32 %a, i32* %b, ...) nounwind optsize {
diff --git a/test/CodeGen/ARM/vcmp-crash.ll b/test/CodeGen/ARM/vcmp-crash.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2d3262be5849b9667f98ee678565b27b0a4ef3b6
--- /dev/null
+++ b/test/CodeGen/ARM/vcmp-crash.ll
@@ -0,0 +1,11 @@
+; RUN: llc -mcpu=cortex-m4 < %s | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "thumbv7em-none--eabi"
+
+; CHECK: vcmp.f32
+define double @f(double %a, double %b, double %c, float %d) {
+  %1 = fcmp oeq float %d, 0.0
+  %2 = select i1 %1, double %a, double %c
+  ret double %2
+}
diff --git a/test/CodeGen/ARM/vldm-liveness.ll b/test/CodeGen/ARM/vldm-liveness.ll
index e114e6970a324a6c844ff47e59da6208ab4e0f4c..63dc9d61ebcca471cbc084bbf0a93af6e991a559 100644
--- a/test/CodeGen/ARM/vldm-liveness.ll
+++ b/test/CodeGen/ARM/vldm-liveness.ll
@@ -1,26 +1,13 @@
 ; RUN: llc -mtriple thumbv7-apple-ios -verify-machineinstrs -o - %s | FileCheck %s
 
-; ARM load store optimizer was dealing with a sequence like:
-;     s1 = VLDRS [r0, 1], Q0<imp-def>
-;     s3 = VLDRS [r0, 2], Q0<imp-use,kill>, Q0<imp-def>
-;     s0 = VLDRS [r0, 0], Q0<imp-use,kill>, Q0<imp-def>
-;     s2 = VLDRS [r0, 4], Q0<imp-use,kill>, Q0<imp-def>
+; Make sure we emit the loads in ascending order, and form a vldmia.
 ;
-; It decided to combine the {s0, s1} loads into a single instruction in the
-; third position. However, this leaves the instruction defining s3 with a stray
-; imp-use of Q0, which is undefined.
-;
-; The verifier catches this, so this test just makes sure that appropriate
-; liveness flags are added.
-;
-; I believe the change will be tested as long as the vldmia is not the first of
-; the loads. Earlier optimisations may perturb the output over time, but
-; fiddling the indices should be sufficient to restore the test.
+; See vldm-liveness.mir for the bug this file was originally testing.
 
 define arm_aapcs_vfpcc <4 x float> @foo(float* %ptr) {
 ; CHECK-LABEL: foo:
-; CHECK: vldr s3, [r0, #8]
 ; CHECK: vldmia r0, {s0, s1}
+; CHECK: vldr s3, [r0, #8]
 ; CHECK: vldr s2, [r0, #16]
    %off0 = getelementptr float, float* %ptr, i32 0
    %val0 = load float, float* %off0
diff --git a/test/CodeGen/ARM/vldm-liveness.mir b/test/CodeGen/ARM/vldm-liveness.mir
new file mode 100644
index 0000000000000000000000000000000000000000..a85a018a8b1a5cf228c451b534a2eb89a7a7cbee
--- /dev/null
+++ b/test/CodeGen/ARM/vldm-liveness.mir
@@ -0,0 +1,40 @@
+# RUN: llc -run-pass arm-ldst-opt -verify-machineinstrs %s -o - | FileCheck %s
+# ARM load store optimizer was dealing with a sequence like:
+#     s1 = VLDRS [r0, 1], Q0<imp-def>
+#     s3 = VLDRS [r0, 2], Q0<imp-use,kill>, Q0<imp-def>
+#     s0 = VLDRS [r0, 0], Q0<imp-use,kill>, Q0<imp-def>
+#     s2 = VLDRS [r0, 4], Q0<imp-use,kill>, Q0<imp-def>
+#
+# It decided to combine the {s0, s1} loads into a single instruction in the
+# third position. However, this leaves the instruction defining s3 with a stray
+# imp-use of Q0, which is undefined.
+#
+# The verifier catches this, so this test just makes sure that appropriate
+# liveness flags are added.
+--- |
+  target triple = "thumbv7-apple-ios"
+  define arm_aapcs_vfpcc <4 x float> @foo(float* %ptr) {
+    ret <4 x float> undef
+  }
+...
+---
+name:            foo
+alignment:       1
+liveins:
+  - { reg: '%r0' }
+body:             |
+  bb.0 (%ir-block.0):
+    liveins: %r0
+
+    %s1 = VLDRS %r0, 1, 14, _, implicit-def %q0 :: (load 4)
+    %s3 = VLDRS %r0, 2, 14, _, implicit killed %q0, implicit-def %q0 :: (load 4)
+    ; CHECK: %s3 = VLDRS %r0, 2, 14, _, implicit killed undef %q0, implicit-def %q0 :: (load 4)
+
+    %s0 = VLDRS %r0, 0, 14, _, implicit killed %q0, implicit-def %q0 :: (load 4)
+    ; CHECK: VLDMSIA %r0, 14, _, def %s0, def %s1, implicit-def _
+
+    %s2 = VLDRS killed %r0, 4, 14, _, implicit killed %q0, implicit-def %q0 :: (load 4)
+    ; CHECK: %s2 = VLDRS killed %r0, 4, 14, _, implicit killed %q0, implicit-def %q0 :: (load 4)
+
+    tBX_RET 14, _, implicit %q0
+...
diff --git a/test/CodeGen/ARM/vsel.ll b/test/CodeGen/ARM/vsel.ll
index 746b1b000ef101a9e149a4d6c36eddfb98a2b462..daea41399b47c2c7a03ff13655e85b5b1b61b78c 100644
--- a/test/CodeGen/ARM/vsel.ll
+++ b/test/CodeGen/ARM/vsel.ll
@@ -132,7 +132,7 @@ define void @test_vsel32oeq(float %lhs32, float %rhs32, float %a, float %b) {
   %tst1 = fcmp oeq float %lhs32, %rhs32
   %val1 = select i1 %tst1, float %a, float %b
   store float %val1, float* @varfloat
-; CHECK: vcmpe.f32 s0, s1
+; CHECK: vcmp.f32 s0, s1
 ; CHECK: vseleq.f32 s0, s2, s3
   ret void
 }
@@ -141,7 +141,7 @@ define void @test_vsel64oeq(float %lhs32, float %rhs32, double %a, double %b) {
   %tst1 = fcmp oeq float %lhs32, %rhs32
   %val1 = select i1 %tst1, double %a, double %b
   store double %val1, double* @vardouble
-; CHECK: vcmpe.f32 s0, s1
+; CHECK: vcmp.f32 s0, s1
 ; CHECK: vseleq.f64 d16, d1, d2
   ret void
 }
@@ -276,7 +276,7 @@ define void @test_vsel32une(float %lhs32, float %rhs32, float %a, float %b) {
   %tst1 = fcmp une float %lhs32, %rhs32
   %val1 = select i1 %tst1, float %a, float %b
   store float %val1, float* @varfloat
-; CHECK: vcmpe.f32 s0, s1
+; CHECK: vcmp.f32 s0, s1
 ; CHECK: vseleq.f32 s0, s3, s2
   ret void
 }
@@ -285,7 +285,7 @@ define void @test_vsel64une(float %lhs32, float %rhs32, double %a, double %b) {
   %tst1 = fcmp une float %lhs32, %rhs32
   %val1 = select i1 %tst1, double %a, double %b
   store double %val1, double* @vardouble
-; CHECK: vcmpe.f32 s0, s1
+; CHECK: vcmp.f32 s0, s1
 ; CHECK: vseleq.f64 d16, d2, d1
   ret void
 }
diff --git a/test/CodeGen/ARM/vuzp.ll b/test/CodeGen/ARM/vuzp.ll
index a83a4df5490c30c28b2f6ec3c44de71157aef2ba..0a5235df319fecd4a21270d3b1488201b9ea1d49 100644
--- a/test/CodeGen/ARM/vuzp.ll
+++ b/test/CodeGen/ARM/vuzp.ll
@@ -318,33 +318,29 @@ entry:
   ret void
 }
 
-define <8 x i8> @vuzp_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
+define <8 x i8> @cmpsel_trunc(<8 x i8> %in0, <8 x i8> %in1, <8 x i32> %cmp0, <8 x i32> %cmp1) {
 ; In order to create the select we need to truncate the vcgt result from a vector of i32 to a vector of i8.
 ; This results in a build_vector with mismatched types. We will generate two vmovn.i32 instructions to
-; truncate from i32 to i16 and one vuzp to perform the final truncation for i8.
-; CHECK-LABEL: vuzp_trunc:
+; truncate from i32 to i16 and one vmovn.i16 to perform the final truncation for i8.
+; CHECK-LABEL: cmpsel_trunc:
 ; CHECK:       @ BB#0:
 ; CHECK-NEXT:    .save {r4, r5, r11, lr}
 ; CHECK-NEXT:    push {r4, r5, r11, lr}
-; CHECK-NEXT:    add r12, sp, #48
-; CHECK-NEXT:    add lr, sp, #16
 ; CHECK-NEXT:    add r4, sp, #64
 ; CHECK-NEXT:    add r5, sp, #32
+; CHECK-NEXT:    add r12, sp, #48
+; CHECK-NEXT:    add lr, sp, #16
 ; CHECK-NEXT:    vld1.64 {d16, d17}, [r5]
 ; CHECK-NEXT:    vld1.64 {d18, d19}, [r4]
 ; CHECK-NEXT:    vld1.64 {d20, d21}, [lr]
 ; CHECK-NEXT:    vld1.64 {d22, d23}, [r12]
 ; CHECK-NEXT:    vcgt.u32 q8, q9, q8
 ; CHECK-NEXT:    vcgt.u32 q9, q11, q10
-; CHECK-NEXT:    vmovn.i32 d16, q8
-; CHECK-NEXT:    vmovn.i32 d17, q9
-; CHECK-NEXT:    vmov.i8 d18, #0x7
-; CHECK-NEXT:    vmov d19, r0, r1
-; CHECK-NEXT:    vuzp.8 d17, d16
-; CHECK-NEXT:    vneg.s8 d16, d18
-; CHECK-NEXT:    vshl.i8 d17, d17, #7
+; CHECK-NEXT:    vmovn.i32 d17, q8
+; CHECK-NEXT:    vmovn.i32 d16, q9
 ; CHECK-NEXT:    vmov d18, r2, r3
-; CHECK-NEXT:    vshl.s8 d16, d17, d16
+; CHECK-NEXT:    vmov d19, r0, r1
+; CHECK-NEXT:    vmovn.i16 d16, q8
 ; CHECK-NEXT:    vbsl d16, d19, d18
 ; CHECK-NEXT:    vmov r0, r1, d16
 ; CHECK-NEXT:    pop {r4, r5, r11, lr}
diff --git a/test/CodeGen/AVR/inline-asm/inline-asm.ll b/test/CodeGen/AVR/inline-asm/inline-asm.ll
index 678395a3e5c4c205c42506fe1873951d08c37c1b..88d0c3af2e88511f3898d54d886755697becadb5 100644
--- a/test/CodeGen/AVR/inline-asm/inline-asm.ll
+++ b/test/CodeGen/AVR/inline-asm/inline-asm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=avr -mattr=movw | FileCheck %s
+; RUN: llc < %s -march=avr -mattr=movw -no-integrated-as | FileCheck %s
 
 ; CHECK-LABEL: no_operands:
 define void @no_operands() {
diff --git a/test/CodeGen/AVR/inline-asm/inline-asm2.ll b/test/CodeGen/AVR/inline-asm/inline-asm2.ll
index 083390999b8a97039dd36eb4c4e3e9af1a06c65c..74365b42c60e5ab89df18493deb10aca65d33f29 100644
--- a/test/CodeGen/AVR/inline-asm/inline-asm2.ll
+++ b/test/CodeGen/AVR/inline-asm/inline-asm2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=avr | FileCheck %s
+; RUN: llc < %s -march=avr -no-integrated-as | FileCheck %s
 
 ; CHECK-LABEL: foo
 define void @foo(i16 %a) {
diff --git a/test/CodeGen/AVR/inline-asm/multibyte.ll b/test/CodeGen/AVR/inline-asm/multibyte.ll
index 34cdf5d006e93ff63dbb9ac39c099c11e96c1b34..a7c8f6e75f0fbee7fe851f852f3fe35bff577ae1 100644
--- a/test/CodeGen/AVR/inline-asm/multibyte.ll
+++ b/test/CodeGen/AVR/inline-asm/multibyte.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=avr | FileCheck %s
+; RUN: llc < %s -march=avr -no-integrated-as | FileCheck %s
 ; XFAIL: *
 
 ; Multibyte references
diff --git a/test/CodeGen/AVR/intrinsics/stacksave-restore.ll b/test/CodeGen/AVR/intrinsics/stacksave-restore.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3985f49b92f59775aa1c0fdda714c67480e5a70a
--- /dev/null
+++ b/test/CodeGen/AVR/intrinsics/stacksave-restore.ll
@@ -0,0 +1,27 @@
+; RUN: llc -O0 < %s -march=avr | FileCheck %s
+
+; CHECK-LABEL: foo
+define void @foo() {
+entry:
+  br label %save
+
+; CHECK-LABEL: save
+; CHECK: in [[SREG1:r[0-9]+]], 61
+; CHECK-NEXT: in [[SREG2:r[0-9]+]], 62
+save:
+  %saved = call i8* @llvm.stacksave()
+  br label %restore
+
+; CHECK-LABEL: restore
+; CHECK: in r0, 63
+; CHECK-NEXT: cli
+; CHECK-NEXT: out 62, [[SREG2]]
+; CHECK-NEXT: out 63, r0
+; CHECK-NEXT: out 61, [[SREG1]]
+restore:
+  call void @llvm.stackrestore(i8* %saved)
+  ret void
+}
+
+declare i8* @llvm.stacksave()
+declare void @llvm.stackrestore(i8* %ptr)
diff --git a/test/CodeGen/AVR/no-print-operand-twice.ll b/test/CodeGen/AVR/no-print-operand-twice.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8326507768ba4e7fef492a12b6da096769dfc42f
--- /dev/null
+++ b/test/CodeGen/AVR/no-print-operand-twice.ll
@@ -0,0 +1,8 @@
+; RUN: llc -no-integrated-as -march=avr < %s | FileCheck %s
+
+define void @test() {
+entry:
+; CHECK: /* result: 68719476738 */
+  tail call void asm sideeffect "/* result: ${0:c} */", "i,~{dirflag},~{fpsr},~{flags}"( i64 68719476738 )
+  ret void
+}
diff --git a/test/CodeGen/AVR/pseudo/ADCWRdRr.mir b/test/CodeGen/AVR/pseudo/ADCWRdRr.mir
index 475d5b39299c1f5192019039fb3b29ea406c51e0..b1fc792d65946af03a3e44e1d10952ec955ef590 100644
--- a/test/CodeGen/AVR/pseudo/ADCWRdRr.mir
+++ b/test/CodeGen/AVR/pseudo/ADCWRdRr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit add with carry pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/ADDWRdRr.mir b/test/CodeGen/AVR/pseudo/ADDWRdRr.mir
index 2205febcc933c476b3d8f28bd899739670e72072..5743b1536330cd744666ae93ef16fbe6b4785317 100644
--- a/test/CodeGen/AVR/pseudo/ADDWRdRr.mir
+++ b/test/CodeGen/AVR/pseudo/ADDWRdRr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit add pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/ANDIWRdK.mir b/test/CodeGen/AVR/pseudo/ANDIWRdK.mir
index 5af8db1595195bb3a22d9e875cc29294f1c1a8fc..bcea4e6dfe2714c16e29cd5d0e4bd7fdc6928399 100644
--- a/test/CodeGen/AVR/pseudo/ANDIWRdK.mir
+++ b/test/CodeGen/AVR/pseudo/ANDIWRdK.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit ANDO pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/ANDWRdRr.mir b/test/CodeGen/AVR/pseudo/ANDWRdRr.mir
index c9458e9ba5d6897ae719f566cb26cd59d5ecf342..f6b060a5d734465c91448f98144a33923b69b53c 100644
--- a/test/CodeGen/AVR/pseudo/ANDWRdRr.mir
+++ b/test/CodeGen/AVR/pseudo/ANDWRdRr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit AND pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/ASRWRd.mir b/test/CodeGen/AVR/pseudo/ASRWRd.mir
index 3e809564ca1c0aa451c868baa90d9fab3895589b..5253dcd87f136d779181a579e65e71734c202551 100644
--- a/test/CodeGen/AVR/pseudo/ASRWRd.mir
+++ b/test/CodeGen/AVR/pseudo/ASRWRd.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 --- |
   target triple = "avr--"
diff --git a/test/CodeGen/AVR/pseudo/COMWRd.mir b/test/CodeGen/AVR/pseudo/COMWRd.mir
index 282d601686ad60b637a878d5b4aef9102096f3ee..58ff7af7cb3c630355e3868059e5431033dcc4e2 100644
--- a/test/CodeGen/AVR/pseudo/COMWRd.mir
+++ b/test/CodeGen/AVR/pseudo/COMWRd.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit COM pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/CPCWRdRr.mir b/test/CodeGen/AVR/pseudo/CPCWRdRr.mir
index 2081aa0b5ee4776ea1ddf0b171a3b712c94bec73..c0ab60e892918562550cc04ea63f9f08d1a6df52 100644
--- a/test/CodeGen/AVR/pseudo/CPCWRdRr.mir
+++ b/test/CodeGen/AVR/pseudo/CPCWRdRr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit CPCW pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/CPWRdRr.mir b/test/CodeGen/AVR/pseudo/CPWRdRr.mir
index 7e25e7fe22726dbce6e36aeff8e3f564d13abfeb..c93c99151a491fc62dc22adb04adbeb27fb7eec4 100644
--- a/test/CodeGen/AVR/pseudo/CPWRdRr.mir
+++ b/test/CodeGen/AVR/pseudo/CPWRdRr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit CPW pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/EORWRdRr.mir b/test/CodeGen/AVR/pseudo/EORWRdRr.mir
index 8769c12cbb11e1649379b72be5b0a41fb67458d2..de53c2d077edeb6b978314b37573e2c5d90f1415 100644
--- a/test/CodeGen/AVR/pseudo/EORWRdRr.mir
+++ b/test/CodeGen/AVR/pseudo/EORWRdRr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit EOR pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/FRMIDX.mir b/test/CodeGen/AVR/pseudo/FRMIDX.mir
index 47a9397fa6b0ea3651ecc63fd7d74e9f9298f93b..b56122a43adaa0be686169c92221876ad435c952 100644
--- a/test/CodeGen/AVR/pseudo/FRMIDX.mir
+++ b/test/CodeGen/AVR/pseudo/FRMIDX.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # TODO: Write this test.
 # This instruction isn't expanded by the pseudo expansion passs, but
diff --git a/test/CodeGen/AVR/pseudo/INWRdA.mir b/test/CodeGen/AVR/pseudo/INWRdA.mir
index a801598faddd10bc035aba51fc5ba16e6c29ea14..1b2d7fa0f539a5d03dc37417467fcf60b6fe4ad5 100644
--- a/test/CodeGen/AVR/pseudo/INWRdA.mir
+++ b/test/CodeGen/AVR/pseudo/INWRdA.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 --- |
   target triple = "avr--"
diff --git a/test/CodeGen/AVR/pseudo/LDDWRdPtrQ.mir b/test/CodeGen/AVR/pseudo/LDDWRdPtrQ.mir
index 781cb5d82433caf500b70cf5071a3f694871db53..5ff2ef1742e0e430d529374f9ea5c567ab9e8500 100644
--- a/test/CodeGen/AVR/pseudo/LDDWRdPtrQ.mir
+++ b/test/CodeGen/AVR/pseudo/LDDWRdPtrQ.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0  %s -o - 2>&1 -march=avr | FileCheck %s
+# RUN: llc -O0  %s -o - -march=avr | FileCheck %s
 
 # This test checks the expansion of the 16-bit 'LDDWRdPtrQ' pseudo instruction.
 
@@ -12,6 +12,7 @@
 
 ---
 name:            test_lddwrdptrq
+tracksRegLiveness: true
 body: |
   bb.0.entry:
 
@@ -20,5 +21,5 @@ body: |
     ; CHECK:      ldd     r30, Y+10
     ; CHECK-NEXT: ldd     r31, Y+11
 
-    early-clobber %r31r30 = LDDWRdPtrQ %r29r28, 10
+    early-clobber %r31r30 = LDDWRdPtrQ undef %r29r28, 10
 ...
diff --git a/test/CodeGen/AVR/pseudo/LDDWRdYQ.mir b/test/CodeGen/AVR/pseudo/LDDWRdYQ.mir
index 472f498b912c2228b8a1576f2da6b2beebe49671..831c75b38b17d5e68f399570a81c160643bc8e6d 100644
--- a/test/CodeGen/AVR/pseudo/LDDWRdYQ.mir
+++ b/test/CodeGen/AVR/pseudo/LDDWRdYQ.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0  %s -o - 2>&1 -march=avr | FileCheck %s
+# RUN: llc -O0  %s -o - -march=avr | FileCheck %s
 
 # This test checks the expansion of the 16-bit 'LDDWRdYQ instruction
 
@@ -12,6 +12,7 @@
 
 ---
 name:            test_lddwrdyq
+tracksRegLiveness: true
 body: |
   bb.0.entry:
 
@@ -20,5 +21,5 @@ body: |
     ; CHECK:      ldd     r30, Y+1
     ; CHECK-NEXT: ldd     r31, Y+2
 
-    early-clobber %r31r30 = LDDWRdYQ %r29r28, 1
+    early-clobber %r31r30 = LDDWRdYQ undef %r29r28, 1
 ...
diff --git a/test/CodeGen/AVR/pseudo/LDIWRdK.mir b/test/CodeGen/AVR/pseudo/LDIWRdK.mir
index 23d16d9c5692b47cf20ee277eba2704b7fbe8a8b..f4788adf20b47f6aa8a06c12a00c83a29dca5857 100644
--- a/test/CodeGen/AVR/pseudo/LDIWRdK.mir
+++ b/test/CodeGen/AVR/pseudo/LDIWRdK.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit LDIWRdK pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/LDSWRdK.mir b/test/CodeGen/AVR/pseudo/LDSWRdK.mir
index aa4883634d748b4041cfc1cacfce7f51b4da201a..b813923abcb2d9291798d381f7b2a18a7c258401 100644
--- a/test/CodeGen/AVR/pseudo/LDSWRdK.mir
+++ b/test/CodeGen/AVR/pseudo/LDSWRdK.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit LDSWRdK pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/LDWRdPtr.mir b/test/CodeGen/AVR/pseudo/LDWRdPtr.mir
index aaf9f182f2be5eb23725cf22e51728157b4fa770..6db615878b95f2e572bc33e3bb5503a489f2007c 100644
--- a/test/CodeGen/AVR/pseudo/LDWRdPtr.mir
+++ b/test/CodeGen/AVR/pseudo/LDWRdPtr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit LDWRdPtr pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/LDWRdPtrPd.mir b/test/CodeGen/AVR/pseudo/LDWRdPtrPd.mir
index f304cc220cbc91f5cd8153416d846baa31f36d5f..eb65c6538d110b0c36f511dee55b96b41ce602ab 100644
--- a/test/CodeGen/AVR/pseudo/LDWRdPtrPd.mir
+++ b/test/CodeGen/AVR/pseudo/LDWRdPtrPd.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit LDWRdPtrPd pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/LDWRdPtrPi.mir b/test/CodeGen/AVR/pseudo/LDWRdPtrPi.mir
index 9153be0bf1c9673e9086c42754efa5ed28e6f04e..50bad2a4c76538a2d1f727ec113cda87dbaed153 100644
--- a/test/CodeGen/AVR/pseudo/LDWRdPtrPi.mir
+++ b/test/CodeGen/AVR/pseudo/LDWRdPtrPi.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit LDWRdPtrPi pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/LSLWRd.mir b/test/CodeGen/AVR/pseudo/LSLWRd.mir
index 441939856aef90094cc06cf59065e932eb5fc864..537944866e5392007ac0ea7cfe09761585e72a63 100644
--- a/test/CodeGen/AVR/pseudo/LSLWRd.mir
+++ b/test/CodeGen/AVR/pseudo/LSLWRd.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 --- |
   target triple = "avr--"
diff --git a/test/CodeGen/AVR/pseudo/LSRWRd.mir b/test/CodeGen/AVR/pseudo/LSRWRd.mir
index f5ffb93f4035aeee80e1add64a08867b7eea3dff..a1a513f4e364f9a4b0ec462892d0ba1c51cd4efd 100644
--- a/test/CodeGen/AVR/pseudo/LSRWRd.mir
+++ b/test/CodeGen/AVR/pseudo/LSRWRd.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 --- |
   target triple = "avr--"
diff --git a/test/CodeGen/AVR/pseudo/ORIWRdK.mir b/test/CodeGen/AVR/pseudo/ORIWRdK.mir
index 92bc36769eb816bd56e15af6578bd2ccec8eb68a..d77a6ba88488193c335a09b8451d76ce508c5404 100644
--- a/test/CodeGen/AVR/pseudo/ORIWRdK.mir
+++ b/test/CodeGen/AVR/pseudo/ORIWRdK.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit OR pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/ORWRdRr.mir b/test/CodeGen/AVR/pseudo/ORWRdRr.mir
index f7a377ec860b293afa54642370da2f5e05005167..834c21cba8f9338b4b5123e190bac0fc76a9842a 100644
--- a/test/CodeGen/AVR/pseudo/ORWRdRr.mir
+++ b/test/CodeGen/AVR/pseudo/ORWRdRr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit OR pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/OUTWARr.mir b/test/CodeGen/AVR/pseudo/OUTWARr.mir
index 85e9f5259a871d9d4bf2f0998022603907a08690..99abad1c31b8215c9b519155c2d4fa09671aeca9 100644
--- a/test/CodeGen/AVR/pseudo/OUTWARr.mir
+++ b/test/CodeGen/AVR/pseudo/OUTWARr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 --- |
   target triple = "avr--"
diff --git a/test/CodeGen/AVR/pseudo/POPWRd.mir b/test/CodeGen/AVR/pseudo/POPWRd.mir
index 6794742bf54ab58726b9446895ad6fd8c6290475..8bd7fe68727c7b7f343ccdce634560cb1b42b2a5 100644
--- a/test/CodeGen/AVR/pseudo/POPWRd.mir
+++ b/test/CodeGen/AVR/pseudo/POPWRd.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 --- |
   target triple = "avr--"
diff --git a/test/CodeGen/AVR/pseudo/PUSHWRr.mir b/test/CodeGen/AVR/pseudo/PUSHWRr.mir
index 93920867030f75d7853179b2d95ba829081ab2d1..ec94ecbf5bb672743ff0ca650406b592bf7c3343 100644
--- a/test/CodeGen/AVR/pseudo/PUSHWRr.mir
+++ b/test/CodeGen/AVR/pseudo/PUSHWRr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 --- |
   target triple = "avr--"
diff --git a/test/CodeGen/AVR/pseudo/SBCIWRdK.mir b/test/CodeGen/AVR/pseudo/SBCIWRdK.mir
index 9152c6d9126651a5f8012f151d9288063b005113..644e6106ee7903d93285100b960983a363e6fc20 100644
--- a/test/CodeGen/AVR/pseudo/SBCIWRdK.mir
+++ b/test/CodeGen/AVR/pseudo/SBCIWRdK.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit subtraction with carry pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/SBCWRdRr.mir b/test/CodeGen/AVR/pseudo/SBCWRdRr.mir
index 9159906b76a06e45ef2f0964fbebe2904031364c..5cf5d33252c7fb42f3752ce12780f438d529c76a 100644
--- a/test/CodeGen/AVR/pseudo/SBCWRdRr.mir
+++ b/test/CodeGen/AVR/pseudo/SBCWRdRr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit subtraction with carry pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/SEXT.mir b/test/CodeGen/AVR/pseudo/SEXT.mir
index 069eb883dcc1b69a713a559ef44a90eb4abcda4d..0d10358c10e17d76188173a79cf983a819b48dbb 100644
--- a/test/CodeGen/AVR/pseudo/SEXT.mir
+++ b/test/CodeGen/AVR/pseudo/SEXT.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 --- |
   target triple = "avr--"
diff --git a/test/CodeGen/AVR/pseudo/STDWPtrQRr.mir b/test/CodeGen/AVR/pseudo/STDWPtrQRr.mir
index ff2fdb9155e1b9d1fd199605c09958f105962f5a..9252997d489e50f3a1a10bab3a69f12f274561b6 100644
--- a/test/CodeGen/AVR/pseudo/STDWPtrQRr.mir
+++ b/test/CodeGen/AVR/pseudo/STDWPtrQRr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo  %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo  %s -o - | FileCheck %s
 
 --- |
   target triple = "avr--"
diff --git a/test/CodeGen/AVR/pseudo/STSWKRr.mir b/test/CodeGen/AVR/pseudo/STSWKRr.mir
index ccf852271ae98925d9ce03adc227d4b8cdb7cb24..18f1018080949ed7a840f68f4d42ac0ac74861c9 100644
--- a/test/CodeGen/AVR/pseudo/STSWKRr.mir
+++ b/test/CodeGen/AVR/pseudo/STSWKRr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit STSWRdK pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/STWPtrPdRr.mir b/test/CodeGen/AVR/pseudo/STWPtrPdRr.mir
index 0d0d9e909e4af9e876ade8ca11186c0810677186..d884d2121c2ce8afba257f94f8c97c3154c39719 100644
--- a/test/CodeGen/AVR/pseudo/STWPtrPdRr.mir
+++ b/test/CodeGen/AVR/pseudo/STWPtrPdRr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 --- |
   target triple = "avr--"
diff --git a/test/CodeGen/AVR/pseudo/STWPtrPiRr.mir b/test/CodeGen/AVR/pseudo/STWPtrPiRr.mir
index a436d9b109bbc1307c4ee6f6c78b0e453243c0ed..962776aa6330c18046c7a651b78dd1150ba78627 100644
--- a/test/CodeGen/AVR/pseudo/STWPtrPiRr.mir
+++ b/test/CodeGen/AVR/pseudo/STWPtrPiRr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 --- |
   target triple = "avr--"
diff --git a/test/CodeGen/AVR/pseudo/STWPtrRr.mir b/test/CodeGen/AVR/pseudo/STWPtrRr.mir
index f85f4f8a0452a1a385410b7c941fd480ebeeb0c8..efed707bfe8aa020e9fa4bd31b07f426f00ddb18 100644
--- a/test/CodeGen/AVR/pseudo/STWPtrRr.mir
+++ b/test/CodeGen/AVR/pseudo/STWPtrRr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit STSWRdK pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/SUBIWRdK.mir b/test/CodeGen/AVR/pseudo/SUBIWRdK.mir
index 95c68c0a122abf256d1eab7be8fe6bce66da042d..c7d88d7ab3f68860200b713af17451880bb4e863 100644
--- a/test/CodeGen/AVR/pseudo/SUBIWRdK.mir
+++ b/test/CodeGen/AVR/pseudo/SUBIWRdK.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit subtraction pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/SUBWRdRr.mir b/test/CodeGen/AVR/pseudo/SUBWRdRr.mir
index 9892cf5b7f3313126c7017c50bd45b34d9f98dde..b12b0e5349e2098b26ba586ea2618335f051bec3 100644
--- a/test/CodeGen/AVR/pseudo/SUBWRdRr.mir
+++ b/test/CodeGen/AVR/pseudo/SUBWRdRr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 # This test checks the expansion of the 16-bit subtraction pseudo instruction.
 
diff --git a/test/CodeGen/AVR/pseudo/ZEXT.mir b/test/CodeGen/AVR/pseudo/ZEXT.mir
index 069eb883dcc1b69a713a559ef44a90eb4abcda4d..0d10358c10e17d76188173a79cf983a819b48dbb 100644
--- a/test/CodeGen/AVR/pseudo/ZEXT.mir
+++ b/test/CodeGen/AVR/pseudo/ZEXT.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-expand-pseudo %s -o - | FileCheck %s
 
 --- |
   target triple = "avr--"
diff --git a/test/CodeGen/AVR/pseudo/expand-lddw-dst-src-same.mir b/test/CodeGen/AVR/pseudo/expand-lddw-dst-src-same.mir
index 5ed95ad76a7fee276b1ef96040fb1647a575ea9b..8427a2bfb4edf2b3efaa668e5ff04ba9aded8c26 100644
--- a/test/CodeGen/AVR/pseudo/expand-lddw-dst-src-same.mir
+++ b/test/CodeGen/AVR/pseudo/expand-lddw-dst-src-same.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 %s -o - 2>&1 -march=avr | FileCheck %s
+# RUN: llc -O0 %s -o - -march=avr | FileCheck %s
 
 # This test ensures that the pseudo expander can correctly handle the case
 # where we are expanding a 16-bit LDD instruction where the source and
@@ -18,6 +18,7 @@
 ...
 ---
 name:            test_lddw
+tracksRegLiveness: true
 stack:
   - { id: 0, type: spill-slot, offset: -4, size: 1, alignment: 1, callee-saved-register: '%r28' }
 body:             |
diff --git a/test/CodeGen/AVR/relax-mem/STDWPtrQRr.mir b/test/CodeGen/AVR/relax-mem/STDWPtrQRr.mir
index b43c775083285cc98c426992e56a7af533fbb330..7421bd4c4e81c2187ff6a7182e75a81d23a4d7ce 100644
--- a/test/CodeGen/AVR/relax-mem/STDWPtrQRr.mir
+++ b/test/CodeGen/AVR/relax-mem/STDWPtrQRr.mir
@@ -1,4 +1,4 @@
-# RUN: llc -O0 -run-pass=avr-relax-mem %s -o - 2>&1 | FileCheck %s
+# RUN: llc -O0 -run-pass=avr-relax-mem %s -o - | FileCheck %s
 
 --- |
   target triple = "avr--"
diff --git a/test/CodeGen/BPF/undef.ll b/test/CodeGen/BPF/undef.ll
index 541d81ea07b738188823f506b5309337bc66e751..de14bfde1ab97a12a2041c8cd689fffda6a393fa 100644
--- a/test/CodeGen/BPF/undef.ll
+++ b/test/CodeGen/BPF/undef.ll
@@ -13,50 +13,55 @@
 
 ; Function Attrs: nounwind uwtable
 define i32 @ebpf_filter(%struct.__sk_buff* nocapture readnone %ebpf_packet) #0 section "socket1" {
+; CHECK: r2 = r10
+; CHECK: r2 += -2
+; CHECK: r1 = 0
+; CHECK: *(u16 *)(r2 + 6) = r1
+; CHECK: *(u16 *)(r2 + 4) = r1
+; CHECK: *(u16 *)(r2 + 2) = r1
+; CHECK: r2 = 6
+; CHECK: *(u8 *)(r10 - 7) = r2
+; CHECK: r2 = 5
+; CHECK: *(u8 *)(r10 - 8) = r2
+; CHECK: r2 = 7
+; CHECK: *(u8 *)(r10 - 6) = r2
+; CHECK: r2 = 8
+; CHECK: *(u8 *)(r10 - 5) = r2
+; CHECK: r2 = 9
+; CHECK: *(u8 *)(r10 - 4) = r2
+; CHECK: r2 = 10
+; CHECK: *(u8 *)(r10 - 3) = r2
+; CHECK: *(u16 *)(r10 + 24) = r1
+; CHECK: *(u16 *)(r10 + 22) = r1
+; CHECK: *(u16 *)(r10 + 20) = r1
+; CHECK: *(u16 *)(r10 + 18) = r1
+; CHECK: *(u16 *)(r10 + 16) = r1
+; CHECK: *(u16 *)(r10 + 14) = r1
+; CHECK: *(u16 *)(r10 + 12) = r1
+; CHECK: *(u16 *)(r10 + 10) = r1
+; CHECK: *(u16 *)(r10 + 8) = r1
+; CHECK: *(u16 *)(r10 + 6) = r1
+; CHECK: *(u16 *)(r10 - 2) = r1
+; CHECK: *(u16 *)(r10 + 26) = r1
+; CHECK: r2 = r10
+; CHECK: r2 += -8
+; CHECK: r1 = <MCOperand Expr:(routing)>ll
+; CHECK: call bpf_map_lookup_elem
+; CHECK: exit
   %key = alloca %struct.routing_key_2, align 1
   %1 = getelementptr inbounds %struct.routing_key_2, %struct.routing_key_2* %key, i64 0, i32 0, i64 0
-; CHECK: r1 = 5
-; CHECK: *(u8 *)(r10 - 8) = r1
   store i8 5, i8* %1, align 1
   %2 = getelementptr inbounds %struct.routing_key_2, %struct.routing_key_2* %key, i64 0, i32 0, i64 1
-; CHECK: r1 = 6
-; CHECK: *(u8 *)(r10 - 7) = r1
   store i8 6, i8* %2, align 1
   %3 = getelementptr inbounds %struct.routing_key_2, %struct.routing_key_2* %key, i64 0, i32 0, i64 2
-; CHECK: r1 = 7
-; CHECK: *(u8 *)(r10 - 6) = r1
   store i8 7, i8* %3, align 1
   %4 = getelementptr inbounds %struct.routing_key_2, %struct.routing_key_2* %key, i64 0, i32 0, i64 3
-; CHECK: r1 = 8
-; CHECK: *(u8 *)(r10 - 5) = r1
   store i8 8, i8* %4, align 1
   %5 = getelementptr inbounds %struct.routing_key_2, %struct.routing_key_2* %key, i64 0, i32 0, i64 4
-; CHECK: r1 = 9
-; CHECK: *(u8 *)(r10 - 4) = r1
   store i8 9, i8* %5, align 1
   %6 = getelementptr inbounds %struct.routing_key_2, %struct.routing_key_2* %key, i64 0, i32 0, i64 5
-; CHECK: r1 = 10
-; CHECK: *(u8 *)(r10 - 3) = r1
   store i8 10, i8* %6, align 1
   %7 = getelementptr inbounds %struct.routing_key_2, %struct.routing_key_2* %key, i64 1, i32 0, i64 0
-; CHECK: r1 = r10
-; CHECK: r1 += -2
-; CHECK: r2 = 0
-; CHECK: *(u16 *)(r1 + 6) = r2
-; CHECK: *(u16 *)(r1 + 4) = r2
-; CHECK: *(u16 *)(r1 + 2) = r2
-; CHECK: *(u16 *)(r10 + 24) = r2
-; CHECK: *(u16 *)(r10 + 22) = r2
-; CHECK: *(u16 *)(r10 + 20) = r2
-; CHECK: *(u16 *)(r10 + 18) = r2
-; CHECK: *(u16 *)(r10 + 16) = r2
-; CHECK: *(u16 *)(r10 + 14) = r2
-; CHECK: *(u16 *)(r10 + 12) = r2
-; CHECK: *(u16 *)(r10 + 10) = r2
-; CHECK: *(u16 *)(r10 + 8) = r2
-; CHECK: *(u16 *)(r10 + 6) = r2
-; CHECK: *(u16 *)(r10 - 2) = r2
-; CHECK: *(u16 *)(r10 + 26) = r2
   call void @llvm.memset.p0i8.i64(i8* %7, i8 0, i64 30, i32 1, i1 false)
   %8 = call i32 (%struct.bpf_map_def*, %struct.routing_key_2*, ...) bitcast (i32 (...)* @bpf_map_lookup_elem to i32 (%struct.bpf_map_def*, %struct.routing_key_2*, ...)*)(%struct.bpf_map_def* nonnull @routing, %struct.routing_key_2* nonnull %key) #3
   ret i32 undef
diff --git a/test/CodeGen/BPF/warn-stack.ll b/test/CodeGen/BPF/warn-stack.ll
index b7992960b73e41f30abe83e32e191e6b9fb22c79..5a579d28554adcfa89e6b42e8f1b262c1010caad 100644
--- a/test/CodeGen/BPF/warn-stack.ll
+++ b/test/CodeGen/BPF/warn-stack.ll
@@ -4,15 +4,15 @@
 define void @nowarn() local_unnamed_addr #0 !dbg !6 {
   %1 = alloca [504 x i8], align 1
   %2 = getelementptr inbounds [504 x i8], [504 x i8]* %1, i64 0, i64 0, !dbg !15
-  call void @llvm.lifetime.start(i64 504, i8* nonnull %2) #4, !dbg !15
+  call void @llvm.lifetime.start.p0i8(i64 504, i8* nonnull %2) #4, !dbg !15
   tail call void @llvm.dbg.declare(metadata [504 x i8]* %1, metadata !10, metadata !16), !dbg !17
   call void @doit(i8* nonnull %2) #4, !dbg !18
-  call void @llvm.lifetime.end(i64 504, i8* nonnull %2) #4, !dbg !19
+  call void @llvm.lifetime.end.p0i8(i64 504, i8* nonnull %2) #4, !dbg !19
   ret void, !dbg !19
 }
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
@@ -20,17 +20,17 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
 declare void @doit(i8*) local_unnamed_addr #3
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 ; CHECK: error: warn_stack.c
 ; CHECK: BPF stack limit
 define void @warn() local_unnamed_addr #0 !dbg !20 {
   %1 = alloca [512 x i8], align 1
   %2 = getelementptr inbounds [512 x i8], [512 x i8]* %1, i64 0, i64 0, !dbg !26
-  call void @llvm.lifetime.start(i64 512, i8* nonnull %2) #4, !dbg !26
+  call void @llvm.lifetime.start.p0i8(i64 512, i8* nonnull %2) #4, !dbg !26
   tail call void @llvm.dbg.declare(metadata [512 x i8]* %1, metadata !22, metadata !16), !dbg !27
   call void @doit(i8* nonnull %2) #4, !dbg !28
-  call void @llvm.lifetime.end(i64 512, i8* nonnull %2) #4, !dbg !29
+  call void @llvm.lifetime.end.p0i8(i64 512, i8* nonnull %2) #4, !dbg !29
   ret void, !dbg !29
 }
 
diff --git a/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll b/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll
index 010c0c5536380b05223fdb081b7fe821f48e302a..9e4664ad69c9b4b8309e3539c627613b8a238022 100644
--- a/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll
+++ b/test/CodeGen/Generic/2003-07-29-BadConstSbyte.ll
@@ -1,5 +1,8 @@
 ; RUN: llc < %s
 
+; Bug: PR31341
+; XFAIL: avr
+
 ;; Date:     Jul 29, 2003.
 ;; From:     test/Programs/MultiSource/Ptrdist-bc
 ;; Function: ---
diff --git a/test/CodeGen/Generic/2007-04-08-MultipleFrameIndices.ll b/test/CodeGen/Generic/2007-04-08-MultipleFrameIndices.ll
index 21c05f17a7c5356debb18f6a18c35c882455e1dc..e961ea764ec28dfa2d959d1aee43a338ac97314f 100644
--- a/test/CodeGen/Generic/2007-04-08-MultipleFrameIndices.ll
+++ b/test/CodeGen/Generic/2007-04-08-MultipleFrameIndices.ll
@@ -3,6 +3,9 @@
 ; PR1308
 ; PR1557
 
+; Bug: PR31336
+; XFAIL: avr
+
 define i32 @stuff(i32, ...) {
         %foo = alloca i8*
         %bar = alloca i32*
diff --git a/test/CodeGen/Generic/2007-12-17-InvokeAsm.ll b/test/CodeGen/Generic/2007-12-17-InvokeAsm.ll
index fe7f463159a5cfda5dda438fd5f7275d05d17297..bb8058575c8295232e3c47c71c3b89368682ed61 100644
--- a/test/CodeGen/Generic/2007-12-17-InvokeAsm.ll
+++ b/test/CodeGen/Generic/2007-12-17-InvokeAsm.ll
@@ -1,5 +1,7 @@
 ; RUN: llc -no-integrated-as < %s
 
+; XFAIL: avr
+
 define fastcc void @bc__support__high_resolution_time__initialize_clock_rate() personality i32 (...)* @__gxx_personality_v0 {
 entry:
   invoke void asm "rdtsc\0A\09movl %eax, $0\0A\09movl %edx, $1", "=*imr,=*imr,~{dirflag},~{fpsr},~{flags},~{dx},~{ax}"( i32* null, i32* null )
diff --git a/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll b/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll
index 5cc48c212c4081e06551a973b558525d3683fcd3..a9a33d72bca258e80f9d80da2ed3260399bc7b44 100644
--- a/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll
+++ b/test/CodeGen/Generic/2011-07-07-ScheduleDAGCrash.ll
@@ -1,4 +1,8 @@
 ; RUN: llc < %s
+
+; Bug: PR31898
+; XFAIL: avr
+
 ; This caused ScheduleDAG to crash in EmitPhysRegCopy when searching
 ; the uses of a copy to a physical register without ignoring non-data
 ; dependence, PR10220.
diff --git a/test/CodeGen/Generic/MachineBranchProb.ll b/test/CodeGen/Generic/MachineBranchProb.ll
index 921fa62c1c4358d512f8f00fd57b4ca3a2d75475..804e5b0ce9fca953011686521d946dfefaaf2584 100644
--- a/test/CodeGen/Generic/MachineBranchProb.ll
+++ b/test/CodeGen/Generic/MachineBranchProb.ll
@@ -1,12 +1,12 @@
 ; RUN: llc < %s -print-machineinstrs=expand-isel-pseudos -o /dev/null 2>&1 | FileCheck %s
 
-; ARM & AArch64 run an extra SimplifyCFG which disrupts this test.
-; XFAIL: arm,aarch64
-
 ; Hexagon runs passes that renumber the basic blocks, causing this test
 ; to fail.
 ; XFAIL: hexagon
 
+; Bug: PR31899
+; XFAIL: avr
+
 ; Make sure we have the correct weight attached to each successor.
 define i32 @test2(i32 %x) nounwind uwtable readnone ssp {
 ; CHECK-LABEL: Machine code for function test2:
diff --git a/test/CodeGen/Generic/externally_available.ll b/test/CodeGen/Generic/externally_available.ll
index 7976cc971880f117b49f248885808291d77c93a8..2376bc7399277c40a7fc1973c831aab20d190c9f 100644
--- a/test/CodeGen/Generic/externally_available.ll
+++ b/test/CodeGen/Generic/externally_available.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | not grep test_
+; RUN: llc -verify-machine-dom-info < %s | not grep test_
 
 ; test_function should not be emitted to the .s file.
 define available_externally i32 @test_function() {
diff --git a/test/CodeGen/Generic/icmp-illegal.ll b/test/CodeGen/Generic/icmp-illegal.ll
index 23d20c04652fbeb90c5cc7f939a91d39eb597ad7..77dd5a59dfd1e4cf3d8aea717db631830ac0fac5 100644
--- a/test/CodeGen/Generic/icmp-illegal.ll
+++ b/test/CodeGen/Generic/icmp-illegal.ll
@@ -1,4 +1,3 @@
-
 ; RUN: llc < %s | FileCheck %s
 
 ; CHECK-LABEL: test_ult
diff --git a/test/CodeGen/Generic/inline-asm-mem-clobber.ll b/test/CodeGen/Generic/inline-asm-mem-clobber.ll
index be1e0a39b3b0dbcf265e4b60139418447c20befe..6184f803b71f6eb45f568cd47d409718ff725772 100644
--- a/test/CodeGen/Generic/inline-asm-mem-clobber.ll
+++ b/test/CodeGen/Generic/inline-asm-mem-clobber.ll
@@ -1,5 +1,8 @@
 ; RUN: llc -O2 -no-integrated-as < %s | FileCheck %s
 
+; Test uses 32-bit registers which aren't supported on AVR.
+; XFAIL: avr
+
 @G = common global i32 0, align 4
 
 define i32 @foo(i8* %p) nounwind uwtable {
diff --git a/test/CodeGen/Generic/overloaded-intrinsic-name.ll b/test/CodeGen/Generic/overloaded-intrinsic-name.ll
index 65fc9c1184cf1f811de076f9298fef967c9af050..89a5f80779911b3fc9837b87a8406e10b9ba38fe 100644
--- a/test/CodeGen/Generic/overloaded-intrinsic-name.ll
+++ b/test/CodeGen/Generic/overloaded-intrinsic-name.ll
@@ -1,4 +1,4 @@
-; RUN: opt -verify -S < %s
+; RUN: opt -verify -S < %s | FileCheck %s
 
 ; Tests the name mangling performed by the codepath following
 ; getMangledTypeStr(). Only tests that code with the various manglings
@@ -44,14 +44,43 @@ define <3 x i32>* @test_vAny(<3 x i32>* %v) gc "statepoint-example" {
 ; struct
 define %struct.test* @test_struct(%struct.test* %v) gc "statepoint-example" {
        %tok = call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0, %struct.test* %v)
-       %v-new = call %struct.test* @llvm.experimental.gc.relocate.p0struct.test(token %tok,  i32 7, i32 7)
+       %v-new = call %struct.test* @llvm.experimental.gc.relocate.p0s_struct.tests(token %tok,  i32 7, i32 7)
        ret %struct.test* %v-new
 }
 
+; literal struct with nested literal struct
+define {i64, i64, {i64} }* @test_literal_struct({i64, i64, {i64}}* %v) gc "statepoint-example" {
+       %tok = call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0, {i64, i64, {i64}} *%v)
+       %v-new = call {i64, i64, {i64}}* @llvm.experimental.gc.relocate.p0sl_i64i64sl_i64ss.test(token %tok,  i32 7, i32 7)
+       ret {i64, i64, {i64}}* %v-new
+}
+; struct with a horrible name (%i32); this broke when struct names were unprefixed
+%i32 = type { i32 }
+
+define %i32* @test_i32_struct(%i32* %v) gc "statepoint-example" {
+entry:
+      %tok = call token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.p0f_i1f(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0, %i32* %v)
+      %v-new = call %i32* @llvm.experimental.gc.relocate.p0s_i32s(token %tok,  i32 7, i32 7)
+      ret %i32* %v-new
+}
+; Completely broken intrinsic names that need remangling. The suffixes used here are arbitrary.
+
+define %i32* @test_broken_names(%i32* %v) gc "statepoint-example" {
+entry:
+      %tok = call fastcc token (i64, i32, i1 ()*, i32, i32, ...) @llvm.experimental.gc.statepoint.deadbeef(i64 0, i32 0, i1 ()* @return_i1, i32 0, i32 0, i32 0, i32 0, %i32* %v)
+; Make sure we do not destroy the calling convention when remangling
+; CHECK: fastcc
+      %v-new = call %i32* @llvm.experimental.gc.relocate.beefdead(token %tok,  i32 7, i32 7)
+      ret %i32* %v-new
+}
 declare zeroext i1 @return_i1()
 declare token @llvm.experimental.gc.statepoint.p0f_i1f(i64, i32, i1 ()*, i32, i32, ...)
 declare i32* @llvm.experimental.gc.relocate.p0i32(token, i32, i32)
 declare float* @llvm.experimental.gc.relocate.p0f32(token, i32, i32)
 declare [3 x i32]* @llvm.experimental.gc.relocate.p0a3i32(token, i32, i32)
 declare <3 x i32>* @llvm.experimental.gc.relocate.p0v3i32(token, i32, i32)
-declare %struct.test* @llvm.experimental.gc.relocate.p0struct.test(token, i32, i32)
+declare %struct.test* @llvm.experimental.gc.relocate.p0s_struct.tests(token, i32, i32)
+declare {i64, i64, {i64}}* @llvm.experimental.gc.relocate.p0sl_i64i64sl_i64ss.test(token, i32, i32)
+declare %i32* @llvm.experimental.gc.relocate.p0s_i32s(token, i32, i32)
+declare %i32* @llvm.experimental.gc.relocate.beefdead(token, i32, i32)
+declare token @llvm.experimental.gc.statepoint.deadbeef(i64, i32, i1 ()*, i32, i32, ...)
diff --git a/test/CodeGen/Generic/pr24662.ll b/test/CodeGen/Generic/pr24662.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5a10b9cb0acb444f9d64f6d9bef1a29aca361468
--- /dev/null
+++ b/test/CodeGen/Generic/pr24662.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -fast-isel
+; RUN: llc < %s
+
+define i60 @PR24662a() {
+  ret i60 trunc (i670010 fptoui(float 0x400D9999A0000000 to i670010) to i60)
+}
+
+define i60 @PR24662b() {
+  %1 = fptoui float 0x400D9999A0000000 to i670010
+  %2 = trunc i670010 %1 to i60
+  ret i60 %2
+}
diff --git a/test/CodeGen/Generic/select-cc.ll b/test/CodeGen/Generic/select-cc.ll
index 7510f701b147814588deb6284647e47c3cc09834..b5d2f785dc2ca0237cf64e7997551e402c0ebfcd 100644
--- a/test/CodeGen/Generic/select-cc.ll
+++ b/test/CodeGen/Generic/select-cc.ll
@@ -1,6 +1,11 @@
 ; RUN: llc < %s
+
 ; PR2504
 ; XFAIL: hexagon
+
+; PR31338
+; XFAIL: avr
+
 define <2 x double> @vector_select(<2 x double> %x, <2 x double> %y) nounwind  {
 	%x.lo = extractelement <2 x double> %x, i32 0		; <double> [#uses=1]
 	%x.lo.ge = fcmp oge double %x.lo, 0.000000e+00		; <i1> [#uses=1]
diff --git a/test/CodeGen/Generic/v-split.ll b/test/CodeGen/Generic/v-split.ll
index 00c62f38952057cd7cec1e8ed130e5329c7a0739..91aece94fecd4e2f203841ca80327e58a5d8af68 100644
--- a/test/CodeGen/Generic/v-split.ll
+++ b/test/CodeGen/Generic/v-split.ll
@@ -1,4 +1,8 @@
 ; RUN: llc < %s
+
+; Bug: PR31898
+; XFAIL: avr
+
 %f8 = type <8 x float>
 
 define void @test_f8(%f8 *%P, %f8* %Q, %f8 *%S) {
diff --git a/test/CodeGen/Generic/vector-redux.ll b/test/CodeGen/Generic/vector-redux.ll
index 8efdbf85b8c04a05212d50c81cb6683c83b1b601..64562d6d949042bc1b2bafa9ceda51683611412b 100644
--- a/test/CodeGen/Generic/vector-redux.ll
+++ b/test/CodeGen/Generic/vector-redux.ll
@@ -1,6 +1,9 @@
 ; RUN: llc < %s -debug-only=isel -o /dev/null 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
+; Bug: PR31898
+; XFAIL: avr
+
 @a = global [1024 x i32] zeroinitializer, align 16
 
 define i32 @reduce_add() {
diff --git a/test/CodeGen/Generic/vector.ll b/test/CodeGen/Generic/vector.ll
index 2d4dc501a53abe51a77426b70a0c81c220277239..9c0cacdcd8788997fb807c0d8dfb8e3f945ca388 100644
--- a/test/CodeGen/Generic/vector.ll
+++ b/test/CodeGen/Generic/vector.ll
@@ -1,6 +1,9 @@
 ; Test that vectors are scalarized/lowered correctly.
 ; RUN: llc < %s
 
+; Bug: PR31898
+; XFAIL: avr
+
 %d8 = type <8 x double>
 %f1 = type <1 x float>
 %f2 = type <2 x float>
diff --git a/test/CodeGen/Hexagon/BranchPredict.ll b/test/CodeGen/Hexagon/BranchPredict.ll
index 17d169974e5acc4d88dda17a771475ab23b2e373..40791c981483df42d7cfd6ec7f79ea3c8a993203 100644
--- a/test/CodeGen/Hexagon/BranchPredict.ll
+++ b/test/CodeGen/Hexagon/BranchPredict.ll
@@ -9,7 +9,7 @@
 @j = external global i32
 
 define i32 @foo(i32 %a) nounwind {
-; CHECK: if{{ *}}(!p{{[0-3]}}.new) jump:nt
+; CHECK: if (!p{{[0-3]}}.new) jump:nt
 entry:
   %tobool = icmp eq i32 %a, 0
   br i1 %tobool, label %if.else, label %if.then, !prof !0
@@ -31,7 +31,7 @@ return:                                           ; preds = %if.else, %if.then
 declare i32 @foobar(...)
 
 define i32 @bar(i32 %a) nounwind {
-; CHECK: if{{ *}}(p{{[0-3]}}.new) jump:nt
+; CHECK: if (p{{[0-3]}}.new) jump:nt
 entry:
   %tobool = icmp eq i32 %a, 0
   br i1 %tobool, label %if.else, label %if.then, !prof !1
@@ -51,7 +51,7 @@ return:                                           ; preds = %if.else, %if.then
 }
 
 define i32 @foo_bar(i32 %a, i16 signext %b) nounwind {
-; CHECK: if{{ *}}(!cmp.eq(r{{[0-9]*}}.new, #0)) jump:nt
+; CHECK: if (!cmp.eq(r{{[0-9]*}}.new,#0)) jump:nt
 entry:
   %0 = load i32, i32* @j, align 4
   %tobool = icmp eq i32 %0, 0
diff --git a/test/CodeGen/Hexagon/adde.ll b/test/CodeGen/Hexagon/adde.ll
index 43ddb4307ef260048feea19bd8b747b26426a024..12913eea7e816ec6ab5e06e8abd991b1e5b4786b 100644
--- a/test/CodeGen/Hexagon/adde.ll
+++ b/test/CodeGen/Hexagon/adde.ll
@@ -1,34 +1,27 @@
-; RUN: llc -march=hexagon -disable-hsdr -hexagon-expand-condsets=0 -hexagon-bit=0 -disable-post-ra < %s | FileCheck %s
+; RUN: llc -march=hexagon -hexagon-expand-condsets=0 < %s | FileCheck %s
 
-; CHECK: r{{[0-9]+:[0-9]+}} = combine(#0, #1)
-; CHECK: r{{[0-9]+:[0-9]+}} = combine(#0, #0)
-; CHECK: r{{[0-9]+:[0-9]+}} = add(r{{[0-9]+:[0-9]+}}, r{{[0-9]+:[0-9]+}})
-; CHECK: p{{[0-9]+}} = cmp.gtu(r{{[0-9]+:[0-9]+}}, r{{[0-9]+:[0-9]+}})
-; CHECK: p{{[0-9]+}} = cmp.gtu(r{{[0-9]+:[0-9]+}}, r{{[0-9]+:[0-9]+}})
-; CHECK: r{{[0-9]+}} = mux(p{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}})
-; CHECK: r{{[0-9]+}} = mux(p{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}})
-; CHECK: r{{[0-9]+:[0-9]+}} = combine(r{{[0-9]+}}, r{{[0-9]+}})
-; CHECK: r{{[0-9]+}} = mux(p{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}})
-; CHECK: r{{[0-9]+}} = mux(p{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}})
-; CHECK: r{{[0-9]+:[0-9]+}} = combine(r{{[0-9]+}}, r{{[0-9]+}})
-; CHECK: r{{[0-9]+:[0-9]+}} = add(r{{[0-9]+:[0-9]+}}, r{{[0-9]+:[0-9]+}})
+; CHECK-DAG: r{{[0-9]+:[0-9]+}} = add(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}})
+; CHECK-DAG: r{{[0-9]+:[0-9]+}} = add(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}})
+; CHECK-DAG: p{{[0-9]+}} = cmp.gtu(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}})
+; CHECK-DAG: p{{[0-9]+}} = cmp.gtu(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}})
+; CHECK-DAG: r{{[0-9]+}} = mux(p{{[0-9]+}},r{{[0-9]+}},r{{[0-9]+}})
+; CHECK-DAG: r{{[0-9]+}} = mux(p{{[0-9]+}},r{{[0-9]+}},r{{[0-9]+}})
 
-
-define void @check_adde_addc (i64 %AL, i64 %AH, i64 %BL, i64 %BH, i64* %RL, i64* %RH) {
-entry:
-        %tmp1 = zext i64 %AL to i128
-        %tmp23 = zext i64 %AH to i128
-        %tmp4 = shl i128 %tmp23, 64
-        %tmp5 = or i128 %tmp4, %tmp1
-        %tmp67 = zext i64 %BL to i128
-        %tmp89 = zext i64 %BH to i128
-        %tmp11 = shl i128 %tmp89, 64
-        %tmp12 = or i128 %tmp11, %tmp67
-        %tmp15 = add i128 %tmp12, %tmp5
-        %tmp1617 = trunc i128 %tmp15 to i64
-        store i64 %tmp1617, i64* %RL
-        %tmp21 = lshr i128 %tmp15, 64
-        %tmp2122 = trunc i128 %tmp21 to i64
-        store i64 %tmp2122, i64* %RH
-        ret void
+define void @check_adde_addc(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64* %a4, i64* %a5) {
+b6:
+  %v7 = zext i64 %a0 to i128
+  %v8 = zext i64 %a1 to i128
+  %v9 = shl i128 %v8, 64
+  %v10 = or i128 %v7, %v9
+  %v11 = zext i64 %a2 to i128
+  %v12 = zext i64 %a3 to i128
+  %v13 = shl i128 %v12, 64
+  %v14 = or i128 %v11, %v13
+  %v15 = add i128 %v10, %v14
+  %v16 = lshr i128 %v15, 64
+  %v17 = trunc i128 %v15 to i64
+  %v18 = trunc i128 %v16 to i64
+  store i64 %v17, i64* %a4
+  store i64 %v18, i64* %a5
+  ret void
 }
diff --git a/test/CodeGen/Hexagon/addh-sext-trunc.ll b/test/CodeGen/Hexagon/addh-sext-trunc.ll
index 7f219944436be1fc611fcd8ae97c4a9f2807b385..ec5dc611105d0c1f0c51692421672bca28a05e32 100644
--- a/test/CodeGen/Hexagon/addh-sext-trunc.ll
+++ b/test/CodeGen/Hexagon/addh-sext-trunc.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=hexagon < %s | FileCheck %s
-; CHECK: r{{[0-9]+}} = add(r{{[0-9]+}}.{{L|l}}, r{{[0-9]+}}.{{H|h}})
+; CHECK: r{{[0-9]+}} = add(r{{[0-9]+}}.{{L|l}},r{{[0-9]+}}.{{H|h}})
 
 target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
 target triple = "hexagon-unknown-none"
diff --git a/test/CodeGen/Hexagon/addh-shifted.ll b/test/CodeGen/Hexagon/addh-shifted.ll
index eb263521b42fb4d823ebd537e02d4af5faaf8f02..697a5c5c69bfd35d767a8476562e197d3ca7361b 100644
--- a/test/CodeGen/Hexagon/addh-shifted.ll
+++ b/test/CodeGen/Hexagon/addh-shifted.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=hexagon < %s | FileCheck %s
-; CHECK: r{{[0-9]+}} = add(r{{[0-9]+}}.{{L|l}}, r{{[0-9]+}}.{{L|l}}):<<16
+; CHECK: r{{[0-9]+}} = add(r{{[0-9]+}}.{{L|l}},r{{[0-9]+}}.{{L|l}}):<<16
 
 define i64 @test_cast(i64 %arg0, i16 zeroext %arg1, i16 zeroext %arg2) nounwind readnone {
 entry:
diff --git a/test/CodeGen/Hexagon/addh.ll b/test/CodeGen/Hexagon/addh.ll
index c2b536c4669a56e76b9154aa5d333ebe92c1804b..8217d6753cb3fa1e41b1d34ac7ea8c90a690bfb3 100644
--- a/test/CodeGen/Hexagon/addh.ll
+++ b/test/CodeGen/Hexagon/addh.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=hexagon < %s | FileCheck %s
-; CHECK: r{{[0-9]+}} = add(r{{[0-9]+}}.{{L|l}}, r{{[0-9]+}}.{{L|l}})
+; CHECK: r{{[0-9]+}} = add(r{{[0-9]+}}.{{L|l}},r{{[0-9]+}}.{{L|l}})
 
 define i64 @test_cast(i64 %arg0, i16 zeroext %arg1, i16 zeroext %arg2) nounwind readnone {
 entry:
diff --git a/test/CodeGen/Hexagon/alu64.ll b/test/CodeGen/Hexagon/alu64.ll
index f986f135937462a3c3e9b8036648ab5fa7d3b05b..453b40a6ee83a4d9d8ed027ec1477753c07a747a 100644
--- a/test/CodeGen/Hexagon/alu64.ll
+++ b/test/CodeGen/Hexagon/alu64.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=hexagon -O0 < %s | FileCheck %s
 
 ; CHECK-LABEL: @test00
-; CHECK: = cmp.eq(r1:0, r3:2)
+; CHECK: = cmp.eq(r1:0,r3:2)
 define i32 @test00(i64 %Rs, i64 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.C2.cmpeqp(i64 %Rs, i64 %Rt)
@@ -9,7 +9,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test01
-; CHECK: = cmp.gt(r1:0, r3:2)
+; CHECK: = cmp.gt(r1:0,r3:2)
 define i32 @test01(i64 %Rs, i64 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.C2.cmpgtp(i64 %Rs, i64 %Rt)
@@ -17,7 +17,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test02
-; CHECK: = cmp.gtu(r1:0, r3:2)
+; CHECK: = cmp.gtu(r1:0,r3:2)
 define i32 @test02(i64 %Rs, i64 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.C2.cmpgtup(i64 %Rs, i64 %Rt)
@@ -25,7 +25,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test10
-; CHECK: = cmp.eq(r0, r1)
+; CHECK: = cmp.eq(r0,r1)
 define i32 @test10(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A4.rcmpeq(i32 %Rs, i32 %Rt)
@@ -33,7 +33,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test11
-; CHECK: = !cmp.eq(r0, r1)
+; CHECK: = !cmp.eq(r0,r1)
 define i32 @test11(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A4.rcmpneq(i32 %Rs, i32 %Rt)
@@ -41,7 +41,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test12
-; CHECK: = cmp.eq(r0, #23)
+; CHECK: = cmp.eq(r0,#23)
 define i32 @test12(i32 %Rs) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A4.rcmpeqi(i32 %Rs, i32 23)
@@ -49,7 +49,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test13
-; CHECK: = !cmp.eq(r0, #47)
+; CHECK: = !cmp.eq(r0,#47)
 define i32 @test13(i32 %Rs) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A4.rcmpneqi(i32 %Rs, i32 47)
@@ -57,7 +57,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test20
-; CHECK: = cmpb.eq(r0, r1)
+; CHECK: = cmpb.eq(r0,r1)
 define i32 @test20(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A4.cmpbeq(i32 %Rs, i32 %Rt)
@@ -65,7 +65,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test21
-; CHECK: = cmpb.gt(r0, r1)
+; CHECK: = cmpb.gt(r0,r1)
 define i32 @test21(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A4.cmpbgt(i32 %Rs, i32 %Rt)
@@ -73,7 +73,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test22
-; CHECK: = cmpb.gtu(r0, r1)
+; CHECK: = cmpb.gtu(r0,r1)
 define i32 @test22(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A4.cmpbgtu(i32 %Rs, i32 %Rt)
@@ -81,7 +81,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test23
-; CHECK: = cmpb.eq(r0, #56)
+; CHECK: = cmpb.eq(r0,#56)
 define i32 @test23(i32 %Rs) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A4.cmpbeqi(i32 %Rs, i32 56)
@@ -89,7 +89,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test24
-; CHECK: = cmpb.gt(r0, #29)
+; CHECK: = cmpb.gt(r0,#29)
 define i32 @test24(i32 %Rs) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A4.cmpbgti(i32 %Rs, i32 29)
@@ -97,7 +97,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test25
-; CHECK: = cmpb.gtu(r0, #111)
+; CHECK: = cmpb.gtu(r0,#111)
 define i32 @test25(i32 %Rs) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A4.cmpbgtui(i32 %Rs, i32 111)
@@ -105,7 +105,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test30
-; CHECK: = cmph.eq(r0, r1)
+; CHECK: = cmph.eq(r0,r1)
 define i32 @test30(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A4.cmpheq(i32 %Rs, i32 %Rt)
@@ -113,7 +113,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test31
-; CHECK: = cmph.gt(r0, r1)
+; CHECK: = cmph.gt(r0,r1)
 define i32 @test31(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A4.cmphgt(i32 %Rs, i32 %Rt)
@@ -121,7 +121,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test32
-; CHECK: = cmph.gtu(r0, r1)
+; CHECK: = cmph.gtu(r0,r1)
 define i32 @test32(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A4.cmphgtu(i32 %Rs, i32 %Rt)
@@ -129,7 +129,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test33
-; CHECK: = cmph.eq(r0, #-123)
+; CHECK: = cmph.eq(r0,#-123)
 define i32 @test33(i32 %Rs) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A4.cmpheqi(i32 %Rs, i32 -123)
@@ -137,7 +137,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test34
-; CHECK: = cmph.gt(r0, #-3)
+; CHECK: = cmph.gt(r0,#-3)
 define i32 @test34(i32 %Rs) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A4.cmphgti(i32 %Rs, i32 -3)
@@ -145,7 +145,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test35
-; CHECK: = cmph.gtu(r0, #13)
+; CHECK: = cmph.gtu(r0,#13)
 define i32 @test35(i32 %Rs) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A4.cmphgtui(i32 %Rs, i32 13)
@@ -153,7 +153,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test40
-; CHECK: = vmux(p0, r3:2, r5:4)
+; CHECK: = vmux(p0,r3:2,r5:4)
 define i64 @test40(i32 %Pu, i64 %Rs, i64 %Rt) #0 {
 entry:
   %0 = tail call i64 @llvm.hexagon.C2.vmux(i32 %Pu, i64 %Rs, i64 %Rt)
@@ -161,7 +161,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test41
-; CHECK: = any8(vcmpb.eq(r1:0, r3:2))
+; CHECK: = any8(vcmpb.eq(r1:0,r3:2))
 define i32 @test41(i64 %Rs, i64 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A4.vcmpbeq.any(i64 %Rs, i64 %Rt)
@@ -169,7 +169,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test50
-; CHECK: = add(r1:0, r3:2)
+; CHECK: = add(r1:0,r3:2)
 define i64 @test50(i64 %Rs, i64 %Rt) #0 {
 entry:
   %0 = tail call i64 @llvm.hexagon.A2.addp(i64 %Rs, i64 %Rt)
@@ -177,7 +177,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test51
-; CHECK: = add(r1:0, r3:2):sat
+; CHECK: = add(r1:0,r3:2):sat
 define i64 @test51(i64 %Rs, i64 %Rt) #0 {
 entry:
   %0 = tail call i64 @llvm.hexagon.A2.addpsat(i64 %Rs, i64 %Rt)
@@ -185,7 +185,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test52
-; CHECK: = sub(r1:0, r3:2)
+; CHECK: = sub(r1:0,r3:2)
 define i64 @test52(i64 %Rs, i64 %Rt) #0 {
 entry:
   %0 = tail call i64 @llvm.hexagon.A2.subp(i64 %Rs, i64 %Rt)
@@ -193,7 +193,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test53
-; CHECK: = add(r1:0, r3:2):raw:
+; CHECK: = add(r1:0,r3:2):raw:
 define i64 @test53(i32 %Rs, i64 %Rt) #0 {
 entry:
   %0 = tail call i64 @llvm.hexagon.A2.addsp(i32 %Rs, i64 %Rt)
@@ -201,7 +201,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test54
-; CHECK: = and(r1:0, r3:2)
+; CHECK: = and(r1:0,r3:2)
 define i64 @test54(i64 %Rs, i64 %Rt) #0 {
 entry:
   %0 = tail call i64 @llvm.hexagon.A2.andp(i64 %Rs, i64 %Rt)
@@ -209,7 +209,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test55
-; CHECK: = or(r1:0, r3:2)
+; CHECK: = or(r1:0,r3:2)
 define i64 @test55(i64 %Rs, i64 %Rt) #0 {
 entry:
   %0 = tail call i64 @llvm.hexagon.A2.orp(i64 %Rs, i64 %Rt)
@@ -217,7 +217,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test56
-; CHECK: = xor(r1:0, r3:2)
+; CHECK: = xor(r1:0,r3:2)
 define i64 @test56(i64 %Rs, i64 %Rt) #0 {
 entry:
   %0 = tail call i64 @llvm.hexagon.A2.xorp(i64 %Rs, i64 %Rt)
@@ -225,7 +225,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test57
-; CHECK: = and(r1:0, ~r3:2)
+; CHECK: = and(r1:0,~r3:2)
 define i64 @test57(i64 %Rs, i64 %Rt) #0 {
 entry:
   %0 = tail call i64 @llvm.hexagon.A4.andnp(i64 %Rs, i64 %Rt)
@@ -233,7 +233,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test58
-; CHECK: = or(r1:0, ~r3:2)
+; CHECK: = or(r1:0,~r3:2)
 define i64 @test58(i64 %Rs, i64 %Rt) #0 {
 entry:
   %0 = tail call i64 @llvm.hexagon.A4.ornp(i64 %Rs, i64 %Rt)
@@ -241,7 +241,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test60
-; CHECK: = add(r0.l, r1.l)
+; CHECK: = add(r0.l,r1.l)
 define i32 @test60(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.addh.l16.ll(i32 %Rs, i32 %Rt)
@@ -249,7 +249,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test61
-; CHECK: = add(r0.l, r1.h)
+; CHECK: = add(r0.l,r1.h)
 define i32 @test61(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.addh.l16.hl(i32 %Rs, i32 %Rt)
@@ -257,7 +257,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test62
-; CHECK: = add(r0.l, r1.l):sat
+; CHECK: = add(r0.l,r1.l):sat
 define i32 @test62(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.addh.l16.sat.ll(i32 %Rs, i32 %Rt)
@@ -265,7 +265,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test63
-; CHECK: = add(r0.l, r1.h):sat
+; CHECK: = add(r0.l,r1.h):sat
 define i32 @test63(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.addh.l16.sat.hl(i32 %Rs, i32 %Rt)
@@ -273,7 +273,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test64
-; CHECK: = add(r0.l, r1.l):<<16
+; CHECK: = add(r0.l,r1.l):<<16
 define i32 @test64(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.addh.h16.ll(i32 %Rs, i32 %Rt)
@@ -281,7 +281,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test65
-; CHECK: = add(r0.l, r1.h):<<16
+; CHECK: = add(r0.l,r1.h):<<16
 define i32 @test65(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.addh.h16.lh(i32 %Rs, i32 %Rt)
@@ -289,7 +289,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test66
-; CHECK: = add(r0.h, r1.l):<<16
+; CHECK: = add(r0.h,r1.l):<<16
 define i32 @test66(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.addh.h16.hl(i32 %Rs, i32 %Rt)
@@ -297,7 +297,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test67
-; CHECK: = add(r0.h, r1.h):<<16
+; CHECK: = add(r0.h,r1.h):<<16
 define i32 @test67(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.addh.h16.hh(i32 %Rs, i32 %Rt)
@@ -305,7 +305,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test68
-; CHECK: = add(r0.l, r1.l):sat:<<16
+; CHECK: = add(r0.l,r1.l):sat:<<16
 define i32 @test68(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.addh.h16.sat.ll(i32 %Rs, i32 %Rt)
@@ -313,7 +313,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test69
-; CHECK: = add(r0.l, r1.h):sat:<<16
+; CHECK: = add(r0.l,r1.h):sat:<<16
 define i32 @test69(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.addh.h16.sat.lh(i32 %Rs, i32 %Rt)
@@ -321,7 +321,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test6A
-; CHECK: = add(r0.h, r1.l):sat:<<16
+; CHECK: = add(r0.h,r1.l):sat:<<16
 define i32 @test6A(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.addh.h16.sat.hl(i32 %Rs, i32 %Rt)
@@ -329,7 +329,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test6B
-; CHECK: = add(r0.h, r1.h):sat:<<16
+; CHECK: = add(r0.h,r1.h):sat:<<16
 define i32 @test6B(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.addh.h16.sat.hh(i32 %Rs, i32 %Rt)
@@ -337,7 +337,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test70
-; CHECK: = sub(r0.l, r1.l)
+; CHECK: = sub(r0.l,r1.l)
 define i32 @test70(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.subh.l16.ll(i32 %Rs, i32 %Rt)
@@ -345,7 +345,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test71
-; CHECK: = sub(r0.l, r1.h)
+; CHECK: = sub(r0.l,r1.h)
 define i32 @test71(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.subh.l16.hl(i32 %Rs, i32 %Rt)
@@ -353,7 +353,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test72
-; CHECK: = sub(r0.l, r1.l):sat
+; CHECK: = sub(r0.l,r1.l):sat
 define i32 @test72(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.subh.l16.sat.ll(i32 %Rs, i32 %Rt)
@@ -361,7 +361,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test73
-; CHECK: = sub(r0.l, r1.h):sat
+; CHECK: = sub(r0.l,r1.h):sat
 define i32 @test73(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.subh.l16.sat.hl(i32 %Rs, i32 %Rt)
@@ -369,7 +369,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test74
-; CHECK: = sub(r0.l, r1.l):<<16
+; CHECK: = sub(r0.l,r1.l):<<16
 define i32 @test74(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.subh.h16.ll(i32 %Rs, i32 %Rt)
@@ -377,7 +377,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test75
-; CHECK: = sub(r0.l, r1.h):<<16
+; CHECK: = sub(r0.l,r1.h):<<16
 define i32 @test75(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.subh.h16.lh(i32 %Rs, i32 %Rt)
@@ -385,7 +385,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test76
-; CHECK: = sub(r0.h, r1.l):<<16
+; CHECK: = sub(r0.h,r1.l):<<16
 define i32 @test76(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.subh.h16.hl(i32 %Rs, i32 %Rt)
@@ -393,7 +393,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test77
-; CHECK: = sub(r0.h, r1.h):<<16
+; CHECK: = sub(r0.h,r1.h):<<16
 define i32 @test77(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.subh.h16.hh(i32 %Rs, i32 %Rt)
@@ -401,7 +401,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test78
-; CHECK: = sub(r0.l, r1.l):sat:<<16
+; CHECK: = sub(r0.l,r1.l):sat:<<16
 define i32 @test78(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.subh.h16.sat.ll(i32 %Rs, i32 %Rt)
@@ -409,7 +409,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test79
-; CHECK: = sub(r0.l, r1.h):sat:<<16
+; CHECK: = sub(r0.l,r1.h):sat:<<16
 define i32 @test79(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.subh.h16.sat.lh(i32 %Rs, i32 %Rt)
@@ -417,7 +417,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test7A
-; CHECK: = sub(r0.h, r1.l):sat:<<16
+; CHECK: = sub(r0.h,r1.l):sat:<<16
 define i32 @test7A(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.subh.h16.sat.hl(i32 %Rs, i32 %Rt)
@@ -425,7 +425,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test7B
-; CHECK: = sub(r0.h, r1.h):sat:<<16
+; CHECK: = sub(r0.h,r1.h):sat:<<16
 define i32 @test7B(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A2.subh.h16.sat.hh(i32 %Rs, i32 %Rt)
@@ -433,7 +433,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test90
-; CHECK: = and(#1, asl(r0, #2))
+; CHECK: = and(#1,asl(r0,#2))
 define i32 @test90(i32 %Rs) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.S4.andi.asl.ri(i32 1, i32 %Rs, i32 2)
@@ -441,7 +441,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test91
-; CHECK: = or(#1, asl(r0, #2))
+; CHECK: = or(#1,asl(r0,#2))
 define i32 @test91(i32 %Rs) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.S4.ori.asl.ri(i32 1, i32 %Rs, i32 2)
@@ -449,7 +449,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test92
-; CHECK: = add(#1, asl(r0, #2))
+; CHECK: = add(#1,asl(r0,#2))
 define i32 @test92(i32 %Rs) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.S4.addi.asl.ri(i32 1, i32 %Rs, i32 2)
@@ -457,7 +457,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test93
-; CHECK: = sub(#1, asl(r0, #2))
+; CHECK: = sub(#1,asl(r0,#2))
 define i32 @test93(i32 %Rs) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.S4.subi.asl.ri(i32 1, i32 %Rs, i32 2)
@@ -465,7 +465,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test94
-; CHECK: = and(#1, lsr(r0, #2))
+; CHECK: = and(#1,lsr(r0,#2))
 define i32 @test94(i32 %Rs) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.S4.andi.lsr.ri(i32 1, i32 %Rs, i32 2)
@@ -473,7 +473,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test95
-; CHECK: = or(#1, lsr(r0, #2))
+; CHECK: = or(#1,lsr(r0,#2))
 define i32 @test95(i32 %Rs) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.S4.ori.lsr.ri(i32 1, i32 %Rs, i32 2)
@@ -481,7 +481,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test96
-; CHECK: = add(#1, lsr(r0, #2))
+; CHECK: = add(#1,lsr(r0,#2))
 define i32 @test96(i32 %Rs) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.S4.addi.lsr.ri(i32 1, i32 %Rs, i32 2)
@@ -489,7 +489,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test97
-; CHECK: = sub(#1, lsr(r0, #2))
+; CHECK: = sub(#1,lsr(r0,#2))
 define i32 @test97(i32 %Rs) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.S4.subi.lsr.ri(i32 1, i32 %Rs, i32 2)
@@ -497,7 +497,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test100
-; CHECK: = bitsplit(r0, r1)
+; CHECK: = bitsplit(r0,r1)
 define i64 @test100(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i64 @llvm.hexagon.A4.bitsplit(i32 %Rs, i32 %Rt)
@@ -505,7 +505,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test101
-; CHECK: = modwrap(r0, r1)
+; CHECK: = modwrap(r0,r1)
 define i32 @test101(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.A4.modwrapu(i32 %Rs, i32 %Rt)
@@ -513,7 +513,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test102
-; CHECK: = parity(r1:0, r3:2)
+; CHECK: = parity(r1:0,r3:2)
 define i32 @test102(i64 %Rs, i64 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.S2.parityp(i64 %Rs, i64 %Rt)
@@ -521,7 +521,7 @@ entry:
 }
 
 ; CHECK-LABEL: @test103
-; CHECK: = parity(r0, r1)
+; CHECK: = parity(r0,r1)
 define i32 @test103(i32 %Rs, i32 %Rt) #0 {
 entry:
   %0 = tail call i32 @llvm.hexagon.S4.parity(i32 %Rs, i32 %Rt)
diff --git a/test/CodeGen/Hexagon/args.ll b/test/CodeGen/Hexagon/args.ll
index 3bfb8b159556d03cacfeb2db9f1e186a230209ec..a1c7bc3230ddeaba9c1179605c3286c3ca82d569 100644
--- a/test/CodeGen/Hexagon/args.ll
+++ b/test/CodeGen/Hexagon/args.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=hexagon < %s | FileCheck %s
-; CHECK: r5:4 = combine(#6, #5)
-; CHECK: r3:2 = combine(#4, #3)
-; CHECK: r1:0 = combine(#2, #1)
-; CHECK: memw(r29+#0)=#7
+; CHECK: r5:4 = combine(#6,#5)
+; CHECK: r3:2 = combine(#4,#3)
+; CHECK: r1:0 = combine(#2,#1)
+; CHECK: memw(r29+#0) = #7
 
 
 define void @foo() nounwind {
diff --git a/test/CodeGen/Hexagon/avoid-predspill-calleesaved.ll b/test/CodeGen/Hexagon/avoid-predspill-calleesaved.ll
index 561013b174ddceb1e4b77e3061dcd40b710cc7d2..906a877b91e51ed5ebd7ee108d01ac6d55f9262f 100644
--- a/test/CodeGen/Hexagon/avoid-predspill-calleesaved.ll
+++ b/test/CodeGen/Hexagon/avoid-predspill-calleesaved.ll
@@ -7,7 +7,6 @@
 ; without adding an extra spill of that register.
 ;
 ; CHECK: PredSpill:
-; CHECK: memd(r29{{.*}}) = r17:16
 ; CHECK-DAG: r{{[0-9]+}} = p0
 ; CHECK-DAG: p0 = r{{[0-9]+}}
 ; CHECK-NOT: = memw(r29
diff --git a/test/CodeGen/Hexagon/bit-bitsplit-at.ll b/test/CodeGen/Hexagon/bit-bitsplit-at.ll
new file mode 100644
index 0000000000000000000000000000000000000000..87d535fd0f22aa08b39825ff9c7c12c2fe0af3e2
--- /dev/null
+++ b/test/CodeGen/Hexagon/bit-bitsplit-at.ll
@@ -0,0 +1,33 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; REQUIRES: asserts
+
+; This testcase used to crash because the bitsplit instruction was put in the
+; wrong place.
+; CHECK: bitsplit
+
+target triple = "hexagon"
+
+define hidden fastcc i32 @fred(i32 %a0) unnamed_addr #0 {
+b1:
+  %v2 = lshr i32 %a0, 16
+  %v3 = trunc i32 %v2 to i8
+  br i1 undef, label %b6, label %b4
+
+b4:                                               ; preds = %b1
+  %v5 = and i32 %a0, 65535
+  br i1 undef, label %b8, label %b9
+
+b6:                                               ; preds = %b1
+  %v7 = and i32 %a0, 65535
+  br label %b9
+
+b8:                                               ; preds = %b4
+  store i8 %v3, i8* undef, align 2
+  unreachable
+
+b9:                                               ; preds = %b6, %b4
+  %v10 = phi i32 [ %v7, %b6 ], [ %v5, %b4 ]
+  ret i32 %v10
+}
+
+attributes #0 = { nounwind optsize "target-cpu"="hexagonv60" "target-features"="-hvx-double,-long-calls" }
diff --git a/test/CodeGen/Hexagon/bit-bitsplit-src.ll b/test/CodeGen/Hexagon/bit-bitsplit-src.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2d1c71c709f47eb381499e7540eccec0ca840ee7
--- /dev/null
+++ b/test/CodeGen/Hexagon/bit-bitsplit-src.ll
@@ -0,0 +1,35 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; REQUIRES: asserts
+
+; This used to crash. Check for some sane output.
+; CHECK: call printf
+
+target triple = "hexagon"
+
+@g0 = external local_unnamed_addr global [4 x i64], align 8
+@g1 = external hidden unnamed_addr constant [29 x i8], align 1
+@g2 = external hidden unnamed_addr constant [29 x i8], align 1
+
+define void @fred() local_unnamed_addr #0 {
+b0:
+  %v1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @g0, i32 0, i32 0), align 8
+  %v2 = trunc i64 %v1 to i32
+  %v3 = lshr i64 %v1, 16
+  %v4 = trunc i64 %v3 to i32
+  %v5 = and i32 %v4, 255
+  %v6 = add nuw nsw i32 0, %v5
+  %v7 = add nuw nsw i32 %v6, 0
+  %v8 = zext i32 %v7 to i64
+  %v9 = and i32 %v2, 65535
+  %v10 = and i32 %v4, 65535
+  %v11 = add nuw nsw i32 %v10, %v9
+  %v12 = zext i32 %v11 to i64
+  tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @g1, i32 0, i32 0), i64 %v8) #0
+  tail call void (i8*, ...) @printf(i8* getelementptr inbounds ([29 x i8], [29 x i8]* @g2, i32 0, i32 0), i64 %v12) #0
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @printf(i8* nocapture readonly, ...) local_unnamed_addr #0
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double,-long-calls" }
diff --git a/test/CodeGen/Hexagon/bit-bitsplit.ll b/test/CodeGen/Hexagon/bit-bitsplit.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4ae2e4e6650839542b5344aee1f337b182c19500
--- /dev/null
+++ b/test/CodeGen/Hexagon/bit-bitsplit.ll
@@ -0,0 +1,17 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: bitsplit(r{{[0-9]+}},#5)
+
+target triple = "hexagon"
+
+define i32 @fred(i32 %a, i32* nocapture readonly %b) local_unnamed_addr #0 {
+entry:
+  %and = and i32 %a, 31
+  %shr = lshr i32 %a, 5
+  %arrayidx = getelementptr inbounds i32, i32* %b, i32 %shr
+  %0 = load i32, i32* %arrayidx, align 4
+  %shr1 = lshr i32 %0, %and
+  %and2 = and i32 %shr1, 1
+  ret i32 %and2
+}
+
+attributes #0 = { norecurse nounwind readonly "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double" }
diff --git a/test/CodeGen/Hexagon/bit-eval.ll b/test/CodeGen/Hexagon/bit-eval.ll
index 1d2be5bfc19d971a14a7553b350403848b7fe6e8..5b0111dfcd10965e36cd97ffe286afe12c74316b 100644
--- a/test/CodeGen/Hexagon/bit-eval.ll
+++ b/test/CodeGen/Hexagon/bit-eval.ll
@@ -20,7 +20,7 @@ entry:
 }
 
 ; CHECK-LABEL: test3:
-; CHECK: r1:0 = combine(#0, #1)
+; CHECK: r1:0 = combine(#0,#1)
 define i64 @test3() #0 {
 entry:
   %0 = tail call i64 @llvm.hexagon.S4.extractp(i64 -1, i32 63, i32 63)
diff --git a/test/CodeGen/Hexagon/bit-ext-sat.ll b/test/CodeGen/Hexagon/bit-ext-sat.ll
new file mode 100644
index 0000000000000000000000000000000000000000..47c49c2364b7e3bb44a6375437257cd759035dfb
--- /dev/null
+++ b/test/CodeGen/Hexagon/bit-ext-sat.ll
@@ -0,0 +1,57 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+target triple = "hexagon"
+
+; CHECK-LABEL: xh_sh
+; CHECK: sath
+; CHECK-NOT: sxth
+define i32 @xh_sh(i32 %x) local_unnamed_addr #0 {
+entry:
+  %0 = tail call i32 @llvm.hexagon.A2.sath(i32 %x)
+  %1 = tail call i32 @llvm.hexagon.A2.sxth(i32 %0)
+  ret i32 %1
+}
+
+; CHECK-LABEL: xb_sb
+; CHECK: satb
+; CHECK-NOT: sxtb
+define i32 @xb_sb(i32 %x) local_unnamed_addr #0 {
+entry:
+  %0 = tail call i32 @llvm.hexagon.A2.satb(i32 %x)
+  %1 = tail call i32 @llvm.hexagon.A2.sxtb(i32 %0)
+  ret i32 %1
+}
+
+; CHECK-LABEL: xuh_suh
+; CHECK: satuh
+; CHECK-NOT: zxth
+define i32 @xuh_suh(i32 %x) local_unnamed_addr #0 {
+entry:
+  %0 = tail call i32 @llvm.hexagon.A2.satuh(i32 %x)
+  %1 = tail call i32 @llvm.hexagon.A2.zxth(i32 %0)
+  ret i32 %1
+}
+
+; CHECK-LABEL: xub_sub
+; CHECK: satub
+; CHECK-NOT: zxtb
+define i32 @xub_sub(i32 %x) local_unnamed_addr #0 {
+entry:
+  %0 = tail call i32 @llvm.hexagon.A2.satub(i32 %x)
+  %1 = tail call i32 @llvm.hexagon.A2.zxtb(i32 %0)
+  ret i32 %1
+}
+
+
+declare i32 @llvm.hexagon.A2.sxtb(i32) #1
+declare i32 @llvm.hexagon.A2.sxth(i32) #1
+declare i32 @llvm.hexagon.A2.zxtb(i32) #1
+declare i32 @llvm.hexagon.A2.zxth(i32) #1
+
+declare i32 @llvm.hexagon.A2.satb(i32) #1
+declare i32 @llvm.hexagon.A2.sath(i32) #1
+declare i32 @llvm.hexagon.A2.satub(i32) #1
+declare i32 @llvm.hexagon.A2.satuh(i32) #1
+
+attributes #0 = { nounwind readnone "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double,-long-calls" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/Hexagon/bit-extract-off.ll b/test/CodeGen/Hexagon/bit-extract-off.ll
new file mode 100644
index 0000000000000000000000000000000000000000..183435ab7b23a0ea450c85f2ef5113a87c819176
--- /dev/null
+++ b/test/CodeGen/Hexagon/bit-extract-off.ll
@@ -0,0 +1,23 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; CHECK: extractu(r1,#31,#0)
+
+; In the IR this was an extract of 31 bits starting at position 32 in r1:0.
+; When mapping it to an extract from r1, the offset was not reset to 0, and
+; we had "extractu(r1,#31,#32)".
+
+target triple = "hexagon"
+
+define hidden i32 @fred([101 x double]* %a0, i32 %a1, i32* %a2, i32* %a3) #0 {
+b4:
+  br label %b5
+
+b5:                                               ; preds = %b5, %b4
+  %v6 = call double @fabs(double undef) #1
+  store double %v6, double* undef, align 8
+  br label %b5
+}
+
+declare double @fabs(double) #1
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double,-long-calls" }
+attributes #1 = { nounwind readnone "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double,-long-calls" }
diff --git a/test/CodeGen/Hexagon/bit-extract.ll b/test/CodeGen/Hexagon/bit-extract.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ad7d05d2c235bb03fb2c409771a7147ec57c0857
--- /dev/null
+++ b/test/CodeGen/Hexagon/bit-extract.ll
@@ -0,0 +1,75 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+target triple = "hexagon"
+
+; CHECK-LABEL: ua
+; CHECK: extractu(r0,#26,#0)
+define i32 @ua(i32 %x) local_unnamed_addr #0 {
+entry:
+  %shl = and i32 %x, 67108863
+  ret i32 %shl
+}
+
+; CHECK-LABEL: ub
+; CHECK: extractu(r0,#16,#4)
+define i32 @ub(i32 %x) local_unnamed_addr #0 {
+entry:
+  %0 = lshr i32 %x, 4
+  %shr = and i32 %0, 65535
+  ret i32 %shr
+}
+
+; CHECK-LABEL: uc
+; CHECK: extractu(r0,#24,#0)
+define i32 @uc(i32 %x) local_unnamed_addr #0 {
+entry:
+  %shl = and i32 %x, 16777215
+  ret i32 %shl
+}
+
+; CHECK-LABEL: ud
+; CHECK: extractu(r0,#16,#8)
+define i32 @ud(i32 %x) local_unnamed_addr #0 {
+entry:
+  %bf.lshr = lshr i32 %x, 8
+  %bf.clear = and i32 %bf.lshr, 65535
+  ret i32 %bf.clear
+}
+
+; CHECK-LABEL: sa
+; CHECK: extract(r0,#26,#0)
+define i32 @sa(i32 %x) local_unnamed_addr #0 {
+entry:
+  %shl = shl i32 %x, 6
+  %shr = ashr exact i32 %shl, 6
+  ret i32 %shr
+}
+
+; CHECK-LABEL: sb
+; CHECK: extract(r0,#16,#4)
+define i32 @sb(i32 %x) local_unnamed_addr #0 {
+entry:
+  %shl = shl i32 %x, 12
+  %shr = ashr i32 %shl, 16
+  ret i32 %shr
+}
+
+; CHECK-LABEL: sc
+; CHECK: extract(r0,#24,#0)
+define i32 @sc(i32 %x) local_unnamed_addr #0 {
+entry:
+  %shl = shl i32 %x, 8
+  %shr = ashr exact i32 %shl, 8
+  ret i32 %shr
+}
+
+; CHECK-LABEL: sd
+; CHECK: extract(r0,#16,#8)
+define i32 @sd(i32 %x) local_unnamed_addr #0 {
+entry:
+  %bf.shl = shl i32 %x, 8
+  %bf.ashr = ashr i32 %bf.shl, 16
+  ret i32 %bf.ashr
+}
+
+attributes #0 = { noinline norecurse nounwind readnone "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double,-long-calls" }
diff --git a/test/CodeGen/Hexagon/bit-has.ll b/test/CodeGen/Hexagon/bit-has.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9022de3918682af270081e34e058b0c9f91266c0
--- /dev/null
+++ b/test/CodeGen/Hexagon/bit-has.ll
@@ -0,0 +1,64 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; REQUIRES: asserts
+
+; This used to crash. Check for some sane output.
+; CHECK: sath
+
+target triple = "hexagon"
+
+define void @fred() local_unnamed_addr #0 {
+b0:
+  %v1 = load i32, i32* undef, align 4
+  %v2 = tail call i32 @llvm.hexagon.A2.sath(i32 undef)
+  %v3 = and i32 %v1, 603979776
+  %v4 = trunc i32 %v3 to i30
+  switch i30 %v4, label %b22 [
+    i30 -536870912, label %b5
+    i30 -469762048, label %b6
+  ]
+
+b5:                                               ; preds = %b0
+  unreachable
+
+b6:                                               ; preds = %b0
+  %v7 = load i32, i32* undef, align 4
+  %v8 = sub nsw i32 65536, %v7
+  %v9 = load i32, i32* undef, align 4
+  %v10 = mul nsw i32 %v9, %v9
+  %v11 = zext i32 %v10 to i64
+  %v12 = mul nsw i32 %v2, %v8
+  %v13 = sext i32 %v12 to i64
+  %v14 = mul nsw i64 %v13, %v11
+  %v15 = trunc i64 %v14 to i32
+  %v16 = and i32 %v15, 2147483647
+  store i32 %v16, i32* undef, align 4
+  %v17 = lshr i64 %v14, 31
+  %v18 = trunc i64 %v17 to i32
+  store i32 %v18, i32* undef, align 4
+  br label %b19
+
+b19:                                              ; preds = %b6
+  br i1 undef, label %b20, label %b21
+
+b20:                                              ; preds = %b19
+  unreachable
+
+b21:                                              ; preds = %b19
+  br label %b23
+
+b22:                                              ; preds = %b0
+  unreachable
+
+b23:                                              ; preds = %b21
+  %v24 = load i32, i32* undef, align 4
+  %v25 = shl i32 %v24, 1
+  %v26 = and i32 %v25, 65534
+  %v27 = or i32 %v26, 0
+  store i32 %v27, i32* undef, align 4
+  ret void
+}
+
+declare i32 @llvm.hexagon.A2.sath(i32) #1
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" "target-features"="-hvx,-hvx-double,-long-calls" }
+attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/Hexagon/bit-phi.ll b/test/CodeGen/Hexagon/bit-phi.ll
index 86b18d8bf2563bc11818cf6e3eb1da1c9ca28de4..7abfba079bb07456ec77c301a3eb190ad6dfaea9 100644
--- a/test/CodeGen/Hexagon/bit-phi.ll
+++ b/test/CodeGen/Hexagon/bit-phi.ll
@@ -1,4 +1,5 @@
 ; RUN: llc -march=hexagon < %s
+; RUN: llc -march=hexagon -disable-hcp < %s
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-p:32:32-i1:32-i64:64-a:0-v32:32-n16:32"
diff --git a/test/CodeGen/Hexagon/bit-rie.ll b/test/CodeGen/Hexagon/bit-rie.ll
index 6bd0558f580c32d959be1bca280ccacfd8e21b4d..302382a1ade473c78911a320b180216963a7b533 100644
--- a/test/CodeGen/Hexagon/bit-rie.ll
+++ b/test/CodeGen/Hexagon/bit-rie.ll
@@ -187,8 +187,8 @@ declare i32 @llvm.hexagon.S2.asr.r.r.sat(i32, i32) #2
 declare i32 @llvm.hexagon.S2.clbnorm(i32) #2
 declare i32 @llvm.hexagon.S2.lsr.r.r(i32, i32) #2
 declare i64 @llvm.hexagon.M2.mpyd.ll.s1(i32, i32) #2
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 
 attributes #0 = { norecurse nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,-hvx-double" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { argmemonly nounwind }
diff --git a/test/CodeGen/Hexagon/bit-skip-byval.ll b/test/CodeGen/Hexagon/bit-skip-byval.ll
index d6c1aad94007dab0f6db47e42dc5816fd57705d2..9ee4014ae346ad9df5da925109860ec4efc8428c 100644
--- a/test/CodeGen/Hexagon/bit-skip-byval.ll
+++ b/test/CodeGen/Hexagon/bit-skip-byval.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=hexagon < %s | FileCheck %s
 ;
 ; Either and or zxtb.
-; CHECK: r0 = and(r1, #255)
+; CHECK: r0 = and(r1,#255)
 
 %struct.t0 = type { i32 }
 
diff --git a/test/CodeGen/Hexagon/bit-validate-reg.ll b/test/CodeGen/Hexagon/bit-validate-reg.ll
index 16d4a5e4484d2980a70afafbbcbb8b9994b32e01..42eed97786cd4c7f526309292d568674f3c9f06d 100644
--- a/test/CodeGen/Hexagon/bit-validate-reg.ll
+++ b/test/CodeGen/Hexagon/bit-validate-reg.ll
@@ -1,10 +1,13 @@
-; RUN: llc -march=hexagon < %s | FileCheck %s
+; RUN: llc -march=hexagon -hexbit-extract=0 < %s | FileCheck %s
 
 ; Make sure we don't generate zxtb to transfer a predicate register into
 ; a general purpose register.
 
 ; CHECK: r0 = p0
 ; CHECK-NOT: zxtb(p
+; CHECK-NOT: and(p
+; CHECK-NOT: extract(p
+; CHECK-NOT: extractu(p
 
 target triple = "hexagon"
 
diff --git a/test/CodeGen/Hexagon/bitmanip.ll b/test/CodeGen/Hexagon/bitmanip.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2044a2fdd083b29bf7b0b973ee839bd6fc19863b
--- /dev/null
+++ b/test/CodeGen/Hexagon/bitmanip.ll
@@ -0,0 +1,135 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; CHECK-LABEL: popcount_16
+; CHECK: zxth
+; CHECK: popcount
+define i16 @popcount_16(i16 %p) #0 {
+  %t = call i16 @llvm.ctpop.i16(i16 %p) #0
+  ret i16 %t
+}
+
+; CHECK-LABEL: popcount_32
+; CHECK: popcount
+define i32 @popcount_32(i32 %p) #0 {
+  %t = call i32 @llvm.ctpop.i32(i32 %p) #0
+  ret i32 %t
+}
+
+; CHECK-LABEL: popcount_64
+; CHECK: popcount
+define i64 @popcount_64(i64 %p) #0 {
+  %t = call i64 @llvm.ctpop.i64(i64 %p) #0
+  ret i64 %t
+}
+
+; CHECK-LABEL: ctlz_16
+; CHECK: [[REG0:r[0-9]+]] = zxth
+; CHECK: [[REG1:r[0-9]+]] = cl0([[REG0]])
+; CHECK: add([[REG1]],#-16)
+define i16 @ctlz_16(i16 %p) #0 {
+  %t = call i16 @llvm.ctlz.i16(i16 %p, i1 true) #0
+  ret i16 %t
+}
+
+; CHECK-LABEL: ctlz_32
+; CHECK: cl0
+define i32 @ctlz_32(i32 %p) #0 {
+  %t = call i32 @llvm.ctlz.i32(i32 %p, i1 true) #0
+  ret i32 %t
+}
+
+; CHECK-LABEL: ctlz_64
+; CHECK: cl0
+define i64 @ctlz_64(i64 %p) #0 {
+  %t = call i64 @llvm.ctlz.i64(i64 %p, i1 true) #0
+  ret i64 %t
+}
+
+; CHECK-LABEL: cttz_16
+; CHECK: ct0
+define i16 @cttz_16(i16 %p) #0 {
+  %t = call i16 @llvm.cttz.i16(i16 %p, i1 true) #0
+  ret i16 %t
+}
+
+; CHECK-LABEL: cttz_32
+; CHECK: ct0
+define i32 @cttz_32(i32 %p) #0 {
+  %t = call i32 @llvm.cttz.i32(i32 %p, i1 true) #0
+  ret i32 %t
+}
+
+; CHECK-LABEL: cttz_64
+; CHECK: ct0
+define i64 @cttz_64(i64 %p) #0 {
+  %t = call i64 @llvm.cttz.i64(i64 %p, i1 true) #0
+  ret i64 %t
+}
+
+; CHECK-LABEL: brev_16
+; CHECK: [[REG:r[0-9]+]] = brev
+; CHECK: lsr([[REG]],#16)
+define i16 @brev_16(i16 %p) #0 {
+  %t = call i16 @llvm.bitreverse.i16(i16 %p) #0
+  ret i16 %t
+}
+
+; CHECK-LABEL: brev_32
+; CHECK: brev
+define i32 @brev_32(i32 %p) #0 {
+  %t = call i32 @llvm.bitreverse.i32(i32 %p) #0
+  ret i32 %t
+}
+
+; CHECK-LABEL: brev_64
+; CHECK: brev
+define i64 @brev_64(i64 %p) #0 {
+  %t = call i64 @llvm.bitreverse.i64(i64 %p) #0
+  ret i64 %t
+}
+
+; CHECK-LABEL: bswap_16
+; CHECK: [[REG:r[0-9]+]] = swiz
+; CHECK: lsr([[REG]],#16)
+define i16 @bswap_16(i16 %p) #0 {
+  %t = call i16 @llvm.bswap.i16(i16 %p) #0
+  ret i16 %t
+}
+
+; CHECK-LABEL: bswap_32
+; CHECK: swiz
+define i32 @bswap_32(i32 %p) #0 {
+  %t = call i32 @llvm.bswap.i32(i32 %p) #0
+  ret i32 %t
+}
+
+; CHECK-LABEL: bswap_64
+; CHECK: swiz
+; CHECK: swiz
+; CHECK: combine
+define i64 @bswap_64(i64 %p) #0 {
+  %t = call i64 @llvm.bswap.i64(i64 %p) #0
+  ret i64 %t
+}
+
+declare i16 @llvm.ctpop.i16(i16) #0
+declare i32 @llvm.ctpop.i32(i32) #0
+declare i64 @llvm.ctpop.i64(i64) #0
+
+declare i16 @llvm.ctlz.i16(i16, i1) #0
+declare i32 @llvm.ctlz.i32(i32, i1) #0
+declare i64 @llvm.ctlz.i64(i64, i1) #0
+
+declare i16 @llvm.cttz.i16(i16, i1) #0
+declare i32 @llvm.cttz.i32(i32, i1) #0
+declare i64 @llvm.cttz.i64(i64, i1) #0
+
+declare i16 @llvm.bitreverse.i16(i16) #0
+declare i32 @llvm.bitreverse.i32(i32) #0
+declare i64 @llvm.bitreverse.i64(i64) #0
+
+declare i16 @llvm.bswap.i16(i16) #0
+declare i32 @llvm.bswap.i32(i32) #0
+declare i64 @llvm.bswap.i64(i64) #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/CodeGen/Hexagon/block-addr.ll b/test/CodeGen/Hexagon/block-addr.ll
index c0db2cef545e5db6b675a21337420cbed3bbf328..5af3a69f8aab196b5a06ff4cb1865d878097f7de 100644
--- a/test/CodeGen/Hexagon/block-addr.ll
+++ b/test/CodeGen/Hexagon/block-addr.ll
@@ -2,7 +2,7 @@
 
 ; CHECK: .LJTI
 ; CHECK-DAG: r[[REG:[0-9]+]] = memw(r{{[0-9]+}}{{ *}}+{{ *}}r{{[0-9]+<<#[0-9]+}})
-; CHECK-DAG: jumpr:nt r[[REG]]
+; CHECK-DAG: jumpr r[[REG]]
 
 define void @main() #0 {
 entry:
diff --git a/test/CodeGen/Hexagon/branchfolder-keep-impdef.ll b/test/CodeGen/Hexagon/branchfolder-keep-impdef.ll
index a56680bd439995d8506229b49175313750a3a5bf..e09f7986621523292b7af25569ce764d5240e719 100644
--- a/test/CodeGen/Hexagon/branchfolder-keep-impdef.ll
+++ b/test/CodeGen/Hexagon/branchfolder-keep-impdef.ll
@@ -3,7 +3,7 @@
 ; Check that the testcase compiles successfully. Expect that if-conversion
 ; took place.
 ; CHECK-LABEL: fred:
-; CHECK: if (!p0) r1 = memw(r0 + #0)
+; CHECK: if (!p0) r1 = memw(r0+#0)
 
 target triple = "hexagon"
 
diff --git a/test/CodeGen/Hexagon/brev_ld.ll b/test/CodeGen/Hexagon/brev_ld.ll
index a2914296ec4188a9a09912bfa45ded101cf79c85..861da32b981b84a21366bbefb9a9aec001c17c89 100644
--- a/test/CodeGen/Hexagon/brev_ld.ll
+++ b/test/CodeGen/Hexagon/brev_ld.ll
@@ -29,7 +29,7 @@ entry:
   %1 = bitcast i64* %inputLR to i8*
   %sub = sub i32 13, %shr1
   %shl = shl i32 1, %sub
-; CHECK: = memd(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
+; CHECK: = memd(r{{[0-9]*}}++m{{[0-1]}}:brev)
   %2 = call i8* @llvm.hexagon.brev.ldd(i8* %0, i8* %1, i32 %shl)
   %3 = bitcast i8* %1 to i64*
   %4 = load i64, i64* %3, align 8, !tbaa !0
@@ -49,7 +49,7 @@ entry:
   %1 = bitcast i32* %inputLR to i8*
   %sub = sub i32 14, %shr1
   %shl = shl i32 1, %sub
-; CHECK: = memw(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
+; CHECK: = memw(r{{[0-9]*}}++m{{[0-1]}}:brev)
   %2 = call i8* @llvm.hexagon.brev.ldw(i8* %0, i8* %1, i32 %shl)
   %3 = bitcast i8* %1 to i32*
   %4 = load i32, i32* %3, align 4, !tbaa !2
@@ -69,7 +69,7 @@ entry:
   %1 = bitcast i16* %inputLR to i8*
   %sub = sub i32 15, %shr1
   %shl = shl i32 1, %sub
-; CHECK: = memh(r{{[0-9]*}} ++ m0:brev)
+; CHECK: = memh(r{{[0-9]*}}++m0:brev)
   %2 = call i8* @llvm.hexagon.brev.ldh(i8* %0, i8* %1, i32 %shl)
   %3 = bitcast i8* %1 to i16*
   %4 = load i16, i16* %3, align 2, !tbaa !3
@@ -89,7 +89,7 @@ entry:
   %1 = bitcast i16* %inputLR to i8*
   %sub = sub i32 15, %shr1
   %shl = shl i32 1, %sub
-; CHECK: = memuh(r{{[0-9]*}} ++ m0:brev)
+; CHECK: = memuh(r{{[0-9]*}}++m0:brev)
   %2 = call i8* @llvm.hexagon.brev.lduh(i8* %0, i8* %1, i32 %shl)
   %3 = bitcast i8* %1 to i16*
   %4 = load i16, i16* %3, align 2, !tbaa !3
@@ -108,7 +108,7 @@ entry:
   %0 = bitcast i16* %arrayidx to i8*
   %sub = sub nsw i32 16, %shr1
   %shl = shl i32 1, %sub
-; CHECK: = memub(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
+; CHECK: = memub(r{{[0-9]*}}++m{{[0-1]}}:brev)
   %1 = call i8* @llvm.hexagon.brev.ldub(i8* %0, i8* %inputLR, i32 %shl)
   %2 = load i8, i8* %inputLR, align 1, !tbaa !0
   ret i8 %2
@@ -126,7 +126,7 @@ entry:
   %0 = bitcast i16* %arrayidx to i8*
   %sub = sub nsw i32 16, %shr1
   %shl = shl i32 1, %sub
-; CHECK: = memb(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
+; CHECK: = memb(r{{[0-9]*}}++m{{[0-1]}}:brev)
   %1 = call i8* @llvm.hexagon.brev.ldb(i8* %0, i8* %inputLR, i32 %shl)
   %2 = load i8, i8* %inputLR, align 1, !tbaa !0
   ret i8 %2
diff --git a/test/CodeGen/Hexagon/brev_st.ll b/test/CodeGen/Hexagon/brev_st.ll
index 6c55681a683b293141552f9d79ece915bde7ad7b..cee5f52e3e4071ea0436cab32e9526a6b731c4ec 100644
--- a/test/CodeGen/Hexagon/brev_st.ll
+++ b/test/CodeGen/Hexagon/brev_st.ll
@@ -26,7 +26,7 @@ entry:
   %0 = bitcast i16* %arrayidx to i8*
   %sub = sub i32 13, %shr2
   %shl = shl i32 1, %sub
-; CHECK: memd(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
+; CHECK: memd(r{{[0-9]*}}++m{{[0-1]}}:brev)
   %1 = tail call i8* @llvm.hexagon.brev.std(i8* %0, i64 undef, i32 %shl)
   ret i64 0
 }
@@ -42,7 +42,7 @@ entry:
   %0 = bitcast i16* %arrayidx to i8*
   %sub = sub i32 14, %shr1
   %shl = shl i32 1, %sub
-; CHECK: memw(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
+; CHECK: memw(r{{[0-9]*}}++m{{[0-1]}}:brev)
   %1 = tail call i8* @llvm.hexagon.brev.stw(i8* %0, i32 undef, i32 %shl)
   ret i32 0
 }
@@ -58,7 +58,7 @@ entry:
   %0 = bitcast i16* %arrayidx to i8*
   %sub = sub i32 15, %shr2
   %shl = shl i32 1, %sub
-; CHECK: memh(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
+; CHECK: memh(r{{[0-9]*}}++m{{[0-1]}}:brev)
   %1 = tail call i8* @llvm.hexagon.brev.sth(i8* %0, i32 0, i32 %shl)
   ret i16 0
 }
@@ -74,7 +74,7 @@ entry:
   %0 = bitcast i16* %arrayidx to i8*
   %sub = sub i32 15, %shr2
   %shl = shl i32 1, %sub
-; CHECK: memh(r{{[0-9]*}} ++ m{{[0-1]}}:brev){{ *}}={{ *}}r{{[0-9]*}}.h
+; CHECK: memh(r{{[0-9]*}}++m{{[0-1]}}:brev) = r{{[0-9]*}}.h
   %1 = tail call i8* @llvm.hexagon.brev.sthhi(i8* %0, i32 0, i32 %shl)
   ret i16 0
 }
@@ -89,7 +89,7 @@ entry:
   %arrayidx = getelementptr inbounds i16, i16* %filtMemLR, i32 %idxprom
   %0 = bitcast i16* %arrayidx to i8*
   %sub = sub nsw i32 16, %shr2
-  ; CHECK: memb(r{{[0-9]*}} ++ m{{[0-1]}}:brev)
+  ; CHECK: memb(r{{[0-9]*}}++m{{[0-1]}}:brev)
   %shl = shl i32 1, %sub
   %1 = tail call i8* @llvm.hexagon.brev.stb(i8* %0, i32 0, i32 %shl)
   ret i8 0
diff --git a/test/CodeGen/Hexagon/builtin-expect.ll b/test/CodeGen/Hexagon/builtin-expect.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9945da1782b2fd4c924122aeea113d4a7744ac7c
--- /dev/null
+++ b/test/CodeGen/Hexagon/builtin-expect.ll
@@ -0,0 +1,44 @@
+; RUN: llc -march=hexagon -disable-block-placement < %s | FileCheck %s
+
+; Check that the branch to the block b10 is marked as taken (i.e. ":t").
+; CHECK-LABEL: foo
+; CHECK: if ({{.*}}) jump:t .LBB0_[[LAB:[0-9]+]]
+; CHECK: [[LAB]]:
+; CHECK: add({{.*}},#65)
+
+target triple = "hexagon"
+
+define i32 @foo(i32 %a0) local_unnamed_addr #0 {
+b1:
+  %v2 = icmp eq i32 %a0, 0
+  br i1 %v2, label %b3, label %b10, !prof !0
+
+b3:                                               ; preds = %b1
+  br label %b4
+
+b4:                                               ; preds = %b4, %b3
+  %v5 = phi i32 [ %v6, %b4 ], [ 0, %b3 ]
+  %v6 = add nuw nsw i32 %v5, 1
+  %v7 = mul nuw nsw i32 %v5, 67
+  %v8 = tail call i32 @bar(i32 %v7) #0
+  %v9 = icmp eq i32 %v6, 10
+  br i1 %v9, label %b13, label %b4
+
+b10:                                              ; preds = %b1
+  %v11 = add nsw i32 %a0, 65
+  %v12 = tail call i32 @bar(i32 %v11) #0
+  br label %b14
+
+b13:                                              ; preds = %b4
+  br label %b14
+
+b14:                                              ; preds = %b13, %b10
+  %v15 = phi i32 [ %v12, %b10 ], [ 0, %b13 ]
+  ret i32 %v15
+}
+
+declare i32 @bar(i32) local_unnamed_addr #0
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,-hvx-double,-long-calls" }
+
+!0 = !{!"branch_weights", i32 1, i32 2000}
diff --git a/test/CodeGen/Hexagon/cext-valid-packet1.ll b/test/CodeGen/Hexagon/cext-valid-packet1.ll
index 36abc59f5e3e6c7eb09af9a707e1f7011a7d82c3..b0aa3c16f862f0ddf54a6f0382731ba4ff83f0e6 100644
--- a/test/CodeGen/Hexagon/cext-valid-packet1.ll
+++ b/test/CodeGen/Hexagon/cext-valid-packet1.ll
@@ -3,8 +3,8 @@
 ; Check that the packetizer generates valid packets with constant
 ; extended instructions.
 ; CHECK: {
-; CHECK-NEXT: r{{[0-9]+}}{{ *}}={{ *}}add(r{{[0-9]+}}, ##{{[0-9]+}})
-; CHECK-NEXT: r{{[0-9]+}}{{ *}}={{ *}}add(r{{[0-9]+}}, ##{{[0-9]+}})
+; CHECK-NEXT: r{{[0-9]+}} = add(r{{[0-9]+}},##{{[0-9]+}})
+; CHECK-NEXT: r{{[0-9]+}} = add(r{{[0-9]+}},##{{[0-9]+}})
 ; CHECK-NEXT: }
 
 define i32 @check-packet1(i32 %a, i32 %b, i32 %c) nounwind readnone {
diff --git a/test/CodeGen/Hexagon/circ_ld.ll b/test/CodeGen/Hexagon/circ_ld.ll
index ffa5f2cd2220f76b1acd8cd8efc96c11a72b5cd4..a9b367e9c4ee73dc78a3585cbdf59601567608fe 100644
--- a/test/CodeGen/Hexagon/circ_ld.ll
+++ b/test/CodeGen/Hexagon/circ_ld.ll
@@ -26,7 +26,7 @@ entry:
   %arrayidx = getelementptr inbounds i16, i16* %filtMemLR, i32 %idxprom
   %0 = bitcast i16* %arrayidx to i8*
   %or = or i32 %shr1, 33554432
-; CHECK: = memb(r{{[0-9]*.}}++{{.}}#-1:circ(m{{[0-1]}}))
+; CHECK: = memb(r{{[0-9]*}}++#-1:circ(m{{[0-1]}}))
   %1 = call i8* @llvm.hexagon.circ.ldb(i8* %0, i8* %inputLR, i32 %or, i32 -1)
   %2 = load i8, i8* %inputLR, align 1, !tbaa !0
   ret i8 %2
@@ -45,7 +45,7 @@ entry:
   %1 = bitcast i64* %inputLR to i8*
   %shl = shl nuw nsw i32 %shr1, 3
   %or = or i32 %shl, 83886080
-; CHECK: = memd(r{{[0-9]*.}}++{{.}}#-8:circ(m{{[0-1]}}))
+; CHECK: = memd(r{{[0-9]*}}++#-8:circ(m{{[0-1]}}))
   %2 = call i8* @llvm.hexagon.circ.ldd(i8* %0, i8* %1, i32 %or, i32 -8)
   %3 = bitcast i8* %1 to i64*
   %4 = load i64, i64* %3, align 8, !tbaa !0
@@ -64,7 +64,7 @@ entry:
   %0 = bitcast i16* %arrayidx to i8*
   %1 = bitcast i16* %inputLR to i8*
   %or = or i32 %shr1, 50331648
-; CHECK: = memh(r{{[0-9]*.}}++{{.}}#-2:circ(m{{[0-1]}}))
+; CHECK: = memh(r{{[0-9]*}}++#-2:circ(m{{[0-1]}}))
   %2 = call i8* @llvm.hexagon.circ.ldh(i8* %0, i8* %1, i32 %or, i32 -2)
   %3 = bitcast i8* %1 to i16*
   %4 = load i16, i16* %3, align 2, !tbaa !2
@@ -82,7 +82,7 @@ entry:
   %arrayidx = getelementptr inbounds i16, i16* %filtMemLR, i32 %idxprom
   %0 = bitcast i16* %arrayidx to i8*
   %or = or i32 %shr1, 33554432
-; CHECK: = memub(r{{[0-9]*.}}++{{.}}#-1:circ(m{{[0-1]}}))
+; CHECK: = memub(r{{[0-9]*}}++#-1:circ(m{{[0-1]}}))
   %1 = call i8* @llvm.hexagon.circ.ldub(i8* %0, i8* %inputLR, i32 %or, i32 -1)
   %2 = load i8, i8* %inputLR, align 1, !tbaa !0
   ret i8 %2
@@ -100,7 +100,7 @@ entry:
   %0 = bitcast i16* %arrayidx to i8*
   %1 = bitcast i16* %inputLR to i8*
   %or = or i32 %shr1, 50331648
-; CHECK: = memuh(r{{[0-9]*.}}++{{.}}#-2:circ(m{{[0-1]}}))
+; CHECK: = memuh(r{{[0-9]*}}++#-2:circ(m{{[0-1]}}))
   %2 = call i8* @llvm.hexagon.circ.lduh(i8* %0, i8* %1, i32 %or, i32 -2)
   %3 = bitcast i8* %1 to i16*
   %4 = load i16, i16* %3, align 2, !tbaa !2
@@ -120,7 +120,7 @@ entry:
   %1 = bitcast i32* %inputLR to i8*
   %shl = shl nuw nsw i32 %shr1, 2
   %or = or i32 %shl, 67108864
-; CHECK: = memw(r{{[0-9]*.}}++{{.}}#-4:circ(m{{[0-1]}}))
+; CHECK: = memw(r{{[0-9]*}}++#-4:circ(m{{[0-1]}}))
   %2 = call i8* @llvm.hexagon.circ.ldw(i8* %0, i8* %1, i32 %or, i32 -4)
   %3 = bitcast i8* %1 to i32*
   %4 = load i32, i32* %3, align 4, !tbaa !3
diff --git a/test/CodeGen/Hexagon/circ_ldw.ll b/test/CodeGen/Hexagon/circ_ldw.ll
index 4511a9cf69da7335c64cee92d954d0ab9e75cf1f..abfb0886c686ca43efa227079e2d0e622a45cd27 100644
--- a/test/CodeGen/Hexagon/circ_ldw.ll
+++ b/test/CodeGen/Hexagon/circ_ldw.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
-; CHECK:  r{{[0-9]*}} = memw(r{{[0-9]*.}}++{{.}}#-4:circ(m0))
+; CHECK: r{{[0-9]*}} = memw(r{{[0-9]*}}++#-4:circ(m0))
 
 
 %union.vect64 = type { i64 }
diff --git a/test/CodeGen/Hexagon/circ_st.ll b/test/CodeGen/Hexagon/circ_st.ll
index 4b54afbc611d846090cd8283b9b35203b1f50d85..c8fa256ad48abd417114436c243a4801db78bace 100644
--- a/test/CodeGen/Hexagon/circ_st.ll
+++ b/test/CodeGen/Hexagon/circ_st.ll
@@ -23,7 +23,7 @@ entry:
   %arrayidx = getelementptr inbounds i16, i16* %filtMemLR, i32 %idxprom
   %0 = bitcast i16* %arrayidx to i8*
   %or = or i32 %shr2, 33554432
-; CHECK: memb(r{{[0-9]*}}{{.}}++{{.}}#-1:circ(m{{[0-1]}}))
+; CHECK: memb(r{{[0-9]*}}++#-1:circ(m{{[0-1]}}))
   %1 = tail call i8* @llvm.hexagon.circ.stb(i8* %0, i32 0, i32 %or, i32 -1)
   ret i8 0
 }
@@ -39,7 +39,7 @@ entry:
   %0 = bitcast i16* %arrayidx to i8*
   %shl = shl nuw nsw i32 %shr1, 3
   %or = or i32 %shl, 83886080
-; CHECK: memd(r{{[0-9]*}}{{.}}++{{.}}#-8:circ(m{{[0-1]}}))
+; CHECK: memd(r{{[0-9]*}}++#-8:circ(m{{[0-1]}}))
   %1 = tail call i8* @llvm.hexagon.circ.std(i8* %0, i64 undef, i32 %or, i32 -8)
   ret i64 0
 }
@@ -54,7 +54,7 @@ entry:
   %arrayidx = getelementptr inbounds i16, i16* %filtMemLR, i32 %idxprom
   %0 = bitcast i16* %arrayidx to i8*
   %or = or i32 %shr2, 50331648
-; CHECK: memh(r{{[0-9]*}}{{.}}++{{.}}#-2:circ(m{{[0-1]}}))
+; CHECK: memh(r{{[0-9]*}}++#-2:circ(m{{[0-1]}}))
   %1 = tail call i8* @llvm.hexagon.circ.sth(i8* %0, i32 0, i32 %or, i32 -2)
   ret i16 0
 }
@@ -69,7 +69,7 @@ entry:
   %arrayidx = getelementptr inbounds i16, i16* %filtMemLR, i32 %idxprom
   %0 = bitcast i16* %arrayidx to i8*
   %or = or i32 %shr2, 50331648
-; CHECK: memh(r{{[0-9]*}}{{.}}++{{.}}#-2:circ(m{{[0-1]}})){{ *}}={{ *}}r{{[0-9]*}}.h
+; CHECK: memh(r{{[0-9]*}}++#-2:circ(m{{[0-1]}})) = r{{[0-9]*}}.h
   %1 = tail call i8* @llvm.hexagon.circ.sthhi(i8* %0, i32 0, i32 %or, i32 -2)
   ret i16 0
 }
@@ -85,7 +85,7 @@ entry:
   %0 = bitcast i16* %arrayidx to i8*
   %shl = shl nuw nsw i32 %shr1, 2
   %or = or i32 %shl, 67108864
-; CHECK: memw(r{{[0-9]*}}{{.}}++{{.}}#-4:circ(m{{[0-1]}}))
+; CHECK: memw(r{{[0-9]*}}++#-4:circ(m{{[0-1]}}))
   %1 = tail call i8* @llvm.hexagon.circ.stw(i8* %0, i32 undef, i32 %or, i32 -4)
   ret i32 0
 }
diff --git a/test/CodeGen/Hexagon/clr_set_toggle.ll b/test/CodeGen/Hexagon/clr_set_toggle.ll
index 19e3ed0cf89740d8003308201f0ed3a1c21d7efb..4e9838316522d00257877370a0a56dd3e32d7175 100644
--- a/test/CodeGen/Hexagon/clr_set_toggle.ll
+++ b/test/CodeGen/Hexagon/clr_set_toggle.ll
@@ -4,7 +4,7 @@
 define i32 @my_clrbit(i32 %x) nounwind {
 entry:
 ; CHECK-LABEL: my_clrbit
-; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}}, #31)
+; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}},#31)
   %x.addr = alloca i32, align 4
   store i32 %x, i32* %x.addr, align 4
   %0 = load i32, i32* %x.addr, align 4
@@ -15,7 +15,7 @@ entry:
 define i64 @my_clrbit2(i64 %x) nounwind {
 entry:
 ; CHECK-LABEL: my_clrbit2
-; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}}, #31)
+; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}},#31)
   %x.addr = alloca i64, align 8
   store i64 %x, i64* %x.addr, align 8
   %0 = load i64, i64* %x.addr, align 8
@@ -26,7 +26,7 @@ entry:
 define i64 @my_clrbit3(i64 %x) nounwind {
 entry:
 ; CHECK-LABEL: my_clrbit3
-; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}}, #31)
+; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}},#31)
   %x.addr = alloca i64, align 8
   store i64 %x, i64* %x.addr, align 8
   %0 = load i64, i64* %x.addr, align 8
@@ -37,7 +37,7 @@ entry:
 define i32 @my_clrbit4(i32 %x) nounwind {
 entry:
 ; CHECK-LABEL: my_clrbit4
-; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}}, #13)
+; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}},#13)
   %x.addr = alloca i32, align 4
   store i32 %x, i32* %x.addr, align 4
   %0 = load i32, i32* %x.addr, align 4
@@ -48,7 +48,7 @@ entry:
 define i64 @my_clrbit5(i64 %x) nounwind {
 entry:
 ; CHECK-LABEL: my_clrbit5
-; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}}, #13)
+; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}},#13)
   %x.addr = alloca i64, align 8
   store i64 %x, i64* %x.addr, align 8
   %0 = load i64, i64* %x.addr, align 8
@@ -59,7 +59,7 @@ entry:
 define i64 @my_clrbit6(i64 %x) nounwind {
 entry:
 ; CHECK-LABEL: my_clrbit6
-; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}}, #27)
+; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}},#27)
   %x.addr = alloca i64, align 8
   store i64 %x, i64* %x.addr, align 8
   %0 = load i64, i64* %x.addr, align 8
@@ -70,7 +70,7 @@ entry:
 define zeroext i16 @my_setbit(i16 zeroext %crc) nounwind {
 entry:
 ; CHECK-LABEL: my_setbit
-; CHECK: memh(r{{[0-9]+}}+#{{[0-9]+}}){{ *}}={{ *}}setbit(#15)
+; CHECK: memh(r{{[0-9]+}}+#{{[0-9]+}}) = setbit(#15)
   %crc.addr = alloca i16, align 2
   store i16 %crc, i16* %crc.addr, align 2
   %0 = load i16, i16* %crc.addr, align 2
@@ -85,7 +85,7 @@ entry:
 define i32 @my_setbit2(i32 %x) nounwind {
 entry:
 ; CHECK-LABEL: my_setbit2
-; CHECK: r{{[0-9]+}}{{ *}}={{ *}}setbit(r{{[0-9]+}}, #15)
+; CHECK: r{{[0-9]+}} = setbit(r{{[0-9]+}},#15)
   %x.addr = alloca i32, align 4
   store i32 %x, i32* %x.addr, align 4
   %0 = load i32, i32* %x.addr, align 4
@@ -96,7 +96,7 @@ entry:
 define i64 @my_setbit3(i64 %x) nounwind {
 entry:
 ; CHECK-LABEL: my_setbit3
-; CHECK: r{{[0-9]+}}{{ *}}={{ *}}setbit(r{{[0-9]+}}, #15)
+; CHECK: r{{[0-9]+}} = setbit(r{{[0-9]+}},#15)
   %x.addr = alloca i64, align 8
   store i64 %x, i64* %x.addr, align 8
   %0 = load i64, i64* %x.addr, align 8
@@ -107,7 +107,7 @@ entry:
 define i32 @my_setbit4(i32 %x) nounwind {
 entry:
 ; CHECK-LABEL: my_setbit4
-; CHECK: r{{[0-9]+}}{{ *}}={{ *}}setbit(r{{[0-9]+}}, #31)
+; CHECK: r{{[0-9]+}} = setbit(r{{[0-9]+}},#31)
   %x.addr = alloca i32, align 4
   store i32 %x, i32* %x.addr, align 4
   %0 = load i32, i32* %x.addr, align 4
@@ -118,7 +118,7 @@ entry:
 define i64 @my_setbit5(i64 %x) nounwind {
 entry:
 ; CHECK-LABEL: my_setbit5
-; CHECK: r{{[0-9]+}}{{ *}}={{ *}}setbit(r{{[0-9]+}}, #13)
+; CHECK: r{{[0-9]+}} = setbit(r{{[0-9]+}},#13)
   %x.addr = alloca i64, align 8
   store i64 %x, i64* %x.addr, align 8
   %0 = load i64, i64* %x.addr, align 8
@@ -129,7 +129,7 @@ entry:
 define zeroext i16 @my_togglebit(i16 zeroext %crc) nounwind {
 entry:
 ; CHECK-LABEL: my_togglebit
-; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}}, #15)
+; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}},#15)
   %crc.addr = alloca i16, align 2
   store i16 %crc, i16* %crc.addr, align 2
   %0 = load i16, i16* %crc.addr, align 2
@@ -144,7 +144,7 @@ entry:
 define i32 @my_togglebit2(i32 %x) nounwind {
 entry:
 ; CHECK-LABEL: my_togglebit2
-; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}}, #15)
+; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}},#15)
   %x.addr = alloca i32, align 4
   store i32 %x, i32* %x.addr, align 4
   %0 = load i32, i32* %x.addr, align 4
@@ -155,7 +155,7 @@ entry:
 define i64 @my_togglebit3(i64 %x) nounwind {
 entry:
 ; CHECK-LABEL: my_togglebit3
-; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}}, #15)
+; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}},#15)
   %x.addr = alloca i64, align 8
   store i64 %x, i64* %x.addr, align 8
   %0 = load i64, i64* %x.addr, align 8
@@ -166,7 +166,7 @@ entry:
 define i64 @my_togglebit4(i64 %x) nounwind {
 entry:
 ; CHECK-LABEL: my_togglebit4
-; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}}, #20)
+; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}},#20)
   %x.addr = alloca i64, align 8
   store i64 %x, i64* %x.addr, align 8
   %0 = load i64, i64* %x.addr, align 8
diff --git a/test/CodeGen/Hexagon/cmp.ll b/test/CodeGen/Hexagon/cmp.ll
index c274a787249a5ace86a78e3c3f62f7284d4171ab..a0bb90de1c276a7e203d15c6e39def8684556ac1 100644
--- a/test/CodeGen/Hexagon/cmp.ll
+++ b/test/CodeGen/Hexagon/cmp.ll
@@ -9,7 +9,7 @@ entry:
   %1 = call i32 @llvm.hexagon.C2.cmpeq(i32 %0, i32 1)
   ret i32 %1
 }
-; CHECK: { p{{[0-3]}} = cmp.eq(r{{[0-9]}}, r{{[0-9]}})
+; CHECK: { p{{[0-3]}} = cmp.eq(r{{[0-9]}},r{{[0-9]}})
 
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.hexagon.C2.cmpeq(i32, i32) #1
@@ -23,7 +23,7 @@ entry:
   %1 = call i32 @llvm.hexagon.C2.cmpgt(i32 %0, i32 2)
   ret i32 %1
 }
-; CHECK: { p{{[0-3]}} = cmp.gt(r{{[0-9]}}, r{{[0-9]}})
+; CHECK: { p{{[0-3]}} = cmp.gt(r{{[0-9]}},r{{[0-9]}})
 
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.hexagon.C2.cmpgt(i32, i32) #1
@@ -37,7 +37,7 @@ entry:
   %1 = call i32 @llvm.hexagon.C2.cmpgtu(i32 %0, i32 3)
   ret i32 %1
 }
-; CHECK: { p{{[0-3]}} = cmp.gtu(r{{[0-9]}}, r{{[0-9]}})
+; CHECK: { p{{[0-3]}} = cmp.gtu(r{{[0-9]}},r{{[0-9]}})
 
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.hexagon.C2.cmpgtu(i32, i32) #1
@@ -51,7 +51,7 @@ entry:
   %1 = call i32 @llvm.hexagon.C2.cmplt(i32 %0, i32 4)
   ret i32 %1
 }
-; CHECK: { p{{[0-3]}} = cmp.gt(r{{[0-9]}}, r{{[0-9]}})
+; CHECK: { p{{[0-3]}} = cmp.gt(r{{[0-9]}},r{{[0-9]}})
 
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.hexagon.C2.cmplt(i32, i32) #1
@@ -65,7 +65,7 @@ entry:
   %1 = call i32 @llvm.hexagon.C2.cmpltu(i32 %0, i32 5)
   ret i32 %1
 }
-; CHECK: { p{{[0-3]}} = cmp.gtu(r{{[0-9]}}, r{{[0-9]}})
+; CHECK: { p{{[0-3]}} = cmp.gtu(r{{[0-9]}},r{{[0-9]}})
 
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.hexagon.C2.cmpltu(i32, i32) #1
@@ -79,7 +79,7 @@ entry:
   %1 = call i32 @llvm.hexagon.C2.cmpeqi(i32 %0, i32 10)
   ret i32 %1
 }
-; CHECK: { p{{[0-3]}} = cmp.eq(r{{[0-9]}}, {{.*}}#10)
+; CHECK: { p{{[0-3]}} = cmp.eq(r{{[0-9]}},#10)
 
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.hexagon.C2.cmpeqi(i32, i32) #1
@@ -93,7 +93,7 @@ entry:
   %1 = call i32 @llvm.hexagon.C2.cmpgti(i32 %0, i32 20)
   ret i32 %1
 }
-; CHECK: { p{{[0-3]}} = cmp.gt(r{{[0-9]}}, {{.*}}#20)
+; CHECK: { p{{[0-3]}} = cmp.gt(r{{[0-9]}},#20)
 
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.hexagon.C2.cmpgti(i32, i32) #1
@@ -107,7 +107,7 @@ entry:
   %1 = call i32 @llvm.hexagon.C2.cmpgtui(i32 %0, i32 40)
   ret i32 %1
 }
-; CHECK: { p{{[0-3]}} = cmp.gtu(r{{[0-9]}}, {{.*}}#40)
+; CHECK: { p{{[0-3]}} = cmp.gtu(r{{[0-9]}},#40)
 
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.hexagon.C2.cmpgtui(i32, i32) #1
@@ -121,7 +121,7 @@ entry:
   %1 = call i32 @llvm.hexagon.C2.cmpgei(i32 %0, i32 3)
   ret i32 %1
 }
-; CHECK: { p{{[0-3]}} = cmp.gt(r{{[0-9]}}, {{.*}}#2)
+; CHECK: { p{{[0-3]}} = cmp.gt(r{{[0-9]}},#2)
 
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.hexagon.C2.cmpgei(i32, i32) #1
@@ -135,7 +135,7 @@ entry:
   %1 = call i32 @llvm.hexagon.C2.cmpgeui(i32 %0, i32 3)
   ret i32 %1
 }
-; CHECK: { p{{[0-3]}} = cmp.gtu(r{{[0-9]}}, {{.*}}#2)
+; CHECK: { p{{[0-3]}} = cmp.gtu(r{{[0-9]}},#2)
 
 ; Function Attrs: nounwind readnone
 declare i32 @llvm.hexagon.C2.cmpgeui(i32, i32) #1
@@ -149,7 +149,7 @@ entry:
   %1 = call i32 @llvm.hexagon.C2.cmpgeui(i32 %0, i32 0)
   ret i32 %1
 }
-; CHECK: { p{{[0-3]}} = cmp.eq(r{{[0-9]}}, r{{[0-9]}})
+; CHECK: { p{{[0-3]}} = cmp.eq(r{{[0-9]}},r{{[0-9]}})
 
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Hexagon/combine.ll b/test/CodeGen/Hexagon/combine.ll
index 04a080fdf42599db388e3bbfbe71a8b6bd214b5d..5b71b36656673cade35ec353811bcda0a97a3ce9 100644
--- a/test/CodeGen/Hexagon/combine.ll
+++ b/test/CodeGen/Hexagon/combine.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=hexagon -mcpu=hexagonv5 -disable-hsdr -hexagon-bit=0 < %s | FileCheck %s
-; CHECK: combine(r{{[0-9]+}}, r{{[0-9]+}})
+; CHECK: combine(r{{[0-9]+}},r{{[0-9]+}})
 
 @j = external global i32
 @k = external global i64
diff --git a/test/CodeGen/Hexagon/compound.ll b/test/CodeGen/Hexagon/compound.ll
index f8d36b8b77d90747974efaae15152a1f2e4ea468..a3bd52f97194f675694ef3a02ebb364fa0b4e513 100644
--- a/test/CodeGen/Hexagon/compound.ll
+++ b/test/CodeGen/Hexagon/compound.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -filetype=obj -o - %s | llvm-objdump -d - | FileCheck %s
+; RUN: llc -march=hexagon -filetype=obj -ifcvt-limit=0 -o - %s | llvm-objdump -d - | FileCheck %s
 
 ; CHECK: p0 = cmp.gt(r0,#-1); if (!p0.new) jump:nt
 
@@ -14,4 +14,4 @@ ret void
 y:
 call void @b()
 ret void
-}
\ No newline at end of file
+}
diff --git a/test/CodeGen/Hexagon/constp-combine-neg.ll b/test/CodeGen/Hexagon/constp-combine-neg.ll
index 18f0e81076af2b882a6ce59b15d4c2d44d24b8b6..089d9f6a9984a12c7fb601f8b972eee7483075e9 100644
--- a/test/CodeGen/Hexagon/constp-combine-neg.ll
+++ b/test/CodeGen/Hexagon/constp-combine-neg.ll
@@ -19,9 +19,9 @@ entry:
 ; The instructions seem to be in a different order in the .s file than
 ; the corresponding values in the .ll file, so just run the test three
 ; times and each time test for a different instruction.
-; CHECK-TEST1: combine(#-2, #3)
-; CHECK-TEST2: combine(#6, #-4)
-; CHECK-TEST3: combine(#-10, #-8)
+; CHECK-TEST1: combine(#-2,#3)
+; CHECK-TEST2: combine(#6,#-4)
+; CHECK-TEST3: combine(#-10,#-8)
 
 attributes #0 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
diff --git a/test/CodeGen/Hexagon/convert-to-dot-old.ll b/test/CodeGen/Hexagon/convert-to-dot-old.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b793fa0c22cd57f3a5def44bb8a8c802f2224951
--- /dev/null
+++ b/test/CodeGen/Hexagon/convert-to-dot-old.ll
@@ -0,0 +1,110 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv55 -filetype=obj -o /dev/null < %s
+; REQUIRES: asserts
+; There should be no output (nothing on stderr).
+
+; Due to a bug in converting a dot-new branch into a dot-old one, opcodes
+; with branch prediction bits were selected even if the architecture did
+; not support them. On V55-, the dot-old branch opcodes are J2_jumpt and
+; J2_jumpf (and a pair of J2_jumpr*), whereas J2_jumptpt could have been
+; a result of the conversion to dot-old. This would fail a verification
+; check in the MC code emitter, so make sure it does not happen.
+
+target triple = "hexagon"
+
+define void @fred(i16* nocapture %a0, i16* nocapture %a1, i16* nocapture %a2, i16 signext %a3, i16* %a4, i16 signext %a5, i16 signext %a6, i16 signext %a7, i32 %a8, i16 signext %a9, i16 signext %a10) local_unnamed_addr #0 {
+b11:
+  %v12 = sext i16 %a5 to i32
+  %v13 = tail call i32 @llvm.hexagon.A2.sxth(i32 %v12)
+  %v14 = tail call i32 @llvm.hexagon.A2.sxth(i32 2)
+  %v15 = tail call i32 @llvm.hexagon.A2.sxth(i32 undef)
+  %v16 = tail call i32 @llvm.hexagon.A2.sath(i32 undef)
+  %v17 = tail call i32 @llvm.hexagon.A2.sxth(i32 %v16)
+  %v18 = tail call i32 @llvm.hexagon.A2.aslh(i32 undef)
+  %v19 = tail call i32 @llvm.hexagon.S2.asr.r.r.sat(i32 %v18, i32 %v14)
+  %v20 = tail call i32 @llvm.hexagon.A2.asrh(i32 %v19)
+  %v21 = tail call i32 @llvm.hexagon.A2.sxth(i32 %v20)
+  %v22 = tail call i32 @llvm.hexagon.A2.sub(i32 %v17, i32 %v21)
+  %v23 = tail call i32 @llvm.hexagon.A2.sath(i32 %v22)
+  %v24 = select i1 undef, i32 undef, i32 %v23
+  %v25 = tail call i32 @llvm.hexagon.A2.sxth(i32 %v24)
+  %v26 = tail call i32 @llvm.hexagon.A2.sub(i32 %v13, i32 %v25)
+  %v27 = tail call i32 @llvm.hexagon.A2.sath(i32 %v26)
+  %v28 = tail call i32 @llvm.hexagon.A2.sxth(i32 %v27)
+  %v29 = tail call i32 @llvm.hexagon.A2.sub(i32 %v28, i32 %v14)
+  %v30 = tail call i32 @llvm.hexagon.A2.sath(i32 %v29)
+  %v31 = shl i32 %v30, 16
+  %v32 = icmp sgt i32 undef, %v31
+  %v33 = select i1 %v32, i32 %v30, i32 undef
+  %v34 = trunc i32 %v33 to i16
+  %v35 = trunc i32 %v24 to i16
+  call void @foo(i16* nonnull undef, i32* nonnull undef, i16* %a4, i16 signext %v35, i16 signext %v34, i16 signext 2) #4
+  %v36 = call i32 @llvm.hexagon.S2.asr.r.r.sat(i32 %v18, i32 undef)
+  %v37 = call i32 @llvm.hexagon.A2.asrh(i32 %v36)
+  %v38 = call i32 @llvm.hexagon.A2.sub(i32 %v13, i32 undef)
+  %v39 = call i32 @llvm.hexagon.A2.sath(i32 %v38)
+  %v40 = call i32 @llvm.hexagon.A2.sxth(i32 %v39)
+  %v41 = call i32 @llvm.hexagon.A2.sub(i32 %v40, i32 %v14)
+  %v42 = call i32 @llvm.hexagon.A2.sath(i32 %v41)
+  %v43 = select i1 undef, i32 %v42, i32 %v37
+  %v44 = trunc i32 %v43 to i16
+  call void @foo(i16* nonnull undef, i32* nonnull undef, i16* %a4, i16 signext undef, i16 signext %v44, i16 signext 2) #4
+  %v45 = call i32 @llvm.hexagon.A2.sath(i32 undef)
+  %v46 = select i1 undef, i32 undef, i32 %v45
+  %v47 = trunc i32 %v46 to i16
+  call void @foo(i16* nonnull undef, i32* nonnull undef, i16* %a4, i16 signext %v47, i16 signext undef, i16 signext 2) #4
+  %v48 = call i32 @llvm.hexagon.A2.sub(i32 undef, i32 %v15)
+  %v49 = call i32 @llvm.hexagon.A2.sath(i32 %v48)
+  %v50 = trunc i32 %v49 to i16
+  store i16 %v50, i16* undef, align 2
+  store i16 %a3, i16* %a0, align 2
+  %v51 = sext i16 %a10 to i32
+  %v52 = call i32 @llvm.hexagon.A2.sxth(i32 %v51)
+  %v53 = call i32 @llvm.hexagon.A2.add(i32 undef, i32 %v52)
+  %v54 = call i32 @llvm.hexagon.A2.sath(i32 %v53)
+  %v55 = trunc i32 %v54 to i16
+  store i16 %v55, i16* %a1, align 2
+  store i16 %a7, i16* %a2, align 2
+  %v56 = sext i16 %a9 to i32
+  %v57 = call i32 @llvm.hexagon.A2.sxth(i32 %v56)
+  br i1 undef, label %b58, label %b62
+
+b58:                                              ; preds = %b11
+  %v59 = call i32 @llvm.hexagon.A2.add(i32 %v57, i32 %v52)
+  %v60 = call i32 @llvm.hexagon.A2.sath(i32 %v59)
+  %v61 = trunc i32 %v60 to i16
+  store i16 %v61, i16* %a1, align 2
+  br label %b63
+
+b62:                                              ; preds = %b11
+  br label %b63
+
+b63:                                              ; preds = %b62, %b58
+  %v64 = phi i16 [ undef, %b58 ], [ %a9, %b62 ]
+  %v65 = icmp slt i16 undef, %v64
+  br i1 %v65, label %b66, label %b67
+
+b66:                                              ; preds = %b63
+  br i1 undef, label %b67, label %b68
+
+b67:                                              ; preds = %b66, %b63
+  store i16 0, i16* %a2, align 2
+  br label %b68
+
+b68:                                              ; preds = %b67, %b66
+  ret void
+}
+
+declare i32 @llvm.hexagon.A2.sath(i32) #2
+declare i32 @llvm.hexagon.A2.add(i32, i32) #2
+declare i32 @llvm.hexagon.A2.sxth(i32) #2
+declare i32 @llvm.hexagon.A2.sub(i32, i32) #2
+declare i32 @llvm.hexagon.A2.asrh(i32) #2
+declare i32 @llvm.hexagon.S2.asr.r.r.sat(i32, i32) #2
+declare i32 @llvm.hexagon.A2.aslh(i32) #2
+declare void @foo(i16*, i32*, i16*, i16 signext, i16 signext, i16 signext) local_unnamed_addr #3
+
+attributes #0 = { nounwind optsize "target-cpu"="hexagonv55" "target-features"="-hvx,-hvx-double,-long-calls" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { nounwind readnone }
+attributes #3 = { optsize "target-cpu"="hexagonv55" "target-features"="-hvx,-hvx-double,-long-calls" }
+attributes #4 = { nounwind optsize }
diff --git a/test/CodeGen/Hexagon/ctlz-cttz-ctpop.ll b/test/CodeGen/Hexagon/ctlz-cttz-ctpop.ll
deleted file mode 100644
index b8f483298f8c34afa3d057193100e7c90b3cbd00..0000000000000000000000000000000000000000
--- a/test/CodeGen/Hexagon/ctlz-cttz-ctpop.ll
+++ /dev/null
@@ -1,36 +0,0 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-
-; CHECK-DAG: ct0({{r[0-9]*:[0-9]*}})
-; CHECK-DAG: cl0({{r[0-9]*:[0-9]*}})
-; CHECK-DAG: ct0({{r[0-9]*}})
-; CHECK-DAG: cl0({{r[0-9]*}})
-; CHECK-DAG: r{{[0-9]+}} += lsr(r{{[0-9]+}}, #4)
-
-define i32 @foo(i64 %a, i32 %b) nounwind  {
-entry:
-        %tmp0 = tail call i64 @llvm.ctlz.i64( i64 %a, i1 true )
-        %tmp1 = tail call i64 @llvm.cttz.i64( i64 %a, i1 true )
-        %tmp2 = tail call i32 @llvm.ctlz.i32( i32 %b, i1 true )
-        %tmp3 = tail call i32 @llvm.cttz.i32( i32 %b, i1 true )
-        %tmp4 = tail call i64 @llvm.ctpop.i64( i64 %a )
-        %tmp5 = tail call i32 @llvm.ctpop.i32( i32 %b )
-
-
-        %tmp6 = trunc i64 %tmp0 to i32
-        %tmp7 = trunc i64 %tmp1 to i32
-        %tmp8 = trunc i64 %tmp4 to i32
-        %tmp9 = add i32 %tmp6, %tmp7
-        %tmp10 = add i32 %tmp9, %tmp8
-        %tmp11 = add i32 %tmp10, %tmp2
-        %tmp12 = add i32 %tmp11, %tmp3
-        %tmp13 = add i32 %tmp12, %tmp5
-
-        ret i32 %tmp13
-}
-
-declare i64 @llvm.ctlz.i64(i64, i1) nounwind readnone
-declare i32 @llvm.ctlz.i32(i32, i1) nounwind readnone
-declare i64 @llvm.cttz.i64(i64, i1) nounwind readnone
-declare i32 @llvm.cttz.i32(i32, i1) nounwind readnone
-declare i64 @llvm.ctpop.i64(i64) nounwind readnone
-declare i32 @llvm.ctpop.i32(i32) nounwind readnone
diff --git a/test/CodeGen/Hexagon/dead-store-stack.ll b/test/CodeGen/Hexagon/dead-store-stack.ll
index 93d324baad9e76618b24b915c3b7cc5ec8714a99..0d8124e76b90388d301a34f358f28bb689d2a8ee 100644
--- a/test/CodeGen/Hexagon/dead-store-stack.ll
+++ b/test/CodeGen/Hexagon/dead-store-stack.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
 ; CHECK: ParseFunc:
-; CHECK: r[[ARG0:[0-9]+]] = memuh(r[[ARG1:[0-9]+]] + #[[OFFSET:[0-9]+]])
+; CHECK: r[[ARG0:[0-9]+]] = memuh(r[[ARG1:[0-9]+]]+#[[OFFSET:[0-9]+]])
 ; CHECK: memw(r[[ARG1]]+#[[OFFSET]]) = r[[ARG0]]
 
 @.str.3 = external unnamed_addr constant [8 x i8], align 1
diff --git a/test/CodeGen/Hexagon/early-if-merge-loop.ll b/test/CodeGen/Hexagon/early-if-merge-loop.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f45058f029dd095a65bab69864139bf85d1dde3f
--- /dev/null
+++ b/test/CodeGen/Hexagon/early-if-merge-loop.ll
@@ -0,0 +1,91 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; Make sure that the loop in the end has only one basic block.
+
+; CHECK-LABEL: fred
+; Rely on the comments, make sure the one for the loop header is present.
+; CHECK: %loop
+; CHECK-NOT: %should_merge
+
+target triple = "hexagon"
+
+define i32 @fred(i32 %a0, i64* nocapture readonly %a1) #0 {
+b2:
+  %v3 = bitcast i64* %a1 to i32*
+  %v4 = getelementptr inbounds i32, i32* %v3, i32 1
+  %v5 = zext i32 %a0 to i64
+  br label %loop
+
+loop:                                             ; preds = %should_merge, %b2
+  %v7 = phi i32 [ 0, %b2 ], [ %v49, %should_merge ]
+  %v8 = phi i32 [ 0, %b2 ], [ %v42, %should_merge ]
+  %v9 = phi i32* [ %v4, %b2 ], [ %v53, %should_merge ]
+  %v10 = phi i32 [ 0, %b2 ], [ %v30, %should_merge ]
+  %v11 = phi i32* [ %v3, %b2 ], [ %v51, %should_merge ]
+  %v12 = phi i32 [ 0, %b2 ], [ %v23, %should_merge ]
+  %v13 = phi i32 [ 2, %b2 ], [ %v54, %should_merge ]
+  %v14 = load i32, i32* %v11, align 4, !tbaa !0
+  %v15 = load i32, i32* %v9, align 4, !tbaa !0
+  %v16 = icmp ult i32 %v13, 30
+  %v17 = zext i32 %v12 to i64
+  %v18 = shl nuw i64 %v17, 32
+  %v19 = zext i32 %v14 to i64
+  %v20 = or i64 %v18, %v19
+  %v21 = tail call i64 @llvm.hexagon.A2.addp(i64 %v20, i64 %v5)
+  %v22 = lshr i64 %v21, 32
+  %v23 = trunc i64 %v22 to i32
+  %v24 = zext i32 %v10 to i64
+  %v25 = shl nuw i64 %v24, 32
+  %v26 = zext i32 %v15 to i64
+  %v27 = or i64 %v25, %v26
+  %v28 = tail call i64 @llvm.hexagon.A2.addp(i64 %v27, i64 %v5)
+  %v29 = lshr i64 %v28, 32
+  %v30 = trunc i64 %v29 to i32
+  %v31 = getelementptr inbounds i32, i32* %v3, i32 %v13
+  %v32 = load i32, i32* %v31, align 4, !tbaa !0
+  %v33 = or i32 %v13, 1
+  %v34 = getelementptr inbounds i32, i32* %v3, i32 %v33
+  %v35 = load i32, i32* %v34, align 4, !tbaa !0
+  %v36 = zext i32 %v8 to i64
+  %v37 = shl nuw i64 %v36, 32
+  %v38 = zext i32 %v32 to i64
+  %v39 = or i64 %v37, %v38
+  %v40 = tail call i64 @llvm.hexagon.A2.subp(i64 %v39, i64 %v5)
+  %v41 = lshr i64 %v40, 32
+  %v42 = trunc i64 %v41 to i32
+  %v43 = zext i32 %v7 to i64
+  %v44 = shl nuw i64 %v43, 32
+  %v45 = zext i32 %v35 to i64
+  %v46 = or i64 %v44, %v45
+  %v47 = tail call i64 @llvm.hexagon.A2.subp(i64 %v46, i64 %v5)
+  %v48 = lshr i64 %v47, 32
+  %v49 = trunc i64 %v48 to i32
+  br i1 %v16, label %should_merge, label %exit
+
+should_merge:                                     ; preds = %loop
+  %v50 = add nuw nsw i32 %v13, 2
+  %v51 = getelementptr inbounds i32, i32* %v3, i32 %v50
+  %v52 = add nuw nsw i32 %v13, 3
+  %v53 = getelementptr inbounds i32, i32* %v3, i32 %v52
+  %v54 = add nuw nsw i32 %v13, 4
+  br label %loop
+
+exit:                                             ; preds = %loop
+  %v57 = tail call i64 @llvm.hexagon.A2.combinew(i32 %v42, i32 %v23)
+  %v58 = tail call i64 @llvm.hexagon.A2.combinew(i32 %v49, i32 %v30)
+  %v59 = tail call i64 @llvm.hexagon.A2.addp(i64 %v57, i64 %v58)
+  %v60 = lshr i64 %v59, 32
+  %v61 = trunc i64 %v60 to i32
+  ret i32 %v61
+}
+
+declare i64 @llvm.hexagon.A2.addp(i64, i64) #1
+declare i64 @llvm.hexagon.A2.subp(i64, i64) #1
+declare i64 @llvm.hexagon.A2.combinew(i32, i32) #1
+
+attributes #0 = { nounwind readonly "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double,-long-calls" }
+attributes #1 = { nounwind readnone }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"long", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Hexagon/early-if-phi-i1.ll b/test/CodeGen/Hexagon/early-if-phi-i1.ll
index 1649d51269ee2b3eb3341f8e119c8875489d6d50..f4af62d6b10eadb24df37ba088645595ef208017 100644
--- a/test/CodeGen/Hexagon/early-if-phi-i1.ll
+++ b/test/CodeGen/Hexagon/early-if-phi-i1.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s
+; RUN: llc -march=hexagon < %s
 ; REQUIRES: asserts
 ; Check that the early if-conversion does not predicate block1 (where the
 ; join block has a phi node of type i1).
diff --git a/test/CodeGen/Hexagon/early-if-vecpred.ll b/test/CodeGen/Hexagon/early-if-vecpred.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ca119e1d1dec304430d485505105b38f8f8c2d51
--- /dev/null
+++ b/test/CodeGen/Hexagon/early-if-vecpred.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; REQUIRES: asserts
+
+; Hexagon early if-conversion used to crash on this testcase due to not
+; recognizing vector predicate registers.
+
+target triple = "hexagon"
+
+; Check that the early if-conversion has not happened.
+
+; CHECK-LABEL: fred
+; CHECK: q{{[0-3]}} = not
+; CHECK: LBB
+; CHECK: if (q{{[0-3]}}) vmem
+define void @fred(i32 %a0) #0 {
+b1:
+  %v2 = tail call <1024 x i1> @llvm.hexagon.V6.pred.scalar2.128B(i32 %a0) #2
+  br i1 undef, label %b3, label %b5
+
+b3:                                               ; preds = %b1
+  %v4 = tail call <1024 x i1> @llvm.hexagon.V6.pred.not.128B(<1024 x i1> %v2) #2
+  br label %b5
+
+b5:                                               ; preds = %b3, %b1
+  %v6 = phi <1024 x i1> [ %v4, %b3 ], [ %v2, %b1 ]
+  %v7 = bitcast <1024 x i1> %v6 to <32 x i32>
+  tail call void asm sideeffect "if ($0) vmem($1) = $2;", "q,r,v,~{memory}"(<32 x i32> %v7, <32 x i32>* undef, <32 x i32> undef) #2
+  ret void
+}
+
+declare <1024 x i1> @llvm.hexagon.V6.pred.scalar2.128B(i32) #1
+declare <1024 x i1> @llvm.hexagon.V6.pred.not.128B(<1024 x i1>) #1
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-double" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
+
diff --git a/test/CodeGen/Hexagon/eh_return.ll b/test/CodeGen/Hexagon/eh_return.ll
index 67649a07afc7ec1e3207010ca09d62230df88e3b..1596ade24c820169c2b8fbc4a5a4564cc80b6b27 100644
--- a/test/CodeGen/Hexagon/eh_return.ll
+++ b/test/CodeGen/Hexagon/eh_return.ll
@@ -4,7 +4,7 @@
 ; CHECK:         deallocframe
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  {
-; CHECK-NEXT:    r29 = add(r29, r28)
+; CHECK-NEXT:    r29 = add(r29,r28)
 ; CHECK-NEXT:  }
 ; CHECK-NEXT:  {
 ; CHECK-NEXT:    jumpr r31
diff --git a/test/CodeGen/Hexagon/eliminate-pred-spill.ll b/test/CodeGen/Hexagon/eliminate-pred-spill.ll
index 6fb0a3e2658d642b31b6bb9e16082db47a97bc48..b3a4a2f4252494bfe0439bd127e1966e1fb75b84 100644
--- a/test/CodeGen/Hexagon/eliminate-pred-spill.ll
+++ b/test/CodeGen/Hexagon/eliminate-pred-spill.ll
@@ -1,5 +1,4 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv60 -enable-hexagon-hvx-double \
-; RUN:     -hexagon-bit=0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -hexagon-bit=0 < %s | FileCheck %s
 
 ; This spill should be eliminated.
 ; CHECK-NOT: vmem(r29+#6)
@@ -140,5 +139,5 @@ declare <64 x i32> @llvm.hexagon.V6.vmpyuh.acc.128B(<64 x i32>, <32 x i32>, i32)
 
 declare <32 x i32> @llvm.hexagon.V6.hi.128B(<64 x i32>) #1
 
-attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-double" }
 attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/Hexagon/expand-condsets-dead-bad.ll b/test/CodeGen/Hexagon/expand-condsets-dead-bad.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ce7f5e0ce12fe92761f3e2196d65020f085bc786
--- /dev/null
+++ b/test/CodeGen/Hexagon/expand-condsets-dead-bad.ll
@@ -0,0 +1,54 @@
+; RUN: llc -march=hexagon -verify-machineinstrs < %s | FileCheck %s
+; REQUIRES: asserts
+
+; Check for some output other than crashing.
+; CHECK: bitsset
+
+target triple = "hexagon"
+
+; Function Attrs: nounwind
+define void @fred() local_unnamed_addr #0 {
+b0:
+  %v1 = load i32, i32* undef, align 4
+  %v2 = and i32 %v1, 603979776
+  %v3 = trunc i32 %v2 to i30
+  switch i30 %v3, label %b23 [
+    i30 -536870912, label %b4
+    i30 -469762048, label %b5
+  ]
+
+b4:                                               ; preds = %b0
+  unreachable
+
+b5:                                               ; preds = %b0
+  %v6 = load i32, i32* undef, align 4
+  br i1 undef, label %b7, label %b8
+
+b7:                                               ; preds = %b5
+  br label %b9
+
+b8:                                               ; preds = %b5
+  br label %b9
+
+b9:                                               ; preds = %b8, %b7
+  %v10 = load i32, i32* undef, align 4
+  %v11 = load i32, i32* undef, align 4
+  %v12 = mul nsw i32 %v11, %v10
+  %v13 = ashr i32 %v12, 13
+  %v14 = mul nsw i32 %v13, %v13
+  %v15 = zext i32 %v14 to i64
+  %v16 = mul nsw i32 %v6, %v6
+  %v17 = zext i32 %v16 to i64
+  %v18 = lshr i64 %v17, 5
+  %v19 = select i1 undef, i64 %v18, i64 %v17
+  %v20 = mul nuw nsw i64 %v19, %v15
+  %v21 = trunc i64 %v20 to i32
+  %v22 = and i32 %v21, 2147483647
+  store i32 %v22, i32* undef, align 4
+  unreachable
+
+b23:                                              ; preds = %b0
+  ret void
+}
+
+attributes #0 = { nounwind "target-cpu"="hexagonv5" "target-features"="-hvx,-hvx-double,-long-calls" }
diff --git a/test/CodeGen/Hexagon/expand-condsets-dead-pred.ll b/test/CodeGen/Hexagon/expand-condsets-dead-pred.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ecec83625e1c2f80ca728bbf1fe17ce32a31be8a
--- /dev/null
+++ b/test/CodeGen/Hexagon/expand-condsets-dead-pred.ll
@@ -0,0 +1,45 @@
+; RUN: llc -march=hexagon -verify-machineinstrs < %s | FileCheck %s
+; REQUIRES: asserts
+
+; Check for some output (as opposed to a crash).
+; CHECK: loop0
+
+target triple = "hexagon"
+
+@x = external local_unnamed_addr global [80 x i32], align 8
+
+; Function Attrs: nounwind
+define void @fred() local_unnamed_addr #0 {
+b0:
+  br label %b1
+
+b1:                                               ; preds = %b20, %b0
+  br label %b2
+
+b2:                                               ; preds = %b2, %b1
+  %v3 = phi i32 [ 0, %b1 ], [ %v17, %b2 ]
+  %v4 = phi i32 [ 0, %b1 ], [ %v16, %b2 ]
+  %v5 = phi i32 [ undef, %b1 ], [ %v18, %b2 ]
+  %v6 = load i32, i32* undef, align 8
+  %v7 = icmp sgt i32 %v6, undef
+  %v8 = select i1 %v7, i32 %v3, i32 %v4
+  %v9 = select i1 undef, i32 0, i32 %v8
+  %v10 = select i1 undef, i32 undef, i32 %v9
+  %v11 = select i1 undef, i32 0, i32 %v10
+  %v12 = icmp sgt i32 undef, 0
+  %v13 = select i1 %v12, i32 undef, i32 %v11
+  %v14 = select i1 false, i32 undef, i32 %v13
+  %v15 = select i1 false, i32 undef, i32 %v14
+  %v16 = select i1 false, i32 undef, i32 %v15
+  %v17 = add nsw i32 %v3, 8
+  %v18 = add i32 %v5, -8
+  %v19 = icmp eq i32 %v18, 0
+  br i1 %v19, label %b20, label %b2
+
+b20:                                              ; preds = %b2
+  %v21 = getelementptr inbounds [80 x i32], [80 x i32]* @x, i32 0, i32 %v16
+  store i32 -2000, i32* %v21, align 4
+  br label %b1
+}
+
+attributes #0 = { nounwind "target-cpu"="hexagonv55" "target-features"="-hvx,-hvx-double,-long-calls" }
diff --git a/test/CodeGen/Hexagon/expand-condsets-rm-reg.mir b/test/CodeGen/Hexagon/expand-condsets-rm-reg.mir
index 983035e228cc4aa61d35842f5f0653003470d080..f3d105f75da27f01d53759ab080036739ba9664d 100644
--- a/test/CodeGen/Hexagon/expand-condsets-rm-reg.mir
+++ b/test/CodeGen/Hexagon/expand-condsets-rm-reg.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=hexagon -run-pass expand-condsets -o - 2>&1 %s -verify-machineinstrs -debug-only=expand-condsets | FileCheck %s
+# RUN: llc -march=hexagon -run-pass expand-condsets -o - %s -verify-machineinstrs -debug-only=expand-condsets 2>&1 | FileCheck %s
 # REQUIRES: asserts
 
 # Check that coalesced registers are removed from live intervals.
diff --git a/test/CodeGen/Hexagon/extload-combine.ll b/test/CodeGen/Hexagon/extload-combine.ll
index c492343d7915d39046ef8714fbe65a6897cfbf9b..c7a386a664ba6aa8477d64b21408cd5c4cc6001f 100644
--- a/test/CodeGen/Hexagon/extload-combine.ll
+++ b/test/CodeGen/Hexagon/extload-combine.ll
@@ -15,8 +15,8 @@
 
 ; Function Attrs: nounwind
 define i64 @short_test1() #0 {
-; CHECK: [[VAR:r[0-9]+]]{{ *}}={{ *}}memuh(##
-; CHECK: combine(#0, [[VAR]])
+; CHECK: [[VAR:r[0-9]+]] = memuh(##
+; CHECK: combine(#0,[[VAR]])
 entry:
   store i16 0, i16* @a, align 2
   %0 = load i16, i16* @b, align 2
@@ -26,7 +26,7 @@ entry:
 
 ; Function Attrs: nounwind
 define i64 @short_test2() #0 {
-; CHECK: [[VAR1:r[0-9]+]]{{ *}}={{ *}}memh(##
+; CHECK: [[VAR1:r[0-9]+]] = memh(##
 ; CHECK: sxtw([[VAR1]])
 entry:
   store i16 0, i16* @a, align 2
@@ -37,8 +37,8 @@ entry:
 
 ; Function Attrs: nounwind
 define i64 @char_test1() #0 {
-; CHECK: [[VAR2:r[0-9]+]]{{ *}}={{ *}}memub(##
-; CHECK: combine(#0, [[VAR2]])
+; CHECK: [[VAR2:r[0-9]+]] = memub(##
+; CHECK: combine(#0,[[VAR2]])
 entry:
   store i8 0, i8* @char_a, align 1
   %0 = load i8, i8* @char_b, align 1
@@ -48,7 +48,7 @@ entry:
 
 ; Function Attrs: nounwind
 define i64 @char_test2() #0 {
-; CHECK: [[VAR3:r[0-9]+]]{{ *}}={{ *}}memb(##
+; CHECK: [[VAR3:r[0-9]+]] = memb(##
 ; CHECK: sxtw([[VAR3]])
 entry:
   store i8 0, i8* @char_a, align 1
@@ -59,8 +59,8 @@ entry:
 
 ; Function Attrs: nounwind
 define i64 @int_test1() #0 {
-; CHECK: [[VAR4:r[0-9]+]]{{ *}}={{ *}}memw(##
-; CHECK: combine(#0, [[VAR4]])
+; CHECK: [[VAR4:r[0-9]+]] = memw(##
+; CHECK: combine(#0,[[VAR4]])
 entry:
   store i32 0, i32* @int_a, align 4
   %0 = load i32, i32* @int_b, align 4
@@ -70,7 +70,7 @@ entry:
 
 ; Function Attrs: nounwind
 define i64 @int_test2() #0 {
-; CHECK: [[VAR5:r[0-9]+]]{{ *}}={{ *}}memw(##
+; CHECK: [[VAR5:r[0-9]+]] = memw(##
 ; CHECK: sxtw([[VAR5]])
 entry:
   store i32 0, i32* @int_a, align 4
diff --git a/test/CodeGen/Hexagon/extract-basic.ll b/test/CodeGen/Hexagon/extract-basic.ll
index c75125cedd3504053716e949f3f7259527a39f8f..ad118dea0ab65bbe3ae870c60bb9370daa6bc25c 100644
--- a/test/CodeGen/Hexagon/extract-basic.ll
+++ b/test/CodeGen/Hexagon/extract-basic.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
 
-; CHECK-DAG: extractu(r{{[0-9]*}}, #3, #4)
-; CHECK-DAG: extractu(r{{[0-9]*}}, #8, #7)
-; CHECK-DAG: extractu(r{{[0-9]*}}, #8, #16)
+; CHECK-DAG: extractu(r{{[0-9]*}},#3,#4)
+; CHECK-DAG: extractu(r{{[0-9]*}},#8,#7)
+; CHECK-DAG: extractu(r{{[0-9]*}},#8,#16)
 
 ; C source:
 ; typedef struct {
diff --git a/test/CodeGen/Hexagon/fadd.ll b/test/CodeGen/Hexagon/fadd.ll
index 6cf0fbbccf73e128ad5d7c15772016bef180b6df..0418c1724f5bdc043bc7d98778e613be9c239c55 100644
--- a/test/CodeGen/Hexagon/fadd.ll
+++ b/test/CodeGen/Hexagon/fadd.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=hexagon -mcpu=hexagonv5  < %s | FileCheck %s
 ; Check that we generate sp floating point add in V5.
 
-; CHECK: r{{[0-9]+}} = sfadd(r{{[0-9]+}}, r{{[0-9]+}})
+; CHECK: r{{[0-9]+}} = sfadd(r{{[0-9]+}},r{{[0-9]+}})
 
 define i32 @main() nounwind {
 entry:
diff --git a/test/CodeGen/Hexagon/find-loop-instr.ll b/test/CodeGen/Hexagon/find-loop-instr.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1234baf17f528e6dc1a99bd06d56d41f58cbbc7e
--- /dev/null
+++ b/test/CodeGen/Hexagon/find-loop-instr.ll
@@ -0,0 +1,79 @@
+; RUN: llc -march=hexagon < %s
+; REQUIRES: asserts
+
+; This code causes multiple endloop instructions to be generated for the
+; same loop. The findLoopInstr search started from one endloop would encounter
+; the other endloop, and return null in response. This resulted in a crash.
+;
+; Check that with the fix we are able to compile this code successfully.
+
+target triple = "hexagon"
+
+; Function Attrs: norecurse
+define void @fred() local_unnamed_addr #0 align 2 {
+b0:
+  br label %b7
+
+b1:                                               ; preds = %b9
+  br i1 undef, label %b4, label %b2
+
+b2:                                               ; preds = %b1
+  %v3 = sub i32 undef, undef
+  br label %b4
+
+b4:                                               ; preds = %b2, %b1
+  %v5 = phi i32 [ undef, %b1 ], [ %v3, %b2 ]
+  br i1 undef, label %b14, label %b6
+
+b6:                                               ; preds = %b4
+  br label %b10
+
+b7:                                               ; preds = %b0
+  br i1 undef, label %b9, label %b8
+
+b8:                                               ; preds = %b7
+  unreachable
+
+b9:                                               ; preds = %b7
+  br label %b1
+
+b10:                                              ; preds = %b21, %b6
+  %v11 = phi i32 [ %v22, %b21 ], [ %v5, %b6 ]
+  br i1 undef, label %b21, label %b12
+
+b12:                                              ; preds = %b10
+  br label %b15
+
+b13:                                              ; preds = %b21
+  br label %b14
+
+b14:                                              ; preds = %b13, %b4
+  ret void
+
+b15:                                              ; preds = %b12
+  br i1 undef, label %b16, label %b17
+
+b16:                                              ; preds = %b15
+  store i32 0, i32* undef, align 4
+  br label %b21
+
+b17:                                              ; preds = %b15
+  br label %b18
+
+b18:                                              ; preds = %b17
+  br i1 undef, label %b19, label %b20
+
+b19:                                              ; preds = %b18
+  br label %b21
+
+b20:                                              ; preds = %b18
+  store i32 0, i32* undef, align 4
+  br label %b21
+
+b21:                                              ; preds = %b20, %b19, %b16, %b10
+  %v22 = add i32 %v11, -8
+  %v23 = icmp eq i32 %v22, 0
+  br i1 %v23, label %b13, label %b10
+}
+
+attributes #0 = { norecurse "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double,-long-calls" }
diff --git a/test/CodeGen/Hexagon/float-amode.ll b/test/CodeGen/Hexagon/float-amode.ll
index 9804f48349f8d3130eeed1537083c7c2740894fa..d770582ecab997ba1db072e683965883c5546b07 100644
--- a/test/CodeGen/Hexagon/float-amode.ll
+++ b/test/CodeGen/Hexagon/float-amode.ll
@@ -12,9 +12,9 @@
 @a = common global float 0.000000e+00, align 4
 
 ; CHECK-LABEL: test1
-; CHECK: [[REG11:(r[0-9]+)]]{{ *}}={{ *}}memw(r{{[0-9]+}} + r{{[0-9]+}}<<#2)
+; CHECK: [[REG11:(r[0-9]+)]] = memw(r{{[0-9]+}}+r{{[0-9]+}}<<#2)
 ; CHECK: [[REG12:(r[0-9]+)]] += sfmpy({{.*}}[[REG11]]
-; CHECK: memw(r{{[0-9]+}} + r{{[0-9]+}}<<#2) = [[REG12]].new
+; CHECK: memw(r{{[0-9]+}}+r{{[0-9]+}}<<#2) = [[REG12]].new
 
 ; Function Attrs: norecurse nounwind
 define void @test1(%struct.matrix_params* nocapture readonly %params, i32 %col1) {
@@ -35,7 +35,7 @@ entry:
 }
 
 ; CHECK-LABEL: test2
-; CHECK: [[REG21:(r[0-9]+)]]{{ *}}={{ *}}memw(##globB+92)
+; CHECK: [[REG21:(r[0-9]+)]] = memw(##globB+92)
 ; CHECK: [[REG22:(r[0-9]+)]] = sfadd({{.*}}[[REG21]]
 ; CHECK: memw(##globA+84) = [[REG22]]
 
@@ -54,9 +54,9 @@ entry:
 }
 
 ; CHECK-LABEL: test3
-; CHECK: [[REG31:(r[0-9]+)]]{{ *}}={{ *}}memw(#b)
+; CHECK: [[REG31:(r[0-9]+)]] = memw(gp+#b)
 ; CHECK: [[REG32:(r[0-9]+)]] = sfadd({{.*}}[[REG31]]
-; CHECK: memw(#a) = [[REG32]]
+; CHECK: memw(gp+#a) = [[REG32]]
 
 ; Function Attrs: norecurse nounwind
 define void @test3(%struct.matrix_params* nocapture readonly %params, i32 %col1) {
@@ -73,9 +73,9 @@ entry:
 }
 
 ; CHECK-LABEL: test4
-; CHECK: [[REG41:(r[0-9]+)]]{{ *}}={{ *}}memw(r0<<#2 + ##globB+52)
+; CHECK: [[REG41:(r[0-9]+)]] = memw(r0<<#2+##globB+52)
 ; CHECK: [[REG42:(r[0-9]+)]] = sfadd({{.*}}[[REG41]]
-; CHECK: memw(r0<<#2 + ##globA+60) = [[REG42]]
+; CHECK: memw(r0<<#2+##globA+60) = [[REG42]]
 ; Function Attrs: noinline norecurse nounwind
 define void @test4(i32 %col1) {
 entry:
diff --git a/test/CodeGen/Hexagon/fmul.ll b/test/CodeGen/Hexagon/fmul.ll
index 4f55d0bec47173772c5e07c8749aeaa65454b3cd..552f98ec7a53afa4acbf8ab33c933bdaec0732d0 100644
--- a/test/CodeGen/Hexagon/fmul.ll
+++ b/test/CodeGen/Hexagon/fmul.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=hexagon -mcpu=hexagonv5  < %s | FileCheck %s
 ; Check that we generate single precision floating point multiply in V5.
 
-; CHECK: r{{[0-9]+}} = sfmpy(r{{[0-9]+}}, r{{[0-9]+}})
+; CHECK: r{{[0-9]+}} = sfmpy(r{{[0-9]+}},r{{[0-9]+}})
 
 
 define i32 @main() nounwind {
diff --git a/test/CodeGen/Hexagon/fsel.ll b/test/CodeGen/Hexagon/fsel.ll
index 247249da50b14ba926eb38333229ded59799e0cb..a2f0b4a47f105911b425948602a6488c956e8178 100644
--- a/test/CodeGen/Hexagon/fsel.ll
+++ b/test/CodeGen/Hexagon/fsel.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=hexagon -O0 < %s | FileCheck %s
 
 ; CHECK-LABEL: danny:
-; CHECK: mux(p0, r1, ##1065353216)
+; CHECK: mux(p0,r1,##1065353216)
 
 define float @danny(i32 %x, float %f) #0 {
   %t = icmp sgt i32 %x, 0
@@ -10,7 +10,7 @@ define float @danny(i32 %x, float %f) #0 {
 }
 
 ; CHECK-LABEL: sammy:
-; CHECK: mux(p0, ##1069547520, r1)
+; CHECK: mux(p0,##1069547520,r1)
 
 define float @sammy(i32 %x, float %f) #0 {
   %t = icmp sgt i32 %x, 0
diff --git a/test/CodeGen/Hexagon/fsub.ll b/test/CodeGen/Hexagon/fsub.ll
index ca7bdc4d0b38b98e9aa3cc1b8938f651ad051cd0..d7b0e2f65b33cd4ba498f2728d68476528bd5c89 100644
--- a/test/CodeGen/Hexagon/fsub.ll
+++ b/test/CodeGen/Hexagon/fsub.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=hexagon -mcpu=hexagonv5  < %s | FileCheck %s
 ; Check that we generate sp floating point subtract in V5.
 
-; CHECK: r{{[0-9]+}} = sfsub(r{{[0-9]+}}, r{{[0-9]+}})
+; CHECK: r{{[0-9]+}} = sfsub(r{{[0-9]+}},r{{[0-9]+}})
 
 define i32 @main() nounwind {
 entry:
diff --git a/test/CodeGen/Hexagon/fusedandshift.ll b/test/CodeGen/Hexagon/fusedandshift.ll
index 414574aec401cb38b7a2f3aa20bd1ebebc6330ca..9abd366e6916be7d7f3070d3d3b0846938fa42ba 100644
--- a/test/CodeGen/Hexagon/fusedandshift.ll
+++ b/test/CodeGen/Hexagon/fusedandshift.ll
@@ -1,8 +1,8 @@
-; RUN: llc -march=hexagon -hexagon-extract=0 < %s | FileCheck %s
+; RUN: llc -march=hexagon -hexagon-extract=0 -hexbit-extract=0 < %s | FileCheck %s
 ; Check that we generate fused logical and with shift instruction.
 ; Disable "extract" generation, since it may eliminate the and/lsr.
 
-; CHECK: r{{[0-9]+}} = and(#15, lsr(r{{[0-9]+}}, #{{[0-9]+}})
+; CHECK: r{{[0-9]+}} = and(#15,lsr(r{{[0-9]+}},#{{[0-9]+}})
 
 define i32 @main(i16* %a, i16* %b) nounwind {
   entry:
diff --git a/test/CodeGen/Hexagon/gp-rel.ll b/test/CodeGen/Hexagon/gp-rel.ll
index bb7cb182bf1b9acf9b8ce625c6d134cf8ceab144..00f57797b6f1d314734d353ee757f0caa856fdbf 100644
--- a/test/CodeGen/Hexagon/gp-rel.ll
+++ b/test/CodeGen/Hexagon/gp-rel.ll
@@ -7,8 +7,8 @@
 
 define i32 @foo(i32 %p) #0 {
 entry:
-; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memw(#a)
-; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memw(#b)
+; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memw(gp+#a)
+; CHECK: r{{[0-9]+}}{{ *}}={{ *}}memw(gp+#b)
 ; CHECK: if{{ *}}(p{{[0-3]}}) memw(##c){{ *}}={{ *}}r{{[0-9]+}}
   %0 = load i32, i32* @a, align 4
   %1 = load i32, i32* @b, align 4
diff --git a/test/CodeGen/Hexagon/hwloop-cleanup.ll b/test/CodeGen/Hexagon/hwloop-cleanup.ll
index c04966a5a4b250a56072682156d00cfcddccefda..56a6fedf81ef8a2ef9028d9d8993f30d1f2c34b2 100644
--- a/test/CodeGen/Hexagon/hwloop-cleanup.ll
+++ b/test/CodeGen/Hexagon/hwloop-cleanup.ll
@@ -5,7 +5,7 @@
 ; Bug 6685.
 
 ; CHECK: loop0
-; CHECK-NOT: r{{[0-9]+}}{{.}}={{.}}add(r{{[0-9]+}},{{.}}#-1)
+; CHECK-NOT: r{{[0-9]+}} = add(r{{[0-9]+}},#-1)
 ; CHECK-NOT: cmp.eq
 ; CHECK: endloop0
 
@@ -39,7 +39,7 @@ for.end:
 ; This test checks that that initial loop count value is removed.
 ; CHECK-NOT: ={{.}}#40
 ; CHECK: loop0
-; CHECK-NOT: r{{[0-9]+}}{{.}}={{.}}add(r{{[0-9]+}},{{.}}#-1)
+; CHECK-NOT: r{{[0-9]+}} = add(r{{[0-9]+}},#-1)
 ; CHECK-NOT: cmp.eq
 ; CHECK: endloop0
 
@@ -64,7 +64,7 @@ for.end:
 
 ; This test checks that we don't remove the induction variable since it's used.
 ; CHECK: loop0
-; CHECK: r{{[0-9]+}}{{.}}={{.}}add(r{{[0-9]+}},{{.}}#1)
+; CHECK: r{{[0-9]+}} = add(r{{[0-9]+}},#1)
 ; CHECK-NOT: cmp.eq
 ; CHECK: endloop0
 define i32 @test3(i32* nocapture %b) nounwind {
diff --git a/test/CodeGen/Hexagon/hwloop-loop1.ll b/test/CodeGen/Hexagon/hwloop-loop1.ll
index 238d34e7ea1513ab38a9cde44ef72a45ef76c593..427efdc2c1110cc4deee58bc6a242289134a9f19 100644
--- a/test/CodeGen/Hexagon/hwloop-loop1.ll
+++ b/test/CodeGen/Hexagon/hwloop-loop1.ll
@@ -2,8 +2,8 @@
 ;
 ; Generate loop1 instruction for double loop sequence.
 
-; CHECK: loop1(.LBB{{.}}_{{.}}, #100)
-; CHECK: loop0(.LBB{{.}}_{{.}}, #100)
+; CHECK: loop1(.LBB{{.}}_{{.}},#100)
+; CHECK: loop0(.LBB{{.}}_{{.}},#100)
 ; CHECK: endloop0
 ; CHECK: endloop1
 
@@ -12,9 +12,9 @@ entry:
   %array = alloca [100 x i32], align 8
   %doublearray = alloca [100 x [100 x i32]], align 8
   %0 = bitcast [100 x i32]* %array to i8*
-  call void @llvm.lifetime.start(i64 400, i8* %0) #1
+  call void @llvm.lifetime.start.p0i8(i64 400, i8* %0) #1
   %1 = bitcast [100 x [100 x i32]]* %doublearray to i8*
-  call void @llvm.lifetime.start(i64 40000, i8* %1) #1
+  call void @llvm.lifetime.start.p0i8(i64 40000, i8* %1) #1
   %arrayidx1 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* %doublearray, i32 0, i32 10, i32 10
   %arrayidx2.gep = getelementptr [100 x i32], [100 x i32]* %array, i32 0, i32 0
   br label %for.body
@@ -56,11 +56,11 @@ for.inc15:
 
 for.end17:
   %3 = load i32, i32* %arrayidx1, align 8
-  call void @llvm.lifetime.end(i64 40000, i8* %1) #1
-  call void @llvm.lifetime.end(i64 400, i8* %0) #1
+  call void @llvm.lifetime.end.p0i8(i64 40000, i8* %1) #1
+  call void @llvm.lifetime.end.p0i8(i64 400, i8* %0) #1
   ret i32 %3
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
diff --git a/test/CodeGen/Hexagon/hwloop1.ll b/test/CodeGen/Hexagon/hwloop1.ll
index 68af3b34eeeb237bb4b0e858d8041d00dc21de44..7a805d951b959f1b7e9351773466bf17c4eb72c5 100644
--- a/test/CodeGen/Hexagon/hwloop1.ll
+++ b/test/CodeGen/Hexagon/hwloop1.ll
@@ -3,7 +3,7 @@
 
 ; Case 1 : Loop with a constant number of iterations.
 ; CHECK-LABEL: @hwloop1
-; CHECK: loop0(.LBB{{.}}_{{.}}, #10)
+; CHECK: loop0(.LBB{{.}}_{{.}},#10)
 ; CHECK: endloop0
 
 @a = common global [10 x i32] zeroinitializer, align 4
@@ -23,7 +23,7 @@ for.end:
 
 ; Case 2 : Loop with a run-time number of iterations.
 ; CHECK-LABEL: @hwloop2
-; CHECK: loop0(.LBB{{.}}_{{.}}, r{{[0-9]+}})
+; CHECK: loop0(.LBB{{.}}_{{.}},r{{[0-9]+}})
 ; CHECK: endloop0
 
 define i32 @hwloop2(i32 %n, i32* nocapture %b) nounwind {
@@ -54,8 +54,8 @@ for.end:
 
 ; Case 3 : Induction variable increment more than 1.
 ; CHECK-LABEL: @hwloop3
-; CHECK: lsr(r{{[0-9]+}}, #2)
-; CHECK: loop0(.LBB{{.}}_{{.}}, r{{[0-9]+}})
+; CHECK: lsr(r{{[0-9]+}},#2)
+; CHECK: loop0(.LBB{{.}}_{{.}},r{{[0-9]+}})
 ; CHECK: endloop0
 
 define i32 @hwloop3(i32 %n, i32* nocapture %b) nounwind {
@@ -86,7 +86,7 @@ for.end:
 
 ; Case 4 : Loop exit compare uses register instead of immediate value.
 ; CHECK-LABEL: @hwloop4
-; CHECK: loop0(.LBB{{.}}_{{.}}, r{{[0-9]+}})
+; CHECK: loop0(.LBB{{.}}_{{.}},r{{[0-9]+}})
 ; CHECK: endloop0
 
 define i32 @hwloop4(i32 %n, i32* nocapture %b) nounwind {
@@ -114,7 +114,7 @@ for.end:
 
 ; Case 5: After LSR, the initial value is 100 and the iv decrements to 0.
 ; CHECK-LABEL: @hwloop5
-; CHECK: loop0(.LBB{{.}}_{{.}}, #100)
+; CHECK: loop0(.LBB{{.}}_{{.}},#100)
 ; CHECK: endloop0
 
 define void @hwloop5(i32* nocapture %a, i32* nocapture %res) nounwind {
@@ -138,8 +138,8 @@ for.end:
 
 ; Case 6: Large immediate offset
 ; CHECK-LABEL: @hwloop6
-; CHECK-NOT: loop0(.LBB{{.}}_{{.}}, #1024)
-; CHECK: loop0(.LBB{{.}}_{{.}}, r{{[0-9]+}})
+; CHECK-NOT: loop0(.LBB{{.}}_{{.}},#1024)
+; CHECK: loop0(.LBB{{.}}_{{.}},r{{[0-9]+}})
 ; CHECK: endloop0
 
 define void @hwloop6(i32* nocapture %a, i32* nocapture %res) nounwind {
diff --git a/test/CodeGen/Hexagon/hwloop2.ll b/test/CodeGen/Hexagon/hwloop2.ll
index d411d979904ec9d092266c268054db3fa4a87f46..ba3de1f1a2af0b90d2aa02cdb2132206bbcc10e5 100644
--- a/test/CodeGen/Hexagon/hwloop2.ll
+++ b/test/CodeGen/Hexagon/hwloop2.ll
@@ -2,7 +2,7 @@
 
 ; Test for multiple phis with induction variables.
 
-; CHECK: loop0(.LBB{{.}}_{{.}}, r{{[0-9]+}})
+; CHECK: loop0(.LBB{{.}}_{{.}},r{{[0-9]+}})
 ; CHECK: endloop0
 
 define i32 @hwloop4(i32* nocapture %s, i32* nocapture %a, i32 %n) {
diff --git a/test/CodeGen/Hexagon/hwloop4.ll b/test/CodeGen/Hexagon/hwloop4.ll
index d159c45e3fb82695e36e1b6b4b3ffc407ef4780a..b8cea4c7772083d1d7f5f60193628031d867830e 100644
--- a/test/CodeGen/Hexagon/hwloop4.ll
+++ b/test/CodeGen/Hexagon/hwloop4.ll
@@ -2,9 +2,9 @@
 ;
 ; Remove the unnecessary 'add' instruction used for the hardware loop setup.
 
-; CHECK: [[OP0:r[0-9]+]] = add([[OP1:r[0-9]+]], #-[[OP2:[0-9]+]]
-; CHECK-NOT: add([[OP0]], #[[OP2]])
-; CHECK: lsr([[OP1]], #{{[0-9]+}})
+; CHECK: [[OP0:r[0-9]+]] = add([[OP1:r[0-9]+]],#-[[OP2:[0-9]+]]
+; CHECK-NOT: add([[OP0]],#[[OP2]])
+; CHECK: lsr([[OP1]],#{{[0-9]+}})
 ; CHECK: loop0
 
 define void @matrix_mul_matrix(i32 %N, i32* nocapture %C, i16* nocapture readnone %A, i16* nocapture readnone %B) #0 {
diff --git a/test/CodeGen/Hexagon/hwloop5.ll b/test/CodeGen/Hexagon/hwloop5.ll
index 0886b03cc7545bebf1c2bd1bff56d164c9882d0e..f4990dabebb9dcf8c11db019c9694e65109998aa 100644
--- a/test/CodeGen/Hexagon/hwloop5.ll
+++ b/test/CodeGen/Hexagon/hwloop5.ll
@@ -2,9 +2,9 @@
 ;
 ; Generate hardware loop when unknown trip count loop is vectorized.
 
-; CHECK: loop0(.LBB{{[0-9]*}}_{{[0-9]*}}, r{{[0-9]+}})
+; CHECK: loop0(.LBB{{[0-9]*}}_{{[0-9]*}},r{{[0-9]+}})
 ; CHECK: endloop0
-; CHECK: loop0(.LBB{{[0-9]*}}_{{[0-9]*}}, r{{[0-9]+}})
+; CHECK: loop0(.LBB{{[0-9]*}}_{{[0-9]*}},r{{[0-9]+}})
 ; CHECK: endloop0
 
 @A = common global [1000 x i32] zeroinitializer, align 8
diff --git a/test/CodeGen/Hexagon/ifcvt-diamond-bug-2016-08-26.ll b/test/CodeGen/Hexagon/ifcvt-diamond-bug-2016-08-26.ll
index 68a5dc16ecff9778fe75d994343a23a9cd637afd..91b9aaa9cb4ea80444df7fe87617dcd56265aadf 100644
--- a/test/CodeGen/Hexagon/ifcvt-diamond-bug-2016-08-26.ll
+++ b/test/CodeGen/Hexagon/ifcvt-diamond-bug-2016-08-26.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -o - %s | FileCheck %s
+; RUN: llc -march=hexagon -hexagon-eif=0 < %s | FileCheck %s
 target triple = "hexagon"
 
 %struct.0 = type { i16, i16 }
@@ -15,7 +15,7 @@ entry:
   br i1 %cmp199, label %if.then200, label %if.else201
 
 ; CHECK-DAG: [[R4:r[0-9]+]] = #4
-; CHECK: p0 = cmp.eq(r0, #0)
+; CHECK: p0 = cmp.eq(r0,#0)
 ; CHECK: if (!p0.new) [[R3:r[0-9]+]] = #3
 ; CHECK-DAG: if (!p0) memh(##t) = [[R3]]
 ; CHECK-DAG: if (p0) memh(##t) = [[R4]]
diff --git a/test/CodeGen/Hexagon/ifcvt-simple-bprob.ll b/test/CodeGen/Hexagon/ifcvt-simple-bprob.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2d48d30dd7d8976ca7460662cdd4db987c5eabe1
--- /dev/null
+++ b/test/CodeGen/Hexagon/ifcvt-simple-bprob.ll
@@ -0,0 +1,36 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; Check that branch probabilities are set correctly after performing the
+; simple variant of if-conversion. The converted block has a branch that
+; is not analyzable.
+
+target triple = "hexagon"
+
+declare void @foo()
+
+; CHECK-LABEL: danny
+; CHECK: if (p0.new) jump:nt foo
+define void @danny(i32 %x) {
+  %t0 = icmp sgt i32 %x, 0
+  br i1 %t0, label %tail, label %exit, !prof !0
+tail:
+  tail call void @foo();
+  ret void
+exit:
+  ret void
+}
+
+; CHECK-LABEL: sammy
+; CHECK: if (!p0.new) jump:t foo
+define void @sammy(i32 %x) {
+  %t0 = icmp sgt i32 %x, 0
+  br i1 %t0, label %exit, label %tail, !prof !0
+tail:
+  tail call void @foo();
+  ret void
+exit:
+  ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 2000}
+
diff --git a/test/CodeGen/Hexagon/inline-asm-vecpred128.ll b/test/CodeGen/Hexagon/inline-asm-vecpred128.ll
new file mode 100644
index 0000000000000000000000000000000000000000..234f5a0b792600c328ec77fad7d9ed0a0dfc9683
--- /dev/null
+++ b/test/CodeGen/Hexagon/inline-asm-vecpred128.ll
@@ -0,0 +1,15 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+; REQUIRES: asserts
+
+; Make sure we can handle the 'q' constraint in the 128-byte mode.
+
+target triple = "hexagon"
+
+; CHECK-LABEL: fred
+; CHECK: if (q{{[0-3]}}) vmem
+define void @fred() #0 {
+  tail call void asm sideeffect "if ($0) vmem($1) = $2;", "q,r,v,~{memory}"(<32 x i32> undef, <32 x i32>* undef, <32 x i32> undef) #0
+  ret void
+}
+
+attributes #0 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,+hvx-double" }
diff --git a/test/CodeGen/Hexagon/insert-basic.ll b/test/CodeGen/Hexagon/insert-basic.ll
index e941c063d9edeced2f8710378b3771776f2a436b..14ee735abd79c65091c5db52992dd0d2ae544fc4 100644
--- a/test/CodeGen/Hexagon/insert-basic.ll
+++ b/test/CodeGen/Hexagon/insert-basic.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
-; CHECK-DAG: insert(r{{[0-9]*}}, #17, #0)
-; CHECK-DAG: insert(r{{[0-9]*}}, #18, #0)
-; CHECK-DAG: insert(r{{[0-9]*}}, #22, #0)
-; CHECK-DAG: insert(r{{[0-9]*}}, #12, #0)
+; CHECK-DAG: insert(r{{[0-9]*}},#17,#0)
+; CHECK-DAG: insert(r{{[0-9]*}},#18,#0)
+; CHECK-DAG: insert(r{{[0-9]*}},#22,#0)
+; CHECK-DAG: insert(r{{[0-9]*}},#12,#0)
 
 ; C source:
 ; typedef struct {
diff --git a/test/CodeGen/Hexagon/insert4.ll b/test/CodeGen/Hexagon/insert4.ll
index c4d575dd40602f76ca47c89880a32dad6f6fcc62..3bc8e9e57982744056b56945d42d7f906ae48cee 100644
--- a/test/CodeGen/Hexagon/insert4.ll
+++ b/test/CodeGen/Hexagon/insert4.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=hexagon < %s | FileCheck %s
 ;
 ; Check that we no longer generate 4 inserts.
-; CHECK: combine(r{{[0-9]+}}.l, r{{[0-9]+}}.l)
-; CHECK: combine(r{{[0-9]+}}.l, r{{[0-9]+}}.l)
+; CHECK: combine(r{{[0-9]+}}.l,r{{[0-9]+}}.l)
+; CHECK: combine(r{{[0-9]+}}.l,r{{[0-9]+}}.l)
 ; CHECK-NOT: insert
 
 target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
diff --git a/test/CodeGen/Hexagon/intrinsics/alu32_alu.ll b/test/CodeGen/Hexagon/intrinsics/alu32_alu.ll
index fcf80b08181ee233165311582b3d562a6149672c..abdd4cba7c5c719e3f563ae497e374cdd09e8650 100644
--- a/test/CodeGen/Hexagon/intrinsics/alu32_alu.ll
+++ b/test/CodeGen/Hexagon/intrinsics/alu32_alu.ll
@@ -10,21 +10,21 @@ define i32 @A2_addi(i32 %a) {
   %z = call i32 @llvm.hexagon.A2.addi(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = add({{.*}}, #0)
+; CHECK: = add({{.*}},#0)
 
 declare i32 @llvm.hexagon.A2.add(i32, i32)
 define i32 @A2_add(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.add(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = add({{.*}}, {{.*}})
+; CHECK: = add({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A2.addsat(i32, i32)
 define i32 @A2_addsat(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.addsat(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = add({{.*}}, {{.*}}):sat
+; CHECK: = add({{.*}},{{.*}}):sat
 
 ; Logical operations
 declare i32 @llvm.hexagon.A2.and(i32, i32)
@@ -32,35 +32,35 @@ define i32 @A2_and(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.and(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = and({{.*}}, {{.*}})
+; CHECK: = and({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A2.or(i32, i32)
 define i32 @A2_or(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.or(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = or({{.*}}, {{.*}})
+; CHECK: = or({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A2.xor(i32, i32)
 define i32 @A2_xor(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.xor(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = xor({{.*}}, {{.*}})
+; CHECK: = xor({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A4.andn(i32, i32)
 define i32 @A4_andn(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A4.andn(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = and({{.*}}, ~{{.*}})
+; CHECK: = and({{.*}},~{{.*}})
 
 declare i32 @llvm.hexagon.A4.orn(i32, i32)
 define i32 @A4_orn(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A4.orn(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = or({{.*}}, ~{{.*}})
+; CHECK: = or({{.*}},~{{.*}})
 
 ; Subtract
 declare i32 @llvm.hexagon.A2.sub(i32, i32)
@@ -68,14 +68,14 @@ define i32 @A2_sub(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.sub(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = sub({{.*}}, {{.*}})
+; CHECK: = sub({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A2.subsat(i32, i32)
 define i32 @A2_subsat(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.subsat(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = sub({{.*}}, {{.*}}):sat
+; CHECK: = sub({{.*}},{{.*}}):sat
 
 ; Sign extend
 declare i32 @llvm.hexagon.A2.sxtb(i32)
@@ -128,21 +128,21 @@ define i32 @A2_svaddh(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.svaddh(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = vaddh({{.*}}, {{.*}})
+; CHECK: = vaddh({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A2.svaddhs(i32, i32)
 define i32 @A2_svaddhs(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.svaddhs(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = vaddh({{.*}}, {{.*}}):sat
+; CHECK: = vaddh({{.*}},{{.*}}):sat
 
 declare i32 @llvm.hexagon.A2.svadduhs(i32, i32)
 define i32 @A2_svadduhs(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.svadduhs(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = vadduh({{.*}}, {{.*}}):sat
+; CHECK: = vadduh({{.*}},{{.*}}):sat
 
 ; Vector average halfwords
 declare i32 @llvm.hexagon.A2.svavgh(i32, i32)
@@ -150,21 +150,21 @@ define i32 @A2_svavgh(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.svavgh(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = vavgh({{.*}}, {{.*}})
+; CHECK: = vavgh({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A2.svavghs(i32, i32)
 define i32 @A2_svavghs(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.svavghs(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = vavgh({{.*}}, {{.*}}):rnd
+; CHECK: = vavgh({{.*}},{{.*}}):rnd
 
 declare i32 @llvm.hexagon.A2.svnavgh(i32, i32)
 define i32 @A2_svnavgh(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.svnavgh(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = vnavgh({{.*}}, {{.*}})
+; CHECK: = vnavgh({{.*}},{{.*}})
 
 ; Vector subtract halfwords
 declare i32 @llvm.hexagon.A2.svsubh(i32, i32)
@@ -172,21 +172,21 @@ define i32 @A2_svsubh(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.svsubh(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = vsubh({{.*}}, {{.*}})
+; CHECK: = vsubh({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A2.svsubhs(i32, i32)
 define i32 @A2_svsubhs(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.svsubhs(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = vsubh({{.*}}, {{.*}}):sat
+; CHECK: = vsubh({{.*}},{{.*}}):sat
 
 declare i32 @llvm.hexagon.A2.svsubuhs(i32, i32)
 define i32 @A2_svsubuhs(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.svsubuhs(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = vsubuh({{.*}}, {{.*}}):sat
+; CHECK: = vsubuh({{.*}},{{.*}}):sat
 
 ; Zero extend
 declare i32 @llvm.hexagon.A2.zxth(i32)
diff --git a/test/CodeGen/Hexagon/intrinsics/alu32_perm.ll b/test/CodeGen/Hexagon/intrinsics/alu32_perm.ll
index c9fb0afe0781035ad1309c98a985c4a5f13c2b97..554dac4563d100008386a47fdc990b707395701b 100644
--- a/test/CodeGen/Hexagon/intrinsics/alu32_perm.ll
+++ b/test/CodeGen/Hexagon/intrinsics/alu32_perm.ll
@@ -10,56 +10,56 @@ define i64 @A4_combineri(i32 %a) {
   %z = call i64 @llvm.hexagon.A4.combineri(i32 %a, i32 0)
   ret i64 %z
 }
-; CHECK: = combine({{.*}}, #0)
+; CHECK: = combine({{.*}},#0)
 
 declare i64 @llvm.hexagon.A4.combineir(i32, i32)
 define i64 @A4_combineir(i32 %a) {
   %z = call i64 @llvm.hexagon.A4.combineir(i32 0, i32 %a)
   ret i64 %z
 }
-; CHECK: = combine(#0, {{.*}})
+; CHECK: = combine(#0,{{.*}})
 
 declare i64 @llvm.hexagon.A2.combineii(i32, i32)
 define i64 @A2_combineii() {
   %z = call i64 @llvm.hexagon.A2.combineii(i32 0, i32 0)
   ret i64 %z
 }
-; CHECK: = combine(#0, #0)
+; CHECK: = combine(#0,#0)
 
 declare i32 @llvm.hexagon.A2.combine.hh(i32, i32)
 define i32 @A2_combine_hh(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.combine.hh(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = combine({{.*}}, {{.*}})
+; CHECK: = combine({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A2.combine.hl(i32, i32)
 define i32 @A2_combine_hl(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.combine.hl(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = combine({{.*}}, {{.*}})
+; CHECK: = combine({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A2.combine.lh(i32, i32)
 define i32 @A2_combine_lh(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.combine.lh(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = combine({{.*}}, {{.*}})
+; CHECK: = combine({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A2.combine.ll(i32, i32)
 define i32 @A2_combine_ll(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.combine.ll(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = combine({{.*}}, {{.*}})
+; CHECK: = combine({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.combinew(i32, i32)
 define i64 @A2_combinew(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.A2.combinew(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = combine({{.*}}, {{.*}})
+; CHECK: = combine({{.*}},{{.*}})
 
 ; Mux
 declare i32 @llvm.hexagon.C2.muxri(i32, i32, i32)
@@ -67,21 +67,21 @@ define i32 @C2_muxri(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.C2.muxri(i32 %a, i32 0, i32 %b)
   ret i32 %z
 }
-; CHECK: = mux({{.*}}, #0, {{.*}})
+; CHECK: = mux({{.*}},#0,{{.*}})
 
 declare i32 @llvm.hexagon.C2.muxir(i32, i32, i32)
 define i32 @C2_muxir(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.C2.muxir(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: = mux({{.*}}, {{.*}}, #0)
+; CHECK: = mux({{.*}},{{.*}},#0)
 
 declare i32 @llvm.hexagon.C2.mux(i32, i32, i32)
 define i32 @C2_mux(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.C2.mux(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: = mux({{.*}}, {{.*}}, {{.*}})
+; CHECK: = mux({{.*}},{{.*}},{{.*}})
 
 ; Shift word by 16
 declare i32 @llvm.hexagon.A2.aslh(i32)
@@ -104,4 +104,4 @@ define i64 @S2_packhl(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.S2.packhl(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = packhl({{.*}}, {{.*}})
+; CHECK: = packhl({{.*}},{{.*}})
diff --git a/test/CodeGen/Hexagon/intrinsics/byte-store-double.ll b/test/CodeGen/Hexagon/intrinsics/byte-store-double.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2a54bfef0ad7a0cd7bb7903218b4cec19df41f75
--- /dev/null
+++ b/test/CodeGen/Hexagon/intrinsics/byte-store-double.ll
@@ -0,0 +1,41 @@
+; RUN: llc -mattr=+hvx-double -march=hexagon -O2 < %s | FileCheck %s
+
+; CHECK-LABEL: V6_vmaskedstoreq_128B
+; CHECK: if (q{{[0-3]+}}) vmem(r{{[0-9]+}}+#0) = v{{[0-9]+}}
+
+; CHECK-LABEL: V6_vmaskedstorenq_128B
+; CHECK: if (!q{{[0-3]+}}) vmem(r{{[0-9]+}}+#0) = v{{[0-9]+}}
+
+; CHECK-LABEL: V6_vmaskedstorentq_128B
+; CHECK: if (q{{[0-3]+}}) vmem(r{{[0-9]+}}+#0):nt = v{{[0-9]+}}
+
+; CHECK-LABEL: V6_vmaskedstorentnq_128B
+; CHECK: if (!q{{[0-3]+}}) vmem(r{{[0-9]+}}+#0):nt = v{{[0-9]+}}
+
+declare void @llvm.hexagon.V6.vmaskedstoreq.128B(<1024 x i1>, i8*, <32 x i32>)
+define void @V6_vmaskedstoreq_128B( <32 x i32> %a, i8* %b, <32 x i32> %c) {
+  %1 = bitcast <32 x i32> %a to <1024 x i1>
+  call void @llvm.hexagon.V6.vmaskedstoreq.128B(<1024 x i1> %1, i8* %b, <32 x i32> %c)
+  ret void
+}
+
+declare void @llvm.hexagon.V6.vmaskedstorenq.128B(<1024 x i1>, i8*, <32 x i32>)
+define void @V6_vmaskedstorenq_128B( <32 x i32> %a, i8* %b, <32 x i32> %c) {
+  %1 = bitcast <32 x i32> %a to <1024 x i1>
+  call void @llvm.hexagon.V6.vmaskedstorenq.128B(<1024 x i1> %1, i8* %b, <32 x i32> %c)
+  ret void
+}
+
+declare void @llvm.hexagon.V6.vmaskedstorentq.128B(<1024 x i1>, i8*, <32 x i32>)
+define void @V6_vmaskedstorentq_128B( <32 x i32> %a, i8* %b, <32 x i32> %c) {
+  %1 = bitcast <32 x i32> %a to <1024 x i1>
+  call void @llvm.hexagon.V6.vmaskedstorentq.128B(<1024 x i1> %1, i8* %b, <32 x i32> %c)
+  ret void
+}
+
+declare void @llvm.hexagon.V6.vmaskedstorentnq.128B(<1024 x i1>, i8*, <32 x i32>)
+define void @V6_vmaskedstorentnq_128B( <32 x i32> %a, i8* %b, <32 x i32> %c) {
+  %1 = bitcast <32 x i32> %a to <1024 x i1>
+  call void @llvm.hexagon.V6.vmaskedstorentnq.128B(<1024 x i1> %1, i8* %b, <32 x i32> %c)
+  ret void
+}
diff --git a/test/CodeGen/Hexagon/intrinsics/byte-store.ll b/test/CodeGen/Hexagon/intrinsics/byte-store.ll
new file mode 100644
index 0000000000000000000000000000000000000000..208c15fec9804304d7611c8f09d00a91b995d157
--- /dev/null
+++ b/test/CodeGen/Hexagon/intrinsics/byte-store.ll
@@ -0,0 +1,41 @@
+; RUN: llc -mattr=+hvx -march=hexagon -O2 < %s | FileCheck %s
+
+; CHECK-LABEL: V6_vmaskedstoreq
+; CHECK: if (q{{[0-3]+}}) vmem(r{{[0-9]+}}+#0) = v{{[0-9]+}}
+
+; CHECK-LABEL: V6_vmaskedstorenq
+; CHECK: if (!q{{[0-3]+}}) vmem(r{{[0-9]+}}+#0) = v{{[0-9]+}}
+
+; CHECK-LABEL: V6_vmaskedstorentq
+; CHECK: if (q{{[0-3]+}}) vmem(r{{[0-9]+}}+#0):nt = v{{[0-9]+}}
+
+; CHECK-LABEL: V6_vmaskedstorentnq
+; CHECK: if (!q{{[0-3]+}}) vmem(r{{[0-9]+}}+#0):nt = v{{[0-9]+}}
+
+declare void @llvm.hexagon.V6.vmaskedstoreq(<512 x i1>, i8*, <16 x i32>)
+define void @V6_vmaskedstoreq( <16 x i32> %a, i8* %b, <16 x i32> %c) {
+  %1 = bitcast <16 x i32> %a to <512 x i1>
+  call void @llvm.hexagon.V6.vmaskedstoreq(<512 x i1> %1, i8* %b, <16 x i32> %c)
+  ret void
+}
+
+declare void @llvm.hexagon.V6.vmaskedstorenq(<512 x i1>, i8*, <16 x i32>)
+define void @V6_vmaskedstorenq( <16 x i32> %a, i8* %b, <16 x i32> %c) {
+  %1 = bitcast <16 x i32> %a to <512 x i1>
+  call void @llvm.hexagon.V6.vmaskedstorenq(<512 x i1> %1, i8* %b, <16 x i32> %c)
+  ret void
+}
+
+declare void @llvm.hexagon.V6.vmaskedstorentq(<512 x i1>, i8*, <16 x i32>)
+define void @V6_vmaskedstorentq( <16 x i32> %a, i8* %b, <16 x i32> %c) {
+  %1 = bitcast <16 x i32> %a to <512 x i1>
+  call void @llvm.hexagon.V6.vmaskedstorentq(<512 x i1> %1, i8* %b, <16 x i32> %c)
+  ret void
+}
+
+declare void @llvm.hexagon.V6.vmaskedstorentnq(<512 x i1>, i8*, <16 x i32>)
+define void @V6_vmaskedstorentnq( <16 x i32> %a, i8* %b, <16 x i32> %c) {
+  %1 = bitcast <16 x i32> %a to <512 x i1>
+  call void @llvm.hexagon.V6.vmaskedstorentnq(<512 x i1> %1, i8* %b, <16 x i32> %c)
+  ret void
+}
diff --git a/test/CodeGen/Hexagon/intrinsics/cr.ll b/test/CodeGen/Hexagon/intrinsics/cr.ll
index f308ef8e566437d6276cda95f09343873dd1316e..4c0fcb3707c1e38152616e04de530cf651c8ee68 100644
--- a/test/CodeGen/Hexagon/intrinsics/cr.ll
+++ b/test/CodeGen/Hexagon/intrinsics/cr.ll
@@ -10,14 +10,14 @@ define i32 @C4_fastcorner9(i32 %a, i32 %b) {
   %z = call i32@llvm.hexagon.C4.fastcorner9(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = fastcorner9({{.*}}, {{.*}})
+; CHECK: = fastcorner9({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.C4.fastcorner9.not(i32, i32)
 define i32 @C4_fastcorner9_not(i32 %a, i32 %b) {
   %z = call i32@llvm.hexagon.C4.fastcorner9.not(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = !fastcorner9({{.*}}, {{.*}})
+; CHECK: = !fastcorner9({{.*}},{{.*}})
 
 ; Logical reductions on predicates
 declare i32 @llvm.hexagon.C2.any8(i32)
@@ -41,70 +41,70 @@ define i32 @C2_and(i32 %a, i32 %b) {
   %z = call i32@llvm.hexagon.C2.and(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = and({{.*}}, {{.*}})
+; CHECK: = and({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.C4.and.and(i32, i32, i32)
 define i32 @C4_and_and(i32 %a, i32 %b, i32 %c) {
   %z = call i32@llvm.hexagon.C4.and.and(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: = and({{.*}}, and({{.*}}, {{.*}}))
+; CHECK: = and({{.*}},and({{.*}},{{.*}}))
 
 declare i32 @llvm.hexagon.C2.or(i32, i32)
 define i32 @C2_or(i32 %a, i32 %b) {
   %z = call i32@llvm.hexagon.C2.or(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = or({{.*}}, {{.*}})
+; CHECK: = or({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.C4.and.or(i32, i32, i32)
 define i32 @C4_and_or(i32 %a, i32 %b, i32 %c) {
   %z = call i32@llvm.hexagon.C4.and.or(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: = and({{.*}}, or({{.*}}, {{.*}}))
+; CHECK: = and({{.*}},or({{.*}},{{.*}}))
 
 declare i32 @llvm.hexagon.C2.xor(i32, i32)
 define i32 @C2_xor(i32 %a, i32 %b) {
   %z = call i32@llvm.hexagon.C2.xor(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = xor({{.*}}, {{.*}})
+; CHECK: = xor({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.C4.or.and(i32, i32, i32)
 define i32 @C4_or_and(i32 %a, i32 %b, i32 %c) {
   %z = call i32@llvm.hexagon.C4.or.and(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: = or({{.*}}, and({{.*}}, {{.*}}))
+; CHECK: = or({{.*}},and({{.*}},{{.*}}))
 
 declare i32 @llvm.hexagon.C2.andn(i32, i32)
 define i32 @C2_andn(i32 %a, i32 %b) {
   %z = call i32@llvm.hexagon.C2.andn(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = and({{.*}}, !{{.*}})
+; CHECK: = and({{.*}},!{{.*}})
 
 declare i32 @llvm.hexagon.C4.or.or(i32, i32, i32)
 define i32 @C4_or_or(i32 %a, i32 %b, i32 %c) {
   %z = call i32@llvm.hexagon.C4.or.or(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: = or({{.*}}, or({{.*}}, {{.*}}))
+; CHECK: = or({{.*}},or({{.*}},{{.*}}))
 
 declare i32 @llvm.hexagon.C4.and.andn(i32, i32, i32)
 define i32 @C4_and_andn(i32 %a, i32 %b, i32 %c) {
   %z = call i32@llvm.hexagon.C4.and.andn(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: = and({{.*}}, and({{.*}}, !{{.*}}))
+; CHECK: = and({{.*}},and({{.*}},!{{.*}}))
 
 declare i32 @llvm.hexagon.C4.and.orn(i32, i32, i32)
 define i32 @C4_and_orn(i32 %a, i32 %b, i32 %c) {
   %z = call i32@llvm.hexagon.C4.and.orn(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: = and({{.*}}, or({{.*}}, !{{.*}}))
+; CHECK: = and({{.*}},or({{.*}},!{{.*}}))
 
 declare i32 @llvm.hexagon.C2.not(i32)
 define i32 @C2_not(i32 %a) {
@@ -118,18 +118,18 @@ define i32 @C4_or_andn(i32 %a, i32 %b, i32 %c) {
   %z = call i32@llvm.hexagon.C4.or.andn(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: = or({{.*}}, and({{.*}}, !{{.*}}))
+; CHECK: = or({{.*}},and({{.*}},!{{.*}}))
 
 declare i32 @llvm.hexagon.C2.orn(i32, i32)
 define i32 @C2_orn(i32 %a, i32 %b) {
   %z = call i32@llvm.hexagon.C2.orn(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = or({{.*}}, !{{.*}})
+; CHECK: = or({{.*}},!{{.*}})
 
 declare i32 @llvm.hexagon.C4.or.orn(i32, i32, i32)
 define i32 @C4_or_orn(i32 %a, i32 %b, i32 %c) {
   %z = call i32@llvm.hexagon.C4.or.orn(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: = or({{.*}}, or({{.*}}, !{{.*}}))
+; CHECK: = or({{.*}},or({{.*}},!{{.*}}))
diff --git a/test/CodeGen/Hexagon/intrinsics/system_user.ll b/test/CodeGen/Hexagon/intrinsics/system_user.ll
index dad4effb0a14294c6bfa89e2332f99e10fbfc8a2..ac4c53e221d07a44dfd832689677624542c54ff0 100644
--- a/test/CodeGen/Hexagon/intrinsics/system_user.ll
+++ b/test/CodeGen/Hexagon/intrinsics/system_user.ll
@@ -10,4 +10,4 @@ define void @prefetch(i8* %a) {
   call void @llvm.hexagon.prefetch(i8* %a)
   ret void
 }
-; CHECK: dcfetch({{.*}} + #0)
+; CHECK: dcfetch({{.*}}+#0)
diff --git a/test/CodeGen/Hexagon/intrinsics/xtype_alu.ll b/test/CodeGen/Hexagon/intrinsics/xtype_alu.ll
index c5c23c22bde949932dc412a55946529e89f2621f..4d630c62005b52811a0da770bf1692b33a08a052 100644
--- a/test/CodeGen/Hexagon/intrinsics/xtype_alu.ll
+++ b/test/CodeGen/Hexagon/intrinsics/xtype_alu.ll
@@ -34,42 +34,42 @@ define i32 @S4_addaddi(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S4.addaddi(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: = add({{.*}}, add({{.*}}, #0))
+; CHECK: = add({{.*}},add({{.*}},#0))
 
 declare i32 @llvm.hexagon.S4.subaddi(i32, i32, i32)
 define i32 @S4_subaddi(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S4.subaddi(i32 %a, i32 0, i32 %b)
   ret i32 %z
 }
-; CHECK: = add({{.*}}, sub(#0, {{.*}}))
+; CHECK: = add({{.*}},sub(#0,{{.*}}))
 
 declare i32 @llvm.hexagon.M2.accii(i32, i32, i32)
 define i32 @M2_accii(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.accii(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: += add({{.*}}, #0)
+; CHECK: += add({{.*}},#0)
 
 declare i32 @llvm.hexagon.M2.naccii(i32, i32, i32)
 define i32 @M2_naccii(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.naccii(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: -= add({{.*}}, #0)
+; CHECK: -= add({{.*}},#0)
 
 declare i32 @llvm.hexagon.M2.acci(i32, i32, i32)
 define i32 @M2_acci(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.acci(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += add({{.*}}, {{.*}})
+; CHECK: += add({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.M2.nacci(i32, i32, i32)
 define i32 @M2_nacci(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.nacci(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= add({{.*}}, {{.*}})
+; CHECK: -= add({{.*}},{{.*}})
 
 ; Add doublewords
 declare i64 @llvm.hexagon.A2.addp(i64, i64)
@@ -77,14 +77,14 @@ define i64 @A2_addp(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.addp(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = add({{.*}}, {{.*}})
+; CHECK: = add({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.addpsat(i64, i64)
 define i64 @A2_addpsat(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.addpsat(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = add({{.*}}, {{.*}}):sat
+; CHECK: = add({{.*}},{{.*}}):sat
 
 ; Add halfword
 declare i32 @llvm.hexagon.A2.addh.l16.ll(i32, i32)
@@ -92,84 +92,84 @@ define i32 @A2_addh_l16_ll(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.addh.l16.ll(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = add({{.*}}.l, {{.*}}.l)
+; CHECK: = add({{.*}}.l,{{.*}}.l)
 
 declare i32 @llvm.hexagon.A2.addh.l16.hl(i32, i32)
 define i32 @A2_addh_l16_hl(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.addh.l16.hl(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = add({{.*}}.l, {{.*}}.h)
+; CHECK: = add({{.*}}.l,{{.*}}.h)
 
 declare i32 @llvm.hexagon.A2.addh.l16.sat.ll(i32, i32)
 define i32 @A2_addh_l16_sat.ll(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.addh.l16.sat.ll(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = add({{.*}}.l, {{.*}}.l):sat
+; CHECK: = add({{.*}}.l,{{.*}}.l):sat
 
 declare i32 @llvm.hexagon.A2.addh.l16.sat.hl(i32, i32)
 define i32 @A2_addh_l16_sat.hl(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.addh.l16.sat.hl(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = add({{.*}}.l, {{.*}}.h):sat
+; CHECK: = add({{.*}}.l,{{.*}}.h):sat
 
 declare i32 @llvm.hexagon.A2.addh.h16.ll(i32, i32)
 define i32 @A2_addh_h16_ll(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.addh.h16.ll(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = add({{.*}}.l, {{.*}}.l):<<16
+; CHECK: = add({{.*}}.l,{{.*}}.l):<<16
 
 declare i32 @llvm.hexagon.A2.addh.h16.lh(i32, i32)
 define i32 @A2_addh_h16_lh(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.addh.h16.lh(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = add({{.*}}.l, {{.*}}.h):<<16
+; CHECK: = add({{.*}}.l,{{.*}}.h):<<16
 
 declare i32 @llvm.hexagon.A2.addh.h16.hl(i32, i32)
 define i32 @A2_addh_h16_hl(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.addh.h16.hl(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = add({{.*}}.h, {{.*}}.l):<<16
+; CHECK: = add({{.*}}.h,{{.*}}.l):<<16
 
 declare i32 @llvm.hexagon.A2.addh.h16.hh(i32, i32)
 define i32 @A2_addh_h16_hh(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.addh.h16.hh(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = add({{.*}}.h, {{.*}}.h):<<16
+; CHECK: = add({{.*}}.h,{{.*}}.h):<<16
 
 declare i32 @llvm.hexagon.A2.addh.h16.sat.ll(i32, i32)
 define i32 @A2_addh_h16_sat_ll(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.addh.h16.sat.ll(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = add({{.*}}.l, {{.*}}.l):sat:<<16
+; CHECK: = add({{.*}}.l,{{.*}}.l):sat:<<16
 
 declare i32 @llvm.hexagon.A2.addh.h16.sat.lh(i32, i32)
 define i32 @A2_addh_h16_sat_lh(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.addh.h16.sat.lh(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = add({{.*}}.l, {{.*}}.h):sat:<<16
+; CHECK: = add({{.*}}.l,{{.*}}.h):sat:<<16
 
 declare i32 @llvm.hexagon.A2.addh.h16.sat.hl(i32, i32)
 define i32 @A2_addh_h16_sat_hl(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.addh.h16.sat.hl(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = add({{.*}}.h, {{.*}}.l):sat:<<16
+; CHECK: = add({{.*}}.h,{{.*}}.l):sat:<<16
 
 declare i32 @llvm.hexagon.A2.addh.h16.sat.hh(i32, i32)
 define i32 @A2_addh_h16_sat_hh(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.addh.h16.sat.hh(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = add({{.*}}.h, {{.*}}.h):sat:<<16
+; CHECK: = add({{.*}}.h,{{.*}}.h):sat:<<16
 
 ; Logical doublewords
 declare i64 @llvm.hexagon.A2.notp(i64)
@@ -184,35 +184,35 @@ define i64 @A2_andp(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.andp(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = and({{.*}}, {{.*}})
+; CHECK: = and({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A4.andnp(i64, i64)
 define i64 @A2_andnp(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A4.andnp(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = and({{.*}}, ~{{.*}})
+; CHECK: = and({{.*}},~{{.*}})
 
 declare i64 @llvm.hexagon.A2.orp(i64, i64)
 define i64 @A2_orp(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.orp(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = or({{.*}}, {{.*}})
+; CHECK: = or({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A4.ornp(i64, i64)
 define i64 @A2_ornp(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A4.ornp(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = or({{.*}}, ~{{.*}})
+; CHECK: = or({{.*}},~{{.*}})
 
 declare i64 @llvm.hexagon.A2.xorp(i64, i64)
 define i64 @A2_xorp(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.xorp(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = xor({{.*}}, {{.*}})
+; CHECK: = xor({{.*}},{{.*}})
 
 ; Logical-logical doublewords
 declare i64 @llvm.hexagon.M4.xor.xacc(i64, i64, i64)
@@ -220,7 +220,7 @@ define i64 @M4_xor_xacc(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.M4.xor.xacc(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: ^= xor({{.*}}, {{.*}})
+; CHECK: ^= xor({{.*}},{{.*}})
 
 ; Logical-logical words
 declare i32 @llvm.hexagon.S4.or.andi(i32, i32, i32)
@@ -228,91 +228,91 @@ define i32 @S4_or_andi(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S4.or.andi(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: |= and({{.*}}, #0)
+; CHECK: |= and({{.*}},#0)
 
 declare i32 @llvm.hexagon.S4.or.andix(i32, i32, i32)
 define i32 @S4_or_andix(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S4.or.andix(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: = or({{.*}}, and({{.*}}, #0))
+; CHECK: = or({{.*}},and({{.*}},#0))
 
 declare i32 @llvm.hexagon.M4.or.andn(i32, i32, i32)
 define i32 @M4_or_andn(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M4.or.andn(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: |= and({{.*}}, ~{{.*}})
+; CHECK: |= and({{.*}},~{{.*}})
 
 declare i32 @llvm.hexagon.M4.and.andn(i32, i32, i32)
 define i32 @M4_and_andn(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M4.and.andn(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: &= and({{.*}}, ~{{.*}})
+; CHECK: &= and({{.*}},~{{.*}})
 
 declare i32 @llvm.hexagon.M4.xor.andn(i32, i32, i32)
 define i32 @M4_xor_andn(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M4.xor.andn(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: ^= and({{.*}}, ~{{.*}})
+; CHECK: ^= and({{.*}},~{{.*}})
 
 declare i32 @llvm.hexagon.M4.and.and(i32, i32, i32)
 define i32 @M4_and_and(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M4.and.and(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: &= and({{.*}}, {{.*}})
+; CHECK: &= and({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.M4.and.or(i32, i32, i32)
 define i32 @M4_and_or(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M4.and.or(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: &= or({{.*}}, {{.*}})
+; CHECK: &= or({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.M4.and.xor(i32, i32, i32)
 define i32 @M4_and_xor(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M4.and.xor(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: &= xor({{.*}}, {{.*}})
+; CHECK: &= xor({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.M4.or.and(i32, i32, i32)
 define i32 @M4_or_and(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M4.or.and(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: |= and({{.*}}, {{.*}})
+; CHECK: |= and({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.M4.or.or(i32, i32, i32)
 define i32 @M4_or_or(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M4.or.or(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: |= or({{.*}}, {{.*}})
+; CHECK: |= or({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.M4.or.xor(i32, i32, i32)
 define i32 @M4_or_xor(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M4.or.xor(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: |= xor({{.*}}, {{.*}})
+; CHECK: |= xor({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.M4.xor.and(i32, i32, i32)
 define i32 @M4_xor_and(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M4.xor.and(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: ^= and({{.*}}, {{.*}})
+; CHECK: ^= and({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.M4.xor.or(i32, i32, i32)
 define i32 @M4_xor_or(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M4.xor.or(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: ^= or({{.*}}, {{.*}})
+; CHECK: ^= or({{.*}},{{.*}})
 
 ; Maximum words
 declare i32 @llvm.hexagon.A2.max(i32, i32)
@@ -320,14 +320,14 @@ define i32 @A2_max(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.max(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = max({{.*}}, {{.*}})
+; CHECK: = max({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A2.maxu(i32, i32)
 define i32 @A2_maxu(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.maxu(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = maxu({{.*}}, {{.*}})
+; CHECK: = maxu({{.*}},{{.*}})
 
 ; Maximum doublewords
 declare i64 @llvm.hexagon.A2.maxp(i64, i64)
@@ -335,14 +335,14 @@ define i64 @A2_maxp(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.maxp(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = max({{.*}}, {{.*}})
+; CHECK: = max({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.maxup(i64, i64)
 define i64 @A2_maxup(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.maxup(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = maxu({{.*}}, {{.*}})
+; CHECK: = maxu({{.*}},{{.*}})
 
 ; Minimum words
 declare i32 @llvm.hexagon.A2.min(i32, i32)
@@ -350,14 +350,14 @@ define i32 @A2_min(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.min(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = min({{.*}}, {{.*}})
+; CHECK: = min({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A2.minu(i32, i32)
 define i32 @A2_minu(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.minu(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = minu({{.*}}, {{.*}})
+; CHECK: = minu({{.*}},{{.*}})
 
 ; Minimum doublewords
 declare i64 @llvm.hexagon.A2.minp(i64, i64)
@@ -365,14 +365,14 @@ define i64 @A2_minp(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.minp(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = min({{.*}}, {{.*}})
+; CHECK: = min({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.minup(i64, i64)
 define i64 @A2_minup(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.minup(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = minu({{.*}}, {{.*}})
+; CHECK: = minu({{.*}},{{.*}})
 
 ; Module wrap
 declare i32 @llvm.hexagon.A4.modwrapu(i32, i32)
@@ -380,7 +380,7 @@ define i32 @A4_modwrapu(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A4.modwrapu(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = modwrap({{.*}}, {{.*}})
+; CHECK: = modwrap({{.*}},{{.*}})
 
 ; Negate
 declare i64 @llvm.hexagon.A2.negp(i64)
@@ -410,42 +410,42 @@ define i32 @A4_cround_ri(i32 %a) {
   %z = call i32 @llvm.hexagon.A4.cround.ri(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = cround({{.*}}, #0)
+; CHECK: = cround({{.*}},#0)
 
 declare i32 @llvm.hexagon.A4.round.ri(i32, i32)
 define i32 @A4_round_ri(i32 %a) {
   %z = call i32 @llvm.hexagon.A4.round.ri(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = round({{.*}}, #0)
+; CHECK: = round({{.*}},#0)
 
 declare i32 @llvm.hexagon.A4.round.ri.sat(i32, i32)
 define i32 @A4_round_ri_sat(i32 %a) {
   %z = call i32 @llvm.hexagon.A4.round.ri.sat(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = round({{.*}}, #0):sat
+; CHECK: = round({{.*}},#0):sat
 
 declare i32 @llvm.hexagon.A4.cround.rr(i32, i32)
 define i32 @A4_cround_rr(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A4.cround.rr(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = cround({{.*}}, {{.*}})
+; CHECK: = cround({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A4.round.rr(i32, i32)
 define i32 @A4_round_rr(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A4.round.rr(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = round({{.*}}, {{.*}})
+; CHECK: = round({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A4.round.rr.sat(i32, i32)
 define i32 @A4_round_rr_sat(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A4.round.rr.sat(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = round({{.*}}, {{.*}}):sat
+; CHECK: = round({{.*}},{{.*}}):sat
 
 ; Subtract doublewords
 declare i64 @llvm.hexagon.A2.subp(i64, i64)
@@ -453,7 +453,7 @@ define i64 @A2_subp(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.subp(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = sub({{.*}}, {{.*}})
+; CHECK: = sub({{.*}},{{.*}})
 
 ; Subtract and accumulate
 declare i32 @llvm.hexagon.M2.subacc(i32, i32, i32)
@@ -461,7 +461,7 @@ define i32 @M2_subacc(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.subacc(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += sub({{.*}}, {{.*}})
+; CHECK: += sub({{.*}},{{.*}})
 
 ; Subtract halfwords
 declare i32 @llvm.hexagon.A2.subh.l16.ll(i32, i32)
@@ -469,84 +469,84 @@ define i32 @A2_subh_l16_ll(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.subh.l16.ll(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = sub({{.*}}.l, {{.*}}.l)
+; CHECK: = sub({{.*}}.l,{{.*}}.l)
 
 declare i32 @llvm.hexagon.A2.subh.l16.hl(i32, i32)
 define i32 @A2_subh_l16_hl(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.subh.l16.hl(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = sub({{.*}}.l, {{.*}}.h)
+; CHECK: = sub({{.*}}.l,{{.*}}.h)
 
 declare i32 @llvm.hexagon.A2.subh.l16.sat.ll(i32, i32)
 define i32 @A2_subh_l16_sat.ll(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.subh.l16.sat.ll(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = sub({{.*}}.l, {{.*}}.l):sat
+; CHECK: = sub({{.*}}.l,{{.*}}.l):sat
 
 declare i32 @llvm.hexagon.A2.subh.l16.sat.hl(i32, i32)
 define i32 @A2_subh_l16_sat.hl(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.subh.l16.sat.hl(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = sub({{.*}}.l, {{.*}}.h):sat
+; CHECK: = sub({{.*}}.l,{{.*}}.h):sat
 
 declare i32 @llvm.hexagon.A2.subh.h16.ll(i32, i32)
 define i32 @A2_subh_h16_ll(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.subh.h16.ll(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = sub({{.*}}.l, {{.*}}.l):<<16
+; CHECK: = sub({{.*}}.l,{{.*}}.l):<<16
 
 declare i32 @llvm.hexagon.A2.subh.h16.lh(i32, i32)
 define i32 @A2_subh_h16_lh(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.subh.h16.lh(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = sub({{.*}}.l, {{.*}}.h):<<16
+; CHECK: = sub({{.*}}.l,{{.*}}.h):<<16
 
 declare i32 @llvm.hexagon.A2.subh.h16.hl(i32, i32)
 define i32 @A2_subh_h16_hl(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.subh.h16.hl(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = sub({{.*}}.h, {{.*}}.l):<<16
+; CHECK: = sub({{.*}}.h,{{.*}}.l):<<16
 
 declare i32 @llvm.hexagon.A2.subh.h16.hh(i32, i32)
 define i32 @A2_subh_h16_hh(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.subh.h16.hh(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = sub({{.*}}.h, {{.*}}.h):<<16
+; CHECK: = sub({{.*}}.h,{{.*}}.h):<<16
 
 declare i32 @llvm.hexagon.A2.subh.h16.sat.ll(i32, i32)
 define i32 @A2_subh_h16_sat_ll(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.subh.h16.sat.ll(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = sub({{.*}}.l, {{.*}}.l):sat:<<16
+; CHECK: = sub({{.*}}.l,{{.*}}.l):sat:<<16
 
 declare i32 @llvm.hexagon.A2.subh.h16.sat.lh(i32, i32)
 define i32 @A2_subh_h16_sat_lh(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.subh.h16.sat.lh(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = sub({{.*}}.l, {{.*}}.h):sat:<<16
+; CHECK: = sub({{.*}}.l,{{.*}}.h):sat:<<16
 
 declare i32 @llvm.hexagon.A2.subh.h16.sat.hl(i32, i32)
 define i32 @A2_subh_h16_sat_hl(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.subh.h16.sat.hl(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = sub({{.*}}.h, {{.*}}.l):sat:<<16
+; CHECK: = sub({{.*}}.h,{{.*}}.l):sat:<<16
 
 declare i32 @llvm.hexagon.A2.subh.h16.sat.hh(i32, i32)
 define i32 @A2_subh_h16_sat_hh(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A2.subh.h16.sat.hh(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = sub({{.*}}.h, {{.*}}.h):sat:<<16
+; CHECK: = sub({{.*}}.h,{{.*}}.h):sat:<<16
 
 ; Sign extend word to doubleword
 declare i64 @llvm.hexagon.A2.sxtw(i32)
@@ -592,7 +592,7 @@ define i64 @M2_vabsdiffh(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.vabsdiffh(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vabsdiffh({{.*}}, {{.*}})
+; CHECK: = vabsdiffh({{.*}},{{.*}})
 
 ; Vector absolute difference words
 declare i64 @llvm.hexagon.M2.vabsdiffw(i64, i64)
@@ -600,7 +600,7 @@ define i64 @M2_vabsdiffw(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.vabsdiffw(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vabsdiffw({{.*}}, {{.*}})
+; CHECK: = vabsdiffw({{.*}},{{.*}})
 
 ; Vector add halfwords
 declare i64 @llvm.hexagon.A2.vaddh(i64, i64)
@@ -608,21 +608,21 @@ define i64 @A2_vaddh(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vaddh(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vaddh({{.*}}, {{.*}})
+; CHECK: = vaddh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.vaddhs(i64, i64)
 define i64 @A2_vaddhs(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vaddhs(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vaddh({{.*}}, {{.*}}):sat
+; CHECK: = vaddh({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.A2.vadduhs(i64, i64)
 define i64 @A2_vadduhs(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vadduhs(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vadduh({{.*}}, {{.*}}):sat
+; CHECK: = vadduh({{.*}},{{.*}}):sat
 
 ; Vector add halfwords with saturate and pack to unsigned bytes
 declare i32 @llvm.hexagon.A5.vaddhubs(i64, i64)
@@ -630,7 +630,7 @@ define i32 @A5_vaddhubs(i64 %a, i64 %b) {
   %z = call i32 @llvm.hexagon.A5.vaddhubs(i64 %a, i64 %b)
   ret i32 %z
 }
-; CHECK: = vaddhub({{.*}}, {{.*}}):sat
+; CHECK: = vaddhub({{.*}},{{.*}}):sat
 
 ; Vector reduce add unsigned bytes
 declare i64 @llvm.hexagon.A2.vraddub(i64, i64)
@@ -638,14 +638,14 @@ define i64 @A2_vraddub(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vraddub(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vraddub({{.*}}, {{.*}})
+; CHECK: = vraddub({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.vraddub.acc(i64, i64, i64)
 define i64 @A2_vraddub_acc(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.A2.vraddub.acc(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: += vraddub({{.*}}, {{.*}})
+; CHECK: += vraddub({{.*}},{{.*}})
 
 ; Vector reduce add halfwords
 declare i32 @llvm.hexagon.M2.vradduh(i64, i64)
@@ -653,14 +653,14 @@ define i32 @M2_vradduh(i64 %a, i64 %b) {
   %z = call i32 @llvm.hexagon.M2.vradduh(i64 %a, i64 %b)
   ret i32 %z
 }
-; CHECK: = vradduh({{.*}}, {{.*}})
+; CHECK: = vradduh({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.M2.vraddh(i64, i64)
 define i32 @M2_vraddh(i64 %a, i64 %b) {
   %z = call i32 @llvm.hexagon.M2.vraddh(i64 %a, i64 %b)
   ret i32 %z
 }
-; CHECK: = vraddh({{.*}}, {{.*}})
+; CHECK: = vraddh({{.*}},{{.*}})
 
 ; Vector add bytes
 declare i64 @llvm.hexagon.A2.vaddub(i64, i64)
@@ -668,14 +668,14 @@ define i64 @A2_vaddub(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vaddub(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vaddub({{.*}}, {{.*}})
+; CHECK: = vaddub({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.vaddubs(i64, i64)
 define i64 @A2_vaddubs(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vaddubs(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vaddub({{.*}}, {{.*}}):sat
+; CHECK: = vaddub({{.*}},{{.*}}):sat
 
 ; Vector add words
 declare i64 @llvm.hexagon.A2.vaddw(i64, i64)
@@ -683,14 +683,14 @@ define i64 @A2_vaddw(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vaddw(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vaddw({{.*}}, {{.*}})
+; CHECK: = vaddw({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.vaddws(i64, i64)
 define i64 @A2_vaddws(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vaddws(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vaddw({{.*}}, {{.*}}):sat
+; CHECK: = vaddw({{.*}},{{.*}}):sat
 
 ; Vector average halfwords
 declare i64 @llvm.hexagon.A2.vavgh(i64, i64)
@@ -698,56 +698,56 @@ define i64 @A2_vavgh(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vavgh(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vavgh({{.*}}, {{.*}})
+; CHECK: = vavgh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.vavghr(i64, i64)
 define i64 @A2_vavghr(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vavghr(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vavgh({{.*}}, {{.*}}):rnd
+; CHECK: = vavgh({{.*}},{{.*}}):rnd
 
 declare i64 @llvm.hexagon.A2.vavghcr(i64, i64)
 define i64 @A2_vavghcr(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vavghcr(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vavgh({{.*}}, {{.*}}):crnd
+; CHECK: = vavgh({{.*}},{{.*}}):crnd
 
 declare i64 @llvm.hexagon.A2.vavguh(i64, i64)
 define i64 @A2_vavguh(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vavguh(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vavguh({{.*}}, {{.*}})
+; CHECK: = vavguh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.vavguhr(i64, i64)
 define i64 @A2_vavguhr(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vavguhr(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vavguh({{.*}}, {{.*}}):rnd
+; CHECK: = vavguh({{.*}},{{.*}}):rnd
 
 declare i64 @llvm.hexagon.A2.vnavgh(i64, i64)
 define i64 @A2_vnavgh(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vnavgh(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vnavgh({{.*}}, {{.*}})
+; CHECK: = vnavgh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.vnavghr(i64, i64)
 define i64 @A2_vnavghr(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vnavghr(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vnavgh({{.*}}, {{.*}}):rnd
+; CHECK: = vnavgh({{.*}},{{.*}}):rnd
 
 declare i64 @llvm.hexagon.A2.vnavghcr(i64, i64)
 define i64 @A2_vnavghcr(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vnavghcr(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vnavgh({{.*}}, {{.*}}):crnd
+; CHECK: = vnavgh({{.*}},{{.*}}):crnd
 
 ; Vector average unsigned bytes
 declare i64 @llvm.hexagon.A2.vavgub(i64, i64)
@@ -755,14 +755,14 @@ define i64 @A2_vavgub(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vavgub(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK:  vavgub({{.*}}, {{.*}})
+; CHECK:  vavgub({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.vavgubr(i64, i64)
 define i64 @A2_vavgubr(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vavgubr(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vavgub({{.*}}, {{.*}}):rnd
+; CHECK: = vavgub({{.*}},{{.*}}):rnd
 
 ; Vector average words
 declare i64 @llvm.hexagon.A2.vavgw(i64, i64)
@@ -770,56 +770,56 @@ define i64 @A2_vavgw(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vavgw(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vavgw({{.*}}, {{.*}})
+; CHECK: = vavgw({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.vavgwr(i64, i64)
 define i64 @A2_vavgwr(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vavgwr(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vavgw({{.*}}, {{.*}}):rnd
+; CHECK: = vavgw({{.*}},{{.*}}):rnd
 
 declare i64 @llvm.hexagon.A2.vavgwcr(i64, i64)
 define i64 @A2_vavgwcr(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vavgwcr(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vavgw({{.*}}, {{.*}}):crnd
+; CHECK: = vavgw({{.*}},{{.*}}):crnd
 
 declare i64 @llvm.hexagon.A2.vavguw(i64, i64)
 define i64 @A2_vavguw(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vavguw(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vavguw({{.*}}, {{.*}})
+; CHECK: = vavguw({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.vavguwr(i64, i64)
 define i64 @A2_vavguwr(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vavguwr(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vavguw({{.*}}, {{.*}}):rnd
+; CHECK: = vavguw({{.*}},{{.*}}):rnd
 
 declare i64 @llvm.hexagon.A2.vnavgw(i64, i64)
 define i64 @A2_vnavgw(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vnavgw(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vnavgw({{.*}}, {{.*}})
+; CHECK: = vnavgw({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.vnavgwr(i64, i64)
 define i64 @A2_vnavgwr(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vnavgwr(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vnavgw({{.*}}, {{.*}}):rnd
+; CHECK: = vnavgw({{.*}},{{.*}}):rnd
 
 declare i64 @llvm.hexagon.A2.vnavgwcr(i64, i64)
 define i64 @A2_vnavgwcr(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vnavgwcr(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vnavgw({{.*}}, {{.*}}):crnd
+; CHECK: = vnavgw({{.*}},{{.*}}):crnd
 
 ; Vector conditional negate
 declare i64 @llvm.hexagon.S2.vcnegh(i64, i32)
@@ -827,14 +827,14 @@ define i64 @S2_vcnegh(i64 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.S2.vcnegh(i64 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = vcnegh({{.*}}, {{.*}})
+; CHECK: = vcnegh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.S2.vrcnegh(i64, i64, i32)
 define i64 @S2_vrcnegh(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.S2.vrcnegh(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += vrcnegh({{.*}}, {{.*}})
+; CHECK: += vrcnegh({{.*}},{{.*}})
 
 ; Vector maximum bytes
 declare i64 @llvm.hexagon.A2.vmaxub(i64, i64)
@@ -842,14 +842,14 @@ define i64 @A2_vmaxub(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vmaxub(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmaxub({{.*}}, {{.*}})
+; CHECK: = vmaxub({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.vmaxb(i64, i64)
 define i64 @A2_vmaxb(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vmaxb(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmaxb({{.*}}, {{.*}})
+; CHECK: = vmaxb({{.*}},{{.*}})
 
 ; Vector maximum halfwords
 declare i64 @llvm.hexagon.A2.vmaxh(i64, i64)
@@ -857,14 +857,14 @@ define i64 @A2_vmaxh(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vmaxh(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmaxh({{.*}}, {{.*}})
+; CHECK: = vmaxh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.vmaxuh(i64, i64)
 define i64 @A2_vmaxuh(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vmaxuh(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmaxuh({{.*}}, {{.*}})
+; CHECK: = vmaxuh({{.*}},{{.*}})
 
 ; Vector reduce maximum halfwords
 declare i64 @llvm.hexagon.A4.vrmaxh(i64, i64, i32)
@@ -872,14 +872,14 @@ define i64 @A4_vrmaxh(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.A4.vrmaxh(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: = vrmaxh({{.*}}, {{.*}})
+; CHECK: = vrmaxh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A4.vrmaxuh(i64, i64, i32)
 define i64 @A4_vrmaxuh(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.A4.vrmaxuh(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: = vrmaxuh({{.*}}, {{.*}})
+; CHECK: = vrmaxuh({{.*}},{{.*}})
 
 ; Vector reduce maximum words
 declare i64 @llvm.hexagon.A4.vrmaxw(i64, i64, i32)
@@ -887,14 +887,14 @@ define i64 @A4_vrmaxw(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.A4.vrmaxw(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: = vrmaxw({{.*}}, {{.*}})
+; CHECK: = vrmaxw({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A4.vrmaxuw(i64, i64, i32)
 define i64 @A4_vrmaxuw(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.A4.vrmaxuw(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK:  vrmaxuw({{.*}}, {{.*}})
+; CHECK:  vrmaxuw({{.*}},{{.*}})
 
 ; Vector minimum bytes
 declare i64 @llvm.hexagon.A2.vminub(i64, i64)
@@ -902,14 +902,14 @@ define i64 @A2_vminub(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vminub(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vminub({{.*}}, {{.*}})
+; CHECK: = vminub({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.vminb(i64, i64)
 define i64 @A2_vminb(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vminb(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vminb({{.*}}, {{.*}})
+; CHECK: = vminb({{.*}},{{.*}})
 
 ; Vector minimum halfwords
 declare i64 @llvm.hexagon.A2.vminh(i64, i64)
@@ -917,14 +917,14 @@ define i64 @A2_vminh(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vminh(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vminh({{.*}}, {{.*}})
+; CHECK: = vminh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.vminuh(i64, i64)
 define i64 @A2_vminuh(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vminuh(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vminuh({{.*}}, {{.*}})
+; CHECK: = vminuh({{.*}},{{.*}})
 
 ; Vector reduce minimum halfwords
 declare i64 @llvm.hexagon.A4.vrminh(i64, i64, i32)
@@ -932,14 +932,14 @@ define i64 @A4_vrminh(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.A4.vrminh(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: = vrminh({{.*}}, {{.*}})
+; CHECK: = vrminh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A4.vrminuh(i64, i64, i32)
 define i64 @A4_vrminuh(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.A4.vrminuh(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: = vrminuh({{.*}}, {{.*}})
+; CHECK: = vrminuh({{.*}},{{.*}})
 
 ; Vector reduce minimum words
 declare i64 @llvm.hexagon.A4.vrminw(i64, i64, i32)
@@ -947,14 +947,14 @@ define i64 @A4_vrminw(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.A4.vrminw(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: = vrminw({{.*}}, {{.*}})
+; CHECK: = vrminw({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A4.vrminuw(i64, i64, i32)
 define i64 @A4_vrminuw(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.A4.vrminuw(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: = vrminuw({{.*}}, {{.*}})
+; CHECK: = vrminuw({{.*}},{{.*}})
 
 ; Vector sum of absolute differences unsigned bytes
 declare i64 @llvm.hexagon.A2.vrsadub(i64, i64)
@@ -962,14 +962,14 @@ define i64 @A2_vrsadub(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vrsadub(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vrsadub({{.*}}, {{.*}})
+; CHECK: = vrsadub({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.vrsadub.acc(i64, i64, i64)
 define i64 @A2_vrsadub_acc(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.A2.vrsadub.acc(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: += vrsadub({{.*}}, {{.*}})
+; CHECK: += vrsadub({{.*}},{{.*}})
 
 ; Vector subtract halfwords
 declare i64 @llvm.hexagon.A2.vsubh(i64, i64)
@@ -977,21 +977,21 @@ define i64 @A2_vsubh(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vsubh(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vsubh({{.*}}, {{.*}})
+; CHECK: = vsubh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.vsubhs(i64, i64)
 define i64 @A2_vsubhs(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vsubhs(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vsubh({{.*}}, {{.*}}):sat
+; CHECK: = vsubh({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.A2.vsubuhs(i64, i64)
 define i64 @A2_vsubuhs(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vsubuhs(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vsubuh({{.*}}, {{.*}}):sat
+; CHECK: = vsubuh({{.*}},{{.*}}):sat
 
 ; Vector subtract bytes
 declare i64 @llvm.hexagon.A2.vsubub(i64, i64)
@@ -999,14 +999,14 @@ define i64 @A2_vsubub(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vsubub(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vsubub({{.*}}, {{.*}})
+; CHECK: = vsubub({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.vsububs(i64, i64)
 define i64 @A2_vsububs(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vsububs(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vsubub({{.*}}, {{.*}}):sat
+; CHECK: = vsubub({{.*}},{{.*}}):sat
 
 ; Vector subtract words
 declare i64 @llvm.hexagon.A2.vsubw(i64, i64)
@@ -1014,11 +1014,11 @@ define i64 @A2_vsubw(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vsubw(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vsubw({{.*}}, {{.*}})
+; CHECK: = vsubw({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.A2.vsubws(i64, i64)
 define i64 @A2_vsubws(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.A2.vsubws(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vsubw({{.*}}, {{.*}}):sat
+; CHECK: = vsubw({{.*}},{{.*}}):sat
diff --git a/test/CodeGen/Hexagon/intrinsics/xtype_bit.ll b/test/CodeGen/Hexagon/intrinsics/xtype_bit.ll
index e8f83d01820a01b5e97690504edf79084fbd75f7..ec7613e3ef2a0973a4e523ccda71733a9772080c 100644
--- a/test/CodeGen/Hexagon/intrinsics/xtype_bit.ll
+++ b/test/CodeGen/Hexagon/intrinsics/xtype_bit.ll
@@ -38,14 +38,14 @@ define i32 @S4_clbpaddi(i64 %a) {
   %z = call i32 @llvm.hexagon.S4.clbpaddi(i64 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = add(clb({{.*}}), #0)
+; CHECK: = add(clb({{.*}}),#0)
 
 declare i32 @llvm.hexagon.S4.clbaddi(i32, i32)
 define i32 @S4_clbaddi(i32 %a) {
   %z = call i32 @llvm.hexagon.S4.clbaddi(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = add(clb({{.*}}), #0)
+; CHECK: = add(clb({{.*}}),#0)
 
 declare i32 @llvm.hexagon.S2.cl0(i32)
 define i32 @S2_cl0(i32 %a) {
@@ -111,56 +111,56 @@ define i64 @S2_extractup(i64 %a) {
   %z = call i64 @llvm.hexagon.S2.extractup(i64 %a, i32 0, i32 0)
   ret i64 %z
 }
-; CHECK: = extractu({{.*}}, #0, #0)
+; CHECK: = extractu({{.*}},#0,#0)
 
 declare i64 @llvm.hexagon.S4.extractp(i64, i32, i32)
 define i64 @S2_extractp(i64 %a) {
   %z = call i64 @llvm.hexagon.S4.extractp(i64 %a, i32 0, i32 0)
   ret i64 %z
 }
-; CHECK: = extract({{.*}}, #0, #0)
+; CHECK: = extract({{.*}},#0,#0)
 
 declare i32 @llvm.hexagon.S2.extractu(i32, i32, i32)
 define i32 @S2_extractu(i32 %a) {
   %z = call i32 @llvm.hexagon.S2.extractu(i32 %a, i32 0, i32 0)
   ret i32 %z
 }
-; CHECK: = extractu({{.*}}, #0, #0)
+; CHECK: = extractu({{.*}},#0,#0)
 
 declare i32 @llvm.hexagon.S4.extract(i32, i32, i32)
 define i32 @S2_extract(i32 %a) {
   %z = call i32 @llvm.hexagon.S4.extract(i32 %a, i32 0, i32 0)
   ret i32 %z
 }
-; CHECK: = extract({{.*}}, #0, #0)
+; CHECK: = extract({{.*}},#0,#0)
 
 declare i64 @llvm.hexagon.S2.extractup.rp(i64, i64)
 define i64 @S2_extractup_rp(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.extractup.rp(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = extractu({{.*}}, {{.*}})
+; CHECK: = extractu({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.S4.extractp.rp(i64, i64)
 define i64 @S4_extractp_rp(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S4.extractp.rp(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = extract({{.*}}, {{.*}})
+; CHECK: = extract({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.extractu.rp(i32, i64)
 define i32 @S2_extractu_rp(i32 %a, i64 %b) {
   %z = call i32 @llvm.hexagon.S2.extractu.rp(i32 %a, i64 %b)
   ret i32 %z
 }
-; CHECK: = extractu({{.*}}, {{.*}})
+; CHECK: = extractu({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S4.extract.rp(i32, i64)
 define i32 @S4_extract_rp(i32 %a, i64 %b) {
   %z = call i32 @llvm.hexagon.S4.extract.rp(i32 %a, i64 %b)
   ret i32 %z
 }
-; CHECK: = extract({{.*}}, {{.*}})
+; CHECK: = extract({{.*}},{{.*}})
 
 ; Insert bitfield
 declare i64 @llvm.hexagon.S2.insertp(i64, i64, i32, i32)
@@ -168,28 +168,28 @@ define i64 @S2_insertp(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.insertp(i64 %a, i64 %b, i32 0, i32 0)
   ret i64 %z
 }
-; CHECK: = insert({{.*}}, #0, #0)
+; CHECK: = insert({{.*}},#0,#0)
 
 declare i32 @llvm.hexagon.S2.insert(i32, i32, i32, i32)
 define i32 @S2_insert(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.insert(i32 %a, i32 %b, i32 0, i32 0)
   ret i32 %z
 }
-; CHECK: = insert({{.*}}, #0, #0)
+; CHECK: = insert({{.*}},#0,#0)
 
 declare i32 @llvm.hexagon.S2.insert.rp(i32, i32, i64)
 define i32 @S2_insert_rp(i32 %a, i32 %b, i64 %c) {
   %z = call i32 @llvm.hexagon.S2.insert.rp(i32 %a, i32 %b, i64 %c)
   ret i32 %z
 }
-; CHECK: = insert({{.*}}, {{.*}})
+; CHECK: = insert({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.S2.insertp.rp(i64, i64, i64)
 define i64 @S2_insertp_rp(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.S2.insertp.rp(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: = insert({{.*}}, r5:4)
+; CHECK: = insert({{.*}},r5:4)
 
 ; Interleave/deinterleave
 declare i64 @llvm.hexagon.S2.deinterleave(i64)
@@ -212,7 +212,7 @@ define i64 @S2_lfsp(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.lfsp(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = lfs({{.*}}, {{.*}})
+; CHECK: = lfs({{.*}},{{.*}})
 
 ; Masked parity
 declare i32 @llvm.hexagon.S2.parityp(i64, i64)
@@ -220,14 +220,14 @@ define i32 @S2_parityp(i64 %a, i64 %b) {
   %z = call i32 @llvm.hexagon.S2.parityp(i64 %a, i64 %b)
   ret i32 %z
 }
-; CHECK: = parity({{.*}}, {{.*}})
+; CHECK: = parity({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S4.parity(i32, i32)
 define i32 @S4_parity(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S4.parity(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = parity({{.*}}, {{.*}})
+; CHECK: = parity({{.*}},{{.*}})
 
 ; Bit reverse
 declare i64 @llvm.hexagon.S2.brevp(i64)
@@ -250,42 +250,42 @@ define i32 @S2_setbit_i(i32 %a) {
   %z = call i32 @llvm.hexagon.S2.setbit.i(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = setbit({{.*}}, #0)
+; CHECK: = setbit({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.clrbit.i(i32, i32)
 define i32 @S2_clrbit_i(i32 %a) {
   %z = call i32 @llvm.hexagon.S2.clrbit.i(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = clrbit({{.*}}, #0)
+; CHECK: = clrbit({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.togglebit.i(i32, i32)
 define i32 @S2_togglebit_i(i32 %a) {
   %z = call i32 @llvm.hexagon.S2.togglebit.i(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = togglebit({{.*}}, #0)
+; CHECK: = togglebit({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.setbit.r(i32, i32)
 define i32 @S2_setbit_r(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.setbit.r(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = setbit({{.*}}, {{.*}})
+; CHECK: = setbit({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.clrbit.r(i32, i32)
 define i32 @S2_clrbit_r(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.clrbit.r(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = clrbit({{.*}}, {{.*}})
+; CHECK: = clrbit({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.togglebit.r(i32, i32)
 define i32 @S2_togglebit_r(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.togglebit.r(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = togglebit({{.*}}, {{.*}})
+; CHECK: = togglebit({{.*}},{{.*}})
 
 ; Split bitfield
 declare i64 @llvm.hexagon.A4.bitspliti(i32, i32)
@@ -293,14 +293,14 @@ define i64 @A4_bitspliti(i32 %a) {
   %z = call i64 @llvm.hexagon.A4.bitspliti(i32 %a, i32 0)
   ret i64 %z
 }
-; CHECK: = bitsplit({{.*}}, #0)
+; CHECK: = bitsplit({{.*}},#0)
 
 declare i64 @llvm.hexagon.A4.bitsplit(i32, i32)
 define i64 @A4_bitsplit(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.A4.bitsplit(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = bitsplit({{.*}}, {{.*}})
+; CHECK: = bitsplit({{.*}},{{.*}})
 
 ; Table index
 declare i32 @llvm.hexagon.S2.tableidxb.goodsyntax(i32, i32, i32, i32)
@@ -308,25 +308,25 @@ define i32 @S2_tableidxb_goodsyntax(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.tableidxb.goodsyntax(i32 %a, i32 %b, i32 0, i32 0)
   ret i32 %z
 }
-; CHECK: = tableidxb({{.*}}, #0, #0)
+; CHECK: = tableidxb({{.*}},#0,#0)
 
 declare i32 @llvm.hexagon.S2.tableidxh.goodsyntax(i32, i32, i32, i32)
 define i32 @S2_tableidxh_goodsyntax(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.tableidxh.goodsyntax(i32 %a, i32 %b, i32 0, i32 0)
   ret i32 %z
 }
-; CHECK: = tableidxh({{.*}}, #0, #-1)
+; CHECK: = tableidxh({{.*}},#0,#-1)
 
 declare i32 @llvm.hexagon.S2.tableidxw.goodsyntax(i32, i32, i32, i32)
 define i32 @S2_tableidxw_goodsyntax(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.tableidxw.goodsyntax(i32 %a, i32 %b, i32 0, i32 0)
   ret i32 %z
 }
-; CHECK: = tableidxw({{.*}}, #0, #-2)
+; CHECK: = tableidxw({{.*}},#0,#-2)
 
 declare i32 @llvm.hexagon.S2.tableidxd.goodsyntax(i32, i32, i32, i32)
 define i32 @S2_tableidxd_goodsyntax(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.tableidxd.goodsyntax(i32 %a, i32 %b, i32 0, i32 0)
   ret i32 %z
 }
-; CHECK: = tableidxd({{.*}}, #0, #-3)
+; CHECK: = tableidxd({{.*}},#0,#-3)
diff --git a/test/CodeGen/Hexagon/intrinsics/xtype_complex.ll b/test/CodeGen/Hexagon/intrinsics/xtype_complex.ll
index 0087883573ec2d10e8d998dbef156e07f1b4263a..254b928aa982159f4205575c2c6d3c53efcdf3d9 100644
--- a/test/CodeGen/Hexagon/intrinsics/xtype_complex.ll
+++ b/test/CodeGen/Hexagon/intrinsics/xtype_complex.ll
@@ -10,28 +10,28 @@ define i64 @S4_vxaddsubh(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S4.vxaddsubh(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vxaddsubh({{.*}}, {{.*}}):sat
+; CHECK: = vxaddsubh({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.S4.vxsubaddh(i64, i64)
 define i64 @S4_vxsubaddh(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S4.vxsubaddh(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vxsubaddh({{.*}}, {{.*}}):sat
+; CHECK: = vxsubaddh({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.S4.vxaddsubhr(i64, i64)
 define i64 @S4_vxaddsubhr(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S4.vxaddsubhr(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vxaddsubh({{.*}}, {{.*}}):rnd:>>1:sat
+; CHECK: = vxaddsubh({{.*}},{{.*}}):rnd:>>1:sat
 
 declare i64 @llvm.hexagon.S4.vxsubaddhr(i64, i64)
 define i64 @S4_vxsubaddhr(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S4.vxsubaddhr(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vxsubaddh({{.*}}, {{.*}}):rnd:>>1:sat
+; CHECK: = vxsubaddh({{.*}},{{.*}}):rnd:>>1:sat
 
 ; Complex add/sub words
 declare i64 @llvm.hexagon.S4.vxaddsubw(i64, i64)
@@ -39,14 +39,14 @@ define i64 @S4_vxaddsubw(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S4.vxaddsubw(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vxaddsubw({{.*}}, {{.*}}):sat
+; CHECK: = vxaddsubw({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.S4.vxsubaddw(i64, i64)
 define i64 @S4_vxsubaddw(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S4.vxsubaddw(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vxsubaddw({{.*}}, {{.*}}):sat
+; CHECK: = vxsubaddw({{.*}},{{.*}}):sat
 
 ; Complex multiply
 declare i64 @llvm.hexagon.M2.cmpys.s0(i32, i32)
@@ -54,84 +54,84 @@ define i64 @M2_cmpys_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.cmpys.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = cmpy({{.*}}, {{.*}}):sat
+; CHECK: = cmpy({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.M2.cmpys.s1(i32, i32)
 define i64 @M2_cmpys_s1(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.cmpys.s1(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = cmpy({{.*}}, {{.*}}):<<1:sat
+; CHECK: = cmpy({{.*}},{{.*}}):<<1:sat
 
 declare i64 @llvm.hexagon.M2.cmpysc.s0(i32, i32)
 define i64 @M2_cmpysc_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.cmpysc.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = cmpy({{.*}}, {{.*}}*):sat
+; CHECK: = cmpy({{.*}},{{.*}}*):sat
 
 declare i64 @llvm.hexagon.M2.cmpysc.s1(i32, i32)
 define i64 @M2_cmpysc_s1(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.cmpysc.s1(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = cmpy({{.*}}, {{.*}}*):<<1:sat
+; CHECK: = cmpy({{.*}},{{.*}}*):<<1:sat
 
 declare i64 @llvm.hexagon.M2.cmacs.s0(i64, i32, i32)
 define i64 @M2_cmacs_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.cmacs.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += cmpy({{.*}}, {{.*}}):sat
+; CHECK: += cmpy({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.M2.cmacs.s1(i64, i32, i32)
 define i64 @M2_cmacs_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.cmacs.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += cmpy({{.*}}, {{.*}}):<<1:sat
+; CHECK: += cmpy({{.*}},{{.*}}):<<1:sat
 
 declare i64 @llvm.hexagon.M2.cnacs.s0(i64, i32, i32)
 define i64 @M2_cnacs_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.cnacs.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= cmpy({{.*}}, {{.*}}):sat
+; CHECK: -= cmpy({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.M2.cnacs.s1(i64, i32, i32)
 define i64 @M2_cnacs_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.cnacs.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= cmpy({{.*}}, {{.*}}):<<1:sat
+; CHECK: -= cmpy({{.*}},{{.*}}):<<1:sat
 
 declare i64 @llvm.hexagon.M2.cmacsc.s0(i64, i32, i32)
 define i64 @M2_cmacsc_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.cmacsc.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += cmpy({{.*}}, {{.*}}*):sat
+; CHECK: += cmpy({{.*}},{{.*}}*):sat
 
 declare i64 @llvm.hexagon.M2.cmacsc.s1(i64, i32, i32)
 define i64 @M2_cmacsc_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.cmacsc.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += cmpy({{.*}}, {{.*}}*):<<1:sat
+; CHECK: += cmpy({{.*}},{{.*}}*):<<1:sat
 
 declare i64 @llvm.hexagon.M2.cnacsc.s0(i64, i32, i32)
 define i64 @M2_cnacsc_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.cnacsc.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= cmpy({{.*}}, {{.*}}*):sat
+; CHECK: -= cmpy({{.*}},{{.*}}*):sat
 
 declare i64 @llvm.hexagon.M2.cnacsc.s1(i64, i32, i32)
 define i64 @M2_cnacsc_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.cnacsc.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= cmpy({{.*}}, {{.*}}*):<<1:sat
+; CHECK: -= cmpy({{.*}},{{.*}}*):<<1:sat
 
 ; Complex multiply real or imaginary
 declare i64 @llvm.hexagon.M2.cmpyi.s0(i32, i32)
@@ -139,28 +139,28 @@ define i64 @M2_cmpyi_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.cmpyi.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = cmpyi({{.*}}, {{.*}})
+; CHECK: = cmpyi({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M2.cmpyr.s0(i32, i32)
 define i64 @M2_cmpyr_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.cmpyr.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = cmpyr({{.*}}, {{.*}})
+; CHECK: = cmpyr({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M2.cmaci.s0(i64, i32, i32)
 define i64 @M2_cmaci_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.cmaci.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += cmpyi({{.*}}, {{.*}})
+; CHECK: += cmpyi({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M2.cmacr.s0(i64, i32, i32)
 define i64 @M2_cmacr_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.cmacr.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += cmpyr({{.*}}, {{.*}})
+; CHECK: += cmpyr({{.*}},{{.*}})
 
 ; Complex multiply with round and pack
 declare i32 @llvm.hexagon.M2.cmpyrs.s0(i32, i32)
@@ -168,28 +168,28 @@ define i32 @M2_cmpyrs_s0(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.cmpyrs.s0(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = cmpy({{.*}}, {{.*}}):rnd:sat
+; CHECK: = cmpy({{.*}},{{.*}}):rnd:sat
 
 declare i32 @llvm.hexagon.M2.cmpyrs.s1(i32, i32)
 define i32 @M2_cmpyrs_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.cmpyrs.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = cmpy({{.*}}, {{.*}}):<<1:rnd:sat
+; CHECK: = cmpy({{.*}},{{.*}}):<<1:rnd:sat
 
 declare i32 @llvm.hexagon.M2.cmpyrsc.s0(i32, i32)
 define i32 @M2_cmpyrsc_s0(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.cmpyrsc.s0(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = cmpy({{.*}}, {{.*}}*):rnd:sat
+; CHECK: = cmpy({{.*}},{{.*}}*):rnd:sat
 
 declare i32 @llvm.hexagon.M2.cmpyrsc.s1(i32, i32)
 define i32 @M2_cmpyrsc_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.cmpyrsc.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = cmpy({{.*}}, {{.*}}*):<<1:rnd:sat
+; CHECK: = cmpy({{.*}},{{.*}}*):<<1:rnd:sat
 
 ; Complex multiply 32x16
 declare i32 @llvm.hexagon.M4.cmpyi.wh(i64, i32)
@@ -197,28 +197,28 @@ define i32 @M4_cmpyi_wh(i64 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M4.cmpyi.wh(i64 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = cmpyiwh({{.*}}, {{.*}}):<<1:rnd:sat
+; CHECK: = cmpyiwh({{.*}},{{.*}}):<<1:rnd:sat
 
 declare i32 @llvm.hexagon.M4.cmpyi.whc(i64, i32)
 define i32 @M4_cmpyi_whc(i64 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M4.cmpyi.whc(i64 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = cmpyiwh({{.*}}, {{.*}}*):<<1:rnd:sat
+; CHECK: = cmpyiwh({{.*}},{{.*}}*):<<1:rnd:sat
 
 declare i32 @llvm.hexagon.M4.cmpyr.wh(i64, i32)
 define i32 @M4_cmpyr_wh(i64 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M4.cmpyr.wh(i64 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = cmpyrwh({{.*}}, {{.*}}):<<1:rnd:sat
+; CHECK: = cmpyrwh({{.*}},{{.*}}):<<1:rnd:sat
 
 declare i32 @llvm.hexagon.M4.cmpyr.whc(i64, i32)
 define i32 @M4_cmpyr_whc(i64 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M4.cmpyr.whc(i64 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = cmpyrwh({{.*}}, {{.*}}*):<<1:rnd:sat
+; CHECK: = cmpyrwh({{.*}},{{.*}}*):<<1:rnd:sat
 
 ; Vector complex multiply real or imaginary
 declare i64 @llvm.hexagon.M2.vcmpy.s0.sat.r(i64, i64)
@@ -226,42 +226,42 @@ define i64 @M2_vcmpy_s0_sat_r(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.vcmpy.s0.sat.r(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vcmpyr({{.*}}, {{.*}}):sat
+; CHECK: = vcmpyr({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.M2.vcmpy.s1.sat.r(i64, i64)
 define i64 @M2_vcmpy_s1_sat_r(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.vcmpy.s1.sat.r(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vcmpyr({{.*}}, {{.*}}):<<1:sat
+; CHECK: = vcmpyr({{.*}},{{.*}}):<<1:sat
 
 declare i64 @llvm.hexagon.M2.vcmpy.s0.sat.i(i64, i64)
 define i64 @M2_vcmpy_s0_sat_i(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.vcmpy.s0.sat.i(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vcmpyi({{.*}}, {{.*}}):sat
+; CHECK: = vcmpyi({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.M2.vcmpy.s1.sat.i(i64, i64)
 define i64 @M2_vcmpy_s1_sat_i(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.vcmpy.s1.sat.i(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vcmpyi({{.*}}, {{.*}}):<<1:sat
+; CHECK: = vcmpyi({{.*}},{{.*}}):<<1:sat
 
 declare i64 @llvm.hexagon.M2.vcmac.s0.sat.r(i64, i64, i64)
 define i64 @M2_vcmac_s0_sat_r(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.M2.vcmac.s0.sat.r(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: += vcmpyr({{.*}}, r5:4):sat
+; CHECK: += vcmpyr({{.*}},r5:4):sat
 
 declare i64 @llvm.hexagon.M2.vcmac.s0.sat.i(i64, i64, i64)
 define i64 @M2_vcmac_s0_sat_i(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.M2.vcmac.s0.sat.i(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: += vcmpyi({{.*}}, r5:4):sat
+; CHECK: += vcmpyi({{.*}},r5:4):sat
 
 ; Vector complex conjugate
 declare i64 @llvm.hexagon.A2.vconj(i64)
@@ -277,7 +277,7 @@ define i64 @S2_vcrotate(i64 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.S2.vcrotate(i64 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = vcrotate({{.*}}, {{.*}})
+; CHECK: = vcrotate({{.*}},{{.*}})
 
 ; Vector reduce complex multiply real or imaginary
 declare i64 @llvm.hexagon.M2.vrcmpyi.s0(i64, i64)
@@ -285,56 +285,56 @@ define i64 @M2_vrcmpyi_s0(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.vrcmpyi.s0(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vrcmpyi({{.*}}, {{.*}})
+; CHECK: = vrcmpyi({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M2.vrcmpyr.s0(i64, i64)
 define i64 @M2_vrcmpyr_s0(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.vrcmpyr.s0(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vrcmpyr({{.*}}, {{.*}})
+; CHECK: = vrcmpyr({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M2.vrcmpyi.s0c(i64, i64)
 define i64 @M2_vrcmpyi_s0c(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.vrcmpyi.s0c(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vrcmpyi({{.*}}, {{.*}}*)
+; CHECK: = vrcmpyi({{.*}},{{.*}}*)
 
 declare i64 @llvm.hexagon.M2.vrcmpyr.s0c(i64, i64)
 define i64 @M2_vrcmpyr_s0c(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.vrcmpyr.s0c(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vrcmpyr({{.*}}, {{.*}}*)
+; CHECK: = vrcmpyr({{.*}},{{.*}}*)
 
 declare i64 @llvm.hexagon.M2.vrcmaci.s0(i64, i64, i64)
 define i64 @M2_vrcmaci_s0(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.M2.vrcmaci.s0(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: += vrcmpyi({{.*}}, r5:4)
+; CHECK: += vrcmpyi({{.*}},r5:4)
 
 declare i64 @llvm.hexagon.M2.vrcmacr.s0(i64, i64, i64)
 define i64 @M2_vrcmacr_s0(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.M2.vrcmacr.s0(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: += vrcmpyr({{.*}}, r5:4)
+; CHECK: += vrcmpyr({{.*}},r5:4)
 
 declare i64 @llvm.hexagon.M2.vrcmaci.s0c(i64, i64, i64)
 define i64 @M2_vrcmaci_s0c(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.M2.vrcmaci.s0c(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: += vrcmpyi({{.*}}, r5:4*)
+; CHECK: += vrcmpyi({{.*}},r5:4*)
 
 declare i64 @llvm.hexagon.M2.vrcmacr.s0c(i64, i64, i64)
 define i64 @M2_vrcmacr_s0c(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.M2.vrcmacr.s0c(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: += vrcmpyr({{.*}}, r5:4*)
+; CHECK: += vrcmpyr({{.*}},r5:4*)
 
 ; Vector reduce complex rotate
 declare i64 @llvm.hexagon.S4.vrcrotate(i64, i32, i32)
@@ -342,11 +342,11 @@ define i64 @S4_vrcrotate(i64 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.S4.vrcrotate(i64 %a, i32 %b, i32 0)
   ret i64 %z
 }
-; CHECK: = vrcrotate({{.*}}, {{.*}}, #0)
+; CHECK: = vrcrotate({{.*}},{{.*}},#0)
 
 declare i64 @llvm.hexagon.S4.vrcrotate.acc(i64, i64, i32, i32)
 define i64 @S4_vrcrotate_acc(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.S4.vrcrotate.acc(i64 %a, i64 %b, i32 %c, i32 0)
   ret i64 %z
 }
-; CHECK: += vrcrotate({{.*}}, {{.*}}, #0)
+; CHECK: += vrcrotate({{.*}},{{.*}},#0)
diff --git a/test/CodeGen/Hexagon/intrinsics/xtype_fp.ll b/test/CodeGen/Hexagon/intrinsics/xtype_fp.ll
index 598d0a83206dd517178e65d4eb62c936652b0070..ee56e90516210700fdaac335c77929d88ebead67 100644
--- a/test/CodeGen/Hexagon/intrinsics/xtype_fp.ll
+++ b/test/CodeGen/Hexagon/intrinsics/xtype_fp.ll
@@ -11,7 +11,7 @@ define float @F2_sfadd(float %a, float %b) {
   %z = call float @llvm.hexagon.F2.sfadd(float %a, float %b)
   ret float %z
 }
-; CHECK: = sfadd({{.*}}, {{.*}})
+; CHECK: = sfadd({{.*}},{{.*}})
 
 ; Classify floating-point value
 declare i32 @llvm.hexagon.F2.sfclass(float, i32)
@@ -19,14 +19,14 @@ define i32 @F2_sfclass(float %a) {
   %z = call i32 @llvm.hexagon.F2.sfclass(float %a, i32 0)
   ret i32 %z
 }
-; CHECK: = sfclass({{.*}}, #0)
+; CHECK: = sfclass({{.*}},#0)
 
 declare i32 @llvm.hexagon.F2.dfclass(double, i32)
 define i32 @F2_dfclass(double %a) {
   %z = call i32 @llvm.hexagon.F2.dfclass(double %a, i32 0)
   ret i32 %z
 }
-; CHECK: = dfclass({{.*}}, #0)
+; CHECK: = dfclass({{.*}},#0)
 
 ; Compare floating-point value
 declare i32 @llvm.hexagon.F2.sfcmpge(float, float)
@@ -34,56 +34,56 @@ define i32 @F2_sfcmpge(float %a, float %b) {
   %z = call i32 @llvm.hexagon.F2.sfcmpge(float %a, float %b)
   ret i32 %z
 }
-; CHECK: = sfcmp.ge({{.*}}, {{.*}})
+; CHECK: = sfcmp.ge({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.F2.sfcmpuo(float, float)
 define i32 @F2_sfcmpuo(float %a, float %b) {
   %z = call i32 @llvm.hexagon.F2.sfcmpuo(float %a, float %b)
   ret i32 %z
 }
-; CHECK: = sfcmp.uo({{.*}}, {{.*}})
+; CHECK: = sfcmp.uo({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.F2.sfcmpeq(float, float)
 define i32 @F2_sfcmpeq(float %a, float %b) {
   %z = call i32 @llvm.hexagon.F2.sfcmpeq(float %a, float %b)
   ret i32 %z
 }
-; CHECK: = sfcmp.eq({{.*}}, {{.*}})
+; CHECK: = sfcmp.eq({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.F2.sfcmpgt(float, float)
 define i32 @F2_sfcmpgt(float %a, float %b) {
   %z = call i32 @llvm.hexagon.F2.sfcmpgt(float %a, float %b)
   ret i32 %z
 }
-; CHECK: = sfcmp.gt({{.*}}, {{.*}})
+; CHECK: = sfcmp.gt({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.F2.dfcmpge(double, double)
 define i32 @F2_dfcmpge(double %a, double %b) {
   %z = call i32 @llvm.hexagon.F2.dfcmpge(double %a, double %b)
   ret i32 %z
 }
-; CHECK: = dfcmp.ge({{.*}}, {{.*}})
+; CHECK: = dfcmp.ge({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.F2.dfcmpuo(double, double)
 define i32 @F2_dfcmpuo(double %a, double %b) {
   %z = call i32 @llvm.hexagon.F2.dfcmpuo(double %a, double %b)
   ret i32 %z
 }
-; CHECK: = dfcmp.uo({{.*}}, {{.*}})
+; CHECK: = dfcmp.uo({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.F2.dfcmpeq(double, double)
 define i32 @F2_dfcmpeq(double %a, double %b) {
   %z = call i32 @llvm.hexagon.F2.dfcmpeq(double %a, double %b)
   ret i32 %z
 }
-; CHECK: = dfcmp.eq({{.*}}, {{.*}})
+; CHECK: = dfcmp.eq({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.F2.dfcmpgt(double, double)
 define i32 @F2_dfcmpgt(double %a, double %b) {
   %z = call i32 @llvm.hexagon.F2.dfcmpgt(double %a, double %b)
   ret i32 %z
 }
-; CHECK: = dfcmp.gt({{.*}}, {{.*}})
+; CHECK: = dfcmp.gt({{.*}},{{.*}})
 
 ; Convert floating-point value to other format
 declare double @llvm.hexagon.F2.conv.sf2df(float)
@@ -283,14 +283,14 @@ define float @F2_sffixupn(float %a, float %b) {
   %z = call float @llvm.hexagon.F2.sffixupn(float %a, float %b)
   ret float %z
 }
-; CHECK: = sffixupn({{.*}}, {{.*}})
+; CHECK: = sffixupn({{.*}},{{.*}})
 
 declare float @llvm.hexagon.F2.sffixupd(float, float)
 define float @F2_sffixupd(float %a, float %b) {
   %z = call float @llvm.hexagon.F2.sffixupd(float %a, float %b)
   ret float %z
 }
-; CHECK: = sffixupd({{.*}}, {{.*}})
+; CHECK: = sffixupd({{.*}},{{.*}})
 
 ; Floating point fused multiply-add
 declare float @llvm.hexagon.F2.sffma(float, float, float)
@@ -298,14 +298,14 @@ define float @F2_sffma(float %a, float %b, float %c) {
   %z = call float @llvm.hexagon.F2.sffma(float %a, float %b, float %c)
   ret float %z
 }
-; CHECK: += sfmpy({{.*}}, {{.*}})
+; CHECK: += sfmpy({{.*}},{{.*}})
 
 declare float @llvm.hexagon.F2.sffms(float, float, float)
 define float @F2_sffms(float %a, float %b, float %c) {
   %z = call float @llvm.hexagon.F2.sffms(float %a, float %b, float %c)
   ret float %z
 }
-; CHECK: -= sfmpy({{.*}}, {{.*}})
+; CHECK: -= sfmpy({{.*}},{{.*}})
 
 ; Floating point fused multiply-add with scaling
 declare float @llvm.hexagon.F2.sffma.sc(float, float, float, i32)
@@ -313,7 +313,7 @@ define float @F2_sffma_sc(float %a, float %b, float %c, i32 %d) {
   %z = call float @llvm.hexagon.F2.sffma.sc(float %a, float %b, float %c, i32 %d)
   ret float %z
 }
-; CHECK: += sfmpy({{.*}}, {{.*}}, {{.*}}):scale
+; CHECK: += sfmpy({{.*}},{{.*}},{{.*}}):scale
 
 ; Floating point fused multiply-add for library routines
 declare float @llvm.hexagon.F2.sffma.lib(float, float, float)
@@ -321,14 +321,14 @@ define float @F2_sffma_lib(float %a, float %b, float %c) {
   %z = call float @llvm.hexagon.F2.sffma.lib(float %a, float %b, float %c)
   ret float %z
 }
-; CHECK: += sfmpy({{.*}}, {{.*}}):lib
+; CHECK: += sfmpy({{.*}},{{.*}}):lib
 
 declare float @llvm.hexagon.F2.sffms.lib(float, float, float)
 define float @F2_sffms_lib(float %a, float %b, float %c) {
   %z = call float @llvm.hexagon.F2.sffms.lib(float %a, float %b, float %c)
   ret float %z
 }
-; CHECK: -= sfmpy({{.*}}, {{.*}}):lib
+; CHECK: -= sfmpy({{.*}},{{.*}}):lib
 
 ; Create floating-point constant
 declare float @llvm.hexagon.F2.sfimm.p(i32)
@@ -365,7 +365,7 @@ define float @F2_sfmax(float %a, float %b) {
   %z = call float @llvm.hexagon.F2.sfmax(float %a, float %b)
   ret float %z
 }
-; CHECK: = sfmax({{.*}}, {{.*}})
+; CHECK: = sfmax({{.*}},{{.*}})
 
 ; Floating point minimum
 declare float @llvm.hexagon.F2.sfmin(float, float)
@@ -373,7 +373,7 @@ define float @F2_sfmin(float %a, float %b) {
   %z = call float @llvm.hexagon.F2.sfmin(float %a, float %b)
   ret float %z
 }
-; CHECK: = sfmin({{.*}}, {{.*}})
+; CHECK: = sfmin({{.*}},{{.*}})
 
 ; Floating point multiply
 declare float @llvm.hexagon.F2.sfmpy(float, float)
@@ -381,7 +381,7 @@ define float @F2_sfmpy(float %a, float %b) {
   %z = call float @llvm.hexagon.F2.sfmpy(float %a, float %b)
   ret float %z
 }
-; CHECK: = sfmpy({{.*}}, {{.*}})
+; CHECK: = sfmpy({{.*}},{{.*}})
 
 ; Floating point subtraction
 declare float @llvm.hexagon.F2.sfsub(float, float)
@@ -389,4 +389,4 @@ define float @F2_sfsub(float %a, float %b) {
   %z = call float @llvm.hexagon.F2.sfsub(float %a, float %b)
   ret float %z
 }
-; CHECK: = sfsub({{.*}}, {{.*}})
+; CHECK: = sfsub({{.*}},{{.*}})
diff --git a/test/CodeGen/Hexagon/intrinsics/xtype_mpy.ll b/test/CodeGen/Hexagon/intrinsics/xtype_mpy.ll
index a1490499fbf6d4c4e977ddd088efc7575306eda6..4da4a8a6393f395d054fb5505ed06fccde2f0ffb 100644
--- a/test/CodeGen/Hexagon/intrinsics/xtype_mpy.ll
+++ b/test/CodeGen/Hexagon/intrinsics/xtype_mpy.ll
@@ -11,35 +11,35 @@ define i32 @M4_mpyrr_addi(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M4.mpyrr.addi(i32 0, i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = add(#0, mpyi({{.*}}, {{.*}}))
+; CHECK: = add(#0,mpyi({{.*}},{{.*}}))
 
 declare i32 @llvm.hexagon.M4.mpyri.addi(i32, i32, i32)
 define i32 @M4_mpyri_addi(i32 %a) {
   %z = call i32 @llvm.hexagon.M4.mpyri.addi(i32 0, i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = add(#0, mpyi({{.*}}, #0))
+; CHECK: = add(#0,mpyi({{.*}},#0))
 
 declare i32 @llvm.hexagon.M4.mpyri.addr.u2(i32, i32, i32)
 define i32 @M4_mpyri_addr_u2(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M4.mpyri.addr.u2(i32 %a, i32 0, i32 %b)
   ret i32 %z
 }
-; CHECK: = add({{.*}}, mpyi(#0, {{.*}}))
+; CHECK: = add({{.*}},mpyi(#0,{{.*}}))
 
 declare i32 @llvm.hexagon.M4.mpyri.addr(i32, i32, i32)
 define i32 @M4_mpyri_addr(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M4.mpyri.addr(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: = add({{.*}}, mpyi({{.*}}, #0))
+; CHECK: = add({{.*}},mpyi({{.*}},#0))
 
 declare i32 @llvm.hexagon.M4.mpyrr.addr(i32, i32, i32)
 define i32 @M4_mpyrr_addr(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M4.mpyrr.addr(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: = add({{.*}}, mpyi({{.*}}, {{.*}}))
+; CHECK: = add({{.*}},mpyi({{.*}},{{.*}}))
 
 ; Vector multiply word by signed half (32x16)
 declare i64 @llvm.hexagon.M2.mmpyl.s0(i64, i64)
@@ -47,56 +47,56 @@ define i64 @M2_mmpyl_s0(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.mmpyl.s0(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmpyweh({{.*}}, {{.*}}):sat
+; CHECK: = vmpyweh({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.M2.mmpyl.s1(i64, i64)
 define i64 @M2_mmpyl_s1(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.mmpyl.s1(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmpyweh({{.*}}, {{.*}}):<<1:sat
+; CHECK: = vmpyweh({{.*}},{{.*}}):<<1:sat
 
 declare i64 @llvm.hexagon.M2.mmpyh.s0(i64, i64)
 define i64 @M2_mmpyh_s0(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.mmpyh.s0(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmpywoh({{.*}}, {{.*}}):sat
+; CHECK: = vmpywoh({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.M2.mmpyh.s1(i64, i64)
 define i64 @M2_mmpyh_s1(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.mmpyh.s1(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmpywoh({{.*}}, {{.*}}):<<1:sat
+; CHECK: = vmpywoh({{.*}},{{.*}}):<<1:sat
 
 declare i64 @llvm.hexagon.M2.mmpyl.rs0(i64, i64)
 define i64 @M2_mmpyl_rs0(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.mmpyl.rs0(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmpyweh({{.*}}, {{.*}}):rnd:sat
+; CHECK: = vmpyweh({{.*}},{{.*}}):rnd:sat
 
 declare i64 @llvm.hexagon.M2.mmpyl.rs1(i64, i64)
 define i64 @M2_mmpyl_rs1(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.mmpyl.rs1(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmpyweh({{.*}}, {{.*}}):<<1:rnd:sat
+; CHECK: = vmpyweh({{.*}},{{.*}}):<<1:rnd:sat
 
 declare i64 @llvm.hexagon.M2.mmpyh.rs0(i64, i64)
 define i64 @M2_mmpyh_rs0(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.mmpyh.rs0(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmpywoh({{.*}}, {{.*}}):rnd:sat
+; CHECK: = vmpywoh({{.*}},{{.*}}):rnd:sat
 
 declare i64 @llvm.hexagon.M2.mmpyh.rs1(i64, i64)
 define i64 @M2_mmpyh_rs1(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.mmpyh.rs1(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmpywoh({{.*}}, {{.*}}):<<1:rnd:sat
+; CHECK: = vmpywoh({{.*}},{{.*}}):<<1:rnd:sat
 
 ; Vector multiply word by unsigned half (32x16)
 declare i64 @llvm.hexagon.M2.mmpyul.s0(i64, i64)
@@ -104,56 +104,56 @@ define i64 @M2_mmpyul_s0(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.mmpyul.s0(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmpyweuh({{.*}}, {{.*}}):sat
+; CHECK: = vmpyweuh({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.M2.mmpyul.s1(i64, i64)
 define i64 @M2_mmpyul_s1(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.mmpyul.s1(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmpyweuh({{.*}}, {{.*}}):<<1:sat
+; CHECK: = vmpyweuh({{.*}},{{.*}}):<<1:sat
 
 declare i64 @llvm.hexagon.M2.mmpyuh.s0(i64, i64)
 define i64 @M2_mmpyuh_s0(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.mmpyuh.s0(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmpywouh({{.*}}, {{.*}}):sat
+; CHECK: = vmpywouh({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.M2.mmpyuh.s1(i64, i64)
 define i64 @M2_mmpyuh_s1(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.mmpyuh.s1(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmpywouh({{.*}}, {{.*}}):<<1:sat
+; CHECK: = vmpywouh({{.*}},{{.*}}):<<1:sat
 
 declare i64 @llvm.hexagon.M2.mmpyul.rs0(i64, i64)
 define i64 @M2_mmpyul_rs0(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.mmpyul.rs0(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmpyweuh({{.*}}, {{.*}}):rnd:sat
+; CHECK: = vmpyweuh({{.*}},{{.*}}):rnd:sat
 
 declare i64 @llvm.hexagon.M2.mmpyul.rs1(i64, i64)
 define i64 @M2_mmpyul_rs1(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.mmpyul.rs1(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmpyweuh({{.*}}, {{.*}}):<<1:rnd:sat
+; CHECK: = vmpyweuh({{.*}},{{.*}}):<<1:rnd:sat
 
 declare i64 @llvm.hexagon.M2.mmpyuh.rs0(i64, i64)
 define i64 @M2_mmpyuh_rs0(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.mmpyuh.rs0(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmpywouh({{.*}}, {{.*}}):rnd:sat
+; CHECK: = vmpywouh({{.*}},{{.*}}):rnd:sat
 
 declare i64 @llvm.hexagon.M2.mmpyuh.rs1(i64, i64)
 define i64 @M2_mmpyuh_rs1(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.mmpyuh.rs1(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmpywouh({{.*}}, {{.*}}):<<1:rnd:sat
+; CHECK: = vmpywouh({{.*}},{{.*}}):<<1:rnd:sat
 
 ; Multiply signed halfwords
 declare i64 @llvm.hexagon.M2.mpyd.ll.s0(i32, i32)
@@ -161,616 +161,616 @@ define i64 @M2_mpyd_ll_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyd.ll.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.l)
+; CHECK: = mpy({{.*}}.l,{{.*}}.l)
 
 declare i64 @llvm.hexagon.M2.mpyd.ll.s1(i32, i32)
 define i64 @M2_mpyd_ll_s1(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyd.ll.s1(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.l):<<1
+; CHECK: = mpy({{.*}}.l,{{.*}}.l):<<1
 
 declare i64 @llvm.hexagon.M2.mpyd.lh.s0(i32, i32)
 define i64 @M2_mpyd_lh_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyd.lh.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.h)
+; CHECK: = mpy({{.*}}.l,{{.*}}.h)
 
 declare i64 @llvm.hexagon.M2.mpyd.lh.s1(i32, i32)
 define i64 @M2_mpyd_lh_s1(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyd.lh.s1(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.h):<<1
+; CHECK: = mpy({{.*}}.l,{{.*}}.h):<<1
 
 declare i64 @llvm.hexagon.M2.mpyd.hl.s0(i32, i32)
 define i64 @M2_mpyd_hl_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyd.hl.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.l)
+; CHECK: = mpy({{.*}}.h,{{.*}}.l)
 
 declare i64 @llvm.hexagon.M2.mpyd.hl.s1(i32, i32)
 define i64 @M2_mpyd_hl_s1(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyd.hl.s1(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.l):<<1
+; CHECK: = mpy({{.*}}.h,{{.*}}.l):<<1
 
 declare i64 @llvm.hexagon.M2.mpyd.hh.s0(i32, i32)
 define i64 @M2_mpyd_hh_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyd.hh.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.h)
+; CHECK: = mpy({{.*}}.h,{{.*}}.h)
 
 declare i64 @llvm.hexagon.M2.mpyd.hh.s1(i32, i32)
 define i64 @M2_mpyd_hh_s1(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyd.hh.s1(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.h):<<1
+; CHECK: = mpy({{.*}}.h,{{.*}}.h):<<1
 
 declare i64 @llvm.hexagon.M2.mpyd.rnd.ll.s0(i32, i32)
 define i64 @M2_mpyd_rnd_ll_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyd.rnd.ll.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.l):rnd
+; CHECK: = mpy({{.*}}.l,{{.*}}.l):rnd
 
 declare i64 @llvm.hexagon.M2.mpyd.rnd.ll.s1(i32, i32)
 define i64 @M2_mpyd_rnd_ll_s1(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyd.rnd.ll.s1(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.l):<<1:rnd
+; CHECK: = mpy({{.*}}.l,{{.*}}.l):<<1:rnd
 
 declare i64 @llvm.hexagon.M2.mpyd.rnd.lh.s0(i32, i32)
 define i64 @M2_mpyd_rnd_lh_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyd.rnd.lh.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.h):rnd
+; CHECK: = mpy({{.*}}.l,{{.*}}.h):rnd
 
 declare i64 @llvm.hexagon.M2.mpyd.rnd.lh.s1(i32, i32)
 define i64 @M2_mpyd_rnd_lh_s1(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyd.rnd.lh.s1(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.h):<<1:rnd
+; CHECK: = mpy({{.*}}.l,{{.*}}.h):<<1:rnd
 
 declare i64 @llvm.hexagon.M2.mpyd.rnd.hl.s0(i32, i32)
 define i64 @M2_mpyd_rnd_hl_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyd.rnd.hl.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.l):rnd
+; CHECK: = mpy({{.*}}.h,{{.*}}.l):rnd
 
 declare i64 @llvm.hexagon.M2.mpyd.rnd.hl.s1(i32, i32)
 define i64 @M2_mpyd_rnd_hl_s1(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyd.rnd.hl.s1(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.l):<<1:rnd
+; CHECK: = mpy({{.*}}.h,{{.*}}.l):<<1:rnd
 
 declare i64 @llvm.hexagon.M2.mpyd.rnd.hh.s0(i32, i32)
 define i64 @M2_mpyd_rnd_hh_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyd.rnd.hh.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.h):rnd
+; CHECK: = mpy({{.*}}.h,{{.*}}.h):rnd
 
 declare i64 @llvm.hexagon.M2.mpyd.rnd.hh.s1(i32, i32)
 define i64 @M2_mpyd_rnd_hh_s1(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyd.rnd.hh.s1(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.h):<<1:rnd
+; CHECK: = mpy({{.*}}.h,{{.*}}.h):<<1:rnd
 
 declare i64 @llvm.hexagon.M2.mpyd.acc.ll.s0(i64, i32, i32)
 define i64 @M2_mpyd_acc_ll_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyd.acc.ll.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += mpy({{.*}}.l, {{.*}}.l)
+; CHECK: += mpy({{.*}}.l,{{.*}}.l)
 
 declare i64 @llvm.hexagon.M2.mpyd.acc.ll.s1(i64, i32, i32)
 define i64 @M2_mpyd_acc_ll_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyd.acc.ll.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += mpy({{.*}}.l, {{.*}}.l):<<1
+; CHECK: += mpy({{.*}}.l,{{.*}}.l):<<1
 
 declare i64 @llvm.hexagon.M2.mpyd.acc.lh.s0(i64, i32, i32)
 define i64 @M2_mpyd_acc_lh_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyd.acc.lh.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += mpy({{.*}}.l, {{.*}}.h)
+; CHECK: += mpy({{.*}}.l,{{.*}}.h)
 
 declare i64 @llvm.hexagon.M2.mpyd.acc.lh.s1(i64, i32, i32)
 define i64 @M2_mpyd_acc_lh_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyd.acc.lh.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += mpy({{.*}}.l, {{.*}}.h):<<1
+; CHECK: += mpy({{.*}}.l,{{.*}}.h):<<1
 
 declare i64 @llvm.hexagon.M2.mpyd.acc.hl.s0(i64, i32, i32)
 define i64 @M2_mpyd_acc_hl_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyd.acc.hl.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += mpy({{.*}}.h, {{.*}}.l)
+; CHECK: += mpy({{.*}}.h,{{.*}}.l)
 
 declare i64 @llvm.hexagon.M2.mpyd.acc.hl.s1(i64, i32, i32)
 define i64 @M2_mpyd_acc_hl_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyd.acc.hl.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += mpy({{.*}}.h, {{.*}}.l):<<1
+; CHECK: += mpy({{.*}}.h,{{.*}}.l):<<1
 
 declare i64 @llvm.hexagon.M2.mpyd.acc.hh.s0(i64, i32, i32)
 define i64 @M2_mpyd_acc_hh_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyd.acc.hh.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += mpy({{.*}}.h, {{.*}}.h)
+; CHECK: += mpy({{.*}}.h,{{.*}}.h)
 
 declare i64 @llvm.hexagon.M2.mpyd.acc.hh.s1(i64, i32, i32)
 define i64 @M2_mpyd_acc_hh_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyd.acc.hh.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += mpy({{.*}}.h, {{.*}}.h):<<1
+; CHECK: += mpy({{.*}}.h,{{.*}}.h):<<1
 
 declare i64 @llvm.hexagon.M2.mpyd.nac.ll.s0(i64, i32, i32)
 define i64 @M2_mpyd_nac_ll_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyd.nac.ll.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= mpy({{.*}}.l, {{.*}}.l)
+; CHECK: -= mpy({{.*}}.l,{{.*}}.l)
 
 declare i64 @llvm.hexagon.M2.mpyd.nac.ll.s1(i64, i32, i32)
 define i64 @M2_mpyd_nac_ll_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyd.nac.ll.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= mpy({{.*}}.l, {{.*}}.l):<<1
+; CHECK: -= mpy({{.*}}.l,{{.*}}.l):<<1
 
 declare i64 @llvm.hexagon.M2.mpyd.nac.lh.s0(i64, i32, i32)
 define i64 @M2_mpyd_nac_lh_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyd.nac.lh.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= mpy({{.*}}.l, {{.*}}.h)
+; CHECK: -= mpy({{.*}}.l,{{.*}}.h)
 
 declare i64 @llvm.hexagon.M2.mpyd.nac.lh.s1(i64, i32, i32)
 define i64 @M2_mpyd_nac_lh_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyd.nac.lh.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= mpy({{.*}}.l, {{.*}}.h):<<1
+; CHECK: -= mpy({{.*}}.l,{{.*}}.h):<<1
 
 declare i64 @llvm.hexagon.M2.mpyd.nac.hl.s0(i64, i32, i32)
 define i64 @M2_mpyd_nac_hl_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyd.nac.hl.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= mpy({{.*}}.h, {{.*}}.l)
+; CHECK: -= mpy({{.*}}.h,{{.*}}.l)
 
 declare i64 @llvm.hexagon.M2.mpyd.nac.hl.s1(i64, i32, i32)
 define i64 @M2_mpyd_nac_hl_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyd.nac.hl.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= mpy({{.*}}.h, {{.*}}.l):<<1
+; CHECK: -= mpy({{.*}}.h,{{.*}}.l):<<1
 
 declare i64 @llvm.hexagon.M2.mpyd.nac.hh.s0(i64, i32, i32)
 define i64 @M2_mpyd_nac_hh_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyd.nac.hh.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= mpy({{.*}}.h, {{.*}}.h)
+; CHECK: -= mpy({{.*}}.h,{{.*}}.h)
 
 declare i64 @llvm.hexagon.M2.mpyd.nac.hh.s1(i64, i32, i32)
 define i64 @M2_mpyd_nac_hh_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyd.nac.hh.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= mpy({{.*}}.h, {{.*}}.h):<<1
+; CHECK: -= mpy({{.*}}.h,{{.*}}.h):<<1
 
 declare i32 @llvm.hexagon.M2.mpy.ll.s0(i32, i32)
 define i32 @M2_mpy_ll_s0(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.ll.s0(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.l)
+; CHECK: = mpy({{.*}}.l,{{.*}}.l)
 
 declare i32 @llvm.hexagon.M2.mpy.ll.s1(i32, i32)
 define i32 @M2_mpy_ll_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.ll.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.l):<<1
+; CHECK: = mpy({{.*}}.l,{{.*}}.l):<<1
 
 declare i32 @llvm.hexagon.M2.mpy.lh.s0(i32, i32)
 define i32 @M2_mpy_lh_s0(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.lh.s0(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.h)
+; CHECK: = mpy({{.*}}.l,{{.*}}.h)
 
 declare i32 @llvm.hexagon.M2.mpy.lh.s1(i32, i32)
 define i32 @M2_mpy_lh_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.lh.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.h):<<1
+; CHECK: = mpy({{.*}}.l,{{.*}}.h):<<1
 
 declare i32 @llvm.hexagon.M2.mpy.hl.s0(i32, i32)
 define i32 @M2_mpy_hl_s0(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.hl.s0(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.l)
+; CHECK: = mpy({{.*}}.h,{{.*}}.l)
 
 declare i32 @llvm.hexagon.M2.mpy.hl.s1(i32, i32)
 define i32 @M2_mpy_hl_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.hl.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.l):<<1
+; CHECK: = mpy({{.*}}.h,{{.*}}.l):<<1
 
 declare i32 @llvm.hexagon.M2.mpy.hh.s0(i32, i32)
 define i32 @M2_mpy_hh_s0(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.hh.s0(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.h)
+; CHECK: = mpy({{.*}}.h,{{.*}}.h)
 
 declare i32 @llvm.hexagon.M2.mpy.hh.s1(i32, i32)
 define i32 @M2_mpy_hh_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.hh.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.h):<<1
+; CHECK: = mpy({{.*}}.h,{{.*}}.h):<<1
 
 declare i32 @llvm.hexagon.M2.mpy.sat.ll.s0(i32, i32)
 define i32 @M2_mpy_sat_ll_s0(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.sat.ll.s0(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.l):sat
+; CHECK: = mpy({{.*}}.l,{{.*}}.l):sat
 
 declare i32 @llvm.hexagon.M2.mpy.sat.ll.s1(i32, i32)
 define i32 @M2_mpy_sat_ll_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.sat.ll.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.l):<<1:sat
+; CHECK: = mpy({{.*}}.l,{{.*}}.l):<<1:sat
 
 declare i32 @llvm.hexagon.M2.mpy.sat.lh.s0(i32, i32)
 define i32 @M2_mpy_sat_lh_s0(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.sat.lh.s0(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.h):sat
+; CHECK: = mpy({{.*}}.l,{{.*}}.h):sat
 
 declare i32 @llvm.hexagon.M2.mpy.sat.lh.s1(i32, i32)
 define i32 @M2_mpy_sat_lh_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.sat.lh.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.h):<<1:sat
+; CHECK: = mpy({{.*}}.l,{{.*}}.h):<<1:sat
 
 declare i32 @llvm.hexagon.M2.mpy.sat.hl.s0(i32, i32)
 define i32 @M2_mpy_sat_hl_s0(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.sat.hl.s0(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.l):sat
+; CHECK: = mpy({{.*}}.h,{{.*}}.l):sat
 
 declare i32 @llvm.hexagon.M2.mpy.sat.hl.s1(i32, i32)
 define i32 @M2_mpy_sat_hl_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.sat.hl.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.l):<<1:sat
+; CHECK: = mpy({{.*}}.h,{{.*}}.l):<<1:sat
 
 declare i32 @llvm.hexagon.M2.mpy.sat.hh.s0(i32, i32)
 define i32 @M2_mpy_sat_hh_s0(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.sat.hh.s0(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.h):sat
+; CHECK: = mpy({{.*}}.h,{{.*}}.h):sat
 
 declare i32 @llvm.hexagon.M2.mpy.sat.hh.s1(i32, i32)
 define i32 @M2_mpy_sat_hh_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.sat.hh.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.h):<<1:sat
+; CHECK: = mpy({{.*}}.h,{{.*}}.h):<<1:sat
 
 declare i32 @llvm.hexagon.M2.mpy.sat.rnd.ll.s0(i32, i32)
 define i32 @M2_mpy_sat_rnd_ll_s0(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.sat.rnd.ll.s0(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.l):rnd:sat
+; CHECK: = mpy({{.*}}.l,{{.*}}.l):rnd:sat
 
 declare i32 @llvm.hexagon.M2.mpy.sat.rnd.ll.s1(i32, i32)
 define i32 @M2_mpy_sat_rnd_ll_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.sat.rnd.ll.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.l):<<1:rnd:sat
+; CHECK: = mpy({{.*}}.l,{{.*}}.l):<<1:rnd:sat
 
 declare i32 @llvm.hexagon.M2.mpy.sat.rnd.lh.s0(i32, i32)
 define i32 @M2_mpy_sat_rnd_lh_s0(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.sat.rnd.lh.s0(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.h):rnd:sat
+; CHECK: = mpy({{.*}}.l,{{.*}}.h):rnd:sat
 
 declare i32 @llvm.hexagon.M2.mpy.sat.rnd.lh.s1(i32, i32)
 define i32 @M2_mpy_sat_rnd_lh_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.sat.rnd.lh.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.l, {{.*}}.h):<<1:rnd:sat
+; CHECK: = mpy({{.*}}.l,{{.*}}.h):<<1:rnd:sat
 
 declare i32 @llvm.hexagon.M2.mpy.sat.rnd.hl.s0(i32, i32)
 define i32 @M2_mpy_sat_rnd_hl_s0(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.sat.rnd.hl.s0(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.l):rnd:sat
+; CHECK: = mpy({{.*}}.h,{{.*}}.l):rnd:sat
 
 declare i32 @llvm.hexagon.M2.mpy.sat.rnd.hl.s1(i32, i32)
 define i32 @M2_mpy_sat_rnd_hl_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.sat.rnd.hl.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.l):<<1:rnd:sat
+; CHECK: = mpy({{.*}}.h,{{.*}}.l):<<1:rnd:sat
 
 declare i32 @llvm.hexagon.M2.mpy.sat.rnd.hh.s0(i32, i32)
 define i32 @M2_mpy_sat_rnd_hh_s0(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.sat.rnd.hh.s0(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.h):rnd:sat
+; CHECK: = mpy({{.*}}.h,{{.*}}.h):rnd:sat
 
 declare i32 @llvm.hexagon.M2.mpy.sat.rnd.hh.s1(i32, i32)
 define i32 @M2_mpy_sat_rnd_hh_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.sat.rnd.hh.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}.h, {{.*}}.h):<<1:rnd:sat
+; CHECK: = mpy({{.*}}.h,{{.*}}.h):<<1:rnd:sat
 
 declare i32 @llvm.hexagon.M2.mpy.acc.ll.s0(i32, i32, i32)
 define i32 @M2_mpy_acc_ll_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.acc.ll.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpy({{.*}}.l, {{.*}}.l)
+; CHECK: += mpy({{.*}}.l,{{.*}}.l)
 
 declare i32 @llvm.hexagon.M2.mpy.acc.ll.s1(i32, i32, i32)
 define i32 @M2_mpy_acc_ll_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.acc.ll.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpy({{.*}}.l, {{.*}}.l):<<1
+; CHECK: += mpy({{.*}}.l,{{.*}}.l):<<1
 
 declare i32 @llvm.hexagon.M2.mpy.acc.lh.s0(i32, i32, i32)
 define i32 @M2_mpy_acc_lh_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.acc.lh.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpy({{.*}}.l, {{.*}}.h)
+; CHECK: += mpy({{.*}}.l,{{.*}}.h)
 
 declare i32 @llvm.hexagon.M2.mpy.acc.lh.s1(i32, i32, i32)
 define i32 @M2_mpy_acc_lh_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.acc.lh.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpy({{.*}}.l, {{.*}}.h):<<1
+; CHECK: += mpy({{.*}}.l,{{.*}}.h):<<1
 
 declare i32 @llvm.hexagon.M2.mpy.acc.hl.s0(i32, i32, i32)
 define i32 @M2_mpy_acc_hl_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.acc.hl.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpy({{.*}}.h, {{.*}}.l)
+; CHECK: += mpy({{.*}}.h,{{.*}}.l)
 
 declare i32 @llvm.hexagon.M2.mpy.acc.hl.s1(i32, i32, i32)
 define i32 @M2_mpy_acc_hl_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.acc.hl.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpy({{.*}}.h, {{.*}}.l):<<1
+; CHECK: += mpy({{.*}}.h,{{.*}}.l):<<1
 
 declare i32 @llvm.hexagon.M2.mpy.acc.hh.s0(i32, i32, i32)
 define i32 @M2_mpy_acc_hh_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.acc.hh.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpy({{.*}}.h, {{.*}}.h)
+; CHECK: += mpy({{.*}}.h,{{.*}}.h)
 
 declare i32 @llvm.hexagon.M2.mpy.acc.hh.s1(i32, i32, i32)
 define i32 @M2_mpy_acc_hh_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.acc.hh.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpy({{.*}}.h, {{.*}}.h):<<1
+; CHECK: += mpy({{.*}}.h,{{.*}}.h):<<1
 
 declare i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s0(i32, i32, i32)
 define i32 @M2_mpy_acc_sat_ll_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpy({{.*}}.l, {{.*}}.l):sat
+; CHECK: += mpy({{.*}}.l,{{.*}}.l):sat
 
 declare i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s1(i32, i32, i32)
 define i32 @M2_mpy_acc_sat_ll_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.acc.sat.ll.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpy({{.*}}.l, {{.*}}.l):<<1:sat
+; CHECK: += mpy({{.*}}.l,{{.*}}.l):<<1:sat
 
 declare i32 @llvm.hexagon.M2.mpy.acc.sat.lh.s0(i32, i32, i32)
 define i32 @M2_mpy_acc_sat_lh_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.acc.sat.lh.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpy({{.*}}.l, {{.*}}.h):sat
+; CHECK: += mpy({{.*}}.l,{{.*}}.h):sat
 
 declare i32 @llvm.hexagon.M2.mpy.acc.sat.lh.s1(i32, i32, i32)
 define i32 @M2_mpy_acc_sat_lh_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.acc.sat.lh.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpy({{.*}}.l, {{.*}}.h):<<1:sat
+; CHECK: += mpy({{.*}}.l,{{.*}}.h):<<1:sat
 
 declare i32 @llvm.hexagon.M2.mpy.acc.sat.hl.s0(i32, i32, i32)
 define i32 @M2_mpy_acc_sat_hl_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.acc.sat.hl.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpy({{.*}}.h, {{.*}}.l):sat
+; CHECK: += mpy({{.*}}.h,{{.*}}.l):sat
 
 declare i32 @llvm.hexagon.M2.mpy.acc.sat.hl.s1(i32, i32, i32)
 define i32 @M2_mpy_acc_sat_hl_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.acc.sat.hl.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpy({{.*}}.h, {{.*}}.l):<<1:sat
+; CHECK: += mpy({{.*}}.h,{{.*}}.l):<<1:sat
 
 declare i32 @llvm.hexagon.M2.mpy.acc.sat.hh.s0(i32, i32, i32)
 define i32 @M2_mpy_acc_sat_hh_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.acc.sat.hh.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpy({{.*}}.h, {{.*}}.h):sat
+; CHECK: += mpy({{.*}}.h,{{.*}}.h):sat
 
 declare i32 @llvm.hexagon.M2.mpy.acc.sat.hh.s1(i32, i32, i32)
 define i32 @M2_mpy_acc_sat_hh_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.acc.sat.hh.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpy({{.*}}.h, {{.*}}.h):<<1:sat
+; CHECK: += mpy({{.*}}.h,{{.*}}.h):<<1:sat
 
 declare i32 @llvm.hexagon.M2.mpy.nac.ll.s0(i32, i32, i32)
 define i32 @M2_mpy_nac_ll_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.nac.ll.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpy({{.*}}.l, {{.*}}.l)
+; CHECK: -= mpy({{.*}}.l,{{.*}}.l)
 
 declare i32 @llvm.hexagon.M2.mpy.nac.ll.s1(i32, i32, i32)
 define i32 @M2_mpy_nac_ll_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.nac.ll.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpy({{.*}}.l, {{.*}}.l):<<1
+; CHECK: -= mpy({{.*}}.l,{{.*}}.l):<<1
 
 declare i32 @llvm.hexagon.M2.mpy.nac.lh.s0(i32, i32, i32)
 define i32 @M2_mpy_nac_lh_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.nac.lh.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpy({{.*}}.l, {{.*}}.h)
+; CHECK: -= mpy({{.*}}.l,{{.*}}.h)
 
 declare i32 @llvm.hexagon.M2.mpy.nac.lh.s1(i32, i32, i32)
 define i32 @M2_mpy_nac_lh_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.nac.lh.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK:  -= mpy({{.*}}.l, {{.*}}.h):<<1
+; CHECK:  -= mpy({{.*}}.l,{{.*}}.h):<<1
 
 declare i32 @llvm.hexagon.M2.mpy.nac.hl.s0(i32, i32, i32)
 define i32 @M2_mpy_nac_hl_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.nac.hl.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpy({{.*}}.h, {{.*}}.l)
+; CHECK: -= mpy({{.*}}.h,{{.*}}.l)
 
 declare i32 @llvm.hexagon.M2.mpy.nac.hl.s1(i32, i32, i32)
 define i32 @M2_mpy_nac_hl_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.nac.hl.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpy({{.*}}.h, {{.*}}.l):<<1
+; CHECK: -= mpy({{.*}}.h,{{.*}}.l):<<1
 
 declare i32 @llvm.hexagon.M2.mpy.nac.hh.s0(i32, i32, i32)
 define i32 @M2_mpy_nac_hh_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.nac.hh.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpy({{.*}}.h, {{.*}}.h)
+; CHECK: -= mpy({{.*}}.h,{{.*}}.h)
 
 declare i32 @llvm.hexagon.M2.mpy.nac.hh.s1(i32, i32, i32)
 define i32 @M2_mpy_nac_hh_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.nac.hh.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpy({{.*}}.h, {{.*}}.h):<<1
+; CHECK: -= mpy({{.*}}.h,{{.*}}.h):<<1
 
 declare i32 @llvm.hexagon.M2.mpy.nac.sat.ll.s0(i32, i32, i32)
 define i32 @M2_mpy_nac_sat_ll_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.nac.sat.ll.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpy({{.*}}.l, {{.*}}.l):sat
+; CHECK: -= mpy({{.*}}.l,{{.*}}.l):sat
 
 declare i32 @llvm.hexagon.M2.mpy.nac.sat.ll.s1(i32, i32, i32)
 define i32 @M2_mpy_nac_sat_ll_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.nac.sat.ll.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpy({{.*}}.l, {{.*}}.l):<<1:sat
+; CHECK: -= mpy({{.*}}.l,{{.*}}.l):<<1:sat
 
 declare i32 @llvm.hexagon.M2.mpy.nac.sat.lh.s0(i32, i32, i32)
 define i32 @M2_mpy_nac_sat_lh_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.nac.sat.lh.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpy({{.*}}.l, {{.*}}.h):sat
+; CHECK: -= mpy({{.*}}.l,{{.*}}.h):sat
 
 declare i32 @llvm.hexagon.M2.mpy.nac.sat.lh.s1(i32, i32, i32)
 define i32 @M2_mpy_nac_sat_lh_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.nac.sat.lh.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpy({{.*}}.l, {{.*}}.h):<<1:sat
+; CHECK: -= mpy({{.*}}.l,{{.*}}.h):<<1:sat
 
 declare i32 @llvm.hexagon.M2.mpy.nac.sat.hl.s0(i32, i32, i32)
 define i32 @M2_mpy_nac_sat_hl_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.nac.sat.hl.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpy({{.*}}.h, {{.*}}.l):sat
+; CHECK: -= mpy({{.*}}.h,{{.*}}.l):sat
 
 declare i32 @llvm.hexagon.M2.mpy.nac.sat.hl.s1(i32, i32, i32)
 define i32 @M2_mpy_nac_sat_hl_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.nac.sat.hl.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpy({{.*}}.h, {{.*}}.l):<<1:sat
+; CHECK: -= mpy({{.*}}.h,{{.*}}.l):<<1:sat
 
 declare i32 @llvm.hexagon.M2.mpy.nac.sat.hh.s0(i32, i32, i32)
 define i32 @M2_mpy_nac_sat_hh_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.nac.sat.hh.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpy({{.*}}.h, {{.*}}.h):sat
+; CHECK: -= mpy({{.*}}.h,{{.*}}.h):sat
 
 declare i32 @llvm.hexagon.M2.mpy.nac.sat.hh.s1(i32, i32, i32)
 define i32 @M2_mpy_nac_sat_hh_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpy.nac.sat.hh.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpy({{.*}}.h, {{.*}}.h):<<1:sat
+; CHECK: -= mpy({{.*}}.h,{{.*}}.h):<<1:sat
 
 ; Multiply unsigned halfwords
 declare i64 @llvm.hexagon.M2.mpyud.ll.s0(i32, i32)
@@ -778,336 +778,336 @@ define i64 @M2_mpyud_ll_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyud.ll.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpyu({{.*}}.l, {{.*}}.l)
+; CHECK: = mpyu({{.*}}.l,{{.*}}.l)
 
 declare i64 @llvm.hexagon.M2.mpyud.ll.s1(i32, i32)
 define i64 @M2_mpyud_ll_s1(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyud.ll.s1(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpyu({{.*}}.l, {{.*}}.l):<<1
+; CHECK: = mpyu({{.*}}.l,{{.*}}.l):<<1
 
 declare i64 @llvm.hexagon.M2.mpyud.lh.s0(i32, i32)
 define i64 @M2_mpyud_lh_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyud.lh.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpyu({{.*}}.l, {{.*}}.h)
+; CHECK: = mpyu({{.*}}.l,{{.*}}.h)
 
 declare i64 @llvm.hexagon.M2.mpyud.lh.s1(i32, i32)
 define i64 @M2_mpyud_lh_s1(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyud.lh.s1(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpyu({{.*}}.l, {{.*}}.h):<<1
+; CHECK: = mpyu({{.*}}.l,{{.*}}.h):<<1
 
 declare i64 @llvm.hexagon.M2.mpyud.hl.s0(i32, i32)
 define i64 @M2_mpyud_hl_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyud.hl.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpyu({{.*}}.h, {{.*}}.l)
+; CHECK: = mpyu({{.*}}.h,{{.*}}.l)
 
 declare i64 @llvm.hexagon.M2.mpyud.hl.s1(i32, i32)
 define i64 @M2_mpyud_hl_s1(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyud.hl.s1(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpyu({{.*}}.h, {{.*}}.l):<<1
+; CHECK: = mpyu({{.*}}.h,{{.*}}.l):<<1
 
 declare i64 @llvm.hexagon.M2.mpyud.hh.s0(i32, i32)
 define i64 @M2_mpyud_hh_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyud.hh.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpyu({{.*}}.h, {{.*}}.h)
+; CHECK: = mpyu({{.*}}.h,{{.*}}.h)
 
 declare i64 @llvm.hexagon.M2.mpyud.hh.s1(i32, i32)
 define i64 @M2_mpyud_hh_s1(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.mpyud.hh.s1(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpyu({{.*}}.h, {{.*}}.h):<<1
+; CHECK: = mpyu({{.*}}.h,{{.*}}.h):<<1
 
 declare i64 @llvm.hexagon.M2.mpyud.acc.ll.s0(i64, i32, i32)
 define i64 @M2_mpyud_acc_ll_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyud.acc.ll.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += mpyu({{.*}}.l, {{.*}}.l)
+; CHECK: += mpyu({{.*}}.l,{{.*}}.l)
 
 declare i64 @llvm.hexagon.M2.mpyud.acc.ll.s1(i64, i32, i32)
 define i64 @M2_mpyud_acc_ll_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyud.acc.ll.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += mpyu({{.*}}.l, {{.*}}.l):<<1
+; CHECK: += mpyu({{.*}}.l,{{.*}}.l):<<1
 
 declare i64 @llvm.hexagon.M2.mpyud.acc.lh.s0(i64, i32, i32)
 define i64 @M2_mpyud_acc_lh_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyud.acc.lh.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += mpyu({{.*}}.l, {{.*}}.h)
+; CHECK: += mpyu({{.*}}.l,{{.*}}.h)
 
 declare i64 @llvm.hexagon.M2.mpyud.acc.lh.s1(i64, i32, i32)
 define i64 @M2_mpyud_acc_lh_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyud.acc.lh.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += mpyu({{.*}}.l, {{.*}}.h):<<1
+; CHECK: += mpyu({{.*}}.l,{{.*}}.h):<<1
 
 declare i64 @llvm.hexagon.M2.mpyud.acc.hl.s0(i64, i32, i32)
 define i64 @M2_mpyud_acc_hl_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyud.acc.hl.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += mpyu({{.*}}.h, {{.*}}.l)
+; CHECK: += mpyu({{.*}}.h,{{.*}}.l)
 
 declare i64 @llvm.hexagon.M2.mpyud.acc.hl.s1(i64, i32, i32)
 define i64 @M2_mpyud_acc_hl_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyud.acc.hl.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += mpyu({{.*}}.h, {{.*}}.l):<<1
+; CHECK: += mpyu({{.*}}.h,{{.*}}.l):<<1
 
 declare i64 @llvm.hexagon.M2.mpyud.acc.hh.s0(i64, i32, i32)
 define i64 @M2_mpyud_acc_hh_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyud.acc.hh.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += mpyu({{.*}}.h, {{.*}}.h)
+; CHECK: += mpyu({{.*}}.h,{{.*}}.h)
 
 declare i64 @llvm.hexagon.M2.mpyud.acc.hh.s1(i64, i32, i32)
 define i64 @M2_mpyud_acc_hh_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyud.acc.hh.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += mpyu({{.*}}.h, {{.*}}.h):<<1
+; CHECK: += mpyu({{.*}}.h,{{.*}}.h):<<1
 
 declare i64 @llvm.hexagon.M2.mpyud.nac.ll.s0(i64, i32, i32)
 define i64 @M2_mpyud_nac_ll_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyud.nac.ll.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= mpyu({{.*}}.l, {{.*}}.l)
+; CHECK: -= mpyu({{.*}}.l,{{.*}}.l)
 
 declare i64 @llvm.hexagon.M2.mpyud.nac.ll.s1(i64, i32, i32)
 define i64 @M2_mpyud_nac_ll_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyud.nac.ll.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= mpyu({{.*}}.l, {{.*}}.l):<<1
+; CHECK: -= mpyu({{.*}}.l,{{.*}}.l):<<1
 
 declare i64 @llvm.hexagon.M2.mpyud.nac.lh.s0(i64, i32, i32)
 define i64 @M2_mpyud_nac_lh_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyud.nac.lh.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= mpyu({{.*}}.l, {{.*}}.h)
+; CHECK: -= mpyu({{.*}}.l,{{.*}}.h)
 
 declare i64 @llvm.hexagon.M2.mpyud.nac.lh.s1(i64, i32, i32)
 define i64 @M2_mpyud_nac_lh_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyud.nac.lh.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= mpyu({{.*}}.l, {{.*}}.h):<<1
+; CHECK: -= mpyu({{.*}}.l,{{.*}}.h):<<1
 
 declare i64 @llvm.hexagon.M2.mpyud.nac.hl.s0(i64, i32, i32)
 define i64 @M2_mpyud_nac_hl_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyud.nac.hl.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= mpyu({{.*}}.h, {{.*}}.l)
+; CHECK: -= mpyu({{.*}}.h,{{.*}}.l)
 
 declare i64 @llvm.hexagon.M2.mpyud.nac.hl.s1(i64, i32, i32)
 define i64 @M2_mpyud_nac_hl_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyud.nac.hl.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= mpyu({{.*}}.h, {{.*}}.l):<<1
+; CHECK: -= mpyu({{.*}}.h,{{.*}}.l):<<1
 
 declare i64 @llvm.hexagon.M2.mpyud.nac.hh.s0(i64, i32, i32)
 define i64 @M2_mpyud_nac_hh_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyud.nac.hh.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= mpyu({{.*}}.h, {{.*}}.h)
+; CHECK: -= mpyu({{.*}}.h,{{.*}}.h)
 
 declare i64 @llvm.hexagon.M2.mpyud.nac.hh.s1(i64, i32, i32)
 define i64 @M2_mpyud_nac_hh_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.mpyud.nac.hh.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= mpyu({{.*}}.h, {{.*}}.h):<<1
+; CHECK: -= mpyu({{.*}}.h,{{.*}}.h):<<1
 
 declare i32 @llvm.hexagon.M2.mpyu.ll.s0(i32, i32)
 define i32 @M2_mpyu_ll_s0(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpyu.ll.s0(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpyu({{.*}}.l, {{.*}}.l)
+; CHECK: = mpyu({{.*}}.l,{{.*}}.l)
 
 declare i32 @llvm.hexagon.M2.mpyu.ll.s1(i32, i32)
 define i32 @M2_mpyu_ll_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpyu.ll.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpyu({{.*}}.l, {{.*}}.l):<<1
+; CHECK: = mpyu({{.*}}.l,{{.*}}.l):<<1
 
 declare i32 @llvm.hexagon.M2.mpyu.lh.s0(i32, i32)
 define i32 @M2_mpyu_lh_s0(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpyu.lh.s0(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpyu({{.*}}.l, {{.*}}.h)
+; CHECK: = mpyu({{.*}}.l,{{.*}}.h)
 
 declare i32 @llvm.hexagon.M2.mpyu.lh.s1(i32, i32)
 define i32 @M2_mpyu_lh_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpyu.lh.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpyu({{.*}}.l, {{.*}}.h):<<1
+; CHECK: = mpyu({{.*}}.l,{{.*}}.h):<<1
 
 declare i32 @llvm.hexagon.M2.mpyu.hl.s0(i32, i32)
 define i32 @M2_mpyu_hl_s0(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpyu.hl.s0(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpyu({{.*}}.h, {{.*}}.l)
+; CHECK: = mpyu({{.*}}.h,{{.*}}.l)
 
 declare i32 @llvm.hexagon.M2.mpyu.hl.s1(i32, i32)
 define i32 @M2_mpyu_hl_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpyu.hl.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpyu({{.*}}.h, {{.*}}.l):<<1
+; CHECK: = mpyu({{.*}}.h,{{.*}}.l):<<1
 
 declare i32 @llvm.hexagon.M2.mpyu.hh.s0(i32, i32)
 define i32 @M2_mpyu_hh_s0(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpyu.hh.s0(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpyu({{.*}}.h, {{.*}}.h)
+; CHECK: = mpyu({{.*}}.h,{{.*}}.h)
 
 declare i32 @llvm.hexagon.M2.mpyu.hh.s1(i32, i32)
 define i32 @M2_mpyu_hh_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpyu.hh.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpyu({{.*}}.h, {{.*}}.h):<<1
+; CHECK: = mpyu({{.*}}.h,{{.*}}.h):<<1
 
 declare i32 @llvm.hexagon.M2.mpyu.acc.ll.s0(i32, i32, i32)
 define i32 @M2_mpyu_acc_ll_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpyu.acc.ll.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpyu({{.*}}.l, {{.*}}.l)
+; CHECK: += mpyu({{.*}}.l,{{.*}}.l)
 
 declare i32 @llvm.hexagon.M2.mpyu.acc.ll.s1(i32, i32, i32)
 define i32 @M2_mpyu_acc_ll_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpyu.acc.ll.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpyu({{.*}}.l, {{.*}}.l):<<1
+; CHECK: += mpyu({{.*}}.l,{{.*}}.l):<<1
 
 declare i32 @llvm.hexagon.M2.mpyu.acc.lh.s0(i32, i32, i32)
 define i32 @M2_mpyu_acc_lh_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpyu.acc.lh.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpyu({{.*}}.l, {{.*}}.h)
+; CHECK: += mpyu({{.*}}.l,{{.*}}.h)
 
 declare i32 @llvm.hexagon.M2.mpyu.acc.lh.s1(i32, i32, i32)
 define i32 @M2_mpyu_acc_lh_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpyu.acc.lh.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpyu({{.*}}.l, {{.*}}.h):<<1
+; CHECK: += mpyu({{.*}}.l,{{.*}}.h):<<1
 
 declare i32 @llvm.hexagon.M2.mpyu.acc.hl.s0(i32, i32, i32)
 define i32 @M2_mpyu_acc_hl_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpyu.acc.hl.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpyu({{.*}}.h, {{.*}}.l)
+; CHECK: += mpyu({{.*}}.h,{{.*}}.l)
 
 declare i32 @llvm.hexagon.M2.mpyu.acc.hl.s1(i32, i32, i32)
 define i32 @M2_mpyu_acc_hl_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpyu.acc.hl.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpyu({{.*}}.h, {{.*}}.l):<<1
+; CHECK: += mpyu({{.*}}.h,{{.*}}.l):<<1
 
 declare i32 @llvm.hexagon.M2.mpyu.acc.hh.s0(i32, i32, i32)
 define i32 @M2_mpyu_acc_hh_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpyu.acc.hh.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpyu({{.*}}.h, {{.*}}.h)
+; CHECK: += mpyu({{.*}}.h,{{.*}}.h)
 
 declare i32 @llvm.hexagon.M2.mpyu.acc.hh.s1(i32, i32, i32)
 define i32 @M2_mpyu_acc_hh_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpyu.acc.hh.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpyu({{.*}}.h, {{.*}}.h):<<1
+; CHECK: += mpyu({{.*}}.h,{{.*}}.h):<<1
 
 declare i32 @llvm.hexagon.M2.mpyu.nac.ll.s0(i32, i32, i32)
 define i32 @M2_mpyu_nac_ll_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpyu.nac.ll.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpyu({{.*}}.l, {{.*}}.l)
+; CHECK: -= mpyu({{.*}}.l,{{.*}}.l)
 
 declare i32 @llvm.hexagon.M2.mpyu.nac.ll.s1(i32, i32, i32)
 define i32 @M2_mpyu_nac_ll_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpyu.nac.ll.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpyu({{.*}}.l, {{.*}}.l):<<1
+; CHECK: -= mpyu({{.*}}.l,{{.*}}.l):<<1
 
 declare i32 @llvm.hexagon.M2.mpyu.nac.lh.s0(i32, i32, i32)
 define i32 @M2_mpyu_nac_lh_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpyu.nac.lh.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpyu({{.*}}.l, {{.*}}.h)
+; CHECK: -= mpyu({{.*}}.l,{{.*}}.h)
 
 declare i32 @llvm.hexagon.M2.mpyu.nac.lh.s1(i32, i32, i32)
 define i32 @M2_mpyu_nac_lh_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpyu.nac.lh.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpyu({{.*}}.l, {{.*}}.h):<<1
+; CHECK: -= mpyu({{.*}}.l,{{.*}}.h):<<1
 
 declare i32 @llvm.hexagon.M2.mpyu.nac.hl.s0(i32, i32, i32)
 define i32 @M2_mpyu_nac_hl_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpyu.nac.hl.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpyu({{.*}}.h, {{.*}}.l)
+; CHECK: -= mpyu({{.*}}.h,{{.*}}.l)
 
 declare i32 @llvm.hexagon.M2.mpyu.nac.hl.s1(i32, i32, i32)
 define i32 @M2_mpyu_nac_hl_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpyu.nac.hl.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpyu({{.*}}.h, {{.*}}.l):<<1
+; CHECK: -= mpyu({{.*}}.h,{{.*}}.l):<<1
 
 declare i32 @llvm.hexagon.M2.mpyu.nac.hh.s0(i32, i32, i32)
 define i32 @M2_mpyu_nac_hh_s0(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpyu.nac.hh.s0(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpyu({{.*}}.h, {{.*}}.h)
+; CHECK: -= mpyu({{.*}}.h,{{.*}}.h)
 
 declare i32 @llvm.hexagon.M2.mpyu.nac.hh.s1(i32, i32, i32)
 define i32 @M2_mpyu_nac_hh_s1(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M2.mpyu.nac.hh.s1(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpyu({{.*}}.h, {{.*}}.h):<<1
+; CHECK: -= mpyu({{.*}}.h,{{.*}}.h):<<1
 
 ; Polynomial multiply words
 declare i64 @llvm.hexagon.M4.pmpyw(i32, i32)
@@ -1115,14 +1115,14 @@ define i64 @M4_pmpyw(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M4.pmpyw(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = pmpyw({{.*}}, {{.*}})
+; CHECK: = pmpyw({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M4.pmpyw.acc(i64, i32, i32)
 define i64 @M4_pmpyw_acc(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M4.pmpyw.acc(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: ^= pmpyw({{.*}}, {{.*}})
+; CHECK: ^= pmpyw({{.*}},{{.*}})
 
 ; Vector reduce multiply word by signed half
 declare i64 @llvm.hexagon.M4.vrmpyoh.s0(i64, i64)
@@ -1130,56 +1130,56 @@ define i64 @M4_vrmpyoh_s0(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M4.vrmpyoh.s0(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vrmpywoh({{.*}}, {{.*}})
+; CHECK: = vrmpywoh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M4.vrmpyoh.s1(i64, i64)
 define i64 @M4_vrmpyoh_s1(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M4.vrmpyoh.s1(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vrmpywoh({{.*}}, {{.*}}):<<1
+; CHECK: = vrmpywoh({{.*}},{{.*}}):<<1
 
 declare i64 @llvm.hexagon.M4.vrmpyeh.s0(i64, i64)
 define i64 @M4_vrmpyeh_s0(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M4.vrmpyeh.s0(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vrmpyweh({{.*}}, {{.*}})
+; CHECK: = vrmpyweh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M4.vrmpyeh.s1(i64, i64)
 define i64 @M4_vrmpyeh_s1(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M4.vrmpyeh.s1(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vrmpyweh({{.*}}, {{.*}}):<<1
+; CHECK: = vrmpyweh({{.*}},{{.*}}):<<1
 
 declare i64 @llvm.hexagon.M4.vrmpyoh.acc.s0(i64, i64, i64)
 define i64 @M4_vrmpyoh_acc_s0(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.M4.vrmpyoh.acc.s0(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: += vrmpywoh({{.*}}, r5:4)
+; CHECK: += vrmpywoh({{.*}},r5:4)
 
 declare i64 @llvm.hexagon.M4.vrmpyoh.acc.s1(i64, i64, i64)
 define i64 @M4_vrmpyoh_acc_s1(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.M4.vrmpyoh.acc.s1(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: += vrmpywoh({{.*}}, r5:4):<<1
+; CHECK: += vrmpywoh({{.*}},r5:4):<<1
 
 declare i64 @llvm.hexagon.M4.vrmpyeh.acc.s0(i64, i64, i64)
 define i64 @M4_vrmpyeh_acc_s0(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.M4.vrmpyeh.acc.s0(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: += vrmpyweh({{.*}}, r5:4)
+; CHECK: += vrmpyweh({{.*}},r5:4)
 
 declare i64 @llvm.hexagon.M4.vrmpyeh.acc.s1(i64, i64, i64)
 define i64 @M4_vrmpyeh_acc_s1(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.M4.vrmpyeh.acc.s1(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: += vrmpyweh({{.*}}, r5:4):<<1
+; CHECK: += vrmpyweh({{.*}},r5:4):<<1
 
 ; Multiply and use upper result
 declare i32 @llvm.hexagon.M2.dpmpyss.rnd.s0(i32, i32)
@@ -1187,84 +1187,84 @@ define i32 @M2_dpmpyss_rnd_s0(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.dpmpyss.rnd.s0(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}, {{.*}}):rnd
+; CHECK: = mpy({{.*}},{{.*}}):rnd
 
 declare i32 @llvm.hexagon.M2.mpyu.up(i32, i32)
 define i32 @M2_mpyu_up(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpyu.up(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpyu({{.*}}, {{.*}})
+; CHECK: = mpyu({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.M2.mpysu.up(i32, i32)
 define i32 @M2_mpysu_up(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpysu.up(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpysu({{.*}}, {{.*}})
+; CHECK: = mpysu({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.M2.hmmpyh.s1(i32, i32)
 define i32 @M2_hmmpyh_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.hmmpyh.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}, {{.*}}.h):<<1:sat
+; CHECK: = mpy({{.*}},{{.*}}.h):<<1:sat
 
 declare i32 @llvm.hexagon.M2.hmmpyl.s1(i32, i32)
 define i32 @M2_hmmpyl_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.hmmpyl.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}, {{.*}}.l):<<1:sat
+; CHECK: = mpy({{.*}},{{.*}}.l):<<1:sat
 
 declare i32 @llvm.hexagon.M2.hmmpyh.rs1(i32, i32)
 define i32 @M2_hmmpyh_rs1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.hmmpyh.rs1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}, {{.*}}.h):<<1:rnd:sat
+; CHECK: = mpy({{.*}},{{.*}}.h):<<1:rnd:sat
 
 declare i32 @llvm.hexagon.M2.mpy.up.s1.sat(i32, i32)
 define i32 @M2_mpy_up_s1_sat(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.up.s1.sat(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}, {{.*}}):<<1:sat
+; CHECK: = mpy({{.*}},{{.*}}):<<1:sat
 
 declare i32 @llvm.hexagon.M2.hmmpyl.rs1(i32, i32)
 define i32 @M2_hmmpyl_rs1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.hmmpyl.rs1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}, {{.*}}.l):<<1:rnd:sat
+; CHECK: = mpy({{.*}},{{.*}}.l):<<1:rnd:sat
 
 declare i32 @llvm.hexagon.M2.mpy.up(i32, i32)
 define i32 @M2_mpy_up(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.up(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}, {{.*}})
+; CHECK: = mpy({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.M2.mpy.up.s1(i32, i32)
 define i32 @M2_mpy_up_s1(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.M2.mpy.up.s1(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = mpy({{.*}}, {{.*}}):<<1
+; CHECK: = mpy({{.*}},{{.*}}):<<1
 
 declare i32 @llvm.hexagon.M4.mac.up.s1.sat(i32, i32, i32)
 define i32 @M4_mac_up_s1_sat(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M4.mac.up.s1.sat(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += mpy({{.*}}, {{.*}}):<<1:sat
+; CHECK: += mpy({{.*}},{{.*}}):<<1:sat
 
 declare i32 @llvm.hexagon.M4.nac.up.s1.sat(i32, i32, i32)
 define i32 @M4_nac_up_s1_sat(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.M4.nac.up.s1.sat(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= mpy({{.*}}, {{.*}}):<<1:sat
+; CHECK: -= mpy({{.*}},{{.*}}):<<1:sat
 
 ; Multiply and use full result
 declare i64 @llvm.hexagon.M2.dpmpyss.s0(i32, i32)
@@ -1272,42 +1272,42 @@ define i64 @M2_dpmpyss_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.dpmpyss.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpy({{.*}}, {{.*}})
+; CHECK: = mpy({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M2.dpmpyuu.s0(i32, i32)
 define i64 @M2_dpmpyuu_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.dpmpyuu.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = mpyu({{.*}}, {{.*}})
+; CHECK: = mpyu({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M2.dpmpyss.acc.s0(i64, i32, i32)
 define i64 @M2_dpmpyss_acc_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.dpmpyss.acc.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += mpy({{.*}}, {{.*}})
+; CHECK: += mpy({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64, i32, i32)
 define i64 @M2_dpmpyss_nac_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.dpmpyss.nac.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= mpy({{.*}}, {{.*}})
+; CHECK: -= mpy({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M2.dpmpyuu.acc.s0(i64, i32, i32)
 define i64 @M2_dpmpyuu_acc_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.dpmpyuu.acc.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += mpyu({{.*}}, {{.*}})
+; CHECK: += mpyu({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M2.dpmpyuu.nac.s0(i64, i32, i32)
 define i64 @M2_dpmpyuu_nac_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.dpmpyuu.nac.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= mpyu({{.*}}, {{.*}})
+; CHECK: -= mpyu({{.*}},{{.*}})
 
 ; Vector dual multiply
 declare i64 @llvm.hexagon.M2.vdmpys.s0(i64, i64)
@@ -1315,14 +1315,14 @@ define i64 @M2_vdmpys_s0(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.vdmpys.s0(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vdmpy({{.*}}, {{.*}}):sat
+; CHECK: = vdmpy({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.M2.vdmpys.s1(i64, i64)
 define i64 @M2_vdmpys_s1(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.vdmpys.s1(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vdmpy({{.*}}, {{.*}}):<<1:sat
+; CHECK: = vdmpy({{.*}},{{.*}}):<<1:sat
 
 ; Vector reduce multiply bytes
 declare i64 @llvm.hexagon.M5.vrmpybuu(i64, i64)
@@ -1330,28 +1330,28 @@ define i64 @M5_vrmpybuu(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M5.vrmpybuu(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vrmpybu({{.*}}, {{.*}})
+; CHECK: = vrmpybu({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M5.vrmpybsu(i64, i64)
 define i64 @M5_vrmpybsu(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M5.vrmpybsu(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vrmpybsu({{.*}}, {{.*}})
+; CHECK: = vrmpybsu({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M5.vrmacbuu(i64, i64, i64)
 define i64 @M5_vrmacbuu(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.M5.vrmacbuu(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: += vrmpybu({{.*}}, r5:4)
+; CHECK: += vrmpybu({{.*}},r5:4)
 
 declare i64 @llvm.hexagon.M5.vrmacbsu(i64, i64, i64)
 define i64 @M5_vrmacbsu(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.M5.vrmacbsu(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: += vrmpybsu({{.*}}, r5:4)
+; CHECK: += vrmpybsu({{.*}},r5:4)
 
 ; Vector dual multiply signed by unsigned bytes
 declare i64 @llvm.hexagon.M5.vdmpybsu(i64, i64)
@@ -1359,14 +1359,14 @@ define i64 @M5_vdmpybsu(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M5.vdmpybsu(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vdmpybsu({{.*}}, {{.*}}):sat
+; CHECK: = vdmpybsu({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.M5.vdmacbsu(i64, i64, i64)
 define i64 @M5_vdmacbsu(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.M5.vdmacbsu(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: += vdmpybsu({{.*}}, r5:4):sat
+; CHECK: += vdmpybsu({{.*}},r5:4):sat
 
 ; Vector multiply even halfwords
 declare i64 @llvm.hexagon.M2.vmpy2es.s0(i64, i64)
@@ -1374,35 +1374,35 @@ define i64 @M2_vmpy2es_s0(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.vmpy2es.s0(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmpyeh({{.*}}, {{.*}}):sat
+; CHECK: = vmpyeh({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.M2.vmpy2es.s1(i64, i64)
 define i64 @M2_vmpy2es_s1(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.vmpy2es.s1(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vmpyeh({{.*}}, {{.*}}):<<1:sat
+; CHECK: = vmpyeh({{.*}},{{.*}}):<<1:sat
 
 declare i64 @llvm.hexagon.M2.vmac2es(i64, i64, i64)
 define i64 @M2_vmac2es(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.M2.vmac2es(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: += vmpyeh({{.*}}, r5:4)
+; CHECK: += vmpyeh({{.*}},r5:4)
 
 declare i64 @llvm.hexagon.M2.vmac2es.s0(i64, i64, i64)
 define i64 @M2_vmac2es_s0(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.M2.vmac2es.s0(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: += vmpyeh({{.*}}, r5:4):sat
+; CHECK: += vmpyeh({{.*}},r5:4):sat
 
 declare i64 @llvm.hexagon.M2.vmac2es.s1(i64, i64, i64)
 define i64 @M2_vmac2es_s1(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.M2.vmac2es.s1(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: += vmpyeh({{.*}}, r5:4):<<1:sat
+; CHECK: += vmpyeh({{.*}},r5:4):<<1:sat
 
 ; Vector multiply halfwords
 declare i64 @llvm.hexagon.M2.vmpy2s.s0(i32, i32)
@@ -1410,35 +1410,35 @@ define i64 @M2_vmpy2s_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.vmpy2s.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = vmpyh({{.*}}, {{.*}}):sat
+; CHECK: = vmpyh({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.M2.vmpy2s.s1(i32, i32)
 define i64 @M2_vmpy2s_s1(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.vmpy2s.s1(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = vmpyh({{.*}}, {{.*}}):<<1:sat
+; CHECK: = vmpyh({{.*}},{{.*}}):<<1:sat
 
 declare i64 @llvm.hexagon.M2.vmac2(i64, i32, i32)
 define i64 @M2_vmac2(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.vmac2(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += vmpyh({{.*}}, {{.*}})
+; CHECK: += vmpyh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M2.vmac2s.s0(i64, i32, i32)
 define i64 @M2_vmac2s_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.vmac2s.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += vmpyh({{.*}}, {{.*}}):sat
+; CHECK: += vmpyh({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.M2.vmac2s.s1(i64, i32, i32)
 define i64 @M2_vmac2s_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.vmac2s.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += vmpyh({{.*}}, {{.*}}):<<1:sat
+; CHECK: += vmpyh({{.*}},{{.*}}):<<1:sat
 
 ; Vector multiply halfwords signed by unsigned
 declare i64 @llvm.hexagon.M2.vmpy2su.s0(i32, i32)
@@ -1446,28 +1446,28 @@ define i64 @M2_vmpy2su_s0(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.vmpy2su.s0(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = vmpyhsu({{.*}}, {{.*}}):sat
+; CHECK: = vmpyhsu({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.M2.vmpy2su.s1(i32, i32)
 define i64 @M2_vmpy2su_s1(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M2.vmpy2su.s1(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = vmpyhsu({{.*}}, {{.*}}):<<1:sat
+; CHECK: = vmpyhsu({{.*}},{{.*}}):<<1:sat
 
 declare i64 @llvm.hexagon.M2.vmac2su.s0(i64, i32, i32)
 define i64 @M2_vmac2su_s0(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.vmac2su.s0(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += vmpyhsu({{.*}}, {{.*}}):sat
+; CHECK: += vmpyhsu({{.*}},{{.*}}):sat
 
 declare i64 @llvm.hexagon.M2.vmac2su.s1(i64, i32, i32)
 define i64 @M2_vmac2su_s1(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M2.vmac2su.s1(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += vmpyhsu({{.*}}, {{.*}}):<<1:sat
+; CHECK: += vmpyhsu({{.*}},{{.*}}):<<1:sat
 
 ; Vector reduce multiply halfwords
 declare i64 @llvm.hexagon.M2.vrmpy.s0(i64, i64)
@@ -1475,14 +1475,14 @@ define i64 @M2_vrmpy_s0(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.M2.vrmpy.s0(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vrmpyh({{.*}}, {{.*}})
+; CHECK: = vrmpyh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M2.vrmac.s0(i64, i64, i64)
 define i64 @M2_vrmac_s0(i64 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.M2.vrmac.s0(i64 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: += vrmpyh({{.*}}, r5:4)
+; CHECK: += vrmpyh({{.*}},r5:4)
 
 ; Vector multiply bytes
 declare i64 @llvm.hexagon.M5.vmpybsu(i32, i32)
@@ -1490,28 +1490,28 @@ define i64 @M2_vmpybsu(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M5.vmpybsu(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = vmpybsu({{.*}}, {{.*}})
+; CHECK: = vmpybsu({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M5.vmpybuu(i32, i32)
 define i64 @M2_vmpybuu(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M5.vmpybuu(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = vmpybu({{.*}}, {{.*}})
+; CHECK: = vmpybu({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M5.vmacbuu(i64, i32, i32)
 define i64 @M2_vmacbuu(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M5.vmacbuu(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += vmpybu({{.*}}, {{.*}})
+; CHECK: += vmpybu({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M5.vmacbsu(i64, i32, i32)
 define i64 @M2_vmacbsu(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M5.vmacbsu(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += vmpybsu({{.*}}, {{.*}})
+; CHECK: += vmpybsu({{.*}},{{.*}})
 
 ; Vector polynomial multiply halfwords
 declare i64 @llvm.hexagon.M4.vpmpyh(i32, i32)
@@ -1519,11 +1519,11 @@ define i64 @M4_vpmpyh(i32 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.M4.vpmpyh(i32 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = vpmpyh({{.*}}, {{.*}})
+; CHECK: = vpmpyh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.M4.vpmpyh.acc(i64, i32, i32)
 define i64 @M4_vpmpyh_acc(i64 %a, i32 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.M4.vpmpyh.acc(i64 %a, i32 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: ^= vpmpyh({{.*}}, {{.*}})
+; CHECK: ^= vpmpyh({{.*}},{{.*}})
diff --git a/test/CodeGen/Hexagon/intrinsics/xtype_perm.ll b/test/CodeGen/Hexagon/intrinsics/xtype_perm.ll
index 3e044e3838decd102492e4f89ae8b04f3b36f59c..9260790e33a6316f715601cbcb4ecb285aca021b 100644
--- a/test/CodeGen/Hexagon/intrinsics/xtype_perm.ll
+++ b/test/CodeGen/Hexagon/intrinsics/xtype_perm.ll
@@ -141,28 +141,28 @@ define i64 @S2_shuffeb(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.shuffeb(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = shuffeb({{.*}}, {{.*}})
+; CHECK: = shuffeb({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.S2.shuffob(i64, i64)
 define i64 @S2_shuffob(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.shuffob(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = shuffob({{.*}}, {{.*}})
+; CHECK: = shuffob({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.S2.shuffeh(i64, i64)
 define i64 @S2_shuffeh(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.shuffeh(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = shuffeh({{.*}}, {{.*}})
+; CHECK: = shuffeh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.S2.shuffoh(i64, i64)
 define i64 @S2_shuffoh(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.shuffoh(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = shuffoh({{.*}}, {{.*}})
+; CHECK: = shuffoh({{.*}},{{.*}})
 
 ; Vector splat bytes
 declare i32 @llvm.hexagon.S2.vsplatrb(i32)
@@ -186,14 +186,14 @@ define i64 @S2_vspliceib(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.vspliceib(i64 %a, i64 %b, i32 0)
   ret i64 %z
 }
-; CHECK: = vspliceb({{.*}}, {{.*}}, #0)
+; CHECK: = vspliceb({{.*}},{{.*}},#0)
 
 declare i64 @llvm.hexagon.S2.vsplicerb(i64, i64, i32)
 define i64 @S2_vsplicerb(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.S2.vsplicerb(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: = vspliceb({{.*}}, {{.*}}, {{.*}})
+; CHECK: = vspliceb({{.*}},{{.*}},{{.*}})
 
 ; Vector sign extend
 declare i64 @llvm.hexagon.S2.vsxtbh(i32)
@@ -230,14 +230,14 @@ define i64 @S2_vtrunowh(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.vtrunowh(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vtrunowh({{.*}}, {{.*}})
+; CHECK: = vtrunowh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.S2.vtrunewh(i64, i64)
 define i64 @S2_vtrunewh(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.vtrunewh(i64 %a, i64 %b)
   ret i64 %z
 }
-; CHECK: = vtrunewh({{.*}}, {{.*}})
+; CHECK: = vtrunewh({{.*}},{{.*}})
 
 ; Vector zero extend
 declare i64 @llvm.hexagon.S2.vzxtbh(i32)
diff --git a/test/CodeGen/Hexagon/intrinsics/xtype_pred.ll b/test/CodeGen/Hexagon/intrinsics/xtype_pred.ll
index f06339b9a85a38c657a4da25477f7778e002abe8..506dc88d3c1ae0ab0903b07662c9e475546f2599 100644
--- a/test/CodeGen/Hexagon/intrinsics/xtype_pred.ll
+++ b/test/CodeGen/Hexagon/intrinsics/xtype_pred.ll
@@ -10,42 +10,42 @@ define i32 @A4_cmpbgt(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A4.cmpbgt(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = cmpb.gt({{.*}}, {{.*}})
+; CHECK: = cmpb.gt({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A4.cmpbeq(i32, i32)
 define i32 @A4_cmpbeq(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A4.cmpbeq(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = cmpb.eq({{.*}}, {{.*}})
+; CHECK: = cmpb.eq({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A4.cmpbgtu(i32, i32)
 define i32 @A4_cmpbgtu(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A4.cmpbgtu(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = cmpb.gtu({{.*}}, {{.*}})
+; CHECK: = cmpb.gtu({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A4.cmpbgti(i32, i32)
 define i32 @A4_cmpbgti(i32 %a) {
   %z = call i32 @llvm.hexagon.A4.cmpbgti(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = cmpb.gt({{.*}}, #0)
+; CHECK: = cmpb.gt({{.*}},#0)
 
 declare i32 @llvm.hexagon.A4.cmpbeqi(i32, i32)
 define i32 @A4_cmpbeqi(i32 %a) {
   %z = call i32 @llvm.hexagon.A4.cmpbeqi(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = cmpb.eq({{.*}}, #0)
+; CHECK: = cmpb.eq({{.*}},#0)
 
 declare i32 @llvm.hexagon.A4.cmpbgtui(i32, i32)
 define i32 @A4_cmpbgtui(i32 %a) {
   %z = call i32 @llvm.hexagon.A4.cmpbgtui(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = cmpb.gtu({{.*}}, #0)
+; CHECK: = cmpb.gtu({{.*}},#0)
 
 ; Compare half
 declare i32 @llvm.hexagon.A4.cmphgt(i32, i32)
@@ -53,42 +53,42 @@ define i32 @A4_cmphgt(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A4.cmphgt(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = cmph.gt({{.*}}, {{.*}})
+; CHECK: = cmph.gt({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A4.cmpheq(i32, i32)
 define i32 @A4_cmpheq(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A4.cmpheq(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = cmph.eq({{.*}}, {{.*}})
+; CHECK: = cmph.eq({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A4.cmphgtu(i32, i32)
 define i32 @A4_cmphgtu(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A4.cmphgtu(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = cmph.gtu({{.*}}, {{.*}})
+; CHECK: = cmph.gtu({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A4.cmphgti(i32, i32)
 define i32 @A4_cmphgti(i32 %a) {
   %z = call i32 @llvm.hexagon.A4.cmphgti(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = cmph.gt({{.*}}, #0)
+; CHECK: = cmph.gt({{.*}},#0)
 
 declare i32 @llvm.hexagon.A4.cmpheqi(i32, i32)
 define i32 @A4_cmpheqi(i32 %a) {
   %z = call i32 @llvm.hexagon.A4.cmpheqi(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = cmph.eq({{.*}}, #0)
+; CHECK: = cmph.eq({{.*}},#0)
 
 declare i32 @llvm.hexagon.A4.cmphgtui(i32, i32)
 define i32 @A4_cmphgtui(i32 %a) {
   %z = call i32 @llvm.hexagon.A4.cmphgtui(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = cmph.gtu({{.*}}, #0)
+; CHECK: = cmph.gtu({{.*}},#0)
 
 ; Compare doublewords
 declare i32 @llvm.hexagon.C2.cmpgtp(i64, i64)
@@ -96,21 +96,21 @@ define i32 @C2_cmpgtp(i64 %a, i64 %b) {
   %z = call i32 @llvm.hexagon.C2.cmpgtp(i64 %a, i64 %b)
   ret i32 %z
 }
-; CHECK: = cmp.gt({{.*}}, {{.*}})
+; CHECK: = cmp.gt({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.C2.cmpeqp(i64, i64)
 define i32 @C2_cmpeqp(i64 %a, i64 %b) {
   %z = call i32 @llvm.hexagon.C2.cmpeqp(i64 %a, i64 %b)
   ret i32 %z
 }
-; CHECK: = cmp.eq({{.*}}, {{.*}})
+; CHECK: = cmp.eq({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.C2.cmpgtup(i64, i64)
 define i32 @C2_cmpgtup(i64 %a, i64 %b) {
   %z = call i32 @llvm.hexagon.C2.cmpgtup(i64 %a, i64 %b)
   ret i32 %z
 }
-; CHECK: = cmp.gtu({{.*}}, {{.*}})
+; CHECK: = cmp.gtu({{.*}},{{.*}})
 
 ; Compare bitmask
 declare i32 @llvm.hexagon.C2.bitsclri(i32, i32)
@@ -118,42 +118,42 @@ define i32 @C2_bitsclri(i32 %a) {
   %z = call i32 @llvm.hexagon.C2.bitsclri(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = bitsclr({{.*}}, #0)
+; CHECK: = bitsclr({{.*}},#0)
 
 declare i32 @llvm.hexagon.C4.nbitsclri(i32, i32)
 define i32 @C4_nbitsclri(i32 %a) {
   %z = call i32 @llvm.hexagon.C4.nbitsclri(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = !bitsclr({{.*}}, #0)
+; CHECK: = !bitsclr({{.*}},#0)
 
 declare i32 @llvm.hexagon.C2.bitsset(i32, i32)
 define i32 @C2_bitsset(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.C2.bitsset(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = bitsset({{.*}}, {{.*}})
+; CHECK: = bitsset({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.C4.nbitsset(i32, i32)
 define i32 @C4_nbitsset(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.C4.nbitsset(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = !bitsset({{.*}}, {{.*}})
+; CHECK: = !bitsset({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.C2.bitsclr(i32, i32)
 define i32 @C2_bitsclr(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.C2.bitsclr(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = bitsclr({{.*}}, {{.*}})
+; CHECK: = bitsclr({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.C4.nbitsclr(i32, i32)
 define i32 @C4_nbitsclr(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.C4.nbitsclr(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = !bitsclr({{.*}}, {{.*}})
+; CHECK: = !bitsclr({{.*}},{{.*}})
 
 ; Mask generate from predicate
 declare i64 @llvm.hexagon.C2.mask(i32)
@@ -169,7 +169,7 @@ define i32 @A4_tlbmatch(i64 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.A4.tlbmatch(i64 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = tlbmatch({{.*}}, {{.*}})
+; CHECK: = tlbmatch({{.*}},{{.*}})
 
 ; Test bit
 declare i32 @llvm.hexagon.S2.tstbit.i(i32, i32)
@@ -177,28 +177,28 @@ define i32 @S2_tstbit_i(i32 %a) {
   %z = call i32 @llvm.hexagon.S2.tstbit.i(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = tstbit({{.*}}, #0)
+; CHECK: = tstbit({{.*}},#0)
 
 declare i32 @llvm.hexagon.S4.ntstbit.i(i32, i32)
 define i32 @S4_ntstbit_i(i32 %a) {
   %z = call i32 @llvm.hexagon.S4.ntstbit.i(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = !tstbit({{.*}}, #0)
+; CHECK: = !tstbit({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.tstbit.r(i32, i32)
 define i32 @S2_tstbit_r(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.tstbit.r(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = tstbit({{.*}}, {{.*}})
+; CHECK: = tstbit({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S4.ntstbit.r(i32, i32)
 define i32 @S4_ntstbit_r(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S4.ntstbit.r(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = !tstbit({{.*}}, {{.*}})
+; CHECK: = !tstbit({{.*}},{{.*}})
 
 ; Vector compare halfwords
 declare i32 @llvm.hexagon.A2.vcmpheq(i64, i64)
@@ -206,42 +206,42 @@ define i32 @A2_vcmpheq(i64 %a, i64 %b) {
   %z = call i32 @llvm.hexagon.A2.vcmpheq(i64 %a, i64 %b)
   ret i32 %z
 }
-; CHECK: = vcmph.eq({{.*}}, {{.*}})
+; CHECK: = vcmph.eq({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A2.vcmphgt(i64, i64)
 define i32 @A2_vcmphgt(i64 %a, i64 %b) {
   %z = call i32 @llvm.hexagon.A2.vcmphgt(i64 %a, i64 %b)
   ret i32 %z
 }
-; CHECK: = vcmph.gt({{.*}}, {{.*}})
+; CHECK: = vcmph.gt({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A2.vcmphgtu(i64, i64)
 define i32 @A2_vcmphgtu(i64 %a, i64 %b) {
   %z = call i32 @llvm.hexagon.A2.vcmphgtu(i64 %a, i64 %b)
   ret i32 %z
 }
-; CHECK: = vcmph.gtu({{.*}}, {{.*}})
+; CHECK: = vcmph.gtu({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A4.vcmpheqi(i64, i32)
 define i32 @A4_vcmpheqi(i64 %a) {
   %z = call i32 @llvm.hexagon.A4.vcmpheqi(i64 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = vcmph.eq({{.*}}, #0)
+; CHECK: = vcmph.eq({{.*}},#0)
 
 declare i32 @llvm.hexagon.A4.vcmphgti(i64, i32)
 define i32 @A4_vcmphgti(i64 %a) {
   %z = call i32 @llvm.hexagon.A4.vcmphgti(i64 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = vcmph.gt({{.*}}, #0)
+; CHECK: = vcmph.gt({{.*}},#0)
 
 declare i32 @llvm.hexagon.A4.vcmphgtui(i64, i32)
 define i32 @A4_vcmphgtui(i64 %a) {
   %z = call i32 @llvm.hexagon.A4.vcmphgtui(i64 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = vcmph.gtu({{.*}}, #0)
+; CHECK: = vcmph.gtu({{.*}},#0)
 
 ; Vector compare bytes for any match
 declare i32 @llvm.hexagon.A4.vcmpbeq.any(i64, i64)
@@ -249,7 +249,7 @@ define i32 @A4_vcmpbeq_any(i64 %a, i64 %b) {
   %z = call i32 @llvm.hexagon.A4.vcmpbeq.any(i64 %a, i64 %b)
   ret i32 %z
 }
-; CHECK: = any8(vcmpb.eq({{.*}}, {{.*}}))
+; CHECK: = any8(vcmpb.eq({{.*}},{{.*}}))
 
 ; Vector compare bytes
 declare i32 @llvm.hexagon.A2.vcmpbeq(i64, i64)
@@ -257,42 +257,42 @@ define i32 @A2_vcmpbeq(i64 %a, i64 %b) {
   %z = call i32 @llvm.hexagon.A2.vcmpbeq(i64 %a, i64 %b)
   ret i32 %z
 }
-; CHECK: = vcmpb.eq({{.*}}, {{.*}})
+; CHECK: = vcmpb.eq({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A2.vcmpbgtu(i64, i64)
 define i32 @A2_vcmpbgtu(i64 %a, i64 %b) {
   %z = call i32 @llvm.hexagon.A2.vcmpbgtu(i64 %a, i64 %b)
   ret i32 %z
 }
-; CHECK: = vcmpb.gtu({{.*}}, {{.*}})
+; CHECK: = vcmpb.gtu({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A4.vcmpbgt(i64, i64)
 define i32 @A4_vcmpbgt(i64 %a, i64 %b) {
   %z = call i32 @llvm.hexagon.A4.vcmpbgt(i64 %a, i64 %b)
   ret i32 %z
 }
-; CHECK: = vcmpb.gt({{.*}}, {{.*}})
+; CHECK: = vcmpb.gt({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A4.vcmpbeqi(i64, i32)
 define i32 @A4_vcmpbeqi(i64 %a) {
   %z = call i32 @llvm.hexagon.A4.vcmpbeqi(i64 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = vcmpb.eq({{.*}}, #0)
+; CHECK: = vcmpb.eq({{.*}},#0)
 
 declare i32 @llvm.hexagon.A4.vcmpbgti(i64, i32)
 define i32 @A4_vcmpbgti(i64 %a) {
   %z = call i32 @llvm.hexagon.A4.vcmpbgti(i64 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = vcmpb.gt({{.*}}, #0)
+; CHECK: = vcmpb.gt({{.*}},#0)
 
 declare i32 @llvm.hexagon.A4.vcmpbgtui(i64, i32)
 define i32 @A4_vcmpbgtui(i64 %a) {
   %z = call i32 @llvm.hexagon.A4.vcmpbgtui(i64 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = vcmpb.gtu({{.*}}, #0)
+; CHECK: = vcmpb.gtu({{.*}},#0)
 
 ; Vector compare words
 declare i32 @llvm.hexagon.A2.vcmpweq(i64, i64)
@@ -300,42 +300,42 @@ define i32 @A2_vcmpweq(i64 %a, i64 %b) {
   %z = call i32 @llvm.hexagon.A2.vcmpweq(i64 %a, i64 %b)
   ret i32 %z
 }
-; CHECK: = vcmpw.eq({{.*}}, {{.*}})
+; CHECK: = vcmpw.eq({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A2.vcmpwgt(i64, i64)
 define i32 @A2_vcmpwgt(i64 %a, i64 %b) {
   %z = call i32 @llvm.hexagon.A2.vcmpwgt(i64 %a, i64 %b)
   ret i32 %z
 }
-; CHECK: = vcmpw.gt({{.*}}, {{.*}})
+; CHECK: = vcmpw.gt({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A2.vcmpwgtu(i64, i64)
 define i32 @A2_vcmpwgtu(i64 %a, i64 %b) {
   %z = call i32 @llvm.hexagon.A2.vcmpwgtu(i64 %a, i64 %b)
   ret i32 %z
 }
-; CHECK: = vcmpw.gtu({{.*}}, {{.*}})
+; CHECK: = vcmpw.gtu({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.A4.vcmpweqi(i64, i32)
 define i32 @A4_vcmpweqi(i64 %a) {
   %z = call i32 @llvm.hexagon.A4.vcmpweqi(i64 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = vcmpw.eq({{.*}}, #0)
+; CHECK: = vcmpw.eq({{.*}},#0)
 
 declare i32 @llvm.hexagon.A4.vcmpwgti(i64, i32)
 define i32 @A4_vcmpwgti(i64 %a) {
   %z = call i32 @llvm.hexagon.A4.vcmpwgti(i64 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = vcmpw.gt({{.*}}, #0)
+; CHECK: = vcmpw.gt({{.*}},#0)
 
 declare i32 @llvm.hexagon.A4.vcmpwgtui(i64, i32)
 define i32 @A4_vcmpwgtui(i64 %a) {
   %z = call i32 @llvm.hexagon.A4.vcmpwgtui(i64 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = vcmpw.gtu({{.*}}, #0)
+; CHECK: = vcmpw.gtu({{.*}},#0)
 
 ; Viterbi pack even and odd predicate bitsclr
 declare i32 @llvm.hexagon.C2.vitpack(i32, i32)
@@ -343,7 +343,7 @@ define i32 @C2_vitpack(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.C2.vitpack(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = vitpack({{.*}}, {{.*}})
+; CHECK: = vitpack({{.*}},{{.*}})
 
 ; Vector mux
 declare i64 @llvm.hexagon.C2.vmux(i32, i64, i64)
@@ -351,4 +351,4 @@ define i64 @C2_vmux(i32 %a, i64 %b, i64 %c) {
   %z = call i64 @llvm.hexagon.C2.vmux(i32 %a, i64 %b, i64 %c)
   ret i64 %z
 }
-; CHECK: = vmux({{.*}}, {{.*}}, {{.*}})
+; CHECK: = vmux({{.*}},{{.*}},{{.*}})
diff --git a/test/CodeGen/Hexagon/intrinsics/xtype_shift.ll b/test/CodeGen/Hexagon/intrinsics/xtype_shift.ll
index 1a65f44c195469cb44a368443f70126915b69580..8809baf3551ba3eb8453a5d85e426519c47cbd7f 100644
--- a/test/CodeGen/Hexagon/intrinsics/xtype_shift.ll
+++ b/test/CodeGen/Hexagon/intrinsics/xtype_shift.ll
@@ -10,42 +10,42 @@ define i64 @S2_asr_i_p(i64 %a) {
   %z = call i64 @llvm.hexagon.S2.asr.i.p(i64 %a, i32 0)
   ret i64 %z
 }
-; CHECK: = asr({{.*}}, #0)
+; CHECK: = asr({{.*}},#0)
 
 declare i64 @llvm.hexagon.S2.lsr.i.p(i64, i32)
 define i64 @S2_lsr_i_p(i64 %a) {
   %z = call i64 @llvm.hexagon.S2.lsr.i.p(i64 %a, i32 0)
   ret i64 %z
 }
-; CHECK: = lsr({{.*}}, #0)
+; CHECK: = lsr({{.*}},#0)
 
 declare i64 @llvm.hexagon.S2.asl.i.p(i64, i32)
 define i64 @S2_asl_i_p(i64 %a) {
   %z = call i64 @llvm.hexagon.S2.asl.i.p(i64 %a, i32 0)
   ret i64 %z
 }
-; CHECK: = asl({{.*}}, #0)
+; CHECK: = asl({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.asr.i.r(i32, i32)
 define i32 @S2_asr_i_r(i32 %a) {
   %z = call i32 @llvm.hexagon.S2.asr.i.r(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = asr({{.*}}, #0)
+; CHECK: = asr({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.lsr.i.r(i32, i32)
 define i32 @S2_lsr_i_r(i32 %a) {
   %z = call i32 @llvm.hexagon.S2.lsr.i.r(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = lsr({{.*}}, #0)
+; CHECK: = lsr({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.asl.i.r(i32, i32)
 define i32 @S2_asl_i_r(i32 %a) {
   %z = call i32 @llvm.hexagon.S2.asl.i.r(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = asl({{.*}}, #0)
+; CHECK: = asl({{.*}},#0)
 
 ; Shift by immediate and accumulate
 declare i64 @llvm.hexagon.S2.asr.i.p.nac(i64, i64, i32)
@@ -53,84 +53,84 @@ define i64 @S2_asr_i_p_nac(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.asr.i.p.nac(i64 %a, i64 %b, i32 0)
   ret i64 %z
 }
-; CHECK: -= asr({{.*}}, #0)
+; CHECK: -= asr({{.*}},#0)
 
 declare i64 @llvm.hexagon.S2.lsr.i.p.nac(i64, i64, i32)
 define i64 @S2_lsr_i_p_nac(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.lsr.i.p.nac(i64 %a, i64 %b, i32 0)
   ret i64 %z
 }
-; CHECK: -= lsr({{.*}}, #0)
+; CHECK: -= lsr({{.*}},#0)
 
 declare i64 @llvm.hexagon.S2.asl.i.p.nac(i64, i64, i32)
 define i64 @S2_asl_i_p_nac(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.asl.i.p.nac(i64 %a, i64 %b, i32 0)
   ret i64 %z
 }
-; CHECK: -= asl({{.*}}, #0)
+; CHECK: -= asl({{.*}},#0)
 
 declare i64 @llvm.hexagon.S2.asr.i.p.acc(i64, i64, i32)
 define i64 @S2_asr_i_p_acc(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.asr.i.p.acc(i64 %a, i64 %b, i32 0)
   ret i64 %z
 }
-; CHECK: += asr({{.*}}, #0)
+; CHECK: += asr({{.*}},#0)
 
 declare i64 @llvm.hexagon.S2.lsr.i.p.acc(i64, i64, i32)
 define i64 @S2_lsr_i_p_acc(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.lsr.i.p.acc(i64 %a, i64 %b, i32 0)
   ret i64 %z
 }
-; CHECK: += lsr({{.*}}, #0)
+; CHECK: += lsr({{.*}},#0)
 
 declare i64 @llvm.hexagon.S2.asl.i.p.acc(i64, i64, i32)
 define i64 @S2_asl_i_p_acc(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.asl.i.p.acc(i64 %a, i64 %b, i32 0)
   ret i64 %z
 }
-; CHECK: += asl({{.*}}, #0)
+; CHECK: += asl({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.asr.i.r.nac(i32, i32, i32)
 define i32 @S2_asr_i_r_nac(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.asr.i.r.nac(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: -= asr({{.*}}, #0)
+; CHECK: -= asr({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.lsr.i.r.nac(i32, i32, i32)
 define i32 @S2_lsr_i_r_nac(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.lsr.i.r.nac(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: -= lsr({{.*}}, #0)
+; CHECK: -= lsr({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.asl.i.r.nac(i32, i32, i32)
 define i32 @S2_asl_i_r_nac(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.asl.i.r.nac(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: -= asl({{.*}}, #0)
+; CHECK: -= asl({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.asr.i.r.acc(i32, i32, i32)
 define i32 @S2_asr_i_r_acc(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.asr.i.r.acc(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: += asr({{.*}}, #0)
+; CHECK: += asr({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.lsr.i.r.acc(i32, i32, i32)
 define i32 @S2_lsr_i_r_acc(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.lsr.i.r.acc(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: += lsr({{.*}}, #0)
+; CHECK: += lsr({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.asl.i.r.acc(i32, i32, i32)
 define i32 @S2_asl_i_r_acc(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.asl.i.r.acc(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: += asl({{.*}}, #0)
+; CHECK: += asl({{.*}},#0)
 
 ; Shift by immediate and add
 declare i32 @llvm.hexagon.S4.addi.asl.ri(i32, i32, i32)
@@ -138,35 +138,35 @@ define i32 @S4_addi_asl_ri(i32 %a) {
   %z = call i32 @llvm.hexagon.S4.addi.asl.ri(i32 0, i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = add(#0, asl({{.*}}, #0))
+; CHECK: = add(#0,asl({{.*}},#0))
 
 declare i32 @llvm.hexagon.S4.subi.asl.ri(i32, i32, i32)
 define i32 @S4_subi_asl_ri(i32 %a) {
   %z = call i32 @llvm.hexagon.S4.subi.asl.ri(i32 0, i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = sub(#0, asl({{.*}}, #0))
+; CHECK: = sub(#0,asl({{.*}},#0))
 
 declare i32 @llvm.hexagon.S4.addi.lsr.ri(i32, i32, i32)
 define i32 @S4_addi_lsr_ri(i32 %a) {
   %z = call i32 @llvm.hexagon.S4.addi.lsr.ri(i32 0, i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = add(#0, lsr({{.*}}, #0))
+; CHECK: = add(#0,lsr({{.*}},#0))
 
 declare i32 @llvm.hexagon.S4.subi.lsr.ri(i32, i32, i32)
 define i32 @S4_subi_lsr_ri(i32 %a) {
   %z = call i32 @llvm.hexagon.S4.subi.lsr.ri(i32 0, i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = sub(#0, lsr({{.*}}, #0))
+; CHECK: = sub(#0,lsr({{.*}},#0))
 
 declare i32 @llvm.hexagon.S2.addasl.rrri(i32, i32, i32)
 define i32 @S2_addasl_rrri(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.addasl.rrri(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: = addasl({{.*}}, {{.*}}, #0)
+; CHECK: = addasl({{.*}},{{.*}},#0)
 
 ; Shift by immediate and logical
 declare i64 @llvm.hexagon.S2.asr.i.p.and(i64, i64, i32)
@@ -174,140 +174,140 @@ define i64 @S2_asr_i_p_and(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.asr.i.p.and(i64 %a, i64 %b, i32 0)
   ret i64 %z
 }
-; CHECK: &= asr({{.*}}, #0)
+; CHECK: &= asr({{.*}},#0)
 
 declare i64 @llvm.hexagon.S2.lsr.i.p.and(i64, i64, i32)
 define i64 @S2_lsr_i_p_and(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.lsr.i.p.and(i64 %a, i64 %b, i32 0)
   ret i64 %z
 }
-; CHECK: {{.*}} &= lsr({{.*}}, #0)
+; CHECK: {{.*}} &= lsr({{.*}},#0)
 
 declare i64 @llvm.hexagon.S2.asl.i.p.and(i64, i64, i32)
 define i64 @S2_asl_i_p_and(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.asl.i.p.and(i64 %a, i64 %b, i32 0)
   ret i64 %z
 }
-; CHECK: &= asl({{.*}}, #0)
+; CHECK: &= asl({{.*}},#0)
 
 declare i64 @llvm.hexagon.S2.asr.i.p.or(i64, i64, i32)
 define i64 @S2_asr_i_p_or(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.asr.i.p.or(i64 %a, i64 %b, i32 0)
   ret i64 %z
 }
-; CHECK: |= asr({{.*}}, #0)
+; CHECK: |= asr({{.*}},#0)
 
 declare i64 @llvm.hexagon.S2.lsr.i.p.or(i64, i64, i32)
 define i64 @S2_lsr_i_p_or(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.lsr.i.p.or(i64 %a, i64 %b, i32 0)
   ret i64 %z
 }
-; CHECK: |= lsr({{.*}}, #0)
+; CHECK: |= lsr({{.*}},#0)
 
 declare i64 @llvm.hexagon.S2.asl.i.p.or(i64, i64, i32)
 define i64 @S2_asl_i_p_or(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.asl.i.p.or(i64 %a, i64 %b, i32 0)
   ret i64 %z
 }
-; CHECK: |= asl({{.*}}, #0)
+; CHECK: |= asl({{.*}},#0)
 
 declare i64 @llvm.hexagon.S2.lsr.i.p.xacc(i64, i64, i32)
 define i64 @S2_lsr_i_p_xacc(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.lsr.i.p.xacc(i64 %a, i64 %b, i32 0)
   ret i64 %z
 }
-; CHECK: ^= lsr({{.*}}, #0)
+; CHECK: ^= lsr({{.*}},#0)
 
 declare i64 @llvm.hexagon.S2.asl.i.p.xacc(i64, i64, i32)
 define i64 @S2_asl_i_p_xacc(i64 %a, i64 %b) {
   %z = call i64 @llvm.hexagon.S2.asl.i.p.xacc(i64 %a, i64 %b, i32 0)
   ret i64 %z
 }
-; CHECK: ^= asl({{.*}}, #0)
+; CHECK: ^= asl({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.asr.i.r.and(i32, i32, i32)
 define i32 @S2_asr_i_r_and(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.asr.i.r.and(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: &= asr({{.*}}, #0)
+; CHECK: &= asr({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.lsr.i.r.and(i32, i32, i32)
 define i32 @S2_lsr_i_r_and(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.lsr.i.r.and(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: &= lsr({{.*}}, #0)
+; CHECK: &= lsr({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.asl.i.r.and(i32, i32, i32)
 define i32 @S2_asl_i_r_and(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.asl.i.r.and(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: &= asl({{.*}}, #0)
+; CHECK: &= asl({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.asr.i.r.or(i32, i32, i32)
 define i32 @S2_asr_i_r_or(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.asr.i.r.or(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: |= asr({{.*}}, #0)
+; CHECK: |= asr({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.lsr.i.r.or(i32, i32, i32)
 define i32 @S2_lsr_i_r_or(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.lsr.i.r.or(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: |= lsr({{.*}}, #0)
+; CHECK: |= lsr({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.asl.i.r.or(i32, i32, i32)
 define i32 @S2_asl_i_r_or(i32%a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.asl.i.r.or(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: |= asl({{.*}}, #0)
+; CHECK: |= asl({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.lsr.i.r.xacc(i32, i32, i32)
 define i32 @S2_lsr_i_r_xacc(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.lsr.i.r.xacc(i32%a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: ^= lsr({{.*}}, #0)
+; CHECK: ^= lsr({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.asl.i.r.xacc(i32, i32, i32)
 define i32 @S2_asl_i_r_xacc(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.asl.i.r.xacc(i32 %a, i32 %b, i32 0)
   ret i32 %z
 }
-; CHECK: ^= asl({{.*}}, #0)
+; CHECK: ^= asl({{.*}},#0)
 
 declare i32 @llvm.hexagon.S4.andi.asl.ri(i32, i32, i32)
 define i32 @S4_andi_asl_ri(i32 %a) {
   %z = call i32 @llvm.hexagon.S4.andi.asl.ri(i32 0, i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = and(#0, asl({{.*}}, #0))
+; CHECK: = and(#0,asl({{.*}},#0))
 
 declare i32 @llvm.hexagon.S4.ori.asl.ri(i32, i32, i32)
 define i32 @S4_ori_asl_ri(i32 %a) {
   %z = call i32 @llvm.hexagon.S4.ori.asl.ri(i32 0, i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = or(#0, asl({{.*}}, #0))
+; CHECK: = or(#0,asl({{.*}},#0))
 
 declare i32 @llvm.hexagon.S4.andi.lsr.ri(i32, i32, i32)
 define i32 @S4_andi_lsr_ri(i32 %a) {
   %z = call i32 @llvm.hexagon.S4.andi.lsr.ri(i32 0, i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = and(#0, lsr({{.*}}, #0))
+; CHECK: = and(#0,lsr({{.*}},#0))
 
 declare i32 @llvm.hexagon.S4.ori.lsr.ri(i32, i32, i32)
 define i32 @S4_ori_lsr_ri(i32 %a) {
   %z = call i32 @llvm.hexagon.S4.ori.lsr.ri(i32 0, i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = or(#0, lsr({{.*}}, #0))
+; CHECK: = or(#0,lsr({{.*}},#0))
 
 ; Shift right by immediate with rounding
 declare i64 @llvm.hexagon.S2.asr.i.p.rnd(i64, i32)
@@ -315,14 +315,14 @@ define i64 @S2_asr_i_p_rnd(i64 %a) {
   %z = call i64 @llvm.hexagon.S2.asr.i.p.rnd(i64 %a, i32 0)
   ret i64 %z
 }
-; CHECK: = asr({{.*}}, #0):rnd
+; CHECK: = asr({{.*}},#0):rnd
 
 declare i32 @llvm.hexagon.S2.asr.i.r.rnd(i32, i32)
 define i32 @S2_asr_i_r_rnd(i32 %a) {
   %z = call i32 @llvm.hexagon.S2.asr.i.r.rnd(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = asr({{.*}}, #0):rnd
+; CHECK: = asr({{.*}},#0):rnd
 
 ; Shift left by immediate with saturation
 declare i32 @llvm.hexagon.S2.asl.i.r.sat(i32, i32)
@@ -330,7 +330,7 @@ define i32 @S2_asl_i_r_sat(i32 %a) {
   %z = call i32 @llvm.hexagon.S2.asl.i.r.sat(i32 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = asl({{.*}}, #0):sat
+; CHECK: = asl({{.*}},#0):sat
 
 ; Shift by register
 declare i64 @llvm.hexagon.S2.asr.r.p(i64, i32)
@@ -338,63 +338,63 @@ define i64 @S2_asr_r_p(i64 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.S2.asr.r.p(i64 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = asr({{.*}}, {{.*}})
+; CHECK: = asr({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.S2.lsr.r.p(i64, i32)
 define i64 @S2_lsr_r_p(i64 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.S2.lsr.r.p(i64 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = lsr({{.*}}, {{.*}})
+; CHECK: = lsr({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.S2.asl.r.p(i64, i32)
 define i64 @S2_asl_r_p(i64 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.S2.asl.r.p(i64 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = asl({{.*}}, {{.*}})
+; CHECK: = asl({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.S2.lsl.r.p(i64, i32)
 define i64 @S2_lsl_r_p(i64 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.S2.lsl.r.p(i64 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = lsl({{.*}}, {{.*}})
+; CHECK: = lsl({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.asr.r.r(i32, i32)
 define i32 @S2_asr_r_r(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.asr.r.r(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = asr({{.*}}, {{.*}})
+; CHECK: = asr({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.lsr.r.r(i32, i32)
 define i32 @S2_lsr_r_r(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.lsr.r.r(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = lsr({{.*}}, {{.*}})
+; CHECK: = lsr({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.asl.r.r(i32, i32)
 define i32 @S2_asl_r_r(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.asl.r.r(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = asl({{.*}}, {{.*}})
+; CHECK: = asl({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.lsl.r.r(i32, i32)
 define i32 @S2_lsl_r_r(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.lsl.r.r(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = lsl({{.*}}, {{.*}})
+; CHECK: = lsl({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S4.lsli(i32, i32)
 define i32 @S4_lsli(i32 %a) {
   %z = call i32 @llvm.hexagon.S4.lsli(i32 0, i32 %a)
   ret i32 %z
 }
-; CHECK: = lsl(#0, {{.*}})
+; CHECK: = lsl(#0,{{.*}})
 
 ; Shift by register and accumulate
 declare i64 @llvm.hexagon.S2.asr.r.p.nac(i64, i64, i32)
@@ -402,112 +402,112 @@ define i64 @S2_asr_r_p_nac(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.S2.asr.r.p.nac(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= asr({{.*}}, r4)
+; CHECK: -= asr({{.*}},r4)
 
 declare i64 @llvm.hexagon.S2.lsr.r.p.nac(i64, i64, i32)
 define i64 @S2_lsr_r_p_nac(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.S2.lsr.r.p.nac(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= lsr({{.*}}, r4)
+; CHECK: -= lsr({{.*}},r4)
 
 declare i64 @llvm.hexagon.S2.asl.r.p.nac(i64, i64, i32)
 define i64 @S2_asl_r_p_nac(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.S2.asl.r.p.nac(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= asl({{.*}}, r4)
+; CHECK: -= asl({{.*}},r4)
 
 declare i64 @llvm.hexagon.S2.lsl.r.p.nac(i64, i64, i32)
 define i64 @S2_lsl_r_p_nac(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.S2.lsl.r.p.nac(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: -= lsl({{.*}}, r4)
+; CHECK: -= lsl({{.*}},r4)
 
 declare i64 @llvm.hexagon.S2.asr.r.p.acc(i64, i64, i32)
 define i64 @S2_asr_r_p_acc(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.S2.asr.r.p.acc(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += asr({{.*}}, r4)
+; CHECK: += asr({{.*}},r4)
 
 declare i64 @llvm.hexagon.S2.lsr.r.p.acc(i64, i64, i32)
 define i64 @S2_lsr_r_p_acc(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.S2.lsr.r.p.acc(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += lsr({{.*}}, r4)
+; CHECK: += lsr({{.*}},r4)
 
 declare i64 @llvm.hexagon.S2.asl.r.p.acc(i64, i64, i32)
 define i64 @S2_asl_r_p_acc(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.S2.asl.r.p.acc(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += asl({{.*}}, r4)
+; CHECK: += asl({{.*}},r4)
 
 declare i64 @llvm.hexagon.S2.lsl.r.p.acc(i64, i64, i32)
 define i64 @S2_lsl_r_p_acc(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.S2.lsl.r.p.acc(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: += lsl({{.*}}, r4)
+; CHECK: += lsl({{.*}},r4)
 
 declare i32 @llvm.hexagon.S2.asr.r.r.nac(i32, i32, i32)
 define i32 @S2_asr_r_r_nac(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.S2.asr.r.r.nac(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= asr({{.*}}, {{.*}})
+; CHECK: -= asr({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.lsr.r.r.nac(i32, i32, i32)
 define i32 @S2_lsr_r_r_nac(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.S2.lsr.r.r.nac(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= lsr({{.*}}, {{.*}})
+; CHECK: -= lsr({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.asl.r.r.nac(i32, i32, i32)
 define i32 @S2_asl_r_r_nac(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.S2.asl.r.r.nac(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= asl({{.*}}, {{.*}})
+; CHECK: -= asl({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.lsl.r.r.nac(i32, i32, i32)
 define i32 @S2_lsl_r_r_nac(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.S2.lsl.r.r.nac(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: -= lsl({{.*}}, {{.*}})
+; CHECK: -= lsl({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.asr.r.r.acc(i32, i32, i32)
 define i32 @S2_asr_r_r_acc(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.S2.asr.r.r.acc(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += asr({{.*}}, {{.*}})
+; CHECK: += asr({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.lsr.r.r.acc(i32, i32, i32)
 define i32 @S2_lsr_r_r_acc(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.S2.lsr.r.r.acc(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += lsr({{.*}}, {{.*}})
+; CHECK: += lsr({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.asl.r.r.acc(i32, i32, i32)
 define i32 @S2_asl_r_r_acc(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.S2.asl.r.r.acc(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += asl({{.*}}, {{.*}})
+; CHECK: += asl({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.lsl.r.r.acc(i32, i32, i32)
 define i32 @S2_lsl_r_r_acc(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.S2.lsl.r.r.acc(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: += lsl({{.*}}, {{.*}})
+; CHECK: += lsl({{.*}},{{.*}})
 
 ; Shift by register and logical
 declare i64 @llvm.hexagon.S2.asr.r.p.or(i64, i64, i32)
@@ -515,112 +515,112 @@ define i64 @S2_asr_r_p_or(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.S2.asr.r.p.or(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: |= asr({{.*}}, r4)
+; CHECK: |= asr({{.*}},r4)
 
 declare i64 @llvm.hexagon.S2.lsr.r.p.or(i64, i64, i32)
 define i64 @S2_lsr_r_p_or(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.S2.lsr.r.p.or(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: |= lsr({{.*}}, r4)
+; CHECK: |= lsr({{.*}},r4)
 
 declare i64 @llvm.hexagon.S2.asl.r.p.or(i64, i64, i32)
 define i64 @S2_asl_r_p_or(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.S2.asl.r.p.or(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: |= asl({{.*}}, r4)
+; CHECK: |= asl({{.*}},r4)
 
 declare i64 @llvm.hexagon.S2.lsl.r.p.or(i64, i64, i32)
 define i64 @S2_lsl_r_p_or(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.S2.lsl.r.p.or(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: |= lsl({{.*}}, r4)
+; CHECK: |= lsl({{.*}},r4)
 
 declare i64 @llvm.hexagon.S2.asr.r.p.and(i64, i64, i32)
 define i64 @S2_asr_r_p_and(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.S2.asr.r.p.and(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: &= asr({{.*}}, r4)
+; CHECK: &= asr({{.*}},r4)
 
 declare i64 @llvm.hexagon.S2.lsr.r.p.and(i64, i64, i32)
 define i64 @S2_lsr_r_p_and(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.S2.lsr.r.p.and(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: &= lsr({{.*}}, r4)
+; CHECK: &= lsr({{.*}},r4)
 
 declare i64 @llvm.hexagon.S2.asl.r.p.and(i64, i64, i32)
 define i64 @S2_asl_r_p_and(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.S2.asl.r.p.and(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: &= asl({{.*}}, r4)
+; CHECK: &= asl({{.*}},r4)
 
 declare i64 @llvm.hexagon.S2.lsl.r.p.and(i64, i64, i32)
 define i64 @S2_lsl_r_p_and(i64 %a, i64 %b, i32 %c) {
   %z = call i64 @llvm.hexagon.S2.lsl.r.p.and(i64 %a, i64 %b, i32 %c)
   ret i64 %z
 }
-; CHECK: &= lsl({{.*}}, r4)
+; CHECK: &= lsl({{.*}},r4)
 
 declare i32 @llvm.hexagon.S2.asr.r.r.or(i32, i32, i32)
 define i32 @S2_asr_r_r_or(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.S2.asr.r.r.or(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: |= asr({{.*}}, {{.*}})
+; CHECK: |= asr({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.lsr.r.r.or(i32, i32, i32)
 define i32 @S2_lsr_r_r_or(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.S2.lsr.r.r.or(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: |= lsr({{.*}}, {{.*}})
+; CHECK: |= lsr({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.asl.r.r.or(i32, i32, i32)
 define i32 @S2_asl_r_r_or(i32%a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.S2.asl.r.r.or(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: |= asl({{.*}}, {{.*}})
+; CHECK: |= asl({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.lsl.r.r.or(i32, i32, i32)
 define i32 @S2_lsl_r_r_or(i32%a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.S2.lsl.r.r.or(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: |= lsl({{.*}}, {{.*}})
+; CHECK: |= lsl({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.asr.r.r.and(i32, i32, i32)
 define i32 @S2_asr_r_r_and(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.S2.asr.r.r.and(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: &= asr({{.*}}, {{.*}})
+; CHECK: &= asr({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.lsr.r.r.and(i32, i32, i32)
 define i32 @S2_lsr_r_r_and(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.S2.lsr.r.r.and(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: &= lsr({{.*}}, {{.*}})
+; CHECK: &= lsr({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.asl.r.r.and(i32, i32, i32)
 define i32 @S2_asl_r_r_and(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.S2.asl.r.r.and(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: &= asl({{.*}}, {{.*}})
+; CHECK: &= asl({{.*}},{{.*}})
 
 declare i32 @llvm.hexagon.S2.lsl.r.r.and(i32, i32, i32)
 define i32 @S2_lsl_r_r_and(i32 %a, i32 %b, i32 %c) {
   %z = call i32 @llvm.hexagon.S2.lsl.r.r.and(i32 %a, i32 %b, i32 %c)
   ret i32 %z
 }
-; CHECK: &= lsl({{.*}}, {{.*}})
+; CHECK: &= lsl({{.*}},{{.*}})
 
 ; Shift by register with saturation
 declare i32 @llvm.hexagon.S2.asr.r.r.sat(i32, i32)
@@ -628,14 +628,14 @@ define i32 @S2_asr_r_r_sat(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.asr.r.r.sat(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = asr({{.*}}, {{.*}}):sat
+; CHECK: = asr({{.*}},{{.*}}):sat
 
 declare i32 @llvm.hexagon.S2.asl.r.r.sat(i32, i32)
 define i32 @S2_asl_r_r_sat(i32 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.asl.r.r.sat(i32 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = asl({{.*}}, {{.*}}):sat
+; CHECK: = asl({{.*}},{{.*}}):sat
 
 ; Vector shift halfwords by immediate
 declare i64 @llvm.hexagon.S2.asr.i.vh(i64, i32)
@@ -643,21 +643,21 @@ define i64 @S2_asr_i_vh(i64 %a) {
   %z = call i64 @llvm.hexagon.S2.asr.i.vh(i64 %a, i32 0)
   ret i64 %z
 }
-; CHECK: = vasrh({{.*}}, #0)
+; CHECK: = vasrh({{.*}},#0)
 
 declare i64 @llvm.hexagon.S2.lsr.i.vh(i64, i32)
 define i64 @S2_lsr_i_vh(i64 %a) {
   %z = call i64 @llvm.hexagon.S2.lsr.i.vh(i64 %a, i32 0)
   ret i64 %z
 }
-; CHECK: = vlsrh({{.*}}, #0)
+; CHECK: = vlsrh({{.*}},#0)
 
 declare i64 @llvm.hexagon.S2.asl.i.vh(i64, i32)
 define i64 @S2_asl_i_vh(i64 %a) {
   %z = call i64 @llvm.hexagon.S2.asl.i.vh(i64 %a, i32 0)
   ret i64 %z
 }
-; CHECK: = vaslh({{.*}}, #0)
+; CHECK: = vaslh({{.*}},#0)
 
 ; Vector shift halfwords by register
 declare i64 @llvm.hexagon.S2.asr.r.vh(i64, i32)
@@ -665,28 +665,28 @@ define i64 @S2_asr_r_vh(i64 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.S2.asr.r.vh(i64 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = vasrh({{.*}}, {{.*}})
+; CHECK: = vasrh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.S2.lsr.r.vh(i64, i32)
 define i64 @S2_lsr_r_vh(i64 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.S2.lsr.r.vh(i64 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = vlsrh({{.*}}, {{.*}})
+; CHECK: = vlsrh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.S2.asl.r.vh(i64, i32)
 define i64 @S2_asl_r_vh(i64 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.S2.asl.r.vh(i64 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = vaslh({{.*}}, {{.*}})
+; CHECK: = vaslh({{.*}},{{.*}})
 
 declare i64 @llvm.hexagon.S2.lsl.r.vh(i64, i32)
 define i64 @S2_lsl_r_vh(i64 %a, i32 %b) {
   %z = call i64 @llvm.hexagon.S2.lsl.r.vh(i64 %a, i32 %b)
   ret i64 %z
 }
-; CHECK: = vlslh({{.*}}, {{.*}})
+; CHECK: = vlslh({{.*}},{{.*}})
 
 ; Vector shift words by immediate
 declare i64 @llvm.hexagon.S2.asr.i.vw(i64, i32)
@@ -694,21 +694,21 @@ define i64 @S2_asr_i_vw(i64 %a) {
   %z = call i64 @llvm.hexagon.S2.asr.i.vw(i64 %a, i32 0)
   ret i64 %z
 }
-; CHECK: = vasrw({{.*}}, #0)
+; CHECK: = vasrw({{.*}},#0)
 
 declare i64 @llvm.hexagon.S2.lsr.i.vw(i64, i32)
 define i64 @S2_lsr_i_vw(i64 %a) {
   %z = call i64 @llvm.hexagon.S2.lsr.i.vw(i64 %a, i32 0)
   ret i64 %z
 }
-; CHECK: = vlsrw({{.*}}, #0)
+; CHECK: = vlsrw({{.*}},#0)
 
 declare i64 @llvm.hexagon.S2.asl.i.vw(i64, i32)
 define i64 @S2_asl_i_vw(i64 %a) {
   %z = call i64 @llvm.hexagon.S2.asl.i.vw(i64 %a, i32 0)
   ret i64 %z
 }
-; CHECK: = vaslw({{.*}}, #0)
+; CHECK: = vaslw({{.*}},#0)
 
 ; Vector shift words by with truncate and pack
 declare i32 @llvm.hexagon.S2.asr.i.svw.trun(i64, i32)
@@ -716,11 +716,11 @@ define i32 @S2_asr_i_svw_trun(i64 %a) {
   %z = call i32 @llvm.hexagon.S2.asr.i.svw.trun(i64 %a, i32 0)
   ret i32 %z
 }
-; CHECK: = vasrw({{.*}}, #0)
+; CHECK: = vasrw({{.*}},#0)
 
 declare i32 @llvm.hexagon.S2.asr.r.svw.trun(i64, i32)
 define i32 @S2_asr_r_svw_trun(i64 %a, i32 %b) {
   %z = call i32 @llvm.hexagon.S2.asr.r.svw.trun(i64 %a, i32 %b)
   ret i32 %z
 }
-; CHECK: = vasrw({{.*}}, {{.*}})
+; CHECK: = vasrw({{.*}},{{.*}})
diff --git a/test/CodeGen/Hexagon/isel-exti1.ll b/test/CodeGen/Hexagon/isel-exti1.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b49986628e4e4aa356d865919c36f25986c3abd0
--- /dev/null
+++ b/test/CodeGen/Hexagon/isel-exti1.ll
@@ -0,0 +1,22 @@
+; RUN: llc -O0 -march=hexagon < %s | FileCheck %s
+
+; CHECK-LABEL: sexti1
+; CHECK: r[[REG:[0-9]+]] = mux(p{{[0-3]}},#-1,#0)
+; CHECK: combine(r[[REG]],r[[REG]])
+define i64 @sexti1(i64 %a0, i64 %a1) {    ; sign-extends the i1 result of an unsigned compare to i64
+entry:
+  %t0 = icmp ult i64 %a0, %a1             ; i1 predicate: %a0 <u %a1
+  %t1 = sext i1 %t0 to i64                ; true -> -1 (all ones), false -> 0
+  ret i64 %t1
+}
+
+; CHECK-LABEL: zexti1
+; CHECK: r[[REG:[0-9]+]] = mux(p{{[0-3]}},#1,#0)
+; CHECK: combine(#0,r[[REG]])
+define i64 @zexti1(i64 %a0, i64 %a1) {    ; zero-extends the i1 result of an unsigned compare to i64
+entry:
+  %t0 = icmp ult i64 %a0, %a1             ; i1 predicate: %a0 <u %a1
+  %t1 = zext i1 %t0 to i64                ; true -> 1, false -> 0
+  ret i64 %t1
+}
+
diff --git a/test/CodeGen/Hexagon/isel-i1arg-crash.ll b/test/CodeGen/Hexagon/isel-i1arg-crash.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7e8bd9e93b276dc8f40b0aee4e7bbdaee9309f90
--- /dev/null
+++ b/test/CodeGen/Hexagon/isel-i1arg-crash.ll
@@ -0,0 +1,6 @@
+; RUN: llc -march=hexagon -debug-only=isel < %s
+; REQUIRES: asserts
+
+define void @g(i1 %cond) {    ; minimal function with an i1 formal argument; %cond is never used
+  ret void
+}
diff --git a/test/CodeGen/Hexagon/isel-op-zext-i1.ll b/test/CodeGen/Hexagon/isel-op-zext-i1.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d77d0929e21f455aed8b5713c6b68b8b57fe682f
--- /dev/null
+++ b/test/CodeGen/Hexagon/isel-op-zext-i1.ll
@@ -0,0 +1,13 @@
+; RUN: llc -march=hexagon -hexagon-expand-condsets=0 < %s | FileCheck %s
+
+; In the IR, the i1 value is zero-extended first, then passed to add.
+; Check that in the final code, the mux happens after the add.
+; CHECK: [[REG1:r[0-9]+]] = add([[REG0:r[0-9]+]],#1)
+; CHECK: r{{[0-9]+}} = mux(p{{[0-3]}},[[REG1]],[[REG0]])
+
+define i32 @foo(i32 %a, i32 %b) {    ; returns %a + 1 when %a == %b, otherwise %a
+  %v0 = icmp eq i32 %a, %b
+  %v1 = zext i1 %v0 to i32            ; 1 if equal, 0 otherwise
+  %v2 = add i32 %v1, %a               ; the zero-extended predicate is an operand of the add
+  ret i32 %v2
+}
diff --git a/test/CodeGen/Hexagon/loop-idiom/pmpy-infinite-loop.ll b/test/CodeGen/Hexagon/loop-idiom/pmpy-infinite-loop.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f738282c0f1bcbd9b06794e142c92b7c5d6a9b40
--- /dev/null
+++ b/test/CodeGen/Hexagon/loop-idiom/pmpy-infinite-loop.ll
@@ -0,0 +1,83 @@
+; RUN: opt -march=hexagon -hexagon-loop-idiom -S < %s | FileCheck %s
+; CHECK-LABEL: define void @fred
+
+; Check that this test does not crash.
+
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+target triple = "hexagon"
+
+%struct.0 = type { [120 x i16], [80 x i16], [80 x i16], [80 x i16], [80 x i16], [80 x i16], [40 x i16], [40 x i16], [40 x i16], [40 x i16], [40 x i16], [40 x i16] }
+
+define void @fred(%struct.0* %demod_state) local_unnamed_addr #0 {    ; reduced input for -hexagon-loop-idiom; several operands are undef
+entry:
+  br label %for.body309
+
+for.body309:                                      ; preds = %for.body309, %entry
+  %max_diff.0300 = phi i16 [ %max_diff.1, %for.body309 ], [ 0, %entry ]
+  %arrayidx322.phi = phi i16* [ undef, %entry ], [ %arrayidx322.inc, %for.body309 ]
+  %arrayidx331.phi = phi i16* [ undef, %entry ], [ %arrayidx331.inc, %for.body309 ]
+  %lag.4299.apmt = phi i32 [ %inc376.apmt, %for.body309 ], [ 0, %entry ]
+  %0 = load i16, i16* %arrayidx322.phi, align 2
+  %conv323 = sext i16 %0 to i32
+  %sub324 = sub nsw i32 0, %conv323
+  %ispos258 = icmp sgt i32 %sub324, -1
+  %1 = select i1 %ispos258, i32 %sub324, i32 0
+  %add326 = add nsw i32 %1, 0
+  %2 = load i16, i16* %arrayidx331.phi, align 2
+  %conv332 = sext i16 %2 to i32
+  %sub333 = sub nsw i32 0, %conv332
+  %ispos260 = icmp sgt i32 %sub333, -1
+  %3 = select i1 %ispos260, i32 %sub333, i32 undef
+  %sub342 = sub nsw i32 0, %conv323
+  %ispos262 = icmp sgt i32 %sub342, -1
+  %4 = select i1 %ispos262, i32 %sub342, i32 undef
+  %sub351 = sub nsw i32 0, %conv332
+  %ispos264 = icmp sgt i32 %sub351, -1
+  %5 = select i1 %ispos264, i32 %sub351, i32 0
+  %sub360 = sub nsw i32 %conv323, %conv332
+  %ispos266 = icmp sgt i32 %sub360, -1
+  %6 = select i1 %ispos266, i32 %sub360, i32 0
+  %add335 = add nsw i32 %add326, %4
+  %add344 = add nsw i32 %add335, %3
+  %add353 = add i32 %add344, %5
+  %add362 = add i32 %add353, %6
+  %div363 = sdiv i32 %add362, 6
+  %conv364 = trunc i32 %div363 to i16
+  %sext268 = shl i32 %div363, 16
+  %conv369 = ashr exact i32 %sext268, 16    ; shl/ashr by 16 sign-extends the low 16 bits of %div363
+  %conv370 = sext i16 %max_diff.0300 to i32
+  %cmp371 = icmp sgt i32 %conv369, %conv370
+  %max_diff.1 = select i1 %cmp371, i16 %conv364, i16 %max_diff.0300    ; keeps the larger of the new value and the previous maximum
+  %inc376.apmt = add nuw nsw i32 %lag.4299.apmt, 1
+  %exitcond331 = icmp ne i32 %inc376.apmt, 40    ; loop continues while the counter != 40
+  %arrayidx322.inc = getelementptr i16, i16* %arrayidx322.phi, i32 1
+  %arrayidx331.inc = getelementptr i16, i16* %arrayidx331.phi, i32 1
+  br i1 %exitcond331, label %for.body309, label %for.end377
+
+for.end377:                                       ; preds = %for.body309
+  %max_diff.1.lcssa = phi i16 [ %max_diff.1, %for.body309 ]
+  %cmp407 = icmp sgt i16 %max_diff.1.lcssa, 4
+  br label %for.body405
+
+for.body405:                                      ; preds = %if.end437, %for.end377
+  %arrayidx412 = getelementptr inbounds %struct.0, %struct.0* %demod_state, i32 0, i32 11, i32 undef
+  br i1 %cmp407, label %if.then409, label %if.end437
+
+if.then409:                                       ; preds = %for.body405
+  %arrayidx416 = getelementptr inbounds [40 x i16], [40 x i16]* null, i32 0, i32 undef
+  %7 = load i16, i16* %arrayidx416, align 2
+  %conv417 = sext i16 %7 to i32
+  %shl = shl i32 %conv417, 4
+  %mul419 = mul nsw i32 %shl, 655
+  %add420 = add nsw i32 %mul419, 0
+  br label %if.end437
+
+if.end437:                                        ; preds = %if.then409, %for.body405
+  %mul431.sink = phi i32 [ %add420, %if.then409 ], [ undef, %for.body405 ]
+  %shr432257 = lshr i32 %mul431.sink, 15
+  %conv433 = trunc i32 %shr432257 to i16
+  store i16 %conv433, i16* %arrayidx412, align 2
+  br label %for.body405    ; unconditional back edge: the for.body405/if.end437 cycle has no exit
+}
+
+attributes #0 = { noinline nounwind "target-cpu"="hexagonv60" "target-features"="-hvx-double,-long-calls" }
diff --git a/test/CodeGen/Hexagon/loop-idiom/pmpy-mod.ll b/test/CodeGen/Hexagon/loop-idiom/pmpy-mod.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9907ae71c99216fb5f11265300be7c1d6dd99a31
--- /dev/null
+++ b/test/CodeGen/Hexagon/loop-idiom/pmpy-mod.ll
@@ -0,0 +1,84 @@
+; Run -O2 to make sure that all the usual optimizations do happen before
+; the Hexagon loop idiom recognition runs. This is to check that we still
+; get this opportunity regardless of what happens before.
+
+; RUN: opt -O2 -march=hexagon -S < %s | FileCheck %s
+
+target triple = "hexagon"
+target datalayout = "e-m:e-p:32:32:32-a:0-n16:32-i64:64:64-i32:32:32-i16:16:16-i1:8:8-f32:32:32-f64:64:64-v32:32:32-v64:64:64-v512:512:512-v1024:1024:1024-v2048:2048:2048"
+
+; CHECK-LABEL: define zeroext i16 @pmpy_mod_lsr
+; There need to be two pmpy instructions.
+; CHECK: call i64 @llvm.hexagon.M4.pmpyw
+; CHECK: call i64 @llvm.hexagon.M4.pmpyw
+
+define zeroext i16 @pmpy_mod_lsr(i8 zeroext %a0, i16 zeroext %a1) #0 {    ; bit-serial loop: consumes one bit of %a0 per iteration while updating an i16 state seeded with %a1
+b2:
+  br label %b3
+
+b3:                                               ; preds = %b44, %b2
+  %v4 = phi i8 [ %a0, %b2 ], [ %v19, %b44 ]
+  %v5 = phi i16 [ %a1, %b2 ], [ %v43, %b44 ]
+  %v6 = phi i8 [ 0, %b2 ], [ %v45, %b44 ]
+  %v7 = zext i8 %v6 to i32
+  %v8 = icmp slt i32 %v7, 8               ; loop body runs for counter values 0..7
+  br i1 %v8, label %b9, label %b46
+
+b9:                                               ; preds = %b3
+  %v10 = zext i8 %v4 to i32
+  %v11 = and i32 %v10, 1
+  %v12 = trunc i16 %v5 to i8
+  %v13 = zext i8 %v12 to i32
+  %v14 = and i32 %v13, 1
+  %v15 = xor i32 %v11, %v14               ; low bit of the input byte xor low bit of the state
+  %v16 = trunc i32 %v15 to i8
+  %v17 = zext i8 %v4 to i32
+  %v18 = ashr i32 %v17, 1                 ; operand is zero-extended, so this shifts in a zero bit
+  %v19 = trunc i32 %v18 to i8
+  %v20 = zext i8 %v16 to i32
+  %v21 = icmp eq i32 %v20, 1
+  br i1 %v21, label %b22, label %b26
+
+b22:                                              ; preds = %b9
+  %v23 = zext i16 %v5 to i32
+  %v24 = xor i32 %v23, 16386              ; 16386 = 0x4002
+  %v25 = trunc i32 %v24 to i16
+  br label %b27
+
+b26:                                              ; preds = %b9
+  br label %b27
+
+b27:                                              ; preds = %b26, %b22
+  %v28 = phi i16 [ %v25, %b22 ], [ %v5, %b26 ]
+  %v29 = phi i8 [ 1, %b22 ], [ 0, %b26 ]
+  %v30 = zext i16 %v28 to i32
+  %v31 = ashr i32 %v30, 1
+  %v32 = trunc i32 %v31 to i16
+  %v33 = icmp ne i8 %v29, 0               ; true only when control came through %b22
+  br i1 %v33, label %b34, label %b38
+
+b34:                                              ; preds = %b27
+  %v35 = zext i16 %v32 to i32
+  %v36 = or i32 %v35, 32768               ; set bit 15
+  %v37 = trunc i32 %v36 to i16
+  br label %b42
+
+b38:                                              ; preds = %b27
+  %v39 = zext i16 %v32 to i32
+  %v40 = and i32 %v39, 32767              ; clear bit 15
+  %v41 = trunc i32 %v40 to i16
+  br label %b42
+
+b42:                                              ; preds = %b38, %b34
+  %v43 = phi i16 [ %v37, %b34 ], [ %v41, %b38 ]
+  br label %b44
+
+b44:                                              ; preds = %b42
+  %v45 = add i8 %v6, 1
+  br label %b3
+
+b46:                                              ; preds = %b3
+  ret i16 %v5                             ; state value after the loop exits
+}
+
+attributes #0 = { noinline nounwind "target-cpu"="hexagonv5" "target-features"="-hvx,-hvx-double,-long-calls" }
diff --git a/test/CodeGen/Hexagon/memops-stack.ll b/test/CodeGen/Hexagon/memops-stack.ll
index a8dc664591e904b164c2aef4d327cb6a8729998f..1aa2e30ea25b63f731437e236e1d8f44b2ef9b33 100644
--- a/test/CodeGen/Hexagon/memops-stack.ll
+++ b/test/CodeGen/Hexagon/memops-stack.ll
@@ -9,13 +9,13 @@ define void @test0() #0 {
 entry:
   %x = alloca i32, align 4
   %0 = bitcast i32* %x to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %0) #3
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #3
   call void @foo(i32* nonnull %x) #3
   %1 = load i32, i32* %x, align 4, !tbaa !1
   %inc = add nsw i32 %1, 1
   store i32 %inc, i32* %x, align 4, !tbaa !1
   call void @foo(i32* nonnull %x) #3
-  call void @llvm.lifetime.end(i64 4, i8* %0) #3
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %0) #3
   ret void
 }
 
@@ -25,13 +25,13 @@ define void @test1() #0 {
 entry:
   %x = alloca i32, align 4
   %0 = bitcast i32* %x to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %0) #3
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #3
   call void @foo(i32* nonnull %x) #3
   %1 = load i32, i32* %x, align 4, !tbaa !1
   %inc = sub nsw i32 %1, 1
   store i32 %inc, i32* %x, align 4, !tbaa !1
   call void @foo(i32* nonnull %x) #3
-  call void @llvm.lifetime.end(i64 4, i8* %0) #3
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %0) #3
   ret void
 }
 
@@ -41,13 +41,13 @@ define void @test2() #0 {
 entry:
   %x = alloca i32, align 4
   %0 = bitcast i32* %x to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %0) #3
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #3
   call void @foo(i32* nonnull %x) #3
   %1 = load i32, i32* %x, align 4, !tbaa !1
   %inc = or i32 %1, 1
   store i32 %inc, i32* %x, align 4, !tbaa !1
   call void @foo(i32* nonnull %x) #3
-  call void @llvm.lifetime.end(i64 4, i8* %0) #3
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %0) #3
   ret void
 }
 
@@ -57,13 +57,13 @@ define void @test3() #0 {
 entry:
   %x = alloca i32, align 4
   %0 = bitcast i32* %x to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %0) #3
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #3
   call void @foo(i32* nonnull %x) #3
   %1 = load i32, i32* %x, align 4, !tbaa !1
   %inc = and i32 %1, -2
   store i32 %inc, i32* %x, align 4, !tbaa !1
   call void @foo(i32* nonnull %x) #3
-  call void @llvm.lifetime.end(i64 4, i8* %0) #3
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %0) #3
   ret void
 }
 
@@ -73,13 +73,13 @@ define void @test4(i32 %a) #0 {
 entry:
   %x = alloca i32, align 4
   %0 = bitcast i32* %x to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %0) #3
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #3
   call void @foo(i32* nonnull %x) #3
   %1 = load i32, i32* %x, align 4, !tbaa !1
   %inc = add nsw i32 %1, %a
   store i32 %inc, i32* %x, align 4, !tbaa !1
   call void @foo(i32* nonnull %x) #3
-  call void @llvm.lifetime.end(i64 4, i8* %0) #3
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %0) #3
   ret void
 }
 
@@ -89,13 +89,13 @@ define void @test5(i32 %a) #0 {
 entry:
   %x = alloca i32, align 4
   %0 = bitcast i32* %x to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %0) #3
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #3
   call void @foo(i32* nonnull %x) #3
   %1 = load i32, i32* %x, align 4, !tbaa !1
   %inc = sub nsw i32 %1, %a
   store i32 %inc, i32* %x, align 4, !tbaa !1
   call void @foo(i32* nonnull %x) #3
-  call void @llvm.lifetime.end(i64 4, i8* %0) #3
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %0) #3
   ret void
 }
 
@@ -105,13 +105,13 @@ define void @test6(i32 %a) #0 {
 entry:
   %x = alloca i32, align 4
   %0 = bitcast i32* %x to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %0) #3
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #3
   call void @foo(i32* nonnull %x) #3
   %1 = load i32, i32* %x, align 4, !tbaa !1
   %inc = or i32 %1, %a
   store i32 %inc, i32* %x, align 4, !tbaa !1
   call void @foo(i32* nonnull %x) #3
-  call void @llvm.lifetime.end(i64 4, i8* %0) #3
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %0) #3
   ret void
 }
 
@@ -121,20 +121,20 @@ define void @test7(i32 %a) #0 {
 entry:
   %x = alloca i32, align 4
   %0 = bitcast i32* %x to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %0) #3
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #3
   call void @foo(i32* nonnull %x) #3
   %1 = load i32, i32* %x, align 4, !tbaa !1
   %inc = and i32 %1, %a
   store i32 %inc, i32* %x, align 4, !tbaa !1
   call void @foo(i32* nonnull %x) #3
-  call void @llvm.lifetime.end(i64 4, i8* %0) #3
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %0) #3
   ret void
 }
 
 
 declare void @foo(i32*) #2
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv60" "target-features"="+hvx,-hvx-double" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { argmemonly nounwind }
diff --git a/test/CodeGen/Hexagon/newvalueSameReg.ll b/test/CodeGen/Hexagon/newvalueSameReg.ll
index 0fc4df22eb32eda1dfa6a274fa5760add845464b..39f32fb2f9d5827f646fb886329d4a3e2a0c598e 100644
--- a/test/CodeGen/Hexagon/newvalueSameReg.ll
+++ b/test/CodeGen/Hexagon/newvalueSameReg.ll
@@ -12,8 +12,8 @@
 ; Test that we don't generate a new value compare if the operands are
 ; the same register.
 
-; CHECK-NOT: cmp.eq([[REG0:(r[0-9]+)]].new, [[REG0]])
-; CHECK: cmp.eq([[REG1:(r[0-9]+)]], [[REG1]])
+; CHECK-NOT: cmp.eq([[REG0:(r[0-9]+)]].new,[[REG0]])
+; CHECK: cmp.eq([[REG1:(r[0-9]+)]],[[REG1]])
 
 ; Function Attrs: nounwind
 declare void @fprintf(%struct._Dnk_filet.1* nocapture, i8* nocapture readonly, ...) #1
diff --git a/test/CodeGen/Hexagon/newvaluejump.ll b/test/CodeGen/Hexagon/newvaluejump.ll
index 3e1ee179573a7d38912eae65cc940bfe74896655..e1437f369c88e1394d7beae9f576d933ac4daeb7 100644
--- a/test/CodeGen/Hexagon/newvaluejump.ll
+++ b/test/CodeGen/Hexagon/newvaluejump.ll
@@ -6,7 +6,7 @@
 
 define i32 @foo(i32 %a) nounwind {
 entry:
-; CHECK: if (cmp.eq(r{{[0-9]+}}.new, #0)) jump{{.}}
+; CHECK: if (cmp.eq(r{{[0-9]+}}.new,#0)) jump{{.}}
   %addr1 = alloca i32, align 4
   %addr2 = alloca i32, align 4
   %0 = load i32, i32* @i, align 4
diff --git a/test/CodeGen/Hexagon/newvaluejump2.ll b/test/CodeGen/Hexagon/newvaluejump2.ll
index a812a7d966598f2af2b138774fbf4aece48f2e9a..4c897f0830f37a66e54d8f455d27e177d21b4264 100644
--- a/test/CodeGen/Hexagon/newvaluejump2.ll
+++ b/test/CodeGen/Hexagon/newvaluejump2.ll
@@ -6,7 +6,7 @@
 @Reg = common global i32 0, align 4
 define i32 @main() nounwind {
 entry:
-; CHECK: if (cmp.gt(r{{[0-9]+}}, r{{[0-9]+}}.new)) jump:{{[t|nt]}} .LBB{{[0-9]+}}_{{[0-9]+}}
+; CHECK: if (cmp.gt(r{{[0-9]+}},r{{[0-9]+}}.new)) jump:{{t|nt}} .LBB{{[0-9]+}}_{{[0-9]+}}
   %Reg2 = alloca i32, align 4
   %0 = load i32, i32* %Reg2, align 4
   %1 = load i32, i32* @Reg, align 4
diff --git a/test/CodeGen/Hexagon/newvaluejump3.ll b/test/CodeGen/Hexagon/newvaluejump3.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1e2e6c28c849fc79f49400f07d2127d3b3e452e9
--- /dev/null
+++ b/test/CodeGen/Hexagon/newvaluejump3.ll
@@ -0,0 +1,79 @@
+; RUN: llc -march=hexagon -filetype=obj -o /dev/null < %s
+; REQUIRES: asserts
+
+; This crashed in the MC code emitter, because a new-value branch was created
+; with IMPLICIT_DEF as the producer.
+
+target triple = "hexagon"
+
+%type.0 = type { %type.1, [64 x i8] }
+%type.1 = type { [12 x i8], %type.2*, double }
+%type.2 = type { i16, i16, [1 x %type.3] }
+%type.3 = type { i32 }
+%type.4 = type { %type.2*, i32 }
+
+define hidden fastcc i8* @fred(%type.0* nocapture readonly %a0, i8* readonly %a1) unnamed_addr #2 {    ; reduced reproducer; calls itself in %b9 and uses undef in several conditions
+b2:
+  %v3 = load i8, i8* %a1, align 1
+  br i1 undef, label %b4, label %b24
+
+b4:                                               ; preds = %b2
+  switch i8 %v3, label %b13 [
+    i8 25, label %b5
+    i8 26, label %b6
+    i8 28, label %b8
+    i8 27, label %b9
+    i8 43, label %b11
+    i8 110, label %b12
+  ]
+
+b5:                                               ; preds = %b4
+  unreachable
+
+b6:                                               ; preds = %b4
+  %v7 = getelementptr inbounds i8, i8* %a1, i32 2
+  br label %b16
+
+b8:                                               ; preds = %b4
+  br label %b16
+
+b9:                                               ; preds = %b4
+  %v10 = tail call fastcc i8* @fred(%type.0* undef, i8* undef)    ; result %v10 is not used
+  br label %b24
+
+b11:                                              ; preds = %b4
+  unreachable
+
+b12:                                              ; preds = %b4
+  unreachable
+
+b13:                                              ; preds = %b4
+  br label %b14
+
+b14:                                              ; preds = %b13
+  br i1 undef, label %b15, label %b16
+
+b15:                                              ; preds = %b14
+  unreachable
+
+b16:                                              ; preds = %b20, %b14, %b8, %b6
+  %v17 = phi i8* [ %v21, %b20 ], [ undef, %b14 ], [ undef, %b8 ], [ %v7, %b6 ]
+  %v18 = phi i32 [ 0, %b20 ], [ undef, %b14 ], [ 0, %b8 ], [ 8, %b6 ]    ; NOTE(review): presumably an undef input here is what becomes the IMPLICIT_DEF producer - confirm
+  %v19 = icmp sgt i32 %v18, 0
+  br i1 %v19, label %b20, label %b24
+
+b20:                                              ; preds = %b16
+  %v21 = getelementptr inbounds i8, i8* %v17, i32 1
+  %v22 = load i8, i8* %v17, align 1
+  %v23 = icmp eq i8 %v22, undef           ; compares the loaded byte against undef
+  br i1 %v23, label %b16, label %b24
+
+b24:                                              ; preds = %b20, %b16, %b9, %b2
+  %v25 = phi i8* [ null, %b2 ], [ null, %b9 ], [ %v17, %b16 ], [ null, %b20 ]
+  ret i8* %v25
+}
+
+attributes #0 = { argmemonly nounwind }
+attributes #1 = { nounwind readonly "target-cpu"="hexagonv60" "target-features"="+hvx,-hvx-double,-long-calls" }
+attributes #2 = { nounwind "target-cpu"="hexagonv60" "target-features"="+hvx,-hvx-double,-long-calls" }
+
diff --git a/test/CodeGen/Hexagon/opt-addr-mode.ll b/test/CodeGen/Hexagon/opt-addr-mode.ll
index 7cb437c327cfcb9ad3c15013b0faf698f254d226..705cd045ea3070868b4d252ce7f896fa22841826 100644
--- a/test/CodeGen/Hexagon/opt-addr-mode.ll
+++ b/test/CodeGen/Hexagon/opt-addr-mode.ll
@@ -2,10 +2,10 @@
 ; RUN: llc -march=hexagon -hexagon-small-data-threshold=0 -disable-hexagon-amodeopt=0 -hexagon-amode-growth-limit=4 < %s | FileCheck %s --check-prefix=CHECK-AMODE
 
 ; CHECK-NO-AMODE: [[REG0:(r[0-9]+)]] = ##global_2
-; CHECK-NO-AMODE: memw([[REG0]] + {{.*}}<<#2) =
+; CHECK-NO-AMODE: memw([[REG0]]+{{.*}}<<#2) =
 
 ; CHECK-AMODE: [[REG1:(r[0-9]+)]] = memw(##global_1)
-; CHECK-AMODE: memw([[REG1]]<<#2 + ##global_2) =
+; CHECK-AMODE: memw([[REG1]]<<#2+##global_2) =
 
 @global_1 = external global i32, align 4
 @global_2 = external global [128 x i32], align 8
diff --git a/test/CodeGen/Hexagon/opt-fabs.ll b/test/CodeGen/Hexagon/opt-fabs.ll
index 2ecbce310adeeb5a7677d029e0333191d1f451eb..9c94f853ba50f65afa657d403d44194fbc49707e 100644
--- a/test/CodeGen/Hexagon/opt-fabs.ll
+++ b/test/CodeGen/Hexagon/opt-fabs.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple=hexagon-unknown-elf -mcpu=hexagonv5 -hexagon-bit=0 < %s | FileCheck %s
 ; Optimize fabsf to clrbit in V5.
 
-; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}}, #31)
+; CHECK: r{{[0-9]+}} = clrbit(r{{[0-9]+}},#31)
 
 define float @my_fabsf(float %x) nounwind {
 entry:
diff --git a/test/CodeGen/Hexagon/opt-fneg.ll b/test/CodeGen/Hexagon/opt-fneg.ll
index 97895786586352378303f3cc79f2835ac8318f51..da496c588019f8aa78b101db7625e96b4323ec29 100644
--- a/test/CodeGen/Hexagon/opt-fneg.ll
+++ b/test/CodeGen/Hexagon/opt-fneg.ll
@@ -3,7 +3,7 @@
 
 define float @foo(float %x) nounwind {
 entry:
-; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}}, #31)
+; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}},#31)
   %x.addr = alloca float, align 4
   store float %x, float* %x.addr, align 4
   %0 = load float, float* %x.addr, align 4
@@ -13,14 +13,14 @@ entry:
 
 define float @bar(float %x) nounwind {
 entry:
-; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}}, #31)
+; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}},#31)
   %sub = fsub float -0.000000e+00, %x
   ret float %sub
 }
 
 define float @baz(float %x) nounwind {
 entry:
-; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}}, #31)
+; CHECK: r{{[0-9]+}} = togglebit(r{{[0-9]+}},#31)
   %conv1 = fmul float %x, -1.000000e+00
   ret float %conv1
 }
diff --git a/test/CodeGen/Hexagon/opt-spill-volatile.ll b/test/CodeGen/Hexagon/opt-spill-volatile.ll
index 99dd4646d743a243421de001752a0bd1747fd966..1c86716132fde836c150f4dd73f2996d785e402a 100644
--- a/test/CodeGen/Hexagon/opt-spill-volatile.ll
+++ b/test/CodeGen/Hexagon/opt-spill-volatile.ll
@@ -6,22 +6,22 @@ target triple = "hexagon"
 
 ; CHECK-LABEL: foo
 ; CHECK: memw(r29+#4) =
-; CHECK: = memw(r29 + #4)
+; CHECK: = memw(r29+#4)
 define i32 @foo(i32 %a) #0 {
 entry:
   %x = alloca i32, align 4
   %x.0.x.0..sroa_cast = bitcast i32* %x to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %x.0.x.0..sroa_cast)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %x.0.x.0..sroa_cast)
   store volatile i32 0, i32* %x, align 4
   %call = tail call i32 bitcast (i32 (...)* @bar to i32 ()*)() #0
   %x.0.x.0. = load volatile i32, i32* %x, align 4
   %add = add nsw i32 %x.0.x.0., %a
-  call void @llvm.lifetime.end(i64 4, i8* %x.0.x.0..sroa_cast)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %x.0.x.0..sroa_cast)
   ret i32 %add
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 declare i32 @bar(...) #0
 
diff --git a/test/CodeGen/Hexagon/pic-local.ll b/test/CodeGen/Hexagon/pic-local.ll
index 48b0096aa6521a3dbbaa0725eb4625a7047d5325..6544b3d32165a05b2f8f8872d97277b7eda8ce8b 100644
--- a/test/CodeGen/Hexagon/pic-local.ll
+++ b/test/CodeGen/Hexagon/pic-local.ll
@@ -9,11 +9,11 @@ define internal void @f2() {
 }
 
 define void()* @get_f1() {
-  ; CHECK:  r0 = add(pc, ##.Lf1@PCREL)
+  ; CHECK:  r0 = add(pc,##.Lf1@PCREL)
   ret void()* @f1
 }
 
 define void()* @get_f2() {
-  ; CHECK: r0 = add(pc, ##f2@PCREL)
+  ; CHECK: r0 = add(pc,##f2@PCREL)
   ret void()* @f2
 }
diff --git a/test/CodeGen/Hexagon/pic-simple.ll b/test/CodeGen/Hexagon/pic-simple.ll
index 46d95204f2e7e5f0524aaad4a6144796eeeb1afa..aeb21ef7de1cf79441713ae16766662226792cff 100644
--- a/test/CodeGen/Hexagon/pic-simple.ll
+++ b/test/CodeGen/Hexagon/pic-simple.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -mtriple=hexagon-- -mcpu=hexagonv5 -relocation-model=pic < %s | FileCheck %s
 
-; CHECK: r{{[0-9]+}} = add({{pc|PC}}, ##_GLOBAL_OFFSET_TABLE_@PCREL)
-; CHECK: r{{[0-9]+}} = memw(r{{[0-9]+}}{{.*}}+{{.*}}##src@GOT)
-; CHECK: r{{[0-9]+}} = memw(r{{[0-9]+}}{{.*}}+{{.*}}##dst@GOT)
+; CHECK: r{{[0-9]+}} = add({{pc|PC}},##_GLOBAL_OFFSET_TABLE_@PCREL)
+; CHECK: r{{[0-9]+}} = memw(r{{[0-9]+}}+##src@GOT)
+; CHECK: r{{[0-9]+}} = memw(r{{[0-9]+}}+##dst@GOT)
 
 @dst = external global i32
 @src = external global i32
diff --git a/test/CodeGen/Hexagon/pic-static.ll b/test/CodeGen/Hexagon/pic-static.ll
index 66d7734f2cf26904cf685f6739b4d072b357ecf5..95da5f060d721c3a8aceb99b2487a751eecb9b1a 100644
--- a/test/CodeGen/Hexagon/pic-static.ll
+++ b/test/CodeGen/Hexagon/pic-static.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -mtriple=hexagon-- -mcpu=hexagonv5 -relocation-model=pic < %s | FileCheck %s
 
-; CHECK-DAG: r{{[0-9]+}} = add({{pc|PC}}, ##_GLOBAL_OFFSET_TABLE_@PCREL)
-; CHECK-DAG: r{{[0-9]+}} = add({{pc|PC}}, ##x@PCREL)
-; CHECK: r{{[0-9]+}} = memw(r{{[0-9]+}}{{.*}}+{{.*}}##bar@GOT)
+; CHECK-DAG: r{{[0-9]+}} = add({{pc|PC}},##_GLOBAL_OFFSET_TABLE_@PCREL)
+; CHECK-DAG: r{{[0-9]+}} = add({{pc|PC}},##x@PCREL)
+; CHECK: r{{[0-9]+}} = memw(r{{[0-9]+}}+##bar@GOT)
 
 @x = internal global i32 9, align 4
 @bar = external global i32*
diff --git a/test/CodeGen/Hexagon/pred-absolute-store.ll b/test/CodeGen/Hexagon/pred-absolute-store.ll
index 3e5e98270d53605ce7a744ef814d46c8ffcfcf67..2f19e9aeb7bbc9f0234e8e46dead35ef1bfc2ddb 100644
--- a/test/CodeGen/Hexagon/pred-absolute-store.ll
+++ b/test/CodeGen/Hexagon/pred-absolute-store.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=hexagon < %s | FileCheck %s
-; Check that we are able to predicate instructions with abosolute
+; Check that we are able to predicate instructions with absolute
 ; addressing mode.
-; CHECK: if ({{!*}}p{{[0-2]}}.new) memw(##gvar) = r{{[0-9]+}}
+; CHECK: if ({{!?}}p{{[0-3]}}) memw(##gvar) = r{{[0-9]+}}
 
 @gvar = external global i32
 define i32 @test2(i32 %a, i32 %b) nounwind {
diff --git a/test/CodeGen/Hexagon/predicate-logical.ll b/test/CodeGen/Hexagon/predicate-logical.ll
index be2bcb03d6a1130fa3a5912630f0c5f1e6808f23..e3ba4d8643db3a827c25c145772aa866a884e6d2 100644
--- a/test/CodeGen/Hexagon/predicate-logical.ll
+++ b/test/CodeGen/Hexagon/predicate-logical.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
-; CHECK: p{{[0-9]}} = or(p{{[0-9]}}, and(p{{[0-9]}}, p{{[0-9]}}))
+; CHECK: p{{[0-9]}} = or(p{{[0-9]}},and(p{{[0-9]}},p{{[0-9]}}))
 
 target triple = "hexagon"
 
diff --git a/test/CodeGen/Hexagon/predicate-rcmp.ll b/test/CodeGen/Hexagon/predicate-rcmp.ll
index 45daa88d7161300c2ebaaadf25732c4edf9d0b03..78991e0dbe70523d16f9f9da5ec868b8a6277de9 100644
--- a/test/CodeGen/Hexagon/predicate-rcmp.ll
+++ b/test/CodeGen/Hexagon/predicate-rcmp.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
-; CHECK: cmp.eq(r{{[0-9]+}}, #0)
+; CHECK: cmp.eq(r{{[0-9]+}},#0)
 ; Check that the result of the builtin is not stored directly, i.e. that
 ; there is an instruction that converts it to {0,1} from {0,-1}. Right now
 ; the instruction is "r4 = !cmp.eq(r0, #0)".
diff --git a/test/CodeGen/Hexagon/rdf-copy-undef2.ll b/test/CodeGen/Hexagon/rdf-copy-undef2.ll
index 5f29d414ffc18769d2347f1d2a89008d25a1464e..28bf4c67cd75084cc2229fe49e8ab3c7a163b4e1 100644
--- a/test/CodeGen/Hexagon/rdf-copy-undef2.ll
+++ b/test/CodeGen/Hexagon/rdf-copy-undef2.ll
@@ -3,8 +3,8 @@
 
 target triple = "hexagon"
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) #0
-declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
 declare signext i16 @cat(i16 signext) #1
 declare void @danny(i16 signext, i16 signext, i16 signext, i16* nocapture readonly, i16 signext, i16* nocapture) #1
 declare void @sammy(i16* nocapture readonly, i16* nocapture readonly, i16* nocapture readonly, i32* nocapture, i16* nocapture, i16 signext, i16 signext, i16 signext) #1
diff --git a/test/CodeGen/Hexagon/rdf-inline-asm-fixed.ll b/test/CodeGen/Hexagon/rdf-inline-asm-fixed.ll
index 7adf7e8a53558e16af4a5ae5498b5117c929210c..222d8a2b2e147a64d2d22e2cb483e6cffdce1cf8 100644
--- a/test/CodeGen/Hexagon/rdf-inline-asm-fixed.ll
+++ b/test/CodeGen/Hexagon/rdf-inline-asm-fixed.ll
@@ -13,18 +13,18 @@ define i32 @foo(i32 %status) #0 {
 entry:
   %arg1 = alloca i32, align 4
   %0 = bitcast i32* %arg1 to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %0) #2
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #2
   store i32 %status, i32* %arg1, align 4, !tbaa !1
   %1 = call i32 asm sideeffect "r0 = #$1\0Ar1 = $2\0Ar2 = $4\0Atrap0 (#0)\0A$0 = r0", "=r,i,r,*m,r,~{r0},~{r1},~{r2}"(i32 24, i32* nonnull %arg1, i32* nonnull %arg1, i32 %status) #2, !srcloc !5
-  call void @llvm.lifetime.end(i64 4, i8* %0) #2
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %0) #2
   ret i32 %1
 }
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="hexagonv5" "target-features"="-hvx,-hvx-double" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { argmemonly nounwind }
diff --git a/test/CodeGen/Hexagon/rdf-phi-up.ll b/test/CodeGen/Hexagon/rdf-phi-up.ll
index 28f4c90c174d9f73d3e4f1ff41054dfb1984d43d..d4e7264712385f64e2a854e978abbb26f725b9c1 100644
--- a/test/CodeGen/Hexagon/rdf-phi-up.ll
+++ b/test/CodeGen/Hexagon/rdf-phi-up.ll
@@ -7,8 +7,8 @@ target triple = "hexagon"
 
 %struct.0 = type { i32, i16, i8* }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 define i32 @fred(i8* readonly %p0, i32* %p1) local_unnamed_addr #0 {
 entry:
@@ -32,7 +32,7 @@ if.then3:                                         ; preds = %if.then
 
 if.else:                                          ; preds = %lor.lhs.false
   %v6 = bitcast i16* %v0 to i8*
-  call void @llvm.lifetime.start(i64 2, i8* nonnull %v6) #0
+  call void @llvm.lifetime.start.p0i8(i64 2, i8* nonnull %v6) #0
   store i16 0, i16* %v0, align 2
   %v7 = call i32 @foo(%struct.0* nonnull %v3, i16* nonnull %v0) #0
   %v8 = icmp eq i32* %p1, null
@@ -45,7 +45,7 @@ if.then6:                                         ; preds = %if.else
   br label %if.end7
 
 if.end7:                                          ; preds = %if.else, %if.then6
-  call void @llvm.lifetime.end(i64 2, i8* nonnull %v6) #0
+  call void @llvm.lifetime.end.p0i8(i64 2, i8* nonnull %v6) #0
   br label %cleanup
 
 cleanup:                                          ; preds = %if.then3, %if.then,
diff --git a/test/CodeGen/Hexagon/readcyclecounter.ll b/test/CodeGen/Hexagon/readcyclecounter.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0a60c94b019c58af7cb19f8e81630589c8c65738
--- /dev/null
+++ b/test/CodeGen/Hexagon/readcyclecounter.ll
@@ -0,0 +1,10 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; CHECK-LABEL: test_readcyclecounter
+; CHECK: r1:0 = c15:14
+define i64 @test_readcyclecounter() nounwind {
+  %t0 = call i64 @llvm.readcyclecounter()
+  ret i64 %t0
+}
+
+declare i64 @llvm.readcyclecounter()
diff --git a/test/CodeGen/Hexagon/ret-struct-by-val.ll b/test/CodeGen/Hexagon/ret-struct-by-val.ll
index 26ed2ff36f77905cecf4647142715ef8615cbffd..60a97bcccfc559d4c57e681d718e9ae0662de117 100644
--- a/test/CodeGen/Hexagon/ret-struct-by-val.ll
+++ b/test/CodeGen/Hexagon/ret-struct-by-val.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=hexagon < %s | FileCheck %s
-; CHECK: r0 = add(r0, r1)
+; CHECK: r0 = add(r0,r1)
 
 ; Allow simple structures to be returned by value.
 
diff --git a/test/CodeGen/Hexagon/runtime-stkchk.ll b/test/CodeGen/Hexagon/runtime-stkchk.ll
index a4e8f117679efdcbe74895efe9e547171b2d7145..38aa8726d19cc12fffd90083494f36d3a2186cb2 100644
--- a/test/CodeGen/Hexagon/runtime-stkchk.ll
+++ b/test/CodeGen/Hexagon/runtime-stkchk.ll
@@ -6,12 +6,12 @@ define i32 @foo_1(i32 %n) #0 {
 entry:
   %local = alloca [1024 x i32], align 8
   %0 = bitcast [1024 x i32]* %local to i8*
-  call void @llvm.lifetime.start(i64 4096, i8* %0) #1
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1
   %arraydecay = getelementptr inbounds [1024 x i32], [1024 x i32]* %local, i32 0, i32 0
   call void @baz_1(i32* %arraydecay) #3
   %arrayidx = getelementptr inbounds [1024 x i32], [1024 x i32]* %local, i32 0, i32 %n
   %1 = load i32, i32* %arrayidx, align 4
-  call void @llvm.lifetime.end(i64 4096, i8* %0) #1
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1
   ret i32 %1
 }
 
@@ -21,21 +21,21 @@ define i32 @foo_2(i32 %n, i32* %y) #0 {
 entry:
   %local = alloca [2048 x i32], align 8
   %0 = bitcast [2048 x i32]* %local to i8*
-  call void @llvm.lifetime.start(i64 8192, i8* %0) #1
+  call void @llvm.lifetime.start.p0i8(i64 8192, i8* %0) #1
   %arraydecay = getelementptr inbounds [2048 x i32], [2048 x i32]* %local, i32 0, i32 0
   call void @baz_2(i32* %y, i32* %arraydecay) #3
   %1 = load i32, i32* %y, align 4
   %add = add nsw i32 %n, %1
   %arrayidx = getelementptr inbounds [2048 x i32], [2048 x i32]* %local, i32 0, i32 %add
   %2 = load i32, i32* %arrayidx, align 4
-  call void @llvm.lifetime.end(i64 8192, i8* %0) #1
+  call void @llvm.lifetime.end.p0i8(i64 8192, i8* %0) #1
   ret i32 %2
 }
 
 declare void @baz_1(i32*) #2
 declare void @baz_2(i32*, i32*) #2
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind }
diff --git a/test/CodeGen/Hexagon/section_7275.ll b/test/CodeGen/Hexagon/section_7275.ll
index c2b80ae3f69d00e45c2ca5ba08c3e2f5cc2d47e3..1806f1e9c844f735c632621f60be9a224ab9b121 100644
--- a/test/CodeGen/Hexagon/section_7275.ll
+++ b/test/CodeGen/Hexagon/section_7275.ll
@@ -8,13 +8,13 @@
 ; RUN: llc -march=hexagon < %s | FileCheck %s
 ; CHECK-LABEL: foo
 ; CHECK-DAG: memw(##b)
-; CHECK-DAG: memw(#d)
+; CHECK-DAG: memw(gp+#d)
 ; CHECK-DAG: memw(##g)
-; CHECK-DAG: memw(#h)
-; CHECK-DAG: memw(#f)
+; CHECK-DAG: memw(gp+#h)
+; CHECK-DAG: memw(gp+#f)
 ; CHECK-DAG: memw(##e)
-; CHECK-DAG: memw(#a)
-; CHECK-DAG: memw(#c)
+; CHECK-DAG: memw(gp+#a)
+; CHECK-DAG: memw(gp+#c)
 ; CHECK-LABEL: bar
 ; CHECK: memw(##b)
 
diff --git a/test/CodeGen/Hexagon/signed_immediates.ll b/test/CodeGen/Hexagon/signed_immediates.ll
index a4766313cc68284a824ae0520ec18ac876069778..ad4aa259660708791585e8f0b2ba72b7d87266bf 100644
--- a/test/CodeGen/Hexagon/signed_immediates.ll
+++ b/test/CodeGen/Hexagon/signed_immediates.ll
@@ -33,7 +33,7 @@ define i64* @foo4(i64* %a, i64 %b)  {
 }
 
 ; s6Ext
-; CHECK: if (p0.new) memw(r0+#0)=#-1
+; CHECK: if (p0.new) memw(r0+#0) = #-1
 define void @foo5(i32* %a, i1 %b) {
 br i1 %b, label %x, label %y
 x:
@@ -44,7 +44,7 @@ y:
 }
 
 ; s10Ext
-; CHECK: p0 = cmp.eq(r0, #-1)
+; CHECK: p0 = cmp.eq(r0,#-1)
 define i1 @foo7(i32 %a) {
   %b = icmp eq i32 %a, -1
   ret i1 %b
@@ -96,4 +96,4 @@ y:
 ; CHECK: r0 = #-2
 define i32 @foo13() {
   ret i32 -2
-}
\ No newline at end of file
+}
diff --git a/test/CodeGen/Hexagon/stack-align1.ll b/test/CodeGen/Hexagon/stack-align1.ll
index 4efa70f598547c834d0e3d8b4e7060a16b41e1d6..aefd16594f067767fd1a6f3fd8b242d808745483 100644
--- a/test/CodeGen/Hexagon/stack-align1.ll
+++ b/test/CodeGen/Hexagon/stack-align1.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -O0 -march=hexagon < %s | FileCheck %s
-; CHECK: and(r29, #-32)
-; CHECK-DAG: add(r29, #0)
-; CHECK-DAG: add(r29, #28)
+; CHECK: and(r29,#-32)
+; CHECK-DAG: add(r29,#0)
+; CHECK-DAG: add(r29,#28)
 
 target triple = "hexagon-unknown-unknown"
 
diff --git a/test/CodeGen/Hexagon/stack-align2.ll b/test/CodeGen/Hexagon/stack-align2.ll
index 1bbd57820325ed5b3fc4c0a121604d8ab8e578f2..042e4097c56a57b6b1bba98f906d18fbd5fdc3f9 100644
--- a/test/CodeGen/Hexagon/stack-align2.ll
+++ b/test/CodeGen/Hexagon/stack-align2.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -O0 -march=hexagon < %s | FileCheck %s
-; CHECK: and(r29, #-128)
-; CHECK-DAG: add(r29, #0)
-; CHECK-DAG: add(r29, #64)
-; CHECK-DAG: add(r29, #96)
-; CHECK-DAG: add(r29, #124)
+; CHECK: and(r29,#-128)
+; CHECK-DAG: add(r29,#0)
+; CHECK-DAG: add(r29,#64)
+; CHECK-DAG: add(r29,#96)
+; CHECK-DAG: add(r29,#124)
 
 target triple = "hexagon-unknown-unknown"
 
diff --git a/test/CodeGen/Hexagon/stack-alloca1.ll b/test/CodeGen/Hexagon/stack-alloca1.ll
index 00e9e051aebbadb1cbebea188cf909ff531c571e..b38b8846d26fc85c0eeb1d75e59774f3353fb2ad 100644
--- a/test/CodeGen/Hexagon/stack-alloca1.ll
+++ b/test/CodeGen/Hexagon/stack-alloca1.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -O0 -march=hexagon < %s | FileCheck %s
-; CHECK: sub(r29, r[[REG:[0-9]+]])
+; CHECK: sub(r29,r[[REG:[0-9]+]])
 ; CHECK: r29 = r[[REG]]
 
 target triple = "hexagon-unknown-unknown"
diff --git a/test/CodeGen/Hexagon/stack-alloca2.ll b/test/CodeGen/Hexagon/stack-alloca2.ll
index ad5e13166aa291a0f0799c6ae9f1109e4cf0b537..b211be0c0fff6678f1d165615fe83a6250e5a629 100644
--- a/test/CodeGen/Hexagon/stack-alloca2.ll
+++ b/test/CodeGen/Hexagon/stack-alloca2.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -O0 -march=hexagon < %s | FileCheck %s
-; CHECK-DAG: r[[AP:[0-9]+]] = and(r30, #-32)
-; CHECK-DAG: r1 = add(r[[AP]], #-32)
+; CHECK-DAG: r[[AP:[0-9]+]] = and(r30,#-32)
+; CHECK-DAG: r1 = add(r[[AP]],#-32)
 
-; CHECK-DAG: sub(r29, r[[SP:[0-9]+]])
+; CHECK-DAG: sub(r29,r[[SP:[0-9]+]])
 ; CHECK-DAG: r29 = r[[SP]]
 
 target triple = "hexagon-unknown-unknown"
diff --git a/test/CodeGen/Hexagon/static.ll b/test/CodeGen/Hexagon/static.ll
index c3237b748881f543000d84a063158174b1a3f396..15aab434158c6b449c9973c4f057bc30af3e9630 100644
--- a/test/CodeGen/Hexagon/static.ll
+++ b/test/CodeGen/Hexagon/static.ll
@@ -4,9 +4,9 @@
 @acc = external global i32
 @val = external global i32
 
-; CHECK-DAG: memw(#num)
-; CHECK-DAG: memw(#acc)
-; CHECK-DAG: memw(#val)
+; CHECK-DAG: memw(gp+#num)
+; CHECK-DAG: memw(gp+#acc)
+; CHECK-DAG: memw(gp+#val)
 
 define void @foo() nounwind {
 entry:
diff --git a/test/CodeGen/Hexagon/store-shift.ll b/test/CodeGen/Hexagon/store-shift.ll
index 866930990baa0e4b94184b17a5412ff94ac2a36e..981071a0181e7e6b6a9dfadfd4ee66063d7fa8d0 100644
--- a/test/CodeGen/Hexagon/store-shift.ll
+++ b/test/CodeGen/Hexagon/store-shift.ll
@@ -1,12 +1,12 @@
 ; RUN: llc -march=hexagon < %s | FileCheck %s
 
 ; CHECK-DAG: r[[BASE:[0-9]+]] += add
-; CHECK-DAG: r[[IDX0:[0-9]+]] = add(r2, #5)
-; CHECK-DAG: r[[IDX1:[0-9]+]] = add(r2, #6)
-; CHECK-DAG: memw(r0 + r[[IDX0]]<<#2) = r3
-; CHECK-DAG: memw(r0 + r[[IDX1]]<<#2) = r3
-; CHECK-DAG: memw(r[[BASE]] + r[[IDX0]]<<#2) = r[[IDX0]]
-; CHECK-DAG: memw(r[[BASE]] + r[[IDX1]]<<#2) = r[[IDX0]]
+; CHECK-DAG: r[[IDX0:[0-9]+]] = add(r2,#5)
+; CHECK-DAG: r[[IDX1:[0-9]+]] = add(r2,#6)
+; CHECK-DAG: memw(r0+r[[IDX0]]<<#2) = r3
+; CHECK-DAG: memw(r0+r[[IDX1]]<<#2) = r3
+; CHECK-DAG: memw(r[[BASE]]+r[[IDX0]]<<#2) = r[[IDX0]]
+; CHECK-DAG: memw(r[[BASE]]+r[[IDX1]]<<#2) = r[[IDX0]]
 
 target triple = "hexagon"
 
diff --git a/test/CodeGen/Hexagon/sube.ll b/test/CodeGen/Hexagon/sube.ll
index 7bc00759303f30ab1428e7e43f06feca6eed8759..2b09a998eff08392bab9c2e9415b7120012705c1 100644
--- a/test/CodeGen/Hexagon/sube.ll
+++ b/test/CodeGen/Hexagon/sube.ll
@@ -1,29 +1,26 @@
-; RUN: llc -march=hexagon -disable-hsdr -hexagon-expand-condsets=0 -hexagon-bit=0 -disable-post-ra < %s | FileCheck %s
+; RUN: llc -march=hexagon -hexagon-expand-condsets=0 < %s | FileCheck %s
 
-; CHECK: r{{[0-9]+:[0-9]+}} = combine(#0, #0)
-; CHECK: r{{[0-9]+:[0-9]+}} = combine(#0, #1)
-; CHECK: p{{[0-9]+}} = cmp.gtu(r{{[0-9]+:[0-9]+}}, r{{[0-9]+:[0-9]+}})
-; CHECK: r{{[0-9]+:[0-9]+}} = sub(r{{[0-9]+:[0-9]+}}, r{{[0-9]+:[0-9]+}})
-; CHECK: r{{[0-9]+}} = mux(p{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}})
-; CHECK: r{{[0-9]+}} = mux(p{{[0-9]+}}, r{{[0-9]+}}, r{{[0-9]+}})
-; CHECK: r{{[0-9]+:[0-9]+}} = sub(r{{[0-9]+:[0-9]+}}, r{{[0-9]+:[0-9]+}})
-; CHECK: r{{[0-9]+:[0-9]+}} = combine(r{{[0-9]+}}, r{{[0-9]+}})
+; CHECK-DAG: r{{[0-9]+:[0-9]+}} = sub(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}})
+; CHECK-DAG: r{{[0-9]+:[0-9]+}} = sub(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}})
+; CHECK-DAG: p{{[0-9]+}} = cmp.gtu(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}})
+; CHECK-DAG: r{{[0-9]+}} = mux(p{{[0-9]+}},r{{[0-9]+}},r{{[0-9]+}})
+; CHECK-DAG: r{{[0-9]+}} = mux(p{{[0-9]+}},r{{[0-9]+}},r{{[0-9]+}})
 
-define void @check_sube_subc(i64 %AL, i64 %AH, i64 %BL, i64 %BH, i64* %RL, i64* %RH) {
-entry:
-        %tmp1 = zext i64 %AL to i128
-        %tmp23 = zext i64 %AH to i128
-        %tmp4 = shl i128 %tmp23, 64
-        %tmp5 = or i128 %tmp4, %tmp1
-        %tmp67 = zext i64 %BL to i128
-        %tmp89 = zext i64 %BH to i128
-        %tmp11 = shl i128 %tmp89, 64
-        %tmp12 = or i128 %tmp11, %tmp67
-        %tmp15 = sub i128 %tmp5, %tmp12
-        %tmp1617 = trunc i128 %tmp15 to i64
-        store i64 %tmp1617, i64* %RL
-        %tmp21 = lshr i128 %tmp15, 64
-        %tmp2122 = trunc i128 %tmp21 to i64
-        store i64 %tmp2122, i64* %RH
-        ret void
+define void @check_sube_subc(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64* %a4, i64* %a5) {
+b6:
+  %v7 = zext i64 %a0 to i128
+  %v8 = zext i64 %a1 to i128
+  %v9 = shl i128 %v8, 64
+  %v10 = or i128 %v7, %v9
+  %v11 = zext i64 %a2 to i128
+  %v12 = zext i64 %a3 to i128
+  %v13 = shl i128 %v12, 64
+  %v14 = or i128 %v11, %v13
+  %v15 = sub i128 %v10, %v14
+  %v16 = lshr i128 %v15, 64
+  %v17 = trunc i128 %v15 to i64
+  %v18 = trunc i128 %v16 to i64
+  store i64 %v17, i64* %a4
+  store i64 %v18, i64* %a5
+  ret void
 }
diff --git a/test/CodeGen/Hexagon/subi-asl.ll b/test/CodeGen/Hexagon/subi-asl.ll
index f0b27e828f5075d0a5d0859ec3313db4e22ae7b7..d7610ceb62ace82033c60c050542f9abf4e8a8ab 100644
--- a/test/CodeGen/Hexagon/subi-asl.ll
+++ b/test/CodeGen/Hexagon/subi-asl.ll
@@ -3,11 +3,11 @@
 ; Check if S4_subi_asl_ri is being generated correctly.
 
 ; CHECK-LABEL: yes_sub_asl
-; CHECK: [[REG1:(r[0-9]+)]] = sub(#0, asl([[REG1]], #1))
+; CHECK: [[REG1:(r[0-9]+)]] = sub(#0,asl([[REG1]],#1))
 
 ; CHECK-LABEL: no_sub_asl
-; CHECK: [[REG2:(r[0-9]+)]] = asl(r{{[0-9]+}}, #1)
-; CHECK: r{{[0-9]+}} = sub([[REG2]], r{{[0-9]+}})
+; CHECK: [[REG2:(r[0-9]+)]] = asl(r{{[0-9]+}},#1)
+; CHECK: r{{[0-9]+}} = sub([[REG2]],r{{[0-9]+}})
 
 %struct.rtx_def = type { i16, i8 }
 
diff --git a/test/CodeGen/Hexagon/swp-const-tc.ll b/test/CodeGen/Hexagon/swp-const-tc.ll
index 3113094d2ba3071a3332e3dd7e92488b78ea0987..c07d23623eba1b54d0c5832ee211e288b2f486fa 100644
--- a/test/CodeGen/Hexagon/swp-const-tc.ll
+++ b/test/CodeGen/Hexagon/swp-const-tc.ll
@@ -4,7 +4,7 @@
 ; of computing a new LC0 value.
 
 ; CHECK-LABEL: @test
-; CHECK: loop0(.LBB0_1, #998)
+; CHECK: loop0(.LBB0_1,#998)
 
 define i32 @test(i32* %A, i32* %B, i32 %count) {
 entry:
diff --git a/test/CodeGen/Hexagon/swp-matmul-bitext.ll b/test/CodeGen/Hexagon/swp-matmul-bitext.ll
index db5bb96d0bc9d85f4f0fbd37463d569b60dd9dac..9c425ae6a0988a4a0d8078a5ce63cca10ff3b961 100644
--- a/test/CodeGen/Hexagon/swp-matmul-bitext.ll
+++ b/test/CodeGen/Hexagon/swp-matmul-bitext.ll
@@ -11,7 +11,7 @@
 ; CHECK: [[REG0:(r[0-9]+)]] = memh
 ; CHECK: [[REG1:(r[0-9]+)]] = memh
 ; CHECK: += mpyi
-; CHECK: [[REG2]] = mpyi([[REG0]], [[REG1]])
+; CHECK: [[REG2]] = mpyi([[REG0]],[[REG1]])
 ; CHECK: endloop0
 
 %union_h2_sem_t = type { i32 }
diff --git a/test/CodeGen/Hexagon/swp-max.ll b/test/CodeGen/Hexagon/swp-max.ll
index 038138ff2561450447d49e85617999c2bc35a0bd..26238ea6fb374407d20033f70278e02b8706e2b4 100644
--- a/test/CodeGen/Hexagon/swp-max.ll
+++ b/test/CodeGen/Hexagon/swp-max.ll
@@ -15,8 +15,8 @@ for.body.preheader:
 
 ; CHECK: loop0(.LBB0_[[LOOP:.]],
 ; CHECK: .LBB0_[[LOOP]]:
-; CHECK: [[REG1:(r[0-9]+)]] = max(r{{[0-9]+}}, [[REG1]])
-; CHECK: [[REG0:(r[0-9]+)]] = add([[REG2:(r[0-9]+)]], [[REG0]])
+; CHECK: [[REG1:(r[0-9]+)]] = max(r{{[0-9]+}},[[REG1]])
+; CHECK: [[REG0:(r[0-9]+)]] = add([[REG2:(r[0-9]+)]],[[REG0]])
 ; CHECK: [[REG2]] = memw
 ; CHECK: endloop0
 
diff --git a/test/CodeGen/Hexagon/swp-multi-loops.ll b/test/CodeGen/Hexagon/swp-multi-loops.ll
index 56e8c65110005b87e128b485168d0888ce2a60b3..fc2576af8ac2c54814b58a314ac478ac244a2a78 100644
--- a/test/CodeGen/Hexagon/swp-multi-loops.ll
+++ b/test/CodeGen/Hexagon/swp-multi-loops.ll
@@ -5,15 +5,15 @@
 ; Check if the first loop is pipelined.
 ; CHECK: loop0(.LBB0_[[LOOP:.]],
 ; CHECK: .LBB0_[[LOOP]]:
-; CHECK: add(r{{[0-9]+}}, r{{[0-9]+}})
-; CHECK-NEXT: memw(r{{[0-9]+}}{{.*}}++{{.*}}#4)
+; CHECK: add(r{{[0-9]+}},r{{[0-9]+}})
+; CHECK-NEXT: memw(r{{[0-9]+}}++#4)
 ; CHECK-NEXT: endloop0
 
 ; Check if the second loop is pipelined.
 ; CHECK: loop0(.LBB0_[[LOOP:.]],
 ; CHECK: .LBB0_[[LOOP]]:
-; CHECK: add(r{{[0-9]+}}, r{{[0-9]+}})
-; CHECK-NEXT: memw(r{{[0-9]+}}{{.*}}++{{.*}}#4)
+; CHECK: add(r{{[0-9]+}},r{{[0-9]+}})
+; CHECK-NEXT: memw(r{{[0-9]+}}++#4)
 ; CHECK-NEXT: endloop0
 
 define i32 @test(i32* %a, i32 %n, i32 %l) {
diff --git a/test/CodeGen/Hexagon/swp-stages4.ll b/test/CodeGen/Hexagon/swp-stages4.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f58e83203154eff14b0a66eccb59ddbe580ade39
--- /dev/null
+++ b/test/CodeGen/Hexagon/swp-stages4.ll
@@ -0,0 +1,94 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -pipeliner-max-stages=2 -disable-block-placement=0 -hexagon-bit=0 < %s | FileCheck %s
+
+; Test that we rename registers correctly for multiple stages when there is a
+; Phi that depends upon another Phi.
+
+; CHECK: = and
+; CHECK: = and
+; CHECK: = and
+; CHECK: [[REG0:(r[0-9]+)]] = and([[REG1:(r[0-9]+)]],#255)
+; CHECK-NOT: [[REG0]] = and([[REG1]],#255)
+; CHECK: loop0(.LBB0_[[LOOP:.]],
+; CHECK: .LBB0_[[LOOP]]:
+; CHECK: [[REG0]] += add
+; CHECK: = and
+; CHECK: = and
+; CHECK: [[REG0]] = and
+; CHECK: endloop
+
+; Function Attrs: nounwind
+define void @test(i8* noalias nocapture %src, i32 %srcWidth, i32 %srcHeight, i32 %srcStride, i8* noalias nocapture %dst, i32 %dstStride) #0 {
+entry:
+  %sub = add i32 %srcWidth, -1
+  %sub1 = add i32 %srcHeight, -1
+  %add.ptr = getelementptr inbounds i8, i8* %src, i32 %srcStride
+  %add.ptr.sum = mul i32 %srcStride, 2
+  %add.ptr2 = getelementptr inbounds i8, i8* %src, i32 %add.ptr.sum
+  br label %for.body.lr.ph
+
+for.body.lr.ph:
+  %0 = add i32 %srcHeight, -2
+  %1 = mul i32 %0, %dstStride
+  %2 = mul i32 %0, %srcStride
+  %3 = mul i32 %sub1, %srcStride
+  br label %for.cond
+
+for.cond:
+  %scevgep = getelementptr i8, i8* %dst, i32 %1
+  %scevgep220 = getelementptr i8, i8* %src, i32 %2
+  %scevgep221 = getelementptr i8, i8* %src, i32 %3
+  %arrayidx6 = getelementptr inbounds i8, i8* %src, i32 1
+  %add11 = add i32 %srcStride, 1
+  %arrayidx12 = getelementptr inbounds i8, i8* %src, i32 %add11
+  br label %for.body75.preheader
+
+for.body75.preheader:
+  %sri = load i8, i8* %arrayidx6, align 1
+  %sri224 = load i8, i8* %src, align 1
+  %sri227 = load i8, i8* %arrayidx12, align 1
+  %sri229 = load i8, i8* %add.ptr, align 1
+  br label %for.body75
+
+for.body75:
+  %j.0211 = phi i32 [ %add82, %for.body75 ], [ 1, %for.body75.preheader ]
+  %sr = phi i8 [ %4, %for.body75 ], [ %sri, %for.body75.preheader ]
+  %sr225 = phi i8 [ %sr, %for.body75 ], [ %sri224, %for.body75.preheader ]
+  %sr230 = phi i8 [ %5, %for.body75 ], [ %sri227, %for.body75.preheader ]
+  %sr231 = phi i8 [ %sr230, %for.body75 ], [ %sri229, %for.body75.preheader ]
+  %conv78 = zext i8 %sr225 to i32
+  %conv80 = zext i8 %sr to i32
+  %add81 = add nsw i32 %conv80, %conv78
+  %add82 = add i32 %j.0211, 1
+  %arrayidx83 = getelementptr inbounds i8, i8* %src, i32 %add82
+  %4 = load i8, i8* %arrayidx83, align 1, !tbaa !0
+  %conv84 = zext i8 %4 to i32
+  %add85 = add nsw i32 %add81, %conv84
+  %conv88 = zext i8 %sr231 to i32
+  %add89 = add nsw i32 %add85, %conv88
+  %conv91 = zext i8 %sr230 to i32
+  %add92 = add nsw i32 %add89, %conv91
+  %add.ptr.sum208 = add i32 %add82, %srcStride
+  %arrayidx94 = getelementptr inbounds i8, i8* %src, i32 %add.ptr.sum208
+  %5 = load i8, i8* %arrayidx94, align 1, !tbaa !0
+  %conv95 = zext i8 %5 to i32
+  %add96 = add nsw i32 %add92, %conv95
+  %mul97 = mul nsw i32 %add96, 7282
+  %add98 = add nsw i32 %mul97, 32768
+  %shr99209 = lshr i32 %add98, 16
+  %conv100 = trunc i32 %shr99209 to i8
+  %arrayidx101 = getelementptr inbounds i8, i8* %dst, i32 %j.0211
+  store i8 %conv100, i8* %arrayidx101, align 1, !tbaa !0
+  %exitcond = icmp eq i32 %add82, %sub
+  br i1 %exitcond, label %for.end104.loopexit, label %for.body75
+
+for.end104.loopexit:
+  br label %for.end104
+
+for.end104:
+  ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf"="true" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!0 = !{!"omnipotent char", !1}
+!1 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Hexagon/swp-stages5.ll b/test/CodeGen/Hexagon/swp-stages5.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fdfb2101cd36cfe549ff33bc265bb5b3d8a430a1
--- /dev/null
+++ b/test/CodeGen/Hexagon/swp-stages5.ll
@@ -0,0 +1,78 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv5 -enable-pipeliner -pipeliner-max-stages=2 -hexagon-bit=0 < %s | FileCheck %s
+
+; Very similar to swp-stages4.ll, but the pipelined schedule is a little
+; different.
+
+; CHECK: = memub(r{{[0-9]+}}++#1)
+; CHECK-DAG: [[REG0:(r[0-9]+)]] = memub(r{{[0-9]+}}++#1)
+; CHECK-DAG: loop0(.LBB0_[[LOOP:.]],
+; CHECK: .LBB0_[[LOOP]]:
+; CHECK: = and([[REG0]],#255)
+; CHECK: [[REG0]]{{[:0-9]*}} =
+; CHECK: endloop
+
+define void @fred(i8* noalias nocapture %src, i32 %srcWidth, i32 %srcHeight, i32 %srcStride, i8* noalias nocapture %dst, i32 %dstStride) #0 {
+entry:
+  %sub = add i32 %srcWidth, -1
+  %sub1 = add i32 %srcHeight, -1
+  %add.ptr = getelementptr inbounds i8, i8* %src, i32 %srcStride
+  %add.ptr.sum = mul i32 %srcStride, 2
+  %add.ptr2 = getelementptr inbounds i8, i8* %src, i32 %add.ptr.sum
+  %cmp212 = icmp ugt i32 %sub1, 1
+  br i1 %cmp212, label %for.body.lr.ph, label %for.end
+
+for.body.lr.ph:
+  br label %for.body74.preheader
+
+for.body74.preheader:
+  %0 = load i8, i8* %add.ptr, align 1, !tbaa !0
+  %arrayidx40 = getelementptr inbounds i8, i8* %add.ptr, i32 1
+  %1 = load i8, i8* %arrayidx40, align 1, !tbaa !0
+  %2 = load i8, i8* %add.ptr, align 1, !tbaa !0
+  %arrayidx46 = getelementptr inbounds i8, i8* %add.ptr, i32 1
+  %3 = load i8, i8* %arrayidx46, align 1, !tbaa !0
+  br label %for.body74
+
+for.body74:
+  %4 = phi i8 [ %9, %for.body74 ], [ %3, %for.body74.preheader ]
+  %5 = phi i8 [ %4, %for.body74 ], [ %2, %for.body74.preheader ]
+  %6 = phi i8 [ %8, %for.body74 ], [ %1, %for.body74.preheader ]
+  %7 = phi i8 [ %6, %for.body74 ], [ %0, %for.body74.preheader ]
+  %j.0211 = phi i32 [ %add81, %for.body74 ], [ 1, %for.body74.preheader ]
+  %conv77 = zext i8 %7 to i32
+  %conv79 = zext i8 %6 to i32
+  %add80 = add nsw i32 %conv79, %conv77
+  %add81 = add i32 %j.0211, 1
+  %arrayidx82 = getelementptr inbounds i8, i8* %src, i32 %add81
+  %8 = load i8, i8* %arrayidx82, align 1, !tbaa !0
+  %conv83 = zext i8 %8 to i32
+  %add84 = add nsw i32 %add80, %conv83
+  %conv87 = zext i8 %5 to i32
+  %add88 = add nsw i32 %add84, %conv87
+  %conv90 = zext i8 %4 to i32
+  %add91 = add nsw i32 %add88, %conv90
+  %arrayidx93 = getelementptr inbounds i8, i8* %add.ptr, i32 %add81
+  %9 = load i8, i8* %arrayidx93, align 1, !tbaa !0
+  %conv94 = zext i8 %9 to i32
+  %add95 = add nsw i32 %add91, %conv94
+  %mul96 = mul nsw i32 %add95, 7282
+  %add97 = add nsw i32 %mul96, 32768
+  %shr98208 = lshr i32 %add97, 16
+  %conv99 = trunc i32 %shr98208 to i8
+  %add.ptr5.sum209 = add i32 %j.0211, %dstStride
+  %arrayidx100 = getelementptr inbounds i8, i8* %dst, i32 %add.ptr5.sum209
+  store i8 %conv99, i8* %arrayidx100, align 1, !tbaa !0
+  %exitcond = icmp eq i32 %add81, %sub
+  br i1 %exitcond, label %for.end103.loopexit, label %for.body74
+
+for.end103.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+!0 = !{!"omnipotent char", !1}
+!1 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/Hexagon/swp-vmult.ll b/test/CodeGen/Hexagon/swp-vmult.ll
index 9018405274cde4a06e7a5c044352ba68fa0a6486..7c53248f47fc731bdf59531e2573b0cfd48a39b8 100644
--- a/test/CodeGen/Hexagon/swp-vmult.ll
+++ b/test/CodeGen/Hexagon/swp-vmult.ll
@@ -2,10 +2,10 @@
 ; RUN: llc -march=hexagon -mcpu=hexagonv5 -O3 < %s | FileCheck %s
 
 ; Multiply and accumulate
-; CHECK: mpyi([[REG0:r([0-9]+)]], [[REG1:r([0-9]+)]])
-; CHECK-NEXT: add(r{{[0-9]+}}, #4)
-; CHECK-NEXT: [[REG0]] = memw(r{{[0-9]+}} + r{{[0-9]+}}<<#0)
-; CHECK-NEXT: [[REG1]] = memw(r{{[0-9]+}} + r{{[0-9]+}}<<#0)
+; CHECK: mpyi([[REG0:r([0-9]+)]],[[REG1:r([0-9]+)]])
+; CHECK-NEXT: add(r{{[0-9]+}},#4)
+; CHECK-NEXT: [[REG0]] = memw(r{{[0-9]+}}+r{{[0-9]+}}<<#0)
+; CHECK-NEXT: [[REG1]] = memw(r{{[0-9]+}}+r{{[0-9]+}}<<#0)
 ; CHECK-NEXT: endloop0
 
 define i32 @foo(i32* %a, i32* %b, i32 %n) {
diff --git a/test/CodeGen/Hexagon/swp-vsum.ll b/test/CodeGen/Hexagon/swp-vsum.ll
index 4756c644709f1f5daf909def87c64a59872874d5..3561997450de77b965756bf9cbef8960a980d81a 100644
--- a/test/CodeGen/Hexagon/swp-vsum.ll
+++ b/test/CodeGen/Hexagon/swp-vsum.ll
@@ -4,9 +4,9 @@
 ; Simple vector total.
 ; CHECK: loop0(.LBB0_[[LOOP:.]],
 ; CHECK: .LBB0_[[LOOP]]:
-; CHECK: add([[REG:r([0-9]+)]], r{{[0-9]+}})
-; CHECK-NEXT: add(r{{[0-9]+}}, #4)
-; CHECK-NEXT: [[REG]] = memw(r{{[0-9]+}} + r{{[0-9]+}}<<#0)
+; CHECK: add([[REG:r([0-9]+)]],r{{[0-9]+}})
+; CHECK-NEXT: add(r{{[0-9]+}},#4)
+; CHECK-NEXT: [[REG]] = memw(r{{[0-9]+}}+r{{[0-9]+}}<<#0)
 ; CHECK-NEXT: endloop0
 
 define i32 @foo(i32* %a, i32 %n) {
diff --git a/test/CodeGen/Hexagon/tail-dup-subreg-map.ll b/test/CodeGen/Hexagon/tail-dup-subreg-map.ll
index 08dadeb9aaa47c592ee86cc4d29205bbfa42d6e9..1b11d087832ac6841731c8d6e571115a9dd6b5f9 100644
--- a/test/CodeGen/Hexagon/tail-dup-subreg-map.ll
+++ b/test/CodeGen/Hexagon/tail-dup-subreg-map.ll
@@ -5,7 +5,7 @@
 ; subregisters were dropped by the tail duplicator, resulting in invalid
 ; COPY instructions being generated.
 
-; CHECK: = extractu(r{{[0-9]+}}, #15, #17)
+; CHECK: = extractu(r{{[0-9]+}},#15,#17)
 
 target triple = "hexagon"
 
diff --git a/test/CodeGen/Hexagon/tfr-to-combine.ll b/test/CodeGen/Hexagon/tfr-to-combine.ll
index 1b82f3e4562e7db856476f2e725d40c87941ed48..50879ffe582dd20c43862c90fba1a3c66005184b 100644
--- a/test/CodeGen/Hexagon/tfr-to-combine.ll
+++ b/test/CodeGen/Hexagon/tfr-to-combine.ll
@@ -8,7 +8,7 @@
 
 ; Function Attrs: nounwind
 define i64 @test1() #0 {
-; CHECK: combine(#10, #0)
+; CHECK: combine(#10,#0)
 entry:
   store i16 0, i16* @a, align 2
   store i16 10, i16* @b, align 2
@@ -17,7 +17,7 @@ entry:
 
 ; Function Attrs: nounwind
 define i64 @test2() #0 {
-; CHECK: combine(#0, r{{[0-9]+}})
+; CHECK: combine(#0,r{{[0-9]+}})
 entry:
   store i16 0, i16* @a, align 2
   %0 = load i16, i16* @c, align 2
@@ -27,7 +27,7 @@ entry:
 
 ; Function Attrs: nounwind
 define i64 @test4() #0 {
-; CHECK: combine(#0, #100)
+; CHECK: combine(#0,#100)
 entry:
   store i16 100, i16* @b, align 2
   store i16 0, i16* @a, align 2
diff --git a/test/CodeGen/Hexagon/tls_pic.ll b/test/CodeGen/Hexagon/tls_pic.ll
index 190e1d71d39b09d725e92300ff4a29487adbf2bd..2c2be0dc384af2de5b949f09b0903985a37eff9c 100644
--- a/test/CodeGen/Hexagon/tls_pic.ll
+++ b/test/CodeGen/Hexagon/tls_pic.ll
@@ -4,7 +4,7 @@
 @src_ie = thread_local(initialexec) global i32 0, align 4
 
 ; CHECK-LABEL:    test_initial_exec
-; CHECK-DAG:      = add(pc, ##_GLOBAL_OFFSET_TABLE_@PCREL)
+; CHECK-DAG:      = add(pc,##_GLOBAL_OFFSET_TABLE_@PCREL)
 ; CHECK-DAG:      = ##src_ie@IEGOT
 ; CHECK-DAG:      = ##dst_ie@IEGOT
 ; CHECK-NOT:  call
@@ -22,7 +22,7 @@ entry:
 ; general-dynamic model.
 
 ; CHECK-LABEL: test_dynamic
-; CHECK-DAG:   = add(pc, ##_GLOBAL_OFFSET_TABLE_@PCREL)
+; CHECK-DAG:   = add(pc,##_GLOBAL_OFFSET_TABLE_@PCREL)
 ; CHECK-DAG:   = ##src_gd@GDGOT
 ; CHECK-DAG:   = ##dst_gd@GDGOT
 ; CHECK-DAG:   call src_gd@GDPLT
diff --git a/test/CodeGen/Hexagon/two-crash.ll b/test/CodeGen/Hexagon/two-crash.ll
index 0ab02cda8a07bc37487f641f5228c43056b056f1..7e79cb3be912a9ee70e17d03442ef79a34d05b3f 100644
--- a/test/CodeGen/Hexagon/two-crash.ll
+++ b/test/CodeGen/Hexagon/two-crash.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=hexagon < %s | FileCheck %s
 ; This testcase crashed, because we propagated a reg:sub into a tied use.
 ; The two-address pass rewrote it in a way that generated incorrect code.
-; CHECK: r{{[0-9]+}} += lsr(r{{[0-9]+}}, #16)
+; CHECK: r{{[0-9]+}} += lsr(r{{[0-9]+}},#16)
 
 target triple = "hexagon"
 
diff --git a/test/CodeGen/Hexagon/undo-dag-shift.ll b/test/CodeGen/Hexagon/undo-dag-shift.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c1ab5d73f5c389ee8d7d657194c0ad19f560a247
--- /dev/null
+++ b/test/CodeGen/Hexagon/undo-dag-shift.ll
@@ -0,0 +1,59 @@
+; RUN: llc -march=hexagon < %s | FileCheck %s
+
+; DAG combiner folds sequences of shifts, which can sometimes obscure
+; optimization opportunities. For example
+;
+;   unsigned int c(unsigned int b, unsigned int *a) {
+;     unsigned int bitidx = b >> 5;
+;     return a[bitidx];
+;   }
+;
+; produces
+;   (add x (shl (srl y 5) 2))
+; which is then folded into
+;   (add x (and (srl y 3) 1FFFFFFC))
+;
+; That results in a constant-extended and:
+;   r0 = and(##536870908,lsr(r0,#3))
+;   r0 = memw(r1+r0<<#0)
+; whereas
+;   r0 = lsr(r0,#5)
+;   r0 = memw(r1+r0<<#2)
+; is more desirable.
+
+target triple = "hexagon"
+
+; CHECK-LABEL: load_0
+; CHECK: memw(r{{[0-9]+}}+r{{[0-9]}}<<#2)
+define i32 @load_0(i32 %b, i32* nocapture readonly %a) #0 {
+entry:
+  %shr = lshr i32 %b, 5
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %shr
+  %0 = load i32, i32* %arrayidx, align 4
+  ret i32 %0
+}
+
+; This would require r0<<#3, which is not legal.
+; CHECK-LABEL: load_1
+; CHECK: memw(r{{[0-9]+}}+r{{[0-9]}}<<#0)
+define i32 @load_1(i32 %b, [3 x i32]* nocapture readonly %a) #0 {
+entry:
+  %shr = lshr i32 %b, 5
+  %arrayidx = getelementptr inbounds [3 x i32], [3 x i32]* %a, i32 %shr, i32 0
+  %0 = load i32, i32* %arrayidx, align 4
+  ret i32 %0
+}
+
+; CHECK-LABEL: store_0
+; CHECK: memw(r{{[0-9]+}}+r{{[0-9]}}<<#2)
+define void @store_0(i32 %b, i32* nocapture %a, i32 %v) #1 {
+entry:
+  %shr = lshr i32 %b, 5
+  %arrayidx = getelementptr inbounds i32, i32* %a, i32 %shr
+  store i32 %v, i32* %arrayidx, align 4
+  ret void
+}
+
+attributes #0 = { norecurse nounwind readonly "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double,-long-calls" }
+attributes #1 = { norecurse nounwind "target-cpu"="hexagonv60" "target-features"="-hvx,-hvx-double,-long-calls" }
+
diff --git a/test/CodeGen/Hexagon/vaddh.ll b/test/CodeGen/Hexagon/vaddh.ll
index 88194b750ad5e9b530a386d9d517584e9a05cb78..a4fb33de4ac5b02a49295f915d5da1f9aadf39a5 100644
--- a/test/CodeGen/Hexagon/vaddh.ll
+++ b/test/CodeGen/Hexagon/vaddh.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
-; CHECK: vaddh(r{{[0-9]+}}, r{{[0-9]+}})
+; CHECK: vaddh(r{{[0-9]+}},r{{[0-9]+}})
 
 @j = external global i32
 @k = external global i32
diff --git a/test/CodeGen/Hexagon/vect/vect-cst-v4i32.ll b/test/CodeGen/Hexagon/vect/vect-cst-v4i32.ll
index 70c4aeb4bac0a26903d3388b154c1484f32dd7f3..4bba134a40cb3e10b5f48d02ecf67ab4603e1a00 100644
--- a/test/CodeGen/Hexagon/vect/vect-cst-v4i32.ll
+++ b/test/CodeGen/Hexagon/vect/vect-cst-v4i32.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -march=hexagon -mcpu=hexagonv5 -disable-hsdr < %s | FileCheck %s
 ; This one should generate a combine with two immediates.
-; CHECK: combine(#7, #7)
+; CHECK: combine(#7,#7)
 @B = common global [400 x i32] zeroinitializer, align 8
 @A = common global [400 x i32] zeroinitializer, align 8
 @C = common global [400 x i32] zeroinitializer, align 8
diff --git a/test/CodeGen/Hexagon/vect/vect-loadv4i16.ll b/test/CodeGen/Hexagon/vect/vect-loadv4i16.ll
index 91b32652400f772ce1daaaf5a7df5718f2e03989..f49a1e24a1bbe43a21544d393f1dad40b40c91b8 100644
--- a/test/CodeGen/Hexagon/vect/vect-loadv4i16.ll
+++ b/test/CodeGen/Hexagon/vect/vect-loadv4i16.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=hexagon -mcpu=hexagonv5 -disable-hsdr < %s | FileCheck %s
 
 ; Check that store is post-incremented.
-; CHECK: memuh(r{{[0-9]+}} + {{ *}}#6{{ *}})
-; CHECK: combine(r{{[0-9]+}}{{ *}},{{ *}}r{{[0-9]+}}{{ *}})
+; CHECK: memuh(r{{[0-9]+}}+#6)
+; CHECK: combine(r{{[0-9]+}},r{{[0-9]+}})
 ; CHECK: vaddh
 
 target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
diff --git a/test/CodeGen/Hexagon/vect/vect-shift-imm.ll b/test/CodeGen/Hexagon/vect/vect-shift-imm.ll
index 4861181d4125dee33a51320d6af80303c930b172..a4d6afa40bce3009f1ff9c14cf67a8937a5850ab 100644
--- a/test/CodeGen/Hexagon/vect/vect-shift-imm.ll
+++ b/test/CodeGen/Hexagon/vect/vect-shift-imm.ll
@@ -6,12 +6,12 @@
 ; RUN: llc -march=hexagon < %s | FileCheck %s --check-prefix=CHECK-LSRH
 ;
 ; Make sure that the instructions with immediate operands are generated.
-; CHECK-ASLW: vaslw({{.*}}, #9)
-; CHECK-ASRW: vasrw({{.*}}, #8)
-; CHECK-LSRW: vlsrw({{.*}}, #7)
-; CHECK-ASLH: vaslh({{.*}}, #6)
-; CHECK-ASRH: vasrh({{.*}}, #5)
-; CHECK-LSRH: vlsrh({{.*}}, #4)
+; CHECK-ASLW: vaslw({{.*}},#9)
+; CHECK-ASRW: vasrw({{.*}},#8)
+; CHECK-LSRW: vlsrw({{.*}},#7)
+; CHECK-ASLH: vaslh({{.*}},#6)
+; CHECK-ASRH: vasrh({{.*}},#5)
+; CHECK-LSRH: vlsrh({{.*}},#4)
 
 target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
 target triple = "hexagon"
diff --git a/test/CodeGen/Hexagon/vect/vect-shuffle.ll b/test/CodeGen/Hexagon/vect/vect-shuffle.ll
index bd5b2b981695d172fc05d761008b80c3006a484c..27840bbd28d951559ac568536212760a800a69d1 100644
--- a/test/CodeGen/Hexagon/vect/vect-shuffle.ll
+++ b/test/CodeGen/Hexagon/vect/vect-shuffle.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=hexagon -mcpu=hexagonv5 -disable-hsdr < %s | FileCheck %s
 
 ; Check that store is post-incremented.
-; CHECK-NOT: extractu
+; CHECK-NOT: extractu(r{{[0-9]+}},#32,
 ; CHECK-NOT: insert
 target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
 target triple = "hexagon"
diff --git a/test/CodeGen/Hexagon/vect/vect-vshifts.ll b/test/CodeGen/Hexagon/vect/vect-vshifts.ll
index 49ff812601aedbbc188210d15f998934b5f56e44..9d3cbe6e113f80617009210e20f89835b19c8918 100644
--- a/test/CodeGen/Hexagon/vect/vect-vshifts.ll
+++ b/test/CodeGen/Hexagon/vect/vect-vshifts.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s
 
 ; Check that store is post-incremented.
-; CHECK: r{{[0-9]+:[0-9]+}} = vasrw(r{{[0-9]+:[0-9]+}}, r{{[0-9]+}})
-; CHECK: r{{[0-9]+:[0-9]+}} = vaslw(r{{[0-9]+:[0-9]+}}, r{{[0-9]+}})
+; CHECK: r{{[0-9]+:[0-9]+}} = vasrw(r{{[0-9]+:[0-9]+}},r{{[0-9]+}})
+; CHECK: r{{[0-9]+:[0-9]+}} = vaslw(r{{[0-9]+:[0-9]+}},r{{[0-9]+}})
 target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
 target triple = "hexagon"
 
diff --git a/test/CodeGen/Hexagon/vect/vect-xor.ll b/test/CodeGen/Hexagon/vect/vect-xor.ll
index 96719e683413e837ade2973c600e8385109862c0..8864ab5c5cb72d44ca34683d859a2bc9d164d375 100644
--- a/test/CodeGen/Hexagon/vect/vect-xor.ll
+++ b/test/CodeGen/Hexagon/vect/vect-xor.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -march=hexagon -mcpu=hexagonv5 -disable-hsdr < %s | FileCheck %s
 
 ; Check that the parsing succeeded.
-; CHECK: r{{[0-9]+:[0-9]+}} = xor(r{{[0-9]+:[0-9]+}}, r{{[0-9]+:[0-9]+}})
+; CHECK: r{{[0-9]+:[0-9]+}} = xor(r{{[0-9]+:[0-9]+}},r{{[0-9]+:[0-9]+}})
 target datalayout = "e-p:32:32:32-i64:64:64-i32:32:32-i16:16:16-i1:32:32-f64:64:64-f32:32:32-v64:64:64-v32:32:32-a0:0-n16:32"
 target triple = "hexagon"
 
diff --git a/test/CodeGen/MIR/AArch64/atomic-memoperands.mir b/test/CodeGen/MIR/AArch64/atomic-memoperands.mir
new file mode 100644
index 0000000000000000000000000000000000000000..1fe42a7314881190e30f1f0873234c0909e6dd4f
--- /dev/null
+++ b/test/CodeGen/MIR/AArch64/atomic-memoperands.mir
@@ -0,0 +1,30 @@
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass none -o - %s | FileCheck %s
+
+--- |
+
+  define void @atomic_memoperands() {
+    ret void
+  }
+
+...
+---
+# CHECK-LABEL: name: atomic_memoperands
+# CHECK: %1(s64) = G_LOAD %0(p0) :: (load unordered 8)
+# CHECK: %2(s32) = G_LOAD %0(p0) :: (load monotonic 4)
+# CHECK: %3(s16) = G_LOAD %0(p0) :: (load acquire 2)
+# CHECK: G_STORE %3(s16), %0(p0) :: (store release 2)
+# CHECK: G_STORE %2(s32), %0(p0) :: (store acq_rel 4)
+# CHECK: G_STORE %1(s64), %0(p0) :: (store singlethread seq_cst 8)
+name:            atomic_memoperands
+body: |
+  bb.0:
+
+    %0:_(p0) = COPY %x0
+    %1:_(s64) = G_LOAD %0(p0) :: (load unordered 8)
+    %2:_(s32) = G_LOAD %0(p0) :: (load monotonic 4)
+    %3:_(s16) = G_LOAD %0(p0) :: (load acquire 2)
+    G_STORE %3(s16), %0(p0) :: (store release 2)
+    G_STORE %2(s32), %0(p0) :: (store acq_rel 4)
+    G_STORE %1(s64), %0(p0) :: (store singlethread seq_cst 8)
+    RET_ReallyLR
+...
diff --git a/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir b/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir
index 47f0e168a722e16418d8cd7761871b181f9fbff5..5da98fb9c2d1ce709c9f910e5b0e37a70103132e 100644
--- a/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir
+++ b/test/CodeGen/MIR/AMDGPU/expected-target-index-name.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=amdgcn -mcpu=SI -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=amdgcn -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
 
 --- |
 
@@ -6,7 +6,7 @@
 
   @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00], align 4
 
-  define void @float(float addrspace(1)* %out, i32 %index) #0 {
+  define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) #0 {
   entry:
     %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
     %1 = load float, float addrspace(2)* %0
@@ -14,21 +14,7 @@
     ret void
   }
 
-  declare { i1, i64 } @llvm.SI.if(i1)
-
-  declare { i1, i64 } @llvm.SI.else(i64)
-
-  declare i64 @llvm.SI.break(i64)
-
-  declare i64 @llvm.SI.if.break(i1, i64)
-
-  declare i64 @llvm.SI.else.break(i64, i64)
-
-  declare i1 @llvm.SI.loop(i64)
-
-  declare void @llvm.SI.end.cf(i64)
-
-  attributes #0 = { "target-cpu"="SI" }
+  attributes #0 = { nounwind }
 
 ...
 ---
diff --git a/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir b/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir
index 3277d37d7e4d3f25fddb423e342d3635172438b3..7cef01c9d12d9ba57d9bd86c0de1965fe7bbe404 100644
--- a/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir
+++ b/test/CodeGen/MIR/AMDGPU/fold-imm-f16-f32.mir
@@ -1,6 +1,6 @@
 # RUN: llc --mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -run-pass si-fold-operands,si-shrink-instructions %s -o - | FileCheck %s
 --- |
-  define void @add_f32_1.0_one_f16_use() #0 {
+  define amdgpu_kernel void @add_f32_1.0_one_f16_use() #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile float, float addrspace(1)* undef
@@ -11,7 +11,7 @@
     ret void
   }
 
-  define void @add_f32_1.0_multi_f16_use() #0 {
+  define amdgpu_kernel void @add_f32_1.0_multi_f16_use() #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile float, float addrspace(1)* undef
@@ -22,7 +22,7 @@
     ret void
   }
 
-  define void @add_f32_1.0_one_f32_use_one_f16_use () #0 {
+  define amdgpu_kernel void @add_f32_1.0_one_f32_use_one_f16_use () #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile float, float addrspace(1)* undef
@@ -33,7 +33,7 @@
     ret void
   }
 
-  define void @add_f32_1.0_one_f32_use_multi_f16_use () #0 {
+  define amdgpu_kernel void @add_f32_1.0_one_f32_use_multi_f16_use () #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile float, float addrspace(1)* undef
@@ -46,7 +46,7 @@
     ret void
   }
 
-  define void @add_i32_1_multi_f16_use() #0 {
+  define amdgpu_kernel void @add_i32_1_multi_f16_use() #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f16.add0 = fadd half %f16.val0, 0xH0001
@@ -56,7 +56,7 @@
     ret void
   }
 
-  define void @add_i32_m2_one_f32_use_multi_f16_use () #0 {
+  define amdgpu_kernel void @add_i32_m2_one_f32_use_multi_f16_use () #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile float, float addrspace(1)* undef
@@ -69,7 +69,7 @@
     ret void
   }
 
-  define void @add_f16_1.0_multi_f32_use() #0 {
+  define amdgpu_kernel void @add_f16_1.0_multi_f32_use() #0 {
     %f32.val0 = load volatile float, float addrspace(1)* undef
     %f32.val1 = load volatile float, float addrspace(1)* undef
     %f32.val = load volatile float, float addrspace(1)* undef
@@ -80,7 +80,7 @@
     ret void
   }
 
-  define void @add_f16_1.0_other_high_bits_multi_f16_use() #0 {
+  define amdgpu_kernel void @add_f16_1.0_other_high_bits_multi_f16_use() #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile half, half addrspace(1)* undef
@@ -91,7 +91,7 @@
     ret void
   }
 
-  define void @add_f16_1.0_other_high_bits_use_f16_f32() #0 {
+  define amdgpu_kernel void @add_f16_1.0_other_high_bits_use_f16_f32() #0 {
     %f16.val0 = load volatile half, half addrspace(1)* undef
     %f16.val1 = load volatile half, half addrspace(1)* undef
     %f32.val = load volatile half, half addrspace(1)* undef
diff --git a/test/CodeGen/MIR/AMDGPU/intrinsics.mir b/test/CodeGen/MIR/AMDGPU/intrinsics.mir
index f43266eacbf03d43b7b8d731cd949a8bc63fbca6..cb6e6190990b0cfe49cc240fdade1e25cf5e8d38 100644
--- a/test/CodeGen/MIR/AMDGPU/intrinsics.mir
+++ b/test/CodeGen/MIR/AMDGPU/intrinsics.mir
@@ -2,18 +2,18 @@
 
 --- |
 
-  define void @use_intrin() {
+  define amdgpu_kernel void @use_intrin() {
     ret void
   }
 
 ...
 ---
 # Completely invalid code, but it checks that intrinsics round-trip properly.
-# CHECK: %0(s64) = COPY intrinsic(@llvm.AMDGPU.bfe.i32)
+# CHECK: %0(s64) = COPY intrinsic(@llvm.amdgcn.sbfe)
 name:            use_intrin
 registers:
   - { id: 0, class: _ }
 body: |
   bb.0:
-    %0(s64) = COPY intrinsic(@llvm.AMDGPU.bfe.i32)
+    %0(s64) = COPY intrinsic(@llvm.amdgcn.sbfe.i32)
 ...
diff --git a/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir b/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir
index d73503223aa8eadf8dbc6d17ba1756eb6bfd756c..8cffc86373a35f58a518097e1efd5351daa4bc68 100644
--- a/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir
+++ b/test/CodeGen/MIR/AMDGPU/invalid-target-index-operand.mir
@@ -1,4 +1,4 @@
-# RUN: not llc -march=amdgcn -mcpu=SI -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
+# RUN: not llc -march=amdgcn -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
 
 --- |
 
@@ -6,7 +6,7 @@
 
   @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00], align 4
 
-  define void @float(float addrspace(1)* %out, i32 %index) #0 {
+  define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) #0 {
   entry:
     %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
     %1 = load float, float addrspace(2)* %0
@@ -14,21 +14,7 @@
     ret void
   }
 
-  declare { i1, i64 } @llvm.SI.if(i1)
-
-  declare { i1, i64 } @llvm.SI.else(i64)
-
-  declare i64 @llvm.SI.break(i64)
-
-  declare i64 @llvm.SI.if.break(i1, i64)
-
-  declare i64 @llvm.SI.else.break(i64, i64)
-
-  declare i1 @llvm.SI.loop(i64)
-
-  declare void @llvm.SI.end.cf(i64)
-
-  attributes #0 = { "target-cpu"="SI" }
+  attributes #0 = { nounwind }
 
 ...
 ---
diff --git a/test/CodeGen/MIR/AMDGPU/target-index-operands.mir b/test/CodeGen/MIR/AMDGPU/target-index-operands.mir
index a4e77f281ea6bebcf410e8ebfe042bb67d5ca440..32669de15ea36fc6903d7b6ac74526d4894a35db 100644
--- a/test/CodeGen/MIR/AMDGPU/target-index-operands.mir
+++ b/test/CodeGen/MIR/AMDGPU/target-index-operands.mir
@@ -7,7 +7,7 @@
 
   @float_gv = internal unnamed_addr addrspace(2) constant [5 x float] [float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00], align 4
 
-  define void @float(float addrspace(1)* %out, i32 %index) #0 {
+  define amdgpu_kernel void @float(float addrspace(1)* %out, i32 %index) #0 {
   entry:
     %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
     %1 = load float, float addrspace(2)* %0
@@ -15,29 +15,14 @@
     ret void
   }
 
-  define void @float2(float addrspace(1)* %out, i32 %index) #0 {
+  define amdgpu_kernel void @float2(float addrspace(1)* %out, i32 %index) #0 {
   entry:
     %0 = getelementptr inbounds [5 x float], [5 x float] addrspace(2)* @float_gv, i32 0, i32 %index
     %1 = load float, float addrspace(2)* %0
     store float %1, float addrspace(1)* %out
     ret void
   }
-
-  declare { i1, i64 } @llvm.SI.if(i1)
-
-  declare { i1, i64 } @llvm.SI.else(i64)
-
-  declare i64 @llvm.SI.break(i64)
-
-  declare i64 @llvm.SI.if.break(i1, i64)
-
-  declare i64 @llvm.SI.else.break(i64, i64)
-
-  declare i1 @llvm.SI.loop(i64)
-
-  declare void @llvm.SI.end.cf(i64)
-
-  attributes #0 = { "target-cpu"="SI" }
+  attributes #0 = { nounwind }
 
 ...
 ---
diff --git a/test/CodeGen/MIR/Generic/llvmIR.mir b/test/CodeGen/MIR/Generic/llvmIR.mir
index 432b18ff939d722eef62bb6ec53930f01df77f0d..5c0e60e916f06d0e3ff7648262c73153ae9e5a58 100644
--- a/test/CodeGen/MIR/Generic/llvmIR.mir
+++ b/test/CodeGen/MIR/Generic/llvmIR.mir
@@ -28,10 +28,8 @@
   IfUnequal:
     ret i32 0
   }
-  
+
 ...
 ---
 name: foo
-body: |
-  bb.0:
 ...
diff --git a/test/CodeGen/MIR/Generic/llvmIRMissing.mir b/test/CodeGen/MIR/Generic/llvmIRMissing.mir
index 9f361e8d3fe47467662ab18f781015a7fe5bf008..419f60be80619359821dabe04466ecdec994b2a2 100644
--- a/test/CodeGen/MIR/Generic/llvmIRMissing.mir
+++ b/test/CodeGen/MIR/Generic/llvmIRMissing.mir
@@ -1,9 +1,7 @@
-# RUN: llc -run-pass none -o - %s 2>&1 | FileCheck %s
+# RUN: llc -run-pass none -o - %s | FileCheck %s
 # This test ensures that the MIR parser accepts files without the LLVM IR.
 
 ---
 # CHECK: name: foo
 name: foo
-body: |
-  bb.0:
 ...
diff --git a/test/CodeGen/MIR/Generic/machine-basic-block-ir-block-reference.mir b/test/CodeGen/MIR/Generic/machine-basic-block-ir-block-reference.mir
index a5737c2c15269258809e0dedbea7089c8b1b72f4..cf095537bebdc7880ae7e03a3749071e3fad5114 100644
--- a/test/CodeGen/MIR/Generic/machine-basic-block-ir-block-reference.mir
+++ b/test/CodeGen/MIR/Generic/machine-basic-block-ir-block-reference.mir
@@ -1,4 +1,4 @@
-# RUN: llc -run-pass none -o - %s 2>&1 | FileCheck %s
+# RUN: llc -run-pass none -o - %s | FileCheck %s
 # This test ensures that the MIR parser preserves unnamed LLVM IR block
 # references.
 
diff --git a/test/CodeGen/MIR/Generic/machine-function-missing-body-error.mir b/test/CodeGen/MIR/Generic/machine-function-missing-body-error.mir
deleted file mode 100644
index 1896371db36a7258491a67ad317f8e8a1abca8f4..0000000000000000000000000000000000000000
--- a/test/CodeGen/MIR/Generic/machine-function-missing-body-error.mir
+++ /dev/null
@@ -1,15 +0,0 @@
-# RUN: not llc -run-pass none -o /dev/null %s 2>&1 | FileCheck %s
-# This test ensures that the MIR parser reports an error when it encounters a
-# machine function with an empty body.
-
---- |
-
-  define i32 @foo() {
-    ret i32 0
-  }
-
-...
----
-# CHECK: machine function 'foo' requires at least one machine basic block in its body
-name:            foo
-...
diff --git a/test/CodeGen/MIR/Generic/machine-function-missing-body.mir b/test/CodeGen/MIR/Generic/machine-function-missing-body.mir
new file mode 100644
index 0000000000000000000000000000000000000000..0fd970c3af7cc5c3e4de099c85c4a5c3e14c56ac
--- /dev/null
+++ b/test/CodeGen/MIR/Generic/machine-function-missing-body.mir
@@ -0,0 +1,15 @@
+# RUN: llc -run-pass none -o - %s | FileCheck %s
+# This test ensures that the MIR parser accepts files with llvm IR but
+# no machine function body.
+
+--- |
+  ; CHECK: define i32 @foo()
+  define i32 @foo() {
+    ret i32 0
+  }
+
+...
+---
+# CHECK: name: foo
+name:            foo
+...
diff --git a/test/CodeGen/MIR/Generic/machine-function-missing-function.mir b/test/CodeGen/MIR/Generic/machine-function-missing-function.mir
index c547bb25d75324f8c64af0aba1cc344939082254..b218afd72ea366f41912f546bc3587f1f9550ed5 100644
--- a/test/CodeGen/MIR/Generic/machine-function-missing-function.mir
+++ b/test/CodeGen/MIR/Generic/machine-function-missing-function.mir
@@ -12,12 +12,8 @@
 ...
 ---
 name:            foo
-body: |
-  bb.0:
 ...
 ---
 # CHECK: function 'faa' isn't defined in the provided LLVM IR
 name:            faa
-body: |
-  bb.0:
 ...
diff --git a/test/CodeGen/MIR/Generic/machine-function-missing-name.mir b/test/CodeGen/MIR/Generic/machine-function-missing-name.mir
index 30f0e51b3b663bd764c2b761afe5bb9bf90d57cb..bc279a6ecfdc7cd4ef6c99f3445d52d5a3abe16c 100644
--- a/test/CodeGen/MIR/Generic/machine-function-missing-name.mir
+++ b/test/CodeGen/MIR/Generic/machine-function-missing-name.mir
@@ -16,11 +16,7 @@
 ---
 # CHECK: [[@LINE+1]]:1: missing required key 'name'
 nme:             foo
-body: |
-  bb.0:
 ...
 ---
 name:            bar
-body: |
-  bb.0:
 ...
diff --git a/test/CodeGen/MIR/Generic/machine-function.mir b/test/CodeGen/MIR/Generic/machine-function.mir
index f9001cca4c26b1de463282bfb14845808079dd34..9c19b980e675fe5bad8f7e760cc4fb5d6776e913 100644
--- a/test/CodeGen/MIR/Generic/machine-function.mir
+++ b/test/CodeGen/MIR/Generic/machine-function.mir
@@ -18,7 +18,7 @@
   define i32 @func2() {
     ret i32 0
   }
-  
+
 ...
 ---
 # CHECK: name: foo
@@ -26,8 +26,6 @@
 # CHECK-NEXT: exposesReturnsTwice: false
 # CHECK: ...
 name:            foo
-body: |
-  bb.0:
 ...
 ---
 # CHECK: name: bar
@@ -35,8 +33,6 @@ body: |
 # CHECK-NEXT: exposesReturnsTwice: false
 # CHECK: ...
 name:            bar
-body: |
-  bb.0:
 ...
 ---
 # CHECK: name: func
@@ -45,8 +41,6 @@ body: |
 # CHECK: ...
 name:            func
 alignment:       8
-body: |
-  bb.0:
 ...
 ---
 # CHECK: name: func2
@@ -56,6 +50,4 @@ body: |
 name:            func2
 alignment:       16
 exposesReturnsTwice: true
-body: |
-  bb.0:
 ...
diff --git a/test/CodeGen/MIR/Generic/register-info.mir b/test/CodeGen/MIR/Generic/register-info.mir
index af3f44f9abcc45f403cfe461eea1de0e71db432d..84a6125abe883d8ae39f285ad5d574083a58c4a1 100644
--- a/test/CodeGen/MIR/Generic/register-info.mir
+++ b/test/CodeGen/MIR/Generic/register-info.mir
@@ -20,8 +20,6 @@
 # CHECK: tracksRegLiveness: false
 # CHECK: ...
 name:            foo
-body: |
-  bb.0:
 ...
 ---
 # CHECK: name: bar
@@ -29,6 +27,4 @@ body: |
 # CHECK: ...
 name: bar
 tracksRegLiveness: true
-body: |
-  bb.0:
 ...
diff --git a/test/CodeGen/MIR/Generic/runPass.mir b/test/CodeGen/MIR/Generic/runPass.mir
index bf37bdd1836b9a9e022afca749aaf7437109c561..eeef9d526510d7a52b588b17f921efc9cb31dffb 100644
--- a/test/CodeGen/MIR/Generic/runPass.mir
+++ b/test/CodeGen/MIR/Generic/runPass.mir
@@ -1,4 +1,4 @@
-# RUN: llc -run-pass=greedy -debug-pass=Arguments -o - %s 2>&1 | FileCheck %s
+# RUN: llc -run-pass=greedy -debug-pass=Arguments -o - %s | FileCheck %s
 
 # Check that passes are initialized correctly, so that it's possible to
 # use -run-pass.
diff --git a/test/CodeGen/MIR/X86/dynamic-regmask.ll b/test/CodeGen/MIR/X86/dynamic-regmask.ll
new file mode 100644
index 0000000000000000000000000000000000000000..df58f4be79d759fcb64d47ebdb2271e850b8934f
--- /dev/null
+++ b/test/CodeGen/MIR/X86/dynamic-regmask.ll
@@ -0,0 +1,30 @@
+; RUN: llc -mtriple=x86_64-pc-win32 -stop-after machine-sink %s -o %t.mir
+; RUN: FileCheck %s < %t.mir
+; RUN: llc %t.mir -mtriple=x86_64-pc-win32 -run-pass machine-sink
+; Check that callee saved registers are printed in a format that can then be parsed.
+
+declare x86_regcallcc i32 @callee(i32 %a0, i32 %b0, i32 %c0, i32 %d0, i32 %e0)
+
+define i32 @caller(i32 %a0) nounwind {
+  %b1 = call x86_regcallcc i32 @callee(i32 %a0, i32 %a0, i32 %a0, i32 %a0, i32 %a0)
+  %b2 = add i32 %b1, %a0
+  ret i32 %b2
+}
+; CHECK:    name: caller
+; CHECK:    CALL64pcrel32 @callee, CustomRegMask(%bh,%bl,%bp,%bpl,%bx,%ebp,%ebx,%esp,%rbp,%rbx,%rsp,%sp,%spl,%r10,%r11,%r12,%r13,%r14,%r15,%xmm8,%xmm9,%xmm10,%xmm11,%xmm12,%xmm13,%xmm14,%xmm15,%r10b,%r11b,%r12b,%r13b,%r14b,%r15b,%r10d,%r11d,%r12d,%r13d,%r14d,%r15d,%r10w,%r11w,%r12w,%r13w,%r14w,%r15w)
+; CHECK:    RET 0, %eax
+
+define x86_regcallcc {i32, i32, i32} @test_callee(i32 %a0, i32 %b0, i32 %c0, i32 %d0, i32 %e0) nounwind {
+  %b1 = mul i32 7, %e0
+  %b2 = udiv i32 5, %e0
+  %b3 = mul i32 7, %d0
+  %b4 = insertvalue {i32, i32, i32} undef, i32 %b1, 0
+  %b5 = insertvalue {i32, i32, i32} %b4, i32 %b2, 1
+  %b6 = insertvalue {i32, i32, i32} %b5, i32 %b3, 2
+  ret {i32, i32, i32} %b6
+}
+; CHECK: name:            test_callee
+; CHECK: calleeSavedRegisters: [ '%rbx', '%rbp', '%rsp', '%r10', '%r11', '%r12',
+; CHECK:                         '%r13', '%r14', '%r15', '%xmm8', '%xmm9', '%xmm10',
+; CHECK:                         '%xmm11', '%xmm12', '%xmm13', '%xmm14', '%xmm15' ]
+; CHECK: RET 0, %eax, %ecx, %edx
diff --git a/test/CodeGen/MIR/X86/expected-named-register-in-allocation-hint.mir b/test/CodeGen/MIR/X86/expected-named-register-in-allocation-hint.mir
index 5e7dde26769b228884162f6af3548c549545dae3..9847d027ee023c710cb607e6e64ae90e185e12bc 100644
--- a/test/CodeGen/MIR/X86/expected-named-register-in-allocation-hint.mir
+++ b/test/CodeGen/MIR/X86/expected-named-register-in-allocation-hint.mir
@@ -1,4 +1,4 @@
-# RUN: llc -march=x86-64 -run-pass none -o - %s 2>&1 | FileCheck %s
+# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
 
 --- |
 
diff --git a/test/CodeGen/MIR/X86/expected-size-integer-after-memory-operation.mir b/test/CodeGen/MIR/X86/expected-size-integer-after-memory-operation.mir
index cfa03247e31f8105502641e2364bc400e4b277e0..57e11d39723a1ff606badbc172287c26d613eab1 100644
--- a/test/CodeGen/MIR/X86/expected-size-integer-after-memory-operation.mir
+++ b/test/CodeGen/MIR/X86/expected-size-integer-after-memory-operation.mir
@@ -17,7 +17,7 @@ liveins:
 body: |
   bb.0.entry:
     liveins: %rdi
-  ; CHECK: [[@LINE+1]]:53: expected the size integer literal after memory operation
+  ; CHECK: [[@LINE+1]]:53: expected an atomic scope, ordering or a size integer literal
     %eax = MOV32rm killed %rdi, 1, _, 0, _ :: (load from %ir.a)
     RETQ %eax
 ...
diff --git a/test/CodeGen/MIR/X86/used-physical-register-info.mir b/test/CodeGen/MIR/X86/used-physical-register-info.mir
deleted file mode 100644
index 9edc4113b27915c4923b2c114f3d3b736db7e4ea..0000000000000000000000000000000000000000
--- a/test/CodeGen/MIR/X86/used-physical-register-info.mir
+++ /dev/null
@@ -1,109 +0,0 @@
-# RUN: llc -march=x86-64 -run-pass none -o - %s | FileCheck %s
-# This test ensures that the MIR parser parses the callee saved register mask
-# correctly and that the MIR parser can infer it as well.
-
---- |
-
-  define i32 @compute(i32 %a) #0 {
-  body:
-    %c = mul i32 %a, 11
-    ret i32 %c
-  }
-
-  define i32 @foo(i32 %a) #0 {
-  entry:
-    %b = call i32 @compute(i32 %a)
-    ret i32 %b
-  }
-
-  define i32 @bar(i32 %a) #0 {
-  entry:
-    %b = call i32 @compute(i32 %a)
-    ret i32 %b
-  }
-
-  define i32 @empty(i32 %a) #0 {
-  entry:
-    %b = call i32 @compute(i32 %a)
-    ret i32 %b
-  }
-
-  attributes #0 = { "no-frame-pointer-elim"="false" }
-
-...
----
-# CHECK: name: compute
-# CHECK: liveins:
-# CHECK-NEXT: - { reg: '%edi' }
-# CHECK-NEXT: frameInfo:
-name:            compute
-liveins:
-  - { reg: '%edi' }
-frameInfo:
-  stackSize:     8
-body: |
-  bb.0.body:
-    liveins: %edi
-
-    %eax = IMUL32rri8 %edi, 11, implicit-def %eflags
-    RETQ %eax
-...
----
-name:            foo
-liveins:
-  - { reg: '%edi' }
-# CHECK: name: foo
-# CHECK: calleeSavedRegisters: [ '%bh', '%bl', '%bp', '%bpl', '%bx', '%ebp', '%ebx',
-# CHECK-NEXT:                    '%rbp', '%rbx', '%r12', '%r13', '%r14', '%r15',
-# CHECK-NEXT:                    '%r12b', '%r13b', '%r14b', '%r15b', '%r12d', '%r13d',
-# CHECK-NEXT:                    '%r14d', '%r15d', '%r12w', '%r13w', '%r14w', '%r15w' ]
-calleeSavedRegisters: [ '%bh', '%bl', '%bp', '%bpl', '%bx', '%ebp', '%ebx',
-                        '%rbp', '%rbx', '%r12', '%r13', '%r14', '%r15',
-                        '%r12b', '%r13b', '%r14b', '%r15b', '%r12d', '%r13d',
-                        '%r14d', '%r15d', '%r12w', '%r13w', '%r14w', '%r15w' ]
-body: |
-  bb.0.entry:
-    liveins: %edi
-
-    PUSH64r %rax, implicit-def %rsp, implicit %rsp
-    CALL64pcrel32 @compute, csr_64, implicit %rsp, implicit %edi, implicit-def %rsp, implicit-def %eax
-    %rdx = POP64r implicit-def %rsp, implicit %rsp
-    RETQ %eax
-...
----
-name:            bar
-liveins:
-  - { reg: '%edi' }
-# Verify that the callee saved register can be inferred from register mask
-# machine operands:
-# CHECK: name: bar
-# CHECK: calleeSavedRegisters: [ '%bh', '%bl', '%bp', '%bpl', '%bx', '%ebp', '%ebx',
-# CHECK-NEXT:                    '%rbp', '%rbx', '%r12', '%r13', '%r14', '%r15',
-# CHECK-NEXT:                    '%r12b', '%r13b', '%r14b', '%r15b', '%r12d', '%r13d',
-# CHECK-NEXT:                    '%r14d', '%r15d', '%r12w', '%r13w', '%r14w', '%r15w' ]
-body: |
-  bb.0.entry:
-    liveins: %edi
-
-    PUSH64r %rax, implicit-def %rsp, implicit %rsp
-    CALL64pcrel32 @compute, csr_64, implicit %rsp, implicit %edi, implicit-def %rsp, implicit-def %eax
-    %rdx = POP64r implicit-def %rsp, implicit %rsp
-    RETQ %eax
-...
----
-name:            empty
-liveins:
-  - { reg: '%edi' }
-# Verify that the callee saved register can be empty.
-# CHECK: name: empty
-# CHECK: calleeSavedRegisters: [ ]
-calleeSavedRegisters: [ ]
-body: |
-  bb.0.entry:
-    liveins: %edi
-
-    PUSH64r %rax, implicit-def %rsp, implicit %rsp
-    CALL64pcrel32 @compute, csr_64, implicit %rsp, implicit %edi, implicit-def %rsp, implicit-def %eax
-    %rdx = POP64r implicit-def %rsp, implicit %rsp
-    RETQ %eax
-...
diff --git a/test/CodeGen/MSP430/AddrMode-bis-rx.ll b/test/CodeGen/MSP430/AddrMode-bis-rx.ll
index 941ee2dc2ce924e42076d60b360587640f1a6b12..f4cb30f2d014c38a358524fec3337d79fefeadfb 100644
--- a/test/CodeGen/MSP430/AddrMode-bis-rx.ll
+++ b/test/CodeGen/MSP430/AddrMode-bis-rx.ll
@@ -8,7 +8,7 @@ define i16 @am1(i16 %x, i16* %a) nounwind {
 	ret i16 %2
 }
 ; CHECK-LABEL: am1:
-; CHECK:		bis.w	0(r14), r15
+; CHECK:		bis.w	0(r13), r12
 
 @foo = external global i16
 
@@ -18,7 +18,7 @@ define i16 @am2(i16 %x) nounwind {
 	ret i16 %2
 }
 ; CHECK-LABEL: am2:
-; CHECK:		bis.w	&foo, r15
+; CHECK:		bis.w	&foo, r12
 
 @bar = internal constant [2 x i8] [ i8 32, i8 64 ]
 
@@ -29,7 +29,7 @@ define i8 @am3(i8 %x, i16 %n) nounwind {
 	ret i8 %3
 }
 ; CHECK-LABEL: am3:
-; CHECK:		bis.b	bar(r14), r15
+; CHECK:		bis.b	bar(r13), r12
 
 define i16 @am4(i16 %x) nounwind {
 	%1 = load volatile i16, i16* inttoptr(i16 32 to i16*)
@@ -37,7 +37,7 @@ define i16 @am4(i16 %x) nounwind {
 	ret i16 %2
 }
 ; CHECK-LABEL: am4:
-; CHECK:		bis.w	&32, r15
+; CHECK:		bis.w	&32, r12
 
 define i16 @am5(i16 %x, i16* %a) nounwind {
 	%1 = getelementptr i16, i16* %a, i16 2
@@ -46,7 +46,7 @@ define i16 @am5(i16 %x, i16* %a) nounwind {
 	ret i16 %3
 }
 ; CHECK-LABEL: am5:
-; CHECK:		bis.w	4(r14), r15
+; CHECK:		bis.w	4(r13), r12
 
 %S = type { i16, i16 }
 @baz = common global %S zeroinitializer, align 1
@@ -57,7 +57,7 @@ define i16 @am6(i16 %x) nounwind {
 	ret i16 %2
 }
 ; CHECK-LABEL: am6:
-; CHECK:		bis.w	&baz+2, r15
+; CHECK:		bis.w	&baz+2, r12
 
 %T = type { i16, [2 x i8] }
 @duh = internal constant %T { i16 16, [2 x i8][i8 32, i8 64 ] }
@@ -70,5 +70,5 @@ define i8 @am7(i8 %x, i16 %n) nounwind {
 	ret i8 %4
 }
 ; CHECK-LABEL: am7:
-; CHECK:		bis.b	duh+2(r14), r15
+; CHECK:		bis.b	duh+2(r13), r12
 
diff --git a/test/CodeGen/MSP430/AddrMode-bis-xr.ll b/test/CodeGen/MSP430/AddrMode-bis-xr.ll
index 4b8f367a8880ed4044c76ccc19ab097e0be01cf5..1e150f382062ccc32ed550377941ba0142e85737 100644
--- a/test/CodeGen/MSP430/AddrMode-bis-xr.ll
+++ b/test/CodeGen/MSP430/AddrMode-bis-xr.ll
@@ -9,7 +9,7 @@ define void @am1(i16* %a, i16 %x) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am1:
-; CHECK:		bis.w	r14, 0(r15)
+; CHECK:		bis.w	r13, 0(r12)
 
 @foo = external global i16
 
@@ -20,7 +20,7 @@ define void @am2(i16 %x) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am2:
-; CHECK:		bis.w	r15, &foo
+; CHECK:		bis.w	r12, &foo
 
 @bar = external global [2 x i8]
 
@@ -32,7 +32,7 @@ define void @am3(i16 %i, i8 %x) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am3:
-; CHECK:		bis.b	r14, bar(r15)
+; CHECK:		bis.b	r13, bar(r12)
 
 define void @am4(i16 %x) nounwind {
 	%1 = load volatile i16, i16* inttoptr(i16 32 to i16*)
@@ -41,7 +41,7 @@ define void @am4(i16 %x) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am4:
-; CHECK:		bis.w	r15, &32
+; CHECK:		bis.w	r12, &32
 
 define void @am5(i16* %a, i16 %x) readonly {
 	%1 = getelementptr inbounds i16, i16* %a, i16 2
@@ -51,7 +51,7 @@ define void @am5(i16* %a, i16 %x) readonly {
 	ret void
 }
 ; CHECK-LABEL: am5:
-; CHECK:		bis.w	r14, 4(r15)
+; CHECK:		bis.w	r13, 4(r12)
 
 %S = type { i16, i16 }
 @baz = common global %S zeroinitializer
@@ -63,7 +63,7 @@ define void @am6(i16 %x) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am6:
-; CHECK:		bis.w	r15, &baz+2
+; CHECK:		bis.w	r12, &baz+2
 
 %T = type { i16, [2 x i8] }
 @duh = external global %T
@@ -77,5 +77,5 @@ define void @am7(i16 %n, i8 %x) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am7:
-; CHECK:		bis.b	r14, duh+2(r15)
+; CHECK:		bis.b	r13, duh+2(r12)
 
diff --git a/test/CodeGen/MSP430/AddrMode-mov-rx.ll b/test/CodeGen/MSP430/AddrMode-mov-rx.ll
index cdee931bf96de9e4d4b6b869d0e31c82c3111fc1..808aca0ea10b549ce04cec2bac8cfeedbf1308fa 100644
--- a/test/CodeGen/MSP430/AddrMode-mov-rx.ll
+++ b/test/CodeGen/MSP430/AddrMode-mov-rx.ll
@@ -7,7 +7,7 @@ define i16 @am1(i16* %a) nounwind {
 	ret i16 %1
 }
 ; CHECK-LABEL: am1:
-; CHECK:		mov.w	0(r15), r15
+; CHECK:		mov.w	0(r12), r12
 
 @foo = external global i16
 
@@ -16,7 +16,7 @@ define i16 @am2() nounwind {
 	ret i16 %1
 }
 ; CHECK-LABEL: am2:
-; CHECK:		mov.w	&foo, r15
+; CHECK:		mov.w	&foo, r12
 
 @bar = internal constant [2 x i8] [ i8 32, i8 64 ]
 
@@ -26,14 +26,14 @@ define i8 @am3(i16 %n) nounwind {
 	ret i8 %2
 }
 ; CHECK-LABEL: am3:
-; CHECK:		mov.b	bar(r15), r15
+; CHECK:		mov.b	bar(r12), r12
 
 define i16 @am4() nounwind {
 	%1 = load volatile i16, i16* inttoptr(i16 32 to i16*)
 	ret i16 %1
 }
 ; CHECK-LABEL: am4:
-; CHECK:		mov.w	&32, r15
+; CHECK:		mov.w	&32, r12
 
 define i16 @am5(i16* %a) nounwind {
 	%1 = getelementptr i16, i16* %a, i16 2
@@ -41,7 +41,7 @@ define i16 @am5(i16* %a) nounwind {
 	ret i16 %2
 }
 ; CHECK-LABEL: am5:
-; CHECK:		mov.w	4(r15), r15
+; CHECK:		mov.w	4(r12), r12
 
 %S = type { i16, i16 }
 @baz = common global %S zeroinitializer, align 1
@@ -51,7 +51,7 @@ define i16 @am6() nounwind {
 	ret i16 %1
 }
 ; CHECK-LABEL: am6:
-; CHECK:		mov.w	&baz+2, r15
+; CHECK:		mov.w	&baz+2, r12
 
 %T = type { i16, [2 x i8] }
 @duh = internal constant %T { i16 16, [2 x i8][i8 32, i8 64 ] }
@@ -63,5 +63,5 @@ define i8 @am7(i16 %n) nounwind {
 	ret i8 %3
 }
 ; CHECK-LABEL: am7:
-; CHECK:		mov.b	duh+2(r15), r15
+; CHECK:		mov.b	duh+2(r12), r12
 
diff --git a/test/CodeGen/MSP430/AddrMode-mov-xr.ll b/test/CodeGen/MSP430/AddrMode-mov-xr.ll
index ccb42886e9b424c0c7b7cc75bbdb31981af4dc78..c336289a60d730350318b51019237914eec313ad 100644
--- a/test/CodeGen/MSP430/AddrMode-mov-xr.ll
+++ b/test/CodeGen/MSP430/AddrMode-mov-xr.ll
@@ -7,7 +7,7 @@ define void @am1(i16* %a, i16 %b) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am1:
-; CHECK:		mov.w	r14, 0(r15)
+; CHECK:		mov.w	r13, 0(r12)
 
 @foo = external global i16
 
@@ -16,7 +16,7 @@ define void @am2(i16 %a) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am2:
-; CHECK:		mov.w	r15, &foo
+; CHECK:		mov.w	r12, &foo
 
 @bar = external global [2 x i8]
 
@@ -26,14 +26,14 @@ define void @am3(i16 %i, i8 %a) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am3:
-; CHECK:		mov.b	r14, bar(r15)
+; CHECK:		mov.b	r13, bar(r12)
 
 define void @am4(i16 %a) nounwind {
 	store volatile i16 %a, i16* inttoptr(i16 32 to i16*)
 	ret void
 }
 ; CHECK-LABEL: am4:
-; CHECK:		mov.w	r15, &32
+; CHECK:		mov.w	r12, &32
 
 define void @am5(i16* nocapture %p, i16 %a) nounwind readonly {
 	%1 = getelementptr inbounds i16, i16* %p, i16 2
@@ -41,7 +41,7 @@ define void @am5(i16* nocapture %p, i16 %a) nounwind readonly {
 	ret void
 }
 ; CHECK-LABEL: am5:
-; CHECK:		mov.w	r14, 4(r15)
+; CHECK:		mov.w	r13, 4(r12)
 
 %S = type { i16, i16 }
 @baz = common global %S zeroinitializer, align 1
@@ -51,7 +51,7 @@ define void @am6(i16 %a) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am6:
-; CHECK:		mov.w	r15, &baz+2
+; CHECK:		mov.w	r12, &baz+2
 
 %T = type { i16, [2 x i8] }
 @duh = external global %T
@@ -63,5 +63,5 @@ define void @am7(i16 %n, i8 %a) nounwind {
 	ret void
 }
 ; CHECK-LABEL: am7:
-; CHECK:		mov.b	r14, duh+2(r15)
+; CHECK:		mov.b	r13, duh+2(r12)
 
diff --git a/test/CodeGen/MSP430/Inst16mm.ll b/test/CodeGen/MSP430/Inst16mm.ll
index c75e1beb2356efe40ab46adf46253b007a1a14f6..a48d8592c1a6893709e7a7cb9d986da228ff6d54 100644
--- a/test/CodeGen/MSP430/Inst16mm.ll
+++ b/test/CodeGen/MSP430/Inst16mm.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=msp430 -combiner-alias-analysis < %s | FileCheck %s
+; RUN: llc -march=msp430 < %s | FileCheck %s
 target datalayout = "e-p:16:8:8-i8:8:8-i16:8:8-i32:8:8"
 target triple = "msp430-generic-generic"
 @foo = common global i16 0, align 2
diff --git a/test/CodeGen/MSP430/Inst16mr.ll b/test/CodeGen/MSP430/Inst16mr.ll
index 50dc4c0b673105cf16b11fcd5e3f0f79b2ebd0d9..847c093f4088c2dbcffa890f4920a310d2214a3c 100644
--- a/test/CodeGen/MSP430/Inst16mr.ll
+++ b/test/CodeGen/MSP430/Inst16mr.ll
@@ -5,14 +5,14 @@ target triple = "msp430-generic-generic"
 
 define void @mov(i16 %a) nounwind {
 ; CHECK-LABEL: mov:
-; CHECK: mov.w	r15, &foo
+; CHECK: mov.w	r12, &foo
 	store i16 %a, i16* @foo
 	ret void
 }
 
 define void @add(i16 %a) nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.w	r15, &foo
+; CHECK: add.w	r12, &foo
 	%1 = load i16, i16* @foo
 	%2 = add i16 %a, %1
 	store i16 %2, i16* @foo
@@ -21,7 +21,7 @@ define void @add(i16 %a) nounwind {
 
 define void @and(i16 %a) nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.w	r15, &foo
+; CHECK: and.w	r12, &foo
 	%1 = load i16, i16* @foo
 	%2 = and i16 %a, %1
 	store i16 %2, i16* @foo
@@ -30,7 +30,7 @@ define void @and(i16 %a) nounwind {
 
 define void @bis(i16 %a) nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.w	r15, &foo
+; CHECK: bis.w	r12, &foo
 	%1 = load i16, i16* @foo
 	%2 = or i16 %a, %1
 	store i16 %2, i16* @foo
@@ -39,7 +39,7 @@ define void @bis(i16 %a) nounwind {
 
 define void @bic(i16 zeroext %m) nounwind {
 ; CHECK-LABEL: bic:
-; CHECK: bic.w   r15, &foo
+; CHECK: bic.w   r12, &foo
         %1 = xor i16 %m, -1
         %2 = load i16, i16* @foo
         %3 = and i16 %2, %1
@@ -49,7 +49,7 @@ define void @bic(i16 zeroext %m) nounwind {
 
 define void @xor(i16 %a) nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.w	r15, &foo
+; CHECK: xor.w	r12, &foo
 	%1 = load i16, i16* @foo
 	%2 = xor i16 %a, %1
 	store i16 %2, i16* @foo
diff --git a/test/CodeGen/MSP430/Inst16ri.ll b/test/CodeGen/MSP430/Inst16ri.ll
index f89f686ab567c951871bd82de3ab156887f5ea2a..3a4bb6a93d995a8551e3706d97a2a89dba86675b 100644
--- a/test/CodeGen/MSP430/Inst16ri.ll
+++ b/test/CodeGen/MSP430/Inst16ri.ll
@@ -4,34 +4,34 @@ target triple = "msp430-generic-generic"
 
 define i16 @mov() nounwind {
 ; CHECK-LABEL: mov:
-; CHECK: mov.w	#1, r15
+; CHECK: mov.w	#1, r12
 	ret i16 1
 }
 
 define i16 @add(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.w	#1, r15
+; CHECK: add.w	#1, r12
 	%1 = add i16 %a, 1
 	ret i16 %1
 }
 
 define i16 @and(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.w	#1, r15
+; CHECK: and.w	#1, r12
 	%1 = and i16 %a, 1
 	ret i16 %1
 }
 
 define i16 @bis(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.w	#1, r15
+; CHECK: bis.w	#1, r12
 	%1 = or i16 %a, 1
 	ret i16 %1
 }
 
 define i16 @xor(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.w	#1, r15
+; CHECK: xor.w	#1, r12
 	%1 = xor i16 %a, 1
 	ret i16 %1
 }
diff --git a/test/CodeGen/MSP430/Inst16rm.ll b/test/CodeGen/MSP430/Inst16rm.ll
index 4f6998ee68dfe9d73b8cf09efd8c38d0de25a8be..44b8f39d8fa625b808f5989d8a18effca5788494 100644
--- a/test/CodeGen/MSP430/Inst16rm.ll
+++ b/test/CodeGen/MSP430/Inst16rm.ll
@@ -5,7 +5,7 @@ target triple = "msp430-generic-generic"
 
 define i16 @add(i16 %a) nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.w	&foo, r15
+; CHECK: add.w	&foo, r12
 	%1 = load i16, i16* @foo
 	%2 = add i16 %a, %1
 	ret i16 %2
@@ -13,7 +13,7 @@ define i16 @add(i16 %a) nounwind {
 
 define i16 @and(i16 %a) nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.w	&foo, r15
+; CHECK: and.w	&foo, r12
 	%1 = load i16, i16* @foo
 	%2 = and i16 %a, %1
 	ret i16 %2
@@ -21,7 +21,7 @@ define i16 @and(i16 %a) nounwind {
 
 define i16 @bis(i16 %a) nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.w	&foo, r15
+; CHECK: bis.w	&foo, r12
 	%1 = load i16, i16* @foo
 	%2 = or i16 %a, %1
 	ret i16 %2
@@ -29,7 +29,7 @@ define i16 @bis(i16 %a) nounwind {
 
 define i16  @bic(i16 %a) nounwind {
 ; CHECK-LABEL: bic:
-; CHECK: bic.w	&foo, r15
+; CHECK: bic.w	&foo, r12
         %1 = load i16, i16* @foo
         %2 = xor i16 %1, -1
         %3 = and i16 %a, %2
@@ -38,7 +38,7 @@ define i16  @bic(i16 %a) nounwind {
 
 define i16 @xor(i16 %a) nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.w	&foo, r15
+; CHECK: xor.w	&foo, r12
 	%1 = load i16, i16* @foo
 	%2 = xor i16 %a, %1
 	ret i16 %2
diff --git a/test/CodeGen/MSP430/Inst16rr.ll b/test/CodeGen/MSP430/Inst16rr.ll
index d74bfae9b938b9d00818bf2da07d83966eb9eeb3..75440ca2b403a78b5114a3c6c5412d705899f7eb 100644
--- a/test/CodeGen/MSP430/Inst16rr.ll
+++ b/test/CodeGen/MSP430/Inst16rr.ll
@@ -4,34 +4,34 @@ target triple = "msp430-generic-generic"
 
 define i16 @mov(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: mov:
-; CHECK: mov.w	r14, r15
+; CHECK: mov.w	r13, r12
 	ret i16 %b
 }
 
 define i16 @add(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.w	r14, r15
+; CHECK: add.w	r13, r12
 	%1 = add i16 %a, %b
 	ret i16 %1
 }
 
 define i16 @and(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.w	r14, r15
+; CHECK: and.w	r13, r12
 	%1 = and i16 %a, %b
 	ret i16 %1
 }
 
 define i16 @bis(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.w	r14, r15
+; CHECK: bis.w	r13, r12
 	%1 = or i16 %a, %b
 	ret i16 %1
 }
 
 define i16 @bic(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: bic:
-; CHECK: bic.w	r14, r15
+; CHECK: bic.w	r13, r12
         %1 = xor i16 %b, -1
         %2 = and i16 %a, %1
         ret i16 %2
@@ -39,7 +39,7 @@ define i16 @bic(i16 %a, i16 %b) nounwind {
 
 define i16 @xor(i16 %a, i16 %b) nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.w	r14, r15
+; CHECK: xor.w	r13, r12
 	%1 = xor i16 %a, %b
 	ret i16 %1
 }
diff --git a/test/CodeGen/MSP430/Inst8mr.ll b/test/CodeGen/MSP430/Inst8mr.ll
index f03c7e1a659b005c4f8c61ac86300888954c6de4..7fbdff257fe7c5eaa26978a286947a3e4ca05eef 100644
--- a/test/CodeGen/MSP430/Inst8mr.ll
+++ b/test/CodeGen/MSP430/Inst8mr.ll
@@ -5,14 +5,14 @@ target triple = "msp430-generic-generic"
 
 define void @mov(i8 %a) nounwind {
 ; CHECK-LABEL: mov:
-; CHECK: mov.b	r15, &foo
+; CHECK: mov.b	r12, &foo
 	store i8 %a, i8* @foo
 	ret void
 }
 
 define void @and(i8 %a) nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.b	r15, &foo
+; CHECK: and.b	r12, &foo
 	%1 = load i8, i8* @foo
 	%2 = and i8 %a, %1
 	store i8 %2, i8* @foo
@@ -21,7 +21,7 @@ define void @and(i8 %a) nounwind {
 
 define void @add(i8 %a) nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.b	r15, &foo
+; CHECK: add.b	r12, &foo
 	%1 = load i8, i8* @foo
 	%2 = add i8 %a, %1
 	store i8 %2, i8* @foo
@@ -30,7 +30,7 @@ define void @add(i8 %a) nounwind {
 
 define void @bis(i8 %a) nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.b	r15, &foo
+; CHECK: bis.b	r12, &foo
 	%1 = load i8, i8* @foo
 	%2 = or i8 %a, %1
 	store i8 %2, i8* @foo
@@ -39,7 +39,7 @@ define void @bis(i8 %a) nounwind {
 
 define void @bic(i8 zeroext %m) nounwind {
 ; CHECK-LABEL: bic:
-; CHECK: bic.b   r15, &foo
+; CHECK: bic.b   r12, &foo
         %1 = xor i8 %m, -1
         %2 = load i8, i8* @foo
         %3 = and i8 %2, %1
@@ -49,7 +49,7 @@ define void @bic(i8 zeroext %m) nounwind {
 
 define void @xor(i8 %a) nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.b	r15, &foo
+; CHECK: xor.b	r12, &foo
 	%1 = load i8, i8* @foo
 	%2 = xor i8 %a, %1
 	store i8 %2, i8* @foo
diff --git a/test/CodeGen/MSP430/Inst8ri.ll b/test/CodeGen/MSP430/Inst8ri.ll
index ec0dff9c563e2e27a58226eeaa928cda5b08a73f..0e50f17f2a550b1e312ea8ed9c4f5fd7ba3d1f2e 100644
--- a/test/CodeGen/MSP430/Inst8ri.ll
+++ b/test/CodeGen/MSP430/Inst8ri.ll
@@ -4,34 +4,34 @@ target triple = "msp430-generic-generic"
 
 define i8 @mov() nounwind {
 ; CHECK-LABEL: mov:
-; CHECK: mov.b	#1, r15
+; CHECK: mov.b	#1, r12
 	ret i8 1
 }
 
 define i8 @add(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.b	#1, r15
+; CHECK: add.b	#1, r12
 	%1 = add i8 %a, 1
 	ret i8 %1
 }
 
 define i8 @and(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.b	#1, r15
+; CHECK: and.b	#1, r12
 	%1 = and i8 %a, 1
 	ret i8 %1
 }
 
 define i8 @bis(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.b	#1, r15
+; CHECK: bis.b	#1, r12
 	%1 = or i8 %a, 1
 	ret i8 %1
 }
 
 define i8 @xor(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.b	#1, r15
+; CHECK: xor.b	#1, r12
 	%1 = xor i8 %a, 1
 	ret i8 %1
 }
diff --git a/test/CodeGen/MSP430/Inst8rm.ll b/test/CodeGen/MSP430/Inst8rm.ll
index e1a9703955787191c85ef7a1a473a5b7127d97ee..826a3c65ec949e31e0b33e7992ff68980631d7f1 100644
--- a/test/CodeGen/MSP430/Inst8rm.ll
+++ b/test/CodeGen/MSP430/Inst8rm.ll
@@ -5,7 +5,7 @@ target triple = "msp430-generic-generic"
 
 define i8 @add(i8 %a) nounwind {
 ; CHECK-LABEL: add:
-; CHECK: add.b	&foo, r15
+; CHECK: add.b	&foo, r12
 	%1 = load i8, i8* @foo
 	%2 = add i8 %a, %1
 	ret i8 %2
@@ -13,7 +13,7 @@ define i8 @add(i8 %a) nounwind {
 
 define i8 @and(i8 %a) nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.b	&foo, r15
+; CHECK: and.b	&foo, r12
 	%1 = load i8, i8* @foo
 	%2 = and i8 %a, %1
 	ret i8 %2
@@ -21,7 +21,7 @@ define i8 @and(i8 %a) nounwind {
 
 define i8 @bis(i8 %a) nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.b	&foo, r15
+; CHECK: bis.b	&foo, r12
 	%1 = load i8, i8* @foo
 	%2 = or i8 %a, %1
 	ret i8 %2
@@ -29,7 +29,7 @@ define i8 @bis(i8 %a) nounwind {
 
 define i8  @bic(i8 %a) nounwind {
 ; CHECK-LABEL: bic:
-; CHECK: bic.b  &foo, r15
+; CHECK: bic.b  &foo, r12
         %1 = load i8, i8* @foo
         %2 = xor i8 %1, -1
         %3 = and i8 %a, %2
@@ -38,7 +38,7 @@ define i8  @bic(i8 %a) nounwind {
 
 define i8 @xor(i8 %a) nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.b	&foo, r15
+; CHECK: xor.b	&foo, r12
 	%1 = load i8, i8* @foo
 	%2 = xor i8 %a, %1
 	ret i8 %2
diff --git a/test/CodeGen/MSP430/Inst8rr.ll b/test/CodeGen/MSP430/Inst8rr.ll
index 76e8d19112820cdfaeb2395ca4bb2317fd9d5ed4..f37bc32a28fe1063ecf4b7179414414c459fac32 100644
--- a/test/CodeGen/MSP430/Inst8rr.ll
+++ b/test/CodeGen/MSP430/Inst8rr.ll
@@ -4,7 +4,7 @@ target triple = "msp430-generic-generic"
 
 define i8 @mov(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: mov:
-; CHECK: mov.{{[bw]}} r14, r15
+; CHECK: mov.{{[bw]}} r13, r12
 	ret i8 %b
 }
 
@@ -17,21 +17,21 @@ define i8 @add(i8 %a, i8 %b) nounwind {
 
 define i8 @and(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: and:
-; CHECK: and.w	r14, r15
+; CHECK: and.w	r13, r12
 	%1 = and i8 %a, %b
 	ret i8 %1
 }
 
 define i8 @bis(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: bis:
-; CHECK: bis.w	r14, r15
+; CHECK: bis.w	r13, r12
 	%1 = or i8 %a, %b
 	ret i8 %1
 }
 
 define i8 @bic(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: bic:
-; CHECK: bic.b  r14, r15
+; CHECK: bic.b  r13, r12
         %1 = xor i8 %b, -1
         %2 = and i8 %a, %1
         ret i8 %2
@@ -39,7 +39,7 @@ define i8 @bic(i8 %a, i8 %b) nounwind {
 
 define i8 @xor(i8 %a, i8 %b) nounwind {
 ; CHECK-LABEL: xor:
-; CHECK: xor.w	r14, r15
+; CHECK: xor.w	r13, r12
 	%1 = xor i8 %a, %b
 	ret i8 %1
 }
diff --git a/test/CodeGen/MSP430/bit.ll b/test/CodeGen/MSP430/bit.ll
index 45964f97f1bf86a6a8110ea3cc6d54384a3358a1..172822fbb5fef3fc7ffd40e8853ee0a8cd6bb3b8 100644
--- a/test/CodeGen/MSP430/bit.ll
+++ b/test/CodeGen/MSP430/bit.ll
@@ -12,7 +12,7 @@ define i8 @bitbrr(i8 %a, i8 %b) nounwind {
 	ret i8 %t3
 }
 ; CHECK-LABEL: bitbrr:
-; CHECK: bit.b	r14, r15
+; CHECK: bit.b	r13, r12
 
 define i8 @bitbri(i8 %a) nounwind {
 	%t1 = and i8 %a, 15
@@ -21,7 +21,7 @@ define i8 @bitbri(i8 %a) nounwind {
 	ret i8 %t3
 }
 ; CHECK-LABEL: bitbri:
-; CHECK: bit.b	#15, r15
+; CHECK: bit.b	#15, r12
 
 define i8 @bitbir(i8 %a) nounwind {
 	%t1 = and i8 15, %a
@@ -30,7 +30,7 @@ define i8 @bitbir(i8 %a) nounwind {
 	ret i8 %t3
 }
 ; CHECK-LABEL: bitbir:
-; CHECK: bit.b	#15, r15
+; CHECK: bit.b	#15, r12
 
 define i8 @bitbmi() nounwind {
 	%t1 = load i8, i8* @foo8
@@ -60,7 +60,7 @@ define i8 @bitbrm(i8 %a) nounwind {
 	ret i8 %t4
 }
 ; CHECK-LABEL: bitbrm:
-; CHECK: bit.b	&foo8, r15
+; CHECK: bit.b	&foo8, r12
 
 define i8 @bitbmr(i8 %a) nounwind {
 	%t1 = load i8, i8* @foo8
@@ -70,7 +70,7 @@ define i8 @bitbmr(i8 %a) nounwind {
 	ret i8 %t4
 }
 ; CHECK-LABEL: bitbmr:
-; CHECK: bit.b	r15, &foo8
+; CHECK: bit.b	r12, &foo8
 
 define i8 @bitbmm() nounwind {
 	%t1 = load i8, i8* @foo8
@@ -93,7 +93,7 @@ define i16 @bitwrr(i16 %a, i16 %b) nounwind {
 	ret i16 %t3
 }
 ; CHECK-LABEL: bitwrr:
-; CHECK: bit.w	r14, r15
+; CHECK: bit.w	r13, r12
 
 define i16 @bitwri(i16 %a) nounwind {
 	%t1 = and i16 %a, 4080
@@ -102,7 +102,7 @@ define i16 @bitwri(i16 %a) nounwind {
 	ret i16 %t3
 }
 ; CHECK-LABEL: bitwri:
-; CHECK: bit.w	#4080, r15
+; CHECK: bit.w	#4080, r12
 
 define i16 @bitwir(i16 %a) nounwind {
 	%t1 = and i16 4080, %a
@@ -111,7 +111,7 @@ define i16 @bitwir(i16 %a) nounwind {
 	ret i16 %t3
 }
 ; CHECK-LABEL: bitwir:
-; CHECK: bit.w	#4080, r15
+; CHECK: bit.w	#4080, r12
 
 define i16 @bitwmi() nounwind {
 	%t1 = load i16, i16* @foo16
@@ -141,7 +141,7 @@ define i16 @bitwrm(i16 %a) nounwind {
 	ret i16 %t4
 }
 ; CHECK-LABEL: bitwrm:
-; CHECK: bit.w	&foo16, r15
+; CHECK: bit.w	&foo16, r12
 
 define i16 @bitwmr(i16 %a) nounwind {
 	%t1 = load i16, i16* @foo16
@@ -151,7 +151,7 @@ define i16 @bitwmr(i16 %a) nounwind {
 	ret i16 %t4
 }
 ; CHECK-LABEL: bitwmr:
-; CHECK: bit.w	r15, &foo16
+; CHECK: bit.w	r12, &foo16
 
 define i16 @bitwmm() nounwind {
 	%t1 = load i16, i16* @foo16
diff --git a/test/CodeGen/MSP430/byval.ll b/test/CodeGen/MSP430/byval.ll
index 410a6b047b6e38a4b46ebb99d87fee481dbca2cd..401896b43c20cc96a564711ec59b324281db3d9e 100644
--- a/test/CodeGen/MSP430/byval.ll
+++ b/test/CodeGen/MSP430/byval.ll
@@ -9,7 +9,7 @@ target triple = "msp430---elf"
 define i16 @callee(%struct.Foo* byval %f) nounwind {
 entry:
 ; CHECK-LABEL: callee:
-; CHECK: mov.w 2(r1), r15
+; CHECK: mov.w 2(r1), r12
   %0 = getelementptr inbounds %struct.Foo, %struct.Foo* %f, i32 0, i32 0
   %1 = load i16, i16* %0, align 2
   ret i16 %1
diff --git a/test/CodeGen/MSP430/cc_args.ll b/test/CodeGen/MSP430/cc_args.ll
index 39e99e2637449be888ad0f960b20145502d8a83b..70ac901f7e4e291fec47b67c13edf3c92ac6fe7c 100644
--- a/test/CodeGen/MSP430/cc_args.ll
+++ b/test/CodeGen/MSP430/cc_args.ll
@@ -7,12 +7,12 @@ define void @test() #0 {
 entry:
 ; CHECK: test:
 
-; CHECK: mov.w #1, r15
+; CHECK: mov.w #1, r12
 ; CHECK: call #f_i16
   call void @f_i16(i16 1)
 
-; CHECK: mov.w #772, r14
-; CHECK: mov.w #258, r15
+; CHECK: mov.w #772, r12
+; CHECK: mov.w #258, r13
 ; CHECK: call #f_i32
   call void @f_i32(i32 16909060)
 
@@ -23,26 +23,34 @@ entry:
 ; CHECK: call #f_i64
   call void @f_i64(i64 72623859790382856)
 
-; CHECK: mov.w #772, r14
-; CHECK: mov.w #258, r15
-; CHECK: mov.w #1800, r12
-; CHECK: mov.w #1286, r13
+; CHECK: mov.w #772, r12
+; CHECK: mov.w #258, r13
+; CHECK: mov.w #1800, r14
+; CHECK: mov.w #1286, r15
 ; CHECK: call #f_i32_i32
   call void @f_i32_i32(i32 16909060, i32 84281096)
 
-; CHECK: mov.w #1, r15
+; CHECK: mov.w #1, r12
 ; CHECK: mov.w #772, r13
 ; CHECK: mov.w #258, r14
-; CHECK: mov.w #2, r12
+; CHECK: mov.w #2, r15
 ; CHECK: call #f_i16_i32_i16
   call void @f_i16_i32_i16(i16 1, i32 16909060, i16 2)
 
-; CHECK: mov.w #2, 8(r1)
+; CHECK: mov.w #1286, 0(r1)
+; CHECK: mov.w #1, r12
+; CHECK: mov.w #772, r13
+; CHECK: mov.w #258, r14
+; CHECK: mov.w #1800, r15
+; CHECK: call #f_i16_i32_i32
+  call void @f_i16_i32_i32(i16 1, i32 16909060, i32 84281096)
+
 ; CHECK: mov.w #258, 6(r1)
 ; CHECK: mov.w #772, 4(r1)
 ; CHECK: mov.w #1286, 2(r1)
 ; CHECK: mov.w #1800, 0(r1)
-; CHECK: mov.w #1, r15
+; CHECK: mov.w #1, r12
+; CHECK: mov.w #2, r13
 ; CHECK: call #f_i16_i64_i16
   call void @f_i16_i64_i16(i16 1, i64 72623859790382856, i16 2)
 
@@ -55,15 +63,15 @@ entry:
 
 define void @f_i16(i16 %a) #0 {
 ; CHECK: f_i16:
-; CHECK: mov.w r15, &g_i16
+; CHECK: mov.w r12, &g_i16
   store volatile i16 %a, i16* @g_i16, align 2
   ret void
 }
 
 define void @f_i32(i32 %a) #0 {
 ; CHECK: f_i32:
-; CHECK: mov.w r15, &g_i32+2
-; CHECK: mov.w r14, &g_i32
+; CHECK: mov.w r13, &g_i32+2
+; CHECK: mov.w r12, &g_i32
   store volatile i32 %a, i32* @g_i32, align 2
   ret void
 }
@@ -80,37 +88,50 @@ define void @f_i64(i64 %a) #0 {
 
 define void @f_i32_i32(i32 %a, i32 %b) #0 {
 ; CHECK: f_i32_i32:
-; CHECK: mov.w r15, &g_i32+2
-; CHECK: mov.w r14, &g_i32
-  store volatile i32 %a, i32* @g_i32, align 2
 ; CHECK: mov.w r13, &g_i32+2
 ; CHECK: mov.w r12, &g_i32
+  store volatile i32 %a, i32* @g_i32, align 2
+; CHECK: mov.w r15, &g_i32+2
+; CHECK: mov.w r14, &g_i32
   store volatile i32 %b, i32* @g_i32, align 2
   ret void
 }
 
+define void @f_i16_i32_i32(i16 %a, i32 %b, i32 %c) #0 {
+; CHECK: f_i16_i32_i32:
+; CHECK: mov.w r12, &g_i16
+  store volatile i16 %a, i16* @g_i16, align 2
+; CHECK: mov.w r14, &g_i32+2
+; CHECK: mov.w r13, &g_i32
+  store volatile i32 %b, i32* @g_i32, align 2
+; CHECK: mov.w r15, &g_i32
+; CHECK: mov.w 4(r4), &g_i32+2
+  store volatile i32 %c, i32* @g_i32, align 2
+  ret void
+}
+
 define void @f_i16_i32_i16(i16 %a, i32 %b, i16 %c) #0 {
 ; CHECK: f_i16_i32_i16:
-; CHECK: mov.w r15, &g_i16
+; CHECK: mov.w r12, &g_i16
   store volatile i16 %a, i16* @g_i16, align 2
 ; CHECK: mov.w r14, &g_i32+2
 ; CHECK: mov.w r13, &g_i32
   store volatile i32 %b, i32* @g_i32, align 2
-; CHECK: mov.w r12, &g_i16
+; CHECK: mov.w r15, &g_i16
   store volatile i16 %c, i16* @g_i16, align 2
   ret void
 }
 
 define void @f_i16_i64_i16(i16 %a, i64 %b, i16 %c) #0 {
 ; CHECK: f_i16_i64_i16:
-; CHECK: mov.w r15, &g_i16
+; CHECK: mov.w r12, &g_i16
   store volatile i16 %a, i16* @g_i16, align 2
 ;CHECK: mov.w 10(r4), &g_i64+6
 ;CHECK: mov.w 8(r4), &g_i64+4
 ;CHECK: mov.w 6(r4), &g_i64+2
 ;CHECK: mov.w 4(r4), &g_i64
   store volatile i64 %b, i64* @g_i64, align 2
-;CHECK: mov.w 12(r4), &g_i16
+;CHECK: mov.w r13, &g_i16
   store volatile i16 %c, i16* @g_i16, align 2
   ret void
 }
diff --git a/test/CodeGen/MSP430/cc_ret.ll b/test/CodeGen/MSP430/cc_ret.ll
index c2a9ae664509d4896200dd94fa2abada824194e0..937db6dbf3bf0a26f534e4281c24ced91eb972f3 100644
--- a/test/CodeGen/MSP430/cc_ret.ll
+++ b/test/CodeGen/MSP430/cc_ret.ll
@@ -8,13 +8,13 @@ entry:
 ; CHECK: test:
 
 ; CHECK: call #f_i16
-; CHECK: mov.w r15, &g_i16
+; CHECK: mov.w r12, &g_i16
   %0 = call i16 @f_i16()
   store volatile i16 %0, i16* @g_i16
 
 ; CHECK: call #f_i32
-; CHECK: mov.w r15, &g_i32+2
-; CHECK: mov.w r14, &g_i32
+; CHECK: mov.w r13, &g_i32+2
+; CHECK: mov.w r12, &g_i32
   %1 = call i32 @f_i32()
   store volatile i32 %1, i32* @g_i32
 
@@ -35,15 +35,15 @@ entry:
 
 define i16 @f_i16() #0 {
 ; CHECK: f_i16:
-; CHECK: mov.w #1, r15
+; CHECK: mov.w #1, r12
 ; CHECK: ret
   ret i16 1
 }
 
 define i32 @f_i32() #0 {
 ; CHECK: f_i32:
-; CHECK: mov.w #772, r14
-; CHECK: mov.w #258, r15
+; CHECK: mov.w #772, r12
+; CHECK: mov.w #258, r13
 ; CHECK: ret
   ret i32 16909060
 }
diff --git a/test/CodeGen/MSP430/jumptable.ll b/test/CodeGen/MSP430/jumptable.ll
index 4ba930b04e39c09be9c559512f0d4d064b62f0e3..5ccdbb701db1fc82de723e195c9372fd08506bd8 100644
--- a/test/CodeGen/MSP430/jumptable.ll
+++ b/test/CodeGen/MSP430/jumptable.ll
@@ -11,9 +11,9 @@ entry:
   %i.addr = alloca i16, align 2
   store i16 %i, i16* %i.addr, align 2
   %0 = load i16, i16* %i.addr, align 2
-; CHECK: mov.w #2, r14
+; CHECK: mov.w #2, r13
 ; CHECK: call #__mulhi3hw_noint
-; CHECK: br .LJTI0_0(r15)
+; CHECK: br .LJTI0_0(r12)
   switch i16 %0, label %sw.default [
     i16 0, label %sw.bb
     i16 1, label %sw.bb1
diff --git a/test/CodeGen/MSP430/memset.ll b/test/CodeGen/MSP430/memset.ll
index 76cfb29586d784dab7842005e76c0123a10e9d31..a24bfafc20051e783c26c2a88d9fb022f97d2ffd 100644
--- a/test/CodeGen/MSP430/memset.ll
+++ b/test/CodeGen/MSP430/memset.ll
@@ -9,9 +9,9 @@ define void @test() nounwind {
 entry:
 ; CHECK-LABEL: test:
   %0 = load i8*, i8** @buf, align 2
-; CHECK: mov.w &buf, r15
-; CHECK-NEXT: mov.w #5, r14
-; CHECK-NEXT: mov.w #128, r13
+; CHECK: mov.w &buf, r12
+; CHECK-NEXT: mov.w #5, r13
+; CHECK-NEXT: mov.w #128, r14
 ; CHECK-NEXT: call #memset
   call void @llvm.memset.p0i8.i16(i8* %0, i8 5, i16 128, i32 1, i1 false)
   ret void
diff --git a/test/CodeGen/MSP430/setcc.ll b/test/CodeGen/MSP430/setcc.ll
index d5a8057ddd6c49e981851fcf65994a4dcf33d5a8..6e2ec8ea3ea1dade9979e54b40fb3c78869b47d2 100644
--- a/test/CodeGen/MSP430/setcc.ll
+++ b/test/CodeGen/MSP430/setcc.ll
@@ -9,10 +9,10 @@ define i16 @sccweqand(i16 %a, i16 %b) nounwind {
 	ret i16 %t3
 }
 ; CHECK-LABEL: sccweqand:
-; CHECK:	bit.w	r14, r15
-; CHECK:	mov.w	r2, r15
-; CHECK:	rra.w   r15
-; CHECK:	and.w	#1, r15
+; CHECK:	bit.w	r13, r12
+; CHECK:	mov.w	r2, r12
+; CHECK:	rra.w   r12
+; CHECK:	and.w	#1, r12
 
 define i16 @sccwneand(i16 %a, i16 %b) nounwind {
 	%t1 = and i16 %a, %b
@@ -21,9 +21,9 @@ define i16 @sccwneand(i16 %a, i16 %b) nounwind {
 	ret i16 %t3
 }
 ; CHECK-LABEL: sccwneand:
-; CHECK: 	bit.w	r14, r15
-; CHECK:	mov.w	r2, r15
-; CHECK:	and.w	#1, r15
+; CHECK: 	bit.w	r13, r12
+; CHECK:	mov.w	r2, r12
+; CHECK:	and.w	#1, r12
 
 define i16 @sccwne(i16 %a, i16 %b) nounwind {
 	%t1 = icmp ne i16 %a, %b
@@ -31,11 +31,11 @@ define i16 @sccwne(i16 %a, i16 %b) nounwind {
 	ret i16 %t2
 }
 ; CHECK-LABEL:sccwne:
-; CHECK:	cmp.w	r14, r15
-; CHECK:	mov.w	r2, r12
-; CHECK:	rra.w	r12
-; CHECK:	mov.w	#1, r15
-; CHECK:	bic.w	r12, r15
+; CHECK:	cmp.w	r13, r12
+; CHECK:	mov.w	r2, r13
+; CHECK:	rra.w	r13
+; CHECK:	mov.w	#1, r12
+; CHECK:	bic.w	r13, r12
 
 define i16 @sccweq(i16 %a, i16 %b) nounwind {
 	%t1 = icmp eq i16 %a, %b
@@ -43,10 +43,10 @@ define i16 @sccweq(i16 %a, i16 %b) nounwind {
 	ret i16 %t2
 }
 ; CHECK-LABEL:sccweq:
-; CHECK:	cmp.w	r14, r15
-; CHECK:	mov.w	r2, r15
-; CHECK:	rra.w	r15
-; CHECK:	and.w	#1, r15
+; CHECK:	cmp.w	r13, r12
+; CHECK:	mov.w	r2, r12
+; CHECK:	rra.w	r12
+; CHECK:	and.w	#1, r12
 
 define i16 @sccwugt(i16 %a, i16 %b) nounwind {
 	%t1 = icmp ugt i16 %a, %b
@@ -54,9 +54,9 @@ define i16 @sccwugt(i16 %a, i16 %b) nounwind {
 	ret i16 %t2
 }
 ; CHECK-LABEL:sccwugt:
-; CHECK:	cmp.w	r15, r14
-; CHECK:	mov.w	#1, r15
-; CHECK:	bic.w	r2, r15
+; CHECK:	cmp.w	r12, r13
+; CHECK:	mov.w	#1, r12
+; CHECK:	bic.w	r2, r12
 
 define i16 @sccwuge(i16 %a, i16 %b) nounwind {
 	%t1 = icmp uge i16 %a, %b
@@ -64,9 +64,9 @@ define i16 @sccwuge(i16 %a, i16 %b) nounwind {
 	ret i16 %t2
 }
 ; CHECK-LABEL:sccwuge:
-; CHECK:	cmp.w	r14, r15
-; CHECK:	mov.w	r2, r15
-; CHECK:	and.w	#1, r15
+; CHECK:	cmp.w	r13, r12
+; CHECK:	mov.w	r2, r12
+; CHECK:	and.w	#1, r12
 
 define i16 @sccwult(i16 %a, i16 %b) nounwind {
 	%t1 = icmp ult i16 %a, %b
@@ -74,9 +74,9 @@ define i16 @sccwult(i16 %a, i16 %b) nounwind {
 	ret i16 %t2
 }
 ; CHECK-LABEL:sccwult:
-; CHECK:	cmp.w	r14, r15
-; CHECK:	mov.w	#1, r15
-; CHECK:	bic.w	r2, r15
+; CHECK:	cmp.w	r13, r12
+; CHECK:	mov.w	#1, r12
+; CHECK:	bic.w	r2, r12
 
 define i16 @sccwule(i16 %a, i16 %b) nounwind {
 	%t1 = icmp ule i16 %a, %b
@@ -84,9 +84,9 @@ define i16 @sccwule(i16 %a, i16 %b) nounwind {
 	ret i16 %t2
 }
 ; CHECK-LABEL:sccwule:
-; CHECK:	cmp.w	r15, r14
-; CHECK:	mov.w	r2, r15
-; CHECK:	and.w	#1, r15
+; CHECK:	cmp.w	r12, r13
+; CHECK:	mov.w	r2, r12
+; CHECK:	and.w	#1, r12
 
 define i16 @sccwsgt(i16 %a, i16 %b) nounwind {
 	%t1 = icmp sgt i16 %a, %b
diff --git a/test/CodeGen/MSP430/struct-return.ll b/test/CodeGen/MSP430/struct-return.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c28bf06af439d1258764238a77d3b1d1cfcda79e
--- /dev/null
+++ b/test/CodeGen/MSP430/struct-return.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"
+target triple = "msp430---elf"
+
+; Allow simple structures to be returned by value.
+
+%s = type { i64, i64 }
+
+define %s @fred() #0 {
+; CHECK-LABEL: fred:
+; CHECK: mov.w	#2314, 14(r12)
+; CHECK: mov.w	#2828, 12(r12)
+; CHECK: mov.w	#3342, 10(r12)
+; CHECK: mov.w	#3840, 8(r12)
+; CHECK: mov.w	#258, 6(r12)
+; CHECK: mov.w	#772, 4(r12)
+; CHECK: mov.w	#1286, 2(r12)
+; CHECK: mov.w	#1800, 0(r12)
+  ret %s {i64 72623859790382856, i64 651345242494996224} 
+}
+
+attributes #0 = { nounwind }
diff --git a/test/CodeGen/MSP430/vararg.ll b/test/CodeGen/MSP430/vararg.ll
index 9e511fce956f11dc9f25401f3b5e97873c20a282..6c8bceff5de94b100882689c345cd8112ead8bdd 100644
--- a/test/CodeGen/MSP430/vararg.ll
+++ b/test/CodeGen/MSP430/vararg.ll
@@ -25,13 +25,13 @@ define i16 @va_arg(i8* %vl) nounwind {
 entry:
 ; CHECK-LABEL: va_arg:
   %vl.addr = alloca i8*, align 2
-; CHECK: mov.w r15, 0(r1)
+; CHECK: mov.w r12, 0(r1)
   store i8* %vl, i8** %vl.addr, align 2
-; CHECK: mov.w r15, [[REG:r[0-9]+]]
+; CHECK: mov.w r12, [[REG:r[0-9]+]]
 ; CHECK-NEXT: add.w #2, [[REG]]
 ; CHECK-NEXT: mov.w [[REG]], 0(r1)
   %0 = va_arg i8** %vl.addr, i16
-; CHECK-NEXT: mov.w 0(r15), r15
+; CHECK-NEXT: mov.w 0(r12), r12
   ret i16 %0
 }
 
@@ -40,11 +40,11 @@ entry:
 ; CHECK-LABEL: va_copy:
   %vl.addr = alloca i8*, align 2
   %vl2 = alloca i8*, align 2
-; CHECK: mov.w r15, 2(r1)
+; CHECK: mov.w r12, 2(r1)
   store i8* %vl, i8** %vl.addr, align 2
   %0 = bitcast i8** %vl2 to i8*
   %1 = bitcast i8** %vl.addr to i8*
-; CHECK-NEXT: mov.w r15, 0(r1)
+; CHECK-NEXT: mov.w r12, 0(r1)
   call void @llvm.va_copy(i8* %0, i8* %1)
   ret void
 }
diff --git a/test/CodeGen/Mips/Fast-ISel/check-disabled-mcpus.ll b/test/CodeGen/Mips/Fast-ISel/check-disabled-mcpus.ll
index 5594de8177d4942b459a99f7a5251b4e7af25734..b8973efda17985553e315443024852db863769c3 100644
--- a/test/CodeGen/Mips/Fast-ISel/check-disabled-mcpus.ll
+++ b/test/CodeGen/Mips/Fast-ISel/check-disabled-mcpus.ll
@@ -1,40 +1,40 @@
 ; Targets where we should not enable FastISel.
 ; RUN: llc -march=mips -mcpu=mips2 -O0 -relocation-model=pic \
-; RUN:     -fast-isel-verbose <%s 2>&1 | FileCheck %s
+; RUN:     -pass-remarks-missed=isel <%s 2>&1 | FileCheck %s
 ; RUN: llc -march=mips -mcpu=mips3 -O0 -relocation-model=pic -target-abi n64 \
-; RUN:     -fast-isel-verbose <%s 2>&1 | FileCheck %s
+; RUN:     -pass-remarks-missed=isel <%s 2>&1 | FileCheck %s
 ; RUN: llc -march=mips -mcpu=mips4 -O0 -relocation-model=pic -target-abi n64 \
-; RUN:     -fast-isel-verbose <%s 2>&1 | FileCheck %s
+; RUN:     -pass-remarks-missed=isel <%s 2>&1 | FileCheck %s
 
 ; RUN: llc -march=mips -mcpu=mips32r6 -O0 -relocation-model=pic \
-; RUN:     -fast-isel-verbose <%s 2>&1 | FileCheck %s
+; RUN:     -pass-remarks-missed=isel <%s 2>&1 | FileCheck %s
 
 ; RUN: llc -march=mips -mattr=mips16 -O0 -relocation-model=pic \
-; RUN:     -fast-isel-verbose <%s 2>&1 | FileCheck %s
+; RUN:     -pass-remarks-missed=isel <%s 2>&1 | FileCheck %s
 
 ; RUN: llc -march=mips -mcpu=mips32r2 -mattr=+micromips -O0 -relocation-model=pic \
-; RUN:     -fast-isel-verbose <%s 2>&1 | FileCheck %s
+; RUN:     -pass-remarks-missed=isel <%s 2>&1 | FileCheck %s
 ; RUN: llc -march=mips -mcpu=mips32r3 -mattr=+micromips -O0 -relocation-model=pic \
-; RUN:     -fast-isel-verbose <%s 2>&1 | FileCheck %s
+; RUN:     -pass-remarks-missed=isel <%s 2>&1 | FileCheck %s
 ; RUN: llc -march=mips -mcpu=mips32r5 -mattr=+micromips -O0 -relocation-model=pic \
-; RUN:     -fast-isel-verbose <%s 2>&1 | FileCheck %s
+; RUN:     -pass-remarks-missed=isel <%s 2>&1 | FileCheck %s
 
 ; RUN: llc -march=mips -mcpu=mips64 -O0 -relocation-model=pic -target-abi n64 \
-; RUN:     -fast-isel-verbose <%s 2>&1 | FileCheck %s
+; RUN:     -pass-remarks-missed=isel <%s 2>&1 | FileCheck %s
 ; RUN: llc -march=mips -mcpu=mips64r2 -O0 -relocation-model=pic -target-abi n64 \
-; RUN:     -fast-isel-verbose <%s 2>&1 | FileCheck %s
+; RUN:     -pass-remarks-missed=isel <%s 2>&1 | FileCheck %s
 ; RUN: llc -march=mips -mcpu=mips64r3 -O0 -relocation-model=pic -target-abi n64 \
-; RUN:     -fast-isel-verbose <%s 2>&1 | FileCheck %s
+; RUN:     -pass-remarks-missed=isel <%s 2>&1 | FileCheck %s
 ; RUN: llc -march=mips -mcpu=mips64r5 -O0 -relocation-model=pic -target-abi n64 \
-; RUN:     -fast-isel-verbose <%s 2>&1 | FileCheck %s
+; RUN:     -pass-remarks-missed=isel <%s 2>&1 | FileCheck %s
 ; RUN: llc -march=mips -mcpu=mips32r6 -O0 -relocation-model=pic \
-; RUN:     -fast-isel-verbose <%s 2>&1 | FileCheck %s
+; RUN:     -pass-remarks-missed=isel <%s 2>&1 | FileCheck %s
 
 ; Valid targets for FastISel.
 ; RUN: llc -march=mips -mcpu=mips32r0 -O0 -relocation-model=pic \
-; RUN:     -fast-isel-verbose <%s 2>&1 | FileCheck %s -check-prefix=FISEL
+; RUN:     -pass-remarks-missed=isel <%s 2>&1 | FileCheck %s -check-prefix=FISEL
 ; RUN: llc -march=mips -mcpu=mips32r2 -O0 -relocation-model=pic \
-; RUN:     -fast-isel-verbose <%s 2>&1 | FileCheck %s -check-prefix=FISEL
+; RUN:     -pass-remarks-missed=isel <%s 2>&1 | FileCheck %s -check-prefix=FISEL
 
 ; The CHECK prefix is being used by those targets that do not support FastISel.
 ; By checking that we don't emit the "FastISel missed terminator..." message,
diff --git a/test/CodeGen/Mips/Fast-ISel/fastcc-miss.ll b/test/CodeGen/Mips/Fast-ISel/fastcc-miss.ll
index d9ce8b3964a461887defd2d548527fda4efc3396..0aec8d506f778af7b0957ac0f91f7d8669cdb373 100644
--- a/test/CodeGen/Mips/Fast-ISel/fastcc-miss.ll
+++ b/test/CodeGen/Mips/Fast-ISel/fastcc-miss.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -march=mipsel -mcpu=mips32r2 -O0 -relocation-model=pic \
-; RUN:     -fast-isel-verbose 2>&1 | FileCheck %s
+; RUN:     -pass-remarks-missed=isel 2>&1 | FileCheck %s
 
 ; CHECK:      FastISel missed call:
 ; CHECK-SAME: %call = call fastcc i32 @foo(i32 signext %a, i32 signext %b)
diff --git a/test/CodeGen/Mips/brconnez.ll b/test/CodeGen/Mips/brconnez.ll
index 27cf9e8cacb8144ae6ae59eaa0711f45bff830d0..eafddccdd4c70b6fd098f631bdd1ebb56f071afa 100644
--- a/test/CodeGen/Mips/brconnez.ll
+++ b/test/CodeGen/Mips/brconnez.ll
@@ -7,7 +7,7 @@ define void @test() nounwind {
 entry:
   %0 = load i32, i32* @j, align 4
   %cmp = icmp eq i32 %0, 0
-  br i1 %cmp, label %if.then, label %if.end
+  br i1 %cmp, label %if.then, label %if.end, !prof !1
 
 ; 16:	bnez	${{[0-9]+}}, $[[LABEL:[0-9A-Ba-b_]+]]
 ; 16:   lw ${{[0-9]+}}, %got(result)(${{[0-9]+}})
@@ -21,4 +21,4 @@ if.end:                                           ; preds = %if.then, %entry
   ret void
 }
 
-
+!1 = !{!"branch_weights", i32 2, i32 1}
diff --git a/test/CodeGen/Mips/cconv/arguments-float.ll b/test/CodeGen/Mips/cconv/arguments-float.ll
index 7d32992ecb128a60b3575b346cceac12fccd93b6..004f6d94749d8ec0c5e5ee301fbb9f3b2944b73c 100644
--- a/test/CodeGen/Mips/cconv/arguments-float.ll
+++ b/test/CodeGen/Mips/cconv/arguments-float.ll
@@ -63,39 +63,39 @@ entry:
 ; NEW-DAG:           sd $5, 16([[R2]])
 
 ; O32 has run out of argument registers and starts using the stack
-; O32-DAG:           lw [[R3:\$([0-9]+|gp)]], 24($sp)
-; O32-DAG:           lw [[R4:\$([0-9]+|gp)]], 28($sp)
+; O32-DAG:           lw [[R3:\$([0-9]+|gp)]], 16($sp)
+; O32-DAG:           lw [[R4:\$([0-9]+|gp)]], 20($sp)
 ; O32-DAG:           sw [[R3]], 24([[R2]])
 ; O32-DAG:           sw [[R4]], 28([[R2]])
 ; NEW-DAG:           sd $6, 24([[R2]])
 
-; O32-DAG:           lw [[R3:\$([0-9]+|gp)]], 32($sp)
-; O32-DAG:           lw [[R4:\$([0-9]+|gp)]], 36($sp)
+; O32-DAG:           lw [[R3:\$([0-9]+|gp)]], 24($sp)
+; O32-DAG:           lw [[R4:\$([0-9]+|gp)]], 28($sp)
 ; O32-DAG:           sw [[R3]], 32([[R2]])
 ; O32-DAG:           sw [[R4]], 36([[R2]])
 ; NEW-DAG:           sd $7, 32([[R2]])
 
-; O32-DAG:           lw [[R3:\$([0-9]+|gp)]], 40($sp)
-; O32-DAG:           lw [[R4:\$([0-9]+|gp)]], 44($sp)
+; O32-DAG:           lw [[R3:\$([0-9]+|gp)]], 32($sp)
+; O32-DAG:           lw [[R4:\$([0-9]+|gp)]], 36($sp)
 ; O32-DAG:           sw [[R3]], 40([[R2]])
 ; O32-DAG:           sw [[R4]], 44([[R2]])
 ; NEW-DAG:           sd $8, 40([[R2]])
 
-; O32-DAG:           lw [[R3:\$([0-9]+|gp)]], 48($sp)
-; O32-DAG:           lw [[R4:\$([0-9]+|gp)]], 52($sp)
+; O32-DAG:           lw [[R3:\$([0-9]+|gp)]], 40($sp)
+; O32-DAG:           lw [[R4:\$([0-9]+|gp)]], 44($sp)
 ; O32-DAG:           sw [[R3]], 48([[R2]])
 ; O32-DAG:           sw [[R4]], 52([[R2]])
 ; NEW-DAG:           sd $9, 48([[R2]])
 
-; O32-DAG:           lw [[R3:\$([0-9]+|gp)]], 56($sp)
-; O32-DAG:           lw [[R4:\$([0-9]+|gp)]], 60($sp)
+; O32-DAG:           lw [[R3:\$([0-9]+|gp)]], 48($sp)
+; O32-DAG:           lw [[R4:\$([0-9]+|gp)]], 52($sp)
 ; O32-DAG:           sw [[R3]], 56([[R2]])
 ; O32-DAG:           sw [[R4]], 60([[R2]])
 ; NEW-DAG:           sd $10, 56([[R2]])
 
 ; N32/N64 have run out of registers and starts using the stack too
-; O32-DAG:           lw [[R3:\$[0-9]+]], 64($sp)
-; O32-DAG:           lw [[R4:\$[0-9]+]], 68($sp)
+; O32-DAG:           lw [[R3:\$[0-9]+]], 56($sp)
+; O32-DAG:           lw [[R4:\$[0-9]+]], 60($sp)
 ; O32-DAG:           sw [[R3]], 64([[R2]])
 ; O32-DAG:           sw [[R4]], 68([[R2]])
 ; NEW-DAG:           ld [[R3:\$[0-9]+]], 0($sp)
diff --git a/test/CodeGen/Mips/cconv/arguments-varargs.ll b/test/CodeGen/Mips/cconv/arguments-varargs.ll
index 785188b3c5188c8cdd644c2d14a8c0018cd7753c..d662128945f87de898fbb6e26aa5ab53e283b509 100644
--- a/test/CodeGen/Mips/cconv/arguments-varargs.ll
+++ b/test/CodeGen/Mips/cconv/arguments-varargs.ll
@@ -315,12 +315,11 @@ entry:
 ; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
 ; order.
 ; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
-; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA_TMP2]])
 ; O32-DAG:       sw [[ARG1]], 8([[GV]])
-; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
-; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
-; O32-DAG:       sw [[VA2]], 0([[SP]])
-; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 4
+; O32-DAG:       sw [[VA3]], 0([[SP]])
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 4([[VA_TMP2]])
 ; O32-DAG:       sw [[ARG1]], 12([[GV]])
 
 ; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
@@ -349,10 +348,9 @@ entry:
 ; Load the second argument from the variable portion and copy it to the global.
 ; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
 ; O32-DAG:       sw [[ARG2]], 16([[GV]])
-; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
-; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
-; O32-DAG:       sw [[VA2]], 0([[SP]])
-; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 4
+; O32-DAG:       sw [[VA3]], 0([[SP]])
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 4([[VA_TMP2]])
 ; O32-DAG:       sw [[ARG2]], 20([[GV]])
 
 ; NEW-DAG:       ld [[ARG2:\$[0-9]+]], 0([[VA2]])
@@ -678,12 +676,11 @@ entry:
 ; Big-endian mode for N32/N64 must add an additional 4 to the offset due to byte
 ; order.
 ; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
-; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA_TMP2]])
 ; O32-DAG:       sw [[ARG1]], 8([[GV]])
-; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
-; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
-; O32-DAG:       sw [[VA2]], 0([[SP]])
-; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 4
+; O32-DAG:       sw [[VA3]], 0([[SP]])
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 4([[VA_TMP2]])
 ; O32-DAG:       sw [[ARG1]], 12([[GV]])
 
 ; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
@@ -712,10 +709,9 @@ entry:
 ; Load the second argument from the variable portion and copy it to the global.
 ; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
 ; O32-DAG:       sw [[ARG2]], 16([[GV]])
-; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
-; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
+; O32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 4
-; O32-DAG:       sw [[VA2]], 0([[SP]])
+; O32-DAG:       sw [[VA3]], 0([[SP]])
-; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 4([[VA_TMP2]])
 ; O32-DAG:       sw [[ARG2]], 20([[GV]])
 
 ; NEW-DAG:       ld [[ARG2:\$[0-9]+]], 0([[VA2]])
@@ -1040,10 +1036,9 @@ entry:
 ; O32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
 ; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
 ; O32-DAG:       sw [[ARG1]], 8([[GV]])
-; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
-; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
-; O32-DAG:       sw [[VA2]], 0([[SP]])
-; O32-DAG:       lw [[ARG1:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 4
+; O32-DAG:       sw [[VA3]], 0([[SP]])
+; O32-DAG:       lw [[ARG1:\$[0-9]+]], 4([[VA_TMP2]])
 ; O32-DAG:       sw [[ARG1]], 12([[GV]])
 
 ; N32-DAG:       addiu [[GV:\$[0-9]+]], ${{[0-9]+}}, %lo(dwords)
@@ -1072,10 +1067,9 @@ entry:
 ; Load the second argument from the variable portion and copy it to the global.
 ; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
 ; O32-DAG:       sw [[ARG2]], 16([[GV]])
-; O32-DAG:       lw [[VA:\$[0-9]+]], 0([[SP]])
-; O32-DAG:       addiu [[VA2:\$[0-9]+]], [[VA]], 4
-; O32-DAG:       sw [[VA2]], 0([[SP]])
-; O32-DAG:       lw [[ARG2:\$[0-9]+]], 0([[VA]])
+; O32-DAG:       addiu [[VA3:\$[0-9]+]], [[VA2]], 4
+; O32-DAG:       sw [[VA3]], 0([[SP]])
+; O32-DAG:       lw [[ARG2:\$[0-9]+]], 4([[VA_TMP2]])
 ; O32-DAG:       sw [[ARG2]], 20([[GV]])
 
 ; NEW-DAG:       ld [[ARG2:\$[0-9]+]], 0([[VA2]])
diff --git a/test/CodeGen/Mips/cins.ll b/test/CodeGen/Mips/cins.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4fe25564d1c12dabc26e203ec905b21868162046
--- /dev/null
+++ b/test/CodeGen/Mips/cins.ll
@@ -0,0 +1,92 @@
+; RUN: llc -march=mips64 -mcpu=octeon -target-abi=n64 < %s -o - | FileCheck %s
+
+define i64 @cins_zext(i32 signext %n) {
+entry:
+  %shl = shl i32 %n, 5
+  %conv = zext i32 %shl to i64
+  ret i64 %conv
+
+; CHECK-LABEL: cins_zext:
+; CHECK:       cins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 5, 26
+
+}
+
+define i64 @cins_and_shl(i64 zeroext %n) {
+entry:
+  %and = shl i64 %n, 8
+  %shl = and i64 %and, 16776960
+  ret i64 %shl
+
+; CHECK-LABEL: cins_and_shl:
+; CHECK:       cins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 8, 15
+
+}
+
+define i64 @cins_and_shl32(i64 zeroext %n) {
+entry:
+  %and = shl i64 %n, 38
+  %shl = and i64 %and, 18014123631575040
+  ret i64 %shl
+
+; CHECK-LABEL: cins_and_shl32:
+; CHECK:       cins32 $[[R0:[0-9]+]], $[[R1:[0-9]+]], 6, 15
+
+}
+
+define zeroext i16 @cins_and_shl_16(i16 zeroext %n) {
+entry:
+  %0 = shl i16 %n, 2
+  %1 = and i16 %0, 60
+  ret i16 %1
+
+; CHECK-LABEL: cins_and_shl_16:
+; CHECK:       cins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 2, 3
+
+}
+
+define zeroext i8 @cins_and_shl_8(i8 zeroext %n) {
+entry:
+  %0 = shl i8 %n, 2
+  %1 = and i8 %0, 12
+  ret i8 %1
+
+; CHECK-LABEL: cins_and_shl_8:
+; CHECK:       cins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 2, 1
+
+}
+
+define i32 @cins_i32(i32 signext %a) {
+entry:
+  %and = shl i32 %a, 17
+  %shl = and i32 %and, 536739840
+  ret i32 %shl
+
+; CHECK-LABEL: cins_i32:
+; CHECK:       cins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 17, 11
+
+}
+
+define i64 @cins_shl_and(i32 signext %n) {
+entry:
+  %and = and i32 %n, 65535
+  %conv = zext i32 %and to i64
+  %shl = shl nuw nsw i64 %conv, 31
+  ret i64 %shl
+
+; CHECK-LABEL: cins_shl_and:
+; CHECK:       cins $[[R0:[0-9]+]], $[[R1:[0-9]+]], 31, 15
+
+}
+
+
+define i64 @cins_shl_and32(i32 signext %n) {
+entry:
+  %and = and i32 %n, 65535
+  %conv = zext i32 %and to i64
+  %shl = shl nuw nsw i64 %conv, 47
+  ret i64 %shl
+
+; CHECK-LABEL: cins_shl_and32:
+; CHECK:       cins32 $[[R0:[0-9]+]], $[[R1:[0-9]+]], 15, 15
+
+}
diff --git a/test/CodeGen/Mips/compactbranches/empty-block.mir b/test/CodeGen/Mips/compactbranches/empty-block.mir
new file mode 100644
index 0000000000000000000000000000000000000000..7831e51e31579d3eef372f634780cea24ac3b2f1
--- /dev/null
+++ b/test/CodeGen/Mips/compactbranches/empty-block.mir
@@ -0,0 +1,92 @@
+# RUN: llc -march=mipsel -mcpu=mips32r6 -start-after=block-placement %s -o - | FileCheck %s
+
+# Check that empty blocks in the cfg don't cause the mips hazard scheduler to
+# crash and that the nop is inserted correctly.
+
+# CHECK:  blezc
+# CHECK:  nop
+# CHECK: # BB#1:
+# CHECK:  .insn
+# CHECK: # BB#2:
+# CHECK:  .insn
+# CHECK: # BB#3:
+# CHECK:  jal
+
+--- |
+  ; ModuleID = '<stdin>'
+  source_filename = "<stdin>"
+  target datalayout = "e-m:m-p:32:32-i8:8:32-i16:16:32-i64:64-n32-S64"
+
+  declare i32 @k()
+
+  declare void @f(i32)
+
+  define void @l5() {
+  entry:
+    %call = tail call i32 @k()
+    %cmp = icmp sgt i32 %call, 0
+    br i1 %cmp, label %if.then, label %if.end
+
+  if.then:                                          ; preds = %entry
+    tail call void @f(i32 signext 2)
+    br label %if.end
+
+  if.end:                                           ; preds = %if.then, %entry
+    ret void
+  }
+
+---
+name:            l5
+alignment:       2
+exposesReturnsTwice: false
+noVRegs:         true
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       24
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 16
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+stack:
+  - { id: 0, type: spill-slot, offset: -4, size: 4, alignment: 4, callee-saved-register: '%ra' }
+body:             |
+  bb.0.entry:
+    successors: %bb.1.if.then(0x50000000), %bb.4.if.end(0x30000000)
+    liveins: %ra
+
+    %sp = ADDiu %sp, -24
+    CFI_INSTRUCTION def_cfa_offset 24
+    SW killed %ra, %sp, 20 :: (store 4 into %stack.0)
+    CFI_INSTRUCTION offset %ra_64, -4
+    JAL @k, csr_o32_fp64, implicit-def dead %ra, implicit-def %sp, implicit-def %v0
+    BLEZ %v0, %bb.4.if.end, implicit-def %at
+
+  bb.1.if.then:
+    successors: %bb.2.if.then(0x80000000)
+
+  bb.2.if.then:
+    successors: %bb.3.if.then(0x80000000)
+
+  bb.3.if.then:
+    successors: %bb.4.if.end(0x80000000)
+
+    %a0 = ADDiu %zero, 2
+    JAL @f, csr_o32_fp64, implicit-def dead %ra, implicit killed %a0, implicit-def %sp
+
+  bb.4.if.end:
+    %ra = LW %sp, 20 :: (load 4 from %stack.0)
+    %sp = ADDiu %sp, 24
+    PseudoReturn undef %ra
+
+...
diff --git a/test/CodeGen/Mips/dext.ll b/test/CodeGen/Mips/dext.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1794f16b2cd70c0fdb67280a4f339d175e95dc74
--- /dev/null
+++ b/test/CodeGen/Mips/dext.ll
@@ -0,0 +1,105 @@
+; RUN: llc -march=mips64 -mcpu=mips64r2 -target-abi=n64 < %s -o - | FileCheck %s
+
+define i64 @dext_add_zext(i32 signext %n) {
+entry:
+  %add = add i32 %n, 1
+  %res = zext i32 %add to i64
+  ret i64 %res
+
+; CHECK-LABEL: dext_add_zext:
+; CHECK:       dext $[[R0:[0-9]+]], $[[R1:[0-9]+]], 0, 32
+
+}
+
+define i32 @ext_and24(i32 signext %a) {
+entry:
+  %and = and i32 %a, 16777215
+  ret i32 %and
+
+; CHECK-LABEL: ext_and24:
+; CHECK:       ext $[[R0:[0-9]+]], $[[R1:[0-9]+]], 0, 24
+
+}
+
+define i64 @dext_and32(i64 zeroext %a) {
+entry:
+  %and = and i64 %a, 4294967295
+  ret i64 %and
+
+; CHECK-LABEL: dext_and32:
+; CHECK:       dext $[[R0:[0-9]+]], $[[R1:[0-9]+]], 0, 32
+
+}
+
+define i64 @dext_and35(i64 zeroext %a) {
+entry:
+  %and = and i64 %a, 34359738367
+  ret i64 %and
+
+; CHECK-LABEL: dext_and35:
+; CHECK:       dextm $[[R0:[0-9]+]], $[[R1:[0-9]+]], 0, 35
+
+}
+
+define i64 @dext_and20(i64 zeroext %a) {
+entry:
+  %and = and i64 %a, 1048575
+  ret i64 %and
+
+; CHECK-LABEL: dext_and20:
+; CHECK:       dext $[[R0:[0-9]+]], $[[R1:[0-9]+]], 0, 20
+
+}
+
+define i64 @dext_and16(i64 zeroext %a) {
+entry:
+  %and = and i64 %a, 65535
+  ret i64 %and
+
+; CHECK-LABEL: dext_and16:
+; CHECK:       andi $[[R0:[0-9]+]], $[[R1:[0-9]+]], 65535
+
+}
+
+define i64 @dext_lsr_and20(i64 zeroext %a) {
+entry:
+  %shr = lshr i64 %a, 5
+  %and = and i64 %shr, 1048575
+  ret i64 %and
+
+; CHECK-LABEL: dext_lsr_and20:
+; CHECK:       dext $[[R0:[0-9]+]], $[[R1:[0-9]+]], 5, 20
+
+}
+
+define i64 @dext_lsr_and8(i64 zeroext %a) {
+entry:
+  %shr = lshr i64 %a, 40
+  %and = and i64 %shr, 255
+  ret i64 %and
+
+; CHECK-LABEL: dext_lsr_and8:
+; CHECK:       dextu $[[R0:[0-9]+]], $[[R1:[0-9]+]], 40, 8
+
+}
+
+define i64 @dext_zext(i32 signext %a) {
+entry:
+  %conv = zext i32 %a to i64
+  ret i64 %conv
+
+; CHECK-LABEL: dext_zext:
+; CHECK:       dext $[[R0:[0-9]+]], $[[R1:[0-9]+]], 0, 32
+
+}
+
+define i64 @dext_and_lsr(i64 zeroext %n) {
+entry:
+  %and = lshr i64 %n, 8
+  %shr = and i64 %and, 4095
+  ret i64 %shr
+
+; CHECK-LABEL: dext_and_lsr:
+; CHECK:       dext $[[R0:[0-9]+]], $[[R1:[0-9]+]], 8, 12
+
+}
diff --git a/test/CodeGen/Mips/fastcc.ll b/test/CodeGen/Mips/fastcc.ll
index 13abc20eb3e8053b362a0a766f98aed26c755aa3..fb1bc4d9a8ab5efe7d4902d1308ec30c4ad967d6 100644
--- a/test/CodeGen/Mips/fastcc.ll
+++ b/test/CodeGen/Mips/fastcc.ll
@@ -132,20 +132,19 @@ entry:
 define internal fastcc void @callee0(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8, i32 %a9, i32 %a10, i32 %a11, i32 %a12, i32 %a13, i32 %a14, i32 %a15, i32 %a16) nounwind noinline {
 entry:
 ; CHECK: callee0
-; CHECK: sw  $4
-; CHECK: sw  $5
-; CHECK: sw  $6
-; CHECK: sw  $7
-; CHECK: sw  $8
-; CHECK: sw  $9
-; CHECK: sw  $10
-; CHECK: sw  $11
-; CHECK: sw  $12
-; CHECK: sw  $13
-; CHECK: sw  $14
-; CHECK: sw  $15
-; CHECK: sw  $24
-; CHECK: sw  $3
+; CHECK-DAG: sw  $4
+; CHECK-DAG: sw  $5
+; CHECK-DAG: sw  $7
+; CHECK-DAG: sw  $8
+; CHECK-DAG: sw  $9
+; CHECK-DAG: sw  $10
+; CHECK-DAG: sw  $11
+; CHECK-DAG: sw  $12
+; CHECK-DAG: sw  $13
+; CHECK-DAG: sw  $14
+; CHECK-DAG: sw  $15
+; CHECK-DAG: sw  $24
+; CHECK-DAG: sw  $3
 
 ; t6, t7 and t8 are reserved in NaCl and cannot be used for fastcc.
 ; CHECK-NACL-NOT: sw  $14
@@ -223,27 +222,27 @@ entry:
 
 define internal fastcc void @callee1(float %a0, float %a1, float %a2, float %a3, float %a4, float %a5, float %a6, float %a7, float %a8, float %a9, float %a10, float %a11, float %a12, float %a13, float %a14, float %a15, float %a16, float %a17, float %a18, float %a19, float %a20) nounwind noinline {
 entry:
-; CHECK: callee1
-; CHECK: swc1  $f0
-; CHECK: swc1  $f1
-; CHECK: swc1  $f2
-; CHECK: swc1  $f3
-; CHECK: swc1  $f4
-; CHECK: swc1  $f5
-; CHECK: swc1  $f6
-; CHECK: swc1  $f7
-; CHECK: swc1  $f8
-; CHECK: swc1  $f9
-; CHECK: swc1  $f10
-; CHECK: swc1  $f11
-; CHECK: swc1  $f12
-; CHECK: swc1  $f13
-; CHECK: swc1  $f14
-; CHECK: swc1  $f15
-; CHECK: swc1  $f16
-; CHECK: swc1  $f17
-; CHECK: swc1  $f18
-; CHECK: swc1  $f19
+; CHECK-LABEL: callee1:
+; CHECK-DAG: swc1  $f0
+; CHECK-DAG: swc1  $f1
+; CHECK-DAG: swc1  $f2
+; CHECK-DAG: swc1  $f3
+; CHECK-DAG: swc1  $f4
+; CHECK-DAG: swc1  $f5
+; CHECK-DAG: swc1  $f6
+; CHECK-DAG: swc1  $f7
+; CHECK-DAG: swc1  $f8
+; CHECK-DAG: swc1  $f9
+; CHECK-DAG: swc1  $f10
+; CHECK-DAG: swc1  $f11
+; CHECK-DAG: swc1  $f12
+; CHECK-DAG: swc1  $f13
+; CHECK-DAG: swc1  $f14
+; CHECK-DAG: swc1  $f15
+; CHECK-DAG: swc1  $f16
+; CHECK-DAG: swc1  $f17
+; CHECK-DAG: swc1  $f18
+; CHECK-DAG: swc1  $f19
 
   store float %a0, float* @gf0, align 4
   store float %a1, float* @gf1, align 4
@@ -316,8 +315,6 @@ entry:
 
 ; NOODDSPREG-LABEL:  callee2:
 
-; NOODDSPREG:        addiu   $sp, $sp, -[[OFFSET:[0-9]+]]
-
 ; Check that first 10 arguments are received in even float registers
 ; f0, f2, ... , f18. Check that 11th argument is received on stack.
 
@@ -333,7 +330,7 @@ entry:
 ; NOODDSPREG-DAG:    swc1    $f16, 32($[[R0]])
 ; NOODDSPREG-DAG:    swc1    $f18, 36($[[R0]])
 
-; NOODDSPREG-DAG:    lwc1    $[[F0:f[0-9]*[02468]]], [[OFFSET]]($sp)
+; NOODDSPREG-DAG:    lwc1    $[[F0:f[0-9]*[02468]]], 0($sp)
 ; NOODDSPREG-DAG:    swc1    $[[F0]], 40($[[R0]])
 
   store float %a0, float* getelementptr ([11 x float], [11 x float]* @fa, i32 0, i32 0), align 4
@@ -397,7 +394,6 @@ entry:
 
 ; FP64-NOODDSPREG-LABEL:  callee3:
 
-; FP64-NOODDSPREG:        addiu   $sp, $sp, -[[OFFSET:[0-9]+]]
 
 ; Check that first 10 arguments are received in even float registers
 ; f0, f2, ... , f18. Check that 11th argument is received on stack.
@@ -414,7 +410,7 @@ entry:
 ; FP64-NOODDSPREG-DAG:    sdc1    $f16, 64($[[R0]])
 ; FP64-NOODDSPREG-DAG:    sdc1    $f18, 72($[[R0]])
 
-; FP64-NOODDSPREG-DAG:    ldc1    $[[F0:f[0-9]*[02468]]], [[OFFSET]]($sp)
+; FP64-NOODDSPREG-DAG:    ldc1    $[[F0:f[0-9]*[02468]]], 0($sp)
 ; FP64-NOODDSPREG-DAG:    sdc1    $[[F0]], 80($[[R0]])
 
   store double %a0, double* getelementptr ([11 x double], [11 x double]* @da, i32 0, i32 0), align 8
diff --git a/test/CodeGen/Mips/llvm-ir/ashr.ll b/test/CodeGen/Mips/llvm-ir/ashr.ll
index c8d0e76f94e295fc6b9d8fb1b41d3a379d4389d4..f9fb91be0906481914082cdb04bd46ec62e172ee 100644
--- a/test/CodeGen/Mips/llvm-ir/ashr.ll
+++ b/test/CodeGen/Mips/llvm-ir/ashr.ll
@@ -83,20 +83,23 @@ entry:
 
   ; M2:         srav      $[[T0:[0-9]+]], $4, $7
   ; M2:         andi      $[[T1:[0-9]+]], $7, 32
-  ; M2:         bnez      $[[T1]], $[[BB0:BB[0-9_]+]]
+  ; M2:         beqz      $[[T1]], $[[BB0:BB[0-9_]+]]
   ; M2:         move      $3, $[[T0]]
+  ; M2:         bnez      $[[T1]], $[[BB1:BB[0-9_]+]]
+  ; M2:         nop
+  ; M2:         $[[EXIT:BB[0-9_]+]]:
+  ; M2:         jr        $ra
+  ; M2:         nop
+  ; M2:         $[[BB0]]:
   ; M2:         srlv      $[[T2:[0-9]+]], $5, $7
   ; M2:         not       $[[T3:[0-9]+]], $7
   ; M2:         sll       $[[T4:[0-9]+]], $4, 1
   ; M2:         sllv      $[[T5:[0-9]+]], $[[T4]], $[[T3]]
+  ; M2:         beqz      $[[T1]], $[[EXIT]]
   ; M2:         or        $3, $[[T3]], $[[T2]]
-  ; M2:         $[[BB0]]:
-  ; M2:         beqz      $[[T1]], $[[BB1:BB[0-9_]+]]
-  ; M2:         nop
-  ; M2:         sra       $2, $4, 31
   ; M2:         $[[BB1]]:
   ; M2:         jr        $ra
-  ; M2:         nop
+  ; M2:         sra       $2, $4, 31
 
   ; 32R1-R5:    srlv      $[[T0:[0-9]+]], $5, $7
   ; 32R1-R5:    not       $[[T1:[0-9]+]], $7
@@ -169,20 +172,23 @@ entry:
   ; M3:             sll       $[[T0:[0-9]+]], $7, 0
   ; M3:             dsrav     $[[T1:[0-9]+]], $4, $7
   ; M3:             andi      $[[T2:[0-9]+]], $[[T0]], 64
-  ; M3:             bnez      $[[T3:[0-9]+]], [[BB0:.LBB[0-9_]+]]
+  ; M3:             beqz      $[[T3:[0-9]+]], [[BB0:.LBB[0-9_]+]]
   ; M3:             move      $3, $[[T1]]
+  ; M3:             bnez      $[[T3]], [[BB1:.LBB[0-9_]+]]
+  ; M3:             nop
+  ; M3:             [[EXIT:.LBB[0-9_]+]]:
+  ; M3:             jr        $ra
+  ; M3:             nop
+  ; M3:             [[BB0]]:
   ; M3:             dsrlv     $[[T4:[0-9]+]], $5, $7
   ; M3:             dsll      $[[T5:[0-9]+]], $4, 1
   ; M3:             not       $[[T6:[0-9]+]], $[[T0]]
   ; M3:             dsllv     $[[T7:[0-9]+]], $[[T5]], $[[T6]]
+  ; M3:             beqz      $[[T3]], [[EXIT]]
   ; M3:             or        $3, $[[T7]], $[[T4]]
-  ; M3:             [[BB0]]:
-  ; M3:             beqz      $[[T3]], [[BB1:.LBB[0-9_]+]]
-  ; M3:             nop
-  ; M3:             dsra      $2, $4, 63
   ; M3:             [[BB1]]:
   ; M3:             jr        $ra
-  ; M3:             nop
+  ; M3:             dsra      $2, $4, 63
 
   ; GP64-NOT-R6:    dsrlv     $[[T0:[0-9]+]], $5, $7
   ; GP64-NOT-R6:    dsll      $[[T1:[0-9]+]], $4, 1
diff --git a/test/CodeGen/Mips/llvm-ir/lshr.ll b/test/CodeGen/Mips/llvm-ir/lshr.ll
index 09617edc9406816f144db7d3f8b8c9651bab9711..926f3e4c8d798204f77db47578fc7632d081791a 100644
--- a/test/CodeGen/Mips/llvm-ir/lshr.ll
+++ b/test/CodeGen/Mips/llvm-ir/lshr.ll
@@ -81,20 +81,24 @@ entry:
 
   ; M2:         srlv      $[[T0:[0-9]+]], $4, $7
   ; M2:         andi      $[[T1:[0-9]+]], $7, 32
-  ; M2:         bnez      $[[T1]], $[[BB0:BB[0-9_]+]]
+  ; M2:         beqz      $[[T1]], $[[BB0:BB[0-9_]+]]
   ; M2:         move      $3, $[[T0]]
+  ; M2:         beqz      $[[T1]], $[[BB1:BB[0-9_]+]]
+  ; M2:         addiu     $2, $zero, 0
+  ; M2:         $[[EXIT:BB[0-9_]+]]:
+  ; M2:         jr        $ra
+  ; M2:         nop
+  ; M2:         $[[BB0]]:
   ; M2:         srlv      $[[T2:[0-9]+]], $5, $7
   ; M2:         not       $[[T3:[0-9]+]], $7
   ; M2:         sll       $[[T4:[0-9]+]], $4, 1
   ; M2:         sllv      $[[T5:[0-9]+]], $[[T4]], $[[T3]]
   ; M2:         or        $3, $[[T3]], $[[T2]]
-  ; M2:         $[[BB0]]:
-  ; M2:         bnez      $[[T1]], $[[BB1:BB[0-9_]+]]
+  ; M2:         bnez      $[[T1]], $[[EXIT]]
   ; M2:         addiu     $2, $zero, 0
-  ; M2:         move      $2, $[[T0]]
   ; M2:         $[[BB1]]:
   ; M2:         jr        $ra
-  ; M2:         nop
+  ; M2:         move      $2, $[[T0]]
 
   ; 32R1-R5:    srlv      $[[T0:[0-9]+]], $5, $7
   ; 32R1-R5:    not       $[[T1:[0-9]+]], $7
@@ -160,20 +164,24 @@ entry:
   ; M3:             sll       $[[T0:[0-9]+]], $7, 0
   ; M3:             dsrlv     $[[T1:[0-9]+]], $4, $7
   ; M3:             andi      $[[T2:[0-9]+]], $[[T0]], 64
-  ; M3:             bnez      $[[T3:[0-9]+]], [[BB0:\.LBB[0-9_]+]]
+  ; M3:             beqz      $[[T3:[0-9]+]], [[BB0:\.LBB[0-9_]+]]
   ; M3:             move      $3, $[[T1]]
+  ; M3:             beqz      $[[T3]], [[BB1:\.LBB[0-9_]+]]
+  ; M3:             daddiu    $2, $zero, 0
+  ; M3:             [[EXIT:\.LBB[0-9_]+]]:
+  ; M3:             jr        $ra
+  ; M3:             nop
+  ; M3:             [[BB0]]:
   ; M3:             dsrlv     $[[T4:[0-9]+]], $5, $7
   ; M3:             dsll      $[[T5:[0-9]+]], $4, 1
   ; M3:             not       $[[T6:[0-9]+]], $[[T0]]
   ; M3:             dsllv     $[[T7:[0-9]+]], $[[T5]], $[[T6]]
   ; M3:             or        $3, $[[T7]], $[[T4]]
-  ; M3:             [[BB0]]:
-  ; M3:             bnez      $[[T3]], [[BB1:\.LBB[0-9_]+]]
+  ; M3:             bnez      $[[T3]], [[EXIT]]
   ; M3:             daddiu    $2, $zero, 0
-  ; M3:             move      $2, $[[T1]]
   ; M3:             [[BB1]]:
   ; M3:             jr        $ra
-  ; M3:             nop
+  ; M3:             move      $2, $[[T1]]
 
   ; GP64-NOT-R6:    dsrlv     $[[T0:[0-9]+]], $5, $7
   ; GP64-NOT-R6:    dsll      $[[T1:[0-9]+]], $4, 1
diff --git a/test/CodeGen/Mips/llvm-ir/shl.ll b/test/CodeGen/Mips/llvm-ir/shl.ll
index ce3b91373f7f0ab4d9bee3af5f92946ccb47254a..13545907e21e54667e4bc45d4cdf119bf06f4b4f 100644
--- a/test/CodeGen/Mips/llvm-ir/shl.ll
+++ b/test/CodeGen/Mips/llvm-ir/shl.ll
@@ -97,20 +97,24 @@ entry:
 
   ; M2:         sllv      $[[T0:[0-9]+]], $5, $7
   ; M2:         andi      $[[T1:[0-9]+]], $7, 32
-  ; M2:         bnez      $[[T1]], $[[BB0:BB[0-9_]+]]
+  ; M2:         beqz      $[[T1]], $[[BB0:BB[0-9_]+]]
   ; M2:         move      $2, $[[T0]]
+  ; M2:         beqz      $[[T1]], $[[BB1:BB[0-9_]+]]
+  ; M2:         addiu     $3, $zero, 0
+  ; M2:         $[[EXIT:BB[0-9_]+]]:
+  ; M2:         jr        $ra
+  ; M2:         nop
+  ; M2:         $[[BB0]]:
   ; M2:         sllv      $[[T2:[0-9]+]], $4, $7
   ; M2:         not       $[[T3:[0-9]+]], $7
   ; M2:         srl       $[[T4:[0-9]+]], $5, 1
   ; M2:         srlv      $[[T5:[0-9]+]], $[[T4]], $[[T3]]
   ; M2:         or        $2, $[[T2]], $[[T3]]
-  ; M2:         $[[BB0]]:
-  ; M2:         bnez      $[[T1]], $[[BB1:BB[0-9_]+]]
+  ; M2:         bnez      $[[T1]], $[[EXIT]]
   ; M2:         addiu     $3, $zero, 0
-  ; M2:         move      $3, $[[T0]]
   ; M2:         $[[BB1]]:
   ; M2:         jr        $ra
-  ; M2:         nop
+  ; M2:         move      $3, $[[T0]]
 
   ; 32R1-R5:    sllv      $[[T0:[0-9]+]], $4, $7
   ; 32R1-R5:    not       $[[T1:[0-9]+]], $7
@@ -176,20 +180,24 @@ entry:
   ; M3:             sll       $[[T0:[0-9]+]], $7, 0
   ; M3:             dsllv     $[[T1:[0-9]+]], $5, $7
   ; M3:             andi      $[[T2:[0-9]+]], $[[T0]], 64
-  ; M3:             bnez      $[[T3:[0-9]+]], [[BB0:\.LBB[0-9_]+]]
+  ; M3:             beqz      $[[T3:[0-9]+]], [[BB0:\.LBB[0-9_]+]]
   ; M3:             move      $2, $[[T1]]
+  ; M3:             beqz      $[[T3]], [[BB1:\.LBB[0-9_]+]]
+  ; M3:             daddiu    $3, $zero, 0
+  ; M3:             [[EXIT:\.LBB[0-9_]+]]:
+  ; M3:             jr        $ra
+  ; M3:             nop
+  ; M3:             [[BB0]]:
   ; M3:             dsllv     $[[T4:[0-9]+]], $4, $7
   ; M3:             dsrl      $[[T5:[0-9]+]], $5, 1
   ; M3:             not       $[[T6:[0-9]+]], $[[T0]]
   ; M3:             dsrlv     $[[T7:[0-9]+]], $[[T5]], $[[T6]]
   ; M3:             or        $2, $[[T4]], $[[T7]]
-  ; M3:             [[BB0]]:
-  ; M3:             bnez      $[[T3]], [[BB1:\.LBB[0-9_]+]]
+  ; M3:             bnez      $[[T3]], [[EXIT]]
   ; M3:             daddiu    $3, $zero, 0
-  ; M3:             move      $3, $[[T1]]
   ; M3:             [[BB1]]:
   ; M3:             jr        $ra
-  ; M3:             nop
+  ; M3:             move      $3, $[[T1]]
 
   ; GP64-NOT-R6:    dsllv     $[[T0:[0-9]+]], $4, $7
   ; GP64-NOT-R6:    dsrl      $[[T1:[0-9]+]], $5, 1
diff --git a/test/CodeGen/Mips/load-store-left-right.ll b/test/CodeGen/Mips/load-store-left-right.ll
index 3bd924a8120019559a350a96e9e79b7c9651de12..b998772d367c472bba2b19b99bf19cd948116b14 100644
--- a/test/CodeGen/Mips/load-store-left-right.ll
+++ b/test/CodeGen/Mips/load-store-left-right.ll
@@ -8,8 +8,8 @@
 ; RUN: llc -march=mips64   -mcpu=mips4    -target-abi=n64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS64,MIPS64-EB %s
 ; RUN: llc -march=mips64el -mcpu=mips64   -target-abi=n64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS64,MIPS64-EL %s
 ; RUN: llc -march=mips64   -mcpu=mips64   -target-abi=n64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS64,MIPS64-EB %s
-; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi=n64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS64,MIPS64-EL %s
-; RUN: llc -march=mips64   -mcpu=mips64r2 -target-abi=n64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS64,MIPS64-EB %s
+; RUN: llc -march=mips64el -mcpu=mips64r2 -target-abi=n64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS64,MIPS64R2-EL %s
+; RUN: llc -march=mips64   -mcpu=mips64r2 -target-abi=n64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS64,MIPS64R2-EB %s
 ; RUN: llc -march=mips64el -mcpu=mips64r6 -target-abi=n64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS64R6,MIPS64R6-EL %s
 ; RUN: llc -march=mips64   -mcpu=mips64r6 -target-abi=n64 -relocation-model=pic < %s | FileCheck -check-prefixes=ALL,MIPS64R6,MIPS64R6-EB %s
 
@@ -37,9 +37,15 @@ entry:
 ; MIPS64-EL:     lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
 ; MIPS64-EL:     lwr $[[R0]], 0($[[R1]])
 
+; MIPS64R2-EL:   lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
+; MIPS64R2-EL:   lwr $[[R0]], 0($[[R1]])
+
 ; MIPS64-EB:     lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
 ; MIPS64-EB:     lwr $[[R0]], 3($[[R1]])
 
+; MIPS64R2-EB:   lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; MIPS64R2-EB:   lwr $[[R0]], 3($[[R1]])
+
 ; MIPS64R6:      ld $[[PTR:[0-9]+]], %got_disp(si)(
 ; MIPS64R6:      lw $2, 0($[[PTR]])
 
@@ -63,9 +69,15 @@ entry:
 ; MIPS64-EL:     swl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
 ; MIPS64-EL:     swr $[[R0]], 0($[[R1]])
 
+; MIPS64R2-EL:   swl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
+; MIPS64R2-EL:   swr $[[R0]], 0($[[R1]])
+
 ; MIPS64-EB:     swl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
 ; MIPS64-EB:     swr $[[R0]], 3($[[R1]])
 
+; MIPS64R2-EB:   swl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; MIPS64R2-EB:   swr $[[R0]], 3($[[R1]])
+
 ; MIPS64R6:      ld $[[PTR:[0-9]+]], %got_disp(si)(
 ; MIPS64R6:      sw $4, 0($[[PTR]])
 
@@ -94,9 +106,15 @@ entry:
 ; MIPS64-EL:     ldl $[[R0:[0-9]+]], 7($[[R1:[0-9]+]])
 ; MIPS64-EL:     ldr $[[R0]], 0($[[R1]])
 
+; MIPS64R2-EL:   ldl $[[R0:[0-9]+]], 7($[[R1:[0-9]+]])
+; MIPS64R2-EL:   ldr $[[R0]], 0($[[R1]])
+
 ; MIPS64-EB:     ldl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
 ; MIPS64-EB:     ldr $[[R0]], 7($[[R1]])
 
+; MIPS64R2-EB:   ldl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; MIPS64R2-EB:   ldr $[[R0]], 7($[[R1]])
+
 ; MIPS64R6:      ld $[[PTR:[0-9]+]], %got_disp(sll)(
 ; MIPS64R6:      ld $2, 0($[[PTR]])
 
@@ -123,9 +141,15 @@ entry:
 ; MIPS64-EL:     lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
 ; MIPS64-EL:     lwr $[[R0]], 0($[[R1]])
 
+; MIPS64R2-EL:   lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
+; MIPS64R2-EL:   lwr $[[R0]], 0($[[R1]])
+
 ; MIPS64-EB:     lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
 ; MIPS64-EB:     lwr $[[R0]], 3($[[R1]])
 
+; MIPS64R2-EB:   lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; MIPS64R2-EB:   lwr $[[R0]], 3($[[R1]])
+
 ; MIPS64R6:      ld $[[PTR:[0-9]+]], %got_disp(si)(
 ; MIPS64R6:      lw $2, 0($[[PTR]])
 
@@ -159,9 +183,17 @@ entry:
 ; MIPS64-EL-DAG: daddiu $[[R4:[0-9]+]], $[[R3]], -1
 ; MIPS64-EL-DAG: and    ${{[0-9]+}}, $[[R0]], $[[R4]]
 
+; MIPS64R2-EL-DAG: lwl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
+; MIPS64R2-EL-DAG: lwr $[[R0]], 0($[[R1]])
+; MIPS64R2-EL-DAG: dext $[[R0]], $[[R0]], 0, 32
+
 ; MIPS64-EB:     lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
 ; MIPS64-EB:     lwr $[[R0]], 3($[[R1]])
 
+; MIPS64R2-EB:   lwl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; MIPS64R2-EB:   lwr $[[R0]], 3($[[R1]])
+; MIPS64R2-EB:   dext $[[R0]], $[[R0]], 0, 32
+
 ; MIPS64R6:      ld $[[PTR:[0-9]+]], %got_disp(sui)(
 ; MIPS64R6:      lwu $2, 0($[[PTR]])
 
@@ -191,9 +223,15 @@ entry:
 ; MIPS64-EL:     sdl $[[R0:[0-9]+]], 7($[[R1:[0-9]+]])
 ; MIPS64-EL:     sdr $[[R0]], 0($[[R1]])
 
+; MIPS64R2-EL:   sdl $[[R0:[0-9]+]], 7($[[R1:[0-9]+]])
+; MIPS64R2-EL:   sdr $[[R0]], 0($[[R1]])
+
 ; MIPS64-EB:     sdl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
 ; MIPS64-EB:     sdr $[[R0]], 7($[[R1]])
 
+; MIPS64R2-EB:   sdl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; MIPS64R2-EB:   sdr $[[R0]], 7($[[R1]])
+
 ; MIPS64R6:      ld $[[PTR:[0-9]+]], %got_disp(sll)(
 ; MIPS64R6:      sd $4, 0($[[PTR]])
 
@@ -217,9 +255,15 @@ entry:
 ; MIPS64-EL:     swl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
 ; MIPS64-EL:     swr $[[R0]], 0($[[R1]])
 
+; MIPS64R2-EL:   swl $[[R0:[0-9]+]], 3($[[R1:[0-9]+]])
+; MIPS64R2-EL:   swr $[[R0]], 0($[[R1]])
+
 ; MIPS64-EB:     swl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
 ; MIPS64-EB:     swr $[[R0]], 3($[[R1]])
 
+; MIPS64R2-EB:   swl $[[R0:[0-9]+]], 0($[[R1:[0-9]+]])
+; MIPS64R2-EB:   swr $[[R0]], 3($[[R1]])
+
 ; MIPS64R6:      ld $[[PTR:[0-9]+]], %got_disp(si)(
 ; MIPS64R6:      sw $4, 0($[[PTR]])
 
@@ -247,15 +291,23 @@ entry:
 ; MIPS32-EB:     lw $[[PTR:[0-9]+]], %got(struct_s0)(
 ; MIPS32R6:      lw $[[PTR:[0-9]+]], %got(struct_s0)(
 ; MIPS64-EL:     ld $[[PTR:[0-9]+]], %got_disp(struct_s0)(
+; MIPS64R2-EL:   ld $[[PTR:[0-9]+]], %got_disp(struct_s0)(
 ; MIPS64-EB:     ld $[[PTR:[0-9]+]], %got_disp(struct_s0)(
+; MIPS64R2-EB:   ld $[[PTR:[0-9]+]], %got_disp(struct_s0)(
 ; MIPS64R6:      ld $[[PTR:[0-9]+]], %got_disp(struct_s0)(
 
-; FIXME: We should be able to do better than this on MIPS32r6/MIPS64r6 since
-;        we have unaligned halfword load/store available
-; ALL-DAG:       lbu $[[R1:[0-9]+]], 0($[[PTR]])
-; ALL-DAG:       sb $[[R1]], 2($[[PTR]])
-; ALL-DAG:       lbu $[[R1:[0-9]+]], 1($[[PTR]])
-; ALL-DAG:       sb $[[R1]], 3($[[PTR]])
+; MIPS32-DAG:       lbu $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS32-DAG:       sb $[[R1]], 2($[[PTR]])
+; MIPS32-DAG:       lbu $[[R2:[0-9]+]], 1($[[PTR]])
+; MIPS32-DAG:       sb $[[R2]], 3($[[PTR]])
+
+; MIPS32R6:       lhu $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS32R6:       sh $[[R1]], 2($[[PTR]])
+
+; MIPS64-DAG:       lbu $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS64-DAG:       sb $[[R1]], 2($[[PTR]])
+; MIPS64-DAG:       lbu $[[R2:[0-9]+]], 1($[[PTR]])
+; MIPS64-DAG:       sb $[[R2]], 3($[[PTR]])
 
   %0 = load %struct.S0, %struct.S0* getelementptr inbounds (%struct.S0, %struct.S0* @struct_s0, i32 0), align 1
   store %struct.S0 %0, %struct.S0* getelementptr inbounds (%struct.S0, %struct.S0* @struct_s0, i32 1), align 1
@@ -268,37 +320,65 @@ entry:
 
 ; MIPS32-EL:     lw $[[PTR:[0-9]+]], %got(struct_s1)(
 ; MIPS32-EB:     lw $[[PTR:[0-9]+]], %got(struct_s1)(
-; MIPS32-DAG:    lbu $[[R1:[0-9]+]], 0($[[PTR]])
-; MIPS32-DAG:    sb $[[R1]], 4($[[PTR]])
-; MIPS32-DAG:    lbu $[[R1:[0-9]+]], 1($[[PTR]])
-; MIPS32-DAG:    sb $[[R1]], 5($[[PTR]])
-; MIPS32-DAG:    lbu $[[R1:[0-9]+]], 2($[[PTR]])
-; MIPS32-DAG:    sb $[[R1]], 6($[[PTR]])
-; MIPS32-DAG:    lbu $[[R1:[0-9]+]], 3($[[PTR]])
-; MIPS32-DAG:    sb $[[R1]], 7($[[PTR]])
+; MIPS32-EL-DAG:    lwl $[[R1:[0-9]+]], 3($[[PTR]])
+; MIPS32-EL-DAG:    lwr $[[R1]], 0($[[PTR]])
+; MIPS32-EL-DAG:    swl $[[R1]], 7($[[PTR]])
+; MIPS32-EL-DAG:    swr $[[R1]], 4($[[PTR]])
+; MIPS32-EB-DAG:    lwl $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS32-EB-DAG:    lwr $[[R1]], 3($[[PTR]])
+; MIPS32-EB-DAG:    swl $[[R1]], 4($[[PTR]])
+; MIPS32-EB-DAG:    swr $[[R1]], 7($[[PTR]])
+
+; MIPS32-NOLEFTRIGHT-DAG:    lbu $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS32-NOLEFTRIGHT-DAG:    sb $[[R1]], 4($[[PTR]])
+; MIPS32-NOLEFTRIGHT-DAG:    lbu $[[R1:[0-9]+]], 1($[[PTR]])
+; MIPS32-NOLEFTRIGHT-DAG:    sb $[[R1]], 5($[[PTR]])
+; MIPS32-NOLEFTRIGHT-DAG:    lbu $[[R1:[0-9]+]], 2($[[PTR]])
+; MIPS32-NOLEFTRIGHT-DAG:    sb $[[R1]], 6($[[PTR]])
+; MIPS32-NOLEFTRIGHT-DAG:    lbu $[[R1:[0-9]+]], 3($[[PTR]])
+; MIPS32-NOLEFTRIGHT-DAG:    sb $[[R1]], 7($[[PTR]])
 
 ; MIPS32R6:      lw $[[PTR:[0-9]+]], %got(struct_s1)(
-; MIPS32R6-DAG:  lhu $[[R1:[0-9]+]], 0($[[PTR]])
-; MIPS32R6-DAG:  sh $[[R1]], 4($[[PTR]])
-; MIPS32R6-DAG:  lhu $[[R1:[0-9]+]], 2($[[PTR]])
-; MIPS32R6-DAG:  sh $[[R1]], 6($[[PTR]])
+; MIPS32R6-DAG:  lw $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS32R6-DAG:  sw $[[R1]], 4($[[PTR]])
 
 ; MIPS64-EL:     ld $[[PTR:[0-9]+]], %got_disp(struct_s1)(
+; MIPS64R2-EL:   ld $[[PTR:[0-9]+]], %got_disp(struct_s1)(
 ; MIPS64-EB:     ld $[[PTR:[0-9]+]], %got_disp(struct_s1)(
-; MIPS64-DAG:    lbu $[[R1:[0-9]+]], 0($[[PTR]])
-; MIPS64-DAG:    sb $[[R1]], 4($[[PTR]])
-; MIPS64-DAG:    lbu $[[R1:[0-9]+]], 1($[[PTR]])
-; MIPS64-DAG:    sb $[[R1]], 5($[[PTR]])
-; MIPS64-DAG:    lbu $[[R1:[0-9]+]], 2($[[PTR]])
-; MIPS64-DAG:    sb $[[R1]], 6($[[PTR]])
-; MIPS64-DAG:    lbu $[[R1:[0-9]+]], 3($[[PTR]])
-; MIPS64-DAG:    sb $[[R1]], 7($[[PTR]])
+; MIPS64R2-EB:   ld $[[PTR:[0-9]+]], %got_disp(struct_s1)(
+
+; MIPS64-EL-DAG:    lwl $[[R1:[0-9]+]], 3($[[PTR]])
+; MIPS64-EL-DAG:    lwr $[[R1]], 0($[[PTR]])
+; MIPS64-EL-DAG:    swl $[[R1]], 7($[[PTR]])
+; MIPS64-EL-DAG:    swr $[[R1]], 4($[[PTR]])
+
+; MIPS64R2-EL-DAG:    lwl $[[R1:[0-9]+]], 3($[[PTR]])
+; MIPS64R2-EL-DAG:    lwr $[[R1]], 0($[[PTR]])
+; MIPS64R2-EL-DAG:    swl $[[R1]], 7($[[PTR]])
+; MIPS64R2-EL-DAG:    swr $[[R1]], 4($[[PTR]])
+
+; MIPS64-EB-DAG:    lwl $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS64-EB-DAG:    lwr $[[R1]], 3($[[PTR]])
+; MIPS64-EB-DAG:    swl $[[R1]], 4($[[PTR]])
+; MIPS64-EB-DAG:    swr $[[R1]], 7($[[PTR]])
+
+; MIPS64R2-EB-DAG:    lwl $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS64R2-EB-DAG:    lwr $[[R1]], 3($[[PTR]])
+; MIPS64R2-EB-DAG:    swl $[[R1]], 4($[[PTR]])
+; MIPS64R2-EB-DAG:    swr $[[R1]], 7($[[PTR]])
+
+; MIPS64-NOLEFTRIGHT-DAG:    lbu $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS64-NOLEFTRIGHT-DAG:    sb $[[R1]], 4($[[PTR]])
+; MIPS64-NOLEFTRIGHT-DAG:    lbu $[[R1:[0-9]+]], 1($[[PTR]])
+; MIPS64-NOLEFTRIGHT-DAG:    sb $[[R1]], 5($[[PTR]])
+; MIPS64-NOLEFTRIGHT-DAG:    lbu $[[R1:[0-9]+]], 2($[[PTR]])
+; MIPS64-NOLEFTRIGHT-DAG:    sb $[[R1]], 6($[[PTR]])
+; MIPS64-NOLEFTRIGHT-DAG:    lbu $[[R1:[0-9]+]], 3($[[PTR]])
+; MIPS64-NOLEFTRIGHT-DAG:    sb $[[R1]], 7($[[PTR]])
 
 ; MIPS64R6:      ld $[[PTR:[0-9]+]], %got_disp(struct_s1)(
-; MIPS64R6-DAG:  lhu $[[R1:[0-9]+]], 0($[[PTR]])
-; MIPS64R6-DAG:  sh $[[R1]], 4($[[PTR]])
-; MIPS64R6-DAG:  lhu $[[R1:[0-9]+]], 2($[[PTR]])
-; MIPS64R6-DAG:  sh $[[R1]], 6($[[PTR]])
+; MIPS64R6-DAG:  lw $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS64R6-DAG:  sw $[[R1]], 4($[[PTR]])
 
   %0 = load %struct.S1, %struct.S1* getelementptr inbounds (%struct.S1, %struct.S1* @struct_s1, i32 0), align 1
   store %struct.S1 %0, %struct.S1* getelementptr inbounds (%struct.S1, %struct.S1* @struct_s1, i32 1), align 1
@@ -336,30 +416,34 @@ entry:
 ; MIPS32R6-DAG:  sw $[[R1]],       12($[[PTR]])
 
 ; MIPS64-EL:     ld $[[PTR:[0-9]+]], %got_disp(struct_s2)(
-; MIPS64-EL-DAG: lwl $[[R1:[0-9]+]], 3($[[PTR]])
-; MIPS64-EL-DAG: lwr $[[R1]],        0($[[PTR]])
-; MIPS64-EL-DAG: swl $[[R1]],       11($[[PTR]])
-; MIPS64-EL-DAG: swr $[[R1]],        8($[[PTR]])
-; MIPS64-EL-DAG: lwl $[[R1:[0-9]+]], 7($[[PTR]])
-; MIPS64-EL-DAG: lwr $[[R1]],        4($[[PTR]])
-; MIPS64-EL-DAG: swl $[[R1]],       15($[[PTR]])
-; MIPS64-EL-DAG: swr $[[R1]],       12($[[PTR]])
+
+; MIPS64-EL-DAG: ldl $[[R1:[0-9]+]], 7($[[PTR]])
+; MIPS64-EL-DAG: ldr $[[R1]],        0($[[PTR]])
+; MIPS64-EL-DAG: sdl $[[R1]],       15($[[PTR]])
+; MIPS64-EL-DAG: sdr $[[R1]],        8($[[PTR]])
+
+; MIPS64R2-EL:     ld $[[PTR:[0-9]+]], %got_disp(struct_s2)(
+
+; MIPS64R2-EL-DAG: ldl $[[R1:[0-9]+]], 7($[[PTR]])
+; MIPS64R2-EL-DAG: ldr $[[R1]],        0($[[PTR]])
+; MIPS64R2-EL-DAG: sdl $[[R1]],       15($[[PTR]])
+; MIPS64R2-EL-DAG: sdr $[[R1]],        8($[[PTR]])
 
 ; MIPS64-EB:     ld $[[PTR:[0-9]+]], %got_disp(struct_s2)(
-; MIPS64-EB-DAG: lwl $[[R1:[0-9]+]], 0($[[PTR]])
-; MIPS64-EB-DAG: lwr $[[R1]],        3($[[PTR]])
-; MIPS64-EB-DAG: swl $[[R1]],        8($[[PTR]])
-; MIPS64-EB-DAG: swr $[[R1]],       11($[[PTR]])
-; MIPS64-EB-DAG: lwl $[[R1:[0-9]+]], 4($[[PTR]])
-; MIPS64-EB-DAG: lwr $[[R1]],        7($[[PTR]])
-; MIPS64-EB-DAG: swl $[[R1]],       12($[[PTR]])
-; MIPS64-EB-DAG: swr $[[R1]],       15($[[PTR]])
+; MIPS64-EB-DAG: ldl $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS64-EB-DAG: ldr $[[R1]],        7($[[PTR]])
+; MIPS64-EB-DAG: sdl $[[R1]],        8($[[PTR]])
+; MIPS64-EB-DAG: sdr $[[R1]],       15($[[PTR]])
+
+; MIPS64R2-EB:     ld $[[PTR:[0-9]+]], %got_disp(struct_s2)(
+; MIPS64R2-EB-DAG: ldl $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS64R2-EB-DAG: ldr $[[R1]],        7($[[PTR]])
+; MIPS64R2-EB-DAG: sdl $[[R1]],        8($[[PTR]])
+; MIPS64R2-EB-DAG: sdr $[[R1]],       15($[[PTR]])
 
 ; MIPS64R6:      ld $[[PTR:[0-9]+]], %got_disp(struct_s2)(
-; MIPS64R6-DAG:  lw $[[R1:[0-9]+]], 0($[[PTR]])
-; MIPS64R6-DAG:  sw $[[R1]],        8($[[PTR]])
-; MIPS64R6-DAG:  lw $[[R1:[0-9]+]], 4($[[PTR]])
-; MIPS64R6-DAG:  sw $[[R1]],       12($[[PTR]])
+; MIPS64R6-DAG:  ld $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS64R6-DAG:  sd $[[R1]],        8($[[PTR]])
 
   %0 = load %struct.S2, %struct.S2* getelementptr inbounds (%struct.S2, %struct.S2* @struct_s2, i32 0), align 1
   store %struct.S2 %0, %struct.S2* getelementptr inbounds (%struct.S2, %struct.S2* @struct_s2, i32 1), align 1
@@ -416,20 +500,39 @@ entry:
 ; MIPS64-EL-DAG: lwl $[[R1:[0-9]+]], 3($[[PTR]])
 ; MIPS64-EL-DAG: lwr $[[R1]],   0($[[PTR]])
 
-; MIPS64-EB:     ld $[[SPTR:[0-9]+]], %got_disp(arr)(
-; MIPS64-EB-DAG: lwl  $[[R1:[0-9]+]], 0($[[PTR]])
-; MIPS64-EB-DAG: lwr  $[[R1]],   3($[[PTR]])
-; MIPS64-EB-DAG: dsll $[[R1]], $[[R1]], 32
+; MIPS64R2-EL:     ld $[[SPTR:[0-9]+]], %got_disp(arr)(
+; MIPS64R2-EL-DAG: lwl $[[R1:[0-9]+]], 3($[[PTR]])
+; MIPS64R2-EL-DAG: lwr $[[R1]],   0($[[PTR]])
+
+; MIPS64-EB: ld $[[SPTR:[0-9]+]], %got_disp(arr)(
 ; MIPS64-EB-DAG: lbu  $[[R2:[0-9]+]], 5($[[PTR]])
 ; MIPS64-EB-DAG: lbu  $[[R3:[0-9]+]], 4($[[PTR]])
 ; MIPS64-EB-DAG: dsll $[[T0:[0-9]+]], $[[R3]], 8
 ; MIPS64-EB-DAG: or   $[[T1:[0-9]+]], $[[T0]], $[[R2]]
-; MIPS64-EB-DAG: dsll $[[T1]], $[[T1]], 16
-; MIPS64-EB-DAG: or   $[[T3:[0-9]+]], $[[R1]], $[[T1]]
 ; MIPS64-EB-DAG: lbu  $[[R4:[0-9]+]], 6($[[PTR]])
+; MIPS64-EB-DAG: dsll $[[T1]], $[[T1]], 16
+; MIPS64-EB-DAG: lwl  $[[R1:[0-9]+]], 0($[[PTR]])
+; MIPS64-EB-DAG: lwr  $[[R1]],   3($[[PTR]])
+; MIPS64-EB-DAG: dsll $[[R5:[0-9]+]], $[[R1]], 32
+; MIPS64-EB-DAG: or   $[[T3:[0-9]+]], $[[R5]], $[[T1]]
 ; MIPS64-EB-DAG: dsll $[[T4:[0-9]+]], $[[R4]], 8
 ; MIPS64-EB-DAG: or   $4, $[[T3]], $[[T4]]
 
+; MIPS64R2-EB: ld $[[SPTR:[0-9]+]], %got_disp(arr)(
+; MIPS64R2-EB-DAG: lbu  $[[R1:[0-9]+]], 5($[[PTR]])
+; MIPS64R2-EB-DAG: lbu  $[[R2:[0-9]+]], 4($[[PTR]])
+; MIPS64R2-EB-DAG: dsll $[[T0:[0-9]+]], $[[R2]], 8
+; MIPS64R2-EB-DAG: or   $[[T1:[0-9]+]], $[[T0]], $[[R1]]
+; MIPS64R2-EB-DAG: dsll $[[T1]], $[[T1]], 16
+; MIPS64R2-EB-DAG: lwl  $[[R3:[0-9]+]], 0($[[PTR]])
+; MIPS64R2-EB-DAG: lwr  $[[R3]], 3($[[PTR]])
+; MIPS64R2-EB-DAG: dext $[[R3]], $[[R3]], 0, 32
+; MIPS64R2-EB-DAG: dsll $[[R3]], $[[R3]], 32
+; MIPS64R2-EB-DAG: or   $[[T2:[0-9]+]], $[[R3]], $[[T1]]
+; MIPS64R2-EB-DAG: lbu  $[[R4:[0-9]+]], 6($[[PTR]])
+; MIPS64R2-EB-DAG: dsll $[[T3:[0-9]+]], $[[R4]], 8
+; MIPS64R2-EB-DAG: or   $4, $[[T2]], $[[T3]]
+
 ; MIPS64R6:      ld $[[SPTR:[0-9]+]], %got_disp(arr)(
 
   tail call void @extern_func([7 x i8]* byval @arr) nounwind
diff --git a/test/CodeGen/Mips/mature-mc-support.ll b/test/CodeGen/Mips/mature-mc-support.ll
index 6e5998d8a7cb7a15b7209468fe12ecb6efdc6ab8..9c93e96a376b8f2aa95298535793d271a4943549 100644
--- a/test/CodeGen/Mips/mature-mc-support.ll
+++ b/test/CodeGen/Mips/mature-mc-support.ll
@@ -29,4 +29,4 @@
 
 module asm "	.this_directive_is_very_unlikely_to_exist"
 
-; CHECK: LLVM ERROR: Error parsing inline asm
+; CHECK: error: unknown directive
diff --git a/test/CodeGen/Mips/micromips-compact-branches.ll b/test/CodeGen/Mips/micromips-compact-branches.ll
index c689944d386b158efa8ce649b26f443c3fc1c471..332cd8cd105c0bee15cfe4d9c838893a75531ee4 100644
--- a/test/CodeGen/Mips/micromips-compact-branches.ll
+++ b/test/CodeGen/Mips/micromips-compact-branches.ll
@@ -6,7 +6,7 @@ entry:
   %x = alloca i32, align 4
   %0 = load i32, i32* %x, align 4
   %cmp = icmp eq i32 %0, 0
-  br i1 %cmp, label %if.then, label %if.end
+  br i1 %cmp, label %if.then, label %if.end, !prof !1
 
 if.then:
   store i32 10, i32* %x, align 4
@@ -17,3 +17,4 @@ if.end:
 }
 
 ; CHECK: bnezc
+!1 = !{!"branch_weights", i32 2, i32 1}
diff --git a/test/CodeGen/Mips/micromips-li.ll b/test/CodeGen/Mips/micromips-li.ll
index ac315f93825115ef800468deba2caf002e91417c..997f4e9196afaf118c7d1536271f910cdf5f6706 100644
--- a/test/CodeGen/Mips/micromips-li.ll
+++ b/test/CodeGen/Mips/micromips-li.ll
@@ -13,6 +13,6 @@ entry:
   ret i32 0
 }
 
-; CHECK: li16   ${{[2-7]|16|17}}, 1
 ; CHECK: addiu  ${{[0-9]+}}, $zero, 2148
+; CHECK: li16   ${{[2-7]|16|17}}, 1
 ; CHECK: ori ${{[0-9]+}}, $zero, 33332
diff --git a/test/CodeGen/Mips/mips64-f128-call.ll b/test/CodeGen/Mips/mips64-f128-call.ll
index c59f25ef4afa146289003ed35bcce06667697d8b..19fa8fc752450d8e08d9298b8d4d013887fd490f 100644
--- a/test/CodeGen/Mips/mips64-f128-call.ll
+++ b/test/CodeGen/Mips/mips64-f128-call.ll
@@ -4,8 +4,8 @@
 @gld1 = external global fp128
 
 ; CHECK: foo0
-; CHECK: sdc1  $f12, %lo(gld0)(${{[0-9]+}})
-; CHECK: sdc1  $f13, 8(${{[0-9]+}})
+; CHECK-DAG: sdc1  $f12, %lo(gld0)(${{[0-9]+}})
+; CHECK-DAG: sdc1  $f13, 8(${{[0-9]+}})
 
 define void @foo0(fp128 %a0) {
 entry:
@@ -14,8 +14,8 @@ entry:
 }
 
 ; CHECK: foo1
-; CHECK: ldc1  $f12, %lo(gld0)(${{[0-9]+}})
-; CHECK: ldc1  $f13, 8(${{[0-9]+}})
+; CHECK-DAG: ldc1  $f12, %lo(gld0)(${{[0-9]+}})
+; CHECK-DAG: ldc1  $f13, 8(${{[0-9]+}})
 
 define void @foo1() {
 entry:
@@ -26,11 +26,11 @@ entry:
 
 declare void @foo2(fp128)
 
+
 ; CHECK: foo3:
-; CHECK: daddiu $[[R0:[0-9]+]], ${{[0-9]+}}, %hi(gld0)
-; CHECK: dsll $[[R1:[0-9]+]], $[[R0]], 16
+
+; CHECK: daddiu $[[R2:[0-9]+]], $[[R1:[0-9]+]], %lo(gld0)
 ; CHECK: sdc1 $f0, %lo(gld0)($[[R1]])
-; CHECK: daddiu $[[R2:[0-9]]], $[[R1]], %lo(gld0)
 ; CHECK: sdc1 $f2, 8($[[R2]])
 ; CHECK: daddiu $[[R3:[0-9]+]], ${{[0-9]+}}, %hi(gld1)
 ; CHECK: dsll $[[R4:[0-9]+]], $[[R3]], 16
@@ -39,7 +39,6 @@ declare void @foo2(fp128)
 ; CHECK: ldc1 $f2, 8($[[R5]])
 
 
-
 define fp128 @foo3() {
 entry:
   %call = tail call fp128 @foo4()
diff --git a/test/CodeGen/Mips/mips64-f128.ll b/test/CodeGen/Mips/mips64-f128.ll
index 304ab8b90d33bae006480bf8249c5564c2375eda..a6dafb1abfd664037ca157e29ab2b0dafa488739 100644
--- a/test/CodeGen/Mips/mips64-f128.ll
+++ b/test/CodeGen/Mips/mips64-f128.ll
@@ -1,15 +1,15 @@
 ; RUN: llc -mtriple=mips64el-unknown-unknown -mcpu=mips4 -mattr=+soft-float -O1 \
 ; RUN:     -disable-mips-delay-filler -relocation-model=pic < %s | FileCheck \
-; RUN:     %s -check-prefixes=ALL,C_CC_FMT,PRER6
+; RUN:     %s -check-prefixes=ALL,C_CC_FMT,PRER6,NOT-R2R6
 ; RUN: llc -mtriple=mips64el-unknown-unknown -mcpu=mips64 -mattr=+soft-float -O1 \
 ; RUN:     -disable-mips-delay-filler -relocation-model=pic < %s | FileCheck \
-; RUN:     %s -check-prefixes=ALL,C_CC_FMT,PRER6
+; RUN:     %s -check-prefixes=ALL,C_CC_FMT,PRER6,NOT-R2R6
 ; RUN: llc -mtriple=mips64el-unknown-unknown -mcpu=mips64r2 -mattr=+soft-float \
 ; RUN:     -O1 -disable-mips-delay-filler -relocation-model=pic < %s | FileCheck \
-; RUN:     %s -check-prefixes=ALL,C_CC_FMT,PRER6
+; RUN:     %s -check-prefixes=ALL,C_CC_FMT,PRER6,R2R6
 ; RUN: llc -mtriple=mips64el-unknown-unknown -mcpu=mips64r6 -mattr=+soft-float \
 ; RUN:     -O1 -disable-mips-delay-filler -relocation-model=pic < %s | FileCheck \
-; RUN:     %s -check-prefixes=ALL,CMP_CC_FMT,R6
+; RUN:     %s -check-prefixes=ALL,CMP_CC_FMT,R6,R2R6
 
 @gld0 = external global fp128
 @gld1 = external global fp128
@@ -242,12 +242,16 @@ entry:
 }
 
 ; ALL-LABEL:             libcall1_fabsl:
-; ALL-DAG: ld      $[[R0:[0-9]+]], 8($[[R4:[0-9]+]])
-; ALL-DAG: daddiu  $[[R1:[0-9]+]], $zero, 1
-; ALL-DAG: dsll    $[[R2:[0-9]+]], $[[R1]], 63
-; ALL-DAG: daddiu  $[[R3:[0-9]+]], $[[R2]], -1
-; ALL-DAG: and     $4, $[[R0]], $[[R3]]
-; ALL-DAG: ld      $2, 0($[[R4]])
+; NOT-R2R6-DAG: ld      $[[R0:[0-9]+]], 8($[[R4:[0-9]+]])
+; NOT-R2R6-DAG: daddiu  $[[R1:[0-9]+]], $zero, 1
+; NOT-R2R6-DAG: dsll    $[[R2:[0-9]+]], $[[R1]], 63
+; NOT-R2R6-DAG: daddiu  $[[R3:[0-9]+]], $[[R2]], -1
+; NOT-R2R6-DAG: and     $4, $[[R0]], $[[R3]]
+; NOT-R2R6-DAG: ld      $2, 0($[[R4]])
+
+; R2R6-DAG: ld     $[[R0:[0-9]+]], 0($[[R3:[0-9]+]])
+; R2R6-DAG: ld     $[[R1:[0-9]+]], 8($[[R3]])
+; R2R6-DAG: dextm  $[[R2:[0-9]+]], $[[R1]], 0, 63
 
 define fp128 @libcall1_fabsl() {
 entry:
@@ -414,17 +418,19 @@ entry:
 declare fp128 @llvm.powi.f128(fp128, i32) #3
 
 ; ALL-LABEL:     libcall2_copysignl:
-; ALL-DAG: daddiu $[[R2:[0-9]+]], $zero, 1
-; ALL-DAG: dsll   $[[R3:[0-9]+]], $[[R2]], 63
-; ALL-DAG: ld     $[[R0:[0-9]+]], %got_disp(gld1)
-; ALL-DAG: ld     $[[R1:[0-9]+]], 8($[[R0]])
-; ALL-DAG: and    $[[R4:[0-9]+]], $[[R1]], $[[R3]]
-; ALL-DAG: ld     $[[R5:[0-9]+]], %got_disp(gld0)
-; ALL-DAG: ld     $[[R6:[0-9]+]], 8($[[R5]])
-; ALL-DAG: daddiu $[[R7:[0-9]+]], $[[R3]], -1
-; ALL-DAG: and    $[[R8:[0-9]+]], $[[R6]], $[[R7]]
-; ALL-DAG: or     $4, $[[R8]], $[[R4]]
-; ALL-DAG: ld     $2, 0($[[R5]])
+; ALL-DAG:      daddiu $[[R2:[0-9]+]], $zero, 1
+; ALL-DAG:      dsll   $[[R3:[0-9]+]], $[[R2]], 63
+; ALL-DAG:      ld     $[[R0:[0-9]+]], %got_disp(gld1)
+; ALL-DAG:      ld     $[[R1:[0-9]+]], 8($[[R0]])
+; ALL-DAG:      and    $[[R4:[0-9]+]], $[[R1]], $[[R3]]
+; ALL-DAG:      ld     $[[R5:[0-9]+]], %got_disp(gld0)
+; ALL-DAG:      ld     $[[R6:[0-9]+]], 8($[[R5]])
+; NOT-R2R6-DAG: daddiu $[[R7:[0-9]+]], $[[R3]], -1
+; NOT-R2R6-DAG: and    $[[R8:[0-9]+]], $[[R6]], $[[R7]]
+; NOT-R2R6-DAG: or     $4, $[[R8]], $[[R4]]
+; R2R6-DAG:     dextm  $[[R7:[0-9]+]], $[[R6]], 0, 63
+; R2R6-DAG:     or     $4, $[[R7]], $[[R4]]
+; ALL-DAG:      ld     $2, 0($[[R5]])
 
 define fp128 @libcall2_copysignl() {
 entry:
@@ -577,10 +583,10 @@ entry:
 
 ; ALL-LABEL: store_LD_LD:
 ; ALL: ld $[[R0:[0-9]+]], %got_disp(gld1)
-; ALL: ld $[[R1:[0-9]+]], 0($[[R0]])
 ; ALL: ld $[[R2:[0-9]+]], 8($[[R0]])
 ; ALL: ld $[[R3:[0-9]+]], %got_disp(gld0)
 ; ALL: sd $[[R2]], 8($[[R3]])
+; ALL: ld $[[R1:[0-9]+]], 0($[[R0]])
 ; ALL: sd $[[R1]], 0($[[R3]])
 
 define void @store_LD_LD() {
diff --git a/test/CodeGen/Mips/mno-ldc1-sdc1.ll b/test/CodeGen/Mips/mno-ldc1-sdc1.ll
index 9663138d4c8175eefc8ab9e2ef3f2001f9075116..0260afaa18602a6fb41afd0b10597ef4a7bf9325 100644
--- a/test/CodeGen/Mips/mno-ldc1-sdc1.ll
+++ b/test/CodeGen/Mips/mno-ldc1-sdc1.ll
@@ -130,12 +130,12 @@
 ; MM-MNO-PIC:         addiu   $[[R1:[0-9]+]], $[[R0]], %lo(_gp_disp)
 ; MM-MNO-PIC:         addu    $[[R2:[0-9]+]], $[[R1]], $25
 ; MM-MNO-PIC:         lw      $[[R3:[0-9]+]], %got(g0)($[[R2]])
-; MM-MNO-PIC:         lw16    $[[R4:[0-9]+]], 0($[[R3]])
-; MM-MNO-PIC:         lw16    $[[R5:[0-9]+]], 4($[[R3]])
-; MM-MNO-LE-PIC:      mtc1    $[[R4]], $f0
-; MM-MNO-LE-PIC:      mthc1   $[[R5]], $f0
-; MM-MNO-BE-PIC:      mtc1    $[[R5]], $f0
-; MM-MNO-BE-PIC:      mthc1   $[[R4]], $f0
+; MM-MNO-PIC-DAG:     lw16    $[[R4:[0-9]+]], 0($[[R3]])
+; MM-MNO-PIC-DAG:     lw16    $[[R5:[0-9]+]], 4($[[R3]])
+; MM-MNO-LE-PIC-DAG:  mtc1    $[[R4]], $f0
+; MM-MNO-LE-PIC-DAG:  mthc1   $[[R5]], $f0
+; MM-MNO-BE-PIC-DAG:  mtc1    $[[R5]], $f0
+; MM-MNO-BE-PIC-DAG:  mthc1   $[[R4]], $f0
 
 ; MM-STATIC-PIC:      lui     $[[R0:[0-9]+]], %hi(g0)
 ; MM-STATIC-PIC:      ldc1    $f0, %lo(g0)($[[R0]])
@@ -214,13 +214,13 @@ entry:
 ; MM-MNO-PIC:         lui     $[[R0:[0-9]+]], %hi(_gp_disp)
 ; MM-MNO-PIC:         addiu   $[[R1:[0-9]+]], $[[R0]], %lo(_gp_disp)
 ; MM-MNO-PIC:         addu    $[[R2:[0-9]+]], $[[R1]], $25
-; MM-MNO-LE-PIC:      mfc1    $[[R3:[0-9]+]], $f12
-; MM-MNO-BE-PIC:      mfhc1   $[[R3:[0-9]+]], $f12
-; MM-MNO-PIC:         lw      $[[R4:[0-9]+]], %got(g0)($[[R2]])
-; MM-MNO-PIC:         sw16    $[[R3]], 0($[[R4]])
-; MM-MNO-LE-PIC:      mfhc1   $[[R5:[0-9]+]], $f12
-; MM-MNO-BE-PIC:      mfc1    $[[R5:[0-9]+]], $f12
-; MM-MNO-PIC:         sw16    $[[R5]], 4($[[R4]])
+; MM-MNO-LE-PIC-DAG:  mfc1    $[[R3:[0-9]+]], $f12
+; MM-MNO-BE-PIC-DAG:  mfhc1   $[[R3:[0-9]+]], $f12
+; MM-MNO-PIC-DAG:     lw      $[[R4:[0-9]+]], %got(g0)($[[R2]])
+; MM-MNO-PIC-DAG:     sw16    $[[R3]], 0($[[R4]])
+; MM-MNO-LE-PIC-DAG:  mfhc1   $[[R5:[0-9]+]], $f12
+; MM-MNO-BE-PIC-DAG:  mfc1    $[[R5:[0-9]+]], $f12
+; MM-MNO-PIC-DAG:     sw16    $[[R5]], 4($[[R4]])
 
 ; MM-STATIC-PIC:      lui     $[[R0:[0-9]+]], %hi(g0)
 ; MM-STATIC-PIC:      sdc1    $f12, %lo(g0)($[[R0]])
@@ -267,8 +267,8 @@ entry:
 
 ; MM-MNO-PIC:    sll16   $[[R0:[0-9]+]], $5, 3
 ; MM-MNO-PIC:    addu16  $[[R1:[0-9]+]], $4, $[[R0]]
-; MM-MNO-PIC:    lw16    $[[R2:[0-9]+]], 0($[[R1]])
-; MM-MNO-PIC:    lw16    $[[R3:[0-9]+]], 4($[[R1]])
+; MM-MNO-PIC-DAG: lw16    $[[R2:[0-9]+]], 0($[[R1]])
+; MM-MNO-PIC-DAG: lw16    $[[R3:[0-9]+]], 4($[[R1]])
 ; MM-MNO-LE-PIC: mtc1    $[[R2]], $f0
 ; MM-MNO-LE-PIC: mthc1   $[[R3]], $f0
 ; MM-MNO-BE-PIC: mtc1    $[[R3]], $f0
@@ -313,14 +313,14 @@ entry:
 ; MM:            addu16  $[[R1:[0-9]+]], $6, $[[R0]]
 ; MM:            sdc1    $f12, 0($[[R1]])
 
-; MM-MNO-PIC:    sll16   $[[R0:[0-9]+]], $7, 3
-; MM-MNO-PIC:    addu16  $[[R1:[0-9]+]], $6, $[[R0]]
-; MM-MNO-LE-PIC: mfc1    $[[R2:[0-9]+]], $f12
-; MM-MNO-BE-PIC: mfhc1   $[[R2:[0-9]+]], $f12
-; MM-MNO-PIC:    sw16    $[[R2]], 0($[[R1]])
-; MM-MNO-LE-PIC: mfhc1   $[[R3:[0-9]+]], $f12
-; MM-MNO-BE-PIC: mfc1    $[[R3:[0-9]+]], $f12
-; MM-MNO-PIC:    sw16    $[[R3]], 4($[[R1]])
+; MM-MNO-PIC:        sll16   $[[R0:[0-9]+]], $7, 3
+; MM-MNO-PIC:        addu16  $[[R1:[0-9]+]], $6, $[[R0]]
+; MM-MNO-LE-PIC-DAG: mfc1    $[[R2:[0-9]+]], $f12
+; MM-MNO-BE-PIC-DAG: mfhc1   $[[R2:[0-9]+]], $f12
+; MM-MNO-PIC-DAG:    sw16    $[[R2]], 0($[[R1]])
+; MM-MNO-LE-PIC-DAG: mfhc1   $[[R3:[0-9]+]], $f12
+; MM-MNO-BE-PIC-DAG: mfc1    $[[R3:[0-9]+]], $f12
+; MM-MNO-PIC-DAG:    sw16    $[[R3]], 4($[[R1]])
 
 ; MM-STATIC-PIC: sll16   $[[R0:[0-9]+]], $7, 3
 ; MM-STATIC-PIC: addu16  $[[R1:[0-9]+]], $6, $[[R0]]
diff --git a/test/CodeGen/Mips/msa/3r_4r_widen.ll b/test/CodeGen/Mips/msa/3r_4r_widen.ll
index fe248eeb566b3c6d7d112a896d9be911f5d85619..467cff5a1a3c39904e52c24b0cc1d73d2244534c 100644
--- a/test/CodeGen/Mips/msa/3r_4r_widen.ll
+++ b/test/CodeGen/Mips/msa/3r_4r_widen.ll
@@ -5,18 +5,16 @@
 ; RUN: llc -march=mips -mattr=+msa,+fp64 < %s | FileCheck %s
 ; RUN: llc -march=mipsel -mattr=+msa,+fp64 < %s | FileCheck %s
 
-@llvm_mips_dpadd_s_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
 @llvm_mips_dpadd_s_h_ARG2 = global <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>, align 16
 @llvm_mips_dpadd_s_h_ARG3 = global <16 x i8> <i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39>, align 16
 @llvm_mips_dpadd_s_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
 
 define void @llvm_mips_dpadd_s_h_test() nounwind {
 entry:
-  %0 = load <8 x i16>, <8 x i16>* @llvm_mips_dpadd_s_h_ARG1
-  %1 = load <16 x i8>, <16 x i8>* @llvm_mips_dpadd_s_h_ARG2
-  %2 = load <16 x i8>, <16 x i8>* @llvm_mips_dpadd_s_h_ARG3
-  %3 = tail call <8 x i16> @llvm.mips.dpadd.s.h(<8 x i16> %0, <16 x i8> %1, <16 x i8> %2)
-  store <8 x i16> %3, <8 x i16>* @llvm_mips_dpadd_s_h_RES
+  %0 = load <16 x i8>, <16 x i8>* @llvm_mips_dpadd_s_h_ARG2
+  %1 = load <16 x i8>, <16 x i8>* @llvm_mips_dpadd_s_h_ARG3
+  %2 = tail call <8 x i16> @llvm.mips.dpadd.s.h(<8 x i16> <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>, <16 x i8> %0, <16 x i8> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_dpadd_s_h_RES
   ret void
 }
 
@@ -25,23 +23,21 @@ declare <8 x i16> @llvm.mips.dpadd.s.h(<8 x i16>, <16 x i8>, <16 x i8>) nounwind
 ; CHECK: llvm_mips_dpadd_s_h_test:
 ; CHECK: ld.b
 ; CHECK: ld.b
-; CHECK: ld.h
-; CHECK: dpadd_s.h
+; CHECK: ldi.h [[R1:\$w[0-9]+]],
+; CHECK: dpadd_s.h [[R1]],
 ; CHECK: st.h
 ; CHECK: .size llvm_mips_dpadd_s_h_test
 ;
-@llvm_mips_dpadd_s_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
 @llvm_mips_dpadd_s_w_ARG2 = global <8 x i16> <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>, align 16
 @llvm_mips_dpadd_s_w_ARG3 = global <8 x i16> <i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>, align 16
 @llvm_mips_dpadd_s_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
 
 define void @llvm_mips_dpadd_s_w_test() nounwind {
 entry:
-  %0 = load <4 x i32>, <4 x i32>* @llvm_mips_dpadd_s_w_ARG1
-  %1 = load <8 x i16>, <8 x i16>* @llvm_mips_dpadd_s_w_ARG2
-  %2 = load <8 x i16>, <8 x i16>* @llvm_mips_dpadd_s_w_ARG3
-  %3 = tail call <4 x i32> @llvm.mips.dpadd.s.w(<4 x i32> %0, <8 x i16> %1, <8 x i16> %2)
-  store <4 x i32> %3, <4 x i32>* @llvm_mips_dpadd_s_w_RES
+  %0 = load <8 x i16>, <8 x i16>* @llvm_mips_dpadd_s_w_ARG2
+  %1 = load <8 x i16>, <8 x i16>* @llvm_mips_dpadd_s_w_ARG3
+  %2 = tail call <4 x i32> @llvm.mips.dpadd.s.w(<4 x i32> <i32 4, i32 4, i32 4, i32 4>, <8 x i16> %0, <8 x i16> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_dpadd_s_w_RES
   ret void
 }
 
@@ -50,48 +46,44 @@ declare <4 x i32> @llvm.mips.dpadd.s.w(<4 x i32>, <8 x i16>, <8 x i16>) nounwind
 ; CHECK: llvm_mips_dpadd_s_w_test:
 ; CHECK: ld.h
 ; CHECK: ld.h
-; CHECK: ld.w
-; CHECK: dpadd_s.w
+; CHECK: ldi.w [[R1:\$w[0-9]+]],
+; CHECK: dpadd_s.w [[R1]],
 ; CHECK: st.w
 ; CHECK: .size llvm_mips_dpadd_s_w_test
 ;
-@llvm_mips_dpadd_s_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
 @llvm_mips_dpadd_s_d_ARG2 = global <4 x i32> <i32 2, i32 3, i32 4, i32 5>, align 16
 @llvm_mips_dpadd_s_d_ARG3 = global <4 x i32> <i32 6, i32 7, i32 8, i32 9>, align 16
 @llvm_mips_dpadd_s_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
 
 define void @llvm_mips_dpadd_s_d_test() nounwind {
 entry:
-  %0 = load <2 x i64>, <2 x i64>* @llvm_mips_dpadd_s_d_ARG1
-  %1 = load <4 x i32>, <4 x i32>* @llvm_mips_dpadd_s_d_ARG2
-  %2 = load <4 x i32>, <4 x i32>* @llvm_mips_dpadd_s_d_ARG3
-  %3 = tail call <2 x i64> @llvm.mips.dpadd.s.d(<2 x i64> %0, <4 x i32> %1, <4 x i32> %2)
-  store <2 x i64> %3, <2 x i64>* @llvm_mips_dpadd_s_d_RES
+  %0 = load <4 x i32>, <4 x i32>* @llvm_mips_dpadd_s_d_ARG2
+  %1 = load <4 x i32>, <4 x i32>* @llvm_mips_dpadd_s_d_ARG3
+  %2 = tail call <2 x i64> @llvm.mips.dpadd.s.d(<2 x i64> <i64 4, i64 4>, <4 x i32> %0, <4 x i32> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_dpadd_s_d_RES
   ret void
 }
 
 declare <2 x i64> @llvm.mips.dpadd.s.d(<2 x i64>, <4 x i32>, <4 x i32>) nounwind
 
 ; CHECK: llvm_mips_dpadd_s_d_test:
+; CHECK: ldi.d [[R1:\$w[0-9]+]],
 ; CHECK: ld.w
 ; CHECK: ld.w
-; CHECK: ld.d
-; CHECK: dpadd_s.d
+; CHECK: dpadd_s.d [[R1]],
 ; CHECK: st.d
 ; CHECK: .size llvm_mips_dpadd_s_d_test
 ;
-@llvm_mips_dpadd_u_h_ARG1 = global <8 x i16> <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7>, align 16
 @llvm_mips_dpadd_u_h_ARG2 = global <16 x i8> <i8 8, i8 9, i8 10, i8 11, i8 12, i8 13, i8 14, i8 15, i8 16, i8 17, i8 18, i8 19, i8 20, i8 21, i8 22, i8 23>, align 16
 @llvm_mips_dpadd_u_h_ARG3 = global <16 x i8> <i8 24, i8 25, i8 26, i8 27, i8 28, i8 29, i8 30, i8 31, i8 32, i8 33, i8 34, i8 35, i8 36, i8 37, i8 38, i8 39>, align 16
 @llvm_mips_dpadd_u_h_RES  = global <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0, i16 0>, align 16
 
 define void @llvm_mips_dpadd_u_h_test() nounwind {
 entry:
-  %0 = load <8 x i16>, <8 x i16>* @llvm_mips_dpadd_u_h_ARG1
-  %1 = load <16 x i8>, <16 x i8>* @llvm_mips_dpadd_u_h_ARG2
-  %2 = load <16 x i8>, <16 x i8>* @llvm_mips_dpadd_u_h_ARG3
-  %3 = tail call <8 x i16> @llvm.mips.dpadd.u.h(<8 x i16> %0, <16 x i8> %1, <16 x i8> %2)
-  store <8 x i16> %3, <8 x i16>* @llvm_mips_dpadd_u_h_RES
+  %0 = load <16 x i8>, <16 x i8>* @llvm_mips_dpadd_u_h_ARG2
+  %1 = load <16 x i8>, <16 x i8>* @llvm_mips_dpadd_u_h_ARG3
+  %2 = tail call <8 x i16> @llvm.mips.dpadd.u.h(<8 x i16> <i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4, i16 4>, <16 x i8> %0, <16 x i8> %1)
+  store <8 x i16> %2, <8 x i16>* @llvm_mips_dpadd_u_h_RES
   ret void
 }
 
@@ -100,23 +92,21 @@ declare <8 x i16> @llvm.mips.dpadd.u.h(<8 x i16>, <16 x i8>, <16 x i8>) nounwind
 ; CHECK: llvm_mips_dpadd_u_h_test:
 ; CHECK: ld.b
 ; CHECK: ld.b
-; CHECK: ld.h
-; CHECK: dpadd_u.h
+; CHECK: ldi.h [[R1:\$w[0-9]+]],
+; CHECK: dpadd_u.h [[R1]],
 ; CHECK: st.h
 ; CHECK: .size llvm_mips_dpadd_u_h_test
 ;
-@llvm_mips_dpadd_u_w_ARG1 = global <4 x i32> <i32 0, i32 1, i32 2, i32 3>, align 16
 @llvm_mips_dpadd_u_w_ARG2 = global <8 x i16> <i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11>, align 16
 @llvm_mips_dpadd_u_w_ARG3 = global <8 x i16> <i16 12, i16 13, i16 14, i16 15, i16 16, i16 17, i16 18, i16 19>, align 16
 @llvm_mips_dpadd_u_w_RES  = global <4 x i32> <i32 0, i32 0, i32 0, i32 0>, align 16
 
 define void @llvm_mips_dpadd_u_w_test() nounwind {
 entry:
-  %0 = load <4 x i32>, <4 x i32>* @llvm_mips_dpadd_u_w_ARG1
-  %1 = load <8 x i16>, <8 x i16>* @llvm_mips_dpadd_u_w_ARG2
-  %2 = load <8 x i16>, <8 x i16>* @llvm_mips_dpadd_u_w_ARG3
-  %3 = tail call <4 x i32> @llvm.mips.dpadd.u.w(<4 x i32> %0, <8 x i16> %1, <8 x i16> %2)
-  store <4 x i32> %3, <4 x i32>* @llvm_mips_dpadd_u_w_RES
+  %0 = load <8 x i16>, <8 x i16>* @llvm_mips_dpadd_u_w_ARG2
+  %1 = load <8 x i16>, <8 x i16>* @llvm_mips_dpadd_u_w_ARG3
+  %2 = tail call <4 x i32> @llvm.mips.dpadd.u.w(<4 x i32> <i32 4, i32 4, i32 4, i32 4>, <8 x i16> %0, <8 x i16> %1)
+  store <4 x i32> %2, <4 x i32>* @llvm_mips_dpadd_u_w_RES
   ret void
 }
 
@@ -125,33 +115,31 @@ declare <4 x i32> @llvm.mips.dpadd.u.w(<4 x i32>, <8 x i16>, <8 x i16>) nounwind
 ; CHECK: llvm_mips_dpadd_u_w_test:
 ; CHECK: ld.h
 ; CHECK: ld.h
-; CHECK: ld.w
-; CHECK: dpadd_u.w
+; CHECK: ldi.w [[R1:\$w[0-9]+]],
+; CHECK: dpadd_u.w [[R1]],
 ; CHECK: st.w
 ; CHECK: .size llvm_mips_dpadd_u_w_test
 ;
-@llvm_mips_dpadd_u_d_ARG1 = global <2 x i64> <i64 0, i64 1>, align 16
 @llvm_mips_dpadd_u_d_ARG2 = global <4 x i32> <i32 2, i32 3, i32 4, i32 5>, align 16
 @llvm_mips_dpadd_u_d_ARG3 = global <4 x i32> <i32 6, i32 7, i32 8, i32 9>, align 16
 @llvm_mips_dpadd_u_d_RES  = global <2 x i64> <i64 0, i64 0>, align 16
 
 define void @llvm_mips_dpadd_u_d_test() nounwind {
 entry:
-  %0 = load <2 x i64>, <2 x i64>* @llvm_mips_dpadd_u_d_ARG1
-  %1 = load <4 x i32>, <4 x i32>* @llvm_mips_dpadd_u_d_ARG2
-  %2 = load <4 x i32>, <4 x i32>* @llvm_mips_dpadd_u_d_ARG3
-  %3 = tail call <2 x i64> @llvm.mips.dpadd.u.d(<2 x i64> %0, <4 x i32> %1, <4 x i32> %2)
-  store <2 x i64> %3, <2 x i64>* @llvm_mips_dpadd_u_d_RES
+  %0 = load <4 x i32>, <4 x i32>* @llvm_mips_dpadd_u_d_ARG2
+  %1 = load <4 x i32>, <4 x i32>* @llvm_mips_dpadd_u_d_ARG3
+  %2 = tail call <2 x i64> @llvm.mips.dpadd.u.d(<2 x i64> <i64 4, i64 4>, <4 x i32> %0, <4 x i32> %1)
+  store <2 x i64> %2, <2 x i64>* @llvm_mips_dpadd_u_d_RES
   ret void
 }
 
 declare <2 x i64> @llvm.mips.dpadd.u.d(<2 x i64>, <4 x i32>, <4 x i32>) nounwind
 
 ; CHECK: llvm_mips_dpadd_u_d_test:
+; CHECK: ldi.d [[R1:\$w[0-9]+]],
 ; CHECK: ld.w
 ; CHECK: ld.w
-; CHECK: ld.d
-; CHECK: dpadd_u.d
+; CHECK: dpadd_u.d [[R1]],
 ; CHECK: st.d
 ; CHECK: .size llvm_mips_dpadd_u_d_test
 ;
diff --git a/test/CodeGen/Mips/msa/basic_operations.ll b/test/CodeGen/Mips/msa/basic_operations.ll
index d7a05800a27372e5d62d84f9fa644fa8cc7a2c81..c14221937f4da3dd4dcc3726d1ca5c65b70a6173 100644
--- a/test/CodeGen/Mips/msa/basic_operations.ll
+++ b/test/CodeGen/Mips/msa/basic_operations.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -march=mips -mattr=+msa,+fp64 -relocation-model=pic \
 ; RUN:   -verify-machineinstrs < %s | \
-; RUN:   FileCheck -check-prefixes=ALL,O32,MIPS32,ALL-BE %s
+; RUN:   FileCheck -check-prefixes=ALL,O32,MIPS32,ALL-BE,O32-BE %s
 ; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic \
 ; RUN:   -verify-machineinstrs < %s | \
-; RUN:   FileCheck -check-prefixes=ALL,O32,MIPS32,ALL-LE %s
+; RUN:   FileCheck -check-prefixes=ALL,O32,MIPS32,ALL-LE,O32-LE %s
 ; RUN: llc -march=mips64 -target-abi n32 -mattr=+msa,+fp64 \
 ; RUN:   -relocation-model=pic -verify-machineinstrs < %s | \
 ; RUN:   FileCheck -check-prefixes=ALL,N32,MIPS64,ALL-BE %s
@@ -58,10 +58,19 @@ define void @const_v16i8() nounwind {
   ; ALL-DAG: fill.w [[R1:\$w[0-9]+]], [[R2]]
 
   store volatile <16 x i8> <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>, <16 x i8>*@v16i8
-  ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
-  ; ALL: ld.b  [[R1:\$w[0-9]+]], 0([[G_PTR]])
+  ; ALL-BE-DAG: lui [[R3:\$[0-9]+]], 1286
+  ; ALL-LE-DAG: lui [[R3:\$[0-9]+]], 2055
+  ; ALL-BE-DAG: ori [[R4:\$[0-9]+]], [[R3]], 1800
+  ; ALL-LE-DAG: ori [[R4:\$[0-9]+]], [[R3]], 1541
+  ; O32-BE: fill.w  [[R1:\$w[0-9]+]], [[R4]]
+
+  ; O32: insert.w [[R1]][1], [[R2]]
+  ; O32: splati.d $w{{.*}}, [[R1]][0]
+
+  ; MIPS64-BE: dinsu [[R4]], [[R2]], 32, 32
+  ; MIPS64-LE: dinsu [[R2]], [[R4]], 32, 32
+  ; MIPS64-BE: fill.d $w{{.*}}, [[R4]]
+  ; MIPS64-LE: fill.d $w{{.*}}, [[R2]]
 
   ret void
 }
@@ -92,10 +101,19 @@ define void @const_v8i16() nounwind {
   ; ALL-DAG: fill.w [[R1:\$w[0-9]+]], [[R2]]
 
   store volatile <8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 1, i16 2, i16 3, i16 4>, <8 x i16>*@v8i16
-  ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
-  ; ALL: ld.h  [[R1:\$w[0-9]+]], 0([[G_PTR]])
+  ; ALL-BE-DAG: lui [[R3:\$[0-9]+]], 3
+  ; ALL-LE-DAG: lui [[R3:\$[0-9]+]], 4
+  ; ALL-BE-DAG: ori [[R4:\$[0-9]+]], [[R3]], 4
+  ; ALL-LE-DAG: ori [[R4:\$[0-9]+]], [[R3]], 3
+
+  ; O32-BE: fill.w [[R1:\$w[0-9]+]], [[R4]]
+  ; O32: insert.w [[R1]][1], [[R2]]
+  ; O32: splati.d $w{{.*}}, [[R1]][0]
+
+  ; MIPS64-BE: dinsu [[R4]], [[R2]], 32, 32
+  ; MIPS64-LE: dinsu [[R2]], [[R4]], 32, 32
+  ; MIPS64-BE: fill.d $w{{.*}}, [[R4]]
+  ; MIPS64-LE: fill.d $w{{.*}}, [[R2]]
 
   ret void
 }
@@ -122,10 +140,23 @@ define void @const_v4i32() nounwind {
   ; ALL: ldi.h [[R1:\$w[0-9]+]], 1
 
   store volatile <4 x i32> <i32 1, i32 2, i32 1, i32 2>, <4 x i32>*@v4i32
-  ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
-  ; N32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
-  ; N64: daddiu [[G_PTR:\$[0-9]+]], {{.*}}, %got_ofst(.L
-  ; ALL: ld.w  [[R1:\$w[0-9]+]], 0([[G_PTR]])
+  ; -BE-DAG: ori [[R2:\$[0-9]+]], $zero, 1
+  ; O32-BE-DAG: ori [[R3:\$[0-9]+]], $zero, 1
+  ; O32-BE-DAG: ori [[R4:\$[0-9]+]], $zero, 2
+  ; O32-LE-DAG: ori [[R3:\$[0-9]+]], $zero, 2
+  ; O32-LE-DAG: ori [[R4:\$[0-9]+]], $zero, 1
+  ; O32: fill.w [[W0:\$w[0-9]+]], [[R4]]
+  ; O32: insert.w [[W0]][1], [[R3]]
+  ; O32: splati.d [[W1:\$w[0-9]+]], [[W0]]
+
+  ; MIPS64-DAG: ori [[R5:\$[0-9]+]], $zero, 2
+  ; MIPS64-DAG: ori [[R6:\$[0-9]+]], $zero, 1
+
+  ; MIPS64-BE: dinsu [[R5]], [[R6]], 32, 32
+  ; MIPS64-LE: dinsu [[R6]], [[R5]], 32, 32
+  ; MIPS64-BE: fill.d $w{{.*}}, [[R4]]
+  ; MIPS64-LE: fill.d $w{{.*}}, [[R2]]
+
 
   store volatile <4 x i32> <i32 3, i32 4, i32 5, i32 6>, <4 x i32>*@v4i32
   ; O32: addiu [[G_PTR:\$[0-9]+]], {{.*}}, %lo($
diff --git a/test/CodeGen/Mips/msa/bitwise.ll b/test/CodeGen/Mips/msa/bitwise.ll
index 2a260b2c573328cdeedadf907e7d6d04f87b1ae0..63fce5283ba0a87c4091ce1c2b1c384a5ce7e599 100644
--- a/test/CodeGen/Mips/msa/bitwise.ll
+++ b/test/CodeGen/Mips/msa/bitwise.ll
@@ -1099,7 +1099,7 @@ define void @binsl_v16i8_i(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind
                           i8 63, i8 63, i8 63, i8 63,
                           i8 63, i8 63, i8 63, i8 63>
   %5 = or <16 x i8> %3, %4
-  ; CHECK-DAG: binsli.b [[R2]], [[R1]], 2
+  ; CHECK-DAG: binsli.b [[R2]], [[R1]], 1
   store <16 x i8> %5, <16 x i8>* %c
   ; CHECK-DAG: st.b [[R2]], 0($4)
 
@@ -1119,7 +1119,7 @@ define void @binsl_v8i16_i(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind
   %4 = and <8 x i16> %2, <i16 16383, i16 16383, i16 16383, i16 16383,
                           i16 16383, i16 16383, i16 16383, i16 16383>
   %5 = or <8 x i16> %3, %4
-  ; CHECK-DAG: binsli.h [[R2]], [[R1]], 2
+  ; CHECK-DAG: binsli.h [[R2]], [[R1]], 1
   store <8 x i16> %5, <8 x i16>* %c
   ; CHECK-DAG: st.h [[R2]], 0($4)
 
@@ -1137,7 +1137,7 @@ define void @binsl_v4i32_i(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind
   %3 = and <4 x i32> %1, <i32 3221225472, i32 3221225472, i32 3221225472, i32 3221225472>
   %4 = and <4 x i32> %2, <i32 1073741823, i32 1073741823, i32 1073741823, i32 1073741823>
   %5 = or <4 x i32> %3, %4
-  ; CHECK-DAG: binsli.w [[R2]], [[R1]], 2
+  ; CHECK-DAG: binsli.w [[R2]], [[R1]], 1
   store <4 x i32> %5, <4 x i32>* %c
   ; CHECK-DAG: st.w [[R2]], 0($4)
 
@@ -1159,7 +1159,7 @@ define void @binsl_v2i64_i(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind
   ;       issue. If the mask doesn't fit within a 10-bit immediate, it gets
   ;       legalized into a constant pool. We should add a test to cover the
   ;       other cases once they correctly select binsli.d.
-  ; CHECK-DAG: binsli.d [[R2]], [[R1]], 61
+  ; CHECK-DAG: binsli.d [[R2]], [[R1]], 60
   store <2 x i64> %5, <2 x i64>* %c
   ; CHECK-DAG: st.d [[R2]], 0($4)
 
@@ -1181,7 +1181,7 @@ define void @binsr_v16i8_i(<16 x i8>* %c, <16 x i8>* %a, <16 x i8>* %b) nounwind
                           i8 252, i8 252, i8 252, i8 252,
                           i8 252, i8 252, i8 252, i8 252>
   %5 = or <16 x i8> %3, %4
-  ; CHECK-DAG: binsri.b [[R2]], [[R1]], 2
+  ; CHECK-DAG: binsri.b [[R2]], [[R1]], 1
   store <16 x i8> %5, <16 x i8>* %c
   ; CHECK-DAG: st.b [[R2]], 0($4)
 
@@ -1201,7 +1201,7 @@ define void @binsr_v8i16_i(<8 x i16>* %c, <8 x i16>* %a, <8 x i16>* %b) nounwind
   %4 = and <8 x i16> %2, <i16 65532, i16 65532, i16 65532, i16 65532,
                           i16 65532, i16 65532, i16 65532, i16 65532>
   %5 = or <8 x i16> %3, %4
-  ; CHECK-DAG: binsri.h [[R2]], [[R1]], 2
+  ; CHECK-DAG: binsri.h [[R2]], [[R1]], 1
   store <8 x i16> %5, <8 x i16>* %c
   ; CHECK-DAG: st.h [[R2]], 0($4)
 
@@ -1219,7 +1219,7 @@ define void @binsr_v4i32_i(<4 x i32>* %c, <4 x i32>* %a, <4 x i32>* %b) nounwind
   %3 = and <4 x i32> %1, <i32 3, i32 3, i32 3, i32 3>
   %4 = and <4 x i32> %2, <i32 4294967292, i32 4294967292, i32 4294967292, i32 4294967292>
   %5 = or <4 x i32> %3, %4
-  ; CHECK-DAG: binsri.w [[R2]], [[R1]], 2
+  ; CHECK-DAG: binsri.w [[R2]], [[R1]], 1
   store <4 x i32> %5, <4 x i32>* %c
   ; CHECK-DAG: st.w [[R2]], 0($4)
 
@@ -1237,7 +1237,7 @@ define void @binsr_v2i64_i(<2 x i64>* %c, <2 x i64>* %a, <2 x i64>* %b) nounwind
   %3 = and <2 x i64> %1, <i64 3, i64 3>
   %4 = and <2 x i64> %2, <i64 18446744073709551612, i64 18446744073709551612>
   %5 = or <2 x i64> %3, %4
-  ; CHECK-DAG: binsri.d [[R2]], [[R1]], 2
+  ; CHECK-DAG: binsri.d [[R2]], [[R1]], 1
   store <2 x i64> %5, <2 x i64>* %c
   ; CHECK-DAG: st.d [[R2]], 0($4)
 
diff --git a/test/CodeGen/Mips/msa/bmzi_bmnzi.ll b/test/CodeGen/Mips/msa/bmzi_bmnzi.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d1cb3c348c73ffa6e0010e96ed16aeb765a8979c
--- /dev/null
+++ b/test/CodeGen/Mips/msa/bmzi_bmnzi.ll
@@ -0,0 +1,55 @@
+; RUN: llc -march=mipsel -mattr=+msa,+fp64 -relocation-model=pic < %s | FileCheck %s
+
+@llvm_mips_bmnzi_b_ARG1 = global <16 x i8> <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>, align 16
+@llvm_mips_bmnzi_b_ARG2 = global <16 x i8> zeroinitializer, align 16
+@llvm_mips_bmnzi_b_RES = global <16 x i8> zeroinitializer, align 16
+
+define void @llvm_mips_bmnzi_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG1
+  %1 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %0, <16 x i8> %1, i32 240)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
+  %3 = tail call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %0, <16 x i8> %1, i32 15)
+  store <16 x i8> %3, <16 x i8>* @llvm_mips_bmnzi_b_RES
+  %4 = tail call <16 x i8> @llvm.mips.bmnzi.b(<16 x i8> %0, <16 x i8> %1, i32 170)
+  store <16 x i8> %4, <16 x i8>* @llvm_mips_bmnzi_b_RES
+  ret void
+}
+; CHECK-LABEL: llvm_mips_bmnzi_b_test:
+; CHECK: lw [[R0:\$[0-9]+]], %got(llvm_mips_bmnzi_b_RES)(
+; CHECK: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmnzi_b_ARG1)(
+; CHECK: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmnzi_b_ARG2)(
+; CHECK: ld.b [[R3:\$w[0-9]+]], 0([[R2]])
+; CHECK: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; CHECK: move.v [[R5:\$w[0-9]+]], [[R4]]
+; CHECK: binsli.b [[R5]], [[R3]], 3
+; CHECK: binsri.b [[R5]], [[R3]], 3
+; CHECK: bmnzi.b [[R4]], [[R3]], 170
+
+define void @llvm_mips_bmzi_b_test() nounwind {
+entry:
+  %0 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG1
+  %1 = load <16 x i8>, <16 x i8>* @llvm_mips_bmnzi_b_ARG2
+  %2 = tail call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %0, <16 x i8> %1, i32 240)
+  store <16 x i8> %2, <16 x i8>* @llvm_mips_bmnzi_b_RES
+  %3 = tail call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %0, <16 x i8> %1, i32 15)
+  store <16 x i8> %3, <16 x i8>* @llvm_mips_bmnzi_b_RES
+  %4 = tail call <16 x i8> @llvm.mips.bmzi.b(<16 x i8> %0, <16 x i8> %1, i32 170)
+  store <16 x i8> %4, <16 x i8>* @llvm_mips_bmnzi_b_RES
+  ret void
+}
+; CHECK-LABEL: llvm_mips_bmzi_b_test:
+; CHECK: lw [[R0:\$[0-9]+]], %got(llvm_mips_bmnzi_b_RES)(
+; CHECK: lw [[R1:\$[0-9]+]], %got(llvm_mips_bmnzi_b_ARG2)(
+; CHECK: lw [[R2:\$[0-9]+]], %got(llvm_mips_bmnzi_b_ARG1)(
+; CHECK: ld.b [[R3:\$w[0-9]+]], 0([[R2]])
+; CHECK: ld.b [[R4:\$w[0-9]+]], 0([[R1]])
+; CHECK: move.v [[R5:\$w[0-9]+]], [[R4]]
+; CHECK: binsli.b [[R5]], [[R3]], 3
+; CHECK: binsri.b [[R5]], [[R3]], 3
+; bmnzi.b is the same as bmzi.b with ws and wd_in swapped
+; CHECK: bmnzi.b [[R4]], [[R3]], 170
+
+declare <16 x i8> @llvm.mips.bmnzi.b(<16 x i8>, <16 x i8>, i32) nounwind
+declare <16 x i8> @llvm.mips.bmzi.b(<16 x i8>, <16 x i8>, i32) nounwind
diff --git a/test/CodeGen/Mips/msa/f16-llvm-ir.ll b/test/CodeGen/Mips/msa/f16-llvm-ir.ll
index 9957d5be26edaa70dc104986f1c236252004ea55..ac69dc913c182919e6a76760efdac6b591351434 100644
--- a/test/CodeGen/Mips/msa/f16-llvm-ir.ll
+++ b/test/CodeGen/Mips/msa/f16-llvm-ir.ll
@@ -234,15 +234,15 @@ entry:
 ; MIPS32:     insert.w $w[[W0]][1], $[[R1]]
 ; MIPS32:     insert.w $w[[W0]][3], $[[R1]]
 
-; MIPS64-N64: ld $[[R3:[0-9]+]], %got_disp(h)
-; MIPS64-N32: lw $[[R3:[0-9]+]], %got_disp(h)
-; MIPS64:     dmfc1 $[[R1:[0-9]+]], $f[[F2]]
-; MIPS64:     fill.d $w[[W0:[0-9]+]], $[[R1]]
+; MIPS64-N64-DAG: ld $[[R3:[0-9]+]], %got_disp(h)
+; MIPS64-N32-DAG: lw $[[R3:[0-9]+]], %got_disp(h)
+; MIPS64-DAG:     dmfc1 $[[R1:[0-9]+]], $f[[F2]]
+; MIPS64-DAG:     fill.d $w[[W0:[0-9]+]], $[[R1]]
 
-; ALL:        fexdo.w $w[[W1:[0-9]+]], $w[[W0]], $w[[W0]]
-; ALL:        fexdo.h $w[[W2:[0-9]+]], $w[[W1]], $w[[W1]]
+; ALL-DAG:        fexdo.w $w[[W1:[0-9]+]], $w[[W0]], $w[[W0]]
+; ALL-DAG:        fexdo.h $w[[W2:[0-9]+]], $w[[W1]], $w[[W1]]
 
-; MIPS32:     lw $[[R3:[0-9]+]], %got(h)
+; MIPS32-DAG:     lw $[[R3:[0-9]+]], %got(h)
 
 ; ALL:        copy_u.h $[[R2:[0-9]+]], $w[[W2]]
 ; ALL:        sh $[[R2]], 0($[[R3]])
diff --git a/test/CodeGen/Mips/msa/i5-b.ll b/test/CodeGen/Mips/msa/i5-b.ll
index c588c8b2407edd1a8d72974cb3ebcb01815b9567..5afd3cd48dd5d8a4dae8bbf238711c5b6f965c80 100644
--- a/test/CodeGen/Mips/msa/i5-b.ll
+++ b/test/CodeGen/Mips/msa/i5-b.ll
@@ -89,7 +89,7 @@ define void @llvm_mips_binsli_b_test() nounwind {
 entry:
   %0 = load <16 x i8>, <16 x i8>* @llvm_mips_binsli_b_ARG1
   %1 = load <16 x i8>, <16 x i8>* @llvm_mips_binsli_b_ARG2
-  %2 = tail call <16 x i8> @llvm.mips.binsli.b(<16 x i8> %0, <16 x i8> %1, i32 7)
+  %2 = tail call <16 x i8> @llvm.mips.binsli.b(<16 x i8> %0, <16 x i8> %1, i32 6)
   store <16 x i8> %2, <16 x i8>* @llvm_mips_binsli_b_RES
   ret void
 }
@@ -101,7 +101,7 @@ declare <16 x i8> @llvm.mips.binsli.b(<16 x i8>, <16 x i8>, i32) nounwind
 ; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsli_b_ARG2)(
 ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[R1]])
 ; CHECK-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R2]])
-; CHECK-DAG: binsli.b [[R3]], [[R4]], 7
+; CHECK-DAG: binsli.b [[R3]], [[R4]], 6
 ; CHECK-DAG: lw [[R5:\$[0-9]+]], %got(llvm_mips_binsli_b_RES)(
 ; CHECK-DAG: st.b [[R3]], 0([[R5]])
 ; CHECK: .size llvm_mips_binsli_b_test
@@ -193,7 +193,7 @@ define void @llvm_mips_binsri_b_test() nounwind {
 entry:
   %0 = load <16 x i8>, <16 x i8>* @llvm_mips_binsri_b_ARG1
   %1 = load <16 x i8>, <16 x i8>* @llvm_mips_binsri_b_ARG2
-  %2 = tail call <16 x i8> @llvm.mips.binsri.b(<16 x i8> %0, <16 x i8> %1, i32 7)
+  %2 = tail call <16 x i8> @llvm.mips.binsri.b(<16 x i8> %0, <16 x i8> %1, i32 6)
   store <16 x i8> %2, <16 x i8>* @llvm_mips_binsri_b_RES
   ret void
 }
@@ -205,7 +205,7 @@ declare <16 x i8> @llvm.mips.binsri.b(<16 x i8>, <16 x i8>, i32) nounwind
 ; CHECK-DAG: lw [[R2:\$[0-9]+]], %got(llvm_mips_binsri_b_ARG2)(
 ; CHECK-DAG: ld.b [[R3:\$w[0-9]+]], 0([[R1]])
 ; CHECK-DAG: ld.b [[R4:\$w[0-9]+]], 0([[R2]])
-; CHECK-DAG: binsri.b [[R3]], [[R4]], 7
+; CHECK-DAG: binsri.b [[R3]], [[R4]], 6
 ; CHECK-DAG: lw [[R5:\$[0-9]+]], %got(llvm_mips_binsri_b_RES)(
 ; CHECK-DAG: st.b [[R3]], 0([[R5]])
 ; CHECK: .size llvm_mips_binsri_b_test
diff --git a/test/CodeGen/Mips/msa/i5_ld_st.ll b/test/CodeGen/Mips/msa/i5_ld_st.ll
index c644d242a003f70d0728cd4af101bfd9c81cf2a6..812c400d46e483e9e9d76e02ad692b8a8f947b10 100644
--- a/test/CodeGen/Mips/msa/i5_ld_st.ll
+++ b/test/CodeGen/Mips/msa/i5_ld_st.ll
@@ -336,8 +336,8 @@ entry:
 
 ; CHECK: llvm_mips_st_b_valid_range_tests:
 ; CHECK: ld.b
-; CHECK: st.b [[R1:\$w[0-9]+]], -512(
-; CHECK: st.b [[R1:\$w[0-9]+]], 511(
+; CHECK-DAG: st.b [[R1:\$w[0-9]+]], -512(
+; CHECK-DAG: st.b [[R1:\$w[0-9]+]], 511(
 ; CHECK: .size llvm_mips_st_b_valid_range_tests
 ;
 
@@ -351,10 +351,10 @@ entry:
 }
 
 ; CHECK: llvm_mips_st_b_invalid_range_tests:
-; CHECK: addiu $2, $1, -513
+; CHECK: addiu $2, $1, 512
 ; CHECK: ld.b
 ; CHECK: st.b [[R1:\$w[0-9]+]], 0(
-; CHECK: addiu $1, $1, 512
+; CHECK: addiu $1, $1, -513
 ; CHECK: st.b [[R1:\$w[0-9]+]], 0(
 ; CHECK: .size llvm_mips_st_b_invalid_range_tests
 ;
@@ -404,8 +404,8 @@ entry:
 
 ; CHECK: llvm_mips_st_h_valid_range_tests:
 ; CHECK: ld.h
-; CHECK: st.h [[R1:\$w[0-9]+]], -1024(
-; CHECK: st.h [[R1:\$w[0-9]+]], 1022(
+; CHECK-DAG: st.h [[R1:\$w[0-9]+]], -1024(
+; CHECK-DAG: st.h [[R1:\$w[0-9]+]], 1022(
 ; CHECK: .size llvm_mips_st_h_valid_range_tests
 ;
 
@@ -419,10 +419,10 @@ entry:
 }
 
 ; CHECK: llvm_mips_st_h_invalid_range_tests:
-; CHECK: addiu $2, $1, -1026
+; CHECK: addiu $2, $1, 1024
 ; CHECK: ld.h
 ; CHECK: st.h [[R1:\$w[0-9]+]], 0(
-; CHECK: addiu $1, $1, 1024
+; CHECK: addiu $1, $1, -1026
 ; CHECK: st.h [[R1:\$w[0-9]+]], 0(
 ; CHECK: .size llvm_mips_st_h_invalid_range_tests
 ;
@@ -472,8 +472,8 @@ entry:
 
 ; CHECK: llvm_mips_st_w_valid_range_tests:
 ; CHECK: ld.w
-; CHECK: st.w [[R1:\$w[0-9]+]], -2048(
-; CHECK: st.w [[R1:\$w[0-9]+]], 2044(
+; CHECK-DAG: st.w [[R1:\$w[0-9]+]], -2048(
+; CHECK-DAG: st.w [[R1:\$w[0-9]+]], 2044(
 ; CHECK: .size llvm_mips_st_w_valid_range_tests
 ;
 
@@ -487,10 +487,10 @@ entry:
 }
 
 ; CHECK: llvm_mips_st_w_invalid_range_tests:
-; CHECK: addiu $2, $1, -2052
+; CHECK: addiu $2, $1, 2048
 ; CHECK: ld.w
 ; CHECK: st.w [[R1:\$w[0-9]+]], 0(
-; CHECK: addiu $1, $1, 2048
+; CHECK: addiu $1, $1, -2052
 ; CHECK: st.w [[R1:\$w[0-9]+]], 0(
 ; CHECK: .size llvm_mips_st_w_invalid_range_tests
 ;
@@ -540,8 +540,8 @@ entry:
 
 ; CHECK: llvm_mips_st_d_valid_range_tests:
 ; CHECK: ld.d
-; CHECK: st.d [[R1:\$w[0-9]+]], -4096(
-; CHECK: st.d [[R1:\$w[0-9]+]], 4088(
+; CHECK-DAG: st.d [[R1:\$w[0-9]+]], -4096(
+; CHECK-DAG: st.d [[R1:\$w[0-9]+]], 4088(
 ; CHECK: .size llvm_mips_st_d_valid_range_tests
 ;
 
@@ -555,10 +555,10 @@ entry:
 }
 
 ; CHECK: llvm_mips_st_d_invalid_range_tests:
-; CHECK: addiu $2, $1, -4104
+; CHECK: addiu $2, $1, 4096
 ; CHECK: ld.d
 ; CHECK: st.d [[R1:\$w[0-9]+]], 0(
-; CHECK: addiu $1, $1, 4096
+; CHECK: addiu $1, $1, -4104
 ; CHECK: st.d [[R1:\$w[0-9]+]], 0(
 ; CHECK: .size llvm_mips_st_d_invalid_range_tests
 ;
diff --git a/test/CodeGen/Mips/msa/immediates.ll b/test/CodeGen/Mips/msa/immediates.ll
index b561ace30a8a144b3c350bffa49b9d8831e4fbdd..0e9fb4c7adfc18e04e466bdc222b8d41559635f9 100644
--- a/test/CodeGen/Mips/msa/immediates.ll
+++ b/test/CodeGen/Mips/msa/immediates.ll
@@ -616,7 +616,7 @@ entry:
 ; CHECK: binsri.h
   %a = load <8 x i16>, <8 x i16> * %ptr, align 16
   %b = load <8 x i16>, <8 x i16> * %ptr2, align 16
-  %r = call <8 x i16> @llvm.mips.binsri.h(<8 x i16> %a, <8 x i16> %b, i32 15)
+  %r = call <8 x i16> @llvm.mips.binsri.h(<8 x i16> %a, <8 x i16> %b, i32 14)
   store <8 x i16> %r, <8 x i16> * %ptr, align 16
   ret void
 }
@@ -920,7 +920,7 @@ entry:
 define void @bclri_d(<2 x i64> * %ptr) {
 entry:
 ; CHECK-LABEL: bclri_d:
-; CHECK: and.v
+; CHECK: bclri.d
   %a = load <2 x i64>, <2 x i64> * %ptr, align 16
   %r = call <2 x i64> @llvm.mips.bclri.d(<2 x i64> %a, i32 16)
   store <2 x i64> %r, <2 x i64> * %ptr, align 16
@@ -930,7 +930,7 @@ entry:
 define void @binsli_d(<2 x i64> * %ptr, <2 x i64> * %ptr2) {
 entry:
 ; CHECK-LABEL: binsli_d:
-; CHECK: bsel.v
+; CHECK: binsli.d
   %a = load <2 x i64>, <2 x i64> * %ptr, align 16
   %b = load <2 x i64>, <2 x i64> * %ptr2, align 16
   %r = call <2 x i64> @llvm.mips.binsli.d(<2 x i64> %a, <2 x i64> %b, i32 4)
@@ -952,7 +952,7 @@ entry:
 define void @bnegi_d(<2 x i64> * %ptr) {
 entry:
 ; CHECK-LABEL: bnegi_d:
-; CHECK: xor.v
+; CHECK: bnegi.d
   %a = load <2 x i64>, <2 x i64> * %ptr, align 16
   %r = call <2 x i64> @llvm.mips.bnegi.d(<2 x i64> %a, i32 9)
   store <2 x i64> %r, <2 x i64> * %ptr, align 16
@@ -962,7 +962,7 @@ entry:
 define void @bseti_d(<2 x i64> * %ptr) {
 entry:
 ; CHECK-LABEL: bseti_d:
-; CHECK: or.v
+; CHECK: bseti.d
   %a = load <2 x i64>, <2 x i64> * %ptr, align 16
   %r = call <2 x i64> @llvm.mips.bseti.d(<2 x i64> %a, i32 25)
   store <2 x i64> %r, <2 x i64> * %ptr, align 16
diff --git a/test/CodeGen/Mips/o32_cc_byval.ll b/test/CodeGen/Mips/o32_cc_byval.ll
index 33431dba43c484e8b829f97bd13c3d7cc25e9746..eadf4abfc7590c08951447c15588e0ba601e0da6 100644
--- a/test/CodeGen/Mips/o32_cc_byval.ll
+++ b/test/CodeGen/Mips/o32_cc_byval.ll
@@ -45,20 +45,18 @@ declare void @callee3(float, %struct.S3* byval, %struct.S1* byval)
 define void @f2(float %f, %struct.S1* nocapture byval %s1) nounwind {
 entry:
 ; CHECK: addiu $sp, $sp, -48
-; CHECK: sw  $7, 60($sp)
-; CHECK: sw  $6, 56($sp)
-; CHECK: lw  $4, 80($sp)
-; CHECK: ldc1 $f[[F0:[0-9]+]], 72($sp)
-; CHECK: lw  $[[R3:[0-9]+]], 64($sp)
-; CHECK: lw  $[[R4:[0-9]+]], 68($sp)
-; CHECK: lw  $[[R2:[0-9]+]], 60($sp)
-; CHECK: lh  $[[R1:[0-9]+]], 58($sp)
-; CHECK: lb  $[[R0:[0-9]+]], 56($sp)
-; CHECK: sw  $[[R0]], 32($sp)
-; CHECK: sw  $[[R1]], 28($sp)
-; CHECK: sw  $[[R2]], 24($sp)
-; CHECK: sw  $[[R4]], 20($sp)
-; CHECK: sw  $[[R3]], 16($sp)
+; CHECK-DAG: sw  $7, 60($sp)
+; CHECK-DAG: sw  $6, 56($sp)
+; CHECK-DAG: ldc1 $f[[F0:[0-9]+]], 72($sp)
+; CHECK-DAG: lw  $[[R3:[0-9]+]], 64($sp)
+; CHECK-DAG: lw  $[[R4:[0-9]+]], 68($sp)
+; CHECK-DAG: lh  $[[R1:[0-9]+]], 58($sp)
+; CHECK-DAG: lb  $[[R0:[0-9]+]], 56($sp)
+; CHECK-DAG: sw  $[[R0]], 32($sp)
+; CHECK-DAG: sw  $[[R1]], 28($sp)
+; CHECK-DAG: sw  $[[R4]], 20($sp)
+; CHECK-DAG: sw  $[[R3]], 16($sp)
+; CHECK-DAG: sw  $7, 24($sp)
 ; CHECK: mfc1 $6, $f[[F0]]
 
   %i2 = getelementptr inbounds %struct.S1, %struct.S1* %s1, i32 0, i32 5
@@ -82,13 +80,11 @@ declare void @callee4(i32, double, i64, i32, i16 signext, i8 signext, float)
 define void @f3(%struct.S2* nocapture byval %s2) nounwind {
 entry:
 ; CHECK: addiu $sp, $sp, -48
-; CHECK: sw  $7, 60($sp)
-; CHECK: sw  $6, 56($sp)
-; CHECK: sw  $5, 52($sp)
-; CHECK: sw  $4, 48($sp)
-; CHECK: lw  $4, 48($sp)
-; CHECK: lw  $[[R0:[0-9]+]], 60($sp)
-; CHECK: sw  $[[R0]], 24($sp)
+; CHECK-DAG: sw  $7, 60($sp)
+; CHECK-DAG: sw  $6, 56($sp)
+; CHECK-DAG: sw  $5, 52($sp)
+; CHECK-DAG: sw  $4, 48($sp)
+; CHECK-DAG: sw  $7, 24($sp)
 
   %arrayidx = getelementptr inbounds %struct.S2, %struct.S2* %s2, i32 0, i32 0, i32 0
   %tmp = load i32, i32* %arrayidx, align 4
@@ -101,14 +97,14 @@ entry:
 define void @f4(float %f, %struct.S3* nocapture byval %s3, %struct.S1* nocapture byval %s1) nounwind {
 entry:
 ; CHECK: addiu $sp, $sp, -48
-; CHECK: sw  $7, 60($sp)
-; CHECK: sw  $6, 56($sp)
-; CHECK: sw  $5, 52($sp)
-; CHECK: lw  $4, 60($sp)
-; CHECK: lw  $[[R1:[0-9]+]], 80($sp)
-; CHECK: lb  $[[R0:[0-9]+]], 52($sp)
-; CHECK: sw  $[[R0]], 32($sp)
-; CHECK: sw  $[[R1]], 24($sp)
+; CHECK-DAG: sw  $7, 60($sp)
+; CHECK-DAG: sw  $6, 56($sp)
+; CHECK-DAG: sw  $5, 52($sp)
+; CHECK-DAG: lw  $[[R1:[0-9]+]], 80($sp)
+; CHECK-DAG: lb  $[[R0:[0-9]+]], 52($sp)
+; CHECK-DAG: sw  $[[R0]], 32($sp)
+; CHECK-DAG: sw  $[[R1]], 24($sp)
+; CHECK: move $4, $7
 
   %i = getelementptr inbounds %struct.S1, %struct.S1* %s1, i32 0, i32 2
   %tmp = load i32, i32* %i, align 4
diff --git a/test/CodeGen/Mips/o32_cc_vararg.ll b/test/CodeGen/Mips/o32_cc_vararg.ll
index b4597a3214e2b0579c7a161e1e51400901924247..73aad48b73e68cc0162a48fe2ad7c006ba23ceb7 100644
--- a/test/CodeGen/Mips/o32_cc_vararg.ll
+++ b/test/CodeGen/Mips/o32_cc_vararg.ll
@@ -29,9 +29,9 @@ entry:
 
 ; CHECK-LABEL: va1:
 ; CHECK: addiu   $sp, $sp, -16
+; CHECK: sw      $5, 20($sp)
 ; CHECK: sw      $7, 28($sp)
 ; CHECK: sw      $6, 24($sp)
-; CHECK: sw      $5, 20($sp)
 ; CHECK: lw      $2, 20($sp)
 }
 
@@ -83,8 +83,8 @@ entry:
 
 ; CHECK-LABEL: va3:
 ; CHECK: addiu   $sp, $sp, -16
-; CHECK: sw      $7, 28($sp)
 ; CHECK: sw      $6, 24($sp)
+; CHECK: sw      $7, 28($sp)
 ; CHECK: lw      $2, 24($sp)
 }
 
@@ -236,8 +236,8 @@ entry:
   ret i32 %tmp
 
 ; CHECK-LABEL: va9:
-; CHECK: addiu   $sp, $sp, -32
-; CHECK: lw      $2, 52($sp)
+; CHECK: addiu   $sp, $sp, -24
+; CHECK: lw      $2, 44($sp)
 }
 
 ; double
diff --git a/test/CodeGen/Mips/return_address.ll b/test/CodeGen/Mips/return_address.ll
index 34b72baa6d25c79b06af10229cfad3fee462f5a1..54a106f4b349e7fdac82d97c5316328926710b4d 100644
--- a/test/CodeGen/Mips/return_address.ll
+++ b/test/CodeGen/Mips/return_address.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel -verify-machineinstrs < %s | FileCheck %s
 
 define i8* @f1() nounwind {
 entry:
diff --git a/test/CodeGen/Mips/stackcoloring.ll b/test/CodeGen/Mips/stackcoloring.ll
index 817caee2f2753657147c88b8a93be2c4eb76804d..680b3128cc1b4627277f0546d37e0be2ed642fbb 100644
--- a/test/CodeGen/Mips/stackcoloring.ll
+++ b/test/CodeGen/Mips/stackcoloring.ll
@@ -11,7 +11,7 @@ define i32 @foo1() {
 entry:
   %b = alloca [16 x i32], align 4
   %0 = bitcast [16 x i32]* %b to i8*
-  call void @llvm.lifetime.start(i64 64, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 64, i8* %0)
   %arraydecay = getelementptr inbounds [16 x i32], [16 x i32]* %b, i32 0, i32 0
   br label %for.body
 
@@ -28,12 +28,12 @@ for.body:                                         ; preds = %for.body, %entry
   br i1 %exitcond, label %for.end, label %for.body
 
 for.end:                                          ; preds = %for.body
-  call void @llvm.lifetime.end(i64 64, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 64, i8* %0)
   ret i32 %add
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
 
 declare i32 @foo2(i32, i32*)
 
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
diff --git a/test/CodeGen/Mips/stchar.ll b/test/CodeGen/Mips/stchar.ll
index 34493e9ae338f228d0d8957728c6707b60bddd7c..a6021be8e808e7442a73af70c85af1d96c11c74f 100644
--- a/test/CodeGen/Mips/stchar.ll
+++ b/test/CodeGen/Mips/stchar.ll
@@ -34,7 +34,7 @@ entry:
 ; 16_h: lh      ${{[0-9]+}}, [[offset2]](${{[0-9]+}})
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
 
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
 
diff --git a/test/CodeGen/Mips/tnaked.ll b/test/CodeGen/Mips/tnaked.ll
index 08f1ab5be86eefd2cbdd8d1056a1a5fc9d4102fc..7dff19c5d0009c88d5e310d1954c64fa56c3c928 100644
--- a/test/CodeGen/Mips/tnaked.ll
+++ b/test/CodeGen/Mips/tnaked.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s
+; RUN: llc -march=mipsel < %s -verify-machineinstrs | FileCheck %s
 
 
 define void @tnaked() #0 {
diff --git a/test/CodeGen/Mips/xray-mips-attribute-instrumentation.ll b/test/CodeGen/Mips/xray-mips-attribute-instrumentation.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a7c859a1815fe1bd20fe04048aaf69ba922b1482
--- /dev/null
+++ b/test/CodeGen/Mips/xray-mips-attribute-instrumentation.ll
@@ -0,0 +1,147 @@
+; RUN: llc -filetype=asm -o - -mtriple=mips-unknown-linux-gnu < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-MIPS32 %s
+; RUN: llc -filetype=asm -o - -mtriple=mipsel-unknown-linux-gnu < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-MIPS32 %s
+; RUN: llc -filetype=asm -o - -mtriple=mips64-unknown-linux-gnu < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-MIPS64 %s
+; RUN: llc -filetype=asm -o - -mtriple=mips64el-unknown-linux-gnu < %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-MIPS64 %s
+
+define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always" {
+; CHECK:       .p2align 2
+; CHECK-MIPS64-LABEL: .Lxray_sled_0:
+; CHECK-MIPS32-LABEL: $xray_sled_0:
+; CHECK-MIPS64:  b .Ltmp0
+; CHECK-MIPS32:  b $tmp0
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64-LABEL: .Ltmp0:
+; CHECK-MIPS32-LABEL: $tmp0:
+; CHECK-MIPS32:  addiu $25, $25, 52
+  ret i32 0
+; CHECK:       .p2align 2
+; CHECK-MIPS64-LABEL: .Lxray_sled_1:
+; CHECK-MIPS32-LABEL: $xray_sled_1:
+; CHECK-MIPS64:  b .Ltmp1
+; CHECK-MIPS32:  b $tmp1
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64-LABEL: .Ltmp1:
+; CHECK-MIPS32-LABEL: $tmp1:
+; CHECK-MIPS32:  addiu $25, $25, 52
+}
+; CHECK:  .section xray_instr_map,{{.*}}
+; CHECK-MIPS64: .8byte  .Lxray_sled_0
+; CHECK-MIPS64: .8byte  .Lxray_sled_1
+; CHECK-MIPS32: .4byte  ($xray_sled_0)
+; CHECK-MIPS32: .4byte  ($xray_sled_1)
+
+; We test multiple returns in a single function to make sure we're getting all
+; of them with XRay instrumentation.
+define i32 @bar(i32 %i) nounwind noinline uwtable "function-instrument"="xray-always" {
+; CHECK:       .p2align 2
+; CHECK-MIPS64-LABEL: .Lxray_sled_2:
+; CHECK-MIPS32-LABEL: $xray_sled_2:
+; CHECK-MIPS64:  b .Ltmp2
+; CHECK-MIPS32:  b $tmp2
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64-LABEL: .Ltmp2:
+; CHECK-MIPS32-LABEL: $tmp2:
+; CHECK-MIPS32:  addiu $25, $25, 52
+Test:
+  %cond = icmp eq i32 %i, 0
+  br i1 %cond, label %IsEqual, label %NotEqual
+IsEqual:
+  ret i32 0
+; CHECK:       .p2align 2
+; CHECK-MIPS64-LABEL: .Lxray_sled_3:
+; CHECK-MIPS32-LABEL: $xray_sled_3:
+; CHECK-MIPS64:  b .Ltmp3
+; CHECK-MIPS32:  b $tmp3
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64-LABEL: .Ltmp3:
+; CHECK-MIPS32-LABEL: $tmp3:
+; CHECK-MIPS32:  addiu $25, $25, 52 
+NotEqual:
+  ret i32 1
+; CHECK:       .p2align 2
+; CHECK-MIPS64-LABEL: .Lxray_sled_4:
+; CHECK-MIPS32-LABEL: $xray_sled_4:
+; CHECK-MIPS64:  b .Ltmp4
+; CHECK-MIPS32:  b $tmp4
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-NEXT:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64:  nop
+; CHECK-MIPS64-LABEL: .Ltmp4:
+; CHECK-MIPS32-LABEL: $tmp4:
+; CHECK-MIPS32:  addiu $25, $25, 52
+}
+; CHECK: .section xray_instr_map,{{.*}}
+; CHECK-MIPS64: .8byte  .Lxray_sled_2
+; CHECK-MIPS64: .8byte  .Lxray_sled_3
+; CHECK-MIPS64: .8byte  .Lxray_sled_4
+; CHECK-MIPS32: .4byte	($xray_sled_2)
+; CHECK-MIPS32: .4byte	($xray_sled_3)
+; CHECK-MIPS32: .4byte	($xray_sled_4)
diff --git a/test/CodeGen/Mips/xray-section-group.ll b/test/CodeGen/Mips/xray-section-group.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d87f178ec4bec845be1b5fe0ea3ddabeb04860bc
--- /dev/null
+++ b/test/CodeGen/Mips/xray-section-group.ll
@@ -0,0 +1,31 @@
+; RUN: llc -filetype=asm -o - -mtriple=mips-unknown-linux-gnu -function-sections < %s | FileCheck %s
+; RUN: llc -filetype=asm -o - -mtriple=mipsel-unknown-linux-gnu -function-sections < %s | FileCheck %s
+; RUN: llc -filetype=obj -o %t -mtriple=mips-unknown-linux-gnu -function-sections < %s
+; RUN: llvm-readobj -sections %t | FileCheck %s --check-prefix=CHECK-OBJ
+; RUN: llc -filetype=obj -o %t -mtriple=mipsel-unknown-linux-gnu -function-sections < %s
+; RUN: llvm-readobj -sections %t | FileCheck %s --check-prefix=CHECK-OBJ
+; RUN: llc -filetype=asm -o - -mtriple=mips64-unknown-linux-gnu -function-sections < %s | FileCheck %s
+; RUN: llc -filetype=asm -o - -mtriple=mips64el-unknown-linux-gnu -function-sections < %s | FileCheck %s
+; RUN: llc -filetype=obj -o %t -mtriple=mips64-unknown-linux-gnu -function-sections < %s
+; RUN: llvm-readobj -sections %t | FileCheck %s --check-prefix=CHECK-OBJ
+; RUN: llc -filetype=obj -o %t -mtriple=mips64el-unknown-linux-gnu -function-sections < %s
+; RUN: llvm-readobj -sections %t | FileCheck %s --check-prefix=CHECK-OBJ
+
+define i32 @foo() nounwind noinline uwtable "function-instrument"="xray-always" {
+; CHECK: .section .text.foo,"ax",@progbits
+  ret i32 0
+; CHECK: .section xray_instr_map,"a",@progbits
+}
+
+; CHECK-OBJ: Section {
+; CHECK-OBJ:   Name: xray_instr_map
+
+$bar = comdat any
+define i32 @bar() nounwind noinline uwtable "function-instrument"="xray-always" comdat($bar) {
+; CHECK: .section .text.bar,"axG",@progbits,bar,comdat
+  ret i32 1
+; CHECK: .section xray_instr_map,"aG",@progbits,bar,comdat
+}
+
+; CHECK-OBJ: Section {
+; CHECK-OBJ:   Name: xray_instr_map
diff --git a/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
index 1a4b0bad36e1b386d5f12d7b46f785a493b592fc..e84030f385c41d5b9d94719a2f67096758185820 100644
--- a/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
+++ b/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -15,3 +15,37 @@ define i32 @f(i32* %p) {
   %sum = add i32 %v0, %v1
   ret i32 %sum
 }
+
+define half @fh(half* %p) {
+  %p.1 = getelementptr half, half* %p, i32 1
+  %p.2 = getelementptr half, half* %p, i32 2
+  %p.3 = getelementptr half, half* %p, i32 3
+  %p.4 = getelementptr half, half* %p, i32 4
+  %v0 = load half, half* %p, align 64
+  %v1 = load half, half* %p.1, align 4
+  %v2 = load half, half* %p.2, align 4
+  %v3 = load half, half* %p.3, align 4
+  %v4 = load half, half* %p.4, align 4
+  %sum1 = fadd half %v0, %v1
+  %sum2 = fadd half %v2, %v3
+  %sum3 = fadd half %sum1, %sum2
+  %sum = fadd half %sum3, %v4
+  ret half %sum
+}
+
+define float @ff(float* %p) {
+  %p.1 = getelementptr float, float* %p, i32 1
+  %p.2 = getelementptr float, float* %p, i32 2
+  %p.3 = getelementptr float, float* %p, i32 3
+  %p.4 = getelementptr float, float* %p, i32 4
+  %v0 = load float, float* %p, align 64
+  %v1 = load float, float* %p.1, align 4
+  %v2 = load float, float* %p.2, align 4
+  %v3 = load float, float* %p.3, align 4
+  %v4 = load float, float* %p.4, align 4
+  %sum1 = fadd float %v0, %v1
+  %sum2 = fadd float %v2, %v3
+  %sum3 = fadd float %sum1, %sum2
+  %sum = fadd float %sum3, %v4
+  ret float %sum
+}
diff --git a/test/CodeGen/NVPTX/add-128bit.ll b/test/CodeGen/NVPTX/add-128bit.ll
index 29e3cdffae7bf13dccacb245e9bbe3a7afb25c6d..a077c3fcf8915feda510f42017276628eeaec107 100644
--- a/test/CodeGen/NVPTX/add-128bit.ll
+++ b/test/CodeGen/NVPTX/add-128bit.ll
@@ -8,7 +8,7 @@ define void @foo(i64 %a, i64 %add, i128* %retptr) {
 ; CHECK:        add.s64
 ; CHECK:        setp.lt.u64
 ; CHECK:        setp.lt.u64
-; CHECK:        selp.b64
+; CHECK:        selp.u64
 ; CHECK:        selp.b64
 ; CHECK:        add.s64
   %t1 = sext i64 %a to i128
diff --git a/test/CodeGen/NVPTX/aggregate-return.ll b/test/CodeGen/NVPTX/aggregate-return.ll
index 527c5c9aa85d9434a42200ad08bb69ece1523dd4..785b4d6d90dc5aa28ccd566bfe854098e5c071e2 100644
--- a/test/CodeGen/NVPTX/aggregate-return.ll
+++ b/test/CodeGen/NVPTX/aggregate-return.ll
@@ -1,21 +1,40 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s
 
 declare <2 x float> @barv(<2 x float> %input)
+declare <3 x float> @barv3(<3 x float> %input)
 declare [2 x float] @bara([2 x float] %input)
 declare {float, float} @bars({float, float} %input)
 
-define void @foov(<2 x float> %input, <2 x float>* %output) {
-; CHECK-LABEL: @foov
+define void @test_v2f32(<2 x float> %input, <2 x float>* %output) {
+; CHECK-LABEL: @test_v2f32
   %call = tail call <2 x float> @barv(<2 x float> %input)
 ; CHECK: .param .align 8 .b8 retval0[8];
-; CHECK: ld.param.v2.f32 {[[ELEMV1:%f[0-9]+]], [[ELEMV2:%f[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0+0];
   store <2 x float> %call, <2 x float>* %output, align 8
-; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[ELEMV1]], [[ELEMV2]]}
+; CHECK: st.v2.f32 [{{%rd[0-9]+}}], {[[E0]], [[E1]]}
   ret void
 }
 
-define void @fooa([2 x float] %input, [2 x float]* %output) {
-; CHECK-LABEL: @fooa
+define void @test_v3f32(<3 x float> %input, <3 x float>* %output) {
+; CHECK-LABEL: @test_v3f32
+;
+  %call = tail call <3 x float> @barv3(<3 x float> %input)
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK-DAG: ld.param.v2.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.f32 [[E2:%f[0-9]+]], [retval0+8];
+; Make sure we don't load more values than we need to.
+; CHECK-NOT: ld.param.f32 [[E3:%f[0-9]+]], [retval0+12];
+  store <3 x float> %call, <3 x float>* %output, align 8
+; CHECK-DAG: st.f32 [{{%rd[0-9]+}}+8],
+; -- This is suboptimal. We should do st.v2.f32 instead
+;    of combining 2xf32 into i64.
+; CHECK-DAG: st.u64 [{{%rd[0-9]+}}],
+; CHECK: ret;
+  ret void
+}
+
+define void @test_a2f32([2 x float] %input, [2 x float]* %output) {
+; CHECK-LABEL: @test_a2f32
   %call = tail call [2 x float] @bara([2 x float] %input)
 ; CHECK: .param .align 4 .b8 retval0[8];
 ; CHECK-DAG: ld.param.f32 [[ELEMA1:%f[0-9]+]], [retval0+0];
@@ -28,8 +47,8 @@ define void @fooa([2 x float] %input, [2 x float]* %output) {
 ; CHECK: ret
 }
 
-define void @foos({float, float} %input, {float, float}* %output) {
-; CHECK-LABEL: @foos
+define void @test_s2f32({float, float} %input, {float, float}* %output) {
+; CHECK-LABEL: @test_s2f32
   %call = tail call {float, float} @bars({float, float} %input)
 ; CHECK: .param .align 4 .b8 retval0[8];
 ; CHECK-DAG: ld.param.f32 [[ELEMS1:%f[0-9]+]], [retval0+0];
diff --git a/test/CodeGen/NVPTX/bug22322.ll b/test/CodeGen/NVPTX/bug22322.ll
index 0c4c30cf37ed2b9bf0bde64fcc4952ffbae30931..74133d3dcabdb4f80f222d5f3c5b193d77e5ea25 100644
--- a/test/CodeGen/NVPTX/bug22322.ll
+++ b/test/CodeGen/NVPTX/bug22322.ll
@@ -17,7 +17,7 @@ _ZL11compute_vecRK6float3jb.exit:
   %4 = add nsw i32 %2, %3
   %5 = zext i32 %4 to i64
   %6 = bitcast float* %ret_vec.sroa.8.i to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %6)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %6)
   %7 = and i32 %4, 15
   %8 = icmp eq i32 %7, 0
   %9 = select i1 %8, float 0.000000e+00, float -1.000000e+00
@@ -26,7 +26,7 @@ _ZL11compute_vecRK6float3jb.exit:
   %10 = fcmp olt float %9, 0.000000e+00
   %ret_vec.sroa.8.i.val = load float, float* %ret_vec.sroa.8.i, align 4
   %11 = select i1 %10, float 0.000000e+00, float %ret_vec.sroa.8.i.val
-  call void @llvm.lifetime.end(i64 4, i8* %6)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %6)
   %12 = getelementptr inbounds %class.float3, %class.float3* %dst, i64 %5, i32 0
   store float 0.000000e+00, float* %12, align 4
   %13 = getelementptr inbounds %class.float3, %class.float3* %dst, i64 %5, i32 1
@@ -46,10 +46,10 @@ declare i32 @llvm.nvvm.read.ptx.sreg.ntid.x() #1
 declare i32 @llvm.nvvm.read.ptx.sreg.tid.x() #1
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #2
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #2
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2
 
 attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
 attributes #1 = { nounwind readnone }
diff --git a/test/CodeGen/NVPTX/f16-instructions.ll b/test/CodeGen/NVPTX/f16-instructions.ll
index b94fd17e91fa59a90b25c7355aab61716870984d..403a67f02f80aae0755d819c64395d02a285dc1f 100644
--- a/test/CodeGen/NVPTX/f16-instructions.ll
+++ b/test/CodeGen/NVPTX/f16-instructions.ll
@@ -127,13 +127,13 @@ define half @test_fdiv(half %a, half %b) #0 {
 ; CHECK-LABEL: test_frem(
 ; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_frem_param_0];
 ; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_frem_param_1];
-; CHECK-DAG:  cvt.f32.f16     [[F0:%f[0-9]+]], [[A]];
-; CHECK-DAG:  cvt.f32.f16     [[F1:%f[0-9]+]], [[B]];
-; CHECK-NEXT: div.rn.f32      [[F2:%f[0-9]+]], [[F0]], [[F1]];
-; CHECK-NEXT: cvt.rmi.f32.f32 [[F3:%f[0-9]+]], [[F2]];
-; CHECK-NEXT: mul.f32         [[F4:%f[0-9]+]], [[F3]], [[F1]];
-; CHECK-NEXT: sub.f32         [[F5:%f[0-9]+]], [[F0]], [[F4]];
-; CHECK-NEXT: cvt.rn.f16.f32  [[R:%h[0-9]+]], [[F5]];
+; CHECK-DAG:  cvt.f32.f16     [[FA:%f[0-9]+]], [[A]];
+; CHECK-DAG:  cvt.f32.f16     [[FB:%f[0-9]+]], [[B]];
+; CHECK-NEXT: div.rn.f32      [[D:%f[0-9]+]], [[FA]], [[FB]];
+; CHECK-NEXT: cvt.rmi.f32.f32 [[DI:%f[0-9]+]], [[D]];
+; CHECK-NEXT: mul.f32         [[RI:%f[0-9]+]], [[DI]], [[FB]];
+; CHECK-NEXT: sub.f32         [[RF:%f[0-9]+]], [[FA]], [[RI]];
+; CHECK-NEXT: cvt.rn.f16.f32  [[R:%h[0-9]+]], [[RF]];
 ; CHECK-NEXT: st.param.b16    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
 define half @test_frem(half %a, half %b) #0 {
@@ -161,6 +161,20 @@ define half @test_load(half* %a) #0 {
   ret half %r
 }
 
+; CHECK-LABEL: .visible .func test_halfp0a1(
+; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_halfp0a1_param_0];
+; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_halfp0a1_param_1];
+; CHECK-DAG: ld.u8        [[B0:%r[sd]?[0-9]+]], [%[[FROM]]]
+; CHECK-DAG: st.u8        [%[[TO]]], [[B0]]
+; CHECK-DAG: ld.u8        [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
+; CHECK-DAG: st.u8        [%[[TO]]+1], [[B1]]
+; CHECK: ret
+define void @test_halfp0a1(half * noalias readonly %from, half * %to) {
+  %1 = load half, half * %from , align 1
+  store half %1, half * %to , align 1
+  ret void
+}
+
 declare half @test_callee(half %a, half %b) #0
 
 ; CHECK-LABEL: test_call(
@@ -229,7 +243,7 @@ define half @test_tailcall_flipped(half %a, half %b) #0 {
 ; CHECK-LABEL: test_select(
 ; CHECK-DAG:  ld.param.b16    [[A:%h[0-9]+]], [test_select_param_0];
 ; CHECK-DAG:  ld.param.b16    [[B:%h[0-9]+]], [test_select_param_1];
-; CHECK:      setp.eq.b16     [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
+; CHECK-DAG:  setp.eq.b16     [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
 ; CHECK-NEXT: selp.b16        [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
 ; CHECK-NEXT: st.param.b16    [func_retval0+0], [[R]];
 ; CHECK-NEXT: ret;
@@ -509,7 +523,7 @@ define i1 @test_fcmp_ord(half %a, half %b) #0 {
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
 ; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
 ; CHECK-NOF16: setp.lt.f32    [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: @%p1 bra        [[LABEL:LBB.*]];
+; CHECK-NEXT: @[[PRED]] bra   [[LABEL:LBB.*]];
 ; CHECK:      st.u32  [%[[C]]],
 ; CHECK:      [[LABEL]]:
 ; CHECK:      st.u32  [%[[D]]],
diff --git a/test/CodeGen/NVPTX/f16x2-instructions.ll b/test/CodeGen/NVPTX/f16x2-instructions.ll
new file mode 100644
index 0000000000000000000000000000000000000000..33bb616d895c4255e7af252e4d510f4d7df5bf6b
--- /dev/null
+++ b/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -0,0 +1,1426 @@
+; ## Full FP16 support enabled by default.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN:          -O0 -disable-post-ra -disable-fp-elim \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s
+; ## FP16 support explicitly disabled.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN:          -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+; ## FP16 is not supported by hardware.
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
+; RUN:          -disable-post-ra -disable-fp-elim \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: test_ret_const(
+; CHECK:     mov.u32         [[T:%r[0-9]+]], 1073757184;
+; CHECK:     mov.b32         [[R:%hh[0-9]+]], [[T]];
+; CHECK:     st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_ret_const() #0 {
+  ret <2 x half> <half 1.0, half 2.0>
+}
+
+; CHECK-LABEL: test_extract_0(
+; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_extract_0_param_0];
+; CHECK:      mov.b32         {[[R:%h[0-9]+]], %tmp_hi}, [[A]];
+; CHECK:      st.param.b16    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define half @test_extract_0(<2 x half> %a) #0 {
+  %e = extractelement <2 x half> %a, i32 0
+  ret half %e
+}
+
+; CHECK-LABEL: test_extract_1(
+; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_extract_1_param_0];
+; CHECK:      mov.b32         {%tmp_lo, [[R:%h[0-9]+]]}, [[A]];
+; CHECK:      st.param.b16    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define half @test_extract_1(<2 x half> %a) #0 {
+  %e = extractelement <2 x half> %a, i32 1
+  ret half %e
+}
+
+; CHECK-LABEL: test_extract_i(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_extract_i_param_0];
+; CHECK-DAG:  ld.param.u64    [[IDX:%rd[0-9]+]], [test_extract_i_param_1];
+; CHECK-DAG:  setp.eq.s64     [[PRED:%p[0-9]+]], [[IDX]], 0;
+; CHECK-DAG:  mov.b32         {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[A]];
+; CHECK:      selp.b16        [[R:%h[0-9]+]], [[E0]], [[E1]], [[PRED]];
+; CHECK:      st.param.b16    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define half @test_extract_i(<2 x half> %a, i64 %idx) #0 {
+  %e = extractelement <2 x half> %a, i64 %idx
+  ret half %e
+}
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fadd_param_1];
+;
+; CHECK-F16-NEXT:   add.rn.f16x2   [[R:%hh[0-9]+]], [[A]], [[B]];
+;
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  add.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-NOF16-DAG:  add.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 {
+  %r = fadd <2 x half> %a, %b
+  ret <2 x half> %r
+}
+
+; Check that we can lower fadd with immediate arguments.
+; CHECK-LABEL: test_fadd_imm_0(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fadd_imm_0_param_0];
+;
+; CHECK-F16:        mov.u32        [[I:%r[0-9]+]], 1073757184;
+; CHECK-F16:        mov.b32        [[IHH:%hh[0-9]+]], [[I]];
+; CHECK-F16:        add.rn.f16x2   [[R:%hh[0-9]+]], [[A]], [[IHH]];
+;
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  add.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
+; CHECK-NOF16-DAG:  add.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32        [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 {
+  %r = fadd <2 x half> <half 1.0, half 2.0>, %a
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fadd_imm_1(
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fadd_imm_1_param_0];
+;
+; CHECK-F16:        mov.u32        [[I:%r[0-9]+]], 1073757184;
+; CHECK-F16:        mov.b32        [[IHH:%hh[0-9]+]], [[I]];
+; CHECK-F16:        add.rn.f16x2   [[R:%hh[0-9]+]], [[B]], [[IHH]];
+;
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  add.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
+; CHECK-NOF16-DAG:  add.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32        [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 {
+  %r = fadd <2 x half> %a, <half 1.0, half 2.0>
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fsub_param_0];
+;
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fsub_param_1];
+; CHECK-F16-NEXT:   sub.rn.f16x2   [[R:%hh[0-9]+]], [[A]], [[B]];
+;
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  sub.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-NOF16-DAG:  sub.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32        [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 {
+  %r = fsub <2 x half> %a, %b
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fneg(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fneg_param_0];
+;
+; CHECK-F16:        mov.u32        [[I0:%r[0-9]+]], 0;
+; CHECK-F16:        mov.b32        [[IHH0:%hh[0-9]+]], [[I0]];
+; CHECK-F16-NEXT:   sub.rn.f16x2   [[R:%hh[0-9]+]], [[IHH0]], [[A]];
+;
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  mov.f32        [[Z:%f[0-9]+]], 0f00000000;
+; CHECK-NOF16-DAG:  sub.rn.f32     [[FR0:%f[0-9]+]], [[Z]], [[FA0]];
+; CHECK-NOF16-DAG:  sub.rn.f32     [[FR1:%f[0-9]+]], [[Z]], [[FA1]];
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32        [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fneg(<2 x half> %a) #0 {
+  %r = fsub <2 x half> <half 0.0, half 0.0>, %a
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fmul(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fmul_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fmul_param_1];
+; CHECK-F16-NEXT: mul.rn.f16x2     [[R:%hh[0-9]+]], [[A]], [[B]];
+;
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  mul.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-NOF16-DAG:  mul.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 {
+  %r = fmul <2 x half> %a, %b
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32         {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.f16     [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.f16     [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.f16     [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.f16     [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  div.rn.f32      [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  div.rn.f32      [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 {
+  %r = fdiv <2 x half> %a, %b
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_frem(
+; -- Load two 16x2 inputs and split them into f16 elements
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_frem_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_frem_param_1];
+; -- Split into elements
+; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32         {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; -- promote to f32.
+; CHECK-DAG:  cvt.f32.f16     [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.f16     [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.f16     [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.f16     [[FB1:%f[0-9]+]], [[B1]];
+; -- frem(a[0],b[0]).
+; CHECK-DAG:  div.rn.f32      [[FD0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG:  cvt.rmi.f32.f32 [[DI0:%f[0-9]+]], [[FD0]];
+; CHECK-DAG:  mul.f32         [[RI0:%f[0-9]+]], [[DI0]], [[FB0]];
+; CHECK-DAG:  sub.f32         [[RF0:%f[0-9]+]], [[FA0]], [[RI0]];
+; -- frem(a[1],b[1]).
+; CHECK-DAG:  div.rn.f32      [[FD1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG:  cvt.rmi.f32.f32 [[DI1:%f[0-9]+]], [[FD1]];
+; CHECK-DAG:  mul.f32         [[RI1:%f[0-9]+]], [[DI1]], [[FB1]];
+; CHECK-DAG:  sub.f32         [[RF1:%f[0-9]+]], [[FA1]], [[RI1]];
+; -- convert back to f16.
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[RF1]];
+; -- merge into f16x2 and return it.
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 {
+  %r = frem <2 x half> %a, %b
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: .func test_ldst_v2f16(
+; CHECK-DAG:    ld.param.u64    %[[A:rd[0-9]+]], [test_ldst_v2f16_param_0];
+; CHECK-DAG:    ld.param.u64    %[[B:rd[0-9]+]], [test_ldst_v2f16_param_1];
+; CHECK-DAG:    ld.b32          [[E:%hh[0-9]+]], [%[[A]]]
+; CHECK:        mov.b32         {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[E]];
+; CHECK-DAG:    st.v2.b16       [%[[B]]], {[[E0]], [[E1]]};
+; CHECK:        ret;
+define void @test_ldst_v2f16(<2 x half>* %a, <2 x half>* %b) {
+  %t1 = load <2 x half>, <2 x half>* %a
+  store <2 x half> %t1, <2 x half>* %b, align 16
+  ret void
+}
+
+; CHECK-LABEL: .func test_ldst_v3f16(
+; CHECK-DAG:    ld.param.u64    %[[A:rd[0-9]+]], [test_ldst_v3f16_param_0];
+; CHECK-DAG:    ld.param.u64    %[[B:rd[0-9]+]], [test_ldst_v3f16_param_1];
+; -- v3 is inconvenient to capture as it's lowered as a 64-bit load plus a
+;    fair number of bit-shifting instructions that may change at LLVM's whim.
+;    So we only verify that we issue the correct number of writes at the
+;    correct offsets, but not the values we write.
+; CHECK-DAG:    ld.u64
+; CHECK-DAG:    st.u32          [%[[B]]],
+; CHECK-DAG:    st.b16          [%[[B]]+4],
+; CHECK:        ret;
+define void @test_ldst_v3f16(<3 x half>* %a, <3 x half>* %b) {
+  %t1 = load <3 x half>, <3 x half>* %a
+  store <3 x half> %t1, <3 x half>* %b, align 16
+  ret void
+}
+
+; CHECK-LABEL: .func test_ldst_v4f16(
+; CHECK-DAG:    ld.param.u64    %[[A:rd[0-9]+]], [test_ldst_v4f16_param_0];
+; CHECK-DAG:    ld.param.u64    %[[B:rd[0-9]+]], [test_ldst_v4f16_param_1];
+; CHECK-DAG:    ld.v4.b16       {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [%[[A]]];
+; CHECK-DAG:    st.v4.b16       [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK:        ret;
+define void @test_ldst_v4f16(<4 x half>* %a, <4 x half>* %b) {
+  %t1 = load <4 x half>, <4 x half>* %a
+  store <4 x half> %t1, <4 x half>* %b, align 16
+  ret void
+}
+
+; CHECK-LABEL: .func test_ldst_v8f16(
+; CHECK-DAG:    ld.param.u64    %[[A:rd[0-9]+]], [test_ldst_v8f16_param_0];
+; CHECK-DAG:    ld.param.u64    %[[B:rd[0-9]+]], [test_ldst_v8f16_param_1];
+; CHECK-DAG:    ld.v4.b32       {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [%[[A]]];
+; CHECK-DAG:    st.v4.b32       [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK:        ret;
+define void @test_ldst_v8f16(<8 x half>* %a, <8 x half>* %b) {
+  %t1 = load <8 x half>, <8 x half>* %a
+  store <8 x half> %t1, <8 x half>* %b, align 16
+  ret void
+}
+
+declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0
+
+; CHECK-LABEL: test_call(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_call_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_call_param_1];
+; CHECK:      {
+; CHECK-DAG:  .param .align 4 .b8 param0[4];
+; CHECK-DAG:  .param .align 4 .b8 param1[4];
+; CHECK-DAG:  st.param.b32    [param0+0], [[A]];
+; CHECK-DAG:  st.param.b32    [param1+0], [[B]];
+; CHECK-DAG:  .param .align 4 .b8 retval0[4];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT:        test_callee,
+; CHECK:      );
+; CHECK-NEXT: ld.param.b32    [[R:%hh[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 {
+  %r = call <2 x half> @test_callee(<2 x half> %a, <2 x half> %b)
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_call_flipped(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_call_flipped_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_call_flipped_param_1];
+; CHECK:      {
+; CHECK-DAG:  .param .align 4 .b8 param0[4];
+; CHECK-DAG:  .param .align 4 .b8 param1[4];
+; CHECK-DAG:  st.param.b32    [param0+0], [[B]];
+; CHECK-DAG:  st.param.b32    [param1+0], [[A]];
+; CHECK-DAG:  .param .align 4 .b8 retval0[4];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT:        test_callee,
+; CHECK:      );
+; CHECK-NEXT: ld.param.b32    [[R:%hh[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 {
+  %r = call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a)
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_tailcall_flipped(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_tailcall_flipped_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_tailcall_flipped_param_1];
+; CHECK:      {
+; CHECK-DAG:  .param .align 4 .b8 param0[4];
+; CHECK-DAG:  .param .align 4 .b8 param1[4];
+; CHECK-DAG:  st.param.b32    [param0+0], [[B]];
+; CHECK-DAG:  st.param.b32    [param1+0], [[A]];
+; CHECK-DAG:  .param .align 4 .b8 retval0[4];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT:        test_callee,
+; CHECK:      );
+; CHECK-NEXT: ld.param.b32    [[R:%hh[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 {
+  %r = tail call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a)
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_select(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_select_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_select_param_1];
+; CHECK-DAG:  ld.param.u8     [[C:%rs[0-9]+]], [test_select_param_2]
+; CHECK-DAG:  setp.eq.b16     [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
+; CHECK-NEXT: selp.b32        [[R:%hh[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 {
+  %r = select i1 %c, <2 x half> %a, <2 x half> %b
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_select_cc(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_select_cc_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_select_cc_param_1];
+; CHECK-DAG:  ld.param.b32    [[C:%hh[0-9]+]], [test_select_cc_param_2];
+; CHECK-DAG:  ld.param.b32    [[D:%hh[0-9]+]], [test_select_cc_param_3];
+;
+; CHECK-F16:  setp.neu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
+;
+; CHECK-NOF16-DAG: mov.b32        {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG: mov.b32        {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]];
+; CHECK-NOF16-DAG: setp.neu.f32    [[P0:%p[0-9]+]], [[CF0]], [[DF0]]
+; CHECK-NOF16-DAG: setp.neu.f32    [[P1:%p[0-9]+]], [[CF1]], [[DF1]]
+;
+; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32         {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  selp.b16        [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]];
+; CHECK-DAG:  selp.b16        [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #0 {
+  %cc = fcmp une <2 x half> %c, %d
+  %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_select_cc_f32_f16(
+; CHECK-DAG:  ld.param.v2.f32    {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_0];
+; CHECK-DAG:  ld.param.v2.f32    {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_1];
+; CHECK-DAG:  ld.param.b32    [[C:%hh[0-9]+]], [test_select_cc_f32_f16_param_2];
+; CHECK-DAG:  ld.param.b32    [[D:%hh[0-9]+]], [test_select_cc_f32_f16_param_3];
+;
+; CHECK-F16:  setp.neu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]]
+; CHECK-NOF16-DAG: mov.b32         {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG: mov.b32         {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]];
+; CHECK-NOF16-DAG: setp.neu.f32    [[P0:%p[0-9]+]], [[CF0]], [[DF0]]
+; CHECK-NOF16-DAG: setp.neu.f32    [[P1:%p[0-9]+]], [[CF1]], [[DF1]]
+;
+; CHECK-DAG: selp.f32        [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]];
+; CHECK-DAG: selp.f32        [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]];
+; CHECK-NEXT: st.param.v2.f32    [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b,
+                                           <2 x half> %c, <2 x half> %d) #0 {
+  %cc = fcmp une <2 x half> %c, %d
+  %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b
+  ret <2 x float> %r
+}
+
+; CHECK-LABEL: test_select_cc_f16_f32(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_select_cc_f16_f32_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_select_cc_f16_f32_param_1];
+; CHECK-DAG:  ld.param.v2.f32 {[[C0:%f[0-9]+]], [[C1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_2];
+; CHECK-DAG:  ld.param.v2.f32 {[[D0:%f[0-9]+]], [[D1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_3];
+; CHECK-DAG:  setp.neu.f32    [[P0:%p[0-9]+]], [[C0]], [[D0]]
+; CHECK-DAG:  setp.neu.f32    [[P1:%p[0-9]+]], [[C1]], [[D1]]
+; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32         {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  selp.b16        [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]];
+; CHECK-DAG:  selp.b16        [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b,
+                                          <2 x float> %c, <2 x float> %d) #0 {
+  %cc = fcmp une <2 x float> %c, %d
+  %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fcmp_une(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_une_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_une_param_1];
+; CHECK-F16:  setp.neu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  setp.neu.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG:  setp.neu.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8  [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 {
+  %r = fcmp une <2 x half> %a, %b
+  ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ueq(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_ueq_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_ueq_param_1];
+; CHECK-F16:  setp.equ.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  setp.equ.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG:  setp.equ.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8  [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 {
+  %r = fcmp ueq <2 x half> %a, %b
+  ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ugt(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_ugt_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_ugt_param_1];
+; CHECK-F16:  setp.gtu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  setp.gtu.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG:  setp.gtu.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8  [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 {
+  %r = fcmp ugt <2 x half> %a, %b
+  ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_uge(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_uge_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_uge_param_1];
+; CHECK-F16:  setp.geu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  setp.geu.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG:  setp.geu.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8  [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 {
+  %r = fcmp uge <2 x half> %a, %b
+  ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ult(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_ult_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_ult_param_1];
+; CHECK-F16:  setp.ltu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  setp.ltu.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG:  setp.ltu.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8  [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 {
+  %r = fcmp ult <2 x half> %a, %b
+  ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ule(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_ule_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_ule_param_1];
+; CHECK-F16:  setp.leu.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  setp.leu.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG:  setp.leu.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8  [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 {
+  %r = fcmp ule <2 x half> %a, %b
+  ret <2 x i1> %r
+}
+
+
+; CHECK-LABEL: test_fcmp_uno(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_uno_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_uno_param_1];
+; CHECK-F16:  setp.nan.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  setp.nan.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG:  setp.nan.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8  [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 {
+  %r = fcmp uno <2 x half> %a, %b
+  ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_one(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_one_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_one_param_1];
+; CHECK-F16:  setp.ne.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  setp.ne.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG:  setp.ne.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8  [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 {
+  %r = fcmp one <2 x half> %a, %b
+  ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_oeq(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_oeq_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_oeq_param_1];
+; CHECK-F16:  setp.eq.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  setp.eq.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG:  setp.eq.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8  [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 {
+  %r = fcmp oeq <2 x half> %a, %b
+  ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ogt(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_ogt_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_ogt_param_1];
+; CHECK-F16:  setp.gt.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  setp.gt.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG:  setp.gt.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8  [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 {
+  %r = fcmp ogt <2 x half> %a, %b
+  ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_oge(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_oge_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_oge_param_1];
+; CHECK-F16:  setp.ge.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  setp.ge.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG:  setp.ge.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8  [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 {
+  %r = fcmp oge <2 x half> %a, %b
+  ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_olt(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_olt_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_olt_param_1];
+; CHECK-F16:  setp.lt.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  setp.lt.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG:  setp.lt.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8  [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 {
+  %r = fcmp olt <2 x half> %a, %b
+  ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ole(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_ole_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_ole_param_1];
+; CHECK-F16:  setp.le.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  setp.le.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG:  setp.le.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8  [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 {
+  %r = fcmp ole <2 x half> %a, %b
+  ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ord(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fcmp_ord_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fcmp_ord_param_1];
+; CHECK-F16:  setp.num.f16x2  [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  setp.num.f32   [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG:  setp.num.f32   [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG:  selp.u16        [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG:  selp.u16        [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8  [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 {
+  %r = fcmp ord <2 x half> %a, %b
+  ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fptosi_i32(
+; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_fptosi_i32_param_0];
+; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  cvt.rzi.s32.f16 [[R0:%r[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rzi.s32.f16 [[R1:%r[0-9]+]], [[A1]];
+; CHECK:      st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK:      ret;
+define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 {
+  %r = fptosi <2 x half> %a to <2 x i32>
+  ret <2 x i32> %r
+}
+
+; CHECK-LABEL: test_fptosi_i64(
+; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_fptosi_i64_param_0];
+; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  cvt.rzi.s64.f16 [[R0:%rd[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rzi.s64.f16 [[R1:%rd[0-9]+]], [[A1]];
+; CHECK:      st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK:      ret;
+define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 {
+  %r = fptosi <2 x half> %a to <2 x i64>
+  ret <2 x i64> %r
+}
+
+; CHECK-LABEL: test_fptoui_2xi32(
+; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_fptoui_2xi32_param_0];
+; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  cvt.rzi.u32.f16 [[R0:%r[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rzi.u32.f16 [[R1:%r[0-9]+]], [[A1]];
+; CHECK:      st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK:      ret;
+define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 {
+  %r = fptoui <2 x half> %a to <2 x i32>
+  ret <2 x i32> %r
+}
+
+; CHECK-LABEL: test_fptoui_2xi64(
+; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_fptoui_2xi64_param_0];
+; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  cvt.rzi.u64.f16 [[R0:%rd[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rzi.u64.f16 [[R1:%rd[0-9]+]], [[A1]];
+; CHECK:      st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK:      ret;
+define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 {
+  %r = fptoui <2 x half> %a to <2 x i64>
+  ret <2 x i64> %r
+}
+
+; CHECK-LABEL: test_uitofp_2xi32(
+; CHECK:      ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_param_0];
+; CHECK-DAG:  cvt.rn.f16.u32  [[R0:%h[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rn.f16.u32  [[R1:%h[0-9]+]], [[A1]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 {
+  %r = uitofp <2 x i32> %a to <2 x half>
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_uitofp_2xi64(
+; CHECK:      ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_uitofp_2xi64_param_0];
+; CHECK-DAG:  cvt.rn.f32.u64  [[F0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rn.f32.u64  [[F1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[F0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[F1]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 {
+  %r = uitofp <2 x i64> %a to <2 x half>
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_sitofp_2xi32(
+; CHECK:      ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_param_0];
+; CHECK-DAG:  cvt.rn.f16.s32  [[R0:%h[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rn.f16.s32  [[R1:%h[0-9]+]], [[A1]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 {
+  %r = sitofp <2 x i32> %a to <2 x half>
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_sitofp_2xi64(
+; CHECK:      ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_sitofp_2xi64_param_0];
+; CHECK-DAG:  cvt.rn.f32.s64  [[F0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rn.f32.s64  [[F1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[F0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[F1]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 {
+  %r = sitofp <2 x i64> %a to <2 x half>
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_uitofp_2xi32_fadd(
+; CHECK-DAG:  ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_fadd_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_uitofp_2xi32_fadd_param_1];
+; CHECK-DAG:  cvt.rn.f16.u32  [[C0:%h[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rn.f16.u32  [[C1:%h[0-9]+]], [[A1]];
+; Under the F16 prefix: the two converted lanes are packed and added to B as one f16x2.
+; CHECK-F16-DAG:  mov.b32         [[C:%hh[0-9]+]], {[[C0]], [[C1]]}
+; CHECK-F16-DAG:  add.rn.f16x2    [[R:%hh[0-9]+]], [[B]], [[C]];
+; Under the NOF16 prefix: B is unpacked, each lane pair is added in f32, then rounded to f16 and repacked.
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC1:%f[0-9]+]], [[C1]]
+; CHECK-NOF16-DAG:  add.rn.f32     [[FR0:%f[0-9]+]], [[FB0]], [[FC0]];
+; CHECK-NOF16-DAG:  add.rn.f32     [[FR1:%f[0-9]+]], [[FB1]], [[FC1]];
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32        [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; Both variants end by storing the packed result R as the return value.
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
+  %c = uitofp <2 x i32> %a to <2 x half>
+  %r = fadd <2 x half> %b, %c
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_sitofp_2xi32_fadd(
+; CHECK-DAG:  ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_fadd_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_sitofp_2xi32_fadd_param_1];
+; CHECK-DAG:  cvt.rn.f16.s32  [[C0:%h[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rn.f16.s32  [[C1:%h[0-9]+]], [[A1]];
+; Under the F16 prefix: the two converted lanes are packed and added to B as one f16x2.
+; CHECK-F16-DAG:  mov.b32         [[C:%hh[0-9]+]], {[[C0]], [[C1]]}
+; CHECK-F16-DAG:  add.rn.f16x2    [[R:%hh[0-9]+]], [[B]], [[C]];
+; Under the NOF16 prefix: B is unpacked, each lane pair is added in f32, then rounded to f16 and repacked.
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC1:%f[0-9]+]], [[C1]]
+; CHECK-NOF16-DAG:  add.rn.f32     [[FR0:%f[0-9]+]], [[FB0]], [[FC0]];
+; CHECK-NOF16-DAG:  add.rn.f32     [[FR1:%f[0-9]+]], [[FB1]], [[FC1]];
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32        [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; Both variants end by storing the packed result R as the return value.
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 {
+  %c = sitofp <2 x i32> %a to <2 x half>
+  %r = fadd <2 x half> %b, %c
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fptrunc_2xfloat(
+; CHECK:      ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[A1]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 {
+  %r = fptrunc <2 x float> %a to <2 x half>
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fptrunc_2xdouble(
+; CHECK:      ld.param.v2.f64 {[[A0:%fd[0-9]+]], [[A1:%fd[0-9]+]]}, [test_fptrunc_2xdouble_param_0];
+; CHECK-DAG:  cvt.rn.f16.f64  [[R0:%h[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.rn.f16.f64  [[R1:%h[0-9]+]], [[A1]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 {
+  %r = fptrunc <2 x double> %a to <2 x half>
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fpext_2xfloat(
+; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_fpext_2xfloat_param_0];
+; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  cvt.f32.f16     [[R0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.f16     [[R1:%f[0-9]+]], [[A1]];
+; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK:      ret;
+define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 {
+  %r = fpext <2 x half> %a to <2 x float>
+  ret <2 x float> %r
+}
+
+; CHECK-LABEL: test_fpext_2xdouble(
+; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_fpext_2xdouble_param_0];
+; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  cvt.f64.f16     [[R0:%fd[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f64.f16     [[R1:%fd[0-9]+]], [[A1]];
+; CHECK-NEXT: st.param.v2.f64 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK:      ret;
+define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 {
+  %r = fpext <2 x half> %a to <2 x double>
+  ret <2 x double> %r
+}
+
+
+; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16(
+; CHECK:      ld.param.u32    [[A:%r[0-9]+]], [test_bitcast_2xhalf_to_2xi16_param_0];
+; CHECK-DAG:  cvt.u16.u32     [[R0:%rs[0-9]+]], [[A]]
+; CHECK-DAG:  shr.u32         [[AH:%r[0-9]+]], [[A]], 16
+; CHECK-DAG:  cvt.u16.u32     [[R1:%rs[0-9]+]], [[AH]]
+; CHECK:      st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]}
+; CHECK:      ret;
+define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 {
+  %r = bitcast <2 x half> %a to <2 x i16>
+  ret <2 x i16> %r
+}
+
+; CHECK-LABEL: test_bitcast_2xi16_to_2xhalf(
+; CHECK:      ld.param.v2.u16         {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [test_bitcast_2xi16_to_2xhalf_param_0];
+; CHECK-DAG:  cvt.u32.u16     [[R0:%r[0-9]+]], [[RS0]];
+; CHECK-DAG:  cvt.u32.u16     [[R1:%r[0-9]+]], [[RS1]];
+; CHECK-DAG:  shl.b32         [[R1H:%r[0-9]+]], [[R1]], 16;
+; CHECK-DAG:  or.b32          [[R1H0L:%r[0-9]+]], [[R0]], [[R1H]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], [[R1H0L]];
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 {
+  %r = bitcast <2 x i16> %a to <2 x half>
+  ret <2 x half> %r
+}
+
+; NOTE(review): these intrinsics use an .f16 suffix but take <2 x half>; the usual vector mangling is .v2f16 -- confirm the IR reader remangles them.
+declare <2 x half> @llvm.sqrt.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b) #0
+declare <2 x half> @llvm.sin.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.cos.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b) #0
+declare <2 x half> @llvm.exp.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.exp2.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.log.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.log10.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.log2.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0
+declare <2 x half> @llvm.fabs.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b) #0
+declare <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) #0
+declare <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) #0
+declare <2 x half> @llvm.floor.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.ceil.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.trunc.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.rint.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.nearbyint.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.round.f16(<2 x half> %a) #0
+declare <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0
+
+; CHECK-LABEL: test_sqrt(
+; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_sqrt_param_0];
+; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  sqrt.rn.f32     [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG:  sqrt.rn.f32     [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[RF1]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_sqrt(<2 x half> %a) #0 {
+  %r = call <2 x half> @llvm.sqrt.f16(<2 x half> %a)
+  ret <2 x half> %r
+}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_powi(
+;define <2 x half> @test_powi(<2 x half> %a, <2 x i32> %b) #0 {
+;  %r = call <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b)
+;  ret <2 x half> %r
+;}
+
+; CHECK-LABEL: test_sin(
+; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_sin_param_0];
+; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  sin.approx.f32  [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG:  sin.approx.f32  [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[RF1]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_sin(<2 x half> %a) #0 #1 {
+  %r = call <2 x half> @llvm.sin.f16(<2 x half> %a)
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_cos(
+; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_cos_param_0];
+; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cos.approx.f32  [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG:  cos.approx.f32  [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[RF1]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_cos(<2 x half> %a) #0 #1 {
+  %r = call <2 x half> @llvm.cos.f16(<2 x half> %a)
+  ret <2 x half> %r
+}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_pow(
+;define <2 x half> @test_pow(<2 x half> %a, <2 x half> %b) #0 {
+;  %r = call <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b)
+;  ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_exp(
+;define <2 x half> @test_exp(<2 x half> %a) #0 {
+;  %r = call <2 x half> @llvm.exp.f16(<2 x half> %a)
+;  ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_exp2(
+;define <2 x half> @test_exp2(<2 x half> %a) #0 {
+;  %r = call <2 x half> @llvm.exp2.f16(<2 x half> %a)
+;  ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log(
+;define <2 x half> @test_log(<2 x half> %a) #0 {
+;  %r = call <2 x half> @llvm.log.f16(<2 x half> %a)
+;  ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log10(
+;define <2 x half> @test_log10(<2 x half> %a) #0 {
+;  %r = call <2 x half> @llvm.log10.f16(<2 x half> %a)
+;  ret <2 x half> %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log2(
+;define <2 x half> @test_log2(<2 x half> %a) #0 {
+;  %r = call <2 x half> @llvm.log2.f16(<2 x half> %a)
+;  ret <2 x half> %r
+;}
+
+; CHECK-LABEL: test_fma(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fma_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fma_param_1];
+; CHECK-DAG:  ld.param.b32    [[C:%hh[0-9]+]], [test_fma_param_2];
+; Under the F16 prefix: the three packed operands feed a single fma.rn.f16x2.
+; CHECK-F16:        fma.rn.f16x2   [[R:%hh[0-9]+]], [[A]], [[B]], [[C]];
+; Under the NOF16 prefix: each half is unpacked, extended to f32, fused, then rounded back to f16 and repacked.
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC1:%f[0-9]+]], [[C1]]
+; CHECK-NOF16-DAG:  fma.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
+; CHECK-NOF16-DAG:  fma.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32        [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret
+define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
+  %r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fabs(
+; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_fabs_param_0];
+; CHECK:      mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  abs.f32         [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG:  abs.f32         [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[RF1]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_fabs(<2 x half> %a) #0 {
+  %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a)
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_minnum(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_minnum_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_minnum_param_1];
+; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32         {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.f16     [[BF0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.f16     [[BF1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  min.f32         [[RF0:%f[0-9]+]], [[AF0]], [[BF0]];
+; CHECK-DAG:  min.f32         [[RF1:%f[0-9]+]], [[AF1]], [[BF1]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[RF1]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) #0 {
+  %r = call <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b)
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_maxnum(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_maxnum_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_maxnum_param_1];
+; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32         {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  cvt.f32.f16     [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG:  cvt.f32.f16     [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.f32.f16     [[BF0:%f[0-9]+]], [[B0]];
+; CHECK-DAG:  cvt.f32.f16     [[BF1:%f[0-9]+]], [[B1]];
+; CHECK-DAG:  max.f32         [[RF0:%f[0-9]+]], [[AF0]], [[BF0]];
+; CHECK-DAG:  max.f32         [[RF1:%f[0-9]+]], [[AF1]], [[BF1]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG:  cvt.rn.f16.f32  [[R1:%h[0-9]+]], [[RF1]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 {
+  %r = call <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b)
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_copysign(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_copysign_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_copysign_param_1];
+; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32         {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  mov.b16         [[AS0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG:  mov.b16         [[AS1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG:  mov.b16         [[BS0:%rs[0-9]+]], [[B0]];
+; CHECK-DAG:  mov.b16         [[BS1:%rs[0-9]+]], [[B1]];
+; CHECK-DAG:  and.b16         [[AX0:%rs[0-9]+]], [[AS0]], 32767;
+; CHECK-DAG:  and.b16         [[AX1:%rs[0-9]+]], [[AS1]], 32767;
+; CHECK-DAG:  and.b16         [[BX0:%rs[0-9]+]], [[BS0]], -32768;
+; CHECK-DAG:  and.b16         [[BX1:%rs[0-9]+]], [[BS1]], -32768;
+; CHECK-DAG:  or.b16          [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]];
+; CHECK-DAG:  or.b16          [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]];
+; CHECK-DAG:  mov.b16         [[R0:%h[0-9]+]], [[RS0]];
+; CHECK-DAG:  mov.b16         [[R1:%h[0-9]+]], [[RS1]];
+; CHECK-DAG:  mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 {
+  %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b)
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_copysign_f32(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_copysign_f32_param_0];
+; CHECK-DAG:  ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_copysign_f32_param_1];
+; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b16         [[AS0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG:  mov.b16         [[AS1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG:  mov.b32         [[BI0:%r[0-9]+]], [[B0]];
+; CHECK-DAG:  mov.b32         [[BI1:%r[0-9]+]], [[B1]];
+; CHECK-DAG:  and.b16         [[AI0:%rs[0-9]+]], [[AS0]], 32767;
+; CHECK-DAG:  and.b16         [[AI1:%rs[0-9]+]], [[AS1]], 32767;
+; CHECK-DAG:  and.b32         [[BX0:%r[0-9]+]], [[BI0]], -2147483648;
+; CHECK-DAG:  and.b32         [[BX1:%r[0-9]+]], [[BI1]], -2147483648;
+; CHECK-DAG:  shr.u32         [[BY0:%r[0-9]+]], [[BX0]], 16;
+; CHECK-DAG:  shr.u32         [[BY1:%r[0-9]+]], [[BX1]], 16;
+; CHECK-DAG:  cvt.u16.u32     [[BZ0:%rs[0-9]+]], [[BY0]];
+; CHECK-DAG:  cvt.u16.u32     [[BZ1:%rs[0-9]+]], [[BY1]];
+; CHECK-DAG:  or.b16          [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]];
+; CHECK-DAG:  or.b16          [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]];
+; CHECK-DAG:  mov.b16         [[R0:%h[0-9]+]], [[RS0]];
+; CHECK-DAG:  mov.b16         [[R1:%h[0-9]+]], [[RS1]];
+; CHECK-DAG:  mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 {
+  %tb = fptrunc <2 x float> %b to <2 x half>
+  %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb)
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_copysign_f64(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_copysign_f64_param_0];
+; CHECK-DAG:  ld.param.v2.f64 {[[B0:%fd[0-9]+]], [[B1:%fd[0-9]+]]}, [test_copysign_f64_param_1];
+; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b16         [[AS0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG:  mov.b16         [[AS1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG:  mov.b64         [[BI0:%rd[0-9]+]], [[B0]];
+; CHECK-DAG:  mov.b64         [[BI1:%rd[0-9]+]], [[B1]];
+; CHECK-DAG:  and.b16         [[AI0:%rs[0-9]+]], [[AS0]], 32767;
+; CHECK-DAG:  and.b16         [[AI1:%rs[0-9]+]], [[AS1]], 32767;
+; CHECK-DAG:  and.b64         [[BX0:%rd[0-9]+]], [[BI0]], -9223372036854775808;
+; CHECK-DAG:  and.b64         [[BX1:%rd[0-9]+]], [[BI1]], -9223372036854775808;
+; CHECK-DAG:  shr.u64         [[BY0:%rd[0-9]+]], [[BX0]], 48;
+; CHECK-DAG:  shr.u64         [[BY1:%rd[0-9]+]], [[BX1]], 48;
+; CHECK-DAG:  cvt.u16.u64     [[BZ0:%rs[0-9]+]], [[BY0]];
+; CHECK-DAG:  cvt.u16.u64     [[BZ1:%rs[0-9]+]], [[BY1]];
+; CHECK-DAG:  or.b16          [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]];
+; CHECK-DAG:  or.b16          [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]];
+; CHECK-DAG:  mov.b16         [[R0:%h[0-9]+]], [[RS0]];
+; CHECK-DAG:  mov.b16         [[R1:%h[0-9]+]], [[RS1]];
+; CHECK-DAG:  mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 {
+  %tb = fptrunc <2 x double> %b to <2 x half>
+  %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb)
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_copysign_extended(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_copysign_extended_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_copysign_extended_param_1];
+; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG:  mov.b32         {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG:  mov.b16         [[AS0:%rs[0-9]+]], [[A0]];
+; CHECK-DAG:  mov.b16         [[AS1:%rs[0-9]+]], [[A1]];
+; CHECK-DAG:  mov.b16         [[BS0:%rs[0-9]+]], [[B0]];
+; CHECK-DAG:  mov.b16         [[BS1:%rs[0-9]+]], [[B1]];
+; CHECK-DAG:  and.b16         [[AX0:%rs[0-9]+]], [[AS0]], 32767;
+; CHECK-DAG:  and.b16         [[AX1:%rs[0-9]+]], [[AS1]], 32767;
+; CHECK-DAG:  and.b16         [[BX0:%rs[0-9]+]], [[BS0]], -32768;
+; CHECK-DAG:  and.b16         [[BX1:%rs[0-9]+]], [[BS1]], -32768;
+; CHECK-DAG:  or.b16          [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]];
+; CHECK-DAG:  or.b16          [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]];
+; CHECK-DAG:  mov.b16         [[R0:%h[0-9]+]], [[RS0]];
+; CHECK-DAG:  mov.b16         [[R1:%h[0-9]+]], [[RS1]];
+; CHECK-DAG:  mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      mov.b32         {[[RX0:%h[0-9]+]], [[RX1:%h[0-9]+]]}, [[R]]
+; CHECK-DAG:  cvt.f32.f16     [[XR0:%f[0-9]+]], [[RX0]];
+; CHECK-DAG:  cvt.f32.f16     [[XR1:%f[0-9]+]], [[RX1]];
+; CHECK:      st.param.v2.f32 [func_retval0+0], {[[XR0]], [[XR1]]};
+; CHECK:      ret;
+define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 {
+  %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b)
+  %xr = fpext <2 x half> %r to <2 x float>
+  ret <2 x float> %xr
+}
+
+; CHECK-LABEL: test_floor(
+; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_floor_param_0];
+; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG:  cvt.rmi.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.rmi.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_floor(<2 x half> %a) #0 {
+  %r = call <2 x half> @llvm.floor.f16(<2 x half> %a)
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_ceil(
+; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_ceil_param_0];
+; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG:  cvt.rpi.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.rpi.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_ceil(<2 x half> %a) #0 {
+  %r = call <2 x half> @llvm.ceil.f16(<2 x half> %a)
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_trunc(
+; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_trunc_param_0];
+; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG:  cvt.rzi.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.rzi.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_trunc(<2 x half> %a) #0 {
+  %r = call <2 x half> @llvm.trunc.f16(<2 x half> %a)
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_rint(
+; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_rint_param_0];
+; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG:  cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_rint(<2 x half> %a) #0 {
+  %r = call <2 x half> @llvm.rint.f16(<2 x half> %a)
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_nearbyint(
+; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_nearbyint_param_0];
+; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG:  cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_nearbyint(<2 x half> %a) #0 {
+  %r = call <2 x half> @llvm.nearbyint.f16(<2 x half> %a)
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_round(
+; CHECK:      ld.param.b32    [[A:%hh[0-9]+]], [test_round_param_0];
+; CHECK-DAG:  mov.b32         {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]];
+; CHECK-DAG:  cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]];
+; CHECK-DAG:  cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]];
+; CHECK:      mov.b32         [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_round(<2 x half> %a) #0 {
+  %r = call <2 x half> @llvm.round.f16(<2 x half> %a)
+  ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fmuladd(
+; CHECK-DAG:  ld.param.b32    [[A:%hh[0-9]+]], [test_fmuladd_param_0];
+; CHECK-DAG:  ld.param.b32    [[B:%hh[0-9]+]], [test_fmuladd_param_1];
+; CHECK-DAG:  ld.param.b32    [[C:%hh[0-9]+]], [test_fmuladd_param_2];
+;
+; CHECK-F16:        fma.rn.f16x2   [[R:%hh[0-9]+]], [[A]], [[B]], [[C]];
+;
+; CHECK-NOF16-DAG:  mov.b32        {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG:  mov.b32        {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG:  mov.b32        {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG:  cvt.f32.f16    [[FC1:%f[0-9]+]], [[C1]]
+; CHECK-NOF16-DAG:  fma.rn.f32     [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
+; CHECK-NOF16-DAG:  fma.rn.f32     [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG:  cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16:      mov.b32        [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
+  %r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+  ret <2 x half> %r
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { "unsafe-fp-math" = "true" }
diff --git a/test/CodeGen/NVPTX/fast-math.ll b/test/CodeGen/NVPTX/fast-math.ll
index 08b435b993f5a5a0cd6616d9fb84efe50a4b4d85..56b1f88f3b2eaf99a35c2e0fdee43063d88117e9 100644
--- a/test/CodeGen/NVPTX/fast-math.ll
+++ b/test/CodeGen/NVPTX/fast-math.ll
@@ -1,25 +1,91 @@
 ; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s
 
-declare float @llvm.nvvm.sqrt.f(float)
+declare float @llvm.sqrt.f32(float)
+declare double @llvm.sqrt.f64(double)
 
-; CHECK-LABEL: sqrt_div
+; CHECK-LABEL: sqrt_div(
 ; CHECK: sqrt.rn.f32
 ; CHECK: div.rn.f32
 define float @sqrt_div(float %a, float %b) {
-  %t1 = tail call float @llvm.nvvm.sqrt.f(float %a)
+  %t1 = tail call float @llvm.sqrt.f32(float %a)
   %t2 = fdiv float %t1, %b
   ret float %t2
 }
 
-; CHECK-LABEL: sqrt_div_fast
+; CHECK-LABEL: sqrt_div_fast(
 ; CHECK: sqrt.approx.f32
 ; CHECK: div.approx.f32
 define float @sqrt_div_fast(float %a, float %b) #0 {
-  %t1 = tail call float @llvm.nvvm.sqrt.f(float %a)
+  %t1 = tail call float @llvm.sqrt.f32(float %a)
   %t2 = fdiv float %t1, %b
   ret float %t2
 }
 
+; CHECK-LABEL: sqrt_div_ftz(
+; CHECK: sqrt.rn.ftz.f32
+; CHECK: div.rn.ftz.f32
+define float @sqrt_div_ftz(float %a, float %b) #1 {
+  %t1 = tail call float @llvm.sqrt.f32(float %a)
+  %t2 = fdiv float %t1, %b
+  ret float %t2
+}
+
+; CHECK-LABEL: sqrt_div_fast_ftz(
+; CHECK: sqrt.approx.ftz.f32
+; CHECK: div.approx.ftz.f32
+define float @sqrt_div_fast_ftz(float %a, float %b) #0 #1 {
+  %t1 = tail call float @llvm.sqrt.f32(float %a)
+  %t2 = fdiv float %t1, %b
+  ret float %t2
+}
+
+; There are no fast-math or ftz versions of sqrt and div for f64.  We use
+; reciprocal(rsqrt(x)) for sqrt(x), and emit a vanilla divide.
+
+; CHECK-LABEL: sqrt_div_fast_ftz_f64(
+; CHECK: rsqrt.approx.f64
+; CHECK: rcp.approx.ftz.f64
+; CHECK: div.rn.f64
+define double @sqrt_div_fast_ftz_f64(double %a, double %b) #0 #1 {
+  %t1 = tail call double @llvm.sqrt.f64(double %a)
+  %t2 = fdiv double %t1, %b
+  ret double %t2
+}
+
+; CHECK-LABEL: rsqrt(
+; CHECK-NOT: rsqrt.approx
+; CHECK: sqrt.rn.f32
+; CHECK-NOT: rsqrt.approx
+define float @rsqrt(float %a) {
+  %b = tail call float @llvm.sqrt.f32(float %a)
+  %ret = fdiv float 1.0, %b
+  ret float %ret
+}
+
+; CHECK-LABEL: rsqrt_fast(
+; CHECK-NOT: div.
+; CHECK-NOT: sqrt.
+; CHECK: rsqrt.approx.f32
+; CHECK-NOT: div.
+; CHECK-NOT: sqrt.
+define float @rsqrt_fast(float %a) #0 {
+  %b = tail call float @llvm.sqrt.f32(float %a)
+  %ret = fdiv float 1.0, %b
+  ret float %ret
+}
+
+; CHECK-LABEL: rsqrt_fast_ftz(
+; CHECK-NOT: div.
+; CHECK-NOT: sqrt.
+; CHECK: rsqrt.approx.ftz.f32
+; CHECK-NOT: div.
+; CHECK-NOT: sqrt.
+define float @rsqrt_fast_ftz(float %a) #0 #1 {
+  %b = tail call float @llvm.sqrt.f32(float %a)
+  %ret = fdiv float 1.0, %b
+  ret float %ret
+}
+
 ; CHECK-LABEL: fadd
 ; CHECK: add.rn.f32
 define float @fadd(float %a, float %b) {
@@ -51,5 +117,49 @@ define float @fcos_approx(float %a) #0 {
   ret float %r
 }
 
+; CHECK-LABEL: repeated_div_recip_allowed(
+define float @repeated_div_recip_allowed(i1 %pred, float %a, float %b, float %divisor) {
+; CHECK: rcp.rn.f32
+; CHECK: mul.rn.f32
+; CHECK: mul.rn.f32
+  %x = fdiv arcp float %a, %divisor
+  %y = fdiv arcp float %b, %divisor
+  %z = select i1 %pred, float %x, float %y
+  ret float %z
+}
+
+; CHECK-LABEL: repeated_div_recip_allowed_ftz(
+define float @repeated_div_recip_allowed_ftz(i1 %pred, float %a, float %b, float %divisor) #1 {
+; CHECK: rcp.rn.ftz.f32
+; CHECK: mul.rn.ftz.f32
+; CHECK: mul.rn.ftz.f32
+  %x = fdiv arcp float %a, %divisor
+  %y = fdiv arcp float %b, %divisor
+  %z = select i1 %pred, float %x, float %y
+  ret float %z
+}
+
+; CHECK-LABEL: repeated_div_fast(
+define float @repeated_div_fast(i1 %pred, float %a, float %b, float %divisor) #0 {
+; CHECK: rcp.approx.f32
+; CHECK: mul.f32
+; CHECK: mul.f32
+  %x = fdiv float %a, %divisor
+  %y = fdiv float %b, %divisor
+  %z = select i1 %pred, float %x, float %y
+  ret float %z
+}
+
+; CHECK-LABEL: repeated_div_fast_ftz(
+define float @repeated_div_fast_ftz(i1 %pred, float %a, float %b, float %divisor) #0 #1 {
+; CHECK: rcp.approx.ftz.f32
+; CHECK: mul.ftz.f32
+; CHECK: mul.ftz.f32
+  %x = fdiv float %a, %divisor
+  %y = fdiv float %b, %divisor
+  %z = select i1 %pred, float %x, float %y
+  ret float %z
+}
+
 attributes #0 = { "unsafe-fp-math" = "true" }
 attributes #1 = { "nvptx-f32ftz" = "true" }
diff --git a/test/CodeGen/NVPTX/fma-assoc.ll b/test/CodeGen/NVPTX/fma-assoc.ll
index 80a08a86316c86f0d0ef7ca9e2412341943b8258..df86d476efdce18b00c273674d1b87ca4111d515 100644
--- a/test/CodeGen/NVPTX/fma-assoc.ll
+++ b/test/CodeGen/NVPTX/fma-assoc.ll
@@ -1,9 +1,10 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s -check-prefix=CHECK
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -enable-unsafe-fp-math | FileCheck %s -check-prefix=CHECK -check-prefix=CHECK-UNSAFE
 
 define ptx_device float @t1_f32(float %x, float %y, float %z,
                                 float %u, float %v) {
-; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
-; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK-UNSAFE: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
+; CHECK-UNSAFE: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}};
 ; CHECK: ret;
   %a = fmul float %x, %y
   %b = fmul float %u, %v
@@ -14,8 +15,8 @@ define ptx_device float @t1_f32(float %x, float %y, float %z,
 
 define ptx_device double @t1_f64(double %x, double %y, double %z,
                                  double %u, double %v) {
-; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
-; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK-UNSAFE: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
+; CHECK-UNSAFE: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}};
 ; CHECK: ret;
   %a = fmul double %x, %y
   %b = fmul double %u, %v
diff --git a/test/CodeGen/NVPTX/ldg-invariant.ll b/test/CodeGen/NVPTX/ldg-invariant.ll
index 40dad1f1769bafcbb3c8983ba0a7e3e77a101738..311bea6f41645a1abdbb75d5a296e8bddfe25597 100644
--- a/test/CodeGen/NVPTX/ldg-invariant.ll
+++ b/test/CodeGen/NVPTX/ldg-invariant.ll
@@ -10,6 +10,30 @@ define i32 @ld_global(i32 addrspace(1)* %ptr) {
   ret i32 %a
 }
 
+; CHECK-LABEL: @ld_global_v2i32
+define i32 @ld_global_v2i32(<2 x i32> addrspace(1)* %ptr) {
+; CHECK: ld.global.nc.v2.{{[a-z]}}32
+  %a = load <2 x i32>, <2 x i32> addrspace(1)* %ptr, !invariant.load !0
+  %v1 = extractelement <2 x i32> %a, i32 0
+  %v2 = extractelement <2 x i32> %a, i32 1
+  %sum = add i32 %v1, %v2
+  ret i32 %sum
+}
+
+; CHECK-LABEL: @ld_global_v4i32
+define i32 @ld_global_v4i32(<4 x i32> addrspace(1)* %ptr) {
+; CHECK: ld.global.nc.v4.{{[a-z]}}32
+  %a = load <4 x i32>, <4 x i32> addrspace(1)* %ptr, !invariant.load !0
+  %v1 = extractelement <4 x i32> %a, i32 0
+  %v2 = extractelement <4 x i32> %a, i32 1
+  %v3 = extractelement <4 x i32> %a, i32 2
+  %v4 = extractelement <4 x i32> %a, i32 3
+  %sum1 = add i32 %v1, %v2
+  %sum2 = add i32 %v3, %v4
+  %sum3 = add i32 %sum1, %sum2
+  ret i32 %sum3
+}
+
 ; CHECK-LABEL: @ld_not_invariant
 define i32 @ld_not_invariant(i32 addrspace(1)* %ptr) {
 ; CHECK: ld.global.{{[a-z]}}32
diff --git a/test/CodeGen/NVPTX/ldparam-v4.ll b/test/CodeGen/NVPTX/ldparam-v4.ll
index ec306aafe85499dd39c108d9987918c3bd647f6a..4d082f6e9a58d456e984a31466466833b113e753 100644
--- a/test/CodeGen/NVPTX/ldparam-v4.ll
+++ b/test/CodeGen/NVPTX/ldparam-v4.ll
@@ -2,8 +2,11 @@
 
 declare <4 x float> @bar()
 
+; CHECK-LABEL: .func foo(
 define void @foo(<4 x float>* %ptr) {
-; CHECK: ld.param.v4.f32
+; CHECK:     ld.param.u32 %[[PTR:r[0-9]+]], [foo_param_0];
+; CHECK:     ld.param.v4.f32 {[[E0:%f[0-9]+]], [[E1:%f[0-9]+]], [[E2:%f[0-9]+]], [[E3:%f[0-9]+]]}, [retval0+0];
+; CHECK:     st.v4.f32    [%[[PTR]]], {[[E0]], [[E1]], [[E2]], [[E3]]}
   %val = tail call <4 x float> @bar()
   store <4 x float> %val, <4 x float>* %ptr
   ret void
diff --git a/test/CodeGen/NVPTX/lower-aggr-copies.ll b/test/CodeGen/NVPTX/lower-aggr-copies.ll
index ef570982b8081f7474382b292fd49f6f0942554b..192d4becb05963fdb8fe36e1d30bcdbc15e23e77 100644
--- a/test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ b/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 | FileCheck %s --check-prefix PTX
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s --check-prefix PTX
 ; RUN: opt < %s -S -nvptx-lower-aggr-copies | FileCheck %s --check-prefix IR
 
 ; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to
@@ -27,9 +27,9 @@ entry:
 ; PTX:        LBB[[LABEL:[_0-9]+]]:
 ; PTX:        ld.u8 %rs[[REG:[0-9]+]]
 ; PTX:        st.u8 [%rd{{[0-9]+}}], %rs[[REG]]
-; PTX:        add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
-; PTX-NEXT:   setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
-; PTX-NEXT:   @%p[[PRED]] bra LBB[[LABEL]]
+; PTX:        add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
+; PTX:        setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
+; PTX:        @%p[[PRED]] bra LBB[[LABEL]]
 }
 
 define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 {
@@ -45,9 +45,9 @@ entry:
 ; PTX:        LBB[[LABEL:[_0-9]+]]:
 ; PTX:        ld.volatile.u8 %rs[[REG:[0-9]+]]
 ; PTX:        st.volatile.u8 [%rd{{[0-9]+}}], %rs[[REG]]
-; PTX:        add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
-; PTX-NEXT:   setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
-; PTX-NEXT:   @%p[[PRED]] bra LBB[[LABEL]]
+; PTX:        add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
+; PTX:        setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
+; PTX:        @%p[[PRED]] bra LBB[[LABEL]]
 }
 
 define i8* @memcpy_casting_caller(i32* %dst, i32* %src, i64 %n) #0 {
@@ -78,12 +78,26 @@ entry:
 ; IR-NEXT:    store i8 [[VAL]], i8* [[STOREPTR]]
 
 ; PTX-LABEL:  .visible .func (.param .b64 func_retval0) memset_caller(
-; PTX:        ld.param.u8 %rs[[REG:[0-9]+]]
+; PTX:        ld.param.u32 %r[[C:[0-9]+]]
+; PTX:        cvt.u16.u32  %rs[[REG:[0-9]+]], %r[[C]];
 ; PTX:        LBB[[LABEL:[_0-9]+]]:
 ; PTX:        st.u8 [%rd{{[0-9]+}}], %rs[[REG]]
-; PTX:        add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
-; PTX-NEXT:   setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
-; PTX-NEXT:   @%p[[PRED]] bra LBB[[LABEL]]
+; PTX:        add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
+; PTX:        setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
+; PTX:        @%p[[PRED]] bra LBB[[LABEL]]
+}
+
+define i8* @volatile_memset_caller(i8* %dst, i32 %c, i64 %n) #0 {
+entry:
+  %0 = trunc i32 %c to i8
+  tail call void @llvm.memset.p0i8.i64(i8* %dst, i8 %0, i64 %n, i32 1, i1 true)
+  ret i8* %dst
+
+; IR-LABEL:   @volatile_memset_caller
+; IR:         [[VAL:%[0-9]+]] = trunc i32 %c to i8
+; IR:         loadstoreloop:
+; IR:         [[STOREPTR:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64
+; IR-NEXT:    store volatile i8 [[VAL]], i8* [[STOREPTR]]
 }
 
 define i8* @memmove_caller(i8* %dst, i8* %src, i64 %n) #0 {
@@ -100,12 +114,12 @@ entry:
 
 ; PTX-LABEL:  .visible .func (.param .b64 func_retval0) memmove_caller(
 ; PTX:        ld.param.u64 %rd[[N:[0-9]+]]
-; PTX:        setp.eq.s64 %p[[NEQ0:[0-9]+]], %rd[[N]], 0
-; PTX:        setp.ge.u64 %p[[SRC_GT_THAN_DST:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
+; PTX-DAG:    setp.eq.s64 %p[[NEQ0:[0-9]+]], %rd[[N]], 0
+; PTX-DAG:    setp.ge.u64 %p[[SRC_GT_THAN_DST:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
 ; PTX-NEXT:   @%p[[SRC_GT_THAN_DST]] bra LBB[[FORWARD_BB:[0-9_]+]]
 ; -- this is the backwards copying BB
 ; PTX:        @%p[[NEQ0]] bra LBB[[EXIT:[0-9_]+]]
-; PTX:        add.s64 %rd[[N]], %rd[[N]], -1
+; PTX:        add.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, -1
 ; PTX:        ld.u8 %rs[[ELEMENT:[0-9]+]]
 ; PTX:        st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT]]
 ; -- this is the forwards copying BB
@@ -113,7 +127,7 @@ entry:
 ; PTX:        @%p[[NEQ0]] bra LBB[[EXIT]]
 ; PTX:        ld.u8 %rs[[ELEMENT2:[0-9]+]]
 ; PTX:        st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT2]]
-; PTX:        add.s64 %rd[[INDEX:[0-9]+]], %rd[[INDEX]], 1
+; PTX:        add.s64 %rd{{[0-9]+}}, %rd{{[0-9]+}}, 1
 ; -- exit block
 ; PTX:        LBB[[EXIT]]:
 ; PTX-NEXT:   st.param.b64 [func_retval0
diff --git a/test/CodeGen/NVPTX/misaligned-vector-ldst.ll b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
index 2ad72b018851bd2b0592b0917a5f095c59744d8b..036d9638ceac7a954bf73c8a51b69be67a56c5b2 100644
--- a/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
+++ b/test/CodeGen/NVPTX/misaligned-vector-ldst.ll
@@ -41,6 +41,64 @@ define <4 x float> @t4(i8* %p1) {
   ret <4 x float> %r
 }
 
+; CHECK-LABEL: .visible .func test_v1halfp0a1(
+; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_v1halfp0a1_param_0];
+; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_v1halfp0a1_param_1];
+; CHECK-DAG: ld.u8        [[B0:%r[sd]?[0-9]+]], [%[[FROM]]]
+; CHECK-DAG: st.u8        [%[[TO]]], [[B0]]
+; CHECK-DAG: ld.u8        [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
+; CHECK-DAG: st.u8        [%[[TO]]+1], [[B1]]
+; CHECK: ret
+define void @test_v1halfp0a1(<1 x half> * noalias readonly %from, <1 x half> * %to) {
+  %1 = load <1 x half>, <1 x half> * %from , align 1
+  store <1 x half> %1, <1 x half> * %to , align 1
+  ret void
+}
+
+; CHECK-LABEL: .visible .func test_v2halfp0a1(
+; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_v2halfp0a1_param_0];
+; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_v2halfp0a1_param_1];
+; CHECK-DAG: ld.u8        [[B0:%r[sd]?[0-9]+]], [%[[FROM]]]
+; CHECK-DAG: st.u8        [%[[TO]]],
+; CHECK-DAG: ld.u8        [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
+; CHECK-DAG: st.u8        [%[[TO]]+1],
+; CHECK-DAG: ld.u8        [[B2:%r[sd]?[0-9]+]], [%[[FROM]]+2]
+; CHECK-DAG: st.u8        [%[[TO]]+2],
+; CHECK-DAG: ld.u8        [[B3:%r[sd]?[0-9]+]], [%[[FROM]]+3]
+; CHECK-DAG: st.u8        [%[[TO]]+3],
+; CHECK: ret
+define void @test_v2halfp0a1(<2 x half> * noalias readonly %from, <2 x half> * %to) {
+  %1 = load <2 x half>, <2 x half> * %from , align 1
+  store <2 x half> %1, <2 x half> * %to , align 1
+  ret void
+}
+
+; CHECK-LABEL: .visible .func test_v4halfp0a1(
+; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_v4halfp0a1_param_0];
+; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_v4halfp0a1_param_1];
+; CHECK-DAG: ld.u8        [[B0:%r[sd]?[0-9]+]], [%[[FROM]]]
+; CHECK-DAG: st.u8        [%[[TO]]], [[B0]]
+; CHECK-DAG: ld.u8        [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
+; CHECK-DAG: st.u8        [%[[TO]]+1], [[B1]]
+; CHECK-DAG: ld.u8        [[B2:%r[sd]?[0-9]+]], [%[[FROM]]+2]
+; CHECK-DAG: st.u8        [%[[TO]]+2], [[B2]]
+; CHECK-DAG: ld.u8        [[B3:%r[sd]?[0-9]+]], [%[[FROM]]+3]
+; CHECK-DAG: st.u8        [%[[TO]]+3], [[B3]]
+; CHECK-DAG: ld.u8        [[B4:%r[sd]?[0-9]+]], [%[[FROM]]+4]
+; CHECK-DAG: st.u8        [%[[TO]]+4], [[B4]]
+; CHECK-DAG: ld.u8        [[B5:%r[sd]?[0-9]+]], [%[[FROM]]+5]
+; CHECK-DAG: st.u8        [%[[TO]]+5], [[B5]]
+; CHECK-DAG: ld.u8        [[B6:%r[sd]?[0-9]+]], [%[[FROM]]+6]
+; CHECK-DAG: st.u8        [%[[TO]]+6], [[B6]]
+; CHECK-DAG: ld.u8        [[B7:%r[sd]?[0-9]+]], [%[[FROM]]+7]
+; CHECK-DAG: st.u8        [%[[TO]]+7], [[B7]]
+; CHECK: ret
+define void @test_v4halfp0a1(<4 x half> * noalias readonly %from, <4 x half> * %to) {
+  %1 = load <4 x half>, <4 x half> * %from , align 1
+  store <4 x half> %1, <4 x half> * %to , align 1
+  ret void
+}
+
 
 ; CHECK-LABEL: s1
 define void @s1(<4 x float>* %p1, <4 x float> %v) {
diff --git a/test/CodeGen/NVPTX/param-load-store.ll b/test/CodeGen/NVPTX/param-load-store.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8a67567acc966316469351864d58a281ec6b2859
--- /dev/null
+++ b/test/CodeGen/NVPTX/param-load-store.ll
@@ -0,0 +1,939 @@
+; Verifies correctness of load/store of parameters and return values.
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s
+
+%s_i1 = type { i1 }
+%s_i8 = type { i8 }
+%s_i16 = type { i16 }
+%s_f16 = type { half }
+%s_i32 = type { i32 }
+%s_f32 = type { float }
+%s_i64 = type { i64 }
+%s_f64 = type { double }
+
+; More complicated types. i64 is used to increase natural alignment
+; requirement for the type.
+%s_i32x4 = type { i32, i32, i32, i32, i64}
+%s_i32f32 = type { i32, float, i32, float, i64}
+%s_i8i32x4 = type { i32, i32, i8, i32, i32, i64}
+%s_i8i32x4p = type <{ i32, i32, i8, i32, i32, i64}>
+%s_crossfield = type { i32, [2 x i32], <4 x i32>, [3 x {i32, i32, i32}]}
+; All scalar parameters must be at least 32 bits in size.
+; i1 is loaded/stored as i8.
+
+; CHECK: .func  (.param .b32 func_retval0)
+; CHECK-LABEL: test_i1(
+; CHECK-NEXT: .param .b32 test_i1_param_0
+; CHECK:      ld.param.u8 [[A8:%r[0-9]+]], [test_i1_param_0];
+; CHECK:      and.b32 [[A:%r[0-9]+]], [[A8]], 1;
+; CHECK:      .param .b32 param0;
+; CHECK:      st.param.b32    [param0+0], [[A]]
+; CHECK:      .param .b32 retval0;
+; CHECK:      call.uni
+; CHECK-NEXT: test_i1,
+; CHECK:      ld.param.b32    [[R8:%r[0-9]+]], [retval0+0];
+; CHECK:      and.b32         [[R:%r[0-9]+]], [[R8]], 1;
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK:      ret;
+define i1 @test_i1(i1 %a) {
+  %r = tail call i1 @test_i1(i1 %a);
+  ret i1 %r;
+}
+
+; Signed i1 is a somewhat special case. We only care about one bit and
+; then use neg.s32 to convert it to 32-bit -1 if it's set.
+; CHECK: .func  (.param .b32 func_retval0)
+; CHECK-LABEL: test_i1s(
+; CHECK-NEXT: .param .b32 test_i1s_param_0
+; CHECK:      ld.param.u8 [[A8:%rs[0-9]+]], [test_i1s_param_0];
+; CHECK:      cvt.u32.u16     [[A32:%r[0-9]+]], [[A8]];
+; CHECK:      and.b32         [[A1:%r[0-9]+]], [[A32]], 1;
+; CHECK:      neg.s32         [[A:%r[0-9]+]], [[A1]];
+; CHECK:      .param .b32 param0;
+; CHECK:      st.param.b32    [param0+0], [[A]];
+; CHECK:      .param .b32 retval0;
+; CHECK:      call.uni
+; CHECK:      ld.param.b32    [[R8:%r[0-9]+]], [retval0+0];
+; CHECK:      and.b32         [[R1:%r[0-9]+]], [[R8]], 1;
+; CHECK:      neg.s32         [[R:%r[0-9]+]], [[R1]];
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define signext i1 @test_i1s(i1 signext %a) {
+       %r = tail call signext i1 @test_i1s(i1 signext %a);
+       ret i1 %r;
+}
+
+; Make sure that i1 loads are vectorized as i8 loads, respecting each element alignment.
+; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v3i1(
+; CHECK-NEXT: .param .align 4 .b8 test_v3i1_param_0[4]
+; CHECK-DAG:  ld.param.u8     [[E2:%rs[0-9]+]], [test_v3i1_param_0+2];
+; CHECK-DAG:  ld.param.v2.u8  {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i1_param_0]
+; CHECK:      .param .align 4 .b8 param0[4];
+; CHECK-DAG:  st.param.v2.b8  [param0+0], {[[E0]], [[E1]]};
+; CHECK-DAG:  st.param.b8     [param0+2], [[E2]];
+; CHECK:      .param .align 4 .b8 retval0[4];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_v3i1,
+; CHECK-DAG:  ld.param.v2.b8  {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG:  ld.param.b8     [[RE2:%rs[0-9]+]], [retval0+2];
+; CHECK-DAG:  st.param.v2.b8  [func_retval0+0], {[[RE0]], [[RE1]]}
+; CHECK-DAG:  st.param.b8     [func_retval0+2], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i1> @test_v3i1(<3 x i1> %a) {
+       %r = tail call <3 x i1> @test_v3i1(<3 x i1> %a);
+       ret <3 x i1> %r;
+}
+
+; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v4i1(
+; CHECK-NEXT: .param .align 4 .b8 test_v4i1_param_0[4]
+; CHECK:      ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i1_param_0]
+; CHECK:      .param .align 4 .b8 param0[4];
+; CHECK:      st.param.v4.b8  [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK:      .param .align 4 .b8 retval0[4];
+; CHECK:      call.uni (retval0),
+; CHECK:      test_v4i1,
+; CHECK:      ld.param.v4.b8  {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK:      st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]};
+; CHECK-NEXT: ret;
+define <4 x i1> @test_v4i1(<4 x i1> %a) {
+       %r = tail call <4 x i1> @test_v4i1(<4 x i1> %a);
+       ret <4 x i1> %r;
+}
+
+; CHECK: .func  (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v5i1(
+; CHECK-NEXT: .param .align 8 .b8 test_v5i1_param_0[8]
+; CHECK-DAG:  ld.param.u8     [[E4:%rs[0-9]+]], [test_v5i1_param_0+4];
+; CHECK-DAG:  ld.param.v4.u8  {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i1_param_0]
+; CHECK:      .param .align 8 .b8 param0[8];
+; CHECK-DAG:  st.param.v4.b8  [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG:  st.param.b8     [param0+4], [[E4]];
+; CHECK:      .param .align 8 .b8 retval0[8];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_v5i1,
+; CHECK-DAG:  ld.param.v4.b8  {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG:  ld.param.b8     [[RE4:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG:  st.param.v4.b8  [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG:  st.param.b8     [func_retval0+4], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i1> @test_v5i1(<5 x i1> %a) {
+       %r = tail call <5 x i1> @test_v5i1(<5 x i1> %a);
+       ret <5 x i1> %r;
+}
+
+; Unsigned i8 is loaded into a 16-bit register, zero-extended to 32 bits, and masked to its low 8 bits.
+; CHECK: .func  (.param .b32 func_retval0)
+; CHECK-LABEL: test_i8(
+; CHECK-NEXT: .param .b32 test_i8_param_0
+; CHECK:      ld.param.u8 [[A8:%rs[0-9]+]], [test_i8_param_0];
+; CHECK:      cvt.u32.u16     [[A32:%r[0-9]+]], [[A8]];
+; CHECK:      and.b32         [[A:%r[0-9]+]], [[A32]], 255;
+; CHECK:      .param .b32 param0;
+; CHECK:      st.param.b32    [param0+0], [[A]];
+; CHECK:      .param .b32 retval0;
+; CHECK:      call.uni (retval0),
+; CHECK:      test_i8,
+; CHECK:      ld.param.b32    [[R32:%r[0-9]+]], [retval0+0];
+; CHECK:      and.b32         [[R:%r[0-9]+]], [[R32]], 255;
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i8 @test_i8(i8 %a) {
+       %r = tail call i8 @test_i8(i8 %a);
+       ret i8 %r;
+}
+
+; signed i8 is loaded into 16-bit register which is then sign-extended to i32.
+; CHECK: .func  (.param .b32 func_retval0)
+; CHECK-LABEL: test_i8s(
+; CHECK-NEXT: .param .b32 test_i8s_param_0
+; CHECK:      ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0];
+; CHECK:      cvt.s32.s16     [[A:%r[0-9]+]], [[A8]];
+; CHECK:      .param .b32 param0;
+; CHECK:      st.param.b32    [param0+0], [[A]];
+; CHECK:      .param .b32 retval0;
+; CHECK:      call.uni (retval0),
+; CHECK:      test_i8s,
+; CHECK:      ld.param.b32    [[R32:%r[0-9]+]], [retval0+0];
+; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ?
+; CHECK:      cvt.u16.u32     [[R16:%rs[0-9]+]], [[R32]];
+; CHECK:      cvt.s32.s16     [[R:%r[0-9]+]], [[R16]];
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define signext i8 @test_i8s(i8 signext %a) {
+       %r = tail call signext i8 @test_i8s(i8 signext %a);
+       ret i8 %r;
+}
+
+; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v3i8(
+; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4]
+; CHECK-DAG:  ld.param.u8     [[E2:%rs[0-9]+]], [test_v3i8_param_0+2];
+; CHECK-DAG:  ld.param.v2.u8  {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i8_param_0];
+; CHECK:      .param .align 4 .b8 param0[4];
+; CHECK:      st.param.v2.b8  [param0+0], {[[E0]], [[E1]]};
+; CHECK:      st.param.b8     [param0+2], [[E2]];
+; CHECK:      .param .align 4 .b8 retval0[4];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_v3i8,
+; CHECK-DAG:  ld.param.v2.b8  {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG:  ld.param.b8     [[RE2:%rs[0-9]+]], [retval0+2];
+; CHECK-DAG:  st.param.v2.b8  [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG:  st.param.b8     [func_retval0+2], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i8> @test_v3i8(<3 x i8> %a) {
+       %r = tail call <3 x i8> @test_v3i8(<3 x i8> %a);
+       ret <3 x i8> %r;
+}
+
+; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v4i8(
+; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4]
+; CHECK:      ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i8_param_0]
+; CHECK:      .param .align 4 .b8 param0[4];
+; CHECK:      st.param.v4.b8  [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK:      .param .align 4 .b8 retval0[4];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_v4i8,
+; CHECK:      ld.param.v4.b8  {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK:      st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i8> @test_v4i8(<4 x i8> %a) {
+       %r = tail call <4 x i8> @test_v4i8(<4 x i8> %a);
+       ret <4 x i8> %r;
+}
+
+; CHECK: .func  (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v5i8(
+; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8]
+; CHECK-DAG:  ld.param.u8     [[E4:%rs[0-9]+]], [test_v5i8_param_0+4];
+; CHECK-DAG:  ld.param.v4.u8  {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i8_param_0]
+; CHECK:      .param .align 8 .b8 param0[8];
+; CHECK-DAG:  st.param.v4.b8  [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG:  st.param.b8     [param0+4], [[E4]];
+; CHECK:      .param .align 8 .b8 retval0[8];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_v5i8,
+; CHECK-DAG:  ld.param.v4.b8  {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG:  ld.param.b8     [[RE4:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG:  st.param.v4.b8  [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG:  st.param.b8     [func_retval0+4], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i8> @test_v5i8(<5 x i8> %a) {
+       %r = tail call <5 x i8> @test_v5i8(<5 x i8> %a);
+       ret <5 x i8> %r;
+}
+
+; CHECK: .func  (.param .b32 func_retval0)
+; CHECK-LABEL: test_i16(
+; CHECK-NEXT: .param .b32 test_i16_param_0
+; CHECK:      ld.param.u16    [[E16:%rs[0-9]+]], [test_i16_param_0];
+; CHECK:      cvt.u32.u16     [[E32:%r[0-9]+]], [[E16]];
+; CHECK:      .param .b32 param0;
+; CHECK:      st.param.b32    [param0+0], [[E32]];
+; CHECK:      .param .b32 retval0;
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_i16,
+; CHECK:      ld.param.b32    [[RE32:%r[0-9]+]], [retval0+0];
+; CHECK:      and.b32         [[R:%r[0-9]+]], [[RE32]], 65535;
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i16 @test_i16(i16 %a) {
+       %r = tail call i16 @test_i16(i16 %a);
+       ret i16 %r;
+}
+
+; CHECK: .func  (.param .b32 func_retval0)
+; CHECK-LABEL: test_i16s(
+; CHECK-NEXT: .param .b32 test_i16s_param_0
+; CHECK:      ld.param.u16    [[E16:%rs[0-9]+]], [test_i16s_param_0];
+; CHECK:      cvt.s32.s16     [[E32:%r[0-9]+]], [[E16]];
+; CHECK:      .param .b32 param0;
+; CHECK:      st.param.b32    [param0+0], [[E32]];
+; CHECK:      .param .b32 retval0;
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_i16s,
+; CHECK:      ld.param.b32    [[RE32:%r[0-9]+]], [retval0+0];
+; CHECK:      cvt.s32.s16     [[R:%r[0-9]+]], [[RE32]];
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define signext i16 @test_i16s(i16 signext %a) {
+       %r = tail call signext i16 @test_i16s(i16 signext %a);
+       ret i16 %r;
+}
+
+; CHECK: .func  (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v3i16(
+; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8]
+; CHECK-DAG:  ld.param.u16    [[E2:%rs[0-9]+]], [test_v3i16_param_0+4];
+; CHECK-DAG:  ld.param.v2.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i16_param_0];
+; CHECK:      .param .align 8 .b8 param0[8];
+; CHECK:      st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
+; CHECK:      st.param.b16    [param0+4], [[E2]];
+; CHECK:      .param .align 8 .b8 retval0[8];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_v3i16,
+; CHECK:      ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
+; CHECK:      ld.param.b16    [[RE2:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG:  st.param.v2.b16 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG:  st.param.b16    [func_retval0+4], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i16> @test_v3i16(<3 x i16> %a) {
+       %r = tail call <3 x i16> @test_v3i16(<3 x i16> %a);
+       ret <3 x i16> %r;
+}
+
+; CHECK: .func  (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v4i16(
+; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8]
+; CHECK:      ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i16_param_0]
+; CHECK:      .param .align 8 .b8 param0[8];
+; CHECK:      st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK:      .param .align 8 .b8 retval0[8];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_v4i16,
+; CHECK:      ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK:      st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i16> @test_v4i16(<4 x i16> %a) {
+       %r = tail call <4 x i16> @test_v4i16(<4 x i16> %a);
+       ret <4 x i16> %r;
+}
+
+; CHECK: .func  (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v5i16(
+; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16]
+; CHECK-DAG:  ld.param.u16    [[E4:%rs[0-9]+]], [test_v5i16_param_0+8];
+; CHECK-DAG:  ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0]
+; CHECK:      .param .align 16 .b8 param0[16];
+; CHECK-DAG:  st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG:  st.param.b16    [param0+8], [[E4]];
+; CHECK:      .param .align 16 .b8 retval0[16];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_v5i16,
+; CHECK-DAG:  ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG:  ld.param.b16    [[RE4:%rs[0-9]+]], [retval0+8];
+; CHECK-DAG:  st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG:  st.param.b16    [func_retval0+8], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i16> @test_v5i16(<5 x i16> %a) {
+       %r = tail call <5 x i16> @test_v5i16(<5 x i16> %a);
+       ret <5 x i16> %r;
+}
+
+; CHECK: .func  (.param .b32 func_retval0)
+; CHECK-LABEL: test_f16(
+; CHECK-NEXT: .param .b32 test_f16_param_0
+; CHECK:      ld.param.b16    [[E:%h[0-9]+]], [test_f16_param_0];
+; CHECK:      .param .b32 param0;
+; CHECK:      st.param.b16    [param0+0], [[E]];
+; CHECK:      .param .b32 retval0;
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_f16,
+; CHECK:      ld.param.b16    [[R:%h[0-9]+]], [retval0+0];
+; CHECK:      st.param.b16    [func_retval0+0], [[R]]
+; CHECK-NEXT: ret;
+define half @test_f16(half %a) {
+       %r = tail call half @test_f16(half %a);
+       ret half %r;
+}
+
+; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v2f16(
+; CHECK-NEXT: .param .align 4 .b8 test_v2f16_param_0[4]
+; CHECK:      ld.param.b32    [[E:%hh[0-9]+]], [test_v2f16_param_0];
+; CHECK:      .param .align 4 .b8 param0[4];
+; CHECK:      st.param.b32    [param0+0], [[E]];
+; CHECK:      .param .align 4 .b8 retval0[4];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_v2f16,
+; CHECK:      ld.param.b32    [[R:%hh[0-9]+]], [retval0+0];
+; CHECK:      st.param.b32    [func_retval0+0], [[R]]
+; CHECK-NEXT: ret;
+define <2 x half> @test_v2f16(<2 x half> %a) {
+       %r = tail call <2 x half> @test_v2f16(<2 x half> %a);
+       ret <2 x half> %r;
+}
+
+; CHECK:.func  (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v3f16(
+; CHECK:      .param .align 8 .b8 test_v3f16_param_0[8]
+; CHECK-DAG:  ld.param.b32    [[HH01:%hh[0-9]+]], [test_v3f16_param_0];
+; CHECK-DAG:  mov.b32         {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]];
+; CHECK-DAG:  ld.param.b16    [[E2:%h[0-9]+]], [test_v3f16_param_0+4];
+; CHECK:      .param .align 8 .b8 param0[8];
+; CHECK-DAG:  st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
+; CHECK-DAG:  st.param.b16    [param0+4], [[E2]];
+; CHECK:      .param .align 8 .b8 retval0[8];
+; CHECK:      call.uni (retval0),
+; CHECK:      test_v3f16,
+; CHECK-DAG:  ld.param.v2.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]]}, [retval0+0];
+; CHECK-DAG:  ld.param.b16    [[R2:%h[0-9]+]], [retval0+4];
+; CHECK-DAG:  st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-DAG:  st.param.b16    [func_retval0+4], [[R2]];
+; CHECK:      ret;
+define <3 x half> @test_v3f16(<3 x half> %a) {
+       %r = tail call <3 x half> @test_v3f16(<3 x half> %a);
+       ret <3 x half> %r;
+}
+
+; CHECK:.func  (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v4f16(
+; CHECK:      .param .align 8 .b8 test_v4f16_param_0[8]
+; CHECK:      ld.param.v2.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0];
+; CHECK-DAG:  mov.b32         [[HH01:%hh[0-9]+]], [[R01]];
+; CHECK-DAG:  mov.b32         [[HH23:%hh[0-9]+]], [[R23]];
+; CHECK:      .param .align 8 .b8 param0[8];
+; CHECK:      st.param.v2.b32 [param0+0], {[[HH01]], [[HH23]]};
+; CHECK:      .param .align 8 .b8 retval0[8];
+; CHECK:      call.uni (retval0),
+; CHECK:      test_v4f16,
+; CHECK:      ld.param.v2.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]]}, [retval0+0];
+; CHECK:      st.param.v2.b32 [func_retval0+0], {[[RH01]], [[RH23]]};
+; CHECK:      ret;
+define <4 x half> @test_v4f16(<4 x half> %a) {
+       %r = tail call <4 x half> @test_v4f16(<4 x half> %a);
+       ret <4 x half> %r;
+}
+
+; CHECK:.func  (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v5f16(
+; CHECK:      .param .align 16 .b8 test_v5f16_param_0[16]
+; CHECK-DAG:  ld.param.v4.b16  {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v5f16_param_0];
+; CHECK-DAG:  mov.b32         {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]];
+; CHECK-DAG:  ld.param.b16    [[E4:%h[0-9]+]], [test_v5f16_param_0+8];
+; CHECK:      .param .align 16 .b8 param0[16];
+; CHECK-DAG:  st.param.v4.b16 [param0+0],
+; CHECK-DAG:  st.param.b16    [param0+8], [[E4]];
+; CHECK:      .param .align 16 .b8 retval0[16];
+; CHECK:      call.uni (retval0),
+; CHECK:      test_v5f16,
+; CHECK-DAG:  ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0];
+; CHECK-DAG:  ld.param.b16    [[R4:%h[0-9]+]], [retval0+8];
+; CHECK-DAG:  st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]};
+; CHECK-DAG:  st.param.b16    [func_retval0+8], [[R4]];
+; CHECK:      ret;
+define <5 x half> @test_v5f16(<5 x half> %a) {
+       %r = tail call <5 x half> @test_v5f16(<5 x half> %a);
+       ret <5 x half> %r;
+}
+
+; CHECK:.func  (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v8f16(
+; CHECK:      .param .align 16 .b8 test_v8f16_param_0[16]
+; CHECK:      ld.param.v4.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0];
+; CHECK-DAG:  mov.b32         [[HH01:%hh[0-9]+]], [[R01]];
+; CHECK-DAG:  mov.b32         [[HH23:%hh[0-9]+]], [[R23]];
+; CHECK-DAG:  mov.b32         [[HH45:%hh[0-9]+]], [[R45]];
+; CHECK-DAG:  mov.b32         [[HH67:%hh[0-9]+]], [[R67]];
+; CHECK:      .param .align 16 .b8 param0[16];
+; CHECK:      st.param.v4.b32 [param0+0], {[[HH01]], [[HH23]], [[HH45]], [[HH67]]};
+; CHECK:      .param .align 16 .b8 retval0[16];
+; CHECK:      call.uni (retval0),
+; CHECK:      test_v8f16,
+; CHECK:      ld.param.v4.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]], [[RH45:%hh[0-9]+]], [[RH67:%hh[0-9]+]]}, [retval0+0];
+; CHECK:      st.param.v4.b32 [func_retval0+0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]};
+; CHECK:      ret;
+define <8 x half> @test_v8f16(<8 x half> %a) {
+       %r = tail call <8 x half> @test_v8f16(<8 x half> %a);
+       ret <8 x half> %r;
+}
+
+; CHECK:.func  (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v9f16(
+; CHECK:      .param .align 32 .b8 test_v9f16_param_0[32]
+; CHECK-DAG:  ld.param.v4.b16  {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v9f16_param_0];
+; CHECK-DAG:  ld.param.v4.b16  {[[E4:%h[0-9]+]], [[E5:%h[0-9]+]], [[E6:%h[0-9]+]], [[E7:%h[0-9]+]]}, [test_v9f16_param_0+8];
+; CHECK-DAG:  ld.param.b16     [[E8:%h[0-9]+]], [test_v9f16_param_0+16];
+; CHECK:      .param .align 32 .b8 param0[32];
+; CHECK-DAG:  st.param.v4.b16 [param0+0],
+; CHECK-DAG:  st.param.v4.b16 [param0+8],
+; CHECK-DAG:  st.param.b16    [param0+16], [[E8]];
+; CHECK:      .param .align 32 .b8 retval0[32];
+; CHECK:      call.uni (retval0),
+; CHECK:      test_v9f16,
+; CHECK-DAG:  ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0];
+; CHECK-DAG:  ld.param.v4.b16 {[[R4:%h[0-9]+]], [[R5:%h[0-9]+]], [[R6:%h[0-9]+]], [[R7:%h[0-9]+]]}, [retval0+8];
+; CHECK-DAG:  ld.param.b16    [[R8:%h[0-9]+]], [retval0+16];
+; CHECK-DAG:  st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]};
+; CHECK-DAG:  st.param.v4.b16 [func_retval0+8], {[[R4]], [[R5]], [[R6]], [[R7]]};
+; CHECK-DAG:  st.param.b16    [func_retval0+16], [[R8]];
+; CHECK:      ret;
+define <9 x half> @test_v9f16(<9 x half> %a) {
+       %r = tail call <9 x half> @test_v9f16(<9 x half> %a);
+       ret <9 x half> %r;
+}
+
+; CHECK: .func  (.param .b32 func_retval0)
+; CHECK-LABEL: test_i32(
+; CHECK-NEXT: .param .b32 test_i32_param_0
+; CHECK:      ld.param.u32    [[E:%r[0-9]+]], [test_i32_param_0];
+; CHECK:      .param .b32 param0;
+; CHECK:      st.param.b32    [param0+0], [[E]];
+; CHECK:      .param .b32 retval0;
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_i32,
+; CHECK:      ld.param.b32    [[R:%r[0-9]+]], [retval0+0];
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i32 @test_i32(i32 %a) {
+       %r = tail call i32 @test_i32(i32 %a);
+       ret i32 %r;
+}
+
+; CHECK: .func  (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v3i32(
+; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16]
+; CHECK-DAG:  ld.param.u32     [[E2:%r[0-9]+]], [test_v3i32_param_0+8];
+; CHECK-DAG:  ld.param.v2.u32  {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0];
+; CHECK:      .param .align 16 .b8 param0[16];
+; CHECK:      st.param.v2.b32  [param0+0], {[[E0]], [[E1]]};
+; CHECK:      st.param.b32     [param0+8], [[E2]];
+; CHECK:      .param .align 16 .b8 retval0[16];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_v3i32,
+; CHECK:      ld.param.v2.b32  {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK:      ld.param.b32     [[RE2:%r[0-9]+]], [retval0+8];
+; CHECK-DAG:  st.param.v2.b32  [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG:  st.param.b32     [func_retval0+8], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i32> @test_v3i32(<3 x i32> %a) {
+       %r = tail call <3 x i32> @test_v3i32(<3 x i32> %a);
+       ret <3 x i32> %r;
+}
+
+; CHECK: .func  (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v4i32(
+; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16]
+; CHECK:      ld.param.v4.u32  {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0]
+; CHECK:      .param .align 16 .b8 param0[16];
+; CHECK:      st.param.v4.b32  [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK:      .param .align 16 .b8 retval0[16];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_v4i32,
+; CHECK:      ld.param.v4.b32  {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
+; CHECK:      st.param.v4.b32  [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i32> @test_v4i32(<4 x i32> %a) {
+       %r = tail call <4 x i32> @test_v4i32(<4 x i32> %a);
+       ret <4 x i32> %r;
+}
+
+; CHECK: .func  (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v5i32(
+; CHECK-NEXT: .param .align 32 .b8 test_v5i32_param_0[32]
+; CHECK-DAG:  ld.param.u32     [[E4:%r[0-9]+]], [test_v5i32_param_0+16];
+; CHECK-DAG:  ld.param.v4.u32  {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0]
+; CHECK:      .param .align 32 .b8 param0[32];
+; CHECK-DAG:  st.param.v4.b32  [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG:  st.param.b32     [param0+16], [[E4]];
+; CHECK:      .param .align 32 .b8 retval0[32];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_v5i32,
+; CHECK-DAG:  ld.param.v4.b32  {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
+; CHECK-DAG:  ld.param.b32     [[RE4:%r[0-9]+]], [retval0+16];
+; CHECK-DAG:  st.param.v4.b32  [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG:  st.param.b32     [func_retval0+16], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i32> @test_v5i32(<5 x i32> %a) {
+       %r = tail call <5 x i32> @test_v5i32(<5 x i32> %a);
+       ret <5 x i32> %r;
+}
+
+; CHECK: .func  (.param .b32 func_retval0)
+; CHECK-LABEL: test_f32(
+; CHECK-NEXT: .param .b32 test_f32_param_0
+; CHECK:      ld.param.f32    [[E:%f[0-9]+]], [test_f32_param_0];
+; CHECK:      .param .b32 param0;
+; CHECK:      st.param.f32    [param0+0], [[E]];
+; CHECK:      .param .b32 retval0;
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_f32,
+; CHECK:      ld.param.f32    [[R:%f[0-9]+]], [retval0+0];
+; CHECK:      st.param.f32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define float @test_f32(float %a) {
+       %r = tail call float @test_f32(float %a);
+       ret float %r;
+}
+
+; CHECK: .func  (.param .b64 func_retval0)
+; CHECK-LABEL: test_i64(
+; CHECK-NEXT: .param .b64 test_i64_param_0
+; CHECK:      ld.param.u64    [[E:%rd[0-9]+]], [test_i64_param_0];
+; CHECK:      .param .b64 param0;
+; CHECK:      st.param.b64    [param0+0], [[E]];
+; CHECK:      .param .b64 retval0;
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_i64,
+; CHECK:      ld.param.b64    [[R:%rd[0-9]+]], [retval0+0];
+; CHECK:      st.param.b64    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i64 @test_i64(i64 %a) {
+       %r = tail call i64 @test_i64(i64 %a);
+       ret i64 %r;
+}
+
+; CHECK: .func  (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v3i64(
+; CHECK-NEXT: .param .align 32 .b8 test_v3i64_param_0[32]
+; CHECK-DAG:  ld.param.u64     [[E2:%rd[0-9]+]], [test_v3i64_param_0+16];
+; CHECK-DAG:  ld.param.v2.u64  {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0];
+; CHECK:      .param .align 32 .b8 param0[32];
+; CHECK:      st.param.v2.b64  [param0+0], {[[E0]], [[E1]]};
+; CHECK:      st.param.b64     [param0+16], [[E2]];
+; CHECK:      .param .align 32 .b8 retval0[32];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_v3i64,
+; CHECK:      ld.param.v2.b64  {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0];
+; CHECK:      ld.param.b64     [[RE2:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG:  st.param.v2.b64  [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG:  st.param.b64     [func_retval0+16], [[RE2]];
+; CHECK-DAG:  st.param.v2.b64  [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG:  st.param.b64     [func_retval0+16], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i64> @test_v3i64(<3 x i64> %a) {
+       %r = tail call <3 x i64> @test_v3i64(<3 x i64> %a);
+       ret <3 x i64> %r;
+}
+
+; For i64, vector loads are limited by PTX to 2 elements.
+; CHECK: .func  (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v4i64(
+; CHECK-NEXT: .param .align 32 .b8 test_v4i64_param_0[32]
+; CHECK-DAG:  ld.param.v2.u64  {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16];
+; CHECK-DAG:  ld.param.v2.u64  {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0];
+; CHECK:      .param .align 32 .b8 param0[32];
+; CHECK:      st.param.v2.b64  [param0+0], {[[E0]], [[E1]]};
+; CHECK:      st.param.v2.b64  [param0+16], {[[E2]], [[E3]]};
+; CHECK:      .param .align 32 .b8 retval0[32];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_v4i64,
+; CHECK:      ld.param.v2.b64  {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0];
+; CHECK:      ld.param.v2.b64  {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16];
+; CHECK-DAG:  st.param.v2.b64  [func_retval0+16], {[[RE2]], [[RE3]]};
+; CHECK-DAG:  st.param.v2.b64  [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-NEXT: ret;
+define <4 x i64> @test_v4i64(<4 x i64> %a) {
+       %r = tail call <4 x i64> @test_v4i64(<4 x i64> %a);
+       ret <4 x i64> %r;
+}
+
+; Aggregates, on the other hand, do not get extended.
+
+; CHECK: .func  (.param .align 1 .b8 func_retval0[1])
+; CHECK-LABEL: test_s_i1(
+; CHECK-NEXT: .align 1 .b8 test_s_i1_param_0[1]
+; CHECK:      ld.param.u8 [[A:%rs[0-9]+]], [test_s_i1_param_0];
+; CHECK:      .param .align 1 .b8 param0[1];
+; CHECK:      st.param.b8    [param0+0], [[A]]
+; CHECK:      .param .align 1 .b8 retval0[1];
+; CHECK:      call.uni
+; CHECK-NEXT: test_s_i1,
+; CHECK:      ld.param.b8    [[R:%rs[0-9]+]], [retval0+0];
+; CHECK:      st.param.b8    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i1 @test_s_i1(%s_i1 %a) {
+       %r = tail call %s_i1 @test_s_i1(%s_i1 %a);
+       ret %s_i1 %r;
+}
+
+; CHECK: .func  (.param .align 1 .b8 func_retval0[1])
+; CHECK-LABEL: test_s_i8(
+; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1]
+; CHECK:      ld.param.u8 [[A:%rs[0-9]+]], [test_s_i8_param_0];
+; CHECK:      .param .align 1 .b8 param0[1];
+; CHECK:      st.param.b8    [param0+0], [[A]]
+; CHECK:      .param .align 1 .b8 retval0[1];
+; CHECK:      call.uni
+; CHECK-NEXT: test_s_i8,
+; CHECK:      ld.param.b8    [[R:%rs[0-9]+]], [retval0+0];
+; CHECK:      st.param.b8    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i8 @test_s_i8(%s_i8 %a) {
+       %r = tail call %s_i8 @test_s_i8(%s_i8 %a);
+       ret %s_i8 %r;
+}
+
+; CHECK: .func  (.param .align 2 .b8 func_retval0[2])
+; CHECK-LABEL: test_s_i16(
+; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2]
+; CHECK:      ld.param.u16 [[A:%rs[0-9]+]], [test_s_i16_param_0];
+; CHECK:      .param .align 2 .b8 param0[2];
+; CHECK:      st.param.b16    [param0+0], [[A]]
+; CHECK:      .param .align 2 .b8 retval0[2];
+; CHECK:      call.uni
+; CHECK-NEXT: test_s_i16,
+; CHECK:      ld.param.b16    [[R:%rs[0-9]+]], [retval0+0];
+; CHECK:      st.param.b16    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i16 @test_s_i16(%s_i16 %a) {
+       %r = tail call %s_i16 @test_s_i16(%s_i16 %a);
+       ret %s_i16 %r;
+}
+
+; CHECK: .func  (.param .align 2 .b8 func_retval0[2])
+; CHECK-LABEL: test_s_f16(
+; CHECK-NEXT: .param .align 2 .b8 test_s_f16_param_0[2]
+; CHECK:      ld.param.b16 [[A:%h[0-9]+]], [test_s_f16_param_0];
+; CHECK:      .param .align 2 .b8 param0[2];
+; CHECK:      st.param.b16    [param0+0], [[A]]
+; CHECK:      .param .align 2 .b8 retval0[2];
+; CHECK:      call.uni
+; CHECK-NEXT: test_s_f16,
+; CHECK:      ld.param.b16    [[R:%h[0-9]+]], [retval0+0];
+; CHECK:      st.param.b16    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_f16 @test_s_f16(%s_f16 %a) {
+       %r = tail call %s_f16 @test_s_f16(%s_f16 %a);
+       ret %s_f16 %r;
+}
+
+; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_s_i32(
+; CHECK-NEXT: .param .align 4 .b8 test_s_i32_param_0[4]
+; CHECK:      ld.param.u32    [[E:%r[0-9]+]], [test_s_i32_param_0];
+; CHECK:      .param .align 4 .b8 param0[4]
+; CHECK:      st.param.b32    [param0+0], [[E]];
+; CHECK:      .param .align 4 .b8 retval0[4];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_s_i32,
+; CHECK:      ld.param.b32    [[R:%r[0-9]+]], [retval0+0];
+; CHECK:      st.param.b32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i32 @test_s_i32(%s_i32 %a) {
+       %r = tail call %s_i32 @test_s_i32(%s_i32 %a);
+       ret %s_i32 %r;
+}
+
+; CHECK: .func  (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_s_f32(
+; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4]
+; CHECK:      ld.param.f32    [[E:%f[0-9]+]], [test_s_f32_param_0];
+; CHECK:      .param .align 4 .b8 param0[4]
+; CHECK:      st.param.f32    [param0+0], [[E]];
+; CHECK:      .param .align 4 .b8 retval0[4];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_s_f32,
+; CHECK:      ld.param.f32    [[R:%f[0-9]+]], [retval0+0];
+; CHECK:      st.param.f32    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_f32 @test_s_f32(%s_f32 %a) {
+       %r = tail call %s_f32 @test_s_f32(%s_f32 %a);
+       ret %s_f32 %r;
+}
+
+; CHECK: .func  (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_s_i64(
+; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8]
+; CHECK:      ld.param.u64    [[E:%rd[0-9]+]], [test_s_i64_param_0];
+; CHECK:      .param .align 8 .b8 param0[8];
+; CHECK:      st.param.b64    [param0+0], [[E]];
+; CHECK:      .param .align 8 .b8 retval0[8];
+; CHECK:      call.uni (retval0),
+; CHECK-NEXT: test_s_i64,
+; CHECK:      ld.param.b64    [[R:%rd[0-9]+]], [retval0+0];
+; CHECK:      st.param.b64    [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i64 @test_s_i64(%s_i64 %a) {
+       %r = tail call %s_i64 @test_s_i64(%s_i64 %a);
+       ret %s_i64 %r;
+}
+
+; Fields that have different types, but identical sizes are not vectorized.
+; CHECK: .func  (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i32f32(
+; CHECK:        .param .align 8 .b8 test_s_i32f32_param_0[24]
+; CHECK-DAG:    ld.param.u64    [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16];
+; CHECK-DAG:    ld.param.f32    [[E3:%f[0-9]+]], [test_s_i32f32_param_0+12];
+; CHECK-DAG:    ld.param.u32    [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8];
+; CHECK-DAG:    ld.param.f32    [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4];
+; CHECK-DAG:    ld.param.u32    [[E0:%r[0-9]+]], [test_s_i32f32_param_0];
+; CHECK:        .param .align 8 .b8 param0[24];
+; CHECK-DAG:    st.param.b32    [param0+0], [[E0]];
+; CHECK-DAG:    st.param.f32    [param0+4], [[E1]];
+; CHECK-DAG:    st.param.b32    [param0+8], [[E2]];
+; CHECK-DAG:    st.param.f32    [param0+12], [[E3]];
+; CHECK-DAG:    st.param.b64    [param0+16], [[E4]];
+; CHECK:        .param .align 8 .b8 retval0[24];
+; CHECK:        call.uni (retval0),
+; CHECK-NEXT:   test_s_i32f32,
+; CHECK-DAG:    ld.param.b32    [[RE0:%r[0-9]+]], [retval0+0];
+; CHECK-DAG:    ld.param.f32    [[RE1:%f[0-9]+]], [retval0+4];
+; CHECK-DAG:    ld.param.b32    [[RE2:%r[0-9]+]], [retval0+8];
+; CHECK-DAG:    ld.param.f32    [[RE3:%f[0-9]+]], [retval0+12];
+; CHECK-DAG:    ld.param.b64    [[RE4:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG:    st.param.b32    [func_retval0+0], [[RE0]];
+; CHECK-DAG:    st.param.f32    [func_retval0+4], [[RE1]];
+; CHECK-DAG:    st.param.b32    [func_retval0+8], [[RE2]];
+; CHECK-DAG:    st.param.f32    [func_retval0+12], [[RE3]];
+; CHECK-DAG:    st.param.b64    [func_retval0+16], [[RE4]];
+; CHECK:        ret;
+define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) {
+       %r = tail call %s_i32f32 @test_s_i32f32(%s_i32f32 %a);
+       ret %s_i32f32 %r;
+}
+
+; We do vectorize consecutive fields with matching types.
+; CHECK:.visible .func  (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i32x4(
+; CHECK:        .param .align 8 .b8 test_s_i32x4_param_0[24]
+; CHECK-DAG:    ld.param.u64    [[RD1:%rd[0-9]+]], [test_s_i32x4_param_0+16];
+; CHECK-DAG:    ld.param.v2.u32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8];
+; CHECK-DAG:    ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0];
+; CHECK:        .param .align 8 .b8 param0[24];
+; CHECK:        st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK:        st.param.v2.b32 [param0+8], {[[E2]], [[E3]]};
+; CHECK:        st.param.b64    [param0+16], [[RD1]];
+; CHECK:        .param .align 8 .b8 retval0[24];
+; CHECK:        call.uni (retval0),
+; CHECK-NEXT:   test_s_i32x4,
+; CHECK:        ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK:        ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8];
+; CHECK:        ld.param.b64    [[RE4:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG:    st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG:    st.param.v2.b32 [func_retval0+8], {[[RE2]], [[RE3]]};
+; CHECK-DAG:    st.param.b64    [func_retval0+16], [[RE4]];
+; CHECK:        ret;
+
+define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) {
+       %r = tail call %s_i32x4 @test_s_i32x4(%s_i32x4 %a);
+       ret %s_i32x4 %r;
+}
+
+; CHECK:.visible .func  (.param .align 8 .b8 func_retval0[32])
+; CHECK-LABEL: test_s_i1i32x4(
+; CHECK:        .param .align 8 .b8 test_s_i1i32x4_param_0[32]
+; CHECK:        ld.param.u64    [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24];
+; CHECK:        ld.param.u32    [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16];
+; CHECK:        ld.param.u32    [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12];
+; CHECK:        ld.param.u8     [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8];
+; CHECK:        ld.param.v2.u32         {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0];
+; CHECK:        .param .align 8 .b8 param0[32];
+; CHECK:        st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK:        st.param.b8     [param0+8], [[E2]];
+; CHECK:        st.param.b32    [param0+12], [[E3]];
+; CHECK:        st.param.b32    [param0+16], [[E4]];
+; CHECK:        st.param.b64    [param0+24], [[E5]];
+; CHECK:        .param .align 8 .b8 retval0[32];
+; CHECK:        call.uni (retval0),
+; CHECK:        test_s_i1i32x4,
+; CHECK:        (
+; CHECK:        param0
+; CHECK:        );
+; CHECK:        ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK:        ld.param.b8     [[RE2:%rs[0-9]+]], [retval0+8];
+; CHECK:        ld.param.b32    [[RE3:%r[0-9]+]], [retval0+12];
+; CHECK:        ld.param.b32    [[RE4:%r[0-9]+]], [retval0+16];
+; CHECK:        ld.param.b64    [[RE5:%rd[0-9]+]], [retval0+24];
+; CHECK:        st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK:        st.param.b8     [func_retval0+8], [[RE2]];
+; CHECK:        st.param.b32    [func_retval0+12], [[RE3]];
+; CHECK:        st.param.b32    [func_retval0+16], [[RE4]];
+; CHECK:        st.param.b64    [func_retval0+24], [[RE5]];
+; CHECK:        ret;
+
+define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) {
+       %r = tail call %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a);
+       ret %s_i8i32x4 %r;
+}
+
+; -- All loads/stores from parameters aligned by one must be done one
+; -- byte at a time.
+; CHECK:.visible .func  (.param .align 1 .b8 func_retval0[25])
+; CHECK-LABEL: test_s_i1i32x4p(
+; CHECK-DAG:        .param .align 1 .b8 test_s_i1i32x4p_param_0[25]
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+24];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+23];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+22];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+21];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+20];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+19];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+18];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+17];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+16];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+15];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+14];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+13];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+12];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+11];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+10];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+9];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+8];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+7];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+6];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+5];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+4];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+3];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+2];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0+1];
+; CHECK-DAG:        ld.param.u8     %r{{.*}}, [test_s_i1i32x4p_param_0];
+; --- TODO
+; --- Unaligned parameter store/ return value load is broken in both nvcc
+; --- and llvm and needs to be fixed.
+; CHECK:        .param .align 1 .b8 param0[25];
+; CHECK-DAG:        st.param.b32    [param0+0],
+; CHECK-DAG:        st.param.b32    [param0+4],
+; CHECK-DAG:        st.param.b8     [param0+8],
+; CHECK-DAG:        st.param.b32    [param0+9],
+; CHECK-DAG:        st.param.b32    [param0+13],
+; CHECK-DAG:        st.param.b64    [param0+17],
+; CHECK:            .param .align 1 .b8 retval0[25];
+; CHECK:            call.uni (retval0),
+; CHECK-NEXT:       test_s_i1i32x4p,
+; CHECK-DAG:        ld.param.b32    %r{{[0-9]+}}, [retval0+0];
+; CHECK-DAG:        ld.param.b32    %r{{[0-9]+}}, [retval0+4];
+; CHECK-DAG:        ld.param.b8     %rs{{[0-9]+}}, [retval0+8];
+; CHECK-DAG:        ld.param.b32    %r{{[0-9]+}}, [retval0+9];
+; CHECK-DAG:        ld.param.b32    %r{{[0-9]+}}, [retval0+13];
+; CHECK-DAG:        ld.param.b64    %rd{{[0-9]+}}, [retval0+17];
+; CHECK-DAG:        st.param.b32    [func_retval0+0],
+; CHECK-DAG:        st.param.b32    [func_retval0+4],
+; CHECK-DAG:        st.param.b8     [func_retval0+8],
+; CHECK-DAG:        st.param.b32    [func_retval0+9],
+; CHECK-DAG:        st.param.b32    [func_retval0+13],
+; CHECK-DAG:        st.param.b64    [func_retval0+17],
+
+define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) {
+       %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a);
+       ret %s_i8i32x4p %r;
+}
+
+; Check that we can vectorize loads that span multiple aggregate fields.
+; CHECK:.visible .func  (.param .align 16 .b8 func_retval0[80])
+; CHECK-LABEL: test_s_crossfield(
+; CHECK:        .param .align 16 .b8 test_s_crossfield_param_0[80]
+; CHECK:        ld.param.u32    [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64];
+; CHECK:        ld.param.v4.u32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48];
+; CHECK:        ld.param.v4.u32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], [[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32];
+; CHECK:        ld.param.v4.u32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16];
+; CHECK:        ld.param.u32    [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8];
+; CHECK:        ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0];
+; CHECK:        .param .align 16 .b8 param0[80];
+; CHECK:        st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK:        st.param.b32    [param0+8], [[E2]];
+; CHECK:        st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]};
+; CHECK:        st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]};
+; CHECK:        st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]};
+; CHECK:        st.param.b32    [param0+64], [[E15]];
+; CHECK:        .param .align 16 .b8 retval0[80];
+; CHECK:        call.uni (retval0),
+; CHECK:        test_s_crossfield,
+; CHECK:        ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK:        ld.param.b32    [[RE2:%r[0-9]+]], [retval0+8];
+; CHECK:        ld.param.v4.b32 {[[RE3:%r[0-9]+]], [[RE4:%r[0-9]+]], [[RE5:%r[0-9]+]], [[RE6:%r[0-9]+]]}, [retval0+16];
+; CHECK:        ld.param.v4.b32 {[[RE7:%r[0-9]+]], [[RE8:%r[0-9]+]], [[RE9:%r[0-9]+]], [[RE10:%r[0-9]+]]}, [retval0+32];
+; CHECK:        ld.param.v4.b32 {[[RE11:%r[0-9]+]], [[RE12:%r[0-9]+]], [[RE13:%r[0-9]+]], [[RE14:%r[0-9]+]]}, [retval0+48];
+; CHECK:        ld.param.b32    [[RE15:%r[0-9]+]], [retval0+64];
+; CHECK:        st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK:        st.param.b32    [func_retval0+8], [[RE2]];
+; CHECK:        st.param.v4.b32 [func_retval0+16], {[[RE3]], [[RE4]], [[RE5]], [[RE6]]};
+; CHECK:        st.param.v4.b32 [func_retval0+32], {[[RE7]], [[RE8]], [[RE9]], [[RE10]]};
+; CHECK:        st.param.v4.b32 [func_retval0+48], {[[RE11]], [[RE12]], [[RE13]], [[RE14]]};
+; CHECK:        st.param.b32    [func_retval0+64], [[RE15]];
+; CHECK:        ret;
+
+define %s_crossfield @test_s_crossfield(%s_crossfield %a) {
+       %r = tail call %s_crossfield @test_s_crossfield(%s_crossfield %a);
+       ret %s_crossfield %r;
+}
diff --git a/test/CodeGen/NVPTX/rsqrt.ll b/test/CodeGen/NVPTX/rsqrt.ll
deleted file mode 100644
index 3a52a493abdd1803fd85c0049dd26ed8a43df53b..0000000000000000000000000000000000000000
--- a/test/CodeGen/NVPTX/rsqrt.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=1 -nvptx-prec-sqrtf32=0 | FileCheck %s
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
-
-declare float @llvm.nvvm.sqrt.f(float)
-
-define float @foo(float %a) {
-; CHECK: rsqrt.approx.f32
-  %val = tail call float @llvm.nvvm.sqrt.f(float %a)
-  %ret = fdiv float 1.0, %val
-  ret float %ret
-}
-  
diff --git a/test/CodeGen/NVPTX/sqrt-approx.ll b/test/CodeGen/NVPTX/sqrt-approx.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1e28db44b804473a84f014071a82a6a6ef726e9c
--- /dev/null
+++ b/test/CodeGen/NVPTX/sqrt-approx.ll
@@ -0,0 +1,150 @@
+; RUN: llc < %s -march=nvptx -mcpu=sm_20 -nvptx-prec-divf32=0 -nvptx-prec-sqrtf32=0 \
+; RUN:   | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+
+declare float @llvm.sqrt.f32(float)
+declare double @llvm.sqrt.f64(double)
+
+; -- reciprocal sqrt --
+
+; CHECK-LABEL: test_rsqrt32
+define float @test_rsqrt32(float %a) #0 {
+; CHECK: rsqrt.approx.f32
+  %val = tail call float @llvm.sqrt.f32(float %a)
+  %ret = fdiv float 1.0, %val
+  ret float %ret
+}
+
+; CHECK-LABEL: test_rsqrt_ftz
+define float @test_rsqrt_ftz(float %a) #0 #1 {
+; CHECK: rsqrt.approx.ftz.f32
+  %val = tail call float @llvm.sqrt.f32(float %a)
+  %ret = fdiv float 1.0, %val
+  ret float %ret
+}
+
+; CHECK-LABEL: test_rsqrt64
+define double @test_rsqrt64(double %a) #0 {
+; CHECK: rsqrt.approx.f64
+  %val = tail call double @llvm.sqrt.f64(double %a)
+  %ret = fdiv double 1.0, %val
+  ret double %ret
+}
+
+; CHECK-LABEL: test_rsqrt64_ftz
+define double @test_rsqrt64_ftz(double %a) #0 #1 {
+; There's no rsqrt.approx.ftz.f64 instruction; we just use the non-ftz version.
+; CHECK: rsqrt.approx.f64
+  %val = tail call double @llvm.sqrt.f64(double %a)
+  %ret = fdiv double 1.0, %val
+  ret double %ret
+}
+
+; -- sqrt --
+
+; CHECK-LABEL: test_sqrt32
+define float @test_sqrt32(float %a) #0 {
+; CHECK: sqrt.approx.f32
+  %ret = tail call float @llvm.sqrt.f32(float %a)
+  ret float %ret
+}
+
+; CHECK-LABEL: test_sqrt_ftz
+define float @test_sqrt_ftz(float %a) #0 #1 {
+; CHECK: sqrt.approx.ftz.f32
+  %ret = tail call float @llvm.sqrt.f32(float %a)
+  ret float %ret
+}
+
+; CHECK-LABEL: test_sqrt64
+define double @test_sqrt64(double %a) #0 {
+; There's no sqrt.approx.f64 instruction; we emit
+; reciprocal(rsqrt.approx.f64(x)).  There's no non-ftz approximate reciprocal,
+; so we just use the ftz version.
+; CHECK: rsqrt.approx.f64
+; CHECK: rcp.approx.ftz.f64
+  %ret = tail call double @llvm.sqrt.f64(double %a)
+  ret double %ret
+}
+
+; CHECK-LABEL: test_sqrt64_ftz
+define double @test_sqrt64_ftz(double %a) #0 #1 {
+; There's no sqrt.approx.ftz.f64 instruction; we just use the non-ftz version.
+; CHECK: rsqrt.approx.f64
+; CHECK: rcp.approx.ftz.f64
+  %ret = tail call double @llvm.sqrt.f64(double %a)
+  ret double %ret
+}
+
+; -- refined sqrt and rsqrt --
+;
+; The sqrt and rsqrt refinement algorithms both emit an rsqrt.approx, followed
+; by some math.
+
+; CHECK-LABEL: test_rsqrt32_refined
+define float @test_rsqrt32_refined(float %a) #0 #2 {
+; CHECK: rsqrt.approx.f32
+  %val = tail call float @llvm.sqrt.f32(float %a)
+  %ret = fdiv float 1.0, %val
+  ret float %ret
+}
+
+; CHECK-LABEL: test_sqrt32_refined
+define float @test_sqrt32_refined(float %a) #0 #2 {
+; CHECK: rsqrt.approx.f32
+  %ret = tail call float @llvm.sqrt.f32(float %a)
+  ret float %ret
+}
+
+; CHECK-LABEL: test_rsqrt64_refined
+define double @test_rsqrt64_refined(double %a) #0 #2 {
+; CHECK: rsqrt.approx.f64
+  %val = tail call double @llvm.sqrt.f64(double %a)
+  %ret = fdiv double 1.0, %val
+  ret double %ret
+}
+
+; CHECK-LABEL: test_sqrt64_refined
+define double @test_sqrt64_refined(double %a) #0 #2 {
+; CHECK: rsqrt.approx.f64
+  %ret = tail call double @llvm.sqrt.f64(double %a)
+  ret double %ret
+}
+
+; -- refined sqrt and rsqrt with ftz enabled --
+
+; CHECK-LABEL: test_rsqrt32_refined_ftz
+define float @test_rsqrt32_refined_ftz(float %a) #0 #1 #2 {
+; CHECK: rsqrt.approx.ftz.f32
+  %val = tail call float @llvm.sqrt.f32(float %a)
+  %ret = fdiv float 1.0, %val
+  ret float %ret
+}
+
+; CHECK-LABEL: test_sqrt32_refined_ftz
+define float @test_sqrt32_refined_ftz(float %a) #0 #1 #2 {
+; CHECK: rsqrt.approx.ftz.f32
+  %ret = tail call float @llvm.sqrt.f32(float %a)
+  ret float %ret
+}
+
+; CHECK-LABEL: test_rsqrt64_refined_ftz
+define double @test_rsqrt64_refined_ftz(double %a) #0 #1 #2 {
+; There's no rsqrt.approx.ftz.f64, so we just use the non-ftz version.
+; CHECK: rsqrt.approx.f64
+  %val = tail call double @llvm.sqrt.f64(double %a)
+  %ret = fdiv double 1.0, %val
+  ret double %ret
+}
+
+; CHECK-LABEL: test_sqrt64_refined_ftz
+define double @test_sqrt64_refined_ftz(double %a) #0 #1 #2 {
+; CHECK: rsqrt.approx.f64
+  %ret = tail call double @llvm.sqrt.f64(double %a)
+  ret double %ret
+}
+
+attributes #0 = { "unsafe-fp-math" = "true" }
+attributes #1 = { "nvptx-f32ftz" = "true" }
+attributes #2 = { "reciprocal-estimates" = "rsqrtf:1,rsqrtd:1,sqrtf:1,sqrtd:1" }
diff --git a/test/CodeGen/NVPTX/vec-param-load.ll b/test/CodeGen/NVPTX/vec-param-load.ll
index 4193ac4085cc1bb9d3d9f5dcd88a35d40a32ecfa..bf26e5ff1bdbb0a7f1f6cc8ed42ab00d3c495372 100644
--- a/test/CodeGen/NVPTX/vec-param-load.ll
+++ b/test/CodeGen/NVPTX/vec-param-load.ll
@@ -2,12 +2,81 @@
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
 
-
-define <16 x float> @foo(<16 x float> %a) {
-; Make sure we index into vectors properly
-; CHECK: ld.param.v4.f32         {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+48];
-; CHECK: ld.param.v4.f32         {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+32];
-; CHECK: ld.param.v4.f32         {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0+16];
-; CHECK: ld.param.v4.f32         {%f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}}, [foo_param_0];
+define <16 x float> @test_v16f32(<16 x float> %a) {
+; CHECK-LABEL: test_v16f32(
+; CHECK-DAG: ld.param.v4.f32     {[[V_12_15:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+48];
+; CHECK-DAG: ld.param.v4.f32     {[[V_8_11:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+32];
+; CHECK-DAG: ld.param.v4.f32     {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0+16];
+; CHECK-DAG: ld.param.v4.f32     {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v16f32_param_0];
+; CHECK-DAG: st.param.v4.f32     [func_retval0+0],  {[[V_0_3]]}
+; CHECK-DAG: st.param.v4.f32     [func_retval0+16], {[[V_4_7]]}
+; CHECK-DAG: st.param.v4.f32     [func_retval0+32], {[[V_8_11]]}
+; CHECK-DAG: st.param.v4.f32     [func_retval0+48], {[[V_12_15]]}
+; CHECK: ret;
   ret <16 x float> %a
 }
+
+define <8 x float> @test_v8f32(<8 x float> %a) {
+; CHECK-LABEL: test_v8f32(
+; CHECK-DAG: ld.param.v4.f32     {[[V_4_7:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0+16];
+; CHECK-DAG: ld.param.v4.f32     {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v8f32_param_0];
+; CHECK-DAG: st.param.v4.f32     [func_retval0+0],  {[[V_0_3]]}
+; CHECK-DAG: st.param.v4.f32     [func_retval0+16], {[[V_4_7]]}
+; CHECK: ret;
+  ret <8 x float> %a
+}
+
+define <4 x float> @test_v4f32(<4 x float> %a) {
+; CHECK-LABEL: test_v4f32(
+; CHECK-DAG: ld.param.v4.f32     {[[V_0_3:(%f[0-9]+[, ]*){4}]]}, [test_v4f32_param_0];
+; CHECK-DAG: st.param.v4.f32     [func_retval0+0],  {[[V_0_3]]}
+; CHECK: ret;
+  ret <4 x float> %a
+}
+
+define <2 x float> @test_v2f32(<2 x float> %a) {
+; CHECK-LABEL: test_v2f32(
+; CHECK-DAG: ld.param.v2.f32     {[[V_0_3:(%f[0-9]+[, ]*){2}]]}, [test_v2f32_param_0];
+; CHECK-DAG: st.param.v2.f32     [func_retval0+0],  {[[V_0_3]]}
+; CHECK: ret;
+  ret <2 x float> %a
+}
+
+; Oddly shaped vectors should not load any extra elements.
+define <3 x float> @test_v3f32(<3 x float> %a) {
+; CHECK-LABEL: test_v3f32(
+; CHECK-DAG: ld.param.f32        [[V_2:%f[0-9]+]], [test_v3f32_param_0+8];
+; CHECK-DAG: ld.param.v2.f32     {[[V_0_1:(%f[0-9]+[, ]*){2}]]}, [test_v3f32_param_0];
+; CHECK-DAG: st.param.v2.f32     [func_retval0+0], {[[V_0_1]]}
+; CHECK-DAG: st.param.f32        [func_retval0+8], [[V_2]]
+; CHECK: ret;
+  ret <3 x float> %a
+}
+
+define <8 x i64> @test_v8i64(<8 x i64> %a) {
+; CHECK-LABEL: test_v8i64(
+; CHECK-DAG: ld.param.v2.u64     {[[V_6_7:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+48];
+; CHECK-DAG: ld.param.v2.u64     {[[V_4_5:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+32];
+; CHECK-DAG: ld.param.v2.u64     {[[V_2_3:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0+16];
+; CHECK-DAG: ld.param.v2.u64     {[[V_0_1:(%rd[0-9]+[, ]*){2}]]}, [test_v8i64_param_0];
+; CHECK-DAG: st.param.v2.b64     [func_retval0+0],  {[[V_0_1]]}
+; CHECK-DAG: st.param.v2.b64     [func_retval0+16], {[[V_2_3]]}
+; CHECK-DAG: st.param.v2.b64     [func_retval0+32], {[[V_4_5]]}
+; CHECK-DAG: st.param.v2.b64     [func_retval0+48], {[[V_6_7]]}
+; CHECK: ret;
+  ret <8 x i64> %a
+}
+
+define <16 x i16> @test_v16i16(<16 x i16> %a) {
+; CHECK-LABEL: test_v16i16(
+; CHECK-DAG: ld.param.v4.u16     {[[V_12_15:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+24];
+; CHECK-DAG: ld.param.v4.u16     {[[V_8_11:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+16];
+; CHECK-DAG: ld.param.v4.u16     {[[V_4_7:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0+8];
+; CHECK-DAG: ld.param.v4.u16     {[[V_0_3:(%rs[0-9]+[, ]*){4}]]}, [test_v16i16_param_0];
+; CHECK-DAG: st.param.v4.b16     [func_retval0+0], {[[V_0_3]]}
+; CHECK-DAG: st.param.v4.b16     [func_retval0+8], {[[V_4_7]]}
+; CHECK-DAG: st.param.v4.b16     [func_retval0+16], {[[V_8_11]]}
+; CHECK-DAG: st.param.v4.b16     [func_retval0+24], {[[V_12_15]]}
+; CHECK: ret;
+  ret <16 x i16> %a
+}
diff --git a/test/CodeGen/NVPTX/vec8.ll b/test/CodeGen/NVPTX/vec8.ll
index 03f5cfc6cb014b41954cdbffeb94b733c9a9a3f5..a86ba1e29d5cd8c33445c5912389b1f0979eb777 100644
--- a/test/CodeGen/NVPTX/vec8.ll
+++ b/test/CodeGen/NVPTX/vec8.ll
@@ -4,10 +4,15 @@ target triple = "nvptx-unknown-cuda"
 
 ; CHECK: .visible .func foo
 define void @foo(<8 x i8> %a, i8* %b) {
-  %t0 = extractelement <8 x i8> %a, i32 0
-; CHECK-DAG: ld.param.v4.u8
-; CHECK-DAG: ld.param.u32
-  store i8 %t0, i8* %b
+; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0]
+; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4]
+; CHECK-DAG: ld.param.u32   %[[B:r[0-9]+]], [foo_param_1]
+; CHECK:     add.s16        [[T:%rs[0-9]+]], [[E1]], [[E6]];
+; CHECK:     st.u8          [%[[B]]], [[T]];
+  %t0 = extractelement <8 x i8> %a, i32 1
+  %t1 = extractelement <8 x i8> %a, i32 6
+  %t  = add i8 %t0, %t1
+  store i8 %t, i8* %b
   ret void
 }
 
diff --git a/test/CodeGen/NVPTX/vector-call.ll b/test/CodeGen/NVPTX/vector-call.ll
index 968d1d4a5f51a7eaa3f56dd6b32f2d6548366264..bf7b931a5758eecc06a609f1ae6e2e5ab78e8d75 100644
--- a/test/CodeGen/NVPTX/vector-call.ll
+++ b/test/CodeGen/NVPTX/vector-call.ll
@@ -4,9 +4,27 @@ target triple = "nvptx-unknown-cuda"
 
 declare void @bar(<4 x i32>)
 
-; CHECK-LABEL: @foo
+; CHECK-LABEL: .func foo(
+; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v4.b32  [param0+0],  {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK:     call.uni
+; CHECK:     ret;
 define void @foo(<4 x i32> %a) {
-; CHECK: st.param.v4.b32
   tail call void @bar(<4 x i32> %a)
   ret void
 }
+
+; CHECK-LABEL: .func foo3(
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0];
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v2.b32  [param0+0],  {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b32     [param0+8],  [[E2]];
+; CHECK:     call.uni
+; CHECK:     ret;
+declare void @bar3(<3 x i32>)
+define void @foo3(<3 x i32> %a) {
+  tail call void @bar3(<3 x i32> %a)
+  ret void
+}
diff --git a/test/CodeGen/PowerPC/2006-07-07-ComputeMaskedBits.ll b/test/CodeGen/PowerPC/2006-07-07-ComputeMaskedBits.ll
index 264967157d7a0c2830f2558594156e88bf33659b..56f4a4173ef59d551dddaaac143725e026020a1d 100644
--- a/test/CodeGen/PowerPC/2006-07-07-ComputeMaskedBits.ll
+++ b/test/CodeGen/PowerPC/2006-07-07-ComputeMaskedBits.ll
@@ -1,17 +1,33 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-apple-darwin | grep extsw | count 2
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64le-unknown-linux-gnu | FileCheck %s
 
 @lens = external global i8*             ; <i8**> [#uses=1]
 @vals = external global i32*            ; <i32**> [#uses=1]
 
 define i32 @test(i32 %i) {
-        %tmp = load i8*, i8** @lens          ; <i8*> [#uses=1]
-        %tmp1 = getelementptr i8, i8* %tmp, i32 %i          ; <i8*> [#uses=1]
-        %tmp.upgrd.1 = load i8, i8* %tmp1           ; <i8> [#uses=1]
-        %tmp2 = zext i8 %tmp.upgrd.1 to i32             ; <i32> [#uses=1]
-        %tmp3 = load i32*, i32** @vals                ; <i32*> [#uses=1]
-        %tmp5 = sub i32 1, %tmp2                ; <i32> [#uses=1]
-        %tmp6 = getelementptr i32, i32* %tmp3, i32 %tmp5             ; <i32*> [#uses=1]
-        %tmp7 = load i32, i32* %tmp6         ; <i32> [#uses=1]
-        ret i32 %tmp7
+; CHECK-LABEL: test:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    addis 4, 2, .LC0@toc@ha
+; CHECK-NEXT:    extsw 3, 3
+; CHECK-NEXT:    addis 5, 2, .LC1@toc@ha
+; CHECK-NEXT:    ld 4, .LC0@toc@l(4)
+; CHECK-NEXT:    ld 4, 0(4)
+; CHECK-NEXT:    lbzx 3, 4, 3
+; CHECK-NEXT:    ld 4, .LC1@toc@l(5)
+; CHECK-NEXT:    subfic 3, 3, 1
+; CHECK-NEXT:    extsw 3, 3
+; CHECK-NEXT:    ld 4, 0(4)
+; CHECK-NEXT:    sldi 3, 3, 2
+; CHECK-NEXT:    lwzx 3, 4, 3
+; CHECK-NEXT:    blr
+  %tmp = load i8*, i8** @lens          ; <i8*> [#uses=1]
+  %tmp1 = getelementptr i8, i8* %tmp, i32 %i          ; <i8*> [#uses=1]
+  %tmp.upgrd.1 = load i8, i8* %tmp1           ; <i8> [#uses=1]
+  %tmp2 = zext i8 %tmp.upgrd.1 to i32             ; <i32> [#uses=1]
+  %tmp3 = load i32*, i32** @vals                ; <i32*> [#uses=1]
+  %tmp5 = sub i32 1, %tmp2                ; <i32> [#uses=1]
+  %tmp6 = getelementptr i32, i32* %tmp3, i32 %tmp5             ; <i32*> [#uses=1]
+  %tmp7 = load i32, i32* %tmp6         ; <i32> [#uses=1]
+  ret i32 %tmp7
 }
 
diff --git a/test/CodeGen/PowerPC/2007-11-16-landingpad-split.ll b/test/CodeGen/PowerPC/2007-11-16-landingpad-split.ll
index bd496704890f78b665e6f3e8412632b6ffd826cf..53bad4fe06eea78b29d99c47037bfb906ddd184e 100644
--- a/test/CodeGen/PowerPC/2007-11-16-landingpad-split.ll
+++ b/test/CodeGen/PowerPC/2007-11-16-landingpad-split.ll
@@ -1,5 +1,4 @@
 ; RUN: llc -mcpu=g5 < %s | FileCheck %s
-; RUN: llc -mcpu=g5 -addr-sink-using-gep=1 < %s | FileCheck %s
 ;; Formerly crashed, see PR 1508
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f128:64:128"
 target triple = "powerpc64-apple-darwin8"
diff --git a/test/CodeGen/PowerPC/BreakableToken-reduced.ll b/test/CodeGen/PowerPC/BreakableToken-reduced.ll
index 39516537da42912fadf4e2aed58776b3020a52bd..dcc09304168262ce0ccb584b7995e854f4f4ffdd 100644
--- a/test/CodeGen/PowerPC/BreakableToken-reduced.ll
+++ b/test/CodeGen/PowerPC/BreakableToken-reduced.ll
@@ -265,12 +265,12 @@ _ZNK4llvm9StringRef10startswithES0_.exit:         ; preds = %entry._ZNK4llvm9Str
 }
 
 ; Function Attrs: nounwind argmemonly
-declare void @llvm.lifetime.start(i64, i8* nocapture) #2
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2
 
 declare void @_ZN5clang6format17WhitespaceManager24replaceWhitespaceInTokenERKNS0_11FormatTokenEjjN4llvm9StringRefES6_bjji(%"class.clang::format::WhitespaceManager"*, %"struct.clang::format::FormatToken"* dereferenceable(272), i32 zeroext, i32 zeroext, [2 x i64], [2 x i64], i1 zeroext, i32 zeroext, i32 zeroext, i32 signext) #3
 
 ; Function Attrs: nounwind argmemonly
-declare void @llvm.lifetime.end(i64, i8* nocapture) #2
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2
 
 attributes #9 = { nounwind }
 
diff --git a/test/CodeGen/PowerPC/aantidep-def-ec.mir b/test/CodeGen/PowerPC/aantidep-def-ec.mir
index cf6ab35d8db738988f44cb2583ebcf1fd3fb331c..09aac7b0240aa80c6a6d2cbc477e43352cb972eb 100644
--- a/test/CodeGen/PowerPC/aantidep-def-ec.mir
+++ b/test/CodeGen/PowerPC/aantidep-def-ec.mir
@@ -48,22 +48,6 @@ tracksRegLiveness: true
 liveins:         
   - { reg: '%x3' }
   - { reg: '%x4' }
-calleeSavedRegisters: [ '%cr2', '%cr3', '%cr4', '%f14', '%f15', '%f16', 
-                        '%f17', '%f18', '%f19', '%f20', '%f21', '%f22', 
-                        '%f23', '%f24', '%f25', '%f26', '%f27', '%f28', 
-                        '%f29', '%f30', '%f31', '%r14', '%r15', '%r16', 
-                        '%r17', '%r18', '%r19', '%r20', '%r21', '%r22', 
-                        '%r23', '%r24', '%r25', '%r26', '%r27', '%r28', 
-                        '%r29', '%r30', '%r31', '%v20', '%v21', '%v22', 
-                        '%v23', '%v24', '%v25', '%v26', '%v27', '%v28', 
-                        '%v29', '%v30', '%v31', '%vf20', '%vf21', '%vf22', 
-                        '%vf23', '%vf24', '%vf25', '%vf26', '%vf27', '%vf28', 
-                        '%vf29', '%vf30', '%vf31', '%x14', '%x15', '%x16', 
-                        '%x17', '%x18', '%x19', '%x20', '%x21', '%x22', 
-                        '%x23', '%x24', '%x25', '%x26', '%x27', '%x28', 
-                        '%x29', '%x30', '%x31', '%cr2eq', '%cr3eq', '%cr4eq', 
-                        '%cr2gt', '%cr3gt', '%cr4gt', '%cr2lt', '%cr3lt', 
-                        '%cr4lt', '%cr2un', '%cr3un', '%cr4un' ]
 frameInfo:       
   isFrameAddressTaken: false
   isReturnAddressTaken: false
diff --git a/test/CodeGen/PowerPC/addegluecrash.ll b/test/CodeGen/PowerPC/addegluecrash.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7605340d305f58311264aca5f3e4c8b7d668def5
--- /dev/null
+++ b/test/CodeGen/PowerPC/addegluecrash.ll
@@ -0,0 +1,58 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+define void @bn_mul_comba8(i64* nocapture %r, i64* nocapture readonly %a, i64* nocapture readonly %b) {
+; CHECK-LABEL: bn_mul_comba8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    ld 6, 0(4)
+; CHECK-NEXT:    ld 7, 0(5)
+; CHECK-NEXT:    mulhdu 8, 7, 6
+; CHECK-NEXT:    ld 4, 8(4)
+; CHECK-NEXT:    mulld 9, 4, 6
+; CHECK-NEXT:    mulhdu 4, 4, 6
+; CHECK-NEXT:    addc 6, 9, 8
+; CHECK-NEXT:    addze 4, 4
+; CHECK-NEXT:    ld 5, 8(5)
+; CHECK-NEXT:    mulld 8, 5, 7
+; CHECK-NEXT:    mulhdu 5, 5, 7
+; CHECK-NEXT:    addc 6, 6, 8
+; CHECK-NEXT:    addze 5, 5
+; CHECK-NEXT:    add 4, 5, 4
+; CHECK-NEXT:    cmpld 7, 4, 5
+; CHECK-NEXT:    mfocrf 10, 1
+; CHECK-NEXT:    rlwinm 10, 10, 29, 31, 31
+; CHECK-NEXT:    # implicit-def: %X4
+; CHECK-NEXT:    mr 4, 10
+; CHECK-NEXT:    clrldi 4, 4, 32
+; CHECK-NEXT:    std 4, 0(3)
+; CHECK-NEXT:    blr
+  %1 = load i64, i64* %a, align 8
+  %conv = zext i64 %1 to i128
+  %2 = load i64, i64* %b, align 8
+  %conv2 = zext i64 %2 to i128
+  %mul = mul nuw i128 %conv2, %conv
+  %shr = lshr i128 %mul, 64
+  %agep = getelementptr inbounds i64, i64* %a, i64 1
+  %3 = load i64, i64* %agep, align 8
+  %conv14 = zext i64 %3 to i128
+  %mul15 = mul nuw i128 %conv14, %conv
+  %add17 = add i128 %mul15, %shr
+  %shr19 = lshr i128 %add17, 64
+  %conv20 = trunc i128 %shr19 to i64
+  %bgep = getelementptr inbounds i64, i64* %b, i64 1
+  %4 = load i64, i64* %bgep, align 8
+  %conv28 = zext i64 %4 to i128
+  %mul31 = mul nuw i128 %conv28, %conv2
+  %conv32 = and i128 %add17, 18446744073709551615
+  %add33 = add i128 %conv32, %mul31
+  %shr35 = lshr i128 %add33, 64
+  %conv36 = trunc i128 %shr35 to i64
+  %add37 = add i64 %conv36, %conv20
+  %cmp38 = icmp ult i64 %add37, %conv36
+  %conv148 = zext i1 %cmp38 to i64
+  store i64 %conv148, i64* %r, align 8
+  ret void
+}
+
diff --git a/test/CodeGen/PowerPC/addi-licm.ll b/test/CodeGen/PowerPC/addi-licm.ll
index 37a14899debce817f6bc45c801f2d0f830973078..d0178a8aec0e5d22744fbae8b0a8283897b1e48b 100644
--- a/test/CodeGen/PowerPC/addi-licm.ll
+++ b/test/CodeGen/PowerPC/addi-licm.ll
@@ -9,9 +9,9 @@ entry:
   %x = alloca [2048 x float], align 4
   %y = alloca [2048 x float], align 4
   %0 = bitcast [2048 x float]* %x to i8*
-  call void @llvm.lifetime.start(i64 8192, i8* %0) #2
+  call void @llvm.lifetime.start.p0i8(i64 8192, i8* %0) #2
   %1 = bitcast [2048 x float]* %y to i8*
-  call void @llvm.lifetime.start(i64 8192, i8* %1) #2
+  call void @llvm.lifetime.start.p0i8(i64 8192, i8* %1) #2
   br label %for.body.i
 
 ; CHECK-LABEL: @foo
@@ -50,12 +50,12 @@ loop.exit:                                        ; preds = %for.body.i
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #2
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #2
 
 declare void @bar(float*, float*)
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #2
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #2
 
 attributes #0 = { nounwind readonly }
 attributes #1 = { nounwind }
diff --git a/test/CodeGen/PowerPC/anon_aggr.ll b/test/CodeGen/PowerPC/anon_aggr.ll
index f4e788849ec8e0f3a1689bc101c4e9b20a744f5e..9b32a8f55f34cd460b1ae881c007d9e8c9163e12 100644
--- a/test/CodeGen/PowerPC/anon_aggr.ll
+++ b/test/CodeGen/PowerPC/anon_aggr.ll
@@ -60,33 +60,34 @@ equal:
 unequal:
   ret i8* %array2_ptr
 }
-
 ; CHECK-LABEL: func2:
-; CHECK: ld [[REG2:[0-9]+]], 72(1)
-; CHECK: cmpld {{([0-9]+,)?}}4, [[REG2]]
-; CHECK-DAG: std [[REG2]], -[[OFFSET1:[0-9]+]]
+; CHECK: cmpld {{([0-9]+,)?}}4, 6
+; CHECK-DAG: std 6, 72(1)
+; CHECK-DAG: std 5, 64(1)
+; CHECK-DAG: std 6, -[[OFFSET1:[0-9]+]]
 ; CHECK-DAG: std 4, -[[OFFSET2:[0-9]+]]
 ; CHECK: ld 3, -[[OFFSET2]](1)
 ; CHECK: ld 3, -[[OFFSET1]](1)
 
-; DARWIN32: _func2:
-; DARWIN32: addi r[[REG1:[0-9]+]], r[[REGSP:[0-9]+]], 36
-; DARWIN32: lwz r[[REG2:[0-9]+]], 44(r[[REGSP]])
+; DARWIN32-LABEL: _func2:
+; DARWIN32-DAG: addi r[[REG8:[0-9]+]], r[[REGSP:[0-9]+]], 36
+; DARWIN32-DAG: lwz r[[REG2:[0-9]+]], 44(r[[REGSP]])
 ; DARWIN32: mr
-; DARWIN32: mr r[[REG3:[0-9]+]], r[[REGA:[0-9]+]]
-; DARWIN32: cmplw {{(cr[0-9]+,)?}}r[[REGA]], r[[REG2]]
-; DARWIN32: stw r[[REG3]], -[[OFFSET1:[0-9]+]]
-; DARWIN32: stw r[[REG2]], -[[OFFSET2:[0-9]+]]
-; DARWIN32: lwz r3, -[[OFFSET1]]
-; DARWIN32: lwz r3, -[[OFFSET2]]
+; DARWIN32: mr r[[REG7:[0-9]+]], r5
+; DARWIN32-DAG: cmplw {{(cr[0-9]+,)?}}r5, r[[REG2]]
+; DARWIN32-DAG: stw r[[REG7]], -[[OFFSET1:[0-9]+]]
+; DARWIN32-DAG: stw r[[REG2]], -[[OFFSET2:[0-9]+]]
+; DARWIN32-DAG: lwz r3, -[[OFFSET1]]
+; DARWIN32-DAG: lwz r3, -[[OFFSET2]]
+
 
 ; DARWIN64: _func2:
 ; DARWIN64: ld r[[REG2:[0-9]+]], 72(r1)
 ; DARWIN64: mr
 ; DARWIN64: mr r[[REG3:[0-9]+]], r[[REGA:[0-9]+]]
 ; DARWIN64: cmpld {{(cr[0-9]+,)?}}r[[REGA]], r[[REG2]]
-; DARWIN64: std r[[REG3]], -[[OFFSET1:[0-9]+]]
 ; DARWIN64: std r[[REG2]], -[[OFFSET2:[0-9]+]]
+; DARWIN64: std r[[REG3]], -[[OFFSET1:[0-9]+]]
 ; DARWIN64: ld r3, -[[OFFSET1]]
 ; DARWIN64: ld r3, -[[OFFSET2]]
 
@@ -106,24 +107,24 @@ unequal:
 }
 
 ; CHECK-LABEL: func3:
-; CHECK: ld [[REG3:[0-9]+]], 72(1)
-; CHECK: ld [[REG4:[0-9]+]], 56(1)
-; CHECK: cmpld {{([0-9]+,)?}}[[REG4]], [[REG3]]
-; CHECK: std [[REG3]], -[[OFFSET1:[0-9]+]](1)
-; CHECK: std [[REG4]], -[[OFFSET2:[0-9]+]](1)
+; CHECK: cmpld {{([0-9]+,)?}}4, 6
+; CHECK-DAG: std 4, -[[OFFSET2:[0-9]+]](1)
+; CHECK-DAG: std 6, -[[OFFSET1:[0-9]+]](1)
 ; CHECK: ld 3, -[[OFFSET2]](1)
 ; CHECK: ld 3, -[[OFFSET1]](1)
 
-; DARWIN32: _func3:
-; DARWIN32: addi r[[REG1:[0-9]+]], r[[REGSP:[0-9]+]], 36
-; DARWIN32: addi r[[REG2:[0-9]+]], r[[REGSP]], 24
-; DARWIN32: lwz r[[REG3:[0-9]+]], 44(r[[REGSP]])
-; DARWIN32: lwz r[[REG4:[0-9]+]], 32(r[[REGSP]])
-; DARWIN32: cmplw {{(cr[0-9]+,)?}}r[[REG4]], r[[REG3]]
-; DARWIN32: stw r[[REG3]], -[[OFFSET1:[0-9]+]]
-; DARWIN32: stw r[[REG4]], -[[OFFSET2:[0-9]+]]
-; DARWIN32: lwz r3, -[[OFFSET2]]
-; DARWIN32: lwz r3, -[[OFFSET1]]
+; DARWIN32-LABEL: _func3:
+; DARWIN32-DAG: stw r[[REG8:[0-9]+]], 44(r[[REGSP:[0-9]+]])
+; DARWIN32-DAG: stw r[[REG5:[0-9]+]], 32(r[[REGSP]])
+; DARWIN32-DAG: addi r[[REG5a:[0-9]+]], r[[REGSP:[0-9]+]], 36
+; DARWIN32-DAG: addi r[[REG8a:[0-9]+]], r[[REGSP]], 24
+; DARWIN32-DAG: lwz r[[REG5a:[0-9]+]], 44(r[[REGSP]])
+; DARWIN32-DAG: lwz r[[REG8a:[0-9]+]], 32(r[[REGSP]])
+; DARWIN32-DAG: cmplw {{(cr[0-9]+,)?}}r[[REG8a]], r[[REG5a]]
+; DARWIN32-DAG: stw r[[REG5a]], -[[OFFSET1:[0-9]+]]
+; DARWIN32-DAG: stw r[[REG8a]], -[[OFFSET2:[0-9]+]]
+; DARWIN32-DAG: lwz r3, -[[OFFSET1]]
+; DARWIN32-DAG: lwz r3, -[[OFFSET2]]
 
 ; DARWIN64: _func3:
 ; DARWIN64: ld r[[REG3:[0-9]+]], 72(r1)
diff --git a/test/CodeGen/PowerPC/atomics-regression.ll b/test/CodeGen/PowerPC/atomics-regression.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9af82b625532ad0a095e37d488e946b172206a24
--- /dev/null
+++ b/test/CodeGen/PowerPC/atomics-regression.ll
@@ -0,0 +1,9546 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=powerpc64le-linux-gnu < %s | FileCheck %s -check-prefix=PPC64LE
+
+define i8 @test0(i8* %ptr) {
+; PPC64LE-LABEL: test0:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lbz 3, 0(3)
+; PPC64LE-NEXT:    blr
+  %val = load atomic i8, i8* %ptr unordered, align 1
+  ret i8 %val
+}
+
+define i8 @test1(i8* %ptr) {
+; PPC64LE-LABEL: test1:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lbz 3, 0(3)
+; PPC64LE-NEXT:    blr
+  %val = load atomic i8, i8* %ptr monotonic, align 1
+  ret i8 %val
+}
+
+define i8 @test2(i8* %ptr) {
+; PPC64LE-LABEL: test2:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lbz 3, 0(3)
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %val = load atomic i8, i8* %ptr acquire, align 1
+  ret i8 %val
+}
+
+define i8 @test3(i8* %ptr) {
+; PPC64LE-LABEL: test3:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    ori 2, 2, 0
+; PPC64LE-NEXT:    lbz 3, 0(3)
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %val = load atomic i8, i8* %ptr seq_cst, align 1
+  ret i8 %val
+}
+
+define i16 @test4(i16* %ptr) {
+; PPC64LE-LABEL: test4:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lhz 3, 0(3)
+; PPC64LE-NEXT:    blr
+  %val = load atomic i16, i16* %ptr unordered, align 2
+  ret i16 %val
+}
+
+define i16 @test5(i16* %ptr) {
+; PPC64LE-LABEL: test5:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lhz 3, 0(3)
+; PPC64LE-NEXT:    blr
+  %val = load atomic i16, i16* %ptr monotonic, align 2
+  ret i16 %val
+}
+
+define i16 @test6(i16* %ptr) {
+; PPC64LE-LABEL: test6:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lhz 3, 0(3)
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %val = load atomic i16, i16* %ptr acquire, align 2
+  ret i16 %val
+}
+
+define i16 @test7(i16* %ptr) {
+; PPC64LE-LABEL: test7:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    ori 2, 2, 0
+; PPC64LE-NEXT:    lhz 3, 0(3)
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %val = load atomic i16, i16* %ptr seq_cst, align 2
+  ret i16 %val
+}
+
+define i32 @test8(i32* %ptr) {
+; PPC64LE-LABEL: test8:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwz 3, 0(3)
+; PPC64LE-NEXT:    blr
+  %val = load atomic i32, i32* %ptr unordered, align 4
+  ret i32 %val
+}
+
+define i32 @test9(i32* %ptr) {
+; PPC64LE-LABEL: test9:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwz 3, 0(3)
+; PPC64LE-NEXT:    blr
+  %val = load atomic i32, i32* %ptr monotonic, align 4
+  ret i32 %val
+}
+
+define i32 @test10(i32* %ptr) {
+; PPC64LE-LABEL: test10:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwz 3, 0(3)
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %val = load atomic i32, i32* %ptr acquire, align 4
+  ret i32 %val
+}
+
+define i32 @test11(i32* %ptr) {
+; PPC64LE-LABEL: test11:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    ori 2, 2, 0
+; PPC64LE-NEXT:    lwz 3, 0(3)
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %val = load atomic i32, i32* %ptr seq_cst, align 4
+  ret i32 %val
+}
+
+define i64 @test12(i64* %ptr) {
+; PPC64LE-LABEL: test12:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    ld 3, 0(3)
+; PPC64LE-NEXT:    blr
+  %val = load atomic i64, i64* %ptr unordered, align 8
+  ret i64 %val
+}
+
+define i64 @test13(i64* %ptr) {
+; PPC64LE-LABEL: test13:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    ld 3, 0(3)
+; PPC64LE-NEXT:    blr
+  %val = load atomic i64, i64* %ptr monotonic, align 8
+  ret i64 %val
+}
+
+define i64 @test14(i64* %ptr) {
+; PPC64LE-LABEL: test14:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    ld 3, 0(3)
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %val = load atomic i64, i64* %ptr acquire, align 8
+  ret i64 %val
+}
+
+define i64 @test15(i64* %ptr) {
+; PPC64LE-LABEL: test15:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    ori 2, 2, 0
+; PPC64LE-NEXT:    ld 3, 0(3)
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %val = load atomic i64, i64* %ptr seq_cst, align 8
+  ret i64 %val
+}
+
+define void @test16(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test16:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    stb 4, 0(3)
+; PPC64LE-NEXT:    blr
+  store atomic i8 %val, i8* %ptr unordered, align 1
+  ret void
+}
+
+define void @test17(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test17:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    stb 4, 0(3)
+; PPC64LE-NEXT:    blr
+  store atomic i8 %val, i8* %ptr monotonic, align 1
+  ret void
+}
+
+define void @test18(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test18:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    stb 4, 0(3)
+; PPC64LE-NEXT:    blr
+  store atomic i8 %val, i8* %ptr release, align 1
+  ret void
+}
+
+define void @test19(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test19:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    stb 4, 0(3)
+; PPC64LE-NEXT:    blr
+  store atomic i8 %val, i8* %ptr seq_cst, align 1
+  ret void
+}
+
+define void @test20(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test20:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sth 4, 0(3)
+; PPC64LE-NEXT:    blr
+  store atomic i16 %val, i16* %ptr unordered, align 2
+  ret void
+}
+
+define void @test21(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test21:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sth 4, 0(3)
+; PPC64LE-NEXT:    blr
+  store atomic i16 %val, i16* %ptr monotonic, align 2
+  ret void
+}
+
+define void @test22(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test22:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    sth 4, 0(3)
+; PPC64LE-NEXT:    blr
+  store atomic i16 %val, i16* %ptr release, align 2
+  ret void
+}
+
+define void @test23(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test23:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    sth 4, 0(3)
+; PPC64LE-NEXT:    blr
+  store atomic i16 %val, i16* %ptr seq_cst, align 2
+  ret void
+}
+
+define void @test24(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test24:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    stw 4, 0(3)
+; PPC64LE-NEXT:    blr
+  store atomic i32 %val, i32* %ptr unordered, align 4
+  ret void
+}
+
+define void @test25(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test25:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    stw 4, 0(3)
+; PPC64LE-NEXT:    blr
+  store atomic i32 %val, i32* %ptr monotonic, align 4
+  ret void
+}
+
+define void @test26(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test26:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    stw 4, 0(3)
+; PPC64LE-NEXT:    blr
+  store atomic i32 %val, i32* %ptr release, align 4
+  ret void
+}
+
+define void @test27(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test27:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    stw 4, 0(3)
+; PPC64LE-NEXT:    blr
+  store atomic i32 %val, i32* %ptr seq_cst, align 4
+  ret void
+}
+
+define void @test28(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test28:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    std 4, 0(3)
+; PPC64LE-NEXT:    blr
+  store atomic i64 %val, i64* %ptr unordered, align 8
+  ret void
+}
+
+define void @test29(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test29:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    std 4, 0(3)
+; PPC64LE-NEXT:    blr
+  store atomic i64 %val, i64* %ptr monotonic, align 8
+  ret void
+}
+
+define void @test30(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test30:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    std 4, 0(3)
+; PPC64LE-NEXT:    blr
+  store atomic i64 %val, i64* %ptr release, align 8
+  ret void
+}
+
+define void @test31(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test31:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    std 4, 0(3)
+; PPC64LE-NEXT:    blr
+  store atomic i64 %val, i64* %ptr seq_cst, align 8
+  ret void
+}
+
+define void @test32() {
+; PPC64LE-LABEL: test32:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  fence acquire
+  ret void
+}
+
+define void @test33() {
+; PPC64LE-LABEL: test33:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  fence release
+  ret void
+}
+
+define void @test34() {
+; PPC64LE-LABEL: test34:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  fence acq_rel
+  ret void
+}
+
+define void @test35() {
+; PPC64LE-LABEL: test35:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    blr
+  fence seq_cst
+  ret void
+}
+
+define void @test36() {
+; PPC64LE-LABEL: test36:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  fence singlethread acquire
+  ret void
+}
+
+define void @test37() {
+; PPC64LE-LABEL: test37:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  fence singlethread release
+  ret void
+}
+
+define void @test38() {
+; PPC64LE-LABEL: test38:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  fence singlethread acq_rel
+  ret void
+}
+
+define void @test39() {
+; PPC64LE-LABEL: test39:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:    blr
+  fence singlethread seq_cst
+  ret void
+}
+
+define void @test40(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test40:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    b .LBB40_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB40_1:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB40_2
+; PPC64LE-NEXT:  .LBB40_2:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB40_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val monotonic monotonic
+  ret void
+}
+
+define void @test41(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test41:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB41_1:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB41_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB41_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB41_4:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val acquire monotonic
+  ret void
+}
+
+define void @test42(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test42:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB42_1:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB42_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB42_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB42_4:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val acquire acquire
+  ret void
+}
+
+define void @test43(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test43:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    b .LBB43_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB43_1:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB43_2
+; PPC64LE-NEXT:  .LBB43_2:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB43_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val release monotonic
+  ret void
+}
+
+define void @test44(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test44:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    b .LBB44_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB44_1:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB44_2
+; PPC64LE-NEXT:  .LBB44_2:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB44_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val release acquire
+  ret void
+}
+
+define void @test45(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test45:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB45_1:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB45_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB45_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB45_4:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val acq_rel monotonic
+  ret void
+}
+
+define void @test46(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test46:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB46_1:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB46_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB46_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB46_4:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val acq_rel acquire
+  ret void
+}
+
+define void @test47(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test47:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB47_1:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB47_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB47_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB47_4:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val seq_cst monotonic
+  ret void
+}
+
+define void @test48(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test48:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB48_1:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB48_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB48_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB48_4:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val seq_cst acquire
+  ret void
+}
+
+define void @test49(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test49:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB49_1:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB49_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB49_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB49_4:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val seq_cst seq_cst
+  ret void
+}
+
+define void @test50(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test50:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    b .LBB50_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB50_1:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB50_2
+; PPC64LE-NEXT:  .LBB50_2:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB50_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val monotonic monotonic
+  ret void
+}
+
+define void @test51(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test51:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB51_1:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB51_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB51_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB51_4:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val acquire monotonic
+  ret void
+}
+
+define void @test52(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test52:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB52_1:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB52_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB52_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB52_4:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val acquire acquire
+  ret void
+}
+
+define void @test53(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test53:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    b .LBB53_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB53_1:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB53_2
+; PPC64LE-NEXT:  .LBB53_2:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB53_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val release monotonic
+  ret void
+}
+
+define void @test54(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test54:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    b .LBB54_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB54_1:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB54_2
+; PPC64LE-NEXT:  .LBB54_2:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB54_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val release acquire
+  ret void
+}
+
+define void @test55(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test55:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB55_1:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB55_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB55_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB55_4:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val acq_rel monotonic
+  ret void
+}
+
+define void @test56(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test56:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB56_1:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB56_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB56_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB56_4:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val acq_rel acquire
+  ret void
+}
+
+define void @test57(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test57:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB57_1:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB57_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB57_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB57_4:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val seq_cst monotonic
+  ret void
+}
+
+define void @test58(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test58:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB58_1:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB58_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB58_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB58_4:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val seq_cst acquire
+  ret void
+}
+
+define void @test59(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test59:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB59_1:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB59_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB59_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB59_4:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val seq_cst seq_cst
+  ret void
+}
+
+define void @test60(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test60:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    b .LBB60_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB60_1:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB60_2
+; PPC64LE-NEXT:  .LBB60_2:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB60_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val monotonic monotonic
+  ret void
+}
+
+define void @test61(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test61:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB61_1:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB61_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB61_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB61_4:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val acquire monotonic
+  ret void
+}
+
+define void @test62(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test62:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB62_1:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB62_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB62_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB62_4:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val acquire acquire
+  ret void
+}
+
+define void @test63(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test63:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    b .LBB63_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB63_1:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB63_2
+; PPC64LE-NEXT:  .LBB63_2:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB63_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val release monotonic
+  ret void
+}
+
+define void @test64(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test64:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    b .LBB64_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB64_1:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB64_2
+; PPC64LE-NEXT:  .LBB64_2:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB64_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val release acquire
+  ret void
+}
+
+define void @test65(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test65:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB65_1:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB65_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB65_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB65_4:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val acq_rel monotonic
+  ret void
+}
+
+define void @test66(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test66:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB66_1:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB66_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB66_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB66_4:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val acq_rel acquire
+  ret void
+}
+
+define void @test67(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test67:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB67_1:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB67_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB67_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB67_4:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val seq_cst monotonic
+  ret void
+}
+
+define void @test68(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test68:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB68_1:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB68_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB68_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB68_4:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val seq_cst acquire
+  ret void
+}
+
+define void @test69(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test69:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB69_1:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB69_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB69_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB69_4:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val seq_cst seq_cst
+  ret void
+}
+
+define void @test70(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test70:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    b .LBB70_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB70_1:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB70_2
+; PPC64LE-NEXT:  .LBB70_2:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB70_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val monotonic monotonic
+  ret void
+}
+
+define void @test71(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test71:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB71_1:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB71_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB71_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB71_4:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val acquire monotonic
+  ret void
+}
+
+define void @test72(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test72:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB72_1:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB72_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB72_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB72_4:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val acquire acquire
+  ret void
+}
+
+define void @test73(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test73:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    b .LBB73_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB73_1:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB73_2
+; PPC64LE-NEXT:  .LBB73_2:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB73_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val release monotonic
+  ret void
+}
+
+define void @test74(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test74:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    b .LBB74_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB74_1:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB74_2
+; PPC64LE-NEXT:  .LBB74_2:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB74_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val release acquire
+  ret void
+}
+
+define void @test75(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test75:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB75_1:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB75_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB75_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB75_4:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val acq_rel monotonic
+  ret void
+}
+
+define void @test76(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test76:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB76_1:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB76_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB76_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB76_4:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val acq_rel acquire
+  ret void
+}
+
+define void @test77(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test77:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB77_1:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB77_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB77_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB77_4:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val seq_cst monotonic
+  ret void
+}
+
+define void @test78(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test78:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB78_1:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB78_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB78_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB78_4:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val seq_cst acquire
+  ret void
+}
+
+define void @test79(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test79:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB79_1:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB79_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB79_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB79_4:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val seq_cst seq_cst
+  ret void
+}
+
+define void @test80(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test80:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    b .LBB80_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB80_1:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB80_2
+; PPC64LE-NEXT:  .LBB80_2:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB80_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread monotonic monotonic
+  ret void
+}
+
+define void @test81(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test81:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB81_1:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB81_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB81_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB81_4:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread acquire monotonic
+  ret void
+}
+
+define void @test82(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test82:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB82_1:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB82_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB82_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB82_4:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread acquire acquire
+  ret void
+}
+
+define void @test83(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test83:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    b .LBB83_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB83_1:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB83_2
+; PPC64LE-NEXT:  .LBB83_2:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB83_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread release monotonic
+  ret void
+}
+
+define void @test84(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test84:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    b .LBB84_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB84_1:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB84_2
+; PPC64LE-NEXT:  .LBB84_2:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB84_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread release acquire
+  ret void
+}
+
+define void @test85(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test85:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB85_1:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB85_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB85_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB85_4:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread acq_rel monotonic
+  ret void
+}
+
+define void @test86(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test86:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB86_1:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB86_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB86_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB86_4:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread acq_rel acquire
+  ret void
+}
+
+define void @test87(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test87:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB87_1:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB87_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB87_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB87_4:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread seq_cst monotonic
+  ret void
+}
+
+define void @test88(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test88:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB88_1:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB88_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB88_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB88_4:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread seq_cst acquire
+  ret void
+}
+
+define void @test89(i8* %ptr, i8 %cmp, i8 %val) {
+; PPC64LE-LABEL: test89:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB89_1:
+; PPC64LE-NEXT:    lbarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB89_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB89_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB89_4:
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i8* %ptr, i8 %cmp, i8 %val singlethread seq_cst seq_cst
+  ret void
+}
+
+define void @test90(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test90:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    b .LBB90_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB90_1:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB90_2
+; PPC64LE-NEXT:  .LBB90_2:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB90_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread monotonic monotonic
+  ret void
+}
+
+define void @test91(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test91:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB91_1:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB91_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB91_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB91_4:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread acquire monotonic
+  ret void
+}
+
+define void @test92(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test92:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB92_1:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB92_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB92_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB92_4:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread acquire acquire
+  ret void
+}
+
+define void @test93(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test93:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    b .LBB93_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB93_1:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB93_2
+; PPC64LE-NEXT:  .LBB93_2:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB93_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread release monotonic
+  ret void
+}
+
+define void @test94(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test94:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    b .LBB94_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB94_1:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB94_2
+; PPC64LE-NEXT:  .LBB94_2:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB94_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread release acquire
+  ret void
+}
+
+define void @test95(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test95:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB95_1:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB95_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB95_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB95_4:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread acq_rel monotonic
+  ret void
+}
+
+define void @test96(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test96:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB96_1:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB96_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB96_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB96_4:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread acq_rel acquire
+  ret void
+}
+
+define void @test97(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test97:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB97_1:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB97_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB97_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB97_4:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread seq_cst monotonic
+  ret void
+}
+
+define void @test98(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test98:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB98_1:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB98_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB98_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB98_4:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread seq_cst acquire
+  ret void
+}
+
+define void @test99(i16* %ptr, i16 %cmp, i16 %val) {
+; PPC64LE-LABEL: test99:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB99_1:
+; PPC64LE-NEXT:    lharx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB99_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB99_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB99_4:
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i16* %ptr, i16 %cmp, i16 %val singlethread seq_cst seq_cst
+  ret void
+}
+
+define void @test100(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test100:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    b .LBB100_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB100_1:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB100_2
+; PPC64LE-NEXT:  .LBB100_2:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB100_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread monotonic monotonic
+  ret void
+}
+
+define void @test101(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test101:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB101_1:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB101_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB101_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB101_4:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread acquire monotonic
+  ret void
+}
+
+define void @test102(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test102:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB102_1:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB102_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB102_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB102_4:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread acquire acquire
+  ret void
+}
+
+define void @test103(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test103:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    b .LBB103_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB103_1:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB103_2
+; PPC64LE-NEXT:  .LBB103_2:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB103_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread release monotonic
+  ret void
+}
+
+define void @test104(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test104:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    b .LBB104_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB104_1:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB104_2
+; PPC64LE-NEXT:  .LBB104_2:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB104_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread release acquire
+  ret void
+}
+
+define void @test105(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test105:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB105_1:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB105_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB105_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB105_4:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread acq_rel monotonic
+  ret void
+}
+
+define void @test106(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test106:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB106_1:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB106_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB106_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB106_4:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread acq_rel acquire
+  ret void
+}
+
+define void @test107(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test107:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB107_1:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB107_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB107_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB107_4:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread seq_cst monotonic
+  ret void
+}
+
+define void @test108(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test108:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB108_1:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB108_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB108_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB108_4:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread seq_cst acquire
+  ret void
+}
+
+define void @test109(i32* %ptr, i32 %cmp, i32 %val) {
+; PPC64LE-LABEL: test109:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB109_1:
+; PPC64LE-NEXT:    lwarx 6, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB109_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB109_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB109_4:
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i32* %ptr, i32 %cmp, i32 %val singlethread seq_cst seq_cst
+  ret void
+}
+
+define void @test110(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test110:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    b .LBB110_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB110_1:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB110_2
+; PPC64LE-NEXT:  .LBB110_2:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB110_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread monotonic monotonic
+  ret void
+}
+
+define void @test111(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test111:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB111_1:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB111_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB111_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB111_4:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread acquire monotonic
+  ret void
+}
+
+define void @test112(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test112:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB112_1:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB112_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB112_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB112_4:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread acquire acquire
+  ret void
+}
+
+define void @test113(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test113:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    b .LBB113_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB113_1:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB113_2
+; PPC64LE-NEXT:  .LBB113_2:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB113_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread release monotonic
+  ret void
+}
+
+define void @test114(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test114:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    b .LBB114_2
+; PPC64LE-NEXT:    .p2align 5
+; PPC64LE-NEXT:  .LBB114_1:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    beqlr 0
+; PPC64LE-NEXT:    b .LBB114_2
+; PPC64LE-NEXT:  .LBB114_2:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    beq 0, .LBB114_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread release acquire
+  ret void
+}
+
+define void @test115(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test115:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB115_1:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB115_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB115_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB115_4:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread acq_rel monotonic
+  ret void
+}
+
+define void @test116(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test116:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB116_1:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB116_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB116_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB116_4:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread acq_rel acquire
+  ret void
+}
+
+define void @test117(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test117:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB117_1:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB117_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB117_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB117_4:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread seq_cst monotonic
+  ret void
+}
+
+define void @test118(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test118:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB118_1:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB118_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB118_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB118_4:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread seq_cst acquire
+  ret void
+}
+
+define void @test119(i64* %ptr, i64 %cmp, i64 %val) {
+; PPC64LE-LABEL: test119:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB119_1:
+; PPC64LE-NEXT:    ldarx 6, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 6
+; PPC64LE-NEXT:    bne 0, .LBB119_4
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 5, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB119_1
+; PPC64LE-NEXT:  # BB#3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+; PPC64LE-NEXT:  .LBB119_4:
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %res = cmpxchg i64* %ptr, i64 %cmp, i64 %val singlethread seq_cst seq_cst
+  ret void
+}
+
+define i8 @test120(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test120:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB120_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB120_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i8* %ptr, i8 %val monotonic
+  ret i8 %ret
+}
+
+define i8 @test121(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test121:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB121_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    stbcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB121_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i8* %ptr, i8 %val acquire
+  ret i8 %ret
+}
+
+define i8 @test122(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test122:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB122_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB122_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i8* %ptr, i8 %val release
+  ret i8 %ret
+}
+
+define i8 @test123(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test123:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB123_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB123_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i8* %ptr, i8 %val acq_rel
+  ret i8 %ret
+}
+
+define i8 @test124(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test124:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB124_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB124_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i8* %ptr, i8 %val seq_cst
+  ret i8 %ret
+}
+
+define i16 @test125(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test125:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB125_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB125_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i16* %ptr, i16 %val monotonic
+  ret i16 %ret
+}
+
+define i16 @test126(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test126:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB126_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    sthcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB126_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i16* %ptr, i16 %val acquire
+  ret i16 %ret
+}
+
+define i16 @test127(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test127:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB127_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB127_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i16* %ptr, i16 %val release
+  ret i16 %ret
+}
+
+define i16 @test128(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test128:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB128_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB128_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i16* %ptr, i16 %val acq_rel
+  ret i16 %ret
+}
+
+define i16 @test129(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test129:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB129_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB129_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i16* %ptr, i16 %val seq_cst
+  ret i16 %ret
+}
+
+define i32 @test130(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test130:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB130_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB130_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i32* %ptr, i32 %val monotonic
+  ret i32 %ret
+}
+
+define i32 @test131(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test131:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB131_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    stwcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB131_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i32* %ptr, i32 %val acquire
+  ret i32 %ret
+}
+
+define i32 @test132(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test132:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB132_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB132_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i32* %ptr, i32 %val release
+  ret i32 %ret
+}
+
+define i32 @test133(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test133:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB133_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB133_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i32* %ptr, i32 %val acq_rel
+  ret i32 %ret
+}
+
+define i32 @test134(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test134:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB134_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB134_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i32* %ptr, i32 %val seq_cst
+  ret i32 %ret
+}
+
+define i64 @test135(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test135:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB135_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB135_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i64* %ptr, i64 %val monotonic
+  ret i64 %ret
+}
+
+define i64 @test136(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test136:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB136_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    stdcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB136_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i64* %ptr, i64 %val acquire
+  ret i64 %ret
+}
+
+define i64 @test137(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test137:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB137_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB137_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i64* %ptr, i64 %val release
+  ret i64 %ret
+}
+
+define i64 @test138(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test138:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB138_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB138_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i64* %ptr, i64 %val acq_rel
+  ret i64 %ret
+}
+
+define i64 @test139(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test139:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB139_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB139_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i64* %ptr, i64 %val seq_cst
+  ret i64 %ret
+}
+
+define i8 @test140(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test140:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB140_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB140_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i8* %ptr, i8 %val monotonic
+  ret i8 %ret
+}
+
+define i8 @test141(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test141:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB141_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    add 6, 4, 3
+; PPC64LE-NEXT:    stbcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB141_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i8* %ptr, i8 %val acquire
+  ret i8 %ret
+}
+
+define i8 @test142(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test142:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB142_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB142_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i8* %ptr, i8 %val release
+  ret i8 %ret
+}
+
+define i8 @test143(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test143:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB143_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB143_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i8* %ptr, i8 %val acq_rel
+  ret i8 %ret
+}
+
+define i8 @test144(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test144:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB144_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB144_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i8* %ptr, i8 %val seq_cst
+  ret i8 %ret
+}
+
+define i16 @test145(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test145:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB145_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB145_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i16* %ptr, i16 %val monotonic
+  ret i16 %ret
+}
+
+define i16 @test146(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test146:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB146_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    add 6, 4, 3
+; PPC64LE-NEXT:    sthcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB146_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i16* %ptr, i16 %val acquire
+  ret i16 %ret
+}
+
+define i16 @test147(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test147:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB147_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB147_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i16* %ptr, i16 %val release
+  ret i16 %ret
+}
+
+define i16 @test148(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test148:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB148_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB148_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i16* %ptr, i16 %val acq_rel
+  ret i16 %ret
+}
+
+define i16 @test149(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test149:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB149_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB149_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i16* %ptr, i16 %val seq_cst
+  ret i16 %ret
+}
+
+define i32 @test150(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test150:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB150_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB150_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i32* %ptr, i32 %val monotonic
+  ret i32 %ret
+}
+
+define i32 @test151(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test151:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB151_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    add 6, 4, 3
+; PPC64LE-NEXT:    stwcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB151_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i32* %ptr, i32 %val acquire
+  ret i32 %ret
+}
+
+define i32 @test152(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test152:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB152_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB152_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i32* %ptr, i32 %val release
+  ret i32 %ret
+}
+
+define i32 @test153(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test153:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB153_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB153_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i32* %ptr, i32 %val acq_rel
+  ret i32 %ret
+}
+
+define i32 @test154(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test154:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB154_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB154_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i32* %ptr, i32 %val seq_cst
+  ret i32 %ret
+}
+
+define i64 @test155(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test155:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB155_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB155_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i64* %ptr, i64 %val monotonic
+  ret i64 %ret
+}
+
+define i64 @test156(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test156:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB156_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    add 6, 4, 3
+; PPC64LE-NEXT:    stdcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB156_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i64* %ptr, i64 %val acquire
+  ret i64 %ret
+}
+
+define i64 @test157(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test157:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB157_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB157_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i64* %ptr, i64 %val release
+  ret i64 %ret
+}
+
+define i64 @test158(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test158:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB158_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB158_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i64* %ptr, i64 %val acq_rel
+  ret i64 %ret
+}
+
+define i64 @test159(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test159:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB159_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB159_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i64* %ptr, i64 %val seq_cst
+  ret i64 %ret
+}
+
+define i8 @test160(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test160:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB160_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB160_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i8* %ptr, i8 %val monotonic
+  ret i8 %ret
+}
+
+define i8 @test161(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test161:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB161_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    subf 6, 4, 3
+; PPC64LE-NEXT:    stbcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB161_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i8* %ptr, i8 %val acquire
+  ret i8 %ret
+}
+
+define i8 @test162(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test162:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB162_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB162_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i8* %ptr, i8 %val release
+  ret i8 %ret
+}
+
+define i8 @test163(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test163:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB163_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB163_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i8* %ptr, i8 %val acq_rel
+  ret i8 %ret
+}
+
+define i8 @test164(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test164:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB164_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB164_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i8* %ptr, i8 %val seq_cst
+  ret i8 %ret
+}
+
+define i16 @test165(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test165:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB165_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB165_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i16* %ptr, i16 %val monotonic
+  ret i16 %ret
+}
+
+define i16 @test166(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test166:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB166_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    subf 6, 4, 3
+; PPC64LE-NEXT:    sthcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB166_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i16* %ptr, i16 %val acquire
+  ret i16 %ret
+}
+
+define i16 @test167(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test167:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB167_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB167_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i16* %ptr, i16 %val release
+  ret i16 %ret
+}
+
+define i16 @test168(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test168:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB168_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB168_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i16* %ptr, i16 %val acq_rel
+  ret i16 %ret
+}
+
+define i16 @test169(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test169:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB169_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB169_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i16* %ptr, i16 %val seq_cst
+  ret i16 %ret
+}
+
+define i32 @test170(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test170:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB170_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB170_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i32* %ptr, i32 %val monotonic
+  ret i32 %ret
+}
+
+define i32 @test171(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test171:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB171_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    subf 6, 4, 3
+; PPC64LE-NEXT:    stwcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB171_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i32* %ptr, i32 %val acquire
+  ret i32 %ret
+}
+
+define i32 @test172(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test172:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB172_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB172_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i32* %ptr, i32 %val release
+  ret i32 %ret
+}
+
+define i32 @test173(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test173:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB173_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB173_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i32* %ptr, i32 %val acq_rel
+  ret i32 %ret
+}
+
+define i32 @test174(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test174:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB174_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB174_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i32* %ptr, i32 %val seq_cst
+  ret i32 %ret
+}
+
+define i64 @test175(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test175:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB175_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    sub 6, 5, 4
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB175_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i64* %ptr, i64 %val monotonic
+  ret i64 %ret
+}
+
+define i64 @test176(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test176:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB176_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    sub 6, 3, 4
+; PPC64LE-NEXT:    stdcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB176_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i64* %ptr, i64 %val acquire
+  ret i64 %ret
+}
+
+define i64 @test177(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test177:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB177_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    sub 6, 5, 4
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB177_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i64* %ptr, i64 %val release
+  ret i64 %ret
+}
+
+define i64 @test178(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test178:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB178_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    sub 6, 5, 4
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB178_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i64* %ptr, i64 %val acq_rel
+  ret i64 %ret
+}
+
+define i64 @test179(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test179:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB179_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    sub 6, 5, 4
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB179_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i64* %ptr, i64 %val seq_cst
+  ret i64 %ret
+}
+
+define i8 @test180(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test180:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB180_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB180_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i8* %ptr, i8 %val monotonic
+  ret i8 %ret
+}
+
+define i8 @test181(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test181:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB181_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    and 6, 4, 3
+; PPC64LE-NEXT:    stbcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB181_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i8* %ptr, i8 %val acquire
+  ret i8 %ret
+}
+
+define i8 @test182(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test182:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB182_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB182_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i8* %ptr, i8 %val release
+  ret i8 %ret
+}
+
+define i8 @test183(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test183:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB183_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB183_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i8* %ptr, i8 %val acq_rel
+  ret i8 %ret
+}
+
+define i8 @test184(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test184:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB184_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB184_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i8* %ptr, i8 %val seq_cst
+  ret i8 %ret
+}
+
+define i16 @test185(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test185:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB185_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB185_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i16* %ptr, i16 %val monotonic
+  ret i16 %ret
+}
+
+define i16 @test186(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test186:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB186_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    and 6, 4, 3
+; PPC64LE-NEXT:    sthcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB186_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i16* %ptr, i16 %val acquire
+  ret i16 %ret
+}
+
+define i16 @test187(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test187:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB187_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB187_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i16* %ptr, i16 %val release
+  ret i16 %ret
+}
+
+define i16 @test188(i16* %ptr, i16 %val) { ; and on i16, acq_rel: lwsync expected on both sides of the lharx/sthcx. loop
+; PPC64LE-LABEL: test188:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB188_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB188_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i16* %ptr, i16 %val acq_rel
+  ret i16 %ret
+}
+
+define i16 @test189(i16* %ptr, i16 %val) { ; and on i16, seq_cst: sync expected before the lharx/sthcx. loop, lwsync after
+; PPC64LE-LABEL: test189:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB189_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB189_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i16* %ptr, i16 %val seq_cst
+  ret i16 %ret
+}
+
+define i32 @test190(i32* %ptr, i32 %val) { ; and on i32, monotonic: bare lwarx/stwcx. retry loop, no barriers expected
+; PPC64LE-LABEL: test190:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB190_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB190_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i32* %ptr, i32 %val monotonic
+  ret i32 %ret
+}
+
+define i32 @test191(i32* %ptr, i32 %val) { ; and on i32, acquire: lwsync expected only after the lwarx/stwcx. loop
+; PPC64LE-LABEL: test191:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB191_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    and 6, 4, 3
+; PPC64LE-NEXT:    stwcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB191_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i32* %ptr, i32 %val acquire
+  ret i32 %ret
+}
+
+define i32 @test192(i32* %ptr, i32 %val) { ; and on i32, release: lwsync expected only before the lwarx/stwcx. loop
+; PPC64LE-LABEL: test192:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB192_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB192_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i32* %ptr, i32 %val release
+  ret i32 %ret
+}
+
+define i32 @test193(i32* %ptr, i32 %val) { ; and on i32, acq_rel: lwsync expected on both sides of the lwarx/stwcx. loop
+; PPC64LE-LABEL: test193:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB193_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB193_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i32* %ptr, i32 %val acq_rel
+  ret i32 %ret
+}
+
+define i32 @test194(i32* %ptr, i32 %val) { ; and on i32, seq_cst: sync expected before the lwarx/stwcx. loop, lwsync after
+; PPC64LE-LABEL: test194:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB194_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB194_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i32* %ptr, i32 %val seq_cst
+  ret i32 %ret
+}
+
+define i64 @test195(i64* %ptr, i64 %val) { ; and on i64, monotonic: bare ldarx/stdcx. retry loop, no barriers expected
+; PPC64LE-LABEL: test195:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB195_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB195_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i64* %ptr, i64 %val monotonic
+  ret i64 %ret
+}
+
+define i64 @test196(i64* %ptr, i64 %val) { ; and on i64, acquire: lwsync expected only after the ldarx/stdcx. loop
+; PPC64LE-LABEL: test196:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB196_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    and 6, 4, 3
+; PPC64LE-NEXT:    stdcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB196_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i64* %ptr, i64 %val acquire
+  ret i64 %ret
+}
+
+define i64 @test197(i64* %ptr, i64 %val) { ; and on i64, release: lwsync expected only before the ldarx/stdcx. loop
+; PPC64LE-LABEL: test197:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB197_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB197_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i64* %ptr, i64 %val release
+  ret i64 %ret
+}
+
+define i64 @test198(i64* %ptr, i64 %val) { ; and on i64, acq_rel: lwsync expected on both sides of the ldarx/stdcx. loop
+; PPC64LE-LABEL: test198:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB198_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB198_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i64* %ptr, i64 %val acq_rel
+  ret i64 %ret
+}
+
+define i64 @test199(i64* %ptr, i64 %val) { ; and on i64, seq_cst: sync expected before the ldarx/stdcx. loop, lwsync after
+; PPC64LE-LABEL: test199:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB199_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB199_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i64* %ptr, i64 %val seq_cst
+  ret i64 %ret
+}
+
+define i8 @test200(i8* %ptr, i8 %val) { ; nand on i8, monotonic: bare lbarx/stbcx. retry loop, no barriers expected
+; PPC64LE-LABEL: test200:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB200_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB200_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i8* %ptr, i8 %val monotonic
+  ret i8 %ret
+}
+
+define i8 @test201(i8* %ptr, i8 %val) { ; nand on i8, acquire: lwsync expected only after the lbarx/stbcx. loop
+; PPC64LE-LABEL: test201:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB201_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    nand 6, 4, 3
+; PPC64LE-NEXT:    stbcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB201_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i8* %ptr, i8 %val acquire
+  ret i8 %ret
+}
+
+define i8 @test202(i8* %ptr, i8 %val) { ; nand on i8, release: lwsync expected only before the lbarx/stbcx. loop
+; PPC64LE-LABEL: test202:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB202_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB202_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i8* %ptr, i8 %val release
+  ret i8 %ret
+}
+
+define i8 @test203(i8* %ptr, i8 %val) { ; nand on i8, acq_rel: lwsync expected on both sides of the lbarx/stbcx. loop
+; PPC64LE-LABEL: test203:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB203_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB203_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i8* %ptr, i8 %val acq_rel
+  ret i8 %ret
+}
+
+define i8 @test204(i8* %ptr, i8 %val) { ; nand on i8, seq_cst: sync expected before the lbarx/stbcx. loop, lwsync after
+; PPC64LE-LABEL: test204:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB204_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB204_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i8* %ptr, i8 %val seq_cst
+  ret i8 %ret
+}
+
+define i16 @test205(i16* %ptr, i16 %val) { ; nand on i16, monotonic: bare lharx/sthcx. retry loop, no barriers expected
+; PPC64LE-LABEL: test205:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB205_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB205_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i16* %ptr, i16 %val monotonic
+  ret i16 %ret
+}
+
+define i16 @test206(i16* %ptr, i16 %val) { ; nand on i16, acquire: lwsync expected only after the lharx/sthcx. loop
+; PPC64LE-LABEL: test206:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB206_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    nand 6, 4, 3
+; PPC64LE-NEXT:    sthcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB206_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i16* %ptr, i16 %val acquire
+  ret i16 %ret
+}
+
+define i16 @test207(i16* %ptr, i16 %val) { ; nand on i16, release: lwsync expected only before the lharx/sthcx. loop
+; PPC64LE-LABEL: test207:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB207_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB207_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i16* %ptr, i16 %val release
+  ret i16 %ret
+}
+
+define i16 @test208(i16* %ptr, i16 %val) { ; nand on i16, acq_rel: lwsync expected on both sides of the lharx/sthcx. loop
+; PPC64LE-LABEL: test208:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB208_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB208_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i16* %ptr, i16 %val acq_rel
+  ret i16 %ret
+}
+
+define i16 @test209(i16* %ptr, i16 %val) { ; nand on i16, seq_cst: sync expected before the lharx/sthcx. loop, lwsync after
+; PPC64LE-LABEL: test209:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB209_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB209_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i16* %ptr, i16 %val seq_cst
+  ret i16 %ret
+}
+
+define i32 @test210(i32* %ptr, i32 %val) { ; nand on i32, monotonic: bare lwarx/stwcx. retry loop, no barriers expected
+; PPC64LE-LABEL: test210:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB210_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB210_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i32* %ptr, i32 %val monotonic
+  ret i32 %ret
+}
+
+define i32 @test211(i32* %ptr, i32 %val) { ; nand on i32, acquire: lwsync expected only after the lwarx/stwcx. loop
+; PPC64LE-LABEL: test211:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB211_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    nand 6, 4, 3
+; PPC64LE-NEXT:    stwcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB211_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i32* %ptr, i32 %val acquire
+  ret i32 %ret
+}
+
+define i32 @test212(i32* %ptr, i32 %val) { ; nand on i32, release: lwsync expected only before the lwarx/stwcx. loop
+; PPC64LE-LABEL: test212:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB212_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB212_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i32* %ptr, i32 %val release
+  ret i32 %ret
+}
+
+define i32 @test213(i32* %ptr, i32 %val) { ; nand on i32, acq_rel: lwsync expected on both sides of the lwarx/stwcx. loop
+; PPC64LE-LABEL: test213:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB213_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB213_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i32* %ptr, i32 %val acq_rel
+  ret i32 %ret
+}
+
+define i32 @test214(i32* %ptr, i32 %val) { ; nand on i32, seq_cst: sync expected before the lwarx/stwcx. loop, lwsync after
+; PPC64LE-LABEL: test214:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB214_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB214_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i32* %ptr, i32 %val seq_cst
+  ret i32 %ret
+}
+
+define i64 @test215(i64* %ptr, i64 %val) { ; nand on i64, monotonic: bare ldarx/stdcx. retry loop, no barriers expected
+; PPC64LE-LABEL: test215:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB215_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB215_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i64* %ptr, i64 %val monotonic
+  ret i64 %ret
+}
+
+define i64 @test216(i64* %ptr, i64 %val) { ; nand on i64, acquire: lwsync expected only after the ldarx/stdcx. loop
+; PPC64LE-LABEL: test216:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB216_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    nand 6, 4, 3
+; PPC64LE-NEXT:    stdcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB216_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i64* %ptr, i64 %val acquire
+  ret i64 %ret
+}
+
+define i64 @test217(i64* %ptr, i64 %val) { ; nand on i64, release: lwsync expected only before the ldarx/stdcx. loop
+; PPC64LE-LABEL: test217:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB217_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB217_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i64* %ptr, i64 %val release
+  ret i64 %ret
+}
+
+define i64 @test218(i64* %ptr, i64 %val) { ; nand on i64, acq_rel: lwsync expected on both sides of the ldarx/stdcx. loop
+; PPC64LE-LABEL: test218:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB218_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB218_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i64* %ptr, i64 %val acq_rel
+  ret i64 %ret
+}
+
+define i64 @test219(i64* %ptr, i64 %val) { ; nand on i64, seq_cst: sync expected before the ldarx/stdcx. loop, lwsync after
+; PPC64LE-LABEL: test219:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB219_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB219_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i64* %ptr, i64 %val seq_cst
+  ret i64 %ret
+}
+
+define i8 @test220(i8* %ptr, i8 %val) { ; or on i8, monotonic: bare lbarx/stbcx. retry loop, no barriers expected
+; PPC64LE-LABEL: test220:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB220_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB220_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i8* %ptr, i8 %val monotonic
+  ret i8 %ret
+}
+
+define i8 @test221(i8* %ptr, i8 %val) { ; or on i8, acquire: lwsync expected only after the lbarx/stbcx. loop
+; PPC64LE-LABEL: test221:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB221_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    or 6, 4, 3
+; PPC64LE-NEXT:    stbcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB221_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i8* %ptr, i8 %val acquire
+  ret i8 %ret
+}
+
+define i8 @test222(i8* %ptr, i8 %val) { ; or on i8, release: lwsync expected only before the lbarx/stbcx. loop
+; PPC64LE-LABEL: test222:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB222_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB222_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i8* %ptr, i8 %val release
+  ret i8 %ret
+}
+
+define i8 @test223(i8* %ptr, i8 %val) { ; or on i8, acq_rel: lwsync expected on both sides of the lbarx/stbcx. loop
+; PPC64LE-LABEL: test223:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB223_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB223_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i8* %ptr, i8 %val acq_rel
+  ret i8 %ret
+}
+
+define i8 @test224(i8* %ptr, i8 %val) { ; or on i8, seq_cst: sync expected before the lbarx/stbcx. loop, lwsync after
+; PPC64LE-LABEL: test224:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB224_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB224_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i8* %ptr, i8 %val seq_cst
+  ret i8 %ret
+}
+
+define i16 @test225(i16* %ptr, i16 %val) { ; or on i16, monotonic: bare lharx/sthcx. retry loop, no barriers expected
+; PPC64LE-LABEL: test225:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB225_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB225_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i16* %ptr, i16 %val monotonic
+  ret i16 %ret
+}
+
+define i16 @test226(i16* %ptr, i16 %val) { ; or on i16, acquire: lwsync expected only after the lharx/sthcx. loop
+; PPC64LE-LABEL: test226:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB226_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    or 6, 4, 3
+; PPC64LE-NEXT:    sthcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB226_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i16* %ptr, i16 %val acquire
+  ret i16 %ret
+}
+
+define i16 @test227(i16* %ptr, i16 %val) { ; or on i16, release: lwsync expected only before the lharx/sthcx. loop
+; PPC64LE-LABEL: test227:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB227_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB227_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i16* %ptr, i16 %val release
+  ret i16 %ret
+}
+
+define i16 @test228(i16* %ptr, i16 %val) { ; or on i16, acq_rel: lwsync expected on both sides of the lharx/sthcx. loop
+; PPC64LE-LABEL: test228:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB228_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB228_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i16* %ptr, i16 %val acq_rel
+  ret i16 %ret
+}
+
+define i16 @test229(i16* %ptr, i16 %val) { ; or on i16, seq_cst: sync expected before the lharx/sthcx. loop, lwsync after
+; PPC64LE-LABEL: test229:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB229_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB229_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i16* %ptr, i16 %val seq_cst
+  ret i16 %ret
+}
+
+define i32 @test230(i32* %ptr, i32 %val) { ; or on i32, monotonic: bare lwarx/stwcx. retry loop, no barriers expected
+; PPC64LE-LABEL: test230:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB230_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB230_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i32* %ptr, i32 %val monotonic
+  ret i32 %ret
+}
+
+define i32 @test231(i32* %ptr, i32 %val) { ; or on i32, acquire: lwsync expected only after the lwarx/stwcx. loop
+; PPC64LE-LABEL: test231:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB231_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    or 6, 4, 3
+; PPC64LE-NEXT:    stwcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB231_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i32* %ptr, i32 %val acquire
+  ret i32 %ret
+}
+
+define i32 @test232(i32* %ptr, i32 %val) { ; or on i32, release: lwsync expected only before the lwarx/stwcx. loop
+; PPC64LE-LABEL: test232:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB232_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB232_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i32* %ptr, i32 %val release
+  ret i32 %ret
+}
+
+define i32 @test233(i32* %ptr, i32 %val) { ; or on i32, acq_rel: lwsync expected on both sides of the lwarx/stwcx. loop
+; PPC64LE-LABEL: test233:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB233_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB233_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i32* %ptr, i32 %val acq_rel
+  ret i32 %ret
+}
+
+define i32 @test234(i32* %ptr, i32 %val) { ; or on i32, seq_cst: sync expected before the lwarx/stwcx. loop, lwsync after
+; PPC64LE-LABEL: test234:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB234_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB234_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i32* %ptr, i32 %val seq_cst
+  ret i32 %ret
+}
+
+define i64 @test235(i64* %ptr, i64 %val) { ; or on i64, monotonic: bare ldarx/stdcx. retry loop, no barriers expected
+; PPC64LE-LABEL: test235:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB235_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB235_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i64* %ptr, i64 %val monotonic
+  ret i64 %ret
+}
+
+define i64 @test236(i64* %ptr, i64 %val) { ; or on i64, acquire: lwsync expected only after the ldarx/stdcx. loop
+; PPC64LE-LABEL: test236:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB236_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    or 6, 4, 3
+; PPC64LE-NEXT:    stdcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB236_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i64* %ptr, i64 %val acquire
+  ret i64 %ret
+}
+
+define i64 @test237(i64* %ptr, i64 %val) { ; or on i64, release: lwsync expected only before the ldarx/stdcx. loop
+; PPC64LE-LABEL: test237:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB237_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB237_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i64* %ptr, i64 %val release
+  ret i64 %ret
+}
+
+define i64 @test238(i64* %ptr, i64 %val) { ; or on i64, acq_rel: lwsync expected on both sides of the ldarx/stdcx. loop
+; PPC64LE-LABEL: test238:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB238_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB238_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i64* %ptr, i64 %val acq_rel
+  ret i64 %ret
+}
+
+define i64 @test239(i64* %ptr, i64 %val) { ; or on i64, seq_cst: sync expected before the ldarx/stdcx. loop, lwsync after
+; PPC64LE-LABEL: test239:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB239_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB239_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i64* %ptr, i64 %val seq_cst
+  ret i64 %ret
+}
+
+define i8 @test240(i8* %ptr, i8 %val) { ; xor on i8, monotonic: bare lbarx/stbcx. retry loop, no barriers expected
+; PPC64LE-LABEL: test240:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB240_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB240_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i8* %ptr, i8 %val monotonic
+  ret i8 %ret
+}
+
+define i8 @test241(i8* %ptr, i8 %val) { ; xor on i8, acquire: lwsync expected only after the lbarx/stbcx. loop
+; PPC64LE-LABEL: test241:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB241_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    xor 6, 4, 3
+; PPC64LE-NEXT:    stbcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB241_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i8* %ptr, i8 %val acquire
+  ret i8 %ret
+}
+
+define i8 @test242(i8* %ptr, i8 %val) { ; xor on i8, release: lwsync expected only before the lbarx/stbcx. loop
+; PPC64LE-LABEL: test242:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB242_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB242_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i8* %ptr, i8 %val release
+  ret i8 %ret
+}
+
+define i8 @test243(i8* %ptr, i8 %val) { ; xor on i8, acq_rel: lwsync expected on both sides of the lbarx/stbcx. loop
+; PPC64LE-LABEL: test243:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB243_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB243_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i8* %ptr, i8 %val acq_rel
+  ret i8 %ret
+}
+
+define i8 @test244(i8* %ptr, i8 %val) { ; xor on i8, seq_cst: sync expected before the lbarx/stbcx. loop, lwsync after
+; PPC64LE-LABEL: test244:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB244_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB244_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i8* %ptr, i8 %val seq_cst
+  ret i8 %ret
+}
+
+define i16 @test245(i16* %ptr, i16 %val) { ; xor on i16, monotonic: bare lharx/sthcx. retry loop, no barriers expected
+; PPC64LE-LABEL: test245:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB245_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB245_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i16* %ptr, i16 %val monotonic
+  ret i16 %ret
+}
+
+define i16 @test246(i16* %ptr, i16 %val) { ; xor on i16, acquire: lwsync expected only after the lharx/sthcx. loop
+; PPC64LE-LABEL: test246:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB246_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    xor 6, 4, 3
+; PPC64LE-NEXT:    sthcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB246_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i16* %ptr, i16 %val acquire
+  ret i16 %ret
+}
+
+define i16 @test247(i16* %ptr, i16 %val) { ; xor on i16, release: lwsync expected only before the lharx/sthcx. loop
+; PPC64LE-LABEL: test247:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB247_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB247_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i16* %ptr, i16 %val release
+  ret i16 %ret
+}
+
+define i16 @test248(i16* %ptr, i16 %val) { ; xor on i16, acq_rel: lwsync expected on both sides of the lharx/sthcx. loop
+; PPC64LE-LABEL: test248:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB248_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB248_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i16* %ptr, i16 %val acq_rel
+  ret i16 %ret
+}
+
+define i16 @test249(i16* %ptr, i16 %val) { ; xor on i16, seq_cst: sync expected before the lharx/sthcx. loop, lwsync after
+; PPC64LE-LABEL: test249:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB249_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB249_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i16* %ptr, i16 %val seq_cst
+  ret i16 %ret
+}
+
+define i32 @test250(i32* %ptr, i32 %val) { ; xor on i32, monotonic: bare lwarx/stwcx. retry loop, no barriers expected
+; PPC64LE-LABEL: test250:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB250_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB250_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i32* %ptr, i32 %val monotonic
+  ret i32 %ret
+}
+
+define i32 @test251(i32* %ptr, i32 %val) { ; xor on i32, acquire: lwsync expected only after the lwarx/stwcx. loop
+; PPC64LE-LABEL: test251:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB251_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    xor 6, 4, 3
+; PPC64LE-NEXT:    stwcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB251_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i32* %ptr, i32 %val acquire
+  ret i32 %ret
+}
+
+define i32 @test252(i32* %ptr, i32 %val) { ; xor on i32, release: lwsync expected only before the lwarx/stwcx. loop
+; PPC64LE-LABEL: test252:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB252_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB252_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i32* %ptr, i32 %val release
+  ret i32 %ret
+}
+
+define i32 @test253(i32* %ptr, i32 %val) { ; xor on i32, acq_rel: lwsync expected on both sides of the lwarx/stwcx. loop
+; PPC64LE-LABEL: test253:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB253_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB253_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i32* %ptr, i32 %val acq_rel
+  ret i32 %ret
+}
+
+define i32 @test254(i32* %ptr, i32 %val) { ; xor on i32, seq_cst: sync expected before the lwarx/stwcx. loop, lwsync after
+; PPC64LE-LABEL: test254:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB254_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB254_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i32* %ptr, i32 %val seq_cst
+  ret i32 %ret
+}
+
+define i64 @test255(i64* %ptr, i64 %val) { ; xor on i64, monotonic: bare ldarx/stdcx. retry loop, no barriers expected
+; PPC64LE-LABEL: test255:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB255_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB255_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i64* %ptr, i64 %val monotonic
+  ret i64 %ret
+}
+
+define i64 @test256(i64* %ptr, i64 %val) { ; xor on i64, acquire: lwsync expected only after the ldarx/stdcx. loop
+; PPC64LE-LABEL: test256:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB256_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    xor 6, 4, 3
+; PPC64LE-NEXT:    stdcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB256_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i64* %ptr, i64 %val acquire
+  ret i64 %ret
+}
+
+define i64 @test257(i64* %ptr, i64 %val) { ; xor on i64, release: lwsync expected only before the ldarx/stdcx. loop
+; PPC64LE-LABEL: test257:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB257_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB257_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i64* %ptr, i64 %val release
+  ret i64 %ret
+}
+
+define i64 @test258(i64* %ptr, i64 %val) { ; xor on i64, acq_rel: lwsync expected on both sides of the ldarx/stdcx. loop
+; PPC64LE-LABEL: test258:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB258_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB258_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i64* %ptr, i64 %val acq_rel
+  ret i64 %ret
+}
+
+define i64 @test259(i64* %ptr, i64 %val) { ; xor on i64, seq_cst: sync expected before the ldarx/stdcx. loop, lwsync after
+; PPC64LE-LABEL: test259:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB259_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB259_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i64* %ptr, i64 %val seq_cst
+  ret i64 %ret
+}
+
+define i8 @test260(i8* %ptr, i8 %val) { ; max on i8, monotonic: extsb+cmpw guard whose ble bypasses the stbcx.; no barriers expected
+; PPC64LE-LABEL: test260:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB260_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    extsb 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB260_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB260_1
+; PPC64LE-NEXT:  .LBB260_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i8* %ptr, i8 %val monotonic
+  ret i8 %ret
+}
+
+define i8 @test261(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test261:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB261_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    extsb 6, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB261_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB261_1
+; PPC64LE-NEXT:  .LBB261_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i8* %ptr, i8 %val acquire
+  ret i8 %ret
+}
+
+define i8 @test262(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test262:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB262_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    extsb 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB262_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB262_1
+; PPC64LE-NEXT:  .LBB262_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i8* %ptr, i8 %val release
+  ret i8 %ret
+}
+
+define i8 @test263(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test263:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB263_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    extsb 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB263_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB263_1
+; PPC64LE-NEXT:  .LBB263_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i8* %ptr, i8 %val acq_rel
+  ret i8 %ret
+}
+
+define i8 @test264(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test264:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB264_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    extsb 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB264_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB264_1
+; PPC64LE-NEXT:  .LBB264_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i8* %ptr, i8 %val seq_cst
+  ret i8 %ret
+}
+
+define i16 @test265(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test265:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB265_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    extsh 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB265_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB265_1
+; PPC64LE-NEXT:  .LBB265_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i16* %ptr, i16 %val monotonic
+  ret i16 %ret
+}
+
+define i16 @test266(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test266:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB266_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    extsh 6, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB266_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB266_1
+; PPC64LE-NEXT:  .LBB266_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i16* %ptr, i16 %val acquire
+  ret i16 %ret
+}
+
+define i16 @test267(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test267:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB267_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    extsh 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB267_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB267_1
+; PPC64LE-NEXT:  .LBB267_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i16* %ptr, i16 %val release
+  ret i16 %ret
+}
+
+define i16 @test268(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test268:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB268_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    extsh 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB268_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB268_1
+; PPC64LE-NEXT:  .LBB268_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i16* %ptr, i16 %val acq_rel
+  ret i16 %ret
+}
+
+define i16 @test269(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test269:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB269_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    extsh 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB269_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB269_1
+; PPC64LE-NEXT:  .LBB269_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i16* %ptr, i16 %val seq_cst
+  ret i16 %ret
+}
+
+define i32 @test270(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test270:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB270_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB270_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB270_1
+; PPC64LE-NEXT:  .LBB270_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i32* %ptr, i32 %val monotonic
+  ret i32 %ret
+}
+
+define i32 @test271(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test271:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB271_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    cmpw 4, 3
+; PPC64LE-NEXT:    ble 0, .LBB271_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB271_1
+; PPC64LE-NEXT:  .LBB271_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i32* %ptr, i32 %val acquire
+  ret i32 %ret
+}
+
+define i32 @test272(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test272:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB272_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB272_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB272_1
+; PPC64LE-NEXT:  .LBB272_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i32* %ptr, i32 %val release
+  ret i32 %ret
+}
+
+define i32 @test273(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test273:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB273_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB273_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB273_1
+; PPC64LE-NEXT:  .LBB273_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i32* %ptr, i32 %val acq_rel
+  ret i32 %ret
+}
+
+define i32 @test274(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test274:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB274_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB274_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB274_1
+; PPC64LE-NEXT:  .LBB274_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i32* %ptr, i32 %val seq_cst
+  ret i32 %ret
+}
+
+define i64 @test275(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test275:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB275_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB275_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB275_1
+; PPC64LE-NEXT:  .LBB275_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i64* %ptr, i64 %val monotonic
+  ret i64 %ret
+}
+
+define i64 @test276(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test276:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB276_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    cmpd 4, 3
+; PPC64LE-NEXT:    ble 0, .LBB276_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB276_1
+; PPC64LE-NEXT:  .LBB276_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i64* %ptr, i64 %val acquire
+  ret i64 %ret
+}
+
+define i64 @test277(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test277:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB277_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB277_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB277_1
+; PPC64LE-NEXT:  .LBB277_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i64* %ptr, i64 %val release
+  ret i64 %ret
+}
+
+define i64 @test278(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test278:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB278_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB278_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB278_1
+; PPC64LE-NEXT:  .LBB278_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i64* %ptr, i64 %val acq_rel
+  ret i64 %ret
+}
+
+define i64 @test279(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test279:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB279_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB279_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB279_1
+; PPC64LE-NEXT:  .LBB279_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i64* %ptr, i64 %val seq_cst
+  ret i64 %ret
+}
+
+define i8 @test280(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test280:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB280_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    extsb 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB280_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB280_1
+; PPC64LE-NEXT:  .LBB280_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i8* %ptr, i8 %val monotonic
+  ret i8 %ret
+}
+
+define i8 @test281(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test281:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB281_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    extsb 6, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB281_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB281_1
+; PPC64LE-NEXT:  .LBB281_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i8* %ptr, i8 %val acquire
+  ret i8 %ret
+}
+
+define i8 @test282(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test282:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB282_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    extsb 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB282_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB282_1
+; PPC64LE-NEXT:  .LBB282_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i8* %ptr, i8 %val release
+  ret i8 %ret
+}
+
+define i8 @test283(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test283:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB283_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    extsb 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB283_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB283_1
+; PPC64LE-NEXT:  .LBB283_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i8* %ptr, i8 %val acq_rel
+  ret i8 %ret
+}
+
+define i8 @test284(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test284:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB284_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    extsb 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB284_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB284_1
+; PPC64LE-NEXT:  .LBB284_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i8* %ptr, i8 %val seq_cst
+  ret i8 %ret
+}
+
+define i16 @test285(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test285:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB285_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    extsh 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB285_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB285_1
+; PPC64LE-NEXT:  .LBB285_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i16* %ptr, i16 %val monotonic
+  ret i16 %ret
+}
+
+define i16 @test286(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test286:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB286_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    extsh 6, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB286_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB286_1
+; PPC64LE-NEXT:  .LBB286_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i16* %ptr, i16 %val acquire
+  ret i16 %ret
+}
+
+define i16 @test287(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test287:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB287_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    extsh 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB287_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB287_1
+; PPC64LE-NEXT:  .LBB287_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i16* %ptr, i16 %val release
+  ret i16 %ret
+}
+
+define i16 @test288(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test288:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB288_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    extsh 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB288_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB288_1
+; PPC64LE-NEXT:  .LBB288_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i16* %ptr, i16 %val acq_rel
+  ret i16 %ret
+}
+
+define i16 @test289(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test289:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB289_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    extsh 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB289_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB289_1
+; PPC64LE-NEXT:  .LBB289_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i16* %ptr, i16 %val seq_cst
+  ret i16 %ret
+}
+
+define i32 @test290(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test290:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB290_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB290_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB290_1
+; PPC64LE-NEXT:  .LBB290_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i32* %ptr, i32 %val monotonic
+  ret i32 %ret
+}
+
+define i32 @test291(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test291:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB291_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    cmpw 4, 3
+; PPC64LE-NEXT:    bge 0, .LBB291_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB291_1
+; PPC64LE-NEXT:  .LBB291_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i32* %ptr, i32 %val acquire
+  ret i32 %ret
+}
+
+define i32 @test292(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test292:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB292_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB292_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB292_1
+; PPC64LE-NEXT:  .LBB292_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i32* %ptr, i32 %val release
+  ret i32 %ret
+}
+
+define i32 @test293(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test293:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB293_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB293_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB293_1
+; PPC64LE-NEXT:  .LBB293_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i32* %ptr, i32 %val acq_rel
+  ret i32 %ret
+}
+
+define i32 @test294(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test294:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB294_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB294_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB294_1
+; PPC64LE-NEXT:  .LBB294_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i32* %ptr, i32 %val seq_cst
+  ret i32 %ret
+}
+
+define i64 @test295(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test295:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB295_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB295_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB295_1
+; PPC64LE-NEXT:  .LBB295_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i64* %ptr, i64 %val monotonic
+  ret i64 %ret
+}
+
+define i64 @test296(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test296:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB296_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    cmpd 4, 3
+; PPC64LE-NEXT:    bge 0, .LBB296_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB296_1
+; PPC64LE-NEXT:  .LBB296_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i64* %ptr, i64 %val acquire
+  ret i64 %ret
+}
+
+define i64 @test297(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test297:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB297_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB297_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB297_1
+; PPC64LE-NEXT:  .LBB297_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i64* %ptr, i64 %val release
+  ret i64 %ret
+}
+
+define i64 @test298(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test298:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB298_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB298_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB298_1
+; PPC64LE-NEXT:  .LBB298_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i64* %ptr, i64 %val acq_rel
+  ret i64 %ret
+}
+
+define i64 @test299(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test299:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB299_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB299_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB299_1
+; PPC64LE-NEXT:  .LBB299_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i64* %ptr, i64 %val seq_cst
+  ret i64 %ret
+}
+
+define i8 @test300(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test300:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB300_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB300_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB300_1
+; PPC64LE-NEXT:  .LBB300_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i8* %ptr, i8 %val monotonic
+  ret i8 %ret
+}
+
+define i8 @test301(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test301:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB301_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    cmplw 4, 3
+; PPC64LE-NEXT:    ble 0, .LBB301_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB301_1
+; PPC64LE-NEXT:  .LBB301_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i8* %ptr, i8 %val acquire
+  ret i8 %ret
+}
+
+define i8 @test302(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test302:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB302_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB302_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB302_1
+; PPC64LE-NEXT:  .LBB302_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i8* %ptr, i8 %val release
+  ret i8 %ret
+}
+
+define i8 @test303(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test303:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB303_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB303_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB303_1
+; PPC64LE-NEXT:  .LBB303_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i8* %ptr, i8 %val acq_rel
+  ret i8 %ret
+}
+
+define i8 @test304(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test304:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB304_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB304_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB304_1
+; PPC64LE-NEXT:  .LBB304_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i8* %ptr, i8 %val seq_cst
+  ret i8 %ret
+}
+
+define i16 @test305(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test305:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB305_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB305_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB305_1
+; PPC64LE-NEXT:  .LBB305_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i16* %ptr, i16 %val monotonic
+  ret i16 %ret
+}
+
+define i16 @test306(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test306:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB306_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    cmplw 4, 3
+; PPC64LE-NEXT:    ble 0, .LBB306_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB306_1
+; PPC64LE-NEXT:  .LBB306_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i16* %ptr, i16 %val acquire
+  ret i16 %ret
+}
+
+define i16 @test307(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test307:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB307_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB307_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB307_1
+; PPC64LE-NEXT:  .LBB307_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i16* %ptr, i16 %val release
+  ret i16 %ret
+}
+
+define i16 @test308(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test308:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB308_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB308_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB308_1
+; PPC64LE-NEXT:  .LBB308_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i16* %ptr, i16 %val acq_rel
+  ret i16 %ret
+}
+
+define i16 @test309(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test309:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB309_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB309_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB309_1
+; PPC64LE-NEXT:  .LBB309_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i16* %ptr, i16 %val seq_cst
+  ret i16 %ret
+}
+
+define i32 @test310(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test310:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB310_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB310_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB310_1
+; PPC64LE-NEXT:  .LBB310_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i32* %ptr, i32 %val monotonic
+  ret i32 %ret
+}
+
+define i32 @test311(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test311:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB311_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    cmplw 4, 3
+; PPC64LE-NEXT:    ble 0, .LBB311_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB311_1
+; PPC64LE-NEXT:  .LBB311_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i32* %ptr, i32 %val acquire
+  ret i32 %ret
+}
+
+define i32 @test312(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test312:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB312_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB312_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB312_1
+; PPC64LE-NEXT:  .LBB312_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i32* %ptr, i32 %val release
+  ret i32 %ret
+}
+
+define i32 @test313(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test313:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB313_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB313_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB313_1
+; PPC64LE-NEXT:  .LBB313_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i32* %ptr, i32 %val acq_rel
+  ret i32 %ret
+}
+
+define i32 @test314(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test314:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB314_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB314_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB314_1
+; PPC64LE-NEXT:  .LBB314_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i32* %ptr, i32 %val seq_cst
+  ret i32 %ret
+}
+
+define i64 @test315(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test315:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB315_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpld 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB315_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB315_1
+; PPC64LE-NEXT:  .LBB315_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i64* %ptr, i64 %val monotonic
+  ret i64 %ret
+}
+
+define i64 @test316(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test316:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB316_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    cmpld 4, 3
+; PPC64LE-NEXT:    ble 0, .LBB316_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB316_1
+; PPC64LE-NEXT:  .LBB316_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i64* %ptr, i64 %val acquire
+  ret i64 %ret
+}
+
+define i64 @test317(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test317:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB317_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpld 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB317_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB317_1
+; PPC64LE-NEXT:  .LBB317_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i64* %ptr, i64 %val release
+  ret i64 %ret
+}
+
+define i64 @test318(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test318:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB318_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpld 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB318_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB318_1
+; PPC64LE-NEXT:  .LBB318_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i64* %ptr, i64 %val acq_rel
+  ret i64 %ret
+}
+
+define i64 @test319(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test319:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB319_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpld 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB319_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB319_1
+; PPC64LE-NEXT:  .LBB319_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i64* %ptr, i64 %val seq_cst
+  ret i64 %ret
+}
+
+define i8 @test320(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test320:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB320_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB320_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB320_1
+; PPC64LE-NEXT:  .LBB320_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i8* %ptr, i8 %val monotonic
+  ret i8 %ret
+}
+
+define i8 @test321(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test321:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB321_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    cmplw 4, 3
+; PPC64LE-NEXT:    bge 0, .LBB321_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB321_1
+; PPC64LE-NEXT:  .LBB321_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i8* %ptr, i8 %val acquire
+  ret i8 %ret
+}
+
+define i8 @test322(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test322:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB322_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB322_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB322_1
+; PPC64LE-NEXT:  .LBB322_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i8* %ptr, i8 %val release
+  ret i8 %ret
+}
+
+define i8 @test323(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test323:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB323_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB323_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB323_1
+; PPC64LE-NEXT:  .LBB323_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i8* %ptr, i8 %val acq_rel
+  ret i8 %ret
+}
+
+define i8 @test324(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test324:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB324_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB324_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB324_1
+; PPC64LE-NEXT:  .LBB324_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i8* %ptr, i8 %val seq_cst
+  ret i8 %ret
+}
+
+define i16 @test325(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test325:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB325_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB325_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB325_1
+; PPC64LE-NEXT:  .LBB325_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i16* %ptr, i16 %val monotonic
+  ret i16 %ret
+}
+
+define i16 @test326(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test326:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB326_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    cmplw 4, 3
+; PPC64LE-NEXT:    bge 0, .LBB326_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB326_1
+; PPC64LE-NEXT:  .LBB326_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i16* %ptr, i16 %val acquire
+  ret i16 %ret
+}
+
+define i16 @test327(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test327:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB327_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB327_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB327_1
+; PPC64LE-NEXT:  .LBB327_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i16* %ptr, i16 %val release
+  ret i16 %ret
+}
+
+define i16 @test328(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test328:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB328_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB328_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB328_1
+; PPC64LE-NEXT:  .LBB328_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i16* %ptr, i16 %val acq_rel
+  ret i16 %ret
+}
+
+define i16 @test329(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test329:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB329_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB329_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB329_1
+; PPC64LE-NEXT:  .LBB329_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i16* %ptr, i16 %val seq_cst
+  ret i16 %ret
+}
+
+define i32 @test330(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test330:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB330_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB330_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB330_1
+; PPC64LE-NEXT:  .LBB330_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i32* %ptr, i32 %val monotonic
+  ret i32 %ret
+}
+
+define i32 @test331(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test331:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB331_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    cmplw 4, 3
+; PPC64LE-NEXT:    bge 0, .LBB331_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB331_1
+; PPC64LE-NEXT:  .LBB331_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i32* %ptr, i32 %val acquire
+  ret i32 %ret
+}
+
+define i32 @test332(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test332:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB332_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB332_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB332_1
+; PPC64LE-NEXT:  .LBB332_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i32* %ptr, i32 %val release
+  ret i32 %ret
+}
+
+define i32 @test333(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test333:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB333_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB333_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB333_1
+; PPC64LE-NEXT:  .LBB333_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i32* %ptr, i32 %val acq_rel
+  ret i32 %ret
+}
+
+define i32 @test334(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test334:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB334_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB334_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB334_1
+; PPC64LE-NEXT:  .LBB334_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i32* %ptr, i32 %val seq_cst
+  ret i32 %ret
+}
+
+define i64 @test335(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test335:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB335_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpld 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB335_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB335_1
+; PPC64LE-NEXT:  .LBB335_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i64* %ptr, i64 %val monotonic
+  ret i64 %ret
+}
+
+define i64 @test336(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test336:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB336_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    cmpld 4, 3
+; PPC64LE-NEXT:    bge 0, .LBB336_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB336_1
+; PPC64LE-NEXT:  .LBB336_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i64* %ptr, i64 %val acquire
+  ret i64 %ret
+}
+
+define i64 @test337(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test337:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB337_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpld 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB337_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB337_1
+; PPC64LE-NEXT:  .LBB337_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i64* %ptr, i64 %val release
+  ret i64 %ret
+}
+
+define i64 @test338(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test338:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB338_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpld 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB338_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB338_1
+; PPC64LE-NEXT:  .LBB338_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i64* %ptr, i64 %val acq_rel
+  ret i64 %ret
+}
+
+define i64 @test339(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test339:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB339_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpld 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB339_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB339_1
+; PPC64LE-NEXT:  .LBB339_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i64* %ptr, i64 %val seq_cst
+  ret i64 %ret
+}
+
+define i8 @test340(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test340:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB340_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB340_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread monotonic
+  ret i8 %ret
+}
+
+define i8 @test341(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test341:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB341_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    stbcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB341_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread acquire
+  ret i8 %ret
+}
+
+define i8 @test342(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test342:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB342_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB342_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread release
+  ret i8 %ret
+}
+
+define i8 @test343(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test343:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB343_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB343_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread acq_rel
+  ret i8 %ret
+}
+
+define i8 @test344(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test344:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB344_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB344_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i8* %ptr, i8 %val singlethread seq_cst
+  ret i8 %ret
+}
+
+define i16 @test345(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test345:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB345_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB345_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread monotonic
+  ret i16 %ret
+}
+
+define i16 @test346(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test346:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB346_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    sthcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB346_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread acquire
+  ret i16 %ret
+}
+
+define i16 @test347(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test347:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB347_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB347_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread release
+  ret i16 %ret
+}
+
+define i16 @test348(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test348:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB348_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB348_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread acq_rel
+  ret i16 %ret
+}
+
+define i16 @test349(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test349:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB349_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB349_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i16* %ptr, i16 %val singlethread seq_cst
+  ret i16 %ret
+}
+
+define i32 @test350(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test350:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB350_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB350_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread monotonic
+  ret i32 %ret
+}
+
+define i32 @test351(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test351:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB351_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    stwcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB351_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread acquire
+  ret i32 %ret
+}
+
+define i32 @test352(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test352:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB352_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB352_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread release
+  ret i32 %ret
+}
+
+define i32 @test353(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test353:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB353_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB353_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread acq_rel
+  ret i32 %ret
+}
+
+define i32 @test354(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test354:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB354_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB354_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i32* %ptr, i32 %val singlethread seq_cst
+  ret i32 %ret
+}
+
+define i64 @test355(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test355:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB355_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB355_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread monotonic
+  ret i64 %ret
+}
+
+define i64 @test356(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test356:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB356_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    stdcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB356_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread acquire
+  ret i64 %ret
+}
+
+define i64 @test357(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test357:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB357_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB357_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread release
+  ret i64 %ret
+}
+
+define i64 @test358(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test358:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB358_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB358_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread acq_rel
+  ret i64 %ret
+}
+
+define i64 @test359(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test359:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB359_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB359_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xchg i64* %ptr, i64 %val singlethread seq_cst
+  ret i64 %ret
+}
+
+define i8 @test360(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test360:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB360_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB360_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i8* %ptr, i8 %val singlethread monotonic
+  ret i8 %ret
+}
+
+define i8 @test361(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test361:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB361_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    add 6, 4, 3
+; PPC64LE-NEXT:    stbcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB361_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i8* %ptr, i8 %val singlethread acquire
+  ret i8 %ret
+}
+
+define i8 @test362(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test362:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB362_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB362_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i8* %ptr, i8 %val singlethread release
+  ret i8 %ret
+}
+
+define i8 @test363(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test363:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB363_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB363_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i8* %ptr, i8 %val singlethread acq_rel
+  ret i8 %ret
+}
+
+define i8 @test364(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test364:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB364_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB364_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i8* %ptr, i8 %val singlethread seq_cst
+  ret i8 %ret
+}
+
+define i16 @test365(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test365:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB365_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB365_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i16* %ptr, i16 %val singlethread monotonic
+  ret i16 %ret
+}
+
+define i16 @test366(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test366:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB366_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    add 6, 4, 3
+; PPC64LE-NEXT:    sthcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB366_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i16* %ptr, i16 %val singlethread acquire
+  ret i16 %ret
+}
+
+define i16 @test367(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test367:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB367_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB367_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i16* %ptr, i16 %val singlethread release
+  ret i16 %ret
+}
+
+define i16 @test368(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test368:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB368_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB368_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i16* %ptr, i16 %val singlethread acq_rel
+  ret i16 %ret
+}
+
+define i16 @test369(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test369:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB369_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB369_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i16* %ptr, i16 %val singlethread seq_cst
+  ret i16 %ret
+}
+
+define i32 @test370(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test370:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB370_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB370_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i32* %ptr, i32 %val singlethread monotonic
+  ret i32 %ret
+}
+
+define i32 @test371(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test371:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB371_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    add 6, 4, 3
+; PPC64LE-NEXT:    stwcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB371_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i32* %ptr, i32 %val singlethread acquire
+  ret i32 %ret
+}
+
+define i32 @test372(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test372:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB372_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB372_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i32* %ptr, i32 %val singlethread release
+  ret i32 %ret
+}
+
+define i32 @test373(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test373:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB373_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB373_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i32* %ptr, i32 %val singlethread acq_rel
+  ret i32 %ret
+}
+
+define i32 @test374(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test374:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB374_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB374_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i32* %ptr, i32 %val singlethread seq_cst
+  ret i32 %ret
+}
+
+define i64 @test375(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test375:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB375_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB375_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i64* %ptr, i64 %val singlethread monotonic
+  ret i64 %ret
+}
+
+define i64 @test376(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test376:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB376_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    add 6, 4, 3
+; PPC64LE-NEXT:    stdcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB376_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i64* %ptr, i64 %val singlethread acquire
+  ret i64 %ret
+}
+
+define i64 @test377(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test377:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB377_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB377_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i64* %ptr, i64 %val singlethread release
+  ret i64 %ret
+}
+
+define i64 @test378(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test378:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB378_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB378_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i64* %ptr, i64 %val singlethread acq_rel
+  ret i64 %ret
+}
+
+define i64 @test379(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test379:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB379_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    add 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB379_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw add i64* %ptr, i64 %val singlethread seq_cst
+  ret i64 %ret
+}
+
+define i8 @test380(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test380:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB380_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB380_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i8* %ptr, i8 %val singlethread monotonic
+  ret i8 %ret
+}
+
+define i8 @test381(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test381:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB381_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    subf 6, 4, 3
+; PPC64LE-NEXT:    stbcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB381_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i8* %ptr, i8 %val singlethread acquire
+  ret i8 %ret
+}
+
+define i8 @test382(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test382:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB382_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB382_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i8* %ptr, i8 %val singlethread release
+  ret i8 %ret
+}
+
+define i8 @test383(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test383:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB383_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB383_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i8* %ptr, i8 %val singlethread acq_rel
+  ret i8 %ret
+}
+
+define i8 @test384(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test384:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB384_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB384_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i8* %ptr, i8 %val singlethread seq_cst
+  ret i8 %ret
+}
+
+define i16 @test385(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test385:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB385_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB385_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i16* %ptr, i16 %val singlethread monotonic
+  ret i16 %ret
+}
+
+define i16 @test386(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test386:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB386_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    subf 6, 4, 3
+; PPC64LE-NEXT:    sthcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB386_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i16* %ptr, i16 %val singlethread acquire
+  ret i16 %ret
+}
+
+define i16 @test387(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test387:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB387_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB387_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i16* %ptr, i16 %val singlethread release
+  ret i16 %ret
+}
+
+define i16 @test388(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test388:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB388_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB388_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i16* %ptr, i16 %val singlethread acq_rel
+  ret i16 %ret
+}
+
+define i16 @test389(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test389:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB389_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB389_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i16* %ptr, i16 %val singlethread seq_cst
+  ret i16 %ret
+}
+
+define i32 @test390(i32* %ptr, i32 %val) { ; 32-bit atomic sub, singlethread monotonic - expects a bare lwarx/stwcx. retry loop with no sync or lwsync.
+; PPC64LE-LABEL: test390:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB390_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB390_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i32* %ptr, i32 %val singlethread monotonic
+  ret i32 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i32 @test391(i32* %ptr, i32 %val) { ; 32-bit atomic sub, singlethread acquire - expects lwsync only after the lwarx/stwcx. retry loop.
+; PPC64LE-LABEL: test391:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB391_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    subf 6, 4, 3
+; PPC64LE-NEXT:    stwcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB391_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i32* %ptr, i32 %val singlethread acquire
+  ret i32 %ret ; expected asm loads straight into register 3 after copying 3 to 5 (mr 5, 3)
+}
+
+define i32 @test392(i32* %ptr, i32 %val) { ; 32-bit atomic sub, singlethread release - expects lwsync only before the lwarx/stwcx. retry loop.
+; PPC64LE-LABEL: test392:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB392_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB392_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i32* %ptr, i32 %val singlethread release
+  ret i32 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i32 @test393(i32* %ptr, i32 %val) { ; 32-bit atomic sub, singlethread acq_rel - expects lwsync both before and after the lwarx/stwcx. retry loop.
+; PPC64LE-LABEL: test393:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB393_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB393_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i32* %ptr, i32 %val singlethread acq_rel
+  ret i32 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i32 @test394(i32* %ptr, i32 %val) { ; 32-bit atomic sub, singlethread seq_cst - expects sync before and lwsync after the lwarx/stwcx. retry loop.
+; PPC64LE-LABEL: test394:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB394_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    subf 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB394_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i32* %ptr, i32 %val singlethread seq_cst
+  ret i32 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i64 @test395(i64* %ptr, i64 %val) { ; 64-bit atomic sub, singlethread monotonic - expects a bare ldarx/stdcx. retry loop with no sync or lwsync.
+; PPC64LE-LABEL: test395:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB395_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    sub 6, 5, 4
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB395_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i64* %ptr, i64 %val singlethread monotonic
+  ret i64 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i64 @test396(i64* %ptr, i64 %val) { ; 64-bit atomic sub, singlethread acquire - expects lwsync only after the ldarx/stdcx. retry loop.
+; PPC64LE-LABEL: test396:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB396_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    sub 6, 3, 4
+; PPC64LE-NEXT:    stdcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB396_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i64* %ptr, i64 %val singlethread acquire
+  ret i64 %ret ; expected asm loads straight into register 3 after copying 3 to 5 (mr 5, 3)
+}
+
+define i64 @test397(i64* %ptr, i64 %val) { ; 64-bit atomic sub, singlethread release - expects lwsync only before the ldarx/stdcx. retry loop.
+; PPC64LE-LABEL: test397:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB397_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    sub 6, 5, 4
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB397_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i64* %ptr, i64 %val singlethread release
+  ret i64 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i64 @test398(i64* %ptr, i64 %val) { ; 64-bit atomic sub, singlethread acq_rel - expects lwsync both before and after the ldarx/stdcx. retry loop.
+; PPC64LE-LABEL: test398:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB398_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    sub 6, 5, 4
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB398_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i64* %ptr, i64 %val singlethread acq_rel
+  ret i64 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i64 @test399(i64* %ptr, i64 %val) { ; 64-bit atomic sub, singlethread seq_cst - expects sync before and lwsync after the ldarx/stdcx. retry loop.
+; PPC64LE-LABEL: test399:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB399_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    sub 6, 5, 4
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB399_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw sub i64* %ptr, i64 %val singlethread seq_cst
+  ret i64 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i8 @test400(i8* %ptr, i8 %val) { ; 8-bit atomic and, singlethread monotonic - expects a bare lbarx/stbcx. retry loop with no sync or lwsync.
+; PPC64LE-LABEL: test400:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB400_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB400_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i8* %ptr, i8 %val singlethread monotonic
+  ret i8 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i8 @test401(i8* %ptr, i8 %val) { ; 8-bit atomic and, singlethread acquire - expects lwsync only after the lbarx/stbcx. retry loop.
+; PPC64LE-LABEL: test401:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB401_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    and 6, 4, 3
+; PPC64LE-NEXT:    stbcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB401_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i8* %ptr, i8 %val singlethread acquire
+  ret i8 %ret ; expected asm loads straight into register 3 after copying 3 to 5 (mr 5, 3)
+}
+
+define i8 @test402(i8* %ptr, i8 %val) { ; 8-bit atomic and, singlethread release - expects lwsync only before the lbarx/stbcx. retry loop.
+; PPC64LE-LABEL: test402:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB402_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB402_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i8* %ptr, i8 %val singlethread release
+  ret i8 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i8 @test403(i8* %ptr, i8 %val) { ; 8-bit atomic and, singlethread acq_rel - expects lwsync both before and after the lbarx/stbcx. retry loop.
+; PPC64LE-LABEL: test403:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB403_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB403_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i8* %ptr, i8 %val singlethread acq_rel
+  ret i8 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i8 @test404(i8* %ptr, i8 %val) { ; 8-bit atomic and, singlethread seq_cst - expects sync before and lwsync after the lbarx/stbcx. retry loop.
+; PPC64LE-LABEL: test404:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB404_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB404_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i8* %ptr, i8 %val singlethread seq_cst
+  ret i8 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i16 @test405(i16* %ptr, i16 %val) { ; 16-bit atomic and, singlethread monotonic - expects a bare lharx/sthcx. retry loop with no sync or lwsync.
+; PPC64LE-LABEL: test405:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB405_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB405_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i16* %ptr, i16 %val singlethread monotonic
+  ret i16 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i16 @test406(i16* %ptr, i16 %val) { ; 16-bit atomic and, singlethread acquire - expects lwsync only after the lharx/sthcx. retry loop.
+; PPC64LE-LABEL: test406:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB406_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    and 6, 4, 3
+; PPC64LE-NEXT:    sthcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB406_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i16* %ptr, i16 %val singlethread acquire
+  ret i16 %ret ; expected asm loads straight into register 3 after copying 3 to 5 (mr 5, 3)
+}
+
+define i16 @test407(i16* %ptr, i16 %val) { ; 16-bit atomic and, singlethread release - expects lwsync only before the lharx/sthcx. retry loop.
+; PPC64LE-LABEL: test407:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB407_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB407_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i16* %ptr, i16 %val singlethread release
+  ret i16 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i16 @test408(i16* %ptr, i16 %val) { ; 16-bit atomic and, singlethread acq_rel - expects lwsync both before and after the lharx/sthcx. retry loop.
+; PPC64LE-LABEL: test408:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB408_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB408_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i16* %ptr, i16 %val singlethread acq_rel
+  ret i16 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i16 @test409(i16* %ptr, i16 %val) { ; 16-bit atomic and, singlethread seq_cst - expects sync before and lwsync after the lharx/sthcx. retry loop.
+; PPC64LE-LABEL: test409:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB409_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB409_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i16* %ptr, i16 %val singlethread seq_cst
+  ret i16 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i32 @test410(i32* %ptr, i32 %val) { ; 32-bit atomic and, singlethread monotonic - expects a bare lwarx/stwcx. retry loop with no sync or lwsync.
+; PPC64LE-LABEL: test410:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB410_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB410_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i32* %ptr, i32 %val singlethread monotonic
+  ret i32 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i32 @test411(i32* %ptr, i32 %val) { ; 32-bit atomic and, singlethread acquire - expects lwsync only after the lwarx/stwcx. retry loop.
+; PPC64LE-LABEL: test411:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB411_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    and 6, 4, 3
+; PPC64LE-NEXT:    stwcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB411_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i32* %ptr, i32 %val singlethread acquire
+  ret i32 %ret ; expected asm loads straight into register 3 after copying 3 to 5 (mr 5, 3)
+}
+
+define i32 @test412(i32* %ptr, i32 %val) { ; 32-bit atomic and, singlethread release - expects lwsync only before the lwarx/stwcx. retry loop.
+; PPC64LE-LABEL: test412:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB412_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB412_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i32* %ptr, i32 %val singlethread release
+  ret i32 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i32 @test413(i32* %ptr, i32 %val) { ; 32-bit atomic and, singlethread acq_rel - expects lwsync both before and after the lwarx/stwcx. retry loop.
+; PPC64LE-LABEL: test413:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB413_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB413_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i32* %ptr, i32 %val singlethread acq_rel
+  ret i32 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i32 @test414(i32* %ptr, i32 %val) { ; 32-bit atomic and, singlethread seq_cst - expects sync before and lwsync after the lwarx/stwcx. retry loop.
+; PPC64LE-LABEL: test414:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB414_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB414_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i32* %ptr, i32 %val singlethread seq_cst
+  ret i32 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i64 @test415(i64* %ptr, i64 %val) { ; 64-bit atomic and, singlethread monotonic - expects a bare ldarx/stdcx. retry loop with no sync or lwsync.
+; PPC64LE-LABEL: test415:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB415_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB415_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i64* %ptr, i64 %val singlethread monotonic
+  ret i64 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i64 @test416(i64* %ptr, i64 %val) { ; 64-bit atomic and, singlethread acquire - expects lwsync only after the ldarx/stdcx. retry loop.
+; PPC64LE-LABEL: test416:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB416_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    and 6, 4, 3
+; PPC64LE-NEXT:    stdcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB416_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i64* %ptr, i64 %val singlethread acquire
+  ret i64 %ret ; expected asm loads straight into register 3 after copying 3 to 5 (mr 5, 3)
+}
+
+define i64 @test417(i64* %ptr, i64 %val) { ; 64-bit atomic and, singlethread release - expects lwsync only before the ldarx/stdcx. retry loop.
+; PPC64LE-LABEL: test417:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB417_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB417_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i64* %ptr, i64 %val singlethread release
+  ret i64 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i64 @test418(i64* %ptr, i64 %val) { ; 64-bit atomic and, singlethread acq_rel - expects lwsync both before and after the ldarx/stdcx. retry loop.
+; PPC64LE-LABEL: test418:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB418_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB418_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i64* %ptr, i64 %val singlethread acq_rel
+  ret i64 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i64 @test419(i64* %ptr, i64 %val) { ; 64-bit atomic and, singlethread seq_cst - expects sync before and lwsync after the ldarx/stdcx. retry loop.
+; PPC64LE-LABEL: test419:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB419_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    and 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB419_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw and i64* %ptr, i64 %val singlethread seq_cst
+  ret i64 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i8 @test420(i8* %ptr, i8 %val) { ; 8-bit atomic nand, singlethread monotonic - expects a bare lbarx/stbcx. retry loop with no sync or lwsync.
+; PPC64LE-LABEL: test420:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB420_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB420_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i8* %ptr, i8 %val singlethread monotonic
+  ret i8 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i8 @test421(i8* %ptr, i8 %val) { ; 8-bit atomic nand, singlethread acquire - expects lwsync only after the lbarx/stbcx. retry loop.
+; PPC64LE-LABEL: test421:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB421_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    nand 6, 4, 3
+; PPC64LE-NEXT:    stbcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB421_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i8* %ptr, i8 %val singlethread acquire
+  ret i8 %ret ; expected asm loads straight into register 3 after copying 3 to 5 (mr 5, 3)
+}
+
+define i8 @test422(i8* %ptr, i8 %val) { ; 8-bit atomic nand, singlethread release - expects lwsync only before the lbarx/stbcx. retry loop.
+; PPC64LE-LABEL: test422:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB422_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB422_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i8* %ptr, i8 %val singlethread release
+  ret i8 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i8 @test423(i8* %ptr, i8 %val) { ; 8-bit atomic nand, singlethread acq_rel - expects lwsync both before and after the lbarx/stbcx. retry loop.
+; PPC64LE-LABEL: test423:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB423_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB423_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i8* %ptr, i8 %val singlethread acq_rel
+  ret i8 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i8 @test424(i8* %ptr, i8 %val) { ; 8-bit atomic nand, singlethread seq_cst - expects sync before and lwsync after the lbarx/stbcx. retry loop.
+; PPC64LE-LABEL: test424:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB424_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB424_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i8* %ptr, i8 %val singlethread seq_cst
+  ret i8 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i16 @test425(i16* %ptr, i16 %val) { ; 16-bit atomic nand, singlethread monotonic - expects a bare lharx/sthcx. retry loop with no sync or lwsync.
+; PPC64LE-LABEL: test425:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB425_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB425_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i16* %ptr, i16 %val singlethread monotonic
+  ret i16 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i16 @test426(i16* %ptr, i16 %val) { ; 16-bit atomic nand, singlethread acquire - expects lwsync only after the lharx/sthcx. retry loop.
+; PPC64LE-LABEL: test426:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB426_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    nand 6, 4, 3
+; PPC64LE-NEXT:    sthcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB426_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i16* %ptr, i16 %val singlethread acquire
+  ret i16 %ret ; expected asm loads straight into register 3 after copying 3 to 5 (mr 5, 3)
+}
+
+define i16 @test427(i16* %ptr, i16 %val) { ; 16-bit atomic nand, singlethread release - expects lwsync only before the lharx/sthcx. retry loop.
+; PPC64LE-LABEL: test427:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB427_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB427_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i16* %ptr, i16 %val singlethread release
+  ret i16 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i16 @test428(i16* %ptr, i16 %val) { ; 16-bit atomic nand, singlethread acq_rel - expects lwsync both before and after the lharx/sthcx. retry loop.
+; PPC64LE-LABEL: test428:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB428_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB428_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i16* %ptr, i16 %val singlethread acq_rel
+  ret i16 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i16 @test429(i16* %ptr, i16 %val) { ; 16-bit atomic nand, singlethread seq_cst - expects sync before and lwsync after the lharx/sthcx. retry loop.
+; PPC64LE-LABEL: test429:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB429_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB429_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i16* %ptr, i16 %val singlethread seq_cst
+  ret i16 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i32 @test430(i32* %ptr, i32 %val) { ; 32-bit atomic nand, singlethread monotonic - expects a bare lwarx/stwcx. retry loop with no sync or lwsync.
+; PPC64LE-LABEL: test430:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB430_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB430_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i32* %ptr, i32 %val singlethread monotonic
+  ret i32 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i32 @test431(i32* %ptr, i32 %val) { ; 32-bit atomic nand, singlethread acquire - expects lwsync only after the lwarx/stwcx. retry loop.
+; PPC64LE-LABEL: test431:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB431_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    nand 6, 4, 3
+; PPC64LE-NEXT:    stwcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB431_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i32* %ptr, i32 %val singlethread acquire
+  ret i32 %ret ; expected asm loads straight into register 3 after copying 3 to 5 (mr 5, 3)
+}
+
+define i32 @test432(i32* %ptr, i32 %val) { ; 32-bit atomic nand, singlethread release - expects lwsync only before the lwarx/stwcx. retry loop.
+; PPC64LE-LABEL: test432:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB432_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB432_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i32* %ptr, i32 %val singlethread release
+  ret i32 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i32 @test433(i32* %ptr, i32 %val) { ; 32-bit atomic nand, singlethread acq_rel - expects lwsync both before and after the lwarx/stwcx. retry loop.
+; PPC64LE-LABEL: test433:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB433_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB433_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i32* %ptr, i32 %val singlethread acq_rel
+  ret i32 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i32 @test434(i32* %ptr, i32 %val) { ; 32-bit atomic nand, singlethread seq_cst - expects sync before and lwsync after the lwarx/stwcx. retry loop.
+; PPC64LE-LABEL: test434:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB434_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB434_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i32* %ptr, i32 %val singlethread seq_cst
+  ret i32 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i64 @test435(i64* %ptr, i64 %val) { ; 64-bit atomic nand, singlethread monotonic - expects a bare ldarx/stdcx. retry loop with no sync or lwsync.
+; PPC64LE-LABEL: test435:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB435_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB435_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i64* %ptr, i64 %val singlethread monotonic
+  ret i64 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i64 @test436(i64* %ptr, i64 %val) { ; 64-bit atomic nand, singlethread acquire - expects lwsync only after the ldarx/stdcx. retry loop.
+; PPC64LE-LABEL: test436:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB436_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    nand 6, 4, 3
+; PPC64LE-NEXT:    stdcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB436_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i64* %ptr, i64 %val singlethread acquire
+  ret i64 %ret ; expected asm loads straight into register 3 after copying 3 to 5 (mr 5, 3)
+}
+
+define i64 @test437(i64* %ptr, i64 %val) { ; 64-bit atomic nand, singlethread release - expects lwsync only before the ldarx/stdcx. retry loop.
+; PPC64LE-LABEL: test437:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB437_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB437_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i64* %ptr, i64 %val singlethread release
+  ret i64 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i64 @test438(i64* %ptr, i64 %val) { ; 64-bit atomic nand, singlethread acq_rel - expects lwsync both before and after the ldarx/stdcx. retry loop.
+; PPC64LE-LABEL: test438:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB438_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB438_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i64* %ptr, i64 %val singlethread acq_rel
+  ret i64 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i64 @test439(i64* %ptr, i64 %val) { ; 64-bit atomic nand, singlethread seq_cst - expects sync before and lwsync after the ldarx/stdcx. retry loop.
+; PPC64LE-LABEL: test439:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB439_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    nand 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB439_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw nand i64* %ptr, i64 %val singlethread seq_cst
+  ret i64 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i8 @test440(i8* %ptr, i8 %val) { ; 8-bit atomic or, singlethread monotonic - expects a bare lbarx/stbcx. retry loop with no sync or lwsync.
+; PPC64LE-LABEL: test440:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB440_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB440_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i8* %ptr, i8 %val singlethread monotonic
+  ret i8 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i8 @test441(i8* %ptr, i8 %val) { ; 8-bit atomic or, singlethread acquire - expects lwsync only after the lbarx/stbcx. retry loop.
+; PPC64LE-LABEL: test441:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB441_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    or 6, 4, 3
+; PPC64LE-NEXT:    stbcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB441_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i8* %ptr, i8 %val singlethread acquire
+  ret i8 %ret ; expected asm loads straight into register 3 after copying 3 to 5 (mr 5, 3)
+}
+
+define i8 @test442(i8* %ptr, i8 %val) { ; 8-bit atomic or, singlethread release - expects lwsync only before the lbarx/stbcx. retry loop.
+; PPC64LE-LABEL: test442:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB442_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB442_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i8* %ptr, i8 %val singlethread release
+  ret i8 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i8 @test443(i8* %ptr, i8 %val) { ; 8-bit atomic or, singlethread acq_rel - expects lwsync both before and after the lbarx/stbcx. retry loop.
+; PPC64LE-LABEL: test443:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB443_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB443_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i8* %ptr, i8 %val singlethread acq_rel
+  ret i8 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i8 @test444(i8* %ptr, i8 %val) { ; 8-bit atomic or, singlethread seq_cst - expects sync before and lwsync after the lbarx/stbcx. retry loop.
+; PPC64LE-LABEL: test444:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB444_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB444_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i8* %ptr, i8 %val singlethread seq_cst
+  ret i8 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i16 @test445(i16* %ptr, i16 %val) { ; 16-bit atomic or, singlethread monotonic - expects a bare lharx/sthcx. retry loop with no sync or lwsync.
+; PPC64LE-LABEL: test445:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB445_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB445_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i16* %ptr, i16 %val singlethread monotonic
+  ret i16 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i16 @test446(i16* %ptr, i16 %val) { ; 16-bit atomic or, singlethread acquire - expects lwsync only after the lharx/sthcx. retry loop.
+; PPC64LE-LABEL: test446:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB446_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    or 6, 4, 3
+; PPC64LE-NEXT:    sthcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB446_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i16* %ptr, i16 %val singlethread acquire
+  ret i16 %ret ; expected asm loads straight into register 3 after copying 3 to 5 (mr 5, 3)
+}
+
+define i16 @test447(i16* %ptr, i16 %val) { ; 16-bit atomic or, singlethread release - expects lwsync only before the lharx/sthcx. retry loop.
+; PPC64LE-LABEL: test447:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB447_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB447_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i16* %ptr, i16 %val singlethread release
+  ret i16 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i16 @test448(i16* %ptr, i16 %val) { ; 16-bit atomic or, singlethread acq_rel - expects lwsync both before and after the lharx/sthcx. retry loop.
+; PPC64LE-LABEL: test448:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB448_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB448_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i16* %ptr, i16 %val singlethread acq_rel
+  ret i16 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i16 @test449(i16* %ptr, i16 %val) { ; 16-bit atomic or, singlethread seq_cst - expects sync before and lwsync after the lharx/sthcx. retry loop.
+; PPC64LE-LABEL: test449:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB449_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB449_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i16* %ptr, i16 %val singlethread seq_cst
+  ret i16 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i32 @test450(i32* %ptr, i32 %val) { ; 32-bit atomic or, singlethread monotonic - expects a bare lwarx/stwcx. retry loop with no sync or lwsync.
+; PPC64LE-LABEL: test450:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB450_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB450_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i32* %ptr, i32 %val singlethread monotonic
+  ret i32 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i32 @test451(i32* %ptr, i32 %val) { ; 32-bit atomic or, singlethread acquire - expects lwsync only after the lwarx/stwcx. retry loop.
+; PPC64LE-LABEL: test451:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB451_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    or 6, 4, 3
+; PPC64LE-NEXT:    stwcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB451_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i32* %ptr, i32 %val singlethread acquire
+  ret i32 %ret ; expected asm loads straight into register 3 after copying 3 to 5 (mr 5, 3)
+}
+
+define i32 @test452(i32* %ptr, i32 %val) { ; 32-bit atomic or, singlethread release - expects lwsync only before the lwarx/stwcx. retry loop.
+; PPC64LE-LABEL: test452:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB452_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB452_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i32* %ptr, i32 %val singlethread release
+  ret i32 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i32 @test453(i32* %ptr, i32 %val) { ; 32-bit atomic or, singlethread acq_rel - expects lwsync both before and after the lwarx/stwcx. retry loop.
+; PPC64LE-LABEL: test453:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB453_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB453_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i32* %ptr, i32 %val singlethread acq_rel
+  ret i32 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i32 @test454(i32* %ptr, i32 %val) { ; 32-bit atomic or, singlethread seq_cst - expects sync before and lwsync after the lwarx/stwcx. retry loop.
+; PPC64LE-LABEL: test454:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB454_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB454_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i32* %ptr, i32 %val singlethread seq_cst
+  ret i32 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i64 @test455(i64* %ptr, i64 %val) { ; 64-bit atomic or, singlethread monotonic - expects a bare ldarx/stdcx. retry loop with no sync or lwsync.
+; PPC64LE-LABEL: test455:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB455_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB455_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i64* %ptr, i64 %val singlethread monotonic
+  ret i64 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i64 @test456(i64* %ptr, i64 %val) { ; 64-bit atomic or, singlethread acquire - expects lwsync only after the ldarx/stdcx. retry loop.
+; PPC64LE-LABEL: test456:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB456_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    or 6, 4, 3
+; PPC64LE-NEXT:    stdcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB456_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i64* %ptr, i64 %val singlethread acquire
+  ret i64 %ret ; expected asm loads straight into register 3 after copying 3 to 5 (mr 5, 3)
+}
+
+define i64 @test457(i64* %ptr, i64 %val) { ; 64-bit atomic or, singlethread release - expects lwsync only before the ldarx/stdcx. retry loop.
+; PPC64LE-LABEL: test457:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB457_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB457_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i64* %ptr, i64 %val singlethread release
+  ret i64 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i64 @test458(i64* %ptr, i64 %val) { ; 64-bit atomic or, singlethread acq_rel - expects lwsync both before and after the ldarx/stdcx. retry loop.
+; PPC64LE-LABEL: test458:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB458_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB458_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i64* %ptr, i64 %val singlethread acq_rel
+  ret i64 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i64 @test459(i64* %ptr, i64 %val) { ; 64-bit atomic or, singlethread seq_cst - expects sync before and lwsync after the ldarx/stdcx. retry loop.
+; PPC64LE-LABEL: test459:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB459_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    or 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB459_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw or i64* %ptr, i64 %val singlethread seq_cst
+  ret i64 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i8 @test460(i8* %ptr, i8 %val) { ; 8-bit atomic xor, singlethread monotonic - expects a bare lbarx/stbcx. retry loop with no sync or lwsync.
+; PPC64LE-LABEL: test460:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB460_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB460_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i8* %ptr, i8 %val singlethread monotonic
+  ret i8 %ret ; expected asm copies the loaded value from register 5 to 3 (mr 3, 5)
+}
+
+define i8 @test461(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test461:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB461_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    xor 6, 4, 3
+; PPC64LE-NEXT:    stbcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB461_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i8* %ptr, i8 %val singlethread acquire
+  ret i8 %ret
+}
+
+define i8 @test462(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test462:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB462_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB462_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i8* %ptr, i8 %val singlethread release
+  ret i8 %ret
+}
+
+define i8 @test463(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test463:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB463_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB463_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i8* %ptr, i8 %val singlethread acq_rel
+  ret i8 %ret
+}
+
+define i8 @test464(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test464:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB464_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stbcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB464_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i8* %ptr, i8 %val singlethread seq_cst
+  ret i8 %ret
+}
+
+define i16 @test465(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test465:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB465_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB465_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i16* %ptr, i16 %val singlethread monotonic
+  ret i16 %ret
+}
+
+define i16 @test466(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test466:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB466_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    xor 6, 4, 3
+; PPC64LE-NEXT:    sthcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB466_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i16* %ptr, i16 %val singlethread acquire
+  ret i16 %ret
+}
+
+define i16 @test467(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test467:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB467_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB467_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i16* %ptr, i16 %val singlethread release
+  ret i16 %ret
+}
+
+define i16 @test468(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test468:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB468_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB468_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i16* %ptr, i16 %val singlethread acq_rel
+  ret i16 %ret
+}
+
+define i16 @test469(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test469:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB469_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    sthcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB469_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i16* %ptr, i16 %val singlethread seq_cst
+  ret i16 %ret
+}
+
+define i32 @test470(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test470:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB470_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB470_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i32* %ptr, i32 %val singlethread monotonic
+  ret i32 %ret
+}
+
+define i32 @test471(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test471:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB471_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    xor 6, 4, 3
+; PPC64LE-NEXT:    stwcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB471_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i32* %ptr, i32 %val singlethread acquire
+  ret i32 %ret
+}
+
+define i32 @test472(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test472:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB472_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB472_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i32* %ptr, i32 %val singlethread release
+  ret i32 %ret
+}
+
+define i32 @test473(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test473:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB473_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB473_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i32* %ptr, i32 %val singlethread acq_rel
+  ret i32 %ret
+}
+
+define i32 @test474(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test474:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB474_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stwcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB474_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i32* %ptr, i32 %val singlethread seq_cst
+  ret i32 %ret
+}
+
+define i64 @test475(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test475:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB475_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB475_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i64* %ptr, i64 %val singlethread monotonic
+  ret i64 %ret
+}
+
+define i64 @test476(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test476:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB476_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    xor 6, 4, 3
+; PPC64LE-NEXT:    stdcx. 6, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB476_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i64* %ptr, i64 %val singlethread acquire
+  ret i64 %ret
+}
+
+define i64 @test477(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test477:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB477_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB477_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i64* %ptr, i64 %val singlethread release
+  ret i64 %ret
+}
+
+define i64 @test478(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test478:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB478_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB478_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i64* %ptr, i64 %val singlethread acq_rel
+  ret i64 %ret
+}
+
+define i64 @test479(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test479:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB479_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    xor 6, 4, 5
+; PPC64LE-NEXT:    stdcx. 6, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB479_1
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw xor i64* %ptr, i64 %val singlethread seq_cst
+  ret i64 %ret
+}
+
+define i8 @test480(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test480:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB480_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    extsb 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB480_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB480_1
+; PPC64LE-NEXT:  .LBB480_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i8* %ptr, i8 %val singlethread monotonic
+  ret i8 %ret
+}
+
+define i8 @test481(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test481:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB481_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    extsb 6, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB481_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB481_1
+; PPC64LE-NEXT:  .LBB481_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i8* %ptr, i8 %val singlethread acquire
+  ret i8 %ret
+}
+
+define i8 @test482(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test482:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB482_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    extsb 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB482_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB482_1
+; PPC64LE-NEXT:  .LBB482_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i8* %ptr, i8 %val singlethread release
+  ret i8 %ret
+}
+
+define i8 @test483(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test483:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB483_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    extsb 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB483_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB483_1
+; PPC64LE-NEXT:  .LBB483_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i8* %ptr, i8 %val singlethread acq_rel
+  ret i8 %ret
+}
+
+define i8 @test484(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test484:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB484_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    extsb 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB484_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB484_1
+; PPC64LE-NEXT:  .LBB484_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i8* %ptr, i8 %val singlethread seq_cst
+  ret i8 %ret
+}
+
+define i16 @test485(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test485:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB485_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    extsh 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB485_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB485_1
+; PPC64LE-NEXT:  .LBB485_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i16* %ptr, i16 %val singlethread monotonic
+  ret i16 %ret
+}
+
+define i16 @test486(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test486:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB486_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    extsh 6, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB486_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB486_1
+; PPC64LE-NEXT:  .LBB486_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i16* %ptr, i16 %val singlethread acquire
+  ret i16 %ret
+}
+
+define i16 @test487(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test487:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB487_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    extsh 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB487_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB487_1
+; PPC64LE-NEXT:  .LBB487_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i16* %ptr, i16 %val singlethread release
+  ret i16 %ret
+}
+
+define i16 @test488(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test488:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB488_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    extsh 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB488_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB488_1
+; PPC64LE-NEXT:  .LBB488_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i16* %ptr, i16 %val singlethread acq_rel
+  ret i16 %ret
+}
+
+define i16 @test489(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test489:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB489_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    extsh 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    ble 0, .LBB489_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB489_1
+; PPC64LE-NEXT:  .LBB489_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i16* %ptr, i16 %val singlethread seq_cst
+  ret i16 %ret
+}
+
+define i32 @test490(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test490:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB490_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB490_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB490_1
+; PPC64LE-NEXT:  .LBB490_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i32* %ptr, i32 %val singlethread monotonic
+  ret i32 %ret
+}
+
+define i32 @test491(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test491:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB491_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    cmpw 4, 3
+; PPC64LE-NEXT:    ble 0, .LBB491_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB491_1
+; PPC64LE-NEXT:  .LBB491_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i32* %ptr, i32 %val singlethread acquire
+  ret i32 %ret
+}
+
+define i32 @test492(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test492:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB492_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB492_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB492_1
+; PPC64LE-NEXT:  .LBB492_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i32* %ptr, i32 %val singlethread release
+  ret i32 %ret
+}
+
+define i32 @test493(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test493:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB493_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB493_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB493_1
+; PPC64LE-NEXT:  .LBB493_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i32* %ptr, i32 %val singlethread acq_rel
+  ret i32 %ret
+}
+
+define i32 @test494(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test494:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB494_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB494_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB494_1
+; PPC64LE-NEXT:  .LBB494_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i32* %ptr, i32 %val singlethread seq_cst
+  ret i32 %ret
+}
+
+define i64 @test495(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test495:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB495_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB495_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB495_1
+; PPC64LE-NEXT:  .LBB495_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i64* %ptr, i64 %val singlethread monotonic
+  ret i64 %ret
+}
+
+define i64 @test496(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test496:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB496_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    cmpd 4, 3
+; PPC64LE-NEXT:    ble 0, .LBB496_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB496_1
+; PPC64LE-NEXT:  .LBB496_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i64* %ptr, i64 %val singlethread acquire
+  ret i64 %ret
+}
+
+define i64 @test497(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test497:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB497_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB497_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB497_1
+; PPC64LE-NEXT:  .LBB497_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i64* %ptr, i64 %val singlethread release
+  ret i64 %ret
+}
+
+define i64 @test498(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test498:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB498_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB498_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB498_1
+; PPC64LE-NEXT:  .LBB498_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i64* %ptr, i64 %val singlethread acq_rel
+  ret i64 %ret
+}
+
+define i64 @test499(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test499:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB499_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB499_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB499_1
+; PPC64LE-NEXT:  .LBB499_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw max i64* %ptr, i64 %val singlethread seq_cst
+  ret i64 %ret
+}
+
+define i8 @test500(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test500:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB500_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    extsb 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB500_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB500_1
+; PPC64LE-NEXT:  .LBB500_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i8* %ptr, i8 %val singlethread monotonic
+  ret i8 %ret
+}
+
+define i8 @test501(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test501:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB501_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    extsb 6, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB501_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB501_1
+; PPC64LE-NEXT:  .LBB501_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i8* %ptr, i8 %val singlethread acquire
+  ret i8 %ret
+}
+
+define i8 @test502(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test502:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB502_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    extsb 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB502_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB502_1
+; PPC64LE-NEXT:  .LBB502_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i8* %ptr, i8 %val singlethread release
+  ret i8 %ret
+}
+
+define i8 @test503(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test503:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB503_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    extsb 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB503_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB503_1
+; PPC64LE-NEXT:  .LBB503_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i8* %ptr, i8 %val singlethread acq_rel
+  ret i8 %ret
+}
+
+define i8 @test504(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test504:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB504_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    extsb 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB504_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB504_1
+; PPC64LE-NEXT:  .LBB504_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i8* %ptr, i8 %val singlethread seq_cst
+  ret i8 %ret
+}
+
+define i16 @test505(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test505:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB505_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    extsh 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB505_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB505_1
+; PPC64LE-NEXT:  .LBB505_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i16* %ptr, i16 %val singlethread monotonic
+  ret i16 %ret
+}
+
+define i16 @test506(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test506:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB506_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    extsh 6, 3
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB506_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB506_1
+; PPC64LE-NEXT:  .LBB506_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i16* %ptr, i16 %val singlethread acquire
+  ret i16 %ret
+}
+
+define i16 @test507(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test507:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB507_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    extsh 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB507_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB507_1
+; PPC64LE-NEXT:  .LBB507_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i16* %ptr, i16 %val singlethread release
+  ret i16 %ret
+}
+
+define i16 @test508(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test508:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB508_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    extsh 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB508_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB508_1
+; PPC64LE-NEXT:  .LBB508_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i16* %ptr, i16 %val singlethread acq_rel
+  ret i16 %ret
+}
+
+define i16 @test509(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test509:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB509_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    extsh 6, 5
+; PPC64LE-NEXT:    cmpw 4, 6
+; PPC64LE-NEXT:    bge 0, .LBB509_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB509_1
+; PPC64LE-NEXT:  .LBB509_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i16* %ptr, i16 %val singlethread seq_cst
+  ret i16 %ret
+}
+
+define i32 @test510(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test510:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB510_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB510_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB510_1
+; PPC64LE-NEXT:  .LBB510_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i32* %ptr, i32 %val singlethread monotonic
+  ret i32 %ret
+}
+
+define i32 @test511(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test511:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB511_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    cmpw 4, 3
+; PPC64LE-NEXT:    bge 0, .LBB511_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB511_1
+; PPC64LE-NEXT:  .LBB511_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i32* %ptr, i32 %val singlethread acquire
+  ret i32 %ret
+}
+
+define i32 @test512(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test512:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB512_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB512_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB512_1
+; PPC64LE-NEXT:  .LBB512_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i32* %ptr, i32 %val singlethread release
+  ret i32 %ret
+}
+
+define i32 @test513(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test513:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB513_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB513_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB513_1
+; PPC64LE-NEXT:  .LBB513_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i32* %ptr, i32 %val singlethread acq_rel
+  ret i32 %ret
+}
+
+define i32 @test514(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test514:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB514_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmpw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB514_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB514_1
+; PPC64LE-NEXT:  .LBB514_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i32* %ptr, i32 %val singlethread seq_cst
+  ret i32 %ret
+}
+
+define i64 @test515(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test515:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB515_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB515_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB515_1
+; PPC64LE-NEXT:  .LBB515_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i64* %ptr, i64 %val singlethread monotonic
+  ret i64 %ret
+}
+
+define i64 @test516(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test516:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB516_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    cmpd 4, 3
+; PPC64LE-NEXT:    bge 0, .LBB516_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB516_1
+; PPC64LE-NEXT:  .LBB516_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i64* %ptr, i64 %val singlethread acquire
+  ret i64 %ret
+}
+
+define i64 @test517(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test517:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB517_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB517_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB517_1
+; PPC64LE-NEXT:  .LBB517_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i64* %ptr, i64 %val singlethread release
+  ret i64 %ret
+}
+
+define i64 @test518(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test518:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB518_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB518_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB518_1
+; PPC64LE-NEXT:  .LBB518_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i64* %ptr, i64 %val singlethread acq_rel
+  ret i64 %ret
+}
+
+define i64 @test519(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test519:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB519_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpd 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB519_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB519_1
+; PPC64LE-NEXT:  .LBB519_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw min i64* %ptr, i64 %val singlethread seq_cst
+  ret i64 %ret
+}
+
+define i8 @test520(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test520:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB520_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB520_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB520_1
+; PPC64LE-NEXT:  .LBB520_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i8* %ptr, i8 %val singlethread monotonic
+  ret i8 %ret
+}
+
+define i8 @test521(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test521:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB521_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    cmplw 4, 3
+; PPC64LE-NEXT:    ble 0, .LBB521_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB521_1
+; PPC64LE-NEXT:  .LBB521_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i8* %ptr, i8 %val singlethread acquire
+  ret i8 %ret
+}
+
+define i8 @test522(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test522:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB522_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB522_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB522_1
+; PPC64LE-NEXT:  .LBB522_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i8* %ptr, i8 %val singlethread release
+  ret i8 %ret
+}
+
+define i8 @test523(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test523:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB523_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB523_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB523_1
+; PPC64LE-NEXT:  .LBB523_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i8* %ptr, i8 %val singlethread acq_rel
+  ret i8 %ret
+}
+
+define i8 @test524(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test524:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB524_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB524_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB524_1
+; PPC64LE-NEXT:  .LBB524_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i8* %ptr, i8 %val singlethread seq_cst
+  ret i8 %ret
+}
+
+define i16 @test525(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test525:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB525_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB525_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB525_1
+; PPC64LE-NEXT:  .LBB525_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i16* %ptr, i16 %val singlethread monotonic
+  ret i16 %ret
+}
+
+define i16 @test526(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test526:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB526_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    cmplw 4, 3
+; PPC64LE-NEXT:    ble 0, .LBB526_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB526_1
+; PPC64LE-NEXT:  .LBB526_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i16* %ptr, i16 %val singlethread acquire
+  ret i16 %ret
+}
+
+define i16 @test527(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test527:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB527_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB527_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB527_1
+; PPC64LE-NEXT:  .LBB527_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i16* %ptr, i16 %val singlethread release
+  ret i16 %ret
+}
+
+define i16 @test528(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test528:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB528_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB528_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB528_1
+; PPC64LE-NEXT:  .LBB528_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i16* %ptr, i16 %val singlethread acq_rel
+  ret i16 %ret
+}
+
+define i16 @test529(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test529:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB529_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB529_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB529_1
+; PPC64LE-NEXT:  .LBB529_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i16* %ptr, i16 %val singlethread seq_cst
+  ret i16 %ret
+}
+
+define i32 @test530(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test530:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB530_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB530_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB530_1
+; PPC64LE-NEXT:  .LBB530_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i32* %ptr, i32 %val singlethread monotonic
+  ret i32 %ret
+}
+
+define i32 @test531(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test531:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB531_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    cmplw 4, 3
+; PPC64LE-NEXT:    ble 0, .LBB531_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB531_1
+; PPC64LE-NEXT:  .LBB531_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i32* %ptr, i32 %val singlethread acquire
+  ret i32 %ret
+}
+
+define i32 @test532(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test532:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB532_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB532_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB532_1
+; PPC64LE-NEXT:  .LBB532_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i32* %ptr, i32 %val singlethread release
+  ret i32 %ret
+}
+
+define i32 @test533(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test533:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB533_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB533_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB533_1
+; PPC64LE-NEXT:  .LBB533_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i32* %ptr, i32 %val singlethread acq_rel
+  ret i32 %ret
+}
+
+define i32 @test534(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test534:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB534_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB534_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB534_1
+; PPC64LE-NEXT:  .LBB534_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i32* %ptr, i32 %val singlethread seq_cst
+  ret i32 %ret
+}
+
+define i64 @test535(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test535:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB535_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpld 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB535_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB535_1
+; PPC64LE-NEXT:  .LBB535_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i64* %ptr, i64 %val singlethread monotonic
+  ret i64 %ret
+}
+
+define i64 @test536(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test536:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB536_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    cmpld 4, 3
+; PPC64LE-NEXT:    ble 0, .LBB536_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB536_1
+; PPC64LE-NEXT:  .LBB536_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i64* %ptr, i64 %val singlethread acquire
+  ret i64 %ret
+}
+
+define i64 @test537(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test537:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB537_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpld 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB537_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB537_1
+; PPC64LE-NEXT:  .LBB537_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i64* %ptr, i64 %val singlethread release
+  ret i64 %ret
+}
+
+define i64 @test538(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test538:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB538_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpld 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB538_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB538_1
+; PPC64LE-NEXT:  .LBB538_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i64* %ptr, i64 %val singlethread acq_rel
+  ret i64 %ret
+}
+
+define i64 @test539(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test539:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB539_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpld 4, 5
+; PPC64LE-NEXT:    ble 0, .LBB539_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB539_1
+; PPC64LE-NEXT:  .LBB539_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umax i64* %ptr, i64 %val singlethread seq_cst
+  ret i64 %ret
+}
+
+define i8 @test540(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test540:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB540_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB540_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB540_1
+; PPC64LE-NEXT:  .LBB540_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i8* %ptr, i8 %val singlethread monotonic
+  ret i8 %ret
+}
+
+define i8 @test541(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test541:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB541_1:
+; PPC64LE-NEXT:    lbarx 3, 0, 5
+; PPC64LE-NEXT:    cmplw 4, 3
+; PPC64LE-NEXT:    bge 0, .LBB541_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB541_1
+; PPC64LE-NEXT:  .LBB541_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i8* %ptr, i8 %val singlethread acquire
+  ret i8 %ret
+}
+
+define i8 @test542(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test542:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB542_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB542_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB542_1
+; PPC64LE-NEXT:  .LBB542_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i8* %ptr, i8 %val singlethread release
+  ret i8 %ret
+}
+
+define i8 @test543(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test543:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB543_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB543_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB543_1
+; PPC64LE-NEXT:  .LBB543_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i8* %ptr, i8 %val singlethread acq_rel
+  ret i8 %ret
+}
+
+define i8 @test544(i8* %ptr, i8 %val) {
+; PPC64LE-LABEL: test544:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB544_1:
+; PPC64LE-NEXT:    lbarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB544_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stbcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB544_1
+; PPC64LE-NEXT:  .LBB544_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i8* %ptr, i8 %val singlethread seq_cst
+  ret i8 %ret
+}
+
+define i16 @test545(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test545:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB545_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB545_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB545_1
+; PPC64LE-NEXT:  .LBB545_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i16* %ptr, i16 %val singlethread monotonic
+  ret i16 %ret
+}
+
+define i16 @test546(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test546:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB546_1:
+; PPC64LE-NEXT:    lharx 3, 0, 5
+; PPC64LE-NEXT:    cmplw 4, 3
+; PPC64LE-NEXT:    bge 0, .LBB546_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB546_1
+; PPC64LE-NEXT:  .LBB546_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i16* %ptr, i16 %val singlethread acquire
+  ret i16 %ret
+}
+
+define i16 @test547(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test547:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB547_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB547_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB547_1
+; PPC64LE-NEXT:  .LBB547_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i16* %ptr, i16 %val singlethread release
+  ret i16 %ret
+}
+
+define i16 @test548(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test548:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB548_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB548_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB548_1
+; PPC64LE-NEXT:  .LBB548_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i16* %ptr, i16 %val singlethread acq_rel
+  ret i16 %ret
+}
+
+define i16 @test549(i16* %ptr, i16 %val) {
+; PPC64LE-LABEL: test549:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB549_1:
+; PPC64LE-NEXT:    lharx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB549_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    sthcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB549_1
+; PPC64LE-NEXT:  .LBB549_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i16* %ptr, i16 %val singlethread seq_cst
+  ret i16 %ret
+}
+
+define i32 @test550(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test550:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB550_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB550_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB550_1
+; PPC64LE-NEXT:  .LBB550_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i32* %ptr, i32 %val singlethread monotonic
+  ret i32 %ret
+}
+
+define i32 @test551(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test551:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB551_1:
+; PPC64LE-NEXT:    lwarx 3, 0, 5
+; PPC64LE-NEXT:    cmplw 4, 3
+; PPC64LE-NEXT:    bge 0, .LBB551_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB551_1
+; PPC64LE-NEXT:  .LBB551_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i32* %ptr, i32 %val singlethread acquire
+  ret i32 %ret
+}
+
+define i32 @test552(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test552:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB552_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB552_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB552_1
+; PPC64LE-NEXT:  .LBB552_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i32* %ptr, i32 %val singlethread release
+  ret i32 %ret
+}
+
+define i32 @test553(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test553:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB553_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB553_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB553_1
+; PPC64LE-NEXT:  .LBB553_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i32* %ptr, i32 %val singlethread acq_rel
+  ret i32 %ret
+}
+
+define i32 @test554(i32* %ptr, i32 %val) {
+; PPC64LE-LABEL: test554:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB554_1:
+; PPC64LE-NEXT:    lwarx 5, 0, 3
+; PPC64LE-NEXT:    cmplw 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB554_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stwcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB554_1
+; PPC64LE-NEXT:  .LBB554_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i32* %ptr, i32 %val singlethread seq_cst
+  ret i32 %ret
+}
+
+define i64 @test555(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test555:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:  .LBB555_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpld 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB555_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB555_1
+; PPC64LE-NEXT:  .LBB555_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i64* %ptr, i64 %val singlethread monotonic
+  ret i64 %ret
+}
+
+define i64 @test556(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test556:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    mr 5, 3
+; PPC64LE-NEXT:  .LBB556_1:
+; PPC64LE-NEXT:    ldarx 3, 0, 5
+; PPC64LE-NEXT:    cmpld 4, 3
+; PPC64LE-NEXT:    bge 0, .LBB556_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 5
+; PPC64LE-NEXT:    bne 0, .LBB556_1
+; PPC64LE-NEXT:  .LBB556_3:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i64* %ptr, i64 %val singlethread acquire
+  ret i64 %ret
+}
+
+define i64 @test557(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test557:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB557_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpld 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB557_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB557_1
+; PPC64LE-NEXT:  .LBB557_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i64* %ptr, i64 %val singlethread release
+  ret i64 %ret
+}
+
+define i64 @test558(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test558:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:  .LBB558_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpld 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB558_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB558_1
+; PPC64LE-NEXT:  .LBB558_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i64* %ptr, i64 %val singlethread acq_rel
+  ret i64 %ret
+}
+
+define i64 @test559(i64* %ptr, i64 %val) {
+; PPC64LE-LABEL: test559:
+; PPC64LE:       # BB#0:
+; PPC64LE-NEXT:    sync
+; PPC64LE-NEXT:  .LBB559_1:
+; PPC64LE-NEXT:    ldarx 5, 0, 3
+; PPC64LE-NEXT:    cmpld 4, 5
+; PPC64LE-NEXT:    bge 0, .LBB559_3
+; PPC64LE-NEXT:  # BB#2:
+; PPC64LE-NEXT:    stdcx. 4, 0, 3
+; PPC64LE-NEXT:    bne 0, .LBB559_1
+; PPC64LE-NEXT:  .LBB559_3:
+; PPC64LE-NEXT:    mr 3, 5
+; PPC64LE-NEXT:    lwsync
+; PPC64LE-NEXT:    blr
+  %ret = atomicrmw umin i64* %ptr, i64 %val singlethread seq_cst
+  ret i64 %ret
+}
diff --git a/test/CodeGen/PowerPC/bitcasts-direct-move.ll b/test/CodeGen/PowerPC/bitcasts-direct-move.ll
index 79da5cb68740ad0e6582131c5a9b821818c063b0..d6c7dd3804ff4288e8c951f0fb2f1a225da888ce 100644
--- a/test/CodeGen/PowerPC/bitcasts-direct-move.ll
+++ b/test/CodeGen/PowerPC/bitcasts-direct-move.ll
@@ -20,7 +20,7 @@ entry:
   ret i64 %0
 ; CHECK-P7: stxsdx 1,
 ; CHECK-P7: ld 3,
-; CHECK: mfvsrd 3, 1
+; CHECK: mffprd 3, 1
 }
 
 define float @i32tof32(i32 signext %a) {
@@ -60,7 +60,7 @@ entry:
   ret i64 %0
 ; CHECK-P7: stxsdx 1,
 ; CHECK-P7: ld 3,
-; CHECK: mfvsrd 3, 1
+; CHECK: mffprd 3, 1
 }
 
 define float @i32utof32(i32 zeroext %a) {
diff --git a/test/CodeGen/PowerPC/branch_coalesce.ll b/test/CodeGen/PowerPC/branch_coalesce.ll
new file mode 100644
index 0000000000000000000000000000000000000000..deb6d898c2e0bed543f0ecf897c2eab5a68c93d4
--- /dev/null
+++ b/test/CodeGen/PowerPC/branch_coalesce.ll
@@ -0,0 +1,31 @@
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu -verify-machineinstrs -enable-branch-coalesce=true < %s | FileCheck %s 
+; RUN: llc -mcpu=pwr8 -mtriple=powerpc64-unknown-linux-gnu -verify-machineinstrs -enable-branch-coalesce=true < %s | FileCheck %s 
+
+; Function Attrs: nounwind
+define double @testBranchCoal(double %a, double %b, double %c, i32 %x) {
+entry:
+  %test = icmp eq i32 %x, 0
+  %tmp1 = select i1 %test, double %a, double 2.000000e-03
+  %tmp2 = select i1 %test, double %b, double 0.000000e+00
+  %tmp3 = select i1 %test, double %c, double 5.000000e-03
+
+  %res1 = fadd double %tmp1, %tmp2
+  %result = fadd double %res1, %tmp3
+  ret double %result
+
+; CHECK-LABEL: @testBranchCoal 
+; CHECK: cmplwi [[CMPR:[0-7]+]], 6, 0
+; CHECK: beq [[CMPR]], .LBB[[LAB1:[0-9_]+]]
+; CHECK-DAG: addis [[LD1REG:[0-9]+]], 2, .LCPI0_0@toc@ha
+; CHECK-DAG: addis [[LD2REG:[0-9]+]], 2, .LCPI0_1@toc@ha
+; CHECK-DAG: xxlxor 2, 2, 2
+; CHECK-NOT: beq 
+; CHECK-DAG: addi [[LD1BASE:[0-9]+]], [[LD1REG]] 
+; CHECK-DAG: addi [[LD2BASE:[0-9]+]], [[LD2REG]]
+; CHECK-DAG: lxsdx 1, 0, [[LD1BASE]]
+; CHECK-DAG: lxsdx 3, 0, [[LD2BASE]]
+; CHECK: .LBB[[LAB1]]
+; CHECK: xsadddp 0, 1, 2
+; CHECK: xsadddp 1, 0, 3
+; CHECK: blr
+}
diff --git a/test/CodeGen/PowerPC/complex-return.ll b/test/CodeGen/PowerPC/complex-return.ll
index f6097e6551285bae0cb3f242905dfccf1e68cfe4..ec87a89b110808c2cbfe5caaf099f5738411b685 100644
--- a/test/CodeGen/PowerPC/complex-return.ll
+++ b/test/CodeGen/PowerPC/complex-return.ll
@@ -24,10 +24,10 @@ entry:
 }
 
 ; CHECK-LABEL: foo:
-; CHECK: lfd 1
-; CHECK: lfd 2
-; CHECK: lfd 3
-; CHECK: lfd 4
+; CHECK-DAG: lfd 1
+; CHECK-DAG: lfd 2
+; CHECK-DAG: lfd 3
+; CHECK-DAG: lfd 4
 
 define { float, float } @oof() nounwind {
 entry:
@@ -50,6 +50,6 @@ entry:
 }
 
 ; CHECK-LABEL: oof:
-; CHECK: lfs 2
-; CHECK: lfs 1
+; CHECK-DAG: lfs 2
+; CHECK-DAG: lfs 1
 
diff --git a/test/CodeGen/PowerPC/crbits.ll b/test/CodeGen/PowerPC/crbits.ll
index b894a361d261d49fd39f0eeb59303fc8c76705bd..a85237195c5ec37672dcd90b7cddf8a282893002 100644
--- a/test/CodeGen/PowerPC/crbits.ll
+++ b/test/CodeGen/PowerPC/crbits.ll
@@ -142,7 +142,7 @@ entry:
   ret i32 %cond
 
 ; CHECK-LABEL: @exttest7
-; CHECK-DAG: cmplwi {{[0-9]+}}, 3, 5
+; CHECK-DAG: cmpwi {{[0-9]+}}, 3, 5
 ; CHECK-DAG: li [[REG1:[0-9]+]], 8
 ; CHECK-DAG: li [[REG2:[0-9]+]], 7
 ; CHECK: isel 3, [[REG2]], [[REG1]],
diff --git a/test/CodeGen/PowerPC/ctrloop-i128.ll b/test/CodeGen/PowerPC/ctrloop-i128.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8c1e0c160d30345363f4dc377704b0cf4b98aa7c
--- /dev/null
+++ b/test/CodeGen/PowerPC/ctrloop-i128.ll
@@ -0,0 +1,34 @@
+; RUN: llc -O1 -verify-machineinstrs < %s
+target datalayout = "E-m:e-i64:64-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+; Function Attrs: uwtable
+define fastcc void @_Crash_Fn() unnamed_addr #0 {
+entry-block:
+  br label %_Label_0
+
+_Label_0:                                         ; preds = %_Label_0, %entry-block
+  %result.0138 = phi i128 [ %5, %_Label_0 ], [ 0, %entry-block ]
+  %iter.sroa.0.0137 = phi i8* [ %0, %_Label_0 ], [ undef, %entry-block ]
+  %0 = getelementptr inbounds i8, i8* %iter.sroa.0.0137, i64 1
+  %1 = tail call { i128, i1 } @llvm.smul.with.overflow.i128(i128 %result.0138, i128 undef) #2
+  %2 = extractvalue { i128, i1 } %1, 0
+  %3 = tail call { i128, i1 } @llvm.sadd.with.overflow.i128(i128 %2, i128 0) #2
+  %4 = extractvalue { i128, i1 } %3, 1
+  %5 = extractvalue { i128, i1 } %3, 0
+  %6 = icmp eq i8* %0, null
+  br i1 %6, label %bb66.loopexit, label %_Label_0
+
+bb66.loopexit:                                    ; preds = %_Label_0
+  unreachable
+}
+
+; Function Attrs: nounwind readnone
+declare { i128, i1 } @llvm.sadd.with.overflow.i128(i128, i128) #1
+
+; Function Attrs: nounwind readnone
+declare { i128, i1 } @llvm.smul.with.overflow.i128(i128, i128) #1
+
+attributes #0 = { uwtable }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
diff --git a/test/CodeGen/PowerPC/ctrloop-intrin.ll b/test/CodeGen/PowerPC/ctrloop-intrin.ll
index 3a6e8855971ba49c5ad158bce86ff4b9523618fb..6ae5d3368c1a92334651dd720d3c61fcb8f63277 100644
--- a/test/CodeGen/PowerPC/ctrloop-intrin.ll
+++ b/test/CodeGen/PowerPC/ctrloop-intrin.ll
@@ -17,10 +17,10 @@ target triple = "powerpc64le--linux-gnu"
 @.str.11.98 = external hidden unnamed_addr constant [3 x i8], align 1
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
 
 ; Function Attrs: nounwind
 declare i8* @halide_string_to_string(i8*, i8*, i8*) #1
@@ -36,7 +36,7 @@ entry:
   %buf = alloca [512 x i8], align 1
   store double %arg, double* %arg.addr, align 8, !tbaa !4
   %0 = bitcast i64* %bits to i8*
-  call void @llvm.lifetime.start(i64 8, i8* %0) #0
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* %0) #0
   store i64 0, i64* %bits, align 8, !tbaa !8
   %1 = bitcast double* %arg.addr to i8*
   %call = call i8* @memcpy(i8* %0, i8* %1, i64 8) #2
@@ -245,7 +245,7 @@ if.end.105:                                       ; preds = %if.end.84, %if.end.
   %integer_exponent.0 = phi i32 [ 0, %if.end.84 ], [ %sub70, %if.end.66 ]
   %fractional_part.2 = phi i64 [ %.fractional_part.0, %if.end.84 ], [ 0, %if.end.66 ]
   %7 = bitcast [512 x i8]* %buf to i8*
-  call void @llvm.lifetime.start(i64 512, i8* %7) #0
+  call void @llvm.lifetime.start.p0i8(i64 512, i8* %7) #0
   %add.ptr = getelementptr inbounds [512 x i8], [512 x i8]* %buf, i64 0, i64 512
   %add.ptr106 = getelementptr inbounds [512 x i8], [512 x i8]* %buf, i64 0, i64 480
   %call109 = call i8* @halide_int64_to_string(i8* %add.ptr106, i8* %add.ptr, i64 %integer_part.2, i32 1) #3
@@ -272,7 +272,7 @@ for.cond.cleanup:                                 ; preds = %if.end.138, %if.end
   %call142 = call i8* @halide_string_to_string(i8* %dst.addr.0, i8* %end, i8* %int_part_ptr.0.lcssa) #3
   %call143 = call i8* @halide_string_to_string(i8* %call142, i8* %end, i8* getelementptr inbounds ([2 x i8], [2 x i8]* @.str.9.96, i64 0, i64 0)) #3
   %call144 = call i8* @halide_int64_to_string(i8* %call143, i8* %end, i64 %fractional_part.2, i32 6) #3
-  call void @llvm.lifetime.end(i64 512, i8* %9) #0
+  call void @llvm.lifetime.end.p0i8(i64 512, i8* %9) #0
   br label %cleanup.148
 
 for.cond.cleanup.115:                             ; preds = %for.body.116
@@ -315,7 +315,7 @@ if.end.138:                                       ; preds = %if.then.136, %for.c
 cleanup.148:                                      ; preds = %for.cond.cleanup, %if.then.64, %if.end.59, %if.else.30, %if.then.28, %if.else.24, %if.then.22, %if.else.13, %if.then.11, %if.else, %if.then.6
   %retval.1 = phi i8* [ %call7, %if.then.6 ], [ %call8, %if.else ], [ %call12, %if.then.11 ], [ %call14, %if.else.13 ], [ %call23, %if.then.22 ], [ %call25, %if.else.24 ], [ %call29, %if.then.28 ], [ %call31, %if.else.30 ], [ %call65, %if.then.64 ], [ %call61, %if.end.59 ], [ %call144, %for.cond.cleanup ]
   %13 = bitcast i64* %bits to i8*
-  call void @llvm.lifetime.end(i64 8, i8* %13) #0
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* %13) #0
   ret i8* %retval.1
 }
 
diff --git a/test/CodeGen/PowerPC/fma-aggr-FMF.ll b/test/CodeGen/PowerPC/fma-aggr-FMF.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8e97115bd1f2b85134cd8837a5b9fe165c6a4b66
--- /dev/null
+++ b/test/CodeGen/PowerPC/fma-aggr-FMF.ll
@@ -0,0 +1,35 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64le-linux-gnu | FileCheck %s
+
+define float @can_fma_with_fewer_uses(float %f1, float %f2, float %f3, float %f4) {
+; CHECK-LABEL: can_fma_with_fewer_uses:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xsmulsp 0, 1, 2
+; CHECK-NEXT:    fmr 1, 0
+; CHECK-NEXT:    xsmaddasp 1, 3, 4
+; CHECK-NEXT:    xsdivsp 1, 0, 1
+; CHECK-NEXT:    blr
+  %mul1 = fmul contract float %f1, %f2
+  %mul2 = fmul contract float %f3, %f4
+  %add = fadd contract float %mul1, %mul2
+  %second_use_of_mul1 = fdiv float %mul1, %add
+  ret float %second_use_of_mul1
+}
+
+; There is no contract on the mul with no extra use so we can't fuse that.
+; Since we are fusing with the mul with an extra use, the fmul needs to stick
+; around beside the fma.
+define float @no_fma_with_fewer_uses(float %f1, float %f2, float %f3, float %f4) {
+; CHECK-LABEL: no_fma_with_fewer_uses:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xsmulsp 0, 3, 4
+; CHECK-NEXT:    xsmulsp 13, 1, 2
+; CHECK-NEXT:    xsmaddasp 0, 1, 2
+; CHECK-NEXT:    xsdivsp 1, 13, 0
+; CHECK-NEXT:    blr
+  %mul1 = fmul contract float %f1, %f2
+  %mul2 = fmul float %f3, %f4
+  %add = fadd contract float %mul1, %mul2
+  %second_use_of_mul1 = fdiv float %mul1, %add
+  ret float %second_use_of_mul1
+}
diff --git a/test/CodeGen/PowerPC/fma-assoc.ll b/test/CodeGen/PowerPC/fma-assoc.ll
index e44bfc65242e686751c8ab013c03dbfd4534abed..5080e5b250e92aea9ec3e06530cb7febc59f706f 100644
--- a/test/CodeGen/PowerPC/fma-assoc.ll
+++ b/test/CodeGen/PowerPC/fma-assoc.ll
@@ -1,5 +1,7 @@
-; RUN: llc -verify-machineinstrs < %s -march=ppc32 -fp-contract=fast -mattr=-vsx -disable-ppc-vsx-fma-mutation=false | FileCheck %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mattr=+vsx -mcpu=pwr7 -disable-ppc-vsx-fma-mutation=false | FileCheck -check-prefix=CHECK-VSX %s
+; RUN: llc -verify-machineinstrs < %s -march=ppc32 -fp-contract=fast -mattr=-vsx -disable-ppc-vsx-fma-mutation=false | FileCheck -check-prefix=CHECK -check-prefix=CHECK-SAFE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -mattr=+vsx -mcpu=pwr7 -disable-ppc-vsx-fma-mutation=false | FileCheck -check-prefix=CHECK-VSX -check-prefix=CHECK-VSX-SAFE %s
+; RUN: llc -verify-machineinstrs < %s -march=ppc32 -fp-contract=fast -enable-unsafe-fp-math -mattr=-vsx -disable-ppc-vsx-fma-mutation=false | FileCheck -check-prefix=CHECK -check-prefix=CHECK-UNSAFE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=powerpc64-unknown-linux-gnu -fp-contract=fast -enable-unsafe-fp-math -mattr=+vsx -mcpu=pwr7 -disable-ppc-vsx-fma-mutation=false | FileCheck -check-prefix=CHECK-VSX -check-prefix=CHECK-UNSAFE-VSX %s
 
 define double @test_FMADD_ASSOC1(double %A, double %B, double %C,
                                  double %D, double %E) {
@@ -8,16 +10,28 @@ define double @test_FMADD_ASSOC1(double %A, double %B, double %C,
   %H = fadd double %F, %G         ; <double> [#uses=1]
   %I = fadd double %H, %E         ; <double> [#uses=1]
   ret double %I
-; CHECK-LABEL: test_FMADD_ASSOC1:
-; CHECK: fmadd
-; CHECK-NEXT: fmadd
-; CHECK-NEXT: blr
+; CHECK-SAFE-LABEL: test_FMADD_ASSOC1:
+; CHECK-SAFE: fmul
+; CHECK-SAFE-NEXT: fmadd
+; CHECK-SAFE-NEXT: fadd
+; CHECK-SAFE-NEXT: blr
 
-; CHECK-VSX-LABEL: test_FMADD_ASSOC1:
-; CHECK-VSX: xsmaddmdp
-; CHECK-VSX-NEXT: xsmaddadp
-; CHECK-VSX-NEXT: fmr
-; CHECK-VSX-NEXT: blr
+; CHECK-UNSAFE-LABEL: test_FMADD_ASSOC1:
+; CHECK-UNSAFE: fmadd
+; CHECK-UNSAFE-NEXT: fmadd
+; CHECK-UNSAFE-NEXT: blr
+
+; CHECK-VSX-SAFE-LABEL: test_FMADD_ASSOC1:
+; CHECK-VSX-SAFE: xsmuldp
+; CHECK-VSX-SAFE-NEXT: xsmaddadp
+; CHECK-VSX-SAFE-NEXT: xsadddp
+; CHECK-VSX-SAFE-NEXT: blr
+
+; CHECK-UNSAFE-VSX-LABEL: test_FMADD_ASSOC1:
+; CHECK-UNSAFE-VSX: xsmaddmdp
+; CHECK-UNSAFE-VSX-NEXT: xsmaddadp
+; CHECK-UNSAFE-VSX-NEXT: fmr
+; CHECK-UNSAFE-VSX-NEXT: blr
 }
 
 define double @test_FMADD_ASSOC2(double %A, double %B, double %C,
@@ -27,16 +41,28 @@ define double @test_FMADD_ASSOC2(double %A, double %B, double %C,
   %H = fadd double %F, %G         ; <double> [#uses=1]
   %I = fadd double %E, %H         ; <double> [#uses=1]
   ret double %I
-; CHECK-LABEL: test_FMADD_ASSOC2:
-; CHECK: fmadd
-; CHECK-NEXT: fmadd
-; CHECK-NEXT: blr
+; CHECK-SAFE-LABEL: test_FMADD_ASSOC2:
+; CHECK-SAFE: fmul
+; CHECK-SAFE-NEXT: fmadd
+; CHECK-SAFE-NEXT: fadd
+; CHECK-SAFE-NEXT: blr
 
-; CHECK-VSX-LABEL: test_FMADD_ASSOC2:
-; CHECK-VSX: xsmaddmdp
-; CHECK-VSX-NEXT: xsmaddadp
-; CHECK-VSX-NEXT: fmr
-; CHECK-VSX-NEXT: blr
+; CHECK-UNSAFE-LABEL: test_FMADD_ASSOC2:
+; CHECK-UNSAFE: fmadd
+; CHECK-UNSAFE-NEXT: fmadd
+; CHECK-UNSAFE-NEXT: blr
+
+; CHECK-VSX-SAFE-LABEL: test_FMADD_ASSOC2:
+; CHECK-VSX-SAFE: xsmuldp
+; CHECK-VSX-SAFE-NEXT: xsmaddadp
+; CHECK-VSX-SAFE-NEXT: xsadddp
+; CHECK-VSX-SAFE-NEXT: blr
+
+; CHECK-UNSAFE-VSX-LABEL: test_FMADD_ASSOC2:
+; CHECK-UNSAFE-VSX: xsmaddmdp
+; CHECK-UNSAFE-VSX-NEXT: xsmaddadp
+; CHECK-UNSAFE-VSX-NEXT: fmr
+; CHECK-UNSAFE-VSX-NEXT: blr
 }
 
 define double @test_FMSUB_ASSOC1(double %A, double %B, double %C,
@@ -46,16 +72,28 @@ define double @test_FMSUB_ASSOC1(double %A, double %B, double %C,
   %H = fadd double %F, %G         ; <double> [#uses=1]
   %I = fsub double %H, %E         ; <double> [#uses=1]
   ret double %I
-; CHECK-LABEL: test_FMSUB_ASSOC1:
-; CHECK: fmsub
-; CHECK-NEXT: fmadd
-; CHECK-NEXT: blr
+; CHECK-SAFE-LABEL: test_FMSUB_ASSOC1:
+; CHECK-SAFE: fmul
+; CHECK-SAFE-NEXT: fmadd
+; CHECK-SAFE-NEXT: fsub
+; CHECK-SAFE-NEXT: blr
 
-; CHECK-VSX-LABEL: test_FMSUB_ASSOC1:
-; CHECK-VSX: xsmsubmdp
-; CHECK-VSX-NEXT: xsmaddadp
-; CHECK-VSX-NEXT: fmr
-; CHECK-VSX-NEXT: blr
+; CHECK-UNSAFE-LABEL: test_FMSUB_ASSOC1:
+; CHECK-UNSAFE: fmsub
+; CHECK-UNSAFE-NEXT: fmadd
+; CHECK-UNSAFE-NEXT: blr
+
+; CHECK-VSX-SAFE-LABEL: test_FMSUB_ASSOC1:
+; CHECK-VSX-SAFE: xsmuldp
+; CHECK-VSX-SAFE-NEXT: xsmaddadp
+; CHECK-VSX-SAFE-NEXT: xssubdp
+; CHECK-VSX-SAFE-NEXT: blr
+
+; CHECK-UNSAFE-VSX-LABEL: test_FMSUB_ASSOC1:
+; CHECK-UNSAFE-VSX: xsmsubmdp
+; CHECK-UNSAFE-VSX-NEXT: xsmaddadp
+; CHECK-UNSAFE-VSX-NEXT: fmr
+; CHECK-UNSAFE-VSX-NEXT: blr
 }
 
 define double @test_FMSUB_ASSOC2(double %A, double %B, double %C,
@@ -65,16 +103,28 @@ define double @test_FMSUB_ASSOC2(double %A, double %B, double %C,
   %H = fadd double %F, %G         ; <double> [#uses=1]
   %I = fsub double %E, %H         ; <double> [#uses=1]
   ret double %I
-; CHECK-LABEL: test_FMSUB_ASSOC2:
-; CHECK: fnmsub
-; CHECK-NEXT: fnmsub
-; CHECK-NEXT: blr
+; CHECK-SAFE-LABEL: test_FMSUB_ASSOC2:
+; CHECK-SAFE: fmul
+; CHECK-SAFE-NEXT: fmadd
+; CHECK-SAFE-NEXT: fsub
+; CHECK-SAFE-NEXT: blr
 
-; CHECK-VSX-LABEL: test_FMSUB_ASSOC2:
-; CHECK-VSX: xsnmsubmdp
-; CHECK-VSX-NEXT: xsnmsubadp
-; CHECK-VSX-NEXT: fmr
-; CHECK-VSX-NEXT: blr
+; CHECK-UNSAFE-LABEL: test_FMSUB_ASSOC2:
+; CHECK-UNSAFE: fnmsub
+; CHECK-UNSAFE-NEXT: fnmsub
+; CHECK-UNSAFE-NEXT: blr
+
+; CHECK-VSX-SAFE-LABEL: test_FMSUB_ASSOC2:
+; CHECK-VSX-SAFE: xsmuldp
+; CHECK-VSX-SAFE-NEXT: xsmaddadp
+; CHECK-VSX-SAFE-NEXT: xssubdp
+; CHECK-VSX-SAFE-NEXT: blr
+
+; CHECK-UNSAFE-VSX-LABEL: test_FMSUB_ASSOC2:
+; CHECK-UNSAFE-VSX: xsnmsubmdp
+; CHECK-UNSAFE-VSX-NEXT: xsnmsubadp
+; CHECK-UNSAFE-VSX-NEXT: fmr
+; CHECK-UNSAFE-VSX-NEXT: blr
 }
 
 define double @test_FMADD_ASSOC_EXT1(float %A, float %B, double %C,
diff --git a/test/CodeGen/PowerPC/fp-int-conversions-direct-moves.ll b/test/CodeGen/PowerPC/fp-int-conversions-direct-moves.ll
index 9b8fd409579337a60e46b469214894ed9126c6da..955b1f27ca2670ece8d7728941a5280020288421 100644
--- a/test/CodeGen/PowerPC/fp-int-conversions-direct-moves.ll
+++ b/test/CodeGen/PowerPC/fp-int-conversions-direct-moves.ll
@@ -323,7 +323,7 @@ entry:
   ret i64 %conv
 ; CHECK-LABEL: @_Z7testllff
 ; CHECK: xscvdpsxds [[CONVREG13:[0-9]+]], 1
-; CHECK: mfvsrd 3, [[CONVREG13]]
+; CHECK: mffprd 3, [[CONVREG13]]
 }
 
 ; Function Attrs: nounwind
@@ -349,7 +349,7 @@ entry:
   ret i64 %conv
 ; CHECK-LABEL: @_Z7testlldd
 ; CHECK: xscvdpsxds [[CONVREG14:[0-9]+]], 1
-; CHECK: mfvsrd 3, [[CONVREG14]]
+; CHECK: mffprd 3, [[CONVREG14]]
 }
 
 ; Function Attrs: nounwind
@@ -375,7 +375,7 @@ entry:
   ret i64 %conv
 ; CHECK-LABEL: @_Z8testullff
 ; CHECK: xscvdpuxds [[CONVREG15:[0-9]+]], 1
-; CHECK: mfvsrd 3, [[CONVREG15]]
+; CHECK: mffprd 3, [[CONVREG15]]
 }
 
 ; Function Attrs: nounwind
@@ -401,7 +401,7 @@ entry:
   ret i64 %conv
 ; CHECK-LABEL: @_Z8testulldd
 ; CHECK: xscvdpuxds [[CONVREG16:[0-9]+]], 1
-; CHECK: mfvsrd 3, [[CONVREG16]]
+; CHECK: mffprd 3, [[CONVREG16]]
 }
 
 ; Function Attrs: nounwind
diff --git a/test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll b/test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll
index 37120a56f4dabc3957857a88562aa72121f2936f..cd4eac42f26c842260c68068c5a6f4946d52d13f 100644
--- a/test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll
+++ b/test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll
@@ -11,21 +11,17 @@ entry:
 ; PPC64-DAG: stxsdx 1, 0, [[ADDR_LO:[0-9]+]]
 ; PPC64-DAG: addi [[ADDR_HI]], [[SP:[0-9]+]], [[OFFSET_HI:-?[0-9]+]]
 ; PPC64-DAG: addi [[ADDR_LO]], [[SP]], [[OFFSET_LO:-?[0-9]+]]
-; PPC64-DAG: li [[MASK_REG:[0-9]+]], 1
-; PPC64: sldi [[MASK_REG]], [[MASK_REG]], 63
 ; PPC64-DAG: ld [[HI:[0-9]+]], [[OFFSET_LO]]([[SP]])
 ; PPC64-DAG: ld [[LO:[0-9]+]], [[OFFSET_HI]]([[SP]])
-; PPC64: and [[FLIP_BIT:[0-9]+]], [[HI]], [[MASK_REG]]
+; PPC64-DAG: rldicr [[FLIP_BIT:[0-9]+]], [[HI]], 0, 0
 ; PPC64-DAG: xor 3, [[HI]], [[FLIP_BIT]]
 ; PPC64-DAG: xor 4, [[LO]], [[FLIP_BIT]]
 ; PPC64: blr
 
 ; PPC64-P8-LABEL: test_abs:
-; PPC64-P8-DAG: mfvsrd [[LO:[0-9]+]], 2
-; PPC64-P8-DAG: mfvsrd [[HI:[0-9]+]], 1
-; PPC64-P8-DAG: li [[MASK_REG:[0-9]+]], 1
-; PPC64-P8-DAG: sldi [[SHIFT_REG:[0-9]+]], [[MASK_REG]], 63
-; PPC64-P8: and [[FLIP_BIT:[0-9]+]], [[HI]], [[SHIFT_REG]]
+; PPC64-P8-DAG: mffprd [[LO:[0-9]+]], 2
+; PPC64-P8-DAG: mffprd [[HI:[0-9]+]], 1
+; PPC64-P8-DAG: rldicr [[FLIP_BIT:[0-9]+]], [[HI]], 0, 0
 ; PPC64-P8-DAG: xor 3, [[HI]], [[FLIP_BIT]]
 ; PPC64-P8-DAG: xor 4, [[LO]], [[FLIP_BIT]]
 ; PPC64-P8: blr
@@ -63,10 +59,10 @@ entry:
 ; PPC64: blr
 
 ; PPC64-P8-LABEL: test_neg:
-; PPC64-P8-DAG: mfvsrd [[LO:[0-9]+]], 2
-; PPC64-P8-DAG: mfvsrd [[HI:[0-9]+]], 1
+; PPC64-P8-DAG: mffprd [[LO:[0-9]+]], 2
+; PPC64-P8-DAG: mffprd [[HI:[0-9]+]], 1
 ; PPC64-P8-DAG: li [[IMM1:[0-9]+]], 1
-; PPC64-P8-DAG: sldi [[FLIP_BIT]], [[IMM1]], 63
+; PPC64-P8-DAG: sldi [[FLIP_BIT:[0-9]+]], [[IMM1]], 63
 ; PPC64-P8-NOT: BARRIER
 ; PPC64-P8-DAG: xor 3, [[HI]], [[FLIP_BIT]]
 ; PPC64-P8-DAG: xor 4, [[LO]], [[FLIP_BIT]]
@@ -93,29 +89,25 @@ entry:
 ; PPC64-LABEL: test_copysign:
 ; PPC64-DAG: stxsdx 1, 0, [[ADDR_REG:[0-9]+]]
 ; PPC64-DAG: addi [[ADDR_REG]], 1, [[OFFSET:-?[0-9]+]]
-; PPC64-DAG: li [[SIGN:[0-9]+]], 1
-; PPC64-DAG: sldi [[SIGN]], [[SIGN]], 63
 ; PPC64-DAG: li [[HI_TMP:[0-9]+]], 16399
 ; PPC64-DAG: sldi [[CST_HI:[0-9]+]], [[HI_TMP]], 48
 ; PPC64-DAG: li [[LO_TMP:[0-9]+]], 3019
 ; PPC64-DAG: sldi [[CST_LO:[0-9]+]], [[LO_TMP]], 52
 ; PPC64-NOT: BARRIER
 ; PPC64-DAG: ld [[X_HI:[0-9]+]], [[OFFSET]](1)
-; PPC64-DAG: and [[NEW_HI_TMP:[0-9]+]], [[X_HI]], [[SIGN]]
+; PPC64-DAG: rldicr [[NEW_HI_TMP:[0-9]+]], [[X_HI]], 0, 0
 ; PPC64-DAG: or 3, [[NEW_HI_TMP]], [[CST_HI]]
-; PPC64-DAG: xor 4, [[SIGN]], [[CST_LO]]
+; PPC64-DAG: xor 4, [[NEW_HI_TMP]], [[CST_LO]]
 ; PPC64: blr
 
 ; PPC64-P8-LABEL: test_copysign:
-; PPC64-P8-DAG: mfvsrd [[X_HI:[0-9]+]], 1
-; PPC64-P8-DAG: li [[SIGN:[0-9]+]], 1
-; PPC64-P8-DAG: sldi [[SIGN]], [[SIGN]], 63
+; PPC64-P8-DAG: mffprd [[X_HI:[0-9]+]], 1
 ; PPC64-P8-DAG: li [[HI_TMP:[0-9]+]], 16399
 ; PPC64-P8-DAG: sldi [[CST_HI:[0-9]+]], [[HI_TMP]], 48
 ; PPC64-P8-DAG: li [[LO_TMP:[0-9]+]], 3019
 ; PPC64-P8-DAG: sldi [[CST_LO:[0-9]+]], [[LO_TMP]], 52
 ; PPC64-P8-NOT: BARRIER
-; PPC64-P8-DAG: and [[NEW_HI_TMP:[0-9]+]], [[X_HI]], [[SIGN]]
+; PPC64-P8-DAG: rldicr [[NEW_HI_TMP:[0-9]+]], [[X_HI]], 0, 0
 ; PPC64-P8-DAG: or 3, [[NEW_HI_TMP]], [[CST_HI]]
 ; PPC64-P8-DAG: xor 4, [[NEW_HI_TMP]], [[CST_LO]]
 ; PPC64-P8: blr
diff --git a/test/CodeGen/PowerPC/i64_fp_round.ll b/test/CodeGen/PowerPC/i64_fp_round.ll
index 5e959f73568427e26742cf31dac36674c35b380d..9fe7a3bfcbb708a06b3f68c877807430a5ceaf1d 100644
--- a/test/CodeGen/PowerPC/i64_fp_round.ll
+++ b/test/CodeGen/PowerPC/i64_fp_round.ll
@@ -19,11 +19,14 @@ entry:
 ; CHECK: addi [[REG2:[0-9]+]], [[REG1]], 1
 ; CHECK: cmpldi [[REG2]], 1
 ; CHECK: isel [[REG3:[0-9]+]], {{[0-9]+}}, 3, 1
+; CHECK-NO-ISEL: rldicr [[REG2:[0-9]+]], {{[0-9]+}}, 0, 52
 ; CHECK-NO-ISEL: bc 12, 1, [[TRUE:.LBB[0-9]+]]
-; CHECK-NO-ISEL: ori 11, 3, 0
+; CHECK-NO-ISEL: ori [[REG3:[0-9]+]], 3, 0
 ; CHECK-NO-ISEL-NEXT: b [[SUCCESSOR:.LBB[0-9]+]]
 ; CHECK-NO-ISEL-NEXT: [[TRUE]]
-; CHECK-NO-ISEL-NEXT: addi 11, 4, 0
+; CHECK-NO-ISEL-NEXT: addi [[REG3]], [[REG2]], 0
+; CHECK-NO-ISEL-NEXT: [[SUCCESSOR]]
+; CHECK-NO-ISEL: std [[REG3]], -{{[0-9]+}}(1)
 ; CHECK: std [[REG3]], -{{[0-9]+}}(1)
 
 
diff --git a/test/CodeGen/PowerPC/indirectbr.ll b/test/CodeGen/PowerPC/indirectbr.ll
index d1e03ca7773ace0c2a160b2087c353fb59e0664f..c040d7859a8b07c1ee6f439bfe115531c8e0f9c5 100644
--- a/test/CodeGen/PowerPC/indirectbr.ll
+++ b/test/CodeGen/PowerPC/indirectbr.ll
@@ -17,23 +17,35 @@ entry:
 bb2:                                              ; preds = %entry, %bb3
   %gotovar.4.0 = phi i8* [ %gotovar.4.0.pre, %bb3 ], [ %0, %entry ] ; <i8*> [#uses=1]
 ; PIC: mtctr
-; PIC-NEXT: li
-; PIC-NEXT: li
-; PIC-NEXT: li
-; PIC-NEXT: li
 ; PIC-NEXT: bctr
+; PIC: li
+; PIC: b LBB
+; PIC: li
+; PIC: b LBB
+; PIC: li
+; PIC: b LBB
+; PIC: li
+; PIC: b LBB
 ; STATIC: mtctr
-; STATIC-NEXT: li
-; STATIC-NEXT: li
-; STATIC-NEXT: li
-; STATIC-NEXT: li
 ; STATIC-NEXT: bctr
+; STATIC: li
+; STATIC: b LBB
+; STATIC: li
+; STATIC: b LBB
+; STATIC: li
+; STATIC: b LBB
+; STATIC: li
+; STATIC: b LBB
 ; PPC64: mtctr
-; PPC64-NEXT: li
-; PPC64-NEXT: li
-; PPC64-NEXT: li
-; PPC64-NEXT: li
 ; PPC64-NEXT: bctr
+; PPC64: li
+; PPC64: b LBB
+; PPC64: li
+; PPC64: b LBB
+; PPC64: li
+; PPC64: b LBB
+; PPC64: li
+; PPC64: b LBB
   indirectbr i8* %gotovar.4.0, [label %L5, label %L4, label %L3, label %L2, label %L1]
 
 bb3:                                              ; preds = %entry
diff --git a/test/CodeGen/PowerPC/jaggedstructs.ll b/test/CodeGen/PowerPC/jaggedstructs.ll
index b28b34d7814f02e430ad65b12fb303d50808a2cc..6128316f45fa3d0712cab94d96acb36fa405d0e8 100644
--- a/test/CodeGen/PowerPC/jaggedstructs.ll
+++ b/test/CodeGen/PowerPC/jaggedstructs.ll
@@ -18,31 +18,31 @@ entry:
   ret void
 }
 
-; CHECK: std 6, 184(1)
-; CHECK: std 5, 176(1)
-; CHECK: std 4, 168(1)
-; CHECK: std 3, 160(1)
-; CHECK: lbz {{[0-9]+}}, 167(1)
-; CHECK: lhz {{[0-9]+}}, 165(1)
-; CHECK: stb {{[0-9]+}}, 55(1)
-; CHECK: sth {{[0-9]+}}, 53(1)
-; CHECK: lbz {{[0-9]+}}, 175(1)
-; CHECK: lwz {{[0-9]+}}, 171(1)
-; CHECK: stb {{[0-9]+}}, 63(1)
-; CHECK: stw {{[0-9]+}}, 59(1)
-; CHECK: lhz {{[0-9]+}}, 182(1)
-; CHECK: lwz {{[0-9]+}}, 178(1)
-; CHECK: sth {{[0-9]+}}, 70(1)
-; CHECK: stw {{[0-9]+}}, 66(1)
-; CHECK: lbz {{[0-9]+}}, 191(1)
-; CHECK: lhz {{[0-9]+}}, 189(1)
-; CHECK: lwz {{[0-9]+}}, 185(1)
-; CHECK: stb {{[0-9]+}}, 79(1)
-; CHECK: sth {{[0-9]+}}, 77(1)
-; CHECK: stw {{[0-9]+}}, 73(1)
-; CHECK: ld 6, 72(1)
-; CHECK: ld 5, 64(1)
-; CHECK: ld 4, 56(1)
-; CHECK: ld 3, 48(1)
+; CHECK-DAG: std 3, 160(1)
+; CHECK-DAG: std 6, 184(1)
+; CHECK-DAG: std 5, 176(1)
+; CHECK-DAG: std 4, 168(1)
+; CHECK-DAG: lbz {{[0-9]+}}, 167(1)
+; CHECK-DAG: lhz {{[0-9]+}}, 165(1)
+; CHECK-DAG: stb {{[0-9]+}}, 55(1)
+; CHECK-DAG: sth {{[0-9]+}}, 53(1)
+; CHECK-DAG: lbz {{[0-9]+}}, 175(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 171(1)
+; CHECK-DAG: stb {{[0-9]+}}, 63(1)
+; CHECK-DAG: stw {{[0-9]+}}, 59(1)
+; CHECK-DAG: lhz {{[0-9]+}}, 182(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 178(1)
+; CHECK-DAG: sth {{[0-9]+}}, 70(1)
+; CHECK-DAG: stw {{[0-9]+}}, 66(1)
+; CHECK-DAG: lbz {{[0-9]+}}, 191(1)
+; CHECK-DAG: lhz {{[0-9]+}}, 189(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 185(1)
+; CHECK-DAG: stb {{[0-9]+}}, 79(1)
+; CHECK-DAG: sth {{[0-9]+}}, 77(1)
+; CHECK-DAG: stw {{[0-9]+}}, 73(1)
+; CHECK-DAG: ld 6, 72(1)
+; CHECK-DAG: ld 5, 64(1)
+; CHECK-DAG: ld 4, 56(1)
+; CHECK-DAG: ld 3, 48(1)
 
 declare void @check(%struct.S3* byval, %struct.S5* byval, %struct.S6* byval, %struct.S7* byval)
diff --git a/test/CodeGen/PowerPC/lsa.ll b/test/CodeGen/PowerPC/lsa.ll
index dc74b9dbca2234c0c8c5e2f8b83f5266212da041..d0ebd473133cf1a134d03b4792f71a88401567a9 100644
--- a/test/CodeGen/PowerPC/lsa.ll
+++ b/test/CodeGen/PowerPC/lsa.ll
@@ -8,11 +8,11 @@ entry:
   %w = alloca [8200 x i32], align 4
   %q = alloca [8200 x i32], align 4
   %0 = bitcast [8200 x i32]* %v to i8*
-  call void @llvm.lifetime.start(i64 32800, i8* %0) #0
+  call void @llvm.lifetime.start.p0i8(i64 32800, i8* %0) #0
   %1 = bitcast [8200 x i32]* %w to i8*
-  call void @llvm.lifetime.start(i64 32800, i8* %1) #0
+  call void @llvm.lifetime.start.p0i8(i64 32800, i8* %1) #0
   %2 = bitcast [8200 x i32]* %q to i8*
-  call void @llvm.lifetime.start(i64 32800, i8* %2) #0
+  call void @llvm.lifetime.start.p0i8(i64 32800, i8* %2) #0
   %arraydecay = getelementptr inbounds [8200 x i32], [8200 x i32]* %q, i64 0, i64 0
   %arraydecay1 = getelementptr inbounds [8200 x i32], [8200 x i32]* %v, i64 0, i64 0
   %arraydecay2 = getelementptr inbounds [8200 x i32], [8200 x i32]* %w, i64 0, i64 0
@@ -28,16 +28,16 @@ entry:
 ; CHECK: blr
 
   %add = add nsw i32 %4, %3
-  call void @llvm.lifetime.end(i64 32800, i8* %2) #0
-  call void @llvm.lifetime.end(i64 32800, i8* %1) #0
-  call void @llvm.lifetime.end(i64 32800, i8* %0) #0
+  call void @llvm.lifetime.end.p0i8(i64 32800, i8* %2) #0
+  call void @llvm.lifetime.end.p0i8(i64 32800, i8* %1) #0
+  call void @llvm.lifetime.end.p0i8(i64 32800, i8* %0) #0
   ret i32 %add
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0
 
 declare void @bar(i32*, i32*, i32*)
 
-declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
 
 attributes #0 = { nounwind }
diff --git a/test/CodeGen/PowerPC/mature-mc-support.ll b/test/CodeGen/PowerPC/mature-mc-support.ll
index aa387f6e26667faceca0581cb1b5ee4e4ab18f2e..543877d60cfae8a2f0d14d4c12a203443c250a5f 100644
--- a/test/CodeGen/PowerPC/mature-mc-support.ll
+++ b/test/CodeGen/PowerPC/mature-mc-support.ll
@@ -28,4 +28,4 @@
 
 module asm "	.this_directive_is_very_unlikely_to_exist"
 
-; CHECK: LLVM ERROR: Error parsing inline asm
+; CHECK: error: unknown directive
diff --git a/test/CodeGen/PowerPC/misched-inorder-latency.ll b/test/CodeGen/PowerPC/misched-inorder-latency.ll
index ded3111da977b319b843d114afc99e63f0d9ef70..26663d81f35756cbbe2d4adc9b262d789f84b8f3 100644
--- a/test/CodeGen/PowerPC/misched-inorder-latency.ll
+++ b/test/CodeGen/PowerPC/misched-inorder-latency.ll
@@ -17,7 +17,7 @@ entry:
   %sum1 = add i32 %sumin, 1
   %val1 = load i32, i32* %ptr
   %p = icmp eq i32 %sumin, 0
-  br i1 %p, label %true, label %end
+  br i1 %p, label %true, label %end, !prof !1
 true:
   %sum2 = add i32 %sum1, 1
   %ptr2 = getelementptr i32, i32* %ptr, i32 1
@@ -53,3 +53,5 @@ end:
   ret i32 %valmerge
 }
 declare void @llvm.prefetch(i8*, i32, i32, i32) nounwind
+
+!1 = !{!"branch_weights", i32 2, i32 1}
diff --git a/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll b/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
index 1f317992a3b79ff063d23c0e18d18182a8c089ba..f399b2584d0b112881dc84b57d7ea0232845bdc7 100644
--- a/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
+++ b/test/CodeGen/PowerPC/p8-scalar_vector_conversions.ll
@@ -7,13 +7,10 @@
 
 @d = common global double 0.000000e+00, align 8
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define <16 x i8> @buildc(i8 zeroext %a) {
 entry:
-  %a.addr = alloca i8, align 1
-  store i8 %a, i8* %a.addr, align 1
-  %0 = load i8, i8* %a.addr, align 1
-  %splat.splatinsert = insertelement <16 x i8> undef, i8 %0, i32 0
+  %splat.splatinsert = insertelement <16 x i8> undef, i8 %a, i32 0
   %splat.splat = shufflevector <16 x i8> %splat.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
   ret <16 x i8> %splat.splat
 ; CHECK: sldi [[REG1:[0-9]+]], 3, 56
@@ -22,13 +19,10 @@ entry:
 ; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]]
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define <8 x i16> @builds(i16 zeroext %a) {
 entry:
-  %a.addr = alloca i16, align 2
-  store i16 %a, i16* %a.addr, align 2
-  %0 = load i16, i16* %a.addr, align 2
-  %splat.splatinsert = insertelement <8 x i16> undef, i16 %0, i32 0
+  %splat.splatinsert = insertelement <8 x i16> undef, i16 %a, i32 0
   %splat.splat = shufflevector <8 x i16> %splat.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
   ret <8 x i16> %splat.splat
 ; CHECK: sldi [[REG1:[0-9]+]], 3, 48
@@ -37,13 +31,10 @@ entry:
 ; CHECK-LE: xxswapd {{[0-9]+}}, [[REG1]]
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define <4 x i32> @buildi(i32 zeroext %a) {
 entry:
-  %a.addr = alloca i32, align 4
-  store i32 %a, i32* %a.addr, align 4
-  %0 = load i32, i32* %a.addr, align 4
-  %splat.splatinsert = insertelement <4 x i32> undef, i32 %0, i32 0
+  %splat.splatinsert = insertelement <4 x i32> undef, i32 %a, i32 0
   %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
   ret <4 x i32> %splat.splat
 ; CHECK: mtvsrwz [[REG1:[0-9]+]], 3
@@ -52,13 +43,10 @@ entry:
 ; CHECK-LE: xxspltw 34, [[REG1]]
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define <2 x i64> @buildl(i64 %a) {
 entry:
-  %a.addr = alloca i64, align 8
-  store i64 %a, i64* %a.addr, align 8
-  %0 = load i64, i64* %a.addr, align 8
-  %splat.splatinsert = insertelement <2 x i64> undef, i64 %0, i32 0
+  %splat.splatinsert = insertelement <2 x i64> undef, i64 %a, i32 0
   %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
   ret <2 x i64> %splat.splat
 ; CHECK: mtvsrd {{[0-9]+}}, 3
@@ -66,13 +54,10 @@ entry:
 ; CHECK-LE: xxspltd 34, [[REG1]], 0
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define <4 x float> @buildf(float %a) {
 entry:
-  %a.addr = alloca float, align 4
-  store float %a, float* %a.addr, align 4
-  %0 = load float, float* %a.addr, align 4
-  %splat.splatinsert = insertelement <4 x float> undef, float %0, i32 0
+  %splat.splatinsert = insertelement <4 x float> undef, float %a, i32 0
   %splat.splat = shufflevector <4 x float> %splat.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer
   ret <4 x float> %splat.splat
 ; CHECK: xscvdpspn [[REG1:[0-9]+]], 1
@@ -83,8 +68,8 @@ entry:
 
 ; The optimization to remove stack operations from PPCDAGToDAGISel::Select
 ; should still trigger for v2f64, producing an lxvdsx.
-; Function Attrs: nounwind
-define <2 x double> @buildd() #0 {
+; Function Attrs: norecurse nounwind readonly
+define <2 x double> @buildd() {
 entry:
   %0 = load double, double* @d, align 8
   %splat.splatinsert = insertelement <2 x double> undef, double %0, i32 0
@@ -96,13 +81,10 @@ entry:
 ; CHECK-LE: lxvdsx 34, 0, [[REG1]]
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i8 @getsc0(<16 x i8> %vsc) {
 entry:
-  %vsc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vsc, <16 x i8>* %vsc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vsc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 0
+  %vecext = extractelement <16 x i8> %vsc, i32 0
   ret i8 %vecext
 ; CHECK-LABEL: @getsc0
 ; CHECK: mfvsrd 3, 34
@@ -114,13 +96,10 @@ entry:
 ; CHECK-LE: extsb 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i8 @getsc1(<16 x i8> %vsc) {
 entry:
-  %vsc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vsc, <16 x i8>* %vsc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vsc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 1
+  %vecext = extractelement <16 x i8> %vsc, i32 1
   ret i8 %vecext
 ; CHECK-LABEL: @getsc1
 ; CHECK: mfvsrd 3, 34
@@ -132,13 +111,10 @@ entry:
 ; CHECK-LE: extsb 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i8 @getsc2(<16 x i8> %vsc) {
 entry:
-  %vsc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vsc, <16 x i8>* %vsc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vsc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 2
+  %vecext = extractelement <16 x i8> %vsc, i32 2
   ret i8 %vecext
 ; CHECK-LABEL: @getsc2
 ; CHECK: mfvsrd 3, 34
@@ -150,13 +126,10 @@ entry:
 ; CHECK-LE: extsb 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i8 @getsc3(<16 x i8> %vsc) {
 entry:
-  %vsc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vsc, <16 x i8>* %vsc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vsc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 3
+  %vecext = extractelement <16 x i8> %vsc, i32 3
   ret i8 %vecext
 ; CHECK-LABEL: @getsc3
 ; CHECK: mfvsrd 3, 34
@@ -168,13 +141,10 @@ entry:
 ; CHECK-LE: extsb 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i8 @getsc4(<16 x i8> %vsc) {
 entry:
-  %vsc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vsc, <16 x i8>* %vsc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vsc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 4
+  %vecext = extractelement <16 x i8> %vsc, i32 4
   ret i8 %vecext
 ; CHECK-LABEL: @getsc4
 ; CHECK: mfvsrd 3, 34
@@ -186,13 +156,10 @@ entry:
 ; CHECK-LE: extsb 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i8 @getsc5(<16 x i8> %vsc) {
 entry:
-  %vsc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vsc, <16 x i8>* %vsc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vsc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 5
+  %vecext = extractelement <16 x i8> %vsc, i32 5
   ret i8 %vecext
 ; CHECK-LABEL: @getsc5
 ; CHECK: mfvsrd 3, 34
@@ -204,13 +171,10 @@ entry:
 ; CHECK-LE: extsb 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i8 @getsc6(<16 x i8> %vsc) {
 entry:
-  %vsc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vsc, <16 x i8>* %vsc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vsc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 6
+  %vecext = extractelement <16 x i8> %vsc, i32 6
   ret i8 %vecext
 ; CHECK-LABEL: @getsc6
 ; CHECK: mfvsrd 3, 34
@@ -222,13 +186,10 @@ entry:
 ; CHECK-LE: extsb 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i8 @getsc7(<16 x i8> %vsc) {
 entry:
-  %vsc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vsc, <16 x i8>* %vsc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vsc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 7
+  %vecext = extractelement <16 x i8> %vsc, i32 7
   ret i8 %vecext
 ; CHECK-LABEL: @getsc7
 ; CHECK: mfvsrd 3, 34
@@ -240,13 +201,10 @@ entry:
 ; CHECK-LE: extsb 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i8 @getsc8(<16 x i8> %vsc) {
 entry:
-  %vsc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vsc, <16 x i8>* %vsc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vsc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 8
+  %vecext = extractelement <16 x i8> %vsc, i32 8
   ret i8 %vecext
 ; CHECK-LABEL: @getsc8
 ; CHECK: mfvsrd 3,
@@ -258,13 +216,10 @@ entry:
 ; CHECK-LE: extsb 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i8 @getsc9(<16 x i8> %vsc) {
 entry:
-  %vsc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vsc, <16 x i8>* %vsc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vsc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 9
+  %vecext = extractelement <16 x i8> %vsc, i32 9
   ret i8 %vecext
 ; CHECK-LABEL: @getsc9
 ; CHECK: mfvsrd 3,
@@ -276,13 +231,10 @@ entry:
 ; CHECK-LE: extsb 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i8 @getsc10(<16 x i8> %vsc) {
 entry:
-  %vsc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vsc, <16 x i8>* %vsc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vsc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 10
+  %vecext = extractelement <16 x i8> %vsc, i32 10
   ret i8 %vecext
 ; CHECK-LABEL: @getsc10
 ; CHECK: mfvsrd 3,
@@ -294,13 +246,10 @@ entry:
 ; CHECK-LE: extsb 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i8 @getsc11(<16 x i8> %vsc) {
 entry:
-  %vsc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vsc, <16 x i8>* %vsc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vsc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 11
+  %vecext = extractelement <16 x i8> %vsc, i32 11
   ret i8 %vecext
 ; CHECK-LABEL: @getsc11
 ; CHECK: mfvsrd 3,
@@ -312,13 +261,10 @@ entry:
 ; CHECK-LE: extsb 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i8 @getsc12(<16 x i8> %vsc) {
 entry:
-  %vsc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vsc, <16 x i8>* %vsc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vsc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 12
+  %vecext = extractelement <16 x i8> %vsc, i32 12
   ret i8 %vecext
 ; CHECK-LABEL: @getsc12
 ; CHECK: mfvsrd 3,
@@ -330,13 +276,10 @@ entry:
 ; CHECK-LE: extsb 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i8 @getsc13(<16 x i8> %vsc) {
 entry:
-  %vsc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vsc, <16 x i8>* %vsc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vsc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 13
+  %vecext = extractelement <16 x i8> %vsc, i32 13
   ret i8 %vecext
 ; CHECK-LABEL: @getsc13
 ; CHECK: mfvsrd 3,
@@ -348,13 +291,10 @@ entry:
 ; CHECK-LE: extsb 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i8 @getsc14(<16 x i8> %vsc) {
 entry:
-  %vsc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vsc, <16 x i8>* %vsc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vsc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 14
+  %vecext = extractelement <16 x i8> %vsc, i32 14
   ret i8 %vecext
 ; CHECK-LABEL: @getsc14
 ; CHECK: mfvsrd 3,
@@ -366,13 +306,10 @@ entry:
 ; CHECK-LE: extsb 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i8 @getsc15(<16 x i8> %vsc) {
 entry:
-  %vsc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vsc, <16 x i8>* %vsc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vsc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 15
+  %vecext = extractelement <16 x i8> %vsc, i32 15
   ret i8 %vecext
 ; CHECK-LABEL: @getsc15
 ; CHECK: mfvsrd 3,
@@ -383,13 +320,10 @@ entry:
 ; CHECK-LE: extsb 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @getuc0(<16 x i8> %vuc) {
 entry:
-  %vuc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vuc, <16 x i8>* %vuc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vuc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 0
+  %vecext = extractelement <16 x i8> %vuc, i32 0
   ret i8 %vecext
 ; CHECK-LABEL: @getuc0
 ; CHECK: mfvsrd 3, 34
@@ -400,13 +334,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 56
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @getuc1(<16 x i8> %vuc) {
 entry:
-  %vuc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vuc, <16 x i8>* %vuc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vuc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 1
+  %vecext = extractelement <16 x i8> %vuc, i32 1
   ret i8 %vecext
 ; CHECK-LABEL: @getuc1
 ; CHECK: mfvsrd 3, 34
@@ -418,13 +349,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 56
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @getuc2(<16 x i8> %vuc) {
 entry:
-  %vuc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vuc, <16 x i8>* %vuc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vuc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 2
+  %vecext = extractelement <16 x i8> %vuc, i32 2
   ret i8 %vecext
 ; CHECK-LABEL: @getuc2
 ; CHECK: mfvsrd 3, 34
@@ -436,13 +364,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 56
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @getuc3(<16 x i8> %vuc) {
 entry:
-  %vuc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vuc, <16 x i8>* %vuc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vuc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 3
+  %vecext = extractelement <16 x i8> %vuc, i32 3
   ret i8 %vecext
 ; CHECK-LABEL: @getuc3
 ; CHECK: mfvsrd 3, 34
@@ -454,13 +379,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 56
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @getuc4(<16 x i8> %vuc) {
 entry:
-  %vuc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vuc, <16 x i8>* %vuc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vuc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 4
+  %vecext = extractelement <16 x i8> %vuc, i32 4
   ret i8 %vecext
 ; CHECK-LABEL: @getuc4
 ; CHECK: mfvsrd 3, 34
@@ -472,13 +394,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 56
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @getuc5(<16 x i8> %vuc) {
 entry:
-  %vuc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vuc, <16 x i8>* %vuc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vuc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 5
+  %vecext = extractelement <16 x i8> %vuc, i32 5
   ret i8 %vecext
 ; CHECK-LABEL: @getuc5
 ; CHECK: mfvsrd 3, 34
@@ -490,13 +409,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 56
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @getuc6(<16 x i8> %vuc) {
 entry:
-  %vuc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vuc, <16 x i8>* %vuc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vuc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 6
+  %vecext = extractelement <16 x i8> %vuc, i32 6
   ret i8 %vecext
 ; CHECK-LABEL: @getuc6
 ; CHECK: mfvsrd 3, 34
@@ -508,13 +424,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 56
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @getuc7(<16 x i8> %vuc) {
 entry:
-  %vuc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vuc, <16 x i8>* %vuc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vuc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 7
+  %vecext = extractelement <16 x i8> %vuc, i32 7
   ret i8 %vecext
 ; CHECK-LABEL: @getuc7
 ; CHECK: mfvsrd 3, 34
@@ -525,13 +438,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 56
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @getuc8(<16 x i8> %vuc) {
 entry:
-  %vuc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vuc, <16 x i8>* %vuc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vuc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 8
+  %vecext = extractelement <16 x i8> %vuc, i32 8
   ret i8 %vecext
 ; CHECK-LABEL: @getuc8
 ; CHECK: mfvsrd 3,
@@ -542,13 +452,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 56
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @getuc9(<16 x i8> %vuc) {
 entry:
-  %vuc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vuc, <16 x i8>* %vuc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vuc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 9
+  %vecext = extractelement <16 x i8> %vuc, i32 9
   ret i8 %vecext
 ; CHECK-LABEL: @getuc9
 ; CHECK: mfvsrd 3,
@@ -560,13 +467,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 56
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @getuc10(<16 x i8> %vuc) {
 entry:
-  %vuc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vuc, <16 x i8>* %vuc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vuc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 10
+  %vecext = extractelement <16 x i8> %vuc, i32 10
   ret i8 %vecext
 ; CHECK-LABEL: @getuc10
 ; CHECK: mfvsrd 3,
@@ -578,13 +482,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 56
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @getuc11(<16 x i8> %vuc) {
 entry:
-  %vuc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vuc, <16 x i8>* %vuc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vuc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 11
+  %vecext = extractelement <16 x i8> %vuc, i32 11
   ret i8 %vecext
 ; CHECK-LABEL: @getuc11
 ; CHECK: mfvsrd 3,
@@ -596,13 +497,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 56
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @getuc12(<16 x i8> %vuc) {
 entry:
-  %vuc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vuc, <16 x i8>* %vuc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vuc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 12
+  %vecext = extractelement <16 x i8> %vuc, i32 12
   ret i8 %vecext
 ; CHECK-LABEL: @getuc12
 ; CHECK: mfvsrd 3,
@@ -614,13 +512,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 56
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @getuc13(<16 x i8> %vuc) {
 entry:
-  %vuc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vuc, <16 x i8>* %vuc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vuc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 13
+  %vecext = extractelement <16 x i8> %vuc, i32 13
   ret i8 %vecext
 ; CHECK-LABEL: @getuc13
 ; CHECK: mfvsrd 3,
@@ -632,13 +527,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 56
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @getuc14(<16 x i8> %vuc) {
 entry:
-  %vuc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vuc, <16 x i8>* %vuc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vuc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 14
+  %vecext = extractelement <16 x i8> %vuc, i32 14
   ret i8 %vecext
 ; CHECK-LABEL: @getuc14
 ; CHECK: mfvsrd 3,
@@ -650,13 +542,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 56
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @getuc15(<16 x i8> %vuc) {
 entry:
-  %vuc.addr = alloca <16 x i8>, align 16
-  store <16 x i8> %vuc, <16 x i8>* %vuc.addr, align 16
-  %0 = load <16 x i8>, <16 x i8>* %vuc.addr, align 16
-  %vecext = extractelement <16 x i8> %0, i32 15
+  %vecext = extractelement <16 x i8> %vuc, i32 15
   ret i8 %vecext
 ; CHECK-LABEL: @getuc15
 ; CHECK: mfvsrd 3,
@@ -667,16 +556,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 56
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i8 @getvelsc(<16 x i8> %vsc, i32 signext %i) {
 entry:
-  %vsc.addr = alloca <16 x i8>, align 16
-  %i.addr = alloca i32, align 4
-  store <16 x i8> %vsc, <16 x i8>* %vsc.addr, align 16
-  store i32 %i, i32* %i.addr, align 4
-  %0 = load <16 x i8>, <16 x i8>* %vsc.addr, align 16
-  %1 = load i32, i32* %i.addr, align 4
-  %vecext = extractelement <16 x i8> %0, i32 %1
+  %vecext = extractelement <16 x i8> %vsc, i32 %i
   ret i8 %vecext
 ; CHECK-LABEL: @getvelsc
 ; CHECK-DAG: andi. [[ANDI:[0-9]+]], {{[0-9]+}}, 8
@@ -701,16 +584,10 @@ entry:
 ; CHECK-DAG-LE: extsb 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i8 @getveluc(<16 x i8> %vuc, i32 signext %i) {
 entry:
-  %vuc.addr = alloca <16 x i8>, align 16
-  %i.addr = alloca i32, align 4
-  store <16 x i8> %vuc, <16 x i8>* %vuc.addr, align 16
-  store i32 %i, i32* %i.addr, align 4
-  %0 = load <16 x i8>, <16 x i8>* %vuc.addr, align 16
-  %1 = load i32, i32* %i.addr, align 4
-  %vecext = extractelement <16 x i8> %0, i32 %1
+  %vecext = extractelement <16 x i8> %vuc, i32 %i
   ret i8 %vecext
 ; CHECK-LABEL: @getveluc
 ; CHECK-DAG: andi. [[ANDI:[0-9]+]], {{[0-9]+}}, 8
@@ -735,13 +612,10 @@ entry:
 ; CHECK-DAG-LE: clrldi   3, 3, 56
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i16 @getss0(<8 x i16> %vss) {
 entry:
-  %vss.addr = alloca <8 x i16>, align 16
-  store <8 x i16> %vss, <8 x i16>* %vss.addr, align 16
-  %0 = load <8 x i16>, <8 x i16>* %vss.addr, align 16
-  %vecext = extractelement <8 x i16> %0, i32 0
+  %vecext = extractelement <8 x i16> %vss, i32 0
   ret i16 %vecext
 ; CHECK-LABEL: @getss0
 ; CHECK: mfvsrd 3, 34
@@ -753,13 +627,10 @@ entry:
 ; CHECK-LE: extsh 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i16 @getss1(<8 x i16> %vss) {
 entry:
-  %vss.addr = alloca <8 x i16>, align 16
-  store <8 x i16> %vss, <8 x i16>* %vss.addr, align 16
-  %0 = load <8 x i16>, <8 x i16>* %vss.addr, align 16
-  %vecext = extractelement <8 x i16> %0, i32 1
+  %vecext = extractelement <8 x i16> %vss, i32 1
   ret i16 %vecext
 ; CHECK-LABEL: @getss1
 ; CHECK: mfvsrd 3, 34
@@ -771,13 +642,10 @@ entry:
 ; CHECK-LE: extsh 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i16 @getss2(<8 x i16> %vss) {
 entry:
-  %vss.addr = alloca <8 x i16>, align 16
-  store <8 x i16> %vss, <8 x i16>* %vss.addr, align 16
-  %0 = load <8 x i16>, <8 x i16>* %vss.addr, align 16
-  %vecext = extractelement <8 x i16> %0, i32 2
+  %vecext = extractelement <8 x i16> %vss, i32 2
   ret i16 %vecext
 ; CHECK-LABEL: @getss2
 ; CHECK: mfvsrd 3, 34
@@ -789,13 +657,10 @@ entry:
 ; CHECK-LE: extsh 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i16 @getss3(<8 x i16> %vss) {
 entry:
-  %vss.addr = alloca <8 x i16>, align 16
-  store <8 x i16> %vss, <8 x i16>* %vss.addr, align 16
-  %0 = load <8 x i16>, <8 x i16>* %vss.addr, align 16
-  %vecext = extractelement <8 x i16> %0, i32 3
+  %vecext = extractelement <8 x i16> %vss, i32 3
   ret i16 %vecext
 ; CHECK-LABEL: @getss3
 ; CHECK: mfvsrd 3, 34
@@ -807,13 +672,10 @@ entry:
 ; CHECK-LE: extsh 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i16 @getss4(<8 x i16> %vss) {
 entry:
-  %vss.addr = alloca <8 x i16>, align 16
-  store <8 x i16> %vss, <8 x i16>* %vss.addr, align 16
-  %0 = load <8 x i16>, <8 x i16>* %vss.addr, align 16
-  %vecext = extractelement <8 x i16> %0, i32 4
+  %vecext = extractelement <8 x i16> %vss, i32 4
   ret i16 %vecext
 ; CHECK-LABEL: @getss4
 ; CHECK: mfvsrd 3,
@@ -825,13 +687,10 @@ entry:
 ; CHECK-LE: extsh 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i16 @getss5(<8 x i16> %vss) {
 entry:
-  %vss.addr = alloca <8 x i16>, align 16
-  store <8 x i16> %vss, <8 x i16>* %vss.addr, align 16
-  %0 = load <8 x i16>, <8 x i16>* %vss.addr, align 16
-  %vecext = extractelement <8 x i16> %0, i32 5
+  %vecext = extractelement <8 x i16> %vss, i32 5
   ret i16 %vecext
 ; CHECK-LABEL: @getss5
 ; CHECK: mfvsrd 3,
@@ -843,13 +702,10 @@ entry:
 ; CHECK-LE: extsh 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i16 @getss6(<8 x i16> %vss) {
 entry:
-  %vss.addr = alloca <8 x i16>, align 16
-  store <8 x i16> %vss, <8 x i16>* %vss.addr, align 16
-  %0 = load <8 x i16>, <8 x i16>* %vss.addr, align 16
-  %vecext = extractelement <8 x i16> %0, i32 6
+  %vecext = extractelement <8 x i16> %vss, i32 6
   ret i16 %vecext
 ; CHECK-LABEL: @getss6
 ; CHECK: mfvsrd 3,
@@ -861,13 +717,10 @@ entry:
 ; CHECK-LE: extsh 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i16 @getss7(<8 x i16> %vss) {
 entry:
-  %vss.addr = alloca <8 x i16>, align 16
-  store <8 x i16> %vss, <8 x i16>* %vss.addr, align 16
-  %0 = load <8 x i16>, <8 x i16>* %vss.addr, align 16
-  %vecext = extractelement <8 x i16> %0, i32 7
+  %vecext = extractelement <8 x i16> %vss, i32 7
   ret i16 %vecext
 ; CHECK-LABEL: @getss7
 ; CHECK: mfvsrd 3,
@@ -878,13 +731,10 @@ entry:
 ; CHECK-LE: extsh 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @getus0(<8 x i16> %vus) {
 entry:
-  %vus.addr = alloca <8 x i16>, align 16
-  store <8 x i16> %vus, <8 x i16>* %vus.addr, align 16
-  %0 = load <8 x i16>, <8 x i16>* %vus.addr, align 16
-  %vecext = extractelement <8 x i16> %0, i32 0
+  %vecext = extractelement <8 x i16> %vus, i32 0
   ret i16 %vecext
 ; CHECK-LABEL: @getus0
 ; CHECK: mfvsrd 3, 34
@@ -895,13 +745,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 48
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @getus1(<8 x i16> %vus) {
 entry:
-  %vus.addr = alloca <8 x i16>, align 16
-  store <8 x i16> %vus, <8 x i16>* %vus.addr, align 16
-  %0 = load <8 x i16>, <8 x i16>* %vus.addr, align 16
-  %vecext = extractelement <8 x i16> %0, i32 1
+  %vecext = extractelement <8 x i16> %vus, i32 1
   ret i16 %vecext
 ; CHECK-LABEL: @getus1
 ; CHECK: mfvsrd 3, 34
@@ -913,13 +760,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 48
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @getus2(<8 x i16> %vus) {
 entry:
-  %vus.addr = alloca <8 x i16>, align 16
-  store <8 x i16> %vus, <8 x i16>* %vus.addr, align 16
-  %0 = load <8 x i16>, <8 x i16>* %vus.addr, align 16
-  %vecext = extractelement <8 x i16> %0, i32 2
+  %vecext = extractelement <8 x i16> %vus, i32 2
   ret i16 %vecext
 ; CHECK-LABEL: @getus2
 ; CHECK: mfvsrd 3, 34
@@ -931,13 +775,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 48
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @getus3(<8 x i16> %vus) {
 entry:
-  %vus.addr = alloca <8 x i16>, align 16
-  store <8 x i16> %vus, <8 x i16>* %vus.addr, align 16
-  %0 = load <8 x i16>, <8 x i16>* %vus.addr, align 16
-  %vecext = extractelement <8 x i16> %0, i32 3
+  %vecext = extractelement <8 x i16> %vus, i32 3
   ret i16 %vecext
 ; CHECK-LABEL: @getus3
 ; CHECK: mfvsrd 3, 34
@@ -948,13 +789,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 48
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @getus4(<8 x i16> %vus) {
 entry:
-  %vus.addr = alloca <8 x i16>, align 16
-  store <8 x i16> %vus, <8 x i16>* %vus.addr, align 16
-  %0 = load <8 x i16>, <8 x i16>* %vus.addr, align 16
-  %vecext = extractelement <8 x i16> %0, i32 4
+  %vecext = extractelement <8 x i16> %vus, i32 4
   ret i16 %vecext
 ; CHECK-LABEL: @getus4
 ; CHECK: mfvsrd 3,
@@ -965,13 +803,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 48
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @getus5(<8 x i16> %vus) {
 entry:
-  %vus.addr = alloca <8 x i16>, align 16
-  store <8 x i16> %vus, <8 x i16>* %vus.addr, align 16
-  %0 = load <8 x i16>, <8 x i16>* %vus.addr, align 16
-  %vecext = extractelement <8 x i16> %0, i32 5
+  %vecext = extractelement <8 x i16> %vus, i32 5
   ret i16 %vecext
 ; CHECK-LABEL: @getus5
 ; CHECK: mfvsrd 3,
@@ -983,13 +818,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 48
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @getus6(<8 x i16> %vus) {
 entry:
-  %vus.addr = alloca <8 x i16>, align 16
-  store <8 x i16> %vus, <8 x i16>* %vus.addr, align 16
-  %0 = load <8 x i16>, <8 x i16>* %vus.addr, align 16
-  %vecext = extractelement <8 x i16> %0, i32 6
+  %vecext = extractelement <8 x i16> %vus, i32 6
   ret i16 %vecext
 ; CHECK-LABEL: @getus6
 ; CHECK: mfvsrd 3,
@@ -1001,13 +833,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 48
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @getus7(<8 x i16> %vus) {
 entry:
-  %vus.addr = alloca <8 x i16>, align 16
-  store <8 x i16> %vus, <8 x i16>* %vus.addr, align 16
-  %0 = load <8 x i16>, <8 x i16>* %vus.addr, align 16
-  %vecext = extractelement <8 x i16> %0, i32 7
+  %vecext = extractelement <8 x i16> %vus, i32 7
   ret i16 %vecext
 ; CHECK-LABEL: @getus7
 ; CHECK: mfvsrd 3,
@@ -1018,16 +847,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 48
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i16 @getvelss(<8 x i16> %vss, i32 signext %i) {
 entry:
-  %vss.addr = alloca <8 x i16>, align 16
-  %i.addr = alloca i32, align 4
-  store <8 x i16> %vss, <8 x i16>* %vss.addr, align 16
-  store i32 %i, i32* %i.addr, align 4
-  %0 = load <8 x i16>, <8 x i16>* %vss.addr, align 16
-  %1 = load i32, i32* %i.addr, align 4
-  %vecext = extractelement <8 x i16> %0, i32 %1
+  %vecext = extractelement <8 x i16> %vss, i32 %i
   ret i16 %vecext
 ; CHECK-LABEL: @getvelss
 ; CHECK-DAG: andi. [[ANDI:[0-9]+]], {{[0-9]+}}, 4
@@ -1054,16 +877,10 @@ entry:
 ; CHECK-DAG-LE: extsh 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i16 @getvelus(<8 x i16> %vus, i32 signext %i) {
 entry:
-  %vus.addr = alloca <8 x i16>, align 16
-  %i.addr = alloca i32, align 4
-  store <8 x i16> %vus, <8 x i16>* %vus.addr, align 16
-  store i32 %i, i32* %i.addr, align 4
-  %0 = load <8 x i16>, <8 x i16>* %vus.addr, align 16
-  %1 = load i32, i32* %i.addr, align 4
-  %vecext = extractelement <8 x i16> %0, i32 %1
+  %vecext = extractelement <8 x i16> %vus, i32 %i
   ret i16 %vecext
 ; CHECK-LABEL: @getvelus
 ; CHECK-DAG: andi. [[ANDI:[0-9]+]], {{[0-9]+}}, 4
@@ -1090,13 +907,10 @@ entry:
 ; CHECK-DAG-LE: clrldi   3, 3, 48
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i32 @getsi0(<4 x i32> %vsi) {
 entry:
-  %vsi.addr = alloca <4 x i32>, align 16
-  store <4 x i32> %vsi, <4 x i32>* %vsi.addr, align 16
-  %0 = load <4 x i32>, <4 x i32>* %vsi.addr, align 16
-  %vecext = extractelement <4 x i32> %0, i32 0
+  %vecext = extractelement <4 x i32> %vsi, i32 0
   ret i32 %vecext
 ; CHECK-LABEL: @getsi0
 ; CHECK: xxsldwi [[SHL:[0-9]+]], 34, 34, 3
@@ -1108,13 +922,10 @@ entry:
 ; CHECK-LE: extsw 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i32 @getsi1(<4 x i32> %vsi) {
 entry:
-  %vsi.addr = alloca <4 x i32>, align 16
-  store <4 x i32> %vsi, <4 x i32>* %vsi.addr, align 16
-  %0 = load <4 x i32>, <4 x i32>* %vsi.addr, align 16
-  %vecext = extractelement <4 x i32> %0, i32 1
+  %vecext = extractelement <4 x i32> %vsi, i32 1
   ret i32 %vecext
 ; CHECK-LABEL: @getsi1
 ; CHECK: mfvsrwz 3, 34
@@ -1125,13 +936,10 @@ entry:
 ; CHECK-LE: extsw 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i32 @getsi2(<4 x i32> %vsi) {
 entry:
-  %vsi.addr = alloca <4 x i32>, align 16
-  store <4 x i32> %vsi, <4 x i32>* %vsi.addr, align 16
-  %0 = load <4 x i32>, <4 x i32>* %vsi.addr, align 16
-  %vecext = extractelement <4 x i32> %0, i32 2
+  %vecext = extractelement <4 x i32> %vsi, i32 2
   ret i32 %vecext
 ; CHECK-LABEL: @getsi2
 ; CHECK: xxsldwi [[SHL:[0-9]+]], 34, 34, 1
@@ -1142,13 +950,10 @@ entry:
 ; CHECK-LE: extsw 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i32 @getsi3(<4 x i32> %vsi) {
 entry:
-  %vsi.addr = alloca <4 x i32>, align 16
-  store <4 x i32> %vsi, <4 x i32>* %vsi.addr, align 16
-  %0 = load <4 x i32>, <4 x i32>* %vsi.addr, align 16
-  %vecext = extractelement <4 x i32> %0, i32 3
+  %vecext = extractelement <4 x i32> %vsi, i32 3
   ret i32 %vecext
 ; CHECK-LABEL: @getsi3
 ; CHECK: xxswapd [[SHL:[0-9]+]], 34
@@ -1160,13 +965,10 @@ entry:
 ; CHECK-LE: extsw 3, 3
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i32 @getui0(<4 x i32> %vui) {
 entry:
-  %vui.addr = alloca <4 x i32>, align 16
-  store <4 x i32> %vui, <4 x i32>* %vui.addr, align 16
-  %0 = load <4 x i32>, <4 x i32>* %vui.addr, align 16
-  %vecext = extractelement <4 x i32> %0, i32 0
+  %vecext = extractelement <4 x i32> %vui, i32 0
   ret i32 %vecext
 ; CHECK-LABEL: @getui0
 ; CHECK: xxsldwi [[SHL:[0-9]+]], 34, 34, 3
@@ -1178,13 +980,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 32
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i32 @getui1(<4 x i32> %vui) {
 entry:
-  %vui.addr = alloca <4 x i32>, align 16
-  store <4 x i32> %vui, <4 x i32>* %vui.addr, align 16
-  %0 = load <4 x i32>, <4 x i32>* %vui.addr, align 16
-  %vecext = extractelement <4 x i32> %0, i32 1
+  %vecext = extractelement <4 x i32> %vui, i32 1
   ret i32 %vecext
 ; CHECK-LABEL: @getui1
 ; CHECK: mfvsrwz 3, 34
@@ -1195,13 +994,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 32
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i32 @getui2(<4 x i32> %vui) {
 entry:
-  %vui.addr = alloca <4 x i32>, align 16
-  store <4 x i32> %vui, <4 x i32>* %vui.addr, align 16
-  %0 = load <4 x i32>, <4 x i32>* %vui.addr, align 16
-  %vecext = extractelement <4 x i32> %0, i32 2
+  %vecext = extractelement <4 x i32> %vui, i32 2
   ret i32 %vecext
 ; CHECK-LABEL: @getui2
 ; CHECK: xxsldwi [[SHL:[0-9]+]], 34, 34, 1
@@ -1212,13 +1008,10 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 32
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i32 @getui3(<4 x i32> %vui) {
 entry:
-  %vui.addr = alloca <4 x i32>, align 16
-  store <4 x i32> %vui, <4 x i32>* %vui.addr, align 16
-  %0 = load <4 x i32>, <4 x i32>* %vui.addr, align 16
-  %vecext = extractelement <4 x i32> %0, i32 3
+  %vecext = extractelement <4 x i32> %vui, i32 3
   ret i32 %vecext
 ; CHECK-LABEL: @getui3
 ; CHECK: xxswapd [[SHL:[0-9]+]], 34
@@ -1230,45 +1023,30 @@ entry:
 ; CHECK-LE: clrldi   3, 3, 32
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define signext i32 @getvelsi(<4 x i32> %vsi, i32 signext %i) {
 entry:
-  %vsi.addr = alloca <4 x i32>, align 16
-  %i.addr = alloca i32, align 4
-  store <4 x i32> %vsi, <4 x i32>* %vsi.addr, align 16
-  store i32 %i, i32* %i.addr, align 4
-  %0 = load <4 x i32>, <4 x i32>* %vsi.addr, align 16
-  %1 = load i32, i32* %i.addr, align 4
-  %vecext = extractelement <4 x i32> %0, i32 %1
+  %vecext = extractelement <4 x i32> %vsi, i32 %i
   ret i32 %vecext
 ; CHECK-LABEL: @getvelsi
 ; CHECK-LE-LABEL: @getvelsi
 ; FIXME: add check patterns when variable element extraction is implemented
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define zeroext i32 @getvelui(<4 x i32> %vui, i32 signext %i) {
 entry:
-  %vui.addr = alloca <4 x i32>, align 16
-  %i.addr = alloca i32, align 4
-  store <4 x i32> %vui, <4 x i32>* %vui.addr, align 16
-  store i32 %i, i32* %i.addr, align 4
-  %0 = load <4 x i32>, <4 x i32>* %vui.addr, align 16
-  %1 = load i32, i32* %i.addr, align 4
-  %vecext = extractelement <4 x i32> %0, i32 %1
+  %vecext = extractelement <4 x i32> %vui, i32 %i
   ret i32 %vecext
 ; CHECK-LABEL: @getvelui
 ; CHECK-LE-LABEL: @getvelui
 ; FIXME: add check patterns when variable element extraction is implemented
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define i64 @getsl0(<2 x i64> %vsl) {
 entry:
-  %vsl.addr = alloca <2 x i64>, align 16
-  store <2 x i64> %vsl, <2 x i64>* %vsl.addr, align 16
-  %0 = load <2 x i64>, <2 x i64>* %vsl.addr, align 16
-  %vecext = extractelement <2 x i64> %0, i32 0
+  %vecext = extractelement <2 x i64> %vsl, i32 0
   ret i64 %vecext
 ; CHECK-LABEL: @getsl0
 ; CHECK: mfvsrd 3, 34
@@ -1277,13 +1055,10 @@ entry:
 ; CHECK-LE: mfvsrd 3, [[SWP]]
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define i64 @getsl1(<2 x i64> %vsl) {
 entry:
-  %vsl.addr = alloca <2 x i64>, align 16
-  store <2 x i64> %vsl, <2 x i64>* %vsl.addr, align 16
-  %0 = load <2 x i64>, <2 x i64>* %vsl.addr, align 16
-  %vecext = extractelement <2 x i64> %0, i32 1
+  %vecext = extractelement <2 x i64> %vsl, i32 1
   ret i64 %vecext
 ; CHECK-LABEL: @getsl1
 ; CHECK: xxswapd  [[SWP:[0-9]+]], 34
@@ -1292,13 +1067,10 @@ entry:
 ; CHECK-LE: mfvsrd 3, 34
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define i64 @getul0(<2 x i64> %vul) {
 entry:
-  %vul.addr = alloca <2 x i64>, align 16
-  store <2 x i64> %vul, <2 x i64>* %vul.addr, align 16
-  %0 = load <2 x i64>, <2 x i64>* %vul.addr, align 16
-  %vecext = extractelement <2 x i64> %0, i32 0
+  %vecext = extractelement <2 x i64> %vul, i32 0
   ret i64 %vecext
 ; CHECK-LABEL: @getul0
 ; CHECK: mfvsrd 3, 34
@@ -1307,13 +1079,10 @@ entry:
 ; CHECK-LE: mfvsrd 3, [[SWP]]
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define i64 @getul1(<2 x i64> %vul) {
 entry:
-  %vul.addr = alloca <2 x i64>, align 16
-  store <2 x i64> %vul, <2 x i64>* %vul.addr, align 16
-  %0 = load <2 x i64>, <2 x i64>* %vul.addr, align 16
-  %vecext = extractelement <2 x i64> %0, i32 1
+  %vecext = extractelement <2 x i64> %vul, i32 1
   ret i64 %vecext
 ; CHECK-LABEL: @getul1
 ; CHECK: xxswapd  [[SWP:[0-9]+]], 34
@@ -1322,45 +1091,30 @@ entry:
 ; CHECK-LE: mfvsrd 3, 34
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define i64 @getvelsl(<2 x i64> %vsl, i32 signext %i) {
 entry:
-  %vsl.addr = alloca <2 x i64>, align 16
-  %i.addr = alloca i32, align 4
-  store <2 x i64> %vsl, <2 x i64>* %vsl.addr, align 16
-  store i32 %i, i32* %i.addr, align 4
-  %0 = load <2 x i64>, <2 x i64>* %vsl.addr, align 16
-  %1 = load i32, i32* %i.addr, align 4
-  %vecext = extractelement <2 x i64> %0, i32 %1
+  %vecext = extractelement <2 x i64> %vsl, i32 %i
   ret i64 %vecext
 ; CHECK-LABEL: @getvelsl
 ; CHECK-LE-LABEL: @getvelsl
 ; FIXME: add check patterns when variable element extraction is implemented
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define i64 @getvelul(<2 x i64> %vul, i32 signext %i) {
 entry:
-  %vul.addr = alloca <2 x i64>, align 16
-  %i.addr = alloca i32, align 4
-  store <2 x i64> %vul, <2 x i64>* %vul.addr, align 16
-  store i32 %i, i32* %i.addr, align 4
-  %0 = load <2 x i64>, <2 x i64>* %vul.addr, align 16
-  %1 = load i32, i32* %i.addr, align 4
-  %vecext = extractelement <2 x i64> %0, i32 %1
+  %vecext = extractelement <2 x i64> %vul, i32 %i
   ret i64 %vecext
 ; CHECK-LABEL: @getvelul
 ; CHECK-LE-LABEL: @getvelul
 ; FIXME: add check patterns when variable element extraction is implemented
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define float @getf0(<4 x float> %vf) {
 entry:
-  %vf.addr = alloca <4 x float>, align 16
-  store <4 x float> %vf, <4 x float>* %vf.addr, align 16
-  %0 = load <4 x float>, <4 x float>* %vf.addr, align 16
-  %vecext = extractelement <4 x float> %0, i32 0
+  %vecext = extractelement <4 x float> %vf, i32 0
   ret float %vecext
 ; CHECK-LABEL: @getf0
 ; CHECK: xscvspdpn 1, 34
@@ -1369,13 +1123,10 @@ entry:
 ; CHECK-LE: xscvspdpn 1, [[SHL]]
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define float @getf1(<4 x float> %vf) {
 entry:
-  %vf.addr = alloca <4 x float>, align 16
-  store <4 x float> %vf, <4 x float>* %vf.addr, align 16
-  %0 = load <4 x float>, <4 x float>* %vf.addr, align 16
-  %vecext = extractelement <4 x float> %0, i32 1
+  %vecext = extractelement <4 x float> %vf, i32 1
   ret float %vecext
 ; CHECK-LABEL: @getf1
 ; CHECK: xxsldwi [[SHL:[0-9]+]], 34, 34, 1
@@ -1385,13 +1136,10 @@ entry:
 ; CHECK-LE: xscvspdpn 1, [[SHL]]
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define float @getf2(<4 x float> %vf) {
 entry:
-  %vf.addr = alloca <4 x float>, align 16
-  store <4 x float> %vf, <4 x float>* %vf.addr, align 16
-  %0 = load <4 x float>, <4 x float>* %vf.addr, align 16
-  %vecext = extractelement <4 x float> %0, i32 2
+  %vecext = extractelement <4 x float> %vf, i32 2
   ret float %vecext
 ; CHECK-LABEL: @getf2
 ; CHECK: xxswapd [[SHL:[0-9]+]], 34
@@ -1401,13 +1149,10 @@ entry:
 ; CHECK-LE: xscvspdpn 1, [[SHL]]
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define float @getf3(<4 x float> %vf) {
 entry:
-  %vf.addr = alloca <4 x float>, align 16
-  store <4 x float> %vf, <4 x float>* %vf.addr, align 16
-  %0 = load <4 x float>, <4 x float>* %vf.addr, align 16
-  %vecext = extractelement <4 x float> %0, i32 3
+  %vecext = extractelement <4 x float> %vf, i32 3
   ret float %vecext
 ; CHECK-LABEL: @getf3
 ; CHECK: xxsldwi [[SHL:[0-9]+]], 34, 34, 3
@@ -1416,29 +1161,20 @@ entry:
 ; CHECK-LE: xscvspdpn 1, 34
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define float @getvelf(<4 x float> %vf, i32 signext %i) {
 entry:
-  %vf.addr = alloca <4 x float>, align 16
-  %i.addr = alloca i32, align 4
-  store <4 x float> %vf, <4 x float>* %vf.addr, align 16
-  store i32 %i, i32* %i.addr, align 4
-  %0 = load <4 x float>, <4 x float>* %vf.addr, align 16
-  %1 = load i32, i32* %i.addr, align 4
-  %vecext = extractelement <4 x float> %0, i32 %1
+  %vecext = extractelement <4 x float> %vf, i32 %i
   ret float %vecext
 ; CHECK-LABEL: @getvelf
 ; CHECK-LE-LABEL: @getvelf
 ; FIXME: add check patterns when variable element extraction is implemented
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define double @getd0(<2 x double> %vd) {
 entry:
-  %vd.addr = alloca <2 x double>, align 16
-  store <2 x double> %vd, <2 x double>* %vd.addr, align 16
-  %0 = load <2 x double>, <2 x double>* %vd.addr, align 16
-  %vecext = extractelement <2 x double> %0, i32 0
+  %vecext = extractelement <2 x double> %vd, i32 0
   ret double %vecext
 ; CHECK-LABEL: @getd0
 ; CHECK: xxlor 1, 34, 34
@@ -1446,13 +1182,10 @@ entry:
 ; CHECK-LE: xxswapd  1, 34
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define double @getd1(<2 x double> %vd) {
 entry:
-  %vd.addr = alloca <2 x double>, align 16
-  store <2 x double> %vd, <2 x double>* %vd.addr, align 16
-  %0 = load <2 x double>, <2 x double>* %vd.addr, align 16
-  %vecext = extractelement <2 x double> %0, i32 1
+  %vecext = extractelement <2 x double> %vd, i32 1
   ret double %vecext
 ; CHECK-LABEL: @getd1
 ; CHECK: xxswapd  1, 34
@@ -1460,16 +1193,10 @@ entry:
 ; CHECK-LE: xxlor 1, 34, 34
 }
 
-; Function Attrs: nounwind
+; Function Attrs: norecurse nounwind readnone
 define double @getveld(<2 x double> %vd, i32 signext %i) {
 entry:
-  %vd.addr = alloca <2 x double>, align 16
-  %i.addr = alloca i32, align 4
-  store <2 x double> %vd, <2 x double>* %vd.addr, align 16
-  store i32 %i, i32* %i.addr, align 4
-  %0 = load <2 x double>, <2 x double>* %vd.addr, align 16
-  %1 = load i32, i32* %i.addr, align 4
-  %vecext = extractelement <2 x double> %0, i32 %1
+  %vecext = extractelement <2 x double> %vd, i32 %i
   ret double %vecext
 ; CHECK-LABEL: @getveld
 ; CHECK-LE-LABEL: @getveld
diff --git a/test/CodeGen/PowerPC/ppc64-align-long-double.ll b/test/CodeGen/PowerPC/ppc64-align-long-double.ll
index c3cccd5b29357d2e8d44f76bd22b6e2c7caa5ca7..d59dc64dcf8574ee37e8d7157877a2e3f8a18671 100644
--- a/test/CodeGen/PowerPC/ppc64-align-long-double.ll
+++ b/test/CodeGen/PowerPC/ppc64-align-long-double.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -verify-machineinstrs -mcpu=pwr7 -O0 -fast-isel=false -mattr=-vsx < %s | FileCheck %s
 ; RUN: llc -verify-machineinstrs -mcpu=pwr7 -O0 -fast-isel=false -mattr=+vsx < %s | FileCheck -check-prefix=CHECK-VSX %s
-; RUN: llc -verify-machineinstrs -mcpu=pwr9 -O0 -fast-isel=false -mattr=+vsx < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mcpu=pwr9 -O0 -fast-isel=false -mattr=+vsx < %s | FileCheck -check-prefix=CHECK-P9 %s
 
 ; Verify internal alignment of long double in a struct.  The double
 ; argument comes in in GPR3; GPR4 is skipped; GPRs 5 and 6 contain
@@ -19,19 +19,44 @@ entry:
   ret ppc_fp128 %0
 }
 
+; The additional stores appear because the value in the
+; store->load->bitcast path is now forwarded, which leaves a store and
+; a bitcast of the same value. The target performs the bitcast through
+; memory and the original address is no longer remembered, so the
+; value has to be stored again to a fresh local address.
+
 ; CHECK-DAG: std 6, 72(1)
 ; CHECK-DAG: std 5, 64(1)
 ; CHECK-DAG: std 4, 56(1)
 ; CHECK-DAG: std 3, 48(1)
-; CHECK: lfd 1, 64(1)
-; CHECK: lfd 2, 72(1)
+
+; CHECK-DAG: std 5, -16(1)
+; CHECK-DAG: std 6, -8(1)
+; CHECK-DAG: lfd 1, -16(1)
+; CHECK-DAG: lfd 2, -8(1)
+
+; FIXMECHECK: lfd 1, 64(1)
+; FIXMECHECK: lfd 2, 72(1)
 
 ; CHECK-VSX-DAG: std 6, 72(1)
 ; CHECK-VSX-DAG: std 5, 64(1)
 ; CHECK-VSX-DAG: std 4, 56(1)
 ; CHECK-VSX-DAG: std 3, 48(1)
-; CHECK-VSX: li 3, 16
-; CHECK-VSX: addi 4, 1, 48
-; CHECK-VSX: lxsdx 1, 4, 3
-; CHECK-VSX: li 3, 24
-; CHECK-VSX: lxsdx 2, 4, 3
+; CHECK-VSX-DAG: std 5, -16(1)
+; CHECK-VSX-DAG: std 6, -8(1)
+; CHECK-VSX: addi 3, 1, -16
+; CHECK-VSX: lxsdx 1, 0, 3
+; CHECK-VSX: addi 3, 1, -8
+; CHECK-VSX: lxsdx 2, 0, 3
+
+; FIXME-VSX: addi 4, 1, 48
+; FIXME-VSX: lxsdx 1, 4, 3
+; FIXME-VSX: li 3, 24
+; FIXME-VSX: lxsdx 2, 4, 3
+
+; CHECK-P9: std 6, 72(1)
+; CHECK-P9: std 5, 64(1)
+; CHECK-P9: std 4, 56(1)
+; CHECK-P9: std 3, 48(1)
+; CHECK-P9: mtvsrd 1, 5
+; CHECK-P9: mtvsrd 2, 6
diff --git a/test/CodeGen/PowerPC/ppc64-gep-opt.ll b/test/CodeGen/PowerPC/ppc64-gep-opt.ll
index 1a78310ddf32ce322f24b3dfd7a1d788b5d08349..d1ae1bcbd88ce8f88f3ebd5159c4a1dd2dd78bb5 100644
--- a/test/CodeGen/PowerPC/ppc64-gep-opt.ll
+++ b/test/CodeGen/PowerPC/ppc64-gep-opt.ll
@@ -84,9 +84,9 @@ exit:
 ; CHECK-NoAA: add i64 [[TMP:%[a-zA-Z0-9]+]], 528
 ; CHECK-NoAA: add i64 [[TMP]], 532
 ; CHECK-NoAA: if.true:
-; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = add i64 [[TMP]], 532
+; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = getelementptr i8, i8* {{.*}}, i64 532
 ; CHECK-NoAA: exit:
-; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = add i64 [[TMP]], 528
+; CHECK-NoAA: {{%sunk[a-zA-Z0-9]+}} = getelementptr i8, i8* {{.*}}, i64 528
 
 ; CHECK-UseAA-LABEL: test_GEP_across_BB(
 ; CHECK-UseAA: [[PTR0:%[a-zA-Z0-9]+]] = getelementptr
diff --git a/test/CodeGen/PowerPC/ppc64le-aggregates.ll b/test/CodeGen/PowerPC/ppc64le-aggregates.ll
index 25b3e5d89331340a24223653f62399c22db4eff6..6fcbdda4e34f0c5bbf71757111fd8fa4edc21538 100644
--- a/test/CodeGen/PowerPC/ppc64le-aggregates.ll
+++ b/test/CodeGen/PowerPC/ppc64le-aggregates.ll
@@ -284,10 +284,7 @@ entry:
 ; CHECK-DAG: lfs 12, 12({{[0-9]+}})
 ; CHECK-DAG: lfs 13, 16({{[0-9]+}})
 
-; CHECK-DAG: lwz [[REG0:[0-9]+]], 0({{[0-9]+}})
-; CHECK-DAG: lwz [[REG1:[0-9]+]], 4({{[0-9]+}})
-; CHECK-DAG: sldi [[REG2:[0-9]+]], [[REG1]], 32
-; CHECK-DAG: or 10, [[REG0]], [[REG2]]
+; CHECK-DAG: ld 10, 0({{[0-9]+}})
 ; CHECK: bl test2
 
 declare void @test2([8 x float], [5 x float], [2 x float])
diff --git a/test/CodeGen/PowerPC/pr30451.ll b/test/CodeGen/PowerPC/pr30451.ll
index 930553451cf883101cd8e5f48e3bf9573be740a2..9b07df00f9c357aa708aeb617cebe742b059c0a9 100644
--- a/test/CodeGen/PowerPC/pr30451.ll
+++ b/test/CodeGen/PowerPC/pr30451.ll
@@ -3,11 +3,11 @@ define i8 @atomic_min_i8() {
     top:
       %0 = alloca i8, align 2
       %1 = bitcast i8* %0 to i8*
-      call void @llvm.lifetime.start(i64 2, i8* %1)
+      call void @llvm.lifetime.start.p0i8(i64 2, i8* %1)
       store i8 -1, i8* %0, align 2
       %2 = atomicrmw min i8* %0, i8 0 acq_rel
       %3 = load atomic i8, i8* %0 acquire, align 8
-      call void @llvm.lifetime.end(i64 2, i8* %1)
+      call void @llvm.lifetime.end.p0i8(i64 2, i8* %1)
       ret i8 %3
 ; CHECK-LABEL: atomic_min_i8
 ; CHECK: lbarx [[DST:[0-9]+]],
@@ -19,11 +19,11 @@ define i16 @atomic_min_i16() {
     top:
       %0 = alloca i16, align 2
       %1 = bitcast i16* %0 to i8*
-      call void @llvm.lifetime.start(i64 2, i8* %1)
+      call void @llvm.lifetime.start.p0i8(i64 2, i8* %1)
       store i16 -1, i16* %0, align 2
       %2 = atomicrmw min i16* %0, i16 0 acq_rel
       %3 = load atomic i16, i16* %0 acquire, align 8
-      call void @llvm.lifetime.end(i64 2, i8* %1)
+      call void @llvm.lifetime.end.p0i8(i64 2, i8* %1)
       ret i16 %3
 ; CHECK-LABEL: atomic_min_i16
 ; CHECK: lharx [[DST:[0-9]+]],
@@ -36,11 +36,11 @@ define i8 @atomic_max_i8() {
     top:
       %0 = alloca i8, align 2
       %1 = bitcast i8* %0 to i8*
-      call void @llvm.lifetime.start(i64 2, i8* %1)
+      call void @llvm.lifetime.start.p0i8(i64 2, i8* %1)
       store i8 -1, i8* %0, align 2
       %2 = atomicrmw max i8* %0, i8 0 acq_rel
       %3 = load atomic i8, i8* %0 acquire, align 8
-      call void @llvm.lifetime.end(i64 2, i8* %1)
+      call void @llvm.lifetime.end.p0i8(i64 2, i8* %1)
       ret i8 %3
 ; CHECK-LABEL: atomic_max_i8
 ; CHECK: lbarx [[DST:[0-9]+]],
@@ -52,11 +52,11 @@ define i16 @atomic_max_i16() {
     top:
       %0 = alloca i16, align 2
       %1 = bitcast i16* %0 to i8*
-      call void @llvm.lifetime.start(i64 2, i8* %1)
+      call void @llvm.lifetime.start.p0i8(i64 2, i8* %1)
       store i16 -1, i16* %0, align 2
       %2 = atomicrmw max i16* %0, i16 0 acq_rel
       %3 = load atomic i16, i16* %0 acquire, align 8
-      call void @llvm.lifetime.end(i64 2, i8* %1)
+      call void @llvm.lifetime.end.p0i8(i64 2, i8* %1)
       ret i16 %3
 ; CHECK-LABEL: atomic_max_i16
 ; CHECK: lharx [[DST:[0-9]+]],
@@ -65,5 +65,5 @@ define i16 @atomic_max_i16() {
 ; CHECK-NEXT: ble 0
 }
 
-declare void @llvm.lifetime.start(i64, i8*)
-declare void @llvm.lifetime.end(i64, i8*)
+declare void @llvm.lifetime.start.p0i8(i64, i8*)
+declare void @llvm.lifetime.end.p0i8(i64, i8*)
diff --git a/test/CodeGen/PowerPC/pr32063.ll b/test/CodeGen/PowerPC/pr32063.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f031ec83c55e042c52224d4bf0ae3fe3e14d4468
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr32063.ll
@@ -0,0 +1,16 @@
+; RUN: llc -O2 < %s | FileCheck %s
+target triple = "powerpc64le-linux-gnu"
+
+define void @foo(i32 %v, i16* %p) {
+        %1 = and i32 %v, -65536
+        %2 = tail call i32 @llvm.bswap.i32(i32 %1)
+        %conv = trunc i32 %2 to i16
+        store i16 %conv, i16* %p
+        ret void
+
+; CHECK:     srwi
+; CHECK:     sthbrx
+; CHECK-NOT: stwbrx
+}
+
+declare i32 @llvm.bswap.i32(i32)
diff --git a/test/CodeGen/PowerPC/pr32140.ll b/test/CodeGen/PowerPC/pr32140.ll
new file mode 100644
index 0000000000000000000000000000000000000000..827a90404e4b10515bb75be07d5d38797d8b9818
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr32140.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=powerpc64le-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+; RUN: llc -mtriple=powerpc64-linux-gnu -mcpu=pwr8 < %s | FileCheck %s
+
+@as = common local_unnamed_addr global i16 0, align 2
+@bs = common local_unnamed_addr global i16 0, align 2
+@ai = common local_unnamed_addr global i32 0, align 4
+@bi = common local_unnamed_addr global i32 0, align 4
+
+define void @bswapStorei64Toi32() {
+; CHECK-LABEL: bswapStorei64Toi32:
+; CHECK:       # BB#0: # %entry
+; CHECK:         lwa 3, 0(3)
+; CHECK-NEXT:    rldicl 3, 3, 32, 32
+; CHECK-NEXT:    stwbrx 3, 0, 4
+; CHECK-NEXT:    blr
+entry:
+  %0 = load i32, i32* @ai, align 4
+  %conv.i = sext i32 %0 to i64
+  %or26.i = tail call i64 @llvm.bswap.i64(i64 %conv.i)
+  %conv = trunc i64 %or26.i to i32
+  store i32 %conv, i32* @bi, align 4
+  ret void
+}
+
+define void @bswapStorei32Toi16() {
+; CHECK-LABEL: bswapStorei32Toi16:
+; CHECK:       # BB#0: # %entry
+; CHECK:         lha 3, 0(3)
+; CHECK-NEXT:    srwi 3, 3, 16
+; CHECK-NEXT:    sthbrx 3, 0, 4
+; CHECK-NEXT:    blr
+entry:
+  %0 = load i16, i16* @as, align 2
+  %conv.i = sext i16 %0 to i32
+  %or26.i = tail call i32 @llvm.bswap.i32(i32 %conv.i)
+  %conv = trunc i32 %or26.i to i16
+  store i16 %conv, i16* @bs, align 2
+  ret void
+}
+
+define void @bswapStorei64Toi16() {
+; CHECK-LABEL: bswapStorei64Toi16:
+; CHECK:       # BB#0: # %entry
+; CHECK:         lha 3, 0(3)
+; CHECK-NEXT:    rldicl 3, 3, 16, 48
+; CHECK-NEXT:    sthbrx 3, 0, 4
+; CHECK-NEXT:    blr
+entry:
+  %0 = load i16, i16* @as, align 2
+  %conv.i = sext i16 %0 to i64
+  %or26.i = tail call i64 @llvm.bswap.i64(i64 %conv.i)
+  %conv = trunc i64 %or26.i to i16
+  store i16 %conv, i16* @bs, align 2
+  ret void
+}
+
+declare i32 @llvm.bswap.i32(i32)
+declare i64 @llvm.bswap.i64(i64)
diff --git a/test/CodeGen/PowerPC/pristine-and-livein.mir b/test/CodeGen/PowerPC/pristine-and-livein.mir
new file mode 100644
index 0000000000000000000000000000000000000000..6d93bb68c102c03a704d976347d5b158e9294bc6
--- /dev/null
+++ b/test/CodeGen/PowerPC/pristine-and-livein.mir
@@ -0,0 +1,330 @@
+# RUN: llc -run-pass=post-RA-sched %s -o - | FileCheck %s
+
+# CHECK: callee-saved-register: '[[REG:%x[0-9]+]]'
+# CHECK: callee-saved-register: '{{%x[0-9]+}}'
+# CHECK-NOT: [[REG]] = LI8 0
+# CHECK: STD killed [[REG]],
+--- |
+  ; ModuleID = '<stdin>'
+  source_filename = "bugpoint-output-4d91ae2.bc"
+  target datalayout = "e-m:e-i64:64-n32:64"
+  target triple = "powerpc64le--linux-gnu"
+  
+  ; Function Attrs: norecurse nounwind readonly
+  define i64 @adler32_z(i64 %adler, i8* readonly %buf, i64 %len) local_unnamed_addr #0 {
+  entry:
+    %shr = lshr i64 %adler, 16
+    %and = and i64 %shr, 65535
+    %and1 = and i64 %adler, 65535
+    br i1 undef, label %if.then, label %if.end15
+  
+  if.then:                                          ; preds = %entry
+    %add5 = add nsw i64 %and1, %and
+    %sub9 = add nsw i64 %add5, 281474976645135
+    %shl = shl i64 %add5, 16
+    %or = or i64 %shl, %and1
+    br label %cleanup
+  
+  if.end15:                                         ; preds = %entry
+    br i1 undef, label %while.cond.preheader, label %while.cond30.preheader
+  
+  while.cond30.preheader:                           ; preds = %if.end15
+    br i1 undef, label %while.body33.preheader, label %while.body109.preheader
+  
+  while.body33.preheader:                           ; preds = %while.cond30.preheader
+    br label %while.body33
+  
+  while.cond.preheader:                             ; preds = %if.end15
+    %sub25 = add i64 %and1, -65521
+    %rem = urem i64 %and, 65521
+    %shl27 = shl nuw nsw i64 %rem, 16
+    %or28 = or i64 %shl27, %and1
+    br label %cleanup
+  
+  while.body33:                                     ; preds = %do.end, %while.body33.preheader
+    %indvar = phi i64 [ %indvar.next, %do.end ], [ 0, %while.body33.preheader ]
+    %sum2.2385 = phi i64 [ %rem102, %do.end ], [ %and, %while.body33.preheader ]
+    %len.addr.1384 = phi i64 [ %sub34, %do.end ], [ %len, %while.body33.preheader ]
+    %buf.addr.1383 = phi i8* [ %scevgep390, %do.end ], [ %buf, %while.body33.preheader ]
+    %adler.addr.3382 = phi i64 [ %rem101, %do.end ], [ %and1, %while.body33.preheader ]
+    %0 = mul i64 %indvar, 5552
+    %1 = add i64 %0, -13
+    %scevgep2 = getelementptr i8, i8* %buf, i64 %1
+    %sub34 = add i64 %len.addr.1384, -5552
+    call void @llvm.ppc.mtctr.i64(i64 347)
+    br label %do.body
+  
+  do.body:                                          ; preds = %do.body, %while.body33
+    %adler.addr.4 = phi i64 [ %adler.addr.3382, %while.body33 ], [ %add49, %do.body ]
+    %sum2.3 = phi i64 [ %sum2.2385, %while.body33 ], [ %add98, %do.body ]
+    %tmp15.phi = phi i8* [ %scevgep2, %while.body33 ], [ %tmp15.inc, %do.body ]
+    %tmp15.inc = getelementptr i8, i8* %tmp15.phi, i64 16
+    %add38 = add i64 %adler.addr.4, %sum2.3
+    %add42 = add i64 %add38, %adler.addr.4
+    %add46 = add i64 %add42, %adler.addr.4
+    %tmp15 = load i8, i8* %tmp15.inc, align 1, !tbaa !1
+    %conv48 = zext i8 %tmp15 to i64
+    %add49 = add i64 %adler.addr.4, %conv48
+    %add50 = add i64 %add46, %add49
+    %add54 = add i64 %add50, %add49
+    %add58 = add i64 %add54, %add49
+    %add62 = add i64 %add58, %add49
+    %add66 = add i64 %add62, %add49
+    %add70 = add i64 %add66, %add49
+    %add74 = add i64 %add70, %add49
+    %add78 = add i64 %add74, %add49
+    %add82 = add i64 %add78, %add49
+    %add86 = add i64 %add82, %add49
+    %add90 = add i64 %add86, %add49
+    %add94 = add i64 %add90, %add49
+    %add98 = add i64 %add94, %add49
+    %2 = call i1 @llvm.ppc.is.decremented.ctr.nonzero()
+    br i1 %2, label %do.body, label %do.end
+  
+  do.end:                                           ; preds = %do.body
+    %scevgep390 = getelementptr i8, i8* %buf.addr.1383, i64 5552
+    %rem101 = urem i64 %add49, 65521
+    %rem102 = urem i64 %add98, 65521
+    %cmp31 = icmp ugt i64 %sub34, 5551
+    %indvar.next = add i64 %indvar, 1
+    br i1 %cmp31, label %while.body33, label %while.end103
+  
+  while.end103:                                     ; preds = %do.end
+    br i1 undef, label %if.end188, label %while.body109.preheader
+  
+  while.body109.preheader:                          ; preds = %while.end103, %while.cond30.preheader
+    %buf.addr.1.lcssa394400 = phi i8* [ %buf, %while.cond30.preheader ], [ %scevgep390, %while.end103 ]
+    %arrayidx151 = getelementptr inbounds i8, i8* %buf.addr.1.lcssa394400, i64 10
+    %tmp45 = load i8, i8* %arrayidx151, align 1, !tbaa !1
+    %conv152 = zext i8 %tmp45 to i64
+    br label %while.body109
+  
+  while.body109:                                    ; preds = %while.body109, %while.body109.preheader
+    %adler.addr.5373 = phi i64 [ %add153, %while.body109 ], [ undef, %while.body109.preheader ]
+    %add153 = add i64 %adler.addr.5373, %conv152
+    br label %while.body109
+  
+  if.end188:                                        ; preds = %while.end103
+    %shl189 = shl nuw nsw i64 %rem102, 16
+    %or190 = or i64 %shl189, %rem101
+    br label %cleanup
+  
+  cleanup:                                          ; preds = %if.end188, %while.cond.preheader, %if.then
+    %retval.0 = phi i64 [ %or, %if.then ], [ %or28, %while.cond.preheader ], [ %or190, %if.end188 ]
+    ret i64 %retval.0
+  }
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.ppc.mtctr.i64(i64) #1
+  
+  ; Function Attrs: nounwind
+  declare i1 @llvm.ppc.is.decremented.ctr.nonzero() #1
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #1
+  
+  attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="ppc64le" "target-features"="+altivec,+bpermd,+crypto,+direct-move,+extdiv,+power8-vector,+vsx,-power9-vector,-qpx" "unsafe-fp-math"="false" "use-soft-float"="false" }
+  attributes #1 = { nounwind }
+  
+  !llvm.ident = !{!0}
+  
+  !0 = !{!"clang version 5.0.0 "}
+  !1 = !{!2, !2, i64 0}
+  !2 = !{!"omnipotent char", !3, i64 0}
+  !3 = !{!"Simple C/C++ TBAA"}
+
+...
+---
+name:            adler32_z
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+liveins:         
+  - { reg: '%x3' }
+  - { reg: '%x4' }
+  - { reg: '%x5' }
+frameInfo:       
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+fixedStack:      
+  - { id: 0, type: spill-slot, offset: -16, size: 8, alignment: 16, callee-saved-register: '%x30' }
+  - { id: 1, type: spill-slot, offset: -24, size: 8, alignment: 8, callee-saved-register: '%x29' }
+  - { id: 2, offset: -8, size: 8, alignment: 8, isImmutable: true, isAliased: false }
+body:             |
+  bb.0.entry:
+    successors: %bb.1.if.then(0x40000000), %bb.3.if.end15(0x40000000)
+    liveins: %x3, %x4, %x5, %x29, %x30
+  
+    %x6 = RLWINM8 %x3, 16, 16, 31
+    %x3 = RLDICL killed %x3, 0, 48
+    BC undef %cr5lt, %bb.3.if.end15
+  
+  bb.1.if.then:
+    successors: %bb.2.if.then(0x80000000)
+    liveins: %x3, %x6, %x29, %x30
+  
+    %x4 = ADD8 %x3, killed %x6
+  
+  bb.2.if.then:
+    liveins: %lr8, %rm, %x3, %x4
+  
+    %x4 = RLDICR killed %x4, 16, 47
+    %x3 = OR8 killed %x4, killed %x3
+    BLR8 implicit %lr8, implicit %rm, implicit %x3
+  
+  bb.3.if.end15:
+    successors: %bb.6.while.cond.preheader(0x40000000), %bb.4.while.cond30.preheader(0x40000000)
+    liveins: %x3, %x4, %x5, %x6, %x29, %x30
+  
+    BC undef %cr5lt, %bb.6.while.cond.preheader
+  
+  bb.4.while.cond30.preheader:
+    successors: %bb.7.while.body33.preheader(0x40000000), %bb.5(0x40000000)
+    liveins: %x3, %x4, %x5, %x6, %x29, %x30
+  
+    BCn undef %cr5lt, %bb.7.while.body33.preheader
+  
+  bb.5:
+    successors: %bb.12.while.body109.preheader(0x80000000)
+    liveins: %x4, %x29, %x30
+  
+    %x7 = OR8 %x4, killed %x4
+    B %bb.12.while.body109.preheader
+  
+  bb.6.while.cond.preheader:
+    successors: %bb.2.if.then(0x80000000)
+    liveins: %x3, %x6, %x29, %x30
+  
+    %x4 = LIS8 15
+    %x4 = ORI8 killed %x4, 225
+    %x4 = RLDICR killed %x4, 32, 31
+    %x4 = ORIS8 killed %x4, 3375
+    %x4 = ORI8 killed %x4, 50637
+    %x4 = MULHDU %x6, killed %x4
+    %x5 = SUBF8 %x4, %x6
+    %x5 = RLDICL killed %x5, 63, 1
+    %x4 = ADD8 killed %x5, killed %x4
+    %x5 = LI8 0
+    %x4 = RLDICL killed %x4, 49, 15
+    %x5 = ORI8 killed %x5, 65521
+    %x4 = MULLD killed %x4, killed %x5
+    %x4 = SUBF8 killed %x4, killed %x6
+    B %bb.2.if.then
+  
+  bb.7.while.body33.preheader:
+    successors: %bb.8.while.body33(0x80000000)
+    liveins: %x3, %x4, %x5, %x6, %x29, %x30
+  
+    STD killed %x29, -24, %x1 :: (store 8 into %fixed-stack.1)
+    STD killed %x30, -16, %x1 :: (store 8 into %fixed-stack.0, align 16)
+    %x7 = LIS8 15
+    %x7 = ORI8 killed %x7, 225
+    %x7 = RLDICR killed %x7, 32, 31
+    %x8 = LI8 0
+    %x7 = ORIS8 killed %x7, 3375
+    %x9 = LI8 347
+    %x10 = ORI8 killed %x7, 50637
+    %x11 = ORI8 %x8, 65521
+    %x7 = OR8 %x4, %x4
+  
+  bb.8.while.body33:
+    successors: %bb.9.do.body(0x80000000)
+    liveins: %x3, %x4, %x5, %x6, %x7, %x8, %x9, %x10, %x11
+  
+    %x12 = MULLI8 %x8, 5552
+    %x12 = ADD8 %x4, killed %x12
+    %x12 = ADDI8 killed %x12, -13
+    %x5 = ADDI8 killed %x5, -5552
+    MTCTR8loop %x9, implicit-def dead %ctr8
+  
+  bb.9.do.body:
+    successors: %bb.9.do.body(0x7c000000), %bb.10.do.end(0x04000000)
+    liveins: %x3, %x4, %x5, %x6, %x7, %x8, %x9, %x10, %x11, %x12
+  
+    %x0, %x12 = LBZU8 16, killed %x12 :: (load 1 from %ir.tmp15.inc, !tbaa !1)
+    %x6 = ADD8 %x3, killed %x6
+    %x6 = ADD8 killed %x6, %x3
+    %x6 = ADD8 killed %x6, %x3
+    %x3 = ADD8 killed %x3, killed %x0
+    %x6 = ADD8 killed %x6, %x3
+    %x6 = ADD8 killed %x6, %x3
+    %x6 = ADD8 killed %x6, %x3
+    %x6 = ADD8 killed %x6, %x3
+    %x6 = ADD8 killed %x6, %x3
+    %x6 = ADD8 killed %x6, %x3
+    %x6 = ADD8 killed %x6, %x3
+    %x6 = ADD8 killed %x6, %x3
+    %x6 = ADD8 killed %x6, %x3
+    %x6 = ADD8 killed %x6, %x3
+    %x6 = ADD8 killed %x6, %x3
+    %x6 = ADD8 killed %x6, %x3
+    %x6 = ADD8 killed %x6, %x3
+    BDNZ8 %bb.9.do.body, implicit-def %ctr8, implicit %ctr8
+  
+  bb.10.do.end:
+    successors: %bb.8.while.body33(0x7c000000), %bb.11.while.end103(0x04000000)
+    liveins: %x3, %x4, %x5, %x6, %x7, %x8, %x9, %x10, %x11
+  
+    %x12 = MULHDU %x3, %x10
+    %x0 = MULHDU %x6, %x10
+    %x30 = SUBF8 %x12, %x3
+    %x29 = SUBF8 %x0, %x6
+    %x30 = RLDICL killed %x30, 63, 1
+    %x29 = RLDICL killed %x29, 63, 1
+    %x12 = ADD8 killed %x30, killed %x12
+    %x0 = ADD8 killed %x29, killed %x0
+    %cr0 = CMPLDI %x5, 5551
+    %x12 = RLDICL killed %x12, 49, 15
+    %x0 = RLDICL killed %x0, 49, 15
+    %x12 = MULLD killed %x12, %x11
+    %x0 = MULLD killed %x0, %x11
+    %x7 = ADDI8 killed %x7, 5552
+    %x3 = SUBF8 killed %x12, killed %x3
+    %x6 = SUBF8 killed %x0, killed %x6
+    %x8 = ADDI8 killed %x8, 1
+    BCC 44, killed %cr0, %bb.8.while.body33
+  
+  bb.11.while.end103:
+    successors: %bb.14.if.end188(0x40000000), %bb.12.while.body109.preheader(0x40000000)
+    liveins: %x3, %x6, %x7
+  
+    %x30 = LD -16, %x1 :: (load 8 from %fixed-stack.0, align 16)
+    %x29 = LD -24, %x1 :: (load 8 from %fixed-stack.1)
+    BC undef %cr5lt, %bb.14.if.end188
+  
+  bb.12.while.body109.preheader:
+    successors: %bb.13.while.body109(0x80000000)
+    liveins: %x7, %x29, %x30
+  
+    %x3 = LBZ8 10, killed %x7 :: (load 1 from %ir.arrayidx151, !tbaa !1)
+    %x4 = IMPLICIT_DEF
+  
+  bb.13.while.body109:
+    successors: %bb.13.while.body109(0x80000000)
+    liveins: %x3, %x4, %x29, %x30
+  
+    %x4 = ADD8 killed %x4, %x3
+    B %bb.13.while.body109
+  
+  bb.14.if.end188:
+    liveins: %x3, %x6, %x29, %x30
+  
+    %x4 = RLDICR killed %x6, 16, 47
+    %x3 = OR8 killed %x4, killed %x3
+    BLR8 implicit %lr8, implicit %rm, implicit %x3
+
+...
diff --git a/test/CodeGen/PowerPC/select-i1-vs-i1.ll b/test/CodeGen/PowerPC/select-i1-vs-i1.ll
index f2b8e09a1c1d4616045223dd8117763755604068..b7beb8165fdf161e0f06eab70bc2d89967a663d9 100644
--- a/test/CodeGen/PowerPC/select-i1-vs-i1.ll
+++ b/test/CodeGen/PowerPC/select-i1-vs-i1.ll
@@ -859,7 +859,7 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crandc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bclr 12, [[REG1]], 0
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
@@ -876,7 +876,7 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crandc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bclr 12, [[REG1]], 0
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
@@ -893,7 +893,7 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crorc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bclr 12, [[REG1]], 0
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
@@ -910,7 +910,7 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crorc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bclr 12, [[REG1]], 0
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
@@ -927,9 +927,9 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bc 12, [[REG1]], .LBB[[BB1:[0-9_]+]]
-; CHECK: vor 3, 2, 2
+; CHECK: vmr 3, 2
 ; CHECK: .LBB[[BB1]]
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
@@ -946,7 +946,7 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crorc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bclr 12, [[REG1]], 0
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
@@ -963,7 +963,7 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crorc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bclr 12, [[REG1]], 0
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
@@ -980,7 +980,7 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crandc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bclr 12, [[REG1]], 0
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
@@ -997,7 +997,7 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crandc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bclr 12, [[REG1]], 0
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
@@ -1014,7 +1014,7 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bclr 12, [[REG1]], 0
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
@@ -1062,7 +1062,7 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crandc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bclr 12, [[REG1]], 0
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
@@ -1079,7 +1079,7 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crandc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bclr 12, [[REG1]], 0
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
@@ -1096,7 +1096,7 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crorc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bclr 12, [[REG1]], 0
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
@@ -1113,7 +1113,7 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crorc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bclr 12, [[REG1]], 0
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
@@ -1130,9 +1130,9 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bc 12, [[REG1]], .LBB[[BB55:[0-9_]+]]
-; CHECK: vor 3, 2, 2
+; CHECK: vmr 3, 2
 ; CHECK: .LBB[[BB55]]
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
@@ -1149,7 +1149,7 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crorc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bclr 12, [[REG1]], 0
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
@@ -1166,7 +1166,7 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crorc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bclr 12, [[REG1]], 0
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
@@ -1183,7 +1183,7 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crandc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bclr 12, [[REG1]], 0
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
@@ -1200,7 +1200,7 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crandc [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bclr 12, [[REG1]], 0
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
@@ -1217,7 +1217,7 @@ entry:
 ; CHECK-DAG: fcmpu {{[0-9]+}}, 1, 2
 ; CHECK: crxor [[REG1:[0-9]+]], {{[0-9]+}}, {{[0-9]+}}
 ; CHECK: bclr 12, [[REG1]], 0
-; CHECK: vor 2, 3, 3
+; CHECK: vmr 2, 3
 ; CHECK: blr
 }
 
diff --git a/test/CodeGen/PowerPC/select_const.ll b/test/CodeGen/PowerPC/select_const.ll
new file mode 100644
index 0000000000000000000000000000000000000000..29548123be8881f4e6b943417494493262a3b5c3
--- /dev/null
+++ b/test/CodeGen/PowerPC/select_const.ll
@@ -0,0 +1,789 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs -mattr=+isel | FileCheck %s --check-prefix=ALL --check-prefix=ISEL
+; RUN: llc < %s -mtriple=powerpc64le-unknown-unknown -verify-machineinstrs -mattr=-isel | FileCheck %s --check-prefix=ALL --check-prefix=NO_ISEL
+
+; Select of constants: control flow / conditional moves can always be replaced by logic+math (but may not be worth it?).
+; Test the zeroext/signext variants of each pattern to see if that makes a difference.
+
+; select Cond, 0, 1 --> zext (!Cond)
+
+define i32 @select_0_or_1(i1 %cond) {
+; ALL-LABEL: select_0_or_1:
+; ALL:       # BB#0:
+; ALL-NEXT:    not 3, 3
+; ALL-NEXT:    clrldi 3, 3, 63
+; ALL-NEXT:    blr
+  %sel = select i1 %cond, i32 0, i32 1
+  ret i32 %sel
+}
+
+define i32 @select_0_or_1_zeroext(i1 zeroext %cond) {
+; ALL-LABEL: select_0_or_1_zeroext:
+; ALL:       # BB#0:
+; ALL-NEXT:    xori 3, 3, 1
+; ALL-NEXT:    blr
+  %sel = select i1 %cond, i32 0, i32 1
+  ret i32 %sel
+}
+
+define i32 @select_0_or_1_signext(i1 signext %cond) {
+; ALL-LABEL: select_0_or_1_signext:
+; ALL:       # BB#0:
+; ALL-NEXT:    not 3, 3
+; ALL-NEXT:    clrldi 3, 3, 63
+; ALL-NEXT:    blr
+  %sel = select i1 %cond, i32 0, i32 1
+  ret i32 %sel
+}
+
+; select Cond, 1, 0 --> zext (Cond)
+
+define i32 @select_1_or_0(i1 %cond) {
+; ALL-LABEL: select_1_or_0:
+; ALL:       # BB#0:
+; ALL-NEXT:    clrldi 3, 3, 63
+; ALL-NEXT:    blr
+  %sel = select i1 %cond, i32 1, i32 0
+  ret i32 %sel
+}
+
+define i32 @select_1_or_0_zeroext(i1 zeroext %cond) {
+; ALL-LABEL: select_1_or_0_zeroext:
+; ALL:       # BB#0:
+; ALL-NEXT:    blr
+  %sel = select i1 %cond, i32 1, i32 0
+  ret i32 %sel
+}
+
+define i32 @select_1_or_0_signext(i1 signext %cond) {
+; ALL-LABEL: select_1_or_0_signext:
+; ALL:       # BB#0:
+; ALL-NEXT:    clrldi 3, 3, 63
+; ALL-NEXT:    blr
+  %sel = select i1 %cond, i32 1, i32 0
+  ret i32 %sel
+}
+
+; select Cond, 0, -1 --> sext (!Cond)
+
+define i32 @select_0_or_neg1(i1 %cond) {
+; ISEL-LABEL: select_0_or_neg1:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    li 4, 0
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    oris 3, 4, 65535
+; ISEL-NEXT:    ori 3, 3, 65535
+; ISEL-NEXT:    isel 3, 0, 3, 1
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: select_0_or_neg1:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    li 4, 0
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    oris 3, 4, 65535
+; NO_ISEL-NEXT:    ori 3, 3, 65535
+; NO_ISEL-NEXT:    bc 12, 1, .LBB6_1
+; NO_ISEL-NEXT:    blr
+; NO_ISEL-NEXT:  .LBB6_1:
+; NO_ISEL-NEXT:    addi 3, 0, 0
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, i32 0, i32 -1
+  ret i32 %sel
+}
+
+define i32 @select_0_or_neg1_zeroext(i1 zeroext %cond) {
+; ISEL-LABEL: select_0_or_neg1_zeroext:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    li 4, 0
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    oris 3, 4, 65535
+; ISEL-NEXT:    ori 3, 3, 65535
+; ISEL-NEXT:    isel 3, 0, 3, 1
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: select_0_or_neg1_zeroext:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    li 4, 0
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    oris 3, 4, 65535
+; NO_ISEL-NEXT:    ori 3, 3, 65535
+; NO_ISEL-NEXT:    bc 12, 1, .LBB7_1
+; NO_ISEL-NEXT:    blr
+; NO_ISEL-NEXT:  .LBB7_1:
+; NO_ISEL-NEXT:    addi 3, 0, 0
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, i32 0, i32 -1
+  ret i32 %sel
+}
+
+define i32 @select_0_or_neg1_signext(i1 signext %cond) {
+; ISEL-LABEL: select_0_or_neg1_signext:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    li 4, 0
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    oris 3, 4, 65535
+; ISEL-NEXT:    ori 3, 3, 65535
+; ISEL-NEXT:    isel 3, 0, 3, 1
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: select_0_or_neg1_signext:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    li 4, 0
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    oris 3, 4, 65535
+; NO_ISEL-NEXT:    ori 3, 3, 65535
+; NO_ISEL-NEXT:    bc 12, 1, .LBB8_1
+; NO_ISEL-NEXT:    blr
+; NO_ISEL-NEXT:  .LBB8_1:
+; NO_ISEL-NEXT:    addi 3, 0, 0
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, i32 0, i32 -1
+  ret i32 %sel
+}
+
+; select Cond, -1, 0 --> sext (Cond)
+
+define i32 @select_neg1_or_0(i1 %cond) {
+; ISEL-LABEL: select_neg1_or_0:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    li 4, 0
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    oris 3, 4, 65535
+; ISEL-NEXT:    ori 3, 3, 65535
+; ISEL-NEXT:    isel 3, 3, 4, 1
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: select_neg1_or_0:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    li 4, 0
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    oris 3, 4, 65535
+; NO_ISEL-NEXT:    ori 3, 3, 65535
+; NO_ISEL-NEXT:    bclr 12, 1, 0
+; NO_ISEL-NEXT:  # BB#1:
+; NO_ISEL-NEXT:    ori 3, 4, 0
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, i32 -1, i32 0
+  ret i32 %sel
+}
+
+define i32 @select_neg1_or_0_zeroext(i1 zeroext %cond) {
+; ISEL-LABEL: select_neg1_or_0_zeroext:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    li 4, 0
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    oris 3, 4, 65535
+; ISEL-NEXT:    ori 3, 3, 65535
+; ISEL-NEXT:    isel 3, 3, 4, 1
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: select_neg1_or_0_zeroext:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    li 4, 0
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    oris 3, 4, 65535
+; NO_ISEL-NEXT:    ori 3, 3, 65535
+; NO_ISEL-NEXT:    bclr 12, 1, 0
+; NO_ISEL-NEXT:  # BB#1:
+; NO_ISEL-NEXT:    ori 3, 4, 0
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, i32 -1, i32 0
+  ret i32 %sel
+}
+
+define i32 @select_neg1_or_0_signext(i1 signext %cond) {
+; ISEL-LABEL: select_neg1_or_0_signext:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    li 4, 0
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    oris 3, 4, 65535
+; ISEL-NEXT:    ori 3, 3, 65535
+; ISEL-NEXT:    isel 3, 3, 4, 1
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: select_neg1_or_0_signext:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    li 4, 0
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    oris 3, 4, 65535
+; NO_ISEL-NEXT:    ori 3, 3, 65535
+; NO_ISEL-NEXT:    bclr 12, 1, 0
+; NO_ISEL-NEXT:  # BB#1:
+; NO_ISEL-NEXT:    ori 3, 4, 0
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, i32 -1, i32 0
+  ret i32 %sel
+}
+
+; select Cond, C+1, C --> add (zext Cond), C
+
+define i32 @select_Cplus1_C(i1 %cond) {
+; ALL-LABEL: select_Cplus1_C:
+; ALL:       # BB#0:
+; ALL-NEXT:    clrldi 3, 3, 63
+; ALL-NEXT:    addi 3, 3, 41
+; ALL-NEXT:    blr
+  %sel = select i1 %cond, i32 42, i32 41
+  ret i32 %sel
+}
+
+define i32 @select_Cplus1_C_zeroext(i1 zeroext %cond) {
+; ALL-LABEL: select_Cplus1_C_zeroext:
+; ALL:       # BB#0:
+; ALL-NEXT:    addi 3, 3, 41
+; ALL-NEXT:    blr
+  %sel = select i1 %cond, i32 42, i32 41
+  ret i32 %sel
+}
+
+define i32 @select_Cplus1_C_signext(i1 signext %cond) {
+; ALL-LABEL: select_Cplus1_C_signext:
+; ALL:       # BB#0:
+; ALL-NEXT:    subfic 3, 3, 41
+; ALL-NEXT:    blr
+  %sel = select i1 %cond, i32 42, i32 41
+  ret i32 %sel
+}
+
+; select Cond, C, C+1 --> add (sext Cond), C+1
+
+define i32 @select_C_Cplus1(i1 %cond) {
+; ALL-LABEL: select_C_Cplus1:
+; ALL:       # BB#0:
+; ALL-NEXT:    clrldi 3, 3, 63
+; ALL-NEXT:    subfic 3, 3, 42
+; ALL-NEXT:    blr
+  %sel = select i1 %cond, i32 41, i32 42
+  ret i32 %sel
+}
+
+define i32 @select_C_Cplus1_zeroext(i1 zeroext %cond) {
+; ALL-LABEL: select_C_Cplus1_zeroext:
+; ALL:       # BB#0:
+; ALL-NEXT:    subfic 3, 3, 42
+; ALL-NEXT:    blr
+  %sel = select i1 %cond, i32 41, i32 42
+  ret i32 %sel
+}
+
+define i32 @select_C_Cplus1_signext(i1 signext %cond) {
+; ALL-LABEL: select_C_Cplus1_signext:
+; ALL:       # BB#0:
+; ALL-NEXT:    addi 3, 3, 42
+; ALL-NEXT:    blr
+  %sel = select i1 %cond, i32 41, i32 42
+  ret i32 %sel
+}
+
+; In general, select of 2 constants could be:
+; select Cond, C1, C2 --> add (mul (zext Cond), C1-C2), C2 --> add (and (sext Cond), C1-C2), C2
+
+define i32 @select_C1_C2(i1 %cond) {
+; ISEL-LABEL: select_C1_C2:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    li 4, 421
+; ISEL-NEXT:    li 3, 42
+; ISEL-NEXT:    isel 3, 4, 3, 1
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: select_C1_C2:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    li 4, 421
+; NO_ISEL-NEXT:    li 3, 42
+; NO_ISEL-NEXT:    bc 12, 1, .LBB18_1
+; NO_ISEL-NEXT:    blr
+; NO_ISEL-NEXT:  .LBB18_1:
+; NO_ISEL-NEXT:    addi 3, 4, 0
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, i32 421, i32 42
+  ret i32 %sel
+}
+
+define i32 @select_C1_C2_zeroext(i1 zeroext %cond) {
+; ISEL-LABEL: select_C1_C2_zeroext:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    li 4, 421
+; ISEL-NEXT:    li 3, 42
+; ISEL-NEXT:    isel 3, 4, 3, 1
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: select_C1_C2_zeroext:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    li 4, 421
+; NO_ISEL-NEXT:    li 3, 42
+; NO_ISEL-NEXT:    bc 12, 1, .LBB19_1
+; NO_ISEL-NEXT:    blr
+; NO_ISEL-NEXT:  .LBB19_1:
+; NO_ISEL-NEXT:    addi 3, 4, 0
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, i32 421, i32 42
+  ret i32 %sel
+}
+
+define i32 @select_C1_C2_signext(i1 signext %cond) {
+; ISEL-LABEL: select_C1_C2_signext:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    li 4, 421
+; ISEL-NEXT:    li 3, 42
+; ISEL-NEXT:    isel 3, 4, 3, 1
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: select_C1_C2_signext:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    li 4, 421
+; NO_ISEL-NEXT:    li 3, 42
+; NO_ISEL-NEXT:    bc 12, 1, .LBB20_1
+; NO_ISEL-NEXT:    blr
+; NO_ISEL-NEXT:  .LBB20_1:
+; NO_ISEL-NEXT:    addi 3, 4, 0
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, i32 421, i32 42
+  ret i32 %sel
+}
+
+; A binary operator with constant after the select should always get folded into the select.
+
+define i8 @sel_constants_add_constant(i1 %cond) {
+; ISEL-LABEL: sel_constants_add_constant:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    li 4, 1
+; ISEL-NEXT:    li 3, 28
+; ISEL-NEXT:    isel 3, 4, 3, 1
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: sel_constants_add_constant:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    li 4, 1
+; NO_ISEL-NEXT:    li 3, 28
+; NO_ISEL-NEXT:    bc 12, 1, .LBB21_1
+; NO_ISEL-NEXT:    blr
+; NO_ISEL-NEXT:  .LBB21_1:
+; NO_ISEL-NEXT:    addi 3, 4, 0
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, i8 -4, i8 23
+  %bo = add i8 %sel, 5
+  ret i8 %bo
+}
+
+define i8 @sel_constants_sub_constant(i1 %cond) {
+; ISEL-LABEL: sel_constants_sub_constant:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    li 4, 0
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    oris 3, 4, 65535
+; ISEL-NEXT:    li 4, 18
+; ISEL-NEXT:    ori 3, 3, 65527
+; ISEL-NEXT:    isel 3, 3, 4, 1
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: sel_constants_sub_constant:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    li 4, 0
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    oris 3, 4, 65535
+; NO_ISEL-NEXT:    li 4, 18
+; NO_ISEL-NEXT:    ori 3, 3, 65527
+; NO_ISEL-NEXT:    bclr 12, 1, 0
+; NO_ISEL-NEXT:  # BB#1:
+; NO_ISEL-NEXT:    ori 3, 4, 0
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, i8 -4, i8 23
+  %bo = sub i8 %sel, 5
+  ret i8 %bo
+}
+
+define i8 @sel_constants_mul_constant(i1 %cond) {
+; ISEL-LABEL: sel_constants_mul_constant:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    lis 4, 16383
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    ori 3, 4, 65531
+; ISEL-NEXT:    li 4, 115
+; ISEL-NEXT:    sldi 3, 3, 2
+; ISEL-NEXT:    isel 3, 3, 4, 1
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: sel_constants_mul_constant:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    lis 4, 16383
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    ori 3, 4, 65531
+; NO_ISEL-NEXT:    li 4, 115
+; NO_ISEL-NEXT:    sldi 3, 3, 2
+; NO_ISEL-NEXT:    bclr 12, 1, 0
+; NO_ISEL-NEXT:  # BB#1:
+; NO_ISEL-NEXT:    ori 3, 4, 0
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, i8 -4, i8 23
+  %bo = mul i8 %sel, 5
+  ret i8 %bo
+}
+
+define i8 @sel_constants_sdiv_constant(i1 %cond) {
+; ISEL-LABEL: sel_constants_sdiv_constant:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    li 3, 4
+; ISEL-NEXT:    isel 3, 0, 3, 1
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: sel_constants_sdiv_constant:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    li 3, 4
+; NO_ISEL-NEXT:    bc 12, 1, .LBB24_1
+; NO_ISEL-NEXT:    blr
+; NO_ISEL-NEXT:  .LBB24_1:
+; NO_ISEL-NEXT:    addi 3, 0, 0
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, i8 -4, i8 23
+  %bo = sdiv i8 %sel, 5
+  ret i8 %bo
+}
+
+define i8 @sel_constants_udiv_constant(i1 %cond) {
+; ISEL-LABEL: sel_constants_udiv_constant:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    li 4, 50
+; ISEL-NEXT:    li 3, 4
+; ISEL-NEXT:    isel 3, 4, 3, 1
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: sel_constants_udiv_constant:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    li 4, 50
+; NO_ISEL-NEXT:    li 3, 4
+; NO_ISEL-NEXT:    bc 12, 1, .LBB25_1
+; NO_ISEL-NEXT:    blr
+; NO_ISEL-NEXT:  .LBB25_1:
+; NO_ISEL-NEXT:    addi 3, 4, 0
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, i8 -4, i8 23
+  %bo = udiv i8 %sel, 5
+  ret i8 %bo
+}
+
+define i8 @sel_constants_srem_constant(i1 %cond) {
+; ISEL-LABEL: sel_constants_srem_constant:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    lis 4, 16383
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    ori 3, 4, 65535
+; ISEL-NEXT:    li 4, 3
+; ISEL-NEXT:    sldi 3, 3, 2
+; ISEL-NEXT:    isel 3, 3, 4, 1
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: sel_constants_srem_constant:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    lis 4, 16383
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    ori 3, 4, 65535
+; NO_ISEL-NEXT:    li 4, 3
+; NO_ISEL-NEXT:    sldi 3, 3, 2
+; NO_ISEL-NEXT:    bclr 12, 1, 0
+; NO_ISEL-NEXT:  # BB#1:
+; NO_ISEL-NEXT:    ori 3, 4, 0
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, i8 -4, i8 23
+  %bo = srem i8 %sel, 5
+  ret i8 %bo
+}
+
+define i8 @sel_constants_urem_constant(i1 %cond) {
+; ALL-LABEL: sel_constants_urem_constant:
+; ALL:       # BB#0:
+; ALL-NEXT:    rlwinm 3, 3, 0, 31, 31
+; ALL-NEXT:    subfic 3, 3, 3
+; ALL-NEXT:    blr
+  %sel = select i1 %cond, i8 -4, i8 23
+  %bo = urem i8 %sel, 5
+  ret i8 %bo
+}
+
+define i8 @sel_constants_and_constant(i1 %cond) {
+; ALL-LABEL: sel_constants_and_constant:
+; ALL:       # BB#0:
+; ALL-NEXT:    rlwinm 3, 3, 0, 31, 31
+; ALL-NEXT:    subfic 3, 3, 5
+; ALL-NEXT:    blr
+  %sel = select i1 %cond, i8 -4, i8 23
+  %bo = and i8 %sel, 5
+  ret i8 %bo
+}
+
+define i8 @sel_constants_or_constant(i1 %cond) {
+; ISEL-LABEL: sel_constants_or_constant:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    li 4, 0
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    oris 3, 4, 65535
+; ISEL-NEXT:    li 4, 23
+; ISEL-NEXT:    ori 3, 3, 65533
+; ISEL-NEXT:    isel 3, 3, 4, 1
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: sel_constants_or_constant:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    li 4, 0
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    oris 3, 4, 65535
+; NO_ISEL-NEXT:    li 4, 23
+; NO_ISEL-NEXT:    ori 3, 3, 65533
+; NO_ISEL-NEXT:    bclr 12, 1, 0
+; NO_ISEL-NEXT:  # BB#1:
+; NO_ISEL-NEXT:    ori 3, 4, 0
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, i8 -4, i8 23
+  %bo = or i8 %sel, 5
+  ret i8 %bo
+}
+
+define i8 @sel_constants_xor_constant(i1 %cond) {
+; ISEL-LABEL: sel_constants_xor_constant:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    li 4, 0
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    oris 3, 4, 65535
+; ISEL-NEXT:    li 4, 18
+; ISEL-NEXT:    ori 3, 3, 65529
+; ISEL-NEXT:    isel 3, 3, 4, 1
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: sel_constants_xor_constant:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    li 4, 0
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    oris 3, 4, 65535
+; NO_ISEL-NEXT:    li 4, 18
+; NO_ISEL-NEXT:    ori 3, 3, 65529
+; NO_ISEL-NEXT:    bclr 12, 1, 0
+; NO_ISEL-NEXT:  # BB#1:
+; NO_ISEL-NEXT:    ori 3, 4, 0
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, i8 -4, i8 23
+  %bo = xor i8 %sel, 5
+  ret i8 %bo
+}
+
+define i8 @sel_constants_shl_constant(i1 %cond) {
+; ISEL-LABEL: sel_constants_shl_constant:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    lis 5, 511
+; ISEL-NEXT:    lis 4, 2047
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    ori 3, 4, 65535
+; ISEL-NEXT:    ori 12, 5, 65535
+; ISEL-NEXT:    sldi 3, 3, 5
+; ISEL-NEXT:    sldi 4, 12, 7
+; ISEL-NEXT:    isel 3, 4, 3, 1
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: sel_constants_shl_constant:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    lis 5, 511
+; NO_ISEL-NEXT:    lis 4, 2047
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    ori 3, 4, 65535
+; NO_ISEL-NEXT:    ori 12, 5, 65535
+; NO_ISEL-NEXT:    sldi 3, 3, 5
+; NO_ISEL-NEXT:    sldi 4, 12, 7
+; NO_ISEL-NEXT:    bc 12, 1, .LBB31_1
+; NO_ISEL-NEXT:    blr
+; NO_ISEL-NEXT:  .LBB31_1:
+; NO_ISEL-NEXT:    addi 3, 4, 0
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, i8 -4, i8 23
+  %bo = shl i8 %sel, 5
+  ret i8 %bo
+}
+
+define i8 @sel_constants_lshr_constant(i1 %cond) {
+; ISEL-LABEL: sel_constants_lshr_constant:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    li 4, 7
+; ISEL-NEXT:    li 3, 0
+; ISEL-NEXT:    isel 3, 4, 3, 1
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: sel_constants_lshr_constant:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    li 4, 7
+; NO_ISEL-NEXT:    li 3, 0
+; NO_ISEL-NEXT:    bc 12, 1, .LBB32_1
+; NO_ISEL-NEXT:    blr
+; NO_ISEL-NEXT:  .LBB32_1:
+; NO_ISEL-NEXT:    addi 3, 4, 0
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, i8 -4, i8 23
+  %bo = lshr i8 %sel, 5
+  ret i8 %bo
+}
+
+define i8 @sel_constants_ashr_constant(i1 %cond) {
+; ALL-LABEL: sel_constants_ashr_constant:
+; ALL:       # BB#0:
+; ALL-NEXT:    clrldi 3, 3, 63
+; ALL-NEXT:    neg 3, 3
+; ALL-NEXT:    blr
+  %sel = select i1 %cond, i8 -4, i8 23
+  %bo = ashr i8 %sel, 5
+  ret i8 %bo
+}
+
+define double @sel_constants_fadd_constant(i1 %cond) {
+; ISEL-LABEL: sel_constants_fadd_constant:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    addis 4, 2, .LCPI34_0@toc@ha
+; ISEL-NEXT:    addis 3, 2, .LCPI34_1@toc@ha
+; ISEL-NEXT:    addi 4, 4, .LCPI34_0@toc@l
+; ISEL-NEXT:    addi 3, 3, .LCPI34_1@toc@l
+; ISEL-NEXT:    isel 3, 3, 4, 1
+; ISEL-NEXT:    lxsdx 1, 0, 3
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: sel_constants_fadd_constant:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    addis 4, 2, .LCPI34_0@toc@ha
+; NO_ISEL-NEXT:    addis 3, 2, .LCPI34_1@toc@ha
+; NO_ISEL-NEXT:    addi 4, 4, .LCPI34_0@toc@l
+; NO_ISEL-NEXT:    addi 3, 3, .LCPI34_1@toc@l
+; NO_ISEL-NEXT:    bc 12, 1, .LBB34_2
+; NO_ISEL-NEXT:  # BB#1:
+; NO_ISEL-NEXT:    ori 3, 4, 0
+; NO_ISEL-NEXT:    b .LBB34_2
+; NO_ISEL-NEXT:  .LBB34_2:
+; NO_ISEL-NEXT:    lxsdx 1, 0, 3
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, double -4.0, double 23.3
+  %bo = fadd double %sel, 5.1
+  ret double %bo
+}
+
+define double @sel_constants_fsub_constant(i1 %cond) {
+; ISEL-LABEL: sel_constants_fsub_constant:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    addis 4, 2, .LCPI35_0@toc@ha
+; ISEL-NEXT:    addis 3, 2, .LCPI35_1@toc@ha
+; ISEL-NEXT:    addi 4, 4, .LCPI35_0@toc@l
+; ISEL-NEXT:    addi 3, 3, .LCPI35_1@toc@l
+; ISEL-NEXT:    isel 3, 3, 4, 1
+; ISEL-NEXT:    lxsdx 1, 0, 3
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: sel_constants_fsub_constant:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    addis 4, 2, .LCPI35_0@toc@ha
+; NO_ISEL-NEXT:    addis 3, 2, .LCPI35_1@toc@ha
+; NO_ISEL-NEXT:    addi 4, 4, .LCPI35_0@toc@l
+; NO_ISEL-NEXT:    addi 3, 3, .LCPI35_1@toc@l
+; NO_ISEL-NEXT:    bc 12, 1, .LBB35_2
+; NO_ISEL-NEXT:  # BB#1:
+; NO_ISEL-NEXT:    ori 3, 4, 0
+; NO_ISEL-NEXT:    b .LBB35_2
+; NO_ISEL-NEXT:  .LBB35_2:
+; NO_ISEL-NEXT:    lxsdx 1, 0, 3
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, double -4.0, double 23.3
+  %bo = fsub double %sel, 5.1
+  ret double %bo
+}
+
+define double @sel_constants_fmul_constant(i1 %cond) {
+; ISEL-LABEL: sel_constants_fmul_constant:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    addis 4, 2, .LCPI36_0@toc@ha
+; ISEL-NEXT:    addis 3, 2, .LCPI36_1@toc@ha
+; ISEL-NEXT:    addi 4, 4, .LCPI36_0@toc@l
+; ISEL-NEXT:    addi 3, 3, .LCPI36_1@toc@l
+; ISEL-NEXT:    isel 3, 3, 4, 1
+; ISEL-NEXT:    lxsdx 1, 0, 3
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: sel_constants_fmul_constant:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    addis 4, 2, .LCPI36_0@toc@ha
+; NO_ISEL-NEXT:    addis 3, 2, .LCPI36_1@toc@ha
+; NO_ISEL-NEXT:    addi 4, 4, .LCPI36_0@toc@l
+; NO_ISEL-NEXT:    addi 3, 3, .LCPI36_1@toc@l
+; NO_ISEL-NEXT:    bc 12, 1, .LBB36_2
+; NO_ISEL-NEXT:  # BB#1:
+; NO_ISEL-NEXT:    ori 3, 4, 0
+; NO_ISEL-NEXT:    b .LBB36_2
+; NO_ISEL-NEXT:  .LBB36_2:
+; NO_ISEL-NEXT:    lxsdx 1, 0, 3
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, double -4.0, double 23.3
+  %bo = fmul double %sel, 5.1
+  ret double %bo
+}
+
+define double @sel_constants_fdiv_constant(i1 %cond) {
+; ISEL-LABEL: sel_constants_fdiv_constant:
+; ISEL:       # BB#0:
+; ISEL-NEXT:    andi. 3, 3, 1
+; ISEL-NEXT:    addis 4, 2, .LCPI37_0@toc@ha
+; ISEL-NEXT:    addis 3, 2, .LCPI37_1@toc@ha
+; ISEL-NEXT:    addi 4, 4, .LCPI37_0@toc@l
+; ISEL-NEXT:    addi 3, 3, .LCPI37_1@toc@l
+; ISEL-NEXT:    isel 3, 3, 4, 1
+; ISEL-NEXT:    lxsdx 1, 0, 3
+; ISEL-NEXT:    blr
+;
+; NO_ISEL-LABEL: sel_constants_fdiv_constant:
+; NO_ISEL:       # BB#0:
+; NO_ISEL-NEXT:    andi. 3, 3, 1
+; NO_ISEL-NEXT:    addis 4, 2, .LCPI37_0@toc@ha
+; NO_ISEL-NEXT:    addis 3, 2, .LCPI37_1@toc@ha
+; NO_ISEL-NEXT:    addi 4, 4, .LCPI37_0@toc@l
+; NO_ISEL-NEXT:    addi 3, 3, .LCPI37_1@toc@l
+; NO_ISEL-NEXT:    bc 12, 1, .LBB37_2
+; NO_ISEL-NEXT:  # BB#1:
+; NO_ISEL-NEXT:    ori 3, 4, 0
+; NO_ISEL-NEXT:    b .LBB37_2
+; NO_ISEL-NEXT:  .LBB37_2:
+; NO_ISEL-NEXT:    lxsdx 1, 0, 3
+; NO_ISEL-NEXT:    blr
+  %sel = select i1 %cond, double -4.0, double 23.3
+  %bo = fdiv double %sel, 5.1
+  ret double %bo
+}
+
+define double @sel_constants_frem_constant(i1 %cond) {
+; ALL-LABEL: sel_constants_frem_constant:
+; ALL:       # BB#0:
+; ALL-NEXT:    andi. 3, 3, 1
+; ALL-NEXT:    bc 12, 1, .LBB38_2
+; ALL-NEXT:  # BB#1:
+; ALL-NEXT:    addis 3, 2, .LCPI38_0@toc@ha
+; ALL-NEXT:    addi 3, 3, .LCPI38_0@toc@l
+; ALL-NEXT:    lxsdx 1, 0, 3
+; ALL-NEXT:    blr
+; ALL-NEXT:  .LBB38_2:
+; ALL-NEXT:    addis 3, 2, .LCPI38_1@toc@ha
+; ALL-NEXT:    addi 3, 3, .LCPI38_1@toc@l
+; ALL-NEXT:    lxsspx 1, 0, 3
+; ALL-NEXT:    blr
+  %sel = select i1 %cond, double -4.0, double 23.3
+  %bo = frem double %sel, 5.1
+  ret double %bo
+}
+
diff --git a/test/CodeGen/PowerPC/setcc-logic.ll b/test/CodeGen/PowerPC/setcc-logic.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2ed08e2ae380cf2555da0d74f5680750a3ea59b4
--- /dev/null
+++ b/test/CodeGen/PowerPC/setcc-logic.ll
@@ -0,0 +1,478 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -verify-machineinstrs -mtriple=powerpc64le-unknown-unknown | FileCheck %s
+
+define zeroext i1 @all_bits_clear(i32 %P, i32 %Q)  {
+; CHECK-LABEL: all_bits_clear:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    or 3, 3, 4
+; CHECK-NEXT:    cntlzw 3, 3
+; CHECK-NEXT:    rlwinm 3, 3, 27, 31, 31
+; CHECK-NEXT:    blr
+  %a = icmp eq i32 %P, 0
+  %b = icmp eq i32 %Q, 0
+  %c = and i1 %a, %b
+  ret i1 %c
+}
+
+define zeroext i1 @all_sign_bits_clear(i32 %P, i32 %Q)  {
+; CHECK-LABEL: all_sign_bits_clear:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    or 3, 3, 4
+; CHECK-NEXT:    nor 3, 3, 3
+; CHECK-NEXT:    srwi 3, 3, 31
+; CHECK-NEXT:    blr
+  %a = icmp sgt i32 %P, -1
+  %b = icmp sgt i32 %Q, -1
+  %c = and i1 %a, %b
+  ret i1 %c
+}
+
+define zeroext i1 @all_bits_set(i32 %P, i32 %Q)  {
+; CHECK-LABEL: all_bits_set:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    and 3, 3, 4
+; CHECK-NEXT:    li 5, 0
+; CHECK-NEXT:    li 12, 1
+; CHECK-NEXT:    cmpwi 0, 3, -1
+; CHECK-NEXT:    isel 3, 12, 5, 2
+; CHECK-NEXT:    blr
+  %a = icmp eq i32 %P, -1
+  %b = icmp eq i32 %Q, -1
+  %c = and i1 %a, %b
+  ret i1 %c
+}
+
+define zeroext i1 @all_sign_bits_set(i32 %P, i32 %Q)  {
+; CHECK-LABEL: all_sign_bits_set:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    and 3, 3, 4
+; CHECK-NEXT:    srwi 3, 3, 31
+; CHECK-NEXT:    blr
+  %a = icmp slt i32 %P, 0
+  %b = icmp slt i32 %Q, 0
+  %c = and i1 %a, %b
+  ret i1 %c
+}
+
+define zeroext i1 @any_bits_set(i32 %P, i32 %Q)  {
+; CHECK-LABEL: any_bits_set:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    or 3, 3, 4
+; CHECK-NEXT:    cntlzw 3, 3
+; CHECK-NEXT:    nor 3, 3, 3
+; CHECK-NEXT:    rlwinm 3, 3, 27, 31, 31
+; CHECK-NEXT:    blr
+  %a = icmp ne i32 %P, 0
+  %b = icmp ne i32 %Q, 0
+  %c = or i1 %a, %b
+  ret i1 %c
+}
+
+define zeroext i1 @any_sign_bits_set(i32 %P, i32 %Q)  {
+; CHECK-LABEL: any_sign_bits_set:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    or 3, 3, 4
+; CHECK-NEXT:    srwi 3, 3, 31
+; CHECK-NEXT:    blr
+  %a = icmp slt i32 %P, 0
+  %b = icmp slt i32 %Q, 0
+  %c = or i1 %a, %b
+  ret i1 %c
+}
+
+define zeroext i1 @any_bits_clear(i32 %P, i32 %Q)  {
+; CHECK-LABEL: any_bits_clear:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    and 3, 3, 4
+; CHECK-NEXT:    li 5, 1
+; CHECK-NEXT:    cmpwi 0, 3, -1
+; CHECK-NEXT:    isel 3, 0, 5, 2
+; CHECK-NEXT:    blr
+  %a = icmp ne i32 %P, -1
+  %b = icmp ne i32 %Q, -1
+  %c = or i1 %a, %b
+  ret i1 %c
+}
+
+define zeroext i1 @any_sign_bits_clear(i32 %P, i32 %Q)  {
+; CHECK-LABEL: any_sign_bits_clear:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    and 3, 3, 4
+; CHECK-NEXT:    nor 3, 3, 3
+; CHECK-NEXT:    srwi 3, 3, 31
+; CHECK-NEXT:    blr
+  %a = icmp sgt i32 %P, -1
+  %b = icmp sgt i32 %Q, -1
+  %c = or i1 %a, %b
+  ret i1 %c
+}
+
+; PR3351 - (P == 0) & (Q == 0) -> (P|Q) == 0
+define i32 @all_bits_clear_branch(i32* %P, i32* %Q)  {
+; CHECK-LABEL: all_bits_clear_branch:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    or. 3, 3, 4
+; CHECK-NEXT:    bne 0, .LBB8_2
+; CHECK-NEXT:  # BB#1: # %bb1
+; CHECK-NEXT:    li 3, 4
+; CHECK-NEXT:    blr
+; CHECK-NEXT:  .LBB8_2: # %return
+; CHECK-NEXT:    li 3, 192
+; CHECK-NEXT:    blr
+entry:
+  %a = icmp eq i32* %P, null
+  %b = icmp eq i32* %Q, null
+  %c = and i1 %a, %b
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  ret i32 4
+
+return:
+  ret i32 192
+}
+
+define i32 @all_sign_bits_clear_branch(i32 %P, i32 %Q)  {
+; CHECK-LABEL: all_sign_bits_clear_branch:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    or 3, 3, 4
+; CHECK-NEXT:    cmpwi 0, 3, 0
+; CHECK-NEXT:    blt 0, .LBB9_2
+; CHECK-NEXT:  # BB#1: # %bb1
+; CHECK-NEXT:    li 3, 4
+; CHECK-NEXT:    blr
+; CHECK-NEXT:  .LBB9_2: # %return
+; CHECK-NEXT:    li 3, 192
+; CHECK-NEXT:    blr
+entry:
+  %a = icmp sgt i32 %P, -1
+  %b = icmp sgt i32 %Q, -1
+  %c = and i1 %a, %b
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  ret i32 4
+
+return:
+  ret i32 192
+}
+
+define i32 @all_bits_set_branch(i32 %P, i32 %Q)  {
+; CHECK-LABEL: all_bits_set_branch:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    and 3, 3, 4
+; CHECK-NEXT:    cmpwi 0, 3, -1
+; CHECK-NEXT:    bne 0, .LBB10_2
+; CHECK-NEXT:  # BB#1: # %bb1
+; CHECK-NEXT:    li 3, 4
+; CHECK-NEXT:    blr
+; CHECK-NEXT:  .LBB10_2: # %return
+; CHECK-NEXT:    li 3, 192
+; CHECK-NEXT:    blr
+entry:
+  %a = icmp eq i32 %P, -1
+  %b = icmp eq i32 %Q, -1
+  %c = and i1 %a, %b
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  ret i32 4
+
+return:
+  ret i32 192
+}
+
+define i32 @all_sign_bits_set_branch(i32 %P, i32 %Q)  {
+; CHECK-LABEL: all_sign_bits_set_branch:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    and 3, 3, 4
+; CHECK-NEXT:    cmpwi 0, 3, -1
+; CHECK-NEXT:    bgt 0, .LBB11_2
+; CHECK-NEXT:  # BB#1: # %bb1
+; CHECK-NEXT:    li 3, 4
+; CHECK-NEXT:    blr
+; CHECK-NEXT:  .LBB11_2: # %return
+; CHECK-NEXT:    li 3, 192
+; CHECK-NEXT:    blr
+entry:
+  %a = icmp slt i32 %P, 0
+  %b = icmp slt i32 %Q, 0
+  %c = and i1 %a, %b
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  ret i32 4
+
+return:
+  ret i32 192
+}
+
+; PR3351 - (P != 0) | (Q != 0) -> (P|Q) != 0
+define i32 @any_bits_set_branch(i32* %P, i32* %Q)  {
+; CHECK-LABEL: any_bits_set_branch:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    or. 3, 3, 4
+; CHECK-NEXT:    beq 0, .LBB12_2
+; CHECK-NEXT:  # BB#1: # %bb1
+; CHECK-NEXT:    li 3, 4
+; CHECK-NEXT:    blr
+; CHECK-NEXT:  .LBB12_2: # %return
+; CHECK-NEXT:    li 3, 192
+; CHECK-NEXT:    blr
+entry:
+  %a = icmp ne i32* %P, null
+  %b = icmp ne i32* %Q, null
+  %c = or i1 %a, %b
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  ret i32 4
+
+return:
+  ret i32 192
+}
+
+define i32 @any_sign_bits_set_branch(i32 %P, i32 %Q)  {
+; CHECK-LABEL: any_sign_bits_set_branch:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    or 3, 3, 4
+; CHECK-NEXT:    cmpwi 0, 3, -1
+; CHECK-NEXT:    bgt 0, .LBB13_2
+; CHECK-NEXT:  # BB#1: # %bb1
+; CHECK-NEXT:    li 3, 4
+; CHECK-NEXT:    blr
+; CHECK-NEXT:  .LBB13_2: # %return
+; CHECK-NEXT:    li 3, 192
+; CHECK-NEXT:    blr
+entry:
+  %a = icmp slt i32 %P, 0
+  %b = icmp slt i32 %Q, 0
+  %c = or i1 %a, %b
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  ret i32 4
+
+return:
+  ret i32 192
+}
+
+define i32 @any_bits_clear_branch(i32 %P, i32 %Q)  {
+; CHECK-LABEL: any_bits_clear_branch:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    and 3, 3, 4
+; CHECK-NEXT:    cmpwi 0, 3, -1
+; CHECK-NEXT:    beq 0, .LBB14_2
+; CHECK-NEXT:  # BB#1: # %bb1
+; CHECK-NEXT:    li 3, 4
+; CHECK-NEXT:    blr
+; CHECK-NEXT:  .LBB14_2: # %return
+; CHECK-NEXT:    li 3, 192
+; CHECK-NEXT:    blr
+entry:
+  %a = icmp ne i32 %P, -1
+  %b = icmp ne i32 %Q, -1
+  %c = or i1 %a, %b
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  ret i32 4
+
+return:
+  ret i32 192
+}
+
+define i32 @any_sign_bits_clear_branch(i32 %P, i32 %Q)  {
+; CHECK-LABEL: any_sign_bits_clear_branch:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    and 3, 3, 4
+; CHECK-NEXT:    cmpwi 0, 3, 0
+; CHECK-NEXT:    blt 0, .LBB15_2
+; CHECK-NEXT:  # BB#1: # %bb1
+; CHECK-NEXT:    li 3, 4
+; CHECK-NEXT:    blr
+; CHECK-NEXT:  .LBB15_2: # %return
+; CHECK-NEXT:    li 3, 192
+; CHECK-NEXT:    blr
+entry:
+  %a = icmp sgt i32 %P, -1
+  %b = icmp sgt i32 %Q, -1
+  %c = or i1 %a, %b
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  ret i32 4
+
+return:
+  ret i32 192
+}
+
+define <4 x i1> @all_bits_clear_vec(<4 x i32> %P, <4 x i32> %Q) {
+; CHECK-LABEL: all_bits_clear_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xxlxor 36, 36, 36
+; CHECK-NEXT:    xxlor 34, 34, 35
+; CHECK-NEXT:    vcmpequw 2, 2, 4
+; CHECK-NEXT:    blr
+  %a = icmp eq <4 x i32> %P, zeroinitializer
+  %b = icmp eq <4 x i32> %Q, zeroinitializer
+  %c = and <4 x i1> %a, %b
+  ret <4 x i1> %c
+}
+
+define <4 x i1> @all_sign_bits_clear_vec(<4 x i32> %P, <4 x i32> %Q) {
+; CHECK-LABEL: all_sign_bits_clear_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vspltisb 4, -1
+; CHECK-NEXT:    xxlor 34, 34, 35
+; CHECK-NEXT:    vcmpgtsw 2, 2, 4
+; CHECK-NEXT:    blr
+  %a = icmp sgt <4 x i32> %P, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %b = icmp sgt <4 x i32> %Q, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %c = and <4 x i1> %a, %b
+  ret <4 x i1> %c
+}
+
+define <4 x i1> @all_bits_set_vec(<4 x i32> %P, <4 x i32> %Q) {
+; CHECK-LABEL: all_bits_set_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vspltisb 4, -1
+; CHECK-NEXT:    xxland 34, 34, 35
+; CHECK-NEXT:    vcmpequw 2, 2, 4
+; CHECK-NEXT:    blr
+  %a = icmp eq <4 x i32> %P, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %b = icmp eq <4 x i32> %Q, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %c = and <4 x i1> %a, %b
+  ret <4 x i1> %c
+}
+
+define <4 x i1> @all_sign_bits_set_vec(<4 x i32> %P, <4 x i32> %Q) {
+; CHECK-LABEL: all_sign_bits_set_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xxlxor 36, 36, 36
+; CHECK-NEXT:    xxland 34, 34, 35
+; CHECK-NEXT:    vcmpgtsw 2, 4, 2
+; CHECK-NEXT:    blr
+  %a = icmp slt <4 x i32> %P, zeroinitializer
+  %b = icmp slt <4 x i32> %Q, zeroinitializer
+  %c = and <4 x i1> %a, %b
+  ret <4 x i1> %c
+}
+
+define <4 x i1> @any_bits_set_vec(<4 x i32> %P, <4 x i32> %Q) {
+; CHECK-LABEL: any_bits_set_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xxlxor 36, 36, 36
+; CHECK-NEXT:    xxlor 34, 34, 35
+; CHECK-NEXT:    vcmpequw 2, 2, 4
+; CHECK-NEXT:    xxlnor 34, 34, 34
+; CHECK-NEXT:    blr
+  %a = icmp ne <4 x i32> %P, zeroinitializer
+  %b = icmp ne <4 x i32> %Q, zeroinitializer
+  %c = or <4 x i1> %a, %b
+  ret <4 x i1> %c
+}
+
+define <4 x i1> @any_sign_bits_set_vec(<4 x i32> %P, <4 x i32> %Q) {
+; CHECK-LABEL: any_sign_bits_set_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xxlxor 36, 36, 36
+; CHECK-NEXT:    xxlor 34, 34, 35
+; CHECK-NEXT:    vcmpgtsw 2, 4, 2
+; CHECK-NEXT:    blr
+  %a = icmp slt <4 x i32> %P, zeroinitializer
+  %b = icmp slt <4 x i32> %Q, zeroinitializer
+  %c = or <4 x i1> %a, %b
+  ret <4 x i1> %c
+}
+
+define <4 x i1> @any_bits_clear_vec(<4 x i32> %P, <4 x i32> %Q) {
+; CHECK-LABEL: any_bits_clear_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vspltisb 4, -1
+; CHECK-NEXT:    xxland 34, 34, 35
+; CHECK-NEXT:    vcmpequw 2, 2, 4
+; CHECK-NEXT:    xxlnor 34, 34, 34
+; CHECK-NEXT:    blr
+  %a = icmp ne <4 x i32> %P, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %b = icmp ne <4 x i32> %Q, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %c = or <4 x i1> %a, %b
+  ret <4 x i1> %c
+}
+
+define <4 x i1> @any_sign_bits_clear_vec(<4 x i32> %P, <4 x i32> %Q) {
+; CHECK-LABEL: any_sign_bits_clear_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vspltisb 4, -1
+; CHECK-NEXT:    xxland 34, 34, 35
+; CHECK-NEXT:    vcmpgtsw 2, 2, 4
+; CHECK-NEXT:    blr
+  %a = icmp sgt <4 x i32> %P, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %b = icmp sgt <4 x i32> %Q, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %c = or <4 x i1> %a, %b
+  ret <4 x i1> %c
+}
+
+define zeroext i1 @ne_neg1_and_ne_zero(i64 %x) {
+; CHECK-LABEL: ne_neg1_and_ne_zero:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    addi 3, 3, 1
+; CHECK-NEXT:    li 4, 0
+; CHECK-NEXT:    li 12, 1
+; CHECK-NEXT:    cmpldi 3, 1
+; CHECK-NEXT:    isel 3, 12, 4, 1
+; CHECK-NEXT:    blr
+  %cmp1 = icmp ne i64 %x, -1
+  %cmp2 = icmp ne i64 %x, 0
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+; PR32401 - https://bugs.llvm.org/show_bug.cgi?id=32401
+
+define zeroext i1 @and_eq(i16 zeroext  %a, i16 zeroext %b, i16 zeroext %c, i16 zeroext %d) {
+; CHECK-LABEL: and_eq:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xor 5, 5, 6
+; CHECK-NEXT:    xor 3, 3, 4
+; CHECK-NEXT:    or 3, 3, 5
+; CHECK-NEXT:    cntlzw 3, 3
+; CHECK-NEXT:    rlwinm 3, 3, 27, 31, 31
+; CHECK-NEXT:    blr
+  %cmp1 = icmp eq i16 %a, %b
+  %cmp2 = icmp eq i16 %c, %d
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define zeroext i1 @or_ne(i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: or_ne:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xor 5, 5, 6
+; CHECK-NEXT:    xor 3, 3, 4
+; CHECK-NEXT:    or 3, 3, 5
+; CHECK-NEXT:    cntlzw 3, 3
+; CHECK-NEXT:    nor 3, 3, 3
+; CHECK-NEXT:    rlwinm 3, 3, 27, 31, 31
+; CHECK-NEXT:    blr
+  %cmp1 = icmp ne i32 %a, %b
+  %cmp2 = icmp ne i32 %c, %d
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+; This should not be transformed because vector compares + bitwise logic are faster.
+
+define <4 x i1> @and_eq_vec(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) {
+; CHECK-LABEL: and_eq_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vcmpequw 2, 2, 3
+; CHECK-NEXT:    vcmpequw 19, 4, 5
+; CHECK-NEXT:    xxland 34, 34, 51
+; CHECK-NEXT:    blr
+  %cmp1 = icmp eq <4 x i32> %a, %b
+  %cmp2 = icmp eq <4 x i32> %c, %d
+  %and = and <4 x i1> %cmp1, %cmp2
+  ret <4 x i1> %and
+}
+
diff --git a/test/CodeGen/PowerPC/setcc-to-sub.ll b/test/CodeGen/PowerPC/setcc-to-sub.ll
index 335bb403cd7ffb60416e57924fe38f04c0f93857..752ebe0c9d8b5b1876ce4b9571812a91dcedf37f 100644
--- a/test/CodeGen/PowerPC/setcc-to-sub.ll
+++ b/test/CodeGen/PowerPC/setcc-to-sub.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu \
 ; RUN: -mcpu=pwr8 < %s | FileCheck %s
 
@@ -6,6 +7,15 @@
 
 ; Function Attrs: norecurse nounwind readonly
 define zeroext i1 @test1(%class.PB2* %s_a, %class.PB2* %s_b) local_unnamed_addr #0 {
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    lwz 3, 0(3)
+; CHECK-NEXT:    lwz 4, 0(4)
+; CHECK-NEXT:    rlwinm 3, 3, 0, 28, 28
+; CHECK-NEXT:    rlwinm 4, 4, 0, 28, 28
+; CHECK-NEXT:    sub 3, 3, 4
+; CHECK-NEXT:    rldicl 3, 3, 1, 63
+; CHECK-NEXT:    blr
 entry:
   %arrayidx.i6 = bitcast %class.PB2* %s_a to i32*
   %0 = load i32, i32* %arrayidx.i6, align 8, !tbaa !1
@@ -15,18 +25,20 @@ entry:
   %and.i4 = and i32 %1, 8
   %cmp.i5 = icmp ult i32 %and.i, %and.i4
   ret i1 %cmp.i5
-
-; CHECK-LABEL: @test1
-; CHECK: rlwinm [[REG1:[0-9]*]]
-; CHECK-NEXT: rlwinm [[REG2:[0-9]*]]
-; CHECK-NEXT: sub [[REG3:[0-9]*]], [[REG1]], [[REG2]]
-; CHECK-NEXT: rldicl 3, [[REG3]]
-; CHECK: blr
-
 }
 
 ; Function Attrs: norecurse nounwind readonly
 define zeroext i1 @test2(%class.PB2* %s_a, %class.PB2* %s_b) local_unnamed_addr #0 {
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    lwz 3, 0(3)
+; CHECK-NEXT:    lwz 4, 0(4)
+; CHECK-NEXT:    rlwinm 3, 3, 0, 28, 28
+; CHECK-NEXT:    rlwinm 4, 4, 0, 28, 28
+; CHECK-NEXT:    sub 3, 4, 3
+; CHECK-NEXT:    rldicl 3, 3, 1, 63
+; CHECK-NEXT:    xori 3, 3, 1
+; CHECK-NEXT:    blr
 entry:
   %arrayidx.i6 = bitcast %class.PB2* %s_a to i32*
   %0 = load i32, i32* %arrayidx.i6, align 8, !tbaa !1
@@ -36,19 +48,19 @@ entry:
   %and.i4 = and i32 %1, 8
   %cmp.i5 = icmp ule i32 %and.i, %and.i4
   ret i1 %cmp.i5
-
-; CHECK-LABEL: @test2
-; CHECK: rlwinm [[REG1:[0-9]*]]
-; CHECK-NEXT: rlwinm [[REG2:[0-9]*]]
-; CHECK-NEXT: sub [[REG3:[0-9]*]], [[REG2]], [[REG1]]
-; CHECK-NEXT: rldicl [[REG4:[0-9]*]], [[REG3]]
-; CHECK-NEXT: xori 3, [[REG4]], 1
-; CHECK: blr
-
 }
 
 ; Function Attrs: norecurse nounwind readonly
 define zeroext i1 @test3(%class.PB2* %s_a, %class.PB2* %s_b) local_unnamed_addr #0 {
+; CHECK-LABEL: test3:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    lwz 3, 0(3)
+; CHECK-NEXT:    lwz 4, 0(4)
+; CHECK-NEXT:    rlwinm 3, 3, 0, 28, 28
+; CHECK-NEXT:    rlwinm 4, 4, 0, 28, 28
+; CHECK-NEXT:    sub 3, 4, 3
+; CHECK-NEXT:    rldicl 3, 3, 1, 63
+; CHECK-NEXT:    blr
 entry:
   %arrayidx.i6 = bitcast %class.PB2* %s_a to i32*
   %0 = load i32, i32* %arrayidx.i6, align 8, !tbaa !1
@@ -58,18 +70,20 @@ entry:
   %and.i4 = and i32 %1, 8
   %cmp.i5 = icmp ugt i32 %and.i, %and.i4
   ret i1 %cmp.i5
-
-; CHECK-LABEL: @test3
-; CHECK: rlwinm [[REG1:[0-9]*]]
-; CHECK-NEXT: rlwinm [[REG2:[0-9]*]]
-; CHECK-NEXT: sub [[REG3:[0-9]*]], [[REG2]], [[REG1]]
-; CHECK-NEXT: rldicl 3, [[REG3]]
-; CHECK: blr
-
 }
 
 ; Function Attrs: norecurse nounwind readonly
 define zeroext i1 @test4(%class.PB2* %s_a, %class.PB2* %s_b) local_unnamed_addr #0 {
+; CHECK-LABEL: test4:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    lwz 3, 0(3)
+; CHECK-NEXT:    lwz 4, 0(4)
+; CHECK-NEXT:    rlwinm 3, 3, 0, 28, 28
+; CHECK-NEXT:    rlwinm 4, 4, 0, 28, 28
+; CHECK-NEXT:    sub 3, 3, 4
+; CHECK-NEXT:    rldicl 3, 3, 1, 63
+; CHECK-NEXT:    xori 3, 3, 1
+; CHECK-NEXT:    blr
 entry:
   %arrayidx.i6 = bitcast %class.PB2* %s_a to i32*
   %0 = load i32, i32* %arrayidx.i6, align 8, !tbaa !1
@@ -79,15 +93,6 @@ entry:
   %and.i4 = and i32 %1, 8
   %cmp.i5 = icmp uge i32 %and.i, %and.i4
   ret i1 %cmp.i5
-
-; CHECK-LABEL: @test4
-; CHECK: rlwinm [[REG1:[0-9]*]]
-; CHECK-NEXT: rlwinm [[REG2:[0-9]*]]
-; CHECK-NEXT: sub [[REG3:[0-9]*]], [[REG1]], [[REG2]]
-; CHECK-NEXT: rldicl [[REG4:[0-9]*]], [[REG3]]
-; CHECK-NEXT: xori 3, [[REG4]], 1
-; CHECK: blr
-
 }
 
 !1 = !{!2, !2, i64 0}
diff --git a/test/CodeGen/PowerPC/sjlj_no0x.ll b/test/CodeGen/PowerPC/sjlj_no0x.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2018bcbbc9312a377c517c47197235604e3691aa
--- /dev/null
+++ b/test/CodeGen/PowerPC/sjlj_no0x.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr7 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64-unknown-linux-gnu -mcpu=a2 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -verify-machineinstrs | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-unknown-linux-gnu"
+
+; Function Attrs: noinline nounwind
+define void @_Z23BuiltinLongJmpFunc1_bufv() #0 {
+entry:
+  call void @llvm.eh.sjlj.longjmp(i8* bitcast (void ()* @_Z23BuiltinLongJmpFunc1_bufv to i8*))
+  unreachable
+
+; CHECK: @_Z23BuiltinLongJmpFunc1_bufv
+; CHECK: addis [[REG:[0-9]+]], 2, .LC0@toc@ha
+; CHECK: ld 31, 0([[REG]])
+; CHECK: ld [[REG2:[0-9]+]], 8([[REG]])
+; CHECK-DAG: ld 1, 16([[REG]])
+; CHECK-DAG: ld 30, 32([[REG]])
+; CHECK-DAG: ld 2, 24([[REG]])
+; CHECK-DAG: mtctr [[REG2]]
+; CHECK: bctr
+
+return:                                           ; No predecessors!
+  ret void
+}
+
+; Function Attrs: noreturn nounwind
+declare void @llvm.eh.sjlj.longjmp(i8*) #1
diff --git a/test/CodeGen/PowerPC/srl-mask.ll b/test/CodeGen/PowerPC/srl-mask.ll
index e581eae0ee5761f3bb9fd94740bd36251a659530..1a429b1bae3616518c57da8a8b6abf3fb1aaa04b 100644
--- a/test/CodeGen/PowerPC/srl-mask.ll
+++ b/test/CodeGen/PowerPC/srl-mask.ll
@@ -12,5 +12,16 @@ entry:
 ; CHECK: blr
 }
 
+; For an AND with an immediate like (x & ~0xFFFF), which clears the low bits,
+; we should use a single rldicr instruction.
+define i64 @bar(i64 %x) #0 {
+entry:
+; CHECK-LABEL: @bar
+  %a = and i64 %x, 18446744073709486080
+; CHECK: rldicr 3, 3, 0, 47
+  ret i64 %a
+; CHECK: blr
+}
+
 attributes #0 = { nounwind }
 
diff --git a/test/CodeGen/PowerPC/stacksize.ll b/test/CodeGen/PowerPC/stacksize.ll
new file mode 100644
index 0000000000000000000000000000000000000000..947aaa0fa49ef9e9b386590526ffec2b71dd394f
--- /dev/null
+++ b/test/CodeGen/PowerPC/stacksize.ll
@@ -0,0 +1,86 @@
+; For ELFv2 ABI, we can avoid allocating the parameter area in the stack frame of the caller function
+; if all the arguments can be passed to the callee in registers.
+; For ELFv1 ABI, we always need to allocate the parameter area.
+
+; Tests for ELFv2 ABI
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -target-abi elfv2 < %s | FileCheck %s -check-prefix=PPC64-ELFV2
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -target-abi elfv2 < %s | FileCheck %s -check-prefix=PPC64-ELFV2
+
+; Tests for ELFv1 ABI
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -target-abi elfv1 < %s | FileCheck %s -check-prefix=PPC64-ELFV1
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -target-abi elfv1 < %s | FileCheck %s -check-prefix=PPC64-ELFV1
+
+; If the callee has at most eight integer args, the parameter area can be omitted for the ELFv2 ABI.
+
+; PPC64-ELFV2-LABEL: WithoutParamArea1:
+; PPC64-ELFV2-NOT: stw {{[0-9]+}}, -{{[0-9]+}}(1)
+; PPC64-ELFV2: stdu 1, -32(1)
+; PPC64-ELFV2: addi 1, 1, 32
+; PPC64-ELFV2-NOT: lwz {{[0-9]+}}, -{{[0-9]+}}(1)
+; PPC64-ELFV1-LABEL: WithoutParamArea1:
+; PPC64-ELFV1-NOT: stw {{[0-9]+}}, -{{[0-9]+}}(1)
+; PPC64-ELFV1: stdu 1, -112(1)
+; PPC64-ELFV1: addi 1, 1, 112
+; PPC64-ELFV1-NOT: lwz {{[0-9]+}}, -{{[0-9]+}}(1)
+define signext i32 @WithoutParamArea1(i32 signext %a) local_unnamed_addr #0 {
+entry:
+  %call = tail call signext i32 @onearg(i32 signext %a) #2
+  ret i32 %call
+}
+
+; PPC64-ELFV2-LABEL: WithoutParamArea2:
+; PPC64-ELFV2-NOT: stw {{[0-9]+}}, -{{[0-9]+}}(1)
+; PPC64-ELFV2: stdu 1, -32(1)
+; PPC64-ELFV2: addi 1, 1, 32
+; PPC64-ELFV2-NOT: lwz {{[0-9]+}}, -{{[0-9]+}}(1)
+; PPC64-ELFV1-LABEL: WithoutParamArea2:
+; PPC64-ELFV1-NOT: stw {{[0-9]+}}, -{{[0-9]+}}(1)
+; PPC64-ELFV1: stdu 1, -112(1)
+; PPC64-ELFV1: addi 1, 1, 112
+; PPC64-ELFV1-NOT: lwz {{[0-9]+}}, -{{[0-9]+}}(1)
+define signext i32 @WithoutParamArea2(i32 signext %a) local_unnamed_addr #0 {
+entry:
+  %call = tail call signext i32 @eightargs(i32 signext %a, i32 signext %a, i32 signext %a, i32 signext %a, i32 signext %a, i32 signext %a, i32 signext %a, i32 signext %a) #2
+  ret i32 %call
+}
+
+; If the callee has more than eight integer args or a variable number of args,
+; the parameter area cannot be omitted even for the ELFv2 ABI.
+
+; PPC64-ELFV2-LABEL: WithParamArea1:
+; PPC64-ELFV2-NOT: stw {{[0-9]+}}, -{{[0-9]+}}(1)
+; PPC64-ELFV2: stdu 1, -96(1)
+; PPC64-ELFV2: addi 1, 1, 96
+; PPC64-ELFV2-NOT: lwz {{[0-9]+}}, -{{[0-9]+}}(1)
+; PPC64-ELFV1-LABEL: WithParamArea1:
+; PPC64-ELFV1-NOT: stw {{[0-9]+}}, -{{[0-9]+}}(1)
+; PPC64-ELFV1: stdu 1, -112(1)
+; PPC64-ELFV1: addi 1, 1, 112
+; PPC64-ELFV1-NOT: lwz {{[0-9]+}}, -{{[0-9]+}}(1)
+define signext i32 @WithParamArea1(i32 signext %a) local_unnamed_addr #0 {
+entry:
+  %call = tail call signext i32 (i32, ...) @varargs(i32 signext %a, i32 signext %a) #2
+  ret i32 %call
+}
+
+; PPC64-ELFV2-LABEL: WithParamArea2:
+; PPC64-ELFV2-NOT: stw {{[0-9]+}}, -{{[0-9]+}}(1)
+; PPC64-ELFV2: stdu 1, -112(1)
+; PPC64-ELFV2: addi 1, 1, 112
+; PPC64-ELFV2-NOT: lwz {{[0-9]+}}, -{{[0-9]+}}(1)
+; PPC64-ELFV1-LABEL: WithParamArea2:
+; PPC64-ELFV1-NOT: stw {{[0-9]+}}, -{{[0-9]+}}(1)
+; PPC64-ELFV1: stdu 1, -128(1)
+; PPC64-ELFV1: addi 1, 1, 128
+; PPC64-ELFV1-NOT: lwz {{[0-9]+}}, -{{[0-9]+}}(1)
+define signext i32 @WithParamArea2(i32 signext %a) local_unnamed_addr #0 {
+entry:
+  %call = tail call signext i32 @nineargs(i32 signext %a, i32 signext %a, i32 signext %a, i32 signext %a, i32 signext %a, i32 signext %a, i32 signext %a, i32 signext %a, i32 signext %a) #2
+  ret i32 %call
+}
+
+declare signext i32 @onearg(i32 signext) local_unnamed_addr #1
+declare signext i32 @eightargs(i32 signext, i32 signext, i32 signext, i32 signext, i32 signext, i32 signext, i32 signext, i32 signext) local_unnamed_addr #1
+declare signext i32 @nineargs(i32 signext, i32 signext, i32 signext, i32 signext, i32 signext, i32 signext, i32 signext, i32 signext, i32 signext) local_unnamed_addr #1
+declare signext i32 @varargs(i32 signext, ...) local_unnamed_addr #1
+
diff --git a/test/CodeGen/PowerPC/structsinmem.ll b/test/CodeGen/PowerPC/structsinmem.ll
index 3777f3ec5bab9ea3d0505a21100cf18a12b6f8c5..01b0848e7070775c650121cc87c2c2e10a29107d 100644
--- a/test/CodeGen/PowerPC/structsinmem.ll
+++ b/test/CodeGen/PowerPC/structsinmem.ll
@@ -113,13 +113,13 @@ entry:
   %add13 = add nsw i32 %add11, %6
   ret i32 %add13
 
-; CHECK: lha {{[0-9]+}}, 126(1)
-; CHECK: lha {{[0-9]+}}, 132(1)
-; CHECK: lbz {{[0-9]+}}, 119(1)
-; CHECK: lwz {{[0-9]+}}, 140(1)
-; CHECK: lwz {{[0-9]+}}, 144(1)
-; CHECK: lwz {{[0-9]+}}, 152(1)
-; CHECK: lwz {{[0-9]+}}, 160(1)
+; CHECK-DAG: lha {{[0-9]+}}, 126(1)
+; CHECK-DAG: lha {{[0-9]+}}, 132(1)
+; CHECK-DAG: lbz {{[0-9]+}}, 119(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 140(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 144(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 152(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 160(1)
 }
 
 define i32 @caller2() nounwind {
@@ -205,11 +205,11 @@ entry:
   %add13 = add nsw i32 %add11, %6
   ret i32 %add13
 
-; CHECK: lha {{[0-9]+}}, 126(1)
-; CHECK: lha {{[0-9]+}}, 133(1)
-; CHECK: lbz {{[0-9]+}}, 119(1)
-; CHECK: lwz {{[0-9]+}}, 140(1)
-; CHECK: lwz {{[0-9]+}}, 147(1)
-; CHECK: lwz {{[0-9]+}}, 154(1)
-; CHECK: lwz {{[0-9]+}}, 161(1)
+; CHECK-DAG: lha {{[0-9]+}}, 126(1)
+; CHECK-DAG: lha {{[0-9]+}}, 133(1)
+; CHECK-DAG: lbz {{[0-9]+}}, 119(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 140(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 147(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 154(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 161(1)
 }
diff --git a/test/CodeGen/PowerPC/structsinregs.ll b/test/CodeGen/PowerPC/structsinregs.ll
index e27041dd4c88a5f95b6dc7696d225c895ac26652..54679f259e9a1f4e50a98d4310581b8d11d08b47 100644
--- a/test/CodeGen/PowerPC/structsinregs.ll
+++ b/test/CodeGen/PowerPC/structsinregs.ll
@@ -59,6 +59,7 @@ entry:
   %call = call i32 @callee1(%struct.s1* byval %p1, %struct.s2* byval %p2, %struct.s3* byval %p3, %struct.s4* byval %p4, %struct.s5* byval %p5, %struct.s6* byval %p6, %struct.s7* byval %p7)
   ret i32 %call
 
+; CHECK-LABEL: caller1
 ; CHECK: ld 9, 112(31)
 ; CHECK: ld 8, 120(31)
 ; CHECK: ld 7, 128(31)
@@ -97,20 +98,21 @@ entry:
   %add13 = add nsw i32 %add11, %6
   ret i32 %add13
 
-; CHECK: std 9, 96(1)
-; CHECK: std 8, 88(1)
-; CHECK: std 7, 80(1)
-; CHECK: stw 6, 76(1)
-; CHECK: stw 5, 68(1)
-; CHECK: sth 4, 62(1)
-; CHECK: stb 3, 55(1)
-; CHECK: lha {{[0-9]+}}, 62(1)
-; CHECK: lha {{[0-9]+}}, 68(1)
-; CHECK: lbz {{[0-9]+}}, 55(1)
-; CHECK: lwz {{[0-9]+}}, 76(1)
-; CHECK: lwz {{[0-9]+}}, 80(1)
-; CHECK: lwz {{[0-9]+}}, 88(1)
-; CHECK: lwz {{[0-9]+}}, 96(1)
+; CHECK-LABEL: callee1
+; CHECK-DAG: std 9, 96(1)
+; CHECK-DAG: std 8, 88(1)
+; CHECK-DAG: std 7, 80(1)
+; CHECK-DAG: stw 6, 76(1)
+; CHECK-DAG: stw 5, 68(1)
+; CHECK-DAG: sth 4, 62(1)
+; CHECK-DAG: stb 3, 55(1)
+; CHECK-DAG: lha {{[0-9]+}}, 62(1)
+; CHECK-DAG: lha {{[0-9]+}}, 68(1)
+; CHECK-DAG: lbz {{[0-9]+}}, 55(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 76(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 80(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 88(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 96(1)
 }
 
 define i32 @caller2() nounwind {
@@ -139,6 +141,7 @@ entry:
   %call = call i32 @callee2(%struct.t1* byval %p1, %struct.t2* byval %p2, %struct.t3* byval %p3, %struct.t4* byval %p4, %struct.t5* byval %p5, %struct.t6* byval %p6, %struct.t7* byval %p7)
   ret i32 %call
 
+; CHECK-LABEL: caller2
 ; CHECK: stb {{[0-9]+}}, 71(1)
 ; CHECK: sth {{[0-9]+}}, 69(1)
 ; CHECK: stb {{[0-9]+}}, 87(1)
@@ -184,18 +187,19 @@ entry:
   %add13 = add nsw i32 %add11, %6
   ret i32 %add13
 
-; CHECK: std 9, 96(1)
-; CHECK: std 8, 88(1)
-; CHECK: std 7, 80(1)
-; CHECK: stw 6, 76(1)
-; CHECK: std 5, 64(1)
-; CHECK: sth 4, 62(1)
-; CHECK: stb 3, 55(1)
-; CHECK: lha {{[0-9]+}}, 62(1)
-; CHECK: lha {{[0-9]+}}, 69(1)
-; CHECK: lbz {{[0-9]+}}, 55(1)
-; CHECK: lwz {{[0-9]+}}, 76(1)
-; CHECK: lwz {{[0-9]+}}, 83(1)
-; CHECK: lwz {{[0-9]+}}, 90(1)
-; CHECK: lwz {{[0-9]+}}, 97(1)
+; CHECK-LABEL: callee2
+; CHECK-DAG: std 9, 96(1)
+; CHECK-DAG: std 8, 88(1)
+; CHECK-DAG: std 7, 80(1)
+; CHECK-DAG: stw 6, 76(1)
+; CHECK-DAG: std 5, 64(1)
+; CHECK-DAG: sth 4, 62(1)
+; CHECK-DAG: stb 3, 55(1)
+; CHECK-DAG: lha {{[0-9]+}}, 62(1)
+; CHECK-DAG: lha {{[0-9]+}}, 69(1)
+; CHECK-DAG: lbz {{[0-9]+}}, 55(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 76(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 83(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 90(1)
+; CHECK-DAG: lwz {{[0-9]+}}, 97(1)
 }
diff --git a/test/CodeGen/PowerPC/subtract_from_imm.ll b/test/CodeGen/PowerPC/subtract_from_imm.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8fa07b671a3de034e16c016dae5a21db4898d144
--- /dev/null
+++ b/test/CodeGen/PowerPC/subtract_from_imm.ll
@@ -0,0 +1,41 @@
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu < %s | FileCheck %s
+
+; Make sure that the subfic is generated iff possible
+
+define i64 @subtract_from_imm1(i64 %v) nounwind readnone {
+entry:
+; CHECK-LABEL: subtract_from_imm1
+; CHECK: subfic 3, 3, 32767
+; CHECK: blr
+  %sub = sub i64 32767, %v
+  ret i64 %sub
+}
+
+define i64 @subtract_from_imm2(i64 %v) nounwind readnone {
+entry:
+; CHECK-LABEL: subtract_from_imm2
+; CHECK-NOT: subfic
+; CHECK: blr
+  %sub = sub i64 32768, %v
+  ret i64 %sub
+}
+
+define i64 @subtract_from_imm3(i64 %v) nounwind readnone {
+entry:
+; CHECK-LABEL: subtract_from_imm3
+; CHECK: subfic 3, 3, -32768
+; CHECK: blr
+  %sub = sub i64 -32768, %v
+  ret i64 %sub
+}
+
+define i64 @subtract_from_imm4(i64 %v) nounwind readnone {
+entry:
+; CHECK-LABEL: subtract_from_imm4
+; CHECK-NOT: subfic
+; CHECK: blr
+  %sub = sub i64 -32769, %v
+  ret i64 %sub
+}
+
diff --git a/test/CodeGen/PowerPC/swaps-le-4.ll b/test/CodeGen/PowerPC/swaps-le-4.ll
index 87c6dac9630b62741060c4d66ffabab610415a35..2bf684d9d61449c83c3b5d63682bae17520cefd8 100644
--- a/test/CodeGen/PowerPC/swaps-le-4.ll
+++ b/test/CodeGen/PowerPC/swaps-le-4.ll
@@ -8,11 +8,11 @@ define void @bar() {
 entry:
   %x = alloca <2 x i64>, align 16
   %0 = bitcast <2 x i64>* %x to i8*
-  call void @llvm.lifetime.start(i64 16, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %0)
   %arrayidx = getelementptr inbounds <2 x i64>, <2 x i64>* %x, i64 0, i64 0
   store <2 x i64> <i64 0, i64 1>, <2 x i64>* %x, align 16
   call void @foo(i64* %arrayidx)
-  call void @llvm.lifetime.end(i64 16, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* %0)
   ret void
 }
 
@@ -21,7 +21,7 @@ entry:
 ; CHECK: stxvd2x
 ; CHECK-NOT: xxswapd
 
-declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
 declare void @foo(i64*)
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 
diff --git a/test/CodeGen/PowerPC/tail-dup-branch-to-fallthrough.ll b/test/CodeGen/PowerPC/tail-dup-branch-to-fallthrough.ll
index 5d03af801fc6c628945db070c1187831b3969938..0b1014571613dbeb982700a1d0056de04e2c3058 100644
--- a/test/CodeGen/PowerPC/tail-dup-branch-to-fallthrough.ll
+++ b/test/CodeGen/PowerPC/tail-dup-branch-to-fallthrough.ll
@@ -3,7 +3,7 @@ target datalayout = "E-m:e-i64:64-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
 
 declare void @f1()
 declare void @f2()
@@ -54,11 +54,11 @@ if.else:                                      ; preds = %sw.default
   br label %dup2
 
 dup1:                                         ; preds = %sw.0, %sw.1
-  call void @llvm.lifetime.end(i64 8, i8* nonnull undef) #0
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull undef) #0
   unreachable
 
 dup2:                                         ; preds = %if.then, %if.else
-  call void @llvm.lifetime.end(i64 8, i8* nonnull undef) #0
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* nonnull undef) #0
   unreachable
 }
 
diff --git a/test/CodeGen/PowerPC/tail-dup-break-cfg.ll b/test/CodeGen/PowerPC/tail-dup-break-cfg.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f19b11f2ae4ca653d09c27ae205350971ca34162
--- /dev/null
+++ b/test/CodeGen/PowerPC/tail-dup-break-cfg.ll
@@ -0,0 +1,140 @@
+; RUN: llc -O2 -o - %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-grtev4-linux-gnu"
+
+; Intended layout:
+; The code for tail-duplication during layout will produce the layout:
+; test1
+; test2
+; exit
+; body1 (with copy of test2)
+; body2
+
+;CHECK-LABEL: tail_dup_break_cfg:
+;CHECK: mr [[TAGREG:[0-9]+]], 3
+;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
+;CHECK-NEXT: bc 12, 1, [[BODY1LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: # %test2
+;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: bne 0, [[BODY2LABEL:[._0-9A-Za-z]+]]
+;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
+;CHECK: blr
+;CHECK-NEXT: [[BODY1LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: beq 0, [[EXITLABEL]]
+;CHECK-NEXT: [[BODY2LABEL:[._0-9A-Za-z]+]]:
+;CHECK: b [[EXITLABEL]]
+define void @tail_dup_break_cfg(i32 %tag) {
+entry:
+  br label %test1
+test1:
+  %tagbit1 = and i32 %tag, 1
+  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+  br i1 %tagbit1eq0, label %test2, label %body1, !prof !1 ; %test2 more likely
+body1:
+  call void @a()
+  call void @a()
+  call void @a()
+  call void @a()
+  br label %test2
+test2:
+  %tagbit2 = and i32 %tag, 2
+  %tagbit2eq0 = icmp eq i32 %tagbit2, 0
+  br i1 %tagbit2eq0, label %exit, label %body2, !prof !1 ; %exit more likely
+body2:
+  call void @b()
+  call void @b()
+  call void @b()
+  call void @b()
+  br label %exit
+exit:
+  ret void
+}
+
+; The branch weights here hint that we shouldn't tail duplicate in this case.
+;CHECK-LABEL: tail_dup_dont_break_cfg:
+;CHECK: mr [[TAGREG:[0-9]+]], 3
+;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
+;CHECK-NEXT: bc 4, 1, [[TEST2LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: # %body1
+;CHECK: [[TEST2LABEL]]: # %test2
+;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: beq 0, [[EXITLABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: # %body2
+;CHECK: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
+;CHECK: blr
+define void @tail_dup_dont_break_cfg(i32 %tag) {
+entry:
+  br label %test1
+test1:
+  %tagbit1 = and i32 %tag, 1
+  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+  br i1 %tagbit1eq0, label %test2, label %body1, !prof !1 ; %test2 more likely
+body1:
+  call void @a()
+  call void @a()
+  call void @a()
+  call void @a()
+  br label %test2
+test2:
+  %tagbit2 = and i32 %tag, 2
+  %tagbit2eq0 = icmp ne i32 %tagbit2, 0
+  br i1 %tagbit2eq0, label %body2, label %exit, !prof !3 ; %body2 more likely
+body2:
+  call void @b()
+  call void @b()
+  call void @b()
+  call void @b()
+  br label %exit
+exit:
+  ret void
+}
+declare void @a()
+declare void @b()
+declare void @c()
+declare void @d()
+
+; This function arranges for the successors of %succ to have already been laid
+; out. When we consider whether to lay out succ after bb and to tail-duplicate
+; it, %v and %ret have already been placed, so we tail-duplicate because doing
+; so removes a branch and strictly increases fallthrough.
+; CHECK-LABEL: tail_dup_no_succ
+; CHECK: # %entry
+; CHECK: # %v
+; CHECK: # %ret
+; CHECK: # %bb
+; CHECK: # %succ
+; CHECK: # %c
+; CHECK: bl c
+; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
+; CHECK: beq
+; CHECK: b
+define void @tail_dup_no_succ(i32 %tag) {
+entry:
+  %tagbit1 = and i32 %tag, 1
+  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+  br i1 %tagbit1eq0, label %v, label %bb, !prof !2 ; %v very much more likely
+bb:
+  %tagbit2 = and i32 %tag, 2
+  %tagbit2eq0 = icmp eq i32 %tagbit2, 0
+  br i1 %tagbit2eq0, label %succ, label %c, !prof !3 ; %succ more likely
+c:
+  call void @c()
+  call void @c()
+  br label %succ
+succ:
+  %tagbit3 = and i32 %tag, 4
+  %tagbit3eq0 = icmp eq i32 %tagbit3, 0
+  br i1 %tagbit3eq0, label %ret, label %v, !prof !1 ; %ret more likely
+v:
+  call void @d()
+  call void @d()
+  br label %ret
+ret:
+  ret void
+}
+
+
+!1 = !{!"branch_weights", i32 5, i32 3}
+!2 = !{!"branch_weights", i32 95, i32 5}
+!3 = !{!"branch_weights", i32 8, i32 3}
diff --git a/test/CodeGen/PowerPC/tail-dup-layout.ll b/test/CodeGen/PowerPC/tail-dup-layout.ll
index 6790aa8e944147453d986065ddae3ce9d2525548..c9b5bf8c9eeb3709cfdebea75b31fbab043b2c91 100644
--- a/test/CodeGen/PowerPC/tail-dup-layout.ll
+++ b/test/CodeGen/PowerPC/tail-dup-layout.ll
@@ -1,59 +1,59 @@
-; RUN: llc -outline-optional-branches -O2 < %s | FileCheck %s
+; RUN: llc -O2 < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-grtev4-linux-gnu"
 
 ; Intended layout:
-; The outlining flag produces the layout
+; The chain-based outlining produces the layout
 ; test1
 ; test2
 ; test3
 ; test4
-; exit
 ; optional1
 ; optional2
 ; optional3
 ; optional4
+; exit
 ; Tail duplication puts test n+1 at the end of optional n
 ; so optional1 includes a copy of test2 at the end, and branches
 ; to test3 (at the top) or falls through to optional 2.
-; The CHECK statements check for the whole string of tests and exit block,
+; The CHECK statements check for the whole string of tests
 ; and then check that the correct test has been duplicated into the end of
 ; the optional blocks and that the optional blocks are in the correct order.
-;CHECK-LABEL: f:
+;CHECK-LABEL: straight_test:
 ; test1 may have been merged with entry
 ;CHECK: mr [[TAGREG:[0-9]+]], 3
 ;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
-;CHECK-NEXT: bc 12, 1, [[OPT1LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[TEST2LABEL:[._0-9A-Za-z]+]]: # %test2
+;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: # %test2
 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
-;CHECK-NEXT: bne 0, [[OPT2LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
+;CHECK-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3
 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
-;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[TEST4LABEL:[._0-9A-Za-z]+]]: # %test4
+;CHECK-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST4LABEL:[_0-9A-Za-z]+]]: # %test4
 ;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
-;CHECK-NEXT: bne 0, .[[OPT4LABEL:[._0-9A-Za-z]+]]
-;CHECK-NEXT: [[EXITLABEL:[._0-9A-Za-z]+]]: # %exit
+;CHECK-NEXT: bne 0, .[[OPT4LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit
 ;CHECK: blr
-;CHECK-NEXT: [[OPT1LABEL]]
+;CHECK-NEXT: .[[OPT1LABEL]]:
 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
-;CHECK-NEXT: beq 0, [[TEST3LABEL]]
-;CHECK-NEXT: [[OPT2LABEL]]
+;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
+;CHECK-NEXT: .[[OPT2LABEL]]:
 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
-;CHECK-NEXT: beq 0, [[TEST4LABEL]]
-;CHECK-NEXT: [[OPT3LABEL]]
+;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
+;CHECK-NEXT: .[[OPT3LABEL]]:
 ;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
-;CHECK-NEXT: beq 0, [[EXITLABEL]]
-;CHECK-NEXT: [[OPT4LABEL]]
-;CHECK: b [[EXITLABEL]]
+;CHECK-NEXT: beq 0, .[[EXITLABEL]]
+;CHECK-NEXT: .[[OPT4LABEL]]:
+;CHECK: b .[[EXITLABEL]]
 
-define void @f(i32 %tag) {
+define void @straight_test(i32 %tag) {
 entry:
   br label %test1
 test1:
   %tagbit1 = and i32 %tag, 1
   %tagbit1eq0 = icmp eq i32 %tagbit1, 0
-  br i1 %tagbit1eq0, label %test2, label %optional1
+  br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1
 optional1:
   call void @a()
   call void @a()
@@ -63,7 +63,7 @@ optional1:
 test2:
   %tagbit2 = and i32 %tag, 2
   %tagbit2eq0 = icmp eq i32 %tagbit2, 0
-  br i1 %tagbit2eq0, label %test3, label %optional2
+  br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1
 optional2:
   call void @b()
   call void @b()
@@ -73,7 +73,7 @@ optional2:
 test3:
   %tagbit3 = and i32 %tag, 4
   %tagbit3eq0 = icmp eq i32 %tagbit3, 0
-  br i1 %tagbit3eq0, label %test4, label %optional3
+  br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1
 optional3:
   call void @c()
   call void @c()
@@ -83,7 +83,7 @@ optional3:
 test4:
   %tagbit4 = and i32 %tag, 8
   %tagbit4eq0 = icmp eq i32 %tagbit4, 0
-  br i1 %tagbit4eq0, label %exit, label %optional4
+  br i1 %tagbit4eq0, label %exit, label %optional4, !prof !1
 optional4:
   call void @d()
   call void @d()
@@ -94,7 +94,449 @@ exit:
   ret void
 }
 
+; Intended layout:
+; The chain-of-triangles based duplicating produces the layout
+; test1
+; test2
+; test3
+; test4
+; optional1
+; optional2
+; optional3
+; optional4
+; exit
+; even for 50/50 branches.
+; Tail duplication puts test n+1 at the end of optional n
+; so optional1 includes a copy of test2 at the end, and branches
+; to test3 (at the top) or falls through to optional 2.
+; The CHECK statements check for the whole string of tests
+; and then check that the correct test has been duplicated into the end of
+; the optional blocks and that the optional blocks are in the correct order.
+;CHECK-LABEL: straight_test_50:
+; test1 may have been merged with entry
+;CHECK: mr [[TAGREG:[0-9]+]], 3
+;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
+;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: # %test2
+;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3
+;CHECK-NEXT: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]]
+;CHECK-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit
+;CHECK: blr
+;CHECK-NEXT: .[[OPT1LABEL]]:
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
+;CHECK-NEXT: .[[OPT2LABEL]]:
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: beq 0, .[[EXITLABEL]]
+;CHECK-NEXT: .[[OPT3LABEL]]:
+;CHECK: b .[[EXITLABEL]]
+
+define void @straight_test_50(i32 %tag) {
+entry:
+  br label %test1
+test1:
+  %tagbit1 = and i32 %tag, 1
+  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+  br i1 %tagbit1eq0, label %test2, label %optional1, !prof !2
+optional1:
+  call void @a()
+  br label %test2
+test2:
+  %tagbit2 = and i32 %tag, 2
+  %tagbit2eq0 = icmp eq i32 %tagbit2, 0
+  br i1 %tagbit2eq0, label %test3, label %optional2, !prof !2
+optional2:
+  call void @b()
+  br label %test3
+test3:
+  %tagbit3 = and i32 %tag, 4
+  %tagbit3eq0 = icmp eq i32 %tagbit3, 0
+  br i1 %tagbit3eq0, label %exit, label %optional3, !prof !1
+optional3:
+  call void @c()
+  br label %exit
+exit:
+  ret void
+}
+
+; Intended layout:
+; The chain-based outlining produces the layout
+; entry
+; --- Begin loop ---
+; for.latch
+; for.check
+; test1
+; test2
+; test3
+; test4
+; optional1
+; optional2
+; optional3
+; optional4
+; --- End loop ---
+; exit
+; The CHECK statements check for the whole string of tests and exit block,
+; and then check that the correct test has been duplicated into the end of
+; the optional blocks and that the optional blocks are in the correct order.
+;CHECK-LABEL: loop_test:
+;CHECK: add [[TAGPTRREG:[0-9]+]], 3, 4
+;CHECK: .[[LATCHLABEL:[._0-9A-Za-z]+]]: # %for.latch
+;CHECK: addi
+;CHECK: .[[CHECKLABEL:[._0-9A-Za-z]+]]: # %for.check
+;CHECK: lwz [[TAGREG:[0-9]+]], 0([[TAGPTRREG]])
+;CHECK: # %test1
+;CHECK: andi. {{[0-9]+}}, [[TAGREG]], 1
+;CHECK-NEXT: bc 12, 1, .[[OPT1LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: # %test2
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: bne 0, .[[OPT2LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST3LABEL:[._0-9A-Za-z]+]]: # %test3
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: bne 0, .[[OPT3LABEL:[._0-9A-Za-z]+]]
+;CHECK-NEXT: .[[TEST4LABEL:[._0-9A-Za-z]+]]: # %{{(test4|optional3)}}
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
+;CHECK-NEXT: beq 0, .[[LATCHLABEL]]
+;CHECK-NEXT: b .[[OPT4LABEL:[._0-9A-Za-z]+]]
+;CHECK: [[OPT1LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 30, 30
+;CHECK-NEXT: beq 0, .[[TEST3LABEL]]
+;CHECK-NEXT: .[[OPT2LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 29, 29
+;CHECK-NEXT: beq 0, .[[TEST4LABEL]]
+;CHECK-NEXT: .[[OPT3LABEL]]
+;CHECK: rlwinm. {{[0-9]+}}, [[TAGREG]], 0, 28, 28
+;CHECK-NEXT: beq 0, .[[LATCHLABEL]]
+;CHECK: [[OPT4LABEL]]:
+;CHECK: b .[[LATCHLABEL]]
+define void @loop_test(i32* %tags, i32 %count) {
+entry:
+  br label %for.check
+for.check:
+  %count.loop = phi i32 [%count, %entry], [%count.sub, %for.latch]
+  %done.count = icmp ugt i32 %count.loop, 0
+  %tag_ptr = getelementptr inbounds i32, i32* %tags, i32 %count
+  %tag = load i32, i32* %tag_ptr
+  %done.tag = icmp eq i32 %tag, 0
+  %done = and i1 %done.count, %done.tag
+  br i1 %done, label %test1, label %exit, !prof !1
+test1:
+  %tagbit1 = and i32 %tag, 1
+  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+  br i1 %tagbit1eq0, label %test2, label %optional1, !prof !1
+optional1:
+  call void @a()
+  call void @a()
+  call void @a()
+  call void @a()
+  br label %test2
+test2:
+  %tagbit2 = and i32 %tag, 2
+  %tagbit2eq0 = icmp eq i32 %tagbit2, 0
+  br i1 %tagbit2eq0, label %test3, label %optional2, !prof !1
+optional2:
+  call void @b()
+  call void @b()
+  call void @b()
+  call void @b()
+  br label %test3
+test3:
+  %tagbit3 = and i32 %tag, 4
+  %tagbit3eq0 = icmp eq i32 %tagbit3, 0
+  br i1 %tagbit3eq0, label %test4, label %optional3, !prof !1
+optional3:
+  call void @c()
+  call void @c()
+  call void @c()
+  call void @c()
+  br label %test4
+test4:
+  %tagbit4 = and i32 %tag, 8
+  %tagbit4eq0 = icmp eq i32 %tagbit4, 0
+  br i1 %tagbit4eq0, label %for.latch, label %optional4, !prof !1
+optional4:
+  call void @d()
+  call void @d()
+  call void @d()
+  call void @d()
+  br label %for.latch
+for.latch:
+  %count.sub = sub i32 %count.loop, 1
+  br label %for.check
+exit:
+  ret void
+}
+
+; The block then2 is not unavoidable, meaning it does not dominate the exit.
+; But since it can be tail-duplicated, it should be placed as a fallthrough from
+; test2 and copied. The purpose here is to make sure that the tail-duplication
+; code is independent of the outlining code, which works by choosing the
+; "unavoidable" blocks.
+; CHECK-LABEL: avoidable_test:
+; CHECK: # %entry
+; CHECK: andi.
+; CHECK: # %test2
+; Make sure then2 falls through from test2
+; CHECK-NOT: # %{{[-_a-zA-Z0-9]+}}
+; CHECK: # %then2
+; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
+; CHECK: # %else1
+; CHECK: bl a
+; CHECK: bl a
+; Make sure then2 was copied into else1
+; CHECK: rlwinm. {{[0-9]+}}, {{[0-9]+}}, 0, 29, 29
+; CHECK: # %end1
+; CHECK: bl d
+; CHECK: # %else2
+; CHECK: bl c
+; CHECK: # %end2
+define void @avoidable_test(i32 %tag) {
+entry:
+  br label %test1
+test1:
+  %tagbit1 = and i32 %tag, 1
+  %tagbit1eq0 = icmp eq i32 %tagbit1, 0
+  br i1 %tagbit1eq0, label %test2, label %else1, !prof !1 ; %test2 more likely
+else1:
+  call void @a()
+  call void @a()
+  br label %then2
+test2:
+  %tagbit2 = and i32 %tag, 2
+  %tagbit2eq0 = icmp eq i32 %tagbit2, 0
+  br i1 %tagbit2eq0, label %then2, label %else2, !prof !1 ; %then2 more likely
+then2:
+  %tagbit3 = and i32 %tag, 4
+  %tagbit3eq0 = icmp eq i32 %tagbit3, 0
+  br i1 %tagbit3eq0, label %end2, label %end1, !prof !1 ; %end2 more likely
+else2:
+  call void @c()
+  br label %end2
+end2:
+  ret void
+end1:
+  call void @d()
+  ret void
+}
+
+; CHECK-LABEL: trellis_test
+; The number in the block labels is the expected block frequency given the
+; probabilities annotated. There is a conflict in the b;c->d;e trellis that
+; should be resolved as c->e;b->d.
+; The d;e->f;g trellis should be resolved as e->g;d->f.
+; The f;g->h;i trellis should be resolved as f->i;g->h.
+; The h;i->j;ret trellis contains a triangle edge, and should be resolved as
+; h->j->ret
+; CHECK: # %entry
+; CHECK: # %c10
+; CHECK: # %e9
+; CHECK: # %g10
+; CHECK: # %h10
+; CHECK: # %j8
+; CHECK: # %ret
+; CHECK: # %b6
+; CHECK: # %d7
+; CHECK: # %f6
+; CHECK: # %i6
+define void @trellis_test(i32 %tag) {
+entry:
+  br label %a16
+a16:
+  call void @a()
+  call void @a()
+  %tagbits.a = and i32 %tag, 3
+  %tagbits.a.eq0 = icmp eq i32 %tagbits.a, 0
+  br i1 %tagbits.a.eq0, label %c10, label %b6, !prof !1 ; 10 to 6
+c10:
+  call void @c()
+  call void @c()
+  %tagbits.c = and i32 %tag, 12
+  %tagbits.c.eq0 = icmp eq i32 %tagbits.c, 0
+  ; Both of these edges should be hotter than the other incoming edge
+  ; for e9 or d7
+  br i1 %tagbits.c.eq0, label %e9, label %d7, !prof !3 ; 6 to 4
+e9:
+  call void @e()
+  call void @e()
+  %tagbits.e = and i32 %tag, 48
+  %tagbits.e.eq0 = icmp eq i32 %tagbits.e, 0
+  br i1 %tagbits.e.eq0, label %g10, label %f6, !prof !4 ; 7 to 2
+g10:
+  call void @g()
+  call void @g()
+  %tagbits.g = and i32 %tag, 192
+  %tagbits.g.eq0 = icmp eq i32 %tagbits.g, 0
+  br i1 %tagbits.g.eq0, label %i6, label %h10, !prof !5 ; 2 to 8
+i6:
+  call void @i()
+  call void @i()
+  %tagbits.i = and i32 %tag, 768
+  %tagbits.i.eq0 = icmp eq i32 %tagbits.i, 0
+  br i1 %tagbits.i.eq0, label %ret, label %j8, !prof !2 ; balanced (3 to 3)
+b6:
+  call void @b()
+  call void @b()
+  %tagbits.b = and i32 %tag, 12
+  %tagbits.b.eq1 = icmp eq i32 %tagbits.b, 8
+  br i1 %tagbits.b.eq1, label %e9, label %d7, !prof !2 ; balanced (3 to 3)
+d7:
+  call void @d()
+  call void @d()
+  %tagbits.d = and i32 %tag, 48
+  %tagbits.d.eq1 = icmp eq i32 %tagbits.d, 32
+  br i1 %tagbits.d.eq1, label %g10, label %f6, !prof !6 ; 3 to 4
+f6:
+  call void @f()
+  call void @f()
+  %tagbits.f = and i32 %tag, 192
+  %tagbits.f.eq1 = icmp eq i32 %tagbits.f, 128
+  br i1 %tagbits.f.eq1, label %i6, label %h10, !prof !7 ; 4 to 2
+h10:
+  call void @h()
+  call void @h()
+  %tagbits.h = and i32 %tag, 768
+  %tagbits.h.eq1 = icmp eq i32 %tagbits.h, 512
+  br i1 %tagbits.h.eq1, label %ret, label %j8, !prof !2 ; balanced (5 to 5)
+j8:
+  call void @j()
+  call void @j()
+  br label %ret
+ret:
+  ret void
+}
+
+; Verify that we still consider tail-duplication opportunities if we find a
+; triangle trellis. Here D->F->G is the triangle, and D;E are both predecessors
+; of both F and G. The basic trellis algorithm picks the F->G edge, but after
+; checking, it's profitable to duplicate G into F. The weights here are not
+; really important. They are there to help make the test stable.
+; CHECK-LABEL: trellis_then_dup_test
+; CHECK: # %entry
+; CHECK: # %b
+; CHECK: # %d
+; CHECK: # %g
+; CHECK: # %ret1
+; CHECK: # %c
+; CHECK: # %e
+; CHECK: # %f
+; CHECK: # %ret2
+; CHECK: # %ret
+define void @trellis_then_dup_test(i32 %tag) {
+entry:
+  br label %a
+a:
+  call void @a()
+  call void @a()
+  %tagbits.a = and i32 %tag, 3
+  %tagbits.a.eq0 = icmp eq i32 %tagbits.a, 0
+  br i1 %tagbits.a.eq0, label %b, label %c, !prof !1 ; 5 to 3
+b:
+  call void @b()
+  call void @b()
+  %tagbits.b = and i32 %tag, 12
+  %tagbits.b.eq1 = icmp eq i32 %tagbits.b, 8
+  br i1 %tagbits.b.eq1, label %d, label %e, !prof !1 ; 5 to 3
+d:
+  call void @d()
+  call void @d()
+  %tagbits.d = and i32 %tag, 48
+  %tagbits.d.eq1 = icmp eq i32 %tagbits.d, 32
+  br i1 %tagbits.d.eq1, label %g, label %f, !prof !1 ; 5 to 3
+f:
+  call void @f()
+  call void @f()
+  br label %g
+g:
+  %tagbits.g = and i32 %tag, 192
+  %tagbits.g.eq0 = icmp eq i32 %tagbits.g, 0
+  br i1 %tagbits.g.eq0, label %ret1, label %ret2, !prof !2 ; balanced
+c:
+  call void @c()
+  call void @c()
+  %tagbits.c = and i32 %tag, 12
+  %tagbits.c.eq0 = icmp eq i32 %tagbits.c, 0
+  br i1 %tagbits.c.eq0, label %d, label %e, !prof !1 ; 5 to 3
+e:
+  call void @e()
+  call void @e()
+  %tagbits.e = and i32 %tag, 48
+  %tagbits.e.eq0 = icmp eq i32 %tagbits.e, 0
+  br i1 %tagbits.e.eq0, label %g, label %f, !prof !1 ; 5 to 3
+ret1:
+  call void @a()
+  br label %ret
+ret2:
+  call void @b()
+  br label %ret
+ret:
+  ret void
+}
+
+; Verify that we do not misidentify a trellis as a triangle trellis when it is
+; not really a triangle.
+; CHECK-LABEL: trellis_no_triangle
+; CHECK: # %entry
+; CHECK: # %b
+; CHECK: # %d
+; CHECK: # %ret
+; CHECK: # %c
+; CHECK: # %e
+define void @trellis_no_triangle(i32 %tag) {
+entry:
+  br label %a
+a:
+  call void @a()
+  call void @a()
+  %tagbits.a = and i32 %tag, 3
+  %tagbits.a.eq0 = icmp eq i32 %tagbits.a, 0
+  br i1 %tagbits.a.eq0, label %b, label %c, !prof !8 ; 98 to 2
+b:
+  call void @b()
+  call void @b()
+  %tagbits.b = and i32 %tag, 12
+  %tagbits.b.eq1 = icmp eq i32 %tagbits.b, 8
+  br i1 %tagbits.b.eq1, label %d, label %e, !prof !9 ; 97 to 1
+d:
+  call void @d()
+  call void @d()
+  %tagbits.d = and i32 %tag, 48
+  %tagbits.d.eq1 = icmp eq i32 %tagbits.d, 32
+  br i1 %tagbits.d.eq1, label %ret, label %e, !prof !10 ; 96 to 2
+c:
+  call void @c()
+  call void @c()
+  %tagbits.c = and i32 %tag, 12
+  %tagbits.c.eq0 = icmp eq i32 %tagbits.c, 0
+  br i1 %tagbits.c.eq0, label %d, label %e, !prof !2 ; 1 to 1
+e:
+  call void @e()
+  call void @e()
+  br label %ret
+ret:
+  call void @f()
+  ret void
+}
+
 declare void @a()
 declare void @b()
 declare void @c()
 declare void @d()
+declare void @e()
+declare void @f()
+declare void @g()
+declare void @h()
+declare void @i()
+declare void @j()
+
+!1 = !{!"branch_weights", i32 5, i32 3}
+!2 = !{!"branch_weights", i32 50, i32 50}
+!3 = !{!"branch_weights", i32 6, i32 4}
+!4 = !{!"branch_weights", i32 7, i32 2}
+!5 = !{!"branch_weights", i32 2, i32 8}
+!6 = !{!"branch_weights", i32 3, i32 4}
+!7 = !{!"branch_weights", i32 4, i32 2}
+!8 = !{!"branch_weights", i32 98, i32 2}
+!9 = !{!"branch_weights", i32 97, i32 1}
+!10 = !{!"branch_weights", i32 96, i32 2}
diff --git a/test/CodeGen/PowerPC/toc-load-sched-bug.ll b/test/CodeGen/PowerPC/toc-load-sched-bug.ll
index e83124cbb990f90b7c76035f3294ed82b2350118..21ccbf6f1ead043396ae9cb939a8ab94f18dcf5f 100644
--- a/test/CodeGen/PowerPC/toc-load-sched-bug.ll
+++ b/test/CodeGen/PowerPC/toc-load-sched-bug.ll
@@ -223,7 +223,7 @@ if.then:                                          ; preds = %_ZNK4llvm7ErrorOrIS
   %10 = getelementptr inbounds %"class.std::allocator", %"class.std::allocator"* %ref.tmp.i.i2.i, i64 0, i32 0
   %11 = bitcast %"class.llvm::SMDiagnostic"* %ref.tmp to i8*
   call void @llvm.memset.p0i8.i64(i8* %11, i8 0, i64 16, i32 8, i1 false) #3
-  call void @llvm.lifetime.start(i64 1, i8* %10) #3
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %10) #3
   %tobool.i.i4.i = icmp eq i8* %4, null
   br i1 %tobool.i.i4.i, label %if.then.i.i6.i, label %if.end.i.i8.i
 
@@ -237,7 +237,7 @@ if.end.i.i8.i:                                    ; preds = %if.then
   br label %_ZNK4llvm9StringRefcvSsEv.exit9.i
 
 _ZNK4llvm9StringRefcvSsEv.exit9.i:                ; preds = %if.end.i.i8.i, %if.then.i.i6.i
-  call void @llvm.lifetime.end(i64 1, i8* %10) #3
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %10) #3
   %LineNo.i = getelementptr inbounds %"class.llvm::SMDiagnostic", %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 3
   store i32 -1, i32* %LineNo.i, align 8, !tbaa !14
   %ColumnNo.i = getelementptr inbounds %"class.llvm::SMDiagnostic", %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 4
@@ -246,7 +246,7 @@ _ZNK4llvm9StringRefcvSsEv.exit9.i:                ; preds = %if.end.i.i8.i, %if.
   store i32 0, i32* %Kind.i, align 8, !tbaa !22
   %Message.i = getelementptr inbounds %"class.llvm::SMDiagnostic", %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 6
   %12 = getelementptr inbounds %"class.std::allocator", %"class.std::allocator"* %ref.tmp.i.i.i, i64 0, i32 0
-  call void @llvm.lifetime.start(i64 1, i8* %12) #3
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %12) #3
   %tobool.i.i.i = icmp eq i8* %8, null
   br i1 %tobool.i.i.i, label %if.then.i.i.i, label %if.end.i.i.i
 
@@ -260,7 +260,7 @@ if.end.i.i.i:                                     ; preds = %_ZNK4llvm9StringRef
   br label %_ZN4llvm12SMDiagnosticC2ENS_9StringRefENS_9SourceMgr8DiagKindES1_.exit
 
 _ZN4llvm12SMDiagnosticC2ENS_9StringRefENS_9SourceMgr8DiagKindES1_.exit: ; preds = %if.then.i.i.i, %if.end.i.i.i
-  call void @llvm.lifetime.end(i64 1, i8* %12) #3
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %12) #3
   %_M_p.i.i.i.i.i = getelementptr inbounds %"class.llvm::SMDiagnostic", %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 7, i32 0, i32 0
   store i8* bitcast (i64* getelementptr inbounds ([0 x i64], [0 x i64]* @_ZNSs4_Rep20_S_empty_rep_storageE, i64 0, i64 3) to i8*), i8** %_M_p.i.i.i.i.i, align 8, !tbaa !13
   %Ranges.i = getelementptr inbounds %"class.llvm::SMDiagnostic", %"class.llvm::SMDiagnostic"* %ref.tmp, i64 0, i32 8
@@ -320,7 +320,7 @@ _ZN4llvm12SMDiagnosticaSEOS0_.exit:               ; preds = %_ZN4llvm12SMDiagnos
   %call2.i.i42 = call dereferenceable(48) %"class.llvm::SmallVectorImpl.85"* @_ZN4llvm15SmallVectorImplINS_7SMFixItEEaSEOS2_(%"class.llvm::SmallVectorImpl.85"* %24, %"class.llvm::SmallVectorImpl.85"* dereferenceable(48) %25) #3
   call void @_ZN4llvm12SMDiagnosticD2Ev(%"class.llvm::SMDiagnostic"* %ref.tmp) #3
   %26 = getelementptr inbounds %"class.std::allocator", %"class.std::allocator"* %ref.tmp.i.i, i64 0, i32 0
-  call void @llvm.lifetime.start(i64 1, i8* %26) #3
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %26) #3
   %27 = bitcast i8* %arrayidx.i.i.i36 to %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Rep"*
   %cmp.i.i.i = icmp eq i8* %arrayidx.i.i.i36, bitcast ([0 x i64]* @_ZNSs4_Rep20_S_empty_rep_storageE to i8*)
   br i1 %cmp.i.i.i, label %_ZNSsD1Ev.exit, label %if.then.i.i.i45, !prof !28
@@ -332,11 +332,11 @@ if.then.i.i.i45:                                  ; preds = %_ZN4llvm12SMDiagnos
 
 if.then.i.i.i.i:                                  ; preds = %if.then.i.i.i45
   %.atomicdst.i.i.i.i.i.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast = bitcast i32* %.atomicdst.i.i.i.i.i to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %.atomicdst.i.i.i.i.i.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %.atomicdst.i.i.i.i.i.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast)
   %29 = atomicrmw volatile add i32* %28, i32 -1 acq_rel
   store i32 %29, i32* %.atomicdst.i.i.i.i.i, align 4
   %.atomicdst.i.i.i.i.i.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..atomicdst.0..atomicdst.0..i.i.i.i.i = load volatile i32, i32* %.atomicdst.i.i.i.i.i, align 4
-  call void @llvm.lifetime.end(i64 4, i8* %.atomicdst.i.i.i.i.i.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %.atomicdst.i.i.i.i.i.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast)
   br label %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i
 
 if.else.i.i.i.i:                                  ; preds = %if.then.i.i.i45
@@ -355,9 +355,9 @@ if.then4.i.i.i:                                   ; preds = %_ZN9__gnu_cxxL27__e
   br label %_ZNSsD1Ev.exit
 
 _ZNSsD1Ev.exit:                                   ; preds = %_ZN4llvm12SMDiagnosticaSEOS0_.exit, %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i, %if.then4.i.i.i
-  call void @llvm.lifetime.end(i64 1, i8* %26) #3
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %26) #3
   %31 = getelementptr inbounds %"class.std::allocator", %"class.std::allocator"* %ref.tmp.i.i47, i64 0, i32 0
-  call void @llvm.lifetime.start(i64 1, i8* %31) #3
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %31) #3
   %_M_p.i.i.i.i48 = getelementptr inbounds %"class.std::basic_string", %"class.std::basic_string"* %ref.tmp5, i64 0, i32 0, i32 0
   %32 = load i8*, i8** %_M_p.i.i.i.i48, align 8, !tbaa !1
   %arrayidx.i.i.i49 = getelementptr inbounds i8, i8* %32, i64 -24
@@ -372,11 +372,11 @@ if.then.i.i.i52:                                  ; preds = %_ZNSsD1Ev.exit
 
 if.then.i.i.i.i55:                                ; preds = %if.then.i.i.i52
   %.atomicdst.i.i.i.i.i46.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast = bitcast i32* %.atomicdst.i.i.i.i.i46 to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %.atomicdst.i.i.i.i.i46.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %.atomicdst.i.i.i.i.i46.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast)
   %35 = atomicrmw volatile add i32* %34, i32 -1 acq_rel
   store i32 %35, i32* %.atomicdst.i.i.i.i.i46, align 4
   %.atomicdst.i.i.i.i.i46.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..atomicdst.0..atomicdst.0..i.i.i.i.i54 = load volatile i32, i32* %.atomicdst.i.i.i.i.i46, align 4
-  call void @llvm.lifetime.end(i64 4, i8* %.atomicdst.i.i.i.i.i46.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %.atomicdst.i.i.i.i.i46.0..atomicdst.i.i.i.i.0..atomicdst.i.i.i.0..atomicdst.i.i.0..atomicdst.i.0..sroa_cast)
   br label %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i60
 
 if.else.i.i.i.i57:                                ; preds = %if.then.i.i.i52
@@ -395,7 +395,7 @@ if.then4.i.i.i61:                                 ; preds = %_ZN9__gnu_cxxL27__e
   br label %_ZNSsD1Ev.exit62
 
 _ZNSsD1Ev.exit62:                                 ; preds = %_ZNSsD1Ev.exit, %_ZN9__gnu_cxxL27__exchange_and_add_dispatchEPii.exit.i.i.i60, %if.then4.i.i.i61
-  call void @llvm.lifetime.end(i64 1, i8* %31) #3
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %31) #3
   br label %cleanup
 
 cond.false.i.i:                                   ; preds = %_ZNK4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEE8getErrorEv.exit
@@ -438,10 +438,10 @@ _ZN4llvm7ErrorOrISt10unique_ptrINS_12MemoryBufferESt14default_deleteIS2_EEED2Ev.
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #3
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #3
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #3
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #3
 
 ; Function Attrs: noreturn nounwind
 declare void @__assert_fail(i8*, i8*, i32 zeroext, i8*) #4
diff --git a/test/CodeGen/PowerPC/vec_cmp.ll b/test/CodeGen/PowerPC/vec_cmp.ll
index 0eaac554aa4d0ddd51578d3ea77682408e011d58..88de9a17d91e2251fcf32f29b298892441ba2a18 100644
--- a/test/CodeGen/PowerPC/vec_cmp.ll
+++ b/test/CodeGen/PowerPC/vec_cmp.ll
@@ -54,7 +54,7 @@ entry:
 }
 ; CHECK-LABEL:     v16si8_cmp_ne:
 ; CHECK:     vcmpequb [[RET:[0-9]+]], 2, 3
-; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+; CHECK-NEXT: vnot     2, [[RET]]
 
 define <16 x i8> @v16si8_cmp_le(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
 entry:
@@ -64,7 +64,7 @@ entry:
 }
 ; CHECK-LABEL:      v16si8_cmp_le:
 ; CHECK:      vcmpgtsb [[RET:[0-9]+]], 2, 3
-; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+; CHECK-NEXT: vnot     2, [[RET]]
 
 define <16 x i8> @v16ui8_cmp_le(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
 entry:
@@ -74,7 +74,7 @@ entry:
 }
 ; CHECK-LABEL:      v16ui8_cmp_le:
 ; CHECK:      vcmpgtub [[RET:[0-9]+]], 2, 3
-; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+; CHECK-NEXT: vnot     2, [[RET]]
 
 define <16 x i8> @v16si8_cmp_lt(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
 entry:
@@ -120,7 +120,7 @@ entry:
 }
 ; CHECK-LABEL:      v16si8_cmp_ge:
 ; CHECK:      vcmpgtsb [[RET:[0-9]+]], 3, 2
-; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+; CHECK-NEXT: vnot     2, [[RET]]
 
 define <16 x i8> @v16ui8_cmp_ge(<16 x i8> %x, <16 x i8> %y) nounwind readnone {
 entry:
@@ -130,7 +130,7 @@ entry:
 }
 ; CHECK-LABEL:      v16ui8_cmp_ge:
 ; CHECK:      vcmpgtub [[RET:[0-9]+]], 3, 2
-; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+; CHECK-NEXT: vnot     2, [[RET]]
 
 
 define <32 x i8> @v32si8_cmp(<32 x i8> %x, <32 x i8> %y) nounwind readnone {
@@ -180,7 +180,7 @@ entry:
 }
 ; CHECK-LABEL:      v8si16_cmp_ne:
 ; CHECK:      vcmpequh [[RET:[0-9]+]], 2, 3
-; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+; CHECK-NEXT: vnot     2, [[RET]]
 
 define <8 x i16> @v8si16_cmp_le(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
 entry:
@@ -190,7 +190,7 @@ entry:
 }
 ; CHECK-LABEL:      v8si16_cmp_le:
 ; CHECK:      vcmpgtsh [[RET:[0-9]+]], 2, 3
-; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+; CHECK-NEXT: vnot     2, [[RET]]
 
 define <8 x i16> @v8ui16_cmp_le(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
 entry:
@@ -200,7 +200,7 @@ entry:
 }
 ; CHECK-LABEL:      v8ui16_cmp_le:
 ; CHECK:      vcmpgtuh [[RET:[0-9]+]], 2, 3
-; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+; CHECK-NEXT: vnot     2, [[RET]]
 
 define <8 x i16> @v8si16_cmp_lt(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
 entry:
@@ -246,7 +246,7 @@ entry:
 }
 ; CHECK-LABEL:      v8si16_cmp_ge:
 ; CHECK:      vcmpgtsh [[RET:[0-9]+]], 3, 2
-; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+; CHECK-NEXT: vnot     2, [[RET]]
 
 define <8 x i16> @v8ui16_cmp_ge(<8 x i16> %x, <8 x i16> %y) nounwind readnone {
 entry:
@@ -256,7 +256,7 @@ entry:
 }
 ; CHECK-LABEL:      v8ui16_cmp_ge:
 ; CHECK:      vcmpgtuh [[RET:[0-9]+]], 3, 2
-; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+; CHECK-NEXT: vnot     2, [[RET]]
 
 
 define <16 x i16> @v16si16_cmp(<16 x i16> %x, <16 x i16> %y) nounwind readnone {
@@ -309,7 +309,7 @@ entry:
 }
 ; CHECK-LABEL:      v4si32_cmp_ne:
 ; CHECK:      vcmpequw [[RCMP:[0-9]+]], 2, 3
-; CHECK-NEXT: vnor     2, [[RCMP]], [[RCMP]]
+; CHECK-NEXT: vnot     2, [[RCMP]]
 
 define <4 x i32> @v4si32_cmp_le(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
 entry:
@@ -319,7 +319,7 @@ entry:
 }
 ; CHECK-LABEL:      v4si32_cmp_le:
 ; CHECK:      vcmpgtsw [[RET:[0-9]+]], 2, 3
-; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+; CHECK-NEXT: vnot     2, [[RET]]
 
 define <4 x i32> @v4ui32_cmp_le(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
 entry:
@@ -329,7 +329,7 @@ entry:
 }
 ; CHECK-LABEL:      v4ui32_cmp_le:
 ; CHECK:      vcmpgtuw [[RET:[0-9]+]], 2, 3
-; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+; CHECK-NEXT: vnot     2, [[RET]]
 
 define <4 x i32> @v4si32_cmp_lt(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
 entry:
@@ -375,7 +375,7 @@ entry:
 }
 ; CHECK-LABEL:      v4si32_cmp_ge:
 ; CHECK:      vcmpgtsw [[RET:[0-9]+]], 3, 2
-; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+; CHECK-NEXT: vnot     2, [[RET]]
 
 define <4 x i32> @v4ui32_cmp_ge(<4 x i32> %x, <4 x i32> %y) nounwind readnone {
 entry:
@@ -385,7 +385,7 @@ entry:
 }
 ; CHECK-LABEL:      v4ui32_cmp_ge:
 ; CHECK:      vcmpgtuw [[RET:[0-9]+]], 3, 2
-; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+; CHECK-NEXT: vnot     2, [[RET]]
 
 
 define <8 x i32> @v8si32_cmp(<8 x i32> %x, <8 x i32> %y) nounwind readnone {
@@ -458,7 +458,7 @@ entry:
 }
 ; CHECK-LABEL:      v4f32_cmp_ne:
 ; CHECK:      vcmpeqfp [[RET:[0-9]+]], 2, 3
-; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+; CHECK-NEXT: vnot     2, [[RET]]
 
 define <4 x float> @v4f32_cmp_le(<4 x float> %x, <4 x float> %y) nounwind readnone {
 entry:
@@ -509,7 +509,7 @@ entry:
 }
 ; CHECK-LABEL: v4f32_cmp_ule:
 ; CHECK:      vcmpgtfp [[RET:[0-9]+]], 2, 3
-; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+; CHECK-NEXT: vnot     2, [[RET]]
 
 define <4 x float> @v4f32_cmp_ult(<4 x float> %x, <4 x float> %y) nounwind readnone {
 entry:
@@ -520,7 +520,7 @@ entry:
 }
 ; CHECK-LABEL: v4f32_cmp_ult:
 ; CHECK:      vcmpgefp [[RET:[0-9]+]], 2, 3
-; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+; CHECK-NEXT: vnot     2, [[RET]]
 
 define <4 x float> @v4f32_cmp_uge(<4 x float> %x, <4 x float> %y) nounwind readnone {
 entry:
@@ -531,7 +531,7 @@ entry:
 }
 ; CHECK-LABEL: v4f32_cmp_uge:
 ; CHECK:      vcmpgtfp [[RET:[0-9]+]], 3, 2
-; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+; CHECK-NEXT: vnot     2, [[RET]]
 
 define <4 x float> @v4f32_cmp_ugt(<4 x float> %x, <4 x float> %y) nounwind readnone {
 entry:
@@ -542,7 +542,7 @@ entry:
 }
 ; CHECK-LABEL: v4f32_cmp_ugt:
 ; CHECK:      vcmpgefp [[RET:[0-9]+]], 3, 2
-; CHECK-NEXT: vnor     2, [[RET]], [[RET]]
+; CHECK-NEXT: vnot     2, [[RET]]
 
 
 define <8 x float> @v8f32_cmp(<8 x float> %x, <8 x float> %y) nounwind readnone {
diff --git a/test/CodeGen/PowerPC/vsx-args.ll b/test/CodeGen/PowerPC/vsx-args.ll
index 252f9b360b968b239c4926f1c8dead562e8933db..7fa31aea84ba4e85f85c0a2e6e98a82607a9a796 100644
--- a/test/CodeGen/PowerPC/vsx-args.ll
+++ b/test/CodeGen/PowerPC/vsx-args.ll
@@ -13,10 +13,10 @@ entry:
   ret <2 x double> %v
 
 ; CHECK-LABEL: @main
-; CHECK-DAG: vor [[V:[0-9]+]], 2, 2
-; CHECK-DAG: vor 2, 3, 3
-; CHECK-DAG: vor 3, 4, 4
-; CHECK-DAG: vor 4, [[V]], [[V]]
+; CHECK-DAG: vmr [[V:[0-9]+]], 2
+; CHECK-DAG: vmr 2, 3
+; CHECK-DAG: vmr 3, 4
+; CHECK-DAG: vmr 4, [[V]]
 ; CHECK: bl sv
 ; CHECK: lxvd2x [[VC:[0-9]+]],
 ; CHECK: xvadddp 34, 34, [[VC]]
@@ -24,8 +24,8 @@ entry:
 
 ; CHECK-FISL-LABEL: @main
 ; CHECK-FISL: stxvd2x 34
-; CHECK-FISL: vor 2, 3, 3
-; CHECK-FISL: vor 3, 4, 4
+; CHECK-FISL: vmr 2, 3
+; CHECK-FISL: vmr 3, 4
 ; CHECK-FISL: lxvd2x 36
 ; CHECK-FISL: bl sv
 ; CHECK-FISL: lxvd2x [[VC:[0-9]+]],
diff --git a/test/CodeGen/PowerPC/vsx-infl-copy1.ll b/test/CodeGen/PowerPC/vsx-infl-copy1.ll
index 592f85e2bcaf610e13a87b9d649cff00fc7d6b78..1d6718279a0d9bab916c4eb0f7f9fd27c520581e 100644
--- a/test/CodeGen/PowerPC/vsx-infl-copy1.ll
+++ b/test/CodeGen/PowerPC/vsx-infl-copy1.ll
@@ -11,15 +11,15 @@ entry:
   br label %vector.body
 
 ; CHECK-LABEL: @_Z8example9Pj
-; CHECK: vor
-; CHECK: vor
-; CHECK: vor
-; CHECK: vor
-; CHECK: vor
-; CHECK: vor
-; CHECK: vor
-; CHECK: vor
-; CHECK: vor
+; CHECK: vmr
+; CHECK: vmr
+; CHECK: vmr
+; CHECK: vmr
+; CHECK: vmr
+; CHECK: vmr
+; CHECK: vmr
+; CHECK: vmr
+; CHECK: vmr
 
 vector.body:                                      ; preds = %vector.body, %entry
   %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
diff --git a/test/CodeGen/SPARC/mature-mc-support.ll b/test/CodeGen/SPARC/mature-mc-support.ll
index 4ed33098051de069ada2479cf1285689ec585046..3951ddd604c48be35543b7493943c34a0386b42a 100644
--- a/test/CodeGen/SPARC/mature-mc-support.ll
+++ b/test/CodeGen/SPARC/mature-mc-support.ll
@@ -17,4 +17,4 @@
 
 module asm "	.this_directive_is_very_unlikely_to_exist"
 
-; CHECK: LLVM ERROR: Error parsing inline asm
+; CHECK: error: unknown directive
diff --git a/test/CodeGen/SPARC/reserved-regs.ll b/test/CodeGen/SPARC/reserved-regs.ll
index fe208015827be2f513bdf5cc435b0f4e5fabad7a..c5a124f538f9c7afd4266cb5cda6b855a3dd0c72 100644
--- a/test/CodeGen/SPARC/reserved-regs.ll
+++ b/test/CodeGen/SPARC/reserved-regs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=sparc  < %s | FileCheck %s
+; RUN: llc -march=sparc -verify-machineinstrs < %s | FileCheck %s
 
 @g = common global [32 x i32] zeroinitializer, align 16
 @h = common global [16 x i64] zeroinitializer, align 16
@@ -6,6 +6,7 @@
 ;; Ensures that we don't use registers which are supposed to be reserved.
 
 ; CHECK-LABEL: use_all_i32_regs:
+; CHECK: save %sp
 ; CHECK-NOT: %g0
 ; CHECK-NOT: %g1
 ; CHECK-NOT: %g5
@@ -86,6 +87,7 @@ entry:
 
 
 ; CHECK-LABEL: use_all_i64_regs:
+; CHECK: save %sp
 ; CHECK-NOT: %g0
 ; CHECK-NOT: %g1
 ; CHECK-NOT: %g4
diff --git a/test/CodeGen/SPARC/sjlj.ll b/test/CodeGen/SPARC/sjlj.ll
index 3bf583aa475431e0e242cbec9f8380884a14bc0c..459630f9255fc92818d93ffff75fcb9b26f7027e 100755
--- a/test/CodeGen/SPARC/sjlj.ll
+++ b/test/CodeGen/SPARC/sjlj.ll
@@ -66,13 +66,18 @@ return:                                           ; preds = %if.end, %if.then
 ; CHECK:  ba   .LBB1_1
 ; CHECK:  nop
 ; CHECK:.LBB1_1:                                ! %entry
-; CHECK:  ba   .LBB1_3
 ; CHECK:  mov  %g0, %i0
+; CHECK:                                        ! %entry
+; CHECK:  cmp %i0, 0
+; CHECK:  be   .LBB1_5
+; CHECK:  nop
+; CHECK:.LBB1_4:
+; CHECK:  mov  1, %i0
+; CHECK:  ba .LBB1_6
 ; CHECK:.LBB1_2:                                ! Block address taken
 ; CHECK:  mov  1, %i0
-; CHECK:.LBB1_3:                                ! %entry
 ; CHECK:  cmp %i0, 0
-; CHECK:  be   .LBB1_5
+; CHECK:  bne  .LBB1_4
 ; CHECK:  nop
 }
 declare i8* @llvm.frameaddress(i32) #2
diff --git a/test/CodeGen/SystemZ/DAGCombine_trunc_extract.ll b/test/CodeGen/SystemZ/DAGCombine_trunc_extract.ll
new file mode 100644
index 0000000000000000000000000000000000000000..63c1c6363189ffaf1e468c70036e15e03fa45c62
--- /dev/null
+++ b/test/CodeGen/SystemZ/DAGCombine_trunc_extract.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=s390x-linux-gnu -mcpu=zEC12 < %s  | FileCheck %s
+;
+; Check that DAGCombiner doesn't crash in SystemZ combineTruncateExtract()
+; when handling EXTRACT_VECTOR_ELT without vector support.
+
+define void @autogen_SD21598(<2 x i8> %Arg) {
+; CHECK:	stc	%r3, 0(%r1)
+; CHECK:	j	.LBB0_1
+
+entry:
+  br label %loop
+
+loop:                                            ; preds = %loop, %entry
+  %Shuff = shufflevector <2 x i8> undef, <2 x i8> %Arg, <2 x i32> <i32 3, i32 1>
+  %E = extractelement <2 x i8> %Shuff, i32 0
+  store i8 %E, i8* undef
+  br label %loop
+}
diff --git a/test/CodeGen/SystemZ/DAGCombiner_illegal_BUILD_VECTOR.ll b/test/CodeGen/SystemZ/DAGCombiner_illegal_BUILD_VECTOR.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3e5757c902cc5fe89dfe12d7b0e9b709066e5429
--- /dev/null
+++ b/test/CodeGen/SystemZ/DAGCombiner_illegal_BUILD_VECTOR.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+;
+; Check that DAGCombiner does not crash after producing an illegal
+; BUILD_VECTOR node.
+
+
+define void @pr32422() {
+; CHECK:        cdbr    %f0, %f0
+; CHECK:        jo      .LBB0_1
+
+BB:
+  %I = insertelement <8 x i8> zeroinitializer, i8 -95, i32 3
+  %I8 = insertelement <8 x i8> zeroinitializer, i8 -119, i32 2
+  %FC = uitofp <8 x i8> %I8 to <8 x float>
+  %Cmp18 = fcmp uno <8 x float> zeroinitializer, %FC
+  %I22 = insertelement <8 x i1> %Cmp18, i1 true, i32 5
+  br label %CF
+
+CF:                                               ; preds = %CF, %BB
+  %Cmp40 = fcmp uno double 0xC663C682E9619F00, undef
+  br i1 %Cmp40, label %CF, label %CF353
+
+CF353:                                            ; preds = %CF
+  %E195 = extractelement <8 x i1> %I22, i32 4
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/expand-zext-pseudo.ll b/test/CodeGen/SystemZ/expand-zext-pseudo.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1ee42885cb9c464707925ec44ddf33e3da9a19a8
--- /dev/null
+++ b/test/CodeGen/SystemZ/expand-zext-pseudo.ll
@@ -0,0 +1,132 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -verify-machineinstrs | FileCheck %s
+;
+; Test that a def operand of super-reg is not dropped during post RA pseudo
+; expansion in expandZExtPseudo().
+
+define void @fun_llvm_stress_reduced(i8*, i32*, i64*, i32) {
+; CHECK: .text
+BB:
+  %A = alloca i32
+  %Sl24 = select i1 undef, i32* %1, i32* %1
+  %L26 = load i16, i16* undef
+  %L32 = load i32, i32* %Sl24
+  br label %CF847
+
+CF847:                                            ; preds = %CF878, %BB
+  %L61 = load i16, i16* undef
+  br label %CF878
+
+CF878:                                            ; preds = %CF847
+  %PC66 = bitcast i32* %Sl24 to double*
+  %Sl67 = select i1 undef, <2 x i32> undef, <2 x i32> undef
+  %Cmp68 = icmp ugt i32 undef, %3
+  br i1 %Cmp68, label %CF847, label %CF863
+
+CF863:                                            ; preds = %CF878
+  %L84 = load i16, i16* undef
+  br label %CF825
+
+CF825:                                            ; preds = %CF825, %CF863
+  %Sl105 = select i1 undef, i1 undef, i1 undef
+  br i1 %Sl105, label %CF825, label %CF856
+
+CF856:                                            ; preds = %CF856, %CF825
+  %Cmp114 = icmp ult i16 -24837, %L61
+  br i1 %Cmp114, label %CF856, label %CF875
+
+CF875:                                            ; preds = %CF856
+  %Shuff124 = shufflevector <2 x i32> undef, <2 x i32> undef, <2 x i32> <i32 1, i32 3>
+  %PC126 = bitcast i32* %A to i64*
+  br label %CF827
+
+CF827:                                            ; preds = %CF923, %CF911, %CF875
+  %Sl142 = select i1 undef, i64 undef, i64 -1
+  %B148 = sdiv i32 409071, 409071
+  %E153 = extractelement <2 x i32> %Shuff124, i32 1
+  br label %CF911
+
+CF911:                                            ; preds = %CF827
+  br i1 undef, label %CF827, label %CF867
+
+CF867:                                            ; preds = %CF911
+  br label %CF870
+
+CF870:                                            ; preds = %CF870, %CF867
+  store i8 0, i8* %0
+  %FC176 = fptoui double undef to i1
+  br i1 %FC176, label %CF870, label %CF923
+
+CF923:                                            ; preds = %CF870
+  %L179 = load i16, i16* undef
+  %Sl191 = select i1 undef, i64* %PC126, i64* %PC126
+  br i1 false, label %CF827, label %CF828
+
+CF828:                                            ; preds = %CF905, %CF923
+  %B205 = urem i16 -7553, undef
+  %E209 = extractelement <2 x i32> %Sl67, i32 1
+  %Cmp215 = icmp ugt i16 %L179, 0
+  br label %CF905
+
+CF905:                                            ; preds = %CF828
+  %E231 = extractelement <4 x i1> undef, i32 1
+  br i1 %E231, label %CF828, label %CF829
+
+CF829:                                            ; preds = %CF909, %CF829, %CF905
+  %B234 = udiv i16 %L26, %L84
+  br i1 undef, label %CF829, label %CF894
+
+CF894:                                            ; preds = %CF894, %CF829
+  store i64 %Sl142, i64* %Sl191
+  %Sl241 = select i1 %Cmp114, i1 false, i1 %Cmp215
+  br i1 %Sl241, label %CF894, label %CF907
+
+CF907:                                            ; preds = %CF894
+  %B247 = udiv i32 0, %E153
+  %PC248 = bitcast i64* %2 to i8*
+  br label %CF909
+
+CF909:                                            ; preds = %CF907
+  store i1 %FC176, i1* undef
+  %Cmp263 = icmp ugt i1 undef, %Sl241
+  br i1 %Cmp263, label %CF829, label %CF830
+
+CF830:                                            ; preds = %CF909
+  %B304 = urem i16 %L84, %B205
+  %I311 = insertelement <2 x i32> %Shuff124, i32 %B247, i32 1
+  store i8 0, i8* %0
+  %Sl373 = select i1 %Cmp68, i32 0, i32 %E153
+  br label %CF833
+
+CF833:                                            ; preds = %CF880, %CF830
+  br label %CF880
+
+CF880:                                            ; preds = %CF833
+  %Cmp412 = icmp ne i16 %B234, -18725
+  br i1 %Cmp412, label %CF833, label %CF865
+
+CF865:                                            ; preds = %CF880
+  store double 0.000000e+00, double* %PC66
+  br label %CF860
+
+CF860:                                            ; preds = %CF860, %CF865
+  store i8 0, i8* %PC248
+  %Cmp600 = icmp sge i32 %B148, undef
+  br i1 %Cmp600, label %CF860, label %CF913
+
+CF913:                                            ; preds = %CF860
+  store i32 %E209, i32* undef
+  store i32 %Sl373, i32* undef
+  %Cmp771 = icmp ule i32 undef, %L32
+  br label %CF842
+
+CF842:                                            ; preds = %CF925, %CF913
+  br label %CF925
+
+CF925:                                            ; preds = %CF842
+  %Cmp778 = icmp sgt i1 %Cmp771, %Sl241
+  br i1 %Cmp778, label %CF842, label %CF898
+
+CF898:                                            ; preds = %CF925
+  %Sl785 = select i1 %Cmp600, i16 undef, i16 %B304
+  unreachable
+}
diff --git a/test/CodeGen/SystemZ/extract-vector-elt-zEC12.ll b/test/CodeGen/SystemZ/extract-vector-elt-zEC12.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7bfe5ac8c1a5487569b1ec64cd586b20d6c99afd
--- /dev/null
+++ b/test/CodeGen/SystemZ/extract-vector-elt-zEC12.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 | FileCheck %s
+;
+; Test that <1 x i8> is legalized properly without vector support.
+
+define void @autogen_SD18500(i8*) {
+; CHECK: .text
+BB:
+  %L5 = load i8, i8* %0
+  %I22 = insertelement <1 x i8> undef, i8 %L5, i32 0
+  %Cmp53 = icmp ule i1 undef, undef
+  br label %CF244
+
+CF244:                                            ; preds = %CF244, %BB
+  %Sl119 = select i1 %Cmp53, <1 x i8> %I22, <1 x i8> undef
+  %Cmp148 = fcmp une float 0x3E03A81780000000, 0x42D92DCD00000000
+  br i1 %Cmp148, label %CF244, label %CF241
+
+CF241:                                            ; preds = %CF241, %CF244
+  %Sl199 = select i1 true, <1 x i8> %Sl119, <1 x i8> zeroinitializer
+  br label %CF241
+}
diff --git a/test/CodeGen/SystemZ/fold-memory-op-impl.ll b/test/CodeGen/SystemZ/fold-memory-op-impl.ll
new file mode 100644
index 0000000000000000000000000000000000000000..dda4df90d1b95ce6f3a9f89884adee6524b2af68
--- /dev/null
+++ b/test/CodeGen/SystemZ/fold-memory-op-impl.ll
@@ -0,0 +1,129 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -verify-machineinstrs | FileCheck %s
+;
+; Test that foldMemoryOperandImpl() doesn't drop subreg / read-undef flags.
+
+
+define void @fun_llvm_stress_reduced(i8*, i32*, i64*, i64, i8) {
+; CHECK: .text
+BB:
+  %A4 = alloca <4 x i64>
+  %A1 = alloca <8 x i1>
+  %E6 = extractelement <4 x i1> undef, i32 3
+  %L23 = load i8, i8* %0
+  %B27 = fmul double 0x59A989483BA7E0C6, undef
+  %L30 = load i16, i16* undef
+  store i16 -11933, i16* undef
+  %L46 = load i16, i16* undef
+  %L61 = load i16, i16* undef
+  %Sl74 = select i1 undef, i1 undef, i1 true
+  br label %CF846
+
+CF846:                                            ; preds = %CF877, %BB
+  %I86 = insertelement <4 x i1> undef, i1 undef, i32 0
+  %Cmp89 = icmp ne i64 undef, %3
+  %L90 = load i16, i16* undef
+  %Shuff92 = shufflevector <4 x i16> zeroinitializer, <4 x i16> zeroinitializer, <4 x i32> <i32 0, i32 2, i32 undef, i32 6>
+  br label %CF877
+
+CF877:                                            ; preds = %CF846
+  store i16 %L61, i16* undef
+  %Cmp110 = icmp eq i16 %L61, undef
+  br i1 %Cmp110, label %CF846, label %CF862
+
+CF862:                                            ; preds = %CF877
+  %I114 = insertelement <4 x i64> zeroinitializer, i64 0, i32 0
+  %B115 = shl <4 x i64> zeroinitializer, %I114
+  %Sl124 = select i1 true, <8 x i1>* %A1, <8 x i1>* %A1
+  %B130 = frem double %B27, 0x59A989483BA7E0C6
+  %E143 = extractelement <4 x i64> %B115, i32 1
+  %Sl148 = select i1 %Cmp89, <1 x i32> undef, <1 x i32> zeroinitializer
+  br label %CF855
+
+CF855:                                            ; preds = %CF855, %CF862
+  %Sl171 = select i1 %Sl74, i1 %E6, i1 undef
+  br i1 %Sl171, label %CF855, label %CF874
+
+CF874:                                            ; preds = %CF855
+  %PC186 = bitcast i32* %1 to i16*
+  %L196 = load i16, i16* undef
+  %B207 = or i8 %4, %L23
+  %L211 = load <8 x i1>, <8 x i1>* %Sl124
+  %B215 = fdiv double 0x8421A9C0D21F6D3E, %B130
+  %L218 = load i16, i16* %PC186
+  %Sl223 = select i1 %Sl171, <4 x i1> %I86, <4 x i1> undef
+  br label %CF826
+
+CF826:                                            ; preds = %CF866, %CF910, %CF874
+  %B245 = ashr i16 -11933, %L46
+  br label %CF910
+
+CF910:                                            ; preds = %CF826
+  %L257 = load i8, i8* %0
+  %BC262 = bitcast i64 %E143 to double
+  store i16 %L196, i16* %PC186
+  %E266 = extractelement <4 x i16> %Shuff92, i32 0
+  %Sl271 = select i1 %Cmp89, i1 %Cmp89, i1 %Cmp110
+  br i1 %Sl271, label %CF826, label %CF866
+
+CF866:                                            ; preds = %CF910
+  store i64 %E143, i64* %2
+  %I276 = insertelement <4 x double> undef, double %BC262, i32 3
+  %L281 = load <8 x i1>, <8 x i1>* %Sl124
+  %E282 = extractelement <4 x i1> zeroinitializer, i32 2
+  br i1 %E282, label %CF826, label %CF848
+
+CF848:                                            ; preds = %CF866
+  %Cmp288 = fcmp olt <4 x double> undef, %I276
+  %FC294 = fptosi double undef to i16
+  %Cmp296 = icmp ule i16 %FC294, %B245
+  store i16 %L218, i16* undef
+  store i8 %L23, i8* %0
+  %E320 = extractelement <4 x i1> %Sl223, i32 1
+  %PC337 = bitcast <8 x i1>* %Sl124 to i1*
+  %Cmp345 = icmp uge <1 x i32> undef, %Sl148
+  store i16 %L196, i16* %PC186
+  br label %CF893
+
+CF893:                                            ; preds = %CF893, %CF848
+  %Cmp361 = fcmp uge float undef, undef
+  br i1 %Cmp361, label %CF893, label %CF906
+
+CF906:                                            ; preds = %CF893
+  store i16 -11933, i16* undef
+  %Shuff379 = shufflevector <1 x i1> undef, <1 x i1> %Cmp345, <1 x i32> <i32 1>
+  br label %CF850
+
+CF850:                                            ; preds = %CF850, %CF906
+  br i1 undef, label %CF850, label %CF925
+
+CF925:                                            ; preds = %CF850
+  store i16 %E266, i16* %PC186
+  %Cmp413 = icmp ugt i8 %L257, undef
+  store i16 %L30, i16* %PC186
+  %Sl420 = select i1 %Sl171, <8 x i1> undef, <8 x i1> %L281
+  store i16 %L90, i16* undef
+  %FC469 = uitofp i1 %Cmp296 to float
+  store i1 %Cmp413, i1* %PC337
+  br label %CF833
+
+CF833:                                            ; preds = %CF833, %CF925
+  store i8 %B207, i8* %0
+  %E509 = extractelement <8 x i1> %L211, i32 7
+  br i1 %E509, label %CF833, label %CF882
+
+CF882:                                            ; preds = %CF833
+  store i1 %Sl271, i1* %PC337
+  br label %CF852
+
+CF852:                                            ; preds = %CF896, %CF882
+  store i1 %Sl74, i1* %PC337
+  br label %CF896
+
+CF896:                                            ; preds = %CF852
+  %E576 = extractelement <4 x i1> %Cmp288, i32 3
+  br i1 %E576, label %CF852, label %CF890
+
+CF890:                                            ; preds = %CF896
+  %Sl581 = select i1 undef, float undef, float %FC469
+  unreachable
+}
diff --git a/test/CodeGen/SystemZ/int-cmp-44.ll b/test/CodeGen/SystemZ/int-cmp-44.ll
index 1b9a4ae353fe864b6aa47af00dc061c52dd46ea8..85a8788a3bddec2d3f263d19eeb3cf9108442367 100644
--- a/test/CodeGen/SystemZ/int-cmp-44.ll
+++ b/test/CodeGen/SystemZ/int-cmp-44.ll
@@ -473,8 +473,8 @@ entry:
   %xor = xor i32 %val, 1
   %add = add i32 %xor, 1000000
   call void @foo()
-  %cmp = icmp ne i32 %add, 0
-  br i1 %cmp, label %exit, label %store
+  %cmp = icmp eq i32 %add, 0
+  br i1 %cmp, label %store, label %exit, !prof !1
 
 store:
   store i32 %add, i32 *%ptr
@@ -888,3 +888,5 @@ store:
 exit:
   ret i64 %res
 }
+
+!1 = !{!"branch_weights", i32 2, i32 1}
diff --git a/test/CodeGen/SystemZ/locr-legal-regclass.ll b/test/CodeGen/SystemZ/locr-legal-regclass.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1f792439a49cc1421ddcbc96ceb5884906fb2963
--- /dev/null
+++ b/test/CodeGen/SystemZ/locr-legal-regclass.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=zEC12 -verify-machineinstrs | FileCheck %s
+;
+; Test that early if-conversion produces LOCR with operands of the right
+; register classes.
+
+define void @autogen_SD4739(i8*) {
+; CHECK-NOT: Expected a GR32Bit register, but got a GRX32Bit register
+BB:
+  %L34 = load i8, i8* %0
+  %Cmp56 = icmp sgt i8 undef, %L34
+  br label %CF246
+
+CF246:                                            ; preds = %CF246, %BB
+  %Sl163 = select i1 %Cmp56, i8 %L34, i8 undef
+  br i1 undef, label %CF246, label %CF248
+
+CF248:                                            ; preds = %CF248, %CF246
+  store i8 %Sl163, i8* %0
+  br label %CF248
+}
diff --git a/test/CodeGen/SystemZ/mature-mc-support.ll b/test/CodeGen/SystemZ/mature-mc-support.ll
index 5520f55e1e29ee7a720c3ca89c5d8de5b028a40c..a01716c27670149dcff89a594ea2c49f7d3f95b5 100644
--- a/test/CodeGen/SystemZ/mature-mc-support.ll
+++ b/test/CodeGen/SystemZ/mature-mc-support.ll
@@ -12,4 +12,4 @@
 
 module asm "	.this_directive_is_very_unlikely_to_exist"
 
-; CHECK: LLVM ERROR: Error parsing inline asm
+; CHECK: error: unknown directive
diff --git a/test/CodeGen/SystemZ/memchr-01.ll b/test/CodeGen/SystemZ/memchr-01.ll
index f4d381b37f26440ee8cf2e4cec41c3f6dacd4e19..0cfca2af1e9833077a109bfe375000eac503dd70 100644
--- a/test/CodeGen/SystemZ/memchr-01.ll
+++ b/test/CodeGen/SystemZ/memchr-01.ll
@@ -1,21 +1,57 @@
-; Test memchr using SRST, with a weird but usable prototype.
+; Test memchr using SRST, with the correct prototype.
 ;
-; RUN: llc < %s -mtriple=s390x-linux-gnu -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
 
-declare i8 *@memchr(i8 *%src, i16 %char, i32 %len)
+declare i8 *@memchr(i8 *%src, i32 %char, i64 %len)
 
 ; Test a simple forwarded call.
-define i8 *@f1(i8 *%src, i16 %char, i32 %len) {
+define i8 *@f1(i64 %len, i8 *%src, i32 %char) {
 ; CHECK-LABEL: f1:
-; CHECK-DAG: lgr [[REG:%r[1-5]]], %r2
-; CHECK-DAG: algfr %r2, %r4
-; CHECK-DAG: llcr %r0, %r3
+; CHECK-DAG: agr %r2, %r3
+; CHECK-DAG: llcr %r0, %r4
 ; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: srst %r2, [[REG]]
+; CHECK: srst %r2, %r3
 ; CHECK-NEXT: jo [[LABEL]]
 ; CHECK: blr %r14
 ; CHECK: lghi %r2, 0
 ; CHECK: br %r14
-  %res = call i8 *@memchr(i8 *%src, i16 %char, i32 %len)
+  %res = call i8 *@memchr(i8 *%src, i32 %char, i64 %len)
   ret i8 *%res
 }
+
+; Test a doubled call with no use of %r0 in between.  There should be a
+; single load of %r0.
+define i8 *@f2(i8 *%src, i8 *%charptr, i64 %len) {
+; CHECK-LABEL: f2:
+; CHECK: llc %r0, 0(%r3)
+; CHECK-NOT: %r0
+; CHECK: srst [[RES1:%r[1-5]]], %r2
+; CHECK-NOT: %r0
+; CHECK: srst %r2, [[RES1]]
+; CHECK: br %r14
+  %char = load volatile i8 , i8 *%charptr
+  %charext = zext i8 %char to i32
+  %res1 = call i8 *@memchr(i8 *%src, i32 %charext, i64 %len)
+  %res2 = call i8 *@memchr(i8 *%res1, i32 %charext, i64 %len)
+  ret i8 *%res2
+}
+
+; Test a doubled call with a use of %r0 in between.  %r0 must be loaded
+; for each loop.
+define i8 *@f3(i8 *%src, i8 *%charptr, i64 %len) {
+; CHECK-LABEL: f3:
+; CHECK: llc [[CHAR:%r[1-5]]], 0(%r3)
+; CHECK: lr %r0, [[CHAR]]
+; CHECK: srst [[RES1:%r[1-5]]], %r2
+; CHECK: lhi %r0, 0
+; CHECK: blah %r0
+; CHECK: lr %r0, [[CHAR]]
+; CHECK: srst %r2, [[RES1]]
+; CHECK: br %r14
+  %char = load volatile i8 , i8 *%charptr
+  %charext = zext i8 %char to i32
+  %res1 = call i8 *@memchr(i8 *%src, i32 %charext, i64 %len)
+  call void asm sideeffect "blah $0", "{r0}" (i32 0)
+  %res2 = call i8 *@memchr(i8 *%res1, i32 %charext, i64 %len)
+  ret i8 *%res2
+}
diff --git a/test/CodeGen/SystemZ/memchr-02.ll b/test/CodeGen/SystemZ/memchr-02.ll
deleted file mode 100644
index 0cfca2af1e9833077a109bfe375000eac503dd70..0000000000000000000000000000000000000000
--- a/test/CodeGen/SystemZ/memchr-02.ll
+++ /dev/null
@@ -1,57 +0,0 @@
-; Test memchr using SRST, with the correct prototype.
-;
-; RUN: llc < %s -mtriple=s390x-linux-gnu -no-integrated-as | FileCheck %s
-
-declare i8 *@memchr(i8 *%src, i32 %char, i64 %len)
-
-; Test a simple forwarded call.
-define i8 *@f1(i64 %len, i8 *%src, i32 %char) {
-; CHECK-LABEL: f1:
-; CHECK-DAG: agr %r2, %r3
-; CHECK-DAG: llcr %r0, %r4
-; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: srst %r2, %r3
-; CHECK-NEXT: jo [[LABEL]]
-; CHECK: blr %r14
-; CHECK: lghi %r2, 0
-; CHECK: br %r14
-  %res = call i8 *@memchr(i8 *%src, i32 %char, i64 %len)
-  ret i8 *%res
-}
-
-; Test a doubled call with no use of %r0 in between.  There should be a
-; single load of %r0.
-define i8 *@f2(i8 *%src, i8 *%charptr, i64 %len) {
-; CHECK-LABEL: f2:
-; CHECK: llc %r0, 0(%r3)
-; CHECK-NOT: %r0
-; CHECK: srst [[RES1:%r[1-5]]], %r2
-; CHECK-NOT: %r0
-; CHECK: srst %r2, [[RES1]]
-; CHECK: br %r14
-  %char = load volatile i8 , i8 *%charptr
-  %charext = zext i8 %char to i32
-  %res1 = call i8 *@memchr(i8 *%src, i32 %charext, i64 %len)
-  %res2 = call i8 *@memchr(i8 *%res1, i32 %charext, i64 %len)
-  ret i8 *%res2
-}
-
-; Test a doubled call with a use of %r0 in between.  %r0 must be loaded
-; for each loop.
-define i8 *@f3(i8 *%src, i8 *%charptr, i64 %len) {
-; CHECK-LABEL: f3:
-; CHECK: llc [[CHAR:%r[1-5]]], 0(%r3)
-; CHECK: lr %r0, [[CHAR]]
-; CHECK: srst [[RES1:%r[1-5]]], %r2
-; CHECK: lhi %r0, 0
-; CHECK: blah %r0
-; CHECK: lr %r0, [[CHAR]]
-; CHECK: srst %r2, [[RES1]]
-; CHECK: br %r14
-  %char = load volatile i8 , i8 *%charptr
-  %charext = zext i8 %char to i32
-  %res1 = call i8 *@memchr(i8 *%src, i32 %charext, i64 %len)
-  call void asm sideeffect "blah $0", "{r0}" (i32 0)
-  %res2 = call i8 *@memchr(i8 *%res1, i32 %charext, i64 %len)
-  ret i8 *%res2
-}
diff --git a/test/CodeGen/SystemZ/memcmp-02.ll b/test/CodeGen/SystemZ/memcmp-02.ll
deleted file mode 100644
index da11170def79c5f0932e92fb661bcfc83e39120a..0000000000000000000000000000000000000000
--- a/test/CodeGen/SystemZ/memcmp-02.ll
+++ /dev/null
@@ -1,139 +0,0 @@
-; Test memcmp using CLC, with i64 results.
-;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-
-declare i64 @memcmp(i8 *%src1, i8 *%src2, i64 %size)
-
-; Zero-length comparisons should be optimized away.
-define i64 @f1(i8 *%src1, i8 *%src2) {
-; CHECK-LABEL: f1:
-; CHECK: lghi %r2, 0
-; CHECK: br %r14
-  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 0)
-  ret i64 %res
-}
-
-; Check a case where the result is used as an integer.
-define i64 @f2(i8 *%src1, i8 *%src2) {
-; CHECK-LABEL: f2:
-; CHECK: clc 0(2,%r2), 0(%r3)
-; CHECK: ipm [[REG:%r[0-5]]]
-; CHECK: srl [[REG]], 28
-; CHECK: rll [[REG]], [[REG]], 31
-; CHECK: lgfr %r2, [[REG]]
-; CHECK: br %r14
-  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 2)
-  ret i64 %res
-}
-
-; Check a case where the result is tested for equality.
-define void @f3(i8 *%src1, i8 *%src2, i64 *%dest) {
-; CHECK-LABEL: f3:
-; CHECK: clc 0(3,%r2), 0(%r3)
-; CHECK-NEXT: ber %r14
-; CHECK: br %r14
-  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 3)
-  %cmp = icmp eq i64 %res, 0
-  br i1 %cmp, label %exit, label %store
-
-store:
-  store i64 0, i64 *%dest
-  br label %exit
-
-exit:
-  ret void
-}
-
-; Check a case where the result is tested for inequality.
-define void @f4(i8 *%src1, i8 *%src2, i64 *%dest) {
-; CHECK-LABEL: f4:
-; CHECK: clc 0(4,%r2), 0(%r3)
-; CHECK-NEXT: blhr %r14
-; CHECK: br %r14
-entry:
-  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 4)
-  %cmp = icmp ne i64 %res, 0
-  br i1 %cmp, label %exit, label %store
-
-store:
-  store i64 0, i64 *%dest
-  br label %exit
-
-exit:
-  ret void
-}
-
-; Check a case where the result is tested via slt.
-define void @f5(i8 *%src1, i8 *%src2, i64 *%dest) {
-; CHECK-LABEL: f5:
-; CHECK: clc 0(5,%r2), 0(%r3)
-; CHECK-NEXT: blr %r14
-; CHECK: br %r14
-entry:
-  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 5)
-  %cmp = icmp slt i64 %res, 0
-  br i1 %cmp, label %exit, label %store
-
-store:
-  store i64 0, i64 *%dest
-  br label %exit
-
-exit:
-  ret void
-}
-
-; Check a case where the result is tested for sgt.
-define void @f6(i8 *%src1, i8 *%src2, i64 *%dest) {
-; CHECK-LABEL: f6:
-; CHECK: clc 0(6,%r2), 0(%r3)
-; CHECK-NEXT: bhr %r14
-; CHECK: br %r14
-entry:
-  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 6)
-  %cmp = icmp sgt i64 %res, 0
-  br i1 %cmp, label %exit, label %store
-
-store:
-  store i64 0, i64 *%dest
-  br label %exit
-
-exit:
-  ret void
-}
-
-; Check the upper end of the CLC range.  Here the result is used both as
-; an integer and for branching.
-define i64 @f7(i8 *%src1, i8 *%src2, i64 *%dest) {
-; CHECK-LABEL: f7:
-; CHECK: clc 0(256,%r2), 0(%r3)
-; CHECK: ipm [[REG:%r[0-5]]]
-; CHECK: srl [[REG]], 28
-; CHECK: rll [[REG]], [[REG]], 31
-; CHECK: lgfr %r2, [[REG]]
-; CHECK: blr %r14
-; CHECK: br %r14
-entry:
-  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 256)
-  %cmp = icmp slt i64 %res, 0
-  br i1 %cmp, label %exit, label %store
-
-store:
-  store i64 0, i64 *%dest
-  br label %exit
-
-exit:
-  ret i64 %res
-}
-
-; 257 bytes needs two CLCs.
-define i64 @f8(i8 *%src1, i8 *%src2) {
-; CHECK-LABEL: f8:
-; CHECK: clc 0(256,%r2), 0(%r3)
-; CHECK: jlh [[LABEL:\..*]]
-; CHECK: clc 256(1,%r2), 256(%r3)
-; CHECK: [[LABEL]]:
-; CHECK: ipm [[REG:%r[0-5]]]
-; CHECK: br %r14
-  %res = call i64 @memcmp(i8 *%src1, i8 *%src2, i64 257)
-  ret i64 %res
-}
diff --git a/test/CodeGen/SystemZ/pr32372.ll b/test/CodeGen/SystemZ/pr32372.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c18e238fbaf90aa67ff3812fe3cc5d1cbfafc0ed
--- /dev/null
+++ b/test/CodeGen/SystemZ/pr32372.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -o - -mtriple=s390x-linux-gnu | FileCheck %s
+
+define void @pr32372(i8*) {
+; CHECK-LABEL: pr32372:
+; CHECK:       # BB#0: # %BB
+; CHECK-NEXT:    llc %r1, 0(%r2)
+; CHECK-NEXT:    mvhhi 0(%r1), -3825
+; CHECK-NEXT:    llill %r0, 0
+; CHECK-NEXT:    dlr %r0, %r1
+; CHECK-NEXT:  .LBB0_1: # %CF251
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    j .LBB0_1
+BB:
+  %L = load i8, i8* %0
+  store i16 -3825, i16* undef
+  %L5 = load i8, i8* %0
+  %B9 = urem i8 %L5, %L
+  %I107 = insertelement <8 x i8> zeroinitializer, i8 %B9, i32 7
+  %ZE141 = zext i8 %L5 to i16
+  br label %CF251
+
+CF251:                                            ; preds = %CF258, %CF251, %BB
+  %Shuff217 = shufflevector <8 x i8> zeroinitializer, <8 x i8> %I107, <8 x i32> <i32 0, i32 2, i32 undef, i32 6, i32 8, i32 undef, i32 12, i32 14>
+  %Cmp227 = icmp sge i16 %ZE141, 0
+  br i1 %Cmp227, label %CF251, label %CF258
+
+CF258:                                            ; preds = %CF251
+  %Shuff230 = shufflevector <2 x i16> undef, <2 x i16> undef, <2 x i32> <i32 3, i32 1>
+  br label %CF251
+}
diff --git a/test/CodeGen/SystemZ/pr32505.ll b/test/CodeGen/SystemZ/pr32505.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6abad02201647e787c6e1f41dc82cf95ea52eb42
--- /dev/null
+++ b/test/CodeGen/SystemZ/pr32505.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mcpu=zEC12 -o - %s | FileCheck %s
+
+target triple = "s390x-ibm-linux"
+
+define <2 x float> @pr32505(<2 x i8> * %a) {
+; CHECK-LABEL: pr32505:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    lbh %r0, 0(%r2)
+; CHECK-NEXT:    ldgr %f0, %r0
+; CHECK-NEXT:    lbh %r0, 1(%r2)
+; CHECK-NEXT:    ldgr %f2, %r0
+; CHECK-NEXT:    # kill: %F0S<def> %F0S<kill> %F0D<kill>
+; CHECK-NEXT:    # kill: %F2S<def> %F2S<kill> %F2D<kill>
+; CHECK-NEXT:    br %r14
+  %L17 = load <2 x i8>, <2 x i8>* %a
+  %Se21 = sext <2 x i8> %L17 to <2 x i32>
+  %BC = bitcast <2 x i32> %Se21 to <2 x float>
+  ret <2 x float> %BC
+}
diff --git a/test/CodeGen/SystemZ/splitMove_undefReg_mverifier.ll b/test/CodeGen/SystemZ/splitMove_undefReg_mverifier.ll
new file mode 100644
index 0000000000000000000000000000000000000000..db6e3653b50691cf90487bdc4cca8260ee4c0a9a
--- /dev/null
+++ b/test/CodeGen/SystemZ/splitMove_undefReg_mverifier.ll
@@ -0,0 +1,413 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -verify-machineinstrs | FileCheck %s
+;
+; Regression test for a machine verifier complaint discovered with llvm-stress.
+; Test that splitting a 128-bit store does not use an undefined physical register.
+
+define void @autogen_SD29355(i8*, i32*, i64*, i32, i64, i8) {
+; CHECK: .text
+BB:
+  %A4 = alloca double
+  %A3 = alloca float
+  %A2 = alloca i8
+  %A1 = alloca double
+  %A = alloca i64
+  %L = load i8, i8* %0
+  store i8 33, i8* %0
+  %E = extractelement <8 x i1> zeroinitializer, i32 2
+  br label %CF261
+
+CF261:                                            ; preds = %BB
+  %Shuff = shufflevector <2 x i16> zeroinitializer, <2 x i16> zeroinitializer, <2 x i32> <i32 undef, i32 3>
+  %I = insertelement <8 x i8> zeroinitializer, i8 69, i32 3
+  %B = udiv i8 -99, 33
+  %Tr = trunc i64 -1 to i32
+  %Sl = select i1 true, i64* %2, i64* %2
+  %L5 = load i64, i64* %Sl
+  store i64 %L5, i64* %2
+  %E6 = extractelement <4 x i16> zeroinitializer, i32 3
+  %Shuff7 = shufflevector <4 x i16> zeroinitializer, <4 x i16> zeroinitializer, <4 x i32> <i32 6, i32 0, i32 2, i32 4>
+  %I8 = insertelement <4 x i16> %Shuff7, i16 27357, i32 0
+  %B9 = xor <4 x i16> %Shuff7, %Shuff7
+  %Tr10 = trunc i64 %4 to i1
+  br label %CF239
+
+CF239:                                            ; preds = %CF261
+  %Sl11 = select i1 %Tr10, i16 -1, i16 27357
+  %L12 = load i8, i8* %0
+  store i64 %L5, i64* %A
+  %E13 = extractelement <8 x i1> zeroinitializer, i32 0
+  br label %CF238
+
+CF238:                                            ; preds = %CF238, %CF239
+  %Shuff14 = shufflevector <4 x i16> zeroinitializer, <4 x i16> zeroinitializer, <4 x i32> <i32 undef, i32 5, i32 7, i32 1>
+  %I15 = insertelement <4 x i16> %Shuff7, i16 -1, i32 1
+  %B16 = fsub double 0xDACBFCEAC1C99968, 0xDACBFCEAC1C99968
+  %Sl17 = select i1 %E, i64* %Sl, i64* %Sl
+  %Cmp = icmp ugt i16 %E6, 27357
+  br i1 %Cmp, label %CF238, label %CF251
+
+CF251:                                            ; preds = %CF238
+  %L18 = load i64, i64* %Sl17
+  store i64 0, i64* %Sl
+  %E19 = extractelement <4 x i16> zeroinitializer, i32 1
+  %Shuff20 = shufflevector <2 x i1> zeroinitializer, <2 x i1> zeroinitializer, <2 x i32> <i32 undef, i32 2>
+  %I21 = insertelement <2 x i1> zeroinitializer, i1 true, i32 0
+  %FC = fptoui float 0x3BE9BD7D80000000 to i1
+  br label %CF237
+
+CF237:                                            ; preds = %CF237, %CF271, %CF268, %CF251
+  %Sl22 = select i1 true, i16 -1, i16 %E6
+  %Cmp23 = icmp sgt i1 %E13, true
+  br i1 %Cmp23, label %CF237, label %CF256
+
+CF256:                                            ; preds = %CF256, %CF237
+  %L24 = load i64, i64* %A
+  store i64 %L5, i64* %Sl17
+  %E25 = extractelement <4 x i16> zeroinitializer, i32 3
+  %Shuff26 = shufflevector <4 x i16> %Shuff7, <4 x i16> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 6, i32 undef>
+  %I27 = insertelement <4 x i16> zeroinitializer, i16 %Sl22, i32 0
+  %B28 = udiv i16 %Sl11, -1
+  %ZE = zext i1 true to i32
+  %Sl29 = select i1 true, i8 -99, i8 33
+  %Cmp30 = fcmp ord double 0xC275146F92573C4, 0x16FB351AF5F9C998
+  br i1 %Cmp30, label %CF256, label %CF271
+
+CF271:                                            ; preds = %CF256
+  %L31 = load i8, i8* %0
+  store i64 %L5, i64* %Sl
+  %E32 = extractelement <4 x i16> zeroinitializer, i32 2
+  %Shuff33 = shufflevector <1 x i32> zeroinitializer, <1 x i32> zeroinitializer, <1 x i32> <i32 1>
+  %I34 = insertelement <4 x i16> zeroinitializer, i16 %Sl11, i32 1
+  %PC = bitcast double* %A4 to i1*
+  %Sl35 = select i1 %FC, i32* %1, i32* %1
+  %Cmp36 = icmp ult <2 x i1> %Shuff20, %Shuff20
+  %L37 = load i64, i64* %Sl
+  store i64 %L5, i64* %Sl
+  %E38 = extractelement <2 x i32> zeroinitializer, i32 0
+  %Shuff39 = shufflevector <4 x i16> zeroinitializer, <4 x i16> %Shuff7, <4 x i32> <i32 undef, i32 1, i32 3, i32 undef>
+  %I40 = insertelement <4 x i16> %Shuff7, i16 %E19, i32 1
+  %ZE41 = zext i1 true to i16
+  %Sl42 = select i1 true, i1 true, i1 true
+  br i1 %Sl42, label %CF237, label %CF246
+
+CF246:                                            ; preds = %CF246, %CF271
+  %Cmp43 = icmp uge i64 %L37, %L18
+  br i1 %Cmp43, label %CF246, label %CF249
+
+CF249:                                            ; preds = %CF249, %CF263, %CF246
+  %L44 = load i64, i64* %A
+  store i64 %L5, i64* %Sl17
+  %E45 = extractelement <4 x i16> %Shuff14, i32 2
+  %Shuff46 = shufflevector <1 x i32> zeroinitializer, <1 x i32> zeroinitializer, <1 x i32> <i32 1>
+  %I47 = insertelement <4 x i16> %Shuff7, i16 %E6, i32 1
+  %Sl48 = select i1 %FC, double 0xDACBFCEAC1C99968, double 0xDACBFCEAC1C99968
+  %Cmp49 = fcmp ult double 0x9E8F85AE4F8D6C2C, 0x5A7FED9E637D2C1C
+  br i1 %Cmp49, label %CF249, label %CF263
+
+CF263:                                            ; preds = %CF249
+  %L50 = load i64, i64* %Sl
+  store i1 true, i1* %PC
+  %E51 = extractelement <2 x i1> zeroinitializer, i32 0
+  br i1 %E51, label %CF249, label %CF259
+
+CF259:                                            ; preds = %CF259, %CF263
+  %Shuff52 = shufflevector <4 x i64> zeroinitializer, <4 x i64> zeroinitializer, <4 x i32> <i32 undef, i32 5, i32 7, i32 1>
+  %I53 = insertelement <4 x i16> zeroinitializer, i16 -1, i32 1
+  %B54 = or <2 x i16> %Shuff, zeroinitializer
+  %Sl55 = select i1 %Sl42, i16 %Sl22, i16 27357
+  %Cmp56 = icmp uge i1 %Sl42, true
+  br i1 %Cmp56, label %CF259, label %CF268
+
+CF268:                                            ; preds = %CF259
+  %L57 = load i8, i8* %0
+  store i64 %L5, i64* %Sl
+  %E58 = extractelement <4 x i16> %Shuff14, i32 1
+  %Shuff59 = shufflevector <1 x i32> %Shuff33, <1 x i32> %Shuff33, <1 x i32> zeroinitializer
+  %I60 = insertelement <2 x i1> %Shuff20, i1 true, i32 0
+  %B61 = frem double 0x5A7FED9E637D2C1C, %B16
+  %FC62 = sitofp i8 -99 to float
+  %Sl63 = select i1 true, i16 %E19, i16 -1
+  %Cmp64 = icmp slt i16 %Sl63, 27357
+  br i1 %Cmp64, label %CF237, label %CF241
+
+CF241:                                            ; preds = %CF241, %CF265, %CF268
+  %L65 = load i1, i1* %PC
+  br i1 %L65, label %CF241, label %CF262
+
+CF262:                                            ; preds = %CF262, %CF270, %CF241
+  store i64 %L37, i64* %Sl
+  %E66 = extractelement <4 x i16> %Shuff14, i32 2
+  %Shuff67 = shufflevector <4 x i16> %Shuff26, <4 x i16> %Shuff7, <4 x i32> <i32 1, i32 3, i32 undef, i32 7>
+  %I68 = insertelement <2 x i32> zeroinitializer, i32 454413, i32 1
+  %B69 = sub <4 x i16> %I8, %Shuff7
+  %Tr70 = trunc i16 %E32 to i1
+  br i1 %Tr70, label %CF262, label %CF270
+
+CF270:                                            ; preds = %CF262
+  %Sl71 = select i1 %Sl42, <8 x i1> zeroinitializer, <8 x i1> zeroinitializer
+  %Cmp72 = icmp sge <2 x i16> %B54, zeroinitializer
+  %L73 = load i64, i64* %Sl
+  store i64 %L73, i64* %Sl
+  %E74 = extractelement <8 x i1> %Sl71, i32 5
+  br i1 %E74, label %CF262, label %CF265
+
+CF265:                                            ; preds = %CF270
+  %Shuff75 = shufflevector <2 x i32> %I68, <2 x i32> zeroinitializer, <2 x i32> <i32 undef, i32 2>
+  %I76 = insertelement <2 x i1> %Cmp72, i1 %Sl42, i32 0
+  %B77 = xor i16 27357, %B28
+  %PC78 = bitcast i1* %PC to i32*
+  %Sl79 = select i1 %Cmp64, <4 x i16> %Shuff14, <4 x i16> %Shuff7
+  %Cmp80 = icmp slt <2 x i1> zeroinitializer, %Shuff20
+  %L81 = load i1, i1* %PC
+  br i1 %L81, label %CF241, label %CF245
+
+CF245:                                            ; preds = %CF245, %CF265
+  store i1 true, i1* %PC
+  %E82 = extractelement <1 x i32> %Shuff33, i32 0
+  %Shuff83 = shufflevector <4 x i16> zeroinitializer, <4 x i16> %Shuff14, <4 x i32> <i32 2, i32 4, i32 6, i32 0>
+  %I84 = insertelement <2 x i1> %Shuff20, i1 %Sl42, i32 0
+  %FC85 = uitofp i1 %Cmp to float
+  %Sl86 = select i1 %Tr10, i16 -1, i16 %Sl63
+  %Cmp87 = icmp ugt <2 x i1> %I76, %I60
+  %L88 = load i32, i32* %PC78
+  store i8 33, i8* %0
+  %E89 = extractelement <2 x i32> zeroinitializer, i32 1
+  %Shuff90 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %Shuff52, <4 x i32> <i32 0, i32 undef, i32 4, i32 6>
+  %I91 = insertelement <2 x i32> %Shuff75, i32 %ZE, i32 0
+  %B92 = add i64 -1, %L73
+  %Tr93 = trunc i64 0 to i16
+  %Sl94 = select i1 %FC, i64 %L37, i64 %L5
+  %Cmp95 = icmp sge i64 454853, %B92
+  br i1 %Cmp95, label %CF245, label %CF257
+
+CF257:                                            ; preds = %CF245
+  %L96 = load i64, i64* %Sl
+  store i1 true, i1* %PC
+  %E97 = extractelement <2 x i1> %Shuff20, i32 1
+  br label %CF
+
+CF:                                               ; preds = %CF, %CF258, %CF257
+  %Shuff98 = shufflevector <2 x i1> %Cmp80, <2 x i1> zeroinitializer, <2 x i32> <i32 undef, i32 0>
+  %I99 = insertelement <2 x i1> %Shuff98, i1 %Cmp30, i32 0
+  %B100 = sub <8 x i8> zeroinitializer, zeroinitializer
+  %FC101 = uitofp <2 x i1> %I99 to <2 x double>
+  %Sl102 = select i1 %FC, i16 %Sl63, i16 %E58
+  %Cmp103 = fcmp ord double %B16, 0xDACBFCEAC1C99968
+  br i1 %Cmp103, label %CF, label %CF240
+
+CF240:                                            ; preds = %CF240, %CF260, %CF
+  %L104 = load i32, i32* %1
+  store i1 true, i1* %PC
+  %E105 = extractelement <4 x i16> %I8, i32 1
+  %Shuff106 = shufflevector <4 x i16> %Shuff7, <4 x i16> %I34, <4 x i32> <i32 4, i32 undef, i32 undef, i32 2>
+  %I107 = insertelement <2 x i1> %Cmp87, i1 %FC, i32 0
+  %ZE108 = zext <4 x i16> %B69 to <4 x i64>
+  %Sl109 = select i1 %Cmp, i16 27357, i16 %Sl102
+  %Cmp110 = icmp sge <4 x i16> %B9, zeroinitializer
+  %L111 = load i64, i64* %Sl
+  store i8 %L57, i8* %0
+  %E112 = extractelement <2 x i1> %Shuff98, i32 0
+  br i1 %E112, label %CF240, label %CF254
+
+CF254:                                            ; preds = %CF254, %CF267, %CF264, %CF240
+  %Shuff113 = shufflevector <2 x i32> %I68, <2 x i32> zeroinitializer, <2 x i32> undef
+  %I114 = insertelement <4 x i16> zeroinitializer, i16 27357, i32 3
+  %B115 = and i16 %Sl102, %Sl11
+  %FC116 = uitofp i16 %B115 to double
+  %Sl117 = select i1 %L81, i32* %1, i32* %1
+  %Cmp118 = icmp ne i64 %Sl94, %L50
+  br i1 %Cmp118, label %CF254, label %CF267
+
+CF267:                                            ; preds = %CF254
+  %L119 = load i64, i64* %Sl
+  store i32 %ZE, i32* %PC78
+  %E120 = extractelement <4 x i16> zeroinitializer, i32 1
+  %Shuff121 = shufflevector <1 x i32> %Shuff33, <1 x i32> %Shuff33, <1 x i32> zeroinitializer
+  %I122 = insertelement <1 x i32> %Shuff121, i32 %E82, i32 0
+  %B123 = mul <4 x i16> %I40, %I34
+  %Sl124 = select i1 %FC, <4 x i1> %Cmp110, <4 x i1> %Cmp110
+  %Cmp125 = icmp ne <4 x i64> %ZE108, zeroinitializer
+  %L126 = load i64, i64* %Sl
+  store i32 %ZE, i32* %Sl117
+  %E127 = extractelement <2 x i1> %Cmp87, i32 1
+  br i1 %E127, label %CF254, label %CF264
+
+CF264:                                            ; preds = %CF267
+  %Shuff128 = shufflevector <4 x i16> %Shuff83, <4 x i16> %I47, <4 x i32> <i32 undef, i32 2, i32 undef, i32 6>
+  %I129 = insertelement <4 x i16> %Shuff67, i16 %Sl109, i32 2
+  %B130 = add i32 %3, %E38
+  %FC131 = sitofp i32 %3 to float
+  %Sl132 = select i1 %Sl42, i64 %L24, i64 %L5
+  %Cmp133 = icmp eq <2 x i1> %I99, %Shuff20
+  %L134 = load i32, i32* %PC78
+  store i32 %L104, i32* %1
+  %E135 = extractelement <8 x i1> zeroinitializer, i32 4
+  br i1 %E135, label %CF254, label %CF260
+
+CF260:                                            ; preds = %CF264
+  %Shuff136 = shufflevector <1 x i32> %Shuff59, <1 x i32> %Shuff121, <1 x i32> undef
+  %I137 = insertelement <4 x i16> %Shuff67, i16 %Sl55, i32 3
+  %B138 = lshr <1 x i32> %Shuff33, %Shuff59
+  %Sl139 = select i1 %E135, i64 %L119, i64 %L126
+  %Cmp140 = icmp slt i8 -99, %Sl29
+  br i1 %Cmp140, label %CF240, label %CF247
+
+CF247:                                            ; preds = %CF247, %CF272, %CF260
+  %L141 = load i32, i32* %Sl117
+  store i8 %5, i8* %0
+  %E142 = extractelement <2 x i1> %Cmp36, i32 1
+  br i1 %E142, label %CF247, label %CF272
+
+CF272:                                            ; preds = %CF247
+  %Shuff143 = shufflevector <4 x i64> %Shuff90, <4 x i64> %Shuff52, <4 x i32> <i32 6, i32 undef, i32 2, i32 undef>
+  %I144 = insertelement <1 x i32> %Shuff121, i32 %L88, i32 0
+  %Tr145 = trunc i64 %Sl139 to i16
+  %Sl146 = select i1 %Cmp49, i32 %L134, i32 %L104
+  %L147 = load i32, i32* %PC78
+  store i32 %Tr, i32* %Sl117
+  %E148 = extractelement <4 x i16> %Shuff67, i32 3
+  %Shuff149 = shufflevector <4 x i16> zeroinitializer, <4 x i16> %Shuff67, <4 x i32> <i32 2, i32 4, i32 6, i32 0>
+  %I150 = insertelement <2 x i1> zeroinitializer, i1 %E127, i32 0
+  %B151 = fdiv double 0x16FB351AF5F9C998, 0xC275146F92573C4
+  %FC152 = uitofp <1 x i32> %I144 to <1 x double>
+  %Sl153 = select i1 %Cmp118, <1 x i32> %Shuff136, <1 x i32> %Shuff121
+  %Cmp154 = icmp ule i8 %5, %Sl29
+  br i1 %Cmp154, label %CF247, label %CF253
+
+CF253:                                            ; preds = %CF253, %CF269, %CF272
+  %L155 = load i32, i32* %Sl117
+  store i32 %L141, i32* %PC78
+  %E156 = extractelement <4 x i1> %Cmp125, i32 2
+  br i1 %E156, label %CF253, label %CF269
+
+CF269:                                            ; preds = %CF253
+  %Shuff157 = shufflevector <1 x i32> %Shuff46, <1 x i32> %Shuff121, <1 x i32> <i32 1>
+  %I158 = insertelement <4 x i16> %Shuff128, i16 %E66, i32 1
+  %B159 = shl i64 %L119, %L73
+  %Se = sext i16 %B77 to i32
+  %Sl160 = select i1 %Cmp56, i16 %Sl63, i16 %B77
+  %L161 = load i64, i64* %Sl
+  store i32 %B130, i32* %Sl117
+  %E162 = extractelement <1 x i32> %Shuff59, i32 0
+  %Shuff163 = shufflevector <4 x i16> %Shuff7, <4 x i16> %Shuff67, <4 x i32> <i32 5, i32 7, i32 1, i32 3>
+  %I164 = insertelement <4 x i16> %Shuff106, i16 27357, i32 3
+  %Se165 = sext <4 x i1> %Sl124 to <4 x i8>
+  %Sl166 = select i1 true, i1 %Cmp, i1 %Tr70
+  br i1 %Sl166, label %CF253, label %CF255
+
+CF255:                                            ; preds = %CF255, %CF266, %CF269
+  %Cmp167 = icmp sge i64 %4, %L24
+  br i1 %Cmp167, label %CF255, label %CF266
+
+CF266:                                            ; preds = %CF255
+  %L168 = load i8, i8* %0
+  store i32 %E38, i32* %PC78
+  %E169 = extractelement <2 x i16> zeroinitializer, i32 1
+  %Shuff170 = shufflevector <4 x i16> %Sl79, <4 x i16> %I137, <4 x i32> <i32 6, i32 0, i32 2, i32 4>
+  %I171 = insertelement <4 x i16> %Shuff163, i16 %ZE41, i32 0
+  %Tr172 = trunc i16 %Tr145 to i1
+  br i1 %Tr172, label %CF255, label %CF258
+
+CF258:                                            ; preds = %CF266
+  %Sl173 = select i1 true, <2 x i32> %I68, <2 x i32> %I91
+  %Cmp174 = icmp ugt <2 x i1> %Cmp72, %I150
+  %L175 = load i32, i32* %Sl117
+  store i32 %L104, i32* %Sl117
+  %E176 = extractelement <4 x i16> %Shuff67, i32 1
+  %Shuff177 = shufflevector <1 x i32> %Shuff121, <1 x i32> %Shuff33, <1 x i32> zeroinitializer
+  %I178 = insertelement <4 x i16> zeroinitializer, i16 27357, i32 0
+  %FC179 = sitofp <4 x i16> %I47 to <4 x float>
+  %Sl180 = select i1 %FC, i64 %L126, i64 %B92
+  %Cmp181 = fcmp ugt double %B61, %B16
+  br i1 %Cmp181, label %CF, label %CF236
+
+CF236:                                            ; preds = %CF236, %CF258
+  %L182 = load i8, i8* %0
+  store i32 %E38, i32* %Sl117
+  %E183 = extractelement <1 x i32> %Shuff121, i32 0
+  %Shuff184 = shufflevector <4 x i64> zeroinitializer, <4 x i64> %Shuff90, <4 x i32> <i32 7, i32 undef, i32 3, i32 5>
+  %I185 = insertelement <4 x i16> %Shuff106, i16 %Tr93, i32 1
+  %ZE186 = zext i32 %E162 to i64
+  %Sl187 = select i1 %Cmp95, <8 x i8> %B100, <8 x i8> %B100
+  %Cmp188 = icmp uge i16 %B115, %Sl11
+  br i1 %Cmp188, label %CF236, label %CF242
+
+CF242:                                            ; preds = %CF242, %CF250, %CF248, %CF236
+  %L189 = load i8, i8* %0
+  store i8 %Sl29, i8* %0
+  %E190 = extractelement <4 x i16> %B9, i32 3
+  %Shuff191 = shufflevector <4 x i16> %Shuff26, <4 x i16> %Shuff26, <4 x i32> <i32 6, i32 0, i32 2, i32 4>
+  %I192 = insertelement <1 x i32> %I122, i32 %3, i32 0
+  %B193 = udiv i8 %5, %L168
+  %Se194 = sext <8 x i1> %Sl71 to <8 x i32>
+  %Sl195 = select i1 %Cmp188, i8 %L182, i8 %L168
+  %Cmp196 = icmp slt i16 %B77, %Sl102
+  br i1 %Cmp196, label %CF242, label %CF250
+
+CF250:                                            ; preds = %CF242
+  %L197 = load i64, i64* %Sl
+  store i32 %ZE, i32* %Sl117
+  %E198 = extractelement <2 x i1> %Shuff20, i32 1
+  br i1 %E198, label %CF242, label %CF244
+
+CF244:                                            ; preds = %CF244, %CF250
+  %Shuff199 = shufflevector <1 x i32> %Shuff46, <1 x i32> %Shuff177, <1 x i32> zeroinitializer
+  %I200 = insertelement <4 x i16> %Shuff191, i16 %Sl86, i32 0
+  %B201 = mul i16 %ZE41, %E169
+  %Se202 = sext <4 x i16> %I171 to <4 x i64>
+  %Sl203 = select i1 %Sl166, i32 %E162, i32 %E82
+  %Cmp204 = icmp ule i16 %E32, %E120
+  br i1 %Cmp204, label %CF244, label %CF248
+
+CF248:                                            ; preds = %CF244
+  %L205 = load float, float* %A3
+  store i32 %Tr, i32* %PC78
+  %E206 = extractelement <2 x i1> %Shuff20, i32 1
+  br i1 %E206, label %CF242, label %CF243
+
+CF243:                                            ; preds = %CF243, %CF273, %CF248
+  %Shuff207 = shufflevector <8 x i1> zeroinitializer, <8 x i1> %Sl71, <8 x i32> <i32 4, i32 6, i32 8, i32 undef, i32 12, i32 undef, i32 undef, i32 2>
+  %I208 = insertelement <2 x i1> %Shuff20, i1 %E198, i32 0
+  %B209 = xor <4 x i16> %I129, %I34
+  %FC210 = uitofp <8 x i8> zeroinitializer to <8 x double>
+  %Sl211 = select i1 %E74, i16 %Tr93, i16 %E19
+  %Cmp212 = icmp ugt i32 %Se, %E38
+  br i1 %Cmp212, label %CF243, label %CF273
+
+CF273:                                            ; preds = %CF243
+  %L213 = load i32, i32* %PC78
+  store i8 %L168, i8* %0
+  %E214 = extractelement <2 x i32> %Shuff113, i32 1
+  %Shuff215 = shufflevector <4 x i16> %Shuff128, <4 x i16> %I137, <4 x i32> <i32 6, i32 0, i32 2, i32 4>
+  %I216 = insertelement <2 x i1> %Shuff20, i1 %Cmp30, i32 0
+  %B217 = sub <4 x i16> %Shuff83, %I185
+  %Tr218 = trunc <4 x i16> %B9 to <4 x i1>
+  %Sl219 = select i1 %Cmp154, i8 %B, i8 %5
+  %Cmp220 = icmp uge <4 x i64> %Shuff52, %Shuff52
+  %L221 = load i32, i32* %Sl117
+  store i8 %L168, i8* %0
+  %E222 = extractelement <4 x i16> %Shuff191, i32 0
+  %Shuff223 = shufflevector <4 x i16> %Shuff26, <4 x i16> %I34, <4 x i32> <i32 undef, i32 1, i32 3, i32 5>
+  %I224 = insertelement <4 x i16> %Shuff26, i16 %Tr145, i32 1
+  %FC225 = sitofp i1 %Cmp56 to float
+  %Sl226 = select i1 %E, i1 %Cmp154, i1 %Sl166
+  br i1 %Sl226, label %CF243, label %CF252
+
+CF252:                                            ; preds = %CF273
+  %Cmp227 = icmp ugt <4 x i64> %Shuff143, zeroinitializer
+  %L228 = load i32, i32* %Sl117
+  store i32 %Tr, i32* %PC78
+  %E229 = extractelement <4 x i16> %Shuff163, i32 2
+  %Shuff230 = shufflevector <1 x i32> %Shuff199, <1 x i32> zeroinitializer, <1 x i32> <i32 1>
+  %I231 = insertelement <4 x i16> %Shuff106, i16 %E32, i32 1
+  %B232 = srem i32 %Sl203, %Sl203
+  %FC233 = fptoui double 0x5A7FED9E637D2C1C to i32
+  %Sl234 = select i1 %Cmp103, i8 %B193, i8 %L168
+  %Cmp235 = icmp uge <2 x i16> zeroinitializer, zeroinitializer
+  store i32 %ZE, i32* %PC78
+  store i64 %L5, i64* %Sl
+  store i8 33, i8* %0
+  store i8 %L168, i8* %0
+  store i1 %Sl226, i1* %PC
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/stack-guard.ll b/test/CodeGen/SystemZ/stack-guard.ll
index 0889e7ba941e2fa1a235df1eb3fe0ba35bad6bf3..2908cbe92bbb1c335a9db762660384415825667a 100644
--- a/test/CodeGen/SystemZ/stack-guard.ll
+++ b/test/CodeGen/SystemZ/stack-guard.ll
@@ -17,19 +17,19 @@ define i32 @test_stack_guard() #0 {
 entry:
   %a1 = alloca [256 x i32], align 4
   %0 = bitcast [256 x i32]* %a1 to i8*
-  call void @llvm.lifetime.start(i64 1024, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 1024, i8* %0)
   %arraydecay = getelementptr inbounds [256 x i32], [256 x i32]* %a1, i64 0, i64 0
   call void @foo3(i32* %arraydecay)
-  call void @llvm.lifetime.end(i64 1024, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 1024, i8* %0)
   ret i32 0
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
 
 declare void @foo3(i32*)
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 
 attributes #0 = { sspstrong }
diff --git a/test/CodeGen/SystemZ/strcmp-02.ll b/test/CodeGen/SystemZ/strcmp-02.ll
deleted file mode 100644
index 99d7d9cfa6923bde9fdfd4643dd7bc9b66733632..0000000000000000000000000000000000000000
--- a/test/CodeGen/SystemZ/strcmp-02.ll
+++ /dev/null
@@ -1,72 +0,0 @@
-; Test strcmp using CLST, i64 version.
-;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-
-declare i64 @strcmp(i8 *%src1, i8 *%src2)
-
-; Check a case where the result is used as an integer.
-define i64 @f1(i8 *%src1, i8 *%src2) {
-; CHECK-LABEL: f1:
-; CHECK: lhi %r0, 0
-; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: clst %r2, %r3
-; CHECK-NEXT: jo [[LABEL]]
-; CHECK-NEXT: BB#{{[0-9]+}}
-; CHECK-NEXT: ipm [[REG:%r[0-5]]]
-; CHECK: srl [[REG]], 28
-; CHECK: rll [[REG]], [[REG]], 31
-; CHECK: lgfr %r2, [[REG]]
-; CHECK: br %r14
-  %res = call i64 @strcmp(i8 *%src1, i8 *%src2)
-  ret i64 %res
-}
-
-; Check a case where the result is tested for equality.
-define void @f2(i8 *%src1, i8 *%src2, i64 *%dest) {
-; CHECK-LABEL: f2:
-; CHECK: lhi %r0, 0
-; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: clst %r2, %r3
-; CHECK-NEXT: jo [[LABEL]]
-; CHECK-NEXT: BB#{{[0-9]+}}
-; CHECK-NEXT: ber %r14
-; CHECK: br %r14
-  %res = call i64 @strcmp(i8 *%src1, i8 *%src2)
-  %cmp = icmp eq i64 %res, 0
-  br i1 %cmp, label %exit, label %store
-
-store:
-  store i64 0, i64 *%dest
-  br label %exit
-
-exit:
-  ret void
-}
-
-; Test a case where the result is used both as an integer and for
-; branching.
-define i64 @f3(i8 *%src1, i8 *%src2, i64 *%dest) {
-; CHECK-LABEL: f3:
-; CHECK: lhi %r0, 0
-; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK: clst %r2, %r3
-; CHECK-NEXT: jo [[LABEL]]
-; CHECK-NEXT: BB#{{[0-9]+}}
-; CHECK-NEXT: ipm [[REG:%r[0-5]]]
-; CHECK: srl [[REG]], 28
-; CHECK: rll [[REG]], [[REG]], 31
-; CHECK: lgfr %r2, [[REG]]
-; CHECK: blr %r14
-; CHECK: br %r14
-entry:
-  %res = call i64 @strcmp(i8 *%src1, i8 *%src2)
-  %cmp = icmp slt i64 %res, 0
-  br i1 %cmp, label %exit, label %store
-
-store:
-  store i64 0, i64 *%dest
-  br label %exit
-
-exit:
-  ret i64 %res
-}
diff --git a/test/CodeGen/SystemZ/strlen-02.ll b/test/CodeGen/SystemZ/strlen-02.ll
deleted file mode 100644
index e1abbff4b4e02694338e77e3215e03a1dba7d41c..0000000000000000000000000000000000000000
--- a/test/CodeGen/SystemZ/strlen-02.ll
+++ /dev/null
@@ -1,39 +0,0 @@
-; Test strlen using SRST, i32 version.
-;
-; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
-
-declare i32 @strlen(i8 *%src)
-declare i32 @strnlen(i8 *%src, i32 %len)
-
-; Test strlen with an i32-based prototype.  It would also be valid for
-; the uses of %r3 and REG after the LGR to be swapped.
-define i32 @f1(i32 %dummy, i8 *%src) {
-; CHECK-LABEL: f1:
-; CHECK-DAG: lhi %r0, 0
-; CHECK-DAG: lghi %r2, 0
-; CHECK-DAG: lgr [[REG:%r[145]]], %r3
-; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK-NEXT: srst %r2, [[REG]]
-; CHECK-NEXT: jo [[LABEL]]
-; CHECK-NEXT: BB#{{[0-9]+}}
-; CHECK-NEXT: sgr %r2, %r3
-; CHECK: br %r14
-  %res = call i32 @strlen(i8 *%src)
-  ret i32 %res
-}
-
-; Test strnlen with an i32-based prototype.
-define i32 @f2(i32 zeroext %len, i8 *%src) {
-; CHECK-LABEL: f2:
-; CHECK-DAG: agr %r2, %r3
-; CHECK-DAG: lhi %r0, 0
-; CHECK-DAG: lgr [[REG:%r[145]]], %r3
-; CHECK: [[LABEL:\.[^:]*]]:
-; CHECK-NEXT: srst %r2, [[REG]]
-; CHECK-NEXT: jo [[LABEL]]
-; CHECK-NEXT: BB#{{[0-9]+}}
-; CHECK-NEXT: sgr %r2, %r3
-; CHECK: br %r14
-  %res = call i32 @strnlen(i8 *%src, i32 %len)
-  ret i32 %res
-}
diff --git a/test/CodeGen/SystemZ/unaligned-01.ll b/test/CodeGen/SystemZ/unaligned-01.ll
index 94cad0e1743a3f8ead015bfee0673bca1c339462..2af1aa79a23f735cd6f01d740f72a74715d6492f 100644
--- a/test/CodeGen/SystemZ/unaligned-01.ll
+++ b/test/CodeGen/SystemZ/unaligned-01.ll
@@ -1,10 +1,7 @@
 ; Check that unaligned accesses are allowed in general.  We check the
 ; few exceptions (like CRL) in their respective test files.
 ;
-; FIXME: -combiner-alias-analysis (the default for SystemZ) stops
-;        f1 from being optimized.
-; RUN: llc < %s -mtriple=s390x-linux-gnu -combiner-alias-analysis=false \
-; RUN:   | FileCheck %s
+; RUN: llc < %s -mtriple=s390x-linux-gnu | FileCheck %s
 
 ; Check that these four byte stores become a single word store.
 define void @f1(i8 *%ptr) {
diff --git a/test/CodeGen/SystemZ/vec-cmp-cmp-logic-select.ll b/test/CodeGen/SystemZ/vec-cmp-cmp-logic-select.ll
new file mode 100644
index 0000000000000000000000000000000000000000..271513f2e9edaf8e1253cd7dc305f57638f4227d
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-cmp-cmp-logic-select.ll
@@ -0,0 +1,5784 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+;
+; Test that a vector select with a logic combination of two compares do not
+; produce any unnecessary pack, unpack or shift instructions.
+; And, Or and Xor are tested.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13   | FileCheck %s
+
+
+define <2 x i8> @fun0(<2 x i8> %val1, <2 x i8> %val2, <2 x i8> %val3, <2 x i8> %val4, <2 x i8> %val5, <2 x i8> %val6) {
+; CHECK-LABEL: fun0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v28, %v30
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = icmp eq <2 x i8> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i8> %val5, <2 x i8> %val6
+  ret <2 x i8> %sel
+}
+
+define <2 x i16> @fun1(<2 x i8> %val1, <2 x i8> %val2, <2 x i8> %val3, <2 x i8> %val4, <2 x i16> %val5, <2 x i16> %val6) {
+; CHECK-LABEL: fun1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v28, %v30
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = icmp eq <2 x i8> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x i8> @fun2(<2 x i8> %val1, <2 x i8> %val2, <2 x i16> %val3, <2 x i16> %val4, <2 x i8> %val5, <2 x i8> %val6) {
+; CHECK-LABEL: fun2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v1, %v28, %v30
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vpkh %v1, %v1, %v1
+; CHECK-NEXT:    vn %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = icmp eq <2 x i16> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i8> %val5, <2 x i8> %val6
+  ret <2 x i8> %sel
+}
+
+define <2 x i32> @fun3(<2 x i8> %val1, <2 x i8> %val2, <2 x i32> %val3, <2 x i32> %val4, <2 x i32> %val5, <2 x i32> %val6) {
+; CHECK-LABEL: fun3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = icmp eq <2 x i32> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i32> %val5, <2 x i32> %val6
+  ret <2 x i32> %sel
+}
+
+define <2 x i32> @fun4(<2 x i8> %val1, <2 x i8> %val2, <2 x i64> %val3, <2 x i64> %val4, <2 x i32> %val5, <2 x i32> %val6) {
+; CHECK-LABEL: fun4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vceqg %v0, %v28, %v30
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vpkg %v0, %v0, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = icmp eq <2 x i64> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i32> %val5, <2 x i32> %val6
+  ret <2 x i32> %sel
+}
+
+define <2 x i16> @fun5(<2 x i8> %val1, <2 x i8> %val2, <2 x float> %val3, <2 x float> %val4, <2 x i16> %val5, <2 x i16> %val6) {
+; CHECK-LABEL: fun5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = fcmp ogt <2 x float> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x i64> @fun6(<2 x i8> %val1, <2 x i8> %val2, <2 x double> %val3, <2 x double> %val4, <2 x i64> %val5, <2 x i64> %val6) {
+; CHECK-LABEL: fun6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v28, %v30
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = fcmp ogt <2 x double> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i64> %val5, <2 x i64> %val6
+  ret <2 x i64> %sel
+}
+
+define <2 x i8> @fun7(<2 x i16> %val1, <2 x i16> %val2, <2 x i16> %val3, <2 x i16> %val4, <2 x i8> %val5, <2 x i8> %val6) {
+; CHECK-LABEL: fun7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v28, %v30
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vpkh %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = icmp eq <2 x i16> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i8> %val5, <2 x i8> %val6
+  ret <2 x i8> %sel
+}
+
+define <2 x i16> @fun8(<2 x i16> %val1, <2 x i16> %val2, <2 x i16> %val3, <2 x i16> %val4, <2 x i16> %val5, <2 x i16> %val6) {
+; CHECK-LABEL: fun8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v28, %v30
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = icmp eq <2 x i16> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x i32> @fun9(<2 x i16> %val1, <2 x i16> %val2, <2 x i16> %val3, <2 x i16> %val4, <2 x i32> %val5, <2 x i32> %val6) {
+; CHECK-LABEL: fun9:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v28, %v30
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = icmp eq <2 x i16> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i32> %val5, <2 x i32> %val6
+  ret <2 x i32> %sel
+}
+
+define <2 x i8> @fun10(<2 x i16> %val1, <2 x i16> %val2, <2 x i32> %val3, <2 x i32> %val4, <2 x i8> %val5, <2 x i8> %val6) {
+; CHECK-LABEL: fun10:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v1, %v28, %v30
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vpkf %v1, %v1, %v1
+; CHECK-NEXT:    vn %v0, %v0, %v1
+; CHECK-NEXT:    vpkh %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = icmp eq <2 x i32> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i8> %val5, <2 x i8> %val6
+  ret <2 x i8> %sel
+}
+
+define <2 x i8> @fun11(<2 x i16> %val1, <2 x i16> %val2, <2 x i64> %val3, <2 x i64> %val4, <2 x i8> %val5, <2 x i8> %val6) {
+; CHECK-LABEL: fun11:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    larl %r1, .LCPI11_0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vceqg %v0, %v28, %v30
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vpkh %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = icmp eq <2 x i64> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i8> %val5, <2 x i8> %val6
+  ret <2 x i8> %sel
+}
+
+define <2 x double> @fun12(<2 x i16> %val1, <2 x i16> %val2, <2 x float> %val3, <2 x float> %val4, <2 x double> %val5, <2 x double> %val6) {
+; CHECK-LABEL: fun12:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = fcmp ogt <2 x float> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x double> %val5, <2 x double> %val6
+  ret <2 x double> %sel
+}
+
+define <2 x i16> @fun13(<2 x i16> %val1, <2 x i16> %val2, <2 x double> %val3, <2 x double> %val4, <2 x i16> %val5, <2 x i16> %val6) {
+; CHECK-LABEL: fun13:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    larl %r1, .LCPI13_0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vfchdb %v0, %v28, %v30
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = fcmp ogt <2 x double> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x i16> @fun14(<2 x i32> %val1, <2 x i32> %val2, <2 x i32> %val3, <2 x i32> %val4, <2 x i16> %val5, <2 x i16> %val6) {
+; CHECK-LABEL: fun14:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i32> %val1, %val2
+  %cmp1 = icmp eq <2 x i32> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x i32> @fun15(<2 x i32> %val1, <2 x i32> %val2, <2 x i32> %val3, <2 x i32> %val4, <2 x i32> %val5, <2 x i32> %val6) {
+; CHECK-LABEL: fun15:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i32> %val1, %val2
+  %cmp1 = icmp eq <2 x i32> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i32> %val5, <2 x i32> %val6
+  ret <2 x i32> %sel
+}
+
+define <2 x i64> @fun16(<2 x i32> %val1, <2 x i32> %val2, <2 x i32> %val3, <2 x i32> %val4, <2 x i64> %val5, <2 x i64> %val6) {
+; CHECK-LABEL: fun16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i32> %val1, %val2
+  %cmp1 = icmp eq <2 x i32> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i64> %val5, <2 x i64> %val6
+  ret <2 x i64> %sel
+}
+
+define <2 x i64> @fun17(<2 x i32> %val1, <2 x i32> %val2, <2 x i64> %val3, <2 x i64> %val4, <2 x i64> %val5, <2 x i64> %val6) {
+; CHECK-LABEL: fun17:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vceqg %v0, %v28, %v30
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i32> %val1, %val2
+  %cmp1 = icmp eq <2 x i64> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i64> %val5, <2 x i64> %val6
+  ret <2 x i64> %sel
+}
+
+define <2 x i16> @fun18(<2 x i32> %val1, <2 x i32> %val2, <2 x float> %val3, <2 x float> %val4, <2 x i16> %val5, <2 x i16> %val6) {
+; CHECK-LABEL: fun18:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i32> %val1, %val2
+  %cmp1 = fcmp ogt <2 x float> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x float> @fun19(<2 x i32> %val1, <2 x i32> %val2, <2 x double> %val3, <2 x double> %val4, <2 x float> %val5, <2 x float> %val6) {
+; CHECK-LABEL: fun19:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v1, %v28, %v30
+; CHECK-NEXT:    vceqf %v0, %v24, %v26
+; CHECK-NEXT:    vpkg %v1, %v1, %v1
+; CHECK-NEXT:    vn %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i32> %val1, %val2
+  %cmp1 = fcmp ogt <2 x double> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x float> %val5, <2 x float> %val6
+  ret <2 x float> %sel
+}
+
+define <2 x i16> @fun20(<2 x i64> %val1, <2 x i64> %val2, <2 x i64> %val3, <2 x i64> %val4, <2 x i16> %val5, <2 x i16> %val6) {
+; CHECK-LABEL: fun20:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v28, %v30
+; CHECK-NEXT:    vceqg %v1, %v24, %v26
+; CHECK-NEXT:    larl %r1, .LCPI20_0
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i64> %val1, %val2
+  %cmp1 = icmp eq <2 x i64> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x i64> @fun21(<2 x i64> %val1, <2 x i64> %val2, <2 x i64> %val3, <2 x i64> %val4, <2 x i64> %val5, <2 x i64> %val6) {
+; CHECK-LABEL: fun21:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v28, %v30
+; CHECK-NEXT:    vceqg %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i64> %val1, %val2
+  %cmp1 = icmp eq <2 x i64> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i64> %val5, <2 x i64> %val6
+  ret <2 x i64> %sel
+}
+
+define <2 x i64> @fun22(<2 x i64> %val1, <2 x i64> %val2, <2 x float> %val3, <2 x float> %val4, <2 x i64> %val5, <2 x i64> %val6) {
+; CHECK-LABEL: fun22:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vceqg %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i64> %val1, %val2
+  %cmp1 = fcmp ogt <2 x float> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i64> %val5, <2 x i64> %val6
+  ret <2 x i64> %sel
+}
+
+define <2 x i16> @fun23(<2 x i64> %val1, <2 x i64> %val2, <2 x double> %val3, <2 x double> %val4, <2 x i16> %val5, <2 x i16> %val6) {
+; CHECK-LABEL: fun23:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v28, %v30
+; CHECK-NEXT:    vceqg %v1, %v24, %v26
+; CHECK-NEXT:    larl %r1, .LCPI23_0
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i64> %val1, %val2
+  %cmp1 = fcmp ogt <2 x double> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x float> @fun24(<2 x float> %val1, <2 x float> %val2, <2 x float> %val3, <2 x float> %val4, <2 x float> %val5, <2 x float> %val6) {
+; CHECK-LABEL: fun24:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v24, %v24
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vmrlf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <2 x float> %val1, %val2
+  %cmp1 = fcmp ogt <2 x float> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x float> %val5, <2 x float> %val6
+  ret <2 x float> %sel
+}
+
+define <2 x i32> @fun25(<2 x float> %val1, <2 x float> %val2, <2 x double> %val3, <2 x double> %val4, <2 x i32> %val5, <2 x i32> %val6) {
+; CHECK-LABEL: fun25:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v26, %v26
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v26, %v26
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vfchdb %v1, %v28, %v30
+; CHECK-NEXT:    vpkg %v1, %v1, %v1
+; CHECK-NEXT:    vn %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <2 x float> %val1, %val2
+  %cmp1 = fcmp ogt <2 x double> %val3, %val4
+  %and = and <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i32> %val5, <2 x i32> %val6
+  ret <2 x i32> %sel
+}
+
+define <4 x i16> @fun26(<4 x i32> %val1, <4 x i32> %val2, <4 x i32> %val3, <4 x i32> %val4, <4 x i16> %val5, <4 x i16> %val6) {
+; CHECK-LABEL: fun26:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i32> %val1, %val2
+  %cmp1 = icmp eq <4 x i32> %val3, %val4
+  %and = and <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i16> %val5, <4 x i16> %val6
+  ret <4 x i16> %sel
+}
+
+define <4 x i32> @fun27(<4 x i32> %val1, <4 x i32> %val2, <4 x i32> %val3, <4 x i32> %val4, <4 x i32> %val5, <4 x i32> %val6) {
+; CHECK-LABEL: fun27:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i32> %val1, %val2
+  %cmp1 = icmp eq <4 x i32> %val3, %val4
+  %and = and <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i32> %val5, <4 x i32> %val6
+  ret <4 x i32> %sel
+}
+
+define <4 x i64> @fun28(<4 x i32> %val1, <4 x i32> %val2, <4 x i32> %val3, <4 x i32> %val4, <4 x i64> %val5, <4 x i64> %val6) {
+; CHECK-LABEL: fun28:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v1
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i32> %val1, %val2
+  %cmp1 = icmp eq <4 x i32> %val3, %val4
+  %and = and <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i64> %val5, <4 x i64> %val6
+  ret <4 x i64> %sel
+}
+
+define <4 x i32> @fun29(<4 x i32> %val1, <4 x i32> %val2, <4 x i64> %val3, <4 x i64> %val4, <4 x i32> %val5, <4 x i32> %val6) {
+; CHECK-LABEL: fun29:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v30, %v27
+; CHECK-NEXT:    vceqg %v1, %v28, %v25
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v29, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i32> %val1, %val2
+  %cmp1 = icmp eq <4 x i64> %val3, %val4
+  %and = and <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i32> %val5, <4 x i32> %val6
+  ret <4 x i32> %sel
+}
+
+define <4 x i16> @fun30(<4 x i32> %val1, <4 x i32> %val2, <4 x float> %val3, <4 x float> %val4, <4 x i16> %val5, <4 x i16> %val6) {
+; CHECK-LABEL: fun30:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i32> %val1, %val2
+  %cmp1 = fcmp ogt <4 x float> %val3, %val4
+  %and = and <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i16> %val5, <4 x i16> %val6
+  ret <4 x i16> %sel
+}
+
+define <4 x i8> @fun31(<4 x i32> %val1, <4 x i32> %val2, <4 x double> %val3, <4 x double> %val4, <4 x i8> %val5, <4 x i8> %val6) {
+; CHECK-LABEL: fun31:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v30, %v27
+; CHECK-NEXT:    vfchdb %v1, %v28, %v25
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    larl %r1, .LCPI31_0
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v29, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i32> %val1, %val2
+  %cmp1 = fcmp ogt <4 x double> %val3, %val4
+  %and = and <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i8> %val5, <4 x i8> %val6
+  ret <4 x i8> %sel
+}
+
+define <4 x i32> @fun32(<4 x i64> %val1, <4 x i64> %val2, <4 x i64> %val3, <4 x i64> %val4, <4 x i32> %val5, <4 x i32> %val6) {
+; CHECK-LABEL: fun32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v27, %v31
+; CHECK-NEXT:    vceqg %v1, %v26, %v30
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vceqg %v1, %v25, %v29
+; CHECK-NEXT:    vceqg %v2, %v24, %v28
+; CHECK-NEXT:    vn %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i64> %val1, %val2
+  %cmp1 = icmp eq <4 x i64> %val3, %val4
+  %and = and <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i32> %val5, <4 x i32> %val6
+  ret <4 x i32> %sel
+}
+
+define <4 x i64> @fun33(<4 x i64> %val1, <4 x i64> %val2, <4 x i64> %val3, <4 x i64> %val4, <4 x i64> %val5, <4 x i64> %val6) {
+; CHECK-LABEL: fun33:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v25, %v29
+; CHECK-NEXT:    vceqg %v1, %v24, %v28
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vceqg %v0, %v27, %v31
+; CHECK-NEXT:    vceqg %v1, %v26, %v30
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i64> %val1, %val2
+  %cmp1 = icmp eq <4 x i64> %val3, %val4
+  %and = and <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i64> %val5, <4 x i64> %val6
+  ret <4 x i64> %sel
+}
+
+define <4 x i64> @fun34(<4 x i64> %val1, <4 x i64> %val2, <4 x float> %val3, <4 x float> %val4, <4 x i64> %val5, <4 x i64> %val6) {
+; CHECK-LABEL: fun34:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v27, %v27
+; CHECK-NEXT:    vmrlf %v1, %v25, %v25
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v27, %v27
+; CHECK-NEXT:    vmrhf %v2, %v25, %v25
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vceqg %v2, %v24, %v28
+; CHECK-NEXT:    vn %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v29, %v2, %v1
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vceqg %v1, %v26, %v30
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v31, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i64> %val1, %val2
+  %cmp1 = fcmp ogt <4 x float> %val3, %val4
+  %and = and <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i64> %val5, <4 x i64> %val6
+  ret <4 x i64> %sel
+}
+
+define <4 x float> @fun35(<4 x i64> %val1, <4 x i64> %val2, <4 x double> %val3, <4 x double> %val4, <4 x float> %val5, <4 x float> %val6) {
+; CHECK-LABEL: fun35:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v27, %v31
+; CHECK-NEXT:    vceqg %v1, %v26, %v30
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vfchdb %v1, %v25, %v29
+; CHECK-NEXT:    vceqg %v2, %v24, %v28
+; CHECK-NEXT:    vn %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i64> %val1, %val2
+  %cmp1 = fcmp ogt <4 x double> %val3, %val4
+  %and = and <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x float> %val5, <4 x float> %val6
+  ret <4 x float> %sel
+}
+
+define <4 x i16> @fun36(<4 x float> %val1, <4 x float> %val2, <4 x float> %val3, <4 x float> %val4, <4 x i16> %val5, <4 x i16> %val6) {
+; CHECK-LABEL: fun36:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v24, %v24
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vmrlf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <4 x float> %val1, %val2
+  %cmp1 = fcmp ogt <4 x float> %val3, %val4
+  %and = and <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i16> %val5, <4 x i16> %val6
+  ret <4 x i16> %sel
+}
+
+define <4 x float> @fun37(<4 x float> %val1, <4 x float> %val2, <4 x float> %val3, <4 x float> %val4, <4 x float> %val5, <4 x float> %val6) {
+; CHECK-LABEL: fun37:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v24, %v24
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vmrlf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <4 x float> %val1, %val2
+  %cmp1 = fcmp ogt <4 x float> %val3, %val4
+  %and = and <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x float> %val5, <4 x float> %val6
+  ret <4 x float> %sel
+}
+
+define <4 x double> @fun38(<4 x float> %val1, <4 x float> %val2, <4 x float> %val3, <4 x float> %val4, <4 x double> %val5, <4 x double> %val6) {
+; CHECK-LABEL: fun38:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v24, %v24
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vmrlf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v1
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <4 x float> %val1, %val2
+  %cmp1 = fcmp ogt <4 x float> %val3, %val4
+  %and = and <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x double> %val5, <4 x double> %val6
+  ret <4 x double> %sel
+}
+
+define <4 x i8> @fun39(<4 x float> %val1, <4 x float> %val2, <4 x double> %val3, <4 x double> %val4, <4 x i8> %val5, <4 x i8> %val6) {
+; CHECK-LABEL: fun39:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v30, %v27
+; CHECK-NEXT:    vfchdb %v1, %v28, %v25
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vmrlf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vmrhf %v3, %v24, %v24
+; CHECK-NEXT:    larl %r1, .LCPI39_0
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v29, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <4 x float> %val1, %val2
+  %cmp1 = fcmp ogt <4 x double> %val3, %val4
+  %and = and <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i8> %val5, <4 x i8> %val6
+  ret <4 x i8> %sel
+}
+
+define <8 x i8> @fun40(<8 x i16> %val1, <8 x i16> %val2, <8 x i16> %val3, <8 x i16> %val4, <8 x i8> %val5, <8 x i8> %val6) {
+; CHECK-LABEL: fun40:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v28, %v30
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vpkh %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = icmp eq <8 x i16> %val3, %val4
+  %and = and <8 x i1> %cmp0, %cmp1
+  %sel = select <8 x i1> %and, <8 x i8> %val5, <8 x i8> %val6
+  ret <8 x i8> %sel
+}
+
+define <8 x i16> @fun41(<8 x i16> %val1, <8 x i16> %val2, <8 x i16> %val3, <8 x i16> %val4, <8 x i16> %val5, <8 x i16> %val6) {
+; CHECK-LABEL: fun41:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v28, %v30
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = icmp eq <8 x i16> %val3, %val4
+  %and = and <8 x i1> %cmp0, %cmp1
+  %sel = select <8 x i1> %and, <8 x i16> %val5, <8 x i16> %val6
+  ret <8 x i16> %sel
+}
+
+define <8 x i32> @fun42(<8 x i16> %val1, <8 x i16> %val2, <8 x i16> %val3, <8 x i16> %val4, <8 x i32> %val5, <8 x i32> %val6) {
+; CHECK-LABEL: fun42:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v28, %v30
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vuphh %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v1
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = icmp eq <8 x i16> %val3, %val4
+  %and = and <8 x i1> %cmp0, %cmp1
+  %sel = select <8 x i1> %and, <8 x i32> %val5, <8 x i32> %val6
+  ret <8 x i32> %sel
+}
+
+define <8 x i64> @fun43(<8 x i16> %val1, <8 x i16> %val2, <8 x i32> %val3, <8 x i32> %val4, <8 x i64> %val5, <8 x i64> %val6) {
+; CHECK-LABEL: fun43:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vceqf %v0, %v28, %v25
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vn %v0, %v2, %v0
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vuphf %v2, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v29, %v3, %v2
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vsel %v26, %v31, %v2, %v0
+; CHECK-NEXT:    vceqf %v0, %v30, %v27
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = icmp eq <8 x i32> %val3, %val4
+  %and = and <8 x i1> %cmp0, %cmp1
+  %sel = select <8 x i1> %and, <8 x i64> %val5, <8 x i64> %val6
+  ret <8 x i64> %sel
+}
+
+define <8 x i8> @fun44(<8 x i16> %val1, <8 x i16> %val2, <8 x i64> %val3, <8 x i64> %val4, <8 x i8> %val5, <8 x i8> %val6) {
+; CHECK-LABEL: fun44:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vl %v1, 160(%r15)
+; CHECK-NEXT:    vceqg %v0, %v27, %v0
+; CHECK-NEXT:    vceqg %v1, %v25, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqg %v1, %v30, %v31
+; CHECK-NEXT:    vceqg %v2, %v28, %v29
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vlrepg %v1, 200(%r15)
+; CHECK-NEXT:    vlrepg %v2, 192(%r15)
+; CHECK-NEXT:    vpkh %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = icmp eq <8 x i64> %val3, %val4
+  %and = and <8 x i1> %cmp0, %cmp1
+  %sel = select <8 x i1> %and, <8 x i8> %val5, <8 x i8> %val6
+  ret <8 x i8> %sel
+}
+
+define <8 x i16> @fun45(<8 x i16> %val1, <8 x i16> %val2, <8 x float> %val3, <8 x float> %val4, <8 x i16> %val5, <8 x i16> %val6) {
+; CHECK-LABEL: fun45:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v27, %v27
+; CHECK-NEXT:    vmrlf %v1, %v30, %v30
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v27, %v27
+; CHECK-NEXT:    vmrhf %v2, %v30, %v30
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v28, %v28
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v25, %v25
+; CHECK-NEXT:    vmrlf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v25, %v25
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v29, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = fcmp ogt <8 x float> %val3, %val4
+  %and = and <8 x i1> %cmp0, %cmp1
+  %sel = select <8 x i1> %and, <8 x i16> %val5, <8 x i16> %val6
+  ret <8 x i16> %sel
+}
+
+define <8 x i32> @fun46(<8 x i16> %val1, <8 x i16> %val2, <8 x double> %val3, <8 x double> %val4, <8 x i32> %val5, <8 x i32> %val6) {
+; CHECK-LABEL: fun46:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v30, %v31
+; CHECK-NEXT:    vfchdb %v1, %v28, %v29
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vn %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v27, %v0
+; CHECK-NEXT:    vfchdb %v2, %v25, %v2
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vpkg %v0, %v2, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = fcmp ogt <8 x double> %val3, %val4
+  %and = and <8 x i1> %cmp0, %cmp1
+  %sel = select <8 x i1> %and, <8 x i32> %val5, <8 x i32> %val6
+  ret <8 x i32> %sel
+}
+
+define <8 x i32> @fun47(<8 x i32> %val1, <8 x i32> %val2, <8 x i64> %val3, <8 x i64> %val4, <8 x i32> %val5, <8 x i32> %val6) {
+; CHECK-LABEL: fun47:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vl %v1, 160(%r15)
+; CHECK-NEXT:    vceqg %v0, %v27, %v0
+; CHECK-NEXT:    vceqg %v1, %v25, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v24, %v28
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 208(%r15)
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vceqg %v0, %v31, %v0
+; CHECK-NEXT:    vceqg %v1, %v29, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v26, %v30
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vl %v2, 240(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i32> %val1, %val2
+  %cmp1 = icmp eq <8 x i64> %val3, %val4
+  %and = and <8 x i1> %cmp0, %cmp1
+  %sel = select <8 x i1> %and, <8 x i32> %val5, <8 x i32> %val6
+  ret <8 x i32> %sel
+}
+
+define <8 x double> @fun48(<8 x i32> %val1, <8 x i32> %val2, <8 x float> %val3, <8 x float> %val4, <8 x double> %val5, <8 x double> %val6) {
+; CHECK-LABEL: fun48:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v29, %v29
+; CHECK-NEXT:    vmrlf %v1, %v25, %v25
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v29, %v29
+; CHECK-NEXT:    vmrhf %v2, %v25, %v25
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vl %v4, 192(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v24, %v28
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v1
+; CHECK-NEXT:    vmrlf %v1, %v31, %v31
+; CHECK-NEXT:    vmrlf %v2, %v27, %v27
+; CHECK-NEXT:    vmrhf %v3, %v27, %v27
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v31, %v31
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vl %v3, 256(%r15)
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vceqf %v2, %v26, %v30
+; CHECK-NEXT:    vn %v1, %v2, %v1
+; CHECK-NEXT:    vuphf %v2, %v1
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 240(%r15)
+; CHECK-NEXT:    vl %v3, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i32> %val1, %val2
+  %cmp1 = fcmp ogt <8 x float> %val3, %val4
+  %and = and <8 x i1> %cmp0, %cmp1
+  %sel = select <8 x i1> %and, <8 x double> %val5, <8 x double> %val6
+  ret <8 x double> %sel
+}
+
+define <8 x double> @fun49(<8 x i32> %val1, <8 x i32> %val2, <8 x double> %val3, <8 x double> %val4, <8 x double> %val5, <8 x double> %val6) {
+; CHECK-LABEL: fun49:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 160(%r15)
+; CHECK-NEXT:    vceqf %v1, %v24, %v28
+; CHECK-NEXT:    vfchdb %v0, %v25, %v0
+; CHECK-NEXT:    vuphf %v2, %v1
+; CHECK-NEXT:    vn %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vl %v3, 224(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v0, 192(%r15)
+; CHECK-NEXT:    vceqf %v2, %v26, %v30
+; CHECK-NEXT:    vfchdb %v0, %v29, %v0
+; CHECK-NEXT:    vuphf %v3, %v2
+; CHECK-NEXT:    vn %v0, %v3, %v0
+; CHECK-NEXT:    vl %v3, 320(%r15)
+; CHECK-NEXT:    vl %v4, 256(%r15)
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v0
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v27, %v0
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 304(%r15)
+; CHECK-NEXT:    vl %v3, 240(%r15)
+; CHECK-NEXT:    vsel %v26, %v3, %v1, %v0
+; CHECK-NEXT:    vl %v0, 208(%r15)
+; CHECK-NEXT:    vmrlg %v1, %v2, %v2
+; CHECK-NEXT:    vfchdb %v0, %v31, %v0
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vl %v2, 272(%r15)
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 336(%r15)
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i32> %val1, %val2
+  %cmp1 = fcmp ogt <8 x double> %val3, %val4
+  %and = and <8 x i1> %cmp0, %cmp1
+  %sel = select <8 x i1> %and, <8 x double> %val5, <8 x double> %val6
+  ret <8 x double> %sel
+}
+
+define <8 x i64> @fun50(<8 x float> %val1, <8 x float> %val2, <8 x double> %val3, <8 x double> %val4, <8 x i64> %val5, <8 x i64> %val6) {
+; CHECK-LABEL: fun50:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v28, %v28
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v28, %v28
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vl %v3, 224(%r15)
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vl %v4, 256(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vfchdb %v2, %v25, %v2
+; CHECK-NEXT:    vn %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v1
+; CHECK-NEXT:    vmrlf %v1, %v30, %v30
+; CHECK-NEXT:    vmrlf %v2, %v26, %v26
+; CHECK-NEXT:    vmrhf %v3, %v26, %v26
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v30, %v30
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vuphf %v2, %v1
+; CHECK-NEXT:    vfchdb %v3, %v29, %v3
+; CHECK-NEXT:    vn %v2, %v2, %v3
+; CHECK-NEXT:    vl %v3, 320(%r15)
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vl %v3, 240(%r15)
+; CHECK-NEXT:    vfchdb %v2, %v27, %v2
+; CHECK-NEXT:    vn %v0, %v0, %v2
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vsel %v26, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v2, 272(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vfchdb %v1, %v31, %v1
+; CHECK-NEXT:    vn %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 336(%r15)
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <8 x float> %val1, %val2
+  %cmp1 = fcmp ogt <8 x double> %val3, %val4
+  %and = and <8 x i1> %cmp0, %cmp1
+  %sel = select <8 x i1> %and, <8 x i64> %val5, <8 x i64> %val6
+  ret <8 x i64> %sel
+}
+
+define <16 x i8> @fun51(<16 x i8> %val1, <16 x i8> %val2, <16 x i8> %val3, <16 x i8> %val4, <16 x i8> %val5, <16 x i8> %val6) {
+; CHECK-LABEL: fun51:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v28, %v30
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = icmp eq <16 x i8> %val3, %val4
+  %and = and <16 x i1> %cmp0, %cmp1
+  %sel = select <16 x i1> %and, <16 x i8> %val5, <16 x i8> %val6
+  ret <16 x i8> %sel
+}
+
+define <16 x i16> @fun52(<16 x i8> %val1, <16 x i8> %val2, <16 x i8> %val3, <16 x i8> %val4, <16 x i16> %val5, <16 x i16> %val6) {
+; CHECK-LABEL: fun52:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v28, %v30
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vuphb %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v1
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = icmp eq <16 x i8> %val3, %val4
+  %and = and <16 x i1> %cmp0, %cmp1
+  %sel = select <16 x i1> %and, <16 x i16> %val5, <16 x i16> %val6
+  ret <16 x i16> %sel
+}
+
+define <16 x i64> @fun53(<16 x i8> %val1, <16 x i8> %val2, <16 x i16> %val3, <16 x i16> %val4, <16 x i64> %val5, <16 x i64> %val6) {
+; CHECK-LABEL: fun53:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vceqh %v0, %v28, %v25
+; CHECK-NEXT:    vuphb %v2, %v1
+; CHECK-NEXT:    vn %v0, %v2, %v0
+; CHECK-NEXT:    vuphh %v2, %v0
+; CHECK-NEXT:    vl %v3, 256(%r15)
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vsel %v24, %v29, %v3, %v2
+; CHECK-NEXT:    vpkg %v2, %v0, %v0
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vl %v3, 272(%r15)
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vsel %v26, %v31, %v3, %v2
+; CHECK-NEXT:    vmrlg %v2, %v0, %v0
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vsldb %v0, %v0, %v0, 12
+; CHECK-NEXT:    vl %v3, 288(%r15)
+; CHECK-NEXT:    vl %v4, 160(%r15)
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vl %v3, 176(%r15)
+; CHECK-NEXT:    vl %v4, 192(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vsel %v0, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v3, 320(%r15)
+; CHECK-NEXT:    vceqh %v2, %v30, %v27
+; CHECK-NEXT:    vlr %v30, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vn %v1, %v1, %v2
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vsel %v25, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v3, 336(%r15)
+; CHECK-NEXT:    vl %v4, 208(%r15)
+; CHECK-NEXT:    vpkg %v2, %v1, %v1
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vsel %v27, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v3, 352(%r15)
+; CHECK-NEXT:    vl %v4, 224(%r15)
+; CHECK-NEXT:    vmrlg %v2, %v1, %v1
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vsldb %v1, %v1, %v1, 12
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vsel %v29, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 368(%r15)
+; CHECK-NEXT:    vl %v3, 240(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v31, %v3, %v2, %v1
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = icmp eq <16 x i16> %val3, %val4
+  %and = and <16 x i1> %cmp0, %cmp1
+  %sel = select <16 x i1> %and, <16 x i64> %val5, <16 x i64> %val6
+  ret <16 x i64> %sel
+}
+
+define <16 x i64> @fun54(<16 x i8> %val1, <16 x i8> %val2, <16 x i32> %val3, <16 x i32> %val4, <16 x i64> %val5, <16 x i64> %val6) {
+; CHECK-LABEL: fun54:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vuphb %v2, %v1
+; CHECK-NEXT:    vceqf %v0, %v28, %v29
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vn %v0, %v2, %v0
+; CHECK-NEXT:    vl %v3, 320(%r15)
+; CHECK-NEXT:    vl %v4, 192(%r15)
+; CHECK-NEXT:    vuphf %v2, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 336(%r15)
+; CHECK-NEXT:    vl %v3, 208(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v26, %v3, %v2, %v0
+; CHECK-NEXT:    vpkg %v2, %v1, %v1
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vceqf %v0, %v30, %v31
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vn %v0, %v2, %v0
+; CHECK-NEXT:    vl %v3, 352(%r15)
+; CHECK-NEXT:    vl %v4, 224(%r15)
+; CHECK-NEXT:    vuphf %v2, %v0
+; CHECK-NEXT:    vl %v5, 256(%r15)
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vl %v4, 384(%r15)
+; CHECK-NEXT:    vmrlg %v3, %v1, %v1
+; CHECK-NEXT:    vuphb %v3, %v3
+; CHECK-NEXT:    vceqf %v2, %v25, %v2
+; CHECK-NEXT:    vuphh %v3, %v3
+; CHECK-NEXT:    vn %v2, %v3, %v2
+; CHECK-NEXT:    vuphf %v3, %v2
+; CHECK-NEXT:    vsldb %v1, %v1, %v1, 12
+; CHECK-NEXT:    vsel %v25, %v5, %v4, %v3
+; CHECK-NEXT:    vl %v3, 176(%r15)
+; CHECK-NEXT:    vl %v4, 416(%r15)
+; CHECK-NEXT:    vl %v5, 288(%r15)
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vceqf %v3, %v27, %v3
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vn %v1, %v1, %v3
+; CHECK-NEXT:    vuphf %v3, %v1
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v29, %v5, %v4, %v3
+; CHECK-NEXT:    vl %v3, 368(%r15)
+; CHECK-NEXT:    vl %v4, 240(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v4, %v3, %v0
+; CHECK-NEXT:    vl %v3, 272(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v2, %v2
+; CHECK-NEXT:    vl %v2, 400(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v27, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 432(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v31, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = icmp eq <16 x i32> %val3, %val4
+  %and = and <16 x i1> %cmp0, %cmp1
+  %sel = select <16 x i1> %and, <16 x i64> %val5, <16 x i64> %val6
+  ret <16 x i64> %sel
+}
+
+define <16 x i64> @fun55(<16 x i8> %val1, <16 x i8> %val2, <16 x i64> %val3, <16 x i64> %val4, <16 x i64> %val5, <16 x i64> %val6) {
+; CHECK-LABEL: fun55:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 192(%r15)
+; CHECK-NEXT:    vceqg %v1, %v28, %v0
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v2, %v0
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vn %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 448(%r15)
+; CHECK-NEXT:    vl %v3, 320(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v1
+; CHECK-NEXT:    vpkf %v2, %v0, %v0
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vceqg %v1, %v30, %v1
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vn %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 464(%r15)
+; CHECK-NEXT:    vl %v3, 336(%r15)
+; CHECK-NEXT:    vsel %v26, %v3, %v2, %v1
+; CHECK-NEXT:    vpkg %v2, %v0, %v0
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vl %v3, 352(%r15)
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vceqg %v1, %v25, %v1
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vn %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 480(%r15)
+; CHECK-NEXT:    vsel %v28, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v3, 368(%r15)
+; CHECK-NEXT:    vsldb %v2, %v0, %v0, 6
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vceqg %v1, %v27, %v1
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vn %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 496(%r15)
+; CHECK-NEXT:    vsel %v30, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vl %v3, 384(%r15)
+; CHECK-NEXT:    vmrlg %v2, %v0, %v0
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vceqg %v1, %v29, %v1
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vn %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 512(%r15)
+; CHECK-NEXT:    vsel %v25, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vl %v3, 400(%r15)
+; CHECK-NEXT:    vsldb %v2, %v0, %v0, 10
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vceqg %v1, %v31, %v1
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vn %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 528(%r15)
+; CHECK-NEXT:    vsel %v27, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 288(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vl %v3, 416(%r15)
+; CHECK-NEXT:    vceqg %v1, %v2, %v1
+; CHECK-NEXT:    vsldb %v2, %v0, %v0, 12
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vsldb %v0, %v0, %v0, 14
+; CHECK-NEXT:    vn %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 544(%r15)
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vsel %v29, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 304(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vceqg %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 432(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vn %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 560(%r15)
+; CHECK-NEXT:    vsel %v31, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = icmp eq <16 x i64> %val3, %val4
+  %and = and <16 x i1> %cmp0, %cmp1
+  %sel = select <16 x i1> %and, <16 x i64> %val5, <16 x i64> %val6
+  ret <16 x i64> %sel
+}
+
+define <16 x i16> @fun56(<16 x i8> %val1, <16 x i8> %val2, <16 x float> %val3, <16 x float> %val4, <16 x i16> %val5, <16 x i16> %val6) {
+; CHECK-LABEL: fun56:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v31, %v31
+; CHECK-NEXT:    vmrlf %v1, %v30, %v30
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v31, %v31
+; CHECK-NEXT:    vmrhf %v2, %v30, %v30
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v28, %v28
+; CHECK-NEXT:    vmrlf %v4, %v25, %v25
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v29, %v29
+; CHECK-NEXT:    vmrlf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v29, %v29
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vuphb %v2, %v1
+; CHECK-NEXT:    vn %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vmrlf %v2, %v0, %v0
+; CHECK-NEXT:    vmrlf %v3, %v27, %v27
+; CHECK-NEXT:    vmrhf %v0, %v0, %v0
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vmrhf %v3, %v27, %v27
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v0, %v3, %v0
+; CHECK-NEXT:    vpkg %v0, %v0, %v2
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vmrlf %v3, %v2, %v2
+; CHECK-NEXT:    vmrhf %v2, %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v3, %v4, %v3
+; CHECK-NEXT:    vmrhf %v4, %v25, %v25
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v2, %v4, %v2
+; CHECK-NEXT:    vpkg %v2, %v2, %v3
+; CHECK-NEXT:    vpkf %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = fcmp ogt <16 x float> %val3, %val4
+  %and = and <16 x i1> %cmp0, %cmp1
+  %sel = select <16 x i1> %and, <16 x i16> %val5, <16 x i16> %val6
+  ret <16 x i16> %sel
+}
+
+define <16 x i8> @fun57(<16 x i8> %val1, <16 x i8> %val2, <16 x double> %val3, <16 x double> %val4, <16 x i8> %val5, <16 x i8> %val6) { ; select <16 x i8> under mask (i8 icmp eq) AND (double fcmp ogt)
+; CHECK-LABEL: fun57:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 304(%r15)
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 288(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vl %v2, 256(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v31, %v1
+; CHECK-NEXT:    vfchdb %v2, %v29, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v27, %v1
+; CHECK-NEXT:    vfchdb %v2, %v25, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vfchdb %v2, %v30, %v2
+; CHECK-NEXT:    vfchdb %v3, %v28, %v3
+; CHECK-NEXT:    vpkg %v2, %v3, %v2
+; CHECK-NEXT:    vpkf %v1, %v2, %v1
+; CHECK-NEXT:    vpkh %v0, %v1, %v0
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 336(%r15)
+; CHECK-NEXT:    vl %v2, 320(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = fcmp ogt <16 x double> %val3, %val4
+  %and = and <16 x i1> %cmp0, %cmp1 ; lane set only when both compares hold
+  %sel = select <16 x i1> %and, <16 x i8> %val5, <16 x i8> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <16 x i8> %sel
+}
+
+define <16 x i8> @fun58(<16 x i16> %val1, <16 x i16> %val2, <16 x i16> %val3, <16 x i16> %val4, <16 x i8> %val5, <16 x i8> %val6) { ; select <16 x i8> under mask (i16 icmp eq) AND (i16 icmp eq)
+; CHECK-LABEL: fun58:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v27, %v31
+; CHECK-NEXT:    vceqh %v1, %v26, %v30
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v25, %v29
+; CHECK-NEXT:    vceqh %v2, %v24, %v28
+; CHECK-NEXT:    vn %v1, %v2, %v1
+; CHECK-NEXT:    vpkh %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = icmp eq <16 x i16> %val3, %val4
+  %and = and <16 x i1> %cmp0, %cmp1 ; lane set only when both compares hold
+  %sel = select <16 x i1> %and, <16 x i8> %val5, <16 x i8> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <16 x i8> %sel
+}
+
+define <16 x i16> @fun59(<16 x i16> %val1, <16 x i16> %val2, <16 x i16> %val3, <16 x i16> %val4, <16 x i16> %val5, <16 x i16> %val6) { ; select <16 x i16> under mask (i16 icmp eq) AND (i16 icmp eq)
+; CHECK-LABEL: fun59:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v25, %v29
+; CHECK-NEXT:    vceqh %v1, %v24, %v28
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vceqh %v0, %v27, %v31
+; CHECK-NEXT:    vceqh %v1, %v26, %v30
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = icmp eq <16 x i16> %val3, %val4
+  %and = and <16 x i1> %cmp0, %cmp1 ; lane set only when both compares hold
+  %sel = select <16 x i1> %and, <16 x i16> %val5, <16 x i16> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <16 x i16> %sel
+}
+
+define <16 x i32> @fun60(<16 x i16> %val1, <16 x i16> %val2, <16 x i16> %val3, <16 x i16> %val4, <16 x i32> %val5, <16 x i32> %val6) { ; select <16 x i32> under mask (i16 icmp eq) AND (i16 icmp eq)
+; CHECK-LABEL: fun60:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v25, %v29
+; CHECK-NEXT:    vceqh %v1, %v24, %v28
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vuphh %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v1
+; CHECK-NEXT:    vceqh %v1, %v27, %v31
+; CHECK-NEXT:    vceqh %v2, %v26, %v30
+; CHECK-NEXT:    vn %v1, %v2, %v1
+; CHECK-NEXT:    vl %v3, 256(%r15)
+; CHECK-NEXT:    vl %v4, 192(%r15)
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 240(%r15)
+; CHECK-NEXT:    vl %v3, 176(%r15)
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v26, %v3, %v2, %v0
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = icmp eq <16 x i16> %val3, %val4
+  %and = and <16 x i1> %cmp0, %cmp1 ; lane set only when both compares hold
+  %sel = select <16 x i1> %and, <16 x i32> %val5, <16 x i32> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <16 x i32> %sel
+}
+
+define <16 x i8> @fun61(<16 x i16> %val1, <16 x i16> %val2, <16 x i32> %val3, <16 x i32> %val4, <16 x i8> %val5, <16 x i8> %val6) { ; select <16 x i8> under mask (i16 icmp eq) AND (i32 icmp eq)
+; CHECK-LABEL: fun61:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 208(%r15)
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vceqf %v0, %v31, %v0
+; CHECK-NEXT:    vceqf %v1, %v29, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v26, %v30
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vceqf %v1, %v27, %v1
+; CHECK-NEXT:    vceqf %v2, %v25, %v2
+; CHECK-NEXT:    vpkf %v1, %v2, %v1
+; CHECK-NEXT:    vceqh %v2, %v24, %v28
+; CHECK-NEXT:    vn %v1, %v2, %v1
+; CHECK-NEXT:    vpkh %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = icmp eq <16 x i32> %val3, %val4
+  %and = and <16 x i1> %cmp0, %cmp1 ; lane set only when both compares hold
+  %sel = select <16 x i1> %and, <16 x i8> %val5, <16 x i8> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <16 x i8> %sel
+}
+
+define <16 x i32> @fun62(<16 x i16> %val1, <16 x i16> %val2, <16 x i64> %val3, <16 x i64> %val4, <16 x i32> %val5, <16 x i32> %val6) { ; select <16 x i32> under mask (i16 icmp eq) AND (i64 icmp eq)
+; CHECK-LABEL: fun62:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 240(%r15)
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vceqg %v0, %v27, %v0
+; CHECK-NEXT:    vceqg %v1, %v25, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v24, %v28
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vn %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 416(%r15)
+; CHECK-NEXT:    vl %v3, 352(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v0, 304(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vceqg %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vceqg %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v0, %v2, %v0
+; CHECK-NEXT:    vceqh %v2, %v26, %v30
+; CHECK-NEXT:    vuphh %v3, %v2
+; CHECK-NEXT:    vn %v0, %v3, %v0
+; CHECK-NEXT:    vl %v3, 448(%r15)
+; CHECK-NEXT:    vl %v4, 384(%r15)
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v0
+; CHECK-NEXT:    vl %v0, 272(%r15)
+; CHECK-NEXT:    vl %v3, 256(%r15)
+; CHECK-NEXT:    vceqg %v0, %v31, %v0
+; CHECK-NEXT:    vceqg %v3, %v29, %v3
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vpkg %v0, %v3, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vl %v3, 368(%r15)
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 432(%r15)
+; CHECK-NEXT:    vsel %v26, %v3, %v1, %v0
+; CHECK-NEXT:    vl %v0, 336(%r15)
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vceqg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 320(%r15)
+; CHECK-NEXT:    vceqg %v1, %v3, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlg %v1, %v2, %v2
+; CHECK-NEXT:    vl %v2, 400(%r15)
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 464(%r15)
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = icmp eq <16 x i64> %val3, %val4
+  %and = and <16 x i1> %cmp0, %cmp1 ; lane set only when both compares hold
+  %sel = select <16 x i1> %and, <16 x i32> %val5, <16 x i32> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <16 x i32> %sel
+}
+
+define <16 x double> @fun63(<16 x i16> %val1, <16 x i16> %val2, <16 x float> %val3, <16 x float> %val4, <16 x double> %val5, <16 x double> %val6) { ; select <16 x double> under mask (i16 icmp eq) AND (float fcmp ogt)
+; CHECK-LABEL: fun63:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 160(%r15)
+; CHECK-NEXT:    vmrlf %v1, %v0, %v0
+; CHECK-NEXT:    vmrlf %v2, %v25, %v25
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v0, %v0, %v0
+; CHECK-NEXT:    vmrhf %v2, %v25, %v25
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vl %v3, 352(%r15)
+; CHECK-NEXT:    vl %v4, 224(%r15)
+; CHECK-NEXT:    vl %v5, 416(%r15)
+; CHECK-NEXT:    vl %v6, 288(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v2, %v0
+; CHECK-NEXT:    vpkg %v0, %v0, %v1
+; CHECK-NEXT:    vceqh %v1, %v24, %v28
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vn %v0, %v2, %v0
+; CHECK-NEXT:    vuphf %v2, %v0
+; CHECK-NEXT:    vsel %v24, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vmrlf %v3, %v2, %v2
+; CHECK-NEXT:    vmrlf %v4, %v27, %v27
+; CHECK-NEXT:    vmrhf %v2, %v2, %v2
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v3, %v4, %v3
+; CHECK-NEXT:    vmrhf %v4, %v27, %v27
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v2, %v4, %v2
+; CHECK-NEXT:    vl %v4, 256(%r15)
+; CHECK-NEXT:    vpkg %v2, %v2, %v3
+; CHECK-NEXT:    vl %v3, 384(%r15)
+; CHECK-NEXT:    vn %v1, %v1, %v2
+; CHECK-NEXT:    vuphf %v2, %v1
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 192(%r15)
+; CHECK-NEXT:    vmrlf %v3, %v2, %v2
+; CHECK-NEXT:    vmrlf %v4, %v29, %v29
+; CHECK-NEXT:    vmrhf %v2, %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v3, %v4, %v3
+; CHECK-NEXT:    vmrhf %v4, %v29, %v29
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v2, %v4, %v2
+; CHECK-NEXT:    vpkg %v2, %v2, %v3
+; CHECK-NEXT:    vceqh %v3, %v26, %v30
+; CHECK-NEXT:    vuphh %v4, %v3
+; CHECK-NEXT:    vn %v2, %v4, %v2
+; CHECK-NEXT:    vuphf %v4, %v2
+; CHECK-NEXT:    vsel %v25, %v6, %v5, %v4
+; CHECK-NEXT:    vl %v4, 208(%r15)
+; CHECK-NEXT:    vmrlf %v5, %v4, %v4
+; CHECK-NEXT:    vmrlf %v6, %v31, %v31
+; CHECK-NEXT:    vmrhf %v4, %v4, %v4
+; CHECK-NEXT:    vmrlg %v3, %v3, %v3
+; CHECK-NEXT:    vuphh %v3, %v3
+; CHECK-NEXT:    vldeb %v5, %v5
+; CHECK-NEXT:    vldeb %v6, %v6
+; CHECK-NEXT:    vfchdb %v5, %v6, %v5
+; CHECK-NEXT:    vmrhf %v6, %v31, %v31
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vldeb %v6, %v6
+; CHECK-NEXT:    vfchdb %v4, %v6, %v4
+; CHECK-NEXT:    vl %v6, 320(%r15)
+; CHECK-NEXT:    vpkg %v4, %v4, %v5
+; CHECK-NEXT:    vl %v5, 448(%r15)
+; CHECK-NEXT:    vn %v3, %v3, %v4
+; CHECK-NEXT:    vuphf %v4, %v3
+; CHECK-NEXT:    vsel %v29, %v6, %v5, %v4
+; CHECK-NEXT:    vl %v4, 368(%r15)
+; CHECK-NEXT:    vl %v5, 240(%r15)
+; CHECK-NEXT:    vsel %v26, %v5, %v4, %v0
+; CHECK-NEXT:    vl %v4, 272(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 400(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v4, %v1, %v0
+; CHECK-NEXT:    vl %v1, 432(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v2, %v2
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v27, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 464(%r15)
+; CHECK-NEXT:    vl %v2, 336(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v3, %v3
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v31, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = fcmp ogt <16 x float> %val3, %val4
+  %and = and <16 x i1> %cmp0, %cmp1 ; lane set only when both compares hold
+  %sel = select <16 x i1> %and, <16 x double> %val5, <16 x double> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <16 x double> %sel
+}
+
+define <16 x i32> @fun64(<16 x i16> %val1, <16 x i16> %val2, <16 x double> %val3, <16 x double> %val4, <16 x i32> %val5, <16 x i32> %val6) { ; select <16 x i32> under mask (i16 icmp eq) AND (double fcmp ogt)
+; CHECK-LABEL: fun64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 240(%r15)
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v27, %v0
+; CHECK-NEXT:    vfchdb %v1, %v25, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v24, %v28
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vn %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 416(%r15)
+; CHECK-NEXT:    vl %v3, 352(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v0, 304(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v0, %v2, %v0
+; CHECK-NEXT:    vceqh %v2, %v26, %v30
+; CHECK-NEXT:    vuphh %v3, %v2
+; CHECK-NEXT:    vn %v0, %v3, %v0
+; CHECK-NEXT:    vl %v3, 448(%r15)
+; CHECK-NEXT:    vl %v4, 384(%r15)
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v0
+; CHECK-NEXT:    vl %v0, 272(%r15)
+; CHECK-NEXT:    vl %v3, 256(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v31, %v0
+; CHECK-NEXT:    vfchdb %v3, %v29, %v3
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vpkg %v0, %v3, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vl %v3, 368(%r15)
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 432(%r15)
+; CHECK-NEXT:    vsel %v26, %v3, %v1, %v0
+; CHECK-NEXT:    vl %v0, 336(%r15)
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 320(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v3, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlg %v1, %v2, %v2
+; CHECK-NEXT:    vl %v2, 400(%r15)
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vn %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 464(%r15)
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = fcmp ogt <16 x double> %val3, %val4
+  %and = and <16 x i1> %cmp0, %cmp1 ; lane set only when both compares hold
+  %sel = select <16 x i1> %and, <16 x i32> %val5, <16 x i32> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <16 x i32> %sel
+}
+
+define <2 x i8> @fun65(<2 x i8> %val1, <2 x i8> %val2, <2 x i8> %val3, <2 x i8> %val4, <2 x i8> %val5, <2 x i8> %val6) { ; select <2 x i8> under mask (i8 icmp eq) OR (i8 icmp eq)
+; CHECK-LABEL: fun65:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v28, %v30
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = icmp eq <2 x i8> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i8> %val5, <2 x i8> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i8> %sel
+}
+
+define <2 x i16> @fun66(<2 x i8> %val1, <2 x i8> %val2, <2 x i8> %val3, <2 x i8> %val4, <2 x i16> %val5, <2 x i16> %val6) { ; select <2 x i16> under mask (i8 icmp eq) OR (i8 icmp eq)
+; CHECK-LABEL: fun66:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v28, %v30
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = icmp eq <2 x i8> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x i8> @fun67(<2 x i8> %val1, <2 x i8> %val2, <2 x i16> %val3, <2 x i16> %val4, <2 x i8> %val5, <2 x i8> %val6) { ; select <2 x i8> under mask (i8 icmp eq) OR (i16 icmp eq)
+; CHECK-LABEL: fun67:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v1, %v28, %v30
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vpkh %v1, %v1, %v1
+; CHECK-NEXT:    vo %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = icmp eq <2 x i16> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i8> %val5, <2 x i8> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i8> %sel
+}
+
+define <2 x i32> @fun68(<2 x i8> %val1, <2 x i8> %val2, <2 x i32> %val3, <2 x i32> %val4, <2 x i32> %val5, <2 x i32> %val6) { ; select <2 x i32> under mask (i8 icmp eq) OR (i32 icmp eq)
+; CHECK-LABEL: fun68:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = icmp eq <2 x i32> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i32> %val5, <2 x i32> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i32> %sel
+}
+
+define <2 x i32> @fun69(<2 x i8> %val1, <2 x i8> %val2, <2 x i64> %val3, <2 x i64> %val4, <2 x i32> %val5, <2 x i32> %val6) { ; select <2 x i32> under mask (i8 icmp eq) OR (i64 icmp eq)
+; CHECK-LABEL: fun69:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vceqg %v0, %v28, %v30
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vpkg %v0, %v0, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = icmp eq <2 x i64> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i32> %val5, <2 x i32> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i32> %sel
+}
+
+define <2 x i16> @fun70(<2 x i8> %val1, <2 x i8> %val2, <2 x float> %val3, <2 x float> %val4, <2 x i16> %val5, <2 x i16> %val6) { ; select <2 x i16> under mask (i8 icmp eq) OR (float fcmp ogt)
+; CHECK-LABEL: fun70:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = fcmp ogt <2 x float> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x i64> @fun71(<2 x i8> %val1, <2 x i8> %val2, <2 x double> %val3, <2 x double> %val4, <2 x i64> %val5, <2 x i64> %val6) { ; select <2 x i64> under mask (i8 icmp eq) OR (double fcmp ogt)
+; CHECK-LABEL: fun71:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v28, %v30
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = fcmp ogt <2 x double> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i64> %val5, <2 x i64> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i64> %sel
+}
+
+define <2 x i8> @fun72(<2 x i16> %val1, <2 x i16> %val2, <2 x i16> %val3, <2 x i16> %val4, <2 x i8> %val5, <2 x i8> %val6) { ; select <2 x i8> under mask (i16 icmp eq) OR (i16 icmp eq)
+; CHECK-LABEL: fun72:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v28, %v30
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vpkh %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = icmp eq <2 x i16> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i8> %val5, <2 x i8> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i8> %sel
+}
+
+define <2 x i16> @fun73(<2 x i16> %val1, <2 x i16> %val2, <2 x i16> %val3, <2 x i16> %val4, <2 x i16> %val5, <2 x i16> %val6) { ; select <2 x i16> under mask (i16 icmp eq) OR (i16 icmp eq)
+; CHECK-LABEL: fun73:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v28, %v30
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = icmp eq <2 x i16> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x i32> @fun74(<2 x i16> %val1, <2 x i16> %val2, <2 x i16> %val3, <2 x i16> %val4, <2 x i32> %val5, <2 x i32> %val6) { ; select <2 x i32> under mask (i16 icmp eq) OR (i16 icmp eq)
+; CHECK-LABEL: fun74:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v28, %v30
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = icmp eq <2 x i16> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i32> %val5, <2 x i32> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i32> %sel
+}
+
+define <2 x i8> @fun75(<2 x i16> %val1, <2 x i16> %val2, <2 x i32> %val3, <2 x i32> %val4, <2 x i8> %val5, <2 x i8> %val6) { ; select <2 x i8> under mask (i16 icmp eq) OR (i32 icmp eq)
+; CHECK-LABEL: fun75:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v1, %v28, %v30
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vpkf %v1, %v1, %v1
+; CHECK-NEXT:    vo %v0, %v0, %v1
+; CHECK-NEXT:    vpkh %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = icmp eq <2 x i32> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i8> %val5, <2 x i8> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i8> %sel
+}
+
+define <2 x i8> @fun76(<2 x i16> %val1, <2 x i16> %val2, <2 x i64> %val3, <2 x i64> %val4, <2 x i8> %val5, <2 x i8> %val6) { ; select <2 x i8> under mask (i16 icmp eq) OR (i64 icmp eq)
+; CHECK-LABEL: fun76:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    larl %r1, .LCPI76_0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vceqg %v0, %v28, %v30
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vpkh %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = icmp eq <2 x i64> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i8> %val5, <2 x i8> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i8> %sel
+}
+
+define <2 x double> @fun77(<2 x i16> %val1, <2 x i16> %val2, <2 x float> %val3, <2 x float> %val4, <2 x double> %val5, <2 x double> %val6) { ; select <2 x double> under mask (i16 icmp eq) OR (float fcmp ogt)
+; CHECK-LABEL: fun77:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = fcmp ogt <2 x float> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x double> %val5, <2 x double> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x double> %sel
+}
+
+define <2 x i16> @fun78(<2 x i16> %val1, <2 x i16> %val2, <2 x double> %val3, <2 x double> %val4, <2 x i16> %val5, <2 x i16> %val6) { ; select <2 x i16> under mask (i16 icmp eq) OR (double fcmp ogt)
+; CHECK-LABEL: fun78:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    larl %r1, .LCPI78_0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vfchdb %v0, %v28, %v30
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = fcmp ogt <2 x double> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x i16> @fun79(<2 x i32> %val1, <2 x i32> %val2, <2 x i32> %val3, <2 x i32> %val4, <2 x i16> %val5, <2 x i16> %val6) { ; select <2 x i16> under mask (i32 icmp eq) OR (i32 icmp eq)
+; CHECK-LABEL: fun79:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i32> %val1, %val2
+  %cmp1 = icmp eq <2 x i32> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x i32> @fun80(<2 x i32> %val1, <2 x i32> %val2, <2 x i32> %val3, <2 x i32> %val4, <2 x i32> %val5, <2 x i32> %val6) { ; select <2 x i32> under mask (i32 icmp eq) OR (i32 icmp eq)
+; CHECK-LABEL: fun80:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i32> %val1, %val2
+  %cmp1 = icmp eq <2 x i32> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i32> %val5, <2 x i32> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i32> %sel
+}
+
+define <2 x i64> @fun81(<2 x i32> %val1, <2 x i32> %val2, <2 x i32> %val3, <2 x i32> %val4, <2 x i64> %val5, <2 x i64> %val6) { ; select <2 x i64> under mask (i32 icmp eq) OR (i32 icmp eq)
+; CHECK-LABEL: fun81:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i32> %val1, %val2
+  %cmp1 = icmp eq <2 x i32> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i64> %val5, <2 x i64> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i64> %sel
+}
+
+define <2 x i64> @fun82(<2 x i32> %val1, <2 x i32> %val2, <2 x i64> %val3, <2 x i64> %val4, <2 x i64> %val5, <2 x i64> %val6) { ; select <2 x i64> under mask (i32 icmp eq) OR (i64 icmp eq)
+; CHECK-LABEL: fun82:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vceqg %v0, %v28, %v30
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i32> %val1, %val2
+  %cmp1 = icmp eq <2 x i64> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i64> %val5, <2 x i64> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i64> %sel
+}
+
+define <2 x i16> @fun83(<2 x i32> %val1, <2 x i32> %val2, <2 x float> %val3, <2 x float> %val4, <2 x i16> %val5, <2 x i16> %val6) { ; select <2 x i16> under mask (i32 icmp eq) OR (float fcmp ogt)
+; CHECK-LABEL: fun83:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i32> %val1, %val2
+  %cmp1 = fcmp ogt <2 x float> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x float> @fun84(<2 x i32> %val1, <2 x i32> %val2, <2 x double> %val3, <2 x double> %val4, <2 x float> %val5, <2 x float> %val6) { ; select <2 x float> under mask (i32 icmp eq) OR (double fcmp ogt)
+; CHECK-LABEL: fun84:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v1, %v28, %v30
+; CHECK-NEXT:    vceqf %v0, %v24, %v26
+; CHECK-NEXT:    vpkg %v1, %v1, %v1
+; CHECK-NEXT:    vo %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i32> %val1, %val2
+  %cmp1 = fcmp ogt <2 x double> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x float> %val5, <2 x float> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x float> %sel
+}
+
+define <2 x i16> @fun85(<2 x i64> %val1, <2 x i64> %val2, <2 x i64> %val3, <2 x i64> %val4, <2 x i16> %val5, <2 x i16> %val6) { ; select <2 x i16> under mask (i64 icmp eq) OR (i64 icmp eq)
+; CHECK-LABEL: fun85:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v28, %v30
+; CHECK-NEXT:    vceqg %v1, %v24, %v26
+; CHECK-NEXT:    larl %r1, .LCPI85_0
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i64> %val1, %val2
+  %cmp1 = icmp eq <2 x i64> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x i64> @fun86(<2 x i64> %val1, <2 x i64> %val2, <2 x i64> %val3, <2 x i64> %val4, <2 x i64> %val5, <2 x i64> %val6) { ; select <2 x i64> under mask (i64 icmp eq) OR (i64 icmp eq)
+; CHECK-LABEL: fun86:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v28, %v30
+; CHECK-NEXT:    vceqg %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i64> %val1, %val2
+  %cmp1 = icmp eq <2 x i64> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i64> %val5, <2 x i64> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i64> %sel
+}
+
+define <2 x i64> @fun87(<2 x i64> %val1, <2 x i64> %val2, <2 x float> %val3, <2 x float> %val4, <2 x i64> %val5, <2 x i64> %val6) { ; select <2 x i64> under mask (i64 icmp eq) OR (float fcmp ogt)
+; CHECK-LABEL: fun87:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vceqg %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i64> %val1, %val2
+  %cmp1 = fcmp ogt <2 x float> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i64> %val5, <2 x i64> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i64> %sel
+}
+
+define <2 x i16> @fun88(<2 x i64> %val1, <2 x i64> %val2, <2 x double> %val3, <2 x double> %val4, <2 x i16> %val5, <2 x i16> %val6) { ; select <2 x i16> under mask (i64 icmp eq) OR (double fcmp ogt)
+; CHECK-LABEL: fun88:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v28, %v30
+; CHECK-NEXT:    vceqg %v1, %v24, %v26
+; CHECK-NEXT:    larl %r1, .LCPI88_0
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i64> %val1, %val2
+  %cmp1 = fcmp ogt <2 x double> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x float> @fun89(<2 x float> %val1, <2 x float> %val2, <2 x float> %val3, <2 x float> %val4, <2 x float> %val5, <2 x float> %val6) { ; select <2 x float> under mask (float fcmp ogt) OR (float fcmp ogt)
+; CHECK-LABEL: fun89:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v24, %v24
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vmrlf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <2 x float> %val1, %val2
+  %cmp1 = fcmp ogt <2 x float> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; NOTE: named %and, but this is an OR - lane set when either compare holds
+  %sel = select <2 x i1> %and, <2 x float> %val5, <2 x float> %val6 ; per lane: %val5 if mask bit set, else %val6
+  ret <2 x float> %sel
+}
+
+define <2 x i32> @fun90(<2 x float> %val1, <2 x float> %val2, <2 x double> %val3, <2 x double> %val4, <2 x i32> %val5, <2 x i32> %val6) { ; (fcmp ogt on <2 x float>) OR (fcmp ogt on <2 x double>) is the mask of a <2 x i32> select: 32- and 64-bit compare lanes, 32-bit select lanes.
+; CHECK-LABEL: fun90:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v26, %v26
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v26, %v26
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vfchdb %v1, %v28, %v30
+; CHECK-NEXT:    vpkg %v1, %v1, %v1
+; CHECK-NEXT:    vo %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <2 x float> %val1, %val2
+  %cmp1 = fcmp ogt <2 x double> %val3, %val4
+  %and = or <2 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <2 x i1> %and, <2 x i32> %val5, <2 x i32> %val6
+  ret <2 x i32> %sel
+}
+
+define <4 x i16> @fun91(<4 x i32> %val1, <4 x i32> %val2, <4 x i32> %val3, <4 x i32> %val4, <4 x i16> %val5, <4 x i16> %val6) { ; OR of two icmp eq compares on <4 x i32> is the mask of a <4 x i16> select: 32-bit compare lanes, 16-bit select lanes.
+; CHECK-LABEL: fun91:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i32> %val1, %val2
+  %cmp1 = icmp eq <4 x i32> %val3, %val4
+  %and = or <4 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <4 x i1> %and, <4 x i16> %val5, <4 x i16> %val6
+  ret <4 x i16> %sel
+}
+
+define <4 x i32> @fun92(<4 x i32> %val1, <4 x i32> %val2, <4 x i32> %val3, <4 x i32> %val4, <4 x i32> %val5, <4 x i32> %val6) { ; OR of two icmp eq compares on <4 x i32> is the mask of a <4 x i32> select: compare and select lanes are both 32-bit.
+; CHECK-LABEL: fun92:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i32> %val1, %val2
+  %cmp1 = icmp eq <4 x i32> %val3, %val4
+  %and = or <4 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <4 x i1> %and, <4 x i32> %val5, <4 x i32> %val6
+  ret <4 x i32> %sel
+}
+
+define <4 x i64> @fun93(<4 x i32> %val1, <4 x i32> %val2, <4 x i32> %val3, <4 x i32> %val4, <4 x i64> %val5, <4 x i64> %val6) { ; OR of two icmp eq compares on <4 x i32> is the mask of a <4 x i64> select: 32-bit compare lanes, 64-bit select lanes.
+; CHECK-LABEL: fun93:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v1
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i32> %val1, %val2
+  %cmp1 = icmp eq <4 x i32> %val3, %val4
+  %and = or <4 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <4 x i1> %and, <4 x i64> %val5, <4 x i64> %val6
+  ret <4 x i64> %sel
+}
+
+define <4 x i32> @fun94(<4 x i32> %val1, <4 x i32> %val2, <4 x i64> %val3, <4 x i64> %val4, <4 x i32> %val5, <4 x i32> %val6) { ; (icmp eq on <4 x i32>) OR (icmp eq on <4 x i64>) is the mask of a <4 x i32> select: 32- and 64-bit compare lanes, 32-bit select lanes.
+; CHECK-LABEL: fun94:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v30, %v27
+; CHECK-NEXT:    vceqg %v1, %v28, %v25
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v29, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i32> %val1, %val2
+  %cmp1 = icmp eq <4 x i64> %val3, %val4
+  %and = or <4 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <4 x i1> %and, <4 x i32> %val5, <4 x i32> %val6
+  ret <4 x i32> %sel
+}
+
+define <4 x i16> @fun95(<4 x i32> %val1, <4 x i32> %val2, <4 x float> %val3, <4 x float> %val4, <4 x i16> %val5, <4 x i16> %val6) { ; (icmp eq on <4 x i32>) OR (fcmp ogt on <4 x float>) is the mask of a <4 x i16> select: 32-bit compare lanes, 16-bit select lanes.
+; CHECK-LABEL: fun95:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i32> %val1, %val2
+  %cmp1 = fcmp ogt <4 x float> %val3, %val4
+  %and = or <4 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <4 x i1> %and, <4 x i16> %val5, <4 x i16> %val6
+  ret <4 x i16> %sel
+}
+
+define <4 x i8> @fun96(<4 x i32> %val1, <4 x i32> %val2, <4 x double> %val3, <4 x double> %val4, <4 x i8> %val5, <4 x i8> %val6) { ; (icmp eq on <4 x i32>) OR (fcmp ogt on <4 x double>) is the mask of a <4 x i8> select: 32- and 64-bit compare lanes, 8-bit select lanes.
+; CHECK-LABEL: fun96:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v30, %v27
+; CHECK-NEXT:    vfchdb %v1, %v28, %v25
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    larl %r1, .LCPI96_0
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v29, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i32> %val1, %val2
+  %cmp1 = fcmp ogt <4 x double> %val3, %val4
+  %and = or <4 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <4 x i1> %and, <4 x i8> %val5, <4 x i8> %val6
+  ret <4 x i8> %sel
+}
+
+define <4 x i32> @fun97(<4 x i64> %val1, <4 x i64> %val2, <4 x i64> %val3, <4 x i64> %val4, <4 x i32> %val5, <4 x i32> %val6) { ; OR of two icmp eq compares on <4 x i64> is the mask of a <4 x i32> select: 64-bit compare lanes, 32-bit select lanes.
+; CHECK-LABEL: fun97:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v27, %v31
+; CHECK-NEXT:    vceqg %v1, %v26, %v30
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vceqg %v1, %v25, %v29
+; CHECK-NEXT:    vceqg %v2, %v24, %v28
+; CHECK-NEXT:    vo %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i64> %val1, %val2
+  %cmp1 = icmp eq <4 x i64> %val3, %val4
+  %and = or <4 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <4 x i1> %and, <4 x i32> %val5, <4 x i32> %val6
+  ret <4 x i32> %sel
+}
+
+define <4 x i64> @fun98(<4 x i64> %val1, <4 x i64> %val2, <4 x i64> %val3, <4 x i64> %val4, <4 x i64> %val5, <4 x i64> %val6) { ; OR of two icmp eq compares on <4 x i64> is the mask of a <4 x i64> select: compare and select lanes are both 64-bit.
+; CHECK-LABEL: fun98:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v25, %v29
+; CHECK-NEXT:    vceqg %v1, %v24, %v28
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vceqg %v0, %v27, %v31
+; CHECK-NEXT:    vceqg %v1, %v26, %v30
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i64> %val1, %val2
+  %cmp1 = icmp eq <4 x i64> %val3, %val4
+  %and = or <4 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <4 x i1> %and, <4 x i64> %val5, <4 x i64> %val6
+  ret <4 x i64> %sel
+}
+
+define <4 x i64> @fun99(<4 x i64> %val1, <4 x i64> %val2, <4 x float> %val3, <4 x float> %val4, <4 x i64> %val5, <4 x i64> %val6) { ; (icmp eq on <4 x i64>) OR (fcmp ogt on <4 x float>) is the mask of a <4 x i64> select: 64- and 32-bit compare lanes, 64-bit select lanes.
+; CHECK-LABEL: fun99:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v27, %v27
+; CHECK-NEXT:    vmrlf %v1, %v25, %v25
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v27, %v27
+; CHECK-NEXT:    vmrhf %v2, %v25, %v25
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vceqg %v2, %v24, %v28
+; CHECK-NEXT:    vo %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v29, %v2, %v1
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vceqg %v1, %v26, %v30
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v31, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i64> %val1, %val2
+  %cmp1 = fcmp ogt <4 x float> %val3, %val4
+  %and = or <4 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <4 x i1> %and, <4 x i64> %val5, <4 x i64> %val6
+  ret <4 x i64> %sel
+}
+
+define <4 x float> @fun100(<4 x i64> %val1, <4 x i64> %val2, <4 x double> %val3, <4 x double> %val4, <4 x float> %val5, <4 x float> %val6) { ; (icmp eq on <4 x i64>) OR (fcmp ogt on <4 x double>) is the mask of a <4 x float> select: 64-bit compare lanes, 32-bit select lanes.
+; CHECK-LABEL: fun100:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v27, %v31
+; CHECK-NEXT:    vceqg %v1, %v26, %v30
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vfchdb %v1, %v25, %v29
+; CHECK-NEXT:    vceqg %v2, %v24, %v28
+; CHECK-NEXT:    vo %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i64> %val1, %val2
+  %cmp1 = fcmp ogt <4 x double> %val3, %val4
+  %and = or <4 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <4 x i1> %and, <4 x float> %val5, <4 x float> %val6
+  ret <4 x float> %sel
+}
+
+define <4 x i16> @fun101(<4 x float> %val1, <4 x float> %val2, <4 x float> %val3, <4 x float> %val4, <4 x i16> %val5, <4 x i16> %val6) { ; OR of two fcmp ogt compares on <4 x float> is the mask of a <4 x i16> select: 32-bit compare lanes, 16-bit select lanes.
+; CHECK-LABEL: fun101:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v24, %v24
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vmrlf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <4 x float> %val1, %val2
+  %cmp1 = fcmp ogt <4 x float> %val3, %val4
+  %and = or <4 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <4 x i1> %and, <4 x i16> %val5, <4 x i16> %val6
+  ret <4 x i16> %sel
+}
+
+define <4 x float> @fun102(<4 x float> %val1, <4 x float> %val2, <4 x float> %val3, <4 x float> %val4, <4 x float> %val5, <4 x float> %val6) { ; OR of two fcmp ogt compares on <4 x float> is the mask of a <4 x float> select: compare and select lanes are both 32-bit.
+; CHECK-LABEL: fun102:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v24, %v24
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vmrlf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <4 x float> %val1, %val2
+  %cmp1 = fcmp ogt <4 x float> %val3, %val4
+  %and = or <4 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <4 x i1> %and, <4 x float> %val5, <4 x float> %val6
+  ret <4 x float> %sel
+}
+
+define <4 x double> @fun103(<4 x float> %val1, <4 x float> %val2, <4 x float> %val3, <4 x float> %val4, <4 x double> %val5, <4 x double> %val6) { ; OR of two fcmp ogt compares on <4 x float> is the mask of a <4 x double> select: 32-bit compare lanes, 64-bit select lanes.
+; CHECK-LABEL: fun103:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v24, %v24
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vmrlf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v1
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <4 x float> %val1, %val2
+  %cmp1 = fcmp ogt <4 x float> %val3, %val4
+  %and = or <4 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <4 x i1> %and, <4 x double> %val5, <4 x double> %val6
+  ret <4 x double> %sel
+}
+
+define <4 x i8> @fun104(<4 x float> %val1, <4 x float> %val2, <4 x double> %val3, <4 x double> %val4, <4 x i8> %val5, <4 x i8> %val6) { ; (fcmp ogt on <4 x float>) OR (fcmp ogt on <4 x double>) is the mask of a <4 x i8> select: 32- and 64-bit compare lanes, 8-bit select lanes.
+; CHECK-LABEL: fun104:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v30, %v27
+; CHECK-NEXT:    vfchdb %v1, %v28, %v25
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vmrlf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vmrhf %v3, %v24, %v24
+; CHECK-NEXT:    larl %r1, .LCPI104_0
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v29, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <4 x float> %val1, %val2
+  %cmp1 = fcmp ogt <4 x double> %val3, %val4
+  %and = or <4 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <4 x i1> %and, <4 x i8> %val5, <4 x i8> %val6
+  ret <4 x i8> %sel
+}
+
+define <8 x i8> @fun105(<8 x i16> %val1, <8 x i16> %val2, <8 x i16> %val3, <8 x i16> %val4, <8 x i8> %val5, <8 x i8> %val6) { ; OR of two icmp eq compares on <8 x i16> is the mask of an <8 x i8> select: 16-bit compare lanes, 8-bit select lanes.
+; CHECK-LABEL: fun105:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v28, %v30
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vpkh %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = icmp eq <8 x i16> %val3, %val4
+  %and = or <8 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <8 x i1> %and, <8 x i8> %val5, <8 x i8> %val6
+  ret <8 x i8> %sel
+}
+
+define <8 x i16> @fun106(<8 x i16> %val1, <8 x i16> %val2, <8 x i16> %val3, <8 x i16> %val4, <8 x i16> %val5, <8 x i16> %val6) { ; OR of two icmp eq compares on <8 x i16> is the mask of an <8 x i16> select: compare and select lanes are both 16-bit.
+; CHECK-LABEL: fun106:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v28, %v30
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = icmp eq <8 x i16> %val3, %val4
+  %and = or <8 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <8 x i1> %and, <8 x i16> %val5, <8 x i16> %val6
+  ret <8 x i16> %sel
+}
+
+define <8 x i32> @fun107(<8 x i16> %val1, <8 x i16> %val2, <8 x i16> %val3, <8 x i16> %val4, <8 x i32> %val5, <8 x i32> %val6) { ; OR of two icmp eq compares on <8 x i16> is the mask of an <8 x i32> select: 16-bit compare lanes, 32-bit select lanes.
+; CHECK-LABEL: fun107:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v28, %v30
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vuphh %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v1
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = icmp eq <8 x i16> %val3, %val4
+  %and = or <8 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <8 x i1> %and, <8 x i32> %val5, <8 x i32> %val6
+  ret <8 x i32> %sel
+}
+
+define <8 x i64> @fun108(<8 x i16> %val1, <8 x i16> %val2, <8 x i32> %val3, <8 x i32> %val4, <8 x i64> %val5, <8 x i64> %val6) { ; (icmp eq on <8 x i16>) OR (icmp eq on <8 x i32>) is the mask of an <8 x i64> select: 16- and 32-bit compare lanes, 64-bit select lanes.
+; CHECK-LABEL: fun108:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vceqf %v0, %v28, %v25
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vo %v0, %v2, %v0
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vuphf %v2, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v29, %v3, %v2
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vsel %v26, %v31, %v2, %v0
+; CHECK-NEXT:    vceqf %v0, %v30, %v27
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = icmp eq <8 x i32> %val3, %val4
+  %and = or <8 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <8 x i1> %and, <8 x i64> %val5, <8 x i64> %val6
+  ret <8 x i64> %sel
+}
+
+define <8 x i8> @fun109(<8 x i16> %val1, <8 x i16> %val2, <8 x i64> %val3, <8 x i64> %val4, <8 x i8> %val5, <8 x i8> %val6) { ; (icmp eq on <8 x i16>) OR (icmp eq on <8 x i64>) is the mask of an <8 x i8> select: 16- and 64-bit compare lanes, 8-bit select lanes.
+; CHECK-LABEL: fun109:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vl %v1, 160(%r15)
+; CHECK-NEXT:    vceqg %v0, %v27, %v0
+; CHECK-NEXT:    vceqg %v1, %v25, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqg %v1, %v30, %v31
+; CHECK-NEXT:    vceqg %v2, %v28, %v29
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vlrepg %v1, 200(%r15)
+; CHECK-NEXT:    vlrepg %v2, 192(%r15)
+; CHECK-NEXT:    vpkh %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = icmp eq <8 x i64> %val3, %val4
+  %and = or <8 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <8 x i1> %and, <8 x i8> %val5, <8 x i8> %val6
+  ret <8 x i8> %sel
+}
+
+define <8 x i16> @fun110(<8 x i16> %val1, <8 x i16> %val2, <8 x float> %val3, <8 x float> %val4, <8 x i16> %val5, <8 x i16> %val6) { ; (icmp eq on <8 x i16>) OR (fcmp ogt on <8 x float>) is the mask of an <8 x i16> select: 16- and 32-bit compare lanes, 16-bit select lanes.
+; CHECK-LABEL: fun110:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v27, %v27
+; CHECK-NEXT:    vmrlf %v1, %v30, %v30
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v27, %v27
+; CHECK-NEXT:    vmrhf %v2, %v30, %v30
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v28, %v28
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v25, %v25
+; CHECK-NEXT:    vmrlf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v25, %v25
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v29, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = fcmp ogt <8 x float> %val3, %val4
+  %and = or <8 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <8 x i1> %and, <8 x i16> %val5, <8 x i16> %val6
+  ret <8 x i16> %sel
+}
+
+define <8 x i32> @fun111(<8 x i16> %val1, <8 x i16> %val2, <8 x double> %val3, <8 x double> %val4, <8 x i32> %val5, <8 x i32> %val6) { ; (icmp eq on <8 x i16>) OR (fcmp ogt on <8 x double>) is the mask of an <8 x i32> select: 16- and 64-bit compare lanes, 32-bit select lanes.
+; CHECK-LABEL: fun111:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v30, %v31
+; CHECK-NEXT:    vfchdb %v1, %v28, %v29
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vo %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v27, %v0
+; CHECK-NEXT:    vfchdb %v2, %v25, %v2
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vpkg %v0, %v2, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = fcmp ogt <8 x double> %val3, %val4
+  %and = or <8 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <8 x i1> %and, <8 x i32> %val5, <8 x i32> %val6
+  ret <8 x i32> %sel
+}
+
+define <8 x i32> @fun112(<8 x i32> %val1, <8 x i32> %val2, <8 x i64> %val3, <8 x i64> %val4, <8 x i32> %val5, <8 x i32> %val6) { ; (icmp eq on <8 x i32>) OR (icmp eq on <8 x i64>) is the mask of an <8 x i32> select: 32- and 64-bit compare lanes, 32-bit select lanes.
+; CHECK-LABEL: fun112:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vl %v1, 160(%r15)
+; CHECK-NEXT:    vceqg %v0, %v27, %v0
+; CHECK-NEXT:    vceqg %v1, %v25, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v24, %v28
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 208(%r15)
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vceqg %v0, %v31, %v0
+; CHECK-NEXT:    vceqg %v1, %v29, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v26, %v30
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vl %v2, 240(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i32> %val1, %val2
+  %cmp1 = icmp eq <8 x i64> %val3, %val4
+  %and = or <8 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <8 x i1> %and, <8 x i32> %val5, <8 x i32> %val6
+  ret <8 x i32> %sel
+}
+
+define <8 x double> @fun113(<8 x i32> %val1, <8 x i32> %val2, <8 x float> %val3, <8 x float> %val4, <8 x double> %val5, <8 x double> %val6) { ; (icmp eq on <8 x i32>) OR (fcmp ogt on <8 x float>) is the mask of an <8 x double> select: 32-bit compare lanes, 64-bit select lanes.
+; CHECK-LABEL: fun113:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v29, %v29
+; CHECK-NEXT:    vmrlf %v1, %v25, %v25
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v29, %v29
+; CHECK-NEXT:    vmrhf %v2, %v25, %v25
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vl %v4, 192(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v24, %v28
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v1
+; CHECK-NEXT:    vmrlf %v1, %v31, %v31
+; CHECK-NEXT:    vmrlf %v2, %v27, %v27
+; CHECK-NEXT:    vmrhf %v3, %v27, %v27
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v31, %v31
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vl %v3, 256(%r15)
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vceqf %v2, %v26, %v30
+; CHECK-NEXT:    vo %v1, %v2, %v1
+; CHECK-NEXT:    vuphf %v2, %v1
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 240(%r15)
+; CHECK-NEXT:    vl %v3, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i32> %val1, %val2
+  %cmp1 = fcmp ogt <8 x float> %val3, %val4
+  %and = or <8 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <8 x i1> %and, <8 x double> %val5, <8 x double> %val6
+  ret <8 x double> %sel
+}
+
+define <8 x double> @fun114(<8 x i32> %val1, <8 x i32> %val2, <8 x double> %val3, <8 x double> %val4, <8 x double> %val5, <8 x double> %val6) { ; (icmp eq on <8 x i32>) OR (fcmp ogt on <8 x double>) is the mask of an <8 x double> select: 32- and 64-bit compare lanes, 64-bit select lanes.
+; CHECK-LABEL: fun114:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 160(%r15)
+; CHECK-NEXT:    vceqf %v1, %v24, %v28
+; CHECK-NEXT:    vfchdb %v0, %v25, %v0
+; CHECK-NEXT:    vuphf %v2, %v1
+; CHECK-NEXT:    vo %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vl %v3, 224(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v0, 192(%r15)
+; CHECK-NEXT:    vceqf %v2, %v26, %v30
+; CHECK-NEXT:    vfchdb %v0, %v29, %v0
+; CHECK-NEXT:    vuphf %v3, %v2
+; CHECK-NEXT:    vo %v0, %v3, %v0
+; CHECK-NEXT:    vl %v3, 320(%r15)
+; CHECK-NEXT:    vl %v4, 256(%r15)
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v0
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v27, %v0
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 304(%r15)
+; CHECK-NEXT:    vl %v3, 240(%r15)
+; CHECK-NEXT:    vsel %v26, %v3, %v1, %v0
+; CHECK-NEXT:    vl %v0, 208(%r15)
+; CHECK-NEXT:    vmrlg %v1, %v2, %v2
+; CHECK-NEXT:    vfchdb %v0, %v31, %v0
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vl %v2, 272(%r15)
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 336(%r15)
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i32> %val1, %val2
+  %cmp1 = fcmp ogt <8 x double> %val3, %val4
+  %and = or <8 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <8 x i1> %and, <8 x double> %val5, <8 x double> %val6
+  ret <8 x double> %sel
+}
+
+define <8 x i64> @fun115(<8 x float> %val1, <8 x float> %val2, <8 x double> %val3, <8 x double> %val4, <8 x i64> %val5, <8 x i64> %val6) { ; (fcmp ogt on <8 x float>) OR (fcmp ogt on <8 x double>) is the mask of an <8 x i64> select: 32- and 64-bit compare lanes, 64-bit select lanes.
+; CHECK-LABEL: fun115:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v28, %v28
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v28, %v28
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vl %v3, 224(%r15)
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vl %v4, 256(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vfchdb %v2, %v25, %v2
+; CHECK-NEXT:    vo %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v1
+; CHECK-NEXT:    vmrlf %v1, %v30, %v30
+; CHECK-NEXT:    vmrlf %v2, %v26, %v26
+; CHECK-NEXT:    vmrhf %v3, %v26, %v26
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v30, %v30
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vuphf %v2, %v1
+; CHECK-NEXT:    vfchdb %v3, %v29, %v3
+; CHECK-NEXT:    vo %v2, %v2, %v3
+; CHECK-NEXT:    vl %v3, 320(%r15)
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vl %v3, 240(%r15)
+; CHECK-NEXT:    vfchdb %v2, %v27, %v2
+; CHECK-NEXT:    vo %v0, %v0, %v2
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vsel %v26, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v2, 272(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vfchdb %v1, %v31, %v1
+; CHECK-NEXT:    vo %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 336(%r15)
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <8 x float> %val1, %val2
+  %cmp1 = fcmp ogt <8 x double> %val3, %val4
+  %and = or <8 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <8 x i1> %and, <8 x i64> %val5, <8 x i64> %val6
+  ret <8 x i64> %sel
+}
+
+define <16 x i8> @fun116(<16 x i8> %val1, <16 x i8> %val2, <16 x i8> %val3, <16 x i8> %val4, <16 x i8> %val5, <16 x i8> %val6) { ; OR of two icmp eq compares on <16 x i8> is the mask of a <16 x i8> select: compare and select lanes are both 8-bit.
+; CHECK-LABEL: fun116:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v28, %v30
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = icmp eq <16 x i8> %val3, %val4
+  %and = or <16 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <16 x i1> %and, <16 x i8> %val5, <16 x i8> %val6
+  ret <16 x i8> %sel
+}
+
+define <16 x i16> @fun117(<16 x i8> %val1, <16 x i8> %val2, <16 x i8> %val3, <16 x i8> %val4, <16 x i16> %val5, <16 x i16> %val6) { ; OR of two icmp eq compares on <16 x i8> is the mask of a <16 x i16> select: 8-bit compare lanes, 16-bit select lanes.
+; CHECK-LABEL: fun117:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v28, %v30
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vuphb %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v1
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = icmp eq <16 x i8> %val3, %val4
+  %and = or <16 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <16 x i1> %and, <16 x i16> %val5, <16 x i16> %val6
+  ret <16 x i16> %sel
+}
+
+define <16 x i64> @fun118(<16 x i8> %val1, <16 x i8> %val2, <16 x i16> %val3, <16 x i16> %val4, <16 x i64> %val5, <16 x i64> %val6) { ; (icmp eq on <16 x i8>) OR (icmp eq on <16 x i16>) is the mask of a <16 x i64> select: 8- and 16-bit compare lanes, 64-bit select lanes.
+; CHECK-LABEL: fun118:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vceqh %v0, %v28, %v25
+; CHECK-NEXT:    vuphb %v2, %v1
+; CHECK-NEXT:    vo %v0, %v2, %v0
+; CHECK-NEXT:    vuphh %v2, %v0
+; CHECK-NEXT:    vl %v3, 256(%r15)
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vsel %v24, %v29, %v3, %v2
+; CHECK-NEXT:    vpkg %v2, %v0, %v0
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vl %v3, 272(%r15)
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vsel %v26, %v31, %v3, %v2
+; CHECK-NEXT:    vmrlg %v2, %v0, %v0
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vsldb %v0, %v0, %v0, 12
+; CHECK-NEXT:    vl %v3, 288(%r15)
+; CHECK-NEXT:    vl %v4, 160(%r15)
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vl %v3, 176(%r15)
+; CHECK-NEXT:    vl %v4, 192(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vsel %v0, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v3, 320(%r15)
+; CHECK-NEXT:    vceqh %v2, %v30, %v27
+; CHECK-NEXT:    vlr %v30, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vo %v1, %v1, %v2
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vsel %v25, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v3, 336(%r15)
+; CHECK-NEXT:    vl %v4, 208(%r15)
+; CHECK-NEXT:    vpkg %v2, %v1, %v1
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vsel %v27, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v3, 352(%r15)
+; CHECK-NEXT:    vl %v4, 224(%r15)
+; CHECK-NEXT:    vmrlg %v2, %v1, %v1
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vsldb %v1, %v1, %v1, 12
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vsel %v29, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 368(%r15)
+; CHECK-NEXT:    vl %v3, 240(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v31, %v3, %v2, %v1
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = icmp eq <16 x i16> %val3, %val4
+  %and = or <16 x i1> %cmp0, %cmp1 ; named %and, but the two compare masks are combined with 'or'
+  %sel = select <16 x i1> %and, <16 x i64> %val5, <16 x i64> %val6
+  ret <16 x i64> %sel
+}
+
+define <16 x i64> @fun119(<16 x i8> %val1, <16 x i8> %val2, <16 x i32> %val3, <16 x i32> %val4, <16 x i64> %val5, <16 x i64> %val6) {
+; CHECK-LABEL: fun119:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vuphb %v2, %v1
+; CHECK-NEXT:    vceqf %v0, %v28, %v29
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vo %v0, %v2, %v0
+; CHECK-NEXT:    vl %v3, 320(%r15)
+; CHECK-NEXT:    vl %v4, 192(%r15)
+; CHECK-NEXT:    vuphf %v2, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 336(%r15)
+; CHECK-NEXT:    vl %v3, 208(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v26, %v3, %v2, %v0
+; CHECK-NEXT:    vpkg %v2, %v1, %v1
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vceqf %v0, %v30, %v31
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vo %v0, %v2, %v0
+; CHECK-NEXT:    vl %v3, 352(%r15)
+; CHECK-NEXT:    vl %v4, 224(%r15)
+; CHECK-NEXT:    vuphf %v2, %v0
+; CHECK-NEXT:    vl %v5, 256(%r15)
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vl %v4, 384(%r15)
+; CHECK-NEXT:    vmrlg %v3, %v1, %v1
+; CHECK-NEXT:    vuphb %v3, %v3
+; CHECK-NEXT:    vceqf %v2, %v25, %v2
+; CHECK-NEXT:    vuphh %v3, %v3
+; CHECK-NEXT:    vo %v2, %v3, %v2
+; CHECK-NEXT:    vuphf %v3, %v2
+; CHECK-NEXT:    vsldb %v1, %v1, %v1, 12
+; CHECK-NEXT:    vsel %v25, %v5, %v4, %v3
+; CHECK-NEXT:    vl %v3, 176(%r15)
+; CHECK-NEXT:    vl %v4, 416(%r15)
+; CHECK-NEXT:    vl %v5, 288(%r15)
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vceqf %v3, %v27, %v3
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vo %v1, %v1, %v3
+; CHECK-NEXT:    vuphf %v3, %v1
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v29, %v5, %v4, %v3
+; CHECK-NEXT:    vl %v3, 368(%r15)
+; CHECK-NEXT:    vl %v4, 240(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v4, %v3, %v0
+; CHECK-NEXT:    vl %v3, 272(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v2, %v2
+; CHECK-NEXT:    vl %v2, 400(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v27, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 432(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v31, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = icmp eq <16 x i32> %val3, %val4
+  %and = or <16 x i1> %cmp0, %cmp1
+  %sel = select <16 x i1> %and, <16 x i64> %val5, <16 x i64> %val6
+  ret <16 x i64> %sel
+}
+
+define <16 x i64> @fun120(<16 x i8> %val1, <16 x i8> %val2, <16 x i64> %val3, <16 x i64> %val4, <16 x i64> %val5, <16 x i64> %val6) {
+; CHECK-LABEL: fun120:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 192(%r15)
+; CHECK-NEXT:    vceqg %v1, %v28, %v0
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v2, %v0
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vo %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 448(%r15)
+; CHECK-NEXT:    vl %v3, 320(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v1
+; CHECK-NEXT:    vpkf %v2, %v0, %v0
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vceqg %v1, %v30, %v1
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vo %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 464(%r15)
+; CHECK-NEXT:    vl %v3, 336(%r15)
+; CHECK-NEXT:    vsel %v26, %v3, %v2, %v1
+; CHECK-NEXT:    vpkg %v2, %v0, %v0
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vl %v3, 352(%r15)
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vceqg %v1, %v25, %v1
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vo %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 480(%r15)
+; CHECK-NEXT:    vsel %v28, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v3, 368(%r15)
+; CHECK-NEXT:    vsldb %v2, %v0, %v0, 6
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vceqg %v1, %v27, %v1
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vo %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 496(%r15)
+; CHECK-NEXT:    vsel %v30, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vl %v3, 384(%r15)
+; CHECK-NEXT:    vmrlg %v2, %v0, %v0
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vceqg %v1, %v29, %v1
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vo %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 512(%r15)
+; CHECK-NEXT:    vsel %v25, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vl %v3, 400(%r15)
+; CHECK-NEXT:    vsldb %v2, %v0, %v0, 10
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vceqg %v1, %v31, %v1
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vo %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 528(%r15)
+; CHECK-NEXT:    vsel %v27, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 288(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vl %v3, 416(%r15)
+; CHECK-NEXT:    vceqg %v1, %v2, %v1
+; CHECK-NEXT:    vsldb %v2, %v0, %v0, 12
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vsldb %v0, %v0, %v0, 14
+; CHECK-NEXT:    vo %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 544(%r15)
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vsel %v29, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 304(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vceqg %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 432(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vo %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 560(%r15)
+; CHECK-NEXT:    vsel %v31, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = icmp eq <16 x i64> %val3, %val4
+  %and = or <16 x i1> %cmp0, %cmp1
+  %sel = select <16 x i1> %and, <16 x i64> %val5, <16 x i64> %val6
+  ret <16 x i64> %sel
+}
+
+define <16 x i16> @fun121(<16 x i8> %val1, <16 x i8> %val2, <16 x float> %val3, <16 x float> %val4, <16 x i16> %val5, <16 x i16> %val6) {
+; CHECK-LABEL: fun121:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v31, %v31
+; CHECK-NEXT:    vmrlf %v1, %v30, %v30
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v31, %v31
+; CHECK-NEXT:    vmrhf %v2, %v30, %v30
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v28, %v28
+; CHECK-NEXT:    vmrlf %v4, %v25, %v25
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v29, %v29
+; CHECK-NEXT:    vmrlf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v29, %v29
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vuphb %v2, %v1
+; CHECK-NEXT:    vo %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vmrlf %v2, %v0, %v0
+; CHECK-NEXT:    vmrlf %v3, %v27, %v27
+; CHECK-NEXT:    vmrhf %v0, %v0, %v0
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vmrhf %v3, %v27, %v27
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v0, %v3, %v0
+; CHECK-NEXT:    vpkg %v0, %v0, %v2
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vmrlf %v3, %v2, %v2
+; CHECK-NEXT:    vmrhf %v2, %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v3, %v4, %v3
+; CHECK-NEXT:    vmrhf %v4, %v25, %v25
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v2, %v4, %v2
+; CHECK-NEXT:    vpkg %v2, %v2, %v3
+; CHECK-NEXT:    vpkf %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = fcmp ogt <16 x float> %val3, %val4
+  %and = or <16 x i1> %cmp0, %cmp1
+  %sel = select <16 x i1> %and, <16 x i16> %val5, <16 x i16> %val6
+  ret <16 x i16> %sel
+}
+
+define <16 x i8> @fun122(<16 x i8> %val1, <16 x i8> %val2, <16 x double> %val3, <16 x double> %val4, <16 x i8> %val5, <16 x i8> %val6) {
+; CHECK-LABEL: fun122:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 304(%r15)
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 288(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vl %v2, 256(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v31, %v1
+; CHECK-NEXT:    vfchdb %v2, %v29, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v27, %v1
+; CHECK-NEXT:    vfchdb %v2, %v25, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vfchdb %v2, %v30, %v2
+; CHECK-NEXT:    vfchdb %v3, %v28, %v3
+; CHECK-NEXT:    vpkg %v2, %v3, %v2
+; CHECK-NEXT:    vpkf %v1, %v2, %v1
+; CHECK-NEXT:    vpkh %v0, %v1, %v0
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 336(%r15)
+; CHECK-NEXT:    vl %v2, 320(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = fcmp ogt <16 x double> %val3, %val4
+  %and = or <16 x i1> %cmp0, %cmp1
+  %sel = select <16 x i1> %and, <16 x i8> %val5, <16 x i8> %val6
+  ret <16 x i8> %sel
+}
+
+define <16 x i8> @fun123(<16 x i16> %val1, <16 x i16> %val2, <16 x i16> %val3, <16 x i16> %val4, <16 x i8> %val5, <16 x i8> %val6) {
+; CHECK-LABEL: fun123:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v27, %v31
+; CHECK-NEXT:    vceqh %v1, %v26, %v30
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v25, %v29
+; CHECK-NEXT:    vceqh %v2, %v24, %v28
+; CHECK-NEXT:    vo %v1, %v2, %v1
+; CHECK-NEXT:    vpkh %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = icmp eq <16 x i16> %val3, %val4
+  %and = or <16 x i1> %cmp0, %cmp1
+  %sel = select <16 x i1> %and, <16 x i8> %val5, <16 x i8> %val6
+  ret <16 x i8> %sel
+}
+
+define <16 x i16> @fun124(<16 x i16> %val1, <16 x i16> %val2, <16 x i16> %val3, <16 x i16> %val4, <16 x i16> %val5, <16 x i16> %val6) {
+; CHECK-LABEL: fun124:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v25, %v29
+; CHECK-NEXT:    vceqh %v1, %v24, %v28
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vceqh %v0, %v27, %v31
+; CHECK-NEXT:    vceqh %v1, %v26, %v30
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = icmp eq <16 x i16> %val3, %val4
+  %and = or <16 x i1> %cmp0, %cmp1
+  %sel = select <16 x i1> %and, <16 x i16> %val5, <16 x i16> %val6
+  ret <16 x i16> %sel
+}
+
+define <16 x i32> @fun125(<16 x i16> %val1, <16 x i16> %val2, <16 x i16> %val3, <16 x i16> %val4, <16 x i32> %val5, <16 x i32> %val6) {
+; CHECK-LABEL: fun125:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v25, %v29
+; CHECK-NEXT:    vceqh %v1, %v24, %v28
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vuphh %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v1
+; CHECK-NEXT:    vceqh %v1, %v27, %v31
+; CHECK-NEXT:    vceqh %v2, %v26, %v30
+; CHECK-NEXT:    vo %v1, %v2, %v1
+; CHECK-NEXT:    vl %v3, 256(%r15)
+; CHECK-NEXT:    vl %v4, 192(%r15)
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 240(%r15)
+; CHECK-NEXT:    vl %v3, 176(%r15)
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v26, %v3, %v2, %v0
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = icmp eq <16 x i16> %val3, %val4
+  %and = or <16 x i1> %cmp0, %cmp1
+  %sel = select <16 x i1> %and, <16 x i32> %val5, <16 x i32> %val6
+  ret <16 x i32> %sel
+}
+
+define <16 x i8> @fun126(<16 x i16> %val1, <16 x i16> %val2, <16 x i32> %val3, <16 x i32> %val4, <16 x i8> %val5, <16 x i8> %val6) {
+; CHECK-LABEL: fun126:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 208(%r15)
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vceqf %v0, %v31, %v0
+; CHECK-NEXT:    vceqf %v1, %v29, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v26, %v30
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vceqf %v1, %v27, %v1
+; CHECK-NEXT:    vceqf %v2, %v25, %v2
+; CHECK-NEXT:    vpkf %v1, %v2, %v1
+; CHECK-NEXT:    vceqh %v2, %v24, %v28
+; CHECK-NEXT:    vo %v1, %v2, %v1
+; CHECK-NEXT:    vpkh %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = icmp eq <16 x i32> %val3, %val4
+  %and = or <16 x i1> %cmp0, %cmp1
+  %sel = select <16 x i1> %and, <16 x i8> %val5, <16 x i8> %val6
+  ret <16 x i8> %sel
+}
+
+define <16 x i32> @fun127(<16 x i16> %val1, <16 x i16> %val2, <16 x i64> %val3, <16 x i64> %val4, <16 x i32> %val5, <16 x i32> %val6) {
+; CHECK-LABEL: fun127:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 240(%r15)
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vceqg %v0, %v27, %v0
+; CHECK-NEXT:    vceqg %v1, %v25, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v24, %v28
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vo %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 416(%r15)
+; CHECK-NEXT:    vl %v3, 352(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v0, 304(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vceqg %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vceqg %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v0, %v2, %v0
+; CHECK-NEXT:    vceqh %v2, %v26, %v30
+; CHECK-NEXT:    vuphh %v3, %v2
+; CHECK-NEXT:    vo %v0, %v3, %v0
+; CHECK-NEXT:    vl %v3, 448(%r15)
+; CHECK-NEXT:    vl %v4, 384(%r15)
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v0
+; CHECK-NEXT:    vl %v0, 272(%r15)
+; CHECK-NEXT:    vl %v3, 256(%r15)
+; CHECK-NEXT:    vceqg %v0, %v31, %v0
+; CHECK-NEXT:    vceqg %v3, %v29, %v3
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vpkg %v0, %v3, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vl %v3, 368(%r15)
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 432(%r15)
+; CHECK-NEXT:    vsel %v26, %v3, %v1, %v0
+; CHECK-NEXT:    vl %v0, 336(%r15)
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vceqg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 320(%r15)
+; CHECK-NEXT:    vceqg %v1, %v3, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlg %v1, %v2, %v2
+; CHECK-NEXT:    vl %v2, 400(%r15)
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 464(%r15)
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = icmp eq <16 x i64> %val3, %val4
+  %and = or <16 x i1> %cmp0, %cmp1
+  %sel = select <16 x i1> %and, <16 x i32> %val5, <16 x i32> %val6
+  ret <16 x i32> %sel
+}
+
+define <16 x double> @fun128(<16 x i16> %val1, <16 x i16> %val2, <16 x float> %val3, <16 x float> %val4, <16 x double> %val5, <16 x double> %val6) {
+; CHECK-LABEL: fun128:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 160(%r15)
+; CHECK-NEXT:    vmrlf %v1, %v0, %v0
+; CHECK-NEXT:    vmrlf %v2, %v25, %v25
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v0, %v0, %v0
+; CHECK-NEXT:    vmrhf %v2, %v25, %v25
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vl %v3, 352(%r15)
+; CHECK-NEXT:    vl %v4, 224(%r15)
+; CHECK-NEXT:    vl %v5, 416(%r15)
+; CHECK-NEXT:    vl %v6, 288(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v2, %v0
+; CHECK-NEXT:    vpkg %v0, %v0, %v1
+; CHECK-NEXT:    vceqh %v1, %v24, %v28
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vo %v0, %v2, %v0
+; CHECK-NEXT:    vuphf %v2, %v0
+; CHECK-NEXT:    vsel %v24, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vmrlf %v3, %v2, %v2
+; CHECK-NEXT:    vmrlf %v4, %v27, %v27
+; CHECK-NEXT:    vmrhf %v2, %v2, %v2
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v3, %v4, %v3
+; CHECK-NEXT:    vmrhf %v4, %v27, %v27
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v2, %v4, %v2
+; CHECK-NEXT:    vl %v4, 256(%r15)
+; CHECK-NEXT:    vpkg %v2, %v2, %v3
+; CHECK-NEXT:    vl %v3, 384(%r15)
+; CHECK-NEXT:    vo %v1, %v1, %v2
+; CHECK-NEXT:    vuphf %v2, %v1
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 192(%r15)
+; CHECK-NEXT:    vmrlf %v3, %v2, %v2
+; CHECK-NEXT:    vmrlf %v4, %v29, %v29
+; CHECK-NEXT:    vmrhf %v2, %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v3, %v4, %v3
+; CHECK-NEXT:    vmrhf %v4, %v29, %v29
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v2, %v4, %v2
+; CHECK-NEXT:    vpkg %v2, %v2, %v3
+; CHECK-NEXT:    vceqh %v3, %v26, %v30
+; CHECK-NEXT:    vuphh %v4, %v3
+; CHECK-NEXT:    vo %v2, %v4, %v2
+; CHECK-NEXT:    vuphf %v4, %v2
+; CHECK-NEXT:    vsel %v25, %v6, %v5, %v4
+; CHECK-NEXT:    vl %v4, 208(%r15)
+; CHECK-NEXT:    vmrlf %v5, %v4, %v4
+; CHECK-NEXT:    vmrlf %v6, %v31, %v31
+; CHECK-NEXT:    vmrhf %v4, %v4, %v4
+; CHECK-NEXT:    vmrlg %v3, %v3, %v3
+; CHECK-NEXT:    vuphh %v3, %v3
+; CHECK-NEXT:    vldeb %v5, %v5
+; CHECK-NEXT:    vldeb %v6, %v6
+; CHECK-NEXT:    vfchdb %v5, %v6, %v5
+; CHECK-NEXT:    vmrhf %v6, %v31, %v31
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vldeb %v6, %v6
+; CHECK-NEXT:    vfchdb %v4, %v6, %v4
+; CHECK-NEXT:    vl %v6, 320(%r15)
+; CHECK-NEXT:    vpkg %v4, %v4, %v5
+; CHECK-NEXT:    vl %v5, 448(%r15)
+; CHECK-NEXT:    vo %v3, %v3, %v4
+; CHECK-NEXT:    vuphf %v4, %v3
+; CHECK-NEXT:    vsel %v29, %v6, %v5, %v4
+; CHECK-NEXT:    vl %v4, 368(%r15)
+; CHECK-NEXT:    vl %v5, 240(%r15)
+; CHECK-NEXT:    vsel %v26, %v5, %v4, %v0
+; CHECK-NEXT:    vl %v4, 272(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 400(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v4, %v1, %v0
+; CHECK-NEXT:    vl %v1, 432(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v2, %v2
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v27, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 464(%r15)
+; CHECK-NEXT:    vl %v2, 336(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v3, %v3
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v31, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = fcmp ogt <16 x float> %val3, %val4
+  %and = or <16 x i1> %cmp0, %cmp1
+  %sel = select <16 x i1> %and, <16 x double> %val5, <16 x double> %val6
+  ret <16 x double> %sel
+}
+
+define <16 x i32> @fun129(<16 x i16> %val1, <16 x i16> %val2, <16 x double> %val3, <16 x double> %val4, <16 x i32> %val5, <16 x i32> %val6) {
+; CHECK-LABEL: fun129:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 240(%r15)
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v27, %v0
+; CHECK-NEXT:    vfchdb %v1, %v25, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v24, %v28
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vo %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 416(%r15)
+; CHECK-NEXT:    vl %v3, 352(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v0, 304(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v0, %v2, %v0
+; CHECK-NEXT:    vceqh %v2, %v26, %v30
+; CHECK-NEXT:    vuphh %v3, %v2
+; CHECK-NEXT:    vo %v0, %v3, %v0
+; CHECK-NEXT:    vl %v3, 448(%r15)
+; CHECK-NEXT:    vl %v4, 384(%r15)
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v0
+; CHECK-NEXT:    vl %v0, 272(%r15)
+; CHECK-NEXT:    vl %v3, 256(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v31, %v0
+; CHECK-NEXT:    vfchdb %v3, %v29, %v3
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vpkg %v0, %v3, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vl %v3, 368(%r15)
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 432(%r15)
+; CHECK-NEXT:    vsel %v26, %v3, %v1, %v0
+; CHECK-NEXT:    vl %v0, 336(%r15)
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 320(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v3, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlg %v1, %v2, %v2
+; CHECK-NEXT:    vl %v2, 400(%r15)
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vo %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 464(%r15)
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = fcmp ogt <16 x double> %val3, %val4
+  %and = or <16 x i1> %cmp0, %cmp1
+  %sel = select <16 x i1> %and, <16 x i32> %val5, <16 x i32> %val6
+  ret <16 x i32> %sel
+}
+
+define <2 x i8> @fun130(<2 x i8> %val1, <2 x i8> %val2, <2 x i8> %val3, <2 x i8> %val4, <2 x i8> %val5, <2 x i8> %val6) {
+; CHECK-LABEL: fun130:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v28, %v30
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = icmp eq <2 x i8> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i8> %val5, <2 x i8> %val6
+  ret <2 x i8> %sel
+}
+
+define <2 x i16> @fun131(<2 x i8> %val1, <2 x i8> %val2, <2 x i8> %val3, <2 x i8> %val4, <2 x i16> %val5, <2 x i16> %val6) {
+; CHECK-LABEL: fun131:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v28, %v30
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = icmp eq <2 x i8> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x i8> @fun132(<2 x i8> %val1, <2 x i8> %val2, <2 x i16> %val3, <2 x i16> %val4, <2 x i8> %val5, <2 x i8> %val6) {
+; CHECK-LABEL: fun132:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v1, %v28, %v30
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vpkh %v1, %v1, %v1
+; CHECK-NEXT:    vx %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = icmp eq <2 x i16> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i8> %val5, <2 x i8> %val6
+  ret <2 x i8> %sel
+}
+
+define <2 x i32> @fun133(<2 x i8> %val1, <2 x i8> %val2, <2 x i32> %val3, <2 x i32> %val4, <2 x i32> %val5, <2 x i32> %val6) {
+; CHECK-LABEL: fun133:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = icmp eq <2 x i32> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i32> %val5, <2 x i32> %val6
+  ret <2 x i32> %sel
+}
+
+define <2 x i32> @fun134(<2 x i8> %val1, <2 x i8> %val2, <2 x i64> %val3, <2 x i64> %val4, <2 x i32> %val5, <2 x i32> %val6) {
+; CHECK-LABEL: fun134:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vceqg %v0, %v28, %v30
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vpkg %v0, %v0, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = icmp eq <2 x i64> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i32> %val5, <2 x i32> %val6
+  ret <2 x i32> %sel
+}
+
+define <2 x i16> @fun135(<2 x i8> %val1, <2 x i8> %val2, <2 x float> %val3, <2 x float> %val4, <2 x i16> %val5, <2 x i16> %val6) {
+; CHECK-LABEL: fun135:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = fcmp ogt <2 x float> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x i64> @fun136(<2 x i8> %val1, <2 x i8> %val2, <2 x double> %val3, <2 x double> %val4, <2 x i64> %val5, <2 x i64> %val6) {
+; CHECK-LABEL: fun136:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v28, %v30
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i8> %val1, %val2
+  %cmp1 = fcmp ogt <2 x double> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i64> %val5, <2 x i64> %val6
+  ret <2 x i64> %sel
+}
+
+define <2 x i8> @fun137(<2 x i16> %val1, <2 x i16> %val2, <2 x i16> %val3, <2 x i16> %val4, <2 x i8> %val5, <2 x i8> %val6) {
+; CHECK-LABEL: fun137:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v28, %v30
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vpkh %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = icmp eq <2 x i16> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i8> %val5, <2 x i8> %val6
+  ret <2 x i8> %sel
+}
+
+define <2 x i16> @fun138(<2 x i16> %val1, <2 x i16> %val2, <2 x i16> %val3, <2 x i16> %val4, <2 x i16> %val5, <2 x i16> %val6) {
+; CHECK-LABEL: fun138:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v28, %v30
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = icmp eq <2 x i16> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x i32> @fun139(<2 x i16> %val1, <2 x i16> %val2, <2 x i16> %val3, <2 x i16> %val4, <2 x i32> %val5, <2 x i32> %val6) {
+; CHECK-LABEL: fun139:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v28, %v30
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = icmp eq <2 x i16> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i32> %val5, <2 x i32> %val6
+  ret <2 x i32> %sel
+}
+
+define <2 x i8> @fun140(<2 x i16> %val1, <2 x i16> %val2, <2 x i32> %val3, <2 x i32> %val4, <2 x i8> %val5, <2 x i8> %val6) {
+; CHECK-LABEL: fun140:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v1, %v28, %v30
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vpkf %v1, %v1, %v1
+; CHECK-NEXT:    vx %v0, %v0, %v1
+; CHECK-NEXT:    vpkh %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = icmp eq <2 x i32> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i8> %val5, <2 x i8> %val6
+  ret <2 x i8> %sel
+}
+
+define <2 x i8> @fun141(<2 x i16> %val1, <2 x i16> %val2, <2 x i64> %val3, <2 x i64> %val4, <2 x i8> %val5, <2 x i8> %val6) {
+; CHECK-LABEL: fun141:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    larl %r1, .LCPI141_0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vceqg %v0, %v28, %v30
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vpkh %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = icmp eq <2 x i64> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i8> %val5, <2 x i8> %val6
+  ret <2 x i8> %sel
+}
+
+define <2 x double> @fun142(<2 x i16> %val1, <2 x i16> %val2, <2 x float> %val3, <2 x float> %val4, <2 x double> %val5, <2 x double> %val6) {
+; CHECK-LABEL: fun142:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = fcmp ogt <2 x float> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x double> %val5, <2 x double> %val6
+  ret <2 x double> %sel
+}
+
+define <2 x i16> @fun143(<2 x i16> %val1, <2 x i16> %val2, <2 x double> %val3, <2 x double> %val4, <2 x i16> %val5, <2 x i16> %val6) {
+; CHECK-LABEL: fun143:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    larl %r1, .LCPI143_0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vfchdb %v0, %v28, %v30
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i16> %val1, %val2
+  %cmp1 = fcmp ogt <2 x double> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x i16> @fun144(<2 x i32> %val1, <2 x i32> %val2, <2 x i32> %val3, <2 x i32> %val4, <2 x i16> %val5, <2 x i16> %val6) {
+; CHECK-LABEL: fun144:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i32> %val1, %val2
+  %cmp1 = icmp eq <2 x i32> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x i32> @fun145(<2 x i32> %val1, <2 x i32> %val2, <2 x i32> %val3, <2 x i32> %val4, <2 x i32> %val5, <2 x i32> %val6) {
+; CHECK-LABEL: fun145:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i32> %val1, %val2
+  %cmp1 = icmp eq <2 x i32> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i32> %val5, <2 x i32> %val6
+  ret <2 x i32> %sel
+}
+
+define <2 x i64> @fun146(<2 x i32> %val1, <2 x i32> %val2, <2 x i32> %val3, <2 x i32> %val4, <2 x i64> %val5, <2 x i64> %val6) {
+; CHECK-LABEL: fun146:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i32> %val1, %val2
+  %cmp1 = icmp eq <2 x i32> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i64> %val5, <2 x i64> %val6
+  ret <2 x i64> %sel
+}
+
+define <2 x i64> @fun147(<2 x i32> %val1, <2 x i32> %val2, <2 x i64> %val3, <2 x i64> %val4, <2 x i64> %val5, <2 x i64> %val6) {
+; CHECK-LABEL: fun147:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vceqg %v0, %v28, %v30
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i32> %val1, %val2
+  %cmp1 = icmp eq <2 x i64> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i64> %val5, <2 x i64> %val6
+  ret <2 x i64> %sel
+}
+
+define <2 x i16> @fun148(<2 x i32> %val1, <2 x i32> %val2, <2 x float> %val3, <2 x float> %val4, <2 x i16> %val5, <2 x i16> %val6) {
+; CHECK-LABEL: fun148:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i32> %val1, %val2
+  %cmp1 = fcmp ogt <2 x float> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x float> @fun149(<2 x i32> %val1, <2 x i32> %val2, <2 x double> %val3, <2 x double> %val4, <2 x float> %val5, <2 x float> %val6) {
+; CHECK-LABEL: fun149:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v1, %v28, %v30
+; CHECK-NEXT:    vceqf %v0, %v24, %v26
+; CHECK-NEXT:    vpkg %v1, %v1, %v1
+; CHECK-NEXT:    vx %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i32> %val1, %val2
+  %cmp1 = fcmp ogt <2 x double> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x float> %val5, <2 x float> %val6
+  ret <2 x float> %sel
+}
+
+define <2 x i16> @fun150(<2 x i64> %val1, <2 x i64> %val2, <2 x i64> %val3, <2 x i64> %val4, <2 x i16> %val5, <2 x i16> %val6) {
+; CHECK-LABEL: fun150:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v28, %v30
+; CHECK-NEXT:    vceqg %v1, %v24, %v26
+; CHECK-NEXT:    larl %r1, .LCPI150_0
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i64> %val1, %val2
+  %cmp1 = icmp eq <2 x i64> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x i64> @fun151(<2 x i64> %val1, <2 x i64> %val2, <2 x i64> %val3, <2 x i64> %val4, <2 x i64> %val5, <2 x i64> %val6) {
+; CHECK-LABEL: fun151:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v28, %v30
+; CHECK-NEXT:    vceqg %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i64> %val1, %val2
+  %cmp1 = icmp eq <2 x i64> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i64> %val5, <2 x i64> %val6
+  ret <2 x i64> %sel
+}
+
+define <2 x i64> @fun152(<2 x i64> %val1, <2 x i64> %val2, <2 x float> %val3, <2 x float> %val4, <2 x i64> %val5, <2 x i64> %val6) {
+; CHECK-LABEL: fun152:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vceqg %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i64> %val1, %val2
+  %cmp1 = fcmp ogt <2 x float> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i64> %val5, <2 x i64> %val6
+  ret <2 x i64> %sel
+}
+
+define <2 x i16> @fun153(<2 x i64> %val1, <2 x i64> %val2, <2 x double> %val3, <2 x double> %val4, <2 x i16> %val5, <2 x i16> %val6) {
+; CHECK-LABEL: fun153:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v28, %v30
+; CHECK-NEXT:    vceqg %v1, %v24, %v26
+; CHECK-NEXT:    larl %r1, .LCPI153_0
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <2 x i64> %val1, %val2
+  %cmp1 = fcmp ogt <2 x double> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i16> %val5, <2 x i16> %val6
+  ret <2 x i16> %sel
+}
+
+define <2 x float> @fun154(<2 x float> %val1, <2 x float> %val2, <2 x float> %val3, <2 x float> %val4, <2 x float> %val5, <2 x float> %val6) {
+; CHECK-LABEL: fun154:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v24, %v24
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vmrlf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <2 x float> %val1, %val2
+  %cmp1 = fcmp ogt <2 x float> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x float> %val5, <2 x float> %val6
+  ret <2 x float> %sel
+}
+
+define <2 x i32> @fun155(<2 x float> %val1, <2 x float> %val2, <2 x double> %val3, <2 x double> %val4, <2 x i32> %val5, <2 x i32> %val6) {
+; CHECK-LABEL: fun155:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v26, %v26
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v26, %v26
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vfchdb %v1, %v28, %v30
+; CHECK-NEXT:    vpkg %v1, %v1, %v1
+; CHECK-NEXT:    vx %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <2 x float> %val1, %val2
+  %cmp1 = fcmp ogt <2 x double> %val3, %val4
+  %and = xor <2 x i1> %cmp0, %cmp1
+  %sel = select <2 x i1> %and, <2 x i32> %val5, <2 x i32> %val6
+  ret <2 x i32> %sel
+}
+
+define <4 x i16> @fun156(<4 x i32> %val1, <4 x i32> %val2, <4 x i32> %val3, <4 x i32> %val4, <4 x i16> %val5, <4 x i16> %val6) {
+; CHECK-LABEL: fun156:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i32> %val1, %val2
+  %cmp1 = icmp eq <4 x i32> %val3, %val4
+  %and = xor <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i16> %val5, <4 x i16> %val6
+  ret <4 x i16> %sel
+}
+
+define <4 x i32> @fun157(<4 x i32> %val1, <4 x i32> %val2, <4 x i32> %val3, <4 x i32> %val4, <4 x i32> %val5, <4 x i32> %val6) {
+; CHECK-LABEL: fun157:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i32> %val1, %val2
+  %cmp1 = icmp eq <4 x i32> %val3, %val4
+  %and = xor <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i32> %val5, <4 x i32> %val6
+  ret <4 x i32> %sel
+}
+
+define <4 x i64> @fun158(<4 x i32> %val1, <4 x i32> %val2, <4 x i32> %val3, <4 x i32> %val4, <4 x i64> %val5, <4 x i64> %val6) {
+; CHECK-LABEL: fun158:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v28, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v1
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i32> %val1, %val2
+  %cmp1 = icmp eq <4 x i32> %val3, %val4
+  %and = xor <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i64> %val5, <4 x i64> %val6
+  ret <4 x i64> %sel
+}
+
+define <4 x i32> @fun159(<4 x i32> %val1, <4 x i32> %val2, <4 x i64> %val3, <4 x i64> %val4, <4 x i32> %val5, <4 x i32> %val6) {
+; CHECK-LABEL: fun159:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v30, %v27
+; CHECK-NEXT:    vceqg %v1, %v28, %v25
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v29, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i32> %val1, %val2
+  %cmp1 = icmp eq <4 x i64> %val3, %val4
+  %and = xor <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i32> %val5, <4 x i32> %val6
+  ret <4 x i32> %sel
+}
+
+define <4 x i16> @fun160(<4 x i32> %val1, <4 x i32> %val2, <4 x float> %val3, <4 x float> %val4, <4 x i16> %val5, <4 x i16> %val6) {
+; CHECK-LABEL: fun160:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i32> %val1, %val2
+  %cmp1 = fcmp ogt <4 x float> %val3, %val4
+  %and = xor <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i16> %val5, <4 x i16> %val6
+  ret <4 x i16> %sel
+}
+
+define <4 x i8> @fun161(<4 x i32> %val1, <4 x i32> %val2, <4 x double> %val3, <4 x double> %val4, <4 x i8> %val5, <4 x i8> %val6) {
+; CHECK-LABEL: fun161:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v30, %v27
+; CHECK-NEXT:    vfchdb %v1, %v28, %v25
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v24, %v26
+; CHECK-NEXT:    larl %r1, .LCPI161_0
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v29, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i32> %val1, %val2
+  %cmp1 = fcmp ogt <4 x double> %val3, %val4
+  %and = xor <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i8> %val5, <4 x i8> %val6
+  ret <4 x i8> %sel
+}
+
+define <4 x i32> @fun162(<4 x i64> %val1, <4 x i64> %val2, <4 x i64> %val3, <4 x i64> %val4, <4 x i32> %val5, <4 x i32> %val6) {
+; CHECK-LABEL: fun162:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v27, %v31
+; CHECK-NEXT:    vceqg %v1, %v26, %v30
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vceqg %v1, %v25, %v29
+; CHECK-NEXT:    vceqg %v2, %v24, %v28
+; CHECK-NEXT:    vx %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i64> %val1, %val2
+  %cmp1 = icmp eq <4 x i64> %val3, %val4
+  %and = xor <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i32> %val5, <4 x i32> %val6
+  ret <4 x i32> %sel
+}
+
+define <4 x i64> @fun163(<4 x i64> %val1, <4 x i64> %val2, <4 x i64> %val3, <4 x i64> %val4, <4 x i64> %val5, <4 x i64> %val6) {
+; CHECK-LABEL: fun163:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v25, %v29
+; CHECK-NEXT:    vceqg %v1, %v24, %v28
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vceqg %v0, %v27, %v31
+; CHECK-NEXT:    vceqg %v1, %v26, %v30
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i64> %val1, %val2
+  %cmp1 = icmp eq <4 x i64> %val3, %val4
+  %and = xor <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i64> %val5, <4 x i64> %val6
+  ret <4 x i64> %sel
+}
+
+define <4 x i64> @fun164(<4 x i64> %val1, <4 x i64> %val2, <4 x float> %val3, <4 x float> %val4, <4 x i64> %val5, <4 x i64> %val6) {
+; CHECK-LABEL: fun164:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v27, %v27
+; CHECK-NEXT:    vmrlf %v1, %v25, %v25
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v27, %v27
+; CHECK-NEXT:    vmrhf %v2, %v25, %v25
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vceqg %v2, %v24, %v28
+; CHECK-NEXT:    vx %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v29, %v2, %v1
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vceqg %v1, %v26, %v30
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v31, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i64> %val1, %val2
+  %cmp1 = fcmp ogt <4 x float> %val3, %val4
+  %and = xor <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i64> %val5, <4 x i64> %val6
+  ret <4 x i64> %sel
+}
+
+define <4 x float> @fun165(<4 x i64> %val1, <4 x i64> %val2, <4 x double> %val3, <4 x double> %val4, <4 x float> %val5, <4 x float> %val6) {
+; CHECK-LABEL: fun165:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v27, %v31
+; CHECK-NEXT:    vceqg %v1, %v26, %v30
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vfchdb %v1, %v25, %v29
+; CHECK-NEXT:    vceqg %v2, %v24, %v28
+; CHECK-NEXT:    vx %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <4 x i64> %val1, %val2
+  %cmp1 = fcmp ogt <4 x double> %val3, %val4
+  %and = xor <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x float> %val5, <4 x float> %val6
+  ret <4 x float> %sel
+}
+
+define <4 x i16> @fun166(<4 x float> %val1, <4 x float> %val2, <4 x float> %val3, <4 x float> %val4, <4 x i16> %val5, <4 x i16> %val6) {
+; CHECK-LABEL: fun166:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v24, %v24
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vmrlf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <4 x float> %val1, %val2
+  %cmp1 = fcmp ogt <4 x float> %val3, %val4
+  %and = xor <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i16> %val5, <4 x i16> %val6
+  ret <4 x i16> %sel
+}
+
+define <4 x float> @fun167(<4 x float> %val1, <4 x float> %val2, <4 x float> %val3, <4 x float> %val4, <4 x float> %val5, <4 x float> %val6) {
+; CHECK-LABEL: fun167:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v24, %v24
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vmrlf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <4 x float> %val1, %val2
+  %cmp1 = fcmp ogt <4 x float> %val3, %val4
+  %and = xor <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x float> %val5, <4 x float> %val6
+  ret <4 x float> %sel
+}
+
+define <4 x double> @fun168(<4 x float> %val1, <4 x float> %val2, <4 x float> %val3, <4 x float> %val4, <4 x double> %val5, <4 x double> %val6) {
+; CHECK-LABEL: fun168:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v24, %v24
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vmrlf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v1
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <4 x float> %val1, %val2
+  %cmp1 = fcmp ogt <4 x float> %val3, %val4
+  %and = xor <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x double> %val5, <4 x double> %val6
+  ret <4 x double> %sel
+}
+
+define <4 x i8> @fun169(<4 x float> %val1, <4 x float> %val2, <4 x double> %val3, <4 x double> %val4, <4 x i8> %val5, <4 x i8> %val6) {
+; CHECK-LABEL: fun169:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v30, %v27
+; CHECK-NEXT:    vfchdb %v1, %v28, %v25
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vmrlf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vmrhf %v3, %v24, %v24
+; CHECK-NEXT:    larl %r1, .LCPI169_0
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v29, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <4 x float> %val1, %val2
+  %cmp1 = fcmp ogt <4 x double> %val3, %val4
+  %and = xor <4 x i1> %cmp0, %cmp1
+  %sel = select <4 x i1> %and, <4 x i8> %val5, <4 x i8> %val6
+  ret <4 x i8> %sel
+}
+
+define <8 x i8> @fun170(<8 x i16> %val1, <8 x i16> %val2, <8 x i16> %val3, <8 x i16> %val4, <8 x i8> %val5, <8 x i8> %val6) {
+; CHECK-LABEL: fun170:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v28, %v30
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vpkh %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = icmp eq <8 x i16> %val3, %val4
+  %and = xor <8 x i1> %cmp0, %cmp1
+  %sel = select <8 x i1> %and, <8 x i8> %val5, <8 x i8> %val6
+  ret <8 x i8> %sel
+}
+
+define <8 x i16> @fun171(<8 x i16> %val1, <8 x i16> %val2, <8 x i16> %val3, <8 x i16> %val4, <8 x i16> %val5, <8 x i16> %val6) {
+; CHECK-LABEL: fun171:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v28, %v30
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = icmp eq <8 x i16> %val3, %val4
+  %and = xor <8 x i1> %cmp0, %cmp1
+  %sel = select <8 x i1> %and, <8 x i16> %val5, <8 x i16> %val6
+  ret <8 x i16> %sel
+}
+
+define <8 x i32> @fun172(<8 x i16> %val1, <8 x i16> %val2, <8 x i16> %val3, <8 x i16> %val4, <8 x i32> %val5, <8 x i32> %val6) {
+; CHECK-LABEL: fun172:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v28, %v30
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vuphh %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v1
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = icmp eq <8 x i16> %val3, %val4
+  %and = xor <8 x i1> %cmp0, %cmp1
+  %sel = select <8 x i1> %and, <8 x i32> %val5, <8 x i32> %val6
+  ret <8 x i32> %sel
+}
+
+define <8 x i64> @fun173(<8 x i16> %val1, <8 x i16> %val2, <8 x i32> %val3, <8 x i32> %val4, <8 x i64> %val5, <8 x i64> %val6) {
+; CHECK-LABEL: fun173:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vceqf %v0, %v28, %v25
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vx %v0, %v2, %v0
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vuphf %v2, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v29, %v3, %v2
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vsel %v26, %v31, %v2, %v0
+; CHECK-NEXT:    vceqf %v0, %v30, %v27
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = icmp eq <8 x i32> %val3, %val4
+  %and = xor <8 x i1> %cmp0, %cmp1
+  %sel = select <8 x i1> %and, <8 x i64> %val5, <8 x i64> %val6
+  ret <8 x i64> %sel
+}
+
+define <8 x i8> @fun174(<8 x i16> %val1, <8 x i16> %val2, <8 x i64> %val3, <8 x i64> %val4, <8 x i8> %val5, <8 x i8> %val6) {
+; CHECK-LABEL: fun174:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vl %v1, 160(%r15)
+; CHECK-NEXT:    vceqg %v0, %v27, %v0
+; CHECK-NEXT:    vceqg %v1, %v25, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqg %v1, %v30, %v31
+; CHECK-NEXT:    vceqg %v2, %v28, %v29
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vlrepg %v1, 200(%r15)
+; CHECK-NEXT:    vlrepg %v2, 192(%r15)
+; CHECK-NEXT:    vpkh %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = icmp eq <8 x i64> %val3, %val4
+  %and = xor <8 x i1> %cmp0, %cmp1
+  %sel = select <8 x i1> %and, <8 x i8> %val5, <8 x i8> %val6
+  ret <8 x i8> %sel
+}
+
+define <8 x i16> @fun175(<8 x i16> %val1, <8 x i16> %val2, <8 x float> %val3, <8 x float> %val4, <8 x i16> %val5, <8 x i16> %val6) {
+; CHECK-LABEL: fun175:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v27, %v27
+; CHECK-NEXT:    vmrlf %v1, %v30, %v30
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v27, %v27
+; CHECK-NEXT:    vmrhf %v2, %v30, %v30
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v28, %v28
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v25, %v25
+; CHECK-NEXT:    vmrlf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v25, %v25
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v29, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = fcmp ogt <8 x float> %val3, %val4
+  %and = xor <8 x i1> %cmp0, %cmp1
+  %sel = select <8 x i1> %and, <8 x i16> %val5, <8 x i16> %val6
+  ret <8 x i16> %sel
+}
+
+define <8 x i32> @fun176(<8 x i16> %val1, <8 x i16> %val2, <8 x double> %val3, <8 x double> %val4, <8 x i32> %val5, <8 x i32> %val6) {
+; CHECK-LABEL: fun176:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v30, %v31
+; CHECK-NEXT:    vfchdb %v1, %v28, %v29
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v24, %v26
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vx %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v27, %v0
+; CHECK-NEXT:    vfchdb %v2, %v25, %v2
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vpkg %v0, %v2, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i16> %val1, %val2
+  %cmp1 = fcmp ogt <8 x double> %val3, %val4
+  %and = xor <8 x i1> %cmp0, %cmp1
+  %sel = select <8 x i1> %and, <8 x i32> %val5, <8 x i32> %val6
+  ret <8 x i32> %sel
+}
+
+define <8 x i32> @fun177(<8 x i32> %val1, <8 x i32> %val2, <8 x i64> %val3, <8 x i64> %val4, <8 x i32> %val5, <8 x i32> %val6) {
+; CHECK-LABEL: fun177:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vl %v1, 160(%r15)
+; CHECK-NEXT:    vceqg %v0, %v27, %v0
+; CHECK-NEXT:    vceqg %v1, %v25, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v24, %v28
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 208(%r15)
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vceqg %v0, %v31, %v0
+; CHECK-NEXT:    vceqg %v1, %v29, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v26, %v30
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vl %v2, 240(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i32> %val1, %val2
+  %cmp1 = icmp eq <8 x i64> %val3, %val4
+  %and = xor <8 x i1> %cmp0, %cmp1
+  %sel = select <8 x i1> %and, <8 x i32> %val5, <8 x i32> %val6
+  ret <8 x i32> %sel
+}
+
+define <8 x double> @fun178(<8 x i32> %val1, <8 x i32> %val2, <8 x float> %val3, <8 x float> %val4, <8 x double> %val5, <8 x double> %val6) {
+; CHECK-LABEL: fun178:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v29, %v29
+; CHECK-NEXT:    vmrlf %v1, %v25, %v25
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v29, %v29
+; CHECK-NEXT:    vmrhf %v2, %v25, %v25
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vl %v4, 192(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v24, %v28
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v1
+; CHECK-NEXT:    vmrlf %v1, %v31, %v31
+; CHECK-NEXT:    vmrlf %v2, %v27, %v27
+; CHECK-NEXT:    vmrhf %v3, %v27, %v27
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v31, %v31
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vl %v3, 256(%r15)
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vceqf %v2, %v26, %v30
+; CHECK-NEXT:    vx %v1, %v2, %v1
+; CHECK-NEXT:    vuphf %v2, %v1
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 240(%r15)
+; CHECK-NEXT:    vl %v3, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i32> %val1, %val2
+  %cmp1 = fcmp ogt <8 x float> %val3, %val4
+  %xor = xor <8 x i1> %cmp0, %cmp1  ; set in lanes where exactly one compare holds
+  %sel = select <8 x i1> %xor, <8 x double> %val5, <8 x double> %val6  ; %val5 where the mask is set, else %val6
+  ret <8 x double> %sel
+}
+
+define <8 x double> @fun179(<8 x i32> %val1, <8 x i32> %val2, <8 x double> %val3, <8 x double> %val4, <8 x double> %val5, <8 x double> %val6) {
+; CHECK-LABEL: fun179:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 160(%r15)
+; CHECK-NEXT:    vceqf %v1, %v24, %v28
+; CHECK-NEXT:    vfchdb %v0, %v25, %v0
+; CHECK-NEXT:    vuphf %v2, %v1
+; CHECK-NEXT:    vx %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vl %v3, 224(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v0, 192(%r15)
+; CHECK-NEXT:    vceqf %v2, %v26, %v30
+; CHECK-NEXT:    vfchdb %v0, %v29, %v0
+; CHECK-NEXT:    vuphf %v3, %v2
+; CHECK-NEXT:    vx %v0, %v3, %v0
+; CHECK-NEXT:    vl %v3, 320(%r15)
+; CHECK-NEXT:    vl %v4, 256(%r15)
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v0
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v27, %v0
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 304(%r15)
+; CHECK-NEXT:    vl %v3, 240(%r15)
+; CHECK-NEXT:    vsel %v26, %v3, %v1, %v0
+; CHECK-NEXT:    vl %v0, 208(%r15)
+; CHECK-NEXT:    vmrlg %v1, %v2, %v2
+; CHECK-NEXT:    vfchdb %v0, %v31, %v0
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vl %v2, 272(%r15)
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 336(%r15)
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <8 x i32> %val1, %val2
+  %cmp1 = fcmp ogt <8 x double> %val3, %val4
+  %xor = xor <8 x i1> %cmp0, %cmp1  ; set in lanes where exactly one compare holds
+  %sel = select <8 x i1> %xor, <8 x double> %val5, <8 x double> %val6  ; %val5 where the mask is set, else %val6
+  ret <8 x double> %sel
+}
+
+define <8 x i64> @fun180(<8 x float> %val1, <8 x float> %val2, <8 x double> %val3, <8 x double> %val4, <8 x i64> %val5, <8 x i64> %val6) {
+; CHECK-LABEL: fun180:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v28, %v28
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v28, %v28
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vl %v3, 224(%r15)
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vl %v4, 256(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vfchdb %v2, %v25, %v2
+; CHECK-NEXT:    vx %v1, %v1, %v2
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v1
+; CHECK-NEXT:    vmrlf %v1, %v30, %v30
+; CHECK-NEXT:    vmrlf %v2, %v26, %v26
+; CHECK-NEXT:    vmrhf %v3, %v26, %v26
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v30, %v30
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vuphf %v2, %v1
+; CHECK-NEXT:    vfchdb %v3, %v29, %v3
+; CHECK-NEXT:    vx %v2, %v2, %v3
+; CHECK-NEXT:    vl %v3, 320(%r15)
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vl %v3, 240(%r15)
+; CHECK-NEXT:    vfchdb %v2, %v27, %v2
+; CHECK-NEXT:    vx %v0, %v0, %v2
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vsel %v26, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v2, 272(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vfchdb %v1, %v31, %v1
+; CHECK-NEXT:    vx %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 336(%r15)
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = fcmp ogt <8 x float> %val1, %val2
+  %cmp1 = fcmp ogt <8 x double> %val3, %val4
+  %xor = xor <8 x i1> %cmp0, %cmp1  ; set in lanes where exactly one compare holds
+  %sel = select <8 x i1> %xor, <8 x i64> %val5, <8 x i64> %val6  ; %val5 where the mask is set, else %val6
+  ret <8 x i64> %sel
+}
+
+define <16 x i8> @fun181(<16 x i8> %val1, <16 x i8> %val2, <16 x i8> %val3, <16 x i8> %val4, <16 x i8> %val5, <16 x i8> %val6) {
+; CHECK-LABEL: fun181:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v28, %v30
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = icmp eq <16 x i8> %val3, %val4
+  %xor = xor <16 x i1> %cmp0, %cmp1  ; set in lanes where exactly one compare holds
+  %sel = select <16 x i1> %xor, <16 x i8> %val5, <16 x i8> %val6  ; %val5 where the mask is set, else %val6
+  ret <16 x i8> %sel
+}
+
+define <16 x i16> @fun182(<16 x i8> %val1, <16 x i8> %val2, <16 x i8> %val3, <16 x i8> %val4, <16 x i16> %val5, <16 x i16> %val6) {
+; CHECK-LABEL: fun182:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v28, %v30
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vuphb %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v1
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = icmp eq <16 x i8> %val3, %val4
+  %xor = xor <16 x i1> %cmp0, %cmp1  ; set in lanes where exactly one compare holds
+  %sel = select <16 x i1> %xor, <16 x i16> %val5, <16 x i16> %val6  ; %val5 where the mask is set, else %val6
+  ret <16 x i16> %sel
+}
+
+define <16 x i64> @fun183(<16 x i8> %val1, <16 x i8> %val2, <16 x i16> %val3, <16 x i16> %val4, <16 x i64> %val5, <16 x i64> %val6) {
+; CHECK-LABEL: fun183:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vceqh %v0, %v28, %v25
+; CHECK-NEXT:    vuphb %v2, %v1
+; CHECK-NEXT:    vx %v0, %v2, %v0
+; CHECK-NEXT:    vuphh %v2, %v0
+; CHECK-NEXT:    vl %v3, 256(%r15)
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vsel %v24, %v29, %v3, %v2
+; CHECK-NEXT:    vpkg %v2, %v0, %v0
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vl %v3, 272(%r15)
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vsel %v26, %v31, %v3, %v2
+; CHECK-NEXT:    vmrlg %v2, %v0, %v0
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vsldb %v0, %v0, %v0, 12
+; CHECK-NEXT:    vl %v3, 288(%r15)
+; CHECK-NEXT:    vl %v4, 160(%r15)
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vl %v3, 176(%r15)
+; CHECK-NEXT:    vl %v4, 192(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vsel %v0, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v3, 320(%r15)
+; CHECK-NEXT:    vceqh %v2, %v30, %v27
+; CHECK-NEXT:    vlr %v30, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vx %v1, %v1, %v2
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vsel %v25, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v3, 336(%r15)
+; CHECK-NEXT:    vl %v4, 208(%r15)
+; CHECK-NEXT:    vpkg %v2, %v1, %v1
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vsel %v27, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v3, 352(%r15)
+; CHECK-NEXT:    vl %v4, 224(%r15)
+; CHECK-NEXT:    vmrlg %v2, %v1, %v1
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vsldb %v1, %v1, %v1, 12
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vsel %v29, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 368(%r15)
+; CHECK-NEXT:    vl %v3, 240(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v31, %v3, %v2, %v1
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = icmp eq <16 x i16> %val3, %val4
+  %xor = xor <16 x i1> %cmp0, %cmp1  ; set in lanes where exactly one compare holds
+  %sel = select <16 x i1> %xor, <16 x i64> %val5, <16 x i64> %val6  ; %val5 where the mask is set, else %val6
+  ret <16 x i64> %sel
+}
+
+define <16 x i64> @fun184(<16 x i8> %val1, <16 x i8> %val2, <16 x i32> %val3, <16 x i32> %val4, <16 x i64> %val5, <16 x i64> %val6) {
+; CHECK-LABEL: fun184:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vuphb %v2, %v1
+; CHECK-NEXT:    vceqf %v0, %v28, %v29
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vx %v0, %v2, %v0
+; CHECK-NEXT:    vl %v3, 320(%r15)
+; CHECK-NEXT:    vl %v4, 192(%r15)
+; CHECK-NEXT:    vuphf %v2, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 336(%r15)
+; CHECK-NEXT:    vl %v3, 208(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v26, %v3, %v2, %v0
+; CHECK-NEXT:    vpkg %v2, %v1, %v1
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vceqf %v0, %v30, %v31
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vx %v0, %v2, %v0
+; CHECK-NEXT:    vl %v3, 352(%r15)
+; CHECK-NEXT:    vl %v4, 224(%r15)
+; CHECK-NEXT:    vuphf %v2, %v0
+; CHECK-NEXT:    vl %v5, 256(%r15)
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vl %v4, 384(%r15)
+; CHECK-NEXT:    vmrlg %v3, %v1, %v1
+; CHECK-NEXT:    vuphb %v3, %v3
+; CHECK-NEXT:    vceqf %v2, %v25, %v2
+; CHECK-NEXT:    vuphh %v3, %v3
+; CHECK-NEXT:    vx %v2, %v3, %v2
+; CHECK-NEXT:    vuphf %v3, %v2
+; CHECK-NEXT:    vsldb %v1, %v1, %v1, 12
+; CHECK-NEXT:    vsel %v25, %v5, %v4, %v3
+; CHECK-NEXT:    vl %v3, 176(%r15)
+; CHECK-NEXT:    vl %v4, 416(%r15)
+; CHECK-NEXT:    vl %v5, 288(%r15)
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vceqf %v3, %v27, %v3
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vx %v1, %v1, %v3
+; CHECK-NEXT:    vuphf %v3, %v1
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v29, %v5, %v4, %v3
+; CHECK-NEXT:    vl %v3, 368(%r15)
+; CHECK-NEXT:    vl %v4, 240(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v4, %v3, %v0
+; CHECK-NEXT:    vl %v3, 272(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v2, %v2
+; CHECK-NEXT:    vl %v2, 400(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v27, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 432(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v31, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = icmp eq <16 x i32> %val3, %val4
+  %xor = xor <16 x i1> %cmp0, %cmp1  ; set in lanes where exactly one compare holds
+  %sel = select <16 x i1> %xor, <16 x i64> %val5, <16 x i64> %val6  ; %val5 where the mask is set, else %val6
+  ret <16 x i64> %sel
+}
+
+define <16 x i64> @fun185(<16 x i8> %val1, <16 x i8> %val2, <16 x i64> %val3, <16 x i64> %val4, <16 x i64> %val5, <16 x i64> %val6) {
+; CHECK-LABEL: fun185:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 192(%r15)
+; CHECK-NEXT:    vceqg %v1, %v28, %v0
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v2, %v0
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vx %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 448(%r15)
+; CHECK-NEXT:    vl %v3, 320(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v1
+; CHECK-NEXT:    vpkf %v2, %v0, %v0
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vceqg %v1, %v30, %v1
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vx %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 464(%r15)
+; CHECK-NEXT:    vl %v3, 336(%r15)
+; CHECK-NEXT:    vsel %v26, %v3, %v2, %v1
+; CHECK-NEXT:    vpkg %v2, %v0, %v0
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vl %v3, 352(%r15)
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vceqg %v1, %v25, %v1
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vx %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 480(%r15)
+; CHECK-NEXT:    vsel %v28, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v3, 368(%r15)
+; CHECK-NEXT:    vsldb %v2, %v0, %v0, 6
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vceqg %v1, %v27, %v1
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vx %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 496(%r15)
+; CHECK-NEXT:    vsel %v30, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vl %v3, 384(%r15)
+; CHECK-NEXT:    vmrlg %v2, %v0, %v0
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vceqg %v1, %v29, %v1
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vx %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 512(%r15)
+; CHECK-NEXT:    vsel %v25, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vl %v3, 400(%r15)
+; CHECK-NEXT:    vsldb %v2, %v0, %v0, 10
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vceqg %v1, %v31, %v1
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vx %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 528(%r15)
+; CHECK-NEXT:    vsel %v27, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 288(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vl %v3, 416(%r15)
+; CHECK-NEXT:    vceqg %v1, %v2, %v1
+; CHECK-NEXT:    vsldb %v2, %v0, %v0, 12
+; CHECK-NEXT:    vuphb %v2, %v2
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vsldb %v0, %v0, %v0, 14
+; CHECK-NEXT:    vx %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 544(%r15)
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vsel %v29, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 304(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vceqg %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 432(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vx %v0, %v0, %v1
+; CHECK-NEXT:    vl %v1, 560(%r15)
+; CHECK-NEXT:    vsel %v31, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = icmp eq <16 x i64> %val3, %val4
+  %xor = xor <16 x i1> %cmp0, %cmp1  ; set in lanes where exactly one compare holds
+  %sel = select <16 x i1> %xor, <16 x i64> %val5, <16 x i64> %val6  ; %val5 where the mask is set, else %val6
+  ret <16 x i64> %sel
+}
+
+define <16 x i16> @fun186(<16 x i8> %val1, <16 x i8> %val2, <16 x float> %val3, <16 x float> %val4, <16 x i16> %val5, <16 x i16> %val6) {
+; CHECK-LABEL: fun186:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v31, %v31
+; CHECK-NEXT:    vmrlf %v1, %v30, %v30
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v31, %v31
+; CHECK-NEXT:    vmrhf %v2, %v30, %v30
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v28, %v28
+; CHECK-NEXT:    vmrlf %v4, %v25, %v25
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v29, %v29
+; CHECK-NEXT:    vmrlf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v29, %v29
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vuphb %v2, %v1
+; CHECK-NEXT:    vx %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vmrlf %v2, %v0, %v0
+; CHECK-NEXT:    vmrlf %v3, %v27, %v27
+; CHECK-NEXT:    vmrhf %v0, %v0, %v0
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vmrhf %v3, %v27, %v27
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v0, %v3, %v0
+; CHECK-NEXT:    vpkg %v0, %v0, %v2
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vmrlf %v3, %v2, %v2
+; CHECK-NEXT:    vmrhf %v2, %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v3, %v4, %v3
+; CHECK-NEXT:    vmrhf %v4, %v25, %v25
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v2, %v4, %v2
+; CHECK-NEXT:    vpkg %v2, %v2, %v3
+; CHECK-NEXT:    vpkf %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = fcmp ogt <16 x float> %val3, %val4
+  %xor = xor <16 x i1> %cmp0, %cmp1  ; set in lanes where exactly one compare holds
+  %sel = select <16 x i1> %xor, <16 x i16> %val5, <16 x i16> %val6  ; %val5 where the mask is set, else %val6
+  ret <16 x i16> %sel
+}
+
+define <16 x i8> @fun187(<16 x i8> %val1, <16 x i8> %val2, <16 x double> %val3, <16 x double> %val4, <16 x i8> %val5, <16 x i8> %val6) {
+; CHECK-LABEL: fun187:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 304(%r15)
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 288(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vl %v2, 256(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v31, %v1
+; CHECK-NEXT:    vfchdb %v2, %v29, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v27, %v1
+; CHECK-NEXT:    vfchdb %v2, %v25, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vfchdb %v2, %v30, %v2
+; CHECK-NEXT:    vfchdb %v3, %v28, %v3
+; CHECK-NEXT:    vpkg %v2, %v3, %v2
+; CHECK-NEXT:    vpkf %v1, %v2, %v1
+; CHECK-NEXT:    vpkh %v0, %v1, %v0
+; CHECK-NEXT:    vceqb %v1, %v24, %v26
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 336(%r15)
+; CHECK-NEXT:    vl %v2, 320(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i8> %val1, %val2
+  %cmp1 = fcmp ogt <16 x double> %val3, %val4
+  %xor = xor <16 x i1> %cmp0, %cmp1  ; set in lanes where exactly one compare holds
+  %sel = select <16 x i1> %xor, <16 x i8> %val5, <16 x i8> %val6  ; %val5 where the mask is set, else %val6
+  ret <16 x i8> %sel
+}
+
+define <16 x i8> @fun188(<16 x i16> %val1, <16 x i16> %val2, <16 x i16> %val3, <16 x i16> %val4, <16 x i8> %val5, <16 x i8> %val6) {
+; CHECK-LABEL: fun188:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v27, %v31
+; CHECK-NEXT:    vceqh %v1, %v26, %v30
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v25, %v29
+; CHECK-NEXT:    vceqh %v2, %v24, %v28
+; CHECK-NEXT:    vx %v1, %v2, %v1
+; CHECK-NEXT:    vpkh %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = icmp eq <16 x i16> %val3, %val4
+  %xor = xor <16 x i1> %cmp0, %cmp1  ; set in lanes where exactly one compare holds
+  %sel = select <16 x i1> %xor, <16 x i8> %val5, <16 x i8> %val6  ; %val5 where the mask is set, else %val6
+  ret <16 x i8> %sel
+}
+
+define <16 x i16> @fun189(<16 x i16> %val1, <16 x i16> %val2, <16 x i16> %val3, <16 x i16> %val4, <16 x i16> %val5, <16 x i16> %val6) {
+; CHECK-LABEL: fun189:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v25, %v29
+; CHECK-NEXT:    vceqh %v1, %v24, %v28
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vceqh %v0, %v27, %v31
+; CHECK-NEXT:    vceqh %v1, %v26, %v30
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = icmp eq <16 x i16> %val3, %val4
+  %xor = xor <16 x i1> %cmp0, %cmp1  ; set in lanes where exactly one compare holds
+  %sel = select <16 x i1> %xor, <16 x i16> %val5, <16 x i16> %val6  ; %val5 where the mask is set, else %val6
+  ret <16 x i16> %sel
+}
+
+define <16 x i32> @fun190(<16 x i16> %val1, <16 x i16> %val2, <16 x i16> %val3, <16 x i16> %val4, <16 x i32> %val5, <16 x i32> %val6) {
+; CHECK-LABEL: fun190:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v25, %v29
+; CHECK-NEXT:    vceqh %v1, %v24, %v28
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vuphh %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v1
+; CHECK-NEXT:    vceqh %v1, %v27, %v31
+; CHECK-NEXT:    vceqh %v2, %v26, %v30
+; CHECK-NEXT:    vx %v1, %v2, %v1
+; CHECK-NEXT:    vl %v3, 256(%r15)
+; CHECK-NEXT:    vl %v4, 192(%r15)
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 240(%r15)
+; CHECK-NEXT:    vl %v3, 176(%r15)
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v26, %v3, %v2, %v0
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = icmp eq <16 x i16> %val3, %val4
+  %xor = xor <16 x i1> %cmp0, %cmp1  ; set in lanes where exactly one compare holds
+  %sel = select <16 x i1> %xor, <16 x i32> %val5, <16 x i32> %val6  ; %val5 where the mask is set, else %val6
+  ret <16 x i32> %sel
+}
+
+define <16 x i8> @fun191(<16 x i16> %val1, <16 x i16> %val2, <16 x i32> %val3, <16 x i32> %val4, <16 x i8> %val5, <16 x i8> %val6) {
+; CHECK-LABEL: fun191:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 208(%r15)
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vceqf %v0, %v31, %v0
+; CHECK-NEXT:    vceqf %v1, %v29, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v26, %v30
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vceqf %v1, %v27, %v1
+; CHECK-NEXT:    vceqf %v2, %v25, %v2
+; CHECK-NEXT:    vpkf %v1, %v2, %v1
+; CHECK-NEXT:    vceqh %v2, %v24, %v28
+; CHECK-NEXT:    vx %v1, %v2, %v1
+; CHECK-NEXT:    vpkh %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = icmp eq <16 x i32> %val3, %val4
+  %xor = xor <16 x i1> %cmp0, %cmp1  ; set in lanes where exactly one compare holds
+  %sel = select <16 x i1> %xor, <16 x i8> %val5, <16 x i8> %val6  ; %val5 where the mask is set, else %val6
+  ret <16 x i8> %sel
+}
+
+define <16 x i32> @fun192(<16 x i16> %val1, <16 x i16> %val2, <16 x i64> %val3, <16 x i64> %val4, <16 x i32> %val5, <16 x i32> %val6) {
+; CHECK-LABEL: fun192:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 240(%r15)
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vceqg %v0, %v27, %v0
+; CHECK-NEXT:    vceqg %v1, %v25, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v24, %v28
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vx %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 416(%r15)
+; CHECK-NEXT:    vl %v3, 352(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v0, 304(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vceqg %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vceqg %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v0, %v2, %v0
+; CHECK-NEXT:    vceqh %v2, %v26, %v30
+; CHECK-NEXT:    vuphh %v3, %v2
+; CHECK-NEXT:    vx %v0, %v3, %v0
+; CHECK-NEXT:    vl %v3, 448(%r15)
+; CHECK-NEXT:    vl %v4, 384(%r15)
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v0
+; CHECK-NEXT:    vl %v0, 272(%r15)
+; CHECK-NEXT:    vl %v3, 256(%r15)
+; CHECK-NEXT:    vceqg %v0, %v31, %v0
+; CHECK-NEXT:    vceqg %v3, %v29, %v3
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vpkg %v0, %v3, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vl %v3, 368(%r15)
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 432(%r15)
+; CHECK-NEXT:    vsel %v26, %v3, %v1, %v0
+; CHECK-NEXT:    vl %v0, 336(%r15)
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vceqg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 320(%r15)
+; CHECK-NEXT:    vceqg %v1, %v3, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlg %v1, %v2, %v2
+; CHECK-NEXT:    vl %v2, 400(%r15)
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 464(%r15)
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = icmp eq <16 x i64> %val3, %val4
+  %xor = xor <16 x i1> %cmp0, %cmp1  ; set in lanes where exactly one compare holds
+  %sel = select <16 x i1> %xor, <16 x i32> %val5, <16 x i32> %val6  ; %val5 where the mask is set, else %val6
+  ret <16 x i32> %sel
+}
+
+define <16 x double> @fun193(<16 x i16> %val1, <16 x i16> %val2, <16 x float> %val3, <16 x float> %val4, <16 x double> %val5, <16 x double> %val6) {
+; CHECK-LABEL: fun193:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 160(%r15)
+; CHECK-NEXT:    vmrlf %v1, %v0, %v0
+; CHECK-NEXT:    vmrlf %v2, %v25, %v25
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v0, %v0, %v0
+; CHECK-NEXT:    vmrhf %v2, %v25, %v25
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vl %v3, 352(%r15)
+; CHECK-NEXT:    vl %v4, 224(%r15)
+; CHECK-NEXT:    vl %v5, 416(%r15)
+; CHECK-NEXT:    vl %v6, 288(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v2, %v0
+; CHECK-NEXT:    vpkg %v0, %v0, %v1
+; CHECK-NEXT:    vceqh %v1, %v24, %v28
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vx %v0, %v2, %v0
+; CHECK-NEXT:    vuphf %v2, %v0
+; CHECK-NEXT:    vsel %v24, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vmrlf %v3, %v2, %v2
+; CHECK-NEXT:    vmrlf %v4, %v27, %v27
+; CHECK-NEXT:    vmrhf %v2, %v2, %v2
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v3, %v4, %v3
+; CHECK-NEXT:    vmrhf %v4, %v27, %v27
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v2, %v4, %v2
+; CHECK-NEXT:    vl %v4, 256(%r15)
+; CHECK-NEXT:    vpkg %v2, %v2, %v3
+; CHECK-NEXT:    vl %v3, 384(%r15)
+; CHECK-NEXT:    vx %v1, %v1, %v2
+; CHECK-NEXT:    vuphf %v2, %v1
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v2
+; CHECK-NEXT:    vl %v2, 192(%r15)
+; CHECK-NEXT:    vmrlf %v3, %v2, %v2
+; CHECK-NEXT:    vmrlf %v4, %v29, %v29
+; CHECK-NEXT:    vmrhf %v2, %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v3, %v4, %v3
+; CHECK-NEXT:    vmrhf %v4, %v29, %v29
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v2, %v4, %v2
+; CHECK-NEXT:    vpkg %v2, %v2, %v3
+; CHECK-NEXT:    vceqh %v3, %v26, %v30
+; CHECK-NEXT:    vuphh %v4, %v3
+; CHECK-NEXT:    vx %v2, %v4, %v2
+; CHECK-NEXT:    vuphf %v4, %v2
+; CHECK-NEXT:    vsel %v25, %v6, %v5, %v4
+; CHECK-NEXT:    vl %v4, 208(%r15)
+; CHECK-NEXT:    vmrlf %v5, %v4, %v4
+; CHECK-NEXT:    vmrlf %v6, %v31, %v31
+; CHECK-NEXT:    vmrhf %v4, %v4, %v4
+; CHECK-NEXT:    vmrlg %v3, %v3, %v3
+; CHECK-NEXT:    vuphh %v3, %v3
+; CHECK-NEXT:    vldeb %v5, %v5
+; CHECK-NEXT:    vldeb %v6, %v6
+; CHECK-NEXT:    vfchdb %v5, %v6, %v5
+; CHECK-NEXT:    vmrhf %v6, %v31, %v31
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vldeb %v6, %v6
+; CHECK-NEXT:    vfchdb %v4, %v6, %v4
+; CHECK-NEXT:    vl %v6, 320(%r15)
+; CHECK-NEXT:    vpkg %v4, %v4, %v5
+; CHECK-NEXT:    vl %v5, 448(%r15)
+; CHECK-NEXT:    vx %v3, %v3, %v4
+; CHECK-NEXT:    vuphf %v4, %v3
+; CHECK-NEXT:    vsel %v29, %v6, %v5, %v4
+; CHECK-NEXT:    vl %v4, 368(%r15)
+; CHECK-NEXT:    vl %v5, 240(%r15)
+; CHECK-NEXT:    vsel %v26, %v5, %v4, %v0
+; CHECK-NEXT:    vl %v4, 272(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 400(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v4, %v1, %v0
+; CHECK-NEXT:    vl %v1, 432(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v2, %v2
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v27, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 464(%r15)
+; CHECK-NEXT:    vl %v2, 336(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v3, %v3
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v31, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = fcmp ogt <16 x float> %val3, %val4
+  %xor = xor <16 x i1> %cmp0, %cmp1  ; set in lanes where exactly one compare holds
+  %sel = select <16 x i1> %xor, <16 x double> %val5, <16 x double> %val6  ; %val5 where the mask is set, else %val6
+  ret <16 x double> %sel
+}
+
+define <16 x i32> @fun194(<16 x i16> %val1, <16 x i16> %val2, <16 x double> %val3, <16 x double> %val4, <16 x i32> %val5, <16 x i32> %val6) {
+; CHECK-LABEL: fun194:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 240(%r15)
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v27, %v0
+; CHECK-NEXT:    vfchdb %v1, %v25, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqh %v1, %v24, %v28
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vx %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 416(%r15)
+; CHECK-NEXT:    vl %v3, 352(%r15)
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v0, 304(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v2, %v0
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v0, %v2, %v0
+; CHECK-NEXT:    vceqh %v2, %v26, %v30
+; CHECK-NEXT:    vuphh %v3, %v2
+; CHECK-NEXT:    vx %v0, %v3, %v0
+; CHECK-NEXT:    vl %v3, 448(%r15)
+; CHECK-NEXT:    vl %v4, 384(%r15)
+; CHECK-NEXT:    vsel %v28, %v4, %v3, %v0
+; CHECK-NEXT:    vl %v0, 272(%r15)
+; CHECK-NEXT:    vl %v3, 256(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v31, %v0
+; CHECK-NEXT:    vfchdb %v3, %v29, %v3
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vpkg %v0, %v3, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vl %v3, 368(%r15)
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 432(%r15)
+; CHECK-NEXT:    vsel %v26, %v3, %v1, %v0
+; CHECK-NEXT:    vl %v0, 336(%r15)
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 320(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v3, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlg %v1, %v2, %v2
+; CHECK-NEXT:    vl %v2, 400(%r15)
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vx %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 464(%r15)
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp0 = icmp eq <16 x i16> %val1, %val2
+  %cmp1 = fcmp ogt <16 x double> %val3, %val4
+  %xor = xor <16 x i1> %cmp0, %cmp1  ; set in lanes where exactly one compare holds
+  %sel = select <16 x i1> %xor, <16 x i32> %val5, <16 x i32> %val6  ; %val5 where the mask is set, else %val6
+  ret <16 x i32> %sel
+}
+
diff --git a/test/CodeGen/SystemZ/vec-cmpsel.ll b/test/CodeGen/SystemZ/vec-cmpsel.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2d518a2cc838f8ffb61a5947caf695d8cd4e53fb
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-cmpsel.ll
@@ -0,0 +1,3378 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+;
+; Test that vector compare / select combinations do not produce any
+; unnecessary pack /unpack / shift instructions.
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+
+
+define <2 x i8> @fun0(<2 x i8> %val1, <2 x i8> %val2, <2 x i8> %val3, <2 x i8> %val4) {
+; CHECK-LABEL: fun0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4
+  ret <2 x i8> %sel
+}
+
+define <2 x i16> @fun1(<2 x i8> %val1, <2 x i8> %val2, <2 x i16> %val3, <2 x i16> %val4) {
+; CHECK-LABEL: fun1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4
+  ret <2 x i16> %sel
+}
+
+define <2 x i32> @fun2(<2 x i8> %val1, <2 x i8> %val2, <2 x i32> %val3, <2 x i32> %val4) {
+; CHECK-LABEL: fun2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4
+  ret <2 x i32> %sel
+}
+
+define <2 x i64> @fun3(<2 x i8> %val1, <2 x i8> %val2, <2 x i64> %val3, <2 x i64> %val4) {
+; CHECK-LABEL: fun3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4
+  ret <2 x i64> %sel
+}
+
+define <2 x float> @fun4(<2 x i8> %val1, <2 x i8> %val2, <2 x float> %val3, <2 x float> %val4) {
+; CHECK-LABEL: fun4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
+  ret <2 x float> %sel
+}
+
+define <2 x double> @fun5(<2 x i8> %val1, <2 x i8> %val2, <2 x double> %val3, <2 x double> %val4) {
+; CHECK-LABEL: fun5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i8> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
+  ret <2 x double> %sel
+}
+
+define <2 x i8> @fun6(<2 x i16> %val1, <2 x i16> %val2, <2 x i8> %val3, <2 x i8> %val4) {
+; CHECK-LABEL: fun6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vpkh %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4
+  ret <2 x i8> %sel
+}
+
+define <2 x i16> @fun7(<2 x i16> %val1, <2 x i16> %val2, <2 x i16> %val3, <2 x i16> %val4) {
+; CHECK-LABEL: fun7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4
+  ret <2 x i16> %sel
+}
+
+define <2 x i32> @fun8(<2 x i16> %val1, <2 x i16> %val2, <2 x i32> %val3, <2 x i32> %val4) {
+; CHECK-LABEL: fun8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4
+  ret <2 x i32> %sel
+}
+
+define <2 x i64> @fun9(<2 x i16> %val1, <2 x i16> %val2, <2 x i64> %val3, <2 x i64> %val4) {
+; CHECK-LABEL: fun9:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4
+  ret <2 x i64> %sel
+}
+
+define <2 x float> @fun10(<2 x i16> %val1, <2 x i16> %val2, <2 x float> %val3, <2 x float> %val4) {
+; CHECK-LABEL: fun10:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
+  ret <2 x float> %sel
+}
+
+define <2 x double> @fun11(<2 x i16> %val1, <2 x i16> %val2, <2 x double> %val3, <2 x double> %val4) {
+; CHECK-LABEL: fun11:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i16> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
+  ret <2 x double> %sel
+}
+
+define <2 x i8> @fun12(<2 x i32> %val1, <2 x i32> %val2, <2 x i8> %val3, <2 x i8> %val4) {
+; CHECK-LABEL: fun12:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    larl %r1, .LCPI12_0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vceqf %v0, %v24, %v26
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4
+  ret <2 x i8> %sel
+}
+
+define <2 x i16> @fun13(<2 x i32> %val1, <2 x i32> %val2, <2 x i16> %val3, <2 x i16> %val4) {
+; CHECK-LABEL: fun13:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v24, %v26
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4
+  ret <2 x i16> %sel
+}
+
+define <2 x i32> @fun14(<2 x i32> %val1, <2 x i32> %val2, <2 x i32> %val3, <2 x i32> %val4) {
+; CHECK-LABEL: fun14:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v24, %v26
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4
+  ret <2 x i32> %sel
+}
+
+define <2 x i64> @fun15(<2 x i32> %val1, <2 x i32> %val2, <2 x i64> %val3, <2 x i64> %val4) {
+; CHECK-LABEL: fun15:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v24, %v26
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4
+  ret <2 x i64> %sel
+}
+
+define <2 x float> @fun16(<2 x i32> %val1, <2 x i32> %val2, <2 x float> %val3, <2 x float> %val4) {
+; CHECK-LABEL: fun16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v24, %v26
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
+  ret <2 x float> %sel
+}
+
+define <2 x double> @fun17(<2 x i32> %val1, <2 x i32> %val2, <2 x double> %val3, <2 x double> %val4) {
+; CHECK-LABEL: fun17:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v24, %v26
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i32> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
+  ret <2 x double> %sel
+}
+
+define <2 x i8> @fun18(<2 x i64> %val1, <2 x i64> %val2, <2 x i8> %val3, <2 x i8> %val4) {
+; CHECK-LABEL: fun18:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v24, %v26
+; CHECK-NEXT:    vrepih %v1, 1807
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4
+  ret <2 x i8> %sel
+}
+
+define <2 x i16> @fun19(<2 x i64> %val1, <2 x i64> %val2, <2 x i16> %val3, <2 x i16> %val4) {
+; CHECK-LABEL: fun19:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    larl %r1, .LCPI19_0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vceqg %v0, %v24, %v26
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4
+  ret <2 x i16> %sel
+}
+
+define <2 x i32> @fun20(<2 x i64> %val1, <2 x i64> %val2, <2 x i32> %val3, <2 x i32> %val4) {
+; CHECK-LABEL: fun20:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v24, %v26
+; CHECK-NEXT:    vpkg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4
+  ret <2 x i32> %sel
+}
+
+define <2 x i64> @fun21(<2 x i64> %val1, <2 x i64> %val2, <2 x i64> %val3, <2 x i64> %val4) {
+; CHECK-LABEL: fun21:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v24, %v26
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4
+  ret <2 x i64> %sel
+}
+
+define <2 x float> @fun22(<2 x i64> %val1, <2 x i64> %val2, <2 x float> %val3, <2 x float> %val4) {
+; CHECK-LABEL: fun22:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v24, %v26
+; CHECK-NEXT:    vpkg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
+  ret <2 x float> %sel
+}
+
+define <2 x double> @fun23(<2 x i64> %val1, <2 x i64> %val2, <2 x double> %val3, <2 x double> %val4) {
+; CHECK-LABEL: fun23:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v24, %v26
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <2 x i64> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
+  ret <2 x double> %sel
+}
+
+define <4 x i8> @fun24(<4 x i8> %val1, <4 x i8> %val2, <4 x i8> %val3, <4 x i8> %val4) {
+; CHECK-LABEL: fun24:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4
+  ret <4 x i8> %sel
+}
+
+define <4 x i16> @fun25(<4 x i8> %val1, <4 x i8> %val2, <4 x i16> %val3, <4 x i16> %val4) {
+; CHECK-LABEL: fun25:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4
+  ret <4 x i16> %sel
+}
+
+define <4 x i32> @fun26(<4 x i8> %val1, <4 x i8> %val2, <4 x i32> %val3, <4 x i32> %val4) {
+; CHECK-LABEL: fun26:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4
+  ret <4 x i32> %sel
+}
+
+define <4 x i64> @fun27(<4 x i8> %val1, <4 x i8> %val2, <4 x i64> %val3, <4 x i64> %val4) {
+; CHECK-LABEL: fun27:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v1, %v0
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v25, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
+  ret <4 x i64> %sel
+}
+
+define <4 x float> @fun28(<4 x i8> %val1, <4 x i8> %val2, <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: fun28:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %sel
+}
+
+define <4 x double> @fun29(<4 x i8> %val1, <4 x i8> %val2, <4 x double> %val3, <4 x double> %val4) {
+; CHECK-LABEL: fun29:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v1, %v0
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v25, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i8> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
+  ret <4 x double> %sel
+}
+
+define <4 x i8> @fun30(<4 x i16> %val1, <4 x i16> %val2, <4 x i8> %val3, <4 x i8> %val4) {
+; CHECK-LABEL: fun30:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vpkh %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4
+  ret <4 x i8> %sel
+}
+
+define <4 x i16> @fun31(<4 x i16> %val1, <4 x i16> %val2, <4 x i16> %val3, <4 x i16> %val4) {
+; CHECK-LABEL: fun31:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4
+  ret <4 x i16> %sel
+}
+
+define <4 x i32> @fun32(<4 x i16> %val1, <4 x i16> %val2, <4 x i32> %val3, <4 x i32> %val4) {
+; CHECK-LABEL: fun32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4
+  ret <4 x i32> %sel
+}
+
+define <4 x i64> @fun33(<4 x i16> %val1, <4 x i16> %val2, <4 x i64> %val3, <4 x i64> %val4) {
+; CHECK-LABEL: fun33:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vuphh %v1, %v0
+; CHECK-NEXT:    vpkg %v0, %v0, %v0
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v25, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
+  ret <4 x i64> %sel
+}
+
+define <4 x float> @fun34(<4 x i16> %val1, <4 x i16> %val2, <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: fun34:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %sel
+}
+
+define <4 x double> @fun35(<4 x i16> %val1, <4 x i16> %val2, <4 x double> %val3, <4 x double> %val4) {
+; CHECK-LABEL: fun35:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vuphh %v1, %v0
+; CHECK-NEXT:    vpkg %v0, %v0, %v0
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v25, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i16> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
+  ret <4 x double> %sel
+}
+
+define <4 x i8> @fun36(<4 x i32> %val1, <4 x i32> %val2, <4 x i8> %val3, <4 x i8> %val4) {
+; CHECK-LABEL: fun36:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    larl %r1, .LCPI36_0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vceqf %v0, %v24, %v26
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4
+  ret <4 x i8> %sel
+}
+
+define <4 x i16> @fun37(<4 x i32> %val1, <4 x i32> %val2, <4 x i16> %val3, <4 x i16> %val4) {
+; CHECK-LABEL: fun37:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v24, %v26
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4
+  ret <4 x i16> %sel
+}
+
+define <4 x i32> @fun38(<4 x i32> %val1, <4 x i32> %val2, <4 x i32> %val3, <4 x i32> %val4) {
+; CHECK-LABEL: fun38:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v24, %v26
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4
+  ret <4 x i32> %sel
+}
+
+define <4 x i64> @fun39(<4 x i32> %val1, <4 x i32> %val2, <4 x i64> %val3, <4 x i64> %val4) {
+; CHECK-LABEL: fun39:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v24, %v26
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v25, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
+  ret <4 x i64> %sel
+}
+
+define <4 x float> @fun40(<4 x i32> %val1, <4 x i32> %val2, <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: fun40:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v24, %v26
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %sel
+}
+
+define <4 x double> @fun41(<4 x i32> %val1, <4 x i32> %val2, <4 x double> %val3, <4 x double> %val4) {
+; CHECK-LABEL: fun41:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v24, %v26
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v25, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i32> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
+  ret <4 x double> %sel
+}
+
+define <4 x i8> @fun42(<4 x i64> %val1, <4 x i64> %val2, <4 x i8> %val3, <4 x i8> %val4) {
+; CHECK-LABEL: fun42:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    larl %r1, .LCPI42_0
+; CHECK-NEXT:    vl %v2, 0(%r1)
+; CHECK-NEXT:    vceqg %v0, %v26, %v30
+; CHECK-NEXT:    vceqg %v1, %v24, %v28
+; CHECK-NEXT:    vperm %v0, %v1, %v0, %v2
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4
+  ret <4 x i8> %sel
+}
+
+define <4 x i16> @fun43(<4 x i64> %val1, <4 x i64> %val2, <4 x i16> %val3, <4 x i16> %val4) {
+; CHECK-LABEL: fun43:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    larl %r1, .LCPI43_0
+; CHECK-NEXT:    vl %v2, 0(%r1)
+; CHECK-NEXT:    vceqg %v0, %v26, %v30
+; CHECK-NEXT:    vceqg %v1, %v24, %v28
+; CHECK-NEXT:    vperm %v0, %v1, %v0, %v2
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4
+  ret <4 x i16> %sel
+}
+
+define <4 x i32> @fun44(<4 x i64> %val1, <4 x i64> %val2, <4 x i32> %val3, <4 x i32> %val4) {
+; CHECK-LABEL: fun44:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v26, %v30
+; CHECK-NEXT:    vceqg %v1, %v24, %v28
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4
+  ret <4 x i32> %sel
+}
+
+define <4 x i64> @fun45(<4 x i64> %val1, <4 x i64> %val2, <4 x i64> %val3, <4 x i64> %val4) {
+; CHECK-LABEL: fun45:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v24, %v28
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v0
+; CHECK-NEXT:    vceqg %v0, %v26, %v30
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
+  ret <4 x i64> %sel
+}
+
+define <4 x float> @fun46(<4 x i64> %val1, <4 x i64> %val2, <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: fun46:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v26, %v30
+; CHECK-NEXT:    vceqg %v1, %v24, %v28
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %sel
+}
+
+define <4 x double> @fun47(<4 x i64> %val1, <4 x i64> %val2, <4 x double> %val3, <4 x double> %val4) {
+; CHECK-LABEL: fun47:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v24, %v28
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v0
+; CHECK-NEXT:    vceqg %v0, %v26, %v30
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <4 x i64> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
+  ret <4 x double> %sel
+}
+
+define <8 x i8> @fun48(<8 x i8> %val1, <8 x i8> %val2, <8 x i8> %val3, <8 x i8> %val4) {
+; CHECK-LABEL: fun48:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4
+  ret <8 x i8> %sel
+}
+
+define <8 x i16> @fun49(<8 x i8> %val1, <8 x i8> %val2, <8 x i16> %val3, <8 x i16> %val4) {
+; CHECK-LABEL: fun49:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4
+  ret <8 x i16> %sel
+}
+
+define <8 x i32> @fun50(<8 x i8> %val1, <8 x i8> %val2, <8 x i32> %val3, <8 x i32> %val4) {
+; CHECK-LABEL: fun50:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v1, %v0
+; CHECK-NEXT:    vpkg %v0, %v0, %v0
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v25, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
+  ret <8 x i32> %sel
+}
+
+define <8 x i64> @fun51(<8 x i8> %val1, <8 x i8> %val2, <8 x i64> %val3, <8 x i64> %val4) {
+; CHECK-LABEL: fun51:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v1, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v24, %v28, %v29, %v1
+; CHECK-NEXT:    vpkf %v1, %v0, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v31, %v1
+; CHECK-NEXT:    vpkg %v1, %v0, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vsldb %v0, %v0, %v0, 6
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v25, %v2, %v1
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v27, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4
+  ret <8 x i64> %sel
+}
+
+define <8 x float> @fun52(<8 x i8> %val1, <8 x i8> %val2, <8 x float> %val3, <8 x float> %val4) {
+; CHECK-LABEL: fun52:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v1, %v0
+; CHECK-NEXT:    vpkg %v0, %v0, %v0
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v25, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
+  ret <8 x float> %sel
+}
+
+define <8 x double> @fun53(<8 x i8> %val1, <8 x i8> %val2, <8 x double> %val3, <8 x double> %val4) {
+; CHECK-LABEL: fun53:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v1, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v24, %v28, %v29, %v1
+; CHECK-NEXT:    vpkf %v1, %v0, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v31, %v1
+; CHECK-NEXT:    vpkg %v1, %v0, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vsldb %v0, %v0, %v0, 6
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v25, %v2, %v1
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v27, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i8> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4
+  ret <8 x double> %sel
+}
+
+define <8 x i8> @fun54(<8 x i16> %val1, <8 x i16> %val2, <8 x i8> %val3, <8 x i8> %val4) {
+; CHECK-LABEL: fun54:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vpkh %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4
+  ret <8 x i8> %sel
+}
+
+define <8 x i16> @fun55(<8 x i16> %val1, <8 x i16> %val2, <8 x i16> %val3, <8 x i16> %val4) {
+; CHECK-LABEL: fun55:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4
+  ret <8 x i16> %sel
+}
+
+define <8 x i32> @fun56(<8 x i16> %val1, <8 x i16> %val2, <8 x i32> %val3, <8 x i32> %val4) {
+; CHECK-LABEL: fun56:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vuphh %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v25, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
+  ret <8 x i32> %sel
+}
+
+define <8 x i64> @fun57(<8 x i16> %val1, <8 x i16> %val2, <8 x i64> %val3, <8 x i64> %val4) {
+; CHECK-LABEL: fun57:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vuphh %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v24, %v28, %v29, %v1
+; CHECK-NEXT:    vpkg %v1, %v0, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v31, %v1
+; CHECK-NEXT:    vmrlg %v1, %v0, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vsldb %v0, %v0, %v0, 12
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v25, %v2, %v1
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v27, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4
+  ret <8 x i64> %sel
+}
+
+define <8 x float> @fun58(<8 x i16> %val1, <8 x i16> %val2, <8 x float> %val3, <8 x float> %val4) {
+; CHECK-LABEL: fun58:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vuphh %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v25, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
+  ret <8 x float> %sel
+}
+
+define <8 x double> @fun59(<8 x i16> %val1, <8 x i16> %val2, <8 x double> %val3, <8 x double> %val4) {
+; CHECK-LABEL: fun59:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v26
+; CHECK-NEXT:    vuphh %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v24, %v28, %v29, %v1
+; CHECK-NEXT:    vpkg %v1, %v0, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v31, %v1
+; CHECK-NEXT:    vmrlg %v1, %v0, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vsldb %v0, %v0, %v0, 12
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v25, %v2, %v1
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v27, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i16> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4
+  ret <8 x double> %sel
+}
+
+define <8 x i8> @fun60(<8 x i32> %val1, <8 x i32> %val2, <8 x i8> %val3, <8 x i8> %val4) {
+; CHECK-LABEL: fun60:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    larl %r1, .LCPI60_0
+; CHECK-NEXT:    vl %v2, 0(%r1)
+; CHECK-NEXT:    vceqf %v0, %v26, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v28
+; CHECK-NEXT:    vperm %v0, %v1, %v0, %v2
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4
+  ret <8 x i8> %sel
+}
+
+define <8 x i16> @fun61(<8 x i32> %val1, <8 x i32> %val2, <8 x i16> %val3, <8 x i16> %val4) {
+; CHECK-LABEL: fun61:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v26, %v30
+; CHECK-NEXT:    vceqf %v1, %v24, %v28
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4
+  ret <8 x i16> %sel
+}
+
+define <8 x i32> @fun62(<8 x i32> %val1, <8 x i32> %val2, <8 x i32> %val3, <8 x i32> %val4) {
+; CHECK-LABEL: fun62:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v24, %v28
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v0
+; CHECK-NEXT:    vceqf %v0, %v26, %v30
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
+  ret <8 x i32> %sel
+}
+
+define <8 x i64> @fun63(<8 x i32> %val1, <8 x i32> %val2, <8 x i64> %val3, <8 x i64> %val4) {
+; CHECK-LABEL: fun63:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v24, %v28
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v2, %v1
+; CHECK-NEXT:    vceqf %v1, %v26, %v30
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vuphf %v2, %v1
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v29, %v3, %v2
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v26, %v27, %v2, %v0
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v31, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4
+  ret <8 x i64> %sel
+}
+
+define <8 x float> @fun64(<8 x i32> %val1, <8 x i32> %val2, <8 x float> %val3, <8 x float> %val4) {
+; CHECK-LABEL: fun64:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v24, %v28
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v0
+; CHECK-NEXT:    vceqf %v0, %v26, %v30
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
+  ret <8 x float> %sel
+}
+
+define <8 x double> @fun65(<8 x i32> %val1, <8 x i32> %val2, <8 x double> %val3, <8 x double> %val4) {
+; CHECK-LABEL: fun65:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v24, %v28
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v2, %v1
+; CHECK-NEXT:    vceqf %v1, %v26, %v30
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vuphf %v2, %v1
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v29, %v3, %v2
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v26, %v27, %v2, %v0
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v31, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i32> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4
+  ret <8 x double> %sel
+}
+
+define <8 x i8> @fun66(<8 x i64> %val1, <8 x i64> %val2, <8 x i8> %val3, <8 x i8> %val4) {
+; CHECK-LABEL: fun66:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v30, %v31
+; CHECK-NEXT:    vceqg %v1, %v28, %v29
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqg %v1, %v26, %v27
+; CHECK-NEXT:    vceqg %v2, %v24, %v25
+; CHECK-NEXT:    larl %r1, .LCPI66_0
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 0(%r1)
+; CHECK-NEXT:    vperm %v0, %v1, %v0, %v2
+; CHECK-NEXT:    vlrepg %v1, 168(%r15)
+; CHECK-NEXT:    vlrepg %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4
+  ret <8 x i8> %sel
+}
+
+define <8 x i16> @fun67(<8 x i64> %val1, <8 x i64> %val2, <8 x i16> %val3, <8 x i16> %val4) {
+; CHECK-LABEL: fun67:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v30, %v31
+; CHECK-NEXT:    vceqg %v1, %v28, %v29
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vceqg %v1, %v26, %v27
+; CHECK-NEXT:    vceqg %v2, %v24, %v25
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4
+  ret <8 x i16> %sel
+}
+
+define <8 x i32> @fun68(<8 x i64> %val1, <8 x i64> %val2, <8 x i32> %val3, <8 x i32> %val4) {
+; CHECK-LABEL: fun68:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v26, %v27
+; CHECK-NEXT:    vceqg %v1, %v24, %v25
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vceqg %v0, %v30, %v31
+; CHECK-NEXT:    vceqg %v1, %v28, %v29
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
+  ret <8 x i32> %sel
+}
+
+define <8 x i64> @fun69(<8 x i64> %val1, <8 x i64> %val2, <8 x i64> %val3, <8 x i64> %val4) {
+; CHECK-LABEL: fun69:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vceqg %v0, %v24, %v25
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vceqg %v0, %v26, %v27
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vl %v2, 192(%r15)
+; CHECK-NEXT:    vceqg %v0, %v28, %v29
+; CHECK-NEXT:    vsel %v28, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vceqg %v0, %v30, %v31
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4
+  ret <8 x i64> %sel
+}
+
+define <8 x float> @fun70(<8 x i64> %val1, <8 x i64> %val2, <8 x float> %val3, <8 x float> %val4) {
+; CHECK-LABEL: fun70:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqg %v0, %v26, %v27
+; CHECK-NEXT:    vceqg %v1, %v24, %v25
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vceqg %v0, %v30, %v31
+; CHECK-NEXT:    vceqg %v1, %v28, %v29
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
+  ret <8 x float> %sel
+}
+
+define <8 x double> @fun71(<8 x i64> %val1, <8 x i64> %val2, <8 x double> %val3, <8 x double> %val4) {
+; CHECK-LABEL: fun71:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vceqg %v0, %v24, %v25
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vceqg %v0, %v26, %v27
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vl %v2, 192(%r15)
+; CHECK-NEXT:    vceqg %v0, %v28, %v29
+; CHECK-NEXT:    vsel %v28, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vceqg %v0, %v30, %v31
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <8 x i64> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4
+  ret <8 x double> %sel
+}
+
+define <16 x i8> @fun72(<16 x i8> %val1, <16 x i8> %val2, <16 x i8> %val3, <16 x i8> %val4) {
+; CHECK-LABEL: fun72:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4
+  ret <16 x i8> %sel
+}
+
+define <16 x i16> @fun73(<16 x i8> %val1, <16 x i8> %val2, <16 x i16> %val3, <16 x i16> %val4) {
+; CHECK-LABEL: fun73:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v25, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
+  ret <16 x i16> %sel
+}
+
+define <16 x i32> @fun74(<16 x i8> %val1, <16 x i8> %val2, <16 x i32> %val3, <16 x i32> %val4) {
+; CHECK-LABEL: fun74:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v1, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vsel %v24, %v28, %v29, %v1
+; CHECK-NEXT:    vpkg %v1, %v0, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v31, %v1
+; CHECK-NEXT:    vmrlg %v1, %v0, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vsldb %v0, %v0, %v0, 12
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v25, %v2, %v1
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v27, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4
+  ret <16 x i32> %sel
+}
+
+define <16 x i64> @fun75(<16 x i8> %val1, <16 x i8> %val2, <16 x i64> %val3, <16 x i64> %val4) {
+; CHECK-LABEL: fun75:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v1, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vl %v2, 192(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v24, %v28, %v2, %v1
+; CHECK-NEXT:    vpkf %v1, %v0, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v2, %v1
+; CHECK-NEXT:    vpkg %v1, %v0, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vsel %v28, %v25, %v2, %v1
+; CHECK-NEXT:    vl %v2, 240(%r15)
+; CHECK-NEXT:    vsldb %v1, %v0, %v0, 6
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v30, %v27, %v2, %v1
+; CHECK-NEXT:    vl %v2, 256(%r15)
+; CHECK-NEXT:    vmrlg %v1, %v0, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v25, %v29, %v2, %v1
+; CHECK-NEXT:    vl %v2, 272(%r15)
+; CHECK-NEXT:    vsldb %v1, %v0, %v0, 10
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v27, %v31, %v2, %v1
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vsldb %v1, %v0, %v0, 12
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vsldb %v0, %v0, %v0, 14
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v29, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 304(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v31, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4
+  ret <16 x i64> %sel
+}
+
+define <16 x float> @fun76(<16 x i8> %val1, <16 x i8> %val2, <16 x float> %val3, <16 x float> %val4) {
+; CHECK-LABEL: fun76:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v1, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vsel %v24, %v28, %v29, %v1
+; CHECK-NEXT:    vpkg %v1, %v0, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v31, %v1
+; CHECK-NEXT:    vmrlg %v1, %v0, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vsldb %v0, %v0, %v0, 12
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v25, %v2, %v1
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v27, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4
+  ret <16 x float> %sel
+}
+
+define <16 x double> @fun77(<16 x i8> %val1, <16 x i8> %val2, <16 x double> %val3, <16 x double> %val4) {
+; CHECK-LABEL: fun77:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqb %v0, %v24, %v26
+; CHECK-NEXT:    vuphb %v1, %v0
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vl %v2, 192(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v24, %v28, %v2, %v1
+; CHECK-NEXT:    vpkf %v1, %v0, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v2, %v1
+; CHECK-NEXT:    vpkg %v1, %v0, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vsel %v28, %v25, %v2, %v1
+; CHECK-NEXT:    vl %v2, 240(%r15)
+; CHECK-NEXT:    vsldb %v1, %v0, %v0, 6
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v30, %v27, %v2, %v1
+; CHECK-NEXT:    vl %v2, 256(%r15)
+; CHECK-NEXT:    vmrlg %v1, %v0, %v0
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v25, %v29, %v2, %v1
+; CHECK-NEXT:    vl %v2, 272(%r15)
+; CHECK-NEXT:    vsldb %v1, %v0, %v0, 10
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v27, %v31, %v2, %v1
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vsldb %v1, %v0, %v0, 12
+; CHECK-NEXT:    vuphb %v1, %v1
+; CHECK-NEXT:    vsldb %v0, %v0, %v0, 14
+; CHECK-NEXT:    vuphh %v1, %v1
+; CHECK-NEXT:    vuphb %v0, %v0
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v29, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v1, 304(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v31, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i8> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4
+  ret <16 x double> %sel
+}
+
+define <16 x i8> @fun78(<16 x i16> %val1, <16 x i16> %val2, <16 x i8> %val3, <16 x i8> %val4) {
+; CHECK-LABEL: fun78:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v26, %v30
+; CHECK-NEXT:    vceqh %v1, %v24, %v28
+; CHECK-NEXT:    vpkh %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4
+  ret <16 x i8> %sel
+}
+
+define <16 x i16> @fun79(<16 x i16> %val1, <16 x i16> %val2, <16 x i16> %val3, <16 x i16> %val4) {
+; CHECK-LABEL: fun79:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v28
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v0
+; CHECK-NEXT:    vceqh %v0, %v26, %v30
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
+  ret <16 x i16> %sel
+}
+
+define <16 x i32> @fun80(<16 x i16> %val1, <16 x i16> %val2, <16 x i32> %val3, <16 x i32> %val4) {
+; CHECK-LABEL: fun80:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v28
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vuphh %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v2, %v1
+; CHECK-NEXT:    vceqh %v1, %v26, %v30
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v29, %v3, %v2
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v26, %v27, %v2, %v0
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v31, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4
+  ret <16 x i32> %sel
+}
+
+define <16 x i64> @fun81(<16 x i16> %val1, <16 x i16> %val2, <16 x i64> %val3, <16 x i64> %val4) {
+; CHECK-LABEL: fun81:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v28
+; CHECK-NEXT:    vuphh %v1, %v0
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v24, %v25, %v2, %v1
+; CHECK-NEXT:    vceqh %v1, %v26, %v30
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vl %v3, 288(%r15)
+; CHECK-NEXT:    vl %v4, 160(%r15)
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vsel %v25, %v4, %v3, %v2
+; CHECK-NEXT:    vpkg %v2, %v0, %v0
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vl %v3, 240(%r15)
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vsel %v26, %v27, %v3, %v2
+; CHECK-NEXT:    vmrlg %v2, %v0, %v0
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vsldb %v0, %v0, %v0, 12
+; CHECK-NEXT:    vl %v3, 256(%r15)
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v29, %v3, %v2
+; CHECK-NEXT:    vl %v2, 272(%r15)
+; CHECK-NEXT:    vl %v3, 176(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v31, %v2, %v0
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vpkg %v0, %v1, %v1
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v27, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v2, 320(%r15)
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v29, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vsldb %v0, %v1, %v1, 12
+; CHECK-NEXT:    vl %v1, 336(%r15)
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v31, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4
+  ret <16 x i64> %sel
+}
+
+define <16 x float> @fun82(<16 x i16> %val1, <16 x i16> %val2, <16 x float> %val3, <16 x float> %val4) {
+; CHECK-LABEL: fun82:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v28
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vuphh %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v2, %v1
+; CHECK-NEXT:    vceqh %v1, %v26, %v30
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v29, %v3, %v2
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v26, %v27, %v2, %v0
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v31, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4
+  ret <16 x float> %sel
+}
+
+define <16 x double> @fun83(<16 x i16> %val1, <16 x i16> %val2, <16 x double> %val3, <16 x double> %val4) {
+; CHECK-LABEL: fun83:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqh %v0, %v24, %v28
+; CHECK-NEXT:    vuphh %v1, %v0
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v24, %v25, %v2, %v1
+; CHECK-NEXT:    vceqh %v1, %v26, %v30
+; CHECK-NEXT:    vuphh %v2, %v1
+; CHECK-NEXT:    vl %v3, 288(%r15)
+; CHECK-NEXT:    vl %v4, 160(%r15)
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vsel %v25, %v4, %v3, %v2
+; CHECK-NEXT:    vpkg %v2, %v0, %v0
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vl %v3, 240(%r15)
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vsel %v26, %v27, %v3, %v2
+; CHECK-NEXT:    vmrlg %v2, %v0, %v0
+; CHECK-NEXT:    vuphh %v2, %v2
+; CHECK-NEXT:    vsldb %v0, %v0, %v0, 12
+; CHECK-NEXT:    vl %v3, 256(%r15)
+; CHECK-NEXT:    vuphf %v2, %v2
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vsel %v28, %v29, %v3, %v2
+; CHECK-NEXT:    vl %v2, 272(%r15)
+; CHECK-NEXT:    vl %v3, 176(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v31, %v2, %v0
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vpkg %v0, %v1, %v1
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v27, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v2, 320(%r15)
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v29, %v3, %v2, %v0
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vsldb %v0, %v1, %v1, 12
+; CHECK-NEXT:    vl %v1, 336(%r15)
+; CHECK-NEXT:    vuphh %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v31, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i16> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4
+  ret <16 x double> %sel
+}
+
+define <16 x i8> @fun84(<16 x i32> %val1, <16 x i32> %val2, <16 x i8> %val3, <16 x i8> %val4) {
+; CHECK-LABEL: fun84:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v30, %v31
+; CHECK-NEXT:    vceqf %v1, %v28, %v29
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vceqf %v1, %v26, %v27
+; CHECK-NEXT:    vceqf %v2, %v24, %v25
+; CHECK-NEXT:    vpkf %v1, %v2, %v1
+; CHECK-NEXT:    vpkh %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4
+  ret <16 x i8> %sel
+}
+
+define <16 x i16> @fun85(<16 x i32> %val1, <16 x i32> %val2, <16 x i16> %val3, <16 x i16> %val4) {
+; CHECK-LABEL: fun85:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v0, %v26, %v27
+; CHECK-NEXT:    vceqf %v1, %v24, %v25
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vceqf %v0, %v30, %v31
+; CHECK-NEXT:    vceqf %v1, %v28, %v29
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
+  ret <16 x i16> %sel
+}
+
+define <16 x i32> @fun86(<16 x i32> %val1, <16 x i32> %val2, <16 x i32> %val3, <16 x i32> %val4) {
+; CHECK-LABEL: fun86:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vceqf %v0, %v24, %v25
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vceqf %v0, %v26, %v27
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vl %v2, 192(%r15)
+; CHECK-NEXT:    vceqf %v0, %v28, %v29
+; CHECK-NEXT:    vsel %v28, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vceqf %v0, %v30, %v31
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4
+  ret <16 x i32> %sel
+}
+
+define <16 x i64> @fun87(<16 x i32> %val1, <16 x i32> %val2, <16 x i64> %val3, <16 x i64> %val4) {
+; CHECK-LABEL: fun87:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v1, %v24, %v25
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vuphf %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT:    vceqf %v2, %v26, %v27
+; CHECK-NEXT:    vl %v3, 320(%r15)
+; CHECK-NEXT:    vl %v4, 192(%r15)
+; CHECK-NEXT:    vuphf %v0, %v2
+; CHECK-NEXT:    vsel %v0, %v4, %v3, %v0
+; CHECK-NEXT:    vceqf %v3, %v28, %v29
+; CHECK-NEXT:    vl %v5, 352(%r15)
+; CHECK-NEXT:    vl %v6, 224(%r15)
+; CHECK-NEXT:    vuphf %v4, %v3
+; CHECK-NEXT:    vsel %v25, %v6, %v5, %v4
+; CHECK-NEXT:    vceqf %v4, %v30, %v31
+; CHECK-NEXT:    vl %v6, 384(%r15)
+; CHECK-NEXT:    vl %v7, 256(%r15)
+; CHECK-NEXT:    vuphf %v5, %v4
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vsel %v29, %v7, %v6, %v5
+; CHECK-NEXT:    vl %v5, 304(%r15)
+; CHECK-NEXT:    vl %v6, 176(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v26, %v6, %v5, %v1
+; CHECK-NEXT:    vmrlg %v1, %v2, %v2
+; CHECK-NEXT:    vl %v2, 336(%r15)
+; CHECK-NEXT:    vl %v5, 208(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v30, %v5, %v2, %v1
+; CHECK-NEXT:    vmrlg %v1, %v3, %v3
+; CHECK-NEXT:    vl %v2, 368(%r15)
+; CHECK-NEXT:    vl %v3, 240(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vlr %v28, %v0
+; CHECK-NEXT:    vsel %v27, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v2, 400(%r15)
+; CHECK-NEXT:    vl %v3, 272(%r15)
+; CHECK-NEXT:    vmrlg %v1, %v4, %v4
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v31, %v3, %v2, %v1
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4
+  ret <16 x i64> %sel
+}
+
+define <16 x float> @fun88(<16 x i32> %val1, <16 x i32> %val2, <16 x float> %val3, <16 x float> %val4) {
+; CHECK-LABEL: fun88:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vceqf %v0, %v24, %v25
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vceqf %v0, %v26, %v27
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vl %v2, 192(%r15)
+; CHECK-NEXT:    vceqf %v0, %v28, %v29
+; CHECK-NEXT:    vsel %v28, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vceqf %v0, %v30, %v31
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4
+  ret <16 x float> %sel
+}
+
+define <16 x double> @fun89(<16 x i32> %val1, <16 x i32> %val2, <16 x double> %val3, <16 x double> %val4) {
+; CHECK-LABEL: fun89:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vceqf %v1, %v24, %v25
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vuphf %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT:    vceqf %v2, %v26, %v27
+; CHECK-NEXT:    vl %v3, 320(%r15)
+; CHECK-NEXT:    vl %v4, 192(%r15)
+; CHECK-NEXT:    vuphf %v0, %v2
+; CHECK-NEXT:    vsel %v0, %v4, %v3, %v0
+; CHECK-NEXT:    vceqf %v3, %v28, %v29
+; CHECK-NEXT:    vl %v5, 352(%r15)
+; CHECK-NEXT:    vl %v6, 224(%r15)
+; CHECK-NEXT:    vuphf %v4, %v3
+; CHECK-NEXT:    vsel %v25, %v6, %v5, %v4
+; CHECK-NEXT:    vceqf %v4, %v30, %v31
+; CHECK-NEXT:    vl %v6, 384(%r15)
+; CHECK-NEXT:    vl %v7, 256(%r15)
+; CHECK-NEXT:    vuphf %v5, %v4
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vsel %v29, %v7, %v6, %v5
+; CHECK-NEXT:    vl %v5, 304(%r15)
+; CHECK-NEXT:    vl %v6, 176(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v26, %v6, %v5, %v1
+; CHECK-NEXT:    vmrlg %v1, %v2, %v2
+; CHECK-NEXT:    vl %v2, 336(%r15)
+; CHECK-NEXT:    vl %v5, 208(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v30, %v5, %v2, %v1
+; CHECK-NEXT:    vmrlg %v1, %v3, %v3
+; CHECK-NEXT:    vl %v2, 368(%r15)
+; CHECK-NEXT:    vl %v3, 240(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vlr %v28, %v0
+; CHECK-NEXT:    vsel %v27, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v2, 400(%r15)
+; CHECK-NEXT:    vl %v3, 272(%r15)
+; CHECK-NEXT:    vmrlg %v1, %v4, %v4
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v31, %v3, %v2, %v1
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i32> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4
+  ret <16 x double> %sel
+}
+
+define <16 x i8> @fun90(<16 x i64> %val1, <16 x i64> %val2, <16 x i8> %val3, <16 x i8> %val4) {
+; CHECK-LABEL: fun90:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 272(%r15)
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vceqg %v0, %v31, %v0
+; CHECK-NEXT:    vceqg %v1, %v29, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vceqg %v1, %v27, %v1
+; CHECK-NEXT:    vceqg %v2, %v25, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vl %v2, 192(%r15)
+; CHECK-NEXT:    vceqg %v1, %v30, %v1
+; CHECK-NEXT:    vceqg %v2, %v28, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vceqg %v2, %v26, %v2
+; CHECK-NEXT:    vceqg %v3, %v24, %v3
+; CHECK-NEXT:    vpkg %v2, %v3, %v2
+; CHECK-NEXT:    vpkf %v1, %v2, %v1
+; CHECK-NEXT:    vpkh %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 304(%r15)
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4
+  ret <16 x i8> %sel
+}
+
+define <16 x i16> @fun91(<16 x i64> %val1, <16 x i64> %val2, <16 x i16> %val3, <16 x i16> %val4) {
+; CHECK-LABEL: fun91:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 208(%r15)
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vceqg %v0, %v30, %v0
+; CHECK-NEXT:    vceqg %v1, %v28, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vceqg %v1, %v26, %v1
+; CHECK-NEXT:    vceqg %v2, %v24, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 320(%r15)
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 272(%r15)
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vceqg %v0, %v31, %v0
+; CHECK-NEXT:    vceqg %v1, %v29, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vceqg %v1, %v27, %v1
+; CHECK-NEXT:    vceqg %v2, %v25, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 336(%r15)
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
+  ret <16 x i16> %sel
+}
+
+define <16 x i32> @fun92(<16 x i64> %val1, <16 x i64> %val2, <16 x i32> %val3, <16 x i32> %val4) {
+; CHECK-LABEL: fun92:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vl %v1, 160(%r15)
+; CHECK-NEXT:    vceqg %v0, %v26, %v0
+; CHECK-NEXT:    vceqg %v1, %v24, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 352(%r15)
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 208(%r15)
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vceqg %v0, %v30, %v0
+; CHECK-NEXT:    vceqg %v1, %v28, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 368(%r15)
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 240(%r15)
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vceqg %v0, %v27, %v0
+; CHECK-NEXT:    vceqg %v1, %v25, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 384(%r15)
+; CHECK-NEXT:    vl %v2, 320(%r15)
+; CHECK-NEXT:    vsel %v28, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 272(%r15)
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vceqg %v0, %v31, %v0
+; CHECK-NEXT:    vceqg %v1, %v29, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 400(%r15)
+; CHECK-NEXT:    vl %v2, 336(%r15)
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4
+  ret <16 x i32> %sel
+}
+
+define <16 x i64> @fun93(<16 x i64> %val1, <16 x i64> %val2, <16 x i64> %val3, <16 x i64> %val4) {
+; CHECK-LABEL: fun93:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 160(%r15)
+; CHECK-NEXT:    vl %v1, 416(%r15)
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vceqg %v0, %v24, %v0
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vl %v1, 432(%r15)
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vceqg %v0, %v26, %v0
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 192(%r15)
+; CHECK-NEXT:    vl %v1, 448(%r15)
+; CHECK-NEXT:    vl %v2, 320(%r15)
+; CHECK-NEXT:    vceqg %v0, %v28, %v0
+; CHECK-NEXT:    vsel %v28, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 208(%r15)
+; CHECK-NEXT:    vl %v1, 464(%r15)
+; CHECK-NEXT:    vl %v2, 336(%r15)
+; CHECK-NEXT:    vceqg %v0, %v30, %v0
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 224(%r15)
+; CHECK-NEXT:    vl %v1, 480(%r15)
+; CHECK-NEXT:    vl %v2, 352(%r15)
+; CHECK-NEXT:    vceqg %v0, %v25, %v0
+; CHECK-NEXT:    vsel %v25, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 240(%r15)
+; CHECK-NEXT:    vl %v1, 496(%r15)
+; CHECK-NEXT:    vl %v2, 368(%r15)
+; CHECK-NEXT:    vceqg %v0, %v27, %v0
+; CHECK-NEXT:    vsel %v27, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 256(%r15)
+; CHECK-NEXT:    vceqg %v0, %v29, %v0
+; CHECK-NEXT:    vl %v1, 512(%r15)
+; CHECK-NEXT:    vl %v2, 384(%r15)
+; CHECK-NEXT:    vsel %v29, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 272(%r15)
+; CHECK-NEXT:    vceqg %v0, %v31, %v0
+; CHECK-NEXT:    vl %v1, 528(%r15)
+; CHECK-NEXT:    vl %v2, 400(%r15)
+; CHECK-NEXT:    vsel %v31, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4
+  ret <16 x i64> %sel
+}
+
+define <16 x float> @fun94(<16 x i64> %val1, <16 x i64> %val2, <16 x float> %val3, <16 x float> %val4) {
+; CHECK-LABEL: fun94:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vl %v1, 160(%r15)
+; CHECK-NEXT:    vceqg %v0, %v26, %v0
+; CHECK-NEXT:    vceqg %v1, %v24, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 352(%r15)
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 208(%r15)
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vceqg %v0, %v30, %v0
+; CHECK-NEXT:    vceqg %v1, %v28, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 368(%r15)
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 240(%r15)
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vceqg %v0, %v27, %v0
+; CHECK-NEXT:    vceqg %v1, %v25, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 384(%r15)
+; CHECK-NEXT:    vl %v2, 320(%r15)
+; CHECK-NEXT:    vsel %v28, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 272(%r15)
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vceqg %v0, %v31, %v0
+; CHECK-NEXT:    vceqg %v1, %v29, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 400(%r15)
+; CHECK-NEXT:    vl %v2, 336(%r15)
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4
+  ret <16 x float> %sel
+}
+
+define <16 x double> @fun95(<16 x i64> %val1, <16 x i64> %val2, <16 x double> %val3, <16 x double> %val4) {
+; CHECK-LABEL: fun95:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 160(%r15)
+; CHECK-NEXT:    vl %v1, 416(%r15)
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vceqg %v0, %v24, %v0
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vl %v1, 432(%r15)
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vceqg %v0, %v26, %v0
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 192(%r15)
+; CHECK-NEXT:    vl %v1, 448(%r15)
+; CHECK-NEXT:    vl %v2, 320(%r15)
+; CHECK-NEXT:    vceqg %v0, %v28, %v0
+; CHECK-NEXT:    vsel %v28, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 208(%r15)
+; CHECK-NEXT:    vl %v1, 464(%r15)
+; CHECK-NEXT:    vl %v2, 336(%r15)
+; CHECK-NEXT:    vceqg %v0, %v30, %v0
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 224(%r15)
+; CHECK-NEXT:    vl %v1, 480(%r15)
+; CHECK-NEXT:    vl %v2, 352(%r15)
+; CHECK-NEXT:    vceqg %v0, %v25, %v0
+; CHECK-NEXT:    vsel %v25, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 240(%r15)
+; CHECK-NEXT:    vl %v1, 496(%r15)
+; CHECK-NEXT:    vl %v2, 368(%r15)
+; CHECK-NEXT:    vceqg %v0, %v27, %v0
+; CHECK-NEXT:    vsel %v27, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 256(%r15)
+; CHECK-NEXT:    vceqg %v0, %v29, %v0
+; CHECK-NEXT:    vl %v1, 512(%r15)
+; CHECK-NEXT:    vl %v2, 384(%r15)
+; CHECK-NEXT:    vsel %v29, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 272(%r15)
+; CHECK-NEXT:    vceqg %v0, %v31, %v0
+; CHECK-NEXT:    vl %v1, 528(%r15)
+; CHECK-NEXT:    vl %v2, 400(%r15)
+; CHECK-NEXT:    vsel %v31, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = icmp eq <16 x i64> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4
+  ret <16 x double> %sel
+}
+
+define <2 x i8> @fun96(<2 x float> %val1, <2 x float> %val2, <2 x i8> %val3, <2 x i8> %val4) {
+; CHECK-LABEL: fun96:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v26, %v26
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v26, %v26
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    larl %r1, .LCPI96_0
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4
+  ret <2 x i8> %sel
+}
+
+define <2 x i16> @fun97(<2 x float> %val1, <2 x float> %val2, <2 x i16> %val3, <2 x i16> %val4) {
+; CHECK-LABEL: fun97:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v26, %v26
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v26, %v26
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4
+  ret <2 x i16> %sel
+}
+
+define <2 x i32> @fun98(<2 x float> %val1, <2 x float> %val2, <2 x i32> %val3, <2 x i32> %val4) {
+; CHECK-LABEL: fun98:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v26, %v26
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v26, %v26
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4
+  ret <2 x i32> %sel
+}
+
+define <2 x i64> @fun99(<2 x float> %val1, <2 x float> %val2, <2 x i64> %val3, <2 x i64> %val4) {
+; CHECK-LABEL: fun99:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v26, %v26
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v26, %v26
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4
+  ret <2 x i64> %sel
+}
+
+define <2 x float> @fun100(<2 x float> %val1, <2 x float> %val2, <2 x float> %val3, <2 x float> %val4) {
+; CHECK-LABEL: fun100:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v26, %v26
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v26, %v26
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
+  ret <2 x float> %sel
+}
+
+define <2 x double> @fun101(<2 x float> %val1, <2 x float> %val2, <2 x double> %val3, <2 x double> %val4) {
+; CHECK-LABEL: fun101:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v26, %v26
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v26, %v26
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <2 x float> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
+  ret <2 x double> %sel
+}
+
+define <2 x i8> @fun102(<2 x double> %val1, <2 x double> %val2, <2 x i8> %val3, <2 x i8> %val4) {
+; CHECK-LABEL: fun102:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v24, %v26
+; CHECK-NEXT:    vrepih %v1, 1807
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i8> %val3, <2 x i8> %val4
+  ret <2 x i8> %sel
+}
+
+define <2 x i16> @fun103(<2 x double> %val1, <2 x double> %val2, <2 x i16> %val3, <2 x i16> %val4) {
+; CHECK-LABEL: fun103:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    larl %r1, .LCPI103_0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vfchdb %v0, %v24, %v26
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i16> %val3, <2 x i16> %val4
+  ret <2 x i16> %sel
+}
+
+define <2 x i32> @fun104(<2 x double> %val1, <2 x double> %val2, <2 x i32> %val3, <2 x i32> %val4) {
+; CHECK-LABEL: fun104:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v24, %v26
+; CHECK-NEXT:    vpkg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i32> %val3, <2 x i32> %val4
+  ret <2 x i32> %sel
+}
+
+define <2 x i64> @fun105(<2 x double> %val1, <2 x double> %val2, <2 x i64> %val3, <2 x i64> %val4) {
+; CHECK-LABEL: fun105:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v24, %v26
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x i64> %val3, <2 x i64> %val4
+  ret <2 x i64> %sel
+}
+
+define <2 x float> @fun106(<2 x double> %val1, <2 x double> %val2, <2 x float> %val3, <2 x float> %val4) {
+; CHECK-LABEL: fun106:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v24, %v26
+; CHECK-NEXT:    vpkg %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x float> %val3, <2 x float> %val4
+  ret <2 x float> %sel
+}
+
+define <2 x double> @fun107(<2 x double> %val1, <2 x double> %val2, <2 x double> %val3, <2 x double> %val4) {
+; CHECK-LABEL: fun107:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v24, %v26
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <2 x double> %val1, %val2
+  %sel = select <2 x i1> %cmp, <2 x double> %val3, <2 x double> %val4
+  ret <2 x double> %sel
+}
+
+define <4 x i8> @fun108(<4 x float> %val1, <4 x float> %val2, <4 x i8> %val3, <4 x i8> %val4) {
+; CHECK-LABEL: fun108:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v26, %v26
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v26, %v26
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    larl %r1, .LCPI108_0
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 0(%r1)
+; CHECK-NEXT:    vperm %v0, %v0, %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4
+  ret <4 x i8> %sel
+}
+
+define <4 x i16> @fun109(<4 x float> %val1, <4 x float> %val2, <4 x i16> %val3, <4 x i16> %val4) {
+; CHECK-LABEL: fun109:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v26, %v26
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v26, %v26
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vpkf %v0, %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4
+  ret <4 x i16> %sel
+}
+
+define <4 x i32> @fun110(<4 x float> %val1, <4 x float> %val2, <4 x i32> %val3, <4 x i32> %val4) {
+; CHECK-LABEL: fun110:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v26, %v26
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v26, %v26
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4
+  ret <4 x i32> %sel
+}
+
+define <4 x i64> @fun111(<4 x float> %val1, <4 x float> %val2, <4 x i64> %val3, <4 x i64> %val4) {
+; CHECK-LABEL: fun111:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v26, %v26
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v26, %v26
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v25, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
+  ret <4 x i64> %sel
+}
+
+define <4 x float> @fun112(<4 x float> %val1, <4 x float> %val2, <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: fun112:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v26, %v26
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v26, %v26
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v30, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %sel
+}
+
+define <4 x double> @fun113(<4 x float> %val1, <4 x float> %val2, <4 x double> %val3, <4 x double> %val4) {
+; CHECK-LABEL: fun113:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v26, %v26
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v26, %v26
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v24, %v28, %v25, %v1
+; CHECK-NEXT:    vsel %v26, %v30, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <4 x float> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
+  ret <4 x double> %sel
+}
+
+define <4 x i8> @fun114(<4 x double> %val1, <4 x double> %val2, <4 x i8> %val3, <4 x i8> %val4) {
+; CHECK-LABEL: fun114:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    larl %r1, .LCPI114_0
+; CHECK-NEXT:    vl %v2, 0(%r1)
+; CHECK-NEXT:    vfchdb %v0, %v26, %v30
+; CHECK-NEXT:    vfchdb %v1, %v24, %v28
+; CHECK-NEXT:    vperm %v0, %v1, %v0, %v2
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i8> %val3, <4 x i8> %val4
+  ret <4 x i8> %sel
+}
+
+define <4 x i16> @fun115(<4 x double> %val1, <4 x double> %val2, <4 x i16> %val3, <4 x i16> %val4) {
+; CHECK-LABEL: fun115:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    larl %r1, .LCPI115_0
+; CHECK-NEXT:    vl %v2, 0(%r1)
+; CHECK-NEXT:    vfchdb %v0, %v26, %v30
+; CHECK-NEXT:    vfchdb %v1, %v24, %v28
+; CHECK-NEXT:    vperm %v0, %v1, %v0, %v2
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i16> %val3, <4 x i16> %val4
+  ret <4 x i16> %sel
+}
+
+define <4 x i32> @fun116(<4 x double> %val1, <4 x double> %val2, <4 x i32> %val3, <4 x i32> %val4) {
+; CHECK-LABEL: fun116:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v26, %v30
+; CHECK-NEXT:    vfchdb %v1, %v24, %v28
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i32> %val3, <4 x i32> %val4
+  ret <4 x i32> %sel
+}
+
+define <4 x i64> @fun117(<4 x double> %val1, <4 x double> %val2, <4 x i64> %val3, <4 x i64> %val4) {
+; CHECK-LABEL: fun117:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v24, %v28
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v0
+; CHECK-NEXT:    vfchdb %v0, %v26, %v30
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x i64> %val3, <4 x i64> %val4
+  ret <4 x i64> %sel
+}
+
+define <4 x float> @fun118(<4 x double> %val1, <4 x double> %val2, <4 x float> %val3, <4 x float> %val4) {
+; CHECK-LABEL: fun118:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v26, %v30
+; CHECK-NEXT:    vfchdb %v1, %v24, %v28
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x float> %val3, <4 x float> %val4
+  ret <4 x float> %sel
+}
+
+define <4 x double> @fun119(<4 x double> %val1, <4 x double> %val2, <4 x double> %val3, <4 x double> %val4) {
+; CHECK-LABEL: fun119:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v24, %v28
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v0
+; CHECK-NEXT:    vfchdb %v0, %v26, %v30
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <4 x double> %val1, %val2
+  %sel = select <4 x i1> %cmp, <4 x double> %val3, <4 x double> %val4
+  ret <4 x double> %sel
+}
+
+define <8 x i8> @fun120(<8 x float> %val1, <8 x float> %val2, <8 x i8> %val3, <8 x i8> %val4) {
+; CHECK-LABEL: fun120:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    larl %r1, .LCPI120_0
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vmrlf %v2, %v24, %v24
+; CHECK-NEXT:    vmrhf %v3, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 0(%r1)
+; CHECK-NEXT:    vperm %v0, %v1, %v0, %v2
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4
+  ret <8 x i8> %sel
+}
+
+define <8 x i16> @fun121(<8 x float> %val1, <8 x float> %val2, <8 x i16> %val3, <8 x i16> %val4) {
+; CHECK-LABEL: fun121:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v24, %v24
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vmrlf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v27, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4
+  ret <8 x i16> %sel
+}
+
+define <8 x i32> @fun122(<8 x float> %val1, <8 x float> %val2, <8 x i32> %val3, <8 x i32> %val4) {
+; CHECK-LABEL: fun122:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v28, %v28
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v28, %v28
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v0
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
+  ret <8 x i32> %sel
+}
+
+define <8 x i64> @fun123(<8 x float> %val1, <8 x float> %val2, <8 x i64> %val3, <8 x i64> %val4) {
+; CHECK-LABEL: fun123:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v28, %v28
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v28, %v28
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v26, %v26
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v2, %v1
+; CHECK-NEXT:    vmrlf %v1, %v30, %v30
+; CHECK-NEXT:    vmrlf %v2, %v26, %v26
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v30, %v30
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vuphf %v2, %v1
+; CHECK-NEXT:    vsel %v28, %v29, %v3, %v2
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v27, %v2, %v0
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v31, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4
+  ret <8 x i64> %sel
+}
+
+define <8 x float> @fun124(<8 x float> %val1, <8 x float> %val2, <8 x float> %val3, <8 x float> %val4) {
+; CHECK-LABEL: fun124:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v28, %v28
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v28, %v28
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v29, %v0
+; CHECK-NEXT:    vmrlf %v0, %v30, %v30
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v30, %v30
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vsel %v26, %v27, %v31, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
+  ret <8 x float> %sel
+}
+
+define <8 x double> @fun125(<8 x float> %val1, <8 x float> %val2, <8 x double> %val3, <8 x double> %val4) {
+; CHECK-LABEL: fun125:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v28, %v28
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v28, %v28
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v26, %v26
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vuphf %v1, %v0
+; CHECK-NEXT:    vsel %v24, %v25, %v2, %v1
+; CHECK-NEXT:    vmrlf %v1, %v30, %v30
+; CHECK-NEXT:    vmrlf %v2, %v26, %v26
+; CHECK-NEXT:    vmrlg %v0, %v0, %v0
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v30, %v30
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vl %v3, 192(%r15)
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vuphf %v2, %v1
+; CHECK-NEXT:    vsel %v28, %v29, %v3, %v2
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v27, %v2, %v0
+; CHECK-NEXT:    vmrlg %v0, %v1, %v1
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vuphf %v0, %v0
+; CHECK-NEXT:    vsel %v30, %v31, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <8 x float> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4
+  ret <8 x double> %sel
+}
+
+define <8 x i8> @fun126(<8 x double> %val1, <8 x double> %val2, <8 x i8> %val3, <8 x i8> %val4) {
+; CHECK-LABEL: fun126:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v30, %v31
+; CHECK-NEXT:    vfchdb %v1, %v28, %v29
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vfchdb %v1, %v26, %v27
+; CHECK-NEXT:    vfchdb %v2, %v24, %v25
+; CHECK-NEXT:    larl %r1, .LCPI126_0
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 0(%r1)
+; CHECK-NEXT:    vperm %v0, %v1, %v0, %v2
+; CHECK-NEXT:    vlrepg %v1, 168(%r15)
+; CHECK-NEXT:    vlrepg %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i8> %val3, <8 x i8> %val4
+  ret <8 x i8> %sel
+}
+
+define <8 x i16> @fun127(<8 x double> %val1, <8 x double> %val2, <8 x i16> %val3, <8 x i16> %val4) {
+; CHECK-LABEL: fun127:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v30, %v31
+; CHECK-NEXT:    vfchdb %v1, %v28, %v29
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vfchdb %v1, %v26, %v27
+; CHECK-NEXT:    vfchdb %v2, %v24, %v25
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i16> %val3, <8 x i16> %val4
+  ret <8 x i16> %sel
+}
+
+define <8 x i32> @fun128(<8 x double> %val1, <8 x double> %val2, <8 x i32> %val3, <8 x i32> %val4) {
+; CHECK-LABEL: fun128:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v26, %v27
+; CHECK-NEXT:    vfchdb %v1, %v24, %v25
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vfchdb %v0, %v30, %v31
+; CHECK-NEXT:    vfchdb %v1, %v28, %v29
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i32> %val3, <8 x i32> %val4
+  ret <8 x i32> %sel
+}
+
+define <8 x i64> @fun129(<8 x double> %val1, <8 x double> %val2, <8 x i64> %val3, <8 x i64> %val4) {
+; CHECK-LABEL: fun129:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v24, %v25
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v26, %v27
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vl %v2, 192(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v28, %v29
+; CHECK-NEXT:    vsel %v28, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v30, %v31
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x i64> %val3, <8 x i64> %val4
+  ret <8 x i64> %sel
+}
+
+define <8 x float> @fun130(<8 x double> %val1, <8 x double> %val2, <8 x float> %val3, <8 x float> %val4) {
+; CHECK-LABEL: fun130:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vfchdb %v0, %v26, %v27
+; CHECK-NEXT:    vfchdb %v1, %v24, %v25
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vfchdb %v0, %v30, %v31
+; CHECK-NEXT:    vfchdb %v1, %v28, %v29
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x float> %val3, <8 x float> %val4
+  ret <8 x float> %sel
+}
+
+define <8 x double> @fun131(<8 x double> %val1, <8 x double> %val2, <8 x double> %val3, <8 x double> %val4) {
+; CHECK-LABEL: fun131:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v24, %v25
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v26, %v27
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vl %v2, 192(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v28, %v29
+; CHECK-NEXT:    vsel %v28, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v30, %v31
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <8 x double> %val1, %val2
+  %sel = select <8 x i1> %cmp, <8 x double> %val3, <8 x double> %val4
+  ret <8 x double> %sel
+}
+
+define <16 x i8> @fun132(<16 x float> %val1, <16 x float> %val2, <16 x i8> %val3, <16 x i8> %val4) {
+; CHECK-LABEL: fun132:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v31, %v31
+; CHECK-NEXT:    vmrlf %v1, %v30, %v30
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v31, %v31
+; CHECK-NEXT:    vmrhf %v2, %v30, %v30
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v28, %v28
+; CHECK-NEXT:    vmrhf %v4, %v24, %v24
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v29, %v29
+; CHECK-NEXT:    vmrlf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v29, %v29
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v27, %v27
+; CHECK-NEXT:    vmrlf %v2, %v26, %v26
+; CHECK-NEXT:    vmrhf %v3, %v26, %v26
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v27, %v27
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vmrlf %v2, %v25, %v25
+; CHECK-NEXT:    vmrlf %v3, %v24, %v24
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vmrhf %v3, %v25, %v25
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v3, %v4, %v3
+; CHECK-NEXT:    vpkg %v2, %v3, %v2
+; CHECK-NEXT:    vpkf %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vpkh %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4
+  ret <16 x i8> %sel
+}
+
+define <16 x i16> @fun133(<16 x float> %val1, <16 x float> %val2, <16 x i16> %val3, <16 x i16> %val4) {
+; CHECK-LABEL: fun133:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v27, %v27
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v27, %v27
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vmrhf %v3, %v24, %v24
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v25, %v25
+; CHECK-NEXT:    vmrlf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v25, %v25
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vmrlf %v0, %v31, %v31
+; CHECK-NEXT:    vmrlf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v30, %v30
+; CHECK-NEXT:    vmrhf %v3, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v31, %v31
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vmrlf %v1, %v29, %v29
+; CHECK-NEXT:    vmrlf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vmrhf %v2, %v29, %v29
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
+  ret <16 x i16> %sel
+}
+
+define <16 x i32> @fun134(<16 x float> %val1, <16 x float> %val2, <16 x i32> %val3, <16 x i32> %val4) {
+; CHECK-LABEL: fun134:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v25, %v25
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v25, %v25
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vmrlf %v0, %v27, %v27
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v27, %v27
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    vmrlf %v0, %v29, %v29
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v29, %v29
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 192(%r15)
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vsel %v28, %v2, %v1, %v0
+; CHECK-NEXT:    vmrlf %v0, %v31, %v31
+; CHECK-NEXT:    vmrlf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v30, %v30
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v31, %v31
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4
+  ret <16 x i32> %sel
+}
+
+define <16 x i64> @fun135(<16 x float> %val1, <16 x float> %val2, <16 x i64> %val3, <16 x i64> %val4) {
+; CHECK-LABEL: fun135:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v25, %v25
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v25, %v25
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vl %v4, 192(%r15)
+; CHECK-NEXT:    vl %v6, 224(%r15)
+; CHECK-NEXT:    vl %v7, 256(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vpkg %v1, %v1, %v0
+; CHECK-NEXT:    vuphf %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT:    vmrlf %v0, %v27, %v27
+; CHECK-NEXT:    vmrlf %v2, %v26, %v26
+; CHECK-NEXT:    vmrhf %v3, %v26, %v26
+; CHECK-NEXT:    vmrhf %v5, %v28, %v28
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v0, %v2, %v0
+; CHECK-NEXT:    vmrhf %v2, %v27, %v27
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vl %v3, 320(%r15)
+; CHECK-NEXT:    vpkg %v2, %v2, %v0
+; CHECK-NEXT:    vuphf %v0, %v2
+; CHECK-NEXT:    vsel %v0, %v4, %v3, %v0
+; CHECK-NEXT:    vmrlf %v3, %v29, %v29
+; CHECK-NEXT:    vmrlf %v4, %v28, %v28
+; CHECK-NEXT:    vlr %v28, %v0
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v3, %v4, %v3
+; CHECK-NEXT:    vmrhf %v4, %v29, %v29
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vldeb %v5, %v5
+; CHECK-NEXT:    vfchdb %v4, %v5, %v4
+; CHECK-NEXT:    vl %v5, 352(%r15)
+; CHECK-NEXT:    vpkg %v3, %v4, %v3
+; CHECK-NEXT:    vuphf %v4, %v3
+; CHECK-NEXT:    vsel %v25, %v6, %v5, %v4
+; CHECK-NEXT:    vmrlf %v4, %v31, %v31
+; CHECK-NEXT:    vmrlf %v5, %v30, %v30
+; CHECK-NEXT:    vmrhf %v6, %v30, %v30
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vldeb %v5, %v5
+; CHECK-NEXT:    vfchdb %v4, %v5, %v4
+; CHECK-NEXT:    vmrhf %v5, %v31, %v31
+; CHECK-NEXT:    vldeb %v5, %v5
+; CHECK-NEXT:    vldeb %v6, %v6
+; CHECK-NEXT:    vfchdb %v5, %v6, %v5
+; CHECK-NEXT:    vl %v6, 384(%r15)
+; CHECK-NEXT:    vpkg %v4, %v5, %v4
+; CHECK-NEXT:    vuphf %v5, %v4
+; CHECK-NEXT:    vsel %v29, %v7, %v6, %v5
+; CHECK-NEXT:    vl %v5, 304(%r15)
+; CHECK-NEXT:    vl %v6, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v6, %v5, %v1
+; CHECK-NEXT:    vl %v5, 208(%r15)
+; CHECK-NEXT:    vmrlg %v1, %v2, %v2
+; CHECK-NEXT:    vl %v2, 336(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v30, %v5, %v2, %v1
+; CHECK-NEXT:    vl %v2, 368(%r15)
+; CHECK-NEXT:    vmrlg %v1, %v3, %v3
+; CHECK-NEXT:    vl %v3, 240(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v27, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v2, 400(%r15)
+; CHECK-NEXT:    vl %v3, 272(%r15)
+; CHECK-NEXT:    vmrlg %v1, %v4, %v4
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v31, %v3, %v2, %v1
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4
+  ret <16 x i64> %sel
+}
+
+define <16 x float> @fun136(<16 x float> %val1, <16 x float> %val2, <16 x float> %val3, <16 x float> %val4) {
+; CHECK-LABEL: fun136:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v25, %v25
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v25, %v25
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vmrlf %v0, %v27, %v27
+; CHECK-NEXT:    vmrlf %v1, %v26, %v26
+; CHECK-NEXT:    vmrhf %v2, %v26, %v26
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v27, %v27
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    vmrlf %v0, %v29, %v29
+; CHECK-NEXT:    vmrlf %v1, %v28, %v28
+; CHECK-NEXT:    vmrhf %v2, %v28, %v28
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v29, %v29
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 192(%r15)
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vsel %v28, %v2, %v1, %v0
+; CHECK-NEXT:    vmrlf %v0, %v31, %v31
+; CHECK-NEXT:    vmrlf %v1, %v30, %v30
+; CHECK-NEXT:    vmrhf %v2, %v30, %v30
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v31, %v31
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 208(%r15)
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 272(%r15)
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4
+  ret <16 x float> %sel
+}
+
+define <16 x double> @fun137(<16 x float> %val1, <16 x float> %val2, <16 x double> %val3, <16 x double> %val4) {
+; CHECK-LABEL: fun137:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmrlf %v0, %v25, %v25
+; CHECK-NEXT:    vmrlf %v1, %v24, %v24
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vfchdb %v0, %v1, %v0
+; CHECK-NEXT:    vmrhf %v1, %v25, %v25
+; CHECK-NEXT:    vmrhf %v2, %v24, %v24
+; CHECK-NEXT:    vldeb %v1, %v1
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vl %v4, 192(%r15)
+; CHECK-NEXT:    vl %v6, 224(%r15)
+; CHECK-NEXT:    vl %v7, 256(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vpkg %v1, %v1, %v0
+; CHECK-NEXT:    vuphf %v0, %v1
+; CHECK-NEXT:    vsel %v24, %v3, %v2, %v0
+; CHECK-NEXT:    vmrlf %v0, %v27, %v27
+; CHECK-NEXT:    vmrlf %v2, %v26, %v26
+; CHECK-NEXT:    vmrhf %v3, %v26, %v26
+; CHECK-NEXT:    vmrhf %v5, %v28, %v28
+; CHECK-NEXT:    vmrlg %v1, %v1, %v1
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vldeb %v0, %v0
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vfchdb %v0, %v2, %v0
+; CHECK-NEXT:    vmrhf %v2, %v27, %v27
+; CHECK-NEXT:    vldeb %v2, %v2
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vfchdb %v2, %v3, %v2
+; CHECK-NEXT:    vl %v3, 320(%r15)
+; CHECK-NEXT:    vpkg %v2, %v2, %v0
+; CHECK-NEXT:    vuphf %v0, %v2
+; CHECK-NEXT:    vsel %v0, %v4, %v3, %v0
+; CHECK-NEXT:    vmrlf %v3, %v29, %v29
+; CHECK-NEXT:    vmrlf %v4, %v28, %v28
+; CHECK-NEXT:    vlr %v28, %v0
+; CHECK-NEXT:    vldeb %v3, %v3
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vfchdb %v3, %v4, %v3
+; CHECK-NEXT:    vmrhf %v4, %v29, %v29
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vldeb %v5, %v5
+; CHECK-NEXT:    vfchdb %v4, %v5, %v4
+; CHECK-NEXT:    vl %v5, 352(%r15)
+; CHECK-NEXT:    vpkg %v3, %v4, %v3
+; CHECK-NEXT:    vuphf %v4, %v3
+; CHECK-NEXT:    vsel %v25, %v6, %v5, %v4
+; CHECK-NEXT:    vmrlf %v4, %v31, %v31
+; CHECK-NEXT:    vmrlf %v5, %v30, %v30
+; CHECK-NEXT:    vmrhf %v6, %v30, %v30
+; CHECK-NEXT:    vldeb %v4, %v4
+; CHECK-NEXT:    vldeb %v5, %v5
+; CHECK-NEXT:    vfchdb %v4, %v5, %v4
+; CHECK-NEXT:    vmrhf %v5, %v31, %v31
+; CHECK-NEXT:    vldeb %v5, %v5
+; CHECK-NEXT:    vldeb %v6, %v6
+; CHECK-NEXT:    vfchdb %v5, %v6, %v5
+; CHECK-NEXT:    vl %v6, 384(%r15)
+; CHECK-NEXT:    vpkg %v4, %v5, %v4
+; CHECK-NEXT:    vuphf %v5, %v4
+; CHECK-NEXT:    vsel %v29, %v7, %v6, %v5
+; CHECK-NEXT:    vl %v5, 304(%r15)
+; CHECK-NEXT:    vl %v6, 176(%r15)
+; CHECK-NEXT:    vsel %v26, %v6, %v5, %v1
+; CHECK-NEXT:    vl %v5, 208(%r15)
+; CHECK-NEXT:    vmrlg %v1, %v2, %v2
+; CHECK-NEXT:    vl %v2, 336(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v30, %v5, %v2, %v1
+; CHECK-NEXT:    vl %v2, 368(%r15)
+; CHECK-NEXT:    vmrlg %v1, %v3, %v3
+; CHECK-NEXT:    vl %v3, 240(%r15)
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v27, %v3, %v2, %v1
+; CHECK-NEXT:    vl %v2, 400(%r15)
+; CHECK-NEXT:    vl %v3, 272(%r15)
+; CHECK-NEXT:    vmrlg %v1, %v4, %v4
+; CHECK-NEXT:    vuphf %v1, %v1
+; CHECK-NEXT:    vsel %v31, %v3, %v2, %v1
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <16 x float> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4
+  ret <16 x double> %sel
+}
+
+define <16 x i8> @fun138(<16 x double> %val1, <16 x double> %val2, <16 x i8> %val3, <16 x i8> %val4) {
+; CHECK-LABEL: fun138:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 272(%r15)
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v31, %v0
+; CHECK-NEXT:    vfchdb %v1, %v29, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v27, %v1
+; CHECK-NEXT:    vfchdb %v2, %v25, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 208(%r15)
+; CHECK-NEXT:    vl %v2, 192(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v30, %v1
+; CHECK-NEXT:    vfchdb %v2, %v28, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vl %v2, 176(%r15)
+; CHECK-NEXT:    vl %v3, 160(%r15)
+; CHECK-NEXT:    vfchdb %v2, %v26, %v2
+; CHECK-NEXT:    vfchdb %v3, %v24, %v3
+; CHECK-NEXT:    vpkg %v2, %v3, %v2
+; CHECK-NEXT:    vpkf %v1, %v2, %v1
+; CHECK-NEXT:    vpkh %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 304(%r15)
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i8> %val3, <16 x i8> %val4
+  ret <16 x i8> %sel
+}
+
+define <16 x i16> @fun139(<16 x double> %val1, <16 x double> %val2, <16 x i16> %val3, <16 x i16> %val4) {
+; CHECK-LABEL: fun139:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 208(%r15)
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v30, %v0
+; CHECK-NEXT:    vfchdb %v1, %v28, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 176(%r15)
+; CHECK-NEXT:    vl %v2, 160(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v26, %v1
+; CHECK-NEXT:    vfchdb %v2, %v24, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 320(%r15)
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 272(%r15)
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v31, %v0
+; CHECK-NEXT:    vfchdb %v1, %v29, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 240(%r15)
+; CHECK-NEXT:    vl %v2, 224(%r15)
+; CHECK-NEXT:    vfchdb %v1, %v27, %v1
+; CHECK-NEXT:    vfchdb %v2, %v25, %v2
+; CHECK-NEXT:    vpkg %v1, %v2, %v1
+; CHECK-NEXT:    vpkf %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 336(%r15)
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i16> %val3, <16 x i16> %val4
+  ret <16 x i16> %sel
+}
+
+define <16 x i32> @fun140(<16 x double> %val1, <16 x double> %val2, <16 x i32> %val3, <16 x i32> %val4) {
+; CHECK-LABEL: fun140:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vl %v1, 160(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v26, %v0
+; CHECK-NEXT:    vfchdb %v1, %v24, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 352(%r15)
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 208(%r15)
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v30, %v0
+; CHECK-NEXT:    vfchdb %v1, %v28, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 368(%r15)
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 240(%r15)
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v27, %v0
+; CHECK-NEXT:    vfchdb %v1, %v25, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 384(%r15)
+; CHECK-NEXT:    vl %v2, 320(%r15)
+; CHECK-NEXT:    vsel %v28, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 272(%r15)
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v31, %v0
+; CHECK-NEXT:    vfchdb %v1, %v29, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 400(%r15)
+; CHECK-NEXT:    vl %v2, 336(%r15)
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i32> %val3, <16 x i32> %val4
+  ret <16 x i32> %sel
+}
+
+define <16 x i64> @fun141(<16 x double> %val1, <16 x double> %val2, <16 x i64> %val3, <16 x i64> %val4) {
+; CHECK-LABEL: fun141:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 160(%r15)
+; CHECK-NEXT:    vl %v1, 416(%r15)
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v24, %v0
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vl %v1, 432(%r15)
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v26, %v0
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 192(%r15)
+; CHECK-NEXT:    vl %v1, 448(%r15)
+; CHECK-NEXT:    vl %v2, 320(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v28, %v0
+; CHECK-NEXT:    vsel %v28, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 208(%r15)
+; CHECK-NEXT:    vl %v1, 464(%r15)
+; CHECK-NEXT:    vl %v2, 336(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v30, %v0
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 224(%r15)
+; CHECK-NEXT:    vl %v1, 480(%r15)
+; CHECK-NEXT:    vl %v2, 352(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v25, %v0
+; CHECK-NEXT:    vsel %v25, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 240(%r15)
+; CHECK-NEXT:    vl %v1, 496(%r15)
+; CHECK-NEXT:    vl %v2, 368(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v27, %v0
+; CHECK-NEXT:    vsel %v27, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 256(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v29, %v0
+; CHECK-NEXT:    vl %v1, 512(%r15)
+; CHECK-NEXT:    vl %v2, 384(%r15)
+; CHECK-NEXT:    vsel %v29, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 272(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v31, %v0
+; CHECK-NEXT:    vl %v1, 528(%r15)
+; CHECK-NEXT:    vl %v2, 400(%r15)
+; CHECK-NEXT:    vsel %v31, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x i64> %val3, <16 x i64> %val4
+  ret <16 x i64> %sel
+}
+
+define <16 x float> @fun142(<16 x double> %val1, <16 x double> %val2, <16 x float> %val3, <16 x float> %val4) {
+; CHECK-LABEL: fun142:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vl %v1, 160(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v26, %v0
+; CHECK-NEXT:    vfchdb %v1, %v24, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 352(%r15)
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 208(%r15)
+; CHECK-NEXT:    vl %v1, 192(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v30, %v0
+; CHECK-NEXT:    vfchdb %v1, %v28, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 368(%r15)
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 240(%r15)
+; CHECK-NEXT:    vl %v1, 224(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v27, %v0
+; CHECK-NEXT:    vfchdb %v1, %v25, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 384(%r15)
+; CHECK-NEXT:    vl %v2, 320(%r15)
+; CHECK-NEXT:    vsel %v28, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 272(%r15)
+; CHECK-NEXT:    vl %v1, 256(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v31, %v0
+; CHECK-NEXT:    vfchdb %v1, %v29, %v1
+; CHECK-NEXT:    vpkg %v0, %v1, %v0
+; CHECK-NEXT:    vl %v1, 400(%r15)
+; CHECK-NEXT:    vl %v2, 336(%r15)
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x float> %val3, <16 x float> %val4
+  ret <16 x float> %sel
+}
+
+define <16 x double> @fun143(<16 x double> %val1, <16 x double> %val2, <16 x double> %val3, <16 x double> %val4) {
+; CHECK-LABEL: fun143:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vl %v0, 160(%r15)
+; CHECK-NEXT:    vl %v1, 416(%r15)
+; CHECK-NEXT:    vl %v2, 288(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v24, %v0
+; CHECK-NEXT:    vsel %v24, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 176(%r15)
+; CHECK-NEXT:    vl %v1, 432(%r15)
+; CHECK-NEXT:    vl %v2, 304(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v26, %v0
+; CHECK-NEXT:    vsel %v26, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 192(%r15)
+; CHECK-NEXT:    vl %v1, 448(%r15)
+; CHECK-NEXT:    vl %v2, 320(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v28, %v0
+; CHECK-NEXT:    vsel %v28, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 208(%r15)
+; CHECK-NEXT:    vl %v1, 464(%r15)
+; CHECK-NEXT:    vl %v2, 336(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v30, %v0
+; CHECK-NEXT:    vsel %v30, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 224(%r15)
+; CHECK-NEXT:    vl %v1, 480(%r15)
+; CHECK-NEXT:    vl %v2, 352(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v25, %v0
+; CHECK-NEXT:    vsel %v25, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 240(%r15)
+; CHECK-NEXT:    vl %v1, 496(%r15)
+; CHECK-NEXT:    vl %v2, 368(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v27, %v0
+; CHECK-NEXT:    vsel %v27, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 256(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v29, %v0
+; CHECK-NEXT:    vl %v1, 512(%r15)
+; CHECK-NEXT:    vl %v2, 384(%r15)
+; CHECK-NEXT:    vsel %v29, %v2, %v1, %v0
+; CHECK-NEXT:    vl %v0, 272(%r15)
+; CHECK-NEXT:    vfchdb %v0, %v31, %v0
+; CHECK-NEXT:    vl %v1, 528(%r15)
+; CHECK-NEXT:    vl %v2, 400(%r15)
+; CHECK-NEXT:    vsel %v31, %v2, %v1, %v0
+; CHECK-NEXT:    br %r14
+  %cmp = fcmp ogt <16 x double> %val1, %val2
+  %sel = select <16 x i1> %cmp, <16 x double> %val3, <16 x double> %val4
+  ret <16 x double> %sel
+}
+
diff --git a/test/CodeGen/SystemZ/vec-trunc-to-i1.ll b/test/CodeGen/SystemZ/vec-trunc-to-i1.ll
new file mode 100644
index 0000000000000000000000000000000000000000..705fe3dbac90d91d7b75bbcd9f828626d2ac6113
--- /dev/null
+++ b/test/CodeGen/SystemZ/vec-trunc-to-i1.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+;
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s
+;
+; Check that a widening truncate to a vector of i1 elements can be handled.
+
+
+define void @pr32275(<4 x i8> %B15) {
+; CHECK-LABEL: pr32275:
+; CHECK:       # BB#0: # %BB
+; CHECK-NEXT:    vrepif %v0, 1
+; CHECK-NEXT:  .LBB0_1: # %CF34
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vlgvb %r0, %v24, 3
+; CHECK-NEXT:    vlgvb %r1, %v24, 1
+; CHECK-NEXT:    vlvgp %v1, %r1, %r0
+; CHECK-NEXT:    vlgvb %r0, %v24, 0
+; CHECK-NEXT:    vlvgf %v1, %r0, 0
+; CHECK-NEXT:    vlgvb %r0, %v24, 2
+; CHECK-NEXT:    vlvgf %v1, %r0, 2
+; CHECK-NEXT:    vn %v1, %v1, %v0
+; CHECK-NEXT:    vlgvf %r0, %v1, 3
+; CHECK-NEXT:    tmll %r0, 1
+; CHECK-NEXT:    jne .LBB0_1
+; CHECK-NEXT:  # BB#2: # %CF36
+; CHECK-NEXT:    br %r14
+BB:
+  br label %CF34
+
+CF34:
+  %Tr24 = trunc <4 x i8> %B15 to <4 x i1>
+  %E28 = extractelement <4 x i1> %Tr24, i32 3
+  br i1 %E28, label %CF34, label %CF36
+
+CF36:
+  ret void
+}
diff --git a/test/CodeGen/SystemZ/vectorizer-output-3xi32.ll b/test/CodeGen/SystemZ/vectorizer-output-3xi32.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3e7ba6095926f61e15c4e64e9539e3340f8724c4
--- /dev/null
+++ b/test/CodeGen/SystemZ/vectorizer-output-3xi32.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13
+;
+; This testcase originates from the BB-vectorizer output.
+
+define void @fun() {
+  %1 = zext <3 x i1> zeroinitializer to <3 x i32>
+  %2 = extractelement <3 x i32> %1, i32 2
+  store i32 %2, i32* undef, align 8
+  unreachable
+}
diff --git a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
index 2f8e36b66b87181bc6f6d40e57cad0e953f87118..08349a31dfa2668f837dc5c3eb3202295b277140 100644
--- a/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
+++ b/test/CodeGen/Thumb/2010-07-15-debugOrdering.ll
@@ -9,9 +9,9 @@
 
 define void @_Z19getClosestDiagonal3ii(%0* noalias sret, i32, i32) nounwind {
 ; CHECK: bl ___muldf3
-; CHECK: bl ___muldf3
 ; CHECK: beq LBB0
 ; CHECK: bl ___muldf3
+; CHECK: bl ___muldf3
 ; <label>:3
   switch i32 %1, label %4 [
     i32 0, label %5
diff --git a/test/CodeGen/Thumb/PR17309.ll b/test/CodeGen/Thumb/PR17309.ll
index f1033e7d741890e578f15eb845c55e28503871c5..8869537425b77551f811aad0874480e316a1b7d1 100644
--- a/test/CodeGen/Thumb/PR17309.ll
+++ b/test/CodeGen/Thumb/PR17309.ll
@@ -11,9 +11,9 @@ define void @pass_C() #0 {
 entry:
   %c = alloca %struct.C, align 1
   %0 = getelementptr inbounds %struct.C, %struct.C* %c, i32 0, i32 0, i32 0
-  call void @llvm.lifetime.start(i64 1000, i8* %0) #1
+  call void @llvm.lifetime.start.p0i8(i64 1000, i8* %0) #1
   call void @use_C(%struct.C* byval %c) #3
-  call void @llvm.lifetime.end(i64 1000, i8* %0) #1
+  call void @llvm.lifetime.end.p0i8(i64 1000, i8* %0) #1
   ret void
 }
 
@@ -24,9 +24,9 @@ define void @pass_S() #0 {
 entry:
   %s = alloca %struct.S, align 2
   %0 = bitcast %struct.S* %s to i8*
-  call void @llvm.lifetime.start(i64 2000, i8* %0) #1
+  call void @llvm.lifetime.start.p0i8(i64 2000, i8* %0) #1
   call void @use_S(%struct.S* byval %s) #3
-  call void @llvm.lifetime.end(i64 2000, i8* %0) #1
+  call void @llvm.lifetime.end.p0i8(i64 2000, i8* %0) #1
   ret void
 }
 
@@ -37,9 +37,9 @@ define void @pass_I() #0 {
 entry:
   %i = alloca %struct.I, align 4
   %0 = bitcast %struct.I* %i to i8*
-  call void @llvm.lifetime.start(i64 4000, i8* %0) #1
+  call void @llvm.lifetime.start.p0i8(i64 4000, i8* %0) #1
   call void @use_I(%struct.I* byval %i) #3
-  call void @llvm.lifetime.end(i64 4000, i8* %0) #1
+  call void @llvm.lifetime.end.p0i8(i64 4000, i8* %0) #1
   ret void
 }
 
@@ -47,8 +47,8 @@ declare void @use_C(%struct.C* byval) #2
 declare void @use_S(%struct.S* byval) #2
 declare void @use_I(%struct.I* byval) #2
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 
 attributes #0 = { nounwind optsize "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Thumb/cmp-add-fold.ll b/test/CodeGen/Thumb/cmp-add-fold.ll
index b0ad8ab93f8a89e134e150403b7b14e5993e514b..aa61b0825b0ccb61041d9a9781a594ad032af1a1 100644
--- a/test/CodeGen/Thumb/cmp-add-fold.ll
+++ b/test/CodeGen/Thumb/cmp-add-fold.ll
@@ -2,8 +2,9 @@
 ; RUN: llc -mtriple=thumbv7m-eabi -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK --check-prefix=T2 %s
 
 ; CHECK-LABEL: addri1:
-; CHECK: adds r0, #3
+; T1: adds r0, r0, #3
 ; T1-NEXT: b{{eq|ne}}
+; T2: adds r0, #3
 ; T2-NOT: cmp
 define i32 @addri1(i32 %a, i32 %b) {
   %c = add i32 %a, 3
diff --git a/test/CodeGen/Thumb/copy_thumb.ll b/test/CodeGen/Thumb/copy_thumb.ll
index 528f54bd84e6f283d4cf2f73416c088d5b4a612d..008c31aba7aa27510bb9925540ec89cbf1dc9864 100644
--- a/test/CodeGen/Thumb/copy_thumb.ll
+++ b/test/CodeGen/Thumb/copy_thumb.ll
@@ -16,15 +16,9 @@
 ; RUN: llc -mtriple=thumbv4t-none--eabi < %s | FileCheck %s --check-prefix=CHECK-NOLOLOMOV
 ; RUN: llc -mtriple=thumbv5-none--eabi < %s | FileCheck %s --check-prefix=CHECK-NOLOLOMOV
 ; CHECK-NOLOLOMOV-LABEL: foo
-; CHECK-NOLOLOMOV-NOT:   mov [[TMP:r[0-7]]], [[SRC1:r[01]]]
-; CHECK-NOLOLOMOV:       push  {[[SRC1:r[01]]]}
-; CHECK-NOLOLOMOV-NEXT:  pop {[[TMP:r[0-7]]]}
-; CHECK-NOLOLOMOV-NOT:   mov [[TMP:r[0-7]]], [[SRC1:r[01]]]
-; CHECK-NOLOLOMOV:       push  {[[SRC2:r[01]]]}
-; CHECK-NOLOLOMOV-NEXT:  pop {[[SRC1]]}
-; CHECK-NOLOLOMOV-NOT:   mov [[TMP:r[0-7]]], [[SRC1:r[01]]]
-; CHECK-NOLOLOMOV:       push  {[[TMP]]}
-; CHECK-NOLOLOMOV-NEXT:  pop {[[SRC2]]}
+; CHECK-NOLOLOMOV:       movs [[TMP:r[0-7]]], [[SRC1:r[01]]]
+; CHECK-NOLOLOMOV-NEXT:  movs [[SRC1]], [[SRC2:r[01]]]
+; CHECK-NOLOLOMOV-NEXT:  movs [[SRC2]], [[TMP]]
 ; CHECK-NOLOLOMOV-LABEL: bar
 ; CHECK-NOLOLOMOV-LABEL: fnend
 
diff --git a/test/CodeGen/Thumb/ispositive.ll b/test/CodeGen/Thumb/ispositive.ll
index 8d396878932bbf6b9318f0e2a38798fc428539e6..a9b2c139797eb94f30566b378c85eeba66a238ed 100644
--- a/test/CodeGen/Thumb/ispositive.ll
+++ b/test/CodeGen/Thumb/ispositive.ll
@@ -9,3 +9,12 @@ entry:
         ret i32 %1
 }
 
+define i32 @test2(i32 %X) {
+entry:
+; CHECK-LABEL: test2:
+; CHECK: lsls r1, r1, #31
+; CHECK-NEXT: adds
+        %tmp1 = sub i32 %X, 2147483648
+        ret i32 %tmp1
+}
+
diff --git a/test/CodeGen/Thumb/long.ll b/test/CodeGen/Thumb/long.ll
index 33f63892ec3fe417b77eafd76b797a6a8d6f02a0..c549bd425aafe4261fecd3d01da18d2de347e6da 100644
--- a/test/CodeGen/Thumb/long.ll
+++ b/test/CodeGen/Thumb/long.ll
@@ -1,41 +1,88 @@
-; RUN: llc -mtriple=thumb-eabi %s -o - | FileCheck %s
-; RUN: llc -mtriple=thumb-apple-darwin %s -o - | FileCheck %s -check-prefix CHECK-DARWIN
+; RUN: llc -mtriple=thumb-eabi %s -verify-machineinstrs -o - | FileCheck %s
+; RUN: llc -mtriple=thumb-apple-darwin %s -verify-machineinstrs -o - | \
+; RUN:    FileCheck %s -check-prefix CHECK -check-prefix CHECK-DARWIN
 
 define i64 @f1() {
 entry:
         ret i64 0
+; CHECK-LABEL: f1:
+; CHECK: movs r0, #0
+; CHECK: movs r1, r0
 }
 
 define i64 @f2() {
 entry:
         ret i64 1
+; CHECK-LABEL: f2:
+; CHECK: movs r0, #1
+; CHECK: movs r1, #0
 }
 
 define i64 @f3() {
 entry:
         ret i64 2147483647
+; CHECK-LABEL: f3:
+; CHECK: ldr r0,
+; CHECK: movs r1, #0
 }
 
 define i64 @f4() {
 entry:
         ret i64 2147483648
+; CHECK-LABEL: f4:
+; CHECK: movs r0, #1
+; CHECK: lsls r0, r0, #31
+; CHECK: movs r1, #0
 }
 
 define i64 @f5() {
 entry:
         ret i64 9223372036854775807
+; CHECK-LABEL: f5:
+; CHECK: movs r0, #0
+; CHECK: mvns r0, r0
+; CHECK: ldr r1,
 }
 
 define i64 @f6(i64 %x, i64 %y) {
 entry:
         %tmp1 = add i64 %y, 1           ; <i64> [#uses=1]
         ret i64 %tmp1
+; CHECK-LABEL: f6:
+; CHECK: movs r1, #0
+; CHECK: adds r0, r2, #1
+; CHECK: adcs r1, r3
+}
+
+define i64 @f6a(i64 %x, i64 %y) {
+entry:
+        %tmp1 = add i64 %y, 10
+        ret i64 %tmp1
+; CHECK-LABEL: f6a:
+; CHECK: movs r1, #0
+; CHECK: adds r2, #10
+; CHECK: adcs r1, r3
+; CHECK: movs r0, r2
+}
+
+define i64 @f6b(i64 %x, i64 %y) {
+entry:
+        %tmp1 = add i64 %y, 1000
+        ret i64 %tmp1
+; CHECK-LABEL: f6b:
+; CHECK: movs r0, #125
+; CHECK: lsls r0, r0, #3
+; CHECK: movs r1, #0
+; CHECK: adds r0, r2, r0
+; CHECK: adcs r1, r3
 }
 
 define void @f7() {
 entry:
         %tmp = call i64 @f8( )          ; <i64> [#uses=0]
         ret void
+; CHECK-LABEL: f7:
+; CHECK: bl
 }
 
 declare i64 @f8()
@@ -44,6 +91,57 @@ define i64 @f9(i64 %a, i64 %b) {
 entry:
         %tmp = sub i64 %a, %b           ; <i64> [#uses=1]
         ret i64 %tmp
+; CHECK-LABEL: f9:
+; CHECK: subs r0, r0, r2
+; CHECK: sbcs r1, r3
+}
+
+define i64 @f9a(i64 %x, i64 %y) { ; ADDC with small negative imm => SUBS imm
+entry:
+        %tmp1 = sub i64 %y, 10
+        ret i64 %tmp1
+; CHECK-LABEL: f9a:
+; CHECK: movs r0, #0
+; CHECK: subs r2, #10
+; CHECK: sbcs r3, r0
+; CHECK: movs r0, r2
+; CHECK: movs r1, r3
+}
+
+define i64 @f9b(i64 %x, i64 %y) { ; ADDC with big negative imm => SUBS reg
+entry:
+        %tmp1 = sub i64 1000, %y
+        ret i64 %tmp1
+; CHECK-LABEL: f9b:
+; CHECK: movs r0, #125
+; CHECK: lsls r0, r0, #3
+; CHECK: movs r1, #0
+; CHECK: subs r0, r0, r2
+; CHECK: sbcs r1, r3
+}
+
+define i64 @f9c(i64 %x, i32 %y) { ; SUBS with small positive imm => SUBS imm
+entry:
+        %conv = sext i32 %y to i64
+        %shl = shl i64 %conv, 32
+        %or = or i64 %shl, 1
+        %sub = sub nsw i64 %x, %or
+        ret i64 %sub
+; CHECK-LABEL: f9c:
+; CHECK: subs r0, r0, #1
+; CHECK: sbcs r1, r2
+}
+
+define i64 @f9d(i64 %x, i32 %y) { ; SUBS with small negative imm => ADDS imm
+entry:
+        %conv = sext i32 %y to i64
+        %shl = shl i64 %conv, 32
+        %or = or i64 %shl, 4294967295
+        %sub = sub nsw i64 %x, %or
+        ret i64 %sub
+; CHECK-LABEL: f9d:
+; CHECK: adds r0, r0, #1
+; CHECK: sbcs r1, r2
 }
 
 define i64 @f(i32 %a, i32 %b) {
@@ -52,6 +150,9 @@ entry:
         %tmp1 = sext i32 %b to i64              ; <i64> [#uses=1]
         %tmp2 = mul i64 %tmp1, %tmp             ; <i64> [#uses=1]
         ret i64 %tmp2
+; CHECK-LABEL: f:
+; CHECK-V6: bl __aeabi_lmul
+; CHECK-DARWIN: __muldi3
 }
 
 define i64 @g(i32 %a, i32 %b) {
@@ -60,6 +161,9 @@ entry:
         %tmp1 = zext i32 %b to i64              ; <i64> [#uses=1]
         %tmp2 = mul i64 %tmp1, %tmp             ; <i64> [#uses=1]
         ret i64 %tmp2
+; CHECK-LABEL: g:
+; CHECK-V6: bl __aeabi_lmul
+; CHECK-DARWIN: __muldi3
 }
 
 define i64 @f10() {
@@ -67,16 +171,38 @@ entry:
         %a = alloca i64, align 8                ; <i64*> [#uses=1]
         %retval = load i64, i64* %a          ; <i64> [#uses=1]
         ret i64 %retval
+; CHECK-LABEL: f10:
+; CHECK: sub sp, #8
+; CHECK: ldr r0, [sp]
+; CHECK: ldr r1, [sp, #4]
+; CHECK: add sp, #8
 }
 
-; CHECK: mvn
-; CHECK-NOT: mvn
-
-; CHECK: adc
-; CHECK-NOT: adc
-
-; CHECK: sbc
-; CHECK-NOT: sbc
-
-; CHECK-DARWIN: __muldi3
+define i64 @f11(i64 %x, i64 %y) {
+entry:
+        %tmp1 = add i64 -1000, %y
+        %tmp2 = add i64 %tmp1, -1000
+        ret i64 %tmp2
+; CHECK-LABEL: f11:
+; CHECK: movs r0, #125
+; CHECK: lsls r0, r0, #3
+; CHECK: movs r1, #0
+; CHECK: subs r2, r2, r0
+; CHECK: sbcs r3, r1
+; CHECK: subs r0, r2, r0
+; CHECK: sbcs r3, r1
+; CHECK: movs r1, r3
+}
 
+; "sub 2147483648" has to be lowered into "add -2147483648"
+define i64 @f12(i64 %x, i64 %y) {
+entry:
+        %tmp1 = sub i64 %x, 2147483648
+        ret i64 %tmp1
+; CHECK-LABEL: f12:
+; CHECK: movs r2, #1
+; CHECK: lsls r2, r2, #31
+; CHECK: movs r3, #0
+; CHECK: adds r0, r0, r2
+; CHECK: sbcs r1, r3
+}
diff --git a/test/CodeGen/Thumb/mature-mc-support.ll b/test/CodeGen/Thumb/mature-mc-support.ll
index d7f8ae6c6c4d808bbacc72129c685e5a47bba8f7..6a638d4050696bdb64332173512b39d0efe5863b 100644
--- a/test/CodeGen/Thumb/mature-mc-support.ll
+++ b/test/CodeGen/Thumb/mature-mc-support.ll
@@ -9,4 +9,4 @@
 
 module asm "	.this_directive_is_very_unlikely_to_exist"
 
-; CHECK: LLVM ERROR: Error parsing inline asm
+; CHECK: error: unknown directive
diff --git a/test/CodeGen/Thumb/remove-unneeded-push-pop.ll b/test/CodeGen/Thumb/remove-unneeded-push-pop.ll
new file mode 100644
index 0000000000000000000000000000000000000000..054be2ea8587214388d485ebad33efc39eab7642
--- /dev/null
+++ b/test/CodeGen/Thumb/remove-unneeded-push-pop.ll
@@ -0,0 +1,1052 @@
+; RUN: llc -O0 -mtriple thumbv6m-arm-none-eabi < %s | FileCheck %s
+
+@a = external hidden global i32*, align 4
+@f = external hidden global i32, align 4
+
+define hidden void @foo() {
+entry:
+; CHECK-NOT: push	{lr}
+; CHECK-NOT: pop	{pc}
+  store i32 24654, i32* @f, align 4
+  br label %if.end
+
+if.end:                                           ; preds = %entry
+  %0 = load i32*, i32** @a, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32* %0, i32 2
+  %1 = load i32, i32* %arrayidx1, align 4
+  %tobool2 = icmp ne i32 %1, 0
+  br i1 %tobool2, label %if.then3, label %if.end4
+
+if.then3:                                         ; preds = %if.end
+  store i32 17785, i32* @f, align 4
+  br label %if.end4
+
+if.end4:                                          ; preds = %if.then3, %if.end
+  %2 = load i32*, i32** @a, align 4
+  %arrayidx5 = getelementptr inbounds i32, i32* %2, i32 3
+  %3 = load i32, i32* %arrayidx5, align 4
+  %tobool6 = icmp ne i32 %3, 0
+  br i1 %tobool6, label %if.then7, label %if.end8
+
+if.then7:                                         ; preds = %if.end4
+  store i32 10342, i32* @f, align 4
+  br label %if.end8
+
+if.end8:                                          ; preds = %if.then7, %if.end4
+  %4 = load i32*, i32** @a, align 4
+  %arrayidx9 = getelementptr inbounds i32, i32* %4, i32 4
+  %5 = load i32, i32* %arrayidx9, align 4
+  %tobool10 = icmp ne i32 %5, 0
+  br i1 %tobool10, label %if.then11, label %if.end12
+
+if.then11:                                        ; preds = %if.end8
+  store i32 29082, i32* @f, align 4
+  br label %if.end12
+
+if.end12:                                         ; preds = %if.then11, %if.end8
+  %6 = load i32*, i32** @a, align 4
+  %arrayidx13 = getelementptr inbounds i32, i32* %6, i32 5
+  %7 = load i32, i32* %arrayidx13, align 4
+  %tobool14 = icmp ne i32 %7, 0
+  br i1 %tobool14, label %if.then15, label %if.end16
+
+if.then15:                                        ; preds = %if.end12
+  store i32 29893, i32* @f, align 4
+  br label %if.end16
+
+if.end16:                                         ; preds = %if.then15, %if.end12
+  %8 = load i32*, i32** @a, align 4
+  %arrayidx17 = getelementptr inbounds i32, i32* %8, i32 6
+  %9 = load i32, i32* %arrayidx17, align 4
+  %tobool18 = icmp ne i32 %9, 0
+  br i1 %tobool18, label %if.then19, label %if.end20
+
+if.then19:                                        ; preds = %if.end16
+  store i32 19071, i32* @f, align 4
+  br label %if.end20
+
+if.end20:                                         ; preds = %if.then19, %if.end16
+  %10 = load i32*, i32** @a, align 4
+  %arrayidx21 = getelementptr inbounds i32, i32* %10, i32 7
+  %11 = load i32, i32* %arrayidx21, align 4
+  %tobool22 = icmp ne i32 %11, 0
+  br i1 %tobool22, label %if.then23, label %if.end24
+
+if.then23:                                        ; preds = %if.end20
+  store i32 6154, i32* @f, align 4
+  br label %if.end24
+
+if.end24:                                         ; preds = %if.then23, %if.end20
+  %12 = load i32*, i32** @a, align 4
+  %arrayidx25 = getelementptr inbounds i32, i32* %12, i32 8
+  %13 = load i32, i32* %arrayidx25, align 4
+  %tobool26 = icmp ne i32 %13, 0
+  br i1 %tobool26, label %if.then27, label %if.end28
+
+if.then27:                                        ; preds = %if.end24
+  store i32 30498, i32* @f, align 4
+  br label %if.end28
+
+if.end28:                                         ; preds = %if.then27, %if.end24
+  %14 = load i32*, i32** @a, align 4
+  %arrayidx29 = getelementptr inbounds i32, i32* %14, i32 9
+  %15 = load i32, i32* %arrayidx29, align 4
+  %tobool30 = icmp ne i32 %15, 0
+  br i1 %tobool30, label %if.then31, label %if.end32
+
+if.then31:                                        ; preds = %if.end28
+  store i32 16667, i32* @f, align 4
+  br label %if.end32
+
+if.end32:                                         ; preds = %if.then31, %if.end28
+  %16 = load i32*, i32** @a, align 4
+  %arrayidx33 = getelementptr inbounds i32, i32* %16, i32 10
+  %17 = load i32, i32* %arrayidx33, align 4
+  %tobool34 = icmp ne i32 %17, 0
+  br i1 %tobool34, label %if.then35, label %if.end36
+
+if.then35:                                        ; preds = %if.end32
+  store i32 195, i32* @f, align 4
+  br label %if.end36
+
+if.end36:                                         ; preds = %if.then35, %if.end32
+  %18 = load i32*, i32** @a, align 4
+  %arrayidx37 = getelementptr inbounds i32, i32* %18, i32 11
+  %19 = load i32, i32* %arrayidx37, align 4
+  %tobool38 = icmp ne i32 %19, 0
+  br i1 %tobool38, label %if.then39, label %if.end40
+
+if.then39:                                        ; preds = %if.end36
+  store i32 14665, i32* @f, align 4
+  br label %if.end40
+
+if.end40:                                         ; preds = %if.then39, %if.end36
+  %20 = load i32*, i32** @a, align 4
+  %arrayidx41 = getelementptr inbounds i32, i32* %20, i32 12
+  %21 = load i32, i32* %arrayidx41, align 4
+  %tobool42 = icmp ne i32 %21, 0
+  br i1 %tobool42, label %if.then43, label %if.end44
+
+if.then43:                                        ; preds = %if.end40
+  store i32 19305, i32* @f, align 4
+  br label %if.end44
+
+if.end44:                                         ; preds = %if.then43, %if.end40
+  %22 = load i32*, i32** @a, align 4
+  %arrayidx45 = getelementptr inbounds i32, i32* %22, i32 13
+  %23 = load i32, i32* %arrayidx45, align 4
+  %tobool46 = icmp ne i32 %23, 0
+  br i1 %tobool46, label %if.then47, label %if.end48
+
+if.then47:                                        ; preds = %if.end44
+  store i32 15133, i32* @f, align 4
+  br label %if.end48
+
+if.end48:                                         ; preds = %if.then47, %if.end44
+  %24 = load i32*, i32** @a, align 4
+  %arrayidx49 = getelementptr inbounds i32, i32* %24, i32 14
+  %25 = load i32, i32* %arrayidx49, align 4
+  %tobool50 = icmp ne i32 %25, 0
+  br i1 %tobool50, label %if.then51, label %if.end52
+
+if.then51:                                        ; preds = %if.end48
+  store i32 19173, i32* @f, align 4
+  br label %if.end52
+
+if.end52:                                         ; preds = %if.then51, %if.end48
+  br label %if.then55
+
+if.then55:                                        ; preds = %if.end52
+  store i32 14025, i32* @f, align 4
+  br label %if.end56
+
+if.end56:                                         ; preds = %if.then55
+  %26 = load i32*, i32** @a, align 4
+  %arrayidx57 = getelementptr inbounds i32, i32* %26, i32 16
+  %27 = load i32, i32* %arrayidx57, align 4
+  %tobool58 = icmp ne i32 %27, 0
+  br i1 %tobool58, label %if.then59, label %if.end60
+
+if.then59:                                        ; preds = %if.end56
+  store i32 8209, i32* @f, align 4
+  br label %if.end60
+
+if.end60:                                         ; preds = %if.then59, %if.end56
+  %28 = load i32*, i32** @a, align 4
+  %arrayidx61 = getelementptr inbounds i32, i32* %28, i32 17
+  %29 = load i32, i32* %arrayidx61, align 4
+  %tobool62 = icmp ne i32 %29, 0
+  br i1 %tobool62, label %if.then63, label %if.end64
+
+if.then63:                                        ; preds = %if.end60
+  store i32 29621, i32* @f, align 4
+  br label %if.end64
+
+if.end64:                                         ; preds = %if.then63, %if.end60
+  %30 = load i32*, i32** @a, align 4
+  %arrayidx65 = getelementptr inbounds i32, i32* %30, i32 18
+  %31 = load i32, i32* %arrayidx65, align 4
+  %tobool66 = icmp ne i32 %31, 0
+  br i1 %tobool66, label %if.then67, label %if.end68
+
+if.then67:                                        ; preds = %if.end64
+  store i32 14963, i32* @f, align 4
+  br label %if.end68
+
+if.end68:                                         ; preds = %if.then67, %if.end64
+  %32 = load i32*, i32** @a, align 4
+  %arrayidx69 = getelementptr inbounds i32, i32* %32, i32 19
+  %33 = load i32, i32* %arrayidx69, align 4
+  %tobool70 = icmp ne i32 %33, 0
+  br i1 %tobool70, label %if.then71, label %if.end72
+
+if.then71:                                        ; preds = %if.end68
+  store i32 32282, i32* @f, align 4
+  br label %if.end72
+
+if.end72:                                         ; preds = %if.then71, %if.end68
+  %34 = load i32*, i32** @a, align 4
+  %arrayidx73 = getelementptr inbounds i32, i32* %34, i32 20
+  %35 = load i32, i32* %arrayidx73, align 4
+  %tobool74 = icmp ne i32 %35, 0
+  br i1 %tobool74, label %if.then75, label %if.end76
+
+if.then75:                                        ; preds = %if.end72
+  store i32 3072, i32* @f, align 4
+  br label %if.end76
+
+if.end76:                                         ; preds = %if.then75, %if.end72
+  %36 = load i32*, i32** @a, align 4
+  %arrayidx77 = getelementptr inbounds i32, i32* %36, i32 21
+  %37 = load i32, i32* %arrayidx77, align 4
+  %tobool78 = icmp ne i32 %37, 0
+  br i1 %tobool78, label %if.then79, label %if.end80
+
+if.then79:                                        ; preds = %if.end76
+  store i32 1992, i32* @f, align 4
+  br label %if.end80
+
+if.end80:                                         ; preds = %if.then79, %if.end76
+  %38 = load i32*, i32** @a, align 4
+  %arrayidx81 = getelementptr inbounds i32, i32* %38, i32 22
+  %39 = load i32, i32* %arrayidx81, align 4
+  %tobool82 = icmp ne i32 %39, 0
+  br i1 %tobool82, label %if.then83, label %if.end84
+
+if.then83:                                        ; preds = %if.end80
+  store i32 9614, i32* @f, align 4
+  br label %if.end84
+
+if.end84:                                         ; preds = %if.then83, %if.end80
+  %40 = load i32*, i32** @a, align 4
+  %arrayidx85 = getelementptr inbounds i32, i32* %40, i32 23
+  %41 = load i32, i32* %arrayidx85, align 4
+  %tobool86 = icmp ne i32 %41, 0
+  br i1 %tobool86, label %if.then87, label %if.end88
+
+if.then87:                                        ; preds = %if.end84
+  store i32 25931, i32* @f, align 4
+  br label %if.end88
+
+if.end88:                                         ; preds = %if.then87, %if.end84
+  %42 = load i32*, i32** @a, align 4
+  %arrayidx89 = getelementptr inbounds i32, i32* %42, i32 24
+  %43 = load i32, i32* %arrayidx89, align 4
+  %tobool90 = icmp ne i32 %43, 0
+  br i1 %tobool90, label %if.then91, label %if.end92
+
+if.then91:                                        ; preds = %if.end88
+  store i32 22035, i32* @f, align 4
+  br label %if.end92
+
+if.end92:                                         ; preds = %if.then91, %if.end88
+  %44 = load i32*, i32** @a, align 4
+  %arrayidx93 = getelementptr inbounds i32, i32* %44, i32 25
+  %45 = load i32, i32* %arrayidx93, align 4
+  %tobool94 = icmp ne i32 %45, 0
+  br i1 %tobool94, label %if.then95, label %if.end96
+
+if.then95:                                        ; preds = %if.end92
+  store i32 10712, i32* @f, align 4
+  br label %if.end96
+
+if.end96:                                         ; preds = %if.then95, %if.end92
+  %46 = load i32*, i32** @a, align 4
+  %arrayidx97 = getelementptr inbounds i32, i32* %46, i32 26
+  %47 = load i32, i32* %arrayidx97, align 4
+  %tobool98 = icmp ne i32 %47, 0
+  br i1 %tobool98, label %if.then99, label %if.end100
+
+if.then99:                                        ; preds = %if.end96
+  store i32 18267, i32* @f, align 4
+  br label %if.end100
+
+if.end100:                                        ; preds = %if.then99, %if.end96
+  %48 = load i32*, i32** @a, align 4
+  %arrayidx101 = getelementptr inbounds i32, i32* %48, i32 27
+  %49 = load i32, i32* %arrayidx101, align 4
+  %tobool102 = icmp ne i32 %49, 0
+  br i1 %tobool102, label %if.then103, label %if.end104
+
+if.then103:                                       ; preds = %if.end100
+  store i32 30432, i32* @f, align 4
+  br label %if.end104
+
+if.end104:                                        ; preds = %if.then103, %if.end100
+  %50 = load i32*, i32** @a, align 4
+  %arrayidx105 = getelementptr inbounds i32, i32* %50, i32 28
+  %51 = load i32, i32* %arrayidx105, align 4
+  %tobool106 = icmp ne i32 %51, 0
+  br i1 %tobool106, label %if.then107, label %if.end108
+
+if.then107:                                       ; preds = %if.end104
+  store i32 5847, i32* @f, align 4
+  br label %if.end108
+
+if.end108:                                        ; preds = %if.then107, %if.end104
+  %52 = load i32*, i32** @a, align 4
+  %arrayidx109 = getelementptr inbounds i32, i32* %52, i32 29
+  %53 = load i32, i32* %arrayidx109, align 4
+  %tobool110 = icmp ne i32 %53, 0
+  br i1 %tobool110, label %if.then111, label %if.end112
+
+if.then111:                                       ; preds = %if.end108
+  store i32 14705, i32* @f, align 4
+  br label %if.end112
+
+if.end112:                                        ; preds = %if.then111, %if.end108
+  %54 = load i32*, i32** @a, align 4
+  %arrayidx113 = getelementptr inbounds i32, i32* %54, i32 30
+  %55 = load i32, i32* %arrayidx113, align 4
+  %tobool114 = icmp ne i32 %55, 0
+  br i1 %tobool114, label %if.then115, label %if.end116
+
+if.then115:                                       ; preds = %if.end112
+  store i32 28488, i32* @f, align 4
+  br label %if.end116
+
+if.end116:                                        ; preds = %if.then115, %if.end112
+  %56 = load i32*, i32** @a, align 4
+  %arrayidx117 = getelementptr inbounds i32, i32* %56, i32 31
+  %57 = load i32, i32* %arrayidx117, align 4
+  %tobool118 = icmp ne i32 %57, 0
+  br i1 %tobool118, label %if.then119, label %if.end120
+
+if.then119:                                       ; preds = %if.end116
+  store i32 13853, i32* @f, align 4
+  br label %if.end120
+
+if.end120:                                        ; preds = %if.then119, %if.end116
+  %58 = load i32*, i32** @a, align 4
+  %arrayidx121 = getelementptr inbounds i32, i32* %58, i32 32
+  %59 = load i32, i32* %arrayidx121, align 4
+  %tobool122 = icmp ne i32 %59, 0
+  br i1 %tobool122, label %if.then123, label %if.end124
+
+if.then123:                                       ; preds = %if.end120
+  store i32 31379, i32* @f, align 4
+  br label %if.end124
+
+if.end124:                                        ; preds = %if.then123, %if.end120
+  %60 = load i32*, i32** @a, align 4
+  %arrayidx125 = getelementptr inbounds i32, i32* %60, i32 33
+  %61 = load i32, i32* %arrayidx125, align 4
+  %tobool126 = icmp ne i32 %61, 0
+  br i1 %tobool126, label %if.then127, label %if.end128
+
+if.then127:                                       ; preds = %if.end124
+  store i32 7010, i32* @f, align 4
+  br label %if.end128
+
+if.end128:                                        ; preds = %if.then127, %if.end124
+  br label %if.then131
+
+if.then131:                                       ; preds = %if.end128
+  store i32 31840, i32* @f, align 4
+  br label %if.end132
+
+if.end132:                                        ; preds = %if.then131
+  %62 = load i32*, i32** @a, align 4
+  %arrayidx133 = getelementptr inbounds i32, i32* %62, i32 35
+  %63 = load i32, i32* %arrayidx133, align 4
+  %tobool134 = icmp ne i32 %63, 0
+  br i1 %tobool134, label %if.then135, label %if.end136
+
+if.then135:                                       ; preds = %if.end132
+  store i32 16119, i32* @f, align 4
+  br label %if.end136
+
+if.end136:                                        ; preds = %if.then135, %if.end132
+  %64 = load i32*, i32** @a, align 4
+  %arrayidx137 = getelementptr inbounds i32, i32* %64, i32 36
+  %65 = load i32, i32* %arrayidx137, align 4
+  %tobool138 = icmp ne i32 %65, 0
+  br i1 %tobool138, label %if.then139, label %if.end140
+
+if.then139:                                       ; preds = %if.end136
+  store i32 7119, i32* @f, align 4
+  br label %if.end140
+
+if.end140:                                        ; preds = %if.then139, %if.end136
+  %66 = load i32*, i32** @a, align 4
+  %arrayidx141 = getelementptr inbounds i32, i32* %66, i32 37
+  %67 = load i32, i32* %arrayidx141, align 4
+  %tobool142 = icmp ne i32 %67, 0
+  br i1 %tobool142, label %if.then143, label %if.end144
+
+if.then143:                                       ; preds = %if.end140
+  store i32 3333, i32* @f, align 4
+  br label %if.end144
+
+if.end144:                                        ; preds = %if.then143, %if.end140
+  %68 = load i32*, i32** @a, align 4
+  %arrayidx145 = getelementptr inbounds i32, i32* %68, i32 38
+  %69 = load i32, i32* %arrayidx145, align 4
+  %tobool146 = icmp ne i32 %69, 0
+  br i1 %tobool146, label %if.then147, label %if.end148
+
+if.then147:                                       ; preds = %if.end144
+  store i32 6430, i32* @f, align 4
+  br label %if.end148
+
+if.end148:                                        ; preds = %if.then147, %if.end144
+  %70 = load i32*, i32** @a, align 4
+  %arrayidx149 = getelementptr inbounds i32, i32* %70, i32 39
+  %71 = load i32, i32* %arrayidx149, align 4
+  %tobool150 = icmp ne i32 %71, 0
+  br i1 %tobool150, label %if.then151, label %if.end152
+
+if.then151:                                       ; preds = %if.end148
+  store i32 19857, i32* @f, align 4
+  br label %if.end152
+
+if.end152:                                        ; preds = %if.then151, %if.end148
+  %72 = load i32*, i32** @a, align 4
+  %arrayidx153 = getelementptr inbounds i32, i32* %72, i32 40
+  %73 = load i32, i32* %arrayidx153, align 4
+  %tobool154 = icmp ne i32 %73, 0
+  br i1 %tobool154, label %if.then155, label %if.end156
+
+if.then155:                                       ; preds = %if.end152
+  store i32 13237, i32* @f, align 4
+  br label %if.end156
+
+if.end156:                                        ; preds = %if.then155, %if.end152
+  br label %if.then159
+
+if.then159:                                       ; preds = %if.end156
+  store i32 163, i32* @f, align 4
+  br label %if.end160
+
+if.end160:                                        ; preds = %if.then159
+  %74 = load i32*, i32** @a, align 4
+  %arrayidx161 = getelementptr inbounds i32, i32* %74, i32 42
+  %75 = load i32, i32* %arrayidx161, align 4
+  %tobool162 = icmp ne i32 %75, 0
+  br i1 %tobool162, label %if.then163, label %if.end164
+
+if.then163:                                       ; preds = %if.end160
+  store i32 1961, i32* @f, align 4
+  br label %if.end164
+
+if.end164:                                        ; preds = %if.then163, %if.end160
+  %76 = load i32*, i32** @a, align 4
+  %arrayidx165 = getelementptr inbounds i32, i32* %76, i32 43
+  %77 = load i32, i32* %arrayidx165, align 4
+  %tobool166 = icmp ne i32 %77, 0
+  br i1 %tobool166, label %if.then167, label %if.end168
+
+if.then167:                                       ; preds = %if.end164
+  store i32 11325, i32* @f, align 4
+  br label %if.end168
+
+if.end168:                                        ; preds = %if.then167, %if.end164
+  %78 = load i32*, i32** @a, align 4
+  %arrayidx169 = getelementptr inbounds i32, i32* %78, i32 44
+  %79 = load i32, i32* %arrayidx169, align 4
+  %tobool170 = icmp ne i32 %79, 0
+  br i1 %tobool170, label %if.then171, label %if.end172
+
+if.then171:                                       ; preds = %if.end168
+  store i32 12189, i32* @f, align 4
+  br label %if.end172
+
+if.end172:                                        ; preds = %if.then171, %if.end168
+  %80 = load i32*, i32** @a, align 4
+  %arrayidx173 = getelementptr inbounds i32, i32* %80, i32 45
+  %81 = load i32, i32* %arrayidx173, align 4
+  %tobool174 = icmp ne i32 %81, 0
+  br i1 %tobool174, label %if.then175, label %if.end176
+
+if.then175:                                       ; preds = %if.end172
+  store i32 15172, i32* @f, align 4
+  br label %if.end176
+
+if.end176:                                        ; preds = %if.then175, %if.end172
+  br label %if.then179
+
+if.then179:                                       ; preds = %if.end176
+  store i32 13491, i32* @f, align 4
+  br label %if.end180
+
+if.end180:                                        ; preds = %if.then179
+  %82 = load i32*, i32** @a, align 4
+  %arrayidx181 = getelementptr inbounds i32, i32* %82, i32 47
+  %83 = load i32, i32* %arrayidx181, align 4
+  %tobool182 = icmp ne i32 %83, 0
+  br i1 %tobool182, label %if.then183, label %if.end184
+
+if.then183:                                       ; preds = %if.end180
+  store i32 9521, i32* @f, align 4
+  br label %if.end184
+
+if.end184:                                        ; preds = %if.then183, %if.end180
+  %84 = load i32*, i32** @a, align 4
+  %arrayidx185 = getelementptr inbounds i32, i32* %84, i32 48
+  %85 = load i32, i32* %arrayidx185, align 4
+  %tobool186 = icmp ne i32 %85, 0
+  br i1 %tobool186, label %if.then187, label %if.end188
+
+if.then187:                                       ; preds = %if.end184
+  store i32 448, i32* @f, align 4
+  br label %if.end188
+
+if.end188:                                        ; preds = %if.then187, %if.end184
+  %86 = load i32*, i32** @a, align 4
+  %arrayidx189 = getelementptr inbounds i32, i32* %86, i32 49
+  %87 = load i32, i32* %arrayidx189, align 4
+  %tobool190 = icmp ne i32 %87, 0
+  br i1 %tobool190, label %if.then191, label %if.end192
+
+if.then191:                                       ; preds = %if.end188
+  store i32 13468, i32* @f, align 4
+  br label %if.end192
+
+if.end192:                                        ; preds = %if.then191, %if.end188
+  %88 = load i32*, i32** @a, align 4
+  %arrayidx193 = getelementptr inbounds i32, i32* %88, i32 50
+  %89 = load i32, i32* %arrayidx193, align 4
+  %tobool194 = icmp ne i32 %89, 0
+  br i1 %tobool194, label %if.then195, label %if.end196
+
+if.then195:                                       ; preds = %if.end192
+  store i32 16190, i32* @f, align 4
+  br label %if.end196
+
+if.end196:                                        ; preds = %if.then195, %if.end192
+  %90 = load i32*, i32** @a, align 4
+  %arrayidx197 = getelementptr inbounds i32, i32* %90, i32 51
+  %91 = load i32, i32* %arrayidx197, align 4
+  %tobool198 = icmp ne i32 %91, 0
+  br i1 %tobool198, label %if.then199, label %if.end200
+
+if.then199:                                       ; preds = %if.end196
+  store i32 8602, i32* @f, align 4
+  br label %if.end200
+
+if.end200:                                        ; preds = %if.then199, %if.end196
+  %92 = load i32*, i32** @a, align 4
+  %arrayidx201 = getelementptr inbounds i32, i32* %92, i32 52
+  %93 = load i32, i32* %arrayidx201, align 4
+  %tobool202 = icmp ne i32 %93, 0
+  br i1 %tobool202, label %if.then203, label %if.end204
+
+if.then203:                                       ; preds = %if.end200
+  store i32 21083, i32* @f, align 4
+  br label %if.end204
+
+if.end204:                                        ; preds = %if.then203, %if.end200
+  %94 = load i32*, i32** @a, align 4
+  %arrayidx205 = getelementptr inbounds i32, i32* %94, i32 53
+  %95 = load i32, i32* %arrayidx205, align 4
+  %tobool206 = icmp ne i32 %95, 0
+  br i1 %tobool206, label %if.then207, label %if.end208
+
+if.then207:                                       ; preds = %if.end204
+  store i32 5172, i32* @f, align 4
+  br label %if.end208
+
+if.end208:                                        ; preds = %if.then207, %if.end204
+  %96 = load i32*, i32** @a, align 4
+  %arrayidx209 = getelementptr inbounds i32, i32* %96, i32 54
+  %97 = load i32, i32* %arrayidx209, align 4
+  %tobool210 = icmp ne i32 %97, 0
+  br i1 %tobool210, label %if.then211, label %if.end212
+
+if.then211:                                       ; preds = %if.end208
+  store i32 32505, i32* @f, align 4
+  br label %if.end212
+
+if.end212:                                        ; preds = %if.then211, %if.end208
+  br label %if.then215
+
+if.then215:                                       ; preds = %if.end212
+  store i32 23490, i32* @f, align 4
+  br label %if.end216
+
+if.end216:                                        ; preds = %if.then215
+  %98 = load i32*, i32** @a, align 4
+  %arrayidx217 = getelementptr inbounds i32, i32* %98, i32 56
+  %99 = load i32, i32* %arrayidx217, align 4
+  %tobool218 = icmp ne i32 %99, 0
+  br i1 %tobool218, label %if.then219, label %if.end220
+
+if.then219:                                       ; preds = %if.end216
+  store i32 30699, i32* @f, align 4
+  br label %if.end220
+
+if.end220:                                        ; preds = %if.then219, %if.end216
+  %100 = load i32*, i32** @a, align 4
+  %arrayidx221 = getelementptr inbounds i32, i32* %100, i32 57
+  %101 = load i32, i32* %arrayidx221, align 4
+  %tobool222 = icmp ne i32 %101, 0
+  br i1 %tobool222, label %if.then223, label %if.end224
+
+if.then223:                                       ; preds = %if.end220
+  store i32 16286, i32* @f, align 4
+  br label %if.end224
+
+if.end224:                                        ; preds = %if.then223, %if.end220
+  %102 = load i32*, i32** @a, align 4
+  %arrayidx225 = getelementptr inbounds i32, i32* %102, i32 58
+  %103 = load i32, i32* %arrayidx225, align 4
+  %tobool226 = icmp ne i32 %103, 0
+  br i1 %tobool226, label %if.then227, label %if.end228
+
+if.then227:                                       ; preds = %if.end224
+  store i32 17939, i32* @f, align 4
+  br label %if.end228
+
+if.end228:                                        ; preds = %if.then227, %if.end224
+  %104 = load i32*, i32** @a, align 4
+  %arrayidx229 = getelementptr inbounds i32, i32* %104, i32 59
+  %105 = load i32, i32* %arrayidx229, align 4
+  %tobool230 = icmp ne i32 %105, 0
+  br i1 %tobool230, label %if.then231, label %if.end232
+
+if.then231:                                       ; preds = %if.end228
+  store i32 25148, i32* @f, align 4
+  br label %if.end232
+
+if.end232:                                        ; preds = %if.then231, %if.end228
+  %106 = load i32*, i32** @a, align 4
+  %arrayidx233 = getelementptr inbounds i32, i32* %106, i32 60
+  %107 = load i32, i32* %arrayidx233, align 4
+  %tobool234 = icmp ne i32 %107, 0
+  br i1 %tobool234, label %if.then235, label %if.end236
+
+if.then235:                                       ; preds = %if.end232
+  store i32 644, i32* @f, align 4
+  br label %if.end236
+
+if.end236:                                        ; preds = %if.then235, %if.end232
+  br label %if.then239
+
+if.then239:                                       ; preds = %if.end236
+  store i32 23457, i32* @f, align 4
+  br label %if.end240
+
+if.end240:                                        ; preds = %if.then239
+  %108 = load i32*, i32** @a, align 4
+  %arrayidx241 = getelementptr inbounds i32, i32* %108, i32 62
+  %109 = load i32, i32* %arrayidx241, align 4
+  %tobool242 = icmp ne i32 %109, 0
+  br i1 %tobool242, label %if.then243, label %if.end244
+
+if.then243:                                       ; preds = %if.end240
+  store i32 21116, i32* @f, align 4
+  br label %if.end244
+
+if.end244:                                        ; preds = %if.then243, %if.end240
+  br label %if.then247
+
+if.then247:                                       ; preds = %if.end244
+  store i32 10066, i32* @f, align 4
+  br label %if.end248
+
+if.end248:                                        ; preds = %if.then247
+  %110 = load i32*, i32** @a, align 4
+  %arrayidx249 = getelementptr inbounds i32, i32* %110, i32 64
+  %111 = load i32, i32* %arrayidx249, align 4
+  %tobool250 = icmp ne i32 %111, 0
+  br i1 %tobool250, label %if.then251, label %if.end252
+
+if.then251:                                       ; preds = %if.end248
+  store i32 9058, i32* @f, align 4
+  br label %if.end252
+
+if.end252:                                        ; preds = %if.then251, %if.end248
+  %112 = load i32*, i32** @a, align 4
+  %arrayidx253 = getelementptr inbounds i32, i32* %112, i32 65
+  %113 = load i32, i32* %arrayidx253, align 4
+  %tobool254 = icmp ne i32 %113, 0
+  br i1 %tobool254, label %if.then255, label %if.end256
+
+if.then255:                                       ; preds = %if.end252
+  store i32 8383, i32* @f, align 4
+  br label %if.end256
+
+if.end256:                                        ; preds = %if.then255, %if.end252
+  %114 = load i32*, i32** @a, align 4
+  %arrayidx257 = getelementptr inbounds i32, i32* %114, i32 66
+  %115 = load i32, i32* %arrayidx257, align 4
+  %tobool258 = icmp ne i32 %115, 0
+  br i1 %tobool258, label %if.then259, label %if.end260
+
+if.then259:                                       ; preds = %if.end256
+  store i32 31069, i32* @f, align 4
+  br label %if.end260
+
+if.end260:                                        ; preds = %if.then259, %if.end256
+  %116 = load i32*, i32** @a, align 4
+  %arrayidx261 = getelementptr inbounds i32, i32* %116, i32 67
+  %117 = load i32, i32* %arrayidx261, align 4
+  %tobool262 = icmp ne i32 %117, 0
+  br i1 %tobool262, label %if.then263, label %if.end264
+
+if.then263:                                       ; preds = %if.end260
+  store i32 32280, i32* @f, align 4
+  br label %if.end264
+
+if.end264:                                        ; preds = %if.then263, %if.end260
+  br label %if.then267
+
+if.then267:                                       ; preds = %if.end264
+  store i32 1553, i32* @f, align 4
+  br label %if.end268
+
+if.end268:                                        ; preds = %if.then267
+  %118 = load i32*, i32** @a, align 4
+  %arrayidx269 = getelementptr inbounds i32, i32* %118, i32 69
+  %119 = load i32, i32* %arrayidx269, align 4
+  %tobool270 = icmp ne i32 %119, 0
+  br i1 %tobool270, label %if.then271, label %if.end272
+
+if.then271:                                       ; preds = %if.end268
+  store i32 8118, i32* @f, align 4
+  br label %if.end272
+
+if.end272:                                        ; preds = %if.then271, %if.end268
+  %120 = load i32*, i32** @a, align 4
+  %arrayidx273 = getelementptr inbounds i32, i32* %120, i32 70
+  %121 = load i32, i32* %arrayidx273, align 4
+  %tobool274 = icmp ne i32 %121, 0
+  br i1 %tobool274, label %if.then275, label %if.end276
+
+if.then275:                                       ; preds = %if.end272
+  store i32 12959, i32* @f, align 4
+  br label %if.end276
+
+if.end276:                                        ; preds = %if.then275, %if.end272
+  %122 = load i32*, i32** @a, align 4
+  %arrayidx277 = getelementptr inbounds i32, i32* %122, i32 71
+  %123 = load i32, i32* %arrayidx277, align 4
+  %tobool278 = icmp ne i32 %123, 0
+  br i1 %tobool278, label %if.then279, label %if.end280
+
+if.then279:                                       ; preds = %if.end276
+  store i32 675, i32* @f, align 4
+  br label %if.end280
+
+if.end280:                                        ; preds = %if.then279, %if.end276
+  %124 = load i32*, i32** @a, align 4
+  %arrayidx281 = getelementptr inbounds i32, i32* %124, i32 72
+  %125 = load i32, i32* %arrayidx281, align 4
+  %tobool282 = icmp ne i32 %125, 0
+  br i1 %tobool282, label %if.then283, label %if.end284
+
+if.then283:                                       ; preds = %if.end280
+  store i32 29144, i32* @f, align 4
+  br label %if.end284
+
+if.end284:                                        ; preds = %if.then283, %if.end280
+  %126 = load i32*, i32** @a, align 4
+  %arrayidx285 = getelementptr inbounds i32, i32* %126, i32 73
+  %127 = load i32, i32* %arrayidx285, align 4
+  %tobool286 = icmp ne i32 %127, 0
+  br i1 %tobool286, label %if.then287, label %if.end288
+
+if.then287:                                       ; preds = %if.end284
+  store i32 26130, i32* @f, align 4
+  br label %if.end288
+
+if.end288:                                        ; preds = %if.then287, %if.end284
+  %128 = load i32*, i32** @a, align 4
+  %arrayidx289 = getelementptr inbounds i32, i32* %128, i32 74
+  %129 = load i32, i32* %arrayidx289, align 4
+  %tobool290 = icmp ne i32 %129, 0
+  br i1 %tobool290, label %if.then291, label %if.end292
+
+if.then291:                                       ; preds = %if.end288
+  store i32 31934, i32* @f, align 4
+  br label %if.end292
+
+if.end292:                                        ; preds = %if.then291, %if.end288
+  %130 = load i32*, i32** @a, align 4
+  %arrayidx293 = getelementptr inbounds i32, i32* %130, i32 75
+  %131 = load i32, i32* %arrayidx293, align 4
+  %tobool294 = icmp ne i32 %131, 0
+  br i1 %tobool294, label %if.then295, label %if.end296
+
+if.then295:                                       ; preds = %if.end292
+  store i32 25862, i32* @f, align 4
+  br label %if.end296
+
+if.end296:                                        ; preds = %if.then295, %if.end292
+  %132 = load i32*, i32** @a, align 4
+  %arrayidx297 = getelementptr inbounds i32, i32* %132, i32 76
+  %133 = load i32, i32* %arrayidx297, align 4
+  %tobool298 = icmp ne i32 %133, 0
+  br i1 %tobool298, label %if.then299, label %if.end300
+
+if.then299:                                       ; preds = %if.end296
+  store i32 10642, i32* @f, align 4
+  br label %if.end300
+
+if.end300:                                        ; preds = %if.then299, %if.end296
+  %134 = load i32*, i32** @a, align 4
+  %arrayidx301 = getelementptr inbounds i32, i32* %134, i32 77
+  %135 = load i32, i32* %arrayidx301, align 4
+  %tobool302 = icmp ne i32 %135, 0
+  br i1 %tobool302, label %if.then303, label %if.end304
+
+if.then303:                                       ; preds = %if.end300
+  store i32 20209, i32* @f, align 4
+  br label %if.end304
+
+if.end304:                                        ; preds = %if.then303, %if.end300
+  %136 = load i32*, i32** @a, align 4
+  %arrayidx305 = getelementptr inbounds i32, i32* %136, i32 78
+  %137 = load i32, i32* %arrayidx305, align 4
+  %tobool306 = icmp ne i32 %137, 0
+  br i1 %tobool306, label %if.then307, label %if.end308
+
+if.then307:                                       ; preds = %if.end304
+  store i32 30889, i32* @f, align 4
+  br label %if.end308
+
+if.end308:                                        ; preds = %if.then307, %if.end304
+  %138 = load i32*, i32** @a, align 4
+  %arrayidx309 = getelementptr inbounds i32, i32* %138, i32 79
+  %139 = load i32, i32* %arrayidx309, align 4
+  %tobool310 = icmp ne i32 %139, 0
+  br i1 %tobool310, label %if.then311, label %if.end312
+
+if.then311:                                       ; preds = %if.end308
+  store i32 18688, i32* @f, align 4
+  br label %if.end312
+
+if.end312:                                        ; preds = %if.then311, %if.end308
+  %140 = load i32*, i32** @a, align 4
+  %arrayidx313 = getelementptr inbounds i32, i32* %140, i32 80
+  %141 = load i32, i32* %arrayidx313, align 4
+  %tobool314 = icmp ne i32 %141, 0
+  br i1 %tobool314, label %if.then315, label %if.end316
+
+if.then315:                                       ; preds = %if.end312
+  store i32 28726, i32* @f, align 4
+  br label %if.end316
+
+if.end316:                                        ; preds = %if.then315, %if.end312
+  %142 = load i32*, i32** @a, align 4
+  %arrayidx317 = getelementptr inbounds i32, i32* %142, i32 81
+  %143 = load i32, i32* %arrayidx317, align 4
+  %tobool318 = icmp ne i32 %143, 0
+  br i1 %tobool318, label %if.then319, label %if.end320
+
+if.then319:                                       ; preds = %if.end316
+  store i32 4266, i32* @f, align 4
+  br label %if.end320
+
+if.end320:                                        ; preds = %if.then319, %if.end316
+  %144 = load i32*, i32** @a, align 4
+  %arrayidx321 = getelementptr inbounds i32, i32* %144, i32 82
+  %145 = load i32, i32* %arrayidx321, align 4
+  %tobool322 = icmp ne i32 %145, 0
+  br i1 %tobool322, label %if.then323, label %if.end324
+
+if.then323:                                       ; preds = %if.end320
+  store i32 15461, i32* @f, align 4
+  br label %if.end324
+
+if.end324:                                        ; preds = %if.then323, %if.end320
+  %146 = load i32*, i32** @a, align 4
+  %arrayidx325 = getelementptr inbounds i32, i32* %146, i32 83
+  %147 = load i32, i32* %arrayidx325, align 4
+  %tobool326 = icmp ne i32 %147, 0
+  br i1 %tobool326, label %if.then327, label %if.end328
+
+if.then327:                                       ; preds = %if.end324
+  store i32 24716, i32* @f, align 4
+  br label %if.end328
+
+if.end328:                                        ; preds = %if.then327, %if.end324
+  br label %if.then331
+
+if.then331:                                       ; preds = %if.end328
+  store i32 18727, i32* @f, align 4
+  br label %if.end332
+
+if.end332:                                        ; preds = %if.then331
+  %148 = load i32*, i32** @a, align 4
+  %arrayidx333 = getelementptr inbounds i32, i32* %148, i32 85
+  %149 = load i32, i32* %arrayidx333, align 4
+  %tobool334 = icmp ne i32 %149, 0
+  br i1 %tobool334, label %if.then335, label %if.end336
+
+if.then335:                                       ; preds = %if.end332
+  store i32 29505, i32* @f, align 4
+  br label %if.end336
+
+if.end336:                                        ; preds = %if.then335, %if.end332
+  %150 = load i32*, i32** @a, align 4
+  %arrayidx337 = getelementptr inbounds i32, i32* %150, i32 86
+  %151 = load i32, i32* %arrayidx337, align 4
+  %tobool338 = icmp ne i32 %151, 0
+  br i1 %tobool338, label %if.then339, label %if.end340
+
+if.then339:                                       ; preds = %if.end336
+  store i32 27008, i32* @f, align 4
+  br label %if.end340
+
+if.end340:                                        ; preds = %if.then339, %if.end336
+  %152 = load i32*, i32** @a, align 4
+  %arrayidx341 = getelementptr inbounds i32, i32* %152, i32 87
+  %153 = load i32, i32* %arrayidx341, align 4
+  %tobool342 = icmp ne i32 %153, 0
+  br i1 %tobool342, label %if.then343, label %if.end344
+
+if.then343:                                       ; preds = %if.end340
+  store i32 6550, i32* @f, align 4
+  br label %if.end344
+
+if.end344:                                        ; preds = %if.then343, %if.end340
+  br label %if.then347
+
+if.then347:                                       ; preds = %if.end344
+  store i32 1117, i32* @f, align 4
+  br label %if.end348
+
+if.end348:                                        ; preds = %if.then347
+  %154 = load i32*, i32** @a, align 4
+  %arrayidx349 = getelementptr inbounds i32, i32* %154, i32 89
+  %155 = load i32, i32* %arrayidx349, align 4
+  %tobool350 = icmp ne i32 %155, 0
+  br i1 %tobool350, label %if.then351, label %if.end352
+
+if.then351:                                       ; preds = %if.end348
+  store i32 20118, i32* @f, align 4
+  br label %if.end352
+
+if.end352:                                        ; preds = %if.then351, %if.end348
+  %156 = load i32*, i32** @a, align 4
+  %arrayidx353 = getelementptr inbounds i32, i32* %156, i32 90
+  %157 = load i32, i32* %arrayidx353, align 4
+  %tobool354 = icmp ne i32 %157, 0
+  br i1 %tobool354, label %if.then355, label %if.end356
+
+if.then355:                                       ; preds = %if.end352
+  store i32 13650, i32* @f, align 4
+  br label %if.end356
+
+if.end356:                                        ; preds = %if.then355, %if.end352
+  br label %if.then359
+
+if.then359:                                       ; preds = %if.end356
+  store i32 18642, i32* @f, align 4
+  br label %if.end360
+
+if.end360:                                        ; preds = %if.then359
+  %158 = load i32*, i32** @a, align 4
+  %arrayidx361 = getelementptr inbounds i32, i32* %158, i32 92
+  %159 = load i32, i32* %arrayidx361, align 4
+  %tobool362 = icmp ne i32 %159, 0
+  br i1 %tobool362, label %if.then363, label %if.end364
+
+if.then363:                                       ; preds = %if.end360
+  store i32 30662, i32* @f, align 4
+  br label %if.end364
+
+if.end364:                                        ; preds = %if.then363, %if.end360
+  %160 = load i32*, i32** @a, align 4
+  %arrayidx365 = getelementptr inbounds i32, i32* %160, i32 93
+  %161 = load i32, i32* %arrayidx365, align 4
+  %tobool366 = icmp ne i32 %161, 0
+  br i1 %tobool366, label %if.then367, label %if.end368
+
+if.then367:                                       ; preds = %if.end364
+  store i32 8095, i32* @f, align 4
+  br label %if.end368
+
+if.end368:                                        ; preds = %if.then367, %if.end364
+  %162 = load i32*, i32** @a, align 4
+  %arrayidx369 = getelementptr inbounds i32, i32* %162, i32 94
+  %163 = load i32, i32* %arrayidx369, align 4
+  %tobool370 = icmp ne i32 %163, 0
+  br i1 %tobool370, label %if.then371, label %if.end372
+
+if.then371:                                       ; preds = %if.end368
+  store i32 8442, i32* @f, align 4
+  br label %if.end372
+
+if.end372:                                        ; preds = %if.then371, %if.end368
+  %164 = load i32*, i32** @a, align 4
+  %arrayidx373 = getelementptr inbounds i32, i32* %164, i32 95
+  %165 = load i32, i32* %arrayidx373, align 4
+  %tobool374 = icmp ne i32 %165, 0
+  br i1 %tobool374, label %if.then375, label %if.end376
+
+if.then375:                                       ; preds = %if.end372
+  store i32 8153, i32* @f, align 4
+  br label %if.end376
+
+if.end376:                                        ; preds = %if.then375, %if.end372
+  br label %if.then379
+
+if.then379:                                       ; preds = %if.end376
+  store i32 12965, i32* @f, align 4
+  br label %if.end380
+
+if.end380:                                        ; preds = %if.then379
+  %166 = load i32*, i32** @a, align 4
+  %arrayidx381 = getelementptr inbounds i32, i32* %166, i32 97
+  %167 = load i32, i32* %arrayidx381, align 4
+  %tobool382 = icmp ne i32 %167, 0
+  br i1 %tobool382, label %if.then383, label %if.end384
+
+if.then383:                                       ; preds = %if.end380
+  store i32 14277, i32* @f, align 4
+  br label %if.end384
+
+if.end384:                                        ; preds = %if.then383, %if.end380
+  br label %if.then387
+
+if.then387:                                       ; preds = %if.end384
+  store i32 1997, i32* @f, align 4
+  br label %if.end388
+
+if.end388:                                        ; preds = %if.then387
+  %168 = load i32*, i32** @a, align 4
+  %arrayidx389 = getelementptr inbounds i32, i32* %168, i32 99
+  %169 = load i32, i32* %arrayidx389, align 4
+  %tobool390 = icmp ne i32 %169, 0
+  br i1 %tobool390, label %if.then391, label %if.end392
+
+if.then391:                                       ; preds = %if.end388
+  store i32 31385, i32* @f, align 4
+  br label %if.end392
+
+if.end392:                                        ; preds = %if.then391, %if.end388
+  %170 = load i32*, i32** @a, align 4
+  %arrayidx393 = getelementptr inbounds i32, i32* %170, i32 100
+  %171 = load i32, i32* %arrayidx393, align 4
+  %tobool394 = icmp ne i32 %171, 0
+  br i1 %tobool394, label %if.then395, label %if.end396
+
+if.then395:                                       ; preds = %if.end392
+  store i32 8286, i32* @f, align 4
+  br label %if.end396
+
+if.end396:                                        ; preds = %if.then395, %if.end392
+  ret void
+}
diff --git a/test/CodeGen/Thumb/stack-access.ll b/test/CodeGen/Thumb/stack-access.ll
index fded4104207c69716ad6848ee52e3f3d4d83a2d7..44217aba62d5e66b6f84f1869ea1f97ed8053675 100644
--- a/test/CodeGen/Thumb/stack-access.ll
+++ b/test/CodeGen/Thumb/stack-access.ll
@@ -74,15 +74,17 @@ define zeroext i16 @test6() {
 }
 
 ; Accessing the bottom of a large array shouldn't require materializing a base
+; address in a register; the two stores below should be addressed off sp.
+; CHECK: movs [[REG:r[0-9]+]], #1
+; CHECK: str [[REG]], [sp, #16]
+; CHECK: str [[REG]], [sp, #4]
+
 define void @test7() {
   %arr = alloca [200 x i32], align 4
 
-  ; CHECK: movs [[REG:r[0-9]+]], #1
-  ; CHECK: str [[REG]], [sp, #4]
   %arrayidx = getelementptr inbounds [200 x i32], [200 x i32]* %arr, i32 0, i32 1
   store i32 1, i32* %arrayidx, align 4
 
-  ; CHECK: str [[REG]], [sp, #16]
   %arrayidx1 = getelementptr inbounds [200 x i32], [200 x i32]* %arr, i32 0, i32 4
   store i32 1, i32* %arrayidx1, align 4
 
@@ -96,30 +98,36 @@ define void @test8() {
   %arr1 = alloca [224 x i32], align 4
 
 ; CHECK: movs [[REG:r[0-9]+]], #1
-; CHECK: str [[REG]], [sp]
+; CHECK-DAG: str [[REG]], [sp]
   %arr1idx1 = getelementptr inbounds [224 x i32], [224 x i32]* %arr1, i32 0, i32 0
   store i32 1, i32* %arr1idx1, align 4
 
 ; Offset in range for sp-based store, but not for non-sp-based store
-; CHECK: str [[REG]], [sp, #128]
+; CHECK-DAG: str [[REG]], [sp, #128]
   %arr1idx2 = getelementptr inbounds [224 x i32], [224 x i32]* %arr1, i32 0, i32 32
   store i32 1, i32* %arr1idx2, align 4
 
-; CHECK: str [[REG]], [sp, #896]
+; CHECK-DAG: str [[REG]], [sp, #896]
   %arr2idx1 = getelementptr inbounds [224 x i32], [224 x i32]* %arr2, i32 0, i32 0
   store i32 1, i32* %arr2idx1, align 4
 
 ; %arr2 is in range, but this element of it is not
-; CHECK: str [[REG]], [{{r[0-9]+}}]
+; CHECK-DAG: ldr [[RA:r[0-9]+]], .LCPI7_2
+; CHECK-DAG: add [[RA]], sp
+; CHECK-DAG: str [[REG]], [{{r[0-9]+}}]
   %arr2idx2 = getelementptr inbounds [224 x i32], [224 x i32]* %arr2, i32 0, i32 32
   store i32 1, i32* %arr2idx2, align 4
 
 ; %arr3 is not in range
-; CHECK: str [[REG]], [{{r[0-9]+}}]
+; CHECK-DAG: ldr [[RB:r[0-9]+]], .LCPI7_3
+; CHECK-DAG: add [[RB]], sp
+; CHECK-DAG: str [[REG]], [{{r[0-9]+}}]
   %arr3idx1 = getelementptr inbounds [224 x i32], [224 x i32]* %arr3, i32 0, i32 0
   store i32 1, i32* %arr3idx1, align 4
 
-; CHECK: str [[REG]], [{{r[0-9]+}}]
+; CHECK-DAG: ldr [[RC:r[0-9]+]], .LCPI7_4
+; CHECK-DAG: add [[RC]], sp
+; CHECK-DAG: str [[REG]], [{{r[0-9]+}}]
   %arr3idx2 = getelementptr inbounds [224 x i32], [224 x i32]* %arr3, i32 0, i32 32
   store i32 1, i32* %arr3idx2, align 4
 
diff --git a/test/CodeGen/Thumb/stack-coloring-without-frame-ptr.ll b/test/CodeGen/Thumb/stack-coloring-without-frame-ptr.ll
index 97c66d9dc865e2a56f961a14f9a3fb56b6ceb2bd..6678f68c4e8952e40c4fc82a9027a871a8f1cdcf 100644
--- a/test/CodeGen/Thumb/stack-coloring-without-frame-ptr.ll
+++ b/test/CodeGen/Thumb/stack-coloring-without-frame-ptr.ll
@@ -12,18 +12,18 @@ entry:
 
   %0 = bitcast %deque* %var3 to i8*
   %1 = bitcast %iterator* %var1 to i8*
-  call void @llvm.lifetime.start(i64 16, i8* %1) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %1) nounwind
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %0, i32 16, i32 4, i1 false)
-  call void @llvm.lifetime.end(i64 16, i8* %1) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* %1) nounwind
 
   %2 = bitcast %insert_iterator* %var2 to i8*
-  call void @llvm.lifetime.start(i64 20, i8* %2) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 20, i8* %2) nounwind
 
   ret i32 0
 }
 
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
 
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
diff --git a/test/CodeGen/Thumb/stack_guard_remat.ll b/test/CodeGen/Thumb/stack_guard_remat.ll
index 41edef5a58e65d3a5056ab3b37a2824cffd8bf3c..294c6a6bd45451ee33a330fc580498459fbbc179 100644
--- a/test/CodeGen/Thumb/stack_guard_remat.ll
+++ b/test/CodeGen/Thumb/stack_guard_remat.ll
@@ -27,20 +27,20 @@
 define i32 @test_stack_guard_remat() #0 {
   %a1 = alloca [256 x i32], align 4
   %1 = bitcast [256 x i32]* %a1 to i8*
-  call void @llvm.lifetime.start(i64 1024, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 1024, i8* %1)
   %2 = getelementptr inbounds [256 x i32], [256 x i32]* %a1, i32 0, i32 0
   call void @foo3(i32* %2) #3
   call void asm sideeffect "foo2", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{sp},~{lr}"()
-  call void @llvm.lifetime.end(i64 1024, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 1024, i8* %1)
   ret i32 0
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
 
 declare void @foo3(i32*)
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 
 attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Thumb/stm-deprecated.ll b/test/CodeGen/Thumb/stm-deprecated.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ffe2c0afd921f4d601084822852a9deb6d6f97eb
--- /dev/null
+++ b/test/CodeGen/Thumb/stm-deprecated.ll
@@ -0,0 +1,19 @@
+; RUN: llc -mtriple=thumbv6m-eabi -verify-machineinstrs %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumbv5e-linux-gnueabi -verify-machineinstrs %s -o - | FileCheck %s
+
+%0 = type { %0*, %0*, i32 }
+
+@x1 = external global %0, align 4
+@x2 = external global %0, align 4
+
+; CHECK: str r0, [r1]
+; CHECK-NEXT: str r1, [r1, #4]
+; CHECK-NOT: stm
+
+define void @foo(i32 %unused, %0* %x) {                      ; Stores @x1 and %x into the first two fields of *%x; %unused is never read.
+  %first = getelementptr inbounds %0, %0* %x, i32 0, i32 0   ; address of field 0 of *%x
+  %second = getelementptr inbounds %0, %0* %x, i32 0, i32 1  ; address of field 1 of *%x
+  store %0* @x1, %0** %first                                 ; field 0 <- address of global @x1
+  store %0* %x, %0** %second                                 ; field 1 <- %x itself, so the stored value is also the base pointer
+  unreachable                                                ; the function ends here without returning
+}
diff --git a/test/CodeGen/Thumb/tbb-reuse.mir b/test/CodeGen/Thumb/tbb-reuse.mir
new file mode 100644
index 0000000000000000000000000000000000000000..15b9fa184c384fccbe6ea44e4bd478f5c02ebf67
--- /dev/null
+++ b/test/CodeGen/Thumb/tbb-reuse.mir
@@ -0,0 +1,151 @@
+# RUN: llc -run-pass arm-cp-islands %s -o - | FileCheck %s
+
+--- |
+  ; ModuleID = '<stdin>'
+  source_filename = "<stdin>"
+  target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "thumbv6m--none-eabi"
+  
+  declare void @exit0()
+  
+  declare void @exit1(i32)
+  
+  declare void @exit2()
+  
+  declare void @exit3()
+  
+  declare void @exit4()
+  
+  define void @jump_table(i32 %val, i32 %arg2, i32 %arg3, i32 %arg4) {   ; Dispatches on %val to one of the exitN callees; %arg2-%arg4 are not read here.
+  entry:
+    switch i32 %val, label %default [                                    ; values 1-4 select lab1-lab4, anything else goes to %default
+      i32 1, label %lab1
+      i32 2, label %lab2
+      i32 3, label %lab3
+      i32 4, label %lab4
+    ]
+  
+  default:                                          ; preds = %entry
+    tail call void @exit0()
+    ret void
+  
+  lab1:                                             ; preds = %entry
+    %b = sub i32 %val, 1                            ; %b = %val - 1
+    %a = shl i32 %b, 2                              ; %a = %b << 2; the only computed value passed to a callee
+    tail call void @exit1(i32 %a)
+    ret void
+  
+  lab2:                                             ; preds = %entry
+    tail call void @exit2()
+    ret void
+  
+  lab3:                                             ; preds = %entry
+    tail call void @exit3()
+    ret void
+  
+  lab4:                                             ; preds = %entry
+    tail call void @exit4()
+    ret void
+  }
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #0
+  
+  attributes #0 = { nounwind }
+
+...
+---
+name:            jump_table
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+liveins:         
+  - { reg: '%r0' }
+calleeSavedRegisters: [ '%lr', '%d8', '%d9', '%d10', '%d11', '%d12', '%d13', 
+                        '%d14', '%d15', '%q4', '%q5', '%q6', '%q7', '%r4', 
+                        '%r5', '%r6', '%r7', '%r8', '%r9', '%r10', '%r11', 
+                        '%s16', '%s17', '%s18', '%s19', '%s20', '%s21', 
+                        '%s22', '%s23', '%s24', '%s25', '%s26', '%s27', 
+                        '%s28', '%s29', '%s30', '%s31', '%d8_d10', '%d9_d11', 
+                        '%d10_d12', '%d11_d13', '%d12_d14', '%d13_d15', 
+                        '%q4_q5', '%q5_q6', '%q6_q7', '%q4_q5_q6_q7', '%r4_r5', 
+                        '%r6_r7', '%r8_r9', '%r10_r11', '%d8_d9_d10', '%d9_d10_d11', 
+                        '%d10_d11_d12', '%d11_d12_d13', '%d12_d13_d14', 
+                        '%d13_d14_d15', '%d8_d10_d12', '%d9_d11_d13', '%d10_d12_d14', 
+                        '%d11_d13_d15', '%d8_d10_d12_d14', '%d9_d11_d13_d15', 
+                        '%d9_d10', '%d11_d12', '%d13_d14', '%d9_d10_d11_d12', 
+                        '%d11_d12_d13_d14' ]
+frameInfo:       
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       8
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+stack:           
+  - { id: 0, type: spill-slot, offset: -4, size: 4, alignment: 4, callee-saved-register: '%lr' }
+  - { id: 1, type: spill-slot, offset: -8, size: 4, alignment: 4, callee-saved-register: '%r7' }
+jumpTable:       
+  kind:            inline
+  entries:         
+    - id:              0
+      blocks:          [ '%bb.3.lab1', '%bb.4.lab2', '%bb.5.lab3', '%bb.6.lab4' ]
+# r1 is redefined in the middle of the otherwise recognizable jump-table sequence, so no table branch may be formed here - that redefinition of r1 must not be clobbered!
+# CHECK-NOT: tTBB_JT 
+
+body:             |
+  bb.0.entry:                                    ; saves r7/lr, then compares the adjusted switch value before dispatching
+    successors: %bb.2.default(0x19999998), %bb.1.entry(0x66666668)
+    liveins: %r0, %r7, %lr
+  
+    frame-setup tPUSH 14, _, killed %r7, killed %lr, implicit-def %sp, implicit %sp   ; push r7 and lr (the two spill slots listed under stack:)
+    frame-setup CFI_INSTRUCTION def_cfa_offset 8
+    frame-setup CFI_INSTRUCTION offset %lr, -4
+    frame-setup CFI_INSTRUCTION offset %r7, -8
+    %r1, dead %cpsr = tSUBi3 %r0, 1, 14, _         ; r1 = r0 - 1
+    tCMPi8 %r1, 3, 14, _, implicit-def %cpsr       ; compare r1 with 3, setting cpsr
+    tBcc %bb.2.default, 8, killed %cpsr            ; conditional branch to the default block on the compare result
+  
+  bb.1.entry:                                    ; jump-table dispatch sequence under test
+    successors: %bb.3.lab1(0x20000000), %bb.4.lab2(0x20000000), %bb.5.lab3(0x20000000), %bb.6.lab4(0x20000000)
+    liveins: %r0, %r1
+  
+    %r1, dead %cpsr = tLSLri killed %r1, 2, 14, _  ; r1 <<= 2 (used as the offset operand of the load below)
+    %r2 = tLEApcrelJT %jump-table.0, 14, _         ; r2 = address of jump table 0
+    %r2 = tLDRr killed %r1, killed %r2, 14, _ :: (load 4 from jump-table)   ; r2 = 4-byte entry loaded from the table
+    %r1, dead %cpsr = tLSLri %r2, 2, 14, _         ; r1 is redefined here, in the middle of the sequence
+    tBR_JTr killed %r2, %jump-table.0              ; indirect branch through r2 using jump table 0
+  
+  bb.2.default:
+    tBL 14, _, @exit0, csr_aapcs, implicit-def dead %lr, implicit %sp, implicit-def %sp
+    tPOP_RET 14, _, def %r7, def %pc, implicit-def %sp, implicit %sp
+  
+  bb.3.lab1:
+    liveins: %r0,%r1
+  
+    tBL 14, _, @exit1, csr_aapcs, implicit-def dead %lr, implicit %sp, implicit %r0, implicit-def %sp   ; only this call lists r0 as an implicit use
+    tPOP_RET 14, _, def %r7, def %pc, implicit-def %sp, implicit %sp
+  
+  bb.4.lab2:
+    tBL 14, _, @exit2, csr_aapcs, implicit-def dead %lr, implicit %sp, implicit-def %sp
+    tPOP_RET 14, _, def %r7, def %pc, implicit-def %sp, implicit %sp
+  
+  bb.5.lab3:
+    tBL 14, _, @exit3, csr_aapcs, implicit-def dead %lr, implicit %sp, implicit-def %sp
+    tPOP_RET 14, _, def %r7, def %pc, implicit-def %sp, implicit %sp
+  
+  bb.6.lab4:
+    tBL 14, _, @exit4, csr_aapcs, implicit-def dead %lr, implicit %sp, implicit-def %sp
+    tPOP_RET 14, _, def %r7, def %pc, implicit-def %sp, implicit %sp
+
+...
diff --git a/test/CodeGen/Thumb/thumb-shrink-wrapping.ll b/test/CodeGen/Thumb/thumb-shrink-wrapping.ll
index 6114b72569e7fcd954c64736dde93e68d8eb2bd8..c571e351a1ef16d68ecbdf1bcb4cfde7b8eabf10 100644
--- a/test/CodeGen/Thumb/thumb-shrink-wrapping.ll
+++ b/test/CodeGen/Thumb/thumb-shrink-wrapping.ll
@@ -1,11 +1,12 @@
-; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \
+; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumb-macho \
 ; RUN:      | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V4T
-; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \
+; RUN: llc %s -o - -enable-shrink-wrap=true -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \
 ; RUN:      | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE --check-prefix=ENABLE-V5T
-; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumb-macho \
+; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumb-macho \
 ; RUN:      | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V4T
-; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -mtriple=thumbv5-macho \
+; RUN: llc %s -o - -enable-shrink-wrap=false -ifcvt-fn-start=1 -ifcvt-fn-stop=0 -tail-dup-placement=0 -mtriple=thumbv5-macho \
 ; RUN:      | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE --check-prefix=DISABLE-V5T
+
 ;
 ; Note: Lots of tests use inline asm instead of regular calls.
 ; This allows to have a better control on what the allocation will do.
@@ -15,6 +16,8 @@
 ; edges.
 ; Also disable the late if-converter as it makes harder to reason on
 ; the diffs.
+; Disable tail-duplication during placement, as v4t vs v5t get different
+; results due to branches not being analyzable under v5
 
 ; Initial motivating example: Simple diamond with a call just on one side.
 ; CHECK-LABEL: foo:
@@ -502,14 +505,9 @@ if.end:                                           ; preds = %for.body, %if.else
 ; CHECK-NEXT: str r1, {{\[}}[[TMP_SP]]]
 ; CHECK-NEXT: str r1, {{\[}}[[TMP_SP]], #4]
 ; CHECK-NEXT: str r1, {{\[}}[[TMP_SP]], #8]
-; Thumb has quite a strange way for moving stuff
-; in around. Oh well, match the current sequence.
-; CHECK: push {r1}
-; CHECK-NEXT: pop {r0}
-; CHECK: push {r1}
-; CHECK-NEXT: pop {r2}
-; CHECK: push {r1}
-; CHECK-NEXT: pop {r3}
+; CHECK:      movs r0, r1
+; CHECK-NEXT: movs r2, r1
+; CHECK-NEXT: movs r3, r1
 ; CHECK-NEXT: bl
 ; CHECK-NEXT: lsls r0, r0, #3
 ;
diff --git a/test/CodeGen/Thumb2/cbnz.ll b/test/CodeGen/Thumb2/cbnz.ll
index 5c0bb5bfe1cdc741483d2a60b781fd7644c158be..e11c4038678c4b7873acdbd014bf6cddb354d61e 100644
--- a/test/CodeGen/Thumb2/cbnz.ll
+++ b/test/CodeGen/Thumb2/cbnz.ll
@@ -26,7 +26,7 @@ t:
   call void @x()
   call void @x()
   call void @x()
-  ; CHECK: cbnz
+  ; CHECK: cbz
   %q = icmp eq i32 %y, 0
   br i1 %q, label %t2, label %f
 
diff --git a/test/CodeGen/Thumb2/float-cmp.ll b/test/CodeGen/Thumb2/float-cmp.ll
index 77b0999337c671d9f9605aac5b43164f6b2a0790..834812cddd6dc9ec822256ac46062e039dacc947 100644
--- a/test/CodeGen/Thumb2/float-cmp.ll
+++ b/test/CodeGen/Thumb2/float-cmp.ll
@@ -15,7 +15,7 @@ define i1 @cmp_f_false(float %a, float %b) {
 define i1 @cmp_f_oeq(float %a, float %b) {
 ; CHECK-LABEL: cmp_f_oeq:
 ; NONE: bl __aeabi_fcmpeq
-; HARD: vcmpe.f32
+; HARD: vcmp.f32
 ; HARD: moveq r0, #1
   %1 = fcmp oeq float %a, %b
   ret i1 %1
@@ -56,7 +56,7 @@ define i1 @cmp_f_one(float %a, float %b) {
 ; CHECK-LABEL: cmp_f_one:
 ; NONE: bl __aeabi_fcmpgt
 ; NONE: bl __aeabi_fcmplt
-; HARD: vcmpe.f32
+; HARD: vcmp.f32
 ; HARD: movmi r0, #1
 ; HARD: movgt r0, #1
   %1 = fcmp one float %a, %b
@@ -73,7 +73,7 @@ define i1 @cmp_f_ord(float %a, float %b) {
 ; CHECK-LABEL: cmp_f_ueq:
 ; NONE: bl __aeabi_fcmpeq
 ; NONE: bl __aeabi_fcmpun
-; HARD: vcmpe.f32
+; HARD: vcmp.f32
 ; HARD: moveq r0, #1
 ; HARD: movvs r0, #1
   %1 = fcmp ueq float %a, %b
@@ -122,7 +122,7 @@ define i1 @cmp_f_ule(float %a, float %b) {
 define i1 @cmp_f_une(float %a, float %b) {
 ; CHECK-LABEL: cmp_f_une:
 ; NONE: bl __aeabi_fcmpeq
-; HARD: vcmpe.f32
+; HARD: vcmp.f32
 ; HARD: movne r0, #1
   %1 = fcmp une float %a, %b
   ret i1 %1
@@ -154,7 +154,7 @@ define i1 @cmp_d_oeq(double %a, double %b) {
 ; CHECK-LABEL: cmp_d_oeq:
 ; NONE: bl __aeabi_dcmpeq
 ; SP: bl __aeabi_dcmpeq
-; DP: vcmpe.f64
+; DP: vcmp.f64
 ; DP: moveq r0, #1
   %1 = fcmp oeq double %a, %b
   ret i1 %1
@@ -201,7 +201,7 @@ define i1 @cmp_d_one(double %a, double %b) {
 ; NONE: bl __aeabi_dcmplt
 ; SP: bl __aeabi_dcmpgt
 ; SP: bl __aeabi_dcmplt
-; DP: vcmpe.f64
+; DP: vcmp.f64
 ; DP: movmi r0, #1
 ; DP: movgt r0, #1
   %1 = fcmp one double %a, %b
@@ -259,7 +259,7 @@ define i1 @cmp_d_ueq(double %a, double %b) {
 ; NONE: bl __aeabi_dcmpun
 ; SP: bl __aeabi_dcmpeq
 ; SP: bl __aeabi_dcmpun
-; DP: vcmpe.f64
+; DP: vcmp.f64
 ; DP: moveq r0, #1
 ; DP: movvs r0, #1
   %1 = fcmp ueq double %a, %b
@@ -290,7 +290,7 @@ define i1 @cmp_d_une(double %a, double %b) {
 ; CHECK-LABEL: cmp_d_une:
 ; NONE: bl __aeabi_dcmpeq
 ; SP: bl __aeabi_dcmpeq
-; DP: vcmpe.f64
+; DP: vcmp.f64
 ; DP: movne r0, #1
   %1 = fcmp une double %a, %b
   ret i1 %1
diff --git a/test/CodeGen/Thumb2/ifcvt-compare.ll b/test/CodeGen/Thumb2/ifcvt-compare.ll
index 7b5ce4fa3f5f33b15344b6057e11a432b2483bb0..688195f579ebf2983ae41d615ba72504cf0ee9de 100644
--- a/test/CodeGen/Thumb2/ifcvt-compare.ll
+++ b/test/CodeGen/Thumb2/ifcvt-compare.ll
@@ -4,7 +4,7 @@ declare void @x()
 
 define void @f0(i32 %x) optsize {
   ; CHECK-LABEL: f0:
-  ; CHECK: cbnz
+  ; CHECK: cbz
   %p = icmp eq i32 %x, 0
   br i1 %p, label %t, label %f
 
diff --git a/test/CodeGen/Thumb2/ifcvt-rescan-bug-2016-08-22.ll b/test/CodeGen/Thumb2/ifcvt-rescan-bug-2016-08-22.ll
index ae3084dcc62e14d14c5298d364df0dd06fab3aba..65ee4283b3f720acb3af1fad9a06ca9cdb024ba5 100644
--- a/test/CodeGen/Thumb2/ifcvt-rescan-bug-2016-08-22.ll
+++ b/test/CodeGen/Thumb2/ifcvt-rescan-bug-2016-08-22.ll
@@ -3,7 +3,7 @@ target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
 target triple = "thumbv7-unknown-linux-gnueabihf"
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0
 
 ; Function Attrs: nounwind
 declare void @_ZNSaIcEC2Ev() unnamed_addr #0 align 2
@@ -25,7 +25,7 @@ define hidden void @_ZN4llvm14DOTGraphTraitsIPNS_13ScheduleDAGMIEE17getEdgeAttri
   br label %3
 
 ; <label>:2:                                      ; preds = %0
-  call void @llvm.lifetime.start(i64 1, i8* undef) #0
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* undef) #0
   call void @_ZNSaIcEC2Ev() #0
   br label %3
 
diff --git a/test/CodeGen/Thumb2/intrinsics-coprocessor.ll b/test/CodeGen/Thumb2/intrinsics-coprocessor.ll
new file mode 100644
index 0000000000000000000000000000000000000000..248ec223a61e79f1b3af121488a8f279c3e16130
--- /dev/null
+++ b/test/CodeGen/Thumb2/intrinsics-coprocessor.ll
@@ -0,0 +1,93 @@
+; RUN: llc < %s -march=thumb -mtriple=thumbv7-eabi -mcpu=cortex-a8 -show-mc-encoding | FileCheck %s
+define void @coproc(i8* %i) nounwind {
+entry:
+  ; CHECK: mrc p7, #1, r{{[0-9]+}}, c1, c1, #4
+  %0 = tail call i32 @llvm.arm.mrc(i32 7, i32 1, i32 1, i32 1, i32 4) nounwind
+  ; CHECK: mcr p7, #1, r{{[0-9]+}}, c1, c1, #4
+  tail call void @llvm.arm.mcr(i32 7, i32 1, i32 %0, i32 1, i32 1, i32 4) nounwind
+  ; CHECK: mrc2 p7, #1, r{{[0-9]+}}, c1, c1, #4
+  %1 = tail call i32 @llvm.arm.mrc2(i32 7, i32 1, i32 1, i32 1, i32 4) nounwind
+  ; CHECK: mcr2 p7, #1, r{{[0-9]+}}, c1, c1, #4
+  tail call void @llvm.arm.mcr2(i32 7, i32 1, i32 %1, i32 1, i32 1, i32 4) nounwind
+  ; CHECK: mcrr p7, #1, r{{[0-9]+}}, r{{[0-9]+}}, c1
+  tail call void @llvm.arm.mcrr(i32 7, i32 1, i32 %0, i32 %1, i32 1) nounwind
+  ; CHECK: mcrr2 p7, #1, r{{[0-9]+}}, r{{[0-9]+}}, c1
+  tail call void @llvm.arm.mcrr2(i32 7, i32 1, i32 %0, i32 %1, i32 1) nounwind
+  ; CHECK: cdp p7, #3, c1, c1, c1, #5
+  tail call void @llvm.arm.cdp(i32 7, i32 3, i32 1, i32 1, i32 1, i32 5) nounwind
+  ; CHECK: cdp2 p7, #3, c1, c1, c1, #5
+  tail call void @llvm.arm.cdp2(i32 7, i32 3, i32 1, i32 1, i32 1, i32 5) nounwind
+  ; CHECK: ldc p7, c3, [r{{[0-9]+}}]
+  tail call void @llvm.arm.ldc(i32 7, i32 3, i8* %i) nounwind
+  ; CHECK: ldcl p7, c3, [r{{[0-9]+}}]
+  tail call void @llvm.arm.ldcl(i32 7, i32 3, i8* %i) nounwind
+  ; CHECK: ldc2 p7, c3, [r{{[0-9]+}}]
+  tail call void @llvm.arm.ldc2(i32 7, i32 3, i8* %i) nounwind
+  ; CHECK: ldc2l p7, c3, [r{{[0-9]+}}]
+  tail call void @llvm.arm.ldc2l(i32 7, i32 3, i8* %i) nounwind
+  ; CHECK: stc p7, c3, [r{{[0-9]+}}]
+  tail call void @llvm.arm.stc(i32 7, i32 3, i8* %i) nounwind
+  ; CHECK: stcl p7, c3, [r{{[0-9]+}}]
+  tail call void @llvm.arm.stcl(i32 7, i32 3, i8* %i) nounwind
+  ; CHECK: stc2 p7, c3, [r{{[0-9]+}}]
+  tail call void @llvm.arm.stc2(i32 7, i32 3, i8* %i) nounwind
+  ; CHECK: stc2l p7, c3, [r{{[0-9]+}}]
+  tail call void @llvm.arm.stc2l(i32 7, i32 3, i8* %i) nounwind
+  ; CHECK: mrrc p1, #2, r{{[0-9]+}}, r{{[0-9]+}}, c3
+  %2 = tail call { i32, i32 } @llvm.arm.mrrc(i32 1, i32 2, i32 3) nounwind
+  ; CHECK: mrrc2 p1, #2, r{{[0-9]+}}, r{{[0-9]+}}, c3
+  %3 = tail call { i32, i32 } @llvm.arm.mrrc2(i32 1, i32 2, i32 3) nounwind
+  ret void
+}
+
+define hidden void @cond_cdp(i32 %a) {
+; CHECK-LABEL: cond_cdp:
+entry:
+  %tobool = icmp eq i32 %a, 0
+  br i1 %tobool, label %if.end, label %if.then
+
+if.then:
+; CHECK: it ne
+; CHECK: cdpne   p15, #0, c0, c0, c0, #0 @ encoding: [0x00,0xee,0x00,0x0f]
+  tail call void @llvm.arm.cdp(i32 15, i32 0, i32 0, i32 0, i32 0, i32 0)
+  br label %if.end
+
+if.end:
+  ret void
+}
+
+declare void @llvm.arm.ldc(i32, i32, i8*) nounwind
+
+declare void @llvm.arm.ldcl(i32, i32, i8*) nounwind
+
+declare void @llvm.arm.ldc2(i32, i32, i8*) nounwind
+
+declare void @llvm.arm.ldc2l(i32, i32, i8*) nounwind
+
+declare void @llvm.arm.stc(i32, i32, i8*) nounwind
+
+declare void @llvm.arm.stcl(i32, i32, i8*) nounwind
+
+declare void @llvm.arm.stc2(i32, i32, i8*) nounwind
+
+declare void @llvm.arm.stc2l(i32, i32, i8*) nounwind
+
+declare void @llvm.arm.cdp2(i32, i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.cdp(i32, i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.mcrr2(i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.mcrr(i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.mcr2(i32, i32, i32, i32, i32, i32) nounwind
+
+declare i32 @llvm.arm.mrc2(i32, i32, i32, i32, i32) nounwind
+
+declare void @llvm.arm.mcr(i32, i32, i32, i32, i32, i32) nounwind
+
+declare i32 @llvm.arm.mrc(i32, i32, i32, i32, i32) nounwind
+
+declare { i32, i32 } @llvm.arm.mrrc(i32, i32, i32) nounwind
+
+declare { i32, i32 } @llvm.arm.mrrc2(i32, i32, i32) nounwind
diff --git a/test/CodeGen/Thumb2/stack_guard_remat.ll b/test/CodeGen/Thumb2/stack_guard_remat.ll
index cf34e8c0c2fb0f1f7ee8a0933606e15a47391f8c..839a506b35e610a0a194d37e40926c24b1962a81 100644
--- a/test/CodeGen/Thumb2/stack_guard_remat.ll
+++ b/test/CodeGen/Thumb2/stack_guard_remat.ll
@@ -24,20 +24,20 @@
 define i32 @test_stack_guard_remat() #0 {
   %a1 = alloca [256 x i32], align 4
   %1 = bitcast [256 x i32]* %a1 to i8*
-  call void @llvm.lifetime.start(i64 1024, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 1024, i8* %1)
   %2 = getelementptr inbounds [256 x i32], [256 x i32]* %a1, i32 0, i32 0
   call void @foo3(i32* %2) #3
   call void asm sideeffect "foo2", "~{r0},~{r1},~{r2},~{r3},~{r4},~{r5},~{r6},~{r7},~{r8},~{r9},~{r10},~{r11},~{r12},~{sp},~{lr}"()
-  call void @llvm.lifetime.end(i64 1024, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 1024, i8* %1)
   ret i32 0
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
 
 declare void @foo3(i32*)
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 
 attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/Thumb2/tbb-removeadd.mir b/test/CodeGen/Thumb2/tbb-removeadd.mir
new file mode 100644
index 0000000000000000000000000000000000000000..89ed987205394dc78ae2d3a123d124e9d0e9cf1a
--- /dev/null
+++ b/test/CodeGen/Thumb2/tbb-removeadd.mir
@@ -0,0 +1,124 @@
+#RUN: llc -run-pass arm-cp-islands %s -o - | FileCheck %s
+
+--- |
+  ; ModuleID = 'test.ll'
+  source_filename = "test.c"
+  target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+  target triple = "thumbv8r-arm-none-eabi"
+  
+  define void @Func(i32 %i, i32* nocapture %p) local_unnamed_addr {
+  entry:
+    switch i32 %i, label %sw.epilog [
+      i32 0, label %sw.bb
+      i32 1, label %sw.bb1
+      i32 2, label %sw.epilog.sink.split
+      i32 4, label %sw.bb3
+    ]
+  
+  sw.bb:                                            ; preds = %entry
+    br label %sw.epilog.sink.split
+  
+  sw.bb1:                                           ; preds = %entry
+    store i32 0, i32* %p, align 4
+    br label %sw.epilog.sink.split
+  
+  sw.bb3:                                           ; preds = %entry
+    br label %sw.epilog.sink.split
+  
+  sw.epilog.sink.split:                             ; preds = %sw.bb3, %sw.bb1, %sw.bb, %entry
+    %.sink = phi i32 [ 2, %sw.bb3 ], [ 0, %sw.bb ], [ 1, %entry ], [ 1, %sw.bb1 ]
+    store i32 %.sink, i32* %p, align 4
+    br label %sw.epilog
+  
+  sw.epilog:                                        ; preds = %sw.epilog.sink.split, %entry
+    ret void
+  }
+
+...
+---
+name:            Func
+alignment:       1
+exposesReturnsTwice: false
+noVRegs:         true
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+liveins:         
+  - { reg: '%r0' }
+  - { reg: '%r1' }
+frameInfo:       
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+jumpTable:       
+  kind:            inline
+  entries:         
+    - id:              0
+      blocks:          [ '%bb.2.sw.bb', '%bb.3.sw.bb1', '%bb.5.sw.epilog.sink.split', 
+                         '%bb.6.sw.epilog', '%bb.4.sw.bb3' ]
+# The ADD should be deleted along with the LEA
+# CHECK-NOT: t2LEApcrelJT
+# CHECK-NOT: t2ADDrs
+# CHECK: tMOVi8
+# CHECK: t2TBB_JT
+
+body:             |
+  bb.0.entry:
+    successors: %bb.6.sw.epilog(0x0ccccccb), %bb.1.entry(0x73333335)
+    liveins: %r0, %r1
+  
+    tCMPi8 %r0, 4, 14, _, implicit-def %cpsr
+    t2Bcc %bb.6.sw.epilog, 8, killed %cpsr
+  
+  bb.1.entry:
+    successors: %bb.2.sw.bb(0x1c71c71c), %bb.3.sw.bb1(0x1c71c71c), %bb.5.sw.epilog.sink.split(0x1c71c71c), %bb.6.sw.epilog(0x0e38e38e), %bb.4.sw.bb3(0x1c71c71c)
+    liveins: %r0, %r1
+  
+    %r2 = t2LEApcrelJT %jump-table.0, 14, _
+    %r3 = t2ADDrs killed %r2, %r0, 18, 14, _, _
+    %r2, dead %cpsr = tMOVi8 1, 14, _
+    t2BR_JT killed %r3, killed %r0, %jump-table.0
+  
+  bb.2.sw.bb:
+    successors: %bb.5.sw.epilog.sink.split(0x80000000)
+    liveins: %r1
+  
+    %r2, dead %cpsr = tMOVi8 0, 14, _
+    t2B %bb.5.sw.epilog.sink.split, 14, _
+  
+  bb.3.sw.bb1:
+    successors: %bb.5.sw.epilog.sink.split(0x80000000)
+    liveins: %r1
+  
+    %r0, dead %cpsr = tMOVi8 0, 14, _
+    %r2, dead %cpsr = tMOVi8 1, 14, _
+    tSTRi killed %r0, %r1, 0, 14, _ :: (store 4 into %ir.p)
+    t2B %bb.5.sw.epilog.sink.split, 14, _
+  
+  bb.4.sw.bb3:
+    successors: %bb.5.sw.epilog.sink.split(0x80000000)
+    liveins: %r1
+  
+    %r2, dead %cpsr = tMOVi8 2, 14, _
+  
+  bb.5.sw.epilog.sink.split:
+    successors: %bb.6.sw.epilog(0x80000000)
+    liveins: %r1, %r2
+  
+    tSTRi killed %r2, killed %r1, 0, 14, _ :: (store 4 into %ir.p)
+  
+  bb.6.sw.epilog:
+    tBX_RET 14, _
+
+...
diff --git a/test/CodeGen/Thumb2/thumb2-pack.ll b/test/CodeGen/Thumb2/thumb2-pack.ll
index 4825628f3014ba7eb675d3c5e6332eddf2c9bc24..26b68ec443b921d6ed449a9838515033b8cff784 100644
--- a/test/CodeGen/Thumb2/thumb2-pack.ll
+++ b/test/CodeGen/Thumb2/thumb2-pack.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2 %s -o - | FileCheck %s
 
 ; CHECK: test1
 ; CHECK: pkhbt   r0, r0, r1, lsl #16
diff --git a/test/CodeGen/Thumb2/thumb2-rev.ll b/test/CodeGen/Thumb2/thumb2-rev.ll
index 873a2d4cf7de77a5132271bacd7c53b813d88db0..81d0822d500b13da217f376ff5f5f9cc0b95e563 100644
--- a/test/CodeGen/Thumb2/thumb2-rev.ll
+++ b/test/CodeGen/Thumb2/thumb2-rev.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2,+v7,+t2xtpk %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2,+v7 %s -o - | FileCheck %s
 
 define i32 @f1(i32 %a) {
 ; CHECK-LABEL: f1:
diff --git a/test/CodeGen/Thumb2/thumb2-smla.ll b/test/CodeGen/Thumb2/thumb2-smla.ll
index 5ddaf9353f92d258f3eb6169a8e93278ff59a0c3..f1850d46092884ab4f3712a0cd47dcba24c2c423 100644
--- a/test/CodeGen/Thumb2/thumb2-smla.ll
+++ b/test/CodeGen/Thumb2/thumb2-smla.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk,+dsp %s -o - | FileCheck %s
-; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk,+dsp -arm-use-mulops=false %s -o - | FileCheck %s -check-prefix=NO_MULOPS
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2,+dsp %s -o - | FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2,+dsp -arm-use-mulops=false %s -o - | FileCheck %s -check-prefix=NO_MULOPS
 
 define i32 @f3(i32 %a, i16 %x, i32 %y) {
 ; CHECK: f3
diff --git a/test/CodeGen/Thumb2/thumb2-smul.ll b/test/CodeGen/Thumb2/thumb2-smul.ll
index a196a3c79ae9b00f1f806235897a71f692d5975f..53fca567af164295fda83e6138f7ce485df211dd 100644
--- a/test/CodeGen/Thumb2/thumb2-smul.ll
+++ b/test/CodeGen/Thumb2/thumb2-smul.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk,+dsp %s -o - |  FileCheck %s
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2,+dsp %s -o - |  FileCheck %s
 
 @x = weak global i16 0          ; <i16*> [#uses=1]
 @y = weak global i16 0          ; <i16*> [#uses=0]
diff --git a/test/CodeGen/Thumb2/thumb2-sxt-uxt.ll b/test/CodeGen/Thumb2/thumb2-sxt-uxt.ll
index 693a8e4e99f7dbed0be98ae291d1068e8bc04a8d..c1170137c7fc32519d2631922e729a7e56522890 100644
--- a/test/CodeGen/Thumb2/thumb2-sxt-uxt.ll
+++ b/test/CodeGen/Thumb2/thumb2-sxt-uxt.ll
@@ -1,38 +1,45 @@
-; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-m3 %s -o - | FileCheck %s
-; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-m4 %s -o - | FileCheck %s --check-prefix=CHECK-M4
+; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-m3 %s -o - | FileCheck %s --check-prefix=CHECK-NO-DSP
+; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-m4 %s -o - | FileCheck %s --check-prefix=CHECK-DSP
+; RUN: llc -mtriple=thumbv7em-eabi %s -o - | FileCheck %s -check-prefix=CHECK-DSP
+; RUN: llc -mtriple=thumbv8m.main-none-eabi %s -o - | FileCheck %s -check-prefix=CHECK-NO-DSP
+; RUN: llc -mtriple=thumbv8m.main-none-eabi -mattr=+dsp %s -o - | FileCheck %s -check-prefix=CHECK-DSP
 
 define i32 @test1(i16 zeroext %z) nounwind {
 ; CHECK-LABEL: test1:
-; CHECK: sxth
+; CHECK-DSP: sxth
+; CHECK-NO-DSP: sxth
   %r = sext i16 %z to i32
   ret i32 %r
 }
 
 define i32 @test2(i8 zeroext %z) nounwind {
 ; CHECK-LABEL: test2:
-; CHECK: sxtb
+; CHECK-DSP: sxtb
+; CHECK-NO-DSP: sxtb
   %r = sext i8 %z to i32
   ret i32 %r
 }
 
 define i32 @test3(i16 signext %z) nounwind {
 ; CHECK-LABEL: test3:
-; CHECK: uxth
+; CHECK-DSP: uxth
+; CHECK-NO-DSP: uxth
   %r = zext i16 %z to i32
   ret i32 %r
 }
 
 define i32 @test4(i8 signext %z) nounwind {
 ; CHECK-LABEL: test4:
-; CHECK: uxtb
+; CHECK-DSP: uxtb
+; CHECK-NO-DSP: uxtb
   %r = zext i8 %z to i32
   ret i32 %r
 }
 
 define i32 @test5(i32 %a, i8 %b) {
 ; CHECK-LABEL: test5:
-; CHECK-NOT: sxtab
-; CHECK-M4: sxtab r0, r0, r1
+; CHECK-DSP: sxtab r0, r0, r1
+; CHECK-NO-DSP-NOT: sxtab
   %sext = sext i8 %b to i32
   %add = add i32 %a, %sext
   ret i32 %add
@@ -40,8 +47,8 @@ define i32 @test5(i32 %a, i8 %b) {
 
 define i32 @test6(i32 %a, i32 %b) {
 ; CHECK-LABEL: test6:
-; CHECK-NOT: sxtab
-; CHECK-M4: sxtab r0, r0, r1
+; CHECK-DSP: sxtab r0, r0, r1
+; CHECK-NO-DSP-NOT: sxtab
   %shl = shl i32 %b, 24
   %ashr = ashr i32 %shl, 24
   %add = add i32 %a, %ashr
@@ -50,8 +57,8 @@ define i32 @test6(i32 %a, i32 %b) {
 
 define i32 @test7(i32 %a, i16 %b) {
 ; CHECK-LABEL: test7:
-; CHECK-NOT: sxtah
-; CHECK-M4: sxtah r0, r0, r1
+; CHECK-DSP: sxtah r0, r0, r1
+; CHECK-NO-DSP-NOT: sxtah
   %sext = sext i16 %b to i32
   %add = add i32 %a, %sext
   ret i32 %add
@@ -59,8 +66,8 @@ define i32 @test7(i32 %a, i16 %b) {
 
 define i32 @test8(i32 %a, i32 %b) {
 ; CHECK-LABEL: test8:
-; CHECK-NOT: sxtah
-; CHECK-M4: sxtah r0, r0, r1
+; CHECK-DSP: sxtah r0, r0, r1
+; CHECK-NO-DSP-NOT: sxtah
   %shl = shl i32 %b, 16
   %ashr = ashr i32 %shl, 16
   %add = add i32 %a, %ashr
@@ -69,8 +76,8 @@ define i32 @test8(i32 %a, i32 %b) {
 
 define i32 @test9(i32 %a, i8 %b) {
 ; CHECK-LABEL: test9:
-; CHECK-NOT: uxtab
-; CHECK-M4: uxtab r0, r0, r1
+; CHECK-DSP: uxtab r0, r0, r1
+; CHECK-NO-DSP-NOT: uxtab
   %zext = zext i8 %b to i32
   %add = add i32 %a, %zext
   ret i32 %add
@@ -78,8 +85,8 @@ define i32 @test9(i32 %a, i8 %b) {
 
 define i32 @test10(i32 %a, i32 %b) {
 ;CHECK-LABEL: test10:
-;CHECK-NOT: uxtab
-;CHECK-M4: uxtab r0, r0, r1
+;CHECK-DSP: uxtab r0, r0, r1
+;CHECK-NO-DSP-NOT: uxtab
   %and = and i32 %b, 255
   %add = add i32 %a, %and
   ret i32 %add
@@ -87,8 +94,8 @@ define i32 @test10(i32 %a, i32 %b) {
 
 define i32 @test11(i32 %a, i16 %b) {
 ; CHECK-LABEL: test11:
-; CHECK-NOT: uxtah
-; CHECK-M4: uxtah r0, r0, r1
+; CHECK-DSP: uxtah r0, r0, r1
+; CHECK-NO-DSP-NOT: uxtah
   %zext = zext i16 %b to i32
   %add = add i32 %a, %zext
   ret i32 %add
@@ -96,8 +103,8 @@ define i32 @test11(i32 %a, i16 %b) {
 
 define i32 @test12(i32 %a, i32 %b) {
 ;CHECK-LABEL: test12:
-;CHECK-NOT: uxtah
-;CHECK-M4: uxtah r0, r0, r1
+;CHECK-DSP: uxtah r0, r0, r1
+;CHECK-NO-DSP-NOT: uxtah
   %and = and i32 %b, 65535
   %add = add i32 %a, %and
   ret i32 %add
diff --git a/test/CodeGen/Thumb2/thumb2-sxt_rot.ll b/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
index a4f8aa0dbd03d22c9c55a1469da650c8e6a250ad..c4af67a2f91d0f21417248a2593d684dfb0d7ea9 100644
--- a/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
+++ b/test/CodeGen/Thumb2/thumb2-sxt_rot.ll
@@ -1,18 +1,21 @@
-; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s -mattr=+thumb2,+t2xtpk %s -o - | FileCheck %s
-; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-m3 %s -o - | FileCheck %s --check-prefix=CHECK-M3
+; RUN: llc -mtriple=thumb-eabi -mcpu=arm1156t2-s %s -o - | FileCheck %s --check-prefix=CHECK-DSP
+; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-m3 %s -o - | FileCheck %s --check-prefix=CHECK-NO-DSP
+; RUN: llc -mtriple=thumbv7em-eabi %s -o - | FileCheck %s -check-prefix=CHECK-DSP
+; RUN: llc -mtriple=thumbv8m.main-none-eabi %s -o - | FileCheck %s -check-prefix=CHECK-NO-DSP
+; RUN: llc -mtriple=thumbv8m.main-none-eabi -mattr=+dsp %s -o - | FileCheck %s -check-prefix=CHECK-DSP
 
 define i32 @test0(i8 %A) {
 ; CHECK-LABEL: test0:
-; CHECK: sxtb r0, r0
-; CHECK-M3: sxtb r0, r0
+; CHECK-DSP: sxtb r0, r0
+; CHECK-NO-DSP: sxtb r0, r0
         %B = sext i8 %A to i32
 	ret i32 %B
 }
 
 define signext i8 @test1(i32 %A)  {
 ; CHECK-LABEL: test1:
-; CHECK: sbfx r0, r0, #8, #8
-; CHECK-M3: sbfx r0, r0, #8, #8
+; CHECK-DSP: sbfx r0, r0, #8, #8
+; CHECK-NO-DSP: sbfx r0, r0, #8, #8
 	%B = lshr i32 %A, 8
 	%C = shl i32 %A, 24
 	%D = or i32 %B, %C
@@ -22,8 +25,8 @@ define signext i8 @test1(i32 %A)  {
 
 define signext i32 @test2(i32 %A, i32 %X)  {
 ; CHECK-LABEL: test2:
-; CHECK: sxtab  r0, r1, r0, ror #8
-; CHECK-M3-NOT: sxtab
+; CHECK-DSP: sxtab  r0, r1, r0, ror #8
+; CHECK-NO-DSP-NOT: sxtab
 	%B = lshr i32 %A, 8
 	%C = shl i32 %A, 24
 	%D = or i32 %B, %C
@@ -35,8 +38,8 @@ define signext i32 @test2(i32 %A, i32 %X)  {
 
 define i32 @test3(i32 %A, i32 %X) {
 ; CHECK-LABEL: test3:
-; CHECK: sxtah r0, r0, r1, ror #8
-; CHECK-M3-NOT: sxtah
+; CHECK-DSP: sxtah r0, r0, r1, ror #8
+; CHECK-NO-DSP-NOT: sxtah
   %X.hi = lshr i32 %X, 8
   %X.trunc = trunc i32 %X.hi to i16
   %addend = sext i16 %X.trunc to i32
@@ -46,8 +49,8 @@ define i32 @test3(i32 %A, i32 %X) {
 
 define signext i32 @test4(i32 %A, i32 %X)  {
 ; CHECK-LABEL: test4:
-; CHECK: sxtab  r0, r1, r0, ror #16
-; CHECK-M3-NOT: sxtab
+; CHECK-DSP: sxtab  r0, r1, r0, ror #16
+; CHECK-NO-DSP-NOT: sxtab
 	%B = lshr i32 %A, 16
 	%C = shl i32 %A, 16
 	%D = or i32 %B, %C
@@ -59,8 +62,8 @@ define signext i32 @test4(i32 %A, i32 %X)  {
 
 define signext i32 @test5(i32 %A, i32 %X)  {
 ; CHECK-LABEL: test5:
-; CHECK: sxtah  r0, r1, r0, ror #24
-; CHECK-M3-NOT: sxtah
+; CHECK-DSP: sxtah  r0, r1, r0, ror #24
+; CHECK-NO-DSP-NOT: sxtah
 	%B = lshr i32 %A, 24
 	%C = shl i32 %A, 8
 	%D = or i32 %B, %C
diff --git a/test/CodeGen/Thumb2/thumb2-uxt_rot.ll b/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
index 891c706972c061903163fdf53aa57c3582047156..22740b715dcb13b47cefd9c3c723df3c6b663048 100644
--- a/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
+++ b/test/CodeGen/Thumb2/thumb2-uxt_rot.ll
@@ -1,21 +1,22 @@
-; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s --check-prefix=A8
-; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-m3 %s -o - | FileCheck %s --check-prefix=M3
+; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s --check-prefix=CHECK-DSP
+; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-m3 %s -o - | FileCheck %s --check-prefix=CHECK-NO-DSP
+; RUN: llc -mtriple=thumbv7em-eabi %s -o - | FileCheck %s -check-prefix=CHECK-DSP
+; RUN: llc -mtriple=thumbv8m.main-none-eabi %s -o - | FileCheck %s -check-prefix=CHECK-NO-DSP
+; RUN: llc -mtriple=thumbv8m.main-none-eabi -mattr=+dsp %s -o - | FileCheck %s -check-prefix=CHECK-DSP
 ; rdar://11318438
 
 define zeroext i8 @test1(i32 %A.u)  {
 ; CHECK-LABEL: test1:
-; A8: uxtb r0, r0
+; CHECK-DSP: uxtb r0, r0
+; CHECK-NO-DSP: uxtb r0, r0
     %B.u = trunc i32 %A.u to i8
     ret i8 %B.u
 }
 
 define zeroext i32 @test2(i32 %A.u, i32 %B.u)  {
 ; CHECK-LABEL: test2:
-; A8: uxtab  r0, r0, r1
-
-; M3: uxtb  r1, r1
-; M3-NOT: uxtab
-; M3: add   r0, r1
+; CHECK-DSP: uxtab  r0, r0, r1
+; CHECK-NO-DSP-NOT: uxtab
     %C.u = trunc i32 %B.u to i8
     %D.u = zext i8 %C.u to i32
     %E.u = add i32 %A.u, %D.u
@@ -24,8 +25,8 @@ define zeroext i32 @test2(i32 %A.u, i32 %B.u)  {
 
 define zeroext i32 @test3(i32 %A.u)  {
 ; CHECK-LABEL: test3:
-; A8: ubfx  r0, r0, #8, #16
-; M3: ubfx r0, r0, #8, #16
+; CHECK-DSP: ubfx  r0, r0, #8, #16
+; CHECK-NO-DSP: ubfx  r0, r0, #8, #16
     %B.u = lshr i32 %A.u, 8
     %C.u = shl i32 %A.u, 24
     %D.u = or i32 %B.u, %C.u
@@ -36,8 +37,8 @@ define zeroext i32 @test3(i32 %A.u)  {
 
 define i32 @test4(i32 %A, i32 %X) {
 ; CHECK-LABEL: test4:
-; A8: uxtab r0, r0, r1, ror #16
-; M3-NOT: uxtab
+; CHECK-DSP: uxtab r0, r0, r1, ror #16
+; CHECK-NO-DSP-NOT: uxtab
   %X.hi = lshr i32 %X, 16
   %X.trunc = trunc i32 %X.hi to i8
   %addend = zext i8 %X.trunc to i32
@@ -47,8 +48,8 @@ define i32 @test4(i32 %A, i32 %X) {
 
 define i32 @test5(i32 %A, i32 %X) {
 ; CHECK-LABEL: test5:
-; A8: uxtah r0, r0, r1, ror #8
-; M3-NOT: uxtah
+; CHECK-DSP: uxtah r0, r0, r1, ror #8
+; CHECK-NO-DSP-NOT: uxtah
   %X.hi = lshr i32 %X, 8
   %X.trunc = trunc i32 %X.hi to i16
   %addend = zext i16 %X.trunc to i32
@@ -58,8 +59,8 @@ define i32 @test5(i32 %A, i32 %X) {
 
 define i32 @test6(i32 %A, i32 %X) {
 ; CHECK-LABEL: test6:
-; A8: uxtab r0, r0, r1, ror #8
-; M3-NOT: uxtab
+; CHECK-DSP: uxtab r0, r0, r1, ror #8
+; CHECK-NO-DSP-NOT: uxtab
   %X.hi = lshr i32 %X, 8
   %X.trunc = trunc i32 %X.hi to i8
   %addend = zext i8 %X.trunc to i32
@@ -69,8 +70,8 @@ define i32 @test6(i32 %A, i32 %X) {
 
 define i32 @test7(i32 %A, i32 %X) {
 ; CHECK-LABEL: test7:
-; A8: uxtah r0, r0, r1, ror #24
-; M3-NOT: uxtah
+; CHECK-DSP: uxtah r0, r0, r1, ror #24
+; CHECK-NO-DSP-NOT: uxtah
   %lshr = lshr i32 %X, 24
   %shl = shl i32 %X, 8
   %or = or i32 %lshr, %shl
@@ -82,8 +83,8 @@ define i32 @test7(i32 %A, i32 %X) {
 
 define i32 @test8(i32 %A, i32 %X) {
 ; CHECK-LABEL: test8:
-; A8: uxtah r0, r0, r1, ror #24
-; M3-NOT: uxtah
+; CHECK-DSP: uxtah r0, r0, r1, ror #24
+; CHECK-NO-DSP-NOT: uxtah
   %lshr = lshr i32 %X, 24
   %shl = shl i32 %X, 8
   %or = or i32 %lshr, %shl
diff --git a/test/CodeGen/Thumb2/thumb2-uxtb.ll b/test/CodeGen/Thumb2/thumb2-uxtb.ll
index b8b1bc832d962af361783d2760d13e9ae5d6878e..af4532cf6f3d8fd29ee80cfca26cead5be655b97 100644
--- a/test/CodeGen/Thumb2/thumb2-uxtb.ll
+++ b/test/CodeGen/Thumb2/thumb2-uxtb.ll
@@ -1,72 +1,63 @@
-; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s -check-prefix=ARMv7A
-; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-m3 %s -o - | FileCheck %s -check-prefix=ARMv7M
+; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-a8 %s -o - | FileCheck %s -check-prefix=CHECK-DSP
+; RUN: llc -mtriple=thumb-eabi -mcpu=cortex-m3 %s -o - | FileCheck %s -check-prefix=CHECK-NO-DSP
+; RUN: llc -mtriple=thumbv7em-eabi %s -o - | FileCheck %s -check-prefix=CHECK-DSP
+; RUN: llc -mtriple=thumbv8m.main-none-eabi %s -o - | FileCheck %s -check-prefix=CHECK-NO-DSP
+; RUN: llc -mtriple=thumbv8m.main-none-eabi -mattr=+dsp %s -o - | FileCheck %s -check-prefix=CHECK-DSP
 
 define i32 @test1(i32 %x) {
-; ARMv7A: test1
-; ARMv7A: uxtb16 r0, r0
-
-; ARMv7M: test1
-; ARMv7M: bic r0, r0, #-16711936
+; CHECK-LABEL: test1
+; CHECK-DSP: uxtb16 r0, r0
+; CHECK-NO-DSP: bic r0, r0, #-16711936
 	%tmp1 = and i32 %x, 16711935		; <i32> [#uses=1]
 	ret i32 %tmp1
 }
 
 ; PR7503
 define i32 @test2(i32 %x) {
-; ARMv7A: test2
-; ARMv7A: uxtb16  r0, r0, ror #8
-
-; ARMv7M: test2
-; ARMv7M: mov.w r1, #16711935
-; ARMv7M: and.w r0, r1, r0, lsr #8
+; CHECK-LABEL: test2
+; CHECK-DSP: uxtb16  r0, r0, ror #8
+; CHECK-NO-DSP: mov.w r1, #16711935
+; CHECK-NO-DSP: and.w r0, r1, r0, lsr #8
 	%tmp1 = lshr i32 %x, 8		; <i32> [#uses=1]
 	%tmp2 = and i32 %tmp1, 16711935		; <i32> [#uses=1]
 	ret i32 %tmp2
 }
 
 define i32 @test3(i32 %x) {
-; ARMv7A: test3
-; ARMv7A: uxtb16  r0, r0, ror #8
-
-; ARMv7M: test3
-; ARMv7M: mov.w r1, #16711935
-; ARMv7M: and.w r0, r1, r0, lsr #8
+; CHECK-LABEL: test3
+; CHECK-DSP: uxtb16  r0, r0, ror #8
+; CHECK-NO-DSP: mov.w r1, #16711935
+; CHECK-NO-DSP: and.w r0, r1, r0, lsr #8
 	%tmp1 = lshr i32 %x, 8		; <i32> [#uses=1]
 	%tmp2 = and i32 %tmp1, 16711935		; <i32> [#uses=1]
 	ret i32 %tmp2
 }
 
 define i32 @test4(i32 %x) {
-; ARMv7A: test4
-; ARMv7A: uxtb16  r0, r0, ror #8
-
-; ARMv7M: test4
-; ARMv7M: mov.w r1, #16711935
-; ARMv7M: and.w r0, r1, r0, lsr #8
+; CHECK-LABEL: test4
+; CHECK-DSP: uxtb16  r0, r0, ror #8
+; CHECK-NO-DSP: mov.w r1, #16711935
+; CHECK-NO-DSP: and.w r0, r1, r0, lsr #8
 	%tmp1 = lshr i32 %x, 8		; <i32> [#uses=1]
 	%tmp6 = and i32 %tmp1, 16711935		; <i32> [#uses=1]
 	ret i32 %tmp6
 }
 
 define i32 @test5(i32 %x) {
-; ARMv7A: test5
-; ARMv7A: uxtb16  r0, r0, ror #8
-
-; ARMv7M: test5
-; ARMv7M: mov.w r1, #16711935
-; ARMv7M: and.w r0, r1, r0, lsr #8
+; CHECK-LABEL: test5
+; CHECK-DSP: uxtb16  r0, r0, ror #8
+; CHECK-NO-DSP: mov.w r1, #16711935
+; CHECK-NO-DSP: and.w r0, r1, r0, lsr #8
 	%tmp1 = lshr i32 %x, 8		; <i32> [#uses=1]
 	%tmp2 = and i32 %tmp1, 16711935		; <i32> [#uses=1]
 	ret i32 %tmp2
 }
 
 define i32 @test6(i32 %x) {
-; ARMv7A: test6
-; ARMv7A: uxtb16  r0, r0, ror #16
-
-; ARMv7M: test6
-; ARMv7M: mov.w r1, #16711935
-; ARMv7M: and.w r0, r1, r0, ror #16
+; CHECK-LABEL: test6
+; CHECK-DSP: uxtb16  r0, r0, ror #16
+; CHECK-NO-DSP: mov.w r1, #16711935
+; CHECK-NO-DSP: and.w r0, r1, r0, ror #16
 	%tmp1 = lshr i32 %x, 16		; <i32> [#uses=1]
 	%tmp2 = and i32 %tmp1, 255		; <i32> [#uses=1]
 	%tmp4 = shl i32 %x, 16		; <i32> [#uses=1]
@@ -76,12 +67,10 @@ define i32 @test6(i32 %x) {
 }
 
 define i32 @test7(i32 %x) {
-; ARMv7A: test7
-; ARMv7A: uxtb16  r0, r0, ror #16
-
-; ARMv7M: test7
-; ARMv7M: mov.w r1, #16711935
-; ARMv7M: and.w r0, r1, r0, ror #16
+; CHECK-LABEL: test7
+; CHECK-DSP: uxtb16  r0, r0, ror #16
+; CHECK-NO-DSP: mov.w r1, #16711935
+; CHECK-NO-DSP: and.w r0, r1, r0, ror #16
 	%tmp1 = lshr i32 %x, 16		; <i32> [#uses=1]
 	%tmp2 = and i32 %tmp1, 255		; <i32> [#uses=1]
 	%tmp4 = shl i32 %x, 16		; <i32> [#uses=1]
@@ -91,12 +80,10 @@ define i32 @test7(i32 %x) {
 }
 
 define i32 @test8(i32 %x) {
-; ARMv7A: test8
-; ARMv7A: uxtb16  r0, r0, ror #24
-
-; ARMv7M: test8
-; ARMv7M: mov.w r1, #16711935
-; ARMv7M: and.w r0, r1, r0, ror #24
+; CHECK-LABEL: test8
+; CHECK-DSP: uxtb16  r0, r0, ror #24
+; CHECK-NO-DSP: mov.w r1, #16711935
+; CHECK-NO-DSP: and.w r0, r1, r0, ror #24
 	%tmp1 = shl i32 %x, 8		; <i32> [#uses=1]
 	%tmp2 = and i32 %tmp1, 16711680		; <i32> [#uses=1]
 	%tmp5 = lshr i32 %x, 24		; <i32> [#uses=1]
@@ -105,12 +92,10 @@ define i32 @test8(i32 %x) {
 }
 
 define i32 @test9(i32 %x) {
-; ARMv7A: test9
-; ARMv7A: uxtb16  r0, r0, ror #24
-
-; ARMv7M: test9
-; ARMv7M: mov.w r1, #16711935
-; ARMv7M: and.w r0, r1, r0, ror #24
+; CHECK-LABEL: test9
+; CHECK-DSP: uxtb16  r0, r0, ror #24
+; CHECK-NO-DSP: mov.w r1, #16711935
+; CHECK-NO-DSP: and.w r0, r1, r0, ror #24
 	%tmp1 = lshr i32 %x, 24		; <i32> [#uses=1]
 	%tmp4 = shl i32 %x, 8		; <i32> [#uses=1]
 	%tmp5 = and i32 %tmp4, 16711680		; <i32> [#uses=1]
@@ -119,19 +104,18 @@ define i32 @test9(i32 %x) {
 }
 
 define i32 @test10(i32 %p0) {
-; ARMv7A: test10
-; ARMv7A: mov.w r1, #16253176
-; ARMv7A: and.w r0, r1, r0, lsr #7
-; ARMv7A: lsrs  r1, r0, #5
-; ARMv7A: uxtb16  r1, r1
-; ARMv7A: orrs r0, r1
-
-; ARMv7M: test10
-; ARMv7M: mov.w r1, #16253176
-; ARMv7M: and.w r0, r1, r0, lsr #7
-; ARMv7M: mov.w r1, #458759
-; ARMv7M: and.w r1, r1, r0, lsr #5
-; ARMv7M: orrs r0, r1
+; CHECK-LABEL: test10
+; CHECK-DSP: mov.w r1, #16253176
+; CHECK-DSP: and.w r0, r1, r0, lsr #7
+; CHECK-DSP: lsrs  r1, r0, #5
+; CHECK-DSP: uxtb16  r1, r1
+; CHECK-DSP: orrs r0, r1
+
+; CHECK-NO-DSP: mov.w r1, #16253176
+; CHECK-NO-DSP: and.w r0, r1, r0, lsr #7
+; CHECK-NO-DSP: mov.w r1, #458759
+; CHECK-NO-DSP: and.w r1, r1, r0, lsr #5
+; CHECK-NO-DSP: orrs r0, r1
 	%tmp1 = lshr i32 %p0, 7		; <i32> [#uses=1]
 	%tmp2 = and i32 %tmp1, 16253176		; <i32> [#uses=2]
 	%tmp4 = lshr i32 %tmp2, 5		; <i32> [#uses=1]
diff --git a/test/CodeGen/Thumb2/v8_IT_4.ll b/test/CodeGen/Thumb2/v8_IT_4.ll
index 5a80d8cd7b4e8df36e035d43c9ec19882e1fa05d..5901a8e81cafa777c83482da83ebc27eb2cabb2b 100644
--- a/test/CodeGen/Thumb2/v8_IT_4.ll
+++ b/test/CodeGen/Thumb2/v8_IT_4.ll
@@ -12,10 +12,11 @@
 
 define weak arm_aapcs_vfpcc i32 @_ZNKSs7compareERKSs(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this, %"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %__str) {
 ; CHECK-LABEL: _ZNKSs7compareERKSs:
-; CHECK:      cbnz	r0,
+; CHECK:      cbz	r0,
+; CHECK-NEXT: %bb1
+; CHECK-NEXT: pop.w
 ; CHECK-NEXT: %bb
 ; CHECK-NEXT: sub{{(.w)?}} r0, r{{[0-9]+}}, r{{[0-9]+}}
-; CHECK-NEXT: %bb1
 ; CHECK-NEXT: pop.w
 entry:
   %0 = tail call arm_aapcs_vfpcc  i32 @_ZNKSs4sizeEv(%"struct.std::basic_string<char,std::char_traits<char>,std::allocator<char> >"* %this) ; <i32> [#uses=3]
diff --git a/test/CodeGen/WebAssembly/address-offsets.ll b/test/CodeGen/WebAssembly/address-offsets.ll
index b9efec86f0da97f5c4bd37bf71d08273ee325344..da198978fc2f82d09fbcec527432c3c592927713 100644
--- a/test/CodeGen/WebAssembly/address-offsets.ll
+++ b/test/CodeGen/WebAssembly/address-offsets.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals | FileCheck %s
 
 ; Test folding constant offsets and symbols into load and store addresses under
 ; a variety of circumstances.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 @g = external global [0 x i32], align 4
 
diff --git a/test/CodeGen/WebAssembly/byval.ll b/test/CodeGen/WebAssembly/byval.ll
index 7a995769a8e75eb9b1dada69c20367604a31091d..907320d7977c7a1560612920ba2eaa4e65560054 100644
--- a/test/CodeGen/WebAssembly/byval.ll
+++ b/test/CodeGen/WebAssembly/byval.ll
@@ -1,8 +1,8 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs -fast-isel | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -verify-machineinstrs -fast-isel | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 %SmallStruct = type { i32 }
 %OddStruct = type { i32, i8, i32 }
@@ -23,15 +23,13 @@ declare void @ext_byval_func_empty(%EmptyStruct* byval)
 ; CHECK-LABEL: byval_arg
 define void @byval_arg(%SmallStruct* %ptr) {
  ; CHECK: .param i32
- ; CHECK: i32.const $push[[L4:.+]]=, 0
  ; Subtract 16 from SP (SP is 16-byte aligned)
- ; CHECK: i32.const $push[[L1:.+]]=, 0
- ; CHECK-NEXT: i32.load $push[[L2:.+]]=, __stack_pointer($pop[[L1]])
+ ; CHECK-NEXT: get_global $push[[L2:.+]]=, 0
  ; CHECK-NEXT: i32.const $push[[L3:.+]]=, 16
  ; CHECK-NEXT: i32.sub $push[[L11:.+]]=, $pop[[L2]], $pop[[L3]]
  ; Ensure SP is stored back before the call
  ; CHECK-NEXT: tee_local $push[[L10:.+]]=, $[[SP:.+]]=, $pop[[L11]]{{$}}
- ; CHECK-NEXT: i32.store __stack_pointer($pop[[L4]]), $pop[[L10]]{{$}}
+ ; CHECK-NEXT: set_global 0, $pop[[L10]]{{$}}
  ; Copy the SmallStruct argument to the stack (SP+12, original SP-4)
  ; CHECK-NEXT: i32.load $push[[L0:.+]]=, 0($0)
  ; CHECK-NEXT: i32.store 12($[[SP]]), $pop[[L0]]
@@ -41,10 +39,9 @@ define void @byval_arg(%SmallStruct* %ptr) {
  ; CHECK-NEXT: call ext_byval_func@FUNCTION, $pop[[ARG]]{{$}}
  call void @ext_byval_func(%SmallStruct* byval %ptr)
  ; Restore the stack
- ; CHECK-NEXT: i32.const $push[[L7:.+]]=, 0
  ; CHECK-NEXT: i32.const $push[[L6:.+]]=, 16
  ; CHECK-NEXT: i32.add $push[[L8:.+]]=, $[[SP]], $pop[[L6]]
- ; CHECK-NEXT: i32.store __stack_pointer($pop[[L7]]), $pop[[L8]]
+ ; CHECK-NEXT: set_global 0, $pop[[L8]]
  ; CHECK-NEXT: return
  ret void
 }
@@ -56,7 +53,7 @@ define void @byval_arg_align8(%SmallStruct* %ptr) {
  ; CHECK: i32.const $push[[L1:.+]]=, 16
  ; CHECK-NEXT: i32.sub $push[[L11:.+]]=, {{.+}}, $pop[[L1]]
  ; CHECK-NEXT: tee_local $push[[L10:.+]]=, $[[SP:.+]]=, $pop[[L11]]{{$}}
- ; CHECK-NEXT: i32.store __stack_pointer($pop{{.+}}), $pop[[L10]]{{$}}
+ ; CHECK-NEXT: set_global 0, $pop[[L10]]{{$}}
  ; Copy the SmallStruct argument to the stack (SP+8, original SP-8)
  ; CHECK-NEXT: i32.load $push[[L0:.+]]=, 0($0){{$}}
  ; CHECK-NEXT: i32.store 8($[[SP]]), $pop[[L0]]{{$}}
@@ -75,7 +72,7 @@ define void @byval_arg_double(%AlignedStruct* %ptr) {
  ; CHECK: i32.const $push[[L1:.+]]=, 16
  ; CHECK-NEXT: i32.sub $push[[L14:.+]]=, {{.+}}, $pop[[L1]]
  ; CHECK-NEXT: tee_local $push[[L13:.+]]=, $[[SP:.+]]=, $pop[[L14]]
- ; CHECK-NEXT: i32.store {{.+}}, $pop[[L13]]
+ ; CHECK-NEXT: set_global 0, $pop[[L13]]
  ; Copy the AlignedStruct argument to the stack (SP+0, original SP-16)
  ; Just check the last load/store pair of the memcpy
  ; CHECK: i64.load $push[[L4:.+]]=, 0($0)
@@ -113,13 +110,11 @@ define void @byval_empty_callee(%EmptyStruct* byval %ptr) {
 
 ; Call memcpy for "big" byvals.
 ; CHECK-LABEL: big_byval:
-; CHECK: i32.const $push[[L4:.+]]=, 0
-; CHECK: i32.const $push[[L1:.+]]=, 0
-; CHECK-NEXT: i32.load $push[[L2:.+]]=, __stack_pointer($pop[[L1]])
+; CHECK:      get_global $push[[L2:.+]]=, 0{{$}}
 ; CHECK-NEXT: i32.const $push[[L3:.+]]=, 131072
 ; CHECK-NEXT: i32.sub $push[[L11:.+]]=, $pop[[L2]], $pop[[L3]]
 ; CHECK-NEXT: tee_local $push[[L10:.+]]=, $[[SP:.+]]=, $pop[[L11]]{{$}}
-; CHECK-NEXT: i32.store __stack_pointer($pop[[L4]]), $pop[[L10]]{{$}}
+; CHECK-NEXT: set_global 0, $pop[[L10]]{{$}}
 ; CHECK-NEXT: i32.const $push[[L0:.+]]=, 131072
 ; CHECK-NEXT: i32.call       $push[[L11:.+]]=, memcpy@FUNCTION, $[[SP]], ${{.+}}, $pop{{.+}}
 ; CHECK-NEXT: tee_local      $push[[L9:.+]]=, $[[SP:.+]]=, $pop[[L11]]{{$}}
diff --git a/test/CodeGen/WebAssembly/call.ll b/test/CodeGen/WebAssembly/call.ll
index 1a9d5b8fb8e6f55a1d138570afc82b72023d9853..1cf42242a6cc2fcbb9ebb782b107624b62950517 100644
--- a/test/CodeGen/WebAssembly/call.ll
+++ b/test/CodeGen/WebAssembly/call.ll
@@ -4,7 +4,7 @@
 ; Test that basic call operations assemble as expected.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 declare i32 @i32_nullary()
 declare i32 @i32_unary(i32)
@@ -61,7 +61,8 @@ define void @call_void_nullary() {
 ; CHECK-LABEL: call_i32_unary:
 ; CHECK-NEXT: .param i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: {{^}} i32.call $push[[NUM:[0-9]+]]=, i32_unary@FUNCTION, $0{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: {{^}} i32.call $push[[NUM:[0-9]+]]=, i32_unary@FUNCTION, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @call_i32_unary(i32 %a) {
   %r = call i32 @i32_unary(i32 %a)
@@ -71,7 +72,9 @@ define i32 @call_i32_unary(i32 %a) {
 ; CHECK-LABEL: call_i32_binary:
 ; CHECK-NEXT: .param i32, i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: {{^}} i32.call $push[[NUM:[0-9]+]]=, i32_binary@FUNCTION, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: {{^}} i32.call $push[[NUM:[0-9]+]]=, i32_binary@FUNCTION, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @call_i32_binary(i32 %a, i32 %b) {
   %r = call i32 @i32_binary(i32 %a, i32 %b)
@@ -80,7 +83,8 @@ define i32 @call_i32_binary(i32 %a, i32 %b) {
 
 ; CHECK-LABEL: call_indirect_void:
 ; CHECK-NEXT: .param i32{{$}}
-; CHECK-NEXT: {{^}} call_indirect $0{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: {{^}} call_indirect $pop[[L0]]{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @call_indirect_void(void ()* %callee) {
   call void %callee()
@@ -90,7 +94,8 @@ define void @call_indirect_void(void ()* %callee) {
 ; CHECK-LABEL: call_indirect_i32:
 ; CHECK-NEXT: .param i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: {{^}} i32.call_indirect $push[[NUM:[0-9]+]]=, $0{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: {{^}} i32.call_indirect $push[[NUM:[0-9]+]]=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @call_indirect_i32(i32 ()* %callee) {
   %t = call i32 %callee()
@@ -99,7 +104,9 @@ define i32 @call_indirect_i32(i32 ()* %callee) {
 
 ; CHECK-LABEL: call_indirect_arg:
 ; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: {{^}} call_indirect $1, $0{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: {{^}} call_indirect $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @call_indirect_arg(void (i32)* %callee, i32 %arg) {
   call void %callee(i32 %arg)
@@ -108,7 +115,11 @@ define void @call_indirect_arg(void (i32)* %callee, i32 %arg) {
 
 ; CHECK-LABEL: call_indirect_arg_2:
 ; CHECK-NEXT: .param i32, i32, i32{{$}}
-; CHECK-NEXT: {{^}} i32.call_indirect $drop=, $1, $2, $0{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 2{{$}}
+; CHECK-NEXT: get_local $push[[L2:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: {{^}} i32.call_indirect $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]], $pop[[L2]]{{$}}
+; CHECK-NEXT: drop $pop[[NUM]]{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @call_indirect_arg_2(i32 (i32, i32)* %callee, i32 %arg, i32 %arg2) {
   call i32 %callee(i32 %arg, i32 %arg2)
diff --git a/test/CodeGen/WebAssembly/cfg-stackify.ll b/test/CodeGen/WebAssembly/cfg-stackify.ll
index 3b42df190266fe3db6fb4ffa05db0841b1f5c71d..ae6dd7a34ef8f113b08626aadb1e0d09e693ddb3 100644
--- a/test/CodeGen/WebAssembly/cfg-stackify.ll
+++ b/test/CodeGen/WebAssembly/cfg-stackify.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0 -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 | FileCheck -check-prefix=OPT %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -disable-block-placement -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -tail-dup-placement=0 -verify-machineinstrs -fast-isel=false -machine-sink-split-probability-threshold=0 -cgp-freq-ratio-to-skip-merge=1000 | FileCheck -check-prefix=OPT %s
 
 ; Test the CFG stackifier pass.
 
@@ -7,7 +7,7 @@
 ; optnone test.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 declare void @something()
 
@@ -1144,7 +1144,6 @@ bb7:
 ; optnone to disable optimizations to test this case.
 
 ; CHECK-LABEL: test13:
-; CHECK-NEXT:  .local i32{{$}}
 ; CHECK-NEXT:  block   {{$}}
 ; CHECK-NEXT:  block   {{$}}
 ; CHECK:       br_if 0, $pop0{{$}}
@@ -1161,7 +1160,6 @@ bb7:
 ; CHECK-NEXT:  end_block{{$}}
 ; CHECK-NEXT:  unreachable{{$}}
 ; OPT-LABEL: test13:
-; OPT-NEXT:  .local i32{{$}}
 ; OPT-NEXT:  block   {{$}}
 ; OPT-NEXT:  block   {{$}}
 ; OPT:       br_if 0, $pop0{{$}}
diff --git a/test/CodeGen/WebAssembly/cfi.ll b/test/CodeGen/WebAssembly/cfi.ll
index e5664ba73a0d57d738a53503f8e8cc5d2517e9dd..992e0f0c63d8be5d726c2f0362efee4c9d36dfa8 100644
--- a/test/CodeGen/WebAssembly/cfi.ll
+++ b/test/CodeGen/WebAssembly/cfi.ll
@@ -3,7 +3,7 @@
 ; Tests that we correctly assign indexes for control flow integrity.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 @0 = private unnamed_addr constant [2 x void (...)*] [void (...)* bitcast (void ()* @f to void (...)*), void (...)* bitcast (void ()* @g to void (...)*)], align 16
 
diff --git a/test/CodeGen/WebAssembly/comparisons_f32.ll b/test/CodeGen/WebAssembly/comparisons_f32.ll
index 10e037d57a7a27e7607dc2becd071a043500a7eb..8051b25689ddbe21954f52fb0fbbcfcde8ba5629 100644
--- a/test/CodeGen/WebAssembly/comparisons_f32.ll
+++ b/test/CodeGen/WebAssembly/comparisons_f32.ll
@@ -4,13 +4,17 @@
 ; expected.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: ord_f32:
 ; CHECK-NEXT: .param f32, f32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: f32.eq $push[[NUM0:[0-9]+]]=, $0, $0{{$}}
-; CHECK-NEXT: f32.eq $push[[NUM1:[0-9]+]]=, $1, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: f32.eq $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; CHECK-NEXT: get_local $push[[L2:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: get_local $push[[L3:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f32.eq $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
 ; CHECK-NEXT: i32.and $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM2]]{{$}}
 define i32 @ord_f32(float %x, float %y) {
@@ -22,8 +26,12 @@ define i32 @ord_f32(float %x, float %y) {
 ; CHECK-LABEL: uno_f32:
 ; CHECK-NEXT: .param f32, f32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: f32.ne $push[[NUM0:[0-9]+]]=, $0, $0{{$}}
-; CHECK-NEXT: f32.ne $push[[NUM1:[0-9]+]]=, $1, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: f32.ne $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; CHECK-NEXT: get_local $push[[L2:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: get_local $push[[L3:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f32.ne $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM2]]{{$}}
 define i32 @uno_f32(float %x, float %y) {
@@ -35,7 +43,9 @@ define i32 @uno_f32(float %x, float %y) {
 ; CHECK-LABEL: oeq_f32:
 ; CHECK-NEXT: .param f32, f32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: f32.eq $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f32.eq $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @oeq_f32(float %x, float %y) {
   %a = fcmp oeq float %x, %y
@@ -44,7 +54,7 @@ define i32 @oeq_f32(float %x, float %y) {
 }
 
 ; CHECK-LABEL: une_f32:
-; CHECK: f32.ne $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: f32.ne $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @une_f32(float %x, float %y) {
   %a = fcmp une float %x, %y
@@ -53,7 +63,7 @@ define i32 @une_f32(float %x, float %y) {
 }
 
 ; CHECK-LABEL: olt_f32:
-; CHECK: f32.lt $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: f32.lt $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @olt_f32(float %x, float %y) {
   %a = fcmp olt float %x, %y
@@ -62,7 +72,7 @@ define i32 @olt_f32(float %x, float %y) {
 }
 
 ; CHECK-LABEL: ole_f32:
-; CHECK: f32.le $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: f32.le $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ole_f32(float %x, float %y) {
   %a = fcmp ole float %x, %y
@@ -71,7 +81,7 @@ define i32 @ole_f32(float %x, float %y) {
 }
 
 ; CHECK-LABEL: ogt_f32:
-; CHECK: f32.gt $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: f32.gt $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ogt_f32(float %x, float %y) {
   %a = fcmp ogt float %x, %y
@@ -80,7 +90,7 @@ define i32 @ogt_f32(float %x, float %y) {
 }
 
 ; CHECK-LABEL: oge_f32:
-; CHECK: f32.ge $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: f32.ge $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @oge_f32(float %x, float %y) {
   %a = fcmp oge float %x, %y
@@ -93,9 +103,15 @@ define i32 @oge_f32(float %x, float %y) {
 ; CHECK-LABEL: ueq_f32:
 ; CHECK-NEXT: .param f32, f32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: f32.eq $push[[NUM0:[0-9]+]]=, $0, $1{{$}}
-; CHECK-NEXT: f32.ne $push[[NUM1:[0-9]+]]=, $0, $0{{$}}
-; CHECK-NEXT: f32.ne $push[[NUM2:[0-9]+]]=, $1, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f32.eq $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; CHECK-NEXT: get_local $push[[L2:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L3:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: f32.ne $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
+; CHECK-NEXT: get_local $push[[L4:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: get_local $push[[L5:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f32.ne $push[[NUM2:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM3:[0-9]+]]=, $pop[[NUM1]], $pop[[NUM2]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM4:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM3]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM4]]{{$}}
@@ -108,9 +124,15 @@ define i32 @ueq_f32(float %x, float %y) {
 ; CHECK-LABEL: one_f32:
 ; CHECK-NEXT: .param f32, f32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: f32.ne $push[[NUM0:[0-9]+]]=, $0, $1{{$}}
-; CHECK-NEXT: f32.eq $push[[NUM1:[0-9]+]]=, $0, $0{{$}}
-; CHECK-NEXT: f32.eq $push[[NUM2:[0-9]+]]=, $1, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f32.ne $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; CHECK-NEXT: get_local $push[[L2:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L3:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: f32.eq $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
+; CHECK-NEXT: get_local $push[[L4:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: get_local $push[[L5:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f32.eq $push[[NUM2:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
 ; CHECK-NEXT: i32.and $push[[NUM3:[0-9]+]]=, $pop[[NUM1]], $pop[[NUM2]]{{$}}
 ; CHECK-NEXT: i32.and $push[[NUM4:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM3]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM4]]
@@ -123,9 +145,15 @@ define i32 @one_f32(float %x, float %y) {
 ; CHECK-LABEL: ult_f32:
 ; CHECK-NEXT: .param f32, f32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: f32.lt $push[[NUM0:[0-9]+]]=, $0, $1{{$}}
-; CHECK-NEXT: f32.ne $push[[NUM1:[0-9]+]]=, $0, $0{{$}}
-; CHECK-NEXT: f32.ne $push[[NUM2:[0-9]+]]=, $1, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f32.lt $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; CHECK-NEXT: get_local $push[[L2:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L3:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: f32.ne $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
+; CHECK-NEXT: get_local $push[[L4:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: get_local $push[[L5:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f32.ne $push[[NUM2:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM3:[0-9]+]]=, $pop[[NUM1]], $pop[[NUM2]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM4:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM3]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM4]]{{$}}
@@ -138,9 +166,15 @@ define i32 @ult_f32(float %x, float %y) {
 ; CHECK-LABEL: ule_f32:
 ; CHECK-NEXT: .param f32, f32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: f32.le $push[[NUM0:[0-9]+]]=, $0, $1{{$}}
-; CHECK-NEXT: f32.ne $push[[NUM1:[0-9]+]]=, $0, $0{{$}}
-; CHECK-NEXT: f32.ne $push[[NUM2:[0-9]+]]=, $1, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f32.le $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; CHECK-NEXT: get_local $push[[L2:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L3:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: f32.ne $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
+; CHECK-NEXT: get_local $push[[L4:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: get_local $push[[L5:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f32.ne $push[[NUM2:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM3:[0-9]+]]=, $pop[[NUM1]], $pop[[NUM2]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM4:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM3]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM4]]{{$}}
@@ -153,9 +187,15 @@ define i32 @ule_f32(float %x, float %y) {
 ; CHECK-LABEL: ugt_f32:
 ; CHECK-NEXT: .param f32, f32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: f32.gt $push[[NUM0:[0-9]+]]=, $0, $1{{$}}
-; CHECK-NEXT: f32.ne $push[[NUM1:[0-9]+]]=, $0, $0{{$}}
-; CHECK-NEXT: f32.ne $push[[NUM2:[0-9]+]]=, $1, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f32.gt $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; CHECK-NEXT: get_local $push[[L2:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L3:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: f32.ne $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
+; CHECK-NEXT: get_local $push[[L4:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: get_local $push[[L5:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f32.ne $push[[NUM2:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM3:[0-9]+]]=, $pop[[NUM1]], $pop[[NUM2]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM4:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM3]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM4]]{{$}}
@@ -168,9 +208,15 @@ define i32 @ugt_f32(float %x, float %y) {
 ; CHECK-LABEL: uge_f32:
 ; CHECK-NEXT: .param f32, f32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: f32.ge $push[[NUM0:[0-9]+]]=, $0, $1{{$}}
-; CHECK-NEXT: f32.ne $push[[NUM1:[0-9]+]]=, $0, $0{{$}}
-; CHECK-NEXT: f32.ne $push[[NUM2:[0-9]+]]=, $1, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f32.ge $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; CHECK-NEXT: get_local $push[[L2:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L3:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: f32.ne $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
+; CHECK-NEXT: get_local $push[[L4:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: get_local $push[[L5:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f32.ne $push[[NUM2:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM3:[0-9]+]]=, $pop[[NUM1]], $pop[[NUM2]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM4:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM3]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM4]]{{$}}
diff --git a/test/CodeGen/WebAssembly/comparisons_f64.ll b/test/CodeGen/WebAssembly/comparisons_f64.ll
index 7d038a09ccbf57843afdbcb0788e0e201d8e9aeb..6694f989627f2578aded37483cc9dab1e8e80217 100644
--- a/test/CodeGen/WebAssembly/comparisons_f64.ll
+++ b/test/CodeGen/WebAssembly/comparisons_f64.ll
@@ -4,13 +4,17 @@
 ; expected.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: ord_f64:
 ; CHECK-NEXT: .param f64, f64{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: f64.eq $push[[NUM0:[0-9]+]]=, $0, $0{{$}}
-; CHECK-NEXT: f64.eq $push[[NUM1:[0-9]+]]=, $1, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: f64.eq $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; CHECK-NEXT: get_local $push[[L2:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: get_local $push[[L3:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f64.eq $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
 ; CHECK-NEXT: i32.and $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM2]]{{$}}
 define i32 @ord_f64(double %x, double %y) {
@@ -22,8 +26,12 @@ define i32 @ord_f64(double %x, double %y) {
 ; CHECK-LABEL: uno_f64:
 ; CHECK-NEXT: .param f64, f64{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: f64.ne $push[[NUM0:[0-9]+]]=, $0, $0{{$}}
-; CHECK-NEXT: f64.ne $push[[NUM1:[0-9]+]]=, $1, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: f64.ne $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; CHECK-NEXT: get_local $push[[L2:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: get_local $push[[L3:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f64.ne $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM2]]{{$}}
 define i32 @uno_f64(double %x, double %y) {
@@ -35,7 +43,9 @@ define i32 @uno_f64(double %x, double %y) {
 ; CHECK-LABEL: oeq_f64:
 ; CHECK-NEXT: .param f64, f64{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: f64.eq $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f64.eq $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @oeq_f64(double %x, double %y) {
   %a = fcmp oeq double %x, %y
@@ -44,7 +54,7 @@ define i32 @oeq_f64(double %x, double %y) {
 }
 
 ; CHECK-LABEL: une_f64:
-; CHECK: f64.ne $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: f64.ne $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @une_f64(double %x, double %y) {
   %a = fcmp une double %x, %y
@@ -53,7 +63,7 @@ define i32 @une_f64(double %x, double %y) {
 }
 
 ; CHECK-LABEL: olt_f64:
-; CHECK: f64.lt $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: f64.lt $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @olt_f64(double %x, double %y) {
   %a = fcmp olt double %x, %y
@@ -62,7 +72,7 @@ define i32 @olt_f64(double %x, double %y) {
 }
 
 ; CHECK-LABEL: ole_f64:
-; CHECK: f64.le $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: f64.le $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ole_f64(double %x, double %y) {
   %a = fcmp ole double %x, %y
@@ -71,7 +81,7 @@ define i32 @ole_f64(double %x, double %y) {
 }
 
 ; CHECK-LABEL: ogt_f64:
-; CHECK: f64.gt $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: f64.gt $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ogt_f64(double %x, double %y) {
   %a = fcmp ogt double %x, %y
@@ -80,7 +90,7 @@ define i32 @ogt_f64(double %x, double %y) {
 }
 
 ; CHECK-LABEL: oge_f64:
-; CHECK: f64.ge $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: f64.ge $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @oge_f64(double %x, double %y) {
   %a = fcmp oge double %x, %y
@@ -93,9 +103,15 @@ define i32 @oge_f64(double %x, double %y) {
 ; CHECK-LABEL: ueq_f64:
 ; CHECK-NEXT: .param f64, f64{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: f64.eq $push[[NUM0:[0-9]+]]=, $0, $1{{$}}
-; CHECK-NEXT: f64.ne $push[[NUM1:[0-9]+]]=, $0, $0{{$}}
-; CHECK-NEXT: f64.ne $push[[NUM2:[0-9]+]]=, $1, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f64.eq $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; CHECK-NEXT: get_local $push[[L2:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L3:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: f64.ne $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
+; CHECK-NEXT: get_local $push[[L4:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: get_local $push[[L5:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f64.ne $push[[NUM2:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM3:[0-9]+]]=, $pop[[NUM1]], $pop[[NUM2]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM4:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM3]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM4]]{{$}}
@@ -108,9 +124,15 @@ define i32 @ueq_f64(double %x, double %y) {
 ; CHECK-LABEL: one_f64:
 ; CHECK-NEXT: .param f64, f64{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: f64.ne $push[[NUM0:[0-9]+]]=, $0, $1{{$}}
-; CHECK-NEXT: f64.eq $push[[NUM1:[0-9]+]]=, $0, $0{{$}}
-; CHECK-NEXT: f64.eq $push[[NUM2:[0-9]+]]=, $1, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f64.ne $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; CHECK-NEXT: get_local $push[[L2:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L3:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: f64.eq $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
+; CHECK-NEXT: get_local $push[[L4:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: get_local $push[[L5:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f64.eq $push[[NUM2:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
 ; CHECK-NEXT: i32.and $push[[NUM3:[0-9]+]]=, $pop[[NUM1]], $pop[[NUM2]]{{$}}
 ; CHECK-NEXT: i32.and $push[[NUM4:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM3]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM4]]
@@ -123,9 +145,15 @@ define i32 @one_f64(double %x, double %y) {
 ; CHECK-LABEL: ult_f64:
 ; CHECK-NEXT: .param f64, f64{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: f64.lt $push[[NUM0:[0-9]+]]=, $0, $1{{$}}
-; CHECK-NEXT: f64.ne $push[[NUM1:[0-9]+]]=, $0, $0{{$}}
-; CHECK-NEXT: f64.ne $push[[NUM2:[0-9]+]]=, $1, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f64.lt $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; CHECK-NEXT: get_local $push[[L2:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L3:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: f64.ne $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
+; CHECK-NEXT: get_local $push[[L4:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: get_local $push[[L5:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f64.ne $push[[NUM2:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM3:[0-9]+]]=, $pop[[NUM1]], $pop[[NUM2]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM4:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM3]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM4]]{{$}}
@@ -138,9 +166,15 @@ define i32 @ult_f64(double %x, double %y) {
 ; CHECK-LABEL: ule_f64:
 ; CHECK-NEXT: .param f64, f64{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: f64.le $push[[NUM0:[0-9]+]]=, $0, $1{{$}}
-; CHECK-NEXT: f64.ne $push[[NUM1:[0-9]+]]=, $0, $0{{$}}
-; CHECK-NEXT: f64.ne $push[[NUM2:[0-9]+]]=, $1, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f64.le $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; CHECK-NEXT: get_local $push[[L2:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L3:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: f64.ne $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
+; CHECK-NEXT: get_local $push[[L4:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: get_local $push[[L5:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f64.ne $push[[NUM2:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM3:[0-9]+]]=, $pop[[NUM1]], $pop[[NUM2]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM4:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM3]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM4]]{{$}}
@@ -153,9 +187,15 @@ define i32 @ule_f64(double %x, double %y) {
 ; CHECK-LABEL: ugt_f64:
 ; CHECK-NEXT: .param f64, f64{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: f64.gt $push[[NUM0:[0-9]+]]=, $0, $1{{$}}
-; CHECK-NEXT: f64.ne $push[[NUM1:[0-9]+]]=, $0, $0{{$}}
-; CHECK-NEXT: f64.ne $push[[NUM2:[0-9]+]]=, $1, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f64.gt $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; CHECK-NEXT: get_local $push[[L2:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L3:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: f64.ne $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
+; CHECK-NEXT: get_local $push[[L4:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: get_local $push[[L5:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f64.ne $push[[NUM2:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM3:[0-9]+]]=, $pop[[NUM1]], $pop[[NUM2]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM4:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM3]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM4]]{{$}}
@@ -168,9 +208,15 @@ define i32 @ugt_f64(double %x, double %y) {
 ; CHECK-LABEL: uge_f64:
 ; CHECK-NEXT: .param f64, f64{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: f64.ge $push[[NUM0:[0-9]+]]=, $0, $1{{$}}
-; CHECK-NEXT: f64.ne $push[[NUM1:[0-9]+]]=, $0, $0{{$}}
-; CHECK-NEXT: f64.ne $push[[NUM2:[0-9]+]]=, $1, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f64.ge $push[[NUM0:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; CHECK-NEXT: get_local $push[[L2:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L3:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: f64.ne $push[[NUM1:[0-9]+]]=, $pop[[L2]], $pop[[L3]]{{$}}
+; CHECK-NEXT: get_local $push[[L4:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: get_local $push[[L5:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f64.ne $push[[NUM2:[0-9]+]]=, $pop[[L4]], $pop[[L5]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM3:[0-9]+]]=, $pop[[NUM1]], $pop[[NUM2]]{{$}}
 ; CHECK-NEXT: i32.or $push[[NUM4:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM3]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM4]]{{$}}
diff --git a/test/CodeGen/WebAssembly/comparisons_i32.ll b/test/CodeGen/WebAssembly/comparisons_i32.ll
index d2ba73f79a3d7f23edf1e37da73f76b84406e18a..a9a79c24fb47734b5616db75499b456327622cbc 100644
--- a/test/CodeGen/WebAssembly/comparisons_i32.ll
+++ b/test/CodeGen/WebAssembly/comparisons_i32.ll
@@ -4,12 +4,14 @@
 ; Test that basic 32-bit integer comparison operations assemble as expected.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: eq_i32:
 ; CHECK-NEXT: .param i32, i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.eq $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.eq $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @eq_i32(i32 %x, i32 %y) {
   %a = icmp eq i32 %x, %y
@@ -18,7 +20,7 @@ define i32 @eq_i32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: ne_i32:
-; CHECK: i32.ne $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: i32.ne $push[[NUM:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ne_i32(i32 %x, i32 %y) {
   %a = icmp ne i32 %x, %y
@@ -27,7 +29,7 @@ define i32 @ne_i32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: slt_i32:
-; CHECK: i32.lt_s $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: i32.lt_s $push[[NUM:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @slt_i32(i32 %x, i32 %y) {
   %a = icmp slt i32 %x, %y
@@ -36,7 +38,7 @@ define i32 @slt_i32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: sle_i32:
-; CHECK: i32.le_s $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: i32.le_s $push[[NUM:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @sle_i32(i32 %x, i32 %y) {
   %a = icmp sle i32 %x, %y
@@ -45,7 +47,7 @@ define i32 @sle_i32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: ult_i32:
-; CHECK: i32.lt_u $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: i32.lt_u $push[[NUM:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ult_i32(i32 %x, i32 %y) {
   %a = icmp ult i32 %x, %y
@@ -54,7 +56,7 @@ define i32 @ult_i32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: ule_i32:
-; CHECK: i32.le_u $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: i32.le_u $push[[NUM:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ule_i32(i32 %x, i32 %y) {
   %a = icmp ule i32 %x, %y
@@ -63,7 +65,7 @@ define i32 @ule_i32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: sgt_i32:
-; CHECK: i32.gt_s $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: i32.gt_s $push[[NUM:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @sgt_i32(i32 %x, i32 %y) {
   %a = icmp sgt i32 %x, %y
@@ -72,7 +74,7 @@ define i32 @sgt_i32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: sge_i32:
-; CHECK: i32.ge_s $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: i32.ge_s $push[[NUM:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @sge_i32(i32 %x, i32 %y) {
   %a = icmp sge i32 %x, %y
@@ -81,7 +83,7 @@ define i32 @sge_i32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: ugt_i32:
-; CHECK: i32.gt_u $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: i32.gt_u $push[[NUM:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ugt_i32(i32 %x, i32 %y) {
   %a = icmp ugt i32 %x, %y
@@ -90,7 +92,7 @@ define i32 @ugt_i32(i32 %x, i32 %y) {
 }
 
 ; CHECK-LABEL: uge_i32:
-; CHECK: i32.ge_u $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: i32.ge_u $push[[NUM:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @uge_i32(i32 %x, i32 %y) {
   %a = icmp uge i32 %x, %y
diff --git a/test/CodeGen/WebAssembly/comparisons_i64.ll b/test/CodeGen/WebAssembly/comparisons_i64.ll
index 80950ae5cd9abf5781ff746544c4f31671c6d451..106520483c8fd1225ff7c1685eafc2e5dff3eeb1 100644
--- a/test/CodeGen/WebAssembly/comparisons_i64.ll
+++ b/test/CodeGen/WebAssembly/comparisons_i64.ll
@@ -4,12 +4,14 @@
 ; Test that basic 64-bit integer comparison operations assemble as expected.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: eq_i64:
 ; CHECK-NEXT: .param i64, i64{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i64.eq $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i64.eq $push[[NUM:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @eq_i64(i64 %x, i64 %y) {
   %a = icmp eq i64 %x, %y
@@ -18,7 +20,7 @@ define i32 @eq_i64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: ne_i64:
-; CHECK: i64.ne $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: i64.ne $push[[NUM:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ne_i64(i64 %x, i64 %y) {
   %a = icmp ne i64 %x, %y
@@ -27,7 +29,7 @@ define i32 @ne_i64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: slt_i64:
-; CHECK: i64.lt_s $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: i64.lt_s $push[[NUM:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @slt_i64(i64 %x, i64 %y) {
   %a = icmp slt i64 %x, %y
@@ -36,7 +38,7 @@ define i32 @slt_i64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: sle_i64:
-; CHECK: i64.le_s $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: i64.le_s $push[[NUM:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @sle_i64(i64 %x, i64 %y) {
   %a = icmp sle i64 %x, %y
@@ -45,7 +47,7 @@ define i32 @sle_i64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: ult_i64:
-; CHECK: i64.lt_u $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: i64.lt_u $push[[NUM:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ult_i64(i64 %x, i64 %y) {
   %a = icmp ult i64 %x, %y
@@ -54,7 +56,7 @@ define i32 @ult_i64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: ule_i64:
-; CHECK: i64.le_u $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: i64.le_u $push[[NUM:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ule_i64(i64 %x, i64 %y) {
   %a = icmp ule i64 %x, %y
@@ -63,7 +65,7 @@ define i32 @ule_i64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: sgt_i64:
-; CHECK: i64.gt_s $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: i64.gt_s $push[[NUM:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @sgt_i64(i64 %x, i64 %y) {
   %a = icmp sgt i64 %x, %y
@@ -72,7 +74,7 @@ define i32 @sgt_i64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: sge_i64:
-; CHECK: i64.ge_s $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: i64.ge_s $push[[NUM:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @sge_i64(i64 %x, i64 %y) {
   %a = icmp sge i64 %x, %y
@@ -81,7 +83,7 @@ define i32 @sge_i64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: ugt_i64:
-; CHECK: i64.gt_u $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: i64.gt_u $push[[NUM:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ugt_i64(i64 %x, i64 %y) {
   %a = icmp ugt i64 %x, %y
@@ -90,7 +92,7 @@ define i32 @ugt_i64(i64 %x, i64 %y) {
 }
 
 ; CHECK-LABEL: uge_i64:
-; CHECK: i64.ge_u $push[[NUM:[0-9]+]]=, $0, $1{{$}}
+; CHECK: i64.ge_u $push[[NUM:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @uge_i64(i64 %x, i64 %y) {
   %a = icmp uge i64 %x, %y
diff --git a/test/CodeGen/WebAssembly/conv.ll b/test/CodeGen/WebAssembly/conv.ll
index 27cebb117dd40384d3d15c31b55110f4ef1e575a..913c4b0b19ea754ae027ea2b8d2bb030a988610e 100644
--- a/test/CodeGen/WebAssembly/conv.ll
+++ b/test/CodeGen/WebAssembly/conv.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals | FileCheck %s
 
 ; Test that basic conversion operations assemble as expected.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: i32_wrap_i64:
 ; CHECK-NEXT: .param i64{{$}}
diff --git a/test/CodeGen/WebAssembly/copysign-casts.ll b/test/CodeGen/WebAssembly/copysign-casts.ll
index f8e50d043ca91bda992a07d81d3f9bc9bfe90ca3..7cd40efafcd587fd004ad3f6630757bb2a8bf537 100644
--- a/test/CodeGen/WebAssembly/copysign-casts.ll
+++ b/test/CodeGen/WebAssembly/copysign-casts.ll
@@ -4,14 +4,14 @@
 ; unfolded.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 declare double @copysign(double, double) nounwind readnone
 declare float @copysignf(float, float) nounwind readnone
 
 ; CHECK-LABEL: fold_promote:
-; CHECK: f64.promote/f32 $push0=, $1{{$}}
-; CHECK: f64.copysign    $push1=, $0, $pop0{{$}}
+; CHECK: f64.promote/f32 $push0=, $pop{{[0-9]+}}{{$}}
+; CHECK: f64.copysign    $push1=, $pop{{[0-9]+}}, $pop0{{$}}
 define double @fold_promote(double %a, float %b) {
   %c = fpext float %b to double
   %t = call double @copysign(double %a, double %c)
@@ -19,8 +19,8 @@ define double @fold_promote(double %a, float %b) {
 }
 
 ; CHECK-LABEL: fold_demote:{{$}}
-; CHECK: f32.demote/f64  $push0=, $1{{$}}
-; CHECK: f32.copysign    $push1=, $0, $pop0{{$}}
+; CHECK: f32.demote/f64  $push0=, $pop{{[0-9]+}}{{$}}
+; CHECK: f32.copysign    $push1=, $pop{{[0-9]+}}, $pop0{{$}}
 define float @fold_demote(float %a, double %b) {
   %c = fptrunc double %b to float
   %t = call float @copysignf(float %a, float %c)
diff --git a/test/CodeGen/WebAssembly/cpus.ll b/test/CodeGen/WebAssembly/cpus.ll
index 78aee0f59d921fa607b21b3b99ecd8a22f7e4f04..9b4ac4425ca94ba124cadfecb394b5b11a0086c5 100644
--- a/test/CodeGen/WebAssembly/cpus.ll
+++ b/test/CodeGen/WebAssembly/cpus.ll
@@ -1,13 +1,13 @@
 ; This tests that llc accepts all valid WebAssembly CPUs.
 
-; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=mvp 2>&1 | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown -mcpu=mvp 2>&1 | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=bleeding-edge 2>&1 | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown -mcpu=bleeding-edge 2>&1 | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
-; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
+; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown-wasm -mcpu=mvp 2>&1 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=mvp 2>&1 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown-wasm -mcpu=generic 2>&1 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=generic 2>&1 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown-wasm -mcpu=bleeding-edge 2>&1 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=bleeding-edge 2>&1 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -mtriple=wasm32-unknown-unknown-wasm -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
+; RUN: llc < %s -asm-verbose=false -mtriple=wasm64-unknown-unknown-wasm -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID
 
 ; CHECK-NOT: is not a recognized processor for this target
 ; INVALID: {{.+}} is not a recognized processor for this target
diff --git a/test/CodeGen/WebAssembly/dbgvalue.ll b/test/CodeGen/WebAssembly/dbgvalue.ll
index c6a091bc78c8b745eae1394f5fefe56f60b70e58..eb39c6da1c9966b9b57af947b4df6672c930ba0c 100644
--- a/test/CodeGen/WebAssembly/dbgvalue.ll
+++ b/test/CodeGen/WebAssembly/dbgvalue.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -verify-machineinstrs -mtriple=wasm32-unknown-unknown | FileCheck %s
+; RUN: llc < %s -O0 -verify-machineinstrs -mtriple=wasm32-unknown-unknown-wasm | FileCheck %s
 
 ; CHECK: BB#0
 ; CHECK: #DEBUG_VALUE: usage:self <- %vreg4
@@ -6,7 +6,7 @@
 ; CHECK: DW_TAG_variable
 source_filename = "test/CodeGen/WebAssembly/dbgvalue.ll"
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 @key = external local_unnamed_addr global [15 x i8], align 1
 @.str = external unnamed_addr constant [33 x i8], align 1
diff --git a/test/CodeGen/WebAssembly/dead-vreg.ll b/test/CodeGen/WebAssembly/dead-vreg.ll
index 190a0856400183bb43857cc6b869316ff5200e39..06487e4cd363f501a2503a9a854afc502909a23d 100644
--- a/test/CodeGen/WebAssembly/dead-vreg.ll
+++ b/test/CodeGen/WebAssembly/dead-vreg.ll
@@ -3,7 +3,7 @@
 ; Check that unused vregs aren't assigned registers.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 define void @foo(i32* nocapture %a, i32 %w, i32 %h) {
 ; CHECK-LABEL: foo:
diff --git a/test/CodeGen/WebAssembly/divrem-constant.ll b/test/CodeGen/WebAssembly/divrem-constant.ll
index 6150cab4d4fd219181edf72e4ec1a7bfb69fa26a..1b4d30ad9493783e3c40ada8947008801c8adafe 100644
--- a/test/CodeGen/WebAssembly/divrem-constant.ll
+++ b/test/CodeGen/WebAssembly/divrem-constant.ll
@@ -3,7 +3,7 @@
 ; Test that integer div and rem by constant are optimized appropriately.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: test_udiv_2:
 ; CHECK: i32.shr_u
diff --git a/test/CodeGen/WebAssembly/f16.ll b/test/CodeGen/WebAssembly/f16.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6915f93e9b96d9dc9f7949b3935dfb414d93c4b1
--- /dev/null
+++ b/test/CodeGen/WebAssembly/f16.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -fast-isel | FileCheck %s
+
+; Test that f16 is expanded.
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown-wasm"
+
+; CHECK-LABEL: demote:
+; CHECK-NEXT: .param  	f32{{$}}
+; CHECK-NEXT: .result 	f32{{$}}
+; CHECK-NEXT: get_local	$push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.call	$push[[L1:[0-9]+]]=, __gnu_f2h_ieee@FUNCTION, $pop[[L0]]{{$}}
+; CHECK-NEXT: f32.call	$push[[L2:[0-9]+]]=, __gnu_h2f_ieee@FUNCTION, $pop[[L1]]{{$}}
+; CHECK-NEXT: return  	$pop[[L2]]{{$}}
+define half @demote(float %f) {
+    %t = fptrunc float %f to half
+    ret half %t
+}
+
+; CHECK-LABEL: promote:
+; CHECK-NEXT: .param  	f32{{$}}
+; CHECK-NEXT: .result 	f32{{$}}
+; CHECK-NEXT: get_local	$push0=, 0{{$}}
+; CHECK-NEXT: return  	$pop0{{$}}
+define float @promote(half %f) {
+    %t = fpext half %f to float
+    ret float %t
+}
diff --git a/test/CodeGen/WebAssembly/f32.ll b/test/CodeGen/WebAssembly/f32.ll
index 1c1d8191a987e1ee192def66035558b9006ec95d..45f00aa5a01fd7d16094c63d20134a0e639f61b0 100644
--- a/test/CodeGen/WebAssembly/f32.ll
+++ b/test/CodeGen/WebAssembly/f32.ll
@@ -3,7 +3,7 @@
 ; Test that basic 32-bit floating-point operations assemble as expected.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 declare float @llvm.fabs.f32(float)
 declare float @llvm.copysign.f32(float, float)
@@ -18,104 +18,106 @@ declare float @llvm.fma.f32(float, float, float)
 ; CHECK-LABEL: fadd32:
 ; CHECK-NEXT: .param f32, f32{{$}}
 ; CHECK-NEXT: .result f32{{$}}
-; CHECK-NEXT: f32.add $push0=, $0, $1{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f32.add $push[[LR:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @fadd32(float %x, float %y) {
   %a = fadd float %x, %y
   ret float %a
 }
 
 ; CHECK-LABEL: fsub32:
-; CHECK: f32.sub $push0=, $0, $1{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f32.sub $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @fsub32(float %x, float %y) {
   %a = fsub float %x, %y
   ret float %a
 }
 
 ; CHECK-LABEL: fmul32:
-; CHECK: f32.mul $push0=, $0, $1{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f32.mul $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @fmul32(float %x, float %y) {
   %a = fmul float %x, %y
   ret float %a
 }
 
 ; CHECK-LABEL: fdiv32:
-; CHECK: f32.div $push0=, $0, $1{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f32.div $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @fdiv32(float %x, float %y) {
   %a = fdiv float %x, %y
   ret float %a
 }
 
 ; CHECK-LABEL: fabs32:
-; CHECK: f32.abs $push0=, $0{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f32.abs $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @fabs32(float %x) {
   %a = call float @llvm.fabs.f32(float %x)
   ret float %a
 }
 
 ; CHECK-LABEL: fneg32:
-; CHECK: f32.neg $push0=, $0{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f32.neg $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @fneg32(float %x) {
   %a = fsub float -0., %x
   ret float %a
 }
 
 ; CHECK-LABEL: copysign32:
-; CHECK: f32.copysign $push0=, $0, $1{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f32.copysign $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @copysign32(float %x, float %y) {
   %a = call float @llvm.copysign.f32(float %x, float %y)
   ret float %a
 }
 
 ; CHECK-LABEL: sqrt32:
-; CHECK: f32.sqrt $push0=, $0{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f32.sqrt $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @sqrt32(float %x) {
   %a = call float @llvm.sqrt.f32(float %x)
   ret float %a
 }
 
 ; CHECK-LABEL: ceil32:
-; CHECK: f32.ceil $push0=, $0{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f32.ceil $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @ceil32(float %x) {
   %a = call float @llvm.ceil.f32(float %x)
   ret float %a
 }
 
 ; CHECK-LABEL: floor32:
-; CHECK: f32.floor $push0=, $0{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f32.floor $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @floor32(float %x) {
   %a = call float @llvm.floor.f32(float %x)
   ret float %a
 }
 
 ; CHECK-LABEL: trunc32:
-; CHECK: f32.trunc $push0=, $0{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f32.trunc $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @trunc32(float %x) {
   %a = call float @llvm.trunc.f32(float %x)
   ret float %a
 }
 
 ; CHECK-LABEL: nearest32:
-; CHECK: f32.nearest $push0=, $0{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f32.nearest $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @nearest32(float %x) {
   %a = call float @llvm.nearbyint.f32(float %x)
   ret float %a
 }
 
 ; CHECK-LABEL: nearest32_via_rint:
-; CHECK: f32.nearest $push0=, $0{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f32.nearest $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @nearest32_via_rint(float %x) {
   %a = call float @llvm.rint.f32(float %x)
   ret float %a
@@ -128,7 +130,7 @@ define float @nearest32_via_rint(float %x) {
 ; tests.
 
 ; CHECK-LABEL: fmin32:
-; CHECK: f32.min $push1=, $0, $pop0{{$}}
+; CHECK: f32.min $push1=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
 define float @fmin32(float %x) {
   %a = fcmp ult float %x, 0.0
@@ -137,7 +139,7 @@ define float @fmin32(float %x) {
 }
 
 ; CHECK-LABEL: fmax32:
-; CHECK: f32.max $push1=, $0, $pop0{{$}}
+; CHECK: f32.max $push1=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
 define float @fmax32(float %x) {
   %a = fcmp ugt float %x, 0.0
@@ -146,8 +148,8 @@ define float @fmax32(float %x) {
 }
 
 ; CHECK-LABEL: fma32:
-; CHECK: {{^}} f32.call $push0=, fmaf@FUNCTION, $0, $1, $2{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: {{^}} f32.call $push[[LR:[0-9]+]]=, fmaf@FUNCTION, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define float @fma32(float %a, float %b, float %c) {
   %d = call float @llvm.fma.f32(float %a, float %b, float %c)
   ret float %d
diff --git a/test/CodeGen/WebAssembly/f64.ll b/test/CodeGen/WebAssembly/f64.ll
index 670f3f0b697845e28f9dafcb581df0d91d266aef..fb52c3f92ad64933a5209a8cf781875959c5be13 100644
--- a/test/CodeGen/WebAssembly/f64.ll
+++ b/test/CodeGen/WebAssembly/f64.ll
@@ -3,7 +3,7 @@
 ; Test that basic 64-bit floating-point operations assemble as expected.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 declare double @llvm.fabs.f64(double)
 declare double @llvm.copysign.f64(double, double)
@@ -18,104 +18,106 @@ declare double @llvm.fma.f64(double, double, double)
 ; CHECK-LABEL: fadd64:
 ; CHECK-NEXT: .param f64, f64{{$}}
 ; CHECK-NEXT: .result f64{{$}}
-; CHECK-NEXT: f64.add $push0=, $0, $1{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f64.add $push[[LR:[0-9]+]]=, $pop[[L0]], $pop[[L1]]{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @fadd64(double %x, double %y) {
   %a = fadd double %x, %y
   ret double %a
 }
 
 ; CHECK-LABEL: fsub64:
-; CHECK: f64.sub $push0=, $0, $1{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f64.sub $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @fsub64(double %x, double %y) {
   %a = fsub double %x, %y
   ret double %a
 }
 
 ; CHECK-LABEL: fmul64:
-; CHECK: f64.mul $push0=, $0, $1{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f64.mul $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @fmul64(double %x, double %y) {
   %a = fmul double %x, %y
   ret double %a
 }
 
 ; CHECK-LABEL: fdiv64:
-; CHECK: f64.div $push0=, $0, $1{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f64.div $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @fdiv64(double %x, double %y) {
   %a = fdiv double %x, %y
   ret double %a
 }
 
 ; CHECK-LABEL: fabs64:
-; CHECK: f64.abs $push0=, $0{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f64.abs $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @fabs64(double %x) {
   %a = call double @llvm.fabs.f64(double %x)
   ret double %a
 }
 
 ; CHECK-LABEL: fneg64:
-; CHECK: f64.neg $push0=, $0{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f64.neg $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @fneg64(double %x) {
   %a = fsub double -0., %x
   ret double %a
 }
 
 ; CHECK-LABEL: copysign64:
-; CHECK: f64.copysign $push0=, $0, $1{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f64.copysign $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @copysign64(double %x, double %y) {
   %a = call double @llvm.copysign.f64(double %x, double %y)
   ret double %a
 }
 
 ; CHECK-LABEL: sqrt64:
-; CHECK: f64.sqrt $push0=, $0{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f64.sqrt $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @sqrt64(double %x) {
   %a = call double @llvm.sqrt.f64(double %x)
   ret double %a
 }
 
 ; CHECK-LABEL: ceil64:
-; CHECK: f64.ceil $push0=, $0{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f64.ceil $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @ceil64(double %x) {
   %a = call double @llvm.ceil.f64(double %x)
   ret double %a
 }
 
 ; CHECK-LABEL: floor64:
-; CHECK: f64.floor $push0=, $0{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f64.floor $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @floor64(double %x) {
   %a = call double @llvm.floor.f64(double %x)
   ret double %a
 }
 
 ; CHECK-LABEL: trunc64:
-; CHECK: f64.trunc $push0=, $0{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f64.trunc $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @trunc64(double %x) {
   %a = call double @llvm.trunc.f64(double %x)
   ret double %a
 }
 
 ; CHECK-LABEL: nearest64:
-; CHECK: f64.nearest $push0=, $0{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f64.nearest $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @nearest64(double %x) {
   %a = call double @llvm.nearbyint.f64(double %x)
   ret double %a
 }
 
 ; CHECK-LABEL: nearest64_via_rint:
-; CHECK: f64.nearest $push0=, $0{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: f64.nearest $push[[LR:[0-9]+]]=, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @nearest64_via_rint(double %x) {
   %a = call double @llvm.rint.f64(double %x)
   ret double %a
@@ -128,7 +130,7 @@ define double @nearest64_via_rint(double %x) {
 ; tests.
 
 ; CHECK-LABEL: fmin64:
-; CHECK: f64.min $push1=, $0, $pop0{{$}}
+; CHECK: f64.min $push1=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
 define double @fmin64(double %x) {
   %a = fcmp ult double %x, 0.0
@@ -137,7 +139,7 @@ define double @fmin64(double %x) {
 }
 
 ; CHECK-LABEL: fmax64:
-; CHECK: f64.max $push1=, $0, $pop0{{$}}
+; CHECK: f64.max $push1=, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
 ; CHECK-NEXT: return $pop1{{$}}
 define double @fmax64(double %x) {
   %a = fcmp ugt double %x, 0.0
@@ -146,8 +148,8 @@ define double @fmax64(double %x) {
 }
 
 ; CHECK-LABEL: fma64:
-; CHECK: {{^}} f64.call $push0=, fma@FUNCTION, $0, $1, $2{{$}}
-; CHECK-NEXT: return $pop0{{$}}
+; CHECK: {{^}} f64.call $push[[LR:[0-9]+]]=, fma@FUNCTION, $pop{{[0-9]+}}, $pop{{[0-9]+}}, $pop{{[0-9]+}}{{$}}
+; CHECK-NEXT: return $pop[[LR]]{{$}}
 define double @fma64(double %a, double %b, double %c) {
   %d = call double @llvm.fma.f64(double %a, double %b, double %c)
   ret double %d
diff --git a/test/CodeGen/WebAssembly/fast-isel-noreg.ll b/test/CodeGen/WebAssembly/fast-isel-noreg.ll
index a2504822dd1c3196894e89b991f15b4d433aff8f..229651d093f040dd3dac92a6cd009ad73c05794c 100644
--- a/test/CodeGen/WebAssembly/fast-isel-noreg.ll
+++ b/test/CodeGen/WebAssembly/fast-isel-noreg.ll
@@ -4,7 +4,7 @@
 ; Test that FastISel does not generate instructions with NoReg
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK: i32.const $push0=, 0
 define hidden i32 @a() #0 {
diff --git a/test/CodeGen/WebAssembly/fast-isel.ll b/test/CodeGen/WebAssembly/fast-isel.ll
index 953bd610b1bc64b922a515cfdbacc721d9af4240..457c5874e49369bc40e89f412eb83d9877566f42 100644
--- a/test/CodeGen/WebAssembly/fast-isel.ll
+++ b/test/CodeGen/WebAssembly/fast-isel.ll
@@ -1,9 +1,10 @@
 ; RUN: llc < %s -asm-verbose=false \
 ; RUN:   -fast-isel -fast-isel-abort=1 -verify-machineinstrs \
+; RUN:   -disable-wasm-explicit-locals \
 ; RUN:   | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; This tests very minimal fast-isel functionality.
 
diff --git a/test/CodeGen/WebAssembly/frem.ll b/test/CodeGen/WebAssembly/frem.ll
index b8745224ab82930f697491c9c07c8a2d70ca8f96..1a9c13417b67aeab35dd77c84cb85a29dce7a785 100644
--- a/test/CodeGen/WebAssembly/frem.ll
+++ b/test/CodeGen/WebAssembly/frem.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals | FileCheck %s
 
 ; Test that the frem instruction works.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: frem32:
 ; CHECK-NEXT: .param f32, f32{{$}}
diff --git a/test/CodeGen/WebAssembly/func.ll b/test/CodeGen/WebAssembly/func.ll
index 71c00a46de868e2fd3a8dd3340adff4557902e35..994ef62bf54d2b199802e6e52329959456d4bb7d 100644
--- a/test/CodeGen/WebAssembly/func.ll
+++ b/test/CodeGen/WebAssembly/func.ll
@@ -1,13 +1,13 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals | FileCheck %s
 
 ; Test that basic functions assemble as expected.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: f0:
 ; CHECK: return{{$}}
-; CHECK: .endfunc{{$}}
+; CHECK: end_function{{$}}
 ; CHECK: .size f0,
 define void @f0() {
   ret void
diff --git a/test/CodeGen/WebAssembly/function-bitcasts.ll b/test/CodeGen/WebAssembly/function-bitcasts.ll
index 9cafdd5bc2c8ba369b07850dbd6717fc97b0d372..3f20aef081159b6354f64970aa4b4207ce4fece0 100644
--- a/test/CodeGen/WebAssembly/function-bitcasts.ll
+++ b/test/CodeGen/WebAssembly/function-bitcasts.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-explicit-locals | FileCheck %s
 
 ; Test that function pointer casts are replaced with wrappers.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: test:
 ; CHECK-NEXT: call        .Lbitcast@FUNCTION{{$}}
@@ -20,11 +20,10 @@ target triple = "wasm32-unknown-unknown"
 ; CHECK-NEXT: call        foo2@FUNCTION{{$}}
 ; CHECK-NEXT: call        foo1@FUNCTION{{$}}
 ; CHECK-NEXT: call        foo3@FUNCTION{{$}}
-; CHECK-NEXT: .endfunc
+; CHECK-NEXT: end_function
 
 ; CHECK-LABEL: test_varargs:
-; CHECK-NEXT: .local      i32
-; CHECK:      store
+; CHECK:      set_global
 ; CHECK:      i32.const   $push[[L3:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: call        vararg@FUNCTION, $pop[[L3]]{{$}}
 ; CHECK-NEXT: i32.const   $push[[L4:[0-9]+]]=, 0{{$}}
@@ -32,25 +31,23 @@ target triple = "wasm32-unknown-unknown"
 ; CHECK-NEXT: call        plain@FUNCTION, $[[L5]]{{$}}
 
 ; CHECK-LABEL: .Lbitcast:
-; CHECK-NEXT: .local      i32
 ; CHECK-NEXT: call        has_i32_arg@FUNCTION, $0{{$}}
-; CHECK-NEXT: .endfunc
+; CHECK-NEXT: end_function
 
 ; CHECK-LABEL: .Lbitcast.1:
 ; CHECK-NEXT: call        $drop=, has_i32_ret@FUNCTION{{$}}
-; CHECK-NEXT: .endfunc
+; CHECK-NEXT: end_function
 
 ; CHECK-LABEL: .Lbitcast.2:
 ; CHECK-NEXT: .param      i32
 ; CHECK-NEXT: call        foo0@FUNCTION{{$}}
-; CHECK-NEXT: .endfunc
+; CHECK-NEXT: end_function
 
 ; CHECK-LABEL: .Lbitcast.3:
 ; CHECK-NEXT: .result     i32
-; CHECK-NEXT: .local      i32
 ; CHECK-NEXT: call        foo1@FUNCTION{{$}}
 ; CHECK-NEXT: copy_local  $push0=, $0
-; CHECK-NEXT: .endfunc
+; CHECK-NEXT: end_function
 
 declare void @has_i32_arg(i32)
 declare i32 @has_i32_ret()
diff --git a/test/CodeGen/WebAssembly/global.ll b/test/CodeGen/WebAssembly/global.ll
index 1d24035d8dd481271d238d83f9d2b70d5bbe3541..599eb53b431b89f3a4e3868ea820998d735073fb 100644
--- a/test/CodeGen/WebAssembly/global.ll
+++ b/test/CodeGen/WebAssembly/global.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals | FileCheck %s
 
 ; Test that globals assemble as expected.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-NOT: llvm.used
 ; CHECK-NOT: llvm.metadata
@@ -42,15 +42,21 @@ define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
 @ud = internal global i32 undef
 
 ; CHECK: .type nil,@object
-; CHECK-NEXT: .lcomm nil,4,2{{$}}
+; CHECK: .p2align 2
+; CHECK: nil:
+; CHECK: .int32 0
+; CHECK: .size nil, 4
 @nil = internal global i32 zeroinitializer
 
 ; CHECK: .type z,@object
-; CHECK-NEXT: .lcomm z,4,2{{$}}
+; CHECK: .p2align 2
+; CHECK: z:
+; CHECK: .int32 0
+; CHECK: .size z, 4
 @z = internal global i32 0
 
-; CHECK-NEXT: .type one,@object
-; CHECK-NEXT: .p2align 2{{$}}
+; CHECK: .type one,@object
+; CHECK: .p2align 2{{$}}
 ; CHECK-NEXT: one:
 ; CHECK-NEXT: .int32 1{{$}}
 ; CHECK-NEXT: .size one, 4{{$}}
@@ -78,11 +84,17 @@ define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
 @ud64 = internal global i64 undef
 
 ; CHECK: .type nil64,@object
-; CHECK: .lcomm nil64,8,3{{$}}
+; CHECK: .p2align 3{{$}}
+; CHECK-NEXT: nil64:
+; CHECK-NEXT: .int64 0{{$}}
+; CHECK-NEXT: .size nil64, 8{{$}}
 @nil64 = internal global i64 zeroinitializer
 
 ; CHECK: .type z64,@object
-; CHECK: .lcomm z64,8,3{{$}}
+; CHECK: .p2align 3{{$}}
+; CHECK-NEXT: z64:
+; CHECK-NEXT: .int64 0{{$}}
+; CHECK-NEXT: .size z64, 8{{$}}
 @z64 = internal global i64 0
 
 ; CHECK: .type twoP32,@object
@@ -107,11 +119,17 @@ define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
 @f32ud = internal global float undef
 
 ; CHECK: .type f32nil,@object
-; CHECK: .lcomm f32nil,4,2{{$}}
+; CHECK: .p2align 2{{$}}
+; CHECK-NEXT: f32nil:
+; CHECK-NEXT: .int32 0{{$}}
+; CHECK-NEXT: .size f32nil, 4{{$}}
 @f32nil = internal global float zeroinitializer
 
 ; CHECK: .type f32z,@object
-; CHECK: .lcomm f32z,4,2{{$}}
+; CHECK: .p2align 2{{$}}
+; CHECK-NEXT: f32z:
+; CHECK-NEXT: .int32 0{{$}}
+; CHECK-NEXT: .size f32z, 4{{$}}
 @f32z = internal global float 0.0
 
 ; CHECK: .type f32nz,@object
@@ -136,11 +154,17 @@ define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
 @f64ud = internal global double undef
 
 ; CHECK: .type f64nil,@object
-; CHECK: .lcomm f64nil,8,3{{$}}
+; CHECK: .p2align 3{{$}}
+; CHECK-NEXT: f64nil:
+; CHECK-NEXT: .int64 0{{$}}
+; CHECK-NEXT: .size f64nil, 8{{$}}
 @f64nil = internal global double zeroinitializer
 
 ; CHECK: .type f64z,@object
-; CHECK: .lcomm f64z,8,3{{$}}
+; CHECK: .p2align 3{{$}}
+; CHECK-NEXT: f64z:
+; CHECK-NEXT: .int64 0{{$}}
+; CHECK-NEXT: .size f64z, 8{{$}}
 @f64z = internal global double 0.0
 
 ; CHECK: .type f64nz,@object
@@ -168,7 +192,7 @@ define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
 
 ; Constant global.
 ; CHECK: .type    rom,@object{{$}}
-; CHECK: .section .rodata,"a",@progbits{{$}}
+; CHECK: .section .rodata.rom,
 ; CHECK: .globl   rom{{$}}
 ; CHECK: .p2align   4{{$}}
 ; CHECK: rom:
@@ -177,11 +201,11 @@ define i8* @call_memcpy(i8* %p, i8* nocapture readonly %q, i32 %n) {
 @rom = constant [128 x i32] zeroinitializer, align 16
 
 ; CHECK: .type       array,@object
-; CHECK-NEXT: array:
+; CHECK: array:
 ; CHECK-NEXT: .skip       8
 ; CHECK-NEXT: .size       array, 8
 ; CHECK: .type       pointer_to_array,@object
-; CHECK-NEXT: .section    .data.rel.ro,"aw",@progbits
+; CHECK-NEXT: .section    .data.rel.ro.pointer_to_array,
 ; CHECK-NEXT: .globl      pointer_to_array
 ; CHECK-NEXT: .p2align      2
 ; CHECK-NEXT: pointer_to_array:
diff --git a/test/CodeGen/WebAssembly/globl.ll b/test/CodeGen/WebAssembly/globl.ll
index 3ebd3d88fb4e717df2f7ab82eca34f12986658d2..ba9f6659d7d73f36912f896af2aafbe18009f91c 100644
--- a/test/CodeGen/WebAssembly/globl.ll
+++ b/test/CodeGen/WebAssembly/globl.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -asm-verbose=false | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK: .globl foo
 ; CHECK-LABEL: foo:
diff --git a/test/CodeGen/WebAssembly/i128.ll b/test/CodeGen/WebAssembly/i128.ll
index 29bf787863d5f502e150b8871537518c9fcf0697..2e44af9c518451b5abf9daa87f15228a5043d1ca 100644
--- a/test/CodeGen/WebAssembly/i128.ll
+++ b/test/CodeGen/WebAssembly/i128.ll
@@ -3,7 +3,7 @@
 ; Test that basic 128-bit integer operations assemble as expected.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 declare i128 @llvm.ctlz.i128(i128, i1)
 declare i128 @llvm.cttz.i128(i128, i1)
diff --git a/test/CodeGen/WebAssembly/i32-load-store-alignment.ll b/test/CodeGen/WebAssembly/i32-load-store-alignment.ll
index fb7deecff33a0ae630ee891dc9b9eb43ae67c980..661d1b7bfc3e30a8600eceaea50245a6db2234a2 100644
--- a/test/CodeGen/WebAssembly/i32-load-store-alignment.ll
+++ b/test/CodeGen/WebAssembly/i32-load-store-alignment.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals | FileCheck %s
 
 ; Test loads and stores with custom alignment values.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: ldi32_a1:
 ; CHECK-NEXT: .param i32{{$}}
diff --git a/test/CodeGen/WebAssembly/i32.ll b/test/CodeGen/WebAssembly/i32.ll
index a07dd02becedfe481f6969c0422ede13968ed66b..e451695d8903401a7486deca078cb5755daced09 100644
--- a/test/CodeGen/WebAssembly/i32.ll
+++ b/test/CodeGen/WebAssembly/i32.ll
@@ -3,7 +3,7 @@
 ; Test that basic 32-bit integer operations assemble as expected.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 declare i32 @llvm.ctlz.i32(i32, i1)
 declare i32 @llvm.cttz.i32(i32, i1)
@@ -12,7 +12,9 @@ declare i32 @llvm.ctpop.i32(i32)
 ; CHECK-LABEL: add32:
 ; CHECK-NEXT: .param i32, i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.add $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.add $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @add32(i32 %x, i32 %y) {
   %a = add i32 %x, %y
@@ -22,7 +24,9 @@ define i32 @add32(i32 %x, i32 %y) {
 ; CHECK-LABEL: sub32:
 ; CHECK-NEXT: .param i32, i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.sub $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.sub $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @sub32(i32 %x, i32 %y) {
   %a = sub i32 %x, %y
@@ -32,7 +36,9 @@ define i32 @sub32(i32 %x, i32 %y) {
 ; CHECK-LABEL: mul32:
 ; CHECK-NEXT: .param i32, i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.mul $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.mul $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @mul32(i32 %x, i32 %y) {
   %a = mul i32 %x, %y
@@ -42,7 +48,9 @@ define i32 @mul32(i32 %x, i32 %y) {
 ; CHECK-LABEL: sdiv32:
 ; CHECK-NEXT: .param i32, i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.div_s $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.div_s $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @sdiv32(i32 %x, i32 %y) {
   %a = sdiv i32 %x, %y
@@ -52,7 +60,9 @@ define i32 @sdiv32(i32 %x, i32 %y) {
 ; CHECK-LABEL: udiv32:
 ; CHECK-NEXT: .param i32, i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.div_u $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.div_u $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @udiv32(i32 %x, i32 %y) {
   %a = udiv i32 %x, %y
@@ -62,7 +72,9 @@ define i32 @udiv32(i32 %x, i32 %y) {
 ; CHECK-LABEL: srem32:
 ; CHECK-NEXT: .param i32, i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.rem_s $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.rem_s $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @srem32(i32 %x, i32 %y) {
   %a = srem i32 %x, %y
@@ -72,7 +84,9 @@ define i32 @srem32(i32 %x, i32 %y) {
 ; CHECK-LABEL: urem32:
 ; CHECK-NEXT: .param i32, i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.rem_u $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.rem_u $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @urem32(i32 %x, i32 %y) {
   %a = urem i32 %x, %y
@@ -82,7 +96,9 @@ define i32 @urem32(i32 %x, i32 %y) {
 ; CHECK-LABEL: and32:
 ; CHECK-NEXT: .param i32, i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.and $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.and $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @and32(i32 %x, i32 %y) {
   %a = and i32 %x, %y
@@ -92,7 +108,9 @@ define i32 @and32(i32 %x, i32 %y) {
 ; CHECK-LABEL: or32:
 ; CHECK-NEXT: .param i32, i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.or $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.or $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @or32(i32 %x, i32 %y) {
   %a = or i32 %x, %y
@@ -102,7 +120,9 @@ define i32 @or32(i32 %x, i32 %y) {
 ; CHECK-LABEL: xor32:
 ; CHECK-NEXT: .param i32, i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.xor $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.xor $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @xor32(i32 %x, i32 %y) {
   %a = xor i32 %x, %y
@@ -112,7 +132,9 @@ define i32 @xor32(i32 %x, i32 %y) {
 ; CHECK-LABEL: shl32:
 ; CHECK-NEXT: .param i32, i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.shl $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.shl $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @shl32(i32 %x, i32 %y) {
   %a = shl i32 %x, %y
@@ -122,7 +144,9 @@ define i32 @shl32(i32 %x, i32 %y) {
 ; CHECK-LABEL: shr32:
 ; CHECK-NEXT: .param i32, i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.shr_u $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.shr_u $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @shr32(i32 %x, i32 %y) {
   %a = lshr i32 %x, %y
@@ -132,7 +156,9 @@ define i32 @shr32(i32 %x, i32 %y) {
 ; CHECK-LABEL: sar32:
 ; CHECK-NEXT: .param i32, i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.shr_s $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.shr_s $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @sar32(i32 %x, i32 %y) {
   %a = ashr i32 %x, %y
@@ -142,7 +168,8 @@ define i32 @sar32(i32 %x, i32 %y) {
 ; CHECK-LABEL: clz32:
 ; CHECK-NEXT: .param i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.clz $push0=, $0{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.clz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @clz32(i32 %x) {
   %a = call i32 @llvm.ctlz.i32(i32 %x, i1 false)
@@ -152,7 +179,8 @@ define i32 @clz32(i32 %x) {
 ; CHECK-LABEL: clz32_zero_undef:
 ; CHECK-NEXT: .param i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.clz $push0=, $0{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.clz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @clz32_zero_undef(i32 %x) {
   %a = call i32 @llvm.ctlz.i32(i32 %x, i1 true)
@@ -162,7 +190,8 @@ define i32 @clz32_zero_undef(i32 %x) {
 ; CHECK-LABEL: ctz32:
 ; CHECK-NEXT: .param i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.ctz $push0=, $0{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.ctz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @ctz32(i32 %x) {
   %a = call i32 @llvm.cttz.i32(i32 %x, i1 false)
@@ -172,7 +201,8 @@ define i32 @ctz32(i32 %x) {
 ; CHECK-LABEL: ctz32_zero_undef:
 ; CHECK-NEXT: .param i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.ctz $push0=, $0{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.ctz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @ctz32_zero_undef(i32 %x) {
   %a = call i32 @llvm.cttz.i32(i32 %x, i1 true)
@@ -182,7 +212,8 @@ define i32 @ctz32_zero_undef(i32 %x) {
 ; CHECK-LABEL: popcnt32:
 ; CHECK-NEXT: .param i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.popcnt $push0=, $0{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.popcnt $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @popcnt32(i32 %x) {
   %a = call i32 @llvm.ctpop.i32(i32 %x)
@@ -192,7 +223,8 @@ define i32 @popcnt32(i32 %x) {
 ; CHECK-LABEL: eqz32:
 ; CHECK-NEXT: .param i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.eqz $push0=, $0{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.eqz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @eqz32(i32 %x) {
   %a = icmp eq i32 %x, 0
@@ -203,7 +235,9 @@ define i32 @eqz32(i32 %x) {
 ; CHECK-LABEL: rotl:
 ; CHECK-NEXT: .param i32, i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.rotl $push0=, $0, $1
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.rotl $push0=, $pop[[L0]], $pop[[L1]]
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @rotl(i32 %x, i32 %y) {
   %z = sub i32 32, %y
@@ -216,7 +250,9 @@ define i32 @rotl(i32 %x, i32 %y) {
 ; CHECK-LABEL: masked_rotl:
 ; CHECK-NEXT: .param i32, i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.rotl $push0=, $0, $1
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.rotl $push0=, $pop[[L0]], $pop[[L1]]
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @masked_rotl(i32 %x, i32 %y) {
   %a = and i32 %y, 31
@@ -230,7 +266,9 @@ define i32 @masked_rotl(i32 %x, i32 %y) {
 ; CHECK-LABEL: rotr:
 ; CHECK-NEXT: .param i32, i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.rotr $push0=, $0, $1
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.rotr $push0=, $pop[[L0]], $pop[[L1]]
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @rotr(i32 %x, i32 %y) {
   %z = sub i32 32, %y
@@ -243,7 +281,9 @@ define i32 @rotr(i32 %x, i32 %y) {
 ; CHECK-LABEL: masked_rotr:
 ; CHECK-NEXT: .param i32, i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.rotr $push0=, $0, $1
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.rotr $push0=, $pop[[L0]], $pop[[L1]]
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @masked_rotr(i32 %x, i32 %y) {
   %a = and i32 %y, 31
diff --git a/test/CodeGen/WebAssembly/i64-load-store-alignment.ll b/test/CodeGen/WebAssembly/i64-load-store-alignment.ll
index a3901dfc079ab1a4d613d3e42801c6712ad2b864..1ccb74cb9d286db19f97933afce739115a0e5edc 100644
--- a/test/CodeGen/WebAssembly/i64-load-store-alignment.ll
+++ b/test/CodeGen/WebAssembly/i64-load-store-alignment.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals | FileCheck %s
 
 ; Test loads and stores with custom alignment values.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: ldi64_a1:
 ; CHECK-NEXT: .param i32{{$}}
diff --git a/test/CodeGen/WebAssembly/i64.ll b/test/CodeGen/WebAssembly/i64.ll
index 93e32bfc0e1d485561ec33af1450529200ba75c5..4386bed4ebf788e968a9dda650ce60892227de1e 100644
--- a/test/CodeGen/WebAssembly/i64.ll
+++ b/test/CodeGen/WebAssembly/i64.ll
@@ -3,7 +3,7 @@
 ; Test that basic 64-bit integer operations assemble as expected.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 declare i64 @llvm.ctlz.i64(i64, i1)
 declare i64 @llvm.cttz.i64(i64, i1)
@@ -12,7 +12,9 @@ declare i64 @llvm.ctpop.i64(i64)
 ; CHECK-LABEL: add64:
 ; CHECK-NEXT: .param i64, i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.add $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i64.add $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @add64(i64 %x, i64 %y) {
   %a = add i64 %x, %y
@@ -22,7 +24,9 @@ define i64 @add64(i64 %x, i64 %y) {
 ; CHECK-LABEL: sub64:
 ; CHECK-NEXT: .param i64, i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.sub $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i64.sub $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @sub64(i64 %x, i64 %y) {
   %a = sub i64 %x, %y
@@ -32,7 +36,9 @@ define i64 @sub64(i64 %x, i64 %y) {
 ; CHECK-LABEL: mul64:
 ; CHECK-NEXT: .param i64, i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.mul $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i64.mul $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @mul64(i64 %x, i64 %y) {
   %a = mul i64 %x, %y
@@ -42,7 +48,9 @@ define i64 @mul64(i64 %x, i64 %y) {
 ; CHECK-LABEL: sdiv64:
 ; CHECK-NEXT: .param i64, i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.div_s $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i64.div_s $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @sdiv64(i64 %x, i64 %y) {
   %a = sdiv i64 %x, %y
@@ -52,7 +60,9 @@ define i64 @sdiv64(i64 %x, i64 %y) {
 ; CHECK-LABEL: udiv64:
 ; CHECK-NEXT: .param i64, i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.div_u $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i64.div_u $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @udiv64(i64 %x, i64 %y) {
   %a = udiv i64 %x, %y
@@ -62,7 +72,9 @@ define i64 @udiv64(i64 %x, i64 %y) {
 ; CHECK-LABEL: srem64:
 ; CHECK-NEXT: .param i64, i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.rem_s $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i64.rem_s $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @srem64(i64 %x, i64 %y) {
   %a = srem i64 %x, %y
@@ -72,7 +84,9 @@ define i64 @srem64(i64 %x, i64 %y) {
 ; CHECK-LABEL: urem64:
 ; CHECK-NEXT: .param i64, i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.rem_u $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i64.rem_u $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @urem64(i64 %x, i64 %y) {
   %a = urem i64 %x, %y
@@ -82,7 +96,9 @@ define i64 @urem64(i64 %x, i64 %y) {
 ; CHECK-LABEL: and64:
 ; CHECK-NEXT: .param i64, i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.and $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i64.and $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @and64(i64 %x, i64 %y) {
   %a = and i64 %x, %y
@@ -92,7 +108,9 @@ define i64 @and64(i64 %x, i64 %y) {
 ; CHECK-LABEL: or64:
 ; CHECK-NEXT: .param i64, i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.or $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i64.or $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @or64(i64 %x, i64 %y) {
   %a = or i64 %x, %y
@@ -102,7 +120,9 @@ define i64 @or64(i64 %x, i64 %y) {
 ; CHECK-LABEL: xor64:
 ; CHECK-NEXT: .param i64, i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.xor $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i64.xor $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @xor64(i64 %x, i64 %y) {
   %a = xor i64 %x, %y
@@ -112,7 +132,9 @@ define i64 @xor64(i64 %x, i64 %y) {
 ; CHECK-LABEL: shl64:
 ; CHECK-NEXT: .param i64, i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.shl $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i64.shl $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @shl64(i64 %x, i64 %y) {
   %a = shl i64 %x, %y
@@ -122,7 +144,9 @@ define i64 @shl64(i64 %x, i64 %y) {
 ; CHECK-LABEL: shr64:
 ; CHECK-NEXT: .param i64, i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.shr_u $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i64.shr_u $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @shr64(i64 %x, i64 %y) {
   %a = lshr i64 %x, %y
@@ -132,7 +156,9 @@ define i64 @shr64(i64 %x, i64 %y) {
 ; CHECK-LABEL: sar64:
 ; CHECK-NEXT: .param i64, i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.shr_s $push0=, $0, $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i64.shr_s $push0=, $pop[[L0]], $pop[[L1]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @sar64(i64 %x, i64 %y) {
   %a = ashr i64 %x, %y
@@ -142,7 +168,8 @@ define i64 @sar64(i64 %x, i64 %y) {
 ; CHECK-LABEL: clz64:
 ; CHECK-NEXT: .param i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.clz $push0=, $0{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i64.clz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @clz64(i64 %x) {
   %a = call i64 @llvm.ctlz.i64(i64 %x, i1 false)
@@ -152,7 +179,8 @@ define i64 @clz64(i64 %x) {
 ; CHECK-LABEL: clz64_zero_undef:
 ; CHECK-NEXT: .param i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.clz $push0=, $0{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i64.clz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @clz64_zero_undef(i64 %x) {
   %a = call i64 @llvm.ctlz.i64(i64 %x, i1 true)
@@ -162,7 +190,8 @@ define i64 @clz64_zero_undef(i64 %x) {
 ; CHECK-LABEL: ctz64:
 ; CHECK-NEXT: .param i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.ctz $push0=, $0{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i64.ctz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @ctz64(i64 %x) {
   %a = call i64 @llvm.cttz.i64(i64 %x, i1 false)
@@ -172,7 +201,8 @@ define i64 @ctz64(i64 %x) {
 ; CHECK-LABEL: ctz64_zero_undef:
 ; CHECK-NEXT: .param i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.ctz $push0=, $0{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i64.ctz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @ctz64_zero_undef(i64 %x) {
   %a = call i64 @llvm.cttz.i64(i64 %x, i1 true)
@@ -182,7 +212,8 @@ define i64 @ctz64_zero_undef(i64 %x) {
 ; CHECK-LABEL: popcnt64:
 ; CHECK-NEXT: .param i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.popcnt $push0=, $0{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i64.popcnt $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @popcnt64(i64 %x) {
   %a = call i64 @llvm.ctpop.i64(i64 %x)
@@ -192,7 +223,8 @@ define i64 @popcnt64(i64 %x) {
 ; CHECK-LABEL: eqz64:
 ; CHECK-NEXT: .param i64{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i64.eqz $push0=, $0{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i64.eqz $push0=, $pop[[L0]]{{$}}
 ; CHECK-NEXT: return $pop0{{$}}
 define i32 @eqz64(i64 %x) {
   %a = icmp eq i64 %x, 0
@@ -203,7 +235,9 @@ define i32 @eqz64(i64 %x) {
 ; CHECK-LABEL: rotl:
 ; CHECK-NEXT: .param i64, i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.rotl $push0=, $0, $1
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i64.rotl $push0=, $pop[[L0]], $pop[[L1]]
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @rotl(i64 %x, i64 %y) {
   %z = sub i64 64, %y
@@ -216,7 +250,9 @@ define i64 @rotl(i64 %x, i64 %y) {
 ; CHECK-LABEL: masked_rotl:
 ; CHECK-NEXT: .param i64, i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.rotl $push0=, $0, $1
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i64.rotl $push0=, $pop[[L0]], $pop[[L1]]
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @masked_rotl(i64 %x, i64 %y) {
   %a = and i64 %y, 63
@@ -230,7 +266,9 @@ define i64 @masked_rotl(i64 %x, i64 %y) {
 ; CHECK-LABEL: rotr:
 ; CHECK-NEXT: .param i64, i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.rotr $push0=, $0, $1
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i64.rotr $push0=, $pop[[L0]], $pop[[L1]]
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @rotr(i64 %x, i64 %y) {
   %z = sub i64 64, %y
@@ -243,7 +281,9 @@ define i64 @rotr(i64 %x, i64 %y) {
 ; CHECK-LABEL: masked_rotr:
 ; CHECK-NEXT: .param i64, i64{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.rotr $push0=, $0, $1
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i64.rotr $push0=, $pop[[L0]], $pop[[L1]]
 ; CHECK-NEXT: return $pop0{{$}}
 define i64 @masked_rotr(i64 %x, i64 %y) {
   %a = and i64 %y, 63
diff --git a/test/CodeGen/WebAssembly/ident.ll b/test/CodeGen/WebAssembly/ident.ll
index 49c188ec2578af1809b39e57f11261697c916aba..e5d85d090f1101a7dc94d72d1a8cfaea2037507e 100644
--- a/test/CodeGen/WebAssembly/ident.ll
+++ b/test/CodeGen/WebAssembly/ident.ll
@@ -3,7 +3,7 @@
 ; Test llvm.ident.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK: .ident "hello world"
 
diff --git a/test/CodeGen/WebAssembly/immediates.ll b/test/CodeGen/WebAssembly/immediates.ll
index 3d11f9410a7921d045af6ea9a4724986689227a2..1182423a594e24708b9afb2565f2121a43e4eaa5 100644
--- a/test/CodeGen/WebAssembly/immediates.ll
+++ b/test/CodeGen/WebAssembly/immediates.ll
@@ -3,7 +3,7 @@
 ; Test that basic immediates assemble as expected.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: zero_i32:
 ; CHECK-NEXT: .result i32{{$}}
diff --git a/test/CodeGen/WebAssembly/implicit-def.ll b/test/CodeGen/WebAssembly/implicit-def.ll
index 01ee171b449b78f0172ae547c95ee82552f085d6..1f9f74887e8adc8b847cf922dd0086cfd6b80880 100644
--- a/test/CodeGen/WebAssembly/implicit-def.ll
+++ b/test/CodeGen/WebAssembly/implicit-def.ll
@@ -1,6 +1,6 @@
 ; RUN: llc -o - %s | FileCheck %s
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; Test that stackified IMPLICIT_DEF instructions are converted into
 ; CONST_I32 to provide an explicit push.
diff --git a/test/CodeGen/WebAssembly/inline-asm.ll b/test/CodeGen/WebAssembly/inline-asm.ll
index 9b72eb65e0d5307afc9359862682831906ef7b86..56576305d9e2d6a999e6a35d1c7736565b51c716 100644
--- a/test/CodeGen/WebAssembly/inline-asm.ll
+++ b/test/CodeGen/WebAssembly/inline-asm.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -no-integrated-as | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -no-integrated-as | FileCheck %s
 
 ; Test basic inline assembly. Pass -no-integrated-as since these aren't
 ; actually valid assembly syntax.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: foo:
 ; CHECK-NEXT: .param i32{{$}}
@@ -33,7 +33,6 @@ entry:
 
 ; CHECK-LABEL: imm:
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: .local i32{{$}}
 ; CHECK-NEXT: #APP{{$}}
 ; CHECK-NEXT: # $0 = ccc(42){{$}}
 ; CHECK-NEXT: #NO_APP{{$}}
diff --git a/test/CodeGen/WebAssembly/irreducible-cfg.ll b/test/CodeGen/WebAssembly/irreducible-cfg.ll
index 8fe7d10c5f315421579be3199271e709893366bc..dd47b5827d5b0dba2a01861df08e703bb4b27135 100644
--- a/test/CodeGen/WebAssembly/irreducible-cfg.ll
+++ b/test/CodeGen/WebAssembly/irreducible-cfg.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-block-placement | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -verify-machineinstrs -disable-block-placement -disable-wasm-explicit-locals | FileCheck %s
 
 ; Test irreducible CFG handling.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; A simple loop with two entries.
 
diff --git a/test/CodeGen/WebAssembly/legalize.ll b/test/CodeGen/WebAssembly/legalize.ll
index 5cbfb8ace9edb52d51e4808a1ecc24d58d5c9ad7..978e72b5b85b3b7ce217d2f69ceb8e6b0e416c55 100644
--- a/test/CodeGen/WebAssembly/legalize.ll
+++ b/test/CodeGen/WebAssembly/legalize.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals | FileCheck %s
 
 ; Test various types and operators that need to be legalized.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: shl_i3:
 ; CHECK: i32.const   $push0=, 7{{$}}
diff --git a/test/CodeGen/WebAssembly/load-ext.ll b/test/CodeGen/WebAssembly/load-ext.ll
index 48a7ce7c4bd2fc903d87095b28d378d5be2b0078..a624995ea62598addb250cc4deb06b979b22de00 100644
--- a/test/CodeGen/WebAssembly/load-ext.ll
+++ b/test/CodeGen/WebAssembly/load-ext.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals | FileCheck %s
 
 ; Test that extending loads are assembled properly.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: sext_i8_i32:
 ; CHECK: i32.load8_s $push0=, 0($0){{$}}
diff --git a/test/CodeGen/WebAssembly/load-store-i1.ll b/test/CodeGen/WebAssembly/load-store-i1.ll
index ea0ec717c7a0cf8baaab99a18e26cea097f80c46..9882609d773b7c474e409b36d34247895ef6a14e 100644
--- a/test/CodeGen/WebAssembly/load-store-i1.ll
+++ b/test/CodeGen/WebAssembly/load-store-i1.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals | FileCheck %s
 
 ; Test that i1 extending loads and truncating stores are assembled properly.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: load_u_i1_i32:
 ; CHECK:      i32.load8_u $push[[NUM0:[0-9]+]]=, 0($0){{$}}
diff --git a/test/CodeGen/WebAssembly/load.ll b/test/CodeGen/WebAssembly/load.ll
index a8e174e914e158587d2f09297b30e66be7a98ea7..165d145fde1ab67dc27dcfbabd42a32964ae2d3d 100644
--- a/test/CodeGen/WebAssembly/load.ll
+++ b/test/CodeGen/WebAssembly/load.ll
@@ -4,12 +4,13 @@
 ; Test that basic loads are assembled properly.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: ldi32:
 ; CHECK-NEXT: .param i32{{$}}
 ; CHECK-NEXT: .result i32{{$}}
-; CHECK-NEXT: i32.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i32.load $push[[NUM:[0-9]+]]=, 0($pop[[L0]]){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i32 @ldi32(i32 *%p) {
   %v = load i32, i32* %p
@@ -19,7 +20,8 @@ define i32 @ldi32(i32 *%p) {
 ; CHECK-LABEL: ldi64:
 ; CHECK-NEXT: .param i32{{$}}
 ; CHECK-NEXT: .result i64{{$}}
-; CHECK-NEXT: i64.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: i64.load $push[[NUM:[0-9]+]]=, 0($pop[[L0]]){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define i64 @ldi64(i64 *%p) {
   %v = load i64, i64* %p
@@ -29,7 +31,8 @@ define i64 @ldi64(i64 *%p) {
 ; CHECK-LABEL: ldf32:
 ; CHECK-NEXT: .param i32{{$}}
 ; CHECK-NEXT: .result f32{{$}}
-; CHECK-NEXT: f32.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: f32.load $push[[NUM:[0-9]+]]=, 0($pop[[L0]]){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define float @ldf32(float *%p) {
   %v = load float, float* %p
@@ -39,7 +42,8 @@ define float @ldf32(float *%p) {
 ; CHECK-LABEL: ldf64:
 ; CHECK-NEXT: .param i32{{$}}
 ; CHECK-NEXT: .result f64{{$}}
-; CHECK-NEXT: f64.load $push[[NUM:[0-9]+]]=, 0($0){{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: f64.load $push[[NUM:[0-9]+]]=, 0($pop[[L0]]){{$}}
 ; CHECK-NEXT: return $pop[[NUM]]{{$}}
 define double @ldf64(double *%p) {
   %v = load double, double* %p
diff --git a/test/CodeGen/WebAssembly/lower-em-ehsjlj-options.ll b/test/CodeGen/WebAssembly/lower-em-ehsjlj-options.ll
index 8283b49cd584ff91e3b113b4b3a4fbdad667b6f2..91fde29ea59e6609bd8bb74de2f9068c1aefc87d 100644
--- a/test/CodeGen/WebAssembly/lower-em-ehsjlj-options.ll
+++ b/test/CodeGen/WebAssembly/lower-em-ehsjlj-options.ll
@@ -3,7 +3,7 @@
 ; RUN: llc < %s | FileCheck %s --check-prefix=NONE
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 %struct.__jmp_buf_tag = type { [6 x i32], i32, [32 x i32] }
 
diff --git a/test/CodeGen/WebAssembly/lower-em-exceptions-whitelist.ll b/test/CodeGen/WebAssembly/lower-em-exceptions-whitelist.ll
index 5fcc39909b057010c785e952078dd340467f0449..3864e445f63948cc86f0f29d0474cc186a505710 100644
--- a/test/CodeGen/WebAssembly/lower-em-exceptions-whitelist.ll
+++ b/test/CodeGen/WebAssembly/lower-em-exceptions-whitelist.ll
@@ -1,7 +1,7 @@
 ; RUN: opt < %s -wasm-lower-em-ehsjlj -emscripten-cxx-exceptions-whitelist=do_catch -S | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 define void @dont_catch() personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
 ; CHECK-LABEL: @dont_catch(
diff --git a/test/CodeGen/WebAssembly/lower-em-exceptions.ll b/test/CodeGen/WebAssembly/lower-em-exceptions.ll
index 60953cdb6efe206c6fb697463d52f81f3a79b2e5..060f481c3265b98d0ec9fd561c5a101e0e87016b 100644
--- a/test/CodeGen/WebAssembly/lower-em-exceptions.ll
+++ b/test/CodeGen/WebAssembly/lower-em-exceptions.ll
@@ -1,7 +1,7 @@
 ; RUN: opt < %s -wasm-lower-em-ehsjlj -S | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 @_ZTIi = external constant i8*
 @_ZTIc = external constant i8*
diff --git a/test/CodeGen/WebAssembly/lower-em-sjlj.ll b/test/CodeGen/WebAssembly/lower-em-sjlj.ll
index 40b9d62a03606fdf1fe538f5009bb2ba490c854e..cf42219c0114161e9a5fd232de4c3ab262f9494e 100644
--- a/test/CodeGen/WebAssembly/lower-em-sjlj.ll
+++ b/test/CodeGen/WebAssembly/lower-em-sjlj.ll
@@ -1,7 +1,7 @@
 ; RUN: opt < %s -wasm-lower-em-ehsjlj -S | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 %struct.__jmp_buf_tag = type { [6 x i32], i32, [32 x i32] }
 
diff --git a/test/CodeGen/WebAssembly/mem-intrinsics.ll b/test/CodeGen/WebAssembly/mem-intrinsics.ll
index 0ac1e1e182cd9b737e91a0e21b17eec9bf1d77fa..32a7117a1ea6792b22886ec1f2ee40230509307c 100644
--- a/test/CodeGen/WebAssembly/mem-intrinsics.ll
+++ b/test/CodeGen/WebAssembly/mem-intrinsics.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -tail-dup-placement=0| FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -tail-dup-placement=0 | FileCheck %s
 
 ; Test memcpy, memmove, and memset intrinsics.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)
 declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)
diff --git a/test/CodeGen/WebAssembly/memory-addr32.ll b/test/CodeGen/WebAssembly/memory-addr32.ll
index bd11b818319558bb3bd3d2b15a9bc30b010d35ec..ad599b1b3f17da9c1698445a8660279ff416f4d0 100644
--- a/test/CodeGen/WebAssembly/memory-addr32.ll
+++ b/test/CodeGen/WebAssembly/memory-addr32.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals | FileCheck %s
 
 ; Test that basic memory operations assemble as expected with 32-bit addresses.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 declare i32 @llvm.wasm.current.memory.i32() nounwind readonly
 declare i32 @llvm.wasm.grow.memory.i32(i32) nounwind
diff --git a/test/CodeGen/WebAssembly/non-executable-stack.ll b/test/CodeGen/WebAssembly/non-executable-stack.ll
index b81063724e9c407fd21ff55b60c889e9426ac959..f1e1ba36a79065b64360dc4b28140ce21b1208aa 100644
--- a/test/CodeGen/WebAssembly/non-executable-stack.ll
+++ b/test/CodeGen/WebAssembly/non-executable-stack.ll
@@ -4,6 +4,6 @@
 ; because wasm's stack is always non-executable.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-NOT: .note.GNU-stack
diff --git a/test/CodeGen/WebAssembly/offset-folding.ll b/test/CodeGen/WebAssembly/offset-folding.ll
index 863549fc20fc288637a04aae7924c3c3755ac04e..e8e98ecc3307bf8a7b7b1a7e21e3ae46575f36c4 100644
--- a/test/CodeGen/WebAssembly/offset-folding.ll
+++ b/test/CodeGen/WebAssembly/offset-folding.ll
@@ -3,7 +3,7 @@
 ; Test that constant offsets can be folded into global addresses.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 @x = external global [0 x i32]
 @y = global [50 x i32] zeroinitializer
diff --git a/test/CodeGen/WebAssembly/offset.ll b/test/CodeGen/WebAssembly/offset.ll
index 37f08abc9fa84e9fcf28f8099baf2dde59226990..27c71873302ab00abbf99bda672d520a374e670d 100644
--- a/test/CodeGen/WebAssembly/offset.ll
+++ b/test/CodeGen/WebAssembly/offset.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-explicit-locals | FileCheck %s
 
 ; Test constant load and store address offsets.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; With an nuw add, we can fold an offset.
 
diff --git a/test/CodeGen/WebAssembly/phi.ll b/test/CodeGen/WebAssembly/phi.ll
index 747ae5cb15d49583cfabf72d1f1b83cf23ec7417..4aae92df54d90dc4484f17befdaaf2bf536cb50a 100644
--- a/test/CodeGen/WebAssembly/phi.ll
+++ b/test/CodeGen/WebAssembly/phi.ll
@@ -1,15 +1,16 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -verify-machineinstrs | FileCheck %s
 
 ; Test that phis are lowered.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; Basic phi triangle.
 
 ; CHECK-LABEL: test0:
-; CHECK: div_s $[[NUM0:[0-9]+]]=, $0, $pop[[NUM1:[0-9]+]]{{$}}
-; CHECK: return $[[NUM0]]{{$}}
+; CHECK: return $0
+; CHECK: div_s $push[[NUM0:[0-9]+]]=, $0, $pop[[NUM1:[0-9]+]]{{$}}
+; CHECK: return $pop[[NUM0]]{{$}}
 define i32 @test0(i32 %p) {
 entry:
   %t = icmp slt i32 %p, 0
diff --git a/test/CodeGen/WebAssembly/reg-stackify.ll b/test/CodeGen/WebAssembly/reg-stackify.ll
index 00469132c9536dd0b4a09dcb7601103e73c5348e..d1423b5db395b69b14c92277e112621c5ed06c8c 100644
--- a/test/CodeGen/WebAssembly/reg-stackify.ll
+++ b/test/CodeGen/WebAssembly/reg-stackify.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -verify-machineinstrs | FileCheck %s
 
 ; Test the register stackifier pass.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; No because of pointer aliasing.
 
@@ -126,7 +126,6 @@ false:
 
 ; CHECK-LABEL: multiple_uses:
 ; CHECK: .param       i32, i32, i32{{$}}
-; CHECK-NEXT: .local       i32{{$}}
 ; CHECK-NEXT: block   {{$}}
 ; CHECK-NEXT: i32.load    $push[[NUM0:[0-9]+]]=, 0($2){{$}}
 ; CHECK-NEXT: tee_local   $push[[NUM1:[0-9]+]]=, $3=, $pop[[NUM0]]{{$}}
@@ -449,8 +448,7 @@ bb10:                                             ; preds = %bb9, %bb
 
 ; CHECK-LABEL: stackpointer_dependency:
 ; CHECK:      call {{.+}}, stackpointer_callee@FUNCTION,
-; CHECK:      i32.const $push[[L0:.+]]=, 0
-; CHECK-NEXT: i32.store __stack_pointer($pop[[L0]]),
+; CHECK-NEXT: set_global 0,
 declare i32 @stackpointer_callee(i8* readnone, i8* readnone)
 declare i8* @llvm.frameaddress(i32)
 define i32 @stackpointer_dependency(i8* readnone) {
diff --git a/test/CodeGen/WebAssembly/return-int32.ll b/test/CodeGen/WebAssembly/return-int32.ll
index 9e663b969e1405af4a1dcf35f069c177f790752e..a6634b740cfc02539b1ae242e9bba76890071c74 100644
--- a/test/CodeGen/WebAssembly/return-int32.ll
+++ b/test/CodeGen/WebAssembly/return-int32.ll
@@ -2,13 +2,13 @@
 ; RUN: llc < %s -asm-verbose=false -fast-isel -fast-isel-abort=1 | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: return_i32:
 ; CHECK-NEXT:  .param i32{{$}}
 ; CHECK-NEXT:  .result i32{{$}}
-; CHECK-NEXT:  copy_local  $push0=, $0
-; CHECK-NEXT:  .endfunc{{$}}
+; CHECK-NEXT:  get_local  $push0=, 0
+; CHECK-NEXT:  end_function{{$}}
 define i32 @return_i32(i32 %p) {
   ret i32 %p
 }
@@ -19,7 +19,7 @@ define i32 @return_i32(i32 %p) {
 ; CHECK-NEXT: return $pop[[L0]]{{$}}
 ; CHECK:      store
 ; CHECK-NEXT: i32.const $push{{[^,]+}}=, 3{{$}}
-; CHECK-NEXT: .endfunc{{$}}
+; CHECK-NEXT: end_function{{$}}
 define i32 @return_i32_twice(i32 %a) {
   %b = icmp ne i32 %a, 0
   br i1 %b, label %true, label %false
diff --git a/test/CodeGen/WebAssembly/return-void.ll b/test/CodeGen/WebAssembly/return-void.ll
index c3a600f7838dbe26fdf4411bc827df1ae8e831a3..90cf37fd2c69748392d903d1be79f1e4c31da692 100644
--- a/test/CodeGen/WebAssembly/return-void.ll
+++ b/test/CodeGen/WebAssembly/return-void.ll
@@ -2,10 +2,10 @@
 ; RUN: llc < %s -asm-verbose=false -fast-isel -fast-isel-abort=1 | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: return_void:
-; CHECK-NEXT: .endfunc{{$}}
+; CHECK-NEXT: end_function{{$}}
 define void @return_void() {
   ret void
 }
@@ -14,7 +14,7 @@ define void @return_void() {
 ; CHECK:      store
 ; CHECK-NEXT: return{{$}}
 ; CHECK:      store
-; CHECK-NEXT: .endfunc{{$}}
+; CHECK-NEXT: end_function{{$}}
 define void @return_void_twice(i32 %a) {
   %b = icmp ne i32 %a, 0
   br i1 %b, label %true, label %false
diff --git a/test/CodeGen/WebAssembly/returned.ll b/test/CodeGen/WebAssembly/returned.ll
index a277928ae400673457d2c21c75b5afb3c4eadd97..b059fd8a59879a2c9b7ff441e2c0d55d50e4d67b 100644
--- a/test/CodeGen/WebAssembly/returned.ll
+++ b/test/CodeGen/WebAssembly/returned.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals | FileCheck %s
 
 ; Test that the "returned" attribute is optimized effectively.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: _Z3foov:
 ; CHECK-NEXT: .result   i32{{$}}
diff --git a/test/CodeGen/WebAssembly/select.ll b/test/CodeGen/WebAssembly/select.ll
index 06837e4c2368b1fd50695d46f8b53473392abae8..b25f16c499a80fab326a322d7060823f08f91622 100644
--- a/test/CodeGen/WebAssembly/select.ll
+++ b/test/CodeGen/WebAssembly/select.ll
@@ -1,10 +1,10 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -fast-isel -fast-isel-abort=1 | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -fast-isel -fast-isel-abort=1 | FileCheck %s
 
 ; Test that wasm select instruction is selected from LLVM select instruction.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: select_i32_bool:
 ; CHECK-NEXT: .param     i32, i32, i32{{$}}
diff --git a/test/CodeGen/WebAssembly/signext-zeroext.ll b/test/CodeGen/WebAssembly/signext-zeroext.ll
index f9561da5363d56a5bc4627ceeee92c2bc510d246..b07c7f669c37e8c00e583be2a748b74800afa21a 100644
--- a/test/CodeGen/WebAssembly/signext-zeroext.ll
+++ b/test/CodeGen/WebAssembly/signext-zeroext.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals | FileCheck %s
 
 ; Test zeroext and signext ABI keywords
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: z2s_func:
 ; CHECK-NEXT: .param i32{{$}}
diff --git a/test/CodeGen/WebAssembly/simd-arith.ll b/test/CodeGen/WebAssembly/simd-arith.ll
index f0e71f2cc104d10d72b19c8f376e4bf86b7a5257..62c659b7c01c0fc16648dd663f068304f56c6e96 100644
--- a/test/CodeGen/WebAssembly/simd-arith.ll
+++ b/test/CodeGen/WebAssembly/simd-arith.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -mattr=+simd128 | FileCheck %s --check-prefixes CHECK,SIMD128
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -mattr=+simd128 -fast-isel | FileCheck %s --check-prefixes CHECK,SIMD128
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -mattr=-simd128 | FileCheck %s --check-prefixes CHECK,NO-SIMD128
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -mattr=-simd128 -fast-isel | FileCheck %s --check-prefixes CHECK,NO-SIMD128
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -mattr=+simd128 | FileCheck %s --check-prefixes CHECK,SIMD128
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -mattr=+simd128 -fast-isel | FileCheck %s --check-prefixes CHECK,SIMD128
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -mattr=-simd128 | FileCheck %s --check-prefixes CHECK,NO-SIMD128
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -mattr=-simd128 -fast-isel | FileCheck %s --check-prefixes CHECK,NO-SIMD128
 
 ; Test that basic SIMD128 arithmetic operations assemble as expected.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 declare i32 @llvm.ctlz.i32(i32, i1)
 declare i32 @llvm.cttz.i32(i32, i1)
diff --git a/test/CodeGen/WebAssembly/stack-alignment.ll b/test/CodeGen/WebAssembly/stack-alignment.ll
index 3bb6617f8779228f0777a47f3971f82d2406c8c4..95aa1f9dbf07658b85d1ba90c82a04e5ea4d1a49 100644
--- a/test/CodeGen/WebAssembly/stack-alignment.ll
+++ b/test/CodeGen/WebAssembly/stack-alignment.ll
@@ -1,21 +1,23 @@
 ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 declare void @somefunc(i32*)
 
 ; CHECK-LABEL: underalign:
-; CHECK:      i32.load  $push[[L1:.+]]=, __stack_pointer{{.+}}
+; CHECK:      get_global $push[[L1:.+]]=, 0{{$}}
 ; CHECK-NEXT: i32.const $push[[L2:.+]]=, 16
 ; CHECK-NEXT: i32.sub   $push[[L10:.+]]=, $pop[[L1]], $pop[[L2]]
-; CHECK-NEXT: tee_local $push{{.+}}=, $[[SP:.+]]=, $pop[[L10]]
+; CHECK-NEXT: tee_local $push{{.+}}=, [[SP:.+]], $pop[[L10]]
 
-; CHECK:      i32.add   $push[[underaligned:.+]]=, $[[SP]], $pop{{.+}}
+; CHECK:      get_local $push[[L3:.+]]=, [[SP]]{{$}}
+; CHECK:      i32.add   $push[[underaligned:.+]]=, $pop[[L3]], $pop{{.+}}
 ; CHECK-NEXT: call      somefunc@FUNCTION, $pop[[underaligned]]
 
-; CHECK:      i32.add   $push[[L5:.+]]=, $[[SP]], $pop{{.+}}
-; CHECK-NEXT: i32.store __stack_pointer($pop{{.+}}), $pop[[L5]]
+; CHECK:      get_local $push[[M4:.+]]=, [[SP]]{{$}}
+; CHECK:      i32.add   $push[[L5:.+]]=, $pop[[M4]], $pop{{.+}}
+; CHECK-NEXT: set_global 0, $pop[[L5]]
 define void @underalign() {
 entry:
   %underaligned = alloca i32, align 8
@@ -24,18 +26,19 @@ entry:
 }
 
 ; CHECK-LABEL: overalign:
-; CHECK:      i32.load   $push[[L10:.+]]=, __stack_pointer
-; CHECK-NEXT: tee_local  $push[[L9:.+]]=, $[[BP:.+]]=, $pop[[L10]]
+; CHECK:      get_global $push[[L10:.+]]=, 0{{$}}
+; CHECK-NEXT: tee_local  $push[[L9:.+]]=, [[BP:.+]], $pop[[L10]]
 ; CHECK-NEXT: i32.const  $push[[L2:.+]]=, 32
 ; CHECK-NEXT: i32.sub    $push[[L8:.+]]=, $pop[[L9]], $pop[[L2]]
 ; CHECK-NEXT: i32.const  $push[[L3:.+]]=, -32
 ; CHECK-NEXT: i32.and    $push[[L7:.+]]=, $pop[[L8]], $pop[[L3]]
-; CHECK-NEXT: tee_local  $push{{.+}}=, $[[SP:.+]]=, $pop[[L7]]
+; CHECK-NEXT: tee_local  $push{{.+}}=, [[SP:.+]], $pop[[L7]]
 
-; CHECK:      call       somefunc@FUNCTION, $[[SP]]
+; CHECK:      get_local  $push[[M5:.+]]=, [[SP]]{{$}}
+; CHECK:      call       somefunc@FUNCTION, $pop[[M5]]{{$}}
 
-; CHECK:      copy_local $push[[L5:.+]]=, $[[BP]]
-; CHECK-NEXT: i32.store  __stack_pointer($pop{{.+}}), $pop[[L5]]
+; CHECK:      get_local  $push[[M6:.+]]=, [[BP]]{{$}}
+; CHECK-NEXT: set_global 0, $pop[[M6]]
 define void @overalign() {
 entry:
   %overaligned = alloca i32, align 32
@@ -44,19 +47,21 @@ entry:
 }
 
 ; CHECK-LABEL: over_and_normal_align:
-; CHECK:      i32.load   $push[[L14:.+]]=, __stack_pointer
-; CHECK-NEXT: tee_local  $push[[L13:.+]]=, $[[BP:.+]]=, $pop[[L14]]
+; CHECK:      get_global $push[[L14:.+]]=, 0{{$}}
+; CHECK-NEXT: tee_local  $push[[L13:.+]]=, [[BP:.+]], $pop[[L14]]
 ; CHECK:      i32.sub    $push[[L12:.+]]=, $pop[[L13]], $pop{{.+}}
 ; CHECK:      i32.and    $push[[L11:.+]]=, $pop[[L12]], $pop{{.+}}
-; CHECK-NEXT: tee_local  $push{{.+}}=, $[[SP]]=, $pop[[L11]]
+; CHECK-NEXT: tee_local  $push{{.+}}=, [[SP:.+]], $pop[[L11]]
 
-; CHECK:      i32.add    $push[[L6:.+]]=, $[[SP]], $pop{{.+}}
+; CHECK:      get_local  $push[[M6:.+]]=, [[SP]]{{$}}
+; CHECK:      i32.add    $push[[L6:.+]]=, $pop[[M6]], $pop{{.+}}
 ; CHECK-NEXT: call       somefunc@FUNCTION, $pop[[L6]]
-; CHECK:      i32.add    $push[[L8:.+]]=, $[[SP]], $pop{{.+}}
+; CHECK:      get_local  $push[[M7:.+]]=, [[SP]]{{$}}
+; CHECK:      i32.add    $push[[L8:.+]]=, $pop[[M7]], $pop{{.+}}
 ; CHECK-NEXT: call       somefunc@FUNCTION, $pop[[L8]]
 
-; CHECK:      copy_local $push[[L9:.+]]=, $[[BP]]
-; CHECK-NEXT: i32.store  __stack_pointer({{.+}}), $pop[[L9]]
+; CHECK:      get_local  $push[[L6:.+]]=, [[BP]]{{$}}
+; CHECK-NEXT: set_global 0, $pop[[L6]]
 define void @over_and_normal_align() {
 entry:
   %over = alloca i32, align 32
@@ -67,14 +72,16 @@ entry:
 }
 
 ; CHECK-LABEL: dynamic_overalign:
-; CHECK:      i32.load   $push[[L18:.+]]=, __stack_pointer
-; CHECK-NEXT: tee_local  $push[[L17:.+]]=, $[[SP:.+]]=, $pop[[L18]]
-; CHECK-NEXT: copy_local $[[BP:.+]]=, $pop[[L17]]
-; CHECK:      tee_local  $push{{.+}}=, $[[SP_2:.+]]=, $pop{{.+}}
+; CHECK:      get_global $push[[L18:.+]]=, 0{{$}}
+; CHECK-NEXT: tee_local  $push[[L17:.+]]=, [[SP:.+]], $pop[[L18]]
+; CHECK-NEXT: set_local  [[BP:.+]], $pop[[L17]]
+; CHECK:      tee_local  $push{{.+}}=, [[SP_2:.+]], $pop{{.+}}
 
-; CHECK:      call       somefunc@FUNCTION, $[[SP_2]]
+; CHECK:      get_local  $push[[M8:.+]]=, [[SP_2]]{{$}}
+; CHECK:      call       somefunc@FUNCTION, $pop[[M8]]
 
-; CHECK: i32.store __stack_pointer($pop{{.+}}), $[[BP]]
+; CHECK:      get_local  $push[[M9:.+]]=, [[BP]]{{$}}
+; CHECK-NEXT: set_global 0, $pop[[M9]]
 define void @dynamic_overalign(i32 %num) {
 entry:
   %dynamic = alloca i32, i32 %num, align 32
@@ -83,20 +90,22 @@ entry:
 }
 
 ; CHECK-LABEL: overalign_and_dynamic:
-; CHECK:      i32.load   $push[[L21:.+]]=, __stack_pointer
-; CHECK-NEXT: tee_local  $push[[L20:.+]]=, $[[BP:.+]]=, $pop[[L21]]
+; CHECK:      get_global $push[[L21:.+]]=, 0{{$}}
+; CHECK-NEXT: tee_local  $push[[L20:.+]]=, [[BP:.+]], $pop[[L21]]
 ; CHECK:      i32.sub    $push[[L19:.+]]=, $pop[[L20]], $pop{{.+}}
 ; CHECK:      i32.and    $push[[L18:.+]]=, $pop[[L19]], $pop{{.+}}
-; CHECK:      tee_local  $push{{.+}}=, $[[FP:.+]]=, $pop[[L18]]
-; CHECK:      i32.sub    $push[[L16:.+]]=, $[[FP]], $pop{{.+}}
-; CHECK-NEXT: tee_local  $push{{.+}}=, $[[SP:.+]]=, $pop[[L16]]
+; CHECK:      tee_local  $push{{.+}}=, [[FP:.+]], $pop[[L18]]
+; CHECK:      get_local  $push[[M10:.+]]=, [[FP]]{{$}}
+; CHECK:      i32.sub    $push[[L16:.+]]=, $pop[[M10]], $pop{{.+}}
+; CHECK-NEXT: tee_local  $push{{.+}}=, [[SP:.+]], $pop[[L16]]
 
-; CHECK:      copy_local $push[[over:.+]]=, $[[FP]]
+; CHECK:      get_local  $push[[over:.+]]=, [[FP]]
 ; CHECK-NEXT: call       somefunc@FUNCTION, $pop[[over]]
-; CHECK-NEXT: call       somefunc@FUNCTION, $[[SP]]
+; CHECK:      get_local  $push[[another:.+]]=, [[SP]]
+; CHECK-NEXT: call       somefunc@FUNCTION, $pop[[another]]
 
-; CHECK:      copy_local $push[[L12:.+]]=, $[[BP]]
-; CHECK-NEXT: i32.store  __stack_pointer($pop{{.+}}), $pop[[L12]]
+; CHECK:      get_local  $push[[M11:.+]]=, [[BP]]{{$}}
+; CHECK-NEXT: set_global 0, $pop[[M11]]
 define void @overalign_and_dynamic(i32 %num) {
 entry:
   %over = alloca i32, align 32
@@ -107,24 +116,27 @@ entry:
 }
 
 ; CHECK-LABEL: overalign_static_and_dynamic:
-; CHECK:      i32.load   $push[[L26:.+]]=, __stack_pointer
-; CHECK-NEXT: tee_local  $push[[L25:.+]]=, $[[BP:.+]]=, $pop[[L26]]
+; CHECK:      get_global $push[[L26:.+]]=, 0{{$}}
+; CHECK-NEXT: tee_local  $push[[L25:.+]]=, [[BP:.+]], $pop[[L26]]
 ; CHECK:      i32.sub    $push[[L24:.+]]=, $pop[[L25]], $pop{{.+}}
 ; CHECK:      i32.and    $push[[L23:.+]]=, $pop[[L24]], $pop{{.+}}
-; CHECK:      tee_local  $push{{.+}}=, $[[FP:.+]]=, $pop[[L23]]
-; CHECK:      i32.sub    $push[[L21:.+]]=, $[[FP]], $pop{{.+}}
-; CHECK-NEXT: tee_local  $push{{.+}}=, $[[SP:.+]]=, $pop[[L21]]
+; CHECK:      tee_local  $push{{.+}}=, [[FP:.+]], $pop[[L23]]
+; CHECK:      get_local  $push[[M12:.+]]=, [[FP]]{{$}}
+; CHECK:      i32.sub    $push[[L21:.+]]=, $pop[[M12]], $pop{{.+}}
+; CHECK-NEXT: tee_local  $push{{.+}}=, [[SP:.+]], $pop[[L21]]
 
-; CHECK:      copy_local $push[[L19:.+]]=, $[[FP]]
-; CHECK:      tee_local  $push[[L18:.+]]=, $[[FP_2:.+]]=, $pop[[L19]]
+; CHECK:      get_local  $push[[L19:.+]]=, [[FP]]
+; CHECK:      tee_local  $push[[L18:.+]]=, [[FP_2:.+]], $pop[[L19]]
 ; CHECK:      i32.add    $push[[over:.+]]=, $pop[[L18]], $pop{{.+}}
 ; CHECK-NEXT: call       somefunc@FUNCTION, $pop[[over]]
-; CHECK:      call       somefunc@FUNCTION, $[[SP]]
-; CHECK:      i32.add    $push[[static:.+]]=, $[[FP_2]], $pop{{.+}}
+; CHECK:      get_local  $push[[M12:.+]]=, [[SP]]
+; CHECK:      call       somefunc@FUNCTION, $pop[[M12]]
+; CHECK:      get_local  $push[[M13:.+]]=, [[FP_2]]
+; CHECK:      i32.add    $push[[static:.+]]=, $pop[[M13]], $pop{{.+}}
 ; CHECK-NEXT: call       somefunc@FUNCTION, $pop[[static]]
 
-; CHECK:      copy_local $push[[L16:.+]]=, $[[BP]]
-; CHECK-NEXT: i32.store  __stack_pointer({{.+}}), $pop[[L16]]
+; CHECK:      get_local  $push[[M14:.+]]=, [[BP]]{{$}}
+; CHECK-NEXT: set_global 0, $pop[[M14]]
 define void @overalign_static_and_dynamic(i32 %num) {
 entry:
   %over = alloca i32, align 32
diff --git a/test/CodeGen/WebAssembly/store-trunc.ll b/test/CodeGen/WebAssembly/store-trunc.ll
index 4369338804816692e73d7a7b763fbd19f0e345a6..ff358227d987c54d8601395c08d139eeb32274ca 100644
--- a/test/CodeGen/WebAssembly/store-trunc.ll
+++ b/test/CodeGen/WebAssembly/store-trunc.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-explicit-locals | FileCheck %s
 
 ; Test that truncating stores are assembled properly.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: trunc_i8_i32:
 ; CHECK: i32.store8 0($0), $1{{$}}
diff --git a/test/CodeGen/WebAssembly/store.ll b/test/CodeGen/WebAssembly/store.ll
index 3852b6e420ca6bf7ccae34c349db17e1ed87bf15..153d7d9addf75af480f8dee7cf26821f15cca1ad 100644
--- a/test/CodeGen/WebAssembly/store.ll
+++ b/test/CodeGen/WebAssembly/store.ll
@@ -4,11 +4,13 @@
 ; Test that basic stores are assembled properly.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: sti32:
 ; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: i32.store 0($0), $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i32.store 0($pop[[L0]]), $pop[[L1]]{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti32(i32 *%p, i32 %v) {
   store i32 %v, i32* %p
@@ -17,7 +19,9 @@ define void @sti32(i32 *%p, i32 %v) {
 
 ; CHECK-LABEL: sti64:
 ; CHECK-NEXT: .param i32, i64{{$}}
-; CHECK-NEXT: i64.store 0($0), $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: i64.store 0($pop[[L0]]), $pop[[L1]]{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @sti64(i64 *%p, i64 %v) {
   store i64 %v, i64* %p
@@ -26,7 +30,9 @@ define void @sti64(i64 *%p, i64 %v) {
 
 ; CHECK-LABEL: stf32:
 ; CHECK-NEXT: .param i32, f32{{$}}
-; CHECK-NEXT: f32.store 0($0), $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f32.store 0($pop[[L0]]), $pop[[L1]]{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @stf32(float *%p, float %v) {
   store float %v, float* %p
@@ -35,7 +41,9 @@ define void @stf32(float *%p, float %v) {
 
 ; CHECK-LABEL: stf64:
 ; CHECK-NEXT: .param i32, f64{{$}}
-; CHECK-NEXT: f64.store 0($0), $1{{$}}
+; CHECK-NEXT: get_local $push[[L0:[0-9]+]]=, 0{{$}}
+; CHECK-NEXT: get_local $push[[L1:[0-9]+]]=, 1{{$}}
+; CHECK-NEXT: f64.store 0($pop[[L0]]), $pop[[L1]]{{$}}
 ; CHECK-NEXT: return{{$}}
 define void @stf64(double *%p, double %v) {
   store double %v, double* %p
diff --git a/test/CodeGen/WebAssembly/switch.ll b/test/CodeGen/WebAssembly/switch.ll
index c6354baa57a6e919b3aa6c5539eb95439cc8755b..18eac5534a450d982ff4dd4306a999bfa0392a77 100644
--- a/test/CodeGen/WebAssembly/switch.ll
+++ b/test/CodeGen/WebAssembly/switch.ll
@@ -4,7 +4,7 @@
 ; the blocks in a way that isn't interesting here.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 declare void @foo0()
 declare void @foo1()
diff --git a/test/CodeGen/WebAssembly/unreachable.ll b/test/CodeGen/WebAssembly/unreachable.ll
index 77fda44d5ff35e2d162ffa9bdc93fd147bbe73b2..de96b0927563ccca54567e239c5bd622968f5f7a 100644
--- a/test/CodeGen/WebAssembly/unreachable.ll
+++ b/test/CodeGen/WebAssembly/unreachable.ll
@@ -5,7 +5,7 @@
 ; wasm unreachable
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 declare void @llvm.trap()
 declare void @llvm.debugtrap()
diff --git a/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll b/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll
index ef4318ec299b72c80b7287a9047fe27804c08e58..c3d420a6ece641e3393132161bc5c381f4e15d89 100644
--- a/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll
+++ b/test/CodeGen/WebAssembly/unsupported-function-bitcasts.ll
@@ -5,13 +5,14 @@
 ; conversions are implemented.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: test:
 ; CHECK-NEXT: i32.const   $push[[L0:[0-9]+]]=, 0{{$}}
 ; CHECK-NEXT: call        has_i64_arg@FUNCTION, $pop[[L0]]{{$}}
-; CHECK-NEXT: i32.call    $drop=, has_i64_ret@FUNCTION{{$}}
-; CHECK-NEXT: .endfunc
+; CHECK-NEXT: i32.call    $push{{[0-9]+}}=, has_i64_ret@FUNCTION{{$}}
+; CHECK-NEXT: drop
+; CHECK-NEXT: end_function
 
 ; CHECK-NOT: .Lbitcast
 
diff --git a/test/CodeGen/WebAssembly/unused-argument.ll b/test/CodeGen/WebAssembly/unused-argument.ll
index ff943b215438a07c8020370e13ea3718c79778ce..a70fc4bd2a46dd931f514d906afc221725ddad83 100644
--- a/test/CodeGen/WebAssembly/unused-argument.ll
+++ b/test/CodeGen/WebAssembly/unused-argument.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals | FileCheck %s
 
 ; Make sure that argument offsets are correct even if some arguments are unused.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; CHECK-LABEL: unused_first:
 ; CHECK-NEXT: .param i32, i32{{$}}
diff --git a/test/CodeGen/WebAssembly/userstack.ll b/test/CodeGen/WebAssembly/userstack.ll
index a163f879f6dff607dd3791873ace7cb4df86803c..57ca75705e5ea73644ccf5f9455c4f809b99080f 100644
--- a/test/CodeGen/WebAssembly/userstack.ll
+++ b/test/CodeGen/WebAssembly/userstack.ll
@@ -1,7 +1,7 @@
 ; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 declare void @ext_func(i64* %ptr)
 declare void @ext_func_i32(i32* %ptr)
@@ -10,39 +10,38 @@ declare void @ext_func_i32(i32* %ptr)
 ; Check that there is an extra local for the stack pointer.
 ; CHECK: .local i32{{$}}
 define void @alloca32() noredzone {
- ; CHECK: i32.const $push[[L4:.+]]=, 0{{$}}
- ; CHECK: i32.const $push[[L1:.+]]=, 0{{$}}
- ; CHECK-NEXT: i32.load $push[[L2:.+]]=, __stack_pointer($pop[[L1]])
+ ; CHECK-NEXT: get_global $push[[L2:.+]]=, 0{{$}}
  ; CHECK-NEXT: i32.const $push[[L3:.+]]=, 16
  ; CHECK-NEXT: i32.sub $push[[L9:.+]]=, $pop[[L2]], $pop[[L3]]
- ; CHECK-NEXT: tee_local $push[[L8:.+]]=, $[[SP:.+]]=, $pop[[L9]]{{$}}
- ; CHECK-NEXT: i32.store __stack_pointer($pop[[L4]]), $pop[[L8]]{{$}}
+ ; CHECK-NEXT: tee_local $push[[L8:.+]]=, [[SP:.+]], $pop[[L9]]{{$}}
+ ; CHECK-NEXT: set_global 0, $pop[[L8]]{{$}}
  %retval = alloca i32
+ ; CHECK: get_local $push[[L4:.+]]=, [[SP]]{{$}}
  ; CHECK: i32.const $push[[L0:.+]]=, 0
- ; CHECK: i32.store 12($[[SP]]), $pop[[L0]]
+ ; CHECK: i32.store 12($pop[[L4]]), $pop[[L0]]
  store i32 0, i32* %retval
- ; CHECK: i32.const $push[[L6:.+]]=, 0
+ ; CHECK: get_local $push[[L6:.+]]=, [[SP]]{{$}}
  ; CHECK-NEXT: i32.const $push[[L5:.+]]=, 16
- ; CHECK-NEXT: i32.add $push[[L7:.+]]=, $[[SP]], $pop[[L5]]
- ; CHECK-NEXT: i32.store __stack_pointer($pop[[L6]]), $pop[[L7]]
+ ; CHECK-NEXT: i32.add $push[[L7:.+]]=, $pop[[L6]], $pop[[L5]]
+ ; CHECK-NEXT: set_global 0, $pop[[L7]]
  ret void
 }
 
 ; CHECK-LABEL: alloca3264:
 ; CHECK: .local i32{{$}}
 define void @alloca3264() {
- ; CHECK: i32.const $push[[L2:.+]]=, 0{{$}}
- ; CHECK-NEXT: i32.load $push[[L3:.+]]=, __stack_pointer($pop[[L2]])
+ ; CHECK: get_global $push[[L3:.+]]=, 0{{$}}
  ; CHECK-NEXT: i32.const $push[[L4:.+]]=, 16
  ; CHECK-NEXT: i32.sub $push[[L6:.+]]=, $pop[[L3]], $pop[[L4]]
- ; CHECK-NEXT: tee_local $push[[L5:.+]]=, $[[SP:.+]]=, $pop[[L6]]
+ ; CHECK-NEXT: tee_local $push[[L5:.+]]=, [[SP:.+]], $pop[[L6]]
  %r1 = alloca i32
  %r2 = alloca double
  ; CHECK-NEXT: i32.const $push[[L0:.+]]=, 0
  ; CHECK-NEXT: i32.store 12($pop[[L5]]), $pop[[L0]]
  store i32 0, i32* %r1
+ ; CHECK-NEXT: get_local $push[[L2:.+]]=, [[SP]]{{$}}
  ; CHECK-NEXT: i64.const $push[[L1:.+]]=, 0
- ; CHECK-NEXT: i64.store 0($[[SP]]), $pop[[L1]]
+ ; CHECK-NEXT: i64.store 0($pop[[L2]]), $pop[[L1]]
  store double 0.0, double* %r2
  ; CHECK-NEXT: return
  ret void
@@ -51,30 +50,29 @@ define void @alloca3264() {
 ; CHECK-LABEL: allocarray:
 ; CHECK: .local i32{{$}}
 define void @allocarray() {
- ; CHECK: i32.const $push[[L6:.+]]=, 0{{$}}
- ; CHECK: i32.const $push[[L3:.+]]=, 0{{$}}
- ; CHECK-NEXT: i32.load $push[[L4:.+]]=, __stack_pointer($pop[[L3]])
+ ; CHECK-NEXT: get_global $push[[L4:.+]]=, 0{{$}}
  ; CHECK-NEXT: i32.const $push[[L5:.+]]=, 144{{$}}
  ; CHECK-NEXT: i32.sub $push[[L12:.+]]=, $pop[[L4]], $pop[[L5]]
- ; CHECK-NEXT: tee_local $push[[L11:.+]]=, $0=, $pop[[L12]]
- ; CHECK-NEXT: i32.store __stack_pointer($pop[[L6]]), $pop[[L11]]
+ ; CHECK-NEXT: tee_local $push[[L11:.+]]=, 0, $pop[[L12]]
+ ; CHECK-NEXT: set_global 0, $pop[[L11]]
  %r = alloca [33 x i32]
 
  ; CHECK:      i32.const $push{{.+}}=, 24
- ; CHECK-NEXT: i32.add $push[[L3:.+]]=, $[[SP]], $pop{{.+}}
+ ; CHECK-NEXT: i32.add $push[[L3:.+]]=, $pop{{.+}}, $pop{{.+}}
  ; CHECK-NEXT: i32.const $push[[L1:.+]]=, 1{{$}}
  ; CHECK-NEXT: i32.store 0($pop[[L3]]), $pop[[L1]]{{$}}
+ ; CHECK-NEXT: get_local $push[[L4:.+]]=, 0{{$}}
  ; CHECK-NEXT: i32.const $push[[L10:.+]]=, 1{{$}}
- ; CHECK-NEXT: i32.store 12(${{.+}}), $pop[[L10]]{{$}}
+ ; CHECK-NEXT: i32.store 12($pop[[L4]]), $pop[[L10]]{{$}}
  %p = getelementptr [33 x i32], [33 x i32]* %r, i32 0, i32 0
  store i32 1, i32* %p
  %p2 = getelementptr [33 x i32], [33 x i32]* %r, i32 0, i32 3
  store i32 1, i32* %p2
 
- ; CHECK: i32.const $push[[L9:.+]]=, 0{{$}}
+ ; CHECK-NEXT: get_local $push[[L2:.+]]=, [[SP]]{{$}}
  ; CHECK-NEXT: i32.const $push[[L7:.+]]=, 144
- ; CHECK-NEXT: i32.add $push[[L8:.+]]=, $[[SP]], $pop[[L7]]
- ; CHECK-NEXT: i32.store __stack_pointer($pop[[L9]]), $pop[[L8]]
+ ; CHECK-NEXT: i32.add $push[[L8:.+]]=, $pop[[L2]], $pop[[L7]]
+ ; CHECK-NEXT: set_global 0, $pop[[L8]]
  ret void
 }
 
@@ -82,24 +80,27 @@ define void @allocarray() {
 define void @non_mem_use(i8** %addr) {
  ; CHECK: i32.const $push[[L2:.+]]=, 48
  ; CHECK-NEXT: i32.sub $push[[L12:.+]]=, {{.+}}, $pop[[L2]]
- ; CHECK-NEXT: tee_local $push[[L11:.+]]=, $[[SP:.+]]=, $pop[[L12]]
- ; CHECK-NEXT: i32.store {{.+}}, $pop[[L11]]
+ ; CHECK-NEXT: tee_local $push[[L11:.+]]=, [[SP:.+]], $pop[[L12]]
+ ; CHECK-NEXT: set_global 0, $pop[[L11]]
  %buf = alloca [27 x i8], align 16
  %r = alloca i64
  %r2 = alloca i64
  ; %r is at SP+8
+ ; CHECK: get_local $push[[L3:.+]]=, [[SP]]
  ; CHECK: i32.const $push[[OFF:.+]]=, 8
- ; CHECK-NEXT: i32.add $push[[ARG1:.+]]=, $[[SP]], $pop[[OFF]]
+ ; CHECK-NEXT: i32.add $push[[ARG1:.+]]=, $pop[[L3]], $pop[[OFF]]
  ; CHECK-NEXT: call ext_func@FUNCTION, $pop[[ARG1]]
  call void @ext_func(i64* %r)
  ; %r2 is at SP+0, no add needed
- ; CHECK-NEXT: call ext_func@FUNCTION, $[[SP]]
+ ; CHECK: get_local $push[[L4:.+]]=, [[SP]]
+ ; CHECK-NEXT: call ext_func@FUNCTION, $pop[[L4]]
  call void @ext_func(i64* %r2)
  ; Use as a value, but in a store
  ; %buf is at SP+16
+ ; CHECK: get_local $push[[L5:.+]]=, [[SP]]
  ; CHECK: i32.const $push[[OFF:.+]]=, 16
- ; CHECK-NEXT: i32.add $push[[VAL:.+]]=, $[[SP]], $pop[[OFF]]
- ; CHECK-NEXT: i32.store 0($0), $pop[[VAL]]
+ ; CHECK-NEXT: i32.add $push[[VAL:.+]]=, $pop[[L5]], $pop[[OFF]]
+ ; CHECK-NEXT: i32.store 0($pop{{.+}}), $pop[[VAL]]
  %gep = getelementptr inbounds [27 x i8], [27 x i8]* %buf, i32 0, i32 0
  store i8* %gep, i8** %addr
  ret void
@@ -108,13 +109,11 @@ define void @non_mem_use(i8** %addr) {
 ; CHECK-LABEL: allocarray_inbounds:
 ; CHECK: .local i32{{$}}
 define void @allocarray_inbounds() {
- ; CHECK: i32.const $push[[L5:.+]]=, 0{{$}}
- ; CHECK: i32.const $push[[L2:.+]]=, 0{{$}}
- ; CHECK-NEXT: i32.load $push[[L3:.+]]=, __stack_pointer($pop[[L2]])
+ ; CHECK: get_global $push[[L3:.+]]=, 0{{$}}
  ; CHECK-NEXT: i32.const $push[[L4:.+]]=, 32{{$}}
  ; CHECK-NEXT: i32.sub $push[[L11:.+]]=, $pop[[L3]], $pop[[L4]]
- ; CHECK-NEXT: tee_local $push[[L10:.+]]=, $[[SP:.+]]=, $pop[[L11]]
- ; CHECK-NEXT: i32.store __stack_pointer($pop[[L5]]), $pop[[L10]]{{$}}
+ ; CHECK-NEXT: tee_local $push[[L10:.+]]=, [[SP:.+]], $pop[[L11]]
+ ; CHECK-NEXT: set_global 0, $pop[[L10]]{{$}}
  %r = alloca [5 x i32]
  ; CHECK: i32.const $push[[L3:.+]]=, 1
  ; CHECK-DAG: i32.store 24(${{.+}}), $pop[[L3]]
@@ -126,45 +125,39 @@ define void @allocarray_inbounds() {
  store i32 1, i32* %p2
  call void @ext_func(i64* null);
  ; CHECK: call ext_func
- ; CHECK: i32.const $push[[L6:.+]]=, 0{{$}}
- ; CHECK-NEXT: i32.const $push[[L5:.+]]=, 32{{$}}
+ ; CHECK: i32.const $push[[L5:.+]]=, 32{{$}}
  ; CHECK-NEXT: i32.add $push[[L7:.+]]=, ${{.+}}, $pop[[L5]]
- ; CHECK-NEXT: i32.store __stack_pointer($pop[[L6]]), $pop[[L7]]
+ ; CHECK-NEXT: set_global 0, $pop[[L7]]
  ret void
 }
 
 ; CHECK-LABEL: dynamic_alloca:
 define void @dynamic_alloca(i32 %alloc) {
- ; CHECK: i32.const $push[[L7:.+]]=, 0{{$}}
- ; CHECK: i32.const $push[[L1:.+]]=, 0{{$}}
- ; CHECK-NEXT: i32.load $push[[L13:.+]]=, __stack_pointer($pop[[L1]])
+ ; CHECK: get_global $push[[L13:.+]]=, 0{{$}}
  ; CHECK-NEXT: tee_local $push[[L12:.+]]=, [[SP:.+]], $pop[[L13]]{{$}}
  ; Target independent codegen bumps the stack pointer.
  ; CHECK: i32.sub
  ; Check that SP is written back to memory after decrement
- ; CHECK: i32.store __stack_pointer($pop{{.+}}), 
+ ; CHECK: set_global 0, 
  %r = alloca i32, i32 %alloc
  ; Target-independent codegen also calculates the store addr
  ; CHECK: call ext_func_i32@FUNCTION
  call void @ext_func_i32(i32* %r)
- ; CHECK: i32.const $push[[L3:.+]]=, 0{{$}}
- ; CHECK: i32.store __stack_pointer($pop[[L3]]), $pop{{.+}}
+ ; CHECK: set_global 0, $pop{{.+}}
  ret void
 }
 
 ; CHECK-LABEL: dynamic_alloca_redzone:
 define void @dynamic_alloca_redzone(i32 %alloc) {
- ; CHECK: i32.const $push[[L8:.+]]=, 0{{$}}
- ; CHECK-NEXT: i32.load $push[[L13:.+]]=, __stack_pointer($pop[[L1]])
+ ; CHECK: get_global $push[[L13:.+]]=, 0{{$}}
  ; CHECK-NEXT: tee_local $push[[L12:.+]]=, [[SP:.+]], $pop[[L13]]{{$}}
- ; CHECK-NEXT: copy_local [[FP:.+]]=, $pop[[L12]]{{$}}
  ; Target independent codegen bumps the stack pointer
  ; CHECK: i32.sub
  %r = alloca i32, i32 %alloc
- ; CHECK-NEXT: tee_local       $push[[L8:.+]]=, $0=, $pop
- ; CHECK-NEXT: copy_local      $drop=, $pop[[L8]]{{$}}
+ ; CHECK-NEXT: tee_local       $push[[L8:.+]]=, {{.+}}, $pop
+ ; CHECK: get_local $push[[L7:.+]]=, 0{{$}}
  ; CHECK-NEXT: i32.const       $push[[L6:.+]]=, 0{{$}}
- ; CHECK-NEXT: i32.store       0($0), $pop[[L6]]{{$}}
+ ; CHECK-NEXT: i32.store       0($pop[[L7]]), $pop[[L6]]{{$}}
  store i32 0, i32* %r
  ; CHECK-NEXT: return
  ret void
@@ -173,17 +166,15 @@ define void @dynamic_alloca_redzone(i32 %alloc) {
 ; CHECK-LABEL: dynamic_static_alloca:
 define void @dynamic_static_alloca(i32 %alloc) noredzone {
  ; Decrement SP in the prolog by the static amount and writeback to memory.
- ; CHECK: i32.const $push[[L13:.+]]=, 0{{$}}
- ; CHECK: i32.const $push[[L10:.+]]=, 0{{$}}
- ; CHECK-NEXT: i32.load $push[[L11:.+]]=, __stack_pointer($pop[[L10]])
+ ; CHECK: get_global $push[[L11:.+]]=, 0{{$}}
  ; CHECK-NEXT: i32.const $push[[L12:.+]]=, 16
  ; CHECK-NEXT: i32.sub $push[[L23:.+]]=, $pop[[L11]], $pop[[L12]]
- ; CHECK-NEXT: tee_local $push[[L22:.+]]=, $[[SP:.+]]=, $pop[[L23]]
- ; CHECK-NEXT: i32.store __stack_pointer($pop{{.+}}), $pop[[L22]]
+ ; CHECK-NEXT: tee_local $push[[L22:.+]]=, [[SP:.+]], $pop[[L23]]
+ ; CHECK-NEXT: set_global 0, $pop[[L22]]
 
  ; Alloc and write to a static alloca
- ; CHECK: copy_local $push[[L21:.+]]=, $[[SP]]
- ; CHECK-NEXT: tee_local $push[[pushedFP:.+]]=, $[[FP:.+]]=, $pop[[L21]]
+ ; CHECK: get_local $push[[L21:.+]]=, [[SP:.+]]
+ ; CHECK-NEXT: tee_local $push[[pushedFP:.+]]=, [[FP:.+]], $pop[[L21]]
  ; CHECK-NEXT: i32.const $push[[L0:.+]]=, 101
  ; CHECK-NEXT: i32.store [[static_offset:.+]]($pop[[pushedFP]]), $pop[[L0]]
  %static = alloca i32
@@ -191,44 +182,51 @@ define void @dynamic_static_alloca(i32 %alloc) noredzone {
 
  ; Decrement SP in the body by the dynamic amount.
  ; CHECK: i32.sub
- ; CHECK: tee_local $push{{.+}}=, $[[dynamic_local:.+]]=, $pop{{.+}}
- ; CHECK: i32.store __stack_pointer
+ ; CHECK: tee_local $push[[L16:.+]]=, [[dynamic_local:.+]], $pop{{.+}}
+ ; CHECK: tee_local $push[[L15:.+]]=, [[other:.+]], $pop[[L16]]{{$}}
+ ; CHECK: set_global 0, $pop[[L15]]{{$}}
  %dynamic = alloca i32, i32 %alloc
 
  ; Ensure we don't modify the frame pointer after assigning it.
  ; CHECK-NOT: $[[FP]]=
 
  ; Ensure the static address doesn't change after modifying the stack pointer.
+ ; CHECK: get_local $push[[L17:.+]]=, [[FP]]
  ; CHECK: i32.const $push[[L7:.+]]=, 102
- ; CHECK-NEXT: i32.store [[static_offset]]($[[FP]]), $pop[[L7]]
+ ; CHECK-NEXT: i32.store [[static_offset]]($pop[[L17]]), $pop[[L7]]
+ ; CHECK-NEXT: get_local $push[[L9:.+]]=, [[dynamic_local]]{{$}}
  ; CHECK-NEXT: i32.const $push[[L8:.+]]=, 103
- ; CHECK-NEXT: i32.store 0($[[dynamic_local]]), $pop[[L8]]
+ ; CHECK-NEXT: i32.store 0($pop[[L9]]), $pop[[L8]]
  store volatile i32 102, i32* %static
  store volatile i32 103, i32* %dynamic
 
  ; Decrement SP in the body by the dynamic amount.
  ; CHECK: i32.sub
- ; CHECK: tee_local $push{{.+}}=, $[[dynamic2_local:.+]]=, $pop{{.+}}
+ ; CHECK: tee_local $push{{.+}}=, [[dynamic2_local:.+]], $pop{{.+}}
  %dynamic.2 = alloca i32, i32 %alloc
 
  ; CHECK-NOT: $[[FP]]=
 
  ; Ensure neither the static nor dynamic address changes after the second
  ; modification of the stack pointer.
+ ; CHECK: get_local $push[[L22:.+]]=, [[FP]]
  ; CHECK: i32.const $push[[L9:.+]]=, 104
- ; CHECK-NEXT: i32.store [[static_offset]]($[[FP]]), $pop[[L9]]
+ ; CHECK-NEXT: i32.store [[static_offset]]($pop[[L22]]), $pop[[L9]]
+ ; CHECK-NEXT: get_local $push[[L23:.+]]=, [[dynamic_local]]
  ; CHECK-NEXT: i32.const $push[[L10:.+]]=, 105
- ; CHECK-NEXT: i32.store 0($[[dynamic_local]]), $pop[[L10]]
+ ; CHECK-NEXT: i32.store 0($pop[[L23]]), $pop[[L10]]
+ ; CHECK-NEXT: get_local $push[[L23:.+]]=, [[dynamic2_local]]
  ; CHECK-NEXT: i32.const $push[[L11:.+]]=, 106
- ; CHECK-NEXT: i32.store 0($[[dynamic2_local]]), $pop[[L11]]
+ ; CHECK-NEXT: i32.store 0($pop[[L23]]), $pop[[L11]]
  store volatile i32 104, i32* %static
  store volatile i32 105, i32* %dynamic
  store volatile i32 106, i32* %dynamic.2
 
  ; Writeback to memory.
- ; CHECK: i32.const $push[[L17:.+]]=, 16
- ; CHECK-NEXT: i32.add $push[[L18:.+]]=, $[[FP]], $pop[[L17]]
- ; CHECK-NEXT: i32.store __stack_pointer($pop{{.+}}), $pop[[L18]]
+ ; CHECK: get_local $push[[L24:.+]]=, [[FP]]{{$}}
+ ; CHECK: i32.const $push[[L18:.+]]=, 16
+ ; CHECK-NEXT: i32.add $push[[L19:.+]]=, $pop[[L24]], $pop[[L18]]
+ ; CHECK-NEXT: set_global 0, $pop[[L19]]
  ret void
 }
 
@@ -237,16 +235,17 @@ declare void @llvm.stackrestore(i8*)
 
 ; CHECK-LABEL: llvm_stack_builtins:
 define void @llvm_stack_builtins(i32 %alloc) noredzone {
- ; CHECK: i32.load $push[[L11:.+]]=, __stack_pointer($pop{{.+}})
- ; CHECK-NEXT: tee_local $push[[L10:.+]]=, ${{.+}}=, $pop[[L11]]
- ; CHECK-NEXT: copy_local $[[STACK:.+]]=, $pop[[L10]]
+ ; CHECK: get_global $push[[L11:.+]]=, 0{{$}}
+ ; CHECK-NEXT: tee_local $push[[L10:.+]]=, {{.+}}, $pop[[L11]]
+ ; CHECK-NEXT: set_local [[STACK:.+]], $pop[[L10]]
  %stack = call i8* @llvm.stacksave()
 
  ; Ensure we don't reassign the stacksave local
- ; CHECK-NOT: $[[STACK]]=
+ ; CHECK-NOT: set_local [[STACK]],
  %dynamic = alloca i32, i32 %alloc
 
- ; CHECK: i32.store __stack_pointer($pop{{.+}}), $[[STACK]]
+ ; CHECK: get_local $push[[L12:.+]]=, [[STACK]]
+ ; CHECK-NEXT: set_global 0, $pop[[L12]]
  call void @llvm.stackrestore(i8* %stack)
 
  ret void
@@ -257,14 +256,15 @@ define void @llvm_stack_builtins(i32 %alloc) noredzone {
 ; moved after the stack pointer was updated for the dynamic alloca.
 ; CHECK-LABEL: dynamic_alloca_nouse:
 define void @dynamic_alloca_nouse(i32 %alloc) noredzone {
- ; CHECK: i32.load $push[[L11:.+]]=, __stack_pointer($pop{{.+}})
- ; CHECK-NEXT: tee_local $push[[L10:.+]]=, ${{.+}}=, $pop[[L11]]
- ; CHECK-NEXT: copy_local $[[FP:.+]]=, $pop[[L10]]
+ ; CHECK: get_global $push[[L11:.+]]=, 0{{$}}
+ ; CHECK-NEXT: tee_local $push[[L10:.+]]=, {{.+}}, $pop[[L11]]
+ ; CHECK-NEXT: set_local [[FP:.+]], $pop[[L10]]
  %dynamic = alloca i32, i32 %alloc
 
- ; CHECK-NOT: $[[FP]]=,
+ ; CHECK-NOT: set_local [[FP]],
 
- ; CHECK: i32.store __stack_pointer($pop{{.+}}), $[[FP]]
+ ; CHECK: get_local $push[[L12:.+]]=, [[FP]]
+ ; CHECK-NEXT: set_global 0, $pop[[L12]]
  ret void
 }
 
@@ -278,12 +278,13 @@ entry:
  %addr = alloca i32
  ; CHECK: i32.const $push[[OFF:.+]]=, 12
  ; CHECK-NEXT: i32.add $push[[ADDR:.+]]=, $pop[[L3]], $pop[[OFF]]
- ; CHECK-NEXT: copy_local [[COPY:.+]]=, $pop[[ADDR]]
+ ; CHECK-NEXT: set_local [[COPY:.+]], $pop[[ADDR]]
  br label %body
 body:
  %a = phi i32* [%addr, %entry], [%b, %body]
  store i32 1, i32* %a
- ; CHECK: i32.store 0([[COPY]]),
+ ; CHECK: get_local $push[[L12:.+]]=, [[COPY]]
+ ; CHECK: i32.store 0($pop[[L12]]),
  br i1 %cond, label %body, label %exit
 exit:
  ret void
@@ -294,13 +295,11 @@ declare i8* @llvm.frameaddress(i32)
 
 ; Test __builtin_frame_address(0).
 ; CHECK-LABEL: frameaddress_0:
-; CHECK: i32.const $push[[L0:.+]]=, 0{{$}}
-; CHECK-NEXT: i32.load $push[[L3:.+]]=, __stack_pointer($pop[[L0]])
-; CHECK-NEXT: copy_local $push[[L4:.+]]=, $pop[[L3]]{{$}}
-; CHECK-NEXT: tee_local $push[[L2:.+]]=, $[[FP:.+]]=, $pop[[L4]]{{$}}
+; CHECK: get_global $push[[L3:.+]]=, 0{{$}}
+; CHECK-NEXT: tee_local $push[[L2:.+]]=, [[FP:.+]], $pop[[L3]]{{$}}
 ; CHECK-NEXT: call use_i8_star@FUNCTION, $pop[[L2]]
-; CHECK-NEXT: i32.const $push[[L1:.+]]=, 0{{$}}
-; CHECK-NEXT: i32.store __stack_pointer($pop[[L1]]), $[[FP]]
+; CHECK-NEXT: get_local $push[[L5:.+]]=, [[FP]]
+; CHECK-NEXT: set_global 0, $pop[[L5]]
 define void @frameaddress_0() {
   %t = call i8* @llvm.frameaddress(i32 0)
   call void @use_i8_star(i8* %t)
@@ -321,7 +320,7 @@ define void @frameaddress_1() {
 
 ; Test a stack address passed to an inline asm.
 ; CHECK-LABEL: inline_asm:
-; CHECK:       __stack_pointer
+; CHECK:       get_global {{.+}}, 0{{$}}
 ; CHECK:       #APP
 ; CHECK-NEXT:  # %{{[0-9]+}}{{$}}
 ; CHECK-NEXT:  #NO_APP
diff --git a/test/CodeGen/WebAssembly/varargs.ll b/test/CodeGen/WebAssembly/varargs.ll
index c77ed10c25849df7c67511ef6de0c5470ba6d094..3f04700131cc2ee4b85d5341959912b145862b82 100644
--- a/test/CodeGen/WebAssembly/varargs.ll
+++ b/test/CodeGen/WebAssembly/varargs.ll
@@ -1,9 +1,9 @@
-; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -asm-verbose=false -disable-wasm-fallthrough-return-opt -disable-wasm-explicit-locals -verify-machineinstrs | FileCheck %s
 
 ; Test varargs constructs.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 ; Test va_start.
 
@@ -52,7 +52,6 @@ entry:
 ; CHECK-LABEL: arg_i8:
 ; CHECK-NEXT: .param     i32{{$}}
 ; CHECK-NEXT: .result    i32{{$}}
-; CHECK-NEXT: .local     i32{{$}}
 ; CHECK-NEXT: i32.load   $push[[NUM0:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: tee_local  $push[[NUM1:[0-9]+]]=, $1=, $pop[[NUM0]]{{$}}
 ; CHECK-NEXT: i32.const  $push[[NUM2:[0-9]+]]=, 4{{$}}
@@ -71,7 +70,6 @@ entry:
 ; CHECK-LABEL: arg_i32:
 ; CHECK-NEXT: .param     i32{{$}}
 ; CHECK-NEXT: .result    i32{{$}}
-; CHECK-NEXT: .local     i32{{$}}
 ; CHECK-NEXT: i32.load   $push[[NUM0:[0-9]+]]=, 0($0){{$}}
 ; CHECK-NEXT: i32.const  $push[[NUM1:[0-9]+]]=, 3{{$}}
 ; CHECK-NEXT: i32.add    $push[[NUM2:[0-9]+]]=, $pop[[NUM0]], $pop[[NUM1]]{{$}}
@@ -93,7 +91,6 @@ entry:
 
 ; CHECK-LABEL: arg_i128:
 ; CHECK-NEXT: .param i32, i32{{$}}
-; CHECK-NEXT: .local
 ; CHECK: i32.and
 ; CHECK: i64.load
 ; CHECK: i64.load
@@ -123,8 +120,8 @@ define void @caller_none() {
 ; disabling it.
 
 ; CHECK-LABEL: caller_some
-; CHECK: i32.store
-; CHECK: i64.store
+; CHECK-DAG: i32.store
+; CHECK-DAG: i64.store
 define void @caller_some() {
   call void (...) @callee(i32 0, double 2.0)
   ret void
diff --git a/test/CodeGen/WebAssembly/vtable.ll b/test/CodeGen/WebAssembly/vtable.ll
index 739ba2aaf5a58c7371aa352520df41287b68cddb..b39e7bc0f7f2b711e1bf0c7b80daedfa71fb18bf 100644
--- a/test/CodeGen/WebAssembly/vtable.ll
+++ b/test/CodeGen/WebAssembly/vtable.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -asm-verbose=false | FileCheck %s --check-prefix=TYPEINFONAME
-; RUN: llc < %s -asm-verbose=false | FileCheck %s --check-prefix=VTABLE
-; RUN: llc < %s -asm-verbose=false | FileCheck %s --check-prefix=TYPEINFO
+; RUN: llc < %s -asm-verbose=false -disable-wasm-explicit-locals | FileCheck %s --check-prefix=TYPEINFONAME
+; RUN: llc < %s -asm-verbose=false -disable-wasm-explicit-locals | FileCheck %s --check-prefix=VTABLE
+; RUN: llc < %s -asm-verbose=false -disable-wasm-explicit-locals | FileCheck %s --check-prefix=TYPEINFO
 
 ; Test that simple vtables assemble as expected.
 ;
@@ -12,7 +12,7 @@
 ; Each with a virtual dtor and method foo.
 
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 %struct.A = type { i32 (...)** }
 %struct.B = type { %struct.A }
@@ -36,7 +36,7 @@ target triple = "wasm32-unknown-unknown"
 @_ZTS1D = constant [3 x i8] c"1D\00"
 
 ; VTABLE:       .type _ZTV1A,@object
-; VTABLE-NEXT:  .section .data.rel.ro,"aw",@progbits
+; VTABLE-NEXT:  .section .data.rel.ro._ZTV1A,
 ; VTABLE-NEXT:  .globl _ZTV1A
 ; VTABLE-LABEL: _ZTV1A:
 ; VTABLE-NEXT:  .int32 0
@@ -47,6 +47,7 @@ target triple = "wasm32-unknown-unknown"
 ; VTABLE-NEXT:  .size _ZTV1A, 20
 @_ZTV1A = constant [5 x i8*] [i8* null, i8* bitcast ({ i8*, i8* }* @_ZTI1A to i8*), i8* bitcast (%struct.A* (%struct.A*)* @_ZN1AD2Ev to i8*), i8* bitcast (void (%struct.A*)* @_ZN1AD0Ev to i8*), i8* bitcast (void (%struct.A*)* @_ZN1A3fooEv to i8*)], align 4
 ; VTABLE:       .type _ZTV1B,@object
+; VTABLE-NEXT:  .section .data.rel.ro._ZTV1B,
 ; VTABLE-NEXT:  .globl _ZTV1B
 ; VTABLE-LABEL: _ZTV1B:
 ; VTABLE-NEXT:  .int32 0
@@ -57,6 +58,7 @@ target triple = "wasm32-unknown-unknown"
 ; VTABLE-NEXT:  .size _ZTV1B, 20
 @_ZTV1B = constant [5 x i8*] [i8* null, i8* bitcast ({ i8*, i8*, i8* }* @_ZTI1B to i8*), i8* bitcast (%struct.A* (%struct.A*)* @_ZN1AD2Ev to i8*), i8* bitcast (void (%struct.B*)* @_ZN1BD0Ev to i8*), i8* bitcast (void (%struct.B*)* @_ZN1B3fooEv to i8*)], align 4
 ; VTABLE:       .type _ZTV1C,@object
+; VTABLE-NEXT:  .section .data.rel.ro._ZTV1C,
 ; VTABLE-NEXT:  .globl _ZTV1C
 ; VTABLE-LABEL: _ZTV1C:
 ; VTABLE-NEXT:  .int32 0
@@ -67,6 +69,7 @@ target triple = "wasm32-unknown-unknown"
 ; VTABLE-NEXT:  .size _ZTV1C, 20
 @_ZTV1C = constant [5 x i8*] [i8* null, i8* bitcast ({ i8*, i8*, i8* }* @_ZTI1C to i8*), i8* bitcast (%struct.A* (%struct.A*)* @_ZN1AD2Ev to i8*), i8* bitcast (void (%struct.C*)* @_ZN1CD0Ev to i8*), i8* bitcast (void (%struct.C*)* @_ZN1C3fooEv to i8*)], align 4
 ; VTABLE:       .type _ZTV1D,@object
+; VTABLE-NEXT:  .section .data.rel.ro._ZTV1D,
 ; VTABLE-NEXT:  .globl _ZTV1D
 ; VTABLE-LABEL: _ZTV1D:
 ; VTABLE-NEXT:  .int32 0
diff --git a/test/CodeGen/X86/2003-11-03-GlobalBool.ll b/test/CodeGen/X86/2003-11-03-GlobalBool.ll
index f201b981a87201a4e1cecf47799cbe8588cde74f..e0d4988abbf7fe3be7cebc9b346870e657775e60 100644
--- a/test/CodeGen/X86/2003-11-03-GlobalBool.ll
+++ b/test/CodeGen/X86/2003-11-03-GlobalBool.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | \
-; RUN:   not grep ".byte[[:space:]]*true"
+; RUN: llc < %s -march=x86 | FileCheck %s
 
-@X = global i1 true             ; <i1*> [#uses=0]
+@X = global i1 true
+; CHECK-NOT: .byte true
diff --git a/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll b/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll
index dde210b776af79220537c933af300a8c1c0fc727..bd3317a68b8c734a21a3eb57efa658e770cbd49f 100644
--- a/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll
+++ b/test/CodeGen/X86/2004-02-13-FrameReturnAddress.ll
@@ -1,18 +1,23 @@
-; RUN: llc < %s -march=x86 | grep "(%esp"
-; RUN: llc < %s -march=x86 | grep "pushl	%ebp" | count 1
-; RUN: llc < %s -march=x86 | grep "popl	%ebp" | count 1
+; RUN: llc < %s -march=x86 | FileCheck %s
 
 declare i8* @llvm.returnaddress(i32)
 
 declare i8* @llvm.frameaddress(i32)
 
 define i8* @test1() {
-        %X = call i8* @llvm.returnaddress( i32 0 )              ; <i8*> [#uses=1]
-        ret i8* %X
+; CHECK-LABEL: test1:
+entry:
+  %X = call i8* @llvm.returnaddress( i32 0 )
+  ret i8* %X
+; CHECK: movl {{.*}}(%esp), %eax
 }
 
 define i8* @test2() {
-        %X = call i8* @llvm.frameaddress( i32 0 )               ; <i8*> [#uses=1]
-        ret i8* %X
+; CHECK-LABEL: test2:
+entry:
+  %X = call i8* @llvm.frameaddress( i32 0 )
+  ret i8* %X
+; CHECK: pushl %ebp
+; CHECK: popl %ebp
 }
 
diff --git a/test/CodeGen/X86/2004-02-14-InefficientStackPointer.ll b/test/CodeGen/X86/2004-02-14-InefficientStackPointer.ll
index f986ebd35f85edc71df0a860e931dbf2568b4be6..d7f7e262d89345c5d57d0208364b8da4a9346d83 100644
--- a/test/CodeGen/X86/2004-02-14-InefficientStackPointer.ll
+++ b/test/CodeGen/X86/2004-02-14-InefficientStackPointer.ll
@@ -1,5 +1,10 @@
-; RUN: llc < %s -march=x86 | grep -i ESP | not grep sub
+; RUN: llc < %s -march=x86 | FileCheck %s
+
+target triple = "i686-unknown-unknown"
 
 define i32 @test(i32 %X) {
-        ret i32 %X
+; CHECK-LABEL: test:
+entry:
+  ret i32 %X
+; CHECK-NOT: subl %esp
 }
diff --git a/test/CodeGen/X86/2005-01-17-CycleInDAG.ll b/test/CodeGen/X86/2005-01-17-CycleInDAG.ll
index 48236cd0c8fe5039cb57cd1c96cc8f17361ce55f..7bb634d97130ba69f7b7517bfc6d82740c50d82f 100644
--- a/test/CodeGen/X86/2005-01-17-CycleInDAG.ll
+++ b/test/CodeGen/X86/2005-01-17-CycleInDAG.ll
@@ -3,15 +3,18 @@
 ; is invalid code (there is no correct way to order the instruction).  Check
 ; that we do not fold the load into the sub.
 
-; RUN: llc < %s -march=x86 | not grep sub.*GLOBAL
+; RUN: llc < %s -march=x86 | FileCheck %s
 
-@GLOBAL = external global i32           ; <i32*> [#uses=1]
+@GLOBAL = external global i32
 
 define i32 @test(i32* %P1, i32* %P2, i32* %P3) nounwind {
-        %L = load i32, i32* @GLOBAL          ; <i32> [#uses=1]
-        store i32 12, i32* %P2
-        %Y = load i32, i32* %P3              ; <i32> [#uses=1]
-        %Z = sub i32 %Y, %L             ; <i32> [#uses=1]
-        ret i32 %Z
+; CHECK-LABEL: test:
+entry:
+  %L = load i32, i32* @GLOBAL
+  store i32 12, i32* %P2
+  %Y = load i32, i32* %P3
+  %Z = sub i32 %Y, %L
+  ret i32 %Z
+; CHECK-NOT: {{sub.*GLOBAL}}
 }
 
diff --git a/test/CodeGen/X86/2005-02-14-IllegalAssembler.ll b/test/CodeGen/X86/2005-02-14-IllegalAssembler.ll
index a05fc840922ffedb75d58ca180874dfaf7edbfdd..1e3a0937d5b1f3d56a4f2402525de9e039509a5e 100644
--- a/test/CodeGen/X86/2005-02-14-IllegalAssembler.ll
+++ b/test/CodeGen/X86/2005-02-14-IllegalAssembler.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -march=x86 | not grep 18446744073709551612
+; RUN: llc < %s -march=x86 | FileCheck %s
 
 @A = external global i32                ; <i32*> [#uses=1]
 @Y = global i32* getelementptr (i32, i32* @A, i32 -1)                ; <i32**> [#uses=0]
+; CHECK-NOT: 18446744073709551612
 
diff --git a/test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll b/test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll
index f6b5b2c103fe448e7b2db65e482f75c000577bde..48f5bc3e2986bdd3a868c588ebc0574576d5bf36 100644
--- a/test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll
+++ b/test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll
@@ -1,20 +1,31 @@
-; RUN: llc < %s -march=x86 -mcpu=generic | \
-; RUN:   grep shld | count 1
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
 ;
 ; Check that the isel does not fold the shld, which already folds a load
 ; and has two uses, into a store.
 
-@A = external global i32                ; <i32*> [#uses=2]
+target triple = "i686-unknown-unknown"
+
+@A = external global i32
 
 define i32 @test5(i32 %B, i8 %C) {
-        %tmp.1 = load i32, i32* @A           ; <i32> [#uses=1]
-        %shift.upgrd.1 = zext i8 %C to i32              ; <i32> [#uses=1]
-        %tmp.2 = shl i32 %tmp.1, %shift.upgrd.1         ; <i32> [#uses=1]
-        %tmp.3 = sub i8 32, %C          ; <i8> [#uses=1]
-        %shift.upgrd.2 = zext i8 %tmp.3 to i32          ; <i32> [#uses=1]
-        %tmp.4 = lshr i32 %B, %shift.upgrd.2            ; <i32> [#uses=1]
-        %tmp.5 = or i32 %tmp.4, %tmp.2          ; <i32> [#uses=2]
-        store i32 %tmp.5, i32* @A
-        ret i32 %tmp.5
+; CHECK-LABEL: test5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl A, %eax
+; CHECK-NEXT:    shldl %cl, %edx, %eax
+; CHECK-NEXT:    movl %eax, A
+; CHECK-NEXT:    retl
+entry:
+  %tmp.1 = load i32, i32* @A
+  %shift.upgrd.1 = zext i8 %C to i32
+  %tmp.2 = shl i32 %tmp.1, %shift.upgrd.1
+  %tmp.3 = sub i8 32, %C
+  %shift.upgrd.2 = zext i8 %tmp.3 to i32
+  %tmp.4 = lshr i32 %B, %shift.upgrd.2
+  %tmp.5 = or i32 %tmp.4, %tmp.2
+  store i32 %tmp.5, i32* @A
+  ret i32 %tmp.5
 }
 
diff --git a/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll b/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll
index f8bf0991fb148442ac716e3fcf903aeee74852d2..ca3eb9cda372e885ba927aeaa14551ef611e4cc5 100644
--- a/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll
+++ b/test/CodeGen/X86/2006-03-01-InstrSchedBug.ll
@@ -1,12 +1,24 @@
-; RUN: llc < %s -march=x86 | not grep "subl.*%esp"
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s
 
 define i32 @f(i32 %a, i32 %b) {
-        %tmp.2 = mul i32 %a, %a         ; <i32> [#uses=1]
-        %tmp.5 = shl i32 %a, 1          ; <i32> [#uses=1]
-        %tmp.6 = mul i32 %tmp.5, %b             ; <i32> [#uses=1]
-        %tmp.10 = mul i32 %b, %b                ; <i32> [#uses=1]
-        %tmp.7 = add i32 %tmp.10, %tmp.2                ; <i32> [#uses=1]
-        %tmp.11 = add i32 %tmp.7, %tmp.6                ; <i32> [#uses=1]
-        ret i32 %tmp.11
+; CHECK-LABEL: f:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl %ecx, %edx
+; CHECK-NEXT:    imull %edx, %edx
+; CHECK-NEXT:    imull %eax, %ecx
+; CHECK-NEXT:    imull %eax, %eax
+; CHECK-NEXT:    addl %edx, %eax
+; CHECK-NEXT:    leal (%eax,%ecx,2), %eax
+; CHECK-NEXT:    retl
+  %tmp.2 = mul i32 %a, %a
+  %tmp.5 = shl i32 %a, 1
+  %tmp.6 = mul i32 %tmp.5, %b
+  %tmp.10 = mul i32 %b, %b
+  %tmp.7 = add i32 %tmp.10, %tmp.2
+  %tmp.11 = add i32 %tmp.7, %tmp.6
+  ret i32 %tmp.11
 }
 
diff --git a/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll b/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll
index 7673124d5dda93a54ed8bf2a991cfa162ad6e230..6963b1d92f6cca8a6ccaee3b305f0bc38b235dc7 100644
--- a/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll
+++ b/test/CodeGen/X86/2006-03-02-InstrSchedBug.ll
@@ -1,13 +1,14 @@
 ; REQUIRES: asserts
-; RUN: llc < %s -march=x86  -stats 2>&1 | \
-; RUN:   grep asm-printer | grep 7
+; RUN: llc < %s -march=x86 -stats 2>&1 | FileCheck %s
+; CHECK: 7 asm-printer
 
 define i32 @g(i32 %a, i32 %b) nounwind {
-        %tmp.1 = shl i32 %b, 1          ; <i32> [#uses=1]
-        %tmp.3 = add i32 %tmp.1, %a             ; <i32> [#uses=1]
-        %tmp.5 = mul i32 %tmp.3, %a             ; <i32> [#uses=1]
-        %tmp.8 = mul i32 %b, %b         ; <i32> [#uses=1]
-        %tmp.9 = add i32 %tmp.5, %tmp.8         ; <i32> [#uses=1]
-        ret i32 %tmp.9
+entry:
+  %tmp.1 = shl i32 %b, 1
+  %tmp.3 = add i32 %tmp.1, %a
+  %tmp.5 = mul i32 %tmp.3, %a
+  %tmp.8 = mul i32 %b, %b
+  %tmp.9 = add i32 %tmp.5, %tmp.8
+  ret i32 %tmp.9
 }
 
diff --git a/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll b/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
index f159bcdee134f3ee4a95b817c97e86c97232cc40..645221fe299e947aab0f23dc001d345063b042e3 100644
--- a/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
+++ b/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
@@ -1,5 +1,4 @@
 ; RUN: llc < %s -march=x86 -mtriple=i686-darwin | FileCheck %s
-; RUN: llc < %s -march=x86 -mtriple=i686-darwin -addr-sink-using-gep=1 | FileCheck %s
 
 define void @foo(i8** %buf, i32 %size, i32 %col, i8* %p) nounwind {
 entry:
diff --git a/test/CodeGen/X86/2008-02-14-BitMiscompile.ll b/test/CodeGen/X86/2008-02-14-BitMiscompile.ll
index 259a3acd2db2367adf16553f37bb0d4044fcdce7..fdc1c3bb67bae59bdfd31b1cde131c8d93e79593 100644
--- a/test/CodeGen/X86/2008-02-14-BitMiscompile.ll
+++ b/test/CodeGen/X86/2008-02-14-BitMiscompile.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i386-unknown-unknown | FileCheck %s
 
 define i32 @test(i1 %A) {
@@ -9,7 +9,6 @@ define i32 @test(i1 %A) {
 ; CHECK-NEXT:    negl %eax
 ; CHECK-NEXT:    movzbl %al, %eax
 ; CHECK-NEXT:    retl
-;
   %B = zext i1 %A to i32
   %C = sub i32 0, %B
   %D = and i32 %C, 255
diff --git a/test/CodeGen/X86/2010-04-30-LocalAlloc-LandingPad.ll b/test/CodeGen/X86/2010-04-30-LocalAlloc-LandingPad.ll
index fc5520e12ac047ffbf1270a829c0f9470dc75386..24abb719b0f960349b7b53a05ce465a0a66b3749 100644
--- a/test/CodeGen/X86/2010-04-30-LocalAlloc-LandingPad.ll
+++ b/test/CodeGen/X86/2010-04-30-LocalAlloc-LandingPad.ll
@@ -8,9 +8,10 @@ target triple = "i386-apple-darwin10.0.0"
 @.str = internal constant [4 x i8] c"%p\0A\00"    ; <[4 x i8]*> [#uses=1]
 @llvm.used = appending global [1 x i8*] [i8* bitcast (i8* (%struct.S*, i32, %struct.S*)* @_Z4test1SiS_ to i8*)], section "llvm.metadata" ; <[1 x i8*]*> [#uses=0]
 
-; Verify that %esi gets spilled before the call.
+; Verify that %s1 gets spilled before the call.
 ; CHECK: Z4test1SiS
-; CHECK: movl %esi,{{.*}}(%ebp) 
+; CHECK: leal 8(%ebp), %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]],{{.*}}(%ebp) ## 4-byte Spill
 ; CHECK: calll __Z6throwsv
 
 define i8* @_Z4test1SiS_(%struct.S* byval %s1, i32 %n, %struct.S* byval %s2) ssp personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
diff --git a/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll b/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
index ab9715d22377e51ec03e8b0bf73191aaae326fb1..66d3f3108ec4d842313e6413888d4446aafd9f73 100644
--- a/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
+++ b/test/CodeGen/X86/2010-08-04-MaskedSignedCompare.ll
@@ -1,36 +1,58 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
 ; PR7814
 
-@g_16 = global i64 -3738643449681751625, align 8  ; <i64*> [#uses=1]
-@g_38 = global i32 0, align 4                     ; <i32*> [#uses=2]
-@.str = private constant [4 x i8] c"%d\0A\00"     ; <[4 x i8]*> [#uses=1]
+@g_16 = global i64 -3738643449681751625, align 8
+@g_38 = global i32 0, align 4
+@.str = private constant [4 x i8] c"%d\0A\00"
 
 define i32 @main() nounwind {
+; CHECK-LABEL: main:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    cmpq $0, {{.*}}(%rip)
+; CHECK-NEXT:    movb $-106, %al
+; CHECK-NEXT:    jne .LBB0_2
+; CHECK-NEXT:  # BB#1: # %entry
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:  .LBB0_2: # %entry
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jle .LBB0_3
+; CHECK-NEXT:  # BB#4: # %if.then
+; CHECK-NEXT:    movl $1, {{.*}}(%rip)
+; CHECK-NEXT:    movl $1, %esi
+; CHECK-NEXT:    jmp .LBB0_5
+; CHECK-NEXT:  .LBB0_3: # %entry.if.end_crit_edge
+; CHECK-NEXT:    movl {{.*}}(%rip), %esi
+; CHECK-NEXT:  .LBB0_5: # %if.end
+; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    movl $.L.str, %edi
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    callq printf
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    retq
 entry:
-  %tmp = load i64, i64* @g_16                          ; <i64> [#uses=1]
-  %not.lnot = icmp ne i64 %tmp, 0                 ; <i1> [#uses=1]
-  %conv = sext i1 %not.lnot to i64                ; <i64> [#uses=1]
-  %and = and i64 %conv, 150                       ; <i64> [#uses=1]
-  %conv.i = trunc i64 %and to i8                  ; <i8> [#uses=1]
-  %cmp = icmp sgt i8 %conv.i, 0                   ; <i1> [#uses=1]
+  %tmp = load i64, i64* @g_16
+  %not.lnot = icmp ne i64 %tmp, 0
+  %conv = sext i1 %not.lnot to i64
+  %and = and i64 %conv, 150
+  %conv.i = trunc i64 %and to i8
+  %cmp = icmp sgt i8 %conv.i, 0
   br i1 %cmp, label %if.then, label %entry.if.end_crit_edge
 
-; CHECK: andl	$150
-; CHECK-NEXT: testb
-; CHECK-NEXT: jle
-
-entry.if.end_crit_edge:                           ; preds = %entry
-  %tmp4.pre = load i32, i32* @g_38                     ; <i32> [#uses=1]
+entry.if.end_crit_edge:
+  %tmp4.pre = load i32, i32* @g_38
   br label %if.end
 
-if.then:                                          ; preds = %entry
+if.then:
   store i32 1, i32* @g_38
   br label %if.end
 
-if.end:                                           ; preds = %entry.if.end_crit_edge, %if.then
+if.end:
   %tmp4 = phi i32 [ %tmp4.pre, %entry.if.end_crit_edge ], [ 1, %if.then ] ; <i32> [#uses=1]
   %call5 = tail call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %tmp4) nounwind ; <i32> [#uses=0]
   ret i32 0
 }
 
 declare i32 @printf(i8* nocapture, ...) nounwind
+
diff --git a/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll b/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
index b7380196bd9baacd8af216e5a17015035945fe75..54a7763eb696feaa6cc3bead9e9601712fd45d88 100644
--- a/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
+++ b/test/CodeGen/X86/2010-09-17-SideEffectsInChain.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -combiner-alias-analysis -march=x86-64 -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=core2 | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.4"
diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
index b52726962405cb5a2186a3ef1d9585874fcb5541..ba5de8eb5fcb76e3df9f5fe6c7b2ec99d0bee2f5 100644
--- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -1,23 +1,44 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mcpu=corei7   | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s --check-prefix=X64
 
 ; Make sure that we don't crash when legalizing vselect and vsetcc and that
 ; we are able to generate vector blend instructions.
 
-; CHECK-LABEL: simple_widen
-; CHECK-NOT: blend
-; CHECK: ret
 define void @simple_widen(<2 x float> %a, <2 x float> %b) {
+; X32-LABEL: simple_widen:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    extractps $1, %xmm1, (%eax)
+; X32-NEXT:    movss %xmm1, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: simple_widen:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movlps %xmm1, (%rax)
+; X64-NEXT:    retq
 entry:
   %0 = select <2 x i1> undef, <2 x float> %a, <2 x float> %b
   store <2 x float> %0, <2 x float>* undef
   ret void
 }
 
-; CHECK-LABEL: complex_inreg_work
-; CHECK: blend
-; CHECK: ret
-
 define void @complex_inreg_work(<2 x float> %a, <2 x float> %b) {
+; X32-LABEL: complex_inreg_work:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movaps %xmm0, %xmm2
+; X32-NEXT:    cmpordps %xmm0, %xmm0
+; X32-NEXT:    blendvps %xmm0, %xmm2, %xmm1
+; X32-NEXT:    extractps $1, %xmm1, (%eax)
+; X32-NEXT:    movss %xmm1, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: complex_inreg_work:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movaps %xmm0, %xmm2
+; X64-NEXT:    cmpordps %xmm0, %xmm0
+; X64-NEXT:    blendvps %xmm0, %xmm2, %xmm1
+; X64-NEXT:    movlps %xmm1, (%rax)
+; X64-NEXT:    retq
 entry:
   %0 = fcmp oeq <2 x float> undef, undef
   %1 = select <2 x i1> %0, <2 x float> %a, <2 x float> %b
@@ -25,22 +46,67 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: zero_test
-; CHECK: xorps %xmm0, %xmm0
-; CHECK: ret
-
 define void @zero_test() {
+; X32-LABEL: zero_test:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    pxor %xmm0, %xmm0
+; X32-NEXT:    pextrd $1, %xmm0, (%eax)
+; X32-NEXT:    movd %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: zero_test:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    movlps %xmm0, (%rax)
+; X64-NEXT:    retq
 entry:
   %0 = select <2 x i1> undef, <2 x float> undef, <2 x float> zeroinitializer
   store <2 x float> %0, <2 x float>* undef
   ret void
 }
 
-; CHECK-LABEL: full_test
-; CHECK: blend
-; CHECK: ret
-
 define void @full_test() {
+; X32-LABEL: full_test:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    subl $60, %esp
+; X32-NEXT:  .Lcfi0:
+; X32-NEXT:    .cfi_def_cfa_offset 64
+; X32-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
+; X32-NEXT:    cvttps2dq %xmm2, %xmm0
+; X32-NEXT:    cvtdq2ps %xmm0, %xmm1
+; X32-NEXT:    xorps %xmm0, %xmm0
+; X32-NEXT:    cmpltps %xmm2, %xmm0
+; X32-NEXT:    movaps {{.*#+}} xmm3 = <1,1,u,u>
+; X32-NEXT:    addps %xmm1, %xmm3
+; X32-NEXT:    movaps %xmm1, %xmm4
+; X32-NEXT:    blendvps %xmm0, %xmm3, %xmm4
+; X32-NEXT:    cmpeqps %xmm2, %xmm1
+; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    blendvps %xmm0, %xmm2, %xmm4
+; X32-NEXT:    extractps $1, %xmm4, {{[0-9]+}}(%esp)
+; X32-NEXT:    movss %xmm4, {{[0-9]+}}(%esp)
+; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    addl $60, %esp
+; X32-NEXT:    retl
+;
+; X64-LABEL: full_test:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
+; X64-NEXT:    cvttps2dq %xmm2, %xmm0
+; X64-NEXT:    cvtdq2ps %xmm0, %xmm1
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    cmpltps %xmm2, %xmm0
+; X64-NEXT:    movaps {{.*#+}} xmm3 = <1,1,u,u>
+; X64-NEXT:    addps %xmm1, %xmm3
+; X64-NEXT:    movaps %xmm1, %xmm4
+; X64-NEXT:    blendvps %xmm0, %xmm3, %xmm4
+; X64-NEXT:    cmpeqps %xmm2, %xmm1
+; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    blendvps %xmm0, %xmm2, %xmm4
+; X64-NEXT:    movlps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movlps %xmm4, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    retq
  entry:
    %Cy300 = alloca <4 x float>
    %Cy11a = alloca <2 x float>
@@ -62,5 +128,3 @@ define void @full_test() {
    store <2 x float> %8, <2 x float>* %Cy11a
    ret void
 }
-
-
diff --git a/test/CodeGen/X86/2011-10-21-widen-cmp.ll b/test/CodeGen/X86/2011-10-21-widen-cmp.ll
index 9e6e2f70b0a75060e4a83707f3443fb2c21b9f56..9232eba213bfa1cbad261de5d0eaf0729c951905 100644
--- a/test/CodeGen/X86/2011-10-21-widen-cmp.ll
+++ b/test/CodeGen/X86/2011-10-21-widen-cmp.ll
@@ -9,9 +9,7 @@ define void @cmp_2_floats(<2 x float> %a, <2 x float> %b) {
 ; CHECK:       # BB#0: # %entry
 ; CHECK-NEXT:    movaps %xmm0, %xmm2
 ; CHECK-NEXT:    cmpordps %xmm0, %xmm0
-; CHECK-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[1]
-; CHECK-NEXT:    pslld $31, %xmm0
-; CHECK-NEXT:    blendvps %xmm2, %xmm1
+; CHECK-NEXT:    blendvps %xmm0, %xmm2, %xmm1
 ; CHECK-NEXT:    movlps %xmm1, (%rax)
 ; CHECK-NEXT:    retq
 entry:
@@ -26,7 +24,7 @@ define void @cmp_2_doubles(<2 x double> %a, <2 x double> %b) {
 ; CHECK:       # BB#0: # %entry
 ; CHECK-NEXT:    movapd %xmm0, %xmm2
 ; CHECK-NEXT:    cmpordpd %xmm0, %xmm0
-; CHECK-NEXT:    blendvpd %xmm2, %xmm1
+; CHECK-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; CHECK-NEXT:    movapd %xmm1, (%rax)
 ; CHECK-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/2011-11-30-or.ll b/test/CodeGen/X86/2011-11-30-or.ll
index 8378a022eab7e01b3cc4d81152eca533cf625e9e..5c324a423923eb058957d24046c16752b4cd9750 100644
--- a/test/CodeGen/X86/2011-11-30-or.ll
+++ b/test/CodeGen/X86/2011-11-30-or.ll
@@ -5,7 +5,7 @@ target triple = "x86_64-apple-macosx10.6.6"
 
 ; Test that the order of operands is correct
 ; CHECK: select_func
-; CHECK: pblendvb        {{LCPI0_[0-9]*}}(%rip), %xmm1
+; CHECK: pblendvb        %xmm0, {{LCPI0_[0-9]*}}(%rip), %xmm1
 ; CHECK: ret
 
 define void @select_func(<8 x i16> %in) {
diff --git a/test/CodeGen/X86/2011-12-15-vec_shift.ll b/test/CodeGen/X86/2011-12-15-vec_shift.ll
index 4d49b3af88ee1cccc9020ae6e4686a0700068b39..70783509bb7fde397945d538e6b818bb805fdda1 100644
--- a/test/CodeGen/X86/2011-12-15-vec_shift.ll
+++ b/test/CodeGen/X86/2011-12-15-vec_shift.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-apple-macosx10.7"
 define <16 x i8> @shift(<16 x i8> %a, <16 x i8> %b) nounwind {
   ; Make sure operands to pblend are in the right order.
   ; CHECK-W-SSE4: psllw $4, [[REG1:%xmm.]]
-  ; CHECK-W-SSE4: pblendvb [[REG1]],{{ %xmm.}}
+  ; CHECK-W-SSE4: pblendvb %xmm0, [[REG1]],{{ %xmm.}}
   ; CHECK-W-SSE4: psllw $2
 
   ; Make sure we're masking and pcmp'ing the VSELECT conditon vector.
diff --git a/test/CodeGen/X86/2011-12-8-bitcastintprom.ll b/test/CodeGen/X86/2011-12-8-bitcastintprom.ll
index 0cae34c9dfca60ed8e13db23ae74ee0be20a59bf..e2ccaa1b837862b851b23603312fec729416a83a 100644
--- a/test/CodeGen/X86/2011-12-8-bitcastintprom.ll
+++ b/test/CodeGen/X86/2011-12-8-bitcastintprom.ll
@@ -18,7 +18,6 @@ define void @prom_bug(<4 x i8> %t, i16* %p) {
 ; SSE41-LABEL: prom_bug:
 ; SSE41:       ## BB#0:
 ; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSE41-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; SSE41-NEXT:    pextrw $0, %xmm0, (%rdi)
 ; SSE41-NEXT:    retq
   %r = bitcast <4 x i8> %t to <2 x i16>
diff --git a/test/CodeGen/X86/2012-07-10-extload64.ll b/test/CodeGen/X86/2012-07-10-extload64.ll
index a366102fbd74251b22e0ddae49b549e7f96c0dc0..a41123e40a5863e83fed06089f000a5ed029ac8b 100644
--- a/test/CodeGen/X86/2012-07-10-extload64.ll
+++ b/test/CodeGen/X86/2012-07-10-extload64.ll
@@ -1,32 +1,42 @@
-; RUN: llc < %s -march=x86 -mcpu=corei7 -mtriple=i686-pc-win32 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-pc-win32 -mcpu=corei7 | FileCheck %s
 
-; CHECK: load_store
 define void @load_store(<4 x i16>* %in) {
+; CHECK-LABEL: load_store:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    pmovzxwd {{.*#+}} xmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; CHECK-NEXT:    paddw %xmm0, %xmm0
+; CHECK-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; CHECK-NEXT:    movq %xmm0, (%eax)
+; CHECK-NEXT:    retl
 entry:
-; CHECK: pmovzxwd
   %A27 = load <4 x i16>, <4 x i16>* %in, align 4
   %A28 = add <4 x i16> %A27, %A27
-; CHECK: movq
   store <4 x i16> %A28, <4 x i16>* %in, align 4
   ret void
-; CHECK: ret
 }
 
 ; Make sure that we store a 64bit value, even on 32bit systems.
-;CHECK-LABEL: store_64:
 define void @store_64(<2 x i32>* %ptr) {
+; CHECK-LABEL: store_64:
+; CHECK:       # BB#0: # %BB
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    movlps %xmm0, (%eax)
+; CHECK-NEXT:    retl
 BB:
   store <2 x i32> zeroinitializer, <2 x i32>* %ptr
   ret void
-;CHECK: movlps
-;CHECK: ret
 }
 
-;CHECK-LABEL: load_64:
 define <2 x i32> @load_64(<2 x i32>* %ptr) {
+; CHECK-LABEL: load_64:
+; CHECK:       # BB#0: # %BB
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    pmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
+; CHECK-NEXT:    retl
 BB:
   %t = load <2 x i32>, <2 x i32>* %ptr
   ret <2 x i32> %t
-;CHECK: pmovzxdq
-;CHECK: ret
 }
diff --git a/test/CodeGen/X86/2012-11-28-merge-store-alias.ll b/test/CodeGen/X86/2012-11-28-merge-store-alias.ll
index c16deeff3d99a01d47d1dbb0606ee63119a8e029..2e8206a75916304efd823a2b4bc4d3e44d979ac5 100644
--- a/test/CodeGen/X86/2012-11-28-merge-store-alias.ll
+++ b/test/CodeGen/X86/2012-11-28-merge-store-alias.ll
@@ -3,8 +3,8 @@
 ; CHECK: merge_stores_can
 ; CHECK: callq foo
 ; CHECK: xorps %xmm0, %xmm0
-; CHECK-NEXT: movl 36(%rsp), %ebp
 ; CHECK-NEXT: movups  %xmm0
+; CHECK-NEXT: movl 36(%rsp), %ebp
 ; CHECK: callq foo
 ; CHECK: ret
 declare i32 @foo([10 x i32]* )
diff --git a/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll b/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9dff4e596caa3f6e468670a674ec2f0e7c5cd815
--- /dev/null
+++ b/test/CodeGen/X86/DynamicCalleeSavedRegisters.ll
@@ -0,0 +1,60 @@
+; RUN: llc < %s -mtriple=i386-linux-gnu  | FileCheck --check-prefix=CHECK %s
+
+declare x86_regcallcc i32 @callee(i32 %a0, i32 %b0, i32 %c0, i32 %d0, i32 %e0);
+
+; In RegCall calling convention, ESI and EDI are callee saved registers.
+; One might think that the caller could assume that ESI value is the same before
+; and after calling the callee.
+; However, RegCall also says that a register that was used for
+; passing/returning arguments can be assumed to be modified by the callee.
+; In other words, it is no longer a callee saved register.
+; In this case we want to see that EDX/ECX values are saved and EDI/ESI are assumed
+; to be modified by the callee.
+; This is a HiPE CC function that doesn't save any register for the caller.
+; So we can be sure that there is no other reason to save EDX/ECX.
+; The caller arguments are expected to be passed (in the following order) 
+; in registers: ESI, EBP, EAX, EDX and ECX.
+define cc 11 i32 @caller(i32 %a0, i32 %b0, i32 %c0, i32 %d0, i32 %e0) nounwind {
+  %b1 = call x86_regcallcc i32 @callee(i32 %a0, i32 %b0, i32 %c0, i32 %d0, i32 %e0)
+  %b2 = add i32 %b1, %d0
+  %b3 = add i32 %b2, %e0
+  ret i32 %b3
+}
+; CHECK-LABEL:  caller
+; CHECK:        subl    $12, %esp
+; CHECK-NEXT:   movl    %ecx, 8(%esp)
+; CHECK-NEXT:   movl    %edx, %ebx
+; CHECK-NEXT:   movl    %eax, %edx
+; CHECK-NEXT:   movl    %esi, %eax
+; CHECK-NEXT:   movl    %ebp, %ecx
+; CHECK-NEXT:   movl    %ebx, %edi
+; CHECK-NEXT:   movl    8(%esp), %ebp
+; CHECK-NEXT:   movl    %ebp, %esi
+; CHECK-NEXT:   calll   callee
+; CHECK-NEXT:   leal    (%eax,%ebx), %esi
+; CHECK-NEXT:   addl    %ebp, %esi
+; CHECK-NEXT:   addl    $12, %esp
+; CHECK-NEXT:   retl
+
+!hipe.literals = !{ !0, !1, !2 }
+!0 = !{ !"P_NSP_LIMIT", i32 120 }
+!1 = !{ !"X86_LEAF_WORDS", i32 24 }
+!2 = !{ !"AMD64_LEAF_WORDS", i32 18 }
+
+; Make sure that the callee doesn't save the registers that were used to pass its arguments.
+; The caller arguments are expected to be passed (in the following order) 
+; in registers: EAX, ECX, EDX, EDI and ESI.
+; The result will return in EAX, ECX and EDX.
+define x86_regcallcc {i32, i32, i32} @test_callee(i32 %a0, i32 %b0, i32 %c0, i32 %d0, i32 %e0) nounwind {
+  %b1 = mul i32 7, %e0
+  %b2 = udiv i32 5, %e0
+  %b3 = mul i32 7, %d0
+  %b4 = insertvalue {i32, i32, i32} undef, i32 %b1, 0
+  %b5 = insertvalue {i32, i32, i32} %b4, i32 %b2, 1
+  %b6 = insertvalue {i32, i32, i32} %b5, i32 %b3, 2
+  ret {i32, i32, i32} %b6
+}
+; CHECK-LABEL: test_callee
+; CHECK-NOT:   pushl %esi
+; CHECK-NOT:   pushl %edi
+; CHECK:       retl
diff --git a/test/CodeGen/X86/GlobalISel/X86-regbankselect.mir b/test/CodeGen/X86/GlobalISel/X86-regbankselect.mir
new file mode 100644
index 0000000000000000000000000000000000000000..c4e5fb2d05fc0348ccd39e0640063a099589ca8d
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/X86-regbankselect.mir
@@ -0,0 +1,634 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel                       -run-pass=regbankselect %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=FAST
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -regbankselect-greedy -run-pass=regbankselect %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=GREEDY
+
+--- |
+  ; ModuleID = 'tmp.ll'
+  source_filename = "tmp.ll"
+  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+  target triple = "x86_64--linux-gnu"
+
+  define i8 @test_add_i8(i8 %arg1, i8 %arg2) {
+    %ret = add i8 %arg1, %arg2
+    ret i8 %ret
+  }
+
+  define i16 @test_add_i16(i16 %arg1, i16 %arg2) {
+    %ret = add i16 %arg1, %arg2
+    ret i16 %ret
+  }
+
+  define i32 @test_add_i32(i32 %arg1, i32 %arg2) {
+    %ret = add i32 %arg1, %arg2
+    ret i32 %ret
+  }
+
+  define i64 @test_add_i64(i64 %arg1, i64 %arg2) {
+    %ret = add i64 %arg1, %arg2
+    ret i64 %ret
+  }
+
+  define float @test_add_float(float %arg1, float %arg2) {
+    %ret = fadd float %arg1, %arg2
+    ret float %ret
+  }
+
+  define double @test_add_double(double %arg1, double %arg2) {
+    %ret = fadd double %arg1, %arg2
+    ret double %ret
+  }
+
+  define <4 x i32> @test_add_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
+    %ret = add <4 x i32> %arg1, %arg2
+    ret <4 x i32> %ret
+  }
+
+  define <4 x float> @test_add_v4f32(<4 x float> %arg1, <4 x float> %arg2) {
+    %ret = fadd <4 x float> %arg1, %arg2
+    ret <4 x float> %ret
+  }
+
+  define i8 @test_load_i8(i8* %p1) {
+    %r = load i8, i8* %p1
+    ret i8 %r
+  }
+
+  define i16 @test_load_i16(i16* %p1) {
+    %r = load i16, i16* %p1
+    ret i16 %r
+  }
+
+  define i32 @test_load_i32(i32* %p1) {
+    %r = load i32, i32* %p1
+    ret i32 %r
+  }
+
+  define i64 @test_load_i64(i64* %p1) {
+    %r = load i64, i64* %p1
+    ret i64 %r
+  }
+
+  define float @test_load_float(float* %p1) {
+    %r = load float, float* %p1
+    ret float %r
+  }
+
+  define double @test_load_double(double* %p1) {
+    %r = load double, double* %p1
+    ret double %r
+  }
+
+  define <4 x i32> @test_load_v4i32(<4 x i32>* %p1) {
+    %r = load <4 x i32>, <4 x i32>* %p1, align 16
+    ret <4 x i32> %r
+  }
+
+  define i32* @test_store_i32(i32 %val, i32* %p1) {
+    store i32 %val, i32* %p1
+    ret i32* %p1
+  }
+
+  define i64* @test_store_i64(i64 %val, i64* %p1) {
+    store i64 %val, i64* %p1
+    ret i64* %p1
+  }
+
+  define float* @test_store_float(float %val, float* %p1) {
+    store float %val, float* %p1
+    ret float* %p1
+  }
+
+  define double* @test_store_double(double %val, double* %p1) {
+    store double %val, double* %p1
+    ret double* %p1
+  }
+
+  define void @constInt_check() {
+    ret void
+  }
+
+...
+---
+name:            test_add_i8
+alignment:       4
+legalized:       true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+# CHECK-LABEL: name:            test_add_i8
+# CHECK: registers:
+# CHECK:  - { id: 0, class: gpr }
+# CHECK:  - { id: 1, class: gpr }
+# CHECK:  - { id: 2, class: gpr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %edi, %esi
+
+    %0(s8) = COPY %edi
+    %1(s8) = COPY %esi
+    %2(s8) = G_ADD %0, %1
+    %al = COPY %2(s8)
+    RET 0, implicit %al
+
+...
+---
+name:            test_add_i16
+alignment:       4
+legalized:       true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+# CHECK-LABEL: name:            test_add_i16
+# CHECK: registers:
+# CHECK:  - { id: 0, class: gpr }
+# CHECK:  - { id: 1, class: gpr }
+# CHECK:  - { id: 2, class: gpr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %edi, %esi
+
+    %0(s16) = COPY %edi
+    %1(s16) = COPY %esi
+    %2(s16) = G_ADD %0, %1
+    %ax = COPY %2(s16)
+    RET 0, implicit %ax
+
+...
+---
+name:            test_add_i32
+alignment:       4
+legalized:       true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+# CHECK-LABEL: name:            test_add_i32
+# CHECK: registers:
+# CHECK:  - { id: 0, class: gpr }
+# CHECK:  - { id: 1, class: gpr }
+# CHECK:  - { id: 2, class: gpr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %edi, %esi
+
+    %0(s32) = COPY %edi
+    %1(s32) = COPY %esi
+    %2(s32) = G_ADD %0, %1
+    %eax = COPY %2(s32)
+    RET 0, implicit %eax
+
+...
+---
+name:            test_add_i64
+alignment:       4
+legalized:       true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+# CHECK-LABEL: name:            test_add_i64
+# CHECK: registers:
+# CHECK:  - { id: 0, class: gpr }
+# CHECK:  - { id: 1, class: gpr }
+# CHECK:  - { id: 2, class: gpr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi, %rsi
+
+    %0(s64) = COPY %rdi
+    %1(s64) = COPY %rsi
+    %2(s64) = G_ADD %0, %1
+    %rax = COPY %2(s64)
+    RET 0, implicit %rax
+
+...
+---
+name:            test_add_float
+alignment:       4
+legalized:       true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+# CHECK-LABEL: name:            test_add_float
+# CHECK: registers:
+# CHECK:  - { id: 0, class: vecr }
+# CHECK:  - { id: 1, class: vecr }
+# CHECK:  - { id: 2, class: vecr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %xmm0, %xmm1
+
+    %0(s32) = COPY %xmm0
+    %1(s32) = COPY %xmm1
+    %2(s32) = G_FADD %0, %1
+    %xmm0 = COPY %2(s32)
+    RET 0, implicit %xmm0
+
+...
+---
+name:            test_add_double
+alignment:       4
+legalized:       true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+# CHECK-LABEL: name:            test_add_double
+# CHECK: registers:
+# CHECK:  - { id: 0, class: vecr }
+# CHECK:  - { id: 1, class: vecr }
+# CHECK:  - { id: 2, class: vecr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %xmm0, %xmm1
+
+    %0(s64) = COPY %xmm0
+    %1(s64) = COPY %xmm1
+    %2(s64) = G_FADD %0, %1
+    %xmm0 = COPY %2(s64)
+    RET 0, implicit %xmm0
+
+...
+---
+name:            test_add_v4i32
+alignment:       4
+legalized:       true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+# CHECK-LABEL: name:            test_add_v4i32
+# CHECK: registers:
+# CHECK:  - { id: 0, class: vecr }
+# CHECK:  - { id: 1, class: vecr }
+# CHECK:  - { id: 2, class: vecr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %xmm0, %xmm1
+
+    %0(<4 x s32>) = COPY %xmm0
+    %1(<4 x s32>) = COPY %xmm1
+    %2(<4 x s32>) = G_ADD %0, %1
+    %xmm0 = COPY %2(<4 x s32>)
+    RET 0, implicit %xmm0
+
+...
+---
+name:            test_add_v4f32
+alignment:       4
+legalized:       true
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+# CHECK-LABEL: name:            test_add_v4f32
+# CHECK: registers:
+# CHECK:  - { id: 0, class: vecr }
+# CHECK:  - { id: 1, class: vecr }
+# CHECK:  - { id: 2, class: vecr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %xmm0, %xmm1
+
+    %0(<4 x s32>) = COPY %xmm0
+    %1(<4 x s32>) = COPY %xmm1
+    %2(<4 x s32>) = G_FADD %0, %1
+    %xmm0 = COPY %2(<4 x s32>)
+    RET 0, implicit %xmm0
+
+...
+---
+name:            test_load_i8
+alignment:       4
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK-LABEL: name:            test_load_i8
+# CHECK: registers:
+# CHECK:   - { id: 0, class: gpr }
+# CHECK:   - { id: 1, class: gpr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(s8) = G_LOAD %0(p0) :: (load 1 from %ir.p1)
+    %al = COPY %1(s8)
+    RET 0, implicit %al
+
+...
+---
+name:            test_load_i16
+alignment:       4
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK-LABEL: name:            test_load_i16
+# CHECK: registers:
+# CHECK:   - { id: 0, class: gpr }
+# CHECK:   - { id: 1, class: gpr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(s16) = G_LOAD %0(p0) :: (load 2 from %ir.p1)
+    %ax = COPY %1(s16)
+    RET 0, implicit %ax
+
+...
+---
+name:            test_load_i32
+alignment:       4
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK-LABEL: name:            test_load_i32
+# CHECK: registers:
+# CHECK:   - { id: 0, class: gpr }
+# CHECK:   - { id: 1, class: gpr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1)
+    %eax = COPY %1(s32)
+    RET 0, implicit %eax
+
+...
+---
+name:            test_load_i64
+alignment:       4
+exposesReturnsTwice: false
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK-LABEL: name:            test_load_i64
+# CHECK: registers:
+# CHECK:   - { id: 0, class: gpr }
+# CHECK:   - { id: 1, class: gpr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1)
+    %rax = COPY %1(s64)
+    RET 0, implicit %rax
+
+...
+---
+name:            test_load_float
+alignment:       4
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK-LABEL: name:            test_load_float
+# CHECK: registers:
+# CHECK:   - { id: 0, class: gpr }
+# CHECK:   - { id: 1, class: gpr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1)
+    %xmm0 = COPY %1(s32)
+    RET 0, implicit %xmm0
+
+...
+---
+name:            test_load_double
+alignment:       4
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK-LABEL: name:            test_load_double
+# CHECK: registers:
+# CHECK:   - { id: 0, class: gpr }
+# CHECK:   - { id: 1, class: gpr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1)
+    %xmm0 = COPY %1(s64)
+    RET 0, implicit %xmm0
+
+...
+---
+name:            test_load_v4i32
+alignment:       4
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK-LABEL: name:            test_load_v4i32
+# CHECK: registers:
+# CHECK:   - { id: 0, class: gpr }
+# CHECK:   - { id: 1, class: vecr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1, align 1)
+    %xmm0 = COPY %1(<4 x s32>)
+    RET 0, implicit %xmm0
+
+...
+---
+name:            test_store_i32
+alignment:       4
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK-LABEL: name:            test_store_i32
+# CHECK: registers:
+# CHECK:   - { id: 0, class: gpr }
+# CHECK:   - { id: 1, class: gpr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %edi, %rsi
+
+    %0(s32) = COPY %edi
+    %1(p0) = COPY %rsi
+    G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1)
+    %rax = COPY %1(p0)
+    RET 0, implicit %rax
+
+...
+---
+name:            test_store_i64
+alignment:       4
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK-LABEL: name:            test_store_i64
+# CHECK: registers:
+# CHECK:   - { id: 0, class: gpr }
+# CHECK:   - { id: 1, class: gpr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi, %rsi
+
+    %0(s64) = COPY %rdi
+    %1(p0) = COPY %rsi
+    G_STORE %0(s64), %1(p0) :: (store 8 into %ir.p1)
+    %rax = COPY %1(p0)
+    RET 0, implicit %rax
+
+...
+---
+name:            test_store_float
+alignment:       4
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK-LABEL: name:            test_store_float
+# CHECK: registers:
+
+# FAST-NEXT:    - { id: 0, class: vecr }
+# FAST-NEXT:    - { id: 1, class: gpr }
+# FAST-NEXT:    - { id: 2, class: gpr }
+
+# GREEDY-NEXT:    - { id: 0, class: vecr }
+# GREEDY-NEXT:    - { id: 1, class: gpr }
+
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi, %xmm0
+
+    %0(s32) = COPY %xmm0
+    %1(p0) = COPY %rdi
+    ; CHECK:      %1(p0) = COPY %rdi
+
+    ; FAST-NEXT:  %2(s32) = COPY %0(s32)
+    ; FAST-NEXT:  G_STORE %2(s32), %1(p0) :: (store 4 into %ir.p1)
+
+    ; GREEDY-NEXT:  G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1)
+
+    G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1)
+    %rax = COPY %1(p0)
+    RET 0, implicit %rax
+
+...
+---
+name:            test_store_double
+alignment:       4
+legalized:       true
+regBankSelected: false
+selected:        false
+# CHECK-LABEL: name:            test_store_double
+# CHECK: registers:
+
+# FAST-NEXT:    - { id: 0, class: vecr }
+# FAST-NEXT:    - { id: 1, class: gpr }
+# FAST-NEXT:    - { id: 2, class: gpr }
+
+# GREEDY-NEXT:    - { id: 0, class: vecr }
+# GREEDY-NEXT:    - { id: 1, class: gpr }
+
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi, %xmm0
+
+    %0(s64) = COPY %xmm0
+    %1(p0) = COPY %rdi
+
+    ; CHECK:       %1(p0) = COPY %rdi
+
+    ; FAST-NEXT:   %2(s64) = COPY %0(s64)
+    ; FAST-NEXT:   G_STORE %2(s64), %1(p0) :: (store 8 into %ir.p1)
+
+    ; GREEDY-NEXT: G_STORE %0(s64), %1(p0) :: (store 8 into %ir.p1)
+
+    G_STORE %0(s64), %1(p0) :: (store 8 into %ir.p1)
+    %rax = COPY %1(p0)
+    RET 0, implicit %rax
+
+...
+---
+name:            constInt_check
+alignment:       4
+legalized:       true
+# CHECK-LABEL: name:            constInt_check
+# CHECK: registers:
+# CHECK-NEXT:  - { id: 0, class: gpr }
+# CHECK-NEXT:  - { id: 1, class: gpr }
+# CHECK-NEXT:  - { id: 2, class: gpr }
+# CHECK-NEXT:  - { id: 3, class: gpr }
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+body:             |
+  bb.0 (%ir-block.0):
+    %0(s8) = G_CONSTANT i8 8
+    %1(s16) = G_CONSTANT i16 16
+    %2(s32) = G_CONSTANT i32 32
+    %3(s64) = G_CONSTANT i64 64
+    RET 0
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/binop-isel.ll b/test/CodeGen/X86/GlobalISel/binop-isel.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8499dd958447b39833f69cf53a13038b80d34b4d
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/binop-isel.ll
@@ -0,0 +1,186 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu                                  -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx                      -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f                  -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512F
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512VL
+
+define i64 @test_add_i64(i64 %arg1, i64 %arg2) {
+; ALL-LABEL: test_add_i64:
+; ALL:       # BB#0:
+; ALL-NEXT:    leaq (%rsi,%rdi), %rax
+; ALL-NEXT:    retq
+  %ret = add i64 %arg1, %arg2
+  ret i64 %ret
+}
+
+define i32 @test_add_i32(i32 %arg1, i32 %arg2) {
+; ALL-LABEL: test_add_i32:
+; ALL:       # BB#0:
+; ALL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; ALL-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
+; ALL-NEXT:    leal (%rsi,%rdi), %eax
+; ALL-NEXT:    retq
+  %ret = add i32 %arg1, %arg2
+  ret i32 %ret
+}
+
+define i64 @test_sub_i64(i64 %arg1, i64 %arg2) {
+; ALL-LABEL: test_sub_i64:
+; ALL:       # BB#0:
+; ALL-NEXT:    subq %rsi, %rdi
+; ALL-NEXT:    movq %rdi, %rax
+; ALL-NEXT:    retq
+  %ret = sub i64 %arg1, %arg2
+  ret i64 %ret
+}
+
+define i32 @test_sub_i32(i32 %arg1, i32 %arg2) {
+; ALL-LABEL: test_sub_i32:
+; ALL:       # BB#0:
+; ALL-NEXT:    subl %esi, %edi
+; ALL-NEXT:    movl %edi, %eax
+; ALL-NEXT:    retq
+  %ret = sub i32 %arg1, %arg2
+  ret i32 %ret
+}
+
+define float @test_add_float(float %arg1, float %arg2) {
+; SSE-LABEL: test_add_float:
+; SSE:       # BB#0:
+; SSE-NEXT:    addss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; ALL_AVX-LABEL: test_add_float:
+; ALL_AVX:       # BB#0:
+; ALL_AVX-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; ALL_AVX-NEXT:    retq
+  %ret = fadd float %arg1, %arg2
+  ret float %ret
+}
+
+define double @test_add_double(double %arg1, double %arg2) {
+; SSE-LABEL: test_add_double:
+; SSE:       # BB#0:
+; SSE-NEXT:    addsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; ALL_AVX-LABEL: test_add_double:
+; ALL_AVX:       # BB#0:
+; ALL_AVX-NEXT:    vaddsd %xmm1, %xmm0, %xmm0
+; ALL_AVX-NEXT:    retq
+  %ret = fadd double %arg1, %arg2
+  ret double %ret
+}
+
+define float @test_sub_float(float %arg1, float %arg2) {
+; SSE-LABEL: test_sub_float:
+; SSE:       # BB#0:
+; SSE-NEXT:    subss %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; ALL_AVX-LABEL: test_sub_float:
+; ALL_AVX:       # BB#0:
+; ALL_AVX-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; ALL_AVX-NEXT:    retq
+  %ret = fsub float %arg1, %arg2
+  ret float %ret
+}
+
+define double @test_sub_double(double %arg1, double %arg2) {
+; SSE-LABEL: test_sub_double:
+; SSE:       # BB#0:
+; SSE-NEXT:    subsd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; ALL_AVX-LABEL: test_sub_double:
+; ALL_AVX:       # BB#0:
+; ALL_AVX-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
+; ALL_AVX-NEXT:    retq
+  %ret = fsub double %arg1, %arg2
+  ret double %ret
+}
+
+define <4 x i32>  @test_add_v4i32(<4 x i32> %arg1, <4 x i32>  %arg2) {
+; SSE-LABEL: test_add_v4i32:
+; SSE:       # BB#0:
+; SSE-NEXT:    paddd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; ALL_AVX-LABEL: test_add_v4i32:
+; ALL_AVX:       # BB#0:
+; ALL_AVX-NEXT:    vpaddd %xmm1, %xmm0, %xmm0
+; ALL_AVX-NEXT:    retq
+  %ret = add <4 x i32>  %arg1, %arg2
+  ret <4 x i32>  %ret
+}
+
+define <4 x i32>  @test_sub_v4i32(<4 x i32> %arg1, <4 x i32>  %arg2) {
+; SSE-LABEL: test_sub_v4i32:
+; SSE:       # BB#0:
+; SSE-NEXT:    psubd %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; ALL_AVX-LABEL: test_sub_v4i32:
+; ALL_AVX:       # BB#0:
+; ALL_AVX-NEXT:    vpsubd %xmm1, %xmm0, %xmm0
+; ALL_AVX-NEXT:    retq
+  %ret = sub <4 x i32>  %arg1, %arg2
+  ret <4 x i32>  %ret
+}
+
+define <4 x float>  @test_add_v4f32(<4 x float> %arg1, <4 x float>  %arg2) {
+; SSE-LABEL: test_add_v4f32:
+; SSE:       # BB#0:
+; SSE-NEXT:    addps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; ALL_AVX-LABEL: test_add_v4f32:
+; ALL_AVX:       # BB#0:
+; ALL_AVX-NEXT:    vaddps %xmm1, %xmm0, %xmm0
+; ALL_AVX-NEXT:    retq
+  %ret = fadd <4 x float>  %arg1, %arg2
+  ret <4 x float>  %ret
+}
+
+define <4 x float>  @test_sub_v4f32(<4 x float> %arg1, <4 x float>  %arg2) {
+; SSE-LABEL: test_sub_v4f32:
+; SSE:       # BB#0:
+; SSE-NEXT:    subps %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; ALL_AVX-LABEL: test_sub_v4f32:
+; ALL_AVX:       # BB#0:
+; ALL_AVX-NEXT:    vsubps %xmm1, %xmm0, %xmm0
+; ALL_AVX-NEXT:    retq
+  %ret = fsub <4 x float>  %arg1, %arg2
+  ret <4 x float>  %ret
+}
+
+define i32  @test_copy_float(float %val) {
+; SSE-LABEL: test_copy_float:
+; SSE:       # BB#0:
+; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    retq
+;
+; ALL_AVX-LABEL: test_copy_float:
+; ALL_AVX:       # BB#0:
+; ALL_AVX-NEXT:    vmovd %xmm0, %eax
+; ALL_AVX-NEXT:    retq
+  %r = bitcast float %val to i32
+  ret i32 %r
+}
+
+define float  @test_copy_i32(i32 %val) {
+; SSE-LABEL: test_copy_i32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movd %edi, %xmm0
+; SSE-NEXT:    retq
+;
+; ALL_AVX-LABEL: test_copy_i32:
+; ALL_AVX:       # BB#0:
+; ALL_AVX-NEXT:    vmovd %edi, %xmm0
+; ALL_AVX-NEXT:    retq
+  %r = bitcast i32 %val to float
+  ret float %r
+}
+
diff --git a/test/CodeGen/X86/GlobalISel/constant.ll b/test/CodeGen/X86/GlobalISel/constant.ll
new file mode 100644
index 0000000000000000000000000000000000000000..cab043a51f0526ce46607acddc4bf501cddd7013
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/constant.ll
@@ -0,0 +1,54 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+define i8 @const_i8() {
+; ALL-LABEL: const_i8:
+; ALL:       # BB#0:
+; ALL-NEXT:    movb $2, %al
+; ALL-NEXT:    retq
+  ret i8 2
+}
+
+define i16 @const_i16() {
+; ALL-LABEL: const_i16:
+; ALL:       # BB#0:
+; ALL-NEXT:    movw $3, %ax
+; ALL-NEXT:    retq
+  ret i16 3
+}
+
+define i32 @const_i32() {
+; ALL-LABEL: const_i32:
+; ALL:       # BB#0:
+; ALL-NEXT:    movl $4, %eax
+; ALL-NEXT:    retq
+  ret i32 4
+}
+
+define i64 @const_i64() {
+; ALL-LABEL: const_i64:
+; ALL:       # BB#0:
+; ALL-NEXT:    movabsq $68719476720, %rax # imm = 0xFFFFFFFF0
+; ALL-NEXT:    retq
+  ret i64 68719476720
+}
+
+; i64 value that fits into u32
+define i64 @const_i64_u32() {
+; ALL-LABEL: const_i64_u32:
+; ALL:       # BB#0:
+; ALL-NEXT:    movq $1879048192, %rax # imm = 0x70000000
+; ALL-NEXT:    retq
+  ret i64 1879048192
+}
+
+; i64 value that fits into i32
+define i64 @const_i64_i32() {
+; ALL-LABEL: const_i64_i32:
+; ALL:       # BB#0:
+; ALL-NEXT:    movq $-1, %rax
+; ALL-NEXT:    retq
+  ret i64 -1
+}
+
+
diff --git a/test/CodeGen/X86/GlobalISel/frameIndex-instructionselect.mir b/test/CodeGen/X86/GlobalISel/frameIndex-instructionselect.mir
new file mode 100644
index 0000000000000000000000000000000000000000..2fa9ac23a7afa4501eb5552a79dc8a652b58b698
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/frameIndex-instructionselect.mir
@@ -0,0 +1,36 @@
+# RUN: llc -mtriple=x86_64-linux-gnu    -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64
+# RUN: llc -mtriple=i386-linux-gnu      -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32
+# RUN: llc -mtriple=x86_64-linux-gnux32 -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32ABI
+
+--- |
+  define i32* @allocai32() {
+    %ptr1 = alloca i32
+    ret i32* %ptr1
+  }
+
+...
+---
+name:            allocai32
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK-LABEL: name:            allocai32
+# CHECK: registers:
+# X32:     - { id: 0, class: gr32 }
+# X32ABI:  - { id: 0, class: gr32 }
+# X64:     - { id: 0, class: gr64 }
+registers:
+  - { id: 0, class: gpr }
+stack:
+  - { id: 0, name: ptr1, offset: 0, size: 4, alignment: 4 }
+# The expected LEA opcode differs per target triple, so each RUN line uses its own prefix below.
+# X32:    %0 = LEA32r %stack.0.ptr1, 1, _, 0, _
+# X32ABI: %0 = LEA64_32r %stack.0.ptr1, 1, _, 0, _
+# X64:    %0 = LEA64r %stack.0.ptr1, 1, _, 0, _
+body:             |
+  bb.1 (%ir-block.0):
+    %0(p0) = G_FRAME_INDEX %stack.0.ptr1
+    %eax = COPY %0(p0)
+    RET 0, implicit %eax
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/frameIndex.ll b/test/CodeGen/X86/GlobalISel/frameIndex.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2bb11adcc3b562129aad1a2299b42d01c559ffcd
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/frameIndex.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu    -global-isel < %s -o - | FileCheck %s --check-prefix=X64
+; RUN: llc -mtriple=x86_64-linux-gnu                 < %s -o - | FileCheck %s --check-prefix=X64
+; RUN: llc -mtriple=i386-linux-gnu      -global-isel < %s -o - | FileCheck %s --check-prefix=X32
+; RUN: llc -mtriple=i386-linux-gnu                   < %s -o - | FileCheck %s --check-prefix=X32
+; RUN: llc -mtriple=x86_64-linux-gnux32 -global-isel < %s -o - | FileCheck %s --check-prefix=X32ABI
+; RUN: llc -mtriple=x86_64-linux-gnux32              < %s -o - | FileCheck %s --check-prefix=X32ABI
+
+define i32* @allocai32() {
+; X64-LABEL: allocai32:
+; X64:       # BB#0:
+; X64-NEXT:    leaq -4(%rsp), %rax
+; X64-NEXT:    retq
+;
+; X32-LABEL: allocai32:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:  .Lcfi0:
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:    movl %esp, %eax
+; X32-NEXT:    popl %ecx
+; X32-NEXT:    retl
+;
+; X32ABI-LABEL: allocai32:
+; X32ABI:       # BB#0:
+; X32ABI-NEXT:    leal -4(%rsp), %eax
+; X32ABI-NEXT:    retq
+  %ptr1 = alloca i32
+  ret i32* %ptr1
+}
diff --git a/test/CodeGen/X86/GlobalISel/irtranslator-call.ll b/test/CodeGen/X86/GlobalISel/irtranslator-call.ll
index 425d2609380e17d629bda31377f3cc79c228f3e2..c1bf444176660170035669d9573b3c97099da7ed 100644
--- a/test/CodeGen/X86/GlobalISel/irtranslator-call.ll
+++ b/test/CodeGen/X86/GlobalISel/irtranslator-call.ll
@@ -5,6 +5,7 @@ define void @test_void_return() {
 ; CHECK-LABEL: name:            test_void_return
 ; CHECK:      alignment:       4
 ; CHECK-NEXT: exposesReturnsTwice: false
+; CHECK-NEXT: noVRegs:         false
 ; CHECK-NEXT: legalized:       false
 ; CHECK-NEXT: regBankSelected: false
 ; CHECK-NEXT: selected:        false
diff --git a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll
index e2d1ce6fe3d4bcb5e9f8931382532064daa65047..616cb70652bb1a7a9bbfbbf0fe057dfdc8c999d4 100644
--- a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll
+++ b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv.ll
@@ -5,15 +5,15 @@
 @a7_8bit = external global i8
 @a8_8bit = external global i8
 
-define void @test_i8_args_8(i8 %arg1, i8 %arg2, i8 %arg3, i8 %arg4, 
-		            i8 %arg5, i8 %arg6, i8 %arg7, i8 %arg8) {
+define i8 @test_i8_args_8(i8 %arg1, i8 %arg2, i8 %arg3, i8 %arg4,
+		                      i8 %arg5, i8 %arg6, i8 %arg7, i8 %arg8) {
 
 ; ALL-LABEL: name:            test_i8_args_8
 
-; X64: fixedStack:      
+; X64: fixedStack:
 ; X64:  id: [[STACK8:[0-9]+]], offset: 8, size: 1, alignment: 8, isImmutable: true, isAliased: false
 ; X64:  id: [[STACK0:[0-9]+]], offset: 0, size: 1, alignment: 16, isImmutable: true, isAliased: false
-; X64: liveins: %ecx, %edi, %edx, %esi, %r8d, %r9d 
+; X64: liveins: %ecx, %edi, %edx, %esi, %r8d, %r9d
 ; X64:      [[ARG1:%[0-9]+]](s8) = COPY %edi
 ; X64-NEXT: %{{[0-9]+}}(s8) = COPY %esi
 ; X64-NEXT: %{{[0-9]+}}(s8) = COPY %edx
@@ -25,7 +25,7 @@ define void @test_i8_args_8(i8 %arg1, i8 %arg2, i8 %arg3, i8 %arg4,
 ; X64-NEXT: [[ARG8_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
 ; X64-NEXT: [[ARG8:%[0-9]+]](s8) = G_LOAD [[ARG8_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK8]], align 0)
 
-; X32: fixedStack:      
+; X32: fixedStack:
 ; X32:  id: [[STACK28:[0-9]+]], offset: 28, size: 1, alignment: 4, isImmutable: true, isAliased: false }
 ; X32:  id: [[STACK24:[0-9]+]], offset: 24, size: 1, alignment: 8, isImmutable: true, isAliased: false }
 ; X32:  id: [[STACK20:[0-9]+]], offset: 20, size: 1, alignment: 4, isImmutable: true, isAliased: false }
@@ -40,7 +40,7 @@ define void @test_i8_args_8(i8 %arg1, i8 %arg2, i8 %arg3, i8 %arg4,
 ; X32-NEXT:  [[ARG2:%[0-9]+]](s8) = G_LOAD [[ARG2_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK4]], align 0)
 ; X32-NEXT:  [[ARG3_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
 ; X32-NEXT:  [[ARG3:%[0-9]+]](s8) = G_LOAD [[ARG3_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK8]], align 0)
-; X32-NEXT:  [[ARG4_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK12]]    
+; X32-NEXT:  [[ARG4_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK12]]
 ; X32-NEXT:  [[ARG4:%[0-9]+]](s8) = G_LOAD [[ARG4_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK12]], align 0)
 ; X32-NEXT:  [[ARG5_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK16]]
 ; X32-NEXT:  [[ARG5:%[0-9]+]](s8) = G_LOAD [[ARG5_ADDR]](p0) :: (invariant load 1 from %fixed-stack.[[STACK16]], align 0)
@@ -53,30 +53,33 @@ define void @test_i8_args_8(i8 %arg1, i8 %arg2, i8 %arg3, i8 %arg4,
 
 ; ALL-NEXT:  [[GADDR_A1:%[0-9]+]](p0) = G_GLOBAL_VALUE @a1_8bit
 ; ALL-NEXT:  [[GADDR_A7:%[0-9]+]](p0) = G_GLOBAL_VALUE @a7_8bit
-; ALL-NEXT:  [[GADDR_A8:%[0-9]+]](p0) = G_GLOBAL_VALUE @a8_8bit	
+; ALL-NEXT:  [[GADDR_A8:%[0-9]+]](p0) = G_GLOBAL_VALUE @a8_8bit
 ; ALL-NEXT:  G_STORE [[ARG1]](s8), [[GADDR_A1]](p0) :: (store 1 into @a1_8bit)
 ; ALL-NEXT:  G_STORE [[ARG7]](s8), [[GADDR_A7]](p0) :: (store 1 into @a7_8bit)
 ; ALL-NEXT:  G_STORE [[ARG8]](s8), [[GADDR_A8]](p0) :: (store 1 into @a8_8bit)
+; ALL-NEXT:  %al = COPY [[ARG1]](s8)
+; ALL-NEXT:  RET 0, implicit %al
+
 entry:
   store i8 %arg1, i8* @a1_8bit
   store i8 %arg7, i8* @a7_8bit
   store i8 %arg8, i8* @a8_8bit
-  ret void
+  ret i8 %arg1
 }
 
 @a1_32bit = external global i32
 @a7_32bit = external global i32
 @a8_32bit = external global i32
 
-define void @test_i32_args_8(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, 
-		            i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8) {
+define i32 @test_i32_args_8(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4,
+		                        i32 %arg5, i32 %arg6, i32 %arg7, i32 %arg8) {
 
 ; ALL-LABEL: name:            test_i32_args_8
 
-; X64: fixedStack:      
+; X64: fixedStack:
 ; X64:  id: [[STACK8:[0-9]+]], offset: 8, size: 4, alignment: 8, isImmutable: true, isAliased: false
 ; X64:  id: [[STACK0:[0-9]+]], offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false
-; X64: liveins: %ecx, %edi, %edx, %esi, %r8d, %r9d 
+; X64: liveins: %ecx, %edi, %edx, %esi, %r8d, %r9d
 ; X64:      [[ARG1:%[0-9]+]](s32) = COPY %edi
 ; X64-NEXT: %{{[0-9]+}}(s32) = COPY %esi
 ; X64-NEXT: %{{[0-9]+}}(s32) = COPY %edx
@@ -88,7 +91,7 @@ define void @test_i32_args_8(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4,
 ; X64-NEXT: [[ARG8_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
 ; X64-NEXT: [[ARG8:%[0-9]+]](s32) = G_LOAD [[ARG8_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK8]], align 0)
 
-; X32: fixedStack:      
+; X32: fixedStack:
 ; X32:  id: [[STACK28:[0-9]+]], offset: 28, size: 4, alignment: 4, isImmutable: true, isAliased: false }
 ; X32:  id: [[STACK24:[0-9]+]], offset: 24, size: 4, alignment: 8, isImmutable: true, isAliased: false }
 ; X32:  id: [[STACK20:[0-9]+]], offset: 20, size: 4, alignment: 4, isImmutable: true, isAliased: false }
@@ -99,30 +102,209 @@ define void @test_i32_args_8(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4,
 ; X32:  id: [[STACK0:[0-9]+]],  offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false }
 ; X32:       [[ARG1_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
 ; X32-NEXT:  [[ARG1:%[0-9]+]](s32) = G_LOAD [[ARG1_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK0]], align 0)
-; X32-NEXT:  [[ARG2_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK4]]  
+; X32-NEXT:  [[ARG2_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK4]]
 ; X32-NEXT:  [[ARG2:%[0-9]+]](s32) = G_LOAD [[ARG2_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK4]], align 0)
-; X32-NEXT:  [[ARG3_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]  
+; X32-NEXT:  [[ARG3_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
 ; X32-NEXT:  [[ARG3:%[0-9]+]](s32) = G_LOAD [[ARG3_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK8]], align 0)
-; X32-NEXT:  [[ARG4_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK12]]   
+; X32-NEXT:  [[ARG4_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK12]]
 ; X32-NEXT:  [[ARG4:%[0-9]+]](s32) = G_LOAD [[ARG4_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK12]], align 0)
-; X32-NEXT:  [[ARG5_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK16]] 
+; X32-NEXT:  [[ARG5_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK16]]
 ; X32-NEXT:  [[ARG5:%[0-9]+]](s32) = G_LOAD [[ARG5_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK16]], align 0)
-; X32-NEXT:  [[ARG6_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK20]] 
+; X32-NEXT:  [[ARG6_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK20]]
 ; X32-NEXT:  [[ARG6:%[0-9]+]](s32) = G_LOAD [[ARG6_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK20]], align 0)
-; X32-NEXT:  [[ARG7_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK24]] 
+; X32-NEXT:  [[ARG7_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK24]]
 ; X32-NEXT:  [[ARG7:%[0-9]+]](s32) = G_LOAD [[ARG7_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK24]], align 0)
-; X32-NEXT:  [[ARG8_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK28]] 
+; X32-NEXT:  [[ARG8_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK28]]
 ; X32-NEXT:  [[ARG8:%[0-9]+]](s32) = G_LOAD [[ARG8_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK28]], align 0)
 
 ; ALL-NEXT:  [[GADDR_A1:%[0-9]+]](p0) = G_GLOBAL_VALUE @a1_32bit
 ; ALL-NEXT:  [[GADDR_A7:%[0-9]+]](p0) = G_GLOBAL_VALUE @a7_32bit
-; ALL-NEXT:  [[GADDR_A8:%[0-9]+]](p0) = G_GLOBAL_VALUE @a8_32bit	
+; ALL-NEXT:  [[GADDR_A8:%[0-9]+]](p0) = G_GLOBAL_VALUE @a8_32bit
 ; ALL-NEXT:  G_STORE [[ARG1]](s32), [[GADDR_A1]](p0) :: (store 4 into @a1_32bit)
 ; ALL-NEXT:  G_STORE [[ARG7]](s32), [[GADDR_A7]](p0) :: (store 4 into @a7_32bit)
 ; ALL-NEXT:  G_STORE [[ARG8]](s32), [[GADDR_A8]](p0) :: (store 4 into @a8_32bit)
+; ALL-NEXT:  %eax = COPY [[ARG1]](s32)
+; ALL-NEXT:  RET 0, implicit %eax
+
 entry:
   store i32 %arg1, i32* @a1_32bit
   store i32 %arg7, i32* @a7_32bit
-  store i32 %arg8, i32* @a8_32bit 
-  ret void
+  store i32 %arg8, i32* @a8_32bit
+  ret i32 %arg1
 }
+
+@a1_64bit = external global i64
+@a7_64bit = external global i64
+@a8_64bit = external global i64
+
+define i64 @test_i64_args_8(i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4,
+                            i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8) {
+; Eight i64 arguments; the checks cover argument lowering, the three stores and the i64 return.
+; ALL-LABEL: name:            test_i64_args_8
+; X64: fixedStack:
+; X64:  id: [[STACK8:[0-9]+]], offset: 8, size: 8, alignment: 8, isImmutable: true, isAliased: false
+; X64:  id: [[STACK0:[0-9]+]], offset: 0, size: 8, alignment: 16, isImmutable: true, isAliased: false
+; X64: liveins: %rcx, %rdi, %rdx, %rsi, %r8, %r9
+; X64:      [[ARG1:%[0-9]+]](s64) = COPY %rdi
+; X64-NEXT: %{{[0-9]+}}(s64) = COPY %rsi
+; X64-NEXT: %{{[0-9]+}}(s64) = COPY %rdx
+; X64-NEXT: %{{[0-9]+}}(s64) = COPY %rcx
+; X64-NEXT: %{{[0-9]+}}(s64) = COPY %r8
+; X64-NEXT: %{{[0-9]+}}(s64) = COPY %r9
+; X64-NEXT: [[ARG7_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
+; X64-NEXT: [[ARG7:%[0-9]+]](s64) = G_LOAD [[ARG7_ADDR]](p0) :: (invariant load 8 from %fixed-stack.[[STACK0]], align 0)
+; X64-NEXT: [[ARG8_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
+; X64-NEXT: [[ARG8:%[0-9]+]](s64) = G_LOAD [[ARG8_ADDR]](p0) :: (invariant load 8 from %fixed-stack.[[STACK8]], align 0)
+
+; X32: fixedStack:
+; X32:  id: [[STACK60:[0-9]+]], offset: 60, size: 4, alignment: 4, isImmutable: true, isAliased: false }
+; X32:  id: [[STACK56:[0-9]+]], offset: 56, size: 4, alignment: 8, isImmutable: true, isAliased: false }
+; X32:  id: [[STACK52:[0-9]+]], offset: 52, size: 4, alignment: 4, isImmutable: true, isAliased: false }
+; X32:  id: [[STACK48:[0-9]+]], offset: 48, size: 4, alignment: 16, isImmutable: true, isAliased: false }
+; X32:  id: [[STACK44:[0-9]+]], offset: 44, size: 4, alignment: 4, isImmutable: true, isAliased: false }
+; X32:  id: [[STACK40:[0-9]+]], offset: 40, size: 4, alignment: 8, isImmutable: true, isAliased: false }
+; X32:  id: [[STACK36:[0-9]+]], offset: 36, size: 4, alignment: 4, isImmutable: true, isAliased: false }
+; X32:  id: [[STACK32:[0-9]+]], offset: 32, size: 4, alignment: 16, isImmutable: true, isAliased: false }
+; X32:  id: [[STACK28:[0-9]+]], offset: 28, size: 4, alignment: 4, isImmutable: true, isAliased: false }
+; X32:  id: [[STACK24:[0-9]+]], offset: 24, size: 4, alignment: 8, isImmutable: true, isAliased: false }
+; X32:  id: [[STACK20:[0-9]+]], offset: 20, size: 4, alignment: 4, isImmutable: true, isAliased: false }
+; X32:  id: [[STACK16:[0-9]+]], offset: 16, size: 4, alignment: 16, isImmutable: true, isAliased: false }
+; X32:  id: [[STACK12:[0-9]+]], offset: 12, size: 4, alignment: 4, isImmutable: true, isAliased: false }
+; X32:  id: [[STACK8:[0-9]+]], offset: 8, size: 4, alignment: 8, isImmutable: true, isAliased: false }
+; X32:  id: [[STACK4:[0-9]+]], offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false }
+; X32:  id: [[STACK0:[0-9]+]], offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false }
+
+; X32:      [[ARG1L_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
+; X32-NEXT: [[ARG1L:%[0-9]+]](s32) = G_LOAD [[ARG1L_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK0]], align 0)
+; X32-NEXT: [[ARG1H_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK4]]
+; X32-NEXT: [[ARG1H:%[0-9]+]](s32) = G_LOAD [[ARG1H_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK4]], align 0)
+; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
+; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK8]], align 0)
+; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK12]]
+; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK12]], align 0)
+; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK16]]
+; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK16]], align 0)
+; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK20]]
+; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK20]], align 0)
+; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK24]]
+; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK24]], align 0)
+; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK28]]
+; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK28]], align 0)
+; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK32]]
+; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK32]], align 0)
+; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK36]]
+; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK36]], align 0)
+; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK40]]
+; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK40]], align 0)
+; X32-NEXT: %{{[0-9]+}}(p0) = G_FRAME_INDEX %fixed-stack.[[STACK44]]
+; X32-NEXT: %{{[0-9]+}}(s32) = G_LOAD %{{[0-9]+}}(p0) :: (invariant load 4 from %fixed-stack.[[STACK44]], align 0)
+; X32-NEXT: [[ARG7L_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK48]]
+; X32-NEXT: [[ARG7L:%[0-9]+]](s32) = G_LOAD [[ARG7L_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK48]], align 0)
+; X32-NEXT: [[ARG7H_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK52]]
+; X32-NEXT: [[ARG7H:%[0-9]+]](s32) = G_LOAD [[ARG7H_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK52]], align 0)
+; X32-NEXT: [[ARG8L_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK56]]
+; X32-NEXT: [[ARG8L:%[0-9]+]](s32) = G_LOAD [[ARG8L_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK56]], align 0)
+; X32-NEXT: [[ARG8H_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK60]]
+; X32-NEXT: [[ARG8H:%[0-9]+]](s32) = G_LOAD [[ARG8H_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK60]], align 0)
+
+; X32-NEXT: [[UNDEF:%[0-9]+]](s64) = IMPLICIT_DEF
+; X32-NEXT: [[ARG1_TMP0:%[0-9]+]](s64) = G_INSERT [[UNDEF]], [[ARG1L]](s32), 0
+; X32-NEXT: [[ARG1_TMP1:%[0-9]+]](s64) = G_INSERT [[ARG1_TMP0]], [[ARG1H]](s32), 32
+; X32-NEXT: [[ARG1:%[0-9]+]](s64) = COPY [[ARG1_TMP1]]
+  ; ... a bunch more that we don't track ...
+  ; X32: IMPLICIT_DEF
+  ; X32: IMPLICIT_DEF
+  ; X32: IMPLICIT_DEF
+  ; X32: IMPLICIT_DEF
+  ; X32: IMPLICIT_DEF
+; X32: [[UNDEF:%[0-9]+]](s64) = IMPLICIT_DEF
+; X32-NEXT: [[ARG7_TMP0:%[0-9]+]](s64) = G_INSERT [[UNDEF]], [[ARG7L]](s32), 0
+; X32-NEXT: [[ARG7_TMP1:%[0-9]+]](s64) = G_INSERT [[ARG7_TMP0]], [[ARG7H]](s32), 32
+; X32-NEXT: [[ARG7:%[0-9]+]](s64) = COPY [[ARG7_TMP1]]
+; X32-NEXT: [[UNDEF:%[0-9]+]](s64) = IMPLICIT_DEF
+; X32-NEXT: [[ARG8_TMP0:%[0-9]+]](s64) = G_INSERT [[UNDEF]], [[ARG8L]](s32), 0
+; X32-NEXT: [[ARG8_TMP1:%[0-9]+]](s64) = G_INSERT [[ARG8_TMP0]], [[ARG8H]](s32), 32
+; X32-NEXT: [[ARG8:%[0-9]+]](s64) = COPY [[ARG8_TMP1]]
+
+; ALL-NEXT: [[GADDR_A1:%[0-9]+]](p0) = G_GLOBAL_VALUE @a1_64bit
+; ALL-NEXT: [[GADDR_A7:%[0-9]+]](p0) = G_GLOBAL_VALUE @a7_64bit
+; ALL-NEXT: [[GADDR_A8:%[0-9]+]](p0) = G_GLOBAL_VALUE @a8_64bit
+; ALL-NEXT: G_STORE [[ARG1]](s64), [[GADDR_A1]](p0) :: (store 8 into @a1_64bit)
+; ALL-NEXT: G_STORE [[ARG7]](s64), [[GADDR_A7]](p0) :: (store 8 into @a7_64bit)
+; ALL-NEXT: G_STORE [[ARG8]](s64), [[GADDR_A8]](p0) :: (store 8 into @a8_64bit)
+
+; X64-NEXT: %rax = COPY [[ARG1]](s64)
+; X64-NEXT: RET 0, implicit %rax
+
+; X32-NEXT: [[RETL:%[0-9]+]](s32) = G_EXTRACT [[ARG1]](s64), 0
+; X32-NEXT: [[RETH:%[0-9]+]](s32) = G_EXTRACT [[ARG1]](s64), 32
+; X32-NEXT: %eax = COPY [[RETL]](s32)
+; X32-NEXT: %edx = COPY [[RETH]](s32)
+; X32-NEXT: RET 0, implicit %eax, implicit %edx
+
+entry:
+  store i64 %arg1, i64* @a1_64bit
+  store i64 %arg7, i64* @a7_64bit
+  store i64 %arg8, i64* @a8_64bit
+  ret i64 %arg1
+}
+
+define float @test_float_args(float %arg1, float %arg2) {
+; ALL-LABEL:name:            test_float_args
+; Returns the second float argument, so the return copy must reference that argument's value.
+; X64: liveins: %xmm0, %xmm1
+; X64:      [[ARG1:%[0-9]+]](s32) = COPY %xmm0
+; X64-NEXT: [[ARG2:%[0-9]+]](s32) = COPY %xmm1
+; X64-NEXT: %xmm0 = COPY [[ARG2]](s32)
+; X64-NEXT: RET 0, implicit %xmm0
+
+; X32: fixedStack:
+; X32:  id: [[STACK4:[0-9]+]], offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false }
+; X32:  id: [[STACK0:[0-9]+]], offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false }
+; X32:       [[ARG1_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
+; X32-NEXT:  [[ARG1:%[0-9]+]](s32) = G_LOAD [[ARG1_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK0]], align 0)
+; X32-NEXT:  [[ARG2_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK4]]
+; X32-NEXT:  [[ARG2:%[0-9]+]](s32) = G_LOAD [[ARG2_ADDR]](p0) :: (invariant load 4 from %fixed-stack.[[STACK4]], align 0)
+; X32-NEXT:  %fp0 = COPY [[ARG2]](s32)
+; X32-NEXT:  RET 0, implicit %fp0
+
+  ret float %arg2
+}
+
+define double @test_double_args(double %arg1, double %arg2) {
+; ALL-LABEL:name:            test_double_args
+; X64: liveins: %xmm0, %xmm1
+; X64:     [[ARG1:%[0-9]+]](s64) = COPY %xmm0
+; X64-NEXT: [[ARG2:%[0-9]+]](s64) = COPY %xmm1
+; X64-NEXT: %xmm0 = COPY [[ARG2]](s64)
+; X64-NEXT: RET 0, implicit %xmm0
+; On 32-bit x86 both doubles are loaded from fixed stack slots and the result is returned in fp0.
+; X32: fixedStack:
+; X32:  id: [[STACK8:[0-9]+]], offset: 8, size: 8, alignment: 8, isImmutable: true, isAliased: false }
+; X32:  id: [[STACK0:[0-9]+]], offset: 0, size: 8, alignment: 16, isImmutable: true, isAliased: false }
+; X32:       [[ARG1_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
+; X32-NEXT:  [[ARG1:%[0-9]+]](s64) = G_LOAD [[ARG1_ADDR]](p0) :: (invariant load 8 from %fixed-stack.[[STACK0]], align 0)
+; X32-NEXT:  [[ARG2_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
+; X32-NEXT:  [[ARG2:%[0-9]+]](s64) = G_LOAD [[ARG2_ADDR]](p0) :: (invariant load 8 from %fixed-stack.[[STACK8]], align 0)
+; X32-NEXT:  %fp0 = COPY [[ARG2]](s64)
+; X32-NEXT:  RET 0, implicit %fp0
+
+  ret double %arg2
+}
+
+define i32 * @test_memop_i32(i32 * %p1) {
+; ALL-LABEL:name:            test_memop_i32
+;X64:       liveins: %rdi
+;X64:       %0(p0) = COPY %rdi
+;X64-NEXT:  %rax = COPY %0(p0)
+;X64-NEXT:  RET 0, implicit %rax
+; On 32-bit x86 the pointer argument is loaded from a fixed stack slot instead of a register.
+;X32: fixedStack:
+;X32:  id: [[STACK0:[0-9]+]], offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false }
+;X32:         %1(p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
+;X32-NEXT:    %0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.[[STACK0]], align 0)
+;X32-NEXT:    %eax = COPY %0(p0)
+;X32-NEXT:    RET 0, implicit %eax
+
+  ret i32 * %p1;
+}
\ No newline at end of file
diff --git a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv_64bit.ll b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv_64bit.ll
index d9e9615a69404ac7ec54e9d5a830e7ace83566da..e2d938550aea0ffa2fdd2407b957968ce6a907f9 100644
--- a/test/CodeGen/X86/GlobalISel/irtranslator-callingconv_64bit.ll
+++ b/test/CodeGen/X86/GlobalISel/irtranslator-callingconv_64bit.ll
@@ -1,35 +1,29 @@
-; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -stop-after=irtranslator < %s -o - | FileCheck %s --check-prefix=X64
+; RUN: llc -mtriple=x86_64-linux-gnu -global-isel -stop-after=irtranslator < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
 
-@a1_64bit = external global i64
-@a7_64bit = external global i64
-@a8_64bit = external global i64
+define <4 x i32> @test_v4i32_args(<4 x i32> %arg1, <4 x i32> %arg2) { ; returns the second vector argument
+; X64-LABEL: name:            test_v4i32_args
+; X64: liveins: %xmm0, %xmm1
+; X64:      [[ARG1:%[0-9]+]](<4 x s32>) = COPY %xmm0
+; X64-NEXT: [[ARG2:%[0-9]+]](<4 x s32>) = COPY %xmm1
+; X64-NEXT: %xmm0 = COPY [[ARG2]](<4 x s32>)
+; X64-NEXT: RET 0, implicit %xmm0
+  ret <4 x i32> %arg2
+}
+
+define <8 x i32> @test_v8i32_args(<8 x i32> %arg1) { ; the 256-bit value is checked as two 128-bit halves in xmm0/xmm1
+; X64-LABEL: name:            test_v8i32_args
+; X64: liveins: %xmm0, %xmm1
+; X64:      [[ARG1L:%[0-9]+]](<4 x s32>) = COPY %xmm0
+; X64-NEXT: [[ARG1H:%[0-9]+]](<4 x s32>) = COPY %xmm1
+; X64-NEXT: [[UNDEF:%[0-9]+]](<8 x s32>) = IMPLICIT_DEF
+; X64-NEXT: [[ARG1_TMP0:%[0-9]+]](<8 x s32>) = G_INSERT [[UNDEF]], [[ARG1L]](<4 x s32>), 0
+; X64-NEXT: [[ARG1_TMP1:%[0-9]+]](<8 x s32>) = G_INSERT [[ARG1_TMP0]], [[ARG1H]](<4 x s32>), 128
+; X64-NEXT: [[ARG1:%[0-9]+]](<8 x s32>) = COPY [[ARG1_TMP1]]
+; X64-NEXT: [[RETL:%[0-9]+]](<4 x s32>) = G_EXTRACT [[ARG1]](<8 x s32>), 0
+; X64-NEXT: [[RETH:%[0-9]+]](<4 x s32>) = G_EXTRACT [[ARG1]](<8 x s32>), 128
+; X64-NEXT: %xmm0 = COPY [[RETL]](<4 x s32>)
+; X64-NEXT: %xmm1 = COPY [[RETH]](<4 x s32>)
+; X64-NEXT: RET 0, implicit %xmm0, implicit %xmm1
 
-define void @test_i64_args_8(i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, 
-		            i64 %arg5, i64 %arg6, i64 %arg7, i64 %arg8) {
-; X64-LABEL: name:            test_i64_args_8
-; X64: fixedStack:      
-; X64:  id: [[STACK8:[0-9]+]], offset: 8, size: 8, alignment: 8, isImmutable: true, isAliased: false
-; X64:  id: [[STACK0:[0-9]+]], offset: 0, size: 8, alignment: 16, isImmutable: true, isAliased: false
-; X64: liveins: %rcx, %rdi, %rdx, %rsi, %r8, %r9
-; X64:      [[ARG1:%[0-9]+]](s64) = COPY %rdi
-; X64-NEXT: %{{[0-9]+}}(s64) = COPY %rsi
-; X64-NEXT: %{{[0-9]+}}(s64) = COPY %rdx
-; X64-NEXT: %{{[0-9]+}}(s64) = COPY %rcx
-; X64-NEXT: %{{[0-9]+}}(s64) = COPY %r8
-; X64-NEXT: %{{[0-9]+}}(s64) = COPY %r9
-; X64-NEXT: [[ARG7_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK0]]
-; X64-NEXT: [[ARG7:%[0-9]+]](s64) = G_LOAD [[ARG7_ADDR]](p0) :: (invariant load 8 from %fixed-stack.[[STACK0]], align 0)
-; X64-NEXT: [[ARG8_ADDR:%[0-9]+]](p0) = G_FRAME_INDEX %fixed-stack.[[STACK8]]
-; X64-NEXT: [[ARG8:%[0-9]+]](s64) = G_LOAD [[ARG8_ADDR]](p0) :: (invariant load 8 from %fixed-stack.[[STACK8]], align 0)
-; X64-NEXT: [[GADDR_A1:%[0-9]+]](p0) = G_GLOBAL_VALUE @a1_64bit
-; X64-NEXT: [[GADDR_A7:%[0-9]+]](p0) = G_GLOBAL_VALUE @a7_64bit
-; X64-NEXT: [[GADDR_A8:%[0-9]+]](p0) = G_GLOBAL_VALUE @a8_64bit
-; X64-NEXT: G_STORE [[ARG1]](s64), [[GADDR_A1]](p0) :: (store 8 into @a1_64bit)
-; X64-NEXT: G_STORE [[ARG7]](s64), [[GADDR_A7]](p0) :: (store 8 into @a7_64bit)
-; X64-NEXT: G_STORE [[ARG8]](s64), [[GADDR_A8]](p0) :: (store 8 into @a8_64bit)
-entry:
-  store i64 %arg1, i64* @a1_64bit
-  store i64 %arg7, i64* @a7_64bit
-  store i64 %arg8, i64* @a8_64bit 
-  ret void
+  ret <8 x i32> %arg1
 }
diff --git a/test/CodeGen/X86/GlobalISel/legalize-add.mir b/test/CodeGen/X86/GlobalISel/legalize-add.mir
new file mode 100644
index 0000000000000000000000000000000000000000..22619cc71033bf5ef8aff9f068cff562f9945404
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/legalize-add.mir
@@ -0,0 +1,40 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s
+
+--- |
+  ; ModuleID = '<stdin>'
+  source_filename = "<stdin>"
+  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+  target triple = "x86_64--linux-gnu"
+
+  define i32 @test_add_i32(i32 %arg1, i32 %arg2) {
+    %ret = add i32 %arg1, %arg2
+    ret i32 %ret
+  }
+
+...
+---
+name:            test_add_i32
+alignment:       4
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %edi, %esi
+    ;  CHECK-LABEL: name: test_add_i32
+    ;  CHECK: [[VAL1:%.*]](s32) = COPY %edi
+    ;  CHECK: [[VAL2:%.*]](s32) = COPY %esi
+    ;  CHECK: [[RES:%.*]](s32) = G_ADD [[VAL1]], [[VAL2]]
+
+    %0(s32) = COPY %edi
+    %1(s32) = COPY %esi
+    %2(s32) = G_ADD %0, %1
+    %eax = COPY %2(s32)
+    RET 0, implicit %eax
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/legalize-const.mir b/test/CodeGen/X86/GlobalISel/legalize-const.mir
new file mode 100644
index 0000000000000000000000000000000000000000..612d33a77fc96acde2fba65d1d4921afa4777f11
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/legalize-const.mir
@@ -0,0 +1,43 @@
+# RUN: llc -mtriple=i386-linux-gnu   -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64
+
+--- | 
+  define void @constInt_check() {
+    ret void
+  }
+
+...
+---
+name:            constInt_check
+# ALL-LABEL: name:            constInt_check
+registers:       
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+  - { id: 3, class: _ }
+  - { id: 4, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    ; ALL: %5(s8) = G_CONSTANT i8 -1
+    ; ALL: %0(s1) = G_TRUNC %5(s8)
+    %0(s1) = G_CONSTANT i1 1
+
+    ; ALL: %1(s8) = G_CONSTANT i8 8    
+    %1(s8) = G_CONSTANT i8 8   
+
+    ; ALL: %2(s16) = G_CONSTANT i16 16
+    %2(s16) = G_CONSTANT i16 16
+
+    ; ALL: %3(s32) = G_CONSTANT i32 32
+    %3(s32) = G_CONSTANT i32 32
+
+    ; X64: %4(s64) = G_CONSTANT i64 64
+    
+    ; X32: %6(s32) = G_CONSTANT i32 64
+    ; X32: %7(s32) = G_CONSTANT i32 0
+    ; X32: %4(s64) = G_MERGE_VALUES %6(s32), %7(s32) 
+    %4(s64) = G_CONSTANT i64 64
+
+    RET 0
+...
+
diff --git a/test/CodeGen/X86/GlobalISel/legalize-sub.mir b/test/CodeGen/X86/GlobalISel/legalize-sub.mir
new file mode 100644
index 0000000000000000000000000000000000000000..26ef285929a689fdf2e795549cf7bdf6db27e8e4
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/legalize-sub.mir
@@ -0,0 +1,40 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s
+
+--- |
+  ; ModuleID = '<stdin>'
+  source_filename = "<stdin>"
+  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+  target triple = "x86_64--linux-gnu"
+
+  define i32 @test_sub_i32(i32 %arg1, i32 %arg2) {
+    %ret = sub i32 %arg1, %arg2
+    ret i32 %ret
+  }
+
+...
+---
+name:            test_sub_i32
+alignment:       4
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: _ }
+  - { id: 1, class: _ }
+  - { id: 2, class: _ }
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %edi, %esi
+    ;  CHECK-LABEL: name: test_sub_i32
+    ;  CHECK: [[VAL1:%.*]](s32) = COPY %edi
+    ;  CHECK: [[VAL2:%.*]](s32) = COPY %esi
+    ;  CHECK: [[RES:%.*]](s32) = G_SUB [[VAL1]], [[VAL2]]
+
+    %0(s32) = COPY %edi
+    %1(s32) = COPY %esi
+    %2(s32) = G_SUB %0, %1
+    %eax = COPY %2(s32)
+    RET 0, implicit %eax
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/memop-isel.ll b/test/CodeGen/X86/GlobalISel/memop-isel.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6fe66436e4a8a38832cc87ab9fd896393b3ed53f
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/memop-isel.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-linux-gnu                 			                                  -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_FAST
+; RUN: llc -mtriple=x86_64-linux-gnu                                  -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_GREEDY
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx                                            -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST   --check-prefix=AVX_FAST
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx                      -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX_GREEDY
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f 	                                      -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST   --check-prefix=AVX512F_FAST
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f                  -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX512F_GREEDY
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl			                  -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST   --check-prefix=AVX512VL_FAST
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX512VL_GREEDY
+
+
+define i8 @test_load_i8(i8 * %p1) {
+; ALL-LABEL: test_load_i8:
+; ALL:       # BB#0:
+; ALL-NEXT:    movb (%rdi), %al
+; ALL-NEXT:    retq
+  %r = load i8, i8* %p1
+  ret i8 %r
+}
+
+define i16 @test_load_i16(i16 * %p1) {
+; ALL-LABEL: test_load_i16:
+; ALL:       # BB#0:
+; ALL-NEXT:    movzwl (%rdi), %eax
+; ALL-NEXT:    retq
+  %r = load i16, i16* %p1
+  ret i16 %r
+}
+
+define i32 @test_load_i32(i32 * %p1) {
+; ALL-LABEL: test_load_i32:
+; ALL:       # BB#0:
+; ALL-NEXT:    movl (%rdi), %eax
+; ALL-NEXT:    retq
+  %r = load i32, i32* %p1
+  ret i32 %r
+}
+
+define i64 @test_load_i64(i64 * %p1) {
+; ALL-LABEL: test_load_i64:
+; ALL:       # BB#0:
+; ALL-NEXT:    movq (%rdi), %rax
+; ALL-NEXT:    retq
+  %r = load i64, i64* %p1
+  ret i64 %r
+}
+
+define float @test_load_float(float * %p1) {
+; SSE-LABEL: test_load_float:
+; SSE:       # BB#0:
+; SSE-NEXT:    movl (%rdi), %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    retq
+;
+; ALL_AVX-LABEL: test_load_float:
+; ALL_AVX:       # BB#0:
+; ALL_AVX-NEXT:    movl (%rdi), %eax
+; ALL_AVX-NEXT:    vmovd %eax, %xmm0
+; ALL_AVX-NEXT:    retq
+  %r = load float, float* %p1
+  ret float %r
+}
+
+define double @test_load_double(double * %p1) {
+; SSE-LABEL: test_load_double:
+; SSE:       # BB#0:
+; SSE-NEXT:    movq (%rdi), %rax
+; SSE-NEXT:    movd %rax, %xmm0
+; SSE-NEXT:    retq
+;
+; ALL_AVX-LABEL: test_load_double:
+; ALL_AVX:       # BB#0:
+; ALL_AVX-NEXT:    movq (%rdi), %rax
+; ALL_AVX-NEXT:    vmovq %rax, %xmm0
+; ALL_AVX-NEXT:    retq
+  %r = load double, double* %p1
+  ret double %r
+}
+
+define <4 x i32> @test_load_v4i32_noalign(<4 x i32> * %p1) {
+; SSE-LABEL: test_load_v4i32_noalign:
+; SSE:       # BB#0:
+; SSE-NEXT:    movups (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; ALL_AVX-LABEL: test_load_v4i32_noalign:
+; ALL_AVX:       # BB#0:
+; ALL_AVX-NEXT:    vmovups (%rdi), %xmm0
+; ALL_AVX-NEXT:    retq
+  %r = load <4 x i32>, <4 x i32>* %p1, align 1
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @test_load_v4i32_align(<4 x i32> * %p1) {
+; SSE-LABEL: test_load_v4i32_align:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; ALL_AVX-LABEL: test_load_v4i32_align:
+; ALL_AVX:       # BB#0:
+; ALL_AVX-NEXT:    vmovaps (%rdi), %xmm0
+; ALL_AVX-NEXT:    retq
+  %r = load <4 x i32>, <4 x i32>* %p1, align 16
+  ret <4 x i32> %r
+}
+
+define i32 * @test_store_i32(i32 %val, i32 * %p1) {
+; ALL-LABEL: test_store_i32:
+; ALL:       # BB#0:
+; ALL-NEXT:    movl %edi, (%rsi)
+; ALL-NEXT:    movq %rsi, %rax
+; ALL-NEXT:    retq
+  store i32 %val, i32* %p1
+  ret i32 * %p1;
+}
+
+define i64 * @test_store_i64(i64 %val, i64 * %p1) {
+; ALL-LABEL: test_store_i64:
+; ALL:       # BB#0:
+; ALL-NEXT:    movq %rdi, (%rsi)
+; ALL-NEXT:    movq %rsi, %rax
+; ALL-NEXT:    retq
+  store i64 %val, i64* %p1
+  ret i64 * %p1;
+}
+
+define float * @test_store_float(float %val, float * %p1) {
+;
+; SSE_FAST-LABEL: test_store_float:
+; SSE_FAST:       # BB#0:
+; SSE_FAST-NEXT:    movd %xmm0, %eax
+; SSE_FAST-NEXT:    movl %eax, (%rdi)
+; SSE_FAST-NEXT:    movq %rdi, %rax
+; SSE_FAST-NEXT:    retq
+;
+; SSE_GREEDY-LABEL: test_store_float:
+; SSE_GREEDY:       # BB#0:
+; SSE_GREEDY-NEXT:    movss %xmm0, (%rdi)
+; SSE_GREEDY-NEXT:    movq %rdi, %rax
+; SSE_GREEDY-NEXT:    retq
+;
+; ALL_AVX_FAST-LABEL: test_store_float:
+; ALL_AVX_FAST:       # BB#0:
+; ALL_AVX_FAST-NEXT:    vmovd %xmm0, %eax
+; ALL_AVX_FAST-NEXT:    movl %eax, (%rdi)
+; ALL_AVX_FAST-NEXT:    movq %rdi, %rax
+; ALL_AVX_FAST-NEXT:    retq
+;
+; ALL_AVX_GREEDY-LABEL: test_store_float:
+; ALL_AVX_GREEDY:       # BB#0:
+; ALL_AVX_GREEDY-NEXT:    vmovss %xmm0, (%rdi)
+; ALL_AVX_GREEDY-NEXT:    movq %rdi, %rax
+; ALL_AVX_GREEDY-NEXT:    retq
+  store float %val, float* %p1
+  ret float * %p1;
+}
+
+define double * @test_store_double(double %val, double * %p1) {
+;
+; SSE_FAST-LABEL: test_store_double:
+; SSE_FAST:       # BB#0:
+; SSE_FAST-NEXT:    movd %xmm0, %rax
+; SSE_FAST-NEXT:    movq %rax, (%rdi)
+; SSE_FAST-NEXT:    movq %rdi, %rax
+; SSE_FAST-NEXT:    retq
+;
+; SSE_GREEDY-LABEL: test_store_double:
+; SSE_GREEDY:       # BB#0:
+; SSE_GREEDY-NEXT:    movsd %xmm0, (%rdi)
+; SSE_GREEDY-NEXT:    movq %rdi, %rax
+; SSE_GREEDY-NEXT:    retq
+;
+; ALL_AVX_FAST-LABEL: test_store_double:
+; ALL_AVX_FAST:       # BB#0:
+; ALL_AVX_FAST-NEXT:    vmovq %xmm0, %rax
+; ALL_AVX_FAST-NEXT:    movq %rax, (%rdi)
+; ALL_AVX_FAST-NEXT:    movq %rdi, %rax
+; ALL_AVX_FAST-NEXT:    retq
+;
+; ALL_AVX_GREEDY-LABEL: test_store_double:
+; ALL_AVX_GREEDY:       # BB#0:
+; ALL_AVX_GREEDY-NEXT:    vmovsd %xmm0, (%rdi)
+; ALL_AVX_GREEDY-NEXT:    movq %rdi, %rax
+; ALL_AVX_GREEDY-NEXT:    retq
+  store double %val, double* %p1
+  ret double * %p1;
+}
+
diff --git a/test/CodeGen/X86/GlobalISel/select-constant.mir b/test/CodeGen/X86/GlobalISel/select-constant.mir
new file mode 100644
index 0000000000000000000000000000000000000000..f6b97b578b9274c90f470224fe62515a0c7bf2d2
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/select-constant.mir
@@ -0,0 +1,143 @@
+# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK
+
+--- |
+  define i8 @const_i8() {
+    ret i8 2
+  }
+
+  define i16 @const_i16() {
+    ret i16 3
+  }
+
+  define i32 @const_i32() {
+    ret i32 4
+  }
+
+  define i64 @const_i64() {
+    ret i64 68719476720
+  }
+
+  define i64 @const_i64_u32() {
+    ret i64 1879048192
+  }
+
+  define i64 @const_i64_i32() {
+    ret i64 -1
+  }
+
+...
+---
+name:            const_i8
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK-LABEL: name:            const_i8
+# CHECK: registers:
+# CHECK-NEXT:  - { id: 0, class: gr8 }
+registers:
+  - { id: 0, class: gpr }
+# CHECK:  body:
+# CHECK:    %0 = MOV8ri 2
+body:             |
+  bb.1 (%ir-block.0):
+    %0(s8) = G_CONSTANT i8 2
+    %al = COPY %0(s8)
+    RET 0, implicit %al
+
+...
+---
+name:            const_i16
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK-LABEL: name:            const_i16
+# CHECK: registers:
+# CHECK-NEXT:  - { id: 0, class: gr16 }
+registers:
+  - { id: 0, class: gpr }
+# CHECK:  body:
+# CHECK:    %0 = MOV16ri 3
+body:             |
+  bb.1 (%ir-block.0):
+    %0(s16) = G_CONSTANT i16 3
+    %ax = COPY %0(s16)
+    RET 0, implicit %ax
+
+...
+---
+name:            const_i32
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK-LABEL: name:            const_i32
+# CHECK: registers:
+# CHECK-NEXT:  - { id: 0, class: gr32 }
+registers:
+  - { id: 0, class: gpr }
+# CHECK:  body:
+# CHECK:    %0 = MOV32ri 4
+body:             |
+  bb.1 (%ir-block.0):
+    %0(s32) = G_CONSTANT i32 4
+    %eax = COPY %0(s32)
+    RET 0, implicit %eax
+
+...
+---
+name:            const_i64
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK-LABEL: name:            const_i64
+# CHECK: registers:
+# CHECK-NEXT:  - { id: 0, class: gr64 }
+registers:
+  - { id: 0, class: gpr }
+# CHECK:  body:
+# CHECK:    %0 = MOV64ri 68719476720
+body:             |
+  bb.1 (%ir-block.0):
+    %0(s64) = G_CONSTANT i64 68719476720
+    %rax = COPY %0(s64)
+    RET 0, implicit %rax
+
+...
+---
+name:            const_i64_u32
+alignment:       4
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK-LABEL: name:            const_i64_u32
+# CHECK: registers:
+# CHECK-NEXT:  - { id: 0, class: gr64 }
+registers:
+  - { id: 0, class: gpr }
+# CHECK:  body:
+# CHECK:    %0 = MOV64ri32 1879048192
+body:             |
+  bb.1 (%ir-block.0):
+    %0(s64) = G_CONSTANT i64 1879048192
+    %rax = COPY %0(s64)
+    RET 0, implicit %rax
+
+...
+---
+name:            const_i64_i32
+legalized:       true
+regBankSelected: true
+selected:        false
+# CHECK-LABEL: name:            const_i64_i32
+# CHECK: registers:
+# CHECK-NEXT:  - { id: 0, class: gr64 }
+registers:
+  - { id: 0, class: gpr }
+# CHECK:  body:
+# CHECK:    %0 = MOV64ri32 -1
+body:             |
+  bb.1 (%ir-block.0):
+    %0(s64) = G_CONSTANT i64 -1
+    %rax = COPY %0(s64)
+    RET 0, implicit %rax
+
+...
diff --git a/test/CodeGen/X86/GlobalISel/x86_64-instructionselect.mir b/test/CodeGen/X86/GlobalISel/x86_64-instructionselect.mir
new file mode 100644
index 0000000000000000000000000000000000000000..17522c3cb45ebd8f929b124a866f3d724e9d7c1a
--- /dev/null
+++ b/test/CodeGen/X86/GlobalISel/x86_64-instructionselect.mir
@@ -0,0 +1,1022 @@
+# RUN: llc -mtriple=x86_64-linux-gnu                                  -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=SSE
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx                      -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=AVX
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f                  -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=AVX512ALL --check-prefix=AVX512F
+# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512ALL --check-prefix=AVX512VL
+
+--- |
+  define i64 @test_add_i64(i64 %arg1, i64 %arg2) {
+    %ret = add i64 %arg1, %arg2
+    ret i64 %ret
+  }
+
+  define i32 @test_add_i32(i32 %arg1, i32 %arg2) {
+    %ret = add i32 %arg1, %arg2
+    ret i32 %ret
+  }
+
+  define i64 @test_sub_i64(i64 %arg1, i64 %arg2) {
+    %ret = sub i64 %arg1, %arg2
+    ret i64 %ret
+  }
+
+  define i32 @test_sub_i32(i32 %arg1, i32 %arg2) {
+    %ret = sub i32 %arg1, %arg2
+    ret i32 %ret
+  }
+
+  define float @test_add_float(float %arg1, float %arg2) {
+    %ret = fadd float %arg1, %arg2
+    ret float %ret
+  }
+
+  define double @test_add_double(double %arg1, double %arg2) {
+    %ret = fadd double %arg1, %arg2
+    ret double %ret
+  }
+
+  define float @test_sub_float(float %arg1, float %arg2) {
+    %ret = fsub float %arg1, %arg2
+    ret float %ret
+  }
+
+  define double @test_sub_double(double %arg1, double %arg2) {
+    %ret = fsub double %arg1, %arg2
+    ret double %ret
+  }
+
+  define <4 x i32> @test_add_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
+    %ret = add <4 x i32> %arg1, %arg2
+    ret <4 x i32> %ret
+  }
+
+  define <4 x i32> @test_sub_v4i32(<4 x i32> %arg1, <4 x i32> %arg2) {
+    %ret = sub <4 x i32> %arg1, %arg2
+    ret <4 x i32> %ret
+  }
+
+  define <4 x float>  @test_add_v4f32(<4 x float> %arg1, <4 x float>  %arg2) {
+    %ret = fadd <4 x float>  %arg1, %arg2
+    ret <4 x float>  %ret
+  }
+
+  define <4 x float>  @test_sub_v4f32(<4 x float> %arg1, <4 x float>  %arg2) {
+    %ret = fsub <4 x float>  %arg1, %arg2
+    ret <4 x float>  %ret
+  }
+  
+    define i8 @test_load_i8(i8* %p1) {
+    %r = load i8, i8* %p1
+    ret i8 %r
+  }
+
+  define i16 @test_load_i16(i16* %p1) {
+    %r = load i16, i16* %p1
+    ret i16 %r
+  }
+
+  define i32 @test_load_i32(i32* %p1) {
+    %r = load i32, i32* %p1
+    ret i32 %r
+  }
+
+  define i64 @test_load_i64(i64* %p1) {
+    %r = load i64, i64* %p1
+    ret i64 %r
+  }
+
+  define float @test_load_float(float* %p1) {
+    %r = load float, float* %p1
+    ret float %r
+  }
+  
+  define float @test_load_float_vecreg(float* %p1) {
+    %r = load float, float* %p1
+    ret float %r
+  }
+  
+
+  define double @test_load_double(double* %p1) {
+    %r = load double, double* %p1
+    ret double %r
+  }
+
+  define double @test_load_double_vecreg(double* %p1) {
+    %r = load double, double* %p1
+    ret double %r
+  }
+
+  define <4 x i32> @test_load_v4i32_noalign(<4 x i32>* %p1) {
+    %r = load <4 x i32>, <4 x i32>* %p1, align 1
+    ret <4 x i32> %r
+  }
+
+  define <4 x i32> @test_load_v4i32_align(<4 x i32>* %p1) {
+    %r = load <4 x i32>, <4 x i32>* %p1, align 16
+    ret <4 x i32> %r
+  }
+
+  define i32* @test_store_i32(i32 %val, i32* %p1) {
+    store i32 %val, i32* %p1
+    ret i32* %p1
+  }
+
+  define i64* @test_store_i64(i64 %val, i64* %p1) {
+    store i64 %val, i64* %p1
+    ret i64* %p1
+  }
+
+  define float* @test_store_float(float %val, float* %p1) {
+    store float %val, float* %p1
+    ret float* %p1
+  }
+
+  define float* @test_store_float_vec(float %val, float* %p1) {
+    store float %val, float* %p1
+    ret float* %p1
+  }
+
+  define double* @test_store_double(double %val, double* %p1) {
+    store double %val, double* %p1
+    ret double* %p1
+  }
+  
+  define double* @test_store_double_vec(double %val, double* %p1) {
+    store double %val, double* %p1
+    ret double* %p1
+  }
+
+  define <4 x i32>* @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) {
+    store <4 x i32> %val, <4 x i32>* %p1, align 16
+    ret <4 x i32>* %p1
+  }
+
+  define <4 x i32>* @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) {
+    store <4 x i32> %val, <4 x i32>* %p1, align 1
+    ret <4 x i32>* %p1
+  }
+
+...
+
+---
+name:            test_add_i64
+legalized:       true
+regBankSelected: true
+# ALL:      registers:
+# ALL-NEXT:  - { id: 0, class: gr64 }
+# ALL-NEXT:  - { id: 1, class: gr64 }
+# ALL-NEXT:  - { id: 2, class: gr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+# ALL:      %0 = COPY %rdi
+# ALL-NEXT: %1 = COPY %rsi
+# ALL-NEXT: %2 = ADD64rr %0, %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi, %rsi
+
+    %0(s64) = COPY %rdi
+    %1(s64) = COPY %rsi
+    %2(s64) = G_ADD %0, %1
+    %rax = COPY %2(s64)
+
+...
+
+---
+name:            test_add_i32
+legalized:       true
+regBankSelected: true
+# ALL:      registers:
+# ALL-NEXT:  - { id: 0, class: gr32 }
+# ALL-NEXT:  - { id: 1, class: gr32 }
+# ALL-NEXT:  - { id: 2, class: gr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+# ALL:      %0 = COPY %edi
+# ALL-NEXT: %1 = COPY %esi
+# ALL-NEXT: %2 = ADD32rr %0, %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %edi, %esi
+
+    %0(s32) = COPY %edi
+    %1(s32) = COPY %esi
+    %2(s32) = G_ADD %0, %1
+    %eax = COPY %2(s32)
+
+...
+
+---
+name:            test_sub_i64
+legalized:       true
+regBankSelected: true
+# ALL:      registers:
+# ALL-NEXT:  - { id: 0, class: gr64 }
+# ALL-NEXT:  - { id: 1, class: gr64 }
+# ALL-NEXT:  - { id: 2, class: gr64 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+# ALL:      %0 = COPY %rdi
+# ALL-NEXT: %1 = COPY %rsi
+# ALL-NEXT: %2 = SUB64rr %0, %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi, %rsi
+
+    %0(s64) = COPY %rdi
+    %1(s64) = COPY %rsi
+    %2(s64) = G_SUB %0, %1
+    %rax = COPY %2(s64)
+
+...
+
+---
+name:            test_sub_i32
+legalized:       true
+regBankSelected: true
+# ALL:      registers:
+# ALL-NEXT:  - { id: 0, class: gr32 }
+# ALL-NEXT:  - { id: 1, class: gr32 }
+# ALL-NEXT:  - { id: 2, class: gr32 }
+registers:
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+# ALL:      %0 = COPY %edi
+# ALL-NEXT: %1 = COPY %esi
+# ALL-NEXT: %2 = SUB32rr %0, %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %edi, %esi
+
+    %0(s32) = COPY %edi
+    %1(s32) = COPY %esi
+    %2(s32) = G_SUB %0, %1
+    %eax = COPY %2(s32)
+
+...
+
+---
+name:            test_add_float
+alignment:       4
+legalized:       true
+regBankSelected: true
+selected:        false
+tracksRegLiveness: true
+# ALL: registers:
+# NO_AVX512F-NEXT:  - { id: 0, class: fr32 }
+# NO_AVX512F-NEXT:  - { id: 1, class: fr32 }
+# NO_AVX512F-NEXT:  - { id: 2, class: fr32 }
+# AVX512ALL-NEXT:  - { id: 0, class: fr32x }
+# AVX512ALL-NEXT:  - { id: 1, class: fr32x }
+# AVX512ALL-NEXT:  - { id: 2, class: fr32x }
+registers:
+  - { id: 0, class: vecr }
+  - { id: 1, class: vecr }
+  - { id: 2, class: vecr }
+# ALL:          %0 = COPY %xmm0
+# ALL-NEXT:     %1 = COPY %xmm1
+# SSE-NEXT:     %2 = ADDSSrr %0, %1
+# AVX-NEXT:     %2 = VADDSSrr %0, %1
+# AVX512F-NEXT: %2 = VADDSSZrr %0, %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %xmm0, %xmm1
+
+    %0(s32) = COPY %xmm0
+    %1(s32) = COPY %xmm1
+    %2(s32) = G_FADD %0, %1
+    %xmm0 = COPY %2(s32)
+    RET 0, implicit %xmm0
+
+...
+---
+name:            test_add_double
+alignment:       4
+legalized:       true
+regBankSelected: true
+selected:        false
+tracksRegLiveness: true
+# ALL: registers:
+# NO_AVX512F-NEXT:  - { id: 0, class: fr64 }
+# NO_AVX512F-NEXT:  - { id: 1, class: fr64 }
+# NO_AVX512F-NEXT:  - { id: 2, class: fr64 }
+# AVX512ALL-NEXT:  - { id: 0, class: fr64x }
+# AVX512ALL-NEXT:  - { id: 1, class: fr64x }
+# AVX512ALL-NEXT:  - { id: 2, class: fr64x }
+registers:
+  - { id: 0, class: vecr }
+  - { id: 1, class: vecr }
+  - { id: 2, class: vecr }
+# ALL:          %0 = COPY %xmm0
+# ALL-NEXT:     %1 = COPY %xmm1
+# SSE-NEXT:     %2 = ADDSDrr %0, %1
+# AVX-NEXT:     %2 = VADDSDrr %0, %1
+# AVX512F-NEXT: %2 = VADDSDZrr %0, %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %xmm0, %xmm1
+
+    %0(s64) = COPY %xmm0
+    %1(s64) = COPY %xmm1
+    %2(s64) = G_FADD %0, %1
+    %xmm0 = COPY %2(s64)
+    RET 0, implicit %xmm0
+
+...
+---
+name:            test_sub_float
+alignment:       4
+legalized:       true
+regBankSelected: true
+selected:        false
+tracksRegLiveness: true
+# ALL: registers:
+# NO_AVX512F-NEXT:  - { id: 0, class: fr32 }
+# NO_AVX512F-NEXT:  - { id: 1, class: fr32 }
+# NO_AVX512F-NEXT:  - { id: 2, class: fr32 }
+# AVX512ALL-NEXT:  - { id: 0, class: fr32x }
+# AVX512ALL-NEXT:  - { id: 1, class: fr32x }
+# AVX512ALL-NEXT:  - { id: 2, class: fr32x }
+registers:
+  - { id: 0, class: vecr }
+  - { id: 1, class: vecr }
+  - { id: 2, class: vecr }
+# ALL:          %0 = COPY %xmm0
+# ALL-NEXT:     %1 = COPY %xmm1
+# SSE-NEXT:     %2 = SUBSSrr %0, %1
+# AVX-NEXT:     %2 = VSUBSSrr %0, %1
+# AVX512F-NEXT: %2 = VSUBSSZrr %0, %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %xmm0, %xmm1
+
+    %0(s32) = COPY %xmm0
+    %1(s32) = COPY %xmm1
+    %2(s32) = G_FSUB %0, %1
+    %xmm0 = COPY %2(s32)
+    RET 0, implicit %xmm0
+
+...
+---
+name:            test_sub_double
+alignment:       4
+legalized:       true
+regBankSelected: true
+selected:        false
+tracksRegLiveness: true
+# ALL: registers:
+# NO_AVX512F-NEXT:  - { id: 0, class: fr64 }
+# NO_AVX512F-NEXT:  - { id: 1, class: fr64 }
+# NO_AVX512F-NEXT:  - { id: 2, class: fr64 }
+# AVX512ALL-NEXT:  - { id: 0, class: fr64x }
+# AVX512ALL-NEXT:  - { id: 1, class: fr64x }
+# AVX512ALL-NEXT:  - { id: 2, class: fr64x }
+registers:
+  - { id: 0, class: vecr }
+  - { id: 1, class: vecr }
+  - { id: 2, class: vecr }
+# ALL:          %0 = COPY %xmm0
+# ALL-NEXT:     %1 = COPY %xmm1
+# SSE-NEXT:     %2 = SUBSDrr %0, %1
+# AVX-NEXT:     %2 = VSUBSDrr %0, %1
+# AVX512F-NEXT: %2 = VSUBSDZrr %0, %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %xmm0, %xmm1
+
+    %0(s64) = COPY %xmm0
+    %1(s64) = COPY %xmm1
+    %2(s64) = G_FSUB %0, %1
+    %xmm0 = COPY %2(s64)
+    RET 0, implicit %xmm0
+...
+---
+name:            test_add_v4i32
+alignment:       4
+legalized:       true
+regBankSelected: true
+selected:        false
+tracksRegLiveness: true
+# ALL: registers:
+# NO_AVX512VL-NEXT:  - { id: 0, class: vr128 }
+# NO_AVX512VL-NEXT:  - { id: 1, class: vr128 }
+# NO_AVX512VL-NEXT:  - { id: 2, class: vr128 }
+# AVX512VL-NEXT:  - { id: 0, class: vr128x }
+# AVX512VL-NEXT:  - { id: 1, class: vr128x }
+# AVX512VL-NEXT:  - { id: 2, class: vr128x }
+registers:
+  - { id: 0, class: vecr }
+  - { id: 1, class: vecr }
+  - { id: 2, class: vecr }
+# ALL:           %0 = COPY %xmm0
+# ALL-NEXT:      %1 = COPY %xmm1
+# SSE-NEXT:      %2 = PADDDrr %0, %1
+# AVX-NEXT:      %2 = VPADDDrr %0, %1
+# AVX512F-NEXT:  %2 = VPADDDrr %0, %1
+# AVX512VL-NEXT: %2 = VPADDDZ128rr %0, %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %xmm0, %xmm1
+
+    %0(<4 x s32>) = COPY %xmm0
+    %1(<4 x s32>) = COPY %xmm1
+    %2(<4 x s32>) = G_ADD %0, %1
+    %xmm0 = COPY %2(<4 x s32>)
+    RET 0, implicit %xmm0
+
+...
+---
+name:            test_sub_v4i32
+alignment:       4
+legalized:       true
+regBankSelected: true
+selected:        false
+tracksRegLiveness: true
+# ALL: registers:
+# NO_AVX512VL-NEXT:  - { id: 0, class: vr128 }
+# NO_AVX512VL-NEXT:  - { id: 1, class: vr128 }
+# NO_AVX512VL-NEXT:  - { id: 2, class: vr128 }
+# AVX512VL-NEXT:  - { id: 0, class: vr128x }
+# AVX512VL-NEXT:  - { id: 1, class: vr128x }
+# AVX512VL-NEXT:  - { id: 2, class: vr128x }
+registers:
+  - { id: 0, class: vecr }
+  - { id: 1, class: vecr }
+  - { id: 2, class: vecr }
+# ALL:           %0 = COPY %xmm0
+# ALL-NEXT:      %1 = COPY %xmm1
+# SSE-NEXT:      %2 = PSUBDrr %0, %1
+# AVX-NEXT:      %2 = VPSUBDrr %0, %1
+# AVX512F-NEXT:  %2 = VPSUBDrr %0, %1
+# AVX512VL-NEXT: %2 = VPSUBDZ128rr %0, %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %xmm0, %xmm1
+
+    %0(<4 x s32>) = COPY %xmm0
+    %1(<4 x s32>) = COPY %xmm1
+    %2(<4 x s32>) = G_SUB %0, %1
+    %xmm0 = COPY %2(<4 x s32>)
+    RET 0, implicit %xmm0
+
+...
+---
+name:            test_add_v4f32
+alignment:       4
+legalized:       true
+regBankSelected: true
+selected:        false
+tracksRegLiveness: true
+# ALL: registers:
+# NO_AVX512VL-NEXT:  - { id: 0, class: vr128 }
+# NO_AVX512VL-NEXT:  - { id: 1, class: vr128 }
+# NO_AVX512VL-NEXT:  - { id: 2, class: vr128 }
+# AVX512VL-NEXT:  - { id: 0, class: vr128x }
+# AVX512VL-NEXT:  - { id: 1, class: vr128x }
+# AVX512VL-NEXT:  - { id: 2, class: vr128x }
+registers:
+  - { id: 0, class: vecr }
+  - { id: 1, class: vecr }
+  - { id: 2, class: vecr }
+# ALL:           %0 = COPY %xmm0
+# ALL-NEXT:      %1 = COPY %xmm1
+# SSE-NEXT:      %2 = ADDPSrr %0, %1
+# AVX-NEXT:      %2 = VADDPSrr %0, %1
+# AVX512F-NEXT:  %2 = VADDPSrr %0, %1
+# AVX512VL-NEXT: %2 = VADDPSZ128rr %0, %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %xmm0, %xmm1
+
+    %0(<4 x s32>) = COPY %xmm0
+    %1(<4 x s32>) = COPY %xmm1
+    %2(<4 x s32>) = G_FADD %0, %1
+    %xmm0 = COPY %2(<4 x s32>)
+    RET 0, implicit %xmm0
+
+...
+---
+name:            test_sub_v4f32
+alignment:       4
+legalized:       true
+regBankSelected: true
+selected:        false
+tracksRegLiveness: true
+# ALL: registers:
+# NO_AVX512VL-NEXT:  - { id: 0, class: vr128 }
+# NO_AVX512VL-NEXT:  - { id: 1, class: vr128 }
+# NO_AVX512VL-NEXT:  - { id: 2, class: vr128 }
+# AVX512VL-NEXT:  - { id: 0, class: vr128x }
+# AVX512VL-NEXT:  - { id: 1, class: vr128x }
+# AVX512VL-NEXT:  - { id: 2, class: vr128x }
+registers:
+  - { id: 0, class: vecr }
+  - { id: 1, class: vecr }
+  - { id: 2, class: vecr }
+# ALL:           %0 = COPY %xmm0
+# ALL-NEXT:      %1 = COPY %xmm1
+# SSE-NEXT:      %2 = SUBPSrr %0, %1
+# AVX-NEXT:      %2 = VSUBPSrr %0, %1
+# AVX512F-NEXT:  %2 = VSUBPSrr %0, %1
+# AVX512VL-NEXT: %2 = VSUBPSZ128rr %0, %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %xmm0, %xmm1
+
+    %0(<4 x s32>) = COPY %xmm0
+    %1(<4 x s32>) = COPY %xmm1
+    %2(<4 x s32>) = G_FSUB %0, %1
+    %xmm0 = COPY %2(<4 x s32>)
+    RET 0, implicit %xmm0
+
+...
+---
+# ALL-LABEL: name:            test_load_i8
+name:            test_load_i8
+alignment:       4
+legalized:       true
+regBankSelected: true
+registers:
+# ALL:   - { id: 0, class: gr64 }
+# ALL:   - { id: 1, class: gr8 }
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+# ALL:     %0 = COPY %rdi
+# ALL:     %1 = MOV8rm %0, 1, _, 0, _ :: (load 1 from %ir.p1)
+# ALL:     %al = COPY %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(s8) = G_LOAD %0(p0) :: (load 1 from %ir.p1)
+    %al = COPY %1(s8)
+    RET 0, implicit %al
+
+...
+---
+# ALL-LABEL: name:            test_load_i16
+name:            test_load_i16
+alignment:       4
+legalized:       true
+regBankSelected: true
+registers:
+# ALL:   - { id: 0, class: gr64 }
+# ALL:   - { id: 1, class: gr16 }
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+# ALL:     %0 = COPY %rdi
+# ALL:     %1 = MOV16rm %0, 1, _, 0, _ :: (load 2 from %ir.p1)
+# ALL:     %ax = COPY %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(s16) = G_LOAD %0(p0) :: (load 2 from %ir.p1)
+    %ax = COPY %1(s16)
+    RET 0, implicit %ax
+
+...
+---
+# ALL-LABEL: name:            test_load_i32
+name:            test_load_i32
+alignment:       4
+legalized:       true
+regBankSelected: true
+registers:
+# ALL:   - { id: 0, class: gr64 }
+# ALL:   - { id: 1, class: gr32 }
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+# ALL:     %0 = COPY %rdi
+# ALL:     %1 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1)
+# ALL:     %eax = COPY %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1)
+    %eax = COPY %1(s32)
+    RET 0, implicit %eax
+
+...
+---
+# ALL-LABEL: name:            test_load_i64
+name:            test_load_i64
+alignment:       4
+legalized:       true
+regBankSelected: true
+registers:
+# ALL:   - { id: 0, class: gr64 }
+# ALL:   - { id: 1, class: gr64 }
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+# ALL:     %0 = COPY %rdi
+# ALL:     %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.p1)
+# ALL:     %rax = COPY %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1)
+    %rax = COPY %1(s64)
+    RET 0, implicit %rax
+
+...
+---
+# ALL-LABEL: name:            test_load_float
+name:            test_load_float
+alignment:       4
+legalized:       true
+regBankSelected: true
+registers:
+# ALL:   - { id: 0, class: gr64 }
+# ALL:   - { id: 1, class: gr32 }
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+# ALL:     %0 = COPY %rdi
+# ALL:     %1 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1)
+# ALL:     %xmm0 = COPY %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1)
+    %xmm0 = COPY %1(s32)
+    RET 0, implicit %xmm0
+
+...
+---
+# ALL-LABEL: name:            test_load_float_vecreg
+name:            test_load_float_vecreg
+alignment:       4
+legalized:       true
+regBankSelected: true
+registers:
+# ALL:          - { id: 0, class: gr64 }
+# NO_AVX512F:   - { id: 1, class: fr32 }
+# AVX512ALL:    - { id: 1, class: fr32x }
+  - { id: 0, class: gpr }
+  - { id: 1, class: vecr }
+# ALL:       %0 = COPY %rdi
+# SSE:       %1 = MOVSSrm %0, 1, _, 0, _ :: (load 4 from %ir.p1)
+# AVX:       %1 = VMOVSSrm %0, 1, _, 0, _ :: (load 4 from %ir.p1)
+# AVX512ALL: %1 = VMOVSSZrm %0, 1, _, 0, _ :: (load 4 from %ir.p1)
+# ALL: %xmm0 = COPY %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1)
+    %xmm0 = COPY %1(s32)
+    RET 0, implicit %xmm0
+
+...
+---
+# ALL-LABEL: name:            test_load_double
+name:            test_load_double
+alignment:       4
+legalized:       true
+regBankSelected: true
+registers:
+# ALL:   - { id: 0, class: gr64 }
+# ALL:   - { id: 1, class: gr64 }
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+# ALL:     %0 = COPY %rdi
+# ALL:     %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.p1)
+# ALL:     %xmm0 = COPY %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1)
+    %xmm0 = COPY %1(s64)
+    RET 0, implicit %xmm0
+
+...
+---
+# ALL-LABEL: name:            test_load_double_vecreg
+name:            test_load_double_vecreg
+alignment:       4
+legalized:       true
+regBankSelected: true
+registers:
+# ALL:          - { id: 0, class: gr64 }
+# NO_AVX512F:   - { id: 1, class: fr64 }
+# AVX512ALL:    - { id: 1, class: fr64x }
+  - { id: 0, class: gpr }
+  - { id: 1, class: vecr }
+# ALL:       %0 = COPY %rdi
+# SSE:       %1 = MOVSDrm %0, 1, _, 0, _ :: (load 8 from %ir.p1)
+# AVX:       %1 = VMOVSDrm %0, 1, _, 0, _ :: (load 8 from %ir.p1)
+# AVX512ALL: %1 = VMOVSDZrm %0, 1, _, 0, _ :: (load 8 from %ir.p1)
+# ALL: %xmm0 = COPY %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1)
+    %xmm0 = COPY %1(s64)
+    RET 0, implicit %xmm0
+
+...
+---
+# ALL-LABEL: name:            test_load_v4i32_noalign
+name:            test_load_v4i32_noalign
+alignment:       4
+legalized:       true
+regBankSelected: true
+registers:
+# ALL:          - { id: 0, class: gr64 }
+# NO_AVX512F:   - { id: 1, class: vr128 }
+# AVX512ALL:    - { id: 1, class: vr128x }
+  - { id: 0, class: gpr }
+  - { id: 1, class: vecr }
+# ALL:      %0 = COPY %rdi
+# SSE:      %1 = MOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1)
+# AVX:      %1 = VMOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1)
+# AVX512F:  %1 = VMOVUPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1)
+# AVX512VL: %1 = VMOVUPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1)
+# ALL: %xmm0 = COPY %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1, align 1)
+    %xmm0 = COPY %1(<4 x s32>)
+    RET 0, implicit %xmm0
+
+...
+---
+# ALL-LABEL: name:            test_load_v4i32_align
+name:            test_load_v4i32_align
+alignment:       4
+legalized:       true
+regBankSelected: true
+registers:
+# ALL:   - { id: 0, class: gr64 }
+# NO_AVX512F:   - { id: 1, class: vr128 }
+# AVX512ALL:    - { id: 1, class: vr128x }
+  - { id: 0, class: gpr }
+  - { id: 1, class: vecr }
+# ALL:      %0 = COPY %rdi
+# SSE:      %1 = MOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1)
+# AVX:      %1 = VMOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1)
+# AVX512F:  %1 = VMOVAPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1)
+# AVX512VL: %1 = VMOVAPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1)
+# ALL: %xmm0 = COPY %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi
+
+    %0(p0) = COPY %rdi
+    %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1)
+    %xmm0 = COPY %1(<4 x s32>)
+    RET 0, implicit %xmm0
+
+...
+---
+# ALL-LABEL: name:            test_store_i32
+name:            test_store_i32
+alignment:       4
+legalized:       true
+regBankSelected: true
+registers:
+# ALL:   - { id: 0, class: gr32 }
+# ALL:   - { id: 1, class: gr64 }
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+# ALL:     %0 = COPY %edi
+# ALL:     %1 = COPY %rsi
+# ALL:     MOV32mr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1)
+# ALL:     %rax = COPY %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %edi, %rsi
+
+    %0(s32) = COPY %edi
+    %1(p0) = COPY %rsi
+    G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1)
+    %rax = COPY %1(p0)
+    RET 0, implicit %rax
+
+...
+---
+# ALL-LABEL: name:            test_store_i64
+name:            test_store_i64
+alignment:       4
+legalized:       true
+regBankSelected: true
+registers:
+# ALL:   - { id: 0, class: gr64 }
+# ALL:   - { id: 1, class: gr64 }
+  - { id: 0, class: gpr }
+  - { id: 1, class: gpr }
+# ALL:     %0 = COPY %rdi
+# ALL:     %1 = COPY %rsi
+# ALL:     MOV64mr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1)
+# ALL:     %rax = COPY %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi, %rsi
+
+    %0(s64) = COPY %rdi
+    %1(p0) = COPY %rsi
+    G_STORE %0(s64), %1(p0) :: (store 8 into %ir.p1)
+    %rax = COPY %1(p0)
+    RET 0, implicit %rax
+
+...
+---
+# ALL-LABEL: name:            test_store_float
+name:            test_store_float
+alignment:       4
+legalized:       true
+regBankSelected: true
+registers:
+# ALL:   - { id: 0, class: fr32x }
+# ALL:   - { id: 1, class: gr64 }
+# ALL:   - { id: 2, class: gr32 }
+  - { id: 0, class: vecr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+# ALL:     %0 = COPY %xmm0
+# ALL:     %1 = COPY %rdi
+# ALL:     %2 = COPY %0
+# ALL:     MOV32mr %1, 1, _, 0, _, %2 :: (store 4 into %ir.p1)
+# ALL:     %rax = COPY %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi, %xmm0
+
+    %0(s32) = COPY %xmm0
+    %1(p0) = COPY %rdi
+    %2(s32) = COPY %0(s32)
+    G_STORE %2(s32), %1(p0) :: (store 4 into %ir.p1)
+    %rax = COPY %1(p0)
+    RET 0, implicit %rax
+
+...
+---
+# ALL-LABEL: name:            test_store_float_vec
+name:            test_store_float_vec
+alignment:       4
+legalized:       true
+regBankSelected: true
+registers:
+# NO_AVX512F:   - { id: 0, class: fr32 }
+# AVX512ALL:    - { id: 0, class: fr32x }
+# ALL:   - { id: 1, class: gr64 }
+  - { id: 0, class: vecr }
+  - { id: 1, class: gpr }
+# ALL:       %0 = COPY %xmm0
+# ALL:       %1 = COPY %rdi
+# SSE:       MOVSSmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1)
+# AVX:       VMOVSSmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1)
+# AVX512ALL: VMOVSSZmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1)
+# ALL:       %rax = COPY %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi, %xmm0
+
+    %0(s32) = COPY %xmm0
+    %1(p0) = COPY %rdi
+    G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1)
+    %rax = COPY %1(p0)
+    RET 0, implicit %rax
+
+...
+---
+# ALL-LABEL: name:            test_store_double
+name:            test_store_double
+alignment:       4
+legalized:       true
+regBankSelected: true
+registers:
+# ALL:   - { id: 0, class: fr64x }
+# ALL:   - { id: 1, class: gr64 }
+# ALL:   - { id: 2, class: gr64 }
+  - { id: 0, class: vecr }
+  - { id: 1, class: gpr }
+  - { id: 2, class: gpr }
+# ALL:     %0 = COPY %xmm0
+# ALL:     %1 = COPY %rdi
+# ALL:     %2 = COPY %0
+# ALL:     MOV64mr %1, 1, _, 0, _, %2 :: (store 8 into %ir.p1)
+# ALL:     %rax = COPY %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi, %xmm0
+
+    %0(s64) = COPY %xmm0
+    %1(p0) = COPY %rdi
+    %2(s64) = COPY %0(s64)
+    G_STORE %2(s64), %1(p0) :: (store 8 into %ir.p1)
+    %rax = COPY %1(p0)
+    RET 0, implicit %rax
+
+...
+---
+# ALL-LABEL: name:            test_store_double_vec
+name:            test_store_double_vec
+alignment:       4
+legalized:       true
+regBankSelected: true
+registers:
+# NO_AVX512F:   - { id: 0, class: fr64 }
+# AVX512ALL:    - { id: 0, class: fr64x }
+# ALL:   - { id: 1, class: gr64 }
+  - { id: 0, class: vecr }
+  - { id: 1, class: gpr }
+# ALL:       %0 = COPY %xmm0
+# ALL:       %1 = COPY %rdi
+# SSE:       MOVSDmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1)
+# AVX:       VMOVSDmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1)
+# AVX512ALL: VMOVSDZmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1)
+# ALL:       %rax = COPY %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi, %xmm0
+
+    %0(s64) = COPY %xmm0
+    %1(p0) = COPY %rdi
+    G_STORE %0(s64), %1(p0) :: (store 8 into %ir.p1)
+    %rax = COPY %1(p0)
+    RET 0, implicit %rax
+
+...
+---
+# ALL-LABEL: name:            test_store_v4i32_align
+name:            test_store_v4i32_align
+alignment:       4
+legalized:       true
+regBankSelected: true
+registers:
+# NO_AVX512F:   - { id: 0, class: vr128 }
+# AVX512ALL:    - { id: 0, class: vr128x }
+# ALL:   - { id: 1, class: gr64 }
+  - { id: 0, class: vecr }
+  - { id: 1, class: gpr }
+# ALL:       %0 = COPY %xmm0
+# ALL:       %1 = COPY %rdi
+# SSE:       MOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1)
+# AVX:       VMOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1)
+# AVX512F:   VMOVAPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1)
+# AVX512VL:  VMOVAPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1)
+# ALL:       %rax = COPY %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi, %xmm0
+
+    %0(<4 x s32>) = COPY %xmm0
+    %1(p0) = COPY %rdi
+    G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 16)
+    %rax = COPY %1(p0)
+    RET 0, implicit %rax
+
+...
+---
+# ALL-LABEL: name:            test_store_v4i32_noalign
+name:            test_store_v4i32_noalign
+alignment:       4
+legalized:       true
+regBankSelected: true
+registers:
+# NO_AVX512F:   - { id: 0, class: vr128 }
+# AVX512ALL:    - { id: 0, class: vr128x }
+# ALL:   - { id: 1, class: gr64 }
+  - { id: 0, class: vecr }
+  - { id: 1, class: gpr }
+# ALL:       %0 = COPY %xmm0
+# ALL:       %1 = COPY %rdi
+# SSE:       MOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1)
+# AVX:       VMOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1)
+# AVX512F:   VMOVUPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1)
+# AVX512VL:  VMOVUPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1)
+# ALL:       %rax = COPY %1
+body:             |
+  bb.1 (%ir-block.0):
+    liveins: %rdi, %xmm0
+
+    %0(<4 x s32>) = COPY %xmm0
+    %1(p0) = COPY %rdi
+    G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 1)
+    %rax = COPY %1(p0)
+    RET 0, implicit %rax
+
+...
diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll
index 4d7cb765d7b9e96af2abba1b5d1af1f0e9338c03..4303b6254464255526c4fb8ce9ff81f150fdbe0d 100644
--- a/test/CodeGen/X86/MergeConsecutiveStores.ll
+++ b/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -1,6 +1,5 @@
 ; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWON %s
 ; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -fixup-byte-word-insts=0 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWOFF %s
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx -addr-sink-using-gep=1 < %s | FileCheck -check-prefix=CHECK -check-prefix=BWON %s
 
 %struct.A = type { i8, i8, i8, i8, i8, i8, i8, i8 }
 %struct.B = type { i32, i32, i32, i32, i32, i32, i32, i32 }
@@ -111,8 +110,7 @@ define void @merge_const_store_vec(i32 %count, %struct.B* nocapture %p) nounwind
 ; CHECK-LABEL: merge_nonconst_store:
 ; CHECK: movl $67305985
 ; CHECK: movb
-; CHECK: movb
-; CHECK: movb
+; CHECK: movw
 ; CHECK: movb
 ; CHECK: ret
 define void @merge_nonconst_store(i32 %count, i8 %zz, %struct.A* nocapture %p) nounwind uwtable noinline ssp {
@@ -292,16 +290,12 @@ block4:                                       ; preds = %4, %.lr.ph
   ret void
 }
 
-;; On x86, even unaligned copies should be merged to vector ops.
-;; TODO: however, this cannot happen at the moment, due to brokenness
-;; in MergeConsecutiveStores. See UseAA FIXME in DAGCombiner.cpp
-;; visitSTORE.
-
+;; On x86, even unaligned copies can be merged to vector ops.
 ; CHECK-LABEL: merge_loads_no_align:
 ;  load:
-; CHECK-NOT: vmovups ;; TODO
+; CHECK: vmovups
 ;  store:
-; CHECK-NOT: vmovups ;; TODO
+; CHECK: vmovups
 ; CHECK: ret
 define void @merge_loads_no_align(i32 %count, %struct.B* noalias nocapture %q, %struct.B* noalias nocapture %p) nounwind uwtable noinline ssp {
   %a1 = icmp sgt i32 %count, 0
@@ -583,8 +577,8 @@ define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
 
 ; CHECK-LABEL: merge_vec_element_and_scalar_load
 ; CHECK:      movq	(%rdi), %rax
+; CHECK-NEXT: movq	8(%rdi), %rcx
 ; CHECK-NEXT: movq	%rax, 32(%rdi)
-; CHECK-NEXT: movq	8(%rdi), %rax
-; CHECK-NEXT: movq	%rax, 40(%rdi)
+; CHECK-NEXT: movq	%rcx, 40(%rdi)
 ; CHECK-NEXT: retq
 }
diff --git a/test/CodeGen/X86/StackColoring-dbg.ll b/test/CodeGen/X86/StackColoring-dbg.ll
index 15be7aa1029ff15337931413247fc4ae897f9f7f..0ebd01d1c4edf33d84d1ef3c502433ff8eeb6691 100644
--- a/test/CodeGen/X86/StackColoring-dbg.ll
+++ b/test/CodeGen/X86/StackColoring-dbg.ll
@@ -15,16 +15,16 @@ entry:
   br label %for.body
 
 for.body:
-  call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
-  call void @llvm.lifetime.start(i64 -1, i8* %x.i) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %x.i) nounwind
   call void @llvm.dbg.declare(metadata i8* %x.i, metadata !22, metadata !DIExpression()) nounwind, !dbg !DILocation(scope: !2)
   br label %for.body
 }
 
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
 
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!23}
diff --git a/test/CodeGen/X86/StackColoring.ll b/test/CodeGen/X86/StackColoring.ll
index f974cdc30a210c434eb3e514be72467baea2c6ff..93888c470e2db7d13e6105f9127a2511eaf1a06b 100644
--- a/test/CodeGen/X86/StackColoring.ll
+++ b/test/CodeGen/X86/StackColoring.ll
@@ -15,14 +15,14 @@ entry:
   %a2 = alloca [16 x i8*], align 8
   %b = bitcast [17 x i8*]* %a to i8*
   %b2 = bitcast [16 x i8*]* %a2 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b)
   %t1 = call i32 @foo(i32 %in, i8* %b)
   %t2 = call i32 @foo(i32 %in, i8* %b)
-  call void @llvm.lifetime.end(i64 -1, i8* %b)
-  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b2)
   %t3 = call i32 @foo(i32 %in, i8* %b2)
   %t4 = call i32 @foo(i32 %in, i8* %b2)
-  call void @llvm.lifetime.end(i64 -1, i8* %b2)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b2)
   %t5 = add i32 %t1, %t2
   %t6 = add i32 %t3, %t4
   %t7 = add i32 %t5, %t6
@@ -40,22 +40,22 @@ entry:
   %a2 = alloca [16 x i8*], align 8
   %b = bitcast [17 x i8*]* %a to i8*
   %b2 = bitcast [16 x i8*]* %a2 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b)
   %t1 = call i32 @foo(i32 %in, i8* %b)
   %t2 = call i32 @foo(i32 %in, i8* %b)
   br i1 %d, label %bb2, label %bb3
 bb2:
-  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b2)
   %t3 = call i32 @foo(i32 %in, i8* %b2)
   %t4 = call i32 @foo(i32 %in, i8* %b2)
-  call void @llvm.lifetime.end(i64 -1, i8* %b2)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b2)
   %t5 = add i32 %t1, %t2
   %t6 = add i32 %t3, %t4
   %t7 = add i32 %t5, %t6
-  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b)
   ret i32 %t7
 bb3:
-  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b)
   ret i32 0
 }
 
@@ -69,16 +69,16 @@ entry:
   %a2 = alloca [16 x i8*], align 8
   %b = bitcast [17 x i8*]* %a to i8*
   %b2 = bitcast [16 x i8*]* %a2 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b)
   %t1 = call i32 @foo(i32 %in, i8* %b)
   %t2 = call i32 @foo(i32 %in, i8* %b)
-  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b)
   br i1 %d, label %bb2, label %bb3
 bb2:
-  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b2)
   %t3 = call i32 @foo(i32 %in, i8* %b2)
   %t4 = call i32 @foo(i32 %in, i8* %b2)
-  call void @llvm.lifetime.end(i64 -1, i8* %b2)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b2)
   %t5 = add i32 %t1, %t2
   %t6 = add i32 %t3, %t4
   %t7 = add i32 %t5, %t6
@@ -102,21 +102,21 @@ entry:
   %b2 = bitcast [13 x i8*]* %a2 to i8*
   %b3 = bitcast [12 x i8*]* %a3 to i8*
   %b4 = bitcast [11 x i8*]* %a4 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %b4)
-  call void @llvm.lifetime.start(i64 -1, i8* %b1)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b4)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b1)
   %t1 = call i32 @foo(i32 %in, i8* %b1)
   %t2 = call i32 @foo(i32 %in, i8* %b1)
-  call void @llvm.lifetime.end(i64 -1, i8* %b1)
-  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b1)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b2)
   %t9 = call i32 @foo(i32 %in, i8* %b2)
   %t8 = call i32 @foo(i32 %in, i8* %b2)
-  call void @llvm.lifetime.end(i64 -1, i8* %b2)
-  call void @llvm.lifetime.start(i64 -1, i8* %b3)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b2)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b3)
   %t3 = call i32 @foo(i32 %in, i8* %b3)
   %t4 = call i32 @foo(i32 %in, i8* %b3)
-  call void @llvm.lifetime.end(i64 -1, i8* %b3)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b3)
   %t11 = call i32 @foo(i32 %in, i8* %b4)
-  call void @llvm.lifetime.end(i64 -1, i8* %b4)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b4)
   %t5 = add i32 %t1, %t2
   %t6 = add i32 %t3, %t4
   %t7 = add i32 %t5, %t6
@@ -137,23 +137,23 @@ entry:
   %b2 = bitcast [13 x i8*]* %a2 to i8*
   %b3 = bitcast [12 x i8*]* %a3 to i8*
   %b4 = bitcast [11 x i8*]* %a4 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %b1)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b1)
   %t1 = call i32 @foo(i32 %in, i8* %b1)
   %t2 = call i32 @foo(i32 %in, i8* %b1)
-  call void @llvm.lifetime.end(i64 -1, i8* %b1)
-  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b1)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b2)
   %t9 = call i32 @foo(i32 %in, i8* %b2)
   %t8 = call i32 @foo(i32 %in, i8* %b2)
-  call void @llvm.lifetime.end(i64 -1, i8* %b2)
-  call void @llvm.lifetime.start(i64 -1, i8* %b3)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b2)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b3)
   %t3 = call i32 @foo(i32 %in, i8* %b3)
   %t4 = call i32 @foo(i32 %in, i8* %b3)
-  call void @llvm.lifetime.end(i64 -1, i8* %b3)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b3)
   br i1 undef, label %bb2, label %bb3
 bb2:
-  call void @llvm.lifetime.start(i64 -1, i8* %b4)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b4)
   %t11 = call i32 @foo(i32 %in, i8* %b4)
-  call void @llvm.lifetime.end(i64 -1, i8* %b4)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b4)
   %t5 = add i32 %t1, %t2
   %t6 = add i32 %t3, %t4
   %t7 = add i32 %t5, %t6
@@ -174,13 +174,13 @@ entry:
   %a2 = alloca [16 x i8*], align 8
   %b = bitcast [17 x i8*]* %a to i8*
   %b2 = bitcast [16 x i8*]* %a2 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b)
   %t1 = call i32 @foo(i32 %in, i8* %b)
   %t2 = call i32 @foo(i32 %in, i8* %b)
-  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b)
   br i1 %d, label %bb2, label %bb3
 bb2:
-  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b2)
   %t3 = call i32 @foo(i32 %in, i8* %b2)
   %t4 = call i32 @foo(i32 %in, i8* %b2)
   %t5 = add i32 %t1, %t2
@@ -200,13 +200,13 @@ entry:
   %a2 = alloca [16 x i8*], align 8
   %b = bitcast [17 x i8*]* %a to i8*
   %b2 = bitcast [16 x i8*]* %a2 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b)
   %t1 = call i32 @foo(i32 %in, i8* %b)
   %t2 = call i32 @foo(i32 %in, i8* %b)
   br i1 %d, label %bb2, label %bb3
 bb2:
-  call void @llvm.lifetime.end(i64 -1, i8* %b)
-  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b2)
   %t3 = call i32 @foo(i32 %in, i8* %b2)
   %t4 = call i32 @foo(i32 %in, i8* %b2)
   %t5 = add i32 %t1, %t2
@@ -229,10 +229,10 @@ entry:
   %b2 = bitcast [16 x i8*]* %a2 to i8*
   %t1 = call i32 @foo(i32 %in, i8* %b)
   %t2 = call i32 @foo(i32 %in, i8* %b)
-  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b)
   br i1 %d, label %bb2, label %bb3
 bb2:
-  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b2)
   %t3 = call i32 @foo(i32 %in, i8* %b2)
   %t4 = call i32 @foo(i32 %in, i8* %b2)
   %t5 = add i32 %t1, %t2
@@ -254,19 +254,19 @@ entry:
   %A.i = alloca [100 x i32], align 4
   %B.i = alloca [100 x i32], align 4
   %0 = bitcast [100 x i32]* %A.i to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %0) nounwind
   %1 = bitcast [100 x i32]* %B.i to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %1) nounwind
   call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
-  call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
-  call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %1) nounwind
   %2 = bitcast [100 x i32]* %A.i1 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %2) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %2) nounwind
   %3 = bitcast [100 x i32]* %B.i2 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %3) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %3) nounwind
   call void @bar([100 x i32]* %A.i1, [100 x i32]* %B.i2) nounwind
-  call void @llvm.lifetime.end(i64 -1, i8* %2) nounwind
-  call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %2) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %3) nounwind
   ret void
 }
 
@@ -281,7 +281,7 @@ entry:
   %b2 = bitcast [16 x i8*]* %a2 to i8*
   %t1 = call i32 @foo(i32 %in, i8* %b)
   %t2 = call i32 @foo(i32 %in, i8* %b)
-  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b)
   br i1 %d, label %bb0, label %bb1
 
 bb0:
@@ -294,13 +294,13 @@ bb1:
 
 bb2:
   %split = phi i8* [ %I1, %bb0 ], [ %I2, %bb1 ]
-  call void @llvm.lifetime.start(i64 -1, i8* %split)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %split)
   %t3 = call i32 @foo(i32 %in, i8* %b2)
   %t4 = call i32 @foo(i32 %in, i8* %b2)
   %t5 = add i32 %t1, %t2
   %t6 = add i32 %t3, %t4
   %t7 = add i32 %t5, %t6
-  call void @llvm.lifetime.end(i64 -1, i8* %split)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %split)
   ret i32 %t7
 bb3:
   ret i32 0
@@ -318,21 +318,21 @@ entry:
   %A.i = alloca [100 x i32], align 4
   %B.i = alloca [100 x i32], align 4
   %0 = bitcast [100 x i32]* %A.i to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind ; <---- start #1
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %0) nounwind ; <---- start #1
   %1 = bitcast [100 x i32]* %B.i to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %1) nounwind
   call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
-  call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
-  call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %1) nounwind
   %2 = bitcast [100 x i32]* %A.i1 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %2) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %2) nounwind
   %3 = bitcast [100 x i32]* %B.i2 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %3) nounwind
-  call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind  ; <---- start #2
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %3) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %0) nounwind  ; <---- start #2
   call void @bar([100 x i32]* %A.i1, [100 x i32]* %B.i2) nounwind
-  call void @llvm.lifetime.end(i64 -1, i8* %2) nounwind
-  call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
-  call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %2) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %3) nounwind
   ret void
 }
 
@@ -344,11 +344,11 @@ entry:
   %b2 = bitcast [16 x i8*]* %a2 to i8*
   %t1 = call i32 @foo(i32 %in, i8* %b)
   %t2 = call i32 @foo(i32 %in, i8* %b)
-  call void @llvm.lifetime.end(i64 -1, i8* %b)
-  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b)
   br i1 %d, label %bb2, label %bb3
 bb2:
-  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b2)
   %t3 = call i32 @foo(i32 %in, i8* %b2)
   %t4 = call i32 @foo(i32 %in, i8* %b2)
   %t5 = add i32 %t1, %t2
@@ -369,11 +369,11 @@ define void @myCall_pr15707() {
   %buf1 = alloca i8, i32 100000, align 16
   %buf2 = alloca i8, i32 100000, align 16
 
-  call void @llvm.lifetime.start(i64 -1, i8* %buf1)
-  call void @llvm.lifetime.end(i64 -1, i8* %buf1)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %buf1)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %buf1)
 
-  call void @llvm.lifetime.start(i64 -1, i8* %buf1)
-  call void @llvm.lifetime.start(i64 -1, i8* %buf2)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %buf1)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %buf2)
   %result1 = call i32 @foo(i32 0, i8* %buf1)
   %result2 = call i32 @foo(i32 0, i8* %buf2)
   ret void
@@ -390,12 +390,12 @@ entry:
   %A.i = alloca [100 x i32], align 4
   %B.i = alloca [100 x i32], align 4
   %0 = bitcast [100 x i32]* %A.i to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %0) nounwind
   %1 = bitcast [100 x i32]* %B.i to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %1) nounwind
   call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
-  call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
-  call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %1) nounwind
   br label %block2
 
 block2:
@@ -418,13 +418,13 @@ define i32 @shady_range(i32 %argc, i8** nocapture %argv) uwtable {
   %b8 = bitcast [4 x %struct.Klass]* %b.i to i8*
   ; I am used outside the lifetime zone below:
   %z2 = getelementptr inbounds [4 x %struct.Klass], [4 x %struct.Klass]* %a.i, i64 0, i64 0, i32 0
-  call void @llvm.lifetime.start(i64 -1, i8* %a8)
-  call void @llvm.lifetime.start(i64 -1, i8* %b8)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %a8)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b8)
   %z3 = load i32, i32* %z2, align 16
   %r = call i32 @foo(i32 %z3, i8* %a8)
   %r2 = call i32 @foo(i32 %z3, i8* %b8)
-  call void @llvm.lifetime.end(i64 -1, i8* %a8)
-  call void @llvm.lifetime.end(i64 -1, i8* %b8)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %a8)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b8)
   ret i32 9
 }
 
@@ -446,38 +446,38 @@ entry:
   %b4 = alloca [128 x i32], align 16
   %b5 = alloca [128 x i32], align 16
   %tmp = bitcast [128 x i32]* %b1 to i8*
-  call void @llvm.lifetime.start(i64 512, i8* %tmp)
+  call void @llvm.lifetime.start.p0i8(i64 512, i8* %tmp)
   %tmp1 = bitcast [128 x i32]* %b2 to i8*
-  call void @llvm.lifetime.start(i64 512, i8* %tmp1)
+  call void @llvm.lifetime.start.p0i8(i64 512, i8* %tmp1)
   %and = and i32 %x, 1
   %tobool = icmp eq i32 %and, 0
   br i1 %tobool, label %if.else, label %if.then
 
 if.then:                                          ; preds = %entry
   %tmp2 = bitcast [128 x i32]* %b3 to i8*
-  call void @llvm.lifetime.start(i64 512, i8* %tmp2)
+  call void @llvm.lifetime.start.p0i8(i64 512, i8* %tmp2)
   %a1 = getelementptr inbounds [128 x i32], [128 x i32]* %b1, i64 0, i64 0
   %a2 = getelementptr inbounds [128 x i32], [128 x i32]* %b3, i64 0, i64 0
   call void @initb(i32* %a1, i32* %a2, i32* null)
-  call void @llvm.lifetime.end(i64 512, i8* %tmp2)
+  call void @llvm.lifetime.end.p0i8(i64 512, i8* %tmp2)
   br label %if.end
 
 if.else:                                          ; preds = %entry
   %tmp3 = bitcast [128 x i32]* %b4 to i8*
-  call void @llvm.lifetime.start(i64 512, i8* %tmp3)
+  call void @llvm.lifetime.start.p0i8(i64 512, i8* %tmp3)
   %tmp4 = bitcast [128 x i32]* %b5 to i8*
-  call void @llvm.lifetime.start(i64 512, i8* %tmp4)
+  call void @llvm.lifetime.start.p0i8(i64 512, i8* %tmp4)
   %a3 = getelementptr inbounds [128 x i32], [128 x i32]* %b2, i64 0, i64 0
   %a4 = getelementptr inbounds [128 x i32], [128 x i32]* %b4, i64 0, i64 0
   %a5 = getelementptr inbounds [128 x i32], [128 x i32]* %b5, i64 0, i64 0
   call void @initb(i32* %a3, i32* %a4, i32* %a5) #3
-  call void @llvm.lifetime.end(i64 512, i8* %tmp4)
-  call void @llvm.lifetime.end(i64 512, i8* %tmp3)
+  call void @llvm.lifetime.end.p0i8(i64 512, i8* %tmp4)
+  call void @llvm.lifetime.end.p0i8(i64 512, i8* %tmp3)
   br label %if.end
 
 if.end:                                           ; preds = %if.else, %if.then
-  call void @llvm.lifetime.end(i64 512, i8* %tmp1)
-  call void @llvm.lifetime.end(i64 512, i8* %tmp)
+  call void @llvm.lifetime.end.p0i8(i64 512, i8* %tmp1)
+  call void @llvm.lifetime.end.p0i8(i64 512, i8* %tmp)
   ret i32 0
 
 }
@@ -499,9 +499,9 @@ entry:
   %b2 = alloca [128 x i32], align 16
   %b3 = alloca [128 x i32], align 16
   %tmp = bitcast [128 x i32]* %b1 to i8*
-  call void @llvm.lifetime.start(i64 512, i8* %tmp) #3
+  call void @llvm.lifetime.start.p0i8(i64 512, i8* %tmp) #3
   %tmp1 = bitcast [128 x i32]* %b2 to i8*
-  call void @llvm.lifetime.start(i64 512, i8* %tmp1) #3
+  call void @llvm.lifetime.start.p0i8(i64 512, i8* %tmp1) #3
   %and = and i32 %x, 1
   %tobool = icmp eq i32 %and, 0
   br i1 %tobool, label %if.else, label %if.then
@@ -526,9 +526,9 @@ while.body.lr.ph:                                 ; preds = %if.else
 while.body:                                       ; preds = %while.body.lr.ph, %while.body
   %x.addr.06 = phi i32 [ %x, %while.body.lr.ph ], [ %dec, %while.body ]
   %dec = add nsw i32 %x.addr.06, -1
-  call void @llvm.lifetime.start(i64 512, i8* %tmp2) #3
+  call void @llvm.lifetime.start.p0i8(i64 512, i8* %tmp2) #3
   call void @inita(i32* %arraydecay3) #3
-  call void @llvm.lifetime.end(i64 512, i8* %tmp2) #3
+  call void @llvm.lifetime.end.p0i8(i64 512, i8* %tmp2) #3
   %tobool2 = icmp eq i32 %dec, 0
   br i1 %tobool2, label %if.end.loopexit, label %while.body
 
@@ -536,8 +536,8 @@ if.end.loopexit:                                  ; preds = %while.body
   br label %if.end
 
 if.end:                                           ; preds = %if.end.loopexit, %if.else, %if.then
-  call void @llvm.lifetime.end(i64 512, i8* %tmp1) #3
-  call void @llvm.lifetime.end(i64 512, i8* %tmp) #3
+  call void @llvm.lifetime.end.p0i8(i64 512, i8* %tmp1) #3
+  call void @llvm.lifetime.end.p0i8(i64 512, i8* %tmp) #3
   ret i32 0
 }
 
@@ -556,25 +556,25 @@ entry:
   %buffer.i = alloca [12 x i32], align 16
   %abc = alloca [12 x i32], align 16
   %tmp = bitcast [12 x i32]* %buffer.i to i8*
-  call void @llvm.lifetime.start(i64 48, i8* %tmp)
+  call void @llvm.lifetime.start.p0i8(i64 48, i8* %tmp)
   %idxprom.i = sext i32 %y to i64
   %arrayidx.i = getelementptr inbounds [12 x i32], [12 x i32]* %buffer.i, i64 0, i64 %idxprom.i
   call void @inita(i32* %arrayidx.i)
   %add.i = add nsw i32 %x, %y
-  call void @llvm.lifetime.end(i64 48, i8* %tmp)
+  call void @llvm.lifetime.end.p0i8(i64 48, i8* %tmp)
   %tobool = icmp eq i32 %y, 0
   br i1 %tobool, label %if.end, label %if.then
 
 if.then:                                          ; preds = %entry
   %tmp1 = bitcast [12 x i32]* %abc to i8*
-  call void @llvm.lifetime.start(i64 48, i8* %tmp1)
+  call void @llvm.lifetime.start.p0i8(i64 48, i8* %tmp1)
   %arrayidx = getelementptr inbounds [12 x i32], [12 x i32]* %abc, i64 0, i64 %idxprom.i
   call void @inita(i32* %arrayidx)
-  call void @llvm.lifetime.start(i64 48, i8* %tmp)
+  call void @llvm.lifetime.start.p0i8(i64 48, i8* %tmp)
   call void @inita(i32* %arrayidx.i)
   %add.i9 = add nsw i32 %add.i, %y
-  call void @llvm.lifetime.end(i64 48, i8* %tmp)
-  call void @llvm.lifetime.end(i64 48, i8* %tmp1)
+  call void @llvm.lifetime.end.p0i8(i64 48, i8* %tmp)
+  call void @llvm.lifetime.end.p0i8(i64 48, i8* %tmp1)
   br label %if.end
 
 if.end:                                           ; preds = %if.then, %entry
@@ -588,8 +588,8 @@ declare void @initb(i32*,i32*,i32*)
 
 declare void @bar([100 x i32]* , [100 x i32]*) nounwind
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
 
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
 
 declare i32 @foo(i32, i8*)
diff --git a/test/CodeGen/X86/absolute-cmp.ll b/test/CodeGen/X86/absolute-cmp.ll
new file mode 100644
index 0000000000000000000000000000000000000000..01e8a90177ccc487058bdedfb99a8a226846f94f
--- /dev/null
+++ b/test/CodeGen/X86/absolute-cmp.ll
@@ -0,0 +1,39 @@
+; RUN: llc < %s | FileCheck %s
+; RUN: llc -relocation-model=pic < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@cmp8 = external hidden global i8, !absolute_symbol !0
+@cmp32 = external hidden global i8, !absolute_symbol !1
+
+declare void @f()
+
+define void @foo8(i64 %val) {
+  ; CHECK: cmpq $cmp8@ABS8, %rdi
+  %cmp = icmp ule i64 %val, ptrtoint (i8* @cmp8 to i64)
+  br i1 %cmp, label %t, label %f
+
+t:
+  call void @f()
+  ret void
+
+f:
+  ret void
+}
+
+define void @foo32(i64 %val) {
+  ; CHECK: cmpq $cmp32, %rdi
+  %cmp = icmp ule i64 %val, ptrtoint (i8* @cmp32 to i64)
+  br i1 %cmp, label %t, label %f
+
+t:
+  call void @f()
+  ret void
+
+f:
+  ret void
+}
+
+!0 = !{i64 0, i64 128}
+!1 = !{i64 0, i64 2147483648}
diff --git a/test/CodeGen/X86/absolute-rotate.ll b/test/CodeGen/X86/absolute-rotate.ll
index c0ecb82adc2f54f3a4057e3eb03f3dee20f8bdb7..6240e8d3f76ff043f0c979a27c5f7865b698cde1 100644
--- a/test/CodeGen/X86/absolute-rotate.ll
+++ b/test/CodeGen/X86/absolute-rotate.ll
@@ -11,7 +11,7 @@ declare void @f()
 define void @foo(i64 %val) {
   %shr = lshr i64 %val, zext (i8 ptrtoint (i8* @align to i8) to i64)
   %shl = shl i64 %val, zext (i8 sub (i8 64, i8 ptrtoint (i8* @align to i8)) to i64)
-  ; CHECK: rorq $align, %rdi
+  ; CHECK: rorq $align@ABS8, %rdi
   %ror = or i64 %shr, %shl
   %cmp = icmp ult i64 %ror, 109
   br i1 %cmp, label %t, label %f
@@ -24,4 +24,4 @@ f:
   ret void
 }
 
-!0 = !{i64 0, i64 256}
+!0 = !{i64 0, i64 128}
diff --git a/test/CodeGen/X86/add-of-carry.ll b/test/CodeGen/X86/add-of-carry.ll
index 44b587af3aaad1e3a5aa3da24d467d21c9558980..b9f7fc68cf689fadf5de5396d0b523b3493e4a78 100644
--- a/test/CodeGen/X86/add-of-carry.ll
+++ b/test/CodeGen/X86/add-of-carry.ll
@@ -1,13 +1,18 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s
+
+; These tests use adc/sbb in place of set+add/sub. Should this transform
+; be enabled by micro-architecture rather than as part of generic lowering/isel?
+
 ; <rdar://problem/8449754>
 
 define i32 @test1(i32 %sum, i32 %x) nounwind readnone ssp {
-entry:
 ; CHECK-LABEL: test1:
-; CHECK: movl
-; CHECK-NEXT: addl
-; CHECK-NEXT: adcl $0
-; CHECK-NEXT: ret
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    adcl $0, %eax
+; CHECK-NEXT:    retl
   %add4 = add i32 %x, %sum
   %cmp = icmp ult i32 %add4, %x
   %inc = zext i1 %cmp to i32
@@ -16,14 +21,18 @@ entry:
 }
 
 ; <rdar://problem/12579915>
+
 define i32 @test2(i32 %x, i32 %y, i32 %res) nounwind uwtable readnone ssp {
-entry:
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    cmpl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    sbbl $0, %eax
+; CHECK-NEXT:    retl
   %cmp = icmp ugt i32 %x, %y
   %dec = sext i1 %cmp to i32
   %dec.res = add nsw i32 %dec, %res
   ret i32 %dec.res
-; CHECK-LABEL: test2:
-; CHECK: cmpl
-; CHECK: sbbl
-; CHECK: ret
 }
+
diff --git a/test/CodeGen/X86/adde-carry.ll b/test/CodeGen/X86/adde-carry.ll
index 4d6ad82a2631ba8d67178c1eeb3a683d11af97e5..9483a6b492c5cf37d5b0de676d5ce7d4c8ef0fdd 100644
--- a/test/CodeGen/X86/adde-carry.ll
+++ b/test/CodeGen/X86/adde-carry.ll
@@ -24,26 +24,171 @@ entry:
  ret void
 }
 
-define i64 @pr31719(i64 %.elt, i64 %.elt24, i64 %t1) {
+define void @b(i32* nocapture %r, i64 %a, i64 %b, i32 %c) nounwind {
+; CHECK-LABEL: b:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    addq %rdx, %rsi
+; CHECK-NEXT:    adcl $0, %ecx
+; CHECK-NEXT:    movl %ecx, (%rdi)
+; CHECK-NEXT:    retq
+entry:
+ %0 = zext i64 %a to i128
+ %1 = zext i64 %b to i128
+ %2 = zext i32 %c to i128
+ %3 = add i128 %1, %0
+ %4 = lshr i128 %3, 64
+ %5 = add i128 %4, %2
+ %6 = trunc i128 %5 to i32
+ store i32 %6, i32* %r, align 4
+ ret void
+}
+
+define void @c(i16* nocapture %r, i64 %a, i64 %b, i16 %c) nounwind {
+; CHECK-LABEL: c:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    addq %rdx, %rsi
+; CHECK-NEXT:    adcl $0, %ecx
+; CHECK-NEXT:    movw %cx, (%rdi)
+; CHECK-NEXT:    retq
+entry:
+ %0 = zext i64 %a to i128
+ %1 = zext i64 %b to i128
+ %2 = zext i16 %c to i128
+ %3 = add i128 %1, %0
+ %4 = lshr i128 %3, 64
+ %5 = add i128 %4, %2
+ %6 = trunc i128 %5 to i16
+ store i16 %6, i16* %r, align 4
+ ret void
+}
+
+define void @d(i8* nocapture %r, i64 %a, i64 %b, i8 %c) nounwind {
+; CHECK-LABEL: d:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    addq %rdx, %rsi
+; CHECK-NEXT:    adcl $0, %ecx
+; CHECK-NEXT:    movb %cl, (%rdi)
+; CHECK-NEXT:    retq
+entry:
+ %0 = zext i64 %a to i128
+ %1 = zext i64 %b to i128
+ %2 = zext i8 %c to i128
+ %3 = add i128 %1, %0
+ %4 = lshr i128 %3, 64
+ %5 = add i128 %4, %2
+ %6 = trunc i128 %5 to i8
+ store i8 %6, i8* %r, align 4
+ ret void
+}
+
+%scalar = type { [4 x i64] }
+
+define %scalar @pr31719(%scalar* nocapture readonly %this, %scalar %arg.b) {
 ; CHECK-LABEL: pr31719:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    addq %rdx, %rdi
-; CHECK-NEXT:    sbbq %rax, %rax
-; CHECK-NEXT:    andl $1, %eax
-; CHECK-NEXT:    addq %rsi, %rax
+; CHECK-NEXT:    addq (%rsi), %rdx
+; CHECK-NEXT:    sbbq %r10, %r10
+; CHECK-NEXT:    andl $1, %r10d
+; CHECK-NEXT:    addq 8(%rsi), %rcx
+; CHECK-NEXT:    sbbq %r11, %r11
+; CHECK-NEXT:    andl $1, %r11d
+; CHECK-NEXT:    addq %r10, %rcx
+; CHECK-NEXT:    adcq $0, %r11
+; CHECK-NEXT:    addq 16(%rsi), %r8
 ; CHECK-NEXT:    sbbq %rax, %rax
 ; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    addq %r11, %r8
+; CHECK-NEXT:    adcq $0, %rax
+; CHECK-NEXT:    addq 24(%rsi), %r9
+; CHECK-NEXT:    addq %rax, %r9
+; CHECK-NEXT:    movq %rdx, (%rdi)
+; CHECK-NEXT:    movq %rcx, 8(%rdi)
+; CHECK-NEXT:    movq %r8, 16(%rdi)
+; CHECK-NEXT:    movq %r9, 24(%rdi)
+; CHECK-NEXT:    movq %rdi, %rax
+; CHECK-NEXT:    retq
+entry:
+  %0 = extractvalue %scalar %arg.b, 0
+  %.elt = extractvalue [4 x i64] %0, 0
+  %.elt24 = extractvalue [4 x i64] %0, 1
+  %.elt26 = extractvalue [4 x i64] %0, 2
+  %.elt28 = extractvalue [4 x i64] %0, 3
+  %1 = getelementptr inbounds %scalar , %scalar* %this, i64 0, i32 0, i64 0
+  %2 = load i64, i64* %1, align 8
+  %3 = zext i64 %2 to i128
+  %4 = zext i64 %.elt to i128
+  %5 = add nuw nsw i128 %3, %4
+  %6 = trunc i128 %5 to i64
+  %7 = lshr i128 %5, 64
+  %8 = getelementptr inbounds %scalar , %scalar * %this, i64 0, i32 0, i64 1
+  %9 = load i64, i64* %8, align 8
+  %10 = zext i64 %9 to i128
+  %11 = zext i64 %.elt24 to i128
+  %12 = add nuw nsw i128 %10, %11
+  %13 = add nuw nsw i128 %12, %7
+  %14 = trunc i128 %13 to i64
+  %15 = lshr i128 %13, 64
+  %16 = getelementptr inbounds %scalar , %scalar* %this, i64 0, i32 0, i64 2
+  %17 = load i64, i64* %16, align 8
+  %18 = zext i64 %17 to i128
+  %19 = zext i64 %.elt26 to i128
+  %20 = add nuw nsw i128 %18, %19
+  %21 = add nuw nsw i128 %20, %15
+  %22 = trunc i128 %21 to i64
+  %23 = lshr i128 %21, 64
+  %24 = getelementptr inbounds %scalar , %scalar* %this, i64 0, i32 0, i64 3
+  %25 = load i64, i64* %24, align 8
+  %26 = zext i64 %25 to i128
+  %27 = zext i64 %.elt28 to i128
+  %28 = add nuw nsw i128 %26, %27
+  %29 = add nuw nsw i128 %28, %23
+  %30 = trunc i128 %29 to i64
+  %31 = insertvalue [4 x i64] undef, i64 %6, 0
+  %32 = insertvalue [4 x i64] %31, i64 %14, 1
+  %33 = insertvalue [4 x i64] %32, i64 %22, 2
+  %34 = insertvalue [4 x i64] %33, i64 %30, 3
+  %35 = insertvalue %scalar undef, [4 x i64] %34, 0
+  ret %scalar %35
+}
+
+%accumulator= type { i64, i64, i32 }
+
+define void @muladd(%accumulator* nocapture %this, i64 %arg.a, i64 %arg.b) {
+; CHECK-LABEL: muladd:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movq %rdx, %rax
+; CHECK-NEXT:    mulq %rsi
+; CHECK-NEXT:    addq (%rdi), %rax
+; CHECK-NEXT:    adcq $0, %rdx
+; CHECK-NEXT:    movq %rax, (%rdi)
+; CHECK-NEXT:    addq 8(%rdi), %rdx
+; CHECK-NEXT:    movq %rdx, 8(%rdi)
+; CHECK-NEXT:    sbbl %eax, %eax
+; CHECK-NEXT:    subl %eax, 16(%rdi)
 ; CHECK-NEXT:    retq
 entry:
-  %t2 = zext i64 %t1 to i128
-  %t3 = zext i64 %.elt to i128
-  %t4 = add nuw nsw i128 %t2, %t3
-  %t5 = lshr i128 %t4, 64
-  %t6 = zext i64 %.elt24 to i128
-  %t7 = add nuw nsw i128 0, %t6
-  %t8 = add nuw nsw i128 %t7, %t5
-  %t9 = lshr i128 %t8, 64
-  %t10 = add nuw nsw i128 0, %t9
-  %t11 = trunc i128 %t10 to i64
-  ret i64 %t11
+  %0 = zext i64 %arg.a to i128
+  %1 = zext i64 %arg.b to i128
+  %2 = mul nuw i128 %1, %0
+  %3 = getelementptr inbounds %accumulator, %accumulator* %this, i64 0, i32 0
+  %4 = load i64, i64* %3, align 8
+  %5 = zext i64 %4 to i128
+  %6 = add i128 %5, %2
+  %7 = trunc i128 %6 to i64
+  store i64 %7, i64* %3, align 8
+  %8 = lshr i128 %6, 64
+  %9 = getelementptr inbounds %accumulator, %accumulator* %this, i64 0, i32 1
+  %10 = load i64, i64* %9, align 8
+  %11 = zext i64 %10 to i128
+  %12 = add nuw nsw i128 %8, %11
+  %13 = trunc i128 %12 to i64
+  store i64 %13, i64* %9, align 8
+  %14 = lshr i128 %12, 64
+  %15 = getelementptr inbounds %accumulator, %accumulator* %this, i64 0, i32 2
+  %16 = load i32, i32* %15, align 4
+  %17 = zext i32 %16 to i128
+  %18 = add nuw nsw i128 %14, %17
+  %19 = trunc i128 %18 to i32
+  store i32 %19, i32* %15, align 4
+  ret void
 }
diff --git a/test/CodeGen/X86/aes_intrinsics.ll b/test/CodeGen/X86/aes_intrinsics.ll
index fc1a2cc61289ae511c4f926e3a558f04036c0512..fc3d55a05429850dab186762f6a07164d251c824 100644
--- a/test/CodeGen/X86/aes_intrinsics.ll
+++ b/test/CodeGen/X86/aes_intrinsics.ll
@@ -1,7 +1,17 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+aes,-avx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+aes,-avx -show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+aes,+avx -show-mc-encoding | FileCheck %s --check-prefix=VCHECK
 
 define <2 x i64> @test_x86_aesni_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: aesdec
+; CHECK-LABEL: test_x86_aesni_aesdec:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    aesdec %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0xde,0xc1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+;
+; VCHECK-LABEL: test_x86_aesni_aesdec:
+; VCHECK:       ## BB#0:
+; VCHECK-NEXT:    vaesdec %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0xde,0xc1]
+; VCHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
   ret <2 x i64> %res
 }
@@ -9,7 +19,15 @@ declare <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64>, <2 x i64>) nounwind readnone
 
 
 define <2 x i64> @test_x86_aesni_aesdeclast(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: aesdeclast
+; CHECK-LABEL: test_x86_aesni_aesdeclast:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    aesdeclast %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0xdf,0xc1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+;
+; VCHECK-LABEL: test_x86_aesni_aesdeclast:
+; VCHECK:       ## BB#0:
+; VCHECK-NEXT:    vaesdeclast %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0xdf,0xc1]
+; VCHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
   ret <2 x i64> %res
 }
@@ -17,7 +35,15 @@ declare <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64>, <2 x i64>) nounwind read
 
 
 define <2 x i64> @test_x86_aesni_aesenc(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: aesenc
+; CHECK-LABEL: test_x86_aesni_aesenc:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    aesenc %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0xdc,0xc1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+;
+; VCHECK-LABEL: test_x86_aesni_aesenc:
+; VCHECK:       ## BB#0:
+; VCHECK-NEXT:    vaesenc %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0xdc,0xc1]
+; VCHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
   ret <2 x i64> %res
 }
@@ -25,7 +51,15 @@ declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>) nounwind readnone
 
 
 define <2 x i64> @test_x86_aesni_aesenclast(<2 x i64> %a0, <2 x i64> %a1) {
-  ; CHECK: aesenclast
+; CHECK-LABEL: test_x86_aesni_aesenclast:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    aesenclast %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0xdd,0xc1]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+;
+; VCHECK-LABEL: test_x86_aesni_aesenclast:
+; VCHECK:       ## BB#0:
+; VCHECK-NEXT:    vaesenclast %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0xdd,0xc1]
+; VCHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
   ret <2 x i64> %res
 }
@@ -33,7 +67,15 @@ declare <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64>, <2 x i64>) nounwind read
 
 
 define <2 x i64> @test_x86_aesni_aesimc(<2 x i64> %a0) {
-  ; CHECK: aesimc
+; CHECK-LABEL: test_x86_aesni_aesimc:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    aesimc %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x38,0xdb,0xc0]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+;
+; VCHECK-LABEL: test_x86_aesni_aesimc:
+; VCHECK:       ## BB#0:
+; VCHECK-NEXT:    vaesimc %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0xdb,0xc0]
+; VCHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64> %a0) ; <<2 x i64>> [#uses=1]
   ret <2 x i64> %res
 }
@@ -41,7 +83,15 @@ declare <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64>) nounwind readnone
 
 
 define <2 x i64> @test_x86_aesni_aeskeygenassist(<2 x i64> %a0) {
-  ; CHECK: aeskeygenassist
+; CHECK-LABEL: test_x86_aesni_aeskeygenassist:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    aeskeygenassist $7, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x3a,0xdf,0xc0,0x07]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+;
+; VCHECK-LABEL: test_x86_aesni_aeskeygenassist:
+; VCHECK:       ## BB#0:
+; VCHECK-NEXT:    vaeskeygenassist $7, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0xdf,0xc0,0x07]
+; VCHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7) ; <<2 x i64>> [#uses=1]
   ret <2 x i64> %res
 }
diff --git a/test/CodeGen/X86/and-sink.ll b/test/CodeGen/X86/and-sink.ll
new file mode 100644
index 0000000000000000000000000000000000000000..46e50f2a6a74e54b37e71ac5c3d0babddf0df5fe
--- /dev/null
+++ b/test/CodeGen/X86/and-sink.ll
@@ -0,0 +1,181 @@
+; RUN: llc -mtriple=i686-unknown -verify-machineinstrs < %s | FileCheck %s
+; RUN: opt < %s -codegenprepare -S -mtriple=x86_64-unknown-unknown | FileCheck --check-prefix=CHECK-CGP %s
+
+@A = global i32 zeroinitializer
+@B = global i32 zeroinitializer
+@C = global i32 zeroinitializer
+
+; Test that 'and' is sunk into bb0.
+define i32 @and_sink1(i32 %a, i1 %c) {
+; CHECK-LABEL: and_sink1:
+; CHECK: testb $1,
+; CHECK: je
+; CHECK-NOT: andl $4,
+; CHECK: movl $0, A
+; CHECK: testb $4,
+; CHECK: jne
+
+; CHECK-CGP-LABEL: @and_sink1(
+; CHECK-CGP-NOT: and i32
+  %and = and i32 %a, 4
+  br i1 %c, label %bb0, label %bb2
+bb0:
+; CHECK-CGP-LABEL: bb0:
+; CHECK-CGP: and i32
+; CHECK-CGP-NEXT: icmp eq i32
+; CHECK-CGP-NEXT: store
+; CHECK-CGP-NEXT: br
+  %cmp = icmp eq i32 %and, 0
+  store i32 0, i32* @A
+  br i1 %cmp, label %bb1, label %bb2
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+; Test that both 'and' and cmp get sunk to bb1.
+define i32 @and_sink2(i32 %a, i1 %c, i1 %c2) {
+; CHECK-LABEL: and_sink2:
+; CHECK: movl $0, A
+; CHECK: testb $1,
+; CHECK: je
+; CHECK-NOT: andl $4,
+; CHECK: movl $0, B
+; CHECK: testb $1,
+; CHECK: je
+; CHECK: movl $0, C
+; CHECK: testb $4,
+; CHECK: jne
+
+; CHECK-CGP-LABEL: @and_sink2(
+; CHECK-CGP-NOT: and i32
+  %and = and i32 %a, 4
+  store i32 0, i32* @A
+  br i1 %c, label %bb0, label %bb3
+bb0:
+; CHECK-CGP-LABEL: bb0:
+; CHECK-CGP-NOT: and i32
+; CHECK-CGP-NOT: icmp
+  %cmp = icmp eq i32 %and, 0
+  store i32 0, i32* @B
+  br i1 %c2, label %bb1, label %bb3
+bb1:
+; CHECK-CGP-LABEL: bb1:
+; CHECK-CGP: and i32
+; CHECK-CGP-NEXT: icmp eq i32
+; CHECK-CGP-NEXT: store
+; CHECK-CGP-NEXT: br
+  store i32 0, i32* @C
+  br i1 %cmp, label %bb2, label %bb0
+bb2:
+  ret i32 1
+bb3:
+  ret i32 0
+}
+
+; Test that CodeGenPrepare doesn't get stuck in a loop sinking and hoisting a masked load.
+define i32 @and_sink3(i1 %c, i32* %p) {
+; CHECK-LABEL: and_sink3:
+; CHECK: testb $1,
+; CHECK: je
+; CHECK: movzbl
+; CHECK-DAG: movl $0, A
+; CHECK-DAG: testl %
+; CHECK: je
+
+; CHECK-CGP-LABEL: @and_sink3(
+; CHECK-CGP: load i32
+; CHECK-CGP-NEXT: and i32
+  %load = load i32, i32* %p
+  %and = and i32 %load, 255
+  br i1 %c, label %bb0, label %bb2
+bb0:
+; CHECK-CGP-LABEL: bb0:
+; CHECK-CGP-NOT: and i32
+; CHECK-CGP: icmp eq i32
+  %cmp = icmp eq i32 %and, 0
+  store i32 0, i32* @A
+  br i1 %cmp, label %bb1, label %bb2
+bb1:
+  ret i32 1
+bb2:
+  ret i32 0
+}
+
+; Test that CodeGenPrepare sinks/duplicates non-immediate 'and'.
+define i32 @and_sink4(i32 %a, i32 %b, i1 %c) {
+; CHECK-LABEL: and_sink4:
+; CHECK: testb $1,
+; CHECK: je
+; CHECK-NOT: andl
+; CHECK-DAG: movl $0, A
+; CHECK-DAG: testl [[REG1:%[a-z0-9]+]], [[REG2:%[a-z0-9]+]]
+; CHECK: jne
+; CHECK-DAG: movl {{%[a-z0-9]+}}, B
+; CHECK-DAG: testl [[REG1]], [[REG2]]
+; CHECK: je
+
+; CHECK-CGP-LABEL: @and_sink4(
+; CHECK-CGP-NOT: and i32
+; CHECK-CGP-NOT: icmp
+  %and = and i32 %a, %b
+  %cmp = icmp eq i32 %and, 0
+  br i1 %c, label %bb0, label %bb3
+bb0:
+; CHECK-CGP-LABEL: bb0:
+; CHECK-CGP: and i32
+; CHECK-CGP-NEXT: icmp eq i32
+  store i32 0, i32* @A
+  br i1 %cmp, label %bb1, label %bb3
+bb1:
+; CHECK-CGP-LABEL: bb1:
+; CHECK-CGP: and i32
+; CHECK-CGP-NEXT: icmp eq i32
+  %add = add i32 %a, %b
+  store i32 %add, i32* @B
+  br i1 %cmp, label %bb2, label %bb3
+bb2:
+  ret i32 1
+bb3:
+  ret i32 0
+}
+
+
+; Test that CodeGenPrepare doesn't sink/duplicate non-immediate 'and'
+; when it would increase register pressure.
+define i32 @and_sink5(i32 %a, i32 %b, i32 %a2, i32 %b2, i1 %c) {
+; CHECK-LABEL: and_sink5:
+; CHECK: testb $1,
+; CHECK: je
+; CHECK-DAG: andl {{[0-9]+\(%[a-z0-9]+\)}}, [[REG:%[a-z0-9]+]]
+; CHECK-DAG: movl $0, A
+; CHECK: jne
+; CHECK-DAG: movl {{%[a-z0-9]+}}, B
+; CHECK-DAG: testl [[REG]], [[REG]]
+; CHECK: je
+
+; CHECK-CGP-LABEL: @and_sink5(
+; CHECK-CGP: and i32
+; CHECK-CGP-NOT: icmp
+  %and = and i32 %a, %b
+  %cmp = icmp eq i32 %and, 0
+  br i1 %c, label %bb0, label %bb3
+bb0:
+; CHECK-CGP-LABEL: bb0:
+; CHECK-CGP-NOT: and i32
+; CHECK-CGP: icmp eq i32
+  store i32 0, i32* @A
+  br i1 %cmp, label %bb1, label %bb3
+bb1:
+; CHECK-CGP-LABEL: bb1:
+; CHECK-CGP-NOT: and i32
+; CHECK-CGP: icmp eq i32
+  %add = add i32 %a2, %b2
+  store i32 %add, i32* @B
+  br i1 %cmp, label %bb2, label %bb3
+bb2:
+  ret i32 1
+bb3:
+  ret i32 0
+}
diff --git a/test/CodeGen/X86/arg-copy-elide.ll b/test/CodeGen/X86/arg-copy-elide.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b9a2eeeb7f8f989000c6025664065ebc15dd73cc
--- /dev/null
+++ b/test/CodeGen/X86/arg-copy-elide.ll
@@ -0,0 +1,299 @@
+; RUN: llc -mtriple=i686-windows < %s | FileCheck %s
+
+declare void @addrof_i1(i1*)
+declare void @addrof_i32(i32*)
+declare void @addrof_i64(i64*)
+declare void @addrof_i128(i128*)
+declare void @addrof_i32_x3(i32*, i32*, i32*)
+
+define void @simple(i32 %x) {
+entry:
+  %x.addr = alloca i32
+  store i32 %x, i32* %x.addr
+  call void @addrof_i32(i32* %x.addr)
+  ret void
+}
+
+; CHECK-LABEL: _simple:
+; CHECK: leal 4(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+; We need to load %x before calling addrof_i32 now because it could mutate %x in
+; place.
+
+define i32 @use_arg(i32 %x) {
+entry:
+  %x.addr = alloca i32
+  store i32 %x, i32* %x.addr
+  call void @addrof_i32(i32* %x.addr)
+  ret i32 %x
+}
+
+; CHECK-LABEL: _use_arg:
+; CHECK: pushl %[[csr:[^ ]*]]
+; CHECK-DAG: movl 8(%esp), %[[csr]]
+; CHECK-DAG: leal 8(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: movl %[[csr]], %eax
+; CHECK: popl %[[csr]]
+; CHECK: retl
+
+; We won't copy elide for types needing legalization such as i64 or i1.
+
+define i64 @split_i64(i64 %x) {
+entry:
+  %x.addr = alloca i64, align 4
+  store i64 %x, i64* %x.addr, align 4
+  call void @addrof_i64(i64* %x.addr)
+  ret i64 %x
+}
+
+; CHECK-LABEL: _split_i64:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK: pushl %[[csr2:[^ ]*]]
+; CHECK: pushl %[[csr1:[^ ]*]]
+; CHECK: andl $-8, %esp
+; CHECK-DAG: movl 8(%ebp), %[[csr1]]
+; CHECK-DAG: movl 12(%ebp), %[[csr2]]
+; CHECK-DAG: leal 8(%ebp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i64
+; CHECK-DAG: movl %[[csr1]], %eax
+; CHECK-DAG: movl %[[csr2]], %edx
+; CHECK: leal -8(%ebp), %esp
+; CHECK: popl %[[csr1]]
+; CHECK: popl %[[csr2]]
+; CHECK: popl %ebp
+; CHECK: retl
+
+define i1 @i1_arg(i1 %x) {
+  %x.addr = alloca i1
+  store i1 %x, i1* %x.addr
+  call void @addrof_i1(i1* %x.addr)
+  ret i1 %x
+}
+
+; CHECK-LABEL: _i1_arg:
+; CHECK: pushl   %ebx
+; CHECK: movb 8(%esp), %bl
+; CHECK: leal 8(%esp), %eax
+; CHECK: pushl %eax
+; CHECK: calll _addrof_i1
+; CHECK: addl $4, %esp
+; CHECK: movl %ebx, %eax
+; CHECK: popl %ebx
+; CHECK: retl
+
+; We can't copy elide when an i64 is split between registers and memory in a
+; fastcc function.
+
+define fastcc i64 @fastcc_split_i64(i64* %p, i64 %x) {
+entry:
+  %x.addr = alloca i64, align 4
+  store i64 %x, i64* %x.addr, align 4
+  call void @addrof_i64(i64* %x.addr)
+  ret i64 %x
+}
+
+; CHECK-LABEL: _fastcc_split_i64:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK-DAG: movl %edx, %[[r1:[^ ]*]]
+; CHECK-DAG: movl 8(%ebp), %[[r2:[^ ]*]]
+; CHECK-DAG: movl %[[r2]], 4(%esp)
+; CHECK-DAG: movl %[[r1]], (%esp)
+; CHECK: movl %esp, %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i64
+; CHECK: popl %ebp
+; CHECK: retl
+
+
+; We can't copy elide when it would reduce the user requested alignment.
+
+define void @high_alignment(i32 %x) {
+entry:
+  %x.p = alloca i32, align 128
+  store i32 %x, i32* %x.p
+  call void @addrof_i32(i32* %x.p)
+  ret void
+}
+
+; CHECK-LABEL: _high_alignment:
+; CHECK: andl $-128, %esp
+; CHECK: movl 8(%ebp), %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], (%esp)
+; CHECK: movl %esp, %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+; We can't copy elide when it would reduce the ABI required alignment.
+; FIXME: We should lower the ABI alignment of i64 on Windows, since MSVC
+; doesn't guarantee it.
+
+define void @abi_alignment(i64 %x) {
+entry:
+  %x.p = alloca i64
+  store i64 %x, i64* %x.p
+  call void @addrof_i64(i64* %x.p)
+  ret void
+}
+
+; CHECK-LABEL: _abi_alignment:
+; CHECK: andl $-8, %esp
+; CHECK: movl 8(%ebp), %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], (%esp)
+; CHECK: movl %esp, %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i64
+; CHECK: retl
+
+
+; The code we generate for this is unimportant. This is mostly a crash test.
+
+define void @split_i128(i128* %sret, i128 %x) {
+entry:
+  %x.addr = alloca i128
+  store i128 %x, i128* %x.addr
+  call void @addrof_i128(i128* %x.addr)
+  store i128 %x, i128* %sret
+  ret void
+}
+
+; CHECK-LABEL: _split_i128:
+; CHECK: pushl %ebp
+; CHECK: calll _addrof_i128
+; CHECK: retl
+
+
+; Check that we load all of x, y, and z before the call.
+
+define i32 @three_args(i32 %x, i32 %y, i32 %z) {
+entry:
+  %z.addr = alloca i32, align 4
+  %y.addr = alloca i32, align 4
+  %x.addr = alloca i32, align 4
+  store i32 %z, i32* %z.addr, align 4
+  store i32 %y, i32* %y.addr, align 4
+  store i32 %x, i32* %x.addr, align 4
+  call void @addrof_i32_x3(i32* %x.addr, i32* %y.addr, i32* %z.addr)
+  %s1 = add i32 %x, %y
+  %sum = add i32 %s1, %z
+  ret i32 %sum
+}
+
+; CHECK-LABEL: _three_args:
+; CHECK: pushl %[[csr:[^ ]*]]
+; CHECK-DAG: movl {{[0-9]+}}(%esp), %[[csr]]
+; CHECK-DAG: addl {{[0-9]+}}(%esp), %[[csr]]
+; CHECK-DAG: addl {{[0-9]+}}(%esp), %[[csr]]
+; CHECK-DAG: leal 8(%esp), %[[x:[^ ]*]]
+; CHECK-DAG: leal 12(%esp), %[[y:[^ ]*]]
+; CHECK-DAG: leal 16(%esp), %[[z:[^ ]*]]
+; CHECK: pushl %[[z]]
+; CHECK: pushl %[[y]]
+; CHECK: pushl %[[x]]
+; CHECK: calll _addrof_i32_x3
+; CHECK: movl %[[csr]], %eax
+; CHECK: popl %[[csr]]
+; CHECK: retl
+
+
+define void @two_args_same_alloca(i32 %x, i32 %y) {
+entry:
+  %x.addr = alloca i32
+  store i32 %x, i32* %x.addr
+  store i32 %y, i32* %x.addr
+  call void @addrof_i32(i32* %x.addr)
+  ret void
+}
+
+; CHECK-LABEL: _two_args_same_alloca:
+; CHECK: movl 8(%esp), {{.*}}
+; CHECK: movl {{.*}}, 4(%esp)
+; CHECK: leal 4(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+define void @avoid_byval(i32* byval %x) {
+entry:
+  %x.p.p = alloca i32*
+  store i32* %x, i32** %x.p.p
+  call void @addrof_i32(i32* %x)
+  ret void
+}
+
+; CHECK-LABEL: _avoid_byval:
+; CHECK: leal {{[0-9]+}}(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+define void @avoid_inalloca(i32* inalloca %x) {
+entry:
+  %x.p.p = alloca i32*
+  store i32* %x, i32** %x.p.p
+  call void @addrof_i32(i32* %x)
+  ret void
+}
+
+; CHECK-LABEL: _avoid_inalloca:
+; CHECK: leal {{[0-9]+}}(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: retl
+
+
+; Don't elide the copy when the alloca is escaped with a store.
+
+define void @escape_with_store(i32 %x) {
+  %x1 = alloca i32
+  %x2 = alloca i32*
+  store i32* %x1, i32** %x2
+  %x3 = load i32*, i32** %x2
+  store i32 0, i32* %x3
+  store i32 %x, i32* %x1
+  call void @addrof_i32(i32* %x1)
+  ret void
+}
+
+; CHECK-LABEL: _escape_with_store:
+; CHECK-DAG: movl {{.*}}(%esp), %[[reg:[^ ]*]]
+; CHECK-DAG: movl $0, [[offs:[0-9]*]](%esp)
+; CHECK: movl %[[reg]], [[offs]](%esp)
+; CHECK: calll _addrof_i32
+
+
+; This test case exposed issues with the use of TokenFactor.
+
+define void @sret_and_elide(i32* sret %sret, i32 %v) {
+  %v.p = alloca i32
+  store i32 %v, i32* %v.p
+  call void @addrof_i32(i32* %v.p)
+  store i32 %v, i32* %sret
+  ret void
+}
+
+; CHECK-LABEL: _sret_and_elide:
+; CHECK: pushl
+; CHECK: pushl
+; CHECK: movl 12(%esp), %[[sret:[^ ]*]]
+; CHECK: movl 16(%esp), %[[v:[^ ]*]]
+; CHECK: leal 16(%esp), %[[reg:[^ ]*]]
+; CHECK: pushl %[[reg]]
+; CHECK: calll _addrof_i32
+; CHECK: movl %[[v]], (%[[sret]])
+; CHECK: movl %[[sret]], %eax
+; CHECK: popl
+; CHECK: popl
+; CHECK: retl
diff --git a/test/CodeGen/X86/atomic128.ll b/test/CodeGen/X86/atomic128.ll
index 1bf7bfbfa26011498bbd666ca847bd8c04f7c4de..77bbdec826a598304c4b288d027a7160d0fbe0ac 100644
--- a/test/CodeGen/X86/atomic128.ll
+++ b/test/CodeGen/X86/atomic128.ll
@@ -1,20 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-macosx10.9 -verify-machineinstrs -mattr=cx16 | FileCheck %s
 
 @var = global i128 0
 
-define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
-; CHECK-LABEL: val_compare_and_swap:
 ; Due to the scheduling right after isel for cmpxchg and given the
 ; machine scheduler and copy coalescer do not mess up with physical
 ; register live-ranges, we end up with a useless copy.
-;
-; CHECK: movq %rcx, [[TMP:%r[0-9a-z]+]]
-; CHECK: movq %rsi, %rax
-; CHECK: movq %r8, %rcx
-; CHECK: movq [[TMP]], %rbx
-; CHECK: lock
-; CHECK: cmpxchg16b (%rdi)
-
+define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
+; CHECK-LABEL: val_compare_and_swap:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:  Lcfi0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:  Lcfi1:
+; CHECK-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NEXT:    movq %rcx, %r9
+; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    movq %r8, %rcx
+; CHECK-NEXT:    movq %r9, %rbx
+; CHECK-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
   %pair = cmpxchg i128* %p, i128 %oldval, i128 %newval acquire acquire
   %val = extractvalue { i128, i1 } %pair, 0
   ret i128 %val
@@ -22,24 +28,31 @@ define i128 @val_compare_and_swap(i128* %p, i128 %oldval, i128 %newval) {
 
 define void @fetch_and_nand(i128* %p, i128 %bits) {
 ; CHECK-LABEL: fetch_and_nand:
-; CHECK-DAG:     movq %rdx, [[INCHI:%[a-z0-9]+]]
-; CHECK-DAG:     movq (%rdi), %rax
-; CHECK-DAG:     movq 8(%rdi), %rdx
-
-; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
-; CHECK:         movq %rdx, %rcx
-; CHECK:         andq [[INCHI]], %rcx
-; CHECK:         movq %rax, %rbx
-  ; INCLO equivalent comes in in %rsi, so it makes sense it stays there.
-; CHECK:         andq %rsi, %rbx
-; CHECK:         notq %rbx
-; CHECK:         notq %rcx
-; CHECK:         lock
-; CHECK:         cmpxchg16b (%rdi)
-; CHECK:         jne [[LOOP]]
-
-; CHECK:         movq %rax, _var
-; CHECK:         movq %rdx, _var+8
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:  Lcfi2:
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:  Lcfi3:
+; CHECK-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NEXT:    movq %rdx, %r8
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    movq 8(%rdi), %rdx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  LBB1_1: ## %atomicrmw.start
+; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movq %rdx, %rcx
+; CHECK-NEXT:    andq %r8, %rcx
+; CHECK-NEXT:    movq %rax, %rbx
+; CHECK-NEXT:    andq %rsi, %rbx
+; CHECK-NEXT:    notq %rbx
+; CHECK-NEXT:    notq %rcx
+; CHECK-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NEXT:    jne LBB1_1
+; CHECK-NEXT:  ## BB#2: ## %atomicrmw.end
+; CHECK-NEXT:    movq %rax, {{.*}}(%rip)
+; CHECK-NEXT:    movq %rdx, _var+{{.*}}(%rip)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
   %val = atomicrmw nand i128* %p, i128 %bits release
   store i128 %val, i128* @var, align 16
   ret void
@@ -47,23 +60,29 @@ define void @fetch_and_nand(i128* %p, i128 %bits) {
 
 define void @fetch_and_or(i128* %p, i128 %bits) {
 ; CHECK-LABEL: fetch_and_or:
-; CHECK-DAG:     movq %rdx, [[INCHI:%[a-z0-9]+]]
-; CHECK-DAG:     movq (%rdi), %rax
-; CHECK-DAG:     movq 8(%rdi), %rdx
-
-; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
-; CHECK:         movq %rax, %rbx
-  ; INCLO equivalent comes in in %rsi, so it makes sense it stays there.
-; CHECK:         orq %rsi, %rbx
-; CHECK:         movq %rdx, %rcx
-; CHECK:         orq [[INCHI]], %rcx
-; CHECK:         lock
-; CHECK:         cmpxchg16b (%rdi)
-; CHECK:         jne [[LOOP]]
-
-; CHECK:         movq %rax, _var
-; CHECK:         movq %rdx, _var+8
-
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:  Lcfi4:
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:  Lcfi5:
+; CHECK-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NEXT:    movq %rdx, %r8
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    movq 8(%rdi), %rdx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  LBB2_1: ## %atomicrmw.start
+; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movq %rax, %rbx
+; CHECK-NEXT:    orq %rsi, %rbx
+; CHECK-NEXT:    movq %rdx, %rcx
+; CHECK-NEXT:    orq %r8, %rcx
+; CHECK-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NEXT:    jne LBB2_1
+; CHECK-NEXT:  ## BB#2: ## %atomicrmw.end
+; CHECK-NEXT:    movq %rax, {{.*}}(%rip)
+; CHECK-NEXT:    movq %rdx, _var+{{.*}}(%rip)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
   %val = atomicrmw or i128* %p, i128 %bits seq_cst
   store i128 %val, i128* @var, align 16
   ret void
@@ -71,23 +90,29 @@ define void @fetch_and_or(i128* %p, i128 %bits) {
 
 define void @fetch_and_add(i128* %p, i128 %bits) {
 ; CHECK-LABEL: fetch_and_add:
-; CHECK-DAG:     movq %rdx, [[INCHI:%[a-z0-9]+]]
-; CHECK-DAG:     movq (%rdi), %rax
-; CHECK-DAG:     movq 8(%rdi), %rdx
-
-; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
-; CHECK:         movq %rax, %rbx
-  ; INCLO equivalent comes in in %rsi, so it makes sense it stays there.
-; CHECK:         addq %rsi, %rbx
-; CHECK:         movq %rdx, %rcx
-; CHECK:         adcq [[INCHI]], %rcx
-; CHECK:         lock
-; CHECK:         cmpxchg16b (%rdi)
-; CHECK:         jne [[LOOP]]
-
-; CHECK:         movq %rax, _var
-; CHECK:         movq %rdx, _var+8
-
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:  Lcfi6:
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:  Lcfi7:
+; CHECK-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NEXT:    movq %rdx, %r8
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    movq 8(%rdi), %rdx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  LBB3_1: ## %atomicrmw.start
+; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movq %rax, %rbx
+; CHECK-NEXT:    addq %rsi, %rbx
+; CHECK-NEXT:    movq %rdx, %rcx
+; CHECK-NEXT:    adcq %r8, %rcx
+; CHECK-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NEXT:    jne LBB3_1
+; CHECK-NEXT:  ## BB#2: ## %atomicrmw.end
+; CHECK-NEXT:    movq %rax, {{.*}}(%rip)
+; CHECK-NEXT:    movq %rdx, _var+{{.*}}(%rip)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
   %val = atomicrmw add i128* %p, i128 %bits seq_cst
   store i128 %val, i128* @var, align 16
   ret void
@@ -95,23 +120,29 @@ define void @fetch_and_add(i128* %p, i128 %bits) {
 
 define void @fetch_and_sub(i128* %p, i128 %bits) {
 ; CHECK-LABEL: fetch_and_sub:
-; CHECK-DAG:     movq %rdx, [[INCHI:%[a-z0-9]+]]
-; CHECK-DAG:     movq (%rdi), %rax
-; CHECK-DAG:     movq 8(%rdi), %rdx
-
-; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
-; CHECK:         movq %rax, %rbx
-  ; INCLO equivalent comes in in %rsi, so it makes sense it stays there.
-; CHECK:         subq %rsi, %rbx
-; CHECK:         movq %rdx, %rcx
-; CHECK:         sbbq [[INCHI]], %rcx
-; CHECK:         lock
-; CHECK:         cmpxchg16b (%rdi)
-; CHECK:         jne [[LOOP]]
-
-; CHECK:         movq %rax, _var
-; CHECK:         movq %rdx, _var+8
-
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:  Lcfi8:
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:  Lcfi9:
+; CHECK-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NEXT:    movq %rdx, %r8
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    movq 8(%rdi), %rdx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  LBB4_1: ## %atomicrmw.start
+; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movq %rax, %rbx
+; CHECK-NEXT:    subq %rsi, %rbx
+; CHECK-NEXT:    movq %rdx, %rcx
+; CHECK-NEXT:    sbbq %r8, %rcx
+; CHECK-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NEXT:    jne LBB4_1
+; CHECK-NEXT:  ## BB#2: ## %atomicrmw.end
+; CHECK-NEXT:    movq %rax, {{.*}}(%rip)
+; CHECK-NEXT:    movq %rdx, _var+{{.*}}(%rip)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
   %val = atomicrmw sub i128* %p, i128 %bits seq_cst
   store i128 %val, i128* @var, align 16
   ret void
@@ -119,24 +150,35 @@ define void @fetch_and_sub(i128* %p, i128 %bits) {
 
 define void @fetch_and_min(i128* %p, i128 %bits) {
 ; CHECK-LABEL: fetch_and_min:
-; CHECK-DAG:     movq %rdx, [[INCHI:%[a-z0-9]+]]
-; CHECK-DAG:     movq (%rdi), %rax
-; CHECK-DAG:     movq 8(%rdi), %rdx
-
-; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
-; CHECK:         cmpq
-; CHECK:         sbbq
-; CHECK:         setg
-; CHECK:         cmovneq %rax, %rbx
-; CHECK:         movq [[INCHI]], %rcx
-; CHECK:         cmovneq %rdx, %rcx
-; CHECK:         lock
-; CHECK:         cmpxchg16b (%rdi)
-; CHECK:         jne [[LOOP]]
-
-; CHECK:         movq %rax, _var
-; CHECK:         movq %rdx, _var+8
-
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:  Lcfi10:
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:  Lcfi11:
+; CHECK-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NEXT:    movq %rdx, %r8
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    movq 8(%rdi), %rdx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  LBB5_1: ## %atomicrmw.start
+; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmpq %rax, %rsi
+; CHECK-NEXT:    movq %r8, %rcx
+; CHECK-NEXT:    sbbq %rdx, %rcx
+; CHECK-NEXT:    setge %cl
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    movq %rsi, %rbx
+; CHECK-NEXT:    cmovneq %rax, %rbx
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    movq %r8, %rcx
+; CHECK-NEXT:    cmovneq %rdx, %rcx
+; CHECK-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NEXT:    jne LBB5_1
+; CHECK-NEXT:  ## BB#2: ## %atomicrmw.end
+; CHECK-NEXT:    movq %rax, {{.*}}(%rip)
+; CHECK-NEXT:    movq %rdx, _var+{{.*}}(%rip)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
   %val = atomicrmw min i128* %p, i128 %bits seq_cst
   store i128 %val, i128* @var, align 16
   ret void
@@ -144,24 +186,35 @@ define void @fetch_and_min(i128* %p, i128 %bits) {
 
 define void @fetch_and_max(i128* %p, i128 %bits) {
 ; CHECK-LABEL: fetch_and_max:
-; CHECK-DAG:     movq %rdx, [[INCHI:%[a-z0-9]+]]
-; CHECK-DAG:     movq (%rdi), %rax
-; CHECK-DAG:     movq 8(%rdi), %rdx
-
-; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
-; CHECK:         cmpq
-; CHECK:         sbbq
-; CHECK:         setge
-; CHECK:         cmovneq %rax, %rbx
-; CHECK:         movq [[INCHI]], %rcx
-; CHECK:         cmovneq %rdx, %rcx
-; CHECK:         lock
-; CHECK:         cmpxchg16b (%rdi)
-; CHECK:         jne [[LOOP]]
-
-; CHECK:         movq %rax, _var
-; CHECK:         movq %rdx, _var+8
-
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:  Lcfi12:
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:  Lcfi13:
+; CHECK-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NEXT:    movq %rdx, %r8
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    movq 8(%rdi), %rdx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  LBB6_1: ## %atomicrmw.start
+; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmpq %rsi, %rax
+; CHECK-NEXT:    movq %rdx, %rcx
+; CHECK-NEXT:    sbbq %r8, %rcx
+; CHECK-NEXT:    setge %cl
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    movq %rsi, %rbx
+; CHECK-NEXT:    cmovneq %rax, %rbx
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    movq %r8, %rcx
+; CHECK-NEXT:    cmovneq %rdx, %rcx
+; CHECK-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NEXT:    jne LBB6_1
+; CHECK-NEXT:  ## BB#2: ## %atomicrmw.end
+; CHECK-NEXT:    movq %rax, {{.*}}(%rip)
+; CHECK-NEXT:    movq %rdx, _var+{{.*}}(%rip)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
   %val = atomicrmw max i128* %p, i128 %bits seq_cst
   store i128 %val, i128* @var, align 16
   ret void
@@ -169,24 +222,35 @@ define void @fetch_and_max(i128* %p, i128 %bits) {
 
 define void @fetch_and_umin(i128* %p, i128 %bits) {
 ; CHECK-LABEL: fetch_and_umin:
-; CHECK-DAG:     movq %rdx, [[INCHI:%[a-z0-9]+]]
-; CHECK-DAG:     movq (%rdi), %rax
-; CHECK-DAG:     movq 8(%rdi), %rdx
-
-; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
-; CHECK:         cmpq
-; CHECK:         sbbq
-; CHECK:         seta
-; CHECK:         cmovneq %rax, %rbx
-; CHECK:         movq [[INCHI]], %rcx
-; CHECK:         cmovneq %rdx, %rcx
-; CHECK:         lock
-; CHECK:         cmpxchg16b (%rdi)
-; CHECK:         jne [[LOOP]]
-
-; CHECK:         movq %rax, _var
-; CHECK:         movq %rdx, _var+8
-
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:  Lcfi14:
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:  Lcfi15:
+; CHECK-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NEXT:    movq %rdx, %r8
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    movq 8(%rdi), %rdx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  LBB7_1: ## %atomicrmw.start
+; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmpq %rax, %rsi
+; CHECK-NEXT:    movq %r8, %rcx
+; CHECK-NEXT:    sbbq %rdx, %rcx
+; CHECK-NEXT:    setae %cl
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    movq %rsi, %rbx
+; CHECK-NEXT:    cmovneq %rax, %rbx
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    movq %r8, %rcx
+; CHECK-NEXT:    cmovneq %rdx, %rcx
+; CHECK-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NEXT:    jne LBB7_1
+; CHECK-NEXT:  ## BB#2: ## %atomicrmw.end
+; CHECK-NEXT:    movq %rax, {{.*}}(%rip)
+; CHECK-NEXT:    movq %rdx, _var+{{.*}}(%rip)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
   %val = atomicrmw umin i128* %p, i128 %bits seq_cst
   store i128 %val, i128* @var, align 16
   ret void
@@ -194,24 +258,35 @@ define void @fetch_and_umin(i128* %p, i128 %bits) {
 
 define void @fetch_and_umax(i128* %p, i128 %bits) {
 ; CHECK-LABEL: fetch_and_umax:
-; CHECK-DAG:     movq %rdx, [[INCHI:%[a-z0-9]+]]
-; CHECK-DAG:     movq (%rdi), %rax
-; CHECK-DAG:     movq 8(%rdi), %rdx
-
-; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
-; CHECK:         cmpq
-; CHECK:         sbbq
-; CHECK:         setb
-; CHECK:         cmovneq %rax, %rbx
-; CHECK:         movq [[INCHI]], %rcx
-; CHECK:         cmovneq %rdx, %rcx
-; CHECK:         lock
-; CHECK:         cmpxchg16b (%rdi)
-; CHECK:         jne [[LOOP]]
-
-; CHECK:         movq %rax, _var
-; CHECK:         movq %rdx, _var+8
-
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:  Lcfi16:
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:  Lcfi17:
+; CHECK-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NEXT:    movq %rdx, %r8
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    movq 8(%rdi), %rdx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  LBB8_1: ## %atomicrmw.start
+; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    cmpq %rax, %rsi
+; CHECK-NEXT:    movq %r8, %rcx
+; CHECK-NEXT:    sbbq %rdx, %rcx
+; CHECK-NEXT:    setb %cl
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    movq %rsi, %rbx
+; CHECK-NEXT:    cmovneq %rax, %rbx
+; CHECK-NEXT:    testb %cl, %cl
+; CHECK-NEXT:    movq %r8, %rcx
+; CHECK-NEXT:    cmovneq %rdx, %rcx
+; CHECK-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NEXT:    jne LBB8_1
+; CHECK-NEXT:  ## BB#2: ## %atomicrmw.end
+; CHECK-NEXT:    movq %rax, {{.*}}(%rip)
+; CHECK-NEXT:    movq %rdx, _var+{{.*}}(%rip)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
   %val = atomicrmw umax i128* %p, i128 %bits seq_cst
   store i128 %val, i128* @var, align 16
   ret void
@@ -219,75 +294,110 @@ define void @fetch_and_umax(i128* %p, i128 %bits) {
 
 define i128 @atomic_load_seq_cst(i128* %p) {
 ; CHECK-LABEL: atomic_load_seq_cst:
-; CHECK: xorl %eax, %eax
-; CHECK: xorl %edx, %edx
-; CHECK: xorl %ecx, %ecx
-; CHECK: xorl %ebx, %ebx
-; CHECK: lock
-; CHECK: cmpxchg16b (%rdi)
-
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:  Lcfi18:
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:  Lcfi19:
+; CHECK-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    xorl %ebx, %ebx
+; CHECK-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
    %r = load atomic i128, i128* %p seq_cst, align 16
    ret i128 %r
 }
 
 define i128 @atomic_load_relaxed(i128* %p) {
-; CHECK: atomic_load_relaxed:
-; CHECK: xorl %eax, %eax
-; CHECK: xorl %edx, %edx
-; CHECK: xorl %ecx, %ecx
-; CHECK: xorl %ebx, %ebx
-; CHECK: lock
-; CHECK: cmpxchg16b (%rdi)
-
+; CHECK-LABEL: atomic_load_relaxed:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:  Lcfi20:
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:  Lcfi21:
+; CHECK-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    xorl %ebx, %ebx
+; CHECK-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
    %r = load atomic i128, i128* %p monotonic, align 16
    ret i128 %r
 }
 
 define void @atomic_store_seq_cst(i128* %p, i128 %in) {
 ; CHECK-LABEL: atomic_store_seq_cst:
-; CHECK:         movq %rdx, %rcx
-; CHECK:         movq %rsi, %rbx
-; CHECK:         movq (%rdi), %rax
-; CHECK:         movq 8(%rdi), %rdx
-
-; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
-; CHECK:         lock
-; CHECK:         cmpxchg16b (%rdi)
-; CHECK:         jne [[LOOP]]
-; CHECK-NOT:     callq ___sync_lock_test_and_set_16
-
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:  Lcfi22:
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:  Lcfi23:
+; CHECK-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NEXT:    movq %rdx, %rcx
+; CHECK-NEXT:    movq %rsi, %rbx
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    movq 8(%rdi), %rdx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  LBB11_1: ## %atomicrmw.start
+; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NEXT:    jne LBB11_1
+; CHECK-NEXT:  ## BB#2: ## %atomicrmw.end
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
    store atomic i128 %in, i128* %p seq_cst, align 16
    ret void
 }
 
 define void @atomic_store_release(i128* %p, i128 %in) {
 ; CHECK-LABEL: atomic_store_release:
-; CHECK:         movq %rdx, %rcx
-; CHECK:         movq %rsi, %rbx
-; CHECK:         movq (%rdi), %rax
-; CHECK:         movq 8(%rdi), %rdx
-
-; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
-; CHECK:         lock
-; CHECK:         cmpxchg16b (%rdi)
-; CHECK:         jne [[LOOP]]
-
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:  Lcfi24:
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:  Lcfi25:
+; CHECK-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NEXT:    movq %rdx, %rcx
+; CHECK-NEXT:    movq %rsi, %rbx
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    movq 8(%rdi), %rdx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  LBB12_1: ## %atomicrmw.start
+; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NEXT:    jne LBB12_1
+; CHECK-NEXT:  ## BB#2: ## %atomicrmw.end
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
    store atomic i128 %in, i128* %p release, align 16
    ret void
 }
 
 define void @atomic_store_relaxed(i128* %p, i128 %in) {
 ; CHECK-LABEL: atomic_store_relaxed:
-; CHECK:         movq %rdx, %rcx
-; CHECK:         movq %rsi, %rbx
-; CHECK:         movq (%rdi), %rax
-; CHECK:         movq 8(%rdi), %rdx
-
-; CHECK: [[LOOP:.?LBB[0-9]+_[0-9]+]]:
-; CHECK:         lock
-; CHECK:         cmpxchg16b (%rdi)
-; CHECK:         jne [[LOOP]]
-
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:  Lcfi26:
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:  Lcfi27:
+; CHECK-NEXT:    .cfi_offset %rbx, -16
+; CHECK-NEXT:    movq %rdx, %rcx
+; CHECK-NEXT:    movq %rsi, %rbx
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    movq 8(%rdi), %rdx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  LBB13_1: ## %atomicrmw.start
+; CHECK-NEXT:    ## =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    lock cmpxchg16b (%rdi)
+; CHECK-NEXT:    jne LBB13_1
+; CHECK-NEXT:  ## BB#2: ## %atomicrmw.end
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    retq
    store atomic i128 %in, i128* %p unordered, align 16
    ret void
 }
diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll
index 99d3f206f0c1403d8598b00ea0cfcb644c454216..2aaf14001758f86abb02de1a9bd720dc7353f035 100644
--- a/test/CodeGen/X86/avg.ll
+++ b/test/CodeGen/X86/avg.ll
@@ -140,82 +140,82 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
 ; SSE2-NEXT:    movdqa (%rsi), %xmm0
 ; SSE2-NEXT:    movdqa 16(%rsi), %xmm1
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm8[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm8, %xmm10
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
+; SSE2-NEXT:    movdqa %xmm10, %xmm2
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
 ; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm10, %xmm12
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm15 = xmm11[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm11, %xmm14
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm8, %xmm12
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
+; SSE2-NEXT:    movdqa %xmm11, %xmm15
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15]
+; SSE2-NEXT:    movdqa %xmm15, %xmm14
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3],xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm15, %xmm9
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm0, %xmm7
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm11, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSE2-NEXT:    movdqa %xmm3, %xmm7
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm3, %xmm6
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm13
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT:    paddd %xmm15, %xmm2
-; SSE2-NEXT:    paddd %xmm9, %xmm13
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm13
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
 ; SSE2-NEXT:    paddd %xmm11, %xmm1
+; SSE2-NEXT:    paddd %xmm9, %xmm13
+; SSE2-NEXT:    paddd %xmm15, %xmm2
 ; SSE2-NEXT:    paddd %xmm14, %xmm5
-; SSE2-NEXT:    paddd %xmm10, %xmm3
-; SSE2-NEXT:    paddd %xmm12, %xmm6
 ; SSE2-NEXT:    paddd %xmm8, %xmm0
+; SSE2-NEXT:    paddd %xmm12, %xmm6
+; SSE2-NEXT:    paddd %xmm10, %xmm3
 ; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1,1,1,1]
 ; SSE2-NEXT:    paddd %xmm4, %xmm7
-; SSE2-NEXT:    paddd %xmm4, %xmm0
-; SSE2-NEXT:    paddd %xmm4, %xmm6
 ; SSE2-NEXT:    paddd %xmm4, %xmm3
+; SSE2-NEXT:    paddd %xmm4, %xmm6
+; SSE2-NEXT:    paddd %xmm4, %xmm0
 ; SSE2-NEXT:    paddd %xmm4, %xmm5
-; SSE2-NEXT:    paddd %xmm4, %xmm1
-; SSE2-NEXT:    paddd %xmm4, %xmm13
 ; SSE2-NEXT:    paddd %xmm4, %xmm2
-; SSE2-NEXT:    psrld $1, %xmm0
+; SSE2-NEXT:    paddd %xmm4, %xmm13
+; SSE2-NEXT:    paddd %xmm4, %xmm1
+; SSE2-NEXT:    psrld $1, %xmm3
 ; SSE2-NEXT:    psrld $1, %xmm7
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
 ; SSE2-NEXT:    pand %xmm4, %xmm7
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    packuswb %xmm7, %xmm0
-; SSE2-NEXT:    psrld $1, %xmm3
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    packuswb %xmm7, %xmm3
+; SSE2-NEXT:    psrld $1, %xmm0
 ; SSE2-NEXT:    psrld $1, %xmm6
 ; SSE2-NEXT:    pand %xmm4, %xmm6
-; SSE2-NEXT:    pand %xmm4, %xmm3
-; SSE2-NEXT:    packuswb %xmm6, %xmm3
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    packuswb %xmm6, %xmm0
 ; SSE2-NEXT:    packuswb %xmm3, %xmm0
-; SSE2-NEXT:    psrld $1, %xmm1
+; SSE2-NEXT:    psrld $1, %xmm2
 ; SSE2-NEXT:    psrld $1, %xmm5
 ; SSE2-NEXT:    pand %xmm4, %xmm5
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    packuswb %xmm5, %xmm1
-; SSE2-NEXT:    psrld $1, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    packuswb %xmm5, %xmm2
+; SSE2-NEXT:    psrld $1, %xmm1
 ; SSE2-NEXT:    psrld $1, %xmm13
 ; SSE2-NEXT:    pand %xmm4, %xmm13
-; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    packuswb %xmm13, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    packuswb %xmm13, %xmm1
 ; SSE2-NEXT:    packuswb %xmm2, %xmm1
 ; SSE2-NEXT:    movdqu %xmm1, (%rax)
 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
@@ -234,6 +234,7 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
 ; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
 ; AVX512F-NEXT:    vpavgb (%rdi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: avg_v32i8:
@@ -241,6 +242,7 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) {
 ; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm0
 ; AVX512BW-NEXT:    vpavgb (%rdi), %ymm0, %ymm0
 ; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = load <32 x i8>, <32 x i8>* %a
   %2 = load <32 x i8>, <32 x i8>* %b
@@ -261,194 +263,193 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
 ; SSE2-NEXT:  .Lcfi0:
 ; SSE2-NEXT:    .cfi_def_cfa_offset 160
 ; SSE2-NEXT:    movdqa (%rdi), %xmm1
-; SSE2-NEXT:    movdqa 16(%rdi), %xmm2
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm4
 ; SSE2-NEXT:    movdqa 32(%rdi), %xmm5
 ; SSE2-NEXT:    movdqa 48(%rdi), %xmm6
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm4, %xmm3
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
 ; SSE2-NEXT:    movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
-; SSE2-NEXT:    movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
 ; SSE2-NEXT:    movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
 ; SSE2-NEXT:    movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
 ; SSE2-NEXT:    movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm5[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm5, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm2, (%rsp) # 16-byte Spill
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; SSE2-NEXT:    movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT:    movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm5, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15]
 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm2, (%rsp) # 16-byte Spill
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
 ; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm6[2,3,0,1]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm5, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; SSE2-NEXT:    movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm6, %xmm8
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm8, %xmm1
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
 ; SSE2-NEXT:    movdqa %xmm6, %xmm1
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
 ; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
 ; SSE2-NEXT:    movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3],xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm9, %xmm1
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3]
-; SSE2-NEXT:    movdqa (%rsi), %xmm15
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm15[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm15, %xmm10
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm7, %xmm14
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
+; SSE2-NEXT:    movdqa (%rsi), %xmm14
+; SSE2-NEXT:    movdqa %xmm14, %xmm7
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm7, %xmm15
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; SSE2-NEXT:    movdqa 16(%rsi), %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm1, %xmm13
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm14, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
+; SSE2-NEXT:    movdqa 16(%rsi), %xmm12
+; SSE2-NEXT:    movdqa %xmm12, %xmm6
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm6, %xmm13
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm6, %xmm12
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm12, %xmm10
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
 ; SSE2-NEXT:    movdqa 32(%rsi), %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm11
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm5, %xmm11
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm5, %xmm3
+; SSE2-NEXT:    movdqa 48(%rsi), %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; SSE2-NEXT:    movdqa 48(%rsi), %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3],xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm8, %xmm4
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2-NEXT:    paddd %xmm9, %xmm8
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload
-; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
 ; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    paddd %xmm8, %xmm4
+; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
+; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload
 ; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload
+; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload
 ; SSE2-NEXT:    paddd (%rsp), %xmm11 # 16-byte Folded Reload
-; SSE2-NEXT:    paddd {{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
 ; SSE2-NEXT:    paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload
-; SSE2-NEXT:    paddd {{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; SSE2-NEXT:    paddd {{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload
+; SSE2-NEXT:    paddd {{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
 ; SSE2-NEXT:    paddd {{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload
-; SSE2-NEXT:    paddd {{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
 ; SSE2-NEXT:    paddd {{[0-9]+}}(%rsp), %xmm14 # 16-byte Folded Reload
+; SSE2-NEXT:    paddd {{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload
+; SSE2-NEXT:    paddd {{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
 ; SSE2-NEXT:    paddd {{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload
-; SSE2-NEXT:    paddd {{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [1,1,1,1]
-; SSE2-NEXT:    paddd %xmm0, %xmm10
 ; SSE2-NEXT:    paddd %xmm0, %xmm15
-; SSE2-NEXT:    paddd %xmm0, %xmm14
 ; SSE2-NEXT:    paddd %xmm0, %xmm7
+; SSE2-NEXT:    paddd %xmm0, %xmm9
+; SSE2-NEXT:    paddd %xmm0, %xmm14
 ; SSE2-NEXT:    paddd %xmm0, %xmm13
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    paddd %xmm0, %xmm12
 ; SSE2-NEXT:    paddd %xmm0, %xmm6
+; SSE2-NEXT:    paddd %xmm0, %xmm10
+; SSE2-NEXT:    paddd %xmm0, %xmm12
 ; SSE2-NEXT:    paddd %xmm0, %xmm11
-; SSE2-NEXT:    paddd %xmm0, %xmm2
-; SSE2-NEXT:    paddd %xmm0, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    paddd %xmm0, %xmm5
-; SSE2-NEXT:    paddd %xmm0, %xmm9
 ; SSE2-NEXT:    paddd %xmm0, %xmm3
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; SSE2-NEXT:    paddd %xmm0, %xmm4
+; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    paddd %xmm0, %xmm2
 ; SSE2-NEXT:    paddd %xmm0, %xmm8
+; SSE2-NEXT:    paddd %xmm0, %xmm4
+; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT:    paddd %xmm0, %xmm3
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    psrld $1, %xmm7
 ; SSE2-NEXT:    psrld $1, %xmm15
-; SSE2-NEXT:    psrld $1, %xmm10
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT:    pand %xmm0, %xmm10
 ; SSE2-NEXT:    pand %xmm0, %xmm15
-; SSE2-NEXT:    packuswb %xmm10, %xmm15
-; SSE2-NEXT:    psrld $1, %xmm7
+; SSE2-NEXT:    pand %xmm0, %xmm7
+; SSE2-NEXT:    packuswb %xmm15, %xmm7
 ; SSE2-NEXT:    psrld $1, %xmm14
+; SSE2-NEXT:    psrld $1, %xmm9
+; SSE2-NEXT:    pand %xmm0, %xmm9
 ; SSE2-NEXT:    pand %xmm0, %xmm14
-; SSE2-NEXT:    pand %xmm0, %xmm7
-; SSE2-NEXT:    packuswb %xmm14, %xmm7
-; SSE2-NEXT:    packuswb %xmm7, %xmm15
-; SSE2-NEXT:    psrld $1, %xmm1
+; SSE2-NEXT:    packuswb %xmm9, %xmm14
+; SSE2-NEXT:    packuswb %xmm7, %xmm14
+; SSE2-NEXT:    psrld $1, %xmm6
 ; SSE2-NEXT:    psrld $1, %xmm13
 ; SSE2-NEXT:    pand %xmm0, %xmm13
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm13, %xmm1
-; SSE2-NEXT:    psrld $1, %xmm6
+; SSE2-NEXT:    pand %xmm0, %xmm6
+; SSE2-NEXT:    packuswb %xmm13, %xmm6
 ; SSE2-NEXT:    psrld $1, %xmm12
+; SSE2-NEXT:    psrld $1, %xmm10
+; SSE2-NEXT:    pand %xmm0, %xmm10
 ; SSE2-NEXT:    pand %xmm0, %xmm12
-; SSE2-NEXT:    pand %xmm0, %xmm6
-; SSE2-NEXT:    packuswb %xmm12, %xmm6
-; SSE2-NEXT:    packuswb %xmm6, %xmm1
-; SSE2-NEXT:    psrld $1, %xmm2
+; SSE2-NEXT:    packuswb %xmm10, %xmm12
+; SSE2-NEXT:    packuswb %xmm6, %xmm12
+; SSE2-NEXT:    psrld $1, %xmm5
 ; SSE2-NEXT:    psrld $1, %xmm11
 ; SSE2-NEXT:    pand %xmm0, %xmm11
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    packuswb %xmm11, %xmm2
-; SSE2-NEXT:    psrld $1, %xmm5
+; SSE2-NEXT:    pand %xmm0, %xmm5
+; SSE2-NEXT:    packuswb %xmm11, %xmm5
+; SSE2-NEXT:    psrld $1, %xmm2
 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
 ; SSE2-NEXT:    psrld $1, %xmm6
 ; SSE2-NEXT:    pand %xmm0, %xmm6
-; SSE2-NEXT:    pand %xmm0, %xmm5
-; SSE2-NEXT:    packuswb %xmm6, %xmm5
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    packuswb %xmm6, %xmm2
 ; SSE2-NEXT:    packuswb %xmm5, %xmm2
-; SSE2-NEXT:    psrld $1, %xmm3
-; SSE2-NEXT:    movdqa %xmm9, %xmm5
+; SSE2-NEXT:    psrld $1, %xmm4
+; SSE2-NEXT:    movdqa %xmm8, %xmm5
 ; SSE2-NEXT:    psrld $1, %xmm5
 ; SSE2-NEXT:    pand %xmm0, %xmm5
-; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    packuswb %xmm5, %xmm3
-; SSE2-NEXT:    psrld $1, %xmm8
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    pand %xmm0, %xmm4
+; SSE2-NEXT:    packuswb %xmm5, %xmm4
+; SSE2-NEXT:    psrld $1, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
 ; SSE2-NEXT:    psrld $1, %xmm5
 ; SSE2-NEXT:    pand %xmm0, %xmm5
-; SSE2-NEXT:    pand %xmm0, %xmm8
-; SSE2-NEXT:    packuswb %xmm5, %xmm8
-; SSE2-NEXT:    packuswb %xmm8, %xmm3
-; SSE2-NEXT:    movdqu %xmm3, (%rax)
-; SSE2-NEXT:    movdqu %xmm2, (%rax)
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    packuswb %xmm5, %xmm1
+; SSE2-NEXT:    packuswb %xmm4, %xmm1
 ; SSE2-NEXT:    movdqu %xmm1, (%rax)
-; SSE2-NEXT:    movdqu %xmm15, (%rax)
+; SSE2-NEXT:    movdqu %xmm2, (%rax)
+; SSE2-NEXT:    movdqu %xmm12, (%rax)
+; SSE2-NEXT:    movdqu %xmm14, (%rax)
 ; SSE2-NEXT:    addq $152, %rsp
 ; SSE2-NEXT:    retq
 ;
@@ -495,7 +496,7 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
 ; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm7
 ; AVX2-NEXT:    vpsrld $1, %ymm10, %ymm8
 ; AVX2-NEXT:    vpsrld $1, %ymm9, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm3[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -563,6 +564,7 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)
 ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: avg_v64i8:
@@ -570,6 +572,7 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) {
 ; AVX512BW-NEXT:    vmovdqu8 (%rsi), %zmm0
 ; AVX512BW-NEXT:    vpavgb (%rdi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = load <64 x i8>, <64 x i8>* %a
   %2 = load <64 x i8>, <64 x i8>* %b
@@ -727,6 +730,7 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
 ; AVX512F-NEXT:    vmovdqa (%rsi), %ymm0
 ; AVX512F-NEXT:    vpavgw (%rdi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: avg_v16i16:
@@ -734,6 +738,7 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) {
 ; AVX512BW-NEXT:    vmovdqa (%rsi), %ymm0
 ; AVX512BW-NEXT:    vpavgw (%rdi), %ymm0, %ymm0
 ; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = load <16 x i16>, <16 x i16>* %a
   %2 = load <16 x i16>, <16 x i16>* %b
@@ -858,7 +863,7 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
 ; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm2
 ; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
@@ -889,6 +894,7 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
 ; AVX512F-NEXT:    vpsrld $1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovdw %zmm0, (%rax)
 ; AVX512F-NEXT:    vpmovdw %zmm1, (%rax)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: avg_v32i16:
@@ -896,6 +902,7 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) {
 ; AVX512BW-NEXT:    vmovdqu16 (%rsi), %zmm0
 ; AVX512BW-NEXT:    vpavgw (%rdi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = load <32 x i16>, <32 x i16>* %a
   %2 = load <32 x i16>, <32 x i16>* %b
@@ -1045,82 +1052,82 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
 ; SSE2-NEXT:    movdqa (%rsi), %xmm0
 ; SSE2-NEXT:    movdqa 16(%rsi), %xmm1
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm10 = xmm8[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm8, %xmm2
+; SSE2-NEXT:    movdqa %xmm8, %xmm10
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15]
+; SSE2-NEXT:    movdqa %xmm10, %xmm2
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
 ; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3],xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm10, %xmm12
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm15 = xmm11[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm11, %xmm14
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm8, %xmm12
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3]
+; SSE2-NEXT:    movdqa %xmm11, %xmm15
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15]
+; SSE2-NEXT:    movdqa %xmm15, %xmm14
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3],xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm15, %xmm9
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm0, %xmm7
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm11, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3]
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSE2-NEXT:    movdqa %xmm3, %xmm7
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm3, %xmm6
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm1, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15]
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm13
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; SSE2-NEXT:    paddd %xmm15, %xmm2
-; SSE2-NEXT:    paddd %xmm9, %xmm13
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm13
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
 ; SSE2-NEXT:    paddd %xmm11, %xmm1
+; SSE2-NEXT:    paddd %xmm9, %xmm13
+; SSE2-NEXT:    paddd %xmm15, %xmm2
 ; SSE2-NEXT:    paddd %xmm14, %xmm5
-; SSE2-NEXT:    paddd %xmm10, %xmm3
-; SSE2-NEXT:    paddd %xmm12, %xmm6
 ; SSE2-NEXT:    paddd %xmm8, %xmm0
+; SSE2-NEXT:    paddd %xmm12, %xmm6
+; SSE2-NEXT:    paddd %xmm10, %xmm3
 ; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [1,1,1,1]
 ; SSE2-NEXT:    paddd %xmm4, %xmm7
-; SSE2-NEXT:    paddd %xmm4, %xmm0
-; SSE2-NEXT:    paddd %xmm4, %xmm6
 ; SSE2-NEXT:    paddd %xmm4, %xmm3
+; SSE2-NEXT:    paddd %xmm4, %xmm6
+; SSE2-NEXT:    paddd %xmm4, %xmm0
 ; SSE2-NEXT:    paddd %xmm4, %xmm5
-; SSE2-NEXT:    paddd %xmm4, %xmm1
-; SSE2-NEXT:    paddd %xmm4, %xmm13
 ; SSE2-NEXT:    paddd %xmm4, %xmm2
-; SSE2-NEXT:    psrld $1, %xmm0
+; SSE2-NEXT:    paddd %xmm4, %xmm13
+; SSE2-NEXT:    paddd %xmm4, %xmm1
+; SSE2-NEXT:    psrld $1, %xmm3
 ; SSE2-NEXT:    psrld $1, %xmm7
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
 ; SSE2-NEXT:    pand %xmm4, %xmm7
-; SSE2-NEXT:    pand %xmm4, %xmm0
-; SSE2-NEXT:    packuswb %xmm7, %xmm0
-; SSE2-NEXT:    psrld $1, %xmm3
+; SSE2-NEXT:    pand %xmm4, %xmm3
+; SSE2-NEXT:    packuswb %xmm7, %xmm3
+; SSE2-NEXT:    psrld $1, %xmm0
 ; SSE2-NEXT:    psrld $1, %xmm6
 ; SSE2-NEXT:    pand %xmm4, %xmm6
-; SSE2-NEXT:    pand %xmm4, %xmm3
-; SSE2-NEXT:    packuswb %xmm6, %xmm3
+; SSE2-NEXT:    pand %xmm4, %xmm0
+; SSE2-NEXT:    packuswb %xmm6, %xmm0
 ; SSE2-NEXT:    packuswb %xmm3, %xmm0
-; SSE2-NEXT:    psrld $1, %xmm1
+; SSE2-NEXT:    psrld $1, %xmm2
 ; SSE2-NEXT:    psrld $1, %xmm5
 ; SSE2-NEXT:    pand %xmm4, %xmm5
-; SSE2-NEXT:    pand %xmm4, %xmm1
-; SSE2-NEXT:    packuswb %xmm5, %xmm1
-; SSE2-NEXT:    psrld $1, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm2
+; SSE2-NEXT:    packuswb %xmm5, %xmm2
+; SSE2-NEXT:    psrld $1, %xmm1
 ; SSE2-NEXT:    psrld $1, %xmm13
 ; SSE2-NEXT:    pand %xmm4, %xmm13
-; SSE2-NEXT:    pand %xmm4, %xmm2
-; SSE2-NEXT:    packuswb %xmm13, %xmm2
+; SSE2-NEXT:    pand %xmm4, %xmm1
+; SSE2-NEXT:    packuswb %xmm13, %xmm1
 ; SSE2-NEXT:    packuswb %xmm2, %xmm1
 ; SSE2-NEXT:    movdqu %xmm1, (%rax)
 ; SSE2-NEXT:    movdqu %xmm0, (%rax)
@@ -1139,6 +1146,7 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: avg_v32i8_2:
@@ -1146,6 +1154,7 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-NEXT:    vpavgb (%rsi), %ymm0, %ymm0
 ; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = load <32 x i8>, <32 x i8>* %a
   %2 = load <32 x i8>, <32 x i8>* %b
@@ -1162,136 +1171,136 @@ define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) {
 define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
 ; SSE2-LABEL: avg_v64i8_2:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movdqa (%rsi), %xmm15
-; SSE2-NEXT:    movdqa 16(%rsi), %xmm13
+; SSE2-NEXT:    movdqa (%rsi), %xmm14
+; SSE2-NEXT:    movdqa 16(%rsi), %xmm12
 ; SSE2-NEXT:    movdqa 32(%rsi), %xmm2
-; SSE2-NEXT:    movdqa 48(%rsi), %xmm3
+; SSE2-NEXT:    movdqa 48(%rsi), %xmm1
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm15[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3],xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm15, %xmm8
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm7, %xmm14
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm14, %xmm7
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm7, %xmm15
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm13[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm13, %xmm9
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm6, %xmm12
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm14, %xmm8
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3]
+; SSE2-NEXT:    movdqa %xmm12, %xmm6
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm6, %xmm13
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm2[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm11
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm12, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
+; SSE2-NEXT:    movdqa %xmm2, %xmm5
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm5, %xmm11
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm5, %xmm10
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm2, %xmm10
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm4, %xmm3
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm3
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE2-NEXT:    paddd %xmm1, %xmm1
-; SSE2-NEXT:    paddd %xmm4, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    paddd %xmm3, %xmm3
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    paddd %xmm4, %xmm4
-; SSE2-NEXT:    paddd %xmm5, %xmm5
-; SSE2-NEXT:    paddd %xmm10, %xmm10
+; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT:    paddd %xmm3, %xmm3
 ; SSE2-NEXT:    paddd %xmm2, %xmm2
+; SSE2-NEXT:    paddd %xmm10, %xmm10
+; SSE2-NEXT:    paddd %xmm5, %xmm5
 ; SSE2-NEXT:    paddd %xmm11, %xmm11
-; SSE2-NEXT:    paddd %xmm6, %xmm6
 ; SSE2-NEXT:    paddd %xmm12, %xmm12
-; SSE2-NEXT:    paddd %xmm13, %xmm13
 ; SSE2-NEXT:    paddd %xmm9, %xmm9
-; SSE2-NEXT:    paddd %xmm7, %xmm7
+; SSE2-NEXT:    paddd %xmm6, %xmm6
+; SSE2-NEXT:    paddd %xmm13, %xmm13
 ; SSE2-NEXT:    paddd %xmm14, %xmm14
-; SSE2-NEXT:    paddd %xmm15, %xmm15
 ; SSE2-NEXT:    paddd %xmm8, %xmm8
+; SSE2-NEXT:    paddd %xmm7, %xmm7
+; SSE2-NEXT:    paddd %xmm15, %xmm15
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [1,1,1,1]
-; SSE2-NEXT:    paddd %xmm0, %xmm8
 ; SSE2-NEXT:    paddd %xmm0, %xmm15
-; SSE2-NEXT:    paddd %xmm0, %xmm14
 ; SSE2-NEXT:    paddd %xmm0, %xmm7
-; SSE2-NEXT:    paddd %xmm0, %xmm9
+; SSE2-NEXT:    paddd %xmm0, %xmm8
+; SSE2-NEXT:    paddd %xmm0, %xmm14
 ; SSE2-NEXT:    paddd %xmm0, %xmm13
-; SSE2-NEXT:    paddd %xmm0, %xmm12
 ; SSE2-NEXT:    paddd %xmm0, %xmm6
+; SSE2-NEXT:    paddd %xmm0, %xmm9
+; SSE2-NEXT:    paddd %xmm0, %xmm12
 ; SSE2-NEXT:    paddd %xmm0, %xmm11
-; SSE2-NEXT:    paddd %xmm0, %xmm2
-; SSE2-NEXT:    paddd %xmm0, %xmm10
 ; SSE2-NEXT:    paddd %xmm0, %xmm5
-; SSE2-NEXT:    paddd %xmm0, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    paddd %xmm0, %xmm10
+; SSE2-NEXT:    paddd %xmm0, %xmm2
 ; SSE2-NEXT:    paddd %xmm0, %xmm3
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    paddd %xmm0, %xmm4
+; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT:    paddd %xmm0, %xmm3
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    psrld $1, %xmm7
 ; SSE2-NEXT:    psrld $1, %xmm15
-; SSE2-NEXT:    psrld $1, %xmm8
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT:    pand %xmm0, %xmm8
 ; SSE2-NEXT:    pand %xmm0, %xmm15
-; SSE2-NEXT:    packuswb %xmm8, %xmm15
-; SSE2-NEXT:    psrld $1, %xmm7
+; SSE2-NEXT:    pand %xmm0, %xmm7
+; SSE2-NEXT:    packuswb %xmm15, %xmm7
 ; SSE2-NEXT:    psrld $1, %xmm14
+; SSE2-NEXT:    psrld $1, %xmm8
+; SSE2-NEXT:    pand %xmm0, %xmm8
 ; SSE2-NEXT:    pand %xmm0, %xmm14
-; SSE2-NEXT:    pand %xmm0, %xmm7
-; SSE2-NEXT:    packuswb %xmm14, %xmm7
-; SSE2-NEXT:    packuswb %xmm7, %xmm15
+; SSE2-NEXT:    packuswb %xmm8, %xmm14
+; SSE2-NEXT:    packuswb %xmm7, %xmm14
+; SSE2-NEXT:    psrld $1, %xmm6
 ; SSE2-NEXT:    psrld $1, %xmm13
-; SSE2-NEXT:    psrld $1, %xmm9
-; SSE2-NEXT:    pand %xmm0, %xmm9
 ; SSE2-NEXT:    pand %xmm0, %xmm13
-; SSE2-NEXT:    packuswb %xmm9, %xmm13
-; SSE2-NEXT:    psrld $1, %xmm6
+; SSE2-NEXT:    pand %xmm0, %xmm6
+; SSE2-NEXT:    packuswb %xmm13, %xmm6
 ; SSE2-NEXT:    psrld $1, %xmm12
+; SSE2-NEXT:    psrld $1, %xmm9
+; SSE2-NEXT:    pand %xmm0, %xmm9
 ; SSE2-NEXT:    pand %xmm0, %xmm12
-; SSE2-NEXT:    pand %xmm0, %xmm6
-; SSE2-NEXT:    packuswb %xmm12, %xmm6
-; SSE2-NEXT:    packuswb %xmm6, %xmm13
-; SSE2-NEXT:    psrld $1, %xmm2
+; SSE2-NEXT:    packuswb %xmm9, %xmm12
+; SSE2-NEXT:    packuswb %xmm6, %xmm12
+; SSE2-NEXT:    psrld $1, %xmm5
 ; SSE2-NEXT:    psrld $1, %xmm11
 ; SSE2-NEXT:    pand %xmm0, %xmm11
-; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    packuswb %xmm11, %xmm2
-; SSE2-NEXT:    psrld $1, %xmm5
+; SSE2-NEXT:    pand %xmm0, %xmm5
+; SSE2-NEXT:    packuswb %xmm11, %xmm5
+; SSE2-NEXT:    psrld $1, %xmm2
 ; SSE2-NEXT:    psrld $1, %xmm10
 ; SSE2-NEXT:    pand %xmm0, %xmm10
-; SSE2-NEXT:    pand %xmm0, %xmm5
-; SSE2-NEXT:    packuswb %xmm10, %xmm5
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    packuswb %xmm10, %xmm2
 ; SSE2-NEXT:    packuswb %xmm5, %xmm2
-; SSE2-NEXT:    psrld $1, %xmm3
+; SSE2-NEXT:    psrld $1, %xmm4
 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
 ; SSE2-NEXT:    psrld $1, %xmm5
 ; SSE2-NEXT:    pand %xmm0, %xmm5
-; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    packuswb %xmm5, %xmm3
+; SSE2-NEXT:    pand %xmm0, %xmm4
+; SSE2-NEXT:    packuswb %xmm5, %xmm4
 ; SSE2-NEXT:    psrld $1, %xmm1
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
 ; SSE2-NEXT:    psrld $1, %xmm5
 ; SSE2-NEXT:    pand %xmm0, %xmm5
 ; SSE2-NEXT:    pand %xmm0, %xmm1
 ; SSE2-NEXT:    packuswb %xmm5, %xmm1
-; SSE2-NEXT:    packuswb %xmm1, %xmm3
-; SSE2-NEXT:    movdqu %xmm3, (%rax)
+; SSE2-NEXT:    packuswb %xmm4, %xmm1
+; SSE2-NEXT:    movdqu %xmm1, (%rax)
 ; SSE2-NEXT:    movdqu %xmm2, (%rax)
-; SSE2-NEXT:    movdqu %xmm13, (%rax)
-; SSE2-NEXT:    movdqu %xmm15, (%rax)
+; SSE2-NEXT:    movdqu %xmm12, (%rax)
+; SSE2-NEXT:    movdqu %xmm14, (%rax)
 ; SSE2-NEXT:    retq
 ;
 ; AVX2-LABEL: avg_v64i8_2:
@@ -1329,7 +1338,7 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
 ; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm7
 ; AVX2-NEXT:    vpsrld $1, %ymm10, %ymm8
 ; AVX2-NEXT:    vpsrld $1, %ymm9, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm9 = ymm3[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -1393,6 +1402,7 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
 ; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
 ; AVX512F-NEXT:    vmovdqu %ymm1, (%rax)
 ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: avg_v64i8_2:
@@ -1400,6 +1410,7 @@ define void @avg_v64i8_2(<64 x i8>* %a, <64 x i8>* %b) {
 ; AVX512BW-NEXT:    vmovdqu8 (%rsi), %zmm0
 ; AVX512BW-NEXT:    vpavgb %zmm0, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = load <64 x i8>, <64 x i8>* %a
   %2 = load <64 x i8>, <64 x i8>* %b
@@ -1558,6 +1569,7 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: avg_v16i16_2:
@@ -1565,6 +1577,7 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) {
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-NEXT:    vpavgw (%rsi), %ymm0, %ymm0
 ; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = load <16 x i16>, <16 x i16>* %a
   %2 = load <16 x i16>, <16 x i16>* %b
@@ -1689,7 +1702,7 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
 ; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm2
 ; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrld $1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpshufb %ymm4, %ymm1, %ymm1
@@ -1720,6 +1733,7 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
 ; AVX512F-NEXT:    vpsrld $1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovdw %zmm0, (%rax)
 ; AVX512F-NEXT:    vpmovdw %zmm1, (%rax)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: avg_v32i16_2:
@@ -1727,6 +1741,7 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) {
 ; AVX512BW-NEXT:    vmovdqu16 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpavgw (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = load <32 x i16>, <32 x i16>* %a
   %2 = load <32 x i16>, <32 x i16>* %b
@@ -1854,62 +1869,62 @@ define void @avg_v16i8_const(<16 x i8>* %a) {
 define void @avg_v32i8_const(<32 x i8>* %a) {
 ; SSE2-LABEL: avg_v32i8_const:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm4
+; SSE2-NEXT:    movdqa (%rdi), %xmm5
 ; SSE2-NEXT:    movdqa 16(%rdi), %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    movdqa %xmm0, %xmm8
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm3
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm4[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
-; SSE2-NEXT:    movdqa %xmm6, %xmm7
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; SSE2-NEXT:    movdqa %xmm2, %xmm8
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE2-NEXT:    movdqa %xmm5, %xmm6
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; SSE2-NEXT:    movdqa %xmm6, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
+; SSE2-NEXT:    movdqa %xmm5, %xmm7
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm9 = [5,6,7,8]
-; SSE2-NEXT:    paddd %xmm9, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [1,2,3,4]
-; SSE2-NEXT:    paddd %xmm1, %xmm5
+; SSE2-NEXT:    paddd %xmm9, %xmm5
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [1,2,3,4]
+; SSE2-NEXT:    paddd %xmm3, %xmm7
 ; SSE2-NEXT:    paddd %xmm9, %xmm6
-; SSE2-NEXT:    paddd %xmm1, %xmm7
+; SSE2-NEXT:    paddd %xmm3, %xmm4
 ; SSE2-NEXT:    paddd %xmm9, %xmm2
-; SSE2-NEXT:    paddd %xmm1, %xmm3
-; SSE2-NEXT:    paddd %xmm9, %xmm0
-; SSE2-NEXT:    paddd %xmm1, %xmm8
-; SSE2-NEXT:    psrld $1, %xmm8
+; SSE2-NEXT:    paddd %xmm3, %xmm8
+; SSE2-NEXT:    paddd %xmm9, %xmm1
+; SSE2-NEXT:    paddd %xmm3, %xmm0
 ; SSE2-NEXT:    psrld $1, %xmm0
-; SSE2-NEXT:    psrld $1, %xmm3
+; SSE2-NEXT:    psrld $1, %xmm1
+; SSE2-NEXT:    psrld $1, %xmm8
 ; SSE2-NEXT:    psrld $1, %xmm2
-; SSE2-NEXT:    psrld $1, %xmm7
+; SSE2-NEXT:    psrld $1, %xmm4
 ; SSE2-NEXT:    psrld $1, %xmm6
+; SSE2-NEXT:    psrld $1, %xmm7
 ; SSE2-NEXT:    psrld $1, %xmm5
-; SSE2-NEXT:    psrld $1, %xmm4
-; SSE2-NEXT:    movdqa {{.*#+}} xmm1 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT:    pand %xmm1, %xmm4
-; SSE2-NEXT:    pand %xmm1, %xmm5
-; SSE2-NEXT:    packuswb %xmm4, %xmm5
-; SSE2-NEXT:    pand %xmm1, %xmm6
-; SSE2-NEXT:    pand %xmm1, %xmm7
-; SSE2-NEXT:    packuswb %xmm6, %xmm7
-; SSE2-NEXT:    packuswb %xmm7, %xmm5
-; SSE2-NEXT:    pand %xmm1, %xmm2
-; SSE2-NEXT:    pand %xmm1, %xmm3
-; SSE2-NEXT:    packuswb %xmm2, %xmm3
-; SSE2-NEXT:    pand %xmm1, %xmm0
-; SSE2-NEXT:    pand %xmm1, %xmm8
-; SSE2-NEXT:    packuswb %xmm0, %xmm8
-; SSE2-NEXT:    packuswb %xmm8, %xmm3
-; SSE2-NEXT:    movdqu %xmm3, (%rax)
-; SSE2-NEXT:    movdqu %xmm5, (%rax)
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT:    pand %xmm3, %xmm5
+; SSE2-NEXT:    pand %xmm3, %xmm7
+; SSE2-NEXT:    packuswb %xmm5, %xmm7
+; SSE2-NEXT:    pand %xmm3, %xmm6
+; SSE2-NEXT:    pand %xmm3, %xmm4
+; SSE2-NEXT:    packuswb %xmm6, %xmm4
+; SSE2-NEXT:    packuswb %xmm7, %xmm4
+; SSE2-NEXT:    pand %xmm3, %xmm2
+; SSE2-NEXT:    pand %xmm3, %xmm8
+; SSE2-NEXT:    packuswb %xmm2, %xmm8
+; SSE2-NEXT:    pand %xmm3, %xmm1
+; SSE2-NEXT:    pand %xmm3, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
+; SSE2-NEXT:    packuswb %xmm8, %xmm0
+; SSE2-NEXT:    movdqu %xmm0, (%rax)
+; SSE2-NEXT:    movdqu %xmm4, (%rax)
 ; SSE2-NEXT:    retq
 ;
 ; AVX2-LABEL: avg_v32i8_const:
@@ -1925,6 +1940,7 @@ define void @avg_v32i8_const(<32 x i8>* %a) {
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vpavgb {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: avg_v32i8_const:
@@ -1932,6 +1948,7 @@ define void @avg_v32i8_const(<32 x i8>* %a) {
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = load <32 x i8>, <32 x i8>* %a
   %2 = zext <32 x i8> %1 to <32 x i32>
@@ -1945,121 +1962,121 @@ define void @avg_v32i8_const(<32 x i8>* %a) {
 define void @avg_v64i8_const(<64 x i8>* %a) {
 ; SSE2-LABEL: avg_v64i8_const:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movdqa (%rdi), %xmm7
-; SSE2-NEXT:    movdqa 16(%rdi), %xmm1
-; SSE2-NEXT:    movdqa 32(%rdi), %xmm14
+; SSE2-NEXT:    movdqa (%rdi), %xmm5
+; SSE2-NEXT:    movdqa 16(%rdi), %xmm6
+; SSE2-NEXT:    movdqa 32(%rdi), %xmm15
 ; SSE2-NEXT:    movdqa 48(%rdi), %xmm11
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm11[2,3,0,1]
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm3, %xmm9
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm11, %xmm12
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm14[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm13, %xmm10
+; SSE2-NEXT:    movdqa %xmm11, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm10
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm9
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm0[8],xmm11[9],xmm0[9],xmm11[10],xmm0[10],xmm11[11],xmm0[11],xmm11[12],xmm0[12],xmm11[13],xmm0[13],xmm11[14],xmm0[14],xmm11[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm11, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm15, %xmm14
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm14, %xmm15
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3]
+; SSE2-NEXT:    movdqa %xmm14, %xmm13
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3]
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm15, %xmm12
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm6, %xmm3
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm3, %xmm6
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm1, %xmm8
+; SSE2-NEXT:    movdqa %xmm3, %xmm8
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm4, %xmm5
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
-; SSE2-NEXT:    movdqa %xmm7, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm6, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm5, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    movdqa %xmm2, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15]
+; SSE2-NEXT:    movdqa %xmm5, %xmm7
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7]
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [5,6,7,8]
-; SSE2-NEXT:    paddd %xmm0, %xmm7
-; SSE2-NEXT:    paddd %xmm0, %xmm4
-; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    paddd %xmm0, %xmm5
+; SSE2-NEXT:    paddd %xmm0, %xmm2
+; SSE2-NEXT:    paddd %xmm0, %xmm6
 ; SSE2-NEXT:    paddd %xmm0, %xmm3
+; SSE2-NEXT:    paddd %xmm0, %xmm15
 ; SSE2-NEXT:    paddd %xmm0, %xmm14
-; SSE2-NEXT:    paddd %xmm0, %xmm13
 ; SSE2-NEXT:    paddd %xmm0, %xmm11
 ; SSE2-NEXT:    paddd %xmm0, %xmm9
 ; SSE2-NEXT:    movdqa %xmm9, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [1,2,3,4]
-; SSE2-NEXT:    paddd %xmm0, %xmm2
-; SSE2-NEXT:    paddd %xmm0, %xmm5
+; SSE2-NEXT:    paddd %xmm0, %xmm7
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    paddd %xmm0, %xmm4
 ; SSE2-NEXT:    paddd %xmm0, %xmm8
-; SSE2-NEXT:    paddd %xmm0, %xmm6
-; SSE2-NEXT:    paddd %xmm0, %xmm15
-; SSE2-NEXT:    paddd %xmm0, %xmm10
 ; SSE2-NEXT:    paddd %xmm0, %xmm12
+; SSE2-NEXT:    paddd %xmm0, %xmm13
 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Reload
 ; SSE2-NEXT:    paddd %xmm0, %xmm9
 ; SSE2-NEXT:    movdqa %xmm9, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    psrld $1, %xmm2
+; SSE2-NEXT:    paddd %xmm0, %xmm10
 ; SSE2-NEXT:    psrld $1, %xmm7
+; SSE2-NEXT:    psrld $1, %xmm5
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSE2-NEXT:    pand %xmm0, %xmm5
 ; SSE2-NEXT:    pand %xmm0, %xmm7
+; SSE2-NEXT:    packuswb %xmm5, %xmm7
+; SSE2-NEXT:    psrld $1, %xmm1
+; SSE2-NEXT:    psrld $1, %xmm2
 ; SSE2-NEXT:    pand %xmm0, %xmm2
-; SSE2-NEXT:    packuswb %xmm7, %xmm2
-; SSE2-NEXT:    psrld $1, %xmm5
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    packuswb %xmm2, %xmm1
+; SSE2-NEXT:    packuswb %xmm7, %xmm1
 ; SSE2-NEXT:    psrld $1, %xmm4
+; SSE2-NEXT:    psrld $1, %xmm6
+; SSE2-NEXT:    pand %xmm0, %xmm6
 ; SSE2-NEXT:    pand %xmm0, %xmm4
-; SSE2-NEXT:    pand %xmm0, %xmm5
-; SSE2-NEXT:    packuswb %xmm4, %xmm5
-; SSE2-NEXT:    packuswb %xmm5, %xmm2
+; SSE2-NEXT:    packuswb %xmm6, %xmm4
 ; SSE2-NEXT:    psrld $1, %xmm8
-; SSE2-NEXT:    psrld $1, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    pand %xmm0, %xmm8
-; SSE2-NEXT:    packuswb %xmm1, %xmm8
-; SSE2-NEXT:    psrld $1, %xmm6
 ; SSE2-NEXT:    psrld $1, %xmm3
 ; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    pand %xmm0, %xmm6
-; SSE2-NEXT:    packuswb %xmm3, %xmm6
-; SSE2-NEXT:    packuswb %xmm6, %xmm8
+; SSE2-NEXT:    pand %xmm0, %xmm8
+; SSE2-NEXT:    packuswb %xmm3, %xmm8
+; SSE2-NEXT:    packuswb %xmm4, %xmm8
+; SSE2-NEXT:    psrld $1, %xmm12
 ; SSE2-NEXT:    psrld $1, %xmm15
-; SSE2-NEXT:    psrld $1, %xmm14
-; SSE2-NEXT:    pand %xmm0, %xmm14
 ; SSE2-NEXT:    pand %xmm0, %xmm15
-; SSE2-NEXT:    packuswb %xmm14, %xmm15
-; SSE2-NEXT:    psrld $1, %xmm10
+; SSE2-NEXT:    pand %xmm0, %xmm12
+; SSE2-NEXT:    packuswb %xmm15, %xmm12
 ; SSE2-NEXT:    psrld $1, %xmm13
+; SSE2-NEXT:    psrld $1, %xmm14
+; SSE2-NEXT:    pand %xmm0, %xmm14
 ; SSE2-NEXT:    pand %xmm0, %xmm13
-; SSE2-NEXT:    pand %xmm0, %xmm10
-; SSE2-NEXT:    packuswb %xmm13, %xmm10
-; SSE2-NEXT:    packuswb %xmm10, %xmm15
-; SSE2-NEXT:    psrld $1, %xmm12
+; SSE2-NEXT:    packuswb %xmm14, %xmm13
+; SSE2-NEXT:    packuswb %xmm12, %xmm13
+; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    psrld $1, %xmm2
 ; SSE2-NEXT:    psrld $1, %xmm11
 ; SSE2-NEXT:    pand %xmm0, %xmm11
-; SSE2-NEXT:    pand %xmm0, %xmm12
-; SSE2-NEXT:    packuswb %xmm11, %xmm12
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    psrld $1, %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    packuswb %xmm11, %xmm2
+; SSE2-NEXT:    psrld $1, %xmm10
 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
 ; SSE2-NEXT:    psrld $1, %xmm3
 ; SSE2-NEXT:    pand %xmm0, %xmm3
-; SSE2-NEXT:    pand %xmm0, %xmm1
-; SSE2-NEXT:    packuswb %xmm3, %xmm1
-; SSE2-NEXT:    packuswb %xmm1, %xmm12
-; SSE2-NEXT:    movdqu %xmm12, (%rax)
-; SSE2-NEXT:    movdqu %xmm15, (%rax)
+; SSE2-NEXT:    pand %xmm0, %xmm10
+; SSE2-NEXT:    packuswb %xmm3, %xmm10
+; SSE2-NEXT:    packuswb %xmm2, %xmm10
+; SSE2-NEXT:    movdqu %xmm10, (%rax)
+; SSE2-NEXT:    movdqu %xmm13, (%rax)
 ; SSE2-NEXT:    movdqu %xmm8, (%rax)
-; SSE2-NEXT:    movdqu %xmm2, (%rax)
+; SSE2-NEXT:    movdqu %xmm1, (%rax)
 ; SSE2-NEXT:    retq
 ;
 ; AVX2-LABEL: avg_v64i8_const:
@@ -2089,7 +2106,7 @@ define void @avg_v64i8_const(<64 x i8>* %a) {
 ; AVX2-NEXT:    vpsrld $1, %ymm5, %ymm5
 ; AVX2-NEXT:    vpsrld $1, %ymm6, %ymm6
 ; AVX2-NEXT:    vpsrld $1, %ymm7, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm3, %ymm3
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm7 = ymm3[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -2149,6 +2166,7 @@ define void @avg_v64i8_const(<64 x i8>* %a) {
 ; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
 ; AVX512F-NEXT:    vmovdqu %ymm2, (%rax)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: avg_v64i8_const:
@@ -2156,6 +2174,7 @@ define void @avg_v64i8_const(<64 x i8>* %a) {
 ; AVX512BW-NEXT:    vmovdqu8 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpavgb {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqu8 %zmm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = load <64 x i8>, <64 x i8>* %a
   %2 = zext <64 x i8> %1 to <64 x i32>
@@ -2289,6 +2308,7 @@ define void @avg_v16i16_const(<16 x i16>* %a) {
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vpavgw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: avg_v16i16_const:
@@ -2296,6 +2316,7 @@ define void @avg_v16i16_const(<16 x i16>* %a) {
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-NEXT:    vpavgw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512BW-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = load <16 x i16>, <16 x i16>* %a
   %2 = zext <16 x i16> %1 to <16 x i32>
@@ -2385,7 +2406,7 @@ define void @avg_v32i16_const(<32 x i16>* %a) {
 ; AVX2-NEXT:    vpsrld $1, %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrld $1, %ymm2, %ymm2
 ; AVX2-NEXT:    vpsrld $1, %ymm3, %ymm3
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
 ; AVX2-NEXT:    vpshufb %ymm4, %ymm2, %ymm2
@@ -2412,6 +2433,7 @@ define void @avg_v32i16_const(<32 x i16>* %a) {
 ; AVX512F-NEXT:    vpsrld $1, %zmm1, %zmm1
 ; AVX512F-NEXT:    vpmovdw %zmm1, (%rax)
 ; AVX512F-NEXT:    vpmovdw %zmm0, (%rax)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: avg_v32i16_const:
@@ -2419,6 +2441,7 @@ define void @avg_v32i16_const(<32 x i16>* %a) {
 ; AVX512BW-NEXT:    vmovdqu16 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpavgw {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovdqu16 %zmm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = load <32 x i16>, <32 x i16>* %a
   %2 = zext <32 x i16> %1 to <32 x i32>
diff --git a/test/CodeGen/X86/avx-cvt-3.ll b/test/CodeGen/X86/avx-cvt-3.ll
new file mode 100644
index 0000000000000000000000000000000000000000..066719b3bfe8cbb0bb1f09b227d1fd13fcdfcb71
--- /dev/null
+++ b/test/CodeGen/X86/avx-cvt-3.ll
@@ -0,0 +1,148 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefix=X64
+
+; Insertion/shuffles of all-zero/all-bits/constants into v8i32->v8f32 sitofp conversion.
+
+define <8 x float> @sitofp_insert_zero_v8i32(<8 x i32> %a0) {
+; X86-LABEL: sitofp_insert_zero_v8i32:
+; X86:       # BB#0:
+; X86-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
+; X86-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_insert_zero_v8i32:
+; X64:       # BB#0:
+; X64-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
+; X64-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; X64-NEXT:    retq
+  %1 = insertelement <8 x i32> %a0, i32 0, i32 0
+  %2 = insertelement <8 x i32>  %1, i32 0, i32 2
+  %3 = insertelement <8 x i32>  %2, i32 0, i32 4
+  %4 = insertelement <8 x i32>  %3, i32 0, i32 5
+  %5 = sitofp <8 x i32> %4 to <8 x float>
+  ret <8 x float> %5
+}
+
+define <8 x float> @sitofp_shuffle_zero_v8i32(<8 x i32> %a0) {
+; X86-LABEL: sitofp_shuffle_zero_v8i32:
+; X86:       # BB#0:
+; X86-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; X86-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_shuffle_zero_v8i32:
+; X64:       # BB#0:
+; X64-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; X64-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; X64-NEXT:    retq
+  %1 = shufflevector <8 x i32> %a0, <8 x i32> <i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef, i32 0, i32 undef>, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+  %2 = sitofp <8 x i32> %1 to <8 x float>
+  ret <8 x float> %2
+}
+
+define <8 x float> @sitofp_insert_allbits_v8i32(<8 x i32> %a0) {
+; X86-LABEL: sitofp_insert_allbits_v8i32:
+; X86:       # BB#0:
+; X86-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
+; X86-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_insert_allbits_v8i32:
+; X64:       # BB#0:
+; X64-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7]
+; X64-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; X64-NEXT:    retq
+  %1 = insertelement <8 x i32> %a0, i32 -1, i32 0
+  %2 = insertelement <8 x i32>  %1, i32 -1, i32 2
+  %3 = insertelement <8 x i32>  %2, i32 -1, i32 4
+  %4 = insertelement <8 x i32>  %3, i32 -1, i32 5
+  %5 = sitofp <8 x i32> %4 to <8 x float>
+  ret <8 x float> %5
+}
+
+define <8 x float> @sitofp_shuffle_allbits_v8i32(<8 x i32> %a0) {
+; X86-LABEL: sitofp_shuffle_allbits_v8i32:
+; X86:       # BB#0:
+; X86-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; X86-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_shuffle_allbits_v8i32:
+; X64:       # BB#0:
+; X64-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7]
+; X64-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; X64-NEXT:    retq
+  %1 = shufflevector <8 x i32> %a0, <8 x i32> <i32 -1, i32 undef, i32 -1, i32 undef, i32 -1, i32 undef, i32 -1, i32 undef>, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+  %2 = sitofp <8 x i32> %1 to <8 x float>
+  ret <8 x float> %2
+}
+
+define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) {
+; X86-LABEL: sitofp_insert_constants_v8i32:
+; X86:       # BB#0:
+; X86-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
+; X86-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X86-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
+; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X86-NEXT:    movl $2, %eax
+; X86-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm1
+; X86-NEXT:    movl $-3, %eax
+; X86-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
+; X86-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_insert_constants_v8i32:
+; X64:       # BB#0:
+; X64-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7]
+; X64-NEXT:    vpcmpeqd %xmm1, %xmm1, %xmm1
+; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm1, %ymm1
+; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7]
+; X64-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; X64-NEXT:    movl $2, %eax
+; X64-NEXT:    vpinsrd $0, %eax, %xmm1, %xmm1
+; X64-NEXT:    movl $-3, %eax
+; X64-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
+; X64-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; X64-NEXT:    retq
+  %1 = insertelement <8 x i32> %a0, i32  0, i32 0
+  %2 = insertelement <8 x i32>  %1, i32 -1, i32 2
+  %3 = insertelement <8 x i32>  %2, i32  2, i32 4
+  %4 = insertelement <8 x i32>  %3, i32 -3, i32 5
+  %5 = sitofp <8 x i32> %4 to <8 x float>
+  ret <8 x float> %5
+}
+
+define <8 x float> @sitofp_shuffle_constants_v8i32(<8 x i32> %a0) {
+; X86-LABEL: sitofp_shuffle_constants_v8i32:
+; X86:       # BB#0:
+; X86-NEXT:    vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7]
+; X86-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_shuffle_constants_v8i32:
+; X64:       # BB#0:
+; X64-NEXT:    vblendps {{.*#+}} ymm0 = mem[0],ymm0[1],mem[2],ymm0[3],mem[4],ymm0[5],mem[6],ymm0[7]
+; X64-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; X64-NEXT:    retq
+  %1 = shufflevector <8 x i32> %a0, <8 x i32> <i32 0, i32 undef, i32 -1, i32 undef, i32 2, i32 undef, i32 -3, i32 undef>, <8 x i32> <i32 8, i32 1, i32 10, i32 3, i32 12, i32 5, i32 14, i32 7>
+  %2 = sitofp <8 x i32> %1 to <8 x float>
+  ret <8 x float> %2
+}
diff --git a/test/CodeGen/X86/avx-cvt.ll b/test/CodeGen/X86/avx-cvt.ll
index a7cd8cf23984398fa074022f7e21c91720c20f3d..f2900dba938a7f2ff0f5be24b8e5113751963235 100644
--- a/test/CodeGen/X86/avx-cvt.ll
+++ b/test/CodeGen/X86/avx-cvt.ll
@@ -136,7 +136,8 @@ define float @funcD(i64* nocapture %e) nounwind uwtable readonly ssp {
 define void @fpext() nounwind uwtable {
 ; CHECK-LABEL: fpext:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    vcvtss2sd -{{[0-9]+}}(%rsp), %xmm0, %xmm0
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovsd %xmm0, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    retq
   %f = alloca float, align 4
diff --git a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
index fe4fc65ef7156ca842eeb06349dee66f00fb67ac..4a86fa22f081543c5b540934d7eb3722c879d0aa 100644
--- a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -3310,16 +3310,16 @@ define <8 x float> @test_mm256_sub_ps(<8 x float> %a0, <8 x float> %a1) nounwind
 define i32 @test_mm_testc_pd(<2 x double> %a0, <2 x double> %a1) nounwind {
 ; X32-LABEL: test_mm_testc_pd:
 ; X32:       # BB#0:
+; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    vtestpd %xmm1, %xmm0
-; X32-NEXT:    sbbl %eax, %eax
-; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    setb %al
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_testc_pd:
 ; X64:       # BB#0:
+; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    vtestpd %xmm1, %xmm0
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    setb %al
 ; X64-NEXT:    retq
   %res = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1)
   ret i32 %res
@@ -3329,17 +3329,17 @@ declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnon
 define i32 @test_mm256_testc_pd(<4 x double> %a0, <4 x double> %a1) nounwind {
 ; X32-LABEL: test_mm256_testc_pd:
 ; X32:       # BB#0:
+; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    vtestpd %ymm1, %ymm0
-; X32-NEXT:    sbbl %eax, %eax
-; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    setb %al
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_testc_pd:
 ; X64:       # BB#0:
+; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    vtestpd %ymm1, %ymm0
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    setb %al
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1)
@@ -3350,16 +3350,16 @@ declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind rea
 define i32 @test_mm_testc_ps(<4 x float> %a0, <4 x float> %a1) nounwind {
 ; X32-LABEL: test_mm_testc_ps:
 ; X32:       # BB#0:
+; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    vtestps %xmm1, %xmm0
-; X32-NEXT:    sbbl %eax, %eax
-; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    setb %al
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_testc_ps:
 ; X64:       # BB#0:
+; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    vtestps %xmm1, %xmm0
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    setb %al
 ; X64-NEXT:    retq
   %res = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1)
   ret i32 %res
@@ -3369,17 +3369,17 @@ declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
 define i32 @test_mm256_testc_ps(<8 x float> %a0, <8 x float> %a1) nounwind {
 ; X32-LABEL: test_mm256_testc_ps:
 ; X32:       # BB#0:
+; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    vtestps %ymm1, %ymm0
-; X32-NEXT:    sbbl %eax, %eax
-; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    setb %al
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_testc_ps:
 ; X64:       # BB#0:
+; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    vtestps %ymm1, %ymm0
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    setb %al
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1)
@@ -3390,17 +3390,17 @@ declare i32 @llvm.x86.avx.vtestc.ps.256(<8 x float>, <8 x float>) nounwind readn
 define i32 @test_mm256_testc_si256(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; X32-LABEL: test_mm256_testc_si256:
 ; X32:       # BB#0:
+; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    vptest %ymm1, %ymm0
-; X32-NEXT:    sbbl %eax, %eax
-; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    setb %al
 ; X32-NEXT:    vzeroupper
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_testc_si256:
 ; X64:       # BB#0:
+; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    vptest %ymm1, %ymm0
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    setb %al
 ; X64-NEXT:    vzeroupper
 ; X64-NEXT:    retq
   %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1)
diff --git a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
index 1db0256e8e38b41a95adc570633c80d73b19a052..27aeb77468ce9108dcf399a3dc607bbeec7dda5c 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -403,8 +403,8 @@ define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
 ; CHECK-LABEL: test_x86_sse2_storeu_pd:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT:    vpslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; CHECK-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
 ; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovupd %xmm0, (%eax)
 ; CHECK-NEXT:    retl
diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll
index a8befa8b0e1dc02e5981b790ad2278edffec8fa8..70e31771071f0163b42265d058f85547aa75a9b5 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -1,2631 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx,aes,pclmul -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=avx,pclmul -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
 ; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL
 
-define <2 x i64> @test_x86_aesni_aesdec(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_aesni_aesdec:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vaesdec %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0xde,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.aesni.aesdec(<2 x i64>, <2 x i64>) nounwind readnone
-
-
-define <2 x i64> @test_x86_aesni_aesdeclast(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_aesni_aesdeclast:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vaesdeclast %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0xdf,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.aesni.aesdeclast(<2 x i64>, <2 x i64>) nounwind readnone
-
-
-define <2 x i64> @test_x86_aesni_aesenc(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_aesni_aesenc:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vaesenc %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0xdc,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.aesni.aesenc(<2 x i64>, <2 x i64>) nounwind readnone
-
-
-define <2 x i64> @test_x86_aesni_aesenclast(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_aesni_aesenclast:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vaesenclast %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0xdd,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.aesni.aesenclast(<2 x i64>, <2 x i64>) nounwind readnone
-
-
-define <2 x i64> @test_x86_aesni_aesimc(<2 x i64> %a0) {
-; CHECK-LABEL: test_x86_aesni_aesimc:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vaesimc %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0xdb,0xc0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64> %a0) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.aesni.aesimc(<2 x i64>) nounwind readnone
-
-
-define <2 x i64> @test_x86_aesni_aeskeygenassist(<2 x i64> %a0) {
-; CHECK-LABEL: test_x86_aesni_aeskeygenassist:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vaeskeygenassist $7, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0xdf,0xc0,0x07]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64> %a0, i8 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.aesni.aeskeygenassist(<2 x i64>, i8) nounwind readnone
-
-
-define <2 x double> @test_x86_sse2_cmp_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_cmp_pd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vcmpordpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xc2,0xc1,0x07]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.cmp.pd(<2 x double>, <2 x double>, i8) nounwind readnone
-
-
-define <2 x double> @test_x86_sse2_cmp_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_cmp_sd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vcmpordsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0xc2,0xc1,0x07]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.cmp.sd(<2 x double>, <2 x double>, i8) nounwind readnone
-
-
-define i32 @test_x86_sse2_comieq_sd(<2 x double> %a0, <2 x double> %a1) {
-; AVX-LABEL: test_x86_sse2_comieq_sd:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vcomisd %xmm1, %xmm0 ## encoding: [0xc5,0xf9,0x2f,0xc1]
-; AVX-NEXT:    setnp %al ## encoding: [0x0f,0x9b,0xc0]
-; AVX-NEXT:    sete %cl ## encoding: [0x0f,0x94,0xc1]
-; AVX-NEXT:    andb %al, %cl ## encoding: [0x20,0xc1]
-; AVX-NEXT:    movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_comieq_sd:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vcomisd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2f,0xc1]
-; AVX512VL-NEXT:    setnp %al ## encoding: [0x0f,0x9b,0xc0]
-; AVX512VL-NEXT:    sete %cl ## encoding: [0x0f,0x94,0xc1]
-; AVX512VL-NEXT:    andb %al, %cl ## encoding: [0x20,0xc1]
-; AVX512VL-NEXT:    movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse2.comieq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse2.comieq.sd(<2 x double>, <2 x double>) nounwind readnone
-
-
-define i32 @test_x86_sse2_comige_sd(<2 x double> %a0, <2 x double> %a1) {
-; AVX-LABEL: test_x86_sse2_comige_sd:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vcomisd %xmm1, %xmm0 ## encoding: [0xc5,0xf9,0x2f,0xc1]
-; AVX-NEXT:    setae %al ## encoding: [0x0f,0x93,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_comige_sd:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vcomisd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2f,0xc1]
-; AVX512VL-NEXT:    setae %al ## encoding: [0x0f,0x93,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse2.comige.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse2.comige.sd(<2 x double>, <2 x double>) nounwind readnone
-
-
-define i32 @test_x86_sse2_comigt_sd(<2 x double> %a0, <2 x double> %a1) {
-; AVX-LABEL: test_x86_sse2_comigt_sd:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vcomisd %xmm1, %xmm0 ## encoding: [0xc5,0xf9,0x2f,0xc1]
-; AVX-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_comigt_sd:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vcomisd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2f,0xc1]
-; AVX512VL-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse2.comigt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse2.comigt.sd(<2 x double>, <2 x double>) nounwind readnone
-
-
-define i32 @test_x86_sse2_comile_sd(<2 x double> %a0, <2 x double> %a1) {
-; AVX-LABEL: test_x86_sse2_comile_sd:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vcomisd %xmm0, %xmm1 ## encoding: [0xc5,0xf9,0x2f,0xc8]
-; AVX-NEXT:    setae %al ## encoding: [0x0f,0x93,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_comile_sd:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vcomisd %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2f,0xc8]
-; AVX512VL-NEXT:    setae %al ## encoding: [0x0f,0x93,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse2.comile.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse2.comile.sd(<2 x double>, <2 x double>) nounwind readnone
-
-
-define i32 @test_x86_sse2_comilt_sd(<2 x double> %a0, <2 x double> %a1) {
-; AVX-LABEL: test_x86_sse2_comilt_sd:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vcomisd %xmm0, %xmm1 ## encoding: [0xc5,0xf9,0x2f,0xc8]
-; AVX-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_comilt_sd:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vcomisd %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2f,0xc8]
-; AVX512VL-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse2.comilt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse2.comilt.sd(<2 x double>, <2 x double>) nounwind readnone
-
-
-define i32 @test_x86_sse2_comineq_sd(<2 x double> %a0, <2 x double> %a1) {
-; AVX-LABEL: test_x86_sse2_comineq_sd:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vcomisd %xmm1, %xmm0 ## encoding: [0xc5,0xf9,0x2f,0xc1]
-; AVX-NEXT:    setp %al ## encoding: [0x0f,0x9a,0xc0]
-; AVX-NEXT:    setne %cl ## encoding: [0x0f,0x95,0xc1]
-; AVX-NEXT:    orb %al, %cl ## encoding: [0x08,0xc1]
-; AVX-NEXT:    movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_comineq_sd:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vcomisd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2f,0xc1]
-; AVX512VL-NEXT:    setp %al ## encoding: [0x0f,0x9a,0xc0]
-; AVX512VL-NEXT:    setne %cl ## encoding: [0x0f,0x95,0xc1]
-; AVX512VL-NEXT:    orb %al, %cl ## encoding: [0x08,0xc1]
-; AVX512VL-NEXT:    movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse2.comineq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse2.comineq.sd(<2 x double>, <2 x double>) nounwind readnone
-
-
-define <4 x float> @test_x86_sse2_cvtdq2ps(<4 x i32> %a0) {
-; AVX-LABEL: test_x86_sse2_cvtdq2ps:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vcvtdq2ps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x5b,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_cvtdq2ps:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vcvtdq2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5b,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %a0) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone
-
-
-define <4 x i32> @test_x86_sse2_cvtpd2dq(<2 x double> %a0) {
-; AVX-LABEL: test_x86_sse2_cvtpd2dq:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vcvtpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0xe6,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_cvtpd2dq:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vcvtpd2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0xe6,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>) nounwind readnone
-
-
-define <4 x float> @test_x86_sse2_cvtpd2ps(<2 x double> %a0) {
-; AVX-LABEL: test_x86_sse2_cvtpd2ps:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vcvtpd2ps %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x5a,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_cvtpd2ps:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vcvtpd2ps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5a,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double> %a0) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse2.cvtpd2ps(<2 x double>) nounwind readnone
-
-
-define <4 x i32> @test_x86_sse2_cvtps2dq(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse2_cvtps2dq:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vcvtps2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x5b,0xc0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>) nounwind readnone
-
-
-define i32 @test_x86_sse2_cvtsd2si(<2 x double> %a0) {
-; AVX-LABEL: test_x86_sse2_cvtsd2si:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vcvtsd2si %xmm0, %eax ## encoding: [0xc5,0xfb,0x2d,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_cvtsd2si:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vcvtsd2si %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x2d,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse2.cvtsd2si(<2 x double> %a0) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse2.cvtsd2si(<2 x double>) nounwind readnone
-
-
-define <4 x float> @test_x86_sse2_cvtsd2ss(<4 x float> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_cvtsd2ss:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vcvtsd2ss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5a,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> %a0, <2 x double> %a1) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
-
-
-define <2 x double> @test_x86_sse2_cvtsi2sd(<2 x double> %a0, i32 %a1) {
-; AVX-LABEL: test_x86_sse2_cvtsi2sd:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vcvtsi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x2a,0x44,0x24,0x04]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_cvtsi2sd:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vcvtsi2sdl {{[0-9]+}}(%esp), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x2a,0x44,0x24,0x04]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double> %a0, i32 %a1) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.cvtsi2sd(<2 x double>, i32) nounwind readnone
-
-
-define <2 x double> @test_x86_sse2_cvtss2sd(<2 x double> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse2_cvtss2sd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vcvtss2sd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5a,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> %a0, <4 x float> %a1) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone
-
-
-define <4 x i32> @test_x86_sse2_cvttpd2dq(<2 x double> %a0) {
-; AVX-LABEL: test_x86_sse2_cvttpd2dq:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vcvttpd2dq %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe6,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_cvttpd2dq:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vcvttpd2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe6,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %a0) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>) nounwind readnone
-
-
-define <4 x i32> @test_x86_sse2_cvttps2dq(<4 x float> %a0) {
-; AVX-LABEL: test_x86_sse2_cvttps2dq:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vcvttps2dq %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5b,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_cvttps2dq:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vcvttps2dq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5b,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %a0) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>) nounwind readnone
-
-
-define i32 @test_x86_sse2_cvttsd2si(<2 x double> %a0) {
-; AVX-LABEL: test_x86_sse2_cvttsd2si:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vcvttsd2si %xmm0, %eax ## encoding: [0xc5,0xfb,0x2c,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_cvttsd2si:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vcvttsd2si %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x2c,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %a0) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse2.cvttsd2si(<2 x double>) nounwind readnone
-
-
-
-define <2 x double> @test_x86_sse2_max_pd(<2 x double> %a0, <2 x double> %a1) {
-; AVX-LABEL: test_x86_sse2_max_pd:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x5f,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_max_pd:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vmaxpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5f,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse2.max.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.max.pd(<2 x double>, <2 x double>) nounwind readnone
-
-
-define <2 x double> @test_x86_sse2_max_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_max_sd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5f,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.max.sd(<2 x double>, <2 x double>) nounwind readnone
-
-
-define <2 x double> @test_x86_sse2_min_pd(<2 x double> %a0, <2 x double> %a1) {
-; AVX-LABEL: test_x86_sse2_min_pd:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vminpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x5d,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_min_pd:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vminpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x5d,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse2.min.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.min.pd(<2 x double>, <2 x double>) nounwind readnone
-
-
-define <2 x double> @test_x86_sse2_min_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse2_min_sd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vminsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5d,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.min.sd(<2 x double>, <2 x double>) nounwind readnone
-
-
-define i32 @test_x86_sse2_movmsk_pd(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_movmsk_pd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovmskpd %xmm0, %eax ## encoding: [0xc5,0xf9,0x50,0xc0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse2.movmsk.pd(<2 x double> %a0) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse2.movmsk.pd(<2 x double>) nounwind readnone
-
-
-
-
-define <8 x i16> @test_x86_sse2_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
-; AVX-LABEL: test_x86_sse2_packssdw_128:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x6b,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_packssdw_128:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse2_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
-; AVX-LABEL: test_x86_sse2_packsswb_128:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x63,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_packsswb_128:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse2_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
-; AVX-LABEL: test_x86_sse2_packuswb_128:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x67,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_packuswb_128:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) {
-; AVX-LABEL: test_x86_sse2_padds_b:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xec,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_padds_b:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_padds_w(<8 x i16> %a0, <8 x i16> %a1) {
-; AVX-LABEL: test_x86_sse2_padds_w:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xed,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_padds_w:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) {
-; AVX-LABEL: test_x86_sse2_paddus_b:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdc,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_paddus_b:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdc,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.paddus.b(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_paddus_w(<8 x i16> %a0, <8 x i16> %a1) {
-; AVX-LABEL: test_x86_sse2_paddus_w:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xdd,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_paddus_w:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdd,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse2_pavg_b(<16 x i8> %a0, <16 x i8> %a1) {
-; AVX-LABEL: test_x86_sse2_pavg_b:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe0,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_pavg_b:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_pavg_w(<8 x i16> %a0, <8 x i16> %a1) {
-; AVX-LABEL: test_x86_sse2_pavg_w:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe3,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_pavg_w:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) {
-; AVX-LABEL: test_x86_sse2_pmadd_wd:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf5,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_pmadd_wd:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf5,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_pmaxs_w(<8 x i16> %a0, <8 x i16> %a1) {
-; AVX-LABEL: test_x86_sse2_pmaxs_w:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xee,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_pmaxs_w:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xee,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse2_pmaxu_b(<16 x i8> %a0, <16 x i8> %a1) {
-; AVX-LABEL: test_x86_sse2_pmaxu_b:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xde,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_pmaxu_b:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xde,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_pmins_w(<8 x i16> %a0, <8 x i16> %a1) {
-; AVX-LABEL: test_x86_sse2_pmins_w:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpminsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xea,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_pmins_w:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpminsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xea,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse2_pminu_b(<16 x i8> %a0, <16 x i8> %a1) {
-; AVX-LABEL: test_x86_sse2_pminu_b:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpminub %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xda,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_pminu_b:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpminub %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xda,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-define i32 @test_x86_sse2_pmovmskb_128(<16 x i8> %a0) {
-; CHECK-LABEL: test_x86_sse2_pmovmskb_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpmovmskb %xmm0, %eax ## encoding: [0xc5,0xf9,0xd7,0xc0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %a0) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_pmulh_w(<8 x i16> %a0, <8 x i16> %a1) {
-; AVX-LABEL: test_x86_sse2_pmulh_w:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe5,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_pmulh_w:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe5,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.pmulh.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_pmulhu_w(<8 x i16> %a0, <8 x i16> %a1) {
-; AVX-LABEL: test_x86_sse2_pmulhu_w:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe4,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_pmulhu_w:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe4,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_pmulu_dq(<4 x i32> %a0, <4 x i32> %a1) {
-; AVX-LABEL: test_x86_sse2_pmulu_dq:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf4,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_pmulu_dq:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpmuludq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf4,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psad_bw(<16 x i8> %a0, <16 x i8> %a1) {
-; AVX-LABEL: test_x86_sse2_psad_bw:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf6,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_psad_bw:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf6,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-define <4 x i32> @test_x86_sse2_psll_d(<4 x i32> %a0, <4 x i32> %a1) {
-; AVX-LABEL: test_x86_sse2_psll_d:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpslld %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf2,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_psll_d:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpslld %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf2,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psll_q(<2 x i64> %a0, <2 x i64> %a1) {
-; AVX-LABEL: test_x86_sse2_psll_q:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpsllq %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf3,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_psll_q:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpsllq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf3,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_psll_w(<8 x i16> %a0, <8 x i16> %a1) {
-; AVX-LABEL: test_x86_sse2_psll_w:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpsllw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xf1,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_psll_w:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpsllw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf1,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <4 x i32> @test_x86_sse2_pslli_d(<4 x i32> %a0) {
-; AVX-LABEL: test_x86_sse2_pslli_d:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpslld $7, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x72,0xf0,0x07]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_pslli_d:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpslld $7, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xf0,0x07]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_pslli_q(<2 x i64> %a0) {
-; AVX-LABEL: test_x86_sse2_pslli_q:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpsllq $7, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x73,0xf0,0x07]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_pslli_q:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpsllq $7, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x73,0xf0,0x07]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_pslli_w(<8 x i16> %a0) {
-; AVX-LABEL: test_x86_sse2_pslli_w:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpsllw $7, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xf0,0x07]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_pslli_w:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpsllw $7, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xf0,0x07]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32) nounwind readnone
-
-
-define <4 x i32> @test_x86_sse2_psra_d(<4 x i32> %a0, <4 x i32> %a1) {
-; AVX-LABEL: test_x86_sse2_psra_d:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpsrad %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe2,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_psra_d:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpsrad %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe2,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_psra_w(<8 x i16> %a0, <8 x i16> %a1) {
-; AVX-LABEL: test_x86_sse2_psra_w:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpsraw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe1,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_psra_w:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpsraw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe1,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <4 x i32> @test_x86_sse2_psrai_d(<4 x i32> %a0) {
-; AVX-LABEL: test_x86_sse2_psrai_d:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpsrad $7, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x72,0xe0,0x07]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_psrai_d:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpsrad $7, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xe0,0x07]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_psrai_w(<8 x i16> %a0) {
-; AVX-LABEL: test_x86_sse2_psrai_w:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpsraw $7, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xe0,0x07]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_psrai_w:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpsraw $7, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xe0,0x07]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32) nounwind readnone
-
-
-define <4 x i32> @test_x86_sse2_psrl_d(<4 x i32> %a0, <4 x i32> %a1) {
-; AVX-LABEL: test_x86_sse2_psrl_d:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpsrld %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd2,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_psrl_d:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpsrld %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd2,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrl_q(<2 x i64> %a0, <2 x i64> %a1) {
-; AVX-LABEL: test_x86_sse2_psrl_q:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd3,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_psrl_q:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpsrlq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd3,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_psrl_w(<8 x i16> %a0, <8 x i16> %a1) {
-; AVX-LABEL: test_x86_sse2_psrl_w:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd1,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_psrl_w:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <4 x i32> @test_x86_sse2_psrli_d(<4 x i32> %a0) {
-; AVX-LABEL: test_x86_sse2_psrli_d:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpsrld $7, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x72,0xd0,0x07]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_psrli_d:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpsrld $7, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x72,0xd0,0x07]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %a0, i32 7) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse2_psrli_q(<2 x i64> %a0) {
-; AVX-LABEL: test_x86_sse2_psrli_q:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpsrlq $7, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x73,0xd0,0x07]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_psrli_q:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpsrlq $7, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x73,0xd0,0x07]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_psrli_w(<8 x i16> %a0) {
-; AVX-LABEL: test_x86_sse2_psrli_w:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpsrlw $7, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x71,0xd0,0x07]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_psrli_w:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpsrlw $7, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x71,0xd0,0x07]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %a0, i32 7) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse2_psubs_b(<16 x i8> %a0, <16 x i8> %a1) {
-; AVX-LABEL: test_x86_sse2_psubs_b:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe8,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_psubs_b:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_psubs_w(<8 x i16> %a0, <8 x i16> %a1) {
-; AVX-LABEL: test_x86_sse2_psubs_w:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe9,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_psubs_w:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) {
-; AVX-LABEL: test_x86_sse2_psubus_b:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd8,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_psubus_b:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd8,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse2_psubus_w(<8 x i16> %a0, <8 x i16> %a1) {
-; AVX-LABEL: test_x86_sse2_psubus_w:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd9,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_psubus_w:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd9,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_sqrt_pd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
-
-
-define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse2_sqrt_sd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double>) nounwind readnone
-
-
-define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) {
-; AVX-LABEL: test_x86_sse2_ucomieq_sd:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vucomisd %xmm1, %xmm0 ## encoding: [0xc5,0xf9,0x2e,0xc1]
-; AVX-NEXT:    setnp %al ## encoding: [0x0f,0x9b,0xc0]
-; AVX-NEXT:    sete %cl ## encoding: [0x0f,0x94,0xc1]
-; AVX-NEXT:    andb %al, %cl ## encoding: [0x20,0xc1]
-; AVX-NEXT:    movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_ucomieq_sd:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vucomisd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2e,0xc1]
-; AVX512VL-NEXT:    setnp %al ## encoding: [0x0f,0x9b,0xc0]
-; AVX512VL-NEXT:    sete %cl ## encoding: [0x0f,0x94,0xc1]
-; AVX512VL-NEXT:    andb %al, %cl ## encoding: [0x20,0xc1]
-; AVX512VL-NEXT:    movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse2.ucomieq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse2.ucomieq.sd(<2 x double>, <2 x double>) nounwind readnone
-
-
-define i32 @test_x86_sse2_ucomige_sd(<2 x double> %a0, <2 x double> %a1) {
-; AVX-LABEL: test_x86_sse2_ucomige_sd:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vucomisd %xmm1, %xmm0 ## encoding: [0xc5,0xf9,0x2e,0xc1]
-; AVX-NEXT:    setae %al ## encoding: [0x0f,0x93,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_ucomige_sd:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vucomisd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2e,0xc1]
-; AVX512VL-NEXT:    setae %al ## encoding: [0x0f,0x93,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse2.ucomige.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse2.ucomige.sd(<2 x double>, <2 x double>) nounwind readnone
-
-
-define i32 @test_x86_sse2_ucomigt_sd(<2 x double> %a0, <2 x double> %a1) {
-; AVX-LABEL: test_x86_sse2_ucomigt_sd:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vucomisd %xmm1, %xmm0 ## encoding: [0xc5,0xf9,0x2e,0xc1]
-; AVX-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_ucomigt_sd:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vucomisd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2e,0xc1]
-; AVX512VL-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse2.ucomigt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse2.ucomigt.sd(<2 x double>, <2 x double>) nounwind readnone
-
-
-define i32 @test_x86_sse2_ucomile_sd(<2 x double> %a0, <2 x double> %a1) {
-; AVX-LABEL: test_x86_sse2_ucomile_sd:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vucomisd %xmm0, %xmm1 ## encoding: [0xc5,0xf9,0x2e,0xc8]
-; AVX-NEXT:    setae %al ## encoding: [0x0f,0x93,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_ucomile_sd:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vucomisd %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2e,0xc8]
-; AVX512VL-NEXT:    setae %al ## encoding: [0x0f,0x93,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse2.ucomile.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse2.ucomile.sd(<2 x double>, <2 x double>) nounwind readnone
-
-
-define i32 @test_x86_sse2_ucomilt_sd(<2 x double> %a0, <2 x double> %a1) {
-; AVX-LABEL: test_x86_sse2_ucomilt_sd:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vucomisd %xmm0, %xmm1 ## encoding: [0xc5,0xf9,0x2e,0xc8]
-; AVX-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_ucomilt_sd:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vucomisd %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2e,0xc8]
-; AVX512VL-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse2.ucomilt.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse2.ucomilt.sd(<2 x double>, <2 x double>) nounwind readnone
-
-
-define i32 @test_x86_sse2_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) {
-; AVX-LABEL: test_x86_sse2_ucomineq_sd:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vucomisd %xmm1, %xmm0 ## encoding: [0xc5,0xf9,0x2e,0xc1]
-; AVX-NEXT:    setp %al ## encoding: [0x0f,0x9a,0xc0]
-; AVX-NEXT:    setne %cl ## encoding: [0x0f,0x95,0xc1]
-; AVX-NEXT:    orb %al, %cl ## encoding: [0x08,0xc1]
-; AVX-NEXT:    movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse2_ucomineq_sd:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vucomisd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x2e,0xc1]
-; AVX512VL-NEXT:    setp %al ## encoding: [0x0f,0x9a,0xc0]
-; AVX512VL-NEXT:    setne %cl ## encoding: [0x0f,0x95,0xc1]
-; AVX512VL-NEXT:    orb %al, %cl ## encoding: [0x08,0xc1]
-; AVX512VL-NEXT:    movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse2.ucomineq.sd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse2.ucomineq.sd(<2 x double>, <2 x double>) nounwind readnone
-
-
-define <2 x double> @test_x86_sse3_addsub_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse3_addsub_pd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vaddsubpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd0,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse3.addsub.pd(<2 x double>, <2 x double>) nounwind readnone
-
-
-define <4 x float> @test_x86_sse3_addsub_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse3_addsub_ps:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vaddsubps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0xd0,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float>, <4 x float>) nounwind readnone
-
-
-define <2 x double> @test_x86_sse3_hadd_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse3_hadd_pd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vhaddpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x7c,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse3.hadd.pd(<2 x double>, <2 x double>) nounwind readnone
-
-
-define <4 x float> @test_x86_sse3_hadd_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse3_hadd_ps:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vhaddps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x7c,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse3.hadd.ps(<4 x float>, <4 x float>) nounwind readnone
-
-
-define <2 x double> @test_x86_sse3_hsub_pd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse3_hsub_pd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vhsubpd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x7d,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse3.hsub.pd(<2 x double>, <2 x double>) nounwind readnone
-
-
-define <4 x float> @test_x86_sse3_hsub_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse3_hsub_ps:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vhsubps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x7d,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse3.hsub.ps(<4 x float>, <4 x float>) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse3_ldu_dq(i8* %a0) {
-; CHECK-LABEL: test_x86_sse3_ldu_dq:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT:    vlddqu (%eax), %xmm0 ## encoding: [0xc5,0xfb,0xf0,0x00]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse3.ldu.dq(i8* %a0) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8*) nounwind readonly
-
-
-define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
-; CHECK-LABEL: test_x86_sse41_blendvpd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vblendvpd %xmm2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4b,0xc1,0x20]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>) nounwind readnone
-
-
-define <4 x float> @test_x86_sse41_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
-; CHECK-LABEL: test_x86_sse41_blendvps:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vblendvps %xmm2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4a,0xc1,0x20]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>) nounwind readnone
-
-
-define <2 x double> @test_x86_sse41_dppd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse41_dppd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vdppd $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x41,0xc1,0x07]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse41.dppd(<2 x double> %a0, <2 x double> %a1, i8 7) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse41.dppd(<2 x double>, <2 x double>, i8) nounwind readnone
-
-
-define <4 x float> @test_x86_sse41_dpps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse41_dpps:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vdpps $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x40,0xc1,0x07]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse41.dpps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse41.dpps(<4 x float>, <4 x float>, i8) nounwind readnone
-
-
-define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
-; AVX-LABEL: test_x86_sse41_insertps:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vinsertps $21, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x15]
-; AVX-NEXT:    ## xmm0 = zero,xmm1[0],zero,xmm0[3]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse41_insertps:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vinsertps $21, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x21,0xc1,0x15]
-; AVX512VL-NEXT:    ## xmm0 = zero,xmm1[0],zero,xmm0[3]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 21) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i8) nounwind readnone
-
-
-
-define <8 x i16> @test_x86_sse41_mpsadbw(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse41_mpsadbw:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmpsadbw $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x42,0xc1,0x07]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse41.mpsadbw(<16 x i8>, <16 x i8>, i8) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse41_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
-; AVX-LABEL: test_x86_sse41_packusdw:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x2b,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse41_packusdw:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse41_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse41_pblendvb:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x4c,0xc1,0x20]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse41_phminposuw(<8 x i16> %a0) {
-; CHECK-LABEL: test_x86_sse41_phminposuw:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vphminposuw %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x41,0xc0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16> %a0) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse41_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
-; AVX-LABEL: test_x86_sse41_pmaxsb:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3c,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse41_pmaxsb:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3c,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-define <4 x i32> @test_x86_sse41_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
-; AVX-LABEL: test_x86_sse41_pmaxsd:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3d,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse41_pmaxsd:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpmaxsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3d,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <4 x i32> @test_x86_sse41_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
-; AVX-LABEL: test_x86_sse41_pmaxud:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3f,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse41_pmaxud:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpmaxud %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3f,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse41_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
-; AVX-LABEL: test_x86_sse41_pmaxuw:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3e,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse41_pmaxuw:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse41_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
-; AVX-LABEL: test_x86_sse41_pminsb:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x38,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse41_pminsb:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x38,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-define <4 x i32> @test_x86_sse41_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
-; AVX-LABEL: test_x86_sse41_pminsd:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpminsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x39,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse41_pminsd:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpminsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x39,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <4 x i32> @test_x86_sse41_pminud(<4 x i32> %a0, <4 x i32> %a1) {
-; AVX-LABEL: test_x86_sse41_pminud:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpminud %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3b,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse41_pminud:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpminud %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3b,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <8 x i16> @test_x86_sse41_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
-; AVX-LABEL: test_x86_sse41_pminuw:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpminuw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x3a,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse41_pminuw:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpminuw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3a,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <2 x i64> @test_x86_sse41_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
-; AVX-LABEL: test_x86_sse41_pmuldq:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x28,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse41_pmuldq:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpmuldq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x28,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1) ; <<2 x i64>> [#uses=1]
-  ret <2 x i64> %res
-}
-declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define i32 @test_x86_sse41_ptestc(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_sse41_ptestc:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1]
-; CHECK-NEXT:    sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; CHECK-NEXT:    andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse41.ptestc(<2 x i64>, <2 x i64>) nounwind readnone
-
-
-define i32 @test_x86_sse41_ptestnzc(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_sse41_ptestnzc:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT:    vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1]
-; CHECK-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse41.ptestnzc(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
-
-
-define i32 @test_x86_sse41_ptestz(<2 x i64> %a0, <2 x i64> %a1) {
-; CHECK-LABEL: test_x86_sse41_ptestz:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT:    vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1]
-; CHECK-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
-
-
-define <2 x double> @test_x86_sse41_round_pd(<2 x double> %a0) {
-; CHECK-LABEL: test_x86_sse41_round_pd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vroundpd $7, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x09,0xc0,0x07]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %a0, i32 7) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse41.round.pd(<2 x double>, i32) nounwind readnone
-
-
-define <4 x float> @test_x86_sse41_round_ps(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse41_round_ps:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vroundps $7, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x08,0xc0,0x07]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %a0, i32 7) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse41.round.ps(<4 x float>, i32) nounwind readnone
-
-
-define <2 x double> @test_x86_sse41_round_sd(<2 x double> %a0, <2 x double> %a1) {
-; CHECK-LABEL: test_x86_sse41_round_sd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vroundsd $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0b,0xc1,0x07]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a0, <2 x double> %a1, i32 7) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse41.round.sd(<2 x double>, <2 x double>, i32) nounwind readnone
-
-
-define <4 x float> @test_x86_sse41_round_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse41_round_ss:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vroundss $7, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0a,0xc1,0x07]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a0, <4 x float> %a1, i32 7) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse41.round.ss(<4 x float>, <4 x float>, i32) nounwind readnone
-
-
-define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestri128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
-; CHECK-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; CHECK-NEXT:    vpcmpestri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0xc1,0x07]
-; CHECK-NEXT:    movl %ecx, %eax ## encoding: [0x89,0xc8]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse42.pcmpestri128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-
-
-define i32 @test_x86_sse42_pcmpestri128_load(<16 x i8>* %a0, <16 x i8>* %a2) {
-; AVX-LABEL: test_x86_sse42_pcmpestri128_load:
-; AVX:       ## BB#0:
-; AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
-; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX-NEXT:    vmovdqa (%eax), %xmm0 ## encoding: [0xc5,0xf9,0x6f,0x00]
-; AVX-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
-; AVX-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; AVX-NEXT:    vpcmpestri $7, (%ecx), %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0x01,0x07]
-; AVX-NEXT:    movl %ecx, %eax ## encoding: [0x89,0xc8]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse42_pcmpestri128_load:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
-; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT:    vmovdqu (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x00]
-; AVX512VL-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
-; AVX512VL-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; AVX512VL-NEXT:    vpcmpestri $7, (%ecx), %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0x01,0x07]
-; AVX512VL-NEXT:    movl %ecx, %eax ## encoding: [0x89,0xc8]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %1 = load <16 x i8>, <16 x i8>* %a0
-  %2 = load <16 x i8>, <16 x i8>* %a2
-  %res = call i32 @llvm.x86.sse42.pcmpestri128(<16 x i8> %1, i32 7, <16 x i8> %2, i32 7, i8 7) ; <i32> [#uses=1]
-  ret i32 %res
-}
-
-
-define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
-; CHECK-LABEL: test_x86_sse42_pcmpestria128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    pushl %ebx ## encoding: [0x53]
-; CHECK-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
-; CHECK-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; CHECK-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
-; CHECK-NEXT:    vpcmpestri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0xc1,0x07]
-; CHECK-NEXT:    seta %bl ## encoding: [0x0f,0x97,0xc3]
-; CHECK-NEXT:    movl %ebx, %eax ## encoding: [0x89,0xd8]
-; CHECK-NEXT:    popl %ebx ## encoding: [0x5b]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse42.pcmpestria128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse42.pcmpestria128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-
-
-define i32 @test_x86_sse42_pcmpestric128(<16 x i8> %a0, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestric128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
-; CHECK-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; CHECK-NEXT:    vpcmpestri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0xc1,0x07]
-; CHECK-NEXT:    sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; CHECK-NEXT:    andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse42.pcmpestric128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-
-
-define i32 @test_x86_sse42_pcmpestrio128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
-; CHECK-LABEL: test_x86_sse42_pcmpestrio128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    pushl %ebx ## encoding: [0x53]
-; CHECK-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
-; CHECK-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; CHECK-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
-; CHECK-NEXT:    vpcmpestri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0xc1,0x07]
-; CHECK-NEXT:    seto %bl ## encoding: [0x0f,0x90,0xc3]
-; CHECK-NEXT:    movl %ebx, %eax ## encoding: [0x89,0xd8]
-; CHECK-NEXT:    popl %ebx ## encoding: [0x5b]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse42.pcmpestrio128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-
-
-define i32 @test_x86_sse42_pcmpestris128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
-; CHECK-LABEL: test_x86_sse42_pcmpestris128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    pushl %ebx ## encoding: [0x53]
-; CHECK-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
-; CHECK-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; CHECK-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
-; CHECK-NEXT:    vpcmpestri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0xc1,0x07]
-; CHECK-NEXT:    sets %bl ## encoding: [0x0f,0x98,0xc3]
-; CHECK-NEXT:    movl %ebx, %eax ## encoding: [0x89,0xd8]
-; CHECK-NEXT:    popl %ebx ## encoding: [0x5b]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse42.pcmpestris128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse42.pcmpestris128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-
-
-define i32 @test_x86_sse42_pcmpestriz128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
-; CHECK-LABEL: test_x86_sse42_pcmpestriz128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    pushl %ebx ## encoding: [0x53]
-; CHECK-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
-; CHECK-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; CHECK-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
-; CHECK-NEXT:    vpcmpestri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0xc1,0x07]
-; CHECK-NEXT:    sete %bl ## encoding: [0x0f,0x94,0xc3]
-; CHECK-NEXT:    movl %ebx, %eax ## encoding: [0x89,0xd8]
-; CHECK-NEXT:    popl %ebx ## encoding: [0x5b]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse42.pcmpestriz128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse42_pcmpestrm128(<16 x i8> %a0, <16 x i8> %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestrm128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
-; CHECK-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; CHECK-NEXT:    vpcmpestrm $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x60,0xc1,0x07]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse42_pcmpestrm128_load(<16 x i8> %a0, <16 x i8>* %a2) {
-; CHECK-LABEL: test_x86_sse42_pcmpestrm128_load:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
-; CHECK-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
-; CHECK-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
-; CHECK-NEXT:    vpcmpestrm $7, (%ecx), %xmm0 ## encoding: [0xc4,0xe3,0x79,0x60,0x01,0x07]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %1 = load <16 x i8>, <16 x i8>* %a2
-  %res = call <16 x i8> @llvm.x86.sse42.pcmpestrm128(<16 x i8> %a0, i32 7, <16 x i8> %1, i32 7, i8 7) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-
-
-define i32 @test_x86_sse42_pcmpistri128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistri128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpistri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0xc1,0x07]
-; CHECK-NEXT:    movl %ecx, %eax ## encoding: [0x89,0xc8]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse42.pcmpistri128(<16 x i8>, <16 x i8>, i8) nounwind readnone
-
-
-define i32 @test_x86_sse42_pcmpistri128_load(<16 x i8>* %a0, <16 x i8>* %a1) {
-; AVX-LABEL: test_x86_sse42_pcmpistri128_load:
-; AVX:       ## BB#0:
-; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
-; AVX-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
-; AVX-NEXT:    vmovdqa (%ecx), %xmm0 ## encoding: [0xc5,0xf9,0x6f,0x01]
-; AVX-NEXT:    vpcmpistri $7, (%eax), %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0x00,0x07]
-; AVX-NEXT:    movl %ecx, %eax ## encoding: [0x89,0xc8]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse42_pcmpistri128_load:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
-; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
-; AVX512VL-NEXT:    vmovdqu (%ecx), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x01]
-; AVX512VL-NEXT:    vpcmpistri $7, (%eax), %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0x00,0x07]
-; AVX512VL-NEXT:    movl %ecx, %eax ## encoding: [0x89,0xc8]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %1 = load <16 x i8>, <16 x i8>* %a0
-  %2 = load <16 x i8>, <16 x i8>* %a1
-  %res = call i32 @llvm.x86.sse42.pcmpistri128(<16 x i8> %1, <16 x i8> %2, i8 7) ; <i32> [#uses=1]
-  ret i32 %res
-}
-
-
-define i32 @test_x86_sse42_pcmpistria128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistria128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT:    vpcmpistri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0xc1,0x07]
-; CHECK-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse42.pcmpistria128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse42.pcmpistria128(<16 x i8>, <16 x i8>, i8) nounwind readnone
-
-
-define i32 @test_x86_sse42_pcmpistric128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistric128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpistri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0xc1,0x07]
-; CHECK-NEXT:    sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; CHECK-NEXT:    andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse42.pcmpistric128(<16 x i8>, <16 x i8>, i8) nounwind readnone
-
-
-define i32 @test_x86_sse42_pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistrio128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT:    vpcmpistri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0xc1,0x07]
-; CHECK-NEXT:    seto %al ## encoding: [0x0f,0x90,0xc0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse42.pcmpistrio128(<16 x i8>, <16 x i8>, i8) nounwind readnone
-
-
-define i32 @test_x86_sse42_pcmpistris128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistris128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT:    vpcmpistri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0xc1,0x07]
-; CHECK-NEXT:    sets %al ## encoding: [0x0f,0x98,0xc0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse42.pcmpistris128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse42.pcmpistris128(<16 x i8>, <16 x i8>, i8) nounwind readnone
-
-
-define i32 @test_x86_sse42_pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistriz128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; CHECK-NEXT:    vpcmpistri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0xc1,0x07]
-; CHECK-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse42.pcmpistriz128(<16 x i8>, <16 x i8>, i8) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse42_pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistrm128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpistrm $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x62,0xc1,0x07]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8>, <16 x i8>, i8) nounwind readnone
-
-
-define <16 x i8> @test_x86_sse42_pcmpistrm128_load(<16 x i8> %a0, <16 x i8>* %a1) {
-; CHECK-LABEL: test_x86_sse42_pcmpistrm128_load:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT:    vpcmpistrm $7, (%eax), %xmm0 ## encoding: [0xc4,0xe3,0x79,0x62,0x00,0x07]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %1 = load <16 x i8>, <16 x i8>* %a1
-  %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %1, i8 7) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-
-
-define <4 x float> @test_x86_sse_cmp_ps(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_cmp_ps:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vcmpordps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc2,0xc1,0x07]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse.cmp.ps(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse.cmp.ps(<4 x float>, <4 x float>, i8) nounwind readnone
-
-
-define <4 x float> @test_x86_sse_cmp_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_cmp_ss:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vcmpordss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0xc2,0xc1,0x07]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse.cmp.ss(<4 x float> %a0, <4 x float> %a1, i8 7) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse.cmp.ss(<4 x float>, <4 x float>, i8) nounwind readnone
-
-
-define i32 @test_x86_sse_comieq_ss(<4 x float> %a0, <4 x float> %a1) {
-; AVX-LABEL: test_x86_sse_comieq_ss:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1]
-; AVX-NEXT:    setnp %al ## encoding: [0x0f,0x9b,0xc0]
-; AVX-NEXT:    sete %cl ## encoding: [0x0f,0x94,0xc1]
-; AVX-NEXT:    andb %al, %cl ## encoding: [0x20,0xc1]
-; AVX-NEXT:    movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse_comieq_ss:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
-; AVX512VL-NEXT:    setnp %al ## encoding: [0x0f,0x9b,0xc0]
-; AVX512VL-NEXT:    sete %cl ## encoding: [0x0f,0x94,0xc1]
-; AVX512VL-NEXT:    andb %al, %cl ## encoding: [0x20,0xc1]
-; AVX512VL-NEXT:    movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse.comieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse.comieq.ss(<4 x float>, <4 x float>) nounwind readnone
-
-
-define i32 @test_x86_sse_comige_ss(<4 x float> %a0, <4 x float> %a1) {
-; AVX-LABEL: test_x86_sse_comige_ss:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1]
-; AVX-NEXT:    setae %al ## encoding: [0x0f,0x93,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse_comige_ss:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
-; AVX512VL-NEXT:    setae %al ## encoding: [0x0f,0x93,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse.comige.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse.comige.ss(<4 x float>, <4 x float>) nounwind readnone
-
-
-define i32 @test_x86_sse_comigt_ss(<4 x float> %a0, <4 x float> %a1) {
-; AVX-LABEL: test_x86_sse_comigt_ss:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1]
-; AVX-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse_comigt_ss:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
-; AVX512VL-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse.comigt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse.comigt.ss(<4 x float>, <4 x float>) nounwind readnone
-
-
-define i32 @test_x86_sse_comile_ss(<4 x float> %a0, <4 x float> %a1) {
-; AVX-LABEL: test_x86_sse_comile_ss:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vcomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2f,0xc8]
-; AVX-NEXT:    setae %al ## encoding: [0x0f,0x93,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse_comile_ss:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vcomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8]
-; AVX512VL-NEXT:    setae %al ## encoding: [0x0f,0x93,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse.comile.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse.comile.ss(<4 x float>, <4 x float>) nounwind readnone
-
-
-define i32 @test_x86_sse_comilt_ss(<4 x float> %a0, <4 x float> %a1) {
-; AVX-LABEL: test_x86_sse_comilt_ss:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vcomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2f,0xc8]
-; AVX-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse_comilt_ss:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vcomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc8]
-; AVX512VL-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse.comilt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse.comilt.ss(<4 x float>, <4 x float>) nounwind readnone
-
-
-define i32 @test_x86_sse_comineq_ss(<4 x float> %a0, <4 x float> %a1) {
-; AVX-LABEL: test_x86_sse_comineq_ss:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vcomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2f,0xc1]
-; AVX-NEXT:    setp %al ## encoding: [0x0f,0x9a,0xc0]
-; AVX-NEXT:    setne %cl ## encoding: [0x0f,0x95,0xc1]
-; AVX-NEXT:    orb %al, %cl ## encoding: [0x08,0xc1]
-; AVX-NEXT:    movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse_comineq_ss:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vcomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2f,0xc1]
-; AVX512VL-NEXT:    setp %al ## encoding: [0x0f,0x9a,0xc0]
-; AVX512VL-NEXT:    setne %cl ## encoding: [0x0f,0x95,0xc1]
-; AVX512VL-NEXT:    orb %al, %cl ## encoding: [0x08,0xc1]
-; AVX512VL-NEXT:    movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse.comineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse.comineq.ss(<4 x float>, <4 x float>) nounwind readnone
-
-
-define <4 x float> @test_x86_sse_cvtsi2ss(<4 x float> %a0) {
-; AVX-LABEL: test_x86_sse_cvtsi2ss:
-; AVX:       ## BB#0:
-; AVX-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
-; AVX-NEXT:    vcvtsi2ssl %eax, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x2a,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse_cvtsi2ss:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
-; AVX512VL-NEXT:    vcvtsi2ssl %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2a,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float> %a0, i32 7) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse.cvtsi2ss(<4 x float>, i32) nounwind readnone
-
-
-define i32 @test_x86_sse_cvtss2si(<4 x float> %a0) {
-; AVX-LABEL: test_x86_sse_cvtss2si:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vcvtss2si %xmm0, %eax ## encoding: [0xc5,0xfa,0x2d,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse_cvtss2si:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vcvtss2si %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2d,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse.cvtss2si(<4 x float> %a0) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse.cvtss2si(<4 x float>) nounwind readnone
-
-
-define i32 @test_x86_sse_cvttss2si(<4 x float> %a0) {
-; AVX-LABEL: test_x86_sse_cvttss2si:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vcvttss2si %xmm0, %eax ## encoding: [0xc5,0xfa,0x2c,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse_cvttss2si:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vcvttss2si %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x2c,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse.cvttss2si(<4 x float> %a0) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse.cvttss2si(<4 x float>) nounwind readnone
-
-
-define void @test_x86_sse_ldmxcsr(i8* %a0) {
-; CHECK-LABEL: test_x86_sse_ldmxcsr:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT:    vldmxcsr (%eax) ## encoding: [0xc5,0xf8,0xae,0x10]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  call void @llvm.x86.sse.ldmxcsr(i8* %a0)
-  ret void
-}
-declare void @llvm.x86.sse.ldmxcsr(i8*) nounwind
-
-
-
-define <4 x float> @test_x86_sse_max_ps(<4 x float> %a0, <4 x float> %a1) {
-; AVX-LABEL: test_x86_sse_max_ps:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x5f,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse_max_ps:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>) nounwind readnone
-
-
-define <4 x float> @test_x86_sse_max_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_max_ss:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5f,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse.max.ss(<4 x float>, <4 x float>) nounwind readnone
-
-
-define <4 x float> @test_x86_sse_min_ps(<4 x float> %a0, <4 x float> %a1) {
-; AVX-LABEL: test_x86_sse_min_ps:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vminps %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x5d,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse_min_ps:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vminps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5d,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>) nounwind readnone
-
-
-define <4 x float> @test_x86_sse_min_ss(<4 x float> %a0, <4 x float> %a1) {
-; CHECK-LABEL: test_x86_sse_min_ss:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vminss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5d,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse.min.ss(<4 x float>, <4 x float>) nounwind readnone
-
-
-define i32 @test_x86_sse_movmsk_ps(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_movmsk_ps:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovmskps %xmm0, %eax ## encoding: [0xc5,0xf8,0x50,0xc0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %a0) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>) nounwind readnone
-
-
-
-define <4 x float> @test_x86_sse_rcp_ps(<4 x float> %a0) {
-; AVX-LABEL: test_x86_sse_rcp_ps:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vrcpps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x53,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse_rcp_ps:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vrcp14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4c,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse.rcp.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse.rcp.ps(<4 x float>) nounwind readnone
-
-
-define <4 x float> @test_x86_sse_rcp_ss(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_rcp_ss:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vrcpss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x53,0xc0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse.rcp.ss(<4 x float>) nounwind readnone
-
-
-define <4 x float> @test_x86_sse_rsqrt_ps(<4 x float> %a0) {
-; AVX-LABEL: test_x86_sse_rsqrt_ps:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vrsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x52,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse_rsqrt_ps:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vrsqrt14ps %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x4e,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse.rsqrt.ps(<4 x float>) nounwind readnone
-
-
-define <4 x float> @test_x86_sse_rsqrt_ss(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_rsqrt_ss:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vrsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x52,0xc0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
-
-
-define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_sqrt_ps:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
-
-
-define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) {
-; CHECK-LABEL: test_x86_sse_sqrt_ss:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vsqrtss %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x51,0xc0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse.sqrt.ss(<4 x float>) nounwind readnone
-
-
-define void @test_x86_sse_stmxcsr(i8* %a0) {
-; CHECK-LABEL: test_x86_sse_stmxcsr:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT:    vstmxcsr (%eax) ## encoding: [0xc5,0xf8,0xae,0x18]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  call void @llvm.x86.sse.stmxcsr(i8* %a0)
-  ret void
-}
-declare void @llvm.x86.sse.stmxcsr(i8*) nounwind
-
-
-define i32 @test_x86_sse_ucomieq_ss(<4 x float> %a0, <4 x float> %a1) {
-; AVX-LABEL: test_x86_sse_ucomieq_ss:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1]
-; AVX-NEXT:    setnp %al ## encoding: [0x0f,0x9b,0xc0]
-; AVX-NEXT:    sete %cl ## encoding: [0x0f,0x94,0xc1]
-; AVX-NEXT:    andb %al, %cl ## encoding: [0x20,0xc1]
-; AVX-NEXT:    movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse_ucomieq_ss:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
-; AVX512VL-NEXT:    setnp %al ## encoding: [0x0f,0x9b,0xc0]
-; AVX512VL-NEXT:    sete %cl ## encoding: [0x0f,0x94,0xc1]
-; AVX512VL-NEXT:    andb %al, %cl ## encoding: [0x20,0xc1]
-; AVX512VL-NEXT:    movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse.ucomieq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse.ucomieq.ss(<4 x float>, <4 x float>) nounwind readnone
-
-
-define i32 @test_x86_sse_ucomige_ss(<4 x float> %a0, <4 x float> %a1) {
-; AVX-LABEL: test_x86_sse_ucomige_ss:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1]
-; AVX-NEXT:    setae %al ## encoding: [0x0f,0x93,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse_ucomige_ss:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
-; AVX512VL-NEXT:    setae %al ## encoding: [0x0f,0x93,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse.ucomige.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse.ucomige.ss(<4 x float>, <4 x float>) nounwind readnone
-
-
-define i32 @test_x86_sse_ucomigt_ss(<4 x float> %a0, <4 x float> %a1) {
-; AVX-LABEL: test_x86_sse_ucomigt_ss:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1]
-; AVX-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse_ucomigt_ss:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
-; AVX512VL-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse.ucomigt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse.ucomigt.ss(<4 x float>, <4 x float>) nounwind readnone
-
-
-define i32 @test_x86_sse_ucomile_ss(<4 x float> %a0, <4 x float> %a1) {
-; AVX-LABEL: test_x86_sse_ucomile_ss:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vucomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2e,0xc8]
-; AVX-NEXT:    setae %al ## encoding: [0x0f,0x93,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse_ucomile_ss:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vucomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8]
-; AVX512VL-NEXT:    setae %al ## encoding: [0x0f,0x93,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse.ucomile.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse.ucomile.ss(<4 x float>, <4 x float>) nounwind readnone
-
-
-define i32 @test_x86_sse_ucomilt_ss(<4 x float> %a0, <4 x float> %a1) {
-; AVX-LABEL: test_x86_sse_ucomilt_ss:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vucomiss %xmm0, %xmm1 ## encoding: [0xc5,0xf8,0x2e,0xc8]
-; AVX-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse_ucomilt_ss:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vucomiss %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc8]
-; AVX512VL-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse.ucomilt.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse.ucomilt.ss(<4 x float>, <4 x float>) nounwind readnone
-
-
-define i32 @test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) {
-; AVX-LABEL: test_x86_sse_ucomineq_ss:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vucomiss %xmm1, %xmm0 ## encoding: [0xc5,0xf8,0x2e,0xc1]
-; AVX-NEXT:    setp %al ## encoding: [0x0f,0x9a,0xc0]
-; AVX-NEXT:    setne %cl ## encoding: [0x0f,0x95,0xc1]
-; AVX-NEXT:    orb %al, %cl ## encoding: [0x08,0xc1]
-; AVX-NEXT:    movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_sse_ucomineq_ss:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vucomiss %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x2e,0xc1]
-; AVX512VL-NEXT:    setp %al ## encoding: [0x0f,0x9a,0xc0]
-; AVX512VL-NEXT:    setne %cl ## encoding: [0x0f,0x95,0xc1]
-; AVX512VL-NEXT:    orb %al, %cl ## encoding: [0x08,0xc1]
-; AVX512VL-NEXT:    movzbl %cl, %eax ## encoding: [0x0f,0xb6,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call i32 @llvm.x86.sse.ucomineq.ss(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
-  ret i32 %res
-}
-declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnone
-
-
-define <16 x i8> @test_x86_ssse3_pabs_b_128(<16 x i8> %a0) {
-; AVX-LABEL: test_x86_ssse3_pabs_b_128:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpabsb %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x1c,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_ssse3_pabs_b_128:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpabsb %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1c,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8>) nounwind readnone
-
-
-define <4 x i32> @test_x86_ssse3_pabs_d_128(<4 x i32> %a0) {
-; AVX-LABEL: test_x86_ssse3_pabs_d_128:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpabsd %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x1e,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_ssse3_pabs_d_128:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpabsd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1e,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %a0) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone
-
-
-define <8 x i16> @test_x86_ssse3_pabs_w_128(<8 x i16> %a0) {
-; AVX-LABEL: test_x86_ssse3_pabs_w_128:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpabsw %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x1d,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_ssse3_pabs_w_128:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpabsw %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1d,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a0) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16>) nounwind readnone
-
-
-define <4 x i32> @test_x86_ssse3_phadd_d_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_ssse3_phadd_d_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vphaddd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x02,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.ssse3.phadd.d.128(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <8 x i16> @test_x86_ssse3_phadd_sw_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_ssse3_phadd_sw_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vphaddsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x03,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <8 x i16> @test_x86_ssse3_phadd_w_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_ssse3_phadd_w_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vphaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x01,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.ssse3.phadd.w.128(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <4 x i32> @test_x86_ssse3_phsub_d_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_ssse3_phsub_d_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vphsubd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x06,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.ssse3.phsub.d.128(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <8 x i16> @test_x86_ssse3_phsub_sw_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_ssse3_phsub_sw_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vphsubsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x07,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.ssse3.phsub.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <8 x i16> @test_x86_ssse3_phsub_w_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_ssse3_phsub_w_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vphsubw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x05,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.ssse3.phsub.w.128(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <8 x i16> @test_x86_ssse3_pmadd_ub_sw_128(<16 x i8> %a0, <16 x i8> %a1) {
-; AVX-LABEL: test_x86_ssse3_pmadd_ub_sw_128:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x04,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_ssse3_pmadd_ub_sw_128:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x04,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-; Make sure we don't commute this operation.
-define <8 x i16> @test_x86_ssse3_pmadd_ub_sw_128_load_op0(<16 x i8>* %ptr, <16 x i8> %a1) {
-; AVX-LABEL: test_x86_ssse3_pmadd_ub_sw_128_load_op0:
-; AVX:       ## BB#0:
-; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX-NEXT:    vmovdqa (%eax), %xmm1 ## encoding: [0xc5,0xf9,0x6f,0x08]
-; AVX-NEXT:    vpmaddubsw %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0x04,0xc0]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_ssse3_pmadd_ub_sw_128_load_op0:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT:    vmovdqu (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x08]
-; AVX512VL-NEXT:    vpmaddubsw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x04,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %a0 = load <16 x i8>, <16 x i8>* %ptr
-  %res = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-
-
-define <8 x i16> @test_x86_ssse3_pmul_hr_sw_128(<8 x i16> %a0, <8 x i16> %a1) {
-; AVX-LABEL: test_x86_ssse3_pmul_hr_sw_128:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpmulhrsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0b,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_ssse3_pmul_hr_sw_128:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpmulhrsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0b,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.ssse3.pmul.hr.sw.128(<8 x i16>, <8 x i16>) nounwind readnone
-
-
-define <16 x i8> @test_x86_ssse3_pshuf_b_128(<16 x i8> %a0, <16 x i8> %a1) {
-; AVX-LABEL: test_x86_ssse3_pshuf_b_128:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vpshufb %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x00,0xc1]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_ssse3_pshuf_b_128:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpshufb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xc1]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-define <16 x i8> @test_x86_ssse3_psign_b_128(<16 x i8> %a0, <16 x i8> %a1) {
-; CHECK-LABEL: test_x86_ssse3_psign_b_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpsignb %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x08,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1]
-  ret <16 x i8> %res
-}
-declare <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8>, <16 x i8>) nounwind readnone
-
-
-define <4 x i32> @test_x86_ssse3_psign_d_128(<4 x i32> %a0, <4 x i32> %a1) {
-; CHECK-LABEL: test_x86_ssse3_psign_d_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpsignd %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0a,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %a0, <4 x i32> %a1) ; <<4 x i32>> [#uses=1]
-  ret <4 x i32> %res
-}
-declare <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32>, <4 x i32>) nounwind readnone
-
-
-define <8 x i16> @test_x86_ssse3_psign_w_128(<8 x i16> %a0, <8 x i16> %a1) {
-; CHECK-LABEL: test_x86_ssse3_psign_w_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpsignw %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x09,0xc1]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1]
-  ret <8 x i16> %res
-}
-declare <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16>, <8 x i16>) nounwind readnone
-
-
 define <4 x double> @test_x86_avx_addsub_pd_256(<4 x double> %a0, <4 x double> %a1) {
 ; CHECK-LABEL: test_x86_avx_addsub_pd_256:
 ; CHECK:       ## BB#0:
@@ -2773,6 +149,7 @@ define <4 x float> @test_x86_avx_cvt_pd2_ps_256(<4 x double> %a0) {
 ; AVX512VL-LABEL: test_x86_avx_cvt_pd2_ps_256:
 ; AVX512VL:       ## BB#0:
 ; AVX512VL-NEXT:    vcvtpd2ps %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x5a,0xc0]
+; AVX512VL-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; AVX512VL-NEXT:    retl ## encoding: [0xc3]
   %res = call <4 x float> @llvm.x86.avx.cvt.pd2.ps.256(<4 x double> %a0) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
@@ -2790,6 +167,7 @@ define <4 x i32> @test_x86_avx_cvt_pd2dq_256(<4 x double> %a0) {
 ; AVX512VL-LABEL: test_x86_avx_cvt_pd2dq_256:
 ; AVX512VL:       ## BB#0:
 ; AVX512VL-NEXT:    vcvtpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xff,0xe6,0xc0]
+; AVX512VL-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; AVX512VL-NEXT:    retl ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx.cvt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
   ret <4 x i32> %res
@@ -2834,6 +212,7 @@ define <4 x i32> @test_x86_avx_cvtt_pd2dq_256(<4 x double> %a0) {
 ; AVX512VL-LABEL: test_x86_avx_cvtt_pd2dq_256:
 ; AVX512VL:       ## BB#0:
 ; AVX512VL-NEXT:    vcvttpd2dq %ymm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe6,0xc0]
+; AVX512VL-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; AVX512VL-NEXT:    retl ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx.cvtt.pd2dq.256(<4 x double> %a0) ; <<4 x i32>> [#uses=1]
   ret <4 x i32> %res
@@ -2985,18 +364,12 @@ declare void @llvm.x86.avx.maskstore.pd(i8*, <2 x i64>, <2 x double>) nounwind
 
 
 define void @test_x86_avx_maskstore_pd_256(i8* %a0, <4 x i64> %mask, <4 x double> %a2) {
-; AVX-LABEL: test_x86_avx_maskstore_pd_256:
-; AVX:       ## BB#0:
-; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX-NEXT:    vmaskmovpd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2f,0x08]
-; AVX-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_maskstore_pd_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT:    vmaskmovpd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2f,0x08]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_maskstore_pd_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT:    vmaskmovpd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2f,0x08]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
   call void @llvm.x86.avx.maskstore.pd.256(i8* %a0, <4 x i64> %mask, <4 x double> %a2)
   ret void
 }
@@ -3016,18 +389,12 @@ declare void @llvm.x86.avx.maskstore.ps(i8*, <4 x i32>, <4 x float>) nounwind
 
 
 define void @test_x86_avx_maskstore_ps_256(i8* %a0, <8 x i32> %mask, <8 x float> %a2) {
-; AVX-LABEL: test_x86_avx_maskstore_ps_256:
-; AVX:       ## BB#0:
-; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX-NEXT:    vmaskmovps %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2e,0x08]
-; AVX-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_maskstore_ps_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT:    vmaskmovps %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2e,0x08]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_maskstore_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT:    vmaskmovps %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x2e,0x08]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
   call void @llvm.x86.avx.maskstore.ps.256(i8* %a0, <8 x i32> %mask, <8 x float> %a2)
   ret void
 }
@@ -3099,16 +466,11 @@ declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>) nounwind
 
 
 define i32 @test_x86_avx_movmsk_pd_256(<4 x double> %a0) {
-; AVX-LABEL: test_x86_avx_movmsk_pd_256:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vmovmskpd %ymm0, %eax ## encoding: [0xc5,0xfd,0x50,0xc0]
-; AVX-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_movmsk_pd_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vmovmskpd %ymm0, %eax ## encoding: [0xc5,0xfd,0x50,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_movmsk_pd_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovmskpd %ymm0, %eax ## encoding: [0xc5,0xfd,0x50,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx.movmsk.pd.256(<4 x double> %a0) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -3116,16 +478,11 @@ declare i32 @llvm.x86.avx.movmsk.pd.256(<4 x double>) nounwind readnone
 
 
 define i32 @test_x86_avx_movmsk_ps_256(<8 x float> %a0) {
-; AVX-LABEL: test_x86_avx_movmsk_ps_256:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vmovmskps %ymm0, %eax ## encoding: [0xc5,0xfc,0x50,0xc0]
-; AVX-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_movmsk_ps_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vmovmskps %ymm0, %eax ## encoding: [0xc5,0xfc,0x50,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_movmsk_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovmskps %ymm0, %eax ## encoding: [0xc5,0xfc,0x50,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %a0) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -3138,20 +495,13 @@ declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone
 
 
 define i32 @test_x86_avx_ptestc_256(<4 x i64> %a0, <4 x i64> %a1) {
-; AVX-LABEL: test_x86_avx_ptestc_256:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; AVX-NEXT:    sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; AVX-NEXT:    andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; AVX-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_ptestc_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; AVX512VL-NEXT:    sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; AVX512VL-NEXT:    andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_ptestc_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT:    vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
+; CHECK-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx.ptestc.256(<4 x i64> %a0, <4 x i64> %a1) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -3159,20 +509,13 @@ declare i32 @llvm.x86.avx.ptestc.256(<4 x i64>, <4 x i64>) nounwind readnone
 
 
 define i32 @test_x86_avx_ptestnzc_256(<4 x i64> %a0, <4 x i64> %a1) {
-; AVX-LABEL: test_x86_avx_ptestnzc_256:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; AVX-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_ptestnzc_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; AVX512VL-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_ptestnzc_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT:    vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
+; CHECK-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx.ptestnzc.256(<4 x i64> %a0, <4 x i64> %a1) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -3180,20 +523,13 @@ declare i32 @llvm.x86.avx.ptestnzc.256(<4 x i64>, <4 x i64>) nounwind readnone
 
 
 define i32 @test_x86_avx_ptestz_256(<4 x i64> %a0, <4 x i64> %a1) {
-; AVX-LABEL: test_x86_avx_ptestz_256:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; AVX-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
-; AVX-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_ptestz_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
-; AVX512VL-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_ptestz_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT:    vptest %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x17,0xc1]
+; CHECK-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx.ptestz.256(<4 x i64> %a0, <4 x i64> %a1) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -3410,9 +746,9 @@ declare <8 x float> @llvm.x86.avx.vpermilvar.ps.256(<8 x float>, <8 x i32>) noun
 define i32 @test_x86_avx_vtestc_pd(<2 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: test_x86_avx_vtestc_pd:
 ; CHECK:       ## BB#0:
+; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
 ; CHECK-NEXT:    vtestpd %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0f,0xc1]
-; CHECK-NEXT:    sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; CHECK-NEXT:    andl $1, %eax ## encoding: [0x83,0xe0,0x01]
+; CHECK-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
 ; CHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx.vtestc.pd(<2 x double> %a0, <2 x double> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -3421,20 +757,13 @@ declare i32 @llvm.x86.avx.vtestc.pd(<2 x double>, <2 x double>) nounwind readnon
 
 
 define i32 @test_x86_avx_vtestc_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; AVX-LABEL: test_x86_avx_vtestc_pd_256:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; AVX-NEXT:    sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; AVX-NEXT:    andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; AVX-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_vtestc_pd_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; AVX512VL-NEXT:    sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; AVX512VL-NEXT:    andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_vtestc_pd_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT:    vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
+; CHECK-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx.vtestc.pd.256(<4 x double> %a0, <4 x double> %a1) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -3444,9 +773,9 @@ declare i32 @llvm.x86.avx.vtestc.pd.256(<4 x double>, <4 x double>) nounwind rea
 define i32 @test_x86_avx_vtestc_ps(<4 x float> %a0, <4 x float> %a1) {
 ; CHECK-LABEL: test_x86_avx_vtestc_ps:
 ; CHECK:       ## BB#0:
+; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
 ; CHECK-NEXT:    vtestps %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x0e,0xc1]
-; CHECK-NEXT:    sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; CHECK-NEXT:    andl $1, %eax ## encoding: [0x83,0xe0,0x01]
+; CHECK-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
 ; CHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx.vtestc.ps(<4 x float> %a0, <4 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res
@@ -3455,20 +784,13 @@ declare i32 @llvm.x86.avx.vtestc.ps(<4 x float>, <4 x float>) nounwind readnone
 
 
 define i32 @test_x86_avx_vtestc_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; AVX-LABEL: test_x86_avx_vtestc_ps_256:
-; AVX:       ## BB#0:
-; AVX-NEXT:    vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; AVX-NEXT:    sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; AVX-NEXT:    andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; AVX-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_vtestc_ps_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; AVX512VL-NEXT:    sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; AVX512VL-NEXT:    andl $1, %eax ## encoding: [0x83,0xe0,0x01]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_vtestc_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT:    vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
+; CHECK-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx.vtestc.ps.256(<8 x float> %a0, <8 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -3489,20 +811,13 @@ declare i32 @llvm.x86.avx.vtestnzc.pd(<2 x double>, <2 x double>) nounwind readn
 
 
 define i32 @test_x86_avx_vtestnzc_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; AVX-LABEL: test_x86_avx_vtestnzc_pd_256:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; AVX-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_vtestnzc_pd_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; AVX512VL-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_vtestnzc_pd_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT:    vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
+; CHECK-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx.vtestnzc.pd.256(<4 x double> %a0, <4 x double> %a1) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -3523,20 +838,13 @@ declare i32 @llvm.x86.avx.vtestnzc.ps(<4 x float>, <4 x float>) nounwind readnon
 
 
 define i32 @test_x86_avx_vtestnzc_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; AVX-LABEL: test_x86_avx_vtestnzc_ps_256:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; AVX-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_vtestnzc_ps_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; AVX512VL-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_vtestnzc_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT:    vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
+; CHECK-NEXT:    seta %al ## encoding: [0x0f,0x97,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx.vtestnzc.ps.256(<8 x float> %a0, <8 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -3557,20 +865,13 @@ declare i32 @llvm.x86.avx.vtestz.pd(<2 x double>, <2 x double>) nounwind readnon
 
 
 define i32 @test_x86_avx_vtestz_pd_256(<4 x double> %a0, <4 x double> %a1) {
-; AVX-LABEL: test_x86_avx_vtestz_pd_256:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; AVX-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
-; AVX-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_vtestz_pd_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
-; AVX512VL-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_vtestz_pd_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT:    vtestpd %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0f,0xc1]
+; CHECK-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx.vtestz.pd.256(<4 x double> %a0, <4 x double> %a1) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -3591,20 +892,13 @@ declare i32 @llvm.x86.avx.vtestz.ps(<4 x float>, <4 x float>) nounwind readnone
 
 
 define i32 @test_x86_avx_vtestz_ps_256(<8 x float> %a0, <8 x float> %a1) {
-; AVX-LABEL: test_x86_avx_vtestz_ps_256:
-; AVX:       ## BB#0:
-; AVX-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX-NEXT:    vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; AVX-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
-; AVX-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_vtestz_ps_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
-; AVX512VL-NEXT:    vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
-; AVX512VL-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx_vtestz_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
+; CHECK-NEXT:    vtestps %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x0e,0xc1]
+; CHECK-NEXT:    sete %al ## encoding: [0x0f,0x94,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx.vtestz.ps.256(<8 x float> %a0, <8 x float> %a1) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -3632,114 +926,12 @@ define void @test_x86_avx_vzeroupper() {
 }
 declare void @llvm.x86.avx.vzeroupper() nounwind
 
-; Make sure instructions with no AVX equivalents, but are associated with SSEX feature flags still work
-
-define void @monitor(i8* %P, i32 %E, i32 %H) nounwind {
-; CHECK-LABEL: monitor:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx ## encoding: [0x8b,0x54,0x24,0x0c]
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT:    leal (%eax), %eax ## encoding: [0x8d,0x00]
-; CHECK-NEXT:    monitor ## encoding: [0x0f,0x01,0xc8]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  tail call void @llvm.x86.sse3.monitor(i8* %P, i32 %E, i32 %H)
-  ret void
-}
-declare void @llvm.x86.sse3.monitor(i8*, i32, i32) nounwind
-
-define void @mwait(i32 %E, i32 %H) nounwind {
-; CHECK-LABEL: mwait:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
-; CHECK-NEXT:    mwait ## encoding: [0x0f,0x01,0xc9]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  tail call void @llvm.x86.sse3.mwait(i32 %E, i32 %H)
-  ret void
-}
-declare void @llvm.x86.sse3.mwait(i32, i32) nounwind
-
-define void @sfence() nounwind {
-; CHECK-LABEL: sfence:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    sfence ## encoding: [0x0f,0xae,0xf8]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  tail call void @llvm.x86.sse.sfence()
-  ret void
-}
-declare void @llvm.x86.sse.sfence() nounwind
-
-define void @lfence() nounwind {
-; CHECK-LABEL: lfence:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    lfence ## encoding: [0x0f,0xae,0xe8]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  tail call void @llvm.x86.sse2.lfence()
-  ret void
-}
-declare void @llvm.x86.sse2.lfence() nounwind
-
-define void @mfence() nounwind {
-; CHECK-LABEL: mfence:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    mfence ## encoding: [0x0f,0xae,0xf0]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  tail call void @llvm.x86.sse2.mfence()
-  ret void
-}
-declare void @llvm.x86.sse2.mfence() nounwind
-
-define void @clflush(i8* %p) nounwind {
-; CHECK-LABEL: clflush:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT:    clflush (%eax) ## encoding: [0x0f,0xae,0x38]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  tail call void @llvm.x86.sse2.clflush(i8* %p)
-  ret void
-}
-declare void @llvm.x86.sse2.clflush(i8*) nounwind
-
-define i32 @crc32_32_8(i32 %a, i8 %b) nounwind {
-; CHECK-LABEL: crc32_32_8:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT:    crc32b {{[0-9]+}}(%esp), %eax ## encoding: [0xf2,0x0f,0x38,0xf0,0x44,0x24,0x08]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %tmp = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a, i8 %b)
-  ret i32 %tmp
-}
-declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
-
-define i32 @crc32_32_16(i32 %a, i16 %b) nounwind {
-; CHECK-LABEL: crc32_32_16:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT:    crc32w {{[0-9]+}}(%esp), %eax ## encoding: [0x66,0xf2,0x0f,0x38,0xf1,0x44,0x24,0x08]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %tmp = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a, i16 %b)
-  ret i32 %tmp
-}
-declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
-
-define i32 @crc32_32_32(i32 %a, i32 %b) nounwind {
-; CHECK-LABEL: crc32_32_32:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; CHECK-NEXT:    crc32l {{[0-9]+}}(%esp), %eax ## encoding: [0xf2,0x0f,0x38,0xf1,0x44,0x24,0x08]
-; CHECK-NEXT:    retl ## encoding: [0xc3]
-  %tmp = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a, i32 %b)
-  ret i32 %tmp
-}
-declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
-
 define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
 ; AVX-LABEL: movnt_dq:
 ; AVX:       ## BB#0:
 ; AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX-NEXT:    vpaddq LCPI247_0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd4,0x05,A,A,A,A]
-; AVX-NEXT:    ## fixup A - offset: 4, value: LCPI247_0, kind: FK_Data_4
+; AVX-NEXT:    vpaddq LCPI65_0, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xd4,0x05,A,A,A,A]
+; AVX-NEXT:    ## fixup A - offset: 4, value: LCPI65_0, kind: FK_Data_4
 ; AVX-NEXT:    vmovntdq %ymm0, (%eax) ## encoding: [0xc5,0xfd,0xe7,0x00]
 ; AVX-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; AVX-NEXT:    retl ## encoding: [0xc3]
@@ -3747,9 +939,10 @@ define void @movnt_dq(i8* %p, <2 x i64> %a1) nounwind {
 ; AVX512VL-LABEL: movnt_dq:
 ; AVX512VL:       ## BB#0:
 ; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT:    vpaddq LCPI247_0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0x05,A,A,A,A]
-; AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI247_0, kind: FK_Data_4
+; AVX512VL-NEXT:    vpaddq LCPI65_0, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0x05,A,A,A,A]
+; AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI65_0, kind: FK_Data_4
 ; AVX512VL-NEXT:    vmovntdq %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe7,0x00]
+; AVX512VL-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; AVX512VL-NEXT:    retl ## encoding: [0xc3]
   %a2 = add <2 x i64> %a1, <i64 1, i64 1>
   %a3 = shufflevector <2 x i64> %a2, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
@@ -3770,6 +963,7 @@ define void @movnt_ps(i8* %p, <8 x float> %a) nounwind {
 ; AVX512VL:       ## BB#0:
 ; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
 ; AVX512VL-NEXT:    vmovntps %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x2b,0x00]
+; AVX512VL-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; AVX512VL-NEXT:    retl ## encoding: [0xc3]
   tail call void @llvm.x86.avx.movnt.ps.256(i8* %p, <8 x float> %a) nounwind
   ret void
@@ -3793,6 +987,7 @@ define void @movnt_pd(i8* %p, <4 x double> %a1) nounwind {
 ; AVX512VL-NEXT:    vxorpd %ymm1, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x57,0xc9]
 ; AVX512VL-NEXT:    vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
 ; AVX512VL-NEXT:    vmovntpd %ymm0, (%eax) ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x2b,0x00]
+; AVX512VL-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
 ; AVX512VL-NEXT:    retl ## encoding: [0xc3]
   %a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0>
   tail call void @llvm.x86.avx.movnt.pd.256(i8* %p, <4 x double> %a2) nounwind
diff --git a/test/CodeGen/X86/avx-intrinsics-x86_64.ll b/test/CodeGen/X86/avx-intrinsics-x86_64.ll
index 252574d84d8ffeaed10027111efcdb58cbdef442..909c69cb9a1794314c58c8ce2aea0ec6589b1c2e 100644
--- a/test/CodeGen/X86/avx-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/avx-intrinsics-x86_64.ll
@@ -1,51 +1,45 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86-64 -mcpu=corei7 -mattr=avx | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86-64 -mcpu=corei7 -mattr=avx512vl | FileCheck %s
-
-define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) {
-  ; CHECK: vcvtsd2si
-  %res = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
-  ret i64 %res
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86-64 -mcpu=corei7 -mattr=avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86-64 -mcpu=corei7 -mattr=avx512vl | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512VL
+
+define <4 x double> @test_x86_avx_vzeroall(<4 x double> %a, <4 x double> %b) {
+; AVX-LABEL: test_x86_avx_vzeroall:
+; AVX:       ## BB#0:
+; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vmovupd %ymm0, -{{[0-9]+}}(%rsp) ## 32-byte Spill
+; AVX-NEXT:    vzeroall
+; AVX-NEXT:    vmovups -{{[0-9]+}}(%rsp), %ymm0 ## 32-byte Reload
+; AVX-NEXT:    retq
+;
+; AVX512VL-LABEL: test_x86_avx_vzeroall:
+; AVX512VL:       ## BB#0:
+; AVX512VL-NEXT:    vaddpd %ymm1, %ymm0, %ymm16
+; AVX512VL-NEXT:    vzeroall
+; AVX512VL-NEXT:    vmovapd %ymm16, %ymm0
+; AVX512VL-NEXT:    retq
+  %c = fadd <4 x double> %a, %b
+  call void @llvm.x86.avx.vzeroall()
+  ret <4 x double> %c
 }
-declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
-
-
-define <2 x double> @test_x86_sse2_cvtsi642sd(<2 x double> %a0, i64 %a1) {
-  ; CHECK: vcvtsi2sd
-  %res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
-
-
-define i64 @test_x86_sse2_cvttsd2si64(<2 x double> %a0) {
-  ; CHECK: vcvttsd2si
-  %res = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
-  ret i64 %res
-}
-declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
-
-
-define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) {
-  ; CHECK: vcvtss2si
-  %res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0) ; <i64> [#uses=1]
-  ret i64 %res
+declare void @llvm.x86.avx.vzeroall() nounwind
+
+define <4 x double> @test_x86_avx_vzeroupper(<4 x double> %a, <4 x double> %b) {
+; AVX-LABEL: test_x86_avx_vzeroupper:
+; AVX:       ## BB#0:
+; AVX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vmovupd %ymm0, -{{[0-9]+}}(%rsp) ## 32-byte Spill
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    vmovups -{{[0-9]+}}(%rsp), %ymm0 ## 32-byte Reload
+; AVX-NEXT:    retq
+;
+; AVX512VL-LABEL: test_x86_avx_vzeroupper:
+; AVX512VL:       ## BB#0:
+; AVX512VL-NEXT:    vaddpd %ymm1, %ymm0, %ymm16
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    vmovapd %ymm16, %ymm0
+; AVX512VL-NEXT:    retq
+  %c = fadd <4 x double> %a, %b
+  call void @llvm.x86.avx.vzeroupper()
+  ret <4 x double> %c
 }
-declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
-
-
-define <4 x float> @test_x86_sse_cvtsi642ss(<4 x float> %a0, i64 %a1) {
-  ; CHECK: vcvtsi2ss
-  %res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
-
-
-define i64 @test_x86_sse_cvttss2si64(<4 x float> %a0) {
-  ; CHECK: vcvttss2si
-  %res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0) ; <i64> [#uses=1]
-  ret i64 %res
-}
-declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
-
-
+declare void @llvm.x86.avx.vzeroupper() nounwind
diff --git a/test/CodeGen/X86/avx-shuffle-x86_32.ll b/test/CodeGen/X86/avx-shuffle-x86_32.ll
index 3fe0784c5201ec29ac5c46ab4dc5ba79f146e15e..6defe7efb941241ee85483788e8f760d28172d21 100644
--- a/test/CodeGen/X86/avx-shuffle-x86_32.ll
+++ b/test/CodeGen/X86/avx-shuffle-x86_32.ll
@@ -16,8 +16,7 @@ define <8 x i16> @test2(<4 x i16>* %v) nounwind {
 ; CHECK-LABEL: test2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; CHECK-NEXT:    retl
   %v9 = load <4 x i16>, <4 x i16> * %v, align 8
   %v10 = shufflevector <4 x i16> %v9, <4 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll
index 0cd236da24ac54a9ba114dde016d78aa782afd41..41ea2a8c3677e531c43513f546471e57d0fb50d7 100644
--- a/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/test/CodeGen/X86/avx-vbroadcast.ll
@@ -28,6 +28,40 @@ entry:
   ret <4 x i64> %vecinit6.i
 }
 
+define <4 x i64> @A2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
+; X32-LABEL: A2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl (%ecx), %edx
+; X32-NEXT:    movl 4(%ecx), %ecx
+; X32-NEXT:    movl %ecx, 4(%eax)
+; X32-NEXT:    movl %edx, (%eax)
+; X32-NEXT:    vmovd %edx, %xmm0
+; X32-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
+; X32-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
+; X32-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
+; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: A2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    vmovq %rax, %xmm0
+; X64-NEXT:    movq %rax, (%rsi)
+; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT:    retq
+entry:
+  %q = load i64, i64* %ptr, align 8
+  store i64 %q, i64* %ptr2, align 8 ; to create a chain to prevent broadcast
+  %vecinit.i = insertelement <4 x i64> undef, i64 %q, i32 0
+  %vecinit2.i = insertelement <4 x i64> %vecinit.i, i64 %q, i32 1
+  %vecinit4.i = insertelement <4 x i64> %vecinit2.i, i64 %q, i32 2
+  %vecinit6.i = insertelement <4 x i64> %vecinit4.i, i64 %q, i32 3
+  ret <4 x i64> %vecinit6.i
+}
+
 define <8 x i32> @B(i32* %ptr) nounwind uwtable readnone ssp {
 ; X32-LABEL: B:
 ; X32:       ## BB#0: ## %entry
@@ -48,6 +82,64 @@ entry:
   ret <8 x i32> %vecinit6.i
 }
 
+define <8 x i32> @B2(i32* %ptr) nounwind uwtable readnone ssp {
+; X32-LABEL: B2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vbroadcastss (%eax), %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: B2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vbroadcastss (%rdi), %ymm0
+; X64-NEXT:    retq
+entry:
+  %q = load i32, i32* %ptr, align 4
+  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+  %vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
+  %vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
+  %vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
+  %vecinit8.i = insertelement <8 x i32> %vecinit6.i, i32 %q, i32 4
+  %vecinit10.i = insertelement <8 x i32> %vecinit8.i, i32 %q, i32 5
+  %vecinit12.i = insertelement <8 x i32> %vecinit10.i, i32 %q, i32 6
+  %vecinit14.i = insertelement <8 x i32> %vecinit12.i, i32 %q, i32 7
+  ret <8 x i32> %vecinit14.i
+}
+
+define <8 x i32> @B3(i32* %ptr, i32* %ptr2) nounwind uwtable readnone ssp {
+; X32-LABEL: B3:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl (%ecx), %ecx
+; X32-NEXT:    vmovd %ecx, %xmm0
+; X32-NEXT:    movl %ecx, (%eax)
+; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: B3:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    vmovd %eax, %xmm0
+; X64-NEXT:    movl %eax, (%rsi)
+; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT:    retq
+entry:
+  %q = load i32, i32* %ptr, align 4
+  store i32 %q, i32* %ptr2, align 4 ; to create a chain to prevent broadcast
+  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+  %vecinit2.i = insertelement <8 x i32> %vecinit.i, i32 %q, i32 1
+  %vecinit4.i = insertelement <8 x i32> %vecinit2.i, i32 %q, i32 2
+  %vecinit6.i = insertelement <8 x i32> %vecinit4.i, i32 %q, i32 3
+  %vecinit8.i = insertelement <8 x i32> %vecinit6.i, i32 %q, i32 4
+  %vecinit10.i = insertelement <8 x i32> %vecinit8.i, i32 %q, i32 5
+  %vecinit12.i = insertelement <8 x i32> %vecinit10.i, i32 %q, i32 6
+  %vecinit14.i = insertelement <8 x i32> %vecinit12.i, i32 %q, i32 7
+  ret <8 x i32> %vecinit14.i
+}
+
 define <4 x double> @C(double* %ptr) nounwind uwtable readnone ssp {
 ; X32-LABEL: C:
 ; X32:       ## BB#0: ## %entry
@@ -68,6 +160,34 @@ entry:
   ret <4 x double> %vecinit6.i
 }
 
+define <4 x double> @C2(double* %ptr, double* %ptr2) nounwind uwtable readnone ssp {
+; X32-LABEL: C2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    vmovsd %xmm0, (%eax)
+; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: C2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    vmovsd %xmm0, (%rsi)
+; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT:    retq
+entry:
+  %q = load double, double* %ptr, align 8
+  store double %q, double* %ptr2, align 8 ; to create a chain to prevent broadcast
+  %vecinit.i = insertelement <4 x double> undef, double %q, i32 0
+  %vecinit2.i = insertelement <4 x double> %vecinit.i, double %q, i32 1
+  %vecinit4.i = insertelement <4 x double> %vecinit2.i, double %q, i32 2
+  %vecinit6.i = insertelement <4 x double> %vecinit4.i, double %q, i32 3
+  ret <4 x double> %vecinit6.i
+}
+
 define <8 x float> @D(float* %ptr) nounwind uwtable readnone ssp {
 ; X32-LABEL: D:
 ; X32:       ## BB#0: ## %entry
@@ -88,6 +208,62 @@ entry:
   ret <8 x float> %vecinit6.i
 }
 
+define <8 x float> @D2(float* %ptr) nounwind uwtable readnone ssp {
+; X32-LABEL: D2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vbroadcastss (%eax), %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: D2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vbroadcastss (%rdi), %ymm0
+; X64-NEXT:    retq
+entry:
+  %q = load float, float* %ptr, align 4
+  %vecinit.i = insertelement <8 x float> undef, float %q, i32 0
+  %vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
+  %vecinit8.i = insertelement <8 x float> %vecinit6.i, float %q, i32 4
+  %vecinit10.i = insertelement <8 x float> %vecinit8.i, float %q, i32 5
+  %vecinit12.i = insertelement <8 x float> %vecinit10.i, float %q, i32 6
+  %vecinit14.i = insertelement <8 x float> %vecinit12.i, float %q, i32 7
+  ret <8 x float> %vecinit14.i
+}
+
+define <8 x float> @D3(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp {
+; X32-LABEL: D3:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    vmovss %xmm0, (%eax)
+; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: D3:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    vmovss %xmm0, (%rsi)
+; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT:    retq
+entry:
+  %q = load float, float* %ptr, align 4
+  store float %q, float* %ptr2, align 4 ; to create a chain to prevent broadcast
+  %vecinit.i = insertelement <8 x float> undef, float %q, i32 0
+  %vecinit2.i = insertelement <8 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <8 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <8 x float> %vecinit4.i, float %q, i32 3
+  %vecinit8.i = insertelement <8 x float> %vecinit6.i, float %q, i32 4
+  %vecinit10.i = insertelement <8 x float> %vecinit8.i, float %q, i32 5
+  %vecinit12.i = insertelement <8 x float> %vecinit10.i, float %q, i32 6
+  %vecinit14.i = insertelement <8 x float> %vecinit12.i, float %q, i32 7
+  ret <8 x float> %vecinit14.i
+}
+
 ;;;; 128-bit versions
 
 define <4 x float> @e(float* %ptr) nounwind uwtable readnone ssp {
@@ -110,6 +286,32 @@ entry:
   ret <4 x float> %vecinit6.i
 }
 
+define <4 x float> @e2(float* %ptr, float* %ptr2) nounwind uwtable readnone ssp {
+; X32-LABEL: e2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    vmovss %xmm0, (%eax)
+; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: e2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    vmovss %xmm0, (%rsi)
+; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT:    retq
+entry:
+  %q = load float, float* %ptr, align 4
+  store float %q, float* %ptr2, align 4 ; to create a chain to prevent broadcast
+  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+  ret <4 x float> %vecinit6.i
+}
+
 ; Don't broadcast constants on pre-AVX2 hardware.
 define <4 x float> @_e2(float* %ptr) nounwind uwtable readnone ssp {
 ; X32-LABEL: _e2:
@@ -150,6 +352,34 @@ entry:
   ret <4 x i32> %vecinit6.i
 }
 
+define <4 x i32> @F2(i32* %ptr, i32* %ptr2) nounwind uwtable readnone ssp {
+; X32-LABEL: F2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl (%ecx), %ecx
+; X32-NEXT:    movl %ecx, (%eax)
+; X32-NEXT:    vmovd %ecx, %xmm0
+; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: F2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    movl %eax, (%rsi)
+; X64-NEXT:    vmovd %eax, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT:    retq
+entry:
+  %q = load i32, i32* %ptr, align 4
+  store i32 %q, i32* %ptr2, align 4 ; to create a chain to prevent broadcast
+  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+  %vecinit2.i = insertelement <4 x i32> %vecinit.i, i32 %q, i32 1
+  %vecinit4.i = insertelement <4 x i32> %vecinit2.i, i32 %q, i32 2
+  %vecinit6.i = insertelement <4 x i32> %vecinit4.i, i32 %q, i32 3
+  ret <4 x i32> %vecinit6.i
+}
+
 ; FIXME: Pointer adjusted broadcasts
 
 define <4 x i32> @load_splat_4i32_4i32_1111(<4 x i32>* %ptr) nounwind uwtable readnone ssp {
@@ -382,6 +612,36 @@ entry:
   ret <2 x i64> %vecinit2.i
 }
 
+define <2 x i64> @G2(i64* %ptr, i64* %ptr2) nounwind uwtable readnone ssp {
+; X32-LABEL: G2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl (%ecx), %edx
+; X32-NEXT:    movl 4(%ecx), %ecx
+; X32-NEXT:    movl %ecx, 4(%eax)
+; X32-NEXT:    movl %edx, (%eax)
+; X32-NEXT:    vmovd %edx, %xmm0
+; X32-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
+; X32-NEXT:    vpinsrd $2, %edx, %xmm0, %xmm0
+; X32-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: G2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    movq %rax, (%rsi)
+; X64-NEXT:    vmovq %rax, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
+; X64-NEXT:    retq
+entry:
+  %q = load i64, i64* %ptr, align 8
+  store i64 %q, i64* %ptr2, align 8 ; to create a chain to prevent broadcast
+  %vecinit.i = insertelement <2 x i64> undef, i64 %q, i32 0
+  %vecinit2.i = insertelement <2 x i64> %vecinit.i, i64 %q, i32 1
+  ret <2 x i64> %vecinit2.i
+}
+
 define <4 x i32> @H(<4 x i32> %a) {
 ; X32-LABEL: H:
 ; X32:       ## BB#0: ## %entry
@@ -415,6 +675,30 @@ entry:
   ret <2 x double> %vecinit2.i
 }
 
+define <2 x double> @I2(double* %ptr, double* %ptr2) nounwind uwtable readnone ssp {
+; X32-LABEL: I2:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X32-NEXT:    vmovsd %xmm0, (%eax)
+; X32-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X32-NEXT:    retl
+;
+; X64-LABEL: I2:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    vmovsd %xmm0, (%rsi)
+; X64-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
+; X64-NEXT:    retq
+entry:
+  %q = load double, double* %ptr, align 4
+  store double %q, double* %ptr2, align 4 ; to create a chain to prevent broadcast
+  %vecinit.i = insertelement <2 x double> undef, double %q, i32 0
+  %vecinit2.i = insertelement <2 x double> %vecinit.i, double %q, i32 1
+  ret <2 x double> %vecinit2.i
+}
+
 define <4 x float> @_RR(float* %ptr, i32* %k) nounwind uwtable readnone ssp {
 ; X32-LABEL: _RR:
 ; X32:       ## BB#0: ## %entry
@@ -558,12 +842,15 @@ define float @broadcast_lifetime() nounwind {
 ; X32-NEXT:    leal {{[0-9]+}}(%esp), %esi
 ; X32-NEXT:    movl %esi, (%esp)
 ; X32-NEXT:    calll _gfunc
-; X32-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X32-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%esp) ## 16-byte Spill
 ; X32-NEXT:    movl %esi, (%esp)
 ; X32-NEXT:    calll _gfunc
-; X32-NEXT:    vbroadcastss {{[0-9]+}}(%esp), %xmm0
-; X32-NEXT:    vsubss {{[0-9]+}}(%esp), %xmm0, %xmm0 ## 16-byte Folded Reload
+; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    vpermilps $0, {{[0-9]+}}(%esp), %xmm1 ## 16-byte Folded Reload
+; X32-NEXT:    ## xmm1 = mem[0,0,0,0]
+; X32-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X32-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; X32-NEXT:    vmovss %xmm0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    flds {{[0-9]+}}(%esp)
 ; X32-NEXT:    addl $56, %esp
@@ -575,12 +862,15 @@ define float @broadcast_lifetime() nounwind {
 ; X64-NEXT:    subq $40, %rsp
 ; X64-NEXT:    movq %rsp, %rdi
 ; X64-NEXT:    callq _gfunc
-; X64-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm0
+; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X64-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) ## 16-byte Spill
 ; X64-NEXT:    movq %rsp, %rdi
 ; X64-NEXT:    callq _gfunc
-; X64-NEXT:    vbroadcastss {{[0-9]+}}(%rsp), %xmm0
-; X64-NEXT:    vsubss {{[0-9]+}}(%rsp), %xmm0, %xmm0 ## 16-byte Folded Reload
+; X64-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    vpermilps $0, {{[0-9]+}}(%rsp), %xmm1 ## 16-byte Folded Reload
+; X64-NEXT:    ## xmm1 = mem[0,0,0,0]
+; X64-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; X64-NEXT:    vsubss %xmm1, %xmm0, %xmm0
 ; X64-NEXT:    addq $40, %rsp
 ; X64-NEXT:    retq
   %1 = alloca <4 x float>, align 16
@@ -588,15 +878,15 @@ define float @broadcast_lifetime() nounwind {
   %3 = bitcast <4 x float>* %1 to i8*
   %4 = bitcast <4 x float>* %2 to i8*
 
-  call void @llvm.lifetime.start(i64 16, i8* %3)
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %3)
   call void @gfunc(<4 x float>* %1)
   %5 = load <4 x float>, <4 x float>* %1, align 16
-  call void @llvm.lifetime.end(i64 16, i8* %3)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* %3)
 
-  call void @llvm.lifetime.start(i64 16, i8* %4)
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %4)
   call void @gfunc(<4 x float>* %2)
   %6 = load <4 x float>, <4 x float>* %2, align 16
-  call void @llvm.lifetime.end(i64 16, i8* %4)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* %4)
 
   %7 = extractelement <4 x float> %5, i32 1
   %8 = extractelement <4 x float> %6, i32 1
@@ -605,5 +895,5 @@ define float @broadcast_lifetime() nounwind {
 }
 
 declare void @gfunc(<4 x float>*)
-declare void @llvm.lifetime.start(i64, i8*)
-declare void @llvm.lifetime.end(i64, i8*)
+declare void @llvm.lifetime.start.p0i8(i64, i8*)
+declare void @llvm.lifetime.end.p0i8(i64, i8*)
diff --git a/test/CodeGen/X86/avx-vperm2x128.ll b/test/CodeGen/X86/avx-vperm2x128.ll
index f7f54a01d7ffb0431709705cdb4afe573a95ae14..f4a77c370db5e3707f681685b53aeaff88f85399 100644
--- a/test/CodeGen/X86/avx-vperm2x128.ll
+++ b/test/CodeGen/X86/avx-vperm2x128.ll
@@ -466,8 +466,7 @@ define <4 x i64> @shuffle_v4i64_67zz(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1:       ## BB#0:
 ; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],zero,zero
 ; AVX1-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v4i64_67zz:
diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll
index 3c52aaf71adca9fa3024f3a2213397aaf9606235..cf514d7aeb318ec0e3ddc9327be1fbb856c2f150 100644
--- a/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/test/CodeGen/X86/avx-vzeroupper.ll
@@ -1,8 +1,9 @@
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s
-; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-write | FileCheck --check-prefix=FASTYMM %s
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx512f | FileCheck %s
+; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mattr=+avx,+fast-partial-ymm-or-zmm-write | FileCheck --check-prefix=FAST-YMM-ZMM %s
 ; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=btver2 | FileCheck --check-prefix=BTVER2 %s
 
-; FASTYMM-NOT: vzeroupper
+; FAST-YMM-ZMM-NOT: vzeroupper
 ; BTVER2-NOT: vzeroupper
 
 declare i32 @foo()
diff --git a/test/CodeGen/X86/avx2-conversions.ll b/test/CodeGen/X86/avx2-conversions.ll
index f0fb58ff7c8a2e4ac6196e41da41f802377b6fe7..26edafbdb64fd48cb6741d9657cfa24a64c852cb 100644
--- a/test/CodeGen/X86/avx2-conversions.ll
+++ b/test/CodeGen/X86/avx2-conversions.ll
@@ -25,7 +25,7 @@ define <4 x i32> @trunc4(<4 x i64> %A) nounwind {
 define <8 x i16> @trunc8(<8 x i32> %A) nounwind {
 ; X32-LABEL: trunc8:
 ; X32:       ## BB#0:
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X32-NEXT:    vzeroupper
@@ -33,7 +33,7 @@ define <8 x i16> @trunc8(<8 x i32> %A) nounwind {
 ;
 ; X64-LABEL: trunc8:
 ; X64:       ## BB#0:
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X64-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/avx2-gather.ll b/test/CodeGen/X86/avx2-gather.ll
index cd8c354e99605bb3e2d2febe34aed40f3594fd9f..d162b4755ee1a5f1c1db10cbc024ceaeb2adc70d 100644
--- a/test/CodeGen/X86/avx2-gather.ll
+++ b/test/CodeGen/X86/avx2-gather.ll
@@ -9,12 +9,14 @@ define <4 x float> @test_x86_avx2_gather_d_ps(i8* %a1, <4 x i32> %idx, <4 x floa
 ; X32-LABEL: test_x86_avx2_gather_d_ps:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; X32-NEXT:    vgatherdps %xmm1, (%eax,%xmm0,2), %xmm2
 ; X32-NEXT:    vmovaps %xmm2, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_x86_avx2_gather_d_ps:
 ; X64:       ## BB#0:
+; X64-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; X64-NEXT:    vgatherdps %xmm1, (%rdi,%xmm0,2), %xmm2
 ; X64-NEXT:    vmovaps %xmm2, %xmm0
 ; X64-NEXT:    retq
@@ -30,12 +32,14 @@ define <2 x double> @test_x86_avx2_gather_d_pd(i8* %a1, <4 x i32> %idx, <2 x dou
 ; X32-LABEL: test_x86_avx2_gather_d_pd:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; X32-NEXT:    vgatherdpd %xmm1, (%eax,%xmm0,2), %xmm2
 ; X32-NEXT:    vmovapd %xmm2, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_x86_avx2_gather_d_pd:
 ; X64:       ## BB#0:
+; X64-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; X64-NEXT:    vgatherdpd %xmm1, (%rdi,%xmm0,2), %xmm2
 ; X64-NEXT:    vmovapd %xmm2, %xmm0
 ; X64-NEXT:    retq
@@ -51,12 +55,14 @@ define <8 x float> @test_x86_avx2_gather_d_ps_256(i8* %a1, <8 x i32> %idx, <8 x
 ; X32-LABEL: test_x86_avx2_gather_d_ps_256:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vxorps %ymm2, %ymm2, %ymm2
 ; X32-NEXT:    vgatherdps %ymm1, (%eax,%ymm0,4), %ymm2
 ; X32-NEXT:    vmovaps %ymm2, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_x86_avx2_gather_d_ps_256:
 ; X64:       ## BB#0:
+; X64-NEXT:    vxorps %ymm2, %ymm2, %ymm2
 ; X64-NEXT:    vgatherdps %ymm1, (%rdi,%ymm0,4), %ymm2
 ; X64-NEXT:    vmovaps %ymm2, %ymm0
 ; X64-NEXT:    retq
@@ -72,12 +78,14 @@ define <4 x double> @test_x86_avx2_gather_d_pd_256(i8* %a1, <4 x i32> %idx, <4 x
 ; X32-LABEL: test_x86_avx2_gather_d_pd_256:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vxorpd %ymm2, %ymm2, %ymm2
 ; X32-NEXT:    vgatherdpd %ymm1, (%eax,%xmm0,8), %ymm2
 ; X32-NEXT:    vmovapd %ymm2, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_x86_avx2_gather_d_pd_256:
 ; X64:       ## BB#0:
+; X64-NEXT:    vxorpd %ymm2, %ymm2, %ymm2
 ; X64-NEXT:    vgatherdpd %ymm1, (%rdi,%xmm0,8), %ymm2
 ; X64-NEXT:    vmovapd %ymm2, %ymm0
 ; X64-NEXT:    retq
@@ -85,3 +93,55 @@ define <4 x double> @test_x86_avx2_gather_d_pd_256(i8* %a1, <4 x i32> %idx, <4 x
                             i8* %a1, <4 x i32> %idx, <4 x double> %mask, i8 8) ;
   ret <4 x double> %res
 }
+
+define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_i32gather_epi32:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X32-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
+; X32-NEXT:    vmovdqa %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_i32gather_epi32:
+; X64:       ## BB#0:
+; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
+; X64-NEXT:    vmovdqa %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast i32 *%a0 to i8*
+  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+  %mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x i32>
+  %call = call <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32> zeroinitializer, i8* %arg0, <4 x i32> %arg1, <4 x i32> %mask, i8 2)
+  %bc = bitcast <4 x i32> %call to <2 x i64>
+  ret <2 x i64> %bc
+}
+declare <4 x i32> @llvm.x86.avx2.gather.d.d(<4 x i32>, i8*, <4 x i32>, <4 x i32>, i8) nounwind readonly
+
+define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
+; X32-LABEL: test_mm_i32gather_pd:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X32-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
+; X32-NEXT:    vmovapd %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: test_mm_i32gather_pd:
+; X64:       ## BB#0:
+; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
+; X64-NEXT:    vmovapd %xmm1, %xmm0
+; X64-NEXT:    retq
+  %arg0 = bitcast double *%a0 to i8*
+  %arg1 = bitcast <2 x i64> %a1 to <4 x i32>
+  %cmp = fcmp oeq <2 x double> zeroinitializer, zeroinitializer
+  %sext = sext <2 x i1> %cmp to <2 x i64>
+  %mask = bitcast <2 x i64> %sext to <2 x double>
+  %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> zeroinitializer, i8* %arg0, <4 x i32> %arg1, <2 x double> %mask, i8 2)
+  ret <2 x double> %res
+}
diff --git a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
index d7a1422e992ffb78bf54c91ecdd776516fcaecdd..cb0abf3b137f28c054670f5b6e5a3618d1c39fa8 100644
--- a/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -1068,6 +1068,7 @@ define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vpgatherdd %xmm2, (%eax,%xmm0,2), %xmm1
 ; X32-NEXT:    vmovdqa %xmm1, %xmm0
 ; X32-NEXT:    retl
@@ -1075,6 +1076,7 @@ define <2 x i64> @test_mm_i32gather_epi32(i32 *%a0, <2 x i64> %a1) {
 ; X64-LABEL: test_mm_i32gather_epi32:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vpgatherdd %xmm2, (%rdi,%xmm0,2), %xmm1
 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -1112,6 +1114,7 @@ define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; X32-NEXT:    vpxor %ymm1, %ymm1, %ymm1
 ; X32-NEXT:    vpgatherdd %ymm2, (%eax,%ymm0,2), %ymm1
 ; X32-NEXT:    vmovdqa %ymm1, %ymm0
 ; X32-NEXT:    retl
@@ -1119,6 +1122,7 @@ define <4 x i64> @test_mm256_i32gather_epi32(i32 *%a0, <4 x i64> %a1) {
 ; X64-LABEL: test_mm256_i32gather_epi32:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; X64-NEXT:    vpxor %ymm1, %ymm1, %ymm1
 ; X64-NEXT:    vpgatherdd %ymm2, (%rdi,%ymm0,2), %ymm1
 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
 ; X64-NEXT:    retq
@@ -1156,6 +1160,7 @@ define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vpgatherdq %xmm2, (%eax,%xmm0,2), %xmm1
 ; X32-NEXT:    vmovdqa %xmm1, %xmm0
 ; X32-NEXT:    retl
@@ -1163,6 +1168,7 @@ define <2 x i64> @test_mm_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
 ; X64-LABEL: test_mm_i32gather_epi64:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vpgatherdq %xmm2, (%rdi,%xmm0,2), %xmm1
 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -1195,6 +1201,7 @@ define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; X32-NEXT:    vpxor %ymm1, %ymm1, %ymm1
 ; X32-NEXT:    vpgatherdq %ymm2, (%eax,%xmm0,2), %ymm1
 ; X32-NEXT:    vmovdqa %ymm1, %ymm0
 ; X32-NEXT:    retl
@@ -1202,6 +1209,7 @@ define <4 x i64> @test_mm256_i32gather_epi64(i64 *%a0, <2 x i64> %a1) {
 ; X64-LABEL: test_mm256_i32gather_epi64:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; X64-NEXT:    vpxor %ymm1, %ymm1, %ymm1
 ; X64-NEXT:    vpgatherdq %ymm2, (%rdi,%xmm0,2), %ymm1
 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
 ; X64-NEXT:    retq
@@ -1234,6 +1242,7 @@ define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vgatherdpd %xmm2, (%eax,%xmm0,2), %xmm1
 ; X32-NEXT:    vmovapd %xmm1, %xmm0
 ; X32-NEXT:    retl
@@ -1241,6 +1250,7 @@ define <2 x double> @test_mm_i32gather_pd(double *%a0, <2 x i64> %a1) {
 ; X64-LABEL: test_mm_i32gather_pd:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vgatherdpd %xmm2, (%rdi,%xmm0,2), %xmm1
 ; X64-NEXT:    vmovapd %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -1318,6 +1328,7 @@ define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vgatherdps %xmm2, (%eax,%xmm0,2), %xmm1
 ; X32-NEXT:    vmovaps %xmm1, %xmm0
 ; X32-NEXT:    retl
@@ -1325,6 +1336,7 @@ define <4 x float> @test_mm_i32gather_ps(float *%a0, <2 x i64> %a1) {
 ; X64-LABEL: test_mm_i32gather_ps:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vgatherdps %xmm2, (%rdi,%xmm0,2), %xmm1
 ; X64-NEXT:    vmovaps %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -1402,6 +1414,7 @@ define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vpgatherqd %xmm2, (%eax,%xmm0,2), %xmm1
 ; X32-NEXT:    vmovdqa %xmm1, %xmm0
 ; X32-NEXT:    retl
@@ -1409,6 +1422,7 @@ define <2 x i64> @test_mm_i64gather_epi32(i32 *%a0, <2 x i64> %a1) {
 ; X64-LABEL: test_mm_i64gather_epi32:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%xmm0,2), %xmm1
 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -1444,6 +1458,7 @@ define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vpgatherqd %xmm2, (%eax,%ymm0,2), %xmm1
 ; X32-NEXT:    vmovdqa %xmm1, %xmm0
 ; X32-NEXT:    vzeroupper
@@ -1452,6 +1467,7 @@ define <2 x i64> @test_mm256_i64gather_epi32(i32 *%a0, <4 x i64> %a1) {
 ; X64-LABEL: test_mm256_i64gather_epi32:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vpgatherqd %xmm2, (%rdi,%ymm0,2), %xmm1
 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
 ; X64-NEXT:    vzeroupper
@@ -1490,6 +1506,7 @@ define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vpgatherqq %xmm2, (%eax,%xmm0,2), %xmm1
 ; X32-NEXT:    vmovdqa %xmm1, %xmm0
 ; X32-NEXT:    retl
@@ -1497,6 +1514,7 @@ define <2 x i64> @test_mm_i64gather_epi64(i64 *%a0, <2 x i64> %a1) {
 ; X64-LABEL: test_mm_i64gather_epi64:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vpgatherqq %xmm2, (%rdi,%xmm0,2), %xmm1
 ; X64-NEXT:    vmovdqa %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -1527,6 +1545,7 @@ define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; X32-NEXT:    vpxor %ymm1, %ymm1, %ymm1
 ; X32-NEXT:    vpgatherqq %ymm2, (%eax,%ymm0,2), %ymm1
 ; X32-NEXT:    vmovdqa %ymm1, %ymm0
 ; X32-NEXT:    retl
@@ -1534,6 +1553,7 @@ define <4 x i64> @test_mm256_i64gather_epi64(i64 *%a0, <4 x i64> %a1) {
 ; X64-LABEL: test_mm256_i64gather_epi64:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %ymm2, %ymm2, %ymm2
+; X64-NEXT:    vpxor %ymm1, %ymm1, %ymm1
 ; X64-NEXT:    vpgatherqq %ymm2, (%rdi,%ymm0,2), %ymm1
 ; X64-NEXT:    vmovdqa %ymm1, %ymm0
 ; X64-NEXT:    retq
@@ -1564,6 +1584,7 @@ define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vgatherqpd %xmm2, (%eax,%xmm0,2), %xmm1
 ; X32-NEXT:    vmovapd %xmm1, %xmm0
 ; X32-NEXT:    retl
@@ -1571,6 +1592,7 @@ define <2 x double> @test_mm_i64gather_pd(double *%a0, <2 x i64> %a1) {
 ; X64-LABEL: test_mm_i64gather_pd:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vgatherqpd %xmm2, (%rdi,%xmm0,2), %xmm1
 ; X64-NEXT:    vmovapd %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -1644,6 +1666,7 @@ define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vgatherqps %xmm2, (%eax,%xmm0,2), %xmm1
 ; X32-NEXT:    vmovaps %xmm1, %xmm0
 ; X32-NEXT:    retl
@@ -1651,6 +1674,7 @@ define <4 x float> @test_mm_i64gather_ps(float *%a0, <2 x i64> %a1) {
 ; X64-LABEL: test_mm_i64gather_ps:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%xmm0,2), %xmm1
 ; X64-NEXT:    vmovaps %xmm1, %xmm0
 ; X64-NEXT:    retq
@@ -1684,6 +1708,7 @@ define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
 ; X32:       # BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X32-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vgatherqps %xmm2, (%eax,%ymm0,2), %xmm1
 ; X32-NEXT:    vmovaps %xmm1, %xmm0
 ; X32-NEXT:    vzeroupper
@@ -1692,6 +1717,7 @@ define <4 x float> @test_mm256_i64gather_ps(float *%a0, <4 x i64> %a1) {
 ; X64-LABEL: test_mm256_i64gather_ps:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpcmpeqd %xmm2, %xmm2, %xmm2
+; X64-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vgatherqps %xmm2, (%rdi,%ymm0,2), %xmm1
 ; X64-NEXT:    vmovaps %xmm1, %xmm0
 ; X64-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index 830e68fb1e340aa8c5473e0ded380c2b7fff2e69..449ac4287c9679c29b9f895c32f15f31b58d2e19 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -227,16 +227,11 @@ declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
 
 
 define i32 @test_x86_avx2_pmovmskb(<32 x i8> %a0) {
-; AVX2-LABEL: test_x86_avx2_pmovmskb:
-; AVX2:       ## BB#0:
-; AVX2-NEXT:    vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0]
-; AVX2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_pmovmskb:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_pmovmskb:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpmovmskb %ymm0, %eax ## encoding: [0xc5,0xfd,0xd7,0xc0]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %a0) ; <i32> [#uses=1]
   ret i32 %res
 }
@@ -1179,18 +1174,12 @@ declare void @llvm.x86.avx2.maskstore.q(i8*, <2 x i64>, <2 x i64>) nounwind
 
 
 define void @test_x86_avx2_maskstore_q_256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2) {
-; AVX2-LABEL: test_x86_avx2_maskstore_q_256:
-; AVX2:       ## BB#0:
-; AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX2-NEXT:    vpmaskmovq %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0xfd,0x8e,0x08]
-; AVX2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_maskstore_q_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT:    vpmaskmovq %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0xfd,0x8e,0x08]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_maskstore_q_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT:    vpmaskmovq %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0xfd,0x8e,0x08]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
   call void @llvm.x86.avx2.maskstore.q.256(i8* %a0, <4 x i64> %a1, <4 x i64> %a2)
   ret void
 }
@@ -1210,18 +1199,12 @@ declare void @llvm.x86.avx2.maskstore.d(i8*, <4 x i32>, <4 x i32>) nounwind
 
 
 define void @test_x86_avx2_maskstore_d_256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2) {
-; AVX2-LABEL: test_x86_avx2_maskstore_d_256:
-; AVX2:       ## BB#0:
-; AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX2-NEXT:    vpmaskmovd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x8e,0x08]
-; AVX2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_maskstore_d_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT:    vpmaskmovd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x8e,0x08]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_maskstore_d_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT:    vpmaskmovd %ymm1, %ymm0, (%eax) ## encoding: [0xc4,0xe2,0x7d,0x8e,0x08]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
   call void @llvm.x86.avx2.maskstore.d.256(i8* %a0, <8 x i32> %a1, <8 x i32> %a2)
   ret void
 }
@@ -1522,18 +1505,12 @@ declare <4 x float> @llvm.x86.avx2.gather.q.ps(<4 x float>, i8*,
                       <2 x i64>, <4 x float>, i8) nounwind readonly
 
 define <4 x float> @test_x86_avx2_gather_q_ps_256(<4 x float> %a0, i8* %a1, <4 x i64> %idx, <4 x float> %mask) {
-; AVX2-LABEL: test_x86_avx2_gather_q_ps_256:
-; AVX2:       ## BB#0:
-; AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX2-NEXT:    vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x93,0x04,0x48]
-; AVX2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_gather_q_ps_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT:    vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x93,0x04,0x48]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_gather_q_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT:    vgatherqps %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x93,0x04,0x48]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call <4 x float> @llvm.x86.avx2.gather.q.ps.256(<4 x float> %a0,
                             i8* %a1, <4 x i64> %idx, <4 x float> %mask, i8 2) ;
   ret <4 x float> %res
@@ -1633,18 +1610,12 @@ declare <4 x i32> @llvm.x86.avx2.gather.q.d(<4 x i32>, i8*,
                       <2 x i64>, <4 x i32>, i8) nounwind readonly
 
 define <4 x i32> @test_x86_avx2_gather_q_d_256(<4 x i32> %a0, i8* %a1, <4 x i64> %idx, <4 x i32> %mask) {
-; AVX2-LABEL: test_x86_avx2_gather_q_d_256:
-; AVX2:       ## BB#0:
-; AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX2-NEXT:    vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x91,0x04,0x48]
-; AVX2-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
-; AVX2-NEXT:    retl ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_gather_q_d_256:
-; AVX512VL:       ## BB#0:
-; AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX512VL-NEXT:    vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x91,0x04,0x48]
-; AVX512VL-NEXT:    retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_avx2_gather_q_d_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT:    vpgatherqd %xmm2, (%eax,%ymm1,2), %xmm0 ## encoding: [0xc4,0xe2,0x6d,0x91,0x04,0x48]
+; CHECK-NEXT:    vzeroupper ## encoding: [0xc5,0xf8,0x77]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx2.gather.q.d.256(<4 x i32> %a0,
                             i8* %a1, <4 x i64> %idx, <4 x i32> %mask, i8 2) ;
   ret <4 x i32> %res
diff --git a/test/CodeGen/X86/avx2-shift.ll b/test/CodeGen/X86/avx2-shift.ll
index 887fef113e72a74d40e7e7de34449a438f8c2fef..4345bd6f792668cd694684a128170e67f071b4eb 100644
--- a/test/CodeGen/X86/avx2-shift.ll
+++ b/test/CodeGen/X86/avx2-shift.ll
@@ -530,7 +530,7 @@ define <8 x i16> @variable_shl16(<8 x i16> %lhs, <8  x i16> %rhs) {
 ; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X32-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X32-NEXT:    vzeroupper
@@ -541,7 +541,7 @@ define <8 x i16> @variable_shl16(<8 x i16> %lhs, <8  x i16> %rhs) {
 ; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X64-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X64-NEXT:    vzeroupper
@@ -556,7 +556,7 @@ define <8 x i16> @variable_ashr16(<8 x i16> %lhs, <8  x i16> %rhs) {
 ; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X32-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; X32-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X32-NEXT:    vzeroupper
@@ -567,7 +567,7 @@ define <8 x i16> @variable_ashr16(<8 x i16> %lhs, <8  x i16> %rhs) {
 ; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X64-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; X64-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X64-NEXT:    vzeroupper
@@ -582,7 +582,7 @@ define <8 x i16> @variable_lshr16(<8 x i16> %lhs, <8  x i16> %rhs) {
 ; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X32-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X32-NEXT:    vzeroupper
@@ -593,7 +593,7 @@ define <8 x i16> @variable_lshr16(<8 x i16> %lhs, <8  x i16> %rhs) {
 ; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X64-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X64-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index f65f485cc62ce750d11fb887dd5a4de1dd937406..ba47e2ba15c2e060c0e0a2d1cff57355f202c941 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -1133,97 +1133,52 @@ eintry:
 }
 
 define void @isel_crash_32b(i8* %cV_R.addr) {
-; X32-AVX2-LABEL: isel_crash_32b:
-; X32-AVX2:       ## BB#0: ## %eintry
-; X32-AVX2-NEXT:    pushl %ebp
-; X32-AVX2-NEXT:  Lcfi1:
-; X32-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X32-AVX2-NEXT:  Lcfi2:
-; X32-AVX2-NEXT:    .cfi_offset %ebp, -8
-; X32-AVX2-NEXT:    movl %esp, %ebp
-; X32-AVX2-NEXT:  Lcfi3:
-; X32-AVX2-NEXT:    .cfi_def_cfa_register %ebp
-; X32-AVX2-NEXT:    andl $-32, %esp
-; X32-AVX2-NEXT:    subl $128, %esp
-; X32-AVX2-NEXT:    movl 8(%ebp), %eax
-; X32-AVX2-NEXT:    vxorps %ymm0, %ymm0, %ymm0
-; X32-AVX2-NEXT:    vmovaps %ymm0, (%esp)
-; X32-AVX2-NEXT:    vpbroadcastb (%eax), %ymm1
-; X32-AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X32-AVX2-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%esp)
-; X32-AVX2-NEXT:    movl %ebp, %esp
-; X32-AVX2-NEXT:    popl %ebp
-; X32-AVX2-NEXT:    vzeroupper
-; X32-AVX2-NEXT:    retl
-;
-; X64-AVX2-LABEL: isel_crash_32b:
-; X64-AVX2:       ## BB#0: ## %eintry
-; X64-AVX2-NEXT:    pushq %rbp
-; X64-AVX2-NEXT:  Lcfi0:
-; X64-AVX2-NEXT:    .cfi_def_cfa_offset 16
-; X64-AVX2-NEXT:  Lcfi1:
-; X64-AVX2-NEXT:    .cfi_offset %rbp, -16
-; X64-AVX2-NEXT:    movq %rsp, %rbp
-; X64-AVX2-NEXT:  Lcfi2:
-; X64-AVX2-NEXT:    .cfi_def_cfa_register %rbp
-; X64-AVX2-NEXT:    andq $-32, %rsp
-; X64-AVX2-NEXT:    subq $128, %rsp
-; X64-AVX2-NEXT:    vxorps %ymm0, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vmovaps %ymm0, (%rsp)
-; X64-AVX2-NEXT:    movb (%rdi), %al
-; X64-AVX2-NEXT:    vmovd %eax, %xmm1
-; X64-AVX2-NEXT:    vpbroadcastb %xmm1, %ymm1
-; X64-AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT:    movq %rbp, %rsp
-; X64-AVX2-NEXT:    popq %rbp
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X32-AVX512VL-LABEL: isel_crash_32b:
-; X32-AVX512VL:       ## BB#0: ## %eintry
-; X32-AVX512VL-NEXT:    pushl %ebp
-; X32-AVX512VL-NEXT:  Lcfi1:
-; X32-AVX512VL-NEXT:    .cfi_def_cfa_offset 8
-; X32-AVX512VL-NEXT:  Lcfi2:
-; X32-AVX512VL-NEXT:    .cfi_offset %ebp, -8
-; X32-AVX512VL-NEXT:    movl %esp, %ebp
-; X32-AVX512VL-NEXT:  Lcfi3:
-; X32-AVX512VL-NEXT:    .cfi_def_cfa_register %ebp
-; X32-AVX512VL-NEXT:    andl $-32, %esp
-; X32-AVX512VL-NEXT:    subl $128, %esp
-; X32-AVX512VL-NEXT:    movl 8(%ebp), %eax
-; X32-AVX512VL-NEXT:    vxorps %ymm0, %ymm0, %ymm0
-; X32-AVX512VL-NEXT:    vmovaps %ymm0, (%esp)
-; X32-AVX512VL-NEXT:    vpbroadcastb (%eax), %ymm1
-; X32-AVX512VL-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X32-AVX512VL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%esp)
-; X32-AVX512VL-NEXT:    movl %ebp, %esp
-; X32-AVX512VL-NEXT:    popl %ebp
-; X32-AVX512VL-NEXT:    retl
+; X32-LABEL: isel_crash_32b:
+; X32:       ## BB#0: ## %eintry
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:  Lcfi1:
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:  Lcfi2:
+; X32-NEXT:    .cfi_offset %ebp, -8
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:  Lcfi3:
+; X32-NEXT:    .cfi_def_cfa_register %ebp
+; X32-NEXT:    andl $-32, %esp
+; X32-NEXT:    subl $128, %esp
+; X32-NEXT:    movl 8(%ebp), %eax
+; X32-NEXT:    vxorps %ymm0, %ymm0, %ymm0
+; X32-NEXT:    vmovaps %ymm0, (%esp)
+; X32-NEXT:    vpbroadcastb (%eax), %ymm1
+; X32-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ebp, %esp
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    vzeroupper
+; X32-NEXT:    retl
 ;
-; X64-AVX512VL-LABEL: isel_crash_32b:
-; X64-AVX512VL:       ## BB#0: ## %eintry
-; X64-AVX512VL-NEXT:    pushq %rbp
-; X64-AVX512VL-NEXT:  Lcfi0:
-; X64-AVX512VL-NEXT:    .cfi_def_cfa_offset 16
-; X64-AVX512VL-NEXT:  Lcfi1:
-; X64-AVX512VL-NEXT:    .cfi_offset %rbp, -16
-; X64-AVX512VL-NEXT:    movq %rsp, %rbp
-; X64-AVX512VL-NEXT:  Lcfi2:
-; X64-AVX512VL-NEXT:    .cfi_def_cfa_register %rbp
-; X64-AVX512VL-NEXT:    andq $-32, %rsp
-; X64-AVX512VL-NEXT:    subq $128, %rsp
-; X64-AVX512VL-NEXT:    vxorps %ymm0, %ymm0, %ymm0
-; X64-AVX512VL-NEXT:    vmovaps %ymm0, (%rsp)
-; X64-AVX512VL-NEXT:    movb (%rdi), %al
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm1
-; X64-AVX512VL-NEXT:    vpbroadcastb %xmm1, %ymm1
-; X64-AVX512VL-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; X64-AVX512VL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
-; X64-AVX512VL-NEXT:    movq %rbp, %rsp
-; X64-AVX512VL-NEXT:    popq %rbp
-; X64-AVX512VL-NEXT:    retq
+; X64-LABEL: isel_crash_32b:
+; X64:       ## BB#0: ## %eintry
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:  Lcfi0:
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:  Lcfi1:
+; X64-NEXT:    .cfi_offset %rbp, -16
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:  Lcfi2:
+; X64-NEXT:    .cfi_def_cfa_register %rbp
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $128, %rsp
+; X64-NEXT:    vxorps %ymm0, %ymm0, %ymm0
+; X64-NEXT:    vmovaps %ymm0, (%rsp)
+; X64-NEXT:    movb (%rdi), %al
+; X64-NEXT:    vmovd %eax, %xmm1
+; X64-NEXT:    vpbroadcastb %xmm1, %ymm1
+; X64-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rbp, %rsp
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    vzeroupper
+; X64-NEXT:    retq
 eintry:
   %__a.addr.i = alloca <4 x i64>, align 16
   %__b.addr.i = alloca <4 x i64>, align 16
@@ -1280,97 +1235,52 @@ entry:
 }
 
 define void @isel_crash_16w(i16* %cV_R.addr) {
-; X32-AVX2-LABEL: isel_crash_16w:
-; X32-AVX2:       ## BB#0: ## %eintry
-; X32-AVX2-NEXT:    pushl %ebp
-; X32-AVX2-NEXT:  Lcfi5:
-; X32-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X32-AVX2-NEXT:  Lcfi6:
-; X32-AVX2-NEXT:    .cfi_offset %ebp, -8
-; X32-AVX2-NEXT:    movl %esp, %ebp
-; X32-AVX2-NEXT:  Lcfi7:
-; X32-AVX2-NEXT:    .cfi_def_cfa_register %ebp
-; X32-AVX2-NEXT:    andl $-32, %esp
-; X32-AVX2-NEXT:    subl $128, %esp
-; X32-AVX2-NEXT:    movl 8(%ebp), %eax
-; X32-AVX2-NEXT:    vxorps %ymm0, %ymm0, %ymm0
-; X32-AVX2-NEXT:    vmovaps %ymm0, (%esp)
-; X32-AVX2-NEXT:    vpbroadcastw (%eax), %ymm1
-; X32-AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X32-AVX2-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%esp)
-; X32-AVX2-NEXT:    movl %ebp, %esp
-; X32-AVX2-NEXT:    popl %ebp
-; X32-AVX2-NEXT:    vzeroupper
-; X32-AVX2-NEXT:    retl
-;
-; X64-AVX2-LABEL: isel_crash_16w:
-; X64-AVX2:       ## BB#0: ## %eintry
-; X64-AVX2-NEXT:    pushq %rbp
-; X64-AVX2-NEXT:  Lcfi3:
-; X64-AVX2-NEXT:    .cfi_def_cfa_offset 16
-; X64-AVX2-NEXT:  Lcfi4:
-; X64-AVX2-NEXT:    .cfi_offset %rbp, -16
-; X64-AVX2-NEXT:    movq %rsp, %rbp
-; X64-AVX2-NEXT:  Lcfi5:
-; X64-AVX2-NEXT:    .cfi_def_cfa_register %rbp
-; X64-AVX2-NEXT:    andq $-32, %rsp
-; X64-AVX2-NEXT:    subq $128, %rsp
-; X64-AVX2-NEXT:    vxorps %ymm0, %ymm0, %ymm0
-; X64-AVX2-NEXT:    vmovaps %ymm0, (%rsp)
-; X64-AVX2-NEXT:    movw (%rdi), %ax
-; X64-AVX2-NEXT:    vmovd %eax, %xmm1
-; X64-AVX2-NEXT:    vpbroadcastw %xmm1, %ymm1
-; X64-AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
-; X64-AVX2-NEXT:    movq %rbp, %rsp
-; X64-AVX2-NEXT:    popq %rbp
-; X64-AVX2-NEXT:    vzeroupper
-; X64-AVX2-NEXT:    retq
-;
-; X32-AVX512VL-LABEL: isel_crash_16w:
-; X32-AVX512VL:       ## BB#0: ## %eintry
-; X32-AVX512VL-NEXT:    pushl %ebp
-; X32-AVX512VL-NEXT:  Lcfi5:
-; X32-AVX512VL-NEXT:    .cfi_def_cfa_offset 8
-; X32-AVX512VL-NEXT:  Lcfi6:
-; X32-AVX512VL-NEXT:    .cfi_offset %ebp, -8
-; X32-AVX512VL-NEXT:    movl %esp, %ebp
-; X32-AVX512VL-NEXT:  Lcfi7:
-; X32-AVX512VL-NEXT:    .cfi_def_cfa_register %ebp
-; X32-AVX512VL-NEXT:    andl $-32, %esp
-; X32-AVX512VL-NEXT:    subl $128, %esp
-; X32-AVX512VL-NEXT:    movl 8(%ebp), %eax
-; X32-AVX512VL-NEXT:    vxorps %ymm0, %ymm0, %ymm0
-; X32-AVX512VL-NEXT:    vmovaps %ymm0, (%esp)
-; X32-AVX512VL-NEXT:    vpbroadcastw (%eax), %ymm1
-; X32-AVX512VL-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X32-AVX512VL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%esp)
-; X32-AVX512VL-NEXT:    movl %ebp, %esp
-; X32-AVX512VL-NEXT:    popl %ebp
-; X32-AVX512VL-NEXT:    retl
+; X32-LABEL: isel_crash_16w:
+; X32:       ## BB#0: ## %eintry
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:  Lcfi5:
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:  Lcfi6:
+; X32-NEXT:    .cfi_offset %ebp, -8
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:  Lcfi7:
+; X32-NEXT:    .cfi_def_cfa_register %ebp
+; X32-NEXT:    andl $-32, %esp
+; X32-NEXT:    subl $128, %esp
+; X32-NEXT:    movl 8(%ebp), %eax
+; X32-NEXT:    vxorps %ymm0, %ymm0, %ymm0
+; X32-NEXT:    vmovaps %ymm0, (%esp)
+; X32-NEXT:    vpbroadcastw (%eax), %ymm1
+; X32-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ebp, %esp
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    vzeroupper
+; X32-NEXT:    retl
 ;
-; X64-AVX512VL-LABEL: isel_crash_16w:
-; X64-AVX512VL:       ## BB#0: ## %eintry
-; X64-AVX512VL-NEXT:    pushq %rbp
-; X64-AVX512VL-NEXT:  Lcfi3:
-; X64-AVX512VL-NEXT:    .cfi_def_cfa_offset 16
-; X64-AVX512VL-NEXT:  Lcfi4:
-; X64-AVX512VL-NEXT:    .cfi_offset %rbp, -16
-; X64-AVX512VL-NEXT:    movq %rsp, %rbp
-; X64-AVX512VL-NEXT:  Lcfi5:
-; X64-AVX512VL-NEXT:    .cfi_def_cfa_register %rbp
-; X64-AVX512VL-NEXT:    andq $-32, %rsp
-; X64-AVX512VL-NEXT:    subq $128, %rsp
-; X64-AVX512VL-NEXT:    vxorps %ymm0, %ymm0, %ymm0
-; X64-AVX512VL-NEXT:    vmovaps %ymm0, (%rsp)
-; X64-AVX512VL-NEXT:    movw (%rdi), %ax
-; X64-AVX512VL-NEXT:    vmovd %eax, %xmm1
-; X64-AVX512VL-NEXT:    vpbroadcastw %xmm1, %ymm1
-; X64-AVX512VL-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; X64-AVX512VL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
-; X64-AVX512VL-NEXT:    movq %rbp, %rsp
-; X64-AVX512VL-NEXT:    popq %rbp
-; X64-AVX512VL-NEXT:    retq
+; X64-LABEL: isel_crash_16w:
+; X64:       ## BB#0: ## %eintry
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:  Lcfi3:
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:  Lcfi4:
+; X64-NEXT:    .cfi_offset %rbp, -16
+; X64-NEXT:    movq %rsp, %rbp
+; X64-NEXT:  Lcfi5:
+; X64-NEXT:    .cfi_def_cfa_register %rbp
+; X64-NEXT:    andq $-32, %rsp
+; X64-NEXT:    subq $128, %rsp
+; X64-NEXT:    vxorps %ymm0, %ymm0, %ymm0
+; X64-NEXT:    vmovaps %ymm0, (%rsp)
+; X64-NEXT:    movw (%rdi), %ax
+; X64-NEXT:    vmovd %eax, %xmm1
+; X64-NEXT:    vpbroadcastw %xmm1, %ymm1
+; X64-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
+; X64-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
+; X64-NEXT:    movq %rbp, %rsp
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    vzeroupper
+; X64-NEXT:    retq
 eintry:
   %__a.addr.i = alloca <4 x i64>, align 16
   %__b.addr.i = alloca <4 x i64>, align 16
@@ -1419,7 +1329,7 @@ define void @isel_crash_4d(i32* %cV_R.addr) {
 ; X64-AVX512VL-NEXT:    movl (%rdi), %eax
 ; X64-AVX512VL-NEXT:    vpbroadcastd %eax, %xmm1
 ; X64-AVX512VL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512VL-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-AVX512VL-NEXT:    vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-AVX512VL-NEXT:    retq
 entry:
   %__a.addr.i = alloca <2 x i64>, align 16
@@ -1437,28 +1347,28 @@ entry:
 }
 
 define void @isel_crash_8d(i32* %cV_R.addr) {
-; X32-AVX2-LABEL: isel_crash_8d:
-; X32-AVX2:       ## BB#0: ## %eintry
-; X32-AVX2-NEXT:    pushl %ebp
-; X32-AVX2-NEXT:  Lcfi9:
-; X32-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X32-AVX2-NEXT:  Lcfi10:
-; X32-AVX2-NEXT:    .cfi_offset %ebp, -8
-; X32-AVX2-NEXT:    movl %esp, %ebp
-; X32-AVX2-NEXT:  Lcfi11:
-; X32-AVX2-NEXT:    .cfi_def_cfa_register %ebp
-; X32-AVX2-NEXT:    andl $-32, %esp
-; X32-AVX2-NEXT:    subl $128, %esp
-; X32-AVX2-NEXT:    movl 8(%ebp), %eax
-; X32-AVX2-NEXT:    vxorps %ymm0, %ymm0, %ymm0
-; X32-AVX2-NEXT:    vmovaps %ymm0, (%esp)
-; X32-AVX2-NEXT:    vbroadcastss (%eax), %ymm1
-; X32-AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X32-AVX2-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
-; X32-AVX2-NEXT:    movl %ebp, %esp
-; X32-AVX2-NEXT:    popl %ebp
-; X32-AVX2-NEXT:    vzeroupper
-; X32-AVX2-NEXT:    retl
+; X32-LABEL: isel_crash_8d:
+; X32:       ## BB#0: ## %eintry
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:  Lcfi9:
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:  Lcfi10:
+; X32-NEXT:    .cfi_offset %ebp, -8
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:  Lcfi11:
+; X32-NEXT:    .cfi_def_cfa_register %ebp
+; X32-NEXT:    andl $-32, %esp
+; X32-NEXT:    subl $128, %esp
+; X32-NEXT:    movl 8(%ebp), %eax
+; X32-NEXT:    vxorps %ymm0, %ymm0, %ymm0
+; X32-NEXT:    vmovaps %ymm0, (%esp)
+; X32-NEXT:    vbroadcastss (%eax), %ymm1
+; X32-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ebp, %esp
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    vzeroupper
+; X32-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: isel_crash_8d:
 ; X64-AVX2:       ## BB#0: ## %eintry
@@ -1484,28 +1394,6 @@ define void @isel_crash_8d(i32* %cV_R.addr) {
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
 ;
-; X32-AVX512VL-LABEL: isel_crash_8d:
-; X32-AVX512VL:       ## BB#0: ## %eintry
-; X32-AVX512VL-NEXT:    pushl %ebp
-; X32-AVX512VL-NEXT:  Lcfi9:
-; X32-AVX512VL-NEXT:    .cfi_def_cfa_offset 8
-; X32-AVX512VL-NEXT:  Lcfi10:
-; X32-AVX512VL-NEXT:    .cfi_offset %ebp, -8
-; X32-AVX512VL-NEXT:    movl %esp, %ebp
-; X32-AVX512VL-NEXT:  Lcfi11:
-; X32-AVX512VL-NEXT:    .cfi_def_cfa_register %ebp
-; X32-AVX512VL-NEXT:    andl $-32, %esp
-; X32-AVX512VL-NEXT:    subl $128, %esp
-; X32-AVX512VL-NEXT:    movl 8(%ebp), %eax
-; X32-AVX512VL-NEXT:    vxorps %ymm0, %ymm0, %ymm0
-; X32-AVX512VL-NEXT:    vmovaps %ymm0, (%esp)
-; X32-AVX512VL-NEXT:    vbroadcastss (%eax), %ymm1
-; X32-AVX512VL-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X32-AVX512VL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%esp)
-; X32-AVX512VL-NEXT:    movl %ebp, %esp
-; X32-AVX512VL-NEXT:    popl %ebp
-; X32-AVX512VL-NEXT:    retl
-;
 ; X64-AVX512VL-LABEL: isel_crash_8d:
 ; X64-AVX512VL:       ## BB#0: ## %eintry
 ; X64-AVX512VL-NEXT:    pushq %rbp
@@ -1523,9 +1411,10 @@ define void @isel_crash_8d(i32* %cV_R.addr) {
 ; X64-AVX512VL-NEXT:    movl (%rdi), %eax
 ; X64-AVX512VL-NEXT:    vpbroadcastd %eax, %ymm1
 ; X64-AVX512VL-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; X64-AVX512VL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-AVX512VL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
 ; X64-AVX512VL-NEXT:    movq %rbp, %rsp
 ; X64-AVX512VL-NEXT:    popq %rbp
+; X64-AVX512VL-NEXT:    vzeroupper
 ; X64-AVX512VL-NEXT:    retq
 eintry:
   %__a.addr.i = alloca <4 x i64>, align 16
@@ -1580,7 +1469,7 @@ define void @isel_crash_2q(i64* %cV_R.addr) {
 ; X64-AVX512VL-NEXT:    movq (%rdi), %rax
 ; X64-AVX512VL-NEXT:    vpbroadcastq %rax, %xmm1
 ; X64-AVX512VL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-AVX512VL-NEXT:    vmovaps %xmm1, -{{[0-9]+}}(%rsp)
+; X64-AVX512VL-NEXT:    vmovdqa %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-AVX512VL-NEXT:    retq
 entry:
   %__a.addr.i = alloca <2 x i64>, align 16
@@ -1597,34 +1486,34 @@ entry:
 }
 
 define void @isel_crash_4q(i64* %cV_R.addr) {
-; X32-AVX2-LABEL: isel_crash_4q:
-; X32-AVX2:       ## BB#0: ## %eintry
-; X32-AVX2-NEXT:    pushl %ebp
-; X32-AVX2-NEXT:  Lcfi13:
-; X32-AVX2-NEXT:    .cfi_def_cfa_offset 8
-; X32-AVX2-NEXT:  Lcfi14:
-; X32-AVX2-NEXT:    .cfi_offset %ebp, -8
-; X32-AVX2-NEXT:    movl %esp, %ebp
-; X32-AVX2-NEXT:  Lcfi15:
-; X32-AVX2-NEXT:    .cfi_def_cfa_register %ebp
-; X32-AVX2-NEXT:    andl $-32, %esp
-; X32-AVX2-NEXT:    subl $128, %esp
-; X32-AVX2-NEXT:    movl 8(%ebp), %eax
-; X32-AVX2-NEXT:    vxorps %ymm0, %ymm0, %ymm0
-; X32-AVX2-NEXT:    vmovaps %ymm0, (%esp)
-; X32-AVX2-NEXT:    movl (%eax), %ecx
-; X32-AVX2-NEXT:    movl 4(%eax), %eax
-; X32-AVX2-NEXT:    vmovd %ecx, %xmm1
-; X32-AVX2-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X32-AVX2-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
-; X32-AVX2-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
-; X32-AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1
-; X32-AVX2-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X32-AVX2-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%esp)
-; X32-AVX2-NEXT:    movl %ebp, %esp
-; X32-AVX2-NEXT:    popl %ebp
-; X32-AVX2-NEXT:    vzeroupper
-; X32-AVX2-NEXT:    retl
+; X32-LABEL: isel_crash_4q:
+; X32:       ## BB#0: ## %eintry
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:  Lcfi13:
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:  Lcfi14:
+; X32-NEXT:    .cfi_offset %ebp, -8
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:  Lcfi15:
+; X32-NEXT:    .cfi_def_cfa_register %ebp
+; X32-NEXT:    andl $-32, %esp
+; X32-NEXT:    subl $128, %esp
+; X32-NEXT:    movl 8(%ebp), %eax
+; X32-NEXT:    vxorps %ymm0, %ymm0, %ymm0
+; X32-NEXT:    vmovaps %ymm0, (%esp)
+; X32-NEXT:    movl (%eax), %ecx
+; X32-NEXT:    movl 4(%eax), %eax
+; X32-NEXT:    vmovd %ecx, %xmm1
+; X32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
+; X32-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
+; X32-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
+; X32-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1
+; X32-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
+; X32-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%esp)
+; X32-NEXT:    movl %ebp, %esp
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    vzeroupper
+; X32-NEXT:    retl
 ;
 ; X64-AVX2-LABEL: isel_crash_4q:
 ; X64-AVX2:       ## BB#0: ## %eintry
@@ -1650,34 +1539,6 @@ define void @isel_crash_4q(i64* %cV_R.addr) {
 ; X64-AVX2-NEXT:    vzeroupper
 ; X64-AVX2-NEXT:    retq
 ;
-; X32-AVX512VL-LABEL: isel_crash_4q:
-; X32-AVX512VL:       ## BB#0: ## %eintry
-; X32-AVX512VL-NEXT:    pushl %ebp
-; X32-AVX512VL-NEXT:  Lcfi13:
-; X32-AVX512VL-NEXT:    .cfi_def_cfa_offset 8
-; X32-AVX512VL-NEXT:  Lcfi14:
-; X32-AVX512VL-NEXT:    .cfi_offset %ebp, -8
-; X32-AVX512VL-NEXT:    movl %esp, %ebp
-; X32-AVX512VL-NEXT:  Lcfi15:
-; X32-AVX512VL-NEXT:    .cfi_def_cfa_register %ebp
-; X32-AVX512VL-NEXT:    andl $-32, %esp
-; X32-AVX512VL-NEXT:    subl $128, %esp
-; X32-AVX512VL-NEXT:    movl 8(%ebp), %eax
-; X32-AVX512VL-NEXT:    vxorps %ymm0, %ymm0, %ymm0
-; X32-AVX512VL-NEXT:    vmovaps %ymm0, (%esp)
-; X32-AVX512VL-NEXT:    movl (%eax), %ecx
-; X32-AVX512VL-NEXT:    movl 4(%eax), %eax
-; X32-AVX512VL-NEXT:    vmovd %ecx, %xmm1
-; X32-AVX512VL-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; X32-AVX512VL-NEXT:    vpinsrd $2, %ecx, %xmm1, %xmm1
-; X32-AVX512VL-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm1
-; X32-AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm1, %ymm1
-; X32-AVX512VL-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%esp)
-; X32-AVX512VL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%esp)
-; X32-AVX512VL-NEXT:    movl %ebp, %esp
-; X32-AVX512VL-NEXT:    popl %ebp
-; X32-AVX512VL-NEXT:    retl
-;
 ; X64-AVX512VL-LABEL: isel_crash_4q:
 ; X64-AVX512VL:       ## BB#0: ## %eintry
 ; X64-AVX512VL-NEXT:    pushq %rbp
@@ -1695,9 +1556,10 @@ define void @isel_crash_4q(i64* %cV_R.addr) {
 ; X64-AVX512VL-NEXT:    movq (%rdi), %rax
 ; X64-AVX512VL-NEXT:    vpbroadcastq %rax, %ymm1
 ; X64-AVX512VL-NEXT:    vmovaps %ymm0, {{[0-9]+}}(%rsp)
-; X64-AVX512VL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; X64-AVX512VL-NEXT:    vmovdqa %ymm1, {{[0-9]+}}(%rsp)
 ; X64-AVX512VL-NEXT:    movq %rbp, %rsp
 ; X64-AVX512VL-NEXT:    popq %rbp
+; X64-AVX512VL-NEXT:    vzeroupper
 ; X64-AVX512VL-NEXT:    retq
 eintry:
   %__a.addr.i = alloca <4 x i64>, align 16
diff --git a/test/CodeGen/X86/avx2-vector-shifts.ll b/test/CodeGen/X86/avx2-vector-shifts.ll
index d509046cccd52eb5f6ad1e6c9e6c87381adfde79..45a1cd97503849de9355fba34a4c0a29373d69b2 100644
--- a/test/CodeGen/X86/avx2-vector-shifts.ll
+++ b/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -407,7 +407,7 @@ define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
 ; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X32-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X32-NEXT:    vzeroupper
@@ -418,7 +418,7 @@ define <8 x i16> @shl_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
 ; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X64-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X64-NEXT:    vzeroupper
@@ -499,7 +499,7 @@ define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
 ; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X32-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; X32-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X32-NEXT:    vzeroupper
@@ -510,7 +510,7 @@ define <8 x i16> @ashr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
 ; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X64-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; X64-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X64-NEXT:    vzeroupper
@@ -617,7 +617,7 @@ define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
 ; X32-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X32-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X32-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; X32-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X32-NEXT:    vzeroupper
@@ -628,7 +628,7 @@ define <8 x i16> @lshr_8i16(<8 x i16> %r, <8 x i16> %a) nounwind {
 ; X64-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; X64-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; X64-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; X64-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; X64-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/avx512-adc-sbb.ll b/test/CodeGen/X86/avx512-adc-sbb.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c994fdef6919d7f75882d6d7f0afef314598fa40
--- /dev/null
+++ b/test/CodeGen/X86/avx512-adc-sbb.ll
@@ -0,0 +1,27 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=avx512f %s -o - | FileCheck %s
+
+; This asserted because we didn't account for a zext of a non-SETCC node:
+; https://bugs.llvm.org/show_bug.cgi?id=32316
+
+define i8 @PR32316(i8 %t1, i32 %t5, i8 %t8)  {
+; CHECK-LABEL: PR32316:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %dil, %dil
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    cmpl %esi, %eax
+; CHECK-NEXT:    seta %al
+; CHECK-NEXT:    cmpb $1, %dl
+; CHECK-NEXT:    sbbb $-1, %al
+; CHECK-NEXT:    retq
+  %t2 = icmp eq i8 %t1, 0
+  %t3 = zext i1 %t2 to i32
+  %t6 = icmp ugt i32 %t3, %t5
+  %t7 = zext i1 %t6 to i8
+  %t9 = icmp ne i8 %t8, 0
+  %t10 = zext i1 %t9 to i8
+  %t11 = add i8 %t7, %t10
+  ret i8 %t11
+}
+
diff --git a/test/CodeGen/X86/avx512-any_extend_load.ll b/test/CodeGen/X86/avx512-any_extend_load.ll
index 87f8cc9a418e09b0e22a92bd2e3a51225c6255b6..f6ab0044ee80ab7d6e2219d88c5b1bfb20c61758 100644
--- a/test/CodeGen/X86/avx512-any_extend_load.ll
+++ b/test/CodeGen/X86/avx512-any_extend_load.ll
@@ -4,12 +4,20 @@
 
 
 define void @any_extend_load_v8i64(<8 x i8> * %ptr) {
-; ALL-LABEL: any_extend_load_v8i64:
-; ALL:       # BB#0:
-; ALL-NEXT:    vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
-; ALL-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
-; ALL-NEXT:    vpmovqb %zmm0, (%rdi)
-; ALL-NEXT:    retq
+; KNL-LABEL: any_extend_load_v8i64:
+; KNL:       # BB#0:
+; KNL-NEXT:    vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
+; KNL-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; KNL-NEXT:    vpmovqb %zmm0, (%rdi)
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: any_extend_load_v8i64:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpmovzxbq {{.*#+}} zmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero,mem[2],zero,zero,zero,zero,zero,zero,zero,mem[3],zero,zero,zero,zero,zero,zero,zero,mem[4],zero,zero,zero,zero,zero,zero,zero,mem[5],zero,zero,zero,zero,zero,zero,zero,mem[6],zero,zero,zero,zero,zero,zero,zero,mem[7],zero,zero,zero,zero,zero,zero,zero
+; SKX-NEXT:    vpaddq {{.*}}(%rip){1to8}, %zmm0, %zmm0
+; SKX-NEXT:    vpmovqb %zmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
   %wide.load = load <8 x i8>, <8 x i8>* %ptr, align 1
   %1 = zext <8 x i8> %wide.load to <8 x i64>
   %2 = add nuw nsw <8 x i64> %1, <i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4, i64 4>
@@ -33,6 +41,7 @@ define void @any_extend_load_v8i32(<8 x i8> * %ptr) {
 ; SKX-NEXT:    vpmovzxbd {{.*#+}} ymm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
 ; SKX-NEXT:    vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm0
 ; SKX-NEXT:    vpmovdb %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %wide.load = load <8 x i8>, <8 x i8>* %ptr, align 1
   %1 = zext <8 x i8> %wide.load to <8 x i32>
diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll
index 5bb21ef5aa25f56e7fc7ec20965615d6ac0508c6..26be20840563e9765a561634b2c8d288b15be155 100644
--- a/test/CodeGen/X86/avx512-arith.ll
+++ b/test/CodeGen/X86/avx512-arith.ll
@@ -233,6 +233,7 @@ define <2 x i64> @imulq128(<2 x i64> %y, <2 x i64> %x) {
 ; AVX512DQ-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vpmullq %zmm0, %zmm1, %zmm0
 ; AVX512DQ-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; SKX-LABEL: imulq128:
diff --git a/test/CodeGen/X86/avx512-calling-conv.ll b/test/CodeGen/X86/avx512-calling-conv.ll
index 1a91bc1dee9a7710d95ff2cd610d362a7fc9ddce..138b8750633c355368fa590e79f642222a31eab0 100644
--- a/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/test/CodeGen/X86/avx512-calling-conv.ll
@@ -140,6 +140,7 @@ define <8 x i32> @test5(<8 x i32>%a, <8 x i32>%b) {
 ; SKX-NEXT:    .cfi_def_cfa_offset 16
 ; SKX-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0
 ; SKX-NEXT:    vpmovm2w %k0, %xmm0
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    callq _func8xi1
 ; SKX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; SKX-NEXT:    vpslld $31, %ymm0, %ymm0
@@ -192,6 +193,7 @@ define <16 x i32> @test6(<16 x i32>%a, <16 x i32>%b) {
 ; SKX-NEXT:    .cfi_def_cfa_offset 16
 ; SKX-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; SKX-NEXT:    vpmovm2b %k0, %xmm0
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    callq _func16xi1
 ; SKX-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; SKX-NEXT:    vpslld $31, %zmm0, %zmm0
@@ -291,11 +293,12 @@ define <8 x i1> @test7a(<8 x i32>%a, <8 x i32>%b) {
 ; SKX-NEXT:    .cfi_def_cfa_offset 16
 ; SKX-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0
 ; SKX-NEXT:    vpmovm2w %k0, %xmm0
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    callq _func8xi1
 ; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; SKX-NEXT:    vpmovw2m %xmm0, %k0
 ; SKX-NEXT:    movb $85, %al
-; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    kandb %k1, %k0, %k0
 ; SKX-NEXT:    vpmovm2w %k0, %xmm0
 ; SKX-NEXT:    popq %rax
diff --git a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
index 9775c79796f7ffff2693f444a7c68dac34ed7db2..63b0281a7339926428f17259d1266fd877289b7c 100644
--- a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
+++ b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll
@@ -19,6 +19,7 @@ define zeroext i16 @cmp_kor_seq_16(<16 x float> %a, <16 x float> %b, <16 x float
 ; CHECK-NEXT:    korw %k3, %k2, %k1
 ; CHECK-NEXT:    korw %k1, %k0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
 entry:
   %0 = tail call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %x, i32 13, i16 -1, i32 4)
diff --git a/test/CodeGen/X86/avx512-cmp.ll b/test/CodeGen/X86/avx512-cmp.ll
index 78df51be5c3e1ec3bd3cb10b36a63c67a65e6b33..c1b64743f89853cded5f6ba546423b37d655923e 100644
--- a/test/CodeGen/X86/avx512-cmp.ll
+++ b/test/CodeGen/X86/avx512-cmp.ll
@@ -69,13 +69,14 @@ define float @test5(float %p) #0 {
 ; ALL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; ALL-NEXT:    vucomiss %xmm1, %xmm0
 ; ALL-NEXT:    jne LBB3_1
-; ALL-NEXT:    jnp LBB3_2
+; ALL-NEXT:    jp  LBB3_1
+; ALL-NEXT:  ## BB#2: ## %return
+; ALL-NEXT:    retq
 ; ALL-NEXT:  LBB3_1: ## %if.end
 ; ALL-NEXT:    seta %al
 ; ALL-NEXT:    movzbl %al, %eax
 ; ALL-NEXT:    leaq {{.*}}(%rip), %rcx
 ; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; ALL-NEXT:  LBB3_2: ## %return
 ; ALL-NEXT:    retq
 entry:
   %cmp = fcmp oeq float %p, 0.000000e+00
@@ -119,12 +120,12 @@ entry:
 define i32 @test8(i32 %a1, i32 %a2, i32 %a3) {
 ; ALL-LABEL: test8:
 ; ALL:       ## BB#0:
+; ALL-NEXT:    notl %edi
+; ALL-NEXT:    xorl $-2147483648, %esi ## imm = 0x80000000
 ; ALL-NEXT:    testl %edx, %edx
 ; ALL-NEXT:    movl $1, %eax
 ; ALL-NEXT:    cmovel %eax, %edx
-; ALL-NEXT:    cmpl $-2147483648, %esi ## imm = 0x80000000
-; ALL-NEXT:    cmovnel %edx, %eax
-; ALL-NEXT:    cmpl $-1, %edi
+; ALL-NEXT:    orl %edi, %esi
 ; ALL-NEXT:    cmovnel %edx, %eax
 ; ALL-NEXT:    retq
   %tmp1 = icmp eq i32 %a1, -1
@@ -157,26 +158,47 @@ B:
 }
 
 define i32 @test10(i64 %b, i64 %c, i1 %d) {
-; ALL-LABEL: test10:
-; ALL:       ## BB#0:
-; ALL-NEXT:    andl $1, %edx
-; ALL-NEXT:    kmovw %edx, %k0
-; ALL-NEXT:    cmpq %rsi, %rdi
-; ALL-NEXT:    sete %al
-; ALL-NEXT:    andl $1, %eax
-; ALL-NEXT:    kmovw %eax, %k1
-; ALL-NEXT:    korw %k1, %k0, %k1
-; ALL-NEXT:    kxorw %k1, %k0, %k0
-; ALL-NEXT:    kmovw %k0, %eax
-; ALL-NEXT:    andl $1, %eax
-; ALL-NEXT:    testb %al, %al
-; ALL-NEXT:    je LBB8_1
-; ALL-NEXT:  ## BB#2: ## %if.end.i
-; ALL-NEXT:    movl $6, %eax
-; ALL-NEXT:    retq
-; ALL-NEXT:  LBB8_1: ## %if.then.i
-; ALL-NEXT:    movl $5, %eax
-; ALL-NEXT:    retq
+; KNL-LABEL: test10:
+; KNL:       ## BB#0:
+; KNL-NEXT:    andl $1, %edx
+; KNL-NEXT:    kmovw %edx, %k0
+; KNL-NEXT:    cmpq %rsi, %rdi
+; KNL-NEXT:    sete %al
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    korw %k1, %k0, %k1
+; KNL-NEXT:    kxorw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    testb %al, %al
+; KNL-NEXT:    je LBB8_1
+; KNL-NEXT:  ## BB#2: ## %if.end.i
+; KNL-NEXT:    movl $6, %eax
+; KNL-NEXT:    retq
+; KNL-NEXT:  LBB8_1: ## %if.then.i
+; KNL-NEXT:    movl $5, %eax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test10:
+; SKX:       ## BB#0:
+; SKX-NEXT:    andl $1, %edx
+; SKX-NEXT:    kmovd %edx, %k0
+; SKX-NEXT:    cmpq %rsi, %rdi
+; SKX-NEXT:    sete %al
+; SKX-NEXT:    andl $1, %eax
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    korw %k1, %k0, %k1
+; SKX-NEXT:    kxorw %k1, %k0, %k0
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    andl $1, %eax
+; SKX-NEXT:    testb %al, %al
+; SKX-NEXT:    je LBB8_1
+; SKX-NEXT:  ## BB#2: ## %if.end.i
+; SKX-NEXT:    movl $6, %eax
+; SKX-NEXT:    retq
+; SKX-NEXT:  LBB8_1: ## %if.then.i
+; SKX-NEXT:    movl $5, %eax
+; SKX-NEXT:    retq
 
   %cmp8.i = icmp eq i64 %b, %c
   %or1 = or i1 %d, %cmp8.i
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index 87deeb9e16c03e8bdbfcdb96f2f1cddd5707a34c..2b55372f30667183eb17518765ad7fa73c0226db 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl  | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=KNL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx  | FileCheck %s --check-prefix=ALL --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl  | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512dq  | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=DQ --check-prefix=AVX512DQ
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512bw  | FileCheck %s --check-prefix=ALL --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512dq  | FileCheck %s --check-prefix=ALL --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512bw  | FileCheck %s --check-prefix=ALL --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx  | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLBW --check-prefix=SKX
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl  | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLNOBW --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512dq  | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=DQ --check-prefix=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512bw  | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NOVL --check-prefix=NODQ --check-prefix=NOVLDQ --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512dq  | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=DQ --check-prefix=VL --check-prefix=VLDQ --check-prefix=VLNOBW --check-prefix=AVX512VLDQ
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=avx512vl,avx512bw  | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=NODQ --check-prefix=VL --check-prefix=VLNODQ --check-prefix=VLBW --check-prefix=AVX512VLBW
 
 
 define <16 x float> @sitof32(<16 x i32> %a) nounwind {
@@ -110,40 +110,78 @@ define <2 x float> @sltof2f32(<2 x i64> %a) {
 ; AVX512DQ-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
 ; AVX512DQ-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %b = sitofp <2 x i64> %a to <2 x float>
   ret <2 x float>%b
 }
 
 define <4 x float> @sltof4f32_mem(<4 x i64>* %a) {
-; NODQ-LABEL: sltof4f32_mem:
-; NODQ:       ## BB#0:
-; NODQ-NEXT:    vmovdqu (%rdi), %ymm0
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; NODQ-NEXT:    retq
+; KNL-LABEL: sltof4f32_mem:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vmovdqu (%rdi), %ymm0
+; KNL-NEXT:    vpextrq $1, %xmm0, %rax
+; KNL-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; KNL-NEXT:    vmovq %xmm0, %rax
+; KNL-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vmovq %xmm0, %rax
+; KNL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; KNL-NEXT:    vpextrq $1, %xmm0, %rax
+; KNL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; KNL-NEXT:    retq
 ;
 ; VLDQ-LABEL: sltof4f32_mem:
 ; VLDQ:       ## BB#0:
 ; VLDQ-NEXT:    vcvtqq2psy (%rdi), %xmm0
 ; VLDQ-NEXT:    retq
 ;
+; VLNODQ-LABEL: sltof4f32_mem:
+; VLNODQ:       ## BB#0:
+; VLNODQ-NEXT:    vmovdqu (%rdi), %ymm0
+; VLNODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; VLNODQ-NEXT:    vmovq %xmm0, %rax
+; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; VLNODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; VLNODQ-NEXT:    vmovq %xmm0, %rax
+; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; VLNODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; VLNODQ-NEXT:    vzeroupper
+; VLNODQ-NEXT:    retq
+;
 ; AVX512DQ-LABEL: sltof4f32_mem:
 ; AVX512DQ:       ## BB#0:
 ; AVX512DQ-NEXT:    vmovups (%rdi), %ymm0
 ; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
 ; AVX512DQ-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: sltof4f32_mem:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX512BW-NEXT:    vmovq %xmm0, %rax
+; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, %rax
+; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %a1 = load <4 x i64>, <4 x i64>* %a, align 8
   %b = sitofp <4 x i64> %a1 to <4 x float>
   ret <4 x float>%b
@@ -218,65 +256,137 @@ define <4 x i64> @f32tosl(<4 x float> %a) {
 }
 
 define <4 x float> @sltof432(<4 x i64> %a) {
-; NODQ-LABEL: sltof432:
-; NODQ:       ## BB#0:
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; NODQ-NEXT:    retq
+; KNL-LABEL: sltof432:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpextrq $1, %xmm0, %rax
+; KNL-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; KNL-NEXT:    vmovq %xmm0, %rax
+; KNL-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vmovq %xmm0, %rax
+; KNL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; KNL-NEXT:    vpextrq $1, %xmm0, %rax
+; KNL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; KNL-NEXT:    retq
 ;
 ; VLDQ-LABEL: sltof432:
 ; VLDQ:       ## BB#0:
 ; VLDQ-NEXT:    vcvtqq2ps %ymm0, %xmm0
+; VLDQ-NEXT:    vzeroupper
 ; VLDQ-NEXT:    retq
 ;
+; VLNODQ-LABEL: sltof432:
+; VLNODQ:       ## BB#0:
+; VLNODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; VLNODQ-NEXT:    vmovq %xmm0, %rax
+; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; VLNODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; VLNODQ-NEXT:    vmovq %xmm0, %rax
+; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; VLNODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; VLNODQ-NEXT:    vzeroupper
+; VLNODQ-NEXT:    retq
+;
 ; AVX512DQ-LABEL: sltof432:
 ; AVX512DQ:       ## BB#0:
 ; AVX512DQ-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
 ; AVX512DQ-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: sltof432:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm1, %xmm1
+; AVX512BW-NEXT:    vmovq %xmm0, %rax
+; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm2
+; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, %rax
+; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm2
+; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512BW-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
+; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %b = sitofp <4 x i64> %a to <4 x float>
   ret <4 x float> %b
 }
 
 define <4 x float> @ultof432(<4 x i64> %a) {
-; NODQ-LABEL: ultof432:
-; NODQ:       ## BB#0:
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
-; NODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
-; NODQ-NEXT:    vmovq %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm2
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
-; NODQ-NEXT:    vpextrq $1, %xmm0, %rax
-; NODQ-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
-; NODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; NODQ-NEXT:    retq
+; KNL-LABEL: ultof432:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpextrq $1, %xmm0, %rax
+; KNL-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
+; KNL-NEXT:    vmovq %xmm0, %rax
+; KNL-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
+; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vmovq %xmm0, %rax
+; KNL-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm2
+; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; KNL-NEXT:    vpextrq $1, %xmm0, %rax
+; KNL-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
+; KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; KNL-NEXT:    retq
 ;
 ; VLDQ-LABEL: ultof432:
 ; VLDQ:       ## BB#0:
 ; VLDQ-NEXT:    vcvtuqq2ps %ymm0, %xmm0
+; VLDQ-NEXT:    vzeroupper
 ; VLDQ-NEXT:    retq
 ;
+; VLNODQ-LABEL: ultof432:
+; VLNODQ:       ## BB#0:
+; VLNODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
+; VLNODQ-NEXT:    vmovq %xmm0, %rax
+; VLNODQ-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
+; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; VLNODQ-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; VLNODQ-NEXT:    vmovq %xmm0, %rax
+; VLNODQ-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm2
+; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; VLNODQ-NEXT:    vpextrq $1, %xmm0, %rax
+; VLNODQ-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
+; VLNODQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; VLNODQ-NEXT:    vzeroupper
+; VLNODQ-NEXT:    retq
+;
 ; AVX512DQ-LABEL: ultof432:
 ; AVX512DQ:       ## BB#0:
 ; AVX512DQ-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
 ; AVX512DQ-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: ultof432:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512BW-NEXT:    vcvtusi2ssq %rax, %xmm1, %xmm1
+; AVX512BW-NEXT:    vmovq %xmm0, %rax
+; AVX512BW-NEXT:    vcvtusi2ssq %rax, %xmm2, %xmm2
+; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3]
+; AVX512BW-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512BW-NEXT:    vmovq %xmm0, %rax
+; AVX512BW-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm2
+; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3]
+; AVX512BW-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX512BW-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
+; AVX512BW-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %b = uitofp <4 x i64> %a to <4 x float>
   ret <4 x float> %b
 }
@@ -355,17 +465,33 @@ define <8 x i32> @fptoui_256(<8 x float> %a) nounwind {
 }
 
 define <4 x i32> @fptoui_128(<4 x float> %a) nounwind {
-; NOVL-LABEL: fptoui_128:
-; NOVL:       ## BB#0:
-; NOVL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; NOVL-NEXT:    vcvttps2udq %zmm0, %zmm0
-; NOVL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; NOVL-NEXT:    retq
+; KNL-LABEL: fptoui_128:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL-NEXT:    vcvttps2udq %zmm0, %zmm0
+; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT:    retq
 ;
 ; VL-LABEL: fptoui_128:
 ; VL:       ## BB#0:
 ; VL-NEXT:    vcvttps2udq %xmm0, %xmm0
 ; VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_128:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT:    vcvttps2udq %zmm0, %zmm0
+; AVX512DQ-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: fptoui_128:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW-NEXT:    vcvttps2udq %zmm0, %zmm0
+; AVX512BW-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %b = fptoui <4 x float> %a to <4 x i32>
   ret <4 x i32> %b
 }
@@ -380,17 +506,34 @@ define <8 x i32> @fptoui01(<8 x double> %a) nounwind {
 }
 
 define <4 x i32> @fptoui_256d(<4 x double> %a) nounwind {
-; NOVL-LABEL: fptoui_256d:
-; NOVL:       ## BB#0:
-; NOVL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; NOVL-NEXT:    vcvttpd2udq %zmm0, %ymm0
-; NOVL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; NOVL-NEXT:    retq
+; KNL-LABEL: fptoui_256d:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT:    vcvttpd2udq %zmm0, %ymm0
+; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT:    retq
 ;
 ; VL-LABEL: fptoui_256d:
 ; VL:       ## BB#0:
 ; VL-NEXT:    vcvttpd2udq %ymm0, %xmm0
+; VL-NEXT:    vzeroupper
 ; VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptoui_256d:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT:    vcvttpd2udq %zmm0, %ymm0
+; AVX512DQ-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: fptoui_256d:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW-NEXT:    vcvttpd2udq %zmm0, %ymm0
+; AVX512BW-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %b = fptoui <4 x double> %a to <4 x i32>
   ret <4 x i32> %b
 }
@@ -404,34 +547,70 @@ define <8 x double> @sitof64(<8 x i32> %a) {
   ret <8 x double> %b
 }
 define <8 x double> @sitof64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
-; NODQ-LABEL: sitof64_mask:
-; NODQ:       ## BB#0:
-; NODQ-NEXT:    kmovw %edi, %k1
-; NODQ-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
-; NODQ-NEXT:    retq
+; KNL-LABEL: sitof64_mask:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
+; KNL-NEXT:    retq
 ;
-; DQ-LABEL: sitof64_mask:
-; DQ:       ## BB#0:
-; DQ-NEXT:    kmovb %edi, %k1
-; DQ-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
-; DQ-NEXT:    retq
+; VLBW-LABEL: sitof64_mask:
+; VLBW:       ## BB#0:
+; VLBW-NEXT:    kmovd %edi, %k1
+; VLBW-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
+; VLBW-NEXT:    retq
+;
+; VLNOBW-LABEL: sitof64_mask:
+; VLNOBW:       ## BB#0:
+; VLNOBW-NEXT:    kmovw %edi, %k1
+; VLNOBW-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
+; VLNOBW-NEXT:    retq
+;
+; AVX512DQ-LABEL: sitof64_mask:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw %edi, %k1
+; AVX512DQ-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: sitof64_mask:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vcvtdq2pd %ymm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    retq
   %1 = bitcast i8 %c to <8 x i1>
   %2 = sitofp <8 x i32> %b to <8 x double>
   %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
   ret <8 x double> %3
 }
 define <8 x double> @sitof64_maskz(<8 x i32> %a, i8 %b) nounwind {
-; NODQ-LABEL: sitof64_maskz:
-; NODQ:       ## BB#0:
-; NODQ-NEXT:    kmovw %edi, %k1
-; NODQ-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
-; NODQ-NEXT:    retq
+; KNL-LABEL: sitof64_maskz:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    retq
 ;
-; DQ-LABEL: sitof64_maskz:
-; DQ:       ## BB#0:
-; DQ-NEXT:    kmovb %edi, %k1
-; DQ-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
-; DQ-NEXT:    retq
+; VLBW-LABEL: sitof64_maskz:
+; VLBW:       ## BB#0:
+; VLBW-NEXT:    kmovd %edi, %k1
+; VLBW-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
+; VLBW-NEXT:    retq
+;
+; VLNOBW-LABEL: sitof64_maskz:
+; VLNOBW:       ## BB#0:
+; VLNOBW-NEXT:    kmovw %edi, %k1
+; VLNOBW-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
+; VLNOBW-NEXT:    retq
+;
+; AVX512DQ-LABEL: sitof64_maskz:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw %edi, %k1
+; AVX512DQ-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: sitof64_maskz:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vcvtdq2pd %ymm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %1 = bitcast i8 %b to <8 x i1>
   %2 = sitofp <8 x i32> %a to <8 x double>
   %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
@@ -448,10 +627,16 @@ define <8 x i32> @fptosi01(<8 x double> %a) {
 }
 
 define <4 x i32> @fptosi03(<4 x double> %a) {
-; ALL-LABEL: fptosi03:
-; ALL:       ## BB#0:
-; ALL-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; ALL-NEXT:    retq
+; KNL-LABEL: fptosi03:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; KNL-NEXT:    retq
+;
+; AVX512-LABEL: fptosi03:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %b = fptosi <4 x double> %a to <4 x i32>
   ret <4 x i32> %b
 }
@@ -475,29 +660,54 @@ define <16 x float> @fptrunc00(<16 x double> %b) nounwind {
 }
 
 define <4 x float> @fptrunc01(<4 x double> %b) {
-; ALL-LABEL: fptrunc01:
-; ALL:       ## BB#0:
-; ALL-NEXT:    vcvtpd2ps %ymm0, %xmm0
-; ALL-NEXT:    retq
+; KNL-LABEL: fptrunc01:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vcvtpd2ps %ymm0, %xmm0
+; KNL-NEXT:    retq
+;
+; AVX512-LABEL: fptrunc01:
+; AVX512:       ## BB#0:
+; AVX512-NEXT:    vcvtpd2ps %ymm0, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
   %a = fptrunc <4 x double> %b to <4 x float>
   ret <4 x float> %a
 }
 
 define <4 x float> @fptrunc02(<4 x double> %b, <4 x i1> %mask) {
-; NOVL-LABEL: fptrunc02:
-; NOVL:       ## BB#0:
-; NOVL-NEXT:    vpslld $31, %xmm1, %xmm1
-; NOVL-NEXT:    vpsrad $31, %xmm1, %xmm1
-; NOVL-NEXT:    vcvtpd2ps %ymm0, %xmm0
-; NOVL-NEXT:    vpand %xmm0, %xmm1, %xmm0
-; NOVL-NEXT:    retq
+; KNL-LABEL: fptrunc02:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpslld $31, %xmm1, %xmm1
+; KNL-NEXT:    vpsrad $31, %xmm1, %xmm1
+; KNL-NEXT:    vcvtpd2ps %ymm0, %xmm0
+; KNL-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    retq
 ;
 ; VL-LABEL: fptrunc02:
 ; VL:       ## BB#0:
 ; VL-NEXT:    vpslld $31, %xmm1, %xmm1
 ; VL-NEXT:    vptestmd %xmm1, %xmm1, %k1
 ; VL-NEXT:    vcvtpd2ps %ymm0, %xmm0 {%k1} {z}
+; VL-NEXT:    vzeroupper
 ; VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: fptrunc02:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vcvtpd2ps %ymm0, %xmm0
+; AVX512DQ-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: fptrunc02:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpslld $31, %xmm1, %xmm1
+; AVX512BW-NEXT:    vpsrad $31, %xmm1, %xmm1
+; AVX512BW-NEXT:    vcvtpd2ps %ymm0, %xmm0
+; AVX512BW-NEXT:    vpand %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %a = fptrunc <4 x double> %b to <4 x float>
   %c = select <4 x i1>%mask, <4 x float>%a, <4 x float> zeroinitializer
   ret <4 x float> %c
@@ -685,34 +895,70 @@ define <16 x double> @uitof64(<16 x i32> %a) nounwind {
   ret <16 x double> %b
 }
 define <8 x double> @uitof64_mask(<8 x double> %a, <8 x i32> %b, i8 %c) nounwind {
-; NODQ-LABEL: uitof64_mask:
-; NODQ:       ## BB#0:
-; NODQ-NEXT:    kmovw %edi, %k1
-; NODQ-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
-; NODQ-NEXT:    retq
+; KNL-LABEL: uitof64_mask:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
+; KNL-NEXT:    retq
 ;
-; DQ-LABEL: uitof64_mask:
-; DQ:       ## BB#0:
-; DQ-NEXT:    kmovb %edi, %k1
-; DQ-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
-; DQ-NEXT:    retq
+; VLBW-LABEL: uitof64_mask:
+; VLBW:       ## BB#0:
+; VLBW-NEXT:    kmovd %edi, %k1
+; VLBW-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
+; VLBW-NEXT:    retq
+;
+; VLNOBW-LABEL: uitof64_mask:
+; VLNOBW:       ## BB#0:
+; VLNOBW-NEXT:    kmovw %edi, %k1
+; VLNOBW-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
+; VLNOBW-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitof64_mask:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw %edi, %k1
+; AVX512DQ-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: uitof64_mask:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vcvtudq2pd %ymm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    retq
   %1 = bitcast i8 %c to <8 x i1>
   %2 = uitofp <8 x i32> %b to <8 x double>
   %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> %a
   ret <8 x double> %3
 }
 define <8 x double> @uitof64_maskz(<8 x i32> %a, i8 %b) nounwind {
-; NODQ-LABEL: uitof64_maskz:
-; NODQ:       ## BB#0:
-; NODQ-NEXT:    kmovw %edi, %k1
-; NODQ-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
-; NODQ-NEXT:    retq
+; KNL-LABEL: uitof64_maskz:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    retq
 ;
-; DQ-LABEL: uitof64_maskz:
-; DQ:       ## BB#0:
-; DQ-NEXT:    kmovb %edi, %k1
-; DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
-; DQ-NEXT:    retq
+; VLBW-LABEL: uitof64_maskz:
+; VLBW:       ## BB#0:
+; VLBW-NEXT:    kmovd %edi, %k1
+; VLBW-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
+; VLBW-NEXT:    retq
+;
+; VLNOBW-LABEL: uitof64_maskz:
+; VLNOBW:       ## BB#0:
+; VLNOBW-NEXT:    kmovw %edi, %k1
+; VLNOBW-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
+; VLNOBW-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitof64_maskz:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw %edi, %k1
+; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: uitof64_maskz:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vcvtudq2pd %ymm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
   %1 = bitcast i8 %b to <8 x i1>
   %2 = uitofp <8 x i32> %a to <8 x double>
   %3 = select <8 x i1> %1, <8 x double> %2, <8 x double> zeroinitializer
@@ -761,17 +1007,33 @@ define <8 x float> @uitof32_256(<8 x i32> %a) nounwind {
 }
 
 define <4 x float> @uitof32_128(<4 x i32> %a) nounwind {
-; NOVL-LABEL: uitof32_128:
-; NOVL:       ## BB#0:
-; NOVL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; NOVL-NEXT:    vcvtudq2ps %zmm0, %zmm0
-; NOVL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
-; NOVL-NEXT:    retq
+; KNL-LABEL: uitof32_128:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; KNL-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT:    retq
 ;
 ; VL-LABEL: uitof32_128:
 ; VL:       ## BB#0:
 ; VL-NEXT:    vcvtudq2ps %xmm0, %xmm0
 ; VL-NEXT:    retq
+;
+; AVX512DQ-LABEL: uitof32_128:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512DQ-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+;
+; AVX512BW-LABEL: uitof32_128:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW-NEXT:    vcvtudq2ps %zmm0, %zmm0
+; AVX512BW-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
   %b = uitofp <4 x i32> %a to <4 x float>
   ret <4 x float> %b
 }
@@ -917,11 +1179,9 @@ define <16 x double> @sitofp_16i1_double(<16 x double> %a) {
 ; AVX512DQ-NEXT:    vxorpd %zmm2, %zmm2, %zmm2
 ; AVX512DQ-NEXT:    vcmpltpd %zmm1, %zmm2, %k0
 ; AVX512DQ-NEXT:    vcmpltpd %zmm0, %zmm2, %k1
-; AVX512DQ-NEXT:    vpmovm2q %k1, %zmm0
-; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm0
 ; AVX512DQ-NEXT:    vcvtdq2pd %ymm0, %zmm0
-; AVX512DQ-NEXT:    vpmovm2q %k0, %zmm1
-; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
 ; AVX512DQ-NEXT:    vcvtdq2pd %ymm1, %zmm1
 ; AVX512DQ-NEXT:    retq
   %cmpres = fcmp ogt <16 x double> %a, zeroinitializer
@@ -960,8 +1220,7 @@ define <8 x double> @sitofp_8i1_double(<8 x double> %a) {
 ; AVX512DQ:       ## BB#0:
 ; AVX512DQ-NEXT:    vxorpd %zmm1, %zmm1, %zmm1
 ; AVX512DQ-NEXT:    vcmpltpd %zmm0, %zmm1, %k0
-; AVX512DQ-NEXT:    vpmovm2q %k0, %zmm0
-; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
 ; AVX512DQ-NEXT:    vcvtdq2pd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    retq
   %cmpres = fcmp ogt <8 x double> %a, zeroinitializer
@@ -1002,8 +1261,7 @@ define <8 x float> @sitofp_8i1_float(<8 x float> %a) {
 ; AVX512DQ-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vxorps %ymm1, %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vcmpltps %zmm0, %zmm1, %k0
-; AVX512DQ-NEXT:    vpmovm2q %k0, %zmm0
-; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
 ; AVX512DQ-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX512DQ-NEXT:    retq
   %cmpres = fcmp ogt <8 x float> %a, zeroinitializer
@@ -1075,7 +1333,6 @@ define <2 x float> @sitofp_2i1_float(<2 x float> %a) {
 ; NOVL:       ## BB#0:
 ; NOVL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; NOVL-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0
-; NOVL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[1]
 ; NOVL-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; NOVL-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/avx512-ext.ll b/test/CodeGen/X86/avx512-ext.ll
index f1f984110608d9eb94e4a3075e174536f6249f51..796ee83b6fa79ab6d76173b116e1d6d7cedc3330 100644
--- a/test/CodeGen/X86/avx512-ext.ll
+++ b/test/CodeGen/X86/avx512-ext.ll
@@ -491,8 +491,7 @@ define <2 x i64> @zext_2x8mem_to_2x64(<2 x i8> *%i , <2 x i1> %mask) nounwind re
 ; KNL-LABEL: zext_2x8mem_to_2x64:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vpsllq $63, %xmm0, %xmm0
-; KNL-NEXT:    vpsrad $31, %xmm0, %xmm0
-; KNL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; KNL-NEXT:    vpsraq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
 ; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    retq
@@ -512,8 +511,7 @@ define <2 x i64> @sext_2x8mem_to_2x64mask(<2 x i8> *%i , <2 x i1> %mask) nounwin
 ; KNL-LABEL: sext_2x8mem_to_2x64mask:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vpsllq $63, %xmm0, %xmm0
-; KNL-NEXT:    vpsrad $31, %xmm0, %xmm0
-; KNL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; KNL-NEXT:    vpsraq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vpmovsxbq (%rdi), %xmm1
 ; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    retq
@@ -872,8 +870,7 @@ define <2 x i64> @zext_2x16mem_to_2x64(<2 x i16> *%i , <2 x i1> %mask) nounwind
 ; KNL-LABEL: zext_2x16mem_to_2x64:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vpsllq $63, %xmm0, %xmm0
-; KNL-NEXT:    vpsrad $31, %xmm0, %xmm0
-; KNL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; KNL-NEXT:    vpsraq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vpmovzxwq {{.*#+}} xmm1 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
 ; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    retq
@@ -894,8 +891,7 @@ define <2 x i64> @sext_2x16mem_to_2x64mask(<2 x i16> *%i , <2 x i1> %mask) nounw
 ; KNL-LABEL: sext_2x16mem_to_2x64mask:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vpsllq $63, %xmm0, %xmm0
-; KNL-NEXT:    vpsrad $31, %xmm0, %xmm0
-; KNL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; KNL-NEXT:    vpsraq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vpmovsxwq (%rdi), %xmm1
 ; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    retq
@@ -1061,8 +1057,7 @@ define <2 x i64> @zext_2x32mem_to_2x64(<2 x i32> *%i , <2 x i1> %mask) nounwind
 ; KNL-LABEL: zext_2x32mem_to_2x64:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vpsllq $63, %xmm0, %xmm0
-; KNL-NEXT:    vpsrad $31, %xmm0, %xmm0
-; KNL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; KNL-NEXT:    vpsraq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
 ; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    retq
@@ -1083,8 +1078,7 @@ define <2 x i64> @sext_2x32mem_to_2x64mask(<2 x i32> *%i , <2 x i1> %mask) nounw
 ; KNL-LABEL: sext_2x32mem_to_2x64mask:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vpsllq $63, %xmm0, %xmm0
-; KNL-NEXT:    vpsrad $31, %xmm0, %xmm0
-; KNL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; KNL-NEXT:    vpsraq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vpmovsxdq (%rdi), %xmm1
 ; KNL-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; KNL-NEXT:    retq
@@ -1294,11 +1288,17 @@ define <8 x double> @fpext_test(<8 x float> %a) nounwind readnone {
 }
 
 define   <16 x i32> @zext_16i1_to_16xi32(i16 %b) {
-; ALL-LABEL: zext_16i1_to_16xi32:
-; ALL:       ## BB#0:
-; ALL-NEXT:    kmovw %edi, %k1
-; ALL-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
-; ALL-NEXT:    retq
+; KNL-LABEL: zext_16i1_to_16xi32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: zext_16i1_to_16xi32:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vpbroadcastd {{.*}}(%rip), %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
   %a = bitcast i16 %b to <16 x i1>
   %c = zext <16 x i1> %a to <16 x i32>
   ret <16 x i32> %c
@@ -1313,7 +1313,7 @@ define   <8 x i64> @zext_8i1_to_8xi64(i8 %b) {
 ;
 ; SKX-LABEL: zext_8i1_to_8xi64:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vpbroadcastq {{.*}}(%rip), %zmm0 {%k1} {z}
 ; SKX-NEXT:    retq
   %a = bitcast i8 %b to <8 x i1>
@@ -1328,13 +1328,15 @@ define i16 @trunc_16i8_to_16i1(<16 x i8> %a) {
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: trunc_16i8_to_16i1:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpsllw $7, %xmm0, %xmm0
 ; SKX-NEXT:    vpmovb2m %xmm0, %k0
-; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; SKX-NEXT:    retq
   %mask_b = trunc <16 x i8>%a to <16 x i1>
   %mask = bitcast <16 x i1> %mask_b to i16
@@ -1342,12 +1344,22 @@ define i16 @trunc_16i8_to_16i1(<16 x i8> %a) {
 }
 
 define i16 @trunc_16i32_to_16i1(<16 x i32> %a) {
-; ALL-LABEL: trunc_16i32_to_16i1:
-; ALL:       ## BB#0:
-; ALL-NEXT:    vpslld $31, %zmm0, %zmm0
-; ALL-NEXT:    vptestmd %zmm0, %zmm0, %k0
-; ALL-NEXT:    kmovw %k0, %eax
-; ALL-NEXT:    retq
+; KNL-LABEL: trunc_16i32_to_16i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
+; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_16i32_to_16i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpslld $31, %zmm0, %zmm0
+; SKX-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
   %mask_b = trunc <16 x i32>%a to <16 x i1>
   %mask = bitcast <16 x i1> %mask_b to i16
   ret i16 %mask
@@ -1384,13 +1396,15 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
 ; KNL-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: trunc_8i16_to_8i1:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; SKX-NEXT:    vpmovw2m %xmm0, %k0
-; SKX-NEXT:    kmovb %k0, %eax
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; SKX-NEXT:    retq
   %mask_b = trunc <8 x i16>%a to <8 x i1>
   %mask = bitcast <8 x i1> %mask_b to i8
@@ -1418,17 +1432,31 @@ define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
 
 
 define i16 @trunc_i32_to_i1(i32 %a) {
-; ALL-LABEL: trunc_i32_to_i1:
-; ALL:       ## BB#0:
-; ALL-NEXT:    andl $1, %edi
-; ALL-NEXT:    kmovw %edi, %k0
-; ALL-NEXT:    movw $-4, %ax
-; ALL-NEXT:    kmovw %eax, %k1
-; ALL-NEXT:    kshiftrw $1, %k1, %k1
-; ALL-NEXT:    kshiftlw $1, %k1, %k1
-; ALL-NEXT:    korw %k0, %k1, %k0
-; ALL-NEXT:    kmovw %k0, %eax
-; ALL-NEXT:    retq
+; KNL-LABEL: trunc_i32_to_i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    andl $1, %edi
+; KNL-NEXT:    kmovw %edi, %k0
+; KNL-NEXT:    movw $-4, %ax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    kshiftrw $1, %k1, %k1
+; KNL-NEXT:    kshiftlw $1, %k1, %k1
+; KNL-NEXT:    korw %k0, %k1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: trunc_i32_to_i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    andl $1, %edi
+; SKX-NEXT:    kmovd %edi, %k0
+; SKX-NEXT:    movw $-4, %ax
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    kshiftrw $1, %k1, %k1
+; SKX-NEXT:    kshiftlw $1, %k1, %k1
+; SKX-NEXT:    korw %k0, %k1, %k0
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT:    retq
   %a_i = trunc i32 %a to i1
   %maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %a_i, i32 0
   %res = bitcast <16 x i1> %maskv to i16
@@ -1447,6 +1475,7 @@ define <8 x i16> @sext_8i1_8i16(<8 x i32> %a1, <8 x i32> %a2) nounwind {
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpcmpgtd %ymm0, %ymm1, %k0
 ; SKX-NEXT:    vpmovm2w %k0, %xmm0
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x = icmp slt <8 x i32> %a1, %a2
   %y = sext <8 x i1> %x to <8 x i16>
@@ -1488,11 +1517,18 @@ define <8 x i64> @sext_8i1_8i64(<8 x i32> %a1, <8 x i32> %a2) nounwind {
 }
 
 define void @extload_v8i64(<8 x i8>* %a, <8 x i64>* %res) {
-; ALL-LABEL: extload_v8i64:
-; ALL:       ## BB#0:
-; ALL-NEXT:    vpmovsxbq (%rdi), %zmm0
-; ALL-NEXT:    vmovdqa64 %zmm0, (%rsi)
-; ALL-NEXT:    retq
+; KNL-LABEL: extload_v8i64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpmovsxbq (%rdi), %zmm0
+; KNL-NEXT:    vmovdqa64 %zmm0, (%rsi)
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: extload_v8i64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpmovsxbq (%rdi), %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm0, (%rsi)
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
   %sign_load = load <8 x i8>, <8 x i8>* %a
   %c = sext <8 x i8> %sign_load to <8 x i64>
   store <8 x i64> %c, <8 x i64>* %res
diff --git a/test/CodeGen/X86/avx512-extract-subvector.ll b/test/CodeGen/X86/avx512-extract-subvector.ll
index 391bf6ba4554eea4221a99f84c7d2f1c4fabe7b3..2d0a81046b4e3217871eb6cbeface766274db054 100644
--- a/test/CodeGen/X86/avx512-extract-subvector.ll
+++ b/test/CodeGen/X86/avx512-extract-subvector.ll
@@ -6,6 +6,7 @@ define <8 x i16> @extract_subvector128_v32i16(<32 x i16> %x) nounwind {
 ; SKX-LABEL: extract_subvector128_v32i16:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %r1 = shufflevector <32 x i16> %x, <32 x i16> undef, <8 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   ret <8 x i16> %r1
@@ -15,6 +16,7 @@ define <8 x i16> @extract_subvector128_v32i16_first_element(<32 x i16> %x) nounw
 ; SKX-LABEL: extract_subvector128_v32i16_first_element:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %r1 = shufflevector <32 x i16> %x, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   ret <8 x i16> %r1
@@ -24,6 +26,7 @@ define <16 x i8> @extract_subvector128_v64i8(<64 x i8> %x) nounwind {
 ; SKX-LABEL: extract_subvector128_v64i8:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %r1 = shufflevector <64 x i8> %x, <64 x i8> undef, <16 x i32> <i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38,i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47>
   ret <16 x i8> %r1
@@ -33,6 +36,7 @@ define <16 x i8> @extract_subvector128_v64i8_first_element(<64 x i8> %x) nounwin
 ; SKX-LABEL: extract_subvector128_v64i8_first_element:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %r1 = shufflevector <64 x i8> %x, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i8> %r1
@@ -61,6 +65,7 @@ define void @extract_subvector256_v8f64_store(double* nocapture %addr, <4 x doub
 ; SKX-LABEL: extract_subvector256_v8f64_store:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vextractf128 $1, %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 2, i32 3>
@@ -73,6 +78,7 @@ define void @extract_subvector256_v8f32_store(float* nocapture %addr, <8 x float
 ; SKX-LABEL: extract_subvector256_v8f32_store:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vextractf128 $1, %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -85,6 +91,7 @@ define void @extract_subvector256_v4i64_store(i64* nocapture %addr, <4 x i64> %a
 ; SKX-LABEL: extract_subvector256_v4i64_store:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vextracti128 $1, %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
@@ -97,6 +104,7 @@ define void @extract_subvector256_v8i32_store(i32* nocapture %addr, <8 x i32> %a
 ; SKX-LABEL: extract_subvector256_v8i32_store:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vextracti128 $1, %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -109,6 +117,7 @@ define void @extract_subvector256_v16i16_store(i16* nocapture %addr, <16 x i16>
 ; SKX-LABEL: extract_subvector256_v16i16_store:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vextracti128 $1, %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -121,6 +130,7 @@ define void @extract_subvector256_v32i8_store(i8* nocapture %addr, <32 x i8> %a)
 ; SKX-LABEL: extract_subvector256_v32i8_store:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vextracti128 $1, %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -133,6 +143,7 @@ define void @extract_subvector256_v4f64_store_lo(double* nocapture %addr, <4 x d
 ; SKX-LABEL: extract_subvector256_v4f64_store_lo:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -145,6 +156,7 @@ define void @extract_subvector256_v4f64_store_lo_align_16(double* nocapture %add
 ; SKX-LABEL: extract_subvector256_v4f64_store_lo_align_16:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovaps %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <4 x double> %a, <4 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -157,6 +169,7 @@ define void @extract_subvector256_v4f32_store_lo(float* nocapture %addr, <8 x fl
 ; SKX-LABEL: extract_subvector256_v4f32_store_lo:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -169,6 +182,7 @@ define void @extract_subvector256_v4f32_store_lo_align_16(float* nocapture %addr
 ; SKX-LABEL: extract_subvector256_v4f32_store_lo_align_16:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovaps %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <8 x float> %a, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -181,6 +195,7 @@ define void @extract_subvector256_v2i64_store_lo(i64* nocapture %addr, <4 x i64>
 ; SKX-LABEL: extract_subvector256_v2i64_store_lo:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
@@ -193,6 +208,7 @@ define void @extract_subvector256_v2i64_store_lo_align_16(i64* nocapture %addr,
 ; SKX-LABEL: extract_subvector256_v2i64_store_lo_align_16:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovaps %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <4 x i64> %a, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
@@ -205,6 +221,7 @@ define void @extract_subvector256_v4i32_store_lo(i32* nocapture %addr, <8 x i32>
 ; SKX-LABEL: extract_subvector256_v4i32_store_lo:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -217,6 +234,7 @@ define void @extract_subvector256_v4i32_store_lo_align_16(i32* nocapture %addr,
 ; SKX-LABEL: extract_subvector256_v4i32_store_lo_align_16:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovaps %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <8 x i32> %a, <8 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -229,6 +247,7 @@ define void @extract_subvector256_v8i16_store_lo(i16* nocapture %addr, <16 x i16
 ; SKX-LABEL: extract_subvector256_v8i16_store_lo:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -241,6 +260,7 @@ define void @extract_subvector256_v8i16_store_lo_align_16(i16* nocapture %addr,
 ; SKX-LABEL: extract_subvector256_v8i16_store_lo_align_16:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovaps %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <16 x i16> %a, <16 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -253,6 +273,7 @@ define void @extract_subvector256_v16i8_store_lo(i8* nocapture %addr, <32 x i8>
 ; SKX-LABEL: extract_subvector256_v16i8_store_lo:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -265,6 +286,7 @@ define void @extract_subvector256_v16i8_store_lo_align_16(i8* nocapture %addr, <
 ; SKX-LABEL: extract_subvector256_v16i8_store_lo_align_16:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovaps %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <32 x i8> %a, <32 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -277,6 +299,7 @@ define void @extract_subvector512_v2f64_store_lo(double* nocapture %addr, <8 x d
 ; SKX-LABEL: extract_subvector512_v2f64_store_lo:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -289,6 +312,7 @@ define void @extract_subvector512_v2f64_store_lo_align_16(double* nocapture %add
 ; SKX-LABEL: extract_subvector512_v2f64_store_lo_align_16:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovaps %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -301,6 +325,7 @@ define void @extract_subvector512_v4f32_store_lo(float* nocapture %addr, <16 x f
 ; SKX-LABEL: extract_subvector512_v4f32_store_lo:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -313,6 +338,7 @@ define void @extract_subvector512_v4f32_store_lo_align_16(float* nocapture %addr
 ; SKX-LABEL: extract_subvector512_v4f32_store_lo_align_16:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovaps %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -325,6 +351,7 @@ define void @extract_subvector512_v2i64_store_lo(i64* nocapture %addr, <8 x i64>
 ; SKX-LABEL: extract_subvector512_v2i64_store_lo:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
@@ -337,6 +364,7 @@ define void @extract_subvector512_v2i64_store_lo_align_16(i64* nocapture %addr,
 ; SKX-LABEL: extract_subvector512_v2i64_store_lo_align_16:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovaps %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
@@ -349,6 +377,7 @@ define void @extract_subvector512_v4i32_store_lo(i32* nocapture %addr, <16 x i32
 ; SKX-LABEL: extract_subvector512_v4i32_store_lo:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -361,6 +390,7 @@ define void @extract_subvector512_v4i32_store_lo_align_16(i32* nocapture %addr,
 ; SKX-LABEL: extract_subvector512_v4i32_store_lo_align_16:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovaps %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -373,6 +403,7 @@ define void @extract_subvector512_v8i16_store_lo(i16* nocapture %addr, <32 x i16
 ; SKX-LABEL: extract_subvector512_v8i16_store_lo:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <32 x i16> %a, <32 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -385,6 +416,7 @@ define void @extract_subvector512_v16i8_store_lo(i8* nocapture %addr, <64 x i8>
 ; SKX-LABEL: extract_subvector512_v16i8_store_lo:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <64 x i8> %a, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -397,6 +429,7 @@ define void @extract_subvector512_v16i8_store_lo_align_16(i8* nocapture %addr, <
 ; SKX-LABEL: extract_subvector512_v16i8_store_lo_align_16:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovaps %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <64 x i8> %a, <64 x i8> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -409,6 +442,7 @@ define void @extract_subvector512_v4f64_store_lo(double* nocapture %addr, <8 x d
 ; SKX-LABEL: extract_subvector512_v4f64_store_lo:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -421,6 +455,7 @@ define void @extract_subvector512_v4f64_store_lo_align_16(double* nocapture %add
 ; SKX-LABEL: extract_subvector512_v4f64_store_lo_align_16:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -433,6 +468,7 @@ define void @extract_subvector512_v4f64_store_lo_align_32(double* nocapture %add
 ; SKX-LABEL: extract_subvector512_v4f64_store_lo_align_32:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovaps %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <8 x double> %a, <8 x double> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -445,6 +481,7 @@ define void @extract_subvector512_v8f32_store_lo(float* nocapture %addr, <16 x f
 ; SKX-LABEL: extract_subvector512_v8f32_store_lo:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -457,6 +494,7 @@ define void @extract_subvector512_v8f32_store_lo_align_16(float* nocapture %addr
 ; SKX-LABEL: extract_subvector512_v8f32_store_lo_align_16:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovaps %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -469,6 +507,7 @@ define void @extract_subvector512_v8f32_store_lo_align_32(float* nocapture %addr
 ; SKX-LABEL: extract_subvector512_v8f32_store_lo_align_32:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovaps %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <16 x float> %a, <16 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -481,6 +520,7 @@ define void @extract_subvector512_v4i64_store_lo(i64* nocapture %addr, <8 x i64>
 ; SKX-LABEL: extract_subvector512_v4i64_store_lo:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -493,6 +533,7 @@ define void @extract_subvector512_v4i64_store_lo_align_16(i64* nocapture %addr,
 ; SKX-LABEL: extract_subvector512_v4i64_store_lo_align_16:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -505,6 +546,7 @@ define void @extract_subvector512_v4i64_store_lo_align_32(i64* nocapture %addr,
 ; SKX-LABEL: extract_subvector512_v4i64_store_lo_align_32:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovaps %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <8 x i64> %a, <8 x i64> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -517,6 +559,7 @@ define void @extract_subvector512_v8i32_store_lo(i32* nocapture %addr, <16 x i32
 ; SKX-LABEL: extract_subvector512_v8i32_store_lo:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -529,6 +572,7 @@ define void @extract_subvector512_v8i32_store_lo_align_16(i32* nocapture %addr,
 ; SKX-LABEL: extract_subvector512_v8i32_store_lo_align_16:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -541,6 +585,7 @@ define void @extract_subvector512_v8i32_store_lo_align_32(i32* nocapture %addr,
 ; SKX-LABEL: extract_subvector512_v8i32_store_lo_align_32:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovaps %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <16 x i32> %a, <16 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -553,6 +598,7 @@ define void @extract_subvector512_v16i16_store_lo(i16* nocapture %addr, <32 x i1
 ; SKX-LABEL: extract_subvector512_v16i16_store_lo:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -565,6 +611,7 @@ define void @extract_subvector512_v16i16_store_lo_align_16(i16* nocapture %addr,
 ; SKX-LABEL: extract_subvector512_v16i16_store_lo_align_16:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -577,6 +624,7 @@ define void @extract_subvector512_v16i16_store_lo_align_32(i16* nocapture %addr,
 ; SKX-LABEL: extract_subvector512_v16i16_store_lo_align_32:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovaps %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <32 x i16> %a, <32 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -589,6 +637,7 @@ define void @extract_subvector512_v32i8_store_lo(i8* nocapture %addr, <64 x i8>
 ; SKX-LABEL: extract_subvector512_v32i8_store_lo:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <64 x i8> %a, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -601,6 +650,7 @@ define void @extract_subvector512_v32i8_store_lo_align_16(i8* nocapture %addr, <
 ; SKX-LABEL: extract_subvector512_v32i8_store_lo_align_16:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovups %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <64 x i8> %a, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -613,6 +663,7 @@ define void @extract_subvector512_v32i8_store_lo_align_32(i8* nocapture %addr, <
 ; SKX-LABEL: extract_subvector512_v32i8_store_lo_align_32:
 ; SKX:       ## BB#0: ## %entry
 ; SKX-NEXT:    vmovaps %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = shufflevector <64 x i8> %a, <64 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -624,7 +675,7 @@ entry:
 define <4 x double> @test_mm512_mask_extractf64x4_pd(<4 x double> %__W, i8 %__U, <8 x double> %__A) {
 ; SKX-LABEL: test_mm512_mask_extractf64x4_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vextractf64x4 $1, %zmm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -638,7 +689,7 @@ entry:
 define <4 x double> @test_mm512_maskz_extractf64x4_pd(i8 %__U, <8 x double> %__A) {
 ; SKX-LABEL: test_mm512_maskz_extractf64x4_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vextractf64x4 $1, %zmm0, %ymm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -652,8 +703,9 @@ entry:
 define <4 x float> @test_mm512_mask_extractf32x4_ps(<4 x float> %__W, i8 %__U, <8 x double> %__A) {
 ; SKX-LABEL: test_mm512_mask_extractf32x4_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vextractf32x4 $1, %zmm1, %xmm0 {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = bitcast <8 x double> %__A to <16 x float>
@@ -667,8 +719,9 @@ entry:
 define <4 x float> @test_mm512_maskz_extractf32x4_ps(i8 %__U, <8 x double> %__A) {
 ; SKX-LABEL: test_mm512_maskz_extractf32x4_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vextractf32x4 $1, %zmm0, %xmm0 {%k1} {z}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = bitcast <8 x double> %__A to <16 x float>
@@ -682,8 +735,9 @@ entry:
 define <2 x double> @test_mm256_mask_extractf64x2_pd(<2 x double> %__W, i8 %__U, <4 x double> %__A) {
 ; SKX-LABEL: test_mm256_mask_extractf64x2_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vextractf64x2 $1, %ymm1, %xmm0 {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %shuffle = shufflevector <4 x double> %__A, <4 x double> undef, <2 x i32> <i32 2, i32 3>
@@ -696,8 +750,9 @@ entry:
 define <2 x double> @test_mm256_maskz_extractf64x2_pd(i8 %__U, <4 x double> %__A) {
 ; SKX-LABEL: test_mm256_maskz_extractf64x2_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %shuffle = shufflevector <4 x double> %__A, <4 x double> undef, <2 x i32> <i32 2, i32 3>
@@ -710,8 +765,9 @@ entry:
 define <2 x i64> @test_mm256_mask_extracti64x2_epi64(<2 x i64> %__W, i8 %__U, <4 x i64> %__A) {
 ; SKX-LABEL: test_mm256_mask_extracti64x2_epi64:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vextracti64x2 $1, %ymm1, %xmm0 {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %shuffle = shufflevector <4 x i64> %__A, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
@@ -724,8 +780,9 @@ entry:
 define <2 x i64> @test_mm256_maskz_extracti64x2_epi64(i8 %__U, <4 x i64> %__A) {
 ; SKX-LABEL: test_mm256_maskz_extracti64x2_epi64:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vextracti64x2 $1, %ymm0, %xmm0 {%k1} {z}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %shuffle = shufflevector <4 x i64> %__A, <4 x i64> undef, <2 x i32> <i32 2, i32 3>
@@ -738,8 +795,9 @@ entry:
 define <4 x float> @test_mm256_mask_extractf32x4_ps(<4 x float> %__W, i8 %__U, <8 x float> %__A) {
 ; SKX-LABEL: test_mm256_mask_extractf32x4_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vextractf32x4 $1, %ymm1, %xmm0 {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %shuffle = shufflevector <8 x float> %__A, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -752,8 +810,9 @@ entry:
 define <4 x float> @test_mm256_maskz_extractf32x4_ps(i8 %__U, <8 x float> %__A) {
 ; SKX-LABEL: test_mm256_maskz_extractf32x4_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vextractf32x4 $1, %ymm0, %xmm0 {%k1} {z}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %shuffle = shufflevector <8 x float> %__A, <8 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -766,8 +825,9 @@ entry:
 define <2 x i64> @test_mm256_mask_extracti32x4_epi32(<2 x i64> %__W, i8 %__U, <4 x i64> %__A) {
 ; SKX-LABEL: test_mm256_mask_extracti32x4_epi32:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vextracti32x4 $1, %ymm1, %xmm0 {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__A to <8 x i32>
@@ -783,8 +843,9 @@ entry:
 define <2 x i64> @test_mm256_maskz_extracti32x4_epi32(i8 %__U, <4 x i64> %__A) {
 ; SKX-LABEL: test_mm256_maskz_extracti32x4_epi32:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vextracti32x4 $1, %ymm0, %xmm0 {%k1} {z}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i64> %__A to <8 x i32>
@@ -799,7 +860,7 @@ entry:
 define <8 x float> @test_mm512_mask_extractf32x8_ps(<8 x float> %__W, i8 %__U, <16 x float> %__A) {
 ; SKX-LABEL: test_mm512_mask_extractf32x8_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vextractf32x8 $1, %zmm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -812,7 +873,7 @@ entry:
 define <8 x float> @test_mm512_maskz_extractf32x8_ps(i8 %__U, <16 x float> %__A) {
 ; SKX-LABEL: test_mm512_maskz_extractf32x8_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -825,8 +886,9 @@ entry:
 define <2 x double> @test_mm512_mask_extractf64x2_pd(<2 x double> %__W, i8 %__U, <8 x double> %__A) {
 ; SKX-LABEL: test_mm512_mask_extractf64x2_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vextractf64x2 $3, %zmm1, %xmm0 {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %shuffle = shufflevector <8 x double> %__A, <8 x double> undef, <2 x i32> <i32 6, i32 7>
@@ -839,8 +901,9 @@ entry:
 define <2 x double> @test_mm512_maskz_extractf64x2_pd(i8 %__U, <8 x double> %__A) {
 ; SKX-LABEL: test_mm512_maskz_extractf64x2_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vextractf64x2 $3, %zmm0, %xmm0 {%k1} {z}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 entry:
   %shuffle = shufflevector <8 x double> %__A, <8 x double> undef, <2 x i32> <i32 6, i32 7>
diff --git a/test/CodeGen/X86/avx512-fsel.ll b/test/CodeGen/X86/avx512-fsel.ll
index c6f2da6ff60b82c02275402fe786f24dd446600c..a9b8914ee1fe619a06b6422472ccf9956b558f2a 100644
--- a/test/CodeGen/X86/avx512-fsel.ll
+++ b/test/CodeGen/X86/avx512-fsel.ll
@@ -10,25 +10,24 @@ define i32 @test(float %a, float %b)  {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:  Lcfi0:
 ; CHECK-NEXT:    .cfi_def_cfa_offset 16
-; CHECK-NEXT:    movb $1, %al
 ; CHECK-NEXT:    vucomiss %xmm1, %xmm0
-; CHECK-NEXT:    setnp %cl
-; CHECK-NEXT:    sete %dl
-; CHECK-NEXT:    setp %sil
-; CHECK-NEXT:    setne %dil
-; CHECK-NEXT:    andb %cl, %dl
-; CHECK-NEXT:    ## implicit-def: %R8D
-; CHECK-NEXT:    movb %dl, %r8b
-; CHECK-NEXT:    andl $1, %r8d
-; CHECK-NEXT:    kmovw %r8d, %k0
-; CHECK-NEXT:    orb %sil, %dil
-; CHECK-NEXT:    ## implicit-def: %R8D
-; CHECK-NEXT:    movb %dil, %r8b
-; CHECK-NEXT:    andl $1, %r8d
-; CHECK-NEXT:    kmovw %r8d, %k1
-; CHECK-NEXT:    kmovw %k1, %ecx
-; CHECK-NEXT:    testb $1, %cl
-; CHECK-NEXT:    movb %al, {{[0-9]+}}(%rsp) ## 1-byte Spill
+; CHECK-NEXT:    setp %al
+; CHECK-NEXT:    setne %cl
+; CHECK-NEXT:    setnp %dl
+; CHECK-NEXT:    sete %sil
+; CHECK-NEXT:    andb %dl, %sil
+; CHECK-NEXT:    ## implicit-def: %EDI
+; CHECK-NEXT:    movb %sil, %dil
+; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    kmovw %edi, %k0
+; CHECK-NEXT:    orb %al, %cl
+; CHECK-NEXT:    ## implicit-def: %EDI
+; CHECK-NEXT:    movb %cl, %dil
+; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    kmovw %k1, %edi
+; CHECK-NEXT:    movb %dil, %al
+; CHECK-NEXT:    testb $1, %al
 ; CHECK-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; CHECK-NEXT:    jne LBB0_1
 ; CHECK-NEXT:    jmp LBB0_2
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index 7c763d95ade69f00db42c7dc89bd51e69f0a3666..4890afec2164b9d6ffa1fa6962cff0331f6ce8ad 100644
--- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -14,11 +14,12 @@ declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, i8, <8 x i64>, <8 x double>,
 define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf)  {
 ; CHECK-LABEL: gather_mask_dps:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k2}
 ; CHECK-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
 ; CHECK-NEXT:    vscatterdps %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
   %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -29,11 +30,12 @@ define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8*
 define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf)  {
 ; CHECK-LABEL: gather_mask_dpd:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k2}
 ; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
 ; CHECK-NEXT:    vscatterdpd %zmm1, (%rdx,%ymm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
   %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -44,11 +46,12 @@ define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %b
 define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf)  {
 ; CHECK-LABEL: gather_mask_qps:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k2}
 ; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
 ; CHECK-NEXT:    vscatterqps %ymm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
   %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
@@ -59,11 +62,12 @@ define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %ba
 define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf)  {
 ; CHECK-LABEL: gather_mask_qpd:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k2}
 ; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
 ; CHECK-NEXT:    vscatterqpd %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
   %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
@@ -86,11 +90,12 @@ declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, i8, <8 x i64>, <8 x i64>, i3
 define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf)  {
 ; CHECK-LABEL: gather_mask_dd:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vpgatherdd (%rsi,%zmm0,4), %zmm1 {%k2}
 ; CHECK-NEXT:    vpaddd {{.*}}(%rip), %zmm0, %zmm0
 ; CHECK-NEXT:    vpscatterdd %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
   %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -101,11 +106,12 @@ define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %ba
 define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf)  {
 ; CHECK-LABEL: gather_mask_qd:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vpgatherqd (%rsi,%zmm0,4), %ymm1 {%k2}
 ; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
 ; CHECK-NEXT:    vpscatterqd %ymm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
   %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
@@ -116,11 +122,12 @@ define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base,
 define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf)  {
 ; CHECK-LABEL: gather_mask_qq:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vpgatherqq (%rsi,%zmm0,4), %zmm1 {%k2}
 ; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
 ; CHECK-NEXT:    vpscatterqq %zmm1, (%rdx,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
   %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
@@ -131,11 +138,12 @@ define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base,
 define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf)  {
 ; CHECK-LABEL: gather_mask_dq:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vpgatherdq (%rsi,%ymm0,4), %zmm1 {%k2}
 ; CHECK-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
 ; CHECK-NEXT:    vpscatterdq %zmm1, (%rdx,%ymm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
   %ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -146,9 +154,10 @@ define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base,
 define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf)  {
 ; CHECK-LABEL: gather_mask_dpd_execdomain:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vgatherdpd (%rsi,%ymm0,4), %zmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
   store <8 x double> %x, <8 x double>* %stbuf
@@ -158,9 +167,10 @@ define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %m
 define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf)  {
 ; CHECK-LABEL: gather_mask_qpd_execdomain:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vgatherqpd (%rsi,%zmm0,4), %zmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %zmm1, (%rdx)
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
   store <8 x double> %x, <8 x double>* %stbuf
@@ -170,7 +180,7 @@ define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %m
 define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base)  {
 ; CHECK-LABEL: gather_mask_dps_execdomain:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vgatherdps (%rsi,%zmm0,4), %zmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm0
 ; CHECK-NEXT:    retq
@@ -181,7 +191,7 @@ define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %s
 define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base)  {
 ; CHECK-LABEL: gather_mask_qps_execdomain:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vgatherqps (%rsi,%zmm0,4), %ymm1 {%k1}
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -192,9 +202,10 @@ define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src,
 define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf)  {
 ; CHECK-LABEL: scatter_mask_dpd_execdomain:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vmovapd (%rdi), %zmm1
 ; CHECK-NEXT:    vscatterdpd %zmm1, (%rcx,%ymm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %x = load <8 x double>, <8 x double>* %src, align 64
   call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
@@ -204,9 +215,10 @@ define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8
 define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf)  {
 ; CHECK-LABEL: scatter_mask_qpd_execdomain:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vmovapd (%rdi), %zmm1
 ; CHECK-NEXT:    vscatterqpd %zmm1, (%rcx,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %x = load <8 x double>, <8 x double>* %src, align 64
   call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
@@ -216,9 +228,10 @@ define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8
 define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf)  {
 ; CHECK-LABEL: scatter_mask_dps_execdomain:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vmovaps (%rdi), %zmm1
 ; CHECK-NEXT:    vscatterdps %zmm1, (%rcx,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %x = load <16 x float>, <16 x float>* %src, align 64
   call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
@@ -228,9 +241,10 @@ define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i1
 define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf)  {
 ; CHECK-LABEL: scatter_mask_qps_execdomain:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vmovaps (%rdi), %ymm1
 ; CHECK-NEXT:    vscatterqps %ymm1, (%rcx,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %x = load <8 x float>, <8 x float>* %src, align 32
   call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
@@ -240,11 +254,13 @@ define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %
 define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf)  {
 ; CHECK-LABEL: gather_qps:
 ; CHECK:       ## BB#0:
+; CHECK-NEXT:    vxorps %ymm1, %ymm1, %ymm1
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k2
 ; CHECK-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm1 {%k2}
 ; CHECK-NEXT:    vpaddq {{.*}}(%rip), %zmm0, %zmm0
 ; CHECK-NEXT:    vscatterqps %ymm1, (%rsi,%zmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4)
   %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
@@ -262,16 +278,17 @@ define void @prefetch(<8 x i64> %ind, i8* %base) {
 ; CHECK-NEXT:    kxorw %k0, %k0, %k1
 ; CHECK-NEXT:    vgatherpf1qps (%rdi,%zmm0,4) {%k1}
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovb %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1
 ; CHECK-NEXT:    vscatterpf0qps (%rdi,%zmm0,2) {%k1}
 ; CHECK-NEXT:    movb $120, %al
-; CHECK-NEXT:    kmovb %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1
 ; CHECK-NEXT:    vscatterpf1qps (%rdi,%zmm0,2) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
-  call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 0)
-  call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 1)
-  call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, i8* %base, i32 2, i32 0)
-  call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, i8* %base, i32 2, i32 1)
+  call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 3)
+  call void @llvm.x86.avx512.gatherpf.qps.512(i8 0, <8 x i64> %ind, i8* %base, i32 4, i32 2)
+  call void @llvm.x86.avx512.scatterpf.qps.512(i8 1, <8 x i64> %ind, i8* %base, i32 2, i32 3)
+  call void @llvm.x86.avx512.scatterpf.qps.512(i8 120, <8 x i64> %ind, i8* %base, i32 2, i32 2)
   ret void
 }
 
@@ -280,12 +297,12 @@ declare <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double>, i8*, <2 x i64
 define <2 x double>@test_int_x86_avx512_gather3div2_df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_gather3div2_df:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
-; CHECK-NEXT:    vmovapd %xmm0, %xmm2
-; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,4), %xmm2 {%k1}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,4), %xmm0 {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
-; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,2), %xmm0 {%k1}
-; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vgatherqpd (%rdi,%xmm1,2), %xmm2 {%k1}
+; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
   %res1 = call <2 x double> @llvm.x86.avx512.gather3div2.df(<2 x double> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
@@ -298,9 +315,9 @@ declare <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64>, i8*, <2 x i64>, i8,
 define <2 x i64>@test_int_x86_avx512_gather3div2_di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_gather3div2_di:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpgatherqq (%rdi,%xmm1,8), %xmm0 {%k1}
-; CHECK-NEXT:    vpaddq  %xmm0, %xmm0, %xmm0
+; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
   %res1 = call <2 x i64> @llvm.x86.avx512.gather3div2.di(<2 x i64> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 8)
@@ -313,12 +330,12 @@ declare <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double>, i8*, <4 x i64
 define <4 x double>@test_int_x86_avx512_gather3div4_df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_gather3div4_df:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
-; CHECK-NEXT:    vmovapd %ymm0, %ymm2
-; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,4), %ymm2 {%k1}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,4), %ymm0 {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
-; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,2), %ymm0 {%k1}
-; CHECK-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT:    vxorpd %ymm2, %ymm2, %ymm2
+; CHECK-NEXT:    vgatherqpd (%rdi,%ymm1,2), %ymm2 {%k1}
+; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %res = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
   %res1 = call <4 x double> @llvm.x86.avx512.gather3div4.df(<4 x double> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
@@ -331,12 +348,12 @@ declare <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64>, i8*, <4 x i64>, i8,
 define <4 x i64>@test_int_x86_avx512_gather3div4_di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_gather3div4_di:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
-; CHECK-NEXT:    vmovdqa %ymm0, %ymm2
-; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
-; CHECK-NEXT:    kxnorw %k0, %k0, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm0 {%k1}
-; CHECK-NEXT:    vpaddq  %ymm0, %ymm2, %ymm0 
+; CHECK-NEXT:    kxnorw %k0, %k0, %k1
+; CHECK-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; CHECK-NEXT:    vpgatherqq (%rdi,%ymm1,8), %ymm2 {%k1}
+; CHECK-NEXT:    vpaddq %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %res = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 8)
   %res1 = call <4 x i64> @llvm.x86.avx512.gather3div4.di(<4 x i64> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 8)
@@ -349,12 +366,12 @@ declare <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float>, i8*, <2 x i64>,
 define <4 x float>@test_int_x86_avx512_gather3div4_sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_gather3div4_sf:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
-; CHECK-NEXT:    vmovaps %xmm0, %xmm2
-; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,4), %xmm2 {%k1}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,4), %xmm0 {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
-; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,2), %xmm0 {%k1}
-; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vgatherqps (%rdi,%xmm1,2), %xmm2 {%k1}
+; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 %x3, i32 4)
   %res1 = call <4 x float> @llvm.x86.avx512.gather3div4.sf(<4 x float> %x0, i8* %x1, <2 x i64> %x2, i8 -1, i32 2)
@@ -367,9 +384,9 @@ declare <4 x i32> @llvm.x86.avx512.gather3div4.si(<4 x i32>, i8*, <2 x i64>, i8,
 define <4 x i32>@test_int_x86_avx512_gather3div4_si(<4 x i32> %x0, i8* %x1, <2 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_gather3div4_si:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k2
-; CHECK-NEXT:    vmovdqa %xmm0, %xmm2
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm2 {%k2}
 ; CHECK-NEXT:    vpgatherqd (%rdi,%xmm1,4), %xmm0 {%k1}
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
@@ -385,12 +402,13 @@ declare <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float>, i8*, <4 x i64>,
 define <4 x float>@test_int_x86_avx512_gather3div8_sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_gather3div8_sf:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
-; CHECK-NEXT:    vmovaps %xmm0, %xmm2
-; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,4), %xmm2 {%k1}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,4), %xmm0 {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
-; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,2), %xmm0 {%k1}
-; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vgatherqps (%rdi,%ymm1,2), %xmm2 {%k1}
+; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
   %res1 = call <4 x float> @llvm.x86.avx512.gather3div8.sf(<4 x float> %x0, i8* %x1, <4 x i64> %x2, i8 -1, i32 2)
@@ -403,12 +421,13 @@ declare <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32>, i8*, <4 x i64>, i8,
 define <4 x i32>@test_int_x86_avx512_gather3div8_si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_gather3div8_si:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm2
 ; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,4), %xmm2 {%k2}
 ; CHECK-NEXT:    vpgatherqd (%rdi,%ymm1,2), %xmm0 {%k1}
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 4)
   %res1 = call <4 x i32> @llvm.x86.avx512.gather3div8.si(<4 x i32> %x0, i8* %x1, <4 x i64> %x2, i8 %x3, i32 2)
@@ -421,12 +440,12 @@ declare <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double>, i8*, <4 x i32
 define <2 x double>@test_int_x86_avx512_gather3siv2_df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_gather3siv2_df:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
-; CHECK-NEXT:    vmovapd %xmm0, %xmm2
-; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %xmm2 {%k1}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %xmm0 {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
-; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %xmm0 {%k1}
-; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %xmm2 {%k1}
+; CHECK-NEXT:    vaddpd %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
   %res1 = call <2 x double> @llvm.x86.avx512.gather3siv2.df(<2 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
@@ -439,7 +458,7 @@ declare <2 x i64> @llvm.x86.avx512.gather3siv2.di(<2 x i64>, i8*, <4 x i32>, i8,
 define <2 x i64>@test_int_x86_avx512_gather3siv2_di(<2 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_gather3siv2_di:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %xmm0 {%k1}
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
@@ -454,12 +473,12 @@ declare <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double>, i8*, <4 x i32
 define <4 x double>@test_int_x86_avx512_gather3siv4_df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_gather3siv4_df:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
-; CHECK-NEXT:    vmovapd %ymm0, %ymm2
-; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %ymm2 {%k1}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,4), %ymm0 {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
-; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %ymm0 {%k1}
-; CHECK-NEXT:    vaddpd %ymm0, %ymm2, %ymm0
+; CHECK-NEXT:    vxorpd %ymm2, %ymm2, %ymm2
+; CHECK-NEXT:    vgatherdpd (%rdi,%xmm1,2), %ymm2 {%k1}
+; CHECK-NEXT:    vaddpd %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %res = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
   %res1 = call <4 x double> @llvm.x86.avx512.gather3siv4.df(<4 x double> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
@@ -472,9 +491,9 @@ declare <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64>, i8*, <4 x i32>, i8,
 define <4 x i64>@test_int_x86_avx512_gather3siv4_di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_gather3siv4_di:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpgatherdq (%rdi,%xmm1,8), %ymm0 {%k1}
-; CHECK-NEXT:    vpaddq  %ymm0, %ymm0, %ymm0 
+; CHECK-NEXT:    vpaddq %ymm0, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %res = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
   %res1 = call <4 x i64> @llvm.x86.avx512.gather3siv4.di(<4 x i64> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 8)
@@ -487,12 +506,12 @@ declare <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float>, i8*, <4 x i32>,
 define <4 x float>@test_int_x86_avx512_gather3siv4_sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_gather3siv4_sf:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
-; CHECK-NEXT:    vmovaps %xmm0, %xmm2
-; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm2 {%k1}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
-; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,2), %xmm0 {%k1}
-; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0
+; CHECK-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK-NEXT:    vgatherdps (%rdi,%xmm1,2), %xmm2 {%k1}
+; CHECK-NEXT:    vaddps %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 %x3, i32 4)
   %res1 = call <4 x float> @llvm.x86.avx512.gather3siv4.sf(<4 x float> %x0, i8* %x1, <4 x i32> %x2, i8 -1, i32 2)
@@ -505,9 +524,9 @@ declare <4 x i32> @llvm.x86.avx512.gather3siv4.si(<4 x i32>, i8*, <4 x i32>, i8,
 define <4 x i32>@test_int_x86_avx512_gather3siv4_si(<4 x i32> %x0, i8* %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_gather3siv4_si:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k2
-; CHECK-NEXT:    vmovdqa %xmm0, %xmm2
+; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,4), %xmm2 {%k2}
 ; CHECK-NEXT:    vpgatherdd (%rdi,%xmm1,2), %xmm0 {%k1}
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0
@@ -523,12 +542,12 @@ declare <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float>, i8*, <8 x i32>,
 define <8 x float>@test_int_x86_avx512_gather3siv8_sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_gather3siv8_sf:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
-; CHECK-NEXT:    vmovaps %ymm0, %ymm2
-; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm2 {%k1}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,4), %ymm0 {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
-; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,2), %ymm0 {%k1}
-; CHECK-NEXT:    vaddps %ymm0, %ymm2, %ymm0
+; CHECK-NEXT:    vxorps %ymm2, %ymm2, %ymm2
+; CHECK-NEXT:    vgatherdps (%rdi,%ymm1,2), %ymm2 {%k1}
+; CHECK-NEXT:    vaddps %ymm2, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %res = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 %x3, i32 4)
   %res1 = call <8 x float> @llvm.x86.avx512.gather3siv8.sf(<8 x float> %x0, i8* %x1, <8 x i32> %x2, i8 -1, i32 2)
@@ -541,7 +560,7 @@ declare <8 x i32> @llvm.x86.avx512.gather3siv8.si(<8 x i32>, i8*, <8 x i32>, i8,
 define <8 x i32>@test_int_x86_avx512_gather3siv8_si(<8 x i32> %x0, i8* %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_gather3siv8_si:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm2
 ; CHECK-NEXT:    kmovq %k1, %k2
 ; CHECK-NEXT:    vpgatherdd (%rdi,%ymm1,4), %ymm2 {%k2}
@@ -559,7 +578,7 @@ declare void @llvm.x86.avx512.scatterdiv2.df(i8*, i8, <2 x i64>, <2 x double>, i
 define void@test_int_x86_avx512_scatterdiv2_df(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x double> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_df:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k2
 ; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,2) {%k2}
 ; CHECK-NEXT:    vscatterqpd %xmm1, (%rdi,%xmm0,4) {%k1}
@@ -574,7 +593,7 @@ declare void @llvm.x86.avx512.scatterdiv2.di(i8*, i8, <2 x i64>, <2 x i64>, i32)
 define void@test_int_x86_avx512_scatterdiv2_di(i8* %x0, i8 %x1, <2 x i64> %x2, <2 x i64> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv2_di:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,2) {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
 ; CHECK-NEXT:    vpscatterqq %xmm1, (%rdi,%xmm0,4) {%k1}
@@ -589,10 +608,11 @@ declare void @llvm.x86.avx512.scatterdiv4.df(i8*, i8, <4 x i64>, <4 x double>, i
 define void@test_int_x86_avx512_scatterdiv4_df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_df:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,2) {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
 ; CHECK-NEXT:    vscatterqpd %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x double> %x3, i32 2)
   call void @llvm.x86.avx512.scatterdiv4.df(i8* %x0, i8 -1, <4 x i64> %x2, <4 x double> %x3, i32 4)
@@ -604,10 +624,11 @@ declare void @llvm.x86.avx512.scatterdiv4.di(i8*, i8, <4 x i64>, <4 x i64>, i32)
 define void@test_int_x86_avx512_scatterdiv4_di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_di:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,2) {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
 ; CHECK-NEXT:    vpscatterqq %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i64> %x3, i32 2)
   call void @llvm.x86.avx512.scatterdiv4.di(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i64> %x3, i32 4)
@@ -619,7 +640,7 @@ declare void @llvm.x86.avx512.scatterdiv4.sf(i8*, i8, <2 x i64>, <4 x float>, i3
 define void@test_int_x86_avx512_scatterdiv4_sf(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x float> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_sf:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,2) {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
 ; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%xmm0,4) {%k1}
@@ -634,7 +655,7 @@ declare void @llvm.x86.avx512.scatterdiv4.si(i8*, i8, <2 x i64>, <4 x i32>, i32)
 define void@test_int_x86_avx512_scatterdiv4_si(i8* %x0, i8 %x1, <2 x i64> %x2, <4 x i32> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv4_si:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k2
 ; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,2) {%k2}
 ; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%xmm0,4) {%k1}
@@ -649,10 +670,11 @@ declare void @llvm.x86.avx512.scatterdiv8.sf(i8*, i8, <4 x i64>, <4 x float>, i3
 define void@test_int_x86_avx512_scatterdiv8_sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_sf:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,2) {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
 ; CHECK-NEXT:    vscatterqps %xmm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x float> %x3, i32 2)
   call void @llvm.x86.avx512.scatterdiv8.sf(i8* %x0, i8 -1, <4 x i64> %x2, <4 x float> %x3, i32 4)
@@ -664,10 +686,11 @@ declare void @llvm.x86.avx512.scatterdiv8.si(i8*, i8, <4 x i64>, <4 x i32>, i32)
 define void@test_int_x86_avx512_scatterdiv8_si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scatterdiv8_si:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,2) {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
 ; CHECK-NEXT:    vpscatterqd %xmm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 %x1, <4 x i64> %x2, <4 x i32> %x3, i32 2)
   call void @llvm.x86.avx512.scatterdiv8.si(i8* %x0, i8 -1, <4 x i64> %x2, <4 x i32> %x3, i32 4)
@@ -679,7 +702,7 @@ declare void @llvm.x86.avx512.scattersiv2.df(i8*, i8, <4 x i32>, <2 x double>, i
 define void@test_int_x86_avx512_scattersiv2_df(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x double> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scattersiv2_df:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k2
 ; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,2) {%k2}
 ; CHECK-NEXT:    vscatterdpd %xmm1, (%rdi,%xmm0,4) {%k1}
@@ -694,7 +717,7 @@ declare void @llvm.x86.avx512.scattersiv2.di(i8*, i8, <4 x i32>, <2 x i64>, i32)
 define void@test_int_x86_avx512_scattersiv2_di(i8* %x0, i8 %x1, <4 x i32> %x2, <2 x i64> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scattersiv2_di:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k2
 ; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,2) {%k2}
 ; CHECK-NEXT:    vpscatterdq %xmm1, (%rdi,%xmm0,4) {%k1}
@@ -709,10 +732,11 @@ declare void @llvm.x86.avx512.scattersiv4.df(i8*, i8, <4 x i32>, <4 x double>, i
 define void@test_int_x86_avx512_scattersiv4_df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_df:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,2) {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
 ; CHECK-NEXT:    vscatterdpd %ymm1, (%rdi,%xmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x double> %x3, i32 2)
   call void @llvm.x86.avx512.scattersiv4.df(i8* %x0, i8 -1, <4 x i32> %x2, <4 x double> %x3, i32 4)
@@ -724,10 +748,11 @@ declare void @llvm.x86.avx512.scattersiv4.di(i8*, i8, <4 x i32>, <4 x i64>, i32)
 define void@test_int_x86_avx512_scattersiv4_di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_di:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k2
 ; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,2) {%k2}
 ; CHECK-NEXT:    vpscatterdq %ymm1, (%rdi,%xmm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 -1, <4 x i32> %x2, <4 x i64> %x3, i32 2)
   call void @llvm.x86.avx512.scattersiv4.di(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i64> %x3, i32 4)
@@ -739,7 +764,7 @@ declare void @llvm.x86.avx512.scattersiv4.sf(i8*, i8, <4 x i32>, <4 x float>, i3
 define void@test_int_x86_avx512_scattersiv4_sf(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x float> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_sf:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,2) {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
 ; CHECK-NEXT:    vscatterdps %xmm1, (%rdi,%xmm0,4) {%k1}
@@ -754,7 +779,7 @@ declare void @llvm.x86.avx512.scattersiv4.si(i8*, i8, <4 x i32>, <4 x i32>, i32)
 define void@test_int_x86_avx512_scattersiv4_si(i8* %x0, i8 %x1, <4 x i32> %x2, <4 x i32> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scattersiv4_si:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,2) {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
 ; CHECK-NEXT:    vpscatterdd %xmm1, (%rdi,%xmm0,4) {%k1}
@@ -769,10 +794,11 @@ declare void @llvm.x86.avx512.scattersiv8.sf(i8*, i8, <8 x i32>, <8 x float>, i3
 define void@test_int_x86_avx512_scattersiv8_sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scattersiv8_sf:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,2) {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
 ; CHECK-NEXT:    vscatterdps %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x float> %x3, i32 2)
   call void @llvm.x86.avx512.scattersiv8.sf(i8* %x0, i8 -1, <8 x i32> %x2, <8 x float> %x3, i32 4)
@@ -784,10 +810,11 @@ declare void @llvm.x86.avx512.scattersiv8.si(i8*, i8, <8 x i32>, <8 x i32>, i32)
 define void@test_int_x86_avx512_scattersiv8_si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_scattersiv8_si:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
 ; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 %x1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
   call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 4)
@@ -802,11 +829,12 @@ define void @scatter_mask_test(i8* %x0, <8 x i32> %x2, <8 x i32> %x3) {
 ; CHECK-NEXT:    kxorw %k0, %k0, %k1
 ; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
 ; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    kmovb %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1
 ; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,2) {%k1}
 ; CHECK-NEXT:    movb $96, %al
-; CHECK-NEXT:    kmovb %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1
 ; CHECK-NEXT:    vpscatterdd %ymm1, (%rdi,%ymm0,4) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 -1, <8 x i32> %x2, <8 x i32> %x3, i32 2)
   call void @llvm.x86.avx512.scattersiv8.si(i8* %x0, i8 0, <8 x i32> %x2, <8 x i32> %x3, i32 4)
@@ -819,17 +847,17 @@ define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %b
 ; CHECK-LABEL: gather_mask_test:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kxnorw %k0, %k0, %k1
-; CHECK-NEXT:    vmovaps %zmm1, %zmm2
+; CHECK-NEXT:    vxorps %zmm2, %zmm2, %zmm2
 ; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm2 {%k1}
 ; CHECK-NEXT:    kxorw %k0, %k0, %k1
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm3
 ; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1}
 ; CHECK-NEXT:    movw $1, %ax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm4
 ; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1}
 ; CHECK-NEXT:    movw $220, %ax
-; CHECK-NEXT:    kmovw %eax, %k1
+; CHECK-NEXT:    kmovd %eax, %k1
 ; CHECK-NEXT:    vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
 ; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm0
 ; CHECK-NEXT:    vaddps %zmm4, %zmm1, %zmm1
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index cb8ed0e59a3a2593d56ab5efd4736c842f0e3d57..87928348a851af83b10648f85d8c26fcab94a16f 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -1,24 +1,25 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefix=KNL %s
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX --check-prefix=SKX_ONLY %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx -mattr=avx512vbmi | FileCheck --check-prefix=SKX --check-prefix=SKX_VBMI %s
 
 define <16 x float> @test1(<16 x float> %x, float* %br, float %y) nounwind {
 ; KNL-LABEL: test1:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
-; KNL-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
-; KNL-NEXT:    vextractf32x4 $3, %zmm0, %xmm2
-; KNL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; KNL-NEXT:    vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
+; KNL-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
+; KNL-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
+; KNL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; KNL-NEXT:    vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test1:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vinsertps {{.*#+}} xmm2 = xmm0[0],mem[0],xmm0[2,3]
-; SKX-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
-; SKX-NEXT:    vextractf32x4 $3, %zmm0, %xmm2
-; SKX-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; SKX-NEXT:    vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
+; SKX-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
+; SKX-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
+; SKX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
+; SKX-NEXT:    vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
 ; SKX-NEXT:    retq
   %rrr = load float, float* %br
   %rrr2 = insertelement <16 x float> %x, float %rrr, i32 1
@@ -30,19 +31,19 @@ define <8 x double> @test2(<8 x double> %x, double* %br, double %y) nounwind {
 ; KNL-LABEL: test2:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
-; KNL-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm0
-; KNL-NEXT:    vextractf32x4 $3, %zmm0, %xmm2
-; KNL-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
-; KNL-NEXT:    vinsertf32x4 $3, %xmm1, %zmm0, %zmm0
+; KNL-NEXT:    vinsertf32x4 $0, %xmm2, %zmm0, %zmm2
+; KNL-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
+; KNL-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; KNL-NEXT:    vinsertf32x4 $3, %xmm0, %zmm2, %zmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test2:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vmovhpd {{.*#+}} xmm2 = xmm0[0],mem[0]
-; SKX-NEXT:    vinsertf64x2 $0, %xmm2, %zmm0, %zmm0
-; SKX-NEXT:    vextractf64x2 $3, %zmm0, %xmm2
-; SKX-NEXT:    vmovsd {{.*#+}} xmm1 = xmm1[0],xmm2[1]
-; SKX-NEXT:    vinsertf64x2 $3, %xmm1, %zmm0, %zmm0
+; SKX-NEXT:    vinsertf64x2 $0, %xmm2, %zmm0, %zmm2
+; SKX-NEXT:    vextractf64x2 $3, %zmm0, %xmm0
+; SKX-NEXT:    vmovsd {{.*#+}} xmm0 = xmm1[0],xmm0[1]
+; SKX-NEXT:    vinsertf64x2 $3, %xmm0, %zmm2, %zmm0
 ; SKX-NEXT:    retq
   %rrr = load double, double* %br
   %rrr2 = insertelement <8 x double> %x, double %rrr, i32 1
@@ -123,16 +124,31 @@ define void @test6(<4 x float> %x, float* %out) nounwind {
 define float @test7(<16 x float> %x, i32 %ind) nounwind {
 ; KNL-LABEL: test7:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    vmovd %edi, %xmm1
-; KNL-NEXT:    vpermps %zmm0, %zmm1, %zmm0
-; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %zmm0, (%rsp)
+; KNL-NEXT:    andl $15, %edi
+; KNL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test7:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    vmovd %edi, %xmm1
-; SKX-NEXT:    vpermps %zmm0, %zmm1, %zmm0
-; SKX-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %zmm0, (%rsp)
+; SKX-NEXT:    andl $15, %edi
+; SKX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %e = extractelement <16 x float> %x, i32 %ind
   ret float %e
@@ -141,18 +157,31 @@ define float @test7(<16 x float> %x, i32 %ind) nounwind {
 define double @test8(<8 x double> %x, i32 %ind) nounwind {
 ; KNL-LABEL: test8:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    movslq %edi, %rax
-; KNL-NEXT:    vmovq %rax, %xmm1
-; KNL-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
-; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %zmm0, (%rsp)
+; KNL-NEXT:    andl $7, %edi
+; KNL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test8:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    movslq %edi, %rax
-; SKX-NEXT:    vmovq %rax, %xmm1
-; SKX-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
-; SKX-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %zmm0, (%rsp)
+; SKX-NEXT:    andl $7, %edi
+; SKX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %e = extractelement <8 x double> %x, i32 %ind
   ret double %e
@@ -161,16 +190,31 @@ define double @test8(<8 x double> %x, i32 %ind) nounwind {
 define float @test9(<8 x float> %x, i32 %ind) nounwind {
 ; KNL-LABEL: test9:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    vmovd %edi, %xmm1
-; KNL-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:    andq $-32, %rsp
+; KNL-NEXT:    subq $64, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $7, %edi
+; KNL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test9:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    vmovd %edi, %xmm1
-; SKX-NEXT:    vpermps %ymm0, %ymm1, %ymm0
-; SKX-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:    andq $-32, %rsp
+; SKX-NEXT:    subq $64, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %ymm0, (%rsp)
+; SKX-NEXT:    andl $7, %edi
+; SKX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %e = extractelement <8 x float> %x, i32 %ind
   ret float %e
@@ -179,16 +223,31 @@ define float @test9(<8 x float> %x, i32 %ind) nounwind {
 define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
 ; KNL-LABEL: test10:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    vmovd %edi, %xmm1
-; KNL-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; KNL-NEXT:    vmovd %xmm0, %eax
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %zmm0, (%rsp)
+; KNL-NEXT:    andl $15, %edi
+; KNL-NEXT:    movl (%rsp,%rdi,4), %eax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test10:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    vmovd %edi, %xmm1
-; SKX-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; SKX-NEXT:    vmovd %xmm0, %eax
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %zmm0, (%rsp)
+; SKX-NEXT:    andl $15, %edi
+; SKX-NEXT:    movl (%rsp,%rdi,4), %eax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %e = extractelement <16 x i32> %x, i32 %ind
   ret i32 %e
@@ -216,7 +275,7 @@ define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
 ; SKX-NEXT:    vpcmpltud %zmm1, %zmm0, %k0
 ; SKX-NEXT:    kshiftlw $11, %k0, %k0
 ; SKX-NEXT:    kshiftrw $15, %k0, %k0
-; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    testb %al, %al
 ; SKX-NEXT:    je LBB10_2
@@ -258,11 +317,12 @@ define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) {
 ; SKX-NEXT:    kunpckbw %k0, %k1, %k0
 ; SKX-NEXT:    kshiftlw $15, %k0, %k0
 ; SKX-NEXT:    kshiftrw $15, %k0, %k0
-; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    testb %al, %al
 ; SKX-NEXT:    cmoveq %rsi, %rdi
 ; SKX-NEXT:    movq %rdi, %rax
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %cmpvector_func.i = icmp slt <16 x i64> %a, %b
   %extract24vector_func.i = extractelement <16 x i1> %cmpvector_func.i, i32 0
@@ -283,6 +343,7 @@ define i16 @test13(i32 %a, i32 %b) {
 ; KNL-NEXT:    kshiftlw $1, %k1, %k1
 ; KNL-NEXT:    korw %k0, %k1, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test13:
@@ -290,13 +351,14 @@ define i16 @test13(i32 %a, i32 %b) {
 ; SKX-NEXT:    cmpl %esi, %edi
 ; SKX-NEXT:    setb %al
 ; SKX-NEXT:    andl $1, %eax
-; SKX-NEXT:    kmovw %eax, %k0
+; SKX-NEXT:    kmovd %eax, %k0
 ; SKX-NEXT:    movw $-4, %ax
-; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    kshiftrw $1, %k1, %k1
 ; SKX-NEXT:    kshiftlw $1, %k1, %k1
 ; SKX-NEXT:    korw %k0, %k1, %k0
-; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; SKX-NEXT:    retq
   %cmp_res = icmp ult i32 %a, %b
   %maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %cmp_res, i32 0
@@ -322,11 +384,12 @@ define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
 ; SKX-NEXT:    vpcmpgtq %zmm0, %zmm1, %k0
 ; SKX-NEXT:    kshiftlb $3, %k0, %k0
 ; SKX-NEXT:    kshiftrb $7, %k0, %k0
-; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    testb %al, %al
 ; SKX-NEXT:    cmoveq %rsi, %rdi
 ; SKX-NEXT:    movq %rdi, %rax
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %cmpvector_func.i = icmp slt <8 x i64> %a, %b
   %extract24vector_func.i = extractelement <8 x i1> %cmpvector_func.i, i32 4
@@ -372,6 +435,7 @@ define i16 @test16(i1 *%addr, i16 %a) {
 ; KNL-NEXT:    vpslld $31, %zmm2, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test16:
@@ -379,13 +443,15 @@ define i16 @test16(i1 *%addr, i16 %a) {
 ; SKX-NEXT:    movzbl (%rdi), %eax
 ; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    kmovd %eax, %k0
-; SKX-NEXT:    kmovw %esi, %k1
+; SKX-NEXT:    kmovd %esi, %k1
 ; SKX-NEXT:    vpmovm2d %k1, %zmm0
 ; SKX-NEXT:    vpmovm2d %k0, %zmm1
 ; SKX-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [0,1,2,3,4,5,6,7,8,9,16,11,12,13,14,15]
 ; SKX-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2
 ; SKX-NEXT:    vpmovd2m %zmm2, %k0
-; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x = load i1 , i1 * %addr, align 128
   %a1 = bitcast i16 %a to <16 x i1>
@@ -408,6 +474,7 @@ define i8 @test17(i1 *%addr, i8 %a) {
 ; KNL-NEXT:    vpsllq $63, %zmm2, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test17:
@@ -415,13 +482,15 @@ define i8 @test17(i1 *%addr, i8 %a) {
 ; SKX-NEXT:    movzbl (%rdi), %eax
 ; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    kmovd %eax, %k0
-; SKX-NEXT:    kmovb %esi, %k1
+; SKX-NEXT:    kmovd %esi, %k1
 ; SKX-NEXT:    vpmovm2q %k1, %zmm0
 ; SKX-NEXT:    vpmovm2q %k0, %zmm1
 ; SKX-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,8,5,6,7]
 ; SKX-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
 ; SKX-NEXT:    vpmovq2m %zmm2, %k0
-; SKX-NEXT:    kmovb %k0, %eax
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x = load i1 , i1 * %addr, align 128
   %a1 = bitcast i8 %a to <8 x i1>
@@ -443,6 +512,7 @@ define i64 @extract_v8i64(<8 x i64> %x, i64* %dst) {
 ; SKX-NEXT:    vpextrq $1, %xmm0, %rax
 ; SKX-NEXT:    vextracti64x2 $1, %zmm0, %xmm0
 ; SKX-NEXT:    vpextrq $1, %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %r1 = extractelement <8 x i64> %x, i32 1
   %r2 = extractelement <8 x i64> %x, i32 3
@@ -463,6 +533,7 @@ define i64 @extract_v4i64(<4 x i64> %x, i64* %dst) {
 ; SKX-NEXT:    vpextrq $1, %xmm0, %rax
 ; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; SKX-NEXT:    vpextrq $1, %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %r1 = extractelement <4 x i64> %x, i32 1
   %r2 = extractelement <4 x i64> %x, i32 3
@@ -501,6 +572,7 @@ define i32 @extract_v16i32(<16 x i32> %x, i32* %dst) {
 ; SKX-NEXT:    vpextrd $1, %xmm0, %eax
 ; SKX-NEXT:    vextracti32x4 $1, %zmm0, %xmm0
 ; SKX-NEXT:    vpextrd $1, %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %r1 = extractelement <16 x i32> %x, i32 1
   %r2 = extractelement <16 x i32> %x, i32 5
@@ -521,6 +593,7 @@ define i32 @extract_v8i32(<8 x i32> %x, i32* %dst) {
 ; SKX-NEXT:    vpextrd $1, %xmm0, %eax
 ; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; SKX-NEXT:    vpextrd $1, %xmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %r1 = extractelement <8 x i32> %x, i32 1
   %r2 = extractelement <8 x i32> %x, i32 5
@@ -561,6 +634,7 @@ define i16 @extract_v32i16(<32 x i16> %x, i16* %dst) {
 ; SKX-NEXT:    vextracti32x4 $1, %zmm0, %xmm0
 ; SKX-NEXT:    vpextrw $1, %xmm0, (%rdi)
 ; SKX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %r1 = extractelement <32 x i16> %x, i32 1
   %r2 = extractelement <32 x i16> %x, i32 9
@@ -583,6 +657,7 @@ define i16 @extract_v16i16(<16 x i16> %x, i16* %dst) {
 ; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; SKX-NEXT:    vpextrw $1, %xmm0, (%rdi)
 ; SKX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %r1 = extractelement <16 x i16> %x, i32 1
   %r2 = extractelement <16 x i16> %x, i32 9
@@ -625,6 +700,7 @@ define i8 @extract_v64i8(<64 x i8> %x, i8* %dst) {
 ; SKX-NEXT:    vextracti32x4 $1, %zmm0, %xmm0
 ; SKX-NEXT:    vpextrb $1, %xmm0, (%rdi)
 ; SKX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %r1 = extractelement <64 x i8> %x, i32 1
   %r2 = extractelement <64 x i8> %x, i32 17
@@ -647,6 +723,7 @@ define i8 @extract_v32i8(<32 x i8> %x, i8* %dst) {
 ; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; SKX-NEXT:    vpextrb $1, %xmm0, (%rdi)
 ; SKX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %r1 = extractelement <32 x i8> %x, i32 1
   %r2 = extractelement <32 x i8> %x, i32 17
@@ -678,19 +755,19 @@ define <8 x i64> @insert_v8i64(<8 x i64> %x, i64 %y , i64* %ptr) {
 ; KNL-LABEL: insert_v8i64:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vpinsrq $1, (%rsi), %xmm0, %xmm1
-; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; KNL-NEXT:    vextracti32x4 $1, %zmm0, %xmm1
-; KNL-NEXT:    vpinsrq $1, %rdi, %xmm1, %xmm1
-; KNL-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
+; KNL-NEXT:    vextracti32x4 $1, %zmm0, %xmm0
+; KNL-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
+; KNL-NEXT:    vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: insert_v8i64:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpinsrq $1, (%rsi), %xmm0, %xmm1
-; SKX-NEXT:    vinserti64x2 $0, %xmm1, %zmm0, %zmm0
-; SKX-NEXT:    vextracti64x2 $1, %zmm0, %xmm1
-; SKX-NEXT:    vpinsrq $1, %rdi, %xmm1, %xmm1
-; SKX-NEXT:    vinserti64x2 $1, %xmm1, %zmm0, %zmm0
+; SKX-NEXT:    vinserti64x2 $0, %xmm1, %zmm0, %zmm1
+; SKX-NEXT:    vextracti64x2 $1, %zmm0, %xmm0
+; SKX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
+; SKX-NEXT:    vinserti64x2 $1, %xmm0, %zmm1, %zmm0
 ; SKX-NEXT:    retq
   %val = load i64, i64* %ptr
   %r1 = insertelement <8 x i64> %x, i64 %val, i32 1
@@ -702,19 +779,19 @@ define <4 x i64> @insert_v4i64(<4 x i64> %x, i64 %y , i64* %ptr) {
 ; KNL-LABEL: insert_v4i64:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vpinsrq $1, (%rsi), %xmm0, %xmm1
-; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT:    vpinsrq $1, %rdi, %xmm1, %xmm1
-; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
+; KNL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: insert_v4i64:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpinsrq $1, (%rsi), %xmm0, %xmm1
-; SKX-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; SKX-NEXT:    vpinsrq $1, %rdi, %xmm1, %xmm1
-; SKX-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; SKX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; SKX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
+; SKX-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; SKX-NEXT:    retq
   %val = load i64, i64* %ptr
   %r1 = insertelement <4 x i64> %x, i64 %val, i32 1
@@ -744,19 +821,19 @@ define <16 x i32> @insert_v16i32(<16 x i32> %x, i32 %y, i32* %ptr) {
 ; KNL-LABEL: insert_v16i32:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vpinsrd $1, (%rsi), %xmm0, %xmm1
-; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; KNL-NEXT:    vextracti32x4 $1, %zmm0, %xmm1
-; KNL-NEXT:    vpinsrd $1, %edi, %xmm1, %xmm1
-; KNL-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
+; KNL-NEXT:    vextracti32x4 $1, %zmm0, %xmm0
+; KNL-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
+; KNL-NEXT:    vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: insert_v16i32:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpinsrd $1, (%rsi), %xmm0, %xmm1
-; SKX-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; SKX-NEXT:    vextracti32x4 $1, %zmm0, %xmm1
-; SKX-NEXT:    vpinsrd $1, %edi, %xmm1, %xmm1
-; SKX-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; SKX-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
+; SKX-NEXT:    vextracti32x4 $1, %zmm0, %xmm0
+; SKX-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
+; SKX-NEXT:    vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; SKX-NEXT:    retq
   %val = load i32, i32* %ptr
   %r1 = insertelement <16 x i32> %x, i32 %val, i32 1
@@ -768,19 +845,19 @@ define <8 x i32> @insert_v8i32(<8 x i32> %x, i32 %y, i32* %ptr) {
 ; KNL-LABEL: insert_v8i32:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vpinsrd $1, (%rsi), %xmm0, %xmm1
-; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT:    vpinsrd $1, %edi, %xmm1, %xmm1
-; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
+; KNL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: insert_v8i32:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpinsrd $1, (%rsi), %xmm0, %xmm1
-; SKX-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; SKX-NEXT:    vpinsrd $1, %edi, %xmm1, %xmm1
-; SKX-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; SKX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; SKX-NEXT:    vpinsrd $1, %edi, %xmm0, %xmm0
+; SKX-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; SKX-NEXT:    retq
   %val = load i32, i32* %ptr
   %r1 = insertelement <8 x i32> %x, i32 %val, i32 1
@@ -810,19 +887,19 @@ define <32 x i16> @insert_v32i16(<32 x i16> %x, i16 %y, i16* %ptr) {
 ; KNL-LABEL: insert_v32i16:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vpinsrw $1, (%rsi), %xmm0, %xmm2
-; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; KNL-NEXT:    vpinsrw $1, %edi, %xmm2, %xmm2
-; KNL-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
+; KNL-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0
+; KNL-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: insert_v32i16:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpinsrw $1, (%rsi), %xmm0, %xmm1
-; SKX-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; SKX-NEXT:    vextracti32x4 $1, %zmm0, %xmm1
-; SKX-NEXT:    vpinsrw $1, %edi, %xmm1, %xmm1
-; SKX-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm0
+; SKX-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
+; SKX-NEXT:    vextracti32x4 $1, %zmm0, %xmm0
+; SKX-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0
+; SKX-NEXT:    vinserti32x4 $1, %xmm0, %zmm1, %zmm0
 ; SKX-NEXT:    retq
   %val = load i16, i16* %ptr
   %r1 = insertelement <32 x i16> %x, i16 %val, i32 1
@@ -834,19 +911,19 @@ define <16 x i16> @insert_v16i16(<16 x i16> %x, i16 %y, i16* %ptr) {
 ; KNL-LABEL: insert_v16i16:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vpinsrw $1, (%rsi), %xmm0, %xmm1
-; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT:    vpinsrw $1, %edi, %xmm1, %xmm1
-; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0
+; KNL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: insert_v16i16:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpinsrw $1, (%rsi), %xmm0, %xmm1
-; SKX-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; SKX-NEXT:    vpinsrw $1, %edi, %xmm1, %xmm1
-; SKX-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; SKX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; SKX-NEXT:    vpinsrw $1, %edi, %xmm0, %xmm0
+; SKX-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; SKX-NEXT:    retq
   %val = load i16, i16* %ptr
   %r1 = insertelement <16 x i16> %x, i16 %val, i32 1
@@ -885,10 +962,10 @@ define <64 x i8> @insert_v64i8(<64 x i8> %x, i8 %y, i8* %ptr) {
 ; SKX-LABEL: insert_v64i8:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpinsrb $1, (%rsi), %xmm0, %xmm1
-; SKX-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
-; SKX-NEXT:    vextracti32x4 $3, %zmm0, %xmm1
-; SKX-NEXT:    vpinsrb $2, %edi, %xmm1, %xmm1
-; SKX-NEXT:    vinserti32x4 $3, %xmm1, %zmm0, %zmm0
+; SKX-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm1
+; SKX-NEXT:    vextracti32x4 $3, %zmm0, %xmm0
+; SKX-NEXT:    vpinsrb $2, %edi, %xmm0, %xmm0
+; SKX-NEXT:    vinserti32x4 $3, %xmm0, %zmm1, %zmm0
 ; SKX-NEXT:    retq
   %val = load i8, i8* %ptr
   %r1 = insertelement <64 x i8> %x, i8 %val, i32 1
@@ -900,19 +977,19 @@ define <32 x i8> @insert_v32i8(<32 x i8> %x, i8 %y, i8* %ptr) {
 ; KNL-LABEL: insert_v32i8:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vpinsrb $1, (%rsi), %xmm0, %xmm1
-; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; KNL-NEXT:    vpinsrb $1, %edi, %xmm1, %xmm1
-; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; KNL-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0
+; KNL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: insert_v32i8:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpinsrb $1, (%rsi), %xmm0, %xmm1
-; SKX-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; SKX-NEXT:    vpinsrb $1, %edi, %xmm1, %xmm1
-; SKX-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; SKX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; SKX-NEXT:    vpinsrb $1, %edi, %xmm0, %xmm0
+; SKX-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; SKX-NEXT:    retq
   %val = load i8, i8* %ptr
   %r1 = insertelement <32 x i8> %x, i8 %val, i32 1
@@ -1051,149 +1128,148 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
 ; KNL-NEXT:    .cfi_def_cfa_register %rbp
 ; KNL-NEXT:    andq $-32, %rsp
 ; KNL-NEXT:    subq $32, %rsp
+; KNL-NEXT:    xorl %eax, %eax
 ; KNL-NEXT:    cmpl %esi, %edi
+; KNL-NEXT:    setb %al
 ; KNL-NEXT:    vpcmpltud %zmm3, %zmm1, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vmovd %ecx, %xmm1
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    vmovd %edx, %xmm1
+; KNL-NEXT:    vpinsrb $1, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $3, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $11, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $4, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $10, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $5, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $9, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $6, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $8, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $7, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $7, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $8, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $6, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $9, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $5, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $10, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $11, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $3, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $12, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $13, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $14, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    vpinsrb $15, %ecx, %xmm1, %xmm1
 ; KNL-NEXT:    vpcmpltud %zmm2, %zmm0, %k0
 ; KNL-NEXT:    kshiftlw $14, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
+; KNL-NEXT:    kmovw %k1, %ecx
 ; KNL-NEXT:    kshiftlw $15, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %ecx
-; KNL-NEXT:    vmovd %ecx, %xmm0
-; KNL-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %edx
+; KNL-NEXT:    vmovd %edx, %xmm0
+; KNL-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $13, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $12, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $11, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $10, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $9, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $8, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $7, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $6, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $9, %ecx, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $5, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $4, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $11, %ecx, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $3, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $2, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftlw $1, %k0, %k1
 ; KNL-NEXT:    kshiftrw $15, %k1, %k1
-; KNL-NEXT:    kmovw %k1, %eax
-; KNL-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k1, %ecx
+; KNL-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; KNL-NEXT:    kshiftrw $15, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; KNL-NEXT:    kmovw %k0, %ecx
+; KNL-NEXT:    vpinsrb $15, %ecx, %xmm0, %xmm0
 ; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; KNL-NEXT:    vpsllw $7, %ymm0, %ymm0
 ; KNL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
 ; KNL-NEXT:    vpxor %ymm1, %ymm1, %ymm1
 ; KNL-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
-; KNL-NEXT:    sbbl %eax, %eax
-; KNL-NEXT:    andl $1, %eax
-; KNL-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm1
-; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
 ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; KNL-NEXT:    vpmovsxbd %xmm1, %zmm1
 ; KNL-NEXT:    vpslld $31, %zmm1, %zmm1
 ; KNL-NEXT:    vptestmd %zmm1, %zmm1, %k0
 ; KNL-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
 ; KNL-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; KNL-NEXT:    vpslld $31, %zmm0, %zmm0
 ; KNL-NEXT:    vptestmd %zmm0, %zmm0, %k0
@@ -1208,7 +1284,7 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
 ; SKX-NEXT:    cmpl %esi, %edi
 ; SKX-NEXT:    setb %al
 ; SKX-NEXT:    andl $1, %eax
-; SKX-NEXT:    kmovw %eax, %k0
+; SKX-NEXT:    kmovd %eax, %k0
 ; SKX-NEXT:    vpcmpltud %zmm2, %zmm0, %k1
 ; SKX-NEXT:    vpcmpltud %zmm3, %zmm1, %k2
 ; SKX-NEXT:    kunpckwd %k1, %k2, %k1
@@ -1218,6 +1294,7 @@ define i32 @test_insertelement_v32i1(i32 %a, i32 %b, <32 x i32> %x , <32 x i32>
 ; SKX-NEXT:    vpermi2w %zmm1, %zmm0, %zmm2
 ; SKX-NEXT:    vpmovw2m %zmm2, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %cmp_res_i1 = icmp ult i32 %a, %b
   %cmp_cmp_vec = icmp ult <32 x i32> %x, %y
@@ -1265,6 +1342,7 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
 ; KNL-NEXT:    vpsllq $63, %zmm2, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_iinsertelement_v4i1:
@@ -1272,14 +1350,15 @@ define i8 @test_iinsertelement_v4i1(i32 %a, i32 %b, <4 x i32> %x , <4 x i32> %y)
 ; SKX-NEXT:    cmpl %esi, %edi
 ; SKX-NEXT:    setb %al
 ; SKX-NEXT:    andl $1, %eax
-; SKX-NEXT:    kmovw %eax, %k0
+; SKX-NEXT:    kmovd %eax, %k0
 ; SKX-NEXT:    vpcmpltud %xmm1, %xmm0, %k1
 ; SKX-NEXT:    vpmovm2d %k1, %xmm0
 ; SKX-NEXT:    vpmovm2d %k0, %xmm1
 ; SKX-NEXT:    vpbroadcastq %xmm1, %xmm1
 ; SKX-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3]
 ; SKX-NEXT:    vpmovd2m %xmm0, %k0
-; SKX-NEXT:    kmovb %k0, %eax
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; SKX-NEXT:    retq
   %cmp_res_i1 = icmp ult i32 %a, %b
   %cmp_cmp_vec = icmp ult <4 x i32> %x, %y
@@ -1310,6 +1389,7 @@ define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y)
 ; KNL-NEXT:    vpsllq $63, %zmm2, %zmm0
 ; KNL-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_iinsertelement_v2i1:
@@ -1317,13 +1397,14 @@ define i8 @test_iinsertelement_v2i1(i32 %a, i32 %b, <2 x i64> %x , <2 x i64> %y)
 ; SKX-NEXT:    cmpl %esi, %edi
 ; SKX-NEXT:    setb %al
 ; SKX-NEXT:    andl $1, %eax
-; SKX-NEXT:    kmovw %eax, %k0
+; SKX-NEXT:    kmovd %eax, %k0
 ; SKX-NEXT:    vpcmpltuq %xmm1, %xmm0, %k1
 ; SKX-NEXT:    kshiftlw $1, %k1, %k1
 ; SKX-NEXT:    kshiftrw $1, %k1, %k1
 ; SKX-NEXT:    kshiftlw $1, %k0, %k0
 ; SKX-NEXT:    korw %k0, %k1, %k0
-; SKX-NEXT:    kmovb %k0, %eax
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; SKX-NEXT:    retq
   %cmp_res_i1 = icmp ult i32 %a, %b
   %cmp_cmp_vec = icmp ult <2 x i64> %x, %y
@@ -1340,10 +1421,8 @@ define zeroext i8 @test_extractelement_v2i1(<2 x i64> %a, <2 x i64> %b) {
 ; KNL-NEXT:    vpxor %xmm2, %xmm1, %xmm1
 ; KNL-NEXT:    vpxor %xmm2, %xmm0, %xmm0
 ; KNL-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
-; KNL-NEXT:    vmovq %xmm0, %rax
-; KNL-NEXT:    testb $1, %al
-; KNL-NEXT:    sete %al
-; KNL-NEXT:    addb $3, %al
+; KNL-NEXT:    vpextrb $0, %xmm0, %eax
+; KNL-NEXT:    addb $4, %al
 ; KNL-NEXT:    movzbl %al, %eax
 ; KNL-NEXT:    retq
 ;
@@ -1352,11 +1431,11 @@ define zeroext i8 @test_extractelement_v2i1(<2 x i64> %a, <2 x i64> %b) {
 ; SKX-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k0
 ; SKX-NEXT:    kshiftlw $15, %k0, %k0
 ; SKX-NEXT:    kshiftrw $15, %k0, %k0
-; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    andl $1, %eax
-; SKX-NEXT:    testb %al, %al
-; SKX-NEXT:    sete %al
-; SKX-NEXT:    addb $3, %al
+; SKX-NEXT:    cmpb $1, %al
+; SKX-NEXT:    movb $3, %al
+; SKX-NEXT:    adcb $0, %al
 ; SKX-NEXT:    movzbl %al, %eax
 ; SKX-NEXT:    retq
   %t1 = icmp ugt <2 x i64> %a, %b
@@ -1365,6 +1444,37 @@ define zeroext i8 @test_extractelement_v2i1(<2 x i64> %a, <2 x i64> %b) {
   ret i8 %res
 }
 
+define zeroext i8 @extractelement_v2i1_alt(<2 x i64> %a, <2 x i64> %b) {
+; KNL-LABEL: extractelement_v2i1_alt:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; KNL-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; KNL-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vpextrb $0, %xmm0, %eax
+; KNL-NEXT:    addb $4, %al
+; KNL-NEXT:    movzbl %al, %eax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: extractelement_v2i1_alt:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k0
+; SKX-NEXT:    kshiftlw $15, %k0, %k0
+; SKX-NEXT:    kshiftrw $15, %k0, %k0
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    andl $1, %eax
+; SKX-NEXT:    cmpb $1, %al
+; SKX-NEXT:    movb $3, %al
+; SKX-NEXT:    adcb $0, %al
+; SKX-NEXT:    movzbl %al, %eax
+; SKX-NEXT:    retq
+  %t1 = icmp ugt <2 x i64> %a, %b
+  %t2 = extractelement <2 x i1> %t1, i32 0
+  %sext = sext i1 %t2 to i8
+  %res = add i8 %sext, 4
+  ret i8 %res
+}
+
 define zeroext i8 @test_extractelement_v4i1(<4 x i32> %a, <4 x i32> %b) {
 ; KNL-LABEL: test_extractelement_v4i1:
 ; KNL:       ## BB#0:
@@ -1381,7 +1491,7 @@ define zeroext i8 @test_extractelement_v4i1(<4 x i32> %a, <4 x i32> %b) {
 ; SKX-NEXT:    vpcmpnleud %xmm1, %xmm0, %k0
 ; SKX-NEXT:    kshiftlw $12, %k0, %k0
 ; SKX-NEXT:    kshiftrw $15, %k0, %k0
-; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    andl $1, %eax
 ; SKX-NEXT:    retq
   %t1 = icmp ugt <4 x i32> %a, %b
@@ -1406,8 +1516,9 @@ define zeroext i8 @test_extractelement_v32i1(<32 x i8> %a, <32 x i8> %b) {
 ; SKX-NEXT:    vpcmpnleub %ymm1, %ymm0, %k0
 ; SKX-NEXT:    kshiftld $29, %k0, %k0
 ; SKX-NEXT:    kshiftrd $31, %k0, %k0
-; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    andl $1, %eax
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %t1 = icmp ugt <32 x i8> %a, %b
   %t2 = extractelement <32 x i1> %t1, i32 2
@@ -1424,9 +1535,7 @@ define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
 ; KNL-NEXT:    vpcmpgtb %ymm2, %ymm0, %ymm0
 ; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; KNL-NEXT:    vpextrb $15, %xmm0, %eax
-; KNL-NEXT:    testb $1, %al
-; KNL-NEXT:    sete %al
-; KNL-NEXT:    addb $3, %al
+; KNL-NEXT:    addb $4, %al
 ; KNL-NEXT:    movzbl %al, %eax
 ; KNL-NEXT:    retq
 ;
@@ -1434,15 +1543,996 @@ define zeroext i8 @test_extractelement_v64i1(<64 x i8> %a, <64 x i8> %b) {
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0
 ; SKX-NEXT:    kshiftrq $63, %k0, %k0
-; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    kmovd %k0, %eax
 ; SKX-NEXT:    andl $1, %eax
-; SKX-NEXT:    testb %al, %al
-; SKX-NEXT:    sete %al
-; SKX-NEXT:    addb $3, %al
+; SKX-NEXT:    cmpb $1, %al
+; SKX-NEXT:    movb $3, %al
+; SKX-NEXT:    adcb $0, %al
 ; SKX-NEXT:    movzbl %al, %eax
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %t1 = icmp ugt <64 x i8> %a, %b
   %t2 = extractelement <64 x i1> %t1, i32 63
   %res = select i1 %t2, i8 3, i8 4
   ret i8 %res
 }
+
+define zeroext i8 @extractelement_v64i1_alt(<64 x i8> %a, <64 x i8> %b) {
+; KNL-LABEL: extractelement_v64i1_alt:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT:    vpxor %ymm0, %ymm3, %ymm2
+; KNL-NEXT:    vpxor %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vpcmpgtb %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; KNL-NEXT:    vpextrb $15, %xmm0, %eax
+; KNL-NEXT:    addb $4, %al
+; KNL-NEXT:    movzbl %al, %eax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: extractelement_v64i1_alt:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0
+; SKX-NEXT:    kshiftrq $63, %k0, %k0
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    andl $1, %eax
+; SKX-NEXT:    cmpb $1, %al
+; SKX-NEXT:    movb $3, %al
+; SKX-NEXT:    adcb $0, %al
+; SKX-NEXT:    movzbl %al, %eax
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t1 = icmp ugt <64 x i8> %a, %b
+  %t2 = extractelement <64 x i1> %t1, i32 63
+  %sext = sext i1 %t2 to i8
+  %res = add i8 %sext, 4
+  ret i8 %res
+}
+
+define i64 @test_extractelement_variable_v2i64(<2 x i64> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v2i64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    andl $1, %edi
+; KNL-NEXT:    movq -24(%rsp,%rdi,8), %rax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v2i64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; SKX-NEXT:    andl $1, %edi
+; SKX-NEXT:    movq -24(%rsp,%rdi,8), %rax
+; SKX-NEXT:    retq
+  %t2 = extractelement <2 x i64> %t1, i32 %index
+  ret i64 %t2
+}
+
+define i64 @test_extractelement_variable_v4i64(<4 x i64> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v4i64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi3:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi4:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi5:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-32, %rsp
+; KNL-NEXT:    subq $64, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $3, %edi
+; KNL-NEXT:    movq (%rsp,%rdi,8), %rax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v4i64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi0:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi1:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi2:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-32, %rsp
+; SKX-NEXT:    subq $64, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %ymm0, (%rsp)
+; SKX-NEXT:    andl $3, %edi
+; SKX-NEXT:    movq (%rsp,%rdi,8), %rax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t2 = extractelement <4 x i64> %t1, i32 %index
+  ret i64 %t2
+}
+
+define i64 @test_extractelement_variable_v8i64(<8 x i64> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v8i64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi6:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi7:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi8:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %zmm0, (%rsp)
+; KNL-NEXT:    andl $7, %edi
+; KNL-NEXT:    movq (%rsp,%rdi,8), %rax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v8i64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi3:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi4:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi5:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %zmm0, (%rsp)
+; SKX-NEXT:    andl $7, %edi
+; SKX-NEXT:    movq (%rsp,%rdi,8), %rax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t2 = extractelement <8 x i64> %t1, i32 %index
+  ret i64 %t2
+}
+
+define double @test_extractelement_variable_v2f64(<2 x double> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v2f64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    andl $1, %edi
+; KNL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v2f64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; SKX-NEXT:    andl $1, %edi
+; SKX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; SKX-NEXT:    retq
+  %t2 = extractelement <2 x double> %t1, i32 %index
+  ret double %t2
+}
+
+define double @test_extractelement_variable_v4f64(<4 x double> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v4f64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi9:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi10:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi11:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-32, %rsp
+; KNL-NEXT:    subq $64, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $3, %edi
+; KNL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v4f64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi6:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi7:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi8:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-32, %rsp
+; SKX-NEXT:    subq $64, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %ymm0, (%rsp)
+; SKX-NEXT:    andl $3, %edi
+; SKX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t2 = extractelement <4 x double> %t1, i32 %index
+  ret double %t2
+}
+
+define double @test_extractelement_variable_v8f64(<8 x double> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v8f64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi12:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi13:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi14:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %zmm0, (%rsp)
+; KNL-NEXT:    andl $7, %edi
+; KNL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v8f64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi9:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi10:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi11:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %zmm0, (%rsp)
+; SKX-NEXT:    andl $7, %edi
+; SKX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t2 = extractelement <8 x double> %t1, i32 %index
+  ret double %t2
+}
+
+define i32 @test_extractelement_variable_v4i32(<4 x i32> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v4i32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    andl $3, %edi
+; KNL-NEXT:    movl -24(%rsp,%rdi,4), %eax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v4i32:
+; SKX:       ## BB#0:
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; SKX-NEXT:    andl $3, %edi
+; SKX-NEXT:    movl -24(%rsp,%rdi,4), %eax
+; SKX-NEXT:    retq
+  %t2 = extractelement <4 x i32> %t1, i32 %index
+  ret i32 %t2
+}
+
+define i32 @test_extractelement_variable_v8i32(<8 x i32> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v8i32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi15:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi16:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi17:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-32, %rsp
+; KNL-NEXT:    subq $64, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $7, %edi
+; KNL-NEXT:    movl (%rsp,%rdi,4), %eax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v8i32:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi12:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi13:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi14:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-32, %rsp
+; SKX-NEXT:    subq $64, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %ymm0, (%rsp)
+; SKX-NEXT:    andl $7, %edi
+; SKX-NEXT:    movl (%rsp,%rdi,4), %eax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t2 = extractelement <8 x i32> %t1, i32 %index
+  ret i32 %t2
+}
+
+define i32 @test_extractelement_variable_v16i32(<16 x i32> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v16i32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi18:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi19:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi20:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %zmm0, (%rsp)
+; KNL-NEXT:    andl $15, %edi
+; KNL-NEXT:    movl (%rsp,%rdi,4), %eax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v16i32:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi15:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi16:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi17:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %zmm0, (%rsp)
+; SKX-NEXT:    andl $15, %edi
+; SKX-NEXT:    movl (%rsp,%rdi,4), %eax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t2 = extractelement <16 x i32> %t1, i32 %index
+  ret i32 %t2
+}
+
+define float @test_extractelement_variable_v4f32(<4 x float> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v4f32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    andl $3, %edi
+; KNL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v4f32:
+; SKX:       ## BB#0:
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; SKX-NEXT:    andl $3, %edi
+; SKX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT:    retq
+  %t2 = extractelement <4 x float> %t1, i32 %index
+  ret float %t2
+}
+
+define float @test_extractelement_variable_v8f32(<8 x float> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v8f32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi21:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi22:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi23:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-32, %rsp
+; KNL-NEXT:    subq $64, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $7, %edi
+; KNL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v8f32:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi18:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi19:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi20:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-32, %rsp
+; SKX-NEXT:    subq $64, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %ymm0, (%rsp)
+; SKX-NEXT:    andl $7, %edi
+; SKX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t2 = extractelement <8 x float> %t1, i32 %index
+  ret float %t2
+}
+
+define float @test_extractelement_variable_v16f32(<16 x float> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v16f32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi24:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi25:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi26:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %zmm0, (%rsp)
+; KNL-NEXT:    andl $15, %edi
+; KNL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v16f32:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi21:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi22:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi23:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovaps %zmm0, (%rsp)
+; SKX-NEXT:    andl $15, %edi
+; SKX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t2 = extractelement <16 x float> %t1, i32 %index
+  ret float %t2
+}
+
+define i16 @test_extractelement_variable_v8i16(<8 x i16> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v8i16:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    andl $7, %edi
+; KNL-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v8i16:
+; SKX:       ## BB#0:
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovdqu %xmm0, -{{[0-9]+}}(%rsp)
+; SKX-NEXT:    andl $7, %edi
+; SKX-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
+; SKX-NEXT:    retq
+  %t2 = extractelement <8 x i16> %t1, i32 %index
+  ret i16 %t2
+}
+
+define i16 @test_extractelement_variable_v16i16(<16 x i16> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v16i16:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi27:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi28:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi29:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-32, %rsp
+; KNL-NEXT:    subq $64, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $15, %edi
+; KNL-NEXT:    movzwl (%rsp,%rdi,2), %eax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v16i16:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi24:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi25:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi26:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-32, %rsp
+; SKX-NEXT:    subq $64, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovdqu %ymm0, (%rsp)
+; SKX-NEXT:    andl $15, %edi
+; SKX-NEXT:    movzwl (%rsp,%rdi,2), %eax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t2 = extractelement <16 x i16> %t1, i32 %index
+  ret i16 %t2
+}
+
+define i16 @test_extractelement_variable_v32i16(<32 x i16> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v32i16:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi30:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi31:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi32:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $31, %edi
+; KNL-NEXT:    movzwl (%rsp,%rdi,2), %eax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v32i16:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi27:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi28:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi29:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovdqu16 %zmm0, (%rsp)
+; SKX-NEXT:    andl $31, %edi
+; SKX-NEXT:    movzwl (%rsp,%rdi,2), %eax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t2 = extractelement <32 x i16> %t1, i32 %index
+  ret i16 %t2
+}
+
+define i8 @test_extractelement_variable_v16i8(<16 x i8> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v16i8:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    andl $15, %edi
+; KNL-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
+; KNL-NEXT:    movb (%rdi,%rax), %al
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v16i8:
+; SKX:       ## BB#0:
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovdqu %xmm0, -{{[0-9]+}}(%rsp)
+; SKX-NEXT:    andl $15, %edi
+; SKX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
+; SKX-NEXT:    movb (%rdi,%rax), %al
+; SKX-NEXT:    retq
+  %t2 = extractelement <16 x i8> %t1, i32 %index
+  ret i8 %t2
+}
+
+define i8 @test_extractelement_variable_v32i8(<32 x i8> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v32i8:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi33:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi34:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi35:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-32, %rsp
+; KNL-NEXT:    subq $64, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $31, %edi
+; KNL-NEXT:    movq %rsp, %rax
+; KNL-NEXT:    movb (%rdi,%rax), %al
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v32i8:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi30:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi31:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi32:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-32, %rsp
+; SKX-NEXT:    subq $64, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovdqu %ymm0, (%rsp)
+; SKX-NEXT:    andl $31, %edi
+; SKX-NEXT:    movq %rsp, %rax
+; SKX-NEXT:    movb (%rdi,%rax), %al
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+
+  %t2 = extractelement <32 x i8> %t1, i32 %index
+  ret i8 %t2
+}
+
+define i8 @test_extractelement_variable_v64i8(<64 x i8> %t1, i32 %index) {
+; KNL-LABEL: test_extractelement_variable_v64i8:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi36:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi37:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi38:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    andl $63, %edi
+; KNL-NEXT:    movq %rsp, %rax
+; KNL-NEXT:    movb (%rdi,%rax), %al
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v64i8:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi33:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi34:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi35:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vmovdqu8 %zmm0, (%rsp)
+; SKX-NEXT:    andl $63, %edi
+; SKX-NEXT:    movq %rsp, %rax
+; SKX-NEXT:    movb (%rdi,%rax), %al
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+
+  %t2 = extractelement <64 x i8> %t1, i32 %index
+  ret i8 %t2
+}
+
+define i8 @test_extractelement_variable_v64i8_indexi8(<64 x i8> %t1, i8 %index) {
+; KNL-LABEL: test_extractelement_variable_v64i8_indexi8:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi39:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi40:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi41:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    addb %dil, %dil
+; KNL-NEXT:    vmovaps %ymm1, {{[0-9]+}}(%rsp)
+; KNL-NEXT:    vmovaps %ymm0, (%rsp)
+; KNL-NEXT:    movzbl %dil, %eax
+; KNL-NEXT:    andl $63, %eax
+; KNL-NEXT:    movq %rsp, %rcx
+; KNL-NEXT:    movb (%rax,%rcx), %al
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_variable_v64i8_indexi8:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi36:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi37:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi38:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    addb %dil, %dil
+; SKX-NEXT:    vmovdqu8 %zmm0, (%rsp)
+; SKX-NEXT:    movzbl %dil, %eax
+; SKX-NEXT:    andl $63, %eax
+; SKX-NEXT:    movq %rsp, %rcx
+; SKX-NEXT:    movb (%rax,%rcx), %al
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+
+  %i  = add i8 %index, %index
+  %t2 = extractelement <64 x i8> %t1, i8 %i
+  ret i8 %t2
+}
+
+define zeroext i8 @test_extractelement_varible_v2i1(<2 x i64> %a, <2 x i64> %b, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v2i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovdqa {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
+; KNL-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; KNL-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    andl $1, %edi
+; KNL-NEXT:    movl -24(%rsp,%rdi,8), %eax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v2i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k0
+; SKX-NEXT:    vpmovm2q %k0, %xmm0
+; SKX-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SKX-NEXT:    andl $1, %edi
+; SKX-NEXT:    movl -24(%rsp,%rdi,8), %eax
+; SKX-NEXT:    andl $1, %eax
+; SKX-NEXT:    retq
+  %t1 = icmp ugt <2 x i64> %a, %b
+  %t2 = extractelement <2 x i1> %t1, i32 %index
+  %res = zext i1 %t2 to i8
+  ret i8 %res
+}
+
+define zeroext i8 @test_extractelement_varible_v4i1(<4 x i32> %a, <4 x i32> %b, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v4i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
+; KNL-NEXT:    vpxor %xmm2, %xmm1, %xmm1
+; KNL-NEXT:    vpxor %xmm2, %xmm0, %xmm0
+; KNL-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; KNL-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; KNL-NEXT:    andl $3, %edi
+; KNL-NEXT:    movl -24(%rsp,%rdi,4), %eax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v4i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vpcmpnleud %xmm1, %xmm0, %k0
+; SKX-NEXT:    vpmovm2d %k0, %xmm0
+; SKX-NEXT:    vmovdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SKX-NEXT:    andl $3, %edi
+; SKX-NEXT:    movl -24(%rsp,%rdi,4), %eax
+; SKX-NEXT:    andl $1, %eax
+; SKX-NEXT:    retq
+  %t1 = icmp ugt <4 x i32> %a, %b
+  %t2 = extractelement <4 x i1> %t1, i32 %index
+  %res = zext i1 %t2 to i8
+  ret i8 %res
+}
+
+define zeroext i8 @test_extractelement_varible_v8i1(<8 x i32> %a, <8 x i32> %b, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v8i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi42:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi43:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi44:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    ## kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1
+; KNL-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vmovdqa64 %zmm0, (%rsp)
+; KNL-NEXT:    andl $7, %edi
+; KNL-NEXT:    movl (%rsp,%rdi,8), %eax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v8i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi39:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi40:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi41:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vpcmpnleud %ymm1, %ymm0, %k0
+; SKX-NEXT:    vpmovm2q %k0, %zmm0
+; SKX-NEXT:    vmovdqa64 %zmm0, (%rsp)
+; SKX-NEXT:    andl $7, %edi
+; SKX-NEXT:    movl (%rsp,%rdi,8), %eax
+; SKX-NEXT:    andl $1, %eax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t1 = icmp ugt <8 x i32> %a, %b
+  %t2 = extractelement <8 x i1> %t1, i32 %index
+  %res = zext i1 %t2 to i8
+  ret i8 %res
+}
+
+define zeroext i8 @test_extractelement_varible_v16i1(<16 x i32> %a, <16 x i32> %b, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v16i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi45:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi46:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi47:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-64, %rsp
+; KNL-NEXT:    subq $128, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1
+; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vmovdqa32 %zmm0, (%rsp)
+; KNL-NEXT:    andl $15, %edi
+; KNL-NEXT:    movl (%rsp,%rdi,4), %eax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v16i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi42:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi43:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi44:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; SKX-NEXT:    vpmovm2d %k0, %zmm0
+; SKX-NEXT:    vmovdqa32 %zmm0, (%rsp)
+; SKX-NEXT:    andl $15, %edi
+; SKX-NEXT:    movl (%rsp,%rdi,4), %eax
+; SKX-NEXT:    andl $1, %eax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t1 = icmp ugt <16 x i32> %a, %b
+  %t2 = extractelement <16 x i1> %t1, i32 %index
+  %res = zext i1 %t2 to i8
+  ret i8 %res
+}
+
+define zeroext i8 @test_extractelement_varible_v32i1(<32 x i8> %a, <32 x i8> %b, i32 %index) {
+; KNL-LABEL: test_extractelement_varible_v32i1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    pushq %rbp
+; KNL-NEXT:  Lcfi48:
+; KNL-NEXT:    .cfi_def_cfa_offset 16
+; KNL-NEXT:  Lcfi49:
+; KNL-NEXT:    .cfi_offset %rbp, -16
+; KNL-NEXT:    movq %rsp, %rbp
+; KNL-NEXT:  Lcfi50:
+; KNL-NEXT:    .cfi_def_cfa_register %rbp
+; KNL-NEXT:    andq $-32, %rsp
+; KNL-NEXT:    subq $64, %rsp
+; KNL-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; KNL-NEXT:    vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; KNL-NEXT:    vpxor %ymm2, %ymm1, %ymm1
+; KNL-NEXT:    vpxor %ymm2, %ymm0, %ymm0
+; KNL-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
+; KNL-NEXT:    vmovdqa %ymm0, (%rsp)
+; KNL-NEXT:    andl $31, %edi
+; KNL-NEXT:    movq %rsp, %rax
+; KNL-NEXT:    movb (%rdi,%rax), %al
+; KNL-NEXT:    andb $1, %al
+; KNL-NEXT:    movzbl %al, %eax
+; KNL-NEXT:    movq %rbp, %rsp
+; KNL-NEXT:    popq %rbp
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_extractelement_varible_v32i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi45:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi46:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi47:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vpcmpnleub %ymm1, %ymm0, %k0
+; SKX-NEXT:    vpmovm2w %k0, %zmm0
+; SKX-NEXT:    vmovdqu16 %zmm0, (%rsp)
+; SKX-NEXT:    andl $31, %edi
+; SKX-NEXT:    movzwl (%rsp,%rdi,2), %eax
+; SKX-NEXT:    andl $1, %eax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t1 = icmp ugt <32 x i8> %a, %b
+  %t2 = extractelement <32 x i1> %t1, i32 %index
+  %res = zext i1 %t2 to i8
+  ret i8 %res
+}
+
diff --git a/test/CodeGen/X86/avx512-insert-extract_i1.ll b/test/CodeGen/X86/avx512-insert-extract_i1.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a1d1a7dae19009d66af0d7a87fb2a5ba6c0cfc60
--- /dev/null
+++ b/test/CodeGen/X86/avx512-insert-extract_i1.ll
@@ -0,0 +1,37 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefix=SKX --check-prefix=SKX_ONLY %s
+
+; TODO - fix fail on KNL and move this test to avx512-insert-extract.ll
+
+define zeroext i8 @test_extractelement_varible_v64i1(<64 x i8> %a, <64 x i8> %b, i32 %index) {
+; SKX-LABEL: test_extractelement_varible_v64i1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    pushq %rbp
+; SKX-NEXT:  Lcfi0:
+; SKX-NEXT:    .cfi_def_cfa_offset 16
+; SKX-NEXT:  Lcfi1:
+; SKX-NEXT:    .cfi_offset %rbp, -16
+; SKX-NEXT:    movq %rsp, %rbp
+; SKX-NEXT:  Lcfi2:
+; SKX-NEXT:    .cfi_def_cfa_register %rbp
+; SKX-NEXT:    andq $-64, %rsp
+; SKX-NEXT:    subq $128, %rsp
+; SKX-NEXT:    ## kill: %EDI<def> %EDI<kill> %RDI<def>
+; SKX-NEXT:    vpcmpnleub %zmm1, %zmm0, %k0
+; SKX-NEXT:    vpmovm2b %k0, %zmm0
+; SKX-NEXT:    vmovdqu8 %zmm0, (%rsp)
+; SKX-NEXT:    andl $63, %edi
+; SKX-NEXT:    movq %rsp, %rax
+; SKX-NEXT:    movb (%rdi,%rax), %al
+; SKX-NEXT:    andb $1, %al
+; SKX-NEXT:    movzbl %al, %eax
+; SKX-NEXT:    movq %rbp, %rsp
+; SKX-NEXT:    popq %rbp
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+  %t1 = icmp ugt <64 x i8> %a, %b
+  %t2 = extractelement <64 x i1> %t1, i32 %index
+  %res = zext i1 %t2 to i8
+  ret i8 %res
+}
+
diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
index 8590d641a4c55e4e96cc562edfedde09f1e9f522..1ac743d7d5bbb4914f3d7aaad7e5718e243a9e6a 100644
--- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -504,6 +504,7 @@ define i16 @test_pcmpeq_d(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
   ret i16 %res
@@ -515,6 +516,7 @@ define i16 @test_mask_pcmpeq_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.mask.pcmpeq.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
   ret i16 %res
@@ -527,6 +529,7 @@ define i8 @test_pcmpeq_q(<8 x i64> %a, <8 x i64> %b) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
   ret i8 %res
@@ -538,6 +541,7 @@ define i8 @test_mask_pcmpeq_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
   ret i8 %res
@@ -550,6 +554,7 @@ define i16 @test_pcmpgt_d(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 -1)
   ret i16 %res
@@ -561,6 +566,7 @@ define i16 @test_mask_pcmpgt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.mask.pcmpgt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
   ret i16 %res
@@ -573,6 +579,7 @@ define i8 @test_pcmpgt_q(<8 x i64> %a, <8 x i64> %b) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 -1)
   ret i8 %res
@@ -584,6 +591,7 @@ define i8 @test_mask_pcmpgt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
   ret i8 %res
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 2ed27dbff391e612eb4f31ca4183042824ee4c28..cc5e9e038e0bf4e785b2770d204d36321849c2b2 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -21,9 +21,9 @@ define i32 @test_kortestc(i16 %a0, i16 %a1) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k0
 ; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    kortestw %k0, %k1
-; CHECK-NEXT:    sbbl %eax, %eax
-; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    setb %al
 ; CHECK-NEXT:    retq
   %res = call i32 @llvm.x86.avx512.kortestc.w(i16 %a0, i16 %a1)
   ret i32 %res
@@ -33,19 +33,38 @@ declare i16 @llvm.x86.avx512.kand.w(i16, i16) nounwind readnone
 define i16 @test_kand(i16 %a0, i16 %a1) {
 ; CHECK-LABEL: test_kand:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    movw $8, %ax
-; CHECK-NEXT:    kmovw %eax, %k0
+; CHECK-NEXT:    kmovw %esi, %k0
 ; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    movw $8, %ax
+; CHECK-NEXT:    kmovw %eax, %k2
 ; CHECK-NEXT:    kandw %k0, %k1, %k0
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    kandw %k1, %k0, %k0
+; CHECK-NEXT:    kandw %k0, %k2, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %t1 = call i16 @llvm.x86.avx512.kand.w(i16 %a0, i16 8)
   %t2 = call i16 @llvm.x86.avx512.kand.w(i16 %t1, i16 %a1)
   ret i16 %t2
 }
 
+declare i16 @llvm.x86.avx512.kandn.w(i16, i16) nounwind readnone
+define i16 @test_kandn(i16 %a0, i16 %a1) {
+; CHECK-LABEL: test_kandn:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k0
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    movw $8, %ax
+; CHECK-NEXT:    kmovw %eax, %k2
+; CHECK-NEXT:    kandnw %k2, %k1, %k1
+; CHECK-NEXT:    kandnw %k0, %k1, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    retq
+  %t1 = call i16 @llvm.x86.avx512.kandn.w(i16 %a0, i16 8)
+  %t2 = call i16 @llvm.x86.avx512.kandn.w(i16 %t1, i16 %a1)
+  ret i16 %t2
+}
+
 declare i16 @llvm.x86.avx512.knot.w(i16) nounwind readnone
 define i16 @test_knot(i16 %a0) {
 ; CHECK-LABEL: test_knot:
@@ -53,11 +72,30 @@ define i16 @test_knot(i16 %a0) {
 ; CHECK-NEXT:    kmovw %edi, %k0
 ; CHECK-NEXT:    knotw %k0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.knot.w(i16 %a0)
   ret i16 %res
 }
 
+declare i16 @llvm.x86.avx512.kor.w(i16, i16) nounwind readnone
+define i16 @test_kor(i16 %a0, i16 %a1) {
+; CHECK-LABEL: test_kor:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k0
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    movw $8, %ax
+; CHECK-NEXT:    kmovw %eax, %k2
+; CHECK-NEXT:    korw %k0, %k1, %k0
+; CHECK-NEXT:    korw %k0, %k2, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    retq
+  %t1 = call i16 @llvm.x86.avx512.kor.w(i16 %a0, i16 8)
+  %t2 = call i16 @llvm.x86.avx512.kor.w(i16 %t1, i16 %a1)
+  ret i16 %t2
+}
+
 declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
 
 define i16 @unpckbw_test(i16 %a0, i16 %a1) {
@@ -67,11 +105,48 @@ define i16 @unpckbw_test(i16 %a0, i16 %a1) {
 ; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    kunpckbw %k1, %k0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
   ret i16 %res
 }
 
+declare i16 @llvm.x86.avx512.kxnor.w(i16, i16) nounwind readnone
+define i16 @test_kxnor(i16 %a0, i16 %a1) {
+; CHECK-LABEL: test_kxnor:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k0
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    movw $8, %ax
+; CHECK-NEXT:    kmovw %eax, %k2
+; CHECK-NEXT:    kxorw %k0, %k1, %k0
+; CHECK-NEXT:    kxorw %k0, %k2, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    retq
+  %t1 = call i16 @llvm.x86.avx512.kxnor.w(i16 %a0, i16 8)
+  %t2 = call i16 @llvm.x86.avx512.kxnor.w(i16 %t1, i16 %a1)
+  ret i16 %t2
+}
+
+declare i16 @llvm.x86.avx512.kxor.w(i16, i16) nounwind readnone
+define i16 @test_kxor(i16 %a0, i16 %a1) {
+; CHECK-LABEL: test_kxor:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k0
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    movw $8, %ax
+; CHECK-NEXT:    kmovw %eax, %k2
+; CHECK-NEXT:    kxorw %k0, %k1, %k0
+; CHECK-NEXT:    kxorw %k0, %k2, %k0
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; CHECK-NEXT:    retq
+  %t1 = call i16 @llvm.x86.avx512.kxor.w(i16 %a0, i16 8)
+  %t2 = call i16 @llvm.x86.avx512.kxor.w(i16 %t1, i16 %a1)
+  ret i16 %t2
+}
+
 define <16 x float> @test_rcp_ps_512(<16 x float> %a0) {
 ; CHECK-LABEL: test_rcp_ps_512:
 ; CHECK:       ## BB#0:
@@ -223,7 +298,7 @@ define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x doubl
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovaps %xmm2, %xmm3
+; CHECK-NEXT:    vmovapd %xmm2, %xmm3
 ; CHECK-NEXT:    vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT:    vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT:    vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
@@ -641,99 +716,12 @@ define <8 x double> @test_x86_vbroadcast_sd_512(i8* %a0) {
 }
 declare <8 x double> @llvm.x86.avx512.vbroadcast.sd.512(i8*) nounwind readonly
 
-define <16 x i32> @test_conflict_d(<16 x i32> %a) {
-; CHECK-LABEL: test_conflict_d:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpconflictd %zmm0, %zmm0
-; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
-  ret <16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
-
-define <8 x i64> @test_conflict_q(<8 x i64> %a) {
-; CHECK-LABEL: test_conflict_q:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpconflictq %zmm0, %zmm0
-; CHECK-NEXT:    retq
-  %res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
-  ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
-
-define <16 x i32> @test_maskz_conflict_d(<16 x i32> %a, i16 %mask) {
-; CHECK-LABEL: test_maskz_conflict_d:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpconflictd %zmm0, %zmm0 {%k1} {z}
-; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 %mask)
-  ret <16 x i32> %res
-}
-
-define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_conflict_q:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpconflictq %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
-  %res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
-  ret <8 x i64> %res
-}
-
-define <16 x i32> @test_lzcnt_d(<16 x i32> %a) {
-; CHECK-LABEL: test_lzcnt_d:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vplzcntd %zmm0, %zmm0
-; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
-  ret <16 x i32> %res
-}
-
-declare <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
-
-define <8 x i64> @test_lzcnt_q(<8 x i64> %a) {
-; CHECK-LABEL: test_lzcnt_q:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vplzcntq %zmm0, %zmm0
-; CHECK-NEXT:    retq
-  %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
-  ret <8 x i64> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
-
-
-define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
-; CHECK-LABEL: test_mask_lzcnt_d:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vplzcntd %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
-  ret <16 x i32> %res
-}
-
-define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
-; CHECK-LABEL: test_mask_lzcnt_q:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vplzcntq %zmm0, %zmm1 {%k1}
-; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
-; CHECK-NEXT:    retq
-  %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
-  ret <8 x i64> %res
-}
-
  define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
 ; CHECK-LABEL: test_cmpps:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vcmpleps {sae}, %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
    %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
    ret i16 %res
@@ -745,6 +733,7 @@ define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vcmpneqpd %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
    %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4)
    ret i8 %res
@@ -812,11 +801,12 @@ define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1, i8 %m) {
 ; CHECK-LABEL: test_vptestmq:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vptestmq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %ecx
 ; CHECK-NEXT:    vptestmq %zmm1, %zmm0, %k0
+; CHECK-NEXT:    kmovw %k0, %ecx
+; CHECK-NEXT:    vptestmq %zmm1, %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    addb %cl, %al
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
   %res1 = call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 %m)
@@ -829,9 +819,9 @@ define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1, i16 %m) {
 ; CHECK-LABEL: test_vptestmd:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vptestmd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %ecx
 ; CHECK-NEXT:    vptestmd %zmm1, %zmm0, %k0
+; CHECK-NEXT:    kmovw %k0, %ecx
+; CHECK-NEXT:    vptestmd %zmm1, %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    addl %ecx, %eax
 ; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
@@ -858,29 +848,29 @@ declare void @llvm.x86.avx512.mask.store.ss(i8*, <4 x float>, i8 )
 define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK-LABEL: test_cmp_d_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k3
-; CHECK-NEXT:    vpcmpltd %zmm1, %zmm0, %k4
-; CHECK-NEXT:    vpcmpled %zmm1, %zmm0, %k5
-; CHECK-NEXT:    vpcmpunordd %zmm1, %zmm0, %k6
-; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k7
-; CHECK-NEXT:    vpcmpnltd %zmm1, %zmm0, %k2
-; CHECK-NEXT:    vpcmpnled %zmm1, %zmm0, %k1
-; CHECK-NEXT:    vpcmpordd %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k4, %eax
-; CHECK-NEXT:    kmovw %k3, %ecx
+; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; CHECK-NEXT:    vpcmpltd %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vpcmpled %zmm1, %zmm0, %k2
+; CHECK-NEXT:    vpcmpunordd %zmm1, %zmm0, %k3
+; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k4
+; CHECK-NEXT:    vpcmpnltd %zmm1, %zmm0, %k5
+; CHECK-NEXT:    vpcmpnled %zmm1, %zmm0, %k6
+; CHECK-NEXT:    vpcmpordd %zmm1, %zmm0, %k7
+; CHECK-NEXT:    kmovw %k1, %eax
+; CHECK-NEXT:    kmovw %k0, %ecx
 ; CHECK-NEXT:    vmovd %ecx, %xmm0
 ; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k5, %eax
+; CHECK-NEXT:    kmovw %k2, %eax
 ; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k6, %eax
+; CHECK-NEXT:    kmovw %k3, %eax
 ; CHECK-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k7, %eax
+; CHECK-NEXT:    kmovw %k4, %eax
 ; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k2, %eax
+; CHECK-NEXT:    kmovw %k5, %eax
 ; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k1, %eax
+; CHECK-NEXT:    kmovw %k6, %eax
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    kmovw %k7, %eax
 ; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
@@ -905,30 +895,30 @@ define <8 x i16> @test_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
 define <8 x i16> @test_mask_cmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
 ; CHECK-LABEL: test_mask_cmp_d_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k3
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k4 {%k3}
-; CHECK-NEXT:    vpcmpltd %zmm1, %zmm0, %k5 {%k3}
-; CHECK-NEXT:    vpcmpled %zmm1, %zmm0, %k6 {%k3}
-; CHECK-NEXT:    vpcmpunordd %zmm1, %zmm0, %k7 {%k3}
-; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k0 {%k3}
-; CHECK-NEXT:    vpcmpnltd %zmm1, %zmm0, %k2 {%k3}
-; CHECK-NEXT:    vpcmpnled %zmm1, %zmm0, %k1 {%k3}
-; CHECK-NEXT:    vpcmpordd %zmm1, %zmm0, %k3 {%k3}
-; CHECK-NEXT:    kmovw %k5, %eax
-; CHECK-NEXT:    kmovw %k4, %ecx
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT:    vpcmpltd %zmm1, %zmm0, %k2 {%k1}
+; CHECK-NEXT:    vpcmpled %zmm1, %zmm0, %k3 {%k1}
+; CHECK-NEXT:    vpcmpunordd %zmm1, %zmm0, %k4 {%k1}
+; CHECK-NEXT:    vpcmpneqd %zmm1, %zmm0, %k5 {%k1}
+; CHECK-NEXT:    vpcmpnltd %zmm1, %zmm0, %k6 {%k1}
+; CHECK-NEXT:    vpcmpnled %zmm1, %zmm0, %k7 {%k1}
+; CHECK-NEXT:    vpcmpordd %zmm1, %zmm0, %k1 {%k1}
+; CHECK-NEXT:    kmovw %k2, %eax
+; CHECK-NEXT:    kmovw %k0, %ecx
 ; CHECK-NEXT:    vmovd %ecx, %xmm0
 ; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k6, %eax
+; CHECK-NEXT:    kmovw %k3, %eax
 ; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k7, %eax
+; CHECK-NEXT:    kmovw %k4, %eax
 ; CHECK-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    kmovw %k5, %eax
 ; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k2, %eax
+; CHECK-NEXT:    kmovw %k6, %eax
 ; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k1, %eax
+; CHECK-NEXT:    kmovw %k7, %eax
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k3, %eax
+; CHECK-NEXT:    kmovw %k1, %eax
 ; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res0 = call i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
@@ -955,29 +945,29 @@ declare i16 @llvm.x86.avx512.mask.cmp.d.512(<16 x i32>, <16 x i32>, i32, i16) no
 define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
 ; CHECK-LABEL: test_ucmp_d_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpequd %zmm1, %zmm0, %k3
-; CHECK-NEXT:    vpcmpltud %zmm1, %zmm0, %k4
-; CHECK-NEXT:    vpcmpleud %zmm1, %zmm0, %k5
-; CHECK-NEXT:    vpcmpunordud %zmm1, %zmm0, %k6
-; CHECK-NEXT:    vpcmpnequd %zmm1, %zmm0, %k7
-; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k2
-; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1
-; CHECK-NEXT:    vpcmpordud %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k4, %eax
-; CHECK-NEXT:    kmovw %k3, %ecx
+; CHECK-NEXT:    vpcmpequd %zmm1, %zmm0, %k0
+; CHECK-NEXT:    vpcmpltud %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vpcmpleud %zmm1, %zmm0, %k2
+; CHECK-NEXT:    vpcmpunordud %zmm1, %zmm0, %k3
+; CHECK-NEXT:    vpcmpnequd %zmm1, %zmm0, %k4
+; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k5
+; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k6
+; CHECK-NEXT:    vpcmpordud %zmm1, %zmm0, %k7
+; CHECK-NEXT:    kmovw %k1, %eax
+; CHECK-NEXT:    kmovw %k0, %ecx
 ; CHECK-NEXT:    vmovd %ecx, %xmm0
 ; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k5, %eax
+; CHECK-NEXT:    kmovw %k2, %eax
 ; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k6, %eax
+; CHECK-NEXT:    kmovw %k3, %eax
 ; CHECK-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k7, %eax
+; CHECK-NEXT:    kmovw %k4, %eax
 ; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k2, %eax
+; CHECK-NEXT:    kmovw %k5, %eax
 ; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k1, %eax
+; CHECK-NEXT:    kmovw %k6, %eax
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    kmovw %k7, %eax
 ; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 -1)
@@ -1002,30 +992,30 @@ define <8 x i16> @test_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1) {
 define <8 x i16> @test_mask_ucmp_d_512(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
 ; CHECK-LABEL: test_mask_ucmp_d_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k3
-; CHECK-NEXT:    vpcmpequd %zmm1, %zmm0, %k4 {%k3}
-; CHECK-NEXT:    vpcmpltud %zmm1, %zmm0, %k5 {%k3}
-; CHECK-NEXT:    vpcmpleud %zmm1, %zmm0, %k6 {%k3}
-; CHECK-NEXT:    vpcmpunordud %zmm1, %zmm0, %k7 {%k3}
-; CHECK-NEXT:    vpcmpnequd %zmm1, %zmm0, %k0 {%k3}
-; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k2 {%k3}
-; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k1 {%k3}
-; CHECK-NEXT:    vpcmpordud %zmm1, %zmm0, %k3 {%k3}
-; CHECK-NEXT:    kmovw %k5, %eax
-; CHECK-NEXT:    kmovw %k4, %ecx
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpcmpequd %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT:    vpcmpltud %zmm1, %zmm0, %k2 {%k1}
+; CHECK-NEXT:    vpcmpleud %zmm1, %zmm0, %k3 {%k1}
+; CHECK-NEXT:    vpcmpunordud %zmm1, %zmm0, %k4 {%k1}
+; CHECK-NEXT:    vpcmpnequd %zmm1, %zmm0, %k5 {%k1}
+; CHECK-NEXT:    vpcmpnltud %zmm1, %zmm0, %k6 {%k1}
+; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k7 {%k1}
+; CHECK-NEXT:    vpcmpordud %zmm1, %zmm0, %k1 {%k1}
+; CHECK-NEXT:    kmovw %k2, %eax
+; CHECK-NEXT:    kmovw %k0, %ecx
 ; CHECK-NEXT:    vmovd %ecx, %xmm0
 ; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k6, %eax
+; CHECK-NEXT:    kmovw %k3, %eax
 ; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k7, %eax
+; CHECK-NEXT:    kmovw %k4, %eax
 ; CHECK-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    kmovw %k5, %eax
 ; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k2, %eax
+; CHECK-NEXT:    kmovw %k6, %eax
 ; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k1, %eax
+; CHECK-NEXT:    kmovw %k7, %eax
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k3, %eax
+; CHECK-NEXT:    kmovw %k1, %eax
 ; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res0 = call i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32> %a0, <16 x i32> %a1, i32 0, i16 %mask)
@@ -1052,29 +1042,29 @@ declare i16 @llvm.x86.avx512.mask.ucmp.d.512(<16 x i32>, <16 x i32>, i32, i16) n
 define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
 ; CHECK-LABEL: test_cmp_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k3
-; CHECK-NEXT:    vpcmpltq %zmm1, %zmm0, %k4
-; CHECK-NEXT:    vpcmpleq %zmm1, %zmm0, %k5
-; CHECK-NEXT:    vpcmpunordq %zmm1, %zmm0, %k6
-; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k7
-; CHECK-NEXT:    vpcmpnltq %zmm1, %zmm0, %k2
-; CHECK-NEXT:    vpcmpnleq %zmm1, %zmm0, %k1
-; CHECK-NEXT:    vpcmpordq %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k4, %eax
-; CHECK-NEXT:    kmovw %k3, %ecx
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0
+; CHECK-NEXT:    vpcmpltq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vpcmpleq %zmm1, %zmm0, %k2
+; CHECK-NEXT:    vpcmpunordq %zmm1, %zmm0, %k3
+; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k4
+; CHECK-NEXT:    vpcmpnltq %zmm1, %zmm0, %k5
+; CHECK-NEXT:    vpcmpnleq %zmm1, %zmm0, %k6
+; CHECK-NEXT:    vpcmpordq %zmm1, %zmm0, %k7
+; CHECK-NEXT:    kmovw %k1, %eax
+; CHECK-NEXT:    kmovw %k0, %ecx
+; CHECK-NEXT:    vmovd %ecx, %xmm0
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k5, %eax
+; CHECK-NEXT:    kmovw %k2, %eax
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k6, %eax
+; CHECK-NEXT:    kmovw %k3, %eax
 ; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k7, %eax
+; CHECK-NEXT:    kmovw %k4, %eax
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k2, %eax
+; CHECK-NEXT:    kmovw %k5, %eax
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k1, %eax
+; CHECK-NEXT:    kmovw %k6, %eax
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    kmovw %k7, %eax
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
@@ -1099,30 +1089,30 @@ define <8 x i8> @test_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
 define <8 x i8> @test_mask_cmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_mask_cmp_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k3
-; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k4 {%k3}
-; CHECK-NEXT:    vpcmpltq %zmm1, %zmm0, %k5 {%k3}
-; CHECK-NEXT:    vpcmpleq %zmm1, %zmm0, %k6 {%k3}
-; CHECK-NEXT:    vpcmpunordq %zmm1, %zmm0, %k7 {%k3}
-; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k0 {%k3}
-; CHECK-NEXT:    vpcmpnltq %zmm1, %zmm0, %k2 {%k3}
-; CHECK-NEXT:    vpcmpnleq %zmm1, %zmm0, %k1 {%k3}
-; CHECK-NEXT:    vpcmpordq %zmm1, %zmm0, %k3 {%k3}
-; CHECK-NEXT:    kmovw %k5, %eax
-; CHECK-NEXT:    kmovw %k4, %ecx
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpcmpeqq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT:    vpcmpltq %zmm1, %zmm0, %k2 {%k1}
+; CHECK-NEXT:    vpcmpleq %zmm1, %zmm0, %k3 {%k1}
+; CHECK-NEXT:    vpcmpunordq %zmm1, %zmm0, %k4 {%k1}
+; CHECK-NEXT:    vpcmpneqq %zmm1, %zmm0, %k5 {%k1}
+; CHECK-NEXT:    vpcmpnltq %zmm1, %zmm0, %k6 {%k1}
+; CHECK-NEXT:    vpcmpnleq %zmm1, %zmm0, %k7 {%k1}
+; CHECK-NEXT:    vpcmpordq %zmm1, %zmm0, %k1 {%k1}
+; CHECK-NEXT:    kmovw %k2, %eax
+; CHECK-NEXT:    kmovw %k0, %ecx
+; CHECK-NEXT:    vmovd %ecx, %xmm0
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k6, %eax
+; CHECK-NEXT:    kmovw %k3, %eax
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k7, %eax
+; CHECK-NEXT:    kmovw %k4, %eax
 ; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    kmovw %k5, %eax
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k2, %eax
+; CHECK-NEXT:    kmovw %k6, %eax
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k1, %eax
+; CHECK-NEXT:    kmovw %k7, %eax
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k3, %eax
+; CHECK-NEXT:    kmovw %k1, %eax
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
@@ -1149,29 +1139,29 @@ declare i8 @llvm.x86.avx512.mask.cmp.q.512(<8 x i64>, <8 x i64>, i32, i8) nounwi
 define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
 ; CHECK-LABEL: test_ucmp_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpequq %zmm1, %zmm0, %k3
-; CHECK-NEXT:    vpcmpltuq %zmm1, %zmm0, %k4
-; CHECK-NEXT:    vpcmpleuq %zmm1, %zmm0, %k5
-; CHECK-NEXT:    vpcmpunorduq %zmm1, %zmm0, %k6
-; CHECK-NEXT:    vpcmpnequq %zmm1, %zmm0, %k7
-; CHECK-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k2
-; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1
-; CHECK-NEXT:    vpcmporduq %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k4, %eax
-; CHECK-NEXT:    kmovw %k3, %ecx
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
+; CHECK-NEXT:    vpcmpequq %zmm1, %zmm0, %k0
+; CHECK-NEXT:    vpcmpltuq %zmm1, %zmm0, %k1
+; CHECK-NEXT:    vpcmpleuq %zmm1, %zmm0, %k2
+; CHECK-NEXT:    vpcmpunorduq %zmm1, %zmm0, %k3
+; CHECK-NEXT:    vpcmpnequq %zmm1, %zmm0, %k4
+; CHECK-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k5
+; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k6
+; CHECK-NEXT:    vpcmporduq %zmm1, %zmm0, %k7
+; CHECK-NEXT:    kmovw %k1, %eax
+; CHECK-NEXT:    kmovw %k0, %ecx
+; CHECK-NEXT:    vmovd %ecx, %xmm0
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k5, %eax
+; CHECK-NEXT:    kmovw %k2, %eax
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k6, %eax
+; CHECK-NEXT:    kmovw %k3, %eax
 ; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k7, %eax
+; CHECK-NEXT:    kmovw %k4, %eax
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k2, %eax
+; CHECK-NEXT:    kmovw %k5, %eax
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k1, %eax
+; CHECK-NEXT:    kmovw %k6, %eax
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    kmovw %k7, %eax
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 -1)
@@ -1196,30 +1186,30 @@ define <8 x i8> @test_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1) {
 define <8 x i8> @test_mask_ucmp_q_512(<8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_mask_ucmp_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k3
-; CHECK-NEXT:    vpcmpequq %zmm1, %zmm0, %k4 {%k3}
-; CHECK-NEXT:    vpcmpltuq %zmm1, %zmm0, %k5 {%k3}
-; CHECK-NEXT:    vpcmpleuq %zmm1, %zmm0, %k6 {%k3}
-; CHECK-NEXT:    vpcmpunorduq %zmm1, %zmm0, %k7 {%k3}
-; CHECK-NEXT:    vpcmpnequq %zmm1, %zmm0, %k0 {%k3}
-; CHECK-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k2 {%k3}
-; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k1 {%k3}
-; CHECK-NEXT:    vpcmporduq %zmm1, %zmm0, %k3 {%k3}
-; CHECK-NEXT:    kmovw %k5, %eax
-; CHECK-NEXT:    kmovw %k4, %ecx
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpcmpequq %zmm1, %zmm0, %k0 {%k1}
+; CHECK-NEXT:    vpcmpltuq %zmm1, %zmm0, %k2 {%k1}
+; CHECK-NEXT:    vpcmpleuq %zmm1, %zmm0, %k3 {%k1}
+; CHECK-NEXT:    vpcmpunorduq %zmm1, %zmm0, %k4 {%k1}
+; CHECK-NEXT:    vpcmpnequq %zmm1, %zmm0, %k5 {%k1}
+; CHECK-NEXT:    vpcmpnltuq %zmm1, %zmm0, %k6 {%k1}
+; CHECK-NEXT:    vpcmpnleuq %zmm1, %zmm0, %k7 {%k1}
+; CHECK-NEXT:    vpcmporduq %zmm1, %zmm0, %k1 {%k1}
+; CHECK-NEXT:    kmovw %k2, %eax
+; CHECK-NEXT:    kmovw %k0, %ecx
+; CHECK-NEXT:    vmovd %ecx, %xmm0
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k6, %eax
+; CHECK-NEXT:    kmovw %k3, %eax
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k7, %eax
+; CHECK-NEXT:    kmovw %k4, %eax
 ; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    kmovw %k5, %eax
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k2, %eax
+; CHECK-NEXT:    kmovw %k6, %eax
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k1, %eax
+; CHECK-NEXT:    kmovw %k7, %eax
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
-; CHECK-NEXT:    kmovw %k3, %eax
+; CHECK-NEXT:    kmovw %k1, %eax
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.512(<8 x i64> %a0, <8 x i64> %a1, i32 0, i8 %mask)
@@ -2301,6 +2291,39 @@ define <4 x float> @test_add_ss_rn(<4 x float> %a0, <4 x float> %a1) {
   ret <4 x float> %res
 }
 
+define <4 x float> @test_mask_add_ss_current_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_add_ss_current_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vaddss (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a1.val = load float, float* %a1
+  %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+  %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+  %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+  %a1v  = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_add_ss_current_memfold(<4 x float> %a0, float* %a1, i8 %mask) {
+; CHECK-LABEL: test_maskz_add_ss_current_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vaddss (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %a1.val = load float, float* %a1
+  %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+  %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+  %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+  %a1v  = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
+  ret <4 x float> %res
+}
+
 declare <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
 
 define <2 x double> @test_mask_add_sd_rn(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
@@ -2383,6 +2406,35 @@ define <2 x double> @test_add_sd_rn(<2 x double> %a0, <2 x double> %a1) {
   ret <2 x double> %res
 }
 
+define <2 x double> @test_mask_add_sd_current_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_add_sd_current_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vaddsd (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a1.val = load double, double* %a1
+  %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+  %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_add_sd_current_memfold(<2 x double> %a0, double* %a1, i8 %mask) {
+; CHECK-LABEL: test_maskz_add_sd_current_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vaddsd (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %a1.val = load double, double* %a1
+  %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+  %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+  %res = call <2 x double> @llvm.x86.avx512.mask.add.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
+  ret <2 x double> %res
+}
+
 declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
 
 define <4 x float> @test_mask_max_ss_sae(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
@@ -2448,6 +2500,39 @@ define <4 x float> @test_max_ss(<4 x float> %a0, <4 x float> %a1) {
   %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 4)
   ret <4 x float> %res
 }
+
+define <4 x float> @test_mask_max_ss_memfold(<4 x float> %a0, float* %a1, <4 x float> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_max_ss_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vmaxss (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a1.val = load float, float* %a1
+  %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+  %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+  %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+  %a1v  = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> %a2, i8 %mask, i32 4)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_maskz_max_ss_memfold(<4 x float> %a0, float* %a1, i8 %mask) {
+; CHECK-LABEL: test_maskz_max_ss_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %a1.val = load float, float* %a1
+  %a1v0 = insertelement <4 x float> undef, float %a1.val, i32 0
+  %a1v1 = insertelement <4 x float> %a1v0, float 0.000000e+00, i32 1
+  %a1v2 = insertelement <4 x float> %a1v1, float 0.000000e+00, i32 2
+  %a1v  = insertelement <4 x float> %a1v2, float 0.000000e+00, i32 3
+  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a0, <4 x float> %a1v, <4 x float> zeroinitializer, i8 %mask, i32 4)
+  ret <4 x float> %res
+}
 declare <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
 
 define <2 x double> @test_mask_max_sd_sae(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
@@ -2514,6 +2599,35 @@ define <2 x double> @test_max_sd(<2 x double> %a0, <2 x double> %a1) {
   ret <2 x double> %res
 }
 
+define <2 x double> @test_mask_max_sd_memfold(<2 x double> %a0, double* %a1, <2 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_max_sd_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vmaxsd (%rdi), %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a1.val = load double, double* %a1
+  %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+  %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> %a2, i8 %mask, i32 4)
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_maskz_max_sd_memfold(<2 x double> %a0, double* %a1, i8 %mask) {
+; CHECK-LABEL: test_maskz_max_sd_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vmaxsd (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %a1.val = load double, double* %a1
+  %a1v0 = insertelement <2 x double> undef, double %a1.val, i32 0
+  %a1v = insertelement <2 x double> %a1v0, double 0.000000e+00, i32 1
+  %res = call <2 x double> @llvm.x86.avx512.mask.max.sd.round(<2 x double>%a0, <2 x double> %a1v, <2 x double> zeroinitializer, i8 %mask, i32 4)
+  ret <2 x double> %res
+}
+
 define <2 x double> @test_x86_avx512_cvtsi2sd64(<2 x double> %a, i64 %b) {
 ; CHECK-LABEL: test_x86_avx512_cvtsi2sd64:
 ; CHECK:       ## BB#0:
@@ -2666,9 +2780,9 @@ define <8 x double>@test_int_x86_avx512_mask_vpermi2var_pd_512(<8 x double> %x0,
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovapd %zmm1, %zmm3
-; CHECK-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm3 {%k1}
-; CHECK-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vaddpd %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm3
+; CHECK-NEXT:    vpermi2pd %zmm2, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vaddpd %zmm3, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.vpermi2var.pd.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 -1)
@@ -2683,9 +2797,9 @@ define <16 x float>@test_int_x86_avx512_mask_vpermi2var_ps_512(<16 x float> %x0,
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm3
-; CHECK-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm3 {%k1}
-; CHECK-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vaddps %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm3
+; CHECK-NEXT:    vpermi2ps %zmm2, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vaddps %zmm3, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.vpermi2var.ps.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 -1)
@@ -2700,9 +2814,9 @@ define <8 x i64>@test_int_x86_avx512_mask_vpermi2var_q_512(<8 x i64> %x0, <8 x i
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm3
-; CHECK-NEXT:    vpermi2q %zmm2, %zmm0, %zmm3 {%k1}
-; CHECK-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vpaddq %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    vpermi2q %zmm2, %zmm0, %zmm3
+; CHECK-NEXT:    vpermi2q %zmm2, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vpaddq %zmm3, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
@@ -2755,9 +2869,9 @@ define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0,
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm1, %zmm3
-; CHECK-NEXT:    vpermt2ps %zmm2, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT:    vpermt2ps %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vaddps %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    vpermt2ps %zmm2, %zmm0, %zmm3
+; CHECK-NEXT:    vpermt2ps %zmm2, %zmm0, %zmm1 {%k1} {z}
+; CHECK-NEXT:    vaddps %zmm3, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
   %res1 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 -1)
@@ -2773,9 +2887,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm3
-; CHECK-NEXT:    vpermt2q %zmm2, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vpaddq %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    vpermt2q %zmm2, %zmm0, %zmm3
+; CHECK-NEXT:    vpermt2q %zmm2, %zmm0, %zmm1 {%k1} {z}
+; CHECK-NEXT:    vpaddq %zmm3, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
@@ -2790,9 +2904,9 @@ define <16 x i32>@test_int_x86_avx512_mask_vpermt2var_d_512(<16 x i32> %x0, <16
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm3
-; CHECK-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3 {%k1}
-; CHECK-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1
-; CHECK-NEXT:    vpaddd %zmm1, %zmm3, %zmm0
+; CHECK-NEXT:    vpermt2d %zmm2, %zmm0, %zmm3
+; CHECK-NEXT:    vpermt2d %zmm2, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vpaddd %zmm3, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
@@ -2836,8 +2950,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_512(<8 x i64> %x0, <16 x i8> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovqb %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovqb %zmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovqb %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovqb %zmm0, %xmm0
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
@@ -2870,8 +2984,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_512(<8 x i64> %x0, <16 x i8>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovsqb %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovsqb %zmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovsqb %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovsqb %zmm0, %xmm0
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
@@ -2904,8 +3018,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_512(<8 x i64> %x0, <16 x i8>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovusqb %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovusqb %zmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovusqb %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovusqb %zmm0, %xmm0
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
@@ -2938,8 +3052,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_512(<8 x i64> %x0, <8 x i16> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovqw %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovqw %zmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovqw %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovqw %zmm0, %xmm0
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
@@ -2972,8 +3086,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_512(<8 x i64> %x0, <8 x i16>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovsqw %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovsqw %zmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovsqw %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovsqw %zmm0, %xmm0
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
@@ -3006,8 +3120,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_512(<8 x i64> %x0, <8 x i16>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovusqw %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovusqw %zmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovusqw %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovusqw %zmm0, %xmm0
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0
@@ -3040,8 +3154,8 @@ define <8 x i32>@test_int_x86_avx512_mask_pmov_qd_512(<8 x i64> %x0, <8 x i32> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovqd %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpmovqd %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vpmovqd %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpmovqd %zmm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
@@ -3074,8 +3188,8 @@ define <8 x i32>@test_int_x86_avx512_mask_pmovs_qd_512(<8 x i64> %x0, <8 x i32>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovsqd %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpmovsqd %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vpmovsqd %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpmovsqd %zmm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
@@ -3108,8 +3222,8 @@ define <8 x i32>@test_int_x86_avx512_mask_pmovus_qd_512(<8 x i64> %x0, <8 x i32>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovusqd %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpmovusqd %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vpmovusqd %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpmovusqd %zmm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
@@ -3142,8 +3256,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_512(<16 x i32> %x0, <16 x i8>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovdb %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovdb %zmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovdb %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovdb %zmm0, %xmm0
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
@@ -3176,8 +3290,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_512(<16 x i32> %x0, <16 x i8>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovsdb %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovsdb %zmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovsdb %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovsdb %zmm0, %xmm0
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
@@ -3210,8 +3324,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_512(<16 x i32> %x0, <16 x i8
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovusdb %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovusdb %zmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vpmovusdb %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpmovusdb %zmm0, %xmm0
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
@@ -3244,8 +3358,8 @@ define <16 x i16>@test_int_x86_avx512_mask_pmov_dw_512(<16 x i32> %x0, <16 x i16
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovdw %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpmovdw %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vpmovdw %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpmovdw %zmm0, %ymm0
 ; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
@@ -3278,8 +3392,8 @@ define <16 x i16>@test_int_x86_avx512_mask_pmovs_dw_512(<16 x i32> %x0, <16 x i1
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovsdw %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpmovsdw %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vpmovsdw %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpmovsdw %zmm0, %ymm0
 ; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
@@ -3312,8 +3426,8 @@ define <16 x i16>@test_int_x86_avx512_mask_pmovus_dw_512(<16 x i32> %x0, <16 x i
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpmovusdw %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpmovusdw %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT:    vpmovusdw %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vpmovusdw %zmm0, %ymm0
 ; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
@@ -3566,7 +3680,7 @@ define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x dou
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovaps %xmm2, %xmm3
+; CHECK-NEXT:    vmovapd %xmm2, %xmm3
 ; CHECK-NEXT:    vgetexpsd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT:    vgetexpsd %xmm1, %xmm0, %xmm4
 ; CHECK-NEXT:    vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
@@ -3700,8 +3814,8 @@ define <8 x double>@test_int_x86_avx512_mask_shuf_f64x2(<8 x double> %x0, <8 x d
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1]
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm3 {%k1} {z} = zmm0[4,5,2,3],zmm1[2,3,0,1]
+; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} = zmm0[4,5,2,3],zmm1[2,3,0,1]
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[4,5,2,3],zmm1[2,3,0,1]
 ; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    vaddpd %zmm3, %zmm0, %zmm0
@@ -3978,9 +4092,9 @@ define <16 x i32>@test_int_x86_avx512_mask_pternlog_d_512(<16 x i32> %x0, <16 x
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
-; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3 {%k1}
-; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT:    vpaddd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
@@ -3995,9 +4109,9 @@ define <16 x i32>@test_int_x86_avx512_maskz_pternlog_d_512(<16 x i32> %x0, <16 x
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
-; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3 {%k1} {z}
-; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT:    vpaddd %zmm0, %zmm3, %zmm0
+; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %x4)
   %res1 = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
@@ -4012,9 +4126,9 @@ define <8 x i64>@test_int_x86_avx512_mask_pternlog_q_512(<8 x i64> %x0, <8 x i64
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
-; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1}
-; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
+; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1)
@@ -4029,9 +4143,9 @@ define <8 x i64>@test_int_x86_avx512_maskz_pternlog_q_512(<8 x i64> %x0, <8 x i6
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
-; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3 {%k1} {z}
-; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT:    vpaddq %zmm0, %zmm3, %zmm0
+; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    vpternlogq $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 %x4)
   %res1 = call <8 x i64> @llvm.x86.avx512.maskz.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1)
@@ -4278,11 +4392,11 @@ define <16 x i32>@test_int_x86_avx512_mask_prorv_d_512(<16 x i32> %x0, <16 x i32
 ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm3
 ; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm1
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    vprorvd %zmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.prorv.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3)
@@ -4298,11 +4412,11 @@ define <8 x i64>@test_int_x86_avx512_mask_prorv_q_512(<8 x i64> %x0, <8 x i64> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm3
 ; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm2 {%k1}
-; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm3 {%k1} {z}
-; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm0
-; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm1
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    vprorvq %zmm1, %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.prorv.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
@@ -4358,11 +4472,11 @@ define <8 x double>@test_int_x86_avx512_mask_permvar_df_512(<8 x double> %x0, <8
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_df_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm3
 ; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm2 {%k1}
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm3 {%k1} {z}
-; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    vaddpd %zmm3, %zmm2, %zmm1
-; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    vpermpd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    vaddpd %zmm3, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> %x2, i8 %x3)
   %res1 = call <8 x double> @llvm.x86.avx512.mask.permvar.df.512(<8 x double> %x0, <8 x i64> %x1, <8 x double> zeroinitializer, i8 %x3)
@@ -4378,11 +4492,11 @@ define <8 x i64>@test_int_x86_avx512_mask_permvar_di_512(<8 x i64> %x0, <8 x i64
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_di_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm3
 ; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm2 {%k1}
-; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm3 {%k1} {z}
-; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    vpaddq %zmm3, %zmm2, %zmm1
-; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    vpermq %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    vpaddq %zmm3, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
@@ -4398,11 +4512,11 @@ define <16 x float>@test_int_x86_avx512_mask_permvar_sf_512(<16 x float> %x0, <1
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_sf_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm3
 ; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm2 {%k1}
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm3 {%k1} {z}
-; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    vaddps %zmm3, %zmm2, %zmm1
-; CHECK-NEXT:    vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    vpermps %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vaddps %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    vaddps %zmm3, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> %x2, i16 %x3)
   %res1 = call <16 x float> @llvm.x86.avx512.mask.permvar.sf.512(<16 x float> %x0, <16 x i32> %x1, <16 x float> zeroinitializer, i16 %x3)
@@ -4418,11 +4532,11 @@ define <16 x i32>@test_int_x86_avx512_mask_permvar_si_512(<16 x i32> %x0, <16 x
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_si_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm3
 ; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm2 {%k1}
-; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm3 {%k1} {z}
-; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0
-; CHECK-NEXT:    vpaddd %zmm3, %zmm2, %zmm1
-; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    vpermd %zmm0, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
+; CHECK-NEXT:    vpaddd %zmm3, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %x3)
   %res1 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> zeroinitializer, i16 %x3)
@@ -4555,13 +4669,13 @@ define <16 x float>@test_int_x86_avx512_maskz_fixupimm_ps_512(<16 x float> %x0,
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm3
-; CHECK-NEXT:    vfixupimmps $5, %zmm2, %zmm1, %zmm3 {%k1} {z}
+; CHECK-NEXT:    vfixupimmps $5, %zmm2, %zmm1, %zmm3
 ; CHECK-NEXT:    vmovaps %zmm0, %zmm4
-; CHECK-NEXT:    vfixupimmps $5, %zmm2, %zmm1, %zmm4
+; CHECK-NEXT:    vfixupimmps $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
 ; CHECK-NEXT:    vpxord %zmm2, %zmm2, %zmm2
 ; CHECK-NEXT:    vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT:    vaddps %zmm0, %zmm3, %zmm0
-; CHECK-NEXT:    vaddps %zmm4, %zmm0, %zmm0
+; CHECK-NEXT:    vaddps %zmm0, %zmm4, %zmm0
+; CHECK-NEXT:    vaddps %zmm3, %zmm0, %zmm0
 ; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
   %res1 = call <16 x float> @llvm.x86.avx512.maskz.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 8)
@@ -4625,9 +4739,9 @@ define i16@test_int_x86_avx512_ptestnm_d_512(<16 x i32> %x0, <16 x i32> %x1, i16
 ; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vptestnmd %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %ecx
 ; CHECK-NEXT:    vptestnmd %zmm1, %zmm0, %k0
+; CHECK-NEXT:    kmovw %k0, %ecx
+; CHECK-NEXT:    vptestnmd %zmm1, %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    addl %ecx, %eax
 ; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
@@ -4644,11 +4758,12 @@ define i8@test_int_x86_avx512_ptestnm_q_512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2
 ; CHECK-LABEL: test_int_x86_avx512_ptestnm_q_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vptestnmq %zmm1, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovw %k0, %ecx
 ; CHECK-NEXT:    vptestnmq %zmm1, %zmm0, %k0
+; CHECK-NEXT:    kmovw %k0, %ecx
+; CHECK-NEXT:    vptestnmq %zmm1, %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    addb %cl, %al
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8 %x2)
   %res1 = call i8 @llvm.x86.avx512.ptestnm.q.512(<8 x i64> %x0, <8 x i64> %x1, i8-1)
@@ -4660,8 +4775,8 @@ define <16 x i32>@test_int_x86_avx512_mask_pbroadcastd_gpr_512(i32 %x0, <16 x i3
 ; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastd_gpr_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpbroadcastd %edi, %zmm0 {%k1}
 ; CHECK-NEXT:    vpbroadcastd %edi, %zmm1 {%k1} {z}
+; CHECK-NEXT:    vpbroadcastd %edi, %zmm0 {%k1}
 ; CHECK-NEXT:    vpbroadcastd %edi, %zmm2
 ; CHECK-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    vpaddd %zmm0, %zmm1, %zmm0
@@ -4680,8 +4795,8 @@ define <8 x i64>@test_int_x86_avx512_mask_pbroadcastq_gpr_512(i64 %x0, <8 x i64>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcastq_gpr_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1}
 ; CHECK-NEXT:    vpbroadcastq %rdi, %zmm1 {%k1} {z}
+; CHECK-NEXT:    vpbroadcastq %rdi, %zmm0 {%k1}
 ; CHECK-NEXT:    vpbroadcastq %rdi, %zmm2
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
@@ -4702,11 +4817,11 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x do
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovaps %xmm0, %xmm3
+; CHECK-NEXT:    vmovapd %xmm0, %xmm3
 ; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm0, %xmm4
+; CHECK-NEXT:    vmovapd %xmm0, %xmm4
 ; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm4
-; CHECK-NEXT:    vmovaps %xmm0, %xmm5
+; CHECK-NEXT:    vmovapd %xmm0, %xmm5
 ; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1}
 ; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0
 ; CHECK-NEXT:    vaddpd %xmm3, %xmm4, %xmm1
@@ -4758,7 +4873,7 @@ define <2 x double>@test_int_x86_avx512_maskz_vfmadd_sd(<2 x double> %x0, <2 x d
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovaps %xmm0, %xmm3
+; CHECK-NEXT:    vmovapd %xmm0, %xmm3
 ; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1} {z}
 ; CHECK-NEXT:    vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vaddpd %xmm0, %xmm3, %xmm0
@@ -4790,11 +4905,11 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x d
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovaps %xmm2, %xmm3
+; CHECK-NEXT:    vmovapd %xmm2, %xmm3
 ; CHECK-NEXT:    vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm2, %xmm4
+; CHECK-NEXT:    vmovapd %xmm2, %xmm4
 ; CHECK-NEXT:    vfmadd231sd %xmm1, %xmm0, %xmm4
-; CHECK-NEXT:    vmovaps %xmm2, %xmm5
+; CHECK-NEXT:    vmovapd %xmm2, %xmm5
 ; CHECK-NEXT:    vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
 ; CHECK-NEXT:    vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2
 ; CHECK-NEXT:    vaddpd %xmm3, %xmm4, %xmm0
@@ -4839,6 +4954,110 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo
   ret <4 x float> %res6
 }
 
+define void @fmadd_ss_mask_memfold(float* %a, float* %b, i8 %c) {
+; CHECK-LABEL: fmadd_ss_mask_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    andl $1, %edx
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vfmadd132ss (%rsi), %xmm0, %xmm0 {%k1}
+; CHECK-NEXT:    vmovss %xmm0, (%rdi)
+; CHECK-NEXT:    retq
+  %a.val = load float, float* %a
+  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
+  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
+  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
+  %av  = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
+
+  %b.val = load float, float* %b
+  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
+  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
+  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
+  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
+
+  %vr = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av, i8 %c, i32 4)
+
+  %sr = extractelement <4 x float> %vr, i32 0
+  store float %sr, float* %a
+  ret void
+}
+
+define void @fmadd_ss_maskz_memfold(float* %a, float* %b, i8 %c) {
+; CHECK-LABEL: fmadd_ss_maskz_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT:    andl $1, %edx
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vfmadd132ss (%rsi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovss %xmm0, (%rdi)
+; CHECK-NEXT:    retq
+  %a.val = load float, float* %a
+  %av0 = insertelement <4 x float> undef, float %a.val, i32 0
+  %av1 = insertelement <4 x float> %av0, float 0.000000e+00, i32 1
+  %av2 = insertelement <4 x float> %av1, float 0.000000e+00, i32 2
+  %av  = insertelement <4 x float> %av2, float 0.000000e+00, i32 3
+
+  %b.val = load float, float* %b
+  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
+  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
+  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
+  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
+
+  %vr = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %av, <4 x float> %bv, <4 x float> %av, i8 %c, i32 4)
+
+  %sr = extractelement <4 x float> %vr, i32 0
+  store float %sr, float* %a
+  ret void
+}
+
+define void @fmadd_sd_mask_memfold(double* %a, double* %b, i8 %c) {
+; CHECK-LABEL: fmadd_sd_mask_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    andl $1, %edx
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vfmadd132sd (%rsi), %xmm0, %xmm0 {%k1}
+; CHECK-NEXT:    vmovlpd %xmm0, (%rdi)
+; CHECK-NEXT:    retq
+  %a.val = load double, double* %a
+  %av0 = insertelement <2 x double> undef, double %a.val, i32 0
+  %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
+
+  %b.val = load double, double* %b
+  %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
+  %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
+
+  %vr = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av, i8 %c, i32 4)
+
+  %sr = extractelement <2 x double> %vr, i32 0
+  store double %sr, double* %a
+  ret void
+}
+
+define void @fmadd_sd_maskz_memfold(double* %a, double* %b, i8 %c) {
+; CHECK-LABEL: fmadd_sd_maskz_memfold:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    andl $1, %edx
+; CHECK-NEXT:    kmovw %edx, %k1
+; CHECK-NEXT:    vfmadd132sd (%rsi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vmovlpd %xmm0, (%rdi)
+; CHECK-NEXT:    retq
+  %a.val = load double, double* %a
+  %av0 = insertelement <2 x double> undef, double %a.val, i32 0
+  %av = insertelement <2 x double> %av0, double 0.000000e+00, i32 1
+
+  %b.val = load double, double* %b
+  %bv0 = insertelement <2 x double> undef, double %b.val, i32 0
+  %bv = insertelement <2 x double> %bv0, double 0.000000e+00, i32 1
+
+  %vr = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.sd(<2 x double> %av, <2 x double> %bv, <2 x double> %av, i8 %c, i32 4)
+
+  %sr = extractelement <2 x double> %vr, i32 0
+  store double %sr, double* %a
+  ret void
+}
+
 declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
 
 define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3,i32 %x4 ){
@@ -4846,11 +5065,11 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x d
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovaps %xmm2, %xmm3
+; CHECK-NEXT:    vmovapd %xmm2, %xmm3
 ; CHECK-NEXT:    vfmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm2, %xmm4
+; CHECK-NEXT:    vmovapd %xmm2, %xmm4
 ; CHECK-NEXT:    vfmsub231sd %xmm1, %xmm0, %xmm4
-; CHECK-NEXT:    vmovaps %xmm2, %xmm5
+; CHECK-NEXT:    vmovapd %xmm2, %xmm5
 ; CHECK-NEXT:    vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
 ; CHECK-NEXT:    vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2
 ; CHECK-NEXT:    vaddpd %xmm3, %xmm4, %xmm0
@@ -4902,11 +5121,11 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovaps %xmm2, %xmm3
+; CHECK-NEXT:    vmovapd %xmm2, %xmm3
 ; CHECK-NEXT:    vfnmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm2, %xmm4
+; CHECK-NEXT:    vmovapd %xmm2, %xmm4
 ; CHECK-NEXT:    vfnmsub231sd %xmm1, %xmm0, %xmm4
-; CHECK-NEXT:    vmovaps %xmm2, %xmm5
+; CHECK-NEXT:    vmovapd %xmm2, %xmm5
 ; CHECK-NEXT:    vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
 ; CHECK-NEXT:    vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2
 ; CHECK-NEXT:    vaddpd %xmm3, %xmm4, %xmm0
diff --git a/test/CodeGen/X86/avx512-load-store.ll b/test/CodeGen/X86/avx512-load-store.ll
index fe1003e8b7392367ba7c9e269931a1fbd0b488c8..3295c66c6d420129284a10c989c20dda119d09e0 100644
--- a/test/CodeGen/X86/avx512-load-store.ll
+++ b/test/CodeGen/X86/avx512-load-store.ll
@@ -1,12 +1,22 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -O2 -mattr=avx512f -mtriple=x86_64-unknown | FileCheck %s
+; RUN: llc < %s -O2 -mattr=avx512f -mtriple=x86_64-unknown | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK64
+; RUN: llc < %s -O2 -mattr=avx512f -mtriple=i386-unknown | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK32
 
 define <4 x float> @test_mm_mask_move_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: test_mm_mask_move_ss:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovss %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT:    retq
+; CHECK64-LABEL: test_mm_mask_move_ss:
+; CHECK64:       # BB#0: # %entry
+; CHECK64-NEXT:    kmovw %edi, %k1
+; CHECK64-NEXT:    vmovss %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK64-NEXT:    retq
+;
+; CHECK32-LABEL: test_mm_mask_move_ss:
+; CHECK32:       # BB#0: # %entry
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK32-NEXT:    andl $1, %eax
+; CHECK32-NEXT:    kmovw %eax, %k1
+; CHECK32-NEXT:    vmovss %xmm2, %xmm0, %xmm0 {%k1}
+; CHECK32-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; CHECK32-NEXT:    retl
 entry:
   %0 = and i8 %__U, 1
   %tobool.i = icmp ne i8 %0, 0
@@ -18,11 +28,21 @@ entry:
 }
 
 define <4 x float> @test_mm_maskz_move_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: test_mm_maskz_move_ss:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; CHECK64-LABEL: test_mm_maskz_move_ss:
+; CHECK64:       # BB#0: # %entry
+; CHECK64-NEXT:    kmovw %edi, %k1
+; CHECK64-NEXT:    vmovss %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK64-NEXT:    retq
+;
+; CHECK32-LABEL: test_mm_maskz_move_ss:
+; CHECK32:       # BB#0: # %entry
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK32-NEXT:    andl $1, %eax
+; CHECK32-NEXT:    kmovw %eax, %k1
+; CHECK32-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; CHECK32-NEXT:    vmovss %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK32-NEXT:    vmovss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3]
+; CHECK32-NEXT:    retl
 entry:
   %0 = and i8 %__U, 1
   %tobool.i = icmp ne i8 %0, 0
@@ -33,11 +53,20 @@ entry:
 }
 
 define <2 x double> @test_mm_mask_move_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: test_mm_mask_move_sd:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovsd %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT:    retq
+; CHECK64-LABEL: test_mm_mask_move_sd:
+; CHECK64:       # BB#0: # %entry
+; CHECK64-NEXT:    kmovw %edi, %k1
+; CHECK64-NEXT:    vmovsd %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK64-NEXT:    retq
+;
+; CHECK32-LABEL: test_mm_mask_move_sd:
+; CHECK32:       # BB#0: # %entry
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK32-NEXT:    andl $1, %eax
+; CHECK32-NEXT:    kmovw %eax, %k1
+; CHECK32-NEXT:    vmovsd %xmm2, %xmm0, %xmm0 {%k1}
+; CHECK32-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; CHECK32-NEXT:    retl
 entry:
   %0 = and i8 %__U, 1
   %tobool.i = icmp ne i8 %0, 0
@@ -49,11 +78,21 @@ entry:
 }
 
 define <2 x double> @test_mm_maskz_move_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) local_unnamed_addr #0 {
-; CHECK-LABEL: test_mm_maskz_move_sd:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; CHECK64-LABEL: test_mm_maskz_move_sd:
+; CHECK64:       # BB#0: # %entry
+; CHECK64-NEXT:    kmovw %edi, %k1
+; CHECK64-NEXT:    vmovsd %xmm1, %xmm0, %xmm0 {%k1} {z}
+; CHECK64-NEXT:    retq
+;
+; CHECK32-LABEL: test_mm_maskz_move_sd:
+; CHECK32:       # BB#0: # %entry
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; CHECK32-NEXT:    andl $1, %eax
+; CHECK32-NEXT:    kmovw %eax, %k1
+; CHECK32-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
+; CHECK32-NEXT:    vmovsd %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK32-NEXT:    vmovsd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
+; CHECK32-NEXT:    retl
 entry:
   %0 = and i8 %__U, 1
   %tobool.i = icmp ne i8 %0, 0
@@ -64,11 +103,19 @@ entry:
 }
 
 define void @test_mm_mask_store_ss(float* %__W, i8 zeroext %__U, <4 x float> %__A) local_unnamed_addr #1 {
-; CHECK-LABEL: test_mm_mask_store_ss:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovss %xmm0, (%rdi) {%k1}
-; CHECK-NEXT:    retq
+; CHECK64-LABEL: test_mm_mask_store_ss:
+; CHECK64:       # BB#0: # %entry
+; CHECK64-NEXT:    kmovw %esi, %k1
+; CHECK64-NEXT:    vmovss %xmm0, (%rdi) {%k1}
+; CHECK64-NEXT:    retq
+;
+; CHECK32-LABEL: test_mm_mask_store_ss:
+; CHECK32:       # BB#0: # %entry
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    kmovw %ecx, %k1
+; CHECK32-NEXT:    vmovss %xmm0, (%eax) {%k1}
+; CHECK32-NEXT:    retl
 entry:
   %0 = bitcast float* %__W to <16 x float>*
   %shuffle.i.i = shufflevector <4 x float> %__A, <4 x float> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -80,11 +127,19 @@ entry:
 }
 
 define void @test_mm_mask_store_sd(double* %__W, i8 zeroext %__U, <2 x double> %__A) local_unnamed_addr #1 {
-; CHECK-LABEL: test_mm_mask_store_sd:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vmovsd %xmm0, (%rdi) {%k1}
-; CHECK-NEXT:    retq
+; CHECK64-LABEL: test_mm_mask_store_sd:
+; CHECK64:       # BB#0: # %entry
+; CHECK64-NEXT:    kmovw %esi, %k1
+; CHECK64-NEXT:    vmovsd %xmm0, (%rdi) {%k1}
+; CHECK64-NEXT:    retq
+;
+; CHECK32-LABEL: test_mm_mask_store_sd:
+; CHECK32:       # BB#0: # %entry
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK32-NEXT:    kmovw %ecx, %k1
+; CHECK32-NEXT:    vmovsd %xmm0, (%eax) {%k1}
+; CHECK32-NEXT:    retl
 entry:
   %0 = bitcast double* %__W to <8 x double>*
   %shuffle.i.i = shufflevector <2 x double> %__A, <2 x double> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -95,11 +150,19 @@ entry:
 }
 
 define <4 x float> @test_mm_mask_load_ss(<4 x float> %__A, i8 zeroext %__U, float* %__W) local_unnamed_addr #2 {
-; CHECK-LABEL: test_mm_mask_load_ss:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovss (%rsi), %xmm0 {%k1}
-; CHECK-NEXT:    retq
+; CHECK64-LABEL: test_mm_mask_load_ss:
+; CHECK64:       # BB#0: # %entry
+; CHECK64-NEXT:    kmovw %edi, %k1
+; CHECK64-NEXT:    vmovss (%rsi), %xmm0 {%k1}
+; CHECK64-NEXT:    retq
+;
+; CHECK32-LABEL: test_mm_mask_load_ss:
+; CHECK32:       # BB#0: # %entry
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    kmovw %ecx, %k1
+; CHECK32-NEXT:    vmovss (%eax), %xmm0 {%k1}
+; CHECK32-NEXT:    retl
 entry:
   %shuffle.i = shufflevector <4 x float> %__A, <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, <4 x i32> <i32 0, i32 4, i32 4, i32 4>
   %0 = bitcast float* %__W to <16 x float>*
@@ -113,11 +176,19 @@ entry:
 }
 
 define <2 x double> @test_mm_mask_load_sd(<2 x double> %__A, i8 zeroext %__U, double* %__W) local_unnamed_addr #2 {
-; CHECK-LABEL: test_mm_mask_load_sd:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovsd (%rsi), %xmm0 {%k1}
-; CHECK-NEXT:    retq
+; CHECK64-LABEL: test_mm_mask_load_sd:
+; CHECK64:       # BB#0: # %entry
+; CHECK64-NEXT:    kmovw %edi, %k1
+; CHECK64-NEXT:    vmovsd (%rsi), %xmm0 {%k1}
+; CHECK64-NEXT:    retq
+;
+; CHECK32-LABEL: test_mm_mask_load_sd:
+; CHECK32:       # BB#0: # %entry
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK32-NEXT:    kmovw %ecx, %k1
+; CHECK32-NEXT:    vmovsd (%eax), %xmm0 {%k1}
+; CHECK32-NEXT:    retl
 entry:
   %shuffle5.i = insertelement <2 x double> %__A, double 0.000000e+00, i32 1
   %0 = bitcast double* %__W to <8 x double>*
@@ -130,11 +201,19 @@ entry:
 }
 
 define <4 x float> @test_mm_maskz_load_ss(i8 zeroext %__U, float* %__W) local_unnamed_addr #2 {
-; CHECK-LABEL: test_mm_maskz_load_ss:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovss (%rsi), %xmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; CHECK64-LABEL: test_mm_maskz_load_ss:
+; CHECK64:       # BB#0: # %entry
+; CHECK64-NEXT:    kmovw %edi, %k1
+; CHECK64-NEXT:    vmovss (%rsi), %xmm0 {%k1} {z}
+; CHECK64-NEXT:    retq
+;
+; CHECK32-LABEL: test_mm_maskz_load_ss:
+; CHECK32:       # BB#0: # %entry
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK32-NEXT:    kmovw %ecx, %k1
+; CHECK32-NEXT:    vmovss (%eax), %xmm0 {%k1} {z}
+; CHECK32-NEXT:    retl
 entry:
   %0 = bitcast float* %__W to <16 x float>*
   %1 = and i8 %__U, 1
@@ -146,11 +225,19 @@ entry:
 }
 
 define <2 x double> @test_mm_maskz_load_sd(i8 zeroext %__U, double* %__W) local_unnamed_addr #2 {
-; CHECK-LABEL: test_mm_maskz_load_sd:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovsd (%rsi), %xmm0 {%k1} {z}
-; CHECK-NEXT:    retq
+; CHECK64-LABEL: test_mm_maskz_load_sd:
+; CHECK64:       # BB#0: # %entry
+; CHECK64-NEXT:    kmovw %edi, %k1
+; CHECK64-NEXT:    vmovsd (%rsi), %xmm0 {%k1} {z}
+; CHECK64-NEXT:    retq
+;
+; CHECK32-LABEL: test_mm_maskz_load_sd:
+; CHECK32:       # BB#0: # %entry
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movb {{[0-9]+}}(%esp), %cl
+; CHECK32-NEXT:    kmovw %ecx, %k1
+; CHECK32-NEXT:    vmovsd (%eax), %xmm0 {%k1} {z}
+; CHECK32-NEXT:    retl
 entry:
   %0 = bitcast double* %__W to <8 x double>*
   %1 = and i8 %__U, 1
diff --git a/test/CodeGen/X86/avx512-logic.ll b/test/CodeGen/X86/avx512-logic.ll
index 119e03dc19daed45f8d88c66afe522785ba51ace..7153c1ffaaa698f5535a74021b562bf4d749473a 100644
--- a/test/CodeGen/X86/avx512-logic.ll
+++ b/test/CodeGen/X86/avx512-logic.ll
@@ -299,7 +299,7 @@ define <16 x float> @masked_and_v16f32(<16 x float> %a, <16 x float> %b, <16 x f
 ;
 ; SKX-LABEL: masked_and_v16f32:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandps %zmm1, %zmm0, %zmm2 {%k1}
 ; SKX-NEXT:    vaddps %zmm2, %zmm3, %zmm0
 ; SKX-NEXT:    retq
@@ -324,7 +324,7 @@ define <16 x float> @masked_or_v16f32(<16 x float> %a, <16 x float> %b, <16 x fl
 ;
 ; SKX-LABEL: masked_or_v16f32:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandps %zmm1, %zmm0, %zmm2 {%k1}
 ; SKX-NEXT:    vaddps %zmm2, %zmm3, %zmm0
 ; SKX-NEXT:    retq
@@ -349,7 +349,7 @@ define <16 x float> @masked_xor_v16f32(<16 x float> %a, <16 x float> %b, <16 x f
 ;
 ; SKX-LABEL: masked_xor_v16f32:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandps %zmm1, %zmm0, %zmm2 {%k1}
 ; SKX-NEXT:    vaddps %zmm2, %zmm3, %zmm0
 ; SKX-NEXT:    retq
@@ -374,7 +374,7 @@ define <8 x double> @masked_and_v8f64(<8 x double> %a, <8 x double> %b, <8 x dou
 ;
 ; SKX-LABEL: masked_and_v8f64:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandpd %zmm1, %zmm0, %zmm2 {%k1}
 ; SKX-NEXT:    vaddpd %zmm2, %zmm3, %zmm0
 ; SKX-NEXT:    retq
@@ -399,7 +399,7 @@ define <8 x double> @masked_or_v8f64(<8 x double> %a, <8 x double> %b, <8 x doub
 ;
 ; SKX-LABEL: masked_or_v8f64:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandpd %zmm1, %zmm0, %zmm2 {%k1}
 ; SKX-NEXT:    vaddpd %zmm2, %zmm3, %zmm0
 ; SKX-NEXT:    retq
@@ -424,7 +424,7 @@ define <8 x double> @masked_xor_v8f64(<8 x double> %a, <8 x double> %b, <8 x dou
 ;
 ; SKX-LABEL: masked_xor_v8f64:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandpd %zmm1, %zmm0, %zmm2 {%k1}
 ; SKX-NEXT:    vaddpd %zmm2, %zmm3, %zmm0
 ; SKX-NEXT:    retq
@@ -448,7 +448,7 @@ define <8 x i64> @test_mm512_mask_and_epi32(<8 x i64> %__src, i16 zeroext %__k,
 ;
 ; SKX-LABEL: test_mm512_mask_and_epi32:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandps %zmm2, %zmm1, %zmm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -470,7 +470,7 @@ define <8 x i64> @test_mm512_mask_or_epi32(<8 x i64> %__src, i16 zeroext %__k, <
 ;
 ; SKX-LABEL: test_mm512_mask_or_epi32:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vorps %zmm2, %zmm1, %zmm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -492,7 +492,7 @@ define <8 x i64> @test_mm512_mask_xor_epi32(<8 x i64> %__src, i16 zeroext %__k,
 ;
 ; SKX-LABEL: test_mm512_mask_xor_epi32:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vxorps %zmm2, %zmm1, %zmm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -514,7 +514,7 @@ define <8 x double> @test_mm512_mask_xor_pd(<8 x double> %__W, i8 zeroext %__U,
 ;
 ; SKX-LABEL: test_mm512_mask_xor_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vxorpd %zmm2, %zmm1, %zmm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -536,7 +536,7 @@ define <8 x double> @test_mm512_maskz_xor_pd(i8 zeroext %__U, <8 x double> %__A,
 ;
 ; SKX-LABEL: test_mm512_maskz_xor_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vxorpd %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -558,7 +558,7 @@ define <16 x float> @test_mm512_mask_xor_ps(<16 x float> %__W, i16 zeroext %__U,
 ;
 ; SKX-LABEL: test_mm512_mask_xor_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vxorps %zmm2, %zmm1, %zmm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -580,7 +580,7 @@ define <16 x float> @test_mm512_maskz_xor_ps(i16 zeroext %__U, <16 x float> %__A
 ;
 ; SKX-LABEL: test_mm512_maskz_xor_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vxorps %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -602,7 +602,7 @@ define <8 x double> @test_mm512_mask_or_pd(<8 x double> %__W, i8 zeroext %__U, <
 ;
 ; SKX-LABEL: test_mm512_mask_or_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vorpd %zmm1, %zmm2, %zmm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -624,7 +624,7 @@ define <8 x double> @test_mm512_maskz_or_pd(i8 zeroext %__U, <8 x double> %__A,
 ;
 ; SKX-LABEL: test_mm512_maskz_or_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vorpd %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -646,7 +646,7 @@ define <16 x float> @test_mm512_mask_or_ps(<16 x float> %__W, i16 zeroext %__U,
 ;
 ; SKX-LABEL: test_mm512_mask_or_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vorps %zmm1, %zmm2, %zmm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -668,7 +668,7 @@ define <16 x float> @test_mm512_maskz_or_ps(i16 zeroext %__U, <16 x float> %__A,
 ;
 ; SKX-LABEL: test_mm512_maskz_or_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vorps %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -690,7 +690,7 @@ define <8 x double> @test_mm512_mask_and_pd(<8 x double> %__W, i8 zeroext %__U,
 ;
 ; SKX-LABEL: test_mm512_mask_and_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandpd %zmm1, %zmm2, %zmm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -712,7 +712,7 @@ define <8 x double> @test_mm512_maskz_and_pd(i8 zeroext %__U, <8 x double> %__A,
 ;
 ; SKX-LABEL: test_mm512_maskz_and_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandpd %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -734,7 +734,7 @@ define <16 x float> @test_mm512_mask_and_ps(<16 x float> %__W, i16 zeroext %__U,
 ;
 ; SKX-LABEL: test_mm512_mask_and_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandps %zmm1, %zmm2, %zmm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -756,7 +756,7 @@ define <16 x float> @test_mm512_maskz_and_ps(i16 zeroext %__U, <16 x float> %__A
 ;
 ; SKX-LABEL: test_mm512_maskz_and_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandps %zmm0, %zmm1, %zmm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -778,7 +778,7 @@ define <8 x double> @test_mm512_mask_andnot_pd(<8 x double> %__W, i8 zeroext %__
 ;
 ; SKX-LABEL: test_mm512_mask_andnot_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandnpd %zmm2, %zmm1, %zmm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -801,7 +801,7 @@ define <8 x double> @test_mm512_maskz_andnot_pd(i8 zeroext %__U, <8 x double> %_
 ;
 ; SKX-LABEL: test_mm512_maskz_andnot_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandnpd %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -824,7 +824,7 @@ define <16 x float> @test_mm512_mask_andnot_ps(<16 x float> %__W, i16 zeroext %_
 ;
 ; SKX-LABEL: test_mm512_mask_andnot_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandnps %zmm2, %zmm1, %zmm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -847,7 +847,7 @@ define <16 x float> @test_mm512_maskz_andnot_ps(i16 zeroext %__U, <16 x float> %
 ;
 ; SKX-LABEL: test_mm512_maskz_andnot_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandnps %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index 41fb19f38e09a727386747bba5956cd2aa3e3bdb..aec1339d653da2c100b2175032c900084df7b3b0 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -1,14 +1,42 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
 ; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -mtriple=x86_64-apple-darwin -mattr=+avx512bw  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512BW
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -mtriple=x86_64-apple-darwin -mattr=+avx512dq  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512DQ
+
 
 define i16 @mask16(i16 %x) {
-; CHECK-LABEL: mask16:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k0
-; CHECK-NEXT:    knotw %k0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; KNL-LABEL: mask16:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw %edi, %k0
+; KNL-NEXT:    knotw %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: mask16:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovd %edi, %k0
+; SKX-NEXT:    knotw %k0, %k0
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: mask16:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k0
+; AVX512BW-NEXT:    knotw %k0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask16:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw %edi, %k0
+; AVX512DQ-NEXT:    knotw %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512DQ-NEXT:    retq
   %m0 = bitcast i16 %x to <16 x i1>
   %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
   %ret = bitcast <16 x i1> %m1 to i16
@@ -16,12 +44,33 @@ define i16 @mask16(i16 %x) {
 }
 
 define i32 @mask16_zext(i16 %x) {
-; CHECK-LABEL: mask16_zext:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k0
-; CHECK-NEXT:    knotw %k0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; KNL-LABEL: mask16_zext:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw %edi, %k0
+; KNL-NEXT:    knotw %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: mask16_zext:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovd %edi, %k0
+; SKX-NEXT:    knotw %k0, %k0
+; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: mask16_zext:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k0
+; AVX512BW-NEXT:    knotw %k0, %k0
+; AVX512BW-NEXT:    kmovw %k0, %eax
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask16_zext:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw %edi, %k0
+; AVX512DQ-NEXT:    knotw %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    retq
   %m0 = bitcast i16 %x to <16 x i1>
   %m1 = xor <16 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
   %m2 = bitcast <16 x i1> %m1 to i16
@@ -35,14 +84,32 @@ define i8 @mask8(i8 %x) {
 ; KNL-NEXT:    kmovw %edi, %k0
 ; KNL-NEXT:    knotw %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: mask8:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    kmovb %edi, %k0
+; SKX-NEXT:    kmovd %edi, %k0
 ; SKX-NEXT:    knotb %k0, %k0
-; SKX-NEXT:    kmovb %k0, %eax
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: mask8:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k0
+; AVX512BW-NEXT:    knotw %k0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask8:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw %edi, %k0
+; AVX512DQ-NEXT:    knotb %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512DQ-NEXT:    retq
   %m0 = bitcast i8 %x to <8 x i1>
   %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
   %ret = bitcast <8 x i1> %m1 to i8
@@ -60,10 +127,25 @@ define i32 @mask8_zext(i8 %x) {
 ;
 ; SKX-LABEL: mask8_zext:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    kmovb %edi, %k0
+; SKX-NEXT:    kmovd %edi, %k0
 ; SKX-NEXT:    knotb %k0, %k0
 ; SKX-NEXT:    kmovb %k0, %eax
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: mask8_zext:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k0
+; AVX512BW-NEXT:    knotw %k0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movzbl %al, %eax
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask8_zext:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw %edi, %k0
+; AVX512DQ-NEXT:    knotb %k0, %k0
+; AVX512DQ-NEXT:    kmovb %k0, %eax
+; AVX512DQ-NEXT:    retq
   %m0 = bitcast i8 %x to <8 x i1>
   %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
   %m2 = bitcast <8 x i1> %m1 to i8
@@ -102,6 +184,22 @@ define void @mask8_mem(i8* %ptr) {
 ; SKX-NEXT:    knotb %k0, %k0
 ; SKX-NEXT:    kmovb %k0, (%rdi)
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: mask8_mem:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    movzbl (%rdi), %eax
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    knotw %k0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movb %al, (%rdi)
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: mask8_mem:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    knotb %k0, %k0
+; AVX512DQ-NEXT:    kmovb %k0, (%rdi)
+; AVX512DQ-NEXT:    retq
   %x = load i8, i8* %ptr, align 4
   %m0 = bitcast i8 %x to <8 x i1>
   %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
@@ -129,15 +227,49 @@ define i16 @mand16(i16 %x, i16 %y) {
 }
 
 define i16 @mand16_mem(<16 x i1>* %x, <16 x i1>* %y) {
-; CHECK-LABEL: mand16_mem:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw (%rdi), %k0
-; CHECK-NEXT:    kmovw (%rsi), %k1
-; CHECK-NEXT:    kandw %k1, %k0, %k2
-; CHECK-NEXT:    kxorw %k1, %k0, %k0
-; CHECK-NEXT:    korw %k0, %k2, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; KNL-LABEL: mand16_mem:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw (%rdi), %k0
+; KNL-NEXT:    kmovw (%rsi), %k1
+; KNL-NEXT:    kandw %k1, %k0, %k2
+; KNL-NEXT:    kxorw %k1, %k0, %k0
+; KNL-NEXT:    korw %k0, %k2, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: mand16_mem:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovw (%rdi), %k0
+; SKX-NEXT:    kmovw (%rsi), %k1
+; SKX-NEXT:    kandw %k1, %k0, %k2
+; SKX-NEXT:    kxorw %k1, %k0, %k0
+; SKX-NEXT:    korw %k0, %k2, %k0
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: mand16_mem:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-NEXT:    kmovw (%rsi), %k1
+; AVX512BW-NEXT:    kandw %k1, %k0, %k2
+; AVX512BW-NEXT:    kxorw %k1, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k2, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: mand16_mem:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    kmovw (%rsi), %k1
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k2
+; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
+; AVX512DQ-NEXT:    korw %k0, %k2, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512DQ-NEXT:    retq
   %ma = load <16 x i1>, <16 x i1>* %x
   %mb = load <16 x i1>, <16 x i1>* %y
   %mc = and <16 x i1> %ma, %mb
@@ -153,14 +285,32 @@ define i8 @shuf_test1(i16 %v) nounwind {
 ; KNL-NEXT:    kmovw %edi, %k0
 ; KNL-NEXT:    kshiftrw $8, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: shuf_test1:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    kmovw %edi, %k0
+; SKX-NEXT:    kmovd %edi, %k0
 ; SKX-NEXT:    kshiftrw $8, %k0, %k0
-; SKX-NEXT:    kmovb %k0, %eax
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: shuf_test1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k0
+; AVX512BW-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: shuf_test1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw %edi, %k0
+; AVX512DQ-NEXT:    kshiftrw $8, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512DQ-NEXT:    retq
    %v1 = bitcast i16 %v to <16 x i1>
    %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
    %mask1 = bitcast <8 x i1> %mask to i8
@@ -168,14 +318,44 @@ define i8 @shuf_test1(i16 %v) nounwind {
 }
 
 define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: zext_test1:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kshiftlw $10, %k0, %k0
-; CHECK-NEXT:    kshiftrw $15, %k0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    andl $1, %eax
-; CHECK-NEXT:    retq
+; KNL-LABEL: zext_test1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: zext_test1:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; SKX-NEXT:    kshiftlw $10, %k0, %k0
+; SKX-NEXT:    kshiftrw $15, %k0, %k0
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    andl $1, %eax
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: zext_test1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kshiftlw $10, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: zext_test1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; AVX512DQ-NEXT:    kshiftlw $10, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   %cmp_res = icmp ugt <16 x i32> %a, %b
   %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
   %res = zext i1 %cmp_res.i1 to i32
@@ -183,15 +363,48 @@ define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
 }
 
 define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: zext_test2:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kshiftlw $10, %k0, %k0
-; CHECK-NEXT:    kshiftrw $15, %k0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    andl $1, %eax
-; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
-; CHECK-NEXT:    retq
+; KNL-LABEL: zext_test2:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: zext_test2:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; SKX-NEXT:    kshiftlw $10, %k0, %k0
+; SKX-NEXT:    kshiftrw $15, %k0, %k0
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    andl $1, %eax
+; SKX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: zext_test2:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kshiftlw $10, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: zext_test2:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; AVX512DQ-NEXT:    kshiftlw $10, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   %cmp_res = icmp ugt <16 x i32> %a, %b
   %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
   %res = zext i1 %cmp_res.i1 to i16
@@ -199,15 +412,48 @@ define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
 }
 
 define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
-; CHECK-LABEL: zext_test3:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kshiftlw $10, %k0, %k0
-; CHECK-NEXT:    kshiftrw $15, %k0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    andl $1, %eax
-; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
-; CHECK-NEXT:    retq
+; KNL-LABEL: zext_test3:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; KNL-NEXT:    kshiftlw $10, %k0, %k0
+; KNL-NEXT:    kshiftrw $15, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: zext_test3:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; SKX-NEXT:    kshiftlw $10, %k0, %k0
+; SKX-NEXT:    kshiftrw $15, %k0, %k0
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    andl $1, %eax
+; SKX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: zext_test3:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kshiftlw $10, %k0, %k0
+; AVX512BW-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: zext_test3:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpcmpnleud %zmm1, %zmm0, %k0
+; AVX512DQ-NEXT:    kshiftlw $10, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   %cmp_res = icmp ugt <16 x i32> %a, %b
   %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
   %res = zext i1 %cmp_res.i1 to i8
@@ -231,6 +477,23 @@ define i8 @conv1(<8 x i1>* %R) {
 ; SKX-NEXT:    movb $-2, -{{[0-9]+}}(%rsp)
 ; SKX-NEXT:    movb $-2, %al
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: conv1:
+; AVX512BW:       ## BB#0: ## %entry
+; AVX512BW-NEXT:    kxnorw %k0, %k0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movb %al, (%rdi)
+; AVX512BW-NEXT:    movb $-2, -{{[0-9]+}}(%rsp)
+; AVX512BW-NEXT:    movb $-2, %al
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: conv1:
+; AVX512DQ:       ## BB#0: ## %entry
+; AVX512DQ-NEXT:    kxnorw %k0, %k0, %k0
+; AVX512DQ-NEXT:    kmovb %k0, (%rdi)
+; AVX512DQ-NEXT:    movb $-2, -{{[0-9]+}}(%rsp)
+; AVX512DQ-NEXT:    movb $-2, %al
+; AVX512DQ-NEXT:    retq
 entry:
   store <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1>* %R
 
@@ -257,7 +520,28 @@ define <4 x i32> @test4(<4 x i64> %x, <4 x i64> %y, <4 x i64> %x1, <4 x i64> %y1
 ; SKX-NEXT:    vpcmpgtq %ymm3, %ymm2, %k1
 ; SKX-NEXT:    kandnw %k0, %k1, %k0
 ; SKX-NEXT:    vpmovm2d %k0, %xmm0
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test4:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm1
+; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512BW-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test4:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512DQ-NEXT:    vpcmpgtq %ymm3, %ymm2, %ymm1
+; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
+; AVX512DQ-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   %x_gt_y = icmp sgt <4 x i64> %x, %y
   %x1_gt_y1 = icmp sgt <4 x i64> %x1, %y1
   %res = icmp sgt <4 x i1>%x_gt_y, %x1_gt_y1
@@ -280,6 +564,20 @@ define <2 x i64> @test5(<2 x i64> %x, <2 x i64> %y, <2 x i64> %x1, <2 x i64> %y1
 ; SKX-NEXT:    kandnw %k1, %k0, %k0
 ; SKX-NEXT:    vpmovm2q %k0, %xmm0
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test5:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm1
+; AVX512BW-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test5:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT:    vpcmpgtq %xmm3, %xmm2, %xmm1
+; AVX512DQ-NEXT:    vpcmpgtq %xmm0, %xmm1, %xmm0
+; AVX512DQ-NEXT:    retq
   %x_gt_y = icmp slt <2 x i64> %x, %y
   %x1_gt_y1 = icmp sgt <2 x i64> %x1, %y1
   %res = icmp slt <2 x i1>%x_gt_y, %x1_gt_y1
@@ -316,10 +614,34 @@ define void @test7(<8 x i1> %mask)  {
 ; SKX-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; SKX-NEXT:    vpmovw2m %xmm0, %k0
 ; SKX-NEXT:    movb $85, %al
-; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    korb %k1, %k0, %k0
 ; SKX-NEXT:    ktestb %k0, %k0
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test7:
+; AVX512BW:       ## BB#0: ## %allocas
+; AVX512BW-NEXT:    vpsllw $15, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpmovw2m %zmm0, %k0
+; AVX512BW-NEXT:    movb $85, %al
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    korw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    testb %al, %al
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test7:
+; AVX512DQ:       ## BB#0: ## %allocas
+; AVX512DQ-NEXT:    vpmovsxwq %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT:    movb $85, %al
+; AVX512DQ-NEXT:    kmovw %eax, %k1
+; AVX512DQ-NEXT:    korb %k1, %k0, %k0
+; AVX512DQ-NEXT:    ktestb %k0, %k0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
 allocas:
   %a= or <8 x i1> %mask, <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>
   %b = bitcast <8 x i1> %a to i8
@@ -356,11 +678,45 @@ define <16 x i8> @test8(<16 x i32>%a, <16 x i32>%b, i32 %a1, i32 %b1) {
 ; SKX-NEXT:  ## BB#2:
 ; SKX-NEXT:    vpcmpltud %zmm2, %zmm1, %k0
 ; SKX-NEXT:    vpmovm2b %k0, %xmm0
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ; SKX-NEXT:  LBB17_1:
 ; SKX-NEXT:    vpcmpgtd %zmm2, %zmm0, %k0
 ; SKX-NEXT:    vpmovm2b %k0, %xmm0
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test8:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; AVX512BW-NEXT:    cmpl %esi, %edi
+; AVX512BW-NEXT:    jg LBB17_1
+; AVX512BW-NEXT:  ## BB#2:
+; AVX512BW-NEXT:    vpcmpltud %zmm2, %zmm1, %k0
+; AVX512BW-NEXT:    jmp LBB17_3
+; AVX512BW-NEXT:  LBB17_1:
+; AVX512BW-NEXT:    vpcmpgtd %zmm2, %zmm0, %k0
+; AVX512BW-NEXT:  LBB17_3:
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test8:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; AVX512DQ-NEXT:    cmpl %esi, %edi
+; AVX512DQ-NEXT:    jg LBB17_1
+; AVX512DQ-NEXT:  ## BB#2:
+; AVX512DQ-NEXT:    vpcmpltud %zmm2, %zmm1, %k0
+; AVX512DQ-NEXT:    jmp LBB17_3
+; AVX512DQ-NEXT:  LBB17_1:
+; AVX512DQ-NEXT:    vpcmpgtd %zmm2, %zmm0, %k0
+; AVX512DQ-NEXT:  LBB17_3:
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   %cond = icmp sgt i32 %a1, %b1
   %cmp1 = icmp sgt <16 x i32> %a, zeroinitializer
   %cmp2 = icmp ult <16 x i32> %b, zeroinitializer
@@ -398,6 +754,39 @@ define <16 x i1> @test9(<16 x i1>%a, <16 x i1>%b, i32 %a1, i32 %b1) {
 ; SKX-NEXT:    vpmovb2m %xmm0, %k0
 ; SKX-NEXT:    vpmovm2b %k0, %xmm0
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test9:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    cmpl %esi, %edi
+; AVX512BW-NEXT:    jg LBB18_1
+; AVX512BW-NEXT:  ## BB#2:
+; AVX512BW-NEXT:    vpsllw $7, %xmm1, %xmm0
+; AVX512BW-NEXT:    jmp LBB18_3
+; AVX512BW-NEXT:  LBB18_1:
+; AVX512BW-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX512BW-NEXT:  LBB18_3:
+; AVX512BW-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test9:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    cmpl %esi, %edi
+; AVX512DQ-NEXT:    jg LBB18_1
+; AVX512DQ-NEXT:  ## BB#2:
+; AVX512DQ-NEXT:    vpmovsxbd %xmm1, %zmm0
+; AVX512DQ-NEXT:    jmp LBB18_3
+; AVX512DQ-NEXT:  LBB18_1:
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:  LBB18_3:
+; AVX512DQ-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   %mask = icmp sgt i32 %a1, %b1
   %c = select i1 %mask, <16 x i1>%a, <16 x i1>%b
   ret <16 x i1>%c
@@ -430,6 +819,24 @@ define <4 x i1> @test11(<4 x i1>%a, <4 x i1>%b, i32 %a1, i32 %b1) {
 ; SKX-NEXT:    vptestmd %xmm0, %xmm0, %k0
 ; SKX-NEXT:    vpmovm2d %k0, %xmm0
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test11:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    cmpl %esi, %edi
+; AVX512BW-NEXT:    jg LBB20_2
+; AVX512BW-NEXT:  ## BB#1:
+; AVX512BW-NEXT:    vmovaps %xmm1, %xmm0
+; AVX512BW-NEXT:  LBB20_2:
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test11:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    cmpl %esi, %edi
+; AVX512DQ-NEXT:    jg LBB20_2
+; AVX512DQ-NEXT:  ## BB#1:
+; AVX512DQ-NEXT:    vmovaps %xmm1, %xmm0
+; AVX512DQ-NEXT:  LBB20_2:
+; AVX512DQ-NEXT:    retq
   %mask = icmp sgt i32 %a1, %b1
   %c = select i1 %mask, <4 x i1>%a, <4 x i1>%b
   ret <4 x i1>%c
@@ -480,9 +887,33 @@ define <16 x i1> @test15(i32 %x, i32 %y)  {
 ; SKX-NEXT:    movw $21845, %ax ## imm = 0x5555
 ; SKX-NEXT:    movw $1, %cx
 ; SKX-NEXT:    cmovgw %ax, %cx
-; SKX-NEXT:    kmovw %ecx, %k0
+; SKX-NEXT:    kmovd %ecx, %k0
 ; SKX-NEXT:    vpmovm2b %k0, %xmm0
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test15:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    cmpl %esi, %edi
+; AVX512BW-NEXT:    movw $21845, %ax ## imm = 0x5555
+; AVX512BW-NEXT:    movw $1, %cx
+; AVX512BW-NEXT:    cmovgw %ax, %cx
+; AVX512BW-NEXT:    kmovd %ecx, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test15:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    cmpl %esi, %edi
+; AVX512DQ-NEXT:    movw $21845, %ax ## imm = 0x5555
+; AVX512DQ-NEXT:    movw $1, %cx
+; AVX512DQ-NEXT:    cmovgw %ax, %cx
+; AVX512DQ-NEXT:    kmovw %ecx, %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   %a = bitcast i16 21845 to <16 x i1>
   %b = bitcast i16 1 to <16 x i1>
   %mask = icmp sgt i32 %x, %y
@@ -509,18 +940,13 @@ define <64 x i8> @test16(i64 %x) {
 ; KNL-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
 ; KNL-NEXT:    kmovw (%rsp), %k1
 ; KNL-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
-; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; KNL-NEXT:    vpmovdb %zmm1, %xmm1
-; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; KNL-NEXT:    movl $1, %eax
-; KNL-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; KNL-NEXT:    vpsllw $7, %ymm0, %ymm0
-; KNL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; KNL-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; KNL-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
 ; KNL-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
 ; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -528,6 +954,10 @@ define <64 x i8> @test16(i64 %x) {
 ; KNL-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
 ; KNL-NEXT:    vpmovdb %zmm2, %xmm2
 ; KNL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; KNL-NEXT:    vpsllw $7, %ymm0, %ymm0
+; KNL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; KNL-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm0
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
@@ -543,11 +973,64 @@ define <64 x i8> @test16(i64 %x) {
 ; SKX-NEXT:    movl $32, %eax
 ; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
-; SKX-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; SKX-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
 ; SKX-NEXT:    vpmovb2m %zmm0, %k0
 ; SKX-NEXT:    vpmovm2b %k0, %zmm0
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test16:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k0
+; AVX512BW-NEXT:    kxnorw %k0, %k0, %k1
+; AVX512BW-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512BW-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512BW-NEXT:    vpsllq $40, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; AVX512BW-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test16:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    pushq %rbp
+; AVX512DQ-NEXT:  Lcfi0:
+; AVX512DQ-NEXT:    .cfi_def_cfa_offset 16
+; AVX512DQ-NEXT:  Lcfi1:
+; AVX512DQ-NEXT:    .cfi_offset %rbp, -16
+; AVX512DQ-NEXT:    movq %rsp, %rbp
+; AVX512DQ-NEXT:  Lcfi2:
+; AVX512DQ-NEXT:    .cfi_def_cfa_register %rbp
+; AVX512DQ-NEXT:    andq $-32, %rsp
+; AVX512DQ-NEXT:    subq $64, %rsp
+; AVX512DQ-NEXT:    movl %edi, (%rsp)
+; AVX512DQ-NEXT:    shrq $32, %rdi
+; AVX512DQ-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
+; AVX512DQ-NEXT:    kmovw (%rsp), %k0
+; AVX512DQ-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT:    movl $1, %eax
+; AVX512DQ-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    kmovw {{[0-9]+}}(%rsp), %k0
+; AVX512DQ-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm2
+; AVX512DQ-NEXT:    vpmovdb %zmm2, %xmm2
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpsllw $7, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT:    movq %rbp, %rsp
+; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    retq
   %a = bitcast i64 %x to <64 x i1>
   %b = insertelement <64 x i1>%a, i1 true, i32 5
   %c = sext <64 x i1>%b to <64 x i8>
@@ -573,20 +1056,15 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; KNL-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
 ; KNL-NEXT:    kmovw (%rsp), %k1
 ; KNL-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
-; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z}
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
-; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
 ; KNL-NEXT:    vpmovdb %zmm1, %xmm1
-; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; KNL-NEXT:    xorl %eax, %eax
 ; KNL-NEXT:    cmpl %edx, %esi
 ; KNL-NEXT:    setg %al
-; KNL-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; KNL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
-; KNL-NEXT:    vpsllw $7, %ymm0, %ymm0
-; KNL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
-; KNL-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; KNL-NEXT:    vpcmpgtb %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; KNL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; KNL-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
 ; KNL-NEXT:    kmovw {{[0-9]+}}(%rsp), %k2
 ; KNL-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z}
@@ -594,6 +1072,10 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; KNL-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2 {%k2} {z}
 ; KNL-NEXT:    vpmovdb %zmm2, %xmm2
 ; KNL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; KNL-NEXT:    vpsllw $7, %ymm0, %ymm0
+; KNL-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; KNL-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm0
 ; KNL-NEXT:    movq %rbp, %rsp
 ; KNL-NEXT:    popq %rbp
 ; KNL-NEXT:    retq
@@ -604,18 +1086,75 @@ define <64 x i8> @test17(i64 %x, i32 %y, i32 %z) {
 ; SKX-NEXT:    cmpl %edx, %esi
 ; SKX-NEXT:    setg %al
 ; SKX-NEXT:    andl $1, %eax
-; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vpmovm2b %k1, %zmm0
 ; SKX-NEXT:    vpsllq $40, %xmm0, %xmm0
 ; SKX-NEXT:    vpmovm2b %k0, %zmm1
 ; SKX-NEXT:    movl $32, %eax
 ; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vpblendmb %ymm0, %ymm1, %ymm0 {%k1}
-; SKX-NEXT:    vextracti64x4 $1, %zmm1, %ymm1
-; SKX-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; SKX-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
 ; SKX-NEXT:    vpmovb2m %zmm0, %k0
 ; SKX-NEXT:    vpmovm2b %k0, %zmm0
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test17:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k0
+; AVX512BW-NEXT:    cmpl %edx, %esi
+; AVX512BW-NEXT:    setg %al
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vpmovm2b %k1, %zmm0
+; AVX512BW-NEXT:    vpsllq $40, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255]
+; AVX512BW-NEXT:    vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX512BW-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm1[4,5,6,7]
+; AVX512BW-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test17:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    pushq %rbp
+; AVX512DQ-NEXT:  Lcfi3:
+; AVX512DQ-NEXT:    .cfi_def_cfa_offset 16
+; AVX512DQ-NEXT:  Lcfi4:
+; AVX512DQ-NEXT:    .cfi_offset %rbp, -16
+; AVX512DQ-NEXT:    movq %rsp, %rbp
+; AVX512DQ-NEXT:  Lcfi5:
+; AVX512DQ-NEXT:    .cfi_def_cfa_register %rbp
+; AVX512DQ-NEXT:    andq $-32, %rsp
+; AVX512DQ-NEXT:    subq $64, %rsp
+; AVX512DQ-NEXT:    movl %edi, (%rsp)
+; AVX512DQ-NEXT:    shrq $32, %rdi
+; AVX512DQ-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
+; AVX512DQ-NEXT:    kmovw (%rsp), %k0
+; AVX512DQ-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT:    xorl %eax, %eax
+; AVX512DQ-NEXT:    cmpl %edx, %esi
+; AVX512DQ-NEXT:    setg %al
+; AVX512DQ-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512DQ-NEXT:    kmovw {{[0-9]+}}(%rsp), %k0
+; AVX512DQ-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm1
+; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm2
+; AVX512DQ-NEXT:    vpmovdb %zmm2, %xmm2
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512DQ-NEXT:    vpsllw $7, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpxor %ymm2, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpcmpgtb %ymm0, %ymm2, %ymm0
+; AVX512DQ-NEXT:    movq %rbp, %rsp
+; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    retq
   %a = bitcast i64 %x to <64 x i1>
   %b = icmp sgt i32 %y, %z
   %c = insertelement <64 x i1>%a, i1 %b, i32 5
@@ -648,8 +1187,8 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
 ;
 ; SKX-LABEL: test18:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    kmovb %edi, %k0
-; SKX-NEXT:    kmovw %esi, %k1
+; SKX-NEXT:    kmovd %edi, %k0
+; SKX-NEXT:    kmovd %esi, %k1
 ; SKX-NEXT:    kshiftlw $7, %k1, %k2
 ; SKX-NEXT:    kshiftrw $15, %k2, %k2
 ; SKX-NEXT:    kshiftlw $6, %k1, %k1
@@ -664,7 +1203,53 @@ define <8 x i1> @test18(i8 %a, i16 %y) {
 ; SKX-NEXT:    kshiftlb $7, %k2, %k1
 ; SKX-NEXT:    korb %k1, %k0, %k0
 ; SKX-NEXT:    vpmovm2w %k0, %xmm0
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test18:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    kmovd %esi, %k2
+; AVX512BW-NEXT:    kshiftlw $7, %k2, %k0
+; AVX512BW-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512BW-NEXT:    kshiftlw $6, %k2, %k2
+; AVX512BW-NEXT:    kshiftrw $15, %k2, %k2
+; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; AVX512BW-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
+; AVX512BW-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512BW-NEXT:    vpsllq $63, %zmm2, %zmm0
+; AVX512BW-NEXT:    vptestmq %zmm0, %zmm0, %k1
+; AVX512BW-NEXT:    kshiftlw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftrw $1, %k1, %k1
+; AVX512BW-NEXT:    kshiftlw $7, %k0, %k0
+; AVX512BW-NEXT:    korw %k0, %k1, %k0
+; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
+; AVX512BW-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test18:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw %edi, %k0
+; AVX512DQ-NEXT:    kmovw %esi, %k1
+; AVX512DQ-NEXT:    kshiftlw $7, %k1, %k2
+; AVX512DQ-NEXT:    kshiftrw $15, %k2, %k2
+; AVX512DQ-NEXT:    kshiftlw $6, %k1, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    vpmovm2q %k0, %zmm0
+; AVX512DQ-NEXT:    vpmovm2q %k1, %zmm1
+; AVX512DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,5,8,7]
+; AVX512DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
+; AVX512DQ-NEXT:    vpmovq2m %zmm2, %k0
+; AVX512DQ-NEXT:    kshiftlb $1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftrb $1, %k0, %k0
+; AVX512DQ-NEXT:    kshiftlb $7, %k2, %k1
+; AVX512DQ-NEXT:    korb %k1, %k0, %k0
+; AVX512DQ-NEXT:    vpmovm2q %k0, %zmm0
+; AVX512DQ-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   %b = bitcast i8 %a to <8 x i1>
   %b1 = bitcast i16 %y to <16 x i1>
   %el1 = extractelement <16 x i1>%b1, i32 8
@@ -693,6 +1278,26 @@ define <32 x i16> @test21(<32 x i16> %x , <32 x i1> %mask) nounwind readnone {
 ; SKX-NEXT:    vpmovb2m %ymm1, %k1
 ; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test21:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpsllw $7, %ymm1, %ymm1
+; AVX512BW-NEXT:    vpmovb2m %zmm1, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test21:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} ymm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512DQ-NEXT:    vpsllw $15, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpsraw $15, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpand %ymm0, %ymm3, %ymm0
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm2
+; AVX512DQ-NEXT:    vpmovzxbw {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero,xmm2[8],zero,xmm2[9],zero,xmm2[10],zero,xmm2[11],zero,xmm2[12],zero,xmm2[13],zero,xmm2[14],zero,xmm2[15],zero
+; AVX512DQ-NEXT:    vpsllw $15, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpsraw $15, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; AVX512DQ-NEXT:    retq
   %ret = select <32 x i1> %mask, <32 x i16> %x, <32 x i16> zeroinitializer
   ret <32 x i16> %ret
 }
@@ -713,6 +1318,25 @@ define void @test22(<4 x i1> %a, <4 x i1>* %addr) {
 ; SKX-NEXT:    vptestmd %xmm0, %xmm0, %k0
 ; SKX-NEXT:    kmovb %k0, (%rdi)
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test22:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512BW-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movb %al, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test22:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX512DQ-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT:    kmovb %k0, (%rdi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   store <4 x i1> %a, <4 x i1>* %addr
   ret void
 }
@@ -733,6 +1357,25 @@ define void @test23(<2 x i1> %a, <2 x i1>* %addr) {
 ; SKX-NEXT:    vptestmq %xmm0, %xmm0, %k0
 ; SKX-NEXT:    kmovb %k0, (%rdi)
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test23:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512BW-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movb %al, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test23:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT:    kmovb %k0, (%rdi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   store <2 x i1> %a, <2 x i1>* %addr
   ret void
 }
@@ -752,12 +1395,33 @@ define void @store_v1i1(<1 x i1> %c , <1 x i1>* %ptr) {
 ; SKX-LABEL: store_v1i1:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    andl $1, %edi
-; SKX-NEXT:    kmovw %edi, %k0
+; SKX-NEXT:    kmovd %edi, %k0
 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
 ; SKX-NEXT:    kshiftrw $15, %k1, %k1
 ; SKX-NEXT:    kxorw %k1, %k0, %k0
 ; SKX-NEXT:    kmovb %k0, (%rsi)
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: store_v1i1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    andl $1, %edi
+; AVX512BW-NEXT:    kmovd %edi, %k0
+; AVX512BW-NEXT:    kxnorw %k0, %k0, %k1
+; AVX512BW-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512BW-NEXT:    kxorw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movb %al, (%rsi)
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: store_v1i1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    andl $1, %edi
+; AVX512DQ-NEXT:    kmovw %edi, %k0
+; AVX512DQ-NEXT:    kxnorw %k0, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kmovb %k0, (%rsi)
+; AVX512DQ-NEXT:    retq
   %x = xor <1 x i1> %c, <i1 1>
   store <1 x i1> %x, <1 x i1>*  %ptr, align 4
   ret void
@@ -780,6 +1444,25 @@ define void @store_v2i1(<2 x i1> %c , <2 x i1>* %ptr) {
 ; SKX-NEXT:    knotw %k0, %k0
 ; SKX-NEXT:    kmovb %k0, (%rdi)
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: store_v2i1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512BW-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movb %al, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: store_v2i1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT:    kmovb %k0, (%rdi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   %x = xor <2 x i1> %c, <i1 1, i1 1>
   store <2 x i1> %x, <2 x i1>*  %ptr, align 4
   ret void
@@ -803,6 +1486,27 @@ define void @store_v4i1(<4 x i1> %c , <4 x i1>* %ptr) {
 ; SKX-NEXT:    knotw %k0, %k0
 ; SKX-NEXT:    kmovb %k0, (%rdi)
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: store_v4i1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX512BW-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512BW-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movb %al, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: store_v4i1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
+; AVX512DQ-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpslld $31, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT:    kmovb %k0, (%rdi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   %x = xor <4 x i1> %c, <i1 1, i1 1, i1 1, i1 1>
   store <4 x i1> %x, <4 x i1>*  %ptr, align 4
   ret void
@@ -826,6 +1530,26 @@ define void @store_v8i1(<8 x i1> %c , <8 x i1>* %ptr) {
 ; SKX-NEXT:    knotb %k0, %k0
 ; SKX-NEXT:    kmovb %k0, (%rdi)
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: store_v8i1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpsllw $15, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpmovw2m %zmm0, %k0
+; AVX512BW-NEXT:    knotw %k0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movb %al, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: store_v8i1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpmovsxwq %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT:    knotb %k0, %k0
+; AVX512DQ-NEXT:    kmovb %k0, (%rdi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   %x = xor <8 x i1> %c, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
   store <8 x i1> %x, <8 x i1>*  %ptr, align 4
   ret void
@@ -848,6 +1572,25 @@ define void @store_v16i1(<16 x i1> %c , <16 x i1>* %ptr) {
 ; SKX-NEXT:    knotw %k0, %k0
 ; SKX-NEXT:    kmovw %k0, (%rdi)
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: store_v16i1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT:    knotw %k0, %k0
+; AVX512BW-NEXT:    kmovw %k0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: store_v16i1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT:    knotw %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, (%rdi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   %x = xor <16 x i1> %c, <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>
   store <16 x i1> %x, <16 x i1>*  %ptr, align 4
   ret void
@@ -886,13 +1629,40 @@ define void @f1(i32 %c) {
 ; SKX-NEXT:    movzbl {{.*}}(%rip), %edi
 ; SKX-NEXT:    movl %edi, %eax
 ; SKX-NEXT:    andl $1, %eax
-; SKX-NEXT:    kmovw %eax, %k0
+; SKX-NEXT:    kmovd %eax, %k0
 ; SKX-NEXT:    kxnorw %k0, %k0, %k1
 ; SKX-NEXT:    kshiftrw $15, %k1, %k1
 ; SKX-NEXT:    kxorw %k1, %k0, %k0
 ; SKX-NEXT:    kmovb %k0, {{.*}}(%rip)
 ; SKX-NEXT:    xorl $1, %edi
 ; SKX-NEXT:    jmp _f2 ## TAILCALL
+;
+; AVX512BW-LABEL: f1:
+; AVX512BW:       ## BB#0: ## %entry
+; AVX512BW-NEXT:    movzbl {{.*}}(%rip), %edi
+; AVX512BW-NEXT:    movl %edi, %eax
+; AVX512BW-NEXT:    andl $1, %eax
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    kxnorw %k0, %k0, %k1
+; AVX512BW-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512BW-NEXT:    kxorw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movb %al, {{.*}}(%rip)
+; AVX512BW-NEXT:    xorl $1, %edi
+; AVX512BW-NEXT:    jmp _f2 ## TAILCALL
+;
+; AVX512DQ-LABEL: f1:
+; AVX512DQ:       ## BB#0: ## %entry
+; AVX512DQ-NEXT:    movzbl {{.*}}(%rip), %edi
+; AVX512DQ-NEXT:    movl %edi, %eax
+; AVX512DQ-NEXT:    andl $1, %eax
+; AVX512DQ-NEXT:    kmovw %eax, %k0
+; AVX512DQ-NEXT:    kxnorw %k0, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kmovb %k0, {{.*}}(%rip)
+; AVX512DQ-NEXT:    xorl $1, %edi
+; AVX512DQ-NEXT:    jmp _f2 ## TAILCALL
 entry:
   %.b1 = load i1, i1* @f1.v, align 4
   %not..b1 = xor i1 %.b1, true
@@ -929,14 +1699,8 @@ define void @store_i8_i1(i8 %x, i1 *%y) {
 define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) {
 ; KNL-LABEL: test_build_vec_v32i1:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; KNL-NEXT:    vpsllw $15, %ymm2, %ymm2
-; KNL-NEXT:    vpsraw $15, %ymm2, %ymm2
-; KNL-NEXT:    vpand %ymm0, %ymm2, %ymm0
-; KNL-NEXT:    vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; KNL-NEXT:    vpsllw $15, %ymm2, %ymm2
-; KNL-NEXT:    vpsraw $15, %ymm2, %ymm2
-; KNL-NEXT:    vpand %ymm1, %ymm2, %ymm1
+; KNL-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_build_vec_v32i1:
@@ -945,6 +1709,19 @@ define <32 x i16> @test_build_vec_v32i1(<32 x i16> %x) {
 ; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_build_vec_v32i1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    movl $1497715861, %eax ## imm = 0x59455495
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_build_vec_v32i1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX512DQ-NEXT:    retq
   %ret = select <32 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <32 x i16> %x, <32 x i16> zeroinitializer
   ret <32 x i16> %ret
 }
@@ -962,6 +1739,19 @@ define <64 x i8> @test_build_vec_v64i1(<64 x i8> %x) {
 ; SKX-NEXT:    kmovq %rax, %k1
 ; SKX-NEXT:    vmovdqu8 %zmm0, %zmm0 {%k1} {z}
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_build_vec_v64i1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    movabsq $6432645796886517060, %rax ## imm = 0x5945594549549544
+; AVX512BW-NEXT:    kmovq %rax, %k1
+; AVX512BW-NEXT:    vmovdqu8 %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_build_vec_v64i1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX512DQ-NEXT:    vandps {{.*}}(%rip), %ymm1, %ymm1
+; AVX512DQ-NEXT:    retq
   %ret = select <64 x i1> <i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 false, i1 false, i1 true, i1 true, i1 false, i1 true, i1 false>, <64 x i8> %x, <64 x i8> zeroinitializer
   ret <64 x i8> %ret
 }
@@ -993,10 +1783,47 @@ define void @ktest_1(<8 x double> %in, double * %base) {
 ; SKX-NEXT:    je LBB41_2
 ; SKX-NEXT:  ## BB#1: ## %L1
 ; SKX-NEXT:    vmovapd %zmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ; SKX-NEXT:  LBB41_2: ## %L2
 ; SKX-NEXT:    vmovapd %zmm0, 8(%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: ktest_1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vmovupd (%rdi), %zmm1
+; AVX512BW-NEXT:    vcmpltpd %zmm0, %zmm1, %k1
+; AVX512BW-NEXT:    vmovupd 8(%rdi), %zmm1 {%k1} {z}
+; AVX512BW-NEXT:    vcmpltpd %zmm1, %zmm0, %k0 {%k1}
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    testb %al, %al
+; AVX512BW-NEXT:    je LBB41_2
+; AVX512BW-NEXT:  ## BB#1: ## %L1
+; AVX512BW-NEXT:    vmovapd %zmm0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+; AVX512BW-NEXT:  LBB41_2: ## %L2
+; AVX512BW-NEXT:    vmovapd %zmm0, 8(%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: ktest_1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vmovupd (%rdi), %zmm1
+; AVX512DQ-NEXT:    vcmpltpd %zmm0, %zmm1, %k1
+; AVX512DQ-NEXT:    vmovupd 8(%rdi), %zmm1 {%k1} {z}
+; AVX512DQ-NEXT:    vcmpltpd %zmm1, %zmm0, %k0 {%k1}
+; AVX512DQ-NEXT:    ktestb %k0, %k0
+; AVX512DQ-NEXT:    je LBB41_2
+; AVX512DQ-NEXT:  ## BB#1: ## %L1
+; AVX512DQ-NEXT:    vmovapd %zmm0, (%rdi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
+; AVX512DQ-NEXT:  LBB41_2: ## %L2
+; AVX512DQ-NEXT:    vmovapd %zmm0, 8(%rdi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   %addr1 = getelementptr double, double * %base, i64 0
   %addr2 = getelementptr double, double * %base, i64 1
 
@@ -1169,10 +1996,6 @@ define void @ktest_2(<32 x float> %in, float * %base) {
 ; KNL-NEXT:    kmovw %k0, %eax
 ; KNL-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
 ; KNL-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; KNL-NEXT:    vpsllw $7, %ymm2, %ymm2
-; KNL-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm2
-; KNL-NEXT:    vpxor %ymm3, %ymm3, %ymm3
-; KNL-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm2
 ; KNL-NEXT:    vmovups 4(%rdi), %zmm3 {%k2} {z}
 ; KNL-NEXT:    vmovups 68(%rdi), %zmm4 {%k1} {z}
 ; KNL-NEXT:    vcmpltps %zmm4, %zmm1, %k0
@@ -1346,11 +2169,338 @@ define void @ktest_2(<32 x float> %in, float * %base) {
 ; SKX-NEXT:  ## BB#1: ## %L1
 ; SKX-NEXT:    vmovaps %zmm0, (%rdi)
 ; SKX-NEXT:    vmovaps %zmm1, 64(%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ; SKX-NEXT:  LBB42_2: ## %L2
 ; SKX-NEXT:    vmovaps %zmm0, 4(%rdi)
 ; SKX-NEXT:    vmovaps %zmm1, 68(%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: ktest_2:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vmovups (%rdi), %zmm2
+; AVX512BW-NEXT:    vmovups 64(%rdi), %zmm3
+; AVX512BW-NEXT:    vcmpltps %zmm0, %zmm2, %k1
+; AVX512BW-NEXT:    vcmpltps %zmm1, %zmm3, %k2
+; AVX512BW-NEXT:    kunpckwd %k1, %k2, %k0
+; AVX512BW-NEXT:    vmovups 68(%rdi), %zmm2 {%k2} {z}
+; AVX512BW-NEXT:    vmovups 4(%rdi), %zmm3 {%k1} {z}
+; AVX512BW-NEXT:    vcmpltps %zmm3, %zmm0, %k1
+; AVX512BW-NEXT:    vcmpltps %zmm2, %zmm1, %k2
+; AVX512BW-NEXT:    kunpckwd %k1, %k2, %k1
+; AVX512BW-NEXT:    kord %k1, %k0, %k0
+; AVX512BW-NEXT:    ktestd %k0, %k0
+; AVX512BW-NEXT:    je LBB42_2
+; AVX512BW-NEXT:  ## BB#1: ## %L1
+; AVX512BW-NEXT:    vmovaps %zmm0, (%rdi)
+; AVX512BW-NEXT:    vmovaps %zmm1, 64(%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+; AVX512BW-NEXT:  LBB42_2: ## %L2
+; AVX512BW-NEXT:    vmovaps %zmm0, 4(%rdi)
+; AVX512BW-NEXT:    vmovaps %zmm1, 68(%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: ktest_2:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    pushq %rbp
+; AVX512DQ-NEXT:  Lcfi6:
+; AVX512DQ-NEXT:    .cfi_def_cfa_offset 16
+; AVX512DQ-NEXT:  Lcfi7:
+; AVX512DQ-NEXT:    .cfi_offset %rbp, -16
+; AVX512DQ-NEXT:    movq %rsp, %rbp
+; AVX512DQ-NEXT:  Lcfi8:
+; AVX512DQ-NEXT:    .cfi_def_cfa_register %rbp
+; AVX512DQ-NEXT:    andq $-32, %rsp
+; AVX512DQ-NEXT:    subq $32, %rsp
+; AVX512DQ-NEXT:    vmovups (%rdi), %zmm2
+; AVX512DQ-NEXT:    vmovups 64(%rdi), %zmm3
+; AVX512DQ-NEXT:    vcmpltps %zmm1, %zmm3, %k1
+; AVX512DQ-NEXT:    kshiftlw $14, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %ecx
+; AVX512DQ-NEXT:    vmovd %ecx, %xmm3
+; AVX512DQ-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $13, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $12, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $11, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $10, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $9, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $8, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $7, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $6, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $5, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $4, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $3, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $2, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $1, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vcmpltps %zmm0, %zmm2, %k2
+; AVX512DQ-NEXT:    kshiftlw $14, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %ecx
+; AVX512DQ-NEXT:    vmovd %ecx, %xmm2
+; AVX512DQ-NEXT:    vpinsrb $1, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT:    kshiftlw $13, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $2, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT:    kshiftlw $12, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $3, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT:    kshiftlw $11, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $4, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT:    kshiftlw $10, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $5, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT:    kshiftlw $9, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $6, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT:    kshiftlw $8, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $7, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT:    kshiftlw $7, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $8, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT:    kshiftlw $6, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $9, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT:    kshiftlw $5, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT:    kshiftlw $4, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $11, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT:    kshiftlw $3, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $12, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT:    kshiftlw $2, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $13, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT:    kshiftlw $1, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $14, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT:    kshiftrw $15, %k2, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vmovups 4(%rdi), %zmm3 {%k2} {z}
+; AVX512DQ-NEXT:    vmovups 68(%rdi), %zmm4 {%k1} {z}
+; AVX512DQ-NEXT:    vcmpltps %zmm4, %zmm1, %k0
+; AVX512DQ-NEXT:    kshiftlw $14, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    kshiftlw $15, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %ecx
+; AVX512DQ-NEXT:    vmovd %ecx, %xmm4
+; AVX512DQ-NEXT:    vpinsrb $1, %eax, %xmm4, %xmm4
+; AVX512DQ-NEXT:    kshiftlw $13, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $2, %eax, %xmm4, %xmm4
+; AVX512DQ-NEXT:    kshiftlw $12, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $3, %eax, %xmm4, %xmm4
+; AVX512DQ-NEXT:    kshiftlw $11, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $4, %eax, %xmm4, %xmm4
+; AVX512DQ-NEXT:    kshiftlw $10, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $5, %eax, %xmm4, %xmm4
+; AVX512DQ-NEXT:    kshiftlw $9, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $6, %eax, %xmm4, %xmm4
+; AVX512DQ-NEXT:    kshiftlw $8, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $7, %eax, %xmm4, %xmm4
+; AVX512DQ-NEXT:    kshiftlw $7, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $8, %eax, %xmm4, %xmm4
+; AVX512DQ-NEXT:    kshiftlw $6, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $9, %eax, %xmm4, %xmm4
+; AVX512DQ-NEXT:    kshiftlw $5, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $10, %eax, %xmm4, %xmm4
+; AVX512DQ-NEXT:    kshiftlw $4, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $11, %eax, %xmm4, %xmm4
+; AVX512DQ-NEXT:    kshiftlw $3, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $12, %eax, %xmm4, %xmm4
+; AVX512DQ-NEXT:    kshiftlw $2, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $13, %eax, %xmm4, %xmm4
+; AVX512DQ-NEXT:    kshiftlw $1, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $14, %eax, %xmm4, %xmm4
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $15, %eax, %xmm4, %xmm4
+; AVX512DQ-NEXT:    vcmpltps %zmm3, %zmm0, %k0
+; AVX512DQ-NEXT:    kshiftlw $14, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    kshiftlw $15, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %ecx
+; AVX512DQ-NEXT:    vmovd %ecx, %xmm3
+; AVX512DQ-NEXT:    vpinsrb $1, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $13, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $2, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $12, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $3, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $11, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $4, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $10, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $5, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $9, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $6, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $8, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $7, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $7, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $8, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $6, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $9, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $5, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $10, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $4, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $11, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $3, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $12, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $2, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $13, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftlw $1, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $14, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $15, %eax, %xmm3, %xmm3
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm4, %ymm3, %ymm3
+; AVX512DQ-NEXT:    vpor %ymm3, %ymm2, %ymm2
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
+; AVX512DQ-NEXT:    vpmovsxbd %xmm3, %zmm3
+; AVX512DQ-NEXT:    vpslld $31, %zmm3, %zmm3
+; AVX512DQ-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; AVX512DQ-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp)
+; AVX512DQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512DQ-NEXT:    vpslld $31, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; AVX512DQ-NEXT:    kmovw %k0, (%rsp)
+; AVX512DQ-NEXT:    cmpl $0, (%rsp)
+; AVX512DQ-NEXT:    je LBB42_2
+; AVX512DQ-NEXT:  ## BB#1: ## %L1
+; AVX512DQ-NEXT:    vmovaps %zmm0, (%rdi)
+; AVX512DQ-NEXT:    vmovaps %zmm1, 64(%rdi)
+; AVX512DQ-NEXT:    jmp LBB42_3
+; AVX512DQ-NEXT:  LBB42_2: ## %L2
+; AVX512DQ-NEXT:    vmovaps %zmm0, 4(%rdi)
+; AVX512DQ-NEXT:    vmovaps %zmm1, 68(%rdi)
+; AVX512DQ-NEXT:  LBB42_3: ## %End
+; AVX512DQ-NEXT:    movq %rbp, %rsp
+; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   %addr1 = getelementptr float, float * %base, i64 0
   %addr2 = getelementptr float, float * %base, i64 1
 
@@ -1391,6 +2541,19 @@ define <8 x i64> @load_8i1(<8 x i1>* %a) {
 ; SKX-NEXT:    kmovb (%rdi), %k0
 ; SKX-NEXT:    vpmovm2q %k0, %zmm0
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: load_8i1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    movzbl (%rdi), %eax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: load_8i1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2q %k0, %zmm0
+; AVX512DQ-NEXT:    retq
   %b = load <8 x i1>, <8 x i1>* %a
   %c = sext <8 x i1> %b to <8 x i64>
   ret <8 x i64> %c
@@ -1408,6 +2571,18 @@ define <16 x i32> @load_16i1(<16 x i1>* %a) {
 ; SKX-NEXT:    kmovw (%rdi), %k0
 ; SKX-NEXT:    vpmovm2d %k0, %zmm0
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: load_16i1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovw (%rdi), %k1
+; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: load_16i1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    retq
   %b = load <16 x i1>, <16 x i1>* %a
   %c = sext <16 x i1> %b to <16 x i32>
   ret <16 x i32> %c
@@ -1427,6 +2602,23 @@ define <2 x i16> @load_2i1(<2 x i1>* %a) {
 ; SKX-NEXT:    kmovb (%rdi), %k0
 ; SKX-NEXT:    vpmovm2q %k0, %xmm0
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: load_2i1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    movzbl (%rdi), %eax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: load_2i1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2q %k0, %zmm0
+; AVX512DQ-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   %b = load <2 x i1>, <2 x i1>* %a
   %c = sext <2 x i1> %b to <2 x i16>
   ret <2 x i16> %c
@@ -1447,6 +2639,24 @@ define <4 x i16> @load_4i1(<4 x i1>* %a) {
 ; SKX-NEXT:    kmovb (%rdi), %k0
 ; SKX-NEXT:    vpmovm2d %k0, %xmm0
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: load_4i1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    movzbl (%rdi), %eax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
+; AVX512BW-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: load_4i1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovb (%rdi), %k0
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   %b = load <4 x i1>, <4 x i1>* %a
   %c = sext <4 x i1> %b to <4 x i16>
   ret <4 x i16> %c
@@ -1468,6 +2678,22 @@ define <32 x i16> @load_32i1(<32 x i1>* %a) {
 ; SKX-NEXT:    kmovd (%rdi), %k0
 ; SKX-NEXT:    vpmovm2w %k0, %zmm0
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: load_32i1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd (%rdi), %k0
+; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: load_32i1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k1
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm1
+; AVX512DQ-NEXT:    vpmovdw %zmm1, %ymm1
+; AVX512DQ-NEXT:    retq
   %b = load <32 x i1>, <32 x i1>* %a
   %c = sext <32 x i1> %b to <32 x i16>
   ret <32 x i16> %c
@@ -1497,6 +2723,30 @@ define <64 x i8> @load_64i1(<64 x i1>* %a) {
 ; SKX-NEXT:    kmovq (%rdi), %k0
 ; SKX-NEXT:    vpmovm2b %k0, %zmm0
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: load_64i1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq (%rdi), %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: load_64i1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw (%rdi), %k0
+; AVX512DQ-NEXT:    kmovw 2(%rdi), %k1
+; AVX512DQ-NEXT:    kmovw 4(%rdi), %k2
+; AVX512DQ-NEXT:    kmovw 6(%rdi), %k3
+; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vpmovm2d %k1, %zmm1
+; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512DQ-NEXT:    vpmovm2d %k2, %zmm1
+; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovm2d %k3, %zmm2
+; AVX512DQ-NEXT:    vpmovdb %zmm2, %xmm2
+; AVX512DQ-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
+; AVX512DQ-NEXT:    retq
   %b = load <64 x i1>, <64 x i1>* %a
   %c = sext <64 x i1> %b to <64 x i8>
   ret <64 x i8> %c
@@ -1518,6 +2768,24 @@ define void @store_8i1(<8 x i1>* %a, <8 x i1> %v) {
 ; SKX-NEXT:    vpmovw2m %xmm0, %k0
 ; SKX-NEXT:    kmovb %k0, (%rdi)
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: store_8i1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpsllw $15, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpmovw2m %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movb %al, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: store_8i1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpmovsxwq %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT:    kmovb %k0, (%rdi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   store <8 x i1> %v, <8 x i1>* %a
   ret void
 }
@@ -1538,6 +2806,24 @@ define void @store_8i1_1(<8 x i1>* %a, <8 x i16> %v) {
 ; SKX-NEXT:    vpmovw2m %xmm0, %k0
 ; SKX-NEXT:    kmovb %k0, (%rdi)
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: store_8i1_1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpsllw $15, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpmovw2m %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movb %al, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: store_8i1_1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpmovsxwq %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpsllq $63, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT:    kmovb %k0, (%rdi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   %v1 = trunc <8 x i16> %v to <8 x i1>
   store <8 x i1> %v1, <8 x i1>* %a
   ret void
@@ -1558,6 +2844,23 @@ define void @store_16i1(<16 x i1>* %a, <16 x i1> %v) {
 ; SKX-NEXT:    vpmovb2m %xmm0, %k0
 ; SKX-NEXT:    kmovw %k0, (%rdi)
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: store_16i1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpsllw $7, %xmm0, %xmm0
+; AVX512BW-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT:    kmovw %k0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: store_16i1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, (%rdi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   store <16 x i1> %v, <16 x i1>* %a
   ret void
 }
@@ -1581,7 +2884,30 @@ define void @store_32i1(<32 x i1>* %a, <32 x i1> %v) {
 ; SKX-NEXT:    vpsllw $7, %ymm0, %ymm0
 ; SKX-NEXT:    vpmovb2m %ymm0, %k0
 ; SKX-NEXT:    kmovd %k0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: store_32i1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpsllw $7, %ymm0, %ymm0
+; AVX512BW-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: store_32i1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX512DQ-NEXT:    vpmovsxbd %xmm1, %zmm1
+; AVX512DQ-NEXT:    vpslld $31, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 2(%rdi)
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, (%rdi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   store <32 x i1> %v, <32 x i1>* %a
   ret void
 }
@@ -1608,7 +2934,33 @@ define void @store_32i1_1(<32 x i1>* %a, <32 x i16> %v) {
 ; SKX-NEXT:    vpsllw $15, %zmm0, %zmm0
 ; SKX-NEXT:    vpmovw2m %zmm0, %k0
 ; SKX-NEXT:    kmovd %k0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: store_32i1_1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpsllw $15, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovw2m %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: store_32i1_1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
+; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vpmovsxwd %ymm1, %zmm1
+; AVX512DQ-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovsxbd %xmm1, %zmm1
+; AVX512DQ-NEXT:    vpslld $31, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 2(%rdi)
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, (%rdi)
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   %v1 = trunc <32 x i16> %v to <32 x i1>
   store <32 x i1> %v1, <32 x i1>* %a
   ret void
@@ -1942,7 +3294,337 @@ define void @store_64i1(<64 x i1>* %a, <64 x i1> %v) {
 ; SKX-NEXT:    vpsllw $7, %zmm0, %zmm0
 ; SKX-NEXT:    vpmovb2m %zmm0, %k0
 ; SKX-NEXT:    kmovq %k0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: store_64i1:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpsllw $7, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovb2m %zmm0, %k0
+; AVX512BW-NEXT:    kmovq %k0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: store_64i1:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    pushq %rbp
+; AVX512DQ-NEXT:  Lcfi9:
+; AVX512DQ-NEXT:    .cfi_def_cfa_offset 16
+; AVX512DQ-NEXT:    pushq %r15
+; AVX512DQ-NEXT:  Lcfi10:
+; AVX512DQ-NEXT:    .cfi_def_cfa_offset 24
+; AVX512DQ-NEXT:    pushq %r14
+; AVX512DQ-NEXT:  Lcfi11:
+; AVX512DQ-NEXT:    .cfi_def_cfa_offset 32
+; AVX512DQ-NEXT:    pushq %r13
+; AVX512DQ-NEXT:  Lcfi12:
+; AVX512DQ-NEXT:    .cfi_def_cfa_offset 40
+; AVX512DQ-NEXT:    pushq %r12
+; AVX512DQ-NEXT:  Lcfi13:
+; AVX512DQ-NEXT:    .cfi_def_cfa_offset 48
+; AVX512DQ-NEXT:    pushq %rbx
+; AVX512DQ-NEXT:  Lcfi14:
+; AVX512DQ-NEXT:    .cfi_def_cfa_offset 56
+; AVX512DQ-NEXT:  Lcfi15:
+; AVX512DQ-NEXT:    .cfi_offset %rbx, -56
+; AVX512DQ-NEXT:  Lcfi16:
+; AVX512DQ-NEXT:    .cfi_offset %r12, -48
+; AVX512DQ-NEXT:  Lcfi17:
+; AVX512DQ-NEXT:    .cfi_offset %r13, -40
+; AVX512DQ-NEXT:  Lcfi18:
+; AVX512DQ-NEXT:    .cfi_offset %r14, -32
+; AVX512DQ-NEXT:  Lcfi19:
+; AVX512DQ-NEXT:    .cfi_offset %r15, -24
+; AVX512DQ-NEXT:  Lcfi20:
+; AVX512DQ-NEXT:    .cfi_offset %rbp, -16
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vpmovsxbd %xmm1, %zmm1
+; AVX512DQ-NEXT:    vpslld $31, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512DQ-NEXT:    vpslld $31, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vpmovsxbd %xmm3, %zmm3
+; AVX512DQ-NEXT:    vpslld $31, %zmm3, %zmm3
+; AVX512DQ-NEXT:    vptestmd %zmm3, %zmm3, %k0
+; AVX512DQ-NEXT:    kshiftlw $14, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r8d
+; AVX512DQ-NEXT:    kshiftlw $15, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r9d
+; AVX512DQ-NEXT:    kshiftlw $13, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r10d
+; AVX512DQ-NEXT:    kshiftlw $12, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r11d
+; AVX512DQ-NEXT:    kshiftlw $11, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r14d
+; AVX512DQ-NEXT:    kshiftlw $10, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r15d
+; AVX512DQ-NEXT:    kshiftlw $9, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r12d
+; AVX512DQ-NEXT:    kshiftlw $8, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r13d
+; AVX512DQ-NEXT:    kshiftlw $7, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %ebx
+; AVX512DQ-NEXT:    kshiftlw $6, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %ebp
+; AVX512DQ-NEXT:    kshiftlw $5, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    kshiftlw $4, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %ecx
+; AVX512DQ-NEXT:    kshiftlw $3, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %edx
+; AVX512DQ-NEXT:    kshiftlw $2, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %esi
+; AVX512DQ-NEXT:    kshiftlw $1, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    vmovd %r9d, %xmm3
+; AVX512DQ-NEXT:    kmovw %k1, %r9d
+; AVX512DQ-NEXT:    vptestmd %zmm2, %zmm2, %k2
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    vpinsrb $1, %r8d, %xmm3, %xmm2
+; AVX512DQ-NEXT:    vpinsrb $2, %r10d, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpinsrb $3, %r11d, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpinsrb $4, %r14d, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpinsrb $5, %r15d, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpinsrb $6, %r12d, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpinsrb $7, %r13d, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpinsrb $8, %ebx, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpinsrb $9, %ebp, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpinsrb $10, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpinsrb $11, %ecx, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpinsrb $12, %edx, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpinsrb $13, %esi, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpinsrb $14, %r9d, %xmm2, %xmm2
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $15, %eax, %xmm2, %xmm2
+; AVX512DQ-NEXT:    vpmovsxbd %xmm2, %zmm2
+; AVX512DQ-NEXT:    vpslld $31, %zmm2, %zmm2
+; AVX512DQ-NEXT:    vptestmd %zmm2, %zmm2, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 6(%rdi)
+; AVX512DQ-NEXT:    kshiftlw $14, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %r8d
+; AVX512DQ-NEXT:    kshiftlw $15, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %r10d
+; AVX512DQ-NEXT:    kshiftlw $13, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %r9d
+; AVX512DQ-NEXT:    kshiftlw $12, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %r11d
+; AVX512DQ-NEXT:    kshiftlw $11, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %r14d
+; AVX512DQ-NEXT:    kshiftlw $10, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %r15d
+; AVX512DQ-NEXT:    kshiftlw $9, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %r12d
+; AVX512DQ-NEXT:    kshiftlw $8, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %r13d
+; AVX512DQ-NEXT:    kshiftlw $7, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %ecx
+; AVX512DQ-NEXT:    kshiftlw $6, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %esi
+; AVX512DQ-NEXT:    kshiftlw $5, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %ebp
+; AVX512DQ-NEXT:    kshiftlw $4, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %ebx
+; AVX512DQ-NEXT:    kshiftlw $3, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    kshiftlw $2, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %edx
+; AVX512DQ-NEXT:    kshiftlw $1, %k2, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    vmovd %r10d, %xmm2
+; AVX512DQ-NEXT:    kmovw %k0, %r10d
+; AVX512DQ-NEXT:    vptestmd %zmm1, %zmm1, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k2, %k0
+; AVX512DQ-NEXT:    vpinsrb $1, %r8d, %xmm2, %xmm1
+; AVX512DQ-NEXT:    vpinsrb $2, %r9d, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpinsrb $3, %r11d, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpinsrb $4, %r14d, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpinsrb $5, %r15d, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpinsrb $6, %r12d, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpinsrb $7, %r13d, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpinsrb $8, %ecx, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpinsrb $9, %esi, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpinsrb $10, %ebp, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpinsrb $11, %ebx, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpinsrb $13, %edx, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpinsrb $14, %r10d, %xmm1, %xmm1
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX512DQ-NEXT:    vpmovsxbd %xmm1, %zmm1
+; AVX512DQ-NEXT:    vpslld $31, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vptestmd %zmm1, %zmm1, %k0
+; AVX512DQ-NEXT:    kmovw %k0, 4(%rdi)
+; AVX512DQ-NEXT:    kshiftlw $14, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %r8d
+; AVX512DQ-NEXT:    kshiftlw $15, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %r10d
+; AVX512DQ-NEXT:    kshiftlw $13, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %r9d
+; AVX512DQ-NEXT:    kshiftlw $12, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %r11d
+; AVX512DQ-NEXT:    kshiftlw $11, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %r14d
+; AVX512DQ-NEXT:    kshiftlw $10, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %r15d
+; AVX512DQ-NEXT:    kshiftlw $9, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %r12d
+; AVX512DQ-NEXT:    kshiftlw $8, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %r13d
+; AVX512DQ-NEXT:    kshiftlw $7, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %ecx
+; AVX512DQ-NEXT:    kshiftlw $6, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %esi
+; AVX512DQ-NEXT:    kshiftlw $5, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %ebp
+; AVX512DQ-NEXT:    kshiftlw $4, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %ebx
+; AVX512DQ-NEXT:    kshiftlw $3, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    kshiftlw $2, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %edx
+; AVX512DQ-NEXT:    kshiftlw $1, %k1, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    vmovd %r10d, %xmm1
+; AVX512DQ-NEXT:    kmovw %k0, %r10d
+; AVX512DQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    vpinsrb $1, %r8d, %xmm1, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $2, %r9d, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $9, %esi, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $10, %ebp, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $13, %edx, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $14, %r10d, %xmm0, %xmm0
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vptestmd %zmm0, %zmm0, %k1
+; AVX512DQ-NEXT:    kmovw %k1, 2(%rdi)
+; AVX512DQ-NEXT:    kshiftlw $14, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r8d
+; AVX512DQ-NEXT:    kshiftlw $15, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r9d
+; AVX512DQ-NEXT:    kshiftlw $13, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r10d
+; AVX512DQ-NEXT:    kshiftlw $12, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r11d
+; AVX512DQ-NEXT:    kshiftlw $11, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r14d
+; AVX512DQ-NEXT:    kshiftlw $10, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r15d
+; AVX512DQ-NEXT:    kshiftlw $9, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r12d
+; AVX512DQ-NEXT:    kshiftlw $8, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %r13d
+; AVX512DQ-NEXT:    kshiftlw $7, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %edx
+; AVX512DQ-NEXT:    kshiftlw $6, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %esi
+; AVX512DQ-NEXT:    kshiftlw $5, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %ebp
+; AVX512DQ-NEXT:    kshiftlw $4, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %ebx
+; AVX512DQ-NEXT:    kshiftlw $3, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %eax
+; AVX512DQ-NEXT:    kshiftlw $2, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    kmovw %k1, %ecx
+; AVX512DQ-NEXT:    kshiftlw $1, %k0, %k1
+; AVX512DQ-NEXT:    kshiftrw $15, %k1, %k1
+; AVX512DQ-NEXT:    vmovd %r9d, %xmm0
+; AVX512DQ-NEXT:    kmovw %k1, %r9d
+; AVX512DQ-NEXT:    vpinsrb $1, %r8d, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $2, %r10d, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $3, %r11d, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $8, %edx, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $9, %esi, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $10, %ebp, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $11, %ebx, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0
+; AVX512DQ-NEXT:    kshiftrw $15, %k0, %k0
+; AVX512DQ-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpinsrb $14, %r9d, %xmm0, %xmm0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX512DQ-NEXT:    vpmovsxbd %xmm0, %zmm0
+; AVX512DQ-NEXT:    vpslld $31, %zmm0, %zmm0
+; AVX512DQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, (%rdi)
+; AVX512DQ-NEXT:    popq %rbx
+; AVX512DQ-NEXT:    popq %r12
+; AVX512DQ-NEXT:    popq %r13
+; AVX512DQ-NEXT:    popq %r14
+; AVX512DQ-NEXT:    popq %r15
+; AVX512DQ-NEXT:    popq %rbp
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
   store <64 x i1> %v, <64 x i1>* %a
   ret void
 }
@@ -1963,7 +3645,27 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
 ; SKX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
 ; SKX-NEXT:    kmovb %k0, %eax
 ; SKX-NEXT:    addl %eax, %eax
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_bitcast_v8i1_zext:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    movzbl %al, %eax
+; AVX512BW-NEXT:    addl %eax, %eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_bitcast_v8i1_zext:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512DQ-NEXT:    kmovb %k0, %eax
+; AVX512DQ-NEXT:    addl %eax, %eax
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
    %v1 = icmp eq <16 x i32> %a, zeroinitializer
    %mask = shufflevector <16 x i1> %v1, <16 x i1> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
    %mask1 = bitcast <8 x i1> %mask to i8
@@ -1973,13 +3675,40 @@ define i32 @test_bitcast_v8i1_zext(<16 x i32> %a) {
 }
 
 define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) {
-; CHECK-LABEL: test_bitcast_v16i1_zext:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; CHECK-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    addl %eax, %eax
-; CHECK-NEXT:    retq
+; KNL-LABEL: test_bitcast_v16i1_zext:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; KNL-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    addl %eax, %eax
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_bitcast_v16i1_zext:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; SKX-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; SKX-NEXT:    kmovw %k0, %eax
+; SKX-NEXT:    addl %eax, %eax
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_bitcast_v16i1_zext:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; AVX512BW-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    kmovw %k0, %eax
+; AVX512BW-NEXT:    addl %eax, %eax
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_bitcast_v16i1_zext:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; AVX512DQ-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    addl %eax, %eax
+; AVX512DQ-NEXT:    vzeroupper
+; AVX512DQ-NEXT:    retq
    %v1 = icmp eq <16 x i32> %a, zeroinitializer
    %mask1 = bitcast <16 x i1> %v1 to i16
    %val = zext i16 %mask1 to i32
@@ -1988,13 +3717,41 @@ define i32 @test_bitcast_v16i1_zext(<16 x i32> %a) {
 }
 
 define i16 @test_v16i1_add(i16 %x, i16 %y) {
-; CHECK-LABEL: test_v16i1_add:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k0
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    kxorw %k1, %k0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; KNL-LABEL: test_v16i1_add:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw %edi, %k0
+; KNL-NEXT:    kmovw %esi, %k1
+; KNL-NEXT:    kxorw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_v16i1_add:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovd %edi, %k0
+; SKX-NEXT:    kmovd %esi, %k1
+; SKX-NEXT:    kxorw %k1, %k0, %k0
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v16i1_add:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k0
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    kxorw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_v16i1_add:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw %edi, %k0
+; AVX512DQ-NEXT:    kmovw %esi, %k1
+; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512DQ-NEXT:    retq
   %m0 = bitcast i16 %x to <16 x i1>
   %m1 = bitcast i16 %y to <16 x i1>
   %m2 = add <16 x i1> %m0,  %m1
@@ -2003,13 +3760,41 @@ define i16 @test_v16i1_add(i16 %x, i16 %y) {
 }
 
 define i16 @test_v16i1_sub(i16 %x, i16 %y) {
-; CHECK-LABEL: test_v16i1_sub:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k0
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    kxorw %k1, %k0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; KNL-LABEL: test_v16i1_sub:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw %edi, %k0
+; KNL-NEXT:    kmovw %esi, %k1
+; KNL-NEXT:    kxorw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_v16i1_sub:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovd %edi, %k0
+; SKX-NEXT:    kmovd %esi, %k1
+; SKX-NEXT:    kxorw %k1, %k0, %k0
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v16i1_sub:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k0
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    kxorw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_v16i1_sub:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw %edi, %k0
+; AVX512DQ-NEXT:    kmovw %esi, %k1
+; AVX512DQ-NEXT:    kxorw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512DQ-NEXT:    retq
   %m0 = bitcast i16 %x to <16 x i1>
   %m1 = bitcast i16 %y to <16 x i1>
   %m2 = sub <16 x i1> %m0,  %m1
@@ -2018,13 +3803,41 @@ define i16 @test_v16i1_sub(i16 %x, i16 %y) {
 }
 
 define i16 @test_v16i1_mul(i16 %x, i16 %y) {
-; CHECK-LABEL: test_v16i1_mul:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k0
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    kandw %k1, %k0, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; KNL-LABEL: test_v16i1_mul:
+; KNL:       ## BB#0:
+; KNL-NEXT:    kmovw %edi, %k0
+; KNL-NEXT:    kmovw %esi, %k1
+; KNL-NEXT:    kandw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test_v16i1_mul:
+; SKX:       ## BB#0:
+; SKX-NEXT:    kmovd %edi, %k0
+; SKX-NEXT:    kmovd %esi, %k1
+; SKX-NEXT:    kandw %k1, %k0, %k0
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v16i1_mul:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k0
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_v16i1_mul:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw %edi, %k0
+; AVX512DQ-NEXT:    kmovw %esi, %k1
+; AVX512DQ-NEXT:    kandw %k1, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512DQ-NEXT:    retq
   %m0 = bitcast i16 %x to <16 x i1>
   %m1 = bitcast i16 %y to <16 x i1>
   %m2 = mul <16 x i1> %m0,  %m1
@@ -2039,15 +3852,35 @@ define i8 @test_v8i1_add(i8 %x, i8 %y) {
 ; KNL-NEXT:    kmovw %esi, %k1
 ; KNL-NEXT:    kxorw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_v8i1_add:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    kmovb %edi, %k0
-; SKX-NEXT:    kmovb %esi, %k1
+; SKX-NEXT:    kmovd %edi, %k0
+; SKX-NEXT:    kmovd %esi, %k1
 ; SKX-NEXT:    kxorb %k1, %k0, %k0
-; SKX-NEXT:    kmovb %k0, %eax
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v8i1_add:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k0
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    kxorw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_v8i1_add:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw %edi, %k0
+; AVX512DQ-NEXT:    kmovw %esi, %k1
+; AVX512DQ-NEXT:    kxorb %k1, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512DQ-NEXT:    retq
   %m0 = bitcast i8 %x to <8 x i1>
   %m1 = bitcast i8 %y to <8 x i1>
   %m2 = add <8 x i1> %m0,  %m1
@@ -2062,15 +3895,35 @@ define i8 @test_v8i1_sub(i8 %x, i8 %y) {
 ; KNL-NEXT:    kmovw %esi, %k1
 ; KNL-NEXT:    kxorw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_v8i1_sub:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    kmovb %edi, %k0
-; SKX-NEXT:    kmovb %esi, %k1
+; SKX-NEXT:    kmovd %edi, %k0
+; SKX-NEXT:    kmovd %esi, %k1
 ; SKX-NEXT:    kxorb %k1, %k0, %k0
-; SKX-NEXT:    kmovb %k0, %eax
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v8i1_sub:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k0
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    kxorw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_v8i1_sub:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw %edi, %k0
+; AVX512DQ-NEXT:    kmovw %esi, %k1
+; AVX512DQ-NEXT:    kxorb %k1, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512DQ-NEXT:    retq
   %m0 = bitcast i8 %x to <8 x i1>
   %m1 = bitcast i8 %y to <8 x i1>
   %m2 = sub <8 x i1> %m0,  %m1
@@ -2085,15 +3938,35 @@ define i8 @test_v8i1_mul(i8 %x, i8 %y) {
 ; KNL-NEXT:    kmovw %esi, %k1
 ; KNL-NEXT:    kandw %k1, %k0, %k0
 ; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: test_v8i1_mul:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    kmovb %edi, %k0
-; SKX-NEXT:    kmovb %esi, %k1
+; SKX-NEXT:    kmovd %edi, %k0
+; SKX-NEXT:    kmovd %esi, %k1
 ; SKX-NEXT:    kandb %k1, %k0, %k0
-; SKX-NEXT:    kmovb %k0, %eax
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; SKX-NEXT:    retq
+;
+; AVX512BW-LABEL: test_v8i1_mul:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k0
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    kandw %k1, %k0, %k0
+; AVX512BW-NEXT:    kmovd %k0, %eax
+; AVX512BW-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512BW-NEXT:    retq
+;
+; AVX512DQ-LABEL: test_v8i1_mul:
+; AVX512DQ:       ## BB#0:
+; AVX512DQ-NEXT:    kmovw %edi, %k0
+; AVX512DQ-NEXT:    kmovw %esi, %k1
+; AVX512DQ-NEXT:    kandb %k1, %k0, %k0
+; AVX512DQ-NEXT:    kmovw %k0, %eax
+; AVX512DQ-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512DQ-NEXT:    retq
   %m0 = bitcast i8 %x to <8 x i1>
   %m1 = bitcast i8 %y to <8 x i1>
   %m2 = mul <8 x i1> %m0,  %m1
diff --git a/test/CodeGen/X86/avx512-mask-spills.ll b/test/CodeGen/X86/avx512-mask-spills.ll
index d00d9bfdfcd77bac46863b8bbc3dd79b7fe56914..4ef88ac495c324f3926c3c80e64af292b4cc0763 100644
--- a/test/CodeGen/X86/avx512-mask-spills.ll
+++ b/test/CodeGen/X86/avx512-mask-spills.ll
@@ -37,6 +37,7 @@ define <8 x i1> @test_8i1(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0
 ; CHECK-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq _f
 ; CHECK-NEXT:    kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
 ; CHECK-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
@@ -62,6 +63,7 @@ define <16 x i1> @test_16i1(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; CHECK-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq _f
 ; CHECK-NEXT:    kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
 ; CHECK-NEXT:    kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
@@ -86,6 +88,7 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
 ; CHECK-NEXT:    kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill
 ; CHECK-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovd %k0, (%rsp) ## 4-byte Spill
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq _f
 ; CHECK-NEXT:    kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload
 ; CHECK-NEXT:    kmovd (%rsp), %k1 ## 4-byte Reload
@@ -110,6 +113,7 @@ define <64 x i1> @test_64i1(<64 x i8> %a, <64 x i8> %b) {
 ; CHECK-NEXT:    kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
 ; CHECK-NEXT:    vpcmpgtb %zmm1, %zmm0, %k0
 ; CHECK-NEXT:    kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq _f
 ; CHECK-NEXT:    kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload
 ; CHECK-NEXT:    kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload
diff --git a/test/CodeGen/X86/avx512-masked-memop-64-32.ll b/test/CodeGen/X86/avx512-masked-memop-64-32.ll
index 09754ad2620ed7b68671673106b3c6360d76e7a1..607c4f4ade6f5fd2481bacb88d0d2d3ed6fbe896 100644
--- a/test/CodeGen/X86/avx512-masked-memop-64-32.ll
+++ b/test/CodeGen/X86/avx512-masked-memop-64-32.ll
@@ -32,6 +32,7 @@ define void @test3(<16 x i32> %trigger, <16 x i32>* %addr, <16 x i32> %val) {
 ; AVX512-NEXT:    vpxord %zmm2, %zmm2, %zmm2
 ; AVX512-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
 ; AVX512-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %mask = icmp eq <16 x i32> %trigger, zeroinitializer
   call void @llvm.masked.store.v16i32.p0v16i32(<16 x i32>%val, <16 x i32>* %addr, i32 4, <16 x i1>%mask)
@@ -56,6 +57,7 @@ define void @test13(<16 x i32> %trigger, <16 x float>* %addr, <16 x float> %val)
 ; AVX512-NEXT:    vpxord %zmm2, %zmm2, %zmm2
 ; AVX512-NEXT:    vpcmpeqd %zmm2, %zmm0, %k1
 ; AVX512-NEXT:    vmovups %zmm1, (%rdi) {%k1}
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %mask = icmp eq <16 x i32> %trigger, zeroinitializer
   call void @llvm.masked.store.v16f32.p0v16f32(<16 x float>%val, <16 x float>* %addr, i32 4, <16 x i1>%mask)
@@ -67,6 +69,7 @@ define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
 ; AVX512:       ## BB#0:
 ; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
 ; AVX512-NEXT:    vmovlps %xmm0, 48(%rdi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
   ret void
@@ -143,6 +146,7 @@ define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %sr
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, (%rdi) {%k1}
 ; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
 ; AVX512F-NEXT:    vmovdqu64 %zmm2, 64(%rdi) {%k1}
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test_store_16i64:
@@ -152,6 +156,7 @@ define void @test_store_16i64(<16 x i64>* %ptrs, <16 x i1> %mask, <16 x i64> %sr
 ; SKX-NEXT:    vmovdqu64 %zmm1, (%rdi) {%k1}
 ; SKX-NEXT:    kshiftrw $8, %k1, %k1
 ; SKX-NEXT:    vmovdqu64 %zmm2, 64(%rdi) {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   call void @llvm.masked.store.v16i64.p0v16i64(<16 x i64> %src0, <16 x i64>* %ptrs, i32 4, <16 x i1> %mask)
   ret void
@@ -167,6 +172,7 @@ define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x doubl
 ; AVX512F-NEXT:    vmovupd %zmm1, (%rdi) {%k1}
 ; AVX512F-NEXT:    kshiftrw $8, %k1, %k1
 ; AVX512F-NEXT:    vmovupd %zmm2, 64(%rdi) {%k1}
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test_store_16f64:
@@ -176,6 +182,7 @@ define void @test_store_16f64(<16 x double>* %ptrs, <16 x i1> %mask, <16 x doubl
 ; SKX-NEXT:    vmovupd %zmm1, (%rdi) {%k1}
 ; SKX-NEXT:    kshiftrw $8, %k1, %k1
 ; SKX-NEXT:    vmovupd %zmm2, 64(%rdi) {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   call void @llvm.masked.store.v16f64.p0v16f64(<16 x double> %src0, <16 x double>* %ptrs, i32 4, <16 x i1> %mask)
   ret void
diff --git a/test/CodeGen/X86/avx512-masked_memop-16-8.ll b/test/CodeGen/X86/avx512-masked_memop-16-8.ll
index 72c3451a685dc8f17e06c08157e584fd0f5abf29..aedfbf7dbd65e60361e7f9eb9bbd850ba30cb338 100644
--- a/test/CodeGen/X86/avx512-masked_memop-16-8.ll
+++ b/test/CodeGen/X86/avx512-masked_memop-16-8.ll
@@ -93,6 +93,7 @@ define void @test_mask_store_32xi8(<32 x i1> %mask, <32 x i8>* %addr, <32 x i8>
 ; CHECK-NEXT:    vpsllw $7, %ymm0, %ymm0
 ; CHECK-NEXT:    vpmovb2m %ymm0, %k1
 ; CHECK-NEXT:    vmovdqu8 %ymm1, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.masked.store.v32i8.p0v32i8(<32 x i8> %val, <32 x i8>* %addr, i32 4, <32 x i1>%mask)
   ret void
@@ -105,6 +106,7 @@ define void @test_mask_store_64xi8(<64 x i1> %mask, <64 x i8>* %addr, <64 x i8>
 ; CHECK-NEXT:    vpsllw $7, %zmm0, %zmm0
 ; CHECK-NEXT:    vpmovb2m %zmm0, %k1
 ; CHECK-NEXT:    vmovdqu8 %zmm1, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.masked.store.v64i8.p0v64i8(<64 x i8> %val, <64 x i8>* %addr, i32 4, <64 x i1>%mask)
   ret void
@@ -129,6 +131,7 @@ define void @test_mask_store_16xi16(<16 x i1> %mask, <16 x i16>* %addr, <16 x i1
 ; CHECK-NEXT:    vpsllw $7, %xmm0, %xmm0
 ; CHECK-NEXT:    vpmovb2m %xmm0, %k1
 ; CHECK-NEXT:    vmovdqu16 %ymm1, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.masked.store.v16i16.p0v16i16(<16 x i16> %val, <16 x i16>* %addr, i32 4, <16 x i1>%mask)
   ret void
@@ -141,6 +144,7 @@ define void @test_mask_store_32xi16(<32 x i1> %mask, <32 x i16>* %addr, <32 x i1
 ; CHECK-NEXT:    vpsllw $7, %ymm0, %ymm0
 ; CHECK-NEXT:    vpmovb2m %ymm0, %k1
 ; CHECK-NEXT:    vmovdqu16 %zmm1, (%rdi) {%k1}
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   call void @llvm.masked.store.v32i16.p0v32i16(<32 x i16> %val, <32 x i16>* %addr, i32 4, <32 x i1>%mask)
   ret void
diff --git a/test/CodeGen/X86/avx512-memfold.ll b/test/CodeGen/X86/avx512-memfold.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d754b2b78f6ca7c2a3d4daac0a42e215a3d88c3e
--- /dev/null
+++ b/test/CodeGen/X86/avx512-memfold.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+define i8 @test_int_x86_avx512_mask_cmp_ss(<4 x float> %a, float* %b, i8 %mask) {
+; CHECK-LABEL: test_int_x86_avx512_mask_cmp_ss:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vcmpunordss (%rdi), %xmm0, %k0 {%k1}
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; CHECK-NEXT:    retq
+  %b.val = load float, float* %b
+  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
+  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
+  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
+  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
+  %res2 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %a, <4 x float> %bv, i32 3, i8 %mask, i32 4)
+  ret i8 %res2
+}
+declare i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float>, <4 x float>, i32, i8, i32)
+
+define <4 x float> @test_mask_max_ss(<4 x float> %a, float* %b, i8 %mask) {
+; CHECK-LABEL: test_mask_max_ss:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vmaxss (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %b.val = load float, float* %b
+  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
+  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
+  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
+  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
+  %res = call <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>%a, <4 x float> %bv, <4 x float> zeroinitializer, i8 %mask, i32 4)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.mask.max.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+
+define <4 x float> @test_maskz_add_ss(<4 x float> %a, float* %b, i8 %mask) {
+; CHECK-LABEL: test_maskz_add_ss:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vaddss (%rdi), %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %b.val = load float, float* %b
+  %bv0 = insertelement <4 x float> undef, float %b.val, i32 0
+  %bv1 = insertelement <4 x float> %bv0, float 0.000000e+00, i32 1
+  %bv2 = insertelement <4 x float> %bv1, float 0.000000e+00, i32 2
+  %bv  = insertelement <4 x float> %bv2, float 0.000000e+00, i32 3
+  %res = call <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>%a, <4 x float> %bv, <4 x float> zeroinitializer, i8 %mask, i32 4)
+  ret <4 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.add.ss.round(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+
+declare <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32)
+
+define <2 x double> @test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %a, <2 x double> %b, double* %c, i8 %mask){
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_sd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    vfmadd213sd (%rdi), %xmm1, %xmm0 {%k1}
+; CHECK-NEXT:    retq
+  %c.val = load double, double* %c
+  %cv0 = insertelement <2 x double> undef, double %c.val, i32 0
+  %cv = insertelement <2 x double> %cv0, double 0.000000e+00, i32 1
+  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %cv, i8 %mask, i32 4)
+  ret <2 x double> %res
+}
diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll
index 9234ae838cffd9f5c8bdf5520ac9fa195a4c11e9..df988185efc589ccd870afb30fcbbb87e7b51a48 100644
--- a/test/CodeGen/X86/avx512-mov.ll
+++ b/test/CodeGen/X86/avx512-mov.ll
@@ -4,7 +4,7 @@
 define i32 @test1(float %x) {
 ; CHECK-LABEL: test1:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vmovd %xmm0, %eax ## encoding: [0x62,0xf1,0x7d,0x08,0x7e,0xc0]
+; CHECK-NEXT:    vmovd %xmm0, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x7e,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
    %res = bitcast float %x to i32
    ret i32 %res
diff --git a/test/CodeGen/X86/avx512-regcall-Mask.ll b/test/CodeGen/X86/avx512-regcall-Mask.ll
index 325097ee9510581862fdf8276b41a17791c252ef..781112866ca5cd3992cc900aad86ae5a093e5c02 100644
--- a/test/CodeGen/X86/avx512-regcall-Mask.ll
+++ b/test/CodeGen/X86/avx512-regcall-Mask.ll
@@ -251,9 +251,9 @@ entry:
 }
 
 ; CHECK-LABEL:  test_argv16i1:
-; CHECK:        kmovw    %edx, %k{{[0-9]+}}
-; CHECK:        kmovw    %ecx, %k{{[0-9]+}}
-; CHECK:        kmovw    %eax, %k{{[0-9]+}}
+; CHECK:        kmovd    %edx, %k{{[0-9]+}}
+; CHECK:        kmovd    %ecx, %k{{[0-9]+}}
+; CHECK:        kmovd    %eax, %k{{[0-9]+}}
 ; CHECK:        ret{{l|q}}
 
 ; Test regcall when receiving arguments of v16i1 type
@@ -301,9 +301,9 @@ entry:
 }
 
 ; CHECK-LABEL:  test_argv8i1:
-; CHECK:        kmovw    %edx, %k{{[0-9]+}}
-; CHECK:        kmovw    %ecx, %k{{[0-9]+}}
-; CHECK:        kmovw    %eax, %k{{[0-9]+}}
+; CHECK:        kmovd    %edx, %k{{[0-9]+}}
+; CHECK:        kmovd    %ecx, %k{{[0-9]+}}
+; CHECK:        kmovd    %eax, %k{{[0-9]+}}
 ; CHECK:        ret{{l|q}}
 
 ; Test regcall when receiving arguments of v8i1 type
@@ -339,7 +339,7 @@ define x86_regcallcc <8 x i1> @test_retv8i1()  {
 
 ; CHECK-LABEL: caller_retv8i1:
 ; CHECK:       call{{l|q}}   {{_*}}test_retv8i1
-; CHECK:       kmovw %eax, %k{{[0-9]+}}
+; CHECK:       kmovd %eax, %k{{[0-9]+}}
 ; CHECK:       ret{{l|q}}
 
 ; Test regcall when processing result of v8i1 type
diff --git a/test/CodeGen/X86/avx512-regcall-NoMask.ll b/test/CodeGen/X86/avx512-regcall-NoMask.ll
index a2b058d0a4071ad833e6ba6f60e0cb5d3a9e347e..334097917853b43ca419346ee6395381c1297a20 100644
--- a/test/CodeGen/X86/avx512-regcall-NoMask.ll
+++ b/test/CodeGen/X86/avx512-regcall-NoMask.ll
@@ -469,32 +469,27 @@ define x86_regcallcc <32 x float> @testf32_inp(<32 x float> %a, <32 x float> %b,
   ret <32 x float> %x4
 }
 
-; X32-LABEL: pushl {{%e(si|di|bx|bp)}}
-; X32: pushl {{%e(si|di|bx|bp)}}
-; X32: pushl {{%e(si|di|bx|bp)}}
-; X32: pushl {{%e(si|di|bx|bp)}}
-; X32: popl {{%e(si|di|bx|bp)}}
-; X32: popl {{%e(si|di|bx|bp)}}
-; X32: popl {{%e(si|di|bx|bp)}}
-; X32: popl {{%e(si|di|bx|bp)}}
+; X32-LABEL: testi32_inp
+; X32: pushl {{%e(bx|bp)}}
+; X32: pushl {{%e(bx|bp)}}
+; X32: popl {{%e(bx|bp)}}
+; X32: popl {{%e(bx|bp)}}
 ; X32: retl
 
-; WIN64-LABEL: pushq	{{%r(bp|bx|1[0-5])}}
+; WIN64-LABEL: testi32_inp
 ; WIN64: pushq	{{%r(bp|bx|1[0-5])}}
 ; WIN64: pushq	{{%r(bp|bx|1[0-5])}}
 ; WIN64: pushq	{{%r(bp|bx|1[0-5])}}
 ; WIN64: popq	{{%r(bp|bx|1[0-5])}}
 ; WIN64: popq	{{%r(bp|bx|1[0-5])}}
 ; WIN64: popq	{{%r(bp|bx|1[0-5])}}
-; WIN64: popq	{{%r(bp|bx|1[0-5])}}
 ; WIN64: retq
 
-; LINUXOSX64-LABEL: pushq	{{%r(bp|bx|1[2-5])}}
+; LINUXOSX64-LABEL: testi32_inp
 ; LINUXOSX64: pushq	{{%r(bp|bx|1[2-5])}}
 ; LINUXOSX64: pushq	{{%r(bp|bx|1[2-5])}}
 ; LINUXOSX64: popq	{{%r(bp|bx|1[2-5])}}
 ; LINUXOSX64: popq	{{%r(bp|bx|1[2-5])}}
-; LINUXOSX64: popq	{{%r(bp|bx|1[2-5])}}
 ; LINUXOSX64: retq
 
 ; Test regcall when running multiple input parameters - callee saved GPRs
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index 3f427298c177c87aee86613741607197ec393a23..1859b1bcfaf6b5525eed8d7e5d9a61309e89a292 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -90,6 +90,7 @@ define i8 @select05_mem(<8 x i1>* %a.0, <8 x i1>* %m) {
 ; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    korw %k1, %k0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %mask = load <8 x i1> , <8 x i1>* %m
   %a = load <8 x i1> , <8 x i1>* %a.0
@@ -120,6 +121,7 @@ define i8 @select06_mem(<8 x i1>* %a.0, <8 x i1>* %m) {
 ; CHECK-NEXT:    kmovw %eax, %k1
 ; CHECK-NEXT:    kandw %k1, %k0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %mask = load <8 x i1> , <8 x i1>* %m
   %a = load <8 x i1> , <8 x i1>* %a.0
@@ -137,6 +139,7 @@ define i8 @select07(i8 %a.0, i8 %b.0, i8 %m) {
 ; CHECK-NEXT:    kandw %k0, %k1, %k0
 ; CHECK-NEXT:    korw %k2, %k0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %mask = bitcast i8 %m to <8 x i1>
   %a = bitcast i8 %a.0 to <8 x i1>
@@ -149,10 +152,7 @@ define i8 @select07(i8 %a.0, i8 %b.0, i8 %m) {
 define i64 @pr30249() {
 ; CHECK-LABEL: pr30249:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    cmpb $1, %cl
-; CHECK-NEXT:    movl $1, %eax
-; CHECK-NEXT:    adcxq %rcx, %rax
+; CHECK-NEXT:    movl $2, %eax
 ; CHECK-NEXT:    retq
   %v = select i1 undef , i64 1, i64 2
   ret i64 %v
diff --git a/test/CodeGen/X86/avx512-skx-insert-subvec.ll b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
index 3a93b544b9593f3271f023a69315bbeb06e14de4..b7f80ec9715078d2f0a2ef9532f0e98a6eeb173d 100644
--- a/test/CodeGen/X86/avx512-skx-insert-subvec.ll
+++ b/test/CodeGen/X86/avx512-skx-insert-subvec.ll
@@ -30,11 +30,12 @@ define <8 x i1> @test2(<2 x i1> %a) {
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; CHECK-NEXT:    vptestmq %xmm0, %xmm0, %k0
-; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
-; CHECK-NEXT:    vpmovm2q %k0, %zmm1
-; CHECK-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; CHECK-NEXT:    vpmovm2q %k0, %zmm0
+; CHECK-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; CHECK-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; CHECK-NEXT:    vpmovq2m %zmm0, %k0
 ; CHECK-NEXT:    vpmovm2w %k0, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %res = shufflevector <2 x i1> %a, <2 x i1> zeroinitializer, <8 x i32> <i32 3, i32 3, i32 undef, i32 undef, i32 0, i32 1, i32 undef, i32 undef>
   ret <8 x i1> %res
diff --git a/test/CodeGen/X86/avx512-trunc.ll b/test/CodeGen/X86/avx512-trunc.ll
index ac58024772b49c8c18e3444ee8dc31af00e89759..1c88ce6eb2f77d40b035d108e9e71b40ce24a1b5 100644
--- a/test/CodeGen/X86/avx512-trunc.ll
+++ b/test/CodeGen/X86/avx512-trunc.ll
@@ -8,6 +8,7 @@ define <16 x i8> @trunc_16x32_to_16x8(<16 x i32> %i) #0 {
 ; ALL-LABEL: trunc_16x32_to_16x8:
 ; ALL:       ## BB#0:
 ; ALL-NEXT:    vpmovdb %zmm0, %xmm0
+; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
   %x = trunc <16 x i32> %i to <16 x i8>
   ret <16 x i8> %x
@@ -17,6 +18,7 @@ define <8 x i16> @trunc_8x64_to_8x16(<8 x i64> %i) #0 {
 ; ALL-LABEL: trunc_8x64_to_8x16:
 ; ALL:       ## BB#0:
 ; ALL-NEXT:    vpmovqw %zmm0, %xmm0
+; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
   %x = trunc <8 x i64> %i to <8 x i16>
   ret <8 x i16> %x
@@ -35,6 +37,7 @@ define <8 x i8> @trunc_qb_512(<8 x i64> %i) #0 {
 ; ALL-LABEL: trunc_qb_512:
 ; ALL:       ## BB#0:
 ; ALL-NEXT:    vpmovqw %zmm0, %xmm0
+; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
   %x = trunc <8 x i64> %i to <8 x i8>
   ret <8 x i8> %x
@@ -44,6 +47,7 @@ define void @trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) #0 {
 ; ALL-LABEL: trunc_qb_512_mem:
 ; ALL:       ## BB#0:
 ; ALL-NEXT:    vpmovqb %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
     %x = trunc <8 x i64> %i to <8 x i8>
     store <8 x i8> %x, <8 x i8>* %res
@@ -56,11 +60,13 @@ define <4 x i8> @trunc_qb_256(<4 x i64> %i) #0 {
 ; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; KNL-NEXT:    vpmovqd %zmm0, %ymm0
 ; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: trunc_qb_256:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpmovqd %ymm0, %xmm0
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x = trunc <4 x i64> %i to <4 x i8>
   ret <4 x i8> %x
@@ -73,11 +79,13 @@ define void @trunc_qb_256_mem(<4 x i64> %i, <4 x i8>* %res) #0 {
 ; KNL-NEXT:    vpmovqd %zmm0, %ymm0
 ; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; KNL-NEXT:    vmovd %xmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: trunc_qb_256_mem:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpmovqb %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
     %x = trunc <4 x i64> %i to <4 x i8>
     store <4 x i8> %x, <4 x i8>* %res
@@ -112,6 +120,7 @@ define <8 x i16> @trunc_qw_512(<8 x i64> %i) #0 {
 ; ALL-LABEL: trunc_qw_512:
 ; ALL:       ## BB#0:
 ; ALL-NEXT:    vpmovqw %zmm0, %xmm0
+; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
   %x = trunc <8 x i64> %i to <8 x i16>
   ret <8 x i16> %x
@@ -121,6 +130,7 @@ define void @trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) #0 {
 ; ALL-LABEL: trunc_qw_512_mem:
 ; ALL:       ## BB#0:
 ; ALL-NEXT:    vpmovqw %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
     %x = trunc <8 x i64> %i to <8 x i16>
     store <8 x i16> %x, <8 x i16>* %res
@@ -133,11 +143,13 @@ define <4 x i16> @trunc_qw_256(<4 x i64> %i) #0 {
 ; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; KNL-NEXT:    vpmovqd %zmm0, %ymm0
 ; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: trunc_qw_256:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpmovqd %ymm0, %xmm0
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x = trunc <4 x i64> %i to <4 x i16>
   ret <4 x i16> %x
@@ -150,11 +162,13 @@ define void @trunc_qw_256_mem(<4 x i64> %i, <4 x i16>* %res) #0 {
 ; KNL-NEXT:    vpmovqd %zmm0, %ymm0
 ; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; KNL-NEXT:    vmovq %xmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: trunc_qw_256_mem:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpmovqw %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
     %x = trunc <4 x i64> %i to <4 x i16>
     store <4 x i16> %x, <4 x i16>* %res
@@ -199,6 +213,7 @@ define void @trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) #0 {
 ; ALL-LABEL: trunc_qd_512_mem:
 ; ALL:       ## BB#0:
 ; ALL-NEXT:    vpmovqd %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
     %x = trunc <8 x i64> %i to <8 x i32>
     store <8 x i32> %x, <8 x i32>* %res
@@ -211,11 +226,13 @@ define <4 x i32> @trunc_qd_256(<4 x i64> %i) #0 {
 ; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; KNL-NEXT:    vpmovqd %zmm0, %ymm0
 ; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: trunc_qd_256:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpmovqd %ymm0, %xmm0
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x = trunc <4 x i64> %i to <4 x i32>
   ret <4 x i32> %x
@@ -227,11 +244,13 @@ define void @trunc_qd_256_mem(<4 x i64> %i, <4 x i32>* %res) #0 {
 ; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; KNL-NEXT:    vpmovqd %zmm0, %ymm0
 ; KNL-NEXT:    vmovdqa %xmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: trunc_qd_256_mem:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpmovqd %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
     %x = trunc <4 x i64> %i to <4 x i32>
     store <4 x i32> %x, <4 x i32>* %res
@@ -266,6 +285,7 @@ define <16 x i8> @trunc_db_512(<16 x i32> %i) #0 {
 ; ALL-LABEL: trunc_db_512:
 ; ALL:       ## BB#0:
 ; ALL-NEXT:    vpmovdb %zmm0, %xmm0
+; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
   %x = trunc <16 x i32> %i to <16 x i8>
   ret <16 x i8> %x
@@ -275,6 +295,7 @@ define void @trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) #0 {
 ; ALL-LABEL: trunc_db_512_mem:
 ; ALL:       ## BB#0:
 ; ALL-NEXT:    vpmovdb %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
     %x = trunc <16 x i32> %i to <16 x i8>
     store <16 x i8> %x, <16 x i8>* %res
@@ -287,11 +308,13 @@ define <8 x i8> @trunc_db_256(<8 x i32> %i) #0 {
 ; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; KNL-NEXT:    vpmovdw %zmm0, %ymm0
 ; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: trunc_db_256:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpmovdw %ymm0, %xmm0
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x = trunc <8 x i32> %i to <8 x i8>
   ret <8 x i8> %x
@@ -304,11 +327,13 @@ define void @trunc_db_256_mem(<8 x i32> %i, <8 x i8>* %res) #0 {
 ; KNL-NEXT:    vpmovdw %zmm0, %ymm0
 ; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; KNL-NEXT:    vmovq %xmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: trunc_db_256_mem:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpmovdb %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
     %x = trunc <8 x i32> %i to <8 x i8>
     store <8 x i8> %x, <8 x i8>* %res
@@ -352,6 +377,7 @@ define void @trunc_dw_512_mem(<16 x i32> %i, <16 x i16>* %res) #0 {
 ; ALL-LABEL: trunc_dw_512_mem:
 ; ALL:       ## BB#0:
 ; ALL-NEXT:    vpmovdw %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
     %x = trunc <16 x i32> %i to <16 x i16>
     store <16 x i16> %x, <16 x i16>* %res
@@ -364,11 +390,13 @@ define <8 x i16> @trunc_dw_256(<8 x i32> %i) #0 {
 ; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; KNL-NEXT:    vpmovdw %zmm0, %ymm0
 ; KNL-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: trunc_dw_256:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpmovdw %ymm0, %xmm0
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x = trunc <8 x i32> %i to <8 x i16>
   ret <8 x i16> %x
@@ -380,11 +408,13 @@ define void @trunc_dw_256_mem(<8 x i32> %i, <8 x i16>* %res) #0 {
 ; KNL-NEXT:    ## kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; KNL-NEXT:    vpmovdw %zmm0, %ymm0
 ; KNL-NEXT:    vmovdqa %xmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: trunc_dw_256_mem:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpmovdw %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
     %x = trunc <8 x i32> %i to <8 x i16>
     store <8 x i16> %x, <8 x i16>* %res
@@ -434,11 +464,13 @@ define void @trunc_wb_512_mem(<32 x i16> %i, <32 x i8>* %res) #0 {
 ; KNL-NEXT:    vpmovdb %zmm1, %xmm1
 ; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; KNL-NEXT:    vmovdqa %ymm0, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: trunc_wb_512_mem:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpmovwb %zmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
     %x = trunc <32 x i16> %i to <32 x i8>
     store <32 x i8> %x, <32 x i8>* %res
@@ -450,11 +482,13 @@ define <16 x i8> @trunc_wb_256(<16 x i16> %i) #0 {
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: trunc_wb_256:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpmovwb %ymm0, %xmm0
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x = trunc <16 x i16> %i to <16 x i8>
   ret <16 x i8> %x
@@ -466,11 +500,13 @@ define void @trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) #0 {
 ; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
 ; KNL-NEXT:    vmovdqa %xmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: trunc_wb_256_mem:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpmovwb %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
     %x = trunc <16 x i16> %i to <16 x i8>
     store <16 x i8> %x, <16 x i8>* %res
@@ -509,11 +545,13 @@ define void @usat_trunc_wb_256_mem(<16 x i16> %i, <16 x i8>* %res) {
 ; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
 ; KNL-NEXT:    vmovdqu %xmm0, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: usat_trunc_wb_256_mem:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpmovuswb %ymm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
   %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -528,11 +566,13 @@ define <16 x i8> @usat_trunc_wb_256(<16 x i16> %i) {
 ; KNL-NEXT:    vpminuw {{.*}}(%rip), %ymm0, %ymm0
 ; KNL-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; KNL-NEXT:    vpmovdb %zmm0, %xmm0
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: usat_trunc_wb_256:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vpmovuswb %ymm0, %xmm0
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x3 = icmp ult <16 x i16> %i, <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
   %x5 = select <16 x i1> %x3, <16 x i16> %i, <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>
@@ -563,6 +603,7 @@ define void @usat_trunc_db_512_mem(<16 x i32> %i, <16 x i8>* %res) {
 ; ALL-LABEL: usat_trunc_db_512_mem:
 ; ALL:       ## BB#0:
 ; ALL-NEXT:    vpmovusdb %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
   %x3 = icmp ult <16 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
   %x5 = select <16 x i1> %x3, <16 x i32> %i, <16 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -575,6 +616,7 @@ define void @usat_trunc_qb_512_mem(<8 x i64> %i, <8 x i8>* %res) {
 ; ALL-LABEL: usat_trunc_qb_512_mem:
 ; ALL:       ## BB#0:
 ; ALL-NEXT:    vpmovusqb %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
   %x3 = icmp ult <8 x i64> %i, <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
   %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255, i64 255>
@@ -587,6 +629,7 @@ define void @usat_trunc_qd_512_mem(<8 x i64> %i, <8 x i32>* %res) {
 ; ALL-LABEL: usat_trunc_qd_512_mem:
 ; ALL:       ## BB#0:
 ; ALL-NEXT:    vpmovusqd %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
   %x3 = icmp ult <8 x i64> %i, <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
   %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>
@@ -599,6 +642,7 @@ define void @usat_trunc_qw_512_mem(<8 x i64> %i, <8 x i16>* %res) {
 ; ALL-LABEL: usat_trunc_qw_512_mem:
 ; ALL:       ## BB#0:
 ; ALL-NEXT:    vpmovusqw %zmm0, (%rdi)
+; ALL-NEXT:    vzeroupper
 ; ALL-NEXT:    retq
   %x3 = icmp ult <8 x i64> %i, <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
   %x5 = select <8 x i1> %x3, <8 x i64> %i, <8 x i64> <i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535, i64 65535>
@@ -638,6 +682,7 @@ define void @usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
 ; KNL-NEXT:    vpmovusdb %zmm1, %xmm1
 ; KNL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; KNL-NEXT:    vmovdqu %ymm0, (%rdi)
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: usat_trunc_db_1024_mem:
@@ -649,6 +694,7 @@ define void @usat_trunc_db_1024_mem(<32 x i32> %i, <32 x i8>* %p) {
 ; SKX-NEXT:    vpmovdw %zmm1, %ymm1
 ; SKX-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; SKX-NEXT:    vpmovwb %zmm0, (%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x3 = icmp ult <32 x i32> %i, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
   %x5 = select <32 x i1> %x3, <32 x i32> %i, <32 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
@@ -714,6 +760,7 @@ define <16 x i8> @usat_trunc_db_256(<8 x i32> %x) {
 ; KNL-NEXT:    vpminud %ymm1, %ymm0, %ymm0
 ; KNL-NEXT:    vpmovdw %zmm0, %ymm0
 ; KNL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; KNL-NEXT:    vzeroupper
 ; KNL-NEXT:    retq
 ;
 ; SKX-LABEL: usat_trunc_db_256:
@@ -721,6 +768,7 @@ define <16 x i8> @usat_trunc_db_256(<8 x i32> %x) {
 ; SKX-NEXT:    vpminud {{.*}}(%rip){1to8}, %ymm0, %ymm0
 ; SKX-NEXT:    vpmovdw %ymm0, %xmm0
 ; SKX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %tmp1 = icmp ult <8 x i32> %x, <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
   %tmp2 = select <8 x i1> %tmp1, <8 x i32> %x, <8 x i32> <i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255, i32 255>
diff --git a/test/CodeGen/X86/avx512-vbroadcasti128.ll b/test/CodeGen/X86/avx512-vbroadcasti128.ll
index ad8a29cacd82869738810ed918e7b3a49758a53d..ed19324df995fab7bed793ebf7e4a36f1a545b2f 100644
--- a/test/CodeGen/X86/avx512-vbroadcasti128.ll
+++ b/test/CodeGen/X86/avx512-vbroadcasti128.ll
@@ -10,13 +10,13 @@
 define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
 ; X64-AVX512VL-LABEL: test_broadcast_2f64_4f64:
 ; X64-AVX512VL:       ## BB#0:
-; X64-AVX512VL-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512VL-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X64-AVX512VL-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX512VL-NEXT:    retq
 ;
 ; X64-AVX512BWVL-LABEL: test_broadcast_2f64_4f64:
 ; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512BWVL-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X64-AVX512BWVL-NEXT:    vaddpd {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX512BWVL-NEXT:    retq
 ;
@@ -34,13 +34,13 @@ define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
 define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
 ; X64-AVX512VL-LABEL: test_broadcast_2i64_4i64:
 ; X64-AVX512VL:       ## BB#0:
-; X64-AVX512VL-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X64-AVX512VL-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX512VL-NEXT:    retq
 ;
 ; X64-AVX512BWVL-LABEL: test_broadcast_2i64_4i64:
 ; X64-AVX512BWVL:       ## BB#0:
-; X64-AVX512BWVL-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512BWVL-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X64-AVX512BWVL-NEXT:    vpaddq {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX512BWVL-NEXT:    retq
 ;
@@ -58,7 +58,7 @@ define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
 define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
 ; X64-AVX512-LABEL: test_broadcast_4f32_8f32:
 ; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X64-AVX512-NEXT:    vaddps {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX512-NEXT:    retq
  %1 = load <4 x float>, <4 x float> *%p
@@ -70,7 +70,7 @@ define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
 define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
 ; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
 ; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X64-AVX512-NEXT:    vpaddd {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX512-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32> *%p
@@ -82,7 +82,7 @@ define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
 define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
 ; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
 ; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X64-AVX512-NEXT:    vpaddw {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX512-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16> *%p
@@ -94,7 +94,7 @@ define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
 define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
 ; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
 ; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X64-AVX512-NEXT:    vpaddb {{.*}}(%rip), %ymm0, %ymm0
 ; X64-AVX512-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8> *%p
@@ -182,7 +182,7 @@ define <16 x i32> @test_broadcast_4i32_16i32(<4 x i32> *%p) nounwind {
 define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
 ; X64-AVX512VL-LABEL: test_broadcast_8i16_32i16:
 ; X64-AVX512VL:       ## BB#0:
-; X64-AVX512VL-NEXT:    vbroadcasti32x4 {{.*#+}} ymm1 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
 ; X64-AVX512VL-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm0
 ; X64-AVX512VL-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm1
 ; X64-AVX512VL-NEXT:    retq
@@ -195,7 +195,7 @@ define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
 ;
 ; X64-AVX512DQVL-LABEL: test_broadcast_8i16_32i16:
 ; X64-AVX512DQVL:       ## BB#0:
-; X64-AVX512DQVL-NEXT:    vbroadcasti32x4 {{.*#+}} ymm1 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512DQVL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
 ; X64-AVX512DQVL-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm0
 ; X64-AVX512DQVL-NEXT:    vpaddw {{.*}}(%rip), %ymm1, %ymm1
 ; X64-AVX512DQVL-NEXT:    retq
@@ -208,7 +208,7 @@ define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
 define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
 ; X64-AVX512VL-LABEL: test_broadcast_16i8_64i8:
 ; X64-AVX512VL:       ## BB#0:
-; X64-AVX512VL-NEXT:    vbroadcasti32x4 {{.*#+}} ymm1 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
 ; X64-AVX512VL-NEXT:    vpaddb {{.*}}(%rip), %ymm1, %ymm0
 ; X64-AVX512VL-NEXT:    vpaddb {{.*}}(%rip), %ymm1, %ymm1
 ; X64-AVX512VL-NEXT:    retq
@@ -221,7 +221,7 @@ define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
 ;
 ; X64-AVX512DQVL-LABEL: test_broadcast_16i8_64i8:
 ; X64-AVX512DQVL:       ## BB#0:
-; X64-AVX512DQVL-NEXT:    vbroadcasti32x4 {{.*#+}} ymm1 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512DQVL-NEXT:    vbroadcasti128 {{.*#+}} ymm1 = mem[0,1,0,1]
 ; X64-AVX512DQVL-NEXT:    vpaddb {{.*}}(%rip), %ymm1, %ymm0
 ; X64-AVX512DQVL-NEXT:    vpaddb {{.*}}(%rip), %ymm1, %ymm1
 ; X64-AVX512DQVL-NEXT:    retq
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index d06b15ead4b6e3c3b228e9417e648c8eea1f32c3..2b04b9229b3d2a7d862be88405fac3cbfd5d157e 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -160,13 +160,24 @@ define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind {
 }
 
 define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
-; CHECK-LABEL: test12:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
-; CHECK-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
-; CHECK-NEXT:    kunpckbw %k0, %k1, %k0
-; CHECK-NEXT:    kmovw %k0, %eax
-; CHECK-NEXT:    retq
+; KNL-LABEL: test12:
+; KNL:       ## BB#0:
+; KNL-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
+; KNL-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; KNL-NEXT:    kunpckbw %k0, %k1, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: test12:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vpcmpeqq %zmm2, %zmm0, %k0
+; SKX-NEXT:    vpcmpeqq %zmm3, %zmm1, %k1
+; SKX-NEXT:    kunpckbw %k0, %k1, %k0
+; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
   %res = icmp eq <16 x i64> %a, %b
   %res1 = bitcast <16 x i1> %res to i16
   ret i16 %res1
@@ -326,6 +337,7 @@ define i32 @test12_v32i32(<32 x i32> %a, <32 x i32> %b) nounwind {
 ; SKX-NEXT:    vpcmpeqd %zmm3, %zmm1, %k1
 ; SKX-NEXT:    kunpckwd %k0, %k1, %k0
 ; SKX-NEXT:    kmovd %k0, %eax
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %res = icmp eq <32 x i32> %a, %b
   %res1 = bitcast <32 x i1> %res to i32
@@ -637,6 +649,7 @@ define i64 @test12_v64i16(<64 x i16> %a, <64 x i16> %b) nounwind {
 ; SKX-NEXT:    vpcmpeqw %zmm3, %zmm1, %k1
 ; SKX-NEXT:    kunpckdq %k0, %k1, %k0
 ; SKX-NEXT:    kmovq %k0, %rax
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %res = icmp eq <64 x i16> %a, %b
   %res1 = bitcast <64 x i1> %res to i64
@@ -892,6 +905,7 @@ define <16 x i8>@test29(<16 x i32> %x, <16 x i32> %y, <16 x i32> %x1, <16 x i32>
 ; SKX-NEXT:    vpcmpgtd %zmm3, %zmm2, %k1
 ; SKX-NEXT:    kxorw %k1, %k0, %k0
 ; SKX-NEXT:    vpmovm2b %k0, %xmm0
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %x_gt_y = icmp sgt <16 x i32> %x, %y
   %x1_gt_y1 = icmp sgt <16 x i32> %x1, %y1
@@ -1226,11 +1240,7 @@ define <2 x i64> @test46(<2 x float> %x, <2 x float> %y) #0 {
 ; KNL-LABEL: test46:
 ; KNL:       ## BB#0:
 ; KNL-NEXT:    vcmpeqps %xmm1, %xmm0, %xmm0
-; KNL-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; KNL-NEXT:    vpsllq $32, %xmm0, %xmm0
-; KNL-NEXT:    vpsrad $31, %xmm0, %xmm1
-; KNL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
-; KNL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; KNL-NEXT:    vpmovsxdq %xmm0, %xmm0
 ; KNL-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; KNL-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/avx512-vpermv3-commute.ll b/test/CodeGen/X86/avx512-vpermv3-commute.ll
index 471a8ea49f697d8e519cb55cc692ce6d0690152b..2827f471762f444fe67b785650d6804e5083a0a6 100644
--- a/test/CodeGen/X86/avx512-vpermv3-commute.ll
+++ b/test/CodeGen/X86/avx512-vpermv3-commute.ll
@@ -53,7 +53,7 @@ declare <16 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.512(<16 x i32>, <16 x i32
 define <16 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2p, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpermi2d (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2p
@@ -66,7 +66,7 @@ declare <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64>, <8 x do
 define <8 x double>@test_int_x86_avx512_maskz_vpermt2var_pd_512(<8 x i64> %x0, <8 x double> %x1, double* %x2ptr, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpermi2pd (%rdi){1to8}, %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x2s = load double, double* %x2ptr
@@ -81,7 +81,7 @@ declare <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32>, <16 x
 define <16 x float>@test_int_x86_avx512_maskz_vpermt2var_ps_512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_ps_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vpermi2ps %zmm2, %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> %x0, <16 x float> %x1, <16 x float> %x2, i16 %x3)
@@ -94,7 +94,7 @@ declare <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64>, <8 x i64>,
 define <8 x i64>@test_int_x86_avx512_maskz_vpermt2var_q_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_q_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vpermi2q %zmm2, %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i64> @llvm.x86.avx512.maskz.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
@@ -128,7 +128,7 @@ declare <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32>, <4 x i32>,
 define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vpermi2d %xmm2, %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
@@ -138,7 +138,7 @@ define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x
 define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128_broadcast(<4 x i32> %x0, <4 x i32> %x1, i32* %x2ptr, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_128_broadcast:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpermi2d (%rdi){1to4}, %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x2s = load i32, i32* %x2ptr
@@ -164,7 +164,7 @@ declare <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32>, <8 x i32>,
 define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vpermi2d %ymm2, %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
@@ -296,7 +296,7 @@ declare <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8>, <16 x i8>,
 define <16 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vpermi2b %xmm2, %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %res = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
@@ -306,7 +306,7 @@ define <16 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_128(<16 x i8> %x0, <16
 define <16 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_128_load(<16 x i8> %x0, <16 x i8> %x1, <16 x i8>* %x2p, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_128_load:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vpermi2b (%rdi), %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i8>, <16 x i8>* %x2p
diff --git a/test/CodeGen/X86/avx512-vpternlog-commute.ll b/test/CodeGen/X86/avx512-vpternlog-commute.ll
index 9cb82bcd66f732634eec03745e182a2905844538..c917e0b17f1ce1d19fb2e13c70b053d7cf3fda75 100644
--- a/test/CodeGen/X86/avx512-vpternlog-commute.ll
+++ b/test/CodeGen/X86/avx512-vpternlog-commute.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s
 
 ; These test cases demonstrate cases where vpternlog could benefit from being commuted.
 
@@ -9,485 +9,1060 @@ declare <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32>, <16 x i32>,
 define <16 x i32> @vpternlog_v16i32_012(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_012:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $114, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_102(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_102:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpternlogd $9, %zmm2, %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $78, %zmm2, %zmm1, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 -1)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_210(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_210:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpternlogd $9, %zmm0, %zmm2, %zmm1
+; CHECK-NEXT:    vpternlogd $78, %zmm0, %zmm2, %zmm1
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 -1)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_012_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_012_load0:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpternlogd $9, (%rdi), %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $46, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_012_load1(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_012_load1:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $116, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_012_load2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr) {
 ; CHECK-LABEL: vpternlog_v16i32_012_load2:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $114, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_102_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_102_load0:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $116, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 -1)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_102_load1(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_102_load1:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpternlogd $9, (%rdi), %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $46, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 -1)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_102_load2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr) {
 ; CHECK-LABEL: vpternlog_v16i32_102_load2:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpternlogd $9, (%rdi), %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $78, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 -1)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_210_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_210_load0:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpternlogd $9, (%rdi), %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $78, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 -1)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_210_load1(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_210_load1:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $92, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 -1)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_210_load2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr) {
 ; CHECK-LABEL: vpternlog_v16i32_210_load2:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $58, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 -1)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_021_load0(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_021_load0:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $58, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 33, i16 -1)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_021_load1(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2) {
 ; CHECK-LABEL: vpternlog_v16i32_021_load1:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $114, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 33, i16 -1)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_021_load2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr) {
 ; CHECK-LABEL: vpternlog_v16i32_021_load2:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0
+; CHECK-NEXT:    vpternlogd $116, (%rdi), %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 33, i16 -1)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 -1)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_012_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_012_mask:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpternlogd $114, %zmm2, %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_102_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_102_mask:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpternlogd $114, %zmm2, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_210_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_210_mask:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpternlogd $33, %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpternlogd $114, %zmm0, %zmm1, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
+define <16 x i32> @vpternlog_v16i32_012_mask1(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_012_mask1:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpternlogd $78, %zmm2, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
+  %mask.cast = bitcast i16 %mask to <16 x i1>
+  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x1
+  ret <16 x i32> %res2
+}
+
+define <16 x i32> @vpternlog_v16i32_012_mask2(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_012_mask2:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpternlogd $58, %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
+  %mask.cast = bitcast i16 %mask to <16 x i1>
+  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x2
+  ret <16 x i32> %res2
+}
+
 define <16 x i32> @vpternlog_v16i32_012_load0_mask(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_012_load0_mask:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vpternlogd $114, %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
+define <16 x i32> @vpternlog_v16i32_012_load0_mask1(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_012_load0_mask1:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %x0 = load <16 x i32>, <16 x i32>* %x0ptr
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
+  %mask.cast = bitcast i16 %mask to <16 x i1>
+  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x1
+  ret <16 x i32> %res2
+}
+
+define <16 x i32> @vpternlog_v16i32_012_load0_mask2(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_012_load0_mask2:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %x0 = load <16 x i32>, <16 x i32>* %x0ptr
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
+  %mask.cast = bitcast i16 %mask to <16 x i1>
+  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x2
+  ret <16 x i32> %res2
+}
+
 define <16 x i32> @vpternlog_v16i32_012_load1_mask(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_012_load1_mask:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $116, (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
+define <16 x i32> @vpternlog_v16i32_012_load1_mask2(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_012_load1_mask2:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $9, (%rdi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %x1 = load <16 x i32>, <16 x i32>* %x1ptr
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
+  %mask.cast = bitcast i16 %mask to <16 x i1>
+  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x2
+  ret <16 x i32> %res2
+}
+
 define <16 x i32> @vpternlog_v16i32_012_load2_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_012_load2_mask:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $114, (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
+define <16 x i32> @vpternlog_v16i32_012_load2_mask1(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_012_load2_mask1:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $9, (%rdi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %x2 = load <16 x i32>, <16 x i32>* %x2ptr
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
+  %mask.cast = bitcast i16 %mask to <16 x i1>
+  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x1
+  ret <16 x i32> %res2
+}
+
 define <16 x i32> @vpternlog_v16i32_102_load0_mask(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_102_load0_mask:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $116, (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_102_load1_mask(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_102_load1_mask:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vpternlogd $114, %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_102_load2_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_102_load2_mask:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $114, (%rdi), %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_210_load0_mask(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_210_load0_mask:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $114, (%rdi), %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_210_load1_mask(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_210_load1_mask:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $116, (%rdi), %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_210_load2_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_210_load2_mask:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT:    vpternlogd $114, %zmm0, %zmm1, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_021_load0_mask(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_021_load0_mask:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vmovdqa32 (%rdi), %zmm2
-; CHECK-NEXT:    vpternlogd $33, %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT:    vpternlogd $114, %zmm0, %zmm1, %zmm2 {%k1}
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_021_load1_mask(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_021_load1_mask:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $114, (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_021_load2_mask(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_021_load2_mask:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $116, (%rdi), %zmm1, %zmm0 {%k1}
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_012_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_012_maskz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpternlogd $33, %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpternlogd $114, %zmm2, %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_102_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_102_maskz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpternlogd $9, %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpternlogd $78, %zmm2, %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_210_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_210_maskz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpternlogd $9, %zmm0, %zmm2, %zmm1 {%k1} {z}
+; CHECK-NEXT:    kmovd %edi, %k1
+; CHECK-NEXT:    vpternlogd $78, %zmm0, %zmm2, %zmm1 {%k1} {z}
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_012_load0_maskz(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_012_load0_maskz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $9, (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $46, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_012_load1_maskz(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_012_load1_maskz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $116, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_012_load2_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_012_load2_maskz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $114, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_102_load0_maskz(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_102_load0_maskz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $116, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_102_load1_maskz(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_102_load1_maskz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $9, (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $46, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_102_load2_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_102_load2_maskz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $9, (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $78, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_210_load0_maskz(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_210_load0_maskz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $9, (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $78, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_210_load1_maskz(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_210_load1_maskz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $92, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_210_load2_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_210_load2_maskz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $58, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_021_load0_maskz(<16 x i32>* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_021_load0_maskz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $58, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x0 = load <16 x i32>, <16 x i32>* %x0ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_021_load1_maskz(<16 x i32> %x0, <16 x i32>* %x1ptr, <16 x i32> %x2, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_021_load1_maskz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $33, (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $114, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x1 = load <16 x i32>, <16 x i32>* %x1ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
 
 define <16 x i32> @vpternlog_v16i32_021_load2_maskz(<16 x i32> %x0, <16 x i32> %x1, <16 x i32>* %x2ptr, i16 %mask) {
 ; CHECK-LABEL: vpternlog_v16i32_021_load2_maskz:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1
-; CHECK-NEXT:    vpternlogd $65, (%rdi), %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $116, (%rdi), %zmm1, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %x2 = load <16 x i32>, <16 x i32>* %x2ptr
-  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 33, i16 %mask)
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_012_broadcast0(i32* %ptr_x0, <16 x i32> %x1, <16 x i32> %x2) {
+; CHECK-LABEL: vpternlog_v16i32_012_broadcast0:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpternlogd $46, (%rdi){1to16}, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %x0_scalar = load i32, i32* %ptr_x0
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x0_scalar, i32 0
+  %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_012_broadcast1(<16 x i32> %x0, i32* %ptr_x1, <16 x i32> %x2) {
+; CHECK-LABEL: vpternlog_v16i32_012_broadcast1:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpternlogd $116, (%rdi){1to16}, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %x1_scalar = load i32, i32* %ptr_x1
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x1_scalar, i32 0
+  %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_012_broadcast2(<16 x i32> %x0, <16 x i32> %x1, i32* %ptr_x2) {
+; CHECK-LABEL: vpternlog_v16i32_012_broadcast2:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpternlogd $114, (%rdi){1to16}, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %x2_scalar = load i32, i32* %ptr_x2
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x2_scalar, i32 0
+  %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_102_broadcast0(i32* %ptr_x0, <16 x i32> %x1, <16 x i32> %x2) {
+; CHECK-LABEL: vpternlog_v16i32_102_broadcast0:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpternlogd $116, (%rdi){1to16}, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %x0_scalar = load i32, i32* %ptr_x0
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x0_scalar, i32 0
+  %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_102_broadcast1(<16 x i32> %x0, i32* %ptr_x1, <16 x i32> %x2) {
+; CHECK-LABEL: vpternlog_v16i32_102_broadcast1:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpternlogd $46, (%rdi){1to16}, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %x1_scalar = load i32, i32* %ptr_x1
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x1_scalar, i32 0
+  %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_102_broadcast2(<16 x i32> %x0, <16 x i32> %x1, i32* %ptr_x2) {
+; CHECK-LABEL: vpternlog_v16i32_102_broadcast2:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpternlogd $78, (%rdi){1to16}, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %x2_scalar = load i32, i32* %ptr_x2
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x2_scalar, i32 0
+  %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_210_broadcast0(i32* %ptr_x0, <16 x i32> %x1, <16 x i32> %x2) {
+; CHECK-LABEL: vpternlog_v16i32_210_broadcast0:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpternlogd $78, (%rdi){1to16}, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %x0_scalar = load i32, i32* %ptr_x0
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x0_scalar, i32 0
+  %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_210_broadcast1(<16 x i32> %x0, i32* %ptr_x1, <16 x i32> %x2) {
+; CHECK-LABEL: vpternlog_v16i32_210_broadcast1:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpternlogd $92, (%rdi){1to16}, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %x1_scalar = load i32, i32* %ptr_x1
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x1_scalar, i32 0
+  %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_210_broadcast2(<16 x i32> %x0, <16 x i32> %x1, i32* %ptr_x2) {
+; CHECK-LABEL: vpternlog_v16i32_210_broadcast2:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpternlogd $58, (%rdi){1to16}, %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %x2_scalar = load i32, i32* %ptr_x2
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x2_scalar, i32 0
+  %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 -1)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_012_broadcast0_mask(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_012_broadcast0_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpbroadcastd (%rdi), %zmm2
+; CHECK-NEXT:    vpternlogd $114, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    retq
+  %x0scalar = load i32, i32* %x0ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0
+  %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_012_broadcast1_mask(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_012_broadcast1_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $116, (%rdi){1to16}, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %x1scalar = load i32, i32* %x1ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0
+  %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_012_broadcast2_mask(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_012_broadcast2_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $114, (%rdi){1to16}, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %x2scalar = load i32, i32* %x2ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0
+  %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_102_broadcast0_mask(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_102_broadcast0_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $116, (%rdi){1to16}, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %x0scalar = load i32, i32* %x0ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0
+  %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_102_broadcast1_mask(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_102_broadcast1_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpbroadcastd (%rdi), %zmm2
+; CHECK-NEXT:    vpternlogd $114, %zmm1, %zmm0, %zmm2 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    retq
+  %x1scalar = load i32, i32* %x1ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0
+  %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
   ret <16 x i32> %res
 }
+
+define <16 x i32> @vpternlog_v16i32_102_broadcast2_mask(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_102_broadcast2_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $114, (%rdi){1to16}, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %x2scalar = load i32, i32* %x2ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0
+  %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_210_broadcast0_mask(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_210_broadcast0_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $114, (%rdi){1to16}, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %x0scalar = load i32, i32* %x0ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0
+  %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_210_broadcast1_mask(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_210_broadcast1_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $116, (%rdi){1to16}, %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %x1scalar = load i32, i32* %x1ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0
+  %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_210_broadcast2_mask(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_210_broadcast2_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpbroadcastd (%rdi), %zmm2
+; CHECK-NEXT:    vpternlogd $114, %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    retq
+  %x2scalar = load i32, i32* %x2ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0
+  %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_021_broadcast0_mask(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_021_broadcast0_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpbroadcastd (%rdi), %zmm2
+; CHECK-NEXT:    vpternlogd $114, %zmm0, %zmm1, %zmm2 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0
+; CHECK-NEXT:    retq
+  %x0scalar = load i32, i32* %x0ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0
+  %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_021_broadcast1_mask(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_021_broadcast1_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $114, (%rdi){1to16}, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %x1scalar = load i32, i32* %x1ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0
+  %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_021_broadcast2_mask(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_021_broadcast2_mask:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $116, (%rdi){1to16}, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %x2scalar = load i32, i32* %x2ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0
+  %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_012_broadcast0_maskz(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_012_broadcast0_maskz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $46, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %x0scalar = load i32, i32* %x0ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0
+  %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_012_broadcast1_maskz(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_012_broadcast1_maskz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $116, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %x1scalar = load i32, i32* %x1ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0
+  %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_012_broadcast2_maskz(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_012_broadcast2_maskz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $114, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %x2scalar = load i32, i32* %x2ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0
+  %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_102_broadcast0_maskz(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_102_broadcast0_maskz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $116, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %x0scalar = load i32, i32* %x0ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0
+  %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_102_broadcast1_maskz(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_102_broadcast1_maskz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $46, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %x1scalar = load i32, i32* %x1ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0
+  %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_102_broadcast2_maskz(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_102_broadcast2_maskz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $78, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %x2scalar = load i32, i32* %x2ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0
+  %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x1, <16 x i32> %x0, <16 x i32> %x2, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_210_broadcast0_maskz(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_210_broadcast0_maskz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $78, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %x0scalar = load i32, i32* %x0ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0
+  %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_210_broadcast1_maskz(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_210_broadcast1_maskz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $92, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %x1scalar = load i32, i32* %x1ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0
+  %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_210_broadcast2_maskz(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_210_broadcast2_maskz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $58, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %x2scalar = load i32, i32* %x2ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0
+  %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x2, <16 x i32> %x1, <16 x i32> %x0, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_021_broadcast0_maskz(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_021_broadcast0_maskz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $58, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %x0scalar = load i32, i32* %x0ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0
+  %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_021_broadcast1_maskz(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_021_broadcast1_maskz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $114, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %x1scalar = load i32, i32* %x1ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0
+  %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_021_broadcast2_maskz(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_021_broadcast2_maskz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $116, (%rdi){1to16}, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %x2scalar = load i32, i32* %x2ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0
+  %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.maskz.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x2, <16 x i32> %x1, i32 114, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <16 x i32> @vpternlog_v16i32_012_broadcast0_mask1(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_012_broadcast0_mask1:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $92, (%rdi), %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    retq
+  %x0scalar = load i32, i32* %x0ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0
+  %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
+  %mask.cast = bitcast i16 %mask to <16 x i1>
+  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x1
+  ret <16 x i32> %res2
+}
+
+define <16 x i32> @vpternlog_v16i32_012_broadcast0_mask2(i32* %x0ptr, <16 x i32> %x1, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_012_broadcast0_mask2:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $58, (%rdi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %x0scalar = load i32, i32* %x0ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x0scalar, i32 0
+  %x0 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
+  %mask.cast = bitcast i16 %mask to <16 x i1>
+  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x2
+  ret <16 x i32> %res2
+}
+
+define <16 x i32> @vpternlog_v16i32_012_broadcast1_mask2(<16 x i32> %x0, i32* %x1ptr, <16 x i32> %x2, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_012_broadcast1_mask2:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $46, (%rdi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %x1scalar = load i32, i32* %x1ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x1scalar, i32 0
+  %x1 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
+  %mask.cast = bitcast i16 %mask to <16 x i1>
+  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x2
+  ret <16 x i32> %res2
+}
+
+define <16 x i32> @vpternlog_v16i32_012_broadcast2_mask1(<16 x i32> %x0, <16 x i32> %x1, i32* %x2ptr, i16 %mask) {
+; CHECK-LABEL: vpternlog_v16i32_012_broadcast2_mask1:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1
+; CHECK-NEXT:    vpternlogd $78, (%rdi), %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %x2scalar = load i32, i32* %x2ptr
+  %vecinit.i = insertelement <16 x i32> undef, i32 %x2scalar, i32 0
+  %x2 = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 114, i16 -1)
+  %mask.cast = bitcast i16 %mask to <16 x i1>
+  %res2 = select <16 x i1> %mask.cast, <16 x i32> %res, <16 x i32> %x1
+  ret <16 x i32> %res2
+}
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index 507205ceb4f9c5be53c27415e1d947752e1ebcaf..9b4e73a18fc288b0cff37e8176df33d19592a75c 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -992,3 +992,575 @@ define <64 x i8>@test_int_x86_avx512_mask_pshuf_b_512(<64 x i8> %x0, <64 x i8> %
   ret <64 x i8> %res2
 }
 
+
+declare <64 x i8> @llvm.x86.avx512.cvtmask2b.512(i64)
+
+define <64 x i8>@test_int_x86_avx512_cvtmask2b_512(i64 %x0) {
+; AVX512BW-LABEL: test_int_x86_avx512_cvtmask2b_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_cvtmask2b_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <64 x i8> @llvm.x86.avx512.cvtmask2b.512(i64 %x0)
+  ret <64 x i8> %res
+}
+
+declare <32 x i16> @llvm.x86.avx512.cvtmask2w.512(i32)
+
+define <32 x i16>@test_int_x86_avx512_cvtmask2w_512(i32 %x0) {
+; AVX512BW-LABEL: test_int_x86_avx512_cvtmask2w_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k0
+; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_int_x86_avx512_cvtmask2w_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
+; AVX512F-32-NEXT:    vpmovm2w %k0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <32 x i16> @llvm.x86.avx512.cvtmask2w.512(i32 %x0)
+  ret <32 x i16> %res
+}
+define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
+; AVX512BW-LABEL: test_mask_packs_epi32_rr_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi32_rr_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_packs_epi32_rrk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi32_rrk_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_packs_epi32_rrkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi32_rrkz_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
+; AVX512BW-LABEL: test_mask_packs_epi32_rm_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi32_rm_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_packs_epi32_rmk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi32_rmk_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_packs_epi32_rmkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackssdw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi32_rmkz_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
+; AVX512BW-LABEL: test_mask_packs_epi32_rmb_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi32_rmb_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_packs_epi32_rmbk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi32_rmbk_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_packs_epi32_rmbkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackssdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi32_rmbkz_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackssdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
+  ret <32 x i16> %res
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32>, <16 x i32>, <32 x i16>, i32)
+
+define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
+; AVX512BW-LABEL: test_mask_packs_epi16_rr_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi16_rr_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
+; AVX512BW-LABEL: test_mask_packs_epi16_rrk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi16_rrk_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
+; AVX512BW-LABEL: test_mask_packs_epi16_rrkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi16_rrkz_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
+; AVX512BW-LABEL: test_mask_packs_epi16_rm_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi16_rm_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
+; AVX512BW-LABEL: test_mask_packs_epi16_rmk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rsi, %k1
+; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi16_rmk_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_mask_packs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
+; AVX512BW-LABEL: test_mask_packs_epi16_rmkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rsi, %k1
+; AVX512BW-NEXT:    vpacksswb (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packs_epi16_rmkz_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
+  ret <64 x i8> %res
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16>, <32 x i16>, <64 x i8>, i64)
+
+
+define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
+; AVX512BW-LABEL: test_mask_packus_epi32_rr_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packus_epi32_rr_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_packus_epi32_rrk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packus_epi32_rrk_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_packus_epi32_rrkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packus_epi32_rrkz_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
+; AVX512BW-LABEL: test_mask_packus_epi32_rm_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packus_epi32_rm_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_packus_epi32_rmk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packus_epi32_rmk_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_packus_epi32_rmkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackusdw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packus_epi32_rmkz_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %b = load <16 x i32>, <16 x i32>* %ptr_b
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
+; AVX512BW-LABEL: test_mask_packus_epi32_rmb_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packus_epi32_rmb_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_packus_epi32_rmbk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packus_epi32_rmbk_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_packus_epi32_rmbkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpackusdw (%rdi){1to16}, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packus_epi32_rmbkz_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackusdw (%eax){1to16}, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
+  %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
+  ret <32 x i16> %res
+}
+
+declare <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32>, <16 x i32>, <32 x i16>, i32)
+
+define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
+; AVX512BW-LABEL: test_mask_packus_epi16_rr_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packus_epi16_rr_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
+; AVX512BW-LABEL: test_mask_packus_epi16_rrk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packus_epi16_rrk_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
+; AVX512BW-LABEL: test_mask_packus_epi16_rrkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rdi, %k1
+; AVX512BW-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packus_epi16_rrkz_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
+; AVX512BW-LABEL: test_mask_packus_epi16_rm_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packus_epi16_rm_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
+; AVX512BW-LABEL: test_mask_packus_epi16_rmk_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rsi, %k1
+; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packus_epi16_rmk_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
+  ret <64 x i8> %res
+}
+
+define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
+; AVX512BW-LABEL: test_mask_packus_epi16_rmkz_512:
+; AVX512BW:       ## BB#0:
+; AVX512BW-NEXT:    kmovq %rsi, %k1
+; AVX512BW-NEXT:    vpackuswb (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_packus_epi16_rmkz_512:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
+  ret <64 x i8> %res
+}
+
+declare <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16>, <32 x i16>, <64 x i8>, i64)
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index d9a9a2d655b404f39c947152952050113794e6ac..3337f42eb14280e722388c745a45c4ad6959f063 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -660,8 +660,8 @@ define <32 x i16> @test_mask_packs_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0
 ; AVX512F-32-NEXT:    retl
-  %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
-  ret <32 x i16> %res
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+  ret <32 x i16> %1
 }
 
 define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
@@ -678,8 +678,10 @@ define <32 x i16> @test_mask_packs_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <
 ; AVX512F-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm2 {%k1}
 ; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
-  %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
-  ret <32 x i16> %res
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+  ret <32 x i16> %3
 }
 
 define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
@@ -694,8 +696,10 @@ define <32 x i16> @test_mask_packs_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b,
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpackssdw %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-32-NEXT:    retl
-  %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
-  ret <32 x i16> %res
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+  ret <32 x i16> %3
 }
 
 define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
@@ -710,8 +714,8 @@ define <32 x i16> @test_mask_packs_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_
 ; AVX512F-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm0
 ; AVX512F-32-NEXT:    retl
   %b = load <16 x i32>, <16 x i32>* %ptr_b
-  %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
-  ret <32 x i16> %res
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+  ret <32 x i16> %1
 }
 
 define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
@@ -730,8 +734,10 @@ define <32 x i16> @test_mask_packs_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr
 ; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
   %b = load <16 x i32>, <16 x i32>* %ptr_b
-  %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
-  ret <32 x i16> %res
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+  ret <32 x i16> %3
 }
 
 define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
@@ -748,8 +754,10 @@ define <32 x i16> @test_mask_packs_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %pt
 ; AVX512F-32-NEXT:    vpackssdw (%eax), %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-32-NEXT:    retl
   %b = load <16 x i32>, <16 x i32>* %ptr_b
-  %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
-  ret <32 x i16> %res
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+  ret <32 x i16> %3
 }
 
 define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
@@ -766,8 +774,8 @@ define <32 x i16> @test_mask_packs_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
-  ret <32 x i16> %res
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+  ret <32 x i16> %1
 }
 
 define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
@@ -788,8 +796,10 @@ define <32 x i16> @test_mask_packs_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <3
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
-  ret <32 x i16> %res
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+  ret <32 x i16> %3
 }
 
 define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
@@ -808,11 +818,13 @@ define <32 x i16> @test_mask_packs_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
-  ret <32 x i16> %res
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+  ret <32 x i16> %3
 }
 
-declare <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32>, <16 x i32>, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>)
 
 define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512BW-LABEL: test_mask_packs_epi16_rr_512:
@@ -824,8 +836,8 @@ define <64 x i8> @test_mask_packs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0
 ; AVX512F-32-NEXT:    retl
-  %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
-  ret <64 x i8> %res
+  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
+  ret <64 x i8> %1
 }
 
 define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
@@ -838,14 +850,14 @@ define <64 x i8> @test_mask_packs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <6
 ;
 ; AVX512F-32-LABEL: test_mask_packs_epi16_rrk_512:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm2 {%k1}
 ; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
-  %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
-  ret <64 x i8> %res
+  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
+  ret <64 x i8> %3
 }
 
 define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
@@ -857,13 +869,13 @@ define <64 x i8> @test_mask_packs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i
 ;
 ; AVX512F-32-LABEL: test_mask_packs_epi16_rrkz_512:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpacksswb %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-32-NEXT:    retl
-  %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
-  ret <64 x i8> %res
+  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
+  ret <64 x i8> %3
 }
 
 define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
@@ -878,8 +890,8 @@ define <64 x i8> @test_mask_packs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b
 ; AVX512F-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm0
 ; AVX512F-32-NEXT:    retl
   %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
-  ret <64 x i8> %res
+  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
+  ret <64 x i8> %1
 }
 
 define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
@@ -893,15 +905,15 @@ define <64 x i8> @test_mask_packs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_
 ; AVX512F-32-LABEL: test_mask_packs_epi16_rmk_512:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm1 {%k1}
 ; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
   %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
-  ret <64 x i8> %res
+  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
+  ret <64 x i8> %3
 }
 
 define <64 x i8> @test_mask_packs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
@@ -914,17 +926,17 @@ define <64 x i8> @test_mask_packs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr
 ; AVX512F-32-LABEL: test_mask_packs_epi16_rmkz_512:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpacksswb (%eax), %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-32-NEXT:    retl
   %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %res = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
-  ret <64 x i8> %res
+  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
+  ret <64 x i8> %3
 }
 
-declare <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16>, <32 x i16>, <64 x i8>, i64)
+declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>)
 
 
 define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
@@ -937,8 +949,8 @@ define <32 x i16> @test_mask_packus_epi32_rr_512(<16 x i32> %a, <16 x i32> %b) {
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0
 ; AVX512F-32-NEXT:    retl
-  %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
-  ret <32 x i16> %res
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+  ret <32 x i16> %1
 }
 
 define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask) {
@@ -955,8 +967,10 @@ define <32 x i16> @test_mask_packus_epi32_rrk_512(<16 x i32> %a, <16 x i32> %b,
 ; AVX512F-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm2 {%k1}
 ; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
-  %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
-  ret <32 x i16> %res
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+  ret <32 x i16> %3
 }
 
 define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b, i32 %mask) {
@@ -971,8 +985,10 @@ define <32 x i16> @test_mask_packus_epi32_rrkz_512(<16 x i32> %a, <16 x i32> %b,
 ; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpackusdw %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-32-NEXT:    retl
-  %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
-  ret <32 x i16> %res
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+  ret <32 x i16> %3
 }
 
 define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr_b) {
@@ -987,8 +1003,8 @@ define <32 x i16> @test_mask_packus_epi32_rm_512(<16 x i32> %a, <16 x i32>* %ptr
 ; AVX512F-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm0
 ; AVX512F-32-NEXT:    retl
   %b = load <16 x i32>, <16 x i32>* %ptr_b
-  %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
-  ret <32 x i16> %res
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+  ret <32 x i16> %1
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
@@ -1007,8 +1023,10 @@ define <32 x i16> @test_mask_packus_epi32_rmk_512(<16 x i32> %a, <16 x i32>* %pt
 ; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
   %b = load <16 x i32>, <16 x i32>* %ptr_b
-  %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
-  ret <32 x i16> %res
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+  ret <32 x i16> %3
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %ptr_b, i32 %mask) {
@@ -1025,8 +1043,10 @@ define <32 x i16> @test_mask_packus_epi32_rmkz_512(<16 x i32> %a, <16 x i32>* %p
 ; AVX512F-32-NEXT:    vpackusdw (%eax), %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-32-NEXT:    retl
   %b = load <16 x i32>, <16 x i32>* %ptr_b
-  %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
-  ret <32 x i16> %res
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+  ret <32 x i16> %3
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
@@ -1043,8 +1063,8 @@ define <32 x i16> @test_mask_packus_epi32_rmb_512(<16 x i32> %a, i32* %ptr_b) {
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 -1)
-  ret <32 x i16> %res
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+  ret <32 x i16> %1
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <32 x i16> %passThru, i32 %mask) {
@@ -1065,8 +1085,10 @@ define <32 x i16> @test_mask_packus_epi32_rmbk_512(<16 x i32> %a, i32* %ptr_b, <
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> %passThru, i32 %mask)
-  ret <32 x i16> %res
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+  ret <32 x i16> %3
 }
 
 define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b, i32 %mask) {
@@ -1085,11 +1107,13 @@ define <32 x i16> @test_mask_packus_epi32_rmbkz_512(<16 x i32> %a, i32* %ptr_b,
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <16 x i32> undef, i32 %q, i32 0
   %b = shufflevector <16 x i32> %vecinit.i, <16 x i32> undef, <16 x i32> zeroinitializer
-  %res = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a, <16 x i32> %b, <32 x i16> zeroinitializer, i32 %mask)
-  ret <32 x i16> %res
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %a, <16 x i32> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+  ret <32 x i16> %3
 }
 
-declare <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32>, <16 x i32>, <32 x i16>, i32)
+declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>)
 
 define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512BW-LABEL: test_mask_packus_epi16_rr_512:
@@ -1101,8 +1125,8 @@ define <64 x i8> @test_mask_packus_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0
 ; AVX512F-32-NEXT:    retl
-  %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
-  ret <64 x i8> %res
+  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
+  ret <64 x i8> %1
 }
 
 define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask) {
@@ -1115,14 +1139,14 @@ define <64 x i8> @test_mask_packus_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <
 ;
 ; AVX512F-32-LABEL: test_mask_packus_epi16_rrk_512:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm2 {%k1}
 ; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
 ; AVX512F-32-NEXT:    retl
-  %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
-  ret <64 x i8> %res
+  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
+  ret <64 x i8> %3
 }
 
 define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i64 %mask) {
@@ -1134,13 +1158,13 @@ define <64 x i8> @test_mask_packus_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b,
 ;
 ; AVX512F-32-LABEL: test_mask_packus_epi16_rrkz_512:
 ; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpackuswb %zmm1, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-32-NEXT:    retl
-  %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
-  ret <64 x i8> %res
+  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
+  ret <64 x i8> %3
 }
 
 define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
@@ -1155,8 +1179,8 @@ define <64 x i8> @test_mask_packus_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_
 ; AVX512F-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm0
 ; AVX512F-32-NEXT:    retl
   %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 -1)
-  ret <64 x i8> %res
+  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
+  ret <64 x i8> %1
 }
 
 define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <64 x i8> %passThru, i64 %mask) {
@@ -1170,15 +1194,15 @@ define <64 x i8> @test_mask_packus_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr
 ; AVX512F-32-LABEL: test_mask_packus_epi16_rmk_512:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm1 {%k1}
 ; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; AVX512F-32-NEXT:    retl
   %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> %passThru, i64 %mask)
-  ret <64 x i8> %res
+  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> %passThru
+  ret <64 x i8> %3
 }
 
 define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i64 %mask) {
@@ -1191,17 +1215,17 @@ define <64 x i8> @test_mask_packus_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %pt
 ; AVX512F-32-LABEL: test_mask_packus_epi16_rmkz_512:
 ; AVX512F-32:       # BB#0:
 ; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k1
+; AVX512F-32-NEXT:    kmovq {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT:    vpackuswb (%eax), %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-32-NEXT:    retl
   %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %res = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a, <32 x i16> %b, <64 x i8> zeroinitializer, i64 %mask)
-  ret <64 x i8> %res
+  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i64 %mask to <64 x i1>
+  %3 = select <64 x i1> %2, <64 x i8> %1, <64 x i8> zeroinitializer
+  ret <64 x i8> %3
 }
 
-declare <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16>, <32 x i16>, <64 x i8>, i64)
+declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>)
 
 define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512BW-LABEL: test_mask_adds_epi16_rr_512:
@@ -2108,7 +2132,7 @@ declare <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <1
 define <16 x i32>@test_int_x86_avx512_mask_pmaddw_d_512(<32 x i16> %x0, <32 x i16> %x1, <16 x i32> %x2, i16 %x3) {
 ; AVX512BW-LABEL: test_int_x86_avx512_mask_pmaddw_d_512:
 ; AVX512BW:       ## BB#0:
-; AVX512BW-NEXT:    kmovw %edi, %k1
+; AVX512BW-NEXT:    kmovd %edi, %k1
 ; AVX512BW-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm2 {%k1}
 ; AVX512BW-NEXT:    vpmaddwd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpaddd %zmm0, %zmm2, %zmm0
@@ -2271,44 +2295,6 @@ define i32@test_int_x86_avx512_cvtw2mask_512(<32 x i16> %x0) {
     ret i32 %res
 }
 
-declare <64 x i8> @llvm.x86.avx512.cvtmask2b.512(i64)
-
-define <64 x i8>@test_int_x86_avx512_cvtmask2b_512(i64 %x0) {
-; AVX512BW-LABEL: test_int_x86_avx512_cvtmask2b_512:
-; AVX512BW:       ## BB#0:
-; AVX512BW-NEXT:    kmovq %rdi, %k0
-; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_cvtmask2b_512:
-; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
-; AVX512F-32-NEXT:    kunpckdq %k0, %k1, %k0
-; AVX512F-32-NEXT:    vpmovm2b %k0, %zmm0
-; AVX512F-32-NEXT:    retl
-  %res = call <64 x i8> @llvm.x86.avx512.cvtmask2b.512(i64 %x0)
-  ret <64 x i8> %res
-}
-
-declare <32 x i16> @llvm.x86.avx512.cvtmask2w.512(i32)
-
-define <32 x i16>@test_int_x86_avx512_cvtmask2w_512(i32 %x0) {
-; AVX512BW-LABEL: test_int_x86_avx512_cvtmask2w_512:
-; AVX512BW:       ## BB#0:
-; AVX512BW-NEXT:    kmovd %edi, %k0
-; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
-; AVX512BW-NEXT:    retq
-;
-; AVX512F-32-LABEL: test_int_x86_avx512_cvtmask2w_512:
-; AVX512F-32:       # BB#0:
-; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k0
-; AVX512F-32-NEXT:    vpmovm2w %k0, %zmm0
-; AVX512F-32-NEXT:    retl
-  %res = call <32 x i16> @llvm.x86.avx512.cvtmask2w.512(i32 %x0)
-  ret <32 x i16> %res
-}
-
 declare <32 x i16> @llvm.x86.avx512.mask.psrlv32hi(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
 define <32 x i16>@test_int_x86_avx512_mask_psrlv32hi(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll
index 7cd0da9564ffe80b0eb27367b9443b87bd153e34..98b346a2d733b588b4289b2c6026b7bfb11d728d 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics-fast-isel.ll
@@ -24,13 +24,13 @@ define <2 x i64> @test_mm_mask_broadcastb_epi8(<2 x i64> %a0, i16 %a1, <2 x i64>
 ; X32-LABEL: test_mm_mask_broadcastb_epi8:
 ; X32:       # BB#0:
 ; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    kmovd %eax, %k1
 ; X32-NEXT:    vpbroadcastb %xmm1, %xmm0 {%k1}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_mask_broadcastb_epi8:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vpbroadcastb %xmm1, %xmm0 {%k1}
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -46,13 +46,13 @@ define <2 x i64> @test_mm_maskz_broadcastb_epi8(i16 %a0, <2 x i64> %a1) {
 ; X32-LABEL: test_mm_maskz_broadcastb_epi8:
 ; X32:       # BB#0:
 ; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    kmovd %eax, %k1
 ; X32-NEXT:    vpbroadcastb %xmm0, %xmm0 {%k1} {z}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_maskz_broadcastb_epi8:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vpbroadcastb %xmm0, %xmm0 {%k1} {z}
 ; X64-NEXT:    retq
   %arg0 = bitcast i16 %a0 to <16 x i1>
@@ -142,13 +142,13 @@ define <2 x i64> @test_mm_mask_broadcastw_epi16(<2 x i64> %a0, i8 %a1, <2 x i64>
 ; X32-LABEL: test_mm_mask_broadcastw_epi16:
 ; X32:       # BB#0:
 ; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    kmovd %eax, %k1
 ; X32-NEXT:    vpbroadcastw %xmm1, %xmm0 {%k1}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_mask_broadcastw_epi16:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vpbroadcastw %xmm1, %xmm0 {%k1}
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <8 x i16>
@@ -164,13 +164,13 @@ define <2 x i64> @test_mm_maskz_broadcastw_epi16(i8 %a0, <2 x i64> %a1) {
 ; X32-LABEL: test_mm_maskz_broadcastw_epi16:
 ; X32:       # BB#0:
 ; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
-; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    kmovd %eax, %k1
 ; X32-NEXT:    vpbroadcastw %xmm0, %xmm0 {%k1} {z}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_maskz_broadcastw_epi16:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vpbroadcastw %xmm0, %xmm0 {%k1} {z}
 ; X64-NEXT:    retq
   %arg0 = bitcast i8 %a0 to <8 x i1>
@@ -201,13 +201,13 @@ define <4 x i64> @test_mm256_mask_broadcastw_epi16(<4 x i64> %a0, i16 %a1, <2 x
 ; X32-LABEL: test_mm256_mask_broadcastw_epi16:
 ; X32:       # BB#0:
 ; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    kmovd %eax, %k1
 ; X32-NEXT:    vpbroadcastw %xmm1, %ymm0 {%k1}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_mask_broadcastw_epi16:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vpbroadcastw %xmm1, %ymm0 {%k1}
 ; X64-NEXT:    retq
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
@@ -223,13 +223,13 @@ define <4 x i64> @test_mm256_maskz_broadcastw_epi16(i16 %a0, <2 x i64> %a1) {
 ; X32-LABEL: test_mm256_maskz_broadcastw_epi16:
 ; X32:       # BB#0:
 ; X32-NEXT:    movw {{[0-9]+}}(%esp), %ax
-; X32-NEXT:    kmovw %eax, %k1
+; X32-NEXT:    kmovd %eax, %k1
 ; X32-NEXT:    vpbroadcastw %xmm0, %ymm0 {%k1} {z}
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_maskz_broadcastw_epi16:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vpbroadcastw %xmm0, %ymm0 {%k1} {z}
 ; X64-NEXT:    retq
   %arg0 = bitcast i16 %a0 to <16 x i1>
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
index 0a8e1445be88bf4b7e0af370a6084e669ea6fcf4..7df07b0413ed4c9a94970df6a7966c338adc68a6 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -27,7 +27,7 @@ define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x
 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastb_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8]
 ; CHECK-NEXT:    vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9]
@@ -47,7 +47,7 @@ define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16>
 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8]
 ; CHECK-NEXT:    vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
 ; CHECK-NEXT:    vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9]
@@ -67,7 +67,7 @@ define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x
 ; CHECK-LABEL: test_int_x86_avx512_pbroadcastw_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8]
 ; CHECK-NEXT:    vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9]
@@ -126,7 +126,7 @@ declare void @llvm.x86.avx512.mask.storeu.b.128(i8*, <16 x i8>, i16)
 define void@test_int_x86_avx512_mask_storeu_b_128(i8* %ptr1, i8* %ptr2, <16 x i8> %x1, i16 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_storeu_b_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT:    kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
 ; CHECK-NEXT:    vmovdqu8 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x7f,0x07]
 ; CHECK-NEXT:    vmovdqu %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x06]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -154,7 +154,7 @@ declare void @llvm.x86.avx512.mask.storeu.w.128(i8*, <8 x i16>, i8)
 define void@test_int_x86_avx512_mask_storeu_w_128(i8* %ptr1, i8* %ptr2, <8 x i16> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_storeu_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT:    kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
 ; CHECK-NEXT:    vmovdqu16 %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x7f,0x07]
 ; CHECK-NEXT:    vmovdqu %xmm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7f,0x06]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -168,7 +168,7 @@ declare void @llvm.x86.avx512.mask.storeu.w.256(i8*, <16 x i16>, i16)
 define void@test_int_x86_avx512_mask_storeu_w_256(i8* %ptr1, i8* %ptr2, <16 x i16> %x1, i16 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_storeu_w_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT:    kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
 ; CHECK-NEXT:    vmovdqu16 %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x7f,0x07]
 ; CHECK-NEXT:    vmovdqu %ymm0, (%rsi) ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x7f,0x06]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -183,7 +183,7 @@ define <8 x i16>@test_int_x86_avx512_mask_loadu_w_128(i8* %ptr, i8* %ptr2, <8 x
 ; CHECK-LABEL: test_int_x86_avx512_mask_loadu_w_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07]
-; CHECK-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT:    kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
 ; CHECK-NEXT:    vmovdqu16 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x6f,0x06]
 ; CHECK-NEXT:    vmovdqu16 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x6f,0x0f]
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
@@ -201,7 +201,7 @@ define <16 x i16>@test_int_x86_avx512_mask_loadu_w_256(i8* %ptr, i8* %ptr2, <16
 ; CHECK-LABEL: test_int_x86_avx512_mask_loadu_w_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vmovdqu (%rdi), %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x6f,0x07]
-; CHECK-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT:    kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
 ; CHECK-NEXT:    vmovdqu16 (%rsi), %ymm0 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x6f,0x06]
 ; CHECK-NEXT:    vmovdqu16 (%rdi), %ymm1 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x6f,0x0f]
 ; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1]
@@ -219,7 +219,7 @@ define <16 x i8>@test_int_x86_avx512_mask_loadu_b_128(i8* %ptr, i8* %ptr2, <16 x
 ; CHECK-LABEL: test_int_x86_avx512_mask_loadu_b_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vmovdqu (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x07]
-; CHECK-NEXT:    kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca]
+; CHECK-NEXT:    kmovd %edx, %k1 ## encoding: [0xc5,0xfb,0x92,0xca]
 ; CHECK-NEXT:    vmovdqu8 (%rsi), %xmm0 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x6f,0x06]
 ; CHECK-NEXT:    vmovdqu8 (%rdi), %xmm1 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x6f,0x0f]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
@@ -256,7 +256,7 @@ define <16 x i8>@test_int_x86_avx512_mask_palignr_128(<16 x i8> %x0, <16 x i8> %
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpalignr $2, %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x0f,0xd9,0x02]
 ; CHECK-NEXT:    ## xmm3 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpalignr $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x0f,0xd1,0x02]
 ; CHECK-NEXT:    ## xmm2 {%k1} = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1]
 ; CHECK-NEXT:    vpalignr $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x0f,0xc1,0x02]
@@ -302,7 +302,7 @@ define <8 x i16>@test_int_x86_avx512_mask_pshufh_w_128(<8 x i16> %x0, i32 %x1, <
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpshufhw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x70,0xd0,0x03]
 ; CHECK-NEXT:    ## xmm2 = xmm0[0,1,2,3,7,4,4,4]
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpshufhw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x70,0xc8,0x03]
 ; CHECK-NEXT:    ## xmm1 {%k1} = xmm0[0,1,2,3,7,4,4,4]
 ; CHECK-NEXT:    vpshufhw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x70,0xc0,0x03]
@@ -325,7 +325,7 @@ define <16 x i16>@test_int_x86_avx512_mask_pshufh_w_256(<16 x i16> %x0, i32 %x1,
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpshufhw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfe,0x70,0xd0,0x03]
 ; CHECK-NEXT:    ## ymm2 = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12]
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpshufhw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x70,0xc8,0x03]
 ; CHECK-NEXT:    ## ymm1 {%k1} = ymm0[0,1,2,3,7,4,4,4,8,9,10,11,15,12,12,12]
 ; CHECK-NEXT:    vpshufhw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x70,0xc0,0x03]
@@ -348,7 +348,7 @@ define <8 x i16>@test_int_x86_avx512_mask_pshufl_w_128(<8 x i16> %x0, i32 %x1, <
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpshuflw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x70,0xd0,0x03]
 ; CHECK-NEXT:    ## xmm2 = xmm0[3,0,0,0,4,5,6,7]
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpshuflw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7f,0x09,0x70,0xc8,0x03]
 ; CHECK-NEXT:    ## xmm1 {%k1} = xmm0[3,0,0,0,4,5,6,7]
 ; CHECK-NEXT:    vpshuflw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0x89,0x70,0xc0,0x03]
@@ -371,7 +371,7 @@ define <16 x i16>@test_int_x86_avx512_mask_pshufl_w_256(<16 x i16> %x0, i32 %x1,
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpshuflw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xff,0x70,0xd0,0x03]
 ; CHECK-NEXT:    ## ymm2 = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15]
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpshuflw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7f,0x29,0x70,0xc8,0x03]
 ; CHECK-NEXT:    ## ymm1 {%k1} = ymm0[3,0,0,0,4,5,6,7,11,8,8,8,12,13,14,15]
 ; CHECK-NEXT:    vpshuflw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7f,0xa9,0x70,0xc0,0x03]
@@ -414,7 +414,8 @@ define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK-LABEL: test_pcmpeq_w_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x75,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
   ret i16 %res
@@ -423,9 +424,10 @@ define i16 @test_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b) {
 define i16 @test_mask_pcmpeq_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_pcmpeq_w_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x75,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.mask.pcmpeq.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
   ret i16 %res
@@ -460,7 +462,8 @@ define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK-LABEL: test_pcmpgt_w_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x65,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 -1)
   ret i16 %res
@@ -469,9 +472,10 @@ define i16 @test_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b) {
 define i16 @test_mask_pcmpgt_w_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_pcmpgt_w_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x65,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.mask.pcmpgt.w.256(<16 x i16> %a, <16 x i16> %b, i16 %mask)
   ret i16 %res
@@ -486,7 +490,7 @@ define <16 x i8>@test_int_x86_avx512_mask_punpckhb_w_128(<16 x i8> %x0, <16 x i8
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpunpckhbw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x68,0xd9]
 ; CHECK-NEXT:    ## xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpunpckhbw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x68,0xd1]
 ; CHECK-NEXT:    ## xmm2 {%k1} = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
 ; CHECK-NEXT:    vpaddb %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3]
@@ -504,7 +508,7 @@ define <16 x i8>@test_int_x86_avx512_mask_punpcklb_w_128(<16 x i8> %x0, <16 x i8
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpunpcklbw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x60,0xd9]
 ; CHECK-NEXT:    ## xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpunpcklbw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x60,0xd1]
 ; CHECK-NEXT:    ## xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; CHECK-NEXT:    vpaddb %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3]
@@ -558,7 +562,7 @@ define <8 x i16>@test_int_x86_avx512_mask_punpcklw_d_128(<8 x i16> %x0, <8 x i16
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpunpcklwd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x61,0xd9]
 ; CHECK-NEXT:    ## xmm3 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpunpcklwd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x61,0xd1]
 ; CHECK-NEXT:    ## xmm2 {%k1} = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; CHECK-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
@@ -576,7 +580,7 @@ define <8 x i16>@test_int_x86_avx512_mask_punpckhw_d_128(<8 x i16> %x0, <8 x i16
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpunpckhwd %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x69,0xd9]
 ; CHECK-NEXT:    ## xmm3 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpunpckhwd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x69,0xd1]
 ; CHECK-NEXT:    ## xmm2 {%k1} = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; CHECK-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
@@ -594,7 +598,7 @@ define <16 x i16>@test_int_x86_avx512_mask_punpcklw_d_256(<16 x i16> %x0, <16 x
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpunpcklwd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x61,0xd9]
 ; CHECK-NEXT:    ## ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpunpcklwd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x61,0xd1]
 ; CHECK-NEXT:    ## ymm2 {%k1} = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
 ; CHECK-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
@@ -612,7 +616,7 @@ define <16 x i16>@test_int_x86_avx512_mask_punpckhw_d_256(<16 x i16> %x0, <16 x
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpunpckhwd %ymm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x69,0xd9]
 ; CHECK-NEXT:    ## ymm3 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpunpckhwd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x69,0xd1]
 ; CHECK-NEXT:    ## ymm2 {%k1} = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15]
 ; CHECK-NEXT:    vpaddw %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc3]
@@ -635,7 +639,7 @@ define <8 x i16> @test_mask_add_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @test_mask_add_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_add_epi16_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0xd1]
 ; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -646,7 +650,7 @@ define <8 x i16> @test_mask_add_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i
 define <8 x i16> @test_mask_add_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_add_epi16_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.padd.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
@@ -666,7 +670,7 @@ define <8 x i16> @test_mask_add_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
 define <8 x i16> @test_mask_add_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_add_epi16_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpaddw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xfd,0x0f]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -678,7 +682,7 @@ define <8 x i16> @test_mask_add_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <
 define <8 x i16> @test_mask_add_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_add_epi16_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpaddw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xfd,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i16>, <8 x i16>* %ptr_b
@@ -700,7 +704,7 @@ define <16 x i16> @test_mask_add_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
 define <16 x i16> @test_mask_add_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_add_epi16_rrk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0xd1]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -711,7 +715,7 @@ define <16 x i16> @test_mask_add_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16
 define <16 x i16> @test_mask_add_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_add_epi16_rrkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpaddw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.padd.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
@@ -731,7 +735,7 @@ define <16 x i16> @test_mask_add_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b)
 define <16 x i16> @test_mask_add_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_add_epi16_rmk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpaddw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xfd,0x0f]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -743,7 +747,7 @@ define <16 x i16> @test_mask_add_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b
 define <16 x i16> @test_mask_add_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
 ; CHECK-LABEL: test_mask_add_epi16_rmkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpaddw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xfd,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i16>, <16 x i16>* %ptr_b
@@ -765,7 +769,7 @@ define <8 x i16> @test_mask_sub_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @test_mask_sub_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi16_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsubw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf9,0xd1]
 ; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -776,7 +780,7 @@ define <8 x i16> @test_mask_sub_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i
 define <8 x i16> @test_mask_sub_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi16_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsubw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.psub.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
@@ -796,7 +800,7 @@ define <8 x i16> @test_mask_sub_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
 define <8 x i16> @test_mask_sub_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi16_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsubw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf9,0x0f]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -808,7 +812,7 @@ define <8 x i16> @test_mask_sub_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <
 define <8 x i16> @test_mask_sub_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi16_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsubw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf9,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i16>, <8 x i16>* %ptr_b
@@ -830,7 +834,7 @@ define <16 x i16> @test_mask_sub_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
 define <16 x i16> @test_mask_sub_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi16_rrk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0xd1]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -841,7 +845,7 @@ define <16 x i16> @test_mask_sub_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16
 define <16 x i16> @test_mask_sub_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi16_rrkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsubw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.psub.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
@@ -861,7 +865,7 @@ define <16 x i16> @test_mask_sub_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b)
 define <16 x i16> @test_mask_sub_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi16_rmk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsubw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf9,0x0f]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -873,7 +877,7 @@ define <16 x i16> @test_mask_sub_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b
 define <16 x i16> @test_mask_sub_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
 ; CHECK-LABEL: test_mask_sub_epi16_rmkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsubw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf9,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i16>, <16 x i16>* %ptr_b
@@ -1090,7 +1094,7 @@ define <8 x i16> @test_mask_mullo_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @test_mask_mullo_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi16_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmullw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd5,0xd1]
 ; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1101,7 +1105,7 @@ define <8 x i16> @test_mask_mullo_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x
 define <8 x i16> @test_mask_mullo_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi16_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmullw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.pmull.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
@@ -1121,7 +1125,7 @@ define <8 x i16> @test_mask_mullo_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b)
 define <8 x i16> @test_mask_mullo_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi16_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpmullw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd5,0x0f]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1133,7 +1137,7 @@ define <8 x i16> @test_mask_mullo_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b,
 define <8 x i16> @test_mask_mullo_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi16_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpmullw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd5,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i16>, <8 x i16>* %ptr_b
@@ -1155,7 +1159,7 @@ define <16 x i16> @test_mask_mullo_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
 define <16 x i16> @test_mask_mullo_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi16_rrk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmullw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0xd1]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1166,7 +1170,7 @@ define <16 x i16> @test_mask_mullo_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <
 define <16 x i16> @test_mask_mullo_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi16_rrkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmullw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.pmull.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
@@ -1186,7 +1190,7 @@ define <16 x i16> @test_mask_mullo_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_
 define <16 x i16> @test_mask_mullo_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi16_rmk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpmullw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd5,0x0f]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1198,7 +1202,7 @@ define <16 x i16> @test_mask_mullo_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr
 define <16 x i16> @test_mask_mullo_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi16_rmkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpmullw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd5,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i16>, <16 x i16>* %ptr_b
@@ -1213,7 +1217,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmaxs.b.128(<16 x i8>, <16 x i8>, <16 x
 define <16 x i8>@test_int_x86_avx512_mask_pmaxs_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_b_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3c,0xd1]
 ; CHECK-NEXT:    vpmaxsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x3c,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
@@ -1246,7 +1250,7 @@ define <8 x i16>@test_int_x86_avx512_mask_pmaxs_w_128(<8 x i16> %x0, <8 x i16> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_w_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xee,0xd9]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmaxsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xee,0xd1]
 ; CHECK-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1261,7 +1265,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmaxs.w.256(<16 x i16>, <16 x i16>, <16
 define <16 x i16>@test_int_x86_avx512_mask_pmaxs_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxs_w_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xee,0xd1]
 ; CHECK-NEXT:    vpmaxsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xee,0xc1]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
@@ -1277,7 +1281,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmaxu.b.128(<16 x i8>, <16 x i8>, <16 x
 define <16 x i8>@test_int_x86_avx512_mask_pmaxu_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2,i16 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_b_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmaxub %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xde,0xd1]
 ; CHECK-NEXT:    vpmaxub %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xde,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
@@ -1310,7 +1314,7 @@ define <8 x i16>@test_int_x86_avx512_mask_pmaxu_w_128(<8 x i16> %x0, <8 x i16> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_w_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3e,0xd9]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmaxuw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3e,0xd1]
 ; CHECK-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1325,7 +1329,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmaxu.w.256(<16 x i16>, <16 x i16>, <16
 define <16 x i16>@test_int_x86_avx512_mask_pmaxu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaxu_w_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3e,0xd1]
 ; CHECK-NEXT:    vpmaxuw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x3e,0xc1]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
@@ -1341,7 +1345,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmins.b.128(<16 x i8>, <16 x i8>, <16 x
 define <16 x i8>@test_int_x86_avx512_mask_pmins_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmins_b_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpminsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x38,0xd1]
 ; CHECK-NEXT:    vpminsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x38,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
@@ -1374,7 +1378,7 @@ define <8 x i16>@test_int_x86_avx512_mask_pmins_w_128(<8 x i16> %x0, <8 x i16> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmins_w_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpminsw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xea,0xd9]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpminsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xea,0xd1]
 ; CHECK-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1389,7 +1393,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmins.w.256(<16 x i16>, <16 x i16>, <16
 define <16 x i16>@test_int_x86_avx512_mask_pmins_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmins_w_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpminsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xea,0xd1]
 ; CHECK-NEXT:    vpminsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xea,0xc1]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
@@ -1405,7 +1409,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pminu.b.128(<16 x i8>, <16 x i8>, <16 x
 define <16 x i8>@test_int_x86_avx512_mask_pminu_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pminu_b_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpminub %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xda,0xd1]
 ; CHECK-NEXT:    vpminub %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xda,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
@@ -1438,7 +1442,7 @@ define <8 x i16>@test_int_x86_avx512_mask_pminu_w_128(<8 x i16> %x0, <8 x i16> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_pminu_w_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpminuw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x3a,0xd9]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpminuw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x3a,0xd1]
 ; CHECK-NEXT:    vpaddw %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1453,7 +1457,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pminu.w.256(<16 x i16>, <16 x i16>, <16
 define <16 x i16>@test_int_x86_avx512_mask_pminu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pminu_w_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpminuw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x3a,0xd1]
 ; CHECK-NEXT:    vpminuw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x3a,0xc1]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
@@ -1470,7 +1474,7 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_w_128(<8 x i16> %x0, <8 x i16> %x
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_w_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpsrlw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xd9]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1]
 ; CHECK-NEXT:    vpsrlw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xcb]
@@ -1490,7 +1494,7 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_w_256(<16 x i16> %x0, <8 x i16>
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_w_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xd9]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1]
 ; CHECK-NEXT:    vpsrlw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xc1]
 ; CHECK-NEXT:    vpaddw %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xcb]
@@ -1510,7 +1514,7 @@ define <8 x i16>@test_int_x86_avx512_mask_psra_w_128(<8 x i16> %x0, <8 x i16> %x
 ; CHECK-LABEL: test_int_x86_avx512_mask_psra_w_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpsraw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe1,0xd9]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsraw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe1,0xd1]
 ; CHECK-NEXT:    vpsraw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe1,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
@@ -1530,7 +1534,7 @@ define <16 x i16>@test_int_x86_avx512_mask_psra_w_256(<16 x i16> %x0, <8 x i16>
 ; CHECK-LABEL: test_int_x86_avx512_mask_psra_w_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpsraw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe1,0xd9]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsraw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe1,0xd1]
 ; CHECK-NEXT:    vpsraw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe1,0xc1]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
@@ -1550,7 +1554,7 @@ define <8 x i16>@test_int_x86_avx512_mask_psll_w_128(<8 x i16> %x0, <8 x i16> %x
 ; CHECK-LABEL: test_int_x86_avx512_mask_psll_w_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpsllw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf1,0xd9]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsllw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf1,0xd1]
 ; CHECK-NEXT:    vpsllw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xf1,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
@@ -1570,7 +1574,7 @@ define <16 x i16>@test_int_x86_avx512_mask_psll_w_256(<16 x i16> %x0, <8 x i16>
 ; CHECK-LABEL: test_int_x86_avx512_mask_psll_w_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpsllw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf1,0xd9]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsllw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf1,0xd1]
 ; CHECK-NEXT:    vpsllw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xf1,0xc1]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
@@ -1590,7 +1594,7 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i32 %x1, <8
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_wi_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpsrlw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x03]
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03]
 ; CHECK-NEXT:    vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03]
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca]
@@ -1610,7 +1614,7 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i32 %x1,
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrl_wi_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpsrlw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x03]
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsrlw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03]
 ; CHECK-NEXT:    vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03]
 ; CHECK-NEXT:    vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca]
@@ -1630,7 +1634,7 @@ define <8 x i16>@test_int_x86_avx512_mask_psra_wi_128(<8 x i16> %x0, i32 %x1, <8
 ; CHECK-LABEL: test_int_x86_avx512_mask_psra_wi_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpsraw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xe0,0x03]
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsraw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xe0,0x03]
 ; CHECK-NEXT:    vpsraw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xe0,0x03]
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
@@ -1650,7 +1654,7 @@ define <16 x i16>@test_int_x86_avx512_mask_psra_wi_256(<16 x i16> %x0, i32 %x1,
 ; CHECK-LABEL: test_int_x86_avx512_mask_psra_wi_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpsraw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xe0,0x03]
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsraw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xe0,0x03]
 ; CHECK-NEXT:    vpsraw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xe0,0x03]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
@@ -1670,7 +1674,7 @@ define <8 x i16>@test_int_x86_avx512_mask_psll_wi_128(<8 x i16> %x0, i32 %x1, <8
 ; CHECK-LABEL: test_int_x86_avx512_mask_psll_wi_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpsllw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xf0,0x03]
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsllw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xf0,0x03]
 ; CHECK-NEXT:    vpsllw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xf0,0x03]
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
@@ -1690,7 +1694,7 @@ define <16 x i16>@test_int_x86_avx512_mask_psll_wi_256(<16 x i16> %x0, i32 %x1,
 ; CHECK-LABEL: test_int_x86_avx512_mask_psll_wi_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpsllw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xf0,0x03]
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsllw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xf0,0x03]
 ; CHECK-NEXT:    vpsllw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xf0,0x03]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
@@ -1710,7 +1714,7 @@ define <16 x i8>@test_int_x86_avx512_mask_pshuf_b_128(<16 x i8> %x0, <16 x i8> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_pshuf_b_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpshufb %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0xd9]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpshufb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x00,0xd1]
 ; CHECK-NEXT:    vpaddb %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1743,7 +1747,7 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovzxb_w_128(<16 x i8> %x0, <8 x i16>
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpmovzxbw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x30,0xd0]
 ; CHECK-NEXT:    ## xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmovzxbw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x30,0xc8]
 ; CHECK-NEXT:    ## xmm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; CHECK-NEXT:    vpmovzxbw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x30,0xc0]
@@ -1766,7 +1770,7 @@ define <16 x i16>@test_int_x86_avx512_mask_pmovzxb_w_256(<16 x i8> %x0, <16 x i1
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpmovzxbw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x30,0xd0]
 ; CHECK-NEXT:    ## ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmovzxbw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x30,0xc8]
 ; CHECK-NEXT:    ## ymm1 {%k1} = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
 ; CHECK-NEXT:    vpmovzxbw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x30,0xc0]
@@ -1789,7 +1793,7 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovsxb_w_128(<16 x i8> %x0, <8 x i16>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_w_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpmovsxbw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x20,0xd0]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmovsxbw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x20,0xc8]
 ; CHECK-NEXT:    vpmovsxbw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x20,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
@@ -1809,7 +1813,7 @@ define <16 x i16>@test_int_x86_avx512_mask_pmovsxb_w_256(<16 x i8> %x0, <16 x i1
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxb_w_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x20,0xd0]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x20,0xc8]
 ; CHECK-NEXT:    vpmovsxbw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x20,0xc0]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
@@ -1829,7 +1833,7 @@ define <2 x i64>@test_int_x86_avx512_mask_pmovsxd_q_128(<4 x i32> %x0, <2 x i64>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpmovsxdq %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x25,0xd0]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmovsxdq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x25,0xc8]
 ; CHECK-NEXT:    vpmovsxdq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x25,0xc0]
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
@@ -1849,7 +1853,7 @@ define <4 x i64>@test_int_x86_avx512_mask_pmovsxd_q_256(<4 x i32> %x0, <4 x i64>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovsxd_q_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x25,0xd0]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x25,0xc8]
 ; CHECK-NEXT:    vpmovsxdq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x25,0xc0]
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
@@ -1863,3 +1867,728 @@ define <4 x i64>@test_int_x86_avx512_mask_pmovsxd_q_256(<4 x i32> %x0, <4 x i64>
   ret <4 x i64> %res4
 }
 
+
+declare <16 x i8> @llvm.x86.avx512.cvtmask2b.128(i16)
+
+define <16 x i8>@test_int_x86_avx512_cvtmask2b_128(i16 %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtmask2b_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k0 ## encoding: [0xc5,0xfb,0x92,0xc7]
+; CHECK-NEXT:    vpmovm2b %k0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x28,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.avx512.cvtmask2b.128(i16 %x0)
+  ret <16 x i8> %res
+}
+
+declare <32 x i8> @llvm.x86.avx512.cvtmask2b.256(i32)
+
+define <32 x i8>@test_int_x86_avx512_cvtmask2b_256(i32 %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtmask2b_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k0 ## encoding: [0xc5,0xfb,0x92,0xc7]
+; CHECK-NEXT:    vpmovm2b %k0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x28,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx512.cvtmask2b.256(i32 %x0)
+  ret <32 x i8> %res
+}
+
+declare <8 x i16> @llvm.x86.avx512.cvtmask2w.128(i8)
+
+define <8 x i16>@test_int_x86_avx512_cvtmask2w_128(i8 %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtmask2w_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k0 ## encoding: [0xc5,0xfb,0x92,0xc7]
+; CHECK-NEXT:    vpmovm2w %k0, %xmm0 ## encoding: [0x62,0xf2,0xfe,0x08,0x28,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.cvtmask2w.128(i8 %x0)
+  ret <8 x i16> %res
+}
+
+declare <16 x i16> @llvm.x86.avx512.cvtmask2w.256(i16)
+
+define <16 x i16>@test_int_x86_avx512_cvtmask2w_256(i16 %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtmask2w_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k0 ## encoding: [0xc5,0xfb,0x92,0xc7]
+; CHECK-NEXT:    vpmovm2w %k0, %ymm0 ## encoding: [0x62,0xf2,0xfe,0x28,0x28,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.cvtmask2w.256(i16 %x0)
+  ret <16 x i16> %res
+}
+define <8 x i16> @test_mask_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_mask_packs_epi32_rr_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_packs_epi32_rrk_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackssdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_packs_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_packs_epi32_rrkz_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_packs_epi32_rm_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <4 x i32>, <4 x i32>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_packs_epi32_rmk_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackssdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <4 x i32>, <4 x i32>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_packs_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_packs_epi32_rmkz_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <4 x i32>, <4 x i32>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_packs_epi32_rmb_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpackssdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x18,0x6b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_packs_epi32_rmbk_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackssdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_packs_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_packs_epi32_rmbkz_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackssdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32>, <4 x i32>, <8 x i16>, i8)
+
+define <16 x i16> @test_mask_packs_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_mask_packs_epi32_rr_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_packs_epi32_rrk_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_packs_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_packs_epi32_rrkz_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_packs_epi32_rm_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i32>, <8 x i32>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_packs_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_packs_epi32_rmk_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackssdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i32>, <8 x i32>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_packs_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_packs_epi32_rmkz_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i32>, <8 x i32>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_packs_epi32_rmb_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpackssdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x38,0x6b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_packs_epi32_rmbk_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackssdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0x6b,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_packs_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_packs_epi32_rmbkz_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackssdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0x6b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32>, <8 x i32>, <16 x i16>, i16)
+
+define <16 x i8> @test_mask_packs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_mask_packs_epi16_rr_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_packs_epi16_rrk_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpacksswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_packs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_packs_epi16_rrkz_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_packs_epi16_rm_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_packs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_packs_epi16_rmk_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpacksswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_packs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_packs_epi16_rmkz_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
+  ret <16 x i8> %res
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16>, <8 x i16>, <16 x i8>, i16)
+
+define <32 x i8> @test_mask_packs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_mask_packs_epi16_rr_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_packs_epi16_rrk_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_packs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_packs_epi16_rrkz_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_packs_epi16_rm_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_packs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_packs_epi16_rmk_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpacksswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_packs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_mask_packs_epi16_rmkz_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %res
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16>, <16 x i16>, <32 x i8>, i32)
+
+
+define <8 x i16> @test_mask_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: test_mask_packus_epi32_rr_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_packus_epi32_rrk_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_packus_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_packus_epi32_rrkz_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_packus_epi32_rm_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpackusdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <4 x i32>, <4 x i32>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_packus_epi32_rmk_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackusdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <4 x i32>, <4 x i32>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_packus_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_packus_epi32_rmkz_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackusdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <4 x i32>, <4 x i32>* %ptr_b
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_packus_epi32_rmb_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpackusdw (%rdi){1to4}, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x18,0x2b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_packus_epi32_rmbk_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackusdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0x2b,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
+  ret <8 x i16> %res
+}
+
+define <8 x i16> @test_mask_packus_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
+; CHECK-LABEL: test_mask_packus_epi32_rmbkz_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackusdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x99,0x2b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
+  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
+  ret <8 x i16> %res
+}
+
+declare <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32>, <4 x i32>, <8 x i16>, i8)
+
+define <16 x i16> @test_mask_packus_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: test_mask_packus_epi32_rr_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_packus_epi32_rrk_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_packus_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_packus_epi32_rrkz_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
+; CHECK-LABEL: test_mask_packus_epi32_rm_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i32>, <8 x i32>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_packus_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_packus_epi32_rmk_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackusdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i32>, <8 x i32>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_packus_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_packus_epi32_rmkz_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i32>, <8 x i32>* %ptr_b
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
+; CHECK-LABEL: test_mask_packus_epi32_rmb_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpackusdw (%rdi){1to8}, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x38,0x2b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_packus_epi32_rmbk_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackusdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x39,0x2b,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
+  ret <16 x i16> %res
+}
+
+define <16 x i16> @test_mask_packus_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_packus_epi32_rmbkz_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackusdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xb9,0x2b,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load i32, i32* %ptr_b
+  %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
+  %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
+  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
+  ret <16 x i16> %res
+}
+
+declare <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32>, <8 x i32>, <16 x i16>, i16)
+
+define <16 x i8> @test_mask_packus_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_mask_packus_epi16_rr_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_packus_epi16_rrk_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0xd1]
+; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_packus_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_packus_epi16_rrkz_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_packus_epi16_rm_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_packus_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
+; CHECK-LABEL: test_mask_packus_epi16_rmk_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackuswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0x0f]
+; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
+  ret <16 x i8> %res
+}
+
+define <16 x i8> @test_mask_packus_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
+; CHECK-LABEL: test_mask_packus_epi16_rmkz_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
+  ret <16 x i8> %res
+}
+
+declare <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16>, <8 x i16>, <16 x i8>, i16)
+
+define <32 x i8> @test_mask_packus_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
+; CHECK-LABEL: test_mask_packus_epi16_rr_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_packus_epi16_rrk_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0xd1]
+; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_packus_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
+; CHECK-LABEL: test_mask_packus_epi16_rrkz_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_packus_epi16_rm_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_packus_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
+; CHECK-LABEL: test_mask_packus_epi16_rmk_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackuswb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0x0f]
+; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
+  ret <32 x i8> %res
+}
+
+define <32 x i8> @test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
+; CHECK-LABEL: test_mask_packus_epi16_rmkz_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %b = load <16 x i16>, <16 x i16>* %ptr_b
+  %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
+  ret <32 x i8> %res
+}
+
+declare <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16>, <16 x i16>, <32 x i8>, i32)
+
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics.ll b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
index 7a9d7d7885fffea6e52bf8c483701be9d9e18ddf..1d0a3be069432c0e07269bfa67adc5663b6ef9b6 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics.ll
@@ -204,29 +204,29 @@ declare i32 @llvm.x86.avx512.mask.ucmp.b.256(<32 x i8>, <32 x i8>, i32, i32) nou
 define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
 ; CHECK-LABEL: test_cmp_w_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd9,0x00]
-; CHECK-NEXT:    vpcmpltw %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xe1,0x01]
-; CHECK-NEXT:    vpcmplew %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xe9,0x02]
-; CHECK-NEXT:    vpcmpunordw %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xf1,0x03]
-; CHECK-NEXT:    vpcmpneqw %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xf9,0x04]
-; CHECK-NEXT:    vpcmpnltw %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnlew %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x07]
-; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
+; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltw %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xc9,0x01]
+; CHECK-NEXT:    vpcmplew %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd1,0x02]
+; CHECK-NEXT:    vpcmpunordw %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xd9,0x03]
+; CHECK-NEXT:    vpcmpneqw %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xe1,0x04]
+; CHECK-NEXT:    vpcmpnltw %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xe9,0x05]
+; CHECK-NEXT:    vpcmpnlew %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xf1,0x06]
+; CHECK-NEXT:    vpcmpordw %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x3f,0xf9,0x07]
+; CHECK-NEXT:    kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
 ; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
-; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT:    kmovd %k2, %eax ## encoding: [0xc5,0xfb,0x93,0xc2]
 ; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovd %k3, %eax ## encoding: [0xc5,0xfb,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x03]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    kmovd %k4, %eax ## encoding: [0xc5,0xfb,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovd %k5, %eax ## encoding: [0xc5,0xfb,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovd %k6, %eax ## encoding: [0xc5,0xfb,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k7, %eax ## encoding: [0xc5,0xfb,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
@@ -251,30 +251,30 @@ define <8 x i16> @test_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
 define <8 x i16> @test_mask_cmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
 ; CHECK-LABEL: test_mask_cmp_w_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k3 ## encoding: [0xc5,0xf8,0x92,0xdf]
-; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k4 {%k3} ## encoding: [0x62,0xf3,0xfd,0x2b,0x3f,0xe1,0x00]
-; CHECK-NEXT:    vpcmpltw %ymm1, %ymm0, %k5 {%k3} ## encoding: [0x62,0xf3,0xfd,0x2b,0x3f,0xe9,0x01]
-; CHECK-NEXT:    vpcmplew %ymm1, %ymm0, %k6 {%k3} ## encoding: [0x62,0xf3,0xfd,0x2b,0x3f,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunordw %ymm1, %ymm0, %k7 {%k3} ## encoding: [0x62,0xf3,0xfd,0x2b,0x3f,0xf9,0x03]
-; CHECK-NEXT:    vpcmpneqw %ymm1, %ymm0, %k0 {%k3} ## encoding: [0x62,0xf3,0xfd,0x2b,0x3f,0xc1,0x04]
-; CHECK-NEXT:    vpcmpnltw %ymm1, %ymm0, %k2 {%k3} ## encoding: [0x62,0xf3,0xfd,0x2b,0x3f,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnlew %ymm1, %ymm0, %k1 {%k3} ## encoding: [0x62,0xf3,0xfd,0x2b,0x3f,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordw %ymm1, %ymm0, %k3 {%k3} ## encoding: [0x62,0xf3,0xfd,0x2b,0x3f,0xd9,0x07]
-; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT:    kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpcmpeqw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltw %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xd1,0x01]
+; CHECK-NEXT:    vpcmplew %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xd9,0x02]
+; CHECK-NEXT:    vpcmpunordw %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe1,0x03]
+; CHECK-NEXT:    vpcmpneqw %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xe9,0x04]
+; CHECK-NEXT:    vpcmpnltw %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xf1,0x05]
+; CHECK-NEXT:    vpcmpnlew %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xf9,0x06]
+; CHECK-NEXT:    vpcmpordw %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3f,0xc9,0x07]
+; CHECK-NEXT:    kmovd %k2, %eax ## encoding: [0xc5,0xfb,0x93,0xc2]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
 ; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovd %k3, %eax ## encoding: [0xc5,0xfb,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    kmovd %k4, %eax ## encoding: [0xc5,0xfb,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x03]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k5, %eax ## encoding: [0xc5,0xfb,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovd %k6, %eax ## encoding: [0xc5,0xfb,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovd %k7, %eax ## encoding: [0xc5,0xfb,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1]
 ; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
@@ -301,29 +301,29 @@ declare i16 @llvm.x86.avx512.mask.cmp.w.256(<16 x i16>, <16 x i16>, i32, i16) no
 define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
 ; CHECK-LABEL: test_ucmp_w_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpequw %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xd9,0x00]
-; CHECK-NEXT:    vpcmpltuw %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe1,0x01]
-; CHECK-NEXT:    vpcmpleuw %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe9,0x02]
-; CHECK-NEXT:    vpcmpunorduw %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xf1,0x03]
-; CHECK-NEXT:    vpcmpnequw %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xf9,0x04]
-; CHECK-NEXT:    vpcmpnltuw %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleuw %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc9,0x06]
-; CHECK-NEXT:    vpcmporduw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x07]
-; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
+; CHECK-NEXT:    vpcmpequw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltuw %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xc9,0x01]
+; CHECK-NEXT:    vpcmpleuw %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xd1,0x02]
+; CHECK-NEXT:    vpcmpunorduw %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xd9,0x03]
+; CHECK-NEXT:    vpcmpnequw %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe1,0x04]
+; CHECK-NEXT:    vpcmpnltuw %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xe9,0x05]
+; CHECK-NEXT:    vpcmpnleuw %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xf1,0x06]
+; CHECK-NEXT:    vpcmporduw %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x3e,0xf9,0x07]
+; CHECK-NEXT:    kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
 ; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
-; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT:    kmovd %k2, %eax ## encoding: [0xc5,0xfb,0x93,0xc2]
 ; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovd %k3, %eax ## encoding: [0xc5,0xfb,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x03]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    kmovd %k4, %eax ## encoding: [0xc5,0xfb,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovd %k5, %eax ## encoding: [0xc5,0xfb,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovd %k6, %eax ## encoding: [0xc5,0xfb,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k7, %eax ## encoding: [0xc5,0xfb,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 -1)
@@ -348,30 +348,30 @@ define <8 x i16> @test_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1) {
 define <8 x i16> @test_mask_ucmp_w_256(<16 x i16> %a0, <16 x i16> %a1, i16 %mask) {
 ; CHECK-LABEL: test_mask_ucmp_w_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k3 ## encoding: [0xc5,0xf8,0x92,0xdf]
-; CHECK-NEXT:    vpcmpequw %ymm1, %ymm0, %k4 {%k3} ## encoding: [0x62,0xf3,0xfd,0x2b,0x3e,0xe1,0x00]
-; CHECK-NEXT:    vpcmpltuw %ymm1, %ymm0, %k5 {%k3} ## encoding: [0x62,0xf3,0xfd,0x2b,0x3e,0xe9,0x01]
-; CHECK-NEXT:    vpcmpleuw %ymm1, %ymm0, %k6 {%k3} ## encoding: [0x62,0xf3,0xfd,0x2b,0x3e,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunorduw %ymm1, %ymm0, %k7 {%k3} ## encoding: [0x62,0xf3,0xfd,0x2b,0x3e,0xf9,0x03]
-; CHECK-NEXT:    vpcmpnequw %ymm1, %ymm0, %k0 {%k3} ## encoding: [0x62,0xf3,0xfd,0x2b,0x3e,0xc1,0x04]
-; CHECK-NEXT:    vpcmpnltuw %ymm1, %ymm0, %k2 {%k3} ## encoding: [0x62,0xf3,0xfd,0x2b,0x3e,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleuw %ymm1, %ymm0, %k1 {%k3} ## encoding: [0x62,0xf3,0xfd,0x2b,0x3e,0xc9,0x06]
-; CHECK-NEXT:    vpcmporduw %ymm1, %ymm0, %k3 {%k3} ## encoding: [0x62,0xf3,0xfd,0x2b,0x3e,0xd9,0x07]
-; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT:    kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpcmpequw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltuw %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd1,0x01]
+; CHECK-NEXT:    vpcmpleuw %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xd9,0x02]
+; CHECK-NEXT:    vpcmpunorduw %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xe1,0x03]
+; CHECK-NEXT:    vpcmpnequw %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xe9,0x04]
+; CHECK-NEXT:    vpcmpnltuw %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xf1,0x05]
+; CHECK-NEXT:    vpcmpnleuw %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xf9,0x06]
+; CHECK-NEXT:    vpcmporduw %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x3e,0xc9,0x07]
+; CHECK-NEXT:    kmovd %k2, %eax ## encoding: [0xc5,0xfb,0x93,0xc2]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
 ; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovd %k3, %eax ## encoding: [0xc5,0xfb,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    kmovd %k4, %eax ## encoding: [0xc5,0xfb,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x03]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k5, %eax ## encoding: [0xc5,0xfb,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovd %k6, %eax ## encoding: [0xc5,0xfb,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovd %k7, %eax ## encoding: [0xc5,0xfb,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1]
 ; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.ucmp.w.256(<16 x i16> %a0, <16 x i16> %a1, i32 0, i16 %mask)
@@ -401,7 +401,8 @@ define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: test_pcmpeq_b_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x74,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
   ret i16 %res
@@ -410,9 +411,10 @@ define i16 @test_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b) {
 define i16 @test_mask_pcmpeq_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_pcmpeq_b_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x74,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.mask.pcmpeq.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
   ret i16 %res
@@ -424,7 +426,8 @@ define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: test_pcmpeq_w_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x75,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
   ret i8 %res
@@ -433,9 +436,10 @@ define i8 @test_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b) {
 define i8 @test_mask_pcmpeq_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_pcmpeq_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x75,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpeq.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
   ret i8 %res
@@ -447,7 +451,8 @@ define i16 @test_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b) {
 ; CHECK-LABEL: test_pcmpgt_b_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpcmpgtb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x64,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 -1)
   ret i16 %res
@@ -456,9 +461,10 @@ define i16 @test_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b) {
 define i16 @test_mask_pcmpgt_b_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_pcmpgt_b_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpcmpgtb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x64,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i16 @llvm.x86.avx512.mask.pcmpgt.b.128(<16 x i8> %a, <16 x i8> %b, i16 %mask)
   ret i16 %res
@@ -470,7 +476,8 @@ define i8 @test_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: test_pcmpgt_w_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x08,0x65,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 -1)
   ret i8 %res
@@ -479,9 +486,10 @@ define i8 @test_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b) {
 define i8 @test_mask_pcmpgt_w_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_pcmpgt_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x65,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16> %a, <8 x i16> %b, i8 %mask)
   ret i8 %res
@@ -492,29 +500,29 @@ declare i8 @llvm.x86.avx512.mask.pcmpgt.w.128(<8 x i16>, <8 x i16>, i8)
 define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK-LABEL: test_cmp_b_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd9,0x00]
-; CHECK-NEXT:    vpcmpltb %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xe1,0x01]
-; CHECK-NEXT:    vpcmpleb %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xe9,0x02]
-; CHECK-NEXT:    vpcmpunordb %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xf1,0x03]
-; CHECK-NEXT:    vpcmpneqb %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xf9,0x04]
-; CHECK-NEXT:    vpcmpnltb %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleb %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x07]
-; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
+; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltb %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xc9,0x01]
+; CHECK-NEXT:    vpcmpleb %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd1,0x02]
+; CHECK-NEXT:    vpcmpunordb %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xd9,0x03]
+; CHECK-NEXT:    vpcmpneqb %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xe1,0x04]
+; CHECK-NEXT:    vpcmpnltb %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xe9,0x05]
+; CHECK-NEXT:    vpcmpnleb %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xf1,0x06]
+; CHECK-NEXT:    vpcmpordb %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x3f,0xf9,0x07]
+; CHECK-NEXT:    kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
 ; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
-; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT:    kmovd %k2, %eax ## encoding: [0xc5,0xfb,0x93,0xc2]
 ; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovd %k3, %eax ## encoding: [0xc5,0xfb,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x03]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    kmovd %k4, %eax ## encoding: [0xc5,0xfb,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovd %k5, %eax ## encoding: [0xc5,0xfb,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovd %k6, %eax ## encoding: [0xc5,0xfb,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k7, %eax ## encoding: [0xc5,0xfb,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
@@ -539,30 +547,30 @@ define <8 x i16> @test_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
 define <8 x i16> @test_mask_cmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
 ; CHECK-LABEL: test_mask_cmp_b_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k3 ## encoding: [0xc5,0xf8,0x92,0xdf]
-; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k4 {%k3} ## encoding: [0x62,0xf3,0x7d,0x0b,0x3f,0xe1,0x00]
-; CHECK-NEXT:    vpcmpltb %xmm1, %xmm0, %k5 {%k3} ## encoding: [0x62,0xf3,0x7d,0x0b,0x3f,0xe9,0x01]
-; CHECK-NEXT:    vpcmpleb %xmm1, %xmm0, %k6 {%k3} ## encoding: [0x62,0xf3,0x7d,0x0b,0x3f,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunordb %xmm1, %xmm0, %k7 {%k3} ## encoding: [0x62,0xf3,0x7d,0x0b,0x3f,0xf9,0x03]
-; CHECK-NEXT:    vpcmpneqb %xmm1, %xmm0, %k0 {%k3} ## encoding: [0x62,0xf3,0x7d,0x0b,0x3f,0xc1,0x04]
-; CHECK-NEXT:    vpcmpnltb %xmm1, %xmm0, %k2 {%k3} ## encoding: [0x62,0xf3,0x7d,0x0b,0x3f,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleb %xmm1, %xmm0, %k1 {%k3} ## encoding: [0x62,0xf3,0x7d,0x0b,0x3f,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordb %xmm1, %xmm0, %k3 {%k3} ## encoding: [0x62,0xf3,0x7d,0x0b,0x3f,0xd9,0x07]
-; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT:    kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpcmpeqb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltb %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xd1,0x01]
+; CHECK-NEXT:    vpcmpleb %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xd9,0x02]
+; CHECK-NEXT:    vpcmpunordb %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe1,0x03]
+; CHECK-NEXT:    vpcmpneqb %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xe9,0x04]
+; CHECK-NEXT:    vpcmpnltb %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xf1,0x05]
+; CHECK-NEXT:    vpcmpnleb %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xf9,0x06]
+; CHECK-NEXT:    vpcmpordb %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3f,0xc9,0x07]
+; CHECK-NEXT:    kmovd %k2, %eax ## encoding: [0xc5,0xfb,0x93,0xc2]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
 ; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovd %k3, %eax ## encoding: [0xc5,0xfb,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    kmovd %k4, %eax ## encoding: [0xc5,0xfb,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x03]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k5, %eax ## encoding: [0xc5,0xfb,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovd %k6, %eax ## encoding: [0xc5,0xfb,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovd %k7, %eax ## encoding: [0xc5,0xfb,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1]
 ; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
@@ -589,29 +597,29 @@ declare i16 @llvm.x86.avx512.mask.cmp.b.128(<16 x i8>, <16 x i8>, i32, i16) noun
 define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK-LABEL: test_ucmp_b_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpequb %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xd9,0x00]
-; CHECK-NEXT:    vpcmpltub %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe1,0x01]
-; CHECK-NEXT:    vpcmpleub %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe9,0x02]
-; CHECK-NEXT:    vpcmpunordub %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xf1,0x03]
-; CHECK-NEXT:    vpcmpnequb %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xf9,0x04]
-; CHECK-NEXT:    vpcmpnltub %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleub %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordub %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x07]
-; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
+; CHECK-NEXT:    vpcmpequb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltub %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xc9,0x01]
+; CHECK-NEXT:    vpcmpleub %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xd1,0x02]
+; CHECK-NEXT:    vpcmpunordub %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xd9,0x03]
+; CHECK-NEXT:    vpcmpnequb %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe1,0x04]
+; CHECK-NEXT:    vpcmpnltub %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xe9,0x05]
+; CHECK-NEXT:    vpcmpnleub %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xf1,0x06]
+; CHECK-NEXT:    vpcmpordub %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x3e,0xf9,0x07]
+; CHECK-NEXT:    kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
 ; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
-; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT:    kmovd %k2, %eax ## encoding: [0xc5,0xfb,0x93,0xc2]
 ; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovd %k3, %eax ## encoding: [0xc5,0xfb,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x03]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    kmovd %k4, %eax ## encoding: [0xc5,0xfb,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovd %k5, %eax ## encoding: [0xc5,0xfb,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovd %k6, %eax ## encoding: [0xc5,0xfb,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k7, %eax ## encoding: [0xc5,0xfb,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 -1)
@@ -636,30 +644,30 @@ define <8 x i16> @test_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1) {
 define <8 x i16> @test_mask_ucmp_b_128(<16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
 ; CHECK-LABEL: test_mask_ucmp_b_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k3 ## encoding: [0xc5,0xf8,0x92,0xdf]
-; CHECK-NEXT:    vpcmpequb %xmm1, %xmm0, %k4 {%k3} ## encoding: [0x62,0xf3,0x7d,0x0b,0x3e,0xe1,0x00]
-; CHECK-NEXT:    vpcmpltub %xmm1, %xmm0, %k5 {%k3} ## encoding: [0x62,0xf3,0x7d,0x0b,0x3e,0xe9,0x01]
-; CHECK-NEXT:    vpcmpleub %xmm1, %xmm0, %k6 {%k3} ## encoding: [0x62,0xf3,0x7d,0x0b,0x3e,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunordub %xmm1, %xmm0, %k7 {%k3} ## encoding: [0x62,0xf3,0x7d,0x0b,0x3e,0xf9,0x03]
-; CHECK-NEXT:    vpcmpnequb %xmm1, %xmm0, %k0 {%k3} ## encoding: [0x62,0xf3,0x7d,0x0b,0x3e,0xc1,0x04]
-; CHECK-NEXT:    vpcmpnltub %xmm1, %xmm0, %k2 {%k3} ## encoding: [0x62,0xf3,0x7d,0x0b,0x3e,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleub %xmm1, %xmm0, %k1 {%k3} ## encoding: [0x62,0xf3,0x7d,0x0b,0x3e,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordub %xmm1, %xmm0, %k3 {%k3} ## encoding: [0x62,0xf3,0x7d,0x0b,0x3e,0xd9,0x07]
-; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT:    kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpcmpequb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltub %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd1,0x01]
+; CHECK-NEXT:    vpcmpleub %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xd9,0x02]
+; CHECK-NEXT:    vpcmpunordub %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xe1,0x03]
+; CHECK-NEXT:    vpcmpnequb %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xe9,0x04]
+; CHECK-NEXT:    vpcmpnltub %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xf1,0x05]
+; CHECK-NEXT:    vpcmpnleub %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xf9,0x06]
+; CHECK-NEXT:    vpcmpordub %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x3e,0xc9,0x07]
+; CHECK-NEXT:    kmovd %k2, %eax ## encoding: [0xc5,0xfb,0x93,0xc2]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
 ; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x01]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovd %k3, %eax ## encoding: [0xc5,0xfb,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    kmovd %k4, %eax ## encoding: [0xc5,0xfb,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x03]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k5, %eax ## encoding: [0xc5,0xfb,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovd %k6, %eax ## encoding: [0xc5,0xfb,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x05]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovd %k7, %eax ## encoding: [0xc5,0xfb,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1]
 ; CHECK-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc4,0xc0,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8> %a0, <16 x i8> %a1, i32 0, i16 %mask)
@@ -686,29 +694,29 @@ declare i16 @llvm.x86.avx512.mask.ucmp.b.128(<16 x i8>, <16 x i8>, i32, i16) nou
 define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
 ; CHECK-LABEL: test_cmp_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xd9,0x00]
-; CHECK-NEXT:    vpcmpltw %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xe1,0x01]
-; CHECK-NEXT:    vpcmplew %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xe9,0x02]
-; CHECK-NEXT:    vpcmpunordw %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xf1,0x03]
-; CHECK-NEXT:    vpcmpneqw %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xf9,0x04]
-; CHECK-NEXT:    vpcmpnltw %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnlew %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc1,0x07]
-; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltw %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xc9,0x01]
+; CHECK-NEXT:    vpcmplew %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xd1,0x02]
+; CHECK-NEXT:    vpcmpunordw %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xd9,0x03]
+; CHECK-NEXT:    vpcmpneqw %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xe1,0x04]
+; CHECK-NEXT:    vpcmpnltw %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xe9,0x05]
+; CHECK-NEXT:    vpcmpnlew %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xf1,0x06]
+; CHECK-NEXT:    vpcmpordw %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x3f,0xf9,0x07]
+; CHECK-NEXT:    kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT:    kmovd %k2, %eax ## encoding: [0xc5,0xfb,0x93,0xc2]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovd %k3, %eax ## encoding: [0xc5,0xfb,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    kmovd %k4, %eax ## encoding: [0xc5,0xfb,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovd %k5, %eax ## encoding: [0xc5,0xfb,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovd %k6, %eax ## encoding: [0xc5,0xfb,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k7, %eax ## encoding: [0xc5,0xfb,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
@@ -733,30 +741,30 @@ define <8 x i8> @test_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
 define <8 x i8> @test_mask_cmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
 ; CHECK-LABEL: test_mask_cmp_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k3 ## encoding: [0xc5,0xf8,0x92,0xdf]
-; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k4 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3f,0xe1,0x00]
-; CHECK-NEXT:    vpcmpltw %xmm1, %xmm0, %k5 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3f,0xe9,0x01]
-; CHECK-NEXT:    vpcmplew %xmm1, %xmm0, %k6 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3f,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunordw %xmm1, %xmm0, %k7 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3f,0xf9,0x03]
-; CHECK-NEXT:    vpcmpneqw %xmm1, %xmm0, %k0 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3f,0xc1,0x04]
-; CHECK-NEXT:    vpcmpnltw %xmm1, %xmm0, %k2 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3f,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnlew %xmm1, %xmm0, %k1 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3f,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordw %xmm1, %xmm0, %k3 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3f,0xd9,0x07]
-; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT:    kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpcmpeqw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltw %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xd1,0x01]
+; CHECK-NEXT:    vpcmplew %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xd9,0x02]
+; CHECK-NEXT:    vpcmpunordw %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xe1,0x03]
+; CHECK-NEXT:    vpcmpneqw %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xe9,0x04]
+; CHECK-NEXT:    vpcmpnltw %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xf1,0x05]
+; CHECK-NEXT:    vpcmpnlew %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xf9,0x06]
+; CHECK-NEXT:    vpcmpordw %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3f,0xc9,0x07]
+; CHECK-NEXT:    kmovd %k2, %eax ## encoding: [0xc5,0xfb,0x93,0xc2]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovd %k3, %eax ## encoding: [0xc5,0xfb,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    kmovd %k4, %eax ## encoding: [0xc5,0xfb,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k5, %eax ## encoding: [0xc5,0xfb,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovd %k6, %eax ## encoding: [0xc5,0xfb,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovd %k7, %eax ## encoding: [0xc5,0xfb,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
-; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
@@ -783,29 +791,29 @@ declare i8 @llvm.x86.avx512.mask.cmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwi
 define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
 ; CHECK-LABEL: test_ucmp_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpequw %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xd9,0x00]
-; CHECK-NEXT:    vpcmpltuw %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xe1,0x01]
-; CHECK-NEXT:    vpcmpleuw %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xe9,0x02]
-; CHECK-NEXT:    vpcmpunorduw %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xf1,0x03]
-; CHECK-NEXT:    vpcmpnequw %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xf9,0x04]
-; CHECK-NEXT:    vpcmpnltuw %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleuw %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc9,0x06]
-; CHECK-NEXT:    vpcmporduw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc1,0x07]
-; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    vpcmpequw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltuw %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xc9,0x01]
+; CHECK-NEXT:    vpcmpleuw %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xd1,0x02]
+; CHECK-NEXT:    vpcmpunorduw %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xd9,0x03]
+; CHECK-NEXT:    vpcmpnequw %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xe1,0x04]
+; CHECK-NEXT:    vpcmpnltuw %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xe9,0x05]
+; CHECK-NEXT:    vpcmpnleuw %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xf1,0x06]
+; CHECK-NEXT:    vpcmporduw %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x3e,0xf9,0x07]
+; CHECK-NEXT:    kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT:    kmovd %k2, %eax ## encoding: [0xc5,0xfb,0x93,0xc2]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovd %k3, %eax ## encoding: [0xc5,0xfb,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    kmovd %k4, %eax ## encoding: [0xc5,0xfb,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovd %k5, %eax ## encoding: [0xc5,0xfb,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovd %k6, %eax ## encoding: [0xc5,0xfb,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k7, %eax ## encoding: [0xc5,0xfb,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 -1)
@@ -830,30 +838,30 @@ define <8 x i8> @test_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1) {
 define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
 ; CHECK-LABEL: test_mask_ucmp_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k3 ## encoding: [0xc5,0xf8,0x92,0xdf]
-; CHECK-NEXT:    vpcmpequw %xmm1, %xmm0, %k4 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3e,0xe1,0x00]
-; CHECK-NEXT:    vpcmpltuw %xmm1, %xmm0, %k5 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3e,0xe9,0x01]
-; CHECK-NEXT:    vpcmpleuw %xmm1, %xmm0, %k6 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3e,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunorduw %xmm1, %xmm0, %k7 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3e,0xf9,0x03]
-; CHECK-NEXT:    vpcmpnequw %xmm1, %xmm0, %k0 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3e,0xc1,0x04]
-; CHECK-NEXT:    vpcmpnltuw %xmm1, %xmm0, %k2 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3e,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleuw %xmm1, %xmm0, %k1 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3e,0xc9,0x06]
-; CHECK-NEXT:    vpcmporduw %xmm1, %xmm0, %k3 {%k3} ## encoding: [0x62,0xf3,0xfd,0x0b,0x3e,0xd9,0x07]
-; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT:    kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpcmpequw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltuw %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xd1,0x01]
+; CHECK-NEXT:    vpcmpleuw %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xd9,0x02]
+; CHECK-NEXT:    vpcmpunorduw %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xe1,0x03]
+; CHECK-NEXT:    vpcmpnequw %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xe9,0x04]
+; CHECK-NEXT:    vpcmpnltuw %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xf1,0x05]
+; CHECK-NEXT:    vpcmpnleuw %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xf9,0x06]
+; CHECK-NEXT:    vpcmporduw %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x3e,0xc9,0x07]
+; CHECK-NEXT:    kmovd %k2, %eax ## encoding: [0xc5,0xfb,0x93,0xc2]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovd %k3, %eax ## encoding: [0xc5,0xfb,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    kmovd %k4, %eax ## encoding: [0xc5,0xfb,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k5, %eax ## encoding: [0xc5,0xfb,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovd %k6, %eax ## encoding: [0xc5,0xfb,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovd %k7, %eax ## encoding: [0xc5,0xfb,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
-; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    kmovd %k1, %eax ## encoding: [0xc5,0xfb,0x93,0xc1]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16> %a0, <8 x i16> %a1, i32 0, i8 %mask)
@@ -877,1145 +885,38 @@ define <8 x i8> @test_mask_ucmp_w_128(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
 
 declare i8 @llvm.x86.avx512.mask.ucmp.w.128(<8 x i16>, <8 x i16>, i32, i8) nounwind readnone
 
-declare <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
-
-define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfmadd256_ps:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vfmadd132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x98,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
-  ret <8 x float> %res
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
-
-define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfmadd128_ps:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
-  ret <4 x float> %res
-}
-
-declare <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
-
-define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) {
-; CHECK-LABEL: test_mask_fmadd256_pd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask)
-  ret <4 x double> %res
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
-
-define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
-; CHECK-LABEL: test_mask_fmadd128_pd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask)
-  ret <2 x double> %res
-}
-
-define <2 x double>@test_int_x86_avx512_mask_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8]
-; CHECK-NEXT:    vfmadd132pd %xmm1, %xmm2, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x98,0xd9]
-; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa8,0xca]
-; CHECK-NEXT:    vaddpd %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
-  %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
-  %res2 = fadd <2 x double> %res, %res1
-  ret <2 x double> %res2
-}
-
-declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
-
-define <2 x double>@test_int_x86_avx512_mask3_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %xmm2, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xda]
-; CHECK-NEXT:    vfmadd231pd %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb8,0xd9]
-; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa8,0xca]
-; CHECK-NEXT:    vaddpd %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
-  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
-  %res2 = fadd <2 x double> %res, %res1
-  ret <2 x double> %res2
-}
-
-declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
-
-define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
-; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0xa8,0xda]
-; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa8,0xca]
-; CHECK-NEXT:    vaddpd %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
-  %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
-  %res2 = fadd <2 x double> %res, %res1
-  ret <2 x double> %res2
-}
-
-define <4 x double>@test_int_x86_avx512_mask_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8]
-; CHECK-NEXT:    vfmadd132pd %ymm1, %ymm2, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x98,0xd9]
-; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa8,0xca]
-; CHECK-NEXT:    vaddpd %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
-  %res1 = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
-  %res2 = fadd <4 x double> %res, %res1
-  ret <4 x double> %res2
-}
-
-declare <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
-
-define <4 x double>@test_int_x86_avx512_mask3_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xda]
-; CHECK-NEXT:    vfmadd231pd %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xb8,0xd9]
-; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa8,0xca]
-; CHECK-NEXT:    vaddpd %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
-  %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
-  %res2 = fadd <4 x double> %res, %res1
-  ret <4 x double> %res2
-}
-
-declare <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
-
-define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
-; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0xa8,0xda]
-; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa8,0xca]
-; CHECK-NEXT:    vaddpd %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
-  %res1 = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
-  %res2 = fadd <4 x double> %res, %res1
-  ret <4 x double> %res2
-}
-
-define <4 x float>@test_int_x86_avx512_mask_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8]
-; CHECK-NEXT:    vfmadd132ps %xmm1, %xmm2, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xd9]
-; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa8,0xca]
-; CHECK-NEXT:    vaddps %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
-  %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
-  %res2 = fadd <4 x float> %res, %res1
-  ret <4 x float> %res2
-}
-
-declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %xmm2, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xda]
-; CHECK-NEXT:    vfmadd231ps %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb8,0xd9]
-; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa8,0xca]
-; CHECK-NEXT:    vaddps %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
-  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
-  %res2 = fadd <4 x float> %res, %res1
-  ret <4 x float> %res2
-}
-
-declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
-; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0xa8,0xda]
-; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa8,0xca]
-; CHECK-NEXT:    vaddps %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
-  %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
-  %res2 = fadd <4 x float> %res, %res1
-  ret <4 x float> %res2
-}
-
-define <8 x float>@test_int_x86_avx512_mask_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8]
-; CHECK-NEXT:    vfmadd132ps %ymm1, %ymm2, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x98,0xd9]
-; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa8,0xca]
-; CHECK-NEXT:    vaddps %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
-  %res1 = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
-  %res2 = fadd <8 x float> %res, %res1
-  ret <8 x float> %res2
-}
-
-declare <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask3_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xda]
-; CHECK-NEXT:    vfmadd231ps %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xb8,0xd9]
-; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa8,0xca]
-; CHECK-NEXT:    vaddps %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
-  %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
-  %res2 = fadd <8 x float> %res, %res1
-  ret <8 x float> %res2
-}
-
-declare <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_maskz_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
-; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0xa8,0xda]
-; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa8,0xca]
-; CHECK-NEXT:    vaddps %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
-  %res1 = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
-  %res2 = fadd <8 x float> %res, %res1
-  ret <8 x float> %res2
-}
-
-
-declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
-
-define <2 x double>@test_int_x86_avx512_mask3_vfmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %xmm2, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xda]
-; CHECK-NEXT:    vfmsub231pd %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xba,0xd9]
-; CHECK-NEXT:    vfmsub213pd %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaa,0xca]
-; CHECK-NEXT:    vaddpd %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
-  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
-  %res2 = fadd <2 x double> %res, %res1
-  ret <2 x double> %res2
-}
-
-
-declare <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
-
-define <4 x double>@test_int_x86_avx512_mask3_vfmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xda]
-; CHECK-NEXT:    vfmsub231pd %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xba,0xd9]
-; CHECK-NEXT:    vfmsub213pd %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xaa,0xca]
-; CHECK-NEXT:    vaddpd %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
-  %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
-  %res2 = fadd <4 x double> %res, %res1
-  ret <4 x double> %res2
-}
-
-declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %xmm2, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xda]
-; CHECK-NEXT:    vfmsub231ps %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xba,0xd9]
-; CHECK-NEXT:    vfmsub213ps %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaa,0xca]
-; CHECK-NEXT:    vaddps %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
-  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
-  %res2 = fadd <4 x float> %res, %res1
-  ret <4 x float> %res2
-}
-
-declare <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xda]
-; CHECK-NEXT:    vfmsub231ps %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xba,0xd9]
-; CHECK-NEXT:    vfmsub213ps %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xaa,0xca]
-; CHECK-NEXT:    vaddps %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
-  %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
-  %res2 = fadd <8 x float> %res, %res1
-  ret <8 x float> %res2
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
-
-define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfnmadd256_ps:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vfnmadd132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x9c,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
-  ret <8 x float> %res
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
-
-define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfnmadd128_ps:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vfnmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x9c,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
-  ret <4 x float> %res
-}
-
-declare <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
-
-define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfnmadd256_pd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vfnmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x9c,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
-  ret <4 x double> %res
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
-
-define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfnmadd128_pd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vfnmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x9c,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
-  ret <2 x double> %res
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
-
-define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfnmsub256_ps:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vfnmsub132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x9e,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
-  ret <8 x float> %res
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
-
-define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfnmsub128_ps:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vfnmsub132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x9e,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
-  ret <4 x float> %res
-}
-
-declare <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
-
-define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfnmsub256_pd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vfnmsub132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x9e,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
-  ret <4 x double> %res
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
-
-define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfnmsub128_pd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vfnmsub132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x9e,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
-  ret <2 x double> %res
-}
-
-
-define <2 x double>@test_int_x86_avx512_mask_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8]
-; CHECK-NEXT:    vfnmsub132pd %xmm1, %xmm2, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x9e,0xd9]
-; CHECK-NEXT:    vfnmsub213pd %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xae,0xca]
-; CHECK-NEXT:    vaddpd %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
-  %res1 = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
-  %res2 = fadd <2 x double> %res, %res1
-  ret <2 x double> %res2
-}
-
-declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
-
-define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %xmm2, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xda]
-; CHECK-NEXT:    vfnmsub231pd %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbe,0xd9]
-; CHECK-NEXT:    vfnmsub213pd %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xae,0xca]
-; CHECK-NEXT:    vaddpd %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
-  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
-  %res2 = fadd <2 x double> %res, %res1
-  ret <2 x double> %res2
-}
-
-define <4 x double>@test_int_x86_avx512_mask_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8]
-; CHECK-NEXT:    vfnmsub132pd %ymm1, %ymm2, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x9e,0xd9]
-; CHECK-NEXT:    vfnmsub213pd %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xae,0xca]
-; CHECK-NEXT:    vaddpd %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
-  %res1 = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
-  %res2 = fadd <4 x double> %res, %res1
-  ret <4 x double> %res2
-}
-
-declare <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
-
-define <4 x double>@test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xda]
-; CHECK-NEXT:    vfnmsub231pd %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xbe,0xd9]
-; CHECK-NEXT:    vfnmsub213pd %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xae,0xca]
-; CHECK-NEXT:    vaddpd %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
-  %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
-  %res2 = fadd <4 x double> %res, %res1
-  ret <4 x double> %res2
-}
-
-define <4 x float>@test_int_x86_avx512_mask_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8]
-; CHECK-NEXT:    vfnmsub132ps %xmm1, %xmm2, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x9e,0xd9]
-; CHECK-NEXT:    vfnmsub213ps %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xae,0xca]
-; CHECK-NEXT:    vaddps %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
-  %res1 = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
-  %res2 = fadd <4 x float> %res, %res1
-  ret <4 x float> %res2
-}
-
-declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %xmm2, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xda]
-; CHECK-NEXT:    vfnmsub231ps %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbe,0xd9]
-; CHECK-NEXT:    vfnmsub213ps %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xae,0xca]
-; CHECK-NEXT:    vaddps %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
-  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
-  %res2 = fadd <4 x float> %res, %res1
-  ret <4 x float> %res2
-}
-
-define <8 x float>@test_int_x86_avx512_mask_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8]
-; CHECK-NEXT:    vfnmsub132ps %ymm1, %ymm2, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x9e,0xd9]
-; CHECK-NEXT:    vfnmsub213ps %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xae,0xca]
-; CHECK-NEXT:    vaddps %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
-  %res1 = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
-  %res2 = fadd <8 x float> %res, %res1
-  ret <8 x float> %res2
-}
-
-declare <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xda]
-; CHECK-NEXT:    vfnmsub231ps %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xbe,0xd9]
-; CHECK-NEXT:    vfnmsub213ps %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xae,0xca]
-; CHECK-NEXT:    vaddps %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
-  %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
-  %res2 = fadd <8 x float> %res, %res1
-  ret <8 x float> %res2
-}
-
-define <2 x double>@test_int_x86_avx512_mask_vfnmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8]
-; CHECK-NEXT:    vfnmadd132pd %xmm1, %xmm2, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x9c,0xd9]
-; CHECK-NEXT:    vfnmadd213pd %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xac,0xca]
-; CHECK-NEXT:    vaddpd %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
-  %res1 = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
-  %res2 = fadd <2 x double> %res, %res1
-  ret <2 x double> %res2
-}
-
-define <4 x double>@test_int_x86_avx512_mask_vfnmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8]
-; CHECK-NEXT:    vfnmadd132pd %ymm1, %ymm2, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x9c,0xd9]
-; CHECK-NEXT:    vfnmadd213pd %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xac,0xca]
-; CHECK-NEXT:    vaddpd %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
-  %res1 = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
-  %res2 = fadd <4 x double> %res, %res1
-  ret <4 x double> %res2
-}
-
-define <4 x float>@test_int_x86_avx512_mask_vfnmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8]
-; CHECK-NEXT:    vfnmadd132ps %xmm1, %xmm2, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x9c,0xd9]
-; CHECK-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xac,0xca]
-; CHECK-NEXT:    vaddps %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
-  %res1 = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
-  %res2 = fadd <4 x float> %res, %res1
-  ret <4 x float> %res2
-}
-
-define <8 x float>@test_int_x86_avx512_mask_vfnmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8]
-; CHECK-NEXT:    vfnmadd132ps %ymm1, %ymm2, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x9c,0xd9]
-; CHECK-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xac,0xca]
-; CHECK-NEXT:    vaddps %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
-  %res1 = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
-  %res2 = fadd <8 x float> %res, %res1
-  ret <8 x float> %res2
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
-
-define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) {
-; CHECK-LABEL: test_mask_fmaddsub256_ps:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vfmaddsub132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x96,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask)
-  ret <8 x float> %res
-}
-
-declare <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
-
-define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
-; CHECK-LABEL: test_mask_fmaddsub128_ps:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vfmaddsub132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x96,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask)
-  ret <4 x float> %res
-}
-
-declare <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
-
-define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfmaddsub256_pd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vfmaddsub132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x96,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
-  ret <4 x double> %res
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
-
-define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfmaddsub128_pd:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vfmaddsub132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x96,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
-  ret <2 x double> %res
-}
-
-define <2 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd8]
-; CHECK-NEXT:    vfmaddsub132pd %xmm1, %xmm2, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x96,0xd9]
-; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa6,0xca]
-; CHECK-NEXT:    vaddpd %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
-  %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
-  %res2 = fadd <2 x double> %res, %res1
-  ret <2 x double> %res2
-}
-
-declare <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
-
-define <2 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %xmm2, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xda]
-; CHECK-NEXT:    vfmaddsub231pd %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb6,0xd9]
-; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa6,0xca]
-; CHECK-NEXT:    vaddpd %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
-  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
-  %res2 = fadd <2 x double> %res, %res1
-  ret <2 x double> %res2
-}
-
-declare <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
-
-define <2 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
-; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0xa6,0xda]
-; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa6,0xca]
-; CHECK-NEXT:    vaddpd %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
-  %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
-  %res2 = fadd <2 x double> %res, %res1
-  ret <2 x double> %res2
-}
-
-define <4 x double>@test_int_x86_avx512_mask_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd8]
-; CHECK-NEXT:    vfmaddsub132pd %ymm1, %ymm2, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x96,0xd9]
-; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa6,0xca]
-; CHECK-NEXT:    vaddpd %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
-  %res1 = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
-  %res2 = fadd <4 x double> %res, %res1
-  ret <4 x double> %res2
-}
-
-declare <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
-
-define <4 x double>@test_int_x86_avx512_mask3_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xda]
-; CHECK-NEXT:    vfmaddsub231pd %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xb6,0xd9]
-; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa6,0xca]
-; CHECK-NEXT:    vaddpd %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
-  %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
-  %res2 = fadd <4 x double> %res, %res1
-  ret <4 x double> %res2
-}
-
-declare <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
-
-define <4 x double>@test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
-; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0xa6,0xda]
-; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa6,0xca]
-; CHECK-NEXT:    vaddpd %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
-  %res1 = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
-  %res2 = fadd <4 x double> %res, %res1
-  ret <4 x double> %res2
-}
-
-define <4 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8]
-; CHECK-NEXT:    vfmaddsub132ps %xmm1, %xmm2, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x96,0xd9]
-; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa6,0xca]
-; CHECK-NEXT:    vaddps %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
-  %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
-  %res2 = fadd <4 x float> %res, %res1
-  ret <4 x float> %res2
-}
-
-declare <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %xmm2, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xda]
-; CHECK-NEXT:    vfmaddsub231ps %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb6,0xd9]
-; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa6,0xca]
-; CHECK-NEXT:    vaddps %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
-  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
-  %res2 = fadd <4 x float> %res, %res1
-  ret <4 x float> %res2
-}
-
-declare <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
-; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0xa6,0xda]
-; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa6,0xca]
-; CHECK-NEXT:    vaddps %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
-  %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
-  %res2 = fadd <4 x float> %res, %res1
-  ret <4 x float> %res2
-}
-
-define <8 x float>@test_int_x86_avx512_mask_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8]
-; CHECK-NEXT:    vfmaddsub132ps %ymm1, %ymm2, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x96,0xd9]
-; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa6,0xca]
-; CHECK-NEXT:    vaddps %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
-  %res1 = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
-  %res2 = fadd <8 x float> %res, %res1
-  ret <8 x float> %res2
-}
-
-declare <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask3_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xda]
-; CHECK-NEXT:    vfmaddsub231ps %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xb6,0xd9]
-; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa6,0xca]
-; CHECK-NEXT:    vaddps %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
-  %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
-  %res2 = fadd <8 x float> %res, %res1
-  ret <8 x float> %res2
-}
-
-declare <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_maskz_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
-; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0xa6,0xda]
-; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa6,0xca]
-; CHECK-NEXT:    vaddps %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
-  %res1 = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
-  %res2 = fadd <8 x float> %res, %res1
-  ret <8 x float> %res2
-}
-
-declare <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
-
-define <2 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %xmm2, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xda]
-; CHECK-NEXT:    vfmsubadd231pd %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb7,0xd9]
-; CHECK-NEXT:    vfmsubadd213pd %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa7,0xca]
-; CHECK-NEXT:    vaddpd %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
-  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
-  %res2=fadd <2 x double> %res, %res1
-  ret <2 x double> %res2
-}
-
-declare <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
-
-define <4 x double>@test_int_x86_avx512_mask3_vfmsubadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovapd %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xda]
-; CHECK-NEXT:    vfmsubadd231pd %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xb7,0xd9]
-; CHECK-NEXT:    vfmsubadd213pd %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa7,0xca]
-; CHECK-NEXT:    vaddpd %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
-  %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
-  %res2=fadd <4 x double> %res, %res1
-  ret <4 x double> %res2
-}
-
-declare <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
-
-define <4 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %xmm2, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xda]
-; CHECK-NEXT:    vfmsubadd231ps %xmm1, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb7,0xd9]
-; CHECK-NEXT:    vfmsubadd213ps %xmm2, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa7,0xca]
-; CHECK-NEXT:    vaddps %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
-  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
-  %res2=fadd <4 x float> %res, %res1
-  ret <4 x float> %res2
-}
-
-declare <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
-
-define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
-; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vmovaps %ymm2, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xda]
-; CHECK-NEXT:    vfmsubadd231ps %ymm1, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xb7,0xd9]
-; CHECK-NEXT:    vfmsubadd213ps %ymm2, %ymm0, %ymm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa7,0xca]
-; CHECK-NEXT:    vaddps %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
-  %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
-  %res2=fadd <8 x float> %res, %res1
-  ret <8 x float> %res2
-}
-
-
-define <4 x float> @test_mask_vfmadd128_ps_r(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfmadd128_ps_r:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_vfmadd128_ps_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
-; CHECK-LABEL: test_mask_vfmadd128_ps_rz:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfmadd128_ps_rmk:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %a2 = load <4 x float>, <4 x float>* %ptr_a2
-  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfmadd128_ps_rmka:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %a2 = load <4 x float>, <4 x float>* %ptr_a2, align 8
-  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
-; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %a2 = load <4 x float>, <4 x float>* %ptr_a2
-  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
-; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %a2 = load <4 x float>, <4 x float>* %ptr_a2, align 4
-  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfmadd128_ps_rmb:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load float, float* %ptr_a2
-  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
-  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
-  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
-  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
-  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfmadd128_ps_rmba:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load float, float* %ptr_a2, align 4
-  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
-  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
-  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
-  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
-  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
-; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load float, float* %ptr_a2
-  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
-  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
-  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
-  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
-  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
-  ret <4 x float> %res
-}
-
-define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
-; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %q = load float, float* %ptr_a2, align 4
-  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
-  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
-  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
-  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
-  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
-  ret <4 x float> %res
-}
-
-define <2 x double> @test_mask_vfmadd128_pd_r(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfmadd128_pd_r:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
-  ret <2 x double> %res
-}
-
-define <2 x double> @test_mask_vfmadd128_pd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
-; CHECK-LABEL: test_mask_vfmadd128_pd_rz:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
-  ret <2 x double> %res
-}
-
-define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfmadd128_pd_rmk:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vfmadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %a2 = load <2 x double>, <2 x double>* %ptr_a2
-  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
-  ret <2 x double> %res
-}
-
-define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2) {
-; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfmadd213pd (%rdi), %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %a2 = load <2 x double>, <2 x double>* %ptr_a2
-  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
-  ret <2 x double> %res
-}
-
-define <4 x double> @test_mask_vfmadd256_pd_r(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfmadd256_pd_r:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_mask_vfmadd256_pd_rz(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
-; CHECK-LABEL: test_mask_vfmadd256_pd_rz:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2, i8 %mask) {
-; CHECK-LABEL: test_mask_vfmadd256_pd_rmk:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vfmadd213pd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %a2 = load <4 x double>, <4 x double>* %ptr_a2
-  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
-  ret <4 x double> %res
-}
-
-define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2) {
-; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    vfmadd213pd (%rdi), %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0x07]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %a2 = load <4 x double>, <4 x double>* %ptr_a2
-  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
-  ret <4 x double> %res
-}
-
 define <8 x i16> @test_mask_packs_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test_mask_packs_epi32_rr_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
-  ret <8 x i16> %res
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
+  ret <8 x i16> %1
 }
 
 define <8 x i16> @test_mask_packs_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi32_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpackssdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0xd1]
 ; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
-  ret <8 x i16> %res
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
+  ret <8 x i16> %3
 }
 
 define <8 x i16> @test_mask_packs_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi32_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
-  ret <8 x i16> %res
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
+  ret <8 x i16> %3
 }
 
 define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
@@ -2024,31 +925,35 @@ define <8 x i16> @test_mask_packs_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b)
 ; CHECK-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6b,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <4 x i32>, <4 x i32>* %ptr_b
-  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
-  ret <8 x i16> %res
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
+  ret <8 x i16> %1
 }
 
 define <8 x i16> @test_mask_packs_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi32_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpackssdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x6b,0x0f]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <4 x i32>, <4 x i32>* %ptr_b
-  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
-  ret <8 x i16> %res
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
+  ret <8 x i16> %3
 }
 
 define <8 x i16> @test_mask_packs_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi32_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpackssdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x6b,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <4 x i32>, <4 x i32>* %ptr_b
-  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
-  ret <8 x i16> %res
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
+  ret <8 x i16> %3
 }
 
 define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
@@ -2059,67 +964,75 @@ define <8 x i16> @test_mask_packs_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
   %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
-  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
-  ret <8 x i16> %res
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
+  ret <8 x i16> %1
 }
 
 define <8 x i16> @test_mask_packs_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi32_rmbk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpackssdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x19,0x6b,0x0f]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
   %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
-  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
-  ret <8 x i16> %res
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
+  ret <8 x i16> %3
 }
 
 define <8 x i16> @test_mask_packs_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi32_rmbkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpackssdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x99,0x6b,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
   %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
-  %res = call <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
-  ret <8 x i16> %res
+  %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
+  ret <8 x i16> %3
 }
 
-declare <8 x i16> @llvm.x86.avx512.mask.packssdw.128(<4 x i32>, <4 x i32>, <8 x i16>, i8)
+declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>)
 
 define <16 x i16> @test_mask_packs_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-LABEL: test_mask_packs_epi32_rr_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
-  ret <16 x i16> %res
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
+  ret <16 x i16> %1
 }
 
 define <16 x i16> @test_mask_packs_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi32_rrk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0xd1]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
-  ret <16 x i16> %res
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
+  ret <16 x i16> %3
 }
 
 define <16 x i16> @test_mask_packs_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi32_rrkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
-  ret <16 x i16> %res
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
+  ret <16 x i16> %3
 }
 
 define <16 x i16> @test_mask_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
@@ -2128,31 +1041,35 @@ define <16 x i16> @test_mask_packs_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b)
 ; CHECK-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6b,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i32>, <8 x i32>* %ptr_b
-  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
-  ret <16 x i16> %res
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
+  ret <16 x i16> %1
 }
 
 define <16 x i16> @test_mask_packs_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi32_rmk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpackssdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x6b,0x0f]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i32>, <8 x i32>* %ptr_b
-  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
-  ret <16 x i16> %res
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
+  ret <16 x i16> %3
 }
 
 define <16 x i16> @test_mask_packs_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi32_rmkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpackssdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x6b,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i32>, <8 x i32>* %ptr_b
-  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
-  ret <16 x i16> %res
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
+  ret <16 x i16> %3
 }
 
 define <16 x i16> @test_mask_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
@@ -2163,67 +1080,75 @@ define <16 x i16> @test_mask_packs_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
   %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
-  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
-  ret <16 x i16> %res
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
+  ret <16 x i16> %1
 }
 
 define <16 x i16> @test_mask_packs_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi32_rmbk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpackssdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x39,0x6b,0x0f]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
   %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
-  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
-  ret <16 x i16> %res
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
+  ret <16 x i16> %3
 }
 
 define <16 x i16> @test_mask_packs_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi32_rmbkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpackssdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xb9,0x6b,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
   %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
-  %res = call <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
-  ret <16 x i16> %res
+  %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
+  ret <16 x i16> %3
 }
 
-declare <16 x i16> @llvm.x86.avx512.mask.packssdw.256(<8 x i32>, <8 x i32>, <16 x i16>, i16)
+declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>)
 
 define <16 x i8> @test_mask_packs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: test_mask_packs_epi16_rr_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
-  ret <16 x i8> %res
+  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
+  ret <16 x i8> %1
 }
 
 define <16 x i8> @test_mask_packs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi16_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpacksswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0xd1]
 ; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
-  ret <16 x i8> %res
+  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
+  ret <16 x i8> %3
 }
 
 define <16 x i8> @test_mask_packs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi16_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
-  ret <16 x i8> %res
+  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
+  ret <16 x i8> %3
 }
 
 define <16 x i8> @test_mask_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
@@ -2232,42 +1157,46 @@ define <16 x i8> @test_mask_packs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b)
 ; CHECK-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x63,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
-  ret <16 x i8> %res
+  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
+  ret <16 x i8> %1
 }
 
 define <16 x i8> @test_mask_packs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi16_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpacksswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x63,0x0f]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
-  ret <16 x i8> %res
+  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
+  ret <16 x i8> %3
 }
 
 define <16 x i8> @test_mask_packs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
 ; CHECK-LABEL: test_mask_packs_epi16_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpacksswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x63,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %res = call <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
-  ret <16 x i8> %res
+  %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
+  ret <16 x i8> %3
 }
 
-declare <16 x i8> @llvm.x86.avx512.mask.packsswb.128(<8 x i16>, <8 x i16>, <16 x i8>, i16)
+declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>)
 
 define <32 x i8> @test_mask_packs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK-LABEL: test_mask_packs_epi16_rr_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
-  ret <32 x i8> %res
+  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
+  ret <32 x i8> %1
 }
 
 define <32 x i8> @test_mask_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
@@ -2277,8 +1206,10 @@ define <32 x i8> @test_mask_packs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <3
 ; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x63,0xd1]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
-  ret <32 x i8> %res
+  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
+  ret <32 x i8> %3
 }
 
 define <32 x i8> @test_mask_packs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
@@ -2287,8 +1218,10 @@ define <32 x i8> @test_mask_packs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i
 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpacksswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
-  ret <32 x i8> %res
+  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
+  ret <32 x i8> %3
 }
 
 define <32 x i8> @test_mask_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
@@ -2297,8 +1230,8 @@ define <32 x i8> @test_mask_packs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b
 ; CHECK-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x63,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
-  ret <32 x i8> %res
+  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
+  ret <32 x i8> %1
 }
 
 define <32 x i8> @test_mask_packs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
@@ -2309,8 +1242,10 @@ define <32 x i8> @test_mask_packs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
-  ret <32 x i8> %res
+  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
+  ret <32 x i8> %3
 }
 
 define <32 x i8> @test_mask_packs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
@@ -2320,11 +1255,13 @@ define <32 x i8> @test_mask_packs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr
 ; CHECK-NEXT:    vpacksswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x63,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %res = call <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
-  ret <32 x i8> %res
+  %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a, <16 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
+  ret <32 x i8> %3
 }
 
-declare <32 x i8> @llvm.x86.avx512.mask.packsswb.256(<16 x i16>, <16 x i16>, <32 x i8>, i32)
+declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>)
 
 
 define <8 x i16> @test_mask_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
@@ -2332,29 +1269,33 @@ define <8 x i16> @test_mask_packus_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
-  ret <8 x i16> %res
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
+  ret <8 x i16> %1
 }
 
 define <8 x i16> @test_mask_packus_epi32_rrk_128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi32_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0xd1]
 ; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
-  ret <8 x i16> %res
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
+  ret <8 x i16> %3
 }
 
 define <8 x i16> @test_mask_packus_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi32_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
-  ret <8 x i16> %res
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
+  ret <8 x i16> %3
 }
 
 define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
@@ -2363,31 +1304,35 @@ define <8 x i16> @test_mask_packus_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b)
 ; CHECK-NEXT:    vpackusdw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x2b,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <4 x i32>, <4 x i32>* %ptr_b
-  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
-  ret <8 x i16> %res
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
+  ret <8 x i16> %1
 }
 
 define <8 x i16> @test_mask_packus_epi32_rmk_128(<4 x i32> %a, <4 x i32>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi32_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpackusdw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x2b,0x0f]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <4 x i32>, <4 x i32>* %ptr_b
-  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
-  ret <8 x i16> %res
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
+  ret <8 x i16> %3
 }
 
 define <8 x i16> @test_mask_packus_epi32_rmkz_128(<4 x i32> %a, <4 x i32>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi32_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpackusdw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x2b,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <4 x i32>, <4 x i32>* %ptr_b
-  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
-  ret <8 x i16> %res
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
+  ret <8 x i16> %3
 }
 
 define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
@@ -2398,67 +1343,75 @@ define <8 x i16> @test_mask_packus_epi32_rmb_128(<4 x i32> %a, i32* %ptr_b) {
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
   %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
-  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 -1)
-  ret <8 x i16> %res
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
+  ret <8 x i16> %1
 }
 
 define <8 x i16> @test_mask_packus_epi32_rmbk_128(<4 x i32> %a, i32* %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi32_rmbk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpackusdw (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x19,0x2b,0x0f]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
   %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
-  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> %passThru, i8 %mask)
-  ret <8 x i16> %res
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
+  ret <8 x i16> %3
 }
 
 define <8 x i16> @test_mask_packus_epi32_rmbkz_128(<4 x i32> %a, i32* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi32_rmbkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpackusdw (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x99,0x2b,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <4 x i32> undef, i32 %q, i32 0
   %b = shufflevector <4 x i32> %vecinit.i, <4 x i32> undef, <4 x i32> zeroinitializer
-  %res = call <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32> %a, <4 x i32> %b, <8 x i16> zeroinitializer, i8 %mask)
-  ret <8 x i16> %res
+  %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a, <4 x i32> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
+  ret <8 x i16> %3
 }
 
-declare <8 x i16> @llvm.x86.avx512.mask.packusdw.128(<4 x i32>, <4 x i32>, <8 x i16>, i8)
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>)
 
 define <16 x i16> @test_mask_packus_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-LABEL: test_mask_packus_epi32_rr_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
-  ret <16 x i16> %res
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
+  ret <16 x i16> %1
 }
 
 define <16 x i16> @test_mask_packus_epi32_rrk_256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi32_rrk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0xd1]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
-  ret <16 x i16> %res
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
+  ret <16 x i16> %3
 }
 
 define <16 x i16> @test_mask_packus_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi32_rrkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
-  ret <16 x i16> %res
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
+  ret <16 x i16> %3
 }
 
 define <16 x i16> @test_mask_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
@@ -2467,31 +1420,35 @@ define <16 x i16> @test_mask_packus_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b
 ; CHECK-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x2b,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i32>, <8 x i32>* %ptr_b
-  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
-  ret <16 x i16> %res
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
+  ret <16 x i16> %1
 }
 
 define <16 x i16> @test_mask_packus_epi32_rmk_256(<8 x i32> %a, <8 x i32>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi32_rmk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpackusdw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x2b,0x0f]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i32>, <8 x i32>* %ptr_b
-  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
-  ret <16 x i16> %res
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
+  ret <16 x i16> %3
 }
 
 define <16 x i16> @test_mask_packus_epi32_rmkz_256(<8 x i32> %a, <8 x i32>* %ptr_b, i16 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi32_rmkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpackusdw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x2b,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i32>, <8 x i32>* %ptr_b
-  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
-  ret <16 x i16> %res
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
+  ret <16 x i16> %3
 }
 
 define <16 x i16> @test_mask_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
@@ -2502,67 +1459,75 @@ define <16 x i16> @test_mask_packus_epi32_rmb_256(<8 x i32> %a, i32* %ptr_b) {
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
   %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
-  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 -1)
-  ret <16 x i16> %res
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
+  ret <16 x i16> %1
 }
 
 define <16 x i16> @test_mask_packus_epi32_rmbk_256(<8 x i32> %a, i32* %ptr_b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi32_rmbk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpackusdw (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x39,0x2b,0x0f]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
   %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
-  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> %passThru, i16 %mask)
-  ret <16 x i16> %res
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru
+  ret <16 x i16> %3
 }
 
 define <16 x i16> @test_mask_packus_epi32_rmbkz_256(<8 x i32> %a, i32* %ptr_b, i16 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi32_rmbkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpackusdw (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xb9,0x2b,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load i32, i32* %ptr_b
   %vecinit.i = insertelement <8 x i32> undef, i32 %q, i32 0
   %b = shufflevector <8 x i32> %vecinit.i, <8 x i32> undef, <8 x i32> zeroinitializer
-  %res = call <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32> %a, <8 x i32> %b, <16 x i16> zeroinitializer, i16 %mask)
-  ret <16 x i16> %res
+  %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a, <8 x i32> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer
+  ret <16 x i16> %3
 }
 
-declare <16 x i16> @llvm.x86.avx512.mask.packusdw.256(<8 x i32>, <8 x i32>, <16 x i16>, i16)
+declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>)
 
 define <16 x i8> @test_mask_packus_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: test_mask_packus_epi16_rr_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
-  ret <16 x i8> %res
+  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
+  ret <16 x i8> %1
 }
 
 define <16 x i8> @test_mask_packus_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi16_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0xd1]
 ; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
-  ret <16 x i8> %res
+  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
+  ret <16 x i8> %3
 }
 
 define <16 x i8> @test_mask_packus_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi16_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
-  ret <16 x i8> %res
+  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
+  ret <16 x i8> %3
 }
 
 define <16 x i8> @test_mask_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
@@ -2571,42 +1536,46 @@ define <16 x i8> @test_mask_packus_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b)
 ; CHECK-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x67,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 -1)
-  ret <16 x i8> %res
+  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
+  ret <16 x i8> %1
 }
 
 define <16 x i8> @test_mask_packus_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi16_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpackuswb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x67,0x0f]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> %passThru, i16 %mask)
-  ret <16 x i8> %res
+  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru
+  ret <16 x i8> %3
 }
 
 define <16 x i8> @test_mask_packus_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i16 %mask) {
 ; CHECK-LABEL: test_mask_packus_epi16_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpackuswb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x67,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i16>, <8 x i16>* %ptr_b
-  %res = call <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16> %a, <8 x i16> %b, <16 x i8> zeroinitializer, i16 %mask)
-  ret <16 x i8> %res
+  %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer
+  ret <16 x i8> %3
 }
 
-declare <16 x i8> @llvm.x86.avx512.mask.packuswb.128(<8 x i16>, <8 x i16>, <16 x i8>, i16)
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>)
 
 define <32 x i8> @test_mask_packus_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
 ; CHECK-LABEL: test_mask_packus_epi16_rr_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
-  ret <32 x i8> %res
+  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
+  ret <32 x i8> %1
 }
 
 define <32 x i8> @test_mask_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask) {
@@ -2616,8 +1585,10 @@ define <32 x i8> @test_mask_packus_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <
 ; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x67,0xd1]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
-  ret <32 x i8> %res
+  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
+  ret <32 x i8> %3
 }
 
 define <32 x i8> @test_mask_packus_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i32 %mask) {
@@ -2626,8 +1597,10 @@ define <32 x i8> @test_mask_packus_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b,
 ; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpackuswb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
-  ret <32 x i8> %res
+  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
+  ret <32 x i8> %3
 }
 
 define <32 x i8> @test_mask_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) {
@@ -2636,8 +1609,8 @@ define <32 x i8> @test_mask_packus_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_
 ; CHECK-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x67,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 -1)
-  ret <32 x i8> %res
+  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
+  ret <32 x i8> %1
 }
 
 define <32 x i8> @test_mask_packus_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <32 x i8> %passThru, i32 %mask) {
@@ -2648,8 +1621,10 @@ define <32 x i8> @test_mask_packus_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> %passThru, i32 %mask)
-  ret <32 x i8> %res
+  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru
+  ret <32 x i8> %3
 }
 
 define <32 x i8> @test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i32 %mask) {
@@ -2659,11 +1634,13 @@ define <32 x i8> @test_mask_packus_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %pt
 ; CHECK-NEXT:    vpackuswb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x67,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i16>, <16 x i16>* %ptr_b
-  %res = call <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16> %a, <16 x i16> %b, <32 x i8> zeroinitializer, i32 %mask)
-  ret <32 x i8> %res
+  %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a, <16 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer
+  ret <32 x i8> %3
 }
 
-declare <32 x i8> @llvm.x86.avx512.mask.packuswb.256(<16 x i16>, <16 x i16>, <32 x i8>, i32)
+declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>)
 
 define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: test_mask_adds_epi16_rr_128:
@@ -2677,7 +1654,7 @@ define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_adds_epi16_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpaddsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0xd1]
 ; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -2688,7 +1665,7 @@ define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x
 define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_adds_epi16_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.padds.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
@@ -2708,7 +1685,7 @@ define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
 define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_adds_epi16_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpaddsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0x0f]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -2720,7 +1697,7 @@ define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b,
 define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_adds_epi16_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i16>, <8 x i16>* %ptr_b
@@ -2742,7 +1719,7 @@ define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
 define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_adds_epi16_rrk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0xd1]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -2753,7 +1730,7 @@ define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <1
 define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_adds_epi16_rrkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
@@ -2773,7 +1750,7 @@ define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b
 define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_adds_epi16_rmk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpaddsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0x0f]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -2785,7 +1762,7 @@ define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_
 define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
 ; CHECK-LABEL: test_mask_adds_epi16_rmkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i16>, <16 x i16>* %ptr_b
@@ -2807,7 +1784,7 @@ define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_subs_epi16_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsubsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0xd1]
 ; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -2818,7 +1795,7 @@ define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x
 define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_subs_epi16_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.psubs.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
@@ -2838,7 +1815,7 @@ define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
 define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_subs_epi16_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsubsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x0f]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -2850,7 +1827,7 @@ define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b,
 define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_subs_epi16_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i16>, <8 x i16>* %ptr_b
@@ -2872,7 +1849,7 @@ define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) {
 define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_subs_epi16_rrk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0xd1]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -2883,7 +1860,7 @@ define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <1
 define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_subs_epi16_rrkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
@@ -2903,7 +1880,7 @@ define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b
 define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_subs_epi16_rmk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsubsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0x0f]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -2915,7 +1892,7 @@ define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_
 define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
 ; CHECK-LABEL: test_mask_subs_epi16_rmkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i16>, <16 x i16>* %ptr_b
@@ -2937,7 +1914,7 @@ define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_adds_epu16_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpaddusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0xd1]
 ; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -2948,7 +1925,7 @@ define <8 x i16> @test_mask_adds_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x
 define <8 x i16> @test_mask_adds_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_adds_epu16_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpaddusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.paddus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
@@ -2968,7 +1945,7 @@ define <8 x i16> @test_mask_adds_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
 define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_adds_epu16_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpaddusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdd,0x0f]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -2980,7 +1957,7 @@ define <8 x i16> @test_mask_adds_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b,
 define <8 x i16> @test_mask_adds_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_adds_epu16_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpaddusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdd,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i16>, <8 x i16>* %ptr_b
@@ -3002,7 +1979,7 @@ define <16 x i16> @test_mask_adds_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
 define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_adds_epu16_rrk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0xd1]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -3013,7 +1990,7 @@ define <16 x i16> @test_mask_adds_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <1
 define <16 x i16> @test_mask_adds_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_adds_epu16_rrkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpaddusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.paddus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
@@ -3033,7 +2010,7 @@ define <16 x i16> @test_mask_adds_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b
 define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_adds_epu16_rmk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpaddusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xdd,0x0f]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -3045,7 +2022,7 @@ define <16 x i16> @test_mask_adds_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_
 define <16 x i16> @test_mask_adds_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
 ; CHECK-LABEL: test_mask_adds_epu16_rmkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpaddusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xdd,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i16>, <16 x i16>* %ptr_b
@@ -3067,7 +2044,7 @@ define <8 x i16> @test_mask_subs_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
 define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_subs_epu16_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsubusw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0xd1]
 ; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -3078,7 +2055,7 @@ define <8 x i16> @test_mask_subs_epu16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x
 define <8 x i16> @test_mask_subs_epu16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_subs_epu16_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsubusw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.psubus.w.128(<8 x i16> %a, <8 x i16> %b, <8 x i16> zeroinitializer, i8 %mask)
@@ -3098,7 +2075,7 @@ define <8 x i16> @test_mask_subs_epu16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
 define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_subs_epu16_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsubusw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd9,0x0f]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -3110,7 +2087,7 @@ define <8 x i16> @test_mask_subs_epu16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b,
 define <8 x i16> @test_mask_subs_epu16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_subs_epu16_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsubusw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd9,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i16>, <8 x i16>* %ptr_b
@@ -3132,7 +2109,7 @@ define <16 x i16> @test_mask_subs_epu16_rr_256(<16 x i16> %a, <16 x i16> %b) {
 define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_subs_epu16_rrk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0xd1]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -3143,7 +2120,7 @@ define <16 x i16> @test_mask_subs_epu16_rrk_256(<16 x i16> %a, <16 x i16> %b, <1
 define <16 x i16> @test_mask_subs_epu16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_subs_epu16_rrkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsubusw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.psubus.w.256(<16 x i16> %a, <16 x i16> %b, <16 x i16> zeroinitializer, i16 %mask)
@@ -3163,7 +2140,7 @@ define <16 x i16> @test_mask_subs_epu16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b
 define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_subs_epu16_rmk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsubusw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd9,0x0f]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -3175,7 +2152,7 @@ define <16 x i16> @test_mask_subs_epu16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_
 define <16 x i16> @test_mask_subs_epu16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) {
 ; CHECK-LABEL: test_mask_subs_epu16_rmkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsubusw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd9,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i16>, <16 x i16>* %ptr_b
@@ -3197,7 +2174,7 @@ define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
 define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_adds_epi8_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpaddsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0xd1]
 ; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -3208,7 +2185,7 @@ define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x
 define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_adds_epi8_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i8> @llvm.x86.avx512.mask.padds.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
@@ -3228,7 +2205,7 @@ define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
 define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_adds_epi8_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpaddsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0x0f]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -3240,7 +2217,7 @@ define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <
 define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
 ; CHECK-LABEL: test_mask_adds_epi8_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i8>, <16 x i8>* %ptr_b
@@ -3327,7 +2304,7 @@ define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) {
 define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_subs_epi8_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsubsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0xd1]
 ; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -3338,7 +2315,7 @@ define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x
 define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_subs_epi8_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i8> @llvm.x86.avx512.mask.psubs.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
@@ -3358,7 +2335,7 @@ define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
 define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_subs_epi8_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsubsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0x0f]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -3370,7 +2347,7 @@ define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <
 define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
 ; CHECK-LABEL: test_mask_subs_epi8_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i8>, <16 x i8>* %ptr_b
@@ -3457,7 +2434,7 @@ define <16 x i8> @test_mask_adds_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
 define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_adds_epu8_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpaddusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0xd1]
 ; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -3468,7 +2445,7 @@ define <16 x i8> @test_mask_adds_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x
 define <16 x i8> @test_mask_adds_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_adds_epu8_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpaddusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i8> @llvm.x86.avx512.mask.paddus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
@@ -3488,7 +2465,7 @@ define <16 x i8> @test_mask_adds_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
 define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_adds_epu8_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpaddusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xdc,0x0f]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -3500,7 +2477,7 @@ define <16 x i8> @test_mask_adds_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <
 define <16 x i8> @test_mask_adds_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
 ; CHECK-LABEL: test_mask_adds_epu8_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpaddusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xdc,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i8>, <16 x i8>* %ptr_b
@@ -3587,7 +2564,7 @@ define <16 x i8> @test_mask_subs_epu8_rr_128(<16 x i8> %a, <16 x i8> %b) {
 define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_subs_epu8_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsubusb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0xd1]
 ; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -3598,7 +2575,7 @@ define <16 x i8> @test_mask_subs_epu8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x
 define <16 x i8> @test_mask_subs_epu8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) {
 ; CHECK-LABEL: test_mask_subs_epu8_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsubusb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i8> @llvm.x86.avx512.mask.psubus.b.128(<16 x i8> %a, <16 x i8> %b, <16 x i8> zeroinitializer, i16 %mask)
@@ -3618,7 +2595,7 @@ define <16 x i8> @test_mask_subs_epu8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) {
 define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) {
 ; CHECK-LABEL: test_mask_subs_epu8_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsubusb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd8,0x0f]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -3630,7 +2607,7 @@ define <16 x i8> @test_mask_subs_epu8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <
 define <16 x i8> @test_mask_subs_epu8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) {
 ; CHECK-LABEL: test_mask_subs_epu8_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpsubusb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd8,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <16 x i8>, <16 x i8>* %ptr_b
@@ -3710,11 +2687,11 @@ declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>,
 define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
-; CHECK-NEXT:    vpermt2w %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7d,0xda]
-; CHECK-NEXT:    vpermt2w %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xca]
-; CHECK-NEXT:    vpaddw %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfd,0xc1]
+; CHECK-NEXT:    vpermt2w %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda]
+; CHECK-NEXT:    vpermt2w %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7d,0xca]
+; CHECK-NEXT:    vpaddw %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -3727,11 +2704,11 @@ declare <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16>, <8 x i16>,
 define <8 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
-; CHECK-NEXT:    vpermt2w %xmm2, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x7d,0xda]
-; CHECK-NEXT:    vpermt2w %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xca]
-; CHECK-NEXT:    vpaddw %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfd,0xc1]
+; CHECK-NEXT:    vpermt2w %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x7d,0xda]
+; CHECK-NEXT:    vpermt2w %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x7d,0xca]
+; CHECK-NEXT:    vpaddw %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   %res1 = call <8 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -3744,11 +2721,11 @@ declare <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16>, <16 x i16
 define <16 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
-; CHECK-NEXT:    vpermt2w %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7d,0xda]
-; CHECK-NEXT:    vpermt2w %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xca]
-; CHECK-NEXT:    vpaddw %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfd,0xc1]
+; CHECK-NEXT:    vpermt2w %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda]
+; CHECK-NEXT:    vpermt2w %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7d,0xca]
+; CHECK-NEXT:    vpaddw %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -3761,11 +2738,11 @@ declare <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16>, <16 x i1
 define <16 x i16>@test_int_x86_avx512_maskz_vpermt2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_hi_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
-; CHECK-NEXT:    vpermt2w %ymm2, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x7d,0xda]
-; CHECK-NEXT:    vpermt2w %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xca]
-; CHECK-NEXT:    vpaddw %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfd,0xc1]
+; CHECK-NEXT:    vpermt2w %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x7d,0xda]
+; CHECK-NEXT:    vpermt2w %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x7d,0xca]
+; CHECK-NEXT:    vpaddw %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
   %res1 = call <16 x i16> @llvm.x86.avx512.maskz.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -3778,11 +2755,11 @@ declare <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16>, <8 x i16>,
 define <8 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
-; CHECK-NEXT:    vpermi2w %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x75,0xda]
-; CHECK-NEXT:    vpermi2w %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0xfd,0x08,0x75,0xca]
-; CHECK-NEXT:    vpaddw %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfd,0xc1]
+; CHECK-NEXT:    vpermi2w %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x75,0xda]
+; CHECK-NEXT:    vpermi2w %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x75,0xca]
+; CHECK-NEXT:    vpaddw %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -3795,11 +2772,11 @@ declare <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16>, <16 x i16
 define <16 x i16>@test_int_x86_avx512_mask_vpermi2var_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_hi_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x75,0xda]
-; CHECK-NEXT:    vpermi2w %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0xfd,0x28,0x75,0xca]
-; CHECK-NEXT:    vpaddw %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfd,0xc1]
+; CHECK-NEXT:    vpermi2w %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x75,0xda]
+; CHECK-NEXT:    vpermi2w %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x75,0xca]
+; CHECK-NEXT:    vpaddw %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -3812,7 +2789,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pavg.b.128(<16 x i8>, <16 x i8>, <16 x i
 define <16 x i8>@test_int_x86_avx512_mask_pavg_b_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pavg_b_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpavgb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe0,0xd1]
 ; CHECK-NEXT:    vpavgb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe0,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
@@ -3844,7 +2821,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pavg.w.128(<8 x i16>, <8 x i16>, <8 x i1
 define <8 x i16>@test_int_x86_avx512_mask_pavg_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pavg_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpavgw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe3,0xd1]
 ; CHECK-NEXT:    vpavgw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe3,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
@@ -3860,7 +2837,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pavg.w.256(<16 x i16>, <16 x i16>, <16
 define <16 x i16>@test_int_x86_avx512_mask_pavg_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pavg_w_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe3,0xd1]
 ; CHECK-NEXT:    vpavgw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe3,0xc1]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
@@ -3876,7 +2853,7 @@ declare <16 x i8> @llvm.x86.avx512.mask.pabs.b.128(<16 x i8>, <16 x i8>, i16)
 define <16 x i8>@test_int_x86_avx512_mask_pabs_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pabs_b_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpabsb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x1c,0xc8]
 ; CHECK-NEXT:    vpabsb %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1c,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
@@ -3908,7 +2885,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pabs.w.128(<8 x i16>, <8 x i16>, i8)
 define <8 x i16>@test_int_x86_avx512_mask_pabs_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pabs_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpabsw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x1d,0xc8]
 ; CHECK-NEXT:    vpabsw %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x1d,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
@@ -3924,7 +2901,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pabs.w.256(<16 x i16>, <16 x i16>, i16)
 define <16 x i16>@test_int_x86_avx512_mask_pabs_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pabs_w_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpabsw %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x1d,0xc8]
 ; CHECK-NEXT:    vpabsw %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x1d,0xc0]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
@@ -3940,7 +2917,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmulhu.w.128(<8 x i16>, <8 x i16>, <8 x
 define <8 x i16>@test_int_x86_avx512_mask_pmulhu_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmulhu_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe4,0xd1]
 ; CHECK-NEXT:    vpmulhuw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe4,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
@@ -3956,7 +2933,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmulhu.w.256(<16 x i16>, <16 x i16>, <1
 define <16 x i16>@test_int_x86_avx512_mask_pmulhu_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmulhu_w_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe4,0xd1]
 ; CHECK-NEXT:    vpmulhuw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe4,0xc1]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
@@ -3972,7 +2949,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmulh.w.128(<8 x i16>, <8 x i16>, <8 x i
 define <8 x i16>@test_int_x86_avx512_mask_pmulh_w_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmulh_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmulhw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe5,0xd1]
 ; CHECK-NEXT:    vpmulhw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe5,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
@@ -3988,7 +2965,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmulh.w.256(<16 x i16>, <16 x i16>, <16
 define <16 x i16>@test_int_x86_avx512_mask_pmulh_w_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmulh_w_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmulhw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe5,0xd1]
 ; CHECK-NEXT:    vpmulhw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe5,0xc1]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
@@ -4004,7 +2981,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.128(<8 x i16>, <8 x i16>, <8
 define <8 x i16>@test_int_x86_avx512_mask_pmulhr_sw_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmulhr_sw_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmulhrsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x0b,0xd1]
 ; CHECK-NEXT:    vpmulhrsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x0b,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
@@ -4020,7 +2997,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmul.hr.sw.256(<16 x i16>, <16 x i16>,
 define <16 x i16>@test_int_x86_avx512_mask_pmulhr_sw_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmulhr_sw_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x0b,0xd1]
 ; CHECK-NEXT:    vpmulhrsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x0b,0xc1]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
@@ -4036,9 +3013,9 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.128(<8 x i16>, <16 x i8>, i8)
 define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovwb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x30,0xc1]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmovwb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x30,0xc2]
+; CHECK-NEXT:    vpmovwb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x30,0xc1]
 ; CHECK-NEXT:    vpmovwb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x30,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2]
@@ -4056,7 +3033,7 @@ declare void @llvm.x86.avx512.mask.pmov.wb.mem.128(i8* %ptr, <8 x i16>, i8)
 define void @test_int_x86_avx512_mask_pmov_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpmovwb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x30,0x07]
 ; CHECK-NEXT:    vpmovwb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x30,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -4070,9 +3047,9 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.128(<8 x i16>, <16 x i8>, i8)
 define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovswb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x20,0xc1]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmovswb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x20,0xc2]
+; CHECK-NEXT:    vpmovswb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x20,0xc1]
 ; CHECK-NEXT:    vpmovswb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x20,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2]
@@ -4090,7 +3067,7 @@ declare void @llvm.x86.avx512.mask.pmovs.wb.mem.128(i8* %ptr, <8 x i16>, i8)
 define void @test_int_x86_avx512_mask_pmovs_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpmovswb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x20,0x07]
 ; CHECK-NEXT:    vpmovswb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x20,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -4104,9 +3081,9 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.128(<8 x i16>, <16 x i8>, i8)
 define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_128(<8 x i16> %x0, <16 x i8> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovuswb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x10,0xc1]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmovuswb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x10,0xc2]
+; CHECK-NEXT:    vpmovuswb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x10,0xc1]
 ; CHECK-NEXT:    vpmovuswb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x10,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2]
@@ -4124,7 +3101,7 @@ declare void @llvm.x86.avx512.mask.pmovus.wb.mem.128(i8* %ptr, <8 x i16>, i8)
 define void @test_int_x86_avx512_mask_pmovus_wb_mem_128(i8* %ptr, <8 x i16> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpmovuswb %xmm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x08,0x10,0x07]
 ; CHECK-NEXT:    vpmovuswb %xmm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x10,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -4138,9 +3115,9 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16)
 define <16 x i8>@test_int_x86_avx512_mask_pmov_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovwb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmovwb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x30,0xc2]
+; CHECK-NEXT:    vpmovwb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x30,0xc1]
 ; CHECK-NEXT:    vpmovwb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x30,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2]
@@ -4158,7 +3135,7 @@ declare void @llvm.x86.avx512.mask.pmov.wb.mem.256(i8* %ptr, <16 x i16>, i16)
 define void @test_int_x86_avx512_mask_pmov_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_wb_mem_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpmovwb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x30,0x07]
 ; CHECK-NEXT:    vpmovwb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x30,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -4172,9 +3149,9 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16)
 define <16 x i8>@test_int_x86_avx512_mask_pmovs_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovswb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x20,0xc1]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmovswb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x20,0xc2]
+; CHECK-NEXT:    vpmovswb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x20,0xc1]
 ; CHECK-NEXT:    vpmovswb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x20,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2]
@@ -4192,7 +3169,7 @@ declare void @llvm.x86.avx512.mask.pmovs.wb.mem.256(i8* %ptr, <16 x i16>, i16)
 define void @test_int_x86_avx512_mask_pmovs_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_wb_mem_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpmovswb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x20,0x07]
 ; CHECK-NEXT:    vpmovswb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x20,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -4206,9 +3183,9 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16
 define <16 x i8>@test_int_x86_avx512_mask_pmovus_wb_256(<16 x i16> %x0, <16 x i8> %x1, i16 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovuswb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x10,0xc1]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmovuswb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x10,0xc2]
+; CHECK-NEXT:    vpmovuswb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x10,0xc1]
 ; CHECK-NEXT:    vpmovuswb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x10,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc2]
@@ -4226,7 +3203,7 @@ declare void @llvm.x86.avx512.mask.pmovus.wb.mem.256(i8* %ptr, <16 x i16>, i16)
 define void @test_int_x86_avx512_mask_pmovus_wb_mem_256(i8* %ptr, <16 x i16> %x1, i16 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_wb_mem_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpmovuswb %ymm0, (%rdi) ## encoding: [0x62,0xf2,0x7e,0x28,0x10,0x07]
 ; CHECK-NEXT:    vpmovuswb %ymm0, (%rdi) {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x10,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -4240,7 +3217,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pmaddw.d.128(<8 x i16>, <8 x i16>, <4 x
 define <4 x i32>@test_int_x86_avx512_mask_pmaddw_d_128(<8 x i16> %x0, <8 x i16> %x1, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xf5,0xd1]
 ; CHECK-NEXT:    vpmaddwd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xf5,0xc1]
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
@@ -4256,7 +3233,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pmaddw.d.256(<16 x i16>, <16 x i16>, <8
 define <8 x i32>@test_int_x86_avx512_mask_pmaddw_d_256(<16 x i16> %x0, <16 x i16> %x1, <8 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaddw_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xf5,0xd1]
 ; CHECK-NEXT:    vpmaddwd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xf5,0xc1]
 ; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
@@ -4272,7 +3249,7 @@ declare <8 x i16> @llvm.x86.avx512.mask.pmaddubs.w.128(<16 x i8>, <16 x i8>, <8
 define <8 x i16>@test_int_x86_avx512_mask_pmaddubs_w_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x04,0xd1]
 ; CHECK-NEXT:    vpmaddubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x04,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
@@ -4288,7 +3265,7 @@ declare <16 x i16> @llvm.x86.avx512.mask.pmaddubs.w.256(<32 x i8>, <32 x i8>, <1
 define <16 x i16>@test_int_x86_avx512_mask_pmaddubs_w_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmaddubs_w_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x04,0xd1]
 ; CHECK-NEXT:    vpmaddubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x04,0xc1]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
@@ -4304,12 +3281,12 @@ declare <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8>, <16 x i8>, i32,
 define <8 x i16>@test_int_x86_avx512_mask_dbpsadbw_128(<16 x i8> %x0, <16 x i8> %x1, <8 x i16> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf3,0x7d,0x08,0x42,0xd9,0x02]
 ; CHECK-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x42,0xd1,0x02]
-; CHECK-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x42,0xd9,0x02]
-; CHECK-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x42,0xc1,0x02]
-; CHECK-NEXT:    vpaddw %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xcb]
-; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT:    vdbpsadbw $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x42,0xc1,0x02]
+; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
+; CHECK-NEXT:    vpaddw %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfd,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> %x3, i8 %x4)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.dbpsadbw.128(<16 x i8> %x0, <16 x i8> %x1, i32 2, <8 x i16> zeroinitializer, i8 %x4)
@@ -4324,12 +3301,12 @@ declare <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8>, <32 x i8>, i32,
 define <16 x i16>@test_int_x86_avx512_mask_dbpsadbw_256(<32 x i8> %x0, <32 x i8> %x1, <16 x i16> %x3, i16 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_dbpsadbw_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf3,0x7d,0x28,0x42,0xd9,0x02]
 ; CHECK-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x42,0xd1,0x02]
-; CHECK-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xd9,0x02]
-; CHECK-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x42,0xc1,0x02]
-; CHECK-NEXT:    vpaddw %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xcb]
-; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; CHECK-NEXT:    vdbpsadbw $2, %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x42,0xc1,0x02]
+; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
+; CHECK-NEXT:    vpaddw %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> %x3, i16 %x4)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.dbpsadbw.256(<32 x i8> %x0, <32 x i8> %x1, i32 2, <16 x i16> zeroinitializer, i16 %x4)
@@ -4345,7 +3322,8 @@ define i16@test_int_x86_avx512_cvtb2mask_128(<16 x i8> %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_cvtb2mask_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpmovb2m %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x29,0xc0]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
     %res = call i16 @llvm.x86.avx512.cvtb2mask.128(<16 x i8> %x0)
     ret i16 %res
@@ -4369,7 +3347,8 @@ define i8@test_int_x86_avx512_cvtw2mask_128(<8 x i16> %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_cvtw2mask_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpmovw2m %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x29,0xc0]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
     %res = call i8 @llvm.x86.avx512.cvtw2mask.128(<8 x i16> %x0)
     ret i8 %res
@@ -4381,67 +3360,20 @@ define i16@test_int_x86_avx512_cvtw2mask_256(<16 x i16> %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_cvtw2mask_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpmovw2m %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x29,0xc0]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
     %res = call i16 @llvm.x86.avx512.cvtw2mask.256(<16 x i16> %x0)
     ret i16 %res
 }
 
-declare <16 x i8> @llvm.x86.avx512.cvtmask2b.128(i16)
-
-define <16 x i8>@test_int_x86_avx512_cvtmask2b_128(i16 %x0) {
-; CHECK-LABEL: test_int_x86_avx512_cvtmask2b_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7]
-; CHECK-NEXT:    vpmovm2b %k0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x28,0xc0]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i8> @llvm.x86.avx512.cvtmask2b.128(i16 %x0)
-  ret <16 x i8> %res
-}
-
-declare <32 x i8> @llvm.x86.avx512.cvtmask2b.256(i32)
-
-define <32 x i8>@test_int_x86_avx512_cvtmask2b_256(i32 %x0) {
-; CHECK-LABEL: test_int_x86_avx512_cvtmask2b_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovd %edi, %k0 ## encoding: [0xc5,0xfb,0x92,0xc7]
-; CHECK-NEXT:    vpmovm2b %k0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x28,0xc0]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx512.cvtmask2b.256(i32 %x0)
-  ret <32 x i8> %res
-}
-
-declare <8 x i16> @llvm.x86.avx512.cvtmask2w.128(i8)
-
-define <8 x i16>@test_int_x86_avx512_cvtmask2w_128(i8 %x0) {
-; CHECK-LABEL: test_int_x86_avx512_cvtmask2w_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7]
-; CHECK-NEXT:    vpmovm2w %k0, %xmm0 ## encoding: [0x62,0xf2,0xfe,0x08,0x28,0xc0]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x i16> @llvm.x86.avx512.cvtmask2w.128(i8 %x0)
-  ret <8 x i16> %res
-}
-
-declare <16 x i16> @llvm.x86.avx512.cvtmask2w.256(i16)
-
-define <16 x i16>@test_int_x86_avx512_cvtmask2w_256(i16 %x0) {
-; CHECK-LABEL: test_int_x86_avx512_cvtmask2w_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7]
-; CHECK-NEXT:    vpmovm2w %k0, %ymm0 ## encoding: [0x62,0xf2,0xfe,0x28,0x28,0xc0]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx512.cvtmask2w.256(i16 %x0)
-  ret <16 x i16> %res
-}
-
 declare <16 x i16> @llvm.x86.avx512.mask.psrlv16.hi(<16 x i16>, <16 x i16>, <16 x i16>, i16)
 
 define <16 x i16>@test_int_x86_avx512_mask_psrlv16_hi(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrlv16_hi:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x10,0xd9]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x10,0xd1]
 ; CHECK-NEXT:    vpsrlvw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x10,0xc1]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
@@ -4461,7 +3393,7 @@ define <8 x i16>@test_int_x86_avx512_mask_psrlv8_hi(<8 x i16> %x0, <8 x i16> %x1
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrlv8_hi:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x10,0xd9]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x10,0xd1]
 ; CHECK-NEXT:    vpsrlvw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x10,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
@@ -4481,7 +3413,7 @@ define <16 x i16>@test_int_x86_avx512_mask_psrav16_hi(<16 x i16> %x0, <16 x i16>
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrav16_hi:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpsravw %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x11,0xd9]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsravw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x11,0xd1]
 ; CHECK-NEXT:    vpsravw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x11,0xc1]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
@@ -4501,7 +3433,7 @@ define <8 x i16>@test_int_x86_avx512_mask_psrav8_hi(<8 x i16> %x0, <8 x i16> %x1
 ; CHECK-LABEL: test_int_x86_avx512_mask_psrav8_hi:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpsravw %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x11,0xd9]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsravw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x11,0xd1]
 ; CHECK-NEXT:    vpsravw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x11,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
@@ -4521,7 +3453,7 @@ define <16 x i16>@test_int_x86_avx512_mask_psllv16_hi(<16 x i16> %x0, <16 x i16>
 ; CHECK-LABEL: test_int_x86_avx512_mask_psllv16_hi:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpsllvw %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x12,0xd9]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsllvw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x12,0xd1]
 ; CHECK-NEXT:    vpsllvw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x12,0xc1]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
@@ -4541,7 +3473,7 @@ define <8 x i16>@test_int_x86_avx512_mask_psllv8_hi(<8 x i16> %x0, <8 x i16> %x1
 ; CHECK-LABEL: test_int_x86_avx512_mask_psllv8_hi:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpsllvw %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x12,0xd9]
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpsllvw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x12,0xd1]
 ; CHECK-NEXT:    vpsllvw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x12,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
@@ -4560,12 +3492,12 @@ declare <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16>, <8 x i16>, <8
 define <8 x i16>@test_int_x86_avx512_mask_permvar_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_hi_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpermw %xmm0, %xmm1, %xmm3 ## encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xd8]
 ; CHECK-NEXT:    vpermw %xmm0, %xmm1, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0x8d,0xd0]
-; CHECK-NEXT:    vpermw %xmm0, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xd8]
-; CHECK-NEXT:    vpermw %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0xf5,0x08,0x8d,0xc0]
-; CHECK-NEXT:    vpaddw %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xcb]
-; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
+; CHECK-NEXT:    vpermw %xmm0, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0x89,0x8d,0xc0]
+; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
+; CHECK-NEXT:    vpaddw %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
   %res1 = call <8 x i16> @llvm.x86.avx512.mask.permvar.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> zeroinitializer, i8 %x3)
@@ -4580,12 +3512,12 @@ declare <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16>, <16 x i16>,
 define <16 x i16>@test_int_x86_avx512_mask_permvar_hi_256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_hi_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xd8]
 ; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0x8d,0xd0]
-; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xd8]
-; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0x8d,0xc0]
-; CHECK-NEXT:    vpaddw %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xcb]
-; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; CHECK-NEXT:    vpermw %ymm0, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xa9,0x8d,0xc0]
+; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
+; CHECK-NEXT:    vpaddw %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 %x3)
   %res1 = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> zeroinitializer, i16 %x3)
@@ -4600,11 +3532,11 @@ declare i16 @llvm.x86.avx512.ptestm.b.128(<16 x i8>, <16 x i8>, i16)
 define i16@test_int_x86_avx512_ptestm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_ptestm_b_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vptestmb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x26,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vptestmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x08,0x26,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT:    vptestmb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x26,0xc1]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
 ; CHECK-NEXT:    addl %ecx, %eax ## encoding: [0x01,0xc8]
 ; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -4637,12 +3569,13 @@ declare i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16>, <8 x i16>, i8)
 define i8@test_int_x86_avx512_ptestm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_ptestm_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vptestmw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x26,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vptestmw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x26,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT:    vptestmw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x26,0xc1]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
 ; CHECK-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
   %res1 = call i8 @llvm.x86.avx512.ptestm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8-1)
@@ -4655,11 +3588,11 @@ declare i16 @llvm.x86.avx512.ptestm.w.256(<16 x i16>, <16 x i16>, i16)
 define i16@test_int_x86_avx512_ptestm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_ptestm_w_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vptestmw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x26,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vptestmw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x26,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT:    vptestmw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x26,0xc1]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
 ; CHECK-NEXT:    addl %ecx, %eax ## encoding: [0x01,0xc8]
 ; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -4674,11 +3607,11 @@ declare i16 @llvm.x86.avx512.ptestnm.b.128(<16 x i8>, <16 x i8>, i16)
 define i16@test_int_x86_avx512_ptestnm_b_128(<16 x i8> %x0, <16 x i8> %x1, i16 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_ptestnm_b_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vptestnmb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x26,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vptestnmb %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x26,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT:    vptestnmb %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x26,0xc1]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
 ; CHECK-NEXT:    addl %ecx, %eax ## encoding: [0x01,0xc8]
 ; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -4711,12 +3644,13 @@ declare i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16>, <8 x i16>, i8 %x2)
 define i8@test_int_x86_avx512_ptestnm_w_128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vptestnmw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x09,0x26,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vptestnmw %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x26,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT:    vptestnmw %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x09,0x26,0xc1]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
 ; CHECK-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8 %x2)
   %res1 = call i8 @llvm.x86.avx512.ptestnm.w.128(<8 x i16> %x0, <8 x i16> %x1, i8-1)
@@ -4729,11 +3663,11 @@ declare i16 @llvm.x86.avx512.ptestnm.w.256(<16 x i16>, <16 x i16>, i16 %x2)
 define i16@test_int_x86_avx512_ptestnm_w_256(<16 x i16> %x0, <16 x i16> %x1, i16 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_ptestnm_w_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vptestnmw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x29,0x26,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vptestnmw %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x26,0xc1]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovd %k0, %ecx ## encoding: [0xc5,0xfb,0x93,0xc8]
+; CHECK-NEXT:    vptestnmw %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0xfe,0x29,0x26,0xc1]
+; CHECK-NEXT:    kmovd %k0, %eax ## encoding: [0xc5,0xfb,0x93,0xc0]
 ; CHECK-NEXT:    addl %ecx, %eax ## encoding: [0x01,0xc8]
 ; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -4768,9 +3702,9 @@ declare <16 x i8> @llvm.x86.avx512.mask.pbroadcast.b.gpr.128(i8, <16 x i8>, i16)
 define <16 x i8>@test_int_x86_avx512_mask_pbroadcast_b_gpr_128(i8 %x0, <16 x i8> %x1, i16 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_b_gpr_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vpbroadcastb %dil, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpbroadcastb %dil, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7a,0xcf]
+; CHECK-NEXT:    vpbroadcastb %dil, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7a,0xc7]
 ; CHECK-NEXT:    vpbroadcastb %dil, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7a,0xd7]
 ; CHECK-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
@@ -4788,9 +3722,9 @@ declare <16 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.256(i16, <16 x i16>, i
 define <16 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_256(i16 %x0, <16 x i16> %x1, i16 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vpbroadcastw %di, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpbroadcastw %di, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7b,0xcf]
+; CHECK-NEXT:    vpbroadcastw %di, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7b,0xc7]
 ; CHECK-NEXT:    vpbroadcastw %di, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7b,0xd7]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
@@ -4808,9 +3742,9 @@ declare <8 x i16> @llvm.x86.avx512.mask.pbroadcast.w.gpr.128(i16, <8 x i16>, i8)
 define <8 x i16>@test_int_x86_avx512_mask_pbroadcast_w_gpr_128(i16 %x0, <8 x i16> %x1, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_w_gpr_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vpbroadcastw %di, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7]
+; CHECK-NEXT:    kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT:    vpbroadcastw %di, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7b,0xcf]
+; CHECK-NEXT:    vpbroadcastw %di, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7b,0xc7]
 ; CHECK-NEXT:    vpbroadcastw %di, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7b,0xd7]
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xc0]
diff --git a/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e5dbff9ac5152310daa2717a294f85481f8610d9
--- /dev/null
+++ b/test/CodeGen/X86/avx512cd-intrinsics-upgrade.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s
+
+define <16 x i32> @test_lzcnt_d(<16 x i32> %a) {
+; CHECK-LABEL: test_lzcnt_d:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vplzcntd %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
+  ret <16 x i32> %res
+}
+
+declare <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
+
+define <8 x i64> @test_lzcnt_q(<8 x i64> %a) {
+; CHECK-LABEL: test_lzcnt_q:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vplzcntq %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
+
+
+define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_lzcnt_d:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vplzcntd %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.lzcnt.d.512(<16 x i32> %a, <16 x i32> %b, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_lzcnt_q:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vplzcntq %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.lzcnt.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
+  ret <8 x i64> %res
+}
diff --git a/test/CodeGen/X86/avx512cd-intrinsics.ll b/test/CodeGen/X86/avx512cd-intrinsics.ll
index febd3d69dd18099785d61e7ef18ad4ad0d402911..7e5a3e8fe25d6c5fe3ee4e7dd5fb3486e29b3060 100644
--- a/test/CodeGen/X86/avx512cd-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cd-intrinsics.ll
@@ -1,18 +1,104 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s
 
 define <16 x i32> @test_x86_vbroadcastmw_512(i16 %a0) {
-  ; CHECK: test_x86_vbroadcastmw_512
-  ; CHECK: vpbroadcastmw2d %k0, %zmm0
-  %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0) ; 
+; CHECK-LABEL: test_x86_vbroadcastmw_512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k0
+; CHECK-NEXT:    vpbroadcastmw2d %k0, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16 %a0)
   ret <16 x i32> %res
 }
 declare <16 x i32> @llvm.x86.avx512.broadcastmw.512(i16)
 
 define <8 x i64> @test_x86_broadcastmb_512(i8 %a0) {
-  ; CHECK: test_x86_broadcastmb_512
-  ; CHECK: vpbroadcastmb2q %k0, %zmm0
-  %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0) ; 
+; CHECK-LABEL: test_x86_broadcastmb_512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k0
+; CHECK-NEXT:    vpbroadcastmb2q %k0, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8 %a0)
   ret <8 x i64> %res
 }
 declare <8 x i64> @llvm.x86.avx512.broadcastmb.512(i8)
 
+declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
+
+define <8 x i64> @test_conflict_q(<8 x i64> %a) {
+; CHECK-LABEL: test_conflict_q:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vpconflictq %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
+
+define <16 x i32> @test_maskz_conflict_d(<16 x i32> %a, i16 %mask) {
+; CHECK-LABEL: test_maskz_conflict_d:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpconflictd %zmm0, %zmm0 {%k1} {z}
+; CHECK-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %res
+}
+
+define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_conflict_q:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vpconflictq %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
+  ret <8 x i64> %res
+}
+
+define <16 x i32> @test_lzcnt_d(<16 x i32> %a) {
+; CHECK-LABEL: test_lzcnt_d:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vplzcntd %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %1 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
+  ret <16 x i32> %1
+}
+declare <16 x i32> @llvm.ctlz.v16i32(<16 x i32>, i1) #0
+
+define <8 x i64> @test_lzcnt_q(<8 x i64> %a) {
+; CHECK-LABEL: test_lzcnt_q:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vplzcntq %zmm0, %zmm0
+; CHECK-NEXT:    retq
+  %1 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
+  ret <8 x i64> %1
+}
+declare <8 x i64> @llvm.ctlz.v8i64(<8 x i64>, i1) #0
+
+define <16 x i32> @test_mask_lzcnt_d(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+; CHECK-LABEL: test_mask_lzcnt_d:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vplzcntd %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %1 = call <16 x i32> @llvm.ctlz.v16i32(<16 x i32> %a, i1 false)
+  %2 = bitcast i16 %mask to <16 x i1>
+  %3 = select <16 x i1> %2, <16 x i32> %1, <16 x i32> %b
+  ret <16 x i32> %3
+}
+
+define <8 x i64> @test_mask_lzcnt_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_lzcnt_q:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vplzcntq %zmm0, %zmm1 {%k1}
+; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0
+; CHECK-NEXT:    retq
+  %1 = call <8 x i64> @llvm.ctlz.v8i64(<8 x i64> %a, i1 false)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i64> %1, <8 x i64> %b
+  ret <8 x i64> %3
+}
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8f528394f5bd50b1f003e3635e34a591b1bd3c59
--- /dev/null
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
@@ -0,0 +1,71 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl| FileCheck %s
+
+declare <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32>, <4 x i32>, i8)
+
+define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vplzcntd %xmm0, %xmm2
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vplzcntd %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vplzcntd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
+  %res1 = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
+  %res3 = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %x2)
+  %res2 = add <4 x i32> %res, %res1
+  %res4 = add <4 x i32> %res2, %res3
+  ret <4 x i32> %res4
+}
+
+declare <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32>, <8 x i32>, i8)
+
+define <8 x i32>@test_int_x86_avx512_mask_vplzcnt_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vplzcntd %ymm0, %ymm2
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vplzcntd %ymm0, %ymm1 {%k1}
+; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
+; CHECK-NEXT:    retq
+  %res = call <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
+  %res1 = call <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1)
+  %res2 = add <8 x i32> %res, %res1
+  ret <8 x i32> %res2
+}
+
+declare <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64>, <2 x i64>, i8)
+
+define <2 x i64>@test_int_x86_avx512_mask_vplzcnt_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vplzcntq %xmm0, %xmm2
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vplzcntq %xmm0, %xmm1 {%k1}
+; CHECK-NEXT:    vpaddq %xmm2, %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
+  %res1 = call <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1)
+  %res2 = add <2 x i64> %res, %res1
+  ret <2 x i64> %res2
+}
+
+declare <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64>, <4 x i64>, i8)
+
+define <4 x i64>@test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vplzcntq %ymm0, %ymm2
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vplzcntq %ymm0, %ymm1 {%k1}
+; CHECK-NEXT:    vpaddq %ymm2, %ymm1, %ymm0
+; CHECK-NEXT:    retq
+  %res = call <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
+  %res1 = call <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1)
+  %res2 = add <4 x i64> %res, %res1
+  ret <4 x i64> %res2
+}
+
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
index b27b795b44096eac631078eea26a2087664061f9..37aea45e6107d343b87f0b60f8df5630b2e56bd0 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
@@ -1,75 +1,83 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl| FileCheck %s
 
-declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) nounwind readonly
-
-declare <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32>, <4 x i32>, i8)
-
-define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
+define <4 x i32> @test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_128:
 ; CHECK:       ## BB#0:
+; CHECK-NEXT:    vplzcntd %xmm0, %xmm2
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vplzcntd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT:    vplzcntd %xmm0, %xmm2 {%k1} {z}
-; CHECK-NEXT:    vplzcntd %xmm0, %xmm0
+; CHECK-NEXT:    vplzcntd %xmm0, %xmm0 {%k1} {z}
+; CHECK-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
-  %res = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
-  %res1 = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1)
-  %res3 = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> zeroinitializer, i8 %x2)
-  %res2 = add <4 x i32> %res, %res1
-  %res4 = add <4 x i32> %res2, %res3
+  %1 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false)
+  %2 = bitcast i8 %x2 to <8 x i1>
+  %extract1 = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract1, <4 x i32> %1, <4 x i32> %x1
+  %4 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false)
+  %5 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false)
+  %6 = bitcast i8 %x2 to <8 x i1>
+  %extract = shufflevector <8 x i1> %6, <8 x i1> %6, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %7 = select <4 x i1> %extract, <4 x i32> %5, <4 x i32> zeroinitializer
+  %res2 = add <4 x i32> %3, %4
+  %res4 = add <4 x i32> %res2, %7
   ret <4 x i32> %res4
 }
+declare <4 x i32> @llvm.ctlz.v4i32(<4 x i32>, i1) #0
 
-declare <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32>, <8 x i32>, i8)
-
-define <8 x i32>@test_int_x86_avx512_mask_vplzcnt_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
+define <8 x i32> @test_int_x86_avx512_mask_vplzcnt_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_d_256:
 ; CHECK:       ## BB#0:
+; CHECK-NEXT:    vplzcntd %ymm0, %ymm2
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vplzcntd %ymm0, %ymm1 {%k1}
-; CHECK-NEXT:    vplzcntd %ymm0, %ymm0
-; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpaddd %ymm2, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
-  %res = call <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
-  %res1 = call <8 x i32> @llvm.x86.avx512.mask.lzcnt.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 -1)
-  %res2 = add <8 x i32> %res, %res1
+  %1 = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %x0, i1 false)
+  %2 = bitcast i8 %x2 to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i32> %1, <8 x i32> %x1
+  %4 = call <8 x i32> @llvm.ctlz.v8i32(<8 x i32> %x0, i1 false)
+  %res2 = add <8 x i32> %3, %4
   ret <8 x i32> %res2
 }
+declare <8 x i32> @llvm.ctlz.v8i32(<8 x i32>, i1) #0
 
-declare <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64>, <2 x i64>, i8)
-
-define <2 x i64>@test_int_x86_avx512_mask_vplzcnt_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
+define <2 x i64> @test_int_x86_avx512_mask_vplzcnt_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_128:
 ; CHECK:       ## BB#0:
+; CHECK-NEXT:    vplzcntq %xmm0, %xmm2
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vplzcntq %xmm0, %xmm1 {%k1}
-; CHECK-NEXT:    vplzcntq %xmm0, %xmm0
-; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vpaddq %xmm2, %xmm1, %xmm0
 ; CHECK-NEXT:    retq
-  %res = call <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
-  %res1 = call <2 x i64> @llvm.x86.avx512.mask.lzcnt.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 -1)
-  %res2 = add <2 x i64> %res, %res1
+  %1 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %x0, i1 false)
+  %2 = bitcast i8 %x2 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <2 x i32> <i32 0, i32 1>
+  %3 = select <2 x i1> %extract, <2 x i64> %1, <2 x i64> %x1
+  %4 = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> %x0, i1 false)
+  %res2 = add <2 x i64> %3, %4
   ret <2 x i64> %res2
 }
+declare <2 x i64> @llvm.ctlz.v2i64(<2 x i64>, i1) #0
 
-declare <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64>, <4 x i64>, i8)
-
-define <4 x i64>@test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
+define <4 x i64> @test_int_x86_avx512_mask_vplzcnt_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vplzcnt_q_256:
 ; CHECK:       ## BB#0:
+; CHECK-NEXT:    vplzcntq %ymm0, %ymm2
 ; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vplzcntq %ymm0, %ymm1 {%k1}
-; CHECK-NEXT:    vplzcntq %ymm0, %ymm0
-; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpaddq %ymm2, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
-  %res = call <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
-  %res1 = call <4 x i64> @llvm.x86.avx512.mask.lzcnt.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 -1)
-  %res2 = add <4 x i64> %res, %res1
+  %1 = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %x0, i1 false)
+  %2 = bitcast i8 %x2 to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x i64> %1, <4 x i64> %x1
+  %4 = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> %x0, i1 false)
+  %res2 = add <4 x i64> %3, %4
   ret <4 x i64> %res2
 }
+declare <4 x i64> @llvm.ctlz.v4i64(<4 x i64>, i1) #0
 
 declare <4 x i32> @llvm.x86.avx512.mask.conflict.d.128(<4 x i32>, <4 x i32>, i8)
 
@@ -77,8 +85,8 @@ define <4 x i32>@test_int_x86_avx512_mask_vpconflict_d_128(<4 x i32> %x0, <4 x i
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpconflict_d_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vpconflictd %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpconflictd %xmm0, %xmm2 {%k1} {z}
+; CHECK-NEXT:    vpconflictd %xmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vpconflictd %xmm0, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    vpaddd %xmm2, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
index f25acb603753b70a3ff71c06ffb3d953303dd8c4..c5478dad42245eae2f94f7b5fe72b1dafec9db29 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
@@ -7,7 +7,7 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0,
 ; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vextractf64x2 $1, %zmm0, %xmm0
-; CHECK-NEXT:    kmovb %edi, %k0
+; CHECK-NEXT:    kmovw %edi, %k0
 ; CHECK-NEXT:    kshiftlb $7, %k0, %k1
 ; CHECK-NEXT:    kshiftrb $7, %k1, %k1
 ; CHECK-NEXT:    kshiftlb $6, %k0, %k0
@@ -18,8 +18,7 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_512(<8 x double> %x0,
 ; CHECK-NEXT:    vmovq %rax, %xmm3
 ; CHECK-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; CHECK-NEXT:    vpsllq $63, %xmm2, %xmm2
-; CHECK-NEXT:    vpsrad $31, %xmm2, %xmm2
-; CHECK-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
+; CHECK-NEXT:    vpsraq $63, %zmm2, %zmm2
 ; CHECK-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm1
 ; CHECK-NEXT:    vandpd %xmm0, %xmm2, %xmm2
 ; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0
@@ -39,7 +38,7 @@ define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x
 ; CHECK-LABEL: test_int_x86_avx512_mask_vextractf32x8:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vextractf32x8 $1, %zmm0, %ymm2
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextractf32x8 $1, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    vaddps %ymm2, %ymm1, %ymm1
@@ -79,7 +78,7 @@ define <8 x double>@test_int_x86_avx512_mask_insertf64x2_512(<8 x double> %x0, <
 ; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vinsertf64x2 $1, %xmm1, %zmm0, %zmm3
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vinsertf64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vinsertf64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
@@ -119,7 +118,7 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i6
 ; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vinserti64x2 $1, %xmm1, %zmm0, %zmm3
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vinserti64x2 $1, %xmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vinserti64x2 $1, %xmm1, %zmm0, %zmm0 {%k1} {z}
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm2, %zmm0
@@ -132,3 +131,28 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x2_512(<8 x i64> %x0, <2 x i6
   %res4 = add <8 x i64> %res2, %res3
   ret <8 x i64> %res4
 }
+
+
+declare <16 x i32> @llvm.x86.avx512.cvtmask2d.512(i16)
+
+define <16 x i32>@test_int_x86_avx512_cvtmask2d_512(i16 %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtmask2d_512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k0
+; CHECK-NEXT:    vpmovm2d %k0, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <16 x i32> @llvm.x86.avx512.cvtmask2d.512(i16 %x0)
+  ret <16 x i32> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.cvtmask2q.512(i8)
+
+define <8 x i64>@test_int_x86_avx512_cvtmask2q_512(i8 %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtmask2q_512:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k0
+; CHECK-NEXT:    vpmovm2q %k0, %zmm0
+; CHECK-NEXT:    retq
+  %res = call <8 x i64> @llvm.x86.avx512.cvtmask2q.512(i8 %x0)
+  ret <8 x i64> %res
+}
diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll
index d355c92061b32314219f86ec6851d28013a8568b..000390404b54d814b5f9dccfae5b7b10e09d5edb 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics.ll
@@ -6,7 +6,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.cvtpd2qq.512(<8 x double>, <8 x i64>, i8
 define <8 x i64>@test_int_x86_avx512_mask_cvt_pd2qq_512(<8 x double> %x0, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtpd2qq {ru-sae}, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vcvtpd2qq {rn-sae}, %zmm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
@@ -22,7 +22,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.512(<8 x double>, <8 x i64>, i
 define <8 x i64>@test_int_x86_avx512_mask_cvt_pd2uqq_512(<8 x double> %x0, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtpd2uqq {ru-sae}, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vcvtpd2uqq {rn-sae}, %zmm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
@@ -38,7 +38,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.cvtps2qq.512(<8 x float>, <8 x i64>, i8,
 define <8 x i64>@test_int_x86_avx512_mask_cvt_ps2qq_512(<8 x float> %x0, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtps2qq {ru-sae}, %ymm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vcvtps2qq {rn-sae}, %ymm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
@@ -54,7 +54,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.cvtps2uqq.512(<8 x float>, <8 x i64>, i8
 define <8 x i64>@test_int_x86_avx512_mask_cvt_ps2uqq_512(<8 x float> %x0, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtps2uqq {ru-sae}, %ymm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vcvtps2uqq {rn-sae}, %ymm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
@@ -70,7 +70,7 @@ declare <8 x double> @llvm.x86.avx512.mask.cvtqq2pd.512(<8 x i64>, <8 x double>,
 define <8 x double>@test_int_x86_avx512_mask_cvt_qq2pd_512(<8 x i64> %x0, <8 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtqq2pd %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vcvtqq2pd {rn-sae}, %zmm0, %zmm0
 ; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
@@ -86,7 +86,7 @@ declare <8 x float> @llvm.x86.avx512.mask.cvtqq2ps.512(<8 x i64>, <8 x float>, i
 define <8 x float>@test_int_x86_avx512_mask_cvt_qq2ps_512(<8 x i64> %x0, <8 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtqq2ps %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vcvtqq2ps {rn-sae}, %zmm0, %ymm0
 ; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
@@ -102,7 +102,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.cvttpd2qq.512(<8 x double>, <8 x i64>, i
 define <8 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_512(<8 x double> %x0, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvttpd2qq %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vcvttpd2qq {sae}, %zmm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
@@ -118,7 +118,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.512(<8 x double>, <8 x i64>,
 define <8 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_512(<8 x double> %x0, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvttpd2uqq %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vcvttpd2uqq {sae}, %zmm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
@@ -134,7 +134,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.cvttps2qq.512(<8 x float>, <8 x i64>, i8
 define <8 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_512(<8 x float> %x0, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvttps2qq %ymm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vcvttps2qq {sae}, %ymm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
@@ -150,7 +150,7 @@ declare <8 x i64> @llvm.x86.avx512.mask.cvttps2uqq.512(<8 x float>, <8 x i64>, i
 define <8 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_512(<8 x float> %x0, <8 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvttps2uqq %ymm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vcvttps2uqq {sae}, %ymm0, %zmm0
 ; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
@@ -166,7 +166,7 @@ declare <8 x double> @llvm.x86.avx512.mask.cvtuqq2pd.512(<8 x i64>, <8 x double>
 define <8 x double>@test_int_x86_avx512_mask_cvt_uqq2pd_512(<8 x i64> %x0, <8 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtuqq2pd %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vcvtuqq2pd {rn-sae}, %zmm0, %zmm0
 ; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
@@ -182,7 +182,7 @@ declare <8 x float> @llvm.x86.avx512.mask.cvtuqq2ps.512(<8 x i64>, <8 x float>,
 define <8 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_512(<8 x i64> %x0, <8 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vcvtuqq2ps %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vcvtuqq2ps {rn-sae}, %zmm0, %ymm0
 ; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0
@@ -198,7 +198,7 @@ declare <8 x double> @llvm.x86.avx512.mask.reduce.pd.512(<8 x double>, i32, <8 x
 define <8 x double>@test_int_x86_avx512_mask_reduce_pd_512(<8 x double> %x0, <8 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_reduce_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vreducepd $8, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT:    vreducepd $4, {sae}, %zmm0, %zmm0
 ; CHECK-NEXT:    vaddpd %zmm0, %zmm1, %zmm0
@@ -230,7 +230,7 @@ declare <8 x double> @llvm.x86.avx512.mask.range.pd.512(<8 x double>, <8 x doubl
 define <8 x double>@test_int_x86_avx512_mask_range_pd_512(<8 x double> %x0, <8 x double> %x1, <8 x double> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_range_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vrangepd $8, %zmm1, %zmm0, %zmm2 {%k1}
 ; CHECK-NEXT:    vrangepd $4, {sae}, %zmm1, %zmm0, %zmm0
 ; CHECK-NEXT:    vaddpd %zmm0, %zmm2, %zmm0
@@ -330,12 +330,13 @@ declare i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double>, i32, i8)
 define i8 @test_int_x86_avx512_mask_fpclass_pd_512(<8 x double> %x0, i8 %x1) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_pd_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vfpclasspd $2, %zmm0, %k0 {%k1}
-; CHECK-NEXT:    kmovb %k0, %ecx
+; CHECK-NEXT:    kmovw %k0, %ecx
 ; CHECK-NEXT:    vfpclasspd $4, %zmm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
+; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    addb %cl, %al
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
     %res = call i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double> %x0, i32 2, i8 %x1)
     %res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.512(<8 x double> %x0, i32 4, i8 -1)
@@ -348,9 +349,9 @@ define i16@test_int_x86_avx512_mask_fpclass_ps_512(<16 x float> %x0, i16 %x1) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ps_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vfpclassps $4, %zmm0, %k0
-; CHECK-NEXT:    kmovw %k0, %ecx
 ; CHECK-NEXT:    vfpclassps $4, %zmm0, %k0 {%k1}
+; CHECK-NEXT:    kmovw %k0, %ecx
+; CHECK-NEXT:    vfpclassps $4, %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
 ; CHECK-NEXT:    addl %ecx, %eax
 ; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
@@ -452,6 +453,7 @@ define i16@test_int_x86_avx512_cvtd2mask_512(<16 x i32> %x0) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpmovd2m %zmm0, %k0
 ; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AX<def> %AX<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %res = call i16 @llvm.x86.avx512.cvtd2mask.512(<16 x i32> %x0)
   ret i16 %res
@@ -463,36 +465,13 @@ define i8@test_int_x86_avx512_cvtq2mask_512(<8 x i64> %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpmovq2m %zmm0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
+; CHECK-NEXT:    kmovw %k0, %eax
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %res = call i8 @llvm.x86.avx512.cvtq2mask.512(<8 x i64> %x0)
   ret i8 %res
 }
 
-declare <16 x i32> @llvm.x86.avx512.cvtmask2d.512(i16)
-
-define <16 x i32>@test_int_x86_avx512_cvtmask2d_512(i16 %x0) {
-; CHECK-LABEL: test_int_x86_avx512_cvtmask2d_512:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k0
-; CHECK-NEXT:    vpmovm2d %k0, %zmm0
-; CHECK-NEXT:    retq
-  %res = call <16 x i32> @llvm.x86.avx512.cvtmask2d.512(i16 %x0)
-  ret <16 x i32> %res
-}
-
-declare <8 x i64> @llvm.x86.avx512.cvtmask2q.512(i8)
-
-define <8 x i64>@test_int_x86_avx512_cvtmask2q_512(i8 %x0) {
-; CHECK-LABEL: test_int_x86_avx512_cvtmask2q_512:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k0
-; CHECK-NEXT:    vpmovm2q %k0, %zmm0
-; CHECK-NEXT:    retq
-  %res = call <8 x i64> @llvm.x86.avx512.cvtmask2q.512(i8 %x0)
-  ret <8 x i64> %res
-}
-
 declare <16 x float> @llvm.x86.avx512.mask.broadcastf32x8.512(<8 x float>, <16 x float>, i16)
 
 define <16 x float>@test_int_x86_avx512_mask_broadcastf32x8_512(<8 x float> %x0, <16 x float> %x2, i16 %mask) {
@@ -534,7 +513,7 @@ define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512(<2 x double> %x0
 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
@@ -553,7 +532,7 @@ define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512(<2 x double> %x0
 define <8 x double>@test_int_x86_avx512_mask_broadcastf64x2_512_load(<2 x double>* %x0ptr, <8 x double> %x2, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_512_load:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovapd (%rdi), %xmm1
 ; CHECK-NEXT:    vshuff64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
@@ -604,7 +583,7 @@ define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512(<2 x i64> %x0, <8 x
 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_512:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm2 {%k1} {z} = zmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm1 {%k1} = zmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,0,1,0,1,0,1]
@@ -623,7 +602,7 @@ define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512(<2 x i64> %x0, <8 x
 define <8 x i64>@test_int_x86_avx512_mask_broadcasti64x2_512_load(<2 x i64>* %x0ptr, <8 x i64> %x2, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_512_load:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm1
 ; CHECK-NEXT:    vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm1[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
diff --git a/test/CodeGen/X86/avx512dq-mask-op.ll b/test/CodeGen/X86/avx512dq-mask-op.ll
index e83aa14d35e3c00961003ec43f27e5fc7d3efe57..f0ae1b0129a8cc218e66084eead8fc98f9c8b5ab 100644
--- a/test/CodeGen/X86/avx512dq-mask-op.ll
+++ b/test/CodeGen/X86/avx512dq-mask-op.ll
@@ -4,9 +4,10 @@
 define i8 @mask8(i8 %x) {
 ; CHECK-LABEL: mask8:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k0
+; CHECK-NEXT:    kmovd %edi, %k0
 ; CHECK-NEXT:    knotb %k0, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %m0 = bitcast i8 %x to <8 x i1>
   %m1 = xor <8 x i1> %m0, <i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1, i1 -1>
@@ -55,7 +56,8 @@ define i8 @mand8_mem(<8 x i1>* %x, <8 x i1>* %y) {
 ; CHECK-NEXT:    kandb %k1, %k0, %k2
 ; CHECK-NEXT:    kxorb %k1, %k0, %k0
 ; CHECK-NEXT:    korb %k0, %k2, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq
   %ma = load <8 x i1>, <8 x i1>* %x
   %mb = load <8 x i1>, <8 x i1>* %y
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
index f8460bf880f9e4932e99bb5c365e8e5fb24d22bb..52a84deebf51918b4177229a2f0cbfb72eb26e19 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
@@ -13,7 +13,7 @@ define <4 x float> @test_mask_andnot_ps_rr_128(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @test_mask_andnot_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_andnot_ps_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vandnps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x55,0xd1]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -24,7 +24,7 @@ define <4 x float> @test_mask_andnot_ps_rrk_128(<4 x float> %a, <4 x float> %b,
 define <4 x float> @test_mask_andnot_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_andnot_ps_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vandnps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x55,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x float> @llvm.x86.avx512.mask.andn.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
@@ -44,7 +44,7 @@ define <4 x float> @test_mask_andnot_ps_rm_128(<4 x float> %a, <4 x float>* %ptr
 define <4 x float> @test_mask_andnot_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b, <4 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_andnot_ps_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vandnps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x55,0x0f]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -56,7 +56,7 @@ define <4 x float> @test_mask_andnot_ps_rmk_128(<4 x float> %a, <4 x float>* %pt
 define <4 x float> @test_mask_andnot_ps_rmkz_128(<4 x float> %a, <4 x float>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_andnot_ps_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vandnps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x55,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <4 x float>, <4 x float>* %ptr_b
@@ -79,7 +79,7 @@ define <4 x float> @test_mask_andnot_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
 define <4 x float> @test_mask_andnot_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_andnot_ps_rmbk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vandnps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x55,0x0f]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -93,7 +93,7 @@ define <4 x float> @test_mask_andnot_ps_rmbk_128(<4 x float> %a, float* %ptr_b,
 define <4 x float> @test_mask_andnot_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_andnot_ps_rmbkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vandnps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x55,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load float, float* %ptr_b
@@ -117,7 +117,7 @@ define <8 x float> @test_mask_andnot_ps_rr_256(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @test_mask_andnot_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_andnot_ps_rrk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vandnps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x55,0xd1]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -128,7 +128,7 @@ define <8 x float> @test_mask_andnot_ps_rrk_256(<8 x float> %a, <8 x float> %b,
 define <8 x float> @test_mask_andnot_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_andnot_ps_rrkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vandnps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x55,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x float> @llvm.x86.avx512.mask.andn.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
@@ -148,7 +148,7 @@ define <8 x float> @test_mask_andnot_ps_rm_256(<8 x float> %a, <8 x float>* %ptr
 define <8 x float> @test_mask_andnot_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b, <8 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_andnot_ps_rmk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vandnps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x55,0x0f]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -160,7 +160,7 @@ define <8 x float> @test_mask_andnot_ps_rmk_256(<8 x float> %a, <8 x float>* %pt
 define <8 x float> @test_mask_andnot_ps_rmkz_256(<8 x float> %a, <8 x float>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_andnot_ps_rmkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vandnps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x55,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x float>, <8 x float>* %ptr_b
@@ -183,7 +183,7 @@ define <8 x float> @test_mask_andnot_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
 define <8 x float> @test_mask_andnot_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_andnot_ps_rmbk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vandnps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x55,0x0f]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -197,7 +197,7 @@ define <8 x float> @test_mask_andnot_ps_rmbk_256(<8 x float> %a, float* %ptr_b,
 define <8 x float> @test_mask_andnot_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_andnot_ps_rmbkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vandnps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x55,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load float, float* %ptr_b
@@ -325,7 +325,7 @@ define <4 x float> @test_mask_and_ps_rr_128(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @test_mask_and_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_and_ps_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vandps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x54,0xd1]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -336,7 +336,7 @@ define <4 x float> @test_mask_and_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4
 define <4 x float> @test_mask_and_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_and_ps_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vandps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x54,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x float> @llvm.x86.avx512.mask.and.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
@@ -356,7 +356,7 @@ define <4 x float> @test_mask_and_ps_rm_128(<4 x float> %a, <4 x float>* %ptr_b)
 define <4 x float> @test_mask_and_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b, <4 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_and_ps_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vandps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x54,0x0f]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -368,7 +368,7 @@ define <4 x float> @test_mask_and_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b
 define <4 x float> @test_mask_and_ps_rmkz_128(<4 x float> %a, <4 x float>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_and_ps_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vandps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x54,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <4 x float>, <4 x float>* %ptr_b
@@ -391,7 +391,7 @@ define <4 x float> @test_mask_and_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
 define <4 x float> @test_mask_and_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_and_ps_rmbk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vandps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x54,0x0f]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -405,7 +405,7 @@ define <4 x float> @test_mask_and_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4
 define <4 x float> @test_mask_and_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_and_ps_rmbkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vandps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x54,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load float, float* %ptr_b
@@ -429,7 +429,7 @@ define <8 x float> @test_mask_and_ps_rr_256(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @test_mask_and_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_and_ps_rrk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x54,0xd1]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -440,7 +440,7 @@ define <8 x float> @test_mask_and_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8
 define <8 x float> @test_mask_and_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_and_ps_rrkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vandps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x54,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x float> @llvm.x86.avx512.mask.and.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
@@ -460,7 +460,7 @@ define <8 x float> @test_mask_and_ps_rm_256(<8 x float> %a, <8 x float>* %ptr_b)
 define <8 x float> @test_mask_and_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b, <8 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_and_ps_rmk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vandps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x54,0x0f]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -472,7 +472,7 @@ define <8 x float> @test_mask_and_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b
 define <8 x float> @test_mask_and_ps_rmkz_256(<8 x float> %a, <8 x float>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_and_ps_rmkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vandps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x54,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x float>, <8 x float>* %ptr_b
@@ -495,7 +495,7 @@ define <8 x float> @test_mask_and_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
 define <8 x float> @test_mask_and_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_and_ps_rmbk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vandps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x54,0x0f]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -509,7 +509,7 @@ define <8 x float> @test_mask_and_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8
 define <8 x float> @test_mask_and_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_and_ps_rmbkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vandps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x54,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load float, float* %ptr_b
@@ -637,7 +637,7 @@ define <4 x float> @test_mask_or_ps_rr_128(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @test_mask_or_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_or_ps_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vorps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x56,0xd1]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -648,7 +648,7 @@ define <4 x float> @test_mask_or_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x
 define <4 x float> @test_mask_or_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_or_ps_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vorps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x56,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x float> @llvm.x86.avx512.mask.or.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
@@ -668,7 +668,7 @@ define <4 x float> @test_mask_or_ps_rm_128(<4 x float> %a, <4 x float>* %ptr_b)
 define <4 x float> @test_mask_or_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b, <4 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_or_ps_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vorps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x56,0x0f]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -680,7 +680,7 @@ define <4 x float> @test_mask_or_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b,
 define <4 x float> @test_mask_or_ps_rmkz_128(<4 x float> %a, <4 x float>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_or_ps_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vorps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x56,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <4 x float>, <4 x float>* %ptr_b
@@ -703,7 +703,7 @@ define <4 x float> @test_mask_or_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
 define <4 x float> @test_mask_or_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_or_ps_rmbk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vorps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x56,0x0f]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -717,7 +717,7 @@ define <4 x float> @test_mask_or_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x
 define <4 x float> @test_mask_or_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_or_ps_rmbkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vorps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x56,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load float, float* %ptr_b
@@ -741,7 +741,7 @@ define <8 x float> @test_mask_or_ps_rr_256(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @test_mask_or_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_or_ps_rrk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x56,0xd1]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -752,7 +752,7 @@ define <8 x float> @test_mask_or_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x
 define <8 x float> @test_mask_or_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_or_ps_rrkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vorps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x56,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x float> @llvm.x86.avx512.mask.or.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
@@ -772,7 +772,7 @@ define <8 x float> @test_mask_or_ps_rm_256(<8 x float> %a, <8 x float>* %ptr_b)
 define <8 x float> @test_mask_or_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b, <8 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_or_ps_rmk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vorps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x56,0x0f]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -784,7 +784,7 @@ define <8 x float> @test_mask_or_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b,
 define <8 x float> @test_mask_or_ps_rmkz_256(<8 x float> %a, <8 x float>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_or_ps_rmkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vorps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x56,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x float>, <8 x float>* %ptr_b
@@ -807,7 +807,7 @@ define <8 x float> @test_mask_or_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
 define <8 x float> @test_mask_or_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_or_ps_rmbk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vorps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x56,0x0f]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -821,7 +821,7 @@ define <8 x float> @test_mask_or_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x
 define <8 x float> @test_mask_or_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_or_ps_rmbkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vorps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x56,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load float, float* %ptr_b
@@ -949,7 +949,7 @@ define <4 x float> @test_mask_xor_ps_rr_128(<4 x float> %a, <4 x float> %b) {
 define <4 x float> @test_mask_xor_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_xor_ps_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vxorps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x57,0xd1]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -960,7 +960,7 @@ define <4 x float> @test_mask_xor_ps_rrk_128(<4 x float> %a, <4 x float> %b, <4
 define <4 x float> @test_mask_xor_ps_rrkz_128(<4 x float> %a, <4 x float> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_xor_ps_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vxorps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x57,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x float> @llvm.x86.avx512.mask.xor.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> zeroinitializer, i8 %mask)
@@ -980,7 +980,7 @@ define <4 x float> @test_mask_xor_ps_rm_128(<4 x float> %a, <4 x float>* %ptr_b)
 define <4 x float> @test_mask_xor_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b, <4 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_xor_ps_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vxorps (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x57,0x0f]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -992,7 +992,7 @@ define <4 x float> @test_mask_xor_ps_rmk_128(<4 x float> %a, <4 x float>* %ptr_b
 define <4 x float> @test_mask_xor_ps_rmkz_128(<4 x float> %a, <4 x float>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_xor_ps_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vxorps (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x57,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <4 x float>, <4 x float>* %ptr_b
@@ -1015,7 +1015,7 @@ define <4 x float> @test_mask_xor_ps_rmb_128(<4 x float> %a, float* %ptr_b) {
 define <4 x float> @test_mask_xor_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_xor_ps_rmbk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vxorps (%rdi){1to4}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x19,0x57,0x0f]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1029,7 +1029,7 @@ define <4 x float> @test_mask_xor_ps_rmbk_128(<4 x float> %a, float* %ptr_b, <4
 define <4 x float> @test_mask_xor_ps_rmbkz_128(<4 x float> %a, float* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_xor_ps_rmbkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vxorps (%rdi){1to4}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x99,0x57,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load float, float* %ptr_b
@@ -1053,7 +1053,7 @@ define <8 x float> @test_mask_xor_ps_rr_256(<8 x float> %a, <8 x float> %b) {
 define <8 x float> @test_mask_xor_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_xor_ps_rrk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x57,0xd1]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1064,7 +1064,7 @@ define <8 x float> @test_mask_xor_ps_rrk_256(<8 x float> %a, <8 x float> %b, <8
 define <8 x float> @test_mask_xor_ps_rrkz_256(<8 x float> %a, <8 x float> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_xor_ps_rrkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vxorps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x57,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x float> @llvm.x86.avx512.mask.xor.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> zeroinitializer, i8 %mask)
@@ -1084,7 +1084,7 @@ define <8 x float> @test_mask_xor_ps_rm_256(<8 x float> %a, <8 x float>* %ptr_b)
 define <8 x float> @test_mask_xor_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b, <8 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_xor_ps_rmk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vxorps (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x57,0x0f]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1096,7 +1096,7 @@ define <8 x float> @test_mask_xor_ps_rmk_256(<8 x float> %a, <8 x float>* %ptr_b
 define <8 x float> @test_mask_xor_ps_rmkz_256(<8 x float> %a, <8 x float>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_xor_ps_rmkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vxorps (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x57,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x float>, <8 x float>* %ptr_b
@@ -1119,7 +1119,7 @@ define <8 x float> @test_mask_xor_ps_rmb_256(<8 x float> %a, float* %ptr_b) {
 define <8 x float> @test_mask_xor_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8 x float> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_xor_ps_rmbk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vxorps (%rdi){1to8}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7c,0x39,0x57,0x0f]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1133,7 +1133,7 @@ define <8 x float> @test_mask_xor_ps_rmbk_256(<8 x float> %a, float* %ptr_b, <8
 define <8 x float> @test_mask_xor_ps_rmbkz_256(<8 x float> %a, float* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_xor_ps_rmbkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vxorps (%rdi){1to8}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xb9,0x57,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load float, float* %ptr_b
@@ -1261,7 +1261,7 @@ define <8 x i64> @test_mask_mullo_epi64_rr_512(<8 x i64> %a, <8 x i64> %b) {
 define <8 x i64> @test_mask_mullo_epi64_rrk_512(<8 x i64> %a, <8 x i64> %b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi64_rrk_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vpmullq %zmm1, %zmm0, %zmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x40,0xd1]
 ; CHECK-NEXT:    vmovdqa64 %zmm2, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1272,7 +1272,7 @@ define <8 x i64> @test_mask_mullo_epi64_rrk_512(<8 x i64> %a, <8 x i64> %b, <8 x
 define <8 x i64> @test_mask_mullo_epi64_rrkz_512(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi64_rrkz_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vpmullq %zmm1, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x40,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i64> @llvm.x86.avx512.mask.pmull.q.512(<8 x i64> %a, <8 x i64> %b, <8 x i64> zeroinitializer, i8 %mask)
@@ -1292,7 +1292,7 @@ define <8 x i64> @test_mask_mullo_epi64_rm_512(<8 x i64> %a, <8 x i64>* %ptr_b)
 define <8 x i64> @test_mask_mullo_epi64_rmk_512(<8 x i64> %a, <8 x i64>* %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi64_rmk_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vpmullq (%rdi), %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x49,0x40,0x0f]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1304,7 +1304,7 @@ define <8 x i64> @test_mask_mullo_epi64_rmk_512(<8 x i64> %a, <8 x i64>* %ptr_b,
 define <8 x i64> @test_mask_mullo_epi64_rmkz_512(<8 x i64> %a, <8 x i64>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi64_rmkz_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vpmullq (%rdi), %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xc9,0x40,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i64>, <8 x i64>* %ptr_b
@@ -1327,7 +1327,7 @@ define <8 x i64> @test_mask_mullo_epi64_rmb_512(<8 x i64> %a, i64* %ptr_b) {
 define <8 x i64> @test_mask_mullo_epi64_rmbk_512(<8 x i64> %a, i64* %ptr_b, <8 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi64_rmbk_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vpmullq (%rdi){1to8}, %zmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x59,0x40,0x0f]
 ; CHECK-NEXT:    vmovdqa64 %zmm1, %zmm0 ## encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1341,7 +1341,7 @@ define <8 x i64> @test_mask_mullo_epi64_rmbk_512(<8 x i64> %a, i64* %ptr_b, <8 x
 define <8 x i64> @test_mask_mullo_epi64_rmbkz_512(<8 x i64> %a, i64* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi64_rmbkz_512:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vpmullq (%rdi){1to8}, %zmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xd9,0x40,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
@@ -1364,7 +1364,7 @@ define <4 x i64> @test_mask_mullo_epi64_rr_256(<4 x i64> %a, <4 x i64> %b) {
 define <4 x i64> @test_mask_mullo_epi64_rrk_256(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi64_rrk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vpmullq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x40,0xd1]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1375,7 +1375,7 @@ define <4 x i64> @test_mask_mullo_epi64_rrk_256(<4 x i64> %a, <4 x i64> %b, <4 x
 define <4 x i64> @test_mask_mullo_epi64_rrkz_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi64_rrkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vpmullq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x40,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.mask.pmull.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 %mask)
@@ -1395,7 +1395,7 @@ define <4 x i64> @test_mask_mullo_epi64_rm_256(<4 x i64> %a, <4 x i64>* %ptr_b)
 define <4 x i64> @test_mask_mullo_epi64_rmk_256(<4 x i64> %a, <4 x i64>* %ptr_b, <4 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi64_rmk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vpmullq (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x40,0x0f]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1407,7 +1407,7 @@ define <4 x i64> @test_mask_mullo_epi64_rmk_256(<4 x i64> %a, <4 x i64>* %ptr_b,
 define <4 x i64> @test_mask_mullo_epi64_rmkz_256(<4 x i64> %a, <4 x i64>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi64_rmkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vpmullq (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x40,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <4 x i64>, <4 x i64>* %ptr_b
@@ -1430,7 +1430,7 @@ define <4 x i64> @test_mask_mullo_epi64_rmb_256(<4 x i64> %a, i64* %ptr_b) {
 define <4 x i64> @test_mask_mullo_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi64_rmbk_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vpmullq (%rdi){1to4}, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x39,0x40,0x0f]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1444,7 +1444,7 @@ define <4 x i64> @test_mask_mullo_epi64_rmbk_256(<4 x i64> %a, i64* %ptr_b, <4 x
 define <4 x i64> @test_mask_mullo_epi64_rmbkz_256(<4 x i64> %a, i64* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi64_rmbkz_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vpmullq (%rdi){1to4}, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xb9,0x40,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
@@ -1468,7 +1468,7 @@ define <2 x i64> @test_mask_mullo_epi64_rr_128(<2 x i64> %a, <2 x i64> %b) {
 define <2 x i64> @test_mask_mullo_epi64_rrk_128(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi64_rrk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vpmullq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x40,0xd1]
 ; CHECK-NEXT:    vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1479,7 +1479,7 @@ define <2 x i64> @test_mask_mullo_epi64_rrk_128(<2 x i64> %a, <2 x i64> %b, <2 x
 define <2 x i64> @test_mask_mullo_epi64_rrkz_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi64_rrkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vpmullq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x40,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.mask.pmull.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 %mask)
@@ -1499,7 +1499,7 @@ define <2 x i64> @test_mask_mullo_epi64_rm_128(<2 x i64> %a, <2 x i64>* %ptr_b)
 define <2 x i64> @test_mask_mullo_epi64_rmk_128(<2 x i64> %a, <2 x i64>* %ptr_b, <2 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi64_rmk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vpmullq (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x40,0x0f]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1511,7 +1511,7 @@ define <2 x i64> @test_mask_mullo_epi64_rmk_128(<2 x i64> %a, <2 x i64>* %ptr_b,
 define <2 x i64> @test_mask_mullo_epi64_rmkz_128(<2 x i64> %a, <2 x i64>* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi64_rmkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vpmullq (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x40,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <2 x i64>, <2 x i64>* %ptr_b
@@ -1534,7 +1534,7 @@ define <2 x i64> @test_mask_mullo_epi64_rmb_128(<2 x i64> %a, i64* %ptr_b) {
 define <2 x i64> @test_mask_mullo_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 x i64> %passThru, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi64_rmbk_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vpmullq (%rdi){1to2}, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x19,0x40,0x0f]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -1548,7 +1548,7 @@ define <2 x i64> @test_mask_mullo_epi64_rmbk_128(<2 x i64> %a, i64* %ptr_b, <2 x
 define <2 x i64> @test_mask_mullo_epi64_rmbkz_128(<2 x i64> %a, i64* %ptr_b, i8 %mask) {
 ; CHECK-LABEL: test_mask_mullo_epi64_rmbkz_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vpmullq (%rdi){1to2}, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x99,0x40,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %q = load i64, i64* %ptr_b
@@ -1566,7 +1566,7 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0,
 ; CHECK-LABEL: test_int_x86_avx512_mask_vextractf64x2_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01]
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01]
 ; CHECK-NEXT:    vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01]
 ; CHECK-NEXT:    vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
@@ -1586,7 +1586,7 @@ define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <
 ; CHECK-LABEL: test_int_x86_avx512_mask_insertf64x2_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01]
 ; CHECK-NEXT:    vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01]
 ; CHECK-NEXT:    vaddpd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xcb]
@@ -1606,7 +1606,7 @@ define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i6
 ; CHECK-LABEL: test_int_x86_avx512_mask_inserti64x2_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01]
 ; CHECK-NEXT:    vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01]
 ; CHECK-NEXT:    vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
@@ -1619,3 +1619,51 @@ define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i6
   %res4 = add <4 x i64> %res3, %res2
   ret <4 x i64> %res4
 }
+
+declare <4 x i32> @llvm.x86.avx512.cvtmask2d.128(i8)
+
+define <4 x i32>@test_int_x86_avx512_cvtmask2d_128(i8 %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtmask2d_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7]
+; CHECK-NEXT:    vpmovm2d %k0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x i32> @llvm.x86.avx512.cvtmask2d.128(i8 %x0)
+  ret <4 x i32> %res
+}
+
+declare <8 x i32> @llvm.x86.avx512.cvtmask2d.256(i8)
+
+define <8 x i32>@test_int_x86_avx512_cvtmask2d_256(i8 %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtmask2d_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7]
+; CHECK-NEXT:    vpmovm2d %k0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x38,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x i32> @llvm.x86.avx512.cvtmask2d.256(i8 %x0)
+  ret <8 x i32> %res
+}
+
+declare <2 x i64> @llvm.x86.avx512.cvtmask2q.128(i8)
+
+define <2 x i64>@test_int_x86_avx512_cvtmask2q_128(i8 %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtmask2q_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7]
+; CHECK-NEXT:    vpmovm2q %k0, %xmm0 ## encoding: [0x62,0xf2,0xfe,0x08,0x38,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x i64> @llvm.x86.avx512.cvtmask2q.128(i8 %x0)
+  ret <2 x i64> %res
+}
+
+declare <4 x i64> @llvm.x86.avx512.cvtmask2q.256(i8)
+
+define <4 x i64>@test_int_x86_avx512_cvtmask2q_256(i8 %x0) {
+; CHECK-LABEL: test_int_x86_avx512_cvtmask2q_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k0 ## encoding: [0xc5,0xf8,0x92,0xc7]
+; CHECK-NEXT:    vpmovm2q %k0, %ymm0 ## encoding: [0x62,0xf2,0xfe,0x28,0x38,0xc0]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x i64> @llvm.x86.avx512.cvtmask2q.256(i8 %x0)
+  ret <4 x i64> %res
+}
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
index 0bf7f7bf3d35ec63d300faa44425cb3719d9865f..ad9ea93c20311d53ea7b04ffea812f4b7fe599d0 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
@@ -6,7 +6,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvtpd2qq.128(<2 x double>, <2 x i64>, i8
 define <2 x i64>@test_int_x86_avx512_mask_cvt_pd2qq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvtpd2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x7b,0xc8]
 ; CHECK-NEXT:    vcvtpd2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x7b,0xc0]
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
@@ -22,7 +22,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvtpd2qq.256(<4 x double>, <4 x i64>, i8
 define <4 x i64>@test_int_x86_avx512_mask_cvt_pd2qq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2qq_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvtpd2qq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x7b,0xc8]
 ; CHECK-NEXT:    vcvtpd2qq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x7b,0xc0]
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
@@ -38,7 +38,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.128(<2 x double>, <2 x i64>, i
 define <2 x i64>@test_int_x86_avx512_mask_cvt_pd2uqq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvtpd2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x79,0xc8]
 ; CHECK-NEXT:    vcvtpd2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x79,0xc0]
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
@@ -54,7 +54,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvtpd2uqq.256(<4 x double>, <4 x i64>, i
 define <4 x i64>@test_int_x86_avx512_mask_cvt_pd2uqq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_pd2uqq_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvtpd2uqq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x79,0xc8]
 ; CHECK-NEXT:    vcvtpd2uqq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x79,0xc0]
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
@@ -70,7 +70,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvtps2qq.128(<4 x float>, <2 x i64>, i8)
 define <2 x i64>@test_int_x86_avx512_mask_cvt_ps2qq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvtps2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x7b,0xc8]
 ; CHECK-NEXT:    vcvtps2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x7b,0xc0]
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
@@ -86,7 +86,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvtps2qq.256(<4 x float>, <4 x i64>, i8)
 define <4 x i64>@test_int_x86_avx512_mask_cvt_ps2qq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2qq_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvtps2qq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x7b,0xc8]
 ; CHECK-NEXT:    vcvtps2qq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x7b,0xc0]
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
@@ -102,7 +102,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvtps2uqq.128(<4 x float>, <2 x i64>, i8
 define <2 x i64>@test_int_x86_avx512_mask_cvt_ps2uqq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvtps2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x79,0xc8]
 ; CHECK-NEXT:    vcvtps2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x79,0xc0]
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
@@ -118,7 +118,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvtps2uqq.256(<4 x float>, <4 x i64>, i8
 define <4 x i64>@test_int_x86_avx512_mask_cvt_ps2uqq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_ps2uqq_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvtps2uqq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x79,0xc8]
 ; CHECK-NEXT:    vcvtps2uqq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x79,0xc0]
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
@@ -134,7 +134,7 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtqq2pd.128(<2 x i64>, <2 x double>,
 define <2 x double>@test_int_x86_avx512_mask_cvt_qq2pd_128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvtqq2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0xe6,0xc8]
 ; CHECK-NEXT:    vcvtqq2pd %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0xe6,0xc0]
 ; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
@@ -150,7 +150,7 @@ declare <4 x double> @llvm.x86.avx512.mask.cvtqq2pd.256(<4 x i64>, <4 x double>,
 define <4 x double>@test_int_x86_avx512_mask_cvt_qq2pd_256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvtqq2pd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0xe6,0xc8]
 ; CHECK-NEXT:    vcvtqq2pd %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfe,0x28,0xe6,0xc0]
 ; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
@@ -166,7 +166,7 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.128(<2 x i64>, <4 x float>, i
 define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvtqq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x5b,0xc8]
 ; CHECK-NEXT:    vcvtqq2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x08,0x5b,0xc0]
 ; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0]
@@ -180,7 +180,7 @@ define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_128(<2 x i64> %x0, <4 x fl
 define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_128_zext(<2 x i64> %x0, <4 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_128_zext:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvtqq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x09,0x5b,0xc8]
 ; CHECK-NEXT:    vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9]
 ; CHECK-NEXT:    ## xmm1 = xmm1[0],zero
@@ -200,7 +200,7 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtqq2ps.256(<4 x i64>, <4 x float>, i
 define <4 x float>@test_int_x86_avx512_mask_cvt_qq2ps_256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_qq2ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvtqq2ps %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfc,0x29,0x5b,0xc8]
 ; CHECK-NEXT:    vcvtqq2ps %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xfc,0x28,0x5b,0xc0]
 ; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0]
@@ -216,7 +216,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttpd2qq.128(<2 x double>, <2 x i64>, i
 define <2 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvttpd2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x7a,0xc8]
 ; CHECK-NEXT:    vcvttpd2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x7a,0xc0]
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
@@ -232,7 +232,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttpd2qq.256(<4 x double>, <4 x i64>, i
 define <4 x i64>@test_int_x86_avx512_mask_cvtt_pd2qq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2qq_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvttpd2qq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x7a,0xc8]
 ; CHECK-NEXT:    vcvttpd2qq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x7a,0xc0]
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
@@ -248,7 +248,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.128(<2 x double>, <2 x i64>,
 define <2 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_128(<2 x double> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvttpd2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0x78,0xc8]
 ; CHECK-NEXT:    vcvttpd2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x78,0xc0]
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
@@ -264,7 +264,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttpd2uqq.256(<4 x double>, <4 x i64>,
 define <4 x i64>@test_int_x86_avx512_mask_cvtt_pd2uqq_256(<4 x double> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_pd2uqq_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvttpd2uqq %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0x78,0xc8]
 ; CHECK-NEXT:    vcvttpd2uqq %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x78,0xc0]
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
@@ -280,7 +280,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttps2qq.128(<4 x float>, <2 x i64>, i8
 define <2 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvttps2qq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x7a,0xc8]
 ; CHECK-NEXT:    vcvttps2qq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x7a,0xc0]
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
@@ -296,7 +296,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttps2qq.256(<4 x float>, <4 x i64>, i8
 define <4 x i64>@test_int_x86_avx512_mask_cvtt_ps2qq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2qq_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvttps2qq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x7a,0xc8]
 ; CHECK-NEXT:    vcvttps2qq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x7a,0xc0]
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
@@ -312,7 +312,7 @@ declare <2 x double> @llvm.x86.avx512.mask.cvtuqq2pd.128(<2 x i64>, <2 x double>
 define <2 x double>@test_int_x86_avx512_mask_cvt_uqq2pd_128(<2 x i64> %x0, <2 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvtuqq2pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x09,0x7a,0xc8]
 ; CHECK-NEXT:    vcvtuqq2pd %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfe,0x08,0x7a,0xc0]
 ; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
@@ -328,7 +328,7 @@ declare <4 x double> @llvm.x86.avx512.mask.cvtuqq2pd.256(<4 x i64>, <4 x double>
 define <4 x double>@test_int_x86_avx512_mask_cvt_uqq2pd_256(<4 x i64> %x0, <4 x double> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvtuqq2pd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xfe,0x29,0x7a,0xc8]
 ; CHECK-NEXT:    vcvtuqq2pd %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfe,0x28,0x7a,0xc0]
 ; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
@@ -344,7 +344,7 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.128(<2 x i64>, <4 x float>,
 define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_128(<2 x i64> %x0, <4 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvtuqq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8]
 ; CHECK-NEXT:    vcvtuqq2ps %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x08,0x7a,0xc0]
 ; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0]
@@ -358,7 +358,7 @@ define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_128(<2 x i64> %x0, <4 x f
 define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_128_zext(<2 x i64> %x0, <4 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_128_zext:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvtuqq2ps %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x7a,0xc8]
 ; CHECK-NEXT:    vmovq %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc9]
 ; CHECK-NEXT:    ## xmm1 = xmm1[0],zero
@@ -378,7 +378,7 @@ declare <4 x float> @llvm.x86.avx512.mask.cvtuqq2ps.256(<4 x i64>, <4 x float>,
 define <4 x float>@test_int_x86_avx512_mask_cvt_uqq2ps_256(<4 x i64> %x0, <4 x float> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvt_uqq2ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvtuqq2ps %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x7a,0xc8]
 ; CHECK-NEXT:    vcvtuqq2ps %ymm0, %xmm0 ## encoding: [0x62,0xf1,0xff,0x28,0x7a,0xc0]
 ; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0]
@@ -394,7 +394,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.cvttps2uqq.128(<4 x float>, <2 x i64>, i
 define <2 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_128(<4 x float> %x0, <2 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvttps2uqq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0x78,0xc8]
 ; CHECK-NEXT:    vcvttps2uqq %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x78,0xc0]
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
@@ -410,7 +410,7 @@ declare <4 x i64> @llvm.x86.avx512.mask.cvttps2uqq.256(<4 x float>, <4 x i64>, i
 define <4 x i64>@test_int_x86_avx512_mask_cvtt_ps2uqq_256(<4 x float> %x0, <4 x i64> %x1, i8 %x2) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_cvtt_ps2uqq_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vcvttps2uqq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x78,0xc8]
 ; CHECK-NEXT:    vcvttps2uqq %xmm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x78,0xc0]
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
@@ -426,7 +426,7 @@ declare <2 x double> @llvm.x86.avx512.mask.reduce.pd.128(<2 x double>, i32, <2 x
 define <2 x double>@test_int_x86_avx512_mask_reduce_pd_128(<2 x double> %x0, <2 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_reduce_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vreducepd $4, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x56,0xc8,0x04]
 ; CHECK-NEXT:    vreducepd $8, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x56,0xc0,0x08]
 ; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
@@ -442,7 +442,7 @@ declare <4 x double> @llvm.x86.avx512.mask.reduce.pd.256(<4 x double>, i32, <4 x
 define <4 x double>@test_int_x86_avx512_mask_reduce_pd_256(<4 x double> %x0, <4 x double> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_reduce_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vreducepd $4, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x56,0xc8,0x04]
 ; CHECK-NEXT:    vreducepd $0, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x56,0xc0,0x00]
 ; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
@@ -458,7 +458,7 @@ declare <4 x float> @llvm.x86.avx512.mask.reduce.ps.128(<4 x float>, i32, <4 x f
 define <4 x float>@test_int_x86_avx512_mask_reduce_ps_128(<4 x float> %x0, <4 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vreduceps $4, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x56,0xc8,0x04]
 ; CHECK-NEXT:    vreduceps $88, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x56,0xc0,0x58]
 ; CHECK-NEXT:    vaddps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc0]
@@ -474,7 +474,7 @@ declare <8 x float> @llvm.x86.avx512.mask.reduce.ps.256(<8 x float>, i32, <8 x f
 define <8 x float>@test_int_x86_avx512_mask_reduce_ps_256(<8 x float> %x0, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_reduce_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vreduceps $11, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x56,0xc8,0x0b]
 ; CHECK-NEXT:    vreduceps $11, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x56,0xc0,0x0b]
 ; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0]
@@ -490,7 +490,7 @@ declare <2 x double> @llvm.x86.avx512.mask.range.pd.128(<2 x double>, <2 x doubl
 define <2 x double>@test_int_x86_avx512_mask_range_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_range_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vrangepd $4, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x50,0xd1,0x04]
 ; CHECK-NEXT:    vrangepd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x50,0xc1,0x08]
 ; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc0]
@@ -506,7 +506,7 @@ declare <4 x double> @llvm.x86.avx512.mask.range.pd.256(<4 x double>, <4 x doubl
 define <4 x double>@test_int_x86_avx512_mask_range_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_range_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vrangepd $4, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x50,0xd1,0x04]
 ; CHECK-NEXT:    vrangepd $88, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x50,0xc1,0x58]
 ; CHECK-NEXT:    vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0]
@@ -522,7 +522,7 @@ declare <4 x float> @llvm.x86.avx512.mask.range.ps.128(<4 x float>, <4 x float>,
 define <4 x float>@test_int_x86_avx512_mask_range_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_range_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vrangeps $4, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x50,0xd1,0x04]
 ; CHECK-NEXT:    vrangeps $88, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x50,0xc1,0x58]
 ; CHECK-NEXT:    vaddps %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc0]
@@ -538,7 +538,7 @@ declare <8 x float> @llvm.x86.avx512.mask.range.ps.256(<8 x float>, <8 x float>,
 define <8 x float>@test_int_x86_avx512_mask_range_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x3, i8 %x4) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_range_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vrangeps $4, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x50,0xd1,0x04]
 ; CHECK-NEXT:    vrangeps $88, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x50,0xc1,0x58]
 ; CHECK-NEXT:    vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0]
@@ -554,12 +554,13 @@ declare i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float>, i32, i8)
 define i8 @test_int_x86_avx512_mask_fpclass_ps_128(<4 x float> %x0, i8 %x1) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ps_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vfpclassps $2, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x66,0xc0,0x02]
-; CHECK-NEXT:    kmovb %k0, %ecx ## encoding: [0xc5,0xf9,0x93,0xc8]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
 ; CHECK-NEXT:    vfpclassps $4, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x66,0xc0,0x04]
-; CHECK-NEXT:    kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float> %x0, i32 2, i8 %x1)
   %res1 = call i8 @llvm.x86.avx512.mask.fpclass.ps.128(<4 x float> %x0, i32 4, i8 -1)
@@ -572,12 +573,13 @@ declare i8 @llvm.x86.avx512.mask.fpclass.ps.256(<8 x float>, i32, i8)
 define i8 @test_int_x86_avx512_mask_fpclass_ps_256(<8 x float> %x0, i8 %x1) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_ps_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vfpclassps $2, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x66,0xc0,0x02]
-; CHECK-NEXT:    kmovb %k0, %ecx ## encoding: [0xc5,0xf9,0x93,0xc8]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
 ; CHECK-NEXT:    vfpclassps $4, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x66,0xc0,0x04]
-; CHECK-NEXT:    kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.fpclass.ps.256(<8 x float> %x0, i32 2, i8 %x1)
   %res1 = call i8 @llvm.x86.avx512.mask.fpclass.ps.256(<8 x float> %x0, i32 4, i8 -1)
@@ -590,12 +592,13 @@ declare i8 @llvm.x86.avx512.mask.fpclass.pd.128(<2 x double>, i32, i8)
 define i8 @test_int_x86_avx512_mask_fpclass_pd_128(<2 x double> %x0, i8 %x1) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_pd_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vfpclasspd $4, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x66,0xc0,0x04]
-; CHECK-NEXT:    kmovb %k0, %ecx ## encoding: [0xc5,0xf9,0x93,0xc8]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
 ; CHECK-NEXT:    vfpclasspd $2, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x66,0xc0,0x02]
-; CHECK-NEXT:    kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res =  call i8 @llvm.x86.avx512.mask.fpclass.pd.128(<2 x double> %x0, i32 4, i8 %x1)
   %res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.128(<2 x double> %x0, i32 2, i8 -1)
@@ -608,12 +611,13 @@ declare i8 @llvm.x86.avx512.mask.fpclass.pd.256(<4 x double>, i32, i8)
 define i8 @test_int_x86_avx512_mask_fpclass_pd_256(<4 x double> %x0, i8 %x1) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_fpclass_pd_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vfpclasspd $2, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x66,0xc0,0x02]
-; CHECK-NEXT:    kmovb %k0, %ecx ## encoding: [0xc5,0xf9,0x93,0xc8]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
 ; CHECK-NEXT:    vfpclasspd $4, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x66,0xc0,0x04]
-; CHECK-NEXT:    kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.fpclass.pd.256(<4 x double> %x0, i32 2, i8 %x1)
   %res1 = call i8 @llvm.x86.avx512.mask.fpclass.pd.256(<4 x double> %x0, i32 4, i8 -1)
@@ -626,13 +630,12 @@ declare <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float>, <8 x f
 define <8 x float>@test_int_x86_avx512_mask_broadcastf32x2_256(<4 x float> %x0, <8 x float> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf32x2_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vbroadcastf32x2 %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x19,0xc8]
 ; CHECK-NEXT:    ## ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x19,0xd0]
 ; CHECK-NEXT:    ## ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
-; CHECK-NEXT:    vbroadcastf32x2 %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x19,0xc0]
-; CHECK-NEXT:    ## ymm0 = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT:    vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0]
 ; CHECK-NEXT:    vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
 ; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -649,13 +652,12 @@ declare <8 x i32> @llvm.x86.avx512.mask.broadcasti32x2.256(<4 x i32>, <8 x i32>,
 define <8 x i32>@test_int_x86_avx512_mask_broadcasti32x2_256(<4 x i32> %x0, <8 x i32> %x2, i8 %x3, i64 * %y_ptr) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x2_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vbroadcasti32x2 (%rsi), %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x59,0x0e]
 ; CHECK-NEXT:    ## ymm1 {%k1} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    vbroadcasti32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x59,0xd0]
 ; CHECK-NEXT:    ## ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
-; CHECK-NEXT:    vbroadcasti32x2 %xmm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x59,0xc0]
-; CHECK-NEXT:    ## ymm0 = xmm0[0,1,0,1,0,1,0,1]
+; CHECK-NEXT:    vpbroadcastq %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xc0]
 ; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
 ; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -675,10 +677,10 @@ declare <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32>, <4 x i32>,
 define <4 x i32>@test_int_x86_avx512_mask_broadcasti32x2_128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti32x2_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vbroadcasti32x2 %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x59,0xc8]
 ; CHECK-NEXT:    vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x59,0xd0]
-; CHECK-NEXT:    vbroadcasti32x2 %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x59,0xc0]
+; CHECK-NEXT:    vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0]
 ; CHECK-NEXT:    vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
@@ -696,7 +698,8 @@ define i8@test_int_x86_avx512_cvtd2mask_128(<4 x i32> %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpmovd2m %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x39,0xc0]
-; CHECK-NEXT:    kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
     %res = call i8 @llvm.x86.avx512.cvtd2mask.128(<4 x i32> %x0)
     ret i8 %res
@@ -708,7 +711,8 @@ define i8@test_int_x86_avx512_cvtd2mask_256(<8 x i32> %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_cvtd2mask_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpmovd2m %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x39,0xc0]
-; CHECK-NEXT:    kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
     %res = call i8 @llvm.x86.avx512.cvtd2mask.256(<8 x i32> %x0)
     ret i8 %res
@@ -720,7 +724,8 @@ define i8@test_int_x86_avx512_cvtq2mask_128(<2 x i64> %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpmovq2m %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x39,0xc0]
-; CHECK-NEXT:    kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
     %res = call i8 @llvm.x86.avx512.cvtq2mask.128(<2 x i64> %x0)
     ret i8 %res
@@ -732,67 +737,20 @@ define i8@test_int_x86_avx512_cvtq2mask_256(<4 x i64> %x0) {
 ; CHECK-LABEL: test_int_x86_avx512_cvtq2mask_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpmovq2m %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x39,0xc0]
-; CHECK-NEXT:    kmovb %k0, %eax ## encoding: [0xc5,0xf9,0x93,0xc0]
+; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
     %res = call i8 @llvm.x86.avx512.cvtq2mask.256(<4 x i64> %x0)
     ret i8 %res
 }
 
-declare <4 x i32> @llvm.x86.avx512.cvtmask2d.128(i8)
-
-define <4 x i32>@test_int_x86_avx512_cvtmask2d_128(i8 %x0) {
-; CHECK-LABEL: test_int_x86_avx512_cvtmask2d_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k0 ## encoding: [0xc5,0xf9,0x92,0xc7]
-; CHECK-NEXT:    vpmovm2d %k0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x38,0xc0]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x i32> @llvm.x86.avx512.cvtmask2d.128(i8 %x0)
-  ret <4 x i32> %res
-}
-
-declare <8 x i32> @llvm.x86.avx512.cvtmask2d.256(i8)
-
-define <8 x i32>@test_int_x86_avx512_cvtmask2d_256(i8 %x0) {
-; CHECK-LABEL: test_int_x86_avx512_cvtmask2d_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k0 ## encoding: [0xc5,0xf9,0x92,0xc7]
-; CHECK-NEXT:    vpmovm2d %k0, %ymm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x38,0xc0]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x i32> @llvm.x86.avx512.cvtmask2d.256(i8 %x0)
-  ret <8 x i32> %res
-}
-
-declare <2 x i64> @llvm.x86.avx512.cvtmask2q.128(i8)
-
-define <2 x i64>@test_int_x86_avx512_cvtmask2q_128(i8 %x0) {
-; CHECK-LABEL: test_int_x86_avx512_cvtmask2q_128:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k0 ## encoding: [0xc5,0xf9,0x92,0xc7]
-; CHECK-NEXT:    vpmovm2q %k0, %xmm0 ## encoding: [0x62,0xf2,0xfe,0x08,0x38,0xc0]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <2 x i64> @llvm.x86.avx512.cvtmask2q.128(i8 %x0)
-  ret <2 x i64> %res
-}
-
-declare <4 x i64> @llvm.x86.avx512.cvtmask2q.256(i8)
-
-define <4 x i64>@test_int_x86_avx512_cvtmask2q_256(i8 %x0) {
-; CHECK-LABEL: test_int_x86_avx512_cvtmask2q_256:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k0 ## encoding: [0xc5,0xf9,0x92,0xc7]
-; CHECK-NEXT:    vpmovm2q %k0, %ymm0 ## encoding: [0x62,0xf2,0xfe,0x28,0x38,0xc0]
-; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x i64> @llvm.x86.avx512.cvtmask2q.256(i8 %x0)
-  ret <4 x i64> %res
-}
-
 declare <4 x double> @llvm.x86.avx512.mask.broadcastf64x2.256(<2 x double>, <4 x double>, i8)
 
 define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256(<2 x double> %x0, <4 x double> %x2, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vshuff64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x23,0xd0,0x00]
 ; CHECK-NEXT:    ## ymm2 {%k1} {z} = ymm0[0,1,0,1]
 ; CHECK-NEXT:    vshuff64x2 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x23,0xc8,0x00]
@@ -814,7 +772,7 @@ define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256(<2 x double> %x0
 define <4 x double>@test_int_x86_avx512_mask_broadcastf64x2_256_load(<2 x double>* %x0ptr, <4 x double> %x2, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcastf64x2_256_load:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vmovapd (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x0f]
 ; CHECK-NEXT:    vshuff64x2 $0, %ymm1, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x23,0xc1,0x00]
 ; CHECK-NEXT:    ## ymm0 {%k1} = ymm1[0,1,0,1]
@@ -831,7 +789,7 @@ define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256(<2 x i64> %x0, <4 x
 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    ## kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; CHECK-NEXT:    kmovb %edi, %k1 ## encoding: [0xc5,0xf9,0x92,0xcf]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vshufi64x2 $0, %ymm0, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x43,0xd0,0x00]
 ; CHECK-NEXT:    ## ymm2 {%k1} {z} = ymm0[0,1,0,1]
 ; CHECK-NEXT:    vshufi64x2 $0, %ymm0, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x43,0xc8,0x00]
@@ -853,7 +811,7 @@ define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256(<2 x i64> %x0, <4 x
 define <4 x i64>@test_int_x86_avx512_mask_broadcasti64x2_256_load(<2 x i64>* %x0ptr, <4 x i64> %x2, i8 %mask) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_broadcasti64x2_256_load:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1 ## encoding: [0xc5,0xf9,0x92,0xce]
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT:    vmovdqa (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0x0f]
 ; CHECK-NEXT:    vshufi64x2 $0, %ymm1, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x43,0xc1,0x00]
 ; CHECK-NEXT:    ## ymm0 {%k1} = ymm1[0,1,0,1]
diff --git a/test/CodeGen/X86/avx512er-intrinsics.ll b/test/CodeGen/X86/avx512er-intrinsics.ll
index 827a56d76ae1f20af76778ee9213f16ec0c6604b..ca130bd2b6762740386943cc3881636168b3cdee 100644
--- a/test/CodeGen/X86/avx512er-intrinsics.ll
+++ b/test/CodeGen/X86/avx512er-intrinsics.ll
@@ -1,34 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=knl --show-mc-encoding| FileCheck %s
 
 define <16 x float> @test_rsqrt28_ps(<16 x float> %a0) {
-  ; CHECK: vrsqrt28ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
+; CHECK-LABEL: test_rsqrt28_ps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vrsqrt28ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
+; CHECK-NEXT:    retq # encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
   ret <16 x float> %res
 }
 
 define <16 x float> @test1_rsqrt28_ps(<16 x float> %a0, <16 x float> %a1) {
-  ; CHECK: kmovw
-  ; CHECK: vrsqrt28ps {sae}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcc,0xc8]
+; CHECK-LABEL: test1_rsqrt28_ps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00]
+; CHECK-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; CHECK-NEXT:    vrsqrt28ps {sae}, %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcc,0xc8]
+; CHECK-NEXT:    vmovaps %zmm1, %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x28,0xc1]
+; CHECK-NEXT:    retq # encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> %a1, i16 6, i32 8)
   ret <16 x float> %res
 }
 
 define <16 x float> @test2_rsqrt28_ps(<16 x float> %a0) {
-  ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
+; CHECK-LABEL: test2_rsqrt28_ps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00]
+; CHECK-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; CHECK-NEXT:    vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
+; CHECK-NEXT:    retq # encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 4)
   ret <16 x float> %res
 }
 
 define <16 x float> @test3_rsqrt28_ps(<16 x float> %a0) {
-  ; CHECK: kmovw
-  ; CHECK: vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
+; CHECK-LABEL: test3_rsqrt28_ps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00]
+; CHECK-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; CHECK-NEXT:    vrsqrt28ps %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0xc9,0xcc,0xc0]
+; CHECK-NEXT:    retq # encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 6, i32 4)
   ret <16 x float> %res
 }
 
 define <16 x float> @test4_rsqrt28_ps(<16 x float> %a0) {
-  ; CHECK: vrsqrt28ps {sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcc,0xc0]
-  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 8) 
+; CHECK-LABEL: test4_rsqrt28_ps:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movw $6, %ax # encoding: [0x66,0xb8,0x06,0x00]
+; CHECK-NEXT:    kmovw %eax, %k1 # encoding: [0xc5,0xf8,0x92,0xc8]
+; CHECK-NEXT:    vrsqrt28ps {sae}, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcc,0xc0]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> undef, i16 6, i32 8)
   ret <16 x float> %res
 }
 
@@ -36,77 +59,133 @@ define <16 x float> @test4_rsqrt28_ps(<16 x float> %a0) {
 declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
-  ; CHECK: vrcp28ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
+; CHECK-LABEL: test_rcp28_ps_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vrcp28ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
+; CHECK-NEXT:    retq # encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
   ret <16 x float> %res
 }
 declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
-  ; CHECK: vrcp28pd {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
-  %res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) 
+; CHECK-LABEL: test_rcp28_pd_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vrcp28pd {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
   ret <8 x double> %res
 }
 declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
 
 define <16 x float> @test_exp2_ps_512(<16 x float> %a0) {
-  ; CHECK: vexp2ps {sae}, %zmm0, %zmm0     # encoding: [0x62,0xf2,0x7d,0x18,0xc8,0xc0]
+; CHECK-LABEL: test_exp2_ps_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vexp2ps {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xc8,0xc0]
+; CHECK-NEXT:    retq # encoding: [0xc3]
   %res = call <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8)
   ret <16 x float> %res
 }
 declare <16 x float> @llvm.x86.avx512.exp2.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
 
 define <8 x double> @test_exp2_pd_512(<8 x double> %a0) {
-  ; CHECK: vexp2pd {sae}, %zmm0, %zmm0      # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0]
+; CHECK-LABEL: test_exp2_pd_512:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vexp2pd {sae}, %zmm0, %zmm0 # encoding: [0x62,0xf2,0xfd,0x18,0xc8,0xc0]
+; CHECK-NEXT:    retq # encoding: [0xc3]
   %res = call <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8)
   ret <8 x double> %res
 }
 declare <8 x double> @llvm.x86.avx512.exp2.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
 
 define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
-  ; CHECK: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
+; CHECK-LABEL: test_rsqrt28_ss:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
+; CHECK-NEXT:    retq # encoding: [0xc3]
   %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
 declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
 
 define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
-  ; CHECK: vrcp28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
+; CHECK-LABEL: test_rcp28_ss:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vrcp28ss {sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
+; CHECK-NEXT:    retq # encoding: [0xc3]
   %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
 declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
 
 define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0) {
-  ; CHECK: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0]
-  %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 7, i32 8) ; 
+; CHECK-LABEL: test_rsqrt28_ss_maskz:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
+; CHECK-NEXT:    kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT:    vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 7, i32 8) ;
   ret <4 x float> %res
 }
 
 define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0) {
-  ; CHECK: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1]
+; CHECK-LABEL: test_rsqrt28_ss_mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
+; CHECK-NEXT:    kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT:    vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1]
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
+; CHECK-NEXT:    retq # encoding: [0xc3]
   %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 7, i32 8) ;
   ret <4 x float> %res
 }
 
 define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0) {
-  ; CHECK: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0]
-  %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 7, i32 8) ; 
+; CHECK-LABEL: test_rsqrt28_sd_maskz:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
+; CHECK-NEXT:    kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT:    vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 7, i32 8) ;
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_rsqrt28_sd_mask(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0) {
+; CHECK-LABEL: test_rsqrt28_sd_mask:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
+; CHECK-NEXT:    kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT:    vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0xcd,0xd1]
+; CHECK-NEXT:    vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 7, i32 8) ;
   ret <2 x double> %res
 }
 
 declare <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
 
 define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr ) {
-  ; CHECK: vrsqrt28sd (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x07]
+; CHECK-LABEL: test_rsqrt28_sd_maskz_mem:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
+; CHECK-NEXT:    kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT:    vrsqrt28sd (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x07]
+; CHECK-NEXT:    retq # encoding: [0xc3]
   %mem = load double , double * %ptr, align 8
   %mem_v = insertelement <2 x double> undef, double %mem, i32 0
-  %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 7, i32 4) ; 
+  %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 7, i32 4) ;
   ret <2 x double> %res
 }
 
 define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, double* %ptr ) {
-  ; CHECK: vrsqrt28sd 144(%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x47,0x12]
+; CHECK-LABEL: test_rsqrt28_sd_maskz_mem_offset:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
+; CHECK-NEXT:    kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT:    vrsqrt28sd 144(%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x47,0x12]
+; CHECK-NEXT:    retq # encoding: [0xc3]
   %ptr1 = getelementptr double, double* %ptr, i32 18
   %mem = load double , double * %ptr1, align 8
   %mem_v = insertelement <2 x double> undef, double %mem, i32 0
diff --git a/test/CodeGen/X86/avx512ifma-intrinsics.ll b/test/CodeGen/X86/avx512ifma-intrinsics.ll
index 727a0dce334efa94feb96deac5b29514af16fa4b..30ecc0d2e49e55ea4896c01582fd35b307540470 100644
--- a/test/CodeGen/X86/avx512ifma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512ifma-intrinsics.ll
@@ -1,20 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512ifma | FileCheck %s
 
 declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
 
 define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_512:
-; CHECK: kmovw %edi, %k1
-; CHECK: vmovaps %zmm0, %zmm3
-; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm3 {%k1}
-; CHECK: vmovaps %zmm0, %zmm4
-; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm4
-; CHECK: vpxord %zmm2, %zmm2, %zmm2
-; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
-; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
-; CHECK: vpaddq %zmm0, %zmm3, %zmm0
-; CHECK: vpaddq %zmm2, %zmm4, %zmm1
-; CHECK: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm4
+; CHECK-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1}
+; CHECK-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT:    vpaddq %zmm2, %zmm3, %zmm1
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 
   %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
@@ -30,17 +33,19 @@ declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64>, <8 x i64>,
 
 define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_512:
-; CHECK: kmovw %edi, %k1
-; CHECK: vmovaps %zmm0, %zmm3
-; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm3 {%k1} {z}
-; CHECK: vmovaps %zmm0, %zmm4
-; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm4
-; CHECK: vpxord %zmm2, %zmm2, %zmm2
-; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
-; CHECK: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
-; CHECK: vpaddq %zmm0, %zmm3, %zmm0
-; CHECK: vpaddq %zmm2, %zmm4, %zmm1
-; CHECK: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm4
+; CHECK-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z}
+; CHECK-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT:    vpaddq %zmm2, %zmm3, %zmm1
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 
   %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52h.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
@@ -56,17 +61,19 @@ declare <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64>, <8 x i64>, <
 
 define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_512:
-; CHECK: kmovw %edi, %k1
-; CHECK: vmovaps %zmm0, %zmm3
-; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm3 {%k1}
-; CHECK: vmovaps %zmm0, %zmm4
-; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm4
-; CHECK: vpxord %zmm2, %zmm2, %zmm2
-; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
-; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
-; CHECK: vpaddq %zmm0, %zmm3, %zmm0
-; CHECK: vpaddq %zmm2, %zmm4, %zmm1
-; CHECK: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm4
+; CHECK-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1}
+; CHECK-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
+; CHECK-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT:    vpaddq %zmm2, %zmm3, %zmm1
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 
   %res = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.mask.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
@@ -82,17 +89,19 @@ declare <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64>, <8 x i64>,
 
 define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_512:
-; CHECK: kmovw %edi, %k1
-; CHECK: vmovaps %zmm0, %zmm3
-; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm3 {%k1} {z}
-; CHECK: vmovaps %zmm0, %zmm4
-; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm4
-; CHECK: vpxord %zmm2, %zmm2, %zmm2
-; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z}
-; CHECK: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
-; CHECK: vpaddq %zmm0, %zmm3, %zmm0
-; CHECK: vpaddq %zmm2, %zmm4, %zmm1
-; CHECK: vpaddq %zmm0, %zmm1, %zmm0
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm3
+; CHECK-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm3
+; CHECK-NEXT:    vmovdqa64 %zmm0, %zmm4
+; CHECK-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z}
+; CHECK-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; CHECK-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z}
+; CHECK-NEXT:    vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
+; CHECK-NEXT:    vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT:    vpaddq %zmm2, %zmm3, %zmm1
+; CHECK-NEXT:    vpaddq %zmm0, %zmm1, %zmm0
+; CHECK-NEXT:    retq
 
   %res = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3)
   %res1 = call <8 x i64> @llvm.x86.avx512.maskz.vpmadd52l.uq.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> zeroinitializer, i8 %x3)
diff --git a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
index 8ba45aa381974686324de06914bf592bee3f0ee8..3ca686cef3bf48821e0c26324f806d87dda0484b 100644
--- a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512vl -mattr=+avx512ifma | FileCheck %s
 
@@ -7,15 +8,15 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovaps %xmm0, %xmm3
-; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm0, %xmm4
-; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm4
+; CHECK-NEXT:    vmovdqa %xmm0, %xmm3
+; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT:    vmovdqa %xmm0, %xmm4
+; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
-; CHECK-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
-; CHECK-NEXT:    vpaddq %xmm2, %xmm4, %xmm1
+; CHECK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT:    vpaddq %xmm2, %xmm3, %xmm1
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 
@@ -35,15 +36,15 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52h_uq_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovaps %ymm0, %ymm3
-; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT:    vmovaps %ymm0, %ymm4
-; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm4
+; CHECK-NEXT:    vmovdqa %ymm0, %ymm3
+; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    vmovdqa %ymm0, %ymm4
+; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1}
 ; CHECK-NEXT:    vpxor %ymm2, %ymm2, %ymm2
 ; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
-; CHECK-NEXT:    vpaddq %ymm0, %ymm3, %ymm0
-; CHECK-NEXT:    vpaddq %ymm2, %ymm4, %ymm1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT:    vpaddq %ymm2, %ymm3, %ymm1
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
 
@@ -63,15 +64,15 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovaps %xmm0, %xmm3
-; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm3 {%k1} {z}
-; CHECK-NEXT:    vmovaps %xmm0, %xmm4
-; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm4
+; CHECK-NEXT:    vmovdqa %xmm0, %xmm3
+; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT:    vmovdqa %xmm0, %xmm4
+; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z}
-; CHECK-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
-; CHECK-NEXT:    vpaddq %xmm2, %xmm4, %xmm1
+; CHECK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT:    vpaddq %xmm2, %xmm3, %xmm1
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 
@@ -91,15 +92,15 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52h_uq_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovaps %ymm0, %ymm3
-; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm3 {%k1} {z}
-; CHECK-NEXT:    vmovaps %ymm0, %ymm4
-; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm4
+; CHECK-NEXT:    vmovdqa %ymm0, %ymm3
+; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    vmovdqa %ymm0, %ymm4
+; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z}
 ; CHECK-NEXT:    vpxor %ymm2, %ymm2, %ymm2
 ; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z}
-; CHECK-NEXT:    vpaddq %ymm0, %ymm3, %ymm0
-; CHECK-NEXT:    vpaddq %ymm2, %ymm4, %ymm1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT:    vpaddq %ymm2, %ymm3, %ymm1
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
 
@@ -119,15 +120,15 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovaps %xmm0, %xmm3
-; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm3 {%k1}
-; CHECK-NEXT:    vmovaps %xmm0, %xmm4
-; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm4
+; CHECK-NEXT:    vmovdqa %xmm0, %xmm3
+; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT:    vmovdqa %xmm0, %xmm4
+; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1}
 ; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
-; CHECK-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
-; CHECK-NEXT:    vpaddq %xmm2, %xmm4, %xmm1
+; CHECK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT:    vpaddq %xmm2, %xmm3, %xmm1
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 
@@ -147,15 +148,15 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpmadd52l_uq_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovaps %ymm0, %ymm3
-; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm3 {%k1}
-; CHECK-NEXT:    vmovaps %ymm0, %ymm4
-; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm4
+; CHECK-NEXT:    vmovdqa %ymm0, %ymm3
+; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    vmovdqa %ymm0, %ymm4
+; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1}
 ; CHECK-NEXT:    vpxor %ymm2, %ymm2, %ymm2
 ; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
-; CHECK-NEXT:    vpaddq %ymm0, %ymm3, %ymm0
-; CHECK-NEXT:    vpaddq %ymm2, %ymm4, %ymm1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT:    vpaddq %ymm2, %ymm3, %ymm1
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
 
@@ -175,15 +176,15 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovaps %xmm0, %xmm3
-; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm3 {%k1} {z}
-; CHECK-NEXT:    vmovaps %xmm0, %xmm4
-; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm4
+; CHECK-NEXT:    vmovdqa %xmm0, %xmm3
+; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm3
+; CHECK-NEXT:    vmovdqa %xmm0, %xmm4
+; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z}
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z}
 ; CHECK-NEXT:    vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z}
-; CHECK-NEXT:    vpaddq %xmm0, %xmm3, %xmm0
-; CHECK-NEXT:    vpaddq %xmm2, %xmm4, %xmm1
+; CHECK-NEXT:    vpaddq %xmm0, %xmm4, %xmm0
+; CHECK-NEXT:    vpaddq %xmm2, %xmm3, %xmm1
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 
@@ -203,15 +204,15 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpmadd52l_uq_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vmovaps %ymm0, %ymm3
-; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm3 {%k1} {z}
-; CHECK-NEXT:    vmovaps %ymm0, %ymm4
-; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm4
+; CHECK-NEXT:    vmovdqa %ymm0, %ymm3
+; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm3
+; CHECK-NEXT:    vmovdqa %ymm0, %ymm4
+; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z}
 ; CHECK-NEXT:    vpxor %ymm2, %ymm2, %ymm2
 ; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z}
-; CHECK-NEXT:    vpaddq %ymm0, %ymm3, %ymm0
-; CHECK-NEXT:    vpaddq %ymm2, %ymm4, %ymm1
+; CHECK-NEXT:    vpaddq %ymm0, %ymm4, %ymm0
+; CHECK-NEXT:    vpaddq %ymm2, %ymm3, %ymm1
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
 
diff --git a/test/CodeGen/X86/avx512vbmivl-intrinsics.ll b/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
index adb419d58789df924c370a1b476a33b577744fac..22edbcc8e157ebacd8c02b9a3359f5a762062899 100644
--- a/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vbmivl-intrinsics.ll
@@ -6,11 +6,11 @@ declare <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8>, <16 x i8>, <16
 define <16 x i8>@test_int_x86_avx512_mask_permvar_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_qi_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpermb %xmm0, %xmm1, %xmm3 ## encoding: [0x62,0xf2,0x75,0x08,0x8d,0xd8]
 ; CHECK-NEXT:    vpermb %xmm0, %xmm1, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0x8d,0xd0]
-; CHECK-NEXT:    vpermb %xmm0, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0x8d,0xd8]
-; CHECK-NEXT:    vpermb %xmm0, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x08,0x8d,0xc0]
-; CHECK-NEXT:    vpaddb %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfc,0xc0]
+; CHECK-NEXT:    vpermb %xmm0, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0x8d,0xc0]
+; CHECK-NEXT:    vpaddb %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3]
 ; CHECK-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i8> @llvm.x86.avx512.mask.permvar.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
@@ -46,11 +46,11 @@ declare <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8>, <16 x i8>,
 define <16 x i8>@test_int_x86_avx512_mask_pmultishift_qb_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmultishift_qb_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x83,0xd9]
 ; CHECK-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x83,0xd1]
-; CHECK-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x83,0xd9]
-; CHECK-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x83,0xc1]
-; CHECK-NEXT:    vpaddb %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfc,0xc0]
+; CHECK-NEXT:    vpmultishiftqb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x83,0xc1]
+; CHECK-NEXT:    vpaddb %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc3]
 ; CHECK-NEXT:    vpaddb %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i8> @llvm.x86.avx512.mask.pmultishift.qb.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
@@ -86,14 +86,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8>, <16 x i8>,
 define <16 x i8>@test_int_x86_avx512_mask_vpermi2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermi2var_qi_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
-; CHECK-NEXT:    vpermi2b %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x75,0xda]
-; CHECK-NEXT:    vpermi2b %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x75,0xca]
+; CHECK-NEXT:    vpermi2b %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x75,0xda]
+; CHECK-NEXT:    vpermi2b %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x75,0xca]
 ; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
 ; CHECK-NEXT:    vpermi2b %xmm2, %xmm0, %xmm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x75,0xe2]
-; CHECK-NEXT:    vpaddb %xmm1, %xmm4, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc1]
-; CHECK-NEXT:    vpaddb %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfc,0xc0]
+; CHECK-NEXT:    vpaddb %xmm3, %xmm4, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3]
+; CHECK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
   %res1 = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> zeroinitializer, <16 x i8> %x2, i16 %x3)
@@ -130,14 +130,14 @@ declare <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8>, <16 x i8>,
 define <16 x i8>@test_int_x86_avx512_mask_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_mask_vpermt2var_qi_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
-; CHECK-NEXT:    vpermt2b %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7d,0xda]
-; CHECK-NEXT:    vpermt2b %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7d,0xca]
+; CHECK-NEXT:    vpermt2b %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x7d,0xda]
+; CHECK-NEXT:    vpermt2b %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7d,0xca]
 ; CHECK-NEXT:    vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4]
 ; CHECK-NEXT:    vpermt2b %xmm2, %xmm0, %xmm4 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7d,0xe2]
-; CHECK-NEXT:    vpaddb %xmm1, %xmm4, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc1]
-; CHECK-NEXT:    vpaddb %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfc,0xc0]
+; CHECK-NEXT:    vpaddb %xmm3, %xmm4, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xfc,0xc3]
+; CHECK-NEXT:    vpaddb %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfc,0xc0]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
   %res1 = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> zeroinitializer, <16 x i8> %x2, i16 %x3)
@@ -174,7 +174,7 @@ declare <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8>, <16 x i8>,
 define <16 x i8>@test_int_x86_avx512_maskz_vpermt2var_qi_128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3) {
 ; CHECK-LABEL: test_int_x86_avx512_maskz_vpermt2var_qi_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT:    vpermi2b %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0x89,0x75,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i8> @llvm.x86.avx512.maskz.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 %x3)
diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
index c63d47d780d1a5bd96f4c52373a32283d760a3c4..4d906a4fd29a24a4bc58e7d83f39e8b689fe4bae 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -980,6 +980,7 @@ define i8 @test_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x76,0xc1]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
   ret i8 %res
@@ -991,6 +992,7 @@ define i8 @test_mask_pcmpeq_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x76,0xc1]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
   ret i8 %res
@@ -1005,6 +1007,7 @@ define i8 @test_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b) {
 ; CHECK-NEXT:    kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
 ; CHECK-NEXT:    kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
   ret i8 %res
@@ -1018,6 +1021,7 @@ define i8 @test_mask_pcmpeq_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
 ; CHECK-NEXT:    kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
 ; CHECK-NEXT:    kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
   ret i8 %res
@@ -1030,6 +1034,7 @@ define i8 @test_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7d,0x28,0x66,0xc1]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 -1)
   ret i8 %res
@@ -1041,6 +1046,7 @@ define i8 @test_mask_pcmpgt_d_256(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0x66,0xc1]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.256(<8 x i32> %a, <8 x i32> %b, i8 %mask)
   ret i8 %res
@@ -1055,6 +1061,7 @@ define i8 @test_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b) {
 ; CHECK-NEXT:    kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
 ; CHECK-NEXT:    kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 -1)
   ret i8 %res
@@ -1068,6 +1075,7 @@ define i8 @test_mask_pcmpgt_q_256(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
 ; CHECK-NEXT:    kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
 ; CHECK-NEXT:    kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.256(<4 x i64> %a, <4 x i64> %b, i8 %mask)
   ret i8 %res
@@ -1082,6 +1090,7 @@ define i8 @test_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-NEXT:    kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
 ; CHECK-NEXT:    kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
   ret i8 %res
@@ -1095,6 +1104,7 @@ define i8 @test_mask_pcmpeq_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
 ; CHECK-NEXT:    kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
 ; CHECK-NEXT:    kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpeq.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
   ret i8 %res
@@ -1111,6 +1121,7 @@ define i8 @test_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-NEXT:    kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
 ; CHECK-NEXT:    kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
   ret i8 %res
@@ -1126,6 +1137,7 @@ define i8 @test_mask_pcmpeq_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
 ; CHECK-NEXT:    kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
 ; CHECK-NEXT:    kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpeq.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
   ret i8 %res
@@ -1140,6 +1152,7 @@ define i8 @test_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-NEXT:    kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
 ; CHECK-NEXT:    kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 -1)
   ret i8 %res
@@ -1153,6 +1166,7 @@ define i8 @test_mask_pcmpgt_d_128(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
 ; CHECK-NEXT:    kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
 ; CHECK-NEXT:    kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpgt.d.128(<4 x i32> %a, <4 x i32> %b, i8 %mask)
   ret i8 %res
@@ -1169,6 +1183,7 @@ define i8 @test_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-NEXT:    kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
 ; CHECK-NEXT:    kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 -1)
   ret i8 %res
@@ -1184,6 +1199,7 @@ define i8 @test_mask_pcmpgt_q_128(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
 ; CHECK-NEXT:    kshiftlw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x32,0xc0,0x0c]
 ; CHECK-NEXT:    kshiftrw $12, %k0, %k0 ## encoding: [0xc4,0xe3,0xf9,0x30,0xc0,0x0c]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.mask.pcmpgt.q.128(<2 x i64> %a, <2 x i64> %b, i8 %mask)
   ret i8 %res
@@ -2106,7 +2122,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pxor.d.256(<8 x i32>, <8 x i32>, <8 x i3
 define <4 x i32> @test_mask_andnot_epi32_rr_128(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-LABEL: test_mask_andnot_epi32_rr_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpandnd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdf,0xc1]
+; CHECK-NEXT:    vpandn %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
   ret <4 x i32> %res
@@ -2136,7 +2152,7 @@ define <4 x i32> @test_mask_andnot_epi32_rrkz_128(<4 x i32> %a, <4 x i32> %b, i8
 define <4 x i32> @test_mask_andnot_epi32_rm_128(<4 x i32> %a, <4 x i32>* %ptr_b) {
 ; CHECK-LABEL: test_mask_andnot_epi32_rm_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpandnd (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0xdf,0x07]
+; CHECK-NEXT:    vpandn (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <4 x i32>, <4 x i32>* %ptr_b
   %res = call <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32> %a, <4 x i32> %b, <4 x i32> zeroinitializer, i8 -1)
@@ -2210,7 +2226,7 @@ declare <4 x i32> @llvm.x86.avx512.mask.pandn.d.128(<4 x i32>, <4 x i32>, <4 x i
 define <8 x i32> @test_mask_andnot_epi32_rr_256(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-LABEL: test_mask_andnot_epi32_rr_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpandnd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdf,0xc1]
+; CHECK-NEXT:    vpandn %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
   ret <8 x i32> %res
@@ -2240,7 +2256,7 @@ define <8 x i32> @test_mask_andnot_epi32_rrkz_256(<8 x i32> %a, <8 x i32> %b, i8
 define <8 x i32> @test_mask_andnot_epi32_rm_256(<8 x i32> %a, <8 x i32>* %ptr_b) {
 ; CHECK-LABEL: test_mask_andnot_epi32_rm_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpandnd (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0xdf,0x07]
+; CHECK-NEXT:    vpandn (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <8 x i32>, <8 x i32>* %ptr_b
   %res = call <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32> %a, <8 x i32> %b, <8 x i32> zeroinitializer, i8 -1)
@@ -2314,7 +2330,7 @@ declare <8 x i32> @llvm.x86.avx512.mask.pandn.d.256(<8 x i32>, <8 x i32>, <8 x i
 define <2 x i64> @test_mask_andnot_epi64_rr_128(<2 x i64> %a, <2 x i64> %b) {
 ; CHECK-LABEL: test_mask_andnot_epi64_rr_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpandnq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xdf,0xc1]
+; CHECK-NEXT:    vpandn %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
   ret <2 x i64> %res
@@ -2344,7 +2360,7 @@ define <2 x i64> @test_mask_andnot_epi64_rrkz_128(<2 x i64> %a, <2 x i64> %b, i8
 define <2 x i64> @test_mask_andnot_epi64_rm_128(<2 x i64> %a, <2 x i64>* %ptr_b) {
 ; CHECK-LABEL: test_mask_andnot_epi64_rm_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpandnq (%rdi), %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0xdf,0x07]
+; CHECK-NEXT:    vpandn (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xdf,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <2 x i64>, <2 x i64>* %ptr_b
   %res = call <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64> %a, <2 x i64> %b, <2 x i64> zeroinitializer, i8 -1)
@@ -2418,7 +2434,7 @@ declare <2 x i64> @llvm.x86.avx512.mask.pandn.q.128(<2 x i64>, <2 x i64>, <2 x i
 define <4 x i64> @test_mask_andnot_epi64_rr_256(<4 x i64> %a, <4 x i64> %b) {
 ; CHECK-LABEL: test_mask_andnot_epi64_rr_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpandnq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xdf,0xc1]
+; CHECK-NEXT:    vpandn %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
   ret <4 x i64> %res
@@ -2448,7 +2464,7 @@ define <4 x i64> @test_mask_andnot_epi64_rrkz_256(<4 x i64> %a, <4 x i64> %b, i8
 define <4 x i64> @test_mask_andnot_epi64_rm_256(<4 x i64> %a, <4 x i64>* %ptr_b) {
 ; CHECK-LABEL: test_mask_andnot_epi64_rm_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpandnq (%rdi), %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0xdf,0x07]
+; CHECK-NEXT:    vpandn (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xdf,0x07]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %b = load <4 x i64>, <4 x i64>* %ptr_b
   %res = call <4 x i64> @llvm.x86.avx512.mask.pandn.q.256(<4 x i64> %a, <4 x i64> %b, <4 x i64> zeroinitializer, i8 -1)
@@ -4833,3 +4849,128 @@ define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i3
   %res4 = add <8 x i32> %res2, %res3
   ret <8 x i32> %res4
 }
+
+define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+; CHECK-LABEL: test_mm512_maskz_max_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmaxps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_mask_max_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
+; CHECK-LABEL: test_mm512_mask_max_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmaxps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5f,0xd1]
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+; CHECK-LABEL: test_mm512_max_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmaxps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+; CHECK-LABEL: test_mm512_maskz_max_ps_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
+; CHECK-LABEL: test_mm512_mask_max_ps_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmaxps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5f,0xd1]
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+; CHECK-LABEL: test_mm512_max_ps_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <8 x float> @test_mm512_maskz_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+; CHECK-LABEL: test_mm512_maskz_min_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vminps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5d,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_mask_min_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
+; CHECK-LABEL: test_mm512_mask_min_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vminps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5d,0xd1]
+; CHECK-NEXT:    vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
+  ret <8 x float> %res
+}
+
+define <8 x float> @test_mm512_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
+; CHECK-LABEL: test_mm512_min_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vminps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5d,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+; CHECK-LABEL: test_mm512_maskz_min_ps_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vminps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5d,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
+; CHECK-LABEL: test_mm512_mask_min_ps_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vminps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5d,0xd1]
+; CHECK-NEXT:    vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+; CHECK-LABEL: test_mm512_min_ps_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vminps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5d,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll
index 8a2daf8685977cb0aff9eb83374098d3542a2fa8..1f324d67956497c54be73b8f3d6ce71e59dda9d6 100644
--- a/test/CodeGen/X86/avx512vl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -6,29 +6,29 @@
 define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: test_cmp_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd9,0x00]
-; CHECK-NEXT:    vpcmpltd %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xe1,0x01]
-; CHECK-NEXT:    vpcmpled %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xe9,0x02]
-; CHECK-NEXT:    vpcmpunordd %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xf1,0x03]
-; CHECK-NEXT:    vpcmpneqd %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xf9,0x04]
-; CHECK-NEXT:    vpcmpnltd %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnled %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc1,0x07]
-; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltd %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xc9,0x01]
+; CHECK-NEXT:    vpcmpled %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd1,0x02]
+; CHECK-NEXT:    vpcmpunordd %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xd9,0x03]
+; CHECK-NEXT:    vpcmpneqd %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xe1,0x04]
+; CHECK-NEXT:    vpcmpnltd %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xe9,0x05]
+; CHECK-NEXT:    vpcmpnled %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xf1,0x06]
+; CHECK-NEXT:    vpcmpordd %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x28,0x1f,0xf9,0x07]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
@@ -53,30 +53,30 @@ define <8 x i8> @test_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
 define <8 x i8> @test_mask_cmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
 ; CHECK-LABEL: test_mask_cmp_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k3 ## encoding: [0xc5,0xf8,0x92,0xdf]
-; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k4 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1f,0xe1,0x00]
-; CHECK-NEXT:    vpcmpltd %ymm1, %ymm0, %k5 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1f,0xe9,0x01]
-; CHECK-NEXT:    vpcmpled %ymm1, %ymm0, %k6 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1f,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunordd %ymm1, %ymm0, %k7 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1f,0xf9,0x03]
-; CHECK-NEXT:    vpcmpneqd %ymm1, %ymm0, %k0 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1f,0xc1,0x04]
-; CHECK-NEXT:    vpcmpnltd %ymm1, %ymm0, %k2 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1f,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnled %ymm1, %ymm0, %k1 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1f,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordd %ymm1, %ymm0, %k3 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1f,0xd9,0x07]
-; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT:    kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpcmpeqd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltd %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xd1,0x01]
+; CHECK-NEXT:    vpcmpled %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xd9,0x02]
+; CHECK-NEXT:    vpcmpunordd %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xe1,0x03]
+; CHECK-NEXT:    vpcmpneqd %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xe9,0x04]
+; CHECK-NEXT:    vpcmpnltd %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xf1,0x05]
+; CHECK-NEXT:    vpcmpnled %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xf9,0x06]
+; CHECK-NEXT:    vpcmpordd %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1f,0xc9,0x07]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
-; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
@@ -103,29 +103,29 @@ declare i8 @llvm.x86.avx512.mask.cmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounwi
 define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: test_ucmp_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpequd %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xd9,0x00]
-; CHECK-NEXT:    vpcmpltud %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xe1,0x01]
-; CHECK-NEXT:    vpcmpleud %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xe9,0x02]
-; CHECK-NEXT:    vpcmpunordud %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xf1,0x03]
-; CHECK-NEXT:    vpcmpnequd %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xf9,0x04]
-; CHECK-NEXT:    vpcmpnltud %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleud %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordud %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc1,0x07]
-; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    kmovw %k3, %ecx ## encoding: [0xc5,0xf8,0x93,0xcb]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    vpcmpequd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltud %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xc9,0x01]
+; CHECK-NEXT:    vpcmpleud %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xd1,0x02]
+; CHECK-NEXT:    vpcmpunordud %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xd9,0x03]
+; CHECK-NEXT:    vpcmpnequd %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xe1,0x04]
+; CHECK-NEXT:    vpcmpnltud %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xe9,0x05]
+; CHECK-NEXT:    vpcmpnleud %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xf1,0x06]
+; CHECK-NEXT:    vpcmpordud %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x28,0x1e,0xf9,0x07]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 -1)
@@ -150,30 +150,30 @@ define <8 x i8> @test_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1) {
 define <8 x i8> @test_mask_ucmp_d_256(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
 ; CHECK-LABEL: test_mask_ucmp_d_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k3 ## encoding: [0xc5,0xf8,0x92,0xdf]
-; CHECK-NEXT:    vpcmpequd %ymm1, %ymm0, %k4 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1e,0xe1,0x00]
-; CHECK-NEXT:    vpcmpltud %ymm1, %ymm0, %k5 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1e,0xe9,0x01]
-; CHECK-NEXT:    vpcmpleud %ymm1, %ymm0, %k6 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1e,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunordud %ymm1, %ymm0, %k7 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1e,0xf9,0x03]
-; CHECK-NEXT:    vpcmpnequd %ymm1, %ymm0, %k0 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1e,0xc1,0x04]
-; CHECK-NEXT:    vpcmpnltud %ymm1, %ymm0, %k2 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1e,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleud %ymm1, %ymm0, %k1 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1e,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordud %ymm1, %ymm0, %k3 {%k3} ## encoding: [0x62,0xf3,0x7d,0x2b,0x1e,0xd9,0x07]
-; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
-; CHECK-NEXT:    kmovw %k4, %ecx ## encoding: [0xc5,0xf8,0x93,0xcc]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpcmpequd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltud %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xd1,0x01]
+; CHECK-NEXT:    vpcmpleud %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xd9,0x02]
+; CHECK-NEXT:    vpcmpunordud %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xe1,0x03]
+; CHECK-NEXT:    vpcmpnequd %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xe9,0x04]
+; CHECK-NEXT:    vpcmpnltud %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xf1,0x05]
+; CHECK-NEXT:    vpcmpnleud %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xf9,0x06]
+; CHECK-NEXT:    vpcmpordud %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x1e,0xc9,0x07]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
-; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32> %a0, <8 x i32> %a1, i32 0, i8 %mask)
@@ -200,29 +200,29 @@ declare i8 @llvm.x86.avx512.mask.ucmp.d.256(<8 x i32>, <8 x i32>, i32, i8) nounw
 define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_cmp_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xe9,0x00]
-; CHECK-NEXT:    vpcmpltq %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xf9,0x01]
-; CHECK-NEXT:    vpcmpleq %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunordq %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xe1,0x03]
-; CHECK-NEXT:    vpcmpneqq %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltq %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc1,0x07]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
-; CHECK-NEXT:    kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xc9,0x01]
+; CHECK-NEXT:    vpcmpleq %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd1,0x02]
+; CHECK-NEXT:    vpcmpunordq %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xd9,0x03]
+; CHECK-NEXT:    vpcmpneqq %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xe1,0x04]
+; CHECK-NEXT:    vpcmpnltq %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xe9,0x05]
+; CHECK-NEXT:    vpcmpnleq %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xf1,0x06]
+; CHECK-NEXT:    vpcmpordq %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x1f,0xf9,0x07]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
 ; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
@@ -247,30 +247,30 @@ define <8 x i8> @test_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
 define <8 x i8> @test_mask_cmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_mask_cmp_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k7 ## encoding: [0xc5,0xf8,0x92,0xff]
-; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k5 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1f,0xe9,0x00]
-; CHECK-NEXT:    vpcmpltq %ymm1, %ymm0, %k0 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1f,0xc1,0x01]
-; CHECK-NEXT:    vpcmpleq %ymm1, %ymm0, %k6 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1f,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunordq %ymm1, %ymm0, %k4 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1f,0xe1,0x03]
-; CHECK-NEXT:    vpcmpneqq %ymm1, %ymm0, %k3 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1f,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltq %ymm1, %ymm0, %k2 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1f,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleq %ymm1, %ymm0, %k1 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1f,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordq %ymm1, %ymm0, %k7 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1f,0xf9,0x07]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT:    kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpcmpeqq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltq %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xd1,0x01]
+; CHECK-NEXT:    vpcmpleq %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xd9,0x02]
+; CHECK-NEXT:    vpcmpunordq %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe1,0x03]
+; CHECK-NEXT:    vpcmpneqq %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xe9,0x04]
+; CHECK-NEXT:    vpcmpnltq %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xf1,0x05]
+; CHECK-NEXT:    vpcmpnleq %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xf9,0x06]
+; CHECK-NEXT:    vpcmpordq %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1f,0xc9,0x07]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
 ; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
 ; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
@@ -297,29 +297,29 @@ declare i8 @llvm.x86.avx512.mask.cmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounwi
 define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_ucmp_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpequq %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe9,0x00]
-; CHECK-NEXT:    vpcmpltuq %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xf9,0x01]
-; CHECK-NEXT:    vpcmpleuq %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunorduq %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe1,0x03]
-; CHECK-NEXT:    vpcmpnequq %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltuq %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleuq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc9,0x06]
-; CHECK-NEXT:    vpcmporduq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc1,0x07]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
-; CHECK-NEXT:    kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    vpcmpequq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltuq %ymm1, %ymm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xc9,0x01]
+; CHECK-NEXT:    vpcmpleuq %ymm1, %ymm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xd1,0x02]
+; CHECK-NEXT:    vpcmpunorduq %ymm1, %ymm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xd9,0x03]
+; CHECK-NEXT:    vpcmpnequq %ymm1, %ymm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe1,0x04]
+; CHECK-NEXT:    vpcmpnltuq %ymm1, %ymm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xe9,0x05]
+; CHECK-NEXT:    vpcmpnleuq %ymm1, %ymm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xf1,0x06]
+; CHECK-NEXT:    vpcmporduq %ymm1, %ymm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x28,0x1e,0xf9,0x07]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
 ; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 -1)
@@ -344,30 +344,30 @@ define <8 x i8> @test_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1) {
 define <8 x i8> @test_mask_ucmp_q_256(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_mask_ucmp_q_256:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k7 ## encoding: [0xc5,0xf8,0x92,0xff]
-; CHECK-NEXT:    vpcmpequq %ymm1, %ymm0, %k5 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1e,0xe9,0x00]
-; CHECK-NEXT:    vpcmpltuq %ymm1, %ymm0, %k0 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1e,0xc1,0x01]
-; CHECK-NEXT:    vpcmpleuq %ymm1, %ymm0, %k6 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1e,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunorduq %ymm1, %ymm0, %k4 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1e,0xe1,0x03]
-; CHECK-NEXT:    vpcmpnequq %ymm1, %ymm0, %k3 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1e,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltuq %ymm1, %ymm0, %k2 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1e,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleuq %ymm1, %ymm0, %k1 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1e,0xc9,0x06]
-; CHECK-NEXT:    vpcmporduq %ymm1, %ymm0, %k7 {%k7} ## encoding: [0x62,0xf3,0xfd,0x2f,0x1e,0xf9,0x07]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT:    kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpcmpequq %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltuq %ymm1, %ymm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd1,0x01]
+; CHECK-NEXT:    vpcmpleuq %ymm1, %ymm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xd9,0x02]
+; CHECK-NEXT:    vpcmpunorduq %ymm1, %ymm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xe1,0x03]
+; CHECK-NEXT:    vpcmpnequq %ymm1, %ymm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xe9,0x04]
+; CHECK-NEXT:    vpcmpnltuq %ymm1, %ymm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xf1,0x05]
+; CHECK-NEXT:    vpcmpnleuq %ymm1, %ymm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xf9,0x06]
+; CHECK-NEXT:    vpcmporduq %ymm1, %ymm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x1e,0xc9,0x07]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
 ; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
 ; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64> %a0, <4 x i64> %a1, i32 0, i8 %mask)
@@ -396,29 +396,29 @@ declare i8 @llvm.x86.avx512.mask.ucmp.q.256(<4 x i64>, <4 x i64>, i32, i8) nounw
 define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: test_cmp_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xe9,0x00]
-; CHECK-NEXT:    vpcmpltd %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xf9,0x01]
-; CHECK-NEXT:    vpcmpled %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunordd %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xe1,0x03]
-; CHECK-NEXT:    vpcmpneqd %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltd %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnled %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc1,0x07]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
-; CHECK-NEXT:    kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltd %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xc9,0x01]
+; CHECK-NEXT:    vpcmpled %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd1,0x02]
+; CHECK-NEXT:    vpcmpunordd %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xd9,0x03]
+; CHECK-NEXT:    vpcmpneqd %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xe1,0x04]
+; CHECK-NEXT:    vpcmpnltd %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xe9,0x05]
+; CHECK-NEXT:    vpcmpnled %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xf1,0x06]
+; CHECK-NEXT:    vpcmpordd %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x1f,0xf9,0x07]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
 ; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
@@ -443,30 +443,30 @@ define <8 x i8> @test_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
 define <8 x i8> @test_mask_cmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
 ; CHECK-LABEL: test_mask_cmp_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k7 ## encoding: [0xc5,0xf8,0x92,0xff]
-; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k5 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1f,0xe9,0x00]
-; CHECK-NEXT:    vpcmpltd %xmm1, %xmm0, %k0 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1f,0xc1,0x01]
-; CHECK-NEXT:    vpcmpled %xmm1, %xmm0, %k6 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1f,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunordd %xmm1, %xmm0, %k4 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1f,0xe1,0x03]
-; CHECK-NEXT:    vpcmpneqd %xmm1, %xmm0, %k3 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1f,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltd %xmm1, %xmm0, %k2 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1f,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnled %xmm1, %xmm0, %k1 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1f,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordd %xmm1, %xmm0, %k7 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1f,0xf9,0x07]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT:    kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpcmpeqd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltd %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xd1,0x01]
+; CHECK-NEXT:    vpcmpled %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xd9,0x02]
+; CHECK-NEXT:    vpcmpunordd %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe1,0x03]
+; CHECK-NEXT:    vpcmpneqd %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xe9,0x04]
+; CHECK-NEXT:    vpcmpnltd %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xf1,0x05]
+; CHECK-NEXT:    vpcmpnled %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xf9,0x06]
+; CHECK-NEXT:    vpcmpordd %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1f,0xc9,0x07]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
 ; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
 ; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
@@ -493,29 +493,29 @@ declare i8 @llvm.x86.avx512.mask.cmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounwi
 define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: test_ucmp_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpequd %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe9,0x00]
-; CHECK-NEXT:    vpcmpltud %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xf9,0x01]
-; CHECK-NEXT:    vpcmpleud %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunordud %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe1,0x03]
-; CHECK-NEXT:    vpcmpnequd %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltud %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleud %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordud %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc1,0x07]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
-; CHECK-NEXT:    kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    vpcmpequd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltud %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xc9,0x01]
+; CHECK-NEXT:    vpcmpleud %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xd1,0x02]
+; CHECK-NEXT:    vpcmpunordud %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xd9,0x03]
+; CHECK-NEXT:    vpcmpnequd %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe1,0x04]
+; CHECK-NEXT:    vpcmpnltud %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xe9,0x05]
+; CHECK-NEXT:    vpcmpnleud %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xf1,0x06]
+; CHECK-NEXT:    vpcmpordud %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0x7d,0x08,0x1e,0xf9,0x07]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
 ; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 -1)
@@ -540,30 +540,30 @@ define <8 x i8> @test_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1) {
 define <8 x i8> @test_mask_ucmp_d_128(<4 x i32> %a0, <4 x i32> %a1, i8 %mask) {
 ; CHECK-LABEL: test_mask_ucmp_d_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k7 ## encoding: [0xc5,0xf8,0x92,0xff]
-; CHECK-NEXT:    vpcmpequd %xmm1, %xmm0, %k5 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1e,0xe9,0x00]
-; CHECK-NEXT:    vpcmpltud %xmm1, %xmm0, %k0 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1e,0xc1,0x01]
-; CHECK-NEXT:    vpcmpleud %xmm1, %xmm0, %k6 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1e,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunordud %xmm1, %xmm0, %k4 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1e,0xe1,0x03]
-; CHECK-NEXT:    vpcmpnequd %xmm1, %xmm0, %k3 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1e,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltud %xmm1, %xmm0, %k2 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1e,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleud %xmm1, %xmm0, %k1 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1e,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordud %xmm1, %xmm0, %k7 {%k7} ## encoding: [0x62,0xf3,0x7d,0x0f,0x1e,0xf9,0x07]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT:    kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpcmpequd %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltud %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd1,0x01]
+; CHECK-NEXT:    vpcmpleud %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xd9,0x02]
+; CHECK-NEXT:    vpcmpunordud %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xe1,0x03]
+; CHECK-NEXT:    vpcmpnequd %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xe9,0x04]
+; CHECK-NEXT:    vpcmpnltud %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xf1,0x05]
+; CHECK-NEXT:    vpcmpnleud %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xf9,0x06]
+; CHECK-NEXT:    vpcmpordud %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x1e,0xc9,0x07]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
 ; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
 ; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32> %a0, <4 x i32> %a1, i32 0, i8 %mask)
@@ -590,29 +590,29 @@ declare i8 @llvm.x86.avx512.mask.ucmp.d.128(<4 x i32>, <4 x i32>, i32, i8) nounw
 define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
 ; CHECK-LABEL: test_cmp_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xe9,0x00]
-; CHECK-NEXT:    vpcmpltq %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xf9,0x01]
-; CHECK-NEXT:    vpcmpleq %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunordq %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xe1,0x03]
-; CHECK-NEXT:    vpcmpneqq %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltq %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc1,0x07]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
-; CHECK-NEXT:    kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xc9,0x01]
+; CHECK-NEXT:    vpcmpleq %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd1,0x02]
+; CHECK-NEXT:    vpcmpunordq %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xd9,0x03]
+; CHECK-NEXT:    vpcmpneqq %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xe1,0x04]
+; CHECK-NEXT:    vpcmpnltq %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xe9,0x05]
+; CHECK-NEXT:    vpcmpnleq %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xf1,0x06]
+; CHECK-NEXT:    vpcmpordq %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x1f,0xf9,0x07]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
 ; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
@@ -637,30 +637,30 @@ define <8 x i8> @test_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
 define <8 x i8> @test_mask_cmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_mask_cmp_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k7 ## encoding: [0xc5,0xf8,0x92,0xff]
-; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k5 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1f,0xe9,0x00]
-; CHECK-NEXT:    vpcmpltq %xmm1, %xmm0, %k0 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1f,0xc1,0x01]
-; CHECK-NEXT:    vpcmpleq %xmm1, %xmm0, %k6 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1f,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunordq %xmm1, %xmm0, %k4 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1f,0xe1,0x03]
-; CHECK-NEXT:    vpcmpneqq %xmm1, %xmm0, %k3 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1f,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltq %xmm1, %xmm0, %k2 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1f,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleq %xmm1, %xmm0, %k1 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1f,0xc9,0x06]
-; CHECK-NEXT:    vpcmpordq %xmm1, %xmm0, %k7 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1f,0xf9,0x07]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT:    kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpcmpeqq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltq %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xd1,0x01]
+; CHECK-NEXT:    vpcmpleq %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xd9,0x02]
+; CHECK-NEXT:    vpcmpunordq %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe1,0x03]
+; CHECK-NEXT:    vpcmpneqq %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xe9,0x04]
+; CHECK-NEXT:    vpcmpnltq %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xf1,0x05]
+; CHECK-NEXT:    vpcmpnleq %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xf9,0x06]
+; CHECK-NEXT:    vpcmpordq %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1f,0xc9,0x07]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
 ; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
 ; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
@@ -687,29 +687,29 @@ declare i8 @llvm.x86.avx512.mask.cmp.q.128(<2 x i64>, <2 x i64>, i32, i8) nounwi
 define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
 ; CHECK-LABEL: test_ucmp_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    vpcmpequq %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe9,0x00]
-; CHECK-NEXT:    vpcmpltuq %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xf9,0x01]
-; CHECK-NEXT:    vpcmpleuq %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunorduq %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe1,0x03]
-; CHECK-NEXT:    vpcmpnequq %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltuq %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc9,0x06]
-; CHECK-NEXT:    vpcmporduq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc1,0x07]
-; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
-; CHECK-NEXT:    kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    vpcmpequq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltuq %xmm1, %xmm0, %k1 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xc9,0x01]
+; CHECK-NEXT:    vpcmpleuq %xmm1, %xmm0, %k2 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xd1,0x02]
+; CHECK-NEXT:    vpcmpunorduq %xmm1, %xmm0, %k3 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xd9,0x03]
+; CHECK-NEXT:    vpcmpnequq %xmm1, %xmm0, %k4 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe1,0x04]
+; CHECK-NEXT:    vpcmpnltuq %xmm1, %xmm0, %k5 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xe9,0x05]
+; CHECK-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k6 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xf1,0x06]
+; CHECK-NEXT:    vpcmporduq %xmm1, %xmm0, %k7 ## encoding: [0x62,0xf3,0xfd,0x08,0x1e,0xf9,0x07]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
-; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
-; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
 ; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
+; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
+; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 -1)
@@ -734,30 +734,30 @@ define <8 x i8> @test_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1) {
 define <8 x i8> @test_mask_ucmp_q_128(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
 ; CHECK-LABEL: test_mask_ucmp_q_128:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovw %edi, %k7 ## encoding: [0xc5,0xf8,0x92,0xff]
-; CHECK-NEXT:    vpcmpequq %xmm1, %xmm0, %k5 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1e,0xe9,0x00]
-; CHECK-NEXT:    vpcmpltuq %xmm1, %xmm0, %k0 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1e,0xc1,0x01]
-; CHECK-NEXT:    vpcmpleuq %xmm1, %xmm0, %k6 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1e,0xf1,0x02]
-; CHECK-NEXT:    vpcmpunorduq %xmm1, %xmm0, %k4 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1e,0xe1,0x03]
-; CHECK-NEXT:    vpcmpnequq %xmm1, %xmm0, %k3 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1e,0xd9,0x04]
-; CHECK-NEXT:    vpcmpnltuq %xmm1, %xmm0, %k2 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1e,0xd1,0x05]
-; CHECK-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k1 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1e,0xc9,0x06]
-; CHECK-NEXT:    vpcmporduq %xmm1, %xmm0, %k7 {%k7} ## encoding: [0x62,0xf3,0xfd,0x0f,0x1e,0xf9,0x07]
-; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
-; CHECK-NEXT:    kmovw %k5, %ecx ## encoding: [0xc5,0xf8,0x93,0xcd]
-; CHECK-NEXT:    vpinsrb $0, %ecx, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc1,0x00]
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpcmpequq %xmm1, %xmm0, %k0 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xc1,0x00]
+; CHECK-NEXT:    vpcmpltuq %xmm1, %xmm0, %k2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd1,0x01]
+; CHECK-NEXT:    vpcmpleuq %xmm1, %xmm0, %k3 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xd9,0x02]
+; CHECK-NEXT:    vpcmpunorduq %xmm1, %xmm0, %k4 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xe1,0x03]
+; CHECK-NEXT:    vpcmpnequq %xmm1, %xmm0, %k5 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xe9,0x04]
+; CHECK-NEXT:    vpcmpnltuq %xmm1, %xmm0, %k6 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xf1,0x05]
+; CHECK-NEXT:    vpcmpnleuq %xmm1, %xmm0, %k7 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xf9,0x06]
+; CHECK-NEXT:    vpcmporduq %xmm1, %xmm0, %k1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x1e,0xc9,0x07]
+; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    vmovd %ecx, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0xc1]
 ; CHECK-NEXT:    vpinsrb $2, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x02]
-; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
+; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
 ; CHECK-NEXT:    vpinsrb $4, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x04]
 ; CHECK-NEXT:    kmovw %k4, %eax ## encoding: [0xc5,0xf8,0x93,0xc4]
 ; CHECK-NEXT:    vpinsrb $6, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x06]
-; CHECK-NEXT:    kmovw %k3, %eax ## encoding: [0xc5,0xf8,0x93,0xc3]
+; CHECK-NEXT:    kmovw %k5, %eax ## encoding: [0xc5,0xf8,0x93,0xc5]
 ; CHECK-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x08]
-; CHECK-NEXT:    kmovw %k2, %eax ## encoding: [0xc5,0xf8,0x93,0xc2]
+; CHECK-NEXT:    kmovw %k6, %eax ## encoding: [0xc5,0xf8,0x93,0xc6]
 ; CHECK-NEXT:    vpinsrb $10, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0a]
-; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
-; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
 ; CHECK-NEXT:    kmovw %k7, %eax ## encoding: [0xc5,0xf8,0x93,0xc7]
+; CHECK-NEXT:    vpinsrb $12, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0c]
+; CHECK-NEXT:    kmovw %k1, %eax ## encoding: [0xc5,0xf8,0x93,0xc1]
 ; CHECK-NEXT:    vpinsrb $14, %eax, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x20,0xc0,0x0e]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res0 = call i8 @llvm.x86.avx512.mask.ucmp.q.128(<2 x i64> %a0, <2 x i64> %a1, i32 0, i8 %mask)
@@ -1498,6 +1498,7 @@ define i8 @test_cmpps_256(<8 x float> %a, <8 x float> %b) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vcmpleps %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x28,0xc2,0xc1,0x02]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
    %res = call i8 @llvm.x86.avx512.mask.cmp.ps.256(<8 x float> %a, <8 x float> %b, i32 2, i8 -1)
    ret i8 %res
@@ -1509,6 +1510,7 @@ define i8 @test_cmpps_128(<4 x float> %a, <4 x float> %b) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vcmpleps %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0x7c,0x08,0xc2,0xc1,0x02]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
    %res = call i8 @llvm.x86.avx512.mask.cmp.ps.128(<4 x float> %a, <4 x float> %b, i32 2, i8 -1)
    ret i8 %res
@@ -1520,6 +1522,7 @@ define i8 @test_cmppd_256(<4 x double> %a, <4 x double> %b) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vcmplepd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x28,0xc2,0xc1,0x02]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
    %res = call i8 @llvm.x86.avx512.mask.cmp.pd.256(<4 x double> %a, <4 x double> %b, i32 2, i8 -1)
    ret i8 %res
@@ -1531,6 +1534,7 @@ define i8 @test_cmppd_128(<2 x double> %a, <2 x double> %b) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vcmplepd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf1,0xfd,0x08,0xc2,0xc1,0x02]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
    %res = call i8 @llvm.x86.avx512.mask.cmp.pd.128(<2 x double> %a, <2 x double> %b, i32 2, i8 -1)
    ret i8 %res
@@ -1543,8 +1547,10 @@ define <8 x float> @test_mm512_maskz_max_ps_256(<8 x float> %a0, <8 x float> %a1
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmaxps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
-  ret <8 x float> %res
+  %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer
+  ret <8 x float> %3
 }
 
 define <8 x float> @test_mm512_mask_max_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
@@ -1554,8 +1560,10 @@ define <8 x float> @test_mm512_mask_max_ps_256(<8 x float> %a0, <8 x float> %a1,
 ; CHECK-NEXT:    vmaxps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5f,0xd1]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
-  ret <8 x float> %res
+  %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %src
+  ret <8 x float> %3
 }
 
 define <8 x float> @test_mm512_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
@@ -1563,10 +1571,10 @@ define <8 x float> @test_mm512_max_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vmaxps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
-  ret <8 x float> %res
+  %1 = call <8 x float> @llvm.x86.avx.max.ps.256(<8 x float> %a0, <8 x float> %a1)
+  ret <8 x float> %1
 }
-declare <8 x float> @llvm.x86.avx512.mask.max.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+declare <8 x float> @llvm.x86.avx.max.ps.256(<8 x float>, <8 x float>)
 
 define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
 ; CHECK-LABEL: test_mm512_maskz_max_ps_128:
@@ -1574,8 +1582,11 @@ define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
-  ret <4 x float> %res
+  %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> zeroinitializer
+  ret <4 x float> %3
 }
 
 define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
@@ -1585,8 +1596,11 @@ define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1,
 ; CHECK-NEXT:    vmaxps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5f,0xd1]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
-  ret <4 x float> %res
+  %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %src
+  ret <4 x float> %3
 }
 
 define <4 x float> @test_mm512_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
@@ -1594,10 +1608,10 @@ define <4 x float> @test_mm512_max_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vmaxps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5f,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
-  ret <4 x float> %res
+  %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %1
 }
-declare <4 x float> @llvm.x86.avx512.mask.max.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+declare <4 x float> @llvm.x86.sse.max.ps(<4 x float>, <4 x float>)
 
 define <8 x float> @test_mm512_maskz_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
 ; CHECK-LABEL: test_mm512_maskz_min_ps_256:
@@ -1605,8 +1619,10 @@ define <8 x float> @test_mm512_maskz_min_ps_256(<8 x float> %a0, <8 x float> %a1
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vminps %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x5d,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 %mask)
-  ret <8 x float> %res
+  %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> zeroinitializer
+  ret <8 x float> %3
 }
 
 define <8 x float> @test_mm512_mask_min_ps_256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask) {
@@ -1616,8 +1632,10 @@ define <8 x float> @test_mm512_mask_min_ps_256(<8 x float> %a0, <8 x float> %a1,
 ; CHECK-NEXT:    vminps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x29,0x5d,0xd1]
 ; CHECK-NEXT:    vmovaps %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %src, i8 %mask)
-  ret <8 x float> %res
+  %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x float> %1, <8 x float> %src
+  ret <8 x float> %3
 }
 
 define <8 x float> @test_mm512_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %mask) {
@@ -1625,10 +1643,10 @@ define <8 x float> @test_mm512_min_ps_256(<8 x float> %a0, <8 x float> %a1, i8 %
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vminps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5d,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float>zeroinitializer, i8 -1)
-  ret <8 x float> %res
+  %1 = call <8 x float> @llvm.x86.avx.min.ps.256(<8 x float> %a0, <8 x float> %a1)
+  ret <8 x float> %1
 }
-declare <8 x float> @llvm.x86.avx512.mask.min.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+declare <8 x float> @llvm.x86.avx.min.ps.256(<8 x float>, <8 x float>)
 
 define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
 ; CHECK-LABEL: test_mm512_maskz_min_ps_128:
@@ -1636,8 +1654,11 @@ define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vminps %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0x89,0x5d,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 %mask)
-  ret <4 x float> %res
+  %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> zeroinitializer
+  ret <4 x float> %3
 }
 
 define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask) {
@@ -1647,8 +1668,11 @@ define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> %a0, <4 x float> %a1,
 ; CHECK-NEXT:    vminps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7c,0x09,0x5d,0xd1]
 ; CHECK-NEXT:    vmovaps %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc2]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %src, i8 %mask)
-  ret <4 x float> %res
+  %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %src
+  ret <4 x float> %3
 }
 
 define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
@@ -1656,10 +1680,10 @@ define <4 x float> @test_mm512_min_ps_128(<4 x float> %a0, <4 x float> %a1, i8 %
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vminps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x5d,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float>zeroinitializer, i8 -1)
-  ret <4 x float> %res
+  %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1)
+  ret <4 x float> %1
 }
-declare <4 x float> @llvm.x86.avx512.mask.min.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
 
 define <4 x double> @test_sqrt_pd_256(<4 x double> %a0, i8 %mask) {
 ; CHECK-LABEL: test_sqrt_pd_256:
@@ -1712,9 +1736,9 @@ define <4 x i32>@test_int_x86_avx512_mask_vpermt2var_d_128(<4 x i32> %x0, <4 x i
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
-; CHECK-NEXT:    vpermt2d %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7e,0xda]
-; CHECK-NEXT:    vpermt2d %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xca]
-; CHECK-NEXT:    vpaddd %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc1]
+; CHECK-NEXT:    vpermt2d %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda]
+; CHECK-NEXT:    vpermt2d %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7e,0xca]
+; CHECK-NEXT:    vpaddd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
   %res1 = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -1729,9 +1753,9 @@ define <4 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_128(<4 x i32> %x0, <4 x
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd9]
-; CHECK-NEXT:    vpermt2d %xmm2, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7e,0xda]
-; CHECK-NEXT:    vpermt2d %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xca]
-; CHECK-NEXT:    vpaddd %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc1]
+; CHECK-NEXT:    vpermt2d %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x7e,0xda]
+; CHECK-NEXT:    vpermt2d %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7e,0xca]
+; CHECK-NEXT:    vpaddd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
   %res1 = call <4 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
@@ -1746,9 +1770,9 @@ define <8 x i32>@test_int_x86_avx512_mask_vpermt2var_d_256(<8 x i32> %x0, <8 x i
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
-; CHECK-NEXT:    vpermt2d %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7e,0xda]
-; CHECK-NEXT:    vpermt2d %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xca]
-; CHECK-NEXT:    vpaddd %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc1]
+; CHECK-NEXT:    vpermt2d %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda]
+; CHECK-NEXT:    vpermt2d %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7e,0xca]
+; CHECK-NEXT:    vpaddd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
   %res1 = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -1763,9 +1787,9 @@ define <8 x i32>@test_int_x86_avx512_maskz_vpermt2var_d_256(<8 x i32> %x0, <8 x
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd9]
-; CHECK-NEXT:    vpermt2d %ymm2, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7e,0xda]
-; CHECK-NEXT:    vpermt2d %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xca]
-; CHECK-NEXT:    vpaddd %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc1]
+; CHECK-NEXT:    vpermt2d %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x7e,0xda]
+; CHECK-NEXT:    vpermt2d %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7e,0xca]
+; CHECK-NEXT:    vpaddd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
   %res1 = call <8 x i32> @llvm.x86.avx512.maskz.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
@@ -1780,9 +1804,9 @@ define <2 x double>@test_int_x86_avx512_mask_vpermi2var_pd_128(<2 x double> %x0,
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
-; CHECK-NEXT:    vpermi2pd %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x77,0xda]
-; CHECK-NEXT:    vpermi2pd %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0xfd,0x08,0x77,0xca]
-; CHECK-NEXT:    vaddpd %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc1]
+; CHECK-NEXT:    vpermi2pd %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x77,0xda]
+; CHECK-NEXT:    vpermi2pd %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x77,0xca]
+; CHECK-NEXT:    vaddpd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 %x3)
   %res1 = call <2 x double> @llvm.x86.avx512.mask.vpermi2var.pd.128(<2 x double> %x0, <2 x i64> %x1, <2 x double> %x2, i8 -1)
@@ -1797,9 +1821,9 @@ define <4 x double>@test_int_x86_avx512_mask_vpermi2var_pd_256(<4 x double> %x0,
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
-; CHECK-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x77,0xda]
-; CHECK-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0xfd,0x28,0x77,0xca]
-; CHECK-NEXT:    vaddpd %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc1]
+; CHECK-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x77,0xda]
+; CHECK-NEXT:    vpermi2pd %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x77,0xca]
+; CHECK-NEXT:    vaddpd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3)
   %res1 = call <4 x double> @llvm.x86.avx512.mask.vpermi2var.pd.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 -1)
@@ -1814,9 +1838,9 @@ define <4 x float>@test_int_x86_avx512_mask_vpermi2var_ps_128(<4 x float> %x0, <
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
-; CHECK-NEXT:    vpermi2ps %xmm2, %xmm0, %xmm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x77,0xda]
-; CHECK-NEXT:    vpermi2ps %xmm2, %xmm0, %xmm1 ## encoding: [0x62,0xf2,0x7d,0x08,0x77,0xca]
-; CHECK-NEXT:    vaddps %xmm1, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc1]
+; CHECK-NEXT:    vpermi2ps %xmm2, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x77,0xda]
+; CHECK-NEXT:    vpermi2ps %xmm2, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x77,0xca]
+; CHECK-NEXT:    vaddps %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 %x3)
   %res1 = call <4 x float> @llvm.x86.avx512.mask.vpermi2var.ps.128(<4 x float> %x0, <4 x i32> %x1, <4 x float> %x2, i8 -1)
@@ -1843,9 +1867,9 @@ define <8 x float>@test_int_x86_avx512_mask_vpermi2var_ps_256(<8 x float> %x0, <
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
-; CHECK-NEXT:    vpermi2ps %ymm2, %ymm0, %ymm3 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x77,0xda]
-; CHECK-NEXT:    vpermi2ps %ymm2, %ymm0, %ymm1 ## encoding: [0x62,0xf2,0x7d,0x28,0x77,0xca]
-; CHECK-NEXT:    vaddps %ymm1, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc1]
+; CHECK-NEXT:    vpermi2ps %ymm2, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x77,0xda]
+; CHECK-NEXT:    vpermi2ps %ymm2, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x77,0xca]
+; CHECK-NEXT:    vaddps %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3)
   %res1 = call <8 x float> @llvm.x86.avx512.mask.vpermi2var.ps.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 -1)
@@ -1987,8 +2011,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_128(<2 x i64> %x0, <16 x i8> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovqb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x32,0xc1]
 ; CHECK-NEXT:    vpmovqb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x32,0xc2]
+; CHECK-NEXT:    vpmovqb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x32,0xc1]
 ; CHECK-NEXT:    vpmovqb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x32,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
@@ -2021,8 +2045,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_128(<2 x i64> %x0, <16 x i8>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovsqb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x22,0xc1]
 ; CHECK-NEXT:    vpmovsqb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x22,0xc2]
+; CHECK-NEXT:    vpmovsqb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x22,0xc1]
 ; CHECK-NEXT:    vpmovsqb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x22,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
@@ -2055,8 +2079,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_128(<2 x i64> %x0, <16 x i8>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovusqb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x12,0xc1]
 ; CHECK-NEXT:    vpmovusqb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x12,0xc2]
+; CHECK-NEXT:    vpmovusqb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x12,0xc1]
 ; CHECK-NEXT:    vpmovusqb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x12,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
@@ -2089,8 +2113,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_qb_256(<4 x i64> %x0, <16 x i8> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qb_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovqb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x32,0xc1]
 ; CHECK-NEXT:    vpmovqb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x32,0xc2]
+; CHECK-NEXT:    vpmovqb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x32,0xc1]
 ; CHECK-NEXT:    vpmovqb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x32,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
@@ -2123,8 +2147,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_qb_256(<4 x i64> %x0, <16 x i8>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qb_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovsqb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x22,0xc1]
 ; CHECK-NEXT:    vpmovsqb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x22,0xc2]
+; CHECK-NEXT:    vpmovsqb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x22,0xc1]
 ; CHECK-NEXT:    vpmovsqb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x22,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
@@ -2157,8 +2181,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_qb_256(<4 x i64> %x0, <16 x i8>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qb_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovusqb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x12,0xc1]
 ; CHECK-NEXT:    vpmovusqb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x12,0xc2]
+; CHECK-NEXT:    vpmovusqb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x12,0xc1]
 ; CHECK-NEXT:    vpmovusqb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x12,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
@@ -2191,8 +2215,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_128(<2 x i64> %x0, <8 x i16> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovqw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x34,0xc1]
 ; CHECK-NEXT:    vpmovqw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x34,0xc2]
+; CHECK-NEXT:    vpmovqw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x34,0xc1]
 ; CHECK-NEXT:    vpmovqw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x34,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
@@ -2225,8 +2249,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_128(<2 x i64> %x0, <8 x i16>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovsqw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x24,0xc1]
 ; CHECK-NEXT:    vpmovsqw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x24,0xc2]
+; CHECK-NEXT:    vpmovsqw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x24,0xc1]
 ; CHECK-NEXT:    vpmovsqw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x24,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
@@ -2259,8 +2283,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_128(<2 x i64> %x0, <8 x i16>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovusqw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x14,0xc1]
 ; CHECK-NEXT:    vpmovusqw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x14,0xc2]
+; CHECK-NEXT:    vpmovusqw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x14,0xc1]
 ; CHECK-NEXT:    vpmovusqw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x14,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
@@ -2293,8 +2317,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_qw_256(<4 x i64> %x0, <8 x i16> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qw_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovqw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x34,0xc1]
 ; CHECK-NEXT:    vpmovqw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x34,0xc2]
+; CHECK-NEXT:    vpmovqw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x34,0xc1]
 ; CHECK-NEXT:    vpmovqw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x34,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
@@ -2327,8 +2351,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_qw_256(<4 x i64> %x0, <8 x i16>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qw_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovsqw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x24,0xc1]
 ; CHECK-NEXT:    vpmovsqw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x24,0xc2]
+; CHECK-NEXT:    vpmovsqw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x24,0xc1]
 ; CHECK-NEXT:    vpmovsqw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x24,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
@@ -2361,8 +2385,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_qw_256(<4 x i64> %x0, <8 x i16>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qw_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovusqw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x14,0xc1]
 ; CHECK-NEXT:    vpmovusqw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x14,0xc2]
+; CHECK-NEXT:    vpmovusqw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x14,0xc1]
 ; CHECK-NEXT:    vpmovusqw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x14,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
@@ -2395,8 +2419,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_128(<2 x i64> %x0, <4 x i32> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovqd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x35,0xc1]
 ; CHECK-NEXT:    vpmovqd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x35,0xc2]
+; CHECK-NEXT:    vpmovqd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x35,0xc1]
 ; CHECK-NEXT:    vpmovqd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x35,0xc0]
 ; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
 ; CHECK-NEXT:    vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2]
@@ -2429,8 +2453,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_128(<2 x i64> %x0, <4 x i32>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovsqd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x25,0xc1]
 ; CHECK-NEXT:    vpmovsqd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x25,0xc2]
+; CHECK-NEXT:    vpmovsqd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x25,0xc1]
 ; CHECK-NEXT:    vpmovsqd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x25,0xc0]
 ; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
 ; CHECK-NEXT:    vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2]
@@ -2463,8 +2487,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_128(<2 x i64> %x0, <4 x i32>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovusqd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x15,0xc1]
 ; CHECK-NEXT:    vpmovusqd %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x15,0xc2]
+; CHECK-NEXT:    vpmovusqd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x15,0xc1]
 ; CHECK-NEXT:    vpmovusqd %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x15,0xc0]
 ; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
 ; CHECK-NEXT:    vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2]
@@ -2497,8 +2521,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pmov_qd_256(<4 x i64> %x0, <4 x i32> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_qd_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovqd %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x35,0xc1]
 ; CHECK-NEXT:    vpmovqd %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x35,0xc2]
+; CHECK-NEXT:    vpmovqd %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x35,0xc1]
 ; CHECK-NEXT:    vpmovqd %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x35,0xc0]
 ; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
 ; CHECK-NEXT:    vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2]
@@ -2531,8 +2555,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovs_qd_256(<4 x i64> %x0, <4 x i32>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_qd_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovsqd %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x25,0xc1]
 ; CHECK-NEXT:    vpmovsqd %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x25,0xc2]
+; CHECK-NEXT:    vpmovsqd %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x25,0xc1]
 ; CHECK-NEXT:    vpmovsqd %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x25,0xc0]
 ; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
 ; CHECK-NEXT:    vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2]
@@ -2565,8 +2589,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pmovus_qd_256(<4 x i64> %x0, <4 x i32>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_qd_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovusqd %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x15,0xc1]
 ; CHECK-NEXT:    vpmovusqd %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x15,0xc2]
+; CHECK-NEXT:    vpmovusqd %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x15,0xc1]
 ; CHECK-NEXT:    vpmovusqd %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x15,0xc0]
 ; CHECK-NEXT:    vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1]
 ; CHECK-NEXT:    vpaddd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc2]
@@ -2599,8 +2623,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_128(<4 x i32> %x0, <16 x i8> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovdb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x31,0xc1]
 ; CHECK-NEXT:    vpmovdb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x31,0xc2]
+; CHECK-NEXT:    vpmovdb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x31,0xc1]
 ; CHECK-NEXT:    vpmovdb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x31,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
@@ -2633,8 +2657,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_128(<4 x i32> %x0, <16 x i8>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovsdb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x21,0xc1]
 ; CHECK-NEXT:    vpmovsdb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x21,0xc2]
+; CHECK-NEXT:    vpmovsdb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x21,0xc1]
 ; CHECK-NEXT:    vpmovsdb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x21,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
@@ -2667,8 +2691,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_128(<4 x i32> %x0, <16 x i8>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovusdb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x11,0xc1]
 ; CHECK-NEXT:    vpmovusdb %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x11,0xc2]
+; CHECK-NEXT:    vpmovusdb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x11,0xc1]
 ; CHECK-NEXT:    vpmovusdb %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x11,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
@@ -2701,8 +2725,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmov_db_256(<8 x i32> %x0, <16 x i8> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_db_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovdb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x31,0xc1]
 ; CHECK-NEXT:    vpmovdb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x31,0xc2]
+; CHECK-NEXT:    vpmovdb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x31,0xc1]
 ; CHECK-NEXT:    vpmovdb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x31,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
@@ -2735,8 +2759,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovs_db_256(<8 x i32> %x0, <16 x i8>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_db_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovsdb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x21,0xc1]
 ; CHECK-NEXT:    vpmovsdb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x21,0xc2]
+; CHECK-NEXT:    vpmovsdb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x21,0xc1]
 ; CHECK-NEXT:    vpmovsdb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x21,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
@@ -2769,8 +2793,8 @@ define <16 x i8>@test_int_x86_avx512_mask_pmovus_db_256(<8 x i32> %x0, <16 x i8>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_db_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovusdb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x11,0xc1]
 ; CHECK-NEXT:    vpmovusdb %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x11,0xc2]
+; CHECK-NEXT:    vpmovusdb %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x11,0xc1]
 ; CHECK-NEXT:    vpmovusdb %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x11,0xc0]
 ; CHECK-NEXT:    vpaddb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT:    vpaddb %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfc,0xc2]
@@ -2803,8 +2827,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_128(<4 x i32> %x0, <8 x i16> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovdw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x33,0xc1]
 ; CHECK-NEXT:    vpmovdw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x33,0xc2]
+; CHECK-NEXT:    vpmovdw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x33,0xc1]
 ; CHECK-NEXT:    vpmovdw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x33,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
@@ -2837,8 +2861,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_128(<4 x i32> %x0, <8 x i16>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovsdw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x23,0xc1]
 ; CHECK-NEXT:    vpmovsdw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x23,0xc2]
+; CHECK-NEXT:    vpmovsdw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x23,0xc1]
 ; CHECK-NEXT:    vpmovsdw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x23,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
@@ -2871,8 +2895,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_128(<4 x i32> %x0, <8 x i16>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovusdw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x13,0xc1]
 ; CHECK-NEXT:    vpmovusdw %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0x89,0x13,0xc2]
+; CHECK-NEXT:    vpmovusdw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x09,0x13,0xc1]
 ; CHECK-NEXT:    vpmovusdw %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x08,0x13,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
@@ -2905,8 +2929,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmov_dw_256(<8 x i32> %x0, <8 x i16> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmov_dw_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovdw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x33,0xc1]
 ; CHECK-NEXT:    vpmovdw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x33,0xc2]
+; CHECK-NEXT:    vpmovdw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x33,0xc1]
 ; CHECK-NEXT:    vpmovdw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x33,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
@@ -2939,8 +2963,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovs_dw_256(<8 x i32> %x0, <8 x i16>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovs_dw_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovsdw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x23,0xc1]
 ; CHECK-NEXT:    vpmovsdw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x23,0xc2]
+; CHECK-NEXT:    vpmovsdw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x23,0xc1]
 ; CHECK-NEXT:    vpmovsdw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x23,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
@@ -2973,8 +2997,8 @@ define <8 x i16>@test_int_x86_avx512_mask_pmovus_dw_256(<8 x i32> %x0, <8 x i16>
 ; CHECK-LABEL: test_int_x86_avx512_mask_pmovus_dw_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vpmovusdw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x13,0xc1]
 ; CHECK-NEXT:    vpmovusdw %ymm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7e,0xa9,0x13,0xc2]
+; CHECK-NEXT:    vpmovusdw %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x13,0xc1]
 ; CHECK-NEXT:    vpmovusdw %ymm0, %xmm0 ## encoding: [0x62,0xf2,0x7e,0x28,0x13,0xc0]
 ; CHECK-NEXT:    vpaddw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc1]
 ; CHECK-NEXT:    vpaddw %xmm2, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xfd,0xc2]
@@ -3545,10 +3569,10 @@ define <8 x float>@test_int_x86_avx512_mask_shuf_f32x4_256(<8 x float> %x0, <8 x
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f32x4_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vshuff32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x23,0xd1,0x16]
-; CHECK-NEXT:    ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; CHECK-NEXT:    vshuff32x4 $22, %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x23,0xd9,0x16]
 ; CHECK-NEXT:    ## ymm3 {%k1} {z} = ymm0[0,1,2,3],ymm1[4,5,6,7]
+; CHECK-NEXT:    vshuff32x4 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x23,0xd1,0x16]
+; CHECK-NEXT:    ## ymm2 {%k1} = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; CHECK-NEXT:    vshuff32x4 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x28,0x23,0xc1,0x16]
 ; CHECK-NEXT:    ## ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7]
 ; CHECK-NEXT:    vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0]
@@ -3568,10 +3592,10 @@ define <4 x double>@test_int_x86_avx512_mask_shuf_f64x2_256(<4 x double> %x0, <4
 ; CHECK-LABEL: test_int_x86_avx512_mask_shuf_f64x2_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vshuff64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x23,0xd1,0x16]
-; CHECK-NEXT:    ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3]
 ; CHECK-NEXT:    vshuff64x2 $22, %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x23,0xd9,0x16]
 ; CHECK-NEXT:    ## ymm3 {%k1} {z} = ymm0[0,1],ymm1[2,3]
+; CHECK-NEXT:    vshuff64x2 $22, %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x23,0xd1,0x16]
+; CHECK-NEXT:    ## ymm2 {%k1} = ymm0[0,1],ymm1[2,3]
 ; CHECK-NEXT:    vshuff64x2 $22, %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf3,0xfd,0x28,0x23,0xc1,0x16]
 ; CHECK-NEXT:    ## ymm0 = ymm0[0,1],ymm1[2,3]
 ; CHECK-NEXT:    vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0]
@@ -3627,8 +3651,8 @@ define <2 x double>@test_int_x86_avx512_mask_getmant_pd_128(<2 x double> %x0, <2
 ; CHECK-LABEL: test_int_x86_avx512_mask_getmant_pd_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vgetmantpd $11, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x26,0xc8,0x0b]
 ; CHECK-NEXT:    vgetmantpd $11, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0x89,0x26,0xd0,0x0b]
+; CHECK-NEXT:    vgetmantpd $11, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x09,0x26,0xc8,0x0b]
 ; CHECK-NEXT:    vgetmantpd $11, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0xfd,0x08,0x26,0xc0,0x0b]
 ; CHECK-NEXT:    vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
 ; CHECK-NEXT:    vaddpd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc0]
@@ -3696,9 +3720,9 @@ define <4 x i32>@test_int_x86_avx512_mask_pternlog_d_128(<4 x i32> %x0, <4 x i32
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; CHECK-NEXT:    vpternlogd $33, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0x75,0x09,0x25,0xda,0x21]
-; CHECK-NEXT:    vpternlogd $33, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0x75,0x08,0x25,0xc2,0x21]
-; CHECK-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; CHECK-NEXT:    vpternlogd $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0x75,0x08,0x25,0xda,0x21]
+; CHECK-NEXT:    vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x09,0x25,0xc2,0x21]
+; CHECK-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 %x4)
   %res1 = call <4 x i32> @llvm.x86.avx512.mask.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 -1)
@@ -3713,9 +3737,9 @@ define <4 x i32>@test_int_x86_avx512_maskz_pternlog_d_128(<4 x i32> %x0, <4 x i3
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; CHECK-NEXT:    vpternlogd $33, %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0x89,0x25,0xda,0x21]
-; CHECK-NEXT:    vpternlogd $33, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0x75,0x08,0x25,0xc2,0x21]
-; CHECK-NEXT:    vpaddd %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xfe,0xc0]
+; CHECK-NEXT:    vpternlogd $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0x75,0x08,0x25,0xda,0x21]
+; CHECK-NEXT:    vpternlogd $33, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0x89,0x25,0xc2,0x21]
+; CHECK-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 %x4)
   %res1 = call <4 x i32> @llvm.x86.avx512.maskz.pternlog.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i32 33, i8 -1)
@@ -3730,9 +3754,9 @@ define <8 x i32>@test_int_x86_avx512_mask_pternlog_d_256(<8 x i32> %x0, <8 x i32
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; CHECK-NEXT:    vpternlogd $33, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x25,0xda,0x21]
-; CHECK-NEXT:    vpternlogd $33, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x25,0xc2,0x21]
-; CHECK-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; CHECK-NEXT:    vpternlogd $33, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0x75,0x28,0x25,0xda,0x21]
+; CHECK-NEXT:    vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x25,0xc2,0x21]
+; CHECK-NEXT:    vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 %x4)
   %res1 = call <8 x i32> @llvm.x86.avx512.mask.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 -1)
@@ -3747,9 +3771,9 @@ define <8 x i32>@test_int_x86_avx512_maskz_pternlog_d_256(<8 x i32> %x0, <8 x i3
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; CHECK-NEXT:    vpternlogd $33, %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x25,0xda,0x21]
-; CHECK-NEXT:    vpternlogd $33, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0x75,0x28,0x25,0xc2,0x21]
-; CHECK-NEXT:    vpaddd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xfe,0xc0]
+; CHECK-NEXT:    vpternlogd $33, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0x75,0x28,0x25,0xda,0x21]
+; CHECK-NEXT:    vpternlogd $33, %ymm2, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x25,0xc2,0x21]
+; CHECK-NEXT:    vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 %x4)
   %res1 = call <8 x i32> @llvm.x86.avx512.maskz.pternlog.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i32 33, i8 -1)
@@ -3764,9 +3788,9 @@ define <2 x i64>@test_int_x86_avx512_mask_pternlog_q_128(<2 x i64> %x0, <2 x i64
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; CHECK-NEXT:    vpternlogq $33, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x25,0xda,0x21]
-; CHECK-NEXT:    vpternlogq $33, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x25,0xc2,0x21]
-; CHECK-NEXT:    vpaddq %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xd4,0xc0]
+; CHECK-NEXT:    vpternlogq $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0xf5,0x08,0x25,0xda,0x21]
+; CHECK-NEXT:    vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x25,0xc2,0x21]
+; CHECK-NEXT:    vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 %x4)
   %res1 = call <2 x i64> @llvm.x86.avx512.mask.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 -1)
@@ -3781,9 +3805,9 @@ define <2 x i64>@test_int_x86_avx512_maskz_pternlog_q_128(<2 x i64> %x0, <2 x i6
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xd8]
-; CHECK-NEXT:    vpternlogq $33, %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x25,0xda,0x21]
-; CHECK-NEXT:    vpternlogq $33, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x25,0xc2,0x21]
-; CHECK-NEXT:    vpaddq %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0xd4,0xc0]
+; CHECK-NEXT:    vpternlogq $33, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0xf5,0x08,0x25,0xda,0x21]
+; CHECK-NEXT:    vpternlogq $33, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x25,0xc2,0x21]
+; CHECK-NEXT:    vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 %x4)
   %res1 = call <2 x i64> @llvm.x86.avx512.maskz.pternlog.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i32 33, i8 -1)
@@ -3798,9 +3822,9 @@ define <4 x i64>@test_int_x86_avx512_mask_pternlog_q_256(<4 x i64> %x0, <4 x i64
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; CHECK-NEXT:    vpternlogq $33, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x25,0xda,0x21]
-; CHECK-NEXT:    vpternlogq $33, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x25,0xc2,0x21]
-; CHECK-NEXT:    vpaddq %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xd4,0xc0]
+; CHECK-NEXT:    vpternlogq $33, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0xf5,0x28,0x25,0xda,0x21]
+; CHECK-NEXT:    vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x25,0xc2,0x21]
+; CHECK-NEXT:    vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 %x4)
   %res1 = call <4 x i64> @llvm.x86.avx512.mask.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 -1)
@@ -3815,9 +3839,9 @@ define <4 x i64>@test_int_x86_avx512_maskz_pternlog_q_256(<4 x i64> %x0, <4 x i6
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovdqa %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xd8]
-; CHECK-NEXT:    vpternlogq $33, %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x25,0xda,0x21]
-; CHECK-NEXT:    vpternlogq $33, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x25,0xc2,0x21]
-; CHECK-NEXT:    vpaddq %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0xd4,0xc0]
+; CHECK-NEXT:    vpternlogq $33, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0xf5,0x28,0x25,0xda,0x21]
+; CHECK-NEXT:    vpternlogq $33, %ymm2, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x25,0xc2,0x21]
+; CHECK-NEXT:    vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 %x4)
   %res1 = call <4 x i64> @llvm.x86.avx512.maskz.pternlog.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i32 33, i8 -1)
@@ -4080,7 +4104,7 @@ define <4 x double> @test_rsqrt_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vrsqrt14pd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x4e,0xc8]
-; CHECK-NEXT:    vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x double> @llvm.x86.avx512.rsqrt14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask)
   ret <4 x double> %res
@@ -4110,7 +4134,7 @@ define <2 x double> @test_rsqrt_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vrsqrt14pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x4e,0xc8]
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <2 x double> @llvm.x86.avx512.rsqrt14.pd.128(<2 x double> %a0, <2 x double> %a1, i8 %mask)
   ret <2 x double> %res
@@ -4143,7 +4167,7 @@ define <4 x double> @test_rcp_pd_256_rrk(<4 x double> %a0, <4 x double> %a1, i8
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vrcp14pd %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x4c,0xc8]
-; CHECK-NEXT:    vmovaps %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xc1]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x double> @llvm.x86.avx512.rcp14.pd.256(<4 x double> %a0, <4 x double> %a1, i8 %mask)
   ret <4 x double> %res
@@ -4173,7 +4197,7 @@ define <2 x double> @test_rcp_pd_128_rrk(<2 x double> %a0, <2 x double> %a1, i8
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vrcp14pd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x4c,0xc8]
-; CHECK-NEXT:    vmovaps %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xc1]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xc1]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <2 x double> @llvm.x86.avx512.rcp14.pd.128(<2 x double> %a0, <2 x double> %a1, i8 %mask)
   ret <2 x double> %res
@@ -4262,11 +4286,11 @@ define <4 x i32>@test_int_x86_avx512_mask_prorv_d_128(<4 x i32> %x0, <4 x i32> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vprorvd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x14,0xd9]
 ; CHECK-NEXT:    vprorvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x14,0xd1]
-; CHECK-NEXT:    vprorvd %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x14,0xd9]
-; CHECK-NEXT:    vprorvd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x14,0xc1]
-; CHECK-NEXT:    vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb]
-; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; CHECK-NEXT:    vprorvd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x14,0xc1]
+; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
+; CHECK-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
   %res1 = call <4 x i32> @llvm.x86.avx512.mask.prorv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
@@ -4282,11 +4306,11 @@ define <8 x i32>@test_int_x86_avx512_mask_prorv_d_256(<8 x i32> %x0, <8 x i32> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_d_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vprorvd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x14,0xd9]
 ; CHECK-NEXT:    vprorvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x14,0xd1]
-; CHECK-NEXT:    vprorvd %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x14,0xd9]
-; CHECK-NEXT:    vprorvd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x14,0xc1]
-; CHECK-NEXT:    vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb]
-; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
+; CHECK-NEXT:    vprorvd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x14,0xc1]
+; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
+; CHECK-NEXT:    vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
   %res1 = call <8 x i32> @llvm.x86.avx512.mask.prorv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
@@ -4302,11 +4326,11 @@ define <2 x i64>@test_int_x86_avx512_mask_prorv_q_128(<2 x i64> %x0, <2 x i64> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vprorvq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x14,0xd9]
 ; CHECK-NEXT:    vprorvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x14,0xd1]
-; CHECK-NEXT:    vprorvq %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x14,0xd9]
-; CHECK-NEXT:    vprorvq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x14,0xc1]
-; CHECK-NEXT:    vpaddq %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xcb]
-; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
+; CHECK-NEXT:    vprorvq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x14,0xc1]
+; CHECK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0]
+; CHECK-NEXT:    vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
   %res1 = call <2 x i64> @llvm.x86.avx512.mask.prorv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
@@ -4322,11 +4346,11 @@ define <4 x i64>@test_int_x86_avx512_mask_prorv_q_256(<4 x i64> %x0, <4 x i64> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_prorv_q_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vprorvq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x14,0xd9]
 ; CHECK-NEXT:    vprorvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x14,0xd1]
-; CHECK-NEXT:    vprorvq %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x14,0xd9]
-; CHECK-NEXT:    vprorvq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x14,0xc1]
-; CHECK-NEXT:    vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
-; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
+; CHECK-NEXT:    vprorvq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x14,0xc1]
+; CHECK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
+; CHECK-NEXT:    vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
   %res1 = call <4 x i64> @llvm.x86.avx512.mask.prorv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
@@ -4422,11 +4446,11 @@ define <4 x i32>@test_int_x86_avx512_mask_prolv_d_128(<4 x i32> %x0, <4 x i32> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_d_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vprolvd %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0x7d,0x08,0x15,0xd9]
 ; CHECK-NEXT:    vprolvd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x15,0xd1]
-; CHECK-NEXT:    vprolvd %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x15,0xd9]
-; CHECK-NEXT:    vprolvd %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0x7d,0x08,0x15,0xc1]
-; CHECK-NEXT:    vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb]
-; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
+; CHECK-NEXT:    vprolvd %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x15,0xc1]
+; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
+; CHECK-NEXT:    vpaddd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3)
   %res1 = call <4 x i32> @llvm.x86.avx512.mask.prolv.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> zeroinitializer, i8 %x3)
@@ -4442,11 +4466,11 @@ define <8 x i32>@test_int_x86_avx512_mask_prolv_d_256(<8 x i32> %x0, <8 x i32> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_d_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vprolvd %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0x7d,0x28,0x15,0xd9]
 ; CHECK-NEXT:    vprolvd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x15,0xd1]
-; CHECK-NEXT:    vprolvd %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x15,0xd9]
-; CHECK-NEXT:    vprolvd %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0x7d,0x28,0x15,0xc1]
-; CHECK-NEXT:    vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb]
-; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
+; CHECK-NEXT:    vprolvd %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x15,0xc1]
+; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
+; CHECK-NEXT:    vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
   %res1 = call <8 x i32> @llvm.x86.avx512.mask.prolv.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
@@ -4462,11 +4486,11 @@ define <2 x i64>@test_int_x86_avx512_mask_prolv_q_128(<2 x i64> %x0, <2 x i64> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_q_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vprolvq %xmm1, %xmm0, %xmm3 ## encoding: [0x62,0xf2,0xfd,0x08,0x15,0xd9]
 ; CHECK-NEXT:    vprolvq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x15,0xd1]
-; CHECK-NEXT:    vprolvq %xmm1, %xmm0, %xmm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x15,0xd9]
-; CHECK-NEXT:    vprolvq %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf2,0xfd,0x08,0x15,0xc1]
-; CHECK-NEXT:    vpaddq %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xcb]
-; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
+; CHECK-NEXT:    vprolvq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x15,0xc1]
+; CHECK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0]
+; CHECK-NEXT:    vpaddq %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3)
   %res1 = call <2 x i64> @llvm.x86.avx512.mask.prolv.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> zeroinitializer, i8 %x3)
@@ -4482,11 +4506,11 @@ define <4 x i64>@test_int_x86_avx512_mask_prolv_q_256(<4 x i64> %x0, <4 x i64> %
 ; CHECK-LABEL: test_int_x86_avx512_mask_prolv_q_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vprolvq %ymm1, %ymm0, %ymm3 ## encoding: [0x62,0xf2,0xfd,0x28,0x15,0xd9]
 ; CHECK-NEXT:    vprolvq %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x15,0xd1]
-; CHECK-NEXT:    vprolvq %ymm1, %ymm0, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x15,0xd9]
-; CHECK-NEXT:    vprolvq %ymm1, %ymm0, %ymm0 ## encoding: [0x62,0xf2,0xfd,0x28,0x15,0xc1]
-; CHECK-NEXT:    vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
-; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
+; CHECK-NEXT:    vprolvq %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x15,0xc1]
+; CHECK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
+; CHECK-NEXT:    vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
   %res1 = call <4 x i64> @llvm.x86.avx512.mask.prolv.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
@@ -4582,11 +4606,11 @@ define <4 x double>@test_int_x86_avx512_mask_permvar_df_256(<4 x double> %x0, <4
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_df_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpermpd %ymm0, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0xf5,0x28,0x16,0xd8]
 ; CHECK-NEXT:    vpermpd %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0x16,0xd0]
-; CHECK-NEXT:    vpermpd %ymm0, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xd8]
-; CHECK-NEXT:    vpermpd %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0x16,0xc0]
-; CHECK-NEXT:    vaddpd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xcb]
-; CHECK-NEXT:    vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0]
+; CHECK-NEXT:    vpermpd %ymm0, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xa9,0x16,0xc0]
+; CHECK-NEXT:    vaddpd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc0]
+; CHECK-NEXT:    vaddpd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> %x2, i8 %x3)
   %res1 = call <4 x double> @llvm.x86.avx512.mask.permvar.df.256(<4 x double> %x0, <4 x i64> %x1, <4 x double> zeroinitializer, i8 %x3)
@@ -4602,11 +4626,11 @@ define <4 x i64>@test_int_x86_avx512_mask_permvar_di_256(<4 x i64> %x0, <4 x i64
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_di_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpermq %ymm0, %ymm1, %ymm3 ## encoding: [0x62,0xf2,0xf5,0x28,0x36,0xd8]
 ; CHECK-NEXT:    vpermq %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0x36,0xd0]
-; CHECK-NEXT:    vpermq %ymm0, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xd8]
-; CHECK-NEXT:    vpermq %ymm0, %ymm1, %ymm0 ## encoding: [0x62,0xf2,0xf5,0x28,0x36,0xc0]
-; CHECK-NEXT:    vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
-; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
+; CHECK-NEXT:    vpermq %ymm0, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xf5,0xa9,0x36,0xc0]
+; CHECK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
+; CHECK-NEXT:    vpaddq %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 %x3)
   %res1 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> zeroinitializer, i8 %x3)
@@ -4622,11 +4646,11 @@ define <8 x float>@test_int_x86_avx512_mask_permvar_sf_256(<8 x float> %x0, <8 x
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_sf_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xd8]
 ; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0x16,0xd0]
-; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0x16,0xd8]
-; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x16,0xc0]
-; CHECK-NEXT:    vaddps %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xcb]
-; CHECK-NEXT:    vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0]
+; CHECK-NEXT:    vpermps %ymm0, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0x16,0xc0]
+; CHECK-NEXT:    vaddps %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc0]
+; CHECK-NEXT:    vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> %x2, i8 %x3)
   %res1 = call <8 x float> @llvm.x86.avx512.mask.permvar.sf.256(<8 x float> %x0, <8 x i32> %x1, <8 x float> zeroinitializer, i8 %x3)
@@ -4642,11 +4666,11 @@ define <8 x i32>@test_int_x86_avx512_mask_permvar_si_256(<8 x i32> %x0, <8 x i32
 ; CHECK-LABEL: test_int_x86_avx512_mask_permvar_si_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x36,0xd8]
 ; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x75,0x29,0x36,0xd0]
-; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0x36,0xd8]
-; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0x36,0xc0]
-; CHECK-NEXT:    vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb]
-; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
+; CHECK-NEXT:    vpermd %ymm0, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x75,0xa9,0x36,0xc0]
+; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
+; CHECK-NEXT:    vpaddd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 %x3)
   %res1 = call <8 x i32> @llvm.x86.avx512.mask.permvar.si.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> zeroinitializer, i8 %x3)
@@ -4750,13 +4774,13 @@ define <4 x float>@test_int_x86_avx512_mask_fixupimm_ps_128(<4 x float> %x0, <4
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovaps %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8]
-; CHECK-NEXT:    vfixupimmps $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0x75,0x09,0x54,0xda,0x05]
+; CHECK-NEXT:    vfixupimmps $5, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0x75,0x08,0x54,0xda,0x05]
 ; CHECK-NEXT:    vmovaps %xmm0, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xe0]
-; CHECK-NEXT:    vfixupimmps $5, %xmm2, %xmm1, %xmm4 ## encoding: [0x62,0xf3,0x75,0x08,0x54,0xe2,0x05]
+; CHECK-NEXT:    vfixupimmps $5, %xmm2, %xmm1, %xmm4 {%k1} ## encoding: [0x62,0xf3,0x75,0x09,0x54,0xe2,0x05]
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
 ; CHECK-NEXT:    vfixupimmps $5, %xmm2, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x09,0x54,0xc2,0x05]
-; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc0]
-; CHECK-NEXT:    vaddps %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc4]
+; CHECK-NEXT:    vaddps %xmm0, %xmm4, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xd8,0x58,0xc0]
+; CHECK-NEXT:    vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4)
   %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4)
@@ -4773,13 +4797,13 @@ define <4 x float>@test_int_x86_avx512_maskz_fixupimm_ps_128(<4 x float> %x0, <4
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovaps %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd8]
-; CHECK-NEXT:    vfixupimmps $5, %xmm2, %xmm1, %xmm3 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0x89,0x54,0xda,0x05]
+; CHECK-NEXT:    vfixupimmps $5, %xmm2, %xmm1, %xmm3 ## encoding: [0x62,0xf3,0x75,0x08,0x54,0xda,0x05]
 ; CHECK-NEXT:    vmovaps %xmm0, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xe0]
-; CHECK-NEXT:    vfixupimmps $5, %xmm2, %xmm1, %xmm4 ## encoding: [0x62,0xf3,0x75,0x08,0x54,0xe2,0x05]
+; CHECK-NEXT:    vfixupimmps $5, %xmm2, %xmm1, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0x89,0x54,0xe2,0x05]
 ; CHECK-NEXT:    vpxor %xmm2, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xef,0xd2]
 ; CHECK-NEXT:    vfixupimmps $5, %xmm2, %xmm1, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0x89,0x54,0xc2,0x05]
-; CHECK-NEXT:    vaddps %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe0,0x58,0xc0]
-; CHECK-NEXT:    vaddps %xmm4, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc4]
+; CHECK-NEXT:    vaddps %xmm0, %xmm4, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xd8,0x58,0xc0]
+; CHECK-NEXT:    vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4)
   %res1 = call <4 x float> @llvm.x86.avx512.maskz.fixupimm.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4)
@@ -4796,13 +4820,13 @@ define <8 x float>@test_int_x86_avx512_mask_fixupimm_ps_256(<8 x float> %x0, <8
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8]
-; CHECK-NEXT:    vfixupimmps $5, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x54,0xda,0x05]
+; CHECK-NEXT:    vfixupimmps $5, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0x75,0x28,0x54,0xda,0x05]
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xe0]
-; CHECK-NEXT:    vfixupimmps $5, %ymm2, %ymm1, %ymm4 ## encoding: [0x62,0xf3,0x75,0x28,0x54,0xe2,0x05]
+; CHECK-NEXT:    vfixupimmps $5, %ymm2, %ymm1, %ymm4 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x54,0xe2,0x05]
 ; CHECK-NEXT:    vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
 ; CHECK-NEXT:    vfixupimmps $5, %ymm2, %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf3,0x75,0x29,0x54,0xc2,0x05]
-; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc0]
-; CHECK-NEXT:    vaddps %ymm4, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc4]
+; CHECK-NEXT:    vaddps %ymm0, %ymm4, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xdc,0x58,0xc0]
+; CHECK-NEXT:    vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 %x4)
   %res1 = call <8 x float> @llvm.x86.avx512.mask.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> zeroinitializer, i32 5, i8 %x4)
@@ -4819,13 +4843,13 @@ define <8 x float>@test_int_x86_avx512_maskz_fixupimm_ps_256(<8 x float> %x0, <8
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd8]
-; CHECK-NEXT:    vfixupimmps $5, %ymm2, %ymm1, %ymm3 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x54,0xda,0x05]
+; CHECK-NEXT:    vfixupimmps $5, %ymm2, %ymm1, %ymm3 ## encoding: [0x62,0xf3,0x75,0x28,0x54,0xda,0x05]
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xe0]
-; CHECK-NEXT:    vfixupimmps $5, %ymm2, %ymm1, %ymm4 ## encoding: [0x62,0xf3,0x75,0x28,0x54,0xe2,0x05]
+; CHECK-NEXT:    vfixupimmps $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x54,0xe2,0x05]
 ; CHECK-NEXT:    vpxor %ymm2, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xef,0xd2]
 ; CHECK-NEXT:    vfixupimmps $5, %ymm2, %ymm1, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x75,0xa9,0x54,0xc2,0x05]
-; CHECK-NEXT:    vaddps %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe4,0x58,0xc0]
-; CHECK-NEXT:    vaddps %ymm4, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc4]
+; CHECK-NEXT:    vaddps %ymm0, %ymm4, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xdc,0x58,0xc0]
+; CHECK-NEXT:    vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3]
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> %x2, i32 5, i8 %x4)
   %res1 = call <8 x float> @llvm.x86.avx512.maskz.fixupimm.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x i32> zeroinitializer, i32 5, i8 %x4)
@@ -4846,6 +4870,7 @@ define i8@test_int_x86_avx512_ptestm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
 ; CHECK-NEXT:    vptestmd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x08,0x27,0xc1]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
   %res1 = call i8 @llvm.x86.avx512.ptestm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8-1)
@@ -4859,11 +4884,12 @@ define i8@test_int_x86_avx512_ptestm_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
 ; CHECK-LABEL: test_int_x86_avx512_ptestm_d_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vptestmd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x27,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
 ; CHECK-NEXT:    vptestmd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7d,0x28,0x27,0xc1]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    vptestmd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x27,0xc1]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
   %res1 = call i8 @llvm.x86.avx512.ptestm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8-1)
@@ -4882,6 +4908,7 @@ define i8@test_int_x86_avx512_ptestm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
 ; CHECK-NEXT:    vptestmq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x08,0x27,0xc1]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
   %res1 = call i8 @llvm.x86.avx512.ptestm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8-1)
@@ -4900,6 +4927,7 @@ define i8@test_int_x86_avx512_ptestm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
 ; CHECK-NEXT:    vptestmq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfd,0x28,0x27,0xc1]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
   %res1 = call i8 @llvm.x86.avx512.ptestm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8-1)
@@ -4918,6 +4946,7 @@ define i8@test_int_x86_avx512_ptestnm_d_128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2
 ; CHECK-NEXT:    vptestnmd %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x08,0x27,0xc1]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
   %res1 = call i8 @llvm.x86.avx512.ptestnm.d.128(<4 x i32> %x0, <4 x i32> %x1, i8-1)
@@ -4931,11 +4960,12 @@ define i8@test_int_x86_avx512_ptestnm_d_256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2
 ; CHECK-LABEL: test_int_x86_avx512_ptestnm_d_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT:    vptestnmd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x27,0xc1]
-; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
 ; CHECK-NEXT:    vptestnmd %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0x7e,0x28,0x27,0xc1]
+; CHECK-NEXT:    kmovw %k0, %ecx ## encoding: [0xc5,0xf8,0x93,0xc8]
+; CHECK-NEXT:    vptestnmd %ymm1, %ymm0, %k0 {%k1} ## encoding: [0x62,0xf2,0x7e,0x29,0x27,0xc1]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8 %x2)
   %res1 = call i8 @llvm.x86.avx512.ptestnm.d.256(<8 x i32> %x0, <8 x i32> %x1, i8-1)
@@ -4954,6 +4984,7 @@ define i8@test_int_x86_avx512_ptestnm_q_128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2
 ; CHECK-NEXT:    vptestnmq %xmm1, %xmm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x08,0x27,0xc1]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8 %x2)
   %res1 = call i8 @llvm.x86.avx512.ptestnm.q.128(<2 x i64> %x0, <2 x i64> %x1, i8-1)
@@ -4972,6 +5003,7 @@ define i8@test_int_x86_avx512_ptestnm_q_256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2
 ; CHECK-NEXT:    vptestnmq %ymm1, %ymm0, %k0 ## encoding: [0x62,0xf2,0xfe,0x28,0x27,0xc1]
 ; CHECK-NEXT:    kmovw %k0, %eax ## encoding: [0xc5,0xf8,0x93,0xc0]
 ; CHECK-NEXT:    addb %cl, %al ## encoding: [0x00,0xc8]
+; CHECK-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
 ; CHECK-NEXT:    retq ## encoding: [0xc3]
   %res = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8 %x2)
   %res1 = call i8 @llvm.x86.avx512.ptestnm.q.256(<4 x i64> %x0, <4 x i64> %x1, i8-1)
@@ -4985,8 +5017,8 @@ define <8 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_256(i32 %x0, <8 x i32
 ; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vpbroadcastd %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7c,0xc7]
 ; CHECK-NEXT:    vpbroadcastd %edi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x7c,0xcf]
+; CHECK-NEXT:    vpbroadcastd %edi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x7c,0xc7]
 ; CHECK-NEXT:    vpbroadcastd %edi, %ymm2 ## encoding: [0x62,0xf2,0x7d,0x28,0x7c,0xd7]
 ; CHECK-NEXT:    vpaddd %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xc0]
 ; CHECK-NEXT:    vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0]
@@ -5005,8 +5037,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pbroadcast_d_gpr_128(i32 %x0, <4 x i32
 ; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_d_gpr_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vpbroadcastd %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7c,0xc7]
 ; CHECK-NEXT:    vpbroadcastd %edi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x7c,0xcf]
+; CHECK-NEXT:    vpbroadcastd %edi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x7c,0xc7]
 ; CHECK-NEXT:    vpbroadcastd %edi, %xmm2 ## encoding: [0x62,0xf2,0x7d,0x08,0x7c,0xd7]
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0]
 ; CHECK-NEXT:    vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
@@ -5025,8 +5057,8 @@ define <4 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_256(i64 %x0, <4 x i64
 ; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_256:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vpbroadcastq %rdi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7c,0xc7]
 ; CHECK-NEXT:    vpbroadcastq %rdi, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x7c,0xcf]
+; CHECK-NEXT:    vpbroadcastq %rdi, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x7c,0xc7]
 ; CHECK-NEXT:    vpbroadcastq %rdi, %ymm2 ## encoding: [0x62,0xf2,0xfd,0x28,0x7c,0xd7]
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
 ; CHECK-NEXT:    vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
@@ -5045,8 +5077,8 @@ define <2 x i64>@test_int_x86_avx512_mask_pbroadcast_q_gpr_128(i64 %x0, <2 x i64
 ; CHECK-LABEL: test_int_x86_avx512_mask_pbroadcast_q_gpr_128:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
-; CHECK-NEXT:    vpbroadcastq %rdi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7c,0xc7]
 ; CHECK-NEXT:    vpbroadcastq %rdi, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x7c,0xcf]
+; CHECK-NEXT:    vpbroadcastq %rdi, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x7c,0xc7]
 ; CHECK-NEXT:    vpbroadcastq %rdi, %xmm2 ## encoding: [0x62,0xf2,0xfd,0x08,0x7c,0xd7]
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0]
 ; CHECK-NEXT:    vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0]
@@ -5278,3 +5310,1115 @@ define <4 x i64> @test_x86_avx512_maskz_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1
 }
 
 declare <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+declare <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
+define <8 x float> @test_mask_vfmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfmadd256_ps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vfmadd132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x98,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
+  ret <8 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_mask_vfmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfmadd128_ps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+  ret <4 x float> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+define <4 x double> @test_mask_fmadd256_pd(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask) {
+; CHECK-LABEL: test_mask_fmadd256_pd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a, <4 x double> %b, <4 x double> %c, i8 %mask)
+  ret <4 x double> %res
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
+define <2 x double> @test_mask_fmadd128_pd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
+; CHECK-LABEL: test_mask_fmadd128_pd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask)
+  ret <2 x double> %res
+}
+
+define <2 x double>@test_int_x86_avx512_mask_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
+; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa8,0xda]
+; CHECK-NEXT:    vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1]
+; CHECK-NEXT:    vaddpd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+  %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+  %res2 = fadd <2 x double> %res, %res1
+  ret <2 x double> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
+; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa8,0xda]
+; CHECK-NEXT:    vfmadd231pd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb8,0xd1]
+; CHECK-NEXT:    vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+  %res2 = fadd <2 x double> %res, %res1
+  ret <2 x double> %res2
+}
+
+declare <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_maskz_vfmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
+; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa8,0xda]
+; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0xa8,0xca]
+; CHECK-NEXT:    vaddpd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+  %res1 = call <2 x double> @llvm.x86.avx512.maskz.vfmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+  %res2 = fadd <2 x double> %res, %res1
+  ret <2 x double> %res2
+}
+
+define <4 x double>@test_int_x86_avx512_mask_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_pd_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
+; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa8,0xda]
+; CHECK-NEXT:    vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1]
+; CHECK-NEXT:    vaddpd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+  %res1 = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+  %res2 = fadd <4 x double> %res, %res1
+  ret <4 x double> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask3_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_pd_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
+; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa8,0xda]
+; CHECK-NEXT:    vfmadd231pd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xb8,0xd1]
+; CHECK-NEXT:    vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+  %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+  %res2 = fadd <4 x double> %res, %res1
+  ret <4 x double> %res2
+}
+
+declare <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_maskz_vfmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_pd_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
+; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa8,0xda]
+; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0xa8,0xca]
+; CHECK-NEXT:    vaddpd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+  %res1 = call <4 x double> @llvm.x86.avx512.maskz.vfmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+  %res2 = fadd <4 x double> %res, %res1
+  ret <4 x double> %res2
+}
+
+define <4 x float>@test_int_x86_avx512_mask_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
+; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa8,0xda]
+; CHECK-NEXT:    vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
+; CHECK-NEXT:    vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+  %res2 = fadd <4 x float> %res, %res1
+  ret <4 x float> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
+; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa8,0xda]
+; CHECK-NEXT:    vfmadd231ps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb8,0xd1]
+; CHECK-NEXT:    vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+  %res2 = fadd <4 x float> %res, %res1
+  ret <4 x float> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_maskz_vfmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
+; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa8,0xda]
+; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0xa8,0xca]
+; CHECK-NEXT:    vaddps %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+  %res1 = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+  %res2 = fadd <4 x float> %res, %res1
+  ret <4 x float> %res2
+}
+
+define <8 x float>@test_int_x86_avx512_mask_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmadd_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
+; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa8,0xda]
+; CHECK-NEXT:    vfmadd132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x98,0xc1]
+; CHECK-NEXT:    vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+  %res1 = call <8 x float> @llvm.x86.avx512.mask.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+  %res2 = fadd <8 x float> %res, %res1
+  ret <8 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask3_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmadd_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
+; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa8,0xda]
+; CHECK-NEXT:    vfmadd231ps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xb8,0xd1]
+; CHECK-NEXT:    vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+  %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+  %res2 = fadd <8 x float> %res, %res1
+  ret <8 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_maskz_vfmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmadd_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
+; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa8,0xda]
+; CHECK-NEXT:    vfmadd213ps %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0xa8,0xca]
+; CHECK-NEXT:    vaddps %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+  %res1 = call <8 x float> @llvm.x86.avx512.maskz.vfmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+  %res2 = fadd <8 x float> %res, %res1
+  ret <8 x float> %res2
+}
+
+
+declare <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+
+define <2 x double>@test_int_x86_avx512_mask3_vfmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
+; CHECK-NEXT:    vfmsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaa,0xda]
+; CHECK-NEXT:    vfmsub231pd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xba,0xd1]
+; CHECK-NEXT:    vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3)
+  %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1)
+  %res2 = fadd <2 x double> %res, %res1
+  ret <2 x double> %res2
+}
+
+
+declare <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+
+define <4 x double>@test_int_x86_avx512_mask3_vfmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_pd_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
+; CHECK-NEXT:    vfmsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xaa,0xda]
+; CHECK-NEXT:    vfmsub231pd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xba,0xd1]
+; CHECK-NEXT:    vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3)
+  %res1 = call <4 x double> @llvm.x86.avx512.mask3.vfmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1)
+  %res2 = fadd <4 x double> %res, %res1
+  ret <4 x double> %res2
+}
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+
+define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
+; CHECK-NEXT:    vfmsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaa,0xda]
+; CHECK-NEXT:    vfmsub231ps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xba,0xd1]
+; CHECK-NEXT:    vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1)
+  %res2 = fadd <4 x float> %res, %res1
+  ret <4 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask3_vfmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsub_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
+; CHECK-NEXT:    vfmsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xaa,0xda]
+; CHECK-NEXT:    vfmsub231ps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xba,0xd1]
+; CHECK-NEXT:    vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+  %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+  %res2 = fadd <8 x float> %res, %res1
+  ret <8 x float> %res2
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
+define <8 x float> @test_mask_vfnmadd256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfnmadd256_ps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vfnmadd132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x9c,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
+  ret <8 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_mask_vfnmadd128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfnmadd128_ps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vfnmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x9c,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+  ret <4 x float> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x double> @test_mask_vfnmadd256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfnmadd256_pd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vfnmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x9c,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+  ret <4 x double> %res
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+
+define <2 x double> @test_mask_vfnmadd128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfnmadd128_pd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vfnmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x9c,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+  ret <2 x double> %res
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+
+define <8 x float> @test_mask_vfnmsub256_ps(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfnmsub256_ps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vfnmsub132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x9e,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2, i8 %mask) nounwind
+  ret <8 x float> %res
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+
+define <4 x float> @test_mask_vfnmsub128_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfnmsub128_ps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vfnmsub132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x9e,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+  ret <4 x float> %res
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x double> @test_mask_vfnmsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfnmsub256_pd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vfnmsub132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x9e,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+  ret <4 x double> %res
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+; One call using the incoming i8 mask; expected asm is a single vfnmsub132pd (xmm) under {%k1}.
+define <2 x double> @test_mask_vfnmsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfnmsub128_pd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vfnmsub132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x9e,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+  ret <2 x double> %masked
+}
+
+
+define <2 x double> @test_int_x86_avx512_mask_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
+; CHECK-NEXT:    vfnmsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xae,0xda]
+; CHECK-NEXT:    vfnmsub132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x9e,0xc1]
+; CHECK-NEXT:    vaddpd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <2 x double> @llvm.x86.avx512.mask.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <2 x double> %masked, %unmasked ; returned value uses both call results
+  ret <2 x double> %sum
+}
+
+declare <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+; Calls the mask3 intrinsic twice (mask %x3, then -1) and returns the fadd of both results.
+define <2 x double> @test_int_x86_avx512_mask3_vfnmsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
+; CHECK-NEXT:    vfnmsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xae,0xda]
+; CHECK-NEXT:    vfnmsub231pd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xbe,0xd1]
+; CHECK-NEXT:    vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <2 x double> %masked, %unmasked
+  ret <2 x double> %sum
+}
+
+define <4 x double> @test_int_x86_avx512_mask_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_pd_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
+; CHECK-NEXT:    vfnmsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xae,0xda]
+; CHECK-NEXT:    vfnmsub132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x9e,0xc1]
+; CHECK-NEXT:    vaddpd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <4 x double> @llvm.x86.avx512.mask.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <4 x double> %masked, %unmasked ; returned value uses both call results
+  ret <4 x double> %sum
+}
+
+declare <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+; Calls the mask3 intrinsic twice (mask %x3, then -1) and returns the fadd of both results.
+define <4 x double> @test_int_x86_avx512_mask3_vfnmsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_pd_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
+; CHECK-NEXT:    vfnmsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xae,0xda]
+; CHECK-NEXT:    vfnmsub231pd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xbe,0xd1]
+; CHECK-NEXT:    vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <4 x double> @llvm.x86.avx512.mask3.vfnmsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <4 x double> %masked, %unmasked
+  ret <4 x double> %sum
+}
+
+define <4 x float> @test_int_x86_avx512_mask_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
+; CHECK-NEXT:    vfnmsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xae,0xda]
+; CHECK-NEXT:    vfnmsub132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x9e,0xc1]
+; CHECK-NEXT:    vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <4 x float> @llvm.x86.avx512.mask.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <4 x float> %masked, %unmasked ; returned value uses both call results
+  ret <4 x float> %sum
+}
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+; Calls the mask3 intrinsic twice (mask %x3, then -1) and returns the fadd of both results.
+define <4 x float> @test_int_x86_avx512_mask3_vfnmsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
+; CHECK-NEXT:    vfnmsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xae,0xda]
+; CHECK-NEXT:    vfnmsub231ps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xbe,0xd1]
+; CHECK-NEXT:    vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <4 x float> %masked, %unmasked
+  ret <4 x float> %sum
+}
+
+define <8 x float> @test_int_x86_avx512_mask_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmsub_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
+; CHECK-NEXT:    vfnmsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xae,0xda]
+; CHECK-NEXT:    vfnmsub132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x9e,0xc1]
+; CHECK-NEXT:    vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <8 x float> @llvm.x86.avx512.mask.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <8 x float> %masked, %unmasked ; returned value uses both call results
+  ret <8 x float> %sum
+}
+
+declare <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+; Calls the mask3 intrinsic twice (mask %x3, then -1) and returns the fadd of both results.
+define <8 x float> @test_int_x86_avx512_mask3_vfnmsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfnmsub_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
+; CHECK-NEXT:    vfnmsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xae,0xda]
+; CHECK-NEXT:    vfnmsub231ps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xbe,0xd1]
+; CHECK-NEXT:    vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <8 x float> @llvm.x86.avx512.mask3.vfnmsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <8 x float> %masked, %unmasked
+  ret <8 x float> %sum
+}
+
+define <2 x double> @test_int_x86_avx512_mask_vfnmadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
+; CHECK-NEXT:    vfnmadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xac,0xda]
+; CHECK-NEXT:    vfnmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x9c,0xc1]
+; CHECK-NEXT:    vaddpd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <2 x double> @llvm.x86.avx512.mask.vfnmadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <2 x double> %masked, %unmasked ; returned value uses both call results
+  ret <2 x double> %sum
+}
+
+define <4 x double> @test_int_x86_avx512_mask_vfnmadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_pd_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
+; CHECK-NEXT:    vfnmadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xac,0xda]
+; CHECK-NEXT:    vfnmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x9c,0xc1]
+; CHECK-NEXT:    vaddpd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <4 x double> @llvm.x86.avx512.mask.vfnmadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <4 x double> %masked, %unmasked ; returned value uses both call results
+  ret <4 x double> %sum
+}
+
+define <4 x float> @test_int_x86_avx512_mask_vfnmadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
+; CHECK-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xac,0xda]
+; CHECK-NEXT:    vfnmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x9c,0xc1]
+; CHECK-NEXT:    vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <4 x float> @llvm.x86.avx512.mask.vfnmadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <4 x float> %masked, %unmasked ; returned value uses both call results
+  ret <4 x float> %sum
+}
+
+define <8 x float> @test_int_x86_avx512_mask_vfnmadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfnmadd_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
+; CHECK-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xac,0xda]
+; CHECK-NEXT:    vfnmadd132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x9c,0xc1]
+; CHECK-NEXT:    vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <8 x float> @llvm.x86.avx512.mask.vfnmadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <8 x float> %masked, %unmasked ; returned value uses both call results
+  ret <8 x float> %sum
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8) nounwind readnone
+; One call using the incoming i8 mask; expected asm is a single vfmaddsub132ps (ymm) under {%k1}.
+define <8 x float> @test_mask_fmaddsub256_ps(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask) {
+; CHECK-LABEL: test_mask_fmaddsub256_ps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vfmaddsub132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x96,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %a, <8 x float> %b, <8 x float> %c, i8 %mask)
+  ret <8 x float> %masked
+}
+
+declare <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
+; One call using the incoming i8 mask; expected asm is a single vfmaddsub132ps (xmm) under {%k1}.
+define <4 x float> @test_mask_fmaddsub128_ps(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
+; CHECK-LABEL: test_mask_fmaddsub128_ps:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vfmaddsub132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x96,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask)
+  ret <4 x float> %masked
+}
+
+declare <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8) nounwind readnone
+; One call using the incoming i8 mask; expected asm is a single vfmaddsub132pd (ymm) under {%k1}.
+define <4 x double> @test_mask_vfmaddsub256_pd(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfmaddsub256_pd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vfmaddsub132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x96,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+  ret <4 x double> %masked
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8) nounwind readnone
+; One call using the incoming i8 mask; expected asm is a single vfmaddsub132pd (xmm) under {%k1}.
+define <2 x double> @test_mask_vfmaddsub128_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfmaddsub128_pd:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vfmaddsub132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x96,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+  ret <2 x double> %masked
+}
+
+define <2 x double> @test_int_x86_avx512_mask_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
+; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa6,0xda]
+; CHECK-NEXT:    vfmaddsub132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x96,0xc1]
+; CHECK-NEXT:    vaddpd %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <2 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <2 x double> %masked, %unmasked ; returned value uses both call results
+  ret <2 x double> %sum
+}
+
+declare <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+; Calls the mask3 intrinsic twice (mask %x3, then -1) and returns the fadd of both results.
+define <2 x double> @test_int_x86_avx512_mask3_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
+; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa6,0xda]
+; CHECK-NEXT:    vfmaddsub231pd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb6,0xd1]
+; CHECK-NEXT:    vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <2 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <2 x double> %masked, %unmasked
+  ret <2 x double> %sum
+}
+
+declare <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+; Calls the maskz intrinsic twice (mask %x3, then -1) and returns the fadd of both results.
+define <2 x double> @test_int_x86_avx512_maskz_vfmaddsub_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
+; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa6,0xda]
+; CHECK-NEXT:    vfmaddsub213pd %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0xa6,0xca]
+; CHECK-NEXT:    vaddpd %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <2 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <2 x double> %masked, %unmasked
+  ret <2 x double> %sum
+}
+
+define <4 x double> @test_int_x86_avx512_mask_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_pd_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
+; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa6,0xda]
+; CHECK-NEXT:    vfmaddsub132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x96,0xc1]
+; CHECK-NEXT:    vaddpd %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <4 x double> @llvm.x86.avx512.mask.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <4 x double> %masked, %unmasked ; returned value uses both call results
+  ret <4 x double> %sum
+}
+
+declare <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+; Calls the mask3 intrinsic twice (mask %x3, then -1) and returns the fadd of both results.
+define <4 x double> @test_int_x86_avx512_mask3_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_pd_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
+; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa6,0xda]
+; CHECK-NEXT:    vfmaddsub231pd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xb6,0xd1]
+; CHECK-NEXT:    vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <4 x double> @llvm.x86.avx512.mask3.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <4 x double> %masked, %unmasked
+  ret <4 x double> %sum
+}
+
+declare <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+; Calls the maskz intrinsic twice (mask %x3, then -1) and returns the fadd of both results.
+define <4 x double> @test_int_x86_avx512_maskz_vfmaddsub_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_pd_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
+; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa6,0xda]
+; CHECK-NEXT:    vfmaddsub213pd %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0xa6,0xca]
+; CHECK-NEXT:    vaddpd %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <4 x double> @llvm.x86.avx512.maskz.vfmaddsub.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <4 x double> %masked, %unmasked
+  ret <4 x double> %sum
+}
+
+define <4 x float> @test_int_x86_avx512_mask_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
+; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa6,0xda]
+; CHECK-NEXT:    vfmaddsub132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x96,0xc1]
+; CHECK-NEXT:    vaddps %xmm3, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <4 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <4 x float> %masked, %unmasked ; returned value uses both call results
+  ret <4 x float> %sum
+}
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+; Calls the mask3 intrinsic twice (mask %x3, then -1) and returns the fadd of both results.
+define <4 x float> @test_int_x86_avx512_mask3_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
+; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa6,0xda]
+; CHECK-NEXT:    vfmaddsub231ps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb6,0xd1]
+; CHECK-NEXT:    vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <4 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <4 x float> %masked, %unmasked
+  ret <4 x float> %sum
+}
+
+declare <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+; Calls the maskz intrinsic twice (mask %x3, then -1) and returns the fadd of both results.
+define <4 x float> @test_int_x86_avx512_maskz_vfmaddsub_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
+; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa6,0xda]
+; CHECK-NEXT:    vfmaddsub213ps %xmm2, %xmm0, %xmm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0xa6,0xca]
+; CHECK-NEXT:    vaddps %xmm3, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <4 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <4 x float> %masked, %unmasked
+  ret <4 x float> %sum
+}
+
+define <8 x float> @test_int_x86_avx512_mask_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask_vfmaddsub_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
+; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa6,0xda]
+; CHECK-NEXT:    vfmaddsub132ps %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x29,0x96,0xc1]
+; CHECK-NEXT:    vaddps %ymm3, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <8 x float> @llvm.x86.avx512.mask.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <8 x float> %masked, %unmasked ; returned value uses both call results
+  ret <8 x float> %sum
+}
+
+declare <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+; Calls the mask3 intrinsic twice (mask %x3, then -1) and returns the fadd of both results.
+define <8 x float> @test_int_x86_avx512_mask3_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmaddsub_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
+; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa6,0xda]
+; CHECK-NEXT:    vfmaddsub231ps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xb6,0xd1]
+; CHECK-NEXT:    vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <8 x float> @llvm.x86.avx512.mask3.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <8 x float> %masked, %unmasked
+  ret <8 x float> %sum
+}
+
+declare <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+; Calls the maskz intrinsic twice (mask %x3, then -1) and returns the fadd of both results.
+define <8 x float> @test_int_x86_avx512_maskz_vfmaddsub_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_maskz_vfmaddsub_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
+; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa6,0xda]
+; CHECK-NEXT:    vfmaddsub213ps %ymm2, %ymm0, %ymm1 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0xa6,0xca]
+; CHECK-NEXT:    vaddps %ymm3, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <8 x float> @llvm.x86.avx512.maskz.vfmaddsub.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <8 x float> %masked, %unmasked
+  ret <8 x float> %sum
+}
+
+declare <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double>, <2 x double>, <2 x double>, i8)
+; Calls the mask3 intrinsic twice (mask %x3, then -1) and returns the fadd of both results.
+define <2 x double> @test_int_x86_avx512_mask3_vfmsubadd_pd_128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0xd9]
+; CHECK-NEXT:    vfmsubadd213pd %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa7,0xda]
+; CHECK-NEXT:    vfmsubadd231pd %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0xb7,0xd1]
+; CHECK-NEXT:    vaddpd %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <2 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.128(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <2 x double> %masked, %unmasked
+  ret <2 x double> %sum
+}
+
+declare <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double>, <4 x double>, <4 x double>, i8)
+; Calls the mask3 intrinsic twice (mask %x3, then -1) and returns the fadd of both results.
+define <4 x double> @test_int_x86_avx512_mask3_vfmsubadd_pd_256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_pd_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovapd %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xd9]
+; CHECK-NEXT:    vfmsubadd213pd %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xfd,0xa7,0xda]
+; CHECK-NEXT:    vfmsubadd231pd %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0xb7,0xd1]
+; CHECK-NEXT:    vaddpd %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <4 x double> @llvm.x86.avx512.mask3.vfmsubadd.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x double> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <4 x double> %masked, %unmasked
+  ret <4 x double> %sum
+}
+
+declare <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float>, <4 x float>, <4 x float>, i8)
+; Calls the mask3 intrinsic twice (mask %x3, then -1) and returns the fadd of both results.
+define <4 x float> @test_int_x86_avx512_mask3_vfmsubadd_ps_128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_128:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %xmm1, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0xd9]
+; CHECK-NEXT:    vfmsubadd213ps %xmm2, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa7,0xda]
+; CHECK-NEXT:    vfmsubadd231ps %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0xb7,0xd1]
+; CHECK-NEXT:    vaddps %xmm3, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %masked = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3) ; mask operand is %x3
+  %unmasked = call <4 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.128(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1) ; mask operand is -1 (all bits set)
+  %sum = fadd <4 x float> %masked, %unmasked
+  ret <4 x float> %sum
+}
+
+declare <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float>, <8 x float>, <8 x float>, i8)
+
+define <8 x float>@test_int_x86_avx512_mask3_vfmsubadd_ps_256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3) {
+; CHECK-LABEL: test_int_x86_avx512_mask3_vfmsubadd_ps_256:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vmovaps %ymm1, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x28,0xd9]
+; CHECK-NEXT:    vfmsubadd213ps %ymm2, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0xa7,0xda]
+; CHECK-NEXT:    vfmsubadd231ps %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0xb7,0xd1]
+; CHECK-NEXT:    vaddps %ymm3, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc3]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 %x3)
+  %res1 = call <8 x float> @llvm.x86.avx512.mask3.vfmsubadd.ps.256(<8 x float> %x0, <8 x float> %x1, <8 x float> %x2, i8 -1)
+  %res2=fadd <8 x float> %res, %res1
+  ret <8 x float> %res2
+}
+
+
+define <4 x float> @test_mask_vfmadd128_ps_r(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfmadd128_ps_r:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vfmadd132ps %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x6d,0x09,0x98,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rz(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
+; CHECK-LABEL: test_mask_vfmadd128_ps_rz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmk(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmk:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %a2 = load <4 x float>, <4 x float>* %ptr_a2
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmka(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmka:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x09,0xa8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %a2 = load <4 x float>, <4 x float>* %ptr_a2, align 8
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmkz(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmkz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %a2 = load <4 x float>, <4 x float>* %ptr_a2
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmkza(<4 x float> %a0, <4 x float> %a1, <4 x float>* %ptr_a2) {
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmkza:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213ps (%rdi), %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %a2 = load <4 x float>, <4 x float>* %ptr_a2, align 4
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 -1) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmb:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load float, float* %ptr_a2
+  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmba:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0x75,0x19,0xa8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load float, float* %ptr_a2, align 4
+  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 %mask) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmbz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load float, float* %ptr_a2
+  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
+  ret <4 x float> %res
+}
+
+define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a1, float* %ptr_a2) {
+; CHECK-LABEL: test_mask_vfmadd128_ps_rmbza:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213ps (%rdi){1to4}, %xmm1, %xmm0 ## encoding: [0x62,0xf2,0x75,0x18,0xa8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %q = load float, float* %ptr_a2, align 4
+  %vecinit.i = insertelement <4 x float> undef, float %q, i32 0
+  %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1
+  %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2
+  %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3
+  %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ps.128(<4 x float> %a0, <4 x float> %a1, <4 x float> %vecinit6.i, i8 -1) nounwind
+  ret <4 x float> %res
+}
+
+define <2 x double> @test_mask_vfmadd128_pd_r(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfmadd128_pd_r:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vfmadd132pd %xmm1, %xmm2, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x09,0x98,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_vfmadd128_pd_rz(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
+; CHECK-LABEL: test_mask_vfmadd128_pd_rz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_vfmadd128_pd_rmk(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfmadd128_pd_rmk:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vfmadd213pd (%rdi), %xmm1, %xmm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x09,0xa8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %a2 = load <2 x double>, <2 x double>* %ptr_a2
+  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) nounwind
+  ret <2 x double> %res
+}
+
+define <2 x double> @test_mask_vfmadd128_pd_rmkz(<2 x double> %a0, <2 x double> %a1, <2 x double>* %ptr_a2) {
+; CHECK-LABEL: test_mask_vfmadd128_pd_rmkz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213pd (%rdi), %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %a2 = load <2 x double>, <2 x double>* %ptr_a2
+  %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.pd.128(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 -1) nounwind
+  ret <2 x double> %res
+}
+
+define <4 x double> @test_mask_vfmadd256_pd_r(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfmadd256_pd_r:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT:    vfmadd132pd %ymm1, %ymm2, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xed,0x29,0x98,0xc1]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_mask_vfmadd256_pd_rz(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
+; CHECK-LABEL: test_mask_vfmadd256_pd_rz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_mask_vfmadd256_pd_rmk(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2, i8 %mask) {
+; CHECK-LABEL: test_mask_vfmadd256_pd_rmk:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce]
+; CHECK-NEXT:    vfmadd213pd (%rdi), %ymm1, %ymm0 {%k1} ## encoding: [0x62,0xf2,0xf5,0x29,0xa8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %a2 = load <4 x double>, <4 x double>* %ptr_a2
+  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 %mask) nounwind
+  ret <4 x double> %res
+}
+
+define <4 x double> @test_mask_vfmadd256_pd_rmkz(<4 x double> %a0, <4 x double> %a1, <4 x double>* %ptr_a2) {
+; CHECK-LABEL: test_mask_vfmadd256_pd_rmkz:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vfmadd213pd (%rdi), %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0x07]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %a2 = load <4 x double>, <4 x double>* %ptr_a2
+  %res = call <4 x double> @llvm.x86.avx512.mask.vfmadd.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2, i8 -1) nounwind
+  ret <4 x double> %res
+}
+
diff --git a/test/CodeGen/X86/avx512vl-logic.ll b/test/CodeGen/X86/avx512vl-logic.ll
index e820bc4ca82402d420a080dc59905c082a33bc62..83fa8d4c34cd77c2c3772ecd8f808b046b812c78 100644
--- a/test/CodeGen/X86/avx512vl-logic.ll
+++ b/test/CodeGen/X86/avx512vl-logic.ll
@@ -21,7 +21,7 @@ define <8 x i32> @vpandnd256(<8 x i32> %a, <8 x i32> %b) nounwind uwtable readno
 ; CHECK-LABEL: vpandnd256:
 ; CHECK:       ## BB#0: ## %entry
 ; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to8}, %ymm0, %ymm1
-; CHECK-NEXT:    vpandnd %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpandn %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
 entry:
   ; Force the execution domain with an add.
@@ -74,7 +74,7 @@ define <4 x i64> @vpandnq256(<4 x i64> %a, <4 x i64> %b) nounwind uwtable readno
 ; CHECK-LABEL: vpandnq256:
 ; CHECK:       ## BB#0: ## %entry
 ; CHECK-NEXT:    vpaddq {{.*}}(%rip){1to4}, %ymm0, %ymm0
-; CHECK-NEXT:    vpandnq %ymm0, %ymm1, %ymm0
+; CHECK-NEXT:    vpandn %ymm0, %ymm1, %ymm0
 ; CHECK-NEXT:    retq
 entry:
   ; Force the execution domain with an add.
@@ -129,7 +129,7 @@ define <4 x i32> @vpandnd128(<4 x i32> %a, <4 x i32> %b) nounwind uwtable readno
 ; CHECK-LABEL: vpandnd128:
 ; CHECK:       ## BB#0: ## %entry
 ; CHECK-NEXT:    vpaddd {{.*}}(%rip){1to4}, %xmm0, %xmm0
-; CHECK-NEXT:    vpandnd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   ; Force the execution domain with an add.
@@ -182,7 +182,7 @@ define <2 x i64> @vpandnq128(<2 x i64> %a, <2 x i64> %b) nounwind uwtable readno
 ; CHECK-LABEL: vpandnq128:
 ; CHECK:       ## BB#0: ## %entry
 ; CHECK-NEXT:    vpaddq {{.*}}(%rip), %xmm0, %xmm0
-; CHECK-NEXT:    vpandnq %xmm0, %xmm1, %xmm0
+; CHECK-NEXT:    vpandn %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   ; Force the execution domain with an add.
@@ -228,7 +228,7 @@ define <4 x double> @test_mm256_mask_andnot_pd(<4 x double> %__W, i8 zeroext %__
 ;
 ; SKX-LABEL: test_mm256_mask_andnot_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandnpd %ymm2, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -252,7 +252,7 @@ define <4 x double> @test_mm256_maskz_andnot_pd(i8 zeroext %__U, <4 x double> %_
 ;
 ; SKX-LABEL: test_mm256_maskz_andnot_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandnpd %ymm1, %ymm0, %ymm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -276,7 +276,7 @@ define <2 x double> @test_mm_mask_andnot_pd(<2 x double> %__W, i8 zeroext %__U,
 ;
 ; SKX-LABEL: test_mm_mask_andnot_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandnpd %xmm2, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -300,7 +300,7 @@ define <2 x double> @test_mm_maskz_andnot_pd(i8 zeroext %__U, <2 x double> %__A,
 ;
 ; SKX-LABEL: test_mm_maskz_andnot_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandnpd %xmm1, %xmm0, %xmm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -324,7 +324,7 @@ define <8 x float> @test_mm256_mask_andnot_ps(<8 x float> %__W, i8 zeroext %__U,
 ;
 ; SKX-LABEL: test_mm256_mask_andnot_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandnps %ymm2, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -347,7 +347,7 @@ define <8 x float> @test_mm256_maskz_andnot_ps(i8 zeroext %__U, <8 x float> %__A
 ;
 ; SKX-LABEL: test_mm256_maskz_andnot_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandnps %ymm1, %ymm0, %ymm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -370,7 +370,7 @@ define <4 x float> @test_mm_mask_andnot_ps(<4 x float> %__W, i8 zeroext %__U, <4
 ;
 ; SKX-LABEL: test_mm_mask_andnot_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandnps %xmm2, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -394,7 +394,7 @@ define <4 x float> @test_mm_maskz_andnot_ps(i8 zeroext %__U, <4 x float> %__A, <
 ;
 ; SKX-LABEL: test_mm_maskz_andnot_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandnps %xmm1, %xmm0, %xmm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -418,7 +418,7 @@ define <4 x double> @test_mm256_mask_and_pd(<4 x double> %__W, i8 zeroext %__U,
 ;
 ; SKX-LABEL: test_mm256_mask_and_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandpd %ymm1, %ymm2, %ymm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -441,7 +441,7 @@ define <4 x double> @test_mm256_maskz_and_pd(i8 zeroext %__U, <4 x double> %__A,
 ;
 ; SKX-LABEL: test_mm256_maskz_and_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandpd %ymm0, %ymm1, %ymm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -464,7 +464,7 @@ define <2 x double> @test_mm_mask_and_pd(<2 x double> %__W, i8 zeroext %__U, <2
 ;
 ; SKX-LABEL: test_mm_mask_and_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandpd %xmm1, %xmm2, %xmm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -487,7 +487,7 @@ define <2 x double> @test_mm_maskz_and_pd(i8 zeroext %__U, <2 x double> %__A, <2
 ;
 ; SKX-LABEL: test_mm_maskz_and_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandpd %xmm0, %xmm1, %xmm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -510,7 +510,7 @@ define <8 x float> @test_mm256_mask_and_ps(<8 x float> %__W, i8 zeroext %__U, <8
 ;
 ; SKX-LABEL: test_mm256_mask_and_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandps %ymm1, %ymm2, %ymm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -532,7 +532,7 @@ define <8 x float> @test_mm256_maskz_and_ps(i8 zeroext %__U, <8 x float> %__A, <
 ;
 ; SKX-LABEL: test_mm256_maskz_and_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandps %ymm0, %ymm1, %ymm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -554,7 +554,7 @@ define <4 x float> @test_mm_mask_and_ps(<4 x float> %__W, i8 zeroext %__U, <4 x
 ;
 ; SKX-LABEL: test_mm_mask_and_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandps %xmm1, %xmm2, %xmm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -577,7 +577,7 @@ define <4 x float> @test_mm_maskz_and_ps(i8 zeroext %__U, <4 x float> %__A, <4 x
 ;
 ; SKX-LABEL: test_mm_maskz_and_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vandps %xmm0, %xmm1, %xmm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -600,7 +600,7 @@ define <4 x double> @test_mm256_mask_xor_pd(<4 x double> %__W, i8 zeroext %__U,
 ;
 ; SKX-LABEL: test_mm256_mask_xor_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vxorpd %ymm2, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -623,7 +623,7 @@ define <4 x double> @test_mm256_maskz_xor_pd(i8 zeroext %__U, <4 x double> %__A,
 ;
 ; SKX-LABEL: test_mm256_maskz_xor_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vxorpd %ymm1, %ymm0, %ymm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -646,7 +646,7 @@ define <2 x double> @test_mm_mask_xor_pd(<2 x double> %__W, i8 zeroext %__U, <2
 ;
 ; SKX-LABEL: test_mm_mask_xor_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vxorpd %xmm2, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -669,7 +669,7 @@ define <2 x double> @test_mm_maskz_xor_pd(i8 zeroext %__U, <2 x double> %__A, <2
 ;
 ; SKX-LABEL: test_mm_maskz_xor_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vxorpd %xmm1, %xmm0, %xmm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -692,7 +692,7 @@ define <8 x float> @test_mm256_mask_xor_ps(<8 x float> %__W, i8 zeroext %__U, <8
 ;
 ; SKX-LABEL: test_mm256_mask_xor_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vxorps %ymm2, %ymm1, %ymm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -714,7 +714,7 @@ define <8 x float> @test_mm256_maskz_xor_ps(i8 zeroext %__U, <8 x float> %__A, <
 ;
 ; SKX-LABEL: test_mm256_maskz_xor_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vxorps %ymm1, %ymm0, %ymm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -736,7 +736,7 @@ define <4 x float> @test_mm_mask_xor_ps(<4 x float> %__W, i8 zeroext %__U, <4 x
 ;
 ; SKX-LABEL: test_mm_mask_xor_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vxorps %xmm2, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -759,7 +759,7 @@ define <4 x float> @test_mm_maskz_xor_ps(i8 zeroext %__U, <4 x float> %__A, <4 x
 ;
 ; SKX-LABEL: test_mm_maskz_xor_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vxorps %xmm1, %xmm0, %xmm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -782,7 +782,7 @@ define <4 x double> @test_mm256_mask_or_pd(<4 x double> %__W, i8 zeroext %__U, <
 ;
 ; SKX-LABEL: test_mm256_mask_or_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vorpd %ymm1, %ymm2, %ymm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -805,7 +805,7 @@ define <4 x double> @test_mm256_maskz_or_pd(i8 zeroext %__U, <4 x double> %__A,
 ;
 ; SKX-LABEL: test_mm256_maskz_or_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vorpd %ymm0, %ymm1, %ymm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -828,7 +828,7 @@ define <2 x double> @test_mm_mask_or_pd(<2 x double> %__W, i8 zeroext %__U, <2 x
 ;
 ; SKX-LABEL: test_mm_mask_or_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vorpd %xmm1, %xmm2, %xmm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -851,7 +851,7 @@ define <2 x double> @test_mm_maskz_or_pd(i8 zeroext %__U, <2 x double> %__A, <2
 ;
 ; SKX-LABEL: test_mm_maskz_or_pd:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vorpd %xmm0, %xmm1, %xmm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -874,7 +874,7 @@ define <8 x float> @test_mm256_mask_or_ps(<8 x float> %__W, i8 zeroext %__U, <8
 ;
 ; SKX-LABEL: test_mm256_mask_or_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vorps %ymm1, %ymm2, %ymm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -896,7 +896,7 @@ define <8 x float> @test_mm256_maskz_or_ps(i8 zeroext %__U, <8 x float> %__A, <8
 ;
 ; SKX-LABEL: test_mm256_maskz_or_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vorps %ymm0, %ymm1, %ymm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
@@ -918,7 +918,7 @@ define <4 x float> @test_mm_mask_or_ps(<4 x float> %__W, i8 zeroext %__U, <4 x f
 ;
 ; SKX-LABEL: test_mm_mask_or_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vorps %xmm1, %xmm2, %xmm0 {%k1}
 ; SKX-NEXT:    retq
 entry:
@@ -941,7 +941,7 @@ define <4 x float> @test_mm_maskz_or_ps(i8 zeroext %__U, <4 x float> %__A, <4 x
 ;
 ; SKX-LABEL: test_mm_maskz_or_ps:
 ; SKX:       ## BB#0: ## %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vorps %xmm0, %xmm1, %xmm0 {%k1} {z}
 ; SKX-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/bc-extract.ll b/test/CodeGen/X86/bc-extract.ll
index a1c0f5ae527c85a166695b229330ef22699159eb..b43c70e303a1e261e604411e92c0a4a8a2ddb3e4 100644
--- a/test/CodeGen/X86/bc-extract.ll
+++ b/test/CodeGen/X86/bc-extract.ll
@@ -1,25 +1,50 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse4.2 |  FileCheck %s
-
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.2   | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64
 
 define float @extractFloat1() nounwind {
+; X32-LABEL: extractFloat1:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    fld1
+; X32-NEXT:    retl
+;
+; X64-LABEL: extractFloat1:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    retq
 entry:
-  ; CHECK: 1065353216
   %tmp0 = bitcast <1 x double> <double 0x000000003F800000> to <2 x float>
-  %tmp1 = extractelement <2 x float> %tmp0, i32 0 
+  %tmp1 = extractelement <2 x float> %tmp0, i32 0
   ret float %tmp1
 }
 
 define float @extractFloat2() nounwind {
+; X32-LABEL: extractFloat2:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    fldz
+; X32-NEXT:    retl
+;
+; X64-LABEL: extractFloat2:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    retq
 entry:
-  ; CHECK: xorps	%xmm0, %xmm0
   %tmp4 = bitcast <1 x double> <double 0x000000003F800000> to <2 x float>
   %tmp5 = extractelement <2 x float> %tmp4, i32 1
   ret float %tmp5
 }
 
 define i32 @extractInt2() nounwind {
+; X32-LABEL: extractInt2:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: extractInt2:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    retq
 entry:
-  ; CHECK: xorl	%eax, %eax
   %tmp4 = bitcast <1 x i64> <i64 256> to <2 x i32>
   %tmp5 = extractelement <2 x i32> %tmp4, i32 1
   ret i32 %tmp5
diff --git a/test/CodeGen/X86/bitcast-mmx.ll b/test/CodeGen/X86/bitcast-mmx.ll
index 4107f3914f81f3ec735e28a2e80bddcb33185ada..f0318ede531a3f823ef1c9930db7495ef1a00759 100644
--- a/test/CodeGen/X86/bitcast-mmx.ll
+++ b/test/CodeGen/X86/bitcast-mmx.ll
@@ -1,12 +1,20 @@
-; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64
 
-define i32 @t0(i64 %x) {
-; CHECK-LABEL: t0:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:    movd %[[REG1:[a-z]+]], %mm0
-; CHECK-NEXT:    pshufw $238, %mm0, %mm0
-; CHECK-NEXT:    movd %mm0, %eax
-; CHECK-NEXT:    retq
+define i32 @t0(i64 %x) nounwind {
+; X86-LABEL: t0:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pshufw $238, {{[0-9]+}}(%esp), %mm0 # mm0 = mem[2,3,2,3]
+; X86-NEXT:    movd %mm0, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: t0:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movd %rdi, %mm0
+; X64-NEXT:    pshufw $238, %mm0, %mm0 # mm0 = mm0[2,3,2,3]
+; X64-NEXT:    movd %mm0, %eax
+; X64-NEXT:    retq
 entry:
   %0 = bitcast i64 %x to <4 x i16>
   %1 = bitcast <4 x i16> %0 to x86_mmx
@@ -19,14 +27,30 @@ entry:
   ret i32 %7
 }
 
-define i64 @t1(i64 %x, i32 %n) {
-; CHECK-LABEL: t1:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:    movd %[[REG2:[a-z]+]], %mm0
-; CHECK-NEXT:    movd %[[REG1]], %mm1
-; CHECK-NEXT:    psllq %mm0, %mm1
-; CHECK-NEXT:    movd %mm1, %rax
-; CHECK-NEXT:    retq
+define i64 @t1(i64 %x, i32 %n) nounwind {
+; X86-LABEL: t1:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movd 16(%ebp), %mm0
+; X86-NEXT:    movq 8(%ebp), %mm1
+; X86-NEXT:    psllq %mm0, %mm1
+; X86-NEXT:    movq %mm1, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: t1:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movd %esi, %mm0
+; X64-NEXT:    movd %rdi, %mm1
+; X64-NEXT:    psllq %mm0, %mm1
+; X64-NEXT:    movd %mm1, %rax
+; X64-NEXT:    retq
 entry:
   %0 = bitcast i64 %x to x86_mmx
   %1 = tail call x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx %0, i32 %n)
@@ -34,16 +58,33 @@ entry:
   ret i64 %2
 }
 
-define i64 @t2(i64 %x, i32 %n, i32 %w) {
-; CHECK-LABEL: t2:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:  movd %[[REG4:[a-z]+]], %mm0
-; CHECK-NEXT:  movd %[[REG6:[a-z0-9]+]], %mm1
-; CHECK-NEXT:  psllq %mm0, %mm1
-; CHECK-NEXT:  movd %[[REG1]], %mm0
-; CHECK-NEXT:  por %mm1, %mm0
-; CHECK-NEXT:  movd %mm0, %rax
-; CHECK-NEXT:  retq
+define i64 @t2(i64 %x, i32 %n, i32 %w) nounwind {
+; X86-LABEL: t2:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movd 16(%ebp), %mm0
+; X86-NEXT:    movd 20(%ebp), %mm1
+; X86-NEXT:    psllq %mm0, %mm1
+; X86-NEXT:    por 8(%ebp), %mm1
+; X86-NEXT:    movq %mm1, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: t2:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movd %esi, %mm0
+; X64-NEXT:    movd %edx, %mm1
+; X64-NEXT:    psllq %mm0, %mm1
+; X64-NEXT:    movd %rdi, %mm0
+; X64-NEXT:    por %mm1, %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    retq
 entry:
   %0 = insertelement <2 x i32> undef, i32 %w, i32 0
   %1 = insertelement <2 x i32> %0, i32 0, i32 1
@@ -55,13 +96,32 @@ entry:
   ret i64 %6
 }
 
-define i64 @t3(<1 x i64>* %y, i32* %n) {
-; CHECK-LABEL: t3:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:    movq (%[[REG1]]), %mm0
-; CHECK-NEXT:    psllq (%[[REG3:[a-z]+]]), %mm0
-; CHECK-NEXT:    movd %mm0, %rax
-; CHECK-NEXT:    retq
+define i64 @t3(<1 x i64>* %y, i32* %n) nounwind {
+; X86-LABEL: t3:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    movq (%ecx), %mm0
+; X86-NEXT:    movd (%eax), %mm1
+; X86-NEXT:    psllq %mm1, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: t3:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    movd (%rsi), %mm1
+; X64-NEXT:    psllq %mm1, %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64>* %y to x86_mmx*
   %1 = load x86_mmx, x86_mmx* %0, align 8
diff --git a/test/CodeGen/X86/bitreverse.ll b/test/CodeGen/X86/bitreverse.ll
index 35cbbdafb464fa824c859161a7671e9082e08514..06daf014c1510ec13d3c697fb5715485749f692b 100644
--- a/test/CodeGen/X86/bitreverse.ll
+++ b/test/CodeGen/X86/bitreverse.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
 
 ; These tests just check that the plumbing is in place for @llvm.bitreverse. The
 ; actual output is massive at the moment as llvm.bitreverse is not yet legal.
@@ -7,100 +8,354 @@
 declare <2 x i16> @llvm.bitreverse.v2i16(<2 x i16>) readnone
 
 define <2 x i16> @test_bitreverse_v2i16(<2 x i16> %a) nounwind {
-; CHECK-LABEL: test_bitreverse_v2i16:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    movw {{[0-9]+}}(%esp), %cx
-; CHECK-NEXT:    movw {{[0-9]+}}(%esp), %ax
-; CHECK-NEXT:    rolw $8, %ax
-; CHECK-NEXT:    movl %eax, %edx
-; CHECK-NEXT:    andl $3855, %edx # imm = 0xF0F
-; CHECK-NEXT:    shll $4, %edx
-; CHECK-NEXT:    andl $61680, %eax # imm = 0xF0F0
-; CHECK-NEXT:    shrl $4, %eax
-; CHECK-NEXT:    orl %edx, %eax
-; CHECK-NEXT:    movl %eax, %edx
-; CHECK-NEXT:    andl $13107, %edx # imm = 0x3333
-; CHECK-NEXT:    andl $52428, %eax # imm = 0xCCCC
-; CHECK-NEXT:    shrl $2, %eax
-; CHECK-NEXT:    leal (%eax,%edx,4), %eax
-; CHECK-NEXT:    movl %eax, %edx
-; CHECK-NEXT:    andl $21845, %edx # imm = 0x5555
-; CHECK-NEXT:    andl $43690, %eax # imm = 0xAAAA
-; CHECK-NEXT:    shrl %eax
-; CHECK-NEXT:    leal (%eax,%edx,2), %eax
-; CHECK-NEXT:    rolw $8, %cx
-; CHECK-NEXT:    movl %ecx, %edx
-; CHECK-NEXT:    andl $3855, %edx # imm = 0xF0F
-; CHECK-NEXT:    shll $4, %edx
-; CHECK-NEXT:    andl $61680, %ecx # imm = 0xF0F0
-; CHECK-NEXT:    shrl $4, %ecx
-; CHECK-NEXT:    orl %edx, %ecx
-; CHECK-NEXT:    movl %ecx, %edx
-; CHECK-NEXT:    andl $13107, %edx # imm = 0x3333
-; CHECK-NEXT:    andl $52428, %ecx # imm = 0xCCCC
-; CHECK-NEXT:    shrl $2, %ecx
-; CHECK-NEXT:    leal (%ecx,%edx,4), %ecx
-; CHECK-NEXT:    movl %ecx, %edx
-; CHECK-NEXT:    andl $21845, %edx # imm = 0x5555
-; CHECK-NEXT:    andl $43690, %ecx # imm = 0xAAAA
-; CHECK-NEXT:    shrl %ecx
-; CHECK-NEXT:    leal (%ecx,%edx,2), %edx
-; CHECK-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; CHECK-NEXT:    # kill: %DX<def> %DX<kill> %EDX<kill>
-; CHECK-NEXT:    retl
+; X86-LABEL: test_bitreverse_v2i16:
+; X86:       # BB#0:
+; X86-NEXT:    movw {{[0-9]+}}(%esp), %cx
+; X86-NEXT:    movw {{[0-9]+}}(%esp), %ax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $3855, %edx # imm = 0xF0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    andl $61680, %eax # imm = 0xF0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $13107, %edx # imm = 0x3333
+; X86-NEXT:    andl $52428, %eax # imm = 0xCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%edx,4), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $21845, %edx # imm = 0x5555
+; X86-NEXT:    andl $43690, %eax # imm = 0xAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%edx,2), %eax
+; X86-NEXT:    rolw $8, %cx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $3855, %edx # imm = 0xF0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    andl $61680, %ecx # imm = 0xF0F0
+; X86-NEXT:    shrl $4, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $13107, %edx # imm = 0x3333
+; X86-NEXT:    andl $52428, %ecx # imm = 0xCCCC
+; X86-NEXT:    shrl $2, %ecx
+; X86-NEXT:    leal (%ecx,%edx,4), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $21845, %edx # imm = 0x5555
+; X86-NEXT:    andl $43690, %ecx # imm = 0xAAAA
+; X86-NEXT:    shrl %ecx
+; X86-NEXT:    leal (%ecx,%edx,2), %edx
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    # kill: %DX<def> %DX<kill> %EDX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_bitreverse_v2i16:
+; X64:       # BB#0:
+; X64-NEXT:    pxor %xmm1, %xmm1
+; X64-NEXT:    movdqa %xmm0, %xmm2
+; X64-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; X64-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; X64-NEXT:    pshuflw {{.*#+}} xmm2 = xmm2[3,2,1,0,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4]
+; X64-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[3,2,1,0,4,5,6,7]
+; X64-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4]
+; X64-NEXT:    packuswb %xmm2, %xmm0
+; X64-NEXT:    movdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; X64-NEXT:    movdqa %xmm0, %xmm2
+; X64-NEXT:    pand %xmm1, %xmm2
+; X64-NEXT:    psllw $4, %xmm2
+; X64-NEXT:    movdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240]
+; X64-NEXT:    pand %xmm3, %xmm2
+; X64-NEXT:    pand %xmm3, %xmm0
+; X64-NEXT:    psrlw $4, %xmm0
+; X64-NEXT:    pand %xmm1, %xmm0
+; X64-NEXT:    por %xmm2, %xmm0
+; X64-NEXT:    movdqa {{.*#+}} xmm1 = [51,51,51,51,51,51,51,51,51,51,51,51,51,51,51,51]
+; X64-NEXT:    pand %xmm0, %xmm1
+; X64-NEXT:    psllw $2, %xmm1
+; X64-NEXT:    pand {{.*}}(%rip), %xmm1
+; X64-NEXT:    pand {{.*}}(%rip), %xmm0
+; X64-NEXT:    psrlw $2, %xmm0
+; X64-NEXT:    pand {{.*}}(%rip), %xmm0
+; X64-NEXT:    por %xmm1, %xmm0
+; X64-NEXT:    movdqa {{.*#+}} xmm1 = [85,85,85,85,85,85,85,85,85,85,85,85,85,85,85,85]
+; X64-NEXT:    pand %xmm0, %xmm1
+; X64-NEXT:    paddb %xmm1, %xmm1
+; X64-NEXT:    pand {{.*}}(%rip), %xmm0
+; X64-NEXT:    psrlw $1, %xmm0
+; X64-NEXT:    pand {{.*}}(%rip), %xmm0
+; X64-NEXT:    por %xmm1, %xmm0
+; X64-NEXT:    psrlq $48, %xmm0
+; X64-NEXT:    retq
   %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
   ret <2 x i16> %b
 }
 
+declare i64 @llvm.bitreverse.i64(i64) readnone
+
+define i64 @test_bitreverse_i64(i64 %a) nounwind {
+; X86-LABEL: test_bitreverse_i64:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%edx,4), %eax
+; X86-NEXT:    movl %eax, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%edx,2), %eax
+; X86-NEXT:    bswapl %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $252645135, %edx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %edx
+; X86-NEXT:    andl $-252645136, %ecx # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %ecx
+; X86-NEXT:    orl %edx, %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $858993459, %edx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %ecx # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %ecx
+; X86-NEXT:    leal (%ecx,%edx,4), %ecx
+; X86-NEXT:    movl %ecx, %edx
+; X86-NEXT:    andl $1431655765, %edx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %ecx # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %ecx
+; X86-NEXT:    leal (%ecx,%edx,2), %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_bitreverse_i64:
+; X64:       # BB#0:
+; X64-NEXT:    bswapq %rdi
+; X64-NEXT:    movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F
+; X64-NEXT:    andq %rdi, %rax
+; X64-NEXT:    shlq $4, %rax
+; X64-NEXT:    movabsq $-1085102592571150096, %rcx # imm = 0xF0F0F0F0F0F0F0F0
+; X64-NEXT:    andq %rdi, %rcx
+; X64-NEXT:    shrq $4, %rcx
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    movabsq $3689348814741910323, %rax # imm = 0x3333333333333333
+; X64-NEXT:    andq %rcx, %rax
+; X64-NEXT:    movabsq $-3689348814741910324, %rdx # imm = 0xCCCCCCCCCCCCCCCC
+; X64-NEXT:    andq %rcx, %rdx
+; X64-NEXT:    shrq $2, %rdx
+; X64-NEXT:    leaq (%rdx,%rax,4), %rax
+; X64-NEXT:    movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555
+; X64-NEXT:    andq %rax, %rcx
+; X64-NEXT:    movabsq $-6148914691236517206, %rdx # imm = 0xAAAAAAAAAAAAAAAA
+; X64-NEXT:    andq %rax, %rdx
+; X64-NEXT:    shrq %rdx
+; X64-NEXT:    leaq (%rdx,%rcx,2), %rax
+; X64-NEXT:    retq
+  %b = call i64 @llvm.bitreverse.i64(i64 %a)
+  ret i64 %b
+}
+
+declare i32 @llvm.bitreverse.i32(i32) readnone
+
+define i32 @test_bitreverse_i32(i32 %a) nounwind {
+; X86-LABEL: test_bitreverse_i32:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X86-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_bitreverse_i32:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    bswapl %edi
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; X64-NEXT:    shll $4, %eax
+; X64-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
+; X64-NEXT:    shrl $4, %edi
+; X64-NEXT:    orl %eax, %edi
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; X64-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
+; X64-NEXT:    shrl $2, %edi
+; X64-NEXT:    leal (%rdi,%rax,4), %eax
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    andl $1431655765, %ecx # imm = 0x55555555
+; X64-NEXT:    andl $-1431655766, %eax # imm = 0xAAAAAAAA
+; X64-NEXT:    shrl %eax
+; X64-NEXT:    leal (%rax,%rcx,2), %eax
+; X64-NEXT:    retq
+  %b = call i32 @llvm.bitreverse.i32(i32 %a)
+  ret i32 %b
+}
+
 declare i24 @llvm.bitreverse.i24(i24) readnone
 
 define i24 @test_bitreverse_i24(i24 %a) nounwind {
-; CHECK-LABEL: test_bitreverse_i24:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    bswapl %eax
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
-; CHECK-NEXT:    shll $4, %ecx
-; CHECK-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
-; CHECK-NEXT:    shrl $4, %eax
-; CHECK-NEXT:    orl %ecx, %eax
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    andl $858993459, %ecx # imm = 0x33333333
-; CHECK-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
-; CHECK-NEXT:    shrl $2, %eax
-; CHECK-NEXT:    leal (%eax,%ecx,4), %eax
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    andl $1431655680, %ecx # imm = 0x55555500
-; CHECK-NEXT:    andl $-1431655936, %eax # imm = 0xAAAAAA00
-; CHECK-NEXT:    shrl %eax
-; CHECK-NEXT:    leal (%eax,%ecx,2), %eax
-; CHECK-NEXT:    shrl $8, %eax
-; CHECK-NEXT:    retl
+; X86-LABEL: test_bitreverse_i24:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    bswapl %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $252645135, %ecx # imm = 0xF0F0F0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $-252645136, %eax # imm = 0xF0F0F0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $858993459, %ecx # imm = 0x33333333
+; X86-NEXT:    andl $-858993460, %eax # imm = 0xCCCCCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $1431655680, %ecx # imm = 0x55555500
+; X86-NEXT:    andl $-1431655936, %eax # imm = 0xAAAAAA00
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    shrl $8, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_bitreverse_i24:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    bswapl %edi
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl $252645135, %eax # imm = 0xF0F0F0F
+; X64-NEXT:    shll $4, %eax
+; X64-NEXT:    andl $-252645136, %edi # imm = 0xF0F0F0F0
+; X64-NEXT:    shrl $4, %edi
+; X64-NEXT:    orl %eax, %edi
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl $858993459, %eax # imm = 0x33333333
+; X64-NEXT:    andl $-858993460, %edi # imm = 0xCCCCCCCC
+; X64-NEXT:    shrl $2, %edi
+; X64-NEXT:    leal (%rdi,%rax,4), %eax
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    andl $1431655680, %ecx # imm = 0x55555500
+; X64-NEXT:    andl $-1431655936, %eax # imm = 0xAAAAAA00
+; X64-NEXT:    shrl %eax
+; X64-NEXT:    leal (%rax,%rcx,2), %eax
+; X64-NEXT:    shrl $8, %eax
+; X64-NEXT:    retq
   %b = call i24 @llvm.bitreverse.i24(i24 %a)
   ret i24 %b
 }
 
+declare i16 @llvm.bitreverse.i16(i16) readnone
+
+define i16 @test_bitreverse_i16(i16 %a) nounwind {
+; X86-LABEL: test_bitreverse_i16:
+; X86:       # BB#0:
+; X86-NEXT:    movw {{[0-9]+}}(%esp), %ax
+; X86-NEXT:    rolw $8, %ax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $3855, %ecx # imm = 0xF0F
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    andl $61680, %eax # imm = 0xF0F0
+; X86-NEXT:    shrl $4, %eax
+; X86-NEXT:    orl %ecx, %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $13107, %ecx # imm = 0x3333
+; X86-NEXT:    andl $52428, %eax # imm = 0xCCCC
+; X86-NEXT:    shrl $2, %eax
+; X86-NEXT:    leal (%eax,%ecx,4), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andl $21845, %ecx # imm = 0x5555
+; X86-NEXT:    andl $43690, %eax # imm = 0xAAAA
+; X86-NEXT:    shrl %eax
+; X86-NEXT:    leal (%eax,%ecx,2), %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_bitreverse_i16:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    rolw $8, %di
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl $3855, %eax # imm = 0xF0F
+; X64-NEXT:    shll $4, %eax
+; X64-NEXT:    andl $61680, %edi # imm = 0xF0F0
+; X64-NEXT:    shrl $4, %edi
+; X64-NEXT:    orl %eax, %edi
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andl $13107, %eax # imm = 0x3333
+; X64-NEXT:    andl $52428, %edi # imm = 0xCCCC
+; X64-NEXT:    shrl $2, %edi
+; X64-NEXT:    leal (%rdi,%rax,4), %eax
+; X64-NEXT:    movl %eax, %ecx
+; X64-NEXT:    andl $21845, %ecx # imm = 0x5555
+; X64-NEXT:    andl $43690, %eax # imm = 0xAAAA
+; X64-NEXT:    shrl %eax
+; X64-NEXT:    leal (%rax,%rcx,2), %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %b = call i16 @llvm.bitreverse.i16(i16 %a)
+  ret i16 %b
+}
+
 declare i8 @llvm.bitreverse.i8(i8) readnone
 
 define i8 @test_bitreverse_i8(i8 %a) {
-; CHECK-LABEL: test_bitreverse_i8:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %al
-; CHECK-NEXT:    rolb $4, %al
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    andb $51, %cl
-; CHECK-NEXT:    shlb $2, %cl
-; CHECK-NEXT:    andb $-52, %al
-; CHECK-NEXT:    shrb $2, %al
-; CHECK-NEXT:    orb %cl, %al
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    andb $85, %cl
-; CHECK-NEXT:    addb %cl, %cl
-; CHECK-NEXT:    andb $-86, %al
-; CHECK-NEXT:    shrb %al
-; CHECK-NEXT:    orb %cl, %al
-; CHECK-NEXT:    retl
+; X86-LABEL: test_bitreverse_i8:
+; X86:       # BB#0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    rolb $4, %al
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andb $51, %cl
+; X86-NEXT:    shlb $2, %cl
+; X86-NEXT:    andb $-52, %al
+; X86-NEXT:    shrb $2, %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andb $85, %cl
+; X86-NEXT:    addb %cl, %cl
+; X86-NEXT:    andb $-86, %al
+; X86-NEXT:    shrb %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_bitreverse_i8:
+; X64:       # BB#0:
+; X64-NEXT:    rolb $4, %dil
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andb $51, %al
+; X64-NEXT:    shlb $2, %al
+; X64-NEXT:    andb $-52, %dil
+; X64-NEXT:    shrb $2, %dil
+; X64-NEXT:    orb %al, %dil
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andb $85, %al
+; X64-NEXT:    addb %al, %al
+; X64-NEXT:    andb $-86, %dil
+; X64-NEXT:    shrb %dil
+; X64-NEXT:    orb %al, %dil
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    retq
   %b = call i8 @llvm.bitreverse.i8(i8 %a)
   ret i8 %b
 }
@@ -108,24 +363,43 @@ define i8 @test_bitreverse_i8(i8 %a) {
 declare i4 @llvm.bitreverse.i4(i4) readnone
 
 define i4 @test_bitreverse_i4(i4 %a) {
-; CHECK-LABEL: test_bitreverse_i4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %al
-; CHECK-NEXT:    rolb $4, %al
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    andb $51, %cl
-; CHECK-NEXT:    shlb $2, %cl
-; CHECK-NEXT:    andb $-52, %al
-; CHECK-NEXT:    shrb $2, %al
-; CHECK-NEXT:    orb %cl, %al
-; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    andb $80, %cl
-; CHECK-NEXT:    addb %cl, %cl
-; CHECK-NEXT:    andb $-96, %al
-; CHECK-NEXT:    shrb %al
-; CHECK-NEXT:    orb %cl, %al
-; CHECK-NEXT:    shrb $4, %al
-; CHECK-NEXT:    retl
+; X86-LABEL: test_bitreverse_i4:
+; X86:       # BB#0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    rolb $4, %al
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andb $51, %cl
+; X86-NEXT:    shlb $2, %cl
+; X86-NEXT:    andb $-52, %al
+; X86-NEXT:    shrb $2, %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    andb $80, %cl
+; X86-NEXT:    addb %cl, %cl
+; X86-NEXT:    andb $-96, %al
+; X86-NEXT:    shrb %al
+; X86-NEXT:    orb %cl, %al
+; X86-NEXT:    shrb $4, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_bitreverse_i4:
+; X64:       # BB#0:
+; X64-NEXT:    rolb $4, %dil
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andb $51, %al
+; X64-NEXT:    shlb $2, %al
+; X64-NEXT:    andb $-52, %dil
+; X64-NEXT:    shrb $2, %dil
+; X64-NEXT:    orb %al, %dil
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    andb $80, %al
+; X64-NEXT:    addb %al, %al
+; X64-NEXT:    andb $-96, %dil
+; X64-NEXT:    shrb %dil
+; X64-NEXT:    orb %al, %dil
+; X64-NEXT:    shrb $4, %dil
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    retq
   %b = call i4 @llvm.bitreverse.i4(i4 %a)
   ret i4 %b
 }
@@ -133,38 +407,58 @@ define i4 @test_bitreverse_i4(i4 %a) {
 ; These tests check that bitreverse(constant) calls are folded
 
 define <2 x i16> @fold_v2i16() {
-; CHECK-LABEL: fold_v2i16:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    movw $-4096, %ax # imm = 0xF000
-; CHECK-NEXT:    movw $240, %dx
-; CHECK-NEXT:    retl
+; X86-LABEL: fold_v2i16:
+; X86:       # BB#0:
+; X86-NEXT:    movw $-4096, %ax # imm = 0xF000
+; X86-NEXT:    movw $240, %dx
+; X86-NEXT:    retl
+;
+; X64-LABEL: fold_v2i16:
+; X64:       # BB#0:
+; X64-NEXT:    movaps {{.*#+}} xmm0 = [61440,240]
+; X64-NEXT:    retq
   %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> <i16 15, i16 3840>)
   ret <2 x i16> %b
 }
 
 define i24 @fold_i24() {
-; CHECK-LABEL: fold_i24:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    movl $2048, %eax # imm = 0x800
-; CHECK-NEXT:    retl
+; X86-LABEL: fold_i24:
+; X86:       # BB#0:
+; X86-NEXT:    movl $2048, %eax # imm = 0x800
+; X86-NEXT:    retl
+;
+; X64-LABEL: fold_i24:
+; X64:       # BB#0:
+; X64-NEXT:    movl $2048, %eax # imm = 0x800
+; X64-NEXT:    retq
   %b = call i24 @llvm.bitreverse.i24(i24 4096)
   ret i24 %b
 }
 
 define i8 @fold_i8() {
-; CHECK-LABEL: fold_i8:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $-16, %al
-; CHECK-NEXT:    retl
+; X86-LABEL: fold_i8:
+; X86:       # BB#0:
+; X86-NEXT:    movb $-16, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: fold_i8:
+; X64:       # BB#0:
+; X64-NEXT:    movb $-16, %al
+; X64-NEXT:    retq
   %b = call i8 @llvm.bitreverse.i8(i8 15)
   ret i8 %b
 }
 
 define i4 @fold_i4() {
-; CHECK-LABEL: fold_i4:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    movb $1, %al
-; CHECK-NEXT:    retl
+; X86-LABEL: fold_i4:
+; X86:       # BB#0:
+; X86-NEXT:    movb $1, %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: fold_i4:
+; X64:       # BB#0:
+; X64-NEXT:    movb $1, %al
+; X64-NEXT:    retq
   %b = call i4 @llvm.bitreverse.i4(i4 8)
   ret i4 %b
 }
@@ -172,21 +466,30 @@ define i4 @fold_i4() {
 ; These tests check that bitreverse(bitreverse()) calls are removed
 
 define i8 @identity_i8(i8 %a) {
-; CHECK-LABEL: identity_i8:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    movb {{[0-9]+}}(%esp), %al
-; CHECK-NEXT:    retl
+; X86-LABEL: identity_i8:
+; X86:       # BB#0:
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X86-NEXT:    retl
+;
+; X64-LABEL: identity_i8:
+; X64:       # BB#0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    retq
   %b = call i8 @llvm.bitreverse.i8(i8 %a)
   %c = call i8 @llvm.bitreverse.i8(i8 %b)
   ret i8 %c
 }
 
 define <2 x i16> @identity_v2i16(<2 x i16> %a) {
-; CHECK-LABEL: identity_v2i16:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
-; CHECK-NEXT:    retl
+; X86-LABEL: identity_v2i16:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: identity_v2i16:
+; X64:       # BB#0:
+; X64-NEXT:    retq
   %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %a)
   %c = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> %b)
   ret <2 x i16> %c
@@ -195,17 +498,25 @@ define <2 x i16> @identity_v2i16(<2 x i16> %a) {
 ; These tests check that bitreverse(undef) calls are removed
 
 define i8 @undef_i8() {
-; CHECK-LABEL: undef_i8:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    retl
+; X86-LABEL: undef_i8:
+; X86:       # BB#0:
+; X86-NEXT:    retl
+;
+; X64-LABEL: undef_i8:
+; X64:       # BB#0:
+; X64-NEXT:    retq
   %b = call i8 @llvm.bitreverse.i8(i8 undef)
   ret i8 %b
 }
 
 define <2 x i16> @undef_v2i16() {
-; CHECK-LABEL: undef_v2i16:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    retl
+; X86-LABEL: undef_v2i16:
+; X86:       # BB#0:
+; X86-NEXT:    retl
+;
+; X64-LABEL: undef_v2i16:
+; X64:       # BB#0:
+; X64-NEXT:    retq
   %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> undef)
   ret <2 x i16> %b
 }
diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll
index 807dfe464cbeabb5f1f0bd21d16082263fa53855..c7de65d84507b8260236f7520700968891e3221b 100644
--- a/test/CodeGen/X86/block-placement.ll
+++ b/test/CodeGen/X86/block-placement.ll
@@ -314,7 +314,7 @@ exit:
 define void @unnatural_cfg1() {
 ; Test that we can handle a loop with an inner unnatural loop at the end of
 ; a function. This is a gross CFG reduced out of the single source GCC.
-; CHECK: unnatural_cfg1
+; CHECK-LABEL: unnatural_cfg1
 ; CHECK: %entry
 ; CHECK: %loop.body1
 ; CHECK: %loop.body2
@@ -352,17 +352,15 @@ define void @unnatural_cfg2() {
 ; Test that we can handle a loop with a nested natural loop *and* an unnatural
 ; loop. This was reduced from a crash on block placement when run over
 ; single-source GCC.
-; CHECK: unnatural_cfg2
+; CHECK-LABEL: unnatural_cfg2
 ; CHECK: %entry
 ; CHECK: %loop.body1
 ; CHECK: %loop.body2
-; CHECK: %loop.body3
-; CHECK: %loop.inner1.begin
-; The end block is folded with %loop.body3...
-; CHECK-NOT: %loop.inner1.end
 ; CHECK: %loop.body4
 ; CHECK: %loop.inner2.begin
-; The loop.inner2.end block is folded
+; CHECK: %loop.inner2.begin
+; CHECK: %loop.body3
+; CHECK: %loop.inner1.begin
 ; CHECK: %loop.header
 ; CHECK: %bail
 
@@ -559,7 +557,7 @@ define void @test_eh_lpad_successor() personality i8* bitcast (i32 (...)* @__gxx
 ; didn't correctly locate the fallthrough successor, assuming blindly that the
 ; first one was the fallthrough successor. As a result, we would add an
 ; erroneous jump to the landing pad thinking *that* was the default successor.
-; CHECK: test_eh_lpad_successor
+; CHECK-LABEL: test_eh_lpad_successor
 ; CHECK: %entry
 ; CHECK-NOT: jmp
 ; CHECK: %loop
@@ -587,7 +585,7 @@ define void @test_eh_throw() personality i8* bitcast (i32 (...)* @__gxx_personal
 ; fallthrough simply won't occur. Make sure we don't crash trying to update
 ; terminators for such constructs.
 ;
-; CHECK: test_eh_throw
+; CHECK-LABEL: test_eh_throw
 ; CHECK: %entry
 ; CHECK: %cleanup
 
@@ -609,7 +607,7 @@ define void @test_unnatural_cfg_backwards_inner_loop() {
 ; attempt to merge onto the wrong end of the inner loop just because we find it
 ; first. This was reduced from a crasher in GCC's single source.
 ;
-; CHECK: test_unnatural_cfg_backwards_inner_loop
+; CHECK-LABEL: test_unnatural_cfg_backwards_inner_loop
 ; CHECK: %entry
 ; CHECK: %loop2b
 ; CHECK: %loop1
@@ -649,7 +647,7 @@ define void @unanalyzable_branch_to_loop_header() {
 ; fallthrough because that happens to always produce unanalyzable branches on
 ; x86.
 ;
-; CHECK: unanalyzable_branch_to_loop_header
+; CHECK-LABEL: unanalyzable_branch_to_loop_header
 ; CHECK: %entry
 ; CHECK: %loop
 ; CHECK: %exit
@@ -673,7 +671,7 @@ define void @unanalyzable_branch_to_best_succ(i1 %cond) {
 ; This branch is now analyzable and hence the destination block becomes the
 ; hotter one. The right order is entry->bar->exit->foo.
 ;
-; CHECK: unanalyzable_branch_to_best_succ
+; CHECK-LABEL: unanalyzable_branch_to_best_succ
 ; CHECK: %entry
 ; CHECK: %bar
 ; CHECK: %exit
@@ -699,7 +697,7 @@ define void @unanalyzable_branch_to_free_block(float %x) {
 ; Ensure that we can handle unanalyzable branches where the destination block
 ; gets selected as the best free block in the CFG.
 ;
-; CHECK: unanalyzable_branch_to_free_block
+; CHECK-LABEL: unanalyzable_branch_to_free_block
 ; CHECK: %entry
 ; CHECK: %a
 ; CHECK: %b
@@ -729,7 +727,7 @@ define void @many_unanalyzable_branches() {
 ; Ensure that we don't crash as we're building up many unanalyzable branches,
 ; blocks, and loops.
 ;
-; CHECK: many_unanalyzable_branches
+; CHECK-LABEL: many_unanalyzable_branches
 ; CHECK: %entry
 ; CHECK: %exit
 
@@ -948,7 +946,7 @@ define void @benchmark_heapsort(i32 %n, double* nocapture %ra) {
 ;    strange layouts that are siginificantly less efficient, often times maing
 ;    it discontiguous.
 ;
-; CHECK: @benchmark_heapsort
+; CHECK-LABEL: @benchmark_heapsort
 ; CHECK: %entry
 ; First rotated loop top.
 ; CHECK: .p2align
@@ -1456,9 +1454,50 @@ exit:
   ret void
 }
 
+; Because %endif has a higher frequency than %if, the calculations show we
+; shouldn't tail-duplicate %endif so that we can place it after %if. We were
+; previously undercounting the cost by ignoring execution frequency that didn't
+; come from the %if->%endif path.
+; CHECK-LABEL: higher_frequency_succ_tail_dup
+; CHECK: %entry
+; CHECK: %elseif
+; CHECK: %else
+; CHECK: %endif
+; CHECK: %then
+; CHECK: %ret
+define void @higher_frequency_succ_tail_dup(i1 %a, i1 %b, i1 %c) {
+entry:
+  br label %if
+if:                                               ; preds = %entry
+  call void @effect(i32 0)
+  br i1 %a, label %elseif, label %endif, !prof !11 ; even
+
+elseif:                                           ; preds = %if
+  call void @effect(i32 1)
+  br i1 %b, label %else, label %endif, !prof !11 ; even
+
+else:                                             ; preds = %elseif
+  call void @effect(i32 2)
+  br label %endif
+
+endif:                                            ; preds = %if, %elseif, %else
+  br i1 %c, label %then, label %ret, !prof !12 ; 5 to 3
+
+then:                                             ; preds = %endif
+  call void @effect(i32 3)
+  br label %ret
+
+ret:                                              ; preds = %endif, %then
+  ret void
+}
+
+declare void @effect(i32)
+
 !5 = !{!"branch_weights", i32 84, i32 16}
 !6 = !{!"function_entry_count", i32 10}
 !7 = !{!"branch_weights", i32 60, i32 40}
 !8 = !{!"branch_weights", i32 5001, i32 4999}
 !9 = !{!"branch_weights", i32 85, i32 15}
 !10 = !{!"branch_weights", i32 90, i32 10}
+!11 = !{!"branch_weights", i32 1, i32 1}
+!12 = !{!"branch_weights", i32 5, i32 3}
diff --git a/test/CodeGen/X86/block-placement.mir b/test/CodeGen/X86/block-placement.mir
index 7d13c3e529cafef059cc645e11672174e4d95369..c0cd7057d5c6f57c1735ba01f88918cfdc4fbcf1 100644
--- a/test/CodeGen/X86/block-placement.mir
+++ b/test/CodeGen/X86/block-placement.mir
@@ -46,7 +46,7 @@ liveins:
   - { reg: '%rdi' }
   - { reg: '%esi' }
 
-# CHECK: %eax = FAULTING_LOAD_OP %bb.3.null, 1684, killed %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr)
+# CHECK: %eax = FAULTING_OP 1, %bb.3.null, 1684, killed %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr)
 # CHECK-NEXT: JMP_1 %bb.2.not_null
 # CHECK: bb.3.null:
 # CHECK:  bb.4.right:
@@ -66,7 +66,7 @@ body:             |
     successors: %bb.2.null(0x7ffff800), %bb.4.not_null(0x00000800)
     liveins: %rdi
   
-    %eax = FAULTING_LOAD_OP %bb.2.null, 1684, killed %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr)
+    %eax = FAULTING_OP 1, %bb.2.null, 1684, killed %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr)
     JMP_1 %bb.4.not_null
   
   bb.4.not_null:
diff --git a/test/CodeGen/X86/bool-ext-inc.ll b/test/CodeGen/X86/bool-ext-inc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d0967c102149290cf074312e42592f967c5c35b2
--- /dev/null
+++ b/test/CodeGen/X86/bool-ext-inc.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; FIXME: add (sext i1 X), 1 -> zext (not i1 X)
+
+define i32 @sext_inc(i1 zeroext %x) nounwind {
+; CHECK-LABEL: sext_inc:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movzbl %dil, %ecx
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    subl %ecx, %eax
+; CHECK-NEXT:    retq
+  %ext = sext i1 %x to i32
+  %add = add i32 %ext, 1
+  ret i32 %add
+}
+
+; FIXME: add (sext i1 X), 1 -> zext (not i1 X)
+
+define <4 x i32> @sext_inc_vec(<4 x i1> %x) nounwind {
+; CHECK-LABEL: sext_inc_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pslld $31, %xmm0
+; CHECK-NEXT:    psrad $31, %xmm0
+; CHECK-NEXT:    paddd {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
+  %ext = sext <4 x i1> %x to <4 x i32>
+  %add = add <4 x i32> %ext, <i32 1, i32 1, i32 1, i32 1>
+  ret <4 x i32> %add
+}
+
+
diff --git a/test/CodeGen/X86/branchfolding-debugloc.ll b/test/CodeGen/X86/branchfolding-debugloc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3ad8315f083dd823e8917fb0ae08d4ea3c282a10
--- /dev/null
+++ b/test/CodeGen/X86/branchfolding-debugloc.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s | FileCheck %s
+;
+; The test code is generated from the following source code:
+; 
+;  1 extern int bar(int x);
+;  2
+;  3 int foo(int *begin, int *end) {
+;  4   int *i;
+;  5   int ret = 0;
+;  6   for (
+;  7       i = begin ;
+;  8       i != end ;
+;  9       i++)
+; 10   {
+; 11       ret += bar(*i);
+; 12   }
+; 13   return ret;
+; 14 }
+; 
+; CHECK: # %entry
+; CHECK-NOT: # %for.body
+; CHECK: .loc  1 6 3
+; CHECK-NEXT: je  [[BB:.LBB[^ ]+]]
+; CHECK: [[BB]]:{{.}}# %for.end
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @foo(i32* readonly %begin, i32* readnone %end) !dbg !4 {
+entry:
+  %cmp6 = icmp eq i32* %begin, %end, !dbg !9
+  br i1 %cmp6, label %for.end, label %for.body.preheader, !dbg !12
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body, !dbg !13
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %ret.08 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %i.07 = phi i32* [ %incdec.ptr, %for.body ], [ %begin, %for.body.preheader ]
+  %0 = load i32, i32* %i.07, align 4, !dbg !13, !tbaa !15
+  %call = tail call i32 @bar(i32 %0), !dbg !19
+  %add = add nsw i32 %call, %ret.08, !dbg !20
+  %incdec.ptr = getelementptr inbounds i32, i32* %i.07, i64 1, !dbg !21
+  %cmp = icmp eq i32* %incdec.ptr, %end, !dbg !9
+  br i1 %cmp, label %for.end.loopexit, label %for.body, !dbg !12, !llvm.loop !22
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end, !dbg !24
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %ret.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.end.loopexit ]
+  ret i32 %ret.0.lcssa, !dbg !24
+}
+
+declare i32 @bar(i32)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, emissionKind: FullDebug)
+!1 = !DIFile(filename: "foo.c", directory: "b/")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!5 = !DISubroutineType(types: !6)
+!6 = !{!7, !8, !8}
+!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64)
+!9 = !DILocation(line: 8, column: 9, scope: !10)
+!10 = distinct !DILexicalBlock(scope: !11, file: !1, line: 6, column: 3)
+!11 = distinct !DILexicalBlock(scope: !4, file: !1, line: 6, column: 3)
+!12 = !DILocation(line: 6, column: 3, scope: !11)
+!13 = !DILocation(line: 11, column: 18, scope: !14)
+!14 = distinct !DILexicalBlock(scope: !10, file: !1, line: 10, column: 3)
+!15 = !{!16, !16, i64 0}
+!16 = !{!"int", !17, i64 0}
+!17 = !{!"omnipotent char", !18, i64 0}
+!18 = !{!"Simple C/C++ TBAA"}
+!19 = !DILocation(line: 11, column: 14, scope: !14)
+!20 = !DILocation(line: 11, column: 11, scope: !14)
+!21 = !DILocation(line: 9, column: 8, scope: !10)
+!22 = distinct !{!22, !12, !23}
+!23 = !DILocation(line: 12, column: 3, scope: !11)
+!24 = !DILocation(line: 13, column: 3, scope: !4)
diff --git a/test/CodeGen/X86/brcond.ll b/test/CodeGen/X86/brcond.ll
index f4db3ba7fecb5ea7ddca8f65f52984181e963315..ce8a0dab98ce9f117e16c0a004a7a876e811ce22 100644
--- a/test/CodeGen/X86/brcond.ll
+++ b/test/CodeGen/X86/brcond.ll
@@ -30,45 +30,6 @@ declare i32 @foo(...)
 declare i32 @bar(...)
 
 
-
-; PR3351 - (P == 0) & (Q == 0) -> (P|Q) == 0
-define i32 @test2(i32* %P, i32* %Q) nounwind ssp {
-entry:
-  %a = icmp eq i32* %P, null                    ; <i1> [#uses=1]
-  %b = icmp eq i32* %Q, null                    ; <i1> [#uses=1]
-  %c = and i1 %a, %b
-  br i1 %c, label %bb1, label %return
-
-bb1:                                              ; preds = %entry
-  ret i32 4
-
-return:                                           ; preds = %entry
-  ret i32 192
-; CHECK-LABEL: test2:
-; CHECK:	movl	4(%esp), %eax
-; CHECK-NEXT:	orl	8(%esp), %eax
-; CHECK-NEXT:	jne	LBB1_2
-}
-
-; PR3351 - (P != 0) | (Q != 0) -> (P|Q) != 0
-define i32 @test3(i32* %P, i32* %Q) nounwind ssp {
-entry:
-  %a = icmp ne i32* %P, null                    ; <i1> [#uses=1]
-  %b = icmp ne i32* %Q, null                    ; <i1> [#uses=1]
-  %c = or i1 %a, %b
-  br i1 %c, label %bb1, label %return
-
-bb1:                                              ; preds = %entry
-  ret i32 4
-
-return:                                           ; preds = %entry
-  ret i32 192
-; CHECK-LABEL: test3:
-; CHECK:	movl	4(%esp), %eax
-; CHECK-NEXT:	orl	8(%esp), %eax
-; CHECK-NEXT:	je	LBB2_2
-}
-
 ; <rdar://problem/7598384>:
 ;
 ;    jCC  L1
diff --git a/test/CodeGen/X86/bt.ll b/test/CodeGen/X86/bt.ll
index 6576f33a5b9c25ce842bd9388f5407794a79be52..cebcba38bd4fe871fbb144d1b86cfb248319aa75 100644
--- a/test/CodeGen/X86/bt.ll
+++ b/test/CodeGen/X86/bt.ll
@@ -43,7 +43,7 @@ define void @test2b(i32 %x, i32 %n) nounwind {
 ; CHECK-LABEL: test2b:
 ; CHECK:       # BB#0: # %entry
 ; CHECK-NEXT:    btl %esi, %edi
-; CHECK-NEXT:    jb .LBB1_2
+; CHECK-NEXT:    jae .LBB1_1
 ;
 entry:
   %tmp29 = lshr i32 %x, %n
@@ -83,7 +83,7 @@ define void @atest2b(i32 %x, i32 %n) nounwind {
 ; CHECK-LABEL: atest2b:
 ; CHECK:       # BB#0: # %entry
 ; CHECK-NEXT:    btl %esi, %edi
-; CHECK-NEXT:    jb .LBB3_2
+; CHECK-NEXT:    jae .LBB3_1
 ;
 entry:
   %tmp29 = ashr i32 %x, %n
@@ -103,7 +103,7 @@ define void @test3(i32 %x, i32 %n) nounwind {
 ; CHECK-LABEL: test3:
 ; CHECK:       # BB#0: # %entry
 ; CHECK-NEXT:    btl %esi, %edi
-; CHECK-NEXT:    jb .LBB4_2
+; CHECK-NEXT:    jae .LBB4_1
 ;
 entry:
   %tmp29 = shl i32 1, %n
@@ -123,7 +123,7 @@ define void @test3b(i32 %x, i32 %n) nounwind {
 ; CHECK-LABEL: test3b:
 ; CHECK:       # BB#0: # %entry
 ; CHECK-NEXT:    btl %esi, %edi
-; CHECK-NEXT:    jb .LBB5_2
+; CHECK-NEXT:    jae .LBB5_1
 ;
 entry:
   %tmp29 = shl i32 1, %n
diff --git a/test/CodeGen/X86/buildvec-insertvec.ll b/test/CodeGen/X86/buildvec-insertvec.ll
index 616d352a75d3f3ed7130db0e38be8ad3669167c9..730376acdc9379f33ceb97ef842af722789ffc41 100644
--- a/test/CodeGen/X86/buildvec-insertvec.ll
+++ b/test/CodeGen/X86/buildvec-insertvec.ll
@@ -1,15 +1,29 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE41
 
 define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind {
-; CHECK-LABEL: foo:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    cvttps2dq %xmm0, %xmm0
-; CHECK-NEXT:    movl $255, %eax
-; CHECK-NEXT:    pinsrd $3, %eax, %xmm0
-; CHECK-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; CHECK-NEXT:    movd %xmm0, (%rdi)
-; CHECK-NEXT:    retq
+; SSE2-LABEL: foo:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE2-NEXT:    movl $255, %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[2,0]
+; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0]
+; SSE2-NEXT:    andps {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    packuswb %xmm0, %xmm0
+; SSE2-NEXT:    movd %xmm0, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: foo:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    cvttps2dq %xmm0, %xmm0
+; SSE41-NEXT:    movl $255, %eax
+; SSE41-NEXT:    pinsrd $3, %eax, %xmm0
+; SSE41-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE41-NEXT:    movd %xmm0, (%rdi)
+; SSE41-NEXT:    retq
   %t0 = fptoui <3 x float> %in to <3 x i8>
   %t1 = shufflevector <3 x i8> %t0, <3 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 undef>
   %t2 = insertelement <4 x i8> %t1, i8 -1, i32 3
@@ -21,10 +35,21 @@ define void @foo(<3 x float> %in, <4 x i8>* nocapture %out) nounwind {
 ; blend with a zero vector if the build_vector contains negative zero.
 
 define <4 x float> @test_negative_zero_1(<4 x float> %A) {
-; CHECK-LABEL: test_negative_zero_1:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2],zero
-; CHECK-NEXT:    retq
+; SSE2-LABEL: test_negative_zero_1:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movaps %xmm0, %xmm1
+; SSE2-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    xorps %xmm2, %xmm2
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3]
+; SSE2-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_negative_zero_1:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2],zero
+; SSE41-NEXT:    retq
 entry:
   %0 = extractelement <4 x float> %A, i32 0
   %1 = insertelement <4 x float> undef, float %0, i32 0
@@ -48,12 +73,19 @@ entry:
 }
 
 define <4 x float> @test_buildvector_v4f32_register(float %f0, float %f1, float %f2, float %f3) {
-; CHECK-LABEL: test_buildvector_v4f32_register:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; CHECK-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; CHECK-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
-; CHECK-NEXT:    retq
+; SSE2-LABEL: test_buildvector_v4f32_register:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1]
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_buildvector_v4f32_register:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0]
+; SSE41-NEXT:    retq
   %ins0 = insertelement <4 x float> undef, float %f0, i32 0
   %ins1 = insertelement <4 x float> %ins0, float %f1, i32 1
   %ins2 = insertelement <4 x float> %ins1, float %f2, i32 2
@@ -62,13 +94,24 @@ define <4 x float> @test_buildvector_v4f32_register(float %f0, float %f1, float
 }
 
 define <4 x float> @test_buildvector_v4f32_load(float* %p0, float* %p1, float* %p2, float* %p3) {
-; CHECK-LABEL: test_buildvector_v4f32_load:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
-; CHECK-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
-; CHECK-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; CHECK-NEXT:    retq
+; SSE2-LABEL: test_buildvector_v4f32_load:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_buildvector_v4f32_load:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; SSE41-NEXT:    retq
   %f0 = load float, float* %p0, align 4
   %f1 = load float, float* %p1, align 4
   %f2 = load float, float* %p2, align 4
@@ -81,12 +124,20 @@ define <4 x float> @test_buildvector_v4f32_load(float* %p0, float* %p1, float* %
 }
 
 define <4 x float> @test_buildvector_v4f32_partial_load(float %f0, float %f1, float %f2, float* %p3) {
-; CHECK-LABEL: test_buildvector_v4f32_partial_load:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; CHECK-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
-; CHECK-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; CHECK-NEXT:    retq
+; SSE2-LABEL: test_buildvector_v4f32_partial_load:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_buildvector_v4f32_partial_load:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3]
+; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; SSE41-NEXT:    retq
   %f3 = load float, float* %p3, align 4
   %ins0 = insertelement <4 x float> undef, float %f0, i32 0
   %ins1 = insertelement <4 x float> %ins0, float %f1, i32 1
@@ -94,3 +145,405 @@ define <4 x float> @test_buildvector_v4f32_partial_load(float %f0, float %f1, fl
   %ins3 = insertelement <4 x float> %ins2, float %f3, i32 3
   ret <4 x float> %ins3
 }
+
+define <4 x i32> @test_buildvector_v4i32_register(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; SSE2-LABEL: test_buildvector_v4i32_register:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movd %esi, %xmm1
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2-NEXT:    movd %edx, %xmm2
+; SSE2-NEXT:    movd %edi, %xmm0
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_buildvector_v4i32_register:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movd %edi, %xmm0
+; SSE41-NEXT:    pinsrd $1, %esi, %xmm0
+; SSE41-NEXT:    pinsrd $2, %edx, %xmm0
+; SSE41-NEXT:    pinsrd $3, %ecx, %xmm0
+; SSE41-NEXT:    retq
+  %ins0 = insertelement <4 x i32> undef, i32 %a0, i32 0
+  %ins1 = insertelement <4 x i32> %ins0, i32 %a1, i32 1
+  %ins2 = insertelement <4 x i32> %ins1, i32 %a2, i32 2
+  %ins3 = insertelement <4 x i32> %ins2, i32 %a3, i32 3
+  ret <4 x i32> %ins3
+}
+
+define <4 x i32> @test_buildvector_v4i32_partial(i32 %a0, i32 %a3) {
+; SSE2-LABEL: test_buildvector_v4i32_partial:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd %edi, %xmm0
+; SSE2-NEXT:    movd %esi, %xmm1
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_buildvector_v4i32_partial:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movd %edi, %xmm0
+; SSE41-NEXT:    pinsrd $3, %esi, %xmm0
+; SSE41-NEXT:    retq
+  %ins0 = insertelement <4 x i32> undef, i32   %a0, i32 0
+  %ins1 = insertelement <4 x i32> %ins0, i32 undef, i32 1
+  %ins2 = insertelement <4 x i32> %ins1, i32 undef, i32 2
+  %ins3 = insertelement <4 x i32> %ins2, i32   %a3, i32 3
+  ret <4 x i32> %ins3
+}
+
+define <4 x i32> @test_buildvector_v4i32_register_zero(i32 %a0, i32 %a2, i32 %a3) {
+; CHECK-LABEL: test_buildvector_v4i32_register_zero:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movd %edx, %xmm0
+; CHECK-NEXT:    movd %esi, %xmm1
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    movd %edi, %xmm0
+; CHECK-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; CHECK-NEXT:    retq
+  %ins0 = insertelement <4 x i32> undef, i32 %a0, i32 0
+  %ins1 = insertelement <4 x i32> %ins0, i32   0, i32 1
+  %ins2 = insertelement <4 x i32> %ins1, i32 %a2, i32 2
+  %ins3 = insertelement <4 x i32> %ins2, i32 %a3, i32 3
+  ret <4 x i32> %ins3
+}
+
+define <4 x i32> @test_buildvector_v4i32_register_zero_2(i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: test_buildvector_v4i32_register_zero_2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movd %edx, %xmm0
+; CHECK-NEXT:    movd %esi, %xmm1
+; CHECK-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; CHECK-NEXT:    movd %edi, %xmm0
+; CHECK-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1]
+; CHECK-NEXT:    retq
+  %ins0 = insertelement <4 x i32> undef, i32   0, i32 0
+  %ins1 = insertelement <4 x i32> %ins0, i32 %a1, i32 1
+  %ins2 = insertelement <4 x i32> %ins1, i32 %a2, i32 2
+  %ins3 = insertelement <4 x i32> %ins2, i32 %a3, i32 3
+  ret <4 x i32> %ins3
+}
+
+define <8 x i16> @test_buildvector_v8i16_register(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) {
+; SSE2-LABEL: test_buildvector_v8i16_register:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    movd %r9d, %xmm1
+; SSE2-NEXT:    movd %esi, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    movd %edx, %xmm1
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movd %r8d, %xmm3
+; SSE2-NEXT:    movd %edi, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_buildvector_v8i16_register:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movd %edi, %xmm0
+; SSE41-NEXT:    pinsrw $1, %esi, %xmm0
+; SSE41-NEXT:    pinsrw $2, %edx, %xmm0
+; SSE41-NEXT:    pinsrw $3, %ecx, %xmm0
+; SSE41-NEXT:    pinsrw $4, %r8d, %xmm0
+; SSE41-NEXT:    pinsrw $5, %r9d, %xmm0
+; SSE41-NEXT:    pinsrw $6, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    pinsrw $7, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    retq
+  %ins0 = insertelement <8 x i16> undef, i16 %a0, i32 0
+  %ins1 = insertelement <8 x i16> %ins0, i16 %a1, i32 1
+  %ins2 = insertelement <8 x i16> %ins1, i16 %a2, i32 2
+  %ins3 = insertelement <8 x i16> %ins2, i16 %a3, i32 3
+  %ins4 = insertelement <8 x i16> %ins3, i16 %a4, i32 4
+  %ins5 = insertelement <8 x i16> %ins4, i16 %a5, i32 5
+  %ins6 = insertelement <8 x i16> %ins5, i16 %a6, i32 6
+  %ins7 = insertelement <8 x i16> %ins6, i16 %a7, i32 7
+  ret <8 x i16> %ins7
+}
+
+define <8 x i16> @test_buildvector_v8i16_partial(i16 %a1, i16 %a3, i16 %a4, i16 %a5) {
+; CHECK-LABEL: test_buildvector_v8i16_partial:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pxor %xmm0, %xmm0
+; CHECK-NEXT:    pinsrw $1, %edi, %xmm0
+; CHECK-NEXT:    pinsrw $3, %esi, %xmm0
+; CHECK-NEXT:    pinsrw $4, %edx, %xmm0
+; CHECK-NEXT:    pinsrw $5, %ecx, %xmm0
+; CHECK-NEXT:    retq
+  %ins0 = insertelement <8 x i16> undef, i16 undef, i32 0
+  %ins1 = insertelement <8 x i16> %ins0, i16   %a1, i32 1
+  %ins2 = insertelement <8 x i16> %ins1, i16 undef, i32 2
+  %ins3 = insertelement <8 x i16> %ins2, i16   %a3, i32 3
+  %ins4 = insertelement <8 x i16> %ins3, i16   %a4, i32 4
+  %ins5 = insertelement <8 x i16> %ins4, i16   %a5, i32 5
+  %ins6 = insertelement <8 x i16> %ins5, i16 undef, i32 6
+  %ins7 = insertelement <8 x i16> %ins6, i16 undef, i32 7
+  ret <8 x i16> %ins7
+}
+
+define <8 x i16> @test_buildvector_v8i16_register_zero(i16 %a0, i16 %a3, i16 %a4, i16 %a5) {
+; CHECK-LABEL: test_buildvector_v8i16_register_zero:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pxor %xmm0, %xmm0
+; CHECK-NEXT:    pinsrw $0, %edi, %xmm0
+; CHECK-NEXT:    pinsrw $3, %esi, %xmm0
+; CHECK-NEXT:    pinsrw $4, %edx, %xmm0
+; CHECK-NEXT:    pinsrw $5, %ecx, %xmm0
+; CHECK-NEXT:    retq
+  %ins0 = insertelement <8 x i16> undef, i16   %a0, i32 0
+  %ins1 = insertelement <8 x i16> %ins0, i16     0, i32 1
+  %ins2 = insertelement <8 x i16> %ins1, i16     0, i32 2
+  %ins3 = insertelement <8 x i16> %ins2, i16   %a3, i32 3
+  %ins4 = insertelement <8 x i16> %ins3, i16   %a4, i32 4
+  %ins5 = insertelement <8 x i16> %ins4, i16   %a5, i32 5
+  %ins6 = insertelement <8 x i16> %ins5, i16     0, i32 6
+  %ins7 = insertelement <8 x i16> %ins6, i16     0, i32 7
+  ret <8 x i16> %ins7
+}
+
+define <8 x i16> @test_buildvector_v8i16_register_zero_2(i16 %a1, i16 %a3, i16 %a4, i16 %a5) {
+; CHECK-LABEL: test_buildvector_v8i16_register_zero_2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pxor %xmm0, %xmm0
+; CHECK-NEXT:    pinsrw $1, %edi, %xmm0
+; CHECK-NEXT:    pinsrw $3, %esi, %xmm0
+; CHECK-NEXT:    pinsrw $4, %edx, %xmm0
+; CHECK-NEXT:    pinsrw $5, %ecx, %xmm0
+; CHECK-NEXT:    retq
+  %ins0 = insertelement <8 x i16> undef, i16     0, i32 0
+  %ins1 = insertelement <8 x i16> %ins0, i16   %a1, i32 1
+  %ins2 = insertelement <8 x i16> %ins1, i16     0, i32 2
+  %ins3 = insertelement <8 x i16> %ins2, i16   %a3, i32 3
+  %ins4 = insertelement <8 x i16> %ins3, i16   %a4, i32 4
+  %ins5 = insertelement <8 x i16> %ins4, i16   %a5, i32 5
+  %ins6 = insertelement <8 x i16> %ins5, i16     0, i32 6
+  %ins7 = insertelement <8 x i16> %ins6, i16     0, i32 7
+  ret <8 x i16> %ins7
+}
+
+define <16 x i8> @test_buildvector_v16i8_register(i8 %a0, i8 %a1, i8 %a2, i8 %a3, i8 %a4, i8 %a5, i8 %a6, i8 %a7, i8 %a8, i8 %a9, i8 %a10, i8 %a11, i8 %a12, i8 %a13, i8 %a14, i8 %a15) {
+; SSE2-LABEL: test_buildvector_v16i8_register:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    movd %r9d, %xmm1
+; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    movd %esi, %xmm2
+; SSE2-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    movd %edx, %xmm3
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE2-NEXT:    movd %r8d, %xmm1
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE2-NEXT:    movd %edi, %xmm0
+; SSE2-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_buildvector_v16i8_register:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    movd %edi, %xmm0
+; SSE41-NEXT:    pinsrb $1, %esi, %xmm0
+; SSE41-NEXT:    pinsrb $2, %edx, %xmm0
+; SSE41-NEXT:    pinsrb $3, %ecx, %xmm0
+; SSE41-NEXT:    pinsrb $4, %r8d, %xmm0
+; SSE41-NEXT:    pinsrb $5, %r9d, %xmm0
+; SSE41-NEXT:    pinsrb $6, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    pinsrb $7, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    pinsrb $8, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    pinsrb $9, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    pinsrb $10, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    pinsrb $11, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    pinsrb $12, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    pinsrb $13, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    pinsrb $14, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    pinsrb $15, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    retq
+  %ins0  = insertelement <16 x i8> undef,  i8 %a0,  i32 0
+  %ins1  = insertelement <16 x i8> %ins0,  i8 %a1,  i32 1
+  %ins2  = insertelement <16 x i8> %ins1,  i8 %a2,  i32 2
+  %ins3  = insertelement <16 x i8> %ins2,  i8 %a3,  i32 3
+  %ins4  = insertelement <16 x i8> %ins3,  i8 %a4,  i32 4
+  %ins5  = insertelement <16 x i8> %ins4,  i8 %a5,  i32 5
+  %ins6  = insertelement <16 x i8> %ins5,  i8 %a6,  i32 6
+  %ins7  = insertelement <16 x i8> %ins6,  i8 %a7,  i32 7
+  %ins8  = insertelement <16 x i8> %ins7,  i8 %a8,  i32 8
+  %ins9  = insertelement <16 x i8> %ins8,  i8 %a9,  i32 9
+  %ins10 = insertelement <16 x i8> %ins9,  i8 %a10, i32 10
+  %ins11 = insertelement <16 x i8> %ins10, i8 %a11, i32 11
+  %ins12 = insertelement <16 x i8> %ins11, i8 %a12, i32 12
+  %ins13 = insertelement <16 x i8> %ins12, i8 %a13, i32 13
+  %ins14 = insertelement <16 x i8> %ins13, i8 %a14, i32 14
+  %ins15 = insertelement <16 x i8> %ins14, i8 %a15, i32 15
+  ret <16 x i8> %ins15
+}
+
+define <16 x i8> @test_buildvector_v16i8_partial(i8 %a2, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) {
+; SSE2-LABEL: test_buildvector_v16i8_partial:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movzbl %dil, %eax
+; SSE2-NEXT:    pinsrw $1, %eax, %xmm0
+; SSE2-NEXT:    movzbl %sil, %eax
+; SSE2-NEXT:    pinsrw $3, %eax, %xmm0
+; SSE2-NEXT:    movzbl %dl, %eax
+; SSE2-NEXT:    pinsrw $4, %eax, %xmm0
+; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    pinsrw $5, %ecx, %xmm0
+; SSE2-NEXT:    movzbl %r8b, %eax
+; SSE2-NEXT:    pinsrw $6, %eax, %xmm0
+; SSE2-NEXT:    shll $8, %r9d
+; SSE2-NEXT:    pinsrw $7, %r9d, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_buildvector_v16i8_partial:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pinsrb $2, %edi, %xmm0
+; SSE41-NEXT:    pinsrb $6, %esi, %xmm0
+; SSE41-NEXT:    pinsrb $8, %edx, %xmm0
+; SSE41-NEXT:    pinsrb $11, %ecx, %xmm0
+; SSE41-NEXT:    pinsrb $12, %r8d, %xmm0
+; SSE41-NEXT:    pinsrb $15, %r9d, %xmm0
+; SSE41-NEXT:    retq
+  %ins0  = insertelement <16 x i8> undef,  i8 undef, i32 0
+  %ins1  = insertelement <16 x i8> %ins0,  i8 undef, i32 1
+  %ins2  = insertelement <16 x i8> %ins1,  i8   %a2, i32 2
+  %ins3  = insertelement <16 x i8> %ins2,  i8 undef, i32 3
+  %ins4  = insertelement <16 x i8> %ins3,  i8 undef, i32 4
+  %ins5  = insertelement <16 x i8> %ins4,  i8 undef, i32 5
+  %ins6  = insertelement <16 x i8> %ins5,  i8   %a6, i32 6
+  %ins7  = insertelement <16 x i8> %ins6,  i8 undef, i32 7
+  %ins8  = insertelement <16 x i8> %ins7,  i8   %a8, i32 8
+  %ins9  = insertelement <16 x i8> %ins8,  i8 undef, i32 9
+  %ins10 = insertelement <16 x i8> %ins9,  i8 undef, i32 10
+  %ins11 = insertelement <16 x i8> %ins10, i8  %a11, i32 11
+  %ins12 = insertelement <16 x i8> %ins11, i8  %a12, i32 12
+  %ins13 = insertelement <16 x i8> %ins12, i8 undef, i32 13
+  %ins14 = insertelement <16 x i8> %ins13, i8 undef, i32 14
+  %ins15 = insertelement <16 x i8> %ins14, i8  %a15, i32 15
+  ret <16 x i8> %ins15
+}
+
+define <16 x i8> @test_buildvector_v16i8_register_zero(i8 %a0, i8 %a4, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) {
+; SSE2-LABEL: test_buildvector_v16i8_register_zero:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movzbl %sil, %eax
+; SSE2-NEXT:    movzbl %dil, %esi
+; SSE2-NEXT:    movd %esi, %xmm0
+; SSE2-NEXT:    pinsrw $2, %eax, %xmm0
+; SSE2-NEXT:    movzbl %dl, %eax
+; SSE2-NEXT:    pinsrw $3, %eax, %xmm0
+; SSE2-NEXT:    movzbl %cl, %eax
+; SSE2-NEXT:    pinsrw $4, %eax, %xmm0
+; SSE2-NEXT:    shll $8, %r8d
+; SSE2-NEXT:    pinsrw $5, %r8d, %xmm0
+; SSE2-NEXT:    movzbl %r9b, %eax
+; SSE2-NEXT:    pinsrw $6, %eax, %xmm0
+; SSE2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    shll $8, %eax
+; SSE2-NEXT:    pinsrw $7, %eax, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_buildvector_v16i8_register_zero:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pinsrb $0, %edi, %xmm0
+; SSE41-NEXT:    pinsrb $4, %esi, %xmm0
+; SSE41-NEXT:    pinsrb $6, %edx, %xmm0
+; SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
+; SSE41-NEXT:    pinsrb $11, %r8d, %xmm0
+; SSE41-NEXT:    pinsrb $12, %r9d, %xmm0
+; SSE41-NEXT:    pinsrb $15, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    retq
+  %ins0  = insertelement <16 x i8> undef,  i8   %a0, i32 0
+  %ins1  = insertelement <16 x i8> %ins0,  i8     0, i32 1
+  %ins2  = insertelement <16 x i8> %ins1,  i8     0, i32 2
+  %ins3  = insertelement <16 x i8> %ins2,  i8     0, i32 3
+  %ins4  = insertelement <16 x i8> %ins3,  i8   %a4, i32 4
+  %ins5  = insertelement <16 x i8> %ins4,  i8     0, i32 5
+  %ins6  = insertelement <16 x i8> %ins5,  i8   %a6, i32 6
+  %ins7  = insertelement <16 x i8> %ins6,  i8     0, i32 7
+  %ins8  = insertelement <16 x i8> %ins7,  i8   %a8, i32 8
+  %ins9  = insertelement <16 x i8> %ins8,  i8     0, i32 9
+  %ins10 = insertelement <16 x i8> %ins9,  i8     0, i32 10
+  %ins11 = insertelement <16 x i8> %ins10, i8  %a11, i32 11
+  %ins12 = insertelement <16 x i8> %ins11, i8  %a12, i32 12
+  %ins13 = insertelement <16 x i8> %ins12, i8     0, i32 13
+  %ins14 = insertelement <16 x i8> %ins13, i8     0, i32 14
+  %ins15 = insertelement <16 x i8> %ins14, i8  %a15, i32 15
+  ret <16 x i8> %ins15
+}
+
+define <16 x i8> @test_buildvector_v16i8_register_zero_2(i8 %a2, i8 %a3, i8 %a6, i8 %a8, i8 %a11, i8 %a12, i8 %a15) {
+; SSE2-LABEL: test_buildvector_v16i8_register_zero_2:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    shll $8, %esi
+; SSE2-NEXT:    movzbl %dil, %eax
+; SSE2-NEXT:    orl %esi, %eax
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pinsrw $1, %eax, %xmm0
+; SSE2-NEXT:    movzbl %dl, %eax
+; SSE2-NEXT:    pinsrw $3, %eax, %xmm0
+; SSE2-NEXT:    movzbl %cl, %eax
+; SSE2-NEXT:    pinsrw $4, %eax, %xmm0
+; SSE2-NEXT:    shll $8, %r8d
+; SSE2-NEXT:    pinsrw $5, %r8d, %xmm0
+; SSE2-NEXT:    movzbl %r9b, %eax
+; SSE2-NEXT:    pinsrw $6, %eax, %xmm0
+; SSE2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    shll $8, %eax
+; SSE2-NEXT:    pinsrw $7, %eax, %xmm0
+; SSE2-NEXT:    retq
+;
+; SSE41-LABEL: test_buildvector_v16i8_register_zero_2:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pxor %xmm0, %xmm0
+; SSE41-NEXT:    pinsrb $2, %edi, %xmm0
+; SSE41-NEXT:    pinsrb $3, %esi, %xmm0
+; SSE41-NEXT:    pinsrb $6, %edx, %xmm0
+; SSE41-NEXT:    pinsrb $8, %ecx, %xmm0
+; SSE41-NEXT:    pinsrb $11, %r8d, %xmm0
+; SSE41-NEXT:    pinsrb $12, %r9d, %xmm0
+; SSE41-NEXT:    pinsrb $15, {{[0-9]+}}(%rsp), %xmm0
+; SSE41-NEXT:    retq
+  %ins0  = insertelement <16 x i8> undef,  i8     0, i32 0
+  %ins1  = insertelement <16 x i8> %ins0,  i8     0, i32 1
+  %ins2  = insertelement <16 x i8> %ins1,  i8   %a2, i32 2
+  %ins3  = insertelement <16 x i8> %ins2,  i8   %a3, i32 3
+  %ins4  = insertelement <16 x i8> %ins3,  i8     0, i32 4
+  %ins5  = insertelement <16 x i8> %ins4,  i8     0, i32 5
+  %ins6  = insertelement <16 x i8> %ins5,  i8   %a6, i32 6
+  %ins7  = insertelement <16 x i8> %ins6,  i8     0, i32 7
+  %ins8  = insertelement <16 x i8> %ins7,  i8   %a8, i32 8
+  %ins9  = insertelement <16 x i8> %ins8,  i8     0, i32 9
+  %ins10 = insertelement <16 x i8> %ins9,  i8     0, i32 10
+  %ins11 = insertelement <16 x i8> %ins10, i8  %a11, i32 11
+  %ins12 = insertelement <16 x i8> %ins11, i8  %a12, i32 12
+  %ins13 = insertelement <16 x i8> %ins12, i8     0, i32 13
+  %ins14 = insertelement <16 x i8> %ins13, i8     0, i32 14
+  %ins15 = insertelement <16 x i8> %ins14, i8  %a15, i32 15
+  ret <16 x i8> %ins15
+}
diff --git a/test/CodeGen/X86/bypass-slow-division-32.ll b/test/CodeGen/X86/bypass-slow-division-32.ll
index ea545d22385c14e07ad78bf9f5a28e9425f41dda..9f266647d8aa22b522f70d91dea3618026afb11a 100644
--- a/test/CodeGen/X86/bypass-slow-division-32.ll
+++ b/test/CodeGen/X86/bypass-slow-division-32.ll
@@ -95,20 +95,19 @@ define i32 @Test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
 ; CHECK-NEXT:    idivl %ebx
 ; CHECK-NEXT:    movl %eax, %esi
 ; CHECK-NEXT:    testl $-256, %edi
-; CHECK-NEXT:    jne .LBB3_5
-; CHECK-NEXT:    jmp .LBB3_4
-; CHECK-NEXT:  .LBB3_1:
-; CHECK-NEXT:    movzbl %cl, %eax
-; CHECK-NEXT:    # kill: %EAX<def> %EAX<kill> %AX<def>
-; CHECK-NEXT:    divb %bl
-; CHECK-NEXT:    movzbl %al, %esi
-; CHECK-NEXT:    testl $-256, %edi
 ; CHECK-NEXT:    je .LBB3_4
 ; CHECK-NEXT:  .LBB3_5:
 ; CHECK-NEXT:    xorl %edx, %edx
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    divl %ebx
 ; CHECK-NEXT:    jmp .LBB3_6
+; CHECK-NEXT:  .LBB3_1:
+; CHECK-NEXT:    movzbl %cl, %eax
+; CHECK-NEXT:    # kill: %EAX<def> %EAX<kill> %AX<def>
+; CHECK-NEXT:    divb %bl
+; CHECK-NEXT:    movzbl %al, %esi
+; CHECK-NEXT:    testl $-256, %edi
+; CHECK-NEXT:    jne .LBB3_5
 ; CHECK-NEXT:  .LBB3_4:
 ; CHECK-NEXT:    movzbl %cl, %eax
 ; CHECK-NEXT:    # kill: %EAX<def> %EAX<kill> %AX<def>
diff --git a/test/CodeGen/X86/catchpad-lifetime.ll b/test/CodeGen/X86/catchpad-lifetime.ll
index 77d3f25057cfa65e099fd5e9cb3d2984bf2657f4..d85adec360c8a2a8465e598178bc5a6d686557fb 100644
--- a/test/CodeGen/X86/catchpad-lifetime.ll
+++ b/test/CodeGen/X86/catchpad-lifetime.ll
@@ -26,9 +26,9 @@ catch.pad:                                        ; preds = %catch.dispatch
   %cp = catchpad within %cs [i8* null, i32 0, i8** %alloca1]
   store volatile i8* null, i8** %alloca1
   %bc1 = bitcast i8** %alloca1 to i8*
-  call void @llvm.lifetime.end(i64 4, i8* nonnull %bc1)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %bc1)
   %bc2 = bitcast i8** %alloca2 to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %bc2)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %bc2)
   store volatile i8* null, i8** %alloca1
   unreachable
 
@@ -63,9 +63,9 @@ catch.pad:                                        ; preds = %catch.dispatch
   %cp = catchpad within %cs [i8* null, i32 0, i8** null]
   store volatile i8* null, i8** %alloca1
   %bc1 = bitcast i8** %alloca1 to i8*
-  call void @llvm.lifetime.end(i64 4, i8* nonnull %bc1)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %bc1)
   %bc2 = bitcast i8** %alloca2 to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %bc2)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %bc2)
   store volatile i8* null, i8** %alloca1
   unreachable
 
@@ -83,9 +83,9 @@ unreachable:                                      ; preds = %entry
 
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
 
 attributes #0 = { argmemonly nounwind }
diff --git a/test/CodeGen/X86/catchpad-weight.ll b/test/CodeGen/X86/catchpad-weight.ll
index 60939bc6b03ee3da12f91affbccb9f6650c53fdf..6caf0c6012f785b380279db49dca7aedef103c4e 100644
--- a/test/CodeGen/X86/catchpad-weight.ll
+++ b/test/CodeGen/X86/catchpad-weight.ll
@@ -26,7 +26,7 @@ define i32 @main() #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to
 entry:
   %o = alloca %struct.HasDtor, align 1
   %0 = getelementptr inbounds %struct.HasDtor, %struct.HasDtor* %o, i64 0, i32 0
-  call void @llvm.lifetime.start(i64 1, i8* %0) #4
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %0) #4
   invoke void @"\01?may_throw@@YAXXZ"()
           to label %try.cont unwind label %catch.dispatch
 
@@ -39,7 +39,7 @@ catch.5:                                          ; preds = %catch.dispatch
 
 try.cont:                                         ; preds = %entry, %catch, %catch.3, %catch.5
   call void @"\01??1HasDtor@@QEAA@XZ"(%struct.HasDtor* nonnull %o) #4
-  call void @llvm.lifetime.end(i64 1, i8* %0) #4
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %0) #4
   ret i32 0
 
 catch.dispatch.1:                                 ; preds = %catch.dispatch
@@ -63,7 +63,7 @@ ehcleanup:                                        ; preds = %catchendblock
 }
 
 ; Function Attrs: nounwind argmemonly
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 
 declare void @"\01?may_throw@@YAXXZ"() #2
 
@@ -73,7 +73,7 @@ declare i32 @__CxxFrameHandler3(...)
 declare void @"\01??1HasDtor@@QEAA@XZ"(%struct.HasDtor*) #3
 
 ; Function Attrs: nounwind argmemonly
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { nounwind argmemonly }
diff --git a/test/CodeGen/X86/chain_order.ll b/test/CodeGen/X86/chain_order.ll
index 8c3aa6e15156873d7db91ae8de59624ee6343db4..cc48e5b6149c7cf43c5cdbf50fef4806eb518679 100644
--- a/test/CodeGen/X86/chain_order.ll
+++ b/test/CodeGen/X86/chain_order.ll
@@ -11,9 +11,9 @@ define void @cftx020(double* nocapture %a) {
 ; CHECK-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
 ; CHECK-NEXT:    vaddpd %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    vmovupd (%rdi), %xmm1
-; CHECK-NEXT:    vsubpd 16(%rdi), %xmm1, %xmm1
 ; CHECK-NEXT:    vmovupd %xmm0, (%rdi)
-; CHECK-NEXT:    vmovupd %xmm1, 16(%rdi)
+; CHECK-NEXT:    vsubpd 16(%rdi), %xmm1, %xmm0
+; CHECK-NEXT:    vmovupd %xmm0, 16(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   %0 = load double, double* %a, align 8
diff --git a/test/CodeGen/X86/clear_upper_vector_element_bits.ll b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
index 9256717f155d97499953910374d9ddf6e70efd35..c425e3a92d17370a54abc66c05574553d245cfc6 100644
--- a/test/CodeGen/X86/clear_upper_vector_element_bits.ll
+++ b/test/CodeGen/X86/clear_upper_vector_element_bits.ll
@@ -35,6 +35,44 @@ define <2 x i64> @_clearupper2xi64a(<2 x i64>) nounwind {
   ret <2 x i64> %v1
 }
 
+define <4 x i64> @_clearupper4xi64a(<4 x i64>) nounwind {
+; SSE-LABEL: _clearupper4xi64a:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [4294967295,4294967295]
+; SSE-NEXT:    andps %xmm2, %xmm0
+; SSE-NEXT:    andps %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: _clearupper4xi64a:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: _clearupper4xi64a:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %x0 = extractelement <4 x i64> %0, i32 0
+  %x1 = extractelement <4 x i64> %0, i32 1
+  %x2 = extractelement <4 x i64> %0, i32 2
+  %x3 = extractelement <4 x i64> %0, i32 3
+  %trunc0 = trunc i64 %x0 to i32
+  %trunc1 = trunc i64 %x1 to i32
+  %trunc2 = trunc i64 %x2 to i32
+  %trunc3 = trunc i64 %x3 to i32
+  %ext0 = zext i32 %trunc0 to i64
+  %ext1 = zext i32 %trunc1 to i64
+  %ext2 = zext i32 %trunc2 to i64
+  %ext3 = zext i32 %trunc3 to i64
+  %v0 = insertelement <4 x i64> undef, i64 %ext0, i32 0
+  %v1 = insertelement <4 x i64> %v0,   i64 %ext1, i32 1
+  %v2 = insertelement <4 x i64> %v1,   i64 %ext2, i32 2
+  %v3 = insertelement <4 x i64> %v2,   i64 %ext3, i32 3
+  ret <4 x i64> %v3
+}
+
 define <4 x i32> @_clearupper4xi32a(<4 x i32>) nounwind {
 ; SSE-LABEL: _clearupper4xi32a:
 ; SSE:       # BB#0:
@@ -65,6 +103,59 @@ define <4 x i32> @_clearupper4xi32a(<4 x i32>) nounwind {
   ret <4 x i32> %v3
 }
 
+define <8 x i32> @_clearupper8xi32a(<8 x i32>) nounwind {
+; SSE-LABEL: _clearupper8xi32a:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [65535,65535,65535,65535]
+; SSE-NEXT:    andps %xmm2, %xmm0
+; SSE-NEXT:    andps %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: _clearupper8xi32a:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: _clearupper8xi32a:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    retq
+  %x0 = extractelement <8 x i32> %0, i32 0
+  %x1 = extractelement <8 x i32> %0, i32 1
+  %x2 = extractelement <8 x i32> %0, i32 2
+  %x3 = extractelement <8 x i32> %0, i32 3
+  %x4 = extractelement <8 x i32> %0, i32 4
+  %x5 = extractelement <8 x i32> %0, i32 5
+  %x6 = extractelement <8 x i32> %0, i32 6
+  %x7 = extractelement <8 x i32> %0, i32 7
+  %trunc0 = trunc i32 %x0 to i16
+  %trunc1 = trunc i32 %x1 to i16
+  %trunc2 = trunc i32 %x2 to i16
+  %trunc3 = trunc i32 %x3 to i16
+  %trunc4 = trunc i32 %x4 to i16
+  %trunc5 = trunc i32 %x5 to i16
+  %trunc6 = trunc i32 %x6 to i16
+  %trunc7 = trunc i32 %x7 to i16
+  %ext0 = zext i16 %trunc0 to i32
+  %ext1 = zext i16 %trunc1 to i32
+  %ext2 = zext i16 %trunc2 to i32
+  %ext3 = zext i16 %trunc3 to i32
+  %ext4 = zext i16 %trunc4 to i32
+  %ext5 = zext i16 %trunc5 to i32
+  %ext6 = zext i16 %trunc6 to i32
+  %ext7 = zext i16 %trunc7 to i32
+  %v0 = insertelement <8 x i32> undef, i32 %ext0, i32 0
+  %v1 = insertelement <8 x i32> %v0,   i32 %ext1, i32 1
+  %v2 = insertelement <8 x i32> %v1,   i32 %ext2, i32 2
+  %v3 = insertelement <8 x i32> %v2,   i32 %ext3, i32 3
+  %v4 = insertelement <8 x i32> %v3,   i32 %ext4, i32 4
+  %v5 = insertelement <8 x i32> %v4,   i32 %ext5, i32 5
+  %v6 = insertelement <8 x i32> %v5,   i32 %ext6, i32 6
+  %v7 = insertelement <8 x i32> %v6,   i32 %ext7, i32 7
+  ret <8 x i32> %v7
+}
+
 define <8 x i16> @_clearupper8xi16a(<8 x i16>) nounwind {
 ; SSE-LABEL: _clearupper8xi16a:
 ; SSE:       # BB#0:
@@ -131,90 +222,194 @@ define <8 x i16> @_clearupper8xi16a(<8 x i16>) nounwind {
   ret <8 x i16> %v7
 }
 
+define <16 x i16> @_clearupper16xi16a(<16 x i16>) nounwind {
+; SSE-LABEL: _clearupper16xi16a:
+; SSE:       # BB#0:
+; SSE-NEXT:    pushq %rbp
+; SSE-NEXT:    pushq %r15
+; SSE-NEXT:    pushq %r14
+; SSE-NEXT:    pushq %r12
+; SSE-NEXT:    pushq %rbx
+; SSE-NEXT:    pextrw $1, %xmm0, %edi
+; SSE-NEXT:    pextrw $2, %xmm0, %eax
+; SSE-NEXT:    pextrw $3, %xmm0, %ecx
+; SSE-NEXT:    pextrw $4, %xmm0, %edx
+; SSE-NEXT:    pextrw $5, %xmm0, %esi
+; SSE-NEXT:    pextrw $6, %xmm0, %ebx
+; SSE-NEXT:    pextrw $7, %xmm0, %ebp
+; SSE-NEXT:    pextrw $1, %xmm1, %r10d
+; SSE-NEXT:    pextrw $2, %xmm1, %r9d
+; SSE-NEXT:    pextrw $3, %xmm1, %r14d
+; SSE-NEXT:    pextrw $4, %xmm1, %r8d
+; SSE-NEXT:    pextrw $5, %xmm1, %r15d
+; SSE-NEXT:    pextrw $6, %xmm1, %r11d
+; SSE-NEXT:    pextrw $7, %xmm1, %r12d
+; SSE-NEXT:    movd %ebp, %xmm2
+; SSE-NEXT:    movd %ecx, %xmm3
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE-NEXT:    movd %esi, %xmm2
+; SSE-NEXT:    movd %edi, %xmm4
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE-NEXT:    movd %ebx, %xmm2
+; SSE-NEXT:    movd %eax, %xmm3
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE-NEXT:    movd %edx, %xmm2
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255]
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    movd %r12d, %xmm3
+; SSE-NEXT:    movd %r14d, %xmm4
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE-NEXT:    movd %r15d, %xmm3
+; SSE-NEXT:    movd %r10d, %xmm5
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE-NEXT:    movd %r11d, %xmm3
+; SSE-NEXT:    movd %r9d, %xmm4
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
+; SSE-NEXT:    movd %r8d, %xmm3
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    popq %r12
+; SSE-NEXT:    popq %r14
+; SSE-NEXT:    popq %r15
+; SSE-NEXT:    popq %rbp
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: _clearupper16xi16a:
+; AVX:       # BB#0:
+; AVX-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-NEXT:    retq
+  %x0  = extractelement <16 x i16> %0, i32 0
+  %x1  = extractelement <16 x i16> %0, i32 1
+  %x2  = extractelement <16 x i16> %0, i32 2
+  %x3  = extractelement <16 x i16> %0, i32 3
+  %x4  = extractelement <16 x i16> %0, i32 4
+  %x5  = extractelement <16 x i16> %0, i32 5
+  %x6  = extractelement <16 x i16> %0, i32 6
+  %x7  = extractelement <16 x i16> %0, i32 7
+  %x8  = extractelement <16 x i16> %0, i32 8
+  %x9  = extractelement <16 x i16> %0, i32 9
+  %x10 = extractelement <16 x i16> %0, i32 10
+  %x11 = extractelement <16 x i16> %0, i32 11
+  %x12 = extractelement <16 x i16> %0, i32 12
+  %x13 = extractelement <16 x i16> %0, i32 13
+  %x14 = extractelement <16 x i16> %0, i32 14
+  %x15 = extractelement <16 x i16> %0, i32 15
+  %trunc0  = trunc i16 %x0  to i8
+  %trunc1  = trunc i16 %x1  to i8
+  %trunc2  = trunc i16 %x2  to i8
+  %trunc3  = trunc i16 %x3  to i8
+  %trunc4  = trunc i16 %x4  to i8
+  %trunc5  = trunc i16 %x5  to i8
+  %trunc6  = trunc i16 %x6  to i8
+  %trunc7  = trunc i16 %x7  to i8
+  %trunc8  = trunc i16 %x8  to i8
+  %trunc9  = trunc i16 %x9  to i8
+  %trunc10 = trunc i16 %x10 to i8
+  %trunc11 = trunc i16 %x11 to i8
+  %trunc12 = trunc i16 %x12 to i8
+  %trunc13 = trunc i16 %x13 to i8
+  %trunc14 = trunc i16 %x14 to i8
+  %trunc15 = trunc i16 %x15 to i8
+  %ext0  = zext i8 %trunc0  to i16
+  %ext1  = zext i8 %trunc1  to i16
+  %ext2  = zext i8 %trunc2  to i16
+  %ext3  = zext i8 %trunc3  to i16
+  %ext4  = zext i8 %trunc4  to i16
+  %ext5  = zext i8 %trunc5  to i16
+  %ext6  = zext i8 %trunc6  to i16
+  %ext7  = zext i8 %trunc7  to i16
+  %ext8  = zext i8 %trunc8  to i16
+  %ext9  = zext i8 %trunc9  to i16
+  %ext10 = zext i8 %trunc10 to i16
+  %ext11 = zext i8 %trunc11 to i16
+  %ext12 = zext i8 %trunc12 to i16
+  %ext13 = zext i8 %trunc13 to i16
+  %ext14 = zext i8 %trunc14 to i16
+  %ext15 = zext i8 %trunc15 to i16
+  %v0  = insertelement <16 x i16> undef, i16 %ext0,  i32 0
+  %v1  = insertelement <16 x i16> %v0,   i16 %ext1,  i32 1
+  %v2  = insertelement <16 x i16> %v1,   i16 %ext2,  i32 2
+  %v3  = insertelement <16 x i16> %v2,   i16 %ext3,  i32 3
+  %v4  = insertelement <16 x i16> %v3,   i16 %ext4,  i32 4
+  %v5  = insertelement <16 x i16> %v4,   i16 %ext5,  i32 5
+  %v6  = insertelement <16 x i16> %v5,   i16 %ext6,  i32 6
+  %v7  = insertelement <16 x i16> %v6,   i16 %ext7,  i32 7
+  %v8  = insertelement <16 x i16> %v7,   i16 %ext8,  i32 8
+  %v9  = insertelement <16 x i16> %v8,   i16 %ext9,  i32 9
+  %v10 = insertelement <16 x i16> %v9,   i16 %ext10, i32 10
+  %v11 = insertelement <16 x i16> %v10,  i16 %ext11, i32 11
+  %v12 = insertelement <16 x i16> %v11,  i16 %ext12, i32 12
+  %v13 = insertelement <16 x i16> %v12,  i16 %ext13, i32 13
+  %v14 = insertelement <16 x i16> %v13,  i16 %ext14, i32 14
+  %v15 = insertelement <16 x i16> %v14,  i16 %ext15, i32 15
+  ret <16 x i16> %v15
+}
+
 define <16 x i8> @_clearupper16xi8a(<16 x i8>) nounwind {
 ; SSE-LABEL: _clearupper16xi8a:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE-NEXT:    movd %eax, %xmm0
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r9d
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %r8d
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
 ; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE-NEXT:    movd %eax, %xmm1
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT:    movd %esi, %xmm0
 ; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE-NEXT:    movd %ecx, %xmm2
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm2
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE-NEXT:    movd %edx, %xmm0
-; SSE-NEXT:    movd %esi, %xmm1
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT:    movd %edi, %xmm0
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE-NEXT:    movd %edx, %xmm3
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm3
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; SSE-NEXT:    movd %r9d, %xmm0
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
 ; SSE-NEXT:    movd %eax, %xmm1
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT:    movd %r8d, %xmm0
-; SSE-NEXT:    movd %ecx, %xmm2
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
 ; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm2
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm3
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm2
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm4
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
 ; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
 ; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: _clearupper16xi8a:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    vpextrb $1, %xmm0, %ecx
 ; AVX-NEXT:    vmovd %eax, %xmm1
-; AVX-NEXT:    vpextrb $1, %xmm0, %eax
-; AVX-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $2, %xmm0, %eax
-; AVX-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $3, %xmm0, %eax
-; AVX-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $4, %xmm0, %eax
-; AVX-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $5, %xmm0, %eax
-; AVX-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $6, %xmm0, %eax
-; AVX-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $7, %xmm0, %eax
-; AVX-NEXT:    vpinsrb $7, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $8, %xmm0, %eax
-; AVX-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $9, %xmm0, %eax
-; AVX-NEXT:    vpinsrb $9, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $10, %xmm0, %eax
-; AVX-NEXT:    vpinsrb $10, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $11, %xmm0, %eax
-; AVX-NEXT:    vpinsrb $11, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $12, %xmm0, %eax
-; AVX-NEXT:    vpinsrb $12, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $13, %xmm0, %eax
-; AVX-NEXT:    vpinsrb $13, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $14, %xmm0, %eax
-; AVX-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrb $15, %xmm0, %eax
-; AVX-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm0
+; AVX-NEXT:    vpinsrb $1, %ecx, %xmm1, %xmm1
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5,6,7]
 ; AVX-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x0  = extractelement <16 x i8> %0, i32 0
@@ -284,16 +479,270 @@ define <16 x i8> @_clearupper16xi8a(<16 x i8>) nounwind {
   ret <16 x i8> %v15
 }
 
+define <32 x i8> @_clearupper32xi8a(<32 x i8>) nounwind {
+; SSE-LABEL: _clearupper32xi8a:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm2
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm3
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3],xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7]
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE-NEXT:    movd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm2
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm3
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm2
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm4
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE-NEXT:    pand %xmm2, %xmm0
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm3
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm4
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm5
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm1
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm3
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    movd {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE-NEXT:    movd {{.*#+}} xmm5 = mem[0],zero,zero,zero
+; SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm4
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm5
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm4
+; SSE-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE-NEXT:    movd %eax, %xmm6
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSE-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE-NEXT:    pand %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: _clearupper32xi8a:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX1-NEXT:    vpextrb $1, %xmm0, %ecx
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vpextrb $0, %xmm1, %edx
+; AVX1-NEXT:    vpextrb $1, %xmm1, %esi
+; AVX1-NEXT:    vmovd %edx, %xmm2
+; AVX1-NEXT:    vpinsrb $1, %esi, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vmovd %eax, %xmm2
+; AVX1-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: _clearupper32xi8a:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    vpextrb $1, %xmm0, %ecx
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpextrb $0, %xmm1, %edx
+; AVX2-NEXT:    vpextrb $1, %xmm1, %esi
+; AVX2-NEXT:    vmovd %edx, %xmm2
+; AVX2-NEXT:    vpinsrb $1, %esi, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5,6,7]
+; AVX2-NEXT:    vmovd %eax, %xmm2
+; AVX2-NEXT:    vpinsrb $1, %ecx, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4,5,6,7]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    retq
+  %x0  = extractelement <32 x i8> %0, i32 0
+  %x1  = extractelement <32 x i8> %0, i32 1
+  %x2  = extractelement <32 x i8> %0, i32 2
+  %x3  = extractelement <32 x i8> %0, i32 3
+  %x4  = extractelement <32 x i8> %0, i32 4
+  %x5  = extractelement <32 x i8> %0, i32 5
+  %x6  = extractelement <32 x i8> %0, i32 6
+  %x7  = extractelement <32 x i8> %0, i32 7
+  %x8  = extractelement <32 x i8> %0, i32 8
+  %x9  = extractelement <32 x i8> %0, i32 9
+  %x10 = extractelement <32 x i8> %0, i32 10
+  %x11 = extractelement <32 x i8> %0, i32 11
+  %x12 = extractelement <32 x i8> %0, i32 12
+  %x13 = extractelement <32 x i8> %0, i32 13
+  %x14 = extractelement <32 x i8> %0, i32 14
+  %x15 = extractelement <32 x i8> %0, i32 15
+  %x16 = extractelement <32 x i8> %0, i32 16
+  %x17 = extractelement <32 x i8> %0, i32 17
+  %x18 = extractelement <32 x i8> %0, i32 18
+  %x19 = extractelement <32 x i8> %0, i32 19
+  %x20 = extractelement <32 x i8> %0, i32 20
+  %x21 = extractelement <32 x i8> %0, i32 21
+  %x22 = extractelement <32 x i8> %0, i32 22
+  %x23 = extractelement <32 x i8> %0, i32 23
+  %x24 = extractelement <32 x i8> %0, i32 24
+  %x25 = extractelement <32 x i8> %0, i32 25
+  %x26 = extractelement <32 x i8> %0, i32 26
+  %x27 = extractelement <32 x i8> %0, i32 27
+  %x28 = extractelement <32 x i8> %0, i32 28
+  %x29 = extractelement <32 x i8> %0, i32 29
+  %x30 = extractelement <32 x i8> %0, i32 30
+  %x31 = extractelement <32 x i8> %0, i32 31
+  %trunc0  = trunc i8 %x0  to i4
+  %trunc1  = trunc i8 %x1  to i4
+  %trunc2  = trunc i8 %x2  to i4
+  %trunc3  = trunc i8 %x3  to i4
+  %trunc4  = trunc i8 %x4  to i4
+  %trunc5  = trunc i8 %x5  to i4
+  %trunc6  = trunc i8 %x6  to i4
+  %trunc7  = trunc i8 %x7  to i4
+  %trunc8  = trunc i8 %x8  to i4
+  %trunc9  = trunc i8 %x9  to i4
+  %trunc10 = trunc i8 %x10 to i4
+  %trunc11 = trunc i8 %x11 to i4
+  %trunc12 = trunc i8 %x12 to i4
+  %trunc13 = trunc i8 %x13 to i4
+  %trunc14 = trunc i8 %x14 to i4
+  %trunc15 = trunc i8 %x15 to i4
+  %trunc16 = trunc i8 %x16 to i4
+  %trunc17 = trunc i8 %x17 to i4
+  %trunc18 = trunc i8 %x18 to i4
+  %trunc19 = trunc i8 %x19 to i4
+  %trunc20 = trunc i8 %x20 to i4
+  %trunc21 = trunc i8 %x21 to i4
+  %trunc22 = trunc i8 %x22 to i4
+  %trunc23 = trunc i8 %x23 to i4
+  %trunc24 = trunc i8 %x24 to i4
+  %trunc25 = trunc i8 %x25 to i4
+  %trunc26 = trunc i8 %x26 to i4
+  %trunc27 = trunc i8 %x27 to i4
+  %trunc28 = trunc i8 %x28 to i4
+  %trunc29 = trunc i8 %x29 to i4
+  %trunc30 = trunc i8 %x30 to i4
+  %trunc31 = trunc i8 %x31 to i4
+  %ext0  = zext i4 %trunc0  to i8
+  %ext1  = zext i4 %trunc1  to i8
+  %ext2  = zext i4 %trunc2  to i8
+  %ext3  = zext i4 %trunc3  to i8
+  %ext4  = zext i4 %trunc4  to i8
+  %ext5  = zext i4 %trunc5  to i8
+  %ext6  = zext i4 %trunc6  to i8
+  %ext7  = zext i4 %trunc7  to i8
+  %ext8  = zext i4 %trunc8  to i8
+  %ext9  = zext i4 %trunc9  to i8
+  %ext10 = zext i4 %trunc10 to i8
+  %ext11 = zext i4 %trunc11 to i8
+  %ext12 = zext i4 %trunc12 to i8
+  %ext13 = zext i4 %trunc13 to i8
+  %ext14 = zext i4 %trunc14 to i8
+  %ext15 = zext i4 %trunc15 to i8
+  %ext16 = zext i4 %trunc16 to i8
+  %ext17 = zext i4 %trunc17 to i8
+  %ext18 = zext i4 %trunc18 to i8
+  %ext19 = zext i4 %trunc19 to i8
+  %ext20 = zext i4 %trunc20 to i8
+  %ext21 = zext i4 %trunc21 to i8
+  %ext22 = zext i4 %trunc22 to i8
+  %ext23 = zext i4 %trunc23 to i8
+  %ext24 = zext i4 %trunc24 to i8
+  %ext25 = zext i4 %trunc25 to i8
+  %ext26 = zext i4 %trunc26 to i8
+  %ext27 = zext i4 %trunc27 to i8
+  %ext28 = zext i4 %trunc28 to i8
+  %ext29 = zext i4 %trunc29 to i8
+  %ext30 = zext i4 %trunc30 to i8
+  %ext31 = zext i4 %trunc31 to i8
+  %v0  = insertelement <32 x i8> undef, i8 %ext0,  i32 0
+  %v1  = insertelement <32 x i8> %v0,   i8 %ext1,  i32 1
+  %v2  = insertelement <32 x i8> %v1,   i8 %ext2,  i32 2
+  %v3  = insertelement <32 x i8> %v2,   i8 %ext3,  i32 3
+  %v4  = insertelement <32 x i8> %v3,   i8 %ext4,  i32 4
+  %v5  = insertelement <32 x i8> %v4,   i8 %ext5,  i32 5
+  %v6  = insertelement <32 x i8> %v5,   i8 %ext6,  i32 6
+  %v7  = insertelement <32 x i8> %v6,   i8 %ext7,  i32 7
+  %v8  = insertelement <32 x i8> %v7,   i8 %ext8,  i32 8
+  %v9  = insertelement <32 x i8> %v8,   i8 %ext9,  i32 9
+  %v10 = insertelement <32 x i8> %v9,   i8 %ext10, i32 10
+  %v11 = insertelement <32 x i8> %v10,  i8 %ext11, i32 11
+  %v12 = insertelement <32 x i8> %v11,  i8 %ext12, i32 12
+  %v13 = insertelement <32 x i8> %v12,  i8 %ext13, i32 13
+  %v14 = insertelement <32 x i8> %v13,  i8 %ext14, i32 14
+  %v15 = insertelement <32 x i8> %v14,  i8 %ext15, i32 15
+  %v16 = insertelement <32 x i8> %v15,  i8 %ext16, i32 16
+  %v17 = insertelement <32 x i8> %v16,  i8 %ext17, i32 17
+  %v18 = insertelement <32 x i8> %v17,  i8 %ext18, i32 18
+  %v19 = insertelement <32 x i8> %v18,  i8 %ext19, i32 19
+  %v20 = insertelement <32 x i8> %v19,  i8 %ext20, i32 20
+  %v21 = insertelement <32 x i8> %v20,  i8 %ext21, i32 21
+  %v22 = insertelement <32 x i8> %v21,  i8 %ext22, i32 22
+  %v23 = insertelement <32 x i8> %v22,  i8 %ext23, i32 23
+  %v24 = insertelement <32 x i8> %v23,  i8 %ext24, i32 24
+  %v25 = insertelement <32 x i8> %v24,  i8 %ext25, i32 25
+  %v26 = insertelement <32 x i8> %v25,  i8 %ext26, i32 26
+  %v27 = insertelement <32 x i8> %v26,  i8 %ext27, i32 27
+  %v28 = insertelement <32 x i8> %v27,  i8 %ext28, i32 28
+  %v29 = insertelement <32 x i8> %v28,  i8 %ext29, i32 29
+  %v30 = insertelement <32 x i8> %v29,  i8 %ext30, i32 30
+  %v31 = insertelement <32 x i8> %v30,  i8 %ext31, i32 31
+  ret <32 x i8> %v31
+}
+
 define <2 x i64> @_clearupper2xi64b(<2 x i64>) nounwind {
 ; SSE-LABEL: _clearupper2xi64b:
 ; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm2, %xmm2
-; SSE-NEXT:    xorps %xmm1, %xmm1
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,0],xmm0[0,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[2,3]
-; SSE-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[2,0]
-; SSE-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,0]
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: _clearupper2xi64b:
@@ -314,14 +763,38 @@ define <2 x i64> @_clearupper2xi64b(<2 x i64>) nounwind {
   ret <2 x i64> %r
 }
 
+define <4 x i64> @_clearupper4xi64b(<4 x i64>) nounwind {
+; SSE-LABEL: _clearupper4xi64b:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSE-NEXT:    andps %xmm2, %xmm0
+; SSE-NEXT:    andps %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: _clearupper4xi64b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: _clearupper4xi64b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %x32 = bitcast <4 x i64> %0 to <8 x i32>
+  %r0 = insertelement <8 x i32> %x32, i32 zeroinitializer, i32 1
+  %r1 = insertelement <8 x i32> %r0,  i32 zeroinitializer, i32 3
+  %r2 = insertelement <8 x i32> %r1,  i32 zeroinitializer, i32 5
+  %r3 = insertelement <8 x i32> %r2,  i32 zeroinitializer, i32 7
+  %r = bitcast <8 x i32> %r3 to <4 x i64>
+  ret <4 x i64> %r
+}
+
 define <4 x i32> @_clearupper4xi32b(<4 x i32>) nounwind {
 ; SSE-LABEL: _clearupper4xi32b:
 ; SSE:       # BB#0:
-; SSE-NEXT:    xorl %eax, %eax
-; SSE-NEXT:    pinsrw $1, %eax, %xmm0
-; SSE-NEXT:    pinsrw $3, %eax, %xmm0
-; SSE-NEXT:    pinsrw $5, %eax, %xmm0
-; SSE-NEXT:    pinsrw $7, %eax, %xmm0
+; SSE-NEXT:    andps {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: _clearupper4xi32b:
@@ -338,6 +811,46 @@ define <4 x i32> @_clearupper4xi32b(<4 x i32>) nounwind {
   ret <4 x i32> %r
 }
 
+define <8 x i32> @_clearupper8xi32b(<8 x i32>) nounwind {
+; SSE-LABEL: _clearupper8xi32b:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; SSE-NEXT:    andps %xmm2, %xmm0
+; SSE-NEXT:    andps %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: _clearupper8xi32b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: _clearupper8xi32b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+  %x16 = bitcast <8 x i32> %0 to <16 x i16>
+  %r0 = insertelement <16 x i16> %x16, i16 zeroinitializer, i32 1
+  %r1 = insertelement <16 x i16> %r0,  i16 zeroinitializer, i32 3
+  %r2 = insertelement <16 x i16> %r1,  i16 zeroinitializer, i32 5
+  %r3 = insertelement <16 x i16> %r2,  i16 zeroinitializer, i32 7
+  %r4 = insertelement <16 x i16> %r3,  i16 zeroinitializer, i32 9
+  %r5 = insertelement <16 x i16> %r4,  i16 zeroinitializer, i32 11
+  %r6 = insertelement <16 x i16> %r5,  i16 zeroinitializer, i32 13
+  %r7 = insertelement <16 x i16> %r6,  i16 zeroinitializer, i32 15
+  %r = bitcast <16 x i16> %r7 to <8 x i32>
+  ret <8 x i32> %r
+}
+
 define <8 x i16> @_clearupper8xi16b(<8 x i16>) nounwind {
 ; SSE-LABEL: _clearupper8xi16b:
 ; SSE:       # BB#0:
@@ -346,15 +859,7 @@ define <8 x i16> @_clearupper8xi16b(<8 x i16>) nounwind {
 ;
 ; AVX-LABEL: _clearupper8xi16b:
 ; AVX:       # BB#0:
-; AVX-NEXT:    xorl %eax, %eax
-; AVX-NEXT:    vpinsrb $1, %eax, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $5, %eax, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $7, %eax, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $11, %eax, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $13, %eax, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX-NEXT:    vandps {{.*}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x8 = bitcast <8 x i16> %0 to <16 x i8>
   %r0 = insertelement <16 x i8> %x8, i8 zeroinitializer, i32 1
@@ -369,6 +874,54 @@ define <8 x i16> @_clearupper8xi16b(<8 x i16>) nounwind {
   ret <8 x i16> %r
 }
 
+define <16 x i16> @_clearupper16xi16b(<16 x i16>) nounwind {
+; SSE-LABEL: _clearupper16xi16b:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT:    andps %xmm2, %xmm0
+; SSE-NEXT:    andps %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: _clearupper16xi16b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovaps {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm2
+; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: _clearupper16xi16b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
+; AVX2-NEXT:    retq
+  %x8 = bitcast <16 x i16> %0 to <32 x i8>
+  %r0  = insertelement <32 x i8> %x8,  i8 zeroinitializer, i32 1
+  %r1  = insertelement <32 x i8> %r0,  i8 zeroinitializer, i32 3
+  %r2  = insertelement <32 x i8> %r1,  i8 zeroinitializer, i32 5
+  %r3  = insertelement <32 x i8> %r2,  i8 zeroinitializer, i32 7
+  %r4  = insertelement <32 x i8> %r3,  i8 zeroinitializer, i32 9
+  %r5  = insertelement <32 x i8> %r4,  i8 zeroinitializer, i32 11
+  %r6  = insertelement <32 x i8> %r5,  i8 zeroinitializer, i32 13
+  %r7  = insertelement <32 x i8> %r6,  i8 zeroinitializer, i32 15
+  %r8  = insertelement <32 x i8> %r7,  i8 zeroinitializer, i32 17
+  %r9  = insertelement <32 x i8> %r8,  i8 zeroinitializer, i32 19
+  %r10 = insertelement <32 x i8> %r9,  i8 zeroinitializer, i32 21
+  %r11 = insertelement <32 x i8> %r10, i8 zeroinitializer, i32 23
+  %r12 = insertelement <32 x i8> %r11, i8 zeroinitializer, i32 25
+  %r13 = insertelement <32 x i8> %r12, i8 zeroinitializer, i32 27
+  %r14 = insertelement <32 x i8> %r13, i8 zeroinitializer, i32 29
+  %r15 = insertelement <32 x i8> %r14, i8 zeroinitializer, i32 31
+  %r = bitcast <32 x i8> %r15 to <16 x i16>
+  ret <16 x i16> %r
+}
+
 define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind {
 ; SSE-LABEL: _clearupper16xi8b:
 ; SSE:       # BB#0:
@@ -547,6 +1100,447 @@ define <16 x i8> @_clearupper16xi8b(<16 x i8>) nounwind {
   ret <16 x i8> %r
 }
 
+define <32 x i8> @_clearupper32xi8b(<32 x i8>) nounwind {
+; SSE-LABEL: _clearupper32xi8b:
+; SSE:       # BB#0:
+; SSE-NEXT:    pushq %r14
+; SSE-NEXT:    pushq %rbx
+; SSE-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE-NEXT:    movd %xmm0, %rcx
+; SSE-NEXT:    movq %rcx, %r8
+; SSE-NEXT:    movq %rcx, %r9
+; SSE-NEXT:    movq %rcx, %r10
+; SSE-NEXT:    movq %rcx, %rax
+; SSE-NEXT:    movq %rcx, %rdx
+; SSE-NEXT:    movq %rcx, %rsi
+; SSE-NEXT:    movq %rcx, %rdi
+; SSE-NEXT:    andb $15, %cl
+; SSE-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movd %xmm2, %rcx
+; SSE-NEXT:    shrq $56, %rdi
+; SSE-NEXT:    andb $15, %dil
+; SSE-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, %r11
+; SSE-NEXT:    shrq $48, %rsi
+; SSE-NEXT:    andb $15, %sil
+; SSE-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, %r14
+; SSE-NEXT:    shrq $40, %rdx
+; SSE-NEXT:    andb $15, %dl
+; SSE-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, %rdx
+; SSE-NEXT:    shrq $32, %rax
+; SSE-NEXT:    andb $15, %al
+; SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, %rax
+; SSE-NEXT:    shrq $24, %r10
+; SSE-NEXT:    andb $15, %r10b
+; SSE-NEXT:    movb %r10b, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, %rdi
+; SSE-NEXT:    shrq $16, %r9
+; SSE-NEXT:    andb $15, %r9b
+; SSE-NEXT:    movb %r9b, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, %rsi
+; SSE-NEXT:    shrq $8, %r8
+; SSE-NEXT:    andb $15, %r8b
+; SSE-NEXT:    movb %r8b, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq %rcx, %rbx
+; SSE-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    andb $15, %cl
+; SSE-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    shrq $56, %rbx
+; SSE-NEXT:    andb $15, %bl
+; SSE-NEXT:    movb %bl, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    shrq $48, %rsi
+; SSE-NEXT:    andb $15, %sil
+; SSE-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    shrq $40, %rdi
+; SSE-NEXT:    andb $15, %dil
+; SSE-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    shrq $32, %rax
+; SSE-NEXT:    andb $15, %al
+; SSE-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    shrq $24, %rdx
+; SSE-NEXT:    andb $15, %dl
+; SSE-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    shrq $16, %r14
+; SSE-NEXT:    andb $15, %r14b
+; SSE-NEXT:    movb %r14b, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    shrq $8, %r11
+; SSE-NEXT:    andb $15, %r11b
+; SSE-NEXT:    movb %r11b, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
+; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT:    movq {{.*#+}} xmm2 = mem[0],zero
+; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE-NEXT:    popq %rbx
+; SSE-NEXT:    popq %r14
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: _clearupper32xi8b:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    pushq %r14
+; AVX1-NEXT:    pushq %rbx
+; AVX1-NEXT:    vpextrq $1, %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    vmovq %xmm0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    movq -{{[0-9]+}}(%rsp), %r14
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX1-NEXT:    movq %rdx, %r8
+; AVX1-NEXT:    movq %rdx, %r9
+; AVX1-NEXT:    movq %rdx, %r11
+; AVX1-NEXT:    movq %rdx, %rsi
+; AVX1-NEXT:    movq %rdx, %rdi
+; AVX1-NEXT:    movq %rdx, %rcx
+; AVX1-NEXT:    movq %rdx, %rax
+; AVX1-NEXT:    andb $15, %dl
+; AVX1-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    shrq $56, %rax
+; AVX1-NEXT:    andb $15, %al
+; AVX1-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    movq %r14, %r10
+; AVX1-NEXT:    shrq $48, %rcx
+; AVX1-NEXT:    andb $15, %cl
+; AVX1-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    movq %r14, %rdx
+; AVX1-NEXT:    shrq $40, %rdi
+; AVX1-NEXT:    andb $15, %dil
+; AVX1-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    movq %r14, %rax
+; AVX1-NEXT:    shrq $32, %rsi
+; AVX1-NEXT:    andb $15, %sil
+; AVX1-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    movq %r14, %rcx
+; AVX1-NEXT:    shrq $24, %r11
+; AVX1-NEXT:    andb $15, %r11b
+; AVX1-NEXT:    movb %r11b, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    movq %r14, %rsi
+; AVX1-NEXT:    shrq $16, %r9
+; AVX1-NEXT:    andb $15, %r9b
+; AVX1-NEXT:    movb %r9b, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    movq %r14, %rdi
+; AVX1-NEXT:    shrq $8, %r8
+; AVX1-NEXT:    andb $15, %r8b
+; AVX1-NEXT:    movb %r8b, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    movq %r14, %rbx
+; AVX1-NEXT:    andb $15, %r14b
+; AVX1-NEXT:    movb %r14b, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    shrq $8, %r10
+; AVX1-NEXT:    shrq $16, %rdx
+; AVX1-NEXT:    shrq $24, %rax
+; AVX1-NEXT:    shrq $32, %rcx
+; AVX1-NEXT:    shrq $40, %rsi
+; AVX1-NEXT:    shrq $48, %rdi
+; AVX1-NEXT:    shrq $56, %rbx
+; AVX1-NEXT:    andb $15, %bl
+; AVX1-NEXT:    movb %bl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    andb $15, %dil
+; AVX1-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    andb $15, %sil
+; AVX1-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    andb $15, %cl
+; AVX1-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    andb $15, %al
+; AVX1-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    andb $15, %dl
+; AVX1-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    andb $15, %r10b
+; AVX1-NEXT:    movb %r10b, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    movq %rax, %r8
+; AVX1-NEXT:    movq %rax, %rdx
+; AVX1-NEXT:    movq %rax, %rsi
+; AVX1-NEXT:    movq %rax, %rdi
+; AVX1-NEXT:    movl %eax, %ebx
+; AVX1-NEXT:    movl %eax, %ecx
+; AVX1-NEXT:    vmovd %eax, %xmm1
+; AVX1-NEXT:    shrl $8, %eax
+; AVX1-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX1-NEXT:    shrl $16, %ecx
+; AVX1-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    shrl $24, %ebx
+; AVX1-NEXT:    vpinsrb $3, %ebx, %xmm1, %xmm1
+; AVX1-NEXT:    shrq $32, %rdi
+; AVX1-NEXT:    vpinsrb $4, %edi, %xmm1, %xmm1
+; AVX1-NEXT:    shrq $40, %rsi
+; AVX1-NEXT:    vpinsrb $5, %esi, %xmm1, %xmm1
+; AVX1-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
+; AVX1-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm2
+; AVX1-NEXT:    shrq $48, %rdx
+; AVX1-NEXT:    vpinsrb $6, %edx, %xmm1, %xmm1
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    shrq $56, %r8
+; AVX1-NEXT:    vpinsrb $7, %r8d, %xmm1, %xmm0
+; AVX1-NEXT:    movl %eax, %ecx
+; AVX1-NEXT:    shrl $8, %ecx
+; AVX1-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movl %eax, %ecx
+; AVX1-NEXT:    shrl $16, %ecx
+; AVX1-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movl %eax, %ecx
+; AVX1-NEXT:    shrl $24, %ecx
+; AVX1-NEXT:    vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shrq $32, %rcx
+; AVX1-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shrq $40, %rcx
+; AVX1-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shrq $48, %rcx
+; AVX1-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX1-NEXT:    vmovq %xmm2, %rcx
+; AVX1-NEXT:    shrq $56, %rax
+; AVX1-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    movl %ecx, %eax
+; AVX1-NEXT:    shrl $8, %eax
+; AVX1-NEXT:    vmovd %ecx, %xmm1
+; AVX1-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX1-NEXT:    movl %ecx, %eax
+; AVX1-NEXT:    shrl $16, %eax
+; AVX1-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX1-NEXT:    movl %ecx, %eax
+; AVX1-NEXT:    shrl $24, %eax
+; AVX1-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX1-NEXT:    movq %rcx, %rax
+; AVX1-NEXT:    shrq $32, %rax
+; AVX1-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX1-NEXT:    movq %rcx, %rax
+; AVX1-NEXT:    shrq $40, %rax
+; AVX1-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX1-NEXT:    movq %rcx, %rax
+; AVX1-NEXT:    shrq $48, %rax
+; AVX1-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX1-NEXT:    shrq $56, %rcx
+; AVX1-NEXT:    vpinsrb $7, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movl %eax, %ecx
+; AVX1-NEXT:    shrl $8, %ecx
+; AVX1-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX1-NEXT:    vpinsrb $9, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movl %eax, %ecx
+; AVX1-NEXT:    shrl $16, %ecx
+; AVX1-NEXT:    vpinsrb $10, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movl %eax, %ecx
+; AVX1-NEXT:    shrl $24, %ecx
+; AVX1-NEXT:    vpinsrb $11, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shrq $32, %rcx
+; AVX1-NEXT:    vpinsrb $12, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shrq $40, %rcx
+; AVX1-NEXT:    vpinsrb $13, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    movq %rax, %rcx
+; AVX1-NEXT:    shrq $48, %rcx
+; AVX1-NEXT:    vpinsrb $14, %ecx, %xmm1, %xmm1
+; AVX1-NEXT:    shrq $56, %rax
+; AVX1-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    popq %rbx
+; AVX1-NEXT:    popq %r14
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: _clearupper32xi8b:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    vpextrq $1, %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovq %xmm0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq -{{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX2-NEXT:    movq %rdx, %r8
+; AVX2-NEXT:    movq %rdx, %r9
+; AVX2-NEXT:    movq %rdx, %r11
+; AVX2-NEXT:    movq %rdx, %rsi
+; AVX2-NEXT:    movq %rdx, %rdi
+; AVX2-NEXT:    movq %rdx, %rcx
+; AVX2-NEXT:    movq %rdx, %rax
+; AVX2-NEXT:    andb $15, %dl
+; AVX2-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    shrq $56, %rax
+; AVX2-NEXT:    andb $15, %al
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r14, %r10
+; AVX2-NEXT:    shrq $48, %rcx
+; AVX2-NEXT:    andb $15, %cl
+; AVX2-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r14, %rdx
+; AVX2-NEXT:    shrq $40, %rdi
+; AVX2-NEXT:    andb $15, %dil
+; AVX2-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r14, %rax
+; AVX2-NEXT:    shrq $32, %rsi
+; AVX2-NEXT:    andb $15, %sil
+; AVX2-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r14, %rcx
+; AVX2-NEXT:    shrq $24, %r11
+; AVX2-NEXT:    andb $15, %r11b
+; AVX2-NEXT:    movb %r11b, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r14, %rsi
+; AVX2-NEXT:    shrq $16, %r9
+; AVX2-NEXT:    andb $15, %r9b
+; AVX2-NEXT:    movb %r9b, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r14, %rdi
+; AVX2-NEXT:    shrq $8, %r8
+; AVX2-NEXT:    andb $15, %r8b
+; AVX2-NEXT:    movb %r8b, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movq %r14, %rbx
+; AVX2-NEXT:    andb $15, %r14b
+; AVX2-NEXT:    movb %r14b, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    shrq $8, %r10
+; AVX2-NEXT:    shrq $16, %rdx
+; AVX2-NEXT:    shrq $24, %rax
+; AVX2-NEXT:    shrq $32, %rcx
+; AVX2-NEXT:    shrq $40, %rsi
+; AVX2-NEXT:    shrq $48, %rdi
+; AVX2-NEXT:    shrq $56, %rbx
+; AVX2-NEXT:    andb $15, %bl
+; AVX2-NEXT:    movb %bl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    andb $15, %dil
+; AVX2-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    andb $15, %sil
+; AVX2-NEXT:    movb %sil, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    andb $15, %cl
+; AVX2-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    andb $15, %al
+; AVX2-NEXT:    movb %al, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    andb $15, %dl
+; AVX2-NEXT:    movb %dl, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    andb $15, %r10b
+; AVX2-NEXT:    movb %r10b, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    movq %rax, %r8
+; AVX2-NEXT:    movq %rax, %rdx
+; AVX2-NEXT:    movq %rax, %rsi
+; AVX2-NEXT:    movq %rax, %rdi
+; AVX2-NEXT:    movl %eax, %ebx
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    vmovd %eax, %xmm1
+; AVX2-NEXT:    shrl $8, %eax
+; AVX2-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    shrl $16, %ecx
+; AVX2-NEXT:    vpinsrb $2, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    shrl $24, %ebx
+; AVX2-NEXT:    vpinsrb $3, %ebx, %xmm1, %xmm1
+; AVX2-NEXT:    shrq $32, %rdi
+; AVX2-NEXT:    vpinsrb $4, %edi, %xmm1, %xmm1
+; AVX2-NEXT:    shrq $40, %rsi
+; AVX2-NEXT:    vpinsrb $5, %esi, %xmm1, %xmm1
+; AVX2-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovdqa -{{[0-9]+}}(%rsp), %xmm2
+; AVX2-NEXT:    shrq $48, %rdx
+; AVX2-NEXT:    vpinsrb $6, %edx, %xmm1, %xmm1
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    shrq $56, %r8
+; AVX2-NEXT:    vpinsrb $7, %r8d, %xmm1, %xmm0
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    shrl $8, %ecx
+; AVX2-NEXT:    vpinsrb $8, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrb $9, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    shrl $16, %ecx
+; AVX2-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    shrl $24, %ecx
+; AVX2-NEXT:    vpinsrb $11, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shrq $32, %rcx
+; AVX2-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shrq $40, %rcx
+; AVX2-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shrq $48, %rcx
+; AVX2-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
+; AVX2-NEXT:    vmovq %xmm2, %rcx
+; AVX2-NEXT:    shrq $56, %rax
+; AVX2-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    shrl $8, %eax
+; AVX2-NEXT:    vmovd %ecx, %xmm1
+; AVX2-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    shrl $16, %eax
+; AVX2-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    movl %ecx, %eax
+; AVX2-NEXT:    shrl $24, %eax
+; AVX2-NEXT:    vpinsrb $3, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    movq %rcx, %rax
+; AVX2-NEXT:    shrq $32, %rax
+; AVX2-NEXT:    vpinsrb $4, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    movq %rcx, %rax
+; AVX2-NEXT:    shrq $40, %rax
+; AVX2-NEXT:    vpinsrb $5, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    movq %rcx, %rax
+; AVX2-NEXT:    shrq $48, %rax
+; AVX2-NEXT:    vpinsrb $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX2-NEXT:    shrq $56, %rcx
+; AVX2-NEXT:    vpinsrb $7, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    shrl $8, %ecx
+; AVX2-NEXT:    vpinsrb $8, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    vpinsrb $9, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    shrl $16, %ecx
+; AVX2-NEXT:    vpinsrb $10, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movl %eax, %ecx
+; AVX2-NEXT:    shrl $24, %ecx
+; AVX2-NEXT:    vpinsrb $11, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shrq $32, %rcx
+; AVX2-NEXT:    vpinsrb $12, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shrq $40, %rcx
+; AVX2-NEXT:    vpinsrb $13, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    shrq $48, %rcx
+; AVX2-NEXT:    vpinsrb $14, %ecx, %xmm1, %xmm1
+; AVX2-NEXT:    shrq $56, %rax
+; AVX2-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    retq
+  %x4  = bitcast <32 x i8> %0 to <64 x i4>
+  %r0  = insertelement <64 x i4> %x4,  i4 zeroinitializer, i32 1
+  %r1  = insertelement <64 x i4> %r0,  i4 zeroinitializer, i32 3
+  %r2  = insertelement <64 x i4> %r1,  i4 zeroinitializer, i32 5
+  %r3  = insertelement <64 x i4> %r2,  i4 zeroinitializer, i32 7
+  %r4  = insertelement <64 x i4> %r3,  i4 zeroinitializer, i32 9
+  %r5  = insertelement <64 x i4> %r4,  i4 zeroinitializer, i32 11
+  %r6  = insertelement <64 x i4> %r5,  i4 zeroinitializer, i32 13
+  %r7  = insertelement <64 x i4> %r6,  i4 zeroinitializer, i32 15
+  %r8  = insertelement <64 x i4> %r7,  i4 zeroinitializer, i32 17
+  %r9  = insertelement <64 x i4> %r8,  i4 zeroinitializer, i32 19
+  %r10 = insertelement <64 x i4> %r9,  i4 zeroinitializer, i32 21
+  %r11 = insertelement <64 x i4> %r10, i4 zeroinitializer, i32 23
+  %r12 = insertelement <64 x i4> %r11, i4 zeroinitializer, i32 25
+  %r13 = insertelement <64 x i4> %r12, i4 zeroinitializer, i32 27
+  %r14 = insertelement <64 x i4> %r13, i4 zeroinitializer, i32 29
+  %r15 = insertelement <64 x i4> %r14, i4 zeroinitializer, i32 31
+  %r16 = insertelement <64 x i4> %r15, i4 zeroinitializer, i32 33
+  %r17 = insertelement <64 x i4> %r16, i4 zeroinitializer, i32 35
+  %r18 = insertelement <64 x i4> %r17, i4 zeroinitializer, i32 37
+  %r19 = insertelement <64 x i4> %r18, i4 zeroinitializer, i32 39
+  %r20 = insertelement <64 x i4> %r19, i4 zeroinitializer, i32 41
+  %r21 = insertelement <64 x i4> %r20, i4 zeroinitializer, i32 43
+  %r22 = insertelement <64 x i4> %r21, i4 zeroinitializer, i32 45
+  %r23 = insertelement <64 x i4> %r22, i4 zeroinitializer, i32 47
+  %r24 = insertelement <64 x i4> %r23, i4 zeroinitializer, i32 49
+  %r25 = insertelement <64 x i4> %r24, i4 zeroinitializer, i32 51
+  %r26 = insertelement <64 x i4> %r25, i4 zeroinitializer, i32 53
+  %r27 = insertelement <64 x i4> %r26, i4 zeroinitializer, i32 55
+  %r28 = insertelement <64 x i4> %r27, i4 zeroinitializer, i32 57
+  %r29 = insertelement <64 x i4> %r28, i4 zeroinitializer, i32 59
+  %r30 = insertelement <64 x i4> %r29, i4 zeroinitializer, i32 61
+  %r31 = insertelement <64 x i4> %r30, i4 zeroinitializer, i32 63
+  %r = bitcast <64 x i4> %r15 to <32 x i8>
+  ret <32 x i8> %r
+}
+
 define <2 x i64> @_clearupper2xi64c(<2 x i64>) nounwind {
 ; SSE-LABEL: _clearupper2xi64c:
 ; SSE:       # BB#0:
@@ -568,6 +1562,29 @@ define <2 x i64> @_clearupper2xi64c(<2 x i64>) nounwind {
   ret <2 x i64> %r
 }
 
+define <4 x i64> @_clearupper4xi64c(<4 x i64>) nounwind {
+; SSE-LABEL: _clearupper4xi64c:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [4294967295,0,4294967295,0]
+; SSE-NEXT:    andps %xmm2, %xmm0
+; SSE-NEXT:    andps %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: _clearupper4xi64c:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: _clearupper4xi64c:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    retq
+  %r = and <4 x i64> <i64 4294967295, i64 4294967295, i64 4294967295, i64 4294967295>, %0
+  ret <4 x i64> %r
+}
+
 define <4 x i32> @_clearupper4xi32c(<4 x i32>) nounwind {
 ; SSE-LABEL: _clearupper4xi32c:
 ; SSE:       # BB#0:
@@ -583,6 +1600,28 @@ define <4 x i32> @_clearupper4xi32c(<4 x i32>) nounwind {
   ret <4 x i32> %r
 }
 
+define <8 x i32> @_clearupper8xi32c(<8 x i32>) nounwind {
+; SSE-LABEL: _clearupper8xi32c:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [65535,0,65535,0,65535,0,65535,0]
+; SSE-NEXT:    andps %xmm2, %xmm0
+; SSE-NEXT:    andps %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: _clearupper8xi32c:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: _clearupper8xi32c:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
+; AVX2-NEXT:    retq
+  %r = and <8 x i32> <i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535, i32 65535>, %0
+  ret <8 x i32> %r
+}
+
 define <8 x i16> @_clearupper8xi16c(<8 x i16>) nounwind {
 ; SSE-LABEL: _clearupper8xi16c:
 ; SSE:       # BB#0:
@@ -597,6 +1636,22 @@ define <8 x i16> @_clearupper8xi16c(<8 x i16>) nounwind {
   ret <8 x i16> %r
 }
 
+define <16 x i16> @_clearupper16xi16c(<16 x i16>) nounwind {
+; SSE-LABEL: _clearupper16xi16c:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
+; SSE-NEXT:    andps %xmm2, %xmm0
+; SSE-NEXT:    andps %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: _clearupper16xi16c:
+; AVX:       # BB#0:
+; AVX-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-NEXT:    retq
+  %r = and <16 x i16> <i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255, i16 255>, %0
+  ret <16 x i16> %r
+}
+
 define <16 x i8> @_clearupper16xi8c(<16 x i8>) nounwind {
 ; SSE-LABEL: _clearupper16xi8c:
 ; SSE:       # BB#0:
@@ -610,3 +1665,19 @@ define <16 x i8> @_clearupper16xi8c(<16 x i8>) nounwind {
   %r = and <16 x i8> <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>, %0
   ret <16 x i8> %r
 }
+
+define <32 x i8> @_clearupper32xi8c(<32 x i8>) nounwind {
+; SSE-LABEL: _clearupper32xi8c:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; SSE-NEXT:    andps %xmm2, %xmm0
+; SSE-NEXT:    andps %xmm2, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: _clearupper32xi8c:
+; AVX:       # BB#0:
+; AVX-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-NEXT:    retq
+  %r = and <32 x i8> <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>, %0
+  ret <32 x i8> %r
+}
diff --git a/test/CodeGen/X86/clflushopt.ll b/test/CodeGen/X86/clflushopt.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ee416eb96c5e551ca680f45b267203255a8bc161
--- /dev/null
+++ b/test/CodeGen/X86/clflushopt.ll
@@ -0,0 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=clflushopt | FileCheck %s
+
+define void @clflushopt(i8* %p) nounwind {
+; CHECK-LABEL: clflushopt:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    clflushopt (%eax)
+; CHECK-NEXT:    retl
+  tail call void @llvm.x86.clflushopt(i8* %p)
+  ret void
+}
+declare void @llvm.x86.clflushopt(i8*) nounwind
diff --git a/test/CodeGen/X86/clzero.ll b/test/CodeGen/X86/clzero.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f15d4deedeff64bb6e757aa24c3b0363f7821505
--- /dev/null
+++ b/test/CodeGen/X86/clzero.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux -mattr=+clzero | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i386-pc-linux -mattr=+clzero | FileCheck %s --check-prefix=X32
+
+define void @foo(i8* %p) #0 {
+; X64-LABEL: foo:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    leaq (%rdi), %rax
+; X64-NEXT:    clzero
+; X64-NEXT:    retq
+;
+; X32-LABEL: foo:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    leal (%eax), %eax
+; X32-NEXT:    clzero
+; X32-NEXT:    retl
+entry:
+  tail call void @llvm.x86.clzero(i8* %p) #1
+  ret void
+}
+
+declare void @llvm.x86.clzero(i8*) #1
diff --git a/test/CodeGen/X86/cmovcmov.ll b/test/CodeGen/X86/cmovcmov.ll
index 38ba308ecff56902e3be59043215650af80d224e..5b984d27249b74e479fa1a7c52ff4f935a63eae8 100644
--- a/test/CodeGen/X86/cmovcmov.ll
+++ b/test/CodeGen/X86/cmovcmov.ll
@@ -249,16 +249,23 @@ attributes #0 = { nounwind }
 ; CMOV-DAG: cmpl %edx, %esi
 ; CMOV-DAG: movb $20, %al
 ; CMOV-DAG: movb $20, %dl
-; CMOV:   jl [[BB0:.LBB[0-9_]+]]
+; CMOV:   jge [[BB2:.LBB[0-9_]+]]
+; CMOV:   jle [[BB3:.LBB[0-9_]+]]
+; CMOV: [[BB0:.LBB[0-9_]+]]
+; CMOV:   testl %edi, %edi
+; CMOV:   jne [[BB4:.LBB[0-9_]+]]
+; CMOV: [[BB1:.LBB[0-9_]+]]
+; CMOV:   movb %al, g8(%rip)
+; CMOV:   retq
+; CMOV: [[BB2]]:
 ; CMOV:   movl %ecx, %edx
-; CMOV: [[BB0]]:
-; CMOV:   jg [[BB1:.LBB[0-9_]+]]
+; CMOV:   jg [[BB0]]
+; CMOV: [[BB3]]:
 ; CMOV:   movl %edx, %eax
-; CMOV: [[BB1]]:
 ; CMOV:   testl %edi, %edi
-; CMOV:   je [[BB2:.LBB[0-9_]+]]
+; CMOV:   je [[BB1]]
+; CMOV: [[BB4]]:
 ; CMOV:   movl %edx, %eax
-; CMOV: [[BB2]]:
 ; CMOV:   movb %al, g8(%rip)
 ; CMOV:   retq
 define void @no_cascade_opt(i32 %v0, i32 %v1, i32 %v2, i32 %v3) {
diff --git a/test/CodeGen/X86/code_placement_outline_optional_branches.ll b/test/CodeGen/X86/code_placement_outline_optional_branches.ll
deleted file mode 100644
index 5624d435215a9fbdacead0f20611b1fee5d68d8b..0000000000000000000000000000000000000000
--- a/test/CodeGen/X86/code_placement_outline_optional_branches.ll
+++ /dev/null
@@ -1,77 +0,0 @@
-; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux < %s | FileCheck %s
-; RUN: llc -mcpu=corei7 -mtriple=x86_64-linux -outline-optional-branches < %s | FileCheck %s -check-prefix=CHECK-OUTLINE
-
-define void @foo(i32 %t1, i32 %t2, i32 %t3) {
-; Test that we lift the call to 'c' up to immediately follow the call to 'b'
-; when we disable the cfg conflict check.
-;
-; CHECK-LABEL: foo:
-; CHECK: callq a
-; CHECK: callq a
-; CHECK: callq a
-; CHECK: callq a
-; CHECK: callq b
-; CHECK: callq c
-; CHECK: callq d
-; CHECK: callq e
-; CHECK: callq f
-;
-; CHECK-OUTLINE-LABEL: foo:
-; CHECK-OUTLINE: callq b
-; CHECK-OUTLINE: callq c
-; CHECK-OUTLINE: callq d
-; CHECK-OUTLINE: callq e
-; CHECK-OUTLINE: callq f
-; CHECK-OUTLINE: callq a
-; CHECK-OUTLINE: callq a
-; CHECK-OUTLINE: callq a
-; CHECK-OUTLINE: callq a
-
-entry:
-  %cmp = icmp eq i32 %t1, 0
-  br i1 %cmp, label %if.then, label %if.end
-
-if.then:
-  call void @a()
-  call void @a()
-  call void @a()
-  call void @a()
-  br label %if.end
-
-if.end:
-  call void @b()
-  br label %hotbranch
-
-hotbranch:
-  %cmp2 = icmp eq i32 %t2, 0
-  br i1 %cmp2, label %if.then2, label %if.end2, !prof !1
-
-if.then2:
-  call void @c()
-  br label %if.end2
-
-if.end2:
-  call void @d()
-  br label %shortbranch
-
-shortbranch:
-  %cmp3 = icmp eq i32 %t3, 0
-  br i1 %cmp3, label %if.then3, label %if.end3
-
-if.then3:
-  call void @e()
-  br label %if.end3
-
-if.end3:
-  call void @f()
-  ret void
-}
-
-declare void @a()
-declare void @b()
-declare void @c()
-declare void @d()
-declare void @e()
-declare void @f()
-
-!1 = !{!"branch_weights", i32 64, i32 4}
diff --git a/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
index f00c40ba3a920279fdf8d9349308d7c952536a25..1f4578c95314c46d225bc24105a371d808ed9919 100644
--- a/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
+++ b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
@@ -1,5 +1,4 @@
 ; RUN: opt -S -codegenprepare %s -o - | FileCheck %s
-; RUN: opt -S -codegenprepare -addr-sink-using-gep=1 %s -o - | FileCheck -check-prefix=CHECK-GEP %s
 ; This file tests the different cases what are involved when codegen prepare
 ; tries to get sign/zero extension out of the way of addressing mode.
 ; This tests require an actual target as addressing mode decisions depends
@@ -309,33 +308,18 @@ define i8 @twoArgsNoPromotionRemove(i1 %arg1, i8 %arg2, i8* %base) {
 ; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SHL]], %arg2
 ; CHECK: [[SEXTADD:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64
 ; BB then
-; CHECK: [[BASE1:%[a-zA-Z_0-9-]+]] = add i64 [[SEXTADD]], 48
-; CHECK: [[ADDR1:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[BASE1]] to i32*
+; CHECK: [[BASE1:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[SEXTADD]] to i32*
+; CHECK: [[BCC1:%[a-zA-Z_0-9-]+]] = bitcast i32* [[BASE1]] to i8*
+; CHECK: [[FULL1:%[a-zA-Z_0-9-]+]] = getelementptr i8, i8* [[BCC1]], i64 48
+; CHECK: [[ADDR1:%[a-zA-Z_0-9-]+]] = bitcast i8* [[FULL1]] to i32*
 ; CHECK: load i32, i32* [[ADDR1]]
 ; BB else
-; CHECK: [[BASE2:%[a-zA-Z_0-9-]+]] = add i64 [[SEXTADD]], 48
-; CHECK: [[ADDR2:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[BASE2]] to i32*
+; CHECK: [[BASE2:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[SEXTADD]] to i32*
+; CHECK: [[BCC2:%[a-zA-Z_0-9-]+]] = bitcast i32* [[BASE2]] to i8*
+; CHECK: [[FULL2:%[a-zA-Z_0-9-]+]] = getelementptr i8, i8* [[BCC2]], i64 48
+; CHECK: [[ADDR2:%[a-zA-Z_0-9-]+]] = bitcast i8* [[FULL2]] to i32*
 ; CHECK: load i32, i32* [[ADDR2]]
 ; CHECK: ret
-; CHECK-GEP-LABEL: @checkProfitability
-; CHECK-GEP-NOT: {{%[a-zA-Z_0-9-]+}} = sext i32 %arg1 to i64
-; CHECK-GEP-NOT: {{%[a-zA-Z_0-9-]+}} = sext i32 %arg2 to i64
-; CHECK-GEP: [[SHL:%[a-zA-Z_0-9-]+]] = shl nsw i32 %arg1, 1
-; CHECK-GEP: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SHL]], %arg2
-; CHECK-GEP: [[SEXTADD:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64
-; BB then
-; CHECK-GEP: [[BASE1:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[SEXTADD]] to i32*
-; CHECK-GEP: [[BCC1:%[a-zA-Z_0-9-]+]] = bitcast i32* [[BASE1]] to i8*
-; CHECK-GEP: [[FULL1:%[a-zA-Z_0-9-]+]] = getelementptr i8, i8* [[BCC1]], i64 48
-; CHECK-GEP: [[ADDR1:%[a-zA-Z_0-9-]+]] = bitcast i8* [[FULL1]] to i32*
-; CHECK-GEP: load i32, i32* [[ADDR1]]
-; BB else
-; CHECK-GEP: [[BASE2:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[SEXTADD]] to i32*
-; CHECK-GEP: [[BCC2:%[a-zA-Z_0-9-]+]] = bitcast i32* [[BASE2]] to i8*
-; CHECK-GEP: [[FULL2:%[a-zA-Z_0-9-]+]] = getelementptr i8, i8* [[BCC2]], i64 48
-; CHECK-GEP: [[ADDR2:%[a-zA-Z_0-9-]+]] = bitcast i8* [[FULL2]] to i32*
-; CHECK-GEP: load i32, i32* [[ADDR2]]
-; CHECK-GEP: ret
 define i32 @checkProfitability(i32 %arg1, i32 %arg2, i1 %test) {
   %shl = shl nsw i32 %arg1, 1
   %add1 = add nsw i32 %shl, %arg2
@@ -371,11 +355,10 @@ end:
 ; Use it at the starting point for the matching.
 ; CHECK: %conv.i = zext i16 [[PLAIN_OPND:%[.a-zA-Z_0-9-]+]] to i32
 ; CHECK-NEXT: [[PROMOTED_CONV:%[.a-zA-Z_0-9-]+]] = zext i16 [[PLAIN_OPND]] to i64
-; CHECK-NEXT: [[BASE:%[a-zA-Z_0-9-]+]] = ptrtoint %struct.dns_packet* %P to i64
-; CHECK-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = add i64 [[BASE]], [[PROMOTED_CONV]]
-; CHECK-NEXT: [[ADDR:%[a-zA-Z_0-9-]+]] = add i64 [[ADD]], 7
-; CHECK-NEXT: [[CAST:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[ADDR]] to i8*
-; CHECK-NEXT: load i8, i8* [[CAST]], align 1
+; CHECK-NEXT: [[BASE:%[a-zA-Z_0-9-]+]] = bitcast %struct.dns_packet* %P to i8*
+; CHECK-NEXT: [[ADD:%[a-zA-Z_0-9-]+]] = getelementptr i8, i8* [[BASE]], i64 [[PROMOTED_CONV]]
+; CHECK-NEXT: [[ADDR:%[a-zA-Z_0-9-]+]] = getelementptr i8, i8* [[ADD]], i64 7
+; CHECK-NEXT: load i8, i8* [[ADDR]], align 1
 define signext i16 @fn3(%struct.dns_packet* nocapture readonly %P) {
 entry:
   %tmp = getelementptr inbounds %struct.dns_packet, %struct.dns_packet* %P, i64 0, i32 2
diff --git a/test/CodeGen/X86/codegen-prepare-extload.ll b/test/CodeGen/X86/codegen-prepare-extload.ll
index c5c761ee63efffa87e852c0a9e8a867e6ca599c0..db5476ae1fe709d6d06099163a4a47bce6a755ea 100644
--- a/test/CodeGen/X86/codegen-prepare-extload.ll
+++ b/test/CodeGen/X86/codegen-prepare-extload.ll
@@ -264,8 +264,7 @@ false:
 ;    => We have one zext of %zextld left and we created one sext of %ld2.
 ; 2. We try to promote the operand of %sextaddza.
 ;    a. This creates one sext of %zexta and one of %zextld
-;    b. The sext of %zexta does not lead to any load, it stays here, even if it
-;       could have been combine with the zext of %a.
+;    b. The sext of %zexta can be combined with the zext of %a.
 ;    c. The sext of %zextld leads to %ld and can be combined with it. This is
 ;       done by promoting %zextld. This is fine with the current heuristic:
 ;       neutral.
@@ -287,16 +286,14 @@ false:
 ; OPTALL: [[LD:%[a-zA-Z_0-9-]+]] = load i8, i8* %addr1
 ; OPT-NEXT: [[ZEXTLD1_1:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
 ; OPT-NEXT: [[ZEXTLD1_2:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
-; OPT-NEXT: [[ZEXTLD1_3:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
 ; OPT-NEXT: [[LD2:%[a-zA-Z_0-9-]+]] = load i32, i32* %addr2
 ; OPT-NEXT: [[SEXTLD2:%[a-zA-Z_0-9-]+]] = sext i32 [[LD2]] to i64
-; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTLD2]], [[ZEXTLD1_1]]
-; We do not combine this one: see 2.b.
-; OPT-NEXT: [[ZEXTA:%[a-zA-Z_0-9-]+]] = zext i8 %a to i32
-; OPT-NEXT: [[SEXTZEXTA:%[a-zA-Z_0-9-]+]] = sext i32 [[ZEXTA]] to i64
-; OPT-NEXT: [[RESZA:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTZEXTA]], [[ZEXTLD1_3]]
+; OPT-NEXT: [[ZEXTLD1_3:%[a-zA-Z_0-9-]+]] = zext i8 [[LD]] to i64
+; OPT-NEXT: [[RES:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTLD2]], [[ZEXTLD1_3]]
+; OPT-NEXT: [[ZEXTLD1_4:%[a-zA-Z_0-9-]+]] = zext i8 %a to i64
+; OPT-NEXT: [[RESZA:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ZEXTLD1_4]], [[ZEXTLD1_2]]
 ; OPT-NEXT: [[SEXTB:%[a-zA-Z_0-9-]+]] = sext i32 %b to i64
-; OPT-NEXT: [[RESB:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTB]], [[ZEXTLD1_2]]
+; OPT-NEXT: [[RESB:%[a-zA-Z_0-9-]+]] = add nsw i64 [[SEXTB]], [[ZEXTLD1_1]]
 ;
 ; DISABLE: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32
 ; DISABLE: [[RES:%[a-zA-Z_0-9-]+]]  = sext i32 [[ADD]] to i64
diff --git a/test/CodeGen/X86/codegen-prepare.ll b/test/CodeGen/X86/codegen-prepare.ll
index e58bc22ef142140202842aa7a9192e312a7d42e9..9d7d3d376cdc256a56af818c88e7f10e536182e1 100644
--- a/test/CodeGen/X86/codegen-prepare.ll
+++ b/test/CodeGen/X86/codegen-prepare.ll
@@ -1,5 +1,4 @@
 ; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-pc-linux -addr-sink-using-gep=1 | FileCheck %s
 
 ; Check that the CodeGenPrepare Pass
 ; does not wrongly rewrite the address computed by Instruction %4
diff --git a/test/CodeGen/X86/combine-abs.ll b/test/CodeGen/X86/combine-abs.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ac8f790a2ead6b1718a255e9b7955925af2ec0b7
--- /dev/null
+++ b/test/CodeGen/X86/combine-abs.ll
@@ -0,0 +1,99 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s
+
+; fold (abs c1) -> c2
+define <4 x i32> @combine_v4i32_abs_constant() {
+; CHECK-LABEL: combine_v4i32_abs_constant:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} xmm0 = [0,1,3,2147483648]
+; CHECK-NEXT:    retq
+  %1 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> <i32 0, i32 -1, i32 3, i32 -2147483648>)
+  ret <4 x i32> %1
+}
+
+define <16 x i16> @combine_v16i16_abs_constant() {
+; CHECK-LABEL: combine_v16i16_abs_constant:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovaps {{.*#+}} ymm0 = [0,1,1,3,3,7,7,255,255,4096,4096,32767,32767,32768,32768,0]
+; CHECK-NEXT:    retq
+  %1 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> <i16 0, i16 1, i16 -1, i16 3, i16 -3, i16 7, i16 -7, i16 255, i16 -255, i16 4096, i16 -4096, i16 32767, i16 -32767, i16 -32768, i16 32768, i16 65536>)
+  ret <16 x i16> %1
+}
+
+; fold (abs (abs x)) -> (abs x)
+define <8 x i16> @combine_v8i16_abs_abs(<8 x i16> %a) {
+; CHECK-LABEL: combine_v8i16_abs_abs:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpabsw %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %a1 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a)
+  %n2 = sub <8 x i16> zeroinitializer, %a1
+  %c2 = icmp slt <8 x i16> %a1, zeroinitializer
+  %a2 = select <8 x i1> %c2, <8 x i16> %n2, <8 x i16> %a1
+  ret <8 x i16> %a2
+}
+
+define <32 x i8> @combine_v32i8_abs_abs(<32 x i8> %a) {
+; CHECK-LABEL: combine_v32i8_abs_abs:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpabsb %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %n1 = sub <32 x i8> zeroinitializer, %a
+  %b1 = icmp slt <32 x i8> %a, zeroinitializer
+  %a1 = select <32 x i1> %b1, <32 x i8> %n1, <32 x i8> %a
+  %a2 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a1)
+  ret <32 x i8> %a2
+}
+
+define <4 x i64> @combine_v4i64_abs_abs(<4 x i64> %a) {
+; CHECK-LABEL: combine_v4i64_abs_abs:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsrad $31, %ymm0, %ymm1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpsrad $31, %ymm0, %ymm1
+; CHECK-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; CHECK-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %n1 = sub <4 x i64> zeroinitializer, %a
+  %b1 = icmp slt <4 x i64> %a, zeroinitializer
+  %a1 = select <4 x i1> %b1, <4 x i64> %n1, <4 x i64> %a
+  %n2 = sub <4 x i64> zeroinitializer, %a1
+  %b2 = icmp sgt <4 x i64> %a1, zeroinitializer
+  %a2 = select <4 x i1> %b2, <4 x i64> %a1, <4 x i64> %n2
+  ret <4 x i64> %a2
+}
+
+; fold (abs x) -> x iff not-negative
+define <16 x i8> @combine_v16i8_abs_constant(<16 x i8> %a) {
+; CHECK-LABEL: combine_v16i8_abs_constant:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    vpabsb %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %1 = insertelement <16 x i8> undef, i8 15, i32 0
+  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
+  %3 = and <16 x i8> %a, %2
+  %4 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %3)
+  ret <16 x i8> %4
+}
+
+define <8 x i32> @combine_v8i32_abs_pos(<8 x i32> %a) {
+; CHECK-LABEL: combine_v8i32_abs_pos:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpsrld $1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %1 = lshr <8 x i32> %a, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %2 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %1)
+  ret <8 x i32> %2
+}
+
+declare <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8>) nounwind readnone
+declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone
+declare <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16>) nounwind readnone
+
+declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone
+declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
+declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
diff --git a/test/CodeGen/X86/combine-and.ll b/test/CodeGen/X86/combine-and.ll
index 6f310d9b7b123584060fde016ee09ced06ebae56..f30fa61bbfbe65abd671e23eaac3543402365705 100644
--- a/test/CodeGen/X86/combine-and.ll
+++ b/test/CodeGen/X86/combine-and.ll
@@ -245,3 +245,28 @@ define <4 x i32> @and_or_zext_v4i16(<4 x i16> %a0) {
   %3 = and <4 x i32> %2, <i32 65536, i32 65536, i32 65536, i32 65536>
   ret <4 x i32> %3
 }
+
+;
+; known sign bits folding
+;
+
+define <8 x i16> @ashr_mask1_v8i16(<8 x i16> %a0) {
+; CHECK-LABEL: ashr_mask1_v8i16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    psrlw $15, %xmm0
+; CHECK-NEXT:    retq
+  %1 = ashr <8 x i16> %a0, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
+  %2 = and <8 x i16> %1, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <8 x i16> %2
+}
+
+define <4 x i32> @ashr_mask7_v4i32(<4 x i32> %a0) {
+; CHECK-LABEL: ashr_mask7_v4i32:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    psrad $31, %xmm0
+; CHECK-NEXT:    psrld $29, %xmm0
+; CHECK-NEXT:    retq
+  %1 = ashr <4 x i32> %a0, <i32 31, i32 31, i32 31, i32 31>
+  %2 = and <4 x i32> %1, <i32 7, i32 7, i32 7, i32 7>
+  ret <4 x i32> %2
+}
diff --git a/test/CodeGen/X86/combine-fcopysign.ll b/test/CodeGen/X86/combine-fcopysign.ll
index 807ac4e3fc6b56c3e4646f01517ac88c58a2a269..43e09bfe5fea93706f8cd6ab23ce69d64f464712 100644
--- a/test/CodeGen/X86/combine-fcopysign.ll
+++ b/test/CodeGen/X86/combine-fcopysign.ll
@@ -292,7 +292,7 @@ define <4 x float> @combine_vec_fcopysign_fptrunc_sgn(<4 x float> %x, <4 x doubl
 ; SSE-NEXT:    cvtsd2ss %xmm1, %xmm1
 ; SSE-NEXT:    andps %xmm4, %xmm1
 ; SSE-NEXT:    orps %xmm6, %xmm1
-; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; SSE-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE-NEXT:    movaps %xmm3, %xmm1
 ; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
 ; SSE-NEXT:    andps %xmm5, %xmm1
diff --git a/test/CodeGen/X86/combine-shl.ll b/test/CodeGen/X86/combine-shl.ll
index 5cc7312de47ffff6272d7e4f86876290ea680a9c..3ad38f2717d947bc85436c2ec7ca0a7b63e68dba 100644
--- a/test/CodeGen/X86/combine-shl.ll
+++ b/test/CodeGen/X86/combine-shl.ll
@@ -243,11 +243,11 @@ define <8 x i32> @combine_vec_shl_ext_shl1(<8 x i16> %x) {
 define <8 x i32> @combine_vec_shl_zext_lshr0(<8 x i16> %x) {
 ; SSE-LABEL: combine_vec_shl_zext_lshr0:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE-NEXT:    movdqa %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_vec_shl_zext_lshr0:
@@ -270,15 +270,15 @@ define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) {
 ; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    psrlw $4, %xmm0
 ; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4,5,6],xmm1[7]
-; SSE-NEXT:    movdqa %xmm0, %xmm2
-; SSE-NEXT:    psrlw $2, %xmm2
-; SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2],xmm0[3,4],xmm2[5,6],xmm0[7]
-; SSE-NEXT:    movdqa %xmm2, %xmm1
-; SSE-NEXT:    psrlw $1, %xmm1
-; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7]
-; SSE-NEXT:    pxor %xmm2, %xmm2
-; SSE-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE-NEXT:    movdqa %xmm0, %xmm1
+; SSE-NEXT:    psrlw $2, %xmm1
+; SSE-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3,4],xmm1[5,6],xmm0[7]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    psrlw $1, %xmm0
+; SSE-NEXT:    pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5],xmm0[6],xmm1[7]
+; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    pmulld {{.*}}(%rip), %xmm1
 ; SSE-NEXT:    retq
@@ -288,7 +288,7 @@ define <8 x i32> @combine_vec_shl_zext_lshr1(<8 x i16> %x) {
 ; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [1,2,3,4,5,6,7,8]
 ; AVX-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
diff --git a/test/CodeGen/X86/combine-testm-and.ll b/test/CodeGen/X86/combine-testm-and.ll
index 2b95a114540d221924935da42b28e1e0d7e9e51a..b10a4b5ed29851ae74408f192b23610f1228e5d3 100644
--- a/test/CodeGen/X86/combine-testm-and.ll
+++ b/test/CodeGen/X86/combine-testm-and.ll
@@ -6,6 +6,7 @@ define i32 @combineTESTM_AND_1(<8 x i64> %a, <8 x i64> %b) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vptestmq %zmm0, %zmm1, %k0
 ; CHECK-NEXT:    kmovb %k0, %eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %and.i = and <8 x i64> %b, %a
   %test.i = tail call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %and.i, <8 x i64> %and.i, i8 -1)
@@ -16,9 +17,10 @@ define i32 @combineTESTM_AND_1(<8 x i64> %a, <8 x i64> %b) {
 define i32 @combineTESTM_AND_2(<8 x i64> %a, <8 x i64> %b , i8 %mask) {
 ; CHECK-LABEL: combineTESTM_AND_2:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vptestmq %zmm0, %zmm1, %k0 {%k1}
 ; CHECK-NEXT:    kmovb %k0, %eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %and.i = and <8 x i64> %b, %a
   %test.i = tail call i8 @llvm.x86.avx512.ptestm.q.512(<8 x i64> %and.i, <8 x i64> %and.i, i8 %mask)
@@ -29,9 +31,10 @@ define i32 @combineTESTM_AND_2(<8 x i64> %a, <8 x i64> %b , i8 %mask) {
 define i32 @combineTESTM_AND_mask_3(<8 x i64> %a, <8 x i64>* %bptr , i8 %mask) {
 ; CHECK-LABEL: combineTESTM_AND_mask_3:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vptestmq (%rdi), %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovb %k0, %eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %b = load <8 x i64>, <8 x i64>* %bptr
   %and.i = and <8 x i64> %a, %b
@@ -43,9 +46,10 @@ define i32 @combineTESTM_AND_mask_3(<8 x i64> %a, <8 x i64>* %bptr , i8 %mask) {
 define i32 @combineTESTM_AND_mask_4(<8 x i64> %a, <8 x i64>* %bptr , i8 %mask) {
 ; CHECK-LABEL: combineTESTM_AND_mask_4:
 ; CHECK:       ## BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovd %esi, %k1
 ; CHECK-NEXT:    vptestmq (%rdi), %zmm0, %k0 {%k1}
 ; CHECK-NEXT:    kmovb %k0, %eax
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %b = load <8 x i64>, <8 x i64>* %bptr
   %and.i = and <8 x i64> %b, %a
diff --git a/test/CodeGen/X86/combiner-aa-0.ll b/test/CodeGen/X86/combiner-aa-0.ll
deleted file mode 100644
index 403059d90ab1f10497f708a13d69f5ae1f5eae56..0000000000000000000000000000000000000000
--- a/test/CodeGen/X86/combiner-aa-0.ll
+++ /dev/null
@@ -1,20 +0,0 @@
-; RUN: llc < %s -march=x86-64 -combiner-global-alias-analysis -combiner-alias-analysis
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
-	%struct.Hash_Key = type { [4 x i32], i32 }
-@g_flipV_hashkey = external global %struct.Hash_Key, align 16		; <%struct.Hash_Key*> [#uses=1]
-
-define void @foo() nounwind {
-	%t0 = load i32, i32* undef, align 16		; <i32> [#uses=1]
-	%t1 = load i32, i32* null, align 4		; <i32> [#uses=1]
-	%t2 = srem i32 %t0, 32		; <i32> [#uses=1]
-	%t3 = shl i32 1, %t2		; <i32> [#uses=1]
-	%t4 = xor i32 %t3, %t1		; <i32> [#uses=1]
-	store i32 %t4, i32* null, align 4
-	%t5 = getelementptr %struct.Hash_Key, %struct.Hash_Key* @g_flipV_hashkey, i64 0, i32 0, i64 0		; <i32*> [#uses=2]
-	%t6 = load i32, i32* %t5, align 4		; <i32> [#uses=1]
-	%t7 = shl i32 1, undef		; <i32> [#uses=1]
-	%t8 = xor i32 %t7, %t6		; <i32> [#uses=1]
-	store i32 %t8, i32* %t5, align 4
-	unreachable
-}
diff --git a/test/CodeGen/X86/combiner-aa-1.ll b/test/CodeGen/X86/combiner-aa-1.ll
deleted file mode 100644
index cc3e5ca12602b94e29ac099b5785de79b72dac5c..0000000000000000000000000000000000000000
--- a/test/CodeGen/X86/combiner-aa-1.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: llc < %s --combiner-alias-analysis --combiner-global-alias-analysis
-; PR4880
-
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
-target triple = "i386-pc-linux-gnu"
-
-%struct.alst_node = type { %struct.node }
-%struct.arg_node = type { %struct.node, i8*, %struct.alst_node* }
-%struct.arglst_node = type { %struct.alst_node, %struct.arg_node*, %struct.arglst_node* }
-%struct.lam_node = type { %struct.alst_node, %struct.arg_node*, %struct.alst_node* }
-%struct.node = type { i32 (...)**, %struct.node* }
-
-define i32 @._ZN8lam_node18resolve_name_clashEP8arg_nodeP9alst_node._ZNK8lam_nodeeqERK8exp_node._ZN11arglst_nodeD0Ev(%struct.lam_node* %this.this, %struct.arg_node* %outer_arg, %struct.alst_node* %env.cmp, %struct.arglst_node* %this, i32 %functionID) {
-comb_entry:
-  %.SV59 = alloca %struct.node*                   ; <%struct.node**> [#uses=1]
-  %0 = load i32 (...)**, i32 (...)*** null, align 4            ; <i32 (...)**> [#uses=1]
-  %1 = getelementptr inbounds i32 (...)*, i32 (...)** %0, i32 3 ; <i32 (...)**> [#uses=1]
-  %2 = load i32 (...)*, i32 (...)** %1, align 4               ; <i32 (...)*> [#uses=1]
-  store %struct.node* undef, %struct.node** %.SV59
-  %3 = bitcast i32 (...)* %2 to i32 (%struct.node*)* ; <i32 (%struct.node*)*> [#uses=1]
-  %4 = tail call i32 %3(%struct.node* undef)      ; <i32> [#uses=0]
-  unreachable
-}
diff --git a/test/CodeGen/X86/commute-3dnow.ll b/test/CodeGen/X86/commute-3dnow.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b7a01efe2d3a0ac83a5a19626b27157fd8349def
--- /dev/null
+++ b/test/CodeGen/X86/commute-3dnow.ll
@@ -0,0 +1,270 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+3dnow | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+3dnow | FileCheck %s --check-prefix=X64
+
+define void @commute_m_pfadd(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind {
+; X32-LABEL: commute_m_pfadd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movq (%edx), %mm0
+; X32-NEXT:    pfadd (%eax), %mm0
+; X32-NEXT:    pfadd (%ecx), %mm0
+; X32-NEXT:    movq %mm0, (%ecx)
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_m_pfadd:
+; X64:       # BB#0:
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    pfadd (%rsi), %mm0
+; X64-NEXT:    pfadd (%rdx), %mm0
+; X64-NEXT:    movq %mm0, (%rdx)
+; X64-NEXT:    retq
+  %1 = load x86_mmx, x86_mmx* %a0
+  %2 = load x86_mmx, x86_mmx* %a1
+  %3 = load x86_mmx, x86_mmx* %a2
+  %4 = tail call x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx %1, x86_mmx %2)
+  %5 = tail call x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx %3, x86_mmx %4)
+  store x86_mmx %5, x86_mmx* %a2
+  ret void
+}
+declare x86_mmx @llvm.x86.3dnow.pfadd(x86_mmx, x86_mmx)
+
+define void @commute_m_pfsub(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind {
+; X32-LABEL: commute_m_pfsub:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movq (%edx), %mm0
+; X32-NEXT:    pfsub (%eax), %mm0
+; X32-NEXT:    pfsubr (%ecx), %mm0
+; X32-NEXT:    movq %mm0, (%ecx)
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_m_pfsub:
+; X64:       # BB#0:
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    pfsub (%rsi), %mm0
+; X64-NEXT:    pfsubr (%rdx), %mm0
+; X64-NEXT:    movq %mm0, (%rdx)
+; X64-NEXT:    retq
+  %1 = load x86_mmx, x86_mmx* %a0
+  %2 = load x86_mmx, x86_mmx* %a1
+  %3 = load x86_mmx, x86_mmx* %a2
+  %4 = tail call x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx %1, x86_mmx %2)
+  %5 = tail call x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx %3, x86_mmx %4)
+  store x86_mmx %5, x86_mmx* %a2
+  ret void
+}
+declare x86_mmx @llvm.x86.3dnow.pfsub(x86_mmx, x86_mmx)
+
+define void @commute_m_pfsubr(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind {
+; X32-LABEL: commute_m_pfsubr:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movq (%edx), %mm0
+; X32-NEXT:    pfsubr (%eax), %mm0
+; X32-NEXT:    pfsub (%ecx), %mm0
+; X32-NEXT:    movq %mm0, (%ecx)
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_m_pfsubr:
+; X64:       # BB#0:
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    pfsubr (%rsi), %mm0
+; X64-NEXT:    pfsub (%rdx), %mm0
+; X64-NEXT:    movq %mm0, (%rdx)
+; X64-NEXT:    retq
+  %1 = load x86_mmx, x86_mmx* %a0
+  %2 = load x86_mmx, x86_mmx* %a1
+  %3 = load x86_mmx, x86_mmx* %a2
+  %4 = tail call x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx %1, x86_mmx %2)
+  %5 = tail call x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx %3, x86_mmx %4)
+  store x86_mmx %5, x86_mmx* %a2
+  ret void
+}
+declare x86_mmx @llvm.x86.3dnow.pfsubr(x86_mmx, x86_mmx)
+
+define void @commute_m_pfmul(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind {
+; X32-LABEL: commute_m_pfmul:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movq (%edx), %mm0
+; X32-NEXT:    pfmul (%eax), %mm0
+; X32-NEXT:    pfmul (%ecx), %mm0
+; X32-NEXT:    movq %mm0, (%ecx)
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_m_pfmul:
+; X64:       # BB#0:
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    pfmul (%rsi), %mm0
+; X64-NEXT:    pfmul (%rdx), %mm0
+; X64-NEXT:    movq %mm0, (%rdx)
+; X64-NEXT:    retq
+  %1 = load x86_mmx, x86_mmx* %a0
+  %2 = load x86_mmx, x86_mmx* %a1
+  %3 = load x86_mmx, x86_mmx* %a2
+  %4 = tail call x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx %1, x86_mmx %2)
+  %5 = tail call x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx %3, x86_mmx %4)
+  store x86_mmx %5, x86_mmx* %a2
+  ret void
+}
+declare x86_mmx @llvm.x86.3dnow.pfmul(x86_mmx, x86_mmx)
+
+; PFMAX can't commute without fast-math.
+define void @commute_m_pfmax(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind {
+; X32-LABEL: commute_m_pfmax:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movq (%edx), %mm0
+; X32-NEXT:    movq (%ecx), %mm1
+; X32-NEXT:    pfmax (%eax), %mm0
+; X32-NEXT:    pfmax %mm0, %mm1
+; X32-NEXT:    movq %mm1, (%ecx)
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_m_pfmax:
+; X64:       # BB#0:
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    movq (%rdx), %mm1
+; X64-NEXT:    pfmax (%rsi), %mm0
+; X64-NEXT:    pfmax %mm0, %mm1
+; X64-NEXT:    movq %mm1, (%rdx)
+; X64-NEXT:    retq
+  %1 = load x86_mmx, x86_mmx* %a0
+  %2 = load x86_mmx, x86_mmx* %a1
+  %3 = load x86_mmx, x86_mmx* %a2
+  %4 = tail call x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx %1, x86_mmx %2)
+  %5 = tail call x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx %3, x86_mmx %4)
+  store x86_mmx %5, x86_mmx* %a2
+  ret void
+}
+declare x86_mmx @llvm.x86.3dnow.pfmax(x86_mmx, x86_mmx)
+
+; PFMIN can't commute without fast-math.
+define void @commute_m_pfmin(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind {
+; X32-LABEL: commute_m_pfmin:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movq (%edx), %mm0
+; X32-NEXT:    movq (%ecx), %mm1
+; X32-NEXT:    pfmin (%eax), %mm0
+; X32-NEXT:    pfmin %mm0, %mm1
+; X32-NEXT:    movq %mm1, (%ecx)
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_m_pfmin:
+; X64:       # BB#0:
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    movq (%rdx), %mm1
+; X64-NEXT:    pfmin (%rsi), %mm0
+; X64-NEXT:    pfmin %mm0, %mm1
+; X64-NEXT:    movq %mm1, (%rdx)
+; X64-NEXT:    retq
+  %1 = load x86_mmx, x86_mmx* %a0
+  %2 = load x86_mmx, x86_mmx* %a1
+  %3 = load x86_mmx, x86_mmx* %a2
+  %4 = tail call x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx %1, x86_mmx %2)
+  %5 = tail call x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx %3, x86_mmx %4)
+  store x86_mmx %5, x86_mmx* %a2
+  ret void
+}
+declare x86_mmx @llvm.x86.3dnow.pfmin(x86_mmx, x86_mmx)
+
+define void @commute_m_pfcmpeq(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind {
+; X32-LABEL: commute_m_pfcmpeq:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movq (%edx), %mm0
+; X32-NEXT:    pfcmpeq (%eax), %mm0
+; X32-NEXT:    pfcmpeq (%ecx), %mm0
+; X32-NEXT:    movq %mm0, (%ecx)
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_m_pfcmpeq:
+; X64:       # BB#0:
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    pfcmpeq (%rsi), %mm0
+; X64-NEXT:    pfcmpeq (%rdx), %mm0
+; X64-NEXT:    movq %mm0, (%rdx)
+; X64-NEXT:    retq
+  %1 = load x86_mmx, x86_mmx* %a0
+  %2 = load x86_mmx, x86_mmx* %a1
+  %3 = load x86_mmx, x86_mmx* %a2
+  %4 = tail call x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx %1, x86_mmx %2)
+  %5 = tail call x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx %3, x86_mmx %4)
+  store x86_mmx %5, x86_mmx* %a2
+  ret void
+}
+declare x86_mmx @llvm.x86.3dnow.pfcmpeq(x86_mmx, x86_mmx)
+
+define void @commute_m_pavgusb(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind {
+; X32-LABEL: commute_m_pavgusb:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movq (%edx), %mm0
+; X32-NEXT:    pavgusb (%eax), %mm0
+; X32-NEXT:    pavgusb (%ecx), %mm0
+; X32-NEXT:    movq %mm0, (%ecx)
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_m_pavgusb:
+; X64:       # BB#0:
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    pavgusb (%rsi), %mm0
+; X64-NEXT:    pavgusb (%rdx), %mm0
+; X64-NEXT:    movq %mm0, (%rdx)
+; X64-NEXT:    retq
+  %1 = load x86_mmx, x86_mmx* %a0
+  %2 = load x86_mmx, x86_mmx* %a1
+  %3 = load x86_mmx, x86_mmx* %a2
+  %4 = tail call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %1, x86_mmx %2)
+  %5 = tail call x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx %3, x86_mmx %4)
+  store x86_mmx %5, x86_mmx* %a2
+  ret void
+}
+declare x86_mmx @llvm.x86.3dnow.pavgusb(x86_mmx, x86_mmx)
+
+define void @commute_m_pmulhrw(x86_mmx *%a0, x86_mmx *%a1, x86_mmx *%a2) nounwind {
+; X32-LABEL: commute_m_pmulhrw:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movq (%edx), %mm0
+; X32-NEXT:    pmulhrw (%eax), %mm0
+; X32-NEXT:    pmulhrw (%ecx), %mm0
+; X32-NEXT:    movq %mm0, (%ecx)
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_m_pmulhrw:
+; X64:       # BB#0:
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    pmulhrw (%rsi), %mm0
+; X64-NEXT:    pmulhrw (%rdx), %mm0
+; X64-NEXT:    movq %mm0, (%rdx)
+; X64-NEXT:    retq
+  %1 = load x86_mmx, x86_mmx* %a0
+  %2 = load x86_mmx, x86_mmx* %a1
+  %3 = load x86_mmx, x86_mmx* %a2
+  %4 = tail call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %1, x86_mmx %2)
+  %5 = tail call x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx %3, x86_mmx %4)
+  store x86_mmx %5, x86_mmx* %a2
+  ret void
+}
+declare x86_mmx @llvm.x86.3dnow.pmulhrw(x86_mmx, x86_mmx)
diff --git a/test/CodeGen/X86/commute-clmul.ll b/test/CodeGen/X86/commute-clmul.ll
index d13911abc864fe3ced6f40d3095a55845ced4a39..84d9a914c9bbf7c4cc500059de28af5d513a9b49 100644
--- a/test/CodeGen/X86/commute-clmul.ll
+++ b/test/CodeGen/X86/commute-clmul.ll
@@ -1,59 +1,64 @@
-; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=x86-64 -mattr=+sse2,+pclmul < %s | FileCheck %s --check-prefix=SSE
-; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=x86-64 -mattr=+avx2,+pclmul < %s | FileCheck %s --check-prefix=AVX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+pclmul | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,+pclmul | FileCheck %s --check-prefix=AVX
 
 declare <2 x i64> @llvm.x86.pclmulqdq(<2 x i64>, <2 x i64>, i8) nounwind readnone
 
 define <2 x i64> @commute_lq_lq(<2 x i64>* %a0, <2 x i64> %a1) #0 {
-  ;SSE-LABEL: commute_lq_lq
-  ;SSE:       pclmulqdq $0, (%rdi), %xmm0
-  ;SSE-NEXT:  retq
-
-  ;AVX-LABEL: commute_lq_lq
-  ;AVX:       vpclmulqdq $0, (%rdi), %xmm0, %xmm0
-  ;AVX-NEXT:  retq
-
+; SSE-LABEL: commute_lq_lq:
+; SSE:       # BB#0:
+; SSE-NEXT:    pclmulqdq $0, (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: commute_lq_lq:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpclmulqdq $0, (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = load <2 x i64>, <2 x i64>* %a0
   %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %1, <2 x i64> %a1, i8 0)
   ret <2 x i64> %2
 }
 
 define <2 x i64> @commute_lq_hq(<2 x i64>* %a0, <2 x i64> %a1) #0 {
-  ;SSE-LABEL: commute_lq_hq
-  ;SSE:       pclmulqdq $1, (%rdi), %xmm0
-  ;SSE-NEXT:  retq
-
-  ;AVX-LABEL: commute_lq_hq
-  ;AVX:       vpclmulqdq $1, (%rdi), %xmm0, %xmm0
-  ;AVX-NEXT:  retq
-
+; SSE-LABEL: commute_lq_hq:
+; SSE:       # BB#0:
+; SSE-NEXT:    pclmulqdq $1, (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: commute_lq_hq:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpclmulqdq $1, (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = load <2 x i64>, <2 x i64>* %a0
   %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %1, <2 x i64> %a1, i8 16)
   ret <2 x i64> %2
 }
 
 define <2 x i64> @commute_hq_lq(<2 x i64>* %a0, <2 x i64> %a1) #0 {
-  ;SSE-LABEL: commute_hq_lq
-  ;SSE:       pclmulqdq $16, (%rdi), %xmm0
-  ;SSE-NEXT:  retq
-
-  ;AVX-LABEL: commute_hq_lq
-  ;AVX:       vpclmulqdq $16, (%rdi), %xmm0, %xmm0
-  ;AVX-NEXT:  retq
-
+; SSE-LABEL: commute_hq_lq:
+; SSE:       # BB#0:
+; SSE-NEXT:    pclmulqdq $16, (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: commute_hq_lq:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpclmulqdq $16, (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = load <2 x i64>, <2 x i64>* %a0
   %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %1, <2 x i64> %a1, i8 1)
   ret <2 x i64> %2
 }
 
 define <2 x i64> @commute_hq_hq(<2 x i64>* %a0, <2 x i64> %a1) #0 {
-  ;SSE-LABEL: commute_hq_hq
-  ;SSE:       pclmulqdq $17, (%rdi), %xmm0
-  ;SSE-NEXT:  retq
-
-  ;AVX-LABEL: commute_hq_hq
-  ;AVX:       vpclmulqdq $17, (%rdi), %xmm0, %xmm0
-  ;AVX-NEXT:  retq
-
+; SSE-LABEL: commute_hq_hq:
+; SSE:       # BB#0:
+; SSE-NEXT:    pclmulqdq $17, (%rdi), %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: commute_hq_hq:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpclmulqdq $17, (%rdi), %xmm0, %xmm0
+; AVX-NEXT:    retq
   %1 = load <2 x i64>, <2 x i64>* %a0
   %2 = call <2 x i64> @llvm.x86.pclmulqdq(<2 x i64> %1, <2 x i64> %a1, i8 17)
   ret <2 x i64> %2
diff --git a/test/CodeGen/X86/commute-fcmp.ll b/test/CodeGen/X86/commute-fcmp.ll
index 4274d1feaa3bf097808aa01ce4c010c759fe82e7..f05fb805b411caec34738d0bcc4f39cd0b10fee4 100644
--- a/test/CodeGen/X86/commute-fcmp.ll
+++ b/test/CodeGen/X86/commute-fcmp.ll
@@ -1,6 +1,6 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=x86-64 -mattr=+sse2 < %s | FileCheck %s --check-prefix=SSE
-; RUN: llc -O3 -mtriple=x86_64-unknown -mcpu=x86-64 -mattr=+avx2 < %s | FileCheck %s --check-prefix=AVX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX
 
 ;
 ; Float Comparisons
@@ -17,7 +17,6 @@ define <4 x i32> @commute_cmpps_eq(<4 x float>* %a0, <4 x float> %a1) {
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpeqps (%rdi), %xmm0, %xmm0
 ; AVX-NEXT:    retq
-;
   %1 = load <4 x float>, <4 x float>* %a0
   %2 = fcmp oeq <4 x float> %1, %a1
   %3 = sext <4 x i1> %2 to <4 x i32>
@@ -34,7 +33,6 @@ define <4 x i32> @commute_cmpps_ne(<4 x float>* %a0, <4 x float> %a1) {
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpneqps (%rdi), %xmm0, %xmm0
 ; AVX-NEXT:    retq
-;
   %1 = load <4 x float>, <4 x float>* %a0
   %2 = fcmp une <4 x float> %1, %a1
   %3 = sext <4 x i1> %2 to <4 x i32>
@@ -51,7 +49,6 @@ define <4 x i32> @commute_cmpps_ord(<4 x float>* %a0, <4 x float> %a1) {
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpordps (%rdi), %xmm0, %xmm0
 ; AVX-NEXT:    retq
-;
   %1 = load <4 x float>, <4 x float>* %a0
   %2 = fcmp ord <4 x float> %1, %a1
   %3 = sext <4 x i1> %2 to <4 x i32>
@@ -68,7 +65,6 @@ define <4 x i32> @commute_cmpps_uno(<4 x float>* %a0, <4 x float> %a1) {
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpunordps (%rdi), %xmm0, %xmm0
 ; AVX-NEXT:    retq
-;
   %1 = load <4 x float>, <4 x float>* %a0
   %2 = fcmp uno <4 x float> %1, %a1
   %3 = sext <4 x i1> %2 to <4 x i32>
@@ -92,7 +88,6 @@ define <4 x i32> @commute_cmpps_ueq(<4 x float>* %a0, <4 x float> %a1) {
 ; AVX-NEXT:    vcmpunordps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vorps %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    retq
-;
   %1 = load <4 x float>, <4 x float>* %a0
   %2 = fcmp ueq <4 x float> %1, %a1
   %3 = sext <4 x i1> %2 to <4 x i32>
@@ -116,7 +111,6 @@ define <4 x i32> @commute_cmpps_one(<4 x float>* %a0, <4 x float> %a1) {
 ; AVX-NEXT:    vcmpordps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vandps %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    retq
-;
   %1 = load <4 x float>, <4 x float>* %a0
   %2 = fcmp one <4 x float> %1, %a1
   %3 = sext <4 x i1> %2 to <4 x i32>
@@ -136,7 +130,6 @@ define <4 x i32> @commute_cmpps_lt(<4 x float>* %a0, <4 x float> %a1) {
 ; AVX-NEXT:    vmovaps (%rdi), %xmm1
 ; AVX-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
-;
   %1 = load <4 x float>, <4 x float>* %a0
   %2 = fcmp olt <4 x float> %1, %a1
   %3 = sext <4 x i1> %2 to <4 x i32>
@@ -156,7 +149,6 @@ define <4 x i32> @commute_cmpps_le(<4 x float>* %a0, <4 x float> %a1) {
 ; AVX-NEXT:    vmovaps (%rdi), %xmm1
 ; AVX-NEXT:    vcmpleps %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
-;
   %1 = load <4 x float>, <4 x float>* %a0
   %2 = fcmp ole <4 x float> %1, %a1
   %3 = sext <4 x i1> %2 to <4 x i32>
@@ -174,7 +166,6 @@ define <8 x i32> @commute_cmpps_eq_ymm(<8 x float>* %a0, <8 x float> %a1) {
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpeqps (%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    retq
-;
   %1 = load <8 x float>, <8 x float>* %a0
   %2 = fcmp oeq <8 x float> %1, %a1
   %3 = sext <8 x i1> %2 to <8 x i32>
@@ -192,7 +183,6 @@ define <8 x i32> @commute_cmpps_ne_ymm(<8 x float>* %a0, <8 x float> %a1) {
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpneqps (%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    retq
-;
   %1 = load <8 x float>, <8 x float>* %a0
   %2 = fcmp une <8 x float> %1, %a1
   %3 = sext <8 x i1> %2 to <8 x i32>
@@ -210,7 +200,6 @@ define <8 x i32> @commute_cmpps_ord_ymm(<8 x float>* %a0, <8 x float> %a1) {
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpordps (%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    retq
-;
   %1 = load <8 x float>, <8 x float>* %a0
   %2 = fcmp ord <8 x float> %1, %a1
   %3 = sext <8 x i1> %2 to <8 x i32>
@@ -228,7 +217,6 @@ define <8 x i32> @commute_cmpps_uno_ymm(<8 x float>* %a0, <8 x float> %a1) {
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpunordps (%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    retq
-;
   %1 = load <8 x float>, <8 x float>* %a0
   %2 = fcmp uno <8 x float> %1, %a1
   %3 = sext <8 x i1> %2 to <8 x i32>
@@ -257,7 +245,6 @@ define <8 x i32> @commute_cmpps_ueq_ymm(<8 x float>* %a0, <8 x float> %a1) {
 ; AVX-NEXT:    vcmpunordps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vorps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    retq
-;
   %1 = load <8 x float>, <8 x float>* %a0
   %2 = fcmp ueq <8 x float> %1, %a1
   %3 = sext <8 x i1> %2 to <8 x i32>
@@ -286,7 +273,6 @@ define <8 x i32> @commute_cmpps_one_ymm(<8 x float>* %a0, <8 x float> %a1) {
 ; AVX-NEXT:    vcmpordps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vandps %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    retq
-;
   %1 = load <8 x float>, <8 x float>* %a0
   %2 = fcmp one <8 x float> %1, %a1
   %3 = sext <8 x i1> %2 to <8 x i32>
@@ -309,7 +295,6 @@ define <8 x i32> @commute_cmpps_lt_ymm(<8 x float>* %a0, <8 x float> %a1) {
 ; AVX-NEXT:    vmovaps (%rdi), %ymm1
 ; AVX-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    retq
-;
   %1 = load <8 x float>, <8 x float>* %a0
   %2 = fcmp olt <8 x float> %1, %a1
   %3 = sext <8 x i1> %2 to <8 x i32>
@@ -332,7 +317,6 @@ define <8 x i32> @commute_cmpps_le_ymm(<8 x float>* %a0, <8 x float> %a1) {
 ; AVX-NEXT:    vmovaps (%rdi), %ymm1
 ; AVX-NEXT:    vcmpleps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    retq
-;
   %1 = load <8 x float>, <8 x float>* %a0
   %2 = fcmp ole <8 x float> %1, %a1
   %3 = sext <8 x i1> %2 to <8 x i32>
@@ -354,7 +338,6 @@ define <2 x i64> @commute_cmppd_eq(<2 x double>* %a0, <2 x double> %a1) {
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpeqpd (%rdi), %xmm0, %xmm0
 ; AVX-NEXT:    retq
-;
   %1 = load <2 x double>, <2 x double>* %a0
   %2 = fcmp oeq <2 x double> %1, %a1
   %3 = sext <2 x i1> %2 to <2 x i64>
@@ -371,7 +354,6 @@ define <2 x i64> @commute_cmppd_ne(<2 x double>* %a0, <2 x double> %a1) {
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpneqpd (%rdi), %xmm0, %xmm0
 ; AVX-NEXT:    retq
-;
   %1 = load <2 x double>, <2 x double>* %a0
   %2 = fcmp une <2 x double> %1, %a1
   %3 = sext <2 x i1> %2 to <2 x i64>
@@ -388,7 +370,6 @@ define <2 x i64> @commute_cmppd_ord(<2 x double>* %a0, <2 x double> %a1) {
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpordpd (%rdi), %xmm0, %xmm0
 ; AVX-NEXT:    retq
-;
   %1 = load <2 x double>, <2 x double>* %a0
   %2 = fcmp ord <2 x double> %1, %a1
   %3 = sext <2 x i1> %2 to <2 x i64>
@@ -412,7 +393,6 @@ define <2 x i64> @commute_cmppd_ueq(<2 x double>* %a0, <2 x double> %a1) {
 ; AVX-NEXT:    vcmpunordpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vorpd %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    retq
-;
   %1 = load <2 x double>, <2 x double>* %a0
   %2 = fcmp ueq <2 x double> %1, %a1
   %3 = sext <2 x i1> %2 to <2 x i64>
@@ -436,7 +416,6 @@ define <2 x i64> @commute_cmppd_one(<2 x double>* %a0, <2 x double> %a1) {
 ; AVX-NEXT:    vcmpordpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vandpd %xmm2, %xmm0, %xmm0
 ; AVX-NEXT:    retq
-;
   %1 = load <2 x double>, <2 x double>* %a0
   %2 = fcmp one <2 x double> %1, %a1
   %3 = sext <2 x i1> %2 to <2 x i64>
@@ -453,7 +432,6 @@ define <2 x i64> @commute_cmppd_uno(<2 x double>* %a0, <2 x double> %a1) {
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpunordpd (%rdi), %xmm0, %xmm0
 ; AVX-NEXT:    retq
-;
   %1 = load <2 x double>, <2 x double>* %a0
   %2 = fcmp uno <2 x double> %1, %a1
   %3 = sext <2 x i1> %2 to <2 x i64>
@@ -473,7 +451,6 @@ define <2 x i64> @commute_cmppd_lt(<2 x double>* %a0, <2 x double> %a1) {
 ; AVX-NEXT:    vmovapd (%rdi), %xmm1
 ; AVX-NEXT:    vcmpltpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
-;
   %1 = load <2 x double>, <2 x double>* %a0
   %2 = fcmp olt <2 x double> %1, %a1
   %3 = sext <2 x i1> %2 to <2 x i64>
@@ -493,7 +470,6 @@ define <2 x i64> @commute_cmppd_le(<2 x double>* %a0, <2 x double> %a1) {
 ; AVX-NEXT:    vmovapd (%rdi), %xmm1
 ; AVX-NEXT:    vcmplepd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
-;
   %1 = load <2 x double>, <2 x double>* %a0
   %2 = fcmp ole <2 x double> %1, %a1
   %3 = sext <2 x i1> %2 to <2 x i64>
@@ -511,7 +487,6 @@ define <4 x i64> @commute_cmppd_eq_ymmm(<4 x double>* %a0, <4 x double> %a1) {
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpeqpd (%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    retq
-;
   %1 = load <4 x double>, <4 x double>* %a0
   %2 = fcmp oeq <4 x double> %1, %a1
   %3 = sext <4 x i1> %2 to <4 x i64>
@@ -529,7 +504,6 @@ define <4 x i64> @commute_cmppd_ne_ymmm(<4 x double>* %a0, <4 x double> %a1) {
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpneqpd (%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    retq
-;
   %1 = load <4 x double>, <4 x double>* %a0
   %2 = fcmp une <4 x double> %1, %a1
   %3 = sext <4 x i1> %2 to <4 x i64>
@@ -547,7 +521,6 @@ define <4 x i64> @commute_cmppd_ord_ymmm(<4 x double>* %a0, <4 x double> %a1) {
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpordpd (%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    retq
-;
   %1 = load <4 x double>, <4 x double>* %a0
   %2 = fcmp ord <4 x double> %1, %a1
   %3 = sext <4 x i1> %2 to <4 x i64>
@@ -565,7 +538,6 @@ define <4 x i64> @commute_cmppd_uno_ymmm(<4 x double>* %a0, <4 x double> %a1) {
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpunordpd (%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    retq
-;
   %1 = load <4 x double>, <4 x double>* %a0
   %2 = fcmp uno <4 x double> %1, %a1
   %3 = sext <4 x i1> %2 to <4 x i64>
@@ -594,7 +566,6 @@ define <4 x i64> @commute_cmppd_ueq_ymmm(<4 x double>* %a0, <4 x double> %a1) {
 ; AVX-NEXT:    vcmpunordpd %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vorpd %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    retq
-;
   %1 = load <4 x double>, <4 x double>* %a0
   %2 = fcmp ueq <4 x double> %1, %a1
   %3 = sext <4 x i1> %2 to <4 x i64>
@@ -623,7 +594,6 @@ define <4 x i64> @commute_cmppd_one_ymmm(<4 x double>* %a0, <4 x double> %a1) {
 ; AVX-NEXT:    vcmpordpd %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vandpd %ymm2, %ymm0, %ymm0
 ; AVX-NEXT:    retq
-;
   %1 = load <4 x double>, <4 x double>* %a0
   %2 = fcmp one <4 x double> %1, %a1
   %3 = sext <4 x i1> %2 to <4 x i64>
@@ -646,7 +616,6 @@ define <4 x i64> @commute_cmppd_lt_ymmm(<4 x double>* %a0, <4 x double> %a1) {
 ; AVX-NEXT:    vmovapd (%rdi), %ymm1
 ; AVX-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    retq
-;
   %1 = load <4 x double>, <4 x double>* %a0
   %2 = fcmp olt <4 x double> %1, %a1
   %3 = sext <4 x i1> %2 to <4 x i64>
@@ -669,7 +638,6 @@ define <4 x i64> @commute_cmppd_le_ymmm(<4 x double>* %a0, <4 x double> %a1) {
 ; AVX-NEXT:    vmovapd (%rdi), %ymm1
 ; AVX-NEXT:    vcmplepd %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    retq
-;
   %1 = load <4 x double>, <4 x double>* %a0
   %2 = fcmp ole <4 x double> %1, %a1
   %3 = sext <4 x i1> %2 to <4 x i64>
diff --git a/test/CodeGen/X86/commute-xop.ll b/test/CodeGen/X86/commute-xop.ll
index e551d9bfc78fbd21e48d576e150b36f10c05cdd2..4043155ba8d4888eca64b9f60b79fc403af8440a 100644
--- a/test/CodeGen/X86/commute-xop.ll
+++ b/test/CodeGen/X86/commute-xop.ll
@@ -1,8 +1,18 @@
-; RUN: llc -O3 -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx,+xop < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+xop | FileCheck %s --check-prefix=X64
 
 define <16 x i8> @commute_fold_vpcomb(<16 x i8>* %a0, <16 x i8> %a1) {
-  ;CHECK-LABEL: commute_fold_vpcomb
-  ;CHECK:       vpcomgtb (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpcomb:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpcomgtb (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpcomb:
+; X64:       # BB#0:
+; X64-NEXT:    vpcomgtb (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <16 x i8>, <16 x i8>* %a0
   %2 = call <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8> %1, <16 x i8> %a1, i8 0) ; vpcomltb
   ret <16 x i8> %2
@@ -10,8 +20,16 @@ define <16 x i8> @commute_fold_vpcomb(<16 x i8>* %a0, <16 x i8> %a1) {
 declare <16 x i8> @llvm.x86.xop.vpcomb(<16 x i8>, <16 x i8>, i8) nounwind readnone
 
 define <4 x i32> @commute_fold_vpcomd(<4 x i32>* %a0, <4 x i32> %a1) {
-  ;CHECK-LABEL: commute_fold_vpcomd
-  ;CHECK:       vpcomged (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpcomd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpcomged (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpcomd:
+; X64:       # BB#0:
+; X64-NEXT:    vpcomged (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <4 x i32>, <4 x i32>* %a0
   %2 = call <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32> %1, <4 x i32> %a1, i8 1) ; vpcomled
   ret <4 x i32> %2
@@ -19,8 +37,16 @@ define <4 x i32> @commute_fold_vpcomd(<4 x i32>* %a0, <4 x i32> %a1) {
 declare <4 x i32> @llvm.x86.xop.vpcomd(<4 x i32>, <4 x i32>, i8) nounwind readnone
 
 define <2 x i64> @commute_fold_vpcomq(<2 x i64>* %a0, <2 x i64> %a1) {
-  ;CHECK-LABEL: commute_fold_vpcomq
-  ;CHECK:       vpcomltq (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpcomq:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpcomltq (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpcomq:
+; X64:       # BB#0:
+; X64-NEXT:    vpcomltq (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <2 x i64>, <2 x i64>* %a0
   %2 = call <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64> %1, <2 x i64> %a1, i8 2) ; vpcomgtq
   ret <2 x i64> %2
@@ -28,8 +54,16 @@ define <2 x i64> @commute_fold_vpcomq(<2 x i64>* %a0, <2 x i64> %a1) {
 declare <2 x i64> @llvm.x86.xop.vpcomq(<2 x i64>, <2 x i64>, i8) nounwind readnone
 
 define <16 x i8> @commute_fold_vpcomub(<16 x i8>* %a0, <16 x i8> %a1) {
-  ;CHECK-LABEL: commute_fold_vpcomub
-  ;CHECK:       vpcomleub (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpcomub:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpcomleub (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpcomub:
+; X64:       # BB#0:
+; X64-NEXT:    vpcomleub (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <16 x i8>, <16 x i8>* %a0
   %2 = call <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8> %1, <16 x i8> %a1, i8 3) ; vpcomgeub
   ret <16 x i8> %2
@@ -37,8 +71,16 @@ define <16 x i8> @commute_fold_vpcomub(<16 x i8>* %a0, <16 x i8> %a1) {
 declare <16 x i8> @llvm.x86.xop.vpcomub(<16 x i8>, <16 x i8>, i8) nounwind readnone
 
 define <4 x i32> @commute_fold_vpcomud(<4 x i32>* %a0, <4 x i32> %a1) {
-  ;CHECK-LABEL: commute_fold_vpcomud
-  ;CHECK:       vpcomequd (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpcomud:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpcomequd (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpcomud:
+; X64:       # BB#0:
+; X64-NEXT:    vpcomequd (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <4 x i32>, <4 x i32>* %a0
   %2 = call <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32> %1, <4 x i32> %a1, i8 4) ; vpcomequd
   ret <4 x i32> %2
@@ -46,8 +88,16 @@ define <4 x i32> @commute_fold_vpcomud(<4 x i32>* %a0, <4 x i32> %a1) {
 declare <4 x i32> @llvm.x86.xop.vpcomud(<4 x i32>, <4 x i32>, i8) nounwind readnone
 
 define <2 x i64> @commute_fold_vpcomuq(<2 x i64>* %a0, <2 x i64> %a1) {
-  ;CHECK-LABEL: commute_fold_vpcomuq
-  ;CHECK:       vpcomnequq (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpcomuq:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpcomnequq (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpcomuq:
+; X64:       # BB#0:
+; X64-NEXT:    vpcomnequq (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <2 x i64>, <2 x i64>* %a0
   %2 = call <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64> %1, <2 x i64> %a1, i8 5) ; vpcomnequq
   ret <2 x i64> %2
@@ -55,8 +105,16 @@ define <2 x i64> @commute_fold_vpcomuq(<2 x i64>* %a0, <2 x i64> %a1) {
 declare <2 x i64> @llvm.x86.xop.vpcomuq(<2 x i64>, <2 x i64>, i8) nounwind readnone
 
 define <8 x i16> @commute_fold_vpcomuw(<8 x i16>* %a0, <8 x i16> %a1) {
-  ;CHECK-LABEL: commute_fold_vpcomuw
-  ;CHECK:       vpcomfalseuw (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpcomuw:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpcomfalseuw (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpcomuw:
+; X64:       # BB#0:
+; X64-NEXT:    vpcomfalseuw (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <8 x i16>, <8 x i16>* %a0
   %2 = call <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16> %1, <8 x i16> %a1, i8 6) ; vpcomfalseuw
   ret <8 x i16> %2
@@ -64,8 +122,16 @@ define <8 x i16> @commute_fold_vpcomuw(<8 x i16>* %a0, <8 x i16> %a1) {
 declare <8 x i16> @llvm.x86.xop.vpcomuw(<8 x i16>, <8 x i16>, i8) nounwind readnone
 
 define <8 x i16> @commute_fold_vpcomw(<8 x i16>* %a0, <8 x i16> %a1) {
-  ;CHECK-LABEL: commute_fold_vpcomw
-  ;CHECK:       vpcomtruew (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpcomw:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpcomtruew (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpcomw:
+; X64:       # BB#0:
+; X64-NEXT:    vpcomtruew (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <8 x i16>, <8 x i16>* %a0
   %2 = call <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16> %1, <8 x i16> %a1, i8 7) ; vpcomtruew
   ret <8 x i16> %2
@@ -73,8 +139,16 @@ define <8 x i16> @commute_fold_vpcomw(<8 x i16>* %a0, <8 x i16> %a1) {
 declare <8 x i16> @llvm.x86.xop.vpcomw(<8 x i16>, <8 x i16>, i8) nounwind readnone
 
 define <4 x i32> @commute_fold_vpmacsdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32> %a2) {
-  ;CHECK-LABEL: commute_fold_vpmacsdd
-  ;CHECK:       vpmacsdd %xmm1, (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpmacsdd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpmacsdd %xmm1, (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpmacsdd:
+; X64:       # BB#0:
+; X64-NEXT:    vpmacsdd %xmm1, (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <4 x i32>, <4 x i32>* %a0
   %2 = call <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32> %1, <4 x i32> %a1, <4 x i32> %a2)
   ret <4 x i32> %2
@@ -82,8 +156,16 @@ define <4 x i32> @commute_fold_vpmacsdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32>
 declare <4 x i32> @llvm.x86.xop.vpmacsdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @commute_fold_vpmacsdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
-  ;CHECK-LABEL: commute_fold_vpmacsdqh
-  ;CHECK:       vpmacsdqh %xmm1, (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpmacsdqh:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpmacsdqh %xmm1, (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpmacsdqh:
+; X64:       # BB#0:
+; X64-NEXT:    vpmacsdqh %xmm1, (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <4 x i32>, <4 x i32>* %a0
   %2 = call <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
   ret <2 x i64> %2
@@ -91,8 +173,16 @@ define <2 x i64> @commute_fold_vpmacsdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64
 declare <2 x i64> @llvm.x86.xop.vpmacsdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
 
 define <2 x i64> @commute_fold_vpmacsdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
-  ;CHECK-LABEL: commute_fold_vpmacsdql
-  ;CHECK:       vpmacsdql %xmm1, (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpmacsdql:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpmacsdql %xmm1, (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpmacsdql:
+; X64:       # BB#0:
+; X64-NEXT:    vpmacsdql %xmm1, (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <4 x i32>, <4 x i32>* %a0
   %2 = call <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
   ret <2 x i64> %2
@@ -100,8 +190,16 @@ define <2 x i64> @commute_fold_vpmacsdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64
 declare <2 x i64> @llvm.x86.xop.vpmacsdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
 
 define <4 x i32> @commute_fold_vpmacssdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32> %a2) {
-  ;CHECK-LABEL: commute_fold_vpmacssdd
-  ;CHECK:       vpmacssdd %xmm1, (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpmacssdd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpmacssdd %xmm1, (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpmacssdd:
+; X64:       # BB#0:
+; X64-NEXT:    vpmacssdd %xmm1, (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <4 x i32>, <4 x i32>* %a0
   %2 = call <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32> %1, <4 x i32> %a1, <4 x i32> %a2)
   ret <4 x i32> %2
@@ -109,8 +207,16 @@ define <4 x i32> @commute_fold_vpmacssdd(<4 x i32>* %a0, <4 x i32> %a1, <4 x i32
 declare <4 x i32> @llvm.x86.xop.vpmacssdd(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
 
 define <2 x i64> @commute_fold_vpmacssdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
-  ;CHECK-LABEL: commute_fold_vpmacssdqh
-  ;CHECK:       vpmacssdqh %xmm1, (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpmacssdqh:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpmacssdqh %xmm1, (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpmacssdqh:
+; X64:       # BB#0:
+; X64-NEXT:    vpmacssdqh %xmm1, (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <4 x i32>, <4 x i32>* %a0
   %2 = call <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
   ret <2 x i64> %2
@@ -118,8 +224,16 @@ define <2 x i64> @commute_fold_vpmacssdqh(<4 x i32>* %a0, <4 x i32> %a1, <2 x i6
 declare <2 x i64> @llvm.x86.xop.vpmacssdqh(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
 
 define <2 x i64> @commute_fold_vpmacssdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i64> %a2) {
-  ;CHECK-LABEL: commute_fold_vpmacssdql
-  ;CHECK:       vpmacssdql %xmm1, (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpmacssdql:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpmacssdql %xmm1, (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpmacssdql:
+; X64:       # BB#0:
+; X64-NEXT:    vpmacssdql %xmm1, (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <4 x i32>, <4 x i32>* %a0
   %2 = call <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32> %1, <4 x i32> %a1, <2 x i64> %a2)
   ret <2 x i64> %2
@@ -127,8 +241,16 @@ define <2 x i64> @commute_fold_vpmacssdql(<4 x i32>* %a0, <4 x i32> %a1, <2 x i6
 declare <2 x i64> @llvm.x86.xop.vpmacssdql(<4 x i32>, <4 x i32>, <2 x i64>) nounwind readnone
 
 define <4 x i32> @commute_fold_vpmacsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
-  ;CHECK-LABEL: commute_fold_vpmacsswd
-  ;CHECK:       vpmacsswd %xmm1, (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpmacsswd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpmacsswd %xmm1, (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpmacsswd:
+; X64:       # BB#0:
+; X64-NEXT:    vpmacsswd %xmm1, (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <8 x i16>, <8 x i16>* %a0
   %2 = call <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
   ret <4 x i32> %2
@@ -136,8 +258,16 @@ define <4 x i32> @commute_fold_vpmacsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32
 declare <4 x i32> @llvm.x86.xop.vpmacsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
 
 define <8 x i16> @commute_fold_vpmacssww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16> %a2) {
-  ;CHECK-LABEL: commute_fold_vpmacssww
-  ;CHECK:       vpmacssww %xmm1, (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpmacssww:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpmacssww %xmm1, (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpmacssww:
+; X64:       # BB#0:
+; X64-NEXT:    vpmacssww %xmm1, (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <8 x i16>, <8 x i16>* %a0
   %2 = call <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16> %1, <8 x i16> %a1, <8 x i16> %a2)
   ret <8 x i16> %2
@@ -145,8 +275,16 @@ define <8 x i16> @commute_fold_vpmacssww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16
 declare <8 x i16> @llvm.x86.xop.vpmacssww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
 
 define <4 x i32> @commute_fold_vpmacswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
-  ;CHECK-LABEL: commute_fold_vpmacswd
-  ;CHECK:       vpmacswd %xmm1, (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpmacswd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpmacswd %xmm1, (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpmacswd:
+; X64:       # BB#0:
+; X64-NEXT:    vpmacswd %xmm1, (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <8 x i16>, <8 x i16>* %a0
   %2 = call <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
   ret <4 x i32> %2
@@ -154,8 +292,16 @@ define <4 x i32> @commute_fold_vpmacswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32>
 declare <4 x i32> @llvm.x86.xop.vpmacswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
 
 define <8 x i16> @commute_fold_vpmacsww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16> %a2) {
-  ;CHECK-LABEL: commute_fold_vpmacsww
-  ;CHECK:       vpmacsww %xmm1, (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpmacsww:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpmacsww %xmm1, (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpmacsww:
+; X64:       # BB#0:
+; X64-NEXT:    vpmacsww %xmm1, (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <8 x i16>, <8 x i16>* %a0
   %2 = call <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16> %1, <8 x i16> %a1, <8 x i16> %a2)
   ret <8 x i16> %2
@@ -163,8 +309,16 @@ define <8 x i16> @commute_fold_vpmacsww(<8 x i16>* %a0, <8 x i16> %a1, <8 x i16>
 declare <8 x i16> @llvm.x86.xop.vpmacsww(<8 x i16>, <8 x i16>, <8 x i16>) nounwind readnone
 
 define <4 x i32> @commute_fold_vpmadcsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
-  ;CHECK-LABEL: commute_fold_vpmadcsswd
-  ;CHECK:       vpmadcsswd %xmm1, (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpmadcsswd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpmadcsswd %xmm1, (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpmadcsswd:
+; X64:       # BB#0:
+; X64-NEXT:    vpmadcsswd %xmm1, (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <8 x i16>, <8 x i16>* %a0
   %2 = call <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
   ret <4 x i32> %2
@@ -172,13 +326,18 @@ define <4 x i32> @commute_fold_vpmadcsswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i3
 declare <4 x i32> @llvm.x86.xop.vpmadcsswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
 
 define <4 x i32> @commute_fold_vpmadcswd(<8 x i16>* %a0, <8 x i16> %a1, <4 x i32> %a2) {
-  ;CHECK-LABEL: commute_fold_vpmadcswd
-  ;CHECK:       vpmadcswd %xmm1, (%rdi), %xmm0, %xmm0
+; X32-LABEL: commute_fold_vpmadcswd:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vpmadcswd %xmm1, (%eax), %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: commute_fold_vpmadcswd:
+; X64:       # BB#0:
+; X64-NEXT:    vpmadcswd %xmm1, (%rdi), %xmm0, %xmm0
+; X64-NEXT:    retq
   %1 = load <8 x i16>, <8 x i16>* %a0
   %2 = call <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16> %1, <8 x i16> %a1, <4 x i32> %a2)
   ret <4 x i32> %2
 }
 declare <4 x i32> @llvm.x86.xop.vpmadcswd(<8 x i16>, <8 x i16>, <4 x i32>) nounwind readnone
-
-
-
diff --git a/test/CodeGen/X86/compare-global.ll b/test/CodeGen/X86/compare-global.ll
index 8e3d3a93a5644f2b4017449076e379d4cd09f9fc..747595c1a89c3f4443889260c1605f516a9849df 100644
--- a/test/CodeGen/X86/compare-global.ll
+++ b/test/CodeGen/X86/compare-global.ll
@@ -7,7 +7,7 @@ target triple = "i686-pc-windows-msvc18.0.0"
 
 define void @f(i8* %c) {
 entry:
-  ; CHECK: subl $_foo, %eax
+  ; CHECK: cmpl $_foo, 4(%esp)
   %cmp = icmp eq i8* %c, @foo
   br i1 %cmp, label %if.then, label %if.end
 
diff --git a/test/CodeGen/X86/complex-fastmath.ll b/test/CodeGen/X86/complex-fastmath.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d31707260a0a3094785b0370ba9a1a2e33153591
--- /dev/null
+++ b/test/CodeGen/X86/complex-fastmath.ll
@@ -0,0 +1,215 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fma | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=FMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=FMA
+
+; PR31866
+; complex float complex_square_f32(complex float x) {
+;   return x*x;
+; }
+
+define <2 x float> @complex_square_f32(<2 x float>) #0 {
+; SSE-LABEL: complex_square_f32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; SSE-NEXT:    movaps %xmm0, %xmm2
+; SSE-NEXT:    addss %xmm2, %xmm2
+; SSE-NEXT:    mulss %xmm1, %xmm2
+; SSE-NEXT:    mulss %xmm0, %xmm0
+; SSE-NEXT:    mulss %xmm1, %xmm1
+; SSE-NEXT:    subss %xmm1, %xmm0
+; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: complex_square_f32:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vaddss %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vmulss %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vmulss %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3]
+; AVX1-NEXT:    retq
+;
+; FMA-LABEL: complex_square_f32:
+; FMA:       # BB#0:
+; FMA-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; FMA-NEXT:    vaddss %xmm0, %xmm0, %xmm2
+; FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm2
+; FMA-NEXT:    vmulss %xmm1, %xmm1, %xmm1
+; FMA-NEXT:    vfmsub231ss %xmm0, %xmm0, %xmm1
+; FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm2[0],xmm1[2,3]
+; FMA-NEXT:    retq
+  %2 = extractelement <2 x float> %0, i32 0
+  %3 = extractelement <2 x float> %0, i32 1
+  %4 = fmul fast float %3, 2.000000e+00
+  %5 = fmul fast float %4, %2
+  %6 = fmul fast float %2, %2
+  %7 = fmul fast float %3, %3
+  %8 = fsub fast float %6, %7
+  %9 = insertelement <2 x float> undef, float %8, i32 0
+  %10 = insertelement <2 x float> %9, float %5, i32 1
+  ret <2 x float> %10
+}
+
+define <2 x double> @complex_square_f64(<2 x double>) #0 {
+; SSE-LABEL: complex_square_f64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps %xmm0, %xmm1
+; SSE-NEXT:    movhlps {{.*#+}} xmm1 = xmm1[1,1]
+; SSE-NEXT:    movaps %xmm0, %xmm2
+; SSE-NEXT:    addsd %xmm2, %xmm2
+; SSE-NEXT:    mulsd %xmm1, %xmm2
+; SSE-NEXT:    mulsd %xmm0, %xmm0
+; SSE-NEXT:    mulsd %xmm1, %xmm1
+; SSE-NEXT:    subsd %xmm1, %xmm0
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: complex_square_f64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; AVX1-NEXT:    vaddsd %xmm0, %xmm0, %xmm2
+; AVX1-NEXT:    vmulsd %xmm2, %xmm1, %xmm2
+; AVX1-NEXT:    vmulsd %xmm0, %xmm0, %xmm0
+; AVX1-NEXT:    vmulsd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; AVX1-NEXT:    retq
+;
+; FMA-LABEL: complex_square_f64:
+; FMA:       # BB#0:
+; FMA-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
+; FMA-NEXT:    vaddsd %xmm0, %xmm0, %xmm2
+; FMA-NEXT:    vmulsd %xmm2, %xmm1, %xmm2
+; FMA-NEXT:    vmulsd %xmm1, %xmm1, %xmm1
+; FMA-NEXT:    vfmsub231sd %xmm0, %xmm0, %xmm1
+; FMA-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm2[0]
+; FMA-NEXT:    retq
+  %2 = extractelement <2 x double> %0, i32 0
+  %3 = extractelement <2 x double> %0, i32 1
+  %4 = fmul fast double %3, 2.000000e+00
+  %5 = fmul fast double %4, %2
+  %6 = fmul fast double %2, %2
+  %7 = fmul fast double %3, %3
+  %8 = fsub fast double %6, %7
+  %9 = insertelement <2 x double> undef, double %8, i32 0
+  %10 = insertelement <2 x double> %9, double %5, i32 1
+  ret <2 x double> %10
+}
+
+; complex float complex_mul_f32(complex float x, complex float y) {
+;   return x*y;
+; }
+
+define <2 x float> @complex_mul_f32(<2 x float>, <2 x float>) #0 {
+; SSE-LABEL: complex_mul_f32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; SSE-NEXT:    movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; SSE-NEXT:    movaps %xmm3, %xmm4
+; SSE-NEXT:    mulss %xmm0, %xmm4
+; SSE-NEXT:    mulss %xmm1, %xmm0
+; SSE-NEXT:    mulss %xmm2, %xmm1
+; SSE-NEXT:    addss %xmm4, %xmm1
+; SSE-NEXT:    mulss %xmm2, %xmm3
+; SSE-NEXT:    subss %xmm3, %xmm0
+; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: complex_mul_f32:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX1-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; AVX1-NEXT:    vmulss %xmm0, %xmm3, %xmm4
+; AVX1-NEXT:    vmulss %xmm2, %xmm1, %xmm5
+; AVX1-NEXT:    vaddss %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vmulss %xmm2, %xmm3, %xmm1
+; AVX1-NEXT:    vsubss %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[2,3]
+; AVX1-NEXT:    retq
+;
+; FMA-LABEL: complex_mul_f32:
+; FMA:       # BB#0:
+; FMA-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; FMA-NEXT:    vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3]
+; FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm4
+; FMA-NEXT:    vfmadd231ss %xmm0, %xmm3, %xmm4
+; FMA-NEXT:    vmulss %xmm2, %xmm3, %xmm2
+; FMA-NEXT:    vfmsub231ss %xmm0, %xmm1, %xmm2
+; FMA-NEXT:    vinsertps {{.*#+}} xmm0 = xmm2[0],xmm4[0],xmm2[2,3]
+; FMA-NEXT:    retq
+  %3 = extractelement <2 x float> %0, i32 0
+  %4 = extractelement <2 x float> %0, i32 1
+  %5 = extractelement <2 x float> %1, i32 0
+  %6 = extractelement <2 x float> %1, i32 1
+  %7 = fmul fast float %6, %3
+  %8 = fmul fast float %5, %4
+  %9 = fadd fast float %7, %8
+  %10 = fmul fast float %5, %3
+  %11 = fmul fast float %6, %4
+  %12 = fsub fast float %10, %11
+  %13 = insertelement <2 x float> undef, float %12, i32 0
+  %14 = insertelement <2 x float> %13, float %9, i32 1
+  ret <2 x float> %14
+}
+
+define <2 x double> @complex_mul_f64(<2 x double>, <2 x double>) #0 {
+; SSE-LABEL: complex_mul_f64:
+; SSE:       # BB#0:
+; SSE-NEXT:    movaps %xmm0, %xmm2
+; SSE-NEXT:    movhlps {{.*#+}} xmm2 = xmm2[1,1]
+; SSE-NEXT:    movaps %xmm1, %xmm3
+; SSE-NEXT:    movhlps {{.*#+}} xmm3 = xmm3[1,1]
+; SSE-NEXT:    movaps %xmm3, %xmm4
+; SSE-NEXT:    mulsd %xmm0, %xmm4
+; SSE-NEXT:    mulsd %xmm1, %xmm0
+; SSE-NEXT:    mulsd %xmm2, %xmm1
+; SSE-NEXT:    addsd %xmm4, %xmm1
+; SSE-NEXT:    mulsd %xmm2, %xmm3
+; SSE-NEXT:    subsd %xmm3, %xmm0
+; SSE-NEXT:    unpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: complex_mul_f64:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
+; AVX1-NEXT:    vmulsd %xmm0, %xmm3, %xmm4
+; AVX1-NEXT:    vmulsd %xmm2, %xmm1, %xmm5
+; AVX1-NEXT:    vaddsd %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vmulsd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT:    vmulsd %xmm2, %xmm3, %xmm1
+; AVX1-NEXT:    vsubsd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; AVX1-NEXT:    retq
+;
+; FMA-LABEL: complex_mul_f64:
+; FMA:       # BB#0:
+; FMA-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
+; FMA-NEXT:    vpermilpd {{.*#+}} xmm3 = xmm1[1,0]
+; FMA-NEXT:    vmulsd %xmm2, %xmm1, %xmm4
+; FMA-NEXT:    vfmadd231sd %xmm0, %xmm3, %xmm4
+; FMA-NEXT:    vmulsd %xmm2, %xmm3, %xmm2
+; FMA-NEXT:    vfmsub231sd %xmm0, %xmm1, %xmm2
+; FMA-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm2[0],xmm4[0]
+; FMA-NEXT:    retq
+  %3 = extractelement <2 x double> %0, i32 0
+  %4 = extractelement <2 x double> %0, i32 1
+  %5 = extractelement <2 x double> %1, i32 0
+  %6 = extractelement <2 x double> %1, i32 1
+  %7 = fmul fast double %6, %3
+  %8 = fmul fast double %5, %4
+  %9 = fadd fast double %7, %8
+  %10 = fmul fast double %5, %3
+  %11 = fmul fast double %6, %4
+  %12 = fsub fast double %10, %11
+  %13 = insertelement <2 x double> undef, double %12, i32 0
+  %14 = insertelement <2 x double> %13, double %9, i32 1
+  ret <2 x double> %14
+}
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "unsafe-fp-math"="true"  }
diff --git a/test/CodeGen/X86/compress_expand.ll b/test/CodeGen/X86/compress_expand.ll
index c1a3a1b92bbc51293562c69fc60bcbf4fddf2ec5..e09fcf2a336e9dd01dd1d78701df853a592c4a30 100644
--- a/test/CodeGen/X86/compress_expand.ll
+++ b/test/CodeGen/X86/compress_expand.ll
@@ -8,23 +8,37 @@ target triple = "x86_64-unknown-linux-gnu"
 
 
 define <16 x float> @test1(float* %base) {
-; ALL-LABEL: test1:
-; ALL:       # BB#0:
-; ALL-NEXT:    movw $-2049, %ax # imm = 0xF7FF
-; ALL-NEXT:    kmovw %eax, %k1
-; ALL-NEXT:    vexpandps (%rdi), %zmm0 {%k1} {z}
-; ALL-NEXT:    retq
+; SKX-LABEL: test1:
+; SKX:       # BB#0:
+; SKX-NEXT:    movw $-2049, %ax # imm = 0xF7FF
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vexpandps (%rdi), %zmm0 {%k1} {z}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test1:
+; KNL:       # BB#0:
+; KNL-NEXT:    movw $-2049, %ax # imm = 0xF7FF
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vexpandps (%rdi), %zmm0 {%k1} {z}
+; KNL-NEXT:    retq
   %res = call <16 x float> @llvm.masked.expandload.v16f32(float* %base, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true>, <16 x float> undef)
   ret <16 x float>%res
 }
 
 define <16 x float> @test2(float* %base, <16 x float> %src0) {
-; ALL-LABEL: test2:
-; ALL:       # BB#0:
-; ALL-NEXT:    movw $30719, %ax # imm = 0x77FF
-; ALL-NEXT:    kmovw %eax, %k1
-; ALL-NEXT:    vexpandps (%rdi), %zmm0 {%k1}
-; ALL-NEXT:    retq
+; SKX-LABEL: test2:
+; SKX:       # BB#0:
+; SKX-NEXT:    movw $30719, %ax # imm = 0x77FF
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vexpandps (%rdi), %zmm0 {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test2:
+; KNL:       # BB#0:
+; KNL-NEXT:    movw $30719, %ax # imm = 0x77FF
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vexpandps (%rdi), %zmm0 {%k1}
+; KNL-NEXT:    retq
   %res = call <16 x float> @llvm.masked.expandload.v16f32(float* %base, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <16 x float> %src0)
   ret <16 x float>%res
 }
@@ -52,7 +66,7 @@ define <4 x float> @test4(float* %base, <4 x float> %src0) {
 ; SKX-LABEL: test4:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    movb $7, %al
-; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vexpandps (%rdi), %xmm0 {%k1}
 ; SKX-NEXT:    retq
 ;
@@ -72,7 +86,7 @@ define <2 x i64> @test5(i64* %base, <2 x i64> %src0) {
 ; SKX-LABEL: test5:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    movb $2, %al
-; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vpexpandq (%rdi), %xmm0 {%k1}
 ; SKX-NEXT:    retq
 ;
@@ -94,12 +108,20 @@ declare <4 x float>  @llvm.masked.expandload.v4f32(float*, <4 x i1>, <4 x float>
 declare <2 x i64>    @llvm.masked.expandload.v2i64(i64*, <2 x i1>, <2 x i64>)
 
 define void @test6(float* %base, <16 x float> %V) {
-; ALL-LABEL: test6:
-; ALL:       # BB#0:
-; ALL-NEXT:    movw $-2049, %ax # imm = 0xF7FF
-; ALL-NEXT:    kmovw %eax, %k1
-; ALL-NEXT:    vcompressps %zmm0, (%rdi) {%k1}
-; ALL-NEXT:    retq
+; SKX-LABEL: test6:
+; SKX:       # BB#0:
+; SKX-NEXT:    movw $-2049, %ax # imm = 0xF7FF
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vcompressps %zmm0, (%rdi) {%k1}
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test6:
+; KNL:       # BB#0:
+; KNL-NEXT:    movw $-2049, %ax # imm = 0xF7FF
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    vcompressps %zmm0, (%rdi) {%k1}
+; KNL-NEXT:    retq
   call void @llvm.masked.compressstore.v16f32(<16 x float> %V, float* %base, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 false, i1 true, i1 true, i1 true, i1 true>)
   ret void
 }
@@ -110,6 +132,7 @@ define void @test7(float* %base, <8 x float> %V, <8 x i1> %mask) {
 ; SKX-NEXT:    vpsllw $15, %xmm1, %xmm1
 ; SKX-NEXT:    vpmovw2m %xmm1, %k1
 ; SKX-NEXT:    vcompressps %ymm0, (%rdi) {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
 ; KNL-LABEL: test7:
@@ -132,6 +155,7 @@ define void @test8(double* %base, <8 x double> %V, <8 x i1> %mask) {
 ; SKX-NEXT:    vpsllw $15, %xmm1, %xmm1
 ; SKX-NEXT:    vpmovw2m %xmm1, %k1
 ; SKX-NEXT:    vcompresspd %zmm0, (%rdi) {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
 ; KNL-LABEL: test8:
@@ -151,6 +175,7 @@ define void @test9(i64* %base, <8 x i64> %V, <8 x i1> %mask) {
 ; SKX-NEXT:    vpsllw $15, %xmm1, %xmm1
 ; SKX-NEXT:    vpmovw2m %xmm1, %k1
 ; SKX-NEXT:    vpcompressq %zmm0, (%rdi) {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
 ; KNL-LABEL: test9:
@@ -170,6 +195,7 @@ define void @test10(i64* %base, <4 x i64> %V, <4 x i1> %mask) {
 ; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
 ; SKX-NEXT:    vptestmd %xmm1, %xmm1, %k1
 ; SKX-NEXT:    vpcompressq %ymm0, (%rdi) {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
 ; KNL-LABEL: test10:
@@ -200,8 +226,7 @@ define void @test11(i64* %base, <2 x i64> %V, <2 x i1> %mask) {
 ; KNL:       # BB#0:
 ; KNL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; KNL-NEXT:    vpsllq $63, %xmm1, %xmm1
-; KNL-NEXT:    vpsrad $31, %xmm1, %xmm1
-; KNL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; KNL-NEXT:    vpsraq $63, %zmm1, %zmm1
 ; KNL-NEXT:    vpxord %zmm2, %zmm2, %zmm2
 ; KNL-NEXT:    vinserti32x4 $0, %xmm1, %zmm2, %zmm1
 ; KNL-NEXT:    vpsllq $63, %zmm1, %zmm1
@@ -341,16 +366,28 @@ define <16 x double> @test16(double* %base, <16 x double> %src0, <16 x i32> %tri
 }
 
 define void @test17(float* %base, <32 x float> %V, <32 x i32> %trigger) {
-; ALL-LABEL: test17:
-; ALL:       # BB#0:
-; ALL-NEXT:    vpxord %zmm4, %zmm4, %zmm4
-; ALL-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
-; ALL-NEXT:    vpcmpeqd %zmm4, %zmm2, %k2
-; ALL-NEXT:    kmovw %k2, %eax
-; ALL-NEXT:    popcntl %eax, %eax
-; ALL-NEXT:    vcompressps %zmm1, (%rdi,%rax,4) {%k1}
-; ALL-NEXT:    vcompressps %zmm0, (%rdi) {%k2}
-; ALL-NEXT:    retq
+; SKX-LABEL: test17:
+; SKX:       # BB#0:
+; SKX-NEXT:    vpxord %zmm4, %zmm4, %zmm4
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; SKX-NEXT:    vpcmpeqd %zmm4, %zmm2, %k2
+; SKX-NEXT:    kmovw %k2, %eax
+; SKX-NEXT:    popcntl %eax, %eax
+; SKX-NEXT:    vcompressps %zmm1, (%rdi,%rax,4) {%k1}
+; SKX-NEXT:    vcompressps %zmm0, (%rdi) {%k2}
+; SKX-NEXT:    vzeroupper
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test17:
+; KNL:       # BB#0:
+; KNL-NEXT:    vpxord %zmm4, %zmm4, %zmm4
+; KNL-NEXT:    vpcmpeqd %zmm4, %zmm3, %k1
+; KNL-NEXT:    vpcmpeqd %zmm4, %zmm2, %k2
+; KNL-NEXT:    kmovw %k2, %eax
+; KNL-NEXT:    popcntl %eax, %eax
+; KNL-NEXT:    vcompressps %zmm1, (%rdi,%rax,4) {%k1}
+; KNL-NEXT:    vcompressps %zmm0, (%rdi) {%k2}
+; KNL-NEXT:    retq
   %mask = icmp eq <32 x i32> %trigger, zeroinitializer
   call void @llvm.masked.compressstore.v32f32(<32 x float> %V, float* %base, <32 x i1> %mask)
   ret void
@@ -366,6 +403,7 @@ define void @test18(double* %base, <16 x double> %V, <16 x i1> %mask) {
 ; SKX-NEXT:    popcntl %eax, %eax
 ; SKX-NEXT:    vcompresspd %zmm1, (%rdi,%rax,8) {%k2}
 ; SKX-NEXT:    vcompresspd %zmm0, (%rdi) {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
 ; KNL-LABEL: test18:
diff --git a/test/CodeGen/X86/conditional-indecrement.ll b/test/CodeGen/X86/conditional-indecrement.ll
index c3e71180bb1830e730dbaa8c754b97f826b881a0..f9e18f6269727ca1f6f01da58e3b39db0e7e6ba7 100644
--- a/test/CodeGen/X86/conditional-indecrement.ll
+++ b/test/CodeGen/X86/conditional-indecrement.ll
@@ -1,89 +1,119 @@
-; RUN: llc -march=x86 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
 
 define i32 @test1(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    sbbl $-1, %esi
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
   %not.cmp = icmp ne i32 %a, 0
   %inc = zext i1 %not.cmp to i32
   %retval.0 = add i32 %inc, %b
   ret i32 %retval.0
-; CHECK-LABEL: test1:
-; CHECK: cmpl $1
-; CHECK: sbbl $-1
-; CHECK: ret
+}
+
+define i32 @test1_commute(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: test1_commute:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    sbbl $-1, %esi
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
+  %cmp = icmp ne i32 %a, 0
+  %inc = zext i1 %cmp to i32
+  %ret = add i32 %b, %inc
+  ret i32 %ret
 }
 
 define i32 @test2(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    adcl $0, %esi
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
   %cmp = icmp eq i32 %a, 0
   %inc = zext i1 %cmp to i32
   %retval.0 = add i32 %inc, %b
   ret i32 %retval.0
-; CHECK-LABEL: test2:
-; CHECK: cmpl $1
-; CHECK: adcl $0
-; CHECK: ret
 }
 
 define i32 @test3(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: test3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    adcl $0, %esi
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
   %cmp = icmp eq i32 %a, 0
   %inc = zext i1 %cmp to i32
   %retval.0 = add i32 %inc, %b
   ret i32 %retval.0
-; CHECK-LABEL: test3:
-; CHECK: cmpl $1
-; CHECK: adcl $0
-; CHECK: ret
 }
 
 define i32 @test4(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: test4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    sbbl $-1, %esi
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
   %not.cmp = icmp ne i32 %a, 0
   %inc = zext i1 %not.cmp to i32
   %retval.0 = add i32 %inc, %b
   ret i32 %retval.0
-; CHECK-LABEL: test4:
-; CHECK: cmpl $1
-; CHECK: sbbl $-1
-; CHECK: ret
 }
 
 define i32 @test5(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: test5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    adcl $-1, %esi
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
   %not.cmp = icmp ne i32 %a, 0
   %inc = zext i1 %not.cmp to i32
   %retval.0 = sub i32 %b, %inc
   ret i32 %retval.0
-; CHECK-LABEL: test5:
-; CHECK: cmpl $1
-; CHECK: adcl $-1
-; CHECK: ret
 }
 
 define i32 @test6(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: test6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    sbbl $0, %esi
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
   %cmp = icmp eq i32 %a, 0
   %inc = zext i1 %cmp to i32
   %retval.0 = sub i32 %b, %inc
   ret i32 %retval.0
-; CHECK-LABEL: test6:
-; CHECK: cmpl $1
-; CHECK: sbbl $0
-; CHECK: ret
 }
 
 define i32 @test7(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: test7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    sbbl $0, %esi
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
   %cmp = icmp eq i32 %a, 0
   %inc = zext i1 %cmp to i32
   %retval.0 = sub i32 %b, %inc
   ret i32 %retval.0
-; CHECK-LABEL: test7:
-; CHECK: cmpl $1
-; CHECK: sbbl $0
-; CHECK: ret
 }
 
 define i32 @test8(i32 %a, i32 %b) nounwind readnone {
+; CHECK-LABEL: test8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpl $1, %edi
+; CHECK-NEXT:    adcl $-1, %esi
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
   %not.cmp = icmp ne i32 %a, 0
   %inc = zext i1 %not.cmp to i32
   %retval.0 = sub i32 %b, %inc
   ret i32 %retval.0
-; CHECK-LABEL: test8:
-; CHECK: cmpl $1
-; CHECK: adcl $-1
-; CHECK: ret
 }
diff --git a/test/CodeGen/X86/conditional-tailcall.ll b/test/CodeGen/X86/conditional-tailcall.ll
index 502643d9a91766a01a3a87412a1f8da64d436f53..c00ce75b26decc5e10e553afe3c09b6c43a90e9f 100644
--- a/test/CodeGen/X86/conditional-tailcall.ll
+++ b/test/CodeGen/X86/conditional-tailcall.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -mtriple=i686-linux -show-mc-encoding | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-linux -show-mc-encoding | FileCheck %s
+; RUN: llc < %s -mtriple=i686-linux   -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK32
+; RUN: llc < %s -mtriple=x86_64-linux -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK64
+; RUN: llc < %s -mtriple=x86_64-win32 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=WIN64
 
 declare void @foo()
 declare void @bar()
@@ -23,6 +24,28 @@ bb2:
 ; CHECK: jmp foo
 }
 
+define void @f_non_leaf(i32 %x, i32 %y) optsize {
+entry:
+  ; Force %ebx to be spilled on the stack, so that this is no longer
+  ; a "leaf" function on Win64.
+  tail call void asm sideeffect "", "~{ebx}"()
+
+	%p = icmp eq i32 %x, %y
+  br i1 %p, label %bb1, label %bb2
+bb1:
+  tail call void @foo()
+  ret void
+bb2:
+  tail call void @bar()
+  ret void
+
+; CHECK-LABEL: f_non_leaf:
+; WIN64-NOT: je foo
+; WIN64-NOT: jne bar
+; WIN64: jne
+; WIN64: jmp foo
+; WIN64: jmp bar
+}
 
 declare x86_thiscallcc zeroext i1 @baz(i8*, i32)
 define x86_thiscallcc zeroext i1 @BlockPlacementTest(i8* %this, i32 %x) optsize {
@@ -51,3 +74,90 @@ land.end:
 ; CHECK-NOT: xor
 ; CHECK: ret
 }
+
+
+
+%"class.std::basic_string" = type { %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider" }
+%"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider" = type { i8* }
+declare zeroext i1 @_Z20isValidIntegerSuffixN9__gnu_cxx17__normal_iteratorIPKcSsEES3_(i8*, i8*)
+
+define zeroext i1 @pr31257(%"class.std::basic_string"* nocapture readonly dereferenceable(8) %s) minsize {
+; CHECK-LABEL: pr31257
+entry:
+  %_M_p.i.i = getelementptr inbounds %"class.std::basic_string", %"class.std::basic_string"* %s, i64 0, i32 0, i32 0
+  %0 = load i8*, i8** %_M_p.i.i, align 8
+  %arrayidx.i.i.i54 = getelementptr inbounds i8, i8* %0, i64 -24
+  %_M_length.i.i55 = bitcast i8* %arrayidx.i.i.i54 to i64*
+  %1 = load i64, i64* %_M_length.i.i55, align 8
+  %add.ptr.i56 = getelementptr inbounds i8, i8* %0, i64 %1
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %it.sroa.0.0 = phi i8* [ %0, %entry ], [ %incdec.ptr.i, %for.inc ]
+  %state.0 = phi i32 [ 0, %entry ], [ %state.1, %for.inc ]
+  %cmp.i = icmp eq i8* %it.sroa.0.0, %add.ptr.i56
+  br i1 %cmp.i, label %5, label %for.body
+
+for.body:                                         ; preds = %for.cond
+  switch i32 %state.0, label %for.inc [
+    i32 0, label %sw.bb
+    i32 1, label %sw.bb14
+    i32 2, label %sw.bb22
+  ]
+
+sw.bb:                                            ; preds = %for.body
+  %2 = load i8, i8* %it.sroa.0.0, align 1
+  switch i8 %2, label %if.else [
+    i8 43, label %for.inc
+    i8 45, label %for.inc
+  ]
+
+if.else:                                          ; preds = %sw.bb
+  %conv9 = zext i8 %2 to i32
+  %isdigittmp45 = add nsw i32 %conv9, -48
+  %isdigit46 = icmp ult i32 %isdigittmp45, 10
+  br i1 %isdigit46, label %for.inc, label %cleanup.thread.loopexit
+
+sw.bb14:                                          ; preds = %for.body
+  %3 = load i8, i8* %it.sroa.0.0, align 1
+  %conv16 = zext i8 %3 to i32
+  %isdigittmp43 = add nsw i32 %conv16, -48
+  %isdigit44 = icmp ult i32 %isdigittmp43, 10
+  br i1 %isdigit44, label %for.inc, label %cleanup.thread.loopexit
+
+sw.bb22:                                          ; preds = %for.body
+  %4 = load i8, i8* %it.sroa.0.0, align 1
+  %conv24 = zext i8 %4 to i32
+  %isdigittmp = add nsw i32 %conv24, -48
+  %isdigit = icmp ult i32 %isdigittmp, 10
+  br i1 %isdigit, label %for.inc, label %if.else28
+
+; Make sure Machine Copy Propagation doesn't delete the mov to %ecx because it
+; thinks the conditional tail call clobbers it.
+; CHECK64-LABEL: .LBB3_11:
+; CHECK64:       movzbl  (%rdi), %ecx
+; CHECK64-NEXT:  addl    $-48, %ecx
+; CHECK64-NEXT:  cmpl    $10, %ecx
+; CHECK64-NEXT:  movl    %r9d, %ecx
+; CHECK64-NEXT:  jae     _Z20isValidIntegerSuffixN9__gnu_cxx17__normal_iteratorIPKcSsEE
+
+if.else28:                                        ; preds = %sw.bb22
+  %call34 = tail call zeroext i1 @_Z20isValidIntegerSuffixN9__gnu_cxx17__normal_iteratorIPKcSsEES3_(i8* nonnull %it.sroa.0.0, i8* %add.ptr.i56)
+  br label %cleanup.thread
+
+for.inc:                                          ; preds = %sw.bb, %sw.bb, %sw.bb22, %sw.bb14, %if.else, %for.body
+  %state.1 = phi i32 [ %state.0, %for.body ], [ 1, %sw.bb ], [ 2, %if.else ], [ 2, %sw.bb14 ], [ 2, %sw.bb22 ], [ 1, %sw.bb ]
+  %incdec.ptr.i = getelementptr inbounds i8, i8* %it.sroa.0.0, i64 1
+  br label %for.cond
+
+; <label>:5:                                      ; preds = %for.cond
+  %cmp37 = icmp eq i32 %state.0, 2
+  br label %cleanup.thread
+
+cleanup.thread.loopexit:                          ; preds = %if.else, %sw.bb14
+  br label %cleanup.thread
+
+cleanup.thread:                                   ; preds = %cleanup.thread.loopexit, %if.else28, %5
+  %6 = phi i1 [ %cmp37, %5 ], [ %call34, %if.else28 ], [ false, %cleanup.thread.loopexit ]
+  ret i1 %6
+}
diff --git a/test/CodeGen/X86/copy-eflags.ll b/test/CodeGen/X86/copy-eflags.ll
index 796c1ecd8c712e701c66703a39c0867c08407ece..d98d8a7839b1d4cb8ed0313776a6670ce03bb011 100644
--- a/test/CodeGen/X86/copy-eflags.ll
+++ b/test/CodeGen/X86/copy-eflags.ll
@@ -9,19 +9,22 @@ target triple = "i686-unknown-linux-gnu"
 @.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
 
 ; CHECK-LABEL: func:
-; This tests whether eax is properly saved/restored around the lahf/sahf
-; instruction sequences.
+; This tests whether eax is properly saved/restored around the
+; lahf/sahf instruction sequences. We make the memory ops volatile to prevent
+; them from being reordered, which would avoid the spills.
+
+
 define i32 @func() {
 entry:
   %bval = load i8, i8* @b
   %inc = add i8 %bval, 1
-  store i8 %inc, i8* @b
-  %cval = load i32, i32* @c
+  store volatile i8 %inc, i8* @b
+  %cval = load volatile i32, i32* @c
   %inc1 = add nsw i32 %cval, 1
-  store i32 %inc1, i32* @c
-  %aval = load i8, i8* @a
+  store volatile i32 %inc1, i32* @c
+  %aval = load volatile i8, i8* @a
   %inc2 = add i8 %aval, 1
-  store i8 %inc2, i8* @a
+  store volatile i8 %inc2, i8* @a
 ; Copy flags produced by the incb of %inc1 to a register, need to save+restore
 ; eax around it. The flags will be reused by %tobool.
 ; CHECK: pushl %eax
diff --git a/test/CodeGen/X86/copy-propagation.ll b/test/CodeGen/X86/copy-propagation.ll
index dac46c173825c80b2490b781e6d1e6f602bb3570..4d8b8462b5fa3fd74ef6b63358f18be5a4784559 100644
--- a/test/CodeGen/X86/copy-propagation.ll
+++ b/test/CodeGen/X86/copy-propagation.ll
@@ -1,38 +1,25 @@
 ; RUN: llc %s -mattr=+avx -o - | FileCheck %s
-; PR21743.
+; Originally from http://llvm.org/PR21743.
 
 target triple = "x86_64-pc-win32-elf"
 
-; Check that copy propagation conservatively assumes that undef register
-; can be rewritten by the backend to break false dependencies for the
-; hardware.
-; In this function we are in this situation:
-; reg1 = copy reg2
-; = inst reg2<undef>
-; reg2 = copy reg1
-; Copy propagation used to remove the last copy.
-; This is incorrect because the undef flag on reg2 in inst, allows next
-; passes to put whatever trashed value in reg2 that may help.
-; In practice we end up with this code:
-; reg1 = copy reg2
-; reg2 = 0
-; = inst reg2<undef>
-; reg2 = copy reg1
-; Therefore, removing the last copy is wrong.
+; Copy propagation may remove COPYs if the result is only used by undef
+; operands.
 ;
 ; CHECK-LABEL: foo:
 ; CHECK: movl	$339752784, %e[[INDIRECT_CALL1:[a-z]+]]
 ; CHECK: callq *%r[[INDIRECT_CALL1]]
 ; Copy the result in a temporary.
-; Note: Technically the regalloc could have been smarter and this move not required,
-; which would have hidden the bug.
+; Note: Technically the regalloc could have been smarter and this move not
+; required, which would have hidden the bug.
 ; CHECK: vmovapd	%xmm0, [[TMP:%xmm[0-9]+]]
-; Crush xmm0.
-; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; CHECK-NOT: vxorps  %xmm0, %xmm0, %xmm0
+; CHECK-NEXT: vcvtsi2sdq      %rsi, %xmm0, %xmm6
 ; CHECK: movl	$339772768, %e[[INDIRECT_CALL2:[a-z]+]]
+; CHECK-NOT: vmovapd %xmm7, %xmm0
+; CHECK-NEXT: vmovapd %xmm6, %xmm1
 ; Set TMP in the first argument of the second call.
-; CHECK-NEXT: vmovapd	[[TMP]], %xmm0
-; CHECK: callq *%r[[INDIRECT_CALL2]]
+; CHECK-NEXT: callq *%r[[INDIRECT_CALL2]]
 ; CHECK: retq
 define double @foo(i64 %arg) {
 top:
diff --git a/test/CodeGen/X86/crash.ll b/test/CodeGen/X86/crash.ll
index a95b84d4c3b0681a78a10c0dcb74fecd3c58b463..4bdb2ddfab6293be0c2f4246a8284660c5fac72d 100644
--- a/test/CodeGen/X86/crash.ll
+++ b/test/CodeGen/X86/crash.ll
@@ -314,9 +314,9 @@ declare %t14* @_ZN4llvm9MCContext16CreateTempSymbolEv(%t2*)
 
 declare void @_ZNSt6vectorIN4llvm11MachineMoveESaIS1_EE13_M_insert_auxEN9__gnu_cxx17__normal_iteratorIPS1_S3_EERKS1_(%t10*, %t21* byval align 4, %t13*)
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
 
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
 
 ; PR10463
 ; Spilling a virtual register with <undef> uses.
diff --git a/test/CodeGen/X86/ctpop-combine.ll b/test/CodeGen/X86/ctpop-combine.ll
index 435401639f0510a8e5756e34956153495ffa52ab..b7031a817e82dc131c5485a2a52fa6b531e65257 100644
--- a/test/CodeGen/X86/ctpop-combine.ll
+++ b/test/CodeGen/X86/ctpop-combine.ll
@@ -36,11 +36,11 @@ define i32 @test2(i64 %x) nounwind readnone {
 define i32 @test3(i64 %x) nounwind readnone {
 ; CHECK-LABEL: test3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    popcntq %rdi, %rax
-; CHECK-NEXT:    andb $63, %al
-; CHECK-NEXT:    cmpb $2, %al
-; CHECK-NEXT:    sbbl %eax, %eax
-; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    popcntq %rdi, %rcx
+; CHECK-NEXT:    andb $63, %cl
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    cmpb $2, %cl
+; CHECK-NEXT:    setb %al
 ; CHECK-NEXT:    retq
   %count = tail call i64 @llvm.ctpop.i64(i64 %x)
   %cast = trunc i64 %count to i6 ; Too small for 0-64
diff --git a/test/CodeGen/X86/dag-fmf-cse.ll b/test/CodeGen/X86/dag-fmf-cse.ll
index ac8c5000aba4c5742ab5c10cce3cb44d7646f2a4..c12c49d0f40b5cfa34d8c833651ae3956df50ee2 100644
--- a/test/CodeGen/X86/dag-fmf-cse.ll
+++ b/test/CodeGen/X86/dag-fmf-cse.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=fma -enable-unsafe-fp-math -enable-fmf-dag=1 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=fma -enable-unsafe-fp-math | FileCheck %s
 
 ; If fast-math-flags are propagated correctly, the mul1 expression
 ; should be recognized as a factor in the last fsub, so we should
diff --git a/test/CodeGen/X86/dag-merge-fast-accesses.ll b/test/CodeGen/X86/dag-merge-fast-accesses.ll
index 867881d83d3f4559f1841946ac5d95472512787e..e5dfccb278cef634ac7ba92add26a547135f4d49 100644
--- a/test/CodeGen/X86/dag-merge-fast-accesses.ll
+++ b/test/CodeGen/X86/dag-merge-fast-accesses.ll
@@ -51,19 +51,11 @@ define void @merge_vec_element_store(<4 x double> %v, double* %ptr) {
 }
 
 
-;; TODO: FAST *should* be:
-;;    movups (%rdi), %xmm0
-;;    movups %xmm0, 40(%rdi)
-;; ..but is not currently. See the UseAA FIXME in DAGCombiner.cpp
-;; visitSTORE.
-
 define void @merge_vec_load_and_stores(i64 *%ptr) {
 ; FAST-LABEL: merge_vec_load_and_stores:
 ; FAST:       # BB#0:
-; FAST-NEXT:    movq (%rdi), %rax
-; FAST-NEXT:    movq 8(%rdi), %rcx
-; FAST-NEXT:    movq %rax, 40(%rdi)
-; FAST-NEXT:    movq %rcx, 48(%rdi)
+; FAST-NEXT:    movups (%rdi), %xmm0
+; FAST-NEXT:    movups %xmm0, 40(%rdi)
 ; FAST-NEXT:    retq
 ;
 ; SLOW-LABEL: merge_vec_load_and_stores:
diff --git a/test/CodeGen/X86/dag-update-nodetomatch.ll b/test/CodeGen/X86/dag-update-nodetomatch.ll
new file mode 100644
index 0000000000000000000000000000000000000000..45b6d020ce45033e90b7ced8aa6094d2f5650c58
--- /dev/null
+++ b/test/CodeGen/X86/dag-update-nodetomatch.ll
@@ -0,0 +1,241 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+%struct.i = type { i32, i24 }
+%struct.m = type { %struct.i }
+
+@a = local_unnamed_addr global i32 0, align 4
+@b = local_unnamed_addr global i16 0, align 2
+@c = local_unnamed_addr global i16 0, align 2
+@e = local_unnamed_addr global i16 0, align 2
+@l = local_unnamed_addr global %struct.i zeroinitializer, align 4
+@k = local_unnamed_addr global %struct.m zeroinitializer, align 4
+
+@x0 = local_unnamed_addr global double 0.000000e+00, align 8
+@x1 = local_unnamed_addr global i32 0, align 4
+@x2 = local_unnamed_addr global i32 0, align 4
+@x3 = local_unnamed_addr global i32 0, align 4
+@x4 = local_unnamed_addr global i32 0, align 4
+@x5 = local_unnamed_addr global double* null, align 8
+
+; Check that compiler does not crash.
+; Test for PR30775
+define void @_Z1nv() local_unnamed_addr {
+; CHECK-LABEL: _Z1nv:
+entry:
+  %bf.load = load i32, i32* bitcast (i24* getelementptr inbounds (%struct.m, %struct.m* @k, i64 0, i32 0, i32 1) to i32*), align 4
+  %0 = load i16, i16* @c, align 2
+  %conv = sext i16 %0 to i32
+  %1 = load i16, i16* @b, align 2
+  %conv1 = sext i16 %1 to i32
+  %2 = load i32, i32* @a, align 4
+  %tobool = icmp ne i32 %2, 0
+  %bf.load3 = load i32, i32* getelementptr inbounds (%struct.i, %struct.i* @l, i64 0, i32 0), align 4
+  %bf.shl = shl i32 %bf.load3, 7
+  %bf.ashr = ashr exact i32 %bf.shl, 7
+  %bf.clear = shl i32 %bf.load, 1
+  %factor = and i32 %bf.clear, 131070
+  %add13 = add nsw i32 %factor, %conv
+  %add15 = add nsw i32 %add13, %conv1
+  %bf.ashr.op = sub nsw i32 0, %bf.ashr
+  %add28 = select i1 %tobool, i32 %bf.ashr.op, i32 0
+  %tobool29 = icmp eq i32 %add15, %add28
+  %phitmp = icmp eq i32 %bf.ashr, 0
+  %.phitmp = or i1 %phitmp, %tobool29
+  %conv37 = zext i1 %.phitmp to i16
+  store i16 %conv37, i16* @e, align 2
+  %bf.clear39 = and i32 %bf.load, 65535
+  %factor53 = shl nuw nsw i32 %bf.clear39, 1
+  %add46 = add nsw i32 %factor53, %conv
+  %add48 = add nsw i32 %add46, %conv1
+  %add48.lobit = lshr i32 %add48, 31
+  %add48.lobit.not = xor i32 %add48.lobit, 1
+  %add51 = add nuw nsw i32 %add48.lobit.not, %bf.clear39
+  %shr = ashr i32 %2, %add51
+  %conv52 = trunc i32 %shr to i16
+  store i16 %conv52, i16* @b, align 2
+  ret void
+}
+
+; Test for PR31536
+define void @_Z2x6v() local_unnamed_addr {
+; CHECK-LABEL: _Z2x6v:
+entry:
+  %0 = load i32, i32* @x1, align 4
+  %and = and i32 %0, 511
+  %add = add nuw nsw i32 %and, 1
+  store i32 %add, i32* @x4, align 4
+  %.pr = load i32, i32* @x3, align 4
+  %tobool8 = icmp eq i32 %.pr, 0
+  br i1 %tobool8, label %for.end5, label %for.cond1thread-pre-split.lr.ph
+
+for.cond1thread-pre-split.lr.ph:                  ; preds = %entry
+  %idx.ext13 = zext i32 %add to i64
+  %x5.promoted = load double*, double** @x5, align 8
+  %x5.promoted9 = bitcast double* %x5.promoted to i8*
+  %1 = xor i32 %.pr, -1
+  %2 = zext i32 %1 to i64
+  %3 = shl nuw nsw i64 %2, 3
+  %4 = add nuw nsw i64 %3, 8
+  %5 = mul nuw nsw i64 %4, %idx.ext13
+  %uglygep = getelementptr i8, i8* %x5.promoted9, i64 %5
+  %.pr6.pre = load i32, i32* @x2, align 4
+  %6 = shl nuw nsw i32 %and, 3
+  %addconv = add nuw nsw i32 %6, 8
+  %7 = zext i32 %addconv to i64
+  %scevgep15 = getelementptr double, double* %x5.promoted, i64 1
+  %scevgep1516 = bitcast double* %scevgep15 to i8*
+  br label %for.cond1thread-pre-split
+
+for.cond1thread-pre-split:                        ; preds = %for.cond1thread-pre-split.lr.ph, %for.inc3
+  %indvar = phi i64 [ 0, %for.cond1thread-pre-split.lr.ph ], [ %indvar.next, %for.inc3 ]
+  %.pr6 = phi i32 [ %.pr6.pre, %for.cond1thread-pre-split.lr.ph ], [ %.pr611, %for.inc3 ]
+  %8 = phi double* [ %x5.promoted, %for.cond1thread-pre-split.lr.ph ], [ %add.ptr, %for.inc3 ]
+  %9 = phi i32 [ %.pr, %for.cond1thread-pre-split.lr.ph ], [ %inc4, %for.inc3 ]
+  %10 = mul i64 %7, %indvar
+  %uglygep14 = getelementptr i8, i8* %x5.promoted9, i64 %10
+  %uglygep17 = getelementptr i8, i8* %scevgep1516, i64 %10
+  %cmp7 = icmp slt i32 %.pr6, 0
+  br i1 %cmp7, label %for.body2.preheader, label %for.inc3
+
+for.body2.preheader:                              ; preds = %for.cond1thread-pre-split
+  %11 = sext i32 %.pr6 to i64
+  %12 = sext i32 %.pr6 to i64
+  %13 = icmp sgt i64 %12, -1
+  %smax = select i1 %13, i64 %12, i64 -1
+  %14 = add nsw i64 %smax, 1
+  %15 = sub nsw i64 %14, %12
+  %min.iters.check = icmp ult i64 %15, 4
+  br i1 %min.iters.check, label %for.body2.preheader21, label %min.iters.checked
+
+min.iters.checked:                                ; preds = %for.body2.preheader
+  %n.vec = and i64 %15, -4
+  %cmp.zero = icmp eq i64 %n.vec, 0
+  br i1 %cmp.zero, label %for.body2.preheader21, label %vector.memcheck
+
+vector.memcheck:                                  ; preds = %min.iters.checked
+  %16 = shl nsw i64 %11, 3
+  %scevgep = getelementptr i8, i8* %uglygep14, i64 %16
+  %17 = icmp sgt i64 %11, -1
+  %smax18 = select i1 %17, i64 %11, i64 -1
+  %18 = shl nsw i64 %smax18, 3
+  %scevgep19 = getelementptr i8, i8* %uglygep17, i64 %18
+  %bound0 = icmp ult i8* %scevgep, bitcast (double* @x0 to i8*)
+  %bound1 = icmp ugt i8* %scevgep19, bitcast (double* @x0 to i8*)
+  %memcheck.conflict = and i1 %bound0, %bound1
+  %ind.end = add nsw i64 %11, %n.vec
+  br i1 %memcheck.conflict, label %for.body2.preheader21, label %vector.body.preheader
+
+vector.body.preheader:                            ; preds = %vector.memcheck
+  %19 = add nsw i64 %n.vec, -4
+  %20 = lshr exact i64 %19, 2
+  %21 = and i64 %20, 1
+  %lcmp.mod = icmp eq i64 %21, 0
+  br i1 %lcmp.mod, label %vector.body.prol.preheader, label %vector.body.prol.loopexit.unr-lcssa
+
+vector.body.prol.preheader:                       ; preds = %vector.body.preheader
+  br label %vector.body.prol
+
+vector.body.prol:                                 ; preds = %vector.body.prol.preheader
+  %22 = load i64, i64* bitcast (double* @x0 to i64*), align 8
+  %23 = insertelement <2 x i64> undef, i64 %22, i32 0
+  %24 = shufflevector <2 x i64> %23, <2 x i64> undef, <2 x i32> zeroinitializer
+  %25 = insertelement <2 x i64> undef, i64 %22, i32 0
+  %26 = shufflevector <2 x i64> %25, <2 x i64> undef, <2 x i32> zeroinitializer
+  %27 = getelementptr inbounds double, double* %8, i64 %11
+  %28 = bitcast double* %27 to <2 x i64>*
+  store <2 x i64> %24, <2 x i64>* %28, align 8
+  %29 = getelementptr double, double* %27, i64 2
+  %30 = bitcast double* %29 to <2 x i64>*
+  store <2 x i64> %26, <2 x i64>* %30, align 8
+  br label %vector.body.prol.loopexit.unr-lcssa
+
+vector.body.prol.loopexit.unr-lcssa:              ; preds = %vector.body.preheader, %vector.body.prol
+  %index.unr.ph = phi i64 [ 4, %vector.body.prol ], [ 0, %vector.body.preheader ]
+  br label %vector.body.prol.loopexit
+
+vector.body.prol.loopexit:                        ; preds = %vector.body.prol.loopexit.unr-lcssa
+  %31 = icmp eq i64 %20, 0
+  br i1 %31, label %middle.block, label %vector.body.preheader.new
+
+vector.body.preheader.new:                        ; preds = %vector.body.prol.loopexit
+  %32 = load i64, i64* bitcast (double* @x0 to i64*), align 8
+  %33 = insertelement <2 x i64> undef, i64 %32, i32 0
+  %34 = shufflevector <2 x i64> %33, <2 x i64> undef, <2 x i32> zeroinitializer
+  %35 = insertelement <2 x i64> undef, i64 %32, i32 0
+  %36 = shufflevector <2 x i64> %35, <2 x i64> undef, <2 x i32> zeroinitializer
+  %37 = load i64, i64* bitcast (double* @x0 to i64*), align 8
+  %38 = insertelement <2 x i64> undef, i64 %37, i32 0
+  %39 = shufflevector <2 x i64> %38, <2 x i64> undef, <2 x i32> zeroinitializer
+  %40 = insertelement <2 x i64> undef, i64 %37, i32 0
+  %41 = shufflevector <2 x i64> %40, <2 x i64> undef, <2 x i32> zeroinitializer
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %vector.body.preheader.new
+  %index = phi i64 [ %index.unr.ph, %vector.body.preheader.new ], [ %index.next.1, %vector.body ]
+  %42 = add i64 %11, %index
+  %43 = getelementptr inbounds double, double* %8, i64 %42
+  %44 = bitcast double* %43 to <2 x i64>*
+  store <2 x i64> %34, <2 x i64>* %44, align 8
+  %45 = getelementptr double, double* %43, i64 2
+  %46 = bitcast double* %45 to <2 x i64>*
+  store <2 x i64> %36, <2 x i64>* %46, align 8
+  %index.next = add i64 %index, 4
+  %47 = add i64 %11, %index.next
+  %48 = getelementptr inbounds double, double* %8, i64 %47
+  %49 = bitcast double* %48 to <2 x i64>*
+  store <2 x i64> %39, <2 x i64>* %49, align 8
+  %50 = getelementptr double, double* %48, i64 2
+  %51 = bitcast double* %50 to <2 x i64>*
+  store <2 x i64> %41, <2 x i64>* %51, align 8
+  %index.next.1 = add i64 %index, 8
+  %52 = icmp eq i64 %index.next.1, %n.vec
+  br i1 %52, label %middle.block.unr-lcssa, label %vector.body
+
+middle.block.unr-lcssa:                           ; preds = %vector.body
+  br label %middle.block
+
+middle.block:                                     ; preds = %vector.body.prol.loopexit, %middle.block.unr-lcssa
+  %cmp.n = icmp eq i64 %15, %n.vec
+  br i1 %cmp.n, label %for.cond1.for.inc3_crit_edge, label %for.body2.preheader21
+
+for.body2.preheader21:                            ; preds = %middle.block, %vector.memcheck, %min.iters.checked, %for.body2.preheader
+  %indvars.iv.ph = phi i64 [ %11, %vector.memcheck ], [ %11, %min.iters.checked ], [ %11, %for.body2.preheader ], [ %ind.end, %middle.block ]
+  br label %for.body2
+
+for.body2:                                        ; preds = %for.body2.preheader21, %for.body2
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body2 ], [ %indvars.iv.ph, %for.body2.preheader21 ]
+  %53 = load i64, i64* bitcast (double* @x0 to i64*), align 8
+  %arrayidx = getelementptr inbounds double, double* %8, i64 %indvars.iv
+  %54 = bitcast double* %arrayidx to i64*
+  store i64 %53, i64* %54, align 8
+  %indvars.iv.next = add nsw i64 %indvars.iv, 1
+  %cmp = icmp slt i64 %indvars.iv, -1
+  br i1 %cmp, label %for.body2, label %for.cond1.for.inc3_crit_edge.loopexit
+
+for.cond1.for.inc3_crit_edge.loopexit:            ; preds = %for.body2
+  br label %for.cond1.for.inc3_crit_edge
+
+for.cond1.for.inc3_crit_edge:                     ; preds = %for.cond1.for.inc3_crit_edge.loopexit, %middle.block
+  %indvars.iv.next.lcssa = phi i64 [ %ind.end, %middle.block ], [ %indvars.iv.next, %for.cond1.for.inc3_crit_edge.loopexit ]
+  %55 = trunc i64 %indvars.iv.next.lcssa to i32
+  store i32 %55, i32* @x2, align 4
+  br label %for.inc3
+
+for.inc3:                                         ; preds = %for.cond1.for.inc3_crit_edge, %for.cond1thread-pre-split
+  %.pr611 = phi i32 [ %55, %for.cond1.for.inc3_crit_edge ], [ %.pr6, %for.cond1thread-pre-split ]
+  %inc4 = add nsw i32 %9, 1
+  %add.ptr = getelementptr inbounds double, double* %8, i64 %idx.ext13
+  %tobool = icmp eq i32 %inc4, 0
+  %indvar.next = add i64 %indvar, 1
+  br i1 %tobool, label %for.cond.for.end5_crit_edge, label %for.cond1thread-pre-split
+
+for.cond.for.end5_crit_edge:                      ; preds = %for.inc3
+  store i8* %uglygep, i8** bitcast (double** @x5 to i8**), align 8
+  store i32 0, i32* @x3, align 4
+  br label %for.end5
+
+for.end5:                                         ; preds = %for.cond.for.end5_crit_edge, %entry
+  ret void
+}
+
diff --git a/test/CodeGen/X86/dagcombine-cse.ll b/test/CodeGen/X86/dagcombine-cse.ll
index bff0e64910bf3d6340f65e7e9043db7caa88168f..a283bcc6d460c2878927be146980fd14e14c879a 100644
--- a/test/CodeGen/X86/dagcombine-cse.ll
+++ b/test/CodeGen/X86/dagcombine-cse.ll
@@ -1,7 +1,40 @@
-; REQUIRES: asserts
-; RUN: llc < %s -march=x86 -mattr=+sse2 -mtriple=i386-apple-darwin -stats 2>&1 | grep asm-printer | grep 13
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s --check-prefix=X64
 
 define i32 @t(i8* %ref_frame_ptr, i32 %ref_frame_stride, i32 %idxX, i32 %idxY) nounwind  {
+; X32-LABEL: t:
+; X32:       ## BB#0: ## %entry
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    imull {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movzwl 4(%eax,%ecx), %edx
+; X32-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-NEXT:    movd %edx, %xmm1
+; X32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; X32-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
+; X32-NEXT:    movd %xmm0, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: t:
+; X64:       ## BB#0: ## %entry
+; X64-NEXT:    ## kill: %EDX<def> %EDX<kill> %RDX<def>
+; X64-NEXT:    ## kill: %ESI<def> %ESI<kill> %RSI<def>
+; X64-NEXT:    imull %ecx, %esi
+; X64-NEXT:    leal (%rsi,%rdx), %eax
+; X64-NEXT:    cltq
+; X64-NEXT:    leal 4(%rsi,%rdx), %ecx
+; X64-NEXT:    movslq %ecx, %rcx
+; X64-NEXT:    movzwl (%rdi,%rcx), %ecx
+; X64-NEXT:    shlq $32, %rcx
+; X64-NEXT:    movl (%rdi,%rax), %eax
+; X64-NEXT:    orq %rcx, %rax
+; X64-NEXT:    movd %rax, %xmm0
+; X64-NEXT:    movq {{.*#+}} xmm0 = xmm0[0],zero
+; X64-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,2,4,5,6,7]
+; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    retq
 entry:
 	%tmp7 = mul i32 %idxY, %ref_frame_stride		; <i32> [#uses=2]
 	%tmp9 = add i32 %tmp7, %idxX		; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
index 7968777b0d88ce98a5942bf337955fcdd2bd43ae..7a19dd2a98d17eeac79d87d60752c4c8ea3db8b3 100644
--- a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
+++ b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
@@ -65,13 +65,13 @@ if.then:                                          ; preds = %entry
 
 if.end:                                           ; preds = %entry, %if.then
   %0 = getelementptr inbounds %struct.AAA3, %struct.AAA3* %var1, i64 0, i32 0, i64 0, !dbg !56
-  call void @llvm.lifetime.start(i64 4, i8* %0) #4, !dbg !56
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #4, !dbg !56
   tail call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !32, metadata !57), !dbg !58
   tail call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !36, metadata !46), !dbg !59
   tail call void @llvm.dbg.value(metadata i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0), i64 0, metadata !38, metadata !46), !dbg !62
   call void @_Z3fooPcjPKc(i8* %0, i32 4, i8* nonnull getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0)), !dbg !63
   %1 = getelementptr inbounds %struct.AAA3, %struct.AAA3* %var2, i64 0, i32 0, i64 0, !dbg !65
-  call void @llvm.lifetime.start(i64 4, i8* %1) #4, !dbg !65
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #4, !dbg !65
   call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !33, metadata !57), !dbg !66
   call void @llvm.dbg.value(metadata %struct.AAA3* %var2, i64 0, metadata !36, metadata !46), !dbg !67
   call void @llvm.dbg.value(metadata i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0), i64 0, metadata !38, metadata !46), !dbg !69
@@ -96,18 +96,18 @@ if.end3:                                          ; preds = %if.else, %if.then2
   call void @llvm.dbg.value(metadata %struct.AAA3* %var1, i64 0, metadata !41, metadata !46), !dbg !82
   call void @llvm.dbg.value(metadata i8* getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0), i64 0, metadata !42, metadata !46), !dbg !84
   call void @_Z3fooPcjPKc(i8* %0, i32 4, i8* nonnull getelementptr inbounds ([1 x i8], [1 x i8]* @.str, i64 0, i64 0)), !dbg !85
-  call void @llvm.lifetime.end(i64 4, i8* %1) #4, !dbg !86
-  call void @llvm.lifetime.end(i64 4, i8* %0) #4, !dbg !87
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %1) #4, !dbg !86
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %0) #4, !dbg !87
   ret void, !dbg !86
 }
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 
 declare i8* @_Z5i2stri(i32) #2
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 declare void @_Z3fooPcjPKc(i8*, i32, i8*) #2
 
diff --git a/test/CodeGen/X86/div-rem-simplify.ll b/test/CodeGen/X86/div-rem-simplify.ll
new file mode 100644
index 0000000000000000000000000000000000000000..04cf439dc1555d4d90c9a1616d5fe8d0a8cbf0d2
--- /dev/null
+++ b/test/CodeGen/X86/div-rem-simplify.ll
@@ -0,0 +1,187 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+; Div/rem by zero is undef.
+
+define i32 @srem0(i32 %x) {
+; CHECK-LABEL: srem0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %rem = srem i32 %x, 0
+  ret i32 %rem
+}
+
+define i32 @urem0(i32 %x) {
+; CHECK-LABEL: urem0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %rem = urem i32 %x, 0
+  ret i32 %rem
+}
+
+define i32 @sdiv0(i32 %x) {
+; CHECK-LABEL: sdiv0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %div = sdiv i32 %x, 0
+  ret i32 %div
+}
+
+define i32 @udiv0(i32 %x) {
+; CHECK-LABEL: udiv0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %div = udiv i32 %x, 0
+  ret i32 %div
+}
+
+; Div/rem by a zero vector is undef.
+
+define <4 x i32> @srem_vec0(<4 x i32> %x) {
+; CHECK-LABEL: srem_vec0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %rem = srem <4 x i32> %x, zeroinitializer
+  ret <4 x i32> %rem
+}
+
+define <4 x i32> @urem_vec0(<4 x i32> %x) {
+; CHECK-LABEL: urem_vec0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %rem = urem <4 x i32> %x, zeroinitializer
+  ret <4 x i32> %rem
+}
+
+define <4 x i32> @sdiv_vec0(<4 x i32> %x) {
+; CHECK-LABEL: sdiv_vec0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %div = sdiv <4 x i32> %x, zeroinitializer
+  ret <4 x i32> %div
+}
+
+define <4 x i32> @udiv_vec0(<4 x i32> %x) {
+; CHECK-LABEL: udiv_vec0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %div = udiv <4 x i32> %x, zeroinitializer
+  ret <4 x i32> %div
+}
+
+; Make sure we handle undef before we try to fold constants from the select with the 0.
+; These used to assert because we can't fold div/rem-by-0 into APInt.
+
+define i32 @sel_urem0(i1 %cond) {
+; CHECK-LABEL: sel_urem0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, i32 23, i32 234
+  %rem = urem i32 %sel, 0
+  ret i32 %rem
+}
+
+define i32 @sel_srem0(i1 %cond) {
+; CHECK-LABEL: sel_srem0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, i32 23, i32 234
+  %rem = srem i32 %sel, 0
+  ret i32 %rem
+}
+
+define i32 @sel_udiv0(i1 %cond) {
+; CHECK-LABEL: sel_udiv0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, i32 23, i32 234
+  %div = udiv i32 %sel, 0
+  ret i32 %div
+}
+
+define i32 @sel_sdiv0(i1 %cond) {
+; CHECK-LABEL: sel_sdiv0:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, i32 23, i32 234
+  %div = sdiv i32 %sel, 0
+  ret i32 %div
+}
+
+; Make sure we handle undef before we try to fold constants from the select with the vector 0.
+; These used to assert because we can't fold div/rem-by-0 into APInt.
+
+define <4 x i32> @sel_urem0_vec(i1 %cond) {
+; CHECK-LABEL: sel_urem0_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, <4 x i32> <i32 -1, i32 0, i32 1, i32 2>, <4 x i32> <i32 11, i32 12, i32 13, i32 14>
+  %rem = urem <4 x i32> %sel, zeroinitializer
+  ret <4 x i32> %rem
+}
+
+define <4 x i32> @sel_srem0_vec(i1 %cond) {
+; CHECK-LABEL: sel_srem0_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, <4 x i32> <i32 -1, i32 0, i32 1, i32 2>, <4 x i32> <i32 11, i32 12, i32 13, i32 14>
+  %rem = srem <4 x i32> %sel, zeroinitializer
+  ret <4 x i32> %rem
+}
+
+define <4 x i32> @sel_udiv0_vec(i1 %cond) {
+; CHECK-LABEL: sel_udiv0_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, <4 x i32> <i32 -1, i32 0, i32 1, i32 2>, <4 x i32> <i32 11, i32 12, i32 13, i32 14>
+  %div = udiv <4 x i32> %sel, zeroinitializer
+  ret <4 x i32> %div
+}
+
+define <4 x i32> @sel_sdiv0_vec(i1 %cond) {
+; CHECK-LABEL: sel_sdiv0_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, <4 x i32> <i32 -1, i32 0, i32 1, i32 2>, <4 x i32> <i32 11, i32 12, i32 13, i32 14>
+  %div = sdiv <4 x i32> %sel, zeroinitializer
+  ret <4 x i32> %div
+}
+
+; If any element of a constant divisor vector is zero, the whole op is undef.
+
+define <4 x i32> @sdiv0elt_vec(<4 x i32> %x) {
+; CHECK-LABEL: sdiv0elt_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %zero = and <4 x i32> %x, <i32 0, i32 0, i32 0, i32 0>
+  %some_ones = or <4 x i32> %zero, <i32 0, i32 -1, i32 0, i32 3>
+  %div = sdiv <4 x i32> <i32 -11, i32 -12, i32 -13, i32 -14>, %some_ones
+  ret <4 x i32> %div
+}
+
+define <4 x i32> @udiv0elt_vec(<4 x i32> %x) {
+; CHECK-LABEL: udiv0elt_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %div = udiv <4 x i32> <i32 11, i32 12, i32 13, i32 14>, <i32 0, i32 3, i32 4, i32 0>
+  ret <4 x i32> %div
+}
+
+define <4 x i32> @urem0elt_vec(<4 x i32> %x) {
+; CHECK-LABEL: urem0elt_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %zero = and <4 x i32> %x, <i32 0, i32 0, i32 0, i32 0>
+  %some_ones = or <4 x i32> %zero, <i32 0, i32 0, i32 0, i32 3>
+  %rem = urem <4 x i32> <i32 11, i32 12, i32 13, i32 14>, %some_ones
+  ret <4 x i32> %rem
+}
+
+define <4 x i32> @srem0elt_vec(<4 x i32> %x) {
+; CHECK-LABEL: srem0elt_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    retq
+  %rem = srem <4 x i32> <i32 -11, i32 -12, i32 -13, i32 -14>, <i32 -3, i32 -3, i32 0, i32 2>
+  ret <4 x i32> %rem
+}
+
diff --git a/test/CodeGen/X86/divrem8_ext.ll b/test/CodeGen/X86/divrem8_ext.ll
index fc516001aa59b612012293c9f72aceb3ad6ad6d4..7521156a370e9dec125879f13e4af37e65e4a484 100644
--- a/test/CodeGen/X86/divrem8_ext.ll
+++ b/test/CodeGen/X86/divrem8_ext.ll
@@ -206,8 +206,7 @@ define i64 @pr25754(i8 %a, i8 %c) {
 ; X32-NEXT:    movzbl %ah, %ecx # NOREX
 ; X32-NEXT:    movzbl %al, %eax
 ; X32-NEXT:    addl %ecx, %eax
-; X32-NEXT:    sbbl %edx, %edx
-; X32-NEXT:    andl $1, %edx
+; X32-NEXT:    xorl %edx, %edx
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: pr25754:
diff --git a/test/CodeGen/X86/dont-trunc-store-double-to-float.ll b/test/CodeGen/X86/dont-trunc-store-double-to-float.ll
index 8a334d21631a7fa73ad7914015d857636f78dbda..05245d0d9e1e11c33c7ab801fb7c28366e5b4f57 100644
--- a/test/CodeGen/X86/dont-trunc-store-double-to-float.ll
+++ b/test/CodeGen/X86/dont-trunc-store-double-to-float.ll
@@ -1,9 +1,9 @@
 ; RUN: llc -march=x86 < %s | FileCheck %s
 
 ; CHECK-LABEL: @bar
-; CHECK: movl $1074339512,
-; CHECK: movl $1374389535,
-; CHECK: movl $1078523331,
+; CHECK-DAG: movl $1074339512,
+; CHECK-DAG: movl $1374389535,
+; CHECK-DAG: movl $1078523331,
 define void @bar() unnamed_addr {
 entry-block:
   %a = alloca double
diff --git a/test/CodeGen/X86/dwarf-headers.ll b/test/CodeGen/X86/dwarf-headers.ll
new file mode 100644
index 0000000000000000000000000000000000000000..612807dd8123e106e2d474c4d9b8bee551371c7f
--- /dev/null
+++ b/test/CodeGen/X86/dwarf-headers.ll
@@ -0,0 +1,109 @@
+; RUN: llc -split-dwarf=Disable -dwarf-version=4 -generate-type-units \
+; RUN:     -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \
+; RUN:     | llvm-dwarfdump - | FileCheck %s --check-prefix=SINGLE-4
+
+; RUN: llc -split-dwarf=Enable -dwarf-version=4 -generate-type-units \
+; RUN:     -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \
+; RUN:     | llvm-dwarfdump - | FileCheck %s --check-prefix=SPLIT-4
+
+; RUN: llc -split-dwarf=Disable -dwarf-version=5 -generate-type-units \
+; RUN:     -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \
+; RUN:     | llvm-dwarfdump - | FileCheck %s --check-prefix=SINGLE-5
+
+; RUN: llc -split-dwarf=Enable -dwarf-version=5 -generate-type-units \
+; RUN:     -filetype=obj -O0 -mtriple=x86_64-unknown-linux-gnu < %s \
+; RUN:     | llvm-dwarfdump - | FileCheck %s --check-prefix=SPLIT-5
+
+; Looking for DWARF headers to be generated correctly.
+; There are 7 variants: v4 CU, v4 TU, v5 (normal/skeleton/split) CU,
+; v5 (normal/split) TU.  Within v5, the CU variants (and likewise the TU
+; variants) differ from one another only in the unit_type code.
+; (v2 through v4 CUs are all the same, and TUs were invented in v4,
+; so we don't bother checking older versions.)
+
+; Test case built from:
+;struct S {
+;  int s1;
+;};
+;
+;S s;
+
+; Verify the v4 non-split headers.
+; Note that we check the exact offset of the DIEs because that tells us
+; the length of the header.
+;
+; SINGLE-4: .debug_info contents:
+; SINGLE-4: 0x00000000: Compile Unit: {{.*}} version = 0x0004 abbr_offset
+; SINGLE-4: 0x0000000b: DW_TAG_compile_unit
+;
+; SINGLE-4: .debug_types contents:
+; SINGLE-4: 0x00000000: Type Unit: {{.*}} version = 0x0004 abbr_offset
+; SINGLE-4: 0x00000017: DW_TAG_type_unit
+
+; Verify the v4 split headers.
+;
+; SPLIT-4: .debug_info contents:
+; SPLIT-4: 0x00000000: Compile Unit: {{.*}} version = 0x0004 abbr_offset
+; SPLIT-4: 0x0000000b: DW_TAG_compile_unit
+;
+; SPLIT-4: .debug_info.dwo contents:
+; SPLIT-4: 0x00000000: Compile Unit: {{.*}} version = 0x0004 abbr_offset
+; SPLIT-4: 0x0000000b: DW_TAG_compile_unit
+;
+; SPLIT-4: .debug_types.dwo contents:
+; SPLIT-4: 0x00000000: Type Unit: {{.*}} version = 0x0004 abbr_offset
+; SPLIT-4: 0x00000017: DW_TAG_type_unit
+
+; Verify the v5 non-split headers.
+;
+; SINGLE-5: .debug_info contents:
+; SINGLE-5: 0x00000000: Compile Unit: {{.*}} version = 0x0005 unit_type = DW_UT_compile abbr_offset
+; SINGLE-5: 0x0000000c: DW_TAG_compile_unit
+;
+; FIXME: V5 wants type units in .debug_info not .debug_types.
+; SINGLE-5: .debug_types contents:
+; SINGLE-5: 0x00000000: Type Unit: {{.*}} version = 0x0005 unit_type = DW_UT_type abbr_offset
+; SINGLE-5: 0x00000018: DW_TAG_type_unit
+
+; Verify the v5 split headers.
+;
+; SPLIT-5: .debug_info contents:
+; SPLIT-5: 0x00000000: Compile Unit: {{.*}} version = 0x0005 unit_type = DW_UT_skeleton abbr_offset
+; SPLIT-5: 0x0000000c: DW_TAG_compile_unit
+;
+; SPLIT-5: .debug_info.dwo contents:
+; SPLIT-5: 0x00000000: Compile Unit: {{.*}} version = 0x0005 unit_type = DW_UT_split_compile abbr_offset
+; SPLIT-5: 0x0000000c: DW_TAG_compile_unit
+;
+; FIXME: V5 wants type units in .debug_info.dwo not .debug_types.dwo.
+; SPLIT-5: .debug_types.dwo contents:
+; SPLIT-5: 0x00000000: Type Unit: {{.*}} version = 0x0005 unit_type = DW_UT_split_type abbr_offset
+; SPLIT-5: 0x00000018: DW_TAG_type_unit
+
+
+; ModuleID = 't.cpp'
+source_filename = "t.cpp"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.S = type { i32 }
+
+@s = global %struct.S zeroinitializer, align 4, !dbg !0
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!10, !11}
+!llvm.ident = !{!12}
+
+!0 = !DIGlobalVariableExpression(var: !1)
+!1 = distinct !DIGlobalVariable(name: "s", scope: !2, file: !3, line: 5, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 5.0.0 (trunk 295942)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "t.cpp", directory: "/home/probinson/projects/scratch")
+!4 = !{}
+!5 = !{!0}
+!6 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "S", file: !3, line: 1, size: 32, elements: !7, identifier: "_ZTS1S")
+!7 = !{!8}
+!8 = !DIDerivedType(tag: DW_TAG_member, name: "s1", scope: !6, file: !3, line: 2, baseType: !9, size: 32)
+!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!10 = !{i32 2, !"Dwarf Version", i32 4}
+!11 = !{i32 2, !"Debug Info Version", i32 3}
+!12 = !{!"clang version 5.0.0 (trunk 295942)"}
diff --git a/test/CodeGen/X86/dynamic-alloca-lifetime.ll b/test/CodeGen/X86/dynamic-alloca-lifetime.ll
index 034b074ef9bd4df144c8497f1ee94d801d6c0597..996eec05163de601cee190f916a75815fe8ff0a1 100644
--- a/test/CodeGen/X86/dynamic-alloca-lifetime.ll
+++ b/test/CodeGen/X86/dynamic-alloca-lifetime.ll
@@ -10,10 +10,10 @@ target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
 target triple = "i386-apple-macosx10.10.0"
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
 
 ; Function Attrs: ssp
 define void @foo(i1 %cond1, i1 %cond2) #1 {
@@ -30,11 +30,11 @@ end1:
 
 if.else130:                                       ; preds = %bb1
   %tmp = getelementptr inbounds [8192 x i8], [8192 x i8]* %bitmapBuffer, i32 0, i32 0
-  call void @llvm.lifetime.start(i64 8192, i8* %tmp) #0
-  call void @llvm.lifetime.end(i64 8192, i8* %tmp) #0
+  call void @llvm.lifetime.start.p0i8(i64 8192, i8* %tmp) #0
+  call void @llvm.lifetime.end.p0i8(i64 8192, i8* %tmp) #0
   %tmp25 = getelementptr inbounds [8192 x i8], [8192 x i8]* %bitmapBuffer229, i32 0, i32 0
-  call void @llvm.lifetime.start(i64 8192, i8* %tmp25) #0
-  call void @llvm.lifetime.end(i64 8192, i8* %tmp25) #0
+  call void @llvm.lifetime.start.p0i8(i64 8192, i8* %tmp25) #0
+  call void @llvm.lifetime.end.p0i8(i64 8192, i8* %tmp25) #0
   br label %end1
 }
 
diff --git a/test/CodeGen/X86/elf-associated.ll b/test/CodeGen/X86/elf-associated.ll
new file mode 100644
index 0000000000000000000000000000000000000000..361cf66cce728092fa0dfc329f02769b1a522e6b
--- /dev/null
+++ b/test/CodeGen/X86/elf-associated.ll
@@ -0,0 +1,39 @@
+; RUN: llc -data-sections=1 -mtriple x86_64-pc-linux-gnu < %s | FileCheck %s
+; RUN: llc -data-sections=0 -mtriple x86_64-pc-linux-gnu < %s | FileCheck %s
+
+@a = global i32 1
+@b = global i32 2, !associated !0
+!0 = !{i32* @a}
+; CHECK-DAG: .section .data.b,"awo",@progbits,a
+
+; A cycle of !associated references is OK. Also, normally -data-sections=0 would place @c and @d in the same section. !associated prevents that.
+@c = global i32 2, !associated !2
+@d = global i32 2, !associated !1
+!1 = !{i32* @c}
+!2 = !{i32* @d}
+; CHECK-DAG: .section .data.c,"awo",@progbits,d
+; CHECK-DAG: .section .data.d,"awo",@progbits,c
+
+; BSS is OK.
+@e = global i32 0
+@f = global i32 0, !associated !3
+@g = global i32 1, !associated !3
+!3 = !{i32* @e}
+; CHECK-DAG: .section .bss.f,"awo",@nobits,e
+; CHECK-DAG: .section .data.g,"awo",@progbits,e
+
+; Explicit sections.
+@h = global i32 1, section "aaa"
+@i = global i32 1, section "bbb", !associated !4
+@j = global i32 1, section "bbb", !associated !4
+@k = global i32 1, !associated !4
+!4 = !{i32* @h}
+; CHECK-DAG: .section	aaa,"aw",@progbits
+; CHECK-DAG: .section	bbb,"awo",@progbits,h,unique,1
+; CHECK-DAG: .section	bbb,"awo",@progbits,h,unique,2
+; CHECK-DAG: .section	.data.k,"awo",@progbits,h
+
+; Non-GlobalObject metadata: the !associated operand is ignored, so the section gets no "o" flag.
+@l = global i32 1, section "ccc", !associated !5
+!5 = !{i32* null}
+; CHECK-DAG: .section	ccc,"aw",@progbits
diff --git a/test/CodeGen/X86/evex-to-vex-compress.mir b/test/CodeGen/X86/evex-to-vex-compress.mir
index 099189119ed1a62b9fb1cd109037461067f214ac..2295ddb5b2b9d67f3c9073019b3e5f14eb9b1e0e 100755
--- a/test/CodeGen/X86/evex-to-vex-compress.mir
+++ b/test/CodeGen/X86/evex-to-vex-compress.mir
@@ -119,6 +119,14 @@ body: |
   %ymm0 = VPANDQZ256rm                         %ymm0, %rip, 1, _, %rax, _                     
   ; CHECK: %ymm0 = VPANDYrr                    %ymm0, %ymm1
   %ymm0 = VPANDQZ256rr                         %ymm0, %ymm1                                   
+  ; CHECK: %ymm0 = VPANDNYrm                    %ymm0, %rip, 1, _, %rax, _
+  %ymm0 = VPANDNDZ256rm                         %ymm0, %rip, 1, _, %rax, _                     
+  ; CHECK: %ymm0 = VPANDNYrr                    %ymm0, %ymm1  
+  %ymm0 = VPANDNDZ256rr                         %ymm0, %ymm1                                   
+  ; CHECK: %ymm0 = VPANDNYrm                    %ymm0, %rip, 1, _, %rax, _
+  %ymm0 = VPANDNQZ256rm                         %ymm0, %rip, 1, _, %rax, _                     
+  ; CHECK: %ymm0 = VPANDNYrr                    %ymm0, %ymm1
+  %ymm0 = VPANDNQZ256rr                         %ymm0, %ymm1                                   
   ; CHECK: %ymm0 = VPAVGBYrm                   %ymm0, %rip, 1, _, %rax, _
   %ymm0 = VPAVGBZ256rm                         %ymm0, %rip, 1, _, %rax, _                     
   ; CHECK: %ymm0 = VPAVGBYrr                   %ymm0, %ymm1
@@ -347,13 +355,13 @@ body: |
   %ymm0 = VMAXCPSZ256rm                        %ymm0, %rip, 1, _, %rax, _                     
   ; CHECK: %ymm0 = VMAXCPSYrr                  %ymm0, %ymm1
   %ymm0 = VMAXCPSZ256rr                        %ymm0, %ymm1                                   
-  ; CHECK: %ymm0 = VMAXPDYrm                   %ymm0, %rip, 1, _, %rax, _
+  ; CHECK: %ymm0 = VMAXCPDYrm                  %ymm0, %rip, 1, _, %rax, _
   %ymm0 = VMAXPDZ256rm                         %ymm0, %rip, 1, _, %rax, _                     
-  ; CHECK: %ymm0 = VMAXPDYrr                   %ymm0, %ymm1
+  ; CHECK: %ymm0 = VMAXCPDYrr                  %ymm0, %ymm1
   %ymm0 = VMAXPDZ256rr                         %ymm0, %ymm1                                   
-  ; CHECK: %ymm0 = VMAXPSYrm                   %ymm0, %rip, 1, _, %rax, _
+  ; CHECK: %ymm0 = VMAXCPSYrm                  %ymm0, %rip, 1, _, %rax, _
   %ymm0 = VMAXPSZ256rm                         %ymm0, %rip, 1, _, %rax, _                     
-  ; CHECK: %ymm0 = VMAXPSYrr                   %ymm0, %ymm1
+  ; CHECK: %ymm0 = VMAXCPSYrr                  %ymm0, %ymm1
   %ymm0 = VMAXPSZ256rr                         %ymm0, %ymm1                                   
   ; CHECK: %ymm0 = VMINCPDYrm                  %ymm0, %rip, 1, _, %rax, _
   %ymm0 = VMINCPDZ256rm                        %ymm0, %rip, 1, _, %rax, _                     
@@ -363,13 +371,13 @@ body: |
   %ymm0 = VMINCPSZ256rm                        %ymm0, %rip, 1, _, %rax, _                     
   ; CHECK: %ymm0 = VMINCPSYrr                  %ymm0, %ymm1
   %ymm0 = VMINCPSZ256rr                        %ymm0, %ymm1                                   
-  ; CHECK: %ymm0 = VMINPDYrm                   %ymm0, %rip, 1, _, %rax, _
+  ; CHECK: %ymm0 = VMINCPDYrm                  %ymm0, %rip, 1, _, %rax, _
   %ymm0 = VMINPDZ256rm                         %ymm0, %rip, 1, _, %rax, _                     
-  ; CHECK: %ymm0 = VMINPDYrr                   %ymm0, %ymm1
+  ; CHECK: %ymm0 = VMINCPDYrr                  %ymm0, %ymm1
   %ymm0 = VMINPDZ256rr                         %ymm0, %ymm1                                   
-  ; CHECK: %ymm0 = VMINPSYrm                   %ymm0, %rip, 1, _, %rax, _
+  ; CHECK: %ymm0 = VMINCPSYrm                  %ymm0, %rip, 1, _, %rax, _
   %ymm0 = VMINPSZ256rm                         %ymm0, %rip, 1, _, %rax, _                     
-  ; CHECK: %ymm0 = VMINPSYrr                   %ymm0, %ymm1
+  ; CHECK: %ymm0 = VMINCPSYrr                  %ymm0, %ymm1
   %ymm0 = VMINPSZ256rr                         %ymm0, %ymm1                                   
   ; CHECK: %ymm0 = VXORPDYrm                   %ymm0, %rip, 1, _, %rax, _
   %ymm0 = VXORPDZ256rm                         %ymm0, %rip, 1, _, %rax, _                     
@@ -687,6 +695,12 @@ body: |
   %ymm0 = VPMOVZXWQZ256rm                      %rip, 1, _, %rax, _                            
   ; CHECK: %ymm0 = VPMOVZXWQYrr                %xmm0                                      
   %ymm0 = VPMOVZXWQZ256rr                      %xmm0                                                 
+  ; CHECK: %ymm0 = VBROADCASTF128              %rip, 1, _, %rax, _
+  %ymm0 = VBROADCASTF32X4Z256rm                %rip, 1, _, %rax, _
+  ; CHECK: %ymm0 = VBROADCASTSDYrm             %rip, 1, _, %rax, _
+  %ymm0 = VBROADCASTF32X2Z256m                 %rip, 1, _, %rax, _
+  ; CHECK: %ymm0 = VBROADCASTSDYrr             %xmm0
+  %ymm0 = VBROADCASTF32X2Z256r                 %xmm0
   ; CHECK: %ymm0 = VBROADCASTSDYrm             %rip, 1, _, %rax, _
   %ymm0 = VBROADCASTSDZ256m                    %rip, 1, _, %rax, _                            
   ; CHECK: %ymm0 = VBROADCASTSDYrr             %xmm0
@@ -707,6 +721,12 @@ body: |
   %ymm0 = VPBROADCASTWZ256m                    %rip, 1, _, %rax, _                            
   ; CHECK: %ymm0 = VPBROADCASTWYrr             %xmm0
   %ymm0 = VPBROADCASTWZ256r                    %xmm0                                          
+  ; CHECK: %ymm0 = VBROADCASTI128              %rip, 1, _, %rax, _
+  %ymm0 = VBROADCASTI32X4Z256rm                %rip, 1, _, %rax, _
+  ; CHECK: %ymm0 = VPBROADCASTQYrm             %rip, 1, _, %rax, _
+  %ymm0 = VBROADCASTI32X2Z256m                 %rip, 1, _, %rax, _
+  ; CHECK: %ymm0 = VPBROADCASTQYrr             %xmm0
+  %ymm0 = VBROADCASTI32X2Z256r                 %xmm0
   ; CHECK: %ymm0 = VPBROADCASTQYrm             %rip, 1, _, %rax, _  
   %ymm0 = VPBROADCASTQZ256m                    %rip, 1, _, %rax, _                            
   ; CHECK: %ymm0 = VPBROADCASTQYrr             %xmm0                                      
@@ -1039,13 +1059,13 @@ body: |
   %xmm0 = VMAXCPSZ128rm                        %xmm0, %rip, 1, _, %rax, _                          
   ; CHECK: %xmm0 = VMAXCPSrr                   %xmm0, %xmm1
   %xmm0 = VMAXCPSZ128rr                        %xmm0, %xmm1                                        
-  ; CHECK: %xmm0 = VMAXPDrm                    %xmm0, %rip, 1, _, %rax, _
+  ; CHECK: %xmm0 = VMAXCPDrm                   %xmm0, %rip, 1, _, %rax, _
   %xmm0 = VMAXPDZ128rm                         %xmm0, %rip, 1, _, %rax, _                          
-  ; CHECK: %xmm0 = VMAXPDrr                    %xmm0, %xmm1
+  ; CHECK: %xmm0 = VMAXCPDrr                   %xmm0, %xmm1
   %xmm0 = VMAXPDZ128rr                         %xmm0, %xmm1                                        
-  ; CHECK: %xmm0 = VMAXPSrm                    %xmm0, %rip, 1, _, %rax, _
+  ; CHECK: %xmm0 = VMAXCPSrm                   %xmm0, %rip, 1, _, %rax, _
   %xmm0 = VMAXPSZ128rm                         %xmm0, %rip, 1, _, %rax, _                          
-  ; CHECK: %xmm0 = VMAXPSrr                    %xmm0, %xmm1
+  ; CHECK: %xmm0 = VMAXCPSrr                   %xmm0, %xmm1
   %xmm0 = VMAXPSZ128rr                         %xmm0, %xmm1                                        
   ; CHECK: %xmm0 = VMINCPDrm                   %xmm0, %rip, 1, _, %rax, _
   %xmm0 = VMINCPDZ128rm                        %xmm0, %rip, 1, _, %rax, _                          
@@ -1055,13 +1075,13 @@ body: |
   %xmm0 = VMINCPSZ128rm                        %xmm0, %rip, 1, _, %rax, _                          
   ; CHECK: %xmm0 = VMINCPSrr                   %xmm0, %xmm1
   %xmm0 = VMINCPSZ128rr                        %xmm0, %xmm1                                        
-  ; CHECK: %xmm0 = VMINPDrm                    %xmm0, %rip, 1, _, %rax, _
+  ; CHECK: %xmm0 = VMINCPDrm                   %xmm0, %rip, 1, _, %rax, _
   %xmm0 = VMINPDZ128rm                         %xmm0, %rip, 1, _, %rax, _                          
-  ; CHECK: %xmm0 = VMINPDrr                    %xmm0, %xmm1
+  ; CHECK: %xmm0 = VMINCPDrr                   %xmm0, %xmm1
   %xmm0 = VMINPDZ128rr                         %xmm0, %xmm1                                        
-  ; CHECK: %xmm0 = VMINPSrm                    %xmm0, %rip, 1, _, %rax, _
+  ; CHECK: %xmm0 = VMINCPSrm                   %xmm0, %rip, 1, _, %rax, _
   %xmm0 = VMINPSZ128rm                         %xmm0, %rip, 1, _, %rax, _                          
-  ; CHECK: %xmm0 = VMINPSrr                    %xmm0, %xmm1
+  ; CHECK: %xmm0 = VMINCPSrr                   %xmm0, %xmm1
   %xmm0 = VMINPSZ128rr                         %xmm0, %xmm1                                        
   ; CHECK: %xmm0 = VMULPDrm                    %xmm0, %rip, 1, _, %rax, _
   %xmm0 = VMULPDZ128rm                         %xmm0, %rip, 1, _, %rax, _                          
@@ -1119,6 +1139,14 @@ body: |
   %xmm0 = VPANDQZ128rm                         %xmm0, %rip, 1, _, %rax, _                          
   ; CHECK: %xmm0 = VPANDrr                     %xmm0, %xmm1  
   %xmm0 = VPANDQZ128rr                         %xmm0, %xmm1                                        
+  ; CHECK: %xmm0 = VPANDNrm                    %xmm0, %rip, 1, _, %rax, _
+  %xmm0 = VPANDNDZ128rm                        %xmm0, %rip, 1, _, %rax, _                          
+  ; CHECK: %xmm0 = VPANDNrr                    %xmm0, %xmm1
+  %xmm0 = VPANDNDZ128rr                        %xmm0, %xmm1                                        
+  ; CHECK: %xmm0 = VPANDNrm                    %xmm0, %rip, 1, _, %rax, _
+  %xmm0 = VPANDNQZ128rm                        %xmm0, %rip, 1, _, %rax, _                          
+  ; CHECK: %xmm0 = VPANDNrr                    %xmm0, %xmm1  
+  %xmm0 = VPANDNQZ128rr                        %xmm0, %xmm1                                        
   ; CHECK: %xmm0 = VPAVGBrm                    %xmm0, %rip, 1, _, %rax, _
   %xmm0 = VPAVGBZ128rm                         %xmm0, %rip, 1, _, %rax, _                          
   ; CHECK: %xmm0 = VPAVGBrr                    %xmm0, %xmm1  
@@ -1707,6 +1735,10 @@ body: |
   %xmm0 = VPBROADCASTWZ128m                    %rip, _, _, _, _                                    
   ; CHECK: %xmm0 = VPBROADCASTWrr              %xmm0                                   
   %xmm0 = VPBROADCASTWZ128r                    %xmm0                                                                                             
+  ; CHECK: %xmm0 = VPBROADCASTQrm              %rip, _, _, _, _
+  %xmm0 = VBROADCASTI32X2Z128m                 %rip, _, _, _, _
+  ; CHECK: %xmm0 = VPBROADCASTQrr              %xmm0
+  %xmm0 = VBROADCASTI32X2Z128r                 %xmm0
   ; CHECK: %xmm0 = VCVTPS2PHrr                 %xmm0, 2
   %xmm0 = VCVTPS2PHZ128rr                      %xmm0, 2                                            
   ; CHECK: VCVTPS2PHmr                         %rdi, %xmm0, 1, _, 0, _, _              
@@ -1778,19 +1810,19 @@ body: |
   %xmm0 = VMAXCSSZrm                           %xmm0, %rip, 1, _, %rax, _                              
   ; CHECK: %xmm0 = VMAXCSSrr                   %xmm0, %xmm1
   %xmm0 = VMAXCSSZrr                           %xmm0, %xmm1                                            
-  ; CHECK: %xmm0 = VMAXSDrm                    %xmm0, %rip, 1, _, %rax, _
+  ; CHECK: %xmm0 = VMAXCSDrm                   %xmm0, %rip, 1, _, %rax, _
   %xmm0 = VMAXSDZrm                            %xmm0, %rip, 1, _, %rax, _                              
   ; CHECK: %xmm0 = VMAXSDrm_Int                %xmm0, %rip, 1, _, %rax, _
   %xmm0 = VMAXSDZrm_Int                        %xmm0, %rip, 1, _, %rax, _                              
-  ; CHECK: %xmm0 = VMAXSDrr                    %xmm0, %xmm1
+  ; CHECK: %xmm0 = VMAXCSDrr                   %xmm0, %xmm1
   %xmm0 = VMAXSDZrr                            %xmm0, %xmm1                                            
   ; CHECK: %xmm0 = VMAXSDrr_Int                %xmm0, %xmm1
   %xmm0 = VMAXSDZrr_Int                        %xmm0, %xmm1                                            
-  ; CHECK: %xmm0 = VMAXSSrm                    %xmm0, %rip, 1, _, %rax, _
+  ; CHECK: %xmm0 = VMAXCSSrm                   %xmm0, %rip, 1, _, %rax, _
   %xmm0 = VMAXSSZrm                            %xmm0, %rip, 1, _, %rax, _                              
   ; CHECK: %xmm0 = VMAXSSrm_Int                %xmm0, %rip, 1, _, %rax, _
   %xmm0 = VMAXSSZrm_Int                        %xmm0, %rip, 1, _, %rax, _                              
-  ; CHECK: %xmm0 = VMAXSSrr                    %xmm0, %xmm1
+  ; CHECK: %xmm0 = VMAXCSSrr                   %xmm0, %xmm1
   %xmm0 = VMAXSSZrr                            %xmm0, %xmm1                                            
   ; CHECK: %xmm0 = VMAXSSrr_Int                %xmm0, %xmm1
   %xmm0 = VMAXSSZrr_Int                        %xmm0, %xmm1                                            
@@ -1802,19 +1834,19 @@ body: |
   %xmm0 = VMINCSSZrm                           %xmm0, %rip, 1, _, %rax, _                              
   ; CHECK: %xmm0 = VMINCSSrr                   %xmm0, %xmm1
   %xmm0 = VMINCSSZrr                           %xmm0, %xmm1                                            
-  ; CHECK: %xmm0 = VMINSDrm                    %xmm0, %rip, 1, _, %rax, _
+  ; CHECK: %xmm0 = VMINCSDrm                   %xmm0, %rip, 1, _, %rax, _
   %xmm0 = VMINSDZrm                            %xmm0, %rip, 1, _, %rax, _                              
   ; CHECK: %xmm0 = VMINSDrm_Int                %xmm0, %rip, 1, _, %rax, _
   %xmm0 = VMINSDZrm_Int                        %xmm0, %rip, 1, _, %rax, _                              
-  ; CHECK: %xmm0 = VMINSDrr                    %xmm0, %xmm1
+  ; CHECK: %xmm0 = VMINCSDrr                   %xmm0, %xmm1
   %xmm0 = VMINSDZrr                            %xmm0, %xmm1                                            
   ; CHECK: %xmm0 = VMINSDrr_Int                %xmm0, %xmm1
   %xmm0 = VMINSDZrr_Int                        %xmm0, %xmm1                                            
-  ; CHECK: %xmm0 = VMINSSrm                    %xmm0, %rip, 1, _, %rax, _
+  ; CHECK: %xmm0 = VMINCSSrm                   %xmm0, %rip, 1, _, %rax, _
   %xmm0 = VMINSSZrm                            %xmm0, %rip, 1, _, %rax, _                              
   ; CHECK: %xmm0 = VMINSSrm_Int                %xmm0, %rip, 1, _, %rax, _
   %xmm0 = VMINSSZrm_Int                        %xmm0, %rip, 1, _, %rax, _                              
-  ; CHECK: %xmm0 = VMINSSrr                    %xmm0, %xmm1
+  ; CHECK: %xmm0 = VMINCSSrr                   %xmm0, %xmm1
   %xmm0 = VMINSSZrr                            %xmm0, %xmm1                                            
   ; CHECK: %xmm0 = VMINSSrr_Int                %xmm0, %xmm1
   %xmm0 = VMINSSZrr_Int                        %xmm0, %xmm1                                            
@@ -2058,6 +2090,8 @@ body: |
   VPEXTRWZmr                                   %rdi, 1, _, 0, _,  %xmm0, 3                             
   ; CHECK: %eax = VPEXTRWri                    %xmm0, 1                                                     
   %eax = VPEXTRWZrr                            %xmm0, 1                                                    
+  ; CHECK: %eax = VPEXTRWrr_REV               %xmm0, 1      
+  %eax = VPEXTRWZrr_REV                        %xmm0, 1                                                     
   ; CHECK: %xmm0 = VPINSRBrm                   %xmm0, %rsi, 1, _, 0, _, 3      
   %xmm0 = VPINSRBZrm                           %xmm0, %rsi, 1, _, 0, _, 3                              
   ; CHECK: %xmm0 = VPINSRBrr                   %xmm0, %edi, 5      
@@ -2090,18 +2124,18 @@ body: |
   %xmm0 = VSQRTSSZr                            %xmm0, _                                                
   ; CHECK: %xmm0 = VSQRTSSr_Int                %xmm0, _                                              
   %xmm0 = VSQRTSSZr_Int                        %xmm0, _                                                
-  ; CHECK: %rdi = VCVTSD2SI64rm                %rdi, %xmm0, 1, _, 0
-  %rdi = VCVTSD2SI64Zrm                        %rdi, %xmm0, 1, _, 0                                    
   ; CHECK: %rdi = VCVTSD2SI64rr                %xmm0
   %rdi = VCVTSD2SI64Zrr                        %xmm0                                                   
-  ; CHECK: %edi = VCVTSD2SIrm                  %rdi, %xmm0, 1, _, 0
-  %edi = VCVTSD2SIZrm                          %rdi, %xmm0, 1, _, 0                                    
   ; CHECK: %edi = VCVTSD2SIrr                  %xmm0
   %edi = VCVTSD2SIZrr                          %xmm0                                                   
   ; CHECK: %xmm0 = VCVTSD2SSrm                 %xmm0, %rdi, 1, _, 0, _
   %xmm0 = VCVTSD2SSZrm                         %xmm0, %rdi, 1, _, 0, _                                 
+  ; CHECK: %xmm0 = Int_VCVTSD2SSrm             %xmm0, %rdi, 1, _, 0, _
+  %xmm0 = VCVTSD2SSZrm_Int                     %xmm0, %rdi, 1, _, 0, _                                 
   ; CHECK: %xmm0 = VCVTSD2SSrr                 %xmm0, _
   %xmm0 = VCVTSD2SSZrr                         %xmm0, _                                                
+  ; CHECK: %xmm0 = Int_VCVTSD2SSrr             %xmm0, _
+  %xmm0 = VCVTSD2SSZrr_Int                     %xmm0, _                                                
   ; CHECK: %xmm0 = VCVTSI2SDrm                 %xmm0, %rdi, 1, _, 0, _
   %xmm0 = VCVTSI2SDZrm                         %xmm0, %rdi, 1, _, 0, _                                 
   ; CHECK: %xmm0 = Int_VCVTSI2SDrm             %xmm0, %rdi, 1, _, 0, _
@@ -2118,10 +2152,30 @@ body: |
   %xmm0 = VCVTSI2SSZrr                         %xmm0, _                                                
   ; CHECK: %xmm0 = Int_VCVTSI2SSrr             %xmm0, _
   %xmm0 = VCVTSI2SSZrr_Int                     %xmm0, _                                                
+  ; CHECK: %xmm0 = VCVTSI2SD64rm               %xmm0, %rdi, 1, _, 0, _
+  %xmm0 = VCVTSI642SDZrm                       %xmm0, %rdi, 1, _, 0, _
+  ; CHECK: %xmm0 = Int_VCVTSI2SD64rm           %xmm0, %rdi, 1, _, 0, _
+  %xmm0 = VCVTSI642SDZrm_Int                   %xmm0, %rdi, 1, _, 0, _
+  ; CHECK: %xmm0 = VCVTSI2SD64rr               %xmm0, _
+  %xmm0 = VCVTSI642SDZrr                       %xmm0, _
+  ; CHECK: %xmm0 = Int_VCVTSI2SD64rr           %xmm0, _
+  %xmm0 = VCVTSI642SDZrr_Int                   %xmm0, _
+  ; CHECK: %xmm0 = VCVTSI2SS64rm               %xmm0, %rdi, 1, _, 0, _ 
+  %xmm0 = VCVTSI642SSZrm                       %xmm0, %rdi, 1, _, 0, _
+  ; CHECK: %xmm0 = Int_VCVTSI2SS64rm           %xmm0, %rdi, 1, _, 0, _
+  %xmm0 = VCVTSI642SSZrm_Int                   %xmm0, %rdi, 1, _, 0, _
+  ; CHECK: %xmm0 = VCVTSI2SS64rr               %xmm0, _ 
+  %xmm0 = VCVTSI642SSZrr                       %xmm0, _
+  ; CHECK: %xmm0 = Int_VCVTSI2SS64rr           %xmm0, _
+  %xmm0 = VCVTSI642SSZrr_Int                   %xmm0, _
   ; CHECK: %xmm0 = VCVTSS2SDrm                 %xmm0, %rdi, 1, _, 0, _
   %xmm0 = VCVTSS2SDZrm                         %xmm0, %rdi, 1, _, 0, _                                 
+  ; CHECK: %xmm0 = Int_VCVTSS2SDrm             %xmm0, %rdi, 1, _, 0, _
+  %xmm0 = VCVTSS2SDZrm_Int                     %xmm0, %rdi, 1, _, 0, _                                 
   ; CHECK: %xmm0 = VCVTSS2SDrr                 %xmm0, _
   %xmm0 = VCVTSS2SDZrr                         %xmm0, _                                                
+  ; CHECK: %xmm0 = Int_VCVTSS2SDrr             %xmm0, _
+  %xmm0 = VCVTSS2SDZrr_Int                     %xmm0, _                                                
   ; CHECK: %rdi = VCVTSS2SI64rm                %rdi, %xmm0, 1, _, 0
   %rdi = VCVTSS2SI64Zrm                        %rdi, %xmm0, 1, _, 0                                    
   ; CHECK: %rdi = VCVTSS2SI64rr                %xmm0
@@ -2174,6 +2228,12 @@ body: |
   %xmm0 = VMOVSDZrm                            %rip, _, _, _, _                                        
   ; CHECK: %xmm0 = VMOVSDrr                    %xmm0, _
   %xmm0 = VMOVSDZrr                            %xmm0, _                                                
+  ; CHECK: %xmm0 = VMOVSDrr_REV                %xmm0, _
+  %xmm0 = VMOVSDZrr_REV                        %xmm0, _                                                
+  ; CHECK: %rax = VMOVSDto64rr                 %xmm0
+  %rax = VMOVSDto64Zrr                         %xmm0
+  ; CHECK: VMOVSDto64mr                        %rdi, %xmm0, _, _, _, _
+  VMOVSDto64Zmr                                %rdi, %xmm0, _, _, _, _
   ; CHECK: VMOVSSmr                            %rdi, %xmm0, _, _, _, _
   VMOVSSZmr                                    %rdi, %xmm0, _, _, _, _                                 
   ; CHECK: %xmm0 = VMOVSSrm                    %rip, _, _, _, _
@@ -2182,8 +2242,14 @@ body: |
   %xmm0 = VMOVSSZrr                            %xmm0, _                                                
   ; CHECK: %xmm0 = VMOVSSrr_REV                %xmm0, _
   %xmm0 = VMOVSSZrr_REV                        %xmm0, _                                                
+  ; CHECK: VMOVSS2DImr                         %rdi, %xmm0, _, _, _, _
+  VMOVSS2DIZmr                                 %rdi, %xmm0, _, _, _, _
+  ; CHECK: %eax = VMOVSS2DIrr                  %xmm0
+  %eax = VMOVSS2DIZrr                          %xmm0
   ; CHECK: %xmm0 = VMOV64toPQIrr               %rdi
   %xmm0 = VMOV64toPQIZrr                       %rdi                                                    
+  ; CHECK: %xmm0 = VMOV64toPQIrm               %rdi, _, _, _, _
+  %xmm0 = VMOV64toPQIZrm                       %rdi, _, _, _, _
   ; CHECK: %xmm0 = VMOV64toSDrr                %rdi 
   %xmm0 = VMOV64toSDZrr                        %rdi                                                    
   ; CHECK: %xmm0 = VMOVDI2PDIrm                %rip, _, _, _, _
@@ -2197,11 +2263,15 @@ body: |
   ; CHECK: VMOVPDI2DImr                        %rdi, %xmm0, _, _, _, _
   VMOVPDI2DIZmr                                %rdi, %xmm0, _, _, _, _                                 
   ; CHECK: %edi = VMOVPDI2DIrr                 %xmm0
-  %edi = VMOVPDI2DIZrr                         %xmm0                                                   
+  %edi = VMOVPDI2DIZrr                         %xmm0
+  ; CHECK: %xmm0 = VMOVPQI2QIrr                %xmm0
+  %xmm0 = VMOVPQI2QIZrr                        %xmm0
   ; CHECK: VMOVPQI2QImr                        %rdi, %xmm0, _, _, _, _
   VMOVPQI2QIZmr                                %rdi, %xmm0, _, _, _, _                                 
   ; CHECK: %rdi = VMOVPQIto64rr                %xmm0
   %rdi = VMOVPQIto64Zrr                        %xmm0                                                   
+  ; CHECK: VMOVPQIto64mr                       %rdi, %xmm0, _, _, _, _
+  VMOVPQIto64Zmr                               %rdi, %xmm0, _, _, _, _
   ; CHECK: %xmm0 = VMOVQI2PQIrm                %rip, _, _, _, _
   %xmm0 = VMOVQI2PQIZrm                        %rip, _, _, _, _                                        
   ; CHECK: %xmm0 = VMOVZPQILo2PQIrr            %xmm0                                               
@@ -2238,6 +2308,14 @@ body: |
   VUCOMISSZrm                                  %xmm0, %rdi, _, _, _, _, implicit-def %eflags           
   ; CHECK: VUCOMISSrr                          %xmm0, %xmm1, implicit-def %eflags 
   VUCOMISSZrr                                  %xmm0, %xmm1, implicit-def %eflags 
+  ; CHECK: VEXTRACTPSmr                        %rdi, 1, _, 0, _, %xmm0, _
+  VEXTRACTPSZmr                                %rdi, 1, _, 0, _, %xmm0, _
+  ; CHECK: %eax = VEXTRACTPSrr                 %xmm0, _
+  %eax = VEXTRACTPSZrr                         %xmm0, _
+  ; CHECK: %xmm0 = VINSERTPSrm                 %xmm0, %rdi, _, _, _, _, _
+  %xmm0 = VINSERTPSZrm                         %xmm0, %rdi, _, _, _, _, _
+  ; CHECK: %xmm0 = VINSERTPSrr                 %xmm0, %xmm0, _ 
+  %xmm0 = VINSERTPSZrr                         %xmm0, %xmm0, _
 
   RET 0, %zmm0, %zmm1                          
 ...
@@ -2350,6 +2428,14 @@ body: |
   %ymm16 = VPANDQZ256rm                        %ymm16, %rip, 1, _, %rax, _                   
   ; CHECK: %ymm16 = VPANDQZ256rr               %ymm16, %ymm1
   %ymm16 = VPANDQZ256rr                        %ymm16, %ymm1                                 
+  ; CHECK: %ymm16 = VPANDNDZ256rm               %ymm16, %rip, 1, _, %rax, _
+  %ymm16 = VPANDNDZ256rm                        %ymm16, %rip, 1, _, %rax, _                   
+  ; CHECK: %ymm16 = VPANDNDZ256rr               %ymm16, %ymm1  
+  %ymm16 = VPANDNDZ256rr                        %ymm16, %ymm1                                 
+  ; CHECK: %ymm16 = VPANDNQZ256rm               %ymm16, %rip, 1, _, %rax, _
+  %ymm16 = VPANDNQZ256rm                        %ymm16, %rip, 1, _, %rax, _                   
+  ; CHECK: %ymm16 = VPANDNQZ256rr               %ymm16, %ymm1
+  %ymm16 = VPANDNQZ256rr                        %ymm16, %ymm1                                 
   ; CHECK: %ymm16 = VPAVGBZ256rm               %ymm16, %rip, 1, _, %rax, _
   %ymm16 = VPAVGBZ256rm                        %ymm16, %rip, 1, _, %rax, _                   
   ; CHECK: %ymm16 = VPAVGBZ256rr               %ymm16, %ymm1
@@ -2918,6 +3004,12 @@ body: |
   %ymm16 = VPMOVZXWQZ256rm                     %rip, 1, _, %rax, _                           
   ; CHECK: %ymm16 = VPMOVZXWQZ256rr            %xmm0    
   %ymm16 = VPMOVZXWQZ256rr                     %xmm0                                                 
+  ; CHECK: %ymm16 = VBROADCASTF32X2Z256m       %rip, 1, _, %rax, _
+  %ymm16 = VBROADCASTF32X2Z256m                %rip, 1, _, %rax, _
+  ; CHECK: %ymm16 = VBROADCASTF32X2Z256r       %xmm16
+  %ymm16 = VBROADCASTF32X2Z256r                %xmm16
+  ; CHECK: %ymm16 = VBROADCASTF32X4Z256rm      %rip, 1, _, %rax, _
+  %ymm16 = VBROADCASTF32X4Z256rm               %rip, 1, _, %rax, _
   ; CHECK: %ymm16 = VBROADCASTSDZ256m          %rip, 1, _, %rax, _
   %ymm16 = VBROADCASTSDZ256m                   %rip, 1, _, %rax, _                           
   ; CHECK: %ymm16 = VBROADCASTSDZ256r          %xmm0
@@ -2938,6 +3030,12 @@ body: |
   %ymm16 = VPBROADCASTWZ256m                   %rip, 1, _, %rax, _                           
   ; CHECK: %ymm16 = VPBROADCASTWZ256r          %xmm0
   %ymm16 = VPBROADCASTWZ256r                   %xmm0                                         
+  ; CHECK: %ymm16 = VBROADCASTI32X4Z256rm      %rip, 1, _, %rax, _
+  %ymm16 = VBROADCASTI32X4Z256rm               %rip, 1, _, %rax, _
+  ; CHECK: %ymm16 = VBROADCASTI32X2Z256m       %rip, 1, _, %rax, _
+  %ymm16 = VBROADCASTI32X2Z256m                %rip, 1, _, %rax, _
+  ; CHECK: %ymm16 = VBROADCASTI32X2Z256r       %xmm16
+  %ymm16 = VBROADCASTI32X2Z256r                %xmm16
   ; CHECK: %ymm16 = VPBROADCASTQZ256m          %rip, 1, _, %rax, _  
   %ymm16 = VPBROADCASTQZ256m                   %rip, 1, _, %rax, _                           
   ; CHECK: %ymm16 = VPBROADCASTQZ256r          %xmm0  
@@ -3350,6 +3448,14 @@ body: |
   %xmm16 = VPANDQZ128rm                        %xmm16, %rip, 1, _, %rax, _                               
   ; CHECK: %xmm16 = VPANDQZ128rr               %xmm16, %xmm1  
   %xmm16 = VPANDQZ128rr                        %xmm16, %xmm1                                             
+  ; CHECK: %xmm16 = VPANDNDZ128rm              %xmm16, %rip, 1, _, %rax, _
+  %xmm16 = VPANDNDZ128rm                       %xmm16, %rip, 1, _, %rax, _                               
+  ; CHECK: %xmm16 = VPANDNDZ128rr              %xmm16, %xmm1
+  %xmm16 = VPANDNDZ128rr                       %xmm16, %xmm1                                             
+  ; CHECK: %xmm16 = VPANDNQZ128rm              %xmm16, %rip, 1, _, %rax, _
+  %xmm16 = VPANDNQZ128rm                       %xmm16, %rip, 1, _, %rax, _                               
+  ; CHECK: %xmm16 = VPANDNQZ128rr              %xmm16, %xmm1  
+  %xmm16 = VPANDNQZ128rr                       %xmm16, %xmm1                                             
   ; CHECK: %xmm16 = VPAVGBZ128rm               %xmm16, %rip, 1, _, %rax, _
   %xmm16 = VPAVGBZ128rm                        %xmm16, %rip, 1, _, %rax, _                               
   ; CHECK: %xmm16 = VPAVGBZ128rr               %xmm16, %xmm1  
@@ -3938,6 +4044,10 @@ body: |
   %xmm16 = VPBROADCASTWZ128m                   %rip, _, _, _, _                                          
   ; CHECK: %xmm16 = VPBROADCASTWZ128r          %xmm16
   %xmm16 = VPBROADCASTWZ128r                   %xmm16                                                                                            
+  ; CHECK: %xmm16 = VBROADCASTI32X2Z128m       %rip, _, _, _, _
+  %xmm16 = VBROADCASTI32X2Z128m                %rip, _, _, _, _
+  ; CHECK: %xmm16 = VBROADCASTI32X2Z128r       %xmm0
+  %xmm16 = VBROADCASTI32X2Z128r                %xmm0
   ; CHECK: %xmm16 = VCVTPS2PHZ128rr            %xmm16, 2
   %xmm16 = VCVTPS2PHZ128rr                     %xmm16, 2                                                 
   ; CHECK: VCVTPS2PHZ128mr                     %rdi, %xmm16, 1, _, 0, _, _  
@@ -3958,6 +4068,14 @@ body: |
   %xmm16 = VPALIGNRZ128rmi                     %xmm16, _, _, _, _, _, _                                  
   ; CHECK: %xmm16 = VPALIGNRZ128rri            %xmm16, %xmm1, 15
   %xmm16 = VPALIGNRZ128rri                     %xmm16, %xmm1, 15
+  ; CHECK: VEXTRACTPSZmr                       %rdi, 1, _, 0, _, %xmm16, _
+  VEXTRACTPSZmr                                %rdi, 1, _, 0, _, %xmm16, _
+  ; CHECK: %eax = VEXTRACTPSZrr                %xmm16, _
+  %eax = VEXTRACTPSZrr                         %xmm16, _
+  ; CHECK: %xmm16 = VINSERTPSZrm               %xmm16, %rdi, _, _, _, _, _
+  %xmm16 = VINSERTPSZrm                        %xmm16, %rdi, _, _, _, _, _
+  ; CHECK: %xmm16 = VINSERTPSZrr               %xmm16, %xmm16, _ 
+  %xmm16 = VINSERTPSZrr                        %xmm16, %xmm16, _
     
       RET 0, %zmm0, %zmm1
 ...
@@ -4288,6 +4406,8 @@ body: |
   VPEXTRWZmr                                   %rdi, 1, _, 0, _,  %xmm16, 3                               
   ; CHECK: %eax = VPEXTRWZrr                   %xmm16, 1      
   %eax = VPEXTRWZrr                            %xmm16, 1                                                     
+  ; CHECK: %eax = VPEXTRWZrr_REV               %xmm16, 1      
+  %eax = VPEXTRWZrr_REV                        %xmm16, 1                                                     
   ; CHECK: %xmm16 = VPINSRBZrm                 %xmm16, %rsi, 1, _, 0, _, 3      
   %xmm16 = VPINSRBZrm                          %xmm16, %rsi, 1, _, 0, _, 3                                
   ; CHECK: %xmm16 = VPINSRBZrr                 %xmm16, %edi, 5      
@@ -4330,8 +4450,12 @@ body: |
   %edi = VCVTSD2SIZrr                          %xmm16                                                     
   ; CHECK: %xmm16 = VCVTSD2SSZrm               %xmm16, %rdi, 1, _, 0, _
   %xmm16 = VCVTSD2SSZrm                        %xmm16, %rdi, 1, _, 0, _                                   
+  ; CHECK: %xmm16 = VCVTSD2SSZrm_Int           %xmm16, %rdi, 1, _, 0, _
+  %xmm16 = VCVTSD2SSZrm_Int                    %xmm16, %rdi, 1, _, 0, _                                   
   ; CHECK: %xmm16 = VCVTSD2SSZrr               %xmm16, _
   %xmm16 = VCVTSD2SSZrr                        %xmm16, _                                                  
+  ; CHECK: %xmm16 = VCVTSD2SSZrr_Int           %xmm16, _
+  %xmm16 = VCVTSD2SSZrr_Int                    %xmm16, _                                                  
   ; CHECK: %xmm16 = VCVTSI2SDZrm               %xmm16, %rdi, 1, _, 0, _
   %xmm16 = VCVTSI2SDZrm                        %xmm16, %rdi, 1, _, 0, _                                   
   ; CHECK: %xmm16 = VCVTSI2SDZrm_Int           %xmm16, %rdi, 1, _, 0, _
@@ -4348,10 +4472,30 @@ body: |
   %xmm16 = VCVTSI2SSZrr                        %xmm16, _                                                  
   ; CHECK: %xmm16 = VCVTSI2SSZrr_Int           %xmm16, _
   %xmm16 = VCVTSI2SSZrr_Int                    %xmm16, _                                                  
+  ; CHECK: %xmm16 = VCVTSI642SDZrm             %xmm16, %rdi, 1, _, 0, _
+  %xmm16 = VCVTSI642SDZrm                      %xmm16, %rdi, 1, _, 0, _
+  ; CHECK: %xmm16 = VCVTSI642SDZrm_Int         %xmm16, %rdi, 1, _, 0, _
+  %xmm16 = VCVTSI642SDZrm_Int                  %xmm16, %rdi, 1, _, 0, _
+  ; CHECK: %xmm16 = VCVTSI642SDZrr             %xmm16, _
+  %xmm16 = VCVTSI642SDZrr                      %xmm16, _
+  ; CHECK: %xmm16 = VCVTSI642SDZrr_Int         %xmm16, _
+  %xmm16 = VCVTSI642SDZrr_Int                  %xmm16, _
+  ; CHECK: %xmm16 = VCVTSI642SSZrm             %xmm16, %rdi, 1, _, 0, _ 
+  %xmm16 = VCVTSI642SSZrm                      %xmm16, %rdi, 1, _, 0, _
+  ; CHECK: %xmm16 = VCVTSI642SSZrm_Int         %xmm16, %rdi, 1, _, 0, _
+  %xmm16 = VCVTSI642SSZrm_Int                  %xmm16, %rdi, 1, _, 0, _
+  ; CHECK: %xmm16 = VCVTSI642SSZrr             %xmm16, _ 
+  %xmm16 = VCVTSI642SSZrr                      %xmm16, _
+  ; CHECK: %xmm16 = VCVTSI642SSZrr_Int         %xmm16, _
+  %xmm16 = VCVTSI642SSZrr_Int                  %xmm16, _
   ; CHECK: %xmm16 = VCVTSS2SDZrm               %xmm16, %rdi, 1, _, 0, _
   %xmm16 = VCVTSS2SDZrm                        %xmm16, %rdi, 1, _, 0, _                                   
+  ; CHECK: %xmm16 = VCVTSS2SDZrm_Int           %xmm16, %rdi, 1, _, 0, _
+  %xmm16 = VCVTSS2SDZrm_Int                    %xmm16, %rdi, 1, _, 0, _                                   
   ; CHECK: %xmm16 = VCVTSS2SDZrr               %xmm16, _
   %xmm16 = VCVTSS2SDZrr                        %xmm16, _                                                  
+  ; CHECK: %xmm16 = VCVTSS2SDZrr_Int           %xmm16, _
+  %xmm16 = VCVTSS2SDZrr_Int                    %xmm16, _                                                  
   ; CHECK: %rdi = VCVTSS2SI64Zrm               %rdi, %xmm16, 1, _, 0
   %rdi = VCVTSS2SI64Zrm                        %rdi, %xmm16, 1, _, 0                                      
   ; CHECK: %rdi = VCVTSS2SI64Zrr               %xmm16
@@ -4404,6 +4548,12 @@ body: |
   %xmm16 = VMOVSDZrm                           %rip, _, _, _, _                                           
   ; CHECK: %xmm16 = VMOVSDZrr                  %xmm16, _
   %xmm16 = VMOVSDZrr                           %xmm16, _                                                  
+  ; CHECK: %xmm16 = VMOVSDZrr_REV              %xmm16, _
+  %xmm16 = VMOVSDZrr_REV                       %xmm16, _                                                
+  ; CHECK: %rax = VMOVSDto64Zrr                %xmm16
+  %rax = VMOVSDto64Zrr                         %xmm16
+  ; CHECK: VMOVSDto64Zmr                       %rdi, %xmm16, _, _, _, _
+  VMOVSDto64Zmr                                %rdi, %xmm16, _, _, _, _
   ; CHECK: VMOVSSZmr                           %rdi, %xmm16, _, _, _, _
   VMOVSSZmr                                    %rdi, %xmm16, _, _, _, _                                   
   ; CHECK: %xmm16 = VMOVSSZrm                  %rip, _, _, _, _
@@ -4412,8 +4562,14 @@ body: |
   %xmm16 = VMOVSSZrr                           %xmm16, _                                                  
   ; CHECK: %xmm16 = VMOVSSZrr_REV              %xmm16, _
   %xmm16 = VMOVSSZrr_REV                       %xmm16, _                                                  
+  ; CHECK: VMOVSS2DIZmr                        %rdi, %xmm16, _, _, _, _
+  VMOVSS2DIZmr                                 %rdi, %xmm16, _, _, _, _
+  ; CHECK: %eax = VMOVSS2DIZrr                 %xmm16
+  %eax = VMOVSS2DIZrr                          %xmm16
   ; CHECK: %xmm16 = VMOV64toPQIZrr             %rdi
   %xmm16 = VMOV64toPQIZrr                      %rdi                                                       
+  ; CHECK: %xmm16 = VMOV64toPQIZrm             %rdi, _, _, _, _
+  %xmm16 = VMOV64toPQIZrm                      %rdi, _, _, _, _
   ; CHECK: %xmm16 = VMOV64toSDZrr              %rdi 
   %xmm16 = VMOV64toSDZrr                       %rdi                                                       
   ; CHECK: %xmm16 = VMOVDI2PDIZrm              %rip, _, _, _, _
@@ -4428,10 +4584,14 @@ body: |
   VMOVPDI2DIZmr                                %rdi, %xmm16, _, _, _, _                                   
   ; CHECK: %edi = VMOVPDI2DIZrr                %xmm16
   %edi = VMOVPDI2DIZrr                         %xmm16                                                     
+  ; CHECK: %xmm16 = VMOVPQI2QIZrr              %xmm16
+  %xmm16 = VMOVPQI2QIZrr                       %xmm16
   ; CHECK: VMOVPQI2QIZmr                       %rdi, %xmm16, _, _, _, _
   VMOVPQI2QIZmr                                %rdi, %xmm16, _, _, _, _                                   
   ; CHECK: %rdi = VMOVPQIto64Zrr               %xmm16
   %rdi = VMOVPQIto64Zrr                        %xmm16                                                     
+  ; CHECK: VMOVPQIto64Zmr                      %rdi, %xmm16, _, _, _, _
+  VMOVPQIto64Zmr                               %rdi, %xmm16, _, _, _, _
   ; CHECK: %xmm16 = VMOVQI2PQIZrm              %rip, _, _, _, _
   %xmm16 = VMOVQI2PQIZrm                       %rip, _, _, _, _                                           
   ; CHECK: %xmm16 = VMOVZPQILo2PQIZrr          %xmm16
diff --git a/test/CodeGen/X86/extract-store.ll b/test/CodeGen/X86/extract-store.ll
index f0e4d1407728b5959c5ded189efb73efab421f82..1751f03731d3aaa21b2d42e64cf0bfad359744a4 100644
--- a/test/CodeGen/X86/extract-store.ll
+++ b/test/CodeGen/X86/extract-store.ll
@@ -1,116 +1,537 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=AVX
-
-define void @extract_i8_0(i8* nocapture %dst, <16 x i8> %foo) {
-; SSE2-LABEL: extract_i8_0:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    movb %al, (%rdi)
-; SSE2-NEXT:    retq
-;
-; SSE41-LABEL: extract_i8_0:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    pextrb $0, %xmm0, (%rdi)
-; SSE41-NEXT:    retq
-;
-; AVX-LABEL: extract_i8_0:
-; AVX:       # BB#0:
-; AVX-NEXT:    vpextrb $0, %xmm0, (%rdi)
-; AVX-NEXT:    retq
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2     | FileCheck %s --check-prefix=X32 --check-prefix=SSE-X32 --check-prefix=SSE2-X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2   | FileCheck %s --check-prefix=X64 --check-prefix=SSE-X64 --check-prefix=SSE2-X64
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse4.1   | FileCheck %s --check-prefix=X32 --check-prefix=SSE-X32 --check-prefix=SSE41-X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64 --check-prefix=SSE-X64 --check-prefix=SSE41-X64
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx      | FileCheck %s --check-prefix=X32 --check-prefix=AVX-X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx    | FileCheck %s --check-prefix=X64 --check-prefix=AVX-X64
+
+define void @extract_i8_0(i8* nocapture %dst, <16 x i8> %foo) nounwind {
+; SSE2-X32-LABEL: extract_i8_0:
+; SSE2-X32:       # BB#0:
+; SSE2-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE2-X32-NEXT:    movd %xmm0, %ecx
+; SSE2-X32-NEXT:    movb %cl, (%eax)
+; SSE2-X32-NEXT:    retl
+;
+; SSE2-X64-LABEL: extract_i8_0:
+; SSE2-X64:       # BB#0:
+; SSE2-X64-NEXT:    movd %xmm0, %eax
+; SSE2-X64-NEXT:    movb %al, (%rdi)
+; SSE2-X64-NEXT:    retq
+;
+; SSE41-X32-LABEL: extract_i8_0:
+; SSE41-X32:       # BB#0:
+; SSE41-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE41-X32-NEXT:    pextrb $0, %xmm0, (%eax)
+; SSE41-X32-NEXT:    retl
+;
+; SSE41-X64-LABEL: extract_i8_0:
+; SSE41-X64:       # BB#0:
+; SSE41-X64-NEXT:    pextrb $0, %xmm0, (%rdi)
+; SSE41-X64-NEXT:    retq
+;
+; AVX-X32-LABEL: extract_i8_0:
+; AVX-X32:       # BB#0:
+; AVX-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT:    vpextrb $0, %xmm0, (%eax)
+; AVX-X32-NEXT:    retl
+;
+; AVX-X64-LABEL: extract_i8_0:
+; AVX-X64:       # BB#0:
+; AVX-X64-NEXT:    vpextrb $0, %xmm0, (%rdi)
+; AVX-X64-NEXT:    retq
   %vecext = extractelement <16 x i8> %foo, i32 0
   store i8 %vecext, i8* %dst, align 1
   ret void
 }
 
-define void @extract_i8_15(i8* nocapture %dst, <16 x i8> %foo) {
-; SSE2-LABEL: extract_i8_15:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    movb %al, (%rdi)
-; SSE2-NEXT:    retq
+define void @extract_i8_3(i8* nocapture %dst, <16 x i8> %foo) nounwind {
+; SSE2-X32-LABEL: extract_i8_3:
+; SSE2-X32:       # BB#0:
+; SSE2-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE2-X32-NEXT:    movd %xmm0, %ecx
+; SSE2-X32-NEXT:    shrl $24, %ecx
+; SSE2-X32-NEXT:    movb %cl, (%eax)
+; SSE2-X32-NEXT:    retl
+;
+; SSE2-X64-LABEL: extract_i8_3:
+; SSE2-X64:       # BB#0:
+; SSE2-X64-NEXT:    movd %xmm0, %eax
+; SSE2-X64-NEXT:    shrl $24, %eax
+; SSE2-X64-NEXT:    movb %al, (%rdi)
+; SSE2-X64-NEXT:    retq
+;
+; SSE41-X32-LABEL: extract_i8_3:
+; SSE41-X32:       # BB#0:
+; SSE41-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE41-X32-NEXT:    pextrb $3, %xmm0, (%eax)
+; SSE41-X32-NEXT:    retl
+;
+; SSE41-X64-LABEL: extract_i8_3:
+; SSE41-X64:       # BB#0:
+; SSE41-X64-NEXT:    pextrb $3, %xmm0, (%rdi)
+; SSE41-X64-NEXT:    retq
+;
+; AVX-X32-LABEL: extract_i8_3:
+; AVX-X32:       # BB#0:
+; AVX-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT:    vpextrb $3, %xmm0, (%eax)
+; AVX-X32-NEXT:    retl
+;
+; AVX-X64-LABEL: extract_i8_3:
+; AVX-X64:       # BB#0:
+; AVX-X64-NEXT:    vpextrb $3, %xmm0, (%rdi)
+; AVX-X64-NEXT:    retq
+  %vecext = extractelement <16 x i8> %foo, i32 3
+  store i8 %vecext, i8* %dst, align 1
+  ret void
+}
+
+define void @extract_i8_15(i8* nocapture %dst, <16 x i8> %foo) nounwind {
+; SSE2-X32-LABEL: extract_i8_15:
+; SSE2-X32:       # BB#0:
+; SSE2-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE2-X32-NEXT:    pextrw $7, %xmm0, %ecx
+; SSE2-X32-NEXT:    movb %ch, (%eax)
+; SSE2-X32-NEXT:    retl
+;
+; SSE2-X64-LABEL: extract_i8_15:
+; SSE2-X64:       # BB#0:
+; SSE2-X64-NEXT:    pextrw $7, %xmm0, %eax
+; SSE2-X64-NEXT:    movb %ah, (%rdi) # NOREX
+; SSE2-X64-NEXT:    retq
+;
+; SSE41-X32-LABEL: extract_i8_15:
+; SSE41-X32:       # BB#0:
+; SSE41-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE41-X32-NEXT:    pextrb $15, %xmm0, (%eax)
+; SSE41-X32-NEXT:    retl
+;
+; SSE41-X64-LABEL: extract_i8_15:
+; SSE41-X64:       # BB#0:
+; SSE41-X64-NEXT:    pextrb $15, %xmm0, (%rdi)
+; SSE41-X64-NEXT:    retq
 ;
-; SSE41-LABEL: extract_i8_15:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    pextrb $15, %xmm0, (%rdi)
-; SSE41-NEXT:    retq
+; AVX-X32-LABEL: extract_i8_15:
+; AVX-X32:       # BB#0:
+; AVX-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT:    vpextrb $15, %xmm0, (%eax)
+; AVX-X32-NEXT:    retl
 ;
-; AVX-LABEL: extract_i8_15:
-; AVX:       # BB#0:
-; AVX-NEXT:    vpextrb $15, %xmm0, (%rdi)
-; AVX-NEXT:    retq
+; AVX-X64-LABEL: extract_i8_15:
+; AVX-X64:       # BB#0:
+; AVX-X64-NEXT:    vpextrb $15, %xmm0, (%rdi)
+; AVX-X64-NEXT:    retq
   %vecext = extractelement <16 x i8> %foo, i32 15
   store i8 %vecext, i8* %dst, align 1
   ret void
 }
 
-define void @extract_i16_0(i16* nocapture %dst, <8 x i16> %foo) {
-; SSE2-LABEL: extract_i16_0:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    movw %ax, (%rdi)
-; SSE2-NEXT:    retq
+define void @extract_i16_0(i16* nocapture %dst, <8 x i16> %foo) nounwind {
+; SSE2-X32-LABEL: extract_i16_0:
+; SSE2-X32:       # BB#0:
+; SSE2-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE2-X32-NEXT:    movd %xmm0, %ecx
+; SSE2-X32-NEXT:    movw %cx, (%eax)
+; SSE2-X32-NEXT:    retl
 ;
-; SSE41-LABEL: extract_i16_0:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    pextrw $0, %xmm0, (%rdi)
-; SSE41-NEXT:    retq
+; SSE2-X64-LABEL: extract_i16_0:
+; SSE2-X64:       # BB#0:
+; SSE2-X64-NEXT:    movd %xmm0, %eax
+; SSE2-X64-NEXT:    movw %ax, (%rdi)
+; SSE2-X64-NEXT:    retq
 ;
-; AVX-LABEL: extract_i16_0:
-; AVX:       # BB#0:
-; AVX-NEXT:    vpextrw $0, %xmm0, (%rdi)
-; AVX-NEXT:    retq
+; SSE41-X32-LABEL: extract_i16_0:
+; SSE41-X32:       # BB#0:
+; SSE41-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE41-X32-NEXT:    pextrw $0, %xmm0, (%eax)
+; SSE41-X32-NEXT:    retl
+;
+; SSE41-X64-LABEL: extract_i16_0:
+; SSE41-X64:       # BB#0:
+; SSE41-X64-NEXT:    pextrw $0, %xmm0, (%rdi)
+; SSE41-X64-NEXT:    retq
+;
+; AVX-X32-LABEL: extract_i16_0:
+; AVX-X32:       # BB#0:
+; AVX-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT:    vpextrw $0, %xmm0, (%eax)
+; AVX-X32-NEXT:    retl
+;
+; AVX-X64-LABEL: extract_i16_0:
+; AVX-X64:       # BB#0:
+; AVX-X64-NEXT:    vpextrw $0, %xmm0, (%rdi)
+; AVX-X64-NEXT:    retq
   %vecext = extractelement <8 x i16> %foo, i32 0
   store i16 %vecext, i16* %dst, align 1
   ret void
 }
 
-define void @extract_i16_7(i16* nocapture %dst, <8 x i16> %foo) {
-; SSE2-LABEL: extract_i16_7:
-; SSE2:       # BB#0:
-; SSE2-NEXT:    pextrw $7, %xmm0, %eax
-; SSE2-NEXT:    movw %ax, (%rdi)
-; SSE2-NEXT:    retq
+define void @extract_i16_7(i16* nocapture %dst, <8 x i16> %foo) nounwind {
+; SSE2-X32-LABEL: extract_i16_7:
+; SSE2-X32:       # BB#0:
+; SSE2-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE2-X32-NEXT:    pextrw $7, %xmm0, %ecx
+; SSE2-X32-NEXT:    movw %cx, (%eax)
+; SSE2-X32-NEXT:    retl
+;
+; SSE2-X64-LABEL: extract_i16_7:
+; SSE2-X64:       # BB#0:
+; SSE2-X64-NEXT:    pextrw $7, %xmm0, %eax
+; SSE2-X64-NEXT:    movw %ax, (%rdi)
+; SSE2-X64-NEXT:    retq
+;
+; SSE41-X32-LABEL: extract_i16_7:
+; SSE41-X32:       # BB#0:
+; SSE41-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE41-X32-NEXT:    pextrw $7, %xmm0, (%eax)
+; SSE41-X32-NEXT:    retl
+;
+; SSE41-X64-LABEL: extract_i16_7:
+; SSE41-X64:       # BB#0:
+; SSE41-X64-NEXT:    pextrw $7, %xmm0, (%rdi)
+; SSE41-X64-NEXT:    retq
 ;
-; SSE41-LABEL: extract_i16_7:
-; SSE41:       # BB#0:
-; SSE41-NEXT:    pextrw $7, %xmm0, (%rdi)
-; SSE41-NEXT:    retq
+; AVX-X32-LABEL: extract_i16_7:
+; AVX-X32:       # BB#0:
+; AVX-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT:    vpextrw $7, %xmm0, (%eax)
+; AVX-X32-NEXT:    retl
 ;
-; AVX-LABEL: extract_i16_7:
-; AVX:       # BB#0:
-; AVX-NEXT:    vpextrw $7, %xmm0, (%rdi)
-; AVX-NEXT:    retq
+; AVX-X64-LABEL: extract_i16_7:
+; AVX-X64:       # BB#0:
+; AVX-X64-NEXT:    vpextrw $7, %xmm0, (%rdi)
+; AVX-X64-NEXT:    retq
   %vecext = extractelement <8 x i16> %foo, i32 7
   store i16 %vecext, i16* %dst, align 1
   ret void
 }
 
-define void @extract_i8_undef(i8* nocapture %dst, <16 x i8> %foo) {
-; SSE-LABEL: extract_i8_undef:
-; SSE:       # BB#0:
-; SSE-NEXT:    retq
+define void @extract_i32_0(i32* nocapture %dst, <4 x i32> %foo) nounwind {
+; SSE-X32-LABEL: extract_i32_0:
+; SSE-X32:       # BB#0:
+; SSE-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-X32-NEXT:    movss %xmm0, (%eax)
+; SSE-X32-NEXT:    retl
+;
+; SSE-X64-LABEL: extract_i32_0:
+; SSE-X64:       # BB#0:
+; SSE-X64-NEXT:    movss %xmm0, (%rdi)
+; SSE-X64-NEXT:    retq
+;
+; AVX-X32-LABEL: extract_i32_0:
+; AVX-X32:       # BB#0:
+; AVX-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT:    vmovss %xmm0, (%eax)
+; AVX-X32-NEXT:    retl
+;
+; AVX-X64-LABEL: extract_i32_0:
+; AVX-X64:       # BB#0:
+; AVX-X64-NEXT:    vmovss %xmm0, (%rdi)
+; AVX-X64-NEXT:    retq
+  %vecext = extractelement <4 x i32> %foo, i32 0
+  store i32 %vecext, i32* %dst, align 1
+  ret void
+}
+
+define void @extract_i32_3(i32* nocapture %dst, <4 x i32> %foo) nounwind {
+; SSE2-X32-LABEL: extract_i32_3:
+; SSE2-X32:       # BB#0:
+; SSE2-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE2-X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-X32-NEXT:    movd %xmm0, (%eax)
+; SSE2-X32-NEXT:    retl
+;
+; SSE2-X64-LABEL: extract_i32_3:
+; SSE2-X64:       # BB#0:
+; SSE2-X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-X64-NEXT:    movd %xmm0, (%rdi)
+; SSE2-X64-NEXT:    retq
+;
+; SSE41-X32-LABEL: extract_i32_3:
+; SSE41-X32:       # BB#0:
+; SSE41-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE41-X32-NEXT:    pextrd $3, %xmm0, (%eax)
+; SSE41-X32-NEXT:    retl
+;
+; SSE41-X64-LABEL: extract_i32_3:
+; SSE41-X64:       # BB#0:
+; SSE41-X64-NEXT:    pextrd $3, %xmm0, (%rdi)
+; SSE41-X64-NEXT:    retq
+;
+; AVX-X32-LABEL: extract_i32_3:
+; AVX-X32:       # BB#0:
+; AVX-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT:    vpextrd $3, %xmm0, (%eax)
+; AVX-X32-NEXT:    retl
+;
+; AVX-X64-LABEL: extract_i32_3:
+; AVX-X64:       # BB#0:
+; AVX-X64-NEXT:    vpextrd $3, %xmm0, (%rdi)
+; AVX-X64-NEXT:    retq
+  %vecext = extractelement <4 x i32> %foo, i32 3
+  store i32 %vecext, i32* %dst, align 1
+  ret void
+}
+
+define void @extract_i64_0(i64* nocapture %dst, <2 x i64> %foo) nounwind {
+; SSE-X32-LABEL: extract_i64_0:
+; SSE-X32:       # BB#0:
+; SSE-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-X32-NEXT:    movlps %xmm0, (%eax)
+; SSE-X32-NEXT:    retl
+;
+; SSE-X64-LABEL: extract_i64_0:
+; SSE-X64:       # BB#0:
+; SSE-X64-NEXT:    movlps %xmm0, (%rdi)
+; SSE-X64-NEXT:    retq
+;
+; AVX-X32-LABEL: extract_i64_0:
+; AVX-X32:       # BB#0:
+; AVX-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT:    vmovlps %xmm0, (%eax)
+; AVX-X32-NEXT:    retl
+;
+; AVX-X64-LABEL: extract_i64_0:
+; AVX-X64:       # BB#0:
+; AVX-X64-NEXT:    vmovlps %xmm0, (%rdi)
+; AVX-X64-NEXT:    retq
+  %vecext = extractelement <2 x i64> %foo, i32 0
+  store i64 %vecext, i64* %dst, align 1
+  ret void
+}
+
+define void @extract_i64_1(i64* nocapture %dst, <2 x i64> %foo) nounwind {
+; SSE-X32-LABEL: extract_i64_1:
+; SSE-X32:       # BB#0:
+; SSE-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-X32-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; SSE-X32-NEXT:    movq %xmm0, (%eax)
+; SSE-X32-NEXT:    retl
+;
+; SSE2-X64-LABEL: extract_i64_1:
+; SSE2-X64:       # BB#0:
+; SSE2-X64-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE2-X64-NEXT:    movq %xmm0, (%rdi)
+; SSE2-X64-NEXT:    retq
+;
+; SSE41-X64-LABEL: extract_i64_1:
+; SSE41-X64:       # BB#0:
+; SSE41-X64-NEXT:    pextrq $1, %xmm0, (%rdi)
+; SSE41-X64-NEXT:    retq
+;
+; AVX-X32-LABEL: extract_i64_1:
+; AVX-X32:       # BB#0:
+; AVX-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX-X32-NEXT:    vmovq %xmm0, (%eax)
+; AVX-X32-NEXT:    retl
 ;
-; AVX-LABEL: extract_i8_undef:
-; AVX:       # BB#0:
-; AVX-NEXT:    retq
+; AVX-X64-LABEL: extract_i64_1:
+; AVX-X64:       # BB#0:
+; AVX-X64-NEXT:    vpextrq $1, %xmm0, (%rdi)
+; AVX-X64-NEXT:    retq
+  %vecext = extractelement <2 x i64> %foo, i32 1
+  store i64 %vecext, i64* %dst, align 1
+  ret void
+}
+
+define void @extract_f32_0(float* nocapture %dst, <4 x float> %foo) nounwind {
+; SSE-X32-LABEL: extract_f32_0:
+; SSE-X32:       # BB#0:
+; SSE-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-X32-NEXT:    movss %xmm0, (%eax)
+; SSE-X32-NEXT:    retl
+;
+; SSE-X64-LABEL: extract_f32_0:
+; SSE-X64:       # BB#0:
+; SSE-X64-NEXT:    movss %xmm0, (%rdi)
+; SSE-X64-NEXT:    retq
+;
+; AVX-X32-LABEL: extract_f32_0:
+; AVX-X32:       # BB#0:
+; AVX-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT:    vmovss %xmm0, (%eax)
+; AVX-X32-NEXT:    retl
+;
+; AVX-X64-LABEL: extract_f32_0:
+; AVX-X64:       # BB#0:
+; AVX-X64-NEXT:    vmovss %xmm0, (%rdi)
+; AVX-X64-NEXT:    retq
+  %vecext = extractelement <4 x float> %foo, i32 0
+  store float %vecext, float* %dst, align 1
+  ret void
+}
+
+define void @extract_f32_3(float* nocapture %dst, <4 x float> %foo) nounwind {
+; SSE2-X32-LABEL: extract_f32_3:
+; SSE2-X32:       # BB#0:
+; SSE2-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE2-X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-X32-NEXT:    movss %xmm0, (%eax)
+; SSE2-X32-NEXT:    retl
+;
+; SSE2-X64-LABEL: extract_f32_3:
+; SSE2-X64:       # BB#0:
+; SSE2-X64-NEXT:    shufps {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE2-X64-NEXT:    movss %xmm0, (%rdi)
+; SSE2-X64-NEXT:    retq
+;
+; SSE41-X32-LABEL: extract_f32_3:
+; SSE41-X32:       # BB#0:
+; SSE41-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE41-X32-NEXT:    extractps $3, %xmm0, (%eax)
+; SSE41-X32-NEXT:    retl
+;
+; SSE41-X64-LABEL: extract_f32_3:
+; SSE41-X64:       # BB#0:
+; SSE41-X64-NEXT:    extractps $3, %xmm0, (%rdi)
+; SSE41-X64-NEXT:    retq
+;
+; AVX-X32-LABEL: extract_f32_3:
+; AVX-X32:       # BB#0:
+; AVX-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT:    vextractps $3, %xmm0, (%eax)
+; AVX-X32-NEXT:    retl
+;
+; AVX-X64-LABEL: extract_f32_3:
+; AVX-X64:       # BB#0:
+; AVX-X64-NEXT:    vextractps $3, %xmm0, (%rdi)
+; AVX-X64-NEXT:    retq
+  %vecext = extractelement <4 x float> %foo, i32 3
+  store float %vecext, float* %dst, align 1
+  ret void
+}
+
+define void @extract_f64_0(double* nocapture %dst, <2 x double> %foo) nounwind {
+; SSE-X32-LABEL: extract_f64_0:
+; SSE-X32:       # BB#0:
+; SSE-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-X32-NEXT:    movlps %xmm0, (%eax)
+; SSE-X32-NEXT:    retl
+;
+; SSE-X64-LABEL: extract_f64_0:
+; SSE-X64:       # BB#0:
+; SSE-X64-NEXT:    movlps %xmm0, (%rdi)
+; SSE-X64-NEXT:    retq
+;
+; AVX-X32-LABEL: extract_f64_0:
+; AVX-X32:       # BB#0:
+; AVX-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT:    vmovlps %xmm0, (%eax)
+; AVX-X32-NEXT:    retl
+;
+; AVX-X64-LABEL: extract_f64_0:
+; AVX-X64:       # BB#0:
+; AVX-X64-NEXT:    vmovlps %xmm0, (%rdi)
+; AVX-X64-NEXT:    retq
+  %vecext = extractelement <2 x double> %foo, i32 0
+  store double %vecext, double* %dst, align 1
+  ret void
+}
+
+define void @extract_f64_1(double* nocapture %dst, <2 x double> %foo) nounwind {
+; SSE-X32-LABEL: extract_f64_1:
+; SSE-X32:       # BB#0:
+; SSE-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; SSE-X32-NEXT:    movhpd %xmm0, (%eax)
+; SSE-X32-NEXT:    retl
+;
+; SSE-X64-LABEL: extract_f64_1:
+; SSE-X64:       # BB#0:
+; SSE-X64-NEXT:    movhpd %xmm0, (%rdi)
+; SSE-X64-NEXT:    retq
+;
+; AVX-X32-LABEL: extract_f64_1:
+; AVX-X32:       # BB#0:
+; AVX-X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT:    vmovhpd %xmm0, (%eax)
+; AVX-X32-NEXT:    retl
+;
+; AVX-X64-LABEL: extract_f64_1:
+; AVX-X64:       # BB#0:
+; AVX-X64-NEXT:    vmovhpd %xmm0, (%rdi)
+; AVX-X64-NEXT:    retq
+  %vecext = extractelement <2 x double> %foo, i32 1
+  store double %vecext, double* %dst, align 1
+  ret void
+}
+
+define void @extract_i8_undef(i8* nocapture %dst, <16 x i8> %foo) nounwind {
+; X32-LABEL: extract_i8_undef:
+; X32:       # BB#0:
+; X32-NEXT:    retl
+;
+; X64-LABEL: extract_i8_undef:
+; X64:       # BB#0:
+; X64-NEXT:    retq
   %vecext = extractelement <16 x i8> %foo, i32 16 ; undef
   store i8 %vecext, i8* %dst, align 1
   ret void
 }
 
-define void @extract_i16_undef(i16* nocapture %dst, <8 x i16> %foo) {
-; SSE-LABEL: extract_i16_undef:
-; SSE:       # BB#0:
-; SSE-NEXT:    retq
+define void @extract_i16_undef(i16* nocapture %dst, <8 x i16> %foo) nounwind {
+; X32-LABEL: extract_i16_undef:
+; X32:       # BB#0:
+; X32-NEXT:    retl
 ;
-; AVX-LABEL: extract_i16_undef:
-; AVX:       # BB#0:
-; AVX-NEXT:    retq
+; X64-LABEL: extract_i16_undef:
+; X64:       # BB#0:
+; X64-NEXT:    retq
   %vecext = extractelement <8 x i16> %foo, i32 9 ; undef
   store i16 %vecext, i16* %dst, align 1
   ret void
 }
+
+define void @extract_i32_undef(i32* nocapture %dst, <4 x i32> %foo) nounwind {
+; X32-LABEL: extract_i32_undef:
+; X32:       # BB#0:
+; X32-NEXT:    retl
+;
+; X64-LABEL: extract_i32_undef:
+; X64:       # BB#0:
+; X64-NEXT:    retq
+  %vecext = extractelement <4 x i32> %foo, i32 6 ; undef
+  store i32 %vecext, i32* %dst, align 1
+  ret void
+}
+
+define void @extract_i64_undef(i64* nocapture %dst, <2 x i64> %foo) nounwind {
+; X32-LABEL: extract_i64_undef:
+; X32:       # BB#0:
+; X32-NEXT:    retl
+;
+; X64-LABEL: extract_i64_undef:
+; X64:       # BB#0:
+; X64-NEXT:    retq
+  %vecext = extractelement <2 x i64> %foo, i32 2 ; undef
+  store i64 %vecext, i64* %dst, align 1
+  ret void
+}
+
+define void @extract_f32_undef(float* nocapture %dst, <4 x float> %foo) nounwind {
+; X32-LABEL: extract_f32_undef:
+; X32:       # BB#0:
+; X32-NEXT:    retl
+;
+; X64-LABEL: extract_f32_undef:
+; X64:       # BB#0:
+; X64-NEXT:    retq
+  %vecext = extractelement <4 x float> %foo, i32 6 ; undef
+  store float %vecext, float* %dst, align 1
+  ret void
+}
+
+define void @extract_f64_undef(double* nocapture %dst, <2 x double> %foo) nounwind {
+; X32-LABEL: extract_f64_undef:
+; X32:       # BB#0:
+; X32-NEXT:    retl
+;
+; X64-LABEL: extract_f64_undef:
+; X64:       # BB#0:
+; X64-NEXT:    retq
+  %vecext = extractelement <2 x double> %foo, i32 2 ; undef
+  store double %vecext, double* %dst, align 1
+  ret void
+}
diff --git a/test/CodeGen/X86/extractelement-index.ll b/test/CodeGen/X86/extractelement-index.ll
index 8c12e7148aa7f91728a9995b93ef0c538367b507..e36e33ffe66b7446adf4ee59e05243ca32fd6e1e 100644
--- a/test/CodeGen/X86/extractelement-index.ll
+++ b/test/CodeGen/X86/extractelement-index.ll
@@ -11,8 +11,9 @@
 define i8 @extractelement_v16i8_1(<16 x i8> %a) nounwind {
 ; SSE2-LABEL: extractelement_v16i8_1:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    shrl $8, %eax
+; SSE2-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: extractelement_v16i8_1:
@@ -33,8 +34,9 @@ define i8 @extractelement_v16i8_1(<16 x i8> %a) nounwind {
 define i8 @extractelement_v16i8_11(<16 x i8> %a) nounwind {
 ; SSE2-LABEL: extractelement_v16i8_11:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    pextrw $5, %xmm0, %eax
+; SSE2-NEXT:    shrl $8, %eax
+; SSE2-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: extractelement_v16i8_11:
@@ -55,8 +57,8 @@ define i8 @extractelement_v16i8_11(<16 x i8> %a) nounwind {
 define i8 @extractelement_v16i8_14(<16 x i8> %a) nounwind {
 ; SSE2-LABEL: extractelement_v16i8_14:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    pextrw $7, %xmm0, %eax
+; SSE2-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: extractelement_v16i8_14:
@@ -77,8 +79,9 @@ define i8 @extractelement_v16i8_14(<16 x i8> %a) nounwind {
 define i8 @extractelement_v32i8_1(<32 x i8> %a) nounwind {
 ; SSE2-LABEL: extractelement_v32i8_1:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movd %xmm0, %eax
+; SSE2-NEXT:    shrl $8, %eax
+; SSE2-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: extractelement_v32i8_1:
@@ -100,8 +103,9 @@ define i8 @extractelement_v32i8_1(<32 x i8> %a) nounwind {
 define i8 @extractelement_v32i8_17(<32 x i8> %a) nounwind {
 ; SSE2-LABEL: extractelement_v32i8_17:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    shrl $8, %eax
+; SSE2-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; SSE2-NEXT:    retq
 ;
 ; SSE41-LABEL: extractelement_v32i8_17:
@@ -538,27 +542,19 @@ define i32 @extractelement_v8i32_var(<8 x i32> %a, i256 %i) nounwind {
 ; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: extractelement_v8i32_var:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    pushq %rbp
-; AVX1-NEXT:    movq %rsp, %rbp
-; AVX1-NEXT:    andq $-32, %rsp
-; AVX1-NEXT:    subq $64, %rsp
-; AVX1-NEXT:    andl $7, %edi
-; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
-; AVX1-NEXT:    movl (%rsp,%rdi,4), %eax
-; AVX1-NEXT:    movq %rbp, %rsp
-; AVX1-NEXT:    popq %rbp
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: extractelement_v8i32_var:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovd %edi, %xmm1
-; AVX2-NEXT:    vpermd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: extractelement_v8i32_var:
+; AVX:       # BB#0:
+; AVX-NEXT:    pushq %rbp
+; AVX-NEXT:    movq %rsp, %rbp
+; AVX-NEXT:    andq $-32, %rsp
+; AVX-NEXT:    subq $64, %rsp
+; AVX-NEXT:    andl $7, %edi
+; AVX-NEXT:    vmovaps %ymm0, (%rsp)
+; AVX-NEXT:    movl (%rsp,%rdi,4), %eax
+; AVX-NEXT:    movq %rbp, %rsp
+; AVX-NEXT:    popq %rbp
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
   %b = extractelement <8 x i32> %a, i256 %i
   ret i32 %b
 }
diff --git a/test/CodeGen/X86/extractelement-legalization-store-ordering.ll b/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
index c418e67ecb670adec96a9a013ecc6a69efb456d9..5d5cbc76f92ee759ff38b4491f94d83a7cc7ed48 100644
--- a/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
+++ b/test/CodeGen/X86/extractelement-legalization-store-ordering.ll
@@ -16,19 +16,20 @@ target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
 ; CHECK-NEXT: movl 20(%esp), %edx
 ; CHECK-NEXT: paddd (%edx), %xmm0
 ; CHECK-NEXT: movdqa %xmm0, (%edx)
-; CHECK-NEXT: movl (%edx), %esi
-; CHECK-NEXT: movl 12(%edx), %edi
-; CHECK-NEXT: movl 8(%edx), %ebx
-; CHECK-NEXT: movl 4(%edx), %edx
-; CHECK-NEXT: shll	$4, %ecx
+; CHECK-NEXT:	movl	(%edx), %esi
+; CHECK-NEXT:	movl	4(%edx), %edi
+; CHECK-NEXT:	shll	$4, %ecx
+; CHECK-NEXT:	movl	8(%edx), %ebx
+; CHECK-NEXT:	movl	12(%edx), %edx
 ; CHECK-NEXT: movl %esi, 12(%eax,%ecx)
-; CHECK-NEXT: movl %edx, (%eax,%ecx)
+; CHECK-NEXT: movl %edi, (%eax,%ecx)
 ; CHECK-NEXT: movl %ebx, 8(%eax,%ecx)
-; CHECK-NEXT: movl %edi, 4(%eax,%ecx)
+; CHECK-NEXT: movl %edx, 4(%eax,%ecx)
 ; CHECK-NEXT: popl %esi
 ; CHECK-NEXT: popl %edi
 ; CHECK-NEXT: popl %ebx
 ; CHECK-NEXT: retl
+
 define void @test_extractelement_legalization_storereuse(<4 x i32> %a, i32* nocapture %x, i32* nocapture readonly %y, i32 %i) #0 {
 entry:
   %0 = bitcast i32* %y to <4 x i32>*
diff --git a/test/CodeGen/X86/fadd-combines.ll b/test/CodeGen/X86/fadd-combines.ll
index 2df0e06dc2528688352f1b51593d36387b1b8d50..28f72f42d01d41d8d713946d75aff1bf97e29f41 100644
--- a/test/CodeGen/X86/fadd-combines.ll
+++ b/test/CodeGen/X86/fadd-combines.ll
@@ -221,4 +221,4 @@ define <4 x float> @fadd_fadd_x_x_fadd_x_x_4f32(<4 x float> %x) #0 {
   ret <4 x float> %z
 }
 
-attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" }
+attributes #0 = { "less-precise-fpmad"="true" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "unsafe-fp-math"="true" "no-signed-zeros-fp-math"="true" }
diff --git a/test/CodeGen/X86/fast-isel-abort-warm.ll b/test/CodeGen/X86/fast-isel-abort-warm.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3caa91b11ec699eab86fd2a9dd078dfe866f761e
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-abort-warm.ll
@@ -0,0 +1,14 @@
+; RUN: llc -fast-isel -o - %s -fast-isel-report-on-fallback 2>&1 | FileCheck %s
+; Make sure FastISel reports a warning when we ask it to do so.
+; Note: This test needs to use something that is not supported by FastISel.
+;       Thus, this test may start failing if inline asm becomes supported in FastISel.
+;       To fix this, use something else that's not supported (e.g., weird types).
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+; CHECK: warning: Instruction selection used fallback path for foo
+define void @foo(){
+entry:
+  call void asm sideeffect "nop", "~{dirflag},~{fpsr},~{flags}"()
+  ret void
+}
diff --git a/test/CodeGen/X86/fast-isel-cmp.ll b/test/CodeGen/X86/fast-isel-cmp.ll
index a4833a7d66d6ba84095a4c20ce22b7346fbdf012..59c53636984917bea5abc659f623caffeee434a2 100644
--- a/test/CodeGen/X86/fast-isel-cmp.ll
+++ b/test/CodeGen/X86/fast-isel-cmp.ll
@@ -1,688 +1,1128 @@
-; RUN: llc < %s                             -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=SDAG
-; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=FAST
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s                               -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=ALL --check-prefix=SDAG
+; RUN: llc < %s -fast-isel -fast-isel-abort=1 -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=ALL --check-prefix=FAST
 
 define zeroext i1 @fcmp_oeq(float %x, float %y) {
-; SDAG-LABEL: fcmp_oeq
-; SDAG:       cmpeqss  %xmm1, %xmm0
-; SDAG-NEXT:  movd     %xmm0, %eax
-; SDAG-NEXT:  andl     $1, %eax
-; FAST-LABEL: fcmp_oeq
-; FAST:       ucomiss  %xmm1, %xmm0
-; FAST-NEXT:  sete     %al
-; FAST-NEXT:  setnp    %cl
-; FAST-NEXT:  andb     %al, %cl
+; SDAG-LABEL: fcmp_oeq:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpeqss %xmm1, %xmm0
+; SDAG-NEXT:    movd %xmm0, %eax
+; SDAG-NEXT:    andl $1, %eax
+; SDAG-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_oeq:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm1, %xmm0
+; FAST-NEXT:    sete %al
+; FAST-NEXT:    setnp %cl
+; FAST-NEXT:    andb %al, %cl
+; FAST-NEXT:    andb $1, %cl
+; FAST-NEXT:    movzbl %cl, %eax
+; FAST-NEXT:    retq
   %1 = fcmp oeq float %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ogt(float %x, float %y) {
-; SDAG-LABEL: fcmp_ogt
-; SDAG:       ucomiss  %xmm1, %xmm0
-; SDAG-NEXT:  seta     %al
-; FAST:       ucomiss  %xmm1, %xmm0
-; FAST-NEXT:  seta     %al
+; SDAG-LABEL: fcmp_ogt:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm1, %xmm0
+; SDAG-NEXT:    seta %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ogt:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm1, %xmm0
+; FAST-NEXT:    seta %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ogt float %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_oge(float %x, float %y) {
-; SDAG-LABEL: fcmp_oge
-; SDAG:       ucomiss  %xmm1, %xmm0
-; SDAG-NEXT:  setae    %al
-; FAST-LABEL: fcmp_oge
-; FAST:       ucomiss  %xmm1, %xmm0
-; FAST-NEXT:  setae    %al
+; SDAG-LABEL: fcmp_oge:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm1, %xmm0
+; SDAG-NEXT:    setae %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_oge:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm1, %xmm0
+; FAST-NEXT:    setae %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp oge float %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_olt(float %x, float %y) {
-; SDAG-LABEL: fcmp_olt
-; SDAG:       ucomiss  %xmm0, %xmm1
-; SDAG-NEXT:  seta     %al
-; FAST-LABEL: fcmp_olt
-; FAST:       ucomiss  %xmm0, %xmm1
-; FAST-NEXT:  seta     %al
+; SDAG-LABEL: fcmp_olt:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm0, %xmm1
+; SDAG-NEXT:    seta %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_olt:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm0, %xmm1
+; FAST-NEXT:    seta %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp olt float %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ole(float %x, float %y) {
-; SDAG-LABEL: fcmp_ole
-; SDAG:       ucomiss  %xmm0, %xmm1
-; SDAG-NEXT:  setae    %al
-; FAST-LABEL: fcmp_ole
-; FAST:       ucomiss  %xmm0, %xmm1
-; FAST-NEXT:  setae    %al
+; SDAG-LABEL: fcmp_ole:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm0, %xmm1
+; SDAG-NEXT:    setae %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ole:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm0, %xmm1
+; FAST-NEXT:    setae %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ole float %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_one(float %x, float %y) {
-; SDAG-LABEL: fcmp_one
-; SDAG:       ucomiss  %xmm1, %xmm0
-; SDAG-NEXT:  setne    %al
-; FAST-LABEL: fcmp_one
-; FAST:       ucomiss  %xmm1, %xmm0
-; FAST-NEXT:  setne    %al
+; SDAG-LABEL: fcmp_one:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm1, %xmm0
+; SDAG-NEXT:    setne %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_one:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm1, %xmm0
+; FAST-NEXT:    setne %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp one float %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ord(float %x, float %y) {
-; SDAG-LABEL: fcmp_ord
-; SDAG:       ucomiss  %xmm1, %xmm0
-; SDAG-NEXT:  setnp    %al
-; FAST-LABEL: fcmp_ord
-; FAST:       ucomiss  %xmm1, %xmm0
-; FAST-NEXT:  setnp    %al
+; SDAG-LABEL: fcmp_ord:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm1, %xmm0
+; SDAG-NEXT:    setnp %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ord:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm1, %xmm0
+; FAST-NEXT:    setnp %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ord float %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_uno(float %x, float %y) {
-; SDAG-LABEL: fcmp_uno
-; SDAG:       ucomiss  %xmm1, %xmm0
-; SDAG-NEXT:  setp     %al
-; FAST-LABEL: fcmp_uno
-; FAST:       ucomiss  %xmm1, %xmm0
-; FAST-NEXT:  setp     %al
+; SDAG-LABEL: fcmp_uno:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm1, %xmm0
+; SDAG-NEXT:    setp %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_uno:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm1, %xmm0
+; FAST-NEXT:    setp %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp uno float %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ueq(float %x, float %y) {
-; SDAG-LABEL: fcmp_ueq
-; SDAG:       ucomiss  %xmm1, %xmm0
-; SDAG-NEXT:  sete     %al
-; FAST-LABEL: fcmp_ueq
-; FAST:       ucomiss  %xmm1, %xmm0
-; FAST-NEXT:  sete     %al
+; SDAG-LABEL: fcmp_ueq:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm1, %xmm0
+; SDAG-NEXT:    sete %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ueq:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm1, %xmm0
+; FAST-NEXT:    sete %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ueq float %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ugt(float %x, float %y) {
-; SDAG-LABEL: fcmp_ugt
-; SDAG:       ucomiss  %xmm0, %xmm1
-; SDAG-NEXT:  setb     %al
-; FAST-LABEL: fcmp_ugt
-; FAST:       ucomiss  %xmm0, %xmm1
-; FAST-NEXT:  setb     %al
+; SDAG-LABEL: fcmp_ugt:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm0, %xmm1
+; SDAG-NEXT:    setb %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ugt:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm0, %xmm1
+; FAST-NEXT:    setb %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ugt float %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_uge(float %x, float %y) {
-; SDAG-LABEL: fcmp_uge
-; SDAG:       ucomiss  %xmm0, %xmm1
-; SDAG-NEXT:  setbe    %al
-; FAST-LABEL: fcmp_uge
-; FAST:       ucomiss  %xmm0, %xmm1
-; FAST-NEXT:  setbe    %al
+; SDAG-LABEL: fcmp_uge:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm0, %xmm1
+; SDAG-NEXT:    setbe %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_uge:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm0, %xmm1
+; FAST-NEXT:    setbe %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp uge float %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ult(float %x, float %y) {
-; SDAG-LABEL: fcmp_ult
-; SDAG:       ucomiss  %xmm1, %xmm0
-; SDAG-NEXT:  setb     %al
-; FAST-LABEL: fcmp_ult
-; FAST:       ucomiss  %xmm1, %xmm0
-; FAST-NEXT:  setb     %al
+; SDAG-LABEL: fcmp_ult:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm1, %xmm0
+; SDAG-NEXT:    setb %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ult:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm1, %xmm0
+; FAST-NEXT:    setb %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ult float %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ule(float %x, float %y) {
-; SDAG-LABEL: fcmp_ule
-; SDAG:       ucomiss  %xmm1, %xmm0
-; SDAG-NEXT:  setbe    %al
-; FAST-LABEL: fcmp_ule
-; FAST:       ucomiss  %xmm1, %xmm0
-; FAST-NEXT:  setbe    %al
+; SDAG-LABEL: fcmp_ule:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm1, %xmm0
+; SDAG-NEXT:    setbe %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ule:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm1, %xmm0
+; FAST-NEXT:    setbe %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ule float %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_une(float %x, float %y) {
-; SDAG-LABEL: fcmp_une
-; SDAG:       cmpneqss %xmm1, %xmm0
-; SDAG-NEXT:  movd     %xmm0, %eax
-; SDAG-NEXT:  andl     $1, %eax
-; FAST-LABEL: fcmp_une
-; FAST:       ucomiss  %xmm1, %xmm0
-; FAST-NEXT:  setne    %al
-; FAST-NEXT:  setp     %cl
-; FAST-NEXT:  orb      %al, %cl
+; SDAG-LABEL: fcmp_une:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpneqss %xmm1, %xmm0
+; SDAG-NEXT:    movd %xmm0, %eax
+; SDAG-NEXT:    andl $1, %eax
+; SDAG-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_une:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm1, %xmm0
+; FAST-NEXT:    setne %al
+; FAST-NEXT:    setp %cl
+; FAST-NEXT:    orb %al, %cl
+; FAST-NEXT:    andb $1, %cl
+; FAST-NEXT:    movzbl %cl, %eax
+; FAST-NEXT:    retq
   %1 = fcmp une float %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @icmp_eq(i32 %x, i32 %y) {
-; SDAG-LABEL: icmp_eq
-; SDAG:       cmpl     %esi, %edi
-; SDAG-NEXT:  sete     %al
-; FAST-LABEL: icmp_eq
-; FAST:       cmpl     %esi, %edi
-; FAST-NEXT:  sete     %al
+; SDAG-LABEL: icmp_eq:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpl %esi, %edi
+; SDAG-NEXT:    sete %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_eq:
+; FAST:       ## BB#0:
+; FAST-NEXT:    cmpl %esi, %edi
+; FAST-NEXT:    sete %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp eq i32 %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @icmp_ne(i32 %x, i32 %y) {
-; SDAG-LABEL: icmp_ne
-; SDAG:       cmpl     %esi, %edi
-; SDAG-NEXT:  setne    %al
-; FAST-LABEL: icmp_ne
-; FAST:       cmpl     %esi, %edi
-; FAST-NEXT:  setne    %al
+; SDAG-LABEL: icmp_ne:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpl %esi, %edi
+; SDAG-NEXT:    setne %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_ne:
+; FAST:       ## BB#0:
+; FAST-NEXT:    cmpl %esi, %edi
+; FAST-NEXT:    setne %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp ne i32 %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @icmp_ugt(i32 %x, i32 %y) {
-; SDAG-LABEL: icmp_ugt
-; SDAG:       cmpl     %edi, %esi
-; SDAG-NEXT:  setb     %al
-; FAST-LABEL: icmp_ugt
-; FAST:       cmpl     %esi, %edi
-; FAST-NEXT:  seta     %al
+; SDAG-LABEL: icmp_ugt:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpl %esi, %edi
+; SDAG-NEXT:    seta %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_ugt:
+; FAST:       ## BB#0:
+; FAST-NEXT:    cmpl %esi, %edi
+; FAST-NEXT:    seta %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp ugt i32 %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @icmp_uge(i32 %x, i32 %y) {
-; SDAG-LABEL: icmp_uge
-; SDAG:       cmpl     %esi, %edi
-; SDAG-NEXT:  setae    %al
-; FAST-LABEL: icmp_uge
-; FAST:       cmpl     %esi, %edi
-; FAST-NEXT:  setae    %al
+; SDAG-LABEL: icmp_uge:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpl %esi, %edi
+; SDAG-NEXT:    setae %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_uge:
+; FAST:       ## BB#0:
+; FAST-NEXT:    cmpl %esi, %edi
+; FAST-NEXT:    setae %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp uge i32 %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @icmp_ult(i32 %x, i32 %y) {
-; SDAG-LABEL: icmp_ult
-; SDAG:       cmpl     %esi, %edi
-; SDAG-NEXT:  setb     %al
-; FAST-LABEL: icmp_ult
-; FAST:       cmpl     %esi, %edi
-; FAST-NEXT:  setb     %al
+; SDAG-LABEL: icmp_ult:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpl %esi, %edi
+; SDAG-NEXT:    setb %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_ult:
+; FAST:       ## BB#0:
+; FAST-NEXT:    cmpl %esi, %edi
+; FAST-NEXT:    setb %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp ult i32 %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @icmp_ule(i32 %x, i32 %y) {
-; SDAG-LABEL: icmp_ule
-; SDAG:       cmpl     %esi, %edi
-; SDAG-NEXT:  setbe    %al
-; FAST-LABEL: icmp_ule
-; FAST:       cmpl     %esi, %edi
-; FAST-NEXT:  setbe    %al
+; SDAG-LABEL: icmp_ule:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpl %esi, %edi
+; SDAG-NEXT:    setbe %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_ule:
+; FAST:       ## BB#0:
+; FAST-NEXT:    cmpl %esi, %edi
+; FAST-NEXT:    setbe %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp ule i32 %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @icmp_sgt(i32 %x, i32 %y) {
-; SDAG-LABEL: icmp_sgt
-; SDAG:       cmpl     %esi, %edi
-; SDAG-NEXT:  setg     %al
-; FAST-LABEL: icmp_sgt
-; FAST:       cmpl     %esi, %edi
-; FAST-NEXT:  setg     %al
+; SDAG-LABEL: icmp_sgt:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpl %esi, %edi
+; SDAG-NEXT:    setg %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_sgt:
+; FAST:       ## BB#0:
+; FAST-NEXT:    cmpl %esi, %edi
+; FAST-NEXT:    setg %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp sgt i32 %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @icmp_sge(i32 %x, i32 %y) {
-; SDAG-LABEL: icmp_sge
-; SDAG:       cmpl     %esi, %edi
-; SDAG-NEXT:  setge    %al
-; FAST-LABEL: icmp_sge
-; FAST:       cmpl     %esi, %edi
-; FAST-NEXT:  setge    %al
+; SDAG-LABEL: icmp_sge:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpl %esi, %edi
+; SDAG-NEXT:    setge %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_sge:
+; FAST:       ## BB#0:
+; FAST-NEXT:    cmpl %esi, %edi
+; FAST-NEXT:    setge %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp sge i32 %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @icmp_slt(i32 %x, i32 %y) {
-; SDAG-LABEL: icmp_slt
-; SDAG:       cmpl     %esi, %edi
-; SDAG-NEXT:  setl     %al
-; FAST-LABEL: icmp_slt
-; FAST:       cmpl     %esi, %edi
-; FAST-NEXT:  setl     %al
+; SDAG-LABEL: icmp_slt:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpl %esi, %edi
+; SDAG-NEXT:    setl %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_slt:
+; FAST:       ## BB#0:
+; FAST-NEXT:    cmpl %esi, %edi
+; FAST-NEXT:    setl %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp slt i32 %x, %y
   ret i1 %1
 }
 
 define zeroext i1 @icmp_sle(i32 %x, i32 %y) {
-; SDAG-LABEL: icmp_sle
-; SDAG:       cmpl     %esi, %edi
-; SDAG-NEXT:  setle    %al
-; FAST-LABEL: icmp_sle
-; FAST:       cmpl     %esi, %edi
-; FAST-NEXT:  setle    %al
+; SDAG-LABEL: icmp_sle:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpl %esi, %edi
+; SDAG-NEXT:    setle %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_sle:
+; FAST:       ## BB#0:
+; FAST-NEXT:    cmpl %esi, %edi
+; FAST-NEXT:    setle %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp sle i32 %x, %y
   ret i1 %1
 }
 
 ; Test cmp folding and condition optimization.
 define zeroext i1 @fcmp_oeq2(float %x) {
-; SDAG-LABEL: fcmp_oeq2
-; SDAG:       ucomiss  %xmm0, %xmm0
-; SDAG-NEXT:  setnp    %al
-; FAST-LABEL: fcmp_oeq2
-; FAST:       ucomiss  %xmm0, %xmm0
-; FAST-NEXT:  setnp    %al
+; SDAG-LABEL: fcmp_oeq2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm0, %xmm0
+; SDAG-NEXT:    setnp %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_oeq2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm0, %xmm0
+; FAST-NEXT:    setnp %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp oeq float %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_oeq3(float %x) {
-; SDAG-LABEL: fcmp_oeq3
-; SDAG:       xorps    %xmm1, %xmm1
-; SDAG-NEXT:  cmpeqss  %xmm0, %xmm1
-; SDAG-NEXT:  movd     %xmm1, %eax
-; SDAG-NEXT:  andl     $1, %eax
-; FAST-LABEL: fcmp_oeq3
-; FAST:       xorps    %xmm1, %xmm1
-; FAST-NEXT:  ucomiss  %xmm1, %xmm0
-; FAST-NEXT:  sete     %al
-; FAST-NEXT:  setnp    %cl
-; FAST-NEXT:  andb     %al, %cl
+; SDAG-LABEL: fcmp_oeq3:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorps %xmm1, %xmm1
+; SDAG-NEXT:    cmpeqss %xmm0, %xmm1
+; SDAG-NEXT:    movd %xmm1, %eax
+; SDAG-NEXT:    andl $1, %eax
+; SDAG-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_oeq3:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorps %xmm1, %xmm1
+; FAST-NEXT:    ucomiss %xmm1, %xmm0
+; FAST-NEXT:    sete %al
+; FAST-NEXT:    setnp %cl
+; FAST-NEXT:    andb %al, %cl
+; FAST-NEXT:    andb $1, %cl
+; FAST-NEXT:    movzbl %cl, %eax
+; FAST-NEXT:    retq
   %1 = fcmp oeq float %x, 0.000000e+00
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ogt2(float %x) {
-; SDAG-LABEL: fcmp_ogt2
-; SDAG:       xorl     %eax, %eax
-; FAST-LABEL: fcmp_ogt2
-; FAST:       xorl     %eax, %eax
+; SDAG-LABEL: fcmp_ogt2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ogt2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ogt float %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ogt3(float %x) {
-; SDAG-LABEL: fcmp_ogt3
-; SDAG:       xorps    %xmm1, %xmm1
-; SDAG-NEXT:  ucomiss  %xmm1, %xmm0
-; SDAG-NEXT:  seta     %al
-; FAST-LABEL: fcmp_ogt3
-; FAST:       xorps    %xmm1, %xmm1
-; FAST-NEXT:  ucomiss  %xmm1, %xmm0
-; FAST-NEXT:  seta     %al
+; SDAG-LABEL: fcmp_ogt3:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorps %xmm1, %xmm1
+; SDAG-NEXT:    ucomiss %xmm1, %xmm0
+; SDAG-NEXT:    seta %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ogt3:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorps %xmm1, %xmm1
+; FAST-NEXT:    ucomiss %xmm1, %xmm0
+; FAST-NEXT:    seta %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ogt float %x, 0.000000e+00
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_oge2(float %x) {
-; SDAG-LABEL: fcmp_oge2
-; SDAG:       ucomiss  %xmm0, %xmm0
-; SDAG-NEXT:  setnp    %al
-; FAST-LABEL: fcmp_oge2
-; FAST:       ucomiss  %xmm0, %xmm0
-; FAST-NEXT:  setnp    %al
+; SDAG-LABEL: fcmp_oge2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm0, %xmm0
+; SDAG-NEXT:    setnp %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_oge2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm0, %xmm0
+; FAST-NEXT:    setnp %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp oge float %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_oge3(float %x) {
-; SDAG-LABEL: fcmp_oge3
-; SDAG:       xorps    %xmm1, %xmm1
-; SDAG-NEXT:  ucomiss  %xmm1, %xmm0
-; SDAG-NEXT:  setae    %al
-; FAST-LABEL: fcmp_oge3
-; FAST:       xorps    %xmm1, %xmm1
-; FAST-NEXT:  ucomiss  %xmm1, %xmm0
-; FAST-NEXT:  setae    %al
+; SDAG-LABEL: fcmp_oge3:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorps %xmm1, %xmm1
+; SDAG-NEXT:    ucomiss %xmm1, %xmm0
+; SDAG-NEXT:    setae %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_oge3:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorps %xmm1, %xmm1
+; FAST-NEXT:    ucomiss %xmm1, %xmm0
+; FAST-NEXT:    setae %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp oge float %x, 0.000000e+00
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_olt2(float %x) {
-; SDAG-LABEL: fcmp_olt2
-; SDAG:       xorl     %eax, %eax
-; FAST-LABEL: fcmp_olt2
-; FAST:       xorl     %eax, %eax
+; SDAG-LABEL: fcmp_olt2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_olt2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp olt float %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_olt3(float %x) {
-; SDAG-LABEL: fcmp_olt3
-; SDAG:       xorps    %xmm1, %xmm1
-; SDAG-NEXT:  ucomiss  %xmm0, %xmm1
-; SDAG-NEXT:  seta     %al
-; FAST-LABEL: fcmp_olt3
-; FAST:       xorps    %xmm1, %xmm1
-; FAST-NEXT:  ucomiss  %xmm0, %xmm1
-; FAST-NEXT:  seta     %al
+; SDAG-LABEL: fcmp_olt3:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorps %xmm1, %xmm1
+; SDAG-NEXT:    ucomiss %xmm0, %xmm1
+; SDAG-NEXT:    seta %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_olt3:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorps %xmm1, %xmm1
+; FAST-NEXT:    ucomiss %xmm0, %xmm1
+; FAST-NEXT:    seta %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp olt float %x, 0.000000e+00
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ole2(float %x) {
-; SDAG-LABEL: fcmp_ole2
-; SDAG:       ucomiss  %xmm0, %xmm0
-; SDAG-NEXT:  setnp    %al
-; FAST-LABEL: fcmp_ole2
-; FAST:       ucomiss  %xmm0, %xmm0
-; FAST-NEXT:  setnp    %al
+; SDAG-LABEL: fcmp_ole2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm0, %xmm0
+; SDAG-NEXT:    setnp %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ole2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm0, %xmm0
+; FAST-NEXT:    setnp %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ole float %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ole3(float %x) {
-; SDAG-LABEL: fcmp_ole3
-; SDAG:       xorps    %xmm1, %xmm1
-; SDAG-NEXT:  ucomiss  %xmm0, %xmm1
-; SDAG-NEXT:  setae    %al
-; FAST-LABEL: fcmp_ole3
-; FAST:       xorps    %xmm1, %xmm1
-; FAST-NEXT:  ucomiss  %xmm0, %xmm1
-; FAST-NEXT:  setae    %al
+; SDAG-LABEL: fcmp_ole3:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorps %xmm1, %xmm1
+; SDAG-NEXT:    ucomiss %xmm0, %xmm1
+; SDAG-NEXT:    setae %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ole3:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorps %xmm1, %xmm1
+; FAST-NEXT:    ucomiss %xmm0, %xmm1
+; FAST-NEXT:    setae %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ole float %x, 0.000000e+00
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_one2(float %x) {
-; SDAG-LABEL: fcmp_one2
-; SDAG:       xorl     %eax, %eax
-; FAST-LABEL: fcmp_one2
-; FAST:       xorl     %eax, %eax
+; SDAG-LABEL: fcmp_one2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_one2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp one float %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_one3(float %x) {
-; SDAG-LABEL: fcmp_one3
-; SDAG:       xorps    %xmm1, %xmm1
-; SDAG-NEXT:  ucomiss  %xmm1, %xmm0
-; SDAG-NEXT:  setne    %al
-; FAST-LABEL: fcmp_one3
-; FAST:       xorps    %xmm1, %xmm1
-; FAST-NEXT:  ucomiss  %xmm1, %xmm0
-; FAST-NEXT:  setne    %al
+; SDAG-LABEL: fcmp_one3:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorps %xmm1, %xmm1
+; SDAG-NEXT:    ucomiss %xmm1, %xmm0
+; SDAG-NEXT:    setne %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_one3:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorps %xmm1, %xmm1
+; FAST-NEXT:    ucomiss %xmm1, %xmm0
+; FAST-NEXT:    setne %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp one float %x, 0.000000e+00
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ord2(float %x) {
-; SDAG-LABEL: fcmp_ord2
-; SDAG:       ucomiss  %xmm0, %xmm0
-; SDAG-NEXT:  setnp    %al
-; FAST-LABEL: fcmp_ord2
-; FAST:       ucomiss  %xmm0, %xmm0
-; FAST-NEXT:  setnp    %al
+; SDAG-LABEL: fcmp_ord2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm0, %xmm0
+; SDAG-NEXT:    setnp %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ord2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm0, %xmm0
+; FAST-NEXT:    setnp %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ord float %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ord3(float %x) {
-; SDAG-LABEL: fcmp_ord3
-; SDAG:       ucomiss  %xmm0, %xmm0
-; SDAG-NEXT:  setnp    %al
-; FAST-LABEL: fcmp_ord3
-; FAST:       ucomiss  %xmm0, %xmm0
-; FAST-NEXT:  setnp    %al
+; SDAG-LABEL: fcmp_ord3:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm0, %xmm0
+; SDAG-NEXT:    setnp %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ord3:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm0, %xmm0
+; FAST-NEXT:    setnp %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ord float %x, 0.000000e+00
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_uno2(float %x) {
-; SDAG-LABEL: fcmp_uno2
-; SDAG:       ucomiss  %xmm0, %xmm0
-; SDAG-NEXT:  setp     %al
-; FAST-LABEL: fcmp_uno2
-; FAST:       ucomiss  %xmm0, %xmm0
-; FAST-NEXT:  setp     %al
+; SDAG-LABEL: fcmp_uno2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm0, %xmm0
+; SDAG-NEXT:    setp %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_uno2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm0, %xmm0
+; FAST-NEXT:    setp %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp uno float %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_uno3(float %x) {
-; SDAG-LABEL: fcmp_uno3
-; SDAG:       ucomiss  %xmm0, %xmm0
-; SDAG-NEXT:  setp     %al
-; FAST-LABEL: fcmp_uno3
-; FAST:       ucomiss  %xmm0, %xmm0
-; FAST-NEXT:  setp     %al
+; SDAG-LABEL: fcmp_uno3:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm0, %xmm0
+; SDAG-NEXT:    setp %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_uno3:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm0, %xmm0
+; FAST-NEXT:    setp %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp uno float %x, 0.000000e+00
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ueq2(float %x) {
-; SDAG-LABEL: fcmp_ueq2
-; SDAG:       movb     $1, %al
-; FAST-LABEL: fcmp_ueq2
-; FAST:       movb     $1, %al
+; SDAG-LABEL: fcmp_ueq2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ueq2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ueq float %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ueq3(float %x) {
-; SDAG-LABEL: fcmp_ueq3
-; SDAG:       xorps    %xmm1, %xmm1
-; SDAG-NEXT:  ucomiss  %xmm1, %xmm0
-; SDAG-NEXT:  sete     %al
-; FAST-LABEL: fcmp_ueq3
-; FAST:       xorps    %xmm1, %xmm1
-; FAST-NEXT:  ucomiss  %xmm1, %xmm0
-; FAST-NEXT:  sete     %al
+; SDAG-LABEL: fcmp_ueq3:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorps %xmm1, %xmm1
+; SDAG-NEXT:    ucomiss %xmm1, %xmm0
+; SDAG-NEXT:    sete %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ueq3:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorps %xmm1, %xmm1
+; FAST-NEXT:    ucomiss %xmm1, %xmm0
+; FAST-NEXT:    sete %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ueq float %x, 0.000000e+00
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ugt2(float %x) {
-; SDAG-LABEL: fcmp_ugt2
-; SDAG:       ucomiss  %xmm0, %xmm0
-; SDAG-NEXT:  setp     %al
-; FAST-LABEL: fcmp_ugt2
-; FAST:       ucomiss  %xmm0, %xmm0
-; FAST-NEXT:  setp     %al
+; SDAG-LABEL: fcmp_ugt2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm0, %xmm0
+; SDAG-NEXT:    setp %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ugt2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm0, %xmm0
+; FAST-NEXT:    setp %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ugt float %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ugt3(float %x) {
-; SDAG-LABEL: fcmp_ugt3
-; SDAG:       xorps    %xmm1, %xmm1
-; SDAG-NEXT:  ucomiss  %xmm0, %xmm1
-; SDAG-NEXT:  setb     %al
-; FAST-LABEL: fcmp_ugt3
-; FAST:       xorps    %xmm1, %xmm1
-; FAST-NEXT:  ucomiss  %xmm0, %xmm1
-; FAST-NEXT:  setb     %al
+; SDAG-LABEL: fcmp_ugt3:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorps %xmm1, %xmm1
+; SDAG-NEXT:    ucomiss %xmm0, %xmm1
+; SDAG-NEXT:    setb %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ugt3:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorps %xmm1, %xmm1
+; FAST-NEXT:    ucomiss %xmm0, %xmm1
+; FAST-NEXT:    setb %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ugt float %x, 0.000000e+00
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_uge2(float %x) {
-; SDAG-LABEL: fcmp_uge2
-; SDAG:       movb     $1, %al
-; FAST-LABEL: fcmp_uge2
-; FAST:       movb     $1, %al
+; SDAG-LABEL: fcmp_uge2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_uge2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp uge float %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_uge3(float %x) {
-; SDAG-LABEL: fcmp_uge3
-; SDAG:       xorps    %xmm1, %xmm1
-; SDAG-NEXT:  ucomiss  %xmm0, %xmm1
-; SDAG-NEXT:  setbe    %al
-; FAST-LABEL: fcmp_uge3
-; FAST:       xorps    %xmm1, %xmm1
-; FAST-NEXT:  ucomiss  %xmm0, %xmm1
-; FAST-NEXT:  setbe    %al
+; SDAG-LABEL: fcmp_uge3:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorps %xmm1, %xmm1
+; SDAG-NEXT:    ucomiss %xmm0, %xmm1
+; SDAG-NEXT:    setbe %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_uge3:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorps %xmm1, %xmm1
+; FAST-NEXT:    ucomiss %xmm0, %xmm1
+; FAST-NEXT:    setbe %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp uge float %x, 0.000000e+00
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ult2(float %x) {
-; SDAG-LABEL: fcmp_ult2
-; SDAG:       ucomiss  %xmm0, %xmm0
-; SDAG-NEXT:  setp     %al
-; FAST-LABEL: fcmp_ult2
-; FAST:       ucomiss  %xmm0, %xmm0
-; FAST-NEXT:  setp     %al
+; SDAG-LABEL: fcmp_ult2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm0, %xmm0
+; SDAG-NEXT:    setp %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ult2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm0, %xmm0
+; FAST-NEXT:    setp %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ult float %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ult3(float %x) {
-; SDAG-LABEL: fcmp_ult3
-; SDAG:       xorps    %xmm1, %xmm1
-; SDAG-NEXT:  ucomiss  %xmm1, %xmm0
-; SDAG-NEXT:  setb     %al
-; FAST-LABEL: fcmp_ult3
-; FAST:       xorps    %xmm1, %xmm1
-; FAST-NEXT:  ucomiss  %xmm1, %xmm0
-; FAST-NEXT:  setb     %al
+; SDAG-LABEL: fcmp_ult3:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorps %xmm1, %xmm1
+; SDAG-NEXT:    ucomiss %xmm1, %xmm0
+; SDAG-NEXT:    setb %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ult3:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorps %xmm1, %xmm1
+; FAST-NEXT:    ucomiss %xmm1, %xmm0
+; FAST-NEXT:    setb %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ult float %x, 0.000000e+00
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ule2(float %x) {
-; SDAG-LABEL: fcmp_ule2
-; SDAG:       movb     $1, %al
-; FAST-LABEL: fcmp_ule2
-; FAST:       movb     $1, %al
+; SDAG-LABEL: fcmp_ule2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ule2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ule float %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_ule3(float %x) {
-; SDAG-LABEL: fcmp_ule3
-; SDAG:       xorps    %xmm1, %xmm1
-; SDAG-NEXT:  ucomiss  %xmm1, %xmm0
-; SDAG-NEXT:  setbe    %al
-; FAST-LABEL: fcmp_ule3
-; FAST:       xorps    %xmm1, %xmm1
-; FAST-NEXT:  ucomiss  %xmm1, %xmm0
-; FAST-NEXT:  setbe    %al
+; SDAG-LABEL: fcmp_ule3:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorps %xmm1, %xmm1
+; SDAG-NEXT:    ucomiss %xmm1, %xmm0
+; SDAG-NEXT:    setbe %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_ule3:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorps %xmm1, %xmm1
+; FAST-NEXT:    ucomiss %xmm1, %xmm0
+; FAST-NEXT:    setbe %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp ule float %x, 0.000000e+00
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_une2(float %x) {
-; SDAG-LABEL: fcmp_une2
-; SDAG:       ucomiss  %xmm0, %xmm0
-; SDAG-NEXT:  setp     %al
-; FAST-LABEL: fcmp_une2
-; FAST:       ucomiss  %xmm0, %xmm0
-; FAST-NEXT:  setp     %al
+; SDAG-LABEL: fcmp_une2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    ucomiss %xmm0, %xmm0
+; SDAG-NEXT:    setp %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_une2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    ucomiss %xmm0, %xmm0
+; FAST-NEXT:    setp %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = fcmp une float %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @fcmp_une3(float %x) {
-; SDAG-LABEL: fcmp_une3
-; SDAG:       xorps    %xmm1, %xmm1
-; SDAG-NEXT:  cmpneqss %xmm0, %xmm1
-; SDAG-NEXT:  movd     %xmm1, %eax
-; SDAG-NEXT:  andl     $1, %eax
-; FAST-LABEL: fcmp_une3
-; FAST:       xorps    %xmm1, %xmm1
-; FAST-NEXT:  ucomiss  %xmm1, %xmm0
-; FAST-NEXT:  setne    %al
-; FAST-NEXT:  setp     %cl
-; FAST-NEXT:  orb      %al, %cl
+; SDAG-LABEL: fcmp_une3:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorps %xmm1, %xmm1
+; SDAG-NEXT:    cmpneqss %xmm0, %xmm1
+; SDAG-NEXT:    movd %xmm1, %eax
+; SDAG-NEXT:    andl $1, %eax
+; SDAG-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: fcmp_une3:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorps %xmm1, %xmm1
+; FAST-NEXT:    ucomiss %xmm1, %xmm0
+; FAST-NEXT:    setne %al
+; FAST-NEXT:    setp %cl
+; FAST-NEXT:    orb %al, %cl
+; FAST-NEXT:    andb $1, %cl
+; FAST-NEXT:    movzbl %cl, %eax
+; FAST-NEXT:    retq
   %1 = fcmp une float %x, 0.000000e+00
   ret i1 %1
 }
 
 define zeroext i1 @icmp_eq2(i32 %x) {
-; SDAG-LABEL: icmp_eq2
-; SDAG:       movb     $1, %al
-; FAST-LABEL: icmp_eq2
-; FAST:       movb     $1, %al
+; SDAG-LABEL: icmp_eq2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_eq2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp eq i32 %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @icmp_ne2(i32 %x) {
-; SDAG-LABEL: icmp_ne2
-; SDAG:       xorl     %eax, %eax
-; FAST-LABEL: icmp_ne2
-; FAST:       xorl     %eax, %eax
+; SDAG-LABEL: icmp_ne2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_ne2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp ne i32 %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @icmp_ugt2(i32 %x) {
-; SDAG-LABEL: icmp_ugt2
-; SDAG:       xorl     %eax, %eax
-; FAST-LABEL: icmp_ugt2
-; FAST:       xorl     %eax, %eax
+; SDAG-LABEL: icmp_ugt2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_ugt2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp ugt i32 %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @icmp_uge2(i32 %x) {
-; SDAG-LABEL: icmp_uge2
-; SDAG:       movb     $1, %al
-; FAST-LABEL: icmp_uge2
-; FAST:       movb     $1, %al
+; SDAG-LABEL: icmp_uge2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_uge2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp uge i32 %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @icmp_ult2(i32 %x) {
-; SDAG-LABEL: icmp_ult2
-; SDAG:       xorl     %eax, %eax
-; FAST-LABEL: icmp_ult2
-; FAST:       xorl     %eax, %eax
+; SDAG-LABEL: icmp_ult2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_ult2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp ult i32 %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @icmp_ule2(i32 %x) {
-; SDAG-LABEL: icmp_ule2
-; SDAG:       movb     $1, %al
-; FAST-LABEL: icmp_ule2
-; FAST:       movb     $1, %al
+; SDAG-LABEL: icmp_ule2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_ule2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp ule i32 %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @icmp_sgt2(i32 %x) {
-; SDAG-LABEL: icmp_sgt2
-; SDAG:       xorl     %eax, %eax
-; FAST-LABEL: icmp_sgt2
-; FAST:       xorl     %eax, %eax
+; SDAG-LABEL: icmp_sgt2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_sgt2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp sgt i32 %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @icmp_sge2(i32 %x) {
-; SDAG-LABEL: icmp_sge2
-; SDAG:       movb     $1, %al
-; FAST-LABEL: icmp_sge2
-; FAST:       movb     $1, %al
+; SDAG-LABEL: icmp_sge2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_sge2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp sge i32 %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @icmp_slt2(i32 %x) {
-; SDAG-LABEL: icmp_slt2
-; SDAG:       xorl     %eax, %eax
-; FAST-LABEL: icmp_slt2
-; FAST:       xorl     %eax, %eax
+; SDAG-LABEL: icmp_slt2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_slt2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp slt i32 %x, %x
   ret i1 %1
 }
 
 define zeroext i1 @icmp_sle2(i32 %x) {
-; SDAG-LABEL: icmp_sle2
-; SDAG:       movb     $1, %al
-; FAST-LABEL: icmp_sle2
-; FAST:       movb     $1, %al
+; SDAG-LABEL: icmp_sle2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: icmp_sle2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
   %1 = icmp sle i32 %x, %x
   ret i1 %1
 }
diff --git a/test/CodeGen/X86/fast-isel-deadcode.ll b/test/CodeGen/X86/fast-isel-deadcode.ll
index 0a53d60f8352909aa0bda58dc48fe4c78900eb5f..5381dc4858afa4d58d6bb1704e225c77e08b2228 100644
--- a/test/CodeGen/X86/fast-isel-deadcode.ll
+++ b/test/CodeGen/X86/fast-isel-deadcode.ll
@@ -83,7 +83,7 @@ entry:
   %tmp = alloca { <2 x float>, float }, align 8
   store i32 0, i32* %retval, align 4
   %0 = bitcast %struct.FVector* %v to i8*
-  call void @llvm.lifetime.start(i64 12, i8* %0) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 12, i8* %0) nounwind
   %x.i = getelementptr inbounds %struct.FVector, %struct.FVector* %v, i64 0, i32 0
   store float 1.000000e+00, float* %x.i, align 4
   %y.i = getelementptr inbounds %struct.FVector, %struct.FVector* %v, i64 0, i32 1
@@ -136,12 +136,12 @@ func.exit:                         ; preds = %if.then.i, %if.else.i, %if.end.5.i
   %5 = bitcast %struct.FVector* %ref.tmp to i8*
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* %5, i64 12, i32 4, i1 false)
   %6 = bitcast %struct.FVector* %v to i8*
-  call void @llvm.lifetime.end(i64 12, i8* %6) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 12, i8* %6) nounwind
   ret i32 0
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) argmemonly nounwind
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) argmemonly nounwind
 
-declare void @llvm.lifetime.end(i64, i8* nocapture) argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) argmemonly nounwind
diff --git a/test/CodeGen/X86/fast-isel-load-i1.ll b/test/CodeGen/X86/fast-isel-load-i1.ll
index 1b2e3c5b9bbf50e22fe23e3bc4bc4d974e4c2d4b..2f3c6c4b84b933803228a7dea268f3c80ffeeb01 100644
--- a/test/CodeGen/X86/fast-isel-load-i1.ll
+++ b/test/CodeGen/X86/fast-isel-load-i1.ll
@@ -1,9 +1,19 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -fast-isel -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f  | FileCheck %s
 
 define i1 @test_i1(i1* %b) {
 ; CHECK-LABEL: test_i1:
 ; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    testb $1, (%rdi)
+; CHECK-NEXT:    movzbl (%rdi), %eax
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    testb $1, %al
+; CHECK-NEXT:    je .LBB0_2
+; CHECK-NEXT:  # BB#1: # %in
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB0_2: # %out
+; CHECK-NEXT:    movb $1, %al
+; CHECK-NEXT:    retq
 entry:
   %0 = load i1, i1* %b, align 1
   br i1 %0, label %in, label %out
diff --git a/test/CodeGen/X86/fast-isel-nontemporal.ll b/test/CodeGen/X86/fast-isel-nontemporal.ll
index 2fc08fb4135d4f9a363e0a2793f7f967eaf8896a..4140721bd5f31299df1f7202ee7714dd33cf0f5c 100644
--- a/test/CodeGen/X86/fast-isel-nontemporal.ll
+++ b/test/CodeGen/X86/fast-isel-nontemporal.ll
@@ -1,11 +1,11 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+sse2 -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+sse4a -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE4A
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+avx -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+avx2 -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+avx512f -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+avx512bw -fast-isel -O0 < %s | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 -fast-isel -O0 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse4a -fast-isel -O0 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE4A
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse4.1 -fast-isel -O0 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx -fast-isel -O0 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx2 -fast-isel -O0 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx512f -fast-isel -O0 | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown -mattr=+mmx,+avx512bw -fast-isel -O0 | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
 
 ;
 ; Scalar Stores
@@ -91,6 +91,25 @@ entry:
   ret void
 }
 
+;
+; MMX Store
+;
+
+define void @test_mmx(x86_mmx* nocapture %a0, x86_mmx* nocapture %a1) {
+; ALL-LABEL: test_mmx:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    movq (%rdi), %mm0
+; ALL-NEXT:    psrlq $3, %mm0
+; ALL-NEXT:    movntq %mm0, (%rsi)
+; ALL-NEXT:    retq
+entry:
+  %0 = load x86_mmx, x86_mmx* %a0
+  %1 = call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx %0, i32 3)
+  store x86_mmx %1, x86_mmx* %a1, align 8, !nontemporal !1
+  ret void
+}
+declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32) nounwind readnone
+
 ;
 ; 128-bit Vector Stores
 ;
@@ -379,6 +398,7 @@ define void @test_nt8xfloat(<8 x float>* nocapture %ptr, <8 x float> %X) {
 ; AVX512-LABEL: test_nt8xfloat:
 ; AVX512:       # BB#0: # %entry
 ; AVX512-NEXT:    vmovntps %ymm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 entry:
   store <8 x float> %X, <8 x float>* %ptr, align 32, !nontemporal !1
@@ -401,6 +421,7 @@ define void @test_nt4xdouble(<4 x double>* nocapture %ptr, <4 x double> %X) {
 ; AVX512-LABEL: test_nt4xdouble:
 ; AVX512:       # BB#0: # %entry
 ; AVX512-NEXT:    vmovntpd %ymm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 entry:
   store <4 x double> %X, <4 x double>* %ptr, align 32, !nontemporal !1
@@ -423,6 +444,7 @@ define void @test_nt32xi8(<32 x i8>* nocapture %ptr, <32 x i8> %X) {
 ; AVX512-LABEL: test_nt32xi8:
 ; AVX512:       # BB#0: # %entry
 ; AVX512-NEXT:    vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 entry:
   store <32 x i8> %X, <32 x i8>* %ptr, align 32, !nontemporal !1
@@ -445,6 +467,7 @@ define void @test_nt16xi16(<16 x i16>* nocapture %ptr, <16 x i16> %X) {
 ; AVX512-LABEL: test_nt16xi16:
 ; AVX512:       # BB#0: # %entry
 ; AVX512-NEXT:    vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 entry:
   store <16 x i16> %X, <16 x i16>* %ptr, align 32, !nontemporal !1
@@ -467,6 +490,7 @@ define void @test_nt8xi32(<8 x i32>* nocapture %ptr, <8 x i32> %X) {
 ; AVX512-LABEL: test_nt8xi32:
 ; AVX512:       # BB#0: # %entry
 ; AVX512-NEXT:    vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 entry:
   store <8 x i32> %X, <8 x i32>* %ptr, align 32, !nontemporal !1
@@ -489,6 +513,7 @@ define void @test_nt4xi64(<4 x i64>* nocapture %ptr, <4 x i64> %X) {
 ; AVX512-LABEL: test_nt4xi64:
 ; AVX512:       # BB#0: # %entry
 ; AVX512-NEXT:    vmovntdq %ymm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 entry:
   store <4 x i64> %X, <4 x i64>* %ptr, align 32, !nontemporal !1
@@ -750,6 +775,7 @@ define void @test_nt16xfloat(<16 x float>* nocapture %ptr, <16 x float> %X) {
 ; AVX512-LABEL: test_nt16xfloat:
 ; AVX512:       # BB#0: # %entry
 ; AVX512-NEXT:    vmovntps %zmm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 entry:
   store <16 x float> %X, <16 x float>* %ptr, align 64, !nontemporal !1
@@ -775,6 +801,7 @@ define void @test_nt8xdouble(<8 x double>* nocapture %ptr, <8 x double> %X) {
 ; AVX512-LABEL: test_nt8xdouble:
 ; AVX512:       # BB#0: # %entry
 ; AVX512-NEXT:    vmovntpd %zmm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 entry:
   store <8 x double> %X, <8 x double>* %ptr, align 64, !nontemporal !1
@@ -801,11 +828,13 @@ define void @test_nt64xi8(<64 x i8>* nocapture %ptr, <64 x i8> %X) {
 ; AVX512F:       # BB#0: # %entry
 ; AVX512F-NEXT:    vmovntdq %ymm0, (%rdi)
 ; AVX512F-NEXT:    vmovntdq %ymm1, 32(%rdi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_nt64xi8:
 ; AVX512BW:       # BB#0: # %entry
 ; AVX512BW-NEXT:    vmovntdq %zmm0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 entry:
   store <64 x i8> %X, <64 x i8>* %ptr, align 64, !nontemporal !1
@@ -832,11 +861,13 @@ define void @test_nt32xi16(<32 x i16>* nocapture %ptr, <32 x i16> %X) {
 ; AVX512F:       # BB#0: # %entry
 ; AVX512F-NEXT:    vmovntdq %ymm0, (%rdi)
 ; AVX512F-NEXT:    vmovntdq %ymm1, 32(%rdi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_nt32xi16:
 ; AVX512BW:       # BB#0: # %entry
 ; AVX512BW-NEXT:    vmovntdq %zmm0, (%rdi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 entry:
   store <32 x i16> %X, <32 x i16>* %ptr, align 64, !nontemporal !1
@@ -862,6 +893,7 @@ define void @test_nt16xi32(<16 x i32>* nocapture %ptr, <16 x i32> %X) {
 ; AVX512-LABEL: test_nt16xi32:
 ; AVX512:       # BB#0: # %entry
 ; AVX512-NEXT:    vmovntdq %zmm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 entry:
   store <16 x i32> %X, <16 x i32>* %ptr, align 64, !nontemporal !1
@@ -887,6 +919,7 @@ define void @test_nt8xi64(<8 x i64>* nocapture %ptr, <8 x i64> %X) {
 ; AVX512-LABEL: test_nt8xi64:
 ; AVX512:       # BB#0: # %entry
 ; AVX512-NEXT:    vmovntdq %zmm0, (%rdi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 entry:
   store <8 x i64> %X, <8 x i64>* %ptr, align 64, !nontemporal !1
diff --git a/test/CodeGen/X86/fast-isel-select-cmov.ll b/test/CodeGen/X86/fast-isel-select-cmov.ll
index a9b2dd841f20228856353e2c79f96546189f6494..e40e917e11e979130c461a611dc1b10997f9d137 100644
--- a/test/CodeGen/X86/fast-isel-select-cmov.ll
+++ b/test/CodeGen/X86/fast-isel-select-cmov.ll
@@ -6,21 +6,12 @@
 ; conditon input (argument or cmp). Currently i8 is not supported.
 
 define zeroext i16 @select_cmov_i16(i1 zeroext %cond, i16 zeroext %a, i16 zeroext %b) {
-; NOAVX512-LABEL: select_cmov_i16:
-; NOAVX512:       ## BB#0:
-; NOAVX512-NEXT:    testb $1, %dil
-; NOAVX512-NEXT:    cmovew %dx, %si
-; NOAVX512-NEXT:    movzwl %si, %eax
-; NOAVX512-NEXT:    retq
-;
-; AVX512-LABEL: select_cmov_i16:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    kmovw %edi, %k0
-; AVX512-NEXT:    kmovw %k0, %eax
-; AVX512-NEXT:    testb $1, %al
-; AVX512-NEXT:    cmovew %dx, %si
-; AVX512-NEXT:    movzwl %si, %eax
-; AVX512-NEXT:    retq
+; CHECK-LABEL: select_cmov_i16:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    testb $1, %dil
+; CHECK-NEXT:    cmovew %dx, %si
+; CHECK-NEXT:    movzwl %si, %eax
+; CHECK-NEXT:    retq
   %1 = select i1 %cond, i16 %a, i16 %b
   ret i16 %1
 }
@@ -38,21 +29,12 @@ define zeroext i16 @select_cmp_cmov_i16(i16 zeroext %a, i16 zeroext %b) {
 }
 
 define i32 @select_cmov_i32(i1 zeroext %cond, i32 %a, i32 %b) {
-; NOAVX512-LABEL: select_cmov_i32:
-; NOAVX512:       ## BB#0:
-; NOAVX512-NEXT:    testb $1, %dil
-; NOAVX512-NEXT:    cmovel %edx, %esi
-; NOAVX512-NEXT:    movl %esi, %eax
-; NOAVX512-NEXT:    retq
-;
-; AVX512-LABEL: select_cmov_i32:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    kmovw %edi, %k0
-; AVX512-NEXT:    kmovw %k0, %eax
-; AVX512-NEXT:    testb $1, %al
-; AVX512-NEXT:    cmovel %edx, %esi
-; AVX512-NEXT:    movl %esi, %eax
-; AVX512-NEXT:    retq
+; CHECK-LABEL: select_cmov_i32:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    testb $1, %dil
+; CHECK-NEXT:    cmovel %edx, %esi
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
   %1 = select i1 %cond, i32 %a, i32 %b
   ret i32 %1
 }
@@ -70,21 +52,12 @@ define i32 @select_cmp_cmov_i32(i32 %a, i32 %b) {
 }
 
 define i64 @select_cmov_i64(i1 zeroext %cond, i64 %a, i64 %b) {
-; NOAVX512-LABEL: select_cmov_i64:
-; NOAVX512:       ## BB#0:
-; NOAVX512-NEXT:    testb $1, %dil
-; NOAVX512-NEXT:    cmoveq %rdx, %rsi
-; NOAVX512-NEXT:    movq %rsi, %rax
-; NOAVX512-NEXT:    retq
-;
-; AVX512-LABEL: select_cmov_i64:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    kmovw %edi, %k0
-; AVX512-NEXT:    kmovw %k0, %eax
-; AVX512-NEXT:    testb $1, %al
-; AVX512-NEXT:    cmoveq %rdx, %rsi
-; AVX512-NEXT:    movq %rsi, %rax
-; AVX512-NEXT:    retq
+; CHECK-LABEL: select_cmov_i64:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    testb $1, %dil
+; CHECK-NEXT:    cmoveq %rdx, %rsi
+; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    retq
   %1 = select i1 %cond, i64 %a, i64 %b
   ret i64 %1
 }
diff --git a/test/CodeGen/X86/fast-isel-select-sse.ll b/test/CodeGen/X86/fast-isel-select-sse.ll
index 502260d03f5a0677db314f756f398c4346363b34..499fe5ba54a292a21d57998fbdb4461fdeb377de 100644
--- a/test/CodeGen/X86/fast-isel-select-sse.ll
+++ b/test/CodeGen/X86/fast-isel-select-sse.ll
@@ -1,10 +1,10 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown                                          | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -fast-isel -fast-isel-abort=1            | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown                               -mattr=avx | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -fast-isel -fast-isel-abort=1 -mattr=avx | FileCheck %s --check-prefix=AVX
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown                               -mattr=avx512f | FileCheck %s --check-prefix=AVX512
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -fast-isel -fast-isel-abort=1 -mattr=avx512f | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs                                          | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs -fast-isel -fast-isel-abort=1            | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs                               -mattr=avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs -fast-isel -fast-isel-abort=1 -mattr=avx | FileCheck %s --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs                               -mattr=avx512f | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -verify-machineinstrs -fast-isel -fast-isel-abort=1 -mattr=avx512f | FileCheck %s --check-prefix=AVX512
 
 ; Test all cmp predicates that can be used with SSE.
 
@@ -39,9 +39,9 @@ define double @select_fcmp_oeq_f64(double %a, double %b, double %c, double %d) {
 ; SSE-LABEL: select_fcmp_oeq_f64:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpeqsd %xmm1, %xmm0
-; SSE-NEXT:    andps %xmm0, %xmm2
-; SSE-NEXT:    andnps %xmm3, %xmm0
-; SSE-NEXT:    orps %xmm2, %xmm0
+; SSE-NEXT:    andpd %xmm0, %xmm2
+; SSE-NEXT:    andnpd %xmm3, %xmm0
+; SSE-NEXT:    orpd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: select_fcmp_oeq_f64:
@@ -94,10 +94,10 @@ define double @select_fcmp_ogt_f64(double %a, double %b, double %c, double %d) {
 ; SSE-LABEL: select_fcmp_ogt_f64:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpltsd %xmm0, %xmm1
-; SSE-NEXT:    andps %xmm1, %xmm2
-; SSE-NEXT:    andnps %xmm3, %xmm1
-; SSE-NEXT:    orps %xmm2, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    andpd %xmm1, %xmm2
+; SSE-NEXT:    andnpd %xmm3, %xmm1
+; SSE-NEXT:    orpd %xmm2, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: select_fcmp_ogt_f64:
@@ -150,10 +150,10 @@ define double @select_fcmp_oge_f64(double %a, double %b, double %c, double %d) {
 ; SSE-LABEL: select_fcmp_oge_f64:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmplesd %xmm0, %xmm1
-; SSE-NEXT:    andps %xmm1, %xmm2
-; SSE-NEXT:    andnps %xmm3, %xmm1
-; SSE-NEXT:    orps %xmm2, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    andpd %xmm1, %xmm2
+; SSE-NEXT:    andnpd %xmm3, %xmm1
+; SSE-NEXT:    orpd %xmm2, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: select_fcmp_oge_f64:
@@ -205,9 +205,9 @@ define double @select_fcmp_olt_f64(double %a, double %b, double %c, double %d) {
 ; SSE-LABEL: select_fcmp_olt_f64:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpltsd %xmm1, %xmm0
-; SSE-NEXT:    andps %xmm0, %xmm2
-; SSE-NEXT:    andnps %xmm3, %xmm0
-; SSE-NEXT:    orps %xmm2, %xmm0
+; SSE-NEXT:    andpd %xmm0, %xmm2
+; SSE-NEXT:    andnpd %xmm3, %xmm0
+; SSE-NEXT:    orpd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: select_fcmp_olt_f64:
@@ -259,9 +259,9 @@ define double @select_fcmp_ole_f64(double %a, double %b, double %c, double %d) {
 ; SSE-LABEL: select_fcmp_ole_f64:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmplesd %xmm1, %xmm0
-; SSE-NEXT:    andps %xmm0, %xmm2
-; SSE-NEXT:    andnps %xmm3, %xmm0
-; SSE-NEXT:    orps %xmm2, %xmm0
+; SSE-NEXT:    andpd %xmm0, %xmm2
+; SSE-NEXT:    andnpd %xmm3, %xmm0
+; SSE-NEXT:    orpd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: select_fcmp_ole_f64:
@@ -313,9 +313,9 @@ define double @select_fcmp_ord_f64(double %a, double %b, double %c, double %d) {
 ; SSE-LABEL: select_fcmp_ord_f64:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpordsd %xmm1, %xmm0
-; SSE-NEXT:    andps %xmm0, %xmm2
-; SSE-NEXT:    andnps %xmm3, %xmm0
-; SSE-NEXT:    orps %xmm2, %xmm0
+; SSE-NEXT:    andpd %xmm0, %xmm2
+; SSE-NEXT:    andnpd %xmm3, %xmm0
+; SSE-NEXT:    orpd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: select_fcmp_ord_f64:
@@ -367,9 +367,9 @@ define double @select_fcmp_uno_f64(double %a, double %b, double %c, double %d) {
 ; SSE-LABEL: select_fcmp_uno_f64:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpunordsd %xmm1, %xmm0
-; SSE-NEXT:    andps %xmm0, %xmm2
-; SSE-NEXT:    andnps %xmm3, %xmm0
-; SSE-NEXT:    orps %xmm2, %xmm0
+; SSE-NEXT:    andpd %xmm0, %xmm2
+; SSE-NEXT:    andnpd %xmm3, %xmm0
+; SSE-NEXT:    orpd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: select_fcmp_uno_f64:
@@ -421,9 +421,9 @@ define double @select_fcmp_ugt_f64(double %a, double %b, double %c, double %d) {
 ; SSE-LABEL: select_fcmp_ugt_f64:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpnlesd %xmm1, %xmm0
-; SSE-NEXT:    andps %xmm0, %xmm2
-; SSE-NEXT:    andnps %xmm3, %xmm0
-; SSE-NEXT:    orps %xmm2, %xmm0
+; SSE-NEXT:    andpd %xmm0, %xmm2
+; SSE-NEXT:    andnpd %xmm3, %xmm0
+; SSE-NEXT:    orpd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: select_fcmp_ugt_f64:
@@ -475,9 +475,9 @@ define double @select_fcmp_uge_f64(double %a, double %b, double %c, double %d) {
 ; SSE-LABEL: select_fcmp_uge_f64:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpnltsd %xmm1, %xmm0
-; SSE-NEXT:    andps %xmm0, %xmm2
-; SSE-NEXT:    andnps %xmm3, %xmm0
-; SSE-NEXT:    orps %xmm2, %xmm0
+; SSE-NEXT:    andpd %xmm0, %xmm2
+; SSE-NEXT:    andnpd %xmm3, %xmm0
+; SSE-NEXT:    orpd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: select_fcmp_uge_f64:
@@ -530,10 +530,10 @@ define double @select_fcmp_ult_f64(double %a, double %b, double %c, double %d) {
 ; SSE-LABEL: select_fcmp_ult_f64:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpnlesd %xmm0, %xmm1
-; SSE-NEXT:    andps %xmm1, %xmm2
-; SSE-NEXT:    andnps %xmm3, %xmm1
-; SSE-NEXT:    orps %xmm2, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    andpd %xmm1, %xmm2
+; SSE-NEXT:    andnpd %xmm3, %xmm1
+; SSE-NEXT:    orpd %xmm2, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: select_fcmp_ult_f64:
@@ -586,10 +586,10 @@ define double @select_fcmp_ule_f64(double %a, double %b, double %c, double %d) {
 ; SSE-LABEL: select_fcmp_ule_f64:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpnltsd %xmm0, %xmm1
-; SSE-NEXT:    andps %xmm1, %xmm2
-; SSE-NEXT:    andnps %xmm3, %xmm1
-; SSE-NEXT:    orps %xmm2, %xmm1
-; SSE-NEXT:    movaps %xmm1, %xmm0
+; SSE-NEXT:    andpd %xmm1, %xmm2
+; SSE-NEXT:    andnpd %xmm3, %xmm1
+; SSE-NEXT:    orpd %xmm2, %xmm1
+; SSE-NEXT:    movapd %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: select_fcmp_ule_f64:
@@ -641,9 +641,9 @@ define double @select_fcmp_une_f64(double %a, double %b, double %c, double %d) {
 ; SSE-LABEL: select_fcmp_une_f64:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpneqsd %xmm1, %xmm0
-; SSE-NEXT:    andps %xmm0, %xmm2
-; SSE-NEXT:    andnps %xmm3, %xmm0
-; SSE-NEXT:    orps %xmm2, %xmm0
+; SSE-NEXT:    andpd %xmm0, %xmm2
+; SSE-NEXT:    andnpd %xmm3, %xmm0
+; SSE-NEXT:    orpd %xmm2, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: select_fcmp_une_f64:
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
index 8288fd6f1a9afaa435a840b9fb3cb054356c31f3..3d5c12c03484f466f51474888f900534f335d0f7 100644
--- a/test/CodeGen/X86/fast-isel-x86-64.ll
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s -mattr=-avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort=1 | FileCheck %s
-; RUN: llc < %s -mattr=-avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-verbose 2>&1 >/dev/null | FileCheck %s --check-prefix=STDERR --allow-empty
+; RUN: llc < %s -mattr=-avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -pass-remarks-missed=isel 2>&1 >/dev/null | FileCheck %s --check-prefix=STDERR --allow-empty
 ; RUN: llc < %s -mattr=+avx -fast-isel -mcpu=core2 -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort=1 | FileCheck %s --check-prefix=AVX
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/test/CodeGen/X86/fast-isel-x86.ll b/test/CodeGen/X86/fast-isel-x86.ll
index 643b77638f4545070d040b6f67f7e62314cbf240..aa6d9b7cf056a397d7277454955489f07750dfb6 100644
--- a/test/CodeGen/X86/fast-isel-x86.ll
+++ b/test/CodeGen/X86/fast-isel-x86.ll
@@ -1,5 +1,5 @@
 ; RUN: llc -fast-isel -O0 -mcpu=generic -mtriple=i386-apple-darwin10 -relocation-model=pic < %s | FileCheck %s
-; RUN: llc -fast-isel -O0 -mcpu=generic -mtriple=i386-apple-darwin10 -relocation-model=pic < %s -fast-isel-verbose 2>&1 >/dev/null | FileCheck -check-prefix=STDERR -allow-empty %s
+; RUN: llc -fast-isel -O0 -mcpu=generic -mtriple=i386-apple-darwin10 -relocation-model=pic < %s -pass-remarks-missed=isel 2>&1 >/dev/null | FileCheck -check-prefix=STDERR -allow-empty %s
 
 ; This should use flds to set the return value.
 ; CHECK-LABEL: test0:
diff --git a/test/CodeGen/X86/fast-isel.ll b/test/CodeGen/X86/fast-isel.ll
index 36183e48c29987995b2c34d1ef852c7d9f802bc0..375814c8afcd0df568d95e0a2dd6c29a47158ede 100644
--- a/test/CodeGen/X86/fast-isel.ll
+++ b/test/CodeGen/X86/fast-isel.ll
@@ -107,12 +107,12 @@ define void @crash_test1() nounwind ssp {
   ret void
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
 
 define i64* @life() nounwind {
   %a1 = alloca i64*, align 8
   %a2 = bitcast i64** %a1 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %a2) nounwind      
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %a2) nounwind      
   %a3 = load i64*, i64** %a1, align 8
   ret i64* %a3
 }
diff --git a/test/CodeGen/X86/fentry-insertion.ll b/test/CodeGen/X86/fentry-insertion.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a585d96b209c1e003fd430ea2a7f09d0d2be836d
--- /dev/null
+++ b/test/CodeGen/X86/fentry-insertion.ll
@@ -0,0 +1,16 @@
+; RUN: llc %s -o - | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @test1() #0 {
+entry:
+  ret void
+
+; CHECK-LABEL: @test1
+; CHECK: callq __fentry__
+; CHECK-NOT: mcount
+; CHECK: retq
+}
+
+attributes #0 = { "fentry-call"="true" }
+
diff --git a/test/CodeGen/X86/file-source-filename.ll b/test/CodeGen/X86/file-source-filename.ll
new file mode 100644
index 0000000000000000000000000000000000000000..146da9e16c959f1b61bafcd462670dadf3674050
--- /dev/null
+++ b/test/CodeGen/X86/file-source-filename.ll
@@ -0,0 +1,4 @@
+; RUN: llc -mtriple=x86_64-linux-gnu < %s | FileCheck %s
+; CHECK: .file "foobar"
+
+source_filename = "foobar"
diff --git a/test/CodeGen/X86/fma-fneg-combine.ll b/test/CodeGen/X86/fma-fneg-combine.ll
index 5329f5b216a416f7afcfcf08c9dc9856f218d192..bb332f7282a8e16739661c361bdaafd5c6a2e170 100644
--- a/test/CodeGen/X86/fma-fneg-combine.ll
+++ b/test/CodeGen/X86/fma-fneg-combine.ll
@@ -127,7 +127,7 @@ define <2 x double> @test10(<2 x double> %a, <2 x double> %b, <2 x double> %c) {
 ; CHECK-LABEL: test10:
 ; CHECK:       # BB#0: # %entry
 ; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0
-; CHECK-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
+; CHECK-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
 ; CHECK-NEXT:    retq
 entry:
   %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 -1, i32 4) #2
@@ -142,7 +142,7 @@ define <4 x float> @test11(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 ze
 ; SKX:       # BB#0: # %entry
 ; SKX-NEXT:    vxorps {{.*}}(%rip){1to4}, %xmm2, %xmm0
 ; SKX-NEXT:    andl $1, %edi
-; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vfmadd231ss %xmm1, %xmm1, %xmm0 {%k1}
 ; SKX-NEXT:    retq
 ;
@@ -165,7 +165,7 @@ declare <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float>, <4 x float>, <
 define <8 x double> @test12(<8 x double> %a, <8 x double> %b, <8 x double> %c, i8 %mask) {
 ; SKX-LABEL: test12:
 ; SKX:       # BB#0: # %entry
-; SKX-NEXT:    kmovb %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vfmadd132pd %zmm1, %zmm2, %zmm0 {%k1}
 ; SKX-NEXT:    vxorpd {{.*}}(%rip){1to8}, %zmm0, %zmm0
 ; SKX-NEXT:    retq
@@ -183,13 +183,21 @@ entry:
 }
 
 define <2 x double> @test13(<2 x double> %a, <2 x double> %b, <2 x double> %c, i8 %mask) {
-; CHECK-LABEL: test13:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    vxorps {{.*}}(%rip), %xmm0, %xmm0
-; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    kmovw %edi, %k1
-; CHECK-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1}
-; CHECK-NEXT:    retq
+; SKX-LABEL: test13:
+; SKX:       # BB#0: # %entry
+; SKX-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; SKX-NEXT:    andl $1, %edi
+; SKX-NEXT:    kmovd %edi, %k1
+; SKX-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1}
+; SKX-NEXT:    retq
+;
+; KNL-LABEL: test13:
+; KNL:       # BB#0: # %entry
+; KNL-NEXT:    vxorpd {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT:    andl $1, %edi
+; KNL-NEXT:    kmovw %edi, %k1
+; KNL-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 {%k1}
+; KNL-NEXT:    retq
 entry:
   %sub.i = fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %a
   %0 = tail call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %sub.i, <2 x double> %b, <2 x double> %c, i8 %mask, i32 4)
@@ -199,7 +207,7 @@ entry:
 define <16 x float> @test14(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask) {
 ; SKX-LABEL: test14:
 ; SKX:       # BB#0: # %entry
-; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vfnmsub132ps {ru-sae}, %zmm1, %zmm2, %zmm0 {%k1}
 ; SKX-NEXT:    vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm0
 ; SKX-NEXT:    retq
@@ -219,7 +227,7 @@ entry:
 define <16 x float> @test15(<16 x float> %a, <16 x float> %b, <16 x float> %c, i16 %mask)  {
 ; SKX-LABEL: test15:
 ; SKX:       # BB#0: # %entry
-; SKX-NEXT:    kmovw %edi, %k1
+; SKX-NEXT:    kmovd %edi, %k1
 ; SKX-NEXT:    vxorps {{.*}}(%rip){1to16}, %zmm0, %zmm3
 ; SKX-NEXT:    vfnmadd213ps {ru-sae}, %zmm2, %zmm0, %zmm1
 ; SKX-NEXT:    vmovaps %zmm1, %zmm3 {%k1}
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index b91479cda871508dc09df11284cad7b97afa95bb..2c942347d54c1abe2415f82f8c449098479554f3 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -1,47 +1,413 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10  -mattr=+fma,-fma4  | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=i386-apple-darwin10  -mattr=-fma,-fma4  | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-CALL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10  -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-CALL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10  -mattr=+avx512f,-fma,-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma4  | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-FMA-CALL
-
-; CHECK-LABEL: test_f32:
-; CHECK-FMA-INST: vfmadd213ss
-; CHECK-FMA-CALL: fmaf
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-apple-darwin10  -mattr=+avx,+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=FMA32
+; RUN: llc < %s -mtriple=i386-apple-darwin10  -mattr=+avx,-fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=FMACALL32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=FMA64
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10  -mattr=-fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=FMACALL64
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10  -mattr=+avx512f,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=AVX512 --check-prefix=AVX51264
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10  -mattr=+avx512vl,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=bdver2 -mattr=-fma4 -show-mc-encoding | FileCheck %s --check-prefix=FMA32
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mcpu=bdver2 -mattr=-fma,-fma4 -show-mc-encoding | FileCheck %s --check-prefix=FMACALL32
+
 define float @test_f32(float %a, float %b, float %c) #0 {
+; FMA32-LABEL: test_f32:
+; FMA32:       ## BB#0: ## %entry
+; FMA32-NEXT:    pushl %eax ## encoding: [0x50]
+; FMA32-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08]
+; FMA32-NEXT:    ## xmm0 = mem[0],zero,zero,zero
+; FMA32-NEXT:    vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
+; FMA32-NEXT:    ## xmm1 = mem[0],zero,zero,zero
+; FMA32-NEXT:    vfmadd213ss {{[0-9]+}}(%esp), %xmm0, %xmm1 ## encoding: [0xc4,0xe2,0x79,0xa9,0x4c,0x24,0x10]
+; FMA32-NEXT:    vmovss %xmm1, (%esp) ## encoding: [0xc5,0xfa,0x11,0x0c,0x24]
+; FMA32-NEXT:    flds (%esp) ## encoding: [0xd9,0x04,0x24]
+; FMA32-NEXT:    popl %eax ## encoding: [0x58]
+; FMA32-NEXT:    retl ## encoding: [0xc3]
+;
+; FMACALL32-LABEL: test_f32:
+; FMACALL32:       ## BB#0: ## %entry
+; FMACALL32-NEXT:    jmp _fmaf ## TAILCALL
+; FMACALL32-NEXT:    ## encoding: [0xeb,A]
+; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmaf-1, kind: FK_PCRel_1
+;
+; FMA64-LABEL: test_f32:
+; FMA64:       ## BB#0: ## %entry
+; FMA64-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
+; FMA64-NEXT:    retq ## encoding: [0xc3]
+;
+; FMACALL64-LABEL: test_f32:
+; FMACALL64:       ## BB#0: ## %entry
+; FMACALL64-NEXT:    jmp _fmaf ## TAILCALL
+; FMACALL64-NEXT:    ## encoding: [0xeb,A]
+; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmaf-1, kind: FK_PCRel_1
+;
+; AVX512-LABEL: test_f32:
+; AVX512:       ## BB#0: ## %entry
+; AVX512-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
+; AVX512-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_f32:
+; AVX512VL:       ## BB#0: ## %entry
+; AVX512VL-NEXT:    vfmadd213ss %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa9,0xc2]
+; AVX512VL-NEXT:    retq ## encoding: [0xc3]
 entry:
   %call = call float @llvm.fma.f32(float %a, float %b, float %c)
   ret float %call
 }
 
-; CHECK-LABEL: test_f64:
-; CHECK-FMA-INST: vfmadd213sd
-; CHECK-FMA-CALL: fma
 define double @test_f64(double %a, double %b, double %c) #0 {
+; FMA32-LABEL: test_f64:
+; FMA32:       ## BB#0: ## %entry
+; FMA32-NEXT:    subl $12, %esp ## encoding: [0x83,0xec,0x0c]
+; FMA32-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x10]
+; FMA32-NEXT:    ## xmm0 = mem[0],zero
+; FMA32-NEXT:    vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x18]
+; FMA32-NEXT:    ## xmm1 = mem[0],zero
+; FMA32-NEXT:    vfmadd213sd {{[0-9]+}}(%esp), %xmm0, %xmm1 ## encoding: [0xc4,0xe2,0xf9,0xa9,0x4c,0x24,0x20]
+; FMA32-NEXT:    vmovsd %xmm1, (%esp) ## encoding: [0xc5,0xfb,0x11,0x0c,0x24]
+; FMA32-NEXT:    fldl (%esp) ## encoding: [0xdd,0x04,0x24]
+; FMA32-NEXT:    addl $12, %esp ## encoding: [0x83,0xc4,0x0c]
+; FMA32-NEXT:    retl ## encoding: [0xc3]
+;
+; FMACALL32-LABEL: test_f64:
+; FMACALL32:       ## BB#0: ## %entry
+; FMACALL32-NEXT:    jmp _fma ## TAILCALL
+; FMACALL32-NEXT:    ## encoding: [0xeb,A]
+; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fma-1, kind: FK_PCRel_1
+;
+; FMA64-LABEL: test_f64:
+; FMA64:       ## BB#0: ## %entry
+; FMA64-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
+; FMA64-NEXT:    retq ## encoding: [0xc3]
+;
+; FMACALL64-LABEL: test_f64:
+; FMACALL64:       ## BB#0: ## %entry
+; FMACALL64-NEXT:    jmp _fma ## TAILCALL
+; FMACALL64-NEXT:    ## encoding: [0xeb,A]
+; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fma-1, kind: FK_PCRel_1
+;
+; AVX512-LABEL: test_f64:
+; AVX512:       ## BB#0: ## %entry
+; AVX512-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
+; AVX512-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_f64:
+; AVX512VL:       ## BB#0: ## %entry
+; AVX512VL-NEXT:    vfmadd213sd %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa9,0xc2]
+; AVX512VL-NEXT:    retq ## encoding: [0xc3]
 entry:
   %call = call double @llvm.fma.f64(double %a, double %b, double %c)
   ret double %call
 }
 
-; CHECK-LABEL: test_f80:
-; CHECK: fmal
 define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) #0 {
+; FMA32-LABEL: test_f80:
+; FMA32:       ## BB#0: ## %entry
+; FMA32-NEXT:    subl $60, %esp ## encoding: [0x83,0xec,0x3c]
+; FMA32-NEXT:    fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x40]
+; FMA32-NEXT:    fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x50]
+; FMA32-NEXT:    fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x60]
+; FMA32-NEXT:    fstpt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x7c,0x24,0x20]
+; FMA32-NEXT:    fstpt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x7c,0x24,0x10]
+; FMA32-NEXT:    fstpt (%esp) ## encoding: [0xdb,0x3c,0x24]
+; FMA32-NEXT:    calll _fmal ## encoding: [0xe8,A,A,A,A]
+; FMA32-NEXT:    ## fixup A - offset: 1, value: _fmal-4, kind: FK_PCRel_4
+; FMA32-NEXT:    addl $60, %esp ## encoding: [0x83,0xc4,0x3c]
+; FMA32-NEXT:    retl ## encoding: [0xc3]
+;
+; FMACALL32-LABEL: test_f80:
+; FMACALL32:       ## BB#0: ## %entry
+; FMACALL32-NEXT:    subl $60, %esp ## encoding: [0x83,0xec,0x3c]
+; FMACALL32-NEXT:    fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x40]
+; FMACALL32-NEXT:    fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x50]
+; FMACALL32-NEXT:    fldt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x6c,0x24,0x60]
+; FMACALL32-NEXT:    fstpt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x7c,0x24,0x20]
+; FMACALL32-NEXT:    fstpt {{[0-9]+}}(%esp) ## encoding: [0xdb,0x7c,0x24,0x10]
+; FMACALL32-NEXT:    fstpt (%esp) ## encoding: [0xdb,0x3c,0x24]
+; FMACALL32-NEXT:    calll _fmal ## encoding: [0xe8,A,A,A,A]
+; FMACALL32-NEXT:    ## fixup A - offset: 1, value: _fmal-4, kind: FK_PCRel_4
+; FMACALL32-NEXT:    addl $60, %esp ## encoding: [0x83,0xc4,0x3c]
+; FMACALL32-NEXT:    retl ## encoding: [0xc3]
+;
+; FMA64-LABEL: test_f80:
+; FMA64:       ## BB#0: ## %entry
+; FMA64-NEXT:    subq $56, %rsp ## encoding: [0x48,0x83,0xec,0x38]
+; FMA64-NEXT:    fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x40]
+; FMA64-NEXT:    fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x50]
+; FMA64-NEXT:    fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x60]
+; FMA64-NEXT:    fstpt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x7c,0x24,0x20]
+; FMA64-NEXT:    fstpt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x7c,0x24,0x10]
+; FMA64-NEXT:    fstpt (%rsp) ## encoding: [0xdb,0x3c,0x24]
+; FMA64-NEXT:    callq _fmal ## encoding: [0xe8,A,A,A,A]
+; FMA64-NEXT:    ## fixup A - offset: 1, value: _fmal-4, kind: FK_PCRel_4
+; FMA64-NEXT:    addq $56, %rsp ## encoding: [0x48,0x83,0xc4,0x38]
+; FMA64-NEXT:    retq ## encoding: [0xc3]
+;
+; FMACALL64-LABEL: test_f80:
+; FMACALL64:       ## BB#0: ## %entry
+; FMACALL64-NEXT:    subq $56, %rsp ## encoding: [0x48,0x83,0xec,0x38]
+; FMACALL64-NEXT:    fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x40]
+; FMACALL64-NEXT:    fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x50]
+; FMACALL64-NEXT:    fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x60]
+; FMACALL64-NEXT:    fstpt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x7c,0x24,0x20]
+; FMACALL64-NEXT:    fstpt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x7c,0x24,0x10]
+; FMACALL64-NEXT:    fstpt (%rsp) ## encoding: [0xdb,0x3c,0x24]
+; FMACALL64-NEXT:    callq _fmal ## encoding: [0xe8,A,A,A,A]
+; FMACALL64-NEXT:    ## fixup A - offset: 1, value: _fmal-4, kind: FK_PCRel_4
+; FMACALL64-NEXT:    addq $56, %rsp ## encoding: [0x48,0x83,0xc4,0x38]
+; FMACALL64-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX512-LABEL: test_f80:
+; AVX512:       ## BB#0: ## %entry
+; AVX512-NEXT:    subq $56, %rsp ## encoding: [0x48,0x83,0xec,0x38]
+; AVX512-NEXT:    fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x40]
+; AVX512-NEXT:    fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x50]
+; AVX512-NEXT:    fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x60]
+; AVX512-NEXT:    fstpt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x7c,0x24,0x20]
+; AVX512-NEXT:    fstpt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x7c,0x24,0x10]
+; AVX512-NEXT:    fstpt (%rsp) ## encoding: [0xdb,0x3c,0x24]
+; AVX512-NEXT:    callq _fmal ## encoding: [0xe8,A,A,A,A]
+; AVX512-NEXT:    ## fixup A - offset: 1, value: _fmal-4, kind: FK_PCRel_4
+; AVX512-NEXT:    addq $56, %rsp ## encoding: [0x48,0x83,0xc4,0x38]
+; AVX512-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_f80:
+; AVX512VL:       ## BB#0: ## %entry
+; AVX512VL-NEXT:    subq $56, %rsp ## encoding: [0x48,0x83,0xec,0x38]
+; AVX512VL-NEXT:    fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x40]
+; AVX512VL-NEXT:    fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x50]
+; AVX512VL-NEXT:    fldt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x6c,0x24,0x60]
+; AVX512VL-NEXT:    fstpt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x7c,0x24,0x20]
+; AVX512VL-NEXT:    fstpt {{[0-9]+}}(%rsp) ## encoding: [0xdb,0x7c,0x24,0x10]
+; AVX512VL-NEXT:    fstpt (%rsp) ## encoding: [0xdb,0x3c,0x24]
+; AVX512VL-NEXT:    callq _fmal ## encoding: [0xe8,A,A,A,A]
+; AVX512VL-NEXT:    ## fixup A - offset: 1, value: _fmal-4, kind: FK_PCRel_4
+; AVX512VL-NEXT:    addq $56, %rsp ## encoding: [0x48,0x83,0xc4,0x38]
+; AVX512VL-NEXT:    retq ## encoding: [0xc3]
 entry:
   %call = call x86_fp80 @llvm.fma.f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c)
   ret x86_fp80 %call
 }
 
-; CHECK-LABEL: test_f32_cst:
-; CHECK-NOT: vfmadd
 define float @test_f32_cst() #0 {
+; FMA32-LABEL: test_f32_cst:
+; FMA32:       ## BB#0: ## %entry
+; FMA32-NEXT:    flds LCPI3_0 ## encoding: [0xd9,0x05,A,A,A,A]
+; FMA32-NEXT:    ## fixup A - offset: 2, value: LCPI3_0, kind: FK_Data_4
+; FMA32-NEXT:    retl ## encoding: [0xc3]
+;
+; FMACALL32-LABEL: test_f32_cst:
+; FMACALL32:       ## BB#0: ## %entry
+; FMACALL32-NEXT:    flds LCPI3_0 ## encoding: [0xd9,0x05,A,A,A,A]
+; FMACALL32-NEXT:    ## fixup A - offset: 2, value: LCPI3_0, kind: FK_Data_4
+; FMACALL32-NEXT:    retl ## encoding: [0xc3]
+;
+; FMA64-LABEL: test_f32_cst:
+; FMA64:       ## BB#0: ## %entry
+; FMA64-NEXT:    vmovss {{.*}}(%rip), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A]
+; FMA64-NEXT:    ## fixup A - offset: 4, value: LCPI3_0-4, kind: reloc_riprel_4byte
+; FMA64-NEXT:    ## xmm0 = mem[0],zero,zero,zero
+; FMA64-NEXT:    retq ## encoding: [0xc3]
+;
+; FMACALL64-LABEL: test_f32_cst:
+; FMACALL64:       ## BB#0: ## %entry
+; FMACALL64-NEXT:    movss {{.*}}(%rip), %xmm0 ## encoding: [0xf3,0x0f,0x10,0x05,A,A,A,A]
+; FMACALL64-NEXT:    ## fixup A - offset: 4, value: LCPI3_0-4, kind: reloc_riprel_4byte
+; FMACALL64-NEXT:    ## xmm0 = mem[0],zero,zero,zero
+; FMACALL64-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX512-LABEL: test_f32_cst:
+; AVX512:       ## BB#0: ## %entry
+; AVX512-NEXT:    vmovss {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A]
+; AVX512-NEXT:    ## fixup A - offset: 4, value: LCPI3_0-4, kind: reloc_riprel_4byte
+; AVX512-NEXT:    ## xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_f32_cst:
+; AVX512VL:       ## BB#0: ## %entry
+; AVX512VL-NEXT:    vmovss {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A]
+; AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI3_0-4, kind: reloc_riprel_4byte
+; AVX512VL-NEXT:    ## xmm0 = mem[0],zero,zero,zero
+; AVX512VL-NEXT:    retq ## encoding: [0xc3]
 entry:
   %call = call float @llvm.fma.f32(float 3.0, float 3.0, float 3.0)
   ret float %call
 }
 
+define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) #0 {
+; FMA32-LABEL: test_v4f32:
+; FMA32:       ## BB#0: ## %entry
+; FMA32-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
+; FMA32-NEXT:    retl ## encoding: [0xc3]
+;
+; FMA64-LABEL: test_v4f32:
+; FMA64:       ## BB#0: ## %entry
+; FMA64-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
+; FMA64-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX512-LABEL: test_v4f32:
+; AVX512:       ## BB#0: ## %entry
+; AVX512-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
+; AVX512-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_v4f32:
+; AVX512VL:       ## BB#0: ## %entry
+; AVX512VL-NEXT:    vfmadd213ps %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0xa8,0xc2]
+; AVX512VL-NEXT:    retq ## encoding: [0xc3]
+entry:
+  %call = call <4 x float> @llvm.fma.v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c)
+  ret <4 x float> %call
+}
+
+define <8 x float> @test_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) #0 {
+; FMA32-LABEL: test_v8f32:
+; FMA32:       ## BB#0: ## %entry
+; FMA32-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
+; FMA32-NEXT:    retl ## encoding: [0xc3]
+;
+; FMA64-LABEL: test_v8f32:
+; FMA64:       ## BB#0: ## %entry
+; FMA64-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
+; FMA64-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX512-LABEL: test_v8f32:
+; AVX512:       ## BB#0: ## %entry
+; AVX512-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
+; AVX512-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_v8f32:
+; AVX512VL:       ## BB#0: ## %entry
+; AVX512VL-NEXT:    vfmadd213ps %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x75,0xa8,0xc2]
+; AVX512VL-NEXT:    retq ## encoding: [0xc3]
+entry:
+  %call = call <8 x float> @llvm.fma.v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c)
+  ret <8 x float> %call
+}
+
+define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c) #0 {
+; FMA32-LABEL: test_v16f32:
+; FMA32:       ## BB#0: ## %entry
+; FMA32-NEXT:    pushl %ebp ## encoding: [0x55]
+; FMA32-NEXT:    movl %esp, %ebp ## encoding: [0x89,0xe5]
+; FMA32-NEXT:    andl $-32, %esp ## encoding: [0x83,0xe4,0xe0]
+; FMA32-NEXT:    subl $32, %esp ## encoding: [0x83,0xec,0x20]
+; FMA32-NEXT:    vfmadd213ps 8(%ebp), %ymm2, %ymm0 ## encoding: [0xc4,0xe2,0x6d,0xa8,0x45,0x08]
+; FMA32-NEXT:    vfmadd213ps 40(%ebp), %ymm3, %ymm1 ## encoding: [0xc4,0xe2,0x65,0xa8,0x4d,0x28]
+; FMA32-NEXT:    movl %ebp, %esp ## encoding: [0x89,0xec]
+; FMA32-NEXT:    popl %ebp ## encoding: [0x5d]
+; FMA32-NEXT:    retl ## encoding: [0xc3]
+;
+; FMA64-LABEL: test_v16f32:
+; FMA64:       ## BB#0: ## %entry
+; FMA64-NEXT:    vfmadd213ps %ymm4, %ymm2, %ymm0 ## encoding: [0xc4,0xe2,0x6d,0xa8,0xc4]
+; FMA64-NEXT:    vfmadd213ps %ymm5, %ymm3, %ymm1 ## encoding: [0xc4,0xe2,0x65,0xa8,0xcd]
+; FMA64-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX512-LABEL: test_v16f32:
+; AVX512:       ## BB#0: ## %entry
+; AVX512-NEXT:    vfmadd213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xa8,0xc2]
+; AVX512-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_v16f32:
+; AVX512VL:       ## BB#0: ## %entry
+; AVX512VL-NEXT:    vfmadd213ps %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0x75,0x48,0xa8,0xc2]
+; AVX512VL-NEXT:    retq ## encoding: [0xc3]
+entry:
+  %call = call <16 x float> @llvm.fma.v16f32(<16 x float> %a, <16 x float> %b, <16 x float> %c)
+  ret <16 x float> %call
+}
+
+define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c) #0 {
+; FMA32-LABEL: test_v2f64:
+; FMA32:       ## BB#0: ## %entry
+; FMA32-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
+; FMA32-NEXT:    retl ## encoding: [0xc3]
+;
+; FMA64-LABEL: test_v2f64:
+; FMA64:       ## BB#0: ## %entry
+; FMA64-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
+; FMA64-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX512-LABEL: test_v2f64:
+; AVX512:       ## BB#0: ## %entry
+; AVX512-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
+; AVX512-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_v2f64:
+; AVX512VL:       ## BB#0: ## %entry
+; AVX512VL-NEXT:    vfmadd213pd %xmm2, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf1,0xa8,0xc2]
+; AVX512VL-NEXT:    retq ## encoding: [0xc3]
+entry:
+  %call = call <2 x double> @llvm.fma.v2f64(<2 x double> %a, <2 x double> %b, <2 x double> %c)
+  ret <2 x double> %call
+}
+
+define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c) #0 {
+; FMA32-LABEL: test_v4f64:
+; FMA32:       ## BB#0: ## %entry
+; FMA32-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
+; FMA32-NEXT:    retl ## encoding: [0xc3]
+;
+; FMA64-LABEL: test_v4f64:
+; FMA64:       ## BB#0: ## %entry
+; FMA64-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
+; FMA64-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX512-LABEL: test_v4f64:
+; AVX512:       ## BB#0: ## %entry
+; AVX512-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 ## encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
+; AVX512-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_v4f64:
+; AVX512VL:       ## BB#0: ## %entry
+; AVX512VL-NEXT:    vfmadd213pd %ymm2, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf5,0xa8,0xc2]
+; AVX512VL-NEXT:    retq ## encoding: [0xc3]
+entry:
+  %call = call <4 x double> @llvm.fma.v4f64(<4 x double> %a, <4 x double> %b, <4 x double> %c)
+  ret <4 x double> %call
+}
+
+define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c) #0 {
+; FMA32-LABEL: test_v8f64:
+; FMA32:       ## BB#0: ## %entry
+; FMA32-NEXT:    pushl %ebp ## encoding: [0x55]
+; FMA32-NEXT:    movl %esp, %ebp ## encoding: [0x89,0xe5]
+; FMA32-NEXT:    andl $-32, %esp ## encoding: [0x83,0xe4,0xe0]
+; FMA32-NEXT:    subl $32, %esp ## encoding: [0x83,0xec,0x20]
+; FMA32-NEXT:    vfmadd213pd 8(%ebp), %ymm2, %ymm0 ## encoding: [0xc4,0xe2,0xed,0xa8,0x45,0x08]
+; FMA32-NEXT:    vfmadd213pd 40(%ebp), %ymm3, %ymm1 ## encoding: [0xc4,0xe2,0xe5,0xa8,0x4d,0x28]
+; FMA32-NEXT:    movl %ebp, %esp ## encoding: [0x89,0xec]
+; FMA32-NEXT:    popl %ebp ## encoding: [0x5d]
+; FMA32-NEXT:    retl ## encoding: [0xc3]
+;
+; FMA64-LABEL: test_v8f64:
+; FMA64:       ## BB#0: ## %entry
+; FMA64-NEXT:    vfmadd213pd %ymm4, %ymm2, %ymm0 ## encoding: [0xc4,0xe2,0xed,0xa8,0xc4]
+; FMA64-NEXT:    vfmadd213pd %ymm5, %ymm3, %ymm1 ## encoding: [0xc4,0xe2,0xe5,0xa8,0xcd]
+; FMA64-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX512-LABEL: test_v8f64:
+; AVX512:       ## BB#0: ## %entry
+; AVX512-NEXT:    vfmadd213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xa8,0xc2]
+; AVX512-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_v8f64:
+; AVX512VL:       ## BB#0: ## %entry
+; AVX512VL-NEXT:    vfmadd213pd %zmm2, %zmm1, %zmm0 ## encoding: [0x62,0xf2,0xf5,0x48,0xa8,0xc2]
+; AVX512VL-NEXT:    retq ## encoding: [0xc3]
+entry:
+  %call = call <8 x double> @llvm.fma.v8f64(<8 x double> %a, <8 x double> %b, <8 x double> %c)
+  ret <8 x double> %call
+}
+
 declare float @llvm.fma.f32(float, float, float)
 declare double @llvm.fma.f64(double, double, double)
 declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80)
 
+declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>)
+declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>)
+declare <16 x float> @llvm.fma.v16f32(<16 x float>, <16 x float>, <16 x float>)
+
+declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>)
+declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>)
+declare <8 x double> @llvm.fma.v8f64(<8 x double>, <8 x double>, <8 x double>)
+
 attributes #0 = { nounwind }
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index 2554b0201a66f57742b83c071060d2060ef8acb9..002b0746d3c35baf0dc1b9fc5b89c6119d9b421d 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -1483,7 +1483,7 @@ define double @test_f64_fneg_fmul(double %x, double %y) #0 {
 ;
 ; AVX512-LABEL: test_f64_fneg_fmul:
 ; AVX512:       # BB#0:
-; AVX512-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX512-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
 ; AVX512-NEXT:    vfnmsub213sd %xmm2, %xmm1, %xmm0
 ; AVX512-NEXT:    retq
   %m = fmul nsz double %x, %y
diff --git a/test/CodeGen/X86/fold-vector-sext-zext.ll b/test/CodeGen/X86/fold-vector-sext-zext.ll
index 3f502efa753e8c57eb4e0f124b6080066ef023db..575bd5897e47a9032c4623799f4a267cb64348ec 100644
--- a/test/CodeGen/X86/fold-vector-sext-zext.ll
+++ b/test/CodeGen/X86/fold-vector-sext-zext.ll
@@ -245,9 +245,8 @@ define <4 x i32> @test_zext_4i8_4i32() {
 define <4 x i64> @test_zext_4i8_4i64() {
 ; X32-LABEL: test_zext_4i8_4i64:
 ; X32:       # BB#0:
-; X32-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X32-NEXT:    vpmovzxbq {{.*#+}} xmm1 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; X32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = [0,0,255,0]
+; X32-NEXT:    vinsertf128 $1, {{\.LCPI.*}}, %ymm0, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_zext_4i8_4i64:
@@ -301,11 +300,9 @@ define <4 x i32> @test_zext_4i8_4i32_undef() {
 define <4 x i64> @test_zext_4i8_4i64_undef() {
 ; X32-LABEL: test_zext_4i8_4i64_undef:
 ; X32:       # BB#0:
-; X32-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; X32-NEXT:    vpmovzxbq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,zero,zero,zero
+; X32-NEXT:    vmovaps {{.*#+}} xmm0 = <u,u,255,0>
 ; X32-NEXT:    movl $2, %eax
 ; X32-NEXT:    vmovd %eax, %xmm1
-; X32-NEXT:    vpmovzxbq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,zero,zero,zero,zero,xmm1[1],zero,zero,zero,zero,zero,zero,zero
 ; X32-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X32-NEXT:    retl
 ;
diff --git a/test/CodeGen/X86/fp-select-cmp-and.ll b/test/CodeGen/X86/fp-select-cmp-and.ll
index c9c8922c97f3ce27b65f06ec900c2225d756e4c9..e012809cf480be383aead91af21c0a7ce90ed845 100644
--- a/test/CodeGen/X86/fp-select-cmp-and.ll
+++ b/test/CodeGen/X86/fp-select-cmp-and.ll
@@ -5,7 +5,7 @@ define double @test1(double %a, double %b, double %eps) {
 ; CHECK-LABEL: test1:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    cmpltsd %xmm2, %xmm0
-; CHECK-NEXT:    andps %xmm1, %xmm0
+; CHECK-NEXT:    andpd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 ;
   %cmp = fcmp olt double %a, %eps
@@ -17,7 +17,7 @@ define double @test2(double %a, double %b, double %eps) {
 ; CHECK-LABEL: test2:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    cmplesd %xmm2, %xmm0
-; CHECK-NEXT:    andps %xmm1, %xmm0
+; CHECK-NEXT:    andpd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 ;
   %cmp = fcmp ole double %a, %eps
@@ -29,8 +29,8 @@ define double @test3(double %a, double %b, double %eps) {
 ; CHECK-LABEL: test3:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    cmpltsd %xmm0, %xmm2
-; CHECK-NEXT:    andps %xmm1, %xmm2
-; CHECK-NEXT:    movaps %xmm2, %xmm0
+; CHECK-NEXT:    andpd %xmm1, %xmm2
+; CHECK-NEXT:    movapd %xmm2, %xmm0
 ; CHECK-NEXT:    retq
 ;
   %cmp = fcmp ogt double %a, %eps
@@ -42,8 +42,8 @@ define double @test4(double %a, double %b, double %eps) {
 ; CHECK-LABEL: test4:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    cmplesd %xmm0, %xmm2
-; CHECK-NEXT:    andps %xmm1, %xmm2
-; CHECK-NEXT:    movaps %xmm2, %xmm0
+; CHECK-NEXT:    andpd %xmm1, %xmm2
+; CHECK-NEXT:    movapd %xmm2, %xmm0
 ; CHECK-NEXT:    retq
 ;
   %cmp = fcmp oge double %a, %eps
@@ -55,7 +55,7 @@ define double @test5(double %a, double %b, double %eps) {
 ; CHECK-LABEL: test5:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    cmpltsd %xmm2, %xmm0
-; CHECK-NEXT:    andnps %xmm1, %xmm0
+; CHECK-NEXT:    andnpd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 ;
   %cmp = fcmp olt double %a, %eps
@@ -67,7 +67,7 @@ define double @test6(double %a, double %b, double %eps) {
 ; CHECK-LABEL: test6:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    cmplesd %xmm2, %xmm0
-; CHECK-NEXT:    andnps %xmm1, %xmm0
+; CHECK-NEXT:    andnpd %xmm1, %xmm0
 ; CHECK-NEXT:    retq
 ;
   %cmp = fcmp ole double %a, %eps
@@ -79,8 +79,8 @@ define double @test7(double %a, double %b, double %eps) {
 ; CHECK-LABEL: test7:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    cmpltsd %xmm0, %xmm2
-; CHECK-NEXT:    andnps %xmm1, %xmm2
-; CHECK-NEXT:    movaps %xmm2, %xmm0
+; CHECK-NEXT:    andnpd %xmm1, %xmm2
+; CHECK-NEXT:    movapd %xmm2, %xmm0
 ; CHECK-NEXT:    retq
 ;
   %cmp = fcmp ogt double %a, %eps
@@ -92,8 +92,8 @@ define double @test8(double %a, double %b, double %eps) {
 ; CHECK-LABEL: test8:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    cmplesd %xmm0, %xmm2
-; CHECK-NEXT:    andnps %xmm1, %xmm2
-; CHECK-NEXT:    movaps %xmm2, %xmm0
+; CHECK-NEXT:    andnpd %xmm1, %xmm2
+; CHECK-NEXT:    movapd %xmm2, %xmm0
 ; CHECK-NEXT:    retq
 ;
   %cmp = fcmp oge double %a, %eps
@@ -220,10 +220,10 @@ define double @test18(double %a, double %b, double %c, double %eps) {
 ; CHECK-LABEL: test18:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    cmplesd %xmm0, %xmm3
-; CHECK-NEXT:    andps %xmm3, %xmm2
-; CHECK-NEXT:    andnps %xmm1, %xmm3
-; CHECK-NEXT:    orps %xmm2, %xmm3
-; CHECK-NEXT:    movaps %xmm3, %xmm0
+; CHECK-NEXT:    andpd %xmm3, %xmm2
+; CHECK-NEXT:    andnpd %xmm1, %xmm3
+; CHECK-NEXT:    orpd %xmm2, %xmm3
+; CHECK-NEXT:    movapd %xmm3, %xmm0
 ; CHECK-NEXT:    retq
 ;
   %cmp = fcmp oge double %a, %eps
diff --git a/test/CodeGen/X86/fp-une-cmp.ll b/test/CodeGen/X86/fp-une-cmp.ll
index e3b2a04060ba384b33c4032ac69b5b7f8af6e6bd..1b5af5aba36667ed2d44a38078b126555cc11de3 100644
--- a/test/CodeGen/X86/fp-une-cmp.ll
+++ b/test/CodeGen/X86/fp-une-cmp.ll
@@ -36,8 +36,8 @@ define double @rdar_7859988(double %x, double %y) nounwind readnone optsize ssp
 
 entry:
   %mul = fmul double %x, %y
-  %cmp = fcmp une double %mul, 0.000000e+00
-  br i1 %cmp, label %bb2, label %bb1
+  %cmp = fcmp oeq double %mul, 0.000000e+00
+  br i1 %cmp, label %bb1, label %bb2
 
 bb1:
   %add = fadd double %mul, -1.000000e+00
diff --git a/test/CodeGen/X86/fp128-cast.ll b/test/CodeGen/X86/fp128-cast.ll
index 9408437ecc8ac7b291246feb52cf3351b47b79fd..6568f73029e09362c63ffed7aa9c2e722c0feb9f 100644
--- a/test/CodeGen/X86/fp128-cast.ll
+++ b/test/CodeGen/X86/fp128-cast.ll
@@ -152,7 +152,7 @@ entry:
 ; X32:       retl
 ;
 ; X64-LABEL: TestFPTruncF128_F64:
-; X64:       movapd      vf128(%rip), %xmm0
+; X64:       movaps      vf128(%rip), %xmm0
 ; X64-NEXT:  callq       __trunctfdf2
 ; X64-NEXT:  movsd       %xmm0, vf64(%rip)
 ; X64:       retq
diff --git a/test/CodeGen/X86/fp128-compare.ll b/test/CodeGen/X86/fp128-compare.ll
index 6ad3b74aeafa23e420c1fc50942e44b683177189..7ee2e90657c06e174e0c5d88139e271aaee88262 100644
--- a/test/CodeGen/X86/fp128-compare.ll
+++ b/test/CodeGen/X86/fp128-compare.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s
-; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx \
+; RUN:     -enable-legalize-types-checking | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx \
+; RUN:     -enable-legalize-types-checking | FileCheck %s
 
 define i32 @TestComp128GT(fp128 %d1, fp128 %d2) {
 entry:
diff --git a/test/CodeGen/X86/fp128-g.ll b/test/CodeGen/X86/fp128-g.ll
index 192ac7af39ffb78f8232eb74d0c3076a2f3dc8d5..5eeef0cb77c40f9a826f272e046832935a4dc9b0 100644
--- a/test/CodeGen/X86/fp128-g.ll
+++ b/test/CodeGen/X86/fp128-g.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx \
+; RUN:    -enable-legalize-types-checking | FileCheck %s --check-prefix=X64
 ;
 ; These cases check if x86_64-linux-android works with -O2 -g,
 ; especially CSE matching needed by SoftenFloatRes_LOAD.
diff --git a/test/CodeGen/X86/fp128-i128.ll b/test/CodeGen/X86/fp128-i128.ll
index 77160674ab204046599ff6be48d79ea9b06a8d56..98082ec611d492171aa11f918d3ef495a52ebaf9 100644
--- a/test/CodeGen/X86/fp128-i128.ll
+++ b/test/CodeGen/X86/fp128-i128.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s
-; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx -enable-legalize-types-checking | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu     -mattr=+mmx -enable-legalize-types-checking | FileCheck %s
 
 ; These tests were generated from simplified libm C code.
 ; When compiled for the x86_64-linux-android target,
@@ -41,6 +42,19 @@
 ;      foo(w);
 ; }
 define void @TestUnionLD1(fp128 %s, i64 %n) #0 {
+; CHECK-LABEL: TestUnionLD1:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT:    movabsq $281474976710655, %rcx # imm = 0xFFFFFFFFFFFF
+; CHECK-NEXT:    andq %rdi, %rcx
+; CHECK-NEXT:    movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000
+; CHECK-NEXT:    andq -{{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    orq %rcx, %rdx
+; CHECK-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT:    jmp foo # TAILCALL
 entry:
   %0 = bitcast fp128 %s to i128
   %1 = zext i64 %n to i128
@@ -51,18 +65,6 @@ entry:
   %2 = bitcast i128 %bf.set to fp128
   tail call void @foo(fp128 %2) #2
   ret void
-; CHECK-LABEL: TestUnionLD1:
-; CHECK:       movaps %xmm0, -24(%rsp)
-; CHECK-NEXT:  movq -24(%rsp), %rax
-; CHECK-NEXT:  movabsq $281474976710655, %rcx
-; CHECK-NEXT:  andq %rdi, %rcx
-; CHECK-NEXT:  movabsq $-281474976710656, %rdx
-; CHECK-NEXT:  andq -16(%rsp), %rdx
-; CHECK-NEXT:  movq %rax, -40(%rsp)
-; CHECK-NEXT:  orq %rcx, %rdx
-; CHECK-NEXT:  movq %rdx, -32(%rsp)
-; CHECK-NEXT:  movaps -40(%rsp), %xmm0
-; CHECK-NEXT:  jmp foo
 }
 
 ; C code:
@@ -75,18 +77,19 @@ entry:
 ;      return w;
 ; }
 define fp128 @TestUnionLD2(fp128 %s) #0 {
+; CHECK-LABEL: TestUnionLD2:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq -{{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movaps -{{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %0 = bitcast fp128 %s to i128
   %bf.clear = and i128 %0, -18446744073709551616
   %1 = bitcast i128 %bf.clear to fp128
   ret fp128 %1
-; CHECK-LABEL: TestUnionLD2:
-; CHECK:       movaps %xmm0, -24(%rsp)
-; CHECK-NEXT:  movq -16(%rsp), %rax
-; CHECK-NEXT:  movq %rax, -32(%rsp)
-; CHECK-NEXT:  movq $0, -40(%rsp)
-; CHECK-NEXT:  movaps -40(%rsp), %xmm0
-; CHECK-NEXT:  retq
 }
 
 ; C code:
@@ -98,6 +101,25 @@ entry:
 ;  return (z.e < 0.1L) ? 1.0L : 2.0L;
 ; }
 define fp128 @TestI128_1(fp128 %x) #0 {
+; CHECK-LABEL: TestI128_1:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT:    movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
+; CHECK-NEXT:    andq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    movq %rcx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq %rax, (%rsp)
+; CHECK-NEXT:    movaps (%rsp), %xmm0
+; CHECK-NEXT:    movaps {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    callq __lttf2
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    sets %cl
+; CHECK-NEXT:    shlq $4, %rcx
+; CHECK-NEXT:    movaps {{\.LCPI.*}}(%rcx), %xmm0
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
 entry:
   %0 = bitcast fp128 %x to i128
   %bf.clear = and i128 %0, 170141183460469231731687303715884105727
@@ -105,13 +127,6 @@ entry:
   %cmp = fcmp olt fp128 %1, 0xL999999999999999A3FFB999999999999
   %cond = select i1 %cmp, fp128 0xL00000000000000003FFF000000000000, fp128 0xL00000000000000004000000000000000
   ret fp128 %cond
-; CHECK-LABEL: TestI128_1:
-; CHECK:       movaps %xmm0,
-; CHECK:       movabsq $9223372036854775807,
-; CHECK:       callq __lttf2
-; CHECK:       testl %eax, %eax
-; CHECK:       movaps {{.*}}, %xmm0
-; CHECK:       retq
 }
 
 ; C code:
@@ -124,17 +139,20 @@ entry:
 ;  return (hx & 0x8000) == 0 ? x : y;
 ; }
 define fp128 @TestI128_2(fp128 %x, fp128 %y) #0 {
+; CHECK-LABEL: TestI128_2:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    cmpq $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    jns .LBB3_2
+; CHECK-NEXT:  # BB#1: # %entry
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:  .LBB3_2: # %entry
+; CHECK-NEXT:    retq
 entry:
   %0 = bitcast fp128 %x to i128
   %cmp = icmp sgt i128 %0, -1
   %cond = select i1 %cmp, fp128 %x, fp128 %y
   ret fp128 %cond
-; CHECK-LABEL: TestI128_2:
-; CHECK:       movaps %xmm0, -24(%rsp)
-; CHECK-NEXT:  cmpq $0, -16(%rsp)
-; CHECK-NEXT:  jns
-; CHECK:       movaps %xmm1, %xmm0
-; CHECK:       retq
 }
 
 ; C code:
@@ -149,6 +167,32 @@ entry:
 ;  return (u.e);
 ; }
 define fp128 @TestI128_3(fp128 %x, i32* nocapture readnone %ex) #0 {
+; CHECK-LABEL: TestI128_3:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    subq $56, %rsp
+; CHECK-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT:    movabsq $9223090561878065152, %rcx # imm = 0x7FFF000000000000
+; CHECK-NEXT:    testq %rcx, %rax
+; CHECK-NEXT:    je .LBB4_2
+; CHECK-NEXT:  # BB#1:
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    jmp .LBB4_3
+; CHECK-NEXT:  .LBB4_2: # %if.then
+; CHECK-NEXT:    movaps {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    callq __multf3
+; CHECK-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    movabsq $-9223090561878065153, %rdx # imm = 0x8000FFFFFFFFFFFF
+; CHECK-NEXT:    andq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT:    movabsq $4611123068473966592, %rax # imm = 0x3FFE000000000000
+; CHECK-NEXT:    orq %rdx, %rax
+; CHECK-NEXT:  .LBB4_3: # %if.end
+; CHECK-NEXT:    movq %rcx, (%rsp)
+; CHECK-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movaps (%rsp), %xmm0
+; CHECK-NEXT:    addq $56, %rsp
+; CHECK-NEXT:    retq
 entry:
   %0 = bitcast fp128 %x to i128
   %bf.cast = and i128 %0, 170135991163610696904058773219554885632
@@ -166,15 +210,6 @@ if.end:                                           ; preds = %if.then, %entry
   %u.sroa.0.0 = phi i128 [ %bf.set, %if.then ], [ %0, %entry ]
   %2 = bitcast i128 %u.sroa.0.0 to fp128
   ret fp128 %2
-; CHECK-LABEL: TestI128_3:
-; CHECK:       movaps %xmm0,
-; CHECK:       movabsq $9223090561878065152,
-; CHECK:       testq
-; CHECK:       callq __multf3
-; CHECK-NEXT:  movaps %xmm0
-; CHECK:       movabsq $-9223090561878065153,
-; CHECK:       movabsq $4611123068473966592,
-; CHECK:       retq
 }
 
 ; C code:
@@ -188,21 +223,24 @@ if.end:                                           ; preds = %if.then, %entry
 ;  return x + df;
 ; }
 define fp128 @TestI128_4(fp128 %x) #0 {
+; CHECK-LABEL: TestI128_4:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $0, (%rsp)
+; CHECK-NEXT:    movaps (%rsp), %xmm0
+; CHECK-NEXT:    callq __addtf3
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
 entry:
   %0 = bitcast fp128 %x to i128
   %bf.clear = and i128 %0, -18446744073709551616
   %1 = bitcast i128 %bf.clear to fp128
   %add = fadd fp128 %1, %x
   ret fp128 %add
-; CHECK-LABEL: TestI128_4:
-; CHECK:       movaps %xmm0, %xmm1
-; CHECK-NEXT:  movaps %xmm1, 16(%rsp)
-; CHECK-NEXT:  movq 24(%rsp), %rax
-; CHECK-NEXT:  movq %rax, 8(%rsp)
-; CHECK-NEXT:  movq $0, (%rsp)
-; CHECK-NEXT:  movaps (%rsp), %xmm0
-; CHECK-NEXT:  callq __addtf3
-; CHECK:       retq
 }
 
 @v128 = common global i128 0, align 16
@@ -214,6 +252,15 @@ entry:
 ;   v128 = ((v128 << 96) | v128_2);
 ; }
 define void @TestShift128_2() #2 {
+; CHECK-LABEL: TestShift128_2:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movq {{.*}}(%rip), %rax
+; CHECK-NEXT:    shlq $32, %rax
+; CHECK-NEXT:    movq {{.*}}(%rip), %rcx
+; CHECK-NEXT:    orq v128_2+{{.*}}(%rip), %rax
+; CHECK-NEXT:    movq %rcx, {{.*}}(%rip)
+; CHECK-NEXT:    movq %rax, v128+{{.*}}(%rip)
+; CHECK-NEXT:    retq
 entry:
   %0 = load i128, i128* @v128, align 16
   %shl = shl i128 %0, 96
@@ -221,59 +268,58 @@ entry:
   %or = or i128 %shl, %1
   store i128 %or, i128* @v128, align 16
   ret void
-; CHECK-LABEL: TestShift128_2:
-; CHECK:       movq v128(%rip), %rax
-; CHECK-NEXT:  shlq $32, %rax
-; CHECK-NEXT:  movq v128_2(%rip), %rcx
-; CHECK-NEXT:  orq v128_2+8(%rip), %rax
-; CHECK-NEXT:  movq %rcx, v128(%rip)
-; CHECK-NEXT:  movq %rax, v128+8(%rip)
-; CHECK-NEXT:  retq
 }
 
 define fp128 @acosl(fp128 %x) #0 {
+; CHECK-LABEL: acosl:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; CHECK-NEXT:    movq %rax, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movq $0, (%rsp)
+; CHECK-NEXT:    movaps (%rsp), %xmm0
+; CHECK-NEXT:    callq __addtf3
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    retq
 entry:
   %0 = bitcast fp128 %x to i128
   %bf.clear = and i128 %0, -18446744073709551616
   %1 = bitcast i128 %bf.clear to fp128
   %add = fadd fp128 %1, %x
   ret fp128 %add
-; CHECK-LABEL: acosl:
-; CHECK:       movaps %xmm0, %xmm1
-; CHECK-NEXT:  movaps %xmm1, 16(%rsp)
-; CHECK-NEXT:  movq 24(%rsp), %rax
-; CHECK-NEXT:  movq %rax, 8(%rsp)
-; CHECK-NEXT:  movq $0, (%rsp)
-; CHECK-NEXT:  movaps (%rsp), %xmm0
-; CHECK-NEXT:  callq __addtf3
-; CHECK:       retq
 }
 
 ; Compare i128 values and check i128 constants.
 define fp128 @TestComp(fp128 %x, fp128 %y) #0 {
+; CHECK-LABEL: TestComp:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    cmpq $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    jns .LBB8_2
+; CHECK-NEXT:  # BB#1: # %entry
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:  .LBB8_2: # %entry
+; CHECK-NEXT:    retq
 entry:
   %0 = bitcast fp128 %x to i128
   %cmp = icmp sgt i128 %0, -1
   %cond = select i1 %cmp, fp128 %x, fp128 %y
   ret fp128 %cond
-; CHECK-LABEL: TestComp:
-; CHECK:       movaps %xmm0, -24(%rsp)
-; CHECK-NEXT:  cmpq $0, -16(%rsp)
-; CHECK-NEXT:  jns
-; CHECK:       movaps %xmm1, %xmm0
-; CHECK:       retq
 }
 
 declare void @foo(fp128) #1
 
 ; Test logical operations on fp128 values.
 define fp128 @TestFABS_LD(fp128 %x) #0 {
+; CHECK-LABEL: TestFABS_LD:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    retq
 entry:
   %call = tail call fp128 @fabsl(fp128 %x) #2
   ret fp128 %call
-; CHECK-LABEL: TestFABS_LD
-; CHECK:       andps {{.*}}, %xmm0
-; CHECK-NEXT:  retq
 }
 
 declare fp128 @fabsl(fp128) #1
@@ -282,6 +328,43 @@ declare fp128 @copysignl(fp128, fp128) #1
 
 ; Test more complicated logical operations generated from copysignl.
 define void @TestCopySign({ fp128, fp128 }* noalias nocapture sret %agg.result, { fp128, fp128 }* byval nocapture readonly align 16 %z) #0 {
+; CHECK-LABEL: TestCopySign:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    subq $40, %rsp
+; CHECK-NEXT:    movq %rdi, %rbx
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm1
+; CHECK-NEXT:    movaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT:    movaps %xmm0, (%rsp) # 16-byte Spill
+; CHECK-NEXT:    callq __gttf2
+; CHECK-NEXT:    movl %eax, %ebp
+; CHECK-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    callq __subtf3
+; CHECK-NEXT:    testl %ebp, %ebp
+; CHECK-NEXT:    jle .LBB10_1
+; CHECK-NEXT:  # BB#2: # %if.then
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    movaps %xmm0, %xmm1
+; CHECK-NEXT:    movaps (%rsp), %xmm0 # 16-byte Reload
+; CHECK-NEXT:    movaps %xmm1, %xmm2
+; CHECK-NEXT:    jmp .LBB10_3
+; CHECK-NEXT:  .LBB10_1:
+; CHECK-NEXT:    movaps (%rsp), %xmm2 # 16-byte Reload
+; CHECK-NEXT:  .LBB10_3: # %cleanup
+; CHECK-NEXT:    movaps {{.*}}(%rip), %xmm1
+; CHECK-NEXT:    andps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
+; CHECK-NEXT:    andps {{.*}}(%rip), %xmm0
+; CHECK-NEXT:    orps %xmm1, %xmm0
+; CHECK-NEXT:    movaps %xmm2, (%rbx)
+; CHECK-NEXT:    movaps %xmm0, 16(%rbx)
+; CHECK-NEXT:    movq %rbx, %rax
+; CHECK-NEXT:    addq $40, %rsp
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    retq
 entry:
   %z.realp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %z, i64 0, i32 0
   %z.real = load fp128, fp128* %z.realp, align 16
@@ -304,17 +387,9 @@ cleanup:                                          ; preds = %entry, %if.then
   store fp128 %call.sink, fp128* %0, align 16
   store fp128 %call5, fp128* %1, align 16
   ret void
-; CHECK-LABEL: TestCopySign
-; CHECK-NOT:   call
-; CHECK:       callq __subtf3
-; CHECK-NOT:   call
-; CHECK:       callq __gttf2
-; CHECK-NOT:   call
-; CHECK:       andps {{.*}}, %xmm0
-; CHECK:       retq
 }
 
 
-attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+ssse3,+sse3,+popcnt,+sse,+sse2,+sse4.1,+sse4.2" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+ssse3,+sse3,+popcnt,+sse,+sse2,+sse4.1,+sse4.2" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+ssse3,+sse3,+popcnt,+sse,+sse2,+sse4.1,+sse4.2" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind readnone }
diff --git a/test/CodeGen/X86/fp128-libcalls.ll b/test/CodeGen/X86/fp128-libcalls.ll
index ee5fa447448cc43ea9dd41aafeb010858c39ae13..09bda890fa8cafe0bd99cc7fcb333d7e4f61ba36 100644
--- a/test/CodeGen/X86/fp128-libcalls.ll
+++ b/test/CodeGen/X86/fp128-libcalls.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s
-; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx \
+; RUN:     -enable-legalize-types-checking | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx \
+; RUN:     -enable-legalize-types-checking | FileCheck %s
 
 ; Check all soft floating point library function calls.
 
diff --git a/test/CodeGen/X86/fp128-load.ll b/test/CodeGen/X86/fp128-load.ll
index 73bacf87275ebcd63d1a8815c1b129171351f71d..bd70ab5a1ac739c2fb4c4dafd0250b529e8968eb 100644
--- a/test/CodeGen/X86/fp128-load.ll
+++ b/test/CodeGen/X86/fp128-load.ll
@@ -1,5 +1,7 @@
-; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s
-; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx \
+; RUN:     -enable-legalize-types-checking | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx \
+; RUN:     -enable-legalize-types-checking | FileCheck %s
 
 ; __float128 myFP128 = 1.0L;  // x86_64-linux-android
 @my_fp128 = global fp128 0xL00000000000000003FFF000000000000, align 16
diff --git a/test/CodeGen/X86/fp128-select.ll b/test/CodeGen/X86/fp128-select.ll
index dc41d5095a719f6f5bdbb63daea7101a6cbbc0ba..c02db1fcdde845f59407f576caa1ca47d29a66c4 100644
--- a/test/CodeGen/X86/fp128-select.ll
+++ b/test/CodeGen/X86/fp128-select.ll
@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx | FileCheck %s --check-prefix=MMX
-; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx | FileCheck %s --check-prefix=MMX
-; RUN: llc < %s -O2 -mtriple=x86_64-linux-android | FileCheck %s
-; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android -mattr=+mmx \
+; RUN:     -enable-legalize-types-checking | FileCheck %s --check-prefix=MMX
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu -mattr=+mmx \
+; RUN:     -enable-legalize-types-checking | FileCheck %s --check-prefix=MMX
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-android \
+; RUN:     -enable-legalize-types-checking | FileCheck %s
+; RUN: llc < %s -O2 -mtriple=x86_64-linux-gnu \
+; RUN:     -enable-legalize-types-checking | FileCheck %s
 
 define void @test_select(fp128* %p, fp128* %q, i1 zeroext %c) {
 ; MMX-LABEL: test_select:
diff --git a/test/CodeGen/X86/huge-stack-offset2.ll b/test/CodeGen/X86/huge-stack-offset2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9ac85b618dbda74ec6c4edff002961eb5999b0d9
--- /dev/null
+++ b/test/CodeGen/X86/huge-stack-offset2.ll
@@ -0,0 +1,62 @@
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=CHECK
+
+; Test how we handle pathologically large stack frames when RAX is live through
+; the prologue and epilogue.
+
+declare void @bar(i8*)
+declare void @llvm.va_start(i8*)
+
+; For stack frames between 2GB and 16GB, do multiple adjustments.
+
+define i32 @stack_frame_8gb(i32 %x, ...) nounwind {
+; CHECK-LABEL: stack_frame_8gb:
+; CHECK:      subq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK:      subq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK:      subq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK:      subq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK:      subq ${{.*}}, %rsp
+; CHECK:      callq bar
+; CHECK:      addq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK:      addq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK:      addq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK:      addq ${{.*}}, %rsp # imm = 0x7FFFFFFF
+; CHECK:      addq ${{.*}}, %rsp
+; CHECK:      retq
+  %1 = alloca [u0x200000000 x i8]
+  %va = alloca i8, i32 24
+  call void @llvm.va_start(i8* %va)
+  %2 = getelementptr inbounds [u0x200000000 x i8], [u0x200000000 x i8]* %1, i32 0, i32 0
+  call void @bar(i8* %2)
+  ret i32 %x
+}
+
+; For stack frames larger than 16GB, spill EAX instead of doing a linear number
+; of adjustments.
+
+; This function should have a frame size of 0x4000000D0. The 0xD0 is 208 bytes
+; from 24 bytes of va_list, 176 bytes of spilled varargs regparms, and 8 bytes
+; of alignment. We subtract 8 less and add 8 more in the prologue and epilogue
+; respectively to account for the PUSH.
+
+define i32 @stack_frame_16gb(i32 %x, ...) nounwind {
+; CHECK-LABEL: stack_frame_16gb:
+; CHECK:      pushq %rax
+; CHECK-NEXT: movabsq ${{.*}}, %rax # imm = 0xFFFFFFFBFFFFFF38
+; CHECK-NEXT: addq %rsp, %rax
+; CHECK-NEXT: xchgq %rax, (%rsp)
+; CHECK-NEXT: movq (%rsp), %rsp
+; CHECK:      callq bar
+; CHECK:      pushq %rax
+; CHECK-NEXT: movabsq ${{.*}}, %rax # imm = 0x4000000D8
+; CHECK-NEXT: addq %rsp, %rax
+; CHECK-NEXT: xchgq %rax, (%rsp)
+; CHECK-NEXT: movq (%rsp), %rsp
+; CHECK:      retq
+  %1 = alloca [u0x400000000 x i8]
+  %va = alloca i8, i32 24
+  call void @llvm.va_start(i8* %va)
+  %2 = getelementptr inbounds [u0x400000000 x i8], [u0x400000000 x i8]* %1, i32 0, i32 0
+  call void @bar(i8* %2)
+  ret i32 %x
+}
+
diff --git a/test/CodeGen/X86/i256-add.ll b/test/CodeGen/X86/i256-add.ll
index 6164d898ca11092d13f50395e32ab702e242546e..a745f652d0653a9a20975ebe9042c1ec24ff0ff3 100644
--- a/test/CodeGen/X86/i256-add.ll
+++ b/test/CodeGen/X86/i256-add.ll
@@ -1,8 +1,67 @@
-; RUN: llc < %s -march=x86 > %t
-; RUN: grep adcl %t | count 7
-; RUN: grep sbbl %t | count 7
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
 
 define void @add(i256* %p, i256* %q) nounwind {
+; X32-LABEL: add:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    subl $12, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl 8(%ecx), %edx
+; X32-NEXT:    movl (%ecx), %ebx
+; X32-NEXT:    movl 4(%ecx), %edi
+; X32-NEXT:    movl 28(%eax), %esi
+; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 24(%eax), %ebp
+; X32-NEXT:    addl (%eax), %ebx
+; X32-NEXT:    adcl 4(%eax), %edi
+; X32-NEXT:    adcl 8(%eax), %edx
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 20(%eax), %esi
+; X32-NEXT:    movl 12(%eax), %edx
+; X32-NEXT:    movl 16(%eax), %eax
+; X32-NEXT:    adcl 12(%ecx), %edx
+; X32-NEXT:    adcl 16(%ecx), %eax
+; X32-NEXT:    adcl 20(%ecx), %esi
+; X32-NEXT:    adcl 24(%ecx), %ebp
+; X32-NEXT:    movl %ebp, (%esp) # 4-byte Spill
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebp # 4-byte Reload
+; X32-NEXT:    adcl %ebp, 28(%ecx)
+; X32-NEXT:    movl %ebx, (%ecx)
+; X32-NEXT:    movl %edi, 4(%ecx)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, 8(%ecx)
+; X32-NEXT:    movl %edx, 12(%ecx)
+; X32-NEXT:    movl %eax, 16(%ecx)
+; X32-NEXT:    movl %esi, 20(%ecx)
+; X32-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X32-NEXT:    movl %eax, 24(%ecx)
+; X32-NEXT:    addl $12, %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
+;
+; X64-LABEL: add:
+; X64:       # BB#0:
+; X64-NEXT:    movq 16(%rdi), %rax
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    movq 8(%rdi), %rdx
+; X64-NEXT:    movq 24(%rsi), %r8
+; X64-NEXT:    addq (%rsi), %rcx
+; X64-NEXT:    adcq 8(%rsi), %rdx
+; X64-NEXT:    adcq 16(%rsi), %rax
+; X64-NEXT:    adcq %r8, 24(%rdi)
+; X64-NEXT:    movq %rcx, (%rdi)
+; X64-NEXT:    movq %rdx, 8(%rdi)
+; X64-NEXT:    movq %rax, 16(%rdi)
+; X64-NEXT:    retq
   %a = load i256, i256* %p
   %b = load i256, i256* %q
   %c = add i256 %a, %b
@@ -10,6 +69,63 @@ define void @add(i256* %p, i256* %q) nounwind {
   ret void
 }
 define void @sub(i256* %p, i256* %q) nounwind {
+; X32-LABEL: sub:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    subl $8, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl 16(%ecx), %eax
+; X32-NEXT:    movl 12(%ecx), %edx
+; X32-NEXT:    movl 8(%ecx), %edi
+; X32-NEXT:    movl (%ecx), %ebx
+; X32-NEXT:    movl 4(%ecx), %ebp
+; X32-NEXT:    subl (%esi), %ebx
+; X32-NEXT:    sbbl 4(%esi), %ebp
+; X32-NEXT:    sbbl 8(%esi), %edi
+; X32-NEXT:    sbbl 12(%esi), %edx
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    sbbl 16(%esi), %eax
+; X32-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X32-NEXT:    movl 20(%ecx), %edx
+; X32-NEXT:    sbbl 20(%esi), %edx
+; X32-NEXT:    movl 24(%ecx), %eax
+; X32-NEXT:    sbbl 24(%esi), %eax
+; X32-NEXT:    movl 28(%esi), %esi
+; X32-NEXT:    sbbl %esi, 28(%ecx)
+; X32-NEXT:    movl %ebx, (%ecx)
+; X32-NEXT:    movl %ebp, 4(%ecx)
+; X32-NEXT:    movl %edi, 8(%ecx)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, 12(%ecx)
+; X32-NEXT:    movl (%esp), %esi # 4-byte Reload
+; X32-NEXT:    movl %esi, 16(%ecx)
+; X32-NEXT:    movl %edx, 20(%ecx)
+; X32-NEXT:    movl %eax, 24(%ecx)
+; X32-NEXT:    addl $8, %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
+;
+; X64-LABEL: sub:
+; X64:       # BB#0:
+; X64-NEXT:    movq 16(%rdi), %rax
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    movq 8(%rdi), %rdx
+; X64-NEXT:    movq 24(%rsi), %r8
+; X64-NEXT:    subq (%rsi), %rcx
+; X64-NEXT:    sbbq 8(%rsi), %rdx
+; X64-NEXT:    sbbq 16(%rsi), %rax
+; X64-NEXT:    sbbq %r8, 24(%rdi)
+; X64-NEXT:    movq %rcx, (%rdi)
+; X64-NEXT:    movq %rdx, 8(%rdi)
+; X64-NEXT:    movq %rax, 16(%rdi)
+; X64-NEXT:    retq
   %a = load i256, i256* %p
   %b = load i256, i256* %q
   %c = sub i256 %a, %b
diff --git a/test/CodeGen/X86/i386-shrink-wrapping.ll b/test/CodeGen/X86/i386-shrink-wrapping.ll
index 2c3e384b70a680764b2ff013524bb21b6e55306c..d4e099ac6558e9a0de43702d2fe85eb9e757d1eb 100644
--- a/test/CodeGen/X86/i386-shrink-wrapping.ll
+++ b/test/CodeGen/X86/i386-shrink-wrapping.ll
@@ -55,8 +55,7 @@ target triple = "i386-apple-macosx10.5"
 ;
 ; CHECK-NEXT: L_e$non_lazy_ptr, [[E:%[a-z]+]]
 ; CHECK-NEXT: movb [[D]], ([[E]])
-; CHECK-NEXT: L_f$non_lazy_ptr, [[F:%[a-z]+]]
-; CHECK-NEXT: movsbl ([[F]]), [[CONV:%[a-z]+]]
+; CHECK-NEXT: movsbl ([[E]]), [[CONV:%[a-z]+]]
 ; CHECK-NEXT: movl $6, [[CONV:%[a-z]+]]
 ; The eflags is used in the next instruction.
 ; If that instruction disappear, we are not exercising the bug
@@ -96,7 +95,7 @@ for.end:                                          ; preds = %for.cond.preheader
   %.b3 = load i1, i1* @d, align 1
   %tmp2 = select i1 %.b3, i8 0, i8 6
   store i8 %tmp2, i8* @e, align 1
-  %tmp3 = load i8, i8* @f, align 1
+  %tmp3 = load i8, i8* @e, align 1
   %conv = sext i8 %tmp3 to i32
   %add = add nsw i32 %conv, 1
   %rem = srem i32 %tmp1, %add
diff --git a/test/CodeGen/X86/illegal-bitfield-loadstore.ll b/test/CodeGen/X86/illegal-bitfield-loadstore.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ceb4657119065b3b228e49fbcc2a067214f16f06
--- /dev/null
+++ b/test/CodeGen/X86/illegal-bitfield-loadstore.ll
@@ -0,0 +1,141 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+
+define void @i24_or(i24* %a) {
+; CHECK-LABEL: i24_or:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movzwl (%rdi), %eax
+; CHECK-NEXT:    movzbl 2(%rdi), %ecx
+; CHECK-NEXT:    movb %cl, 2(%rdi)
+; CHECK-NEXT:    shll $16, %ecx
+; CHECK-NEXT:    orl %eax, %ecx
+; CHECK-NEXT:    orl $384, %ecx # imm = 0x180
+; CHECK-NEXT:    movw %cx, (%rdi)
+; CHECK-NEXT:    retq
+  %aa = load i24, i24* %a, align 1
+  %b = or i24 %aa, 384
+  store i24 %b, i24* %a, align 1
+  ret void
+}
+
+define void @i24_and_or(i24* %a) {
+; CHECK-LABEL: i24_and_or:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movzwl (%rdi), %eax
+; CHECK-NEXT:    movzbl 2(%rdi), %ecx
+; CHECK-NEXT:    movb %cl, 2(%rdi)
+; CHECK-NEXT:    shll $16, %ecx
+; CHECK-NEXT:    orl %eax, %ecx
+; CHECK-NEXT:    orl $384, %ecx # imm = 0x180
+; CHECK-NEXT:    andl $16777088, %ecx # imm = 0xFFFF80
+; CHECK-NEXT:    movw %cx, (%rdi)
+; CHECK-NEXT:    retq
+  %b = load i24, i24* %a, align 1
+  %c = and i24 %b, -128
+  %d = or i24 %c, 384
+  store i24 %d, i24* %a, align 1
+  ret void
+}
+
+define void @i24_insert_bit(i24* %a, i1 zeroext %bit) {
+; CHECK-LABEL: i24_insert_bit:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movzbl %sil, %eax
+; CHECK-NEXT:    movzwl (%rdi), %ecx
+; CHECK-NEXT:    movzbl 2(%rdi), %edx
+; CHECK-NEXT:    movb %dl, 2(%rdi)
+; CHECK-NEXT:    shll $16, %edx
+; CHECK-NEXT:    orl %ecx, %edx
+; CHECK-NEXT:    shll $13, %eax
+; CHECK-NEXT:    andl $16769023, %edx # imm = 0xFFDFFF
+; CHECK-NEXT:    orl %eax, %edx
+; CHECK-NEXT:    movw %dx, (%rdi)
+; CHECK-NEXT:    retq
+  %extbit = zext i1 %bit to i24
+  %b = load i24, i24* %a, align 1
+  %extbit.shl = shl nuw nsw i24 %extbit, 13
+  %c = and i24 %b, -8193
+  %d = or i24 %c, %extbit.shl
+  store i24 %d, i24* %a, align 1
+  ret void
+}
+
+define void @i56_or(i56* %a) {
+; CHECK-LABEL: i56_or:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movzwl 4(%rdi), %eax
+; CHECK-NEXT:    movzbl 6(%rdi), %ecx
+; CHECK-NEXT:    movl (%rdi), %edx
+; CHECK-NEXT:    movb %cl, 6(%rdi)
+; CHECK-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<kill> %RCX<def>
+; CHECK-NEXT:    shll $16, %ecx
+; CHECK-NEXT:    orl %eax, %ecx
+; CHECK-NEXT:    shlq $32, %rcx
+; CHECK-NEXT:    orq %rcx, %rdx
+; CHECK-NEXT:    orq $384, %rdx # imm = 0x180
+; CHECK-NEXT:    movl %edx, (%rdi)
+; CHECK-NEXT:    shrq $32, %rdx
+; CHECK-NEXT:    movw %dx, 4(%rdi)
+; CHECK-NEXT:    retq
+  %aa = load i56, i56* %a, align 1
+  %b = or i56 %aa, 384
+  store i56 %b, i56* %a, align 1
+  ret void
+}
+
+define void @i56_and_or(i56* %a) {
+; CHECK-LABEL: i56_and_or:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movzwl 4(%rdi), %eax
+; CHECK-NEXT:    movzbl 6(%rdi), %ecx
+; CHECK-NEXT:    movl (%rdi), %edx
+; CHECK-NEXT:    movb %cl, 6(%rdi)
+; CHECK-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<kill> %RCX<def>
+; CHECK-NEXT:    shll $16, %ecx
+; CHECK-NEXT:    orl %eax, %ecx
+; CHECK-NEXT:    shlq $32, %rcx
+; CHECK-NEXT:    orq %rcx, %rdx
+; CHECK-NEXT:    orq $384, %rdx # imm = 0x180
+; CHECK-NEXT:    movabsq $72057594037927808, %rax # imm = 0xFFFFFFFFFFFF80
+; CHECK-NEXT:    andq %rdx, %rax
+; CHECK-NEXT:    movl %eax, (%rdi)
+; CHECK-NEXT:    shrq $32, %rax
+; CHECK-NEXT:    movw %ax, 4(%rdi)
+; CHECK-NEXT:    retq
+  %b = load i56, i56* %a, align 1
+  %c = and i56 %b, -128
+  %d = or i56 %c, 384
+  store i56 %d, i56* %a, align 1
+  ret void
+}
+
+define void @i56_insert_bit(i56* %a, i1 zeroext %bit) {
+; CHECK-LABEL: i56_insert_bit:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movzbl %sil, %eax
+; CHECK-NEXT:    movzwl 4(%rdi), %ecx
+; CHECK-NEXT:    movzbl 6(%rdi), %edx
+; CHECK-NEXT:    movl (%rdi), %esi
+; CHECK-NEXT:    movb %dl, 6(%rdi)
+; CHECK-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<kill> %RDX<def>
+; CHECK-NEXT:    shll $16, %edx
+; CHECK-NEXT:    orl %ecx, %edx
+; CHECK-NEXT:    shlq $32, %rdx
+; CHECK-NEXT:    orq %rdx, %rsi
+; CHECK-NEXT:    shlq $13, %rax
+; CHECK-NEXT:    movabsq $72057594037919743, %rcx # imm = 0xFFFFFFFFFFDFFF
+; CHECK-NEXT:    andq %rsi, %rcx
+; CHECK-NEXT:    orq %rax, %rcx
+; CHECK-NEXT:    movl %ecx, (%rdi)
+; CHECK-NEXT:    shrq $32, %rcx
+; CHECK-NEXT:    movw %cx, 4(%rdi)
+; CHECK-NEXT:    retq
+  %extbit = zext i1 %bit to i56
+  %b = load i56, i56* %a, align 1
+  %extbit.shl = shl nuw nsw i56 %extbit, 13
+  %c = and i56 %b, -8193
+  %d = or i56 %c, %extbit.shl
+  store i56 %d, i56* %a, align 1
+  ret void
+}
+
diff --git a/test/CodeGen/X86/implicit-null-check.ll b/test/CodeGen/X86/implicit-null-check.ll
index 9a8a3a4369d331fabfa1fa645c47fedd09134f58..ee795667cdb19832119b9166d0656c8b798a2525 100644
--- a/test/CodeGen/X86/implicit-null-check.ll
+++ b/test/CodeGen/X86/implicit-null-check.ll
@@ -135,6 +135,53 @@ define i32 @imp_null_check_via_mem_comparision(i32* %x, i32 %val) {
   ret i32 200
 }
 
+define i32 @imp_null_check_gep_load_with_use_dep(i32* %x, i32 %a) {
+; CHECK-LABEL: imp_null_check_gep_load_with_use_dep:
+; CHECK: [[BB0_imp_null_check_gep_load_with_use_dep:L[^:]+]]:
+; CHECK: movl (%rdi), %eax
+; CHECK: addl %edi, %esi
+; CHECK: leal 4(%rax,%rsi), %eax
+; CHECK: retq
+; CHECK: [[BB1_imp_null_check_gep_load_with_use_dep:LBB5_[0-9]+]]:
+; CHECK: movl $42, %eax
+; CHECK: retq
+
+ entry:
+  %c = icmp eq i32* %x, null
+  br i1 %c, label %is_null, label %not_null, !make.implicit !0
+
+ is_null:
+  ret i32 42
+
+ not_null:
+  %x.loc = getelementptr i32, i32* %x, i32 1
+  %y = ptrtoint i32* %x.loc to i32
+  %b = add i32 %a, %y
+  %t = load i32, i32* %x
+  %z = add i32 %t, %b
+  ret i32 %z
+}
+
+define void @imp_null_check_store(i32* %x) {
+; CHECK-LABEL: _imp_null_check_store:
+; CHECK: [[BB0_imp_null_check_store:L[^:]+]]:
+; CHECK: movl $1, (%rdi)
+; CHECK: retq
+; CHECK: [[BB1_imp_null_check_store:LBB6_[0-9]+]]:
+; CHECK: retq
+
+ entry:
+  %c = icmp eq i32* %x, null
+  br i1 %c, label %is_null, label %not_null, !make.implicit !0
+
+ is_null:
+  ret void
+
+ not_null:
+  store i32 1, i32* %x
+  ret void
+}
+
 !0 = !{}
 
 ; CHECK-LABEL: __LLVM_FaultMaps:
@@ -147,7 +194,7 @@ define i32 @imp_null_check_via_mem_comparision(i32* %x, i32 %val) {
 ; CHECK-NEXT: .short 0
 
 ; # functions:
-; CHECK-NEXT: .long 5
+; CHECK-NEXT: .long 7
 
 ; FunctionAddr:
 ; CHECK-NEXT: .quad _imp_null_check_add_result
@@ -175,6 +222,19 @@ define i32 @imp_null_check_via_mem_comparision(i32* %x, i32 %val) {
 ; Fault[0].HandlerOffset:
 ; CHECK-NEXT: .long [[BB1_imp_null_check_gep_load]]-_imp_null_check_gep_load
 
+; FunctionAddr:
+; CHECK-NEXT: .quad _imp_null_check_gep_load_with_use_dep
+; NumFaultingPCs
+; CHECK-NEXT: .long 1
+; Reserved:
+; CHECK-NEXT: .long 0
+; Fault[0].Type:
+; CHECK-NEXT: .long 1
+; Fault[0].FaultOffset:
+; CHECK-NEXT: .long [[BB0_imp_null_check_gep_load_with_use_dep]]-_imp_null_check_gep_load_with_use_dep
+; Fault[0].HandlerOffset:
+; CHECK-NEXT: .long [[BB1_imp_null_check_gep_load_with_use_dep]]-_imp_null_check_gep_load_with_use_dep
+
 ; FunctionAddr:
 ; CHECK-NEXT: .quad _imp_null_check_hoist_over_unrelated_load
 ; NumFaultingPCs
@@ -201,6 +261,19 @@ define i32 @imp_null_check_via_mem_comparision(i32* %x, i32 %val) {
 ; Fault[0].HandlerOffset:
 ; CHECK-NEXT: .long [[BB1_imp_null_check_load]]-_imp_null_check_load
 
+; FunctionAddr:
+; CHECK-NEXT: .quad _imp_null_check_store
+; NumFaultingPCs
+; CHECK-NEXT: .long 1
+; Reserved:
+; CHECK-NEXT: .long 0
+; Fault[0].Type:
+; CHECK-NEXT: .long 3
+; Fault[0].FaultOffset:
+; CHECK-NEXT: .long [[BB0_imp_null_check_store]]-_imp_null_check_store
+; Fault[0].HandlerOffset:
+; CHECK-NEXT: .long [[BB1_imp_null_check_store]]-_imp_null_check_store
+
 ; FunctionAddr:
 ; CHECK-NEXT: .quad     _imp_null_check_via_mem_comparision
 ; NumFaultingPCs
@@ -216,12 +289,18 @@ define i32 @imp_null_check_via_mem_comparision(i32* %x, i32 %val) {
 
 ; OBJDUMP: FaultMap table:
 ; OBJDUMP-NEXT: Version: 0x1
-; OBJDUMP-NEXT: NumFunctions: 5
+; OBJDUMP-NEXT: NumFunctions: 7
 ; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
 ; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 5
 ; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
 ; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 7
 ; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
+; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 9
+; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
 ; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 7
 ; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
 ; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 3
+; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
+; OBJDUMP-NEXT: Fault kind: FaultingStore, faulting PC offset: 0, handling PC offset: 7
+; OBJDUMP-NEXT: FunctionAddress: 0x000000, NumFaultingPCs: 1
+; OBJDUMP-NEXT: Fault kind: FaultingLoad, faulting PC offset: 0, handling PC offset: 11
diff --git a/test/CodeGen/X86/implicit-null-checks.mir b/test/CodeGen/X86/implicit-null-checks.mir
index d2a9e5e50a27ed9075662cd1b02fca6d6a870ca4..39bfedaa7814a571c1960ccf5da1dbe0c6d8273c 100644
--- a/test/CodeGen/X86/implicit-null-checks.mir
+++ b/test/CodeGen/X86/implicit-null-checks.mir
@@ -143,6 +143,228 @@
     ret i32 0
   }
 
+  define i32 @imp_null_check_gep_load_with_use_dep(i32* %x, i32 %a) {
+  entry:
+    %c = icmp eq i32* %x, null
+    br i1 %c, label %is_null, label %not_null, !make.implicit !0
+  
+  is_null:                                          ; preds = %entry
+    ret i32 42
+  
+  not_null:                                         ; preds = %entry
+    %x.loc = getelementptr i32, i32* %x, i32 1
+    %y = ptrtoint i32* %x.loc to i32
+    %b = add i32 %a, %y
+    %t = load i32, i32* %x
+    %z = add i32 %t, %b
+    ret i32 %z
+  }
+
+  define i32 @imp_null_check_load_with_base_sep(i32* %x, i32 %a) {
+  entry:
+    %c = icmp eq i32* %x, null
+    br i1 %c, label %is_null, label %not_null, !make.implicit !0
+  
+  is_null:                                          ; preds = %entry
+    ret i32 42
+  
+  not_null:                                         ; preds = %entry
+    ret i32 undef
+  }
+
+  define void @inc_store(i32* %ptr, i32 %val) {
+  entry:
+    %ptr_is_null = icmp eq i32* %ptr, null
+    br i1 %ptr_is_null, label %is_null, label %not_null, !make.implicit !0
+
+  not_null:
+    ret void
+
+  is_null:
+    ret void
+  }
+
+  define void @inc_store_plus_offset(i32* %ptr, i32 %val) {
+  entry:
+    %ptr_is_null = icmp eq i32* %ptr, null
+    br i1 %ptr_is_null, label %is_null, label %not_null, !make.implicit !0
+
+  not_null:
+    ret void
+
+  is_null:
+    ret void
+  }
+
+  define void @inc_store_with_dep(i32* %ptr, i32 %val) {
+  entry:
+    %ptr_is_null = icmp eq i32* %ptr, null
+    br i1 %ptr_is_null, label %is_null, label %not_null, !make.implicit !0
+
+  not_null:
+    ret void
+
+  is_null:
+    ret void
+  }
+
+  define i32 @inc_store_with_dep_in_null(i32* %ptr, i32 %val) {
+  entry:
+    %ptr_is_null = icmp eq i32* %ptr, null
+    br i1 %ptr_is_null, label %is_null, label %not_null, !make.implicit !0
+
+  not_null:
+    ret i32 undef
+
+  is_null:
+    ret i32 undef
+  }
+
+  define void @inc_store_with_volatile(i32* %ptr, i32 %val) {
+  entry:
+    %ptr_is_null = icmp eq i32* %ptr, null
+    br i1 %ptr_is_null, label %is_null, label %not_null, !make.implicit !0
+
+  not_null:
+    ret void
+
+  is_null:
+    ret void
+  }
+
+  define void @inc_store_with_two_dep(i32* %ptr, i32 %val) {
+  entry:
+    %ptr_is_null = icmp eq i32* %ptr, null
+    br i1 %ptr_is_null, label %is_null, label %not_null, !make.implicit !0
+
+  not_null:
+    ret void
+
+  is_null:
+    ret void
+  }
+
+  define void @inc_store_with_redefined_base(i32* %ptr, i32 %val) {
+  entry:
+    %ptr_is_null = icmp eq i32* %ptr, null
+    br i1 %ptr_is_null, label %is_null, label %not_null, !make.implicit !0
+
+  not_null:
+    ret void
+
+  is_null:
+    ret void
+  }
+
+  define i32 @inc_store_with_reused_base(i32* %ptr, i32 %val) {
+  entry:
+    %ptr_is_null = icmp eq i32* %ptr, null
+    br i1 %ptr_is_null, label %is_null, label %not_null, !make.implicit !0
+
+  not_null:
+    ret i32 undef
+
+  is_null:
+    ret i32 undef
+  }
+
+  define i32 @inc_store_across_call(i32* %ptr) {
+  entry:
+    %ptr_is_null = icmp eq i32* %ptr, null
+    br i1 %ptr_is_null, label %is_null, label %not_null, !make.implicit !0
+
+  not_null:
+    call void @f()
+    ret i32 undef
+
+  is_null:
+    ret i32 undef
+  }
+
+  define i32 @inc_store_with_dep_in_dep(i32* %ptr, i32 %val) {
+  entry:
+    %ptr_is_null = icmp eq i32* %ptr, null
+    br i1 %ptr_is_null, label %is_null, label %not_null, !make.implicit !0
+
+  not_null:
+    ret i32 undef
+
+  is_null:
+    ret i32 undef
+  }
+
+  define i32 @inc_store_with_load_over_store(i32* %ptr, i32* %ptr2) {
+  entry:
+    %ptr_is_null = icmp eq i32* %ptr, null
+    br i1 %ptr_is_null, label %is_null, label %not_null, !make.implicit !0
+
+  not_null:
+    ret i32 undef
+
+  is_null:
+    ret i32 undef
+  }
+
+  define i32 @inc_store_with_store_over_load(i32* %ptr, i32* %ptr2) {
+  entry:
+    %ptr_is_null = icmp eq i32* %ptr, null
+    br i1 %ptr_is_null, label %is_null, label %not_null, !make.implicit !0
+
+  not_null:
+    ret i32 undef
+
+  is_null:
+    ret i32 undef
+  }
+
+  define void @inc_store_with_store_over_store(i32* %ptr, i32* %ptr2) {
+  entry:
+    %ptr_is_null = icmp eq i32* %ptr, null
+    br i1 %ptr_is_null, label %is_null, label %not_null, !make.implicit !0
+
+  not_null:
+    ret void
+
+  is_null:
+    ret void
+  }
+
+  define void @inc_store_with_load_and_store(i32* %ptr, i32* %ptr2) {
+  entry:
+    %ptr_is_null = icmp eq i32* %ptr, null
+    br i1 %ptr_is_null, label %is_null, label %not_null, !make.implicit !0
+
+  not_null:
+    ret void
+
+  is_null:
+    ret void
+  }
+
+  define i32 @inc_store_and_load_no_alias(i32* noalias %ptr, i32* noalias %ptr2) {
+  entry:
+    %ptr_is_null = icmp eq i32* %ptr, null
+    br i1 %ptr_is_null, label %is_null, label %not_null, !make.implicit !0
+
+  not_null:
+    ret i32 undef
+
+  is_null:
+    ret i32 undef
+  }
+
+  define i32 @inc_store_and_load_alias(i32* %ptr, i32* %ptr2) {
+  entry:
+    %ptr_is_null = icmp eq i32* %ptr, null
+    br i1 %ptr_is_null, label %is_null, label %not_null, !make.implicit !0
+
+  not_null:
+    ret i32 undef
+
+  is_null:
+    ret i32 undef
+  }
+
   attributes #0 = { "target-features"="+bmi,+bmi2" }
 
   !0 = !{}
@@ -157,7 +379,7 @@ liveins:
   - { reg: '%esi' }
 # CHECK:  bb.0.entry:
 # CHECK:    %eax = MOV32ri 2200000
-# CHECK-NEXT:    %eax = FAULTING_LOAD_OP %bb.3.is_null, {{[0-9]+}}, killed %eax, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+# CHECK-NEXT:    %eax = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, killed %eax, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
 # CHECK-NEXT:    JMP_1 %bb.1.not_null
 
 body:             |
@@ -179,15 +401,15 @@ body:             |
 
   bb.2.ret_200:
     %eax = MOV32ri 200
-    RET 0, %eax
+    RETQ %eax
 
   bb.3.is_null:
     %eax = MOV32ri 42
-    RET 0, %eax
+    RETQ %eax
 
   bb.4.ret_100:
     %eax = MOV32ri 100
-    RET 0, %eax
+    RETQ %eax
 
 ...
 ---
@@ -229,11 +451,11 @@ body:             |
   bb.3.is_null:
     liveins: %eax, %ah, %al, %ax, %bh, %bl, %bp, %bpl, %bx, %eax, %ebp, %ebx, %rax, %rbp, %rbx, %r12, %r13, %r14, %r15, %r12b, %r13b, %r14b, %r15b, %r12d, %r13d, %r14d, %r15d, %r12w, %r13w, %r14w, %r15w
 
-    RET 0, %eax
+    RETQ %eax
 
   bb.4.ret_100:
     %eax = MOV32ri 100
-    RET 0, %eax
+    RETQ %eax
 
 ...
 ---
@@ -268,15 +490,15 @@ body:             |
 
   bb.2.ret_200:
     %eax = MOV32ri 200
-    RET 0, %eax
+    RETQ %eax
 
   bb.3.is_null:
     %eax = MOV32ri 42
-    RET 0, %eax
+    RETQ %eax
 
   bb.4.ret_100:
     %eax = MOV32ri 100
-    RET 0, %eax
+    RETQ %eax
 
 ...
 ---
@@ -310,15 +532,15 @@ body:             |
 
   bb.2.ret_200:
     %eax = MOV32ri 200
-    RET 0, %eax
+    RETQ %eax
 
   bb.3.is_null:
     %eax = MOV32ri 42
-    RET 0, %eax
+    RETQ %eax
 
   bb.4.ret_100:
     %eax = MOV32ri 100
-    RET 0, %eax
+    RETQ %eax
 
 ...
 ---
@@ -331,7 +553,7 @@ liveins:
   - { reg: '%rsi' }
 # CHECK:  bb.0.entry:
 # CHECK:  %rbx = MOV64rr %rdx
-# CHECK-NEXT:  %rdi = FAULTING_LOAD_OP %bb.3.is_null, {{[0-9]+}}, killed %rbx, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
+# CHECK-NEXT:  %rdi = FAULTING_OP 1, %bb.3.is_null, {{[0-9]+}}, killed %rbx, killed %rdi, 1, _, 0, _, implicit-def dead %eflags :: (load 4 from %ir.x)
 
 body:             |
   bb.0.entry:
@@ -353,15 +575,15 @@ body:             |
 
   bb.2.ret_200:
     %eax = MOV32ri 200
-    RET 0, %eax
+    RETQ %eax
 
   bb.3.is_null:
     %eax = MOV32ri 42
-    RET 0, %eax
+    RETQ %eax
 
   bb.4.ret_100:
     %eax = MOV32ri 100
-    RET 0, %eax
+    RETQ %eax
 
 ...
 ---
@@ -376,7 +598,7 @@ calleeSavedRegisters: [ '%bh', '%bl', '%bp', '%bpl', '%bx', '%ebp', '%ebx',
                         '%r12b', '%r13b', '%r14b', '%r15b', '%r12d', '%r13d',
                         '%r14d', '%r15d', '%r12w', '%r13w', '%r14w', '%r15w' ]
 # CHECK: body:
-# CHECK-NOT: FAULTING_LOAD_OP
+# CHECK-NOT: FAULTING_OP
 # CHECK: bb.1.stay:
 # CHECK: CALL64pcrel32
 body:             |
@@ -409,7 +631,7 @@ body:             |
 name:            dependency_live_in_hazard
 # CHECK-LABEL: name:            dependency_live_in_hazard
 # CHECK:   bb.0.entry:
-# CHECK-NOT: FAULTING_LOAD_OP
+# CHECK-NOT: FAULTING_OP
 # CHECK: bb.1.not_null:
 
 # Make sure that the BEXTR32rm instruction below is not used to emit
@@ -445,10 +667,10 @@ body:             |
 ...
 ---
 name:            use_alternate_load_op
-# CHECK-LABEL: use_alternate_load_op
+# CHECK-LABEL: name:            use_alternate_load_op
 # CHECK: bb.0.entry:
-# CHECK: TEST64rr %rdi, %rdi, implicit-def %eflags
-# CHECK-NEXT: JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK: %rax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _
+# CHECK-NEXT: JMP_1 %bb.1.not_null
 # CHECK: bb.1.not_null
 
 alignment:       4
@@ -468,9 +690,607 @@ body:             |
     liveins: %rdi, %rsi
 
     %rcx = MOV64rm killed %rsi, 1, _, 0, _
-    %rdx = AND64rm killed %rcx, %rdi, 1, _, 0, _, implicit-def dead %eflags
-    %r10 = MOV64rm killed %rdi, 1, _, 0, _
-    RETQ %r10d
+    %rcx = AND64rm killed %rcx, %rdi, 1, _, 0, _, implicit-def dead %eflags
+    %rax = MOV64rm killed %rdi, 1, _, 0, _
+    RETQ %eax
+
+  bb.2.is_null:
+    %eax = XOR32rr undef %eax, undef %eax, implicit-def dead %eflags
+    RETQ %eax
+
+...
+---
+name:            imp_null_check_gep_load_with_use_dep
+# CHECK-LABEL: name:            imp_null_check_gep_load_with_use_dep
+# CHECK:  bb.0.entry:
+# CHECK:    %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _, implicit-def %rax :: (load 4 from %ir.x)
+# CHECK-NEXT:    JMP_1 %bb.1.not_null
+alignment:       4
+tracksRegLiveness: true
+liveins:         
+  - { reg: '%rdi' }
+  - { reg: '%rsi' }
+body:             |
+  bb.0.entry:
+    successors: %bb.1.is_null(0x30000000), %bb.2.not_null(0x50000000)
+    liveins: %rsi, %rdi
+  
+    TEST64rr %rdi, %rdi, implicit-def %eflags
+    JE_1 %bb.1.is_null, implicit %eflags
+  
+  bb.2.not_null:
+    liveins: %rdi, %rsi
+  
+    %rsi = ADD64rr %rsi, %rdi, implicit-def dead %eflags
+    %eax = MOV32rm killed %rdi, 1, _, 0, _, implicit-def %rax :: (load 4 from %ir.x)
+    %eax = LEA64_32r killed %rax, 1, killed %rsi, 4, _
+    RETQ %eax
+  
+  bb.1.is_null:
+    %eax = MOV32ri 42
+    RETQ %eax
+
+...
+---
+name:            imp_null_check_load_with_base_sep
+# CHECK-LABEL: name:            imp_null_check_load_with_base_sep
+# CHECK:  bb.0.entry:
+# CHECK:     %rsi = ADD64rr %rsi, %rdi, implicit-def dead %eflags
+# CHECK-NEXT:    %esi = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %esi, %rdi, 1, _, 0, _, implicit-def dead %eflags
+# CHECK-NEXT:    JMP_1 %bb.1.not_null
+alignment:       4
+tracksRegLiveness: true
+liveins:         
+  - { reg: '%rdi' }
+  - { reg: '%rsi' }
+body:             |
+  bb.0.entry:
+    successors: %bb.1.is_null(0x30000000), %bb.2.not_null(0x50000000)
+    liveins: %rsi, %rdi
+  
+    TEST64rr %rdi, %rdi, implicit-def %eflags
+    JE_1 %bb.1.is_null, implicit %eflags
+  
+  bb.2.not_null:
+    liveins: %rdi, %rsi
+  
+    %rsi = ADD64rr %rsi, %rdi, implicit-def dead %eflags
+    %esi = AND32rm killed %esi, %rdi, 1, _, 0, _, implicit-def dead %eflags
+    %eax = MOV32rr %esi
+    RETQ %eax
+  
+  bb.1.is_null:
+    %eax = MOV32ri 42
+    RETQ %eax
+
+...
+---
+name:            inc_store
+# CHECK-LABEL: name:            inc_store
+# CHECK: bb.0.entry:
+# CHECK:  _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _, killed %rsi
+# CHECK-NEXT: JMP_1 %bb.1.not_null
+# CHECK: bb.1.not_null
+
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '%rdi' }
+  - { reg: '%rsi' }
+body:             |
+  bb.0.entry:
+    successors: %bb.2.is_null, %bb.1.not_null
+    liveins: %rdi, %rsi
+
+    TEST64rr %rdi, %rdi, implicit-def %eflags
+    JE_1 %bb.2.is_null, implicit killed %eflags
+
+  bb.1.not_null:
+    liveins: %rdi, %rsi
+
+    MOV64mr killed %rdi, 1, _, 0, _, killed %rsi
+    RETQ
+
+  bb.2.is_null:
+    RETQ
+
+...
+---
+name:            inc_store_plus_offset
+# CHECK-LABEL: inc_store_plus_offset
+# CHECK: bb.0.entry:
+# CHECK:  _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 16, _, killed %rsi
+# CHECK-NEXT: JMP_1 %bb.1.not_null
+# CHECK: bb.1.not_null
+
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '%rdi' }
+  - { reg: '%rsi' }
+body:             |
+  bb.0.entry:
+    successors: %bb.2.is_null, %bb.1.not_null
+    liveins: %rdi, %rsi
+
+    TEST64rr %rdi, %rdi, implicit-def %eflags
+    JE_1 %bb.2.is_null, implicit killed %eflags
+
+  bb.1.not_null:
+    liveins: %rdi, %rsi
+
+    MOV64mr killed %rdi, 1, _, 16, _, killed %rsi
+    RETQ
+
+  bb.2.is_null:
+    RETQ
+
+...
+---
+name:            inc_store_with_dep
+# CHECK-LABEL: inc_store_with_dep
+# CHECK: bb.0.entry:
+# CHECK:  %esi = ADD32rr killed %esi, killed %esi, implicit-def dead %eflags
+# CHECK-NEXT:  _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 16, _, killed %esi
+# CHECK-NEXT: JMP_1 %bb.1.not_null
+# CHECK: bb.1.not_null
+
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '%rdi' }
+  - { reg: '%rsi' }
+body:             |
+  bb.0.entry:
+    successors: %bb.2.is_null, %bb.1.not_null
+    liveins: %rdi, %rsi
+
+    TEST64rr %rdi, %rdi, implicit-def %eflags
+    JE_1 %bb.2.is_null, implicit killed %eflags
+
+  bb.1.not_null:
+    liveins: %rdi, %rsi
+
+    %esi = ADD32rr killed %esi, killed %esi, implicit-def dead %eflags
+    MOV32mr killed %rdi, 1, _, 16, _, killed %esi
+    RETQ
+
+  bb.2.is_null:
+    RETQ
+
+...
+---
+name:            inc_store_with_dep_in_null
+# CHECK-LABEL: inc_store_with_dep_in_null
+# CHECK: bb.0.entry:
+# CHECK:    TEST64rr %rdi, %rdi, implicit-def %eflags
+# CHECK-NEXT:    JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK: bb.1.not_null
+
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '%rdi' }
+  - { reg: '%rsi' }
+body:             |
+  bb.0.entry:
+    successors: %bb.2.is_null, %bb.1.not_null
+    liveins: %rdi, %rsi
+
+    TEST64rr %rdi, %rdi, implicit-def %eflags
+    JE_1 %bb.2.is_null, implicit killed %eflags
+
+  bb.1.not_null:
+    liveins: %rdi, %rsi
+
+    %esi = ADD32rr %esi, %esi, implicit-def dead %eflags
+    MOV32mr killed %rdi, 1, _, 0, _, %esi
+    %eax = MOV32rr killed %esi
+    RETQ %eax
+
+  bb.2.is_null:
+    liveins: %rsi
+    
+    %eax = MOV32rr killed %esi
+    RETQ %eax
+
+...
+---
+name:            inc_store_with_volatile
+# CHECK-LABEL: inc_store_with_volatile
+# CHECK: bb.0.entry:
+# CHECK:    TEST64rr %rdi, %rdi, implicit-def %eflags
+# CHECK-NEXT:    JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK: bb.1.not_null
+
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '%rdi' }
+  - { reg: '%rsi' }
+body:             |
+  bb.0.entry:
+    successors: %bb.2.is_null, %bb.1.not_null
+    liveins: %rdi, %rsi
+
+    TEST64rr %rdi, %rdi, implicit-def %eflags
+    JE_1 %bb.2.is_null, implicit killed %eflags
+
+  bb.1.not_null:
+    liveins: %rdi, %rsi
+
+    MOV32mr killed %rdi, 1, _, 0, _, killed %esi :: (volatile store 4 into %ir.ptr)
+    RETQ
+
+  bb.2.is_null:
+    RETQ
+
+...
+---
+name:            inc_store_with_two_dep
+# CHECK-LABEL: inc_store_with_two_dep
+# CHECK: bb.0.entry:
+# CHECK:    TEST64rr %rdi, %rdi, implicit-def %eflags
+# CHECK-NEXT:    JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK: bb.1.not_null
+
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '%rdi' }
+  - { reg: '%rsi' }
+body:             |
+  bb.0.entry:
+    successors: %bb.2.is_null, %bb.1.not_null
+    liveins: %rdi, %rsi
+
+    TEST64rr %rdi, %rdi, implicit-def %eflags
+    JE_1 %bb.2.is_null, implicit killed %eflags
+
+  bb.1.not_null:
+    liveins: %rdi, %rsi
+
+    %esi = ADD32rr killed %esi, killed %esi, implicit-def dead %eflags
+    %esi = ADD32ri killed %esi, 15, implicit-def dead %eflags
+    MOV32mr killed %rdi, 1, _, 16, _, killed %esi
+    RETQ
+
+  bb.2.is_null:
+    RETQ
+
+...
+---
+name:            inc_store_with_redefined_base
+# CHECK-LABEL: inc_store_with_redefined_base
+# CHECK: bb.0.entry:
+# CHECK:    TEST64rr %rdi, %rdi, implicit-def %eflags
+# CHECK-NEXT:    JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK: bb.1.not_null
+
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '%rdi' }
+  - { reg: '%rsi' }
+body:             |
+  bb.0.entry:
+    successors: %bb.2.is_null, %bb.1.not_null
+    liveins: %rdi, %rsi
+
+    TEST64rr %rdi, %rdi, implicit-def %eflags
+    JE_1 %bb.2.is_null, implicit killed %eflags
+
+  bb.1.not_null:
+    liveins: %rdi, %rsi
+
+    %rdi = ADD64rr killed %rdi, killed %rdi, implicit-def dead %eflags
+    MOV32mr killed %rdi, 1, _, 16, _, killed %esi
+    RETQ
+
+  bb.2.is_null:
+    RETQ
+
+...
+---
+name:            inc_store_with_reused_base
+# CHECK-LABEL: inc_store_with_reused_base
+# CHECK: bb.0.entry:
+# CHECK:  _ = FAULTING_OP 3, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 16, _, killed %esi
+# CHECK-NEXT: JMP_1 %bb.1.not_null
+# CHECK: bb.1.not_null
+
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '%rdi' }
+  - { reg: '%rsi' }
+body:             |
+  bb.0.entry:
+    successors: %bb.2.is_null, %bb.1.not_null
+    liveins: %rdi, %rsi
+
+    TEST64rr %rdi, %rdi, implicit-def %eflags
+    JE_1 %bb.2.is_null, implicit killed %eflags
+
+  bb.1.not_null:
+    liveins: %rdi, %rsi
+
+    %rax = MOV64rr %rdi
+    MOV32mr killed %rdi, 1, _, 16, _, killed %esi
+    RETQ %eax
+
+  bb.2.is_null:
+    %rax = XOR64rr undef %rax, undef %rax, implicit-def dead %eflags
+    RETQ %eax
+
+...
+---
+name:            inc_store_across_call
+# CHECK-LABEL: inc_store_across_call
+# CHECK: bb.0.entry:
+# CHECK:    TEST64rr %rbx, %rbx, implicit-def %eflags
+# CHECK-NEXT:    JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK: bb.1.not_null
+
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '%rdi' }
+calleeSavedRegisters: [ '%bh', '%bl', '%bp', '%bpl', '%bx', '%ebp', '%ebx',
+                        '%rbp', '%rbx', '%r12', '%r13', '%r14', '%r15',
+                        '%r12b', '%r13b', '%r14b', '%r15b', '%r12d', '%r13d',
+                        '%r14d', '%r15d', '%r12w', '%r13w', '%r14w', '%r15w' ]
+body:             |
+  bb.0.entry:
+    successors: %bb.2.is_null, %bb.1.not_null
+    liveins: %rdi, %rbx
+
+    frame-setup PUSH64r killed %rbx, implicit-def %rsp, implicit %rsp
+    CFI_INSTRUCTION def_cfa_offset 16
+    CFI_INSTRUCTION offset %rbx, -16
+    %rbx = MOV64rr killed %rdi
+    TEST64rr %rbx, %rbx, implicit-def %eflags
+    JE_1 %bb.2.is_null, implicit killed %eflags
+
+  bb.1.not_null:
+    liveins: %rbx
+
+    CALL64pcrel32 @f, csr_64, implicit %rsp, implicit-def %rsp
+    MOV32mi %rbx, 1, _, 0, _, 20
+    %rax = MOV64rr killed %rbx
+    %rbx = POP64r implicit-def %rsp, implicit %rsp
+    RETQ %eax
+
+  bb.2.is_null:
+    %eax = XOR32rr undef %eax, undef %eax, implicit-def dead %eflags
+    %rbx = POP64r implicit-def %rsp, implicit %rsp
+    RETQ %eax
+
+...
+---
+name:            inc_store_with_dep_in_dep
+# CHECK-LABEL: inc_store_with_dep_in_dep
+# CHECK: bb.0.entry:
+# CHECK:    TEST64rr %rdi, %rdi, implicit-def %eflags
+# CHECK-NEXT:    JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK: bb.1.not_null
+
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '%rdi' }
+  - { reg: '%rsi' }
+body:             |
+  bb.0.entry:
+    successors: %bb.2.is_null, %bb.1.not_null
+    liveins: %rdi, %rsi
+
+    TEST64rr %rdi, %rdi, implicit-def %eflags
+    JE_1 %bb.2.is_null, implicit killed %eflags
+
+  bb.1.not_null:
+    liveins: %rdi, %rsi
+
+    %eax = MOV32rr %esi
+    %esi = ADD32ri killed %esi, 15, implicit-def dead %eflags
+    MOV32mr killed %rdi, 1, _, 0, _, killed %esi
+    RETQ %eax
+
+  bb.2.is_null:
+    %eax = XOR32rr undef %eax, undef %eax, implicit-def dead %eflags
+    RETQ %eax
+
+...
+---
+name:            inc_store_with_load_over_store
+# CHECK-LABEL: inc_store_with_load_over_store
+# CHECK: bb.0.entry:
+# CHECK:    TEST64rr %rdi, %rdi, implicit-def %eflags
+# CHECK-NEXT:    JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK: bb.1.not_null
+
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '%rdi' }
+  - { reg: '%rsi' }
+body:             |
+  bb.0.entry:
+    successors: %bb.2.is_null, %bb.1.not_null
+    liveins: %rdi, %rsi
+
+    TEST64rr %rdi, %rdi, implicit-def %eflags
+    JE_1 %bb.2.is_null, implicit killed %eflags
+
+  bb.1.not_null:
+    liveins: %rdi, %rsi
+
+    MOV32mi killed %rsi, 1, _, 0, _, 2
+    %eax = MOV32rm killed %rdi, 1, _, 0, _ 
+    RETQ %eax
+
+  bb.2.is_null:
+    %eax = XOR32rr undef %eax, undef %eax, implicit-def dead %eflags
+    RETQ %eax
+
+...
+---
+name:            inc_store_with_store_over_load
+# CHECK-LABEL: inc_store_with_store_over_load
+# CHECK: bb.0.entry:
+# CHECK:    TEST64rr %rdi, %rdi, implicit-def %eflags
+# CHECK-NEXT:    JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK: bb.1.not_null
+
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '%rdi' }
+  - { reg: '%rsi' }
+body:             |
+  bb.0.entry:
+    successors: %bb.2.is_null, %bb.1.not_null
+    liveins: %rdi, %rsi
+
+    TEST64rr %rdi, %rdi, implicit-def %eflags
+    JE_1 %bb.2.is_null, implicit killed %eflags
+
+  bb.1.not_null:
+    liveins: %rdi, %rsi
+
+    %eax = MOV32rm killed %rsi, 1, _, 0, _ 
+    MOV32mi killed %rdi, 1, _, 0, _, 2
+    RETQ %eax
+
+  bb.2.is_null:
+    %eax = XOR32rr undef %eax, undef %eax, implicit-def dead %eflags
+    RETQ %eax
+
+...
+---
+name:            inc_store_with_store_over_store
+# CHECK-LABEL: inc_store_with_store_over_store
+# CHECK: bb.0.entry:
+# CHECK:    TEST64rr %rdi, %rdi, implicit-def %eflags
+# CHECK-NEXT:    JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK: bb.1.not_null
+
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '%rdi' }
+  - { reg: '%rsi' }
+body:             |
+  bb.0.entry:
+    successors: %bb.2.is_null, %bb.1.not_null
+    liveins: %rdi, %rsi
+
+    TEST64rr %rdi, %rdi, implicit-def %eflags
+    JE_1 %bb.2.is_null, implicit killed %eflags
+
+  bb.1.not_null:
+    liveins: %rdi, %rsi
+
+    MOV32mi killed %rsi, 1, _, 0, _, 3 
+    MOV32mi killed %rdi, 1, _, 0, _, 2
+    RETQ
+
+  bb.2.is_null:
+    RETQ
+
+...
+---
+name:            inc_store_with_load_and_store
+# CHECK-LABEL: inc_store_with_load_and_store
+# CHECK: bb.0.entry:
+# CHECK:  _ = FAULTING_OP 2, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _, killed %esi, implicit-def dead %eflags
+# CHECK-NEXT: JMP_1 %bb.1.not_null
+# CHECK: bb.1.not_null
+
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '%rdi' }
+  - { reg: '%rsi' }
+body:             |
+  bb.0.entry:
+    successors: %bb.2.is_null, %bb.1.not_null
+    liveins: %rdi, %rsi
+
+    TEST64rr %rdi, %rdi, implicit-def %eflags
+    JE_1 %bb.2.is_null, implicit killed %eflags
+
+  bb.1.not_null:
+    liveins: %rdi, %rsi
+
+    %esi = ADD32rr %esi, %esi, implicit-def dead %eflags
+    ADD32mr killed %rdi, 1, _, 0, _, killed %esi, implicit-def dead %eflags
+    RETQ
+
+  bb.2.is_null:
+    RETQ
+
+...
+---
+name:            inc_store_and_load_no_alias
+# CHECK-LABEL: inc_store_and_load_no_alias
+# CHECK: bb.0.entry:
+# CHECK:  %eax = FAULTING_OP 1, %bb.2.is_null, {{[0-9]+}}, killed %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr)
+# CHECK-NEXT: JMP_1 %bb.1.not_null
+# CHECK: bb.1.not_null
+
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '%rdi' }
+  - { reg: '%rsi' }
+body:             |
+  bb.0.entry:
+    successors: %bb.2.is_null, %bb.1.not_null
+    liveins: %rdi, %rsi
+
+    TEST64rr %rdi, %rdi, implicit-def %eflags
+    JE_1 %bb.2.is_null, implicit killed %eflags
+
+  bb.1.not_null:
+    liveins: %rdi, %rsi
+
+    MOV32mi killed %rsi, 1, _, 0, _, 3 :: (store 4 into %ir.ptr2)
+    %eax = MOV32rm killed %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr)
+    RETQ %eax
+
+  bb.2.is_null:
+    %eax = XOR32rr undef %eax, undef %eax, implicit-def dead %eflags
+    RETQ %eax
+
+...
+---
+name:            inc_store_and_load_alias
+# CHECK-LABEL: inc_store_and_load_alias
+# CHECK: bb.0.entry:
+# CHECK:    TEST64rr %rdi, %rdi, implicit-def %eflags
+# CHECK-NEXT:    JE_1 %bb.2.is_null, implicit killed %eflags
+# CHECK: bb.1.not_null
+
+alignment:       4
+tracksRegLiveness: true
+liveins:
+  - { reg: '%rdi' }
+  - { reg: '%rsi' }
+body:             |
+  bb.0.entry:
+    successors: %bb.2.is_null, %bb.1.not_null
+    liveins: %rdi, %rsi
+
+    TEST64rr %rdi, %rdi, implicit-def %eflags
+    JE_1 %bb.2.is_null, implicit killed %eflags
+
+  bb.1.not_null:
+    liveins: %rdi, %rsi
+
+    MOV32mi killed %rsi, 1, _, 0, _, 3 :: (store 4 into %ir.ptr2)
+    %eax = MOV32rm killed %rdi, 1, _, 0, _ :: (load 4 from %ir.ptr)
+    RETQ %eax
 
   bb.2.is_null:
     %eax = XOR32rr undef %eax, undef %eax, implicit-def dead %eflags
diff --git a/test/CodeGen/X86/implicit-use-spill.mir b/test/CodeGen/X86/implicit-use-spill.mir
index 827f0f186cedd252c8efcefdc1542736d5809c43..94bdd47b4470ff04b820e6dbf7efc94b9a674cce 100644
--- a/test/CodeGen/X86/implicit-use-spill.mir
+++ b/test/CodeGen/X86/implicit-use-spill.mir
@@ -1,4 +1,4 @@
-# RUN: llc -run-pass=greedy -mtriple=x86_64-apple-macosx -o - %s 2>&1 | FileCheck %s
+# RUN: llc -run-pass=greedy -mtriple=x86_64-apple-macosx -o - %s | FileCheck %s
 
 # Make sure we don't assert when we try to reload a value that is just implicitly used.
 ---
diff --git a/test/CodeGen/X86/inline-asm-tied.ll b/test/CodeGen/X86/inline-asm-tied.ll
index 25853579a4b7ff0f7a7079241fecec021aa70634..db63a8048836ff7d25bb37eb675e6aaf9679e640 100644
--- a/test/CodeGen/X86/inline-asm-tied.ll
+++ b/test/CodeGen/X86/inline-asm-tied.ll
@@ -1,31 +1,27 @@
 ; RUN: llc < %s -mtriple=i386-apple-darwin9 -O0 -optimize-regalloc -regalloc=basic -no-integrated-as | FileCheck %s
 ; rdar://6992609
 
-; CHECK: movl %ecx, 4([[ESP:%e..]])
-; CHECK: movl 4([[ESP]]), [[EDX:%e..]]
-; CHECK: movl [[EDX]], 4([[ESP]])
 target triple = "i386-apple-darwin9.0"
-@llvm.used = appending global [1 x i8*] [i8* bitcast (i64 (i64)* @_OSSwapInt64 to i8*)], section "llvm.metadata"		; <[1 x i8*]*> [#uses=0]
 
 define i64 @_OSSwapInt64(i64 %_data) nounwind {
 entry:
-	%retval = alloca i64		; <i64*> [#uses=2]
-	%_data.addr = alloca i64		; <i64*> [#uses=4]
-	store i64 %_data, i64* %_data.addr
-	%tmp = load i64, i64* %_data.addr		; <i64> [#uses=1]
-	%0 = call i64 asm "bswap   %eax\0A\09bswap   %edx\0A\09xchgl   %eax, %edx", "=A,0,~{dirflag},~{fpsr},~{flags}"(i64 %tmp) nounwind		; <i64> [#uses=1]
-	store i64 %0, i64* %_data.addr
-	%tmp1 = load i64, i64* %_data.addr		; <i64> [#uses=1]
-	store i64 %tmp1, i64* %retval
-	%1 = load i64, i64* %retval		; <i64> [#uses=1]
-	ret i64 %1
+  %0 = call i64 asm "bswap   %eax\0A\09bswap   %edx\0A\09xchgl   %eax, %%edx", "=A,0,~{dirflag},~{fpsr},~{flags}"(i64 %_data) nounwind
+  ret i64 %0
 }
 
+; CHECK-LABEL: __OSSwapInt64:
+; CHECK-DAG: movl 8(%esp), %edx
+; CHECK-DAG: movl 4(%esp), %eax
+; CHECK: ## InlineAsm Start
+; CHECK: ## InlineAsm End
+;       Everything is set up in EAX:EDX, return immediately.
+; CHECK-NEXT: retl
+
 ; The tied operands are not necessarily in the same order as the defs.
 ; PR13742
 define i64 @swapped(i64 %x, i64 %y) nounwind {
 entry:
-	%x0 = call { i64, i64 } asm "foo", "=r,=r,1,0,~{dirflag},~{fpsr},~{flags}"(i64 %x, i64 %y) nounwind
-        %x1 = extractvalue { i64, i64 } %x0, 0
-        ret i64 %x1
+  %x0 = call { i64, i64 } asm "foo", "=r,=r,1,0,~{dirflag},~{fpsr},~{flags}"(i64 %x, i64 %y) nounwind
+  %x1 = extractvalue { i64, i64 } %x0, 0
+  ret i64 %x1
 }
diff --git a/test/CodeGen/X86/insertelement-zero.ll b/test/CodeGen/X86/insertelement-zero.ll
index 054375a12508236d2c49fb16379693766eb293ae..ea780a2fa68c54afa472d58be09e7882b2a1460d 100644
--- a/test/CodeGen/X86/insertelement-zero.ll
+++ b/test/CodeGen/X86/insertelement-zero.ll
@@ -408,25 +408,21 @@ define <16 x i16> @insert_v16i16_z12345z789ABZDEz(<16 x i16> %a) {
 ; AVX1-LABEL: insert_v16i16_z12345z789ABZDEz:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7]
+; AVX1-NEXT:    vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_v16i16_z12345z789ABZDEz:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5,6,7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,5],xmm1[6],xmm0[7]
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm2
-; AVX2-NEXT:    vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5,6],xmm1[7]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm2 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7]
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm1[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm2, %ymm0
 ; AVX2-NEXT:    retq
   %1 = insertelement <16 x i16> %a, i16 0, i32 0
   %2 = insertelement <16 x i16> %1, i16 0, i32 6
@@ -492,34 +488,32 @@ define <32 x i8> @insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz(<32 x i8> %a) {
 ; SSE41-NEXT:    xorl %eax, %eax
 ; SSE41-NEXT:    pinsrb $0, %eax, %xmm0
 ; SSE41-NEXT:    pinsrb $15, %eax, %xmm0
-; SSE41-NEXT:    pinsrb $14, %eax, %xmm1
-; SSE41-NEXT:    pinsrb $15, %eax, %xmm1
+; SSE41-NEXT:    pxor %xmm2, %xmm2
+; SSE41-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    xorl %eax, %eax
 ; AVX1-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm1
-; AVX1-NEXT:    vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
 ; AVX1-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: insert_v32i8_z123456789ABCDEzGHIJKLMNOPQRSTzz:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm1
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7]
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm1
 ; AVX2-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm1
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7]
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX2-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6],xmm2[7]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %1 = insertelement <32 x i8> %a, i8 0, i32 0
   %2 = insertelement <32 x i8> %1, i8 0, i32 15
diff --git a/test/CodeGen/X86/isel-sink.ll b/test/CodeGen/X86/isel-sink.ll
index 27abe051a9b3aaba83c4f4a50c14f186e896ba8c..2f32097a09b2203ea881d4b3c9f60c355a69b147 100644
--- a/test/CodeGen/X86/isel-sink.ll
+++ b/test/CodeGen/X86/isel-sink.ll
@@ -1,5 +1,4 @@
 ; RUN: llc < %s -march=x86 | FileCheck %s
-; RUN: llc < %s -march=x86 -addr-sink-using-gep=1 | FileCheck %s
 
 define i32 @test(i32* %X, i32 %B) {
 ; CHECK-LABEL: test:
diff --git a/test/CodeGen/X86/jump_sign.ll b/test/CodeGen/X86/jump_sign.ll
index ca3e8bf71ebab1d10b3008808c3068abb973e038..5d6baad7068dc389e7301be6b16bf0a1ec76e972 100644
--- a/test/CodeGen/X86/jump_sign.ll
+++ b/test/CodeGen/X86/jump_sign.ll
@@ -6,7 +6,7 @@ entry:
 ; CHECK: jns
 	%tmp1 = add i32 %X, 1		; <i32> [#uses=1]
 	%tmp = icmp slt i32 %tmp1, 0		; <i1> [#uses=1]
-	br i1 %tmp, label %cond_true, label %cond_next
+	br i1 %tmp, label %cond_true, label %cond_next, !prof !1
 
 cond_true:		; preds = %entry
 	%tmp2 = tail call i32 (...) @bar( )		; <i32> [#uses=0]
@@ -303,3 +303,5 @@ if.then:
 if.end:
   ret i32 undef
 }
+
+!1 = !{!"branch_weights", i32 2, i32 1}
diff --git a/test/CodeGen/X86/known-bits-vector.ll b/test/CodeGen/X86/known-bits-vector.ll
index 77e3c537dfe70206c395061986da192cd62ad225..eee466a5a60af5931875e39ccc6d4671f725a62a 100644
--- a/test/CodeGen/X86/known-bits-vector.ll
+++ b/test/CodeGen/X86/known-bits-vector.ll
@@ -23,18 +23,14 @@ define i32 @knownbits_mask_extract_sext(<8 x i16> %a0) nounwind {
 define float @knownbits_mask_extract_uitofp(<2 x i64> %a0) nounwind {
 ; X32-LABEL: knownbits_mask_extract_uitofp:
 ; X32:       # BB#0:
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    andl $-8, %esp
-; X32-NEXT:    subl $16, %esp
+; X32-NEXT:    pushl %eax
 ; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
-; X32-NEXT:    vmovq %xmm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    fildll {{[0-9]+}}(%esp)
-; X32-NEXT:    fstps {{[0-9]+}}(%esp)
-; X32-NEXT:    flds {{[0-9]+}}(%esp)
-; X32-NEXT:    movl %ebp, %esp
-; X32-NEXT:    popl %ebp
+; X32-NEXT:    vmovd %xmm0, %eax
+; X32-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
+; X32-NEXT:    vmovss %xmm0, (%esp)
+; X32-NEXT:    flds (%esp)
+; X32-NEXT:    popl %eax
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: knownbits_mask_extract_uitofp:
@@ -42,7 +38,7 @@ define float @knownbits_mask_extract_uitofp(<2 x i64> %a0) nounwind {
 ; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3],xmm0[4,5,6,7]
 ; X64-NEXT:    vmovq %xmm0, %rax
-; X64-NEXT:    vcvtsi2ssq %rax, %xmm2, %xmm0
+; X64-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
 ; X64-NEXT:    retq
   %1 = and <2 x i64> %a0, <i64 65535, i64 -1>
   %2 = extractelement <2 x i64> %1, i32 0
@@ -83,15 +79,15 @@ define <4 x i32> @knownbits_mask_shuffle_sext(<8 x i16> %a0) nounwind {
 ; X32-LABEL: knownbits_mask_shuffle_sext:
 ; X32:       # BB#0:
 ; X32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: knownbits_mask_shuffle_sext:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; X64-NEXT:    retq
   %1 = and <8 x i16> %a0, <i16 -1, i16 -1, i16 -1, i16 -1, i16 15, i16 15, i16 15, i16 15>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
@@ -103,15 +99,15 @@ define <4 x i32> @knownbits_mask_shuffle_shuffle_sext(<8 x i16> %a0) nounwind {
 ; X32-LABEL: knownbits_mask_shuffle_shuffle_sext:
 ; X32:       # BB#0:
 ; X32-NEXT:    vpand {{\.LCPI.*}}, %xmm0, %xmm0
-; X32-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X32-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: knownbits_mask_shuffle_shuffle_sext:
 ; X64:       # BB#0:
 ; X64-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; X64-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; X64-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; X64-NEXT:    retq
   %1 = and <8 x i16> %a0, <i16 -1, i16 -1, i16 -1, i16 -1, i16 15, i16 15, i16 15, i16 15>
   %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -555,3 +551,56 @@ define <4 x i32> @knownbits_mask_bitreverse_ashr(<4 x i32> %a0) {
   ret <4 x i32> %3
 }
 declare <4 x i32> @llvm.bitreverse.v4i32(<4 x i32>) nounwind readnone
+
+; If we don't know that the input isn't INT_MIN we can't combine to sitofp
+define <4 x float> @knownbits_abs_uitofp(<4 x i32> %a0) {
+; X32-LABEL: knownbits_abs_uitofp:
+; X32:       # BB#0:
+; X32-NEXT:    vpabsd %xmm0, %xmm0
+; X32-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; X32-NEXT:    vpsrld $16, %xmm0, %xmm0
+; X32-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; X32-NEXT:    vaddps {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: knownbits_abs_uitofp:
+; X64:       # BB#0:
+; X64-NEXT:    vpabsd %xmm0, %xmm0
+; X64-NEXT:    vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; X64-NEXT:    vpsrld $16, %xmm0, %xmm0
+; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1],xmm0[2],mem[3],xmm0[4],mem[5],xmm0[6],mem[7]
+; X64-NEXT:    vaddps {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; X64-NEXT:    retq
+  %1 = sub <4 x i32> zeroinitializer, %a0
+  %2 = icmp slt <4 x i32> %a0, zeroinitializer
+  %3 = select <4 x i1> %2, <4 x i32> %1, <4 x i32> %a0
+  %4 = uitofp <4 x i32> %3 to <4 x float>
+  ret <4 x float> %4
+}
+
+define <4 x float> @knownbits_or_abs_uitofp(<4 x i32> %a0) {
+; X32-LABEL: knownbits_or_abs_uitofp:
+; X32:       # BB#0:
+; X32-NEXT:    vpor {{\.LCPI.*}}, %xmm0, %xmm0
+; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; X32-NEXT:    vpabsd %xmm0, %xmm0
+; X32-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: knownbits_or_abs_uitofp:
+; X64:       # BB#0:
+; X64-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,0,2]
+; X64-NEXT:    vpabsd %xmm0, %xmm0
+; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; X64-NEXT:    retq
+  %1 = or <4 x i32> %a0, <i32 1, i32 0, i32 3, i32 0>
+  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 0, i32 2>
+  %3 = sub <4 x i32> zeroinitializer, %2
+  %4 = icmp slt <4 x i32> %2, zeroinitializer
+  %5 = select <4 x i1> %4, <4 x i32> %3, <4 x i32> %2
+  %6 = uitofp <4 x i32> %5 to <4 x float>
+  ret <4 x float> %6
+}
diff --git a/test/CodeGen/X86/known-bits.ll b/test/CodeGen/X86/known-bits.ll
index 46451f21d8d6ff06c61b487ef24f8d62b0cf3305..81a60cdee3acbb0fc4b90d8bb78e69fb9c17eda2 100644
--- a/test/CodeGen/X86/known-bits.ll
+++ b/test/CodeGen/X86/known-bits.ll
@@ -103,3 +103,173 @@ CF246:                                            ; preds = %CF237
   %E156 = extractelement <4 x i1> %Cmp117, i32 2
   br label %CF
 }
+
+define i32 @knownbits_mask_add_lshr(i32 %a0, i32 %a1) nounwind {
+; X32-LABEL: knownbits_mask_add_lshr:
+; X32:       # BB#0:
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: knownbits_mask_add_lshr:
+; X64:       # BB#0:
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    retq
+  %1 = and i32 %a0, 32767
+  %2 = and i32 %a1, 32766
+  %3 = add i32 %1, %2
+  %4 = lshr i32 %3, 17
+  ret i32 %4
+}
+
+define i128 @knownbits_mask_addc_shl(i64 %a0, i64 %a1, i64 %a2) nounwind {
+; X32-LABEL: knownbits_mask_addc_shl:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl $-1024, %esi # imm = 0xFC00
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    andl %esi, %edi
+; X32-NEXT:    andl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    addl %edi, %esi
+; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    shldl $22, %edx, %ecx
+; X32-NEXT:    shldl $22, %esi, %edx
+; X32-NEXT:    movl %edx, 8(%eax)
+; X32-NEXT:    movl %ecx, 12(%eax)
+; X32-NEXT:    movl $0, 4(%eax)
+; X32-NEXT:    movl $0, (%eax)
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    retl $4
+;
+; X64-LABEL: knownbits_mask_addc_shl:
+; X64:       # BB#0:
+; X64-NEXT:    andq $-1024, %rdi # imm = 0xFC00
+; X64-NEXT:    andq $-1024, %rsi # imm = 0xFC00
+; X64-NEXT:    addq %rdi, %rsi
+; X64-NEXT:    adcl $0, %edx
+; X64-NEXT:    shldq $54, %rsi, %rdx
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    retq
+  %1 = and i64 %a0, -1024
+  %2 = zext i64 %1 to i128
+  %3 = and i64 %a1, -1024
+  %4 = zext i64 %3 to i128
+  %5 = add i128 %2, %4
+  %6 = zext i64 %a2 to i128
+  %7 = shl i128 %6, 64
+  %8 = add i128 %5, %7
+  %9 = shl i128 %8, 54
+  ret i128 %9
+}
+
+define {i32, i1} @knownbits_uaddo_saddo(i64 %a0, i64 %a1) nounwind {
+; X32-LABEL: knownbits_uaddo_saddo:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    leal (%ecx,%eax), %edx
+; X32-NEXT:    cmpl %ecx, %edx
+; X32-NEXT:    setb %bl
+; X32-NEXT:    testl %eax, %eax
+; X32-NEXT:    setns %al
+; X32-NEXT:    testl %ecx, %ecx
+; X32-NEXT:    setns %cl
+; X32-NEXT:    cmpb %al, %cl
+; X32-NEXT:    sete %al
+; X32-NEXT:    testl %edx, %edx
+; X32-NEXT:    setns %dl
+; X32-NEXT:    cmpb %dl, %cl
+; X32-NEXT:    setne %dl
+; X32-NEXT:    andb %al, %dl
+; X32-NEXT:    orb %bl, %dl
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    retl
+;
+; X64-LABEL: knownbits_uaddo_saddo:
+; X64:       # BB#0:
+; X64-NEXT:    shlq $32, %rdi
+; X64-NEXT:    shlq $32, %rsi
+; X64-NEXT:    addq %rdi, %rsi
+; X64-NEXT:    setb %al
+; X64-NEXT:    seto %dl
+; X64-NEXT:    orb %al, %dl
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    retq
+  %1 = shl i64 %a0, 32
+  %2 = shl i64 %a1, 32
+  %u = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %1, i64 %2)
+  %uval = extractvalue {i64, i1} %u, 0
+  %uovf = extractvalue {i64, i1} %u, 1
+  %s = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %1, i64 %2)
+  %sval = extractvalue {i64, i1} %s, 0
+  %sovf = extractvalue {i64, i1} %s, 1
+  %sum = add i64 %uval, %sval
+  %3 = trunc i64 %sum to i32
+  %4 = or i1 %uovf, %sovf
+  %ret0 = insertvalue {i32, i1} undef, i32 %3, 0
+  %ret1 = insertvalue {i32, i1} %ret0, i1 %4, 1
+  ret {i32, i1} %ret1
+}
+
+define {i32, i1} @knownbits_usubo_ssubo(i64 %a0, i64 %a1) nounwind {
+; X32-LABEL: knownbits_usubo_ssubo:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl %ecx, %edx
+; X32-NEXT:    subl %eax, %edx
+; X32-NEXT:    setns %bl
+; X32-NEXT:    cmpl %edx, %ecx
+; X32-NEXT:    setb %dh
+; X32-NEXT:    testl %ecx, %ecx
+; X32-NEXT:    setns %cl
+; X32-NEXT:    cmpb %bl, %cl
+; X32-NEXT:    setne %ch
+; X32-NEXT:    testl %eax, %eax
+; X32-NEXT:    setns %al
+; X32-NEXT:    cmpb %al, %cl
+; X32-NEXT:    setne %dl
+; X32-NEXT:    andb %ch, %dl
+; X32-NEXT:    orb %dh, %dl
+; X32-NEXT:    xorl %eax, %eax
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    retl
+;
+; X64-LABEL: knownbits_usubo_ssubo:
+; X64:       # BB#0:
+; X64-NEXT:    shlq $32, %rdi
+; X64-NEXT:    shlq $32, %rsi
+; X64-NEXT:    cmpq %rsi, %rdi
+; X64-NEXT:    setb %al
+; X64-NEXT:    seto %dl
+; X64-NEXT:    orb %al, %dl
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    retq
+  %1 = shl i64 %a0, 32
+  %2 = shl i64 %a1, 32
+  %u = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %1, i64 %2)
+  %uval = extractvalue {i64, i1} %u, 0
+  %uovf = extractvalue {i64, i1} %u, 1
+  %s = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %1, i64 %2)
+  %sval = extractvalue {i64, i1} %s, 0
+  %sovf = extractvalue {i64, i1} %s, 1
+  %sum = add i64 %uval, %sval
+  %3 = trunc i64 %sum to i32
+  %4 = or i1 %uovf, %sovf
+  %ret0 = insertvalue {i32, i1} undef, i32 %3, 0
+  %ret1 = insertvalue {i32, i1} %ret0, i1 %4, 1
+  ret {i32, i1} %ret1
+}
+
+declare {i64, i1} @llvm.uadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i64, i1} @llvm.sadd.with.overflow.i64(i64, i64) nounwind readnone
+declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone
+declare {i64, i1} @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
diff --git a/test/CodeGen/X86/known-signbits-vector.ll b/test/CodeGen/X86/known-signbits-vector.ll
new file mode 100644
index 0000000000000000000000000000000000000000..cea9ac26edbc5578e30ed0035c5a5865121c6ea3
--- /dev/null
+++ b/test/CodeGen/X86/known-signbits-vector.ll
@@ -0,0 +1,139 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64
+
+define <2 x double> @signbits_sext_v2i64_sitofp_v2f64(i32 %a0, i32 %a1) nounwind {
+; X32-LABEL: signbits_sext_v2i64_sitofp_v2f64:
+; X32:       # BB#0:
+; X32-NEXT:    vcvtdq2pd {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: signbits_sext_v2i64_sitofp_v2f64:
+; X64:       # BB#0:
+; X64-NEXT:    vmovd %edi, %xmm0
+; X64-NEXT:    vpinsrd $1, %esi, %xmm0, %xmm0
+; X64-NEXT:    vcvtdq2pd %xmm0, %xmm0
+; X64-NEXT:    retq
+  %1 = sext i32 %a0 to i64
+  %2 = sext i32 %a1 to i64
+  %3 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %4 = insertelement <2 x i64> %3, i64 %2, i32 1
+  %5 = sitofp <2 x i64> %4 to <2 x double>
+  ret <2 x double> %5
+}
+
+define <4 x float> @signbits_sext_v4i64_sitofp_v4f32(i8 signext %a0, i16 signext %a1, i32 %a2, i32 %a3) nounwind {
+; X32-LABEL: signbits_sext_v4i64_sitofp_v4f32:
+; X32:       # BB#0:
+; X32-NEXT:    movsbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movswl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    vmovd %eax, %xmm0
+; X32-NEXT:    sarl $31, %eax
+; X32-NEXT:    vpinsrd $1, %eax, %xmm0, %xmm0
+; X32-NEXT:    vpinsrd $2, %ecx, %xmm0, %xmm0
+; X32-NEXT:    sarl $31, %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    vmovd %eax, %xmm1
+; X32-NEXT:    sarl $31, %eax
+; X32-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
+; X32-NEXT:    vpinsrd $2, %edx, %xmm1, %xmm1
+; X32-NEXT:    sarl $31, %edx
+; X32-NEXT:    vpinsrd $3, %edx, %xmm1, %xmm1
+; X32-NEXT:    vpinsrd $3, %ecx, %xmm0, %xmm0
+; X32-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; X32-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: signbits_sext_v4i64_sitofp_v4f32:
+; X64:       # BB#0:
+; X64-NEXT:    movslq %edi, %rax
+; X64-NEXT:    movslq %esi, %rsi
+; X64-NEXT:    movslq %edx, %rdx
+; X64-NEXT:    movslq %ecx, %rcx
+; X64-NEXT:    vmovq %rcx, %xmm0
+; X64-NEXT:    vmovq %rdx, %xmm1
+; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-NEXT:    vmovq %rsi, %xmm1
+; X64-NEXT:    vmovq %rax, %xmm2
+; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; X64-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
+; X64-NEXT:    vcvtdq2ps %xmm0, %xmm0
+; X64-NEXT:    retq
+  %1 = sext i8 %a0 to i64
+  %2 = sext i16 %a1 to i64
+  %3 = sext i32 %a2 to i64
+  %4 = sext i32 %a3 to i64
+  %5 = insertelement <4 x i64> undef, i64 %1, i32 0
+  %6 = insertelement <4 x i64> %5, i64 %2, i32 1
+  %7 = insertelement <4 x i64> %6, i64 %3, i32 2
+  %8 = insertelement <4 x i64> %7, i64 %4, i32 3
+  %9 = sitofp <4 x i64> %8 to <4 x float>
+  ret <4 x float> %9
+}
+
+define float @signbits_ashr_extract_sitofp(<2 x i64> %a0) nounwind {
+; X32-LABEL: signbits_ashr_extract_sitofp:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    vpextrd $1, %xmm0, %eax
+; X32-NEXT:    vcvtsi2ssl %eax, %xmm1, %xmm0
+; X32-NEXT:    vmovss %xmm0, (%esp)
+; X32-NEXT:    flds (%esp)
+; X32-NEXT:    popl %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: signbits_ashr_extract_sitofp:
+; X64:       # BB#0:
+; X64-NEXT:    vpsrad $31, %xmm0, %xmm1
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3]
+; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; X64-NEXT:    vmovq %xmm0, %rax
+; X64-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
+; X64-NEXT:    retq
+  %1 = ashr <2 x i64> %a0, <i64 32, i64 32>
+  %2 = extractelement <2 x i64> %1, i32 0
+  %3 = sitofp i64 %2 to float
+  ret float %3
+}
+
+define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwind {
+; X32-LABEL: signbits_ashr_insert_ashr_extract_sitofp:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    shrdl $30, %ecx, %eax
+; X32-NEXT:    sarl $30, %ecx
+; X32-NEXT:    vmovd %eax, %xmm0
+; X32-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
+; X32-NEXT:    vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT:    vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; X32-NEXT:    vpsrlq $3, %xmm0, %xmm0
+; X32-NEXT:    vmovd %xmm0, %eax
+; X32-NEXT:    vcvtsi2ssl %eax, %xmm1, %xmm0
+; X32-NEXT:    vmovss %xmm0, (%esp)
+; X32-NEXT:    flds (%esp)
+; X32-NEXT:    popl %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: signbits_ashr_insert_ashr_extract_sitofp:
+; X64:       # BB#0:
+; X64-NEXT:    sarq $30, %rdi
+; X64-NEXT:    vmovq %rsi, %xmm0
+; X64-NEXT:    vmovq %rdi, %xmm1
+; X64-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; X64-NEXT:    vpsrad $3, %xmm0, %xmm1
+; X64-NEXT:    vpsrlq $3, %xmm0, %xmm0
+; X64-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]
+; X64-NEXT:    vmovq %xmm0, %rax
+; X64-NEXT:    vcvtsi2ssl %eax, %xmm2, %xmm0
+; X64-NEXT:    retq
+  %1 = ashr i64 %a0, 30
+  %2 = insertelement <2 x i64> undef, i64 %1, i32 0
+  %3 = insertelement <2 x i64> %2, i64 %a1, i32 1
+  %4 = ashr <2 x i64> %3, <i64 3, i64 3>
+  %5 = extractelement <2 x i64> %4, i32 0
+  %6 = sitofp i64 %5 to float
+  ret float %6
+}
diff --git a/test/CodeGen/X86/lea-opt-with-debug.mir b/test/CodeGen/X86/lea-opt-with-debug.mir
new file mode 100644
index 0000000000000000000000000000000000000000..ebf86ff718dbcf220a4fa0a53ea8d1b97204aab3
--- /dev/null
+++ b/test/CodeGen/X86/lea-opt-with-debug.mir
@@ -0,0 +1,122 @@
+# RUN: llc -mtriple=x86_64-unknown-unknown -start-after peephole-opt -stop-before detect-dead-lanes -o - %s | FileCheck %s
+
+# Test that the optimize-LEA pass can remove a redundant LEA even when it is
+# also used by a DBG_VALUE.
+
+--- |
+  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+  %struct.A = type { i32, i32, i32 }
+
+  @c = common local_unnamed_addr global %struct.A* null, align 8
+  @a = common local_unnamed_addr global i32 0, align 4
+  @d = common local_unnamed_addr global i32 0, align 4
+  @b = common local_unnamed_addr global i32 0, align 4
+
+  define i32 @fn1() local_unnamed_addr !dbg !8 {
+    %1 = load %struct.A*, %struct.A** @c, align 8, !dbg !13
+    %2 = load i32, i32* @a, align 4, !dbg !13
+    %3 = sext i32 %2 to i64, !dbg !13
+    %4 = getelementptr inbounds %struct.A, %struct.A* %1, i64 %3, !dbg !13
+    %5 = ptrtoint %struct.A* %4 to i64, !dbg !13
+    %6 = trunc i64 %5 to i32, !dbg !13
+    store i32 %6, i32* @d, align 4, !dbg !13
+    %7 = getelementptr inbounds %struct.A, %struct.A* %1, i64 %3, i32 2, !dbg !14
+    tail call void @llvm.dbg.value(metadata i32* %7, i64 0, metadata !11, metadata !15), !dbg !16
+    br label %8, !dbg !17
+
+  ; <label>:8:                                      ; preds = %8, %0
+    %9 = load i32, i32* %7, align 4, !dbg !18
+    store i32 %9, i32* @d, align 4, !dbg !18
+    br label %8, !dbg !19
+  }
+
+  ; Function Attrs: nounwind readnone
+  declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #0
+
+  attributes #0 = { nounwind readnone }
+
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!5, !6, !7}
+
+  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, globals: !2)
+  !1 = !DIFile(filename: "test.c", directory: "")
+  !2 = !{}
+  !3 = !{!4}
+  !4 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !5 = !{i32 2, !"Dwarf Version", i32 4}
+  !6 = !{i32 2, !"Debug Info Version", i32 3}
+  !7 = !{i32 1, !"PIC Level", i32 2}
+  !8 = distinct !DISubprogram(name: "fn1", scope: !1, file: !1, line: 7, type: !9, isLocal: false, isDefinition: true, scopeLine: 7, isOptimized: true, unit: !0, variables: !10)
+  !9 = !DISubroutineType(types: !3)
+  !10 = !{!11}
+  !11 = !DILocalVariable(name: "e", scope: !8, file: !1, line: 8, type: !12)
+  !12 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !4, size: 64)
+  !13 = !DILocation(line: 9, scope: !8)
+  !14 = !DILocation(line: 10, scope: !8)
+  !15 = !DIExpression()
+  !16 = !DILocation(line: 8, scope: !8)
+  !17 = !DILocation(line: 11, scope: !8)
+  !18 = !DILocation(line: 13, scope: !8)
+  !19 = !DILocation(line: 14, scope: !8)
+
+...
+---
+name:            fn1
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+registers:
+  - { id: 0, class: gr64 }
+  - { id: 1, class: gr64 }
+  - { id: 2, class: gr64_nosp }
+  - { id: 3, class: gr64_nosp }
+  - { id: 4, class: gr64 }
+  - { id: 5, class: gr32 }
+  - { id: 6, class: gr32 }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       0
+  offsetAdjustment: 0
+  maxAlignment:    0
+  adjustsStack:    false
+  hasCalls:        false
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+body:             |
+  bb.0 (%ir-block.0):
+    successors: %bb.1(0x80000000)
+
+    ; CHECK: %3 = LEA64r %2, 2, %2, 0, _, debug-location !13
+    ; CHECK-NEXT: %4 = LEA64r %1, 4, %3, 0, _, debug-location !13
+    ; CHECK-NOT: %0 = LEA64r %1, 4, %3, 8, _, debug-location !14
+    ; CHECK: DBG_VALUE debug-use _, debug-use _, !11, !15, debug-location !16
+
+    %1 = MOV64rm %rip, 1, _, @c, _, debug-location !13 :: (dereferenceable load 8 from @c)
+    %2 = MOVSX64rm32 %rip, 1, _, @a, _, debug-location !13 :: (dereferenceable load 4 from @a)
+    %3 = LEA64r %2, 2, %2, 0, _, debug-location !13
+    %4 = LEA64r %1, 4, %3, 0, _, debug-location !13
+    %5 = COPY %4.sub_32bit, debug-location !13
+    MOV32mr %rip, 1, _, @d, _, killed %5, debug-location !13 :: (store 4 into @d)
+    %0 = LEA64r %1, 4, %3, 8, _, debug-location !14
+    DBG_VALUE debug-use %0, debug-use _, !11, !15, debug-location !16
+
+    ; CHECK-LABEL: bb.1 (%ir-block.8):
+    ; CHECK: %6 = MOV32rm %4, 1, _, 8, _, debug-location !18 :: (load 4 from %ir.7)
+
+  bb.1 (%ir-block.8):
+    successors: %bb.1(0x80000000)
+
+    %6 = MOV32rm %0, 1, _, 0, _, debug-location !18 :: (load 4 from %ir.7)
+    MOV32mr %rip, 1, _, @d, _, killed %6, debug-location !18 :: (store 4 into @d)
+    JMP_1 %bb.1, debug-location !19
+
+...
diff --git a/test/CodeGen/X86/lfence.ll b/test/CodeGen/X86/lfence.ll
deleted file mode 100644
index 1903a1e31b5ce29a84dfea688be889db10bb53a9..0000000000000000000000000000000000000000
--- a/test/CodeGen/X86/lfence.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep lfence
-
-declare void @llvm.x86.sse2.lfence() nounwind
-
-define void @test() {
-  call void @llvm.x86.sse2.lfence()
-  ret void
-}
diff --git a/test/CodeGen/X86/licm-nested.ll b/test/CodeGen/X86/licm-nested.ll
index 42e6d12ec1e0104a54b45741f65c8504b5af92b1..63e3c5c3b6b2e2fa3d647216c4140523141778ea 100644
--- a/test/CodeGen/X86/licm-nested.ll
+++ b/test/CodeGen/X86/licm-nested.ll
@@ -1,5 +1,5 @@
 ; REQUIRES: asserts
-; RUN: llc -mtriple=x86_64-apple-darwin -march=x86-64 < %s -o /dev/null -stats -info-output-file - | grep "hoisted out of loops" | grep 4
+; RUN: llc -mtriple=x86_64-apple-darwin -march=x86-64 < %s -o /dev/null -stats -info-output-file - | grep "hoisted out of loops" | grep 5
 
 ; MachineLICM should be able to hoist the symbolic addresses out of
 ; the inner loops.
diff --git a/test/CodeGen/X86/live-range-nosubreg.ll b/test/CodeGen/X86/live-range-nosubreg.ll
index f28d59237b42f1cbfacafbd905e93f8a98f44495..899a375221c4feaaabc3d71bd83c5d67f387fb99 100644
--- a/test/CodeGen/X86/live-range-nosubreg.ll
+++ b/test/CodeGen/X86/live-range-nosubreg.ll
@@ -1,7 +1,6 @@
-; RUN: llc -march=x86-64 < %s | FileCheck %s
+; RUN: llc -march=x86-64 < %s
 
-; Check for a sane output. This testcase used to crash. See PR29132.
-; CHECK: leal -1
+; This testcase used to crash. See PR29132.
 
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/test/CodeGen/X86/load-combine.ll b/test/CodeGen/X86/load-combine.ll
index 08fc1c9c3f4d82cceba52d6a2533cb7e2c887ab4..e737a51cf405a398b6a5a5ad0e41860a1da352d7 100644
--- a/test/CodeGen/X86/load-combine.ll
+++ b/test/CodeGen/X86/load-combine.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=CHECK64
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=CHECK --check-prefix=BSWAP
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+movbe | FileCheck %s --check-prefix=CHECK --check-prefix=MOVBE
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=CHECK64 --check-prefix=BSWAP64
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+movbe | FileCheck %s --check-prefix=CHECK64 --check-prefix=MOVBE64
 
 ; i8* p;
 ; (i32) p[0] | ((i32) p[1] << 8) | ((i32) p[2] << 16) | ((i32) p[3] << 24)
@@ -15,7 +17,6 @@ define i32 @load_i32_by_i8(i32* %arg) {
 ; CHECK64:       # BB#0:
 ; CHECK64-NEXT:    movl (%rdi), %eax
 ; CHECK64-NEXT:    retq
-
   %tmp = bitcast i32* %arg to i8*
   %tmp1 = load i8, i8* %tmp, align 1
   %tmp2 = zext i8 %tmp1 to i32
@@ -40,19 +41,29 @@ define i32 @load_i32_by_i8(i32* %arg) {
 ; i8* p;
 ; ((i32) p[0] << 24) | ((i32) p[1] << 16) | ((i32) p[2] << 8) | (i32) p[3]
 define i32 @load_i32_by_i8_bswap(i32* %arg) {
-; CHECK-LABEL: load_i32_by_i8_bswap:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl (%eax), %eax
-; CHECK-NEXT:    bswapl %eax
-; CHECK-NEXT:    retl
+; BSWAP-LABEL: load_i32_by_i8_bswap:
+; BSWAP:       # BB#0:
+; BSWAP-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BSWAP-NEXT:    movl (%eax), %eax
+; BSWAP-NEXT:    bswapl %eax
+; BSWAP-NEXT:    retl
 ;
-; CHECK64-LABEL: load_i32_by_i8_bswap:
-; CHECK64:       # BB#0:
-; CHECK64-NEXT:    movl (%rdi), %eax
-; CHECK64-NEXT:    bswapl %eax
-; CHECK64-NEXT:    retq
-
+; MOVBE-LABEL: load_i32_by_i8_bswap:
+; MOVBE:       # BB#0:
+; MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MOVBE-NEXT:    movbel (%eax), %eax
+; MOVBE-NEXT:    retl
+;
+; BSWAP64-LABEL: load_i32_by_i8_bswap:
+; BSWAP64:       # BB#0:
+; BSWAP64-NEXT:    movl (%rdi), %eax
+; BSWAP64-NEXT:    bswapl %eax
+; BSWAP64-NEXT:    retq
+;
+; MOVBE64-LABEL: load_i32_by_i8_bswap:
+; MOVBE64:       # BB#0:
+; MOVBE64-NEXT:    movbel (%rdi), %eax
+; MOVBE64-NEXT:    retq
   %tmp = bitcast i32* %arg to i8*
   %tmp1 = load i8, i8* %tmp, align 1
   %tmp2 = zext i8 %tmp1 to i32
@@ -87,7 +98,6 @@ define i32 @load_i32_by_i16(i32* %arg) {
 ; CHECK64:       # BB#0:
 ; CHECK64-NEXT:    movl (%rdi), %eax
 ; CHECK64-NEXT:    retq
-
   %tmp = bitcast i32* %arg to i16*
   %tmp1 = load i16, i16* %tmp, align 1
   %tmp2 = zext i16 %tmp1 to i32
@@ -113,7 +123,6 @@ define i32 @load_i32_by_i16_i8(i32* %arg) {
 ; CHECK64:       # BB#0:
 ; CHECK64-NEXT:    movl (%rdi), %eax
 ; CHECK64-NEXT:    retq
-
   %tmp = bitcast i32* %arg to i16*
   %tmp1 = bitcast i32* %arg to i8*
   %tmp2 = load i16, i16* %tmp, align 1
@@ -145,7 +154,6 @@ define i32 @load_i32_by_i16_by_i8(i32* %arg) {
 ; CHECK64:       # BB#0:
 ; CHECK64-NEXT:    movl (%rdi), %eax
 ; CHECK64-NEXT:    retq
-
   %tmp = bitcast i32* %arg to i8*
   %tmp1 = load i8, i8* %tmp, align 1
   %tmp2 = zext i8 %tmp1 to i16
@@ -172,19 +180,29 @@ define i32 @load_i32_by_i16_by_i8(i32* %arg) {
 ; i8* p;
 ; ((i32) (((i16) p[0] << 8) | (i16) p[1]) << 16) | (i32) (((i16) p[3] << 8) | (i16) p[4])
 define i32 @load_i32_by_i16_by_i8_bswap(i32* %arg) {
-; CHECK-LABEL: load_i32_by_i16_by_i8_bswap:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl (%eax), %eax
-; CHECK-NEXT:    bswapl %eax
-; CHECK-NEXT:    retl
+; BSWAP-LABEL: load_i32_by_i16_by_i8_bswap:
+; BSWAP:       # BB#0:
+; BSWAP-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BSWAP-NEXT:    movl (%eax), %eax
+; BSWAP-NEXT:    bswapl %eax
+; BSWAP-NEXT:    retl
 ;
-; CHECK64-LABEL: load_i32_by_i16_by_i8_bswap:
-; CHECK64:       # BB#0:
-; CHECK64-NEXT:    movl (%rdi), %eax
-; CHECK64-NEXT:    bswapl %eax
-; CHECK64-NEXT:    retq
-
+; MOVBE-LABEL: load_i32_by_i16_by_i8_bswap:
+; MOVBE:       # BB#0:
+; MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MOVBE-NEXT:    movbel (%eax), %eax
+; MOVBE-NEXT:    retl
+;
+; BSWAP64-LABEL: load_i32_by_i16_by_i8_bswap:
+; BSWAP64:       # BB#0:
+; BSWAP64-NEXT:    movl (%rdi), %eax
+; BSWAP64-NEXT:    bswapl %eax
+; BSWAP64-NEXT:    retq
+;
+; MOVBE64-LABEL: load_i32_by_i16_by_i8_bswap:
+; MOVBE64:       # BB#0:
+; MOVBE64-NEXT:    movbel (%rdi), %eax
+; MOVBE64-NEXT:    retq
   %tmp = bitcast i32* %arg to i8*
   %tmp1 = load i8, i8* %tmp, align 1
   %tmp2 = zext i8 %tmp1 to i16
@@ -222,7 +240,6 @@ define i64 @load_i64_by_i8(i64* %arg) {
 ; CHECK64:       # BB#0:
 ; CHECK64-NEXT:    movq (%rdi), %rax
 ; CHECK64-NEXT:    retq
-
   %tmp = bitcast i64* %arg to i8*
   %tmp1 = load i8, i8* %tmp, align 1
   %tmp2 = zext i8 %tmp1 to i64
@@ -267,21 +284,32 @@ define i64 @load_i64_by_i8(i64* %arg) {
 ; i8* p;
 ; ((i64) p[0] << 56) | ((i64) p[1] << 48) | ((i64) p[2] << 40) | ((i64) p[3] << 32) | ((i64) p[4] << 24) | ((i64) p[5] << 16) | ((i64) p[6] << 8) | (i64) p[7]
 define i64 @load_i64_by_i8_bswap(i64* %arg) {
-; CHECK-LABEL: load_i64_by_i8_bswap:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl (%eax), %edx
-; CHECK-NEXT:    movl 4(%eax), %eax
-; CHECK-NEXT:    bswapl %eax
-; CHECK-NEXT:    bswapl %edx
-; CHECK-NEXT:    retl
+; BSWAP-LABEL: load_i64_by_i8_bswap:
+; BSWAP:       # BB#0:
+; BSWAP-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BSWAP-NEXT:    movl (%eax), %edx
+; BSWAP-NEXT:    movl 4(%eax), %eax
+; BSWAP-NEXT:    bswapl %eax
+; BSWAP-NEXT:    bswapl %edx
+; BSWAP-NEXT:    retl
 ;
-; CHECK64-LABEL: load_i64_by_i8_bswap:
-; CHECK64:       # BB#0:
-; CHECK64-NEXT:    movq (%rdi), %rax
-; CHECK64-NEXT:    bswapq %rax
-; CHECK64-NEXT:    retq
-
+; MOVBE-LABEL: load_i64_by_i8_bswap:
+; MOVBE:       # BB#0:
+; MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; MOVBE-NEXT:    movbel 4(%ecx), %eax
+; MOVBE-NEXT:    movbel (%ecx), %edx
+; MOVBE-NEXT:    retl
+;
+; BSWAP64-LABEL: load_i64_by_i8_bswap:
+; BSWAP64:       # BB#0:
+; BSWAP64-NEXT:    movq (%rdi), %rax
+; BSWAP64-NEXT:    bswapq %rax
+; BSWAP64-NEXT:    retq
+;
+; MOVBE64-LABEL: load_i64_by_i8_bswap:
+; MOVBE64:       # BB#0:
+; MOVBE64-NEXT:    movbeq (%rdi), %rax
+; MOVBE64-NEXT:    retq
   %tmp = bitcast i64* %arg to i8*
   %tmp1 = load i8, i8* %tmp, align 1
   %tmp2 = zext i8 %tmp1 to i64
@@ -367,7 +395,6 @@ define i32 @load_i32_by_i8_bswap_uses(i32* %arg) {
 ; CHECK64-NEXT:    orl %esi, %eax
 ; CHECK64-NEXT:    orl %ecx, %eax
 ; CHECK64-NEXT:    retq
-
   %tmp = bitcast i32* %arg to i8*
   %tmp1 = load i8, i8* %tmp, align 1
   %tmp2 = zext i8 %tmp1 to i32
@@ -424,7 +451,6 @@ define i32 @load_i32_by_i8_bswap_volatile(i32* %arg) {
 ; CHECK64-NEXT:    movzbl 3(%rdi), %eax
 ; CHECK64-NEXT:    orl %edx, %eax
 ; CHECK64-NEXT:    retq
-
   %tmp = bitcast i32* %arg to i8*
   %tmp1 = load volatile i8, i8* %tmp, align 1
   %tmp2 = zext i8 %tmp1 to i32
@@ -490,7 +516,6 @@ define i32 @load_i32_by_i8_bswap_store_in_between(i32* %arg, i32* %arg1) {
 ; CHECK64-NEXT:    movzbl 3(%rdi), %eax
 ; CHECK64-NEXT:    orl %edx, %eax
 ; CHECK64-NEXT:    retq
-
   %tmp = bitcast i32* %arg to i8*
   %tmp2 = load i8, i8* %tmp, align 1
   %tmp3 = zext i8 %tmp2 to i32
@@ -547,7 +572,6 @@ define i32 @load_i32_by_i8_bswap_unrelated_load(i32* %arg, i32* %arg1) {
 ; CHECK64-NEXT:    movzbl 3(%rdi), %eax
 ; CHECK64-NEXT:    orl %edx, %eax
 ; CHECK64-NEXT:    retq
-
   %tmp = bitcast i32* %arg to i8*
   %tmp2 = bitcast i32* %arg1 to i8*
   %tmp3 = load i8, i8* %tmp, align 1
@@ -571,39 +595,19 @@ define i32 @load_i32_by_i8_bswap_unrelated_load(i32* %arg, i32* %arg1) {
   ret i32 %tmp19
 }
 
-; Non-zero offsets are not supported for now
 ; i8* p;
 ; (i32) p[1] | ((i32) p[2] << 8) | ((i32) p[3] << 16) | ((i32) p[4] << 24)
-define i32 @load_i32_by_i8_unsupported_offset(i32* %arg) {
-; CHECK-LABEL: load_i32_by_i8_unsupported_offset:
+define i32 @load_i32_by_i8_nonzero_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_nonzero_offset:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movzbl 1(%eax), %ecx
-; CHECK-NEXT:    movzbl 2(%eax), %edx
-; CHECK-NEXT:    shll $8, %edx
-; CHECK-NEXT:    orl %ecx, %edx
-; CHECK-NEXT:    movzbl 3(%eax), %ecx
-; CHECK-NEXT:    shll $16, %ecx
-; CHECK-NEXT:    orl %edx, %ecx
-; CHECK-NEXT:    movzbl 4(%eax), %eax
-; CHECK-NEXT:    shll $24, %eax
-; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    movl 1(%eax), %eax
 ; CHECK-NEXT:    retl
 ;
-; CHECK64-LABEL: load_i32_by_i8_unsupported_offset:
+; CHECK64-LABEL: load_i32_by_i8_nonzero_offset:
 ; CHECK64:       # BB#0:
-; CHECK64-NEXT:    movzbl 1(%rdi), %eax
-; CHECK64-NEXT:    movzbl 2(%rdi), %ecx
-; CHECK64-NEXT:    shll $8, %ecx
-; CHECK64-NEXT:    orl %eax, %ecx
-; CHECK64-NEXT:    movzbl 3(%rdi), %edx
-; CHECK64-NEXT:    shll $16, %edx
-; CHECK64-NEXT:    orl %ecx, %edx
-; CHECK64-NEXT:    movzbl 4(%rdi), %eax
-; CHECK64-NEXT:    shll $24, %eax
-; CHECK64-NEXT:    orl %edx, %eax
+; CHECK64-NEXT:    movl 1(%rdi), %eax
 ; CHECK64-NEXT:    retq
-
   %tmp = bitcast i32* %arg to i8*
   %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
   %tmp2 = load i8, i8* %tmp1, align 1
@@ -626,35 +630,167 @@ define i32 @load_i32_by_i8_unsupported_offset(i32* %arg) {
   ret i32 %tmp18
 }
 
-; i8* p; i32 i;
-; ((i32) p[i] << 24) | ((i32) p[i + 1] << 16) | ((i32) p[i + 2] << 8) | (i32) p[i + 3]
-define i32 @load_i32_by_i8_bswap_base_index_offset(i32* %arg, i32 %arg1) {
-; CHECK-LABEL: load_i32_by_i8_bswap_base_index_offset:
+; i8* p;
+; (i32) p[-4] | ((i32) p[-3] << 8) | ((i32) p[-2] << 16) | ((i32) p[-1] << 24)
+define i32 @load_i32_by_i8_neg_offset(i32* %arg) {
+; CHECK-LABEL: load_i32_by_i8_neg_offset:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    movl (%ecx,%eax), %eax
-; CHECK-NEXT:    bswapl %eax
+; CHECK-NEXT:    movl -4(%eax), %eax
 ; CHECK-NEXT:    retl
 ;
-; CHECK64-LABEL: load_i32_by_i8_bswap_base_index_offset:
+; CHECK64-LABEL: load_i32_by_i8_neg_offset:
 ; CHECK64:       # BB#0:
-; CHECK64-NEXT:    movslq %esi, %rax
-; CHECK64-NEXT:    movzbl (%rdi,%rax), %ecx
-; CHECK64-NEXT:    shll $24, %ecx
-; CHECK64-NEXT:    movzbl 1(%rdi,%rax), %edx
-; CHECK64-NEXT:    shll $16, %edx
-; CHECK64-NEXT:    orl %ecx, %edx
-; CHECK64-NEXT:    movzbl 2(%rdi,%rax), %ecx
-; CHECK64-NEXT:    shll $8, %ecx
-; CHECK64-NEXT:    orl %edx, %ecx
-; CHECK64-NEXT:    movzbl 3(%rdi,%rax), %eax
-; CHECK64-NEXT:    orl %ecx, %eax
+; CHECK64-NEXT:    movl -4(%rdi), %eax
 ; CHECK64-NEXT:    retq
-; TODO: Currently we don't fold the pattern for x86-64 target because we don't
-; see that the loads are adjacent. It happens because BaseIndexOffset doesn't
-; look through zexts.
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p;
+; (i32) p[4] | ((i32) p[3] << 8) | ((i32) p[2] << 16) | ((i32) p[1] << 24)
+define i32 @load_i32_by_i8_nonzero_offset_bswap(i32* %arg) {
+; BSWAP-LABEL: load_i32_by_i8_nonzero_offset_bswap:
+; BSWAP:       # BB#0:
+; BSWAP-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BSWAP-NEXT:    movl 1(%eax), %eax
+; BSWAP-NEXT:    bswapl %eax
+; BSWAP-NEXT:    retl
+;
+; MOVBE-LABEL: load_i32_by_i8_nonzero_offset_bswap:
+; MOVBE:       # BB#0:
+; MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MOVBE-NEXT:    movbel 1(%eax), %eax
+; MOVBE-NEXT:    retl
+;
+; BSWAP64-LABEL: load_i32_by_i8_nonzero_offset_bswap:
+; BSWAP64:       # BB#0:
+; BSWAP64-NEXT:    movl 1(%rdi), %eax
+; BSWAP64-NEXT:    bswapl %eax
+; BSWAP64-NEXT:    retq
+;
+; MOVBE64-LABEL: load_i32_by_i8_nonzero_offset_bswap:
+; MOVBE64:       # BB#0:
+; MOVBE64-NEXT:    movbel 1(%rdi), %eax
+; MOVBE64-NEXT:    retq
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 4
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 3
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 2
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
+
+; i8* p;
+; (i32) p[-1] | ((i32) p[-2] << 8) | ((i32) p[-3] << 16) | ((i32) p[-4] << 24)
+define i32 @load_i32_by_i8_neg_offset_bswap(i32* %arg) {
+; BSWAP-LABEL: load_i32_by_i8_neg_offset_bswap:
+; BSWAP:       # BB#0:
+; BSWAP-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BSWAP-NEXT:    movl -4(%eax), %eax
+; BSWAP-NEXT:    bswapl %eax
+; BSWAP-NEXT:    retl
+;
+; MOVBE-LABEL: load_i32_by_i8_neg_offset_bswap:
+; MOVBE:       # BB#0:
+; MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MOVBE-NEXT:    movbel -4(%eax), %eax
+; MOVBE-NEXT:    retl
+;
+; BSWAP64-LABEL: load_i32_by_i8_neg_offset_bswap:
+; BSWAP64:       # BB#0:
+; BSWAP64-NEXT:    movl -4(%rdi), %eax
+; BSWAP64-NEXT:    bswapl %eax
+; BSWAP64-NEXT:    retq
+;
+; MOVBE64-LABEL: load_i32_by_i8_neg_offset_bswap:
+; MOVBE64:       # BB#0:
+; MOVBE64-NEXT:    movbel -4(%rdi), %eax
+; MOVBE64-NEXT:    retq
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 -1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 -2
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  %tmp9 = getelementptr inbounds i8, i8* %tmp, i32 -3
+  %tmp10 = load i8, i8* %tmp9, align 1
+  %tmp11 = zext i8 %tmp10 to i32
+  %tmp12 = shl nuw nsw i32 %tmp11, 16
+  %tmp13 = or i32 %tmp8, %tmp12
+  %tmp14 = getelementptr inbounds i8, i8* %tmp, i32 -4
+  %tmp15 = load i8, i8* %tmp14, align 1
+  %tmp16 = zext i8 %tmp15 to i32
+  %tmp17 = shl nuw nsw i32 %tmp16, 24
+  %tmp18 = or i32 %tmp13, %tmp17
+  ret i32 %tmp18
+}
 
+; i8* p; i32 i;
+; ((i32) p[i] << 24) | ((i32) p[i + 1] << 16) | ((i32) p[i + 2] << 8) | (i32) p[i + 3]
+define i32 @load_i32_by_i8_bswap_base_index_offset(i32* %arg, i32 %arg1) {
+; BSWAP-LABEL: load_i32_by_i8_bswap_base_index_offset:
+; BSWAP:       # BB#0:
+; BSWAP-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BSWAP-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; BSWAP-NEXT:    movl (%ecx,%eax), %eax
+; BSWAP-NEXT:    bswapl %eax
+; BSWAP-NEXT:    retl
+;
+; MOVBE-LABEL: load_i32_by_i8_bswap_base_index_offset:
+; MOVBE:       # BB#0:
+; MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; MOVBE-NEXT:    movbel (%ecx,%eax), %eax
+; MOVBE-NEXT:    retl
+;
+; BSWAP64-LABEL: load_i32_by_i8_bswap_base_index_offset:
+; BSWAP64:       # BB#0:
+; BSWAP64-NEXT:    movslq %esi, %rax
+; BSWAP64-NEXT:    movl (%rdi,%rax), %eax
+; BSWAP64-NEXT:    bswapl %eax
+; BSWAP64-NEXT:    retq
+;
+; MOVBE64-LABEL: load_i32_by_i8_bswap_base_index_offset:
+; MOVBE64:       # BB#0:
+; MOVBE64-NEXT:    movslq %esi, %rax
+; MOVBE64-NEXT:    movbel (%rdi,%rax), %eax
+; MOVBE64-NEXT:    retq
   %tmp = bitcast i32* %arg to i8*
   %tmp2 = getelementptr inbounds i8, i8* %tmp, i32 %arg1
   %tmp3 = load i8, i8* %tmp2, align 1
@@ -707,3 +843,472 @@ entry:
   store i64 %conv75, i64* %dst, align 8
   ret void
 }
+
+declare i16 @llvm.bswap.i16(i16)
+
+; i16* p;
+; (i32) bswap(p[1]) | ((i32) bswap(p[0]) << 16)
+define i32 @load_i32_by_bswap_i16(i32* %arg) {
+; BSWAP-LABEL: load_i32_by_bswap_i16:
+; BSWAP:       # BB#0:
+; BSWAP-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; BSWAP-NEXT:    movl (%eax), %eax
+; BSWAP-NEXT:    bswapl %eax
+; BSWAP-NEXT:    retl
+;
+; MOVBE-LABEL: load_i32_by_bswap_i16:
+; MOVBE:       # BB#0:
+; MOVBE-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MOVBE-NEXT:    movbel (%eax), %eax
+; MOVBE-NEXT:    retl
+;
+; BSWAP64-LABEL: load_i32_by_bswap_i16:
+; BSWAP64:       # BB#0:
+; BSWAP64-NEXT:    movl (%rdi), %eax
+; BSWAP64-NEXT:    bswapl %eax
+; BSWAP64-NEXT:    retq
+;
+; MOVBE64-LABEL: load_i32_by_bswap_i16:
+; MOVBE64:       # BB#0:
+; MOVBE64-NEXT:    movbel (%rdi), %eax
+; MOVBE64-NEXT:    retq
+  %tmp = bitcast i32* %arg to i16*
+  %tmp1 = load i16, i16* %tmp, align 4
+  %tmp11 = call i16 @llvm.bswap.i16(i16 %tmp1)
+  %tmp2 = zext i16 %tmp11 to i32
+  %tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1
+  %tmp4 = load i16, i16* %tmp3, align 1
+  %tmp41 = call i16 @llvm.bswap.i16(i16 %tmp4)
+  %tmp5 = zext i16 %tmp41 to i32
+  %tmp6 = shl nuw nsw i32 %tmp2, 16
+  %tmp7 = or i32 %tmp6, %tmp5
+  ret i32 %tmp7
+}
+
+; i16* p;
+; (i32) p[0] | ((sext p[1] to i32) << 16)
+define i32 @load_i32_by_sext_i16(i32* %arg) {
+; CHECK-LABEL: load_i32_by_sext_i16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: load_i32_by_sext_i16:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movl (%rdi), %eax
+; CHECK64-NEXT:    retq
+  %tmp = bitcast i32* %arg to i16*
+  %tmp1 = load i16, i16* %tmp, align 1
+  %tmp2 = zext i16 %tmp1 to i32
+  %tmp3 = getelementptr inbounds i16, i16* %tmp, i32 1
+  %tmp4 = load i16, i16* %tmp3, align 1
+  %tmp5 = sext i16 %tmp4 to i32
+  %tmp6 = shl nuw nsw i32 %tmp5, 16
+  %tmp7 = or i32 %tmp6, %tmp2
+  ret i32 %tmp7
+}
+
+; i8* arg; i32 i;
+; p = arg + 12;
+; (i32) p[i] | ((i32) p[i + 1] << 8) | ((i32) p[i + 2] << 16) | ((i32) p[i + 3] << 24)
+define i32 @load_i32_by_i8_base_offset_index(i8* %arg, i32 %i) {
+; CHECK-LABEL: load_i32_by_i8_base_offset_index:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl 12(%eax,%ecx), %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: load_i32_by_i8_base_offset_index:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movl %esi, %eax
+; CHECK64-NEXT:    movl 12(%rdi,%rax), %eax
+; CHECK64-NEXT:    retq
+  %tmp = add nuw nsw i32 %i, 3
+  %tmp2 = add nuw nsw i32 %i, 2
+  %tmp3 = add nuw nsw i32 %i, 1
+  %tmp4 = getelementptr inbounds i8, i8* %arg, i64 12
+  %tmp5 = zext i32 %i to i64
+  %tmp6 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp5
+  %tmp7 = load i8, i8* %tmp6, align 1
+  %tmp8 = zext i8 %tmp7 to i32
+  %tmp9 = zext i32 %tmp3 to i64
+  %tmp10 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp9
+  %tmp11 = load i8, i8* %tmp10, align 1
+  %tmp12 = zext i8 %tmp11 to i32
+  %tmp13 = shl nuw nsw i32 %tmp12, 8
+  %tmp14 = or i32 %tmp13, %tmp8
+  %tmp15 = zext i32 %tmp2 to i64
+  %tmp16 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp15
+  %tmp17 = load i8, i8* %tmp16, align 1
+  %tmp18 = zext i8 %tmp17 to i32
+  %tmp19 = shl nuw nsw i32 %tmp18, 16
+  %tmp20 = or i32 %tmp14, %tmp19
+  %tmp21 = zext i32 %tmp to i64
+  %tmp22 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp21
+  %tmp23 = load i8, i8* %tmp22, align 1
+  %tmp24 = zext i8 %tmp23 to i32
+  %tmp25 = shl nuw i32 %tmp24, 24
+  %tmp26 = or i32 %tmp20, %tmp25
+  ret i32 %tmp26
+}
+
+; i8* arg; i32 i;
+; p = arg + 12;
+; (i32) p[i + 1] | ((i32) p[i + 2] << 8) | ((i32) p[i + 3] << 16) | ((i32) p[i + 4] << 24)
+define i32 @load_i32_by_i8_base_offset_index_2(i8* %arg, i32 %i) {
+; CHECK-LABEL: load_i32_by_i8_base_offset_index_2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl 13(%eax,%ecx), %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: load_i32_by_i8_base_offset_index_2:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movl %esi, %eax
+; CHECK64-NEXT:    movl 13(%rdi,%rax), %eax
+; CHECK64-NEXT:    retq
+  %tmp = add nuw nsw i32 %i, 4
+  %tmp2 = add nuw nsw i32 %i, 3
+  %tmp3 = add nuw nsw i32 %i, 2
+  %tmp4 = getelementptr inbounds i8, i8* %arg, i64 12
+  %tmp5 = add nuw nsw i32 %i, 1
+  %tmp27 = zext i32 %tmp5 to i64
+  %tmp28 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp27
+  %tmp29 = load i8, i8* %tmp28, align 1
+  %tmp30 = zext i8 %tmp29 to i32
+  %tmp31 = zext i32 %tmp3 to i64
+  %tmp32 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp31
+  %tmp33 = load i8, i8* %tmp32, align 1
+  %tmp34 = zext i8 %tmp33 to i32
+  %tmp35 = shl nuw nsw i32 %tmp34, 8
+  %tmp36 = or i32 %tmp35, %tmp30
+  %tmp37 = zext i32 %tmp2 to i64
+  %tmp38 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp37
+  %tmp39 = load i8, i8* %tmp38, align 1
+  %tmp40 = zext i8 %tmp39 to i32
+  %tmp41 = shl nuw nsw i32 %tmp40, 16
+  %tmp42 = or i32 %tmp36, %tmp41
+  %tmp43 = zext i32 %tmp to i64
+  %tmp44 = getelementptr inbounds i8, i8* %tmp4, i64 %tmp43
+  %tmp45 = load i8, i8* %tmp44, align 1
+  %tmp46 = zext i8 %tmp45 to i32
+  %tmp47 = shl nuw i32 %tmp46, 24
+  %tmp48 = or i32 %tmp42, %tmp47
+  ret i32 %tmp48
+}
+
+; i8* arg; i32 i;
+;
+; p0 = arg + 12;
+; p1 = arg + i + 1;
+; p2 = arg + i + 2;
+; p3 = arg + i + 3;
+;
+; (i32) p0[i] | ((i32) p1[12] << 8) | ((i32) p2[12] << 16) | ((i32) p3[12] << 24)
+;
+; This test exercises zero and any extend loads as a part of load combine pattern.
+; In order to fold the pattern above we need to reassociate the address computation
+; first. By the time the address computation is reassociated, loads are combined
+; to zext and aext loads.
+define i32 @load_i32_by_i8_zaext_loads(i8* %arg, i32 %arg1) {
+; CHECK-LABEL: load_i32_by_i8_zaext_loads:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl 12(%eax,%ecx), %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: load_i32_by_i8_zaext_loads:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movl %esi, %eax
+; CHECK64-NEXT:    movl 12(%rdi,%rax), %eax
+; CHECK64-NEXT:    retq
+  %tmp = add nuw nsw i32 %arg1, 3
+  %tmp2 = add nuw nsw i32 %arg1, 2
+  %tmp3 = add nuw nsw i32 %arg1, 1
+  %tmp4 = zext i32 %tmp to i64
+  %tmp5 = zext i32 %tmp2 to i64
+  %tmp6 = zext i32 %tmp3 to i64
+  %tmp24 = getelementptr inbounds i8, i8* %arg, i64 %tmp4
+  %tmp30 = getelementptr inbounds i8, i8* %arg, i64 %tmp5
+  %tmp31 = getelementptr inbounds i8, i8* %arg, i64 %tmp6
+  %tmp32 = getelementptr inbounds i8, i8* %arg, i64 12
+  %tmp33 = zext i32 %arg1 to i64
+  %tmp34 = getelementptr inbounds i8, i8* %tmp32, i64 %tmp33
+  %tmp35 = load i8, i8* %tmp34, align 1
+  %tmp36 = zext i8 %tmp35 to i32
+  %tmp37 = getelementptr inbounds i8, i8* %tmp31, i64 12
+  %tmp38 = load i8, i8* %tmp37, align 1
+  %tmp39 = zext i8 %tmp38 to i32
+  %tmp40 = shl nuw nsw i32 %tmp39, 8
+  %tmp41 = or i32 %tmp40, %tmp36
+  %tmp42 = getelementptr inbounds i8, i8* %tmp30, i64 12
+  %tmp43 = load i8, i8* %tmp42, align 1
+  %tmp44 = zext i8 %tmp43 to i32
+  %tmp45 = shl nuw nsw i32 %tmp44, 16
+  %tmp46 = or i32 %tmp41, %tmp45
+  %tmp47 = getelementptr inbounds i8, i8* %tmp24, i64 12
+  %tmp48 = load i8, i8* %tmp47, align 1
+  %tmp49 = zext i8 %tmp48 to i32
+  %tmp50 = shl nuw i32 %tmp49, 24
+  %tmp51 = or i32 %tmp46, %tmp50
+  ret i32 %tmp51
+}
+
+; The same as load_i32_by_i8_zaext_loads but the last load is combined to
+; a sext load.
+;
+; i8* arg; i32 i;
+;
+; p0 = arg + 12;
+; p1 = arg + i + 1;
+; p2 = arg + i + 2;
+; p3 = arg + i + 3;
+;
+; (i32) p0[i] | ((i32) p1[12] << 8) | ((i32) p2[12] << 16) | ((i32) p3[12] << 24)
+define i32 @load_i32_by_i8_zsext_loads(i8* %arg, i32 %arg1) {
+; CHECK-LABEL: load_i32_by_i8_zsext_loads:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl 12(%eax,%ecx), %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: load_i32_by_i8_zsext_loads:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movl %esi, %eax
+; CHECK64-NEXT:    movl 12(%rdi,%rax), %eax
+; CHECK64-NEXT:    retq
+  %tmp = add nuw nsw i32 %arg1, 3
+  %tmp2 = add nuw nsw i32 %arg1, 2
+  %tmp3 = add nuw nsw i32 %arg1, 1
+  %tmp4 = zext i32 %tmp to i64
+  %tmp5 = zext i32 %tmp2 to i64
+  %tmp6 = zext i32 %tmp3 to i64
+  %tmp24 = getelementptr inbounds i8, i8* %arg, i64 %tmp4
+  %tmp30 = getelementptr inbounds i8, i8* %arg, i64 %tmp5
+  %tmp31 = getelementptr inbounds i8, i8* %arg, i64 %tmp6
+  %tmp32 = getelementptr inbounds i8, i8* %arg, i64 12
+  %tmp33 = zext i32 %arg1 to i64
+  %tmp34 = getelementptr inbounds i8, i8* %tmp32, i64 %tmp33
+  %tmp35 = load i8, i8* %tmp34, align 1
+  %tmp36 = zext i8 %tmp35 to i32
+  %tmp37 = getelementptr inbounds i8, i8* %tmp31, i64 12
+  %tmp38 = load i8, i8* %tmp37, align 1
+  %tmp39 = zext i8 %tmp38 to i32
+  %tmp40 = shl nuw nsw i32 %tmp39, 8
+  %tmp41 = or i32 %tmp40, %tmp36
+  %tmp42 = getelementptr inbounds i8, i8* %tmp30, i64 12
+  %tmp43 = load i8, i8* %tmp42, align 1
+  %tmp44 = zext i8 %tmp43 to i32
+  %tmp45 = shl nuw nsw i32 %tmp44, 16
+  %tmp46 = or i32 %tmp41, %tmp45
+  %tmp47 = getelementptr inbounds i8, i8* %tmp24, i64 12
+  %tmp48 = load i8, i8* %tmp47, align 1
+  %tmp49 = sext i8 %tmp48 to i16
+  %tmp50 = zext i16 %tmp49 to i32
+  %tmp51 = shl nuw i32 %tmp50, 24
+  %tmp52 = or i32 %tmp46, %tmp51
+  ret i32 %tmp52
+}
+
+; i8* p;
+; (i32) p[0] | ((i32) p[1] << 8)
+define i32 @zext_load_i32_by_i8(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movzbl (%eax), %ecx
+; CHECK-NEXT:    movzbl 1(%eax), %eax
+; CHECK-NEXT:    shll $8, %eax
+; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: zext_load_i32_by_i8:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movzbl (%rdi), %ecx
+; CHECK64-NEXT:    movzbl 1(%rdi), %eax
+; CHECK64-NEXT:    shll $8, %eax
+; CHECK64-NEXT:    orl %ecx, %eax
+; CHECK64-NEXT:    retq
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  ret i32 %tmp8
+}
+
+; i8* p;
+; ((i32) p[0] << 8) | ((i32) p[1] << 16)
+define i32 @zext_load_i32_by_i8_shl_8(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_shl_8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movzbl (%eax), %ecx
+; CHECK-NEXT:    shll $8, %ecx
+; CHECK-NEXT:    movzbl 1(%eax), %eax
+; CHECK-NEXT:    shll $16, %eax
+; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: zext_load_i32_by_i8_shl_8:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movzbl (%rdi), %ecx
+; CHECK64-NEXT:    shll $8, %ecx
+; CHECK64-NEXT:    movzbl 1(%rdi), %eax
+; CHECK64-NEXT:    shll $16, %eax
+; CHECK64-NEXT:    orl %ecx, %eax
+; CHECK64-NEXT:    retq
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 8
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 16
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
+
+; i8* p;
+; ((i32) p[0] << 16) | ((i32) p[1] << 24)
+define i32 @zext_load_i32_by_i8_shl_16(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_shl_16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movzbl (%eax), %ecx
+; CHECK-NEXT:    shll $16, %ecx
+; CHECK-NEXT:    movzbl 1(%eax), %eax
+; CHECK-NEXT:    shll $24, %eax
+; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: zext_load_i32_by_i8_shl_16:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movzbl (%rdi), %ecx
+; CHECK64-NEXT:    shll $16, %ecx
+; CHECK64-NEXT:    movzbl 1(%rdi), %eax
+; CHECK64-NEXT:    shll $24, %eax
+; CHECK64-NEXT:    orl %ecx, %eax
+; CHECK64-NEXT:    retq
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 16
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 24
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
+
+; i8* p;
+; (i32) p[1] | ((i32) p[0] << 8)
+define i32 @zext_load_i32_by_i8_bswap(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_bswap:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movzbl 1(%eax), %ecx
+; CHECK-NEXT:    movzbl (%eax), %eax
+; CHECK-NEXT:    shll $8, %eax
+; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: zext_load_i32_by_i8_bswap:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movzbl 1(%rdi), %ecx
+; CHECK64-NEXT:    movzbl (%rdi), %eax
+; CHECK64-NEXT:    shll $8, %eax
+; CHECK64-NEXT:    orl %ecx, %eax
+; CHECK64-NEXT:    retq
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 8
+  %tmp8 = or i32 %tmp7, %tmp3
+  ret i32 %tmp8
+}
+
+; i8* p;
+; ((i32) p[1] << 8) | ((i32) p[0] << 16)
+define i32 @zext_load_i32_by_i8_bswap_shl_8(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movzbl 1(%eax), %ecx
+; CHECK-NEXT:    shll $8, %ecx
+; CHECK-NEXT:    movzbl (%eax), %eax
+; CHECK-NEXT:    shll $16, %eax
+; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: zext_load_i32_by_i8_bswap_shl_8:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movzbl 1(%rdi), %ecx
+; CHECK64-NEXT:    shll $8, %ecx
+; CHECK64-NEXT:    movzbl (%rdi), %eax
+; CHECK64-NEXT:    shll $16, %eax
+; CHECK64-NEXT:    orl %ecx, %eax
+; CHECK64-NEXT:    retq
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 8
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 16
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
+
+; i8* p;
+; ((i32) p[1] << 16) | ((i32) p[0] << 24)
+define i32 @zext_load_i32_by_i8_bswap_shl_16(i32* %arg) {
+; CHECK-LABEL: zext_load_i32_by_i8_bswap_shl_16:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movzbl 1(%eax), %ecx
+; CHECK-NEXT:    shll $16, %ecx
+; CHECK-NEXT:    movzbl (%eax), %eax
+; CHECK-NEXT:    shll $24, %eax
+; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    retl
+;
+; CHECK64-LABEL: zext_load_i32_by_i8_bswap_shl_16:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movzbl 1(%rdi), %ecx
+; CHECK64-NEXT:    shll $16, %ecx
+; CHECK64-NEXT:    movzbl (%rdi), %eax
+; CHECK64-NEXT:    shll $24, %eax
+; CHECK64-NEXT:    orl %ecx, %eax
+; CHECK64-NEXT:    retq
+  %tmp = bitcast i32* %arg to i8*
+  %tmp1 = getelementptr inbounds i8, i8* %tmp, i32 1
+  %tmp2 = load i8, i8* %tmp1, align 1
+  %tmp3 = zext i8 %tmp2 to i32
+  %tmp30 = shl nuw nsw i32 %tmp3, 16
+  %tmp4 = getelementptr inbounds i8, i8* %tmp, i32 0
+  %tmp5 = load i8, i8* %tmp4, align 1
+  %tmp6 = zext i8 %tmp5 to i32
+  %tmp7 = shl nuw nsw i32 %tmp6, 24
+  %tmp8 = or i32 %tmp7, %tmp30
+  ret i32 %tmp8
+}
diff --git a/test/CodeGen/X86/load-slice.ll b/test/CodeGen/X86/load-slice.ll
index 2f90f819d47e02c0c4df5d70d7fa99166313b29e..8803512eec09890cd896c25d1aa42bac7b1ce2b3 100644
--- a/test/CodeGen/X86/load-slice.ll
+++ b/test/CodeGen/X86/load-slice.ll
@@ -19,10 +19,10 @@
 ; STRESS-LABEL: t1:
 ; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
 ; STRESS: vmovss 64([[BASE:[^(]+]]), [[OUT_Real:%xmm[0-9]+]]
-; Add low slice: out[out_start].real, this is base + 0.
-; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
 ; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
 ; STRESS-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
+; Add low slice: out[out_start].real, this is base + 0.
+; STRESS-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
 ; Add high slice: out[out_start].imm, this is base + 4.
 ; STRESS-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
 ; Swap Imm and Real.
@@ -34,10 +34,10 @@
 ; REGULAR-LABEL: t1:
 ; Load out[out_start + 8].real, this is base + 8 * 8 + 0.
 ; REGULAR: vmovss 64([[BASE:[^)]+]]), [[OUT_Real:%xmm[0-9]+]]
-; Add low slice: out[out_start].real, this is base + 0.
-; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
 ; Load out[out_start + 8].imm, this is base + 8 * 8 + 4.
 ; REGULAR-NEXT: vmovss 68([[BASE]]), [[OUT_Imm:%xmm[0-9]+]]
+; Add low slice: out[out_start].real, this is base + 0.
+; REGULAR-NEXT: vaddss ([[BASE]]), [[OUT_Real]], [[RES_Real:%xmm[0-9]+]]
 ; Add high slice: out[out_start].imm, this is base + 4.
 ; REGULAR-NEXT: vaddss 4([[BASE]]), [[OUT_Imm]], [[RES_Imm:%xmm[0-9]+]]
 ; Swap Imm and Real.
@@ -73,10 +73,10 @@ entry:
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 
 ; Check that we do not read outside of the chunk of bits of the original loads.
 ;
diff --git a/test/CodeGen/X86/local_stack_symbol_ordering.ll b/test/CodeGen/X86/local_stack_symbol_ordering.ll
index 1893eeec2f1f781da992f62d2e1eb3564e839f2a..1cd4d6c26c35d884712d2462897fbb35b7651439 100644
--- a/test/CodeGen/X86/local_stack_symbol_ordering.ll
+++ b/test/CodeGen/X86/local_stack_symbol_ordering.ll
@@ -115,21 +115,21 @@ entry:
   %d = alloca i32, align 4
   %aaa = alloca [1000 x i32], align 16
   %0 = bitcast i32* %f to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %0) #1
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0) #1
   %1 = bitcast [30 x i32]* %a to i8*
-  call void @llvm.lifetime.start(i64 120, i8* %1) #1
+  call void @llvm.lifetime.start.p0i8(i64 120, i8* %1) #1
   %2 = bitcast [1000 x i32]* %aa to i8*
-  call void @llvm.lifetime.start(i64 4000, i8* %2) #1
+  call void @llvm.lifetime.start.p0i8(i64 4000, i8* %2) #1
   %3 = bitcast i32* %e to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %3) #1
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %3) #1
   %4 = bitcast [1000 x i32]* %cc to i8*
-  call void @llvm.lifetime.start(i64 4000, i8* %4) #1
+  call void @llvm.lifetime.start.p0i8(i64 4000, i8* %4) #1
   %5 = bitcast i32* %b to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %5) #1
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %5) #1
   %6 = bitcast i32* %d to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %6) #1
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %6) #1
   %7 = bitcast [1000 x i32]* %aaa to i8*
-  call void @llvm.lifetime.start(i64 4000, i8* %7) #1
+  call void @llvm.lifetime.start.p0i8(i64 4000, i8* %7) #1
   %call = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @check_a to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
   %call1 = call i32 ([1000 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([1000 x i32]*, ...)*)([1000 x i32]* %aaa)
   call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
@@ -156,19 +156,19 @@ entry:
   %call15 = call i32 (i32*, i32*, i32*, ...) bitcast (i32 (...)* @bar3 to i32 (i32*, i32*, i32*, ...)*)(i32* %d, i32* %e, i32* %f)
   call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
   %call16 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
-  call void @llvm.lifetime.end(i64 4000, i8* %7) #1
-  call void @llvm.lifetime.end(i64 4, i8* %6) #1
-  call void @llvm.lifetime.end(i64 4, i8* %5) #1
-  call void @llvm.lifetime.end(i64 4000, i8* %4) #1
-  call void @llvm.lifetime.end(i64 4, i8* %3) #1
-  call void @llvm.lifetime.end(i64 4000, i8* %2) #1
-  call void @llvm.lifetime.end(i64 120, i8* %1) #1
-  call void @llvm.lifetime.end(i64 4, i8* %0) #1
+  call void @llvm.lifetime.end.p0i8(i64 4000, i8* %7) #1
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %6) #1
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %5) #1
+  call void @llvm.lifetime.end.p0i8(i64 4000, i8* %4) #1
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #1
+  call void @llvm.lifetime.end.p0i8(i64 4000, i8* %2) #1
+  call void @llvm.lifetime.end.p0i8(i64 120, i8* %1) #1
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %0) #1
   ret void
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 
 declare i32 @check_a(...) #2
 declare i32 @bar1(...) #2
@@ -180,5 +180,5 @@ declare i32 @check_e(...) #2
 declare i32 @check_d(...) #2
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
diff --git a/test/CodeGen/X86/logical-load-fold.ll b/test/CodeGen/X86/logical-load-fold.ll
index 73930ca8bca1825b6a5ac2877a8ff2e525e96bed..5f06fce1b7b69df14edee2c7027b2af0290d47e2 100644
--- a/test/CodeGen/X86/logical-load-fold.ll
+++ b/test/CodeGen/X86/logical-load-fold.ll
@@ -15,14 +15,14 @@ define double @load_double_no_fold(double %x, double %y) {
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    cmplesd %xmm0, %xmm1
 ; SSE2-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2-NEXT:    andps %xmm1, %xmm0
+; SSE2-NEXT:    andpd %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; AVX-LABEL: load_double_no_fold:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmplesd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vandps %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    retq
 
   %cmp = fcmp oge double %x, %y
diff --git a/test/CodeGen/X86/longlong-deadload.ll b/test/CodeGen/X86/longlong-deadload.ll
index 3adaf49e372bb102f5d962049fd07038bf737215..01888f07306aceaeab31bc49d4dccb6505bfb53e 100644
--- a/test/CodeGen/X86/longlong-deadload.ll
+++ b/test/CodeGen/X86/longlong-deadload.ll
@@ -1,14 +1,20 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
-; This should not load or store the top part of *P.
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu | FileCheck %s
+; FIXME: This should not load or store the top part of *P.
 
 define void @test(i64* %P) nounwind  {
 ; CHECK-LABEL: test:
-; CHECK: movl 4(%esp), %[[REGISTER:.*]]
-; CHECK-NOT: 4(%[[REGISTER]])
-; CHECK: ret
-	%tmp1 = load i64, i64* %P, align 8		; <i64> [#uses=1]
-	%tmp2 = xor i64 %tmp1, 1		; <i64> [#uses=1]
-	store i64 %tmp2, i64* %P, align 8
-	ret void
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl (%eax), %ecx
+; CHECK-NEXT:    xorl $1, %ecx
+; CHECK-NEXT:    orl $2, %ecx
+; CHECK-NEXT:    movl %ecx, (%eax)
+; CHECK-NEXT:    retl
+        %tmp1 = load i64, i64* %P, align 8
+        %tmp2 = xor i64 %tmp1, 1
+        %tmp3 = or i64 %tmp2, 2
+        store i64 %tmp3, i64* %P, align 8
+        ret void
 }
 
diff --git a/test/CodeGen/X86/lzcnt-zext-cmp.ll b/test/CodeGen/X86/lzcnt-zext-cmp.ll
index c69dbf573f46d9f4e72f6e8dfcc3ec7f733bd2ce..7c961a98ad5587d136a78913360a6900d108af07 100644
--- a/test/CodeGen/X86/lzcnt-zext-cmp.ll
+++ b/test/CodeGen/X86/lzcnt-zext-cmp.ll
@@ -1,26 +1,19 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; Test patterns which generates lzcnt instructions.
 ; Eg: zext(or(setcc(cmp), setcc(cmp))) -> shr(or(lzcnt, lzcnt))
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver2 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver2 -mattr=-fast-lzcnt | FileCheck --check-prefix=NOFASTLZCNT %s
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=znver1 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=znver1 -mattr=-fast-lzcnt | FileCheck --check-prefix=NOFASTLZCNT %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver2 | FileCheck --check-prefix=ALL --check-prefix=FASTLZCNT %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=btver2 -mattr=-fast-lzcnt | FileCheck --check-prefix=ALL --check-prefix=NOFASTLZCNT %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=znver1 | FileCheck --check-prefix=ALL --check-prefix=FASTLZCNT %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=znver1 -mattr=-fast-lzcnt | FileCheck --check-prefix=ALL --check-prefix=NOFASTLZCNT %s
 
 ; Test one 32-bit input, output is 32-bit, no transformations expected.
 define i32 @test_zext_cmp0(i32 %a) {
-; CHECK-LABEL: test_zext_cmp0:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    testl %edi, %edi
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    retq
-;
-; NOFASTLZCNT-LABEL: test_zext_cmp0:
-; NOFASTLZCNT:       # BB#0: # %entry
-; NOFASTLZCNT-NEXT:    xorl %eax, %eax
-; NOFASTLZCNT-NEXT:    testl %edi, %edi
-; NOFASTLZCNT-NEXT:    sete %al
-; NOFASTLZCNT-NEXT:    retq
+; ALL-LABEL: test_zext_cmp0:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    xorl %eax, %eax
+; ALL-NEXT:    testl %edi, %edi
+; ALL-NEXT:    sete %al
+; ALL-NEXT:    retq
 entry:
   %cmp = icmp eq i32 %a, 0
   %conv = zext i1 %cmp to i32
@@ -29,13 +22,13 @@ entry:
 
 ; Test two 32-bit inputs, output is 32-bit.
 define i32 @test_zext_cmp1(i32 %a, i32 %b) {
-; CHECK-LABEL: test_zext_cmp1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    lzcntl %edi, %ecx
-; CHECK-NEXT:    lzcntl %esi, %eax
-; CHECK-NEXT:    orl %ecx, %eax
-; CHECK-NEXT:    shrl $5, %eax
-; CHECK-NEXT:    retq
+; FASTLZCNT-LABEL: test_zext_cmp1:
+; FASTLZCNT:       # BB#0:
+; FASTLZCNT-NEXT:    lzcntl %edi, %ecx
+; FASTLZCNT-NEXT:    lzcntl %esi, %eax
+; FASTLZCNT-NEXT:    orl %ecx, %eax
+; FASTLZCNT-NEXT:    shrl $5, %eax
+; FASTLZCNT-NEXT:    retq
 ;
 ; NOFASTLZCNT-LABEL: test_zext_cmp1:
 ; NOFASTLZCNT:       # BB#0:
@@ -55,13 +48,13 @@ define i32 @test_zext_cmp1(i32 %a, i32 %b) {
 
 ; Test two 64-bit inputs, output is 64-bit.
 define i64 @test_zext_cmp2(i64 %a, i64 %b) {
-; CHECK-LABEL: test_zext_cmp2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    lzcntq %rdi, %rcx
-; CHECK-NEXT:    lzcntq %rsi, %rax
-; CHECK-NEXT:    orl %ecx, %eax
-; CHECK-NEXT:    shrl $6, %eax
-; CHECK-NEXT:    retq
+; FASTLZCNT-LABEL: test_zext_cmp2:
+; FASTLZCNT:       # BB#0:
+; FASTLZCNT-NEXT:    lzcntq %rdi, %rcx
+; FASTLZCNT-NEXT:    lzcntq %rsi, %rax
+; FASTLZCNT-NEXT:    orl %ecx, %eax
+; FASTLZCNT-NEXT:    shrl $6, %eax
+; FASTLZCNT-NEXT:    retq
 ;
 ; NOFASTLZCNT-LABEL: test_zext_cmp2:
 ; NOFASTLZCNT:       # BB#0:
@@ -83,27 +76,16 @@ define i64 @test_zext_cmp2(i64 %a, i64 %b) {
 ; The transform is disabled for the 16-bit case, as we still have to clear the
 ; upper 16-bits, adding one more instruction.
 define i16 @test_zext_cmp3(i16 %a, i16 %b) {
-; CHECK-LABEL: test_zext_cmp3:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    testw %di, %di
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    testw %si, %si
-; CHECK-NEXT:    sete %cl
-; CHECK-NEXT:    orb %al, %cl
-; CHECK-NEXT:    movzbl %cl, %eax
-; CHECK-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; CHECK-NEXT:    retq
-;
-; NOFASTLZCNT-LABEL: test_zext_cmp3:
-; NOFASTLZCNT:       # BB#0:
-; NOFASTLZCNT-NEXT:    testw %di, %di
-; NOFASTLZCNT-NEXT:    sete %al
-; NOFASTLZCNT-NEXT:    testw %si, %si
-; NOFASTLZCNT-NEXT:    sete %cl
-; NOFASTLZCNT-NEXT:    orb %al, %cl
-; NOFASTLZCNT-NEXT:    movzbl %cl, %eax
-; NOFASTLZCNT-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; NOFASTLZCNT-NEXT:    retq
+; ALL-LABEL: test_zext_cmp3:
+; ALL:       # BB#0:
+; ALL-NEXT:    testw %di, %di
+; ALL-NEXT:    sete %al
+; ALL-NEXT:    testw %si, %si
+; ALL-NEXT:    sete %cl
+; ALL-NEXT:    orb %al, %cl
+; ALL-NEXT:    movzbl %cl, %eax
+; ALL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; ALL-NEXT:    retq
   %cmp = icmp eq i16 %a, 0
   %cmp1 = icmp eq i16 %b, 0
   %or = or i1 %cmp, %cmp1
@@ -113,13 +95,13 @@ define i16 @test_zext_cmp3(i16 %a, i16 %b) {
 
 ; Test two 32-bit inputs, output is 64-bit.
 define i64 @test_zext_cmp4(i32 %a, i32 %b) {
-; CHECK-LABEL: test_zext_cmp4:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    lzcntl %edi, %ecx
-; CHECK-NEXT:    lzcntl %esi, %eax
-; CHECK-NEXT:    orl %ecx, %eax
-; CHECK-NEXT:    shrl $5, %eax
-; CHECK-NEXT:    retq
+; FASTLZCNT-LABEL: test_zext_cmp4:
+; FASTLZCNT:       # BB#0: # %entry
+; FASTLZCNT-NEXT:    lzcntl %edi, %ecx
+; FASTLZCNT-NEXT:    lzcntl %esi, %eax
+; FASTLZCNT-NEXT:    orl %ecx, %eax
+; FASTLZCNT-NEXT:    shrl $5, %eax
+; FASTLZCNT-NEXT:    retq
 ;
 ; NOFASTLZCNT-LABEL: test_zext_cmp4:
 ; NOFASTLZCNT:       # BB#0: # %entry
@@ -140,14 +122,14 @@ entry:
 
 ; Test two 64-bit inputs, output is 32-bit.
 define i32 @test_zext_cmp5(i64 %a, i64 %b) {
-; CHECK-LABEL: test_zext_cmp5:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    lzcntq %rdi, %rcx
-; CHECK-NEXT:    lzcntq %rsi, %rax
-; CHECK-NEXT:    orl %ecx, %eax
-; CHECK-NEXT:    shrl $6, %eax
-; CHECK-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
-; CHECK-NEXT:    retq
+; FASTLZCNT-LABEL: test_zext_cmp5:
+; FASTLZCNT:       # BB#0: # %entry
+; FASTLZCNT-NEXT:    lzcntq %rdi, %rcx
+; FASTLZCNT-NEXT:    lzcntq %rsi, %rax
+; FASTLZCNT-NEXT:    orl %ecx, %eax
+; FASTLZCNT-NEXT:    shrl $6, %eax
+; FASTLZCNT-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; FASTLZCNT-NEXT:    retq
 ;
 ; NOFASTLZCNT-LABEL: test_zext_cmp5:
 ; NOFASTLZCNT:       # BB#0: # %entry
@@ -168,15 +150,15 @@ entry:
 
 ; Test three 32-bit inputs, output is 32-bit.
 define i32 @test_zext_cmp6(i32 %a, i32 %b, i32 %c) {
-; CHECK-LABEL: test_zext_cmp6:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    lzcntl %edi, %eax
-; CHECK-NEXT:    lzcntl %esi, %ecx
-; CHECK-NEXT:    orl %eax, %ecx
-; CHECK-NEXT:    lzcntl %edx, %eax
-; CHECK-NEXT:    orl %ecx, %eax
-; CHECK-NEXT:    shrl $5, %eax
-; CHECK-NEXT:    retq
+; FASTLZCNT-LABEL: test_zext_cmp6:
+; FASTLZCNT:       # BB#0: # %entry
+; FASTLZCNT-NEXT:    lzcntl %edi, %eax
+; FASTLZCNT-NEXT:    lzcntl %esi, %ecx
+; FASTLZCNT-NEXT:    orl %eax, %ecx
+; FASTLZCNT-NEXT:    lzcntl %edx, %eax
+; FASTLZCNT-NEXT:    orl %ecx, %eax
+; FASTLZCNT-NEXT:    shrl $5, %eax
+; FASTLZCNT-NEXT:    retq
 ;
 ; NOFASTLZCNT-LABEL: test_zext_cmp6:
 ; NOFASTLZCNT:       # BB#0: # %entry
@@ -203,15 +185,15 @@ entry:
 ; Test three 32-bit inputs, output is 32-bit, but compared to test_zext_cmp6 test,
 ; %.cmp2 inputs' order is inverted.
 define i32 @test_zext_cmp7(i32 %a, i32 %b, i32 %c) {
-; CHECK-LABEL: test_zext_cmp7:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    lzcntl %edi, %eax
-; CHECK-NEXT:    lzcntl %esi, %ecx
-; CHECK-NEXT:    orl %eax, %ecx
-; CHECK-NEXT:    lzcntl %edx, %eax
-; CHECK-NEXT:    orl %ecx, %eax
-; CHECK-NEXT:    shrl $5, %eax
-; CHECK-NEXT:    retq
+; FASTLZCNT-LABEL: test_zext_cmp7:
+; FASTLZCNT:       # BB#0: # %entry
+; FASTLZCNT-NEXT:    lzcntl %edi, %eax
+; FASTLZCNT-NEXT:    lzcntl %esi, %ecx
+; FASTLZCNT-NEXT:    orl %eax, %ecx
+; FASTLZCNT-NEXT:    lzcntl %edx, %eax
+; FASTLZCNT-NEXT:    orl %ecx, %eax
+; FASTLZCNT-NEXT:    shrl $5, %eax
+; FASTLZCNT-NEXT:    retq
 ;
 ; NOFASTLZCNT-LABEL: test_zext_cmp7:
 ; NOFASTLZCNT:       # BB#0: # %entry
@@ -237,17 +219,17 @@ entry:
 
 ; Test four 32-bit inputs, output is 32-bit.
 define i32 @test_zext_cmp8(i32 %a, i32 %b, i32 %c, i32 %d) {
-; CHECK-LABEL: test_zext_cmp8:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    lzcntl %edi, %eax
-; CHECK-NEXT:    lzcntl %esi, %esi
-; CHECK-NEXT:    lzcntl %edx, %edx
-; CHECK-NEXT:    orl %eax, %esi
-; CHECK-NEXT:    lzcntl %ecx, %eax
-; CHECK-NEXT:    orl %edx, %eax
-; CHECK-NEXT:    orl %esi, %eax
-; CHECK-NEXT:    shrl $5, %eax
-; CHECK-NEXT:    retq
+; FASTLZCNT-LABEL: test_zext_cmp8:
+; FASTLZCNT:       # BB#0: # %entry
+; FASTLZCNT-NEXT:    lzcntl %edi, %eax
+; FASTLZCNT-NEXT:    lzcntl %esi, %esi
+; FASTLZCNT-NEXT:    lzcntl %edx, %edx
+; FASTLZCNT-NEXT:    orl %eax, %esi
+; FASTLZCNT-NEXT:    lzcntl %ecx, %eax
+; FASTLZCNT-NEXT:    orl %edx, %eax
+; FASTLZCNT-NEXT:    orl %esi, %eax
+; FASTLZCNT-NEXT:    shrl $5, %eax
+; FASTLZCNT-NEXT:    retq
 ;
 ; NOFASTLZCNT-LABEL: test_zext_cmp8:
 ; NOFASTLZCNT:       # BB#0: # %entry
@@ -278,15 +260,15 @@ entry:
 
 ; Test one 32-bit input, one 64-bit input, output is 32-bit.
 define i32 @test_zext_cmp9(i32 %a, i64 %b) {
-; CHECK-LABEL: test_zext_cmp9:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    lzcntq %rsi, %rax
-; CHECK-NEXT:    lzcntl %edi, %ecx
-; CHECK-NEXT:    shrl $5, %ecx
-; CHECK-NEXT:    shrl $6, %eax
-; CHECK-NEXT:    orl %ecx, %eax
-; CHECK-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
-; CHECK-NEXT:    retq
+; FASTLZCNT-LABEL: test_zext_cmp9:
+; FASTLZCNT:       # BB#0: # %entry
+; FASTLZCNT-NEXT:    lzcntq %rsi, %rax
+; FASTLZCNT-NEXT:    lzcntl %edi, %ecx
+; FASTLZCNT-NEXT:    shrl $5, %ecx
+; FASTLZCNT-NEXT:    shrl $6, %eax
+; FASTLZCNT-NEXT:    orl %ecx, %eax
+; FASTLZCNT-NEXT:    # kill: %EAX<def> %EAX<kill> %RAX<kill>
+; FASTLZCNT-NEXT:    retq
 ;
 ; NOFASTLZCNT-LABEL: test_zext_cmp9:
 ; NOFASTLZCNT:       # BB#0: # %entry
@@ -307,25 +289,15 @@ entry:
 
 ; Test 2 128-bit inputs, output is 32-bit, no transformations expected.
 define i32 @test_zext_cmp10(i64 %a.coerce0, i64 %a.coerce1, i64 %b.coerce0, i64 %b.coerce1) {
-; CHECK-LABEL: test_zext_cmp10:
-; CHECK:       # BB#0: # %entry
-; CHECK-NEXT:    orq %rsi, %rdi
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    orq %rcx, %rdx
-; CHECK-NEXT:    sete %cl
-; CHECK-NEXT:    orb %al, %cl
-; CHECK-NEXT:    movzbl %cl, %eax
-; CHECK-NEXT:    retq
-;
-; NOFASTLZCNT-LABEL: test_zext_cmp10:
-; NOFASTLZCNT:       # BB#0: # %entry
-; NOFASTLZCNT-NEXT:    orq %rsi, %rdi
-; NOFASTLZCNT-NEXT:    sete %al
-; NOFASTLZCNT-NEXT:    orq %rcx, %rdx
-; NOFASTLZCNT-NEXT:    sete %cl
-; NOFASTLZCNT-NEXT:    orb %al, %cl
-; NOFASTLZCNT-NEXT:    movzbl %cl, %eax
-; NOFASTLZCNT-NEXT:    retq
+; ALL-LABEL: test_zext_cmp10:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    orq %rsi, %rdi
+; ALL-NEXT:    sete %al
+; ALL-NEXT:    orq %rcx, %rdx
+; ALL-NEXT:    sete %cl
+; ALL-NEXT:    orb %al, %cl
+; ALL-NEXT:    movzbl %cl, %eax
+; ALL-NEXT:    retq
 entry:
   %a.sroa.2.0.insert.ext = zext i64 %a.coerce1 to i128
   %a.sroa.2.0.insert.shift = shl nuw i128 %a.sroa.2.0.insert.ext, 64
@@ -341,3 +313,24 @@ entry:
   %lor.ext = zext i1 %0 to i32
   ret i32 %lor.ext
 }
+
+; PR31902 Fix a crash in combineOrCmpEqZeroToCtlzSrl under fast math.
+define i32 @test_zext_cmp11(double %a, double %b) "no-nans-fp-math"="true" {
+;
+; ALL-LABEL: test_zext_cmp11:
+; ALL:       # BB#0: # %entry
+; ALL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; ALL-NEXT:    vucomisd %xmm2, %xmm0
+; ALL-NEXT:    sete %al
+; ALL-NEXT:    vucomisd %xmm2, %xmm1
+; ALL-NEXT:    sete %cl
+; ALL-NEXT:    orb %al, %cl
+; ALL-NEXT:    movzbl %cl, %eax
+; ALL-NEXT:    retq
+entry:
+  %cmp = fcmp fast oeq double %a, 0.000000e+00
+  %cmp1 = fcmp fast oeq double %b, 0.000000e+00
+  %0 = or i1 %cmp, %cmp1
+  %conv = zext i1 %0 to i32
+  ret i32 %conv
+}
diff --git a/test/CodeGen/X86/machine-outliner-debuginfo.ll b/test/CodeGen/X86/machine-outliner-debuginfo.ll
new file mode 100644
index 0000000000000000000000000000000000000000..26a194764086d71bba139eeb61119f83d625a14c
--- /dev/null
+++ b/test/CodeGen/X86/machine-outliner-debuginfo.ll
@@ -0,0 +1,75 @@
+; RUN: llc -enable-machine-outliner -mtriple=x86_64-apple-darwin < %s | FileCheck %s
+
+@x = global i32 0, align 4, !dbg !0
+
+define i32 @main() #0 !dbg !11 {
+  ; CHECK-LABEL: _main:
+  %1 = alloca i32, align 4
+  %2 = alloca i32, align 4
+  %3 = alloca i32, align 4
+  %4 = alloca i32, align 4
+  %5 = alloca i32, align 4
+  ; There is a debug value in the middle of this section, make sure debug values are ignored.
+  ; CHECK: callq l_OUTLINED_FUNCTION_0
+  store i32 1, i32* %2, align 4
+  store i32 2, i32* %3, align 4
+  store i32 3, i32* %4, align 4
+  call void @llvm.dbg.value(metadata i32 10, i64 0, metadata !15, metadata !16), !dbg !17
+  store i32 4, i32* %5, align 4
+  store i32 0, i32* @x, align 4, !dbg !24
+  ; This is the same sequence of instructions without a debug value. It should be outlined
+  ; in the same way.
+  ; CHECK: callq l_OUTLINED_FUNCTION_0
+  store i32 1, i32* %2, align 4
+  store i32 2, i32* %3, align 4
+  store i32 3, i32* %4, align 4
+  store i32 4, i32* %5, align 4
+  store i32 1, i32* @x, align 4, !dbg !14
+  ret i32 0, !dbg !25
+}
+
+; CHECK-LABEL: l_OUTLINED_FUNCTION_0:
+; CHECK-NOT:  .loc  {{[0-9]+}} {{[0-9]+}} {{[0-9]+}} {{^(is_stmt)}}
+; CHECK-NOT:  ##DEBUG_VALUE: main:{{[a-z]}} <- {{[0-9]+}}
+; CHECK:      movl  $1, -{{[0-9]+}}(%rbp)
+; CHECK-NEXT: movl  $2, -{{[0-9]+}}(%rbp)
+; CHECK-NEXT: movl  $3, -{{[0-9]+}}(%rbp)
+; CHECK-NEXT: movl  $4, -{{[0-9]+}}(%rbp)
+; CHECK-NEXT: retq
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { noredzone nounwind ssp uwtable "no-frame-pointer-elim"="true" }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!7, !8, !9}
+!llvm.ident = !{!10}
+
+!0 = !DIGlobalVariableExpression(var: !1)
+!1 = distinct !DIGlobalVariable(name: "x", scope: !2, file: !3, line: 2, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 5.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "debug-test.c", directory: "dir")
+!4 = !{}
+!5 = !{!0}
+!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+!9 = !{i32 1, !"PIC Level", i32 2}
+!10 = !{!"clang version 5.0.0"}
+!11 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 4, type: !12, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, unit: !2, variables: !4)
+!12 = !DISubroutineType(types: !13)
+!13 = !{!6}
+!14 = !DILocation(line: 7, column: 4, scope: !11)
+!15 = !DILocalVariable(name: "a", scope: !11, file: !3, line: 5, type: !6)
+!16 = !DIExpression()
+!17 = !DILocation(line: 5, column: 6, scope: !11)
+!18 = !DILocalVariable(name: "b", scope: !11, file: !3, line: 5, type: !6)
+!19 = !DILocation(line: 5, column: 9, scope: !11)
+!20 = !DILocalVariable(name: "c", scope: !11, file: !3, line: 5, type: !6)
+!21 = !DILocation(line: 5, column: 12, scope: !11)
+!22 = !DILocalVariable(name: "d", scope: !11, file: !3, line: 5, type: !6)
+!23 = !DILocation(line: 5, column: 15, scope: !11)
+!24 = !DILocation(line: 14, column: 4, scope: !11)
+!25 = !DILocation(line: 21, column: 2, scope: !11)
diff --git a/test/CodeGen/X86/machine-outliner-tailcalls.ll b/test/CodeGen/X86/machine-outliner-tailcalls.ll
new file mode 100644
index 0000000000000000000000000000000000000000..020f7eeaaff3c5fc16ba839a1c2512bfc18368e6
--- /dev/null
+++ b/test/CodeGen/X86/machine-outliner-tailcalls.ll
@@ -0,0 +1,35 @@
+; RUN: llc -enable-machine-outliner -mtriple=x86_64-apple-darwin < %s | FileCheck %s
+
+@x = common local_unnamed_addr global i32 0, align 4
+
+define i32 @foo0(i32) local_unnamed_addr #0 {
+; CHECK-LABEL: _foo0:
+; CHECK: jmp l_OUTLINED_FUNCTION_0
+; CHECK-NEXT: .cfi_endproc
+  store i32 0, i32* @x, align 4, !tbaa !2
+  %2 = tail call i32 @ext(i32 1) #2
+  ret i32 undef
+}
+
+declare i32 @ext(i32) local_unnamed_addr #1
+
+define i32 @foo1(i32) local_unnamed_addr #0 {
+; CHECK-LABEL: _foo1:
+; CHECK: jmp l_OUTLINED_FUNCTION_0
+; CHECK-NEXT: .cfi_endproc
+  store i32 0, i32* @x, align 4, !tbaa !2
+  %2 = tail call i32 @ext(i32 1) #2
+  ret i32 undef
+}
+
+attributes #0 = { noredzone nounwind ssp uwtable "no-frame-pointer-elim"="false" }
+
+!2 = !{!3, !3, i64 0}
+!3 = !{!"int", !4, i64 0}
+!4 = !{!"omnipotent char", !5, i64 0}
+!5 = !{!"Simple C/C++ TBAA"}
+
+; CHECK-LABEL: l_OUTLINED_FUNCTION_0:
+; CHECK: movl  $0, (%rax)
+; CHECK-NEXT: movl  $1, %edi
+; CHECK-NEXT: jmp _ext
diff --git a/test/CodeGen/X86/machine-outliner.ll b/test/CodeGen/X86/machine-outliner.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9f8e6ec298f4ea334401a74639cb4c4b15107284
--- /dev/null
+++ b/test/CodeGen/X86/machine-outliner.ll
@@ -0,0 +1,110 @@
+; RUN: llc -enable-machine-outliner -mtriple=x86_64-apple-darwin < %s | FileCheck %s
+
+@x = global i32 0, align 4
+
+define i32 @check_boundaries() #0 {
+  ; CHECK-LABEL: _check_boundaries:
+  %1 = alloca i32, align 4
+  %2 = alloca i32, align 4
+  %3 = alloca i32, align 4
+  %4 = alloca i32, align 4
+  %5 = alloca i32, align 4
+  store i32 0, i32* %1, align 4
+  store i32 0, i32* %2, align 4
+  %6 = load i32, i32* %2, align 4
+  %7 = icmp ne i32 %6, 0
+  br i1 %7, label %9, label %8
+
+  ; CHECK: callq [[OFUNC1:l_OUTLINED_FUNCTION_[0-9]+]]
+  ; CHECK: cmpl  $0, -{{[0-9]+}}(%rbp)
+  store i32 1, i32* %2, align 4
+  store i32 2, i32* %3, align 4
+  store i32 3, i32* %4, align 4
+  store i32 4, i32* %5, align 4
+  br label %10
+
+  store i32 1, i32* %4, align 4
+  br label %10
+
+  %11 = load i32, i32* %2, align 4
+  %12 = icmp ne i32 %11, 0
+  br i1 %12, label %14, label %13
+
+  ; CHECK: callq [[OFUNC1]]
+  store i32 1, i32* %2, align 4
+  store i32 2, i32* %3, align 4
+  store i32 3, i32* %4, align 4
+  store i32 4, i32* %5, align 4
+  br label %15
+
+  store i32 1, i32* %4, align 4
+  br label %15
+
+  ret i32 0
+}
+
+define i32 @empty_1() #0 {
+  ; CHECK-LABEL: _empty_1:
+  ; CHECK-NOT: callq l_OUTLINED_FUNCTION_{{[0-9]+}}
+  ret i32 1
+}
+
+define i32 @empty_2() #0 {
+  ; CHECK-LABEL: _empty_2:
+  ; CHECK-NOT: callq l_OUTLINED_FUNCTION_{{[0-9]+}}
+  ret i32 1
+}
+
+define i32 @no_empty_outlining() #0 {
+  ; CHECK-LABEL: _no_empty_outlining:
+  %1 = alloca i32, align 4
+  store i32 0, i32* %1, align 4
+  ; CHECK-NOT: callq l_OUTLINED_FUNCTION_{{[0-9]+}}
+  %2 = call i32 @empty_1() #1
+  %3 = call i32 @empty_2() #1
+  %4 = call i32 @empty_1() #1
+  %5 = call i32 @empty_2() #1
+  %6 = call i32 @empty_1() #1
+  %7 = call i32 @empty_2() #1
+  ret i32 0
+}
+
+define i32 @main() #0 {
+  ; CHECK-LABEL: _main:
+  %1 = alloca i32, align 4
+  %2 = alloca i32, align 4
+  %3 = alloca i32, align 4
+  %4 = alloca i32, align 4
+  %5 = alloca i32, align 4
+
+  store i32 0, i32* %1, align 4
+  store i32 0, i32* @x, align 4
+  ; CHECK: callq [[OFUNC2:l_OUTLINED_FUNCTION_[0-9]+]]
+  store i32 1, i32* %2, align 4
+  store i32 2, i32* %3, align 4
+  store i32 3, i32* %4, align 4
+  store i32 4, i32* %5, align 4
+  store i32 1, i32* @x, align 4
+  ; CHECK: callq [[OFUNC2]]
+  store i32 1, i32* %2, align 4
+  store i32 2, i32* %3, align 4
+  store i32 3, i32* %4, align 4
+  store i32 4, i32* %5, align 4
+  ret i32 0
+}
+
+attributes #0 = { noredzone nounwind ssp uwtable "no-frame-pointer-elim"="true" }
+
+; CHECK-LABEL: l_OUTLINED_FUNCTION_1:
+; CHECK:      movl  $1, -{{[0-9]+}}(%rbp)
+; CHECK-NEXT: movl  $2, -{{[0-9]+}}(%rbp)
+; CHECK-NEXT: movl  $3, -{{[0-9]+}}(%rbp)
+; CHECK-NEXT: movl  $4, -{{[0-9]+}}(%rbp)
+; CHECK-NEXT: retq
+
+; CHECK-LABEL: l_OUTLINED_FUNCTION_0:
+; CHECK:      movl  $1, -{{[0-9]+}}(%rbp)
+; CHECK-NEXT: movl  $2, -{{[0-9]+}}(%rbp)
+; CHECK-NEXT: movl  $3, -{{[0-9]+}}(%rbp)
+; CHECK-NEXT: movl  $4, -{{[0-9]+}}(%rbp)
+; CHECK-NEXT: retq
diff --git a/test/CodeGen/X86/machine-region-info.mir b/test/CodeGen/X86/machine-region-info.mir
new file mode 100644
index 0000000000000000000000000000000000000000..0998fe97c2353178b5b3d1cfecf785e6b95a83d0
--- /dev/null
+++ b/test/CodeGen/X86/machine-region-info.mir
@@ -0,0 +1,83 @@
+# RUN: llc -mtriple=x86_64-- -run-pass=machine-region-info %s -debug-only=machine-region-info -o /dev/null 2>&1 | FileCheck %s
+# REQUIRES: asserts
+---
+name:            fun
+body:             |
+  bb.0:
+    successors: %bb.1, %bb.7
+
+    CMP32ri8 %edi, 40, implicit-def %eflags
+    JNE_1 %bb.7, implicit killed %eflags
+    JMP_1 %bb.1
+
+  bb.1:
+    successors: %bb.2, %bb.11
+
+    CMP32ri8 %edi, 1, implicit-def %eflags
+    JNE_1 %bb.11, implicit killed %eflags
+    JMP_1 %bb.2
+
+  bb.2:
+    successors: %bb.3, %bb.5
+
+    CMP32ri8 %edi, 2, implicit-def %eflags
+    JNE_1 %bb.5, implicit killed %eflags
+    JMP_1 %bb.3
+
+  bb.3:
+    successors: %bb.4, %bb.5
+
+    CMP32ri8 %edi, 90, implicit-def %eflags
+    JNE_1 %bb.5, implicit killed %eflags
+    JMP_1 %bb.4
+
+  bb.4:
+    successors: %bb.5
+
+  bb.5:
+    successors: %bb.6, %bb.11
+
+    CMP32ri8 %edi, 4, implicit-def %eflags
+    JNE_1 %bb.11, implicit killed %eflags
+    JMP_1 %bb.6
+
+  bb.6:
+    successors: %bb.11
+
+    JMP_1 %bb.11
+
+  bb.7:
+    successors: %bb.9, %bb.8
+
+    CMP32ri8 %edi, 5, implicit-def %eflags
+    JE_1 %bb.9, implicit killed %eflags
+    JMP_1 %bb.8
+
+  bb.8:
+    successors: %bb.9
+
+  bb.9:
+    successors: %bb.11, %bb.10
+
+    CMP32ri8 %edi, 6, implicit-def %eflags
+    JE_1 %bb.11, implicit killed %eflags
+    JMP_1 %bb.10
+
+  bb.10:
+    successors: %bb.11
+
+  bb.11:
+    RET 0
+
+...
+
+# CHECK: Region tree:
+# CHECK-NEXT: [0] BB#0 => <Function Return>
+# CHECK-NEXT:   [1] BB#0 => BB#11
+# CHECK-NEXT:     [2] BB#1 => BB#11
+# CHECK-NEXT:       [3] BB#2 => BB#5
+# CHECK-NEXT:         [4] BB#3 => BB#5
+# CHECK-NEXT:       [3] BB#5 => BB#11
+# CHECK-NEXT:     [2] BB#7 => BB#9
+# CHECK-NEXT:     [2] BB#9 => BB#11
+# CHECK-NEXT: End region tree
diff --git a/test/CodeGen/X86/machine-trace-metrics-crash.ll b/test/CodeGen/X86/machine-trace-metrics-crash.ll
index 5b7c5445316cbf15ee0c75bfd871c2fd8b811121..6369ee4eb0ef9da68ce333c4daffc3b58d4254a8 100644
--- a/test/CodeGen/X86/machine-trace-metrics-crash.ll
+++ b/test/CodeGen/X86/machine-trace-metrics-crash.ll
@@ -31,7 +31,7 @@ if.end:
   %add.i = fadd fast float %add, %n0
   store float %add.i, float* undef, align 4
   %n1 = bitcast %struct.A* %i to i8*
-  call void @llvm.lifetime.start(i64 16, i8* %n1)
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %n1)
   %n2 = load <2 x float>, <2 x float>* undef, align 8
   %conv = uitofp i1 %tobool to float
   %bitcast = extractelement <2 x float> %n2, i32 0
@@ -45,7 +45,7 @@ if.end:
 
 declare void @bar(float)
 declare void @foo(%struct.A*)
-declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
 
 !llvm.dbg.cu = !{!0}
diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fdc5ace8d9bcf799871438b2f0bee648bbe3dd69
--- /dev/null
+++ b/test/CodeGen/X86/madd.ll
@@ -0,0 +1,103 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=AVX512
+
+;SSE2-LABEL: @_Z10test_shortPsS_i
+;SSE2:        movdqu
+;SSE2-NEXT:   movdqu
+;SSE2-NEXT:   pmaddwd
+;SSE2-NEXT:   paddd
+
+;AVX2-LABEL: @_Z10test_shortPsS_i
+;AVX2:        vmovdqu
+;AVX2-NEXT:   vpmaddwd
+;AVX2-NEXT:   vinserti128
+;AVX2-NEXT:   vpaddd
+
+;AVX512-LABEL: @_Z10test_shortPsS_i
+;AVX512:        vmovdqu
+;AVX512-NEXT:   vpmaddwd
+;AVX512-NEXT:   vinserti128
+;AVX512-NEXT:   vpaddd
+
+define i32 @_Z10test_shortPsS_i(i16* nocapture readonly, i16* nocapture readonly, i32) local_unnamed_addr #0 {
+entry:
+  %3 = zext i32 %2 to i64
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %vec.phi = phi <8 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
+  %4 = getelementptr inbounds i16, i16* %0, i64 %index
+  %5 = bitcast i16* %4 to <8 x i16>*
+  %wide.load = load <8 x i16>, <8 x i16>* %5, align 2
+  %6 = sext <8 x i16> %wide.load to <8 x i32>
+  %7 = getelementptr inbounds i16, i16* %1, i64 %index
+  %8 = bitcast i16* %7 to <8 x i16>*
+  %wide.load14 = load <8 x i16>, <8 x i16>* %8, align 2
+  %9 = sext <8 x i16> %wide.load14 to <8 x i32>
+  %10 = mul nsw <8 x i32> %9, %6
+  %11 = add nsw <8 x i32> %10, %vec.phi
+  %index.next = add i64 %index, 8
+  %12 = icmp eq i64 %index.next, %3
+  br i1 %12, label %middle.block, label %vector.body
+
+middle.block:
+  %rdx.shuf = shufflevector <8 x i32> %11, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %11, %rdx.shuf
+  %rdx.shuf15 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx16 = add <8 x i32> %bin.rdx, %rdx.shuf15
+  %rdx.shuf17 = shufflevector <8 x i32> %bin.rdx16, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx18 = add <8 x i32> %bin.rdx16, %rdx.shuf17
+  %13 = extractelement <8 x i32> %bin.rdx18, i32 0
+  ret i32 %13
+}
+
+;AVX2-LABEL: @_Z9test_charPcS_i
+;AVX2:       vpmovsxbw
+;AVX2-NEXT:  vpmovsxbw
+;AVX2-NEXT:  vpmaddwd
+;AVX2-NEXT:  vpaddd
+
+;AVX512-LABEL: @_Z9test_charPcS_i
+;AVX512:       vpmovsxbw
+;AVX512-NEXT:  vpmovsxbw
+;AVX512-NEXT:  vpmaddwd
+;AVX512-NEXT:  vinserti64x4
+;AVX512-NEXT:  vpaddd
+
+define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i32) local_unnamed_addr #0 {
+entry:
+  %3 = zext i32 %2 to i64
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ %index.next, %vector.body ], [ 0, %entry ]
+  %vec.phi = phi <16 x i32> [ %11, %vector.body ], [ zeroinitializer, %entry ]
+  %4 = getelementptr inbounds i8, i8* %0, i64 %index
+  %5 = bitcast i8* %4 to <16 x i8>*
+  %wide.load = load <16 x i8>, <16 x i8>* %5, align 1
+  %6 = sext <16 x i8> %wide.load to <16 x i32>
+  %7 = getelementptr inbounds i8, i8* %1, i64 %index
+  %8 = bitcast i8* %7 to <16 x i8>*
+  %wide.load14 = load <16 x i8>, <16 x i8>* %8, align 1
+  %9 = sext <16 x i8> %wide.load14 to <16 x i32>
+  %10 = mul nsw <16 x i32> %9, %6
+  %11 = add nsw <16 x i32> %10, %vec.phi
+  %index.next = add i64 %index, 16
+  %12 = icmp eq i64 %index.next, %3
+  br i1 %12, label %middle.block, label %vector.body
+
+middle.block:
+  %rdx.shuf = shufflevector <16 x i32> %11, <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <16 x i32> %11, %rdx.shuf
+  %rdx.shuf15 = shufflevector <16 x i32> %bin.rdx, <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx16 = add <16 x i32> %bin.rdx, %rdx.shuf15
+  %rdx.shuf17 = shufflevector <16 x i32> %bin.rdx16, <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx18 = add <16 x i32> %bin.rdx16, %rdx.shuf17
+  %rdx.shuf19 = shufflevector <16 x i32> %bin.rdx18, <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx20 = add <16 x i32> %bin.rdx18, %rdx.shuf19
+  %13 = extractelement <16 x i32> %bin.rdx20, i32 0
+  ret i32 %13
+}
diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll
index 3192561716443cb3a2ff2f9079f1658789b2c5aa..1a15cab97e2e575edf9f74a9180b3425b4be3963 100644
--- a/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/test/CodeGen/X86/masked_gather_scatter.ll
@@ -233,6 +233,7 @@ define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
 ; KNL_64-NEXT:    kmovw %k1, %k2
 ; KNL_64-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
 ; KNL_64-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
+; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test5:
@@ -242,6 +243,7 @@ define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
 ; KNL_32-NEXT:    kmovw %k1, %k2
 ; KNL_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
 ; KNL_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
+; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test5:
@@ -250,6 +252,7 @@ define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
 ; SKX-NEXT:    kmovw %k1, %k2
 ; SKX-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
 ; SKX-NEXT:    vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test5:
@@ -259,6 +262,7 @@ define void @test5(i32* %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
 ; SKX_32-NEXT:    kmovw %k1, %k2
 ; SKX_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
 ; SKX_32-NEXT:    vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
+; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
 
   %broadcast.splatinsert = insertelement <16 x i32*> undef, i32* %base, i32 0
@@ -356,7 +360,7 @@ define <8 x i32> @test7(i32* %base, <8 x i32> %ind, i8 %mask) {
 ;
 ; SKX-LABEL: test7:
 ; SKX:       # BB#0:
-; SKX-NEXT:    kmovb %esi, %k1
+; SKX-NEXT:    kmovw %esi, %k1
 ; SKX-NEXT:    kmovw %k1, %k2
 ; SKX-NEXT:    vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
 ; SKX-NEXT:    vmovdqa %ymm1, %ymm2
@@ -714,8 +718,7 @@ define <16 x float> @test13(float* %base, <16 x i32> %ind) {
 define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
 ; KNL_64-LABEL: test14:
 ; KNL_64:       # BB#0:
-; KNL_64-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
-; KNL_64-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL_64-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
 ; KNL_64-NEXT:    vpbroadcastq %xmm0, %zmm0
 ; KNL_64-NEXT:    vmovd %esi, %xmm1
 ; KNL_64-NEXT:    vpbroadcastd %xmm1, %ymm1
@@ -731,8 +734,7 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
 ;
 ; KNL_32-LABEL: test14:
 ; KNL_32:       # BB#0:
-; KNL_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
-; KNL_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; KNL_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; KNL_32-NEXT:    vpbroadcastd %xmm0, %zmm0
 ; KNL_32-NEXT:    vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
 ; KNL_32-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
@@ -742,8 +744,7 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
 ;
 ; SKX-LABEL: test14:
 ; SKX:       # BB#0:
-; SKX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm1
-; SKX-NEXT:    vinserti64x2 $0, %xmm1, %zmm0, %zmm0
+; SKX-NEXT:    vpinsrq $1, %rdi, %xmm0, %xmm0
 ; SKX-NEXT:    vpbroadcastq %xmm0, %zmm0
 ; SKX-NEXT:    vpbroadcastd %esi, %ymm1
 ; SKX-NEXT:    vpmovsxdq %ymm1, %zmm1
@@ -758,8 +759,7 @@ define <16 x float> @test14(float* %base, i32 %ind, <16 x float*> %vec) {
 ;
 ; SKX_32-LABEL: test14:
 ; SKX_32:       # BB#0:
-; SKX_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm1
-; SKX_32-NEXT:    vinserti32x4 $0, %xmm1, %zmm0, %zmm0
+; SKX_32-NEXT:    vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; SKX_32-NEXT:    vpbroadcastd %xmm0, %zmm0
 ; SKX_32-NEXT:    vpslld $2, {{[0-9]+}}(%esp){1to16}, %zmm1
 ; SKX_32-NEXT:    vpaddd %zmm1, %zmm0, %zmm1
@@ -794,6 +794,7 @@ define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
 ; KNL_64-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; KNL_64-NEXT:    vgatherqps (%rdi,%zmm2,4), %ymm0 {%k1}
 ; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test15:
@@ -808,6 +809,7 @@ define <4 x float> @test15(float* %base, <4 x i32> %ind, <4 x i1> %mask) {
 ; KNL_32-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; KNL_32-NEXT:    vgatherqps (%eax,%zmm2,4), %ymm0 {%k1}
 ; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test15:
@@ -904,6 +906,7 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x
 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_64-NEXT:    vgatherqpd (%rdi,%zmm0,8), %zmm2 {%k1}
 ; KNL_64-NEXT:    vmovapd %xmm2, %xmm0
+; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test17:
@@ -917,6 +920,7 @@ define <2 x double> @test17(double* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x
 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vgatherqpd (%eax,%zmm0,8), %zmm2 {%k1}
 ; KNL_32-NEXT:    vmovapd %xmm2, %xmm0
+; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test17:
@@ -960,6 +964,7 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
 ; KNL_64-NEXT:    vpslld $31, %ymm2, %ymm2
 ; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test18:
@@ -973,6 +978,7 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
 ; KNL_32-NEXT:    vpslld $31, %ymm2, %ymm2
 ; KNL_32-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test18:
@@ -980,6 +986,7 @@ define void @test18(<4 x i32>%a1, <4 x i32*> %ptr, <4 x i1>%mask) {
 ; SKX-NEXT:    vpslld $31, %xmm2, %xmm2
 ; SKX-NEXT:    vptestmd %xmm2, %xmm2, %k1
 ; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test18:
@@ -1006,6 +1013,7 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind
 ; KNL_64-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_64-NEXT:    vscatterqpd %zmm0, (%rdi,%zmm2,8) {%k1}
+; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test19:
@@ -1021,6 +1029,7 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind
 ; KNL_32-NEXT:    vpsllq $63, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vscatterqpd %zmm0, (%eax,%zmm2,8) {%k1}
+; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test19:
@@ -1028,6 +1037,7 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind
 ; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
 ; SKX-NEXT:    vptestmd %xmm1, %xmm1, %k1
 ; SKX-NEXT:    vscatterqpd %ymm0, (%rdi,%ymm2,8) {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test19:
@@ -1036,6 +1046,7 @@ define void @test19(<4 x double>%a1, double* %ptr, <4 x i1>%mask, <4 x i64> %ind
 ; SKX_32-NEXT:    vptestmd %xmm1, %xmm1, %k1
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SKX_32-NEXT:    vscatterqpd %ymm0, (%eax,%ymm2,8) {%k1}
+; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
   %gep = getelementptr double, double* %ptr, <4 x i64> %ind
   call void @llvm.masked.scatter.v4f64(<4 x double> %a1, <4 x double*> %gep, i32 8, <4 x i1> %mask)
@@ -1055,6 +1066,7 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
 ; KNL_64-NEXT:    vpslld $31, %ymm2, %ymm2
 ; KNL_64-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; KNL_64-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test20:
@@ -1068,6 +1080,7 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
 ; KNL_32-NEXT:    vpslld $31, %ymm2, %ymm2
 ; KNL_32-NEXT:    vptestmd %zmm2, %zmm2, %k1
 ; KNL_32-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test20:
@@ -1078,6 +1091,7 @@ define void @test20(<2 x float>%a1, <2 x float*> %ptr, <2 x i1> %mask) {
 ; SKX-NEXT:    kshiftlb $6, %k0, %k0
 ; SKX-NEXT:    kshiftrb $6, %k0, %k1
 ; SKX-NEXT:    vscatterqps %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test20:
@@ -1105,6 +1119,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
 ; KNL_64-NEXT:    vpsllq $63, %zmm2, %zmm2
 ; KNL_64-NEXT:    vptestmq %zmm2, %zmm2, %k1
 ; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test21:
@@ -1116,6 +1131,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
 ; KNL_32-NEXT:    vpsllq $63, %zmm2, %zmm2
 ; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
 ; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test21:
@@ -1127,6 +1143,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
 ; SKX-NEXT:    kshiftrb $6, %k0, %k1
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test21:
@@ -1138,6 +1155,7 @@ define void @test21(<2 x i32>%a1, <2 x i32*> %ptr, <2 x i1>%mask) {
 ; SKX_32-NEXT:    kshiftrb $6, %k0, %k1
 ; SKX_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SKX_32-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
   call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> %mask)
   ret void
@@ -1161,6 +1179,7 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl
 ; KNL_64-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; KNL_64-NEXT:    vgatherqps (%rdi,%zmm0,4), %ymm2 {%k1}
 ; KNL_64-NEXT:    vmovaps %xmm2, %xmm0
+; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test22:
@@ -1176,6 +1195,7 @@ define <2 x float> @test22(float* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x fl
 ; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vgatherqps (%eax,%zmm0,4), %ymm2 {%k1}
 ; KNL_32-NEXT:    vmovaps %xmm2, %xmm0
+; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test22:
@@ -1221,6 +1241,7 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %
 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
 ; KNL_64-NEXT:    vmovdqa %xmm2, %xmm0
+; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test23:
@@ -1234,6 +1255,7 @@ define <2 x i32> @test23(i32* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %
 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
 ; KNL_32-NEXT:    vmovdqa %xmm2, %xmm0
+; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test23:
@@ -1266,6 +1288,7 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
 ; KNL_64-NEXT:    kmovw %eax, %k1
 ; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
 ; KNL_64-NEXT:    vmovdqa %xmm1, %xmm0
+; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test24:
@@ -1278,6 +1301,7 @@ define <2 x i32> @test24(i32* %base, <2 x i32> %ind) {
 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
 ; KNL_32-NEXT:    vmovdqa %xmm1, %xmm0
+; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test24:
@@ -1312,6 +1336,7 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %
 ; KNL_64-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm2 {%k1}
 ; KNL_64-NEXT:    vmovdqa %xmm2, %xmm0
+; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test25:
@@ -1325,6 +1350,7 @@ define <2 x i64> @test25(i64* %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %
 ; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm2 {%k1}
 ; KNL_32-NEXT:    vmovdqa %xmm2, %xmm0
+; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test25:
@@ -1359,6 +1385,7 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
 ; KNL_64-NEXT:    kmovw %eax, %k1
 ; KNL_64-NEXT:    vpgatherqq (%rdi,%zmm0,8), %zmm1 {%k1}
 ; KNL_64-NEXT:    vmovdqa %xmm1, %xmm0
+; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test26:
@@ -1372,6 +1399,7 @@ define <2 x i64> @test26(i64* %base, <2 x i32> %ind, <2 x i64> %src0) {
 ; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
 ; KNL_32-NEXT:    vpgatherqq (%eax,%zmm0,8), %zmm1 {%k1}
 ; KNL_32-NEXT:    vmovdqa %xmm1, %xmm0
+; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test26:
@@ -1405,6 +1433,7 @@ define <2 x float> @test27(float* %base, <2 x i32> %ind) {
 ; KNL_64-NEXT:    kmovw %eax, %k1
 ; KNL_64-NEXT:    vgatherqps (%rdi,%zmm1,4), %ymm0 {%k1}
 ; KNL_64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test27:
@@ -1416,13 +1445,14 @@ define <2 x float> @test27(float* %base, <2 x i32> %ind) {
 ; KNL_32-NEXT:    kmovw %ecx, %k1
 ; KNL_32-NEXT:    vgatherqps (%eax,%zmm1,4), %ymm0 {%k1}
 ; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test27:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
 ; SKX-NEXT:    movb $3, %al
-; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    kmovw %eax, %k1
 ; SKX-NEXT:    vgatherdps (%rdi,%xmm1,4), %xmm0 {%k1}
 ; SKX-NEXT:    retq
 ;
@@ -1431,7 +1461,7 @@ define <2 x float> @test27(float* %base, <2 x i32> %ind) {
 ; SKX_32-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[0,2,2,3]
 ; SKX_32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; SKX_32-NEXT:    movb $3, %cl
-; SKX_32-NEXT:    kmovb %ecx, %k1
+; SKX_32-NEXT:    kmovw %ecx, %k1
 ; SKX_32-NEXT:    vgatherdps (%eax,%xmm1,4), %xmm0 {%k1}
 ; SKX_32-NEXT:    retl
   %sext_ind = sext <2 x i32> %ind to <2 x i64>
@@ -1451,6 +1481,7 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
 ; KNL_64-NEXT:    movb $3, %al
 ; KNL_64-NEXT:    kmovw %eax, %k1
 ; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test28:
@@ -1462,24 +1493,27 @@ define void @test28(<2 x i32>%a1, <2 x i32*> %ptr) {
 ; KNL_32-NEXT:    vpsllq $63, %zmm2, %zmm2
 ; KNL_32-NEXT:    vptestmq %zmm2, %zmm2, %k1
 ; KNL_32-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k1}
+; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test28:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
-; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SKX-NEXT:    movb $3, %al
-; SKX-NEXT:    kmovb %eax, %k1
+; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SKX-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test28:
 ; SKX_32:       # BB#0:
 ; SKX_32-NEXT:    # kill: %XMM1<def> %XMM1<kill> %YMM1<def>
-; SKX_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SKX_32-NEXT:    movb $3, %al
-; SKX_32-NEXT:    kmovb %eax, %k1
+; SKX_32-NEXT:    kmovw %eax, %k1
+; SKX_32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SKX_32-NEXT:    vpscatterqd %xmm0, (,%ymm1) {%k1}
+; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
   call void @llvm.masked.scatter.v2i32(<2 x i32> %a1, <2 x i32*> %ptr, i32 4, <2 x i1> <i1 true, i1 true>)
   ret void
@@ -1657,12 +1691,12 @@ define <16 x i64> @test_gather_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i
 ; KNL_32-LABEL: test_gather_16i64:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    pushl %ebp
-; KNL_32-NEXT:  .Lcfi0:
+; KNL_32-NEXT:  .Lcfi4:
 ; KNL_32-NEXT:    .cfi_def_cfa_offset 8
-; KNL_32-NEXT:  .Lcfi1:
+; KNL_32-NEXT:  .Lcfi5:
 ; KNL_32-NEXT:    .cfi_offset %ebp, -8
 ; KNL_32-NEXT:    movl %esp, %ebp
-; KNL_32-NEXT:  .Lcfi2:
+; KNL_32-NEXT:  .Lcfi6:
 ; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT:    andl $-64, %esp
 ; KNL_32-NEXT:    subl $64, %esp
@@ -1780,12 +1814,12 @@ define <16 x double> @test_gather_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <
 ; KNL_32-LABEL: test_gather_16f64:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    pushl %ebp
-; KNL_32-NEXT:  .Lcfi3:
+; KNL_32-NEXT:  .Lcfi7:
 ; KNL_32-NEXT:    .cfi_def_cfa_offset 8
-; KNL_32-NEXT:  .Lcfi4:
+; KNL_32-NEXT:  .Lcfi8:
 ; KNL_32-NEXT:    .cfi_offset %ebp, -8
 ; KNL_32-NEXT:    movl %esp, %ebp
-; KNL_32-NEXT:  .Lcfi5:
+; KNL_32-NEXT:  .Lcfi9:
 ; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT:    andl $-64, %esp
 ; KNL_32-NEXT:    subl $64, %esp
@@ -1852,6 +1886,7 @@ define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %
 ; KNL_64-NEXT:    vpscatterqd %ymm3, (,%zmm0) {%k1}
 ; KNL_64-NEXT:    vextracti64x4 $1, %zmm3, %ymm0
 ; KNL_64-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k2}
+; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test_scatter_16i32:
@@ -1860,6 +1895,7 @@ define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %
 ; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vpscatterdd %zmm2, (,%zmm0) {%k1}
+; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test_scatter_16i32:
@@ -1871,6 +1907,7 @@ define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %
 ; SKX-NEXT:    vpscatterqd %ymm3, (,%zmm0) {%k1}
 ; SKX-NEXT:    vextracti32x8 $1, %zmm3, %ymm0
 ; SKX-NEXT:    vpscatterqd %ymm0, (,%zmm1) {%k2}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test_scatter_16i32:
@@ -1879,6 +1916,7 @@ define void @test_scatter_16i32(<16 x i32*> %ptrs, <16 x i1> %mask, <16 x i32> %
 ; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; SKX_32-NEXT:    vpscatterdd %zmm2, (,%zmm0) {%k1}
+; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
   call void @llvm.masked.scatter.v16i32(<16 x i32> %src0, <16 x i32*> %ptrs, i32 4, <16 x i1> %mask)
   ret void
@@ -1892,17 +1930,18 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
 ; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
 ; KNL_64-NEXT:    vpscatterqq %zmm3, (,%zmm0) {%k1}
 ; KNL_64-NEXT:    vpscatterqq %zmm4, (,%zmm1) {%k2}
+; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test_scatter_16i64:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    pushl %ebp
-; KNL_32-NEXT:  .Lcfi6:
+; KNL_32-NEXT:  .Lcfi10:
 ; KNL_32-NEXT:    .cfi_def_cfa_offset 8
-; KNL_32-NEXT:  .Lcfi7:
+; KNL_32-NEXT:  .Lcfi11:
 ; KNL_32-NEXT:    .cfi_offset %ebp, -8
 ; KNL_32-NEXT:    movl %esp, %ebp
-; KNL_32-NEXT:  .Lcfi8:
+; KNL_32-NEXT:  .Lcfi12:
 ; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT:    andl $-64, %esp
 ; KNL_32-NEXT:    subl $64, %esp
@@ -1916,6 +1955,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
 ; KNL_32-NEXT:    vpscatterdq %zmm1, (,%ymm0) {%k2}
 ; KNL_32-NEXT:    movl %ebp, %esp
 ; KNL_32-NEXT:    popl %ebp
+; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test_scatter_16i64:
@@ -1926,6 +1966,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
 ; SKX-NEXT:    kshiftrw $8, %k1, %k2
 ; SKX-NEXT:    vpscatterqq %zmm3, (,%zmm0) {%k1}
 ; SKX-NEXT:    vpscatterqq %zmm4, (,%zmm1) {%k2}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test_scatter_16i64:
@@ -1950,6 +1991,7 @@ define void @test_scatter_16i64(<16 x i64*> %ptrs, <16 x i1> %mask, <16 x i64> %
 ; SKX_32-NEXT:    vpscatterdq %zmm1, (,%ymm0) {%k2}
 ; SKX_32-NEXT:    movl %ebp, %esp
 ; SKX_32-NEXT:    popl %ebp
+; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
   call void @llvm.masked.scatter.v16i64(<16 x i64> %src0, <16 x i64*> %ptrs, i32 4, <16 x i1> %mask)
   ret void
@@ -1965,6 +2007,7 @@ define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x floa
 ; KNL_64-NEXT:    vscatterqps %ymm3, (,%zmm0) {%k1}
 ; KNL_64-NEXT:    vextractf64x4 $1, %zmm3, %ymm0
 ; KNL_64-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k2}
+; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test_scatter_16f32:
@@ -1973,6 +2016,7 @@ define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x floa
 ; KNL_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; KNL_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; KNL_32-NEXT:    vscatterdps %zmm2, (,%zmm0) {%k1}
+; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test_scatter_16f32:
@@ -1984,6 +2028,7 @@ define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x floa
 ; SKX-NEXT:    vscatterqps %ymm3, (,%zmm0) {%k1}
 ; SKX-NEXT:    vextractf32x8 $1, %zmm3, %ymm0
 ; SKX-NEXT:    vscatterqps %ymm0, (,%zmm1) {%k2}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test_scatter_16f32:
@@ -1992,6 +2037,7 @@ define void @test_scatter_16f32(<16 x float*> %ptrs, <16 x i1> %mask, <16 x floa
 ; SKX_32-NEXT:    vpslld $31, %zmm1, %zmm1
 ; SKX_32-NEXT:    vptestmd %zmm1, %zmm1, %k1
 ; SKX_32-NEXT:    vscatterdps %zmm2, (,%zmm0) {%k1}
+; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
   call void @llvm.masked.scatter.v16f32(<16 x float> %src0, <16 x float*> %ptrs, i32 4, <16 x i1> %mask)
   ret void
@@ -2006,17 +2052,18 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
 ; KNL_64-NEXT:    kshiftrw $8, %k1, %k2
 ; KNL_64-NEXT:    vscatterqpd %zmm3, (,%zmm0) {%k1}
 ; KNL_64-NEXT:    vscatterqpd %zmm4, (,%zmm1) {%k2}
+; KNL_64-NEXT:    vzeroupper
 ; KNL_64-NEXT:    retq
 ;
 ; KNL_32-LABEL: test_scatter_16f64:
 ; KNL_32:       # BB#0:
 ; KNL_32-NEXT:    pushl %ebp
-; KNL_32-NEXT:  .Lcfi9:
+; KNL_32-NEXT:  .Lcfi13:
 ; KNL_32-NEXT:    .cfi_def_cfa_offset 8
-; KNL_32-NEXT:  .Lcfi10:
+; KNL_32-NEXT:  .Lcfi14:
 ; KNL_32-NEXT:    .cfi_offset %ebp, -8
 ; KNL_32-NEXT:    movl %esp, %ebp
-; KNL_32-NEXT:  .Lcfi11:
+; KNL_32-NEXT:  .Lcfi15:
 ; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
 ; KNL_32-NEXT:    andl $-64, %esp
 ; KNL_32-NEXT:    subl $64, %esp
@@ -2030,6 +2077,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
 ; KNL_32-NEXT:    vscatterdpd %zmm1, (,%ymm0) {%k2}
 ; KNL_32-NEXT:    movl %ebp, %esp
 ; KNL_32-NEXT:    popl %ebp
+; KNL_32-NEXT:    vzeroupper
 ; KNL_32-NEXT:    retl
 ;
 ; SKX-LABEL: test_scatter_16f64:
@@ -2040,6 +2088,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
 ; SKX-NEXT:    kshiftrw $8, %k1, %k2
 ; SKX-NEXT:    vscatterqpd %zmm3, (,%zmm0) {%k1}
 ; SKX-NEXT:    vscatterqpd %zmm4, (,%zmm1) {%k2}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
 ;
 ; SKX_32-LABEL: test_scatter_16f64:
@@ -2064,6 +2113,7 @@ define void @test_scatter_16f64(<16 x double*> %ptrs, <16 x i1> %mask, <16 x dou
 ; SKX_32-NEXT:    vscatterdpd %zmm1, (,%ymm0) {%k2}
 ; SKX_32-NEXT:    movl %ebp, %esp
 ; SKX_32-NEXT:    popl %ebp
+; SKX_32-NEXT:    vzeroupper
 ; SKX_32-NEXT:    retl
   call void @llvm.masked.scatter.v16f64(<16 x double> %src0, <16 x double*> %ptrs, i32 4, <16 x i1> %mask)
   ret void
@@ -2086,6 +2136,34 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
 ; KNL_64-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; KNL_64-NEXT:    retq
 ;
+; KNL_32-LABEL: test_pr28312:
+; KNL_32:       # BB#0:
+; KNL_32-NEXT:    pushl %ebp
+; KNL_32-NEXT:  .Lcfi16:
+; KNL_32-NEXT:    .cfi_def_cfa_offset 8
+; KNL_32-NEXT:  .Lcfi17:
+; KNL_32-NEXT:    .cfi_offset %ebp, -8
+; KNL_32-NEXT:    movl %esp, %ebp
+; KNL_32-NEXT:  .Lcfi18:
+; KNL_32-NEXT:    .cfi_def_cfa_register %ebp
+; KNL_32-NEXT:    andl $-32, %esp
+; KNL_32-NEXT:    subl $32, %esp
+; KNL_32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; KNL_32-NEXT:    vpslld $31, %xmm1, %xmm1
+; KNL_32-NEXT:    vpsrad $31, %xmm1, %xmm1
+; KNL_32-NEXT:    vpmovsxdq %xmm1, %ymm1
+; KNL_32-NEXT:    vpxord %zmm2, %zmm2, %zmm2
+; KNL_32-NEXT:    vinserti64x4 $0, %ymm1, %zmm2, %zmm1
+; KNL_32-NEXT:    vpmovsxdq %ymm0, %zmm0
+; KNL_32-NEXT:    vpsllq $63, %zmm1, %zmm1
+; KNL_32-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; KNL_32-NEXT:    vpgatherqq (,%zmm0), %zmm1 {%k1}
+; KNL_32-NEXT:    vpaddq %ymm1, %ymm1, %ymm0
+; KNL_32-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; KNL_32-NEXT:    movl %ebp, %esp
+; KNL_32-NEXT:    popl %ebp
+; KNL_32-NEXT:    retl
+;
 ; SKX-LABEL: test_pr28312:
 ; SKX:       # BB#0:
 ; SKX-NEXT:    vpslld $31, %xmm1, %xmm1
@@ -2094,6 +2172,27 @@ define <4 x i64> @test_pr28312(<4 x i64*> %p1, <4 x i1> %k, <4 x i1> %k2,<4 x i6
 ; SKX-NEXT:    vpaddq %ymm1, %ymm1, %ymm0
 ; SKX-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
 ; SKX-NEXT:    retq
+;
+; SKX_32-LABEL: test_pr28312:
+; SKX_32:       # BB#0:
+; SKX_32-NEXT:    pushl %ebp
+; SKX_32-NEXT:  .Lcfi13:
+; SKX_32-NEXT:    .cfi_def_cfa_offset 8
+; SKX_32-NEXT:  .Lcfi14:
+; SKX_32-NEXT:    .cfi_offset %ebp, -8
+; SKX_32-NEXT:    movl %esp, %ebp
+; SKX_32-NEXT:  .Lcfi15:
+; SKX_32-NEXT:    .cfi_def_cfa_register %ebp
+; SKX_32-NEXT:    andl $-32, %esp
+; SKX_32-NEXT:    subl $32, %esp
+; SKX_32-NEXT:    vpslld $31, %xmm1, %xmm1
+; SKX_32-NEXT:    vptestmd %xmm1, %xmm1, %k1
+; SKX_32-NEXT:    vpgatherdq (,%xmm0), %ymm1 {%k1}
+; SKX_32-NEXT:    vpaddq %ymm1, %ymm1, %ymm0
+; SKX_32-NEXT:    vpaddq %ymm0, %ymm1, %ymm0
+; SKX_32-NEXT:    movl %ebp, %esp
+; SKX_32-NEXT:    popl %ebp
+; SKX_32-NEXT:    retl
   %g1 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
   %g2 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
   %g3 = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %p1, i32 8, <4 x i1> %k, <4 x i64> undef)
diff --git a/test/CodeGen/X86/masked_memop.ll b/test/CodeGen/X86/masked_memop.ll
index 15ba276e14b7bc6f6f85add8c7371df79f460591..3c616e8a9f43983a2ab648d3520d74fb66a664e0 100644
--- a/test/CodeGen/X86/masked_memop.ll
+++ b/test/CodeGen/X86/masked_memop.ll
@@ -415,6 +415,7 @@ define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
 ; AVX512F-NEXT:    kshiftlw $8, %k0, %k0
 ; AVX512F-NEXT:    kshiftrw $8, %k0, %k1
 ; AVX512F-NEXT:    vmovdqu32 %zmm1, (%rdi) {%k1}
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: test12:
@@ -422,6 +423,7 @@ define void @test12(<8 x i32> %trigger, <8 x i32>* %addr, <8 x i32> %val) {
 ; SKX-NEXT:    vpxor %ymm2, %ymm2, %ymm2
 ; SKX-NEXT:    vpcmpeqd %ymm2, %ymm0, %k1
 ; SKX-NEXT:    vmovdqu32 %ymm1, (%rdi) {%k1}
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   %mask = icmp eq <8 x i32> %trigger, zeroinitializer
   call void @llvm.masked.store.v8i32.p0v8i32(<8 x i32>%val, <8 x i32>* %addr, i32 4, <8 x i1>%mask)
@@ -695,7 +697,7 @@ define <4 x float> @mload_constmask_v4f32(<4 x float>* %addr, <4 x float> %dst)
 ; SKX-LABEL: mload_constmask_v4f32:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    movb $13, %al
-; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vmovups (%rdi), %xmm0 {%k1}
 ; SKX-NEXT:    retq
   %res = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 1, i1 1>, <4 x float> %dst)
@@ -729,7 +731,7 @@ define <4 x i32> @mload_constmask_v4i32(<4 x i32>* %addr, <4 x i32> %dst) {
 ; SKX-LABEL: mload_constmask_v4i32:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    movb $14, %al
-; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vmovdqu32 (%rdi), %xmm0 {%k1}
 ; SKX-NEXT:    retq
   %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 1>, <4 x i32> %dst)
@@ -758,7 +760,7 @@ define <8 x float> @mload_constmask_v8f32(<8 x float>* %addr, <8 x float> %dst)
 ; SKX-LABEL: mload_constmask_v8f32:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    movb $7, %al
-; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vmovups (%rdi), %ymm0 {%k1}
 ; SKX-NEXT:    retq
   %res = call <8 x float> @llvm.masked.load.v8f32.p0v8f32(<8 x float>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 0>, <8 x float> %dst)
@@ -783,7 +785,7 @@ define <4 x double> @mload_constmask_v4f64(<4 x double>* %addr, <4 x double> %ds
 ; SKX-LABEL: mload_constmask_v4f64:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    movb $7, %al
-; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vmovupd (%rdi), %ymm0 {%k1}
 ; SKX-NEXT:    retq
   %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> %dst)
@@ -815,7 +817,7 @@ define <8 x i32> @mload_constmask_v8i32(<8 x i32>* %addr, <8 x i32> %dst) {
 ; SKX-LABEL: mload_constmask_v8i32:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    movb $-121, %al
-; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vmovdqu32 (%rdi), %ymm0 {%k1}
 ; SKX-NEXT:    retq
   %res = call <8 x i32> @llvm.masked.load.v8i32.p0v8i32(<8 x i32>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x i32> %dst)
@@ -843,7 +845,7 @@ define <4 x i64> @mload_constmask_v4i64(<4 x i64>* %addr, <4 x i64> %dst) {
 ; SKX-LABEL: mload_constmask_v4i64:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    movb $9, %al
-; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1}
 ; SKX-NEXT:    retq
   %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 1, i1 0, i1 0, i1 1>, <4 x i64> %dst)
@@ -859,12 +861,19 @@ define <8 x double> @mload_constmask_v8f64(<8 x double>* %addr, <8 x double> %ds
 ; AVX-NEXT:    vblendpd {{.*#+}} ymm0 = mem[0,1,2],ymm0[3]
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: mload_constmask_v8f64:
-; AVX512:       ## BB#0:
-; AVX512-NEXT:    movb $-121, %al
-; AVX512-NEXT:    kmovw %eax, %k1
-; AVX512-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: mload_constmask_v8f64:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    movb $-121, %al
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
+; AVX512F-NEXT:    retq
+;
+; SKX-LABEL: mload_constmask_v8f64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    movb $-121, %al
+; SKX-NEXT:    kmovd %eax, %k1
+; SKX-NEXT:    vmovupd (%rdi), %zmm0 {%k1}
+; SKX-NEXT:    retq
   %res = call <8 x double> @llvm.masked.load.v8f64.p0v8f64(<8 x double>* %addr, i32 4, <8 x i1> <i1 1, i1 1, i1 1, i1 0, i1 0, i1 0, i1 0, i1 1>, <8 x double> %dst)
   ret <8 x double> %res
 }
@@ -887,7 +896,7 @@ define <4 x double> @mload_constmask_v4f64_undef_passthrough(<4 x double>* %addr
 ; SKX-LABEL: mload_constmask_v4f64_undef_passthrough:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    movb $7, %al
-; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vmovupd (%rdi), %ymm0 {%k1} {z}
 ; SKX-NEXT:    retq
   %res = call <4 x double> @llvm.masked.load.v4f64.p0v4f64(<4 x double>* %addr, i32 4, <4 x i1> <i1 1, i1 1, i1 1, i1 0>, <4 x double> undef)
@@ -916,7 +925,7 @@ define <4 x i64> @mload_constmask_v4i64_undef_passthrough(<4 x i64>* %addr) {
 ; SKX-LABEL: mload_constmask_v4i64_undef_passthrough:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    movb $6, %al
-; SKX-NEXT:    kmovw %eax, %k1
+; SKX-NEXT:    kmovd %eax, %k1
 ; SKX-NEXT:    vmovdqu64 (%rdi), %ymm0 {%k1} {z}
 ; SKX-NEXT:    retq
   %res = call <4 x i64> @llvm.masked.load.v4i64.p0v4i64(<4 x i64>* %addr, i32 4, <4 x i1> <i1 0, i1 1, i1 1, i1 0>, <4 x i64> undef)
@@ -998,12 +1007,14 @@ define void @one_mask_bit_set3(<4 x i64>* %addr, <4 x i64> %val) {
 ; AVX512F:       ## BB#0:
 ; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vmovlps %xmm0, 16(%rdi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; SKX-LABEL: one_mask_bit_set3:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    vextracti128 $1, %ymm0, %xmm0
 ; SKX-NEXT:    vmovq %xmm0, 16(%rdi)
+; SKX-NEXT:    vzeroupper
 ; SKX-NEXT:    retq
   call void @llvm.masked.store.v4i64.p0v4i64(<4 x i64> %val, <4 x i64>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
   ret void
@@ -1023,6 +1034,7 @@ define void @one_mask_bit_set4(<4 x double>* %addr, <4 x double> %val) {
 ; AVX512:       ## BB#0:
 ; AVX512-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512-NEXT:    vmovhpd %xmm0, 24(%rdi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   call void @llvm.masked.store.v4f64.p0v4f64(<4 x double> %val, <4 x double>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 false, i1 true>)
   ret void
@@ -1042,6 +1054,7 @@ define void @one_mask_bit_set5(<8 x double>* %addr, <8 x double> %val) {
 ; AVX512:       ## BB#0:
 ; AVX512-NEXT:    vextractf32x4 $3, %zmm0, %xmm0
 ; AVX512-NEXT:    vmovlps %xmm0, 48(%rdi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   call void @llvm.masked.store.v8f64.p0v8f64(<8 x double> %val, <8 x double>* %addr, i32 4, <8 x i1><i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 true, i1 false>)
   ret void
diff --git a/test/CodeGen/X86/mature-mc-support.ll b/test/CodeGen/X86/mature-mc-support.ll
index 9d956f46becafc837b1ca9048643966d7b43bc91..3d6f0f66c1876585ba59458cf511a3426d0bc863 100644
--- a/test/CodeGen/X86/mature-mc-support.ll
+++ b/test/CodeGen/X86/mature-mc-support.ll
@@ -15,4 +15,4 @@
 
 module asm "	.this_directive_is_very_unlikely_to_exist"
 
-; CHECK: LLVM ERROR: Error parsing inline asm
+; CHECK: error: unknown directive
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index 6a51d60f636c3b4b1145cf9c2f6892464a458f81..ce1bb3b06ce57489ee6ae5c902e6c516f1e7fd60 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -1,130 +1,368 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; RUN: llc < %s -disable-simplify-libcalls -mtriple=x86_64-linux | FileCheck %s --check-prefix=NOBUILTIN
-; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=AVX2 | FileCheck %s --check-prefix=X64 --check-prefix=AVX2
 
 ; This tests codegen time inlining/optimization of memcmp
 ; rdar://6480398
 
-@.str = private constant [23 x i8] c"fooooooooooooooooooooo\00", align 1 ; <[23 x i8]*> [#uses=1]
+@.str = private constant [65 x i8] c"0123456789012345678901234567890123456789012345678901234567890123\00", align 1
 
-declare i32 @memcmp(...)
+declare i32 @memcmp(i8*, i8*, i64)
 
-define void @memcmp2(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
-entry:
-  %0 = tail call i32 (...) @memcmp(i8* %X, i8* %Y, i32 2) nounwind ; <i32> [#uses=1]
-  %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
-  br i1 %1, label %return, label %bb
+define i1 @length2(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
+; X32-LABEL: length2:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movzwl (%ecx), %ecx
+; X32-NEXT:    cmpw (%eax), %cx
+; X32-NEXT:    sete %al
+; X32-NEXT:    retl
+;
+; X64-LABEL: length2:
+; X64:       # BB#0:
+; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    cmpw (%rsi), %ax
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
 
-bb:                                               ; preds = %entry
-  store i32 4, i32* %P, align 4
-  ret void
+define i1 @length2_const(i8* %X, i32* nocapture %P) nounwind {
+; X32-LABEL: length2_const:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movzwl (%eax), %eax
+; X32-NEXT:    cmpl $12849, %eax # imm = 0x3231
+; X32-NEXT:    setne %al
+; X32-NEXT:    retl
+;
+; X64-LABEL: length2_const:
+; X64:       # BB#0:
+; X64-NEXT:    movzwl (%rdi), %eax
+; X64-NEXT:    cmpl $12849, %eax # imm = 0x3231
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 2) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
+}
 
-return:                                           ; preds = %entry
-  ret void
-; CHECK-LABEL: memcmp2:
-; CHECK: movzwl
-; CHECK-NEXT: cmpw
-; NOBUILTIN-LABEL: memcmp2:
-; NOBUILTIN: callq
+define i1 @length2_nobuiltin_attr(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
+; X32-LABEL: length2_nobuiltin_attr:
+; X32:       # BB#0:
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $2
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    calll memcmp
+; X32-NEXT:    addl $16, %esp
+; X32-NEXT:    testl %eax, %eax
+; X32-NEXT:    sete %al
+; X32-NEXT:    retl
+;
+; X64-LABEL: length2_nobuiltin_attr:
+; X64:       # BB#0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movl $2, %edx
+; X64-NEXT:    callq memcmp
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    sete %al
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 2) nounwind nobuiltin
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
 }
 
-define void @memcmp2a(i8* %X, i32* nocapture %P) nounwind {
-entry:
-  %0 = tail call i32 (...) @memcmp(i8* %X, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 1), i32 2) nounwind ; <i32> [#uses=1]
-  %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
-  br i1 %1, label %return, label %bb
-
-bb:                                               ; preds = %entry
-  store i32 4, i32* %P, align 4
-  ret void
-
-return:                                           ; preds = %entry
-  ret void
-; CHECK-LABEL: memcmp2a:
-; CHECK: movzwl
-; CHECK-NEXT: cmpl    $28527,
+define i1 @length4(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
+; X32-LABEL: length4:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl (%ecx), %ecx
+; X32-NEXT:    cmpl (%eax), %ecx
+; X32-NEXT:    setne %al
+; X32-NEXT:    retl
+;
+; X64-LABEL: length4:
+; X64:       # BB#0:
+; X64-NEXT:    movl (%rdi), %eax
+; X64-NEXT:    cmpl (%rsi), %eax
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 4) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
 }
 
-define void @memcmp2nb(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
-entry:
-  %0 = tail call i32 (...) @memcmp(i8* %X, i8* %Y, i32 2) nounwind nobuiltin ; <i32> [#uses=1]
-  %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
-  br i1 %1, label %return, label %bb
+define i1 @length4_const(i8* %X, i32* nocapture %P) nounwind {
+; X32-LABEL: length4_const:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    cmpl $875770417, (%eax) # imm = 0x34333231
+; X32-NEXT:    sete %al
+; X32-NEXT:    retl
+;
+; X64-LABEL: length4_const:
+; X64:       # BB#0:
+; X64-NEXT:    cmpl $875770417, (%rdi) # imm = 0x34333231
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 1), i64 4) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
 
-bb:                                               ; preds = %entry
-  store i32 4, i32* %P, align 4
-  ret void
+define i1 @length8(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
+; X32-LABEL: length8:
+; X32:       # BB#0:
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $8
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    calll memcmp
+; X32-NEXT:    addl $16, %esp
+; X32-NEXT:    testl %eax, %eax
+; X32-NEXT:    sete %al
+; X32-NEXT:    retl
+;
+; X64-LABEL: length8:
+; X64:       # BB#0:
+; X64-NEXT:    movq (%rdi), %rax
+; X64-NEXT:    cmpq (%rsi), %rax
+; X64-NEXT:    sete %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* %Y, i64 8) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
 
-return:                                           ; preds = %entry
-  ret void
-; CHECK-LABEL: memcmp2nb:
-; CHECK: callq
+define i1 @length8_const(i8* %X, i32* nocapture %P) nounwind {
+; X32-LABEL: length8_const:
+; X32:       # BB#0:
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $8
+; X32-NEXT:    pushl $.L.str
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    calll memcmp
+; X32-NEXT:    addl $16, %esp
+; X32-NEXT:    testl %eax, %eax
+; X32-NEXT:    setne %al
+; X32-NEXT:    retl
+;
+; X64-LABEL: length8_const:
+; X64:       # BB#0:
+; X64-NEXT:    movabsq $3978425819141910832, %rax # imm = 0x3736353433323130
+; X64-NEXT:    cmpq %rax, (%rdi)
+; X64-NEXT:    setne %al
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 8) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
 }
 
-define void @memcmp4(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
-entry:
-  %0 = tail call i32 (...) @memcmp(i8* %X, i8* %Y, i32 4) nounwind ; <i32> [#uses=1]
-  %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
-  br i1 %1, label %return, label %bb
-
-bb:                                               ; preds = %entry
-  store i32 4, i32* %P, align 4
-  ret void
-
-return:                                           ; preds = %entry
-  ret void
-; CHECK-LABEL: memcmp4:
-; CHECK: movl
-; CHECK-NEXT: cmpl
+define i1 @length16(i8* %x, i8* %y) nounwind {
+; X32-LABEL: length16:
+; X32:       # BB#0:
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $16
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    calll memcmp
+; X32-NEXT:    addl $16, %esp
+; X32-NEXT:    testl %eax, %eax
+; X32-NEXT:    setne %al
+; X32-NEXT:    retl
+;
+; SSE2-LABEL: length16:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movdqu (%rsi), %xmm0
+; SSE2-NEXT:    movdqu (%rdi), %xmm1
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT:    pmovmskb %xmm1, %eax
+; SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: length16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX2-NEXT:    vpcmpeqb (%rsi), %xmm0, %xmm0
+; AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; AVX2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    retq
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 16) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
 }
 
-define void @memcmp4a(i8* %X, i32* nocapture %P) nounwind {
-entry:
-  %0 = tail call i32 (...) @memcmp(i8* %X, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 1), i32 4) nounwind ; <i32> [#uses=1]
-  %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
-  br i1 %1, label %return, label %bb
+define i1 @length16_const(i8* %X, i32* nocapture %P) nounwind {
+; X32-LABEL: length16_const:
+; X32:       # BB#0:
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $16
+; X32-NEXT:    pushl $.L.str
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    calll memcmp
+; X32-NEXT:    addl $16, %esp
+; X32-NEXT:    testl %eax, %eax
+; X32-NEXT:    sete %al
+; X32-NEXT:    retl
+;
+; SSE2-LABEL: length16_const:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    movdqu (%rdi), %xmm0
+; SSE2-NEXT:    pcmpeqb {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    pmovmskb %xmm0, %eax
+; SSE2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; SSE2-NEXT:    sete %al
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: length16_const:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX2-NEXT:    vpcmpeqb {{.*}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; AVX2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; AVX2-NEXT:    sete %al
+; AVX2-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 16) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
+}
 
-bb:                                               ; preds = %entry
-  store i32 4, i32* %P, align 4
-  ret void
+define i1 @length32(i8* %x, i8* %y) nounwind {
+; X32-LABEL: length32:
+; X32:       # BB#0:
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $32
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    calll memcmp
+; X32-NEXT:    addl $16, %esp
+; X32-NEXT:    testl %eax, %eax
+; X32-NEXT:    sete %al
+; X32-NEXT:    retl
+;
+; SSE2-LABEL: length32:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movl $32, %edx
+; SSE2-NEXT:    callq memcmp
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    sete %al
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: length32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2-NEXT:    vpcmpeqb (%rsi), %ymm0, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; AVX2-NEXT:    cmpl $-1, %eax
+; AVX2-NEXT:    sete %al
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 32) nounwind
+  %cmp = icmp eq i32 %call, 0
+  ret i1 %cmp
+}
 
-return:                                           ; preds = %entry
-  ret void
-; CHECK-LABEL: memcmp4a:
-; CHECK: cmpl $1869573999,
+define i1 @length32_const(i8* %X, i32* nocapture %P) nounwind {
+; X32-LABEL: length32_const:
+; X32:       # BB#0:
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $32
+; X32-NEXT:    pushl $.L.str
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    calll memcmp
+; X32-NEXT:    addl $16, %esp
+; X32-NEXT:    testl %eax, %eax
+; X32-NEXT:    setne %al
+; X32-NEXT:    retl
+;
+; SSE2-LABEL: length32_const:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movl $.L.str, %esi
+; SSE2-NEXT:    movl $32, %edx
+; SSE2-NEXT:    callq memcmp
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    popq %rcx
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: length32_const:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2-NEXT:    vpcmpeqb {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; AVX2-NEXT:    cmpl $-1, %eax
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 32) nounwind
+  %c = icmp ne i32 %m, 0
+  ret i1 %c
 }
 
-define void @memcmp8(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
-entry:
-  %0 = tail call i32 (...) @memcmp(i8* %X, i8* %Y, i32 8) nounwind ; <i32> [#uses=1]
-  %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
-  br i1 %1, label %return, label %bb
-
-bb:                                               ; preds = %entry
-  store i32 4, i32* %P, align 4
-  ret void
-
-return:                                           ; preds = %entry
-  ret void
-; CHECK-LABEL: memcmp8:
-; CHECK: movq
-; CHECK: cmpq
+define i1 @length64(i8* %x, i8* %y) nounwind {
+; X32-LABEL: length64:
+; X32:       # BB#0:
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $64
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    calll memcmp
+; X32-NEXT:    addl $16, %esp
+; X32-NEXT:    testl %eax, %eax
+; X32-NEXT:    setne %al
+; X32-NEXT:    retl
+;
+; X64-LABEL: length64:
+; X64:       # BB#0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movl $64, %edx
+; X64-NEXT:    callq memcmp
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    setne %al
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %call = tail call i32 @memcmp(i8* %x, i8* %y, i64 64) nounwind
+  %cmp = icmp ne i32 %call, 0
+  ret i1 %cmp
 }
 
-define void @memcmp8a(i8* %X, i32* nocapture %P) nounwind {
-entry:
-  %0 = tail call i32 (...) @memcmp(i8* %X, i8* getelementptr inbounds ([23 x i8], [23 x i8]* @.str, i32 0, i32 0), i32 8) nounwind ; <i32> [#uses=1]
-  %1 = icmp eq i32 %0, 0                          ; <i1> [#uses=1]
-  br i1 %1, label %return, label %bb
-
-bb:                                               ; preds = %entry
-  store i32 4, i32* %P, align 4
-  ret void
-
-return:                                           ; preds = %entry
-  ret void
-; CHECK-LABEL: memcmp8a:
-; CHECK: movabsq $8029759185026510694,
-; CHECK: cmpq
+define i1 @length64_const(i8* %X, i32* nocapture %P) nounwind {
+; X32-LABEL: length64_const:
+; X32:       # BB#0:
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $64
+; X32-NEXT:    pushl $.L.str
+; X32-NEXT:    pushl {{[0-9]+}}(%esp)
+; X32-NEXT:    calll memcmp
+; X32-NEXT:    addl $16, %esp
+; X32-NEXT:    testl %eax, %eax
+; X32-NEXT:    sete %al
+; X32-NEXT:    retl
+;
+; X64-LABEL: length64_const:
+; X64:       # BB#0:
+; X64-NEXT:    pushq %rax
+; X64-NEXT:    movl $.L.str, %esi
+; X64-NEXT:    movl $64, %edx
+; X64-NEXT:    callq memcmp
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    sete %al
+; X64-NEXT:    popq %rcx
+; X64-NEXT:    retq
+  %m = tail call i32 @memcmp(i8* %X, i8* getelementptr inbounds ([65 x i8], [65 x i8]* @.str, i32 0, i32 0), i64 64) nounwind
+  %c = icmp eq i32 %m, 0
+  ret i1 %c
 }
 
diff --git a/test/CodeGen/X86/mempcpy-32.ll b/test/CodeGen/X86/mempcpy-32.ll
new file mode 100644
index 0000000000000000000000000000000000000000..108442f6b6482082618348beb9d97e505a73a03f
--- /dev/null
+++ b/test/CodeGen/X86/mempcpy-32.ll
@@ -0,0 +1,20 @@
+;  RUN: llc < %s -mtriple=i686-unknown-linux -O2 | FileCheck %s
+
+; This tests the i686 lowering of mempcpy.
+; Also see mempcpy.ll
+
+@G = common global i8* null, align 8
+
+; CHECK-LABEL: RET_MEMPCPY:
+; CHECK: movl [[REG:%e[a-z0-9]+]], {{.*}}G
+; CHECK: calll {{.*}}memcpy
+; CHECK: movl [[REG]], %eax
+;
+define i8* @RET_MEMPCPY(i8* %DST, i8* %SRC, i32 %N) {
+  %add.ptr = getelementptr inbounds i8, i8* %DST, i32 %N
+  store i8* %add.ptr, i8** @G, align 8
+  %call = tail call i8* @mempcpy(i8* %DST, i8* %SRC, i32 %N)
+  ret i8* %call
+}
+
+declare i8* @mempcpy(i8*, i8*, i32)
diff --git a/test/CodeGen/X86/mempcpy.ll b/test/CodeGen/X86/mempcpy.ll
index 1c737b644021e98bf0f7d020623c22ad9d52ae62..f8db255c1a4b4a263f5607c10798a1f542db468a 100644
--- a/test/CodeGen/X86/mempcpy.ll
+++ b/test/CodeGen/X86/mempcpy.ll
@@ -1,5 +1,4 @@
 ;  RUN: llc < %s -mtriple=x86_64-unknown-linux -O2 | FileCheck %s
-;  RUN: llc < %s -mtriple=i686-unknown-linux -O2 | FileCheck %s
 
 ; This test checks that:
 ; (1)  mempcpy is lowered as memcpy, and 
@@ -11,12 +10,15 @@
 ; the first instance to be reused as the return value. This allows the check for
 ; (2) to be expressed as verifying that the MOV to store DST+N to G and
 ; the MOV to copy DST+N to %rax use the same source register.
+
+; Also see mempcpy-32.ll
+
 @G = common global i8* null, align 8
 
 ; CHECK-LABEL: RET_MEMPCPY:
-; CHECK: mov{{.*}} [[REG:%[er][a-z0-9]+]], {{.*}}G
-; CHECK: call{{.*}} {{.*}}memcpy
-; CHECK: mov{{.*}} [[REG]], %{{[er]}}ax
+; CHECK: movq [[REG:%r[a-z0-9]+]], {{.*}}G
+; CHECK: callq {{.*}}memcpy
+; CHECK: movq [[REG]], %rax
 ;
 define i8* @RET_MEMPCPY(i8* %DST, i8* %SRC, i64 %N) {
   %add.ptr = getelementptr inbounds i8, i8* %DST, i64 %N
diff --git a/test/CodeGen/X86/merge-consecutive-loads-128.ll b/test/CodeGen/X86/merge-consecutive-loads-128.ll
index 003e2e60521bcfce7597d3fcb2e2400026d5f686..71417694b0d4ba6ae185891295af6ab2e9291d6b 100644
--- a/test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ b/test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -1037,12 +1037,12 @@ define <2 x i64> @merge_2i64_i64_12_volatile(i64* %ptr) nounwind uwtable noinlin
 define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable noinline ssp {
 ; SSE2-LABEL: merge_4f32_f32_2345_volatile:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; SSE2-NEXT:    retq
 ;
@@ -1065,12 +1065,12 @@ define <4 x float> @merge_4f32_f32_2345_volatile(float* %ptr) nounwind uwtable n
 ; X32-SSE1-LABEL: merge_4f32_f32_2345_volatile:
 ; X32-SSE1:       # BB#0:
 ; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X32-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X32-SSE1-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-SSE1-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
 ; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
+; X32-SSE1-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; X32-SSE1-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; X32-SSE1-NEXT:    retl
 ;
@@ -1132,3 +1132,41 @@ define <4 x float> @merge_4f32_f32_X0YY(float* %ptr0, float* %ptr1) nounwind uwt
   %res3 = insertelement <4 x float> %res2, float %val1, i32 3
   ret <4 x float> %res3
 }
+
+;
+; Extension tests.
+;
+
+; PR31309
+define <4 x i32> @load_i32_zext_i128_v4i32(i32* %ptr) {
+; SSE-LABEL: load_i32_zext_i128_v4i32:
+; SSE:       # BB#0:
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: load_i32_zext_i128_v4i32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT:    retq
+;
+; X32-SSE1-LABEL: load_i32_zext_i128_v4i32:
+; X32-SSE1:       # BB#0:
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE1-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-SSE1-NEXT:    movl (%ecx), %ecx
+; X32-SSE1-NEXT:    movl %ecx, (%eax)
+; X32-SSE1-NEXT:    movl $0, 12(%eax)
+; X32-SSE1-NEXT:    movl $0, 8(%eax)
+; X32-SSE1-NEXT:    movl $0, 4(%eax)
+; X32-SSE1-NEXT:    retl $4
+;
+; X32-SSE41-LABEL: load_i32_zext_i128_v4i32:
+; X32-SSE41:       # BB#0:
+; X32-SSE41-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X32-SSE41-NEXT:    retl
+  %1 = load i32, i32* %ptr
+  %2 = zext i32 %1 to i128
+  %3 = bitcast i128 %2 to <4 x i32>
+  ret <4 x i32> %3
+}
diff --git a/test/CodeGen/X86/merge-consecutive-loads-256.ll b/test/CodeGen/X86/merge-consecutive-loads-256.ll
index c18a98e05d1de6687c9deaa6d8587ea7bc06f20a..b00d732889e36edab84d57fe22709cf1c971a10c 100644
--- a/test/CodeGen/X86/merge-consecutive-loads-256.ll
+++ b/test/CodeGen/X86/merge-consecutive-loads-256.ll
@@ -625,7 +625,7 @@ define <4 x double> @merge_4f64_f64_34uz_volatile(double* %ptr) nounwind uwtable
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX1-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -641,7 +641,7 @@ define <4 x double> @merge_4f64_f64_34uz_volatile(double* %ptr) nounwind uwtable
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX512F-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; AVX512F-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
 ; AVX512F-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    retq
 ;
@@ -650,7 +650,7 @@ define <4 x double> @merge_4f64_f64_34uz_volatile(double* %ptr) nounwind uwtable
 ; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-AVX-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; X32-AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
-; X32-AVX-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X32-AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; X32-AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; X32-AVX-NEXT:    retl
   %ptr0 = getelementptr inbounds double, double* %ptr, i64 3
@@ -668,10 +668,10 @@ define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile(i16* %ptr) nounwind
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm1
-; AVX1-NEXT:    vpinsrw $3, 6(%rdi), %xmm1, %xmm1
 ; AVX1-NEXT:    vpinsrw $4, 24(%rdi), %xmm0, %xmm0
 ; AVX1-NEXT:    vpinsrw $6, 28(%rdi), %xmm0, %xmm0
 ; AVX1-NEXT:    vpinsrw $7, 30(%rdi), %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $3, 6(%rdi), %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -679,10 +679,10 @@ define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile(i16* %ptr) nounwind
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm1
-; AVX2-NEXT:    vpinsrw $3, 6(%rdi), %xmm1, %xmm1
 ; AVX2-NEXT:    vpinsrw $4, 24(%rdi), %xmm0, %xmm0
 ; AVX2-NEXT:    vpinsrw $6, 28(%rdi), %xmm0, %xmm0
 ; AVX2-NEXT:    vpinsrw $7, 30(%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $3, 6(%rdi), %xmm1, %xmm1
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
@@ -690,10 +690,10 @@ define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile(i16* %ptr) nounwind
 ; AVX512F:       # BB#0:
 ; AVX512F-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpinsrw $0, (%rdi), %xmm0, %xmm1
-; AVX512F-NEXT:    vpinsrw $3, 6(%rdi), %xmm1, %xmm1
 ; AVX512F-NEXT:    vpinsrw $4, 24(%rdi), %xmm0, %xmm0
 ; AVX512F-NEXT:    vpinsrw $6, 28(%rdi), %xmm0, %xmm0
 ; AVX512F-NEXT:    vpinsrw $7, 30(%rdi), %xmm0, %xmm0
+; AVX512F-NEXT:    vpinsrw $3, 6(%rdi), %xmm1, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    retq
 ;
@@ -702,10 +702,10 @@ define <16 x i16> @merge_16i16_i16_0uu3zzuuuuuzCuEF_volatile(i16* %ptr) nounwind
 ; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-AVX-NEXT:    vpxor %xmm0, %xmm0, %xmm0
 ; X32-AVX-NEXT:    vpinsrw $0, (%eax), %xmm0, %xmm1
-; X32-AVX-NEXT:    vpinsrw $3, 6(%eax), %xmm1, %xmm1
 ; X32-AVX-NEXT:    vpinsrw $4, 24(%eax), %xmm0, %xmm0
 ; X32-AVX-NEXT:    vpinsrw $6, 28(%eax), %xmm0, %xmm0
 ; X32-AVX-NEXT:    vpinsrw $7, 30(%eax), %xmm0, %xmm0
+; X32-AVX-NEXT:    vpinsrw $3, 6(%eax), %xmm1, %xmm1
 ; X32-AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; X32-AVX-NEXT:    retl
   %ptr0 = getelementptr inbounds i16, i16* %ptr, i64 0
diff --git a/test/CodeGen/X86/merge-consecutive-loads-512.ll b/test/CodeGen/X86/merge-consecutive-loads-512.ll
index 0111e341c45927e595c0aa6bafc3cd05a62672a3..c3500f0ad3991d2f0c8ee030f4b7884ce30d8657 100644
--- a/test/CodeGen/X86/merge-consecutive-loads-512.ll
+++ b/test/CodeGen/X86/merge-consecutive-loads-512.ll
@@ -136,13 +136,21 @@ define <8 x double> @merge_8f64_f64_12zzuuzz(double* %ptr) nounwind uwtable noin
 }
 
 define <8 x double> @merge_8f64_f64_1u3u5zu8(double* %ptr) nounwind uwtable noinline ssp {
-; ALL-LABEL: merge_8f64_f64_1u3u5zu8:
-; ALL:       # BB#0:
-; ALL-NEXT:    movb $32, %al
-; ALL-NEXT:    kmovw %eax, %k0
-; ALL-NEXT:    knotw %k0, %k1
-; ALL-NEXT:    vmovupd 8(%rdi), %zmm0 {%k1} {z}
-; ALL-NEXT:    retq
+; AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    movb $32, %al
+; AVX512F-NEXT:    kmovw %eax, %k0
+; AVX512F-NEXT:    knotw %k0, %k1
+; AVX512F-NEXT:    vmovupd 8(%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: merge_8f64_f64_1u3u5zu8:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    movb $32, %al
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    knotw %k0, %k1
+; AVX512BW-NEXT:    vmovupd 8(%rdi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
 ;
 ; X32-AVX512F-LABEL: merge_8f64_f64_1u3u5zu8:
 ; X32-AVX512F:       # BB#0:
@@ -223,13 +231,21 @@ define <8 x i64> @merge_8i64_i64_56zz9uzz(i64* %ptr) nounwind uwtable noinline s
 }
 
 define <8 x i64> @merge_8i64_i64_1u3u5zu8(i64* %ptr) nounwind uwtable noinline ssp {
-; ALL-LABEL: merge_8i64_i64_1u3u5zu8:
-; ALL:       # BB#0:
-; ALL-NEXT:    movb $32, %al
-; ALL-NEXT:    kmovw %eax, %k0
-; ALL-NEXT:    knotw %k0, %k1
-; ALL-NEXT:    vmovdqu64 8(%rdi), %zmm0 {%k1} {z}
-; ALL-NEXT:    retq
+; AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    movb $32, %al
+; AVX512F-NEXT:    kmovw %eax, %k0
+; AVX512F-NEXT:    knotw %k0, %k1
+; AVX512F-NEXT:    vmovdqu64 8(%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: merge_8i64_i64_1u3u5zu8:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    movb $32, %al
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    knotw %k0, %k1
+; AVX512BW-NEXT:    vmovdqu64 8(%rdi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
 ;
 ; X32-AVX512F-LABEL: merge_8i64_i64_1u3u5zu8:
 ; X32-AVX512F:       # BB#0:
@@ -446,13 +462,21 @@ define <16 x i32> @merge_16i32_i32_0uu3uuuuuuuuCuEF(i32* %ptr) nounwind uwtable
 }
 
 define <16 x i32> @merge_16i32_i32_0uu3zzuuuuuzCuEF(i32* %ptr) nounwind uwtable noinline ssp {
-; ALL-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
-; ALL:       # BB#0:
-; ALL-NEXT:    movw $8240, %ax # imm = 0x2030
-; ALL-NEXT:    kmovw %eax, %k0
-; ALL-NEXT:    knotw %k0, %k1
-; ALL-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
-; ALL-NEXT:    retq
+; AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    movw $8240, %ax # imm = 0x2030
+; AVX512F-NEXT:    kmovw %eax, %k0
+; AVX512F-NEXT:    knotw %k0, %k1
+; AVX512F-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    movw $8240, %ax # imm = 0x2030
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    knotw %k0, %k1
+; AVX512BW-NEXT:    vmovdqu32 (%rdi), %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
 ;
 ; X32-AVX512F-LABEL: merge_16i32_i32_0uu3zzuuuuuzCuEF:
 ; X32-AVX512F:       # BB#0:
diff --git a/test/CodeGen/X86/merge-store-partially-alias-loads.ll b/test/CodeGen/X86/merge-store-partially-alias-loads.ll
index 735e64a076d00aa26b3f8068be24fb84676dcdef..6ca964be9570498309bd8b9d8ac480eaa65a6883 100644
--- a/test/CodeGen/X86/merge-store-partially-alias-loads.ll
+++ b/test/CodeGen/X86/merge-store-partially-alias-loads.ll
@@ -21,11 +21,11 @@
 ; DBGDAG-DAG: [[LD2:t[0-9]+]]: i16,ch = load<LD2[%tmp81](align=1)> [[ENTRYTOKEN]], [[BASEPTR]], undef:i64
 ; DBGDAG-DAG: [[LD1:t[0-9]+]]: i8,ch = load<LD1[%tmp12]> [[ENTRYTOKEN]], [[ADDPTR]], undef:i64
 
-; DBGDAG: [[LOADTOKEN:t[0-9]+]]: ch = TokenFactor [[LD2]]:1, [[LD1]]:1
-
+; DBGDAG-DAG: [[ST1:t[0-9]+]]: ch = store<ST1[%tmp14]> [[ENTRYTOKEN]], [[LD1]], t{{[0-9]+}}, undef:i64
+; DBGDAG-DAG: [[LOADTOKEN:t[0-9]+]]: ch = TokenFactor [[LD2]]:1, [[LD1]]:1
 ; DBGDAG-DAG: [[ST2:t[0-9]+]]: ch = store<ST2[%tmp10](align=1)> [[LOADTOKEN]], [[LD2]], t{{[0-9]+}}, undef:i64
-; DBGDAG-DAG: [[ST1:t[0-9]+]]: ch = store<ST1[%tmp14]> [[ST2]], [[LD1]], t{{[0-9]+}}, undef:i64
-; DBGDAG: X86ISD::RET_FLAG [[ST1]],
+
+; DBGDAG: X86ISD::RET_FLAG t{{[0-9]+}},
 
 ; DBGDAG: Type-legalized selection DAG: BB#0 'merge_store_partial_overlap_load:'
 define void @merge_store_partial_overlap_load([4 x i8]* %tmp) {
diff --git a/test/CodeGen/X86/merge_store.ll b/test/CodeGen/X86/merge_store.ll
index 2701f369bcde320f91788d1de785cd23c211dc63..31c1f65824260aa5c810e8beaa131fad8aef991d 100644
--- a/test/CodeGen/X86/merge_store.ll
+++ b/test/CodeGen/X86/merge_store.ll
@@ -1,5 +1,4 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -addr-sink-using-gep=1 | FileCheck %s
 
 define void @merge_store(i32* nocapture %a) {
 ; CHECK-LABEL: merge_store:
diff --git a/test/CodeGen/X86/merge_store_duplicated_loads.ll b/test/CodeGen/X86/merge_store_duplicated_loads.ll
new file mode 100644
index 0000000000000000000000000000000000000000..cfc39035e4036d6d7f8858448156c2f11d869d85
--- /dev/null
+++ b/test/CodeGen/X86/merge_store_duplicated_loads.ll
@@ -0,0 +1,88 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -o - | FileCheck %s
+
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @merge_double(double* noalias nocapture %st, double* noalias nocapture readonly %ld) #0 {
+; CHECK-LABEL: merge_double:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT:    movsd %xmm0, (%rdi)
+; CHECK-NEXT:    movsd %xmm1, 8(%rdi)
+; CHECK-NEXT:    movsd %xmm0, 16(%rdi)
+; CHECK-NEXT:    movsd %xmm1, 24(%rdi)
+; CHECK-NEXT:    retq
+  %ld_idx1 = getelementptr inbounds double, double* %ld, i64 1
+  %ld0 = load double, double* %ld, align 8, !tbaa !2
+  %ld1 = load double, double* %ld_idx1, align 8, !tbaa !2
+
+  %st_idx1 = getelementptr inbounds double, double* %st, i64 1
+  %st_idx2 = getelementptr inbounds double, double* %st, i64 2
+  %st_idx3 = getelementptr inbounds double, double* %st, i64 3
+
+  store double %ld0, double* %st, align 8, !tbaa !2
+  store double %ld1, double* %st_idx1, align 8, !tbaa !2
+  store double %ld0, double* %st_idx2, align 8, !tbaa !2
+  store double %ld1, double* %st_idx3, align 8, !tbaa !2
+  ret void
+}
+
+define void @merge_loadstore_int(i64* noalias nocapture readonly %p, i64* noalias nocapture %q) local_unnamed_addr #0 {
+; CHECK-LABEL: merge_loadstore_int:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    movq 8(%rdi), %rcx
+; CHECK-NEXT:    movq %rax, (%rsi)
+; CHECK-NEXT:    movq %rcx, 8(%rsi)
+; CHECK-NEXT:    movq %rax, 16(%rsi)
+; CHECK-NEXT:    movq %rcx, 24(%rsi)
+; CHECK-NEXT:    retq
+entry:
+  %0 = load i64, i64* %p, align 8, !tbaa !1
+  %arrayidx1 = getelementptr inbounds i64, i64* %p, i64 1
+  %1 = load i64, i64* %arrayidx1, align 8, !tbaa !1
+  store i64 %0, i64* %q, align 8, !tbaa !1
+  %arrayidx3 = getelementptr inbounds i64, i64* %q, i64 1
+  store i64 %1, i64* %arrayidx3, align 8, !tbaa !1
+  %arrayidx4 = getelementptr inbounds i64, i64* %q, i64 2
+  store i64 %0, i64* %arrayidx4, align 8, !tbaa !1
+  %arrayidx5 = getelementptr inbounds i64, i64* %q, i64 3
+  store i64 %1, i64* %arrayidx5, align 8, !tbaa !1
+  ret void
+}
+
+define i64 @merge_loadstore_int_with_extra_use(i64* noalias nocapture readonly %p, i64* noalias nocapture %q) local_unnamed_addr #0 {
+; CHECK-LABEL: merge_loadstore_int_with_extra_use:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movq (%rdi), %rax
+; CHECK-NEXT:    movq 8(%rdi), %rcx
+; CHECK-NEXT:    movq %rax, (%rsi)
+; CHECK-NEXT:    movq %rcx, 8(%rsi)
+; CHECK-NEXT:    movq %rax, 16(%rsi)
+; CHECK-NEXT:    movq %rcx, 24(%rsi)
+; CHECK-NEXT:    retq
+entry:
+  %0 = load i64, i64* %p, align 8, !tbaa !1
+  %arrayidx1 = getelementptr inbounds i64, i64* %p, i64 1
+  %1 = load i64, i64* %arrayidx1, align 8, !tbaa !1
+  store i64 %0, i64* %q, align 8, !tbaa !1
+  %arrayidx3 = getelementptr inbounds i64, i64* %q, i64 1
+  store i64 %1, i64* %arrayidx3, align 8, !tbaa !1
+  %arrayidx4 = getelementptr inbounds i64, i64* %q, i64 2
+  store i64 %0, i64* %arrayidx4, align 8, !tbaa !1
+  %arrayidx5 = getelementptr inbounds i64, i64* %q, i64 3
+  store i64 %1, i64* %arrayidx5, align 8, !tbaa !1
+  ret i64 %0
+
+}
+
+attributes #0 = { "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }
+
+
+!0 = !{!"clang version 5.0.0 (trunk 296467) (llvm/trunk 296476)"}
+!1 = !{!2, !2, i64 0}
+!2 = !{!"double", !3, i64 0}
+!3 = !{!"omnipotent char", !4, i64 0}
+!4 = !{!"Simple C/C++ TBAA"}
diff --git a/test/CodeGen/X86/misched-aa-colored.ll b/test/CodeGen/X86/misched-aa-colored.ll
index 9f8f3a946e667409a6465564da21edd706b61487..e118b00fd09875efa6830865435882f55d47dfd7 100644
--- a/test/CodeGen/X86/misched-aa-colored.ll
+++ b/test/CodeGen/X86/misched-aa-colored.ll
@@ -143,10 +143,10 @@ declare { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"*, i32 } @_ZN4llvm
 declare void @__assert_fail(i8*, i8*, i32, i8*) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 ; Function Attrs: nounwind uwtable
 define hidden { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"*, i32 } @_ZN4llvm16DAGTypeLegalizer18WidenVecRes_BinaryEPNS_6SDNodeE(%"class.llvm::DAGTypeLegalizer.117.717.1077.2037.2157.2397.4197"* %this, %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"* %N) #2 align 2 {
@@ -155,13 +155,13 @@ entry:
   %ref.tmp.i = alloca %"struct.std::pair.112.119.719.1079.2039.2159.2399.4199", align 8
   %Op.i = alloca %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083", align 8
   %0 = bitcast %"struct.std::pair.112.119.719.1079.2039.2159.2399.4199"* %ref.tmp.i to i8*
-  call void @llvm.lifetime.start(i64 24, i8* %0) #1
+  call void @llvm.lifetime.start.p0i8(i64 24, i8* %0) #1
   %retval.sroa.0.0.idx.i36 = getelementptr inbounds %"struct.std::pair.112.119.719.1079.2039.2159.2399.4199", %"struct.std::pair.112.119.719.1079.2039.2159.2399.4199"* %ref.tmp.i, i64 0, i32 1, i32 0, i32 0
   %retval.sroa.0.0.copyload.i37 = load i32, i32* %retval.sroa.0.0.idx.i36, align 8
-  call void @llvm.lifetime.end(i64 24, i8* %0) #1
+  call void @llvm.lifetime.end.p0i8(i64 24, i8* %0) #1
   %agg.tmp8.sroa.2.0.copyload = load i32, i32* undef, align 8
   %1 = bitcast %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* %Op.i to i8*
-  call void @llvm.lifetime.start(i64 16, i8* %1) #1
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %1) #1
   %2 = getelementptr %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083", %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* %Op.i, i64 0, i32 1
   store i32 %agg.tmp8.sroa.2.0.copyload, i32* %2, align 8
 
diff --git a/test/CodeGen/X86/mmx-cvt.ll b/test/CodeGen/X86/mmx-cvt.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8f2da95353993399673ae0478959f4b82294581e
--- /dev/null
+++ b/test/CodeGen/X86/mmx-cvt.ll
@@ -0,0 +1,369 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64
+
+; If we are transferring XMM conversion results to MMX registers we could use the MMX equivalents
+; (CVTPD2PI/CVTTPD2PI + CVTPS2PI/CVTTPS2PI) without affecting rounding/exceptions etc.
+
+define void @cvt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
+; X86-LABEL: cvt_v2f64_v2i32:
+; X86:       # BB#0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    cvtpd2pi %xmm0, %mm0
+; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: cvt_v2f64_v2i32:
+; X64:       # BB#0:
+; X64-NEXT:    cvtpd2pi %xmm0, %mm0
+; X64-NEXT:    paddd %mm0, %mm0
+; X64-NEXT:    movq %mm0, (%rdi)
+; X64-NEXT:    retq
+  %3 = tail call <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double> %0)
+  %4 = bitcast <4 x i32> %3 to <2 x i64>
+  %5 = extractelement <2 x i64> %4, i32 0
+  %6 = bitcast i64 %5 to x86_mmx
+  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
+  %8 = bitcast x86_mmx %7 to i64
+  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
+  store <1 x i64> %9, <1 x i64>* %1
+  ret void
+}
+
+define void @cvtt_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
+; X86-LABEL: cvtt_v2f64_v2i32:
+; X86:       # BB#0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    cvttpd2pi %xmm0, %mm0
+; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: cvtt_v2f64_v2i32:
+; X64:       # BB#0:
+; X64-NEXT:    cvttpd2pi %xmm0, %mm0
+; X64-NEXT:    paddd %mm0, %mm0
+; X64-NEXT:    movq %mm0, (%rdi)
+; X64-NEXT:    retq
+  %3 = tail call <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double> %0)
+  %4 = bitcast <4 x i32> %3 to <2 x i64>
+  %5 = extractelement <2 x i64> %4, i32 0
+  %6 = bitcast i64 %5 to x86_mmx
+  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
+  %8 = bitcast x86_mmx %7 to i64
+  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
+  store <1 x i64> %9, <1 x i64>* %1
+  ret void
+}
+
+define void @fptosi_v2f64_v2i32(<2 x double>, <1 x i64>*) nounwind {
+; X86-LABEL: fptosi_v2f64_v2i32:
+; X86:       # BB#0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    cvttpd2pi %xmm0, %mm0
+; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: fptosi_v2f64_v2i32:
+; X64:       # BB#0:
+; X64-NEXT:    cvttpd2pi %xmm0, %mm0
+; X64-NEXT:    paddd %mm0, %mm0
+; X64-NEXT:    movq %mm0, (%rdi)
+; X64-NEXT:    retq
+  %3 = fptosi <2 x double> %0 to <2 x i32>
+  %4 = bitcast <2 x i32> %3 to x86_mmx
+  %5 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %4, x86_mmx %4)
+  %6 = bitcast x86_mmx %5 to i64
+  %7 = insertelement <1 x i64> undef, i64 %6, i32 0
+  store <1 x i64> %7, <1 x i64>* %1
+  ret void
+}
+
+define void @cvt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
+; X86-LABEL: cvt_v2f32_v2i32:
+; X86:       # BB#0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    cvtps2pi %xmm0, %mm0
+; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: cvt_v2f32_v2i32:
+; X64:       # BB#0:
+; X64-NEXT:    cvtps2pi %xmm0, %mm0
+; X64-NEXT:    paddd %mm0, %mm0
+; X64-NEXT:    movq %mm0, (%rdi)
+; X64-NEXT:    retq
+  %3 = tail call <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float> %0)
+  %4 = bitcast <4 x i32> %3 to <2 x i64>
+  %5 = extractelement <2 x i64> %4, i32 0
+  %6 = bitcast i64 %5 to x86_mmx
+  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
+  %8 = bitcast x86_mmx %7 to i64
+  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
+  store <1 x i64> %9, <1 x i64>* %1
+  ret void
+}
+
+define void @cvtt_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
+; X86-LABEL: cvtt_v2f32_v2i32:
+; X86:       # BB#0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    cvttps2pi %xmm0, %mm0
+; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: cvtt_v2f32_v2i32:
+; X64:       # BB#0:
+; X64-NEXT:    cvttps2pi %xmm0, %mm0
+; X64-NEXT:    paddd %mm0, %mm0
+; X64-NEXT:    movq %mm0, (%rdi)
+; X64-NEXT:    retq
+  %3 = tail call <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float> %0)
+  %4 = bitcast <4 x i32> %3 to <2 x i64>
+  %5 = extractelement <2 x i64> %4, i32 0
+  %6 = bitcast i64 %5 to x86_mmx
+  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
+  %8 = bitcast x86_mmx %7 to i64
+  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
+  store <1 x i64> %9, <1 x i64>* %1
+  ret void
+}
+
+define void @fptosi_v4f32_v4i32(<4 x float>, <1 x i64>*) nounwind {
+; X86-LABEL: fptosi_v4f32_v4i32:
+; X86:       # BB#0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    cvttps2pi %xmm0, %mm0
+; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: fptosi_v4f32_v4i32:
+; X64:       # BB#0:
+; X64-NEXT:    cvttps2pi %xmm0, %mm0
+; X64-NEXT:    paddd %mm0, %mm0
+; X64-NEXT:    movq %mm0, (%rdi)
+; X64-NEXT:    retq
+  %3 = fptosi <4 x float> %0 to <4 x i32>
+  %4 = shufflevector <4 x i32> %3, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %5 = bitcast <2 x i32> %4 to x86_mmx
+  %6 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %5, x86_mmx %5)
+  %7 = bitcast x86_mmx %6 to i64
+  %8 = insertelement <1 x i64> undef, i64 %7, i32 0
+  store <1 x i64> %8, <1 x i64>* %1
+  ret void
+}
+
+define void @fptosi_v2f32_v2i32(<4 x float>, <1 x i64>*) nounwind {
+; X86-LABEL: fptosi_v2f32_v2i32:
+; X86:       # BB#0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    cvttps2pi %xmm0, %mm0
+; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %edx, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: fptosi_v2f32_v2i32:
+; X64:       # BB#0:
+; X64-NEXT:    cvttps2pi %xmm0, %mm0
+; X64-NEXT:    paddd %mm0, %mm0
+; X64-NEXT:    movq %mm0, (%rdi)
+; X64-NEXT:    retq
+  %3 = fptosi <4 x float> %0 to <4 x i32>
+  %4 = bitcast <4 x i32> %3 to <2 x i64>
+  %5 = extractelement <2 x i64> %4, i32 0
+  %6 = bitcast i64 %5 to x86_mmx
+  %7 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %6, x86_mmx %6)
+  %8 = bitcast x86_mmx %7 to i64
+  %9 = insertelement <1 x i64> undef, i64 %8, i32 0
+  store <1 x i64> %9, <1 x i64>* %1
+  ret void
+}
+
+; FIXME: If we are transferring MMX registers to XMM for conversion we could use the MMX equivalents
+; (CVTPI2PD + CVTPI2PS) without affecting rounding/exceptions etc.
+
+define <2 x double> @sitofp_v2i32_v2f64(<1 x i64>*) nounwind {
+; X86-LABEL: sitofp_v2i32_v2f64:
+; X86:       # BB#0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movq (%eax), %mm0
+; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    cvtdq2pd (%esp), %xmm0
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_v2i32_v2f64:
+; X64:       # BB#0:
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    paddd %mm0, %mm0
+; X64-NEXT:    movq2dq %mm0, %xmm0
+; X64-NEXT:    cvtdq2pd %xmm0, %xmm0
+; X64-NEXT:    retq
+  %2 = bitcast <1 x i64>* %0 to x86_mmx*
+  %3 = load x86_mmx, x86_mmx* %2, align 8
+  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
+  %5 = bitcast x86_mmx %4 to i64
+  %6 = insertelement <2 x i64> undef, i64 %5, i32 0
+  %7 = bitcast <2 x i64> %6 to <4 x i32>
+  %8 = shufflevector <4 x i32> %7, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %9 = sitofp <2 x i32> %8 to <2 x double>
+  ret <2 x double> %9
+}
+
+define <4 x float> @sitofp_v2i32_v2f32(<1 x i64>*) nounwind {
+; X86-LABEL: sitofp_v2i32_v2f32:
+; X86:       # BB#0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movq (%eax), %mm0
+; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    cvtdq2ps %xmm0, %xmm0
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: sitofp_v2i32_v2f32:
+; X64:       # BB#0:
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    paddd %mm0, %mm0
+; X64-NEXT:    movq %mm0, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT:    cvtdq2ps %xmm0, %xmm0
+; X64-NEXT:    retq
+  %2 = bitcast <1 x i64>* %0 to x86_mmx*
+  %3 = load x86_mmx, x86_mmx* %2, align 8
+  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
+  %5 = bitcast x86_mmx %4 to <2 x i32>
+  %6 = shufflevector <2 x i32> %5, <2 x i32> zeroinitializer, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %7 = sitofp <4 x i32> %6 to <4 x float>
+  ret <4 x float> %7
+}
+
+define <4 x float> @cvt_v2i32_v2f32(<1 x i64>*) nounwind {
+; X86-LABEL: cvt_v2i32_v2f32:
+; X86:       # BB#0:
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movq (%eax), %mm0
+; X86-NEXT:    paddd %mm0, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-NEXT:    cvtdq2ps %xmm0, %xmm0
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: cvt_v2i32_v2f32:
+; X64:       # BB#0:
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    paddd %mm0, %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    movd %rax, %xmm0
+; X64-NEXT:    cvtdq2ps %xmm0, %xmm0
+; X64-NEXT:    retq
+  %2 = bitcast <1 x i64>* %0 to x86_mmx*
+  %3 = load x86_mmx, x86_mmx* %2, align 8
+  %4 = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %3, x86_mmx %3)
+  %5 = bitcast x86_mmx %4 to i64
+  %6 = insertelement <2 x i64> undef, i64 %5, i32 0
+  %7 = insertelement <2 x i64> %6, i64 0, i32 1
+  %8 = bitcast <2 x i64> %7 to <4 x i32>
+  %9 = tail call <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32> %8)
+  ret <4 x float> %9
+}
+
+declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
+declare <4 x i32> @llvm.x86.sse2.cvtpd2dq(<2 x double>)
+declare <4 x i32> @llvm.x86.sse2.cvttpd2dq(<2 x double>)
+declare <4 x i32> @llvm.x86.sse2.cvtps2dq(<4 x float>)
+declare <4 x i32> @llvm.x86.sse2.cvttps2dq(<4 x float>)
+declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>)
diff --git a/test/CodeGen/X86/mmx-fold-load.ll b/test/CodeGen/X86/mmx-fold-load.ll
index 2b9d30f59fd50ff5cbf2c91c76a19a6b9e387587..832743870fb40d5ddc3d7272e228387e098954a9 100644
--- a/test/CodeGen/X86/mmx-fold-load.ll
+++ b/test/CodeGen/X86/mmx-fold-load.ll
@@ -1,12 +1,33 @@
-; RUN: llc < %s -march=x86-64 -mattr=+mmx,+sse2 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+mmx,+sse2 | FileCheck %s --check-prefix=X64
 
-define i64 @t0(<1 x i64>* %a, i32* %b) {
-; CHECK-LABEL: t0:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:         movq (%[[REG1:[a-z]+]]), %mm0
-; CHECK-NEXT:    psllq (%[[REG2:[a-z]+]]), %mm0
-; CHECK-NEXT:    movd %mm0, %rax
-; CHECK-NEXT:    retq
+define i64 @t0(<1 x i64>* %a, i32* %b) nounwind {
+; X86-LABEL: t0:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    movq (%ecx), %mm0
+; X86-NEXT:    movd (%eax), %mm1
+; X86-NEXT:    psllq %mm1, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: t0:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    movd (%rsi), %mm1
+; X64-NEXT:    psllq %mm1, %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64>* %a to x86_mmx*
   %1 = load x86_mmx, x86_mmx* %0, align 8
@@ -17,13 +38,32 @@ entry:
 }
 declare x86_mmx @llvm.x86.mmx.pslli.q(x86_mmx, i32)
 
-define i64 @t1(<1 x i64>* %a, i32* %b) {
-; CHECK-LABEL: t1:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:         movq (%[[REG1]]), %mm0
-; CHECK-NEXT:    psrlq (%[[REG2]]), %mm0
-; CHECK-NEXT:    movd %mm0, %rax
-; CHECK-NEXT:    retq
+define i64 @t1(<1 x i64>* %a, i32* %b) nounwind {
+; X86-LABEL: t1:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    movq (%ecx), %mm0
+; X86-NEXT:    movd (%eax), %mm1
+; X86-NEXT:    psrlq %mm1, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: t1:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    movd (%rsi), %mm1
+; X64-NEXT:    psrlq %mm1, %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64>* %a to x86_mmx*
   %1 = load x86_mmx, x86_mmx* %0, align 8
@@ -34,13 +74,32 @@ entry:
 }
 declare x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx, i32)
 
-define i64 @t2(<1 x i64>* %a, i32* %b) {
-; CHECK-LABEL: t2:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:         movq (%[[REG1]]), %mm0
-; CHECK-NEXT:    psllw (%[[REG2]]), %mm0
-; CHECK-NEXT:    movd %mm0, %rax
-; CHECK-NEXT:    retq
+define i64 @t2(<1 x i64>* %a, i32* %b) nounwind {
+; X86-LABEL: t2:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    movq (%ecx), %mm0
+; X86-NEXT:    movd (%eax), %mm1
+; X86-NEXT:    psllw %mm1, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: t2:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    movd (%rsi), %mm1
+; X64-NEXT:    psllw %mm1, %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64>* %a to x86_mmx*
   %1 = load x86_mmx, x86_mmx* %0, align 8
@@ -51,13 +110,32 @@ entry:
 }
 declare x86_mmx @llvm.x86.mmx.pslli.w(x86_mmx, i32)
 
-define i64 @t3(<1 x i64>* %a, i32* %b) {
-; CHECK-LABEL: t3:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:         movq (%[[REG1]]), %mm0
-; CHECK-NEXT:    psrlw (%[[REG2]]), %mm0
-; CHECK-NEXT:    movd %mm0, %rax
-; CHECK-NEXT:    retq
+define i64 @t3(<1 x i64>* %a, i32* %b) nounwind {
+; X86-LABEL: t3:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    movq (%ecx), %mm0
+; X86-NEXT:    movd (%eax), %mm1
+; X86-NEXT:    psrlw %mm1, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: t3:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    movd (%rsi), %mm1
+; X64-NEXT:    psrlw %mm1, %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64>* %a to x86_mmx*
   %1 = load x86_mmx, x86_mmx* %0, align 8
@@ -68,13 +146,32 @@ entry:
 }
 declare x86_mmx @llvm.x86.mmx.psrli.w(x86_mmx, i32)
 
-define i64 @t4(<1 x i64>* %a, i32* %b) {
-; CHECK-LABEL: t4:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:         movq (%[[REG1]]), %mm0
-; CHECK-NEXT:    pslld (%[[REG2]]), %mm0
-; CHECK-NEXT:    movd %mm0, %rax
-; CHECK-NEXT:    retq
+define i64 @t4(<1 x i64>* %a, i32* %b) nounwind {
+; X86-LABEL: t4:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    movq (%ecx), %mm0
+; X86-NEXT:    movd (%eax), %mm1
+; X86-NEXT:    pslld %mm1, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: t4:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    movd (%rsi), %mm1
+; X64-NEXT:    pslld %mm1, %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64>* %a to x86_mmx*
   %1 = load x86_mmx, x86_mmx* %0, align 8
@@ -85,13 +182,32 @@ entry:
 }
 declare x86_mmx @llvm.x86.mmx.pslli.d(x86_mmx, i32)
 
-define i64 @t5(<1 x i64>* %a, i32* %b) {
-; CHECK-LABEL: t5:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:         movq (%[[REG1]]), %mm0
-; CHECK-NEXT:    psrld (%[[REG2]]), %mm0
-; CHECK-NEXT:    movd %mm0, %rax
-; CHECK-NEXT:    retq
+define i64 @t5(<1 x i64>* %a, i32* %b) nounwind {
+; X86-LABEL: t5:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    movq (%ecx), %mm0
+; X86-NEXT:    movd (%eax), %mm1
+; X86-NEXT:    psrld %mm1, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: t5:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    movd (%rsi), %mm1
+; X64-NEXT:    psrld %mm1, %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64>* %a to x86_mmx*
   %1 = load x86_mmx, x86_mmx* %0, align 8
@@ -102,13 +218,32 @@ entry:
 }
 declare x86_mmx @llvm.x86.mmx.psrli.d(x86_mmx, i32)
 
-define i64 @t6(<1 x i64>* %a, i32* %b) {
-; CHECK-LABEL: t6:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:         movq (%[[REG1]]), %mm0
-; CHECK-NEXT:    psraw (%[[REG2]]), %mm0
-; CHECK-NEXT:    movd %mm0, %rax
-; CHECK-NEXT:    retq
+define i64 @t6(<1 x i64>* %a, i32* %b) nounwind {
+; X86-LABEL: t6:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    movq (%ecx), %mm0
+; X86-NEXT:    movd (%eax), %mm1
+; X86-NEXT:    psraw %mm1, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: t6:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    movd (%rsi), %mm1
+; X64-NEXT:    psraw %mm1, %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64>* %a to x86_mmx*
   %1 = load x86_mmx, x86_mmx* %0, align 8
@@ -119,13 +254,32 @@ entry:
 }
 declare x86_mmx @llvm.x86.mmx.psrai.w(x86_mmx, i32)
 
-define i64 @t7(<1 x i64>* %a, i32* %b) {
-; CHECK-LABEL: t7:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:         movq (%[[REG1]]), %mm0
-; CHECK-NEXT:    psrad (%[[REG2]]), %mm0
-; CHECK-NEXT:    movd %mm0, %rax
-; CHECK-NEXT:    retq
+define i64 @t7(<1 x i64>* %a, i32* %b) nounwind {
+; X86-LABEL: t7:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 12(%ebp), %eax
+; X86-NEXT:    movl 8(%ebp), %ecx
+; X86-NEXT:    movq (%ecx), %mm0
+; X86-NEXT:    movd (%eax), %mm1
+; X86-NEXT:    psrad %mm1, %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: t7:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movq (%rdi), %mm0
+; X64-NEXT:    movd (%rsi), %mm1
+; X64-NEXT:    psrad %mm1, %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    retq
 entry:
   %0 = bitcast <1 x i64>* %a to x86_mmx*
   %1 = load x86_mmx, x86_mmx* %0, align 8
@@ -136,13 +290,29 @@ entry:
 }
 declare x86_mmx @llvm.x86.mmx.psrai.d(x86_mmx, i32)
 
-define i64 @tt0(x86_mmx %t, x86_mmx* %q) {
-; CHECK-LABEL: tt0:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:    paddb (%[[REG3:[a-z]+]]), %mm0
-; CHECK-NEXT:    movd %mm0, %rax
-; CHECK-NEXT:    emms
-; CHECK-NEXT:    retq
+define i64 @tt0(x86_mmx %t, x86_mmx* %q) nounwind {
+; X86-LABEL: tt0:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    paddb (%eax), %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    emms
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: tt0:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    paddb (%rdi), %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    emms
+; X64-NEXT:    retq
 entry:
   %v = load x86_mmx, x86_mmx* %q
   %u = tail call x86_mmx @llvm.x86.mmx.padd.b(x86_mmx %t, x86_mmx %v)
@@ -153,13 +323,29 @@ entry:
 declare x86_mmx @llvm.x86.mmx.padd.b(x86_mmx, x86_mmx)
 declare void @llvm.x86.mmx.emms()
 
-define i64 @tt1(x86_mmx %t, x86_mmx* %q) {
-; CHECK-LABEL: tt1:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:    paddw (%[[REG3]]), %mm0
-; CHECK-NEXT:    movd %mm0, %rax
-; CHECK-NEXT:    emms
-; CHECK-NEXT:    retq
+define i64 @tt1(x86_mmx %t, x86_mmx* %q) nounwind {
+; X86-LABEL: tt1:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    paddw (%eax), %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    emms
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: tt1:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    paddw (%rdi), %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    emms
+; X64-NEXT:    retq
 entry:
   %v = load x86_mmx, x86_mmx* %q
   %u = tail call x86_mmx @llvm.x86.mmx.padd.w(x86_mmx %t, x86_mmx %v)
@@ -169,13 +355,29 @@ entry:
 }
 declare x86_mmx @llvm.x86.mmx.padd.w(x86_mmx, x86_mmx)
 
-define i64 @tt2(x86_mmx %t, x86_mmx* %q) {
-; CHECK-LABEL: tt2:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:    paddd (%[[REG3]]), %mm0
-; CHECK-NEXT:    movd %mm0, %rax
-; CHECK-NEXT:    emms
-; CHECK-NEXT:    retq
+define i64 @tt2(x86_mmx %t, x86_mmx* %q) nounwind {
+; X86-LABEL: tt2:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    paddd (%eax), %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    emms
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: tt2:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    paddd (%rdi), %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    emms
+; X64-NEXT:    retq
 entry:
   %v = load x86_mmx, x86_mmx* %q
   %u = tail call x86_mmx @llvm.x86.mmx.padd.d(x86_mmx %t, x86_mmx %v)
@@ -185,13 +387,29 @@ entry:
 }
 declare x86_mmx @llvm.x86.mmx.padd.d(x86_mmx, x86_mmx)
 
-define i64 @tt3(x86_mmx %t, x86_mmx* %q) {
-; CHECK-LABEL: tt3:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:    paddq (%[[REG3]]), %mm0
-; CHECK-NEXT:    movd %mm0, %rax
-; CHECK-NEXT:    emms
-; CHECK-NEXT:    retq
+define i64 @tt3(x86_mmx %t, x86_mmx* %q) nounwind {
+; X86-LABEL: tt3:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    paddq (%eax), %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    emms
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: tt3:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    paddq (%rdi), %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    emms
+; X64-NEXT:    retq
 entry:
   %v = load x86_mmx, x86_mmx* %q
   %u = tail call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %t, x86_mmx %v)
@@ -201,13 +419,29 @@ entry:
 }
 declare x86_mmx @llvm.x86.mmx.padd.q(x86_mmx, x86_mmx)
 
-define i64 @tt4(x86_mmx %t, x86_mmx* %q) {
-; CHECK-LABEL: tt4:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:    paddusb (%[[REG3]]), %mm0
-; CHECK-NEXT:    movd %mm0, %rax
-; CHECK-NEXT:    emms
-; CHECK-NEXT:    retq
+define i64 @tt4(x86_mmx %t, x86_mmx* %q) nounwind {
+; X86-LABEL: tt4:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    paddusb (%eax), %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    emms
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: tt4:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    paddusb (%rdi), %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    emms
+; X64-NEXT:    retq
 entry:
   %v = load x86_mmx, x86_mmx* %q
   %u = tail call x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx %t, x86_mmx %v)
@@ -217,13 +451,29 @@ entry:
 }
 declare x86_mmx @llvm.x86.mmx.paddus.b(x86_mmx, x86_mmx)
 
-define i64 @tt5(x86_mmx %t, x86_mmx* %q) {
-; CHECK-LABEL: tt5:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:    paddusw (%[[REG3]]), %mm0
-; CHECK-NEXT:    movd %mm0, %rax
-; CHECK-NEXT:    emms
-; CHECK-NEXT:    retq
+define i64 @tt5(x86_mmx %t, x86_mmx* %q) nounwind {
+; X86-LABEL: tt5:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    paddusw (%eax), %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    emms
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: tt5:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    paddusw (%rdi), %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    emms
+; X64-NEXT:    retq
 entry:
   %v = load x86_mmx, x86_mmx* %q
   %u = tail call x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx %t, x86_mmx %v)
@@ -233,13 +483,29 @@ entry:
 }
 declare x86_mmx @llvm.x86.mmx.paddus.w(x86_mmx, x86_mmx)
 
-define i64 @tt6(x86_mmx %t, x86_mmx* %q) {
-; CHECK-LABEL: tt6:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:    psrlw (%[[REG3]]), %mm0
-; CHECK-NEXT:    movd %mm0, %rax
-; CHECK-NEXT:    emms
-; CHECK-NEXT:    retq
+define i64 @tt6(x86_mmx %t, x86_mmx* %q) nounwind {
+; X86-LABEL: tt6:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    psrlw (%eax), %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    emms
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: tt6:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    psrlw (%rdi), %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    emms
+; X64-NEXT:    retq
 entry:
   %v = load x86_mmx, x86_mmx* %q
   %u = tail call x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx %t, x86_mmx %v)
@@ -249,13 +515,29 @@ entry:
 }
 declare x86_mmx @llvm.x86.mmx.psrl.w(x86_mmx, x86_mmx)
 
-define i64 @tt7(x86_mmx %t, x86_mmx* %q) {
-; CHECK-LABEL: tt7:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:    psrld (%[[REG3]]), %mm0
-; CHECK-NEXT:    movd %mm0, %rax
-; CHECK-NEXT:    emms
-; CHECK-NEXT:    retq
+define i64 @tt7(x86_mmx %t, x86_mmx* %q) nounwind {
+; X86-LABEL: tt7:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    psrld (%eax), %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    emms
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: tt7:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    psrld (%rdi), %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    emms
+; X64-NEXT:    retq
 entry:
   %v = load x86_mmx, x86_mmx* %q
   %u = tail call x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx %t, x86_mmx %v)
@@ -265,13 +547,29 @@ entry:
 }
 declare x86_mmx @llvm.x86.mmx.psrl.d(x86_mmx, x86_mmx)
 
-define i64 @tt8(x86_mmx %t, x86_mmx* %q) {
-; CHECK-LABEL: tt8:
-; CHECK:       # BB#0:{{.*}} %entry
-; CHECK:    psrlq (%[[REG3]]), %mm0
-; CHECK-NEXT:    movd %mm0, %rax
-; CHECK-NEXT:    emms
-; CHECK-NEXT:    retq
+define i64 @tt8(x86_mmx %t, x86_mmx* %q) nounwind {
+; X86-LABEL: tt8:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    psrlq (%eax), %mm0
+; X86-NEXT:    movq %mm0, (%esp)
+; X86-NEXT:    movl (%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    emms
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: tt8:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    psrlq (%rdi), %mm0
+; X64-NEXT:    movd %mm0, %rax
+; X64-NEXT:    emms
+; X64-NEXT:    retq
 entry:
   %v = load x86_mmx, x86_mmx* %q
   %u = tail call x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx %t, x86_mmx %v)
@@ -280,3 +578,46 @@ entry:
   ret i64 %s
 }
 declare x86_mmx @llvm.x86.mmx.psrl.q(x86_mmx, x86_mmx)
+
+define void @test_psrlq_by_volatile_shift_amount(x86_mmx* %t) nounwind {
+; X86-LABEL: test_psrlq_by_volatile_shift_amount:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:    movl %esp, %ebp
+; X86-NEXT:    andl $-8, %esp
+; X86-NEXT:    subl $16, %esp
+; X86-NEXT:    movl 8(%ebp), %eax
+; X86-NEXT:    movl $1, {{[0-9]+}}(%esp)
+; X86-NEXT:    movd {{[0-9]+}}(%esp), %mm0
+; X86-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X86-NEXT:    movl $255, {{[0-9]+}}(%esp)
+; X86-NEXT:    movq {{[0-9]+}}(%esp), %mm1
+; X86-NEXT:    psrlq %mm0, %mm1
+; X86-NEXT:    movq %mm1, (%eax)
+; X86-NEXT:    movl %ebp, %esp
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_psrlq_by_volatile_shift_amount:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movl $1, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movd -{{[0-9]+}}(%rsp), %mm0
+; X64-NEXT:    movl $255, %eax
+; X64-NEXT:    movd %rax, %mm1
+; X64-NEXT:    psrlq %mm0, %mm1
+; X64-NEXT:    movq %mm1, (%rdi)
+; X64-NEXT:    retq
+entry:
+  %0 = alloca i32, align 4
+  %1 = bitcast i32* %0 to i8*
+  call void @llvm.lifetime.start(i64 4, i8* nonnull %1)
+  store volatile i32 1, i32* %0, align 4
+  %2 = load volatile i32, i32* %0, align 4
+  %3 = tail call x86_mmx @llvm.x86.mmx.psrli.q(x86_mmx bitcast (<1 x i64> <i64 255> to x86_mmx), i32 %2)
+  store x86_mmx %3, x86_mmx* %t, align 8
+  call void @llvm.lifetime.end(i64 4, i8* nonnull %1)
+  ret void
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.end(i64, i8* nocapture)
diff --git a/test/CodeGen/X86/mul-constant-i16.ll b/test/CodeGen/X86/mul-constant-i16.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e3e2737cf3e62f44a4e5a7ee7f2be8cf44fabd5e
--- /dev/null
+++ b/test/CodeGen/X86/mul-constant-i16.ll
@@ -0,0 +1,589 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
+
+define i16 @test_mul_by_1(i16 %x) {
+; X86-LABEL: test_mul_by_1:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_1:
+; X64:       # BB#0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 1
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_2(i16 %x) {
+; X86-LABEL: test_mul_by_2:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_2:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi), %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 2
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_3(i16 %x) {
+; X86-LABEL: test_mul_by_3:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_3:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,2), %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 3
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_4(i16 %x) {
+; X86-LABEL: test_mul_by_4:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $2, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_4:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (,%rdi,4), %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 4
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_5(i16 %x) {
+; X86-LABEL: test_mul_by_5:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,4), %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_5:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 5
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_6(i16 %x) {
+; X86-LABEL: test_mul_by_6:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_6:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    addl %edi, %edi
+; X64-NEXT:    leal (%rdi,%rdi,2), %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 6
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_7(i16 %x) {
+; X86-LABEL: test_mul_by_7:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (,%ecx,8), %eax
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_7:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (,%rdi,8), %eax
+; X64-NEXT:    subl %edi, %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 7
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_8(i16 %x) {
+; X86-LABEL: test_mul_by_8:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $3, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_8:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (,%rdi,8), %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 8
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_9(i16 %x) {
+; X86-LABEL: test_mul_by_9:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_9:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,8), %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 9
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_10(i16 %x) {
+; X86-LABEL: test_mul_by_10:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    leal (%eax,%eax,4), %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_10:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    addl %edi, %edi
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 10
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_11(i16 %x) {
+; X86-LABEL: test_mul_by_11:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $11, %eax, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_11:
+; X64:       # BB#0:
+; X64-NEXT:    imull $11, %edi, %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 11
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_12(i16 %x) {
+; X86-LABEL: test_mul_by_12:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $2, %eax
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_12:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    shll $2, %edi
+; X64-NEXT:    leal (%rdi,%rdi,2), %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 12
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_13(i16 %x) {
+; X86-LABEL: test_mul_by_13:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $13, %eax, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_13:
+; X64:       # BB#0:
+; X64-NEXT:    imull $13, %edi, %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 13
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_14(i16 %x) {
+; X86-LABEL: test_mul_by_14:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $14, %eax, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_14:
+; X64:       # BB#0:
+; X64-NEXT:    imull $14, %edi, %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 14
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_15(i16 %x) {
+; X86-LABEL: test_mul_by_15:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,4), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_15:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    leal (%rax,%rax,2), %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 15
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_16(i16 %x) {
+; X86-LABEL: test_mul_by_16:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $4, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_16:
+; X64:       # BB#0:
+; X64-NEXT:    shll $4, %edi
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 16
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_17(i16 %x) {
+; X86-LABEL: test_mul_by_17:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shll $4, %eax
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_17:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shll $4, %eax
+; X64-NEXT:    leal (%rax,%rdi), %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 17
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_18(i16 %x) {
+; X86-LABEL: test_mul_by_18:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    leal (%eax,%eax,8), %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_18:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    addl %edi, %edi
+; X64-NEXT:    leal (%rdi,%rdi,8), %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 18
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_19(i16 %x) {
+; X86-LABEL: test_mul_by_19:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $19, %eax, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_19:
+; X64:       # BB#0:
+; X64-NEXT:    imull $19, %edi, %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 19
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_20(i16 %x) {
+; X86-LABEL: test_mul_by_20:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $2, %eax
+; X86-NEXT:    leal (%eax,%eax,4), %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_20:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    shll $2, %edi
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 20
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_21(i16 %x) {
+; X86-LABEL: test_mul_by_21:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $21, %eax, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_21:
+; X64:       # BB#0:
+; X64-NEXT:    imull $21, %edi, %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 21
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_22(i16 %x) {
+; X86-LABEL: test_mul_by_22:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $22, %eax, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_22:
+; X64:       # BB#0:
+; X64-NEXT:    imull $22, %edi, %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 22
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_23(i16 %x) {
+; X86-LABEL: test_mul_by_23:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $23, %eax, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_23:
+; X64:       # BB#0:
+; X64-NEXT:    imull $23, %edi, %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 23
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_24(i16 %x) {
+; X86-LABEL: test_mul_by_24:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $3, %eax
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_24:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    shll $3, %edi
+; X64-NEXT:    leal (%rdi,%rdi,2), %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 24
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_25(i16 %x) {
+; X86-LABEL: test_mul_by_25:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,4), %eax
+; X86-NEXT:    leal (%eax,%eax,4), %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_25:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    leal (%rax,%rax,4), %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 25
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_26(i16 %x) {
+; X86-LABEL: test_mul_by_26:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $26, %eax, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_26:
+; X64:       # BB#0:
+; X64-NEXT:    imull $26, %edi, %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 26
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_27(i16 %x) {
+; X86-LABEL: test_mul_by_27:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_27:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,8), %eax
+; X64-NEXT:    leal (%rax,%rax,2), %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 27
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_28(i16 %x) {
+; X86-LABEL: test_mul_by_28:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $28, %eax, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_28:
+; X64:       # BB#0:
+; X64-NEXT:    imull $28, %edi, %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 28
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_29(i16 %x) {
+; X86-LABEL: test_mul_by_29:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $29, %eax, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_29:
+; X64:       # BB#0:
+; X64-NEXT:    imull $29, %edi, %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 29
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_30(i16 %x) {
+; X86-LABEL: test_mul_by_30:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    imull $30, %eax, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_30:
+; X64:       # BB#0:
+; X64-NEXT:    imull $30, %edi, %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 30
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_31(i16 %x) {
+; X86-LABEL: test_mul_by_31:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shll $5, %eax
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_31:
+; X64:       # BB#0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shll $5, %eax
+; X64-NEXT:    subl %edi, %eax
+; X64-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 31
+  ret i16 %mul
+}
+
+define i16 @test_mul_by_32(i16 %x) {
+; X86-LABEL: test_mul_by_32:
+; X86:       # BB#0:
+; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $5, %eax
+; X86-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_32:
+; X64:       # BB#0:
+; X64-NEXT:    shll $5, %edi
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i16 %x, 32
+  ret i16 %mul
+}
diff --git a/test/CodeGen/X86/mul-constant-i32.ll b/test/CodeGen/X86/mul-constant-i32.ll
new file mode 100644
index 0000000000000000000000000000000000000000..76e46e1f1b09e75d74dc4fc2addc1257922a8189
--- /dev/null
+++ b/test/CodeGen/X86/mul-constant-i32.ll
@@ -0,0 +1,515 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
+
+define i32 @test_mul_by_1(i32 %x) {
+; X86-LABEL: test_mul_by_1:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_1:
+; X64:       # BB#0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 1
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_2(i32 %x) {
+; X86-LABEL: test_mul_by_2:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_2:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi), %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 2
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_3(i32 %x) {
+; X86-LABEL: test_mul_by_3:
+; X86:       # BB#0:
+; X86-NEXT:    imull $3, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_3:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,2), %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 3
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_4(i32 %x) {
+; X86-LABEL: test_mul_by_4:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $2, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_4:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (,%rdi,4), %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 4
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_5(i32 %x) {
+; X86-LABEL: test_mul_by_5:
+; X86:       # BB#0:
+; X86-NEXT:    imull $5, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_5:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 5
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_6(i32 %x) {
+; X86-LABEL: test_mul_by_6:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_6:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    addl %edi, %edi
+; X64-NEXT:    leal (%rdi,%rdi,2), %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 6
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_7(i32 %x) {
+; X86-LABEL: test_mul_by_7:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    leal (,%ecx,8), %eax
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_7:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (,%rdi,8), %eax
+; X64-NEXT:    subl %edi, %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 7
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_8(i32 %x) {
+; X86-LABEL: test_mul_by_8:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $3, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_8:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (,%rdi,8), %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 8
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_9(i32 %x) {
+; X86-LABEL: test_mul_by_9:
+; X86:       # BB#0:
+; X86-NEXT:    imull $9, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_9:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,8), %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 9
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_10(i32 %x) {
+; X86-LABEL: test_mul_by_10:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    leal (%eax,%eax,4), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_10:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    addl %edi, %edi
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 10
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_11(i32 %x) {
+; X86-LABEL: test_mul_by_11:
+; X86:       # BB#0:
+; X86-NEXT:    imull $11, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_11:
+; X64:       # BB#0:
+; X64-NEXT:    imull $11, %edi, %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 11
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_12(i32 %x) {
+; X86-LABEL: test_mul_by_12:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $2, %eax
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_12:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    shll $2, %edi
+; X64-NEXT:    leal (%rdi,%rdi,2), %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 12
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_13(i32 %x) {
+; X86-LABEL: test_mul_by_13:
+; X86:       # BB#0:
+; X86-NEXT:    imull $13, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_13:
+; X64:       # BB#0:
+; X64-NEXT:    imull $13, %edi, %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 13
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_14(i32 %x) {
+; X86-LABEL: test_mul_by_14:
+; X86:       # BB#0:
+; X86-NEXT:    imull $14, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_14:
+; X64:       # BB#0:
+; X64-NEXT:    imull $14, %edi, %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 14
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_15(i32 %x) {
+; X86-LABEL: test_mul_by_15:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,4), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_15:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    leal (%rax,%rax,2), %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 15
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_16(i32 %x) {
+; X86-LABEL: test_mul_by_16:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $4, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_16:
+; X64:       # BB#0:
+; X64-NEXT:    shll $4, %edi
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 16
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_17(i32 %x) {
+; X86-LABEL: test_mul_by_17:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shll $4, %eax
+; X86-NEXT:    addl %ecx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_17:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shll $4, %eax
+; X64-NEXT:    leal (%rax,%rdi), %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 17
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_18(i32 %x) {
+; X86-LABEL: test_mul_by_18:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    leal (%eax,%eax,8), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_18:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    addl %edi, %edi
+; X64-NEXT:    leal (%rdi,%rdi,8), %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 18
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_19(i32 %x) {
+; X86-LABEL: test_mul_by_19:
+; X86:       # BB#0:
+; X86-NEXT:    imull $19, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_19:
+; X64:       # BB#0:
+; X64-NEXT:    imull $19, %edi, %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 19
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_20(i32 %x) {
+; X86-LABEL: test_mul_by_20:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $2, %eax
+; X86-NEXT:    leal (%eax,%eax,4), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_20:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    shll $2, %edi
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 20
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_21(i32 %x) {
+; X86-LABEL: test_mul_by_21:
+; X86:       # BB#0:
+; X86-NEXT:    imull $21, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_21:
+; X64:       # BB#0:
+; X64-NEXT:    imull $21, %edi, %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 21
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_22(i32 %x) {
+; X86-LABEL: test_mul_by_22:
+; X86:       # BB#0:
+; X86-NEXT:    imull $22, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_22:
+; X64:       # BB#0:
+; X64-NEXT:    imull $22, %edi, %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 22
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_23(i32 %x) {
+; X86-LABEL: test_mul_by_23:
+; X86:       # BB#0:
+; X86-NEXT:    imull $23, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_23:
+; X64:       # BB#0:
+; X64-NEXT:    imull $23, %edi, %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 23
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_24(i32 %x) {
+; X86-LABEL: test_mul_by_24:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $3, %eax
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_24:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    shll $3, %edi
+; X64-NEXT:    leal (%rdi,%rdi,2), %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 24
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_25(i32 %x) {
+; X86-LABEL: test_mul_by_25:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,4), %eax
+; X86-NEXT:    leal (%eax,%eax,4), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_25:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,4), %eax
+; X64-NEXT:    leal (%rax,%rax,4), %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 25
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_26(i32 %x) {
+; X86-LABEL: test_mul_by_26:
+; X86:       # BB#0:
+; X86-NEXT:    imull $26, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_26:
+; X64:       # BB#0:
+; X64-NEXT:    imull $26, %edi, %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 26
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_27(i32 %x) {
+; X86-LABEL: test_mul_by_27:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_27:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    leal (%rdi,%rdi,8), %eax
+; X64-NEXT:    leal (%rax,%rax,2), %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 27
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_28(i32 %x) {
+; X86-LABEL: test_mul_by_28:
+; X86:       # BB#0:
+; X86-NEXT:    imull $28, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_28:
+; X64:       # BB#0:
+; X64-NEXT:    imull $28, %edi, %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 28
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_29(i32 %x) {
+; X86-LABEL: test_mul_by_29:
+; X86:       # BB#0:
+; X86-NEXT:    imull $29, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_29:
+; X64:       # BB#0:
+; X64-NEXT:    imull $29, %edi, %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 29
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_30(i32 %x) {
+; X86-LABEL: test_mul_by_30:
+; X86:       # BB#0:
+; X86-NEXT:    imull $30, {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_30:
+; X64:       # BB#0:
+; X64-NEXT:    imull $30, %edi, %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 30
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_31(i32 %x) {
+; X86-LABEL: test_mul_by_31:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    shll $5, %eax
+; X86-NEXT:    subl %ecx, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_31:
+; X64:       # BB#0:
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    shll $5, %eax
+; X64-NEXT:    subl %edi, %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 31
+  ret i32 %mul
+}
+
+define i32 @test_mul_by_32(i32 %x) {
+; X86-LABEL: test_mul_by_32:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    shll $5, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_32:
+; X64:       # BB#0:
+; X64-NEXT:    shll $5, %edi
+; X64-NEXT:    movl %edi, %eax
+; X64-NEXT:    retq
+  %mul = mul nsw i32 %x, 32
+  ret i32 %mul
+}
diff --git a/test/CodeGen/X86/mul-constant-i64.ll b/test/CodeGen/X86/mul-constant-i64.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8579179a82315302d4c5ccbc2bfe620f5b7d566d
--- /dev/null
+++ b/test/CodeGen/X86/mul-constant-i64.ll
@@ -0,0 +1,581 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
+
+define i64 @test_mul_by_1(i64 %x) {
+; X86-LABEL: test_mul_by_1:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_1:
+; X64:       # BB#0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 1
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_2(i64 %x) {
+; X86-LABEL: test_mul_by_2:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shldl $1, %eax, %edx
+; X86-NEXT:    addl %eax, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_2:
+; X64:       # BB#0:
+; X64-NEXT:    leaq (%rdi,%rdi), %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 2
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_3(i64 %x) {
+; X86-LABEL: test_mul_by_3:
+; X86:       # BB#0:
+; X86-NEXT:    movl $3, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    imull $3, {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_3:
+; X64:       # BB#0:
+; X64-NEXT:    leaq (%rdi,%rdi,2), %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 3
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_4(i64 %x) {
+; X86-LABEL: test_mul_by_4:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shldl $2, %eax, %edx
+; X86-NEXT:    shll $2, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_4:
+; X64:       # BB#0:
+; X64-NEXT:    leaq (,%rdi,4), %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 4
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_5(i64 %x) {
+; X86-LABEL: test_mul_by_5:
+; X86:       # BB#0:
+; X86-NEXT:    movl $5, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    imull $5, {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_5:
+; X64:       # BB#0:
+; X64-NEXT:    leaq (%rdi,%rdi,4), %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 5
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_6(i64 %x) {
+; X86-LABEL: test_mul_by_6:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %ecx
+; X86-NEXT:    movl $6, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    leal (%edx,%ecx,2), %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_6:
+; X64:       # BB#0:
+; X64-NEXT:    addq %rdi, %rdi
+; X64-NEXT:    leaq (%rdi,%rdi,2), %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 6
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_7(i64 %x) {
+; X86-LABEL: test_mul_by_7:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (,%eax,8), %ecx
+; X86-NEXT:    subl %eax, %ecx
+; X86-NEXT:    movl $7, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_7:
+; X64:       # BB#0:
+; X64-NEXT:    leaq (,%rdi,8), %rax
+; X64-NEXT:    subq %rdi, %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 7
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_8(i64 %x) {
+; X86-LABEL: test_mul_by_8:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shldl $3, %eax, %edx
+; X86-NEXT:    shll $3, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_8:
+; X64:       # BB#0:
+; X64-NEXT:    leaq (,%rdi,8), %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 8
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_9(i64 %x) {
+; X86-LABEL: test_mul_by_9:
+; X86:       # BB#0:
+; X86-NEXT:    movl $9, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    imull $9, {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_9:
+; X64:       # BB#0:
+; X64-NEXT:    leaq (%rdi,%rdi,8), %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 9
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_10(i64 %x) {
+; X86-LABEL: test_mul_by_10:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,4), %ecx
+; X86-NEXT:    movl $10, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    leal (%edx,%ecx,2), %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_10:
+; X64:       # BB#0:
+; X64-NEXT:    addq %rdi, %rdi
+; X64-NEXT:    leaq (%rdi,%rdi,4), %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 10
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_11(i64 %x) {
+; X86-LABEL: test_mul_by_11:
+; X86:       # BB#0:
+; X86-NEXT:    movl $11, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    imull $11, {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_11:
+; X64:       # BB#0:
+; X64-NEXT:    imulq $11, %rdi, %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 11
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_12(i64 %x) {
+; X86-LABEL: test_mul_by_12:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %ecx
+; X86-NEXT:    movl $12, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    leal (%edx,%ecx,4), %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_12:
+; X64:       # BB#0:
+; X64-NEXT:    shlq $2, %rdi
+; X64-NEXT:    leaq (%rdi,%rdi,2), %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 12
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_13(i64 %x) {
+; X86-LABEL: test_mul_by_13:
+; X86:       # BB#0:
+; X86-NEXT:    movl $13, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    imull $13, {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_13:
+; X64:       # BB#0:
+; X64-NEXT:    imulq $13, %rdi, %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 13
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_14(i64 %x) {
+; X86-LABEL: test_mul_by_14:
+; X86:       # BB#0:
+; X86-NEXT:    movl $14, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    imull $14, {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_14:
+; X64:       # BB#0:
+; X64-NEXT:    imulq $14, %rdi, %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 14
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_15(i64 %x) {
+; X86-LABEL: test_mul_by_15:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $15, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    leal (%ecx,%ecx,4), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_15:
+; X64:       # BB#0:
+; X64-NEXT:    leaq (%rdi,%rdi,4), %rax
+; X64-NEXT:    leaq (%rax,%rax,2), %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 15
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_16(i64 %x) {
+; X86-LABEL: test_mul_by_16:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shldl $4, %eax, %edx
+; X86-NEXT:    shll $4, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_16:
+; X64:       # BB#0:
+; X64-NEXT:    shlq $4, %rdi
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 16
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_17(i64 %x) {
+; X86-LABEL: test_mul_by_17:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shll $4, %ecx
+; X86-NEXT:    addl %eax, %ecx
+; X86-NEXT:    movl $17, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_17:
+; X64:       # BB#0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    shlq $4, %rax
+; X64-NEXT:    leaq (%rax,%rdi), %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 17
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_18(i64 %x) {
+; X86-LABEL: test_mul_by_18:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,8), %ecx
+; X86-NEXT:    movl $18, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    leal (%edx,%ecx,2), %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_18:
+; X64:       # BB#0:
+; X64-NEXT:    addq %rdi, %rdi
+; X64-NEXT:    leaq (%rdi,%rdi,8), %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 18
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_19(i64 %x) {
+; X86-LABEL: test_mul_by_19:
+; X86:       # BB#0:
+; X86-NEXT:    movl $19, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    imull $19, {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_19:
+; X64:       # BB#0:
+; X64-NEXT:    imulq $19, %rdi, %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 19
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_20(i64 %x) {
+; X86-LABEL: test_mul_by_20:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,4), %ecx
+; X86-NEXT:    movl $20, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    leal (%edx,%ecx,4), %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_20:
+; X64:       # BB#0:
+; X64-NEXT:    shlq $2, %rdi
+; X64-NEXT:    leaq (%rdi,%rdi,4), %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 20
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_21(i64 %x) {
+; X86-LABEL: test_mul_by_21:
+; X86:       # BB#0:
+; X86-NEXT:    movl $21, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    imull $21, {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_21:
+; X64:       # BB#0:
+; X64-NEXT:    imulq $21, %rdi, %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 21
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_22(i64 %x) {
+; X86-LABEL: test_mul_by_22:
+; X86:       # BB#0:
+; X86-NEXT:    movl $22, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    imull $22, {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_22:
+; X64:       # BB#0:
+; X64-NEXT:    imulq $22, %rdi, %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 22
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_23(i64 %x) {
+; X86-LABEL: test_mul_by_23:
+; X86:       # BB#0:
+; X86-NEXT:    movl $23, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    imull $23, {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_23:
+; X64:       # BB#0:
+; X64-NEXT:    imulq $23, %rdi, %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 23
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_24(i64 %x) {
+; X86-LABEL: test_mul_by_24:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    leal (%eax,%eax,2), %ecx
+; X86-NEXT:    movl $24, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    leal (%edx,%ecx,8), %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_24:
+; X64:       # BB#0:
+; X64-NEXT:    shlq $3, %rdi
+; X64-NEXT:    leaq (%rdi,%rdi,2), %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 24
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_25(i64 %x) {
+; X86-LABEL: test_mul_by_25:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $25, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    leal (%ecx,%ecx,4), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,4), %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_25:
+; X64:       # BB#0:
+; X64-NEXT:    leaq (%rdi,%rdi,4), %rax
+; X64-NEXT:    leaq (%rax,%rax,4), %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 25
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_26(i64 %x) {
+; X86-LABEL: test_mul_by_26:
+; X86:       # BB#0:
+; X86-NEXT:    movl $26, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    imull $26, {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_26:
+; X64:       # BB#0:
+; X64-NEXT:    imulq $26, %rdi, %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 26
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_27(i64 %x) {
+; X86-LABEL: test_mul_by_27:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl $27, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    leal (%ecx,%ecx,8), %ecx
+; X86-NEXT:    leal (%ecx,%ecx,2), %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_27:
+; X64:       # BB#0:
+; X64-NEXT:    leaq (%rdi,%rdi,8), %rax
+; X64-NEXT:    leaq (%rax,%rax,2), %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 27
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_28(i64 %x) {
+; X86-LABEL: test_mul_by_28:
+; X86:       # BB#0:
+; X86-NEXT:    movl $28, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    imull $28, {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_28:
+; X64:       # BB#0:
+; X64-NEXT:    imulq $28, %rdi, %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 28
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_29(i64 %x) {
+; X86-LABEL: test_mul_by_29:
+; X86:       # BB#0:
+; X86-NEXT:    movl $29, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    imull $29, {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_29:
+; X64:       # BB#0:
+; X64-NEXT:    imulq $29, %rdi, %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 29
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_30(i64 %x) {
+; X86-LABEL: test_mul_by_30:
+; X86:       # BB#0:
+; X86-NEXT:    movl $30, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    imull $30, {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_30:
+; X64:       # BB#0:
+; X64-NEXT:    imulq $30, %rdi, %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 30
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_31(i64 %x) {
+; X86-LABEL: test_mul_by_31:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ecx
+; X86-NEXT:    shll $5, %ecx
+; X86-NEXT:    subl %eax, %ecx
+; X86-NEXT:    movl $31, %eax
+; X86-NEXT:    mull {{[0-9]+}}(%esp)
+; X86-NEXT:    addl %ecx, %edx
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_31:
+; X64:       # BB#0:
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    shlq $5, %rax
+; X64-NEXT:    subq %rdi, %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 31
+  ret i64 %mul
+}
+
+define i64 @test_mul_by_32(i64 %x) {
+; X86-LABEL: test_mul_by_32:
+; X86:       # BB#0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    shldl $5, %eax, %edx
+; X86-NEXT:    shll $5, %eax
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_mul_by_32:
+; X64:       # BB#0:
+; X64-NEXT:    shlq $5, %rdi
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    retq
+  %mul = mul nsw i64 %x, 32
+  ret i64 %mul
+}
diff --git a/test/CodeGen/X86/mul-i256.ll b/test/CodeGen/X86/mul-i256.ll
index 8f207b8dd086e04db3c7a9cf20693ec2d968a3c5..bb2989b9298e892d51d1c289590e4a4f131fb3e9 100644
--- a/test/CodeGen/X86/mul-i256.ll
+++ b/test/CodeGen/X86/mul-i256.ll
@@ -1,8 +1,284 @@
-; RUN: llc < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
+
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 define void @test(i256* %a, i256* %b, i256* %out) #0 {
+; X32-LABEL: test:
+; X32:       # BB#0: # %entry
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:  .Lcfi0:
+; X32-NEXT:    .cfi_def_cfa_offset 8
+; X32-NEXT:  .Lcfi1:
+; X32-NEXT:    .cfi_offset %ebp, -8
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:  .Lcfi2:
+; X32-NEXT:    .cfi_def_cfa_register %ebp
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    andl $-8, %esp
+; X32-NEXT:    subl $168, %esp
+; X32-NEXT:  .Lcfi3:
+; X32-NEXT:    .cfi_offset %esi, -20
+; X32-NEXT:  .Lcfi4:
+; X32-NEXT:    .cfi_offset %edi, -16
+; X32-NEXT:  .Lcfi5:
+; X32-NEXT:    .cfi_offset %ebx, -12
+; X32-NEXT:    movl 8(%ebp), %eax
+; X32-NEXT:    movl 16(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 20(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 24(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 28(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 8(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 12(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl (%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 4(%eax), %ebx
+; X32-NEXT:    movl 12(%ebp), %eax
+; X32-NEXT:    movl 16(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 20(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 24(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 28(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl (%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 4(%eax), %ecx
+; X32-NEXT:    movl %ecx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl 8(%eax), %esi
+; X32-NEXT:    movl 12(%eax), %edi
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi # 4-byte Reload
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %ebx
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    adcl $0, %eax
+; X32-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    adcl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl %esi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    adcl $0, %edi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    adcl $0, %ebx
+; X32-NEXT:    xorl %edx, %edx
+; X32-NEXT:    addl %ecx, %edi
+; X32-NEXT:    adcl %eax, %ebx
+; X32-NEXT:    adcl $0, %edx
+; X32-NEXT:    sbbl %eax, %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    addl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl {{[0-9]+}}(%esp) # 4-byte Folded Reload
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    addl {{[0-9]+}}(%esp), %esi
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    adcl {{[0-9]+}}(%esp), %ecx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    adcl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    addl %edi, %esi
+; X32-NEXT:    adcl %ebx, %ecx
+; X32-NEXT:    adcl {{[0-9]+}}(%esp), %edx # 4-byte Folded Reload
+; X32-NEXT:    adcl {{[0-9]+}}(%esp), %eax # 4-byte Folded Reload
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X32-NEXT:    movl 16(%ebp), %edi
+; X32-NEXT:    movl %ebx, 4(%edi)
+; X32-NEXT:    movl 16(%ebp), %ebx
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, (%ebx)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, 8(%ebx)
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %edi # 4-byte Reload
+; X32-NEXT:    movl %edi, 12(%ebx)
+; X32-NEXT:    movl %esi, 16(%ebx)
+; X32-NEXT:    movl %ecx, 20(%ebx)
+; X32-NEXT:    movl %edx, 24(%ebx)
+; X32-NEXT:    movl %eax, 28(%ebx)
+; X32-NEXT:    leal -12(%ebp), %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebx
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl
+;
+; X64-LABEL: test:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    pushq %r15
+; X64-NEXT:  .Lcfi0:
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    pushq %r14
+; X64-NEXT:  .Lcfi1:
+; X64-NEXT:    .cfi_def_cfa_offset 24
+; X64-NEXT:    pushq %r12
+; X64-NEXT:  .Lcfi2:
+; X64-NEXT:    .cfi_def_cfa_offset 32
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:  .Lcfi3:
+; X64-NEXT:    .cfi_def_cfa_offset 40
+; X64-NEXT:  .Lcfi4:
+; X64-NEXT:    .cfi_offset %rbx, -40
+; X64-NEXT:  .Lcfi5:
+; X64-NEXT:    .cfi_offset %r12, -32
+; X64-NEXT:  .Lcfi6:
+; X64-NEXT:    .cfi_offset %r14, -24
+; X64-NEXT:  .Lcfi7:
+; X64-NEXT:    .cfi_offset %r15, -16
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq (%rdi), %r14
+; X64-NEXT:    movq 8(%rdi), %r8
+; X64-NEXT:    movq 16(%rdi), %rcx
+; X64-NEXT:    movq 16(%rsi), %rbx
+; X64-NEXT:    movq (%rsi), %r12
+; X64-NEXT:    movq 8(%rsi), %r15
+; X64-NEXT:    movq 24(%rdi), %rdi
+; X64-NEXT:    imulq %r12, %rdi
+; X64-NEXT:    movq %r12, %rax
+; X64-NEXT:    mulq %rcx
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    addq %rdi, %rdx
+; X64-NEXT:    imulq %r15, %rcx
+; X64-NEXT:    addq %rdx, %rcx
+; X64-NEXT:    movq %rbx, %rdi
+; X64-NEXT:    imulq %r8, %rdi
+; X64-NEXT:    movq %rbx, %rax
+; X64-NEXT:    mulq %r14
+; X64-NEXT:    movq %rax, %r11
+; X64-NEXT:    addq %rdi, %rdx
+; X64-NEXT:    movq 24(%rsi), %rbx
+; X64-NEXT:    imulq %r14, %rbx
+; X64-NEXT:    addq %rdx, %rbx
+; X64-NEXT:    addq %r9, %r11
+; X64-NEXT:    adcq %rcx, %rbx
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %r9
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %r12
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rax, %rdi
+; X64-NEXT:    addq %rsi, %rdi
+; X64-NEXT:    adcq $0, %rcx
+; X64-NEXT:    movq %r14, %rax
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    movq %rdx, %rsi
+; X64-NEXT:    movq %rax, %r14
+; X64-NEXT:    addq %rdi, %r14
+; X64-NEXT:    adcq $0, %rsi
+; X64-NEXT:    addq %rcx, %rsi
+; X64-NEXT:    sbbq %rcx, %rcx
+; X64-NEXT:    andl $1, %ecx
+; X64-NEXT:    movq %r8, %rax
+; X64-NEXT:    mulq %r15
+; X64-NEXT:    addq %rsi, %rax
+; X64-NEXT:    adcq %rcx, %rdx
+; X64-NEXT:    addq %r11, %rax
+; X64-NEXT:    adcq %rbx, %rdx
+; X64-NEXT:    movq %r9, (%r10)
+; X64-NEXT:    movq %r14, 8(%r10)
+; X64-NEXT:    movq %rax, 16(%r10)
+; X64-NEXT:    movq %rdx, 24(%r10)
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r12
+; X64-NEXT:    popq %r14
+; X64-NEXT:    popq %r15
+; X64-NEXT:    retq
 entry:
   %av = load i256, i256* %a
   %bv = load i256, i256* %b
@@ -11,22 +287,4 @@ entry:
   ret void
 }
 
-; CHECK-LABEL: @test
-; There is a lot of inter-register motion, and so matching the instruction
-; sequence will be fragile. There should be 6 underlying multiplications.
-; CHECK: imulq
-; CHECK: mulq
-; CHECK: imulq
-; CHECK: imulq
-; CHECK: mulq
-; CHECK: imulq
-; CHECK: mulq
-; CHECK: mulq
-; CHECK: mulq
-; CHECK: mulq
-; CHECK-NOT: imulq
-; CHECK-NOT: mulq
-; CHECK: retq
-
 attributes #0 = { norecurse nounwind uwtable "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" }
-
diff --git a/test/CodeGen/X86/mulx32.ll b/test/CodeGen/X86/mulx32.ll
index 42ef2eb6f6470825f348e3a9011759aeed882b2d..9ebd380170d3017a71e02e0df7d1a3d1b4f6bed4 100644
--- a/test/CodeGen/X86/mulx32.ll
+++ b/test/CodeGen/X86/mulx32.ll
@@ -1,22 +1,29 @@
-; RUN: llc -mcpu=core-avx2 -march=x86 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+bmi2 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=core-avx2 | FileCheck %s
 
 define i64 @f1(i32 %a, i32 %b) {
+; CHECK-LABEL: f1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    mulxl {{[0-9]+}}(%esp), %eax, %edx
+; CHECK-NEXT:    retl
   %x = zext i32 %a to i64
   %y = zext i32 %b to i64
   %r = mul i64 %x, %y
-; CHECK: f1
-; CHECK: mulxl
-; CHECK: ret
   ret i64 %r
 }
 
 define i64 @f2(i32 %a, i32* %p) {
+; CHECK-LABEL: f2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    mulxl (%eax), %eax, %edx
+; CHECK-NEXT:    retl
   %b = load i32, i32* %p
   %x = zext i32 %a to i64
   %y = zext i32 %b to i64
   %r = mul i64 %x, %y
-; CHECK: f2
-; CHECK: mulxl ({{.+}}), %{{.+}}, %{{.+}}
-; CHECK: ret
   ret i64 %r
 }
diff --git a/test/CodeGen/X86/mulx64.ll b/test/CodeGen/X86/mulx64.ll
index 808c02290b7c91c2b3c61543cd2424778ab66f24..7cc10e017fc6c9829009e9a2cc7c7ee6c8a27a34 100644
--- a/test/CodeGen/X86/mulx64.ll
+++ b/test/CodeGen/X86/mulx64.ll
@@ -1,22 +1,28 @@
-; RUN: llc -mcpu=core-avx2 -march=x86-64 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+bmi2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=core-avx2 | FileCheck %s
 
 define i128 @f1(i64 %a, i64 %b) {
+; CHECK-LABEL: f1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movq %rdi, %rdx
+; CHECK-NEXT:    mulxq %rsi, %rax, %rdx
+; CHECK-NEXT:    retq
   %x = zext i64 %a to i128
   %y = zext i64 %b to i128
   %r = mul i128 %x, %y
-; CHECK: f1
-; CHECK: mulxq
-; CHECK: ret
   ret i128 %r
 }
 
 define i128 @f2(i64 %a, i64* %p) {
+; CHECK-LABEL: f2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movq %rdi, %rdx
+; CHECK-NEXT:    mulxq (%rsi), %rax, %rdx
+; CHECK-NEXT:    retq
   %b = load i64, i64* %p
   %x = zext i64 %a to i128
   %y = zext i64 %b to i128
   %r = mul i128 %x, %y
-; CHECK: f2
-; CHECK: mulxq ({{.+}}), %{{.+}}, %{{.+}}
-; CHECK: ret
   ret i128 %r
 }
diff --git a/test/CodeGen/X86/neg_cmp.ll b/test/CodeGen/X86/neg_cmp.ll
index 79050720d8e7b7342b76d127b9da735bf0f5695a..cc82857706c00d39589eaeb66a5393aecb5ff44a 100644
--- a/test/CodeGen/X86/neg_cmp.ll
+++ b/test/CodeGen/X86/neg_cmp.ll
@@ -1,22 +1,50 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
 
 ; rdar://11245199
 ; PR12545
-define void @f(i32 %x, i32 %y) nounwind uwtable ssp {
-entry:
-; CHECK-LABEL: f:
-; CHECK-NOT: neg
-; CHECK: add
+
+declare void @g()
+
+define void @neg_cmp(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: neg_cmp:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    addl %esi, %edi
+; CHECK-NEXT:    jne .LBB0_1
+; CHECK-NEXT:  # BB#2: # %if.then
+; CHECK-NEXT:    jmp g # TAILCALL
+; CHECK-NEXT:  .LBB0_1: # %if.end
+; CHECK-NEXT:    retq
   %sub = sub i32 0, %y
   %cmp = icmp eq i32 %x, %sub
   br i1 %cmp, label %if.then, label %if.end
 
-if.then:                                          ; preds = %entry
+if.then:
   tail call void @g() nounwind
   br label %if.end
 
-if.end:                                           ; preds = %if.then, %entry
+if.end:
+  ret void
+}
+
+define void @neg_cmp_commuted(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: neg_cmp_commuted:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    addl %esi, %edi
+; CHECK-NEXT:    jne .LBB1_1
+; CHECK-NEXT:  # BB#2: # %if.then
+; CHECK-NEXT:    jmp g # TAILCALL
+; CHECK-NEXT:  .LBB1_1: # %if.end
+; CHECK-NEXT:    retq
+  %sub = sub i32 0, %y
+  %cmp = icmp eq i32 %sub, %x
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:
+  tail call void @g() nounwind
+  br label %if.end
+
+if.end:
   ret void
 }
 
-declare void @g()
diff --git a/test/CodeGen/X86/nontemporal-2.ll b/test/CodeGen/X86/nontemporal-2.ll
index 92a35436d90d5da7898be38f5db9c48c2a1f8b0e..d1bb8d3e923b66a720c42749c8f5d2a50eb87760 100644
--- a/test/CodeGen/X86/nontemporal-2.ll
+++ b/test/CodeGen/X86/nontemporal-2.ll
@@ -255,6 +255,7 @@ define void @test_zero_v8f32(<8 x float>* %dst) {
 ; VLX:       # BB#0:
 ; VLX-NEXT:    vpxor %ymm0, %ymm0, %ymm0
 ; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   store <8 x float> zeroinitializer, <8 x float>* %dst, align 32, !nontemporal !1
   ret void
@@ -279,6 +280,7 @@ define void @test_zero_v8i32(<8 x i32>* %dst) {
 ; VLX:       # BB#0:
 ; VLX-NEXT:    vpxor %ymm0, %ymm0, %ymm0
 ; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   store <8 x i32> zeroinitializer, <8 x i32>* %dst, align 32, !nontemporal !1
   ret void
@@ -303,6 +305,7 @@ define void @test_zero_v4f64(<4 x double>* %dst) {
 ; VLX:       # BB#0:
 ; VLX-NEXT:    vpxor %ymm0, %ymm0, %ymm0
 ; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   store <4 x double> zeroinitializer, <4 x double>* %dst, align 32, !nontemporal !1
   ret void
@@ -327,6 +330,7 @@ define void @test_zero_v4i64(<4 x i64>* %dst) {
 ; VLX:       # BB#0:
 ; VLX-NEXT:    vpxor %ymm0, %ymm0, %ymm0
 ; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   store <4 x i64> zeroinitializer, <4 x i64>* %dst, align 32, !nontemporal !1
   ret void
@@ -351,6 +355,7 @@ define void @test_zero_v16i16(<16 x i16>* %dst) {
 ; VLX:       # BB#0:
 ; VLX-NEXT:    vpxor %ymm0, %ymm0, %ymm0
 ; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   store <16 x i16> zeroinitializer, <16 x i16>* %dst, align 32, !nontemporal !1
   ret void
@@ -375,6 +380,7 @@ define void @test_zero_v32i8(<32 x i8>* %dst) {
 ; VLX:       # BB#0:
 ; VLX-NEXT:    vpxor %ymm0, %ymm0, %ymm0
 ; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   store <32 x i8> zeroinitializer, <32 x i8>* %dst, align 32, !nontemporal !1
   ret void
@@ -757,6 +763,7 @@ define void @test_arg_v8f32(<8 x float> %arg, <8 x float>* %dst) {
 ; VLX-LABEL: test_arg_v8f32:
 ; VLX:       # BB#0:
 ; VLX-NEXT:    vmovntps %ymm0, (%rdi)
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   store <8 x float> %arg, <8 x float>* %dst, align 32, !nontemporal !1
   ret void
@@ -777,7 +784,8 @@ define void @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %dst) {
 ;
 ; VLX-LABEL: test_arg_v8i32:
 ; VLX:       # BB#0:
-; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
+; VLX-NEXT:    vmovntps %ymm0, (%rdi)
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   store <8 x i32> %arg, <8 x i32>* %dst, align 32, !nontemporal !1
   ret void
@@ -798,7 +806,8 @@ define void @test_arg_v4f64(<4 x double> %arg, <4 x double>* %dst) {
 ;
 ; VLX-LABEL: test_arg_v4f64:
 ; VLX:       # BB#0:
-; VLX-NEXT:    vmovntpd %ymm0, (%rdi)
+; VLX-NEXT:    vmovntps %ymm0, (%rdi)
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   store <4 x double> %arg, <4 x double>* %dst, align 32, !nontemporal !1
   ret void
@@ -819,7 +828,8 @@ define void @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %dst) {
 ;
 ; VLX-LABEL: test_arg_v4i64:
 ; VLX:       # BB#0:
-; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
+; VLX-NEXT:    vmovntps %ymm0, (%rdi)
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   store <4 x i64> %arg, <4 x i64>* %dst, align 32, !nontemporal !1
   ret void
@@ -840,7 +850,8 @@ define void @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %dst) {
 ;
 ; VLX-LABEL: test_arg_v16i16:
 ; VLX:       # BB#0:
-; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
+; VLX-NEXT:    vmovntps %ymm0, (%rdi)
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   store <16 x i16> %arg, <16 x i16>* %dst, align 32, !nontemporal !1
   ret void
@@ -861,7 +872,8 @@ define void @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %dst) {
 ;
 ; VLX-LABEL: test_arg_v32i8:
 ; VLX:       # BB#0:
-; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
+; VLX-NEXT:    vmovntps %ymm0, (%rdi)
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   store <32 x i8> %arg, <32 x i8>* %dst, align 32, !nontemporal !1
   ret void
@@ -1031,6 +1043,7 @@ define void @test_op_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %dst) {
 ; VLX:       # BB#0:
 ; VLX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; VLX-NEXT:    vmovntps %ymm0, (%rdi)
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   %r = fadd <8 x float> %a, %b
   store <8 x float> %r, <8 x float>* %dst, align 32, !nontemporal !1
@@ -1068,6 +1081,7 @@ define void @test_op_v8i32(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %dst) {
 ; VLX:       # BB#0:
 ; VLX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   %r = add <8 x i32> %a, %b
   store <8 x i32> %r, <8 x i32>* %dst, align 32, !nontemporal !1
@@ -1094,6 +1108,7 @@ define void @test_op_v4f64(<4 x double> %a, <4 x double> %b, <4 x double>* %dst)
 ; VLX:       # BB#0:
 ; VLX-NEXT:    vaddpd %ymm1, %ymm0, %ymm0
 ; VLX-NEXT:    vmovntpd %ymm0, (%rdi)
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   %r = fadd <4 x double> %a, %b
   store <4 x double> %r, <4 x double>* %dst, align 32, !nontemporal !1
@@ -1131,6 +1146,7 @@ define void @test_op_v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64>* %dst) {
 ; VLX:       # BB#0:
 ; VLX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   %r = add <4 x i64> %a, %b
   store <4 x i64> %r, <4 x i64>* %dst, align 32, !nontemporal !1
@@ -1168,6 +1184,7 @@ define void @test_op_v16i16(<16 x i16> %a, <16 x i16> %b, <16 x i16>* %dst) {
 ; VLX:       # BB#0:
 ; VLX-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   %r = add <16 x i16> %a, %b
   store <16 x i16> %r, <16 x i16>* %dst, align 32, !nontemporal !1
@@ -1205,6 +1222,7 @@ define void @test_op_v32i8(<32 x i8> %a, <32 x i8> %b, <32 x i8>* %dst) {
 ; VLX:       # BB#0:
 ; VLX-NEXT:    vpaddb %ymm1, %ymm0, %ymm0
 ; VLX-NEXT:    vmovntdq %ymm0, (%rdi)
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   %r = add <32 x i8> %a, %b
   store <32 x i8> %r, <32 x i8>* %dst, align 32, !nontemporal !1
@@ -1235,6 +1253,7 @@ define void @test_unaligned_v8f32(<8 x float> %a, <8 x float> %b, <8 x float>* %
 ; VLX:       # BB#0:
 ; VLX-NEXT:    vaddps %ymm1, %ymm0, %ymm0
 ; VLX-NEXT:    vmovups %ymm0, (%rdi)
+; VLX-NEXT:    vzeroupper
 ; VLX-NEXT:    retq
   %r = fadd <8 x float> %a, %b
   store <8 x float> %r, <8 x float>* %dst, align 16, !nontemporal !1
diff --git a/test/CodeGen/X86/nontemporal-loads.ll b/test/CodeGen/X86/nontemporal-loads.ll
index 53f4e8d04374caa50b0da2ec658119b845d0c2d2..eaab26ef95474f01c874bdb0971369270f504859 100644
--- a/test/CodeGen/X86/nontemporal-loads.ll
+++ b/test/CodeGen/X86/nontemporal-loads.ll
@@ -752,7 +752,7 @@ define <8 x i32> @test_arg_v8i32(<8 x i32> %arg, <8 x i32>* %src) {
 ;
 ; AVX1-LABEL: test_arg_v8i32:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovaps (%rdi), %ymm1
+; AVX1-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
@@ -804,7 +804,7 @@ define <4 x i64> @test_arg_v4i64(<4 x i64> %arg, <4 x i64>* %src) {
 ;
 ; AVX1-LABEL: test_arg_v4i64:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovaps (%rdi), %ymm1
+; AVX1-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 ; AVX1-NEXT:    vpaddq %xmm3, %xmm2, %xmm2
@@ -835,7 +835,7 @@ define <16 x i16> @test_arg_v16i16(<16 x i16> %arg, <16 x i16>* %src) {
 ;
 ; AVX1-LABEL: test_arg_v16i16:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovaps (%rdi), %ymm1
+; AVX1-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 ; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
@@ -866,7 +866,7 @@ define <32 x i8> @test_arg_v32i8(<32 x i8> %arg, <32 x i8>* %src) {
 ;
 ; AVX1-LABEL: test_arg_v32i8:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovaps (%rdi), %ymm1
+; AVX1-NEXT:    vmovdqa (%rdi), %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
 ; AVX1-NEXT:    vpaddb %xmm3, %xmm2, %xmm2
@@ -925,8 +925,8 @@ define <16 x i32> @test_arg_v16i32(<16 x i32> %arg, <16 x i32>* %src) {
 ;
 ; AVX1-LABEL: test_arg_v16i32:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovaps (%rdi), %ymm2
-; AVX1-NEXT:    vmovaps 32(%rdi), %ymm3
+; AVX1-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
 ; AVX1-NEXT:    vpaddd %xmm5, %xmm4, %xmm4
@@ -989,8 +989,8 @@ define <8 x i64> @test_arg_v8i64(<8 x i64> %arg, <8 x i64>* %src) {
 ;
 ; AVX1-LABEL: test_arg_v8i64:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovaps (%rdi), %ymm2
-; AVX1-NEXT:    vmovaps 32(%rdi), %ymm3
+; AVX1-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
 ; AVX1-NEXT:    vpaddq %xmm5, %xmm4, %xmm4
@@ -1029,8 +1029,8 @@ define <32 x i16> @test_arg_v32i16(<32 x i16> %arg, <32 x i16>* %src) {
 ;
 ; AVX1-LABEL: test_arg_v32i16:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovaps (%rdi), %ymm2
-; AVX1-NEXT:    vmovaps 32(%rdi), %ymm3
+; AVX1-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
 ; AVX1-NEXT:    vpaddw %xmm5, %xmm4, %xmm4
@@ -1081,8 +1081,8 @@ define <64 x i8> @test_arg_v64i8(<64 x i8> %arg, <64 x i8>* %src) {
 ;
 ; AVX1-LABEL: test_arg_v64i8:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovaps (%rdi), %ymm2
-; AVX1-NEXT:    vmovaps 32(%rdi), %ymm3
+; AVX1-NEXT:    vmovdqa (%rdi), %ymm2
+; AVX1-NEXT:    vmovdqa 32(%rdi), %ymm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm5
 ; AVX1-NEXT:    vpaddb %xmm5, %xmm4, %xmm4
diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll
index 952db42842ef3b08eed889dfeac2442782eac332..d26cf02dd9424a5e415a2d6f9933542c110701f3 100644
--- a/test/CodeGen/X86/oddshuffles.ll
+++ b/test/CodeGen/X86/oddshuffles.ll
@@ -244,40 +244,34 @@ define void @v7i8(<4 x i8> %a, <4 x i8> %b, <7 x i8>* %p) nounwind {
 ; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255]
 ; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,1,3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    movb %al, 6(%rdi)
-; SSE2-NEXT:    movd %xmm1, (%rdi)
-; SSE2-NEXT:    pextrw $4, %xmm0, %eax
+; SSE2-NEXT:    movd %xmm0, (%rdi)
+; SSE2-NEXT:    pextrw $2, %xmm0, %eax
 ; SSE2-NEXT:    movw %ax, 4(%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSE42-LABEL: v7i8:
 ; SSE42:       # BB#0:
 ; SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
+; SSE42-NEXT:    pextrb $0, %xmm1, 6(%rdi)
 ; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
 ; SSE42-NEXT:    pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
-; SSE42-NEXT:    pextrb $12, %xmm1, 6(%rdi)
 ; SSE42-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; SSE42-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
-; SSE42-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE42-NEXT:    pextrw $4, %xmm1, 4(%rdi)
-; SSE42-NEXT:    movd %xmm0, (%rdi)
+; SSE42-NEXT:    pextrw $2, %xmm1, 4(%rdi)
+; SSE42-NEXT:    movd %xmm1, (%rdi)
 ; SSE42-NEXT:    retq
 ;
 ; AVX-LABEL: v7i8:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,3,1,3]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
-; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3],xmm0[4],xmm1[5,6,7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
-; AVX-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX-NEXT:    vpextrb $12, %xmm0, 6(%rdi)
-; AVX-NEXT:    vpextrw $4, %xmm1, 4(%rdi)
-; AVX-NEXT:    vmovd %xmm2, (%rdi)
+; AVX-NEXT:    vpshufb {{.*#+}} xmm2 = xmm1[8,9,8,9,4,5,8,9,0,1,12,13,0,1,14,15]
+; AVX-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5,6,7]
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
+; AVX-NEXT:    vpextrb $0, %xmm1, 6(%rdi)
+; AVX-NEXT:    vpextrw $2, %xmm0, 4(%rdi)
+; AVX-NEXT:    vmovd %xmm0, (%rdi)
 ; AVX-NEXT:    retq
   %r = shufflevector <4 x i8> %a, <4 x i8> %b, <7 x i32> <i32 0, i32 6, i32 3, i32 6, i32 1, i32 7, i32 4>
   store <7 x i8> %r, <7 x i8>* %p
@@ -923,7 +917,7 @@ define void @interleave_24i16_out(<24 x i16>* %p, <8 x i16>* %q1, <8 x i16>* %q2
 ; AVX1-LABEL: interleave_24i16_out:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovdqu 32(%rdi), %xmm0
-; AVX1-NEXT:    vmovups (%rdi), %ymm1
+; AVX1-NEXT:    vmovdqu (%rdi), %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm3 = xmm1[0],xmm2[1],xmm1[2,3],xmm2[4],xmm1[5,6],xmm2[7]
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm3 = xmm3[0,1,6,7,12,13,2,3,8,9,14,15,12,13,14,15]
@@ -1445,8 +1439,8 @@ define <2 x double> @wrongorder(<4 x double> %A, <8 x double>* %P) #0 {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm1
-; AVX1-NEXT:    vmovapd %ymm1, 32(%rdi)
-; AVX1-NEXT:    vmovapd %ymm1, (%rdi)
+; AVX1-NEXT:    vmovaps %ymm1, 32(%rdi)
+; AVX1-NEXT:    vmovaps %ymm1, (%rdi)
 ; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
diff --git a/test/CodeGen/X86/overflow.ll b/test/CodeGen/X86/overflow.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ff25b5de493334ad1dfce31d2b237a3405e40e8d
--- /dev/null
+++ b/test/CodeGen/X86/overflow.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64
+
+define i128 @mulhioverflow(i64 %a, i64 %b, i64 %c) nounwind {
+; X32-LABEL: mulhioverflow:
+; X32:       # BB#0:
+; X32-NEXT:    pushl %ebp
+; X32-NEXT:    movl %esp, %ebp
+; X32-NEXT:    pushl %edi
+; X32-NEXT:    pushl %esi
+; X32-NEXT:    andl $-8, %esp
+; X32-NEXT:    subl $16, %esp
+; X32-NEXT:    movl 8(%ebp), %esi
+; X32-NEXT:    movl 28(%ebp), %edi
+; X32-NEXT:    movl %esp, %eax
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl 24(%ebp)
+; X32-NEXT:    pushl 20(%ebp)
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl $0
+; X32-NEXT:    pushl 16(%ebp)
+; X32-NEXT:    pushl 12(%ebp)
+; X32-NEXT:    pushl %eax
+; X32-NEXT:    calll __multi3
+; X32-NEXT:    addl $32, %esp
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl $1, %edi
+; X32-NEXT:    xorl %ecx, %ecx
+; X32-NEXT:    addl {{[0-9]+}}(%esp), %edi
+; X32-NEXT:    adcl $0, %eax
+; X32-NEXT:    adcl $0, %ecx
+; X32-NEXT:    sbbl %edx, %edx
+; X32-NEXT:    andl $1, %edx
+; X32-NEXT:    movl %edi, (%esi)
+; X32-NEXT:    movl %eax, 4(%esi)
+; X32-NEXT:    movl %ecx, 8(%esi)
+; X32-NEXT:    movl %edx, 12(%esi)
+; X32-NEXT:    movl %esi, %eax
+; X32-NEXT:    leal -8(%ebp), %esp
+; X32-NEXT:    popl %esi
+; X32-NEXT:    popl %edi
+; X32-NEXT:    popl %ebp
+; X32-NEXT:    retl $4
+;
+; X64-LABEL: mulhioverflow:
+; X64:       # BB#0:
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    mulq %rsi
+; X64-NEXT:    andl $1, %ecx
+; X64-NEXT:    leaq (%rcx,%rdx), %rax
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    retq
+  %1 = zext i64 %a to i128
+  %2 = zext i64 %b to i128
+  %3 = mul i128 %1, %2
+  %4 = lshr i128 %3, 64
+  %5 = and i64 %c, 1
+  %6 = zext i64 %5 to i128
+  %7 = add i128 %4, %6
+  ret i128 %7
+}
diff --git a/test/CodeGen/X86/peep-setb.ll b/test/CodeGen/X86/peep-setb.ll
index adae8acd0432b2fe7c4f8df5621751d1c9adf2fd..01e445a86221313e8c2a4933e2f93381494857a0 100644
--- a/test/CodeGen/X86/peep-setb.ll
+++ b/test/CodeGen/X86/peep-setb.ll
@@ -1,82 +1,123 @@
-; RUN: llc -march=x86-64 < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+
+; These tests use cmp+adc/sbb in place of test+set+add/sub. Should this transform
+; be enabled by micro-architecture rather than as part of generic lowering/isel?
 
 define i8 @test1(i8 %a, i8 %b) nounwind {
+; CHECK-LABEL: test1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpb %sil, %dil
+; CHECK-NEXT:    adcb $0, %sil
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
   %cmp = icmp ult i8 %a, %b
   %cond = zext i1 %cmp to i8
   %add = add i8 %cond, %b
   ret i8 %add
-; CHECK-LABEL: test1:
-; CHECK: adcb $0
 }
 
 define i32 @test2(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: test2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpl %esi, %edi
+; CHECK-NEXT:    adcl $0, %esi
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
   %cmp = icmp ult i32 %a, %b
   %cond = zext i1 %cmp to i32
   %add = add i32 %cond, %b
   ret i32 %add
-; CHECK-LABEL: test2:
-; CHECK: adcl $0
 }
 
 define i64 @test3(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: test3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    adcq $0, %rsi
+; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    retq
   %cmp = icmp ult i64 %a, %b
   %conv = zext i1 %cmp to i64
   %add = add i64 %conv, %b
   ret i64 %add
-; CHECK-LABEL: test3:
-; CHECK: adcq $0
 }
 
 define i8 @test4(i8 %a, i8 %b) nounwind {
+; CHECK-LABEL: test4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpb %sil, %dil
+; CHECK-NEXT:    sbbb $0, %sil
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
   %cmp = icmp ult i8 %a, %b
   %cond = zext i1 %cmp to i8
   %sub = sub i8 %b, %cond
   ret i8 %sub
-; CHECK-LABEL: test4:
-; CHECK: sbbb $0
 }
 
 define i32 @test5(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: test5:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpl %esi, %edi
+; CHECK-NEXT:    sbbl $0, %esi
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
   %cmp = icmp ult i32 %a, %b
   %cond = zext i1 %cmp to i32
   %sub = sub i32 %b, %cond
   ret i32 %sub
-; CHECK-LABEL: test5:
-; CHECK: sbbl $0
 }
 
 define i64 @test6(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: test6:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    sbbq $0, %rsi
+; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    retq
   %cmp = icmp ult i64 %a, %b
   %conv = zext i1 %cmp to i64
   %sub = sub i64 %b, %conv
   ret i64 %sub
-; CHECK-LABEL: test6:
-; CHECK: sbbq $0
 }
 
 define i8 @test7(i8 %a, i8 %b) nounwind {
+; CHECK-LABEL: test7:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpb %sil, %dil
+; CHECK-NEXT:    adcb $0, %sil
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
   %cmp = icmp ult i8 %a, %b
   %cond = sext i1 %cmp to i8
   %sub = sub i8 %b, %cond
   ret i8 %sub
-; CHECK-LABEL: test7:
-; CHECK: adcb $0
 }
 
 define i32 @test8(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: test8:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpl %esi, %edi
+; CHECK-NEXT:    adcl $0, %esi
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
   %cmp = icmp ult i32 %a, %b
   %cond = sext i1 %cmp to i32
   %sub = sub i32 %b, %cond
   ret i32 %sub
-; CHECK-LABEL: test8:
-; CHECK: adcl $0
 }
 
 define i64 @test9(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: test9:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpq %rsi, %rdi
+; CHECK-NEXT:    adcq $0, %rsi
+; CHECK-NEXT:    movq %rsi, %rax
+; CHECK-NEXT:    retq
   %cmp = icmp ult i64 %a, %b
   %conv = sext i1 %cmp to i64
   %sub = sub i64 %b, %conv
   ret i64 %sub
-; CHECK-LABEL: test9:
-; CHECK: adcq $0
 }
+
diff --git a/test/CodeGen/X86/peep-test-4.ll b/test/CodeGen/X86/peep-test-4.ll
index 1ae621fb1f5886beb0d1a7553aefb7f13e4bb921..832262aba7e45d8cd04b8e3123876c86aea07a89 100644
--- a/test/CodeGen/X86/peep-test-4.ll
+++ b/test/CodeGen/X86/peep-test-4.ll
@@ -1,14 +1,18 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+bmi,+bmi2,+popcnt,+lzcnt | FileCheck %s
 declare void @foo(i32)
 declare void @foo32(i32)
 declare void @foo64(i64)
 
-; CHECK-LABEL: neg:
-; CHECK: negl %edi
-; CHECK-NEXT: je
-; CHECK: jmp foo
-; CHECK: ret
 define void @neg(i32 %x) nounwind {
+; CHECK-LABEL: neg:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    negl %edi
+; CHECK-NEXT:    je .LBB0_1
+; CHECK-NEXT:  # BB#2: # %bb
+; CHECK-NEXT:    jmp foo # TAILCALL
+; CHECK-NEXT:  .LBB0_1: # %return
+; CHECK-NEXT:    retq
   %sub = sub i32 0, %x
   %cmp = icmp eq i32 %sub, 0
   br i1 %cmp, label %return, label %bb
@@ -21,12 +25,15 @@ return:
   ret void
 }
 
-; CHECK-LABEL: sar:
-; CHECK: sarl %edi
-; CHECK-NEXT: je
-; CHECK: jmp foo
-; CHECK: ret
 define void @sar(i32 %x) nounwind {
+; CHECK-LABEL: sar:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    sarl %edi
+; CHECK-NEXT:    je .LBB1_1
+; CHECK-NEXT:  # BB#2: # %bb
+; CHECK-NEXT:    jmp foo # TAILCALL
+; CHECK-NEXT:  .LBB1_1: # %return
+; CHECK-NEXT:    retq
   %ashr = ashr i32 %x, 1
   %cmp = icmp eq i32 %ashr, 0
   br i1 %cmp, label %return, label %bb
@@ -39,12 +46,15 @@ return:
   ret void
 }
 
-; CHECK-LABEL: shr:
-; CHECK: shrl %edi
-; CHECK-NEXT: je
-; CHECK: jmp foo
-; CHECK: ret
 define void @shr(i32 %x) nounwind {
+; CHECK-LABEL: shr:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shrl %edi
+; CHECK-NEXT:    je .LBB2_1
+; CHECK-NEXT:  # BB#2: # %bb
+; CHECK-NEXT:    jmp foo # TAILCALL
+; CHECK-NEXT:  .LBB2_1: # %return
+; CHECK-NEXT:    retq
   %ashr = lshr i32 %x, 1
   %cmp = icmp eq i32 %ashr, 0
   br i1 %cmp, label %return, label %bb
@@ -57,12 +67,15 @@ return:
   ret void
 }
 
-; CHECK-LABEL: shri:
-; CHECK: shrl $3, %edi
-; CHECK-NEXT: je
-; CHECK: jmp foo
-; CHECK: ret
 define void @shri(i32 %x) nounwind {
+; CHECK-LABEL: shri:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shrl $3, %edi
+; CHECK-NEXT:    je .LBB3_1
+; CHECK-NEXT:  # BB#2: # %bb
+; CHECK-NEXT:    jmp foo # TAILCALL
+; CHECK-NEXT:  .LBB3_1: # %return
+; CHECK-NEXT:    retq
   %ashr = lshr i32 %x, 3
   %cmp = icmp eq i32 %ashr, 0
   br i1 %cmp, label %return, label %bb
@@ -75,12 +88,15 @@ return:
   ret void
 }
 
-; CHECK-LABEL: shl:
-; CHECK: addl %edi, %edi
-; CHECK-NEXT: je
-; CHECK: jmp foo
-; CHECK: ret
 define void @shl(i32 %x) nounwind {
+; CHECK-LABEL: shl:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    addl %edi, %edi
+; CHECK-NEXT:    je .LBB4_1
+; CHECK-NEXT:  # BB#2: # %bb
+; CHECK-NEXT:    jmp foo # TAILCALL
+; CHECK-NEXT:  .LBB4_1: # %return
+; CHECK-NEXT:    retq
   %shl = shl i32 %x, 1
   %cmp = icmp eq i32 %shl, 0
   br i1 %cmp, label %return, label %bb
@@ -93,12 +109,15 @@ return:
   ret void
 }
 
-; CHECK-LABEL: shli:
-; CHECK: shll $4, %edi
-; CHECK-NEXT: je
-; CHECK: jmp foo
-; CHECK: ret
 define void @shli(i32 %x) nounwind {
+; CHECK-LABEL: shli:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    shll $4, %edi
+; CHECK-NEXT:    je .LBB5_1
+; CHECK-NEXT:  # BB#2: # %bb
+; CHECK-NEXT:    jmp foo # TAILCALL
+; CHECK-NEXT:  .LBB5_1: # %return
+; CHECK-NEXT:    retq
   %shl = shl i32 %x, 4
   %cmp = icmp eq i32 %shl, 0
   br i1 %cmp, label %return, label %bb
@@ -111,35 +130,40 @@ return:
   ret void
 }
 
-; CHECK-LABEL: adc:
-; CHECK: movabsq $-9223372036854775808, %rax
-; CHECK-NEXT: addq  %rdi, %rax
-; CHECK-NEXT: adcq  $0, %rsi
-; CHECK-NEXT: sete  %al
-; CHECK: ret
 define zeroext i1 @adc(i128 %x) nounwind {
+; CHECK-LABEL: adc:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; CHECK-NEXT:    addq %rdi, %rax
+; CHECK-NEXT:    adcq $0, %rsi
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
   %add = add i128 %x, 9223372036854775808
   %cmp = icmp ult i128 %add, 18446744073709551616
   ret i1 %cmp
 }
 
-; CHECK-LABEL: sbb:
-; CHECK: cmpq  %rdx, %rdi
-; CHECK-NEXT: sbbq  %rcx, %rsi
-; CHECK-NEXT: setns %al
-; CHECK: ret
 define zeroext i1 @sbb(i128 %x, i128 %y) nounwind {
+; CHECK-LABEL: sbb:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    cmpq %rdx, %rdi
+; CHECK-NEXT:    sbbq %rcx, %rsi
+; CHECK-NEXT:    setns %al
+; CHECK-NEXT:    retq
   %sub = sub i128 %x, %y
   %cmp = icmp sge i128 %sub, 0
   ret i1 %cmp
 }
 
-; CHECK-LABEL: andn:
-; CHECK: andnl   %esi, %edi, %edi
-; CHECK-NEXT: je
-; CHECK: jmp foo
-; CHECK: ret
 define void @andn(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: andn:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andnl %esi, %edi, %edi
+; CHECK-NEXT:    je .LBB8_1
+; CHECK-NEXT:  # BB#2: # %bb
+; CHECK-NEXT:    jmp foo # TAILCALL
+; CHECK-NEXT:  .LBB8_1: # %return
+; CHECK-NEXT:    retq
   %not = xor i32 %x, -1
   %andn = and i32 %y, %not
   %cmp = icmp eq i32 %andn, 0
@@ -153,13 +177,16 @@ return:
   ret void
 }
 
-; CHECK-LABEL: bextr:
-; CHECK: bextrl   %esi, %edi, %edi
-; CHECK-NEXT: je
-; CHECK: jmp foo
-; CHECK: ret
 declare i32 @llvm.x86.bmi.bextr.32(i32, i32) nounwind readnone
 define void @bextr(i32 %x, i32 %y) nounwind {
+; CHECK-LABEL: bextr:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    bextrl %esi, %edi, %edi
+; CHECK-NEXT:    je .LBB9_1
+; CHECK-NEXT:  # BB#2: # %bb
+; CHECK-NEXT:    jmp foo # TAILCALL
+; CHECK-NEXT:  .LBB9_1: # %return
+; CHECK-NEXT:    retq
   %bextr = tail call i32 @llvm.x86.bmi.bextr.32(i32 %x, i32 %y)
   %cmp = icmp eq i32 %bextr, 0
   br i1 %cmp, label %return, label %bb
@@ -172,43 +199,54 @@ return:
   ret void
 }
 
-; CHECK-LABEL: popcnt:
-; CHECK: popcntl
-; CHECK-NEXT: je
-; CHECK: jmp foo
-; CHECK: ret
 declare i32 @llvm.ctpop.i32(i32) nounwind readnone
 define void @popcnt(i32 %x) nounwind {
+; CHECK-LABEL: popcnt:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    popcntl %edi, %edi
+; CHECK-NEXT:    je .LBB10_1
+; CHECK-NEXT:  # BB#2: # %bb
+; CHECK-NEXT:    jmp foo # TAILCALL
+; CHECK-NEXT:  .LBB10_1: # %return
+; CHECK-NEXT:    retq
   %popcnt = tail call i32 @llvm.ctpop.i32(i32 %x)
   %cmp = icmp eq i32 %popcnt, 0
   br i1 %cmp, label %return, label %bb
-;
 bb:
   tail call void @foo(i32 %popcnt)
   br label %return
-;
 return:
   ret void
 }
 
-; CHECK-LABEL: testCTZ
-; CHECK: tzcntq
-; CHECK-NOT: test
-; CHECK: cmovaeq
 declare i64 @llvm.cttz.i64(i64, i1)
 define i64 @testCTZ(i64 %v) nounwind {
+; CHECK-LABEL: testCTZ:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    tzcntq %rdi, %rcx
+; CHECK-NEXT:    movl $255, %eax
+; CHECK-NEXT:    cmovaeq %rcx, %rax
+; CHECK-NEXT:    retq
   %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
   %tobool = icmp eq i64 %v, 0
   %cond = select i1 %tobool, i64 255, i64 %cnt
   ret i64 %cond
 }
 
-; CHECK-LABEL: testCTZ2
-; CHECK: tzcntl
-; CHECK-NEXT: jb
-; CHECK: jmp foo
 declare i32 @llvm.cttz.i32(i32, i1)
 define void @testCTZ2(i32 %v) nounwind {
+; CHECK-LABEL: testCTZ2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    tzcntl %edi, %ebx
+; CHECK-NEXT:    jb .LBB12_2
+; CHECK-NEXT:  # BB#1: # %bb
+; CHECK-NEXT:    movl %ebx, %edi
+; CHECK-NEXT:    callq foo
+; CHECK-NEXT:  .LBB12_2: # %return
+; CHECK-NEXT:    movl %ebx, %edi
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    jmp foo32 # TAILCALL
   %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
   %cmp = icmp eq i32 %v, 0
   br i1 %cmp, label %return, label %bb
@@ -222,11 +260,19 @@ return:
   ret void
 }
 
-; CHECK-LABEL: testCTZ3
-; CHECK: tzcntl
-; CHECK-NEXT: jae
-; CHECK: jmp foo
 define void @testCTZ3(i32 %v) nounwind {
+; CHECK-LABEL: testCTZ3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    tzcntl %edi, %ebx
+; CHECK-NEXT:    jae .LBB13_2
+; CHECK-NEXT:  # BB#1: # %bb
+; CHECK-NEXT:    movl %ebx, %edi
+; CHECK-NEXT:    callq foo
+; CHECK-NEXT:  .LBB13_2: # %return
+; CHECK-NEXT:    movl %ebx, %edi
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    jmp foo32 # TAILCALL
   %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
   %cmp = icmp ne i32 %v, 0
   br i1 %cmp, label %return, label %bb
@@ -240,24 +286,28 @@ return:
   ret void
 }
 
-; CHECK-LABEL: testCLZ
-; CHECK: lzcntq
-; CHECK-NOT: test
-; CHECK: cmovaeq
 declare i64 @llvm.ctlz.i64(i64, i1)
 define i64 @testCLZ(i64 %v) nounwind {
+; CHECK-LABEL: testCLZ:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    lzcntq %rdi, %rcx
+; CHECK-NEXT:    movl $255, %eax
+; CHECK-NEXT:    cmovaeq %rcx, %rax
+; CHECK-NEXT:    retq
   %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
   %tobool = icmp ne i64 %v, 0
   %cond = select i1 %tobool, i64 %cnt, i64 255
   ret i64 %cond
 }
 
-; CHECK-LABEL: testPOPCNT
-; CHECK: popcntq
-; CHECK-NOT: test
-; CHECK: cmovneq
 declare i64 @llvm.ctpop.i64(i64)
 define i64 @testPOPCNT(i64 %v) nounwind {
+; CHECK-LABEL: testPOPCNT:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    popcntq %rdi, %rcx
+; CHECK-NEXT:    movl $255, %eax
+; CHECK-NEXT:    cmovneq %rcx, %rax
+; CHECK-NEXT:    retq
   %cnt = tail call i64 @llvm.ctpop.i64(i64 %v)
   %tobool = icmp ne i64 %v, 0
   %cond = select i1 %tobool, i64 %cnt, i64 255
diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll
index ac83653efbdc914da1969a2ce43137f62253007f..88cb7a6d58258b3f37daa15905ad5a08b9ead739 100644
--- a/test/CodeGen/X86/pmul.ll
+++ b/test/CodeGen/X86/pmul.ll
@@ -55,6 +55,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind  {
 ; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mul_v16i8c:
@@ -63,6 +64,7 @@ define <16 x i8> @mul_v16i8c(<16 x i8> %i) nounwind  {
 ; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 entry:
   %A = mul <16 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 >
@@ -195,6 +197,7 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind  {
 ; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: mul_v16i8:
@@ -204,6 +207,7 @@ define <16 x i8> @mul_v16i8(<16 x i8> %i, <16 x i8> %j) nounwind  {
 ; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 entry:
   %A = mul <16 x i8> %i, %j
@@ -1157,35 +1161,26 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) {
 ;
 ; SSE41-LABEL: mul_v4i64_zero_upper:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
 ; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE41-NEXT:    pmuludq %xmm0, %xmm1
-; SSE41-NEXT:    pmuludq %xmm4, %xmm2
-; SSE41-NEXT:    shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3]
-; SSE41-NEXT:    movaps %xmm2, %xmm0
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT:    pmuludq %xmm3, %xmm0
+; SSE41-NEXT:    pmuludq %xmm2, %xmm4
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
 ; SSE41-NEXT:    retq
 ;
-; AVX2-LABEL: mul_v4i64_zero_upper:
-; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: mul_v4i64_zero_upper:
-; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX512-NEXT:    retq
+; AVX-LABEL: mul_v4i64_zero_upper:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
 entry:
   %val1a = zext <4 x i32> %val1 to <4 x i64>
   %val2a = zext <4 x i32> %val2 to <4 x i64>
@@ -1219,48 +1214,36 @@ define <4 x i32> @mul_v4i64_zero_upper_left(<4 x i32> %val1, <4 x i64> %val2) {
 ;
 ; SSE41-LABEL: mul_v4i64_zero_upper_left:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    pxor %xmm3, %xmm3
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
 ; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
-; SSE41-NEXT:    movdqa %xmm0, %xmm3
-; SSE41-NEXT:    pmuludq %xmm2, %xmm3
-; SSE41-NEXT:    psrlq $32, %xmm2
-; SSE41-NEXT:    pmuludq %xmm0, %xmm2
-; SSE41-NEXT:    psllq $32, %xmm2
-; SSE41-NEXT:    paddq %xmm3, %xmm2
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
 ; SSE41-NEXT:    pmuludq %xmm1, %xmm0
 ; SSE41-NEXT:    psrlq $32, %xmm1
 ; SSE41-NEXT:    pmuludq %xmm4, %xmm1
 ; SSE41-NEXT:    psllq $32, %xmm1
 ; SSE41-NEXT:    paddq %xmm1, %xmm0
+; SSE41-NEXT:    movdqa %xmm3, %xmm1
+; SSE41-NEXT:    pmuludq %xmm2, %xmm1
+; SSE41-NEXT:    psrlq $32, %xmm2
+; SSE41-NEXT:    pmuludq %xmm3, %xmm2
+; SSE41-NEXT:    psllq $32, %xmm2
+; SSE41-NEXT:    paddq %xmm1, %xmm2
 ; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
 ; SSE41-NEXT:    retq
 ;
-; AVX2-LABEL: mul_v4i64_zero_upper_left:
-; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm1
-; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: mul_v4i64_zero_upper_left:
-; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
-; AVX512-NEXT:    vpsrlq $32, %ymm1, %ymm1
-; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpsllq $32, %ymm0, %ymm0
-; AVX512-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX512-NEXT:    retq
+; AVX-LABEL: mul_v4i64_zero_upper_left:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm2
+; AVX-NEXT:    vpsrlq $32, %ymm1, %ymm1
+; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vpsllq $32, %ymm0, %ymm0
+; AVX-NEXT:    vpaddq %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
 entry:
   %val1a = zext <4 x i32> %val1 to <4 x i64>
   %res64 = mul <4 x i64> %val1a, %val2
@@ -1288,39 +1271,28 @@ define <4 x i32> @mul_v4i64_zero_lower(<4 x i32> %val1, <4 x i64> %val2) {
 ;
 ; SSE41-LABEL: mul_v4i64_zero_lower:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    pxor %xmm4, %xmm4
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    psrlq $32, %xmm1
+; SSE41-NEXT:    pmuludq %xmm1, %xmm0
+; SSE41-NEXT:    psllq $32, %xmm0
 ; SSE41-NEXT:    psrlq $32, %xmm2
-; SSE41-NEXT:    pmuludq %xmm0, %xmm2
+; SSE41-NEXT:    pmuludq %xmm3, %xmm2
 ; SSE41-NEXT:    psllq $32, %xmm2
-; SSE41-NEXT:    psrlq $32, %xmm1
-; SSE41-NEXT:    pmuludq %xmm1, %xmm3
-; SSE41-NEXT:    psllq $32, %xmm3
-; SSE41-NEXT:    shufps {{.*#+}} xmm3 = xmm3[1,3],xmm2[1,3]
-; SSE41-NEXT:    movaps %xmm3, %xmm0
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
 ; SSE41-NEXT:    retq
 ;
-; AVX2-LABEL: mul_v4i64_zero_lower:
-; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT:    vpsrlq $32, %ymm1, %ymm1
-; AVX2-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: mul_v4i64_zero_lower:
-; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX512-NEXT:    vpsrlq $32, %ymm1, %ymm1
-; AVX512-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpsllq $32, %ymm0, %ymm0
-; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX512-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
-; AVX512-NEXT:    retq
+; AVX-LABEL: mul_v4i64_zero_lower:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX-NEXT:    vpsrlq $32, %ymm1, %ymm1
+; AVX-NEXT:    vpmuludq %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vpsllq $32, %ymm0, %ymm0
+; AVX-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
 entry:
   %val1a = zext <4 x i32> %val1 to <4 x i64>
   %val2a = and <4 x i64> %val2, <i64 -4294967296, i64 -4294967296, i64 -4294967296, i64 -4294967296>
@@ -1358,23 +1330,24 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) {
 ;
 ; SSE41-LABEL: mul_v8i64_zero_upper:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    pxor %xmm6, %xmm6
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm8 = xmm0[0],zero,xmm0[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm6[2],xmm0[3],xmm6[3]
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero
 ; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm2[0],zero,xmm2[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm5 = xmm3[0],zero,xmm3[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3]
-; SSE41-NEXT:    pmuludq %xmm1, %xmm3
-; SSE41-NEXT:    pmuludq %xmm0, %xmm2
-; SSE41-NEXT:    pmuludq %xmm7, %xmm5
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero
+; SSE41-NEXT:    pmuludq %xmm7, %xmm1
+; SSE41-NEXT:    pmuludq %xmm6, %xmm2
+; SSE41-NEXT:    pmuludq %xmm5, %xmm0
 ; SSE41-NEXT:    pmuludq %xmm8, %xmm4
-; SSE41-NEXT:    shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3]
-; SSE41-NEXT:    shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3]
-; SSE41-NEXT:    movaps %xmm4, %xmm0
-; SSE41-NEXT:    movaps %xmm5, %xmm1
+; SSE41-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3]
+; SSE41-NEXT:    shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX2-LABEL: mul_v8i64_zero_upper:
diff --git a/test/CodeGen/X86/pointer-vector.ll b/test/CodeGen/X86/pointer-vector.ll
index 9a0271aa7f00af7db15032bd2f8bd20795d86a32..d5297b9c70ce8532888b966b0777f4a3af2f9db2 100644
--- a/test/CodeGen/X86/pointer-vector.ll
+++ b/test/CodeGen/X86/pointer-vector.ll
@@ -133,7 +133,7 @@ define <4 x i32> @ICMP0(<4 x i8*>* %p0, <4 x i8*>* %p1) nounwind {
 ; CHECK-NEXT:    movdqa (%ecx), %xmm0
 ; CHECK-NEXT:    pcmpgtd (%eax), %xmm0
 ; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [9,8,7,6]
-; CHECK-NEXT:    blendvps {{\.LCPI.*}}, %xmm1
+; CHECK-NEXT:    blendvps %xmm0, {{\.LCPI.*}}, %xmm1
 ; CHECK-NEXT:    movaps %xmm1, %xmm0
 ; CHECK-NEXT:    retl
 entry:
@@ -152,7 +152,7 @@ define <4 x i32> @ICMP1(<4 x i8*>* %p0, <4 x i8*>* %p1) nounwind {
 ; CHECK-NEXT:    movdqa (%ecx), %xmm0
 ; CHECK-NEXT:    pcmpeqd (%eax), %xmm0
 ; CHECK-NEXT:    movaps {{.*#+}} xmm1 = [9,8,7,6]
-; CHECK-NEXT:    blendvps {{\.LCPI.*}}, %xmm1
+; CHECK-NEXT:    blendvps %xmm0, {{\.LCPI.*}}, %xmm1
 ; CHECK-NEXT:    movaps %xmm1, %xmm0
 ; CHECK-NEXT:    retl
 entry:
diff --git a/test/CodeGen/X86/pr11334.ll b/test/CodeGen/X86/pr11334.ll
index 7d3d7aaac82b2265097fb5535e73e0bee7e211fa..8a154653414a8b8bac8ac6ea36711f518b7fa217 100644
--- a/test/CodeGen/X86/pr11334.ll
+++ b/test/CodeGen/X86/pr11334.ll
@@ -85,15 +85,15 @@ entry:
 define void @test_vector_creation() nounwind {
 ; SSE-LABEL: test_vector_creation:
 ; SSE:       # BB#0:
-; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT:    pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
-; SSE-NEXT:    movdqa %xmm0, (%rax)
+; SSE-NEXT:    xorpd %xmm0, %xmm0
+; SSE-NEXT:    movhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
+; SSE-NEXT:    movapd %xmm0, (%rax)
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: test_vector_creation:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT:    vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7]
+; AVX-NEXT:    vxorpd %xmm0, %xmm0, %xmm0
+; AVX-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
 ; AVX-NEXT:    vmovaps %ymm0, (%rax)
 ; AVX-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/pr12312.ll b/test/CodeGen/X86/pr12312.ll
index 81aaf91f26887e4bd1b99eb8060899e5ab400915..6575d2a73d9c28ab90c24a3bce44ad4218b73af2 100644
--- a/test/CodeGen/X86/pr12312.ll
+++ b/test/CodeGen/X86/pr12312.ll
@@ -1,155 +1,243 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,-avx < %s | FileCheck %s --check-prefix SSE41
-; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 < %s | FileCheck %s --check-prefix AVX
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse4.1,-avx < %s | FileCheck %s --check-prefix=SSE41
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx,-avx2 < %s   | FileCheck %s --check-prefix=AVX
 
 define i32 @veccond128(<4 x i32> %input) {
+; SSE41-LABEL: veccond128:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    ptest %xmm0, %xmm0
+; SSE41-NEXT:    je .LBB0_2
+; SSE41-NEXT:  # BB#1: # %if-true-block
+; SSE41-NEXT:    xorl %eax, %eax
+; SSE41-NEXT:    retq
+; SSE41-NEXT:  .LBB0_2: # %endif-block
+; SSE41-NEXT:    movl $1, %eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: veccond128:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vptest %xmm0, %xmm0
+; AVX-NEXT:    je .LBB0_2
+; AVX-NEXT:  # BB#1: # %if-true-block
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    retq
+; AVX-NEXT:  .LBB0_2: # %endif-block
+; AVX-NEXT:    movl $1, %eax
+; AVX-NEXT:    retq
 entry:
   %0 = bitcast <4 x i32> %input to i128
   %1 = icmp ne i128 %0, 0
   br i1 %1, label %if-true-block, label %endif-block
-
-if-true-block:                                    ; preds = %entry
+if-true-block:
   ret i32 0
-endif-block:                                      ; preds = %entry,
+endif-block:
   ret i32 1
-; SSE41: veccond128
-; SSE41: ptest
-; SSE41: ret
-; AVX:   veccond128
-; AVX:   vptest %xmm{{.*}}, %xmm{{.*}}
-; AVX:   ret
 }
 
 define i32 @veccond256(<8 x i32> %input) {
+; SSE41-LABEL: veccond256:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
+; SSE41-NEXT:    je .LBB1_2
+; SSE41-NEXT:  # BB#1: # %if-true-block
+; SSE41-NEXT:    xorl %eax, %eax
+; SSE41-NEXT:    retq
+; SSE41-NEXT:  .LBB1_2: # %endif-block
+; SSE41-NEXT:    movl $1, %eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: veccond256:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vptest %ymm0, %ymm0
+; AVX-NEXT:    je .LBB1_2
+; AVX-NEXT:  # BB#1: # %if-true-block
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+; AVX-NEXT:  .LBB1_2: # %endif-block
+; AVX-NEXT:    movl $1, %eax
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
 entry:
   %0 = bitcast <8 x i32> %input to i256
   %1 = icmp ne i256 %0, 0
   br i1 %1, label %if-true-block, label %endif-block
-
-if-true-block:                                    ; preds = %entry
+if-true-block:
   ret i32 0
-endif-block:                                      ; preds = %entry,
+endif-block:
   ret i32 1
-; SSE41: veccond256
-; SSE41: por
-; SSE41: ptest
-; SSE41: ret
-; AVX:   veccond256
-; AVX:   vptest %ymm{{.*}}, %ymm{{.*}}
-; AVX:   ret
 }
 
 define i32 @veccond512(<16 x i32> %input) {
+; SSE41-LABEL: veccond512:
+; SSE41:       # BB#0: # %entry
+; SSE41-NEXT:    por %xmm3, %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm1
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    ptest %xmm1, %xmm1
+; SSE41-NEXT:    je .LBB2_2
+; SSE41-NEXT:  # BB#1: # %if-true-block
+; SSE41-NEXT:    xorl %eax, %eax
+; SSE41-NEXT:    retq
+; SSE41-NEXT:  .LBB2_2: # %endif-block
+; SSE41-NEXT:    movl $1, %eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: veccond512:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vptest %ymm0, %ymm0
+; AVX-NEXT:    je .LBB2_2
+; AVX-NEXT:  # BB#1: # %if-true-block
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+; AVX-NEXT:  .LBB2_2: # %endif-block
+; AVX-NEXT:    movl $1, %eax
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
 entry:
   %0 = bitcast <16 x i32> %input to i512
   %1 = icmp ne i512 %0, 0
   br i1 %1, label %if-true-block, label %endif-block
-
-if-true-block:                                    ; preds = %entry
+if-true-block:
   ret i32 0
-endif-block:                                      ; preds = %entry,
+endif-block:
   ret i32 1
-; SSE41: veccond512
-; SSE41: por
-; SSE41: por
-; SSE41: por
-; SSE41: ptest
-; SSE41: ret
-; AVX:   veccond512
-; AVX:   vorps
-; AVX:   vptest %ymm{{.*}}, %ymm{{.*}}
-; AVX:   ret
 }
 
 define i32 @vectest128(<4 x i32> %input) {
-entry:
-  %0 = bitcast <4 x i32> %input to i128
-  %1 = icmp ne i128 %0, 0
-  %2 = zext i1 %1 to i32
-  ret i32 %2
-; SSE41: vectest128
-; SSE41: ptest
-; SSE41: ret
-; AVX:   vectest128
-; AVX:   vptest %xmm{{.*}}, %xmm{{.*}}
-; AVX:   ret
+; SSE41-LABEL: vectest128:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    xorl %eax, %eax
+; SSE41-NEXT:    ptest %xmm0, %xmm0
+; SSE41-NEXT:    setne %al
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: vectest128:
+; AVX:       # BB#0:
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    vptest %xmm0, %xmm0
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    retq
+  %t0 = bitcast <4 x i32> %input to i128
+  %t1 = icmp ne i128 %t0, 0
+  %t2 = zext i1 %t1 to i32
+  ret i32 %t2
 }
 
 define i32 @vectest256(<8 x i32> %input) {
-entry:
-  %0 = bitcast <8 x i32> %input to i256
-  %1 = icmp ne i256 %0, 0
-  %2 = zext i1 %1 to i32
-  ret i32 %2
-; SSE41: vectest256
-; SSE41: por
-; SSE41: ptest
-; SSE41: ret
-; AVX:   vectest256
-; AVX:   vptest %ymm{{.*}}, %ymm{{.*}}
-; AVX:   ret
+; SSE41-LABEL: vectest256:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    xorl %eax, %eax
+; SSE41-NEXT:    ptest %xmm0, %xmm0
+; SSE41-NEXT:    setne %al
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: vectest256:
+; AVX:       # BB#0:
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    vptest %ymm0, %ymm0
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+  %t0 = bitcast <8 x i32> %input to i256
+  %t1 = icmp ne i256 %t0, 0
+  %t2 = zext i1 %t1 to i32
+  ret i32 %t2
 }
 
 define i32 @vectest512(<16 x i32> %input) {
-entry:
-  %0 = bitcast <16 x i32> %input to i512
-  %1 = icmp ne i512 %0, 0
-  %2 = zext i1 %1 to i32
-  ret i32 %2
-; SSE41: vectest512
-; SSE41: por
-; SSE41: por
-; SSE41: por
-; SSE41: ptest
-; SSE41: ret
-; AVX:   vectest512
-; AVX:   vorps
-; AVX:   vptest %ymm{{.*}}, %ymm{{.*}}
-; AVX:   ret
+; SSE41-LABEL: vectest512:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    por %xmm3, %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm1
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    xorl %eax, %eax
+; SSE41-NEXT:    ptest %xmm1, %xmm1
+; SSE41-NEXT:    setne %al
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: vectest512:
+; AVX:       # BB#0:
+; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    xorl %eax, %eax
+; AVX-NEXT:    vptest %ymm0, %ymm0
+; AVX-NEXT:    setne %al
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+  %t0 = bitcast <16 x i32> %input to i512
+  %t1 = icmp ne i512 %t0, 0
+  %t2 = zext i1 %t1 to i32
+  ret i32 %t2
 }
 
 define i32 @vecsel128(<4 x i32> %input, i32 %a, i32 %b) {
-entry:
-  %0 = bitcast <4 x i32> %input to i128
-  %1 = icmp ne i128 %0, 0
-  %2 = select i1 %1, i32 %a, i32 %b
-  ret i32 %2
-; SSE41: vecsel128
-; SSE41: ptest
-; SSE41: ret
-; AVX:   vecsel128
-; AVX:   vptest %xmm{{.*}}, %xmm{{.*}}
-; AVX:   ret
+; SSE41-LABEL: vecsel128:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    ptest %xmm0, %xmm0
+; SSE41-NEXT:    cmovel %esi, %edi
+; SSE41-NEXT:    movl %edi, %eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: vecsel128:
+; AVX:       # BB#0:
+; AVX-NEXT:    vptest %xmm0, %xmm0
+; AVX-NEXT:    cmovel %esi, %edi
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    retq
+  %t0 = bitcast <4 x i32> %input to i128
+  %t1 = icmp ne i128 %t0, 0
+  %t2 = select i1 %t1, i32 %a, i32 %b
+  ret i32 %t2
 }
 
 define i32 @vecsel256(<8 x i32> %input, i32 %a, i32 %b) {
-entry:
-  %0 = bitcast <8 x i32> %input to i256
-  %1 = icmp ne i256 %0, 0
-  %2 = select i1 %1, i32 %a, i32 %b
-  ret i32 %2
-; SSE41: vecsel256
-; SSE41: por
-; SSE41: ptest
-; SSE41: ret
-; AVX:   vecsel256
-; AVX:   vptest %ymm{{.*}}, %ymm{{.*}}
-; AVX:   ret
+; SSE41-LABEL: vecsel256:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    por %xmm1, %xmm0
+; SSE41-NEXT:    ptest %xmm0, %xmm0
+; SSE41-NEXT:    cmovel %esi, %edi
+; SSE41-NEXT:    movl %edi, %eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: vecsel256:
+; AVX:       # BB#0:
+; AVX-NEXT:    vptest %ymm0, %ymm0
+; AVX-NEXT:    cmovel %esi, %edi
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+  %t0 = bitcast <8 x i32> %input to i256
+  %t1 = icmp ne i256 %t0, 0
+  %t2 = select i1 %t1, i32 %a, i32 %b
+  ret i32 %t2
 }
 
 define i32 @vecsel512(<16 x i32> %input, i32 %a, i32 %b) {
-entry:
-  %0 = bitcast <16 x i32> %input to i512
-  %1 = icmp ne i512 %0, 0
-  %2 = select i1 %1, i32 %a, i32 %b
-  ret i32 %2
-; SSE41: vecsel512
-; SSE41: por
-; SSE41: por
-; SSE41: por
-; SSE41: ptest
-; SSE41: ret
-; AVX:   vecsel512
-; AVX:   vorps
-; AVX:   vptest %ymm{{.*}}, %ymm{{.*}}
-; AVX:   ret
+; SSE41-LABEL: vecsel512:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    por %xmm3, %xmm1
+; SSE41-NEXT:    por %xmm2, %xmm1
+; SSE41-NEXT:    por %xmm0, %xmm1
+; SSE41-NEXT:    ptest %xmm1, %xmm1
+; SSE41-NEXT:    cmovel %esi, %edi
+; SSE41-NEXT:    movl %edi, %eax
+; SSE41-NEXT:    retq
+;
+; AVX-LABEL: vecsel512:
+; AVX:       # BB#0:
+; AVX-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX-NEXT:    vptest %ymm0, %ymm0
+; AVX-NEXT:    cmovel %esi, %edi
+; AVX-NEXT:    movl %edi, %eax
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
+  %t0 = bitcast <16 x i32> %input to i512
+  %t1 = icmp ne i512 %t0, 0
+  %t2 = select i1 %t1, i32 %a, i32 %b
+  ret i32 %t2
 }
+
diff --git a/test/CodeGen/X86/pr14314.ll b/test/CodeGen/X86/pr14314.ll
index 0832702244e51688dd90cef017ce77bf50c0b209..10733a47699588e2143a7b1e5699abbc40fe0bec 100644
--- a/test/CodeGen/X86/pr14314.ll
+++ b/test/CodeGen/X86/pr14314.ll
@@ -1,13 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i386-pc-linux -mcpu=corei7 | FileCheck %s
 
 define i64 @atomicSub(i64* %a, i64 %b) nounwind {
+; CHECK-LABEL: atomicSub:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    pushl %ebp
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:    pushl %edi
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; CHECK-NEXT:    movl (%ebp), %eax
+; CHECK-NEXT:    movl 4(%ebp), %edx
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_1: # %atomicrmw.start
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    movl %eax, %ebx
+; CHECK-NEXT:    subl %edi, %ebx
+; CHECK-NEXT:    movl %edx, %ecx
+; CHECK-NEXT:    sbbl %esi, %ecx
+; CHECK-NEXT:    lock cmpxchg8b (%ebp)
+; CHECK-NEXT:    jne .LBB0_1
+; CHECK-NEXT:  # BB#2: # %atomicrmw.end
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    popl %edi
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    popl %ebp
+; CHECK-NEXT:    retl
 entry:
   %0 = atomicrmw sub i64* %a, i64 %b seq_cst
   ret i64 %0
-; CHECK: atomicSub
-; CHECK: movl %eax, %ebx
-; CHECK: subl {{%[a-z]+}}, %ebx
-; CHECK: movl %edx, %ecx
-; CHECK: sbbl {{%[a-z]+}}, %ecx
-; CHECK: ret
 }
diff --git a/test/CodeGen/X86/pr16031.ll b/test/CodeGen/X86/pr16031.ll
index dc16fd9671adfa2fc9e7b4d9327df498d689637c..01bc38a243a5cc8eae91189a82976f3288b0da4a 100644
--- a/test/CodeGen/X86/pr16031.ll
+++ b/test/CodeGen/X86/pr16031.ll
@@ -1,20 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=corei7-avx -enable-misched=false | FileCheck %s
 
-; CHECK-LABEL: main:
-; CHECK: pushl %esi
-; CHECK-NEXT: testb $1, 8(%esp)
-; CHECK-NEXT: movl  $-12, %eax
-; CHECK-NEXT: movl  $-1, %edx
-; CHECK-NEXT: cmovel    %edx, %eax
-; CHECK-NEXT: xorl  %ecx, %ecx
-; CHECK-NEXT: movl  %eax, %esi
-; CHECK-NEXT: addl  $-1, %esi
-; CHECK-NEXT: movl  $-1, %esi
-; CHECK-NEXT: adcl  $-1, %esi
-; CHECK-NEXT: cmovsl    %ecx, %eax
-; CHECK-NEXT: cmovsl    %ecx, %edx
-; CHECK-NEXT: popl  %esi
 define i64 @main(i1 %tobool1) nounwind {
+; CHECK-LABEL: main:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:    testb $1, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $-12, %eax
+; CHECK-NEXT:    movl $-1, %edx
+; CHECK-NEXT:    cmovel %edx, %eax
+; CHECK-NEXT:    xorl %ecx, %ecx
+; CHECK-NEXT:    movl %eax, %esi
+; CHECK-NEXT:    addl $-1, %esi
+; CHECK-NEXT:    movl $-1, %esi
+; CHECK-NEXT:    adcl $-1, %esi
+; CHECK-NEXT:    cmovsl %ecx, %eax
+; CHECK-NEXT:    cmovsl %ecx, %edx
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    retl
 entry:
   %0 = zext i1 %tobool1 to i32
   %. = xor i32 %0, 1
diff --git a/test/CodeGen/X86/pr18014.ll b/test/CodeGen/X86/pr18014.ll
index dc9d53fff17366c00d8cfd057c7baca921565158..bb3b9c23f1e3f808b6252f9c18011c8ad9b0c9bc 100644
--- a/test/CodeGen/X86/pr18014.ll
+++ b/test/CodeGen/X86/pr18014.ll
@@ -9,7 +9,7 @@ define <4 x i32> @foo(<4 x i32>* %p, <4 x i1> %cond, <4 x i32> %v1, <4 x i32> %v
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    pslld $31, %xmm0
 ; CHECK-NEXT:    psrad $31, %xmm0
-; CHECK-NEXT:    blendvps %xmm1, %xmm2
+; CHECK-NEXT:    blendvps %xmm0, %xmm1, %xmm2
 ; CHECK-NEXT:    paddd %xmm0, %xmm1
 ; CHECK-NEXT:    movaps %xmm2, (%rdi)
 ; CHECK-NEXT:    movdqa %xmm1, %xmm0
diff --git a/test/CodeGen/X86/pr18023.ll b/test/CodeGen/X86/pr18023.ll
deleted file mode 100644
index c7ea20c281bad830dd5dca2a3dccab68b57064b5..0000000000000000000000000000000000000000
--- a/test/CodeGen/X86/pr18023.ll
+++ /dev/null
@@ -1,31 +0,0 @@
-; RUN: llc < %s -mtriple x86_64-apple-macosx10.9.0 | FileCheck %s
-; PR18023
-
-; CHECK: movabsq $4294967296, %rcx
-; CHECK: movq  %rcx, (%rax)
-; CHECK: movl  $1, 4(%rax)
-; CHECK: movl  $0, 4(%rax)
-; CHECK: movq  $1, 4(%rax)
-
-@c = common global i32 0, align 4
-@a = common global [3 x i32] zeroinitializer, align 4
-@b = common global i32 0, align 4
-@.str = private unnamed_addr constant [4 x i8] c"%d\0A\00", align 1
-
-define void @func() {
-  store i32 1, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 1), align 4
-  store i32 0, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 0), align 4
-  %1 = load volatile i32, i32* @b, align 4
-  store i32 1, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 1), align 4
-  store i32 0, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 1), align 4
-  %2 = load volatile i32, i32* @b, align 4
-  store i32 1, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 1), align 4
-  store i32 0, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 2), align 4
-  %3 = load volatile i32, i32* @b, align 4
-  store i32 3, i32* @c, align 4
-  %4 = load i32, i32* getelementptr inbounds ([3 x i32], [3 x i32]* @a, i64 0, i64 1), align 4
-  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), i32 %4)
-  ret void
-}
-
-declare i32 @printf(i8*, ...)
diff --git a/test/CodeGen/X86/pr18344.ll b/test/CodeGen/X86/pr18344.ll
new file mode 100644
index 0000000000000000000000000000000000000000..15bf91031ee8871ad6c08bf648fff9aa11d3080c
--- /dev/null
+++ b/test/CodeGen/X86/pr18344.ll
@@ -0,0 +1,89 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64
+
+%v4_varying_complex = type { <4 x float>, <4 x float> }
+
+define void @FFT(%v4_varying_complex* noalias nocapture %destination, float* noalias %re, <4 x i32>* noalias nocapture %ptr_cast_for_load) nounwind {
+; X86-LABEL: FFT:
+; X86:       # BB#0: # %begin
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:    pushl %edi
+; X86-NEXT:    pushl %esi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movdqu (%edx), %xmm0
+; X86-NEXT:    pslld $4, %xmm0
+; X86-NEXT:    movd %xmm0, %edx
+; X86-NEXT:    pextrd $1, %xmm0, %esi
+; X86-NEXT:    pextrd $2, %xmm0, %edi
+; X86-NEXT:    pextrd $3, %xmm0, %ebx
+; X86-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X86-NEXT:    movss %xmm0, 128(%eax)
+; X86-NEXT:    movss %xmm1, 164(%eax)
+; X86-NEXT:    movss %xmm2, 200(%eax)
+; X86-NEXT:    movss %xmm3, 236(%eax)
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    retl
+;
+; X64-LABEL: FFT:
+; X64:       # BB#0: # %begin
+; X64-NEXT:    movdqu (%rdx), %xmm0
+; X64-NEXT:    pslld $4, %xmm0
+; X64-NEXT:    movd %xmm0, %rax
+; X64-NEXT:    movslq %eax, %r8
+; X64-NEXT:    sarq $32, %rax
+; X64-NEXT:    pextrq $1, %xmm0, %rdx
+; X64-NEXT:    movslq %edx, %rcx
+; X64-NEXT:    sarq $32, %rdx
+; X64-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X64-NEXT:    movss %xmm0, 128(%rdi)
+; X64-NEXT:    movss %xmm1, 164(%rdi)
+; X64-NEXT:    movss %xmm2, 200(%rdi)
+; X64-NEXT:    movss %xmm3, 236(%rdi)
+; X64-NEXT:    retq
+begin:
+  %ptr_masked_load79 = load <4 x i32>, <4 x i32>* %ptr_cast_for_load, align 4
+  %mul__bitReversedProgramIndex_load = shl <4 x i32> %ptr_masked_load79, <i32 4, i32 4, i32 4, i32 4>
+
+  %offset32_1 = extractelement <4 x i32> %mul__bitReversedProgramIndex_load, i32 0
+  %ptroffset_1 = sext i32 %offset32_1 to i64
+  %offset32_2 = extractelement <4 x i32> %mul__bitReversedProgramIndex_load, i32 1
+  %ptroffset_2 = sext i32 %offset32_2 to i64
+  %offset32_3 = extractelement <4 x i32> %mul__bitReversedProgramIndex_load, i32 2
+  %ptroffset_3 = sext i32 %offset32_3 to i64
+  %offset32_4 = extractelement <4 x i32> %mul__bitReversedProgramIndex_load, i32 3
+  %ptroffset_4 = sext i32 %offset32_4 to i64
+
+  %ptrcast_1 = getelementptr float, float* %re, i64 %ptroffset_1
+  %val_1 = load float, float* %ptrcast_1, align 4
+  %ptrcast_2 = getelementptr float, float* %re, i64 %ptroffset_2
+  %val_2 = load float, float* %ptrcast_2, align 4
+  %ptrcast_3 = getelementptr float, float* %re, i64 %ptroffset_3
+  %val_3 = load float, float* %ptrcast_3, align 4
+  %ptrcast_4 = getelementptr float, float* %re, i64 %ptroffset_4
+  %val_4 = load float, float* %ptrcast_4, align 4
+
+  %destination_load_ptr2int_2void = bitcast %v4_varying_complex* %destination to i8*
+  %ptrcast1_1 = getelementptr inbounds %v4_varying_complex, %v4_varying_complex* %destination, i64 4, i32 0, i64 0
+  store float %val_1, float* %ptrcast1_1, align 4
+  %finalptr_2 = getelementptr i8, i8* %destination_load_ptr2int_2void, i64 164
+  %ptrcast1_2 = bitcast i8* %finalptr_2 to float*
+  store float %val_2, float* %ptrcast1_2, align 4
+  %finalptr_3 = getelementptr i8, i8* %destination_load_ptr2int_2void, i64 200
+  %ptrcast1_3 = bitcast i8* %finalptr_3 to float*
+  store float %val_3, float* %ptrcast1_3, align 4
+  %finalptr_4 = getelementptr i8, i8* %destination_load_ptr2int_2void, i64 236
+  %ptrcast1_4 = bitcast i8* %finalptr_4 to float*
+  store float %val_4, float* %ptrcast1_4, align 4
+  ret void
+}
diff --git a/test/CodeGen/X86/pr22338.ll b/test/CodeGen/X86/pr22338.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e0645d1ef551205f7cb2703988ee2da38fa993e5
--- /dev/null
+++ b/test/CodeGen/X86/pr22338.ll
@@ -0,0 +1,57 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X64
+
+define i32 @fn() {
+; X86-LABEL: fn:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    cmpl $1, %eax
+; X86-NEXT:    sete %cl
+; X86-NEXT:    movl $-1, %eax
+; X86-NEXT:    jne .LBB0_2
+; X86-NEXT:  # BB#1: # %entry
+; X86-NEXT:    xorl %eax, %eax
+; X86-NEXT:  .LBB0_2: # %entry
+; X86-NEXT:    addb %cl, %cl
+; X86-NEXT:    shll %cl, %eax
+; X86-NEXT:    .p2align 4, 0x90
+; X86-NEXT:  .LBB0_3: # %bb1
+; X86-NEXT:    # =>This Inner Loop Header: Depth=1
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    je .LBB0_3
+; X86-NEXT:  # BB#4: # %bb2
+; X86-NEXT:    retl
+;
+; X64-LABEL: fn:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    cmpl $1, %eax
+; X64-NEXT:    sete %cl
+; X64-NEXT:    movl $-1, %eax
+; X64-NEXT:    cmovel %edx, %eax
+; X64-NEXT:    addb %cl, %cl
+; X64-NEXT:    shll %cl, %eax
+; X64-NEXT:    .p2align 4, 0x90
+; X64-NEXT:  .LBB0_1: # %bb1
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    je .LBB0_1
+; X64-NEXT:  # BB#2: # %bb2
+; X64-NEXT:    retq
+entry:
+  %cmp1 = icmp ne i32 undef, 1
+  %cmp2 = icmp eq i32 undef, 1
+  %sel1 = select i1 %cmp1, i32 0, i32 2
+  %sel2 = select i1 %cmp2, i32 2, i32 0
+  %sext = sext i1 %cmp1 to i32
+  %shl1 = shl i32 %sext, %sel1
+  %shl2 = shl i32 %sext, %sel2
+  %tobool = icmp eq i32 %shl1, 0
+  br label %bb1
+
+bb1:                                              ; preds = %bb1, %entry
+  br i1 %tobool, label %bb1, label %bb2
+
+bb2:                                              ; preds = %bb1
+  ret i32 %shl2
+}
diff --git a/test/CodeGen/X86/pr26350.ll b/test/CodeGen/X86/pr26350.ll
index 6e87cb3e8b7aada0753e29cf4c74f4e1d29bfa2a..5ba5862413b5486e4fd5648ccc52712d9816a54d 100644
--- a/test/CodeGen/X86/pr26350.ll
+++ b/test/CodeGen/X86/pr26350.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -disable-constant-hoisting < %s | FileCheck %s
 target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
 target triple = "i386-unknown-linux-gnu"
@@ -5,6 +6,18 @@ target triple = "i386-unknown-linux-gnu"
 @d = global i32 8, align 4
 
 define i32 @main() {
+; CHECK-LABEL: main:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl d, %eax
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    shrl $31, %ecx
+; CHECK-NEXT:    addl %eax, %eax
+; CHECK-NEXT:    andl $16, %eax
+; CHECK-NEXT:    cmpl $-1, %eax
+; CHECK-NEXT:    sbbl $0, %ecx
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    retl
 entry:
   %load = load i32, i32* @d, align 4
   %conv1 = zext i32 %load to i64
@@ -14,8 +27,3 @@ entry:
   %zext = zext i1 %cmp to i32
   ret i32 %zext
 }
-; CHECK: main:
-; CHECK:   movl    d, %[[load:.*]]
-; CHECK:   movl    %[[load]], %[[copy:.*]]
-; CHECK:   shrl    $31, %[[copy]]
-; CHECK:   addl    %[[load]], %[[load]]
diff --git a/test/CodeGen/X86/pr2656.ll b/test/CodeGen/X86/pr2656.ll
index 3005c581866fd3c5be582504951c863ef46e13f9..c54ae3d35029c71b89ad52e23d639f3b72c63c3e 100644
--- a/test/CodeGen/X86/pr2656.ll
+++ b/test/CodeGen/X86/pr2656.ll
@@ -1,9 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
 ; PR2656
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i686-apple-darwin9.4.0"
-	%struct.anon = type <{ float, float }>
+
+%struct.anon = type <{ float, float }>
 @.str = internal constant [17 x i8] c"pt: %.0f, %.0f\0A\00\00"		; <[17 x i8]*> [#uses=1]
 
 ; We can not fold either stack load into an 'xor' instruction because that
@@ -13,12 +15,21 @@ target triple = "i686-apple-darwin9.4.0"
 
 define void @foo(%struct.anon* byval %p) nounwind {
 ; CHECK-LABEL: foo:
-; CHECK:         movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    subl $28, %esp
+; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    movaps {{.*#+}} xmm2 = [-0.000000e+00,-0.000000e+00,-0.000000e+00,-0.000000e+00]
 ; CHECK-NEXT:    xorps %xmm2, %xmm0
 ; CHECK-NEXT:    cvtss2sd %xmm0, %xmm0
 ; CHECK-NEXT:    xorps %xmm2, %xmm1
+; CHECK-NEXT:    cvtss2sd %xmm1, %xmm1
+; CHECK-NEXT:    movsd %xmm1, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movsd %xmm0, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movl $_.str, (%esp)
+; CHECK-NEXT:    calll _printf
+; CHECK-NEXT:    addl $28, %esp
+; CHECK-NEXT:    retl
 entry:
 	%tmp = getelementptr %struct.anon, %struct.anon* %p, i32 0, i32 0		; <float*> [#uses=1]
 	%tmp1 = load float, float* %tmp		; <float> [#uses=1]
@@ -40,13 +51,19 @@ declare i32 @printf(...)
 
 define double @PR22371(double %x) {
 ; CHECK-LABEL: PR22371:
-; CHECK:       movsd  16(%esp), %xmm0
-; CHECK-NEXT:  andps  LCPI1_0, %xmm0
-; CHECK-NEXT:  movlps  %xmm0, (%esp)
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    subl $12, %esp
+; CHECK-NEXT:  Lcfi0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    andps LCPI1_0, %xmm0
+; CHECK-NEXT:    movlps %xmm0, (%esp)
+; CHECK-NEXT:    fldl (%esp)
+; CHECK-NEXT:    addl $12, %esp
+; CHECK-NEXT:    retl
   %call = tail call double @fabs(double %x) #0
   ret double %call
 }
 
 declare double @fabs(double) #0
 attributes #0 = { readnone }
-
diff --git a/test/CodeGen/X86/pr27591.ll b/test/CodeGen/X86/pr27591.ll
index 3331a9354fcfe4bfb70bed0fdc948cb8ac2dc347..3ff6c096d0976295333e4c768a537953e5e26a64 100644
--- a/test/CodeGen/X86/pr27591.ll
+++ b/test/CodeGen/X86/pr27591.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -o - -O0 < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -8,11 +9,12 @@ define void @test1(i32 %x) #0 {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    testl %edi, %edi
 ; CHECK-NEXT:    setne %al
-; CHECK-NEXT:  # implicit-def: %EDI
+; CHECK-NEXT:    # implicit-def: %EDI
 ; CHECK-NEXT:    movb %al, %dil
 ; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    kmovw %edi, %k0
-; CHECK-NEXT:    kmovb %k0, %eax
+; CHECK-NEXT:    kmovd %edi, %k0
+; CHECK-NEXT:    kmovd %k0, %edi
+; CHECK-NEXT:    movb %dil, %al
 ; CHECK-NEXT:    andb $1, %al
 ; CHECK-NEXT:    movzbl %al, %edi
 ; CHECK-NEXT:    callq callee1
@@ -30,11 +32,11 @@ define void @test2(i32 %x) #0 {
 ; CHECK-NEXT:    pushq %rax
 ; CHECK-NEXT:    testl %edi, %edi
 ; CHECK-NEXT:    setne %al
-; CHECK-NEXT:  # implicit-def: %EDI
+; CHECK-NEXT:    # implicit-def: %EDI
 ; CHECK-NEXT:    movb %al, %dil
 ; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    kmovw %edi, %k0
-; CHECK-NEXT:    kmovw %k0, %edi
+; CHECK-NEXT:    kmovd %edi, %k0
+; CHECK-NEXT:    kmovd %k0, %edi
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    movb %dil, %al
 ; CHECK-NEXT:    xorl %edi, %edi
diff --git a/test/CodeGen/X86/pr28173.ll b/test/CodeGen/X86/pr28173.ll
index db7d3335215d37df9d7f32f0383985cd7ef574d4..d9622b99bd98eca86ea58c733aa811de3aace127 100644
--- a/test/CodeGen/X86/pr28173.ll
+++ b/test/CodeGen/X86/pr28173.ll
@@ -5,9 +5,6 @@
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-; Note that the kmovs should really *not* appear in the output, this is an
-; artifact of the current poor lowering. This is tracked by PR28175.
-
 define i64 @foo64(i1 zeroext %i) #0 {
 ; CHECK-LABEL: foo64:
 ; CHECK:       # BB#0:
@@ -43,25 +40,13 @@ end:
   ret i16 %v
 }
 
-; This code is still not optimal
 define i16 @foo16_1(i1 zeroext %i, i32 %j) #0 {
-; KNL-LABEL: foo16_1:
-; KNL:       # BB#0:
-; KNL-NEXT:    kmovw %edi, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    andl $1, %eax
-; KNL-NEXT:    orl $2, %eax
-; KNL-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; KNL-NEXT:    retq
-;
-; SKX-LABEL: foo16_1:
-; SKX:       # BB#0:
-; SKX-NEXT:    kmovd %edi, %k0
-; SKX-NEXT:    kmovw %k0, %eax
-; SKX-NEXT:    andl $1, %eax
-; SKX-NEXT:    orl $2, %eax
-; SKX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
-; SKX-NEXT:    retq
+; CHECK-LABEL: foo16_1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    orl $2, %edi
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    retq
   br label %bb
 
 bb:
diff --git a/test/CodeGen/X86/pr29112.ll b/test/CodeGen/X86/pr29112.ll
index 8bf704835ae27b3285d83da0efa740c82a8c1370..8c970b3d47718d64c771d6fdafadd37d7e164e1a 100644
--- a/test/CodeGen/X86/pr29112.ll
+++ b/test/CodeGen/X86/pr29112.ll
@@ -24,11 +24,11 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
 ; CHECK-NEXT:    vextractf32x4 $2, %zmm3, %xmm4
 ; CHECK-NEXT:    vblendps {{.*#+}} xmm4 = xmm0[0,1,2],xmm4[3]
 ; CHECK-NEXT:    vpermilps {{.*#+}} xmm5 = xmm2[3,1,2,3]
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[2,3]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1]
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm2[1],xmm5[3]
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1,2],xmm3[1]
 ; CHECK-NEXT:    vmovshdup {{.*#+}} xmm7 = xmm8[1,1,3,3]
-; CHECK-NEXT:    vinsertps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[2,3]
+; CHECK-NEXT:    vunpcklps {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1]
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm10 = xmm7[0,1],xmm2[1],xmm7[3]
 ; CHECK-NEXT:    vblendps {{.*#+}} xmm7 = xmm10[0,1,2],xmm3[3]
 ; CHECK-NEXT:    vblendps {{.*#+}} xmm11 = xmm0[0,1,2],xmm3[3]
@@ -60,6 +60,7 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, <
 ; CHECK-NEXT:    vmovaps %xmm8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovaps %xmm9, (%rsp)
 ; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq foo
 ; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
 ; CHECK-NEXT:    vaddps {{[0-9]+}}(%rsp), %xmm1, %xmm1 # 16-byte Folded Reload
diff --git a/test/CodeGen/X86/pr29170.ll b/test/CodeGen/X86/pr29170.ll
index d8e27557ab93050a5a87fbb1a4fdb3b34cafc483..ecb4c9785365f2afd9a30aa6b2d2149490b02568 100644
--- a/test/CodeGen/X86/pr29170.ll
+++ b/test/CodeGen/X86/pr29170.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s | FileCheck %s
 
 target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
@@ -5,10 +6,26 @@ target triple = "i386-unknown-linux-gnu"
 
 @b = global i16 0, align 4
 
-; CHECK-LABEL: @main
-; CHECK: cmpl
-; CHECK: sbbl
 define i32 @main() {
+; CHECK-LABEL: main:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jne .LBB0_3
+; CHECK-NEXT:  # BB#1: # %go
+; CHECK-NEXT:    movl $-1, %ecx
+; CHECK-NEXT:    movsbl b, %edx
+; CHECK-NEXT:    notl %ecx
+; CHECK-NEXT:    movzwl %dx, %edx
+; CHECK-NEXT:    cmpl $-1, %edx
+; CHECK-NEXT:    sbbl %ecx, %eax
+; CHECK-NEXT:    jge .LBB0_3
+; CHECK-NEXT:  # BB#2: # %if.then
+; CHECK-NEXT:    movl $42, %eax
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  .LBB0_3: # %if.else
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    retl
 entry:
   %true = icmp eq i32 0, 0
   %const = bitcast i64 -4294967296 to i64
diff --git a/test/CodeGen/X86/pr30284.ll b/test/CodeGen/X86/pr30284.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7ab1b729ea04dd907fb3c3f94ee143016d7e0821
--- /dev/null
+++ b/test/CodeGen/X86/pr30284.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mattr=avx512dq | FileCheck %s
+
+define void @f_f___un_3C_unf_3E_un_3C_unf_3E_() {
+; CHECK-LABEL: f_f___un_3C_unf_3E_un_3C_unf_3E_:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vmovapd 0, %zmm0
+; CHECK-NEXT:    vmovapd 64, %zmm1
+; CHECK-NEXT:    vmovapd {{.*#+}} zmm2 = [0,16,0,16,0,16,0,16,0,16,0,16,0,16,0,16]
+; CHECK-NEXT:    vorpd %zmm2, %zmm0, %zmm0 {%k1}
+; CHECK-NEXT:    vorpd %zmm2, %zmm1, %zmm1 {%k1}
+; CHECK-NEXT:    vmovapd %zmm1, 64
+; CHECK-NEXT:    vmovapd %zmm0, 0
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retl
+  %a_load22 = load <16 x i64>, <16 x i64>* null, align 1
+  %bitop = or <16 x i64> %a_load22, <i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736, i64 68719476736>
+  %v.i = load <16 x i64>, <16 x i64>* null
+  %v1.i41 = select <16 x i1> undef, <16 x i64> %bitop, <16 x i64> %v.i
+  store <16 x i64> %v1.i41, <16 x i64>* null
+  ret void
+}
diff --git a/test/CodeGen/X86/pr30430.ll b/test/CodeGen/X86/pr30430.ll
index 6aa4c91c4a808950e9ae60d1f76ca4bbc68385e4..14d81f14fc32c9046f7c595142b905d3054fca44 100644
--- a/test/CodeGen/X86/pr30430.ll
+++ b/test/CodeGen/X86/pr30430.ll
@@ -30,14 +30,6 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float
 ; CHECK-NEXT:    vmovss %xmm5, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovss %xmm6, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovss %xmm7, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm15, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm14, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm13, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm12, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm11, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm10, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm9, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm8, (%rsp)
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
@@ -46,14 +38,14 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovss {{.*#+}} xmm9 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovss {{.*#+}} xmm10 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovss {{.*#+}} xmm11 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovss {{.*#+}} xmm12 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovss {{.*#+}} xmm13 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovss {{.*#+}} xmm14 = mem[0],zero,zero,zero
-; CHECK-NEXT:    vmovss {{.*#+}} xmm15 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm16 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm17 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm18 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm19 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm20 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm21 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm22 = mem[0],zero,zero,zero
+; CHECK-NEXT:    vmovss {{.*#+}} xmm23 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vmovss %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovss %xmm1, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovss %xmm2, {{[0-9]+}}(%rsp)
@@ -62,14 +54,14 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float
 ; CHECK-NEXT:    vmovss %xmm5, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovss %xmm6, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovss %xmm7, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm8, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm9, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm10, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm11, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm12, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm13, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm14, {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    vmovss %xmm15, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovss %xmm16, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovss %xmm17, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovss %xmm18, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovss %xmm19, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovss %xmm20, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovss %xmm21, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovss %xmm22, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vmovss %xmm23, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
@@ -104,11 +96,19 @@ define <16 x float> @makefloat(float %f1, float %f2, float %f3, float %f4, float
 ; CHECK-NEXT:    # implicit-def: %YMM3
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm3
 ; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm3, %ymm3
-; CHECK-NEXT:    # implicit-def: %ZMM16
-; CHECK-NEXT:    vmovaps %zmm3, %zmm16
-; CHECK-NEXT:    vinsertf64x4 $1, %ymm2, %zmm16, %zmm16
-; CHECK-NEXT:    vmovaps %zmm16, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    # implicit-def: %ZMM24
+; CHECK-NEXT:    vmovaps %zmm3, %zmm24
+; CHECK-NEXT:    vinsertf64x4 $1, %ymm2, %zmm24, %zmm24
+; CHECK-NEXT:    vmovaps %zmm24, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    vmovaps {{[0-9]+}}(%rsp), %zmm0
+; CHECK-NEXT:    vmovss %xmm15, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm8, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm9, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm10, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm11, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm12, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm13, {{[0-9]+}}(%rsp) # 4-byte Spill
+; CHECK-NEXT:    vmovss %xmm14, (%rsp) # 4-byte Spill
 ; CHECK-NEXT:    movq %rbp, %rsp
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    retq
diff --git a/test/CodeGen/X86/pr30693.ll b/test/CodeGen/X86/pr30693.ll
deleted file mode 100644
index 834365911ed51102c548a43276ec19eb0239d364..0000000000000000000000000000000000000000
--- a/test/CodeGen/X86/pr30693.ll
+++ /dev/null
@@ -1,147 +0,0 @@
-; PR30693
-; RUN: llc < %s | FileCheck %s
-
-; CHECK:      .p2align	2
-; CHECK-NEXT: .LCPI0_0:
-; CHECK-NOT:  vmovaps	.LCPI0_0(%rip),
-; CHECK:      .cfi_endproc
-target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-unknown-linux-gnu"
-
-@var_35 = external local_unnamed_addr global i32, align 4
-@var_14 = external local_unnamed_addr global i16, align 2
-
-; Function Attrs: uwtable
-define void @_Z3foov() local_unnamed_addr #0 {
-entry:
-  %0 = load i32, i32* @var_35, align 4
-  %1 = load i16, i16* @var_14, align 2
-  %conv34 = zext i16 %1 to i64
-  %conv37 = ashr exact i64 undef, 32
-  %sub316 = add i16 undef, -7198
-  %cmp339981 = icmp sgt i32 undef, 0
-  %cmp401989 = icmp sgt i32 undef, 0
-  %cmp443994 = icmp sgt i32 undef, 0
-  %lcmp.mod = icmp eq i64 undef, 0
-  %broadcast.splat1461 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> zeroinitializer
-  %broadcast.splat1357 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> zeroinitializer
-  %broadcast.splat1435 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> zeroinitializer
-  %broadcast.splat1409 = shufflevector <32 x i16> undef, <32 x i16> undef, <32 x i32> zeroinitializer
-  br label %for.cond11.preheader
-
-for.cond11.preheader:                             ; preds = %for.cond.cleanup477.loopexit, %entry
-  %div = sdiv i32 0, 0
-  %mul31 = mul nsw i32 %div, %0
-  %conv32 = sext i32 %mul31 to i64
-  %div40 = sdiv i64 0, 0
-  %div41 = sdiv i32 0, 0
-  %conv42 = sext i32 %div41 to i64
-  %mul43 = mul nsw i64 %conv32, %conv34
-  %mul44 = mul i64 %mul43, %div40
-  %mul45 = mul i64 %mul44, %conv37
-  %mul46 = mul i64 %mul45, %conv42
-  %add48 = add nsw i64 %mul46, 36611
-  %conv49 = trunc i64 %add48 to i16
-  br label %vector.ph1520
-
-vector.ph1520:                                    ; preds = %for.cond11.preheader
-  %broadcast.splatinsert1531 = insertelement <32 x i16> undef, i16 %conv49, i32 0
-  %broadcast.splat1532 = shufflevector <32 x i16> %broadcast.splatinsert1531, <32 x i16> undef, <32 x i32> zeroinitializer
-  br i1 %lcmp.mod, label %vector.body1512.prol.loopexit, label %vector.body1512.prol.preheader
-
-vector.body1512.prol.preheader:                   ; preds = %vector.ph1520
-  store <32 x i16> %broadcast.splat1532, <32 x i16>* undef, align 8, !tbaa !1
-  unreachable
-
-vector.body1512.prol.loopexit:                    ; preds = %vector.ph1520
-  %add318 = add i16 %sub316, 0
-  %2 = insertelement <16 x i16> undef, i16 %add318, i32 7
-  %3 = insertelement <16 x i16> %2, i16 %add318, i32 8
-  %4 = insertelement <16 x i16> %3, i16 %add318, i32 9
-  %5 = insertelement <16 x i16> %4, i16 %add318, i32 10
-  %6 = insertelement <16 x i16> %5, i16 %add318, i32 11
-  %7 = insertelement <16 x i16> %6, i16 %add318, i32 12
-  %8 = insertelement <16 x i16> %7, i16 %add318, i32 13
-  %9 = insertelement <16 x i16> %8, i16 %add318, i32 14
-  %10 = insertelement <16 x i16> undef, i16 %add318, i32 7
-  %11 = insertelement <16 x i16> %10, i16 %add318, i32 8
-  %12 = insertelement <16 x i16> %11, i16 %add318, i32 9
-  %13 = insertelement <16 x i16> %12, i16 %add318, i32 10
-  %14 = insertelement <16 x i16> %13, i16 %add318, i32 11
-  %15 = insertelement <16 x i16> %14, i16 %add318, i32 12
-  %16 = insertelement <16 x i16> %15, i16 %add318, i32 13
-  %17 = insertelement <16 x i16> %16, i16 %add318, i32 14
-  %18 = insertelement <16 x i16> %17, i16 %add318, i32 15
-  %19 = insertelement <8 x i16> undef, i16 %add318, i32 7
-  br label %for.cond74.loopexit.us
-
-for.cond337.preheader.lr.ph:                      ; preds = %for.cond130.preheader.loopexit
-  br i1 %cmp339981, label %for.cond337.preheader.us.preheader, label %for.cond.cleanup335
-
-for.cond337.preheader.us.preheader:               ; preds = %for.cond337.preheader.lr.ph
-  store <32 x i16> %broadcast.splat1461, <32 x i16>* undef, align 4, !tbaa !1
-  unreachable
-
-for.cond74.loopexit.us:                           ; preds = %for.cond74.loopexit.us, %vector.body1512.prol.loopexit
-  store <8 x i16> zeroinitializer, <8 x i16>* undef, align 2, !tbaa !1
-  %cmp76.us = icmp slt i64 undef, undef
-  br i1 %cmp76.us, label %for.cond74.loopexit.us, label %for.cond130.preheader.loopexit
-
-for.cond130.preheader.loopexit:                   ; preds = %for.cond74.loopexit.us
-  store <16 x i16> zeroinitializer, <16 x i16>* undef, align 2, !tbaa !1
-  store <16 x i16> %18, <16 x i16>* undef, align 2, !tbaa !1
-  store <8 x i16> %19, <8 x i16>* undef, align 2, !tbaa !1
-  br label %for.cond337.preheader.lr.ph
-
-for.cond.cleanup335:                              ; preds = %for.cond337.preheader.lr.ph
-  br label %for.cond380.preheader
-
-for.cond380.preheader:                            ; preds = %for.cond.cleanup335
-  br label %for.cond385.preheader
-
-for.cond.cleanup378.loopexit:                     ; preds = %for.cond.cleanup388
-  br label %for.cond481.preheader
-
-for.cond385.preheader:                            ; preds = %for.cond380.preheader
-  br i1 %cmp443994, label %for.cond392.preheader.us.preheader, label %for.cond392.preheader.preheader
-
-for.cond392.preheader.preheader:                  ; preds = %for.cond385.preheader
-  store <32 x i16> %broadcast.splat1435, <32 x i16>* undef, align 4, !tbaa !1
-  store <32 x i16> %broadcast.splat1409, <32 x i16>* undef, align 4, !tbaa !1
-  unreachable
-
-for.cond392.preheader.us.preheader:               ; preds = %for.cond385.preheader
-  br label %for.cond399.preheader.lr.ph.us.1
-
-for.cond.cleanup388:                              ; preds = %for.cond399.preheader.lr.ph.us.1
-  br label %for.cond.cleanup378.loopexit
-
-for.cond481.preheader:                            ; preds = %for.cond.cleanup486, %for.cond.cleanup378.loopexit
-  br label %for.cond.cleanup486
-
-for.cond.cleanup477.loopexit:                     ; preds = %for.cond.cleanup486
-  store <8 x i32> <i32 1221902566, i32 1221902566, i32 1221902566, i32 1221902566, i32 1221902566, i32 1221902566, i32 1221902566, i32 1221902566>, <8 x i32>* undef, align 4, !tbaa !5
-  br label %for.cond11.preheader
-
-for.cond.cleanup486:                              ; preds = %for.cond481.preheader
-  br i1 undef, label %for.cond481.preheader, label %for.cond.cleanup477.loopexit
-
-for.cond399.preheader.lr.ph.us.1:                 ; preds = %for.cond392.preheader.us.preheader
-  br i1 %cmp401989, label %for.cond399.preheader.us.us.1.preheader, label %for.cond.cleanup388
-
-for.cond399.preheader.us.us.1.preheader:          ; preds = %for.cond399.preheader.lr.ph.us.1
-  store <32 x i16> %broadcast.splat1357, <32 x i16>* undef, align 4, !tbaa !1
-  unreachable
-}
-
-attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="knl" "target-features"="+adx,+aes,+avx,+avx2,+avx512cd,+avx512er,+avx512f,+avx512pf,+bmi,+bmi2,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+pclmul,+popcnt,+prefetchwt1,+rdrnd,+rdseed,+rtm,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsaveopt" "unsafe-fp-math"="false" "use-soft-float"="false" }
-
-!llvm.ident = !{!0}
-
-!0 = !{!"clang version 4.0.0 (http://llvm.org/git/clang.git ef66d4d58b9a2c6b3d31bbaf3ed2a70a9754a137) (http://llvm.org/git/llvm.git 5e661621191d6133a12effa103bfb2cbbdbb35ad)"}
-!1 = !{!2, !2, i64 0}
-!2 = !{!"short", !3, i64 0}
-!3 = !{!"omnipotent char", !4, i64 0}
-!4 = !{!"Simple C++ TBAA"}
-!5 = !{!6, !6, i64 0}
-!6 = !{!"int", !3, i64 0}
diff --git a/test/CodeGen/X86/pr31956.ll b/test/CodeGen/X86/pr31956.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e9293048f4e5889ec56f71a282f04e7aa6dfc144
--- /dev/null
+++ b/test/CodeGen/X86/pr31956.ll
@@ -0,0 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mattr=+avx < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-scei-ps4"
+
+@G1 = common global <2 x float> zeroinitializer, align 8
+@G2 = common global <8 x float> zeroinitializer, align 32
+
+define <4 x float> @foo() {
+; CHECK-LABEL: foo:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],mem[1,2,3]
+; CHECK-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[2,0]
+; CHECK-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    retq
+entry:
+  %V = load <2 x float>, <2 x float>* @G1, align 8
+  %shuffle = shufflevector <2 x float> %V, <2 x float> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 0, i32 undef, i32 undef, i32 undef>
+  %L = load <8 x float>, <8 x float>* @G2, align 32
+  %shuffle1 = shufflevector <8 x float> %shuffle, <8 x float> %L, <4 x i32> <i32 12, i32 10, i32 14, i32 4>
+  ret <4 x float> %shuffle1
+}
diff --git a/test/CodeGen/X86/pr32108.ll b/test/CodeGen/X86/pr32108.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f14b04802a04f08dbaa30e819895d58905658ef4
--- /dev/null
+++ b/test/CodeGen/X86/pr32108.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+
+define void @pr32108() {
+; CHECK-LABEL: pr32108:
+; CHECK:       # BB#0: # %CF257
+; CHECK-NEXT:    movb $0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    .p2align 4, 0x90
+; CHECK-NEXT:  .LBB0_1: # %CF244
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    jmp .LBB0_1
+BB:
+  %Cmp45 = icmp slt <4 x i32> undef, undef
+  br label %CF243
+
+CF243:                                            ; preds = %CF243, %BB
+  br i1 undef, label %CF243, label %CF257
+
+CF257:                                            ; preds = %CF243
+  %Shuff144 = shufflevector <4 x i1> undef, <4 x i1> %Cmp45, <4 x i32> <i32 undef, i32 undef, i32 5, i32 undef>
+  br label %CF244
+
+CF244:                                            ; preds = %CF244, %CF257
+  %Shuff182 = shufflevector <4 x i1> %Shuff144, <4 x i1> zeroinitializer, <4 x i32> <i32 3, i32 5, i32 7, i32 undef>
+  br label %CF244
+}
diff --git a/test/CodeGen/X86/pr32241.ll b/test/CodeGen/X86/pr32241.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d8ce230057ea315ba2bf0b6992327eb8829f76ef
--- /dev/null
+++ b/test/CodeGen/X86/pr32241.ll
@@ -0,0 +1,86 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i686-unknown-unknown -O0 -mcpu=skx | FileCheck %s
+
+define i32 @_Z3foov() {
+; CHECK-LABEL: _Z3foov:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    subl $20, %esp
+; CHECK-NEXT:  .Lcfi0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    movw $10959, {{[0-9]+}}(%esp) # imm = 0x2ACF
+; CHECK-NEXT:    movw $-15498, {{[0-9]+}}(%esp) # imm = 0xC376
+; CHECK-NEXT:    movw $19417, {{[0-9]+}}(%esp) # imm = 0x4BD9
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movw {{[0-9]+}}(%esp), %cx
+; CHECK-NEXT:    kxnorw %k0, %k0, %k0
+; CHECK-NEXT:    kshiftrw $15, %k0, %k0
+; CHECK-NEXT:    testw %cx, %cx
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; CHECK-NEXT:    kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill
+; CHECK-NEXT:    jne .LBB0_2
+; CHECK-NEXT:    jmp .LBB0_1
+; CHECK-NEXT:  .LBB0_1: # %lor.rhs
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    kmovd %eax, %k0
+; CHECK-NEXT:    kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill
+; CHECK-NEXT:    jmp .LBB0_2
+; CHECK-NEXT:  .LBB0_2: # %lor.end
+; CHECK-NEXT:    kmovw {{[0-9]+}}(%esp), %k0 # 2-byte Reload
+; CHECK-NEXT:    kxnorw %k0, %k0, %k1
+; CHECK-NEXT:    kshiftrw $15, %k1, %k1
+; CHECK-NEXT:    movb $1, %al
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill
+; CHECK-NEXT:    kmovw %k1, {{[0-9]+}}(%esp) # 2-byte Spill
+; CHECK-NEXT:    jne .LBB0_4
+; CHECK-NEXT:    jmp .LBB0_3
+; CHECK-NEXT:  .LBB0_3: # %lor.rhs4
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    kmovd %eax, %k0
+; CHECK-NEXT:    kmovw %k0, {{[0-9]+}}(%esp) # 2-byte Spill
+; CHECK-NEXT:    jmp .LBB0_4
+; CHECK-NEXT:  .LBB0_4: # %lor.end5
+; CHECK-NEXT:    kmovw {{[0-9]+}}(%esp), %k0 # 2-byte Reload
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    movw %ax, %cx
+; CHECK-NEXT:    movw %cx, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    movzwl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    addl $20, %esp
+; CHECK-NEXT:    retl
+entry:
+  %aa = alloca i16, align 2
+  %bb = alloca i16, align 2
+  %cc = alloca i16, align 2
+  store i16 10959, i16* %aa, align 2
+  store i16 -15498, i16* %bb, align 2
+  store i16 19417, i16* %cc, align 2
+  %0 = load i16, i16* %aa, align 2
+  %conv = zext i16 %0 to i32
+  %1 = load i16, i16* %cc, align 2
+  %tobool = icmp ne i16 %1, 0
+  br i1 %tobool, label %lor.end, label %lor.rhs
+
+lor.rhs:                                          ; preds = %entry
+  br label %lor.end
+
+lor.end:                                          ; preds = %lor.rhs, %entry
+  %2 = phi i1 [ true, %entry ], [ false, %lor.rhs ]
+  %conv1 = zext i1 %2 to i32
+  %cmp = icmp slt i32 %conv, %conv1
+  %conv2 = zext i1 %cmp to i32
+  %neg = xor i32 %conv2, -1
+  %tobool3 = icmp ne i32 %neg, 0
+  br i1 %tobool3, label %lor.end5, label %lor.rhs4
+
+lor.rhs4:                                         ; preds = %lor.end
+  br label %lor.end5
+
+lor.end5:                                         ; preds = %lor.rhs4, %lor.end
+  %3 = phi i1 [ true, %lor.end ], [ false, %lor.rhs4 ]
+  %conv6 = zext i1 %3 to i16
+  store i16 %conv6, i16* %bb, align 2
+  %4 = load i16, i16* %bb, align 2
+  %conv7 = zext i16 %4 to i32
+  ret i32 %conv7
+}
diff --git a/test/CodeGen/X86/pr32256.ll b/test/CodeGen/X86/pr32256.ll
new file mode 100644
index 0000000000000000000000000000000000000000..cb26c13e53eb2423a42bc208620908fa2324afa9
--- /dev/null
+++ b/test/CodeGen/X86/pr32256.ll
@@ -0,0 +1,59 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i686-unknown-unknown -O0 -mcpu=skx | FileCheck %s
+
+@c = external global i8, align 1
+
+; Function Attrs: noinline nounwind
+define void @_Z1av() {
+; CHECK-LABEL: _Z1av:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    subl $6, %esp
+; CHECK-NEXT:  .Lcfi0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 10
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    kmovd %eax, %k0
+; CHECK-NEXT:    movb c, %cl
+; CHECK-NEXT:    # implicit-def: %EAX
+; CHECK-NEXT:    movb %cl, %al
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    kmovd %eax, %k1
+; CHECK-NEXT:    kmovq %k1, %k2
+; CHECK-NEXT:    kxnorw %k0, %k0, %k3
+; CHECK-NEXT:    kshiftrw $15, %k3, %k3
+; CHECK-NEXT:    kxorw %k3, %k1, %k1
+; CHECK-NEXT:    kmovd %k1, %eax
+; CHECK-NEXT:    movb %al, %cl
+; CHECK-NEXT:    testb $1, %cl
+; CHECK-NEXT:    kmovw %k2, {{[0-9]+}}(%esp) # 2-byte Spill
+; CHECK-NEXT:    kmovw %k0, (%esp) # 2-byte Spill
+; CHECK-NEXT:    jne .LBB0_1
+; CHECK-NEXT:    jmp .LBB0_2
+; CHECK-NEXT:  .LBB0_1: # %land.rhs
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    kmovd %eax, %k0
+; CHECK-NEXT:    kmovw %k0, (%esp) # 2-byte Spill
+; CHECK-NEXT:    jmp .LBB0_2
+; CHECK-NEXT:  .LBB0_2: # %land.end
+; CHECK-NEXT:    kmovw (%esp), %k0 # 2-byte Reload
+; CHECK-NEXT:    kmovd %k0, %eax
+; CHECK-NEXT:    movb %al, %cl
+; CHECK-NEXT:    andb $1, %cl
+; CHECK-NEXT:    movb %cl, {{[0-9]+}}(%esp)
+; CHECK-NEXT:    addl $6, %esp
+; CHECK-NEXT:    retl
+entry:
+  %b = alloca i8, align 1
+  %0 = load i8, i8* @c, align 1
+  %tobool = trunc i8 %0 to i1
+  %lnot = xor i1 %tobool, true
+  br i1 %lnot, label %land.rhs, label %land.end
+
+land.rhs:                                         ; preds = %entry
+  br label %land.end
+
+land.end:                                         ; preds = %land.rhs, %entry
+  %1 = phi i1 [ false, %entry ], [ false, %land.rhs ]
+  %conv = zext i1 %1 to i8
+  store i8 %conv, i8* %b, align 1
+  ret void
+}
diff --git a/test/CodeGen/X86/pr32278.ll b/test/CodeGen/X86/pr32278.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1b130c838baef384f4066528be2d9a7740da9236
--- /dev/null
+++ b/test/CodeGen/X86/pr32278.ll
@@ -0,0 +1,11 @@
+; PR32278
+
+; RUN: llc -mtriple=x86_64-unknown < %s
+
+define i8 @foo_v4i1_0_0_1_1_2_2_3_3(i8 %in) {
+  %trunc = trunc i8 %in to i4
+  %mask = bitcast i4 %trunc to <4 x i1>
+  %s = shufflevector <4 x i1> %mask, <4 x i1> undef, <8 x i32> <i32 0, i32 0, i32 1, i32 1, i32 2, i32 2, i32 3, i32 3>
+  %b = bitcast <8 x i1> %s to i8
+  ret i8 %b
+}
diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e05fc926b0801e9a45e0df6e912dc4b066d764a3
--- /dev/null
+++ b/test/CodeGen/X86/pr32284.ll
@@ -0,0 +1,117 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=skx | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=skx -O0 | FileCheck %s --check-prefix=X86-O0
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skx | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skx -O0 | FileCheck %s --check-prefix=X64-O0
+
+@c = external constant i8, align 1
+
+define void @foo() {
+; X86-LABEL: foo:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    subl $8, %esp
+; X86-NEXT:  .Lcfi0:
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    movzbl c, %eax
+; X86-NEXT:    xorl %ecx, %ecx
+; X86-NEXT:    testl %eax, %eax
+; X86-NEXT:    setne %cl
+; X86-NEXT:    testb %al, %al
+; X86-NEXT:    setne {{[0-9]+}}(%esp)
+; X86-NEXT:    xorl %edx, %edx
+; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    setle %dl
+; X86-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT:    addl $8, %esp
+; X86-NEXT:    retl
+;
+; X86-O0-LABEL: foo:
+; X86-O0:       # BB#0: # %entry
+; X86-O0-NEXT:    subl $12, %esp
+; X86-O0-NEXT:  .Lcfi0:
+; X86-O0-NEXT:    .cfi_def_cfa_offset 16
+; X86-O0-NEXT:    movzbl c, %eax
+; X86-O0-NEXT:    testl %eax, %eax
+; X86-O0-NEXT:    setne %cl
+; X86-O0-NEXT:    movl %eax, %edx
+; X86-O0-NEXT:    movb %dl, %ch
+; X86-O0-NEXT:    testb %ch, %ch
+; X86-O0-NEXT:    setne {{[0-9]+}}(%esp)
+; X86-O0-NEXT:    movzbl %cl, %edx
+; X86-O0-NEXT:    subl %eax, %edx
+; X86-O0-NEXT:    setle %cl
+; X86-O0-NEXT:    # implicit-def: %EAX
+; X86-O0-NEXT:    movb %cl, %al
+; X86-O0-NEXT:    andl $1, %eax
+; X86-O0-NEXT:    kmovd %eax, %k0
+; X86-O0-NEXT:    kmovd %k0, %eax
+; X86-O0-NEXT:    movb %al, %cl
+; X86-O0-NEXT:    andb $1, %cl
+; X86-O0-NEXT:    movzbl %cl, %eax
+; X86-O0-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-O0-NEXT:    movl %edx, (%esp) # 4-byte Spill
+; X86-O0-NEXT:    addl $12, %esp
+; X86-O0-NEXT:    retl
+;
+; X64-LABEL: foo:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movzbl {{.*}}(%rip), %eax
+; X64-NEXT:    testb %al, %al
+; X64-NEXT:    setne -{{[0-9]+}}(%rsp)
+; X64-NEXT:    xorl %ecx, %ecx
+; X64-NEXT:    testl %eax, %eax
+; X64-NEXT:    setne %cl
+; X64-NEXT:    xorl %edx, %edx
+; X64-NEXT:    cmpl %eax, %ecx
+; X64-NEXT:    setle %dl
+; X64-NEXT:    movl %edx, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    retq
+;
+; X64-O0-LABEL: foo:
+; X64-O0:       # BB#0: # %entry
+; X64-O0-NEXT:    movzbl {{.*}}(%rip), %eax
+; X64-O0-NEXT:    movl %eax, %ecx
+; X64-O0-NEXT:    movb %cl, %dl
+; X64-O0-NEXT:    movl %ecx, %eax
+; X64-O0-NEXT:    testq %rcx, %rcx
+; X64-O0-NEXT:    setne %sil
+; X64-O0-NEXT:    testb %dl, %dl
+; X64-O0-NEXT:    setne -{{[0-9]+}}(%rsp)
+; X64-O0-NEXT:    movzbl %sil, %edi
+; X64-O0-NEXT:    subl %eax, %edi
+; X64-O0-NEXT:    setle %dl
+; X64-O0-NEXT:    # implicit-def: %EAX
+; X64-O0-NEXT:    movb %dl, %al
+; X64-O0-NEXT:    andl $1, %eax
+; X64-O0-NEXT:    kmovd %eax, %k0
+; X64-O0-NEXT:    kmovd %k0, %eax
+; X64-O0-NEXT:    movb %al, %dl
+; X64-O0-NEXT:    andb $1, %dl
+; X64-O0-NEXT:    movzbl %dl, %eax
+; X64-O0-NEXT:    movl %eax, -{{[0-9]+}}(%rsp)
+; X64-O0-NEXT:    movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill
+; X64-O0-NEXT:    retq
+entry:
+  %a = alloca i8, align 1
+  %b = alloca i32, align 4
+  %0 = load i8, i8* @c, align 1
+  %conv = zext i8 %0 to i32
+  %sub = sub nsw i32 0, %conv
+  %conv1 = sext i32 %sub to i64
+  %sub2 = sub nsw i64 0, %conv1
+  %conv3 = trunc i64 %sub2 to i8
+  %tobool = icmp ne i8 %conv3, 0
+  %frombool = zext i1 %tobool to i8
+  store i8 %frombool, i8* %a, align 1
+  %1 = load i8, i8* @c, align 1
+  %tobool4 = icmp ne i8 %1, 0
+  %lnot = xor i1 %tobool4, true
+  %lnot5 = xor i1 %lnot, true
+  %conv6 = zext i1 %lnot5 to i32
+  %2 = load i8, i8* @c, align 1
+  %conv7 = zext i8 %2 to i32
+  %cmp = icmp sle i32 %conv6, %conv7
+  %conv8 = zext i1 %cmp to i32
+  store i32 %conv8, i32* %b, align 4
+  ret void
+}
diff --git a/test/CodeGen/X86/pr32329.ll b/test/CodeGen/X86/pr32329.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f2b79b67877f764109cb4b911321c6500f8791e5
--- /dev/null
+++ b/test/CodeGen/X86/pr32329.ll
@@ -0,0 +1,126 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=skx | FileCheck %s -check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=skx | FileCheck %s -check-prefix=X64
+
+%struct.AA = type { i24, [4 x i8] }
+
+@obj = external local_unnamed_addr global %struct.AA, align 8
+@var_27 = external local_unnamed_addr constant i8, align 1
+@var_2 = external local_unnamed_addr constant i16, align 2
+@var_24 = external local_unnamed_addr constant i64, align 8
+@var_310 = external local_unnamed_addr global i64, align 8
+@var_50 = external local_unnamed_addr global i64, align 8
+@var_205 = external local_unnamed_addr global i8, align 1
+@var_218 = external local_unnamed_addr global i8, align 1
+
+define void @foo() local_unnamed_addr {
+; X86-LABEL: foo:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    pushl %ebp
+; X86-NEXT:  .Lcfi0:
+; X86-NEXT:    .cfi_def_cfa_offset 8
+; X86-NEXT:    pushl %ebx
+; X86-NEXT:  .Lcfi1:
+; X86-NEXT:    .cfi_def_cfa_offset 12
+; X86-NEXT:    pushl %edi
+; X86-NEXT:  .Lcfi2:
+; X86-NEXT:    .cfi_def_cfa_offset 16
+; X86-NEXT:    pushl %esi
+; X86-NEXT:  .Lcfi3:
+; X86-NEXT:    .cfi_def_cfa_offset 20
+; X86-NEXT:  .Lcfi4:
+; X86-NEXT:    .cfi_offset %esi, -20
+; X86-NEXT:  .Lcfi5:
+; X86-NEXT:    .cfi_offset %edi, -16
+; X86-NEXT:  .Lcfi6:
+; X86-NEXT:    .cfi_offset %ebx, -12
+; X86-NEXT:  .Lcfi7:
+; X86-NEXT:    .cfi_offset %ebp, -8
+; X86-NEXT:    movl obj, %edx
+; X86-NEXT:    movsbl var_27, %eax
+; X86-NEXT:    movzwl var_2, %esi
+; X86-NEXT:    movl var_310, %ecx
+; X86-NEXT:    imull %eax, %ecx
+; X86-NEXT:    addl var_24, %ecx
+; X86-NEXT:    andl $4194303, %edx # imm = 0x3FFFFF
+; X86-NEXT:    leal (%edx,%edx), %ebx
+; X86-NEXT:    subl %eax, %ebx
+; X86-NEXT:    movl %ebx, %edi
+; X86-NEXT:    subl %esi, %edi
+; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    addl $-1437483407, %ecx # imm = 0xAA51BE71
+; X86-NEXT:    movl $9, %esi
+; X86-NEXT:    xorl %ebp, %ebp
+; X86-NEXT:    shldl %cl, %esi, %ebp
+; X86-NEXT:    shll %cl, %esi
+; X86-NEXT:    testb $32, %cl
+; X86-NEXT:    cmovnel %esi, %ebp
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    cmovnel %ecx, %esi
+; X86-NEXT:    cmpl %edx, %edi
+; X86-NEXT:    movl %ebp, var_50+4
+; X86-NEXT:    movl %esi, var_50
+; X86-NEXT:    setge var_205
+; X86-NEXT:    imull %eax, %ebx
+; X86-NEXT:    movb %bl, var_218
+; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %edi
+; X86-NEXT:    popl %ebx
+; X86-NEXT:    popl %ebp
+; X86-NEXT:    retl
+;
+; X64-LABEL: foo:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movl {{.*}}(%rip), %eax
+; X64-NEXT:    movsbl {{.*}}(%rip), %r9d
+; X64-NEXT:    movzwl {{.*}}(%rip), %r8d
+; X64-NEXT:    movl {{.*}}(%rip), %esi
+; X64-NEXT:    imull %r9d, %esi
+; X64-NEXT:    addl {{.*}}(%rip), %esi
+; X64-NEXT:    andl $4194303, %eax # imm = 0x3FFFFF
+; X64-NEXT:    leal (%rax,%rax), %edi
+; X64-NEXT:    subl %r9d, %edi
+; X64-NEXT:    movl %edi, %edx
+; X64-NEXT:    subl %r8d, %edx
+; X64-NEXT:    imull %edx, %esi
+; X64-NEXT:    addl $-1437483407, %esi # imm = 0xAA51BE71
+; X64-NEXT:    movl $9, %ecx
+; X64-NEXT:    shlxq %rsi, %rcx, %rcx
+; X64-NEXT:    movq %rcx, {{.*}}(%rip)
+; X64-NEXT:    cmpl %eax, %edx
+; X64-NEXT:    setge {{.*}}(%rip)
+; X64-NEXT:    imull %r9d, %edi
+; X64-NEXT:    movb %dil, {{.*}}(%rip)
+; X64-NEXT:    retq
+  entry:
+  %bf.load = load i32, i32* bitcast (%struct.AA* @obj to i32*), align 8
+  %bf.clear = shl i32 %bf.load, 1
+  %add = and i32 %bf.clear, 8388606
+  %0 = load i8, i8* @var_27, align 1
+  %conv5 = sext i8 %0 to i32
+  %sub = sub nsw i32 %add, %conv5
+  %1 = load i16, i16* @var_2, align 2
+  %conv6 = zext i16 %1 to i32
+  %sub7 = sub nsw i32 %sub, %conv6
+  %conv8 = sext i32 %sub7 to i64
+  %2 = load i64, i64* @var_24, align 8
+  %3 = load i64, i64* @var_310, align 8
+  %conv9 = sext i8 %0 to i64
+  %mul = mul i64 %3, %conv9
+  %add10 = add i64 %mul, %2
+  %mul11 = mul i64 %add10, %conv8
+  %sub12 = add i64 %mul11, 8662905354777116273
+  %shl = shl i64 9, %sub12
+  store i64 %shl, i64* @var_50, align 8
+  %bf.clear14 = and i32 %bf.load, 4194303
+  %add21 = shl nuw nsw i32 %bf.clear14, 1
+  %sub23 = sub nsw i32 %add21, %conv5
+  %sub25 = sub nsw i32 %sub23, %conv6
+  %cmp = icmp sge i32 %sub25, %bf.clear14
+  %conv30 = zext i1 %cmp to i8
+  store i8 %conv30, i8* @var_205, align 1
+  %mul43 = mul nsw i32 %sub, %conv5
+  %conv44 = trunc i32 %mul43 to i8
+  store i8 %conv44, i8* @var_218, align 1
+  ret void
+}
diff --git a/test/CodeGen/X86/pr32340.ll b/test/CodeGen/X86/pr32340.ll
new file mode 100644
index 0000000000000000000000000000000000000000..cd9b5af1dc56502397bb9d23900b1208d4237433
--- /dev/null
+++ b/test/CodeGen/X86/pr32340.ll
@@ -0,0 +1,77 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -o - %s | FileCheck %s -check-prefix=X64
+
+@var_825 = external global i16, align 2
+@var_32 = external global i16, align 2
+@var_901 = external global i16, align 2
+@var_826 = external global i64, align 8
+@var_57 = external global i64, align 8
+@var_900 = external global i16, align 2
+@var_28 = external constant i64, align 8
+@var_827 = external global i16, align 2
+
+define void @foo() {
+; X64-LABEL: foo:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    movw $0, {{.*}}(%rip)
+; X64-NEXT:    movzwl {{.*}}(%rip), %eax
+; X64-NEXT:    movw %ax, %cx
+; X64-NEXT:    movw {{.*}}(%rip), %dx
+; X64-NEXT:    xorw %dx, %cx
+; X64-NEXT:    # implicit-def: %ESI
+; X64-NEXT:    movw %cx, %si
+; X64-NEXT:    movl %eax, %edi
+; X64-NEXT:    xorl %esi, %edi
+; X64-NEXT:    movw %di, %cx
+; X64-NEXT:    movzwl %cx, %esi
+; X64-NEXT:    movl %esi, %edi
+; X64-NEXT:    addl %eax, %edi
+; X64-NEXT:    movl %edi, %r8d
+; X64-NEXT:    movq %r8, {{.*}}(%rip)
+; X64-NEXT:    xorl $-772157262, %esi # imm = 0xD1F9D0B2
+; X64-NEXT:    movl {{.*}}(%rip), %eax
+; X64-NEXT:    movl %esi, %edi
+; X64-NEXT:    orl %eax, %edi
+; X64-NEXT:    orl %edi, %esi
+; X64-NEXT:    movw %si, %cx
+; X64-NEXT:    movw %cx, {{.*}}(%rip)
+; X64-NEXT:    movq {{.*}}(%rip), %r8
+; X64-NEXT:    testq %r8, %r8
+; X64-NEXT:    setne %r9b
+; X64-NEXT:    movzbl %r9b, %eax
+; X64-NEXT:    movw %ax, %cx
+; X64-NEXT:    movw %cx, var_827
+; X64-NEXT:    retq
+entry:
+  store i16 0, i16* @var_825, align 2
+  %v0 = load i16, i16* @var_32, align 2
+  %conv = zext i16 %v0 to i32
+  %v2 = load i16, i16* @var_901, align 2
+  %conv2 = zext i16 %v2 to i32
+  %xor = xor i32 %conv, %conv2
+  %xor3 = xor i32 %conv, %xor
+  %add = add nsw i32 %xor3, %conv
+  %conv5 = sext i32 %add to i64
+  store i64 %conv5, i64* @var_826, align 8
+  %v4 = load i16, i16* @var_32, align 2
+  %conv6 = zext i16 %v4 to i64
+  %v6 = load i16, i16* @var_901, align 2
+  %conv8 = zext i16 %v6 to i32
+  %xor9 = xor i32 51981, %conv8
+  %conv10 = sext i32 %xor9 to i64
+  %xor11 = xor i64 -1142377792914660288, %conv10
+  %xor12 = xor i64 %conv6, %xor11
+  %neg = xor i64 %xor12, -1
+  %xor13 = xor i64 %conv6, %neg
+  %v9 = load i16, i16* @var_901, align 2
+  %v10 = load i64, i64* @var_57, align 8
+  %or = or i64 %xor13, %v10
+  %or23 = or i64 %xor13, %or
+  %conv24 = trunc i64 %or23 to i16
+  store i16 %conv24, i16* @var_900, align 2
+  %v11 = load i64, i64* @var_28, align 8
+  %cmp = icmp ne i64 0, %v11
+  %conv25 = zext i1 %cmp to i16
+  store i16 %conv25, i16* @var_827, align 2
+  ret void
+}
diff --git a/test/CodeGen/X86/pr32345.ll b/test/CodeGen/X86/pr32345.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e9182698dd90067778998244ad78530e7dcd4b25
--- /dev/null
+++ b/test/CodeGen/X86/pr32345.ll
@@ -0,0 +1,169 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -o - %s | FileCheck %s -check-prefix=X640
+; RUN: llc -O0 -mtriple=i686-unknown             -o - %s | FileCheck %s -check-prefix=6860
+; RUN: llc     -mtriple=x86_64-unknown-linux-gnu -o - %s | FileCheck %s -check-prefix=X64
+; RUN: llc     -mtriple=i686-unknown             -o - %s | FileCheck %s -check-prefix=686
+
+@var_22 = external global i16, align 2
+@var_27 = external global i16, align 2
+
+define void @foo() {
+; X640-LABEL: foo:
+; X640:       # BB#0: # %bb
+; X640-NEXT:    # implicit-def: %RAX
+; X640-NEXT:    movzwl var_22, %ecx
+; X640-NEXT:    movzwl var_27, %edx
+; X640-NEXT:    xorl %edx, %ecx
+; X640-NEXT:    movzwl var_27, %edx
+; X640-NEXT:    xorl %edx, %ecx
+; X640-NEXT:    movslq %ecx, %rsi
+; X640-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
+; X640-NEXT:    movzwl var_22, %ecx
+; X640-NEXT:    movzwl var_27, %edx
+; X640-NEXT:    xorl %edx, %ecx
+; X640-NEXT:    movzwl var_27, %edx
+; X640-NEXT:    xorl %edx, %ecx
+; X640-NEXT:    movslq %ecx, %rsi
+; X640-NEXT:    movzwl var_27, %ecx
+; X640-NEXT:    subl $16610, %ecx # imm = 0x40E2
+; X640-NEXT:    movl %ecx, %ecx
+; X640-NEXT:    # kill: %RCX<def> %ECX<kill>
+; X640-NEXT:    # kill: %CL<def> %RCX<kill>
+; X640-NEXT:    sarq %cl, %rsi
+; X640-NEXT:    movb %sil, %cl
+; X640-NEXT:    movb %cl, (%rax)
+; X640-NEXT:    retq
+;
+; 6860-LABEL: foo:
+; 6860:       # BB#0: # %bb
+; 6860-NEXT:    pushl %ebp
+; 6860-NEXT:  .Lcfi0:
+; 6860-NEXT:    .cfi_def_cfa_offset 8
+; 6860-NEXT:  .Lcfi1:
+; 6860-NEXT:    .cfi_offset %ebp, -8
+; 6860-NEXT:    movl %esp, %ebp
+; 6860-NEXT:  .Lcfi2:
+; 6860-NEXT:    .cfi_def_cfa_register %ebp
+; 6860-NEXT:    pushl %ebx
+; 6860-NEXT:    pushl %edi
+; 6860-NEXT:    pushl %esi
+; 6860-NEXT:    andl $-8, %esp
+; 6860-NEXT:    subl $32, %esp
+; 6860-NEXT:  .Lcfi3:
+; 6860-NEXT:    .cfi_offset %esi, -20
+; 6860-NEXT:  .Lcfi4:
+; 6860-NEXT:    .cfi_offset %edi, -16
+; 6860-NEXT:  .Lcfi5:
+; 6860-NEXT:    .cfi_offset %ebx, -12
+; 6860-NEXT:    # implicit-def: %EAX
+; 6860-NEXT:    movw var_22, %cx
+; 6860-NEXT:    movzwl var_27, %edx
+; 6860-NEXT:    movw %dx, %si
+; 6860-NEXT:    xorw %si, %cx
+; 6860-NEXT:    # implicit-def: %EDI
+; 6860-NEXT:    movw %cx, %di
+; 6860-NEXT:    xorl %edx, %edi
+; 6860-NEXT:    movw %di, %cx
+; 6860-NEXT:    movzwl %cx, %edi
+; 6860-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; 6860-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; 6860-NEXT:    addl $-16610, %edx # imm = 0xBF1E
+; 6860-NEXT:    movb %dl, %bl
+; 6860-NEXT:    xorl %edx, %edx
+; 6860-NEXT:    movb %bl, %cl
+; 6860-NEXT:    shrdl %cl, %edx, %edi
+; 6860-NEXT:    testb $32, %bl
+; 6860-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; 6860-NEXT:    movl %edi, {{[0-9]+}}(%esp) # 4-byte Spill
+; 6860-NEXT:    movl %edx, {{[0-9]+}}(%esp) # 4-byte Spill
+; 6860-NEXT:    jne .LBB0_2
+; 6860-NEXT:  # BB#1: # %bb
+; 6860-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; 6860-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; 6860-NEXT:  .LBB0_2: # %bb
+; 6860-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; 6860-NEXT:    movb %al, %cl
+; 6860-NEXT:    movl {{[0-9]+}}(%esp), %eax # 4-byte Reload
+; 6860-NEXT:    movb %cl, (%eax)
+; 6860-NEXT:    leal -12(%ebp), %esp
+; 6860-NEXT:    popl %esi
+; 6860-NEXT:    popl %edi
+; 6860-NEXT:    popl %ebx
+; 6860-NEXT:    popl %ebp
+; 6860-NEXT:    retl
+;
+; X64-LABEL: foo:
+; X64:       # BB#0: # %bb
+; X64-NEXT:    movzwl {{.*}}(%rip), %ecx
+; X64-NEXT:    movw {{.*}}(%rip), %ax
+; X64-NEXT:    xorw %cx, %ax
+; X64-NEXT:    xorl %ecx, %eax
+; X64-NEXT:    movzwl %ax, %eax
+; X64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    addl $-16610, %ecx # imm = 0xBF1E
+; X64-NEXT:    # kill: %CL<def> %CL<kill> %ECX<kill>
+; X64-NEXT:    shrq %cl, %rax
+; X64-NEXT:    movb %al, (%rax)
+; X64-NEXT:    retq
+;
+; 686-LABEL: foo:
+; 686:       # BB#0: # %bb
+; 686-NEXT:    pushl %ebp
+; 686-NEXT:  .Lcfi0:
+; 686-NEXT:    .cfi_def_cfa_offset 8
+; 686-NEXT:  .Lcfi1:
+; 686-NEXT:    .cfi_offset %ebp, -8
+; 686-NEXT:    movl %esp, %ebp
+; 686-NEXT:  .Lcfi2:
+; 686-NEXT:    .cfi_def_cfa_register %ebp
+; 686-NEXT:    andl $-8, %esp
+; 686-NEXT:    subl $8, %esp
+; 686-NEXT:    movzwl var_27, %ecx
+; 686-NEXT:    movw var_22, %ax
+; 686-NEXT:    xorw %cx, %ax
+; 686-NEXT:    xorl %ecx, %eax
+; 686-NEXT:    movzwl %ax, %eax
+; 686-NEXT:    movl %eax, (%esp)
+; 686-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; 686-NEXT:    addl $-16610, %ecx # imm = 0xBF1E
+; 686-NEXT:    xorl %edx, %edx
+; 686-NEXT:    shrdl %cl, %edx, %eax
+; 686-NEXT:    testb $32, %cl
+; 686-NEXT:    jne .LBB0_2
+; 686-NEXT:  # BB#1: # %bb
+; 686-NEXT:    movl %eax, %edx
+; 686-NEXT:  .LBB0_2: # %bb
+; 686-NEXT:    movb %dl, (%eax)
+; 686-NEXT:    movl %ebp, %esp
+; 686-NEXT:    popl %ebp
+; 686-NEXT:    retl
+bb:
+  %tmp = alloca i64, align 8
+  %tmp1 = load i16, i16* @var_22, align 2
+  %tmp2 = zext i16 %tmp1 to i32
+  %tmp3 = load i16, i16* @var_27, align 2
+  %tmp4 = zext i16 %tmp3 to i32
+  %tmp5 = xor i32 %tmp2, %tmp4
+  %tmp6 = load i16, i16* @var_27, align 2
+  %tmp7 = zext i16 %tmp6 to i32
+  %tmp8 = xor i32 %tmp5, %tmp7
+  %tmp9 = sext i32 %tmp8 to i64
+  store i64 %tmp9, i64* %tmp, align 8
+  %tmp10 = load i16, i16* @var_22, align 2
+  %tmp11 = zext i16 %tmp10 to i32
+  %tmp12 = load i16, i16* @var_27, align 2
+  %tmp13 = zext i16 %tmp12 to i32
+  %tmp14 = xor i32 %tmp11, %tmp13
+  %tmp15 = load i16, i16* @var_27, align 2
+  %tmp16 = zext i16 %tmp15 to i32
+  %tmp17 = xor i32 %tmp14, %tmp16
+  %tmp18 = sext i32 %tmp17 to i64
+  %tmp19 = load i16, i16* @var_27, align 2
+  %tmp20 = zext i16 %tmp19 to i32
+  %tmp21 = sub nsw i32 %tmp20, 16610
+  %tmp22 = zext i32 %tmp21 to i64
+  %tmp23 = ashr i64 %tmp18, %tmp22
+  %tmp24 = trunc i64 %tmp23 to i8
+  store i8 %tmp24, i8* undef, align 1
+  ret void
+}
diff --git a/test/CodeGen/X86/pr32420.ll b/test/CodeGen/X86/pr32420.ll
new file mode 100644
index 0000000000000000000000000000000000000000..bf3a4720c080af8fbc3029d5ff37f82d07308f46
--- /dev/null
+++ b/test/CodeGen/X86/pr32420.ll
@@ -0,0 +1,36 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12.0"
+
+@a = common local_unnamed_addr global i16 0, align 4
+@b = common local_unnamed_addr global i16 0, align 4
+
+define i32 @PR32420() {
+; CHECK-LABEL: PR32420:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movq _a@{{.*}}(%rip), %rax
+; CHECK-NEXT:    movzwl (%rax), %eax
+; CHECK-NEXT:    movl %eax, %ecx
+; CHECK-NEXT:    shll $12, %ecx
+; CHECK-NEXT:    sarw $12, %cx
+; CHECK-NEXT:    movq _b@{{.*}}(%rip), %rdx
+; CHECK-NEXT:    movw %cx, %si
+; CHECK-NEXT:    orw (%rdx), %si
+; CHECK-NEXT:    andl %ecx, %esi
+; CHECK-NEXT:    movw %si, (%rdx)
+; CHECK-NEXT:    retq
+  %load2 = load i16, i16* @a, align 4
+  %shl3 = shl i16 %load2, 12
+  %ashr4 = ashr i16 %shl3, 12
+  %t2 = load volatile i16, i16* @b, align 4
+  %conv8 = or i16 %t2, %ashr4
+  %load9 = load i16, i16* @a, align 4
+  %shl10 = shl i16 %load9, 12
+  %ashr11 = ashr i16 %shl10, 12
+  %and = and i16 %conv8, %ashr11
+  store i16 %and, i16* @b, align 4
+  %cast1629 = zext i16 %load2 to i32
+  ret i32 %cast1629
+}
diff --git a/test/CodeGen/X86/pr32451.ll b/test/CodeGen/X86/pr32451.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d980b7ff284cfc18970168d278267aa694d2e0b1
--- /dev/null
+++ b/test/CodeGen/X86/pr32451.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -fast-isel -mtriple=i686-unknown-unknown -O0 -mcpu=knl | FileCheck %s
+
+; ModuleID = 'convert'
+source_filename = "convert"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i8** @japi1_convert_690(i8**, i8***, i32) {
+; CHECK-LABEL: japi1_convert_690:
+; CHECK:       # BB#0: # %top
+; CHECK-NEXT:    pushl %ebx
+; CHECK-NEXT:  .Lcfi0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    subl $16, %esp
+; CHECK-NEXT:  .Lcfi1:
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:  .Lcfi2:
+; CHECK-NEXT:    .cfi_offset %ebx, -8
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; CHECK-NEXT:    calll julia.gc_root_decl
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; CHECK-NEXT:    calll jl_get_ptls_states
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl 4(%ecx), %edx
+; CHECK-NEXT:    movb (%edx), %bl
+; CHECK-NEXT:    # implicit-def: %EDX
+; CHECK-NEXT:    movb %bl, %dl
+; CHECK-NEXT:    andl $1, %edx
+; CHECK-NEXT:    kmovw %edx, %k0
+; CHECK-NEXT:    kmovw %k0, %edx
+; CHECK-NEXT:    movb %dl, %bl
+; CHECK-NEXT:    andb $1, %bl
+; CHECK-NEXT:    movzbl %bl, %edx
+; CHECK-NEXT:    movl %edx, (%esp)
+; CHECK-NEXT:    movl %eax, {{[0-9]+}}(%esp) # 4-byte Spill
+; CHECK-NEXT:    calll jl_box_int32
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx # 4-byte Reload
+; CHECK-NEXT:    movl %eax, (%ecx)
+; CHECK-NEXT:    addl $16, %esp
+; CHECK-NEXT:    popl %ebx
+; CHECK-NEXT:    retl
+top:
+  %3 = alloca i8***
+  store volatile i8*** %1, i8**** %3
+  %4 = call i8*** @julia.gc_root_decl()
+  %5 = call i8**** @jl_get_ptls_states()
+  %6 = bitcast i8**** %5 to i8***
+  %7 = getelementptr i8**, i8*** %6, i64 3
+  %8 = bitcast i8*** %7 to i64**
+  %9 = load i64*, i64** %8
+  %10 = getelementptr i8**, i8*** %1, i64 1
+  %11 = load i8**, i8*** %10
+  %12 = bitcast i8** %11 to i8*
+  %13 = load i8, i8* %12
+  %14 = trunc i8 %13 to i1
+  %15 = zext i1 %14 to i8
+  %16 = zext i8 %15 to i32
+  %17 = call i8** @jl_box_int32(i32 signext %16)
+  store i8** %17, i8*** %4
+  ret i8** %17
+}
+
+declare i8**** @jl_get_ptls_states()
+
+declare i8** @jl_box_int32(i32)
+
+declare i8*** @julia.gc_root_decl()
diff --git a/test/CodeGen/X86/pr32484.ll b/test/CodeGen/X86/pr32484.ll
new file mode 100644
index 0000000000000000000000000000000000000000..74857f8d006641412fef77712582608496062c21
--- /dev/null
+++ b/test/CodeGen/X86/pr32484.ll
@@ -0,0 +1,32 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define void @foo() {
+; CHECK-LABEL: foo:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    # implicit-def: %RAX
+; CHECK-NEXT:    jmpq *%rax
+; CHECK-NEXT:  .LBB0_1:
+; CHECK-NEXT:    # implicit-def: %RAX
+; CHECK-NEXT:    xorps %xmm0, %xmm0
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT:    movdqu %xmm1, (%rax)
+; CHECK-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    retq
+  indirectbr i8* undef, [label %9, label %1]
+
+; <label>:1:                                      ; preds = %0
+  %2 = shufflevector <16 x i8> zeroinitializer, <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  %3 = shufflevector <16 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> zeroinitializer, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+  %4 = or <16 x i8> %3, %2
+  %5 = shufflevector <16 x i8> %4, <16 x i8> undef, <16 x i32> <i32 8, i32 5, i32 1, i32 13, i32 15, i32 10, i32 14, i32 0, i32 3, i32 2, i32 7, i32 4, i32 6, i32 9, i32 11, i32 12>
+  %6 = bitcast <16 x i8> %5 to <2 x i64>
+  %7 = xor <2 x i64> %6, zeroinitializer
+  %8 = xor <2 x i64> %7, <i64 -1, i64 -1>
+  store <2 x i64> %8, <2 x i64>* undef, align 1
+  unreachable
+
+; <label>:9:                                      ; preds = %0
+  ret void
+}
diff --git a/test/CodeGen/X86/pr32588.ll b/test/CodeGen/X86/pr32588.ll
new file mode 100644
index 0000000000000000000000000000000000000000..eee1d651c3e8283d8564e34b6494d9a4f85dfbc3
--- /dev/null
+++ b/test/CodeGen/X86/pr32588.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux | FileCheck %s
+
+@c = external local_unnamed_addr global i32, align 4
+@b = external local_unnamed_addr global i32, align 4
+@d = external local_unnamed_addr global i32, align 4
+
+; CHECK: cmpl    $1, c(%rip)
+; CHECK-NEXT: sbbl    %eax, %eax
+; CHECK-NEXT: andl    $1, %eax
+; CHECK-NEXT: movl    %eax, d(%rip)
+; CHECK-NEXT: retq
+
+define void @fn1() {
+entry:
+  %0 = load i32, i32* @c, align 4
+  %tobool1 = icmp eq i32 %0, 0
+  %xor = zext i1 %tobool1 to i32
+  %1 = load i32, i32* @b, align 4
+  %tobool2 = icmp ne i32 %1, 0
+  %tobool4 = icmp ne i32 undef, 0
+  %2 = and i1 %tobool4, %tobool2
+  %sub = sext i1 %2 to i32
+  %div = sdiv i32 %sub, 2
+  %add = add nsw i32 %div, %xor
+  store i32 %add, i32* @d, align 4
+  ret void
+}
diff --git a/test/CodeGen/X86/prefixdata.ll b/test/CodeGen/X86/prefixdata.ll
index 9bb54a2a39776c779311ada95dee34064293582f..b62f48ddce27dc1ce7916f0ae92257400cdb2d52 100644
--- a/test/CodeGen/X86/prefixdata.ll
+++ b/test/CodeGen/X86/prefixdata.ll
@@ -1,18 +1,29 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck --check-prefix=MACHO %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck --check-prefix=ELF %s
 
 @i = linkonce_odr global i32 1
 
-; CHECK: .type f,@function
-; CHECK-NEXT: .long	1
-; CHECK-NEXT: # 0x1
-; CHECK-NEXT: f:
+; MACHO: ltmp0:
+; MACHO-NEXT: .long 1
+; MACHO-NEXT: .alt_entry _f
+; MACHO-NEXT: _f:
+; ELF: .type f,@function
+; ELF-NEXT: .long	1
+; ELF-NEXT: # 0x1
+; ELF-NEXT: f:
 define void @f() prefix i32 1 {
   ret void
 }
 
-; CHECK: .type g,@function
-; CHECK-NEXT: .quad	i
-; CHECK-NEXT: g:
+; MACHO: ltmp1:
+; MACHO-NEXT: .quad _i
+; MACHO-NEXT: .alt_entry _g
+; MACHO-NEXT: _g:
+; ELF: .type g,@function
+; ELF-NEXT: .quad	i
+; ELF-NEXT: g:
 define void @g() prefix i32* @i {
   ret void
 }
+
+; MACHO: .subsections_via_symbols
diff --git a/test/CodeGen/X86/promote-vec3.ll b/test/CodeGen/X86/promote-vec3.ll
index 7a496714622ad527cf5857a3e5c698631dc7ce9f..42aeeb14739dccf9993b6ddaab2652aa38e15cae 100644
--- a/test/CodeGen/X86/promote-vec3.ll
+++ b/test/CodeGen/X86/promote-vec3.ll
@@ -9,17 +9,16 @@ define <3 x i16> @zext_i8(<3 x i8>) {
 ; SSE3-LABEL: zext_i8:
 ; SSE3:       # BB#0:
 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; SSE3-NEXT:    pxor %xmm0, %xmm0
-; SSE3-NEXT:    pxor %xmm1, %xmm1
-; SSE3-NEXT:    pinsrw $0, %eax, %xmm1
+; SSE3-NEXT:    movd %eax, %xmm0
 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; SSE3-NEXT:    pinsrw $1, %eax, %xmm1
+; SSE3-NEXT:    pinsrw $1, %eax, %xmm0
 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; SSE3-NEXT:    pinsrw $2, %eax, %xmm1
-; SSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE3-NEXT:    movd %xmm1, %eax
-; SSE3-NEXT:    pextrw $2, %xmm1, %edx
-; SSE3-NEXT:    pextrw $4, %xmm1, %ecx
+; SSE3-NEXT:    pinsrw $2, %eax, %xmm0
+; SSE3-NEXT:    pxor %xmm1, %xmm1
+; SSE3-NEXT:    pextrw $1, %xmm0, %edx
+; SSE3-NEXT:    pextrw $2, %xmm0, %ecx
+; SSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE3-NEXT:    movd %xmm0, %eax
 ; SSE3-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; SSE3-NEXT:    # kill: %DX<def> %DX<kill> %EDX<kill>
 ; SSE3-NEXT:    # kill: %CX<def> %CX<kill> %ECX<kill>
@@ -74,7 +73,7 @@ define <3 x i16> @sext_i8(<3 x i8>) {
 ; SSE3-LABEL: sext_i8:
 ; SSE3:       # BB#0:
 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; SSE3-NEXT:    pinsrw $0, %eax, %xmm0
+; SSE3-NEXT:    movd %eax, %xmm0
 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; SSE3-NEXT:    pinsrw $1, %eax, %xmm0
 ; SSE3-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
@@ -93,7 +92,7 @@ define <3 x i16> @sext_i8(<3 x i8>) {
 ;
 ; SSE41-LABEL: sext_i8:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pinsrb $0, {{[0-9]+}}(%esp), %xmm0
+; SSE41-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE41-NEXT:    pinsrb $4, {{[0-9]+}}(%esp), %xmm0
 ; SSE41-NEXT:    pinsrb $8, {{[0-9]+}}(%esp), %xmm0
 ; SSE41-NEXT:    pslld $24, %xmm0
@@ -108,7 +107,7 @@ define <3 x i16> @sext_i8(<3 x i8>) {
 ;
 ; AVX-32-LABEL: sext_i8:
 ; AVX-32:       # BB#0:
-; AVX-32-NEXT:    vpinsrb $0, {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX-32-NEXT:    vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-32-NEXT:    vpinsrb $4, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX-32-NEXT:    vpinsrb $8, {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; AVX-32-NEXT:    vpslld $24, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/psubus.ll b/test/CodeGen/X86/psubus.ll
index 9a11eff5331df228f7444d3814677fd7033c0321..35f96eda35e173110aa70c567f587cd7fcaa5de7 100644
--- a/test/CodeGen/X86/psubus.ll
+++ b/test/CodeGen/X86/psubus.ll
@@ -213,7 +213,7 @@ define void @test7(i16* nocapture %head) nounwind {
 ;
 ; AVX1-LABEL: test7:
 ; AVX1:       ## BB#0: ## %vector.ph
-; AVX1-NEXT:    vmovups (%rdi), %ymm0
+; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm2, %xmm1
@@ -257,13 +257,13 @@ define void @test8(i16* nocapture %head) nounwind {
 ;
 ; AVX1-LABEL: test8:
 ; AVX1:       ## BB#0: ## %vector.ph
-; AVX1-NEXT:    vmovups (%rdi), %ymm0
+; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
-; AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [32768,32768,32768,32768,32768,32768,32768,32768]
+; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [65534,65534,65534,65534,65534,65534,65534,65534]
 ; AVX1-NEXT:    vpcmpgtw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT:    vpcmpgtw %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [32769,32769,32769,32769,32769,32769,32769,32769]
@@ -310,7 +310,7 @@ define void @test9(i16* nocapture %head, i16 zeroext %w) nounwind {
 ;
 ; AVX1-LABEL: test9:
 ; AVX1:       ## BB#0: ## %vector.ph
-; AVX1-NEXT:    vmovups (%rdi), %ymm0
+; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vmovd %esi, %xmm2
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
@@ -364,7 +364,7 @@ define void @test10(i8* nocapture %head) nounwind {
 ;
 ; AVX1-LABEL: test10:
 ; AVX1:       ## BB#0: ## %vector.ph
-; AVX1-NEXT:    vmovups (%rdi), %ymm0
+; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm1
@@ -409,13 +409,13 @@ define void @test11(i8* nocapture %head) nounwind {
 ;
 ; AVX1-LABEL: test11:
 ; AVX1:       ## BB#0: ## %vector.ph
-; AVX1-NEXT:    vmovups (%rdi), %ymm0
+; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
-; AVX1-NEXT:    vxorps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; AVX1-NEXT:    vpxor %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [254,254,254,254,254,254,254,254,254,254,254,254,254,254,254,254]
 ; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm3, %xmm3
-; AVX1-NEXT:    vxorps %xmm2, %xmm0, %xmm2
+; AVX1-NEXT:    vpxor %xmm2, %xmm0, %xmm2
 ; AVX1-NEXT:    vpcmpgtb %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [129,129,129,129,129,129,129,129,129,129,129,129,129,129,129,129]
@@ -475,7 +475,7 @@ define void @test12(i8* nocapture %head, i8 zeroext %w) nounwind {
 ;
 ; AVX1-LABEL: test12:
 ; AVX1:       ## BB#0: ## %vector.ph
-; AVX1-NEXT:    vmovups (%rdi), %ymm0
+; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
 ; AVX1-NEXT:    vmovd %esi, %xmm1
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
@@ -542,8 +542,6 @@ define void @test13(i16* nocapture %head, i32* nocapture %w) nounwind {
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; SSE2-NEXT:    psllw $15, %xmm4
-; SSE2-NEXT:    psraw $15, %xmm4
 ; SSE2-NEXT:    psubd %xmm2, %xmm1
 ; SSE2-NEXT:    pslld $16, %xmm0
 ; SSE2-NEXT:    psrad $16, %xmm0
@@ -577,8 +575,6 @@ define void @test13(i16* nocapture %head, i32* nocapture %w) nounwind {
 ; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm6
 ; SSSE3-NEXT:    pshufb %xmm5, %xmm6
 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm3[0]
-; SSSE3-NEXT:    psllw $15, %xmm6
-; SSSE3-NEXT:    psraw $15, %xmm6
 ; SSSE3-NEXT:    psubd %xmm2, %xmm1
 ; SSSE3-NEXT:    pshufb %xmm5, %xmm0
 ; SSSE3-NEXT:    pshufb %xmm5, %xmm1
@@ -589,7 +585,7 @@ define void @test13(i16* nocapture %head, i32* nocapture %w) nounwind {
 ;
 ; AVX1-LABEL: test13:
 ; AVX1:       ## BB#0: ## %vector.ph
-; AVX1-NEXT:    vmovups (%rsi), %ymm0
+; AVX1-NEXT:    vmovdqu (%rsi), %ymm0
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
@@ -598,7 +594,7 @@ define void @test13(i16* nocapture %head, i32* nocapture %w) nounwind {
 ; AVX1-NEXT:    vpxor %xmm3, %xmm5, %xmm6
 ; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm6, %xmm4
 ; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm6
-; AVX1-NEXT:    vxorps %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm3, %xmm3
 ; AVX1-NEXT:    vpacksswb %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
@@ -623,7 +619,7 @@ define void @test13(i16* nocapture %head, i32* nocapture %w) nounwind {
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX2-NEXT:    vpacksswb %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpandn %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
@@ -649,150 +645,123 @@ define void @test14(i8* nocapture %head, i32* nocapture %w) nounwind {
 ; SSE2-LABEL: test14:
 ; SSE2:       ## BB#0: ## %vector.ph
 ; SSE2-NEXT:    movdqu (%rdi), %xmm0
-; SSE2-NEXT:    movdqu (%rsi), %xmm10
-; SSE2-NEXT:    movdqu 16(%rsi), %xmm4
-; SSE2-NEXT:    movdqu 32(%rsi), %xmm8
-; SSE2-NEXT:    movdqu 48(%rsi), %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
-; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-NEXT:    movdqa %xmm3, %xmm11
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    movdqu (%rsi), %xmm8
+; SSE2-NEXT:    movdqu 16(%rsi), %xmm9
+; SSE2-NEXT:    movdqu 32(%rsi), %xmm10
+; SSE2-NEXT:    movdqu 48(%rsi), %xmm7
+; SSE2-NEXT:    pxor %xmm3, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSE2-NEXT:    movdqa %xmm0, %xmm7
-; SSE2-NEXT:    psubd %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm6, %xmm4
-; SSE2-NEXT:    pxor %xmm6, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm7, %xmm4
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT:    movdqa %xmm5, %xmm7
-; SSE2-NEXT:    psubd %xmm10, %xmm5
-; SSE2-NEXT:    pxor %xmm6, %xmm10
-; SSE2-NEXT:    pxor %xmm6, %xmm7
-; SSE2-NEXT:    pcmpgtd %xmm7, %xmm10
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm7 = xmm10[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm7 = xmm7[0],xmm4[0]
-; SSE2-NEXT:    psllw $15, %xmm7
-; SSE2-NEXT:    psraw $15, %xmm7
-; SSE2-NEXT:    movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255]
-; SSE2-NEXT:    pand %xmm10, %xmm7
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    psubd %xmm9, %xmm3
-; SSE2-NEXT:    pxor %xmm6, %xmm9
-; SSE2-NEXT:    pxor %xmm6, %xmm2
-; SSE2-NEXT:    pcmpgtd %xmm2, %xmm9
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm2 = xmm9[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3]
+; SSE2-NEXT:    psubd %xmm7, %xmm0
+; SSE2-NEXT:    pxor %xmm3, %xmm7
+; SSE2-NEXT:    pxor %xmm3, %xmm5
+; SSE2-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSE2-NEXT:    movdqa {{.*#+}} xmm5 = [255,255,255,255]
+; SSE2-NEXT:    pand %xmm5, %xmm7
+; SSE2-NEXT:    movdqa %xmm6, %xmm4
+; SSE2-NEXT:    psubd %xmm10, %xmm6
+; SSE2-NEXT:    pxor %xmm3, %xmm10
+; SSE2-NEXT:    pxor %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm10
+; SSE2-NEXT:    pand %xmm5, %xmm10
+; SSE2-NEXT:    packuswb %xmm7, %xmm10
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    psubd %xmm9, %xmm1
+; SSE2-NEXT:    pxor %xmm3, %xmm9
+; SSE2-NEXT:    pxor %xmm3, %xmm4
+; SSE2-NEXT:    pcmpgtd %xmm4, %xmm9
+; SSE2-NEXT:    pand %xmm5, %xmm9
 ; SSE2-NEXT:    movdqa %xmm8, %xmm4
-; SSE2-NEXT:    pxor %xmm6, %xmm4
-; SSE2-NEXT:    pxor %xmm11, %xmm6
-; SSE2-NEXT:    pcmpgtd %xmm6, %xmm4
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm4 = xmm4[0,2,2,3,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
-; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm2[0]
-; SSE2-NEXT:    psllw $15, %xmm4
-; SSE2-NEXT:    psraw $15, %xmm4
-; SSE2-NEXT:    pand %xmm10, %xmm4
-; SSE2-NEXT:    packuswb %xmm4, %xmm7
-; SSE2-NEXT:    psllw $7, %xmm7
-; SSE2-NEXT:    pand {{.*}}(%rip), %xmm7
-; SSE2-NEXT:    pcmpgtb %xmm7, %xmm1
-; SSE2-NEXT:    psubd %xmm8, %xmm11
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE2-NEXT:    pand %xmm2, %xmm0
-; SSE2-NEXT:    pand %xmm2, %xmm5
-; SSE2-NEXT:    packuswb %xmm0, %xmm5
-; SSE2-NEXT:    pand %xmm2, %xmm3
-; SSE2-NEXT:    pand %xmm2, %xmm11
-; SSE2-NEXT:    packuswb %xmm3, %xmm11
-; SSE2-NEXT:    packuswb %xmm11, %xmm5
-; SSE2-NEXT:    pandn %xmm5, %xmm1
-; SSE2-NEXT:    movdqu %xmm1, (%rdi)
+; SSE2-NEXT:    pxor %xmm3, %xmm4
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    pcmpgtd %xmm3, %xmm4
+; SSE2-NEXT:    pand %xmm5, %xmm4
+; SSE2-NEXT:    packuswb %xmm9, %xmm4
+; SSE2-NEXT:    packuswb %xmm10, %xmm4
+; SSE2-NEXT:    psubd %xmm8, %xmm2
+; SSE2-NEXT:    pand %xmm5, %xmm0
+; SSE2-NEXT:    pand %xmm5, %xmm6
+; SSE2-NEXT:    packuswb %xmm0, %xmm6
+; SSE2-NEXT:    pand %xmm5, %xmm1
+; SSE2-NEXT:    pand %xmm5, %xmm2
+; SSE2-NEXT:    packuswb %xmm1, %xmm2
+; SSE2-NEXT:    packuswb %xmm6, %xmm2
+; SSE2-NEXT:    pandn %xmm2, %xmm4
+; SSE2-NEXT:    movdqu %xmm4, (%rdi)
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: test14:
 ; SSSE3:       ## BB#0: ## %vector.ph
-; SSSE3-NEXT:    movdqu (%rdi), %xmm7
-; SSSE3-NEXT:    movdqu (%rsi), %xmm10
-; SSSE3-NEXT:    movdqu 16(%rsi), %xmm4
-; SSSE3-NEXT:    movdqu 32(%rsi), %xmm8
-; SSSE3-NEXT:    movdqu 48(%rsi), %xmm9
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm2 = xmm7[2,3,0,1]
-; SSSE3-NEXT:    pxor %xmm1, %xmm1
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT:    movdqa %xmm2, %xmm3
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3],xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
-; SSSE3-NEXT:    movdqa %xmm7, %xmm0
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm1[4],xmm7[5],xmm1[5],xmm7[6],xmm1[6],xmm7[7],xmm1[7]
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm6 = [2147483648,2147483648,2147483648,2147483648]
-; SSSE3-NEXT:    movdqa %xmm7, %xmm5
-; SSSE3-NEXT:    psubd %xmm4, %xmm7
-; SSSE3-NEXT:    pxor %xmm6, %xmm4
-; SSSE3-NEXT:    pxor %xmm6, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm4
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm11 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSSE3-NEXT:    pshufb %xmm11, %xmm4
+; SSSE3-NEXT:    movdqu (%rdi), %xmm0
+; SSSE3-NEXT:    movdqu (%rsi), %xmm8
+; SSSE3-NEXT:    movdqu 16(%rsi), %xmm9
+; SSSE3-NEXT:    movdqu 32(%rsi), %xmm10
+; SSSE3-NEXT:    movdqu 48(%rsi), %xmm7
+; SSSE3-NEXT:    pxor %xmm3, %xmm3
+; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm6
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm5
-; SSSE3-NEXT:    psubd %xmm10, %xmm0
-; SSSE3-NEXT:    pxor %xmm6, %xmm10
-; SSSE3-NEXT:    pxor %xmm6, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm10
-; SSSE3-NEXT:    pshufb %xmm11, %xmm10
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm4[0]
-; SSSE3-NEXT:    psllw $15, %xmm10
-; SSSE3-NEXT:    psraw $15, %xmm10
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSSE3-NEXT:    pshufb %xmm4, %xmm10
-; SSSE3-NEXT:    movdqa %xmm2, %xmm5
-; SSSE3-NEXT:    psubd %xmm9, %xmm2
-; SSSE3-NEXT:    pxor %xmm6, %xmm9
-; SSSE3-NEXT:    pxor %xmm6, %xmm5
-; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm9
-; SSSE3-NEXT:    pshufb %xmm11, %xmm9
+; SSSE3-NEXT:    psubd %xmm7, %xmm0
+; SSSE3-NEXT:    pxor %xmm3, %xmm7
+; SSSE3-NEXT:    pxor %xmm3, %xmm5
+; SSSE3-NEXT:    pcmpgtd %xmm5, %xmm7
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm5 = <u,u,u,u,0,4,8,12,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT:    pshufb %xmm5, %xmm7
+; SSSE3-NEXT:    movdqa %xmm6, %xmm4
+; SSSE3-NEXT:    psubd %xmm10, %xmm6
+; SSSE3-NEXT:    pxor %xmm3, %xmm10
+; SSSE3-NEXT:    pxor %xmm3, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm10
+; SSSE3-NEXT:    pshufb %xmm5, %xmm10
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm10 = xmm10[0],xmm7[0],xmm10[1],xmm7[1]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    psubd %xmm9, %xmm1
+; SSSE3-NEXT:    pxor %xmm3, %xmm9
+; SSSE3-NEXT:    pxor %xmm3, %xmm4
+; SSSE3-NEXT:    pcmpgtd %xmm4, %xmm9
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = <0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSSE3-NEXT:    pshufb %xmm4, %xmm9
 ; SSSE3-NEXT:    movdqa %xmm8, %xmm5
-; SSSE3-NEXT:    pxor %xmm6, %xmm5
-; SSSE3-NEXT:    pxor %xmm3, %xmm6
-; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm5
-; SSSE3-NEXT:    pshufb %xmm11, %xmm5
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm9[0]
-; SSSE3-NEXT:    psllw $15, %xmm5
-; SSSE3-NEXT:    psraw $15, %xmm5
+; SSSE3-NEXT:    pxor %xmm3, %xmm5
+; SSSE3-NEXT:    pxor %xmm2, %xmm3
+; SSSE3-NEXT:    pcmpgtd %xmm3, %xmm5
 ; SSSE3-NEXT:    pshufb %xmm4, %xmm5
-; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm5[0]
-; SSSE3-NEXT:    psllw $7, %xmm10
-; SSSE3-NEXT:    pand {{.*}}(%rip), %xmm10
-; SSSE3-NEXT:    pcmpgtb %xmm10, %xmm1
-; SSSE3-NEXT:    psubd %xmm8, %xmm3
-; SSSE3-NEXT:    movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSSE3-NEXT:    pand %xmm4, %xmm7
-; SSSE3-NEXT:    pand %xmm4, %xmm0
-; SSSE3-NEXT:    packuswb %xmm7, %xmm0
-; SSSE3-NEXT:    pand %xmm4, %xmm2
-; SSSE3-NEXT:    pand %xmm4, %xmm3
-; SSSE3-NEXT:    packuswb %xmm2, %xmm3
-; SSSE3-NEXT:    packuswb %xmm3, %xmm0
-; SSSE3-NEXT:    pandn %xmm0, %xmm1
-; SSSE3-NEXT:    movdqu %xmm1, (%rdi)
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm5 = xmm5[0],xmm9[0],xmm5[1],xmm9[1]
+; SSSE3-NEXT:    movsd {{.*#+}} xmm10 = xmm5[0],xmm10[1]
+; SSSE3-NEXT:    psubd %xmm8, %xmm2
+; SSSE3-NEXT:    movdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; SSSE3-NEXT:    pand %xmm3, %xmm0
+; SSSE3-NEXT:    pand %xmm3, %xmm6
+; SSSE3-NEXT:    packuswb %xmm0, %xmm6
+; SSSE3-NEXT:    pand %xmm3, %xmm1
+; SSSE3-NEXT:    pand %xmm3, %xmm2
+; SSSE3-NEXT:    packuswb %xmm1, %xmm2
+; SSSE3-NEXT:    packuswb %xmm6, %xmm2
+; SSSE3-NEXT:    andnpd %xmm2, %xmm10
+; SSSE3-NEXT:    movupd %xmm10, (%rdi)
 ; SSSE3-NEXT:    retq
 ;
 ; AVX1-LABEL: test14:
 ; AVX1:       ## BB#0: ## %vector.ph
-; AVX1-NEXT:    vmovups (%rsi), %ymm0
-; AVX1-NEXT:    vmovups 32(%rsi), %ymm1
+; AVX1-NEXT:    vmovdqu (%rsi), %ymm0
+; AVX1-NEXT:    vmovdqu 32(%rsi), %ymm1
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
@@ -803,25 +772,20 @@ define void @test14(i8* nocapture %head, i32* nocapture %w) nounwind {
 ; AVX1-NEXT:    vpxor %xmm6, %xmm2, %xmm3
 ; AVX1-NEXT:    vpcmpgtd %xmm7, %xmm3, %xmm3
 ; AVX1-NEXT:    vpxor %xmm6, %xmm10, %xmm7
-; AVX1-NEXT:    vxorps %xmm6, %xmm1, %xmm4
+; AVX1-NEXT:    vpxor %xmm6, %xmm1, %xmm4
 ; AVX1-NEXT:    vpcmpgtd %xmm7, %xmm4, %xmm4
+; AVX1-NEXT:    vpacksswb %xmm3, %xmm4, %xmm11
+; AVX1-NEXT:    vpxor %xmm6, %xmm9, %xmm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
+; AVX1-NEXT:    vpxor %xmm6, %xmm7, %xmm3
+; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT:    vpxor %xmm6, %xmm8, %xmm4
+; AVX1-NEXT:    vpxor %xmm6, %xmm0, %xmm6
+; AVX1-NEXT:    vpcmpgtd %xmm4, %xmm6, %xmm4
 ; AVX1-NEXT:    vpacksswb %xmm3, %xmm4, %xmm3
-; AVX1-NEXT:    vmovdqa {{.*#+}} xmm11 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX1-NEXT:    vpshufb %xmm11, %xmm3, %xmm12
-; AVX1-NEXT:    vpxor %xmm6, %xmm9, %xmm7
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vxorps %xmm6, %xmm4, %xmm3
-; AVX1-NEXT:    vpcmpgtd %xmm7, %xmm3, %xmm3
-; AVX1-NEXT:    vpxor %xmm6, %xmm8, %xmm7
-; AVX1-NEXT:    vxorps %xmm6, %xmm0, %xmm6
-; AVX1-NEXT:    vpcmpgtd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vpacksswb %xmm3, %xmm6, %xmm3
-; AVX1-NEXT:    vpshufb %xmm11, %xmm3, %xmm3
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm12[0]
-; AVX1-NEXT:    vpsllw $7, %xmm3, %xmm3
-; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT:    vpacksswb %xmm11, %xmm3, %xmm3
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm8, %xmm0
-; AVX1-NEXT:    vpsubd %xmm4, %xmm9, %xmm4
+; AVX1-NEXT:    vpsubd %xmm7, %xmm9, %xmm4
 ; AVX1-NEXT:    vpsubd %xmm1, %xmm10, %xmm1
 ; AVX1-NEXT:    vpsubd %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
@@ -850,26 +814,22 @@ define void @test14(i8* nocapture %head, i32* nocapture %w) nounwind {
 ; AVX2-NEXT:    vpcmpgtd %ymm5, %ymm6, %ymm5
 ; AVX2-NEXT:    vextracti128 $1, %ymm5, %xmm6
 ; AVX2-NEXT:    vpacksswb %xmm6, %xmm5, %xmm5
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; AVX2-NEXT:    vpshufb %xmm6, %xmm5, %xmm5
-; AVX2-NEXT:    vpxor %ymm4, %ymm2, %ymm7
+; AVX2-NEXT:    vpxor %ymm4, %ymm2, %ymm6
 ; AVX2-NEXT:    vpxor %ymm4, %ymm0, %ymm4
-; AVX2-NEXT:    vpcmpgtd %ymm7, %ymm4, %ymm4
-; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm7
-; AVX2-NEXT:    vpacksswb %xmm7, %xmm4, %xmm4
-; AVX2-NEXT:    vpshufb %xmm6, %xmm4, %xmm4
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; AVX2-NEXT:    vpsllw $7, %xmm4, %xmm4
-; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm4, %xmm4
+; AVX2-NEXT:    vpcmpgtd %ymm6, %ymm4, %ymm4
+; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm6
+; AVX2-NEXT:    vpacksswb %xmm6, %xmm4, %xmm4
+; AVX2-NEXT:    vpacksswb %xmm5, %xmm4, %xmm4
 ; AVX2-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpsubd %ymm1, %ymm3, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-NEXT:    vpshufb %xmm6, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
+; AVX2-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
-; AVX2-NEXT:    vpshufb %xmm6, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufb %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vpblendvb %xmm4, %xmm1, %xmm0, %xmm0
@@ -919,8 +879,6 @@ define void @test15(i16* nocapture %head, i32* nocapture %w) nounwind {
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; SSE2-NEXT:    psllw $15, %xmm4
-; SSE2-NEXT:    psraw $15, %xmm4
 ; SSE2-NEXT:    psubd %xmm2, %xmm1
 ; SSE2-NEXT:    pslld $16, %xmm0
 ; SSE2-NEXT:    psrad $16, %xmm0
@@ -954,8 +912,6 @@ define void @test15(i16* nocapture %head, i32* nocapture %w) nounwind {
 ; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm3
 ; SSSE3-NEXT:    pshufb %xmm4, %xmm3
 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; SSSE3-NEXT:    psllw $15, %xmm3
-; SSSE3-NEXT:    psraw $15, %xmm3
 ; SSSE3-NEXT:    psubd %xmm2, %xmm1
 ; SSSE3-NEXT:    pshufb %xmm4, %xmm0
 ; SSSE3-NEXT:    pshufb %xmm4, %xmm1
@@ -966,7 +922,7 @@ define void @test15(i16* nocapture %head, i32* nocapture %w) nounwind {
 ;
 ; AVX1-LABEL: test15:
 ; AVX1:       ## BB#0: ## %vector.ph
-; AVX1-NEXT:    vmovups (%rsi), %ymm0
+; AVX1-NEXT:    vmovdqu (%rsi), %ymm0
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
@@ -975,7 +931,7 @@ define void @test15(i16* nocapture %head, i32* nocapture %w) nounwind {
 ; AVX1-NEXT:    vpxor %xmm3, %xmm5, %xmm6
 ; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm4, %xmm4
 ; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm6
-; AVX1-NEXT:    vxorps %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm6, %xmm3
 ; AVX1-NEXT:    vpacksswb %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
@@ -1000,7 +956,7 @@ define void @test15(i16* nocapture %head, i32* nocapture %w) nounwind {
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX2-NEXT:    vpacksswb %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
@@ -1049,8 +1005,6 @@ define void @test16(i16* nocapture %head, i32* nocapture %w) nounwind {
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,6,6,7]
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
 ; SSE2-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm3[0]
-; SSE2-NEXT:    psllw $15, %xmm4
-; SSE2-NEXT:    psraw $15, %xmm4
 ; SSE2-NEXT:    psubd %xmm2, %xmm1
 ; SSE2-NEXT:    pslld $16, %xmm0
 ; SSE2-NEXT:    psrad $16, %xmm0
@@ -1084,8 +1038,6 @@ define void @test16(i16* nocapture %head, i32* nocapture %w) nounwind {
 ; SSSE3-NEXT:    pcmpgtd %xmm6, %xmm3
 ; SSSE3-NEXT:    pshufb %xmm4, %xmm3
 ; SSSE3-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
-; SSSE3-NEXT:    psllw $15, %xmm3
-; SSSE3-NEXT:    psraw $15, %xmm3
 ; SSSE3-NEXT:    psubd %xmm2, %xmm1
 ; SSSE3-NEXT:    pshufb %xmm4, %xmm0
 ; SSSE3-NEXT:    pshufb %xmm4, %xmm1
@@ -1096,7 +1048,7 @@ define void @test16(i16* nocapture %head, i32* nocapture %w) nounwind {
 ;
 ; AVX1-LABEL: test16:
 ; AVX1:       ## BB#0: ## %vector.ph
-; AVX1-NEXT:    vmovups (%rsi), %ymm0
+; AVX1-NEXT:    vmovdqu (%rsi), %ymm0
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
@@ -1105,7 +1057,7 @@ define void @test16(i16* nocapture %head, i32* nocapture %w) nounwind {
 ; AVX1-NEXT:    vpxor %xmm3, %xmm5, %xmm6
 ; AVX1-NEXT:    vpcmpgtd %xmm6, %xmm4, %xmm4
 ; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm6
-; AVX1-NEXT:    vxorps %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtd %xmm3, %xmm6, %xmm3
 ; AVX1-NEXT:    vpacksswb %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
@@ -1130,7 +1082,7 @@ define void @test16(i16* nocapture %head, i32* nocapture %w) nounwind {
 ; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX2-NEXT:    vpacksswb %xmm3, %xmm2, %xmm2
 ; AVX2-NEXT:    vpsubd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpand %xmm0, %xmm2, %xmm0
 ; AVX2-NEXT:    vmovdqu %xmm0, (%rdi)
diff --git a/test/CodeGen/X86/recip-fastmath.ll b/test/CodeGen/X86/recip-fastmath.ll
index 0a99254cd6253350b33d3b560c38516c9e1fddb5..5fd553b301aab7f658b80c2a98d1d87e059083df 100644
--- a/test/CodeGen/X86/recip-fastmath.ll
+++ b/test/CodeGen/X86/recip-fastmath.ll
@@ -1,6 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2     | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2     | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge| FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell    | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl        | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx        | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX
 
 ; If the target's divss/divps instructions are substantially
 ; slower than rcpss/rcpps with a Newton-Raphson refinement,
@@ -39,15 +46,66 @@ define float @f32_one_step(float %x) #1 {
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: f32_one_step:
-; AVX:       # BB#0:
-; AVX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; AVX-RECIP-LABEL: f32_one_step:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: f32_one_step:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; FMA-RECIP-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
+; FMA-RECIP-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: f32_one_step:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: f32_one_step:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: f32_one_step:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
+; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: f32_one_step:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: f32_one_step:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
+; AVX512-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
+; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
+; AVX512-NEXT:    retq
   %div = fdiv fast float 1.0, %x
   ret float %div
 }
@@ -70,19 +128,94 @@ define float @f32_two_step(float %x) #2 {
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: f32_two_step:
-; AVX:       # BB#0:
-; AVX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX-NEXT:    vsubss %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm2
-; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vsubss %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; AVX-RECIP-LABEL: f32_two_step:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm2
+; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT:    vsubss %xmm2, %xmm3, %xmm2
+; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm1, %xmm2
+; AVX-RECIP-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm3, %xmm0
+; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: f32_two_step:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; FMA-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
+; FMA-RECIP-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
+; FMA-RECIP-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
+; FMA-RECIP-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
+; FMA-RECIP-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: f32_two_step:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm2
+; BTVER2-NEXT:    vsubss %xmm2, %xmm3, %xmm2
+; BTVER2-NEXT:    vmulss %xmm2, %xmm1, %xmm2
+; BTVER2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubss %xmm0, %xmm3, %xmm0
+; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: f32_two_step:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm2
+; SANDY-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vsubss %xmm2, %xmm3, %xmm2
+; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm2
+; SANDY-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vsubss %xmm0, %xmm3, %xmm0
+; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: f32_two_step:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; HASWELL-NEXT:    vmovaps %xmm1, %xmm3
+; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
+; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
+; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
+; HASWELL-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: f32_two_step:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm2
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT:    vsubss %xmm2, %xmm3, %xmm2
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm2
+; HASWELL-NO-FMA-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm3, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: f32_two_step:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
+; AVX512-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vmovaps %xmm1, %xmm3
+; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
+; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
+; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
+; AVX512-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
+; AVX512-NEXT:    retq
   %div = fdiv fast float 1.0, %x
   ret float %div
 }
@@ -95,11 +228,47 @@ define <4 x float> @v4f32_no_estimate(<4 x float> %x) #0 {
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: v4f32_no_estimate:
-; AVX:       # BB#0:
-; AVX-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-NEXT:    vdivps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; AVX-RECIP-LABEL: v4f32_no_estimate:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: v4f32_no_estimate:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-RECIP-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: v4f32_no_estimate:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: v4f32_no_estimate:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: v4f32_no_estimate:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1
+; HASWELL-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: v4f32_no_estimate:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1
+; HASWELL-NO-FMA-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: v4f32_no_estimate:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vbroadcastss {{.*}}(%rip), %xmm1
+; AVX512-NEXT:    vdivps %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <4 x float> %div
 }
@@ -116,15 +285,75 @@ define <4 x float> @v4f32_one_step(<4 x float> %x) #1 {
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: v4f32_one_step:
-; AVX:       # BB#0:
-; AVX-NEXT:    vrcpps %xmm0, %xmm1
-; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; AVX-RECIP-LABEL: v4f32_one_step:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
+; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: v4f32_one_step:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
+; FMA-RECIP-NEXT:    vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0
+; FMA-RECIP-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: v4f32_one_step:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: v4f32_one_step:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vrcpps %xmm0, %xmm1
+; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: v4f32_one_step:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
+; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: v4f32_one_step:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; KNL-LABEL: v4f32_one_step:
+; KNL:       # BB#0:
+; KNL-NEXT:    vrcpps %xmm0, %xmm1
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
+; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: v4f32_one_step:
+; SKX:       # BB#0:
+; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
+; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
+; SKX-NEXT:    retq
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <4 x float> %div
 }
@@ -147,19 +376,105 @@ define <4 x float> @v4f32_two_step(<4 x float> %x) #2 {
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: v4f32_two_step:
-; AVX:       # BB#0:
-; AVX-NEXT:    vrcpps %xmm0, %xmm1
-; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-NEXT:    vsubps %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm2
-; AVX-NEXT:    vaddps %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vsubps %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    retq
+; AVX-RECIP-LABEL: v4f32_two_step:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
+; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm2
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vsubps %xmm2, %xmm3, %xmm2
+; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm1, %xmm2
+; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm3, %xmm0
+; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: v4f32_two_step:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
+; FMA-RECIP-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
+; FMA-RECIP-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
+; FMA-RECIP-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
+; FMA-RECIP-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: v4f32_two_step:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2
+; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2
+; BTVER2-NEXT:    vmulps %xmm2, %xmm1, %xmm2
+; BTVER2-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubps %xmm0, %xmm3, %xmm0
+; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: v4f32_two_step:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vrcpps %xmm0, %xmm1
+; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2
+; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2
+; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vsubps %xmm0, %xmm3, %xmm0
+; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: v4f32_two_step:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; HASWELL-NEXT:    vmovaps %xmm1, %xmm3
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
+; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
+; HASWELL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: v4f32_two_step:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm3
+; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2
+; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm3, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; KNL-LABEL: v4f32_two_step:
+; KNL:       # BB#0:
+; KNL-NEXT:    vrcpps %xmm0, %xmm1
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; KNL-NEXT:    vmovaps %xmm1, %xmm3
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
+; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
+; KNL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: v4f32_two_step:
+; SKX:       # BB#0:
+; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
+; SKX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; SKX-NEXT:    vmovaps %xmm1, %xmm3
+; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
+; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
+; SKX-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
+; SKX-NEXT:    retq
   %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <4 x float> %div
 }
@@ -175,11 +490,47 @@ define <8 x float> @v8f32_no_estimate(<8 x float> %x) #0 {
 ; SSE-NEXT:    movaps %xmm2, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: v8f32_no_estimate:
-; AVX:       # BB#0:
-; AVX-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-NEXT:    vdivps %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    retq
+; AVX-RECIP-LABEL: v8f32_no_estimate:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: v8f32_no_estimate:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-RECIP-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: v8f32_no_estimate:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: v8f32_no_estimate:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm1 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: v8f32_no_estimate:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
+; HASWELL-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: v8f32_no_estimate:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
+; HASWELL-NO-FMA-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: v8f32_no_estimate:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vbroadcastss {{.*}}(%rip), %ymm1
+; AVX512-NEXT:    vdivps %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }
@@ -203,15 +554,75 @@ define <8 x float> @v8f32_one_step(<8 x float> %x) #1 {
 ; SSE-NEXT:    movaps %xmm2, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: v8f32_one_step:
-; AVX:       # BB#0:
-; AVX-NEXT:    vrcpps %ymm0, %ymm1
-; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; AVX-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    retq
+; AVX-RECIP-LABEL: v8f32_one_step:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
+; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: v8f32_one_step:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
+; FMA-RECIP-NEXT:    vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0
+; FMA-RECIP-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: v8f32_one_step:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: v8f32_one_step:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vrcpps %ymm0, %ymm1
+; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: v8f32_one_step:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
+; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: v8f32_one_step:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; KNL-LABEL: v8f32_one_step:
+; KNL:       # BB#0:
+; KNL-NEXT:    vrcpps %ymm0, %ymm1
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: v8f32_one_step:
+; SKX:       # BB#0:
+; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
+; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
+; SKX-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }
@@ -247,19 +658,105 @@ define <8 x float> @v8f32_two_step(<8 x float> %x) #2 {
 ; SSE-NEXT:    movaps %xmm3, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: v8f32_two_step:
-; AVX:       # BB#0:
-; AVX-NEXT:    vrcpps %ymm0, %ymm1
-; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-NEXT:    vsubps %ymm2, %ymm3, %ymm2
-; AVX-NEXT:    vmulps %ymm2, %ymm1, %ymm2
-; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm1
-; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vsubps %ymm0, %ymm3, %ymm0
-; AVX-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    retq
+; AVX-RECIP-LABEL: v8f32_two_step:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
+; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm2
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vsubps %ymm2, %ymm3, %ymm2
+; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm2
+; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: v8f32_two_step:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-RECIP-NEXT:    vmovaps %ymm1, %ymm3
+; FMA-RECIP-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
+; FMA-RECIP-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
+; FMA-RECIP-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
+; FMA-RECIP-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: v8f32_two_step:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2
+; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2
+; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm2
+; BTVER2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: v8f32_two_step:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vrcpps %ymm0, %ymm1
+; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2
+; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2
+; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: v8f32_two_step:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
+; HASWELL-NEXT:    vmovaps %ymm1, %ymm3
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
+; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
+; HASWELL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: v8f32_two_step:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm3
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; KNL-LABEL: v8f32_two_step:
+; KNL:       # BB#0:
+; KNL-NEXT:    vrcpps %ymm0, %ymm1
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
+; KNL-NEXT:    vmovaps %ymm1, %ymm3
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
+; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
+; KNL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: v8f32_two_step:
+; SKX:       # BB#0:
+; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
+; SKX-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
+; SKX-NEXT:    vmovaps %ymm1, %ymm3
+; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
+; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
+; SKX-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
+; SKX-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }
diff --git a/test/CodeGen/X86/recip-fastmath2.ll b/test/CodeGen/X86/recip-fastmath2.ll
index 0788b036cc5214d9a81cfd8fc1c9672f54146709..730d2f130388011c8dda51016f1d3a0cc8c35366 100644
--- a/test/CodeGen/X86/recip-fastmath2.ll
+++ b/test/CodeGen/X86/recip-fastmath2.ll
@@ -1,6 +1,13 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx  | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2     | FileCheck %s --check-prefix=CHECK --check-prefix=SSE --check-prefix=SSE-RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx      | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX-RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=FMA-RECIP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=btver2     | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=BTVER2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=sandybridge| FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=SANDY
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell    | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=haswell -mattr=-fma | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=HASWELL-NO-FMA
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl        | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=KNL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=skx        | FileCheck %s --check-prefix=CHECK --check-prefix=AVX --check-prefix=AVX512 --check-prefix=SKX
 
 ; It's the extra tests coverage for recip as discussed on D26855.
 
@@ -11,11 +18,47 @@ define float @f32_no_step_2(float %x) #3 {
 ; SSE-NEXT:    mulss {{.*}}(%rip), %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: f32_no_step_2:
-; AVX:       # BB#0:
-; AVX-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX-RECIP-LABEL: f32_no_step_2:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: f32_no_step_2:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
+; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: f32_no_step_2:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
+; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: f32_no_step_2:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
+; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: f32_no_step_2:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: f32_no_step_2:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: f32_no_step_2:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm0
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %div = fdiv fast float 1234.0, %x
   ret float %div
 }
@@ -33,20 +76,170 @@ define float @f32_one_step_2(float %x) #1 {
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: f32_one_step_2:
-; AVX:       # BB#0:
-; AVX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-NEXT:    vsubss %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX-RECIP-LABEL: f32_one_step_2:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: f32_one_step_2:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; FMA-RECIP-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
+; FMA-RECIP-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
+; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: f32_one_step_2:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: f32_one_step_2:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: f32_one_step_2:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
+; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: f32_one_step_2:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: f32_one_step_2:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
+; AVX512-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
+; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %div = fdiv fast float 3456.0, %x
   ret float %div
 }
 
+define float @f32_one_step_2_divs(float %x) #1 {
+; SSE-LABEL: f32_one_step_2_divs:
+; SSE:       # BB#0:
+; SSE-NEXT:    rcpss %xmm0, %xmm1
+; SSE-NEXT:    mulss %xmm1, %xmm0
+; SSE-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT:    subss %xmm0, %xmm2
+; SSE-NEXT:    mulss %xmm1, %xmm2
+; SSE-NEXT:    addss %xmm1, %xmm2
+; SSE-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT:    mulss %xmm2, %xmm0
+; SSE-NEXT:    mulss %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-RECIP-LABEL: f32_one_step_2_divs:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
+; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: f32_one_step_2_divs:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; FMA-RECIP-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
+; FMA-RECIP-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
+; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
+; FMA-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: f32_one_step_2_divs:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
+; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: f32_one_step_2_divs:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
+; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: f32_one_step_2_divs:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; HASWELL-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
+; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
+; HASWELL-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: f32_one_step_2_divs:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm2, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: f32_one_step_2_divs:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
+; AVX512-NEXT:    vfnmadd213ss {{.*}}(%rip), %xmm1, %xmm0
+; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm0
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm1
+; AVX512-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+  %div = fdiv fast float 3456.0, %x
+  %div2 = fdiv fast float %div, %x
+  ret float %div2
+}
+
 define float @f32_two_step_2(float %x) #2 {
 ; SSE-LABEL: f32_two_step_2:
 ; SSE:       # BB#0:
@@ -66,20 +259,101 @@ define float @f32_two_step_2(float %x) #2 {
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: f32_two_step_2:
-; AVX:       # BB#0:
-; AVX-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
-; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX-NEXT:    vsubss %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vmulss %xmm2, %xmm1, %xmm2
-; AVX-NEXT:    vaddss %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vmulss %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vsubss %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vmulss %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vaddss %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX-RECIP-LABEL: f32_two_step_2:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm2
+; AVX-RECIP-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT:    vsubss %xmm2, %xmm3, %xmm2
+; AVX-RECIP-NEXT:    vmulss %xmm2, %xmm1, %xmm2
+; AVX-RECIP-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; AVX-RECIP-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vsubss %xmm0, %xmm3, %xmm0
+; AVX-RECIP-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: f32_two_step_2:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; FMA-RECIP-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
+; FMA-RECIP-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
+; FMA-RECIP-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
+; FMA-RECIP-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
+; FMA-RECIP-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
+; FMA-RECIP-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: f32_two_step_2:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; BTVER2-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm2
+; BTVER2-NEXT:    vsubss %xmm2, %xmm3, %xmm2
+; BTVER2-NEXT:    vmulss %xmm2, %xmm1, %xmm2
+; BTVER2-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; BTVER2-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubss %xmm0, %xmm3, %xmm0
+; BTVER2-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: f32_two_step_2:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm2
+; SANDY-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SANDY-NEXT:    vsubss %xmm2, %xmm3, %xmm2
+; SANDY-NEXT:    vmulss %xmm2, %xmm1, %xmm2
+; SANDY-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; SANDY-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vsubss %xmm0, %xmm3, %xmm0
+; SANDY-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: f32_two_step_2:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; HASWELL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; HASWELL-NEXT:    vmovaps %xmm1, %xmm3
+; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
+; HASWELL-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
+; HASWELL-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
+; HASWELL-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
+; HASWELL-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: f32_two_step_2:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vrcpss %xmm0, %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm2
+; HASWELL-NO-FMA-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT:    vsubss %xmm2, %xmm3, %xmm2
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm2, %xmm1, %xmm2
+; HASWELL-NO-FMA-NEXT:    vaddss %xmm2, %xmm1, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vsubss %xmm0, %xmm3, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vaddss %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; AVX512-LABEL: f32_two_step_2:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vrcp14ss %xmm0, %xmm0, %xmm1
+; AVX512-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512-NEXT:    vmovaps %xmm1, %xmm3
+; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm0, %xmm3
+; AVX512-NEXT:    vfmadd132ss %xmm1, %xmm1, %xmm3
+; AVX512-NEXT:    vfnmadd213ss %xmm2, %xmm3, %xmm0
+; AVX512-NEXT:    vfmadd132ss %xmm3, %xmm3, %xmm0
+; AVX512-NEXT:    vmulss {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    retq
   %div = fdiv fast float 6789.0, %x
   ret float %div
 }
@@ -97,20 +371,191 @@ define <4 x float> @v4f32_one_step2(<4 x float> %x) #1 {
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: v4f32_one_step2:
-; AVX:       # BB#0:
-; AVX-NEXT:    vrcpps %xmm0, %xmm1
-; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-NEXT:    vsubps %xmm0, %xmm2, %xmm0
-; AVX-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX-RECIP-LABEL: v4f32_one_step2:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
+; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: v4f32_one_step2:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
+; FMA-RECIP-NEXT:    vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0
+; FMA-RECIP-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
+; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: v4f32_one_step2:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: v4f32_one_step2:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vrcpps %xmm0, %xmm1
+; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: v4f32_one_step2:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
+; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: v4f32_one_step2:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; KNL-LABEL: v4f32_one_step2:
+; KNL:       # BB#0:
+; KNL-NEXT:    vrcpps %xmm0, %xmm1
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
+; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
+; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: v4f32_one_step2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
+; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
+; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; SKX-NEXT:    retq
   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
   ret <4 x float> %div
 }
 
+define <4 x float> @v4f32_one_step_2_divs(<4 x float> %x) #1 {
+; SSE-LABEL: v4f32_one_step_2_divs:
+; SSE:       # BB#0:
+; SSE-NEXT:    rcpps %xmm0, %xmm1
+; SSE-NEXT:    mulps %xmm1, %xmm0
+; SSE-NEXT:    movaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SSE-NEXT:    subps %xmm0, %xmm2
+; SSE-NEXT:    mulps %xmm1, %xmm2
+; SSE-NEXT:    addps %xmm1, %xmm2
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; SSE-NEXT:    mulps %xmm2, %xmm0
+; SSE-NEXT:    mulps %xmm2, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-RECIP-LABEL: v4f32_one_step_2_divs:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
+; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
+; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: v4f32_one_step_2_divs:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
+; FMA-RECIP-NEXT:    vfnmadd213ps {{.*}}(%rip), %xmm1, %xmm0
+; FMA-RECIP-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
+; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
+; FMA-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: v4f32_one_step_2_divs:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
+; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: v4f32_one_step_2_divs:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vrcpps %xmm0, %xmm1
+; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
+; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: v4f32_one_step_2_divs:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
+; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
+; HASWELL-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: v4f32_one_step_2_divs:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm2, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; KNL-LABEL: v4f32_one_step_2_divs:
+; KNL:       # BB#0:
+; KNL-NEXT:    vrcpps %xmm0, %xmm1
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm1, %xmm0
+; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
+; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
+; KNL-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: v4f32_one_step_2_divs:
+; SKX:       # BB#0:
+; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
+; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to4}, %xmm1, %xmm0
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm0
+; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm1
+; SKX-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; SKX-NEXT:    retq
+  %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
+  %div2 = fdiv fast <4 x float> %div, %x
+  ret <4 x float> %div2
+}
+
 define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; SSE-LABEL: v4f32_two_step2:
 ; SSE:       # BB#0:
@@ -130,20 +575,113 @@ define <4 x float> @v4f32_two_step2(<4 x float> %x) #2 {
 ; SSE-NEXT:    movaps %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: v4f32_two_step2:
-; AVX:       # BB#0:
-; AVX-NEXT:    vrcpps %xmm0, %xmm1
-; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm2
-; AVX-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-NEXT:    vsubps %xmm2, %xmm3, %xmm2
-; AVX-NEXT:    vmulps %xmm2, %xmm1, %xmm2
-; AVX-NEXT:    vaddps %xmm2, %xmm1, %xmm1
-; AVX-NEXT:    vmulps %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vsubps %xmm0, %xmm3, %xmm0
-; AVX-NEXT:    vmulps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vaddps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT:    retq
+; AVX-RECIP-LABEL: v4f32_two_step2:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vrcpps %xmm0, %xmm1
+; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm2
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vsubps %xmm2, %xmm3, %xmm2
+; AVX-RECIP-NEXT:    vmulps %xmm2, %xmm1, %xmm2
+; AVX-RECIP-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; AVX-RECIP-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; AVX-RECIP-NEXT:    vsubps %xmm0, %xmm3, %xmm0
+; AVX-RECIP-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: v4f32_two_step2:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vrcpps %xmm0, %xmm1
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} xmm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-RECIP-NEXT:    vmovaps %xmm1, %xmm3
+; FMA-RECIP-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
+; FMA-RECIP-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
+; FMA-RECIP-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
+; FMA-RECIP-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
+; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: v4f32_two_step2:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %xmm0, %xmm1
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm2
+; BTVER2-NEXT:    vsubps %xmm2, %xmm3, %xmm2
+; BTVER2-NEXT:    vmulps %xmm2, %xmm1, %xmm2
+; BTVER2-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; BTVER2-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; BTVER2-NEXT:    vsubps %xmm0, %xmm3, %xmm0
+; BTVER2-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: v4f32_two_step2:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vrcpps %xmm0, %xmm1
+; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm2
+; SANDY-NEXT:    vmovaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vsubps %xmm2, %xmm3, %xmm2
+; SANDY-NEXT:    vmulps %xmm2, %xmm1, %xmm2
+; SANDY-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; SANDY-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; SANDY-NEXT:    vsubps %xmm0, %xmm3, %xmm0
+; SANDY-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: v4f32_two_step2:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vrcpps %xmm0, %xmm1
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; HASWELL-NEXT:    vmovaps %xmm1, %xmm3
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
+; HASWELL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
+; HASWELL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
+; HASWELL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: v4f32_two_step2:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vrcpps %xmm0, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm2
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %xmm3
+; HASWELL-NO-FMA-NEXT:    vsubps %xmm2, %xmm3, %xmm2
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm2, %xmm1, %xmm2
+; HASWELL-NO-FMA-NEXT:    vaddps %xmm2, %xmm1, %xmm1
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm1, %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    vsubps %xmm0, %xmm3, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vaddps %xmm0, %xmm1, %xmm0
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; KNL-LABEL: v4f32_two_step2:
+; KNL:       # BB#0:
+; KNL-NEXT:    vrcpps %xmm0, %xmm1
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; KNL-NEXT:    vmovaps %xmm1, %xmm3
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
+; KNL-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
+; KNL-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
+; KNL-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
+; KNL-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: v4f32_two_step2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vrcp14ps %xmm0, %xmm1
+; SKX-NEXT:    vbroadcastss {{.*}}(%rip), %xmm2
+; SKX-NEXT:    vmovaps %xmm1, %xmm3
+; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm0, %xmm3
+; SKX-NEXT:    vfmadd132ps %xmm1, %xmm1, %xmm3
+; SKX-NEXT:    vfnmadd213ps %xmm2, %xmm3, %xmm0
+; SKX-NEXT:    vfmadd132ps %xmm3, %xmm3, %xmm0
+; SKX-NEXT:    vmulps {{.*}}(%rip), %xmm0, %xmm0
+; SKX-NEXT:    retq
   %div = fdiv fast <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0>, %x
   ret <4 x float> %div
 }
@@ -169,20 +707,200 @@ define <8 x float> @v8f32_one_step2(<8 x float> %x) #1 {
 ; SSE-NEXT:    movaps %xmm3, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: v8f32_one_step2:
-; AVX:       # BB#0:
-; AVX-NEXT:    vrcpps %ymm0, %ymm1
-; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; AVX-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
-; AVX-NEXT:    retq
+; AVX-RECIP-LABEL: v8f32_one_step2:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
+; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: v8f32_one_step2:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
+; FMA-RECIP-NEXT:    vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0
+; FMA-RECIP-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
+; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: v8f32_one_step2:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: v8f32_one_step2:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vrcpps %ymm0, %ymm1
+; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: v8f32_one_step2:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
+; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: v8f32_one_step2:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; KNL-LABEL: v8f32_one_step2:
+; KNL:       # BB#0:
+; KNL-NEXT:    vrcpps %ymm0, %ymm1
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: v8f32_one_step2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
+; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SKX-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   ret <8 x float> %div
 }
 
+define <8 x float> @v8f32_one_step_2_divs(<8 x float> %x) #1 {
+; SSE-LABEL: v8f32_one_step_2_divs:
+; SSE:       # BB#0:
+; SSE-NEXT:    rcpps %xmm0, %xmm2
+; SSE-NEXT:    mulps %xmm2, %xmm0
+; SSE-NEXT:    movaps {{.*#+}} xmm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SSE-NEXT:    movaps %xmm3, %xmm4
+; SSE-NEXT:    subps %xmm0, %xmm4
+; SSE-NEXT:    mulps %xmm2, %xmm4
+; SSE-NEXT:    addps %xmm2, %xmm4
+; SSE-NEXT:    rcpps %xmm1, %xmm0
+; SSE-NEXT:    mulps %xmm0, %xmm1
+; SSE-NEXT:    subps %xmm1, %xmm3
+; SSE-NEXT:    mulps %xmm0, %xmm3
+; SSE-NEXT:    addps %xmm0, %xmm3
+; SSE-NEXT:    movaps {{.*#+}} xmm1 = [5.000000e+00,6.000000e+00,7.000000e+00,8.000000e+00]
+; SSE-NEXT:    mulps %xmm3, %xmm1
+; SSE-NEXT:    movaps {{.*#+}} xmm0 = [1.000000e+00,2.000000e+00,3.000000e+00,4.000000e+00]
+; SSE-NEXT:    mulps %xmm4, %xmm0
+; SSE-NEXT:    mulps %xmm4, %xmm0
+; SSE-NEXT:    mulps %xmm3, %xmm1
+; SSE-NEXT:    retq
+;
+; AVX-RECIP-LABEL: v8f32_one_step_2_divs:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
+; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
+; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: v8f32_one_step_2_divs:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
+; FMA-RECIP-NEXT:    vfnmadd213ps {{.*}}(%rip), %ymm1, %ymm0
+; FMA-RECIP-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
+; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
+; FMA-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: v8f32_one_step_2_divs:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; BTVER2-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
+; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: v8f32_one_step_2_divs:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vrcpps %ymm0, %ymm1
+; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
+; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: v8f32_one_step_2_divs:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
+; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
+; HASWELL-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: v8f32_one_step_2_divs:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; KNL-LABEL: v8f32_one_step_2_divs:
+; KNL:       # BB#0:
+; KNL-NEXT:    vrcpps %ymm0, %ymm1
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm1, %ymm0
+; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
+; KNL-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: v8f32_one_step_2_divs:
+; SKX:       # BB#0:
+; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
+; SKX-NEXT:    vfnmadd213ps {{.*}}(%rip){1to8}, %ymm1, %ymm0
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm0
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm1
+; SKX-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; SKX-NEXT:    retq
+  %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
+  %div2 = fdiv fast <8 x float> %div, %x
+  ret <8 x float> %div2
+}
+
 define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; SSE-LABEL: v8f32_two_step2:
 ; SSE:       # BB#0:
@@ -216,20 +934,113 @@ define <8 x float> @v8f32_two_step2(<8 x float> %x) #2 {
 ; SSE-NEXT:    movaps %xmm3, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: v8f32_two_step2:
-; AVX:       # BB#0:
-; AVX-NEXT:    vrcpps %ymm0, %ymm1
-; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm2
-; AVX-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
-; AVX-NEXT:    vsubps %ymm2, %ymm3, %ymm2
-; AVX-NEXT:    vmulps %ymm2, %ymm1, %ymm2
-; AVX-NEXT:    vaddps %ymm2, %ymm1, %ymm1
-; AVX-NEXT:    vmulps %ymm1, %ymm0, %ymm0
-; AVX-NEXT:    vsubps %ymm0, %ymm3, %ymm0
-; AVX-NEXT:    vmulps %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    vaddps %ymm0, %ymm1, %ymm0
-; AVX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
-; AVX-NEXT:    retq
+; AVX-RECIP-LABEL: v8f32_two_step2:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm1
+; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm2
+; AVX-RECIP-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; AVX-RECIP-NEXT:    vsubps %ymm2, %ymm3, %ymm2
+; AVX-RECIP-NEXT:    vmulps %ymm2, %ymm1, %ymm2
+; AVX-RECIP-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; AVX-RECIP-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; AVX-RECIP-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; AVX-RECIP-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: v8f32_two_step2:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm1
+; FMA-RECIP-NEXT:    vmovaps {{.*#+}} ymm2 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; FMA-RECIP-NEXT:    vmovaps %ymm1, %ymm3
+; FMA-RECIP-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
+; FMA-RECIP-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
+; FMA-RECIP-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
+; FMA-RECIP-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
+; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: v8f32_two_step2:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm1
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm2
+; BTVER2-NEXT:    vsubps %ymm2, %ymm3, %ymm2
+; BTVER2-NEXT:    vmulps %ymm2, %ymm1, %ymm2
+; BTVER2-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; BTVER2-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; BTVER2-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; BTVER2-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: v8f32_two_step2:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vrcpps %ymm0, %ymm1
+; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm2
+; SANDY-NEXT:    vmovaps {{.*#+}} ymm3 = [1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00,1.000000e+00]
+; SANDY-NEXT:    vsubps %ymm2, %ymm3, %ymm2
+; SANDY-NEXT:    vmulps %ymm2, %ymm1, %ymm2
+; SANDY-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; SANDY-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; SANDY-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; SANDY-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: v8f32_two_step2:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm1
+; HASWELL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
+; HASWELL-NEXT:    vmovaps %ymm1, %ymm3
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
+; HASWELL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
+; HASWELL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
+; HASWELL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: v8f32_two_step2:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm1
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm2
+; HASWELL-NO-FMA-NEXT:    vbroadcastss {{.*}}(%rip), %ymm3
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm2, %ymm3, %ymm2
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm2, %ymm1, %ymm2
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm2, %ymm1, %ymm1
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm1, %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vsubps %ymm0, %ymm3, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    vaddps %ymm0, %ymm1, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; KNL-LABEL: v8f32_two_step2:
+; KNL:       # BB#0:
+; KNL-NEXT:    vrcpps %ymm0, %ymm1
+; KNL-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
+; KNL-NEXT:    vmovaps %ymm1, %ymm3
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
+; KNL-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
+; KNL-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
+; KNL-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: v8f32_two_step2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vrcp14ps %ymm0, %ymm1
+; SKX-NEXT:    vbroadcastss {{.*}}(%rip), %ymm2
+; SKX-NEXT:    vmovaps %ymm1, %ymm3
+; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm0, %ymm3
+; SKX-NEXT:    vfmadd132ps %ymm1, %ymm1, %ymm3
+; SKX-NEXT:    vfnmadd213ps %ymm2, %ymm3, %ymm0
+; SKX-NEXT:    vfmadd132ps %ymm3, %ymm3, %ymm0
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SKX-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   ret <8 x float> %div
 }
@@ -241,10 +1052,45 @@ define <8 x float> @v8f32_no_step(<8 x float> %x) #3 {
 ; SSE-NEXT:    rcpps %xmm1, %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: v8f32_no_step:
-; AVX:       # BB#0:
-; AVX-NEXT:    vrcpps %ymm0, %ymm0
-; AVX-NEXT:    retq
+; AVX-RECIP-LABEL: v8f32_no_step:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: v8f32_no_step:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: v8f32_no_step:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: v8f32_no_step:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vrcpps %ymm0, %ymm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: v8f32_no_step:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: v8f32_no_step:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; KNL-LABEL: v8f32_no_step:
+; KNL:       # BB#0:
+; KNL-NEXT:    vrcpps %ymm0, %ymm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: v8f32_no_step:
+; SKX:       # BB#0:
+; SKX-NEXT:    vrcp14ps %ymm0, %ymm0
+; SKX-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %x
   ret <8 x float> %div
 }
@@ -258,11 +1104,53 @@ define <8 x float> @v8f32_no_step2(<8 x float> %x) #3 {
 ; SSE-NEXT:    mulps {{.*}}(%rip), %xmm1
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: v8f32_no_step2:
-; AVX:       # BB#0:
-; AVX-NEXT:    vrcpps %ymm0, %ymm0
-; AVX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
-; AVX-NEXT:    retq
+; AVX-RECIP-LABEL: v8f32_no_step2:
+; AVX-RECIP:       # BB#0:
+; AVX-RECIP-NEXT:    vrcpps %ymm0, %ymm0
+; AVX-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; AVX-RECIP-NEXT:    retq
+;
+; FMA-RECIP-LABEL: v8f32_no_step2:
+; FMA-RECIP:       # BB#0:
+; FMA-RECIP-NEXT:    vrcpps %ymm0, %ymm0
+; FMA-RECIP-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; FMA-RECIP-NEXT:    retq
+;
+; BTVER2-LABEL: v8f32_no_step2:
+; BTVER2:       # BB#0:
+; BTVER2-NEXT:    vrcpps %ymm0, %ymm0
+; BTVER2-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; BTVER2-NEXT:    retq
+;
+; SANDY-LABEL: v8f32_no_step2:
+; SANDY:       # BB#0:
+; SANDY-NEXT:    vrcpps %ymm0, %ymm0
+; SANDY-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SANDY-NEXT:    retq
+;
+; HASWELL-LABEL: v8f32_no_step2:
+; HASWELL:       # BB#0:
+; HASWELL-NEXT:    vrcpps %ymm0, %ymm0
+; HASWELL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NEXT:    retq
+;
+; HASWELL-NO-FMA-LABEL: v8f32_no_step2:
+; HASWELL-NO-FMA:       # BB#0:
+; HASWELL-NO-FMA-NEXT:    vrcpps %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; HASWELL-NO-FMA-NEXT:    retq
+;
+; KNL-LABEL: v8f32_no_step2:
+; KNL:       # BB#0:
+; KNL-NEXT:    vrcpps %ymm0, %ymm0
+; KNL-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; KNL-NEXT:    retq
+;
+; SKX-LABEL: v8f32_no_step2:
+; SKX:       # BB#0:
+; SKX-NEXT:    vrcp14ps %ymm0, %ymm0
+; SKX-NEXT:    vmulps {{.*}}(%rip), %ymm0, %ymm0
+; SKX-NEXT:    retq
   %div = fdiv fast <8 x float> <float 1.0, float 2.0, float 3.0, float 4.0, float 5.0, float 6.0, float 7.0, float 8.0>, %x
   ret <8 x float> %div
 }
diff --git a/test/CodeGen/X86/recip-pic.ll b/test/CodeGen/X86/recip-pic.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7a0d03d6072e322d7c09cb006586a55248cb01d6
--- /dev/null
+++ b/test/CodeGen/X86/recip-pic.ll
@@ -0,0 +1,29 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -enable-unsafe-fp-math -mcpu=slm -relocation-model=pic | FileCheck %s
+
+; Fast-math division with a non-unit constant numerator (3.0 / x) on 32-bit
+; Silvermont with PIC relocation. The expected code materializes the GOT base
+; in %eax through the call/pop sequence, loads one scalar float from memory
+; and issues a real divss rather than a reciprocal-estimate sequence.
+define fastcc float @foo(float %x) unnamed_addr #0 {
+; CHECK-LABEL: foo:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    calll .L0$pb
+; CHECK-NEXT:  .Lcfi0:
+; CHECK-NEXT:    .cfi_adjust_cfa_offset 4
+; CHECK-NEXT:  .L0$pb:
+; CHECK-NEXT:    popl %eax
+; CHECK-NEXT:  .Lcfi1:
+; CHECK-NEXT:    .cfi_adjust_cfa_offset -4
+; CHECK-NEXT:  .Ltmp0:
+; CHECK-NEXT:    addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %eax
+; CHECK-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT:    divss %xmm0, %xmm1
+; CHECK-NEXT:    movaps %xmm1, %xmm0
+; CHECK-NEXT:    movss %xmm1, (%eax)
+; CHECK-NEXT:    retl
+entry:
+  %div = fdiv fast float 3.0, %x
+  store float %div, float* undef, align 4
+  ret float %div
+}
diff --git a/test/CodeGen/X86/reduce-trunc-shl.ll b/test/CodeGen/X86/reduce-trunc-shl.ll
index 275327b1486ebb05f8350c80fa642d1f9b3ba388..0638e9e3f6cd8ccdc7b88e286d9e782b3b2830b5 100644
--- a/test/CodeGen/X86/reduce-trunc-shl.ll
+++ b/test/CodeGen/X86/reduce-trunc-shl.ll
@@ -41,7 +41,7 @@ define <8 x i16> @trunc_shl_v8i16_v8i32(<8 x i32> %a) {
 ; AVX2-LABEL: trunc_shl_v8i16_v8i32:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpslld $17, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
diff --git a/test/CodeGen/X86/regparm.ll b/test/CodeGen/X86/regparm.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9484e5a9490bd225ebafcc61971e8c1cd92059e1
--- /dev/null
+++ b/test/CodeGen/X86/regparm.ll
@@ -0,0 +1,47 @@
+; RUN: llc %s -mtriple=i386-pc-linux -o - | FileCheck -check-prefix=CHECK %s
+; RUN: llc %s -mtriple=i386-pc-win32 -o - | FileCheck -check-prefix=WIN %s
+; RUN: llc %s -mtriple=i386-pc-linux -fast-isel -o - | FileCheck -check-prefix=FAST %s
+; RUN: llc %s -mtriple=i386-pc-win32 -fast-isel -o - | FileCheck -check-prefix=FASTWIN %s
+
+; The module flag NumRegisterParameters is 3 and every argument of @use_memset
+; is marked inreg. The default runs (CHECK, WIN) expect no push before a tail
+; jump to memset; the -fast-isel runs (FAST, FASTWIN) expect a regular call
+; preceded by a zero-extension of %dl.
+
+target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
+target triple = "i386-unknown-linux-gnu"
+
+define void @use_memset(i8* inreg nocapture %dest, i8 inreg %c, i32 inreg %n) local_unnamed_addr #0 {
+entry:
+;CHECK-LABEL: @use_memset
+;CHECK-NOT: push
+;CHECK: jmp	memset
+;CHECK-NOT: retl
+;WIN-LABEL: @use_memset
+;WIN-NOT: push
+;WIN: jmp	_memset
+;WIN-NOT: retl
+;FAST-LABEL: @use_memset
+;FAST:	subl	$12, %esp
+;FAST-NEXT: 	movzbl	%dl, %edx
+;FAST-NEXT:     calll	memset
+;FAST-NEXT:	addl	$12, %esp
+;FASTWIN-LABEL: @use_memset
+;FASTWIN: 	movzbl	%dl, %edx
+;FASTWIN-NEXT:     calll	_memset
+;FASTWIN-NEXT:     retl
+  tail call void @llvm.memset.p0i8.i32(i8* %dest, i8 %c, i32 %n, i32 1, i1 false)
+  ret void
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i32, i1) #1
+
+attributes #0 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"NumRegisterParameters", i32 3}
+!1 = !{!"clang version 4.0.0 (trunk 288025) (llvm/trunk 288033)"}
diff --git a/test/CodeGen/X86/rot32.ll b/test/CodeGen/X86/rot32.ll
index 5738f70fa47e628a3a7b89e8bef5c8f1699a1a86..79ecbe0514d0b562792ebef7a0751169e076591d 100644
--- a/test/CodeGen/X86/rot32.ll
+++ b/test/CodeGen/X86/rot32.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=corei7-avx | FileCheck %s --check-prefix=SHLD
 ; RUN: llc < %s -march=x86 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
 
 define i32 @foo(i32 %x, i32 %y, i32 %z) nounwind readnone {
@@ -49,6 +50,8 @@ define i32 @xfoo(i32 %x, i32 %y, i32 %z) nounwind readnone {
 entry:
 ; CHECK-LABEL: xfoo:
 ; CHECK: roll $7
+; SHLD-LABEL: xfoo:
+; SHLD: shldl $7
 ; BMI2-LABEL: xfoo:
 ; BMI2: rorxl $25
 	%0 = lshr i32 %x, 25
@@ -59,8 +62,12 @@ entry:
 
 define i32 @xfoop(i32* %p) nounwind readnone {
 entry:
+; CHECK-LABEL: xfoop:
+; CHECK: roll $7
+; SHLD-LABEL: xfoop:
+; SHLD: shldl $7
 ; BMI2-LABEL: xfoop:
-; BMI2: rorxl $25, ({{.+}}), %{{.+}}
+; BMI2: rorxl $25
 	%x = load i32, i32* %p
 	%a = lshr i32 %x, 25
 	%b = shl i32 %x, 7
@@ -82,6 +89,8 @@ define i32 @xun(i32 %x, i32 %y, i32 %z) nounwind readnone {
 entry:
 ; CHECK-LABEL: xun:
 ; CHECK: roll $25
+; SHLD-LABEL: xun:
+; SHLD: shldl $25
 ; BMI2-LABEL: xun:
 ; BMI2: rorxl $7
 	%0 = lshr i32 %x, 7
@@ -92,8 +101,12 @@ entry:
 
 define i32 @xunp(i32* %p) nounwind readnone {
 entry:
+; CHECK-LABEL: xunp:
+; CHECK: roll $25
+; SHLD-LABEL: xunp:
+; SHLD: shldl $25
 ; BMI2-LABEL: xunp:
-; BMI2: rorxl $7, ({{.+}}), %{{.+}}
+; BMI2: rorxl $7
 	%x = load i32, i32* %p
 	%a = lshr i32 %x, 7
 	%b = shl i32 %x, 25
@@ -104,7 +117,7 @@ entry:
 define i32 @xbu(i32 %x, i32 %y, i32 %z) nounwind readnone {
 entry:
 ; CHECK-LABEL: xbu:
-; CHECK: shldl
+; CHECK: shldl $25
 	%0 = lshr i32 %y, 7
 	%1 = shl i32 %x, 25
 	%2 = or i32 %0, %1
diff --git a/test/CodeGen/X86/rot64.ll b/test/CodeGen/X86/rot64.ll
index f77bde050c786c63c76b6f93ce8d2a92f9a09702..976acbb01675395eb2f9e296e91e412d81e39a43 100644
--- a/test/CodeGen/X86/rot64.ll
+++ b/test/CodeGen/X86/rot64.ll
@@ -1,12 +1,11 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 > %t
-; RUN: grep rol %t | count 5
-; RUN: grep ror %t | count 1
-; RUN: grep shld %t | count 2
-; RUN: grep shrd %t | count 2
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s --check-prefix=SHLD
 ; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s --check-prefix=BMI2
 
 define i64 @foo(i64 %x, i64 %y, i64 %z) nounwind readnone {
 entry:
+; CHECK-LABEL: foo:
+; CHECK: rolq %cl
 	%0 = shl i64 %x, %z
 	%1 = sub i64 64, %z
 	%2 = lshr i64 %x, %1
@@ -16,6 +15,8 @@ entry:
 
 define i64 @bar(i64 %x, i64 %y, i64 %z) nounwind readnone {
 entry:
+; CHECK-LABEL: bar:
+; CHECK: shldq %cl
 	%0 = shl i64 %y, %z
 	%1 = sub i64 64, %z
 	%2 = lshr i64 %x, %1
@@ -25,6 +26,8 @@ entry:
 
 define i64 @un(i64 %x, i64 %y, i64 %z) nounwind readnone {
 entry:
+; CHECK-LABEL: un:
+; CHECK: rorq %cl
 	%0 = lshr i64 %x, %z
 	%1 = sub i64 64, %z
 	%2 = shl i64 %x, %1
@@ -34,6 +37,8 @@ entry:
 
 define i64 @bu(i64 %x, i64 %y, i64 %z) nounwind readnone {
 entry:
+; CHECK-LABEL: bu:
+; CHECK: shrdq %cl
 	%0 = lshr i64 %y, %z
 	%1 = sub i64 64, %z
 	%2 = shl i64 %x, %1
@@ -43,6 +48,10 @@ entry:
 
 define i64 @xfoo(i64 %x, i64 %y, i64 %z) nounwind readnone {
 entry:
+; CHECK-LABEL: xfoo:
+; CHECK: rolq $7
+; SHLD-LABEL: xfoo:
+; SHLD: shldq $7
 ; BMI2-LABEL: xfoo:
 ; BMI2: rorxq $57
 	%0 = lshr i64 %x, 57
@@ -53,8 +62,12 @@ entry:
 
 define i64 @xfoop(i64* %p) nounwind readnone {
 entry:
+; CHECK-LABEL: xfoop:
+; CHECK: rolq $7
+; SHLD-LABEL: xfoop:
+; SHLD: shldq $7
 ; BMI2-LABEL: xfoop:
-; BMI2: rorxq $57, ({{.+}}), %{{.+}}
+; BMI2: rorxq $57
 	%x = load i64, i64* %p
 	%a = lshr i64 %x, 57
 	%b = shl i64 %x, 7
@@ -64,6 +77,8 @@ entry:
 
 define i64 @xbar(i64 %x, i64 %y, i64 %z) nounwind readnone {
 entry:
+; CHECK-LABEL: xbar:
+; CHECK: shrdq $57
 	%0 = shl i64 %y, 7
 	%1 = lshr i64 %x, 57
 	%2 = or i64 %0, %1
@@ -72,6 +87,10 @@ entry:
 
 define i64 @xun(i64 %x, i64 %y, i64 %z) nounwind readnone {
 entry:
+; CHECK-LABEL: xun:
+; CHECK: rolq $57
+; SHLD-LABEL: xun:
+; SHLD: shldq $57
 ; BMI2-LABEL: xun:
 ; BMI2: rorxq $7
 	%0 = lshr i64 %x, 7
@@ -82,8 +101,12 @@ entry:
 
 define i64 @xunp(i64* %p) nounwind readnone {
 entry:
+; CHECK-LABEL: xunp:
+; CHECK: rolq $57
+; SHLD-LABEL: xunp:
+; SHLD: shldq $57
 ; BMI2-LABEL: xunp:
-; BMI2: rorxq $7, ({{.+}}), %{{.+}}
+; BMI2: rorxq $7
 	%x = load i64, i64* %p
 	%a = lshr i64 %x, 7
 	%b = shl i64 %x, 57
@@ -93,6 +116,8 @@ entry:
 
 define i64 @xbu(i64 %x, i64 %y, i64 %z) nounwind readnone {
 entry:
+; CHECK-LABEL: xbu:
+; CHECK: shldq $57
 	%0 = lshr i64 %y, 7
 	%1 = shl i64 %x, 57
 	%2 = or i64 %0, %1
diff --git a/test/CodeGen/X86/rotate.ll b/test/CodeGen/X86/rotate.ll
index 657b312b06c9ba30a9b0296c758f6690ea96cf1e..5d5150ad62d6083d63a3931ac74e75691212f99c 100644
--- a/test/CodeGen/X86/rotate.ll
+++ b/test/CodeGen/X86/rotate.ll
@@ -541,3 +541,87 @@ define i8 @rotr1_8(i8 %A) nounwind {
 	%D = or i8 %B, %C
 	ret i8 %D
 }
+
+define void @rotr1_64_mem(i64* %Aptr) nounwind {
+; 32-LABEL: rotr1_64_mem:
+; 32:       # BB#0:
+; 32-NEXT:    pushl %esi
+; 32-NEXT:    movl 8(%esp), %eax
+; 32-NEXT:    movl (%eax), %ecx
+; 32-NEXT:    movl 4(%eax), %edx
+; 32-NEXT:    movl %edx, %esi
+; 32-NEXT:    shldl $31, %ecx, %esi
+; 32-NEXT:    shldl $31, %edx, %ecx
+; 32-NEXT:    movl %ecx, 4(%eax)
+; 32-NEXT:    movl %esi, (%eax)
+; 32-NEXT:    popl %esi
+; 32-NEXT:    retl
+;
+; 64-LABEL: rotr1_64_mem:
+; 64:       # BB#0:
+; 64-NEXT:    rorq (%rdi)
+; 64-NEXT:    retq
+  %A = load i64, i64 *%Aptr
+  %B = shl i64 %A, 63
+  %C = lshr i64 %A, 1
+  %D = or i64 %B, %C
+  store i64 %D, i64* %Aptr
+  ret void
+}
+
+define void @rotr1_32_mem(i32* %Aptr) nounwind {
+; 32-LABEL: rotr1_32_mem:
+; 32:       # BB#0:
+; 32-NEXT:    movl 4(%esp), %eax
+; 32-NEXT:    rorl (%eax)
+; 32-NEXT:    retl
+;
+; 64-LABEL: rotr1_32_mem:
+; 64:       # BB#0:
+; 64-NEXT:    rorl (%rdi)
+; 64-NEXT:    retq
+  %A = load i32, i32 *%Aptr
+  %B = shl i32 %A, 31
+  %C = lshr i32 %A, 1
+  %D = or i32 %B, %C
+  store i32 %D, i32* %Aptr
+  ret void
+}
+
+define void @rotr1_16_mem(i16* %Aptr) nounwind {
+; 32-LABEL: rotr1_16_mem:
+; 32:       # BB#0:
+; 32-NEXT:    movl 4(%esp), %eax
+; 32-NEXT:    rorw (%eax)
+; 32-NEXT:    retl
+;
+; 64-LABEL: rotr1_16_mem:
+; 64:       # BB#0:
+; 64-NEXT:    rorw (%rdi)
+; 64-NEXT:    retq
+  %A = load i16, i16 *%Aptr
+  %B = shl i16 %A, 15
+  %C = lshr i16 %A, 1
+  %D = or i16 %B, %C
+  store i16 %D, i16* %Aptr
+  ret void
+}
+
+define void @rotr1_8_mem(i8* %Aptr) nounwind {
+; 32-LABEL: rotr1_8_mem:
+; 32:       # BB#0:
+; 32-NEXT:    movl 4(%esp), %eax
+; 32-NEXT:    rorb (%eax)
+; 32-NEXT:    retl
+;
+; 64-LABEL: rotr1_8_mem:
+; 64:       # BB#0:
+; 64-NEXT:    rorb (%rdi)
+; 64-NEXT:    retq
+  %A = load i8, i8 *%Aptr
+  %B = shl i8 %A, 7
+  %C = lshr i8 %A, 1
+  %D = or i8 %B, %C
+  store i8 %D, i8* %Aptr
+  ret void
+}
diff --git a/test/CodeGen/X86/rtm.ll b/test/CodeGen/X86/rtm.ll
index fb06cac45fffb0038a7565d764fc97840dbf8d89..7215c482ffa28f9f0b2481434e85d154fb5a479a 100644
--- a/test/CodeGen/X86/rtm.ll
+++ b/test/CodeGen/X86/rtm.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -mattr=+rtm -mtriple=x86_64-unknown-unknown | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+rtm | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+rtm | FileCheck %s --check-prefix=X64
 
 declare i32 @llvm.x86.xbegin() nounwind
 declare void @llvm.x86.xend() nounwind
@@ -6,39 +8,78 @@ declare void @llvm.x86.xabort(i8) nounwind
 declare void @f1()
 
 define i32 @test_xbegin() nounwind uwtable {
+; X86-LABEL: test_xbegin:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    xbegin .LBB0_2
+; X86-NEXT:  # BB#1: # %entry
+; X86-NEXT:    movl $-1, %eax
+; X86-NEXT:  .LBB0_2: # %entry
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_xbegin:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    xbegin .LBB0_2
+; X64-NEXT:  # BB#1: # %entry
+; X64-NEXT:    movl $-1, %eax
+; X64-NEXT:  .LBB0_2: # %entry
+; X64-NEXT:    retq
 entry:
   %0 = tail call i32 @llvm.x86.xbegin() nounwind
   ret i32 %0
-; CHECK: test_xbegin
-; CHECK: xbegin [[LABEL:.*BB.*]]
-; CHECK: [[LABEL]]:
 }
 
 define void @test_xend() nounwind uwtable {
+; X86-LABEL: test_xend:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    xend
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_xend:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    xend
+; X64-NEXT:    retq
 entry:
   tail call void @llvm.x86.xend() nounwind
   ret void
-; CHECK: test_xend
-; CHECK: xend
 }
 
 define void @test_xabort() nounwind uwtable {
+; X86-LABEL: test_xabort:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    xabort $2
+; X86-NEXT:    retl
+;
+; X64-LABEL: test_xabort:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    xabort $2
+; X64-NEXT:    retq
 entry:
   tail call void @llvm.x86.xabort(i8 2)
   ret void
-; CHECK: test_xabort
-; CHECK: xabort $2
 }
 
 define void @f2(i32 %x) nounwind uwtable {
+; X86-LABEL: f2:
+; X86:       # BB#0: # %entry
+; X86-NEXT:    xabort $1
+; X86-NEXT:    calll f1
+; X86-NEXT:    retl
+;
+; X64-LABEL: f2:
+; X64:       # BB#0: # %entry
+; X64-NEXT:    pushq %rax
+; X64-NEXT:  .Lcfi0:
+; X64-NEXT:    .cfi_def_cfa_offset 16
+; X64-NEXT:    movl %edi, {{[0-9]+}}(%rsp)
+; X64-NEXT:    xabort $1
+; X64-NEXT:    callq f1
+; X64-NEXT:    popq %rax
+; X64-NEXT:    retq
 entry:
   %x.addr = alloca i32, align 4
   store i32 %x, i32* %x.addr, align 4
   call void @llvm.x86.xabort(i8 1)
   call void @f1()
   ret void
-; CHECK-LABEL: f2
-; CHECK: xabort  $1
-; CHECK: callq   f1
 }
- 
\ No newline at end of file
+
diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll
index 07c07485c88ef20ea32ef853991c774e5146077c..b8a8b8afd14fd2d6a025a6e67df871cdc75f8b02 100644
--- a/test/CodeGen/X86/sad.ll
+++ b/test/CodeGen/X86/sad.ll
@@ -81,6 +81,7 @@ define i32 @sad_16i8() nounwind {
 ; AVX512F-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
 ; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: sad_16i8:
@@ -106,6 +107,7 @@ define i32 @sad_16i8() nounwind {
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 entry:
   br label %vector.body
@@ -147,129 +149,123 @@ middle.block:
 define i32 @sad_32i8() nounwind {
 ; SSE2-LABEL: sad_32i8:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    pxor %xmm12, %xmm12
+; SSE2-NEXT:    pxor %xmm11, %xmm11
 ; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    pxor %xmm2, %xmm2
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm13, %xmm13
+; SSE2-NEXT:    pxor %xmm12, %xmm12
 ; SSE2-NEXT:    pxor %xmm15, %xmm15
-; SSE2-NEXT:    pxor %xmm5, %xmm5
+; SSE2-NEXT:    pxor %xmm13, %xmm13
 ; SSE2-NEXT:    pxor %xmm14, %xmm14
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  .LBB1_1: # %vector.body
 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT:    movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa a+1040(%rax), %xmm1
+; SSE2-NEXT:    movdqa a+1040(%rax), %xmm6
 ; SSE2-NEXT:    movdqa a+1024(%rax), %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm8 = xmm3[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm12[0],xmm8[1],xmm12[1],xmm8[2],xmm12[2],xmm8[3],xmm12[3],xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3],xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm12[0],xmm3[1],xmm12[1],xmm3[2],xmm12[2],xmm3[3],xmm12[3],xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
-; SSE2-NEXT:    movdqa %xmm3, %xmm6
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3],xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
+; SSE2-NEXT:    movdqa %xmm3, %xmm8
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15]
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7]
+; SSE2-NEXT:    movdqa %xmm6, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm12[4],xmm1[5],xmm12[5],xmm1[6],xmm12[6],xmm1[7],xmm12[7]
-; SSE2-NEXT:    movdqa b+1040(%rax), %xmm2
-; SSE2-NEXT:    movdqa b+1024(%rax), %xmm5
-; SSE2-NEXT:    pshufd {{.*#+}} xmm9 = xmm2[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm10
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15]
+; SSE2-NEXT:    movdqa %xmm6, %xmm7
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; SSE2-NEXT:    movdqa b+1040(%rax), %xmm9
+; SSE2-NEXT:    movdqa %xmm9, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15]
+; SSE2-NEXT:    movdqa %xmm9, %xmm10
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
+; SSE2-NEXT:    psubd %xmm9, %xmm6
+; SSE2-NEXT:    movdqa b+1024(%rax), %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; SSE2-NEXT:    psubd %xmm10, %xmm7
+; SSE2-NEXT:    movdqa %xmm2, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
 ; SSE2-NEXT:    psubd %xmm2, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm11 = xmm5[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3]
-; SSE2-NEXT:    psubd %xmm10, %xmm0
-; SSE2-NEXT:    movdqa %xmm5, %xmm2
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
-; SSE2-NEXT:    psubd %xmm5, %xmm3
-; SSE2-NEXT:    movdqa %xmm7, %xmm5
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm12[4],xmm7[5],xmm12[5],xmm7[6],xmm12[6],xmm7[7],xmm12[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm12[0],xmm9[1],xmm12[1],xmm9[2],xmm12[2],xmm9[3],xmm12[3],xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
-; SSE2-NEXT:    psubd %xmm2, %xmm6
-; SSE2-NEXT:    movdqa %xmm4, %xmm10
-; SSE2-NEXT:    movdqa %xmm9, %xmm4
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7]
-; SSE2-NEXT:    psubd %xmm9, %xmm7
-; SSE2-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3],xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
-; SSE2-NEXT:    psubd %xmm4, %xmm5
-; SSE2-NEXT:    movdqa %xmm11, %xmm4
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7]
-; SSE2-NEXT:    psubd %xmm11, %xmm8
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3]
-; SSE2-NEXT:    psubd %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm4, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
+; SSE2-NEXT:    psubd %xmm9, %xmm0
+; SSE2-NEXT:    movdqa %xmm4, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7]
+; SSE2-NEXT:    psubd %xmm4, %xmm3
+; SSE2-NEXT:    movdqa %xmm8, %xmm10
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3]
+; SSE2-NEXT:    psubd %xmm9, %xmm5
 ; SSE2-NEXT:    movdqa %xmm2, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    paddd %xmm4, %xmm2
-; SSE2-NEXT:    pxor %xmm4, %xmm2
-; SSE2-NEXT:    movdqa %xmm8, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    paddd %xmm4, %xmm8
-; SSE2-NEXT:    pxor %xmm4, %xmm8
-; SSE2-NEXT:    movdqa %xmm5, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    paddd %xmm4, %xmm5
-; SSE2-NEXT:    pxor %xmm4, %xmm5
-; SSE2-NEXT:    movdqa %xmm7, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    paddd %xmm4, %xmm7
-; SSE2-NEXT:    pxor %xmm4, %xmm7
-; SSE2-NEXT:    movdqa %xmm6, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    paddd %xmm4, %xmm6
-; SSE2-NEXT:    pxor %xmm4, %xmm6
-; SSE2-NEXT:    movdqa %xmm3, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    paddd %xmm4, %xmm3
-; SSE2-NEXT:    pxor %xmm4, %xmm3
-; SSE2-NEXT:    movdqa %xmm0, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    paddd %xmm4, %xmm0
-; SSE2-NEXT:    pxor %xmm4, %xmm0
-; SSE2-NEXT:    movdqa %xmm1, %xmm4
-; SSE2-NEXT:    psrad $31, %xmm4
-; SSE2-NEXT:    paddd %xmm4, %xmm1
-; SSE2-NEXT:    pxor %xmm4, %xmm1
-; SSE2-NEXT:    movdqa %xmm10, %xmm4
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7]
+; SSE2-NEXT:    psubd %xmm2, %xmm8
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3]
+; SSE2-NEXT:    psubd %xmm4, %xmm10
+; SSE2-NEXT:    movdqa %xmm10, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    paddd %xmm2, %xmm10
+; SSE2-NEXT:    pxor %xmm2, %xmm10
+; SSE2-NEXT:    movdqa %xmm8, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    paddd %xmm2, %xmm8
+; SSE2-NEXT:    pxor %xmm2, %xmm8
+; SSE2-NEXT:    movdqa %xmm5, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    paddd %xmm2, %xmm5
+; SSE2-NEXT:    pxor %xmm2, %xmm5
+; SSE2-NEXT:    movdqa %xmm3, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    paddd %xmm2, %xmm3
+; SSE2-NEXT:    pxor %xmm2, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    paddd %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm0
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    paddd %xmm2, %xmm1
+; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    movdqa %xmm7, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    paddd %xmm2, %xmm7
+; SSE2-NEXT:    pxor %xmm2, %xmm7
+; SSE2-NEXT:    movdqa %xmm6, %xmm2
+; SSE2-NEXT:    psrad $31, %xmm2
+; SSE2-NEXT:    paddd %xmm2, %xmm6
+; SSE2-NEXT:    pxor %xmm2, %xmm6
+; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    paddd %xmm6, %xmm14
+; SSE2-NEXT:    paddd %xmm7, %xmm13
 ; SSE2-NEXT:    paddd %xmm1, %xmm15
 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    paddd %xmm0, %xmm13
+; SSE2-NEXT:    paddd %xmm0, %xmm12
 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    paddd %xmm3, %xmm4
-; SSE2-NEXT:    paddd %xmm6, %xmm0
-; SSE2-NEXT:    paddd %xmm7, %xmm14
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT:    paddd %xmm5, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; SSE2-NEXT:    paddd %xmm8, %xmm1
+; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT:    paddd %xmm2, %xmm3
-; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    paddd %xmm5, %xmm2
+; SSE2-NEXT:    paddd %xmm8, %xmm3
+; SSE2-NEXT:    paddd %xmm10, %xmm0
 ; SSE2-NEXT:    addq $4, %rax
 ; SSE2-NEXT:    jne .LBB1_1
 ; SSE2-NEXT:  # BB#2: # %middle.block
-; SSE2-NEXT:    paddd %xmm15, %xmm4
+; SSE2-NEXT:    paddd %xmm15, %xmm3
 ; SSE2-NEXT:    paddd %xmm14, %xmm1
-; SSE2-NEXT:    paddd %xmm13, %xmm0
-; SSE2-NEXT:    paddd %xmm5, %xmm2
-; SSE2-NEXT:    paddd %xmm4, %xmm1
+; SSE2-NEXT:    paddd %xmm12, %xmm0
+; SSE2-NEXT:    paddd %xmm13, %xmm2
+; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    paddd %xmm2, %xmm1
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
@@ -330,6 +326,7 @@ define i32 @sad_32i8() nounwind {
 ; AVX512F-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
 ; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: sad_32i8:
@@ -357,6 +354,7 @@ define i32 @sad_32i8() nounwind {
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 entry:
   br label %vector.body
@@ -400,291 +398,288 @@ middle.block:
 define i32 @sad_avx64i8() nounwind {
 ; SSE2-LABEL: sad_avx64i8:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    subq $216, %rsp
-; SSE2-NEXT:    pxor %xmm6, %xmm6
+; SSE2-NEXT:    subq $184, %rsp
+; SSE2-NEXT:    pxor %xmm15, %xmm15
 ; SSE2-NEXT:    movq $-1024, %rax # imm = 0xFC00
+; SSE2-NEXT:    pxor %xmm12, %xmm12
+; SSE2-NEXT:    pxor %xmm8, %xmm8
+; SSE2-NEXT:    pxor %xmm4, %xmm4
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    pxor %xmm14, %xmm14
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    pxor %xmm3, %xmm3
-; SSE2-NEXT:    pxor %xmm2, %xmm2
-; SSE2-NEXT:    pxor %xmm13, %xmm13
+; SSE2-NEXT:    pxor %xmm6, %xmm6
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    pxor %xmm10, %xmm10
-; SSE2-NEXT:    pxor %xmm12, %xmm12
+; SSE2-NEXT:    pxor %xmm11, %xmm11
 ; SSE2-NEXT:    pxor %xmm0, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    pxor %xmm7, %xmm7
+; SSE2-NEXT:    pxor %xmm13, %xmm13
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pxor %xmm15, %xmm15
-; SSE2-NEXT:    pxor %xmm11, %xmm11
-; SSE2-NEXT:    pxor %xmm8, %xmm8
-; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    pxor %xmm3, %xmm3
 ; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    pxor %xmm0, %xmm0
 ; SSE2-NEXT:    .p2align 4, 0x90
 ; SSE2-NEXT:  .LBB2_1: # %vector.body
 ; SSE2-NEXT:    # =>This Inner Loop Header: Depth=1
-; SSE2-NEXT:    movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa a+1040(%rax), %xmm13
-; SSE2-NEXT:    movdqa a+1024(%rax), %xmm12
-; SSE2-NEXT:    movdqa a+1056(%rax), %xmm10
-; SSE2-NEXT:    movdqa a+1072(%rax), %xmm8
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm10[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm13, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm11, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3],xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7]
-; SSE2-NEXT:    movdqa %xmm10, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm12[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3],xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm13[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; SSE2-NEXT:    movdqa %xmm4, %xmm1
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
+; SSE2-NEXT:    movdqa %xmm14, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm8, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa a+1040(%rax), %xmm6
+; SSE2-NEXT:    movdqa a+1024(%rax), %xmm4
+; SSE2-NEXT:    movdqa a+1056(%rax), %xmm11
+; SSE2-NEXT:    movdqa a+1072(%rax), %xmm3
+; SSE2-NEXT:    movdqa %xmm3, %xmm5
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
+; SSE2-NEXT:    movdqa %xmm11, %xmm1
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7]
+; SSE2-NEXT:    movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3]
+; SSE2-NEXT:    movdqa %xmm4, %xmm12
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
 ; SSE2-NEXT:    movdqa %xmm12, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; SSE2-NEXT:    movdqa %xmm0, %xmm15
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm6[0],xmm13[1],xmm6[1],xmm13[2],xmm6[2],xmm13[3],xmm6[3],xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7]
-; SSE2-NEXT:    movdqa %xmm13, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm6[4],xmm13[5],xmm6[5],xmm13[6],xmm6[6],xmm13[7],xmm6[7]
-; SSE2-NEXT:    movdqa b+1040(%rax), %xmm7
-; SSE2-NEXT:    movdqa b+1024(%rax), %xmm11
-; SSE2-NEXT:    movdqa b+1056(%rax), %xmm9
-; SSE2-NEXT:    pshufd {{.*#+}} xmm5 = xmm7[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSE2-NEXT:    movdqa %xmm7, %xmm2
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSE2-NEXT:    psubd %xmm7, %xmm13
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm11[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm6[0],xmm11[1],xmm6[1],xmm11[2],xmm6[2],xmm11[3],xmm6[3],xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; SSE2-NEXT:    psubd %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm11, %xmm2
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
-; SSE2-NEXT:    psubd %xmm11, %xmm12
-; SSE2-NEXT:    pshufd {{.*#+}} xmm14 = xmm9[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; SSE2-NEXT:    psubd %xmm2, %xmm15
-; SSE2-NEXT:    movdqa %xmm15, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm9, %xmm2
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15]
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7]
+; SSE2-NEXT:    movdqa %xmm6, %xmm14
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
+; SSE2-NEXT:    movdqa %xmm14, %xmm7
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15]
+; SSE2-NEXT:    movdqa %xmm6, %xmm8
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7]
+; SSE2-NEXT:    movdqa b+1040(%rax), %xmm9
+; SSE2-NEXT:    movdqa %xmm9, %xmm13
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15]
+; SSE2-NEXT:    movdqa %xmm9, %xmm10
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
+; SSE2-NEXT:    psubd %xmm9, %xmm6
+; SSE2-NEXT:    movdqa b+1024(%rax), %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3]
+; SSE2-NEXT:    psubd %xmm10, %xmm8
+; SSE2-NEXT:    movdqa %xmm13, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
+; SSE2-NEXT:    psubd %xmm13, %xmm14
+; SSE2-NEXT:    movdqa %xmm2, %xmm10
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
+; SSE2-NEXT:    psubd %xmm9, %xmm7
+; SSE2-NEXT:    movdqa %xmm2, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
+; SSE2-NEXT:    psubd %xmm2, %xmm4
+; SSE2-NEXT:    movdqa b+1056(%rax), %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
+; SSE2-NEXT:    psubd %xmm9, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm10, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7]
+; SSE2-NEXT:    psubd %xmm10, %xmm12
+; SSE2-NEXT:    movdqa %xmm2, %xmm10
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3]
+; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    psubd %xmm9, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm2, %xmm9
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
+; SSE2-NEXT:    psubd %xmm2, %xmm11
+; SSE2-NEXT:    movdqa %xmm1, %xmm13
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3]
+; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
+; SSE2-NEXT:    psubd %xmm9, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm10, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3]
+; SSE2-NEXT:    psubd %xmm10, %xmm1
+; SSE2-NEXT:    movdqa %xmm3, %xmm10
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
+; SSE2-NEXT:    psubd %xmm2, %xmm13
+; SSE2-NEXT:    movdqa b+1072(%rax), %xmm2
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
+; SSE2-NEXT:    movdqa %xmm2, %xmm9
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3]
+; SSE2-NEXT:    psubd %xmm2, %xmm3
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
 ; SSE2-NEXT:    psubd %xmm9, %xmm10
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15]
+; SSE2-NEXT:    movdqa %xmm5, %xmm9
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15]
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3]
+; SSE2-NEXT:    psubd %xmm0, %xmm5
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7]
+; SSE2-NEXT:    psubd %xmm2, %xmm9
+; SSE2-NEXT:    movdqa %xmm9, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm9
+; SSE2-NEXT:    pxor %xmm0, %xmm9
 ; SSE2-NEXT:    movdqa %xmm5, %xmm0
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; SSE2-NEXT:    psubd %xmm5, %xmm4
-; SSE2-NEXT:    movdqa %xmm3, %xmm15
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; SSE2-NEXT:    psubd %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm5
+; SSE2-NEXT:    pxor %xmm0, %xmm5
+; SSE2-NEXT:    movdqa %xmm10, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm10
+; SSE2-NEXT:    pxor %xmm0, %xmm10
+; SSE2-NEXT:    movdqa %xmm3, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm0, %xmm3
+; SSE2-NEXT:    movdqa %xmm13, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm13
+; SSE2-NEXT:    pxor %xmm0, %xmm13
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm1
+; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm11, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm11
+; SSE2-NEXT:    pxor %xmm0, %xmm11
+; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm12, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm12
+; SSE2-NEXT:    pxor %xmm0, %xmm12
+; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm2
+; SSE2-NEXT:    pxor %xmm0, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm4, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm4
+; SSE2-NEXT:    pxor %xmm0, %xmm4
 ; SSE2-NEXT:    movdqa %xmm7, %xmm0
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSE2-NEXT:    psubd %xmm7, %xmm3
-; SSE2-NEXT:    pshufd {{.*#+}} xmm7 = xmm8[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7]
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm6[0],xmm15[1],xmm6[1],xmm15[2],xmm6[2],xmm15[3],xmm6[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; SSE2-NEXT:    psubd %xmm0, %xmm15
-; SSE2-NEXT:    movdqa %xmm1, %xmm11
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3]
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3],xmm14[4],xmm6[4],xmm14[5],xmm6[5],xmm14[6],xmm6[6],xmm14[7],xmm6[7]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; SSE2-NEXT:    psubd %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm7
+; SSE2-NEXT:    pxor %xmm0, %xmm7
 ; SSE2-NEXT:    movdqa %xmm14, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm6[0],xmm14[1],xmm6[1],xmm14[2],xmm6[2],xmm14[3],xmm6[3]
-; SSE2-NEXT:    psubd %xmm14, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, %xmm14
-; SSE2-NEXT:    movdqa %xmm8, %xmm9
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm6[4],xmm11[5],xmm6[5],xmm11[6],xmm6[6],xmm11[7],xmm6[7]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
-; SSE2-NEXT:    psubd %xmm0, %xmm11
-; SSE2-NEXT:    movdqa b+1072(%rax), %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
-; SSE2-NEXT:    movdqa %xmm0, %xmm5
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; SSE2-NEXT:    psubd %xmm0, %xmm8
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; SSE2-NEXT:    psubd %xmm5, %xmm9
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; SSE2-NEXT:    movdqa %xmm7, %xmm0
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm5
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3]
-; SSE2-NEXT:    psubd %xmm2, %xmm7
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; SSE2-NEXT:    psubd %xmm5, %xmm0
-; SSE2-NEXT:    movdqa %xmm0, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm0
-; SSE2-NEXT:    pxor %xmm2, %xmm0
-; SSE2-NEXT:    movdqa %xmm7, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm7
-; SSE2-NEXT:    pxor %xmm2, %xmm7
-; SSE2-NEXT:    movdqa %xmm9, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm9
-; SSE2-NEXT:    pxor %xmm2, %xmm9
-; SSE2-NEXT:    movdqa %xmm8, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm8
-; SSE2-NEXT:    pxor %xmm2, %xmm8
-; SSE2-NEXT:    movdqa %xmm11, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm11
-; SSE2-NEXT:    pxor %xmm2, %xmm11
-; SSE2-NEXT:    movdqa %xmm14, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm14
-; SSE2-NEXT:    pxor %xmm2, %xmm14
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm15, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm15
-; SSE2-NEXT:    pxor %xmm2, %xmm15
-; SSE2-NEXT:    movdqa %xmm3, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm3
-; SSE2-NEXT:    pxor %xmm2, %xmm3
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm4, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm4
-; SSE2-NEXT:    pxor %xmm2, %xmm4
-; SSE2-NEXT:    movdqa %xmm10, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm10
-; SSE2-NEXT:    pxor %xmm2, %xmm10
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm14
+; SSE2-NEXT:    pxor %xmm0, %xmm14
+; SSE2-NEXT:    movdqa %xmm8, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm8
+; SSE2-NEXT:    pxor %xmm0, %xmm8
+; SSE2-NEXT:    movdqa %xmm6, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm0
+; SSE2-NEXT:    paddd %xmm0, %xmm6
+; SSE2-NEXT:    pxor %xmm0, %xmm6
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    paddd %xmm6, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
+; SSE2-NEXT:    paddd %xmm8, %xmm6
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
+; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    paddd %xmm14, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    paddd %xmm7, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload
+; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    paddd %xmm4, %xmm2
+; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
+; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload
+; SSE2-NEXT:    paddd %xmm12, %xmm8
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
+; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT:    movdqa %xmm0, %xmm12
+; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    paddd %xmm11, %xmm0
+; SSE2-NEXT:    movdqa %xmm0, (%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa (%rsp), %xmm11 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
+; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload
+; SSE2-NEXT:    paddd %xmm1, %xmm2
+; SSE2-NEXT:    paddd %xmm13, %xmm7
 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm1
+; SSE2-NEXT:    paddd %xmm3, %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa %xmm12, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm12
-; SSE2-NEXT:    pxor %xmm2, %xmm12
+; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
 ; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    movdqa %xmm1, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm1
-; SSE2-NEXT:    pxor %xmm2, %xmm1
-; SSE2-NEXT:    movdqa %xmm13, %xmm2
-; SSE2-NEXT:    psrad $31, %xmm2
-; SSE2-NEXT:    paddd %xmm2, %xmm13
-; SSE2-NEXT:    pxor %xmm2, %xmm13
+; SSE2-NEXT:    paddd %xmm10, %xmm1
+; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT:    paddd %xmm5, %xmm3
 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; SSE2-NEXT:    paddd %xmm13, %xmm5
+; SSE2-NEXT:    paddd %xmm9, %xmm5
 ; SSE2-NEXT:    movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload
-; SSE2-NEXT:    paddd %xmm1, %xmm13
 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; SSE2-NEXT:    paddd %xmm12, %xmm5
-; SSE2-NEXT:    movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    paddd %xmm10, %xmm1
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; SSE2-NEXT:    paddd %xmm4, %xmm5
-; SSE2-NEXT:    movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm10 # 16-byte Reload
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm12 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload
-; SSE2-NEXT:    paddd %xmm3, %xmm2
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
-; SSE2-NEXT:    paddd %xmm15, %xmm3
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm15 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload
-; SSE2-NEXT:    paddd %xmm14, %xmm15
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; SSE2-NEXT:    paddd %xmm11, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm11 # 16-byte Reload
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; SSE2-NEXT:    paddd %xmm8, %xmm4
-; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload
-; SSE2-NEXT:    paddd %xmm9, %xmm4
-; SSE2-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; SSE2-NEXT:    paddd %xmm7, %xmm5
-; SSE2-NEXT:    movdqa (%rsp), %xmm7 # 16-byte Reload
-; SSE2-NEXT:    paddd %xmm0, %xmm7
-; SSE2-NEXT:    movdqa %xmm7, (%rsp) # 16-byte Spill
-; SSE2-NEXT:    movdqa (%rsp), %xmm0 # 16-byte Reload
 ; SSE2-NEXT:    addq $4, %rax
 ; SSE2-NEXT:    jne .LBB2_1
 ; SSE2-NEXT:  # BB#2: # %middle.block
-; SSE2-NEXT:    paddd %xmm15, %xmm3
-; SSE2-NEXT:    paddd %xmm5, %xmm10
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload
-; SSE2-NEXT:    paddd -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload
-; SSE2-NEXT:    paddd %xmm8, %xmm13
+; SSE2-NEXT:    paddd %xmm2, %xmm4
+; SSE2-NEXT:    paddd %xmm3, %xmm6
+; SSE2-NEXT:    movdqa %xmm12, %xmm2
 ; SSE2-NEXT:    paddd %xmm11, %xmm2
-; SSE2-NEXT:    paddd %xmm0, %xmm12
-; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload
-; SSE2-NEXT:    paddd %xmm1, %xmm5
+; SSE2-NEXT:    paddd %xmm13, %xmm14
+; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload
+; SSE2-NEXT:    paddd %xmm7, %xmm3
+; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload
+; SSE2-NEXT:    paddd %xmm5, %xmm7
+; SSE2-NEXT:    paddd %xmm0, %xmm8
 ; SSE2-NEXT:    movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
-; SSE2-NEXT:    paddd %xmm4, %xmm0
-; SSE2-NEXT:    paddd %xmm2, %xmm12
-; SSE2-NEXT:    paddd %xmm3, %xmm10
-; SSE2-NEXT:    paddd %xmm13, %xmm10
-; SSE2-NEXT:    paddd %xmm0, %xmm12
-; SSE2-NEXT:    paddd %xmm5, %xmm12
-; SSE2-NEXT:    paddd %xmm10, %xmm12
-; SSE2-NEXT:    paddd %xmm6, %xmm12
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm12[2,3,0,1]
-; SSE2-NEXT:    paddd %xmm12, %xmm0
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    paddd %xmm3, %xmm7
+; SSE2-NEXT:    paddd %xmm4, %xmm6
+; SSE2-NEXT:    paddd %xmm14, %xmm6
+; SSE2-NEXT:    paddd %xmm0, %xmm7
+; SSE2-NEXT:    paddd %xmm8, %xmm7
+; SSE2-NEXT:    paddd %xmm6, %xmm7
+; SSE2-NEXT:    paddd %xmm2, %xmm7
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[2,3,0,1]
+; SSE2-NEXT:    paddd %xmm7, %xmm0
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
 ; SSE2-NEXT:    movd %xmm1, %eax
-; SSE2-NEXT:    addq $216, %rsp
+; SSE2-NEXT:    addq $184, %rsp
 ; SSE2-NEXT:    retq
 ;
 ; AVX2-LABEL: sad_avx64i8:
@@ -808,6 +803,7 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX512F-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
 ; AVX512F-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: sad_avx64i8:
@@ -836,6 +832,7 @@ define i32 @sad_avx64i8() nounwind {
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} zmm1 = zmm0[1,1,2,3,5,5,6,7,9,9,10,11,13,13,14,15]
 ; AVX512BW-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 entry:
   br label %vector.body
@@ -1156,100 +1153,100 @@ define i32 @sad_nonloop_16i8(<16 x i8>* nocapture readonly %p, i64, <16 x i8>* n
 define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* nocapture readonly %q) local_unnamed_addr #0 {
 ; SSE2-LABEL: sad_nonloop_32i8:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movdqu (%rdi), %xmm12
-; SSE2-NEXT:    movdqu 16(%rdi), %xmm2
-; SSE2-NEXT:    pshufd {{.*#+}} xmm13 = xmm2[2,3,0,1]
-; SSE2-NEXT:    pxor %xmm5, %xmm5
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7]
-; SSE2-NEXT:    movdqa %xmm13, %xmm9
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm12[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7]
-; SSE2-NEXT:    movdqa %xmm1, %xmm3
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7]
-; SSE2-NEXT:    movdqa %xmm2, %xmm10
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7]
-; SSE2-NEXT:    movdqa %xmm12, %xmm11
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3]
-; SSE2-NEXT:    movdqu (%rdx), %xmm7
-; SSE2-NEXT:    movdqu 16(%rdx), %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm6 = xmm0[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; SSE2-NEXT:    movdqa %xmm6, %xmm4
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill
-; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm7[2,3,0,1]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; SSE2-NEXT:    movdqa %xmm4, %xmm14
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
-; SSE2-NEXT:    movdqa %xmm0, %xmm15
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
-; SSE2-NEXT:    movdqa %xmm7, %xmm8
-; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
-; SSE2-NEXT:    psubd %xmm7, %xmm12
-; SSE2-NEXT:    psubd %xmm0, %xmm2
-; SSE2-NEXT:    psubd %xmm4, %xmm1
-; SSE2-NEXT:    psubd %xmm6, %xmm13
-; SSE2-NEXT:    psubd %xmm8, %xmm11
-; SSE2-NEXT:    psubd %xmm15, %xmm10
-; SSE2-NEXT:    psubd %xmm14, %xmm3
+; SSE2-NEXT:    movdqu (%rdi), %xmm0
+; SSE2-NEXT:    movdqu 16(%rdi), %xmm3
+; SSE2-NEXT:    pxor %xmm4, %xmm4
+; SSE2-NEXT:    movdqa %xmm3, %xmm12
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm12, %xmm9
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm0, %xmm13
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3],xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm13, %xmm10
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSE2-NEXT:    movdqa %xmm3, %xmm11
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15]
+; SSE2-NEXT:    movdqa %xmm0, %xmm6
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
+; SSE2-NEXT:    movdqu (%rdx), %xmm5
+; SSE2-NEXT:    movdqu 16(%rdx), %xmm7
+; SSE2-NEXT:    movdqa %xmm7, %xmm1
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm2
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill
+; SSE2-NEXT:    movdqa %xmm5, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; SSE2-NEXT:    movdqa %xmm2, %xmm14
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15]
+; SSE2-NEXT:    movdqa %xmm7, %xmm15
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15]
+; SSE2-NEXT:    movdqa %xmm5, %xmm8
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
+; SSE2-NEXT:    psubd %xmm5, %xmm0
+; SSE2-NEXT:    psubd %xmm7, %xmm3
+; SSE2-NEXT:    psubd %xmm2, %xmm13
+; SSE2-NEXT:    psubd %xmm1, %xmm12
+; SSE2-NEXT:    psubd %xmm8, %xmm6
+; SSE2-NEXT:    psubd %xmm15, %xmm11
+; SSE2-NEXT:    psubd %xmm14, %xmm10
 ; SSE2-NEXT:    psubd -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload
-; SSE2-NEXT:    movdqa %xmm9, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    paddd %xmm0, %xmm9
-; SSE2-NEXT:    pxor %xmm0, %xmm9
-; SSE2-NEXT:    movdqa %xmm3, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    paddd %xmm0, %xmm3
-; SSE2-NEXT:    pxor %xmm0, %xmm3
-; SSE2-NEXT:    movdqa %xmm10, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    paddd %xmm0, %xmm10
-; SSE2-NEXT:    pxor %xmm0, %xmm10
-; SSE2-NEXT:    movdqa %xmm11, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    paddd %xmm0, %xmm11
-; SSE2-NEXT:    pxor %xmm0, %xmm11
-; SSE2-NEXT:    movdqa %xmm13, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    paddd %xmm0, %xmm13
-; SSE2-NEXT:    pxor %xmm0, %xmm13
-; SSE2-NEXT:    movdqa %xmm1, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    pxor %xmm0, %xmm1
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    paddd %xmm0, %xmm2
-; SSE2-NEXT:    pxor %xmm0, %xmm2
-; SSE2-NEXT:    movdqa %xmm12, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    paddd %xmm0, %xmm12
-; SSE2-NEXT:    pxor %xmm0, %xmm12
-; SSE2-NEXT:    paddd %xmm13, %xmm1
-; SSE2-NEXT:    paddd %xmm9, %xmm3
-; SSE2-NEXT:    paddd %xmm10, %xmm3
-; SSE2-NEXT:    paddd %xmm11, %xmm3
-; SSE2-NEXT:    paddd %xmm2, %xmm1
-; SSE2-NEXT:    paddd %xmm3, %xmm1
-; SSE2-NEXT:    paddd %xmm12, %xmm1
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm9, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    paddd %xmm1, %xmm9
+; SSE2-NEXT:    pxor %xmm1, %xmm9
+; SSE2-NEXT:    movdqa %xmm10, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    paddd %xmm1, %xmm10
+; SSE2-NEXT:    pxor %xmm1, %xmm10
+; SSE2-NEXT:    movdqa %xmm11, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    paddd %xmm1, %xmm11
+; SSE2-NEXT:    pxor %xmm1, %xmm11
+; SSE2-NEXT:    movdqa %xmm6, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    paddd %xmm1, %xmm6
+; SSE2-NEXT:    pxor %xmm1, %xmm6
+; SSE2-NEXT:    movdqa %xmm12, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    paddd %xmm1, %xmm12
+; SSE2-NEXT:    pxor %xmm1, %xmm12
+; SSE2-NEXT:    movdqa %xmm13, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    paddd %xmm1, %xmm13
+; SSE2-NEXT:    pxor %xmm1, %xmm13
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    paddd %xmm1, %xmm3
+; SSE2-NEXT:    pxor %xmm1, %xmm3
+; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    paddd %xmm1, %xmm0
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
+; SSE2-NEXT:    pxor %xmm1, %xmm0
+; SSE2-NEXT:    paddd %xmm3, %xmm0
+; SSE2-NEXT:    paddd %xmm11, %xmm6
+; SSE2-NEXT:    paddd %xmm9, %xmm6
+; SSE2-NEXT:    paddd %xmm10, %xmm6
+; SSE2-NEXT:    paddd %xmm12, %xmm0
+; SSE2-NEXT:    paddd %xmm6, %xmm0
+; SSE2-NEXT:    paddd %xmm13, %xmm0
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; SSE2-NEXT:    paddd %xmm0, %xmm1
-; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
+; SSE2-NEXT:    paddd %xmm1, %xmm0
+; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    retq
 ;
 ; AVX2-LABEL: sad_nonloop_32i8:
@@ -1273,6 +1270,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
 ; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512F-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: sad_nonloop_32i8:
@@ -1284,6 +1282,7 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n
 ; AVX512BW-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512BW-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vmovd %xmm0, %eax
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %v1 = load <32 x i8>, <32 x i8>* %p, align 1
   %z1 = zext <32 x i8> %v1 to <32 x i32>
diff --git a/test/CodeGen/X86/sad_variations.ll b/test/CodeGen/X86/sad_variations.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1d826cf41a4d0c6da1d4d6f940748d3177e38827
--- /dev/null
+++ b/test/CodeGen/X86/sad_variations.ll
@@ -0,0 +1,347 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX512F
+
+define i32 @sad8_32bit_icmp_sge(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #0 {
+; SSE2-LABEL: sad8_32bit_icmp_sge:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_32bit_icmp_sge:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_32bit_icmp_sge:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    retq
+
+entry:
+  %idx.ext = zext i32 %stride to i64
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp sgt <8 x i32> %6, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8
+  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %9, %rdx.shuf
+  %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
+  %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
+  %10 = extractelement <8 x i32> %bin.rdx232, i32 0
+  ret i32 %10
+}
+
+define i32 @sad8_32bit_icmp_sgt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #1 {
+; SSE2-LABEL: sad8_32bit_icmp_sgt:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_32bit_icmp_sgt:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_32bit_icmp_sgt:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    retq
+entry:
+  %idx.ext = zext i32 %stride to i64
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp sgt <8 x i32> %6, zeroinitializer
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %6, <8 x i32> %8
+  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %9, %rdx.shuf
+  %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
+  %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
+  %10 = extractelement <8 x i32> %bin.rdx232, i32 0
+  ret i32 %10
+}
+
+define i32 @sad8_32bit_icmp_sle(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #2 {
+; SSE2-LABEL: sad8_32bit_icmp_sle:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_32bit_icmp_sle:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_32bit_icmp_sle:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    retq
+entry:
+  %idx.ext = zext i32 %stride to i64
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp slt <8 x i32> %6, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
+  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %9, %rdx.shuf
+  %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
+  %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
+  %10 = extractelement <8 x i32> %bin.rdx232, i32 0
+  ret i32 %10
+}
+
+define i32 @sad8_32bit_icmp_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i32 %stride) local_unnamed_addr #3 {
+; SSE2-LABEL: sad8_32bit_icmp_slt:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %eax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_32bit_icmp_slt:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_32bit_icmp_slt:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    retq
+entry:
+  %idx.ext = zext i32 %stride to i64
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp slt <8 x i32> %6, zeroinitializer
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
+  %rdx.shuf = shufflevector <8 x i32> %9, <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i32> %9, %rdx.shuf
+  %rdx.shuf229 = shufflevector <8 x i32> %bin.rdx, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx230 = add <8 x i32> %bin.rdx, %rdx.shuf229
+  %rdx.shuf231 = shufflevector <8 x i32> %bin.rdx230, <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx232 = add <8 x i32> %bin.rdx230, %rdx.shuf231
+  %10 = extractelement <8 x i32> %bin.rdx232, i32 0
+  ret i32 %10
+}
+
+define i64 @sad8_64bit_icmp_sext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
+; SSE2-LABEL: sad8_64bit_icmp_sext_slt:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_64bit_icmp_sext_slt:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_64bit_icmp_sext_slt:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    retq
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp slt <8 x i32> %6, zeroinitializer
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
+  %10 = sext <8 x i32> %9 to <8 x i64>
+  %rdx.shuf = shufflevector <8 x i64> %10, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i64> %rdx.shuf, %10
+  %rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
+  %rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
+  %11 = extractelement <8 x i64> %bin.rdx239, i32 0
+  ret i64 %11
+}
+
+define i64 @sad8_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
+; SSE2-LABEL: sad8_64bit_icmp_zext_slt:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_64bit_icmp_zext_slt:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_64bit_icmp_zext_slt:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    retq
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i32>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i32>
+  %6 = sub nsw <8 x i32> %2, %5
+  %7 = icmp slt <8 x i32> %6, zeroinitializer
+  %8 = sub nsw <8 x i32> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i32> %8, <8 x i32> %6
+  %10 = zext <8 x i32> %9 to <8 x i64>
+  %rdx.shuf = shufflevector <8 x i64> %10, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i64> %rdx.shuf, %10
+  %rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
+  %rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
+  %11 = extractelement <8 x i64> %bin.rdx239, i32 0
+  ret i64 %11
+}
+
+define i64 @sad8_early_64bit_icmp_zext_slt(i8* nocapture readonly %cur, i8* nocapture readonly %ref, i64 %stride) local_unnamed_addr #4 {
+; SSE2-LABEL: sad8_early_64bit_icmp_zext_slt:
+; SSE2:       # BB#0: # %entry
+; SSE2-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE2-NEXT:    psadbw %xmm0, %xmm1
+; SSE2-NEXT:    movd %xmm1, %rax
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: sad8_early_64bit_icmp_zext_slt:
+; AVX2:       # BB#0: # %entry
+; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: sad8_early_64bit_icmp_zext_slt:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512F-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-NEXT:    vpsadbw %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT:    vmovq %xmm0, %rax
+; AVX512F-NEXT:    retq
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry
+  %0 = bitcast i8* %cur to <8 x i8>*
+  %1 = load <8 x i8>, <8 x i8>* %0, align 1
+  %2 = zext <8 x i8> %1 to <8 x i64>
+  %3 = bitcast i8* %ref to <8 x i8>*
+  %4 = load <8 x i8>, <8 x i8>* %3, align 1
+  %5 = zext <8 x i8> %4 to <8 x i64>
+  %6 = sub nsw <8 x i64> %2, %5
+  %7 = icmp slt <8 x i64> %6, zeroinitializer
+  %8 = sub nsw <8 x i64> zeroinitializer, %6
+  %9 = select <8 x i1> %7, <8 x i64> %8, <8 x i64> %6
+  %rdx.shuf = shufflevector <8 x i64> %9, <8 x i64> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx = add <8 x i64> %rdx.shuf, %9
+  %rdx.shuf236 = shufflevector <8 x i64> %bin.rdx, <8 x i64> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx237 = add <8 x i64> %bin.rdx, %rdx.shuf236
+  %rdx.shuf238 = shufflevector <8 x i64> %bin.rdx237, <8 x i64> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %bin.rdx239 = add <8 x i64> %bin.rdx237, %rdx.shuf238
+  %10 = extractelement <8 x i64> %bin.rdx239, i32 0
+  ret i64 %10
+}
diff --git a/test/CodeGen/X86/safestack.ll b/test/CodeGen/X86/safestack.ll
index 1ff9a050aefbde0385ea6e2cb41b7fc12a5953ea..bd8f57f5e3c94d2b40de9390fe995738ba6542d9 100644
--- a/test/CodeGen/X86/safestack.ll
+++ b/test/CodeGen/X86/safestack.ll
@@ -2,6 +2,7 @@
 ; RUN: llc -mtriple=x86_64-linux < %s -o - | FileCheck --check-prefix=LINUX-X64 %s
 ; RUN: llc -mtriple=i386-linux-android < %s -o - | FileCheck --check-prefix=ANDROID-I386 %s
 ; RUN: llc -mtriple=x86_64-linux-android < %s -o - | FileCheck --check-prefix=ANDROID-X64 %s
+; RUN: llc -mtriple=x86_64-fuchsia < %s -o - | FileCheck --check-prefix=FUCHSIA-X64 %s
 
 define void @_Z1fv() safestack {
 entry:
@@ -30,3 +31,7 @@ declare void @_Z7CapturePi(i32*)
 ; ANDROID-X64: movq %fs:72, %[[A:.*]]
 ; ANDROID-X64: leaq -16(%[[A]]), %[[B:.*]]
 ; ANDROID-X64: movq %[[B]], %fs:72
+
+; FUCHSIA-X64: movq %fs:24, %[[A:.*]]
+; FUCHSIA-X64: leaq -16(%[[A]]), %[[B:.*]]
+; FUCHSIA-X64: movq %[[B]], %fs:24
diff --git a/test/CodeGen/X86/safestack_ssp.ll b/test/CodeGen/X86/safestack_ssp.ll
index 5a1a465158cf3b64c6013bdfc610e74282fa9564..a0415cc98feb5eab1fc411f8efec15fee5aaf374 100644
--- a/test/CodeGen/X86/safestack_ssp.ll
+++ b/test/CodeGen/X86/safestack_ssp.ll
@@ -1,6 +1,7 @@
 ; Test codegen pipeline for SafeStack + StackProtector combination.
 ; RUN: llc -mtriple=i386-linux < %s -o - | FileCheck --check-prefix=LINUX-I386 %s
 ; RUN: llc -mtriple=x86_64-linux < %s -o - | FileCheck --check-prefix=LINUX-X64 %s
+; RUN: llc -mtriple=x86_64-fuchsia < %s -o - | FileCheck --check-prefix=FUCHSIA-X64 %s
 
 define void @_Z1fv() safestack sspreq {
 entry:
@@ -25,3 +26,9 @@ declare void @_Z7CapturePi(i32*)
 ; LINUX-I386-DAG: leal -16(%[[B]]), %[[C:.*]]
 ; LINUX-I386-DAG: movl %[[C]], %gs:(%[[A]])
 ; LINUX-I386-DAG: movl %[[COOKIE]], -4(%[[B]])
+
+; FUCHSIA-X64-DAG: movq %fs:24, %[[B:.*]]
+; FUCHSIA-X64-DAG: movq %fs:16, %[[COOKIE:.*]]
+; FUCHSIA-X64-DAG: leaq -16(%[[B]]), %[[C:.*]]
+; FUCHSIA-X64-DAG: movq %[[C]], %fs:24
+; FUCHSIA-X64-DAG: movq %[[COOKIE]], -8(%[[B]])
diff --git a/test/CodeGen/X86/scalar-int-to-fp.ll b/test/CodeGen/X86/scalar-int-to-fp.ll
index 47774e2289f619a0da18b3f1836e7f66254f7536..2b19d02ba8b57d28b9aaad15f1282c40d3cd7e3a 100644
--- a/test/CodeGen/X86/scalar-int-to-fp.ll
+++ b/test/CodeGen/X86/scalar-int-to-fp.ll
@@ -1,175 +1,736 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i386-unknown-unknown     -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK32 --check-prefix=AVX512_32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown   -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK64 --check-prefix=AVX512_64
+; RUN: llc < %s -mtriple=i386-unknown-unknown     -mattr=+sse2    | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK32 --check-prefix=SSE2_32
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown   -mattr=+sse2    | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK64 --check-prefix=SSE2_64
+; RUN: llc < %s -mtriple=i386-unknown-unknown     -mattr=-sse     | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK32  --check-prefix=X87
+
 ; Verify that scalar integer conversions to FP compile successfully
 ; (at one time long double failed with avx512f), and that reasonable
 ; instruction sequences are selected based on subtarget features.
-; Due to the plethora of reasonable sequences we just check for
-; one key instruction, usually a cvt or fild, allowing the test
-; to be relatively easily updated when sequences are improved.
-;
-; RUN: llc < %s -mtriple=i386-unknown-unknown     -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512_32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown   -mattr=+avx512f | FileCheck %s --check-prefix=CHECK --check-prefix=AVX512_64
-; RUN: llc < %s -mtriple=i386-unknown-unknown     -mattr=+sse2    | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2_32
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown   -mattr=+sse2    | FileCheck %s --check-prefix=CHECK --check-prefix=SSE2_64
-; RUN: llc < %s -mtriple=i386-unknown-unknown     -mattr=-sse     | FileCheck %s --check-prefix=CHECK --check-prefix=X87
 
-; CHECK-LABEL: u32_to_f
-; AVX512_32: vcvtusi2ssl
-; AVX512_64: vcvtusi2ssl
-; SSE2_32: cvtsd2ss
-; SSE2_64: cvtsi2ssq
-; X87: fildll
 define float @u32_to_f(i32 %a) nounwind {
+; AVX512_32-LABEL: u32_to_f:
+; AVX512_32:       # BB#0:
+; AVX512_32-NEXT:    pushl %eax
+; AVX512_32-NEXT:    vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX512_32-NEXT:    vmovss %xmm0, (%esp)
+; AVX512_32-NEXT:    flds (%esp)
+; AVX512_32-NEXT:    popl %eax
+; AVX512_32-NEXT:    retl
+;
+; AVX512_64-LABEL: u32_to_f:
+; AVX512_64:       # BB#0:
+; AVX512_64-NEXT:    vcvtusi2ssl %edi, %xmm0, %xmm0
+; AVX512_64-NEXT:    retq
+;
+; SSE2_32-LABEL: u32_to_f:
+; SSE2_32:       # BB#0:
+; SSE2_32-NEXT:    pushl %eax
+; SSE2_32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2_32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2_32-NEXT:    orpd %xmm0, %xmm1
+; SSE2_32-NEXT:    subsd %xmm0, %xmm1
+; SSE2_32-NEXT:    xorps %xmm0, %xmm0
+; SSE2_32-NEXT:    cvtsd2ss %xmm1, %xmm0
+; SSE2_32-NEXT:    movss %xmm0, (%esp)
+; SSE2_32-NEXT:    flds (%esp)
+; SSE2_32-NEXT:    popl %eax
+; SSE2_32-NEXT:    retl
+;
+; SSE2_64-LABEL: u32_to_f:
+; SSE2_64:       # BB#0:
+; SSE2_64-NEXT:    movl %edi, %eax
+; SSE2_64-NEXT:    cvtsi2ssq %rax, %xmm0
+; SSE2_64-NEXT:    retq
+;
+; X87-LABEL: u32_to_f:
+; X87:       # BB#0:
+; X87-NEXT:    pushl %ebp
+; X87-NEXT:    movl %esp, %ebp
+; X87-NEXT:    andl $-8, %esp
+; X87-NEXT:    subl $8, %esp
+; X87-NEXT:    movl 8(%ebp), %eax
+; X87-NEXT:    movl %eax, (%esp)
+; X87-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X87-NEXT:    fildll (%esp)
+; X87-NEXT:    movl %ebp, %esp
+; X87-NEXT:    popl %ebp
+; X87-NEXT:    retl
   %r = uitofp i32 %a to float
   ret float %r
 }
 
-; CHECK-LABEL: s32_to_f
-; AVX512_32: vcvtsi2ssl
-; AVX512_64: vcvtsi2ssl
-; SSE2_32: cvtsi2ssl
-; SSE2_64: cvtsi2ssl
-; X87: fildl
 define float @s32_to_f(i32 %a) nounwind {
+; AVX512_32-LABEL: s32_to_f:
+; AVX512_32:       # BB#0:
+; AVX512_32-NEXT:    pushl %eax
+; AVX512_32-NEXT:    vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0
+; AVX512_32-NEXT:    vmovss %xmm0, (%esp)
+; AVX512_32-NEXT:    flds (%esp)
+; AVX512_32-NEXT:    popl %eax
+; AVX512_32-NEXT:    retl
+;
+; AVX512_64-LABEL: s32_to_f:
+; AVX512_64:       # BB#0:
+; AVX512_64-NEXT:    vcvtsi2ssl %edi, %xmm0, %xmm0
+; AVX512_64-NEXT:    retq
+;
+; SSE2_32-LABEL: s32_to_f:
+; SSE2_32:       # BB#0:
+; SSE2_32-NEXT:    pushl %eax
+; SSE2_32-NEXT:    cvtsi2ssl {{[0-9]+}}(%esp), %xmm0
+; SSE2_32-NEXT:    movss %xmm0, (%esp)
+; SSE2_32-NEXT:    flds (%esp)
+; SSE2_32-NEXT:    popl %eax
+; SSE2_32-NEXT:    retl
+;
+; SSE2_64-LABEL: s32_to_f:
+; SSE2_64:       # BB#0:
+; SSE2_64-NEXT:    cvtsi2ssl %edi, %xmm0
+; SSE2_64-NEXT:    retq
+;
+; X87-LABEL: s32_to_f:
+; X87:       # BB#0:
+; X87-NEXT:    pushl %eax
+; X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X87-NEXT:    movl %eax, (%esp)
+; X87-NEXT:    fildl (%esp)
+; X87-NEXT:    popl %eax
+; X87-NEXT:    retl
   %r = sitofp i32 %a to float
   ret float %r
 }
 
-; CHECK-LABEL: u32_to_d
-; AVX512_32: vcvtusi2sdl
-; AVX512_64: vcvtusi2sdl
-; SSE2_32: subsd
-; SSE2_64: cvtsi2sdq
-; X87: fildll
 define double @u32_to_d(i32 %a) nounwind {
+; AVX512_32-LABEL: u32_to_d:
+; AVX512_32:       # BB#0:
+; AVX512_32-NEXT:    pushl %ebp
+; AVX512_32-NEXT:    movl %esp, %ebp
+; AVX512_32-NEXT:    andl $-8, %esp
+; AVX512_32-NEXT:    subl $8, %esp
+; AVX512_32-NEXT:    vcvtusi2sdl 8(%ebp), %xmm0, %xmm0
+; AVX512_32-NEXT:    vmovsd %xmm0, (%esp)
+; AVX512_32-NEXT:    fldl (%esp)
+; AVX512_32-NEXT:    movl %ebp, %esp
+; AVX512_32-NEXT:    popl %ebp
+; AVX512_32-NEXT:    retl
+;
+; AVX512_64-LABEL: u32_to_d:
+; AVX512_64:       # BB#0:
+; AVX512_64-NEXT:    vcvtusi2sdl %edi, %xmm0, %xmm0
+; AVX512_64-NEXT:    retq
+;
+; SSE2_32-LABEL: u32_to_d:
+; SSE2_32:       # BB#0:
+; SSE2_32-NEXT:    pushl %ebp
+; SSE2_32-NEXT:    movl %esp, %ebp
+; SSE2_32-NEXT:    andl $-8, %esp
+; SSE2_32-NEXT:    subl $8, %esp
+; SSE2_32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2_32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2_32-NEXT:    orpd %xmm0, %xmm1
+; SSE2_32-NEXT:    subsd %xmm0, %xmm1
+; SSE2_32-NEXT:    movsd %xmm1, (%esp)
+; SSE2_32-NEXT:    fldl (%esp)
+; SSE2_32-NEXT:    movl %ebp, %esp
+; SSE2_32-NEXT:    popl %ebp
+; SSE2_32-NEXT:    retl
+;
+; SSE2_64-LABEL: u32_to_d:
+; SSE2_64:       # BB#0:
+; SSE2_64-NEXT:    movl %edi, %eax
+; SSE2_64-NEXT:    cvtsi2sdq %rax, %xmm0
+; SSE2_64-NEXT:    retq
+;
+; X87-LABEL: u32_to_d:
+; X87:       # BB#0:
+; X87-NEXT:    pushl %ebp
+; X87-NEXT:    movl %esp, %ebp
+; X87-NEXT:    andl $-8, %esp
+; X87-NEXT:    subl $8, %esp
+; X87-NEXT:    movl 8(%ebp), %eax
+; X87-NEXT:    movl %eax, (%esp)
+; X87-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X87-NEXT:    fildll (%esp)
+; X87-NEXT:    movl %ebp, %esp
+; X87-NEXT:    popl %ebp
+; X87-NEXT:    retl
   %r = uitofp i32 %a to double
   ret double %r
 }
 
-; CHECK-LABEL: s32_to_d
-; AVX512_32: vcvtsi2sdl
-; AVX512_64: vcvtsi2sdl
-; SSE2_32: cvtsi2sdl
-; SSE2_64: cvtsi2sdl
-; X87: fildl
 define double @s32_to_d(i32 %a) nounwind {
+; AVX512_32-LABEL: s32_to_d:
+; AVX512_32:       # BB#0:
+; AVX512_32-NEXT:    pushl %ebp
+; AVX512_32-NEXT:    movl %esp, %ebp
+; AVX512_32-NEXT:    andl $-8, %esp
+; AVX512_32-NEXT:    subl $8, %esp
+; AVX512_32-NEXT:    vcvtsi2sdl 8(%ebp), %xmm0, %xmm0
+; AVX512_32-NEXT:    vmovsd %xmm0, (%esp)
+; AVX512_32-NEXT:    fldl (%esp)
+; AVX512_32-NEXT:    movl %ebp, %esp
+; AVX512_32-NEXT:    popl %ebp
+; AVX512_32-NEXT:    retl
+;
+; AVX512_64-LABEL: s32_to_d:
+; AVX512_64:       # BB#0:
+; AVX512_64-NEXT:    vcvtsi2sdl %edi, %xmm0, %xmm0
+; AVX512_64-NEXT:    retq
+;
+; SSE2_32-LABEL: s32_to_d:
+; SSE2_32:       # BB#0:
+; SSE2_32-NEXT:    pushl %ebp
+; SSE2_32-NEXT:    movl %esp, %ebp
+; SSE2_32-NEXT:    andl $-8, %esp
+; SSE2_32-NEXT:    subl $8, %esp
+; SSE2_32-NEXT:    cvtsi2sdl 8(%ebp), %xmm0
+; SSE2_32-NEXT:    movsd %xmm0, (%esp)
+; SSE2_32-NEXT:    fldl (%esp)
+; SSE2_32-NEXT:    movl %ebp, %esp
+; SSE2_32-NEXT:    popl %ebp
+; SSE2_32-NEXT:    retl
+;
+; SSE2_64-LABEL: s32_to_d:
+; SSE2_64:       # BB#0:
+; SSE2_64-NEXT:    cvtsi2sdl %edi, %xmm0
+; SSE2_64-NEXT:    retq
+;
+; X87-LABEL: s32_to_d:
+; X87:       # BB#0:
+; X87-NEXT:    pushl %eax
+; X87-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X87-NEXT:    movl %eax, (%esp)
+; X87-NEXT:    fildl (%esp)
+; X87-NEXT:    popl %eax
+; X87-NEXT:    retl
   %r = sitofp i32 %a to double
   ret double %r
 }
 
-; CHECK-LABEL: u32_to_x
-; AVX512_32: vsubsd
-; AVX512_64: vsubsd
-; SSE2_32: subsd
-; SSE2_64: fildll
-; X87: fildll
 define x86_fp80 @u32_to_x(i32 %a) nounwind {
+; AVX512_32-LABEL: u32_to_x:
+; AVX512_32:       # BB#0:
+; AVX512_32-NEXT:    pushl %ebp
+; AVX512_32-NEXT:    movl %esp, %ebp
+; AVX512_32-NEXT:    andl $-8, %esp
+; AVX512_32-NEXT:    subl $8, %esp
+; AVX512_32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512_32-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512_32-NEXT:    vorpd %xmm0, %xmm1, %xmm1
+; AVX512_32-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
+; AVX512_32-NEXT:    vmovsd %xmm0, (%esp)
+; AVX512_32-NEXT:    fldl (%esp)
+; AVX512_32-NEXT:    movl %ebp, %esp
+; AVX512_32-NEXT:    popl %ebp
+; AVX512_32-NEXT:    retl
+;
+; AVX512_64-LABEL: u32_to_x:
+; AVX512_64:       # BB#0:
+; AVX512_64-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512_64-NEXT:    vmovd %edi, %xmm1
+; AVX512_64-NEXT:    vpor %xmm0, %xmm1, %xmm1
+; AVX512_64-NEXT:    vsubsd %xmm0, %xmm1, %xmm0
+; AVX512_64-NEXT:    vmovsd %xmm0, -{{[0-9]+}}(%rsp)
+; AVX512_64-NEXT:    fldl -{{[0-9]+}}(%rsp)
+; AVX512_64-NEXT:    retq
+;
+; SSE2_32-LABEL: u32_to_x:
+; SSE2_32:       # BB#0:
+; SSE2_32-NEXT:    pushl %ebp
+; SSE2_32-NEXT:    movl %esp, %ebp
+; SSE2_32-NEXT:    andl $-8, %esp
+; SSE2_32-NEXT:    subl $8, %esp
+; SSE2_32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2_32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2_32-NEXT:    orpd %xmm0, %xmm1
+; SSE2_32-NEXT:    subsd %xmm0, %xmm1
+; SSE2_32-NEXT:    movsd %xmm1, (%esp)
+; SSE2_32-NEXT:    fldl (%esp)
+; SSE2_32-NEXT:    movl %ebp, %esp
+; SSE2_32-NEXT:    popl %ebp
+; SSE2_32-NEXT:    retl
+;
+; SSE2_64-LABEL: u32_to_x:
+; SSE2_64:       # BB#0:
+; SSE2_64-NEXT:    movl %edi, %eax
+; SSE2_64-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; SSE2_64-NEXT:    fildll -{{[0-9]+}}(%rsp)
+; SSE2_64-NEXT:    retq
+;
+; X87-LABEL: u32_to_x:
+; X87:       # BB#0:
+; X87-NEXT:    pushl %ebp
+; X87-NEXT:    movl %esp, %ebp
+; X87-NEXT:    andl $-8, %esp
+; X87-NEXT:    subl $8, %esp
+; X87-NEXT:    movl 8(%ebp), %eax
+; X87-NEXT:    movl %eax, (%esp)
+; X87-NEXT:    movl $0, {{[0-9]+}}(%esp)
+; X87-NEXT:    fildll (%esp)
+; X87-NEXT:    movl %ebp, %esp
+; X87-NEXT:    popl %ebp
+; X87-NEXT:    retl
   %r = uitofp i32 %a to x86_fp80
   ret x86_fp80 %r
 }
 
-; CHECK-LABEL: s32_to_x
-; CHECK: fildl
 define x86_fp80 @s32_to_x(i32 %a) nounwind {
+; CHECK32-LABEL: s32_to_x:
+; CHECK32:       # BB#0:
+; CHECK32-NEXT:    pushl %eax
+; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movl %eax, (%esp)
+; CHECK32-NEXT:    fildl (%esp)
+; CHECK32-NEXT:    popl %eax
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: s32_to_x:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movl %edi, -{{[0-9]+}}(%rsp)
+; CHECK64-NEXT:    fildl -{{[0-9]+}}(%rsp)
+; CHECK64-NEXT:    retq
   %r = sitofp i32 %a to x86_fp80
   ret x86_fp80 %r
 }
 
-; CHECK-LABEL: u64_to_f
-; AVX512_32: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512_32: vmovlps %xmm0, {{[0-9]+}}(%esp)
-; AVX512_32: fildll
-
-; AVX512_64: vcvtusi2ssq
-
-; SSE2_32: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE2_32: movlps %xmm0, {{[0-9]+}}(%esp)
-; SSE2_32: fildll
-
-; SSE2_64: cvtsi2ssq
-; X87: fildll
 define float @u64_to_f(i64 %a) nounwind {
+; AVX512_32-LABEL: u64_to_f:
+; AVX512_32:       # BB#0:
+; AVX512_32-NEXT:    pushl %ebp
+; AVX512_32-NEXT:    movl %esp, %ebp
+; AVX512_32-NEXT:    andl $-8, %esp
+; AVX512_32-NEXT:    subl $16, %esp
+; AVX512_32-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512_32-NEXT:    vmovlps %xmm0, {{[0-9]+}}(%esp)
+; AVX512_32-NEXT:    xorl %eax, %eax
+; AVX512_32-NEXT:    cmpl $0, 12(%ebp)
+; AVX512_32-NEXT:    setns %al
+; AVX512_32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512_32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
+; AVX512_32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX512_32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512_32-NEXT:    vmovss %xmm0, (%esp)
+; AVX512_32-NEXT:    flds (%esp)
+; AVX512_32-NEXT:    movl %ebp, %esp
+; AVX512_32-NEXT:    popl %ebp
+; AVX512_32-NEXT:    retl
+;
+; AVX512_64-LABEL: u64_to_f:
+; AVX512_64:       # BB#0:
+; AVX512_64-NEXT:    vcvtusi2ssq %rdi, %xmm0, %xmm0
+; AVX512_64-NEXT:    retq
+;
+; SSE2_32-LABEL: u64_to_f:
+; SSE2_32:       # BB#0:
+; SSE2_32-NEXT:    pushl %ebp
+; SSE2_32-NEXT:    movl %esp, %ebp
+; SSE2_32-NEXT:    andl $-8, %esp
+; SSE2_32-NEXT:    subl $16, %esp
+; SSE2_32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE2_32-NEXT:    movlps %xmm0, {{[0-9]+}}(%esp)
+; SSE2_32-NEXT:    xorl %eax, %eax
+; SSE2_32-NEXT:    cmpl $0, 12(%ebp)
+; SSE2_32-NEXT:    setns %al
+; SSE2_32-NEXT:    fildll {{[0-9]+}}(%esp)
+; SSE2_32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
+; SSE2_32-NEXT:    fstps {{[0-9]+}}(%esp)
+; SSE2_32-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE2_32-NEXT:    movss %xmm0, (%esp)
+; SSE2_32-NEXT:    flds (%esp)
+; SSE2_32-NEXT:    movl %ebp, %esp
+; SSE2_32-NEXT:    popl %ebp
+; SSE2_32-NEXT:    retl
+;
+; SSE2_64-LABEL: u64_to_f:
+; SSE2_64:       # BB#0:
+; SSE2_64-NEXT:    testq %rdi, %rdi
+; SSE2_64-NEXT:    js .LBB6_1
+; SSE2_64-NEXT:  # BB#2:
+; SSE2_64-NEXT:    cvtsi2ssq %rdi, %xmm0
+; SSE2_64-NEXT:    retq
+; SSE2_64-NEXT:  .LBB6_1:
+; SSE2_64-NEXT:    movq %rdi, %rax
+; SSE2_64-NEXT:    shrq %rax
+; SSE2_64-NEXT:    andl $1, %edi
+; SSE2_64-NEXT:    orq %rax, %rdi
+; SSE2_64-NEXT:    cvtsi2ssq %rdi, %xmm0
+; SSE2_64-NEXT:    addss %xmm0, %xmm0
+; SSE2_64-NEXT:    retq
+;
+; X87-LABEL: u64_to_f:
+; X87:       # BB#0:
+; X87-NEXT:    pushl %ebp
+; X87-NEXT:    movl %esp, %ebp
+; X87-NEXT:    andl $-8, %esp
+; X87-NEXT:    subl $16, %esp
+; X87-NEXT:    movl 8(%ebp), %eax
+; X87-NEXT:    movl 12(%ebp), %ecx
+; X87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X87-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X87-NEXT:    xorl %eax, %eax
+; X87-NEXT:    testl %ecx, %ecx
+; X87-NEXT:    setns %al
+; X87-NEXT:    fildll {{[0-9]+}}(%esp)
+; X87-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
+; X87-NEXT:    fstps {{[0-9]+}}(%esp)
+; X87-NEXT:    flds {{[0-9]+}}(%esp)
+; X87-NEXT:    movl %ebp, %esp
+; X87-NEXT:    popl %ebp
+; X87-NEXT:    retl
   %r = uitofp i64 %a to float
   ret float %r
 }
 
-; CHECK-LABEL: s64_to_f
-; AVX512_32: fildll
-; AVX512_64: vcvtsi2ssq
-; SSE2_32: fildll
-; SSE2_64: cvtsi2ssq
-; X87: fildll
 define float @s64_to_f(i64 %a) nounwind {
+; AVX512_32-LABEL: s64_to_f:
+; AVX512_32:       # BB#0:
+; AVX512_32-NEXT:    pushl %eax
+; AVX512_32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512_32-NEXT:    fstps (%esp)
+; AVX512_32-NEXT:    flds (%esp)
+; AVX512_32-NEXT:    popl %eax
+; AVX512_32-NEXT:    retl
+;
+; AVX512_64-LABEL: s64_to_f:
+; AVX512_64:       # BB#0:
+; AVX512_64-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0
+; AVX512_64-NEXT:    retq
+;
+; SSE2_32-LABEL: s64_to_f:
+; SSE2_32:       # BB#0:
+; SSE2_32-NEXT:    pushl %eax
+; SSE2_32-NEXT:    fildll {{[0-9]+}}(%esp)
+; SSE2_32-NEXT:    fstps (%esp)
+; SSE2_32-NEXT:    flds (%esp)
+; SSE2_32-NEXT:    popl %eax
+; SSE2_32-NEXT:    retl
+;
+; SSE2_64-LABEL: s64_to_f:
+; SSE2_64:       # BB#0:
+; SSE2_64-NEXT:    cvtsi2ssq %rdi, %xmm0
+; SSE2_64-NEXT:    retq
+;
+; X87-LABEL: s64_to_f:
+; X87:       # BB#0:
+; X87-NEXT:    fildll {{[0-9]+}}(%esp)
+; X87-NEXT:    retl
   %r = sitofp i64 %a to float
   ret float %r
 }
 
-; CHECK-LABEL: s64_to_f_2
-; SSE2_32:    movd %ecx, %xmm0
-; SSE2_32:    movd %eax, %xmm1
-; SSE2_32:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; SSE2_32:    movq %xmm1, {{[0-9]+}}(%esp)
-; SSE2_32:    fildll {{[0-9]+}}(%esp)
-
-; AVX512_32:    vmovd %eax, %xmm0
-; AVX512_32:    vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX512_32:    vmovq %xmm0, {{[0-9]+}}(%esp)
-; AVX512_32:    fildll {{[0-9]+}}(%esp)
-
 define float @s64_to_f_2(i64 %a) nounwind {
+; AVX512_32-LABEL: s64_to_f_2:
+; AVX512_32:       # BB#0:
+; AVX512_32-NEXT:    pushl %ebp
+; AVX512_32-NEXT:    movl %esp, %ebp
+; AVX512_32-NEXT:    andl $-8, %esp
+; AVX512_32-NEXT:    subl $16, %esp
+; AVX512_32-NEXT:    movl 8(%ebp), %eax
+; AVX512_32-NEXT:    movl 12(%ebp), %ecx
+; AVX512_32-NEXT:    addl $5, %eax
+; AVX512_32-NEXT:    adcl $0, %ecx
+; AVX512_32-NEXT:    vmovd %eax, %xmm0
+; AVX512_32-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX512_32-NEXT:    vmovq %xmm0, {{[0-9]+}}(%esp)
+; AVX512_32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512_32-NEXT:    fstps {{[0-9]+}}(%esp)
+; AVX512_32-NEXT:    flds {{[0-9]+}}(%esp)
+; AVX512_32-NEXT:    movl %ebp, %esp
+; AVX512_32-NEXT:    popl %ebp
+; AVX512_32-NEXT:    retl
+;
+; AVX512_64-LABEL: s64_to_f_2:
+; AVX512_64:       # BB#0:
+; AVX512_64-NEXT:    addq $5, %rdi
+; AVX512_64-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0
+; AVX512_64-NEXT:    retq
+;
+; SSE2_32-LABEL: s64_to_f_2:
+; SSE2_32:       # BB#0:
+; SSE2_32-NEXT:    pushl %ebp
+; SSE2_32-NEXT:    movl %esp, %ebp
+; SSE2_32-NEXT:    andl $-8, %esp
+; SSE2_32-NEXT:    subl $16, %esp
+; SSE2_32-NEXT:    movl 8(%ebp), %eax
+; SSE2_32-NEXT:    movl 12(%ebp), %ecx
+; SSE2_32-NEXT:    addl $5, %eax
+; SSE2_32-NEXT:    adcl $0, %ecx
+; SSE2_32-NEXT:    movd %ecx, %xmm0
+; SSE2_32-NEXT:    movd %eax, %xmm1
+; SSE2_32-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2_32-NEXT:    movq %xmm1, {{[0-9]+}}(%esp)
+; SSE2_32-NEXT:    fildll {{[0-9]+}}(%esp)
+; SSE2_32-NEXT:    fstps {{[0-9]+}}(%esp)
+; SSE2_32-NEXT:    flds {{[0-9]+}}(%esp)
+; SSE2_32-NEXT:    movl %ebp, %esp
+; SSE2_32-NEXT:    popl %ebp
+; SSE2_32-NEXT:    retl
+;
+; SSE2_64-LABEL: s64_to_f_2:
+; SSE2_64:       # BB#0:
+; SSE2_64-NEXT:    addq $5, %rdi
+; SSE2_64-NEXT:    cvtsi2ssq %rdi, %xmm0
+; SSE2_64-NEXT:    retq
+;
+; X87-LABEL: s64_to_f_2:
+; X87:       # BB#0:
+; X87-NEXT:    pushl %ebp
+; X87-NEXT:    movl %esp, %ebp
+; X87-NEXT:    andl $-8, %esp
+; X87-NEXT:    subl $8, %esp
+; X87-NEXT:    movl 8(%ebp), %eax
+; X87-NEXT:    movl 12(%ebp), %ecx
+; X87-NEXT:    addl $5, %eax
+; X87-NEXT:    adcl $0, %ecx
+; X87-NEXT:    movl %eax, (%esp)
+; X87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X87-NEXT:    fildll (%esp)
+; X87-NEXT:    movl %ebp, %esp
+; X87-NEXT:    popl %ebp
+; X87-NEXT:    retl
   %a1 = add i64 %a, 5
   %r = sitofp i64 %a1 to float
   ret float %r
 }
 
-; CHECK-LABEL: u64_to_d
-; AVX512_32: vpunpckldq
-; AVX512_64: vcvtusi2sdq
-; SSE2_32: punpckldq
-; SSE2_64: punpckldq
-; X87: fildll
 define double @u64_to_d(i64 %a) nounwind {
+; AVX512_32-LABEL: u64_to_d:
+; AVX512_32:       # BB#0:
+; AVX512_32-NEXT:    pushl %ebp
+; AVX512_32-NEXT:    movl %esp, %ebp
+; AVX512_32-NEXT:    andl $-8, %esp
+; AVX512_32-NEXT:    subl $8, %esp
+; AVX512_32-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
+; AVX512_32-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512_32-NEXT:    vsubpd {{\.LCPI.*}}, %xmm0, %xmm0
+; AVX512_32-NEXT:    vhaddpd %xmm0, %xmm0, %xmm0
+; AVX512_32-NEXT:    vmovlpd %xmm0, (%esp)
+; AVX512_32-NEXT:    fldl (%esp)
+; AVX512_32-NEXT:    movl %ebp, %esp
+; AVX512_32-NEXT:    popl %ebp
+; AVX512_32-NEXT:    retl
+;
+; AVX512_64-LABEL: u64_to_d:
+; AVX512_64:       # BB#0:
+; AVX512_64-NEXT:    vcvtusi2sdq %rdi, %xmm0, %xmm0
+; AVX512_64-NEXT:    retq
+;
+; SSE2_32-LABEL: u64_to_d:
+; SSE2_32:       # BB#0:
+; SSE2_32-NEXT:    pushl %ebp
+; SSE2_32-NEXT:    movl %esp, %ebp
+; SSE2_32-NEXT:    andl $-8, %esp
+; SSE2_32-NEXT:    subl $8, %esp
+; SSE2_32-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
+; SSE2_32-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; SSE2_32-NEXT:    subpd {{\.LCPI.*}}, %xmm0
+; SSE2_32-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; SSE2_32-NEXT:    addpd %xmm0, %xmm1
+; SSE2_32-NEXT:    movlpd %xmm1, (%esp)
+; SSE2_32-NEXT:    fldl (%esp)
+; SSE2_32-NEXT:    movl %ebp, %esp
+; SSE2_32-NEXT:    popl %ebp
+; SSE2_32-NEXT:    retl
+;
+; SSE2_64-LABEL: u64_to_d:
+; SSE2_64:       # BB#0:
+; SSE2_64-NEXT:    movd %rdi, %xmm1
+; SSE2_64-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; SSE2_64-NEXT:    subpd {{.*}}(%rip), %xmm1
+; SSE2_64-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE2_64-NEXT:    addpd %xmm1, %xmm0
+; SSE2_64-NEXT:    retq
+;
+; X87-LABEL: u64_to_d:
+; X87:       # BB#0:
+; X87-NEXT:    pushl %ebp
+; X87-NEXT:    movl %esp, %ebp
+; X87-NEXT:    andl $-8, %esp
+; X87-NEXT:    subl $16, %esp
+; X87-NEXT:    movl 8(%ebp), %eax
+; X87-NEXT:    movl 12(%ebp), %ecx
+; X87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X87-NEXT:    movl %eax, (%esp)
+; X87-NEXT:    xorl %eax, %eax
+; X87-NEXT:    testl %ecx, %ecx
+; X87-NEXT:    setns %al
+; X87-NEXT:    fildll (%esp)
+; X87-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
+; X87-NEXT:    fstpl {{[0-9]+}}(%esp)
+; X87-NEXT:    fldl {{[0-9]+}}(%esp)
+; X87-NEXT:    movl %ebp, %esp
+; X87-NEXT:    popl %ebp
+; X87-NEXT:    retl
   %r = uitofp i64 %a to double
   ret double %r
 }
 
-; CHECK-LABEL: s64_to_d
-; AVX512_32: fildll
-; AVX512_64: vcvtsi2sdq
-; SSE2_32: fildll
-; SSE2_64: cvtsi2sdq
-; X87: fildll
 define double @s64_to_d(i64 %a) nounwind {
+; AVX512_32-LABEL: s64_to_d:
+; AVX512_32:       # BB#0:
+; AVX512_32-NEXT:    pushl %ebp
+; AVX512_32-NEXT:    movl %esp, %ebp
+; AVX512_32-NEXT:    andl $-8, %esp
+; AVX512_32-NEXT:    subl $8, %esp
+; AVX512_32-NEXT:    fildll 8(%ebp)
+; AVX512_32-NEXT:    fstpl (%esp)
+; AVX512_32-NEXT:    fldl (%esp)
+; AVX512_32-NEXT:    movl %ebp, %esp
+; AVX512_32-NEXT:    popl %ebp
+; AVX512_32-NEXT:    retl
+;
+; AVX512_64-LABEL: s64_to_d:
+; AVX512_64:       # BB#0:
+; AVX512_64-NEXT:    vcvtsi2sdq %rdi, %xmm0, %xmm0
+; AVX512_64-NEXT:    retq
+;
+; SSE2_32-LABEL: s64_to_d:
+; SSE2_32:       # BB#0:
+; SSE2_32-NEXT:    pushl %ebp
+; SSE2_32-NEXT:    movl %esp, %ebp
+; SSE2_32-NEXT:    andl $-8, %esp
+; SSE2_32-NEXT:    subl $8, %esp
+; SSE2_32-NEXT:    fildll 8(%ebp)
+; SSE2_32-NEXT:    fstpl (%esp)
+; SSE2_32-NEXT:    fldl (%esp)
+; SSE2_32-NEXT:    movl %ebp, %esp
+; SSE2_32-NEXT:    popl %ebp
+; SSE2_32-NEXT:    retl
+;
+; SSE2_64-LABEL: s64_to_d:
+; SSE2_64:       # BB#0:
+; SSE2_64-NEXT:    cvtsi2sdq %rdi, %xmm0
+; SSE2_64-NEXT:    retq
+;
+; X87-LABEL: s64_to_d:
+; X87:       # BB#0:
+; X87-NEXT:    fildll {{[0-9]+}}(%esp)
+; X87-NEXT:    retl
   %r = sitofp i64 %a to double
   ret double %r
 }
 
-; CHECK-LABEL: s64_to_d_2
-; SSE2_32: movd %ecx, %xmm0
-; SSE2_32: movd %eax, %xmm1
-; SSE2_32: punpckldq %xmm0, %xmm1
-; SSE2_32: movq %xmm1, {{[0-9]+}}(%esp)
-; SSE2_32: fildll
-
-; AVX512_32:    vmovd %eax, %xmm0
-; AVX512_32:    vpinsrd $1, %ecx, %xmm0, %xmm0
-; AVX512_32:    vmovq %xmm0, {{[0-9]+}}(%esp)
-; AVX512_32: fildll
-
 define double @s64_to_d_2(i64 %a) nounwind {
+; AVX512_32-LABEL: s64_to_d_2:
+; AVX512_32:       # BB#0:
+; AVX512_32-NEXT:    pushl %ebp
+; AVX512_32-NEXT:    movl %esp, %ebp
+; AVX512_32-NEXT:    andl $-8, %esp
+; AVX512_32-NEXT:    subl $16, %esp
+; AVX512_32-NEXT:    movl 8(%ebp), %eax
+; AVX512_32-NEXT:    movl 12(%ebp), %ecx
+; AVX512_32-NEXT:    addl $5, %eax
+; AVX512_32-NEXT:    adcl $0, %ecx
+; AVX512_32-NEXT:    vmovd %eax, %xmm0
+; AVX512_32-NEXT:    vpinsrd $1, %ecx, %xmm0, %xmm0
+; AVX512_32-NEXT:    vmovq %xmm0, {{[0-9]+}}(%esp)
+; AVX512_32-NEXT:    fildll {{[0-9]+}}(%esp)
+; AVX512_32-NEXT:    fstpl (%esp)
+; AVX512_32-NEXT:    fldl (%esp)
+; AVX512_32-NEXT:    movl %ebp, %esp
+; AVX512_32-NEXT:    popl %ebp
+; AVX512_32-NEXT:    retl
+;
+; AVX512_64-LABEL: s64_to_d_2:
+; AVX512_64:       # BB#0:
+; AVX512_64-NEXT:    addq $5, %rdi
+; AVX512_64-NEXT:    vcvtsi2sdq %rdi, %xmm0, %xmm0
+; AVX512_64-NEXT:    retq
+;
+; SSE2_32-LABEL: s64_to_d_2:
+; SSE2_32:       # BB#0:
+; SSE2_32-NEXT:    pushl %ebp
+; SSE2_32-NEXT:    movl %esp, %ebp
+; SSE2_32-NEXT:    andl $-8, %esp
+; SSE2_32-NEXT:    subl $16, %esp
+; SSE2_32-NEXT:    movl 8(%ebp), %eax
+; SSE2_32-NEXT:    movl 12(%ebp), %ecx
+; SSE2_32-NEXT:    addl $5, %eax
+; SSE2_32-NEXT:    adcl $0, %ecx
+; SSE2_32-NEXT:    movd %ecx, %xmm0
+; SSE2_32-NEXT:    movd %eax, %xmm1
+; SSE2_32-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE2_32-NEXT:    movq %xmm1, {{[0-9]+}}(%esp)
+; SSE2_32-NEXT:    fildll {{[0-9]+}}(%esp)
+; SSE2_32-NEXT:    fstpl (%esp)
+; SSE2_32-NEXT:    fldl (%esp)
+; SSE2_32-NEXT:    movl %ebp, %esp
+; SSE2_32-NEXT:    popl %ebp
+; SSE2_32-NEXT:    retl
+;
+; SSE2_64-LABEL: s64_to_d_2:
+; SSE2_64:       # BB#0:
+; SSE2_64-NEXT:    addq $5, %rdi
+; SSE2_64-NEXT:    cvtsi2sdq %rdi, %xmm0
+; SSE2_64-NEXT:    retq
+;
+; X87-LABEL: s64_to_d_2:
+; X87:       # BB#0:
+; X87-NEXT:    pushl %ebp
+; X87-NEXT:    movl %esp, %ebp
+; X87-NEXT:    andl $-8, %esp
+; X87-NEXT:    subl $8, %esp
+; X87-NEXT:    movl 8(%ebp), %eax
+; X87-NEXT:    movl 12(%ebp), %ecx
+; X87-NEXT:    addl $5, %eax
+; X87-NEXT:    adcl $0, %ecx
+; X87-NEXT:    movl %eax, (%esp)
+; X87-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X87-NEXT:    fildll (%esp)
+; X87-NEXT:    movl %ebp, %esp
+; X87-NEXT:    popl %ebp
+; X87-NEXT:    retl
   %b = add i64 %a, 5
   %f = sitofp i64 %b to double
   ret double %f
 }
 
-; CHECK-LABEL: u64_to_x
-; CHECK: fildll
 define x86_fp80 @u64_to_x(i64 %a) nounwind {
+; CHECK32-LABEL: u64_to_x:
+; CHECK32:       # BB#0:
+; CHECK32-NEXT:    pushl %ebp
+; CHECK32-NEXT:    movl %esp, %ebp
+; CHECK32-NEXT:    andl $-8, %esp
+; CHECK32-NEXT:    subl $8, %esp
+; CHECK32-NEXT:    movl 8(%ebp), %eax
+; CHECK32-NEXT:    movl 12(%ebp), %ecx
+; CHECK32-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    movl %eax, (%esp)
+; CHECK32-NEXT:    xorl %eax, %eax
+; CHECK32-NEXT:    testl %ecx, %ecx
+; CHECK32-NEXT:    setns %al
+; CHECK32-NEXT:    fildll (%esp)
+; CHECK32-NEXT:    fadds {{\.LCPI.*}}(,%eax,4)
+; CHECK32-NEXT:    movl %ebp, %esp
+; CHECK32-NEXT:    popl %ebp
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: u64_to_x:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; CHECK64-NEXT:    xorl %eax, %eax
+; CHECK64-NEXT:    testq %rdi, %rdi
+; CHECK64-NEXT:    setns %al
+; CHECK64-NEXT:    fildll -{{[0-9]+}}(%rsp)
+; CHECK64-NEXT:    fadds {{\.LCPI.*}}(,%rax,4)
+; CHECK64-NEXT:    retq
   %r = uitofp i64 %a to x86_fp80
   ret x86_fp80 %r
 }
 
-; CHECK-LABEL: s64_to_x
-; CHECK: fildll
 define x86_fp80 @s64_to_x(i64 %a) nounwind {
+; CHECK32-LABEL: s64_to_x:
+; CHECK32:       # BB#0:
+; CHECK32-NEXT:    fildll {{[0-9]+}}(%esp)
+; CHECK32-NEXT:    retl
+;
+; CHECK64-LABEL: s64_to_x:
+; CHECK64:       # BB#0:
+; CHECK64-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; CHECK64-NEXT:    fildll -{{[0-9]+}}(%rsp)
+; CHECK64-NEXT:    retq
   %r = sitofp i64 %a to x86_fp80
   ret x86_fp80 %r
 }
diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll
index e6cce778b218341824ddaae3e73f37a4e317676e..ce42d0d643e8b664b226c89950836e5af5bc3df6 100644
--- a/test/CodeGen/X86/select.ll
+++ b/test/CodeGen/X86/select.ll
@@ -15,6 +15,20 @@ define i32 @test1(%0* %p, %0* %q, i1 %r) nounwind {
 ; CHECK-NEXT:    cmovneq %rdi, %rsi
 ; CHECK-NEXT:    movl (%rsi), %eax
 ; CHECK-NEXT:    retq
+;
+; MCU-LABEL: test1:
+; MCU:       # BB#0:
+; MCU-NEXT:    testb $1, %cl
+; MCU-NEXT:    jne .LBB0_1
+; MCU-NEXT:  # BB#2:
+; MCU-NEXT:    addl $8, %edx
+; MCU-NEXT:    movl %edx, %eax
+; MCU-NEXT:    movl (%eax), %eax
+; MCU-NEXT:    retl
+; MCU-NEXT:  .LBB0_1:
+; MCU-NEXT:    addl $8, %eax
+; MCU-NEXT:    movl (%eax), %eax
+; MCU-NEXT:    retl
   %t0 = load %0, %0* %p
   %t1 = load %0, %0* %q
   %t4 = select i1 %r, %0 %t0, %0 %t1
@@ -41,6 +55,26 @@ define i32 @test2() nounwind {
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    retq
 ; CHECK-NEXT:  LBB1_1: ## %bb90
+;
+; MCU-LABEL: test2:
+; MCU:       # BB#0: # %entry
+; MCU-NEXT:    calll return_false
+; MCU-NEXT:    testb $1, %al
+; MCU-NEXT:    jne .LBB1_1
+; MCU-NEXT:  # BB#2: # %entry
+; MCU-NEXT:    movw $-480, %ax # imm = 0xFE20
+; MCU-NEXT:    jmp .LBB1_3
+; MCU-NEXT:  .LBB1_1:
+; MCU-NEXT:    xorl %eax, %eax
+; MCU-NEXT:  .LBB1_3: # %entry
+; MCU-NEXT:    cwtl
+; MCU-NEXT:    shll $3, %eax
+; MCU-NEXT:    cmpl $32768, %eax # imm = 0x8000
+; MCU-NEXT:    jge .LBB1_4
+; MCU-NEXT:  # BB#5: # %bb91
+; MCU-NEXT:    xorl %eax, %eax
+; MCU-NEXT:    retl
+; MCU-NEXT:  .LBB1_4: # %bb90
 entry:
   %tmp73 = tail call i1 @return_false()
   %g.0 = select i1 %tmp73, i16 0, i16 -480
@@ -66,6 +100,14 @@ define float @test3(i32 %x) nounwind readnone {
 ; CHECK-NEXT:    leaq {{.*}}(%rip), %rcx
 ; CHECK-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT:    retq
+;
+; MCU-LABEL: test3:
+; MCU:       # BB#0: # %entry
+; MCU-NEXT:    xorl %ecx, %ecx
+; MCU-NEXT:    testl %eax, %eax
+; MCU-NEXT:    sete %cl
+; MCU-NEXT:    flds {{\.LCPI.*}}(,%ecx,4)
+; MCU-NEXT:    retl
 entry:
   %0 = icmp eq i32 %x, 0
   %iftmp.0.0 = select i1 %0, float 4.200000e+01, float 2.300000e+01
@@ -81,6 +123,20 @@ define signext i8 @test4(i8* nocapture %P, double %F) nounwind readonly {
 ; CHECK-NEXT:    seta %al
 ; CHECK-NEXT:    movsbl (%rdi,%rax,4), %eax
 ; CHECK-NEXT:    retq
+;
+; MCU-LABEL: test4:
+; MCU:       # BB#0: # %entry
+; MCU-NEXT:    movl %eax, %ecx
+; MCU-NEXT:    fldl {{[0-9]+}}(%esp)
+; MCU-NEXT:    flds {{\.LCPI.*}}
+; MCU-NEXT:    fucompp
+; MCU-NEXT:    fnstsw %ax
+; MCU-NEXT:    xorl %edx, %edx
+; MCU-NEXT:    # kill: %AH<def> %AH<kill> %AX<kill>
+; MCU-NEXT:    sahf
+; MCU-NEXT:    seta %dl
+; MCU-NEXT:    movb (%ecx,%edx,4), %al
+; MCU-NEXT:    retl
 entry:
   %0 = fcmp olt double %F, 4.200000e+01
   %iftmp.0.0 = select i1 %0, i32 4, i32 0
@@ -101,6 +157,25 @@ define void @test5(i1 %c, <2 x i16> %a, <2 x i16> %b, <2 x i16>* %p) nounwind {
 ; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; CHECK-NEXT:    movd %xmm0, (%rsi)
 ; CHECK-NEXT:    retq
+;
+; MCU-LABEL: test5:
+; MCU:       # BB#0:
+; MCU-NEXT:    pushl %esi
+; MCU-NEXT:    andb $1, %al
+; MCU-NEXT:    jne .LBB4_2
+; MCU-NEXT:  # BB#1:
+; MCU-NEXT:    movw {{[0-9]+}}(%esp), %dx
+; MCU-NEXT:  .LBB4_2:
+; MCU-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; MCU-NEXT:    testb %al, %al
+; MCU-NEXT:    jne .LBB4_4
+; MCU-NEXT:  # BB#3:
+; MCU-NEXT:    movw {{[0-9]+}}(%esp), %cx
+; MCU-NEXT:  .LBB4_4:
+; MCU-NEXT:    movw %dx, (%esi)
+; MCU-NEXT:    movw %cx, 2(%esi)
+; MCU-NEXT:    popl %esi
+; MCU-NEXT:    retl
   %x = select i1 %c, <2 x i16> %a, <2 x i16> %b
   store <2 x i16> %x, <2 x i16>* %p
   ret void
@@ -121,6 +196,57 @@ define void @test6(i32 %C, <4 x float>* %A, <4 x float>* %B) nounwind {
 ; CHECK-NEXT:    mulps %xmm0, %xmm0
 ; CHECK-NEXT:    movaps %xmm0, (%rsi)
 ; CHECK-NEXT:    retq
+;
+; MCU-LABEL: test6:
+; MCU:       # BB#0:
+; MCU-NEXT:    pushl %eax
+; MCU-NEXT:    flds 12(%edx)
+; MCU-NEXT:    fstps (%esp) # 4-byte Folded Spill
+; MCU-NEXT:    flds 8(%edx)
+; MCU-NEXT:    flds 4(%edx)
+; MCU-NEXT:    flds (%ecx)
+; MCU-NEXT:    flds 4(%ecx)
+; MCU-NEXT:    flds 8(%ecx)
+; MCU-NEXT:    flds 12(%ecx)
+; MCU-NEXT:    fmul %st(0), %st(0)
+; MCU-NEXT:    fxch %st(1)
+; MCU-NEXT:    fmul %st(0), %st(0)
+; MCU-NEXT:    fxch %st(2)
+; MCU-NEXT:    fmul %st(0), %st(0)
+; MCU-NEXT:    fxch %st(3)
+; MCU-NEXT:    fmul %st(0), %st(0)
+; MCU-NEXT:    testl %eax, %eax
+; MCU-NEXT:    flds (%edx)
+; MCU-NEXT:    je .LBB5_2
+; MCU-NEXT:  # BB#1:
+; MCU-NEXT:    fstp %st(1)
+; MCU-NEXT:    fstp %st(3)
+; MCU-NEXT:    fstp %st(1)
+; MCU-NEXT:    fstp %st(0)
+; MCU-NEXT:    flds (%esp) # 4-byte Folded Reload
+; MCU-NEXT:    fldz
+; MCU-NEXT:    fldz
+; MCU-NEXT:    fldz
+; MCU-NEXT:    fxch %st(1)
+; MCU-NEXT:    fxch %st(6)
+; MCU-NEXT:    fxch %st(1)
+; MCU-NEXT:    fxch %st(5)
+; MCU-NEXT:    fxch %st(4)
+; MCU-NEXT:    fxch %st(1)
+; MCU-NEXT:    fxch %st(3)
+; MCU-NEXT:    fxch %st(2)
+; MCU-NEXT:  .LBB5_2:
+; MCU-NEXT:    fstp %st(0)
+; MCU-NEXT:    fstp %st(5)
+; MCU-NEXT:    fstp %st(3)
+; MCU-NEXT:    fxch %st(2)
+; MCU-NEXT:    fstps 12(%edx)
+; MCU-NEXT:    fxch %st(1)
+; MCU-NEXT:    fstps 8(%edx)
+; MCU-NEXT:    fstps 4(%edx)
+; MCU-NEXT:    fstps (%edx)
+; MCU-NEXT:    popl %eax
+; MCU-NEXT:    retl
   %tmp = load <4 x float>, <4 x float>* %A
   %tmp3 = load <4 x float>, <4 x float>* %B
   %tmp9 = fmul <4 x float> %tmp3, %tmp3
@@ -141,6 +267,15 @@ define x86_fp80 @test7(i32 %tmp8) nounwind {
 ; CHECK-NEXT:    leaq {{.*}}(%rip), %rcx
 ; CHECK-NEXT:    fldt (%rax,%rcx)
 ; CHECK-NEXT:    retq
+;
+; MCU-LABEL: test7:
+; MCU:       # BB#0:
+; MCU-NEXT:    xorl %ecx, %ecx
+; MCU-NEXT:    testl %eax, %eax
+; MCU-NEXT:    setns %cl
+; MCU-NEXT:    shll $4, %ecx
+; MCU-NEXT:    fldt {{\.LCPI.*}}(%ecx)
+; MCU-NEXT:    retl
   %tmp9 = icmp sgt i32 %tmp8, -1
   %retval = select i1 %tmp9, x86_fp80 0xK4005B400000000000000, x86_fp80 0xK40078700000000000000
   ret x86_fp80 %retval
@@ -219,6 +354,80 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2)
 ; ATOM-NEXT:    movq %xmm0, 16(%rsi)
 ; ATOM-NEXT:    movdqa %xmm1, (%rsi)
 ; ATOM-NEXT:    retq
+;
+; MCU-LABEL: test8:
+; MCU:       # BB#0:
+; MCU-NEXT:    pushl %ebp
+; MCU-NEXT:    pushl %ebx
+; MCU-NEXT:    pushl %edi
+; MCU-NEXT:    pushl %esi
+; MCU-NEXT:    andb $1, %al
+; MCU-NEXT:    jne .LBB7_1
+; MCU-NEXT:  # BB#2:
+; MCU-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; MCU-NEXT:    movl (%ecx), %ecx
+; MCU-NEXT:    je .LBB7_5
+; MCU-NEXT:  .LBB7_4:
+; MCU-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; MCU-NEXT:    movl (%esi), %esi
+; MCU-NEXT:    je .LBB7_8
+; MCU-NEXT:  .LBB7_7:
+; MCU-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; MCU-NEXT:    movl (%edi), %edi
+; MCU-NEXT:    je .LBB7_11
+; MCU-NEXT:  .LBB7_10:
+; MCU-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; MCU-NEXT:    movl (%ebx), %ebx
+; MCU-NEXT:    je .LBB7_14
+; MCU-NEXT:  .LBB7_13:
+; MCU-NEXT:    leal {{[0-9]+}}(%esp), %ebp
+; MCU-NEXT:    jmp .LBB7_15
+; MCU-NEXT:  .LBB7_1:
+; MCU-NEXT:    leal {{[0-9]+}}(%esp), %ecx
+; MCU-NEXT:    movl (%ecx), %ecx
+; MCU-NEXT:    jne .LBB7_4
+; MCU-NEXT:  .LBB7_5:
+; MCU-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; MCU-NEXT:    movl (%esi), %esi
+; MCU-NEXT:    jne .LBB7_7
+; MCU-NEXT:  .LBB7_8:
+; MCU-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; MCU-NEXT:    movl (%edi), %edi
+; MCU-NEXT:    jne .LBB7_10
+; MCU-NEXT:  .LBB7_11:
+; MCU-NEXT:    leal {{[0-9]+}}(%esp), %ebx
+; MCU-NEXT:    movl (%ebx), %ebx
+; MCU-NEXT:    jne .LBB7_13
+; MCU-NEXT:  .LBB7_14:
+; MCU-NEXT:    leal {{[0-9]+}}(%esp), %ebp
+; MCU-NEXT:  .LBB7_15:
+; MCU-NEXT:    movl (%ebp), %ebp
+; MCU-NEXT:    testb %al, %al
+; MCU-NEXT:    jne .LBB7_16
+; MCU-NEXT:  # BB#17:
+; MCU-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; MCU-NEXT:    jmp .LBB7_18
+; MCU-NEXT:  .LBB7_16:
+; MCU-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; MCU-NEXT:  .LBB7_18:
+; MCU-NEXT:    movl (%eax), %eax
+; MCU-NEXT:    decl %eax
+; MCU-NEXT:    decl %ebp
+; MCU-NEXT:    decl %ebx
+; MCU-NEXT:    decl %edi
+; MCU-NEXT:    decl %esi
+; MCU-NEXT:    decl %ecx
+; MCU-NEXT:    movl %ecx, 20(%edx)
+; MCU-NEXT:    movl %esi, 16(%edx)
+; MCU-NEXT:    movl %edi, 12(%edx)
+; MCU-NEXT:    movl %ebx, 8(%edx)
+; MCU-NEXT:    movl %ebp, 4(%edx)
+; MCU-NEXT:    movl %eax, (%edx)
+; MCU-NEXT:    popl %esi
+; MCU-NEXT:    popl %edi
+; MCU-NEXT:    popl %ebx
+; MCU-NEXT:    popl %ebp
+; MCU-NEXT:    retl
   %x = select i1 %c, <6 x i32> %src1, <6 x i32> %src2
   %val = sub <6 x i32> %x, < i32 1, i32 1, i32 1, i32 1, i32 1, i32 1 >
   store <6 x i32> %val, <6 x i32>* %dst.addr
@@ -244,6 +453,19 @@ define i64 @test9(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    retq
+;
+; MCU-LABEL: test9:
+; MCU:       # BB#0:
+; MCU-NEXT:    orl %edx, %eax
+; MCU-NEXT:    jne .LBB8_1
+; MCU-NEXT:  # BB#2:
+; MCU-NEXT:    movl $-1, %eax
+; MCU-NEXT:    movl $-1, %edx
+; MCU-NEXT:    retl
+; MCU-NEXT:  .LBB8_1:
+; MCU-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MCU-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; MCU-NEXT:    retl
   %cmp = icmp ne i64 %x, 0
   %cond = select i1 %cmp, i64 %y, i64 -1
   ret i64 %cond
@@ -266,6 +488,18 @@ define i64 @test9a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    retq
+;
+; MCU-LABEL: test9a:
+; MCU:       # BB#0:
+; MCU-NEXT:    orl %edx, %eax
+; MCU-NEXT:    movl $-1, %eax
+; MCU-NEXT:    movl $-1, %edx
+; MCU-NEXT:    je .LBB9_2
+; MCU-NEXT:  # BB#1:
+; MCU-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MCU-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; MCU-NEXT:  .LBB9_2:
+; MCU-NEXT:    retl
   %cmp = icmp eq i64 %x, 0
   %cond = select i1 %cmp, i64 -1, i64 %y
   ret i64 %cond
@@ -287,6 +521,19 @@ define i64 @test9b(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    retq
+;
+; MCU-LABEL: test9b:
+; MCU:       # BB#0:
+; MCU-NEXT:    orl %edx, %eax
+; MCU-NEXT:    movl $-1, %edx
+; MCU-NEXT:    je .LBB10_2
+; MCU-NEXT:  # BB#1:
+; MCU-NEXT:    xorl %edx, %edx
+; MCU-NEXT:  .LBB10_2:
+; MCU-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MCU-NEXT:    orl %edx, %eax
+; MCU-NEXT:    orl {{[0-9]+}}(%esp), %edx
+; MCU-NEXT:    retl
   %cmp = icmp eq i64 %x, 0
   %A = sext i1 %cmp to i64
   %cond = or i64 %y, %A
@@ -310,6 +557,18 @@ define i64 @test10(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    retq
+;
+; MCU-LABEL: test10:
+; MCU:       # BB#0:
+; MCU-NEXT:    orl %edx, %eax
+; MCU-NEXT:    movl $-1, %eax
+; MCU-NEXT:    movl $-1, %edx
+; MCU-NEXT:    je .LBB11_2
+; MCU-NEXT:  # BB#1:
+; MCU-NEXT:    xorl %edx, %edx
+; MCU-NEXT:    movl $1, %eax
+; MCU-NEXT:  .LBB11_2:
+; MCU-NEXT:    retl
   %cmp = icmp eq i64 %x, 0
   %cond = select i1 %cmp, i64 -1, i64 1
   ret i64 %cond
@@ -323,6 +582,19 @@ define i64 @test11(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ; CHECK-NEXT:    notq %rax
 ; CHECK-NEXT:    orq %rsi, %rax
 ; CHECK-NEXT:    retq
+;
+; MCU-LABEL: test11:
+; MCU:       # BB#0:
+; MCU-NEXT:    orl %edx, %eax
+; MCU-NEXT:    je .LBB12_1
+; MCU-NEXT:  # BB#2:
+; MCU-NEXT:    movl $-1, %eax
+; MCU-NEXT:    movl $-1, %edx
+; MCU-NEXT:    retl
+; MCU-NEXT:  .LBB12_1:
+; MCU-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MCU-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; MCU-NEXT:    retl
   %cmp = icmp eq i64 %x, 0
   %cond = select i1 %cmp, i64 %y, i64 -1
   ret i64 %cond
@@ -336,6 +608,18 @@ define i64 @test11a(i64 %x, i64 %y) nounwind readnone ssp noredzone {
 ; CHECK-NEXT:    notq %rax
 ; CHECK-NEXT:    orq %rsi, %rax
 ; CHECK-NEXT:    retq
+;
+; MCU-LABEL: test11a:
+; MCU:       # BB#0:
+; MCU-NEXT:    orl %edx, %eax
+; MCU-NEXT:    movl $-1, %eax
+; MCU-NEXT:    movl $-1, %edx
+; MCU-NEXT:    jne .LBB13_2
+; MCU-NEXT:  # BB#1:
+; MCU-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; MCU-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; MCU-NEXT:  .LBB13_2:
+; MCU-NEXT:    retl
   %cmp = icmp ne i64 %x, 0
   %cond = select i1 %cmp, i64 -1, i64 %y
   ret i64 %cond
@@ -362,6 +646,39 @@ define noalias i8* @test12(i64 %count) nounwind ssp noredzone {
 ; ATOM-NEXT:    movq $-1, %rdi
 ; ATOM-NEXT:    cmovnoq %rax, %rdi
 ; ATOM-NEXT:    jmp __Znam ## TAILCALL
+;
+; MCU-LABEL: test12:
+; MCU:       # BB#0: # %entry
+; MCU-NEXT:    pushl %ebp
+; MCU-NEXT:    pushl %ebx
+; MCU-NEXT:    pushl %edi
+; MCU-NEXT:    pushl %esi
+; MCU-NEXT:    movl %edx, %ebx
+; MCU-NEXT:    movl %eax, %ebp
+; MCU-NEXT:    movl $4, %ecx
+; MCU-NEXT:    mull %ecx
+; MCU-NEXT:    movl %eax, %esi
+; MCU-NEXT:    leal (%edx,%ebx,4), %edi
+; MCU-NEXT:    movl %edi, %edx
+; MCU-NEXT:    pushl $0
+; MCU-NEXT:    pushl $4
+; MCU-NEXT:    calll __udivdi3
+; MCU-NEXT:    addl $8, %esp
+; MCU-NEXT:    xorl %ebx, %edx
+; MCU-NEXT:    xorl %ebp, %eax
+; MCU-NEXT:    orl %edx, %eax
+; MCU-NEXT:    movl $-1, %eax
+; MCU-NEXT:    movl $-1, %edx
+; MCU-NEXT:    jne .LBB14_2
+; MCU-NEXT:  # BB#1: # %entry
+; MCU-NEXT:    movl %esi, %eax
+; MCU-NEXT:    movl %edi, %edx
+; MCU-NEXT:  .LBB14_2: # %entry
+; MCU-NEXT:    popl %esi
+; MCU-NEXT:    popl %edi
+; MCU-NEXT:    popl %ebx
+; MCU-NEXT:    popl %ebp
+; MCU-NEXT:    jmp _Znam # TAILCALL
 entry:
   %A = tail call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %count, i64 4)
   %B = extractvalue { i64, i1 } %A, 1
@@ -389,6 +706,12 @@ define i32 @test13(i32 %a, i32 %b) nounwind {
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    retq
+;
+; MCU-LABEL: test13:
+; MCU:       # BB#0:
+; MCU-NEXT:    cmpl %edx, %eax
+; MCU-NEXT:    sbbl %eax, %eax
+; MCU-NEXT:    retl
   %c = icmp ult i32 %a, %b
   %d = sext i1 %c to i32
   ret i32 %d
@@ -410,6 +733,13 @@ define i32 @test14(i32 %a, i32 %b) nounwind {
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    retq
+;
+; MCU-LABEL: test14:
+; MCU:       # BB#0:
+; MCU-NEXT:    cmpl %edx, %eax
+; MCU-NEXT:    sbbl %eax, %eax
+; MCU-NEXT:    notl %eax
+; MCU-NEXT:    retl
   %c = icmp uge i32 %a, %b
   %d = sext i1 %c to i32
   ret i32 %d
@@ -432,6 +762,12 @@ define i32 @test15(i32 %x) nounwind {
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    retq
+;
+; MCU-LABEL: test15:
+; MCU:       # BB#0: # %entry
+; MCU-NEXT:    negl %eax
+; MCU-NEXT:    sbbl %eax, %eax
+; MCU-NEXT:    retl
 entry:
   %cmp = icmp ne i32 %x, 0
   %sub = sext i1 %cmp to i32
@@ -454,6 +790,17 @@ define i64 @test16(i64 %x) nounwind uwtable readnone ssp {
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    retq
+;
+; MCU-LABEL: test16:
+; MCU:       # BB#0: # %entry
+; MCU-NEXT:    orl %edx, %eax
+; MCU-NEXT:    movl $-1, %eax
+; MCU-NEXT:    jne .LBB18_2
+; MCU-NEXT:  # BB#1: # %entry
+; MCU-NEXT:    xorl %eax, %eax
+; MCU-NEXT:  .LBB18_2: # %entry
+; MCU-NEXT:    movl %eax, %edx
+; MCU-NEXT:    retl
 entry:
   %cmp = icmp ne i64 %x, 0
   %conv1 = sext i1 %cmp to i64
@@ -476,6 +823,12 @@ define i16 @test17(i16 %x) nounwind {
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    retq
+;
+; MCU-LABEL: test17:
+; MCU:       # BB#0: # %entry
+; MCU-NEXT:    negw %ax
+; MCU-NEXT:    sbbw %ax, %ax
+; MCU-NEXT:    retl
 entry:
   %cmp = icmp ne i16 %x, 0
   %sub = sext i1 %cmp to i16
@@ -498,6 +851,16 @@ define i8 @test18(i32 %x, i8 zeroext %a, i8 zeroext %b) nounwind {
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    nop
 ; ATOM-NEXT:    retq
+;
+; MCU-LABEL: test18:
+; MCU:       # BB#0:
+; MCU-NEXT:    cmpl $15, %eax
+; MCU-NEXT:    jl .LBB20_2
+; MCU-NEXT:  # BB#1:
+; MCU-NEXT:    movl %ecx, %edx
+; MCU-NEXT:  .LBB20_2:
+; MCU-NEXT:    movl %edx, %eax
+; MCU-NEXT:    retl
   %cmp = icmp slt i32 %x, 15
   %sel = select i1 %cmp, i8 %a, i8 %b
   ret i8 %sel
@@ -511,6 +874,13 @@ define i32 @trunc_select_miscompile(i32 %a, i1 zeroext %cc) {
 ; CHECK-NEXT:    shll %cl, %edi
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    retq
+;
+; MCU-LABEL: trunc_select_miscompile:
+; MCU:       # BB#0:
+; MCU-NEXT:    orb $2, %dl
+; MCU-NEXT:    movl %edx, %ecx
+; MCU-NEXT:    shll %cl, %eax
+; MCU-NEXT:    retl
   %tmp1 = select i1 %cc, i32 3, i32 2
   %tmp2 = shl i32 %a, %tmp1
   ret i32 %tmp2
@@ -545,6 +915,23 @@ define void @clamp_i8(i32 %src, i8* %dst) {
 ; ATOM-NEXT:  LBB22_2:
 ; ATOM-NEXT:    movb %cl, (%rsi)
 ; ATOM-NEXT:    retq
+;
+; MCU-LABEL: clamp_i8:
+; MCU:       # BB#0:
+; MCU-NEXT:    cmpl $127, %eax
+; MCU-NEXT:    movl $127, %ecx
+; MCU-NEXT:    jg .LBB22_2
+; MCU-NEXT:  # BB#1:
+; MCU-NEXT:    movl %eax, %ecx
+; MCU-NEXT:  .LBB22_2:
+; MCU-NEXT:    cmpl $-128, %ecx
+; MCU-NEXT:    movb $-128, %al
+; MCU-NEXT:    jl .LBB22_4
+; MCU-NEXT:  # BB#3:
+; MCU-NEXT:    movl %ecx, %eax
+; MCU-NEXT:  .LBB22_4:
+; MCU-NEXT:    movb %al, (%edx)
+; MCU-NEXT:    retl
   %cmp = icmp sgt i32 %src, 127
   %sel1 = select i1 %cmp, i32 127, i32 %src
   %cmp1 = icmp slt i32 %sel1, -128
@@ -577,6 +964,23 @@ define void @clamp(i32 %src, i16* %dst) {
 ; ATOM-NEXT:    cmovgew %ax, %cx
 ; ATOM-NEXT:    movw %cx, (%rsi)
 ; ATOM-NEXT:    retq
+;
+; MCU-LABEL: clamp:
+; MCU:       # BB#0:
+; MCU-NEXT:    cmpl $32767, %eax # imm = 0x7FFF
+; MCU-NEXT:    movl $32767, %ecx # imm = 0x7FFF
+; MCU-NEXT:    jg .LBB23_2
+; MCU-NEXT:  # BB#1:
+; MCU-NEXT:    movl %eax, %ecx
+; MCU-NEXT:  .LBB23_2:
+; MCU-NEXT:    cmpl $-32768, %ecx # imm = 0x8000
+; MCU-NEXT:    movw $-32768, %ax # imm = 0x8000
+; MCU-NEXT:    jl .LBB23_4
+; MCU-NEXT:  # BB#3:
+; MCU-NEXT:    movl %ecx, %eax
+; MCU-NEXT:  .LBB23_4:
+; MCU-NEXT:    movw %ax, (%edx)
+; MCU-NEXT:    retl
   %cmp = icmp sgt i32 %src, 32767
   %sel1 = select i1 %cmp, i32 32767, i32 %src
   %cmp1 = icmp slt i32 %sel1, -32768
@@ -613,6 +1017,33 @@ define void @test19() {
 ; CHECK-NEXT:    jp LBB24_3
 ; CHECK-NEXT:  ## BB#4: ## %CF244
 ; CHECK-NEXT:    retq
+;
+; MCU-LABEL: test19:
+; MCU:       # BB#0: # %BB
+; MCU-NEXT:    movl $-1, %ecx
+; MCU-NEXT:    movb $1, %al
+; MCU-NEXT:    .p2align 4, 0x90
+; MCU-NEXT:  .LBB24_1: # %CF
+; MCU-NEXT:    # =>This Inner Loop Header: Depth=1
+; MCU-NEXT:    testb %al, %al
+; MCU-NEXT:    jne .LBB24_1
+; MCU-NEXT:  # BB#2: # %CF250
+; MCU-NEXT:    # in Loop: Header=BB24_1 Depth=1
+; MCU-NEXT:    jne .LBB24_1
+; MCU-NEXT:  # BB#3: # %CF242.preheader
+; MCU-NEXT:    fldz
+; MCU-NEXT:    .p2align 4, 0x90
+; MCU-NEXT:  .LBB24_4: # %CF242
+; MCU-NEXT:    # =>This Inner Loop Header: Depth=1
+; MCU-NEXT:    cmpl %eax, %ecx
+; MCU-NEXT:    fucom %st(0)
+; MCU-NEXT:    fnstsw %ax
+; MCU-NEXT:    # kill: %AH<def> %AH<kill> %AX<kill>
+; MCU-NEXT:    sahf
+; MCU-NEXT:    jp .LBB24_4
+; MCU-NEXT:  # BB#5: # %CF244
+; MCU-NEXT:    fstp %st(0)
+; MCU-NEXT:    retl
 BB:
   br label %CF
 
@@ -639,10 +1070,22 @@ CF244:
 
 define i16 @select_xor_1(i16 %A, i8 %cond) {
 ; CHECK-LABEL: select_xor_1:
-; MCU:    andl    $1, %edx
-; MCU-NEXT:    negl    %edx
-; MCU-NEXT:    andl    $43, %edx
-; MCU-NEXT:    xorl    %edx, %eax
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    xorl $43, %eax
+; CHECK-NEXT:    testb $1, %sil
+; CHECK-NEXT:    cmovnew %ax, %di
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    retq
+;
+; MCU-LABEL: select_xor_1:
+; MCU:       # BB#0: # %entry
+; MCU-NEXT:    andl $1, %edx
+; MCU-NEXT:    negl %edx
+; MCU-NEXT:    andl $43, %edx
+; MCU-NEXT:    xorl %edx, %eax
+; MCU-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; MCU-NEXT:    retl
 entry:
  %and = and i8 %cond, 1
  %cmp10 = icmp eq i8 %and, 0
@@ -653,10 +1096,20 @@ entry:
 
 define i32 @select_xor_2(i32 %A, i32 %B, i8 %cond) {
 ; CHECK-LABEL: select_xor_2:
-; MCU:    andl $1, %ecx
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    xorl %edi, %esi
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovel %edi, %esi
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
+;
+; MCU-LABEL: select_xor_2:
+; MCU:       # BB#0: # %entry
+; MCU-NEXT:    andl $1, %ecx
 ; MCU-NEXT:    negl %ecx
 ; MCU-NEXT:    andl %edx, %ecx
 ; MCU-NEXT:    xorl %ecx, %eax
+; MCU-NEXT:    retl
 entry:
  %and = and i8 %cond, 1
  %cmp10 = icmp eq i8 %and, 0
@@ -667,10 +1120,20 @@ entry:
 
 define i32 @select_or(i32 %A, i32 %B, i8 %cond) {
 ; CHECK-LABEL: select_or:
-; MCU:    andl $1, %ecx
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    orl %edi, %esi
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovel %edi, %esi
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
+;
+; MCU-LABEL: select_or:
+; MCU:       # BB#0: # %entry
+; MCU-NEXT:    andl $1, %ecx
 ; MCU-NEXT:    negl %ecx
 ; MCU-NEXT:    andl %edx, %ecx
 ; MCU-NEXT:    orl %ecx, %eax
+; MCU-NEXT:    retl
 entry:
  %and = and i8 %cond, 1
  %cmp10 = icmp eq i8 %and, 0
@@ -681,14 +1144,24 @@ entry:
 
 define i32 @select_or_1(i32 %A, i32 %B, i32 %cond) {
 ; CHECK-LABEL: select_or_1:
-; MCU:    andl $1, %ecx
+; CHECK:       ## BB#0: ## %entry
+; CHECK-NEXT:    orl %edi, %esi
+; CHECK-NEXT:    testb $1, %dl
+; CHECK-NEXT:    cmovel %edi, %esi
+; CHECK-NEXT:    movl %esi, %eax
+; CHECK-NEXT:    retq
+;
+; MCU-LABEL: select_or_1:
+; MCU:       # BB#0: # %entry
+; MCU-NEXT:    andl $1, %ecx
 ; MCU-NEXT:    negl %ecx
 ; MCU-NEXT:    andl %edx, %ecx
 ; MCU-NEXT:    orl %ecx, %eax
+; MCU-NEXT:    retl
 entry:
  %and = and i32 %cond, 1
  %cmp10 = icmp eq i32 %and, 0
  %0 = or i32 %B, %A
  %1 = select i1 %cmp10, i32 %A, i32 %0
  ret i32 %1
-}
\ No newline at end of file
+}
diff --git a/test/CodeGen/X86/select_const.ll b/test/CodeGen/X86/select_const.ll
index 8c54685644c7cad52b6641488b3c0cdc4f476af8..a97e7c299e73dab14905af7f2cda3b4fc30e914a 100644
--- a/test/CodeGen/X86/select_const.ll
+++ b/test/CodeGen/X86/select_const.ll
@@ -1,6 +1,11 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
 
+; Select of constants: control flow / conditional moves can always be replaced by logic+math (but may not be worth it?).
+; Test the zeroext/signext variants of each pattern to see if that makes a difference.
+
+; select Cond, 0, 1 --> zext (!Cond)
+
 define i32 @select_0_or_1(i1 %cond) {
 ; CHECK-LABEL: select_0_or_1:
 ; CHECK:       # BB#0:
@@ -8,7 +13,6 @@ define i32 @select_0_or_1(i1 %cond) {
 ; CHECK-NEXT:    movzbl %dil, %eax
 ; CHECK-NEXT:    andl $1, %eax
 ; CHECK-NEXT:    retq
-;
   %sel = select i1 %cond, i32 0, i32 1
   ret i32 %sel
 }
@@ -19,18 +23,29 @@ define i32 @select_0_or_1_zeroext(i1 zeroext %cond) {
 ; CHECK-NEXT:    xorb $1, %dil
 ; CHECK-NEXT:    movzbl %dil, %eax
 ; CHECK-NEXT:    retq
-;
   %sel = select i1 %cond, i32 0, i32 1
   ret i32 %sel
 }
 
+define i32 @select_0_or_1_signext(i1 signext %cond) {
+; CHECK-LABEL: select_0_or_1_signext:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    notb %dil
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, i32 0, i32 1
+  ret i32 %sel
+}
+
+; select Cond, 1, 0 --> zext (Cond)
+
 define i32 @select_1_or_0(i1 %cond) {
 ; CHECK-LABEL: select_1_or_0:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    retq
-;
   %sel = select i1 %cond, i32 1, i32 0
   ret i32 %sel
 }
@@ -40,11 +55,22 @@ define i32 @select_1_or_0_zeroext(i1 zeroext %cond) {
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    movzbl %dil, %eax
 ; CHECK-NEXT:    retq
-;
   %sel = select i1 %cond, i32 1, i32 0
   ret i32 %sel
 }
 
+define i32 @select_1_or_0_signext(i1 signext %cond) {
+; CHECK-LABEL: select_1_or_0_signext:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andb $1, %dil
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, i32 1, i32 0
+  ret i32 %sel
+}
+
+; select Cond, 0, -1 --> sext (!Cond)
+
 define i32 @select_0_or_neg1(i1 %cond) {
 ; CHECK-LABEL: select_0_or_neg1:
 ; CHECK:       # BB#0:
@@ -52,7 +78,6 @@ define i32 @select_0_or_neg1(i1 %cond) {
 ; CHECK-NEXT:    andl $1, %edi
 ; CHECK-NEXT:    leal -1(%rdi), %eax
 ; CHECK-NEXT:    retq
-;
   %sel = select i1 %cond, i32 0, i32 -1
   ret i32 %sel
 }
@@ -63,20 +88,30 @@ define i32 @select_0_or_neg1_zeroext(i1 zeroext %cond) {
 ; CHECK-NEXT:    movzbl %dil, %eax
 ; CHECK-NEXT:    decl %eax
 ; CHECK-NEXT:    retq
-;
   %sel = select i1 %cond, i32 0, i32 -1
   ret i32 %sel
 }
 
+define i32 @select_0_or_neg1_signext(i1 signext %cond) {
+; CHECK-LABEL: select_0_or_neg1_signext:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andb $1, %dil
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    decl %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, i32 0, i32 -1
+  ret i32 %sel
+}
+
+; select Cond, -1, 0 --> sext (Cond)
+
 define i32 @select_neg1_or_0(i1 %cond) {
 ; CHECK-LABEL: select_neg1_or_0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    testb $1, %dil
-; CHECK-NEXT:    movl $-1, %eax
-; CHECK-NEXT:    cmovel %ecx, %eax
+; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    negl %edi
+; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    retq
-;
   %sel = select i1 %cond, i32 -1, i32 0
   ret i32 %sel
 }
@@ -84,16 +119,133 @@ define i32 @select_neg1_or_0(i1 %cond) {
 define i32 @select_neg1_or_0_zeroext(i1 zeroext %cond) {
 ; CHECK-LABEL: select_neg1_or_0_zeroext:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    xorl %ecx, %ecx
-; CHECK-NEXT:    testb %dil, %dil
-; CHECK-NEXT:    movl $-1, %eax
-; CHECK-NEXT:    cmovel %ecx, %eax
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    negl %eax
 ; CHECK-NEXT:    retq
-;
   %sel = select i1 %cond, i32 -1, i32 0
   ret i32 %sel
 }
 
+define i32 @select_neg1_or_0_signext(i1 signext %cond) {
+; CHECK-LABEL: select_neg1_or_0_signext:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movsbl %dil, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, i32 -1, i32 0
+  ret i32 %sel
+}
+
+; select Cond, C+1, C --> add (zext Cond), C
+
+define i32 @select_Cplus1_C(i1 %cond) {
+; CHECK-LABEL: select_Cplus1_C:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    leal 41(%rdi), %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, i32 42, i32 41
+  ret i32 %sel
+}
+
+define i32 @select_Cplus1_C_zeroext(i1 zeroext %cond) {
+; CHECK-LABEL: select_Cplus1_C_zeroext:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    addl $41, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, i32 42, i32 41
+  ret i32 %sel
+}
+
+define i32 @select_Cplus1_C_signext(i1 signext %cond) {
+; CHECK-LABEL: select_Cplus1_C_signext:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andb $1, %dil
+; CHECK-NEXT:    movzbl %dil, %eax
+; CHECK-NEXT:    addl $41, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, i32 42, i32 41
+  ret i32 %sel
+}
+
+; select Cond, C, C+1 --> add (sext Cond), C+1
+
+define i32 @select_C_Cplus1(i1 %cond) {
+; CHECK-LABEL: select_C_Cplus1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    movl $42, %eax
+; CHECK-NEXT:    subl %edi, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, i32 41, i32 42
+  ret i32 %sel
+}
+
+define i32 @select_C_Cplus1_zeroext(i1 zeroext %cond) {
+; CHECK-LABEL: select_C_Cplus1_zeroext:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    movzbl %dil, %ecx
+; CHECK-NEXT:    movl $42, %eax
+; CHECK-NEXT:    subl %ecx, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, i32 41, i32 42
+  ret i32 %sel
+}
+
+define i32 @select_C_Cplus1_signext(i1 signext %cond) {
+; CHECK-LABEL: select_C_Cplus1_signext:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andb $1, %dil
+; CHECK-NEXT:    movzbl %dil, %ecx
+; CHECK-NEXT:    movl $42, %eax
+; CHECK-NEXT:    subl %ecx, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, i32 41, i32 42
+  ret i32 %sel
+}
+
+; In general, select of 2 constants could be:
+; select Cond, C1, C2 --> add (mul (zext Cond), C1-C2), C2 --> add (and (sext Cond), C1-C2), C2
+
+define i32 @select_C1_C2(i1 %cond) {
+; CHECK-LABEL: select_C1_C2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    testb $1, %dil
+; CHECK-NEXT:    movl $421, %ecx # imm = 0x1A5
+; CHECK-NEXT:    movl $42, %eax
+; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, i32 421, i32 42
+  ret i32 %sel
+}
+
+define i32 @select_C1_C2_zeroext(i1 zeroext %cond) {
+; CHECK-LABEL: select_C1_C2_zeroext:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    testb %dil, %dil
+; CHECK-NEXT:    movl $421, %ecx # imm = 0x1A5
+; CHECK-NEXT:    movl $42, %eax
+; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, i32 421, i32 42
+  ret i32 %sel
+}
+
+define i32 @select_C1_C2_signext(i1 signext %cond) {
+; CHECK-LABEL: select_C1_C2_signext:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    testb $1, %dil
+; CHECK-NEXT:    movl $421, %ecx # imm = 0x1A5
+; CHECK-NEXT:    movl $42, %eax
+; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, i32 421, i32 42
+  ret i32 %sel
+}
+
+; select (x == 2), 2, (x + 1) --> select (x == 2), x, (x + 1)
+
 define i64 @select_2_or_inc(i64 %x) {
 ; CHECK-LABEL: select_2_or_inc:
 ; CHECK:       # BB#0:
@@ -101,10 +253,66 @@ define i64 @select_2_or_inc(i64 %x) {
 ; CHECK-NEXT:    cmpq $2, %rdi
 ; CHECK-NEXT:    cmoveq %rdi, %rax
 ; CHECK-NEXT:    retq
-;
   %cmp = icmp eq i64 %x, 2
   %add = add i64 %x, 1
   %retval.0 = select i1 %cmp, i64 2, i64 %add
   ret i64 %retval.0
 }
 
+define <4 x i32> @sel_constants_add_constant_vec(i1 %cond) {
+; CHECK-LABEL: sel_constants_add_constant_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    testb $1, %dil
+; CHECK-NEXT:    jne .LBB22_1
+; CHECK-NEXT:  # BB#2:
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [12,13,14,15]
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB22_1:
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [4294967293,14,4,4]
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, <4 x i32> <i32 -4, i32 12, i32 1, i32 0>, <4 x i32> <i32 11, i32 11, i32 11, i32 11>
+  %bo = add <4 x i32> %sel, <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x i32> %bo
+}
+
+define <2 x double> @sel_constants_fmul_constant_vec(i1 %cond) {
+; CHECK-LABEL: sel_constants_fmul_constant_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    testb $1, %dil
+; CHECK-NEXT:    jne .LBB23_1
+; CHECK-NEXT:  # BB#2:
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [1.188300e+02,3.454000e+01]
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB23_1:
+; CHECK-NEXT:    movaps {{.*#+}} xmm0 = [-2.040000e+01,3.768000e+01]
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, <2 x double> <double -4.0, double 12.0>, <2 x double> <double 23.3, double 11.0>
+  %bo = fmul <2 x double> %sel, <double 5.1, double 3.14>
+  ret <2 x double> %bo
+}
+
+; 4294967297 = 0x100000001.
+; This becomes an opaque constant via ConstantHoisting, so we don't fold it into the select.
+
+define i64 @opaque_constant(i1 %cond, i64 %x) {
+; CHECK-LABEL: opaque_constant:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    testb $1, %dil
+; CHECK-NEXT:    movl $23, %ecx
+; CHECK-NEXT:    movq $-4, %rax
+; CHECK-NEXT:    cmoveq %rcx, %rax
+; CHECK-NEXT:    movabsq $4294967297, %rcx # imm = 0x100000001
+; CHECK-NEXT:    andq %rcx, %rax
+; CHECK-NEXT:    xorl %edx, %edx
+; CHECK-NEXT:    cmpq %rcx, %rsi
+; CHECK-NEXT:    sete %dl
+; CHECK-NEXT:    subq %rdx, %rax
+; CHECK-NEXT:    retq
+  %sel = select i1 %cond, i64 -4, i64 23
+  %bo = and i64 %sel, 4294967297
+  %cmp = icmp eq i64 %x, 4294967297
+  %sext = sext i1 %cmp to i64
+  %add = add i64 %bo, %sext
+  ret i64 %add
+}
+
diff --git a/test/CodeGen/X86/setcc-logic.ll b/test/CodeGen/X86/setcc-logic.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4d1e5ba16540e4806c810d83e8c4c46c9b5eb11b
--- /dev/null
+++ b/test/CodeGen/X86/setcc-logic.ll
@@ -0,0 +1,482 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
+
+define zeroext i1 @all_bits_clear(i32 %P, i32 %Q) nounwind {
+; CHECK-LABEL: all_bits_clear:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    orl %esi, %edi
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %a = icmp eq i32 %P, 0
+  %b = icmp eq i32 %Q, 0
+  %c = and i1 %a, %b
+  ret i1 %c
+}
+
+define zeroext i1 @all_sign_bits_clear(i32 %P, i32 %Q) nounwind {
+; CHECK-LABEL: all_sign_bits_clear:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    orl %esi, %edi
+; CHECK-NEXT:    setns %al
+; CHECK-NEXT:    retq
+  %a = icmp sgt i32 %P, -1
+  %b = icmp sgt i32 %Q, -1
+  %c = and i1 %a, %b
+  ret i1 %c
+}
+
+define zeroext i1 @all_bits_set(i32 %P, i32 %Q) nounwind {
+; CHECK-LABEL: all_bits_set:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andl %esi, %edi
+; CHECK-NEXT:    cmpl $-1, %edi
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %a = icmp eq i32 %P, -1
+  %b = icmp eq i32 %Q, -1
+  %c = and i1 %a, %b
+  ret i1 %c
+}
+
+define zeroext i1 @all_sign_bits_set(i32 %P, i32 %Q) nounwind {
+; CHECK-LABEL: all_sign_bits_set:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andl %esi, %edi
+; CHECK-NEXT:    shrl $31, %edi
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    retq
+  %a = icmp slt i32 %P, 0
+  %b = icmp slt i32 %Q, 0
+  %c = and i1 %a, %b
+  ret i1 %c
+}
+
+define zeroext i1 @any_bits_set(i32 %P, i32 %Q) nounwind {
+; CHECK-LABEL: any_bits_set:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    orl %esi, %edi
+; CHECK-NEXT:    setne %al
+; CHECK-NEXT:    retq
+  %a = icmp ne i32 %P, 0
+  %b = icmp ne i32 %Q, 0
+  %c = or i1 %a, %b
+  ret i1 %c
+}
+
+define zeroext i1 @any_sign_bits_set(i32 %P, i32 %Q) nounwind {
+; CHECK-LABEL: any_sign_bits_set:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    orl %esi, %edi
+; CHECK-NEXT:    shrl $31, %edi
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    retq
+  %a = icmp slt i32 %P, 0
+  %b = icmp slt i32 %Q, 0
+  %c = or i1 %a, %b
+  ret i1 %c
+}
+
+define zeroext i1 @any_bits_clear(i32 %P, i32 %Q) nounwind {
+; CHECK-LABEL: any_bits_clear:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    andl %esi, %edi
+; CHECK-NEXT:    cmpl $-1, %edi
+; CHECK-NEXT:    setne %al
+; CHECK-NEXT:    retq
+  %a = icmp ne i32 %P, -1
+  %b = icmp ne i32 %Q, -1
+  %c = or i1 %a, %b
+  ret i1 %c
+}
+
+define zeroext i1 @any_sign_bits_clear(i32 %P, i32 %Q) nounwind {
+; CHECK-LABEL: any_sign_bits_clear:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    testl %esi, %edi
+; CHECK-NEXT:    setns %al
+; CHECK-NEXT:    retq
+  %a = icmp sgt i32 %P, -1
+  %b = icmp sgt i32 %Q, -1
+  %c = or i1 %a, %b
+  ret i1 %c
+}
+
+; PR3351 - (P == 0) & (Q == 0) -> (P|Q) == 0
+define i32 @all_bits_clear_branch(i32* %P, i32* %Q) nounwind {
+; CHECK-LABEL: all_bits_clear_branch:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    orq %rsi, %rdi
+; CHECK-NEXT:    jne .LBB8_2
+; CHECK-NEXT:  # BB#1: # %bb1
+; CHECK-NEXT:    movl $4, %eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB8_2: # %return
+; CHECK-NEXT:    movl $192, %eax
+; CHECK-NEXT:    retq
+entry:
+  %a = icmp eq i32* %P, null
+  %b = icmp eq i32* %Q, null
+  %c = and i1 %a, %b
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  ret i32 4
+
+return:
+  ret i32 192
+}
+
+define i32 @all_sign_bits_clear_branch(i32 %P, i32 %Q) nounwind {
+; CHECK-LABEL: all_sign_bits_clear_branch:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    js .LBB9_3
+; CHECK-NEXT:  # BB#1: # %entry
+; CHECK-NEXT:    testl %esi, %esi
+; CHECK-NEXT:    js .LBB9_3
+; CHECK-NEXT:  # BB#2: # %bb1
+; CHECK-NEXT:    movl $4, %eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB9_3: # %return
+; CHECK-NEXT:    movl $192, %eax
+; CHECK-NEXT:    retq
+entry:
+  %a = icmp sgt i32 %P, -1
+  %b = icmp sgt i32 %Q, -1
+  %c = and i1 %a, %b
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  ret i32 4
+
+return:
+  ret i32 192
+}
+
+define i32 @all_bits_set_branch(i32 %P, i32 %Q) nounwind {
+; CHECK-LABEL: all_bits_set_branch:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    cmpl $-1, %edi
+; CHECK-NEXT:    jne .LBB10_3
+; CHECK-NEXT:  # BB#1: # %entry
+; CHECK-NEXT:    cmpl $-1, %esi
+; CHECK-NEXT:    jne .LBB10_3
+; CHECK-NEXT:  # BB#2: # %bb1
+; CHECK-NEXT:    movl $4, %eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB10_3: # %return
+; CHECK-NEXT:    movl $192, %eax
+; CHECK-NEXT:    retq
+entry:
+  %a = icmp eq i32 %P, -1
+  %b = icmp eq i32 %Q, -1
+  %c = and i1 %a, %b
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  ret i32 4
+
+return:
+  ret i32 192
+}
+
+define i32 @all_sign_bits_set_branch(i32 %P, i32 %Q) nounwind {
+; CHECK-LABEL: all_sign_bits_set_branch:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    jns .LBB11_3
+; CHECK-NEXT:  # BB#1: # %entry
+; CHECK-NEXT:    testl %esi, %esi
+; CHECK-NEXT:    jns .LBB11_3
+; CHECK-NEXT:  # BB#2: # %bb1
+; CHECK-NEXT:    movl $4, %eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB11_3: # %return
+; CHECK-NEXT:    movl $192, %eax
+; CHECK-NEXT:    retq
+entry:
+  %a = icmp slt i32 %P, 0
+  %b = icmp slt i32 %Q, 0
+  %c = and i1 %a, %b
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  ret i32 4
+
+return:
+  ret i32 192
+}
+
+; PR3351 - (P != 0) | (Q != 0) -> (P|Q) != 0
+define i32 @any_bits_set_branch(i32* %P, i32* %Q) nounwind {
+; CHECK-LABEL: any_bits_set_branch:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    orq %rsi, %rdi
+; CHECK-NEXT:    je .LBB12_2
+; CHECK-NEXT:  # BB#1: # %bb1
+; CHECK-NEXT:    movl $4, %eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB12_2: # %return
+; CHECK-NEXT:    movl $192, %eax
+; CHECK-NEXT:    retq
+entry:
+  %a = icmp ne i32* %P, null
+  %b = icmp ne i32* %Q, null
+  %c = or i1 %a, %b
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  ret i32 4
+
+return:
+  ret i32 192
+}
+
+define i32 @any_sign_bits_set_branch(i32 %P, i32 %Q) nounwind {
+; CHECK-LABEL: any_sign_bits_set_branch:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    js .LBB13_2
+; CHECK-NEXT:  # BB#1: # %entry
+; CHECK-NEXT:    testl %esi, %esi
+; CHECK-NEXT:    js .LBB13_2
+; CHECK-NEXT:  # BB#3: # %return
+; CHECK-NEXT:    movl $192, %eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB13_2: # %bb1
+; CHECK-NEXT:    movl $4, %eax
+; CHECK-NEXT:    retq
+entry:
+  %a = icmp slt i32 %P, 0
+  %b = icmp slt i32 %Q, 0
+  %c = or i1 %a, %b
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  ret i32 4
+
+return:
+  ret i32 192
+}
+
+define i32 @any_bits_clear_branch(i32 %P, i32 %Q) nounwind {
+; CHECK-LABEL: any_bits_clear_branch:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    cmpl $-1, %edi
+; CHECK-NEXT:    jne .LBB14_2
+; CHECK-NEXT:  # BB#1: # %entry
+; CHECK-NEXT:    cmpl $-1, %esi
+; CHECK-NEXT:    jne .LBB14_2
+; CHECK-NEXT:  # BB#3: # %return
+; CHECK-NEXT:    movl $192, %eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB14_2: # %bb1
+; CHECK-NEXT:    movl $4, %eax
+; CHECK-NEXT:    retq
+entry:
+  %a = icmp ne i32 %P, -1
+  %b = icmp ne i32 %Q, -1
+  %c = or i1 %a, %b
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  ret i32 4
+
+return:
+  ret i32 192
+}
+
+define i32 @any_sign_bits_clear_branch(i32 %P, i32 %Q) nounwind {
+; CHECK-LABEL: any_sign_bits_clear_branch:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    testl %edi, %edi
+; CHECK-NEXT:    jns .LBB15_2
+; CHECK-NEXT:  # BB#1: # %entry
+; CHECK-NEXT:    testl %esi, %esi
+; CHECK-NEXT:    jns .LBB15_2
+; CHECK-NEXT:  # BB#3: # %return
+; CHECK-NEXT:    movl $192, %eax
+; CHECK-NEXT:    retq
+; CHECK-NEXT:  .LBB15_2: # %bb1
+; CHECK-NEXT:    movl $4, %eax
+; CHECK-NEXT:    retq
+entry:
+  %a = icmp sgt i32 %P, -1
+  %b = icmp sgt i32 %Q, -1
+  %c = or i1 %a, %b
+  br i1 %c, label %bb1, label %return
+
+bb1:
+  ret i32 4
+
+return:
+  ret i32 192
+}
+
+define <4 x i1> @all_bits_clear_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
+; CHECK-LABEL: all_bits_clear_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a = icmp eq <4 x i32> %P, zeroinitializer
+  %b = icmp eq <4 x i32> %Q, zeroinitializer
+  %c = and <4 x i1> %a, %b
+  ret <4 x i1> %c
+}
+
+define <4 x i1> @all_sign_bits_clear_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
+; CHECK-LABEL: all_sign_bits_clear_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT:    pcmpgtd %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a = icmp sgt <4 x i32> %P, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %b = icmp sgt <4 x i32> %Q, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %c = and <4 x i1> %a, %b
+  ret <4 x i1> %c
+}
+
+define <4 x i1> @all_bits_set_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
+; CHECK-LABEL: all_bits_set_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pand %xmm1, %xmm0
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a = icmp eq <4 x i32> %P, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %b = icmp eq <4 x i32> %Q, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %c = and <4 x i1> %a, %b
+  ret <4 x i1> %c
+}
+
+define <4 x i1> @all_sign_bits_set_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
+; CHECK-LABEL: all_sign_bits_set_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pand %xmm1, %xmm0
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    pcmpgtd %xmm0, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a = icmp slt <4 x i32> %P, zeroinitializer
+  %b = icmp slt <4 x i32> %Q, zeroinitializer
+  %c = and <4 x i1> %a, %b
+  ret <4 x i1> %c
+}
+
+define <4 x i1> @any_bits_set_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
+; CHECK-LABEL: any_bits_set_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT:    pxor %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a = icmp ne <4 x i32> %P, zeroinitializer
+  %b = icmp ne <4 x i32> %Q, zeroinitializer
+  %c = or <4 x i1> %a, %b
+  ret <4 x i1> %c
+}
+
+define <4 x i1> @any_sign_bits_set_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
+; CHECK-LABEL: any_sign_bits_set_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    por %xmm1, %xmm0
+; CHECK-NEXT:    pxor %xmm1, %xmm1
+; CHECK-NEXT:    pcmpgtd %xmm0, %xmm1
+; CHECK-NEXT:    movdqa %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a = icmp slt <4 x i32> %P, zeroinitializer
+  %b = icmp slt <4 x i32> %Q, zeroinitializer
+  %c = or <4 x i1> %a, %b
+  ret <4 x i1> %c
+}
+
+define <4 x i1> @any_bits_clear_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
+; CHECK-LABEL: any_bits_clear_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pand %xmm1, %xmm0
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-NEXT:    pxor %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a = icmp ne <4 x i32> %P, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %b = icmp ne <4 x i32> %Q, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %c = or <4 x i1> %a, %b
+  ret <4 x i1> %c
+}
+
+define <4 x i1> @any_sign_bits_clear_vec(<4 x i32> %P, <4 x i32> %Q) nounwind {
+; CHECK-LABEL: any_sign_bits_clear_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pand %xmm1, %xmm0
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm1
+; CHECK-NEXT:    pcmpgtd %xmm1, %xmm0
+; CHECK-NEXT:    retq
+  %a = icmp sgt <4 x i32> %P, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %b = icmp sgt <4 x i32> %Q, <i32 -1, i32 -1, i32 -1, i32 -1>
+  %c = or <4 x i1> %a, %b
+  ret <4 x i1> %c
+}
+
+define zeroext i1 @ne_neg1_and_ne_zero(i64 %x) nounwind {
+; CHECK-LABEL: ne_neg1_and_ne_zero:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    incq %rdi
+; CHECK-NEXT:    cmpq $1, %rdi
+; CHECK-NEXT:    seta %al
+; CHECK-NEXT:    retq
+  %cmp1 = icmp ne i64 %x, -1
+  %cmp2 = icmp ne i64 %x, 0
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+; PR32401 - https://bugs.llvm.org/show_bug.cgi?id=32401
+
+define zeroext i1 @and_eq(i8 %a, i8 %b, i8 %c, i8 %d) nounwind {
+; CHECK-LABEL: and_eq:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    xorl %ecx, %edx
+; CHECK-NEXT:    orb %dl, %dil
+; CHECK-NEXT:    sete %al
+; CHECK-NEXT:    retq
+  %cmp1 = icmp eq i8 %a, %b
+  %cmp2 = icmp eq i8 %c, %d
+  %and = and i1 %cmp1, %cmp2
+  ret i1 %and
+}
+
+define zeroext i1 @or_ne(i8 %a, i8 %b, i8 %c, i8 %d) nounwind {
+; CHECK-LABEL: or_ne:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    xorl %ecx, %edx
+; CHECK-NEXT:    orb %dl, %dil
+; CHECK-NEXT:    setne %al
+; CHECK-NEXT:    retq
+  %cmp1 = icmp ne i8 %a, %b
+  %cmp2 = icmp ne i8 %c, %d
+  %or = or i1 %cmp1, %cmp2
+  ret i1 %or
+}
+
+; This should not be transformed because vector compares + bitwise logic are faster.
+
+define <4 x i1> @and_eq_vec(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) nounwind {
+; CHECK-LABEL: and_eq_vec:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    pcmpeqd %xmm1, %xmm0
+; CHECK-NEXT:    pcmpeqd %xmm3, %xmm2
+; CHECK-NEXT:    pand %xmm2, %xmm0
+; CHECK-NEXT:    retq
+  %cmp1 = icmp eq <4 x i32> %a, %b
+  %cmp2 = icmp eq <4 x i32> %c, %d
+  %and = and <4 x i1> %cmp1, %cmp2
+  ret <4 x i1> %and
+}
+
diff --git a/test/CodeGen/X86/setcc-lowering.ll b/test/CodeGen/X86/setcc-lowering.ll
index a5ff277580246924c9fc33037418fb3c373fcfd7..391f1cc9fb43f833e0af0594716c282a9143d220 100644
--- a/test/CodeGen/X86/setcc-lowering.ll
+++ b/test/CodeGen/X86/setcc-lowering.ll
@@ -20,6 +20,19 @@ define <8 x i16> @pr25080(<8 x i32> %a) {
 ; AVX-NEXT:    vpsraw $15, %xmm0, %xmm0
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
+;
+; KNL-32-LABEL: pr25080:
+; KNL-32:       # BB#0: # %entry
+; KNL-32-NEXT:    vpbroadcastd {{\.LCPI.*}}, %ymm1
+; KNL-32-NEXT:    vpand %ymm1, %ymm0, %ymm0
+; KNL-32-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; KNL-32-NEXT:    vpcmpeqd %zmm1, %zmm0, %k0
+; KNL-32-NEXT:    movb $15, %al
+; KNL-32-NEXT:    kmovw %eax, %k1
+; KNL-32-NEXT:    korw %k1, %k0, %k1
+; KNL-32-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; KNL-32-NEXT:    vpmovqw %zmm0, %xmm0
+; KNL-32-NEXT:    retl
 entry:
   %0 = trunc <8 x i32> %a to <8 x i23>
   %1 = icmp eq <8 x i23> %0, zeroinitializer
@@ -29,6 +42,18 @@ entry:
 }
 
 define void @pr26232(i64 %a) {
+; AVX-LABEL: pr26232:
+; AVX:       # BB#0: # %for_loop599.preheader
+; AVX-NEXT:    .p2align 4, 0x90
+; AVX-NEXT:  .LBB1_1: # %for_loop599
+; AVX-NEXT:    # =>This Inner Loop Header: Depth=1
+; AVX-NEXT:    cmpq $65536, %rdi # imm = 0x10000
+; AVX-NEXT:    setl -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    cmpw $0, -{{[0-9]+}}(%rsp)
+; AVX-NEXT:    jne .LBB1_1
+; AVX-NEXT:  # BB#2: # %for_exit600
+; AVX-NEXT:    retq
+;
 ; KNL-32-LABEL: pr26232:
 ; KNL-32:       # BB#0: # %for_loop599.preheader
 ; KNL-32-NEXT:    pushl %esi
diff --git a/test/CodeGen/X86/setcc-sentinals.ll b/test/CodeGen/X86/setcc-sentinals.ll
deleted file mode 100644
index d36e678c6048844250851acb99c693d90063c2d3..0000000000000000000000000000000000000000
--- a/test/CodeGen/X86/setcc-sentinals.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: llc < %s -mcpu=generic -march=x86-64 -asm-verbose=false | FileCheck %s
-
-define zeroext i1 @test0(i64 %x) nounwind {
-; CHECK-LABEL: test0:
-; CHECK-NEXT: incq %[[X:rdi|rcx]]
-; CHECK-NEXT: cmpq $1, %[[X]]
-; CHECK-NEXT: seta %al
-; CHECK-NEXT: ret
-  %cmp1 = icmp ne i64 %x, -1
-  %not.cmp = icmp ne i64 %x, 0
-  %.cmp1 = and i1 %cmp1, %not.cmp
-  ret i1 %.cmp1
-}
diff --git a/test/CodeGen/X86/setcc-wide-types.ll b/test/CodeGen/X86/setcc-wide-types.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b4ec03598aa4ef39c10c91c365dfac768d55f17e
--- /dev/null
+++ b/test/CodeGen/X86/setcc-wide-types.ll
@@ -0,0 +1,140 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefix=AVX2
+
+; Equality checks of 128/256-bit values can use PMOVMSK or PTEST to avoid scalarization.
+
+define i32 @ne_i128(<2 x i64> %x, <2 x i64> %y) {
+; SSE2-LABEL: ne_i128:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; SSE2-NEXT:    pmovmskb %xmm0, %ecx
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    cmpl $65535, %ecx # imm = 0xFFFF
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: ne_i128:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovmskb %xmm0, %ecx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    cmpl $65535, %ecx # imm = 0xFFFF
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    retq
+  %bcx = bitcast <2 x i64> %x to i128
+  %bcy = bitcast <2 x i64> %y to i128
+  %cmp = icmp ne i128 %bcx, %bcy
+  %zext = zext i1 %cmp to i32
+  ret i32 %zext
+}
+
+define i32 @eq_i128(<2 x i64> %x, <2 x i64> %y) {
+; SSE2-LABEL: eq_i128:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pcmpeqb %xmm1, %xmm0
+; SSE2-NEXT:    pmovmskb %xmm0, %ecx
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    cmpl $65535, %ecx # imm = 0xFFFF
+; SSE2-NEXT:    sete %al
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: eq_i128:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpcmpeqb %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpmovmskb %xmm0, %ecx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    cmpl $65535, %ecx # imm = 0xFFFF
+; AVX2-NEXT:    sete %al
+; AVX2-NEXT:    retq
+  %bcx = bitcast <2 x i64> %x to i128
+  %bcy = bitcast <2 x i64> %y to i128
+  %cmp = icmp eq i128 %bcx, %bcy
+  %zext = zext i1 %cmp to i32
+  ret i32 %zext
+}
+
+define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) {
+; SSE2-LABEL: ne_i256:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm4, %r8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm4, %r9
+; SSE2-NEXT:    movd %xmm0, %r10
+; SSE2-NEXT:    movd %xmm1, %rsi
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movd %xmm0, %rdi
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE2-NEXT:    movd %xmm0, %rax
+; SSE2-NEXT:    movd %xmm2, %rcx
+; SSE2-NEXT:    movd %xmm3, %rdx
+; SSE2-NEXT:    xorq %rsi, %rdx
+; SSE2-NEXT:    xorq %r10, %rcx
+; SSE2-NEXT:    orq %rdx, %rcx
+; SSE2-NEXT:    xorq %r9, %rax
+; SSE2-NEXT:    xorq %r8, %rdi
+; SSE2-NEXT:    orq %rax, %rdi
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    orq %rcx, %rdi
+; SSE2-NEXT:    setne %al
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: ne_i256:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    cmpl $-1, %ecx
+; AVX2-NEXT:    setne %al
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %bcx = bitcast <4 x i64> %x to i256
+  %bcy = bitcast <4 x i64> %y to i256
+  %cmp = icmp ne i256 %bcx, %bcy
+  %zext = zext i1 %cmp to i32
+  ret i32 %zext
+}
+
+define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) {
+; SSE2-LABEL: eq_i256:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE2-NEXT:    movd %xmm4, %r8
+; SSE2-NEXT:    pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movd %xmm4, %r9
+; SSE2-NEXT:    movd %xmm0, %r10
+; SSE2-NEXT:    movd %xmm1, %rsi
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
+; SSE2-NEXT:    movd %xmm0, %rdi
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; SSE2-NEXT:    movd %xmm0, %rax
+; SSE2-NEXT:    movd %xmm2, %rcx
+; SSE2-NEXT:    movd %xmm3, %rdx
+; SSE2-NEXT:    xorq %rsi, %rdx
+; SSE2-NEXT:    xorq %r10, %rcx
+; SSE2-NEXT:    orq %rdx, %rcx
+; SSE2-NEXT:    xorq %r9, %rax
+; SSE2-NEXT:    xorq %r8, %rdi
+; SSE2-NEXT:    orq %rax, %rdi
+; SSE2-NEXT:    xorl %eax, %eax
+; SSE2-NEXT:    orq %rcx, %rdi
+; SSE2-NEXT:    sete %al
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: eq_i256:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpcmpeqb %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    cmpl $-1, %ecx
+; AVX2-NEXT:    sete %al
+; AVX2-NEXT:    vzeroupper
+; AVX2-NEXT:    retq
+  %bcx = bitcast <4 x i64> %x to i256
+  %bcy = bitcast <4 x i64> %y to i256
+  %cmp = icmp eq i256 %bcx, %bcy
+  %zext = zext i1 %cmp to i32
+  ret i32 %zext
+}
+
diff --git a/test/CodeGen/X86/setcc.ll b/test/CodeGen/X86/setcc.ll
index 268460f999b83894088ea00f11470996302bbc79..fab4f41372511f0e2fd30a2999001be1989404af 100644
--- a/test/CodeGen/X86/setcc.ll
+++ b/test/CodeGen/X86/setcc.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
 ; rdar://7329206
 
@@ -13,7 +13,6 @@ define zeroext i16 @t1(i16 zeroext %x) nounwind readnone ssp {
 ; CHECK-NEXT:    seta %al
 ; CHECK-NEXT:    shll $5, %eax
 ; CHECK-NEXT:    retq
-;
   %t0 = icmp ugt i16 %x, 26
   %if = select i1 %t0, i16 32, i16 0
   ret i16 %if
@@ -22,11 +21,11 @@ define zeroext i16 @t1(i16 zeroext %x) nounwind readnone ssp {
 define zeroext i16 @t2(i16 zeroext %x) nounwind readnone ssp {
 ; CHECK-LABEL: t2:
 ; CHECK:       ## BB#0:
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    cmpl $26, %edi
-; CHECK-NEXT:    sbbl %eax, %eax
-; CHECK-NEXT:    andl $32, %eax
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    shll $5, %eax
 ; CHECK-NEXT:    retq
-;
   %t0 = icmp ult i16 %x, 26
   %if = select i1 %t0, i16 32, i16 0
   ret i16 %if
@@ -35,11 +34,11 @@ define zeroext i16 @t2(i16 zeroext %x) nounwind readnone ssp {
 define i64 @t3(i64 %x) nounwind readnone ssp {
 ; CHECK-LABEL: t3:
 ; CHECK:       ## BB#0:
+; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    cmpq $18, %rdi
-; CHECK-NEXT:    sbbq %rax, %rax
-; CHECK-NEXT:    andl $64, %eax
+; CHECK-NEXT:    setb %al
+; CHECK-NEXT:    shlq $6, %rax
 ; CHECK-NEXT:    retq
-;
   %t0 = icmp ult i64 %x, 18
   %if = select i1 %t0, i64 64, i64 0
   ret i64 %if
@@ -52,11 +51,10 @@ define i32 @t4(i32 %a) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    movq _v4@{{.*}}(%rip), %rax
 ; CHECK-NEXT:    cmpl $1, (%rax)
-; CHECK-NEXT:    sbbl %eax, %eax
-; CHECK-NEXT:    andl $32768, %eax ## imm = 0x8000
-; CHECK-NEXT:    leal 65536(%rax,%rax), %eax
+; CHECK-NEXT:    movw $1, %ax
+; CHECK-NEXT:    adcw $0, %ax
+; CHECK-NEXT:    shll $16, %eax
 ; CHECK-NEXT:    retq
-;
   %t0 = load i32, i32* @v4, align 4
   %not.tobool = icmp eq i32 %t0, 0
   %conv.i = sext i1 %not.tobool to i16
@@ -73,7 +71,6 @@ define i8 @t5(i32 %a) #0 {
 ; CHECK-NEXT:    testl %edi, %edi
 ; CHECK-NEXT:    setns %al
 ; CHECK-NEXT:    retq
-;
   %.lobit = lshr i32 %a, 31
   %trunc = trunc i32 %.lobit to i8
   %.not = xor i8 %trunc, 1
@@ -86,7 +83,6 @@ define zeroext i1 @t6(i32 %a) #0 {
 ; CHECK-NEXT:    testl %edi, %edi
 ; CHECK-NEXT:    setns %al
 ; CHECK-NEXT:    retq
-;
   %.lobit = lshr i32 %a, 31
   %trunc = trunc i32 %.lobit to i1
   %.not = xor i1 %trunc, 1
diff --git a/test/CodeGen/X86/sext-i1.ll b/test/CodeGen/X86/sext-i1.ll
index 9b86cd0c9a2aca721f77cc8c551a4f9df2a808a3..8c92434db21a2d0be220517bdfa3d087de553318 100644
--- a/test/CodeGen/X86/sext-i1.ll
+++ b/test/CodeGen/X86/sext-i1.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i386-unknown-unknown -disable-cgp-branch-opts    | FileCheck %s --check-prefix=CHECK --check-prefix=X32
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -disable-cgp-branch-opts  | FileCheck %s --check-prefix=CHECK --check-prefix=X64
 
@@ -6,24 +6,34 @@
 ; PR6146
 
 define i32 @t1(i32 %x) nounwind readnone ssp {
-; CHECK-LABEL: t1:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    cmpl $1
-; CHECK-NEXT:    sbbl %eax, %eax
-; CHECK-NEXT:    ret
+; X32-LABEL: t1:
+; X32:       # BB#0:
+; X32-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    sbbl %eax, %eax
+; X32-NEXT:    retl
 ;
+; X64-LABEL: t1:
+; X64:       # BB#0:
+; X64-NEXT:    cmpl $1, %edi
+; X64-NEXT:    sbbl %eax, %eax
+; X64-NEXT:    retq
   %t0 = icmp eq i32 %x, 0
   %if = select i1 %t0, i32 -1, i32 0
   ret i32 %if
 }
 
 define i32 @t2(i32 %x) nounwind readnone ssp {
-; CHECK-LABEL: t2:
-; CHECK:       # BB#0:
-; CHECK-NEXT:    cmpl $1
-; CHECK-NEXT:    sbbl %eax, %eax
-; CHECK-NEXT:    ret
+; X32-LABEL: t2:
+; X32:       # BB#0:
+; X32-NEXT:    cmpl $1, {{[0-9]+}}(%esp)
+; X32-NEXT:    sbbl %eax, %eax
+; X32-NEXT:    retl
 ;
+; X64-LABEL: t2:
+; X64:       # BB#0:
+; X64-NEXT:    cmpl $1, %edi
+; X64-NEXT:    sbbl %eax, %eax
+; X64-NEXT:    retq
   %t0 = icmp eq i32 %x, 0
   %if = sext i1 %t0 to i32
   ret i32 %if
@@ -46,7 +56,6 @@ define i32 @t3() nounwind readonly {
 ; X64-NEXT:    cmpq %rax, %rax
 ; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    retq
-;
 entry:
   %not.tobool = icmp eq i32 undef, 0
   %cond = sext i1 %not.tobool to i32
@@ -80,7 +89,6 @@ define i32 @t4(i64 %x) nounwind readnone ssp {
 ; X64-NEXT:    cmpq $1, %rdi
 ; X64-NEXT:    sbbl %eax, %eax
 ; X64-NEXT:    retq
-;
   %t0 = icmp eq i64 %x, 0
   %t1 = sext i1 %t0 to i32
   ret i32 %t1
@@ -99,9 +107,70 @@ define i64 @t5(i32 %x) nounwind readnone ssp {
 ; X64-NEXT:    cmpl $1, %edi
 ; X64-NEXT:    sbbq %rax, %rax
 ; X64-NEXT:    retq
-;
   %t0 = icmp eq i32 %x, 0
   %t1 = sext i1 %t0 to i64
   ret i64 %t1
 }
 
+; sext (xor Bool, -1) --> sub (zext Bool), 1
+
+define i32 @select_0_or_1s(i1 %cond) {
+; X32-LABEL: select_0_or_1s:
+; X32:       # BB#0:
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    decl %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: select_0_or_1s:
+; X64:       # BB#0:
+; X64-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-NEXT:    andl $1, %edi
+; X64-NEXT:    leal -1(%rdi), %eax
+; X64-NEXT:    retq
+  %not = xor i1 %cond, 1
+  %sext = sext i1 %not to i32
+  ret i32 %sext
+}
+
+; sext (xor Bool, -1) --> sub (zext Bool), 1
+
+define i32 @select_0_or_1s_zeroext(i1 zeroext %cond) {
+; X32-LABEL: select_0_or_1s_zeroext:
+; X32:       # BB#0:
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    decl %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: select_0_or_1s_zeroext:
+; X64:       # BB#0:
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    decl %eax
+; X64-NEXT:    retq
+  %not = xor i1 %cond, 1
+  %sext = sext i1 %not to i32
+  ret i32 %sext
+}
+
+; sext (xor Bool, -1) --> sub (zext Bool), 1
+
+define i32 @select_0_or_1s_signext(i1 signext %cond) {
+; X32-LABEL: select_0_or_1s_signext:
+; X32:       # BB#0:
+; X32-NEXT:    movb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    andb $1, %al
+; X32-NEXT:    movzbl %al, %eax
+; X32-NEXT:    decl %eax
+; X32-NEXT:    retl
+;
+; X64-LABEL: select_0_or_1s_signext:
+; X64:       # BB#0:
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    decl %eax
+; X64-NEXT:    retq
+  %not = xor i1 %cond, 1
+  %sext = sext i1 %not to i32
+  ret i32 %sext
+}
+
diff --git a/test/CodeGen/X86/sfence.ll b/test/CodeGen/X86/sfence.ll
deleted file mode 100644
index 0c28407b31e952e37948167815afc6f9e2cf76f1..0000000000000000000000000000000000000000
--- a/test/CodeGen/X86/sfence.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep sfence
-
-declare void @llvm.x86.sse.sfence() nounwind
-
-define void @test() {
-  call void @llvm.x86.sse.sfence()
-  ret void
-}
diff --git a/test/CodeGen/X86/sha.ll b/test/CodeGen/X86/sha.ll
index fe42637bc538677e7858cb2fba9c5148ec9068a1..eb19664704919f103f4f0d4aaa10f8d04e23f2e2 100644
--- a/test/CodeGen/X86/sha.ll
+++ b/test/CodeGen/X86/sha.ll
@@ -86,7 +86,7 @@ entry:
   ; CHECK: test_sha256rnds2rr
   ; CHECK: movaps %xmm0, [[XMM_TMP1:%xmm[1-9][0-9]?]]
   ; CHECK: movaps %xmm2, %xmm0
-  ; CHECK: sha256rnds2 %xmm1, [[XMM_TMP1]]
+  ; CHECK: sha256rnds2 %xmm0, %xmm1, [[XMM_TMP1]]
 }
 
 define <4 x i32> @test_sha256rnds2rm(<4 x i32> %a, <4 x i32>* %b, <4 x i32> %c) nounwind uwtable {
@@ -97,7 +97,7 @@ entry:
   ; CHECK: test_sha256rnds2rm
   ; CHECK: movaps %xmm0, [[XMM_TMP2:%xmm[1-9][0-9]?]]
   ; CHECK: movaps %xmm1, %xmm0
-  ; CHECK: sha256rnds2 (%rdi), [[XMM_TMP2]]
+  ; CHECK: sha256rnds2 %xmm0, (%rdi), [[XMM_TMP2]]
 }
 
 declare <4 x i32> @llvm.x86.sha256msg1(<4 x i32>, <4 x i32>) nounwind readnone
@@ -136,4 +136,4 @@ entry:
   ret <4 x i32> %1
   ; CHECK: test_sha256msg2rm
   ; CHECK: sha256msg2 (%rdi), %xmm0
-}
\ No newline at end of file
+}
diff --git a/test/CodeGen/X86/shrink_vmul.ll b/test/CodeGen/X86/shrink_vmul.ll
index d7e99afb2f5051726f9fdddc0c2385d6a1a4ff38..930af226b953591fbc222be87f1e7aaa794cab40 100644
--- a/test/CodeGen/X86/shrink_vmul.ll
+++ b/test/CodeGen/X86/shrink_vmul.ll
@@ -449,7 +449,7 @@ define void @mul_2xi16_sext_zext(i8* nocapture readonly %a, i8* nocapture readon
 ; CHECK:       # BB#0: # %entry
 ; CHECK-NEXT:    movq {{.*}}(%rip), %rax
 ; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; CHECK-NEXT:    psrad $16, %xmm0
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; CHECK-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
@@ -835,7 +835,7 @@ define void @mul_2xi16_varconst4(i8* nocapture readonly %a, i64 %index) {
 ; CHECK:       # BB#0: # %entry
 ; CHECK-NEXT:    movq {{.*}}(%rip), %rax
 ; CHECK-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; CHECK-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; CHECK-NEXT:    psrad $16, %xmm0
 ; CHECK-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; CHECK-NEXT:    movl $32768, %ecx # imm = 0x8000
diff --git a/test/CodeGen/X86/shuffle-combine-crash-2.ll b/test/CodeGen/X86/shuffle-combine-crash-2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ea37d5b485312936e220aa11c51483ec30950a7c
--- /dev/null
+++ b/test/CodeGen/X86/shuffle-combine-crash-2.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64
+
+define <4 x i64> @fold_movsd_zero() {
+; X86-LABEL: fold_movsd_zero:
+; X86:       # BB#0:
+; X86-NEXT:    xorps %xmm0, %xmm0
+; X86-NEXT:    xorps %xmm1, %xmm1
+; X86-NEXT:    retl
+;
+; X64-LABEL: fold_movsd_zero:
+; X64:       # BB#0:
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    xorps %xmm1, %xmm1
+; X64-NEXT:    retq
+  %insert = insertelement <4 x i64> zeroinitializer, i64 0, i32 0
+  %shuffle = shufflevector <4 x i64> %insert, <4 x i64> zeroinitializer, <4 x i32> <i32 3, i32 5, i32 7, i32 1>
+  ret <4 x i64> %shuffle
+}
diff --git a/test/CodeGen/X86/shuffle-of-splat-multiuses.ll b/test/CodeGen/X86/shuffle-of-splat-multiuses.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d46082f20a45a2678e4c4ded3a03cfb85a452cda
--- /dev/null
+++ b/test/CodeGen/X86/shuffle-of-splat-multiuses.ll
@@ -0,0 +1,100 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX2
+; PR32449
+
+define <2 x double> @foo2(<2 x double> %v, <2 x double> *%p) nounwind {
+; AVX2-LABEL: foo2:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,1]
+; AVX2-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm1[1,0]
+; AVX2-NEXT:    vmovapd %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+  %res = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> <i32 1, i32 1>
+  %res1 = shufflevector<2 x double> %res, <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+  store <2 x double> %res, <2 x double>* %p
+  ret <2 x double> %res1
+}
+
+define <4 x double> @foo4(<4 x double> %v, <4 x double> *%p) nounwind {
+; AVX2-LABEL: foo4:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm1[2,0,2,3]
+; AVX2-NEXT:    vmovapd %ymm1, (%rdi)
+; AVX2-NEXT:    retq
+  %res = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  %res1 = shufflevector<4 x double> %res, <4 x double> undef, <4 x i32> <i32 2, i32 0, i32 undef, i32 undef>
+  store <4 x double> %res, <4 x double>* %p
+  ret <4 x double> %res1
+}
+
+define <8 x float> @foo8(<8 x float> %v, <8 x float> *%p) nounwind {
+; AVX2-LABEL: foo8:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2]
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm0 = <2,0,u,u,5,1,3,7>
+; AVX2-NEXT:    vpermps %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    vmovapd %ymm1, (%rdi)
+; AVX2-NEXT:    retq
+  %res = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> <i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5, i32 5>
+  %res1 = shufflevector<8 x float> %res, <8 x float> undef, <8 x i32> <i32 2, i32 0, i32 undef, i32 undef, i32 5, i32 1, i32 3, i32 7>
+  store <8 x float> %res, <8 x float>* %p
+  ret <8 x float> %res1
+}
+
+define <4 x i32> @undef_splatmask(<4 x i32> %v) nounwind {
+; AVX2-LABEL: undef_splatmask:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
+; AVX2-NEXT:    retq
+  %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef>
+  %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+  ret <4 x i32> %res1
+}
+
+define <4 x i32> @undef_splatmask2(<4 x i32> %v) nounwind {
+; AVX2-LABEL: undef_splatmask2:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3]
+; AVX2-NEXT:    retq
+  %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 1, i32 2, i32 undef>
+  %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+  ret <4 x i32> %res1
+}
+
+define <4 x i32> @undef_splatmask3(<4 x i32> %v) nounwind {
+; AVX2-LABEL: undef_splatmask3:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3]
+; AVX2-NEXT:    retq
+  %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef>
+  %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 3>
+  ret <4 x i32> %res1
+}
+
+define <4 x i32> @undef_splatmask4(<4 x i32> %v, <4 x i32>* %p) nounwind {
+; AVX2-LABEL: undef_splatmask4:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+  %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef>
+  %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+  store <4 x i32> %res, <4 x i32>* %p
+  ret <4 x i32> %res1
+}
+
+define <4 x i32> @undef_splatmask5(<4 x i32> %v, <4 x i32>* %p) nounwind {
+; AVX2-LABEL: undef_splatmask5:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpbroadcastq %xmm0, %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3]
+; AVX2-NEXT:    vmovdqa %xmm1, (%rdi)
+; AVX2-NEXT:    retq
+  %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 0, i32 undef>
+  %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 3>
+  store <4 x i32> %res, <4 x i32>* %p
+  ret <4 x i32> %res1
+}
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/test/CodeGen/X86/shuffle-vs-trunc-256.ll
index 893f96e6fb226ec6e6f2d01e44446f2c49c45fe6..b4ea9e2dc91951914252191b3f843fbdd5823c41 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-256.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-256.ll
@@ -31,6 +31,7 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_to_v16i8:
@@ -42,6 +43,7 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v16i8:
@@ -53,6 +55,7 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v16i8:
@@ -64,6 +67,7 @@ define void @shuffle_v32i8_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
 ; AVX512BWVL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BWVL-NEXT:    vmovdqu %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -89,6 +93,7 @@ define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
 ; AVX512F-NEXT:    vpmovsxwd (%rdi), %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_v16i16_to_v16i8:
@@ -96,6 +101,7 @@ define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
 ; AVX512VL-NEXT:    vpmovsxwd (%rdi), %zmm0
 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_v16i16_to_v16i8:
@@ -103,12 +109,14 @@ define void @trunc_v16i16_to_v16i8(<32 x i8>* %L, <16 x i8>* %S) nounwind {
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc_v16i16_to_v16i8:
 ; AVX512BWVL:       # BB#0:
 ; AVX512BWVL-NEXT:    vmovdqu (%rdi), %ymm0
 ; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %bc = bitcast <32 x i8> %vec to <16 x i16>
@@ -139,6 +147,7 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_to_v8i16:
@@ -153,6 +162,7 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v16i16_to_v8i16:
@@ -164,6 +174,7 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v16i16_to_v8i16:
@@ -178,6 +189,7 @@ define void @shuffle_v16i16_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BWVL-NEXT:    vmovdqu %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %L
   %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -189,7 +201,7 @@ define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
 ; AVX-LABEL: trunc_v8i32_to_v8i16:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX-NEXT:    vmovdqa %xmm0, (%rsi)
 ; AVX-NEXT:    vzeroupper
@@ -200,12 +212,14 @@ define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_v8i32_to_v8i16:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512VL-NEXT:    vpmovdw %ymm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_v8i32_to_v8i16:
@@ -213,12 +227,14 @@ define void @trunc_v8i32_to_v8i16(<16 x i16>* %L, <8 x i16>* %S) nounwind {
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i16:
 ; AVX512BWVL:       # BB#0:
 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BWVL-NEXT:    vpmovdw %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %L
   %bc = bitcast <16 x i16> %vec to <8 x i32>
@@ -243,6 +259,7 @@ define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
 ; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512F-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; AVX512F-NEXT:    vmovaps %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v8i32_to_v4i32:
@@ -251,6 +268,7 @@ define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; AVX512VL-NEXT:    vmovaps %xmm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v8i32_to_v4i32:
@@ -259,6 +277,7 @@ define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
 ; AVX512BW-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX512BW-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; AVX512BW-NEXT:    vmovaps %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v8i32_to_v4i32:
@@ -267,6 +286,7 @@ define void @shuffle_v8i32_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; AVX512BWVL-NEXT:    vmovaps %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %L
   %strided.vec = shufflevector <8 x i32> %vec, <8 x i32> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
@@ -288,12 +308,14 @@ define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
 ; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_v4i64_to_v4i32:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512VL-NEXT:    vpmovqd %ymm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_v4i64_to_v4i32:
@@ -301,12 +323,14 @@ define void @trunc_v4i64_to_v4i32(<8 x i32>* %L, <4 x i32>* %S) nounwind {
 ; AVX512BW-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i32:
 ; AVX512BWVL:       # BB#0:
 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BWVL-NEXT:    vpmovqd %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <8 x i32>, <8 x i32>* %L
   %bc = bitcast <8 x i32> %vec to <4 x i64>
@@ -337,6 +361,7 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_to_v8i8:
@@ -348,6 +373,7 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512VL-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v8i8:
@@ -359,6 +385,7 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v8i8:
@@ -373,6 +400,7 @@ define void @shuffle_v32i8_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
@@ -384,7 +412,7 @@ define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX-LABEL: trunc_v8i32_to_v8i8:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vmovq %xmm0, (%rsi)
@@ -397,12 +425,14 @@ define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_v8i32_to_v8i8:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512VL-NEXT:    vpmovdb %ymm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_v8i32_to_v8i8:
@@ -411,12 +441,14 @@ define void @trunc_v8i32_to_v8i8(<32 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc_v8i32_to_v8i8:
 ; AVX512BWVL:       # BB#0:
 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %bc = bitcast <32 x i8> %vec to <8 x i32>
@@ -449,6 +481,7 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
 ; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v16i16_to_v4i16:
@@ -457,6 +490,7 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; AVX512VL-NEXT:    vpmovdw %xmm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v16i16_to_v4i16:
@@ -469,6 +503,7 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
 ; AVX512BW-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX512BW-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v16i16_to_v4i16:
@@ -477,6 +512,7 @@ define void @shuffle_v16i16_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; AVX512BWVL-NEXT:    vpmovdw %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %L
   %strided.vec = shufflevector <16 x i16> %vec, <16 x i16> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
@@ -500,12 +536,14 @@ define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
 ; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_v4i64_to_v4i16:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512VL-NEXT:    vpmovqw %ymm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_v4i64_to_v4i16:
@@ -514,12 +552,14 @@ define void @trunc_v4i64_to_v4i16(<16 x i16>* %L, <4 x i16>* %S) nounwind {
 ; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i16:
 ; AVX512BWVL:       # BB#0:
 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BWVL-NEXT:    vpmovqw %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <16 x i16>, <16 x i16>* %L
   %bc = bitcast <16 x i16> %vec to <4 x i64>
@@ -550,6 +590,7 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i8_to_v4i8:
@@ -558,6 +599,7 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 ; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512VL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; AVX512VL-NEXT:    vpmovdb %xmm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i8_to_v4i8:
@@ -569,6 +611,7 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i8_to_v4i8:
@@ -577,6 +620,7 @@ define void @shuffle_v32i8_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 ; AVX512BWVL-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX512BWVL-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; AVX512BWVL-NEXT:    vpmovdb %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %strided.vec = shufflevector <32 x i8> %vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
@@ -600,12 +644,14 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 ; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512F-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_v4i64_to_v4i8:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512VL-NEXT:    vpmovqb %ymm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_v4i64_to_v4i8:
@@ -614,12 +660,14 @@ define void @trunc_v4i64_to_v4i8(<32 x i8>* %L, <4 x i8>* %S) nounwind {
 ; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    vmovd %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc_v4i64_to_v4i8:
 ; AVX512BWVL:       # BB#0:
 ; AVX512BWVL-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX512BWVL-NEXT:    vpmovqb %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i8>, <32 x i8>* %L
   %bc = bitcast <32 x i8> %vec to <4 x i64>
diff --git a/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
index 923290411ae3c5372e673aa19673fb47e5f2fa0f..d053c63dcdb375c7cbcf13914ffd398b2669a59e 100644
--- a/test/CodeGen/X86/shuffle-vs-trunc-512.ll
+++ b/test/CodeGen/X86/shuffle-vs-trunc-512.ll
@@ -18,6 +18,7 @@ define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 ; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
 ; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v64i8_to_v32i8:
@@ -29,6 +30,7 @@ define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 ; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
 ; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512VL-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v64i8_to_v32i8:
@@ -40,6 +42,7 @@ define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 ; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
 ; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v32i8:
@@ -51,6 +54,7 @@ define void @shuffle_v64i8_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 ; AVX512BWVL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
 ; AVX512BWVL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512BWVL-NEXT:    vmovdqu %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <32 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30, i32 32, i32 34, i32 36, i32 38, i32 40, i32 42, i32 44, i32 46, i32 48, i32 50, i32 52, i32 54, i32 56, i32 58, i32 60, i32 62>
@@ -67,6 +71,7 @@ define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc_v32i16_to_v32i8:
@@ -77,18 +82,21 @@ define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 ; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_v32i16_to_v32i8:
 ; AVX512BW:       # BB#0:
 ; AVX512BW-NEXT:    vmovdqu16 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc_v32i16_to_v32i8:
 ; AVX512BWVL:       # BB#0:
 ; AVX512BWVL-NEXT:    vmovdqu16 (%rdi), %zmm0
 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %bc = bitcast <64 x i8> %vec to <32 x i16>
@@ -100,37 +108,40 @@ define void @trunc_v32i16_to_v32i8(<64 x i8>* %L, <32 x i8>* %S) nounwind {
 define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
 ; AVX512F-LABEL: shuffle_v32i16_to_v16i16:
 ; AVX512F:       # BB#0:
-; AVX512F-NEXT:    vmovdqa (%rdi), %ymm0
-; AVX512F-NEXT:    vmovdqa 32(%rdi), %ymm1
-; AVX512F-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29]
-; AVX512F-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX512F-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512F-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512F-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
 ; AVX512F-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512F-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i16_to_v16i16:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vpshuflw {{.*#+}} ymm0 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512VL-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512VL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,2,4,5,4,6]
 ; AVX512VL-NEXT:    vpshuflw {{.*#+}} ymm1 = mem[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
 ; AVX512VL-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
-; AVX512VL-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
-; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX512VL-NEXT:    vshufps {{.*#+}} ymm0 = ymm1[0,2],ymm0[0,2],ymm1[4,6],ymm0[4,6]
 ; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512VL-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i16_to_v16i16:
 ; AVX512BW:       # BB#0:
 ; AVX512BW-NEXT:    vmovdqu16 (%rdi), %zmm0
 ; AVX512BW-NEXT:    vextracti64x4 $1, %zmm0, %ymm1
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,4,5,6,7,0,1,4,5,8,9,12,13,16,17,20,21,20,21,22,23,16,17,20,21,24,25,28,29]
-; AVX512BW-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
-; AVX512BW-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7]
+; AVX512BW-NEXT:    vpshuflw {{.*#+}} ymm1 = ymm1[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512BW-NEXT:    vpshuflw {{.*#+}} ymm0 = ymm0[0,2,2,3,4,5,6,7,8,10,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
+; AVX512BW-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; AVX512BW-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512BW-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i16_to_v16i16:
@@ -141,6 +152,7 @@ define void @shuffle_v32i16_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
 ; AVX512BWVL-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2
 ; AVX512BWVL-NEXT:    vpermq {{.*#+}} ymm0 = ymm2[0,2,1,3]
 ; AVX512BWVL-NEXT:    vmovdqu %ymm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %L
   %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
@@ -153,6 +165,7 @@ define void @trunc_v16i32_to_v16i16(<32 x i16>* %L, <16 x i16>* %S) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vmovdqa32 (%rdi), %zmm0
 ; AVX512-NEXT:    vpmovdw %zmm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %L
   %bc = bitcast <32 x i16> %vec to <16 x i32>
@@ -169,6 +182,7 @@ define void @shuffle_v16i32_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
 ; AVX512-NEXT:    vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
 ; AVX512-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX512-NEXT:    vmovdqa %ymm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %L
   %strided.vec = shufflevector <16 x i32> %vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
@@ -181,6 +195,7 @@ define void @trunc_v8i64_to_v8i32(<16 x i32>* %L, <8 x i32>* %S) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512-NEXT:    vpmovqd %zmm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %vec = load <16 x i32>, <16 x i32>* %L
   %bc = bitcast <16 x i32> %vec to <8 x i64>
@@ -206,6 +221,7 @@ define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
 ; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v64i8_to_v16i8:
@@ -224,6 +240,7 @@ define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
 ; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v64i8_to_v16i8:
@@ -265,6 +282,7 @@ define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
 ; AVX512BW-NEXT:    vpextrb $12, %xmm0, %eax
 ; AVX512BW-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm0
 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v16i8:
@@ -306,6 +324,7 @@ define void @shuffle_v64i8_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
 ; AVX512BWVL-NEXT:    vpextrb $12, %xmm0, %eax
 ; AVX512BWVL-NEXT:    vpinsrb $15, %eax, %xmm1, %xmm0
 ; AVX512BWVL-NEXT:    vmovdqu %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
@@ -318,6 +337,7 @@ define void @trunc_v16i32_to_v16i8(<64 x i8>* %L, <16 x i8>* %S) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vmovdqa32 (%rdi), %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %bc = bitcast <64 x i8> %vec to <16 x i32>
@@ -345,6 +365,7 @@ define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
 ; AVX512F-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v32i16_to_v8i16:
@@ -365,6 +386,7 @@ define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
 ; AVX512VL-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3]
 ; AVX512VL-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v32i16_to_v8i16:
@@ -388,6 +410,7 @@ define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
 ; AVX512BW-NEXT:    vpextrw $4, %xmm0, %eax
 ; AVX512BW-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
 ; AVX512BW-NEXT:    vmovdqa %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v32i16_to_v8i16:
@@ -411,6 +434,7 @@ define void @shuffle_v32i16_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
 ; AVX512BWVL-NEXT:    vpextrw $4, %xmm0, %eax
 ; AVX512BWVL-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm0
 ; AVX512BWVL-NEXT:    vmovdqu %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %L
   %strided.vec = shufflevector <32 x i16> %vec, <32 x i16> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
@@ -423,6 +447,7 @@ define void @trunc_v8i64_to_v8i16(<32 x i16>* %L, <8 x i16>* %S) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512-NEXT:    vpmovqw %zmm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %vec = load <32 x i16>, <32 x i16>* %L
   %bc = bitcast <32 x i16> %vec to <8 x i64>
@@ -448,6 +473,7 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512F-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; AVX512F-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v64i8_to_v8i8:
@@ -466,6 +492,7 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512VL-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; AVX512VL-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX512VL-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: shuffle_v64i8_to_v8i8:
@@ -482,7 +509,7 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512BW-NEXT:    vpextrb $0, %xmm1, %ecx
 ; AVX512BW-NEXT:    vpextrb $8, %xmm0, %edx
 ; AVX512BW-NEXT:    vpextrb $0, %xmm0, %edi
-; AVX512BW-NEXT:    vpinsrb $0, %edi, %xmm0, %xmm0
+; AVX512BW-NEXT:    vmovd %edi, %xmm0
 ; AVX512BW-NEXT:    vpinsrb $1, %edx, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpinsrb $3, %eax, %xmm0, %xmm0
@@ -491,14 +518,15 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512BW-NEXT:    vpinsrb $6, %r9d, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpinsrb $7, %r8d, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rsi)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: shuffle_v64i8_to_v8i8:
 ; AVX512BWVL:       # BB#0:
 ; AVX512BWVL-NEXT:    vmovdqu8 (%rdi), %zmm0
-; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX512BWVL-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
 ; AVX512BWVL-NEXT:    vpextrb $8, %xmm0, %eax
+; AVX512BWVL-NEXT:    vpextrb $0, %xmm0, %ecx
+; AVX512BWVL-NEXT:    vmovd %ecx, %xmm1
 ; AVX512BWVL-NEXT:    vpinsrb $2, %eax, %xmm1, %xmm1
 ; AVX512BWVL-NEXT:    vextracti32x4 $1, %zmm0, %xmm2
 ; AVX512BWVL-NEXT:    vpextrb $0, %xmm2, %eax
@@ -516,6 +544,7 @@ define void @shuffle_v64i8_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512BWVL-NEXT:    vpextrb $8, %xmm0, %eax
 ; AVX512BWVL-NEXT:    vpinsrb $14, %eax, %xmm1, %xmm0
 ; AVX512BWVL-NEXT:    vpmovwb %xmm0, (%rsi)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %strided.vec = shufflevector <64 x i8> %vec, <64 x i8> undef, <8 x i32> <i32 0, i32 8, i32 16, i32 24, i32 32, i32 40, i32 48, i32 56>
@@ -528,6 +557,7 @@ define void @trunc_v8i64_to_v8i8(<64 x i8>* %L, <8 x i8>* %S) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vmovdqa64 (%rdi), %zmm0
 ; AVX512-NEXT:    vpmovqb %zmm0, (%rsi)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %vec = load <64 x i8>, <64 x i8>* %L
   %bc = bitcast <64 x i8> %vec to <8 x i64>
diff --git a/test/CodeGen/X86/split-extend-vector-inreg.ll b/test/CodeGen/X86/split-extend-vector-inreg.ll
new file mode 100644
index 0000000000000000000000000000000000000000..692cbdb00be647ecd166b3eddbeef144ecf77fd2
--- /dev/null
+++ b/test/CodeGen/X86/split-extend-vector-inreg.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64
+
+define <4 x i64> @autogen_SD88863() {
+; X32-LABEL: autogen_SD88863:
+; X32:       # BB#0: # %BB
+; X32-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X32-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X32-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; X32-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
+; X32-NEXT:    movb $1, %al
+; X32-NEXT:    .p2align 4, 0x90
+; X32-NEXT:  .LBB0_1: # %CF
+; X32-NEXT:    # =>This Inner Loop Header: Depth=1
+; X32-NEXT:    testb %al, %al
+; X32-NEXT:    jne .LBB0_1
+; X32-NEXT:  # BB#2: # %CF240
+; X32-NEXT:    retl
+;
+; X64-LABEL: autogen_SD88863:
+; X64:       # BB#0: # %BB
+; X64-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; X64-NEXT:    vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; X64-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; X64-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3]
+; X64-NEXT:    movb $1, %al
+; X64-NEXT:    .p2align 4, 0x90
+; X64-NEXT:  .LBB0_1: # %CF
+; X64-NEXT:    # =>This Inner Loop Header: Depth=1
+; X64-NEXT:    testb %al, %al
+; X64-NEXT:    jne .LBB0_1
+; X64-NEXT:  # BB#2: # %CF240
+; X64-NEXT:    retq
+BB:
+  %I26 = insertelement <4 x i64> undef, i64 undef, i32 2
+  br label %CF
+
+CF:
+  %E66 = extractelement <4 x i64> %I26, i32 1
+  %I68 = insertelement <4 x i64> zeroinitializer, i64 %E66, i32 2
+  %Cmp72 = icmp eq i32 0, 0
+  br i1 %Cmp72, label %CF, label %CF240
+
+CF240:
+  ret <4 x i64> %I68
+}
diff --git a/test/CodeGen/X86/split-store.ll b/test/CodeGen/X86/split-store.ll
index c2e67fb25273fb7f4bd110c0e8ee3b2ee25855e9..6e320efb2b26f385d89e8239d509152ee27423fe 100644
--- a/test/CodeGen/X86/split-store.ll
+++ b/test/CodeGen/X86/split-store.ll
@@ -1,8 +1,8 @@
 ; RUN: llc -mtriple=x86_64-unknown-unknown -force-split-store < %s | FileCheck %s
 
 ; CHECK-LABEL: int32_float_pair
-; CHECK: movl %edi, (%rsi)
-; CHECK: movss %xmm0, 4(%rsi)
+; CHECK-DAG: movl %edi, (%rsi)
+; CHECK-DAG: movss %xmm0, 4(%rsi)
 define void @int32_float_pair(i32 %tmp1, float %tmp2, i64* %ref.tmp) {
 entry:
   %t0 = bitcast float %tmp2 to i32
@@ -15,8 +15,8 @@ entry:
 }
 
 ; CHECK-LABEL: float_int32_pair
-; CHECK: movss %xmm0, (%rsi)
-; CHECK: movl %edi, 4(%rsi)
+; CHECK-DAG: movss %xmm0, (%rsi)
+; CHECK-DAG: movl %edi, 4(%rsi)
 define void @float_int32_pair(float %tmp1, i32 %tmp2, i64* %ref.tmp) {
 entry:
   %t0 = bitcast float %tmp1 to i32
@@ -29,9 +29,9 @@ entry:
 }
 
 ; CHECK-LABEL: int16_float_pair
-; CHECK: movzwl	%di, %eax
-; CHECK: movl %eax, (%rsi)
-; CHECK: movss %xmm0, 4(%rsi)
+; CHECK-DAG: movzwl	%di, %eax
+; CHECK-DAG: movl %eax, (%rsi)
+; CHECK-DAG: movss %xmm0, 4(%rsi)
 define void @int16_float_pair(i16 signext %tmp1, float %tmp2, i64* %ref.tmp) {
 entry:
   %t0 = bitcast float %tmp2 to i32
@@ -44,9 +44,9 @@ entry:
 }
 
 ; CHECK-LABEL: int8_float_pair
-; CHECK: movzbl	%dil, %eax
-; CHECK: movl %eax, (%rsi)
-; CHECK: movss %xmm0, 4(%rsi)
+; CHECK-DAG: movzbl	%dil, %eax
+; CHECK-DAG: movl %eax, (%rsi)
+; CHECK-DAG: movss %xmm0, 4(%rsi)
 define void @int8_float_pair(i8 signext %tmp1, float %tmp2, i64* %ref.tmp) {
 entry:
   %t0 = bitcast float %tmp2 to i32
@@ -146,10 +146,9 @@ entry:
 ; CHECK: movw	%di, (%rdx)
 ; CHECK: shrl	$16, %edi
 ; CHECK: movb	%dil, 2(%rdx)
-; CHECK: movl	%esi, %eax
-; CHECK: shrl	$16, %eax
-; CHECK: movb	%al, 6(%rdx)
-; CHECK: movw	%si, 4(%rdx)
+; CHECK: movw    %si, 4(%rdx)
+; CHECK: shrl    $16, %esi
+; CHECK: movb    %sil, 6(%rdx)
 define void @int24_int24_pair(i24 signext %tmp1, i24 signext %tmp2, i48* %ref.tmp) {
 entry:
   %t1 = zext i24 %tmp2 to i48
diff --git a/test/CodeGen/X86/sse-align-10.ll b/test/CodeGen/X86/sse-align-10.ll
index 81bf55354cd2b5e7e3110d1ad63ae7454e9b152f..1e688a56ad449d168b9e77c6581c3f5023cb3f28 100644
--- a/test/CodeGen/X86/sse-align-10.ll
+++ b/test/CodeGen/X86/sse-align-10.ll
@@ -1,6 +1,9 @@
-; RUN: llc < %s -march=x86-64 | grep movups | count 1
+; RUN: llc < %s -march=x86-64 | FileCheck %s
 
 define <2 x i64> @bar(<2 x i64>* %p) nounwind {
+; CHECK-LABEL: bar:
+; CHECK: movups
+; CHECK-NOT: movups
   %t = load <2 x i64>, <2 x i64>* %p, align 8
   ret <2 x i64> %t
 }
diff --git a/test/CodeGen/X86/sse-fsignum.ll b/test/CodeGen/X86/sse-fsignum.ll
index 32594a27698d23bcb1444566d6ffb975688a02b6..8b27941571e8ddf2517b985bff4eff66b9d6aaa7 100644
--- a/test/CodeGen/X86/sse-fsignum.ll
+++ b/test/CodeGen/X86/sse-fsignum.ll
@@ -102,6 +102,7 @@ define void @signum32b(<8 x float>*) {
 ; AVX512F-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX512F-NEXT:    vsubps %ymm0, %ymm2, %ymm0
 ; AVX512F-NEXT:    vmovaps %ymm0, (%rdi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 entry:
   %1 = load <8 x float>, <8 x float>* %0
@@ -161,6 +162,7 @@ define void @signum64b(<4 x double>*) {
 ; AVX512F-NEXT:    vcvtdq2pd %xmm0, %ymm0
 ; AVX512F-NEXT:    vsubpd %ymm0, %ymm2, %ymm0
 ; AVX512F-NEXT:    vmovapd %ymm0, (%rdi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 entry:
   %1 = load <4 x double>, <4 x double>* %0
@@ -178,43 +180,18 @@ entry:
 ;
 
 define void @signum32c(<8 x float>*) {
-; AVX1-LABEL: signum32c:
-; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vmovaps (%rdi), %ymm0
-; AVX1-NEXT:    vxorps %ymm1, %ymm1, %ymm1
-; AVX1-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
-; AVX1-NEXT:    vcvtdq2ps %ymm2, %ymm2
-; AVX1-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX1-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; AVX1-NEXT:    vmovaps %ymm0, (%rdi)
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: signum32c:
-; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %ymm0
-; AVX2-NEXT:    vxorps %ymm1, %ymm1, %ymm1
-; AVX2-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
-; AVX2-NEXT:    vcvtdq2ps %ymm2, %ymm2
-; AVX2-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX2-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vmovaps %ymm0, (%rdi)
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
-;
-; AVX512F-LABEL: signum32c:
-; AVX512F:       # BB#0: # %entry
-; AVX512F-NEXT:    vmovaps (%rdi), %ymm0
-; AVX512F-NEXT:    vxorps %ymm1, %ymm1, %ymm1
-; AVX512F-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
-; AVX512F-NEXT:    vcvtdq2ps %ymm2, %ymm2
-; AVX512F-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vcvtdq2ps %ymm0, %ymm0
-; AVX512F-NEXT:    vsubps %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT:    vmovaps %ymm0, (%rdi)
-; AVX512F-NEXT:    retq
+; AVX-LABEL: signum32c:
+; AVX:       # BB#0: # %entry
+; AVX-NEXT:    vmovaps (%rdi), %ymm0
+; AVX-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; AVX-NEXT:    vcmpltps %ymm1, %ymm0, %ymm2
+; AVX-NEXT:    vcvtdq2ps %ymm2, %ymm2
+; AVX-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vcvtdq2ps %ymm0, %ymm0
+; AVX-NEXT:    vsubps %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vmovaps %ymm0, (%rdi)
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
 entry:
   %1 = load <8 x float>, <8 x float>* %0
   %2 = tail call <8 x float> @llvm.x86.avx.cmp.ps.256(<8 x float> %1, <8 x float> zeroinitializer, i8 1)
@@ -270,6 +247,7 @@ define void @signum64c(<4 x double>*) {
 ; AVX512F-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; AVX512F-NEXT:    vcvtdq2pd %xmm0, %ymm0
 ; AVX512F-NEXT:    vmovaps %ymm0, (%rdi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 entry:
   %x = load <4 x double>, <4 x double>* %0
diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index 18434546262ce143a1243c01e25bbe0b6d885ad6..0b03dffe99b55ad607e46b72e5d1ce1c21746985 100644
--- a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -1653,12 +1653,8 @@ define <4 x float> @test_mm_set1_ps(float %a0) nounwind {
 define void @test_mm_setcsr(i32 %a0) nounwind {
 ; X32-LABEL: test_mm_setcsr:
 ; X32:       # BB#0:
-; X32-NEXT:    pushl %eax
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    movl %esp, %ecx
-; X32-NEXT:    movl %eax, (%esp)
-; X32-NEXT:    ldmxcsr (%ecx)
-; X32-NEXT:    popl %eax
+; X32-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    ldmxcsr (%eax)
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_setcsr:
diff --git a/test/CodeGen/X86/sse-intrinsics-x86.ll b/test/CodeGen/X86/sse-intrinsics-x86.ll
index f1c4c74630548f988e8e9831a617e12d4faef2e0..679b1e8b057f5622fc25b4d775926e37c3822a0c 100644
--- a/test/CodeGen/X86/sse-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse-intrinsics-x86.ll
@@ -322,10 +322,15 @@ define <4 x float> @test_x86_sse_max_ss(<4 x float> %a0, <4 x float> %a1) {
 ; SSE-NEXT:    maxss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5f,0xc1]
 ; SSE-NEXT:    retl ## encoding: [0xc3]
 ;
-; VCHECK-LABEL: test_x86_sse_max_ss:
-; VCHECK:       ## BB#0:
-; VCHECK-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5f,0xc1]
-; VCHECK-NEXT:    retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse_max_ss:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5f,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse_max_ss:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vmaxss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5f,0xc1]
+; SKX-NEXT:    retl ## encoding: [0xc3]
   %res = call <4 x float> @llvm.x86.sse.max.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
@@ -359,10 +364,15 @@ define <4 x float> @test_x86_sse_min_ss(<4 x float> %a0, <4 x float> %a1) {
 ; SSE-NEXT:    minss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x5d,0xc1]
 ; SSE-NEXT:    retl ## encoding: [0xc3]
 ;
-; VCHECK-LABEL: test_x86_sse_min_ss:
-; VCHECK:       ## BB#0:
-; VCHECK-NEXT:    vminss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5d,0xc1]
-; VCHECK-NEXT:    retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse_min_ss:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vminss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x5d,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse_min_ss:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vminss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x5d,0xc1]
+; SKX-NEXT:    retl ## encoding: [0xc3]
   %res = call <4 x float> @llvm.x86.sse.min.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
   ret <4 x float> %res
 }
@@ -682,3 +692,19 @@ define i32 @test_x86_sse_ucomineq_ss(<4 x float> %a0, <4 x float> %a1) {
   ret i32 %res
 }
 declare i32 @llvm.x86.sse.ucomineq.ss(<4 x float>, <4 x float>) nounwind readnone
+
+
+define void @sfence() nounwind {
+; SSE-LABEL: sfence:
+; SSE:       ## BB#0:
+; SSE-NEXT:    sfence ## encoding: [0x0f,0xae,0xf8]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; VCHECK-LABEL: sfence:
+; VCHECK:       ## BB#0:
+; VCHECK-NEXT:    sfence ## encoding: [0x0f,0xae,0xf8]
+; VCHECK-NEXT:    retl ## encoding: [0xc3]
+  tail call void @llvm.x86.sse.sfence()
+  ret void
+}
+declare void @llvm.x86.sse.sfence() nounwind
diff --git a/test/CodeGen/X86/sse-intrinsics-x86_64.ll b/test/CodeGen/X86/sse-intrinsics-x86_64.ll
new file mode 100644
index 0000000000000000000000000000000000000000..61d0cae9acf181cca165f55ee737a6c15616d3de
--- /dev/null
+++ b/test/CodeGen/X86/sse-intrinsics-x86_64.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-avx,+sse -show-mc-encoding | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=SKX
+
+define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_sse_cvtss2si64:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vcvtss2si %xmm0, %rax
+; CHECK-NEXT:    retq
+; SSE-LABEL: test_x86_sse_cvtss2si64:
+; SSE:       ## BB#0:
+; SSE-NEXT:    cvtss2si %xmm0, %rax ## encoding: [0xf3,0x48,0x0f,0x2d,0xc0]
+; SSE-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse_cvtss2si64:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vcvtss2si %xmm0, %rax ## encoding: [0xc4,0xe1,0xfa,0x2d,0xc0]
+; AVX2-NEXT:    retq ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse_cvtss2si64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vcvtss2si %xmm0, %rax ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfa,0x2d,0xc0]
+; SKX-NEXT:    retq ## encoding: [0xc3]
+  %res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0) ; <i64> [#uses=1]
+  ret i64 %res
+}
+declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
+
+
+define <4 x float> @test_x86_sse_cvtsi642ss(<4 x float> %a0, i64 %a1) {
+; CHECK-LABEL: test_x86_sse_cvtsi642ss:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+; SSE-LABEL: test_x86_sse_cvtsi642ss:
+; SSE:       ## BB#0:
+; SSE-NEXT:    cvtsi2ssq %rdi, %xmm0 ## encoding: [0xf3,0x48,0x0f,0x2a,0xc7]
+; SSE-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse_cvtsi642ss:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0 ## encoding: [0xc4,0xe1,0xfa,0x2a,0xc7]
+; AVX2-NEXT:    retq ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse_cvtsi642ss:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vcvtsi2ssq %rdi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfa,0x2a,0xc7]
+; SKX-NEXT:    retq ## encoding: [0xc3]
+  %res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
+
+
+define i64 @test_x86_sse_cvttss2si64(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_sse_cvttss2si64:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    vcvttss2si %xmm0, %rax
+; CHECK-NEXT:    retq
+; SSE-LABEL: test_x86_sse_cvttss2si64:
+; SSE:       ## BB#0:
+; SSE-NEXT:    cvttss2si %xmm0, %rax ## encoding: [0xf3,0x48,0x0f,0x2c,0xc0]
+; SSE-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse_cvttss2si64:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vcvttss2si %xmm0, %rax ## encoding: [0xc4,0xe1,0xfa,0x2c,0xc0]
+; AVX2-NEXT:    retq ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse_cvttss2si64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vcvttss2si %xmm0, %rax ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfa,0x2c,0xc0]
+; SKX-NEXT:    retq ## encoding: [0xc3]
+  %res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0) ; <i64> [#uses=1]
+  ret i64 %res
+}
+declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
diff --git a/test/CodeGen/X86/sse-minmax.ll b/test/CodeGen/X86/sse-minmax.ll
index 796d9afabb1d7328ed85f264da63a63d949c4c73..2944001ed7e9c78641f2292e2637a20bcc4767ae 100644
--- a/test/CodeGen/X86/sse-minmax.ll
+++ b/test/CodeGen/X86/sse-minmax.ll
@@ -80,11 +80,11 @@ define double @olt_inverse(double %x, double %y)  {
 define double @oge(double %x, double %y)  {
 ; STRICT-LABEL: oge:
 ; STRICT:       # BB#0:
-; STRICT-NEXT:    movaps %xmm1, %xmm2
+; STRICT-NEXT:    movapd %xmm1, %xmm2
 ; STRICT-NEXT:    cmplesd %xmm0, %xmm2
-; STRICT-NEXT:    andps %xmm2, %xmm0
-; STRICT-NEXT:    andnps %xmm1, %xmm2
-; STRICT-NEXT:    orps %xmm2, %xmm0
+; STRICT-NEXT:    andpd %xmm2, %xmm0
+; STRICT-NEXT:    andnpd %xmm1, %xmm2
+; STRICT-NEXT:    orpd %xmm2, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; RELAX-LABEL: oge:
@@ -99,12 +99,12 @@ define double @oge(double %x, double %y)  {
 define double @ole(double %x, double %y)  {
 ; STRICT-LABEL: ole:
 ; STRICT:       # BB#0:
-; STRICT-NEXT:    movaps %xmm0, %xmm2
+; STRICT-NEXT:    movapd %xmm0, %xmm2
 ; STRICT-NEXT:    cmplesd %xmm1, %xmm2
-; STRICT-NEXT:    andps %xmm2, %xmm0
-; STRICT-NEXT:    andnps %xmm1, %xmm2
-; STRICT-NEXT:    orps %xmm0, %xmm2
-; STRICT-NEXT:    movaps %xmm2, %xmm0
+; STRICT-NEXT:    andpd %xmm2, %xmm0
+; STRICT-NEXT:    andnpd %xmm1, %xmm2
+; STRICT-NEXT:    orpd %xmm0, %xmm2
+; STRICT-NEXT:    movapd %xmm2, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; RELAX-LABEL: ole:
@@ -119,12 +119,12 @@ define double @ole(double %x, double %y)  {
 define double @oge_inverse(double %x, double %y)  {
 ; STRICT-LABEL: oge_inverse:
 ; STRICT:       # BB#0:
-; STRICT-NEXT:    movaps %xmm1, %xmm2
+; STRICT-NEXT:    movapd %xmm1, %xmm2
 ; STRICT-NEXT:    cmplesd %xmm0, %xmm2
-; STRICT-NEXT:    andps %xmm2, %xmm1
-; STRICT-NEXT:    andnps %xmm0, %xmm2
-; STRICT-NEXT:    orps %xmm1, %xmm2
-; STRICT-NEXT:    movaps %xmm2, %xmm0
+; STRICT-NEXT:    andpd %xmm2, %xmm1
+; STRICT-NEXT:    andnpd %xmm0, %xmm2
+; STRICT-NEXT:    orpd %xmm1, %xmm2
+; STRICT-NEXT:    movapd %xmm2, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; UNSAFE-LABEL: oge_inverse:
@@ -145,12 +145,12 @@ define double @oge_inverse(double %x, double %y)  {
 define double @ole_inverse(double %x, double %y)  {
 ; STRICT-LABEL: ole_inverse:
 ; STRICT:       # BB#0:
-; STRICT-NEXT:    movaps %xmm0, %xmm2
+; STRICT-NEXT:    movapd %xmm0, %xmm2
 ; STRICT-NEXT:    cmplesd %xmm1, %xmm2
-; STRICT-NEXT:    andps %xmm2, %xmm1
-; STRICT-NEXT:    andnps %xmm0, %xmm2
-; STRICT-NEXT:    orps %xmm1, %xmm2
-; STRICT-NEXT:    movaps %xmm2, %xmm0
+; STRICT-NEXT:    andpd %xmm2, %xmm1
+; STRICT-NEXT:    andnpd %xmm0, %xmm2
+; STRICT-NEXT:    orpd %xmm1, %xmm2
+; STRICT-NEXT:    movapd %xmm2, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; UNSAFE-LABEL: ole_inverse:
@@ -243,9 +243,9 @@ define double @olt_inverse_x(double %x)  {
 define double @oge_x(double %x)  {
 ; STRICT-LABEL: oge_x:
 ; STRICT:       # BB#0:
-; STRICT-NEXT:    xorps %xmm1, %xmm1
+; STRICT-NEXT:    xorpd %xmm1, %xmm1
 ; STRICT-NEXT:    cmplesd %xmm0, %xmm1
-; STRICT-NEXT:    andps %xmm1, %xmm0
+; STRICT-NEXT:    andpd %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; RELAX-LABEL: oge_x:
@@ -261,11 +261,11 @@ define double @oge_x(double %x)  {
 define double @ole_x(double %x)  {
 ; STRICT-LABEL: ole_x:
 ; STRICT:       # BB#0:
-; STRICT-NEXT:    xorps %xmm2, %xmm2
-; STRICT-NEXT:    movaps %xmm0, %xmm1
+; STRICT-NEXT:    xorpd %xmm2, %xmm2
+; STRICT-NEXT:    movapd %xmm0, %xmm1
 ; STRICT-NEXT:    cmplesd %xmm2, %xmm1
-; STRICT-NEXT:    andps %xmm0, %xmm1
-; STRICT-NEXT:    movaps %xmm1, %xmm0
+; STRICT-NEXT:    andpd %xmm0, %xmm1
+; STRICT-NEXT:    movapd %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; RELAX-LABEL: ole_x:
@@ -281,10 +281,10 @@ define double @ole_x(double %x)  {
 define double @oge_inverse_x(double %x)  {
 ; STRICT-LABEL: oge_inverse_x:
 ; STRICT:       # BB#0:
-; STRICT-NEXT:    xorps %xmm1, %xmm1
+; STRICT-NEXT:    xorpd %xmm1, %xmm1
 ; STRICT-NEXT:    cmplesd %xmm0, %xmm1
-; STRICT-NEXT:    andnps %xmm0, %xmm1
-; STRICT-NEXT:    movaps %xmm1, %xmm0
+; STRICT-NEXT:    andnpd %xmm0, %xmm1
+; STRICT-NEXT:    movapd %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; UNSAFE-LABEL: oge_inverse_x:
@@ -307,11 +307,11 @@ define double @oge_inverse_x(double %x)  {
 define double @ole_inverse_x(double %x)  {
 ; STRICT-LABEL: ole_inverse_x:
 ; STRICT:       # BB#0:
-; STRICT-NEXT:    xorps %xmm2, %xmm2
-; STRICT-NEXT:    movaps %xmm0, %xmm1
+; STRICT-NEXT:    xorpd %xmm2, %xmm2
+; STRICT-NEXT:    movapd %xmm0, %xmm1
 ; STRICT-NEXT:    cmplesd %xmm2, %xmm1
-; STRICT-NEXT:    andnps %xmm0, %xmm1
-; STRICT-NEXT:    movaps %xmm1, %xmm0
+; STRICT-NEXT:    andnpd %xmm0, %xmm1
+; STRICT-NEXT:    movapd %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; UNSAFE-LABEL: ole_inverse_x:
@@ -334,12 +334,12 @@ define double @ole_inverse_x(double %x)  {
 define double @ugt(double %x, double %y)  {
 ; STRICT-LABEL: ugt:
 ; STRICT:       # BB#0:
-; STRICT-NEXT:    movaps %xmm0, %xmm2
+; STRICT-NEXT:    movapd %xmm0, %xmm2
 ; STRICT-NEXT:    cmpnlesd %xmm1, %xmm2
-; STRICT-NEXT:    andps %xmm2, %xmm0
-; STRICT-NEXT:    andnps %xmm1, %xmm2
-; STRICT-NEXT:    orps %xmm0, %xmm2
-; STRICT-NEXT:    movaps %xmm2, %xmm0
+; STRICT-NEXT:    andpd %xmm2, %xmm0
+; STRICT-NEXT:    andnpd %xmm1, %xmm2
+; STRICT-NEXT:    orpd %xmm0, %xmm2
+; STRICT-NEXT:    movapd %xmm2, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; RELAX-LABEL: ugt:
@@ -354,11 +354,11 @@ define double @ugt(double %x, double %y)  {
 define double @ult(double %x, double %y)  {
 ; STRICT-LABEL: ult:
 ; STRICT:       # BB#0:
-; STRICT-NEXT:    movaps %xmm1, %xmm2
+; STRICT-NEXT:    movapd %xmm1, %xmm2
 ; STRICT-NEXT:    cmpnlesd %xmm0, %xmm2
-; STRICT-NEXT:    andps %xmm2, %xmm0
-; STRICT-NEXT:    andnps %xmm1, %xmm2
-; STRICT-NEXT:    orps %xmm2, %xmm0
+; STRICT-NEXT:    andpd %xmm2, %xmm0
+; STRICT-NEXT:    andnpd %xmm1, %xmm2
+; STRICT-NEXT:    orpd %xmm2, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; RELAX-LABEL: ult:
@@ -373,12 +373,12 @@ define double @ult(double %x, double %y)  {
 define double @ugt_inverse(double %x, double %y)  {
 ; STRICT-LABEL: ugt_inverse:
 ; STRICT:       # BB#0:
-; STRICT-NEXT:    movaps %xmm0, %xmm2
+; STRICT-NEXT:    movapd %xmm0, %xmm2
 ; STRICT-NEXT:    cmpnlesd %xmm1, %xmm2
-; STRICT-NEXT:    andps %xmm2, %xmm1
-; STRICT-NEXT:    andnps %xmm0, %xmm2
-; STRICT-NEXT:    orps %xmm1, %xmm2
-; STRICT-NEXT:    movaps %xmm2, %xmm0
+; STRICT-NEXT:    andpd %xmm2, %xmm1
+; STRICT-NEXT:    andnpd %xmm0, %xmm2
+; STRICT-NEXT:    orpd %xmm1, %xmm2
+; STRICT-NEXT:    movapd %xmm2, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; UNSAFE-LABEL: ugt_inverse:
@@ -399,12 +399,12 @@ define double @ugt_inverse(double %x, double %y)  {
 define double @ult_inverse(double %x, double %y)  {
 ; STRICT-LABEL: ult_inverse:
 ; STRICT:       # BB#0:
-; STRICT-NEXT:    movaps %xmm1, %xmm2
+; STRICT-NEXT:    movapd %xmm1, %xmm2
 ; STRICT-NEXT:    cmpnlesd %xmm0, %xmm2
-; STRICT-NEXT:    andps %xmm2, %xmm1
-; STRICT-NEXT:    andnps %xmm0, %xmm2
-; STRICT-NEXT:    orps %xmm1, %xmm2
-; STRICT-NEXT:    movaps %xmm2, %xmm0
+; STRICT-NEXT:    andpd %xmm2, %xmm1
+; STRICT-NEXT:    andnpd %xmm0, %xmm2
+; STRICT-NEXT:    orpd %xmm1, %xmm2
+; STRICT-NEXT:    movapd %xmm2, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; UNSAFE-LABEL: ult_inverse:
@@ -499,11 +499,11 @@ define double @ule_inverse(double %x, double %y)  {
 define double @ugt_x(double %x)  {
 ; STRICT-LABEL: ugt_x:
 ; STRICT:       # BB#0:
-; STRICT-NEXT:    xorps %xmm2, %xmm2
-; STRICT-NEXT:    movaps %xmm0, %xmm1
+; STRICT-NEXT:    xorpd %xmm2, %xmm2
+; STRICT-NEXT:    movapd %xmm0, %xmm1
 ; STRICT-NEXT:    cmpnlesd %xmm2, %xmm1
-; STRICT-NEXT:    andps %xmm0, %xmm1
-; STRICT-NEXT:    movaps %xmm1, %xmm0
+; STRICT-NEXT:    andpd %xmm0, %xmm1
+; STRICT-NEXT:    movapd %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; RELAX-LABEL: ugt_x:
@@ -519,9 +519,9 @@ define double @ugt_x(double %x)  {
 define double @ult_x(double %x)  {
 ; STRICT-LABEL: ult_x:
 ; STRICT:       # BB#0:
-; STRICT-NEXT:    xorps %xmm1, %xmm1
+; STRICT-NEXT:    xorpd %xmm1, %xmm1
 ; STRICT-NEXT:    cmpnlesd %xmm0, %xmm1
-; STRICT-NEXT:    andps %xmm1, %xmm0
+; STRICT-NEXT:    andpd %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; RELAX-LABEL: ult_x:
@@ -537,11 +537,11 @@ define double @ult_x(double %x)  {
 define double @ugt_inverse_x(double %x)  {
 ; STRICT-LABEL: ugt_inverse_x:
 ; STRICT:       # BB#0:
-; STRICT-NEXT:    xorps %xmm2, %xmm2
-; STRICT-NEXT:    movaps %xmm0, %xmm1
+; STRICT-NEXT:    xorpd %xmm2, %xmm2
+; STRICT-NEXT:    movapd %xmm0, %xmm1
 ; STRICT-NEXT:    cmpnlesd %xmm2, %xmm1
-; STRICT-NEXT:    andnps %xmm0, %xmm1
-; STRICT-NEXT:    movaps %xmm1, %xmm0
+; STRICT-NEXT:    andnpd %xmm0, %xmm1
+; STRICT-NEXT:    movapd %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; UNSAFE-LABEL: ugt_inverse_x:
@@ -564,10 +564,10 @@ define double @ugt_inverse_x(double %x)  {
 define double @ult_inverse_x(double %x)  {
 ; STRICT-LABEL: ult_inverse_x:
 ; STRICT:       # BB#0:
-; STRICT-NEXT:    xorps %xmm1, %xmm1
+; STRICT-NEXT:    xorpd %xmm1, %xmm1
 ; STRICT-NEXT:    cmpnlesd %xmm0, %xmm1
-; STRICT-NEXT:    andnps %xmm0, %xmm1
-; STRICT-NEXT:    movaps %xmm1, %xmm0
+; STRICT-NEXT:    andnpd %xmm0, %xmm1
+; STRICT-NEXT:    movapd %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; UNSAFE-LABEL: ult_inverse_x:
@@ -743,11 +743,11 @@ define double @oge_y(double %x)  {
 ; STRICT-LABEL: oge_y:
 ; STRICT:       # BB#0:
 ; STRICT-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; STRICT-NEXT:    movaps %xmm1, %xmm2
+; STRICT-NEXT:    movapd %xmm1, %xmm2
 ; STRICT-NEXT:    cmplesd %xmm0, %xmm2
-; STRICT-NEXT:    andps %xmm2, %xmm0
-; STRICT-NEXT:    andnps %xmm1, %xmm2
-; STRICT-NEXT:    orps %xmm2, %xmm0
+; STRICT-NEXT:    andpd %xmm2, %xmm0
+; STRICT-NEXT:    andnpd %xmm1, %xmm2
+; STRICT-NEXT:    orpd %xmm2, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; RELAX-LABEL: oge_y:
@@ -763,12 +763,12 @@ define double @ole_y(double %x)  {
 ; STRICT-LABEL: ole_y:
 ; STRICT:       # BB#0:
 ; STRICT-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; STRICT-NEXT:    movaps %xmm0, %xmm1
+; STRICT-NEXT:    movapd %xmm0, %xmm1
 ; STRICT-NEXT:    cmplesd %xmm2, %xmm1
-; STRICT-NEXT:    andps %xmm1, %xmm0
-; STRICT-NEXT:    andnps %xmm2, %xmm1
-; STRICT-NEXT:    orps %xmm0, %xmm1
-; STRICT-NEXT:    movaps %xmm1, %xmm0
+; STRICT-NEXT:    andpd %xmm1, %xmm0
+; STRICT-NEXT:    andnpd %xmm2, %xmm1
+; STRICT-NEXT:    orpd %xmm0, %xmm1
+; STRICT-NEXT:    movapd %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; RELAX-LABEL: ole_y:
@@ -784,12 +784,12 @@ define double @oge_inverse_y(double %x)  {
 ; STRICT-LABEL: oge_inverse_y:
 ; STRICT:       # BB#0:
 ; STRICT-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; STRICT-NEXT:    movaps %xmm2, %xmm1
+; STRICT-NEXT:    movapd %xmm2, %xmm1
 ; STRICT-NEXT:    cmplesd %xmm0, %xmm1
-; STRICT-NEXT:    andps %xmm1, %xmm2
-; STRICT-NEXT:    andnps %xmm0, %xmm1
-; STRICT-NEXT:    orps %xmm2, %xmm1
-; STRICT-NEXT:    movaps %xmm1, %xmm0
+; STRICT-NEXT:    andpd %xmm1, %xmm2
+; STRICT-NEXT:    andnpd %xmm0, %xmm1
+; STRICT-NEXT:    orpd %xmm2, %xmm1
+; STRICT-NEXT:    movapd %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; UNSAFE-LABEL: oge_inverse_y:
@@ -812,12 +812,12 @@ define double @ole_inverse_y(double %x)  {
 ; STRICT-LABEL: ole_inverse_y:
 ; STRICT:       # BB#0:
 ; STRICT-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; STRICT-NEXT:    movaps %xmm0, %xmm1
+; STRICT-NEXT:    movapd %xmm0, %xmm1
 ; STRICT-NEXT:    cmplesd %xmm2, %xmm1
-; STRICT-NEXT:    andps %xmm1, %xmm2
-; STRICT-NEXT:    andnps %xmm0, %xmm1
-; STRICT-NEXT:    orps %xmm2, %xmm1
-; STRICT-NEXT:    movaps %xmm1, %xmm0
+; STRICT-NEXT:    andpd %xmm1, %xmm2
+; STRICT-NEXT:    andnpd %xmm0, %xmm1
+; STRICT-NEXT:    orpd %xmm2, %xmm1
+; STRICT-NEXT:    movapd %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; UNSAFE-LABEL: ole_inverse_y:
@@ -840,12 +840,12 @@ define double @ugt_y(double %x)  {
 ; STRICT-LABEL: ugt_y:
 ; STRICT:       # BB#0:
 ; STRICT-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; STRICT-NEXT:    movaps %xmm0, %xmm1
+; STRICT-NEXT:    movapd %xmm0, %xmm1
 ; STRICT-NEXT:    cmpnlesd %xmm2, %xmm1
-; STRICT-NEXT:    andps %xmm1, %xmm0
-; STRICT-NEXT:    andnps %xmm2, %xmm1
-; STRICT-NEXT:    orps %xmm0, %xmm1
-; STRICT-NEXT:    movaps %xmm1, %xmm0
+; STRICT-NEXT:    andpd %xmm1, %xmm0
+; STRICT-NEXT:    andnpd %xmm2, %xmm1
+; STRICT-NEXT:    orpd %xmm0, %xmm1
+; STRICT-NEXT:    movapd %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; RELAX-LABEL: ugt_y:
@@ -861,11 +861,11 @@ define double @ult_y(double %x)  {
 ; STRICT-LABEL: ult_y:
 ; STRICT:       # BB#0:
 ; STRICT-NEXT:    movsd {{.*#+}} xmm1 = mem[0],zero
-; STRICT-NEXT:    movaps %xmm1, %xmm2
+; STRICT-NEXT:    movapd %xmm1, %xmm2
 ; STRICT-NEXT:    cmpnlesd %xmm0, %xmm2
-; STRICT-NEXT:    andps %xmm2, %xmm0
-; STRICT-NEXT:    andnps %xmm1, %xmm2
-; STRICT-NEXT:    orps %xmm2, %xmm0
+; STRICT-NEXT:    andpd %xmm2, %xmm0
+; STRICT-NEXT:    andnpd %xmm1, %xmm2
+; STRICT-NEXT:    orpd %xmm2, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; RELAX-LABEL: ult_y:
@@ -881,12 +881,12 @@ define double @ugt_inverse_y(double %x)  {
 ; STRICT-LABEL: ugt_inverse_y:
 ; STRICT:       # BB#0:
 ; STRICT-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; STRICT-NEXT:    movaps %xmm0, %xmm1
+; STRICT-NEXT:    movapd %xmm0, %xmm1
 ; STRICT-NEXT:    cmpnlesd %xmm2, %xmm1
-; STRICT-NEXT:    andps %xmm1, %xmm2
-; STRICT-NEXT:    andnps %xmm0, %xmm1
-; STRICT-NEXT:    orps %xmm2, %xmm1
-; STRICT-NEXT:    movaps %xmm1, %xmm0
+; STRICT-NEXT:    andpd %xmm1, %xmm2
+; STRICT-NEXT:    andnpd %xmm0, %xmm1
+; STRICT-NEXT:    orpd %xmm2, %xmm1
+; STRICT-NEXT:    movapd %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; UNSAFE-LABEL: ugt_inverse_y:
@@ -909,12 +909,12 @@ define double @ult_inverse_y(double %x)  {
 ; STRICT-LABEL: ult_inverse_y:
 ; STRICT:       # BB#0:
 ; STRICT-NEXT:    movsd {{.*#+}} xmm2 = mem[0],zero
-; STRICT-NEXT:    movaps %xmm2, %xmm1
+; STRICT-NEXT:    movapd %xmm2, %xmm1
 ; STRICT-NEXT:    cmpnlesd %xmm0, %xmm1
-; STRICT-NEXT:    andps %xmm1, %xmm2
-; STRICT-NEXT:    andnps %xmm0, %xmm1
-; STRICT-NEXT:    orps %xmm2, %xmm1
-; STRICT-NEXT:    movaps %xmm1, %xmm0
+; STRICT-NEXT:    andpd %xmm1, %xmm2
+; STRICT-NEXT:    andnpd %xmm0, %xmm1
+; STRICT-NEXT:    orpd %xmm2, %xmm1
+; STRICT-NEXT:    movapd %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
 ; UNSAFE-LABEL: ult_inverse_y:
@@ -1203,7 +1203,7 @@ define <2 x double> @test_maxpd(<2 x double> %x, <2 x double> %y)  {
 ; STRICT-NEXT:    movapd %xmm0, %xmm2
 ; STRICT-NEXT:    movapd %xmm1, %xmm0
 ; STRICT-NEXT:    cmplepd %xmm2, %xmm0
-; STRICT-NEXT:    blendvpd %xmm2, %xmm1
+; STRICT-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; STRICT-NEXT:    movapd %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
@@ -1221,7 +1221,7 @@ define <2 x double> @test_minpd(<2 x double> %x, <2 x double> %y)  {
 ; STRICT:       # BB#0:
 ; STRICT-NEXT:    movapd %xmm0, %xmm2
 ; STRICT-NEXT:    cmplepd %xmm1, %xmm0
-; STRICT-NEXT:    blendvpd %xmm2, %xmm1
+; STRICT-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; STRICT-NEXT:    movapd %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
@@ -1240,7 +1240,7 @@ define <4 x float> @test_maxps(<4 x float> %x, <4 x float> %y)  {
 ; STRICT-NEXT:    movaps %xmm0, %xmm2
 ; STRICT-NEXT:    movaps %xmm1, %xmm0
 ; STRICT-NEXT:    cmpleps %xmm2, %xmm0
-; STRICT-NEXT:    blendvps %xmm2, %xmm1
+; STRICT-NEXT:    blendvps %xmm0, %xmm2, %xmm1
 ; STRICT-NEXT:    movaps %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
@@ -1258,7 +1258,7 @@ define <4 x float> @test_minps(<4 x float> %x, <4 x float> %y)  {
 ; STRICT:       # BB#0:
 ; STRICT-NEXT:    movaps %xmm0, %xmm2
 ; STRICT-NEXT:    cmpleps %xmm1, %xmm0
-; STRICT-NEXT:    blendvps %xmm2, %xmm1
+; STRICT-NEXT:    blendvps %xmm0, %xmm2, %xmm1
 ; STRICT-NEXT:    movaps %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
@@ -1277,9 +1277,7 @@ define <2 x float> @test_maxps_illegal_v2f32(<2 x float> %x, <2 x float> %y)  {
 ; STRICT-NEXT:    movaps %xmm0, %xmm2
 ; STRICT-NEXT:    movaps %xmm1, %xmm0
 ; STRICT-NEXT:    cmpleps %xmm2, %xmm0
-; STRICT-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[1]
-; STRICT-NEXT:    pslld $31, %xmm0
-; STRICT-NEXT:    blendvps %xmm2, %xmm1
+; STRICT-NEXT:    blendvps %xmm0, %xmm2, %xmm1
 ; STRICT-NEXT:    movaps %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
@@ -1297,9 +1295,7 @@ define <2 x float> @test_minps_illegal_v2f32(<2 x float> %x, <2 x float> %y)  {
 ; STRICT:       # BB#0:
 ; STRICT-NEXT:    movaps %xmm0, %xmm2
 ; STRICT-NEXT:    cmpleps %xmm1, %xmm0
-; STRICT-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],zero,xmm0[1]
-; STRICT-NEXT:    pslld $31, %xmm0
-; STRICT-NEXT:    blendvps %xmm2, %xmm1
+; STRICT-NEXT:    blendvps %xmm0, %xmm2, %xmm1
 ; STRICT-NEXT:    movaps %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
@@ -1318,7 +1314,7 @@ define <3 x float> @test_maxps_illegal_v3f32(<3 x float> %x, <3 x float> %y)  {
 ; STRICT-NEXT:    movaps %xmm0, %xmm2
 ; STRICT-NEXT:    movaps %xmm1, %xmm0
 ; STRICT-NEXT:    cmpleps %xmm2, %xmm0
-; STRICT-NEXT:    blendvps %xmm2, %xmm1
+; STRICT-NEXT:    blendvps %xmm0, %xmm2, %xmm1
 ; STRICT-NEXT:    movaps %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
@@ -1336,7 +1332,7 @@ define <3 x float> @test_minps_illegal_v3f32(<3 x float> %x, <3 x float> %y)  {
 ; STRICT:       # BB#0:
 ; STRICT-NEXT:    movaps %xmm0, %xmm2
 ; STRICT-NEXT:    cmpleps %xmm1, %xmm0
-; STRICT-NEXT:    blendvps %xmm2, %xmm1
+; STRICT-NEXT:    blendvps %xmm0, %xmm2, %xmm1
 ; STRICT-NEXT:    movaps %xmm1, %xmm0
 ; STRICT-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/sse-regcall.ll b/test/CodeGen/X86/sse-regcall.ll
index b44e544d83c10db6664a43637a7b99dd9b17826a..862b9cc92f6c43d8fcc518627ca6da134f84bbe0 100644
--- a/test/CodeGen/X86/sse-regcall.ll
+++ b/test/CodeGen/X86/sse-regcall.ll
@@ -37,48 +37,42 @@ define x86_regcallcc i1 @test_CallargReti1(i1 %a)  {
 }
 
 ; WIN64-LABEL: testf32_inp
-; WIN64: movaps {{%xmm(1[2-5])}}, {{.*(%rsp).*}}  {{#+}} 16-byte Spill
-; WIN64: movaps {{%xmm(1[2-5])}}, {{.*(%rsp).*}}  {{#+}} 16-byte Spill
-; WIN64: movaps {{%xmm(1[2-5])}}, {{.*(%rsp).*}}  {{#+}} 16-byte Spill
-; WIN64: movaps {{%xmm(1[2-5])}}, {{.*(%rsp).*}}  {{#+}} 16-byte Spill
+; WIN64: movaps {{%xmm(1[2-5])}}, {{.*(%r(b|s)p).*}}  {{#+}} 16-byte Spill
+; WIN64: movaps {{%xmm(1[2-5])}}, {{.*(%r(b|s)p).*}}  {{#+}} 16-byte Spill
+; WIN64: movaps {{%xmm(1[2-5])}}, {{.*(%r(b|s)p).*}}  {{#+}} 16-byte Spill
+; WIN64: movaps {{%xmm(1[2-5])}}, {{.*(%r(b|s)p).*}}  {{#+}} 16-byte Spill
 ; WIN64: {{.*}} {{%xmm([0-9]|1[0-1])}}, {{%xmm(1[2-5])}}
 ; WIN64: {{.*}} {{%xmm([0-9]|1[0-1])}}, {{%xmm(1[2-5])}}
 ; WIN64: {{.*}} {{%xmm([0-9]|1[0-1])}}, {{%xmm(1[2-5])}}
 ; WIN64: {{.*}} {{%xmm([0-9]|1[0-1])}}, {{%xmm(1[2-5])}}
-; WIN64: movaps {{.*(%rsp).*}}, {{%xmm(1[2-5])}}  {{#+}} 16-byte Reload
-; WIN64: movaps {{.*(%rsp).*}}, {{%xmm(1[2-5])}}  {{#+}} 16-byte Reload
-; WIN64: movaps {{.*(%rsp).*}}, {{%xmm(1[2-5])}}  {{#+}} 16-byte Reload
-; WIN64: movaps {{.*(%rsp).*}}, {{%xmm(1[2-5])}}  {{#+}} 16-byte Reload
+; WIN64: movaps {{.*(%r(b|s)p).*}}, {{%xmm(1[2-5])}}  {{#+}} 16-byte Reload
+; WIN64: movaps {{.*(%r(b|s)p).*}}, {{%xmm(1[2-5])}}  {{#+}} 16-byte Reload
+; WIN64: movaps {{.*(%r(b|s)p).*}}, {{%xmm(1[2-5])}}  {{#+}} 16-byte Reload
+; WIN64: movaps {{.*(%r(b|s)p).*}}, {{%xmm(1[2-5])}}  {{#+}} 16-byte Reload
 ; WIN64: retq
 
 ; WIN32-LABEL: testf32_inp
-; WIN32: movaps {{%xmm([4-7])}}, {{.*(%ebp).*}}  {{#+}} 16-byte Spill
-; WIN32: movaps {{%xmm([4-7])}}, {{.*(%ebp).*}}  {{#+}} 16-byte Spill
-; WIN32: movaps {{%xmm([4-7])}}, {{.*(%ebp).*}}  {{#+}} 16-byte Spill
-; WIN32: movaps {{%xmm([4-7])}}, {{.*(%ebp).*}}  {{#+}} 16-byte Spill
+; WIN32: movaps {{%xmm([0-7])}}, {{.*(%e(b|s)p).*}}  {{#+}} 16-byte Spill
 ; WIN32: {{.*}} {{%xmm[0-7]}}, {{%xmm[4-7]}}
 ; WIN32: {{.*}} {{%xmm[0-7]}}, {{%xmm[4-7]}}
 ; WIN32: {{.*}} {{%xmm[0-7]}}, {{%xmm[4-7]}}
 ; WIN32: {{.*}} {{%xmm[0-7]}}, {{%xmm[4-7]}}
-; WIN32: movaps {{.*(%ebp).*}}, {{%xmm([4-7])}}  {{#+}} 16-byte Reload
-; WIN32: movaps {{.*(%ebp).*}}, {{%xmm([4-7])}}  {{#+}} 16-byte Reload
-; WIN32: movaps {{.*(%ebp).*}}, {{%xmm([4-7])}}  {{#+}} 16-byte Reload
-; WIN32: movaps {{.*(%ebp).*}}, {{%xmm([4-7])}}  {{#+}} 16-byte Reload
+; WIN32: movaps {{.*(%e(b|s)p).*}}, {{%xmm([0-7])}}  {{#+}} 16-byte Reload
 ; WIN32: retl
 
 ; LINUXOSX-LABEL: testf32_inp
-; LINUXOSX: movaps {{%xmm(1[2-5])}}, {{.*(%rsp).*}}  {{#+}} 16-byte Spill
-; LINUXOSX: movaps {{%xmm(1[2-5])}}, {{.*(%rsp).*}}  {{#+}} 16-byte Spill
-; LINUXOSX: movaps {{%xmm(1[2-5])}}, {{.*(%rsp).*}}  {{#+}} 16-byte Spill
-; LINUXOSX: movaps {{%xmm(1[2-5])}}, {{.*(%rsp).*}}  {{#+}} 16-byte Spill
+; LINUXOSX: movaps {{%xmm(1[2-5])}}, {{.*(%r(b|s)p).*}}  {{#+}} 16-byte Spill
+; LINUXOSX: movaps {{%xmm(1[2-5])}}, {{.*(%r(b|s)p).*}}  {{#+}} 16-byte Spill
+; LINUXOSX: movaps {{%xmm(1[2-5])}}, {{.*(%r(b|s)p).*}}  {{#+}} 16-byte Spill
+; LINUXOSX: movaps {{%xmm(1[2-5])}}, {{.*(%r(b|s)p).*}}  {{#+}} 16-byte Spill
 ; LINUXOSX: {{.*}} {{%xmm([0-9]|1[0-1])}}, {{%xmm(1[2-5])}}
 ; LINUXOSX: {{.*}} {{%xmm([0-9]|1[0-1])}}, {{%xmm(1[2-5])}}
 ; LINUXOSX: {{.*}} {{%xmm([0-9]|1[0-1])}}, {{%xmm(1[2-5])}}
 ; LINUXOSX: {{.*}} {{%xmm([0-9]|1[0-1])}}, {{%xmm(1[2-5])}}
-; LINUXOSX: movaps {{.*(%rsp).*}}, {{%xmm(1[2-5])}}  {{#+}} 16-byte Reload
-; LINUXOSX: movaps {{.*(%rsp).*}}, {{%xmm(1[2-5])}}  {{#+}} 16-byte Reload
-; LINUXOSX: movaps {{.*(%rsp).*}}, {{%xmm(1[2-5])}}  {{#+}} 16-byte Reload
-; LINUXOSX: movaps {{.*(%rsp).*}}, {{%xmm(1[2-5])}}  {{#+}} 16-byte Reload
+; LINUXOSX: movaps {{.*(%r(b|s)p).*}}, {{%xmm(1[2-5])}}  {{#+}} 16-byte Reload
+; LINUXOSX: movaps {{.*(%r(b|s)p).*}}, {{%xmm(1[2-5])}}  {{#+}} 16-byte Reload
+; LINUXOSX: movaps {{.*(%r(b|s)p).*}}, {{%xmm(1[2-5])}}  {{#+}} 16-byte Reload
+; LINUXOSX: movaps {{.*(%r(b|s)p).*}}, {{%xmm(1[2-5])}}  {{#+}} 16-byte Reload
 ; LINUXOSX: retq
 
 ;test calling conventions - input parameters, callee saved XMMs
@@ -93,10 +87,6 @@ define x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b,
 ; WIN32-LABEL: testi32_inp
 ; WIN32: pushl {{%e(si|di|bx|bp)}}
 ; WIN32: pushl {{%e(si|di|bx|bp)}}
-; WIN32: pushl {{%e(si|di|bx|bp)}}
-; WIN32: pushl {{%e(si|di|bx|bp)}}
-; WIN32: popl {{%e(si|di|bx|bp)}}
-; WIN32: popl {{%e(si|di|bx|bp)}}
 ; WIN32: popl {{%e(si|di|bx|bp)}}
 ; WIN32: popl {{%e(si|di|bx|bp)}}
 ; WIN32: retl
@@ -105,10 +95,6 @@ define x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b,
 ; WIN64: pushq	{{%r(bp|bx|1[0-5])}}
 ; WIN64: pushq	{{%r(bp|bx|1[0-5])}}
 ; WIN64: pushq	{{%r(bp|bx|1[0-5])}}
-; WIN64: pushq	{{%r(bp|bx|1[0-5])}}
-; WIN64: pushq	{{%r(bp|bx|1[0-5])}}
-; WIN64: popq	{{%r(bp|bx|1[0-5])}}
-; WIN64: popq	{{%r(bp|bx|1[0-5])}}
 ; WIN64: popq	{{%r(bp|bx|1[0-5])}}
 ; WIN64: popq	{{%r(bp|bx|1[0-5])}}
 ; WIN64: popq	{{%r(bp|bx|1[0-5])}}
@@ -117,10 +103,6 @@ define x86_regcallcc <16 x float> @testf32_inp(<16 x float> %a, <16 x float> %b,
 ; LINUXOSX-LABEL: testi32_inp
 ; LINUXOSX: pushq	{{%r(bp|bx|1[2-5])}}
 ; LINUXOSX: pushq	{{%r(bp|bx|1[2-5])}}
-; LINUXOSX: pushq	{{%r(bp|bx|1[2-5])}}
-; LINUXOSX: pushq	{{%r(bp|bx|1[2-5])}}
-; LINUXOSX: popq	{{%r(bp|bx|1[2-5])}}
-; LINUXOSX: popq	{{%r(bp|bx|1[2-5])}}
 ; LINUXOSX: popq	{{%r(bp|bx|1[2-5])}}
 ; LINUXOSX: popq	{{%r(bp|bx|1[2-5])}}
 ; LINUXOSX: retq
diff --git a/test/CodeGen/X86/sse1.ll b/test/CodeGen/X86/sse1.ll
index 9488d6d26056c7d76cc10c37fc073ffd5461fe7e..dfc1aefd31a611951d64fdbd87d191a3c1576713 100644
--- a/test/CodeGen/X86/sse1.ll
+++ b/test/CodeGen/X86/sse1.ll
@@ -60,7 +60,13 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
 ; X32-NEXT:    xorps %xmm1, %xmm1
 ; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    jne .LBB1_5
-; X32-NEXT:    jmp .LBB1_4
+; X32-NEXT:  .LBB1_4:
+; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
+; X32-NEXT:    jne .LBB1_8
+; X32-NEXT:  .LBB1_7:
+; X32-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X32-NEXT:    jmp .LBB1_9
 ; X32-NEXT:  .LBB1_1:
 ; X32-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
@@ -68,17 +74,9 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
 ; X32-NEXT:  .LBB1_5: # %entry
 ; X32-NEXT:    xorps %xmm2, %xmm2
 ; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
-; X32-NEXT:    jne .LBB1_8
-; X32-NEXT:    jmp .LBB1_7
-; X32-NEXT:  .LBB1_4:
-; X32-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    je .LBB1_7
 ; X32-NEXT:  .LBB1_8: # %entry
 ; X32-NEXT:    xorps %xmm3, %xmm3
-; X32-NEXT:    jmp .LBB1_9
-; X32-NEXT:  .LBB1_7:
-; X32-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
 ; X32-NEXT:  .LBB1_9: # %entry
 ; X32-NEXT:    cmpl $0, {{[0-9]+}}(%esp)
 ; X32-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
@@ -99,7 +97,13 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
 ; X64-NEXT:    xorps %xmm1, %xmm1
 ; X64-NEXT:    testl %edx, %edx
 ; X64-NEXT:    jne .LBB1_5
-; X64-NEXT:    jmp .LBB1_4
+; X64-NEXT:  .LBB1_4:
+; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-NEXT:    testl %r8d, %r8d
+; X64-NEXT:    jne .LBB1_8
+; X64-NEXT:  .LBB1_7:
+; X64-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; X64-NEXT:    jmp .LBB1_9
 ; X64-NEXT:  .LBB1_1:
 ; X64-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X64-NEXT:    testl %edx, %edx
@@ -107,17 +111,9 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
 ; X64-NEXT:  .LBB1_5: # %entry
 ; X64-NEXT:    xorps %xmm2, %xmm2
 ; X64-NEXT:    testl %r8d, %r8d
-; X64-NEXT:    jne .LBB1_8
-; X64-NEXT:    jmp .LBB1_7
-; X64-NEXT:  .LBB1_4:
-; X64-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; X64-NEXT:    testl %r8d, %r8d
 ; X64-NEXT:    je .LBB1_7
 ; X64-NEXT:  .LBB1_8: # %entry
 ; X64-NEXT:    xorps %xmm3, %xmm3
-; X64-NEXT:    jmp .LBB1_9
-; X64-NEXT:  .LBB1_7:
-; X64-NEXT:    movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
 ; X64-NEXT:  .LBB1_9: # %entry
 ; X64-NEXT:    testl %esi, %esi
 ; X64-NEXT:    unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
@@ -215,7 +211,7 @@ define <4 x i32> @PR30512(<4 x i32> %x, <4 x i32> %y) nounwind {
   ret <4 x i32> %zext
 }
 
-; Fragile test warning - we need to induce the generation of a vselect 
+; Fragile test warning - we need to induce the generation of a vselect
 ; post-legalization to cause the crash seen in:
 ; https://llvm.org/bugs/show_bug.cgi?id=31672
 ; Is there a way to do that without an unsafe/fast sqrt intrinsic call?
diff --git a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
index 972a33f13cd005d9f32eac469c5fca0351a4e35e..3071155172e35359c73046408ad3aef126942abc 100644
--- a/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -2934,13 +2934,13 @@ define <2 x double> @test_mm_sqrt_sd(<2 x double> %a0, <2 x double> %a1) nounwin
 ; X32-LABEL: test_mm_sqrt_sd:
 ; X32:       # BB#0:
 ; X32-NEXT:    sqrtsd %xmm0, %xmm1
-; X32-NEXT:    movaps %xmm1, %xmm0
+; X32-NEXT:    movapd %xmm1, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_sqrt_sd:
 ; X64:       # BB#0:
 ; X64-NEXT:    sqrtsd %xmm0, %xmm1
-; X64-NEXT:    movaps %xmm1, %xmm0
+; X64-NEXT:    movapd %xmm1, %xmm0
 ; X64-NEXT:    retq
   %call = call <2 x double> @llvm.x86.sse2.sqrt.sd(<2 x double> %a0)
   %ext0 = extractelement <2 x double> %call, i32 0
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index 2fc0d94a1d8daf0c503315a3a586c1c78a54be18..13911eeea6c48f1195bec5f5028a417913538454 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -98,8 +98,8 @@ define void @test_x86_sse2_storeu_pd(i8* %a0, <2 x double> %a1) {
 ; CHECK-LABEL: test_x86_sse2_storeu_pd:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
-; CHECK-NEXT:    pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7]
+; CHECK-NEXT:    xorpd %xmm1, %xmm1
+; CHECK-NEXT:    movhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
 ; CHECK-NEXT:    addpd %xmm0, %xmm1
 ; CHECK-NEXT:    movupd %xmm1, (%eax)
 ; CHECK-NEXT:    retl
@@ -147,7 +147,6 @@ define <16 x i8> @max_epu8(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    pmaxub %xmm1, %xmm0
 ; CHECK-NEXT:    retl
-;
   %res = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1)
   ret <16 x i8> %res
 }
@@ -158,7 +157,6 @@ define <16 x i8> @min_epu8(<16 x i8> %a0, <16 x i8> %a1) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    pminub %xmm1, %xmm0
 ; CHECK-NEXT:    retl
-;
   %res = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1)
   ret <16 x i8> %res
 }
@@ -169,7 +167,6 @@ define <8 x i16> @max_epi16(<8 x i16> %a0, <8 x i16> %a1) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    pmaxsw %xmm1, %xmm0
 ; CHECK-NEXT:    retl
-;
   %res = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1)
   ret <8 x i16> %res
 }
@@ -180,7 +177,6 @@ define <8 x i16> @min_epi16(<8 x i16> %a0, <8 x i16> %a1) {
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    pminsw %xmm1, %xmm0
 ; CHECK-NEXT:    retl
-;
   %res = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1)
   ret <8 x i16> %res
 }
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index a93ffc6655b74570a4d305860279270e383694a2..b0a8744f5d8042a094103ed4bb46206283d1c9ac 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=-avx,+sse2 -show-mc-encoding | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=AVX2
-; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=SKX
+; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=-avx,+sse2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=AVX2
+; RUN: llc < %s -disable-peephole -mtriple=i386-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=SKX
 
 define <2 x double> @test_x86_sse2_cmp_pd(<2 x double> %a0, <2 x double> %a1) {
 ; SSE-LABEL: test_x86_sse2_cmp_pd:
@@ -587,10 +587,15 @@ define <2 x double> @test_x86_sse2_max_sd(<2 x double> %a0, <2 x double> %a1) {
 ; SSE-NEXT:    maxsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x5f,0xc1]
 ; SSE-NEXT:    retl ## encoding: [0xc3]
 ;
-; VCHECK-LABEL: test_x86_sse2_max_sd:
-; VCHECK:       ## BB#0:
-; VCHECK-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5f,0xc1]
-; VCHECK-NEXT:    retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse2_max_sd:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5f,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_max_sd:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vmaxsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x5f,0xc1]
+; SKX-NEXT:    retl ## encoding: [0xc3]
   %res = call <2 x double> @llvm.x86.sse2.max.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
 }
@@ -624,10 +629,15 @@ define <2 x double> @test_x86_sse2_min_sd(<2 x double> %a0, <2 x double> %a1) {
 ; SSE-NEXT:    minsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x5d,0xc1]
 ; SSE-NEXT:    retl ## encoding: [0xc3]
 ;
-; VCHECK-LABEL: test_x86_sse2_min_sd:
-; VCHECK:       ## BB#0:
-; VCHECK-NEXT:    vminsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5d,0xc1]
-; VCHECK-NEXT:    retl ## encoding: [0xc3]
+; AVX2-LABEL: test_x86_sse2_min_sd:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vminsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x5d,0xc1]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_min_sd:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vminsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x5d,0xc1]
+; SKX-NEXT:    retl ## encoding: [0xc3]
   %res = call <2 x double> @llvm.x86.sse2.min.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
   ret <2 x double> %res
 }
@@ -1502,21 +1512,21 @@ define <2 x double> @test_x86_sse2_sqrt_sd_vec_load(<2 x double>* %a0) {
 ; SSE-LABEL: test_x86_sse2_sqrt_sd_vec_load:
 ; SSE:       ## BB#0:
 ; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; SSE-NEXT:    movaps (%eax), %xmm0 ## encoding: [0x0f,0x28,0x00]
+; SSE-NEXT:    movapd (%eax), %xmm0 ## encoding: [0x66,0x0f,0x28,0x00]
 ; SSE-NEXT:    sqrtsd %xmm0, %xmm0 ## encoding: [0xf2,0x0f,0x51,0xc0]
 ; SSE-NEXT:    retl ## encoding: [0xc3]
 ;
 ; AVX2-LABEL: test_x86_sse2_sqrt_sd_vec_load:
 ; AVX2:       ## BB#0:
 ; AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; AVX2-NEXT:    vmovaps (%eax), %xmm0 ## encoding: [0xc5,0xf8,0x28,0x00]
+; AVX2-NEXT:    vmovapd (%eax), %xmm0 ## encoding: [0xc5,0xf9,0x28,0x00]
 ; AVX2-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
 ; AVX2-NEXT:    retl ## encoding: [0xc3]
 ;
 ; SKX-LABEL: test_x86_sse2_sqrt_sd_vec_load:
 ; SKX:       ## BB#0:
 ; SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; SKX-NEXT:    vmovaps (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x00]
+; SKX-NEXT:    vmovapd (%eax), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x28,0x00]
 ; SKX-NEXT:    vsqrtsd %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x51,0xc0]
 ; SKX-NEXT:    retl ## encoding: [0xc3]
   %a1 = load <2 x double>, <2 x double>* %a0, align 16
@@ -1699,16 +1709,42 @@ define i32 @test_x86_sse2_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) {
 declare i32 @llvm.x86.sse2.ucomineq.sd(<2 x double>, <2 x double>) nounwind readnone
 
 define void @test_x86_sse2_pause() {
-; SSE-LABEL: test_x86_sse2_pause:
-; SSE:       ## BB#0:
-; SSE-NEXT:    pause ## encoding: [0xf3,0x90]
-; SSE-NEXT:    retl ## encoding: [0xc3]
-;
-; VCHECK-LABEL: test_x86_sse2_pause:
-; VCHECK:       ## BB#0:
-; VCHECK-NEXT:    pause ## encoding: [0xf3,0x90]
-; VCHECK-NEXT:    retl ## encoding: [0xc3]
+; CHECK-LABEL: test_x86_sse2_pause:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    pause ## encoding: [0xf3,0x90]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
   tail call void @llvm.x86.sse2.pause()
   ret void
 }
 declare void @llvm.x86.sse2.pause() nounwind
+
+define void @lfence() nounwind {
+; CHECK-LABEL: lfence:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    lfence ## encoding: [0x0f,0xae,0xe8]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  tail call void @llvm.x86.sse2.lfence()
+  ret void
+}
+declare void @llvm.x86.sse2.lfence() nounwind
+
+define void @mfence() nounwind {
+; CHECK-LABEL: mfence:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    mfence ## encoding: [0x0f,0xae,0xf0]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  tail call void @llvm.x86.sse2.mfence()
+  ret void
+}
+declare void @llvm.x86.sse2.mfence() nounwind
+
+define void @clflush(i8* %p) nounwind {
+; CHECK-LABEL: clflush:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT:    clflush (%eax) ## encoding: [0x0f,0xae,0x38]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  tail call void @llvm.x86.sse2.clflush(i8* %p)
+  ret void
+}
+declare void @llvm.x86.sse2.clflush(i8*) nounwind
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86_64.ll b/test/CodeGen/X86/sse2-intrinsics-x86_64.ll
new file mode 100644
index 0000000000000000000000000000000000000000..cd5e11e12795fdb8638fafcc727792e7013570a5
--- /dev/null
+++ b/test/CodeGen/X86/sse2-intrinsics-x86_64.ll
@@ -0,0 +1,69 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=-avx,+sse2 -show-mc-encoding | FileCheck %s --check-prefix=SSE
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=AVX2
+; RUN: llc < %s -disable-peephole -mtriple=x86_64-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=SKX
+
+; Tests for the SSE2 scalar double <-> i64 conversion intrinsics. They use
+; 64-bit GPR operands (%rax/%rdi), hence the x86_64 triple in the RUN lines.
+
+define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) {
+; SSE-LABEL: test_x86_sse2_cvtsd2si64:
+; SSE:       ## BB#0:
+; SSE-NEXT:    cvtsd2si %xmm0, %rax ## encoding: [0xf2,0x48,0x0f,0x2d,0xc0]
+; SSE-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_cvtsd2si64:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vcvtsd2si %xmm0, %rax ## encoding: [0xc4,0xe1,0xfb,0x2d,0xc0]
+; AVX2-NEXT:    retq ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_cvtsd2si64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vcvtsd2si %xmm0, %rax ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfb,0x2d,0xc0]
+; SKX-NEXT:    retq ## encoding: [0xc3]
+  %res = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
+  ret i64 %res
+}
+declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
+
+
+define <2 x double> @test_x86_sse2_cvtsi642sd(<2 x double> %a0, i64 %a1) {
+; SSE-LABEL: test_x86_sse2_cvtsi642sd:
+; SSE:       ## BB#0:
+; SSE-NEXT:    cvtsi2sdq %rdi, %xmm0 ## encoding: [0xf2,0x48,0x0f,0x2a,0xc7]
+; SSE-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_cvtsi642sd:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vcvtsi2sdq %rdi, %xmm0, %xmm0 ## encoding: [0xc4,0xe1,0xfb,0x2a,0xc7]
+; AVX2-NEXT:    retq ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_cvtsi642sd:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vcvtsi2sdq %rdi, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfb,0x2a,0xc7]
+; SKX-NEXT:    retq ## encoding: [0xc3]
+  %res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
+
+
+define i64 @test_x86_sse2_cvttsd2si64(<2 x double> %a0) {
+; SSE-LABEL: test_x86_sse2_cvttsd2si64:
+; SSE:       ## BB#0:
+; SSE-NEXT:    cvttsd2si %xmm0, %rax ## encoding: [0xf2,0x48,0x0f,0x2c,0xc0]
+; SSE-NEXT:    retq ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_sse2_cvttsd2si64:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vcvttsd2si %xmm0, %rax ## encoding: [0xc4,0xe1,0xfb,0x2c,0xc0]
+; AVX2-NEXT:    retq ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_sse2_cvttsd2si64:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vcvttsd2si %xmm0, %rax ## EVEX TO VEX Compression encoding: [0xc4,0xe1,0xfb,0x2c,0xc0]
+; SKX-NEXT:    retq ## encoding: [0xc3]
+  %res = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
+  ret i64 %res
+}
+declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
diff --git a/test/CodeGen/X86/sse3-intrinsics-x86.ll b/test/CodeGen/X86/sse3-intrinsics-x86.ll
index 362525f24d2ae07effb04356bb52502179174fe2..fd7f59a015796eb702e65f52983f332c04dc4fc8 100644
--- a/test/CodeGen/X86/sse3-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse3-intrinsics-x86.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse3 -show-mc-encoding | FileCheck %s --check-prefix=SSE
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=AVX2
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse3 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=AVX2
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=SKX
 
 define <2 x double> @test_x86_sse3_addsub_pd(<2 x double> %a0, <2 x double> %a1) {
 ; SSE-LABEL: test_x86_sse3_addsub_pd:
@@ -115,3 +115,31 @@ define <16 x i8> @test_x86_sse3_ldu_dq(i8* %a0) {
   ret <16 x i8> %res
 }
 declare <16 x i8> @llvm.x86.sse3.ldu.dq(i8*) nounwind readonly
+
+; Make sure instructions that have no AVX equivalents, but are associated with SSEX feature flags, still work.
+
+define void @monitor(i8* %P, i32 %E, i32 %H) nounwind {
+; CHECK-LABEL: monitor:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx ## encoding: [0x8b,0x54,0x24,0x0c]
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08]
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT:    leal (%eax), %eax ## encoding: [0x8d,0x00]
+; CHECK-NEXT:    monitor ## encoding: [0x0f,0x01,0xc8]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  tail call void @llvm.x86.sse3.monitor(i8* %P, i32 %E, i32 %H)
+  ret void
+}
+declare void @llvm.x86.sse3.monitor(i8*, i32, i32) nounwind
+
+define void @mwait(i32 %E, i32 %H) nounwind {
+; CHECK-LABEL: mwait:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
+; CHECK-NEXT:    mwait ## encoding: [0x0f,0x01,0xc9]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  tail call void @llvm.x86.sse3.mwait(i32 %E, i32 %H)
+  ret void
+}
+declare void @llvm.x86.sse3.mwait(i32, i32) nounwind
diff --git a/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
index 16868d854df705999c0750722efb47d8a98eece5..f106f7ec5cc1158cb0f83547fd6a931066967ab4 100644
--- a/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse41-intrinsics-fast-isel.ll
@@ -54,7 +54,7 @@ define <2 x i64> @test_mm_blendv_epi8(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a
 ; X32:       # BB#0:
 ; X32-NEXT:    movdqa %xmm0, %xmm3
 ; X32-NEXT:    movaps %xmm2, %xmm0
-; X32-NEXT:    pblendvb %xmm1, %xmm3
+; X32-NEXT:    pblendvb %xmm0, %xmm1, %xmm3
 ; X32-NEXT:    movdqa %xmm3, %xmm0
 ; X32-NEXT:    retl
 ;
@@ -62,7 +62,7 @@ define <2 x i64> @test_mm_blendv_epi8(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a
 ; X64:       # BB#0:
 ; X64-NEXT:    movdqa %xmm0, %xmm3
 ; X64-NEXT:    movaps %xmm2, %xmm0
-; X64-NEXT:    pblendvb %xmm1, %xmm3
+; X64-NEXT:    pblendvb %xmm0, %xmm1, %xmm3
 ; X64-NEXT:    movdqa %xmm3, %xmm0
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
@@ -79,7 +79,7 @@ define <2 x double> @test_mm_blendv_pd(<2 x double> %a0, <2 x double> %a1, <2 x
 ; X32:       # BB#0:
 ; X32-NEXT:    movapd %xmm0, %xmm3
 ; X32-NEXT:    movaps %xmm2, %xmm0
-; X32-NEXT:    blendvpd %xmm1, %xmm3
+; X32-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; X32-NEXT:    movapd %xmm3, %xmm0
 ; X32-NEXT:    retl
 ;
@@ -87,7 +87,7 @@ define <2 x double> @test_mm_blendv_pd(<2 x double> %a0, <2 x double> %a1, <2 x
 ; X64:       # BB#0:
 ; X64-NEXT:    movapd %xmm0, %xmm3
 ; X64-NEXT:    movaps %xmm2, %xmm0
-; X64-NEXT:    blendvpd %xmm1, %xmm3
+; X64-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; X64-NEXT:    movapd %xmm3, %xmm0
 ; X64-NEXT:    retq
   %res = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2)
@@ -100,7 +100,7 @@ define <4 x float> @test_mm_blendv_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
 ; X32:       # BB#0:
 ; X32-NEXT:    movaps %xmm0, %xmm3
 ; X32-NEXT:    movaps %xmm2, %xmm0
-; X32-NEXT:    blendvps %xmm1, %xmm3
+; X32-NEXT:    blendvps %xmm0, %xmm1, %xmm3
 ; X32-NEXT:    movaps %xmm3, %xmm0
 ; X32-NEXT:    retl
 ;
@@ -108,7 +108,7 @@ define <4 x float> @test_mm_blendv_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
 ; X64:       # BB#0:
 ; X64-NEXT:    movaps %xmm0, %xmm3
 ; X64-NEXT:    movaps %xmm2, %xmm0
-; X64-NEXT:    blendvps %xmm1, %xmm3
+; X64-NEXT:    blendvps %xmm0, %xmm1, %xmm3
 ; X64-NEXT:    movaps %xmm3, %xmm0
 ; X64-NEXT:    retq
   %res = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2)
@@ -898,17 +898,17 @@ define i32 @test_mm_test_all_ones(<2 x i64> %a0) {
 ; X32-LABEL: test_mm_test_all_ones:
 ; X32:       # BB#0:
 ; X32-NEXT:    pcmpeqd %xmm1, %xmm1
+; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    ptest %xmm1, %xmm0
-; X32-NEXT:    sbbl %eax, %eax
-; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    setb %al
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_test_all_ones:
 ; X64:       # BB#0:
 ; X64-NEXT:    pcmpeqd %xmm1, %xmm1
+; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    ptest %xmm1, %xmm0
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    setb %al
 ; X64-NEXT:    retq
   %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> <i64 -1, i64 -1>)
   ret i32 %res
@@ -956,16 +956,16 @@ declare i32 @llvm.x86.sse41.ptestnzc(<2 x i64>, <2 x i64>) nounwind readnone
 define i32 @test_mm_testc_si128(<2 x i64> %a0, <2 x i64> %a1) {
 ; X32-LABEL: test_mm_testc_si128:
 ; X32:       # BB#0:
+; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    ptest %xmm1, %xmm0
-; X32-NEXT:    sbbl %eax, %eax
-; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    setb %al
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_testc_si128:
 ; X64:       # BB#0:
+; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    ptest %xmm1, %xmm0
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    setb %al
 ; X64-NEXT:    retq
   %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1)
   ret i32 %res
diff --git a/test/CodeGen/X86/sse41-intrinsics-x86.ll b/test/CodeGen/X86/sse41-intrinsics-x86.ll
index 321b4e8108b721337f2be612f4874bc75ea70947..3abfcf4d542e36a691f654457934d8f94cd1713a 100644
--- a/test/CodeGen/X86/sse41-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse41-intrinsics-x86.ll
@@ -8,7 +8,7 @@ define <2 x double> @test_x86_sse41_blendvpd(<2 x double> %a0, <2 x double> %a1,
 ; SSE41:       ## BB#0:
 ; SSE41-NEXT:    movapd %xmm0, %xmm3 ## encoding: [0x66,0x0f,0x28,0xd8]
 ; SSE41-NEXT:    movaps %xmm2, %xmm0 ## encoding: [0x0f,0x28,0xc2]
-; SSE41-NEXT:    blendvpd %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x38,0x15,0xd9]
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x38,0x15,0xd9]
 ; SSE41-NEXT:    movapd %xmm3, %xmm0 ## encoding: [0x66,0x0f,0x28,0xc3]
 ; SSE41-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -27,7 +27,7 @@ define <4 x float> @test_x86_sse41_blendvps(<4 x float> %a0, <4 x float> %a1, <4
 ; SSE41:       ## BB#0:
 ; SSE41-NEXT:    movaps %xmm0, %xmm3 ## encoding: [0x0f,0x28,0xd8]
 ; SSE41-NEXT:    movaps %xmm2, %xmm0 ## encoding: [0x0f,0x28,0xc2]
-; SSE41-NEXT:    blendvps %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x38,0x14,0xd9]
+; SSE41-NEXT:    blendvps %xmm0, %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x38,0x14,0xd9]
 ; SSE41-NEXT:    movaps %xmm3, %xmm0 ## encoding: [0x0f,0x28,0xc3]
 ; SSE41-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -88,7 +88,7 @@ define <4 x float> @test_x86_sse41_insertps(<4 x float> %a0, <4 x float> %a1) {
 ;
 ; SKX-LABEL: test_x86_sse41_insertps:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    vinsertps $17, %xmm1, %xmm0, %xmm0 ## encoding: [0x62,0xf3,0x7d,0x08,0x21,0xc1,0x11]
+; SKX-NEXT:    vinsertps $17, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x11]
 ; SKX-NEXT:    ## xmm0 = zero,xmm1[0],xmm0[2,3]
 ; SKX-NEXT:    retl ## encoding: [0xc3]
   %res = call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a0, <4 x float> %a1, i8 17) ; <<4 x float>> [#uses=1]
@@ -140,7 +140,7 @@ define <16 x i8> @test_x86_sse41_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8
 ; SSE41:       ## BB#0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm3 ## encoding: [0x66,0x0f,0x6f,0xd8]
 ; SSE41-NEXT:    movaps %xmm2, %xmm0 ## encoding: [0x0f,0x28,0xc2]
-; SSE41-NEXT:    pblendvb %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x38,0x10,0xd9]
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm3 ## encoding: [0x66,0x0f,0x38,0x10,0xd9]
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc3]
 ; SSE41-NEXT:    retl ## encoding: [0xc3]
 ;
@@ -362,16 +362,16 @@ declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
 define i32 @test_x86_sse41_ptestc(<2 x i64> %a0, <2 x i64> %a1) {
 ; SSE41-LABEL: test_x86_sse41_ptestc:
 ; SSE41:       ## BB#0:
+; SSE41-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
 ; SSE41-NEXT:    ptest %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x38,0x17,0xc1]
-; SSE41-NEXT:    sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; SSE41-NEXT:    andl $1, %eax ## encoding: [0x83,0xe0,0x01]
+; SSE41-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
 ; SSE41-NEXT:    retl ## encoding: [0xc3]
 ;
 ; VCHECK-LABEL: test_x86_sse41_ptestc:
 ; VCHECK:       ## BB#0:
+; VCHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
 ; VCHECK-NEXT:    vptest %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x17,0xc1]
-; VCHECK-NEXT:    sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; VCHECK-NEXT:    andl $1, %eax ## encoding: [0x83,0xe0,0x01]
+; VCHECK-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
 ; VCHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %a0, <2 x i64> %a1) ; <i32> [#uses=1]
   ret i32 %res
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index 4a009023a7bde70628466e9ad28b88d5f863ba35..503b9416c8d3837a2bd337a7c2464be08338956e 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X32
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s --check-prefix=X64
 
@@ -227,16 +228,16 @@ define i32 @ptestz_1(<2 x i64> %t1, <2 x i64> %t2) nounwind {
 define i32 @ptestz_2(<2 x i64> %t1, <2 x i64> %t2) nounwind {
 ; X32-LABEL: ptestz_2:
 ; X32:       ## BB#0:
+; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    ptest %xmm1, %xmm0
-; X32-NEXT:    sbbl %eax, %eax
-; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    setb %al
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: ptestz_2:
 ; X64:       ## BB#0:
+; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    ptest %xmm1, %xmm0
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    setb %al
 ; X64-NEXT:    retq
   %tmp1 = call i32 @llvm.x86.sse41.ptestc(<2 x i64> %t1, <2 x i64> %t2) nounwind readnone
   ret i32 %tmp1
@@ -544,13 +545,15 @@ define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
 define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
 ; X32-LABEL: shuf_X0YC:
 ; X32:       ## BB#0:
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X32-NEXT:    xorps %xmm2, %xmm2
+; X32-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: shuf_X0YC:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; X64-NEXT:    xorps %xmm2, %xmm2
+; X64-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
 ; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
@@ -771,7 +774,7 @@ define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
 ; X32:       ## BB#0:
 ; X32-NEXT:    psllw $15, %xmm0
 ; X32-NEXT:    psraw $15, %xmm0
-; X32-NEXT:    pblendvb %xmm1, %xmm2
+; X32-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; X32-NEXT:    movdqa %xmm2, %xmm0
 ; X32-NEXT:    retl
 ;
@@ -779,7 +782,7 @@ define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
 ; X64:       ## BB#0:
 ; X64-NEXT:    psllw $15, %xmm0
 ; X64-NEXT:    psraw $15, %xmm0
-; X64-NEXT:    pblendvb %xmm1, %xmm2
+; X64-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; X64-NEXT:    movdqa %xmm2, %xmm0
 ; X64-NEXT:    retq
   %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
@@ -791,12 +794,12 @@ define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocap
 ; X32-LABEL: insertps_from_vector_load:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps    $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps    $48, (%{{...}}), {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; X64-NEXT:    retq
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
@@ -809,12 +812,12 @@ define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>
 ; X32-LABEL: insertps_from_vector_load_offset:
 ; X32:       ## BB#0:
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    insertps    $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset:
 ; X64:       ## BB#0:
-; X64-NEXT:    insertps    $32, 4(%{{...}}), {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; X64-NEXT:    retq
   %1 = load <4 x float>, <4 x float>* %pb, align 16
   %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
@@ -828,13 +831,13 @@ define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x floa
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X32-NEXT:    shll $4, %ecx
-; X32-NEXT:    insertps    $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_from_vector_load_offset_2:
 ; X64:       ## BB#0:
 ; X64-NEXT:    shlq $4, %rsi
-; X64-NEXT:    insertps    $0, 12(%{{...}},%{{...}}), {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = mem[0],xmm0[1,2,3]
 ; X64-NEXT:    retq
   %1 = getelementptr inbounds <4 x float>, <4 x float>* %pb, i64 %index
   %2 = load <4 x float>, <4 x float>* %1, align 16
@@ -990,15 +993,14 @@ define void @insertps_pr20411(<4 x i32> %shuffle109, <4 x i32> %shuffle116, i32*
 
 define <4 x float> @insertps_4(<4 x float> %A, <4 x float> %B) {
 ; X32-LABEL: insertps_4:
-; X32:       ## BB#0: ## %entry
+; X32:       ## BB#0:
 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_4:
-; X64:       ## BB#0: ## %entry
+; X64:       ## BB#0:
 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[2],zero
 ; X64-NEXT:    retq
-entry:
   %vecext = extractelement <4 x float> %A, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
@@ -1010,15 +1012,14 @@ entry:
 
 define <4 x float> @insertps_5(<4 x float> %A, <4 x float> %B) {
 ; X32-LABEL: insertps_5:
-; X32:       ## BB#0: ## %entry
+; X32:       ## BB#0:
 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_5:
-; X64:       ## BB#0: ## %entry
+; X64:       ## BB#0:
 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[1],zero,zero
 ; X64-NEXT:    retq
-entry:
   %vecext = extractelement <4 x float> %A, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecext1 = extractelement <4 x float> %B, i32 1
@@ -1030,15 +1031,14 @@ entry:
 
 define <4 x float> @insertps_6(<4 x float> %A, <4 x float> %B) {
 ; X32-LABEL: insertps_6:
-; X32:       ## BB#0: ## %entry
+; X32:       ## BB#0:
 ; X32-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_6:
-; X64:       ## BB#0: ## %entry
+; X64:       ## BB#0:
 ; X64-NEXT:    insertps {{.*#+}} xmm0 = zero,xmm0[1],xmm1[2],zero
 ; X64-NEXT:    retq
-entry:
   %vecext = extractelement <4 x float> %A, i32 1
   %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
   %vecext1 = extractelement <4 x float> %B, i32 2
@@ -1049,15 +1049,14 @@ entry:
 
 define <4 x float> @insertps_7(<4 x float> %A, <4 x float> %B) {
 ; X32-LABEL: insertps_7:
-; X32:       ## BB#0: ## %entry
+; X32:       ## BB#0:
 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_7:
-; X64:       ## BB#0: ## %entry
+; X64:       ## BB#0:
 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm1[1],zero
 ; X64-NEXT:    retq
-entry:
   %vecext = extractelement <4 x float> %A, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecinit1 = insertelement <4 x float> %vecinit, float 0.000000e+00, i32 1
@@ -1069,15 +1068,14 @@ entry:
 
 define <4 x float> @insertps_8(<4 x float> %A, <4 x float> %B) {
 ; X32-LABEL: insertps_8:
-; X32:       ## BB#0: ## %entry
+; X32:       ## BB#0:
 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_8:
-; X64:       ## BB#0: ## %entry
+; X64:       ## BB#0:
 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero
 ; X64-NEXT:    retq
-entry:
   %vecext = extractelement <4 x float> %A, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
   %vecext1 = extractelement <4 x float> %B, i32 0
@@ -1089,17 +1087,16 @@ entry:
 
 define <4 x float> @insertps_9(<4 x float> %A, <4 x float> %B) {
 ; X32-LABEL: insertps_9:
-; X32:       ## BB#0: ## %entry
+; X32:       ## BB#0:
 ; X32-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
 ; X32-NEXT:    movaps %xmm1, %xmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: insertps_9:
-; X64:       ## BB#0: ## %entry
+; X64:       ## BB#0:
 ; X64-NEXT:    insertps {{.*#+}} xmm1 = zero,xmm0[0],xmm1[2],zero
 ; X64-NEXT:    movaps %xmm1, %xmm0
 ; X64-NEXT:    retq
-entry:
   %vecext = extractelement <4 x float> %A, i32 0
   %vecinit = insertelement <4 x float> <float 0.000000e+00, float undef, float undef, float undef>, float %vecext, i32 1
   %vecext1 = extractelement <4 x float> %B, i32 2
@@ -1108,7 +1105,7 @@ entry:
   ret <4 x float> %vecinit3
 }
 
-define <4 x float> @insertps_10(<4 x float> %A)
+define <4 x float> @insertps_10(<4 x float> %A) {
 ; X32-LABEL: insertps_10:
 ; X32:       ## BB#0:
 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
@@ -1118,7 +1115,6 @@ define <4 x float> @insertps_10(<4 x float> %A)
 ; X64:       ## BB#0:
 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero
 ; X64-NEXT:    retq
-{
   %vecext = extractelement <4 x float> %A, i32 0
   %vecbuild1 = insertelement <4 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %vecext, i32 0
   %vecbuild2 = insertelement <4 x float> %vecbuild1, float %vecext, i32 2
@@ -1127,17 +1123,16 @@ define <4 x float> @insertps_10(<4 x float> %A)
 
 define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) {
 ; X32-LABEL: build_vector_to_shuffle_1:
-; X32:       ## BB#0: ## %entry
+; X32:       ## BB#0:
 ; X32-NEXT:    xorps %xmm1, %xmm1
 ; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: build_vector_to_shuffle_1:
-; X64:       ## BB#0: ## %entry
+; X64:       ## BB#0:
 ; X64-NEXT:    xorps %xmm1, %xmm1
 ; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3]
 ; X64-NEXT:    retq
-entry:
   %vecext = extractelement <4 x float> %A, i32 1
   %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
   %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
@@ -1147,17 +1142,16 @@ entry:
 
 define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) {
 ; X32-LABEL: build_vector_to_shuffle_2:
-; X32:       ## BB#0: ## %entry
+; X32:       ## BB#0:
 ; X32-NEXT:    xorps %xmm1, %xmm1
 ; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: build_vector_to_shuffle_2:
-; X64:       ## BB#0: ## %entry
+; X64:       ## BB#0:
 ; X64-NEXT:    xorps %xmm1, %xmm1
 ; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
 ; X64-NEXT:    retq
-entry:
   %vecext = extractelement <4 x float> %A, i32 1
   %vecinit = insertelement <4 x float> zeroinitializer, float %vecext, i32 1
   %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 2
diff --git a/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
index 53b94e7f0d39cf0ee462ff173f7e74444173930e..383ab21bd404fee501c71616e80e862bb0876b57 100644
--- a/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/sse42-intrinsics-fast-isel.ll
@@ -33,23 +33,27 @@ define i32 @test_mm_cmpestra(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nou
 }
 declare i32 @llvm.x86.sse42.pcmpestria128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
 
-define i32 @test_mm_cmpestrc(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) {
+define i32 @test_mm_cmpestrc(<2 x i64> %a0, i32 %a1, <2 x i64> %a2, i32 %a3) nounwind {
 ; X32-LABEL: test_mm_cmpestrc:
 ; X32:       # BB#0:
+; X32-NEXT:    pushl %ebx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    xorl %ebx, %ebx
 ; X32-NEXT:    pcmpestri $7, %xmm1, %xmm0
-; X32-NEXT:    sbbl %eax, %eax
-; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    setb %bl
+; X32-NEXT:    movl %ebx, %eax
+; X32-NEXT:    popl %ebx
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_cmpestrc:
 ; X64:       # BB#0:
+; X64-NEXT:    xorl %r8d, %r8d
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    movl %esi, %edx
 ; X64-NEXT:    pcmpestri $7, %xmm1, %xmm0
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    setb %r8b
+; X64-NEXT:    movl %r8d, %eax
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg2 = bitcast <2 x i64> %a2 to <16 x i8>
@@ -229,16 +233,16 @@ declare i32 @llvm.x86.sse42.pcmpistria128(<16 x i8>, <16 x i8>, i8) nounwind rea
 define i32 @test_mm_cmpistrc(<2 x i64> %a0, <2 x i64> %a1) {
 ; X32-LABEL: test_mm_cmpistrc:
 ; X32:       # BB#0:
+; X32-NEXT:    xorl %eax, %eax
 ; X32-NEXT:    pcmpistri $7, %xmm1, %xmm0
-; X32-NEXT:    sbbl %eax, %eax
-; X32-NEXT:    andl $1, %eax
+; X32-NEXT:    setb %al
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm_cmpistrc:
 ; X64:       # BB#0:
+; X64-NEXT:    xorl %eax, %eax
 ; X64-NEXT:    pcmpistri $7, %xmm1, %xmm0
-; X64-NEXT:    sbbl %eax, %eax
-; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    setb %al
 ; X64-NEXT:    retq
   %arg0 = bitcast <2 x i64> %a0 to <16 x i8>
   %arg1 = bitcast <2 x i64> %a1 to <16 x i8>
diff --git a/test/CodeGen/X86/sse42-intrinsics-x86.ll b/test/CodeGen/X86/sse42-intrinsics-x86.ll
index d5d34926fed86d1ddaa14821b3bf63e4d84a347e..d9e103c481112bd727f5a374856aa878714b4894 100644
--- a/test/CodeGen/X86/sse42-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse42-intrinsics-x86.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse4.2 -show-mc-encoding | FileCheck %s --check-prefix=SSE42
-; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=AVX2
-; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=VCHECK --check-prefix=SKX
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=-avx,+sse4.2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=SSE42
+; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=AVX2
+; RUN: llc < %s -mtriple=i386-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=SKX
 
 define i32 @test_x86_sse42_pcmpestri128(<16 x i8> %a0, <16 x i8> %a2) {
 ; SSE42-LABEL: test_x86_sse42_pcmpestri128:
@@ -95,23 +95,29 @@ define i32 @test_x86_sse42_pcmpestria128(<16 x i8> %a0, <16 x i8> %a2) nounwind
 declare i32 @llvm.x86.sse42.pcmpestria128(<16 x i8>, i32, <16 x i8>, i32, i8) nounwind readnone
 
 
-define i32 @test_x86_sse42_pcmpestric128(<16 x i8> %a0, <16 x i8> %a2) {
+define i32 @test_x86_sse42_pcmpestric128(<16 x i8> %a0, <16 x i8> %a2) nounwind {
 ; SSE42-LABEL: test_x86_sse42_pcmpestric128:
 ; SSE42:       ## BB#0:
+; SSE42-NEXT:    pushl %ebx ## encoding: [0x53]
 ; SSE42-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; SSE42-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
+; SSE42-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; SSE42-NEXT:    pcmpestri $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x61,0xc1,0x07]
-; SSE42-NEXT:    sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; SSE42-NEXT:    andl $1, %eax ## encoding: [0x83,0xe0,0x01]
+; SSE42-NEXT:    setb %bl ## encoding: [0x0f,0x92,0xc3]
+; SSE42-NEXT:    movl %ebx, %eax ## encoding: [0x89,0xd8]
+; SSE42-NEXT:    popl %ebx ## encoding: [0x5b]
 ; SSE42-NEXT:    retl ## encoding: [0xc3]
 ;
 ; VCHECK-LABEL: test_x86_sse42_pcmpestric128:
 ; VCHECK:       ## BB#0:
+; VCHECK-NEXT:    pushl %ebx ## encoding: [0x53]
 ; VCHECK-NEXT:    movl $7, %eax ## encoding: [0xb8,0x07,0x00,0x00,0x00]
 ; VCHECK-NEXT:    movl $7, %edx ## encoding: [0xba,0x07,0x00,0x00,0x00]
+; VCHECK-NEXT:    xorl %ebx, %ebx ## encoding: [0x31,0xdb]
 ; VCHECK-NEXT:    vpcmpestri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x61,0xc1,0x07]
-; VCHECK-NEXT:    sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; VCHECK-NEXT:    andl $1, %eax ## encoding: [0x83,0xe0,0x01]
+; VCHECK-NEXT:    setb %bl ## encoding: [0x0f,0x92,0xc3]
+; VCHECK-NEXT:    movl %ebx, %eax ## encoding: [0x89,0xd8]
+; VCHECK-NEXT:    popl %ebx ## encoding: [0x5b]
 ; VCHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call i32 @llvm.x86.sse42.pcmpestric128(<16 x i8> %a0, i32 7, <16 x i8> %a2, i32 7, i8 7) ; <i32> [#uses=1]
   ret i32 %res
@@ -326,16 +332,16 @@ declare i32 @llvm.x86.sse42.pcmpistria128(<16 x i8>, <16 x i8>, i8) nounwind rea
 define i32 @test_x86_sse42_pcmpistric128(<16 x i8> %a0, <16 x i8> %a1) {
 ; SSE42-LABEL: test_x86_sse42_pcmpistric128:
 ; SSE42:       ## BB#0:
+; SSE42-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
 ; SSE42-NEXT:    pcmpistri $7, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x63,0xc1,0x07]
-; SSE42-NEXT:    sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; SSE42-NEXT:    andl $1, %eax ## encoding: [0x83,0xe0,0x01]
+; SSE42-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
 ; SSE42-NEXT:    retl ## encoding: [0xc3]
 ;
 ; VCHECK-LABEL: test_x86_sse42_pcmpistric128:
 ; VCHECK:       ## BB#0:
+; VCHECK-NEXT:    xorl %eax, %eax ## encoding: [0x31,0xc0]
 ; VCHECK-NEXT:    vpcmpistri $7, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x63,0xc1,0x07]
-; VCHECK-NEXT:    sbbl %eax, %eax ## encoding: [0x19,0xc0]
-; VCHECK-NEXT:    andl $1, %eax ## encoding: [0x83,0xe0,0x01]
+; VCHECK-NEXT:    setb %al ## encoding: [0x0f,0x92,0xc0]
 ; VCHECK-NEXT:    retl ## encoding: [0xc3]
   %res = call i32 @llvm.x86.sse42.pcmpistric128(<16 x i8> %a0, <16 x i8> %a1, i8 7) ; <i32> [#uses=1]
   ret i32 %res
@@ -435,3 +441,36 @@ define <16 x i8> @test_x86_sse42_pcmpistrm128_load(<16 x i8> %a0, <16 x i8>* %a1
   %res = call <16 x i8> @llvm.x86.sse42.pcmpistrm128(<16 x i8> %a0, <16 x i8> %1, i8 7) ; <<16 x i8>> [#uses=1]
   ret <16 x i8> %res
 }
+
+define i32 @crc32_32_8(i32 %a, i8 %b) nounwind {
+; CHECK-LABEL: crc32_32_8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT:    crc32b {{[0-9]+}}(%esp), %eax ## encoding: [0xf2,0x0f,0x38,0xf0,0x44,0x24,0x08]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %tmp = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a, i8 %b)
+  ret i32 %tmp
+}
+declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
+
+define i32 @crc32_32_16(i32 %a, i16 %b) nounwind {
+; CHECK-LABEL: crc32_32_16:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT:    crc32w {{[0-9]+}}(%esp), %eax ## encoding: [0x66,0xf2,0x0f,0x38,0xf1,0x44,0x24,0x08]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %tmp = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a, i16 %b)
+  ret i32 %tmp
+}
+declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
+
+define i32 @crc32_32_32(i32 %a, i32 %b) nounwind {
+; CHECK-LABEL: crc32_32_32:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; CHECK-NEXT:    crc32l {{[0-9]+}}(%esp), %eax ## encoding: [0xf2,0x0f,0x38,0xf1,0x44,0x24,0x08]
+; CHECK-NEXT:    retl ## encoding: [0xc3]
+  %tmp = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a, i32 %b)
+  ret i32 %tmp
+}
+declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
diff --git a/test/CodeGen/X86/sse42-intrinsics-x86_64.ll b/test/CodeGen/X86/sse42-intrinsics-x86_64.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e90aa455cfd81caa014fd5f3c85d3320ec17fdd5
--- /dev/null
+++ b/test/CodeGen/X86/sse42-intrinsics-x86_64.ll
@@ -0,0 +1,28 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=-avx,+sse4.2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=SSE42
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx -show-mc-encoding | FileCheck %s --check-prefix=CHECK --check-prefix=VCHECK --check-prefix=SKX
+
+declare i64 @llvm.x86.sse42.crc32.64.8(i64, i8) nounwind
+declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind
+
+define i64 @crc32_64_8(i64 %a, i8 %b) nounwind {
+; CHECK-LABEL: crc32_64_8:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    crc32b %sil, %edi ## encoding: [0xf2,0x40,0x0f,0x38,0xf0,0xfe]
+; CHECK-NEXT:    movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %tmp = call i64 @llvm.x86.sse42.crc32.64.8(i64 %a, i8 %b)
+  ret i64 %tmp
+}
+
+define i64 @crc32_64_64(i64 %a, i64 %b) nounwind {
+; CHECK-LABEL: crc32_64_64:
+; CHECK:       ## BB#0:
+; CHECK-NEXT:    crc32q %rsi, %rdi ## encoding: [0xf2,0x48,0x0f,0x38,0xf1,0xfe]
+; CHECK-NEXT:    movq %rdi, %rax ## encoding: [0x48,0x89,0xf8]
+; CHECK-NEXT:    retq ## encoding: [0xc3]
+  %tmp = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a, i64 %b)
+  ret i64 %tmp
+}
+
diff --git a/test/CodeGen/X86/sse42.ll b/test/CodeGen/X86/sse42.ll
deleted file mode 100644
index 2d05f9884c4278035b9214a170c91df1711b6c97..0000000000000000000000000000000000000000
--- a/test/CodeGen/X86/sse42.ll
+++ /dev/null
@@ -1,58 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.2 | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.2 | FileCheck %s --check-prefix=X64
-
-declare i32 @llvm.x86.sse42.crc32.32.8(i32, i8) nounwind
-declare i32 @llvm.x86.sse42.crc32.32.16(i32, i16) nounwind
-declare i32 @llvm.x86.sse42.crc32.32.32(i32, i32) nounwind
-
-define i32 @crc32_32_8(i32 %a, i8 %b) nounwind {
-; X32-LABEL: crc32_32_8:
-; X32:       ## BB#0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    crc32b {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    retl
-;
-; X64-LABEL: crc32_32_8:
-; X64:       ## BB#0:
-; X64-NEXT:    crc32b %sil, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %tmp = call i32 @llvm.x86.sse42.crc32.32.8(i32 %a, i8 %b)
-  ret i32 %tmp
-}
-
-
-define i32 @crc32_32_16(i32 %a, i16 %b) nounwind {
-; X32-LABEL: crc32_32_16:
-; X32:       ## BB#0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    crc32w {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    retl
-;
-; X64-LABEL: crc32_32_16:
-; X64:       ## BB#0:
-; X64-NEXT:    crc32w %si, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %tmp = call i32 @llvm.x86.sse42.crc32.32.16(i32 %a, i16 %b)
-  ret i32 %tmp
-}
-
-
-define i32 @crc32_32_32(i32 %a, i32 %b) nounwind {
-; X32-LABEL: crc32_32_32:
-; X32:       ## BB#0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    crc32l {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    retl
-;
-; X64-LABEL: crc32_32_32:
-; X64:       ## BB#0:
-; X64-NEXT:    crc32l %esi, %edi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    retq
-  %tmp = call i32 @llvm.x86.sse42.crc32.32.32(i32 %a, i32 %b)
-  ret i32 %tmp
-}
-
diff --git a/test/CodeGen/X86/sse42_64.ll b/test/CodeGen/X86/sse42_64.ll
deleted file mode 100644
index b39e76c78eb7c4a71be93ef914ac11924d1f6c1a..0000000000000000000000000000000000000000
--- a/test/CodeGen/X86/sse42_64.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.2 | FileCheck %s -check-prefix=X64
-
-declare i64 @llvm.x86.sse42.crc32.64.8(i64, i8) nounwind
-declare i64 @llvm.x86.sse42.crc32.64.64(i64, i64) nounwind
-
-define i64 @crc32_64_8(i64 %a, i8 %b) nounwind {
-  %tmp = call i64 @llvm.x86.sse42.crc32.64.8(i64 %a, i8 %b)
-  ret i64 %tmp
-
-; X64: _crc32_64_8:
-; X64:     crc32b   %sil,
-}
-
-define i64 @crc32_64_64(i64 %a, i64 %b) nounwind {
-  %tmp = call i64 @llvm.x86.sse42.crc32.64.64(i64 %a, i64 %b)
-  ret i64 %tmp
-
-; X64: _crc32_64_64:
-; X64:     crc32q   %rsi,
-}
-
diff --git a/test/CodeGen/X86/ssse3-intrinsics-x86.ll b/test/CodeGen/X86/ssse3-intrinsics-x86.ll
index d2785b4c89bb172f968eab455052e105d61b799d..4f49385fec7f77474ee1b579b6e3eb793d143c37 100644
--- a/test/CodeGen/X86/ssse3-intrinsics-x86.ll
+++ b/test/CodeGen/X86/ssse3-intrinsics-x86.ll
@@ -183,6 +183,35 @@ define <8 x i16> @test_x86_ssse3_pmadd_ub_sw_128(<16 x i8> %a0, <16 x i8> %a1) {
 declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone
 
 
+; Make sure we don't commute this operation.
+define <8 x i16> @test_x86_ssse3_pmadd_ub_sw_128_load_op0(<16 x i8>* %ptr, <16 x i8> %a1) {
+; SSE-LABEL: test_x86_ssse3_pmadd_ub_sw_128_load_op0:
+; SSE:       ## BB#0:
+; SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; SSE-NEXT:    movdqa (%eax), %xmm1 ## encoding: [0x66,0x0f,0x6f,0x08]
+; SSE-NEXT:    pmaddubsw %xmm0, %xmm1 ## encoding: [0x66,0x0f,0x38,0x04,0xc8]
+; SSE-NEXT:    movdqa %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x6f,0xc1]
+; SSE-NEXT:    retl ## encoding: [0xc3]
+;
+; AVX2-LABEL: test_x86_ssse3_pmadd_ub_sw_128_load_op0:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; AVX2-NEXT:    vmovdqa (%eax), %xmm1 ## encoding: [0xc5,0xf9,0x6f,0x08]
+; AVX2-NEXT:    vpmaddubsw %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe2,0x71,0x04,0xc0]
+; AVX2-NEXT:    retl ## encoding: [0xc3]
+;
+; SKX-LABEL: test_x86_ssse3_pmadd_ub_sw_128_load_op0:
+; SKX:       ## BB#0:
+; SKX-NEXT:    movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
+; SKX-NEXT:    vmovdqu (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x6f,0x08]
+; SKX-NEXT:    vpmaddubsw %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x71,0x04,0xc0]
+; SKX-NEXT:    retl ## encoding: [0xc3]
+  %a0 = load <16 x i8>, <16 x i8>* %ptr
+  %res = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1) ; <<8 x i16>> [#uses=1]
+  ret <8 x i16> %res
+}
+
+
 define <8 x i16> @test_x86_ssse3_pmul_hr_sw_128(<8 x i16> %a0, <8 x i16> %a1) {
 ; SSE-LABEL: test_x86_ssse3_pmul_hr_sw_128:
 ; SSE:       ## BB#0:
diff --git a/test/CodeGen/X86/stack-align.ll b/test/CodeGen/X86/stack-align.ll
index 04bae023984f13b13871b4470685d8e6b2993821..192306462d1d69bb284b04b423a66519eb79f408 100644
--- a/test/CodeGen/X86/stack-align.ll
+++ b/test/CodeGen/X86/stack-align.ll
@@ -69,10 +69,10 @@ entry:
 define x86_stdcallcc void @test5(%struct.sixteen* byval nocapture readonly align 4 %s) #0 {
   %d.sroa.0 = alloca [16 x i8], align 1
   %1 = getelementptr inbounds [16 x i8], [16 x i8]* %d.sroa.0, i32 0, i32 0
-  call void @llvm.lifetime.start(i64 16, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %1)
   %2 = getelementptr inbounds %struct.sixteen, %struct.sixteen* %s, i32 0, i32 0, i32 0
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %2, i32 16, i32 1, i1 true)
-  call void @llvm.lifetime.end(i64 16, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* %1)
   ret void
 ; CHECK-LABEL: test5:
 ; CHECK: and
@@ -82,10 +82,10 @@ define x86_stdcallcc void @test5(%struct.sixteen* byval nocapture readonly align
 ; CHECK-NEXT: movsd
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) argmemonly nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) argmemonly nounwind
 
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) argmemonly nounwind
 
-declare void @llvm.lifetime.end(i64, i8* nocapture) argmemonly nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) argmemonly nounwind
 
 attributes #0 = { nounwind alignstack=16 "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
diff --git a/test/CodeGen/X86/stack-folding-adx-x86_64.ll b/test/CodeGen/X86/stack-folding-adx-x86_64.ll
index 5f109f09aa194009fb0b31c891ef647c7b377113..e992e463dc4bf1725d0236ecda5c4bfbb6398009 100644
--- a/test/CodeGen/X86/stack-folding-adx-x86_64.ll
+++ b/test/CodeGen/X86/stack-folding-adx-x86_64.ll
@@ -43,3 +43,21 @@ define i8 @stack_fold_addcarryx_u64(i8 %a0, i64 %a1, i64 %a2, i8* %a3) {
   ret i8 %2;
 }
 declare i8 @llvm.x86.addcarryx.u64(i8, i64, i64, i8*)
+
+define i8 @stack_fold_subborrow_u32(i8 %a0, i32 %a1, i32 %a2, i8* %a3) {
+  ;CHECK-LABEL: stack_fold_subborrow_u32
+  ;CHECK:       sbbl {{-?[0-9]*}}(%rsp), %ecx {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = tail call i8 @llvm.x86.subborrow.u32(i8 %a0, i32 %a1, i32 %a2, i8* %a3)
+  ret i8 %2;
+}
+declare i8 @llvm.x86.subborrow.u32(i8, i32, i32, i8*)
+
+define i8 @stack_fold_subborrow_u64(i8 %a0, i64 %a1, i64 %a2, i8* %a3) {
+  ;CHECK-LABEL: stack_fold_subborrow_u64
+  ;CHECK:       sbbq {{-?[0-9]*}}(%rsp), %rcx {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = tail call i8 @llvm.x86.subborrow.u64(i8 %a0, i64 %a1, i64 %a2, i8* %a3)
+  ret i8 %2;
+}
+declare i8 @llvm.x86.subborrow.u64(i8, i64, i64, i8*)
diff --git a/test/CodeGen/X86/stack-folding-bmi.ll b/test/CodeGen/X86/stack-folding-bmi.ll
new file mode 100644
index 0000000000000000000000000000000000000000..cabc88432be4b77234b0bdc75d87210122bdf4c9
--- /dev/null
+++ b/test/CodeGen/X86/stack-folding-bmi.ll
@@ -0,0 +1,121 @@
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+bmi < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Stack reload folding tests.
+;
+; By including a nop call with sideeffects we can force a partial register spill of the
+; relevant registers and check that the reload is correctly folded into the instruction.
+
+define i32 @stack_fold_andn_u32(i32 %a0, i32 %a1) {
+  ;CHECK-LABEL: stack_fold_andn_u32
+  ;CHECK:       andnl {{-?[0-9]*}}(%rsp), %eax, %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = xor i32 %a0, -1
+  %3 = and i32 %a1, %2
+  ret i32 %3
+}
+
+define i64 @stack_fold_andn_u64(i64 %a0, i64 %a1) {
+  ;CHECK-LABEL: stack_fold_andn_u64
+  ;CHECK:       andnq {{-?[0-9]*}}(%rsp), %rax, %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = xor i64 %a0, -1
+  %3 = and i64 %a1, %2
+  ret i64 %3
+}
+
+define i32 @stack_fold_bextr_u32(i32 %a0, i32 %a1) {
+  ;CHECK-LABEL: stack_fold_bextr_u32
+  ;CHECK:       # BB#0:
+  ;CHECK:       bextrl %eax, {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = tail call i32 @llvm.x86.bmi.bextr.32(i32 %a0, i32 %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.bmi.bextr.32(i32, i32)
+
+define i64 @stack_fold_bextr_u64(i64 %a0, i64 %a1) {
+  ;CHECK-LABEL: stack_fold_bextr_u64
+  ;CHECK:       # BB#0:
+  ;CHECK:       bextrq %rax, {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = tail call i64 @llvm.x86.bmi.bextr.64(i64 %a0, i64 %a1)
+  ret i64 %2
+}
+declare i64 @llvm.x86.bmi.bextr.64(i64, i64)
+
+define i32 @stack_fold_blsi_u32(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_blsi_u32
+  ;CHECK:       blsil {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sub i32 0, %a0
+  %3 = and i32 %2, %a0
+  ret i32 %3
+}
+
+define i64 @stack_fold_blsi_u64(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_blsi_u64
+  ;CHECK:       blsiq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sub i64 0, %a0
+  %3 = and i64 %2, %a0
+  ret i64 %3
+}
+
+define i32 @stack_fold_blsmsk_u32(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_blsmsk_u32
+  ;CHECK:       blsmskl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sub i32 %a0, 1
+  %3 = xor i32 %2, %a0
+  ret i32 %3
+}
+
+define i64 @stack_fold_blsmsk_u64(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_blsmsk_u64
+  ;CHECK:       blsmskq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sub i64 %a0, 1
+  %3 = xor i64 %2, %a0
+  ret i64 %3
+}
+
+define i32 @stack_fold_blsr_u32(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_blsr_u32
+  ;CHECK:       blsrl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sub i32 %a0, 1
+  %3 = and i32 %2, %a0
+  ret i32 %3
+}
+
+define i64 @stack_fold_blsr_u64(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_blsr_u64
+  ;CHECK:       blsrq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sub i64 %a0, 1
+  %3 = and i64 %2, %a0
+  ret i64 %3
+}
+
+;TODO stack_fold_tzcnt_u16
+
+define i32 @stack_fold_tzcnt_u32(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_tzcnt_u32
+  ;CHECK:       tzcntl {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = tail call i32 @llvm.cttz.i32(i32 %a0, i1 0)
+  ret i32 %2
+}
+declare i32 @llvm.cttz.i32(i32, i1)
+
+define i64 @stack_fold_tzcnt_u64(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_tzcnt_u64
+  ;CHECK:       tzcntq {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = tail call i64 @llvm.cttz.i64(i64 %a0, i1 0)
+  ret i64 %2
+}
+declare i64 @llvm.cttz.i64(i64, i1)
diff --git a/test/CodeGen/X86/stack-folding-bmi2.ll b/test/CodeGen/X86/stack-folding-bmi2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b70f7c668d016560213a4c0c308b8cae9d27445d
--- /dev/null
+++ b/test/CodeGen/X86/stack-folding-bmi2.ll
@@ -0,0 +1,77 @@
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Stack reload folding tests.
+;
+; By including a nop call with sideeffects we can force a partial register spill of the
+; relevant registers and check that the reload is correctly folded into the instruction.
+
+define i32 @stack_fold_bzhi_u32(i32 %a0, i32 %a1)   {
+  ;CHECK-LABEL: stack_fold_bzhi_u32
+  ;CHECK:       bzhil %eax, {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i32 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = tail call i32 @llvm.x86.bmi.bzhi.32(i32 %a0, i32 %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.bmi.bzhi.32(i32, i32)
+
+define i64 @stack_fold_bzhi_u64(i64 %a0, i64 %a1)   {
+  ;CHECK-LABEL: stack_fold_bzhi_u64
+  ;CHECK:       bzhiq %rax, {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = tail call i64 @llvm.x86.bmi.bzhi.64(i64 %a0, i64 %a1)
+  ret i64 %2
+}
+declare i64 @llvm.x86.bmi.bzhi.64(i64, i64)
+
+define i64 @stack_fold_mulx_u64(i64 %a0, i64 %a1, i64 *%a2)   {
+  ;CHECK-LABEL: stack_fold_mulx_u64
+  ;CHECK:       mulxq {{-?[0-9]*}}(%rsp), %rax, %rcx {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = zext i64 %a0 to i128
+  %3 = zext i64 %a1 to i128
+  %4 = mul i128 %2, %3
+  %5 = lshr i128 %4, 64
+  %6 = trunc i128 %4 to i64
+  %7 = trunc i128 %5 to i64
+  store i64 %7, i64 *%a2
+  ret i64 %6
+}
+
+define i32 @stack_fold_pdep_u32(i32 %a0, i32 %a1)   {
+  ;CHECK-LABEL: stack_fold_pdep_u32
+  ;CHECK:       pdepl {{-?[0-9]*}}(%rsp), %eax, %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i32 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = tail call i32 @llvm.x86.bmi.pdep.32(i32 %a0, i32 %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.bmi.pdep.32(i32, i32)
+
+define i64 @stack_fold_pdep_u64(i64 %a0, i64 %a1)   {
+  ;CHECK-LABEL: stack_fold_pdep_u64
+  ;CHECK:       pdepq {{-?[0-9]*}}(%rsp), %rax, %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = tail call i64 @llvm.x86.bmi.pdep.64(i64 %a0, i64 %a1)
+  ret i64 %2
+}
+declare i64 @llvm.x86.bmi.pdep.64(i64, i64)
+
+define i32 @stack_fold_pext_u32(i32 %a0, i32 %a1)   {
+  ;CHECK-LABEL: stack_fold_pext_u32
+  ;CHECK:       pextl {{-?[0-9]*}}(%rsp), %eax, %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i32 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = tail call i32 @llvm.x86.bmi.pext.32(i32 %a0, i32 %a1)
+  ret i32 %2
+}
+declare i32 @llvm.x86.bmi.pext.32(i32, i32)
+
+define i64 @stack_fold_pext_u64(i64 %a0, i64 %a1)   {
+  ;CHECK-LABEL: stack_fold_pext_u64
+  ;CHECK:       pextq {{-?[0-9]*}}(%rsp), %rax, %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = tail call i64 @llvm.x86.bmi.pext.64(i64 %a0, i64 %a1)
+  ret i64 %2
+}
+declare i64 @llvm.x86.bmi.pext.64(i64, i64)
diff --git a/test/CodeGen/X86/stack-folding-fp-avx1.ll b/test/CodeGen/X86/stack-folding-fp-avx1.ll
index 5e939cc034d4a015fe48d4e6462fb9b696fecb5a..72542f499087f16e5cd3da191de75719c2694bad 100644
--- a/test/CodeGen/X86/stack-folding-fp-avx1.ll
+++ b/test/CodeGen/X86/stack-folding-fp-avx1.ll
@@ -575,17 +575,6 @@ define i64 @stack_fold_cvtsd2si64_int(<2 x double> %a0) {
 }
 declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
 
-; TODO stack_fold_cvtsd2ss
-
-define <4 x float> @stack_fold_cvtsd2ss_int(<2 x double> %a0) {
-  ;CHECK-LABEL: stack_fold_cvtsd2ss_int
-  ;CHECK:  vcvtsd2ss {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float> <float 0x0, float 0x0, float 0x0, float 0x0>, <2 x double> %a0)
-  ret <4 x float> %2
-}
-declare <4 x float> @llvm.x86.sse2.cvtsd2ss(<4 x float>, <2 x double>) nounwind readnone
-
 define double @stack_fold_cvtsi2sd(i32 %a0) {
   ;CHECK-LABEL: stack_fold_cvtsi2sd
   ;CHECK:  vcvtsi2sdl {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
@@ -654,17 +643,6 @@ define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) {
 }
 declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
 
-; TODO stack_fold_cvtss2sd
-
-define <2 x double> @stack_fold_cvtss2sd_int(<4 x float> %a0) {
-  ;CHECK-LABEL: stack_fold_cvtss2sd_int
-  ;CHECK:  vcvtss2sd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
-  %2 = call <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double> <double 0x0, double 0x0>, <4 x float> %a0)
-  ret <2 x double> %2
-}
-declare <2 x double> @llvm.x86.sse2.cvtss2sd(<2 x double>, <4 x float>) nounwind readnone
-
 ; TODO stack_fold_cvtss2si
 
 define i32 @stack_fold_cvtss2si_int(<4 x float> %a0) {
diff --git a/test/CodeGen/X86/stack-folding-fp-avx512vl.ll b/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
index c6ae85dda43acec0292bd9cbc33f614d357e2a16..292829a01cb3de9549c9fbcbb41755df5b29f6ee 100644
--- a/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
+++ b/test/CodeGen/X86/stack-folding-fp-avx512vl.ll
@@ -402,6 +402,45 @@ define <8 x float> @stack_fold_orps_ymm(<8 x float> %a0, <8 x float> %a1) {
   ret <8 x float> %6
 }
 
+define <4 x float> @stack_fold_shufps(<4 x float> %a0, <4 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_shufps
+  ;CHECK:       vshufps $200, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 2, i32 4, i32 7>
+  ret <4 x float> %2
+}
+
+define <4 x float> @stack_fold_shufps_mask(<4 x float>* %passthru, <4 x float> %a0, <4 x float> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_shufps_mask
+  ;CHECK:       vshufps $200, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 2, i32 4, i32 7>
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %5 = load <4 x float>, <4 x float>* %passthru
+  %6 = select <4 x i1> %4, <4 x float> %2, <4 x float> %5
+  ret <4 x float> %6
+}
+
+define <4 x float> @stack_fold_shufps_maskz(<4 x float> %a0, <4 x float> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_shufps_maskz
+  ;CHECK:       vshufps $200, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <4 x float> %a0, <4 x float> %a1, <4 x i32> <i32 0, i32 2, i32 4, i32 7>
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %5 = select <4 x i1> %4, <4 x float> %2, <4 x float> zeroinitializer
+  ret <4 x float> %5
+}
+
+define <8 x float> @stack_fold_shufps_ymm(<8 x float> %a0, <8 x float> %a1) {
+  ;CHECK-LABEL: stack_fold_shufps_ymm
+  ;CHECK:       vshufps $148, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x float> %a0, <8 x float> %a1, <8 x i32> <i32 0, i32 1, i32 9, i32 10, i32 4, i32 5, i32 13, i32 14>
+  ret <8 x float> %2
+}
+
 define <2 x double> @stack_fold_subpd(<2 x double> %a0, <2 x double> %a1) {
   ;CHECK-LABEL: stack_fold_subpd
   ;CHECK:       vsubpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
diff --git a/test/CodeGen/X86/stack-folding-fp-sse42.ll b/test/CodeGen/X86/stack-folding-fp-sse42.ll
index c57782721a6677089fbe40635d579f8bc1bf8a86..daa903bc86607b2bb5b18b3828097a09419db381 100644
--- a/test/CodeGen/X86/stack-folding-fp-sse42.ll
+++ b/test/CodeGen/X86/stack-folding-fp-sse42.ll
@@ -148,7 +148,7 @@ define <4 x float> @stack_fold_blendps(<4 x float> %a0, <4 x float> %a1) {
 
 define <2 x double> @stack_fold_blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %c) {
   ;CHECK-LABEL: stack_fold_blendvpd
-  ;CHECK:       blendvpd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  ;CHECK:       blendvpd %xmm0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a1, <2 x double> %c, <2 x double> %a0)
   ret <2 x double> %2
@@ -157,7 +157,7 @@ declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x d
 
 define <4 x float> @stack_fold_blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %c) {
   ;CHECK-LABEL: stack_fold_blendvps
-  ;CHECK:       blendvps {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  ;CHECK:       blendvps %xmm0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a1, <4 x float> %c, <4 x float> %a0)
   ret <4 x float> %2
diff --git a/test/CodeGen/X86/stack-folding-int-avx512.ll b/test/CodeGen/X86/stack-folding-int-avx512.ll
index 20572b3730820ff65623bc9e417f253afc23967c..04a7d1159014767eea813214b171f99d3d63c997 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512.ll
@@ -8,6 +8,329 @@ target triple = "x86_64-unknown-unknown"
 ; By including a nop call with sideeffects we can force a partial register spill of the
 ; relevant registers and check that the reload is correctly folded into the instruction.
 
+define <16 x i32> @stack_fold_valignd(<16 x i32> %a, <16 x i32> %b) {
+  ;CHECK-LABEL: stack_fold_valignd
+  ;CHECK:   valignd $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
+  ret <16 x i32> %2
+}
+
+define <16 x i32> @stack_fold_valignd_mask(<16 x i32> %a, <16 x i32> %b, <16 x i32>* %passthru, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_valignd_mask
+  ;CHECK:   valignd $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
+  %3 = bitcast i16 %mask to <16 x i1>
+  %4 = load <16 x i32>, <16 x i32>* %passthru
+  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
+  ret <16 x i32> %5
+}
+
+define <16 x i32> @stack_fold_valignd_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_valignd_maskz
+  ;CHECK:   valignd $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
+  %3 = bitcast i16 %mask to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
+  ret <16 x i32> %4
+}
+
+define <8 x i64> @stack_fold_valignq(<8 x i64> %a, <8 x i64> %b) {
+  ;CHECK-LABEL: stack_fold_valignq
+  ;CHECK:   valignq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  ret <8 x i64> %2
+}
+
+define <8 x i64> @stack_fold_valignq_mask(<8 x i64> %a, <8 x i64> %b, <8 x i64>* %passthru, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_valignq_mask
+  ;CHECK:   valignq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = load <8 x i64>, <8 x i64>* %passthru
+  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
+  ret <8 x i64> %5
+}
+
+define <8 x i64> @stack_fold_valignq_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_valignq_maskz
+  ;CHECK:   valignq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
+  ret <8 x i64> %4
+}
+
+define <64 x i8> @stack_fold_pavgb(<64 x i8> %a0, <64 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pavgb
+  ;CHECK:       vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> undef, i64 -1)
+  ret <64 x i8> %2
+}
+declare <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) nounwind readnone
+
+define <64 x i8> @stack_fold_pavgb_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+  ;CHECK-LABEL: stack_fold_pavgb_mask
+  ;CHECK:       vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = load <64 x i8>, <64 x i8>* %passthru
+  %3 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> %2, i64 %mask)
+  ret <64 x i8> %3
+}
+
+define <64 x i8> @stack_fold_pavgb_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+  ;CHECK-LABEL: stack_fold_pavgb_maskz
+  ;CHECK:       vpavgb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <64 x i8> @llvm.x86.avx512.mask.pavg.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> zeroinitializer, i64 %mask)
+  ret <64 x i8> %2
+}
+
+define <32 x i16> @stack_fold_pavgw(<32 x i16> %a0, <32 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pavgw
+  ;CHECK:       vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> undef, i32 -1)
+  ret <32 x i16> %2
+}
+declare <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) nounwind readnone
+
+define <32 x i16> @stack_fold_pavgw_mask(<32 x i16>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_pavgw_mask
+  ;CHECK:       vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = load <32 x i16>, <32 x i16>* %passthru
+  %3 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> %2, i32 %mask)
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @stack_fold_pavgw_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_pavgw_maskz
+  ;CHECK:       vpavgw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.pavg.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> zeroinitializer, i32 %mask)
+  ret <32 x i16> %2
+}
+
+define <4 x i32> @stack_fold_extracti32x4(<16 x i32> %a0, <16 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_extracti32x4
+  ;CHECK:       vextracti32x4 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+  ; add forces execution domain
+  %1 = add <16 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @stack_fold_extracti64x2(<8 x i64> %a0, <8 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_extracti64x2
+  ;CHECK:       vextracti64x2 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+  ; add forces execution domain
+  %1 = add <8 x i64> %a0, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <2 x i32> <i32 6, i32 7>
+  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <2 x i64> %2
+}
+
+define <8 x i32> @stack_fold_extracti32x8(<16 x i32> %a0, <16 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_extracti32x8
+  ;CHECK:       vextracti32x8 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
+  ; add forces execution domain
+  %1 = add <16 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <8 x i32> %2
+}
+
+define <4 x i64> @stack_fold_extracti64x4(<8 x i64> %a0, <8 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_extracti64x4
+  ;CHECK:       vextracti64x4 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
+  ; add forces execution domain
+  %1 = add <8 x i64> %a0, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <4 x i64> %2
+}
+
+define <16 x i32> @stack_fold_inserti32x8(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_inserti32x8
+  ;CHECK:       vinserti32x8 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  ; add forces execution domain
+  %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ret <16 x i32> %3
+}
+
+define <8 x i64> @stack_fold_inserti64x4(<4 x i64> %a0, <4 x i64> %a1) { ; reload-side test: both inputs are live across the asm, then concatenated
+  ;CHECK-LABEL: stack_fold_inserti64x4
+  ;CHECK:       vinserti64x4 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm2-xmm31 and flags as clobbers (xmm0 and xmm1 are not in the list)
+  %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; identity mask 0-7: %a0 becomes the low 4 elements, %a1 the high 4
+  ; add forces execution domain
+  %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  ret <8 x i64> %3
+}
+
+define <64 x i8> @stack_fold_pabsb(<64 x i8> %a0) { ; unmasked form: the intrinsic gets an undef passthru and an all-ones mask (i64 -1)
+  ;CHECK-LABEL: stack_fold_pabsb
+  ;CHECK:       vpabsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm1-xmm31 and flags as clobbers while %a0 is still needed below
+  %2 = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %a0, <64 x i8> undef, i64 -1)
+  ret <64 x i8> %2
+}
+declare <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8>, <64 x i8>, i64) nounwind readnone
+
+define <64 x i8> @stack_fold_pabsb_mask(<64 x i8> %passthru, <64 x i8> %a0, i64 %mask) { ; masked form: %passthru and %mask go straight to the intrinsic; the check line expects a {%k} write-mask
+  ;CHECK-LABEL: stack_fold_pabsb_mask
+  ;CHECK:       vpabsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm1-xmm31 and flags as clobbers while the vector arguments are still needed below
+  %2 = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %a0, <64 x i8> %passthru, i64 %mask)
+  ret <64 x i8> %2
+}
+
+define <64 x i8> @stack_fold_pabsb_maskz(<64 x i8> %a0, i64 %mask) { ; zero-masked form: passthru is zeroinitializer; the check line expects {%k} followed by {z}
+  ;CHECK-LABEL: stack_fold_pabsb_maskz
+  ;CHECK:       vpabsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm1-xmm31 and flags as clobbers while %a0 is still needed below
+  %2 = call <64 x i8> @llvm.x86.avx512.mask.pabs.b.512(<64 x i8> %a0, <64 x i8> zeroinitializer, i64 %mask)
+  ret <64 x i8> %2
+}
+
+define <16 x i32> @stack_fold_pabsd(<16 x i32> %a0) { ; unmasked form: the intrinsic gets an undef passthru and an all-ones mask (i16 -1)
+  ;CHECK-LABEL: stack_fold_pabsd
+  ;CHECK:       vpabsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm1-xmm31 and flags as clobbers while %a0 is still needed below
+  %2 = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %a0, <16 x i32> undef, i16 -1)
+  ret <16 x i32> %2
+}
+declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16) nounwind readnone
+
+define <16 x i32> @stack_fold_pabsd_mask(<16 x i32> %passthru, <16 x i32> %a0, i16 %mask) { ; masked form: %passthru and %mask go straight to the intrinsic; the check line expects a {%k} write-mask
+  ;CHECK-LABEL: stack_fold_pabsd_mask
+  ;CHECK:       vpabsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm1-xmm31 and flags as clobbers while the vector arguments are still needed below
+  %2 = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %a0, <16 x i32> %passthru, i16 %mask)
+  ret <16 x i32> %2
+}
+
+define <16 x i32> @stack_fold_pabsd_maskz(<16 x i32> %a0, i16 %mask) { ; zero-masked form: passthru is zeroinitializer; the check line expects {%k} followed by {z}
+  ;CHECK-LABEL: stack_fold_pabsd_maskz
+  ;CHECK:       vpabsd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm1-xmm31 and flags as clobbers while %a0 is still needed below
+  %2 = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %a0, <16 x i32> zeroinitializer, i16 %mask)
+  ret <16 x i32> %2
+}
+
+define <8 x i64> @stack_fold_pabsq(<8 x i64> %a0) { ; unmasked form: the intrinsic gets an undef passthru and an all-ones mask (i8 -1)
+  ;CHECK-LABEL: stack_fold_pabsq
+  ;CHECK:       vpabsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm1-xmm31 and flags as clobbers while %a0 is still needed below
+  %2 = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %a0, <8 x i64> undef, i8 -1)
+  ret <8 x i64> %2
+}
+declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8) nounwind readnone
+
+define <8 x i64> @stack_fold_pabsq_mask(<8 x i64> %passthru, <8 x i64> %a0, i8 %mask) { ; masked form: %passthru and %mask go straight to the intrinsic; the check line expects a {%k} write-mask
+  ;CHECK-LABEL: stack_fold_pabsq_mask
+  ;CHECK:       vpabsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm1-xmm31 and flags as clobbers while the vector arguments are still needed below
+  %2 = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %a0, <8 x i64> %passthru, i8 %mask)
+  ret <8 x i64> %2
+}
+
+define <8 x i64> @stack_fold_pabsq_maskz(<8 x i64> %a0, i8 %mask) { ; zero-masked form: passthru is zeroinitializer; the check line expects {%k} followed by {z}
+  ;CHECK-LABEL: stack_fold_pabsq_maskz
+  ;CHECK:       vpabsq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm1-xmm31 and flags as clobbers while %a0 is still needed below
+  %2 = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %a0, <8 x i64> zeroinitializer, i8 %mask)
+  ret <8 x i64> %2
+}
+
+define <32 x i16> @stack_fold_pabsw(<32 x i16> %a0) { ; unmasked form: the intrinsic gets an undef passthru and an all-ones mask (i32 -1)
+  ;CHECK-LABEL: stack_fold_pabsw
+  ;CHECK:       vpabsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm1-xmm31 and flags as clobbers while %a0 is still needed below
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %a0, <32 x i16> undef, i32 -1)
+  ret <32 x i16> %2
+}
+declare <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16>, <32 x i16>, i32) nounwind readnone
+
+define <32 x i16> @stack_fold_pabsw_mask(<32 x i16> %passthru, <32 x i16> %a0, i32 %mask) { ; masked form: %passthru and %mask go straight to the intrinsic; the check line expects a {%k} write-mask
+  ;CHECK-LABEL: stack_fold_pabsw_mask
+  ;CHECK:       vpabsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm1-xmm31 and flags as clobbers while the vector arguments are still needed below
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %a0, <32 x i16> %passthru, i32 %mask)
+  ret <32 x i16> %2
+}
+
+define <32 x i16> @stack_fold_pabsw_maskz(<32 x i16> %a0, i32 %mask) { ; zero-masked form: passthru is zeroinitializer; the check line expects {%k} followed by {z}
+  ;CHECK-LABEL: stack_fold_pabsw_maskz
+  ;CHECK:       vpabsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm1-xmm31 and flags as clobbers while %a0 is still needed below
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.pabs.w.512(<32 x i16> %a0, <32 x i16> zeroinitializer, i32 %mask)
+  ret <32 x i16> %2
+}
+
+define <32 x i16> @stack_fold_packssdw(<16 x i32> %a0, <16 x i32> %a1) { ; unmasked two-source form: undef passthru, all-ones mask (i32 -1)
+  ;CHECK-LABEL: stack_fold_packssdw
+  ;CHECK:       vpackssdw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm2-xmm31 and flags as clobbers (xmm0 and xmm1 are not in the list)
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32> %a0, <16 x i32> %a1, <32 x i16> undef, i32 -1)
+  ret <32 x i16> %2
+}
+declare <32 x i16> @llvm.x86.avx512.mask.packssdw.512(<16 x i32>, <16 x i32>, <32 x i16>, i32) nounwind readnone
+
+define <64 x i8> @stack_fold_packsswb(<32 x i16> %a0, <32 x i16> %a1) { ; unmasked two-source form: undef passthru, all-ones mask (i64 -1)
+  ;CHECK-LABEL: stack_fold_packsswb
+  ;CHECK:       vpacksswb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm2-xmm31 and flags as clobbers (xmm0 and xmm1 are not in the list)
+  %2 = call <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16> %a0, <32 x i16> %a1, <64 x i8> undef, i64 -1)
+  ret <64 x i8> %2
+}
+declare <64 x i8> @llvm.x86.avx512.mask.packsswb.512(<32 x i16>, <32 x i16>, <64 x i8>, i64) nounwind readnone
+
+define <32 x i16> @stack_fold_packusdw(<16 x i32> %a0, <16 x i32> %a1) { ; unmasked two-source form: undef passthru, all-ones mask (i32 -1)
+  ;CHECK-LABEL: stack_fold_packusdw
+  ;CHECK:       vpackusdw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm2-xmm31 and flags as clobbers (xmm0 and xmm1 are not in the list)
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a0, <16 x i32> %a1, <32 x i16> undef, i32 -1)
+  ret <32 x i16> %2
+}
+declare <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32>, <16 x i32>, <32 x i16>, i32) nounwind readnone
+
+define <32 x i16> @stack_fold_packusdw_mask(<32 x i16>* %passthru, <16 x i32> %a0, <16 x i32> %a1, i32 %mask) { ; masked form: passthru arrives by pointer; the check line expects a {%k} write-mask
+  ;CHECK-LABEL: stack_fold_packusdw_mask
+  ;CHECK:       vpackusdw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm2-xmm31 and flags as clobbers (xmm0 and xmm1 are not in the list)
+  %2 = load <32 x i16>, <32 x i16>* %passthru ; passthru is read after the asm; presumably for the reason noted in stack_fold_permbvar_mask (keeps the op below the asm block) - TODO confirm
+  %3 = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a0, <16 x i32> %a1, <32 x i16> %2, i32 %mask)
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @stack_fold_packusdw_maskz(<16 x i32> %a0, <16 x i32> %a1, i32 %mask) { ; zero-masked form: passthru is zeroinitializer; the check line expects {%k} followed by {z}
+  ;CHECK-LABEL: stack_fold_packusdw_maskz
+  ;CHECK:       vpackusdw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm2-xmm31 and flags as clobbers (xmm0 and xmm1 are not in the list)
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.packusdw.512(<16 x i32> %a0, <16 x i32> %a1, <32 x i16> zeroinitializer, i32 %mask)
+  ret <32 x i16> %2
+}
+
+define <64 x i8> @stack_fold_packuswb(<32 x i16> %a0, <32 x i16> %a1) { ; unmasked two-source form: undef passthru, all-ones mask (i64 -1)
+  ;CHECK-LABEL: stack_fold_packuswb
+  ;CHECK:       vpackuswb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm2-xmm31 and flags as clobbers (xmm0 and xmm1 are not in the list)
+  %2 = call <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16> %a0, <32 x i16> %a1, <64 x i8> undef, i64 -1)
+  ret <64 x i8> %2
+}
+declare <64 x i8> @llvm.x86.avx512.mask.packuswb.512(<32 x i16>, <32 x i16>, <64 x i8>, i64) nounwind readnone
+
 define <64 x i8> @stack_fold_paddb(<64 x i8> %a0, <64 x i8> %a1) {
   ;CHECK-LABEL: stack_fold_paddb
   ;CHECK:       vpaddb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
@@ -98,6 +421,35 @@ define <32 x i16> @stack_fold_paddw(<32 x i16> %a0, <32 x i16> %a1) {
   ret <32 x i16> %2
 }
 
+define <64 x i8> @stack_fold_palignr(<64 x i8> %a0, <64 x i8> %a1) { ; unmasked form written as a plain shufflevector rather than an intrinsic call
+  ;CHECK-LABEL: stack_fold_palignr
+  ;CHECK:       vpalignr $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm1-xmm31 and flags as clobbers while both arguments are still needed below
+  %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112> ; per 16-byte group: bytes 1-15 of %a1, then byte 0 of the matching %a0 group (indices 64, 80, 96, 112)
+  ret <64 x i8> %2
+}
+
+define <64 x i8> @stack_fold_palignr_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %passthru, i64 %mask) { ; masked form: shuffle result is blended with a loaded passthru via select
+  ;CHECK-LABEL: stack_fold_palignr_mask
+  ;CHECK:       vpalignr $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm1-xmm31 and flags as clobbers while both vector arguments are still needed below
+  %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112> ; per 16-byte group: bytes 1-15 of %a1, then byte 0 of the matching %a0 group (indices 64, 80, 96, 112)
+  %3 = bitcast i64 %mask to <64 x i1> ; one select bit per result byte
+  %4 = load <64 x i8>, <64 x i8>* %passthru ; passthru is read after the asm; presumably for the reason noted in stack_fold_permbvar_mask (keeps the op below the asm block) - TODO confirm
+  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4 ; set bits take the shuffle result, clear bits take the loaded passthru
+  ret <64 x i8> %5
+}
+
+define <64 x i8> @stack_fold_palignr_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) { ; zero-masked form: shuffle result is blended with zeroinitializer via select
+  ;CHECK-LABEL: stack_fold_palignr_maskz
+  ;CHECK:       vpalignr $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; "nop" asm listing xmm1-xmm31 and flags as clobbers while both vector arguments are still needed below
+  %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112> ; per 16-byte group: bytes 1-15 of %a1, then byte 0 of the matching %a0 group (indices 64, 80, 96, 112)
+  %3 = bitcast i64 %mask to <64 x i1> ; one select bit per result byte
+  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer ; set bits take the shuffle result, clear bits become zero
+  ret <64 x i8> %4
+}
+
 define i64 @stack_fold_pcmpeqb(<64 x i8> %a0, <64 x i8> %a1) {
   ;CHECK-LABEL: stack_fold_pcmpeqb
   ;CHECK:       vpcmpeqb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 64-byte Folded Reload
@@ -134,291 +486,343 @@ define i32 @stack_fold_pcmpeqw(<32 x i16> %a0, <32 x i16> %a1) {
   ret i32 %3
 }
 
-define <64 x i8> @stack_fold_psubb(<64 x i8> %a0, <64 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_psubb
-  ;CHECK:       vpsubb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+define <64 x i8> @stack_fold_permbvar(<64 x i8> %a0, <64 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_permbvar
+  ;CHECK:   vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = sub <64 x i8> %a0, %a1
+  %2 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0, <64 x i8> undef, i64 -1)
   ret <64 x i8> %2
 }
+declare <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) nounwind readonly
 
-define <16 x i32> @stack_fold_psubd(<16 x i32> %a0, <16 x i32> %a1) {
-  ;CHECK-LABEL: stack_fold_psubd
-  ;CHECK:       vpsubd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = sub <16 x i32> %a0, %a1
-  ret <16 x i32> %2
-}
-
-define <8 x i64> @stack_fold_psubq(<8 x i64> %a0, <8 x i64> %a1) {
-  ;CHECK-LABEL: stack_fold_psubq
-  ;CHECK:       vpsubq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = sub <8 x i64> %a0, %a1
-  ret <8 x i64> %2
-}
-
-define <64 x i8> @stack_fold_psubsb(<64 x i8> %a0, <64 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_psubsb
-  ;CHECK:       vpsubsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <64 x i8> @llvm.x86.avx512.mask.psubs.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> undef, i64 -1)
-  ret <64 x i8> %2
-}
-declare <64 x i8> @llvm.x86.avx512.mask.psubs.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) nounwind readnone
-
-define <32 x i16> @stack_fold_psubsw(<32 x i16> %a0, <32 x i16> %a1) {
-  ;CHECK-LABEL: stack_fold_psubsw
-  ;CHECK:       vpsubsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+define <64 x i8> @stack_fold_permbvar_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+  ;CHECK-LABEL: stack_fold_permbvar_mask
+  ;CHECK:   vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> undef, i32 -1)
-  ret <32 x i16> %2
+  %2 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0, <64 x i8> undef, i64 -1)
+  %3 = bitcast i64 %mask to <64 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <64 x i8>, <64 x i8>* %passthru
+  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
+  ret <64 x i8> %5
 }
-declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
-define <64 x i8> @stack_fold_psubusb(<64 x i8> %a0, <64 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_psubusb
-  ;CHECK:       vpsubusb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+define <64 x i8> @stack_fold_permbvar_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+  ;CHECK-LABEL: stack_fold_permbvar_maskz
+  ;CHECK:   vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <64 x i8> @llvm.x86.avx512.mask.psubus.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> undef, i64 -1)
-  ret <64 x i8> %2
+  %2 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0, <64 x i8> undef, i64 -1)
+  %3 = bitcast i64 %mask to <64 x i1>
+  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
+  ret <64 x i8> %4
 }
-declare <64 x i8> @llvm.x86.avx512.mask.psubus.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) nounwind readnone
 
-define <32 x i16> @stack_fold_psubusw(<32 x i16> %a0, <32 x i16> %a1) {
-  ;CHECK-LABEL: stack_fold_psubusw
-  ;CHECK:       vpsubusw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+define <16 x i32> @stack_fold_permd(<16 x i32> %a0, <16 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_permd
+  ;CHECK:   vpermd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> undef, i32 -1)
-  ret <32 x i16> %2
+  %2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a1, <16 x i32> %a0, <16 x i32> undef, i16 -1)
+  ret <16 x i32> %2
 }
-declare <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readonly
 
-define <32 x i16> @stack_fold_psubw(<32 x i16> %a0, <32 x i16> %a1) {
-  ;CHECK-LABEL: stack_fold_psubw
-  ;CHECK:       vpsubw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = sub <32 x i16> %a0, %a1
-  ret <32 x i16> %2
+define <64 x i8> @stack_fold_vpermi2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermi2b
+  ;CHECK:       vpermi2b {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
+  ret <64 x i8> %res
 }
+declare <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
-define <16 x i32> @stack_fold_ternlogd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
-  ;CHECK-LABEL: stack_fold_ternlogd
-  ;CHECK:       vpternlogd $33, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+define <16 x i32> @stack_fold_vpermi2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermi2d
+  ;CHECK:       vpermi2d {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
+  %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
   ret <16 x i32> %res
 }
-declare <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16)
+declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 
-define <8 x i64> @stack_fold_ternlogq(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
-  ;CHECK-LABEL: stack_fold_ternlogq
-  ;CHECK:       vpternlogq $33, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+define <8 x i64> @stack_fold_vpermi2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermi2q
+  ;CHECK:       vpermi2q {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1)
+  %res = call <8 x i64> @llvm.x86.avx512.mask.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
   ret <8 x i64> %res
 }
+declare <8 x i64> @llvm.x86.avx512.mask.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
 
-declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8)
+define <32 x i16> @stack_fold_vpermi2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermi2w
+  ;CHECK:       vpermi2w {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+  ret <32 x i16> %res
+}
+declare <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
-define <16 x i8> @stack_fold_vpmovdb(<16 x i32> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovdb
-  ;CHECK:       vpmovdb %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
-  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <16 x i8> %1
+define <8 x i64> @stack_fold_permq(<8 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_permq
+  ;CHECK:   vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
+  ; add forces execution domain
+  %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  ret <8 x i64> %3
 }
-declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
 
-define <16 x i16> @stack_fold_vpmovdw(<16 x i32> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovdw
-  ;CHECK:       vpmovdw %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
-  %1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <16 x i16> %1
+define <8 x i64> @stack_fold_permq_mask(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_permq_mask
+  ;CHECK:   vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
+  %3 = bitcast i8 %mask to <8 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <8 x i64>, <8 x i64>* %passthru
+  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
+  ; add forces execution domain
+  %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  ret <8 x i64> %6
 }
-declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)
 
-define <8 x i32> @stack_fold_vpmovqd(<8 x i64> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovqd
-  ;CHECK:       vpmovqd %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
-  %1 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <8 x i32> %1
+define <8 x i64> @stack_fold_permq_maskz(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_permq_maskz
+  ;CHECK:   vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
+  ret <8 x i64> %4
 }
-declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8)
 
-define <8 x i16> @stack_fold_vpmovqw(<8 x i64> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovqw
-  ;CHECK:       vpmovqw %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
-  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <8 x i16> %1
+define <8 x i64> @stack_fold_permqvar(<8 x i64> %a0, <8 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_permqvar
+  ;CHECK:   vpermq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0, <8 x i64> undef, i8 -1)
+  ; add forces execution domain
+  %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  ret <8 x i64> %3
 }
-declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
+declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readonly
 
-define <32 x i8> @stack_fold_vpmovwb(<32 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovwb
-  ;CHECK:       vpmovwb %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
-  %1 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <32 x i8> %1
+define <8 x i64> @stack_fold_permqvar_mask(<8 x i64>* %passthru, <8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_permqvar_mask
+  ;CHECK:   vpermq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0, <8 x i64> undef, i8 -1)
+  %3 = bitcast i8 %mask to <8 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <8 x i64>, <8 x i64>* %passthru
+  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
+  ; add forces execution domain
+  %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+  ret <8 x i64> %6
 }
-declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32)
 
-define <16 x i8> @stack_fold_vpmovsdb(<16 x i32> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovsdb
-  ;CHECK:       vpmovsdb %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
-  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <16 x i8> %1
+define <64 x i8> @stack_fold_vpermt2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermt2b
+  ;CHECK:       vpermt2b {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
+  ret <64 x i8> %res
 }
-declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
+declare <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
-define <16 x i16> @stack_fold_vpmovsdw(<16 x i32> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovsdw
-  ;CHECK:       vpmovsdw %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
-  %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <16 x i16> %1
+define <16 x i32> @stack_fold_vpermt2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermt2d
+  ;CHECK:       vpermt2d {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
+  ret <16 x i32> %res
 }
-declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)
+declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
 
-define <8 x i32> @stack_fold_vpmovsqd(<8 x i64> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovsqd
-  ;CHECK:       vpmovsqd %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
-  %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <8 x i32> %1
+define <8 x i64> @stack_fold_vpermt2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermt2q
+  ;CHECK:       vpermt2q {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
+  ret <8 x i64> %res
 }
-declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
+declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
 
-define <8 x i16> @stack_fold_vpmovsqw(<8 x i64> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovsqw
-  ;CHECK:       vpmovsqw %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
-  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <8 x i16> %1
+define <32 x i16> @stack_fold_vpermt2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermt2w
+  ;CHECK:       vpermt2w {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
+  ret <32 x i16> %res
 }
-declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
+declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
-define <32 x i8> @stack_fold_vpmovswb(<32 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovswb
-  ;CHECK:       vpmovswb %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
-  %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <32 x i8> %1
+define <32 x i16> @stack_fold_permwvar(<32 x i16> %a0, <32 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_permwvar
+  ;CHECK:   vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0, <32 x i16> undef, i32 -1)
+  ret <32 x i16> %2
 }
-declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32)
+declare <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) nounwind readonly
 
-define <16 x i8> @stack_fold_vpmovusdb(<16 x i32> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovusdb
-  ;CHECK:       vpmovusdb %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
-  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <16 x i8> %1
+define <32 x i16> @stack_fold_permwvar_mask(<32 x i16>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_permwvar_mask
+  ;CHECK:   vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0, <32 x i16> undef, i32 -1)
+  %3 = bitcast i32 %mask to <32 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <32 x i16>, <32 x i16>* %passthru
+  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
+  ret <32 x i16> %5
 }
-declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)
 
-define <16 x i16> @stack_fold_vpmovusdw(<16 x i32> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovusdw
-  ;CHECK:       vpmovusdw %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
-  %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <16 x i16> %1
+define <32 x i16> @stack_fold_permwvar_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_permwvar_maskz
+  ;CHECK:   vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0, <32 x i16> undef, i32 -1)
+  %3 = bitcast i32 %mask to <32 x i1>
+  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
+  ret <32 x i16> %4
 }
-declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)
 
-define <8 x i32> @stack_fold_vpmovusqd(<8 x i64> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovusqd
-  ;CHECK:       vpmovusqd %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
-  %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <8 x i32> %1
+define i32 @stack_fold_pextrd(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pextrd
+  ;CHECK:       vpextrd $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 4-byte Folded Spill
+  ;CHECK:       movl    {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Reload
+  ; add forces execution domain
+  %1 = add <4 x i32> %a0, <i32 1, i32 2, i32 3, i32 4>
+  %2 = extractelement <4 x i32> %1, i32 1
+  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i32 %2
+}
+
+define i64 @stack_fold_pextrq(<2 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_pextrq
+  ;CHECK:       vpextrq $1, {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 8-byte Folded Spill
+  ;CHECK:       movq    {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Reload
+  %1 = extractelement <2 x i64> %a0, i32 1
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  ret i64 %1
+}
+
+define <16 x i8> @stack_fold_pinsrb(<16 x i8> %a0, i8 %a1) {
+  ;CHECK-LABEL: stack_fold_pinsrb
+  ;CHECK:       vpinsrb $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = insertelement <16 x i8> %a0, i8 %a1, i32 1
+  ret <16 x i8> %2
+}
+
+define <4 x i32> @stack_fold_pinsrd(<4 x i32> %a0, i32 %a1) {
+  ;CHECK-LABEL: stack_fold_pinsrd
+  ;CHECK:       vpinsrd $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = insertelement <4 x i32> %a0, i32 %a1, i32 1
+  ret <4 x i32> %2
 }
-declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)
 
-define <8 x i16> @stack_fold_vpmovusqw(<8 x i64> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovusqw
-  ;CHECK:       vpmovusqw %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
-  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <8 x i16> %1
+define <2 x i64> @stack_fold_pinsrq(<2 x i64> %a0, i64 %a1) {
+  ;CHECK-LABEL: stack_fold_pinsrq
+  ;CHECK:       vpinsrq $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 8-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = insertelement <2 x i64> %a0, i64 %a1, i32 1
+  ret <2 x i64> %2
 }
-declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
 
-define <32 x i8> @stack_fold_vpmovuswb(<32 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovuswb
-  ;CHECK:       vpmovuswb %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
-  %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <32 x i8> %1
+define <8 x i16> @stack_fold_pinsrw(<8 x i16> %a0, i16 %a1) {
+  ;CHECK-LABEL: stack_fold_pinsrw
+  ;CHECK:       vpinsrw $1, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 4-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = insertelement <8 x i16> %a0, i16 %a1, i32 1
+  ret <8 x i16> %2
 }
-declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32)
 
-define <4 x i32> @stack_fold_extracti32x4(<16 x i32> %a0, <16 x i32> %a1) {
-  ;CHECK-LABEL: stack_fold_extracti32x4
-  ;CHECK:       vextracti32x4 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <16 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
-  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <4 x i32> %2
+define <32 x i16> @stack_fold_pmaddubsw_zmm(<64 x i8> %a0, <64 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaddubsw_zmm
+  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1, <32 x i16> undef, i32 -1)
+  ret <32 x i16> %2
 }
+declare <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32) nounwind readnone
 
-define <2 x i64> @stack_fold_extracti64x2(<8 x i64> %a0, <8 x i64> %a1) {
-  ;CHECK-LABEL: stack_fold_extracti64x2
-  ;CHECK:       vextracti64x2 $3, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <8 x i64> %a0, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-  %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <2 x i32> <i32 6, i32 7>
-  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <2 x i64> %2
+define <32 x i16> @stack_fold_pmaddubsw_zmm_mask(<32 x i16>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_pmaddubsw_zmm_mask
+  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1, <32 x i16> undef, i32 -1)
+  %3 = bitcast i32 %mask to <32 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <32 x i16>, <32 x i16>* %passthru
+  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
+  ret <32 x i16> %5
 }
 
-define <8 x i32> @stack_fold_extracti32x8(<16 x i32> %a0, <16 x i32> %a1) {
-  ;CHECK-LABEL: stack_fold_extracti32x8
-  ;CHECK:       vextracti32x8 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <16 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %2 = shufflevector <16 x i32> %1, <16 x i32> %a1, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <8 x i32> %2
+define <32 x i16> @stack_fold_pmaddubsw_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_pmaddubsw_zmm_maskz
+  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1, <32 x i16> undef, i32 -1)
+  %3 = bitcast i32 %mask to <32 x i1>
+  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
+  ret <32 x i16> %4
 }
 
-define <4 x i64> @stack_fold_extracti64x4(<8 x i64> %a0, <8 x i64> %a1) {
-  ;CHECK-LABEL: stack_fold_extracti64x4
-  ;CHECK:       vextracti64x4 $1, {{%zmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 32-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <8 x i64> %a0, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-  %2 = shufflevector <8 x i64> %1, <8 x i64> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <4 x i64> %2
+define <16 x i32> @stack_fold_pmaddwd_zmm(<32 x i16> %a0, <32 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaddwd_zmm
+  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1, <16 x i32> undef, i16 -1)
+  ret <16 x i32> %2
 }
+declare <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>, i16) nounwind readnone
 
-define <16 x i32> @stack_fold_inserti32x8(<8 x i32> %a0, <8 x i32> %a1) {
-  ;CHECK-LABEL: stack_fold_inserti32x8
-  ;CHECK:       vinserti32x8 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+define <16 x i32> @stack_fold_pmaddwd_zmm_mask(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_pmaddwd_zmm_mask
+  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i32> %a0, <8 x i32> %a1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
-  ; add forces execution domain
-  %3 = add <16 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  ret <16 x i32> %3
+  %2 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1, <16 x i32> undef, i16 -1)
+  %3 = bitcast i16 %mask to <16 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <16 x i32>, <16 x i32>* %passthru
+  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
+  ret <16 x i32> %5
 }
 
-define <8 x i64> @stack_fold_inserti64x4(<4 x i64> %a0, <4 x i64> %a1) {
-  ;CHECK-LABEL: stack_fold_inserti64x4
-  ;CHECK:       vinserti64x4 $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+define <16 x i32> @stack_fold_pmaddwd_zmm_maskz(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz
+  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <4 x i64> %a0, <4 x i64> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  ; add forces execution domain
-  %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-  ret <8 x i64> %3
+  %2 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1, <16 x i32> undef, i16 -1)
+  %3 = bitcast i16 %mask to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
+  ret <16 x i32> %4
+}
+
+define <16 x i8> @stack_fold_vpmovdb(<16 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovdb
+  ;CHECK:       vpmovdb %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <16 x i8> %1
+}
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.db.512(<16 x i32>, <16 x i8>, i16)
+
+define <16 x i16> @stack_fold_vpmovdw(<16 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovdw
+  ;CHECK:       vpmovdw %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
+  %1 = call <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <16 x i16> %1
 }
+declare <16 x i16> @llvm.x86.avx512.mask.pmov.dw.512(<16 x i32>, <16 x i16>, i16)
 
 define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
   ;CHECK-LABEL: stack_fold_movq_load
-  ;CHECK:       movq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  ;CHECK:       vmovq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = shufflevector <2 x i64> %a0, <2 x i64> zeroinitializer, <2 x i32> <i32 0, i32 2>
   ; add forces execution domain
@@ -426,77 +830,77 @@ define <2 x i64> @stack_fold_movq_load(<2 x i64> %a0) {
   ret <2 x i64> %3
 }
 
-define <16 x i32> @stack_fold_vpermt2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermt2d
-  ;CHECK:       vpermt2d {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
-  ret <16 x i32> %res
+define <8 x i32> @stack_fold_vpmovqd(<8 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovqd
+  ;CHECK:       vpmovqd %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
+  %1 = call <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <8 x i32> %1
 }
-declare <16 x i32> @llvm.x86.avx512.mask.vpermi2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <8 x i32> @llvm.x86.avx512.mask.pmov.qd.512(<8 x i64>, <8 x i32>, i8)
 
-define <16 x i32> @stack_fold_vpermi2d(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermi2d
-  ;CHECK:       vpermi2d {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i16 -1)
-  ret <16 x i32> %res
+define <8 x i16> @stack_fold_vpmovqw(<8 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovqw
+  ;CHECK:       vpmovqw %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <8 x i16> %1
 }
-declare <16 x i32> @llvm.x86.avx512.mask.vpermt2var.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
+declare <8 x i16> @llvm.x86.avx512.mask.pmov.qw.512(<8 x i64>, <8 x i16>, i8)
 
-define <8 x i64> @stack_fold_vpermt2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermt2q
-  ;CHECK:       vpermt2q {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
-  ret <8 x i64> %res
+define <32 x i8> @stack_fold_vpmovwb(<32 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovwb
+  ;CHECK:       vpmovwb %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
+  %1 = call <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <32 x i8> %1
 }
-declare <8 x i64> @llvm.x86.avx512.mask.vpermi2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <32 x i8> @llvm.x86.avx512.mask.pmov.wb.512(<32 x i16>, <32 x i8>, i32)
 
-define <8 x i64> @stack_fold_vpermi2q(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermi2q
-  ;CHECK:       vpermi2q {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <8 x i64> @llvm.x86.avx512.mask.vpermt2var.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1)
-  ret <8 x i64> %res
+define <16 x i8> @stack_fold_vpmovsdb(<16 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovsdb
+  ;CHECK:       vpmovsdb %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <16 x i8> %1
 }
-declare <8 x i64> @llvm.x86.avx512.mask.vpermt2var.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.db.512(<16 x i32>, <16 x i8>, i16)
 
-define <32 x i16> @stack_fold_vpermt2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermt2w
-  ;CHECK:       vpermt2w {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
-  ret <32 x i16> %res
+define <16 x i16> @stack_fold_vpmovsdw(<16 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovsdw
+  ;CHECK:       vpmovsdw %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
+  %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <16 x i16> %1
 }
-declare <32 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+declare <16 x i16> @llvm.x86.avx512.mask.pmovs.dw.512(<16 x i32>, <16 x i16>, i16)
 
-define <32 x i16> @stack_fold_vpermi2w(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermi2w
-  ;CHECK:       vpermi2w {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 -1)
-  ret <32 x i16> %res
+define <8 x i32> @stack_fold_vpmovsqd(<8 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovsqd
+  ;CHECK:       vpmovsqd %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
+  %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <8 x i32> %1
 }
-declare <32 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
+declare <8 x i32> @llvm.x86.avx512.mask.pmovs.qd.512(<8 x i64>, <8 x i32>, i8)
 
-define <64 x i8> @stack_fold_vpermt2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermt2b
-  ;CHECK:       vpermt2b {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
-  ret <64 x i8> %res
+define <8 x i16> @stack_fold_vpmovsqw(<8 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovsqw
+  ;CHECK:       vpmovsqw %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <8 x i16> %1
 }
-declare <64 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+declare <8 x i16> @llvm.x86.avx512.mask.pmovs.qw.512(<8 x i64>, <8 x i16>, i8)
 
-define <64 x i8> @stack_fold_vpermi2b(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermi2b
-  ;CHECK:       vpermi2b {{-?[0-9]*}}(%rsp), %zmm1, %zmm0 # 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8> %x0, <64 x i8> %x1, <64 x i8> %x2, i64 -1)
-  ret <64 x i8> %res
+define <32 x i8> @stack_fold_vpmovswb(<32 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovswb
+  ;CHECK:       vpmovswb %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
+  %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <32 x i8> %1
 }
-declare <64 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
+declare <32 x i8> @llvm.x86.avx512.mask.pmovs.wb.512(<32 x i16>, <32 x i8>, i32)
 
 define <16 x i32> @stack_fold_pmovsxbd_zmm(<16 x i8> %a0) {
   ;CHECK-LABEL: stack_fold_pmovsxbd_zmm
@@ -567,6 +971,51 @@ define <8 x i64> @stack_fold_pmovsxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) {
   ret <8 x i64> %4
 }
 
+define <16 x i8> @stack_fold_vpmovusdb(<16 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovusdb
+  ;CHECK:       vpmovusdb %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32> %a0, <16 x i8> undef, i16 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <16 x i8> %1
+}
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.db.512(<16 x i32>, <16 x i8>, i16)
+
+define <16 x i16> @stack_fold_vpmovusdw(<16 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovusdw
+  ;CHECK:       vpmovusdw %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
+  %1 = call <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32> %a0, <16 x i16> undef, i16 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <16 x i16> %1
+}
+declare <16 x i16> @llvm.x86.avx512.mask.pmovus.dw.512(<16 x i32>, <16 x i16>, i16)
+
+define <8 x i32> @stack_fold_vpmovusqd(<8 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovusqd
+  ;CHECK:       vpmovusqd %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
+  %1 = call <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64> %a0, <8 x i32> undef, i8 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <8 x i32> %1
+}
+declare <8 x i32> @llvm.x86.avx512.mask.pmovus.qd.512(<8 x i64>, <8 x i32>, i8)
+
+define <8 x i16> @stack_fold_vpmovusqw(<8 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovusqw
+  ;CHECK:       vpmovusqw %zmm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64> %a0, <8 x i16> undef, i8 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <8 x i16> %1
+}
+declare <8 x i16> @llvm.x86.avx512.mask.pmovus.qw.512(<8 x i64>, <8 x i16>, i8)
+
+define <32 x i8> @stack_fold_vpmovuswb(<32 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovuswb
+  ;CHECK:       vpmovuswb %zmm0, {{-?[0-9]*}}(%rsp) # 32-byte Folded Spill
+  %1 = call <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16> %a0, <32 x i8> undef, i32 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <32 x i8> %1
+}
+declare <32 x i8> @llvm.x86.avx512.mask.pmovus.wb.512(<32 x i16>, <32 x i8>, i32)
+
 define <16 x i32> @stack_fold_pmovzxbd_zmm(<16 x i8> %a0) {
   ;CHECK-LABEL: stack_fold_pmovzxbd_zmm
   ;CHECK:       vpmovzxbd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
@@ -636,35 +1085,14 @@ define <8 x i64> @stack_fold_pmovzxwq_maskz_zmm(<8 x i16> %a0, i8 %mask) {
   ret <8 x i64> %4
 }
 
-define <64 x i8> @stack_fold_punpckhbw_zmm(<64 x i8> %a0, <64 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_punpckhbw_zmm
-  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
-  ret <64 x i8> %2
-}
-
-define <64 x i8> @stack_fold_punpckhbw_mask_zmm(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
-  ;CHECK-LABEL: stack_fold_punpckhbw_mask_zmm
-  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
-  %3 = bitcast i64 %mask to <64 x i1>
-  ; load needed to keep the operation from being scheduled about the asm block
-  %4 = load <64 x i8>, <64 x i8>* %passthru
-  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
-  ret <64 x i8> %5
-}
-
-define <64 x i8> @stack_fold_punpckhbw_maskz_zmm(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
-  ;CHECK-LABEL: stack_fold_punpckhbw_maskz_zmm
-  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
-  %3 = bitcast i64 %mask to <64 x i1>
-  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
-  ret <64 x i8> %4
+define <8 x i64> @stack_fold_psadbw(<64 x i8> %a0, <64 x i8> %a1) { ; asm below clobbers xmm2-xmm31; CHECK expects a 64-byte reload folded into vpsadbw
+  ;CHECK-LABEL: stack_fold_psadbw
+  ;CHECK:       vpsadbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8> %a0, <64 x i8> %a1)
+  ret <8 x i64> %2
 }
+declare <8 x i64> @llvm.x86.avx512.psad.bw.512(<64 x i8>, <64 x i8>) nounwind readnone
 
 define <64 x i8> @stack_fold_pshufb_zmm(<64 x i8> %a0, <64 x i8> %a1) {
   ;CHECK-LABEL: stack_fold_pshufb_zmm
@@ -776,280 +1204,440 @@ define <32 x i16> @stack_fold_pshuflw_zmm_maskz(<32 x i16> %a0, i32 %mask) {
   ret <32 x i16> %4
 }
 
-define <32 x i16> @stack_fold_pmaddubsw_zmm(<64 x i8> %a0, <64 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_pmaddubsw_zmm
-  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+define <16 x i32> @stack_fold_pslld(<16 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pslld
+  ;CHECK:       vpslld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1, <32 x i16> undef, i32 -1)
-  ret <32 x i16> %2
+  %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1)
+  ret <16 x i32> %2
 }
-declare <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8>, <64 x i8>, <32 x i16>, i32) nounwind readnone
+declare <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32>, <4 x i32>) nounwind readnone
 
-define <32 x i16> @stack_fold_pmaddubsw_zmm_mask(<32 x i16>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_pmaddubsw_zmm_mask
-  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+define <16 x i32> @stack_fold_pslld_mask(<16 x i32>* %passthru, <16 x i32> %a0, <4 x i32> %a1, i16 %mask) { ; merge-masked vpslld; the shift count is expected as a 16-byte folded reload
+  ;CHECK-LABEL: stack_fold_pslld_mask
+  ;CHECK:       vpslld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1, <32 x i16> undef, i32 -1)
-  %3 = bitcast i32 %mask to <32 x i1>
-  ; load needed to keep the operation from being scheduled about the asm block
-  %4 = load <32 x i16>, <32 x i16>* %passthru
-  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
-  ret <32 x i16> %5
+  %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1) ; unmasked shift; masking is applied by the select below
+  %3 = bitcast i16 %mask to <16 x i1> ; one mask bit per i32 result lane
+  %4 = load <16 x i32>, <16 x i32>* %passthru ; NOTE(review): presumably this load after the asm keeps the shift from being scheduled above the asm block - confirm
+  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 ; lanes whose mask bit is clear take the passthru value
+  ret <16 x i32> %5
 }
 
-define <32 x i16> @stack_fold_pmaddubsw_zmm_maskz(<64 x i8> %a0, <64 x i8> %a1, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_pmaddubsw_zmm_maskz
-  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+define <16 x i32> @stack_fold_pslld_maskz(<16 x i32> %a0, <4 x i32> %a1, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_pslld_maskz
+  ;CHECK:       vpslld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <32 x i16> @llvm.x86.avx512.mask.pmaddubs.w.512(<64 x i8> %a0, <64 x i8> %a1, <32 x i16> undef, i32 -1)
-  %3 = bitcast i32 %mask to <32 x i1>
-  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
-  ret <32 x i16> %4
+  %2 = call <16 x i32> @llvm.x86.avx512.psll.d.512(<16 x i32> %a0, <4 x i32> %a1)
+  %3 = bitcast i16 %mask to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
+  ret <16 x i32> %4
 }
 
-define <16 x i32> @stack_fold_pmaddwd_zmm(<32 x i16> %a0, <32 x i16> %a1) {
-  ;CHECK-LABEL: stack_fold_pmaddwd_zmm
-  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+define <16 x i32> @stack_fold_pslldi(<16 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pslldi
+  ;CHECK:       vpslld $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1)
+  ret <16 x i32> %2
+}
+declare <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32>, i32) nounwind readnone
+
+define <16 x i32> @stack_fold_pslldi_mask(<16 x i32>* %passthru, <16 x i32> %a0, i16 %mask) { ; merge-masked immediate vpslld $1; the source vector is expected as a 64-byte folded reload
+  ;CHECK-LABEL: stack_fold_pslldi_mask
+  ;CHECK:       vpslld $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; single-vector-operand form: the clobber list starts at xmm1
+  %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1) ; unmasked shift by the immediate 1; masking is applied by the select below
+  %3 = bitcast i16 %mask to <16 x i1> ; one mask bit per i32 result lane
+  %4 = load <16 x i32>, <16 x i32>* %passthru ; NOTE(review): presumably this load after the asm keeps the shift from being scheduled above the asm block - confirm
+  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4 ; lanes whose mask bit is clear take the passthru value
+  ret <16 x i32> %5
+}
+
+define <16 x i32> @stack_fold_pslldi_maskz(<16 x i32> %a0, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_pslldi_maskz
+  ;CHECK:       vpslld $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i32> @llvm.x86.avx512.pslli.d.512(<16 x i32> %a0, i32 1)
+  %3 = bitcast i16 %mask to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
+  ret <16 x i32> %4
+}
+
+define <64 x i8> @stack_fold_pslldq(<64 x i8> %a, <64 x i8> %b) { ; byte shift written as a shuffle, expected to become vpslldq $1 with a folded reload; %b is not referenced in the body
+  ;CHECK-LABEL: stack_fold_pslldq
+  ;CHECK:       vpslldq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 79, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 95, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 111, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 127, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62> ; per 16-byte lane: byte 0 comes from the zero vector (indices 79/95/111/127), bytes 1-15 are that lane's bytes 0-14 of %a
+  ret <64 x i8> %2
+}
+
+define <8 x i64> @stack_fold_psllq(<8 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psllq
+  ;CHECK:       vpsllq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1, <16 x i32> undef, i16 -1)
+  %2 = call <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64> %a0, <2 x i64> %a1)
+  ret <8 x i64> %2
+}
+declare <8 x i64> @llvm.x86.avx512.psll.q.512(<8 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i64> @stack_fold_psllqi(<8 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_psllqi
+  ;CHECK:       vpsllq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64> %a0, i32 1)
+  ret <8 x i64> %2
+}
+declare <8 x i64> @llvm.x86.avx512.pslli.q.512(<8 x i64>, i32) nounwind readnone
+
+define <16 x i32> @stack_fold_psllvd(<16 x i32> %a0, <16 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psllvd
+  ;CHECK:       vpsllvd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
   ret <16 x i32> %2
 }
-declare <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16>, <32 x i16>, <16 x i32>, i16) nounwind readnone
+declare <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
 
-define <16 x i32> @stack_fold_pmaddwd_zmm_mask(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_pmaddwd_zmm_mask
-  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+define <16 x i32> @stack_fold_psllvd_mask(<16 x i32>* %passthru, <16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_psllvd_mask
+  ;CHECK:       vpsllvd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1, <16 x i32> undef, i16 -1)
+  %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
   %3 = bitcast i16 %mask to <16 x i1>
-  ; load needed to keep the operation from being scheduled about the asm block
   %4 = load <16 x i32>, <16 x i32>* %passthru
   %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
   ret <16 x i32> %5
 }
 
-define <16 x i32> @stack_fold_pmaddwd_zmm_maskz(<16 x i32>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_pmaddwd_zmm_maskz
-  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+define <16 x i32> @stack_fold_psllvd_maskz(<16 x i32> %a0, <16 x i32> %a1, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_psllvd_maskz
+  ;CHECK:       vpsllvd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i32> @llvm.x86.avx512.psllv.d.512(<16 x i32> %a0, <16 x i32> %a1)
+  %3 = bitcast i16 %mask to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
+  ret <16 x i32> %4
+}
+
+define <8 x i64> @stack_fold_psllvq(<8 x i64> %a0, <8 x i64> %a1) { ; expects vpsllvq to read one operand straight from its stack slot (64-byte folded reload)
+  ;CHECK-LABEL: stack_fold_psllvq
+  ;CHECK:       vpsllvq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; NOTE(review): the asm result type is <8 x i64> here but <2 x i64> in the neighbouring tests - confirm this is intentional
+  %2 = call <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64> %a0, <8 x i64> %a1) ; per-element variable left shift, unmasked
+  ret <8 x i64> %2
+}
+declare <8 x i64> @llvm.x86.avx512.psllv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
+
+define <32 x i16> @stack_fold_psllvw(<32 x i16> %a0, <32 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psllvw
+  ;CHECK:       vpsllvw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16> %a0, <32 x i16> %a1)
+  ret <32 x i16> %2
+}
+declare <32 x i16> @llvm.x86.avx512.psllv.w.512(<32 x i16>, <32 x i16>) nounwind readnone
+
+define <32 x i16> @stack_fold_psllw(<32 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psllw
+  ;CHECK:       vpsllw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16> %a0, <8 x i16> %a1)
+  ret <32 x i16> %2
+}
+declare <32 x i16> @llvm.x86.avx512.psll.w.512(<32 x i16>, <8 x i16>) nounwind readnone
+
+define <32 x i16> @stack_fold_psllwi(<32 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_psllwi
+  ;CHECK:       vpsllw $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16> %a0, i32 1)
+  ret <32 x i16> %2
+}
+declare <32 x i16> @llvm.x86.avx512.pslli.w.512(<32 x i16>, i32) nounwind readnone
+
+define <16 x i32> @stack_fold_psrad(<16 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psrad
+  ;CHECK:       vpsrad {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32> %a0, <4 x i32> %a1)
+  ret <16 x i32> %2
+}
+declare <16 x i32> @llvm.x86.avx512.psra.d.512(<16 x i32>, <4 x i32>) nounwind readnone
+
+define <16 x i32> @stack_fold_psradi(<16 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_psradi
+  ;CHECK:       vpsrad $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32> %a0, i32 1)
+  ret <16 x i32> %2
+}
+declare <16 x i32> @llvm.x86.avx512.psrai.d.512(<16 x i32>, i32) nounwind readnone
+
+define <8 x i64> @stack_fold_psraq(<8 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psraq
+  ;CHECK:       vpsraq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64> %a0, <2 x i64> %a1)
+  ret <8 x i64> %2
+}
+declare <8 x i64> @llvm.x86.avx512.psra.q.512(<8 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i64> @stack_fold_psraqi(<8 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_psraqi
+  ;CHECK:       vpsraq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64> %a0, i32 1)
+  ret <8 x i64> %2
+}
+declare <8 x i64> @llvm.x86.avx512.psrai.q.512(<8 x i64>, i32) nounwind readnone
+
+define <16 x i32> @stack_fold_psravd(<16 x i32> %a0, <16 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psravd
+  ;CHECK:       vpsravd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32> %a0, <16 x i32> %a1)
+  ret <16 x i32> %2
+}
+declare <16 x i32> @llvm.x86.avx512.psrav.d.512(<16 x i32>, <16 x i32>) nounwind readnone
+
+define <8 x i64> @stack_fold_psravq(<8 x i64> %a0, <8 x i64> %a1) { ; expects vpsravq to read one operand straight from its stack slot (64-byte folded reload)
+  ;CHECK-LABEL: stack_fold_psravq
+  ;CHECK:       vpsravq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; NOTE(review): the asm result type is <8 x i64> here but <2 x i64> in the neighbouring tests - confirm this is intentional
+  %2 = call <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64> %a0, <8 x i64> %a1) ; per-element variable arithmetic right shift, unmasked
+  ret <8 x i64> %2
+}
+declare <8 x i64> @llvm.x86.avx512.psrav.q.512(<8 x i64>, <8 x i64>) nounwind readnone
+
+define <32 x i16> @stack_fold_psravw(<32 x i16> %a0, <32 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psravw
+  ;CHECK:       vpsravw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16> %a0, <32 x i16> %a1)
+  ret <32 x i16> %2
+}
+declare <32 x i16> @llvm.x86.avx512.psrav.w.512(<32 x i16>, <32 x i16>) nounwind readnone
+
+define <32 x i16> @stack_fold_psraw(<32 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psraw
+  ;CHECK:       vpsraw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <16 x i32> @llvm.x86.avx512.mask.pmaddw.d.512(<32 x i16> %a0, <32 x i16> %a1, <16 x i32> undef, i16 -1)
-  %3 = bitcast i16 %mask to <16 x i1>
-  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
-  ret <16 x i32> %4
+  %2 = call <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16> %a0, <8 x i16> %a1)
+  ret <32 x i16> %2
 }
+declare <32 x i16> @llvm.x86.avx512.psra.w.512(<32 x i16>, <8 x i16>) nounwind readnone
 
-define <16 x i32> @stack_fold_permd(<16 x i32> %a0, <16 x i32> %a1) {
-  ;CHECK-LABEL: stack_fold_permd
-  ;CHECK:   vpermd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+define <32 x i16> @stack_fold_psrawi(<32 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_psrawi
+  ;CHECK:       vpsraw $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16> %a0, i32 1)
+  ret <32 x i16> %2
+}
+declare <32 x i16> @llvm.x86.avx512.psrai.w.512(<32 x i16>, i32) nounwind readnone
+
+define <16 x i32> @stack_fold_psrld(<16 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psrld
+  ;CHECK:       vpsrld {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32> %a1, <16 x i32> %a0, <16 x i32> undef, i16 -1)
+  %2 = call <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32> %a0, <4 x i32> %a1)
   ret <16 x i32> %2
 }
-declare <16 x i32> @llvm.x86.avx512.mask.permvar.si.512(<16 x i32>, <16 x i32>, <16 x i32>, i16) nounwind readonly
+declare <16 x i32> @llvm.x86.avx512.psrl.d.512(<16 x i32>, <4 x i32>) nounwind readnone
 
-define <8 x i64> @stack_fold_permq(<8 x i64> %a0) {
-  ;CHECK-LABEL: stack_fold_permq
-  ;CHECK:   vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+define <16 x i32> @stack_fold_psrldi(<16 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_psrldi
+  ;CHECK:       vpsrld $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
-  ; add forces execution domain
-  %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-  ret <8 x i64> %3
+  %2 = call <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32> %a0, i32 1)
+  ret <16 x i32> %2
 }
+declare <16 x i32> @llvm.x86.avx512.psrli.d.512(<16 x i32>, i32) nounwind readnone
 
-define <8 x i64> @stack_fold_permq_mask(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_permq_mask
-  ;CHECK:   vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+define <64 x i8> @stack_fold_psrldq(<64 x i8> %a, <64 x i8> %b) { ; byte shift written as a shuffle, expected to become vpsrldq $2 with a folded reload; %b is not referenced in the body
+  ;CHECK-LABEL: stack_fold_psrldq
+  ;CHECK:       vpsrldq $2, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
-  %3 = bitcast i8 %mask to <8 x i1>
-  ; load needed to keep the operation from being scheduled above the asm block
-  %4 = load <8 x i64>, <8 x i64>* %passthru
-  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
-  ; add forces execution domain
-  %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-  ret <8 x i64> %6
+  %2 = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 64, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 64, i32 64, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 64, i32 64, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 64, i32 64> ; per 16-byte lane: that lane's bytes 2-15 of %a followed by two zero bytes (index 64 selects from the zero vector)
+  ret <64 x i8> %2
 }
 
-define <8 x i64> @stack_fold_permq_maskz(<8 x i64>* %passthru, <8 x i64> %a0, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_permq_maskz
-  ;CHECK:   vpermq $235, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+define <8 x i64> @stack_fold_psrlq(<8 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlq
+  ;CHECK:       vpsrlq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64> %a0, <2 x i64> %a1)
+  ret <8 x i64> %2
+}
+declare <8 x i64> @llvm.x86.avx512.psrl.q.512(<8 x i64>, <2 x i64>) nounwind readnone
+
+define <8 x i64> @stack_fold_psrlqi(<8 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_psrlqi
+  ;CHECK:       vpsrlq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> <i32 3, i32 2, i32 2, i32 3, i32 7, i32 6, i32 6, i32 7>
-  %3 = bitcast i8 %mask to <8 x i1>
-  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
-  ret <8 x i64> %4
+  %2 = call <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64> %a0, i32 1)
+  ret <8 x i64> %2
 }
+declare <8 x i64> @llvm.x86.avx512.psrli.q.512(<8 x i64>, i32) nounwind readnone
 
-define <8 x i64> @stack_fold_permqvar(<8 x i64> %a0, <8 x i64> %a1) {
-  ;CHECK-LABEL: stack_fold_permqvar
-  ;CHECK:   vpermq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+define <16 x i32> @stack_fold_psrlvd(<16 x i32> %a0, <16 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlvd
+  ;CHECK:       vpsrlvd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0, <8 x i64> undef, i8 -1)
-  ; add forces execution domain
-  %3 = add <8 x i64> %2, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-  ret <8 x i64> %3
+  %2 = call <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32> %a0, <16 x i32> %a1)
+  ret <16 x i32> %2
 }
-declare <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64>, <8 x i64>, <8 x i64>, i8) nounwind readonly
+declare <16 x i32> @llvm.x86.avx512.psrlv.d.512(<16 x i32>, <16 x i32>) nounwind readnone
 
-define <8 x i64> @stack_fold_permqvar_mask(<8 x i64>* %passthru, <8 x i64> %a0, <8 x i64> %a1, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_permqvar_mask
-  ;CHECK:   vpermq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %a1, <8 x i64> %a0, <8 x i64> undef, i8 -1)
-  %3 = bitcast i8 %mask to <8 x i1>
-  ; load needed to keep the operation from being scheduled above the asm block
-  %4 = load <8 x i64>, <8 x i64>* %passthru
-  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
-  ; add forces execution domain
-  %6 = add <8 x i64> %5, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
-  ret <8 x i64> %6
+define <8 x i64> @stack_fold_psrlvq(<8 x i64> %a0, <8 x i64> %a1) { ; expects vpsrlvq to read one operand straight from its stack slot (64-byte folded reload)
+  ;CHECK-LABEL: stack_fold_psrlvq
+  ;CHECK:       vpsrlvq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <8 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"() ; NOTE(review): the asm result type is <8 x i64> here but <2 x i64> in the neighbouring tests - confirm this is intentional
+  %2 = call <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64> %a0, <8 x i64> %a1) ; per-element variable logical right shift, unmasked
+  ret <8 x i64> %2
 }
+declare <8 x i64> @llvm.x86.avx512.psrlv.q.512(<8 x i64>, <8 x i64>) nounwind readnone
 
-define <32 x i16> @stack_fold_permwvar(<32 x i16> %a0, <32 x i16> %a1) {
-  ;CHECK-LABEL: stack_fold_permwvar
-  ;CHECK:   vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+define <32 x i16> @stack_fold_psrlvw(<32 x i16> %a0, <32 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlvw
+  ;CHECK:       vpsrlvw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0, <32 x i16> undef, i32 -1)
+  %2 = call <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16> %a0, <32 x i16> %a1)
   ret <32 x i16> %2
 }
-declare <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) nounwind readonly
+declare <32 x i16> @llvm.x86.avx512.psrlv.w.512(<32 x i16>, <32 x i16>) nounwind readnone
 
-define <32 x i16> @stack_fold_permwvar_mask(<32 x i16>* %passthru, <32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_permwvar_mask
-  ;CHECK:   vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+define <32 x i16> @stack_fold_psrlw(<32 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlw
+  ;CHECK:       vpsrlw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0, <32 x i16> undef, i32 -1)
-  %3 = bitcast i32 %mask to <32 x i1>
-  ; load needed to keep the operation from being scheduled above the asm block
-  %4 = load <32 x i16>, <32 x i16>* %passthru
-  %5 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> %4
-  ret <32 x i16> %5
+  %2 = call <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16> %a0, <8 x i16> %a1)
+  ret <32 x i16> %2
 }
+declare <32 x i16> @llvm.x86.avx512.psrl.w.512(<32 x i16>, <8 x i16>) nounwind readnone
 
-define <32 x i16> @stack_fold_permwvar_maskz(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_permwvar_maskz
-  ;CHECK:   vpermw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %a1, <32 x i16> %a0, <32 x i16> undef, i32 -1)
-  %3 = bitcast i32 %mask to <32 x i1>
-  %4 = select <32 x i1> %3, <32 x i16> %2, <32 x i16> zeroinitializer
-  ret <32 x i16> %4
+define <32 x i16> @stack_fold_psrlwi(<32 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_psrlwi
+  ;CHECK:       vpsrlw $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16> %a0, i32 1)
+  ret <32 x i16> %2
 }
+declare <32 x i16> @llvm.x86.avx512.psrli.w.512(<32 x i16>, i32) nounwind readnone
 
-define <64 x i8> @stack_fold_permbvar(<64 x i8> %a0, <64 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_permbvar
-  ;CHECK:   vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+define <64 x i8> @stack_fold_psubb(<64 x i8> %a0, <64 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubb
+  ;CHECK:       vpsubb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0, <64 x i8> undef, i64 -1)
+  %2 = sub <64 x i8> %a0, %a1
   ret <64 x i8> %2
 }
-declare <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) nounwind readonly
 
-define <64 x i8> @stack_fold_permbvar_mask(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
-  ;CHECK-LABEL: stack_fold_permbvar_mask
-  ;CHECK:   vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+define <16 x i32> @stack_fold_psubd(<16 x i32> %a0, <16 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psubd
+  ;CHECK:       vpsubd {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0, <64 x i8> undef, i64 -1)
-  %3 = bitcast i64 %mask to <64 x i1>
-  ; load needed to keep the operation from being scheduled above the asm block
-  %4 = load <64 x i8>, <64 x i8>* %passthru
-  %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
-  ret <64 x i8> %5
+  %2 = sub <16 x i32> %a0, %a1
+  ret <16 x i32> %2
 }
 
-define <64 x i8> @stack_fold_permbvar_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
-  ;CHECK-LABEL: stack_fold_permbvar_maskz
-  ;CHECK:   vpermb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+define <8 x i64> @stack_fold_psubq(<8 x i64> %a0, <8 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psubq
+  ;CHECK:       vpsubq {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <64 x i8> @llvm.x86.avx512.mask.permvar.qi.512(<64 x i8> %a1, <64 x i8> %a0, <64 x i8> undef, i64 -1)
-  %3 = bitcast i64 %mask to <64 x i1>
-  %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
-  ret <64 x i8> %4
+  %2 = sub <8 x i64> %a0, %a1
+  ret <8 x i64> %2
 }
 
-define <8 x i64> @stack_fold_valignq(<8 x i64> %a, <8 x i64> %b) {
-  ;CHECK-LABEL: stack_fold_valignq
-  ;CHECK:   valignq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-  ret <8 x i64> %2
+define <64 x i8> @stack_fold_psubsb(<64 x i8> %a0, <64 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubsb
+  ;CHECK:       vpsubsb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <64 x i8> @llvm.x86.avx512.mask.psubs.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> undef, i64 -1)
+  ret <64 x i8> %2
 }
+declare <64 x i8> @llvm.x86.avx512.mask.psubs.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) nounwind readnone
 
-define <8 x i64> @stack_fold_valignq_mask(<8 x i64> %a, <8 x i64> %b, <8 x i64>* %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_valignq_mask
-  ;CHECK:   valignq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-  %3 = bitcast i8 %mask to <8 x i1>
-  %4 = load <8 x i64>, <8 x i64>* %passthru
-  %5 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> %4
-  ret <8 x i64> %5
+define <32 x i16> @stack_fold_psubsw(<32 x i16> %a0, <32 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psubsw
+  ;CHECK:       vpsubsw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> undef, i32 -1)
+  ret <32 x i16> %2
 }
+declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
-define <8 x i64> @stack_fold_valignq_maskz(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_valignq_maskz
-  ;CHECK:   valignq $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-  %3 = bitcast i8 %mask to <8 x i1>
-  %4 = select <8 x i1> %3, <8 x i64> %2, <8 x i64> zeroinitializer
-  ret <8 x i64> %4
+define <64 x i8> @stack_fold_psubusb(<64 x i8> %a0, <64 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubusb
+  ;CHECK:       vpsubusb {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <64 x i8> @llvm.x86.avx512.mask.psubus.b.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> undef, i64 -1)
+  ret <64 x i8> %2
 }
+declare <64 x i8> @llvm.x86.avx512.mask.psubus.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) nounwind readnone
 
-define <16 x i32> @stack_fold_valignd(<16 x i32> %a, <16 x i32> %b) {
-  ;CHECK-LABEL: stack_fold_valignd
-  ;CHECK:   valignd $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
-  ret <16 x i32> %2
+define <32 x i16> @stack_fold_psubusw(<32 x i16> %a0, <32 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psubusw
+  ;CHECK:       vpsubusw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16> %a0, <32 x i16> %a1, <32 x i16> undef, i32 -1)
+  ret <32 x i16> %2
 }
+declare <32 x i16> @llvm.x86.avx512.mask.psubus.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
-define <16 x i32> @stack_fold_valignd_mask(<16 x i32> %a, <16 x i32> %b, <16 x i32>* %passthru, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_valignd_mask
-  ;CHECK:   valignd $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
-  %3 = bitcast i16 %mask to <16 x i1>
-  %4 = load <16 x i32>, <16 x i32>* %passthru
-  %5 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> %4
-  ret <16 x i32> %5
+define <32 x i16> @stack_fold_psubw(<32 x i16> %a0, <32 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psubw
+  ;CHECK:       vpsubw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = sub <32 x i16> %a0, %a1
+  ret <32 x i16> %2
 }
 
-define <16 x i32> @stack_fold_valignd_maskz(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_valignd_maskz
-  ;CHECK:   valignd $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16>
-  %3 = bitcast i16 %mask to <16 x i1>
-  %4 = select <16 x i1> %3, <16 x i32> %2, <16 x i32> zeroinitializer
-  ret <16 x i32> %4
+define <16 x i32> @stack_fold_ternlogd(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2) {
+  ;CHECK-LABEL: stack_fold_ternlogd
+  ;CHECK:       vpternlogd $33, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32> %x0, <16 x i32> %x1, <16 x i32> %x2, i32 33, i16 -1)
+  ret <16 x i32> %res
 }
+declare <16 x i32> @llvm.x86.avx512.mask.pternlog.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i32, i16)
 
-define <64 x i8> @stack_fold_palignr(<64 x i8> %a0, <64 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_palignr
-  ;CHECK:       vpalignr $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112>
+define <8 x i64> @stack_fold_ternlogq(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2) {
+  ;CHECK-LABEL: stack_fold_ternlogq
+  ;CHECK:       vpternlogq $33, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i32 33, i8 -1)
+  ret <8 x i64> %res
+}
+
+declare <8 x i64> @llvm.x86.avx512.mask.pternlog.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i32, i8)
+
+define <64 x i8> @stack_fold_punpckhbw_zmm(<64 x i8> %a0, <64 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckhbw_zmm
+  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
   ret <64 x i8> %2
 }
 
-define <64 x i8> @stack_fold_palignr_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8>* %passthru, i64 %mask) {
-  ;CHECK-LABEL: stack_fold_palignr_mask
-  ;CHECK:       vpalignr $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112>
+define <64 x i8> @stack_fold_punpckhbw_mask_zmm(<64 x i8>* %passthru, <64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+  ;CHECK-LABEL: stack_fold_punpckhbw_mask_zmm
+  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
   %3 = bitcast i64 %mask to <64 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
   %4 = load <64 x i8>, <64 x i8>* %passthru
   %5 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> %4
   ret <64 x i8> %5
 }
 
-define <64 x i8> @stack_fold_palignr_maskz(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
-  ;CHECK-LABEL: stack_fold_palignr_maskz
-  ;CHECK:       vpalignr $1, {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <64 x i8> %a1, <64 x i8> %a0, <64 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 64, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 80, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 45, i32 46, i32 47, i32 96, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60, i32 61, i32 62, i32 63, i32 112>
+define <64 x i8> @stack_fold_punpckhbw_maskz_zmm(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) {
+  ;CHECK-LABEL: stack_fold_punpckhbw_maskz_zmm
+  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%zmm[0-9][0-9]*}}, {{%zmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 64-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> <i32 8, i32 72, i32 9, i32 73, i32 10, i32 74, i32 11, i32 75, i32 12, i32 76, i32 13, i32 77, i32 14, i32 78, i32 15, i32 79, i32 24, i32 88, i32 25, i32 89, i32 26, i32 90, i32 27, i32 91, i32 28, i32 92, i32 29, i32 93, i32 30, i32 94, i32 31, i32 95, i32 40, i32 104, i32 41, i32 105, i32 42, i32 106, i32 43, i32 107, i32 44, i32 108, i32 45, i32 109, i32 46, i32 110, i32 47, i32 111, i32 56, i32 120, i32 57, i32 121, i32 58, i32 122, i32 59, i32 123, i32 60, i32 124, i32 61, i32 125, i32 62, i32 126, i32 63, i32 127>
   %3 = bitcast i64 %mask to <64 x i1>
   %4 = select <64 x i1> %3, <64 x i8> %2, <64 x i8> zeroinitializer
   ret <64 x i8> %4
diff --git a/test/CodeGen/X86/stack-folding-int-avx512vl.ll b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
index 77afc49b25765b9bf749c2bf04aba1d1f0aa1eb1..7ce798f778a3a6802b6cbb734b99c1c08965731a 100644
--- a/test/CodeGen/X86/stack-folding-int-avx512vl.ll
+++ b/test/CodeGen/X86/stack-folding-int-avx512vl.ll
@@ -8,6 +8,263 @@ target triple = "x86_64-unknown-unknown"
 ; By including a nop call with sideeffects we can force a partial register spill of the
 ; relevant registers and check that the reload is correctly folded into the instruction.
 
+define <8 x i32> @stack_fold_valignd_ymm(<8 x i32> %a, <8 x i32> %b) {
+  ;CHECK-LABEL: stack_fold_valignd_ymm
+  ;CHECK:   valignd $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  ret <8 x i32> %2
+}
+
+define <8 x i32> @stack_fold_valignd_ymm_mask(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %passthru, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_valignd_ymm_mask
+  ;CHECK:   valignd $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = load <8 x i32>, <8 x i32>* %passthru
+  %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %4
+  ret <8 x i32> %5
+}
+
+define <8 x i32> @stack_fold_valignd_ymm_maskz(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_valignd_ymm_maskz
+  ;CHECK:   valignd $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
+  ret <8 x i32> %4
+}
+
+define <4 x i64> @stack_fold_valignq_ymm(<4 x i64> %a, <4 x i64> %b) {
+  ;CHECK-LABEL: stack_fold_valignq_ymm
+  ;CHECK:   valignq $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
+  ret <4 x i64> %2
+}
+
+define <16 x i8> @stack_fold_pavgb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pavgb
+  ;CHECK:       vpavgb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.pavg.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <32 x i8> @stack_fold_pavgb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pavgb_ymm
+  ;CHECK:       vpavgb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.pavg.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <8 x i16> @stack_fold_pavgw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pavgw
+  ;CHECK:       vpavgw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i16> @stack_fold_pavgw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pavgw_ymm
+  ;CHECK:       vpavgw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pavg.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_extracti32x4(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_extracti32x4
+  ;CHECK:       vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+  ; add forces execution domain
+  %1 = add <8 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  %2 = shufflevector <8 x i32> %1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <4 x i32> %2
+}
+
+define <2 x i64> @stack_fold_extracti64x2(<4 x i64> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_extracti64x2
+  ;CHECK:       vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
+  ; add forces execution domain
+  %1 = add <4 x i64> %a0, <i64 1, i64 1, i64 1, i64 1>
+  %2 = shufflevector <4 x i64> %1, <4 x i64> %a1, <2 x i32> <i32 2, i32 3>
+  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <2 x i64> %2
+}
+
+define <8 x i32> @stack_fold_inserti32x4(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_inserti32x4
+  ;CHECK:       vinserti128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ; add forces execution domain
+  %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+  ret <8 x i32> %3
+}
+
+define <4 x i64> @stack_fold_inserti64x2(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_inserti64x2
+  ;CHECK:       vinserti128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ; add forces execution domain
+  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
+  ret <4 x i64> %3
+}
+
+define <16 x i8> @stack_fold_pabsb(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pabsb
+  ;CHECK:       vpabsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8> %a0)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.ssse3.pabs.b.128(<16 x i8>) nounwind readnone
+
+define <32 x i8> @stack_fold_pabsb_ymm(<32 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pabsb_ymm
+  ;CHECK:       vpabsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8> %a0)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.pabs.b(<32 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pabsd(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pabsd
+  ;CHECK:       vpabsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32> %a0)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.ssse3.pabs.d.128(<4 x i32>) nounwind readnone
+
+define <8 x i32> @stack_fold_pabsd_ymm(<8 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pabsd_ymm
+  ;CHECK:       vpabsd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32> %a0)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.pabs.d(<8 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_pabsq(<2 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_pabsq
+  ;CHECK:       vpabsq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64> %a0, <2 x i64> undef, i8 -1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.avx512.mask.pabs.q.128(<2 x i64>, <2 x i64>, i8) nounwind readnone
+
+define <4 x i64> @stack_fold_pabsq_ymm(<4 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_pabsq_ymm
+  ;CHECK:       vpabsq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64> %a0, <4 x i64> undef, i8 -1)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx512.mask.pabs.q.256(<4 x i64>, <4 x i64>, i8) nounwind readnone
+
+define <8 x i16> @stack_fold_pabsw(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pabsw
+  ;CHECK:       vpabsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16> %a0)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.ssse3.pabs.w.128(<8 x i16>) nounwind readnone
+
+define <16 x i16> @stack_fold_pabsw_ymm(<16 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pabsw_ymm
+  ;CHECK:       vpabsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16> %a0)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pabs.w(<16 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_packssdw(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_packssdw
+  ;CHECK:       vpackssdw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <16 x i16> @stack_fold_packssdw_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_packssdw_ymm
+  ;CHECK:       vpackssdw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i8> @stack_fold_packsswb(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_packsswb
+  ;CHECK:       vpacksswb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <32 x i8> @stack_fold_packsswb_ymm(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_packsswb_ymm
+  ;CHECK:       vpacksswb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> %a0, <16 x i16> %a1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_packusdw(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_packusdw
+  ;CHECK:       vpackusdw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <16 x i16> @stack_fold_packusdw_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_packusdw_ymm
+  ;CHECK:       vpackusdw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <16 x i8> @stack_fold_packuswb(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_packuswb
+  ;CHECK:       vpackuswb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <32 x i8> @stack_fold_packuswb_ymm(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_packuswb_ymm
+  ;CHECK:       vpackuswb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> %a0, <16 x i16> %a1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
+
 define <16 x i8> @stack_fold_paddb(<16 x i8> %a0, <16 x i8> %a1) {
   ;CHECK-LABEL: stack_fold_paddb
   ;CHECK:       vpaddb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
@@ -188,6 +445,35 @@ define <16 x i16> @stack_fold_paddw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
   ret <16 x i16> %2
 }
 
+define <32 x i8> @stack_fold_palignr(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_palignr
+  ;CHECK:       vpalignr $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
+  ret <32 x i8> %2
+}
+
+define <32 x i8> @stack_fold_palignr_mask(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %passthru, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_palignr_mask
+  ;CHECK:       vpalignr $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
+  %3 = bitcast i32 %mask to <32 x i1>
+  %4 = load <32 x i8>, <32 x i8>* %passthru
+  %5 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %4
+  ret <32 x i8> %5
+}
+
+define <32 x i8> @stack_fold_palignr_maskz(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_palignr_maskz
+  ;CHECK:       vpalignr $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
+  %3 = bitcast i32 %mask to <32 x i1>
+  %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer
+  ret <32 x i8> %4
+}
+
 define i16 @stack_fold_pcmpeqb(<16 x i8> %a0, <16 x i8> %a1) {
   ;CHECK-LABEL: stack_fold_pcmpeqb
   ;CHECK:       vpcmpeqb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%k[0-7]}} {{.*#+}} 16-byte Folded Reload
@@ -226,513 +512,711 @@ define i8 @stack_fold_pcmpeqw(<8 x i16> %a0, <8 x i16> %a1) {
   ret i8 %3
 }
 
-define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_psubb
-  ;CHECK:       vpsubb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = sub <16 x i8> %a0, %a1
-  ret <16 x i8> %2
+define <32 x i8> @stack_fold_permbvar(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_permbvar
+  ;CHECK:   vpermb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %a1, <32 x i8> %a0, <32 x i8> undef, i32 -1)
+  ; add forces execution domain
+  %3 = add <32 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  ret <32 x i8> %3
 }
+declare <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) nounwind readonly
 
-define <32 x i8> @stack_fold_psubb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_psubb_ymm
-  ;CHECK:       vpsubb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_permd
+  ;CHECK:   vpermd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = sub <32 x i8> %a0, %a1
-  ret <32 x i8> %2
+  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0)
+  ret <8 x i32> %2
 }
+declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
 
-define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) {
-  ;CHECK-LABEL: stack_fold_psubd
-  ;CHECK:       vpsubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = sub <4 x i32> %a0, %a1
-  ret <4 x i32> %2
+define <16 x i8> @stack_fold_vpermi2b(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermi2b
+  ;CHECK:       vpermi2b {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
+  ret <16 x i8> %res
 }
+declare <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
 
-define <8 x i32> @stack_fold_psubd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
-  ;CHECK-LABEL: stack_fold_psubd_ymm
-  ;CHECK:       vpsubd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = sub <8 x i32> %a0, %a1
-  ret <8 x i32> %2
+define <32 x i8> @stack_fold_vpermi2b_ymm(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermi2b_ymm
+  ;CHECK:       vpermi2b {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
+  ret <32 x i8> %res
 }
+declare <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
 
-define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
-  ;CHECK-LABEL: stack_fold_psubq
-  ;CHECK:       vpsubq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = sub <2 x i64> %a0, %a1
-  ret <2 x i64> %2
+define <4 x i32> @stack_fold_vpermi2d(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermi2d
+  ;CHECK:       vpermi2d {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+  ret <4 x i32> %res
 }
+declare <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
 
-define <4 x i64> @stack_fold_psubq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
-  ;CHECK-LABEL: stack_fold_psubq_ymm
-  ;CHECK:       vpsubq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = sub <4 x i64> %a0, %a1
-  ret <4 x i64> %2
+define <8 x i32> @stack_fold_vpermi2d_ymm(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermi2d_ymm
+  ;CHECK:       vpermi2d {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
+  ret <8 x i32> %res
 }
+declare <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
 
-define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_psubsb
-  ;CHECK:       vpsubsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+define <2 x i64> @stack_fold_vpermi2q(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermi2q
+  ;CHECK:       vpermi2q {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <2 x i64> @llvm.x86.avx512.mask.vpermt2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx512.mask.vpermt2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <4 x i64> @stack_fold_vpermi2q_ymm(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermi2q_ymm
+  ;CHECK:       vpermi2q {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <4 x i64> @llvm.x86.avx512.mask.vpermt2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx512.mask.vpermt2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <8 x i16> @stack_fold_vpermi2w(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermi2w
+  ;CHECK:       vpermi2w {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <16 x i16> @stack_fold_vpermi2w_ymm(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermi2w_ymm
+  ;CHECK:       vpermi2w {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <4 x i64> @stack_fold_permq(<4 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_permq
+  ;CHECK:   vpermq $235, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
+  ; add forces execution domain
+  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
+  ret <4 x i64> %3
+}
+
+define <4 x i64> @stack_fold_permqvar(<4 x i64> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_permqvar
+  ;CHECK:   vpermq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %a1, <4 x i64> %a0, <4 x i64> undef, i8 -1)
+  ; add forces execution domain
+  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
+  ret <4 x i64> %3
+}
+declare <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) nounwind readonly
+
+define <16 x i8> @stack_fold_vpermt2b(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermt2b
+  ;CHECK:       vpermt2b {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
+  ret <16 x i8> %res
+}
+declare <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+
+define <32 x i8> @stack_fold_vpermt2b_ymm(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermt2b_ymm
+  ;CHECK:       vpermt2b {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+
+define <4 x i32> @stack_fold_vpermt2d(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermt2d
+  ;CHECK:       vpermt2d {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <4 x i32> @llvm.x86.avx512.mask.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
+  ret <4 x i32> %res
+}
+declare <4 x i32> @llvm.x86.avx512.mask.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+
+define <8 x i32> @stack_fold_vpermt2d_ymm(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermt2d_ymm
+  ;CHECK:       vpermt2d {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <8 x i32> @llvm.x86.avx512.mask.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
+  ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx512.mask.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+
+define <2 x i64> @stack_fold_vpermt2q(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermt2q
+  ;CHECK:       vpermt2q {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <2 x i64> @llvm.x86.avx512.mask.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.avx512.mask.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+
+define <4 x i64> @stack_fold_vpermt2q_ymm(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermt2q_ymm
+  ;CHECK:       vpermt2q {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <4 x i64> @llvm.x86.avx512.mask.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.avx512.mask.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+
+define <8 x i16> @stack_fold_vpermt2w(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermt2w
+  ;CHECK:       vpermt2w {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
+  ret <8 x i16> %res
+}
+declare <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+
+define <16 x i16> @stack_fold_vpermt2w_ymm(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) {
+  ;CHECK-LABEL: stack_fold_vpermt2w_ymm
+  ;CHECK:       vpermt2w {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %res = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+
+define <16 x i16> @stack_fold_permwvar(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_permwvar
+  ;CHECK:   vpermw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %a1, <16 x i16> %a0, <16 x i16> undef, i16 -1)
+  ; add forces execution domain
+  %3 = add <16 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  ret <16 x i16> %3
+}
+declare <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) nounwind readonly
+
+define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaddubsw
+  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1)
-  ret <16 x i8> %2
+  %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
+  ret <8 x i16> %2
 }
-declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone
 
-define <32 x i8> @stack_fold_psubsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_psubsb_ymm
-  ;CHECK:       vpsubsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+define <8 x i16> @stack_fold_pmaddubsw_mask(<8 x i16>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_pmaddubsw_mask
+  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1)
-  ret <32 x i8> %2
+  %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
+  %3 = bitcast i8 %mask to <8 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <8 x i16>, <8 x i16>* %passthru
+  %5 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %4
+  ret <8 x i16> %5
 }
-declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
 
-define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) {
-  ;CHECK-LABEL: stack_fold_psubsw
-  ;CHECK:       vpsubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+define <8 x i16> @stack_fold_pmaddubsw_maskz(<16 x i8> %a0, <16 x i8> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_pmaddubsw_maskz
+  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1)
-  ret <8 x i16> %2
+  %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
+  ret <8 x i16> %4
 }
-declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
 
-define <16 x i16> @stack_fold_psubsw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
-  ;CHECK-LABEL: stack_fold_psubsw_ymm
-  ;CHECK:       vpsubsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+define <16 x i16> @stack_fold_pmaddubsw_ymm(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaddubsw_ymm
+  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1)
+  %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
   ret <16 x i16> %2
 }
-declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
+declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
 
-define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_psubusb
-  ;CHECK:       vpsubusb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+define <16 x i16> @stack_fold_pmaddubsw_ymm_mask(<16 x i16>* %passthru, <32 x i8> %a0, <32 x i8> %a1, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_pmaddubsw_ymm_mask
+  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1)
-  ret <16 x i8> %2
+  %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
+  %3 = bitcast i16 %mask to <16 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <16 x i16>, <16 x i16>* %passthru
+  %5 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %4
+  ret <16 x i16> %5
 }
-declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
 
-define <32 x i8> @stack_fold_psubusb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_psubusb_ymm
-  ;CHECK:       vpsubusb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+define <16 x i16> @stack_fold_pmaddubsw_ymm_maskz(<32 x i8> %a0, <32 x i8> %a1, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_pmaddubsw_ymm_maskz
+  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1)
-  ret <32 x i8> %2
+  %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
+  %3 = bitcast i16 %mask to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
+  ret <16 x i16> %4
 }
-declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
 
-define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
-  ;CHECK-LABEL: stack_fold_psubusw
-  ;CHECK:       vpsubusw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaddwd
+  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1)
-  ret <8 x i16> %2
+  %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
+  ret <4 x i32> %2
 }
-declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
+declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
 
-define <16 x i16> @stack_fold_psubusw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
-  ;CHECK-LABEL: stack_fold_psubusw_ymm
-  ;CHECK:       vpsubusw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+define <4 x i32> @stack_fold_pmaddwd_mask(<4 x i32>* %passthru, <8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_pmaddwd_mask
+  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1)
-  ret <16 x i16> %2
+  %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %5 = load <4 x i32>, <4 x i32>* %passthru
+  %6 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> %5
+  ret <4 x i32> %6
 }
-declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
 
-define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
-  ;CHECK-LABEL: stack_fold_psubw
-  ;CHECK:       vpsubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+define <4 x i32> @stack_fold_pmaddwd_maskz(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_pmaddwd_maskz
+  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = sub <8 x i16> %a0, %a1
-  ret <8 x i16> %2
+  %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %5 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer
+  ret <4 x i32> %5
 }
 
-define <16 x i16> @stack_fold_psubw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
-  ;CHECK-LABEL: stack_fold_psubw_ymm
-  ;CHECK:       vpsubw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+define <8 x i32> @stack_fold_pmaddwd_ymm(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaddwd_ymm
+  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = sub <16 x i16> %a0, %a1
-  ret <16 x i16> %2
+  %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
+  ret <8 x i32> %2
 }
+declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
 
-define <8 x i16> @stack_fold_vpmovdw(<8 x i32> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovdw
-  ;CHECK:       vpmovdw %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
-  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <8 x i16> %1
+define <8 x i32> @stack_fold_pmaddwd_ymm_mask(<8 x i32>* %passthru, <16 x i16> %a0, <16 x i16> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_pmaddwd_ymm_mask
+  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
+  %3 = bitcast i8 %mask to <8 x i1>
+  ; load needed to keep the operation from being scheduled about the asm block
+  %4 = load <8 x i32>, <8 x i32>* %passthru
+  %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %4
+  ret <8 x i32> %5
 }
-declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)
 
-define <4 x i32> @stack_fold_vpmovqd(<4 x i64> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovqd
-  ;CHECK:       vpmovqd %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
-  %1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <4 x i32> %1
+define <8 x i32> @stack_fold_pmaddwd_ymm_maskz(<16 x i16> %a0, <16 x i16> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_pmaddwd_ymm_maskz
+  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
+  ret <8 x i32> %4
 }
-declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64>, <4 x i32>, i8)
 
-define <16 x i8> @stack_fold_vpmovwb(<16 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovwb
-  ;CHECK:       vpmovwb %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
-  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <16 x i8> %1
+define <16 x i8> @stack_fold_pmaxsb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxsb
+  ;CHECK:       vpmaxsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
 }
-declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16)
+declare <16 x i8> @llvm.x86.sse41.pmaxsb(<16 x i8>, <16 x i8>) nounwind readnone
 
-define <8 x i16> @stack_fold_vpmovsdw(<8 x i32> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovsdw
-  ;CHECK:       vpmovsdw %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
-  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <8 x i16> %1
+define <32 x i8> @stack_fold_pmaxsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxsb_ymm
+  ;CHECK:       vpmaxsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %2
 }
-declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32>, <8 x i16>, i8)
+declare <32 x i8> @llvm.x86.avx2.pmaxs.b(<32 x i8>, <32 x i8>) nounwind readnone
 
-define <4 x i32> @stack_fold_vpmovsqd(<4 x i64> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovsqd
-  ;CHECK:       vpmovsqd %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
-  %1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <4 x i32> %1
+define <4 x i32> @stack_fold_pmaxsd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxsd
+  ;CHECK:       vpmaxsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
 }
-declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64>, <4 x i32>, i8)
+declare <4 x i32> @llvm.x86.sse41.pmaxsd(<4 x i32>, <4 x i32>) nounwind readnone
 
-define <16 x i8> @stack_fold_vpmovswb(<16 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovswb
-  ;CHECK:       vpmovswb %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
-  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <16 x i8> %1
+define <8 x i32> @stack_fold_pmaxsd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxsd_ymm
+  ;CHECK:       vpmaxsd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %2
 }
-declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16)
+declare <8 x i32> @llvm.x86.avx2.pmaxs.d(<8 x i32>, <8 x i32>) nounwind readnone
 
-define <8 x i16> @stack_fold_vpmovusdw(<8 x i32> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovusdw
-  ;CHECK:       vpmovusdw %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
-  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <8 x i16> %1
+define <2 x i64> @stack_fold_pmaxsq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxsq
+  ;CHECK:       vpmaxsq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
+  ret <2 x i64> %2
 }
-declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32>, <8 x i16>, i8)
+declare <2 x i64> @llvm.x86.avx512.mask.pmaxs.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) nounwind readnone
 
-define <4 x i32> @stack_fold_vpmovusqd(<4 x i64> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovusqd
-  ;CHECK:       vpmovusqd %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
-  %1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <4 x i32> %1
+define <4 x i64> @stack_fold_pmaxsq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxsq_ymm
+  ;CHECK:       vpmaxsq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
+  ret <4 x i64> %2
 }
-declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64>, <4 x i32>, i8)
+declare <4 x i64> @llvm.x86.avx512.mask.pmaxs.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) nounwind readnone
 
-define <16 x i8> @stack_fold_vpmovuswb(<16 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_vpmovuswb
-  ;CHECK:       vpmovuswb %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
-  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1)
-  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  ret <16 x i8> %1
+define <8 x i16> @stack_fold_pmaxsw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxsw
+  ;CHECK:       vpmaxsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
 }
-declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16)
+declare <8 x i16> @llvm.x86.sse2.pmaxs.w(<8 x i16>, <8 x i16>) nounwind readnone
 
-define <4 x i32> @stack_fold_extracti32x4(<8 x i32> %a0, <8 x i32> %a1) {
-  ;CHECK-LABEL: stack_fold_extracti32x4
-  ;CHECK:       vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <8 x i32> %a0, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  %2 = shufflevector <8 x i32> %1, <8 x i32> %a1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
-  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+define <16 x i16> @stack_fold_pmaxsw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxsw_ymm
+  ;CHECK:       vpmaxsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.pmaxs.w(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_pmaxub(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxub
+  ;CHECK:       vpmaxub {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
+}
+declare <16 x i8> @llvm.x86.sse2.pmaxu.b(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <32 x i8> @stack_fold_pmaxub_ymm(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxub_ymm
+  ;CHECK:       vpmaxub {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %2
+}
+declare <32 x i8> @llvm.x86.avx2.pmaxu.b(<32 x i8>, <32 x i8>) nounwind readnone
+
+define <4 x i32> @stack_fold_pmaxud(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxud
+  ;CHECK:       vpmaxud {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32> %a0, <4 x i32> %a1)
   ret <4 x i32> %2
 }
+declare <4 x i32> @llvm.x86.sse41.pmaxud(<4 x i32>, <4 x i32>) nounwind readnone
 
-define <2 x i64> @stack_fold_extracti64x2(<4 x i64> %a0, <4 x i64> %a1) {
-  ;CHECK-LABEL: stack_fold_extracti64x2
-  ;CHECK:       vextracti128 $1, {{%ymm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp) {{.*#+}} 16-byte Folded Spill
-  ; add forces execution domain
-  %1 = add <4 x i64> %a0, <i64 1, i64 1, i64 1, i64 1>
-  %2 = shufflevector <4 x i64> %1, <4 x i64> %a1, <2 x i32> <i32 2, i32 3>
-  %3 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+define <8 x i32> @stack_fold_pmaxud_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxud_ymm
+  ;CHECK:       vpmaxud {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.pmaxu.d(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_pmaxuq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxuq
+  ;CHECK:       vpmaxuq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
   ret <2 x i64> %2
 }
+declare <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) nounwind readnone
 
-define <8 x i32> @stack_fold_inserti32x4(<4 x i32> %a0, <4 x i32> %a1) {
-  ;CHECK-LABEL: stack_fold_inserti32x4
-  ;CHECK:       vinserti128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+define <2 x i64> @stack_fold_pmaxuq_mask(<2 x i64>* %passthru, <2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_pmaxuq_mask
+  ;CHECK:       vpmaxuq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <4 x i32> %a0, <4 x i32> %a1, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  ; add forces execution domain
-  %3 = add <8 x i32> %2, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-  ret <8 x i32> %3
+  %2 = load <2 x i64>, <2 x i64>* %passthru
+  %3 = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %2, i8 %mask)
+  ret <2 x i64> %3
 }
 
-define <4 x i64> @stack_fold_inserti64x2(<2 x i64> %a0, <2 x i64> %a1) {
-  ;CHECK-LABEL: stack_fold_inserti64x2
-  ;CHECK:       vinserti128 $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+define <2 x i64> @stack_fold_pmaxuq_maskz(<2 x i64> %a0, <2 x i64> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_pmaxuq_maskz
+  ;CHECK:       vpmaxuq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <2 x i64> %a0, <2 x i64> %a1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ; add forces execution domain
-  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
-  ret <4 x i64> %3
+  %2 = call <2 x i64> @llvm.x86.avx512.mask.pmaxu.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> zeroinitializer, i8 %mask)
+  ret <2 x i64> %2
 }
 
-define <4 x i32> @stack_fold_vpermt2d(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermt2d
-  ;CHECK:       vpermt2d {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <4 x i32> @llvm.x86.avx512.mask.vpermi2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
-  ret <4 x i32> %res
+define <4 x i64> @stack_fold_pmaxuq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxuq_ymm
+  ;CHECK:       vpmaxuq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
+  ret <4 x i64> %2
 }
-declare <4 x i32> @llvm.x86.avx512.mask.vpermi2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
+declare <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) nounwind readnone
 
-define <4 x i32> @stack_fold_vpermi2d(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermi2d
-  ;CHECK:       vpermi2d {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1)
-  ret <4 x i32> %res
+define <4 x i64> @stack_fold_pmaxuq_ymm_mask(<4 x i64>* %passthru, <4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_pmaxuq_ymm_mask
+  ;CHECK:       vpmaxuq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = load <4 x i64>, <4 x i64>* %passthru
+  %3 = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %2, i8 %mask)
+  ret <4 x i64> %3
 }
-declare <4 x i32> @llvm.x86.avx512.mask.vpermt2var.d.128(<4 x i32>, <4 x i32>, <4 x i32>, i8)
 
-define <2 x i64> @stack_fold_vpermt2q(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermt2q
-  ;CHECK:       vpermt2q {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <2 x i64> @llvm.x86.avx512.mask.vpermi2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
-  ret <2 x i64> %res
+define <4 x i64> @stack_fold_pmaxuq_ymm_maskz(<4 x i64> %a0, <4 x i64> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_pmaxuq_ymm_maskz
+  ;CHECK:       vpmaxuq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx512.mask.pmaxu.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> zeroinitializer, i8 %mask)
+  ret <4 x i64> %2
 }
-declare <2 x i64> @llvm.x86.avx512.mask.vpermi2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
 
-define <2 x i64> @stack_fold_vpermi2q(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermi2q
-  ;CHECK:       vpermi2q {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <2 x i64> @llvm.x86.avx512.mask.vpermt2var.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1)
-  ret <2 x i64> %res
+define <8 x i16> @stack_fold_pmaxuw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxuw
+  ;CHECK:       vpmaxuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
 }
-declare <2 x i64> @llvm.x86.avx512.mask.vpermt2var.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8)
+declare <8 x i16> @llvm.x86.sse41.pmaxuw(<8 x i16>, <8 x i16>) nounwind readnone
 
-define <8 x i16> @stack_fold_vpermt2w(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermt2w
-  ;CHECK:       vpermt2w {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
-  ret <8 x i16> %res
+define <16 x i16> @stack_fold_pmaxuw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pmaxuw_ymm
+  ;CHECK:       vpmaxuw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
 }
-declare <8 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+declare <16 x i16> @llvm.x86.avx2.pmaxu.w(<16 x i16>, <16 x i16>) nounwind readnone
 
-define <8 x i16> @stack_fold_vpermi2w(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermi2w
-  ;CHECK:       vpermi2w {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
-  ret <8 x i16> %res
+define <16 x i8> @stack_fold_pminsb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pminsb
+  ;CHECK:       vpminsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
 }
-declare <8 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.128(<8 x i16>, <8 x i16>, <8 x i16>, i8)
+declare <16 x i8> @llvm.x86.sse41.pminsb(<16 x i8>, <16 x i8>) nounwind readnone
 
-define <16 x i8> @stack_fold_vpermt2b(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermt2b
-  ;CHECK:       vpermt2b {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
-  ret <16 x i8> %res
+define <32 x i8> @stack_fold_pminsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pminsb_ymm
+  ;CHECK:       vpminsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %2
 }
-declare <16 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+declare <32 x i8> @llvm.x86.avx2.pmins.b(<32 x i8>, <32 x i8>) nounwind readnone
 
-define <16 x i8> @stack_fold_vpermi2b(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermi2b
-  ;CHECK:       vpermi2b {{-?[0-9]*}}(%rsp), %xmm1, %xmm0 # 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8> %x0, <16 x i8> %x1, <16 x i8> %x2, i16 -1)
-  ret <16 x i8> %res
+define <4 x i32> @stack_fold_pminsd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pminsd
+  ;CHECK:       vpminsd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
 }
-declare <16 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.128(<16 x i8>, <16 x i8>, <16 x i8>, i16)
+declare <4 x i32> @llvm.x86.sse41.pminsd(<4 x i32>, <4 x i32>) nounwind readnone
 
-define <8 x i32> @stack_fold_vpermt2d_ymm(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermt2d_ymm
-  ;CHECK:       vpermt2d {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <8 x i32> @llvm.x86.avx512.mask.vpermi2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
-  ret <8 x i32> %res
+define <8 x i32> @stack_fold_pminsd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pminsd_ymm
+  ;CHECK:       vpminsd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %2
 }
-declare <8 x i32> @llvm.x86.avx512.mask.vpermi2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+declare <8 x i32> @llvm.x86.avx2.pmins.d(<8 x i32>, <8 x i32>) nounwind readnone
 
-define <8 x i32> @stack_fold_vpermi2d_ymm(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermi2d_ymm
-  ;CHECK:       vpermi2d {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32> %x0, <8 x i32> %x1, <8 x i32> %x2, i8 -1)
-  ret <8 x i32> %res
+define <2 x i64> @stack_fold_pminsq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_pminsq
+  ;CHECK:       vpminsq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
+  ret <2 x i64> %2
 }
-declare <8 x i32> @llvm.x86.avx512.mask.vpermt2var.d.256(<8 x i32>, <8 x i32>, <8 x i32>, i8)
+declare <2 x i64> @llvm.x86.avx512.mask.pmins.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) nounwind readnone
 
-define <4 x i64> @stack_fold_vpermt2q_ymm(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermt2q_ymm
-  ;CHECK:       vpermt2q {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <4 x i64> @llvm.x86.avx512.mask.vpermi2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
-  ret <4 x i64> %res
+define <4 x i64> @stack_fold_pminsq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_pminsq_ymm
+  ;CHECK:       vpminsq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
+  ret <4 x i64> %2
 }
-declare <4 x i64> @llvm.x86.avx512.mask.vpermi2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+declare <4 x i64> @llvm.x86.avx512.mask.pmins.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) nounwind readnone
 
-define <4 x i64> @stack_fold_vpermi2q_ymm(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermi2q_ymm
-  ;CHECK:       vpermi2q {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <4 x i64> @llvm.x86.avx512.mask.vpermt2var.q.256(<4 x i64> %x0, <4 x i64> %x1, <4 x i64> %x2, i8 -1)
-  ret <4 x i64> %res
+define <8 x i16> @stack_fold_pminsw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pminsw
+  ;CHECK:       vpminsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
 }
-declare <4 x i64> @llvm.x86.avx512.mask.vpermt2var.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8)
+declare <8 x i16> @llvm.x86.sse2.pmins.w(<8 x i16>, <8 x i16>) nounwind readnone
 
-define <16 x i16> @stack_fold_vpermt2w_ymm(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermt2w_ymm
-  ;CHECK:       vpermt2w {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
-  ret <16 x i16> %res
+define <16 x i16> @stack_fold_pminsw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pminsw_ymm
+  ;CHECK:       vpminsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
 }
-declare <16 x i16> @llvm.x86.avx512.mask.vpermi2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+declare <16 x i16> @llvm.x86.avx2.pmins.w(<16 x i16>, <16 x i16>) nounwind readnone
 
-define <16 x i16> @stack_fold_vpermi2w_ymm(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermi2w_ymm
-  ;CHECK:       vpermi2w {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16> %x0, <16 x i16> %x1, <16 x i16> %x2, i16 -1)
-  ret <16 x i16> %res
+define <16 x i8> @stack_fold_pminub(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pminub
+  ;CHECK:       vpminub {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
 }
-declare <16 x i16> @llvm.x86.avx512.mask.vpermt2var.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+declare <16 x i8> @llvm.x86.sse2.pminu.b(<16 x i8>, <16 x i8>) nounwind readnone
 
-define <32 x i8> @stack_fold_vpermt2b_ymm(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermt2b_ymm
-  ;CHECK:       vpermt2b {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
-  ret <32 x i8> %res
+define <32 x i8> @stack_fold_pminub_ymm(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_pminub_ymm
+  ;CHECK:       vpminub {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %2
 }
-declare <32 x i8> @llvm.x86.avx512.mask.vpermi2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+declare <32 x i8> @llvm.x86.avx2.pminu.b(<32 x i8>, <32 x i8>) nounwind readnone
 
-define <32 x i8> @stack_fold_vpermi2b_ymm(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2) {
-  ;CHECK-LABEL: stack_fold_vpermi2b_ymm
-  ;CHECK:       vpermi2b {{-?[0-9]*}}(%rsp), %ymm1, %ymm0 # 32-byte Folded Reload
-  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %res = call <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8> %x0, <32 x i8> %x1, <32 x i8> %x2, i32 -1)
-  ret <32 x i8> %res
+define <4 x i32> @stack_fold_pminud(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pminud
+  ;CHECK:       vpminud {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse41.pminud(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
 }
-declare <32 x i8> @llvm.x86.avx512.mask.vpermt2var.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32)
+declare <4 x i32> @llvm.x86.sse41.pminud(<4 x i32>, <4 x i32>) nounwind readnone
 
-define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovsxbd
-  ;CHECK:       vpmovsxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %3 = sext <4 x i8> %2 to <4 x i32>
-  ret <4 x i32> %3
+define <8 x i32> @stack_fold_pminud_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pminud_ymm
+  ;CHECK:       vpminud {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %2
 }
+declare <8 x i32> @llvm.x86.avx2.pminu.d(<8 x i32>, <8 x i32>) nounwind readnone
 
-define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovsxbq
-  ;CHECK:       vpmovsxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
-  %3 = sext <2 x i8> %2 to <2 x i64>
-  ret <2 x i64> %3
+define <2 x i64> @stack_fold_pminuq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_pminuq
+  ;CHECK:       vpminuq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> undef, i8 -1)
+  ret <2 x i64> %2
 }
+declare <2 x i64> @llvm.x86.avx512.mask.pminu.q.128(<2 x i64>, <2 x i64>, <2 x i64>, i8) nounwind readnone
 
-define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovsxbw
-  ;CHECK:       vpmovsxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-  %3 = sext <8 x i8> %2 to <8 x i16>
-  ret <8 x i16> %3
+define <4 x i64> @stack_fold_pminuq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_pminuq_ymm
+  ;CHECK:       vpminuq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> undef, i8 -1)
+  ret <4 x i64> %2
 }
+declare <4 x i64> @llvm.x86.avx512.mask.pminu.q.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) nounwind readnone
 
-define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovsxdq
-  ;CHECK:       vpmovsxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
-  %3 = sext <2 x i32> %2 to <2 x i64>
-  ret <2 x i64> %3
+define <8 x i16> @stack_fold_pminuw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pminuw
+  ;CHECK:       vpminuw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
 }
+declare <8 x i16> @llvm.x86.sse41.pminuw(<8 x i16>, <8 x i16>) nounwind readnone
 
-define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovsxwd
-  ;CHECK:       vpmovsxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %3 = sext <4 x i16> %2 to <4 x i32>
-  ret <4 x i32> %3
+define <16 x i16> @stack_fold_pminuw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_pminuw_ymm
+  ;CHECK:       vpminuw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
 }
+declare <16 x i16> @llvm.x86.avx2.pminu.w(<16 x i16>, <16 x i16>) nounwind readnone
 
-define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovsxwq
-  ;CHECK:       vpmovsxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
-  %3 = sext <2 x i16> %2 to <2 x i64>
-  ret <2 x i64> %3
+define <8 x i16> @stack_fold_vpmovdw(<8 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovdw
+  ;CHECK:       vpmovdw %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <8 x i16> %1
 }
+declare <8 x i16> @llvm.x86.avx512.mask.pmov.dw.256(<8 x i32>, <8 x i16>, i8)
 
-define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovzxbd
-  ;CHECK:       vpmovzxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 1, i32 19, i32 20, i32 21, i32 2, i32 22, i32 23, i32 24, i32 3, i32 25, i32 26, i32 27>
-  %3 = bitcast <16 x i8> %2 to <4 x i32>
-  ret <4 x i32> %3
+define <4 x i32> @stack_fold_vpmovqd(<4 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovqd
+  ;CHECK:       vpmovqd %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+  %1 = call <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <4 x i32> %1
 }
+declare <4 x i32> @llvm.x86.avx512.mask.pmov.qd.256(<4 x i64>, <4 x i32>, i8)
 
-define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovzxbq
-  ;CHECK:       vpmovzxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 1, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28>
-  %3 = bitcast <16 x i8> %2 to <2 x i64>
-  ret <2 x i64> %3
+define <16 x i8> @stack_fold_vpmovwb(<16 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovwb
+  ;CHECK:       vpmovwb %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <16 x i8> %1
 }
+declare <16 x i8> @llvm.x86.avx512.mask.pmov.wb.256(<16 x i16>, <16 x i8>, i16)
 
-define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovzxbw
-  ;CHECK:       vpmovzxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
-  %3 = bitcast <16 x i8> %2 to <8 x i16>
-  ret <8 x i16> %3
+define <8 x i16> @stack_fold_vpmovsdw(<8 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovsdw
+  ;CHECK:       vpmovsdw %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <8 x i16> %1
 }
+declare <8 x i16> @llvm.x86.avx512.mask.pmovs.dw.256(<8 x i32>, <8 x i16>, i8)
 
-define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovzxdq
-  ;CHECK:       vpmovzxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
-  %3 = bitcast <4 x i32> %2 to <2 x i64>
-  ret <2 x i64> %3
+define <4 x i32> @stack_fold_vpmovsqd(<4 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovsqd
+  ;CHECK:       vpmovsqd %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+  %1 = call <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <4 x i32> %1
 }
+declare <4 x i32> @llvm.x86.avx512.mask.pmovs.qd.256(<4 x i64>, <4 x i32>, i8)
 
-define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovzxwd
-  ;CHECK:       vpmovzxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
-  %3 = bitcast <8 x i16> %2 to <4 x i32>
-  ret <4 x i32> %3
+define <16 x i8> @stack_fold_vpmovswb(<16 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovswb
+  ;CHECK:       vpmovswb %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <16 x i8> %1
 }
+declare <16 x i8> @llvm.x86.avx512.mask.pmovs.wb.256(<16 x i16>, <16 x i8>, i16)
 
-define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_pmovzxwq
-  ;CHECK:       vpmovzxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+define <4 x i32> @stack_fold_pmovsxbd(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxbd
+  ;CHECK:       vpmovsxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 1, i32 11, i32 12, i32 13>
-  %3 = bitcast <8 x i16> %2 to <2 x i64>
-  ret <2 x i64> %3
+  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = sext <4 x i8> %2 to <4 x i32>
+  ret <4 x i32> %3
 }
 
 define <8 x i32> @stack_fold_pmovsxbd_ymm(<16 x i8> %a0) {
@@ -744,6 +1228,15 @@ define <8 x i32> @stack_fold_pmovsxbd_ymm(<16 x i8> %a0) {
   ret <8 x i32> %3
 }
 
+define <2 x i64> @stack_fold_pmovsxbq(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxbq
+  ;CHECK:       vpmovsxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <2 x i32> <i32 0, i32 1>
+  %3 = sext <2 x i8> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
 define <4 x i64> @stack_fold_pmovsxbq_ymm(<16 x i8> %a0) {
   ;CHECK-LABEL: stack_fold_pmovsxbq_ymm
   ;CHECK:       pmovsxbq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
@@ -753,6 +1246,15 @@ define <4 x i64> @stack_fold_pmovsxbq_ymm(<16 x i8> %a0) {
   ret <4 x i64> %3
 }
 
+define <8 x i16> @stack_fold_pmovsxbw(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxbw
+  ;CHECK:       vpmovsxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  %3 = sext <8 x i8> %2 to <8 x i16>
+  ret <8 x i16> %3
+}
+
 define <16 x i16> @stack_fold_pmovsxbw_ymm(<16 x i8> %a0) {
   ;CHECK-LABEL: stack_fold_pmovsxbw_ymm
   ;CHECK:       vpmovsxbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
@@ -761,6 +1263,15 @@ define <16 x i16> @stack_fold_pmovsxbw_ymm(<16 x i8> %a0) {
   ret <16 x i16> %2
 }
 
+define <2 x i64> @stack_fold_pmovsxdq(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxdq
+  ;CHECK:       vpmovsxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <4 x i32> %a0, <4 x i32> undef, <2 x i32> <i32 0, i32 1>
+  %3 = sext <2 x i32> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
 define <4 x i64> @stack_fold_pmovsxdq_ymm(<4 x i32> %a0) {
   ;CHECK-LABEL: stack_fold_pmovsxdq_ymm
   ;CHECK:       vpmovsxdq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
@@ -769,6 +1280,15 @@ define <4 x i64> @stack_fold_pmovsxdq_ymm(<4 x i32> %a0) {
   ret <4 x i64> %2
 }
 
+define <4 x i32> @stack_fold_pmovsxwd(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxwd
+  ;CHECK:       vpmovsxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %3 = sext <4 x i16> %2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
 define <8 x i32> @stack_fold_pmovsxwd_ymm(<8 x i16> %a0) {
   ;CHECK-LABEL: stack_fold_pmovsxwd_ymm
   ;CHECK:       vpmovsxwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
@@ -777,6 +1297,15 @@ define <8 x i32> @stack_fold_pmovsxwd_ymm(<8 x i16> %a0) {
   ret <8 x i32> %2
 }
 
+define <2 x i64> @stack_fold_pmovsxwq(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovsxwq
+  ;CHECK:       vpmovsxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <2 x i32> <i32 0, i32 1>
+  %3 = sext <2 x i16> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
 define <4 x i64> @stack_fold_pmovsxwq_ymm(<8 x i16> %a0) {
   ;CHECK-LABEL: stack_fold_pmovsxwq_ymm
   ;CHECK:       vpmovsxwq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
@@ -786,6 +1315,42 @@ define <4 x i64> @stack_fold_pmovsxwq_ymm(<8 x i16> %a0) {
   ret <4 x i64> %3
 }
 
+define <8 x i16> @stack_fold_vpmovusdw(<8 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovusdw
+  ;CHECK:       vpmovusdw %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+  %1 = call <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32> %a0, <8 x i16> undef, i8 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <8 x i16> %1
+}
+declare <8 x i16> @llvm.x86.avx512.mask.pmovus.dw.256(<8 x i32>, <8 x i16>, i8)
+
+define <4 x i32> @stack_fold_vpmovusqd(<4 x i64> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovusqd
+  ;CHECK:       vpmovusqd %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+  %1 = call <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64> %a0, <4 x i32> undef, i8 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <4 x i32> %1
+}
+declare <4 x i32> @llvm.x86.avx512.mask.pmovus.qd.256(<4 x i64>, <4 x i32>, i8)
+
+define <16 x i8> @stack_fold_vpmovuswb(<16 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_vpmovuswb
+  ;CHECK:       vpmovuswb %ymm0, {{-?[0-9]*}}(%rsp) # 16-byte Folded Spill
+  %1 = call <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16> %a0, <16 x i8> undef, i16 -1)
+  %2 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  ret <16 x i8> %1
+}
+declare <16 x i8> @llvm.x86.avx512.mask.pmovus.wb.256(<16 x i16>, <16 x i8>, i16)
+
+define <4 x i32> @stack_fold_pmovzxbd(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxbd
+  ;CHECK:       vpmovzxbd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 1, i32 19, i32 20, i32 21, i32 2, i32 22, i32 23, i32 24, i32 3, i32 25, i32 26, i32 27>
+  %3 = bitcast <16 x i8> %2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
 define <8 x i32> @stack_fold_pmovzxbd_ymm(<16 x i8> %a0) {
   ;CHECK-LABEL: stack_fold_pmovzxbd_ymm
   ;CHECK:       vpmovzxbd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
@@ -795,6 +1360,15 @@ define <8 x i32> @stack_fold_pmovzxbd_ymm(<16 x i8> %a0) {
   ret <8 x i32> %3
 }
 
+define <2 x i64> @stack_fold_pmovzxbq(<16 x i8> %a0) { ; expects the reload of the spilled input to be folded into vpmovzxbq
+  ;CHECK-LABEL: stack_fold_pmovzxbq
+  ;CHECK:       vpmovzxbq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 1, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29> ; bytes 0 and 1 of %a0, each followed by seven bytes taken from the zero vector (indices 16 and up)
+  %3 = bitcast <16 x i8> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
 define <4 x i64> @stack_fold_pmovzxbq_ymm(<16 x i8> %a0) {
   ;CHECK-LABEL: stack_fold_pmovzxbq_ymm
   ;CHECK:       vpmovzxbq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
@@ -804,6 +1378,15 @@ define <4 x i64> @stack_fold_pmovzxbq_ymm(<16 x i8> %a0) {
   ret <4 x i64> %3
 }
 
+define <8 x i16> @stack_fold_pmovzxbw(<16 x i8> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxbw
+  ;CHECK:       vpmovzxbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x i8> %a0, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
+  %3 = bitcast <16 x i8> %2 to <8 x i16>
+  ret <8 x i16> %3
+}
+
 define <16 x i16> @stack_fold_pmovzxbw_ymm(<16 x i8> %a0) {
   ;CHECK-LABEL: stack_fold_pmovzxbw_ymm
   ;CHECK:       vpmovzxbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
@@ -812,6 +1395,15 @@ define <16 x i16> @stack_fold_pmovzxbw_ymm(<16 x i8> %a0) {
   ret <16 x i16> %2
 }
 
+define <2 x i64> @stack_fold_pmovzxdq(<4 x i32> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxdq
+  ;CHECK:       vpmovzxdq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <4 x i32> %a0, <4 x i32> zeroinitializer, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
+  %3 = bitcast <4 x i32> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
 define <4 x i64> @stack_fold_pmovzxdq_ymm(<4 x i32> %a0) {
   ;CHECK-LABEL: stack_fold_pmovzxdq_ymm
   ;CHECK:       vpmovzxdq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
@@ -820,6 +1412,15 @@ define <4 x i64> @stack_fold_pmovzxdq_ymm(<4 x i32> %a0) {
   ret <4 x i64> %2
 }
 
+define <4 x i32> @stack_fold_pmovzxwd(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxwd
+  ;CHECK:       vpmovzxwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %3 = bitcast <8 x i16> %2 to <4 x i32>
+  ret <4 x i32> %3
+}
+
 define <8 x i32> @stack_fold_pmovzxwd_ymm(<8 x i16> %a0) {
   ;CHECK-LABEL: stack_fold_pmovzxwd_ymm
   ;CHECK:       vpmovzxwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
@@ -828,6 +1429,15 @@ define <8 x i32> @stack_fold_pmovzxwd_ymm(<8 x i16> %a0) {
   ret <8 x i32> %2
 }
 
+define <2 x i64> @stack_fold_pmovzxwq(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pmovzxwq
+  ;CHECK:       vpmovzxwq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i16> %a0, <8 x i16> zeroinitializer, <8 x i32> <i32 0, i32 8, i32 9, i32 10, i32 1, i32 11, i32 12, i32 13>
+  %3 = bitcast <8 x i16> %2 to <2 x i64>
+  ret <2 x i64> %3
+}
+
 define <4 x i64> @stack_fold_pmovzxwq_ymm(<8 x i16> %a0) {
   ;CHECK-LABEL: stack_fold_pmovzxwq_ymm
   ;CHECK:       vpmovzxwq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
@@ -861,65 +1471,82 @@ define <4 x i64> @stack_fold_pmovzxwq_mask_ymm(<4 x i64> %passthru, <8 x i16> %a
   ret <4 x i64> %6
 }
 
-define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_punpckhbw
-  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+define <2 x i64> @stack_fold_pmuldq(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmuldq
+  ;CHECK:       vpmuldq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-  ret <16 x i8> %2
+  %2 = call <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32> %a0, <4 x i32> %a1)
+  ret <2 x i64> %2
 }
+declare <2 x i64> @llvm.x86.sse41.pmuldq(<4 x i32>, <4 x i32>) nounwind readnone
 
-define <16 x i8> @stack_fold_punpckhbw_mask(<16 x i8>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_punpckhbw_mask
-  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+define <4 x i64> @stack_fold_pmuldq_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmuldq_ymm
+  ;CHECK:       vpmuldq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-  %3 = bitcast i16 %mask to <16 x i1>
-  ; load needed to keep the operation from being scheduled about the asm block
-  %4 = load <16 x i8>, <16 x i8>* %passthru
-  %5 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> %4
-  ret <16 x i8> %5
+  %2 = call <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32> %a0, <8 x i32> %a1)
+  ret <4 x i64> %2
 }
+declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
 
-define <16 x i8> @stack_fold_punpckhbw_maskz(<16 x i8> %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_punpckhbw_maskz
-  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+define <2 x i64> @stack_fold_pmuludq(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmuludq
+  ;CHECK:       vpmuludq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
-  %3 = bitcast i16 %mask to <16 x i1>
-  %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> zeroinitializer
-  ret <16 x i8> %4
+  %2 = call <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32> %a0, <4 x i32> %a1)
+  ret <2 x i64> %2
 }
+declare <2 x i64> @llvm.x86.sse2.pmulu.dq(<4 x i32>, <4 x i32>) nounwind readnone
 
-define <32 x i8> @stack_fold_punpckhbw_ymm(<32 x i8> %a0, <32 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_punpckhbw_ymm
-  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+define <4 x i64> @stack_fold_pmuludq_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_pmuludq_ymm
+  ;CHECK:       vpmuludq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
-  ret <32 x i8> %2
+  %2 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %a0, <8 x i32> %a1)
+  ret <4 x i64> %2
 }
+declare <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32>, <8 x i32>) nounwind readnone
 
-define <32 x i8> @stack_fold_punpckhbw_mask_ymm(<32 x i8>* %passthru, <32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_punpckhbw_mask_ymm
-  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
-  %3 = bitcast i32 %mask to <32 x i1>
-  ; load needed to keep the operation from being scheduled about the asm block
-  %4 = load <32 x i8>, <32 x i8>* %passthru
-  %5 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %4
-  ret <32 x i8> %5
+define <4 x i64> @stack_fold_pmuludq_ymm_mask(<4 x i64>* %passthru, <8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_pmuludq_ymm_mask
+  ;CHECK:       vpmuludq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %a0, <8 x i32> %a1)
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %5 = load <4 x i64>, <4 x i64>* %passthru
+  %6 = select <4 x i1> %4, <4 x i64> %2, <4 x i64> %5
+  ret <4 x i64> %6
 }
 
-define <32 x i8> @stack_fold_punpckhbw_maskz_ymm(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_punpckhbw_maskz_ymm
-  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
-  %3 = bitcast i32 %mask to <32 x i1>
-  %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer
-  ret <32 x i8> %4
+define <4 x i64> @stack_fold_pmuludq_ymm_maskz(<8 x i32> %a0, <8 x i32> %a1, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_pmuludq_ymm_maskz
+  ;CHECK:       vpmuludq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.pmulu.dq(<8 x i32> %a0, <8 x i32> %a1)
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %5 = select <4 x i1> %4, <4 x i64> %2, <4 x i64> zeroinitializer
+  ret <4 x i64> %5
+}
+
+define <2 x i64> @stack_fold_psadbw(<16 x i8> %a0, <16 x i8> %a1) { ; expects the reload of a spilled operand to be folded into vpsadbw
+  ;CHECK-LABEL: stack_fold_psadbw
+  ;CHECK:       vpsadbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8> %a0, <16 x i8> %a1) ; 128-bit psadbw intrinsic on both inputs
+  ret <2 x i64> %2
 }
+declare <2 x i64> @llvm.x86.sse2.psad.bw(<16 x i8>, <16 x i8>) nounwind readnone
+
+define <4 x i64> @stack_fold_psadbw_ymm(<32 x i8> %a0, <32 x i8> %a1) { ; expects the 32-byte reload of a spilled operand to be folded into vpsadbw
+  ;CHECK-LABEL: stack_fold_psadbw_ymm
+  ;CHECK:       vpsadbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8> %a0, <32 x i8> %a1) ; 256-bit psadbw intrinsic on both inputs
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx2.psad.bw(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <16 x i8> @stack_fold_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
   ;CHECK-LABEL: stack_fold_pshufb
@@ -1000,63 +1627,7 @@ define <4 x i32> @stack_fold_pshufd_maskz(<4 x i32> %a0, i8 %mask) {
   %3 = bitcast i8 %mask to <8 x i1>
   %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %5 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer
-  ret <4 x i32> %5
-}
-
-define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_pshufhw
-  ;CHECK:       vpshufhw $11, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
-  ret <8 x i16> %2
-}
-
-define <8 x i16> @stack_fold_pshufhw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_pshufhw_mask
-  ;CHECK:       vpshufhw $11, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
-  %3 = bitcast i8 %mask to <8 x i1>
-  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %passthru
-  ret <8 x i16> %4
-}
-
-define <8 x i16> @stack_fold_pshufhw_maskz(<8 x i16> %a0, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_pshufhw_maskz
-  ;CHECK:       vpshufhw $11, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
-  %3 = bitcast i8 %mask to <8 x i1>
-  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
-  ret <8 x i16> %4
-}
-
-define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_pshuflw
-  ;CHECK:       vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
-  ret <8 x i16> %2
-}
-
-define <8 x i16> @stack_fold_pshuflw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_pshuflw_mask
-  ;CHECK:       vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
-  %3 = bitcast i8 %mask to <8 x i1>
-  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %passthru
-  ret <8 x i16> %4
-}
-
-define <8 x i16> @stack_fold_pshuflw_maskz(<8 x i16> %a0, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_pshuflw_maskz
-  ;CHECK:       vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
-  %3 = bitcast i8 %mask to <8 x i1>
-  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
-  ret <8 x i16> %4
+  ret <4 x i32> %5
 }
 
 define <8 x i32> @stack_fold_pshufd_ymm(<8 x i32> %a0) {
@@ -1087,16 +1658,44 @@ define <8 x i32> @stack_fold_pshufd_ymm_maskz(<8 x i32> %a0, i8 %mask) {
   ret <8 x i32> %4
 }
 
-define <16 x i16> @stack_fold_vpshufhw_ymm(<16 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_vpshufhw_ymm
+define <8 x i16> @stack_fold_pshufhw(<8 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pshufhw
+  ;CHECK:       vpshufhw $11, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @stack_fold_pshufhw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_pshufhw_mask
+  ;CHECK:       vpshufhw $11, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %passthru
+  ret <8 x i16> %4
+}
+
+define <8 x i16> @stack_fold_pshufhw_maskz(<8 x i16> %a0, i8 %mask) {
+  ;CHECK-LABEL: stack_fold_pshufhw_maskz
+  ;CHECK:       vpshufhw $11, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 4, i32 4>
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
+  ret <8 x i16> %4
+}
+
+define <16 x i16> @stack_fold_pshufhw_ymm(<16 x i16> %a0) {
+  ;CHECK-LABEL: stack_fold_pshufhw_ymm
   ;CHECK:       vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
   ret <16 x i16> %2
 }
 
-define <16 x i16> @stack_fold_vpshufhw_ymm_mask(<16 x i16> %passthru, <16 x i16> %a0, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_vpshufhw_ymm_mask
+define <16 x i16> @stack_fold_pshufhw_ymm_mask(<16 x i16> %passthru, <16 x i16> %a0, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_pshufhw_ymm_mask
   ;CHECK:       vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
@@ -1105,8 +1704,8 @@ define <16 x i16> @stack_fold_vpshufhw_ymm_mask(<16 x i16> %passthru, <16 x i16>
   ret <16 x i16> %4
 }
 
-define <16 x i16> @stack_fold_vpshufhw_ymm_maskz(<16 x i16> %a0, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_vpshufhw_ymm_maskz
+define <16 x i16> @stack_fold_pshufhw_ymm_maskz(<16 x i16> %a0, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_pshufhw_ymm_maskz
   ;CHECK:       vpshufhw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 7, i32 6, i32 5, i32 4, i32 8, i32 9, i32 10, i32 11, i32 15, i32 14, i32 13, i32 12>
@@ -1115,16 +1714,44 @@ define <16 x i16> @stack_fold_vpshufhw_ymm_maskz(<16 x i16> %a0, i16 %mask) {
   ret <16 x i16> %4
 }
 
-define <16 x i16> @stack_fold_vpshuflw_ymm(<16 x i16> %a0) {
-  ;CHECK-LABEL: stack_fold_vpshuflw_ymm
+define <8 x i16> @stack_fold_pshuflw(<8 x i16> %a0) { ; Unmasked 128-bit case: the shuffle reverses elements 0-3; expects vpshuflw $27 with a 16-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_pshuflw
+  ;CHECK:       vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x i16> %2
+}
+
+define <8 x i16> @stack_fold_pshuflw_mask(<8 x i16> %passthru, <8 x i16> %a0, i8 %mask) { ; Merge-masked 128-bit case: lanes whose %mask bit is clear take %passthru; expects vpshuflw $27 with a {%k} operand and a 16-byte folded reload.
+  ;CHECK-LABEL: stack_fold_pshuflw_mask
+  ;CHECK:       vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %passthru
+  ret <8 x i16> %4
+}
+
+define <8 x i16> @stack_fold_pshuflw_maskz(<8 x i16> %a0, i8 %mask) { ; Zero-masked 128-bit case: lanes whose %mask bit is clear become zero; expects vpshuflw $27 with {%k} {z} and a 16-byte folded reload.
+  ;CHECK-LABEL: stack_fold_pshuflw_maskz
+  ;CHECK:       vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <8 x i16> %a0, <8 x i16> undef, <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7>
+  %3 = bitcast i8 %mask to <8 x i1>
+  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
+  ret <8 x i16> %4
+}
+
+define <16 x i16> @stack_fold_pshuflw_ymm(<16 x i16> %a0) { ; Unmasked 256-bit case: the shuffle reverses elements 0-3 and 8-11; expects vpshuflw $27 with a 32-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_pshuflw_ymm
   ;CHECK:       vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i16> %2
 }
 
-define <16 x i16> @stack_fold_vpshuflw_ymm_mask(<16 x i16> %passthru, <16 x i16> %a0, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_vpshuflw_ymm_mask
+define <16 x i16> @stack_fold_pshuflw_ymm_mask(<16 x i16> %passthru, <16 x i16> %a0, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_pshuflw_ymm_mask
   ;CHECK:       vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
@@ -1133,8 +1760,8 @@ define <16 x i16> @stack_fold_vpshuflw_ymm_mask(<16 x i16> %passthru, <16 x i16>
   ret <16 x i16> %4
 }
 
-define <16 x i16> @stack_fold_vpshuflw_ymm_maskz(<16 x i16> %a0, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_vpshuflw_ymm_maskz
+define <16 x i16> @stack_fold_pshuflw_ymm_maskz(<16 x i16> %a0, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_pshuflw_ymm_maskz
   ;CHECK:       vpshuflw $27, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
   %2 = shufflevector <16 x i16> %a0, <16 x i16> undef, <16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
@@ -1143,261 +1770,553 @@ define <16 x i16> @stack_fold_vpshuflw_ymm_maskz(<16 x i16> %a0, i16 %mask) {
   ret <16 x i16> %4
 }
 
-define <8 x i16> @stack_fold_pmaddubsw(<16 x i8> %a0, <16 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_pmaddubsw
-  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+define <4 x i32> @stack_fold_pslld(<4 x i32> %a0, <4 x i32> %a1) { ; Calls @llvm.x86.sse2.psll.d after a nop asm that clobbers xmm2-xmm31; expects vpslld (xmm) with a 16-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_pslld
+  ;CHECK:       vpslld {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
+  %2 = call <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse2.psll.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i32> @stack_fold_pslld_ymm(<8 x i32> %a0, <4 x i32> %a1) { ; Calls @llvm.x86.avx2.psll.d after the clobbering nop asm; expects vpslld (ymm) with a 16-byte folded reload (presumably %a1, the only 128-bit argument).
+  ;CHECK-LABEL: stack_fold_pslld_ymm
+  ;CHECK:       vpslld {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32> %a0, <4 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.psll.d(<8 x i32>, <4 x i32>) nounwind readnone
+
+define <16 x i8> @stack_fold_pslldq(<16 x i8> %a) { ; Shuffle with zeroinitializer puts %a[0..3] in bytes 12-15 and zeroes the rest; expects vpslldq $12 with a 16-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_pslldq
+  ;CHECK:       vpslldq $12, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 16, i32 17, i32 18, i32 19>
+  ret <16 x i8> %2
+}
+
+define <32 x i8> @stack_fold_pslldq_ymm(<32 x i8> %a) { ; Shuffle with zeroinitializer leaves only bytes 15 and 31 non-zero (%a[0] and %a[16]); expects vpslldq $15 with a 32-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_pslldq_ymm
+  ;CHECK:       vpslldq $15, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 32, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 48>
+  ret <32 x i8> %2
+}
+
+define <2 x i64> @stack_fold_psllq(<2 x i64> %a0, <2 x i64> %a1) { ; Calls @llvm.x86.sse2.psll.q after a nop asm that clobbers xmm2-xmm31; expects vpsllq (xmm) with a 16-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psllq
+  ;CHECK:       vpsllq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse2.psll.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @stack_fold_psllq_ymm(<4 x i64> %a0, <2 x i64> %a1) { ; Calls @llvm.x86.avx2.psll.q after the clobbering nop asm; expects vpsllq (ymm) with a 16-byte folded reload (presumably %a1, the only 128-bit argument).
+  ;CHECK-LABEL: stack_fold_psllq_ymm
+  ;CHECK:       vpsllq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64> %a0, <2 x i64> %a1)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx2.psll.q(<4 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i32> @stack_fold_psllvd(<4 x i32> %a0, <4 x i32> %a1) { ; Calls @llvm.x86.avx2.psllv.d after a nop asm that clobbers xmm2-xmm31; expects vpsllvd (xmm) with a 16-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psllvd
+  ;CHECK:       vpsllvd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.avx2.psllv.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i32> @stack_fold_psllvd_ymm(<8 x i32> %a0, <8 x i32> %a1) { ; Calls @llvm.x86.avx2.psllv.d.256 after a nop asm that clobbers xmm2-xmm31; expects vpsllvd (ymm) with a 32-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psllvd_ymm
+  ;CHECK:       vpsllvd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.psllv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_psllvq(<2 x i64> %a0, <2 x i64> %a1) { ; Calls @llvm.x86.avx2.psllv.q after a nop asm that clobbers xmm2-xmm31; expects vpsllvq (xmm) with a 16-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psllvq
+  ;CHECK:       vpsllvq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.avx2.psllv.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @stack_fold_psllvq_ymm(<4 x i64> %a0, <4 x i64> %a1) { ; Calls @llvm.x86.avx2.psllv.q.256 after a nop asm that clobbers xmm2-xmm31; expects vpsllvq (ymm) with a 32-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psllvq_ymm
+  ;CHECK:       vpsllvq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64> %a0, <4 x i64> %a1)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx2.psllv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <8 x i16> @stack_fold_psllvw(<8 x i16> %a0, <8 x i16> %a1) { ; Calls @llvm.x86.avx512.psllv.w.128 after a nop asm that clobbers xmm2-xmm31; expects vpsllvw (xmm) with a 16-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psllvw
+  ;CHECK:       vpsllvw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.avx512.psllv.w.128(<8 x i16> %a0, <8 x i16> %a1)
   ret <8 x i16> %2
 }
-declare <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.avx512.psllv.w.128(<8 x i16>, <8 x i16>) nounwind readnone
 
-define <8 x i16> @stack_fold_pmaddubsw_mask(<8 x i16>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_pmaddubsw_mask
-  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+define <16 x i16> @stack_fold_psllvw_ymm(<16 x i16> %a0, <16 x i16> %a1) { ; Calls @llvm.x86.avx512.psllv.w.256 after a nop asm that clobbers xmm2-xmm31; expects vpsllvw (ymm) with a 32-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psllvw_ymm
+  ;CHECK:       vpsllvw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
-  %3 = bitcast i8 %mask to <8 x i1>
-  ; load needed to keep the operation from being scheduled about the asm block
-  %4 = load <8 x i16>, <8 x i16>* %passthru
-  %5 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> %4
-  ret <8 x i16> %5
+  %2 = call <16 x i16> @llvm.x86.avx512.psllv.w.256(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
 }
+declare <16 x i16> @llvm.x86.avx512.psllv.w.256(<16 x i16>, <16 x i16>) nounwind readnone
 
-define <8 x i16> @stack_fold_pmaddubsw_maskz(<16 x i8> %a0, <16 x i8> %a1, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_pmaddubsw_maskz
-  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+define <8 x i16> @stack_fold_psllw(<8 x i16> %a0, <8 x i16> %a1) { ; Calls @llvm.x86.sse2.psll.w after a nop asm that clobbers xmm2-xmm31; expects vpsllw (xmm) with a 16-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psllw
+  ;CHECK:       vpsllw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %a0, <16 x i8> %a1)
-  %3 = bitcast i8 %mask to <8 x i1>
-  %4 = select <8 x i1> %3, <8 x i16> %2, <8 x i16> zeroinitializer
-  ret <8 x i16> %4
+  %2 = call <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
 }
+declare <8 x i16> @llvm.x86.sse2.psll.w(<8 x i16>, <8 x i16>) nounwind readnone
 
-define <16 x i16> @stack_fold_pmaddubsw_ymm(<32 x i8> %a0, <32 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_pmaddubsw_ymm
-  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+define <16 x i16> @stack_fold_psllw_ymm(<16 x i16> %a0, <8 x i16> %a1) { ; Calls @llvm.x86.avx2.psll.w after the clobbering nop asm; expects vpsllw (ymm) with a 16-byte folded reload (presumably %a1, the only 128-bit argument).
+  ;CHECK-LABEL: stack_fold_psllw_ymm
+  ;CHECK:       vpsllw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
+  %2 = call <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16> %a0, <8 x i16> %a1)
   ret <16 x i16> %2
 }
-declare <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8>, <32 x i8>) nounwind readnone
+declare <16 x i16> @llvm.x86.avx2.psll.w(<16 x i16>, <8 x i16>) nounwind readnone
 
-define <16 x i16> @stack_fold_pmaddubsw_ymm_mask(<16 x i16>* %passthru, <32 x i8> %a0, <32 x i8> %a1, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_pmaddubsw_ymm_mask
-  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+define <4 x i32> @stack_fold_psrad(<4 x i32> %a0, <4 x i32> %a1) { ; Calls @llvm.x86.sse2.psra.d after a nop asm that clobbers xmm2-xmm31; expects vpsrad (xmm) with a 16-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psrad
+  ;CHECK:       vpsrad {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
-  %3 = bitcast i16 %mask to <16 x i1>
-  ; load needed to keep the operation from being scheduled about the asm block
-  %4 = load <16 x i16>, <16 x i16>* %passthru
-  %5 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> %4
-  ret <16 x i16> %5
+  %2 = call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
 }
+declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>) nounwind readnone
 
-define <16 x i16> @stack_fold_pmaddubsw_ymm_maskz(<32 x i8> %a0, <32 x i8> %a1, i16 %mask) {
-  ;CHECK-LABEL: stack_fold_pmaddubsw_ymm_maskz
-  ;CHECK:       vpmaddubsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+define <8 x i32> @stack_fold_psrad_ymm(<8 x i32> %a0, <4 x i32> %a1) { ; Calls @llvm.x86.avx2.psra.d after the clobbering nop asm; expects vpsrad (ymm) with a 16-byte folded reload (presumably %a1, the only 128-bit argument).
+  ;CHECK-LABEL: stack_fold_psrad_ymm
+  ;CHECK:       vpsrad {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <16 x i16> @llvm.x86.avx2.pmadd.ub.sw(<32 x i8> %a0, <32 x i8> %a1)
-  %3 = bitcast i16 %mask to <16 x i1>
-  %4 = select <16 x i1> %3, <16 x i16> %2, <16 x i16> zeroinitializer
-  ret <16 x i16> %4
+  %2 = call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %a0, <4 x i32> %a1)
+  ret <8 x i32> %2
 }
+declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>) nounwind readnone
 
-define <4 x i32> @stack_fold_pmaddwd(<8 x i16> %a0, <8 x i16> %a1) {
-  ;CHECK-LABEL: stack_fold_pmaddwd
-  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+define <2 x i64> @stack_fold_psraq(<2 x i64> %a0, <2 x i64> %a1) { ; Calls @llvm.x86.avx512.psra.q.128 after a nop asm that clobbers xmm2-xmm31; expects vpsraq (xmm) with a 16-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psraq
+  ;CHECK:       vpsraq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @stack_fold_psraq_ymm(<4 x i64> %a0, <2 x i64> %a1) { ; Calls @llvm.x86.avx512.psra.q.256 after the clobbering nop asm; expects vpsraq (ymm) with a 16-byte folded reload. NOTE(review): the asm result type here is <4 x i64> while the neighbouring tests use <2 x i64> - confirm this is intentional.
+  ;CHECK-LABEL: stack_fold_psraq_ymm
+  ;CHECK:       vpsraq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <4 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i32> @stack_fold_psravd(<4 x i32> %a0, <4 x i32> %a1) { ; Calls @llvm.x86.avx2.psrav.d after a nop asm that clobbers xmm2-xmm31; expects vpsravd (xmm) with a 16-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psravd
+  ;CHECK:       vpsravd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i32> @stack_fold_psravd_ymm(<8 x i32> %a0, <8 x i32> %a1) { ; Calls @llvm.x86.avx2.psrav.d.256 after a nop asm that clobbers xmm2-xmm31; expects vpsravd (ymm) with a 32-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psravd_ymm
+  ;CHECK:       vpsravd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_psravq(<2 x i64> %a0, <2 x i64> %a1) { ; Calls @llvm.x86.avx512.psrav.q.128 after a nop asm that clobbers xmm2-xmm31; expects vpsravq (xmm) with a 16-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psravq
+  ;CHECK:       vpsravq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @stack_fold_psravq_ymm(<4 x i64> %a0, <4 x i64> %a1) { ; Calls @llvm.x86.avx512.psrav.q.256 after a nop asm that clobbers xmm2-xmm31; expects vpsravq (ymm) with a 32-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psravq_ymm
+  ;CHECK:       vpsravq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <8 x i16> @stack_fold_psravw(<8 x i16> %a0, <8 x i16> %a1) { ; Calls @llvm.x86.avx512.psrav.w.128 after a nop asm that clobbers xmm2-xmm31; expects vpsravw (xmm) with a 16-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psravw
+  ;CHECK:       vpsravw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.avx512.psrav.w.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.avx512.psrav.w.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i16> @stack_fold_psravw_ymm(<16 x i16> %a0, <16 x i16> %a1) { ; Calls @llvm.x86.avx512.psrav.w.256 after a nop asm that clobbers xmm2-xmm31; expects vpsravw (ymm) with a 32-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psravw_ymm
+  ;CHECK:       vpsravw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx512.psrav.w.256(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx512.psrav.w.256(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_psraw(<8 x i16> %a0, <8 x i16> %a1) { ; Calls @llvm.x86.sse2.psra.w after a nop asm that clobbers xmm2-xmm31; expects vpsraw (xmm) with a 16-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psraw
+  ;CHECK:       vpsraw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i16> @stack_fold_psraw_ymm(<16 x i16> %a0, <8 x i16> %a1) { ; Calls @llvm.x86.avx2.psra.w after the clobbering nop asm; expects vpsraw (ymm) with a 16-byte folded reload (presumably %a1, the only 128-bit argument).
+  ;CHECK-LABEL: stack_fold_psraw_ymm
+  ;CHECK:       vpsraw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %a0, <8 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>) nounwind readnone
+
+define <4 x i32> @stack_fold_psrld(<4 x i32> %a0, <4 x i32> %a1) { ; Calls @llvm.x86.sse2.psrl.d after a nop asm that clobbers xmm2-xmm31; expects vpsrld (xmm) with a 16-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psrld
+  ;CHECK:       vpsrld {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sse2.psrl.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i32> @stack_fold_psrld_ymm(<8 x i32> %a0, <4 x i32> %a1) { ; Calls @llvm.x86.avx2.psrl.d after the clobbering nop asm; expects vpsrld (ymm) with a 16-byte folded reload. NOTE(review): the clobber list here also names xmm1, unlike the other two-operand tests nearby - confirm this is intentional.
+  ;CHECK-LABEL: stack_fold_psrld_ymm
+  ;CHECK:       vpsrld {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32> %a0, <4 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.psrl.d(<8 x i32>, <4 x i32>) nounwind readnone
+
+define <16 x i8> @stack_fold_psrldq(<16 x i8> %a) { ; Shuffle with zeroinitializer puts %a[12..15] in bytes 0-3 and zeroes the rest; expects vpsrldq $12 with a 16-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psrldq
+  ;CHECK:       vpsrldq $12, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x i8> zeroinitializer, <16 x i8> %a, <16 x i32> <i32 28, i32 29, i32 30, i32 31, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <16 x i8> %2
+}
+
+define <32 x i8> @stack_fold_psrldq_ymm(<32 x i8> %a) { ; Shuffle with zeroinitializer leaves only bytes 0 and 16 non-zero (%a[15] and %a[31]); expects vpsrldq $15 with a 32-byte folded reload from the stack.
+  ;CHECK-LABEL: stack_fold_psrldq_ymm
+  ;CHECK:       vpsrldq $15, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 47, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 63, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
+  ret <32 x i8> %2
+}
+
+define <2 x i64> @stack_fold_psrlq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlq
+  ;CHECK:       vpsrlq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.sse2.psrl.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @stack_fold_psrlq_ymm(<4 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlq_ymm
+  ;CHECK:       vpsrlq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64> %a0, <2 x i64> %a1)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx2.psrl.q(<4 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i32> @stack_fold_psrlvd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlvd
+  ;CHECK:       vpsrlvd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.avx2.psrlv.d(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <8 x i32> @stack_fold_psrlvd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlvd_ymm
+  ;CHECK:       vpsrlvd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32> %a0, <8 x i32> %a1)
+  ret <8 x i32> %2
+}
+declare <8 x i32> @llvm.x86.avx2.psrlv.d.256(<8 x i32>, <8 x i32>) nounwind readnone
+
+define <2 x i64> @stack_fold_psrlvq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlvq
+  ;CHECK:       vpsrlvq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64> %a0, <2 x i64> %a1)
+  ret <2 x i64> %2
+}
+declare <2 x i64> @llvm.x86.avx2.psrlv.q(<2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @stack_fold_psrlvq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlvq_ymm
+  ;CHECK:       vpsrlvq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64> %a0, <4 x i64> %a1)
+  ret <4 x i64> %2
+}
+declare <4 x i64> @llvm.x86.avx2.psrlv.q.256(<4 x i64>, <4 x i64>) nounwind readnone
+
+define <8 x i16> @stack_fold_psrlvw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlvw
+  ;CHECK:       vpsrlvw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.avx512.psrlv.w.128(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.avx512.psrlv.w.128(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i16> @stack_fold_psrlvw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlvw_ymm
+  ;CHECK:       vpsrlvw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx512.psrlv.w.256(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx512.psrlv.w.256(<16 x i16>, <16 x i16>) nounwind readnone
+
+define <8 x i16> @stack_fold_psrlw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlw
+  ;CHECK:       vpsrlw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
+}
+declare <8 x i16> @llvm.x86.sse2.psrl.w(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i16> @stack_fold_psrlw_ymm(<16 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psrlw_ymm
+  ;CHECK:       vpsrlw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16> %a0, <8 x i16> %a1)
+  ret <16 x i16> %2
+}
+declare <16 x i16> @llvm.x86.avx2.psrl.w(<16 x i16>, <8 x i16>) nounwind readnone
+
+define <16 x i8> @stack_fold_psubb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubb
+  ;CHECK:       vpsubb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = sub <16 x i8> %a0, %a1
+  ret <16 x i8> %2
+}
+
+define <32 x i8> @stack_fold_psubb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubb_ymm
+  ;CHECK:       vpsubb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = sub <32 x i8> %a0, %a1
+  ret <32 x i8> %2
+}
+
+define <4 x i32> @stack_fold_psubd(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psubd
+  ;CHECK:       vpsubd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
+  %2 = sub <4 x i32> %a0, %a1
   ret <4 x i32> %2
 }
-declare <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16>, <8 x i16>) nounwind readnone
 
-define <4 x i32> @stack_fold_pmaddwd_mask(<4 x i32>* %passthru, <8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_pmaddwd_mask
-  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+define <8 x i32> @stack_fold_psubd_ymm(<8 x i32> %a0, <8 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_psubd_ymm
+  ;CHECK:       vpsubd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
-  %3 = bitcast i8 %mask to <8 x i1>
-  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  ; load needed to keep the operation from being scheduled about the asm block
-  %5 = load <4 x i32>, <4 x i32>* %passthru
-  %6 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> %5
-  ret <4 x i32> %6
+  %2 = sub <8 x i32> %a0, %a1
+  ret <8 x i32> %2
 }
 
-define <4 x i32> @stack_fold_pmaddwd_maskz(<8 x i16> %a0, <8 x i16> %a1, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_pmaddwd_maskz
-  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+define <2 x i64> @stack_fold_psubq(<2 x i64> %a0, <2 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psubq
+  ;CHECK:       vpsubq {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <4 x i32> @llvm.x86.sse2.pmadd.wd(<8 x i16> %a0, <8 x i16> %a1)
-  %3 = bitcast i8 %mask to <8 x i1>
-  %4 = shufflevector <8 x i1> %3, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-  %5 = select <4 x i1> %4, <4 x i32> %2, <4 x i32> zeroinitializer
-  ret <4 x i32> %5
+  %2 = sub <2 x i64> %a0, %a1
+  ret <2 x i64> %2
 }
 
-define <8 x i32> @stack_fold_pmaddwd_ymm(<16 x i16> %a0, <16 x i16> %a1) {
-  ;CHECK-LABEL: stack_fold_pmaddwd_ymm
-  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+define <4 x i64> @stack_fold_psubq_ymm(<4 x i64> %a0, <4 x i64> %a1) {
+  ;CHECK-LABEL: stack_fold_psubq_ymm
+  ;CHECK:       vpsubq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
-  ret <8 x i32> %2
+  %2 = sub <4 x i64> %a0, %a1
+  ret <4 x i64> %2
 }
-declare <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16>, <16 x i16>) nounwind readnone
 
-define <8 x i32> @stack_fold_pmaddwd_ymm_mask(<8 x i32>* %passthru, <16 x i16> %a0, <16 x i16> %a1, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_pmaddwd_ymm_mask
-  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+define <16 x i8> @stack_fold_psubsb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubsb
+  ;CHECK:       vpsubsb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
-  %3 = bitcast i8 %mask to <8 x i1>
-  ; load needed to keep the operation from being scheduled about the asm block
-  %4 = load <8 x i32>, <8 x i32>* %passthru
-  %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %4
-  ret <8 x i32> %5
+  %2 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
 }
+declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
 
-define <8 x i32> @stack_fold_pmaddwd_ymm_maskz(<16 x i16> %a0, <16 x i16> %a1, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_pmaddwd_ymm_maskz
-  ;CHECK:       vpmaddwd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+define <32 x i8> @stack_fold_psubsb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubsb_ymm
+  ;CHECK:       vpsubsb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <8 x i32> @llvm.x86.avx2.pmadd.wd(<16 x i16> %a0, <16 x i16> %a1)
-  %3 = bitcast i8 %mask to <8 x i1>
-  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
-  ret <8 x i32> %4
+  %2 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %2
 }
+declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
 
-define <8 x i32> @stack_fold_permd(<8 x i32> %a0, <8 x i32> %a1) {
-  ;CHECK-LABEL: stack_fold_permd
-  ;CHECK:   vpermd {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+define <8 x i16> @stack_fold_psubsw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psubsw
+  ;CHECK:       vpsubsw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a1, <8 x i32> %a0)
-  ret <8 x i32> %2
+  %2 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
 }
-declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) nounwind readonly
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
 
-define <4 x i64> @stack_fold_permq(<4 x i64> %a0) {
-  ;CHECK-LABEL: stack_fold_permq
-  ;CHECK:   vpermq $235, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <4 x i64> %a0, <4 x i64> undef, <4 x i32> <i32 3, i32 2, i32 2, i32 3>
-  ; add forces execution domain
-  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
-  ret <4 x i64> %3
+define <16 x i16> @stack_fold_psubsw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psubsw_ymm
+  ;CHECK:       vpsubsw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
 }
+declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
 
-define <4 x i64> @stack_fold_permqvar(<4 x i64> %a0, <4 x i64> %a1) {
-  ;CHECK-LABEL: stack_fold_permqvar
-  ;CHECK:   vpermq {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64> %a1, <4 x i64> %a0, <4 x i64> undef, i8 -1)
-  ; add forces execution domain
-  %3 = add <4 x i64> %2, <i64 1, i64 1, i64 1, i64 1>
-  ret <4 x i64> %3
+define <16 x i8> @stack_fold_psubusb(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubusb
+  ;CHECK:       vpsubusb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8> %a0, <16 x i8> %a1)
+  ret <16 x i8> %2
 }
-declare <4 x i64> @llvm.x86.avx512.mask.permvar.di.256(<4 x i64>, <4 x i64>, <4 x i64>, i8) nounwind readonly
+declare <16 x i8> @llvm.x86.sse2.psubus.b(<16 x i8>, <16 x i8>) nounwind readnone
 
-define <16 x i16> @stack_fold_permwvar(<16 x i16> %a0, <16 x i16> %a1) {
-  ;CHECK-LABEL: stack_fold_permwvar
-  ;CHECK:   vpermw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16> %a1, <16 x i16> %a0, <16 x i16> undef, i16 -1)
-  ; add forces execution domain
-  %3 = add <16 x i16> %2, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
-  ret <16 x i16> %3
+define <32 x i8> @stack_fold_psubusb_ymm(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_psubusb_ymm
+  ;CHECK:       vpsubusb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8> %a0, <32 x i8> %a1)
+  ret <32 x i8> %2
 }
-declare <16 x i16> @llvm.x86.avx512.mask.permvar.hi.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) nounwind readonly
+declare <32 x i8> @llvm.x86.avx2.psubus.b(<32 x i8>, <32 x i8>) nounwind readnone
 
-define <32 x i8> @stack_fold_permbvar(<32 x i8> %a0, <32 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_permbvar
-  ;CHECK:   vpermb {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = call <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8> %a1, <32 x i8> %a0, <32 x i8> undef, i32 -1)
-  ; add forces execution domain
-  %3 = add <32 x i8> %2, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
-  ret <32 x i8> %3
+define <8 x i16> @stack_fold_psubusw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psubusw
+  ;CHECK:       vpsubusw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16> %a0, <8 x i16> %a1)
+  ret <8 x i16> %2
 }
-declare <32 x i8> @llvm.x86.avx512.mask.permvar.qi.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) nounwind readonly
+declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
 
-define <2 x i64> @stack_fold_valignq(<2 x i64> %a, <2 x i64> %b) {
-  ;CHECK-LABEL: stack_fold_valignq
-  ;CHECK:   vpalignr $8, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
-  ret <2 x i64> %2
+define <16 x i16> @stack_fold_psubusw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psubusw_ymm
+  ;CHECK:       vpsubusw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %a0, <16 x i16> %a1)
+  ret <16 x i16> %2
 }
+declare <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16>, <16 x i16>) nounwind readnone
 
-define <4 x i32> @stack_fold_valignd(<4 x i32> %a, <4 x i32> %b) {
-  ;CHECK-LABEL: stack_fold_valignd
-  ;CHECK:   vpalignr $4, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32><i32 1, i32 2, i32 3, i32 4>
-  ret <4 x i32> %2
+define <8 x i16> @stack_fold_psubw(<8 x i16> %a0, <8 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psubw
+  ;CHECK:       vpsubw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = sub <8 x i16> %a0, %a1
+  ret <8 x i16> %2
 }
 
-define <4 x i64> @stack_fold_valignq_ymm(<4 x i64> %a, <4 x i64> %b) {
-  ;CHECK-LABEL: stack_fold_valignq_ymm
-  ;CHECK:   valignq $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
-  ret <4 x i64> %2
+define <16 x i16> @stack_fold_psubw_ymm(<16 x i16> %a0, <16 x i16> %a1) {
+  ;CHECK-LABEL: stack_fold_psubw_ymm
+  ;CHECK:       vpsubw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = sub <16 x i16> %a0, %a1
+  ret <16 x i16> %2
 }
 
-define <8 x i32> @stack_fold_valignd_ymm(<8 x i32> %a, <8 x i32> %b) {
-  ;CHECK-LABEL: stack_fold_valignd_ymm
-  ;CHECK:   valignd $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-  ret <8 x i32> %2
+define <16 x i8> @stack_fold_punpckhbw(<16 x i8> %a0, <16 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckhbw
+  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  ret <16 x i8> %2
 }
 
-define <8 x i32> @stack_fold_valignd_ymm_mask(<8 x i32> %a, <8 x i32> %b, <8 x i32>* %passthru, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_valignd_ymm_mask
-  ;CHECK:   valignd $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-  %3 = bitcast i8 %mask to <8 x i1>
-  %4 = load <8 x i32>, <8 x i32>* %passthru
-  %5 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> %4
-  ret <8 x i32> %5
+define <16 x i8> @stack_fold_punpckhbw_mask(<16 x i8>* %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_punpckhbw_mask
+  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  %3 = bitcast i16 %mask to <16 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
+  %4 = load <16 x i8>, <16 x i8>* %passthru
+  %5 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> %4
+  ret <16 x i8> %5
 }
 
-define <8 x i32> @stack_fold_valignd_ymm_maskz(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
-  ;CHECK-LABEL: stack_fold_valignd_ymm_maskz
-  ;CHECK:   valignd $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32><i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
-  %3 = bitcast i8 %mask to <8 x i1>
-  %4 = select <8 x i1> %3, <8 x i32> %2, <8 x i32> zeroinitializer
-  ret <8 x i32> %4
+define <16 x i8> @stack_fold_punpckhbw_maskz(<16 x i8> %passthru, <16 x i8> %a0, <16 x i8> %a1, i16 %mask) {
+  ;CHECK-LABEL: stack_fold_punpckhbw_maskz
+  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}}, {{%xmm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31>
+  %3 = bitcast i16 %mask to <16 x i1>
+  %4 = select <16 x i1> %3, <16 x i8> %2, <16 x i8> zeroinitializer
+  ret <16 x i8> %4
 }
 
-define <32 x i8> @stack_fold_palignr(<32 x i8> %a0, <32 x i8> %a1) {
-  ;CHECK-LABEL: stack_fold_palignr
-  ;CHECK:       vpalignr $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
+define <32 x i8> @stack_fold_punpckhbw_ymm(<32 x i8> %a0, <32 x i8> %a1) {
+  ;CHECK-LABEL: stack_fold_punpckhbw_ymm
+  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
   ret <32 x i8> %2
 }
 
-define <32 x i8> @stack_fold_palignr_mask(<32 x i8> %a0, <32 x i8> %a1, <32 x i8>* %passthru, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_palignr_mask
-  ;CHECK:       vpalignr $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
+define <32 x i8> @stack_fold_punpckhbw_mask_ymm(<32 x i8>* %passthru, <32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_punpckhbw_mask_ymm
+  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
   %3 = bitcast i32 %mask to <32 x i1>
+  ; load needed to keep the operation from being scheduled above the asm block
   %4 = load <32 x i8>, <32 x i8>* %passthru
   %5 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> %4
   ret <32 x i8> %5
 }
 
-define <32 x i8> @stack_fold_palignr_maskz(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
-  ;CHECK-LABEL: stack_fold_palignr_maskz
-  ;CHECK:       vpalignr $1, {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
-  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
-  %2 = shufflevector <32 x i8> %a1, <32 x i8> %a0, <32 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48>
+define <32 x i8> @stack_fold_punpckhbw_maskz_ymm(<32 x i8> %a0, <32 x i8> %a1, i32 %mask) {
+  ;CHECK-LABEL: stack_fold_punpckhbw_maskz_ymm
+  ;CHECK:       vpunpckhbw {{-?[0-9]*}}(%rsp), {{%ymm[0-9][0-9]*}}, {{%ymm[0-9][0-9]*}} {{{%k[0-7]}}} {z} {{.*#+}} 32-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{xmm16},~{xmm17},~{xmm18},~{xmm19},~{xmm20},~{xmm21},~{xmm22},~{xmm23},~{xmm24},~{xmm25},~{xmm26},~{xmm27},~{xmm28},~{xmm29},~{xmm30},~{xmm31},~{flags}"()
+  %2 = shufflevector <32 x i8> %a0, <32 x i8> %a1, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63>
   %3 = bitcast i32 %mask to <32 x i1>
   %4 = select <32 x i1> %3, <32 x i8> %2, <32 x i8> zeroinitializer
   ret <32 x i8> %4
diff --git a/test/CodeGen/X86/stack-folding-int-sse42.ll b/test/CodeGen/X86/stack-folding-int-sse42.ll
index a839a315e766a6817b9edb09b02c6ac09b2ba7ae..5c6f697610a07003236e5007c7cd230cb57151a9 100644
--- a/test/CodeGen/X86/stack-folding-int-sse42.ll
+++ b/test/CodeGen/X86/stack-folding-int-sse42.ll
@@ -318,7 +318,7 @@ declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone
 
 define <16 x i8> @stack_fold_pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %c) {
   ;CHECK-LABEL: stack_fold_pblendvb
-  ;CHECK:       pblendvb {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  ;CHECK:       pblendvb %xmm0, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
   %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
   %2 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a1, <16 x i8> %c, <16 x i8> %a0)
   ret <16 x i8> %2
diff --git a/test/CodeGen/X86/stack-folding-sha.ll b/test/CodeGen/X86/stack-folding-sha.ll
new file mode 100644
index 0000000000000000000000000000000000000000..768c8a0f5e77de9be9c4612454886acf6df18b30
--- /dev/null
+++ b/test/CodeGen/X86/stack-folding-sha.ll
@@ -0,0 +1,72 @@
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+sha < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Stack reload folding tests.
+;
+; By including a nop call with sideeffects we can force a partial register spill of the
+; relevant registers and check that the reload is correctly folded into the instruction.
+
+define <4 x i32> @stack_fold_sha1msg1(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_sha1msg1
+  ;CHECK:       sha1msg1 {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = tail call <4 x i32> @llvm.x86.sha1msg1(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sha1msg1(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @stack_fold_sha1msg2(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_sha1msg2
+  ;CHECK:       sha1msg2 {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = tail call <4 x i32> @llvm.x86.sha1msg2(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sha1msg2(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @stack_fold_sha1nexte(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_sha1nexte
+  ;CHECK:       sha1nexte {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = tail call <4 x i32> @llvm.x86.sha1nexte(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sha1nexte(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @stack_fold_sha1rnds4(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_sha1rnds4
+  ;CHECK:       sha1rnds4 $3, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = tail call <4 x i32> @llvm.x86.sha1rnds4(<4 x i32> %a0, <4 x i32> %a1, i8 3)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sha1rnds4(<4 x i32>, <4 x i32>, i8) nounwind readnone
+
+define <4 x i32> @stack_fold_sha256msg1(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_sha256msg1
+  ;CHECK:       sha256msg1 {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = tail call <4 x i32> @llvm.x86.sha256msg1(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sha256msg1(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @stack_fold_sha256msg2(<4 x i32> %a0, <4 x i32> %a1) {
+  ;CHECK-LABEL: stack_fold_sha256msg2
+  ;CHECK:       sha256msg2 {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = tail call <4 x i32> @llvm.x86.sha256msg2(<4 x i32> %a0, <4 x i32> %a1)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sha256msg2(<4 x i32>, <4 x i32>) nounwind readnone
+
+define <4 x i32> @stack_fold_sha256rnds2(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2) {
+  ;CHECK-LABEL: stack_fold_sha256rnds2
+  ;CHECK:       sha256rnds2 {{%xmm[0-9][0-9]*}}, {{-?[0-9]*}}(%rsp), {{%xmm[0-9][0-9]*}} {{.*#+}} 16-byte Folded Reload
+  %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
+  %2 = tail call <4 x i32> @llvm.x86.sha256rnds2(<4 x i32> %a0, <4 x i32> %a1, <4 x i32> %a2)
+  ret <4 x i32> %2
+}
+declare <4 x i32> @llvm.x86.sha256rnds2(<4 x i32>, <4 x i32>, <4 x i32>) nounwind readnone
diff --git a/test/CodeGen/X86/stack-folding-tbm.ll b/test/CodeGen/X86/stack-folding-tbm.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fe3c828a69b02afd8c38d29c442e0eaf12d42986
--- /dev/null
+++ b/test/CodeGen/X86/stack-folding-tbm.ll
@@ -0,0 +1,201 @@
+; RUN: llc -O3 -disable-peephole -mtriple=x86_64-unknown-unknown -mattr=+bmi,+tbm < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-unknown"
+
+; Stack reload folding tests.
+;
+; By including a nop call with sideeffects we can force a partial register spill of the
+; relevant registers and check that the reload is correctly folded into the instruction.
+
+define i32 @stack_fold_bextri_u32(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_bextri_u32
+  ;CHECK:       # BB#0:
+  ;CHECK:       bextr $2814, {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = tail call i32 @llvm.x86.tbm.bextri.u32(i32 %a0, i32 2814)
+  ret i32 %2
+}
+declare i32 @llvm.x86.tbm.bextri.u32(i32, i32)
+
+define i64 @stack_fold_bextri_u64(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_bextri_u64
+  ;CHECK:       # BB#0:
+  ;CHECK:       bextr $2814, {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = tail call i64 @llvm.x86.tbm.bextri.u64(i64 %a0, i64 2814)
+  ret i64 %2
+}
+declare i64 @llvm.x86.tbm.bextri.u64(i64, i64)
+
+define i32 @stack_fold_blcfill_u32(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_blcfill_u32
+  ;CHECK:       blcfill {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = add i32 %a0, 1
+  %3 = and i32 %a0, %2
+  ret i32 %3
+}
+
+define i64 @stack_fold_blcfill_u64(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_blcfill_u64
+  ;CHECK:       blcfill {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = add i64 %a0, 1
+  %3 = and i64 %a0, %2
+  ret i64 %3
+}
+
+define i32 @stack_fold_blci_u32(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_blci_u32
+  ;CHECK:       blci {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = add i32 %a0, 1
+  %3 = xor i32 %2, -1
+  %4 = or i32 %a0, %3
+  ret i32 %4
+}
+
+define i64 @stack_fold_blci_u64(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_blci_u64
+  ;CHECK:       blci {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = add i64 %a0, 1
+  %3 = xor i64 %2, -1
+  %4 = or i64 %a0, %3
+  ret i64 %4
+}
+
+define i32 @stack_fold_blcic_u32(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_blcic_u32
+  ;CHECK:       blcic {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = add i32 %a0, 1
+  %3 = xor i32 %a0, -1
+  %4 = and i32 %2, %3
+  ret i32 %4
+}
+
+define i64 @stack_fold_blcic_u64(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_blcic_u64
+  ;CHECK:       blcic {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = add i64 %a0, 1
+  %3 = xor i64 %a0, -1
+  %4 = and i64 %2, %3
+  ret i64 %4
+}
+
+define i32 @stack_fold_blcmsk_u32(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_blcmsk_u32
+  ;CHECK:       blcmsk {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = add i32 %a0, 1
+  %3 = xor i32 %a0, %2
+  ret i32 %3
+}
+
+define i64 @stack_fold_blcmsk_u64(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_blcmsk_u64
+  ;CHECK:       blcmsk {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = add i64 %a0, 1
+  %3 = xor i64 %a0, %2
+  ret i64 %3
+}
+
+define i32 @stack_fold_blcs_u32(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_blcs_u32
+  ;CHECK:       blcs {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = add i32 %a0, 1
+  %3 = or i32 %a0, %2
+  ret i32 %3
+}
+
+define i64 @stack_fold_blcs_u64(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_blcs_u64
+  ;CHECK:       blcs {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = add i64 %a0, 1
+  %3 = or i64 %a0, %2
+  ret i64 %3
+}
+
+define i32 @stack_fold_blsfill_u32(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_blsfill_u32
+  ;CHECK:       blsfill {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sub i32 %a0, 1
+  %3 = or i32 %a0, %2
+  ret i32 %3
+}
+
+define i64 @stack_fold_blsfill_u64(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_blsfill_u64
+  ;CHECK:       blsfill {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sub i64 %a0, 1
+  %3 = or i64 %a0, %2
+  ret i64 %3
+}
+
+define i32 @stack_fold_blsic_u32(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_blsic_u32
+  ;CHECK:       blsic {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sub i32 %a0, 1
+  %3 = xor i32 %a0, -1
+  %4 = or i32 %2, %3
+  ret i32 %4
+}
+
+define i64 @stack_fold_blsic_u64(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_blsic_u64
+  ;CHECK:       blsic {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sub i64 %a0, 1
+  %3 = xor i64 %a0, -1
+  %4 = or i64 %2, %3
+  ret i64 %4
+}
+
+define i32 @stack_fold_t1mskc_u32(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_t1mskc_u32
+  ;CHECK:       t1mskc {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = add i32 %a0, 1
+  %3 = xor i32 %a0, -1
+  %4 = or i32 %2, %3
+  ret i32 %4
+}
+
+define i64 @stack_fold_t1mskc_u64(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_t1mskc_u64
+  ;CHECK:       t1mskc {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = add i64 %a0, 1
+  %3 = xor i64 %a0, -1
+  %4 = or i64 %2, %3
+  ret i64 %4
+}
+
+define i32 @stack_fold_tzmsk_u32(i32 %a0) {
+  ;CHECK-LABEL: stack_fold_tzmsk_u32
+  ;CHECK:       tzmsk {{-?[0-9]*}}(%rsp), %eax {{.*#+}} 4-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sub i32 %a0, 1
+  %3 = xor i32 %a0, -1
+  %4 = and i32 %2, %3
+  ret i32 %4
+}
+
+define i64 @stack_fold_tzmsk_u64(i64 %a0) {
+  ;CHECK-LABEL: stack_fold_tzmsk_u64
+  ;CHECK:       tzmsk {{-?[0-9]*}}(%rsp), %rax {{.*#+}} 8-byte Folded Reload
+  %1 = tail call i64 asm sideeffect "nop", "=x,~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{rbp},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"()
+  %2 = sub i64 %a0, 1
+  %3 = xor i64 %a0, -1
+  %4 = and i64 %2, %3
+  ret i64 %4
+}
diff --git a/test/CodeGen/X86/stack-protector-remarks.ll b/test/CodeGen/X86/stack-protector-remarks.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3792bef2742b21d6ba9371c901adb642ce0eeb8f
--- /dev/null
+++ b/test/CodeGen/X86/stack-protector-remarks.ll
@@ -0,0 +1,103 @@
+; RUN: llc %s -mtriple=x86_64-unknown-unknown -pass-remarks=stack-protector -o /dev/null 2>&1 | FileCheck %s
+; CHECK-NOT: nossp
+; CHECK: function attribute_ssp
+; CHECK-SAME: a function attribute or command-line switch
+; CHECK-NOT: alloca_fixed_small_nossp
+; CHECK: function alloca_fixed_small_ssp
+; CHECK-SAME: a call to alloca or use of a variable length array
+; CHECK: function alloca_fixed_large_ssp
+; CHECK-SAME: a call to alloca or use of a variable length array
+; CHECK: function alloca_variable_ssp
+; CHECK-SAME: a call to alloca or use of a variable length array
+; CHECK: function buffer_ssp
+; CHECK-SAME: a stack allocated buffer or struct containing a buffer
+; CHECK: function struct_ssp
+; CHECK-SAME: a stack allocated buffer or struct containing a buffer
+; CHECK: function address_ssp
+; CHECK-SAME: the address of a local variable being taken
+; CHECK: function multiple_ssp
+; CHECK-SAME: a function attribute or command-line switch
+; CHECK: function multiple_ssp
+; CHECK-SAME: a stack allocated buffer or struct containing a buffer
+; CHECK: function multiple_ssp
+; CHECK-SAME: a stack allocated buffer or struct containing a buffer
+; CHECK: function multiple_ssp
+; CHECK-SAME: the address of a local variable being taken
+; CHECK: function multiple_ssp
+; CHECK-SAME: a call to alloca or use of a variable length array
+
+; Check that no remark is emitted when the switch is not specified.
+; RUN: llc %s -mtriple=x86_64-unknown-unknown -o /dev/null 2>&1 | FileCheck %s -check-prefix=NOREMARK -allow-empty
+; NOREMARK-NOT: ssp
+
+; RUN: llc %s -mtriple=x86_64-unknown-unknown -o /dev/null -pass-remarks-output=%t.yaml
+; RUN: cat %t.yaml | FileCheck %s -check-prefix=YAML
+; YAML:      --- !Passed
+; YAML-NEXT: Pass:            stack-protector
+; YAML-NEXT: Name:            StackProtectorRequested
+; YAML-NEXT: Function:        attribute_ssp
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'Stack protection applied to function '
+; YAML-NEXT:   - Function:        attribute_ssp
+; YAML-NEXT:   - String:          ' due to a function attribute or command-line switch'
+; YAML-NEXT: ...
+
+define void @nossp() ssp {
+  ret void
+}
+
+define void @attribute_ssp() sspreq {
+  ret void
+}
+
+define void @alloca_fixed_small_nossp() ssp {
+  %1 = alloca i8, i64 2, align 16
+  ret void
+}
+
+define void @alloca_fixed_small_ssp() sspstrong {
+  %1 = alloca i8, i64 2, align 16
+  ret void
+}
+
+define void @alloca_fixed_large_ssp() ssp {
+  %1 = alloca i8, i64 64, align 16
+  ret void
+}
+
+define void @alloca_variable_ssp(i64 %x) ssp {
+  %1 = alloca i8, i64 %x, align 16
+  ret void
+}
+
+define void @buffer_ssp() sspstrong {
+  %x = alloca [64 x i32], align 16
+  ret void
+}
+
+%struct.X = type { [64 x i32] }
+define void @struct_ssp() sspstrong {
+  %x = alloca %struct.X, align 4
+  ret void
+}
+
+define void @address_ssp() sspstrong {
+entry:
+  %x = alloca i32, align 4
+  %y = alloca i32*, align 8
+  store i32 32, i32* %x, align 4
+  store i32* %x, i32** %y, align 8
+  ret void
+}
+
+define void @multiple_ssp() sspreq {
+entry:
+  %x = alloca %struct.X, align 4
+  %y = alloca [64 x i32], align 16
+  %a = alloca i32, align 4
+  %b = alloca i32*, align 8
+  %0 = alloca i8, i64 2, align 16
+  store i32 32, i32* %a, align 4
+  store i32* %a, i32** %b, align 8
+  ret void
+}
diff --git a/test/CodeGen/X86/stack-protector-target.ll b/test/CodeGen/X86/stack-protector-target.ll
index 66e45055b2b51f78299449a65421e70fa4237be4..fc5a18d79d4b8709d28a489934bf6aa62feaf386 100644
--- a/test/CodeGen/X86/stack-protector-target.ll
+++ b/test/CodeGen/X86/stack-protector-target.ll
@@ -1,10 +1,17 @@
 ; Test target-specific stack cookie location.
-; RUN: llc -mtriple=i386-linux < %s -o - | FileCheck --check-prefix=LINUX-I386 %s
-; RUN: llc -mtriple=x86_64-linux < %s -o - | FileCheck --check-prefix=LINUX-X64 %s
-; RUN: llc -mtriple=i386-linux-android < %s -o - | FileCheck --check-prefix=LINUX-I386 %s
-; RUN: llc -mtriple=x86_64-linux-android < %s -o - | FileCheck --check-prefix=LINUX-X64 %s
-; RUN: llc -mtriple=i386-kfreebsd < %s -o - | FileCheck --check-prefix=LINUX-I386 %s
-; RUN: llc -mtriple=x86_64-kfreebsd < %s -o - | FileCheck --check-prefix=LINUX-X64 %s
+; RUN: llc -mtriple=i386-linux < %s -o - | FileCheck --check-prefix=I386-TLS %s
+; RUN: llc -mtriple=x86_64-linux < %s -o - | FileCheck --check-prefix=X64-TLS %s
+
+; RUN: llc -mtriple=i386-linux-android < %s -o - | FileCheck --check-prefix=I386 %s
+; RUN: llc -mtriple=i386-linux-android16 < %s -o - | FileCheck --check-prefix=I386 %s
+; RUN: llc -mtriple=i386-linux-android17 < %s -o - | FileCheck --check-prefix=I386-TLS %s
+; RUN: llc -mtriple=i386-linux-android24 < %s -o - | FileCheck --check-prefix=I386-TLS %s
+; RUN: llc -mtriple=x86_64-linux-android < %s -o - | FileCheck --check-prefix=X64-TLS %s
+; RUN: llc -mtriple=x86_64-linux-android17 < %s -o - | FileCheck --check-prefix=X64-TLS %s
+; RUN: llc -mtriple=x86_64-linux-android24 < %s -o - | FileCheck --check-prefix=X64-TLS %s
+
+; RUN: llc -mtriple=i386-kfreebsd < %s -o - | FileCheck --check-prefix=I386-TLS %s
+; RUN: llc -mtriple=x86_64-kfreebsd < %s -o - | FileCheck --check-prefix=X64-TLS %s
 
 define void @_Z1fv() sspreq {
 entry:
@@ -16,12 +23,17 @@ entry:
 
 declare void @_Z7CapturePi(i32*)
 
-; LINUX-X64: movq %fs:40, %[[B:.*]]
-; LINUX-X64: movq %[[B]], 16(%rsp)
-; LINUX-X64: movq %fs:40, %[[C:.*]]
-; LINUX-X64: cmpq 16(%rsp), %[[C]]
+; X64-TLS: movq %fs:40, %[[B:.*]]
+; X64-TLS: movq %[[B]], 16(%rsp)
+; X64-TLS: movq %fs:40, %[[C:.*]]
+; X64-TLS: cmpq 16(%rsp), %[[C]]
+
+; I386: movl __stack_chk_guard, %[[B:.*]]
+; I386: movl %[[B]], 8(%esp)
+; I386: movl __stack_chk_guard, %[[C:.*]]
+; I386: cmpl 8(%esp), %[[C]]
 
-; LINUX-I386: movl %gs:20, %[[B:.*]]
-; LINUX-I386: movl %[[B]], 8(%esp)
-; LINUX-I386: movl %gs:20, %[[C:.*]]
-; LINUX-I386: cmpl 8(%esp), %[[C]]
+; I386-TLS: movl %gs:20, %[[B:.*]]
+; I386-TLS: movl %[[B]], 8(%esp)
+; I386-TLS: movl %gs:20, %[[C:.*]]
+; I386-TLS: cmpl 8(%esp), %[[C]]
diff --git a/test/CodeGen/X86/stack-protector-weight.ll b/test/CodeGen/X86/stack-protector-weight.ll
index 58c6c713941dea6006295964fe3c4e049c67dd04..d5a65ffb890be994c608aac572861101f479dc6b 100644
--- a/test/CodeGen/X86/stack-protector-weight.ll
+++ b/test/CodeGen/X86/stack-protector-weight.ll
@@ -31,20 +31,20 @@ define i32 @test_branch_weights(i32 %n) #0 {
 entry:
   %a = alloca [128 x i32], align 16
   %0 = bitcast [128 x i32]* %a to i8*
-  call void @llvm.lifetime.start(i64 512, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 512, i8* %0)
   %arraydecay = getelementptr inbounds [128 x i32], [128 x i32]* %a, i64 0, i64 0
   call void @foo2(i32* %arraydecay)
   %idxprom = sext i32 %n to i64
   %arrayidx = getelementptr inbounds [128 x i32], [128 x i32]* %a, i64 0, i64 %idxprom
   %1 = load i32, i32* %arrayidx, align 4
-  call void @llvm.lifetime.end(i64 512, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 512, i8* %0)
   ret i32 %1
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
 
 declare void @foo2(i32*)
 
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 
 attributes #0 = { sspstrong "stack-protector-buffer-size"="8" }
diff --git a/test/CodeGen/X86/stack_guard_remat.ll b/test/CodeGen/X86/stack_guard_remat.ll
index d38c68a8a5bbca5472dee667bcd8d6399a29afe2..cc3cd6b0801a54192e48bfd080dc880760144eaa 100644
--- a/test/CodeGen/X86/stack_guard_remat.ll
+++ b/test/CodeGen/X86/stack_guard_remat.ll
@@ -9,20 +9,20 @@ define i32 @test_stack_guard_remat() #0 {
 entry:
   %a1 = alloca [256 x i32], align 16
   %0 = bitcast [256 x i32]* %a1 to i8*
-  call void @llvm.lifetime.start(i64 1024, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 1024, i8* %0)
   %arraydecay = getelementptr inbounds [256 x i32], [256 x i32]* %a1, i64 0, i64 0
   call void @foo3(i32* %arraydecay)
   call void asm sideeffect "foo2", "~{r12},~{r13},~{r14},~{r15},~{ebx},~{esi},~{edi},~{dirflag},~{fpsr},~{flags}"()
-  call void @llvm.lifetime.end(i64 1024, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 1024, i8* %0)
   ret i32 0
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
 
 declare void @foo3(i32*)
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 
 attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/X86/stores-merging.ll b/test/CodeGen/X86/stores-merging.ll
index 9e479bd71b98b82b7c1ab640e192318530b39476..dbfb06881d82cb90d40b253db3a37643cac2ea97 100644
--- a/test/CodeGen/X86/stores-merging.ll
+++ b/test/CodeGen/X86/stores-merging.ll
@@ -13,9 +13,9 @@ target triple = "x86_64-unknown-linux-gnu"
 ;; the same result in memory in the end.
 
 ; CHECK-LABEL: redundant_stores_merging:
-; CHECK:   movl    $123, e+8(%rip)
-; CHECK:   movabsq $1958505086977, %rax
+; CHECK:   movabsq $528280977409, %rax
 ; CHECK:   movq    %rax, e+4(%rip)
+; CHECK:   movl    $456, e+8(%rip)
 define void @redundant_stores_merging() {
 entry:
   store i32 1, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 1), align 4
@@ -26,9 +26,9 @@ entry:
 
 ;; This variant tests PR25154.
 ; CHECK-LABEL: redundant_stores_merging_reverse:
-; CHECK:   movl    $123, e+8(%rip)
-; CHECK:   movabsq $1958505086977, %rax
+; CHECK:   movabsq $528280977409, %rax
 ; CHECK:   movq    %rax, e+4(%rip)
+; CHECK:   movl    $456, e+8(%rip)
 define void @redundant_stores_merging_reverse() {
 entry:
   store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
@@ -45,9 +45,8 @@ entry:
 ;; a movl, after the store to 3).
 
 ;; CHECK-LABEL: overlapping_stores_merging:
-;; CHECK:  movw    $0, b+2(%rip)
+;; CHECK:  movl    $1, b(%rip)
 ;; CHECK:  movw    $2, b+3(%rip)
-;; CHECK:  movw    $1, b(%rip)
 define void @overlapping_stores_merging() {
 entry:
   store i16 0, i16* bitcast (i8* getelementptr inbounds ([8 x i8], [8 x i8]* @b, i64 0, i64 2) to i16*), align 2
diff --git a/test/CodeGen/X86/subvector-broadcast.ll b/test/CodeGen/X86/subvector-broadcast.ll
index 5082101a6d43c43b24d701effb1fb5255c331867..94d3b22a4c80505aef38c022e3dc8110e4a3d2a4 100644
--- a/test/CodeGen/X86/subvector-broadcast.ll
+++ b/test/CodeGen/X86/subvector-broadcast.ll
@@ -24,13 +24,13 @@ define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
 ; X32-AVX512F-LABEL: test_broadcast_2f64_4f64:
 ; X32-AVX512F:       ## BB#0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X32-AVX512F-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X32-AVX512F-NEXT:    retl
 ;
 ; X32-AVX512BW-LABEL: test_broadcast_2f64_4f64:
 ; X32-AVX512BW:       ## BB#0:
 ; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X32-AVX512BW-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X32-AVX512BW-NEXT:    retl
 ;
 ; X32-AVX512DQ-LABEL: test_broadcast_2f64_4f64:
@@ -46,12 +46,12 @@ define <4 x double> @test_broadcast_2f64_4f64(<2 x double> *%p) nounwind {
 ;
 ; X64-AVX512F-LABEL: test_broadcast_2f64_4f64:
 ; X64-AVX512F:       ## BB#0:
-; X64-AVX512F-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512F-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X64-AVX512F-NEXT:    retq
 ;
 ; X64-AVX512BW-LABEL: test_broadcast_2f64_4f64:
 ; X64-AVX512BW:       ## BB#0:
-; X64-AVX512BW-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512BW-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X64-AVX512BW-NEXT:    retq
 ;
 ; X64-AVX512DQ-LABEL: test_broadcast_2f64_4f64:
@@ -153,13 +153,13 @@ define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
 ; X32-AVX512F-LABEL: test_broadcast_2i64_4i64:
 ; X32-AVX512F:       ## BB#0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X32-AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X32-AVX512F-NEXT:    retl
 ;
 ; X32-AVX512BW-LABEL: test_broadcast_2i64_4i64:
 ; X32-AVX512BW:       ## BB#0:
 ; X32-AVX512BW-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X32-AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X32-AVX512BW-NEXT:    retl
 ;
 ; X32-AVX512DQ-LABEL: test_broadcast_2i64_4i64:
@@ -175,12 +175,12 @@ define <4 x i64> @test_broadcast_2i64_4i64(<2 x i64> *%p) nounwind {
 ;
 ; X64-AVX512F-LABEL: test_broadcast_2i64_4i64:
 ; X64-AVX512F:       ## BB#0:
-; X64-AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X64-AVX512F-NEXT:    retq
 ;
 ; X64-AVX512BW-LABEL: test_broadcast_2i64_4i64:
 ; X64-AVX512BW:       ## BB#0:
-; X64-AVX512BW-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512BW-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X64-AVX512BW-NEXT:    retq
 ;
 ; X64-AVX512DQ-LABEL: test_broadcast_2i64_4i64:
@@ -286,27 +286,16 @@ define <8 x i64> @test_broadcast_4i64_8i64(<4 x i64> *%p) nounwind {
 }
 
 define <8 x float> @test_broadcast_4f32_8f32(<4 x float> *%p) nounwind {
-; X32-AVX-LABEL: test_broadcast_4f32_8f32:
-; X32-AVX:       ## BB#0:
-; X32-AVX-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X32-AVX-NEXT:    retl
-;
-; X32-AVX512-LABEL: test_broadcast_4f32_8f32:
-; X32-AVX512:       ## BB#0:
-; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
-; X32-AVX512-NEXT:    retl
-;
-; X64-AVX-LABEL: test_broadcast_4f32_8f32:
-; X64-AVX:       ## BB#0:
-; X64-AVX-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; X64-AVX-NEXT:    retq
+; X32-LABEL: test_broadcast_4f32_8f32:
+; X32:       ## BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X32-NEXT:    retl
 ;
-; X64-AVX512-LABEL: test_broadcast_4f32_8f32:
-; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
-; X64-AVX512-NEXT:    retq
+; X64-LABEL: test_broadcast_4f32_8f32:
+; X64:       ## BB#0:
+; X64-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; X64-NEXT:    retq
  %1 = load <4 x float>, <4 x float> *%p
  %2 = shufflevector <4 x float> %1, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
  ret <8 x float> %2
@@ -402,7 +391,7 @@ define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
 ; X32-AVX512-LABEL: test_broadcast_4i32_8i32:
 ; X32-AVX512:       ## BB#0:
 ; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X32-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_4i32_8i32:
@@ -412,7 +401,7 @@ define <8 x i32> @test_broadcast_4i32_8i32(<4 x i32> *%p) nounwind {
 ;
 ; X64-AVX512-LABEL: test_broadcast_4i32_8i32:
 ; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X64-AVX512-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32> *%p
  %2 = shufflevector <4 x i32> %1, <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
@@ -522,7 +511,7 @@ define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
 ; X32-AVX512-LABEL: test_broadcast_8i16_16i16:
 ; X32-AVX512:       ## BB#0:
 ; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X32-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_8i16_16i16:
@@ -532,7 +521,7 @@ define <16 x i16> @test_broadcast_8i16_16i16(<8 x i16> *%p) nounwind {
 ;
 ; X64-AVX512-LABEL: test_broadcast_8i16_16i16:
 ; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X64-AVX512-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16> *%p
  %2 = shufflevector <8 x i16> %1, <8 x i16> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -557,7 +546,7 @@ define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
 ; X32-AVX512F-LABEL: test_broadcast_8i16_32i16:
 ; X32-AVX512F:       ## BB#0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X32-AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X32-AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
 ; X32-AVX512F-NEXT:    retl
 ;
@@ -570,7 +559,7 @@ define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
 ; X32-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
 ; X32-AVX512DQ:       ## BB#0:
 ; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X32-AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X32-AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
 ; X32-AVX512DQ-NEXT:    retl
 ;
@@ -588,7 +577,7 @@ define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
 ;
 ; X64-AVX512F-LABEL: test_broadcast_8i16_32i16:
 ; X64-AVX512F:       ## BB#0:
-; X64-AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X64-AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
 ; X64-AVX512F-NEXT:    retq
 ;
@@ -599,7 +588,7 @@ define <32 x i16> @test_broadcast_8i16_32i16(<8 x i16> *%p) nounwind {
 ;
 ; X64-AVX512DQ-LABEL: test_broadcast_8i16_32i16:
 ; X64-AVX512DQ:       ## BB#0:
-; X64-AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X64-AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
 ; X64-AVX512DQ-NEXT:    retq
  %1 = load <8 x i16>, <8 x i16> *%p
@@ -672,7 +661,7 @@ define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
 ; X32-AVX512-LABEL: test_broadcast_16i8_32i8:
 ; X32-AVX512:       ## BB#0:
 ; X32-AVX512-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X32-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: test_broadcast_16i8_32i8:
@@ -682,7 +671,7 @@ define <32 x i8> @test_broadcast_16i8_32i8(<16 x i8> *%p) nounwind {
 ;
 ; X64-AVX512-LABEL: test_broadcast_16i8_32i8:
 ; X64-AVX512:       ## BB#0:
-; X64-AVX512-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X64-AVX512-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8> *%p
  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -707,7 +696,7 @@ define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
 ; X32-AVX512F-LABEL: test_broadcast_16i8_64i8:
 ; X32-AVX512F:       ## BB#0:
 ; X32-AVX512F-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X32-AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X32-AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
 ; X32-AVX512F-NEXT:    retl
 ;
@@ -720,7 +709,7 @@ define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
 ; X32-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
 ; X32-AVX512DQ:       ## BB#0:
 ; X32-AVX512DQ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X32-AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X32-AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
 ; X32-AVX512DQ-NEXT:    retl
 ;
@@ -738,7 +727,7 @@ define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
 ;
 ; X64-AVX512F-LABEL: test_broadcast_16i8_64i8:
 ; X64-AVX512F:       ## BB#0:
-; X64-AVX512F-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512F-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X64-AVX512F-NEXT:    vmovdqa %ymm0, %ymm1
 ; X64-AVX512F-NEXT:    retq
 ;
@@ -749,7 +738,7 @@ define <64 x i8> @test_broadcast_16i8_64i8(<16 x i8> *%p) nounwind {
 ;
 ; X64-AVX512DQ-LABEL: test_broadcast_16i8_64i8:
 ; X64-AVX512DQ:       ## BB#0:
-; X64-AVX512DQ-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; X64-AVX512DQ-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; X64-AVX512DQ-NEXT:    vmovdqa %ymm0, %ymm1
 ; X64-AVX512DQ-NEXT:    retq
  %1 = load <16 x i8>, <16 x i8> *%p
@@ -1298,7 +1287,7 @@ define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
 ; X32-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
 ; X32-AVX1:       ## BB#0: ## %entry
 ; X32-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; X32-AVX1-NEXT:    vmovaps {{.*#+}} ymm4 = [1,0,2,0,3,0,4,0]
+; X32-AVX1-NEXT:    vmovdqa {{.*#+}} ymm4 = [1,0,2,0,3,0,4,0]
 ; X32-AVX1-NEXT:    vextractf128 $1, %ymm4, %xmm5
 ; X32-AVX1-NEXT:    vpaddq %xmm5, %xmm3, %xmm3
 ; X32-AVX1-NEXT:    vpaddq %xmm4, %xmm0, %xmm0
@@ -1341,6 +1330,7 @@ define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
 ; X32-AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm1
 ; X32-AVX512-NEXT:    vmovdqu %ymm0, _ga4
 ; X32-AVX512-NEXT:    vmovdqu64 %zmm1, _gb4
+; X32-AVX512-NEXT:    vzeroupper
 ; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX1-LABEL: fallback_broadcast_v4i64_to_v8i64:
@@ -1391,6 +1381,7 @@ define void @fallback_broadcast_v4i64_to_v8i64(<4 x i64> %a, <8 x i64> %b) {
 ; X64-AVX512-NEXT:    vpandq %zmm2, %zmm1, %zmm1
 ; X64-AVX512-NEXT:    vmovdqu %ymm0, {{.*}}(%rip)
 ; X64-AVX512-NEXT:    vmovdqu64 %zmm1, {{.*}}(%rip)
+; X64-AVX512-NEXT:    vzeroupper
 ; X64-AVX512-NEXT:    retq
 entry:
   %0 = add <4 x i64> %a, <i64 1, i64 2, i64 3, i64 4>
@@ -1429,6 +1420,7 @@ define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b)
 ; X32-AVX512-NEXT:    vdivpd %zmm2, %zmm1, %zmm1
 ; X32-AVX512-NEXT:    vmovupd %ymm0, _ga2
 ; X32-AVX512-NEXT:    vmovupd %zmm1, _gb2
+; X32-AVX512-NEXT:    vzeroupper
 ; X32-AVX512-NEXT:    retl
 ;
 ; X64-AVX-LABEL: fallback_broadcast_v4f64_to_v8f64:
@@ -1454,6 +1446,7 @@ define void @fallback_broadcast_v4f64_to_v8f64(<4 x double> %a, <8 x double> %b)
 ; X64-AVX512-NEXT:    vdivpd %zmm2, %zmm1, %zmm1
 ; X64-AVX512-NEXT:    vmovupd %ymm0, {{.*}}(%rip)
 ; X64-AVX512-NEXT:    vmovupd %zmm1, {{.*}}(%rip)
+; X64-AVX512-NEXT:    vzeroupper
 ; X64-AVX512-NEXT:    retq
 entry:
   %0 = fadd <4 x double> %a, <double 1.0, double 2.0, double 3.0, double 4.0>
diff --git a/test/CodeGen/X86/swifterror.ll b/test/CodeGen/X86/swifterror.ll
index cd4150597225273c63fc5f733af9e94ee5d1f12b..5704d1919988f8ae0278696dcf7c715553ff4b8e 100644
--- a/test/CodeGen/X86/swifterror.ll
+++ b/test/CodeGen/X86/swifterror.ll
@@ -670,3 +670,45 @@ define swiftcc { i64, i64, i64, i64} @params_and_return_in_reg(i64, i64, i64, i6
 }
 
 declare swiftcc { i64, i64, i64, i64 } @params_and_return_in_reg2(i64, i64, i64, i64, i64, i64, i8* swiftself, %swift_error** nocapture swifterror %err)
+
+
+declare void @acallee(i8*)
+
+; Make sure we don't tail call if the caller returns a swifterror value. We
+; would have to move into the swifterror register before the tail call.
+; CHECK-APPLE: tailcall_from_swifterror:
+; CHECK-APPLE-NOT: jmp _acallee
+; CHECK-APPLE: callq _acallee
+
+define swiftcc void @tailcall_from_swifterror(%swift_error** swifterror %error_ptr_ref) {
+entry:
+  tail call void @acallee(i8* null)
+  ret void
+}
+
+; Make sure we don't crash on this function during -O0.
+; We used to crash because we would insert an IMPLICIT_DEF for the swifterror at
+; the beginning of the machine basic block but did not inform FastISel of the
+; inserted instruction. When computing the InsertPoint in the entry block
+; FastISel would choose an insertion point before the IMPLICIT_DEF causing a
+; crash later on.
+declare hidden swiftcc i8* @testFunA()
+
+%TSb = type <{ i1 }>
+
+define swiftcc void @dontCrash()  {
+entry:
+  %swifterror = alloca swifterror %swift_error*, align 8
+  store %swift_error* null, %swift_error** %swifterror, align 8
+  %a = call i8* @testFunA()
+  %b = bitcast i8* %a to %TSb*
+  %._value = getelementptr inbounds %TSb, %TSb* %b, i32 0, i32 0
+  %c = load i1, i1* %._value, align 1
+  br i1 %c, label %trueBB, label %falseBB
+
+trueBB:
+  ret void
+
+falseBB:
+  ret void
+}
diff --git a/test/CodeGen/X86/tail-call-conditional.mir b/test/CodeGen/X86/tail-call-conditional.mir
index 75cb1e451d83928738717bdbe899810093a41d70..e006138ba848573ce34cf3705b212c2d2fd31932 100644
--- a/test/CodeGen/X86/tail-call-conditional.mir
+++ b/test/CodeGen/X86/tail-call-conditional.mir
@@ -48,7 +48,7 @@ body:             |
   ; CHECK-NEXT: %rdi = COPY %rsi
   ; CHECK-NEXT: %rsi = COPY %rax
   ; CHECK-NEXT: CMP64ri8 %rax, 9, implicit-def %eflags
-  ; CHECK-NEXT: TCRETURNdi64cc @f1, 0, 3, csr_64, implicit %rsp, implicit %eflags, implicit %rsp, implicit %rdi, implicit %rsi
+  ; CHECK-NEXT: TCRETURNdi64cc @f1, 0, 3, csr_64, implicit %rsp, implicit %eflags, implicit %rsp, implicit %rdi, implicit %rsi, implicit %rax, implicit-def %rax, implicit %sil, implicit-def %sil, implicit %si, implicit-def %si, implicit %esi, implicit-def %esi, implicit %rsi, implicit-def %rsi, implicit %dil, implicit-def %dil, implicit %di, implicit-def %di, implicit %edi, implicit-def %edi, implicit %rdi, implicit-def %rdi, implicit %ah, implicit-def %ah, implicit %al, implicit-def %al, implicit %ax, implicit-def %ax, implicit %eax, implicit-def %eax
 
   bb.1:
     successors: %bb.2, %bb.3
diff --git a/test/CodeGen/X86/tail-dup-debugloc.ll b/test/CodeGen/X86/tail-dup-debugloc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c5ca6fc5750c378ab132cc29247fb29b44ad55bc
--- /dev/null
+++ b/test/CodeGen/X86/tail-dup-debugloc.ll
@@ -0,0 +1,56 @@
+; RUN: llc -stop-after=tailduplication -march=x86-64 < %s | FileCheck %s
+;
+; Check that DebugLoc attached to the branch instruction of 
+; 'while.cond1.preheader.lr.ph' survives after tailduplication pass.
+;
+; CHECK: [[DLOC:![0-9]+]] = !DILocation(line: 9, column: 5, scope: !{{[0-9]+}})
+; CHECK: [[VREG:%[^ ]+]] = COPY %rdi
+; CHECK: TEST64rr [[VREG]], [[VREG]]
+; CHECK-NEXT: JE_1 {{.+}}, debug-location [[DLOC]]
+; CHECK-NEXT: JMP_1 {{.+}}, debug-location [[DLOC]]
+
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.Node = type { %struct.Node* }
+
+define i32 @foo(%struct.Node* readonly %node, %struct.Node* readnone %root) !dbg !6 {
+entry:
+  %cmp = icmp eq %struct.Node* %node, %root, !dbg !8
+  br i1 %cmp, label %while.end4, label %while.cond1.preheader.lr.ph, !dbg !10
+
+while.cond1.preheader.lr.ph:                      ; preds = %entry
+  %tobool = icmp eq %struct.Node* %node, null
+  br i1 %tobool, label %while.cond1.preheader.us.preheader, label %while.body2.preheader, !dbg !11
+
+while.body2.preheader:                            ; preds = %while.cond1.preheader.lr.ph
+  br label %while.body2, !dbg !11
+
+while.cond1.preheader.us.preheader:               ; preds = %while.cond1.preheader.lr.ph
+  br label %while.cond1.preheader.us, !dbg !10
+
+while.cond1.preheader.us:                         ; preds = %while.cond1.preheader.us.preheader, %while.cond1.preheader.us
+  br label %while.cond1.preheader.us, !dbg !10
+
+while.body2:                                      ; preds = %while.body2.preheader, %while.body2
+  br label %while.body2, !dbg !11
+
+while.end4:                                       ; preds = %entry
+  ret i32 0, !dbg !12
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: true, emissionKind: LineTablesOnly)
+!1 = !DIFile(filename: "foo.c", directory: "b/")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{}
+!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 5, type: !7, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 7, column: 15, scope: !9)
+!9 = !DILexicalBlockFile(scope: !6, file: !1, discriminator: 2)
+!10 = !DILocation(line: 7, column: 3, scope: !9)
+!11 = !DILocation(line: 9, column: 5, scope: !9)
+!12 = !DILocation(line: 14, column: 3, scope: !6)
diff --git a/test/CodeGen/X86/tail-dup-no-other-successor.ll b/test/CodeGen/X86/tail-dup-no-other-successor.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6fa6f94e6530a9707a7833dd62ef1cc5842c9b34
--- /dev/null
+++ b/test/CodeGen/X86/tail-dup-no-other-successor.ll
@@ -0,0 +1,53 @@
+; RUN: llc -O3 -o - %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @effect(i32);
+
+; After the loop gets laid out, loop.end is the only successor, but can't be
+; laid out because of the CFG dependency from top.fakephi. The calculations show
+; that it isn't profitable to tail-duplicate in this case, because of the
+; effects on fallthrough from %loop.end
+; CHECK-LABEL: {{^}}no_successor_still_no_taildup:
+; CHECK: %entry
+; CHECK: %loop.top
+; CHECK: %loop.latch
+; CHECK: %top.fakephi
+; CHECK: %loop.end
+; CHECK: %false
+; CHECK: %ret
+define void @no_successor_still_no_taildup (i32 %count, i32 %key) {
+entry:
+  br label %loop.top
+
+loop.top:
+  %i.loop.top = phi i32 [ %count, %entry ], [ %i.latch, %loop.latch ]
+  %cmp.top = icmp eq i32 %i.loop.top, %key
+  call void @effect(i32 0)
+  br i1 %cmp.top, label %top.fakephi, label %loop.latch, !prof !1
+
+loop.latch:
+  %i.latch = sub i32 %i.loop.top, 1
+  %cmp.latch = icmp eq i32 %i.latch, 0
+  call void @effect(i32 1)
+  br i1 %cmp.top, label %loop.top, label %loop.end, !prof !2
+
+top.fakephi:
+  call void @effect(i32 2)
+  br label %loop.end
+
+loop.end:
+  %cmp.end = icmp eq i32 %count, 0
+  br i1 %cmp.end, label %ret, label %false, !prof !3
+
+false:
+  call void @effect(i32 4)
+  br label %ret
+
+ret:
+  ret void
+}
+
+!1 = !{!"branch_weights", i32 1, i32 1}
+!2 = !{!"branch_weights", i32 5, i32 1}
+!3 = !{!"branch_weights", i32 1, i32 2}
diff --git a/test/CodeGen/X86/tail-dup-repeat.ll b/test/CodeGen/X86/tail-dup-repeat.ll
index 21b48e16efb9eaf44c9159ab698f2e1c0d221d98..7d9c0908e571ccd18d69ff2f457e4536bef70bb9 100644
--- a/test/CodeGen/X86/tail-dup-repeat.ll
+++ b/test/CodeGen/X86/tail-dup-repeat.ll
@@ -1,4 +1,4 @@
-; RUN: llc -O2 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s
+; RUN: llc -O3 -tail-dup-placement-threshold=4 -o - %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/test/CodeGen/X86/tail-merge-debugloc.ll b/test/CodeGen/X86/tail-merge-debugloc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..197b0b8032577f3c44dc733ed05bf8f78091cc1a
--- /dev/null
+++ b/test/CodeGen/X86/tail-merge-debugloc.ll
@@ -0,0 +1,42 @@
+; RUN: llc -stop-after=branch-folder < %s | FileCheck %s
+;
+; bb2 and bb3 in the IR below will be tail-merged into a single basic block.
+; As br instructions in bb2 and bb3 have the same debug location, make sure that
+; the branch instruction in the merged basic block still maintains the debug 
+; location info.
+; 
+; CHECK:      [[DLOC:![0-9]+]] = !DILocation(line: 2, column: 2, scope: !{{[0-9]+}})
+; CHECK:      TEST64rr{{.*}}%rsi, %rsi, implicit-def %eflags
+; CHECK-NEXT: JNE_1{{.*}}, debug-location [[DLOC]]
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @foo(i1 %b, i8* %p) {
+bb1: 
+  br i1 %b, label %bb2, label %bb3
+
+bb2:
+  %a1 = icmp eq i8* %p, null
+  br i1 %a1, label %bb4, label %bb5, !dbg !6
+  
+bb3:
+  %a2 = icmp eq i8* %p, null
+  br i1 %a2, label %bb4, label %bb5, !dbg !6
+
+bb4:
+  ret i32 1
+
+bb5:
+  ret i32 0
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1)
+!1 = !DIFile(filename: "foo.c", directory: "b/")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 3, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!5 = distinct !DILexicalBlock(scope: !4, file: !1, line: 1, column: 1)
+!6 = !DILocation(line: 2, column: 2, scope: !5)
diff --git a/test/CodeGen/X86/tail-merge-identical.ll b/test/CodeGen/X86/tail-merge-identical.ll
new file mode 100644
index 0000000000000000000000000000000000000000..024ad582d03ff086a363e8eea14f6a31fd46860d
--- /dev/null
+++ b/test/CodeGen/X86/tail-merge-identical.ll
@@ -0,0 +1,41 @@
+; RUN: llc -o - -verify-machineinstrs %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@data = external global [3 x i32], align 4
+@store = external global i32, align 4
+
+; %else1 and %then2 end up lowering to identical blocks. These blocks should be
+; merged during tail-merging.
+; CHECK-LABEL: merge_identical_blocks
+; CHECK: movl $data+4
+; CHECK-NOT: movl $data+4
+; CHECK: retq
+define void @merge_identical_blocks(i1 %a, i1 %b) {
+entry:
+  br label %if1
+
+if1:                                              ; preds = %entry
+  br i1 %a, label %else1, label %if2
+
+else1:                                            ; preds = %if1
+  %ptr.else1 = getelementptr inbounds [3 x i32], [3 x i32]* @data, i64 0, i32 1
+  br label %phi_join
+
+if2:                                              ; preds = %if1
+  br i1 %b, label %then2, label %else2
+
+then2:                                            ; preds = %if2
+  %ptr.then2 = getelementptr inbounds [3 x i32], [3 x i32]* @data, i64 0, i32 1
+  br label %phi_join
+
+else2:                                            ; preds = %if2
+  %ptr.else2 = getelementptr inbounds [3 x i32], [3 x i32]* @data, i64 0, i32 2
+  br label %phi_join
+
+phi_join:                                         ; preds = %else1, %then2, %else2
+  %val.ptr = phi i32* [ %ptr.else1, %else1 ], [ %ptr.then2, %then2 ], [ %ptr.else2, %else2 ]
+  %val = load i32, i32* %val.ptr, align 4
+  store i32 %val, i32* @store, align 4
+  ret void
+}
diff --git a/test/CodeGen/X86/tail-opts.ll b/test/CodeGen/X86/tail-opts.ll
index 12c90c1a5fa97ce86c21b9adc52562dbb6f1f2e1..96ff33ff5f7d79d19f0c670680314a9c37e4dbe4 100644
--- a/test/CodeGen/X86/tail-opts.ll
+++ b/test/CodeGen/X86/tail-opts.ll
@@ -113,16 +113,15 @@ altret:
 ; CHECK-NEXT:   jbe .LBB2_3
 ; CHECK-NEXT:   ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
 ; CHECK-NEXT:   ja .LBB2_4
-; CHECK-NEXT:   jmp .LBB2_2
+; CHECK-NEXT: .LBB2_2:
+; CHECK-NEXT:   movb $1, %al
+; CHECK-NEXT:   ret
 ; CHECK-NEXT: .LBB2_3:
 ; CHECK-NEXT:   ucomiss %xmm{{[0-2]}}, %xmm{{[0-2]}}
 ; CHECK-NEXT:   jbe .LBB2_2
 ; CHECK-NEXT: .LBB2_4:
 ; CHECK-NEXT:   xorl %eax, %eax
 ; CHECK-NEXT:   ret
-; CHECK-NEXT: .LBB2_2:
-; CHECK-NEXT:   movb $1, %al
-; CHECK-NEXT:   ret
 
 define i1 @dont_merge_oddly(float* %result) nounwind {
 entry:
@@ -299,33 +298,35 @@ declare void @func()
 ; one - One instruction may be tail-duplicated even with optsize.
 
 ; CHECK-LABEL: one:
-; CHECK: movl $0, XYZ(%rip)
-; CHECK: movl $0, XYZ(%rip)
+; CHECK: j{{.*}} tail_call_me
+; CHECK: j{{.*}} tail_call_me
 
 @XYZ = external global i32
 
-define void @one() nounwind optsize {
+declare void @tail_call_me()
+
+define void @one(i32 %v) nounwind optsize {
 entry:
-  %0 = icmp eq i32 undef, 0
+  %0 = icmp eq i32 %v, 0
   br i1 %0, label %bbx, label %bby
 
 bby:
-  switch i32 undef, label %bb7 [
+  switch i32 %v, label %bb7 [
     i32 16, label %return
   ]
 
 bb7:
-  store volatile i32 0, i32* @XYZ
-  unreachable
+  tail call void @tail_call_me()
+  ret void
 
 bbx:
-  switch i32 undef, label %bb12 [
+  switch i32 %v, label %bb12 [
     i32 128, label %return
   ]
 
 bb12:
-  store volatile i32 0, i32* @XYZ
-  unreachable
+  tail call void @tail_call_me()
+  ret void
 
 return:
   ret void
@@ -414,9 +415,9 @@ return:
 
 ; CHECK-LABEL: two_nosize:
 ; CHECK: movl $0, XYZ(%rip)
-; CHECK: movl $1, XYZ(%rip)
+; CHECK: jmp tail_call_me
 ; CHECK: movl $0, XYZ(%rip)
-; CHECK: movl $1, XYZ(%rip)
+; CHECK: jmp tail_call_me
 
 define void @two_nosize() nounwind {
 entry:
@@ -430,8 +431,8 @@ bby:
 
 bb7:
   store volatile i32 0, i32* @XYZ
-  store volatile i32 1, i32* @XYZ
-  unreachable
+  tail call void @tail_call_me()
+  ret void
 
 bbx:
   switch i32 undef, label %bb12 [
@@ -440,8 +441,8 @@ bbx:
 
 bb12:
   store volatile i32 0, i32* @XYZ
-  store volatile i32 1, i32* @XYZ
-  unreachable
+  tail call void @tail_call_me()
+  ret void
 
 return:
   ret void
@@ -469,3 +470,88 @@ bb.nph:                                           ; preds = %entry
 for.end:                                          ; preds = %entry
   ret i64 %varx.0
 }
+
+; We should tail merge small blocks that don't end in a tail call or return
+; instruction. Those blocks are typically unreachable and will be placed
+; out-of-line after the main return, so we should try to eliminate as many of
+; them as possible.
+
+; CHECK-LABEL: merge_aborts:
+; CHECK-NOT: callq abort
+; CHECK: ret
+; CHECK: callq abort
+; CHECK-NOT: callq abort
+; CHECK: .Lfunc_end{{.*}}:
+
+declare void @abort()
+define void @merge_aborts() {
+entry:
+  %c1 = call i1 @qux()
+  br i1 %c1, label %cont1, label %abort1
+abort1:
+  call void @abort()
+  unreachable
+cont1:
+  %c2 = call i1 @qux()
+  br i1 %c2, label %cont2, label %abort2
+abort2:
+  call void @abort()
+  unreachable
+cont2:
+  %c3 = call i1 @qux()
+  br i1 %c3, label %cont3, label %abort3
+abort3:
+  call void @abort()
+  unreachable
+cont3:
+  %c4 = call i1 @qux()
+  br i1 %c4, label %cont4, label %abort4
+abort4:
+  call void @abort()
+  unreachable
+cont4:
+  ret void
+}
+
+; Use alternating abort functions so that the blocks we wish to merge are not
+; layout successors during branch folding.
+
+; CHECK-LABEL: merge_alternating_aborts:
+; CHECK-NOT: callq abort
+; CHECK: ret
+; CHECK: callq abort
+; CHECK: callq alt_abort
+; CHECK-NOT: callq abort
+; CHECK-NOT: callq alt_abort
+; CHECK: .Lfunc_end{{.*}}:
+
+declare void @alt_abort()
+
+define void @merge_alternating_aborts() {
+entry:
+  %c1 = call i1 @qux()
+  br i1 %c1, label %cont1, label %abort1
+abort1:
+  call void @abort()
+  unreachable
+cont1:
+  %c2 = call i1 @qux()
+  br i1 %c2, label %cont2, label %abort2
+abort2:
+  call void @alt_abort()
+  unreachable
+cont2:
+  %c3 = call i1 @qux()
+  br i1 %c3, label %cont3, label %abort3
+abort3:
+  call void @abort()
+  unreachable
+cont3:
+  %c4 = call i1 @qux()
+  br i1 %c4, label %cont4, label %abort4
+abort4:
+  call void @alt_abort()
+  unreachable
+cont4:
+  ret void
+}
diff --git a/test/CodeGen/X86/twoaddr-coalesce-3.ll b/test/CodeGen/X86/twoaddr-coalesce-3.ll
index 33c9d46f13c3ade6be91d48554d6eb1e521902e3..f5a7326c970c185938f407f03168a2c255fe9a96 100644
--- a/test/CodeGen/X86/twoaddr-coalesce-3.ll
+++ b/test/CodeGen/X86/twoaddr-coalesce-3.ll
@@ -19,7 +19,7 @@ for.body.lr.ph:                                   ; preds = %entry
 
 ; Check that only one mov will be generated in the kernel loop.
 ; CHECK-LABEL: foo:
-; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body
+; CHECK: [[LOOP1:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
 ; CHECK-NOT: mov
 ; CHECK: movl {{.*}}, [[REG1:%[a-z0-9]+]]
 ; CHECK-NOT: mov
@@ -56,7 +56,7 @@ for.body.lr.ph:                                   ; preds = %entry
 
 ; Check that only two mov will be generated in the kernel loop.
 ; CHECK-LABEL: goo:
-; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body
+; CHECK: [[LOOP2:^[a-zA-Z0-9_.]+]]: {{#.*}} %for.body{{$}}
 ; CHECK-NOT: mov
 ; CHECK: movl {{.*}}, [[REG2:%[a-z0-9]+]]
 ; CHECK-NOT: mov
diff --git a/test/CodeGen/X86/unaligned-32-byte-memops.ll b/test/CodeGen/X86/unaligned-32-byte-memops.ll
index b9deb058cb3f1e55df50a72936874292ec9db3f4..391f7a38a379467e62b6402179e8e3084a2cca43 100644
--- a/test/CodeGen/X86/unaligned-32-byte-memops.ll
+++ b/test/CodeGen/X86/unaligned-32-byte-memops.ll
@@ -254,7 +254,7 @@ define <32 x i8> @combine_16_byte_loads_i8(<16 x i8>* %ptr, <32 x i8> %x) {
 define <4 x double> @combine_16_byte_loads_double(<2 x double>* %ptr, <4 x double> %x) {
 ; AVXSLOW-LABEL: combine_16_byte_loads_double:
 ; AVXSLOW:       # BB#0:
-; AVXSLOW-NEXT:    vmovupd 144(%rdi), %xmm1
+; AVXSLOW-NEXT:    vmovups 144(%rdi), %xmm1
 ; AVXSLOW-NEXT:    vinsertf128 $1, 160(%rdi), %ymm1, %ymm1
 ; AVXSLOW-NEXT:    vaddpd %ymm0, %ymm1, %ymm0
 ; AVXSLOW-NEXT:    retq
diff --git a/test/CodeGen/X86/unused_stackslots.ll b/test/CodeGen/X86/unused_stackslots.ll
index 0bb904130f1c9863f60844434b8581bd38b68246..82fd3db1ccb927ebe2dd4348d3e963552b95cb34 100644
--- a/test/CodeGen/X86/unused_stackslots.ll
+++ b/test/CodeGen/X86/unused_stackslots.ll
@@ -24,7 +24,7 @@ define i32 @fn() #0 {
 entry:
   %n = alloca [8 x [8 x i32]], align 16
   %tmp = bitcast [8 x [8 x i32]]* %n to i8*
-  call void @llvm.lifetime.start(i64 256, i8* %tmp) #3
+  call void @llvm.lifetime.start.p0i8(i64 256, i8* %tmp) #3
   %tmp1 = bitcast [8 x [8 x i32]]* %n to i8*
   %arraydecay.1 = getelementptr inbounds [8 x [8 x i32]], [8 x [8 x i32]]* %n, i64 0, i64 1, i64 0
   %tmp2 = bitcast i32* %arraydecay.1 to i8*
@@ -222,12 +222,12 @@ for.inc73:                                        ; preds = %for.body61.preheade
 
 for.end75:                                        ; preds = %for.inc73
   %m.4.lcssa = phi i32 [ %m.4, %for.inc73 ]
-  call void @llvm.lifetime.end(i64 256, i8* %tmp) #3
+  call void @llvm.lifetime.end.p0i8(i64 256, i8* %tmp) #3
   ret i32 %m.4.lcssa
 }
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 
 declare void @LumaPrediction4x4(i32, i32, i32, i32, i32, i16 signext, i16 signext) #2
 
@@ -237,7 +237,7 @@ declare i32 @distortion4x4(i32*) #2
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 attributes #0 = { nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { argmemonly nounwind }
diff --git a/test/CodeGen/X86/unwindraise.ll b/test/CodeGen/X86/unwindraise.ll
index fb8319b63c2c205cc6a2a7b7b523eccb89774931..db39f4ed45592abdcbfd0bd1a7a2ac255bc77437 100644
--- a/test/CodeGen/X86/unwindraise.ll
+++ b/test/CodeGen/X86/unwindraise.ll
@@ -123,7 +123,7 @@ while.end:                                        ; preds = %if.then4
   store i64 %16, i64* %private_2, align 8
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %2, i8* %3, i64 240, i32 8, i1 false)
   %17 = bitcast %struct._Unwind_FrameState* %fs.i to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %17)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %17)
   %personality.i = getelementptr inbounds %struct._Unwind_FrameState, %struct._Unwind_FrameState* %fs.i, i64 0, i32 6
   %retaddr_column.i22 = getelementptr inbounds %struct._Unwind_FrameState, %struct._Unwind_FrameState* %fs.i, i64 0, i32 9
   br label %while.body.i
@@ -211,7 +211,7 @@ uw_update_context.exit44:                         ; preds = %if.then10.i.i.i40,
   br label %while.body.i
 
 do.body19:                                        ; preds = %if.then3.i
-  call void @llvm.lifetime.end(i64 -1, i8* %17)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %17)
   %call20 = call fastcc i64 @uw_install_context_1(%struct._Unwind_Context* %this_context, %struct._Unwind_Context* %cur_context)
   %32 = load i8*, i8** %ra.i, align 8
   call void @llvm.eh.return.i64(i64 %call20, i8* %32)
@@ -242,6 +242,6 @@ declare void @llvm.eh.return.i64(i64, i8*) nounwind
 
 declare fastcc void @uw_update_context_1(%struct._Unwind_Context*, %struct._Unwind_FrameState* nocapture) uwtable
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
 
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
diff --git a/test/CodeGen/X86/update-terminator-debugloc.ll b/test/CodeGen/X86/update-terminator-debugloc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..359c348b42cbead8b6c66b0883abaf7f95d74450
--- /dev/null
+++ b/test/CodeGen/X86/update-terminator-debugloc.ll
@@ -0,0 +1,91 @@
+; RUN: llc -stop-after=machine-sink -march=x86-64 < %s | FileCheck %s
+;
+; test code:
+;  1 extern int bar(int x);
+;  2
+;  3 int foo(int *begin, int *end) {
+;  4   int *i;
+;  5   int ret = 0;
+;  6   for (
+;  7       i = begin ;
+;  8       i != end ;
+;  9       i++)
+; 10   {
+; 11       ret += bar(*i);
+; 12   }
+; 13   return ret;
+; 14 }
+; 
+; With the test code, LLVM-IR below shows that loop-control branches have a 
+; debug location of line 6 (branches in entry and for.body block). Make sure that
+; these debug locations are propagated correctly to lowered instructions.
+;
+; CHECK: [[DLOC:![0-9]+]] = !DILocation(line: 6
+; CHECK-DAG: [[VREG1:%[^ ]+]] = COPY %rsi
+; CHECK-DAG: [[VREG2:%[^ ]+]] = COPY %rdi
+; CHECK: SUB64rr [[VREG2]], [[VREG1]]
+; CHECK-NEXT: JNE_1 {{.*}}, debug-location [[DLOC]]{{$}}
+; CHECK: [[VREG3:%[^ ]+]] = PHI [[VREG2]]
+; CHECK: [[VREG4:%[^ ]+]] = ADD64ri8 [[VREG3]], 4
+; CHECK: SUB64rr [[VREG1]], [[VREG4]]
+; CHECK-NEXT: JNE_1 {{.*}}, debug-location [[DLOC]]{{$}}
+; CHECK-NEXT: JMP_1 {{.*}}, debug-location [[DLOC]]{{$}}
+
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @foo(i32* readonly %begin, i32* readnone %end) !dbg !4 {
+entry:
+  %cmp6 = icmp eq i32* %begin, %end, !dbg !9
+  br i1 %cmp6, label %for.end, label %for.body.preheader, !dbg !12
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body, !dbg !13
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %ret.08 = phi i32 [ %add, %for.body ], [ 0, %for.body.preheader ]
+  %i.07 = phi i32* [ %incdec.ptr, %for.body ], [ %begin, %for.body.preheader ]
+  %0 = load i32, i32* %i.07, align 4, !dbg !13, !tbaa !15
+  %call = tail call i32 @bar(i32 %0), !dbg !19
+  %add = add nsw i32 %call, %ret.08, !dbg !20
+  %incdec.ptr = getelementptr inbounds i32, i32* %i.07, i64 1, !dbg !21
+  %cmp = icmp eq i32* %incdec.ptr, %end, !dbg !9
+  br i1 %cmp, label %for.end.loopexit, label %for.body, !dbg !12, !llvm.loop !22
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end, !dbg !24
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  %ret.0.lcssa = phi i32 [ 0, %entry ], [ %add, %for.end.loopexit ]
+  ret i32 %ret.0.lcssa, !dbg !24
+}
+
+declare i32 @bar(i32)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1)
+!1 = !DIFile(filename: "foo.c", directory: "b/")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!5 = !DISubroutineType(types: !6)
+!6 = !{!7, !8, !8}
+!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64)
+!9 = !DILocation(line: 8, column: 9, scope: !10)
+!10 = distinct !DILexicalBlock(scope: !11, file: !1, line: 6, column: 3)
+!11 = distinct !DILexicalBlock(scope: !4, file: !1, line: 6, column: 3)
+!12 = !DILocation(line: 6, column: 3, scope: !11)
+!13 = !DILocation(line: 11, column: 18, scope: !14)
+!14 = distinct !DILexicalBlock(scope: !10, file: !1, line: 10, column: 3)
+!15 = !{!16, !16, i64 0}
+!16 = !{!"int", !17, i64 0}
+!17 = !{!"omnipotent char", !18, i64 0}
+!18 = !{!"Simple C/C++ TBAA"}
+!19 = !DILocation(line: 11, column: 14, scope: !14)
+!20 = !DILocation(line: 11, column: 11, scope: !14)
+!21 = !DILocation(line: 9, column: 8, scope: !10)
+!22 = distinct !{!22, !12, !23}
+!23 = !DILocation(line: 12, column: 3, scope: !11)
+!24 = !DILocation(line: 13, column: 3, scope: !4)
diff --git a/test/CodeGen/X86/vec_cast2.ll b/test/CodeGen/X86/vec_cast2.ll
index dda50b7b94b7fabb108760ee1cc41395ef11cd46..c03b330b88e09ecf01e75a5213335bcf8bc08268 100644
--- a/test/CodeGen/X86/vec_cast2.ll
+++ b/test/CodeGen/X86/vec_cast2.ll
@@ -48,10 +48,10 @@ define <8 x float> @foo2_8(<8 x i8> %src) {
 ; CHECK-LABEL: foo2_8:
 ; CHECK:       ## BB#0:
 ; CHECK-NEXT:    vpand LCPI2_0, %xmm0, %xmm0
-; CHECK-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; CHECK-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; CHECK-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; CHECK-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; CHECK-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; CHECK-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; CHECK-NEXT:    retl
 ;
@@ -97,10 +97,10 @@ define <8 x i8> @foo3_8(<8 x float> %src) {
 ;
 ; CHECK-WIDE-LABEL: foo3_8:
 ; CHECK-WIDE:       ## BB#0:
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
-; CHECK-WIDE-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
-; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
+; CHECK-WIDE-NEXT:    vmovd %ecx, %xmm1
 ; CHECK-WIDE-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
 ; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
@@ -134,10 +134,10 @@ define <4 x i8> @foo3_4(<4 x float> %src) {
 ;
 ; CHECK-WIDE-LABEL: foo3_4:
 ; CHECK-WIDE:       ## BB#0:
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %eax
-; CHECK-WIDE-NEXT:    vpinsrb $0, %eax, %xmm0, %xmm1
-; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
+; CHECK-WIDE-NEXT:    vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm1, %eax
+; CHECK-WIDE-NEXT:    vcvttss2si %xmm0, %ecx
+; CHECK-WIDE-NEXT:    vmovd %ecx, %xmm1
 ; CHECK-WIDE-NEXT:    vpinsrb $1, %eax, %xmm1, %xmm1
 ; CHECK-WIDE-NEXT:    vpermilpd {{.*#+}} xmm2 = xmm0[1,0]
 ; CHECK-WIDE-NEXT:    vcvttss2si %xmm2, %eax
diff --git a/test/CodeGen/X86/vec_extract-mmx.ll b/test/CodeGen/X86/vec_extract-mmx.ll
index ed957728aeff4c62ae706fc38446fc7cf91eb21e..a137d052d2967fada174a6fd1544f677c2cab1d6 100644
--- a/test/CodeGen/X86/vec_extract-mmx.ll
+++ b/test/CodeGen/X86/vec_extract-mmx.ll
@@ -8,17 +8,14 @@ define i32 @test0(<1 x i64>* %v4) nounwind {
 ; X32-NEXT:    pushl %ebp
 ; X32-NEXT:    movl %esp, %ebp
 ; X32-NEXT:    andl $-8, %esp
-; X32-NEXT:    subl $24, %esp
+; X32-NEXT:    subl $8, %esp
 ; X32-NEXT:    movl 8(%ebp), %eax
 ; X32-NEXT:    movl (%eax), %ecx
 ; X32-NEXT:    movl 4(%eax), %eax
 ; X32-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X32-NEXT:    movl %ecx, (%esp)
 ; X32-NEXT:    pshufw $238, (%esp), %mm0 # mm0 = mem[2,3,2,3]
-; X32-NEXT:    movq %mm0, {{[0-9]+}}(%esp)
-; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X32-NEXT:    movd %xmm0, %eax
+; X32-NEXT:    movd %mm0, %eax
 ; X32-NEXT:    addl $32, %eax
 ; X32-NEXT:    movl %ebp, %esp
 ; X32-NEXT:    popl %ebp
@@ -47,20 +44,11 @@ entry:
 define i32 @test1(i32* nocapture readonly %ptr) nounwind {
 ; X32-LABEL: test1:
 ; X32:       # BB#0: # %entry
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    andl $-8, %esp
-; X32-NEXT:    subl $16, %esp
-; X32-NEXT:    movl 8(%ebp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    movd (%eax), %mm0
 ; X32-NEXT:    pshufw $232, %mm0, %mm0 # mm0 = mm0[0,2,2,3]
-; X32-NEXT:    movq %mm0, (%esp)
-; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X32-NEXT:    movd %xmm0, %eax
+; X32-NEXT:    movd %mm0, %eax
 ; X32-NEXT:    emms
-; X32-NEXT:    movl %ebp, %esp
-; X32-NEXT:    popl %ebp
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test1:
@@ -91,19 +79,10 @@ entry:
 define i32 @test2(i32* nocapture readonly %ptr) nounwind {
 ; X32-LABEL: test2:
 ; X32:       # BB#0: # %entry
-; X32-NEXT:    pushl %ebp
-; X32-NEXT:    movl %esp, %ebp
-; X32-NEXT:    andl $-8, %esp
-; X32-NEXT:    subl $16, %esp
-; X32-NEXT:    movl 8(%ebp), %eax
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    pshufw $232, (%eax), %mm0 # mm0 = mem[0,2,2,3]
-; X32-NEXT:    movq %mm0, (%esp)
-; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,1,1,3]
-; X32-NEXT:    movd %xmm0, %eax
+; X32-NEXT:    movd %mm0, %eax
 ; X32-NEXT:    emms
-; X32-NEXT:    movl %ebp, %esp
-; X32-NEXT:    popl %ebp
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test2:
@@ -150,7 +129,7 @@ define i32 @test4(x86_mmx %a) nounwind {
 ; X32-NEXT:    subl $8, %esp
 ; X32-NEXT:    movq %mm0, (%esp)
 ; X32-NEXT:    movsd {{.*#+}} xmm0 = mem[0],zero
-; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3,0,1]
+; X32-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,1,0,1]
 ; X32-NEXT:    movd %xmm0, %eax
 ; X32-NEXT:    movl %ebp, %esp
 ; X32-NEXT:    popl %ebp
diff --git a/test/CodeGen/X86/vec_fp_to_int.ll b/test/CodeGen/X86/vec_fp_to_int.ll
index 2ad20a89cf26b51fc06d1432f5bf2019c64c848c..a345f78e18c13f6ff6cfe13553bc15b8962b959e 100644
--- a/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/test/CodeGen/X86/vec_fp_to_int.ll
@@ -63,6 +63,7 @@ define <2 x i64> @fptosi_2f64_to_2i64(<2 x double> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvttpd2qq %zmm0, %zmm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: fptosi_2f64_to_2i64:
@@ -112,18 +113,12 @@ define <4 x i32> @fptosi_4f64_to_2i32(<2 x double> %a) {
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT:    retq
 ;
-; VEX-LABEL: fptosi_4f64_to_2i32:
-; VEX:       # BB#0:
-; VEX-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; VEX-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; VEX-NEXT:    vzeroupper
-; VEX-NEXT:    retq
-;
-; AVX512-LABEL: fptosi_4f64_to_2i32:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
-; AVX512-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: fptosi_4f64_to_2i32:
+; AVX:       # BB#0:
+; AVX-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
+; AVX-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
   %ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %cvt = fptosi <4 x double> %ext to <4 x i32>
   ret <4 x i32> %cvt
@@ -243,16 +238,11 @@ define <4 x i32> @fptosi_4f64_to_4i32(<4 x double> %a) {
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT:    retq
 ;
-; VEX-LABEL: fptosi_4f64_to_4i32:
-; VEX:       # BB#0:
-; VEX-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; VEX-NEXT:    vzeroupper
-; VEX-NEXT:    retq
-;
-; AVX512-LABEL: fptosi_4f64_to_4i32:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vcvttpd2dq %ymm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX-LABEL: fptosi_4f64_to_4i32:
+; AVX:       # BB#0:
+; AVX-NEXT:    vcvttpd2dq %ymm0, %xmm0
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
   %cvt = fptosi <4 x double> %a to <4 x i32>
   ret <4 x i32> %cvt
 }
@@ -334,6 +324,7 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvttpd2uqq %zmm0, %zmm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: fptoui_2f64_to_2i64:
@@ -400,6 +391,7 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512F-NEXT:    vcvttpd2udq %zmm0, %ymm0
 ; AVX512F-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: fptoui_2f64_to_4i32:
@@ -412,6 +404,7 @@ define <4 x i32> @fptoui_2f64_to_4i32(<2 x double> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvttpd2udq %zmm0, %ymm0
 ; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: fptoui_2f64_to_4i32:
@@ -477,6 +470,7 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) {
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512F-NEXT:    vcvttpd2udq %zmm0, %ymm0
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: fptoui_2f64_to_2i32:
@@ -489,6 +483,7 @@ define <4 x i32> @fptoui_2f64_to_2i32(<2 x double> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvttpd2udq %zmm0, %ymm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: fptoui_2f64_to_2i32:
@@ -550,12 +545,14 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512F-NEXT:    vcvttpd2udq %zmm0, %ymm0
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: fptoui_4f64_to_2i32:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; AVX512VL-NEXT:    vcvttpd2udq %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: fptoui_4f64_to_2i32:
@@ -563,12 +560,14 @@ define <4 x i32> @fptoui_4f64_to_2i32(<2 x double> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvttpd2udq %zmm0, %ymm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: fptoui_4f64_to_2i32:
 ; AVX512VLDQ:       # BB#0:
 ; AVX512VLDQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; AVX512VLDQ-NEXT:    vcvttpd2udq %ymm0, %xmm0
+; AVX512VLDQ-NEXT:    vzeroupper
 ; AVX512VLDQ-NEXT:    retq
   %ext = shufflevector <2 x double> %a, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %cvt = fptoui <4 x double> %ext to <4 x i32>
@@ -816,11 +815,13 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
 ; AVX512F-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512F-NEXT:    vcvttpd2udq %zmm0, %ymm0
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: fptoui_4f64_to_4i32:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vcvttpd2udq %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: fptoui_4f64_to_4i32:
@@ -828,11 +829,13 @@ define <4 x i32> @fptoui_4f64_to_4i32(<4 x double> %a) {
 ; AVX512DQ-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvttpd2udq %zmm0, %ymm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: fptoui_4f64_to_4i32:
 ; AVX512VLDQ:       # BB#0:
 ; AVX512VLDQ-NEXT:    vcvttpd2udq %ymm0, %xmm0
+; AVX512VLDQ-NEXT:    vzeroupper
 ; AVX512VLDQ-NEXT:    retq
   %cvt = fptoui <4 x double> %a to <4 x i32>
   ret <4 x i32> %cvt
@@ -980,12 +983,14 @@ define <2 x i64> @fptosi_4f32_to_2i64(<4 x float> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; AVX512DQ-NEXT:    vcvttps2qq %ymm0, %zmm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: fptosi_4f32_to_2i64:
 ; AVX512VLDQ:       # BB#0:
 ; AVX512VLDQ-NEXT:    vcvttps2qq %xmm0, %ymm0
 ; AVX512VLDQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VLDQ-NEXT:    vzeroupper
 ; AVX512VLDQ-NEXT:    retq
   %cvt = fptosi <4 x float> %a to <4 x i64>
   %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
@@ -1281,6 +1286,7 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512F-NEXT:    vcvttps2udq %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: fptoui_2f32_to_2i32:
@@ -1294,6 +1300,7 @@ define <2 x i32> @fptoui_2f32_to_2i32(<2 x float> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvttps2udq %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: fptoui_2f32_to_2i32:
@@ -1347,6 +1354,7 @@ define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512F-NEXT:    vcvttps2udq %zmm0, %zmm0
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: fptoui_4f32_to_4i32:
@@ -1359,6 +1367,7 @@ define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvttps2udq %zmm0, %zmm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: fptoui_4f32_to_4i32:
@@ -1529,12 +1538,14 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; AVX512DQ-NEXT:    vcvttps2uqq %ymm0, %zmm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: fptoui_4f32_to_2i64:
 ; AVX512VLDQ:       # BB#0:
 ; AVX512VLDQ-NEXT:    vcvttps2uqq %xmm0, %ymm0
 ; AVX512VLDQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VLDQ-NEXT:    vzeroupper
 ; AVX512VLDQ-NEXT:    retq
   %cvt = fptoui <4 x float> %a to <4 x i64>
   %shuf = shufflevector <4 x i64> %cvt, <4 x i64> undef, <2 x i32> <i32 0, i32 1>
@@ -2291,6 +2302,7 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
 ; AVX512F-NEXT:    vmovq %rax, %xmm0
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: fptosi_2f16_to_4i32:
@@ -2321,6 +2333,7 @@ define <4 x i32> @fptosi_2f16_to_4i32(<2 x half> %a) nounwind {
 ; AVX512DQ-NEXT:    vmovq %rax, %xmm0
 ; AVX512DQ-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512DQ-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,2],zero,zero
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: fptosi_2f16_to_4i32:
diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll
index 9d9434cb52239a4d63a6b2bc6e288e1a2124c89d..609ed0882092a7cfbb07b269d77cb8b5caacf304 100644
--- a/test/CodeGen/X86/vec_fpext.ll
+++ b/test/CodeGen/X86/vec_fpext.ll
@@ -82,6 +82,7 @@ define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) {
 ; X32-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 ; X32-AVX512VL-NEXT:    vcvtps2pd (%ecx), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0x01]
 ; X32-AVX512VL-NEXT:    vmovups %ymm0, (%eax) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x00]
+; X32-AVX512VL-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X32-AVX512VL-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: fpext_frommem4:
@@ -103,6 +104,7 @@ define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) {
 ; X64-AVX512VL:       # BB#0: # %entry
 ; X64-AVX512VL-NEXT:    vcvtps2pd (%rdi), %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x5a,0x07]
 ; X64-AVX512VL-NEXT:    vmovups %ymm0, (%rsi) # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x11,0x06]
+; X64-AVX512VL-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-AVX512VL-NEXT:    retq # encoding: [0xc3]
 entry:
   %0 = load <4 x float>, <4 x float>* %in
@@ -143,6 +145,7 @@ define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) {
 ; X32-AVX512VL-NEXT:    movl {{[0-9]+}}(%esp), %ecx # encoding: [0x8b,0x4c,0x24,0x04]
 ; X32-AVX512VL-NEXT:    vcvtps2pd (%ecx), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x5a,0x01]
 ; X32-AVX512VL-NEXT:    vmovups %zmm0, (%eax) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x00]
+; X32-AVX512VL-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X32-AVX512VL-NEXT:    retl # encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: fpext_frommem8:
@@ -170,6 +173,7 @@ define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) {
 ; X64-AVX512VL:       # BB#0: # %entry
 ; X64-AVX512VL-NEXT:    vcvtps2pd (%rdi), %zmm0 # encoding: [0x62,0xf1,0x7c,0x48,0x5a,0x07]
 ; X64-AVX512VL-NEXT:    vmovups %zmm0, (%rsi) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x06]
+; X64-AVX512VL-NEXT:    vzeroupper # encoding: [0xc5,0xf8,0x77]
 ; X64-AVX512VL-NEXT:    retq # encoding: [0xc3]
 entry:
   %0 = load <8 x float>, <8 x float>* %in
diff --git a/test/CodeGen/X86/vec_fptrunc.ll b/test/CodeGen/X86/vec_fptrunc.ll
index 841ac8a44dabebcad936842a6906648f4de82640..e6a0d52c5ae8df14cc58588f98b8046633c7d299 100644
--- a/test/CodeGen/X86/vec_fptrunc.ll
+++ b/test/CodeGen/X86/vec_fptrunc.ll
@@ -102,7 +102,7 @@ define void @fptrunc_frommem8(<8 x double>* %in, <8 x float>* %out) {
 ; X32-AVX-NEXT:    vcvtpd2psy (%ecx), %xmm0
 ; X32-AVX-NEXT:    vcvtpd2psy 32(%ecx), %xmm1
 ; X32-AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X32-AVX-NEXT:    vmovupd %ymm0, (%eax)
+; X32-AVX-NEXT:    vmovups %ymm0, (%eax)
 ; X32-AVX-NEXT:    vzeroupper
 ; X32-AVX-NEXT:    retl
 ;
@@ -123,7 +123,7 @@ define void @fptrunc_frommem8(<8 x double>* %in, <8 x float>* %out) {
 ; X64-AVX-NEXT:    vcvtpd2psy (%rdi), %xmm0
 ; X64-AVX-NEXT:    vcvtpd2psy 32(%rdi), %xmm1
 ; X64-AVX-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; X64-AVX-NEXT:    vmovupd %ymm0, (%rsi)
+; X64-AVX-NEXT:    vmovups %ymm0, (%rsi)
 ; X64-AVX-NEXT:    vzeroupper
 ; X64-AVX-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll
index 923af1216d05817911cc6734691ba1fcbefb491e..649b45712f57837e43279cf5ca77bed42c6fee4b 100644
--- a/test/CodeGen/X86/vec_int_to_fp.ll
+++ b/test/CodeGen/X86/vec_int_to_fp.ll
@@ -61,6 +61,7 @@ define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: sitofp_2i64_to_2f64:
@@ -92,18 +93,12 @@ define <2 x double> @sitofp_4i32_to_2f64(<4 x i32> %a) {
 ; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
-; VEX-LABEL: sitofp_4i32_to_2f64:
-; VEX:       # BB#0:
-; VEX-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; VEX-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; VEX-NEXT:    vzeroupper
-; VEX-NEXT:    retq
-;
-; AVX512-LABEL: sitofp_4i32_to_2f64:
-; AVX512:       # BB#0:
-; AVX512-NEXT:    vcvtdq2pd %xmm0, %ymm0
-; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
-; AVX512-NEXT:    retq
+; AVX-LABEL: sitofp_4i32_to_2f64:
+; AVX:       # BB#0:
+; AVX-NEXT:    vcvtdq2pd %xmm0, %ymm0
+; AVX-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
   %cvt = sitofp <4 x i32> %a to <4 x double>
   %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
   ret <2 x double> %shuf
@@ -156,6 +151,7 @@ define <2 x double> @sitofp_8i16_to_2f64(<8 x i16> %a) {
 ; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %cvt = sitofp <8 x i16> %a to <8 x double>
   %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -211,6 +207,7 @@ define <2 x double> @sitofp_16i8_to_2f64(<16 x i8> %a) {
 ; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %cvt = sitofp <16 x i8> %a to <16 x double>
   %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -498,6 +495,7 @@ define <2 x double> @uitofp_2i64_to_2f64(<2 x i64> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: uitofp_2i64_to_2f64:
@@ -536,6 +534,7 @@ define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_2i32_to_2f64:
@@ -548,6 +547,7 @@ define <2 x double> @uitofp_2i32_to_2f64(<4 x i32> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: uitofp_2i32_to_2f64:
@@ -603,12 +603,14 @@ define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_4i32_to_2f64:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vcvtudq2pd %xmm0, %ymm0
 ; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: uitofp_4i32_to_2f64:
@@ -616,12 +618,14 @@ define <2 x double> @uitofp_4i32_to_2f64(<4 x i32> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: uitofp_4i32_to_2f64:
 ; AVX512VLDQ:       # BB#0:
 ; AVX512VLDQ-NEXT:    vcvtudq2pd %xmm0, %ymm0
 ; AVX512VLDQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VLDQ-NEXT:    vzeroupper
 ; AVX512VLDQ-NEXT:    retq
   %cvt = uitofp <4 x i32> %a to <4 x double>
   %shuf = shufflevector <4 x double> %cvt, <4 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -675,6 +679,7 @@ define <2 x double> @uitofp_8i16_to_2f64(<8 x i16> %a) {
 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %cvt = uitofp <8 x i16> %a to <8 x double>
   %shuf = shufflevector <8 x double> %cvt, <8 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -730,6 +735,7 @@ define <2 x double> @uitofp_16i8_to_2f64(<16 x i8> %a) {
 ; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512-NEXT:    vcvtdq2pd %ymm0, %zmm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %cvt = uitofp <16 x i8> %a to <16 x double>
   %shuf = shufflevector <16 x double> %cvt, <16 x double> undef, <2 x i32> <i32 0, i32 1>
@@ -1089,6 +1095,7 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32:
@@ -1147,6 +1154,7 @@ define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
 ; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: sitofp_2i64_to_4f32_zero:
@@ -1212,12 +1220,14 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32_undef:
 ; AVX512VLDQ:       # BB#0:
 ; AVX512VLDQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; AVX512VLDQ-NEXT:    vcvtqq2ps %ymm0, %xmm0
+; AVX512VLDQ-NEXT:    vzeroupper
 ; AVX512VLDQ-NEXT:    retq
   %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %cvt = sitofp <4 x i64> %ext to <4 x float>
@@ -1288,6 +1298,7 @@ define <4 x float> @sitofp_8i16_to_4f32(<8 x i16> %a) {
 ; AVX512-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %cvt = sitofp <8 x i16> %a to <8 x float>
   %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1346,6 +1357,7 @@ define <4 x float> @sitofp_16i8_to_4f32(<16 x i8> %a) {
 ; AVX512-NEXT:    vpmovsxbd %xmm0, %zmm0
 ; AVX512-NEXT:    vcvtdq2ps %zmm0, %zmm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %cvt = sitofp <16 x i8> %a to <16 x float>
   %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -1421,6 +1433,7 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: sitofp_4i64_to_4f32:
@@ -1437,6 +1450,7 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: sitofp_4i64_to_4f32:
@@ -1444,11 +1458,13 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) {
 ; AVX512DQ-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: sitofp_4i64_to_4f32:
 ; AVX512VLDQ:       # BB#0:
 ; AVX512VLDQ-NEXT:    vcvtqq2ps %ymm0, %xmm0
+; AVX512VLDQ-NEXT:    vzeroupper
 ; AVX512VLDQ-NEXT:    retq
   %cvt = sitofp <4 x i64> %a to <4 x float>
   ret <4 x float> %cvt
@@ -1697,6 +1713,7 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: uitofp_2i64_to_4f32:
@@ -1805,6 +1822,7 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
 ; AVX512DQ-NEXT:    vmovq {{.*#+}} xmm0 = xmm0[0],zero
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: uitofp_2i64_to_2f32:
@@ -1932,12 +1950,14 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32_undef:
 ; AVX512VLDQ:       # BB#0:
 ; AVX512VLDQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; AVX512VLDQ-NEXT:    vcvtuqq2ps %ymm0, %xmm0
+; AVX512VLDQ-NEXT:    vzeroupper
 ; AVX512VLDQ-NEXT:    retq
   %ext = shufflevector <2 x i64> %a, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
   %cvt = uitofp <4 x i64> %ext to <4 x float>
@@ -1982,6 +2002,7 @@ define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_4i32_to_4f32:
@@ -1994,6 +2015,7 @@ define <4 x float> @uitofp_4i32_to_4f32(<4 x i32> %a) {
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: uitofp_4i32_to_4f32:
@@ -2032,10 +2054,10 @@ define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
 ;
 ; AVX1-LABEL: uitofp_8i16_to_4f32:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX1-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX1-NEXT:    vzeroupper
@@ -2054,6 +2076,7 @@ define <4 x float> @uitofp_8i16_to_4f32(<8 x i16> %a) {
 ; AVX512-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %cvt = uitofp <8 x i16> %a to <8 x float>
   %shuf = shufflevector <8 x float> %cvt, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -2112,6 +2135,7 @@ define <4 x float> @uitofp_16i8_to_4f32(<16 x i8> %a) {
 ; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
 ; AVX512-NEXT:    vcvtdq2ps %zmm0, %zmm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %cvt = uitofp <16 x i8> %a to <16 x float>
   %shuf = shufflevector <16 x float> %cvt, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
@@ -2335,6 +2359,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_4i64_to_4f32:
@@ -2351,6 +2376,7 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: uitofp_4i64_to_4f32:
@@ -2358,11 +2384,13 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) {
 ; AVX512DQ-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: uitofp_4i64_to_4f32:
 ; AVX512VLDQ:       # BB#0:
 ; AVX512VLDQ-NEXT:    vcvtuqq2ps %ymm0, %xmm0
+; AVX512VLDQ-NEXT:    vzeroupper
 ; AVX512VLDQ-NEXT:    retq
   %cvt = uitofp <4 x i64> %a to <4 x float>
   ret <4 x float> %cvt
@@ -2456,10 +2484,10 @@ define <8 x float> @uitofp_8i16_to_8f32(<8 x i16> %a) {
 ;
 ; AVX1-LABEL: uitofp_8i16_to_8f32:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -2607,6 +2635,7 @@ define <2 x double> @sitofp_load_2i64_to_2f64(<2 x i64> *%a) {
 ; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
 ; AVX512DQ-NEXT:    vcvtqq2pd %zmm0, %zmm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: sitofp_load_2i64_to_2f64:
@@ -2661,7 +2690,7 @@ define <2 x double> @sitofp_load_2i16_to_2f64(<2 x i16> *%a) {
 ; SSE-LABEL: sitofp_load_2i16_to_2f64:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; SSE-NEXT:    psrad $16, %xmm0
 ; SSE-NEXT:    cvtdq2pd %xmm0, %xmm0
 ; SSE-NEXT:    retq
@@ -2723,7 +2752,7 @@ define <4 x double> @sitofp_load_4i64_to_4f64(<4 x i64> *%a) {
 ;
 ; AVX1-LABEL: sitofp_load_4i64_to_4f64:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovaps (%rdi), %ymm0
+; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpextrq $1, %xmm1, %rax
 ; AVX1-NEXT:    vcvtsi2sdq %rax, %xmm2, %xmm2
@@ -2926,6 +2955,7 @@ define <2 x double> @uitofp_load_2i64_to_2f64(<2 x i64> *%a) {
 ; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
 ; AVX512DQ-NEXT:    vcvtuqq2pd %zmm0, %zmm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: uitofp_load_2i64_to_2f64:
@@ -2967,6 +2997,7 @@ define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
 ; AVX512F-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX512F-NEXT:    vcvtudq2pd %ymm0, %zmm0
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_load_2i32_to_2f64:
@@ -2981,6 +3012,7 @@ define <2 x double> @uitofp_load_2i32_to_2f64(<2 x i32> *%a) {
 ; AVX512DQ-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; AVX512DQ-NEXT:    vcvtudq2pd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: uitofp_load_2i32_to_2f64:
@@ -3021,8 +3053,8 @@ define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) {
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
+; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512VL-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX512VL-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; AVX512VL-NEXT:    retq
 ;
@@ -3037,8 +3069,8 @@ define <2 x double> @uitofp_load_2i16_to_2f64(<2 x i16> *%a) {
 ; AVX512VLDQ:       # BB#0:
 ; AVX512VLDQ-NEXT:    vpmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
 ; AVX512VLDQ-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; AVX512VLDQ-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VLDQ-NEXT:    vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3,4,5,6,7]
+; AVX512VLDQ-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512VLDQ-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
 ; AVX512VLDQ-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    retq
   %ld = load <2 x i16>, <2 x i16> *%a
@@ -3076,7 +3108,8 @@ define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) {
 ; AVX512VL-LABEL: uitofp_load_2i8_to_2f64:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[u],zero,zero,zero,xmm0[u],zero,zero,zero
+; AVX512VL-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX512VL-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; AVX512VL-NEXT:    retq
 ;
@@ -3091,7 +3124,8 @@ define <2 x double> @uitofp_load_2i8_to_2f64(<2 x i8> *%a) {
 ; AVX512VLDQ-LABEL: uitofp_load_2i8_to_2f64:
 ; AVX512VLDQ:       # BB#0:
 ; AVX512VLDQ-NEXT:    vpmovzxbq {{.*#+}} xmm0 = mem[0],zero,zero,zero,zero,zero,zero,zero,mem[1],zero,zero,zero,zero,zero,zero,zero
-; AVX512VLDQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[u],zero,zero,zero,xmm0[u],zero,zero,zero
+; AVX512VLDQ-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; AVX512VLDQ-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; AVX512VLDQ-NEXT:    vcvtdq2pd %xmm0, %xmm0
 ; AVX512VLDQ-NEXT:    retq
   %ld = load <2 x i8>, <2 x i8> *%a
@@ -3130,7 +3164,7 @@ define <4 x double> @uitofp_load_4i64_to_4f64(<4 x i64> *%a) {
 ;
 ; AVX1-LABEL: uitofp_load_4i64_to_4f64:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovaps (%rdi), %ymm0
+; AVX1-NEXT:    vmovdqa (%rdi), %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [1127219200,1160773632,0,0]
 ; AVX1-NEXT:    vpunpckldq {{.*#+}} xmm3 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
@@ -3416,6 +3450,7 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512F-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: sitofp_load_4i64_to_4f32:
@@ -3433,6 +3468,7 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512VL-NEXT:    vcvtsi2ssq %rax, %xmm3, %xmm0
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: sitofp_load_4i64_to_4f32:
@@ -3440,6 +3476,7 @@ define <4 x float> @sitofp_load_4i64_to_4f32(<4 x i64> *%a) {
 ; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX512DQ-NEXT:    vcvtqq2ps %zmm0, %ymm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: sitofp_load_4i64_to_4f32:
@@ -4003,6 +4040,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
 ; AVX512F-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512F-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_load_4i64_to_4f32:
@@ -4020,6 +4058,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
 ; AVX512VL-NEXT:    vpextrq $1, %xmm0, %rax
 ; AVX512VL-NEXT:    vcvtusi2ssq %rax, %xmm3, %xmm0
 ; AVX512VL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: uitofp_load_4i64_to_4f32:
@@ -4027,6 +4066,7 @@ define <4 x float> @uitofp_load_4i64_to_4f32(<4 x i64> *%a) {
 ; AVX512DQ-NEXT:    vmovaps (%rdi), %ymm0
 ; AVX512DQ-NEXT:    vcvtuqq2ps %zmm0, %ymm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: uitofp_load_4i64_to_4f32:
@@ -4079,6 +4119,7 @@ define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) {
 ; AVX512F-NEXT:    vmovaps (%rdi), %xmm0
 ; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: uitofp_load_4i32_to_4f32:
@@ -4091,6 +4132,7 @@ define <4 x float> @uitofp_load_4i32_to_4f32(<4 x i32> *%a) {
 ; AVX512DQ-NEXT:    vmovaps (%rdi), %xmm0
 ; AVX512DQ-NEXT:    vcvtudq2ps %zmm0, %zmm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512VLDQ-LABEL: uitofp_load_4i32_to_4f32:
@@ -4810,6 +4852,7 @@ define void @aggregate_sitofp_8i16_to_8f32(%Arguments* nocapture readonly %a0) {
 ; AVX512-NEXT:    vpmovsxwd 8(%rdi), %ymm0
 ; AVX512-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX512-NEXT:    vmovaps %ymm0, (%rax)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
  %1 = load %Arguments, %Arguments* %a0, align 1
  %2 = extractvalue %Arguments %1, 1
diff --git a/test/CodeGen/X86/vec_minmax_sint.ll b/test/CodeGen/X86/vec_minmax_sint.ll
index 419eb2bed74309609e576c6fa92805c8dc2a980e..5999116deb9cfe62d20d6b5168d5bef09fea0256 100644
--- a/test/CodeGen/X86/vec_minmax_sint.ll
+++ b/test/CodeGen/X86/vec_minmax_sint.ll
@@ -46,7 +46,7 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE41-NEXT:    pand %xmm5, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
 ; SSE41-NEXT:    por %xmm3, %xmm0
-; SSE41-NEXT:    blendvpd %xmm2, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -54,7 +54,7 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE42:       # BB#0:
 ; SSE42-NEXT:    movdqa %xmm0, %xmm2
 ; SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
-; SSE42-NEXT:    blendvpd %xmm2, %xmm1
+; SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE42-NEXT:    movapd %xmm1, %xmm0
 ; SSE42-NEXT:    retq
 ;
@@ -130,9 +130,9 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pand %xmm7, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
 ; SSE41-NEXT:    por %xmm4, %xmm0
-; SSE41-NEXT:    blendvpd %xmm8, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
 ; SSE41-NEXT:    movdqa %xmm5, %xmm0
-; SSE41-NEXT:    blendvpd %xmm1, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm1
 ; SSE41-NEXT:    retq
@@ -143,9 +143,9 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE42-NEXT:    movdqa %xmm1, %xmm5
 ; SSE42-NEXT:    pcmpgtq %xmm3, %xmm5
 ; SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
-; SSE42-NEXT:    blendvpd %xmm4, %xmm2
+; SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE42-NEXT:    movdqa %xmm5, %xmm0
-; SSE42-NEXT:    blendvpd %xmm1, %xmm3
+; SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE42-NEXT:    movapd %xmm2, %xmm0
 ; SSE42-NEXT:    movapd %xmm3, %xmm1
 ; SSE42-NEXT:    retq
@@ -429,7 +429,7 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE41-NEXT:    por %xmm0, %xmm3
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
 ; SSE41-NEXT:    pxor %xmm3, %xmm0
-; SSE41-NEXT:    blendvpd %xmm2, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -440,7 +440,7 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE42-NEXT:    pcmpgtq %xmm2, %xmm3
 ; SSE42-NEXT:    pcmpeqd %xmm0, %xmm0
 ; SSE42-NEXT:    pxor %xmm3, %xmm0
-; SSE42-NEXT:    blendvpd %xmm2, %xmm1
+; SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE42-NEXT:    movapd %xmm1, %xmm0
 ; SSE42-NEXT:    retq
 ;
@@ -527,9 +527,9 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
 ; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    pxor %xmm9, %xmm0
-; SSE41-NEXT:    blendvpd %xmm8, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
 ; SSE41-NEXT:    movdqa %xmm5, %xmm0
-; SSE41-NEXT:    blendvpd %xmm1, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm1
 ; SSE41-NEXT:    retq
@@ -544,9 +544,9 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE42-NEXT:    movdqa %xmm2, %xmm6
 ; SSE42-NEXT:    pcmpgtq %xmm4, %xmm6
 ; SSE42-NEXT:    pxor %xmm6, %xmm0
-; SSE42-NEXT:    blendvpd %xmm4, %xmm2
+; SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE42-NEXT:    movdqa %xmm5, %xmm0
-; SSE42-NEXT:    blendvpd %xmm1, %xmm3
+; SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE42-NEXT:    movapd %xmm2, %xmm0
 ; SSE42-NEXT:    movapd %xmm3, %xmm1
 ; SSE42-NEXT:    retq
@@ -844,7 +844,7 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE41-NEXT:    pand %xmm5, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
 ; SSE41-NEXT:    por %xmm3, %xmm0
-; SSE41-NEXT:    blendvpd %xmm2, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -853,7 +853,7 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE42-NEXT:    movdqa %xmm0, %xmm2
 ; SSE42-NEXT:    movdqa %xmm1, %xmm0
 ; SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
-; SSE42-NEXT:    blendvpd %xmm2, %xmm1
+; SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE42-NEXT:    movapd %xmm1, %xmm0
 ; SSE42-NEXT:    retq
 ;
@@ -929,9 +929,9 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pand %xmm7, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
 ; SSE41-NEXT:    por %xmm4, %xmm0
-; SSE41-NEXT:    blendvpd %xmm8, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
 ; SSE41-NEXT:    movdqa %xmm5, %xmm0
-; SSE41-NEXT:    blendvpd %xmm1, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm1
 ; SSE41-NEXT:    retq
@@ -943,9 +943,9 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE42-NEXT:    pcmpgtq %xmm1, %xmm5
 ; SSE42-NEXT:    movdqa %xmm2, %xmm0
 ; SSE42-NEXT:    pcmpgtq %xmm4, %xmm0
-; SSE42-NEXT:    blendvpd %xmm4, %xmm2
+; SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE42-NEXT:    movdqa %xmm5, %xmm0
-; SSE42-NEXT:    blendvpd %xmm1, %xmm3
+; SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE42-NEXT:    movapd %xmm2, %xmm0
 ; SSE42-NEXT:    movapd %xmm3, %xmm1
 ; SSE42-NEXT:    retq
@@ -1223,7 +1223,7 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE41-NEXT:    por %xmm0, %xmm3
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
 ; SSE41-NEXT:    pxor %xmm3, %xmm0
-; SSE41-NEXT:    blendvpd %xmm2, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1233,7 +1233,7 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE42-NEXT:    pcmpgtq %xmm1, %xmm0
 ; SSE42-NEXT:    pcmpeqd %xmm3, %xmm3
 ; SSE42-NEXT:    pxor %xmm3, %xmm0
-; SSE42-NEXT:    blendvpd %xmm2, %xmm1
+; SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE42-NEXT:    movapd %xmm1, %xmm0
 ; SSE42-NEXT:    retq
 ;
@@ -1320,9 +1320,9 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
 ; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    pxor %xmm9, %xmm0
-; SSE41-NEXT:    blendvpd %xmm8, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
 ; SSE41-NEXT:    movdqa %xmm5, %xmm0
-; SSE41-NEXT:    blendvpd %xmm1, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm1
 ; SSE41-NEXT:    retq
@@ -1336,9 +1336,9 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE42-NEXT:    pxor %xmm6, %xmm5
 ; SSE42-NEXT:    pcmpgtq %xmm2, %xmm0
 ; SSE42-NEXT:    pxor %xmm6, %xmm0
-; SSE42-NEXT:    blendvpd %xmm4, %xmm2
+; SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE42-NEXT:    movdqa %xmm5, %xmm0
-; SSE42-NEXT:    blendvpd %xmm1, %xmm3
+; SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE42-NEXT:    movapd %xmm2, %xmm0
 ; SSE42-NEXT:    movapd %xmm3, %xmm1
 ; SSE42-NEXT:    retq
diff --git a/test/CodeGen/X86/vec_minmax_uint.ll b/test/CodeGen/X86/vec_minmax_uint.ll
index 6e48423c1520d4b6d8b1773393287dbfc7f6e19f..ec5f83ea396c2e9ed1a669c655e257012d4c1294 100644
--- a/test/CodeGen/X86/vec_minmax_uint.ll
+++ b/test/CodeGen/X86/vec_minmax_uint.ll
@@ -46,7 +46,7 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE41-NEXT:    pand %xmm5, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
 ; SSE41-NEXT:    por %xmm3, %xmm0
-; SSE41-NEXT:    blendvpd %xmm2, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -58,7 +58,7 @@ define <2 x i64> @max_gt_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE42-NEXT:    pxor %xmm0, %xmm3
 ; SSE42-NEXT:    pxor %xmm2, %xmm0
 ; SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
-; SSE42-NEXT:    blendvpd %xmm2, %xmm1
+; SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE42-NEXT:    movapd %xmm1, %xmm0
 ; SSE42-NEXT:    retq
 ;
@@ -137,9 +137,9 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pand %xmm7, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
 ; SSE41-NEXT:    por %xmm4, %xmm0
-; SSE41-NEXT:    blendvpd %xmm8, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
 ; SSE41-NEXT:    movdqa %xmm5, %xmm0
-; SSE41-NEXT:    blendvpd %xmm1, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm1
 ; SSE41-NEXT:    retq
@@ -157,9 +157,9 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE42-NEXT:    pxor %xmm0, %xmm6
 ; SSE42-NEXT:    pxor %xmm4, %xmm0
 ; SSE42-NEXT:    pcmpgtq %xmm6, %xmm0
-; SSE42-NEXT:    blendvpd %xmm4, %xmm2
+; SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE42-NEXT:    movdqa %xmm5, %xmm0
-; SSE42-NEXT:    blendvpd %xmm1, %xmm3
+; SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE42-NEXT:    movapd %xmm2, %xmm0
 ; SSE42-NEXT:    movapd %xmm3, %xmm1
 ; SSE42-NEXT:    retq
@@ -167,13 +167,13 @@ define <4 x i64> @max_gt_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: max_gt_v4i64:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vxorps %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vxorps %xmm3, %xmm1, %xmm4
-; AVX1-NEXT:    vxorps %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
@@ -468,7 +468,7 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE41-NEXT:    por %xmm0, %xmm3
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
 ; SSE41-NEXT:    pxor %xmm3, %xmm0
-; SSE41-NEXT:    blendvpd %xmm2, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -481,7 +481,7 @@ define <2 x i64> @max_ge_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE42-NEXT:    pcmpgtq %xmm0, %xmm3
 ; SSE42-NEXT:    pcmpeqd %xmm0, %xmm0
 ; SSE42-NEXT:    pxor %xmm3, %xmm0
-; SSE42-NEXT:    blendvpd %xmm2, %xmm1
+; SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE42-NEXT:    movapd %xmm1, %xmm0
 ; SSE42-NEXT:    retq
 ;
@@ -571,9 +571,9 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
 ; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    pxor %xmm9, %xmm0
-; SSE41-NEXT:    blendvpd %xmm8, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
 ; SSE41-NEXT:    movdqa %xmm5, %xmm0
-; SSE41-NEXT:    blendvpd %xmm1, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm1
 ; SSE41-NEXT:    retq
@@ -594,9 +594,9 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE42-NEXT:    pxor %xmm2, %xmm0
 ; SSE42-NEXT:    pcmpgtq %xmm7, %xmm0
 ; SSE42-NEXT:    pxor %xmm6, %xmm0
-; SSE42-NEXT:    blendvpd %xmm4, %xmm2
+; SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE42-NEXT:    movdqa %xmm5, %xmm0
-; SSE42-NEXT:    blendvpd %xmm1, %xmm3
+; SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE42-NEXT:    movapd %xmm2, %xmm0
 ; SSE42-NEXT:    movapd %xmm3, %xmm1
 ; SSE42-NEXT:    retq
@@ -604,15 +604,15 @@ define <4 x i64> @max_ge_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: max_ge_v4i64:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vxorps %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vxorps %xmm3, %xmm0, %xmm5
-; AVX1-NEXT:    vxorps %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm5
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
@@ -910,7 +910,7 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE41-NEXT:    pand %xmm5, %xmm3
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm4[1,1,3,3]
 ; SSE41-NEXT:    por %xmm3, %xmm0
-; SSE41-NEXT:    blendvpd %xmm2, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -922,7 +922,7 @@ define <2 x i64> @min_lt_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE42-NEXT:    pxor %xmm0, %xmm3
 ; SSE42-NEXT:    pxor %xmm1, %xmm0
 ; SSE42-NEXT:    pcmpgtq %xmm3, %xmm0
-; SSE42-NEXT:    blendvpd %xmm2, %xmm1
+; SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE42-NEXT:    movapd %xmm1, %xmm0
 ; SSE42-NEXT:    retq
 ;
@@ -1001,9 +1001,9 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pand %xmm7, %xmm4
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm6[1,1,3,3]
 ; SSE41-NEXT:    por %xmm4, %xmm0
-; SSE41-NEXT:    blendvpd %xmm8, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
 ; SSE41-NEXT:    movdqa %xmm5, %xmm0
-; SSE41-NEXT:    blendvpd %xmm1, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm1
 ; SSE41-NEXT:    retq
@@ -1021,9 +1021,9 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE42-NEXT:    pxor %xmm0, %xmm6
 ; SSE42-NEXT:    pxor %xmm2, %xmm0
 ; SSE42-NEXT:    pcmpgtq %xmm6, %xmm0
-; SSE42-NEXT:    blendvpd %xmm4, %xmm2
+; SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE42-NEXT:    movdqa %xmm5, %xmm0
-; SSE42-NEXT:    blendvpd %xmm1, %xmm3
+; SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE42-NEXT:    movapd %xmm2, %xmm0
 ; SSE42-NEXT:    movapd %xmm3, %xmm1
 ; SSE42-NEXT:    retq
@@ -1031,13 +1031,13 @@ define <4 x i64> @min_lt_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: min_lt_v4i64:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vxorps %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vxorps %xmm3, %xmm0, %xmm4
-; AVX1-NEXT:    vxorps %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
@@ -1330,7 +1330,7 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE41-NEXT:    por %xmm0, %xmm3
 ; SSE41-NEXT:    pcmpeqd %xmm0, %xmm0
 ; SSE41-NEXT:    pxor %xmm3, %xmm0
-; SSE41-NEXT:    blendvpd %xmm2, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1344,7 +1344,7 @@ define <2 x i64> @min_le_v2i64(<2 x i64> %a, <2 x i64> %b) {
 ; SSE42-NEXT:    pcmpgtq %xmm0, %xmm3
 ; SSE42-NEXT:    pcmpeqd %xmm0, %xmm0
 ; SSE42-NEXT:    pxor %xmm3, %xmm0
-; SSE42-NEXT:    blendvpd %xmm2, %xmm1
+; SSE42-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE42-NEXT:    movapd %xmm1, %xmm0
 ; SSE42-NEXT:    retq
 ;
@@ -1434,9 +1434,9 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm7[1,1,3,3]
 ; SSE41-NEXT:    por %xmm6, %xmm0
 ; SSE41-NEXT:    pxor %xmm9, %xmm0
-; SSE41-NEXT:    blendvpd %xmm8, %xmm2
+; SSE41-NEXT:    blendvpd %xmm0, %xmm8, %xmm2
 ; SSE41-NEXT:    movdqa %xmm5, %xmm0
-; SSE41-NEXT:    blendvpd %xmm1, %xmm3
+; SSE41-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movapd %xmm2, %xmm0
 ; SSE41-NEXT:    movapd %xmm3, %xmm1
 ; SSE41-NEXT:    retq
@@ -1457,9 +1457,9 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; SSE42-NEXT:    pxor %xmm4, %xmm0
 ; SSE42-NEXT:    pcmpgtq %xmm7, %xmm0
 ; SSE42-NEXT:    pxor %xmm6, %xmm0
-; SSE42-NEXT:    blendvpd %xmm4, %xmm2
+; SSE42-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE42-NEXT:    movdqa %xmm5, %xmm0
-; SSE42-NEXT:    blendvpd %xmm1, %xmm3
+; SSE42-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE42-NEXT:    movapd %xmm2, %xmm0
 ; SSE42-NEXT:    movapd %xmm3, %xmm1
 ; SSE42-NEXT:    retq
@@ -1467,15 +1467,15 @@ define <4 x i64> @min_le_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: min_le_v4i64:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vxorps %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vxorps %xmm3, %xmm1, %xmm5
-; AVX1-NEXT:    vxorps %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm5
+; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
diff --git a/test/CodeGen/X86/vec_sdiv_to_shift.ll b/test/CodeGen/X86/vec_sdiv_to_shift.ll
index f7151af528b5189558940eb5a7a0ae7f523f7b61..f0c9069d8c797b365365e222916f191a2b145336 100644
--- a/test/CodeGen/X86/vec_sdiv_to_shift.ll
+++ b/test/CodeGen/X86/vec_sdiv_to_shift.ll
@@ -49,56 +49,6 @@ entry:
   ret <8 x i16> %0
 }
 
-define <4 x i32> @sdiv_zero(<4 x i32> %var) {
-; SSE-LABEL: sdiv_zero:
-; SSE:       # BB#0: # %entry
-; SSE-NEXT:    pextrd $1, %xmm0, %eax
-; SSE-NEXT:    xorl %esi, %esi
-; SSE-NEXT:    cltd
-; SSE-NEXT:    idivl %esi
-; SSE-NEXT:    movl %eax, %ecx
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    cltd
-; SSE-NEXT:    idivl %esi
-; SSE-NEXT:    movd %eax, %xmm1
-; SSE-NEXT:    pinsrd $1, %ecx, %xmm1
-; SSE-NEXT:    pextrd $2, %xmm0, %eax
-; SSE-NEXT:    cltd
-; SSE-NEXT:    idivl %esi
-; SSE-NEXT:    pinsrd $2, %eax, %xmm1
-; SSE-NEXT:    pextrd $3, %xmm0, %eax
-; SSE-NEXT:    cltd
-; SSE-NEXT:    idivl %esi
-; SSE-NEXT:    pinsrd $3, %eax, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: sdiv_zero:
-; AVX:       # BB#0: # %entry
-; AVX-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX-NEXT:    xorl %esi, %esi
-; AVX-NEXT:    cltd
-; AVX-NEXT:    idivl %esi
-; AVX-NEXT:    movl %eax, %ecx
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    cltd
-; AVX-NEXT:    idivl %esi
-; AVX-NEXT:    vmovd %eax, %xmm1
-; AVX-NEXT:    vpinsrd $1, %ecx, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX-NEXT:    cltd
-; AVX-NEXT:    idivl %esi
-; AVX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX-NEXT:    cltd
-; AVX-NEXT:    idivl %esi
-; AVX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
-; AVX-NEXT:    retq
-entry:
-  %0 = sdiv <4 x i32> %var, <i32 0, i32 0, i32 0, i32 0>
-  ret <4 x i32> %0
-}
-
 define <4 x i32> @sdiv_vec4x32(<4 x i32> %var) {
 ; SSE-LABEL: sdiv_vec4x32:
 ; SSE:       # BB#0: # %entry
@@ -234,52 +184,15 @@ entry:
   ret <16 x i16> %a0
 }
 
+; Div-by-0 in any lane is UB.
+
 define <4 x i32> @sdiv_non_splat(<4 x i32> %x) {
 ; SSE-LABEL: sdiv_non_splat:
 ; SSE:       # BB#0:
-; SSE-NEXT:    pextrd $1, %xmm0, %eax
-; SSE-NEXT:    xorl %ecx, %ecx
-; SSE-NEXT:    cltd
-; SSE-NEXT:    idivl %ecx
-; SSE-NEXT:    movd %xmm0, %edx
-; SSE-NEXT:    movl %edx, %esi
-; SSE-NEXT:    shrl $31, %esi
-; SSE-NEXT:    addl %edx, %esi
-; SSE-NEXT:    sarl %esi
-; SSE-NEXT:    movd %esi, %xmm1
-; SSE-NEXT:    pinsrd $1, %eax, %xmm1
-; SSE-NEXT:    pextrd $2, %xmm0, %eax
-; SSE-NEXT:    cltd
-; SSE-NEXT:    idivl %ecx
-; SSE-NEXT:    pinsrd $2, %eax, %xmm1
-; SSE-NEXT:    pextrd $3, %xmm0, %eax
-; SSE-NEXT:    cltd
-; SSE-NEXT:    idivl %ecx
-; SSE-NEXT:    pinsrd $3, %eax, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: sdiv_non_splat:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpextrd $1, %xmm0, %eax
-; AVX-NEXT:    xorl %ecx, %ecx
-; AVX-NEXT:    cltd
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    vmovd %xmm0, %edx
-; AVX-NEXT:    movl %edx, %esi
-; AVX-NEXT:    shrl $31, %esi
-; AVX-NEXT:    addl %edx, %esi
-; AVX-NEXT:    sarl %esi
-; AVX-NEXT:    vmovd %esi, %xmm1
-; AVX-NEXT:    vpinsrd $1, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX-NEXT:    cltd
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    vpinsrd $2, %eax, %xmm1, %xmm1
-; AVX-NEXT:    vpextrd $3, %xmm0, %eax
-; AVX-NEXT:    cltd
-; AVX-NEXT:    idivl %ecx
-; AVX-NEXT:    vpinsrd $3, %eax, %xmm1, %xmm0
 ; AVX-NEXT:    retq
   %y = sdiv <4 x i32> %x, <i32 2, i32 0, i32 0, i32 0>
   ret <4 x i32> %y
diff --git a/test/CodeGen/X86/vec_shift4.ll b/test/CodeGen/X86/vec_shift4.ll
index 66229361990f1eb592a1f7b2532797e89e7b8caf..bef2438aecd153258b004acde1e6dcd7fcb603fa 100644
--- a/test/CodeGen/X86/vec_shift4.ll
+++ b/test/CodeGen/X86/vec_shift4.ll
@@ -39,18 +39,18 @@ define <2 x i64> @shl2(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp {
 ; X32-NEXT:    psllw $4, %xmm3
 ; X32-NEXT:    pand {{\.LCPI.*}}, %xmm3
 ; X32-NEXT:    movdqa %xmm1, %xmm0
-; X32-NEXT:    pblendvb %xmm3, %xmm2
+; X32-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; X32-NEXT:    movdqa %xmm2, %xmm3
 ; X32-NEXT:    psllw $2, %xmm3
 ; X32-NEXT:    pand {{\.LCPI.*}}, %xmm3
 ; X32-NEXT:    paddb %xmm1, %xmm1
 ; X32-NEXT:    movdqa %xmm1, %xmm0
-; X32-NEXT:    pblendvb %xmm3, %xmm2
+; X32-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; X32-NEXT:    movdqa %xmm2, %xmm3
 ; X32-NEXT:    paddb %xmm3, %xmm3
 ; X32-NEXT:    paddb %xmm1, %xmm1
 ; X32-NEXT:    movdqa %xmm1, %xmm0
-; X32-NEXT:    pblendvb %xmm3, %xmm2
+; X32-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; X32-NEXT:    movdqa %xmm2, %xmm0
 ; X32-NEXT:    retl
 ;
@@ -62,18 +62,18 @@ define <2 x i64> @shl2(<16 x i8> %r, <16 x i8> %a) nounwind readnone ssp {
 ; X64-NEXT:    psllw $4, %xmm3
 ; X64-NEXT:    pand {{.*}}(%rip), %xmm3
 ; X64-NEXT:    movdqa %xmm1, %xmm0
-; X64-NEXT:    pblendvb %xmm3, %xmm2
+; X64-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; X64-NEXT:    movdqa %xmm2, %xmm3
 ; X64-NEXT:    psllw $2, %xmm3
 ; X64-NEXT:    pand {{.*}}(%rip), %xmm3
 ; X64-NEXT:    paddb %xmm1, %xmm1
 ; X64-NEXT:    movdqa %xmm1, %xmm0
-; X64-NEXT:    pblendvb %xmm3, %xmm2
+; X64-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; X64-NEXT:    movdqa %xmm2, %xmm3
 ; X64-NEXT:    paddb %xmm3, %xmm3
 ; X64-NEXT:    paddb %xmm1, %xmm1
 ; X64-NEXT:    movdqa %xmm1, %xmm0
-; X64-NEXT:    pblendvb %xmm3, %xmm2
+; X64-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; X64-NEXT:    movdqa %xmm2, %xmm0
 ; X64-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/vec_shift7.ll b/test/CodeGen/X86/vec_shift7.ll
index 80d72a4a986f8089dfb7d4a8a60c20557adfa63e..64c64c39254417ace157c82a104c90301445f531 100644
--- a/test/CodeGen/X86/vec_shift7.ll
+++ b/test/CodeGen/X86/vec_shift7.ll
@@ -10,17 +10,14 @@ define i64 @test1(<2 x i64> %a) {
 ; X32-NEXT:    movdqa %xmm0, %xmm1
 ; X32-NEXT:    psllq $2, %xmm1
 ; X32-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X32-NEXT:    movd %xmm1, %eax
-; X32-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; X32-NEXT:    movd %xmm0, %edx
+; X32-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[1,1,2,3]
+; X32-NEXT:    movd %xmm1, %edx
+; X32-NEXT:    movd %xmm0, %eax
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test1:
 ; X64:       # BB#0: # %entry
-; X64-NEXT:    movdqa %xmm0, %xmm1
-; X64-NEXT:    psllq $2, %xmm1
-; X64-NEXT:    movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1]
-; X64-NEXT:    movd %xmm1, %rax
+; X64-NEXT:    movd %xmm0, %rax
 ; X64-NEXT:    retq
 entry:
  %c = shl <2 x i64> %a, <i64 0, i64 2>
diff --git a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
index a1b46b9324d383fcd6c3e42c10d5437d2824c425..7df3c30704222c6d79c8eca77894ba472e153778 100644
--- a/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
+++ b/test/CodeGen/X86/vec_uint_to_fp-fastmath.ll
@@ -77,6 +77,7 @@ define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) {
 ; AVX512F-NEXT:    # kill
 ; AVX512F-NEXT:    vcvtudq2ps %zmm0, %zmm0
 ; AVX512F-NEXT:    # kill
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: test_uitofp_v4i32_to_v4f32:
@@ -102,9 +103,6 @@ define <4 x float> @test_uitofp_v4i32_to_v4f32(<4 x i32> %arg) {
 ; AVX2: [[FPMASKCSTADDR_v8:.LCPI[0-9_]+]]:
 ; AVX2-NEXT: .long 1199570944 # float 65536
 
-; AVX2: [[MASKCSTADDR_v8:.LCPI[0-9_]+]]:
-; AVX2-NEXT: .long 65535 # 0xffff
-
 define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) {
 ; SSE2-LABEL: test_uitofp_v8i32_to_v8f32:
 ; SSE2:       # BB#0:
@@ -165,8 +163,8 @@ define <8 x float> @test_uitofp_v8i32_to_v8f32(<8 x i32> %arg) {
 ; AVX2-NEXT:    vcvtdq2ps %ymm1, %ymm1
 ; AVX2-NEXT:    vbroadcastss [[FPMASKCSTADDR_v8]](%rip), %ymm2
 ; AVX2-NEXT:    vmulps %ymm2, %ymm1, %ymm1
-; AVX2-NEXT:    vpbroadcastd [[MASKCSTADDR_v8]](%rip), %ymm2
-; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vxorps %ymm2, %ymm2, %ymm2
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15]
 ; AVX2-NEXT:    vcvtdq2ps %ymm0, %ymm0
 ; AVX2-NEXT:    vaddps %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
diff --git a/test/CodeGen/X86/vec_unsafe-fp-math.ll b/test/CodeGen/X86/vec_unsafe-fp-math.ll
index 827d4184d111bbc6c27a4c9b471e162cc2c38d8a..1c352782fca4fa9507a49968fd91d3d392832d40 100644
--- a/test/CodeGen/X86/vec_unsafe-fp-math.ll
+++ b/test/CodeGen/X86/vec_unsafe-fp-math.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -enable-unsafe-fp-math -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -enable-unsafe-fp-math -enable-no-signed-zeros-fp-math -mtriple=x86_64-unknown-unknown -mcpu=corei7 | FileCheck %s
 
 ; Make sure that vectors get the same benefits as scalars when using unsafe-fp-math.
 
diff --git a/test/CodeGen/X86/vec_zero_cse.ll b/test/CodeGen/X86/vec_zero_cse.ll
index 8ed8083a284fb242a918b8a98ef6718f56330e53..75e85348ba8dd3d8e6eca071a00bcbbb83c807fd 100644
--- a/test/CodeGen/X86/vec_zero_cse.ll
+++ b/test/CodeGen/X86/vec_zero_cse.ll
@@ -1,4 +1,7 @@
-; RUN: llc < %s -relocation-model=static -mtriple=i686-unknown -mattr=+mmx,+sse3 | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -relocation-model=static -mtriple=i686-unknown -mattr=+mmx,+sse3   | FileCheck %s --check-prefix=X32
+; RUN: llc < %s -relocation-model=static -mtriple=x86_64-unknown -mattr=+mmx,+sse3 | FileCheck %s --check-prefix=X64
+
 ; 64-bit stores here do not use MMX.
 
 @M1 = external global <1 x i64>
@@ -8,35 +11,78 @@
 @S2 = external global <4 x i32>
 
 define void @test1() {
-;CHECK-LABEL: @test1
-;CHECK: xorps
+; X32-LABEL: test1:
+; X32:       # BB#0:
+; X32-NEXT:    movl $0, M1+4
+; X32-NEXT:    movl $0, M1
+; X32-NEXT:    xorps %xmm0, %xmm0
+; X32-NEXT:    movlps %xmm0, M2
+; X32-NEXT:    retl
+;
+; X64-LABEL: test1:
+; X64:       # BB#0:
+; X64-NEXT:    movq $0, {{.*}}(%rip)
+; X64-NEXT:    movq $0, {{.*}}(%rip)
+; X64-NEXT:    retq
   store <1 x i64> zeroinitializer, <1 x i64>* @M1
   store <2 x i32> zeroinitializer, <2 x i32>* @M2
   ret void
 }
 
 define void @test2() {
-;CHECK-LABEL: @test2
-;CHECK: pcmpeqd
+; X32-LABEL: test2:
+; X32:       # BB#0:
+; X32-NEXT:    movl $-1, M1+4
+; X32-NEXT:    movl $-1, M1
+; X32-NEXT:    pcmpeqd %xmm0, %xmm0
+; X32-NEXT:    movq %xmm0, M2
+; X32-NEXT:    retl
+;
+; X64-LABEL: test2:
+; X64:       # BB#0:
+; X64-NEXT:    movq $-1, {{.*}}(%rip)
+; X64-NEXT:    movq {{.*}}(%rip), %rax
+; X64-NEXT:    movq %rax, {{.*}}(%rip)
+; X64-NEXT:    retq
   store <1 x i64> < i64 -1 >, <1 x i64>* @M1
   store <2 x i32> < i32 -1, i32 -1 >, <2 x i32>* @M2
   ret void
 }
 
 define void @test3() {
-;CHECK-LABEL: @test3
-;CHECK: xorps
+; X32-LABEL: test3:
+; X32:       # BB#0:
+; X32-NEXT:    xorps %xmm0, %xmm0
+; X32-NEXT:    movaps %xmm0, S1
+; X32-NEXT:    movaps %xmm0, S2
+; X32-NEXT:    retl
+;
+; X64-LABEL: test3:
+; X64:       # BB#0:
+; X64-NEXT:    xorps %xmm0, %xmm0
+; X64-NEXT:    movaps %xmm0, {{.*}}(%rip)
+; X64-NEXT:    movaps %xmm0, {{.*}}(%rip)
+; X64-NEXT:    retq
   store <2 x i64> zeroinitializer, <2 x i64>* @S1
   store <4 x i32> zeroinitializer, <4 x i32>* @S2
   ret void
 }
 
 define void @test4() {
-;CHECK-LABEL: @test4
-;CHECK: pcmpeqd
+; X32-LABEL: test4:
+; X32:       # BB#0:
+; X32-NEXT:    pcmpeqd %xmm0, %xmm0
+; X32-NEXT:    movdqa %xmm0, S1
+; X32-NEXT:    movdqa %xmm0, S2
+; X32-NEXT:    retl
+;
+; X64-LABEL: test4:
+; X64:       # BB#0:
+; X64-NEXT:    pcmpeqd %xmm0, %xmm0
+; X64-NEXT:    movdqa %xmm0, {{.*}}(%rip)
+; X64-NEXT:    movdqa %xmm0, {{.*}}(%rip)
+; X64-NEXT:    retq
   store <2 x i64> < i64 -1, i64 -1>, <2 x i64>* @S1
   store <4 x i32> < i32 -1, i32 -1, i32 -1, i32 -1 >, <4 x i32>* @S2
   ret void
 }
-
-
diff --git a/test/CodeGen/X86/vector-bitreverse.ll b/test/CodeGen/X86/vector-bitreverse.ll
index beec58bdaf7d8e0f9a28965b291fd307e3bbd83a..226c0adbaf3c3b54e2a7369d6d38e55481068bdb 100644
--- a/test/CodeGen/X86/vector-bitreverse.ll
+++ b/test/CodeGen/X86/vector-bitreverse.ll
@@ -613,8 +613,8 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
 ; AVX1-LABEL: test_bitreverse_v32i8:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
@@ -622,7 +622,7 @@ define <32 x i8> @test_bitreverse_v32i8(<32 x i8> %a) nounwind {
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm5, %xmm1
 ; AVX1-NEXT:    vpor %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
@@ -1361,8 +1361,8 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
 ; AVX1-LABEL: test_bitreverse_v64i8:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [0,128,64,192,32,160,96,224,16,144,80,208,48,176,112,240]
 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
@@ -1370,7 +1370,7 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm6 = [0,8,4,12,2,10,6,14,1,9,5,13,3,11,7,15]
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
 ; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm4
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm4
 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
@@ -1378,13 +1378,13 @@ define <64 x i8> @test_bitreverse_v64i8(<64 x i8> %a) nounwind {
 ; AVX1-NEXT:    vpor %xmm0, %xmm4, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm4
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm4
 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpshufb %xmm2, %xmm6, %xmm2
 ; AVX1-NEXT:    vpor %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm4
 ; AVX1-NEXT:    vpshufb %xmm4, %xmm5, %xmm4
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll
index d3bac61959dc2711b6270ec14c607491100f8a31..a05a981daa1f0d7e4683ed6fa14c33aa1daaef91 100644
--- a/test/CodeGen/X86/vector-blend.ll
+++ b/test/CodeGen/X86/vector-blend.ll
@@ -274,7 +274,7 @@ define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
 ; SSE41:       # BB#0: # %entry
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -500,7 +500,7 @@ define <2 x double> @testa(<2 x double> %x, <2 x double> %y) {
 ; SSE41-NEXT:    movapd %xmm0, %xmm2
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    cmplepd %xmm2, %xmm0
-; SSE41-NEXT:    blendvpd %xmm2, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -539,7 +539,7 @@ define <2 x double> @testb(<2 x double> %x, <2 x double> %y) {
 ; SSE41-NEXT:    movapd %xmm0, %xmm2
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    cmpnlepd %xmm2, %xmm0
-; SSE41-NEXT:    blendvpd %xmm2, %xmm1
+; SSE41-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movapd %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -653,8 +653,8 @@ define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
 ; SSE41:       # BB#0: # %entry
 ; SSE41-NEXT:    movdqa %xmm0, %xmm4
 ; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [0,0,255,0,255,255,255,0,0,0,255,0,255,255,255,0]
-; SSE41-NEXT:    pblendvb %xmm4, %xmm2
-; SSE41-NEXT:    pblendvb %xmm1, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm3
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    movdqa %xmm3, %xmm1
 ; SSE41-NEXT:    retq
@@ -822,7 +822,7 @@ define <4 x i32> @blend_logic_v4i32(<4 x i32> %b, <4 x i32> %a, <4 x i32> %c) {
 ; SSE41-LABEL: blend_logic_v4i32:
 ; SSE41:       # BB#0: # %entry
 ; SSE41-NEXT:    psrad $31, %xmm0
-; SSE41-NEXT:    pblendvb %xmm1, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -870,9 +870,9 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) {
 ; SSE41:       # BB#0: # %entry
 ; SSE41-NEXT:    psrad $31, %xmm1
 ; SSE41-NEXT:    psrad $31, %xmm0
-; SSE41-NEXT:    pblendvb %xmm2, %xmm4
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm4
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pblendvb %xmm3, %xmm5
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm5
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
 ; SSE41-NEXT:    movdqa %xmm5, %xmm1
 ; SSE41-NEXT:    retq
@@ -1028,7 +1028,7 @@ define <4 x i32> @blend_neg_logic_v4i32_2(<4 x i32> %v, <4 x i32> %c) {
 ; SSE41-NEXT:    pxor %xmm3, %xmm3
 ; SSE41-NEXT:    psubd %xmm2, %xmm3
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    blendvps %xmm2, %xmm3
+; SSE41-NEXT:    blendvps %xmm0, %xmm2, %xmm3
 ; SSE41-NEXT:    movaps %xmm3, %xmm0
 ; SSE41-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/vector-compare-all_of.ll b/test/CodeGen/X86/vector-compare-all_of.ll
index 36ed86fc528f95076cce06860b7b7b78cfa7dc9a..316df2780d16e7fb26d92ea842295940b20f4eb9 100644
--- a/test/CodeGen/X86/vector-compare-all_of.ll
+++ b/test/CodeGen/X86/vector-compare-all_of.ll
@@ -4,8 +4,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512
 
-define i64 @test_v2f64(<2 x double> %a0, <2 x double> %a1) {
-; SSE-LABEL: test_v2f64:
+define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) {
+; SSE-LABEL: test_v2f64_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpltpd %xmm0, %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
@@ -13,7 +13,7 @@ define i64 @test_v2f64(<2 x double> %a0, <2 x double> %a1) {
 ; SSE-NEXT:    movd %xmm0, %rax
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v2f64:
+; AVX-LABEL: test_v2f64_sext:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpltpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
@@ -21,7 +21,7 @@ define i64 @test_v2f64(<2 x double> %a0, <2 x double> %a1) {
 ; AVX-NEXT:    vmovq %xmm0, %rax
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v2f64:
+; AVX512-LABEL: test_v2f64_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vcmpltpd %xmm0, %xmm1, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -38,8 +38,8 @@ define i64 @test_v2f64(<2 x double> %a0, <2 x double> %a1) {
   ret i64 %3
 }
 
-define i64 @test_v4f64(<4 x double> %a0, <4 x double> %a1) {
-; SSE-LABEL: test_v4f64:
+define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
+; SSE-LABEL: test_v4f64_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpltpd %xmm1, %xmm3
 ; SSE-NEXT:    cmpltpd %xmm0, %xmm2
@@ -49,29 +49,18 @@ define i64 @test_v4f64(<4 x double> %a0, <4 x double> %a1) {
 ; SSE-NEXT:    movd %xmm0, %rax
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_v4f64:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vandpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vandpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v4f64:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vandpd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: test_v4f64_sext:
+; AVX:       # BB#0:
+; AVX-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vmovmskpd %ymm0, %eax
+; AVX-NEXT:    xorl %ecx, %ecx
+; AVX-NEXT:    cmpl $15, %eax
+; AVX-NEXT:    movq $-1, %rax
+; AVX-NEXT:    cmovneq %rcx, %rax
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f64:
+; AVX512-LABEL: test_v4f64_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vcmpltpd %ymm0, %ymm1, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
@@ -81,6 +70,7 @@ define i64 @test_v4f64(<4 x double> %a0, <4 x double> %a1) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c = fcmp ogt <4 x double> %a0, %a1
   %s = sext <4 x i1> %c to <4 x i64>
@@ -92,35 +82,35 @@ define i64 @test_v4f64(<4 x double> %a0, <4 x double> %a1) {
   ret i64 %5
 }
 
-define i64 @test_v4f64_legal(<4 x double> %a0, <4 x double> %a1) {
-; SSE-LABEL: test_v4f64_legal:
+define i64 @test_v4f64_legal_sext(<4 x double> %a0, <4 x double> %a1) {
+; SSE-LABEL: test_v4f64_legal_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpltpd %xmm1, %xmm3
 ; SSE-NEXT:    cmpltpd %xmm0, %xmm2
 ; SSE-NEXT:    packsswb %xmm3, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    movmskps %xmm2, %eax
+; SSE-NEXT:    xorl %ecx, %ecx
+; SSE-NEXT:    cmpl $15, %eax
+; SSE-NEXT:    movl $-1, %eax
+; SSE-NEXT:    cmovnel %ecx, %eax
 ; SSE-NEXT:    cltq
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v4f64_legal:
+; AVX-LABEL: test_v4f64_legal_sext:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vmovmskps %xmm0, %eax
+; AVX-NEXT:    xorl %ecx, %ecx
+; AVX-NEXT:    cmpl $15, %eax
+; AVX-NEXT:    movl $-1, %eax
+; AVX-NEXT:    cmovnel %ecx, %eax
 ; AVX-NEXT:    cltq
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f64_legal:
+; AVX512-LABEL: test_v4f64_legal_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vcmpltpd %ymm0, %ymm1, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -131,6 +121,7 @@ define i64 @test_v4f64_legal(<4 x double> %a0, <4 x double> %a1) {
 ; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    cltq
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c = fcmp ogt <4 x double> %a0, %a1
   %s = sext <4 x i1> %c to <4 x i32>
@@ -143,28 +134,28 @@ define i64 @test_v4f64_legal(<4 x double> %a0, <4 x double> %a1) {
   ret i64 %6
 }
 
-define i32 @test_v4f32(<4 x float> %a0, <4 x float> %a1) {
-; SSE-LABEL: test_v4f32:
+define i32 @test_v4f32_sext(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: test_v4f32_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpltps %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    movmskps %xmm1, %eax
+; SSE-NEXT:    xorl %ecx, %ecx
+; SSE-NEXT:    cmpl $15, %eax
+; SSE-NEXT:    movl $-1, %eax
+; SSE-NEXT:    cmovnel %ecx, %eax
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v4f32:
+; AVX-LABEL: test_v4f32_sext:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vandpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vmovmskps %xmm0, %eax
+; AVX-NEXT:    xorl %ecx, %ecx
+; AVX-NEXT:    cmpl $15, %eax
+; AVX-NEXT:    movl $-1, %eax
+; AVX-NEXT:    cmovnel %ecx, %eax
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32:
+; AVX512-LABEL: test_v4f32_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vcmpltps %xmm0, %xmm1, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -185,46 +176,31 @@ define i32 @test_v4f32(<4 x float> %a0, <4 x float> %a1) {
   ret i32 %5
 }
 
-define i32 @test_v8f32(<8 x float> %a0, <8 x float> %a1) {
-; SSE-LABEL: test_v8f32:
+define i32 @test_v8f32_sext(<8 x float> %a0, <8 x float> %a1) {
+; SSE-LABEL: test_v8f32_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpltps %xmm1, %xmm3
 ; SSE-NEXT:    cmpltps %xmm0, %xmm2
 ; SSE-NEXT:    andps %xmm3, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    movmskps %xmm2, %eax
+; SSE-NEXT:    xorl %ecx, %ecx
+; SSE-NEXT:    cmpl $15, %eax
+; SSE-NEXT:    movl $-1, %eax
+; SSE-NEXT:    cmovnel %ecx, %eax
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_v8f32:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v8f32:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: test_v8f32_sext:
+; AVX:       # BB#0:
+; AVX-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vmovmskps %ymm0, %eax
+; AVX-NEXT:    xorl %ecx, %ecx
+; AVX-NEXT:    cmpl $255, %eax
+; AVX-NEXT:    movl $-1, %eax
+; AVX-NEXT:    cmovnel %ecx, %eax
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v8f32:
+; AVX512-LABEL: test_v8f32_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vcmpltps %ymm0, %ymm1, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
@@ -236,6 +212,7 @@ define i32 @test_v8f32(<8 x float> %a0, <8 x float> %a1) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c = fcmp ogt <8 x float> %a0, %a1
   %s = sext <8 x i1> %c to <8 x i32>
@@ -249,40 +226,33 @@ define i32 @test_v8f32(<8 x float> %a0, <8 x float> %a1) {
   ret i32 %7
 }
 
-define i32 @test_v8f32_legal(<8 x float> %a0, <8 x float> %a1) {
-; SSE-LABEL: test_v8f32_legal:
+define i32 @test_v8f32_legal_sext(<8 x float> %a0, <8 x float> %a1) {
+; SSE-LABEL: test_v8f32_legal_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpltps %xmm1, %xmm3
 ; SSE-NEXT:    cmpltps %xmm0, %xmm2
 ; SSE-NEXT:    packsswb %xmm3, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT:    pand %xmm2, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm0
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    cwtl
+; SSE-NEXT:    pmovmskb %xmm2, %eax
+; SSE-NEXT:    xorl %ecx, %ecx
+; SSE-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; SSE-NEXT:    movl $-1, %eax
+; SSE-NEXT:    cmovnel %ecx, %eax
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v8f32_legal:
+; AVX-LABEL: test_v8f32_legal_sext:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    cwtl
+; AVX-NEXT:    vpmovmskb %xmm0, %eax
+; AVX-NEXT:    xorl %ecx, %ecx
+; AVX-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; AVX-NEXT:    movl $-1, %eax
+; AVX-NEXT:    cmovnel %ecx, %eax
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v8f32_legal:
+; AVX512-LABEL: test_v8f32_legal_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vcmpltps %ymm0, %ymm1, %k0
 ; AVX512-NEXT:    vpmovm2w %k0, %xmm0
@@ -294,6 +264,7 @@ define i32 @test_v8f32_legal(<8 x float> %a0, <8 x float> %a1) {
 ; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    cwtl
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c = fcmp ogt <8 x float> %a0, %a1
   %s = sext <8 x i1> %c to <8 x i16>
@@ -308,8 +279,8 @@ define i32 @test_v8f32_legal(<8 x float> %a0, <8 x float> %a1) {
   ret i32 %8
 }
 
-define i64 @test_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
-; SSE-LABEL: test_v2i64:
+define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE-LABEL: test_v2i64_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtq %xmm1, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -317,7 +288,7 @@ define i64 @test_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
 ; SSE-NEXT:    movd %xmm1, %rax
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v2i64:
+; AVX-LABEL: test_v2i64_sext:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -325,7 +296,7 @@ define i64 @test_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
 ; AVX-NEXT:    vmovq %xmm0, %rax
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v2i64:
+; AVX512-LABEL: test_v2i64_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -342,8 +313,8 @@ define i64 @test_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
   ret i64 %3
 }
 
-define i64 @test_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
-; SSE-LABEL: test_v4i64:
+define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE-LABEL: test_v4i64_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtq %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtq %xmm2, %xmm0
@@ -353,32 +324,33 @@ define i64 @test_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
 ; SSE-NEXT:    movd %xmm1, %rax
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_v4i64:
+; AVX1-LABEL: test_v4i64_sext:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vmovmskpd %ymm0, %eax
+; AVX1-NEXT:    xorl %ecx, %ecx
+; AVX1-NEXT:    cmpl $15, %eax
+; AVX1-NEXT:    movq $-1, %rax
+; AVX1-NEXT:    cmovneq %rcx, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: test_v4i64:
+; AVX2-LABEL: test_v4i64_sext:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vmovmskpd %ymm0, %eax
+; AVX2-NEXT:    xorl %ecx, %ecx
+; AVX2-NEXT:    cmpl $15, %eax
+; AVX2-NEXT:    movq $-1, %rax
+; AVX2-NEXT:    cmovneq %rcx, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4i64:
+; AVX512-LABEL: test_v4i64_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtq %ymm1, %ymm0, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
@@ -388,6 +360,7 @@ define i64 @test_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c = icmp sgt <4 x i64> %a0, %a1
   %s = sext <4 x i1> %c to <4 x i64>
@@ -399,51 +372,51 @@ define i64 @test_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
   ret i64 %5
 }
 
-define i64 @test_v4i64_legal(<4 x i64> %a0, <4 x i64> %a1) {
-; SSE-LABEL: test_v4i64_legal:
+define i64 @test_v4i64_legal_sext(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE-LABEL: test_v4i64_legal_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtq %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtq %xmm2, %xmm0
 ; SSE-NEXT:    packsswb %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    movmskps %xmm0, %eax
+; SSE-NEXT:    xorl %ecx, %ecx
+; SSE-NEXT:    cmpl $15, %eax
+; SSE-NEXT:    movl $-1, %eax
+; SSE-NEXT:    cmovnel %ecx, %eax
 ; SSE-NEXT:    cltq
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_v4i64_legal:
+; AVX1-LABEL: test_v4i64_legal_sext:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vmovmskps %xmm0, %eax
+; AVX1-NEXT:    xorl %ecx, %ecx
+; AVX1-NEXT:    cmpl $15, %eax
+; AVX1-NEXT:    movl $-1, %eax
+; AVX1-NEXT:    cmovnel %ecx, %eax
 ; AVX1-NEXT:    cltq
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: test_v4i64_legal:
+; AVX2-LABEL: test_v4i64_legal_sext:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vmovmskps %xmm0, %eax
+; AVX2-NEXT:    xorl %ecx, %ecx
+; AVX2-NEXT:    cmpl $15, %eax
+; AVX2-NEXT:    movl $-1, %eax
+; AVX2-NEXT:    cmovnel %ecx, %eax
 ; AVX2-NEXT:    cltq
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4i64_legal:
+; AVX512-LABEL: test_v4i64_legal_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtq %ymm1, %ymm0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -454,6 +427,7 @@ define i64 @test_v4i64_legal(<4 x i64> %a0, <4 x i64> %a1) {
 ; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    cltq
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c = icmp sgt <4 x i64> %a0, %a1
   %s = sext <4 x i1> %c to <4 x i32>
@@ -466,28 +440,28 @@ define i64 @test_v4i64_legal(<4 x i64> %a0, <4 x i64> %a1) {
   ret i64 %6
 }
 
-define i32 @test_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
-; SSE-LABEL: test_v4i32:
+define i32 @test_v4i32_sext(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE-LABEL: test_v4i32_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    movmskps %xmm0, %eax
+; SSE-NEXT:    xorl %ecx, %ecx
+; SSE-NEXT:    cmpl $15, %eax
+; SSE-NEXT:    movl $-1, %eax
+; SSE-NEXT:    cmovnel %ecx, %eax
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v4i32:
+; AVX-LABEL: test_v4i32_sext:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vmovmskps %xmm0, %eax
+; AVX-NEXT:    xorl %ecx, %ecx
+; AVX-NEXT:    cmpl $15, %eax
+; AVX-NEXT:    movl $-1, %eax
+; AVX-NEXT:    cmovnel %ecx, %eax
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4i32:
+; AVX512-LABEL: test_v4i32_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -508,49 +482,46 @@ define i32 @test_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
   ret i32 %5
 }
 
-define i32 @test_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
-; SSE-LABEL: test_v8i32:
+define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE-LABEL: test_v8i32_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtd %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtd %xmm2, %xmm0
 ; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    movmskps %xmm0, %eax
+; SSE-NEXT:    xorl %ecx, %ecx
+; SSE-NEXT:    cmpl $15, %eax
+; SSE-NEXT:    movl $-1, %eax
+; SSE-NEXT:    cmovnel %ecx, %eax
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_v8i32:
+; AVX1-LABEL: test_v8i32_sext:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vmovmskps %ymm0, %eax
+; AVX1-NEXT:    xorl %ecx, %ecx
+; AVX1-NEXT:    cmpl $255, %eax
+; AVX1-NEXT:    movl $-1, %eax
+; AVX1-NEXT:    cmovnel %ecx, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: test_v8i32:
+; AVX2-LABEL: test_v8i32_sext:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vmovmskps %ymm0, %eax
+; AVX2-NEXT:    xorl %ecx, %ecx
+; AVX2-NEXT:    cmpl $255, %eax
+; AVX2-NEXT:    movl $-1, %eax
+; AVX2-NEXT:    cmovnel %ecx, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_v8i32:
+; AVX512-LABEL: test_v8i32_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
@@ -562,6 +533,7 @@ define i32 @test_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c = icmp sgt <8 x i32> %a0, %a1
   %s = sext <8 x i1> %c to <8 x i32>
@@ -575,58 +547,48 @@ define i32 @test_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
   ret i32 %7
 }
 
-define i32 @test_v8i32_legal(<8 x i32> %a0, <8 x i32> %a1) {
-; SSE-LABEL: test_v8i32_legal:
+define i32 @test_v8i32_legal_sext(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE-LABEL: test_v8i32_legal_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtd %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtd %xmm2, %xmm0
 ; SSE-NEXT:    packsswb %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
-; SSE-NEXT:    cwtl
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    xorl %ecx, %ecx
+; SSE-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; SSE-NEXT:    movl $-1, %eax
+; SSE-NEXT:    cmovnel %ecx, %eax
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_v8i32_legal:
+; AVX1-LABEL: test_v8i32_legal_sext:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    cwtl
+; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    xorl %ecx, %ecx
+; AVX1-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; AVX1-NEXT:    movl $-1, %eax
+; AVX1-NEXT:    cmovnel %ecx, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: test_v8i32_legal:
+; AVX2-LABEL: test_v8i32_legal_sext:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    cwtl
+; AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; AVX2-NEXT:    xorl %ecx, %ecx
+; AVX2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; AVX2-NEXT:    movl $-1, %eax
+; AVX2-NEXT:    cmovnel %ecx, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_v8i32_legal:
+; AVX512-LABEL: test_v8i32_legal_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0
 ; AVX512-NEXT:    vpmovm2w %k0, %xmm0
@@ -638,6 +600,7 @@ define i32 @test_v8i32_legal(<8 x i32> %a0, <8 x i32> %a1) {
 ; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    cwtl
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c = icmp sgt <8 x i32> %a0, %a1
   %s = sext <8 x i1> %c to <8 x i16>
@@ -652,35 +615,30 @@ define i32 @test_v8i32_legal(<8 x i32> %a0, <8 x i32> %a1) {
   ret i32 %8
 }
 
-define i16 @test_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: test_v8i16:
+define i16 @test_v8i16_sext(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_v8i16_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtw %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    xorl %ecx, %ecx
+; SSE-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; SSE-NEXT:    movl $-1, %eax
+; SSE-NEXT:    cmovnel %ecx, %eax
 ; SSE-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v8i16:
+; AVX-LABEL: test_v8i16_sext:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpmovmskb %xmm0, %eax
+; AVX-NEXT:    xorl %ecx, %ecx
+; AVX-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; AVX-NEXT:    movl $-1, %eax
+; AVX-NEXT:    cmovnel %ecx, %eax
 ; AVX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v8i16:
+; AVX512-LABEL: test_v8i16_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0
 ; AVX512-NEXT:    vpmovm2w %k0, %xmm0
@@ -705,24 +663,21 @@ define i16 @test_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
   ret i16 %7
 }
 
-define i16 @test_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
-; SSE-LABEL: test_v16i16:
+define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE-LABEL: test_v16i16_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtw %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtw %xmm2, %xmm0
 ; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    xorl %ecx, %ecx
+; SSE-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; SSE-NEXT:    movl $-1, %eax
+; SSE-NEXT:    cmovnel %ecx, %eax
 ; SSE-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_v16i16:
+; AVX1-LABEL: test_v16i16_sext:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
@@ -741,23 +696,18 @@ define i16 @test_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: test_v16i16:
+; AVX2-LABEL: test_v16i16_sext:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    cmpl $-1, %ecx
+; AVX2-NEXT:    cmovel %ecx, %eax
 ; AVX2-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_v16i16:
+; AVX512-LABEL: test_v16i16_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0
 ; AVX512-NEXT:    vpmovm2w %k0, %ymm0
@@ -771,6 +721,7 @@ define i16 @test_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c = icmp sgt <16 x i16> %a0, %a1
   %s = sext <16 x i1> %c to <16 x i16>
@@ -786,68 +737,51 @@ define i16 @test_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
   ret i16 %9
 }
 
-define i16 @test_v16i16_legal(<16 x i16> %a0, <16 x i16> %a1) {
-; SSE-LABEL: test_v16i16_legal:
+define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE-LABEL: test_v16i16_legal_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtw %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtw %xmm2, %xmm0
 ; SSE-NEXT:    packsswb %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrlw $8, %xmm0
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pextrb $0, %xmm0, %eax
-; SSE-NEXT:    movsbl %al, %eax
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    xorl %ecx, %ecx
+; SSE-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; SSE-NEXT:    movl $-1, %eax
+; SSE-NEXT:    cmovnel %ecx, %eax
 ; SSE-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_v16i16_legal:
+; AVX1-LABEL: test_v16i16_legal_sext:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtw %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX1-NEXT:    movsbl %al, %eax
+; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    xorl %ecx, %ecx
+; AVX1-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; AVX1-NEXT:    movl $-1, %eax
+; AVX1-NEXT:    cmovnel %ecx, %eax
 ; AVX1-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: test_v16i16_legal:
+; AVX2-LABEL: test_v16i16_legal_sext:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX2-NEXT:    movsbl %al, %eax
+; AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; AVX2-NEXT:    xorl %ecx, %ecx
+; AVX2-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; AVX2-NEXT:    movl $-1, %eax
+; AVX2-NEXT:    cmovnel %ecx, %eax
 ; AVX2-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_v16i16_legal:
+; AVX512-LABEL: test_v16i16_legal_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0
 ; AVX512-NEXT:    vpmovm2b %k0, %xmm0
@@ -862,6 +796,7 @@ define i16 @test_v16i16_legal(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    movsbl %al, %eax
 ; AVX512-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c  = icmp sgt <16 x i16> %a0, %a1
   %s  = sext <16 x i1> %c to <16 x i8>
@@ -878,40 +813,30 @@ define i16 @test_v16i16_legal(<16 x i16> %a0, <16 x i16> %a1) {
   ret i16 %10
 }
 
-define i8 @test_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
-; SSE-LABEL: test_v16i8:
+define i8 @test_v16i8_sext(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_v16i8_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtb %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrlw $8, %xmm0
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pextrb $0, %xmm0, %eax
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    xorl %ecx, %ecx
+; SSE-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; SSE-NEXT:    movl $-1, %eax
+; SSE-NEXT:    cmovnel %ecx, %eax
 ; SSE-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v16i8:
+; AVX-LABEL: test_v16i8_sext:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT:    vpand %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    vpmovmskb %xmm0, %eax
+; AVX-NEXT:    xorl %ecx, %ecx
+; AVX-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; AVX-NEXT:    movl $-1, %eax
+; AVX-NEXT:    cmovnel %ecx, %eax
 ; AVX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v16i8:
+; AVX512-LABEL: test_v16i8_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtb %xmm1, %xmm0, %k0
 ; AVX512-NEXT:    vpmovm2b %k0, %xmm0
@@ -940,27 +865,21 @@ define i8 @test_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
   ret i8 %9
 }
 
-define i8 @test_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
-; SSE-LABEL: test_v32i8:
+define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
+; SSE-LABEL: test_v32i8_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtb %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtb %xmm2, %xmm0
 ; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    pand %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrlw $8, %xmm0
-; SSE-NEXT:    pand %xmm1, %xmm0
-; SSE-NEXT:    pextrb $0, %xmm0, %eax
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    xorl %ecx, %ecx
+; SSE-NEXT:    cmpl $65535, %eax # imm = 0xFFFF
+; SSE-NEXT:    movl $-1, %eax
+; SSE-NEXT:    cmovnel %ecx, %eax
 ; SSE-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_v32i8:
+; AVX1-LABEL: test_v32i8_sext:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
@@ -981,25 +900,18 @@ define i8 @test_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: test_v32i8:
+; AVX2-LABEL: test_v32i8_sext:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    vpmovmskb %ymm0, %ecx
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    cmpl $-1, %ecx
+; AVX2-NEXT:    cmovel %ecx, %eax
 ; AVX2-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_v32i8:
+; AVX512-LABEL: test_v32i8_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtb %ymm1, %ymm0, %k0
 ; AVX512-NEXT:    vpmovm2b %k0, %ymm0
@@ -1015,6 +927,7 @@ define i8 @test_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c  = icmp sgt <32 x i8> %a0, %a1
   %s  = sext <32 x i1> %c to <32 x i8>
diff --git a/test/CodeGen/X86/vector-compare-any_of.ll b/test/CodeGen/X86/vector-compare-any_of.ll
index 9e040412a2e3876f2aee73f8a5b5baf5d95e1d8f..1d3db6495708fa246df584b3eb986caa455a659f 100644
--- a/test/CodeGen/X86/vector-compare-any_of.ll
+++ b/test/CodeGen/X86/vector-compare-any_of.ll
@@ -4,8 +4,8 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=AVX --check-prefix=AVX2
 ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=AVX512
 
-define i64 @test_v2f64(<2 x double> %a0, <2 x double> %a1) {
-; SSE-LABEL: test_v2f64:
+define i64 @test_v2f64_sext(<2 x double> %a0, <2 x double> %a1) {
+; SSE-LABEL: test_v2f64_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpltpd %xmm0, %xmm1
 ; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
@@ -13,7 +13,7 @@ define i64 @test_v2f64(<2 x double> %a0, <2 x double> %a1) {
 ; SSE-NEXT:    movd %xmm0, %rax
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v2f64:
+; AVX-LABEL: test_v2f64_sext:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpltpd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
@@ -21,7 +21,7 @@ define i64 @test_v2f64(<2 x double> %a0, <2 x double> %a1) {
 ; AVX-NEXT:    vmovq %xmm0, %rax
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v2f64:
+; AVX512-LABEL: test_v2f64_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vcmpltpd %xmm0, %xmm1, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -38,8 +38,8 @@ define i64 @test_v2f64(<2 x double> %a0, <2 x double> %a1) {
   ret i64 %3
 }
 
-define i64 @test_v4f64(<4 x double> %a0, <4 x double> %a1) {
-; SSE-LABEL: test_v4f64:
+define i64 @test_v4f64_sext(<4 x double> %a0, <4 x double> %a1) {
+; SSE-LABEL: test_v4f64_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpltpd %xmm1, %xmm3
 ; SSE-NEXT:    cmpltpd %xmm0, %xmm2
@@ -49,29 +49,16 @@ define i64 @test_v4f64(<4 x double> %a0, <4 x double> %a1) {
 ; SSE-NEXT:    movd %xmm0, %rax
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_v4f64:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vorpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vorpd %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vmovq %xmm0, %rax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v4f64:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vorpd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovq %xmm0, %rax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: test_v4f64_sext:
+; AVX:       # BB#0:
+; AVX-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vmovmskpd %ymm0, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    sbbq %rax, %rax
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f64:
+; AVX512-LABEL: test_v4f64_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vcmpltpd %ymm0, %ymm1, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
@@ -81,6 +68,7 @@ define i64 @test_v4f64(<4 x double> %a0, <4 x double> %a1) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c = fcmp ogt <4 x double> %a0, %a1
   %s = sext <4 x i1> %c to <4 x i64>
@@ -92,35 +80,31 @@ define i64 @test_v4f64(<4 x double> %a0, <4 x double> %a1) {
   ret i64 %5
 }
 
-define i64 @test_v4f64_legal(<4 x double> %a0, <4 x double> %a1) {
-; SSE-LABEL: test_v4f64_legal:
+define i64 @test_v4f64_legal_sext(<4 x double> %a0, <4 x double> %a1) {
+; SSE-LABEL: test_v4f64_legal_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpltpd %xmm1, %xmm3
 ; SSE-NEXT:    cmpltpd %xmm0, %xmm2
 ; SSE-NEXT:    packsswb %xmm3, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT:    por %xmm2, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    movmskps %xmm2, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    cltq
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v4f64_legal:
+; AVX-LABEL: test_v4f64_legal_sext:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vmovmskps %xmm0, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    sbbl %eax, %eax
 ; AVX-NEXT:    cltq
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f64_legal:
+; AVX512-LABEL: test_v4f64_legal_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vcmpltpd %ymm0, %ymm1, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -131,6 +115,7 @@ define i64 @test_v4f64_legal(<4 x double> %a0, <4 x double> %a1) {
 ; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    cltq
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c = fcmp ogt <4 x double> %a0, %a1
   %s = sext <4 x i1> %c to <4 x i32>
@@ -143,28 +128,24 @@ define i64 @test_v4f64_legal(<4 x double> %a0, <4 x double> %a1) {
   ret i64 %6
 }
 
-define i32 @test_v4f32(<4 x float> %a0, <4 x float> %a1) {
-; SSE-LABEL: test_v4f32:
+define i32 @test_v4f32_sext(<4 x float> %a0, <4 x float> %a1) {
+; SSE-LABEL: test_v4f32_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpltps %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
-; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    movmskps %xmm1, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v4f32:
+; AVX-LABEL: test_v4f32_sext:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpltps %xmm0, %xmm1, %xmm0
-; AVX-NEXT:    vpermilpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT:    vorpd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vmovmskps %xmm0, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    sbbl %eax, %eax
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4f32:
+; AVX512-LABEL: test_v4f32_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vcmpltps %xmm0, %xmm1, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -185,46 +166,27 @@ define i32 @test_v4f32(<4 x float> %a0, <4 x float> %a1) {
   ret i32 %5
 }
 
-define i32 @test_v8f32(<8 x float> %a0, <8 x float> %a1) {
-; SSE-LABEL: test_v8f32:
+define i32 @test_v8f32_sext(<8 x float> %a0, <8 x float> %a1) {
+; SSE-LABEL: test_v8f32_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpltps %xmm1, %xmm3
 ; SSE-NEXT:    cmpltps %xmm0, %xmm2
 ; SSE-NEXT:    orps %xmm3, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT:    por %xmm2, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    movmskps %xmm2, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_v8f32:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    vzeroupper
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: test_v8f32:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
-; AVX2-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    vzeroupper
-; AVX2-NEXT:    retq
+; AVX-LABEL: test_v8f32_sext:
+; AVX:       # BB#0:
+; AVX-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
+; AVX-NEXT:    vmovmskps %ymm0, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    sbbl %eax, %eax
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v8f32:
+; AVX512-LABEL: test_v8f32_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vcmpltps %ymm0, %ymm1, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
@@ -236,6 +198,7 @@ define i32 @test_v8f32(<8 x float> %a0, <8 x float> %a1) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c = fcmp ogt <8 x float> %a0, %a1
   %s = sext <8 x i1> %c to <8 x i32>
@@ -249,40 +212,29 @@ define i32 @test_v8f32(<8 x float> %a0, <8 x float> %a1) {
   ret i32 %7
 }
 
-define i32 @test_v8f32_legal(<8 x float> %a0, <8 x float> %a1) {
-; SSE-LABEL: test_v8f32_legal:
+define i32 @test_v8f32_legal_sext(<8 x float> %a0, <8 x float> %a1) {
+; SSE-LABEL: test_v8f32_legal_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    cmpltps %xmm1, %xmm3
 ; SSE-NEXT:    cmpltps %xmm0, %xmm2
 ; SSE-NEXT:    packsswb %xmm3, %xmm2
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
-; SSE-NEXT:    por %xmm2, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrld $16, %xmm0
-; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
-; SSE-NEXT:    cwtl
+; SSE-NEXT:    pmovmskb %xmm2, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v8f32_legal:
+; AVX-LABEL: test_v8f32_legal_sext:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
 ; AVX-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
-; AVX-NEXT:    cwtl
+; AVX-NEXT:    vpmovmskb %xmm0, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    sbbl %eax, %eax
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v8f32_legal:
+; AVX512-LABEL: test_v8f32_legal_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vcmpltps %ymm0, %ymm1, %k0
 ; AVX512-NEXT:    vpmovm2w %k0, %xmm0
@@ -294,6 +246,7 @@ define i32 @test_v8f32_legal(<8 x float> %a0, <8 x float> %a1) {
 ; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    cwtl
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c = fcmp ogt <8 x float> %a0, %a1
   %s = sext <8 x i1> %c to <8 x i16>
@@ -308,8 +261,8 @@ define i32 @test_v8f32_legal(<8 x float> %a0, <8 x float> %a1) {
   ret i32 %8
 }
 
-define i64 @test_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
-; SSE-LABEL: test_v2i64:
+define i64 @test_v2i64_sext(<2 x i64> %a0, <2 x i64> %a1) {
+; SSE-LABEL: test_v2i64_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtq %xmm1, %xmm0
 ; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -317,7 +270,7 @@ define i64 @test_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
 ; SSE-NEXT:    movd %xmm1, %rax
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v2i64:
+; AVX-LABEL: test_v2i64_sext:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
@@ -325,7 +278,7 @@ define i64 @test_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
 ; AVX-NEXT:    vmovq %xmm0, %rax
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v2i64:
+; AVX512-LABEL: test_v2i64_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtq %xmm1, %xmm0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -342,8 +295,8 @@ define i64 @test_v2i64(<2 x i64> %a0, <2 x i64> %a1) {
   ret i64 %3
 }
 
-define i64 @test_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
-; SSE-LABEL: test_v4i64:
+define i64 @test_v4i64_sext(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE-LABEL: test_v4i64_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtq %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtq %xmm2, %xmm0
@@ -353,32 +306,29 @@ define i64 @test_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
 ; SSE-NEXT:    movd %xmm1, %rax
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_v4i64:
+; AVX1-LABEL: test_v4i64_sext:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    vmovmskpd %ymm0, %eax
+; AVX1-NEXT:    negl %eax
+; AVX1-NEXT:    sbbq %rax, %rax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: test_v4i64:
+; AVX2-LABEL: test_v4i64_sext:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    vmovmskpd %ymm0, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    sbbq %rax, %rax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4i64:
+; AVX512-LABEL: test_v4i64_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtq %ymm1, %ymm0, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
@@ -388,6 +338,7 @@ define i64 @test_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovq %xmm0, %rax
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c = icmp sgt <4 x i64> %a0, %a1
   %s = sext <4 x i1> %c to <4 x i64>
@@ -399,51 +350,45 @@ define i64 @test_v4i64(<4 x i64> %a0, <4 x i64> %a1) {
   ret i64 %5
 }
 
-define i64 @test_v4i64_legal(<4 x i64> %a0, <4 x i64> %a1) {
-; SSE-LABEL: test_v4i64_legal:
+define i64 @test_v4i64_legal_sext(<4 x i64> %a0, <4 x i64> %a1) {
+; SSE-LABEL: test_v4i64_legal_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtq %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtq %xmm2, %xmm0
 ; SSE-NEXT:    packsswb %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    movmskps %xmm0, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    cltq
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_v4i64_legal:
+; AVX1-LABEL: test_v4i64_legal_sext:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vmovmskps %xmm0, %eax
+; AVX1-NEXT:    negl %eax
+; AVX1-NEXT:    sbbl %eax, %eax
 ; AVX1-NEXT:    cltq
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: test_v4i64_legal:
+; AVX2-LABEL: test_v4i64_legal_sext:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vmovmskps %xmm0, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    sbbl %eax, %eax
 ; AVX2-NEXT:    cltq
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4i64_legal:
+; AVX512-LABEL: test_v4i64_legal_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtq %ymm1, %ymm0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -454,6 +399,7 @@ define i64 @test_v4i64_legal(<4 x i64> %a0, <4 x i64> %a1) {
 ; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    cltq
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c = icmp sgt <4 x i64> %a0, %a1
   %s = sext <4 x i1> %c to <4 x i32>
@@ -466,28 +412,24 @@ define i64 @test_v4i64_legal(<4 x i64> %a0, <4 x i64> %a1) {
   ret i64 %6
 }
 
-define i32 @test_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
-; SSE-LABEL: test_v4i32:
+define i32 @test_v4i32_sext(<4 x i32> %a0, <4 x i32> %a1) {
+; SSE-LABEL: test_v4i32_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtd %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    movmskps %xmm0, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v4i32:
+; AVX-LABEL: test_v4i32_sext:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vmovmskps %xmm0, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    sbbl %eax, %eax
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v4i32:
+; AVX512-LABEL: test_v4i32_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtd %xmm1, %xmm0, %k1
 ; AVX512-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
@@ -508,49 +450,40 @@ define i32 @test_v4i32(<4 x i32> %a0, <4 x i32> %a1) {
   ret i32 %5
 }
 
-define i32 @test_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
-; SSE-LABEL: test_v8i32:
+define i32 @test_v8i32_sext(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE-LABEL: test_v8i32_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtd %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtd %xmm2, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    movd %xmm0, %eax
+; SSE-NEXT:    movmskps %xmm0, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_v8i32:
+; AVX1-LABEL: test_v8i32_sext:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vorps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
+; AVX1-NEXT:    vmovmskps %ymm0, %eax
+; AVX1-NEXT:    negl %eax
+; AVX1-NEXT:    sbbl %eax, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: test_v8i32:
+; AVX2-LABEL: test_v8i32_sext:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vmovmskps %ymm0, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    sbbl %eax, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_v8i32:
+; AVX512-LABEL: test_v8i32_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k1
 ; AVX512-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
@@ -562,6 +495,7 @@ define i32 @test_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
 ; AVX512-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c = icmp sgt <8 x i32> %a0, %a1
   %s = sext <8 x i1> %c to <8 x i32>
@@ -575,58 +509,42 @@ define i32 @test_v8i32(<8 x i32> %a0, <8 x i32> %a1) {
   ret i32 %7
 }
 
-define i32 @test_v8i32_legal(<8 x i32> %a0, <8 x i32> %a1) {
-; SSE-LABEL: test_v8i32_legal:
+define i32 @test_v8i32_legal_sext(<8 x i32> %a0, <8 x i32> %a1) {
+; SSE-LABEL: test_v8i32_legal_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtd %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtd %xmm2, %xmm0
 ; SSE-NEXT:    packsswb %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
-; SSE-NEXT:    cwtl
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_v8i32_legal:
+; AVX1-LABEL: test_v8i32_legal_sext:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtd %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtd %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %xmm0, %eax
-; AVX1-NEXT:    cwtl
+; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    negl %eax
+; AVX1-NEXT:    sbbl %eax, %eax
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: test_v8i32_legal:
+; AVX2-LABEL: test_v8i32_legal_sext:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    cwtl
+; AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    sbbl %eax, %eax
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_v8i32_legal:
+; AVX512-LABEL: test_v8i32_legal_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %k0
 ; AVX512-NEXT:    vpmovm2w %k0, %xmm0
@@ -638,6 +556,7 @@ define i32 @test_v8i32_legal(<8 x i32> %a0, <8 x i32> %a1) {
 ; AVX512-NEXT:    vpor %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    cwtl
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c = icmp sgt <8 x i32> %a0, %a1
   %s = sext <8 x i1> %c to <8 x i16>
@@ -652,35 +571,26 @@ define i32 @test_v8i32_legal(<8 x i32> %a0, <8 x i32> %a1) {
   ret i32 %8
 }
 
-define i16 @test_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
-; SSE-LABEL: test_v8i16:
+define i16 @test_v8i16_sext(<8 x i16> %a0, <8 x i16> %a1) {
+; SSE-LABEL: test_v8i16_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtw %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v8i16:
+; AVX-LABEL: test_v8i16_sext:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vmovd %xmm0, %eax
+; AVX-NEXT:    vpmovmskb %xmm0, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    sbbl %eax, %eax
 ; AVX-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v8i16:
+; AVX512-LABEL: test_v8i16_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtw %xmm1, %xmm0, %k0
 ; AVX512-NEXT:    vpmovm2w %k0, %xmm0
@@ -705,24 +615,19 @@ define i16 @test_v8i16(<8 x i16> %a0, <8 x i16> %a1) {
   ret i16 %7
 }
 
-define i16 @test_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
-; SSE-LABEL: test_v16i16:
+define i16 @test_v16i16_sext(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE-LABEL: test_v16i16_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtw %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtw %xmm2, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movd %xmm1, %eax
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_v16i16:
+; AVX1-LABEL: test_v16i16_sext:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
@@ -741,23 +646,17 @@ define i16 @test_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: test_v16i16:
+; AVX2-LABEL: test_v16i16_sext:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vmovd %xmm0, %eax
+; AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    sbbl %eax, %eax
 ; AVX2-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_v16i16:
+; AVX512-LABEL: test_v16i16_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0
 ; AVX512-NEXT:    vpmovm2w %k0, %ymm0
@@ -771,6 +670,7 @@ define i16 @test_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vmovd %xmm0, %eax
 ; AVX512-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c = icmp sgt <16 x i16> %a0, %a1
   %s = sext <16 x i1> %c to <16 x i16>
@@ -786,68 +686,45 @@ define i16 @test_v16i16(<16 x i16> %a0, <16 x i16> %a1) {
   ret i16 %9
 }
 
-define i16 @test_v16i16_legal(<16 x i16> %a0, <16 x i16> %a1) {
-; SSE-LABEL: test_v16i16_legal:
+define i16 @test_v16i16_legal_sext(<16 x i16> %a0, <16 x i16> %a1) {
+; SSE-LABEL: test_v16i16_legal_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtw %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtw %xmm2, %xmm0
 ; SSE-NEXT:    packsswb %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrlw $8, %xmm0
-; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    pextrb $0, %xmm0, %eax
-; SSE-NEXT:    movsbl %al, %eax
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_v16i16_legal:
+; AVX1-LABEL: test_v16i16_legal_sext:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtw %xmm2, %xmm3, %xmm2
 ; AVX1-NEXT:    vpcmpgtw %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpacksswb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX1-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX1-NEXT:    movsbl %al, %eax
+; AVX1-NEXT:    vpmovmskb %xmm0, %eax
+; AVX1-NEXT:    negl %eax
+; AVX1-NEXT:    sbbl %eax, %eax
 ; AVX1-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: test_v16i16_legal:
+; AVX2-LABEL: test_v16i16_legal_sext:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpacksswb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
-; AVX2-NEXT:    movsbl %al, %eax
+; AVX2-NEXT:    vpmovmskb %xmm0, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    sbbl %eax, %eax
 ; AVX2-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_v16i16_legal:
+; AVX512-LABEL: test_v16i16_legal_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtw %ymm1, %ymm0, %k0
 ; AVX512-NEXT:    vpmovm2b %k0, %xmm0
@@ -862,6 +739,7 @@ define i16 @test_v16i16_legal(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    movsbl %al, %eax
 ; AVX512-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c  = icmp sgt <16 x i16> %a0, %a1
   %s  = sext <16 x i1> %c to <16 x i8>
@@ -878,40 +756,26 @@ define i16 @test_v16i16_legal(<16 x i16> %a0, <16 x i16> %a1) {
   ret i16 %10
 }
 
-define i8 @test_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
-; SSE-LABEL: test_v16i8:
+define i8 @test_v16i8_sext(<16 x i8> %a0, <16 x i8> %a1) {
+; SSE-LABEL: test_v16i8_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtb %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrlw $8, %xmm0
-; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    pextrb $0, %xmm0, %eax
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; SSE-NEXT:    retq
 ;
-; AVX-LABEL: test_v16i8:
+; AVX-LABEL: test_v16i8_sext:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vpcmpgtb %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
-; AVX-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX-NEXT:    vpmovmskb %xmm0, %eax
+; AVX-NEXT:    negl %eax
+; AVX-NEXT:    sbbl %eax, %eax
 ; AVX-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; AVX-NEXT:    retq
 ;
-; AVX512-LABEL: test_v16i8:
+; AVX512-LABEL: test_v16i8_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtb %xmm1, %xmm0, %k0
 ; AVX512-NEXT:    vpmovm2b %k0, %xmm0
@@ -940,27 +804,19 @@ define i8 @test_v16i8(<16 x i8> %a0, <16 x i8> %a1) {
   ret i8 %9
 }
 
-define i8 @test_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
-; SSE-LABEL: test_v32i8:
+define i8 @test_v32i8_sext(<32 x i8> %a0, <32 x i8> %a1) {
+; SSE-LABEL: test_v32i8_sext:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    pcmpgtb %xmm3, %xmm1
 ; SSE-NEXT:    pcmpgtb %xmm2, %xmm0
 ; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3]
-; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    movdqa %xmm0, %xmm1
-; SSE-NEXT:    psrld $16, %xmm1
-; SSE-NEXT:    por %xmm0, %xmm1
-; SSE-NEXT:    movdqa %xmm1, %xmm0
-; SSE-NEXT:    psrlw $8, %xmm0
-; SSE-NEXT:    por %xmm1, %xmm0
-; SSE-NEXT:    pextrb $0, %xmm0, %eax
+; SSE-NEXT:    pmovmskb %xmm0, %eax
+; SSE-NEXT:    negl %eax
+; SSE-NEXT:    sbbl %eax, %eax
 ; SSE-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; SSE-NEXT:    retq
 ;
-; AVX1-LABEL: test_v32i8:
+; AVX1-LABEL: test_v32i8_sext:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
@@ -981,25 +837,17 @@ define i8 @test_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: test_v32i8:
+; AVX2-LABEL: test_v32i8_sext:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpcmpgtb %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpsrld $16, %xmm0, %xmm1
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpsrlw $8, %xmm0, %xmm1
-; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpextrb $0, %xmm0, %eax
+; AVX2-NEXT:    vpmovmskb %ymm0, %eax
+; AVX2-NEXT:    negl %eax
+; AVX2-NEXT:    sbbl %eax, %eax
 ; AVX2-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: test_v32i8:
+; AVX512-LABEL: test_v32i8_sext:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpcmpgtb %ymm1, %ymm0, %k0
 ; AVX512-NEXT:    vpmovm2b %k0, %ymm0
@@ -1015,6 +863,7 @@ define i8 @test_v32i8(<32 x i8> %a0, <32 x i8> %a1) {
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpextrb $0, %xmm0, %eax
 ; AVX512-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %c  = icmp sgt <32 x i8> %a0, %a1
   %s  = sext <32 x i1> %c to <32 x i8>
diff --git a/test/CodeGen/X86/vector-compare-results.ll b/test/CodeGen/X86/vector-compare-results.ll
index c34f333ef785b2c3ca4257de228ea8a1eaf88075..4fa9596192a60a2414dbcefaeef65afe8e5170cc 100644
--- a/test/CodeGen/X86/vector-compare-results.ll
+++ b/test/CodeGen/X86/vector-compare-results.ll
@@ -146,6 +146,7 @@ define <4 x i1> @test_cmp_v4f64(<4 x double> %a0, <4 x double> %a1) nounwind {
 ; AVX512-NEXT:    vcmpltpd %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = fcmp ogt <4 x double> %a0, %a1
   ret <4 x i1> %1
@@ -181,6 +182,7 @@ define <8 x i1> @test_cmp_v8f32(<8 x float> %a0, <8 x float> %a1) nounwind {
 ; AVX512-NEXT:    vcmpltps %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = fcmp ogt <8 x float> %a0, %a1
   ret <8 x i1> %1
@@ -243,6 +245,7 @@ define <4 x i1> @test_cmp_v4i64(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; AVX512-NEXT:    vpcmpgtq %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = icmp sgt <4 x i64> %a0, %a1
   ret <4 x i1> %1
@@ -279,6 +282,7 @@ define <8 x i1> @test_cmp_v8i32(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX512-NEXT:    vpcmpgtd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = icmp sgt <8 x i32> %a0, %a1
   ret <8 x i1> %1
@@ -315,6 +319,7 @@ define <16 x i1> @test_cmp_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
 ; AVX512F-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_cmp_v16i16:
@@ -322,6 +327,7 @@ define <16 x i1> @test_cmp_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
 ; AVX512DQ-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_cmp_v16i16:
@@ -329,6 +335,7 @@ define <16 x i1> @test_cmp_v16i16(<16 x i16> %a0, <16 x i16> %a1) nounwind {
 ; AVX512BW-NEXT:    vpcmpgtw %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = icmp sgt <16 x i16> %a0, %a1
   ret <16 x i1> %1
@@ -343,98 +350,98 @@ define <32 x i1> @test_cmp_v32i8(<32 x i8> %a0, <32 x i8> %a1) nounwind {
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, 2(%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, (%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movq %rdi, %rax
@@ -610,6 +617,7 @@ define <8 x i1> @test_cmp_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
 ; AVX512F-NEXT:    vcmpltpd %zmm0, %zmm1, %k1
 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_cmp_v8f64:
@@ -617,13 +625,15 @@ define <8 x i1> @test_cmp_v8f64(<8 x double> %a0, <8 x double> %a1) nounwind {
 ; AVX512DQ-NEXT:    vcmpltpd %zmm0, %zmm1, %k0
 ; AVX512DQ-NEXT:    vpmovm2q %k0, %zmm0
 ; AVX512DQ-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_cmp_v8f64:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vcmpltpd %zmm0, %zmm1, %k1
-; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT:    vcmpltpd %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
+; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = fcmp ogt <8 x double> %a0, %a1
   ret <8 x i1> %1
@@ -670,6 +680,7 @@ define <16 x i1> @test_cmp_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
 ; AVX512F-NEXT:    vcmpltps %zmm0, %zmm1, %k1
 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_cmp_v16f32:
@@ -677,13 +688,15 @@ define <16 x i1> @test_cmp_v16f32(<16 x float> %a0, <16 x float> %a1) nounwind {
 ; AVX512DQ-NEXT:    vcmpltps %zmm0, %zmm1, %k0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_cmp_v16f32:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vcmpltps %zmm0, %zmm1, %k1
-; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vcmpltps %zmm0, %zmm1, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = fcmp ogt <16 x float> %a0, %a1
   ret <16 x i1> %1
@@ -781,6 +794,7 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; AVX512F-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_cmp_v8i64:
@@ -788,13 +802,15 @@ define <8 x i1> @test_cmp_v8i64(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; AVX512DQ-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
 ; AVX512DQ-NEXT:    vpmovm2q %k0, %zmm0
 ; AVX512DQ-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_cmp_v8i64:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpcmpgtq %zmm1, %zmm0, %k1
-; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT:    vpcmpgtq %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
+; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = icmp sgt <8 x i64> %a0, %a1
   ret <8 x i1> %1
@@ -844,6 +860,7 @@ define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
 ; AVX512F-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_cmp_v16i32:
@@ -851,13 +868,15 @@ define <16 x i1> @test_cmp_v16i32(<16 x i32> %a0, <16 x i32> %a1) nounwind {
 ; AVX512DQ-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
 ; AVX512DQ-NEXT:    vpmovm2d %k0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_cmp_v16i32:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpcmpgtd %zmm1, %zmm0, %k1
-; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vpcmpgtd %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = icmp sgt <16 x i32> %a0, %a1
   ret <16 x i1> %1
@@ -881,98 +900,98 @@ define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, 2(%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, (%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movq %rdi, %rax
@@ -980,108 +999,101 @@ define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ;
 ; SSE42-LABEL: test_cmp_v32i16:
 ; SSE42:       # BB#0:
-; SSE42-NEXT:    pcmpgtw %xmm5, %xmm1
-; SSE42-NEXT:    movdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE42-NEXT:    pshufb %xmm5, %xmm1
 ; SSE42-NEXT:    pcmpgtw %xmm4, %xmm0
-; SSE42-NEXT:    pshufb %xmm5, %xmm0
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE42-NEXT:    pcmpgtw %xmm7, %xmm3
-; SSE42-NEXT:    pshufb %xmm5, %xmm3
+; SSE42-NEXT:    pcmpgtw %xmm5, %xmm1
 ; SSE42-NEXT:    pcmpgtw %xmm6, %xmm2
-; SSE42-NEXT:    pshufb %xmm5, %xmm2
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE42-NEXT:    pextrb $15, %xmm2, %eax
+; SSE42-NEXT:    pcmpgtw %xmm7, %xmm3
+; SSE42-NEXT:    pextrb $14, %xmm3, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $14, %xmm2, %eax
+; SSE42-NEXT:    pextrb $12, %xmm3, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $13, %xmm2, %eax
+; SSE42-NEXT:    pextrb $10, %xmm3, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $12, %xmm2, %eax
+; SSE42-NEXT:    pextrb $8, %xmm3, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $11, %xmm2, %eax
+; SSE42-NEXT:    pextrb $6, %xmm3, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $10, %xmm2, %eax
+; SSE42-NEXT:    pextrb $4, %xmm3, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $9, %xmm2, %eax
+; SSE42-NEXT:    pextrb $2, %xmm3, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $8, %xmm2, %eax
+; SSE42-NEXT:    pextrb $0, %xmm3, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $7, %xmm2, %eax
+; SSE42-NEXT:    pextrb $14, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $6, %xmm2, %eax
+; SSE42-NEXT:    pextrb $12, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $5, %xmm2, %eax
+; SSE42-NEXT:    pextrb $10, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $4, %xmm2, %eax
+; SSE42-NEXT:    pextrb $8, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $3, %xmm2, %eax
+; SSE42-NEXT:    pextrb $6, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm2, %eax
+; SSE42-NEXT:    pextrb $4, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm2, %eax
+; SSE42-NEXT:    pextrb $2, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
 ; SSE42-NEXT:    pextrb $0, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $15, %xmm0, %eax
+; SSE42-NEXT:    pextrb $14, %xmm1, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $14, %xmm0, %eax
+; SSE42-NEXT:    pextrb $12, %xmm1, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $13, %xmm0, %eax
+; SSE42-NEXT:    pextrb $10, %xmm1, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $12, %xmm0, %eax
+; SSE42-NEXT:    pextrb $8, %xmm1, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $11, %xmm0, %eax
+; SSE42-NEXT:    pextrb $6, %xmm1, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $10, %xmm0, %eax
+; SSE42-NEXT:    pextrb $4, %xmm1, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $9, %xmm0, %eax
+; SSE42-NEXT:    pextrb $2, %xmm1, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $8, %xmm0, %eax
+; SSE42-NEXT:    pextrb $0, %xmm1, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $7, %xmm0, %eax
+; SSE42-NEXT:    pextrb $14, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $6, %xmm0, %eax
+; SSE42-NEXT:    pextrb $12, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $5, %xmm0, %eax
+; SSE42-NEXT:    pextrb $10, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $4, %xmm0, %eax
+; SSE42-NEXT:    pextrb $8, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $3, %xmm0, %eax
+; SSE42-NEXT:    pextrb $6, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm0, %eax
+; SSE42-NEXT:    pextrb $4, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm0, %eax
+; SSE42-NEXT:    pextrb $2, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
 ; SSE42-NEXT:    pextrb $0, %xmm0, %eax
@@ -1137,10 +1149,9 @@ define <32 x i1> @test_cmp_v32i16(<32 x i16> %a0, <32 x i16> %a1) nounwind {
 ;
 ; AVX512BW-LABEL: test_cmp_v32i16:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpcmpgtw %zmm1, %zmm0, %k1
-; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpcmpgtw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
 ; AVX512BW-NEXT:    retq
   %1 = icmp sgt <32 x i16> %a0, %a1
   ret <32 x i1> %1
@@ -1157,196 +1168,196 @@ define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 6(%rdi)
-; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, 6(%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, 4(%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, 2(%rdi)
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, (%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movq %rdi, %rax
@@ -1969,6 +1980,7 @@ define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; AVX512F-NEXT:    vextracti128 $1, %ymm4, %xmm3
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX512F-NEXT:    vmovdqa %xmm4, %xmm2
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_cmp_v64i8:
@@ -1979,6 +1991,7 @@ define <64 x i1> @test_cmp_v64i8(<64 x i8> %a0, <64 x i8> %a1) nounwind {
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm4, %xmm3
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX512DQ-NEXT:    vmovdqa %xmm4, %xmm2
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_cmp_v64i8:
@@ -2173,6 +2186,7 @@ define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind
 ; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_cmp_v16f64:
@@ -2288,6 +2302,7 @@ define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind
 ; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_cmp_v16f64:
@@ -2403,6 +2418,7 @@ define <16 x i1> @test_cmp_v16f64(<16 x double> %a0, <16 x double> %a1) nounwind
 ; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = fcmp ogt <16 x double> %a0, %a1
   ret <16 x i1> %1
@@ -2474,98 +2490,98 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind {
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, 2(%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, (%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movq %rdi, %rax
@@ -2573,141 +2589,134 @@ define <32 x i1> @test_cmp_v32f32(<32 x float> %a0, <32 x float> %a1) nounwind {
 ;
 ; SSE42-LABEL: test_cmp_v32f32:
 ; SSE42:       # BB#0:
-; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm9
 ; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm11
-; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm10
 ; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm12
-; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm8
-; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
+; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm10
 ; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm13
+; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm9
+; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm14
+; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm8
 ; SSE42-NEXT:    movaps {{[0-9]+}}(%rsp), %xmm15
-; SSE42-NEXT:    cmpltps %xmm3, %xmm15
-; SSE42-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE42-NEXT:    pshufb %xmm3, %xmm15
-; SSE42-NEXT:    cmpltps %xmm2, %xmm13
-; SSE42-NEXT:    pshufb %xmm3, %xmm13
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm13 = xmm13[0],xmm15[0]
-; SSE42-NEXT:    psllw $15, %xmm13
-; SSE42-NEXT:    psraw $15, %xmm13
-; SSE42-NEXT:    movdqa {{.*#+}} xmm2 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE42-NEXT:    pshufb %xmm2, %xmm13
-; SSE42-NEXT:    cmpltps %xmm1, %xmm14
-; SSE42-NEXT:    pshufb %xmm3, %xmm14
+; SSE42-NEXT:    cmpltps %xmm1, %xmm15
+; SSE42-NEXT:    movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE42-NEXT:    pshufb %xmm1, %xmm15
 ; SSE42-NEXT:    cmpltps %xmm0, %xmm8
-; SSE42-NEXT:    pshufb %xmm3, %xmm8
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm14[0]
+; SSE42-NEXT:    pshufb %xmm1, %xmm8
+; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm15[0]
 ; SSE42-NEXT:    psllw $15, %xmm8
 ; SSE42-NEXT:    psraw $15, %xmm8
-; SSE42-NEXT:    pshufb %xmm2, %xmm8
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm13[0]
-; SSE42-NEXT:    cmpltps %xmm7, %xmm12
-; SSE42-NEXT:    pshufb %xmm3, %xmm12
-; SSE42-NEXT:    cmpltps %xmm6, %xmm10
-; SSE42-NEXT:    pshufb %xmm3, %xmm10
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm12[0]
-; SSE42-NEXT:    psllw $15, %xmm10
-; SSE42-NEXT:    psraw $15, %xmm10
-; SSE42-NEXT:    pshufb %xmm2, %xmm10
-; SSE42-NEXT:    cmpltps %xmm5, %xmm11
-; SSE42-NEXT:    pshufb %xmm3, %xmm11
-; SSE42-NEXT:    cmpltps %xmm4, %xmm9
-; SSE42-NEXT:    pshufb %xmm3, %xmm9
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm11[0]
+; SSE42-NEXT:    cmpltps %xmm3, %xmm14
+; SSE42-NEXT:    pshufb %xmm1, %xmm14
+; SSE42-NEXT:    cmpltps %xmm2, %xmm9
+; SSE42-NEXT:    pshufb %xmm1, %xmm9
+; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm14[0]
 ; SSE42-NEXT:    psllw $15, %xmm9
 ; SSE42-NEXT:    psraw $15, %xmm9
-; SSE42-NEXT:    pshufb %xmm2, %xmm9
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0]
-; SSE42-NEXT:    pextrb $15, %xmm9, %eax
+; SSE42-NEXT:    cmpltps %xmm5, %xmm13
+; SSE42-NEXT:    pshufb %xmm1, %xmm13
+; SSE42-NEXT:    cmpltps %xmm4, %xmm10
+; SSE42-NEXT:    pshufb %xmm1, %xmm10
+; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm10 = xmm10[0],xmm13[0]
+; SSE42-NEXT:    psllw $15, %xmm10
+; SSE42-NEXT:    psraw $15, %xmm10
+; SSE42-NEXT:    cmpltps %xmm7, %xmm12
+; SSE42-NEXT:    pshufb %xmm1, %xmm12
+; SSE42-NEXT:    cmpltps %xmm6, %xmm11
+; SSE42-NEXT:    pshufb %xmm1, %xmm11
+; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm11 = xmm11[0],xmm12[0]
+; SSE42-NEXT:    psllw $15, %xmm11
+; SSE42-NEXT:    psraw $15, %xmm11
+; SSE42-NEXT:    pextrb $14, %xmm11, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $14, %xmm9, %eax
+; SSE42-NEXT:    pextrb $12, %xmm11, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $13, %xmm9, %eax
+; SSE42-NEXT:    pextrb $10, %xmm11, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $12, %xmm9, %eax
+; SSE42-NEXT:    pextrb $8, %xmm11, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $11, %xmm9, %eax
+; SSE42-NEXT:    pextrb $6, %xmm11, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $10, %xmm9, %eax
+; SSE42-NEXT:    pextrb $4, %xmm11, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $9, %xmm9, %eax
+; SSE42-NEXT:    pextrb $2, %xmm11, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $8, %xmm9, %eax
+; SSE42-NEXT:    pextrb $0, %xmm11, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $7, %xmm9, %eax
+; SSE42-NEXT:    pextrb $14, %xmm10, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $6, %xmm9, %eax
+; SSE42-NEXT:    pextrb $12, %xmm10, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $5, %xmm9, %eax
+; SSE42-NEXT:    pextrb $10, %xmm10, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $4, %xmm9, %eax
+; SSE42-NEXT:    pextrb $8, %xmm10, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $3, %xmm9, %eax
+; SSE42-NEXT:    pextrb $6, %xmm10, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm9, %eax
+; SSE42-NEXT:    pextrb $4, %xmm10, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm9, %eax
+; SSE42-NEXT:    pextrb $2, %xmm10, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $0, %xmm9, %eax
+; SSE42-NEXT:    pextrb $0, %xmm10, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $15, %xmm8, %eax
+; SSE42-NEXT:    pextrb $14, %xmm9, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $14, %xmm8, %eax
+; SSE42-NEXT:    pextrb $12, %xmm9, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $13, %xmm8, %eax
+; SSE42-NEXT:    pextrb $10, %xmm9, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $12, %xmm8, %eax
+; SSE42-NEXT:    pextrb $8, %xmm9, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $11, %xmm8, %eax
+; SSE42-NEXT:    pextrb $6, %xmm9, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $10, %xmm8, %eax
+; SSE42-NEXT:    pextrb $4, %xmm9, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $9, %xmm8, %eax
+; SSE42-NEXT:    pextrb $2, %xmm9, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $8, %xmm8, %eax
+; SSE42-NEXT:    pextrb $0, %xmm9, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $7, %xmm8, %eax
+; SSE42-NEXT:    pextrb $14, %xmm8, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $6, %xmm8, %eax
+; SSE42-NEXT:    pextrb $12, %xmm8, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $5, %xmm8, %eax
+; SSE42-NEXT:    pextrb $10, %xmm8, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $4, %xmm8, %eax
+; SSE42-NEXT:    pextrb $8, %xmm8, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $3, %xmm8, %eax
+; SSE42-NEXT:    pextrb $6, %xmm8, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm8, %eax
+; SSE42-NEXT:    pextrb $4, %xmm8, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm8, %eax
+; SSE42-NEXT:    pextrb $2, %xmm8, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
 ; SSE42-NEXT:    pextrb $0, %xmm8, %eax
@@ -3652,6 +3661,7 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
 ; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_cmp_v16i64:
@@ -3783,6 +3793,7 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
 ; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_cmp_v16i64:
@@ -3914,6 +3925,7 @@ define <16 x i1> @test_cmp_v16i64(<16 x i64> %a0, <16 x i64> %a1) nounwind {
 ; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
   %1 = icmp sgt <16 x i64> %a0, %a1
   ret <16 x i1> %1
@@ -3977,98 +3989,98 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, 2(%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, (%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movq %rdi, %rax
@@ -4076,33 +4088,21 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
 ;
 ; SSE42-LABEL: test_cmp_v32i32:
 ; SSE42:       # BB#0:
-; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm3
-; SSE42-NEXT:    movdqa {{.*#+}} xmm8 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE42-NEXT:    pshufb %xmm8, %xmm3
-; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm2
-; SSE42-NEXT:    pshufb %xmm8, %xmm2
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE42-NEXT:    psllw $15, %xmm2
-; SSE42-NEXT:    psraw $15, %xmm2
-; SSE42-NEXT:    movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE42-NEXT:    pshufb %xmm3, %xmm2
 ; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm1
+; SSE42-NEXT:    movdqa {{.*#+}} xmm8 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; SSE42-NEXT:    pshufb %xmm8, %xmm1
 ; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm0
 ; SSE42-NEXT:    pshufb %xmm8, %xmm0
 ; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE42-NEXT:    psllw $15, %xmm0
 ; SSE42-NEXT:    psraw $15, %xmm0
-; SSE42-NEXT:    pshufb %xmm3, %xmm0
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm7
-; SSE42-NEXT:    pshufb %xmm8, %xmm7
-; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT:    pshufb %xmm8, %xmm6
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; SSE42-NEXT:    psllw $15, %xmm6
-; SSE42-NEXT:    psraw $15, %xmm6
-; SSE42-NEXT:    pshufb %xmm3, %xmm6
+; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT:    pshufb %xmm8, %xmm3
+; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm2
+; SSE42-NEXT:    pshufb %xmm8, %xmm2
+; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
+; SSE42-NEXT:    psllw $15, %xmm2
+; SSE42-NEXT:    psraw $15, %xmm2
 ; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm5
 ; SSE42-NEXT:    pshufb %xmm8, %xmm5
 ; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm4
@@ -4110,99 +4110,104 @@ define <32 x i1> @test_cmp_v32i32(<32 x i32> %a0, <32 x i32> %a1) nounwind {
 ; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
 ; SSE42-NEXT:    psllw $15, %xmm4
 ; SSE42-NEXT:    psraw $15, %xmm4
-; SSE42-NEXT:    pshufb %xmm3, %xmm4
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; SSE42-NEXT:    pextrb $15, %xmm4, %eax
+; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm7
+; SSE42-NEXT:    pshufb %xmm8, %xmm7
+; SSE42-NEXT:    pcmpgtd {{[0-9]+}}(%rsp), %xmm6
+; SSE42-NEXT:    pshufb %xmm8, %xmm6
+; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
+; SSE42-NEXT:    psllw $15, %xmm6
+; SSE42-NEXT:    psraw $15, %xmm6
+; SSE42-NEXT:    pextrb $14, %xmm6, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $14, %xmm4, %eax
+; SSE42-NEXT:    pextrb $12, %xmm6, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $13, %xmm4, %eax
+; SSE42-NEXT:    pextrb $10, %xmm6, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $12, %xmm4, %eax
+; SSE42-NEXT:    pextrb $8, %xmm6, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $11, %xmm4, %eax
+; SSE42-NEXT:    pextrb $6, %xmm6, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $10, %xmm4, %eax
+; SSE42-NEXT:    pextrb $4, %xmm6, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $9, %xmm4, %eax
+; SSE42-NEXT:    pextrb $2, %xmm6, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $8, %xmm4, %eax
+; SSE42-NEXT:    pextrb $0, %xmm6, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $7, %xmm4, %eax
+; SSE42-NEXT:    pextrb $14, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $6, %xmm4, %eax
+; SSE42-NEXT:    pextrb $12, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $5, %xmm4, %eax
+; SSE42-NEXT:    pextrb $10, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $4, %xmm4, %eax
+; SSE42-NEXT:    pextrb $8, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $3, %xmm4, %eax
+; SSE42-NEXT:    pextrb $6, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm4, %eax
+; SSE42-NEXT:    pextrb $4, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm4, %eax
+; SSE42-NEXT:    pextrb $2, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
 ; SSE42-NEXT:    pextrb $0, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $15, %xmm0, %eax
+; SSE42-NEXT:    pextrb $14, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $14, %xmm0, %eax
+; SSE42-NEXT:    pextrb $12, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $13, %xmm0, %eax
+; SSE42-NEXT:    pextrb $10, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $12, %xmm0, %eax
+; SSE42-NEXT:    pextrb $8, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $11, %xmm0, %eax
+; SSE42-NEXT:    pextrb $6, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $10, %xmm0, %eax
+; SSE42-NEXT:    pextrb $4, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $9, %xmm0, %eax
+; SSE42-NEXT:    pextrb $2, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $8, %xmm0, %eax
+; SSE42-NEXT:    pextrb $0, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $7, %xmm0, %eax
+; SSE42-NEXT:    pextrb $14, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $6, %xmm0, %eax
+; SSE42-NEXT:    pextrb $12, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $5, %xmm0, %eax
+; SSE42-NEXT:    pextrb $10, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $4, %xmm0, %eax
+; SSE42-NEXT:    pextrb $8, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $3, %xmm0, %eax
+; SSE42-NEXT:    pextrb $6, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm0, %eax
+; SSE42-NEXT:    pextrb $4, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm0, %eax
+; SSE42-NEXT:    pextrb $2, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
 ; SSE42-NEXT:    pextrb $0, %xmm0, %eax
@@ -4938,196 +4943,196 @@ define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind {
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 6(%rdi)
-; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
-; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, 6(%rdi)
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, 4(%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, 2(%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, (%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movq %rdi, %rax
@@ -5135,214 +5140,201 @@ define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind {
 ;
 ; SSE42-LABEL: test_cmp_v64i16:
 ; SSE42:       # BB#0:
-; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm1
-; SSE42-NEXT:    movdqa {{.*#+}} xmm8 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE42-NEXT:    pshufb %xmm8, %xmm1
 ; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    pshufb %xmm8, %xmm0
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm3
-; SSE42-NEXT:    pshufb %xmm8, %xmm3
+; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm1
 ; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm2
-; SSE42-NEXT:    pshufb %xmm8, %xmm2
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0]
-; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm5
-; SSE42-NEXT:    pshufb %xmm8, %xmm5
+; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm3
 ; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm4
-; SSE42-NEXT:    pshufb %xmm8, %xmm4
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0]
-; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm7
-; SSE42-NEXT:    pshufb %xmm8, %xmm7
+; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm5
 ; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT:    pshufb %xmm8, %xmm6
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm6 = xmm6[0],xmm7[0]
-; SSE42-NEXT:    pextrb $15, %xmm6, %eax
+; SSE42-NEXT:    pcmpgtw {{[0-9]+}}(%rsp), %xmm7
+; SSE42-NEXT:    pextrb $14, %xmm7, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 6(%rdi)
-; SSE42-NEXT:    pextrb $14, %xmm6, %eax
+; SSE42-NEXT:    pextrb $12, %xmm7, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 6(%rdi)
-; SSE42-NEXT:    pextrb $13, %xmm6, %eax
+; SSE42-NEXT:    pextrb $10, %xmm7, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 6(%rdi)
-; SSE42-NEXT:    pextrb $12, %xmm6, %eax
+; SSE42-NEXT:    pextrb $8, %xmm7, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 6(%rdi)
-; SSE42-NEXT:    pextrb $11, %xmm6, %eax
+; SSE42-NEXT:    pextrb $6, %xmm7, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 6(%rdi)
-; SSE42-NEXT:    pextrb $10, %xmm6, %eax
+; SSE42-NEXT:    pextrb $4, %xmm7, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 6(%rdi)
-; SSE42-NEXT:    pextrb $9, %xmm6, %eax
+; SSE42-NEXT:    pextrb $2, %xmm7, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 6(%rdi)
-; SSE42-NEXT:    pextrb $8, %xmm6, %eax
+; SSE42-NEXT:    pextrb $0, %xmm7, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 6(%rdi)
-; SSE42-NEXT:    pextrb $7, %xmm6, %eax
+; SSE42-NEXT:    pextrb $14, %xmm6, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 6(%rdi)
-; SSE42-NEXT:    pextrb $6, %xmm6, %eax
+; SSE42-NEXT:    pextrb $12, %xmm6, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 6(%rdi)
-; SSE42-NEXT:    pextrb $5, %xmm6, %eax
+; SSE42-NEXT:    pextrb $10, %xmm6, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 6(%rdi)
-; SSE42-NEXT:    pextrb $4, %xmm6, %eax
+; SSE42-NEXT:    pextrb $8, %xmm6, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 6(%rdi)
-; SSE42-NEXT:    pextrb $3, %xmm6, %eax
+; SSE42-NEXT:    pextrb $6, %xmm6, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 6(%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm6, %eax
+; SSE42-NEXT:    pextrb $4, %xmm6, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 6(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm6, %eax
+; SSE42-NEXT:    pextrb $2, %xmm6, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 6(%rdi)
 ; SSE42-NEXT:    pextrb $0, %xmm6, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 6(%rdi)
-; SSE42-NEXT:    pextrb $15, %xmm4, %eax
+; SSE42-NEXT:    pextrb $14, %xmm5, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 4(%rdi)
-; SSE42-NEXT:    pextrb $14, %xmm4, %eax
+; SSE42-NEXT:    pextrb $12, %xmm5, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 4(%rdi)
-; SSE42-NEXT:    pextrb $13, %xmm4, %eax
+; SSE42-NEXT:    pextrb $10, %xmm5, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 4(%rdi)
-; SSE42-NEXT:    pextrb $12, %xmm4, %eax
+; SSE42-NEXT:    pextrb $8, %xmm5, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 4(%rdi)
-; SSE42-NEXT:    pextrb $11, %xmm4, %eax
+; SSE42-NEXT:    pextrb $6, %xmm5, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 4(%rdi)
-; SSE42-NEXT:    pextrb $10, %xmm4, %eax
+; SSE42-NEXT:    pextrb $4, %xmm5, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 4(%rdi)
-; SSE42-NEXT:    pextrb $9, %xmm4, %eax
+; SSE42-NEXT:    pextrb $2, %xmm5, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 4(%rdi)
-; SSE42-NEXT:    pextrb $8, %xmm4, %eax
+; SSE42-NEXT:    pextrb $0, %xmm5, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 4(%rdi)
-; SSE42-NEXT:    pextrb $7, %xmm4, %eax
+; SSE42-NEXT:    pextrb $14, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 4(%rdi)
-; SSE42-NEXT:    pextrb $6, %xmm4, %eax
+; SSE42-NEXT:    pextrb $12, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 4(%rdi)
-; SSE42-NEXT:    pextrb $5, %xmm4, %eax
+; SSE42-NEXT:    pextrb $10, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 4(%rdi)
-; SSE42-NEXT:    pextrb $4, %xmm4, %eax
+; SSE42-NEXT:    pextrb $8, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 4(%rdi)
-; SSE42-NEXT:    pextrb $3, %xmm4, %eax
+; SSE42-NEXT:    pextrb $6, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 4(%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm4, %eax
+; SSE42-NEXT:    pextrb $4, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 4(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm4, %eax
+; SSE42-NEXT:    pextrb $2, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 4(%rdi)
 ; SSE42-NEXT:    pextrb $0, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 4(%rdi)
-; SSE42-NEXT:    pextrb $15, %xmm2, %eax
+; SSE42-NEXT:    pextrb $14, %xmm3, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $14, %xmm2, %eax
+; SSE42-NEXT:    pextrb $12, %xmm3, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $13, %xmm2, %eax
+; SSE42-NEXT:    pextrb $10, %xmm3, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $12, %xmm2, %eax
+; SSE42-NEXT:    pextrb $8, %xmm3, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $11, %xmm2, %eax
+; SSE42-NEXT:    pextrb $6, %xmm3, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $10, %xmm2, %eax
+; SSE42-NEXT:    pextrb $4, %xmm3, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $9, %xmm2, %eax
+; SSE42-NEXT:    pextrb $2, %xmm3, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $8, %xmm2, %eax
+; SSE42-NEXT:    pextrb $0, %xmm3, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $7, %xmm2, %eax
+; SSE42-NEXT:    pextrb $14, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $6, %xmm2, %eax
+; SSE42-NEXT:    pextrb $12, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $5, %xmm2, %eax
+; SSE42-NEXT:    pextrb $10, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $4, %xmm2, %eax
+; SSE42-NEXT:    pextrb $8, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $3, %xmm2, %eax
+; SSE42-NEXT:    pextrb $6, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm2, %eax
+; SSE42-NEXT:    pextrb $4, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm2, %eax
+; SSE42-NEXT:    pextrb $2, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
 ; SSE42-NEXT:    pextrb $0, %xmm2, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $15, %xmm0, %eax
+; SSE42-NEXT:    pextrb $14, %xmm1, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $14, %xmm0, %eax
+; SSE42-NEXT:    pextrb $12, %xmm1, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $13, %xmm0, %eax
+; SSE42-NEXT:    pextrb $10, %xmm1, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $12, %xmm0, %eax
+; SSE42-NEXT:    pextrb $8, %xmm1, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $11, %xmm0, %eax
+; SSE42-NEXT:    pextrb $6, %xmm1, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $10, %xmm0, %eax
+; SSE42-NEXT:    pextrb $4, %xmm1, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $9, %xmm0, %eax
+; SSE42-NEXT:    pextrb $2, %xmm1, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $8, %xmm0, %eax
+; SSE42-NEXT:    pextrb $0, %xmm1, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $7, %xmm0, %eax
+; SSE42-NEXT:    pextrb $14, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $6, %xmm0, %eax
+; SSE42-NEXT:    pextrb $12, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $5, %xmm0, %eax
+; SSE42-NEXT:    pextrb $10, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $4, %xmm0, %eax
+; SSE42-NEXT:    pextrb $8, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $3, %xmm0, %eax
+; SSE42-NEXT:    pextrb $6, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm0, %eax
+; SSE42-NEXT:    pextrb $4, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm0, %eax
+; SSE42-NEXT:    pextrb $2, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
 ; SSE42-NEXT:    pextrb $0, %xmm0, %eax
@@ -6063,6 +6055,7 @@ define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind {
 ; AVX512F-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX512F-NEXT:    # kill: %XMM2<def> %XMM2<kill> %YMM2<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_cmp_v64i16:
@@ -6349,6 +6342,7 @@ define <64 x i1> @test_cmp_v64i16(<64 x i16> %a0, <64 x i16> %a1) nounwind {
 ; AVX512DQ-NEXT:    vextracti128 $1, %ymm2, %xmm3
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX512DQ-NEXT:    # kill: %XMM2<def> %XMM2<kill> %YMM2<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_cmp_v64i16:
@@ -6780,392 +6774,392 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 14(%rdi)
-; SSE2-NEXT:    movdqa %xmm6, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 12(%rdi)
-; SSE2-NEXT:    movdqa %xmm5, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movb %al, 14(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 10(%rdi)
-; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movb %al, 14(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 8(%rdi)
-; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movb %al, 14(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
-; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movb %al, 14(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
-; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movb %al, 14(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movb %al, 14(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 14(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 14(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 12(%rdi)
+; SSE2-NEXT:    movb %al, 14(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 10(%rdi)
+; SSE2-NEXT:    movb %al, 14(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 8(%rdi)
+; SSE2-NEXT:    movb %al, 14(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 14(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 14(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, 14(%rdi)
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 14(%rdi)
+; SSE2-NEXT:    movdqa %xmm6, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 12(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 14(%rdi)
+; SSE2-NEXT:    movb %al, 12(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 12(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 10(%rdi)
+; SSE2-NEXT:    movb %al, 12(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 8(%rdi)
+; SSE2-NEXT:    movb %al, 12(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 12(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 12(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 12(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 12(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 14(%rdi)
+; SSE2-NEXT:    movb %al, 12(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 12(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 10(%rdi)
+; SSE2-NEXT:    movb %al, 12(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 8(%rdi)
+; SSE2-NEXT:    movb %al, 12(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 12(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, 12(%rdi)
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 12(%rdi)
+; SSE2-NEXT:    movdqa %xmm5, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 10(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 10(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 14(%rdi)
+; SSE2-NEXT:    movb %al, 10(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 12(%rdi)
+; SSE2-NEXT:    movb %al, 10(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 10(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 8(%rdi)
+; SSE2-NEXT:    movb %al, 10(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 10(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 10(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 10(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 10(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 14(%rdi)
+; SSE2-NEXT:    movb %al, 10(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 12(%rdi)
+; SSE2-NEXT:    movb %al, 10(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 10(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 8(%rdi)
+; SSE2-NEXT:    movb %al, 10(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, 10(%rdi)
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 10(%rdi)
+; SSE2-NEXT:    movdqa %xmm4, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 8(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 8(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 8(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 14(%rdi)
+; SSE2-NEXT:    movb %al, 8(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 12(%rdi)
+; SSE2-NEXT:    movb %al, 8(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 10(%rdi)
+; SSE2-NEXT:    movb %al, 8(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 8(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 8(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 8(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 8(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 8(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 14(%rdi)
+; SSE2-NEXT:    movb %al, 8(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 12(%rdi)
+; SSE2-NEXT:    movb %al, 8(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 10(%rdi)
+; SSE2-NEXT:    movb %al, 8(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, 8(%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 8(%rdi)
+; SSE2-NEXT:    movdqa %xmm3, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 14(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 12(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 10(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 8(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 14(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 12(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 10(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 8(%rdi)
+; SSE2-NEXT:    movb %al, 6(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, 6(%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movdqa %xmm2, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 14(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 12(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 10(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 8(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 14(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 12(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 10(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 8(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 4(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, 4(%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 14(%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 12(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 10(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 8(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 14(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 12(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 10(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 8(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, 2(%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 14(%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 12(%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 10(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 8(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 14(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 12(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 10(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 8(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 6(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 4(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, (%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movq %rdi, %rax
@@ -8416,6 +8410,7 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, (%rdi)
 ; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: test_cmp_v128i8:
@@ -8461,6 +8456,7 @@ define <128 x i1> @test_cmp_v128i8(<128 x i8> %a0, <128 x i8> %a1) nounwind {
 ; AVX512DQ-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512DQ-NEXT:    kmovw %k0, (%rdi)
 ; AVX512DQ-NEXT:    movq %rdi, %rax
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
 ;
 ; AVX512BW-LABEL: test_cmp_v128i8:
@@ -8584,98 +8580,98 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, 2(%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, (%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movq %rdi, %rax
@@ -8690,185 +8686,178 @@ define <32 x i1> @test_cmp_v32f64(<32 x double> %a0, <32 x double> %a1) nounwind
 ; SSE42-NEXT:    pushq %r12
 ; SSE42-NEXT:    pushq %rbx
 ; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm8
-; SSE42-NEXT:    cmpltpd %xmm7, %xmm8
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
-; SSE42-NEXT:    cmpltpd %xmm6, %xmm7
-; SSE42-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2],xmm8[0,2]
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT:    cmpltpd %xmm5, %xmm6
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
-; SSE42-NEXT:    cmpltpd %xmm4, %xmm5
-; SSE42-NEXT:    pslld $31, %xmm7
-; SSE42-NEXT:    psrad $31, %xmm7
-; SSE42-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2]
-; SSE42-NEXT:    movdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE42-NEXT:    pshufb %xmm4, %xmm7
-; SSE42-NEXT:    pslld $31, %xmm5
-; SSE42-NEXT:    psrad $31, %xmm5
-; SSE42-NEXT:    pshufb %xmm4, %xmm5
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm7[0]
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT:    cmpltpd %xmm3, %xmm6
+; SSE42-NEXT:    cmpltpd %xmm3, %xmm8
 ; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
 ; SSE42-NEXT:    cmpltpd %xmm2, %xmm3
+; SSE42-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm8[0,2]
 ; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
-; SSE42-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm6[0,2]
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT:    cmpltpd %xmm1, %xmm6
+; SSE42-NEXT:    cmpltpd %xmm1, %xmm2
 ; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm1
 ; SSE42-NEXT:    cmpltpd %xmm0, %xmm1
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm6[0,2]
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT:    psllw $15, %xmm5
-; SSE42-NEXT:    psraw $15, %xmm5
 ; SSE42-NEXT:    pslld $31, %xmm3
 ; SSE42-NEXT:    psrad $31, %xmm3
-; SSE42-NEXT:    pshufb %xmm4, %xmm3
+; SSE42-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm2[0,2]
+; SSE42-NEXT:    movdqa {{.*#+}} xmm8 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE42-NEXT:    pshufb %xmm8, %xmm3
 ; SSE42-NEXT:    pslld $31, %xmm1
 ; SSE42-NEXT:    psrad $31, %xmm1
-; SSE42-NEXT:    pshufb %xmm4, %xmm1
+; SSE42-NEXT:    pshufb %xmm8, %xmm1
 ; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0]
-; SSE42-NEXT:    movdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE42-NEXT:    pshufb %xmm3, %xmm5
-; SSE42-NEXT:    psllw $15, %xmm1
-; SSE42-NEXT:    psraw $15, %xmm1
-; SSE42-NEXT:    pshufb %xmm3, %xmm1
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm5[0]
+; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
+; SSE42-NEXT:    cmpltpd %xmm7, %xmm0
+; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm7
+; SSE42-NEXT:    cmpltpd %xmm6, %xmm7
+; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT:    shufps {{.*#+}} xmm7 = xmm7[0,2],xmm0[0,2]
+; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm6
+; SSE42-NEXT:    cmpltpd %xmm5, %xmm6
+; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm0
+; SSE42-NEXT:    cmpltpd %xmm4, %xmm0
+; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm2
+; SSE42-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
+; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT:    pslld $31, %xmm7
+; SSE42-NEXT:    psrad $31, %xmm7
+; SSE42-NEXT:    pshufb %xmm8, %xmm7
+; SSE42-NEXT:    pslld $31, %xmm0
+; SSE42-NEXT:    psrad $31, %xmm0
+; SSE42-NEXT:    pshufb %xmm8, %xmm0
+; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm7[0]
 ; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
 ; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm5
-; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,2],xmm5[0,2]
+; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
 ; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
 ; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm5
-; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm0
-; SSE42-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm5[0,2]
+; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm2
+; SSE42-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm5[0,2]
 ; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm5
-; SSE42-NEXT:    pslld $31, %xmm6
-; SSE42-NEXT:    psrad $31, %xmm6
-; SSE42-NEXT:    pshufb %xmm4, %xmm6
-; SSE42-NEXT:    pslld $31, %xmm0
-; SSE42-NEXT:    psrad $31, %xmm0
-; SSE42-NEXT:    pshufb %xmm4, %xmm0
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm6[0]
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm6
+; SSE42-NEXT:    pslld $31, %xmm4
+; SSE42-NEXT:    psrad $31, %xmm4
+; SSE42-NEXT:    pshufb %xmm8, %xmm4
+; SSE42-NEXT:    pslld $31, %xmm2
+; SSE42-NEXT:    psrad $31, %xmm2
+; SSE42-NEXT:    pshufb %xmm8, %xmm2
+; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm4[0]
+; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm4
 ; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm5
-; SSE42-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm6[0,2]
-; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm2
-; SSE42-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2]
+; SSE42-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm4[0,2]
+; SSE42-NEXT:    movapd {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT:    cmpltpd {{[0-9]+}}(%rsp), %xmm3
+; SSE42-NEXT:    shufps {{.*#+}} xmm3 = xmm3[0,2],xmm4[0,2]
 ; SSE42-NEXT:    pslld $31, %xmm5
 ; SSE42-NEXT:    psrad $31, %xmm5
-; SSE42-NEXT:    pshufb %xmm4, %xmm5
-; SSE42-NEXT:    pslld $31, %xmm2
-; SSE42-NEXT:    psrad $31, %xmm2
-; SSE42-NEXT:    pshufb %xmm4, %xmm2
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm5[0]
-; SSE42-NEXT:    psllw $15, %xmm0
-; SSE42-NEXT:    psraw $15, %xmm0
-; SSE42-NEXT:    pshufb %xmm3, %xmm0
+; SSE42-NEXT:    pshufb %xmm8, %xmm5
+; SSE42-NEXT:    pslld $31, %xmm3
+; SSE42-NEXT:    psrad $31, %xmm3
+; SSE42-NEXT:    pshufb %xmm8, %xmm3
+; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm5[0]
+; SSE42-NEXT:    psllw $15, %xmm3
+; SSE42-NEXT:    psraw $15, %xmm3
+; SSE42-NEXT:    pextrb $14, %xmm3, %ecx
+; SSE42-NEXT:    pextrb $12, %xmm3, %edx
+; SSE42-NEXT:    pextrb $10, %xmm3, %r8d
+; SSE42-NEXT:    pextrb $8, %xmm3, %r10d
+; SSE42-NEXT:    pextrb $6, %xmm3, %r14d
+; SSE42-NEXT:    pextrb $4, %xmm3, %r12d
+; SSE42-NEXT:    pextrb $2, %xmm3, %ebx
+; SSE42-NEXT:    pextrb $0, %xmm3, %eax
 ; SSE42-NEXT:    psllw $15, %xmm2
 ; SSE42-NEXT:    psraw $15, %xmm2
-; SSE42-NEXT:    pshufb %xmm3, %xmm2
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0]
-; SSE42-NEXT:    pextrb $15, %xmm2, %eax
-; SSE42-NEXT:    andb $1, %al
-; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $14, %xmm2, %eax
-; SSE42-NEXT:    andb $1, %al
-; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $13, %xmm2, %r8d
-; SSE42-NEXT:    pextrb $12, %xmm2, %r9d
-; SSE42-NEXT:    pextrb $11, %xmm2, %r10d
-; SSE42-NEXT:    pextrb $10, %xmm2, %r11d
-; SSE42-NEXT:    pextrb $9, %xmm2, %r14d
-; SSE42-NEXT:    pextrb $8, %xmm2, %r15d
-; SSE42-NEXT:    pextrb $7, %xmm2, %r12d
-; SSE42-NEXT:    pextrb $6, %xmm2, %r13d
-; SSE42-NEXT:    pextrb $5, %xmm2, %ebx
-; SSE42-NEXT:    pextrb $4, %xmm2, %ebp
-; SSE42-NEXT:    pextrb $3, %xmm2, %eax
-; SSE42-NEXT:    pextrb $2, %xmm2, %ecx
-; SSE42-NEXT:    pextrb $1, %xmm2, %edx
-; SSE42-NEXT:    pextrb $0, %xmm2, %esi
+; SSE42-NEXT:    andb $1, %cl
+; SSE42-NEXT:    movb %cl, 2(%rdi)
+; SSE42-NEXT:    andb $1, %dl
+; SSE42-NEXT:    movb %dl, 2(%rdi)
+; SSE42-NEXT:    pextrb $14, %xmm2, %edx
+; SSE42-NEXT:    pextrb $12, %xmm2, %esi
+; SSE42-NEXT:    pextrb $10, %xmm2, %r9d
+; SSE42-NEXT:    pextrb $8, %xmm2, %r11d
+; SSE42-NEXT:    pextrb $6, %xmm2, %r15d
+; SSE42-NEXT:    pextrb $4, %xmm2, %r13d
+; SSE42-NEXT:    pextrb $2, %xmm2, %ebp
+; SSE42-NEXT:    pextrb $0, %xmm2, %ecx
+; SSE42-NEXT:    psllw $15, %xmm0
+; SSE42-NEXT:    psraw $15, %xmm0
 ; SSE42-NEXT:    andb $1, %r8b
 ; SSE42-NEXT:    movb %r8b, 2(%rdi)
-; SSE42-NEXT:    andb $1, %r9b
-; SSE42-NEXT:    movb %r9b, 2(%rdi)
 ; SSE42-NEXT:    andb $1, %r10b
 ; SSE42-NEXT:    movb %r10b, 2(%rdi)
-; SSE42-NEXT:    andb $1, %r11b
-; SSE42-NEXT:    movb %r11b, 2(%rdi)
 ; SSE42-NEXT:    andb $1, %r14b
 ; SSE42-NEXT:    movb %r14b, 2(%rdi)
-; SSE42-NEXT:    andb $1, %r15b
-; SSE42-NEXT:    movb %r15b, 2(%rdi)
 ; SSE42-NEXT:    andb $1, %r12b
 ; SSE42-NEXT:    movb %r12b, 2(%rdi)
-; SSE42-NEXT:    andb $1, %r13b
-; SSE42-NEXT:    movb %r13b, 2(%rdi)
 ; SSE42-NEXT:    andb $1, %bl
 ; SSE42-NEXT:    movb %bl, 2(%rdi)
-; SSE42-NEXT:    andb $1, %bpl
-; SSE42-NEXT:    movb %bpl, 2(%rdi)
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    andb $1, %cl
-; SSE42-NEXT:    movb %cl, 2(%rdi)
 ; SSE42-NEXT:    andb $1, %dl
 ; SSE42-NEXT:    movb %dl, 2(%rdi)
 ; SSE42-NEXT:    andb $1, %sil
 ; SSE42-NEXT:    movb %sil, 2(%rdi)
-; SSE42-NEXT:    pextrb $15, %xmm1, %eax
-; SSE42-NEXT:    andb $1, %al
-; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $14, %xmm1, %eax
-; SSE42-NEXT:    andb $1, %al
-; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $13, %xmm1, %r8d
-; SSE42-NEXT:    pextrb $12, %xmm1, %r9d
-; SSE42-NEXT:    pextrb $11, %xmm1, %r10d
-; SSE42-NEXT:    pextrb $10, %xmm1, %r11d
-; SSE42-NEXT:    pextrb $9, %xmm1, %r14d
-; SSE42-NEXT:    pextrb $8, %xmm1, %r15d
-; SSE42-NEXT:    pextrb $7, %xmm1, %r12d
-; SSE42-NEXT:    pextrb $6, %xmm1, %r13d
-; SSE42-NEXT:    pextrb $5, %xmm1, %ebx
-; SSE42-NEXT:    pextrb $4, %xmm1, %ebp
-; SSE42-NEXT:    pextrb $3, %xmm1, %eax
-; SSE42-NEXT:    pextrb $2, %xmm1, %ecx
-; SSE42-NEXT:    pextrb $1, %xmm1, %edx
-; SSE42-NEXT:    pextrb $0, %xmm1, %esi
+; SSE42-NEXT:    pextrb $14, %xmm0, %esi
+; SSE42-NEXT:    pextrb $12, %xmm0, %edx
+; SSE42-NEXT:    pextrb $10, %xmm0, %r8d
+; SSE42-NEXT:    pextrb $8, %xmm0, %r10d
+; SSE42-NEXT:    pextrb $6, %xmm0, %r14d
+; SSE42-NEXT:    pextrb $4, %xmm0, %r12d
+; SSE42-NEXT:    pextrb $2, %xmm0, %ebx
+; SSE42-NEXT:    pextrb $0, %xmm0, %eax
+; SSE42-NEXT:    psllw $15, %xmm1
+; SSE42-NEXT:    psraw $15, %xmm1
+; SSE42-NEXT:    andb $1, %r9b
+; SSE42-NEXT:    movb %r9b, 2(%rdi)
+; SSE42-NEXT:    andb $1, %r11b
+; SSE42-NEXT:    movb %r11b, 2(%rdi)
+; SSE42-NEXT:    andb $1, %r15b
+; SSE42-NEXT:    movb %r15b, 2(%rdi)
+; SSE42-NEXT:    andb $1, %r13b
+; SSE42-NEXT:    movb %r13b, 2(%rdi)
+; SSE42-NEXT:    andb $1, %bpl
+; SSE42-NEXT:    movb %bpl, 2(%rdi)
+; SSE42-NEXT:    andb $1, %cl
+; SSE42-NEXT:    movb %cl, 2(%rdi)
+; SSE42-NEXT:    andb $1, %sil
+; SSE42-NEXT:    movb %sil, (%rdi)
+; SSE42-NEXT:    andb $1, %dl
+; SSE42-NEXT:    movb %dl, (%rdi)
+; SSE42-NEXT:    pextrb $14, %xmm1, %r9d
+; SSE42-NEXT:    pextrb $12, %xmm1, %r11d
+; SSE42-NEXT:    pextrb $10, %xmm1, %r15d
+; SSE42-NEXT:    pextrb $8, %xmm1, %r13d
+; SSE42-NEXT:    pextrb $6, %xmm1, %ecx
+; SSE42-NEXT:    pextrb $4, %xmm1, %edx
+; SSE42-NEXT:    pextrb $2, %xmm1, %esi
+; SSE42-NEXT:    pextrb $0, %xmm1, %ebp
 ; SSE42-NEXT:    andb $1, %r8b
 ; SSE42-NEXT:    movb %r8b, (%rdi)
-; SSE42-NEXT:    andb $1, %r9b
-; SSE42-NEXT:    movb %r9b, (%rdi)
 ; SSE42-NEXT:    andb $1, %r10b
 ; SSE42-NEXT:    movb %r10b, (%rdi)
-; SSE42-NEXT:    andb $1, %r11b
-; SSE42-NEXT:    movb %r11b, (%rdi)
 ; SSE42-NEXT:    andb $1, %r14b
 ; SSE42-NEXT:    movb %r14b, (%rdi)
-; SSE42-NEXT:    andb $1, %r15b
-; SSE42-NEXT:    movb %r15b, (%rdi)
 ; SSE42-NEXT:    andb $1, %r12b
 ; SSE42-NEXT:    movb %r12b, (%rdi)
-; SSE42-NEXT:    andb $1, %r13b
-; SSE42-NEXT:    movb %r13b, (%rdi)
 ; SSE42-NEXT:    andb $1, %bl
 ; SSE42-NEXT:    movb %bl, (%rdi)
-; SSE42-NEXT:    andb $1, %bpl
-; SSE42-NEXT:    movb %bpl, (%rdi)
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
+; SSE42-NEXT:    andb $1, %r9b
+; SSE42-NEXT:    movb %r9b, (%rdi)
+; SSE42-NEXT:    andb $1, %r11b
+; SSE42-NEXT:    movb %r11b, (%rdi)
+; SSE42-NEXT:    andb $1, %r15b
+; SSE42-NEXT:    movb %r15b, (%rdi)
+; SSE42-NEXT:    andb $1, %r13b
+; SSE42-NEXT:    movb %r13b, (%rdi)
 ; SSE42-NEXT:    andb $1, %cl
 ; SSE42-NEXT:    movb %cl, (%rdi)
 ; SSE42-NEXT:    andb $1, %dl
 ; SSE42-NEXT:    movb %dl, (%rdi)
 ; SSE42-NEXT:    andb $1, %sil
 ; SSE42-NEXT:    movb %sil, (%rdi)
+; SSE42-NEXT:    andb $1, %bpl
+; SSE42-NEXT:    movb %bpl, (%rdi)
 ; SSE42-NEXT:    movq %rdi, %rax
 ; SSE42-NEXT:    popq %rbx
 ; SSE42-NEXT:    popq %r12
@@ -9907,98 +9896,98 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
-; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
-; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
-; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, (%rdi)
+; SSE2-NEXT:    movb %al, 2(%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, 2(%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
 ; SSE2-NEXT:    andb $1, %al
-; SSE2-NEXT:    movb %al, 2(%rdi)
+; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %al
+; SSE2-NEXT:    movb -{{[0-9]+}}(%rsp), %cl
+; SSE2-NEXT:    andb $1, %cl
+; SSE2-NEXT:    movb %cl, (%rdi)
 ; SSE2-NEXT:    andb $1, %al
 ; SSE2-NEXT:    movb %al, (%rdi)
 ; SSE2-NEXT:    movq %rdi, %rax
@@ -10006,173 +9995,166 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
 ;
 ; SSE42-LABEL: test_cmp_v32i64:
 ; SSE42:       # BB#0:
-; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
+; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
 ; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm11
 ; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm10
 ; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm12
-; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm9
+; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm8
 ; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm14
 ; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm13
 ; SSE42-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm15
-; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm7
-; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm6
-; SSE42-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
-; SSE42-NEXT:    pslld $31, %xmm6
-; SSE42-NEXT:    psrad $31, %xmm6
-; SSE42-NEXT:    movdqa {{.*#+}} xmm7 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; SSE42-NEXT:    pshufb %xmm7, %xmm6
-; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm5
-; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm4
-; SSE42-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
-; SSE42-NEXT:    pslld $31, %xmm4
-; SSE42-NEXT:    psrad $31, %xmm4
-; SSE42-NEXT:    pshufb %xmm7, %xmm4
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
-; SSE42-NEXT:    psllw $15, %xmm4
-; SSE42-NEXT:    psraw $15, %xmm4
-; SSE42-NEXT:    movdqa {{.*#+}} xmm5 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
-; SSE42-NEXT:    pshufb %xmm5, %xmm4
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm3
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm2
 ; SSE42-NEXT:    shufps {{.*#+}} xmm2 = xmm2[0,2],xmm3[0,2]
 ; SSE42-NEXT:    pslld $31, %xmm2
 ; SSE42-NEXT:    psrad $31, %xmm2
-; SSE42-NEXT:    pshufb %xmm7, %xmm2
+; SSE42-NEXT:    movdqa {{.*#+}} xmm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; SSE42-NEXT:    pshufb %xmm3, %xmm2
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm1
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm0
 ; SSE42-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSE42-NEXT:    pslld $31, %xmm0
 ; SSE42-NEXT:    psrad $31, %xmm0
-; SSE42-NEXT:    pshufb %xmm7, %xmm0
+; SSE42-NEXT:    pshufb %xmm3, %xmm0
 ; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSE42-NEXT:    psllw $15, %xmm0
 ; SSE42-NEXT:    psraw $15, %xmm0
-; SSE42-NEXT:    pshufb %xmm5, %xmm0
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
+; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm7
+; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm6
+; SSE42-NEXT:    shufps {{.*#+}} xmm6 = xmm6[0,2],xmm7[0,2]
+; SSE42-NEXT:    pslld $31, %xmm6
+; SSE42-NEXT:    psrad $31, %xmm6
+; SSE42-NEXT:    pshufb %xmm3, %xmm6
+; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm5
+; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm4
+; SSE42-NEXT:    shufps {{.*#+}} xmm4 = xmm4[0,2],xmm5[0,2]
+; SSE42-NEXT:    pslld $31, %xmm4
+; SSE42-NEXT:    psrad $31, %xmm4
+; SSE42-NEXT:    pshufb %xmm3, %xmm4
+; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm6[0]
+; SSE42-NEXT:    psllw $15, %xmm4
+; SSE42-NEXT:    psraw $15, %xmm4
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm15
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm13
 ; SSE42-NEXT:    shufps {{.*#+}} xmm13 = xmm13[0,2],xmm15[0,2]
 ; SSE42-NEXT:    pslld $31, %xmm13
 ; SSE42-NEXT:    psrad $31, %xmm13
-; SSE42-NEXT:    pshufb %xmm7, %xmm13
+; SSE42-NEXT:    pshufb %xmm3, %xmm13
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm14
-; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm9
-; SSE42-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,2],xmm14[0,2]
-; SSE42-NEXT:    pslld $31, %xmm9
-; SSE42-NEXT:    psrad $31, %xmm9
-; SSE42-NEXT:    pshufb %xmm7, %xmm9
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm13[0]
-; SSE42-NEXT:    psllw $15, %xmm9
-; SSE42-NEXT:    psraw $15, %xmm9
-; SSE42-NEXT:    pshufb %xmm5, %xmm9
+; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm8
+; SSE42-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,2],xmm14[0,2]
+; SSE42-NEXT:    pslld $31, %xmm8
+; SSE42-NEXT:    psrad $31, %xmm8
+; SSE42-NEXT:    pshufb %xmm3, %xmm8
+; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm13[0]
+; SSE42-NEXT:    psllw $15, %xmm8
+; SSE42-NEXT:    psraw $15, %xmm8
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm12
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm10
 ; SSE42-NEXT:    shufps {{.*#+}} xmm10 = xmm10[0,2],xmm12[0,2]
 ; SSE42-NEXT:    pslld $31, %xmm10
 ; SSE42-NEXT:    psrad $31, %xmm10
-; SSE42-NEXT:    pshufb %xmm7, %xmm10
+; SSE42-NEXT:    pshufb %xmm3, %xmm10
 ; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm11
-; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm8
-; SSE42-NEXT:    shufps {{.*#+}} xmm8 = xmm8[0,2],xmm11[0,2]
-; SSE42-NEXT:    pslld $31, %xmm8
-; SSE42-NEXT:    psrad $31, %xmm8
-; SSE42-NEXT:    pshufb %xmm7, %xmm8
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm10[0]
-; SSE42-NEXT:    psllw $15, %xmm8
-; SSE42-NEXT:    psraw $15, %xmm8
-; SSE42-NEXT:    pshufb %xmm5, %xmm8
-; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm8 = xmm8[0],xmm9[0]
-; SSE42-NEXT:    pextrb $15, %xmm8, %eax
+; SSE42-NEXT:    pcmpgtq {{[0-9]+}}(%rsp), %xmm9
+; SSE42-NEXT:    shufps {{.*#+}} xmm9 = xmm9[0,2],xmm11[0,2]
+; SSE42-NEXT:    pslld $31, %xmm9
+; SSE42-NEXT:    psrad $31, %xmm9
+; SSE42-NEXT:    pshufb %xmm3, %xmm9
+; SSE42-NEXT:    punpcklqdq {{.*#+}} xmm9 = xmm9[0],xmm10[0]
+; SSE42-NEXT:    psllw $15, %xmm9
+; SSE42-NEXT:    psraw $15, %xmm9
+; SSE42-NEXT:    pextrb $14, %xmm9, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $14, %xmm8, %eax
+; SSE42-NEXT:    pextrb $12, %xmm9, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $13, %xmm8, %eax
+; SSE42-NEXT:    pextrb $10, %xmm9, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $12, %xmm8, %eax
+; SSE42-NEXT:    pextrb $8, %xmm9, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $11, %xmm8, %eax
+; SSE42-NEXT:    pextrb $6, %xmm9, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $10, %xmm8, %eax
+; SSE42-NEXT:    pextrb $4, %xmm9, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $9, %xmm8, %eax
+; SSE42-NEXT:    pextrb $2, %xmm9, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $8, %xmm8, %eax
+; SSE42-NEXT:    pextrb $0, %xmm9, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $7, %xmm8, %eax
+; SSE42-NEXT:    pextrb $14, %xmm8, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $6, %xmm8, %eax
+; SSE42-NEXT:    pextrb $12, %xmm8, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $5, %xmm8, %eax
+; SSE42-NEXT:    pextrb $10, %xmm8, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $4, %xmm8, %eax
+; SSE42-NEXT:    pextrb $8, %xmm8, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $3, %xmm8, %eax
+; SSE42-NEXT:    pextrb $6, %xmm8, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm8, %eax
+; SSE42-NEXT:    pextrb $4, %xmm8, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm8, %eax
+; SSE42-NEXT:    pextrb $2, %xmm8, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
 ; SSE42-NEXT:    pextrb $0, %xmm8, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, 2(%rdi)
-; SSE42-NEXT:    pextrb $15, %xmm0, %eax
+; SSE42-NEXT:    pextrb $14, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $14, %xmm0, %eax
+; SSE42-NEXT:    pextrb $12, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $13, %xmm0, %eax
+; SSE42-NEXT:    pextrb $10, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $12, %xmm0, %eax
+; SSE42-NEXT:    pextrb $8, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $11, %xmm0, %eax
+; SSE42-NEXT:    pextrb $6, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $10, %xmm0, %eax
+; SSE42-NEXT:    pextrb $4, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $9, %xmm0, %eax
+; SSE42-NEXT:    pextrb $2, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $8, %xmm0, %eax
+; SSE42-NEXT:    pextrb $0, %xmm4, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $7, %xmm0, %eax
+; SSE42-NEXT:    pextrb $14, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $6, %xmm0, %eax
+; SSE42-NEXT:    pextrb $12, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $5, %xmm0, %eax
+; SSE42-NEXT:    pextrb $10, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $4, %xmm0, %eax
+; SSE42-NEXT:    pextrb $8, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $3, %xmm0, %eax
+; SSE42-NEXT:    pextrb $6, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $2, %xmm0, %eax
+; SSE42-NEXT:    pextrb $4, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
-; SSE42-NEXT:    pextrb $1, %xmm0, %eax
+; SSE42-NEXT:    pextrb $2, %xmm0, %eax
 ; SSE42-NEXT:    andb $1, %al
 ; SSE42-NEXT:    movb %al, (%rdi)
 ; SSE42-NEXT:    pextrb $0, %xmm0, %eax
@@ -10187,24 +10169,24 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
 ; AVX1-NEXT:    movq %rsp, %rbp
 ; AVX1-NEXT:    andq $-32, %rsp
 ; AVX1-NEXT:    subq $32, %rsp
-; AVX1-NEXT:    vmovaps 240(%rbp), %ymm8
+; AVX1-NEXT:    vmovdqa 240(%rbp), %ymm8
 ; AVX1-NEXT:    vextractf128 $1, %ymm8, %xmm9
 ; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm10
 ; AVX1-NEXT:    vpcmpgtq %xmm9, %xmm10, %xmm9
-; AVX1-NEXT:    vmovaps 208(%rbp), %ymm10
+; AVX1-NEXT:    vmovdqa 208(%rbp), %ymm10
 ; AVX1-NEXT:    vpcmpgtq %xmm8, %xmm7, %xmm7
 ; AVX1-NEXT:    vpacksswb %xmm9, %xmm7, %xmm8
 ; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm9
 ; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm7
 ; AVX1-NEXT:    vpcmpgtq %xmm9, %xmm7, %xmm7
 ; AVX1-NEXT:    vpcmpgtq %xmm10, %xmm6, %xmm6
-; AVX1-NEXT:    vmovaps 176(%rbp), %ymm9
+; AVX1-NEXT:    vmovdqa 176(%rbp), %ymm9
 ; AVX1-NEXT:    vpacksswb %xmm7, %xmm6, %xmm6
 ; AVX1-NEXT:    vpacksswb %xmm8, %xmm6, %xmm8
 ; AVX1-NEXT:    vextractf128 $1, %ymm9, %xmm7
 ; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm6
 ; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm6, %xmm6
-; AVX1-NEXT:    vmovaps 144(%rbp), %ymm10
+; AVX1-NEXT:    vmovdqa 144(%rbp), %ymm10
 ; AVX1-NEXT:    vpcmpgtq %xmm9, %xmm5, %xmm5
 ; AVX1-NEXT:    vpacksswb %xmm6, %xmm5, %xmm5
 ; AVX1-NEXT:    vextractf128 $1, %ymm10, %xmm6
@@ -10212,26 +10194,26 @@ define <32 x i1> @test_cmp_v32i64(<32 x i64> %a0, <32 x i64> %a1) nounwind {
 ; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm7, %xmm6
 ; AVX1-NEXT:    vpcmpgtq %xmm10, %xmm4, %xmm4
 ; AVX1-NEXT:    vpacksswb %xmm6, %xmm4, %xmm4
-; AVX1-NEXT:    vmovaps 112(%rbp), %ymm6
+; AVX1-NEXT:    vmovdqa 112(%rbp), %ymm6
 ; AVX1-NEXT:    vpacksswb %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vpacksswb %xmm8, %xmm4, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm5
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm7
 ; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm7, %xmm5
-; AVX1-NEXT:    vmovaps 80(%rbp), %ymm7
+; AVX1-NEXT:    vmovdqa 80(%rbp), %ymm7
 ; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm3, %xmm3
 ; AVX1-NEXT:    vpacksswb %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm7, %xmm5
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
 ; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm6, %xmm5
 ; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm2, %xmm2
-; AVX1-NEXT:    vmovaps 48(%rbp), %ymm6
+; AVX1-NEXT:    vmovdqa 48(%rbp), %ymm6
 ; AVX1-NEXT:    vpacksswb %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpacksswb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm6, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm5
 ; AVX1-NEXT:    vpcmpgtq %xmm3, %xmm5, %xmm3
-; AVX1-NEXT:    vmovaps 16(%rbp), %ymm5
+; AVX1-NEXT:    vmovdqa 16(%rbp), %ymm5
 ; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm1, %xmm1
 ; AVX1-NEXT:    vpacksswb %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm5, %xmm3
diff --git a/test/CodeGen/X86/vector-extend-inreg.ll b/test/CodeGen/X86/vector-extend-inreg.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a8db0d4cd9d88585a07661b8d50b2e92e0344dc3
--- /dev/null
+++ b/test/CodeGen/X86/vector-extend-inreg.ll
@@ -0,0 +1,120 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2   | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X64-SSE
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2   | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X64-AVX
+
+define i64 @extract_any_extend_vector_inreg_v16i64(<16 x i64> %a0, i32 %a1) nounwind {
+; X32-SSE-LABEL: extract_any_extend_vector_inreg_v16i64:
+; X32-SSE:       # BB#0:
+; X32-SSE-NEXT:    pushl %ebp
+; X32-SSE-NEXT:    movl %esp, %ebp
+; X32-SSE-NEXT:    andl $-128, %esp
+; X32-SSE-NEXT:    subl $384, %esp # imm = 0x180
+; X32-SSE-NEXT:    movl 88(%ebp), %ecx
+; X32-SSE-NEXT:    movdqa 72(%ebp), %xmm0
+; X32-SSE-NEXT:    xorps %xmm1, %xmm1
+; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; X32-SSE-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    movaps %xmm1, (%esp)
+; X32-SSE-NEXT:    movdqa %xmm0, {{[0-9]+}}(%esp)
+; X32-SSE-NEXT:    leal (%ecx,%ecx), %eax
+; X32-SSE-NEXT:    andl $31, %eax
+; X32-SSE-NEXT:    movl 128(%esp,%eax,4), %eax
+; X32-SSE-NEXT:    leal 1(%ecx,%ecx), %ecx
+; X32-SSE-NEXT:    andl $31, %ecx
+; X32-SSE-NEXT:    movl (%esp,%ecx,4), %edx
+; X32-SSE-NEXT:    movl %ebp, %esp
+; X32-SSE-NEXT:    popl %ebp
+; X32-SSE-NEXT:    retl
+;
+; X64-SSE-LABEL: extract_any_extend_vector_inreg_v16i64:
+; X64-SSE:       # BB#0:
+; X64-SSE-NEXT:    pushq %rbp
+; X64-SSE-NEXT:    movq %rsp, %rbp
+; X64-SSE-NEXT:    andq $-128, %rsp
+; X64-SSE-NEXT:    subq $256, %rsp # imm = 0x100
+; X64-SSE-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-SSE-NEXT:    psrldq {{.*#+}} xmm7 = xmm7[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero
+; X64-SSE-NEXT:    xorps %xmm0, %xmm0
+; X64-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movaps %xmm0, {{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    movaps %xmm0, (%rsp)
+; X64-SSE-NEXT:    movdqa %xmm7, {{[0-9]+}}(%rsp)
+; X64-SSE-NEXT:    andl $15, %edi
+; X64-SSE-NEXT:    movq (%rsp,%rdi,8), %rax
+; X64-SSE-NEXT:    movq %rbp, %rsp
+; X64-SSE-NEXT:    popq %rbp
+; X64-SSE-NEXT:    retq
+;
+; X32-AVX-LABEL: extract_any_extend_vector_inreg_v16i64:
+; X32-AVX:       # BB#0:
+; X32-AVX-NEXT:    pushl %ebp
+; X32-AVX-NEXT:    movl %esp, %ebp
+; X32-AVX-NEXT:    andl $-128, %esp
+; X32-AVX-NEXT:    subl $384, %esp # imm = 0x180
+; X32-AVX-NEXT:    movl 40(%ebp), %ecx
+; X32-AVX-NEXT:    vbroadcastsd 32(%ebp), %ymm0
+; X32-AVX-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; X32-AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; X32-AVX-NEXT:    vmovapd %ymm1, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    vmovapd %ymm1, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    vmovapd %ymm1, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    vmovapd %ymm0, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    vmovapd %ymm1, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    vmovapd %ymm1, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    vmovapd %ymm1, (%esp)
+; X32-AVX-NEXT:    vmovapd %ymm0, {{[0-9]+}}(%esp)
+; X32-AVX-NEXT:    leal (%ecx,%ecx), %eax
+; X32-AVX-NEXT:    andl $31, %eax
+; X32-AVX-NEXT:    movl 128(%esp,%eax,4), %eax
+; X32-AVX-NEXT:    leal 1(%ecx,%ecx), %ecx
+; X32-AVX-NEXT:    andl $31, %ecx
+; X32-AVX-NEXT:    movl (%esp,%ecx,4), %edx
+; X32-AVX-NEXT:    movl %ebp, %esp
+; X32-AVX-NEXT:    popl %ebp
+; X32-AVX-NEXT:    vzeroupper
+; X32-AVX-NEXT:    retl
+;
+; X64-AVX-LABEL: extract_any_extend_vector_inreg_v16i64:
+; X64-AVX:       # BB#0:
+; X64-AVX-NEXT:    pushq %rbp
+; X64-AVX-NEXT:    movq %rsp, %rbp
+; X64-AVX-NEXT:    andq $-128, %rsp
+; X64-AVX-NEXT:    subq $256, %rsp # imm = 0x100
+; X64-AVX-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; X64-AVX-NEXT:    vpermq {{.*#+}} ymm0 = ymm3[3,1,2,3]
+; X64-AVX-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; X64-AVX-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; X64-AVX-NEXT:    vmovapd %ymm1, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovapd %ymm1, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    vmovapd %ymm1, (%rsp)
+; X64-AVX-NEXT:    vmovapd %ymm0, {{[0-9]+}}(%rsp)
+; X64-AVX-NEXT:    andl $15, %edi
+; X64-AVX-NEXT:    movq (%rsp,%rdi,8), %rax
+; X64-AVX-NEXT:    movq %rbp, %rsp
+; X64-AVX-NEXT:    popq %rbp
+; X64-AVX-NEXT:    vzeroupper
+; X64-AVX-NEXT:    retq
+  %1 = extractelement <16 x i64> %a0, i32 15
+  %2 = insertelement <16 x i64> zeroinitializer, i64 %1, i32 4
+  %3 = extractelement <16 x i64> %2, i32 %a1
+  ret i64 %3
+}
diff --git a/test/CodeGen/X86/vector-half-conversions.ll b/test/CodeGen/X86/vector-half-conversions.ll
index 5bf6fbeb6235c82b52328cb6a85270124839537e..a2a7363d789477dac54e95588345377955e7f3b6 100644
--- a/test/CodeGen/X86/vector-half-conversions.ll
+++ b/test/CodeGen/X86/vector-half-conversions.ll
@@ -1,8 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,-f16c | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,-f16c -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl -verify-machineinstrs | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512VL
 
 ;
 ; Half to Float
@@ -29,6 +29,7 @@ define float @cvt_i16_to_f32(i16 %a0) nounwind {
 ; AVX512F-NEXT:    vmovd %eax, %xmm0
 ; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: cvt_i16_to_f32:
@@ -122,6 +123,7 @@ define <4 x float> @cvt_4i16_to_4f32(<4 x i16> %a0) nounwind {
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: cvt_4i16_to_4f32:
@@ -232,6 +234,7 @@ define <4 x float> @cvt_8i16_to_4f32(<8 x i16> %a0) nounwind {
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: cvt_8i16_to_4f32:
@@ -880,6 +883,7 @@ define float @load_cvt_i16_to_f32(i16* %a0) nounwind {
 ; AVX512F-NEXT:    vmovd %eax, %xmm0
 ; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: load_cvt_i16_to_f32:
@@ -950,6 +954,7 @@ define <4 x float> @load_cvt_4i16_to_4f32(<4 x i16>* %a0) nounwind {
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3]
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: load_cvt_4i16_to_4f32:
@@ -1053,6 +1058,7 @@ define <4 x float> @load_cvt_8i16_to_4f32(<8 x i16>* %a0) nounwind {
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3]
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
 ; AVX512F-NEXT:    vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: load_cvt_8i16_to_4f32:
@@ -1534,6 +1540,7 @@ define double @cvt_i16_to_f64(i16 %a0) nounwind {
 ; AVX512F-NEXT:    vmovd %eax, %xmm0
 ; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
 ; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: cvt_i16_to_f64:
@@ -1598,6 +1605,7 @@ define <2 x double> @cvt_2i16_to_2f64(<2 x i16> %a0) nounwind {
 ; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
 ; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: cvt_2i16_to_2f64:
@@ -1789,6 +1797,7 @@ define <2 x double> @cvt_8i16_to_2f64(<8 x i16> %a0) nounwind {
 ; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
 ; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: cvt_8i16_to_2f64:
@@ -1941,25 +1950,25 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
 ; AVX1-LABEL: cvt_8i16_to_8f64:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vmovq %xmm0, %rdx
-; AVX1-NEXT:    movq %rdx, %r9
+; AVX1-NEXT:    movq %rdx, %r8
 ; AVX1-NEXT:    movl %edx, %r10d
-; AVX1-NEXT:    movswl %dx, %r8d
+; AVX1-NEXT:    movswl %dx, %r9d
 ; AVX1-NEXT:    shrq $48, %rdx
-; AVX1-NEXT:    shrq $32, %r9
+; AVX1-NEXT:    shrq $32, %r8
 ; AVX1-NEXT:    shrl $16, %r10d
 ; AVX1-NEXT:    vpextrq $1, %xmm0, %rdi
-; AVX1-NEXT:    movq %rdi, %rsi
-; AVX1-NEXT:    movl %edi, %eax
+; AVX1-NEXT:    movq %rdi, %rax
+; AVX1-NEXT:    movl %edi, %esi
 ; AVX1-NEXT:    movswl %di, %ecx
 ; AVX1-NEXT:    shrq $48, %rdi
-; AVX1-NEXT:    shrq $32, %rsi
-; AVX1-NEXT:    shrl $16, %eax
-; AVX1-NEXT:    cwtl
-; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    shrq $32, %rax
+; AVX1-NEXT:    shrl $16, %esi
+; AVX1-NEXT:    movswl %si, %esi
+; AVX1-NEXT:    vmovd %esi, %xmm0
 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm1
 ; AVX1-NEXT:    vmovd %ecx, %xmm0
 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm2
-; AVX1-NEXT:    movswl %si, %eax
+; AVX1-NEXT:    cwtl
 ; AVX1-NEXT:    vmovd %eax, %xmm0
 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm3
 ; AVX1-NEXT:    movswl %di, %eax
@@ -1968,9 +1977,9 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
 ; AVX1-NEXT:    movswl %r10w, %eax
 ; AVX1-NEXT:    vmovd %eax, %xmm0
 ; AVX1-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX1-NEXT:    vmovd %r8d, %xmm5
+; AVX1-NEXT:    vmovd %r9d, %xmm5
 ; AVX1-NEXT:    vcvtph2ps %xmm5, %xmm5
-; AVX1-NEXT:    movswl %r9w, %eax
+; AVX1-NEXT:    movswl %r8w, %eax
 ; AVX1-NEXT:    vmovd %eax, %xmm6
 ; AVX1-NEXT:    vcvtph2ps %xmm6, %xmm6
 ; AVX1-NEXT:    movswl %dx, %eax
@@ -1995,25 +2004,25 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
 ; AVX2-LABEL: cvt_8i16_to_8f64:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vmovq %xmm0, %rdx
-; AVX2-NEXT:    movq %rdx, %r9
+; AVX2-NEXT:    movq %rdx, %r8
 ; AVX2-NEXT:    movl %edx, %r10d
-; AVX2-NEXT:    movswl %dx, %r8d
+; AVX2-NEXT:    movswl %dx, %r9d
 ; AVX2-NEXT:    shrq $48, %rdx
-; AVX2-NEXT:    shrq $32, %r9
+; AVX2-NEXT:    shrq $32, %r8
 ; AVX2-NEXT:    shrl $16, %r10d
 ; AVX2-NEXT:    vpextrq $1, %xmm0, %rdi
-; AVX2-NEXT:    movq %rdi, %rsi
-; AVX2-NEXT:    movl %edi, %eax
+; AVX2-NEXT:    movq %rdi, %rax
+; AVX2-NEXT:    movl %edi, %esi
 ; AVX2-NEXT:    movswl %di, %ecx
 ; AVX2-NEXT:    shrq $48, %rdi
-; AVX2-NEXT:    shrq $32, %rsi
-; AVX2-NEXT:    shrl $16, %eax
-; AVX2-NEXT:    cwtl
-; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    shrq $32, %rax
+; AVX2-NEXT:    shrl $16, %esi
+; AVX2-NEXT:    movswl %si, %esi
+; AVX2-NEXT:    vmovd %esi, %xmm0
 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm1
 ; AVX2-NEXT:    vmovd %ecx, %xmm0
 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm2
-; AVX2-NEXT:    movswl %si, %eax
+; AVX2-NEXT:    cwtl
 ; AVX2-NEXT:    vmovd %eax, %xmm0
 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm3
 ; AVX2-NEXT:    movswl %di, %eax
@@ -2022,9 +2031,9 @@ define <8 x double> @cvt_8i16_to_8f64(<8 x i16> %a0) nounwind {
 ; AVX2-NEXT:    movswl %r10w, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm0
 ; AVX2-NEXT:    vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT:    vmovd %r8d, %xmm5
+; AVX2-NEXT:    vmovd %r9d, %xmm5
 ; AVX2-NEXT:    vcvtph2ps %xmm5, %xmm5
-; AVX2-NEXT:    movswl %r9w, %eax
+; AVX2-NEXT:    movswl %r8w, %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm6
 ; AVX2-NEXT:    vcvtph2ps %xmm6, %xmm6
 ; AVX2-NEXT:    movswl %dx, %eax
@@ -2187,6 +2196,7 @@ define double @load_cvt_i16_to_f64(i16* %a0) nounwind {
 ; AVX512F-NEXT:    vmovd %eax, %xmm0
 ; AVX512F-NEXT:    vcvtph2ps %ymm0, %zmm0
 ; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: load_cvt_i16_to_f64:
@@ -2240,6 +2250,7 @@ define <2 x double> @load_cvt_2i16_to_2f64(<2 x i16>* %a0) nounwind {
 ; AVX512F-NEXT:    vcvtss2sd %xmm1, %xmm1, %xmm1
 ; AVX512F-NEXT:    vcvtss2sd %xmm0, %xmm0, %xmm0
 ; AVX512F-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: load_cvt_2i16_to_2f64:
@@ -2684,6 +2695,7 @@ define i16 @cvt_f32_to_i16(float %a0) nounwind {
 ; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
 ; AVX512F-NEXT:    vmovd %xmm0, %eax
 ; AVX512F-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: cvt_f32_to_i16:
@@ -2769,6 +2781,7 @@ define <4 x i16> @cvt_4f32_to_4i16(<4 x float> %a0) nounwind {
 ; AVX512F-NEXT:    shlq $32, %rdx
 ; AVX512F-NEXT:    orq %rcx, %rdx
 ; AVX512F-NEXT:    vmovq %rdx, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: cvt_4f32_to_4i16:
@@ -2822,7 +2835,7 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
 ; AVX1-NEXT:    shlq $32, %rdx
 ; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    vmovq %rdx, %xmm0
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: cvt_4f32_to_8i16_undef:
@@ -2847,7 +2860,7 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
 ; AVX2-NEXT:    shlq $32, %rdx
 ; AVX2-NEXT:    orq %rcx, %rdx
 ; AVX2-NEXT:    vmovq %rdx, %xmm0
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: cvt_4f32_to_8i16_undef:
@@ -2873,7 +2886,8 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
 ; AVX512F-NEXT:    shlq $32, %rdx
 ; AVX512F-NEXT:    orq %rcx, %rdx
 ; AVX512F-NEXT:    vmovq %rdx, %xmm0
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: cvt_4f32_to_8i16_undef:
@@ -2899,7 +2913,6 @@ define <8 x i16> @cvt_4f32_to_8i16_undef(<4 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    orq %rcx, %rdx
 ; AVX512VL-NEXT:    vmovq %rdx, %xmm0
 ; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX512VL-NEXT:    retq
   %1 = fptrunc <4 x float> %a0 to <4 x half>
@@ -2931,7 +2944,7 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
 ; AVX1-NEXT:    shlq $32, %rdx
 ; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    vmovq %rdx, %xmm0
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: cvt_4f32_to_8i16_zero:
@@ -2956,7 +2969,7 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
 ; AVX2-NEXT:    shlq $32, %rdx
 ; AVX2-NEXT:    orq %rcx, %rdx
 ; AVX2-NEXT:    vmovq %rdx, %xmm0
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: cvt_4f32_to_8i16_zero:
@@ -2982,7 +2995,8 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
 ; AVX512F-NEXT:    shlq $32, %rdx
 ; AVX512F-NEXT:    orq %rcx, %rdx
 ; AVX512F-NEXT:    vmovq %rdx, %xmm0
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: cvt_4f32_to_8i16_zero:
@@ -3008,7 +3022,6 @@ define <8 x i16> @cvt_4f32_to_8i16_zero(<4 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    orq %rcx, %rdx
 ; AVX512VL-NEXT:    vmovq %rdx, %xmm0
 ; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
 ; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
@@ -3159,6 +3172,7 @@ define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
 ; AVX512F-NEXT:    vmovq %rsi, %xmm0
 ; AVX512F-NEXT:    vmovq %rax, %xmm1
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: cvt_8f32_to_8i16:
@@ -3205,6 +3219,7 @@ define <8 x i16> @cvt_8f32_to_8i16(<8 x float> %a0) nounwind {
 ; AVX512VL-NEXT:    vmovq %rsi, %xmm0
 ; AVX512VL-NEXT:    vmovq %rax, %xmm1
 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %1 = fptrunc <8 x float> %a0 to <8 x half>
   %2 = bitcast <8 x half> %1 to <8 x i16>
@@ -3511,6 +3526,7 @@ define void @store_cvt_f32_to_i16(float %a0, i16* %a1) nounwind {
 ; AVX512F-NEXT:    vcvtps2ph $4, %zmm0, %ymm0
 ; AVX512F-NEXT:    vmovd %xmm0, %eax
 ; AVX512F-NEXT:    movw %ax, (%rdi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: store_cvt_f32_to_i16:
@@ -3582,6 +3598,7 @@ define void @store_cvt_4f32_to_4i16(<4 x float> %a0, <4 x i16>* %a1) nounwind {
 ; AVX512F-NEXT:    movw %dx, 6(%rdi)
 ; AVX512F-NEXT:    movw %cx, 4(%rdi)
 ; AVX512F-NEXT:    movw %ax, 2(%rdi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: store_cvt_4f32_to_4i16:
@@ -3631,7 +3648,7 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw
 ; AVX1-NEXT:    shlq $32, %rdx
 ; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    vmovq %rdx, %xmm0
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
 ; AVX1-NEXT:    retq
 ;
@@ -3657,7 +3674,7 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw
 ; AVX2-NEXT:    shlq $32, %rdx
 ; AVX2-NEXT:    orq %rcx, %rdx
 ; AVX2-NEXT:    vmovq %rdx, %xmm0
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
 ; AVX2-NEXT:    retq
 ;
@@ -3684,8 +3701,9 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw
 ; AVX512F-NEXT:    shlq $32, %rdx
 ; AVX512F-NEXT:    orq %rcx, %rdx
 ; AVX512F-NEXT:    vmovq %rdx, %xmm0
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: store_cvt_4f32_to_8i16_undef:
@@ -3711,7 +3729,6 @@ define void @store_cvt_4f32_to_8i16_undef(<4 x float> %a0, <8 x i16>* %a1) nounw
 ; AVX512VL-NEXT:    orq %rcx, %rdx
 ; AVX512VL-NEXT:    vmovq %rdx, %xmm0
 ; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX512VL-NEXT:    vmovdqa %xmm0, (%rdi)
 ; AVX512VL-NEXT:    retq
@@ -3745,7 +3762,7 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi
 ; AVX1-NEXT:    shlq $32, %rdx
 ; AVX1-NEXT:    orq %rcx, %rdx
 ; AVX1-NEXT:    vmovq %rdx, %xmm0
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT:    vmovdqa %xmm0, (%rdi)
 ; AVX1-NEXT:    retq
 ;
@@ -3771,7 +3788,7 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi
 ; AVX2-NEXT:    shlq $32, %rdx
 ; AVX2-NEXT:    orq %rcx, %rdx
 ; AVX2-NEXT:    vmovq %rdx, %xmm0
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vmovdqa %xmm0, (%rdi)
 ; AVX2-NEXT:    retq
 ;
@@ -3798,8 +3815,9 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi
 ; AVX512F-NEXT:    shlq $32, %rdx
 ; AVX512F-NEXT:    orq %rcx, %rdx
 ; AVX512F-NEXT:    vmovq %rdx, %xmm0
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%rdi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: store_cvt_4f32_to_8i16_zero:
@@ -3825,7 +3843,6 @@ define void @store_cvt_4f32_to_8i16_zero(<4 x float> %a0, <8 x i16>* %a1) nounwi
 ; AVX512VL-NEXT:    orq %rcx, %rdx
 ; AVX512VL-NEXT:    vmovq %rdx, %xmm0
 ; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
 ; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
@@ -3945,6 +3962,7 @@ define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind {
 ; AVX512F-NEXT:    movw %r10w, 6(%rdi)
 ; AVX512F-NEXT:    movw %r9w, 4(%rdi)
 ; AVX512F-NEXT:    movw %r8w, 2(%rdi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: store_cvt_8f32_to_8i16:
@@ -3980,6 +3998,7 @@ define void @store_cvt_8f32_to_8i16(<8 x float> %a0, <8 x i16>* %a1) nounwind {
 ; AVX512VL-NEXT:    movw %r10w, 6(%rdi)
 ; AVX512VL-NEXT:    movw %r9w, 4(%rdi)
 ; AVX512VL-NEXT:    movw %r8w, 2(%rdi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %1 = fptrunc <8 x float> %a0 to <8 x half>
   %2 = bitcast <8 x half> %1 to <8 x i16>
@@ -4187,6 +4206,7 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin
 ; AVX512F-NEXT:    movw %ax, 4(%rdi)
 ; AVX512F-NEXT:    vmovd %xmm4, %eax
 ; AVX512F-NEXT:    movw %ax, 2(%rdi)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: store_cvt_16f32_to_16i16:
@@ -4254,6 +4274,7 @@ define void @store_cvt_16f32_to_16i16(<16 x float> %a0, <16 x i16>* %a1) nounwin
 ; AVX512VL-NEXT:    movw %ax, 4(%rdi)
 ; AVX512VL-NEXT:    vmovd %xmm4, %eax
 ; AVX512VL-NEXT:    movw %ax, 2(%rdi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
   %1 = fptrunc <16 x float> %a0 to <16 x half>
   %2 = bitcast <16 x half> %1 to <16 x i16>
@@ -4315,7 +4336,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
 ; AVX1-NEXT:    callq __truncdfhf2
 ; AVX1-NEXT:    movzwl %ax, %r14d
 ; AVX1-NEXT:    orl %ebx, %r14d
-; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -4379,11 +4400,13 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
 ; AVX512F-NEXT:    subq $40, %rsp
 ; AVX512F-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movw %ax, %bx
 ; AVX512F-NEXT:    shll $16, %ebx
 ; AVX512F-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movzwl %ax, %r14d
 ; AVX512F-NEXT:    orl %ebx, %r14d
@@ -4391,6 +4414,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
 ; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movw %ax, %bx
 ; AVX512F-NEXT:    shll $16, %ebx
@@ -4413,11 +4437,13 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    subq $40, %rsp
 ; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
 ; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movzwl %ax, %r14d
 ; AVX512VL-NEXT:    orl %ebx, %r14d
@@ -4425,6 +4451,7 @@ define <4 x i16> @cvt_4f64_to_4i16(<4 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512VL-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
@@ -4462,7 +4489,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
 ; AVX1-NEXT:    callq __truncdfhf2
 ; AVX1-NEXT:    movzwl %ax, %r14d
 ; AVX1-NEXT:    orl %ebx, %r14d
-; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -4477,7 +4504,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
 ; AVX1-NEXT:    shlq $32, %rax
 ; AVX1-NEXT:    orq %r14, %rax
 ; AVX1-NEXT:    vmovq %rax, %xmm0
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX1-NEXT:    addq $40, %rsp
 ; AVX1-NEXT:    popq %rbx
 ; AVX1-NEXT:    popq %r14
@@ -4515,7 +4542,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
 ; AVX2-NEXT:    shlq $32, %rax
 ; AVX2-NEXT:    orq %r14, %rax
 ; AVX2-NEXT:    vmovq %rax, %xmm0
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX2-NEXT:    addq $40, %rsp
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %r14
@@ -4528,11 +4555,13 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
 ; AVX512F-NEXT:    subq $40, %rsp
 ; AVX512F-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movw %ax, %bx
 ; AVX512F-NEXT:    shll $16, %ebx
 ; AVX512F-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movzwl %ax, %r14d
 ; AVX512F-NEXT:    orl %ebx, %r14d
@@ -4540,6 +4569,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
 ; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movw %ax, %bx
 ; AVX512F-NEXT:    shll $16, %ebx
@@ -4550,7 +4580,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
 ; AVX512F-NEXT:    shlq $32, %rax
 ; AVX512F-NEXT:    orq %r14, %rax
 ; AVX512F-NEXT:    vmovq %rax, %xmm0
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX512F-NEXT:    addq $40, %rsp
 ; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    popq %r14
@@ -4563,11 +4593,13 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    subq $40, %rsp
 ; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
 ; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movzwl %ax, %r14d
 ; AVX512VL-NEXT:    orl %ebx, %r14d
@@ -4575,6 +4607,7 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512VL-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
@@ -4586,7 +4619,6 @@ define <8 x i16> @cvt_4f64_to_8i16_undef(<4 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    orq %r14, %rax
 ; AVX512VL-NEXT:    vmovq %rax, %xmm0
 ; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX512VL-NEXT:    addq $40, %rsp
 ; AVX512VL-NEXT:    popq %rbx
@@ -4616,7 +4648,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
 ; AVX1-NEXT:    callq __truncdfhf2
 ; AVX1-NEXT:    movzwl %ax, %r14d
 ; AVX1-NEXT:    orl %ebx, %r14d
-; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -4631,7 +4663,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
 ; AVX1-NEXT:    shlq $32, %rax
 ; AVX1-NEXT:    orq %r14, %rax
 ; AVX1-NEXT:    vmovq %rax, %xmm0
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT:    addq $40, %rsp
 ; AVX1-NEXT:    popq %rbx
 ; AVX1-NEXT:    popq %r14
@@ -4669,7 +4701,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
 ; AVX2-NEXT:    shlq $32, %rax
 ; AVX2-NEXT:    orq %r14, %rax
 ; AVX2-NEXT:    vmovq %rax, %xmm0
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    addq $40, %rsp
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %r14
@@ -4682,11 +4714,13 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
 ; AVX512F-NEXT:    subq $40, %rsp
 ; AVX512F-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movw %ax, %bx
 ; AVX512F-NEXT:    shll $16, %ebx
 ; AVX512F-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movzwl %ax, %r14d
 ; AVX512F-NEXT:    orl %ebx, %r14d
@@ -4694,6 +4728,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
 ; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movw %ax, %bx
 ; AVX512F-NEXT:    shll $16, %ebx
@@ -4704,7 +4739,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
 ; AVX512F-NEXT:    shlq $32, %rax
 ; AVX512F-NEXT:    orq %r14, %rax
 ; AVX512F-NEXT:    vmovq %rax, %xmm0
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT:    addq $40, %rsp
 ; AVX512F-NEXT:    popq %rbx
 ; AVX512F-NEXT:    popq %r14
@@ -4717,11 +4752,13 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    subq $40, %rsp
 ; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
 ; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movzwl %ax, %r14d
 ; AVX512VL-NEXT:    orl %ebx, %r14d
@@ -4729,6 +4766,7 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512VL-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
@@ -4740,7 +4778,6 @@ define <8 x i16> @cvt_4f64_to_8i16_zero(<4 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    orq %r14, %rax
 ; AVX512VL-NEXT:    vmovq %rax, %xmm0
 ; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
 ; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
@@ -4774,7 +4811,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
 ; AVX1-NEXT:    callq __truncdfhf2
 ; AVX1-NEXT:    movzwl %ax, %r15d
 ; AVX1-NEXT:    orl %ebx, %r15d
-; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -4799,7 +4836,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
 ; AVX1-NEXT:    callq __truncdfhf2
 ; AVX1-NEXT:    movzwl %ax, %r15d
 ; AVX1-NEXT:    orl %ebx, %r15d
-; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -4897,11 +4934,13 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
 ; AVX512F-NEXT:    subq $96, %rsp
 ; AVX512F-NEXT:    vmovupd %zmm0, (%rsp) # 64-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movw %ax, %bx
 ; AVX512F-NEXT:    shll $16, %ebx
 ; AVX512F-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movzwl %ax, %r15d
 ; AVX512F-NEXT:    orl %ebx, %r15d
@@ -4909,6 +4948,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
 ; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movw %ax, %bx
 ; AVX512F-NEXT:    shll $16, %ebx
@@ -4922,11 +4962,13 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
 ; AVX512F-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movw %ax, %bx
 ; AVX512F-NEXT:    shll $16, %ebx
 ; AVX512F-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movzwl %ax, %r15d
 ; AVX512F-NEXT:    orl %ebx, %r15d
@@ -4934,6 +4976,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
 ; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movw %ax, %bx
 ; AVX512F-NEXT:    shll $16, %ebx
@@ -4960,11 +5003,13 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    subq $96, %rsp
 ; AVX512VL-NEXT:    vmovupd %zmm0, (%rsp) # 64-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
 ; AVX512VL-NEXT:    vmovups (%rsp), %zmm0 # 64-byte Reload
 ; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movzwl %ax, %r15d
 ; AVX512VL-NEXT:    orl %ebx, %r15d
@@ -4972,6 +5017,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512VL-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
@@ -4985,11 +5031,13 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
 ; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
 ; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movzwl %ax, %r15d
 ; AVX512VL-NEXT:    orl %ebx, %r15d
@@ -4997,6 +5045,7 @@ define <8 x i16> @cvt_8f64_to_8i16(<8 x double> %a0) nounwind {
 ; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512VL-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bx
 ; AVX512VL-NEXT:    shll $16, %ebx
@@ -5077,7 +5126,7 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    callq __truncdfhf2
 ; AVX1-NEXT:    movl %eax, %r14d
-; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -5150,16 +5199,19 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
 ; AVX512F-NEXT:    movq %rdi, %rbx
 ; AVX512F-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movl %eax, %r14d
 ; AVX512F-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movl %eax, %r15d
 ; AVX512F-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movl %eax, %ebp
 ; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
@@ -5185,16 +5237,19 @@ define void @store_cvt_4f64_to_4i16(<4 x double> %a0, <4 x i16>* %a1) nounwind {
 ; AVX512VL-NEXT:    movq %rdi, %rbx
 ; AVX512VL-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movl %eax, %r14d
 ; AVX512VL-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512VL-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movl %eax, %r15d
 ; AVX512VL-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movl %eax, %ebp
 ; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
@@ -5235,7 +5290,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
 ; AVX1-NEXT:    callq __truncdfhf2
 ; AVX1-NEXT:    movzwl %ax, %ebx
 ; AVX1-NEXT:    orl %ebp, %ebx
-; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -5250,7 +5305,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
 ; AVX1-NEXT:    shlq $32, %rax
 ; AVX1-NEXT:    orq %rbx, %rax
 ; AVX1-NEXT:    vmovq %rax, %xmm0
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX1-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX1-NEXT:    vmovdqa %xmm0, (%r14)
 ; AVX1-NEXT:    addq $32, %rsp
 ; AVX1-NEXT:    popq %rbx
@@ -5292,7 +5347,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
 ; AVX2-NEXT:    shlq $32, %rax
 ; AVX2-NEXT:    orq %rbx, %rax
 ; AVX2-NEXT:    vmovq %rax, %xmm0
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX2-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX2-NEXT:    vmovdqa %xmm0, (%r14)
 ; AVX2-NEXT:    addq $32, %rsp
 ; AVX2-NEXT:    popq %rbx
@@ -5309,11 +5364,13 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
 ; AVX512F-NEXT:    movq %rdi, %r14
 ; AVX512F-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movw %ax, %bp
 ; AVX512F-NEXT:    shll $16, %ebp
 ; AVX512F-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movzwl %ax, %ebx
 ; AVX512F-NEXT:    orl %ebp, %ebx
@@ -5321,6 +5378,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
 ; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movw %ax, %bp
 ; AVX512F-NEXT:    shll $16, %ebp
@@ -5331,7 +5389,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
 ; AVX512F-NEXT:    shlq $32, %rax
 ; AVX512F-NEXT:    orq %rbx, %rax
 ; AVX512F-NEXT:    vmovq %rax, %xmm0
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
+; AVX512F-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%r14)
 ; AVX512F-NEXT:    addq $32, %rsp
 ; AVX512F-NEXT:    popq %rbx
@@ -5348,11 +5406,13 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
 ; AVX512VL-NEXT:    movq %rdi, %r14
 ; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bp
 ; AVX512VL-NEXT:    shll $16, %ebp
 ; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movzwl %ax, %ebx
 ; AVX512VL-NEXT:    orl %ebp, %ebx
@@ -5360,6 +5420,7 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
 ; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512VL-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bp
 ; AVX512VL-NEXT:    shll $16, %ebp
@@ -5371,7 +5432,6 @@ define void @store_cvt_4f64_to_8i16_undef(<4 x double> %a0, <8 x i16>* %a1) noun
 ; AVX512VL-NEXT:    orq %rbx, %rax
 ; AVX512VL-NEXT:    vmovq %rax, %xmm0
 ; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX512VL-NEXT:    vmovdqa %xmm0, (%r14)
 ; AVX512VL-NEXT:    addq $32, %rsp
@@ -5406,7 +5466,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
 ; AVX1-NEXT:    callq __truncdfhf2
 ; AVX1-NEXT:    movzwl %ax, %ebx
 ; AVX1-NEXT:    orl %ebp, %ebx
-; AVX1-NEXT:    vmovupd (%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -5421,7 +5481,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
 ; AVX1-NEXT:    shlq $32, %rax
 ; AVX1-NEXT:    orq %rbx, %rax
 ; AVX1-NEXT:    vmovq %rax, %xmm0
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX1-NEXT:    vmovdqa %xmm0, (%r14)
 ; AVX1-NEXT:    addq $32, %rsp
 ; AVX1-NEXT:    popq %rbx
@@ -5463,7 +5523,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
 ; AVX2-NEXT:    shlq $32, %rax
 ; AVX2-NEXT:    orq %rbx, %rax
 ; AVX2-NEXT:    vmovq %rax, %xmm0
-; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX2-NEXT:    vmovdqa %xmm0, (%r14)
 ; AVX2-NEXT:    addq $32, %rsp
 ; AVX2-NEXT:    popq %rbx
@@ -5480,11 +5540,13 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
 ; AVX512F-NEXT:    movq %rdi, %r14
 ; AVX512F-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movw %ax, %bp
 ; AVX512F-NEXT:    shll $16, %ebp
 ; AVX512F-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movzwl %ax, %ebx
 ; AVX512F-NEXT:    orl %ebp, %ebx
@@ -5492,6 +5554,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
 ; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vmovapd %xmm0, (%rsp) # 16-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movw %ax, %bp
 ; AVX512F-NEXT:    shll $16, %ebp
@@ -5502,7 +5565,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
 ; AVX512F-NEXT:    shlq $32, %rax
 ; AVX512F-NEXT:    orq %rbx, %rax
 ; AVX512F-NEXT:    vmovq %rax, %xmm0
-; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,u,u,u,u],zero,zero,zero,zero,zero,zero,zero,zero
 ; AVX512F-NEXT:    vmovdqa %xmm0, (%r14)
 ; AVX512F-NEXT:    addq $32, %rsp
 ; AVX512F-NEXT:    popq %rbx
@@ -5519,11 +5582,13 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
 ; AVX512VL-NEXT:    movq %rdi, %r14
 ; AVX512VL-NEXT:    vmovupd %ymm0, (%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bp
 ; AVX512VL-NEXT:    shll $16, %ebp
 ; AVX512VL-NEXT:    vmovups (%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movzwl %ax, %ebx
 ; AVX512VL-NEXT:    orl %ebp, %ebx
@@ -5531,6 +5596,7 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
 ; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512VL-NEXT:    vmovaps %xmm0, (%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, %bp
 ; AVX512VL-NEXT:    shll $16, %ebp
@@ -5542,7 +5608,6 @@ define void @store_cvt_4f64_to_8i16_zero(<4 x double> %a0, <8 x i16>* %a1) nounw
 ; AVX512VL-NEXT:    orq %rbx, %rax
 ; AVX512VL-NEXT:    vmovq %rax, %xmm0
 ; AVX512VL-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,2]
 ; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
@@ -5576,7 +5641,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
 ; AVX1-NEXT:    vzeroupper
 ; AVX1-NEXT:    callq __truncdfhf2
 ; AVX1-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
-; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -5587,7 +5652,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
 ; AVX1-NEXT:    # xmm0 = mem[1,0]
 ; AVX1-NEXT:    callq __truncdfhf2
 ; AVX1-NEXT:    movl %eax, %r12d
-; AVX1-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
+; AVX1-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX1-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
@@ -5708,28 +5773,33 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
 ; AVX512F-NEXT:    movq %rdi, %rbx
 ; AVX512F-NEXT:    vmovupd %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
 ; AVX512F-NEXT:    vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
 ; AVX512F-NEXT:    vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
 ; AVX512F-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movl %eax, %r12d
 ; AVX512F-NEXT:    vmovupd {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512F-NEXT:    vmovapd %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512F-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movl %eax, %r13d
 ; AVX512F-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movl %eax, %ebp
 ; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
@@ -5737,6 +5807,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
 ; AVX512F-NEXT:    movl %eax, %r14d
 ; AVX512F-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    callq __truncdfhf2
 ; AVX512F-NEXT:    movl %eax, %r15d
 ; AVX512F-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
@@ -5772,28 +5843,33 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
 ; AVX512VL-NEXT:    movq %rdi, %rbx
 ; AVX512VL-NEXT:    vmovupd %zmm0, {{[0-9]+}}(%rsp) # 64-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
 ; AVX512VL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
 ; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512VL-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movw %ax, {{[0-9]+}}(%rsp) # 2-byte Spill
 ; AVX512VL-NEXT:    vmovupd {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
 ; AVX512VL-NEXT:    vextractf64x4 $1, %zmm0, %ymm0
 ; AVX512VL-NEXT:    vmovupd %ymm0, {{[0-9]+}}(%rsp) # 32-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movl %eax, %r12d
 ; AVX512VL-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
 ; AVX512VL-NEXT:    vmovaps %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movl %eax, %r13d
 ; AVX512VL-NEXT:    vmovups {{[0-9]+}}(%rsp), %zmm0 # 64-byte Reload
 ; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movl %eax, %ebp
 ; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
@@ -5801,6 +5877,7 @@ define void @store_cvt_8f64_to_8i16(<8 x double> %a0, <8 x i16>* %a1) nounwind {
 ; AVX512VL-NEXT:    movl %eax, %r14d
 ; AVX512VL-NEXT:    vmovups {{[0-9]+}}(%rsp), %ymm0 # 32-byte Reload
 ; AVX512VL-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    callq __truncdfhf2
 ; AVX512VL-NEXT:    movl %eax, %r15d
 ; AVX512VL-NEXT:    vmovaps {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-128.ll b/test/CodeGen/X86/vector-idiv-sdiv-128.ll
index 933359fa084f39677ed3aa4dbf03d3adf87fd2cf..895bf5c0f02d16eb60cda8850cc6cbce566f9e87 100644
--- a/test/CodeGen/X86/vector-idiv-sdiv-128.ll
+++ b/test/CodeGen/X86/vector-idiv-sdiv-128.ll
@@ -203,7 +203,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
 ; SSE41-LABEL: test_div7_16i8:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
-; SSE41-NEXT:    pmovsxbw {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
 ; SSE41-NEXT:    pmullw %xmm2, %xmm1
 ; SSE41-NEXT:    psrlw $8, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
@@ -227,7 +227,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
 ; AVX1-LABEL: test_div7_16i8:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
-; AVX1-NEXT:    vpmovsxbw {{.*}}(%rip), %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
 ; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
@@ -249,8 +249,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
 ; AVX2-LABEL: test_div7_16i8:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm1
-; AVX2-NEXT:    vpmovsxbw {{.*}}(%rip), %ymm2
-; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
@@ -522,7 +521,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; SSE41-LABEL: test_rem7_16i8:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    pmovsxbw %xmm0, %xmm1
-; SSE41-NEXT:    pmovsxbw {{.*}}(%rip), %xmm2
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
 ; SSE41-NEXT:    pmullw %xmm2, %xmm1
 ; SSE41-NEXT:    psrlw $8, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
@@ -556,7 +555,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; AVX1-LABEL: test_rem7_16i8:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpmovsxbw %xmm0, %xmm1
-; AVX1-NEXT:    vpmovsxbw {{.*}}(%rip), %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [65427,65427,65427,65427,65427,65427,65427,65427]
 ; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
@@ -589,8 +588,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; AVX2-LABEL: test_rem7_16i8:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpmovsxbw %xmm0, %ymm1
-; AVX2-NEXT:    vpmovsxbw {{.*}}(%rip), %ymm2
-; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
diff --git a/test/CodeGen/X86/vector-idiv-sdiv-256.ll b/test/CodeGen/X86/vector-idiv-sdiv-256.ll
index a1727ea4f7051fbeaa7d9e0155be7a5179f4aa1d..e7bfe3778212c3339c2b779d43c0bee061782011 100644
--- a/test/CodeGen/X86/vector-idiv-sdiv-256.ll
+++ b/test/CodeGen/X86/vector-idiv-sdiv-256.ll
@@ -87,7 +87,7 @@ define <4 x i64> @test_div7_4i64(<4 x i64> %a) nounwind {
 define <8 x i32> @test_div7_8i32(<8 x i32> %a) nounwind {
 ; AVX1-LABEL: test_div7_8i32:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT:    vmovdqa {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
@@ -163,7 +163,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpmovsxbw %xmm1, %xmm2
-; AVX1-NEXT:    vpmovsxbw {{.*}}(%rip), %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [65427,65427,65427,65427,65427,65427,65427,65427]
 ; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
@@ -348,7 +348,7 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
 define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
 ; AVX1-LABEL: test_rem7_8i32:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
+; AVX1-NEXT:    vmovdqa {{.*#+}} ymm1 = [2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027,2454267027]
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
@@ -439,7 +439,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpmovsxbw %xmm2, %xmm3
-; AVX1-NEXT:    vpmovsxbw {{.*}}(%rip), %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [65427,65427,65427,65427,65427,65427,65427,65427]
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm3
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
diff --git a/test/CodeGen/X86/vector-idiv-udiv-128.ll b/test/CodeGen/X86/vector-idiv-udiv-128.ll
index 7857e585dca5d767d3bfbbbc81c015b661abdc1a..1b35e2fdddae0aacc5160d84718c60aa5c12f7ab 100644
--- a/test/CodeGen/X86/vector-idiv-udiv-128.ll
+++ b/test/CodeGen/X86/vector-idiv-udiv-128.ll
@@ -172,22 +172,21 @@ define <8 x i16> @test_div7_8i16(<8 x i16> %a) nounwind {
 define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
 ; SSE2-LABEL: test_div7_16i8:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT:    psrlw $8, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
-; SSE2-NEXT:    pmullw %xmm2, %xmm1
-; SSE2-NEXT:    psrlw $8, %xmm1
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psrlw $8, %xmm3
-; SSE2-NEXT:    pmullw %xmm2, %xmm3
-; SSE2-NEXT:    psrlw $8, %xmm3
-; SSE2-NEXT:    packuswb %xmm1, %xmm3
-; SSE2-NEXT:    psubb %xmm3, %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
+; SSE2-NEXT:    pmullw %xmm3, %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NEXT:    pmullw %xmm3, %xmm4
+; SSE2-NEXT:    psrlw $8, %xmm4
+; SSE2-NEXT:    packuswb %xmm2, %xmm4
+; SSE2-NEXT:    psubb %xmm4, %xmm0
 ; SSE2-NEXT:    psrlw $1, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
-; SSE2-NEXT:    paddb %xmm3, %xmm0
+; SSE2-NEXT:    paddb %xmm4, %xmm0
 ; SSE2-NEXT:    psrlw $2, %xmm0
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
 ; SSE2-NEXT:    retq
@@ -195,7 +194,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
 ; SSE41-LABEL: test_div7_16i8:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
 ; SSE41-NEXT:    pmullw %xmm2, %xmm1
 ; SSE41-NEXT:    psrlw $8, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
@@ -214,7 +213,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
 ; AVX1-LABEL: test_div7_16i8:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
 ; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
@@ -233,8 +232,7 @@ define <16 x i8> @test_div7_16i8(<16 x i8> %a) nounwind {
 ; AVX2-LABEL: test_div7_16i8:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
@@ -464,23 +462,22 @@ define <8 x i16> @test_rem7_8i16(<8 x i16> %a) nounwind {
 define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; SSE2-LABEL: test_rem7_16i8:
 ; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm2
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
+; SSE2-NEXT:    movdqa {{.*#+}} xmm3 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
+; SSE2-NEXT:    pmullw %xmm3, %xmm2
+; SSE2-NEXT:    psrlw $8, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm4
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3],xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7]
+; SSE2-NEXT:    pmullw %xmm3, %xmm4
+; SSE2-NEXT:    psrlw $8, %xmm4
+; SSE2-NEXT:    packuswb %xmm2, %xmm4
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; SSE2-NEXT:    psrlw $8, %xmm1
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [37,0,37,0,37,0,37,0,37,0,37,0,37,0,37,0]
-; SSE2-NEXT:    pmullw %xmm2, %xmm1
-; SSE2-NEXT:    psrlw $8, %xmm1
-; SSE2-NEXT:    movdqa %xmm0, %xmm3
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    psrlw $8, %xmm3
-; SSE2-NEXT:    pmullw %xmm2, %xmm3
-; SSE2-NEXT:    psrlw $8, %xmm3
-; SSE2-NEXT:    packuswb %xmm1, %xmm3
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
-; SSE2-NEXT:    psubb %xmm3, %xmm1
+; SSE2-NEXT:    psubb %xmm4, %xmm1
 ; SSE2-NEXT:    psrlw $1, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE2-NEXT:    paddb %xmm3, %xmm1
+; SSE2-NEXT:    paddb %xmm4, %xmm1
 ; SSE2-NEXT:    psrlw $2, %xmm1
 ; SSE2-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE2-NEXT:    movdqa %xmm1, %xmm2
@@ -501,7 +498,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; SSE41-LABEL: test_rem7_16i8:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; SSE41-NEXT:    movdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
 ; SSE41-NEXT:    pmullw %xmm2, %xmm1
 ; SSE41-NEXT:    psrlw $8, %xmm1
 ; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
@@ -532,7 +529,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; AVX1-LABEL: test_rem7_16i8:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [37,37,37,37,37,37,37,37]
 ; AVX1-NEXT:    vpmullw %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm0[2,3,0,1]
@@ -562,8 +559,7 @@ define <16 x i8> @test_rem7_16i8(<16 x i8> %a) nounwind {
 ; AVX2-LABEL: test_rem7_16i8:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX2-NEXT:    vpmovzxbw {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero
-; AVX2-NEXT:    vpmullw %ymm2, %ymm1, %ymm1
+; AVX2-NEXT:    vpmullw {{.*}}(%rip), %ymm1, %ymm1
 ; AVX2-NEXT:    vpsrlw $8, %ymm1, %ymm1
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX2-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
diff --git a/test/CodeGen/X86/vector-idiv-udiv-256.ll b/test/CodeGen/X86/vector-idiv-udiv-256.ll
index c11ee22d647b9d5d7399f668dc37001c091798a2..4adc2e2fb6c903230548e89255d9cdcd5b7e9c94 100644
--- a/test/CodeGen/X86/vector-idiv-udiv-256.ll
+++ b/test/CodeGen/X86/vector-idiv-udiv-256.ll
@@ -174,7 +174,7 @@ define <32 x i8> @test_div7_32i8(<32 x i8> %a) nounwind {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [37,37,37,37,37,37,37,37]
 ; AVX1-NEXT:    vpmullw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vpsrlw $8, %xmm2, %xmm2
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm1[2,3,0,1]
@@ -359,7 +359,7 @@ define <4 x i64> @test_rem7_4i64(<4 x i64> %a) nounwind {
 define <8 x i32> @test_rem7_8i32(<8 x i32> %a) nounwind {
 ; AVX1-LABEL: test_rem7_8i32:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovaps {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
+; AVX1-NEXT:    vmovdqa {{.*#+}} ymm1 = [613566757,613566757,613566757,613566757,613566757,613566757,613566757,613566757]
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm2[1,1,3,3]
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
@@ -453,7 +453,7 @@ define <32 x i8> @test_rem7_32i8(<32 x i8> %a) nounwind {
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [37,37,37,37,37,37,37,37]
 ; AVX1-NEXT:    vpmullw %xmm1, %xmm3, %xmm3
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm4 = xmm2[2,3,0,1]
diff --git a/test/CodeGen/X86/vector-interleave.ll b/test/CodeGen/X86/vector-interleave.ll
index 4f9dbb03fb157fd2369501dc7b91939934bc8ac9..1265ea108977dadc7d0aa97652fd739341ff8fb8 100644
--- a/test/CodeGen/X86/vector-interleave.ll
+++ b/test/CodeGen/X86/vector-interleave.ll
@@ -93,44 +93,32 @@ define <64 x i16> @interleave8x8(<8 x i16> %a, <8 x i16> %b, <8 x i16> %c, <8 x
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm8 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm9 = <u,4,u,5,u,6,u,7>
-; AVX2-NEXT:    vpermd %ymm1, %ymm9, %ymm3
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm3 = ymm0[0],ymm3[1],ymm0[2],ymm3[3],ymm0[4],ymm3[5],ymm0[6],ymm3[7]
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm0 = <u,0,u,1,u,2,u,3>
-; AVX2-NEXT:    vpermd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm8 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm1 = ymm8[0],ymm1[1],ymm8[2],ymm1[3],ymm8[4],ymm1[5],ymm8[6],ymm1[7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm9 = xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm3 = xmm8[2],xmm1[2],xmm8[3],xmm1[3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm8[0],xmm1[0],xmm8[1],xmm1[1]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3]
 ; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm6, %ymm5
-; AVX2-NEXT:    vpermd %ymm5, %ymm9, %ymm6
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm4 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm4 = ymm4[0],ymm6[1],ymm4[2],ymm6[3],ymm4[4],ymm6[5],ymm4[6],ymm6[7]
-; AVX2-NEXT:    vpermd %ymm5, %ymm0, %ymm0
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm2 = ymm2[0],ymm0[1],ymm2[2],ymm0[3],ymm2[4],ymm0[5],ymm2[6],ymm0[7]
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm5, %ymm0
-; AVX2-NEXT:    vextracti128 $1, %ymm2, %xmm2
-; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm1
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm1, %ymm1
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm5, %ymm2
-; AVX2-NEXT:    vextracti128 $1, %ymm4, %xmm4
-; AVX2-NEXT:    vextracti128 $1, %ymm3, %xmm3
-; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
-; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm3, %ymm3
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm7 = xmm4[2],xmm6[2],xmm4[3],xmm6[3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm6 = xmm1[2],xmm5[2],xmm1[3],xmm5[3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1]
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vinserti128 $1, %xmm5, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3]
+; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm3, %ymm1
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpunpckhwd {{.*#+}} xmm3 = xmm9[4],xmm7[4],xmm9[5],xmm7[5],xmm9[6],xmm7[6],xmm9[7],xmm7[7]
+; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm7[0],xmm9[1],xmm7[1],xmm9[2],xmm7[2],xmm9[3],xmm7[3]
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm4, %ymm3
 ; AVX2-NEXT:    retq
   %ab = shufflevector <8 x i16> %a, <8 x i16> %b, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
   %cd = shufflevector <8 x i16> %c, <8 x i16> %d, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
diff --git a/test/CodeGen/X86/vector-lzcnt-128.ll b/test/CodeGen/X86/vector-lzcnt-128.ll
index 6445a363787cce5dfd847463d8560628b857d94b..9e11edcc29dc5f3ebef0d3d0c85d0b4b068ed31a 100644
--- a/test/CodeGen/X86/vector-lzcnt-128.ll
+++ b/test/CodeGen/X86/vector-lzcnt-128.ll
@@ -1596,35 +1596,8 @@ define <2 x i64> @foldv2i64() nounwind {
 ;
 ; X32-SSE-LABEL: foldv2i64:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [256,0,4294967295,4294967295]
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X32-SSE-NEXT:    pand %xmm2, %xmm0
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm0, %xmm4
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X32-SSE-NEXT:    psrlw $4, %xmm0
-; X32-SSE-NEXT:    pand %xmm2, %xmm0
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm0
-; X32-SSE-NEXT:    pand %xmm4, %xmm0
-; X32-SSE-NEXT:    paddb %xmm3, %xmm0
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm3
-; X32-SSE-NEXT:    psrlw $8, %xmm3
-; X32-SSE-NEXT:    pand %xmm0, %xmm3
-; X32-SSE-NEXT:    psrlw $8, %xmm0
-; X32-SSE-NEXT:    paddw %xmm3, %xmm0
-; X32-SSE-NEXT:    pcmpeqw %xmm2, %xmm1
-; X32-SSE-NEXT:    psrld $16, %xmm1
-; X32-SSE-NEXT:    pand %xmm0, %xmm1
-; X32-SSE-NEXT:    psrld $16, %xmm0
-; X32-SSE-NEXT:    paddd %xmm1, %xmm0
-; X32-SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; X32-SSE-NEXT:    psrlq $32, %xmm0
-; X32-SSE-NEXT:    paddq %xmm2, %xmm0
+; X32-SSE-NEXT:    movl $55, %eax
+; X32-SSE-NEXT:    movd %eax, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0)
   ret <2 x i64> %out
@@ -1651,35 +1624,8 @@ define <2 x i64> @foldv2i64u() nounwind {
 ;
 ; X32-SSE-LABEL: foldv2i64u:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm1 = [256,0,4294967295,4294967295]
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X32-SSE-NEXT:    pand %xmm2, %xmm0
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-SSE-NEXT:    movdqa %xmm3, %xmm4
-; X32-SSE-NEXT:    pshufb %xmm0, %xmm4
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm0
-; X32-SSE-NEXT:    psrlw $4, %xmm0
-; X32-SSE-NEXT:    pand %xmm2, %xmm0
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm0, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm0
-; X32-SSE-NEXT:    pand %xmm4, %xmm0
-; X32-SSE-NEXT:    paddb %xmm3, %xmm0
-; X32-SSE-NEXT:    movdqa %xmm1, %xmm3
-; X32-SSE-NEXT:    pcmpeqb %xmm2, %xmm3
-; X32-SSE-NEXT:    psrlw $8, %xmm3
-; X32-SSE-NEXT:    pand %xmm0, %xmm3
-; X32-SSE-NEXT:    psrlw $8, %xmm0
-; X32-SSE-NEXT:    paddw %xmm3, %xmm0
-; X32-SSE-NEXT:    pcmpeqw %xmm2, %xmm1
-; X32-SSE-NEXT:    psrld $16, %xmm1
-; X32-SSE-NEXT:    pand %xmm0, %xmm1
-; X32-SSE-NEXT:    psrld $16, %xmm0
-; X32-SSE-NEXT:    paddd %xmm1, %xmm0
-; X32-SSE-NEXT:    pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7]
-; X32-SSE-NEXT:    psrlq $32, %xmm0
-; X32-SSE-NEXT:    paddq %xmm2, %xmm0
+; X32-SSE-NEXT:    movl $55, %eax
+; X32-SSE-NEXT:    movd %eax, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1)
   ret <2 x i64> %out
diff --git a/test/CodeGen/X86/vector-lzcnt-256.ll b/test/CodeGen/X86/vector-lzcnt-256.ll
index c683954930232e213656351df619cbcecd8b4271..53cb4d8e445ba34be013a92c3c7ed8f97f828c0d 100644
--- a/test/CodeGen/X86/vector-lzcnt-256.ll
+++ b/test/CodeGen/X86/vector-lzcnt-256.ll
@@ -11,8 +11,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX1-LABEL: testv4i64:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm5
 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm1
@@ -37,7 +37,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm5
 ; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm5
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm5
 ; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm6
 ; AVX1-NEXT:    vpand %xmm3, %xmm6, %xmm3
@@ -143,8 +143,8 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ; AVX1-LABEL: testv4i64u:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm1
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm5
 ; AVX1-NEXT:    vpsrlw $4, %xmm2, %xmm1
@@ -169,7 +169,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind {
 ; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
 ; AVX1-NEXT:    vpsrlq $32, %xmm5, %xmm5
 ; AVX1-NEXT:    vpaddq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm5
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm5
 ; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm6
 ; AVX1-NEXT:    vpand %xmm3, %xmm6, %xmm3
@@ -275,8 +275,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX1-LABEL: testv8i32:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
@@ -296,7 +296,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
 ; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
@@ -387,8 +387,8 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ; AVX1-LABEL: testv8i32u:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
@@ -408,7 +408,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind {
 ; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vpsrld $16, %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddd %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
 ; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
@@ -499,8 +499,8 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ; AVX1-LABEL: testv16i16:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
@@ -515,7 +515,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind {
 ; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
 ; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
@@ -586,8 +586,8 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ; AVX1-LABEL: testv16i16u:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm5
@@ -602,7 +602,7 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind {
 ; AVX1-NEXT:    vpand %xmm1, %xmm3, %xmm1
 ; AVX1-NEXT:    vpsrlw $8, %xmm3, %xmm3
 ; AVX1-NEXT:    vpaddw %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm5
 ; AVX1-NEXT:    vpand %xmm2, %xmm5, %xmm2
@@ -673,8 +673,8 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ; AVX1-LABEL: testv32i8:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
@@ -684,7 +684,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
@@ -747,8 +747,8 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ; AVX1-LABEL: testv32i8u:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
@@ -758,7 +758,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind {
 ; AVX1-NEXT:    vpand %xmm6, %xmm3, %xmm3
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
 ; AVX1-NEXT:    vpaddb %xmm1, %xmm3, %xmm1
-; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
@@ -830,31 +830,7 @@ define <4 x i64> @foldv4i64() nounwind {
 ;
 ; X32-AVX-LABEL: foldv4i64:
 ; X32-AVX:       # BB#0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [256,0,4294967295,4294967295,0,0,255,0]
-; X32-AVX-NEXT:    vpand %ymm0, %ymm1, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; X32-AVX-NEXT:    vpsrlw $4, %ymm1, %ymm4
-; X32-AVX-NEXT:    vpand %ymm0, %ymm4, %ymm0
-; X32-AVX-NEXT:    vpxor %ymm4, %ymm4, %ymm4
-; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm5
-; X32-AVX-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; X32-AVX-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
-; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm2
-; X32-AVX-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm2
-; X32-AVX-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpcmpeqw %ymm4, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpsrld $16, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm1
-; X32-AVX-NEXT:    vpsrld $16, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5],ymm0[6],ymm4[7]
-; X32-AVX-NEXT:    vpsrlq $32, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [55,0,0,0,64,0,56,0]
 ; X32-AVX-NEXT:    retl
   %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
   ret <4 x i64> %out
@@ -873,31 +849,7 @@ define <4 x i64> @foldv4i64u() nounwind {
 ;
 ; X32-AVX-LABEL: foldv4i64u:
 ; X32-AVX:       # BB#0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm0 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm1 = [256,0,4294967295,4294967295,0,0,255,0]
-; X32-AVX-NEXT:    vpand %ymm0, %ymm1, %ymm2
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0]
-; X32-AVX-NEXT:    vpshufb %ymm2, %ymm3, %ymm2
-; X32-AVX-NEXT:    vpsrlw $4, %ymm1, %ymm4
-; X32-AVX-NEXT:    vpand %ymm0, %ymm4, %ymm0
-; X32-AVX-NEXT:    vpxor %ymm4, %ymm4, %ymm4
-; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm0, %ymm5
-; X32-AVX-NEXT:    vpand %ymm5, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpshufb %ymm0, %ymm3, %ymm0
-; X32-AVX-NEXT:    vpaddb %ymm0, %ymm2, %ymm0
-; X32-AVX-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm2
-; X32-AVX-NEXT:    vpsrlw $8, %ymm2, %ymm2
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm2
-; X32-AVX-NEXT:    vpsrlw $8, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpaddw %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpcmpeqw %ymm4, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpsrld $16, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpand %ymm1, %ymm0, %ymm1
-; X32-AVX-NEXT:    vpsrld $16, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpblendd {{.*#+}} ymm1 = ymm0[0],ymm4[1,2,3],ymm0[4],ymm4[5],ymm0[6],ymm4[7]
-; X32-AVX-NEXT:    vpsrlq $32, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [55,0,0,0,64,0,56,0]
 ; X32-AVX-NEXT:    retl
   %out = call <4 x i64> @llvm.ctlz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
   ret <4 x i64> %out
diff --git a/test/CodeGen/X86/vector-popcnt-256.ll b/test/CodeGen/X86/vector-popcnt-256.ll
index 8bbfea934422a36b6b451fbaec4c08f6bce4cdc9..7a675619d720e96998d553e0020c43e451ea375b 100644
--- a/test/CodeGen/X86/vector-popcnt-256.ll
+++ b/test/CodeGen/X86/vector-popcnt-256.ll
@@ -6,8 +6,8 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX1-LABEL: testv4i64:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
@@ -16,7 +16,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind {
 ; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
 ; AVX1-NEXT:    vpsadbw %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm5
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm5
 ; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
@@ -47,8 +47,8 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX1-LABEL: testv8i32:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
@@ -61,7 +61,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind {
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
 ; AVX1-NEXT:    vpsadbw %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackuswb %xmm5, %xmm1, %xmm1
-; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm5
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm5
 ; AVX1-NEXT:    vpshufb %xmm5, %xmm4, %xmm5
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
@@ -145,15 +145,15 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind {
 ; AVX1-LABEL: testv32i8:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT:    vandps %xmm2, %xmm1, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm3
 ; AVX1-NEXT:    vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm1, %xmm1
 ; AVX1-NEXT:    vpand %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vpshufb %xmm1, %xmm4, %xmm1
 ; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vandps %xmm2, %xmm0, %xmm3
+; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm3
 ; AVX1-NEXT:    vpshufb %xmm3, %xmm4, %xmm3
 ; AVX1-NEXT:    vpsrlw $4, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand %xmm2, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vector-rotate-128.ll b/test/CodeGen/X86/vector-rotate-128.ll
index eda893fc942d562104eeec933fb39d94abc4f219..5eb1a55881e575c852aa2994c5f6f003508ffbaf 100644
--- a/test/CodeGen/X86/vector-rotate-128.ll
+++ b/test/CodeGen/X86/vector-rotate-128.ll
@@ -3,9 +3,11 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
-;
+
 ; Just one 32-bit run to make sure we do reasonable things for i64 rotates.
 ; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE2
 
@@ -75,6 +77,15 @@ define <2 x i64> @var_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512-LABEL: var_rotate_v2i64:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm2 = [64,64]
+; AVX512-NEXT:    vpsubq %xmm1, %xmm2, %xmm2
+; AVX512-NEXT:    vpsllvq %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpsrlvq %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
 ; XOP-LABEL: var_rotate_v2i64:
 ; XOP:       # BB#0:
 ; XOP-NEXT:    vprotq %xmm1, %xmm0, %xmm0
@@ -203,6 +214,15 @@ define <4 x i32> @var_rotate_v4i32(<4 x i32> %a, <4 x i32> %b) nounwind {
 ; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512-LABEL: var_rotate_v4i32:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm2
+; AVX512-NEXT:    vpsubd %xmm1, %xmm2, %xmm2
+; AVX512-NEXT:    vpsllvd %xmm1, %xmm0, %xmm1
+; AVX512-NEXT:    vpsrlvd %xmm2, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
 ; XOP-LABEL: var_rotate_v4i32:
 ; XOP:       # BB#0:
 ; XOP-NEXT:    vprotd %xmm1, %xmm0, %xmm0
@@ -336,21 +356,21 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE41-NEXT:    psllw $8, %xmm6
 ; SSE41-NEXT:    movdqa %xmm3, %xmm5
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pblendvb %xmm6, %xmm5
+; SSE41-NEXT:    pblendvb %xmm0, %xmm6, %xmm5
 ; SSE41-NEXT:    movdqa %xmm5, %xmm1
 ; SSE41-NEXT:    psllw $4, %xmm1
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    pblendvb %xmm1, %xmm5
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm5
 ; SSE41-NEXT:    movdqa %xmm5, %xmm1
 ; SSE41-NEXT:    psllw $2, %xmm1
 ; SSE41-NEXT:    paddw %xmm4, %xmm4
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    pblendvb %xmm1, %xmm5
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm5
 ; SSE41-NEXT:    movdqa %xmm5, %xmm1
 ; SSE41-NEXT:    psllw $1, %xmm1
 ; SSE41-NEXT:    paddw %xmm4, %xmm4
 ; SSE41-NEXT:    movdqa %xmm4, %xmm0
-; SSE41-NEXT:    pblendvb %xmm1, %xmm5
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm5
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    psllw $12, %xmm0
 ; SSE41-NEXT:    psllw $4, %xmm2
@@ -360,21 +380,21 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE41-NEXT:    movdqa %xmm3, %xmm4
 ; SSE41-NEXT:    psrlw $8, %xmm4
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pblendvb %xmm4, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm2
 ; SSE41-NEXT:    psrlw $4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pblendvb %xmm2, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm2
 ; SSE41-NEXT:    psrlw $2, %xmm2
 ; SSE41-NEXT:    paddw %xmm1, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pblendvb %xmm2, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm2
 ; SSE41-NEXT:    psrlw $1, %xmm2
 ; SSE41-NEXT:    paddw %xmm1, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pblendvb %xmm2, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm3
 ; SSE41-NEXT:    por %xmm5, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
 ; SSE41-NEXT:    retq
@@ -421,7 +441,7 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
@@ -432,6 +452,27 @@ define <8 x i16> @var_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: var_rotate_v8i16:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    # kill: %XMM1<def> %XMM1<kill> %ZMM1<def>
+; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: var_rotate_v8i16:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vmovdqu {{.*#+}} xmm2 = [16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %xmm1, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpsllvw %xmm1, %xmm0, %xmm1
+; AVX512VL-NEXT:    vpsrlvw %xmm2, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    retq
+;
 ; XOP-LABEL: var_rotate_v8i16:
 ; XOP:       # BB#0:
 ; XOP-NEXT:    vprotw %xmm1, %xmm0, %xmm0
@@ -585,18 +626,18 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm5
 ; SSE41-NEXT:    movdqa %xmm1, %xmm4
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm5, %xmm4
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm4
 ; SSE41-NEXT:    movdqa %xmm4, %xmm5
 ; SSE41-NEXT:    psllw $2, %xmm5
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm5
 ; SSE41-NEXT:    paddb %xmm3, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm5, %xmm4
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm4
 ; SSE41-NEXT:    movdqa %xmm4, %xmm5
 ; SSE41-NEXT:    paddb %xmm5, %xmm5
 ; SSE41-NEXT:    paddb %xmm3, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm5, %xmm4
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm4
 ; SSE41-NEXT:    psllw $5, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm3
 ; SSE41-NEXT:    paddb %xmm3, %xmm3
@@ -604,18 +645,18 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE41-NEXT:    psrlw $4, %xmm5
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm5
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
-; SSE41-NEXT:    pblendvb %xmm5, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm5, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
 ; SSE41-NEXT:    psrlw $2, %xmm2
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
 ; SSE41-NEXT:    psrlw $1, %xmm2
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE41-NEXT:    paddb %xmm3, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    por %xmm4, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
@@ -650,6 +691,36 @@ define <16 x i8> @var_rotate_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX512BW-LABEL: var_rotate_v16i8:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
+; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512BW-NEXT:    vpsllvd %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512BW-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512BW-NEXT:    vpsrlvd %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: var_rotate_v16i8:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vmovdqu {{.*#+}} xmm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %xmm1, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsllvd %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero,xmm2[4],zero,zero,zero,xmm2[5],zero,zero,zero,xmm2[6],zero,zero,zero,xmm2[7],zero,zero,zero,xmm2[8],zero,zero,zero,xmm2[9],zero,zero,zero,xmm2[10],zero,zero,zero,xmm2[11],zero,zero,zero,xmm2[12],zero,zero,zero,xmm2[13],zero,zero,zero,xmm2[14],zero,zero,zero,xmm2[15],zero,zero,zero
+; AVX512VL-NEXT:    vpsrlvd %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512VL-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
 ; XOP-LABEL: var_rotate_v16i8:
 ; XOP:       # BB#0:
 ; XOP-NEXT:    vprotb %xmm1, %xmm0, %xmm0
@@ -773,6 +844,13 @@ define <2 x i64> @constant_rotate_v2i64(<2 x i64> %a) nounwind {
 ; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512-LABEL: constant_rotate_v2i64:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpsllvq {{.*}}(%rip), %xmm0, %xmm1
+; AVX512-NEXT:    vpsrlvq {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
 ; XOPAVX1-LABEL: constant_rotate_v2i64:
 ; XOPAVX1:       # BB#0:
 ; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm1
@@ -873,6 +951,13 @@ define <4 x i32> @constant_rotate_v4i32(<4 x i32> %a) nounwind {
 ; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512-LABEL: constant_rotate_v4i32:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %xmm0, %xmm1
+; AVX512-NEXT:    vpsrlvd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
 ; XOPAVX1-LABEL: constant_rotate_v4i32:
 ; XOPAVX1:       # BB#0:
 ; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm1
@@ -989,12 +1074,30 @@ define <8 x i16> @constant_rotate_v8i16(<8 x i16> %a) nounwind {
 ; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm1
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: constant_rotate_v8i16:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7]
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} xmm2 = [16,15,14,13,12,11,10,9]
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_rotate_v8i16:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vpsllvw {{.*}}(%rip), %xmm0, %xmm1
+; AVX512VL-NEXT:    vpsrlvw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512VL-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512VL-NEXT:    retq
+;
 ; XOP-LABEL: constant_rotate_v8i16:
 ; XOP:       # BB#0:
 ; XOP-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm1
@@ -1108,31 +1211,31 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [8192,24640,41088,57536,57600,41152,24704,8256]
 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
-; SSE41-NEXT:    pblendvb %xmm3, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm3
 ; SSE41-NEXT:    psllw $2, %xmm3
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
 ; SSE41-NEXT:    paddb %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm3, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm3
 ; SSE41-NEXT:    paddb %xmm3, %xmm3
 ; SSE41-NEXT:    paddb %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm3, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; SSE41-NEXT:    movdqa %xmm1, %xmm3
 ; SSE41-NEXT:    psrlw $4, %xmm3
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [57600,41152,24704,8256,8192,24640,41088,57536]
-; SSE41-NEXT:    pblendvb %xmm3, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm3
 ; SSE41-NEXT:    psrlw $2, %xmm3
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
 ; SSE41-NEXT:    paddb %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm3, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm3
 ; SSE41-NEXT:    psrlw $1, %xmm3
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
 ; SSE41-NEXT:    paddb %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm3, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
 ; SSE41-NEXT:    por %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
@@ -1165,6 +1268,17 @@ define <16 x i8> @constant_rotate_v16i8(<16 x i8> %a) nounwind {
 ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX512-LABEL: constant_rotate_v16i8:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %zmm0, %zmm1
+; AVX512-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512-NEXT:    vpsrlvd {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    vzeroupper
+; AVX512-NEXT:    retq
+;
 ; XOP-LABEL: constant_rotate_v16i8:
 ; XOP:       # BB#0:
 ; XOP-NEXT:    vpshlb {{.*}}(%rip), %xmm0, %xmm1
@@ -1257,6 +1371,13 @@ define <2 x i64> @splatconstant_rotate_v2i64(<2 x i64> %a) nounwind {
 ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX512-LABEL: splatconstant_rotate_v2i64:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpsllq $14, %xmm0, %xmm1
+; AVX512-NEXT:    vpsrlq $50, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
 ; XOP-LABEL: splatconstant_rotate_v2i64:
 ; XOP:       # BB#0:
 ; XOP-NEXT:    vprotq $14, %xmm0, %xmm0
@@ -1291,6 +1412,13 @@ define <4 x i32> @splatconstant_rotate_v4i32(<4 x i32> %a) nounwind {
 ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX512-LABEL: splatconstant_rotate_v4i32:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpslld $4, %xmm0, %xmm1
+; AVX512-NEXT:    vpsrld $28, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
 ; XOP-LABEL: splatconstant_rotate_v4i32:
 ; XOP:       # BB#0:
 ; XOP-NEXT:    vprotd $4, %xmm0, %xmm0
@@ -1325,6 +1453,13 @@ define <8 x i16> @splatconstant_rotate_v8i16(<8 x i16> %a) nounwind {
 ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX512-LABEL: splatconstant_rotate_v8i16:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpsllw $7, %xmm0, %xmm1
+; AVX512-NEXT:    vpsrlw $9, %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
 ; XOP-LABEL: splatconstant_rotate_v8i16:
 ; XOP:       # BB#0:
 ; XOP-NEXT:    vprotw $7, %xmm0, %xmm0
@@ -1363,6 +1498,15 @@ define <16 x i8> @splatconstant_rotate_v16i8(<16 x i8> %a) nounwind {
 ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX512-LABEL: splatconstant_rotate_v16i8:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm1
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
 ; XOP-LABEL: splatconstant_rotate_v16i8:
 ; XOP:       # BB#0:
 ; XOP-NEXT:    vprotb $4, %xmm0, %xmm0
@@ -1408,6 +1552,15 @@ define <2 x i64> @splatconstant_rotate_mask_v2i64(<2 x i64> %a) nounwind {
 ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX512-LABEL: splatconstant_rotate_mask_v2i64:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpsllq $15, %xmm0, %xmm1
+; AVX512-NEXT:    vpsrlq $49, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
 ; XOP-LABEL: splatconstant_rotate_mask_v2i64:
 ; XOP:       # BB#0:
 ; XOP-NEXT:    vprotq $15, %xmm0, %xmm0
@@ -1453,6 +1606,15 @@ define <4 x i32> @splatconstant_rotate_mask_v4i32(<4 x i32> %a) nounwind {
 ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX512-LABEL: splatconstant_rotate_mask_v4i32:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpslld $4, %xmm0, %xmm1
+; AVX512-NEXT:    vpsrld $28, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
 ; XOP-LABEL: splatconstant_rotate_mask_v4i32:
 ; XOP:       # BB#0:
 ; XOP-NEXT:    vprotd $4, %xmm0, %xmm0
@@ -1498,6 +1660,15 @@ define <8 x i16> @splatconstant_rotate_mask_v8i16(<8 x i16> %a) nounwind {
 ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX512-LABEL: splatconstant_rotate_mask_v8i16:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpsllw $5, %xmm0, %xmm1
+; AVX512-NEXT:    vpsrlw $11, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
 ; XOP-LABEL: splatconstant_rotate_mask_v8i16:
 ; XOP:       # BB#0:
 ; XOP-NEXT:    vprotw $5, %xmm0, %xmm0
@@ -1547,6 +1718,17 @@ define <16 x i8> @splatconstant_rotate_mask_v16i8(<16 x i8> %a) nounwind {
 ; AVX-NEXT:    vpor %xmm0, %xmm1, %xmm0
 ; AVX-NEXT:    retq
 ;
+; AVX512-LABEL: splatconstant_rotate_mask_v16i8:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpsllw $4, %xmm0, %xmm1
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpsrlw $4, %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm1, %xmm1
+; AVX512-NEXT:    vpor %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+;
 ; XOP-LABEL: splatconstant_rotate_mask_v16i8:
 ; XOP:       # BB#0:
 ; XOP-NEXT:    vprotb $4, %xmm0, %xmm0
diff --git a/test/CodeGen/X86/vector-rotate-256.ll b/test/CodeGen/X86/vector-rotate-256.ll
index 2e357660ee2c08e30e085bb8d14c25bc81a749ce..3306cd400c1d0ac66cc3b04e6e5740f959570792 100644
--- a/test/CodeGen/X86/vector-rotate-256.ll
+++ b/test/CodeGen/X86/vector-rotate-256.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VL
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=XOP --check-prefix=XOPAVX2
 
@@ -46,6 +48,15 @@ define <4 x i64> @var_rotate_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512-LABEL: var_rotate_v4i64:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpbroadcastq {{.*}}(%rip), %ymm2
+; AVX512-NEXT:    vpsubq %ymm1, %ymm2, %ymm2
+; AVX512-NEXT:    vpsllvq %ymm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpsrlvq %ymm2, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+;
 ; XOPAVX1-LABEL: var_rotate_v4i64:
 ; XOPAVX1:       # BB#0:
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
@@ -124,6 +135,15 @@ define <8 x i32> @var_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind {
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512-LABEL: var_rotate_v8i32:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpbroadcastd {{.*}}(%rip), %ymm2
+; AVX512-NEXT:    vpsubd %ymm1, %ymm2, %ymm2
+; AVX512-NEXT:    vpsllvd %ymm1, %ymm0, %ymm1
+; AVX512-NEXT:    vpsrlvd %ymm2, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+;
 ; XOPAVX1-LABEL: var_rotate_v8i32:
 ; XOPAVX1:       # BB#0:
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
@@ -241,6 +261,26 @@ define <16 x i16> @var_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind {
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: var_rotate_v16i16:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512BW-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: var_rotate_v16i16:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vmovdqu {{.*#+}} ymm2 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
+; AVX512VL-NEXT:    vpsubw %ymm1, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpsllvw %ymm1, %ymm0, %ymm1
+; AVX512VL-NEXT:    vpsrlvw %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
 ; XOPAVX1-LABEL: var_rotate_v16i16:
 ; XOPAVX1:       # BB#0:
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
@@ -359,6 +399,34 @@ define <32 x i8> @var_rotate_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind {
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: var_rotate_v32i8:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512BW-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512BW-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: var_rotate_v32i8:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vmovdqu {{.*#+}} ymm2 = [8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8]
+; AVX512VL-NEXT:    vpsubb %ymm1, %ymm2, %ymm2
+; AVX512VL-NEXT:    vpmovzxbw {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero,ymm1[8],zero,ymm1[9],zero,ymm1[10],zero,ymm1[11],zero,ymm1[12],zero,ymm1[13],zero,ymm1[14],zero,ymm1[15],zero,ymm1[16],zero,ymm1[17],zero,ymm1[18],zero,ymm1[19],zero,ymm1[20],zero,ymm1[21],zero,ymm1[22],zero,ymm1[23],zero,ymm1[24],zero,ymm1[25],zero,ymm1[26],zero,ymm1[27],zero,ymm1[28],zero,ymm1[29],zero,ymm1[30],zero,ymm1[31],zero
+; AVX512VL-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512VL-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512VL-NEXT:    vpmovzxbw {{.*#+}} zmm2 = ymm2[0],zero,ymm2[1],zero,ymm2[2],zero,ymm2[3],zero,ymm2[4],zero,ymm2[5],zero,ymm2[6],zero,ymm2[7],zero,ymm2[8],zero,ymm2[9],zero,ymm2[10],zero,ymm2[11],zero,ymm2[12],zero,ymm2[13],zero,ymm2[14],zero,ymm2[15],zero,ymm2[16],zero,ymm2[17],zero,ymm2[18],zero,ymm2[19],zero,ymm2[20],zero,ymm2[21],zero,ymm2[22],zero,ymm2[23],zero,ymm2[24],zero,ymm2[25],zero,ymm2[26],zero,ymm2[27],zero,ymm2[28],zero,ymm2[29],zero,ymm2[30],zero,ymm2[31],zero
+; AVX512VL-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512VL-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
 ; XOPAVX1-LABEL: var_rotate_v32i8:
 ; XOPAVX1:       # BB#0:
 ; XOPAVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
@@ -415,6 +483,13 @@ define <4 x i64> @constant_rotate_v4i64(<4 x i64> %a) nounwind {
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512-LABEL: constant_rotate_v4i64:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpsllvq {{.*}}(%rip), %ymm0, %ymm1
+; AVX512-NEXT:    vpsrlvq {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+;
 ; XOPAVX1-LABEL: constant_rotate_v4i64:
 ; XOPAVX1:       # BB#0:
 ; XOPAVX1-NEXT:    vpshlq {{.*}}(%rip), %xmm0, %xmm1
@@ -474,6 +549,13 @@ define <8 x i32> @constant_rotate_v8i32(<8 x i32> %a) nounwind {
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512-LABEL: constant_rotate_v8i32:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpsllvd {{.*}}(%rip), %ymm0, %ymm1
+; AVX512-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+;
 ; XOPAVX1-LABEL: constant_rotate_v8i32:
 ; XOPAVX1:       # BB#0:
 ; XOPAVX1-NEXT:    vpshld {{.*}}(%rip), %xmm0, %xmm1
@@ -542,6 +624,23 @@ define <16 x i16> @constant_rotate_v16i16(<16 x i16> %a) nounwind {
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512BW-LABEL: constant_rotate_v16i16:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15]
+; AVX512BW-NEXT:    vpsllvw %zmm1, %zmm0, %zmm1
+; AVX512BW-NEXT:    vmovdqa {{.*#+}} ymm2 = [16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1]
+; AVX512BW-NEXT:    vpsrlvw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512VL-LABEL: constant_rotate_v16i16:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vpsllvw {{.*}}(%rip), %ymm0, %ymm1
+; AVX512VL-NEXT:    vpsrlvw {{.*}}(%rip), %ymm0, %ymm0
+; AVX512VL-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+;
 ; XOPAVX1-LABEL: constant_rotate_v16i16:
 ; XOPAVX1:       # BB#0:
 ; XOPAVX1-NEXT:    vpshlw {{.*}}(%rip), %xmm0, %xmm1
@@ -657,6 +756,16 @@ define <32 x i8> @constant_rotate_v32i8(<32 x i8> %a) nounwind {
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512-LABEL: constant_rotate_v32i8:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpmovzxbw {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero,ymm0[16],zero,ymm0[17],zero,ymm0[18],zero,ymm0[19],zero,ymm0[20],zero,ymm0[21],zero,ymm0[22],zero,ymm0[23],zero,ymm0[24],zero,ymm0[25],zero,ymm0[26],zero,ymm0[27],zero,ymm0[28],zero,ymm0[29],zero,ymm0[30],zero,ymm0[31],zero
+; AVX512-NEXT:    vpsllvw {{.*}}(%rip), %zmm0, %zmm1
+; AVX512-NEXT:    vpmovwb %zmm1, %ymm1
+; AVX512-NEXT:    vpsrlvw {{.*}}(%rip), %zmm0, %zmm0
+; AVX512-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+;
 ; XOPAVX1-LABEL: constant_rotate_v32i8:
 ; XOPAVX1:       # BB#0:
 ; XOPAVX1-NEXT:    vmovdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7,8,7,6,5,4,3,2,1]
@@ -716,6 +825,13 @@ define <4 x i64> @splatconstant_rotate_v4i64(<4 x i64> %a) nounwind {
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512-LABEL: splatconstant_rotate_v4i64:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpsllq $14, %ymm0, %ymm1
+; AVX512-NEXT:    vpsrlq $50, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+;
 ; XOPAVX1-LABEL: splatconstant_rotate_v4i64:
 ; XOPAVX1:       # BB#0:
 ; XOPAVX1-NEXT:    vprotq $14, %xmm0, %xmm1
@@ -757,6 +873,13 @@ define <8 x i32> @splatconstant_rotate_v8i32(<8 x i32> %a) nounwind {
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512-LABEL: splatconstant_rotate_v8i32:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpslld $4, %ymm0, %ymm1
+; AVX512-NEXT:    vpsrld $28, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+;
 ; XOPAVX1-LABEL: splatconstant_rotate_v8i32:
 ; XOPAVX1:       # BB#0:
 ; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm1
@@ -798,6 +921,13 @@ define <16 x i16> @splatconstant_rotate_v16i16(<16 x i16> %a) nounwind {
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512-LABEL: splatconstant_rotate_v16i16:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpsllw $7, %ymm0, %ymm1
+; AVX512-NEXT:    vpsrlw $9, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+;
 ; XOPAVX1-LABEL: splatconstant_rotate_v16i16:
 ; XOPAVX1:       # BB#0:
 ; XOPAVX1-NEXT:    vprotw $7, %xmm0, %xmm1
@@ -847,6 +977,15 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind {
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512-LABEL: splatconstant_rotate_v32i8:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpsllw $4, %ymm0, %ymm1
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+;
 ; XOPAVX1-LABEL: splatconstant_rotate_v32i8:
 ; XOPAVX1:       # BB#0:
 ; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm1
@@ -896,6 +1035,15 @@ define <4 x i64> @splatconstant_rotate_mask_v4i64(<4 x i64> %a) nounwind {
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512-LABEL: splatconstant_rotate_mask_v4i64:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpsllq $15, %ymm0, %ymm1
+; AVX512-NEXT:    vpsrlq $49, %ymm0, %ymm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+;
 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v4i64:
 ; XOPAVX1:       # BB#0:
 ; XOPAVX1-NEXT:    vprotq $15, %xmm0, %xmm1
@@ -945,6 +1093,15 @@ define <8 x i32> @splatconstant_rotate_mask_v8i32(<8 x i32> %a) nounwind {
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512-LABEL: splatconstant_rotate_mask_v8i32:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpslld $4, %ymm0, %ymm1
+; AVX512-NEXT:    vpsrld $28, %ymm0, %ymm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+;
 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v8i32:
 ; XOPAVX1:       # BB#0:
 ; XOPAVX1-NEXT:    vprotd $4, %xmm0, %xmm1
@@ -994,6 +1151,15 @@ define <16 x i16> @splatconstant_rotate_mask_v16i16(<16 x i16> %a) nounwind {
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512-LABEL: splatconstant_rotate_mask_v16i16:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpsllw $5, %ymm0, %ymm1
+; AVX512-NEXT:    vpsrlw $11, %ymm0, %ymm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+;
 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v16i16:
 ; XOPAVX1:       # BB#0:
 ; XOPAVX1-NEXT:    vprotw $5, %xmm0, %xmm1
@@ -1051,6 +1217,17 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind {
 ; AVX2-NEXT:    vpor %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
 ;
+; AVX512-LABEL: splatconstant_rotate_mask_v32i8:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpsllw $4, %ymm0, %ymm1
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT:    vpsrlw $4, %ymm0, %ymm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX512-NEXT:    vpand {{.*}}(%rip), %ymm1, %ymm1
+; AVX512-NEXT:    vpor %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+;
 ; XOPAVX1-LABEL: splatconstant_rotate_mask_v32i8:
 ; XOPAVX1:       # BB#0:
 ; XOPAVX1-NEXT:    vprotb $4, %xmm0, %xmm1
diff --git a/test/CodeGen/X86/vector-sext.ll b/test/CodeGen/X86/vector-sext.ll
index 774d615ae8962ffbbe5a5468ed5754880e40f946..e9f1d1d8522b33933a0406ae8eb983a21e369e57 100644
--- a/test/CodeGen/X86/vector-sext.ll
+++ b/test/CodeGen/X86/vector-sext.ll
@@ -1246,6 +1246,7 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
 ; AVX512F-NEXT:    kmovw %eax, %k1
 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_sext_2i1_to_2i64:
@@ -1254,6 +1255,7 @@ define <2 x i64> @load_sext_2i1_to_2i64(<2 x i1> *%ptr) {
 ; AVX512BW-NEXT:    kmovd %eax, %k1
 ; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; X32-SSE41-LABEL: load_sext_2i1_to_2i64:
@@ -1436,6 +1438,7 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_sext_4i1_to_4i32:
@@ -1445,6 +1448,7 @@ define <4 x i32> @load_sext_4i1_to_4i32(<4 x i1> *%ptr) {
 ; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; X32-SSE41-LABEL: load_sext_4i1_to_4i32:
@@ -1941,14 +1945,16 @@ define <8 x i16> @load_sext_8i1_to_8i16(<8 x i1> *%ptr) {
 ; AVX512F-NEXT:    kmovw %eax, %k1
 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: load_sext_8i1_to_8i16:
 ; AVX512BW:       # BB#0: # %entry
 ; AVX512BW-NEXT:    movzbl (%rdi), %eax
-; AVX512BW-NEXT:    kmovd %eax, %k1
-; AVX512BW-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512BW-NEXT:    kmovd %eax, %k0
+; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
+; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; X32-SSE41-LABEL: load_sext_8i1_to_8i16:
@@ -2847,12 +2853,21 @@ define <16 x i8> @load_sext_16i1_to_16i8(<16 x i1> *%ptr) nounwind readnone {
 ; AVX2-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: load_sext_16i1_to_16i8:
-; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    kmovw (%rdi), %k1
-; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: load_sext_16i1_to_16i8:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    kmovw (%rdi), %k1
+; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: load_sext_16i1_to_16i8:
+; AVX512BW:       # BB#0: # %entry
+; AVX512BW-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
+; AVX512BW-NEXT:    retq
 ;
 ; X32-SSE41-LABEL: load_sext_16i1_to_16i8:
 ; X32-SSE41:       # BB#0: # %entry
@@ -3384,12 +3399,19 @@ define <16 x i16> @load_sext_16i1_to_16i16(<16 x i1> *%ptr) {
 ; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: load_sext_16i1_to_16i16:
-; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    kmovw (%rdi), %k1
-; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: load_sext_16i1_to_16i16:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    kmovw (%rdi), %k1
+; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: load_sext_16i1_to_16i16:
+; AVX512BW:       # BB#0: # %entry
+; AVX512BW-NEXT:    kmovw (%rdi), %k0
+; AVX512BW-NEXT:    vpmovm2w %k0, %zmm0
+; AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT:    retq
 ;
 ; X32-SSE41-LABEL: load_sext_16i1_to_16i16:
 ; X32-SSE41:       # BB#0: # %entry
@@ -4228,16 +4250,23 @@ define <32 x i8> @load_sext_32i1_to_32i8(<32 x i1> *%ptr) nounwind readnone {
 ; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: load_sext_32i1_to_32i8:
-; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    kmovw (%rdi), %k1
-; AVX512-NEXT:    kmovw 2(%rdi), %k2
-; AVX512-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
-; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
-; AVX512-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
-; AVX512-NEXT:    vpmovdb %zmm1, %xmm1
-; AVX512-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: load_sext_32i1_to_32i8:
+; AVX512F:       # BB#0: # %entry
+; AVX512F-NEXT:    kmovw (%rdi), %k1
+; AVX512F-NEXT:    kmovw 2(%rdi), %k2
+; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: load_sext_32i1_to_32i8:
+; AVX512BW:       # BB#0: # %entry
+; AVX512BW-NEXT:    kmovd (%rdi), %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512BW-NEXT:    retq
 ;
 ; X32-SSE41-LABEL: load_sext_32i1_to_32i8:
 ; X32-SSE41:       # BB#0: # %entry
@@ -4435,7 +4464,7 @@ define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) {
 ; SSE2-LABEL: load_sext_2i16_to_2i64:
 ; SSE2:       # BB#0: # %entry
 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; SSE2-NEXT:    movdqa %xmm0, %xmm1
 ; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    psrad $16, %xmm0
@@ -4445,7 +4474,7 @@ define <2 x i64> @load_sext_2i16_to_2i64(<2 x i16> *%ptr) {
 ; SSSE3-LABEL: load_sext_2i16_to_2i64:
 ; SSSE3:       # BB#0: # %entry
 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
+; SSSE3-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,0,2,1,4,5,6,7]
 ; SSSE3-NEXT:    movdqa %xmm0, %xmm1
 ; SSSE3-NEXT:    psrad $31, %xmm1
 ; SSSE3-NEXT:    psrad $16, %xmm0
@@ -4968,10 +4997,9 @@ define <32 x i8> @sext_32xi1_to_32xi8(<32 x i16> %c1, <32 x i16> %c2)nounwind {
 ;
 ; AVX512BW-LABEL: sext_32xi1_to_32xi8:
 ; AVX512BW:       # BB#0:
-; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k1
-; AVX512BW-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0
-; AVX512BW-NEXT:    vmovdqu16 %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
+; AVX512BW-NEXT:    vpcmpeqw %zmm1, %zmm0, %k0
+; AVX512BW-NEXT:    vpmovm2b %k0, %zmm0
+; AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
 ; AVX512BW-NEXT:    retq
 ;
 ; X32-SSE41-LABEL: sext_32xi1_to_32xi8:
diff --git a/test/CodeGen/X86/vector-shift-ashr-128.ll b/test/CodeGen/X86/vector-shift-ashr-128.ll
index acad121697803610c6fb7c85c5b41b1dcbbeee13..a5e2cb66eba87ac6f89b08347712fe55fdf798c5 100644
--- a/test/CodeGen/X86/vector-shift-ashr-128.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-128.ll
@@ -278,21 +278,21 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE41-NEXT:    movdqa %xmm2, %xmm4
 ; SSE41-NEXT:    psraw $8, %xmm4
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pblendvb %xmm4, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm1
 ; SSE41-NEXT:    psraw $4, %xmm1
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm1, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm1
 ; SSE41-NEXT:    psraw $2, %xmm1
 ; SSE41-NEXT:    paddw %xmm3, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm1, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm1
 ; SSE41-NEXT:    psraw $1, %xmm1
 ; SSE41-NEXT:    paddw %xmm3, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm1, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -319,7 +319,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; AVX2-NEXT:    vpsravd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -467,29 +467,29 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
 ; SSE41-NEXT:    movdqa %xmm3, %xmm4
 ; SSE41-NEXT:    psraw $4, %xmm4
-; SSE41-NEXT:    pblendvb %xmm4, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm4
 ; SSE41-NEXT:    psraw $2, %xmm4
 ; SSE41-NEXT:    paddw %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm4, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm4
 ; SSE41-NEXT:    psraw $1, %xmm4
 ; SSE41-NEXT:    paddw %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm4, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
 ; SSE41-NEXT:    psrlw $8, %xmm3
 ; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
 ; SSE41-NEXT:    psraw $4, %xmm2
-; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
 ; SSE41-NEXT:    psraw $2, %xmm2
 ; SSE41-NEXT:    paddw %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
 ; SSE41-NEXT:    psraw $1, %xmm2
 ; SSE41-NEXT:    paddw %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    psrlw $8, %xmm1
 ; SSE41-NEXT:    packuswb %xmm3, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
@@ -649,8 +649,7 @@ define <2 x i64> @splatvar_shift_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind {
 ; AVX512-LABEL: splatvar_shift_v2i64:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
-; AVX512-NEXT:    vpbroadcastq %xmm1, %xmm1
-; AVX512-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
 ; AVX512-NEXT:    retq
 ;
@@ -842,29 +841,29 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15]
 ; SSE41-NEXT:    movdqa %xmm3, %xmm4
 ; SSE41-NEXT:    psraw $4, %xmm4
-; SSE41-NEXT:    pblendvb %xmm4, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm4
 ; SSE41-NEXT:    psraw $2, %xmm4
 ; SSE41-NEXT:    paddw %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm4, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm4
 ; SSE41-NEXT:    psraw $1, %xmm4
 ; SSE41-NEXT:    paddw %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm4, %xmm3
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm3
 ; SSE41-NEXT:    psrlw $8, %xmm3
 ; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
 ; SSE41-NEXT:    psraw $4, %xmm2
-; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
 ; SSE41-NEXT:    psraw $2, %xmm2
 ; SSE41-NEXT:    paddw %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
 ; SSE41-NEXT:    psraw $1, %xmm2
 ; SSE41-NEXT:    paddw %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    psrlw $8, %xmm1
 ; SSE41-NEXT:    packuswb %xmm3, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
@@ -1244,7 +1243,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpmovsxwd %xmm0, %ymm0
 ; AVX2-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -1372,29 +1371,29 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
 ; SSE41-NEXT:    punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
 ; SSE41-NEXT:    movdqa %xmm2, %xmm4
 ; SSE41-NEXT:    psraw $4, %xmm4
-; SSE41-NEXT:    pblendvb %xmm4, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm4
 ; SSE41-NEXT:    psraw $2, %xmm4
 ; SSE41-NEXT:    paddw %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm4, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm4
 ; SSE41-NEXT:    psraw $1, %xmm4
 ; SSE41-NEXT:    paddw %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm4, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    psrlw $8, %xmm2
 ; SSE41-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
 ; SSE41-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE41-NEXT:    movdqa %xmm1, %xmm3
 ; SSE41-NEXT:    psraw $4, %xmm3
-; SSE41-NEXT:    pblendvb %xmm3, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm3
 ; SSE41-NEXT:    psraw $2, %xmm3
 ; SSE41-NEXT:    paddw %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm3, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm3
 ; SSE41-NEXT:    psraw $1, %xmm3
 ; SSE41-NEXT:    paddw %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm3, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm1
 ; SSE41-NEXT:    psrlw $8, %xmm1
 ; SSE41-NEXT:    packuswb %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
@@ -1556,9 +1555,9 @@ define <2 x i64> @splatconstant_shift_v2i64(<2 x i64> %a) nounwind {
 ;
 ; AVX512-LABEL: splatconstant_shift_v2i64:
 ; AVX512:       # BB#0:
-; AVX512-NEXT:    vpsrad $7, %xmm0, %xmm1
-; AVX512-NEXT:    vpsrlq $7, %xmm0, %xmm0
-; AVX512-NEXT:    vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<def>
+; AVX512-NEXT:    vpsraq $7, %zmm0, %zmm0
+; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %ZMM0<kill>
 ; AVX512-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatconstant_shift_v2i64:
diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll
index c09e6b2bc8d59d929db273acffe5b1fc1722d051..af3ddcf8048e837116cf86934739ad6155fc910d 100644
--- a/test/CodeGen/X86/vector-shift-ashr-256.ll
+++ b/test/CodeGen/X86/vector-shift-ashr-256.ll
@@ -491,8 +491,7 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind {
 ; AVX512-LABEL: splatvar_shift_v4i64:
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
-; AVX512-NEXT:    vpbroadcastq %xmm1, %ymm1
-; AVX512-NEXT:    vpsravq %zmm1, %zmm0, %zmm0
+; AVX512-NEXT:    vpsraq %xmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
 ; AVX512-NEXT:    retq
 ;
@@ -1198,9 +1197,9 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind {
 ;
 ; AVX512-LABEL: splatconstant_shift_v4i64:
 ; AVX512:       # BB#0:
-; AVX512-NEXT:    vpsrad $7, %ymm0, %ymm1
-; AVX512-NEXT:    vpsrlq $7, %ymm0, %ymm0
-; AVX512-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512-NEXT:    vpsraq $7, %zmm0, %zmm0
+; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
 ; AVX512-NEXT:    retq
 ;
 ; AVX512VL-LABEL: splatconstant_shift_v4i64:
diff --git a/test/CodeGen/X86/vector-shift-lshr-128.ll b/test/CodeGen/X86/vector-shift-lshr-128.ll
index 23c8f8997f7a5ef9ca89260c00030903cf45ce69..9b44ad1dac300fca157c285212436d7be326056a 100644
--- a/test/CodeGen/X86/vector-shift-lshr-128.ll
+++ b/test/CodeGen/X86/vector-shift-lshr-128.ll
@@ -249,21 +249,21 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE41-NEXT:    movdqa %xmm2, %xmm4
 ; SSE41-NEXT:    psrlw $8, %xmm4
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pblendvb %xmm4, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm1
 ; SSE41-NEXT:    psrlw $4, %xmm1
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm1, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm1
 ; SSE41-NEXT:    psrlw $2, %xmm1
 ; SSE41-NEXT:    paddw %xmm3, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm1, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm1
 ; SSE41-NEXT:    psrlw $1, %xmm1
 ; SSE41-NEXT:    paddw %xmm3, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm1, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -290,7 +290,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT:    vpsrlvd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -411,19 +411,19 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE41-NEXT:    psrlw $4, %xmm3
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pblendvb %xmm3, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm3
 ; SSE41-NEXT:    psrlw $2, %xmm3
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
 ; SSE41-NEXT:    paddb %xmm1, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pblendvb %xmm3, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm3
 ; SSE41-NEXT:    psrlw $1, %xmm3
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
 ; SSE41-NEXT:    paddb %xmm1, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pblendvb %xmm3, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -684,18 +684,18 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE41-NEXT:    psrlw $4, %xmm4
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm4
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pblendvb %xmm4, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm1
 ; SSE41-NEXT:    psrlw $2, %xmm1
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm1, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm1
 ; SSE41-NEXT:    psrlw $1, %xmm1
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE41-NEXT:    paddb %xmm3, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm1, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -1005,7 +1005,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind {
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT:    vpsrlvd {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -1105,17 +1105,17 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
 ; SSE41-NEXT:    psrlw $4, %xmm2
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [8192,24640,41088,57536,49376,32928,16480,32]
-; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
 ; SSE41-NEXT:    psrlw $2, %xmm2
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE41-NEXT:    paddb %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
 ; SSE41-NEXT:    psrlw $1, %xmm2
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE41-NEXT:    paddb %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/vector-shift-shl-128.ll b/test/CodeGen/X86/vector-shift-shl-128.ll
index d6088906ef8438d4a7cb1325ade835ece369c891..568bf6e974f7348b5335be45ad78565c7248b489 100644
--- a/test/CodeGen/X86/vector-shift-shl-128.ll
+++ b/test/CodeGen/X86/vector-shift-shl-128.ll
@@ -206,21 +206,21 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; SSE41-NEXT:    movdqa %xmm2, %xmm4
 ; SSE41-NEXT:    psllw $8, %xmm4
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pblendvb %xmm4, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm1
 ; SSE41-NEXT:    psllw $4, %xmm1
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm1, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm1
 ; SSE41-NEXT:    psllw $2, %xmm1
 ; SSE41-NEXT:    paddw %xmm3, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm1, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm1
 ; SSE41-NEXT:    psllw $1, %xmm1
 ; SSE41-NEXT:    paddw %xmm3, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm1, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -247,7 +247,7 @@ define <8 x i16> @var_shift_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind {
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
 ; AVX2-NEXT:    vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX2-NEXT:    vpsllvd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -365,18 +365,18 @@ define <16 x i8> @var_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE41-NEXT:    psllw $4, %xmm3
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pblendvb %xmm3, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm3
 ; SSE41-NEXT:    psllw $2, %xmm3
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm3
 ; SSE41-NEXT:    paddb %xmm1, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pblendvb %xmm3, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm3
 ; SSE41-NEXT:    paddb %xmm3, %xmm3
 ; SSE41-NEXT:    paddb %xmm1, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pblendvb %xmm3, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm3, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -632,17 +632,17 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind {
 ; SSE41-NEXT:    psllw $4, %xmm4
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm4
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
-; SSE41-NEXT:    pblendvb %xmm4, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm4, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm1
 ; SSE41-NEXT:    psllw $2, %xmm1
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm1, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm1
 ; SSE41-NEXT:    paddb %xmm1, %xmm1
 ; SSE41-NEXT:    paddb %xmm3, %xmm3
 ; SSE41-NEXT:    movdqa %xmm3, %xmm0
-; SSE41-NEXT:    pblendvb %xmm1, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -961,16 +961,16 @@ define <16 x i8> @constant_shift_v16i8(<16 x i8> %a) nounwind {
 ; SSE41-NEXT:    psllw $4, %xmm2
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE41-NEXT:    movdqa {{.*#+}} xmm0 = [8192,24640,41088,57536,49376,32928,16480,32]
-; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
 ; SSE41-NEXT:    psllw $2, %xmm2
 ; SSE41-NEXT:    pand {{.*}}(%rip), %xmm2
 ; SSE41-NEXT:    paddb %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm2
 ; SSE41-NEXT:    paddb %xmm2, %xmm2
 ; SSE41-NEXT:    paddb %xmm0, %xmm0
-; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/vector-shuffle-128-v16.ll b/test/CodeGen/X86/vector-shuffle-128-v16.ll
index 3e19da2cd071ad53c17bc814ece3adf162ffacbd..9f4501c1f225cb8f3ebe17adc82b4ac065515b69 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v16.ll
@@ -238,14 +238,12 @@ define <16 x i8> @shuffle_v16i8_08_24_09_25_10_26_11_27_12_28_13_29_14_30_15_31(
 define <16 x i8> @shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07(<16 x i8> %a, <16 x i8> %b) {
 ; SSE2-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
 ; SSE2:       # BB#0:
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; SSE2-NEXT:    movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
-; SSE2-NEXT:    pand %xmm2, %xmm1
-; SSE2-NEXT:    pandn %xmm0, %xmm2
-; SSE2-NEXT:    por %xmm1, %xmm2
-; SSE2-NEXT:    movdqa %xmm2, %xmm0
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
+; SSE2-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: shuffle_v16i8_16_00_16_01_16_02_16_03_16_04_16_05_16_06_16_07:
@@ -411,7 +409,7 @@ define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0]
-; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -424,7 +422,7 @@ define <16 x i8> @shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31(
 ; AVX512VL-LABEL: shuffle_v16i8_00_17_02_19_04_21_06_23_08_25_10_27_12_29_14_31:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; AVX512VL-NEXT:    kmovw %eax, %k1
+; AVX512VL-NEXT:    kmovd %eax, %k1
 ; AVX512VL-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
 ; AVX512VL-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
@@ -451,7 +449,7 @@ define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,255,0,255,255,255,0,255,255,255,0,255,255,255,0]
-; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -464,7 +462,7 @@ define <16 x i8> @shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31(
 ; AVX512VL-LABEL: shuffle_v16i8_00_01_02_19_04_05_06_23_08_09_10_27_12_13_14_31:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    movw $-30584, %ax # imm = 0x8888
-; AVX512VL-NEXT:    kmovw %eax, %k1
+; AVX512VL-NEXT:    kmovd %eax, %k1
 ; AVX512VL-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
 ; AVX512VL-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 4, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 27, i32 12, i32 13, i32 14, i32 31>
@@ -510,7 +508,7 @@ define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,255,255,0,255,255,0,255,255,255,255,0,255,255,0]
-; SSE41-NEXT:    pblendvb %xmm2, %xmm1
+; SSE41-NEXT:    pblendvb %xmm0, %xmm2, %xmm1
 ; SSE41-NEXT:    movdqa %xmm1, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -523,7 +521,7 @@ define <16 x i8> @shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31(
 ; AVX512VL-LABEL: shuffle_v16i8_00_01_02_03_20_05_06_23_08_09_10_11_28_13_14_31:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    movw $-28528, %ax # imm = 0x9090
-; AVX512VL-NEXT:    kmovw %eax, %k1
+; AVX512VL-NEXT:    kmovd %eax, %k1
 ; AVX512VL-NEXT:    vmovdqu8 %xmm1, %xmm0 {%k1}
 ; AVX512VL-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 20, i32 5, i32 6, i32 23, i32 8, i32 9, i32 10, i32 11, i32 28, i32 13, i32 14, i32 31>
@@ -551,7 +549,7 @@ define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    movdqa %xmm0, %xmm2
 ; SSE41-NEXT:    movaps {{.*#+}} xmm0 = [255,255,255,255,0,0,0,0,255,255,0,0,255,0,255,0]
-; SSE41-NEXT:    pblendvb %xmm1, %xmm2
+; SSE41-NEXT:    pblendvb %xmm0, %xmm1, %xmm2
 ; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
@@ -564,7 +562,7 @@ define <16 x i8> @shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15(
 ; AVX512VL-LABEL: shuffle_v16i8_16_17_18_19_04_05_06_07_24_25_10_11_28_13_30_15:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    movw $-21264, %ax # imm = 0xACF0
-; AVX512VL-NEXT:    kmovw %eax, %k1
+; AVX512VL-NEXT:    kmovd %eax, %k1
 ; AVX512VL-NEXT:    vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
 ; AVX512VL-NEXT:    retq
   %shuffle = shufflevector <16 x i8> %a, <16 x i8> %b, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 4, i32 5, i32 6, i32 7, i32 24, i32 25, i32 10, i32 11, i32 28, i32 13, i32 30, i32 15>
@@ -1718,17 +1716,17 @@ define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b)
 ; SSE2-NEXT:    movzbl (%rsi), %ecx
 ; SSE2-NEXT:    shll $8, %ecx
 ; SSE2-NEXT:    orl %eax, %ecx
-; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    movzwl %cx, %eax
+; SSE2-NEXT:    movd %eax, %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
-; SSE2-NEXT:    pinsrw $0, %ecx, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,3]
-; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
-; SSE2-NEXT:    pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,5,4,4,4]
-; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[0,1,0,1]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[0,1,0,3]
+; SSE2-NEXT:    pshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7]
+; SSE2-NEXT:    pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,4,4,4]
+; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
 ; SSE2-NEXT:    pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,7]
-; SSE2-NEXT:    packuswb %xmm2, %xmm0
+; SSE2-NEXT:    packuswb %xmm1, %xmm0
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: PR31364:
@@ -1737,8 +1735,8 @@ define <16 x i8> @PR31364(i8* nocapture readonly %a, i8* nocapture readonly %b)
 ; SSSE3-NEXT:    movzbl (%rsi), %ecx
 ; SSSE3-NEXT:    shll $8, %ecx
 ; SSSE3-NEXT:    orl %eax, %ecx
-; SSSE3-NEXT:    pxor %xmm0, %xmm0
-; SSSE3-NEXT:    pinsrw $0, %ecx, %xmm0
+; SSSE3-NEXT:    movzwl %cx, %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
 ; SSSE3-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1,1,1,1,1,1,1],zero,xmm0[1,1,1,1,1,0,0,0]
 ; SSSE3-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/vector-shuffle-128-v2.ll b/test/CodeGen/X86/vector-shuffle-128-v2.ll
index df854ef5dbfc2a7bfa3b3b5d0c5de3c7cbd1b3f1..d0ead653b203d96a0114131bb5870a9893f4e7b2 100644
--- a/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ b/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -910,25 +910,25 @@ define <2 x double> @shuffle_v2f64_bitcast_1z(<2 x double> %a) {
 ; SSE-LABEL: shuffle_v2f64_bitcast_1z:
 ; SSE:       # BB#0:
 ; SSE-NEXT:    xorpd %xmm1, %xmm1
-; SSE-NEXT:    shufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; SSE-NEXT:    unpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: shuffle_v2f64_bitcast_1z:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; AVX1-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v2f64_bitcast_1z:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; AVX2-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v2f64_bitcast_1z:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vshufpd {{.*#+}} xmm0 = xmm0[1],xmm1[0]
+; AVX512VL-NEXT:    vunpckhpd {{.*#+}} xmm0 = xmm0[1],xmm1[1]
 ; AVX512VL-NEXT:    retq
   %shuffle64 = shufflevector <2 x double> %a, <2 x double> zeroinitializer, <2 x i32> <i32 2, i32 1>
   %bitcast32 = bitcast <2 x double> %shuffle64 to <4 x float>
diff --git a/test/CodeGen/X86/vector-shuffle-256-v16.ll b/test/CodeGen/X86/vector-shuffle-256-v16.ll
index 682a34d3cdb772aadbdb792a365f4f31b57eb61d..fad5586dd77cd32fbb63f7b855298c041ede7786 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v16.ll
@@ -718,7 +718,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_3
 ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_31:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    movw $-32768, %ax # imm = 0x8000
-; AVX512VL-NEXT:    kmovw %eax, %k1
+; AVX512VL-NEXT:    kmovd %eax, %k1
 ; AVX512VL-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
 ; AVX512VL-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 31>
@@ -743,7 +743,7 @@ define <16 x i16> @shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_1
 ; AVX512VL-LABEL: shuffle_v16i16_16_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    movw $1, %ax
-; AVX512VL-NEXT:    kmovw %eax, %k1
+; AVX512VL-NEXT:    kmovd %eax, %k1
 ; AVX512VL-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
 ; AVX512VL-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
@@ -768,7 +768,7 @@ define <16 x i16> @shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_1
 ; AVX512VL-LABEL: shuffle_v16i16_00_17_02_19_04_21_06_23_24_09_26_11_28_13_30_15:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    movw $21930, %ax # imm = 0x55AA
-; AVX512VL-NEXT:    kmovw %eax, %k1
+; AVX512VL-NEXT:    kmovd %eax, %k1
 ; AVX512VL-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
 ; AVX512VL-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 0, i32 17, i32 2, i32 19, i32 4, i32 21, i32 6, i32 23, i32 24, i32 9, i32 26, i32 11, i32 28, i32 13, i32 30, i32 15>
@@ -793,7 +793,7 @@ define <16 x i16> @shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_3
 ; AVX512VL-LABEL: shuffle_v16i16_16_01_18_03_20_05_22_07_08_25_10_27_12_29_14_31:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    movw $-21931, %ax # imm = 0xAA55
-; AVX512VL-NEXT:    kmovw %eax, %k1
+; AVX512VL-NEXT:    kmovd %eax, %k1
 ; AVX512VL-NEXT:    vmovdqu16 %ymm1, %ymm0 {%k1}
 ; AVX512VL-NEXT:    retq
   %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> <i32 16, i32 1, i32 18, i32 3, i32 20, i32 5, i32 22, i32 7, i32 8, i32 25, i32 10, i32 27, i32 12, i32 29, i32 14, i32 31>
@@ -1664,6 +1664,40 @@ define <16 x i16> @shuffle_v16i16_16_zz_17_zz_18_zz_19_zz_20_zz_21_zz_22_zz_22_z
   ret <16 x i16> %shuffle
 }
 
+define <16 x i16> @shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz(<16 x i16> %a) {
+; AVX1-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-NEXT:    vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v16i16_28_zz_zz_zz_29_zz_zz_zz_30_zz_zz_zz_31_zz_zz_zz:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vmovdqu {{.*#+}} ymm2 = [28,1,2,3,29,5,6,7,30,9,10,11,31,13,14,15]
+; AVX512VL-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpermt2w %ymm0, %ymm2, %ymm1
+; AVX512VL-NEXT:    vmovdqa %ymm1, %ymm0
+; AVX512VL-NEXT:    retq
+  %shuffle = shufflevector <16 x i16> zeroinitializer, <16 x i16> %a, <16 x i32> <i32 28, i32 0, i32 0, i32 0, i32 29, i32 0, i32 0, i32 0, i32 30, i32 0, i32 0, i32 0, i32 31, i32 0, i32 0, i32 0>
+  ret <16 x i16> %shuffle
+}
+
 define <16 x i16> @shuffle_v16i16_23_00_01_02_03_04_05_06_31_08_09_10_11_12_13_14(<16 x i16> %a, <16 x i16> %b) {
 ; AVX1-LABEL: shuffle_v16i16_23_00_01_02_03_04_05_06_31_08_09_10_11_12_13_14:
 ; AVX1:       # BB#0:
@@ -2653,8 +2687,7 @@ define <16 x i16> @shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_1
 ; AVX1-NEXT:    vpshuflw {{.*#+}} xmm1 = xmm0[1,1,2,3,4,5,6,7]
 ; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT:    vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4,5,6,7]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2OR512VL-LABEL: shuffle_v16i16_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v32.ll b/test/CodeGen/X86/vector-shuffle-256-v32.ll
index 8b0e25ce43f6d484cb8ac68577e3d62df3df61f0..f4c4403ed83ff1d8e20a1e903955de43a23dcb8a 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v32.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v32.ll
@@ -383,7 +383,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_00_18_00_00_
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX512VL-NEXT:    movw $1, %ax
-; AVX512VL-NEXT:    kmovw %eax, %k1
+; AVX512VL-NEXT:    kmovd %eax, %k1
 ; AVX512VL-NEXT:    vmovdqu16 %ymm0, %ymm1 {%k1}
 ; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VL-NEXT:    retq
@@ -414,7 +414,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_00_00_00_00_00_19_00_00_00_
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vperm2i128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
 ; AVX512VL-NEXT:    movw $1, %ax
-; AVX512VL-NEXT:    kmovw %eax, %k1
+; AVX512VL-NEXT:    kmovd %eax, %k1
 ; AVX512VL-NEXT:    vmovdqu16 %ymm0, %ymm1 {%k1}
 ; AVX512VL-NEXT:    vpshufb {{.*#+}} ymm0 = ymm1[0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16]
 ; AVX512VL-NEXT:    retq
@@ -1089,8 +1089,7 @@ define <32 x i8> @shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_
 ; AVX1-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpshufb {{.*#+}} xmm1 = xmm0[1],zero,xmm0[2],zero,xmm0[4,u,6,7,8,9,10,11,12,13,14,15]
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2OR512VL-LABEL: shuffle_v32i8_01_zz_02_zz_04_uu_06_07_08_09_10_11_12_13_14_15_u6_17_18_19_20_21_22_23_24_25_26_27_28_29_30_31:
@@ -1966,9 +1965,10 @@ define <32 x i8> @shuffle_v32i8_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_
 define <32 x i8> @shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz(<32 x i8> %a) {
 ; AVX1-LABEL: shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2OR512VL-LABEL: shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz:
@@ -1979,6 +1979,44 @@ define <32 x i8> @shuffle_v32i8_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_
   ret <32 x i8> %shuffle
 }
 
+define <32 x i8> @shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz(<32 x i8> %a) {
+; AVX1-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v32i8_56_zz_zz_zz_57_zz_zz_zz_58_zz_zz_zz__zz_59_zz_zz_zz_60_zz_zz_zz_61_zz_zz_zz_62_zz_zz_zz_63_zz_zz_zz:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX512VL-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    movl $286331153, %eax # imm = 0x11111111
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vmovdqu8 %ymm0, %ymm0 {%k1} {z}
+; AVX512VL-NEXT:    retq
+  %shuffle = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 56, i32 1, i32 2, i32 3, i32 57, i32 5, i32 6, i32 7, i32 58, i32 9, i32 10, i32 11, i32 59, i32 13, i32 14, i32 15, i32 60, i32 17, i32 18, i32 19, i32 61, i32 21, i32 22, i32 23, i32 62, i32 25, i32 26, i32 27, i32 63, i32 29, i32 30, i32 31>
+  ret <32 x i8> %shuffle
+}
+
 define <32 x i8> @shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30(<32 x i8> %a, <32 x i8> %b) {
 ; AVX1-LABEL: shuffle_v32i8_47_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_63_16_17_18_19_20_21_22_23_24_25_26_27_28_29_30:
 ; AVX1:       # BB#0:
diff --git a/test/CodeGen/X86/vector-shuffle-256-v4.ll b/test/CodeGen/X86/vector-shuffle-256-v4.ll
index 1198514be5f16b1587b79dbeca1631a96752075f..ad343e64e1e5c379d4f4cfd1c785527048d9c258 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v4.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v4.ll
@@ -129,6 +129,48 @@ define <4 x double> @shuffle_v4f64_2200(<4 x double> %a, <4 x double> %b) {
   ret <4 x double> %shuffle
 }
 
+define <4 x double> @shuffle_v4f64_2222(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_2222:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_2222:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v4f64_2222:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512VL-NEXT:    retq
+  %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x double> %shuffle
+}
+
+define <4 x double> @shuffle_v4f64_2222_bc(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4f64_2222_bc:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_2222_bc:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v4f64_2222_bc:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2]
+; AVX512VL-NEXT:    retq
+  %tmp0 = bitcast <4 x i64> %a to <4 x double>
+  %tmp1 = bitcast <4 x i64> %b to <4 x double>
+  %shuffle = shufflevector <4 x double> %tmp0, <4 x double> %tmp1, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  ret <4 x double> %shuffle
+}
+
 define <4 x double> @shuffle_v4f64_3330(<4 x double> %a, <4 x double> %b) {
 ; AVX1-LABEL: shuffle_v4f64_3330:
 ; AVX1:       # BB#0:
@@ -477,22 +519,22 @@ define <4 x double> @shuffle_v4f64_3333(<4 x double> %a, <4 x double> %b) {
   ret <4 x double> %shuffle
 }
 
-define <4 x double> @shuffle_v4f64_1z3z(<4 x double> %a, <4 x double> %b) {
-; AVX1-LABEL: shuffle_v4f64_1z3z:
+define <4 x double> @shuffle_v4f64_0z3z(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_0z3z:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
 ; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
 ; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: shuffle_v4f64_1z3z:
+; AVX2-LABEL: shuffle_v4f64_0z3z:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
 ; AVX2-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
 ; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3]
 ; AVX2-NEXT:    retq
 ;
-; AVX512VL-LABEL: shuffle_v4f64_1z3z:
+; AVX512VL-LABEL: shuffle_v4f64_0z3z:
 ; AVX512VL:       # BB#0:
 ; AVX512VL-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,3,2]
 ; AVX512VL-NEXT:    vpxor %ymm1, %ymm1, %ymm1
@@ -502,6 +544,34 @@ define <4 x double> @shuffle_v4f64_1z3z(<4 x double> %a, <4 x double> %b) {
   ret <4 x double> %shuffle
 }
 
+define <4 x double> @shuffle_v4f64_1z2z(<4 x double> %a, <4 x double> %b) {
+; AVX1-LABEL: shuffle_v4f64_1z2z:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4f64_1z2z:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; AVX2-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v4f64_1z2z:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; AVX512VL-NEXT:    vpermpd {{.*#+}} ymm0 = ymm0[1,0,2,0]
+; AVX512VL-NEXT:    retq
+  %1 = shufflevector <4 x double> %a, <4 x double> <double 0.000000e+00, double undef, double undef, double undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4>
+  ret <4 x double> %1
+}
+
 define <4 x i64> @shuffle_v4i64_0000(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: shuffle_v4i64_0000:
 ; AVX1:       # BB#0:
@@ -1334,7 +1404,7 @@ define <4 x i64> @splat128_mem_v4i64_from_v2i64(<2 x i64>* %ptr) {
 ;
 ; AVX512VL-LABEL: splat128_mem_v4i64_from_v2i64:
 ; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT:    vbroadcasti128 {{.*#+}} ymm0 = mem[0,1,0,1]
 ; AVX512VL-NEXT:    retq
   %v = load <2 x i64>, <2 x i64>* %ptr
   %shuffle = shufflevector <2 x i64> %v, <2 x i64> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
@@ -1342,20 +1412,10 @@ define <4 x i64> @splat128_mem_v4i64_from_v2i64(<2 x i64>* %ptr) {
 }
 
 define <4 x double> @splat128_mem_v4f64_from_v2f64(<2 x double>* %ptr) {
-; AVX1-LABEL: splat128_mem_v4f64_from_v2f64:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: splat128_mem_v4f64_from_v2f64:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
-; AVX2-NEXT:    retq
-;
-; AVX512VL-LABEL: splat128_mem_v4f64_from_v2f64:
-; AVX512VL:       # BB#0:
-; AVX512VL-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 = mem[0,1,2,3,0,1,2,3]
-; AVX512VL-NEXT:    retq
+; ALL-LABEL: splat128_mem_v4f64_from_v2f64:
+; ALL:       # BB#0:
+; ALL-NEXT:    vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1]
+; ALL-NEXT:    retq
   %v = load <2 x double>, <2 x double>* %ptr
   %shuffle = shufflevector <2 x double> %v, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
   ret <4 x double> %shuffle
@@ -1504,3 +1564,56 @@ define <4 x i64> @shuffle_v4i64_1230(<4 x i64> %a) {
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
   ret <4 x i64> %shuffle
 }
+
+define <4 x i64> @shuffle_v4i64_z0z3(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_z0z3:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilpd {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3]
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_z0z3:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v4i64_z0z3:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,0,2,3]
+; AVX512VL-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7]
+; AVX512VL-NEXT:    retq
+  %1 = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 4, i32 0, i32 4, i32 3>
+  ret <4 x i64> %1
+}
+
+define <4 x i64> @shuffle_v4i64_1z2z(<4 x i64> %a, <4 x i64> %b) {
+; AVX1-LABEL: shuffle_v4i64_1z2z:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; AVX1-NEXT:    vblendpd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3]
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT:    vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vpermilpd {{.*#+}} xmm0 = xmm0[1,0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: shuffle_v4i64_1z2z:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,2,0]
+; AVX2-NEXT:    retq
+;
+; AVX512VL-LABEL: shuffle_v4i64_1z2z:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; AVX512VL-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,2,0]
+; AVX512VL-NEXT:    retq
+  %1 = shufflevector <4 x i64> %a, <4 x i64> <i64 0, i64 undef, i64 undef, i64 undef>, <4 x i32> <i32 1, i32 4, i32 2, i32 4>
+  ret <4 x i64> %1
+}
diff --git a/test/CodeGen/X86/vector-shuffle-256-v8.ll b/test/CodeGen/X86/vector-shuffle-256-v8.ll
index cba15827d32cbb950336810ab6db8fba23785cc1..8d49321a6af80f5854485739f18f6af2c548a066 100644
--- a/test/CodeGen/X86/vector-shuffle-256-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-256-v8.ll
@@ -266,21 +266,12 @@ define <8 x float> @shuffle_v8f32_2a3b6e7f(<8 x float> %a, <8 x float> %b) {
 }
 
 define <8 x float> @shuffle_v8f32_08192a3b(<8 x float> %a, <8 x float> %b) {
-; AVX1-LABEL: shuffle_v8f32_08192a3b:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: shuffle_v8f32_08192a3b:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <u,0,u,1,u,2,u,3>
-; AVX2-NEXT:    vpermps %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = <0,u,1,u,2,u,3,u>
-; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm0
-; AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
-; AVX2-NEXT:    retq
+; AVX1OR2-LABEL: shuffle_v8f32_08192a3b:
+; AVX1OR2:       # BB#0:
+; AVX1OR2-NEXT:    vunpckhps {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1OR2-NEXT:    vunpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX1OR2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1OR2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v8f32_08192a3b:
 ; AVX512VL:       # BB#0:
@@ -1221,10 +1212,9 @@ define <8 x i32> @shuffle_v8i32_08192a3b(<8 x i32> %a, <8 x i32> %b) {
 ;
 ; AVX2-LABEL: shuffle_v8i32_08192a3b:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = <u,0,u,1,u,2,u,3>
-; AVX2-NEXT:    vpermd %ymm1, %ymm2, %ymm1
-; AVX2-NEXT:    vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX2-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7]
+; AVX2-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; AVX2-NEXT:    vinserti128 $1, %xmm2, %ymm0, %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512VL-LABEL: shuffle_v8i32_08192a3b:
@@ -2048,6 +2038,24 @@ define <8 x i32> @shuffle_v8i32_44444444(<8 x i32> %a, <8 x i32> %b) {
   ret <8 x i32> %shuffle
 }
 
+define <8 x i32> @shuffle_v8i32_44444444_bc(<8 x float> %a, <8 x float> %b) {
+; AVX1-LABEL: shuffle_v8i32_44444444_bc:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpermilps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4]
+; AVX1-NEXT:    vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,2,3]
+; AVX1-NEXT:    retq
+;
+; AVX2OR512VL-LABEL: shuffle_v8i32_44444444_bc:
+; AVX2OR512VL:       # BB#0:
+; AVX2OR512VL-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX2OR512VL-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX2OR512VL-NEXT:    retq
+  %tmp0 = bitcast <8 x float> %a to <8 x i32>
+  %tmp1 = bitcast <8 x float> %b to <8 x i32>
+  %shuffle = shufflevector <8 x i32> %tmp0, <8 x i32> %tmp1, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+  ret <8 x i32> %shuffle
+}
+
 define <8 x i32> @shuffle_v8i32_5555uuuu(<8 x i32> %a, <8 x i32> %b) {
 ; AVX1-LABEL: shuffle_v8i32_5555uuuu:
 ; AVX1:       # BB#0:
@@ -2064,6 +2072,21 @@ define <8 x i32> @shuffle_v8i32_5555uuuu(<8 x i32> %a, <8 x i32> %b) {
   ret <8 x i32> %shuffle
 }
 
+; PR32453
+define <8 x i32> @shuffle_v8i32_uuuuuu7u(<8 x i32> %a, <8 x i32> %b) nounwind {
+; AVX1-LABEL: shuffle_v8i32_uuuuuu7u:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7]
+; AVX1-NEXT:    retq
+;
+; AVX2OR512VL-LABEL: shuffle_v8i32_uuuuuu7u:
+; AVX2OR512VL:       # BB#0:
+; AVX2OR512VL-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,1,3,3,4,5,7,7]
+; AVX2OR512VL-NEXT:    retq
+  %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 7, i32 undef>
+  ret <8 x i32> %shuffle
+}
+
 define <8 x float> @splat_mem_v8f32_2(float* %p) {
 ; ALL-LABEL: splat_mem_v8f32_2:
 ; ALL:       # BB#0:
diff --git a/test/CodeGen/X86/vector-shuffle-512-v16.ll b/test/CodeGen/X86/vector-shuffle-512-v16.ll
index 482f07bb0bbeb8e74fbf89c64fceee83fd633bb5..fa3471c2fe40682d14477e24ce249e1e7502e084 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v16.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v16.ll
@@ -23,6 +23,18 @@ define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08
   ret <16 x float> %shuffle
 }
 
+define <16 x float> @shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc(<16 x i32> %a, <16 x i32> %b) {
+; ALL-LABEL: shuffle_v16f32_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_bc:
+; ALL:       # BB#0:
+; ALL-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
+; ALL-NEXT:    vpbroadcastd %xmm0, %zmm0
+; ALL-NEXT:    retq
+  %tmp0 = bitcast <16 x i32> %a to <16 x float>
+  %tmp1 = bitcast <16 x i32> %b to <16 x float>
+  %shuffle = shufflevector <16 x float> %tmp0, <16 x float> %tmp1, <16 x i32><i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>
+  ret <16 x float> %shuffle
+}
+
 define <16 x float> @shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d(<16 x float> %a, <16 x float> %b) {
 ; ALL-LABEL: shuffle_v16f32_00_10_01_11_04_14_05_15_08_18_09_19_0c_1c_0d_1d:
 ; ALL:       # BB#0:
@@ -250,12 +262,19 @@ define <16 x i32> @shuffle_v16i32_load_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19
 }
 
 define <16 x i32> @shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u(<16 x i32> %a, <16 x i32> %b)  {
-; ALL-LABEL: shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u:
-; ALL:       # BB#0:
-; ALL-NEXT:    movw $8, %ax
-; ALL-NEXT:    kmovw %eax, %k1
-; ALL-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
-; ALL-NEXT:    retq
+; AVX512F-LABEL: shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    movw $8, %ax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: shuffle_v16i32_0_1_2_19_u_u_u_u_u_u_u_u_u_u_u_u:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    movw $8, %ax
+; AVX512BW-NEXT:    kmovd %eax, %k1
+; AVX512BW-NEXT:    vmovdqa32 %zmm1, %zmm0 {%k1}
+; AVX512BW-NEXT:    retq
   %c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
   ret <16 x i32> %c
 }
@@ -385,13 +404,30 @@ define <16 x i32> @shuffle_v8i32_17_16_01_00_21_20_05_04_25_24_09_08_29_28_13_12
   ret <16 x i32> %shuffle
 }
 
-define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01(<16 x i32> %a, <16 x i32> %passthru, i16 %mask) {
-; ALL-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
+define <16 x float> @shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04(<8 x float> %a) {
+; ALL-LABEL: shuffle_v8f32_v16f32_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04_04:
 ; ALL:       # BB#0:
-; ALL-NEXT:    kmovw %edi, %k1
-; ALL-NEXT:    valignd {{.*#+}} zmm1 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
-; ALL-NEXT:    vmovdqa64 %zmm1, %zmm0
+; ALL-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; ALL-NEXT:    vbroadcastss %xmm0, %zmm0
 ; ALL-NEXT:    retq
+  %shuffle = shufflevector <8 x float> %a, <8 x float> undef, <16 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+  ret <16 x float> %shuffle
+}
+
+define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01(<16 x i32> %a, <16 x i32> %passthru, i16 %mask) {
+; AVX512F-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    kmovw %edi, %k1
+; AVX512F-NEXT:    valignd {{.*#+}} zmm1 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
+; AVX512F-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    valignd {{.*#+}} zmm1 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1>
   %mask.cast = bitcast i16 %mask to <16 x i1>
   %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru
@@ -399,12 +435,19 @@ define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15
 }
 
 define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
-; ALL-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
-; ALL:       # BB#0:
-; ALL-NEXT:    kmovw %edi, %k1
-; ALL-NEXT:    valignd {{.*#+}} zmm2 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
-; ALL-NEXT:    vmovdqa64 %zmm2, %zmm0
-; ALL-NEXT:    retq
+; AVX512F-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    kmovw %edi, %k1
+; AVX512F-NEXT:    valignd {{.*#+}} zmm2 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    valignd {{.*#+}} zmm2 {%k1} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
   %mask.cast = bitcast i16 %mask to <16 x i1>
   %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru
@@ -412,11 +455,17 @@ define <16 x i32> @mask_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15
 }
 
 define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01(<16 x i32> %a, i16 %mask) {
-; ALL-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
-; ALL:       # BB#0:
-; ALL-NEXT:    kmovw %edi, %k1
-; ALL-NEXT:    valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
-; ALL-NEXT:    retq
+; AVX512F-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    kmovw %edi, %k1
+; AVX512F-NEXT:    valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_00_01:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1]
+; AVX512BW-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32><i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1>
   %mask.cast = bitcast i16 %mask to <16 x i1>
   %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> zeroinitializer
@@ -424,11 +473,17 @@ define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_1
 }
 
 define <16 x i32> @maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17(<16 x i32> %a, <16 x i32> %b, i16 %mask) {
-; ALL-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
-; ALL:       # BB#0:
-; ALL-NEXT:    kmovw %edi, %k1
-; ALL-NEXT:    valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
-; ALL-NEXT:    retq
+; AVX512F-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    kmovw %edi, %k1
+; AVX512F-NEXT:    valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: maskz_shuffle_v16i32_02_03_04_05_06_07_08_09_10_11_12_13_14_15_16_17:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    valignd {{.*#+}} zmm0 {%k1} {z} = zmm0[2,3,4,5,6,7,8,9,10,11,12,13,14,15],zmm1[0,1]
+; AVX512BW-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32><i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17>
   %mask.cast = bitcast i16 %mask to <16 x i1>
   %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> zeroinitializer
@@ -498,12 +553,19 @@ define <16 x i32> @test_vshufi32x4_512_mask(<16 x i32> %x, <16 x i32> %x1, <16 x
 }
 
 define <16 x float> @mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23(<16 x float> %a, <16 x float> %b, <16 x float> %passthru, i16 %mask) {
-; ALL-LABEL: mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
-; ALL:       # BB#0:
-; ALL-NEXT:    kmovw %edi, %k1
-; ALL-NEXT:    vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; ALL-NEXT:    vmovaps %zmm2, %zmm0
-; ALL-NEXT:    retq
+; AVX512F-LABEL: mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    kmovw %edi, %k1
+; AVX512F-NEXT:    vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; AVX512F-NEXT:    vmovaps %zmm2, %zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vinsertf32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   %mask.cast = bitcast i16 %mask to <16 x i1>
   %res = select <16 x i1> %mask.cast, <16 x float> %shuffle, <16 x float> %passthru
@@ -511,12 +573,19 @@ define <16 x float> @mask_shuffle_v16f32_00_01_02_03_04_05_06_07_16_17_18_19_20_
 }
 
 define <16 x float> @mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15(<16 x float> %a, <16 x float> %b, <16 x float> %passthru, i16 %mask) {
-; ALL-LABEL: mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
-; ALL:       # BB#0:
-; ALL-NEXT:    kmovw %edi, %k1
-; ALL-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
-; ALL-NEXT:    vmovaps %zmm2, %zmm0
-; ALL-NEXT:    retq
+; AVX512F-LABEL: mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    kmovw %edi, %k1
+; AVX512F-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-NEXT:    vmovaps %zmm2, %zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovaps %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %shuffle = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %mask.cast = bitcast i16 %mask to <16 x i1>
   %res = select <16 x i1> %mask.cast, <16 x float> %shuffle, <16 x float> %passthru
@@ -524,12 +593,19 @@ define <16 x float> @mask_shuffle_v16f32_00_01_02_03_16_17_18_19_08_09_10_11_12_
 }
 
 define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
-; ALL-LABEL: mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
-; ALL:       # BB#0:
-; ALL-NEXT:    kmovw %edi, %k1
-; ALL-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
-; ALL-NEXT:    vmovdqa64 %zmm2, %zmm0
-; ALL-NEXT:    retq
+; AVX512F-LABEL: mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    kmovw %edi, %k1
+; AVX512F-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21_22_23:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
   %mask.cast = bitcast i16 %mask to <16 x i1>
   %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru
@@ -537,12 +613,19 @@ define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_04_05_06_07_16_17_18_19_20_21
 }
 
 define <16 x i32> @mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15(<16 x i32> %a, <16 x i32> %b, <16 x i32> %passthru, i16 %mask) {
-; ALL-LABEL: mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
-; ALL:       # BB#0:
-; ALL-NEXT:    kmovw %edi, %k1
-; ALL-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
-; ALL-NEXT:    vmovdqa64 %zmm2, %zmm0
-; ALL-NEXT:    retq
+; AVX512F-LABEL: mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    kmovw %edi, %k1
+; AVX512F-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512BW-LABEL: mask_shuffle_v16i32_00_01_02_03_16_17_18_19_08_09_10_11_12_13_14_15:
+; AVX512BW:       # BB#0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %mask.cast = bitcast i16 %mask to <16 x i1>
   %res = select <16 x i1> %mask.cast, <16 x i32> %shuffle, <16 x i32> %passthru
diff --git a/test/CodeGen/X86/vector-shuffle-512-v8.ll b/test/CodeGen/X86/vector-shuffle-512-v8.ll
index a85e74b363bce02090ddd9578e8eb3e980bf1d3e..30c8d1b2373e48f8499f76613790db6f3f815716 100644
--- a/test/CodeGen/X86/vector-shuffle-512-v8.ll
+++ b/test/CodeGen/X86/vector-shuffle-512-v8.ll
@@ -48,6 +48,24 @@ define <8 x double> @shuffle_v8f64_44444444(<8 x double> %a, <8 x double> %b) {
   ret <8 x double> %shuffle
 }
 
+define <8 x double> @shuffle_v8f64_44444444_bc(<8 x i64> %a, <8 x i64> %b) {
+; AVX512F-LABEL: shuffle_v8f64_44444444_bc:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm0, %zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_44444444_bc:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    vextracti32x4 $2, %zmm0, %xmm0
+; AVX512F-32-NEXT:    vpbroadcastq %xmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %tmp0 = bitcast <8 x i64> %a to <8 x double>
+  %tmp1 = bitcast <8 x i64> %b to <8 x double>
+  %shuffle = shufflevector <8 x double> %tmp0, <8 x double> %tmp1, <8 x i32> <i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4, i32 4>
+  ret <8 x double> %shuffle
+}
+
 define <8 x double> @shuffle_v8f64_00000010(<8 x double> %a, <8 x double> %b) {
 ; AVX512F-LABEL: shuffle_v8f64_00000010:
 ; AVX512F:       # BB#0:
@@ -958,6 +976,24 @@ define <8 x double> @shuffle_v8f64_f511235a(<8 x double> %a, <8 x double> %b) {
   ret <8 x double> %shuffle
 }
 
+define <8 x double> @shuffle_v8f64_1z2z5z6z(<8 x double> %a, <8 x double> %b) {
+; AVX512F-LABEL: shuffle_v8f64_1z2z5z6z:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; AVX512F-NEXT:    vmovapd {{.*#+}} zmm2 = [1,8,2,8,5,8,6,8]
+; AVX512F-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512F-32-LABEL: shuffle_v8f64_1z2z5z6z:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    vpxord %zmm1, %zmm1, %zmm1
+; AVX512F-32-NEXT:    vmovapd {{.*#+}} zmm2 = [1,0,8,0,2,0,8,0,5,0,8,0,6,0,8,0]
+; AVX512F-32-NEXT:    vpermt2pd %zmm1, %zmm2, %zmm0
+; AVX512F-32-NEXT:    retl
+  %shuffle = shufflevector <8 x double> %a, <8 x double> <double 0.000000e+00, double undef, double undef, double undef, double undef, double undef, double undef, double undef>, <8 x i32> <i32 1, i32 8, i32 2, i32 8, i32 5, i32 8, i32 6, i32 8>
+  ret <8 x double> %shuffle
+}
+
 define <8 x i64> @shuffle_v8i64_00000000(<8 x i64> %a, <8 x i64> %b) {
 ;
 ; AVX512F-LABEL: shuffle_v8i64_00000000:
@@ -2572,6 +2608,22 @@ define <8 x i64> @shuffle_v8i64_01234589(<8 x i64> %a, <8 x i64> %b) {
   ret <8 x i64> %shuffle
 }
 
+define <8 x double> @shuffle_v4f64_v8f64_22222222(<4 x double> %a) {
+; AVX512F-LABEL: shuffle_v4f64_v8f64_22222222:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX512F-NEXT:    vbroadcastsd %xmm0, %zmm0
+; AVX512F-NEXT:    retq
+;
+; AVX512F-32-LABEL: shuffle_v4f64_v8f64_22222222:
+; AVX512F-32:       # BB#0:
+; AVX512F-32-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX512F-32-NEXT:    vbroadcastsd %xmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %shuffle = shufflevector <4 x double> %a, <4 x double> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  ret <8 x double> %shuffle
+}
+
 define <8 x i64> @shuffle_v2i64_v8i64_01010101(<2 x i64> %a) {
 ; AVX512F-LABEL: shuffle_v2i64_v8i64_01010101:
 ; AVX512F:       # BB#0:
diff --git a/test/CodeGen/X86/vector-shuffle-avx512.ll b/test/CodeGen/X86/vector-shuffle-avx512.ll
index 4098d16d288cc7b065387329e0348e9780eab5d1..5aab21749d14b775751e2d44f2aef827d0ed51cb 100644
--- a/test/CodeGen/X86/vector-shuffle-avx512.ll
+++ b/test/CodeGen/X86/vector-shuffle-avx512.ll
@@ -10,7 +10,7 @@ define <8 x float> @expand(<4 x float> %a) {
 ; SKX64:       # BB#0:
 ; SKX64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; SKX64-NEXT:    movb $5, %al
-; SKX64-NEXT:    kmovb %eax, %k1
+; SKX64-NEXT:    kmovd %eax, %k1
 ; SKX64-NEXT:    vexpandps %ymm0, %ymm0 {%k1} {z}
 ; SKX64-NEXT:    retq
 ;
@@ -25,7 +25,7 @@ define <8 x float> @expand(<4 x float> %a) {
 ; SKX32:       # BB#0:
 ; SKX32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; SKX32-NEXT:    movb $5, %al
-; SKX32-NEXT:    kmovb %eax, %k1
+; SKX32-NEXT:    kmovd %eax, %k1
 ; SKX32-NEXT:    vexpandps %ymm0, %ymm0 {%k1} {z}
 ; SKX32-NEXT:    retl
 ;
@@ -44,7 +44,7 @@ define <8 x float> @expand1(<4 x float> %a ) {
 ; SKX64:       # BB#0:
 ; SKX64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; SKX64-NEXT:    movb $-86, %al
-; SKX64-NEXT:    kmovb %eax, %k1
+; SKX64-NEXT:    kmovd %eax, %k1
 ; SKX64-NEXT:    vexpandps %ymm0, %ymm0 {%k1} {z}
 ; SKX64-NEXT:    retq
 ;
@@ -61,7 +61,7 @@ define <8 x float> @expand1(<4 x float> %a ) {
 ; SKX32:       # BB#0:
 ; SKX32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; SKX32-NEXT:    movb $-86, %al
-; SKX32-NEXT:    kmovb %eax, %k1
+; SKX32-NEXT:    kmovd %eax, %k1
 ; SKX32-NEXT:    vexpandps %ymm0, %ymm0 {%k1} {z}
 ; SKX32-NEXT:    retl
 ;
@@ -83,7 +83,7 @@ define <4 x double> @expand2(<2 x double> %a) {
 ; SKX64:       # BB#0:
 ; SKX64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; SKX64-NEXT:    movb $9, %al
-; SKX64-NEXT:    kmovb %eax, %k1
+; SKX64-NEXT:    kmovd %eax, %k1
 ; SKX64-NEXT:    vexpandpd %ymm0, %ymm0 {%k1} {z}
 ; SKX64-NEXT:    retq
 ;
@@ -99,7 +99,7 @@ define <4 x double> @expand2(<2 x double> %a) {
 ; SKX32:       # BB#0:
 ; SKX32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; SKX32-NEXT:    movb $9, %al
-; SKX32-NEXT:    kmovb %eax, %k1
+; SKX32-NEXT:    kmovd %eax, %k1
 ; SKX32-NEXT:    vexpandpd %ymm0, %ymm0 {%k1} {z}
 ; SKX32-NEXT:    retl
 ;
@@ -120,13 +120,12 @@ define <8 x i32> @expand3(<4 x i32> %a ) {
 ; SKX64:       # BB#0:
 ; SKX64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; SKX64-NEXT:    movb $-127, %al
-; SKX64-NEXT:    kmovb %eax, %k1
+; SKX64-NEXT:    kmovd %eax, %k1
 ; SKX64-NEXT:    vpexpandd %ymm0, %ymm0 {%k1} {z}
 ; SKX64-NEXT:    retq
 ;
 ; KNL64-LABEL: expand3:
 ; KNL64:       # BB#0:
-; KNL64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; KNL64-NEXT:    vpbroadcastq %xmm0, %ymm0
 ; KNL64-NEXT:    vpxor %ymm1, %ymm1, %ymm1
 ; KNL64-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
@@ -136,13 +135,12 @@ define <8 x i32> @expand3(<4 x i32> %a ) {
 ; SKX32:       # BB#0:
 ; SKX32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; SKX32-NEXT:    movb $-127, %al
-; SKX32-NEXT:    kmovb %eax, %k1
+; SKX32-NEXT:    kmovd %eax, %k1
 ; SKX32-NEXT:    vpexpandd %ymm0, %ymm0 {%k1} {z}
 ; SKX32-NEXT:    retl
 ;
 ; KNL32-LABEL: expand3:
 ; KNL32:       # BB#0:
-; KNL32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; KNL32-NEXT:    vpbroadcastq %xmm0, %ymm0
 ; KNL32-NEXT:    vpxor %ymm1, %ymm1, %ymm1
 ; KNL32-NEXT:    vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6],ymm0[7]
@@ -157,7 +155,7 @@ define <4 x i64> @expand4(<2 x i64> %a ) {
 ; SKX64:       # BB#0:
 ; SKX64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; SKX64-NEXT:    movb $9, %al
-; SKX64-NEXT:    kmovb %eax, %k1
+; SKX64-NEXT:    kmovd %eax, %k1
 ; SKX64-NEXT:    vpexpandq %ymm0, %ymm0 {%k1} {z}
 ; SKX64-NEXT:    retq
 ;
@@ -173,7 +171,7 @@ define <4 x i64> @expand4(<2 x i64> %a ) {
 ; SKX32:       # BB#0:
 ; SKX32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; SKX32-NEXT:    movb $9, %al
-; SKX32-NEXT:    kmovb %eax, %k1
+; SKX32-NEXT:    kmovd %eax, %k1
 ; SKX32-NEXT:    vpexpandq %ymm0, %ymm0 {%k1} {z}
 ; SKX32-NEXT:    retl
 ;
@@ -255,7 +253,7 @@ define <16 x float> @expand7(<8 x float> %a) {
 ; SKX64:       # BB#0:
 ; SKX64-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; SKX64-NEXT:    movw $1285, %ax # imm = 0x505
-; SKX64-NEXT:    kmovw %eax, %k1
+; SKX64-NEXT:    kmovd %eax, %k1
 ; SKX64-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
 ; SKX64-NEXT:    retq
 ;
@@ -271,7 +269,7 @@ define <16 x float> @expand7(<8 x float> %a) {
 ; SKX32:       # BB#0:
 ; SKX32-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; SKX32-NEXT:    movw $1285, %ax # imm = 0x505
-; SKX32-NEXT:    kmovw %eax, %k1
+; SKX32-NEXT:    kmovd %eax, %k1
 ; SKX32-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
 ; SKX32-NEXT:    retl
 ;
@@ -291,7 +289,7 @@ define <16 x float> @expand8(<8 x float> %a ) {
 ; SKX64:       # BB#0:
 ; SKX64-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; SKX64-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; SKX64-NEXT:    kmovw %eax, %k1
+; SKX64-NEXT:    kmovd %eax, %k1
 ; SKX64-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
 ; SKX64-NEXT:    retq
 ;
@@ -307,7 +305,7 @@ define <16 x float> @expand8(<8 x float> %a ) {
 ; SKX32:       # BB#0:
 ; SKX32-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; SKX32-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; SKX32-NEXT:    kmovw %eax, %k1
+; SKX32-NEXT:    kmovd %eax, %k1
 ; SKX32-NEXT:    vexpandps %zmm0, %zmm0 {%k1} {z}
 ; SKX32-NEXT:    retl
 ;
@@ -328,7 +326,7 @@ define <8 x double> @expand9(<4 x double> %a) {
 ; SKX64:       # BB#0:
 ; SKX64-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; SKX64-NEXT:    movb $-127, %al
-; SKX64-NEXT:    kmovb %eax, %k1
+; SKX64-NEXT:    kmovd %eax, %k1
 ; SKX64-NEXT:    vexpandpd %zmm0, %zmm0 {%k1} {z}
 ; SKX64-NEXT:    retq
 ;
@@ -344,7 +342,7 @@ define <8 x double> @expand9(<4 x double> %a) {
 ; SKX32:       # BB#0:
 ; SKX32-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; SKX32-NEXT:    movb $-127, %al
-; SKX32-NEXT:    kmovb %eax, %k1
+; SKX32-NEXT:    kmovd %eax, %k1
 ; SKX32-NEXT:    vexpandpd %zmm0, %zmm0 {%k1} {z}
 ; SKX32-NEXT:    retl
 ;
@@ -364,7 +362,7 @@ define <16 x i32> @expand10(<8 x i32> %a ) {
 ; SKX64:       # BB#0:
 ; SKX64-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; SKX64-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; SKX64-NEXT:    kmovw %eax, %k1
+; SKX64-NEXT:    kmovd %eax, %k1
 ; SKX64-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
 ; SKX64-NEXT:    retq
 ;
@@ -380,7 +378,7 @@ define <16 x i32> @expand10(<8 x i32> %a ) {
 ; SKX32:       # BB#0:
 ; SKX32-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; SKX32-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; SKX32-NEXT:    kmovw %eax, %k1
+; SKX32-NEXT:    kmovd %eax, %k1
 ; SKX32-NEXT:    vpexpandd %zmm0, %zmm0 {%k1} {z}
 ; SKX32-NEXT:    retl
 ;
@@ -400,7 +398,7 @@ define <8 x i64> @expand11(<4 x i64> %a) {
 ; SKX64:       # BB#0:
 ; SKX64-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; SKX64-NEXT:    movb $-127, %al
-; SKX64-NEXT:    kmovb %eax, %k1
+; SKX64-NEXT:    kmovd %eax, %k1
 ; SKX64-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
 ; SKX64-NEXT:    retq
 ;
@@ -416,7 +414,7 @@ define <8 x i64> @expand11(<4 x i64> %a) {
 ; SKX32:       # BB#0:
 ; SKX32-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; SKX32-NEXT:    movb $-127, %al
-; SKX32-NEXT:    kmovb %eax, %k1
+; SKX32-NEXT:    kmovd %eax, %k1
 ; SKX32-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
 ; SKX32-NEXT:    retl
 ;
@@ -507,7 +505,7 @@ define <8 x float> @expand14(<4 x float> %a) {
 ; SKX64:       # BB#0:
 ; SKX64-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; SKX64-NEXT:    movb $20, %al
-; SKX64-NEXT:    kmovb %eax, %k1
+; SKX64-NEXT:    kmovd %eax, %k1
 ; SKX64-NEXT:    vexpandps %ymm0, %ymm0 {%k1} {z}
 ; SKX64-NEXT:    retq
 ;
@@ -525,7 +523,7 @@ define <8 x float> @expand14(<4 x float> %a) {
 ; SKX32:       # BB#0:
 ; SKX32-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<def>
 ; SKX32-NEXT:    movb $20, %al
-; SKX32-NEXT:    kmovb %eax, %k1
+; SKX32-NEXT:    kmovd %eax, %k1
 ; SKX32-NEXT:    vexpandps %ymm0, %ymm0 {%k1} {z}
 ; SKX32-NEXT:    retl
 ;
@@ -683,7 +681,7 @@ define <16 x i32> @test_mm512_mask_blend_epi32(<16 x i32> %A, <16 x i32> %W){
 ; SKX64-LABEL: test_mm512_mask_blend_epi32:
 ; SKX64:       # BB#0: # %entry
 ; SKX64-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; SKX64-NEXT:    kmovw %eax, %k1
+; SKX64-NEXT:    kmovd %eax, %k1
 ; SKX64-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; SKX64-NEXT:    retq
 ;
@@ -697,7 +695,7 @@ define <16 x i32> @test_mm512_mask_blend_epi32(<16 x i32> %A, <16 x i32> %W){
 ; SKX32-LABEL: test_mm512_mask_blend_epi32:
 ; SKX32:       # BB#0: # %entry
 ; SKX32-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; SKX32-NEXT:    kmovw %eax, %k1
+; SKX32-NEXT:    kmovd %eax, %k1
 ; SKX32-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
 ; SKX32-NEXT:    retl
 ;
@@ -716,7 +714,7 @@ define <8 x i64> @test_mm512_mask_blend_epi64(<8 x i64> %A, <8 x i64> %W){
 ; SKX64-LABEL: test_mm512_mask_blend_epi64:
 ; SKX64:       # BB#0: # %entry
 ; SKX64-NEXT:    movb $-86, %al
-; SKX64-NEXT:    kmovb %eax, %k1
+; SKX64-NEXT:    kmovd %eax, %k1
 ; SKX64-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
 ; SKX64-NEXT:    retq
 ;
@@ -730,7 +728,7 @@ define <8 x i64> @test_mm512_mask_blend_epi64(<8 x i64> %A, <8 x i64> %W){
 ; SKX32-LABEL: test_mm512_mask_blend_epi64:
 ; SKX32:       # BB#0: # %entry
 ; SKX32-NEXT:    movb $-86, %al
-; SKX32-NEXT:    kmovb %eax, %k1
+; SKX32-NEXT:    kmovd %eax, %k1
 ; SKX32-NEXT:    vpblendmq %zmm0, %zmm1, %zmm0 {%k1}
 ; SKX32-NEXT:    retl
 ;
@@ -749,7 +747,7 @@ define <16 x float> @test_mm512_mask_blend_ps(<16 x float> %A, <16 x float> %W){
 ; SKX64-LABEL: test_mm512_mask_blend_ps:
 ; SKX64:       # BB#0: # %entry
 ; SKX64-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; SKX64-NEXT:    kmovw %eax, %k1
+; SKX64-NEXT:    kmovd %eax, %k1
 ; SKX64-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
 ; SKX64-NEXT:    retq
 ;
@@ -763,7 +761,7 @@ define <16 x float> @test_mm512_mask_blend_ps(<16 x float> %A, <16 x float> %W){
 ; SKX32-LABEL: test_mm512_mask_blend_ps:
 ; SKX32:       # BB#0: # %entry
 ; SKX32-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; SKX32-NEXT:    kmovw %eax, %k1
+; SKX32-NEXT:    kmovd %eax, %k1
 ; SKX32-NEXT:    vblendmps %zmm0, %zmm1, %zmm0 {%k1}
 ; SKX32-NEXT:    retl
 ;
@@ -782,7 +780,7 @@ define <8 x double> @test_mm512_mask_blend_pd(<8 x double> %A, <8 x double> %W){
 ; SKX64-LABEL: test_mm512_mask_blend_pd:
 ; SKX64:       # BB#0: # %entry
 ; SKX64-NEXT:    movb $-88, %al
-; SKX64-NEXT:    kmovb %eax, %k1
+; SKX64-NEXT:    kmovd %eax, %k1
 ; SKX64-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
 ; SKX64-NEXT:    retq
 ;
@@ -796,7 +794,7 @@ define <8 x double> @test_mm512_mask_blend_pd(<8 x double> %A, <8 x double> %W){
 ; SKX32-LABEL: test_mm512_mask_blend_pd:
 ; SKX32:       # BB#0: # %entry
 ; SKX32-NEXT:    movb $-88, %al
-; SKX32-NEXT:    kmovb %eax, %k1
+; SKX32-NEXT:    kmovd %eax, %k1
 ; SKX32-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
 ; SKX32-NEXT:    retl
 ;
@@ -847,7 +845,7 @@ define <16 x i8> @test_mm_mask_blend_epi8(<16 x i8> %A, <16 x i8> %W){
 ; SKX64-LABEL: test_mm_mask_blend_epi8:
 ; SKX64:       # BB#0: # %entry
 ; SKX64-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; SKX64-NEXT:    kmovw %eax, %k1
+; SKX64-NEXT:    kmovd %eax, %k1
 ; SKX64-NEXT:    vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
 ; SKX64-NEXT:    retq
 ;
@@ -860,7 +858,7 @@ define <16 x i8> @test_mm_mask_blend_epi8(<16 x i8> %A, <16 x i8> %W){
 ; SKX32-LABEL: test_mm_mask_blend_epi8:
 ; SKX32:       # BB#0: # %entry
 ; SKX32-NEXT:    movw $-21846, %ax # imm = 0xAAAA
-; SKX32-NEXT:    kmovw %eax, %k1
+; SKX32-NEXT:    kmovd %eax, %k1
 ; SKX32-NEXT:    vpblendmb %xmm0, %xmm1, %xmm0 {%k1}
 ; SKX32-NEXT:    retl
 ;
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
index d34bbb60103169340dfa7f227f194a9425e83599..1385929ab8cd3ced360d075d16f29ad1ecd79b0d 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx2.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX2
+; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X32 --check-prefix=X32-AVX512
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=X64 --check-prefix=X64-AVX512
 
 declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)
 declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>)
@@ -72,12 +74,14 @@ define <32 x i8> @combine_pshufb_vpermps(<8 x float> %a) {
 define <32 x i8> @combine_and_pshufb(<32 x i8> %a0) {
 ; X32-LABEL: combine_and_pshufb:
 ; X32:       # BB#0:
-; X32-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; X32-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_and_pshufb:
 ; X64:       # BB#0:
-; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; X64-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
 ; X64-NEXT:    retq
   %1 = shufflevector <32 x i8> %a0, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
@@ -87,12 +91,14 @@ define <32 x i8> @combine_and_pshufb(<32 x i8> %a0) {
 define <32 x i8> @combine_pshufb_and(<32 x i8> %a0) {
 ; X32-LABEL: combine_pshufb_and:
 ; X32:       # BB#0:
-; X32-NEXT:    vandps {{\.LCPI.*}}, %ymm0, %ymm0
+; X32-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; X32-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_pshufb_and:
 ; X64:       # BB#0:
-; X64-NEXT:    vandps {{.*}}(%rip), %ymm0, %ymm0
+; X64-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; X64-NEXT:    vpblendw {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7],ymm0[8],ymm1[9,10,11],ymm0[12],ymm1[13,14,15]
 ; X64-NEXT:    retq
   %1 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %2 = shufflevector <32 x i8> %1, <32 x i8> zeroinitializer, <32 x i32> <i32 0, i32 1, i32 32, i32 32, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
@@ -480,14 +486,12 @@ define <8 x float> @combine_permps_as_permpd(<8 x float> %a) {
 define <4 x i64> @combine_pshufb_as_zext(<32 x i8> %a0) {
 ; X32-LABEL: combine_pshufb_as_zext:
 ; X32:       # BB#0:
-; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,0,1]
-; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9],zero,zero,zero,zero,zero,zero,ymm0[10,11],zero,zero,zero,zero,zero,zero,ymm0[20,21],zero,zero,zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero
+; X32-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_pshufb_as_zext:
 ; X64:       # BB#0:
-; X64-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,0,0,1]
-; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[8,9],zero,zero,zero,zero,zero,zero,ymm0[10,11],zero,zero,zero,zero,zero,zero,ymm0[20,21],zero,zero,zero,zero,zero,zero,ymm0[22,23],zero,zero,zero,zero,zero,zero
+; X64-NEXT:    vpmovzxwq {{.*#+}} ymm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; X64-NEXT:    retq
   %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 8, i8 9, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 10, i8 11, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
@@ -499,7 +503,7 @@ define <4 x i64> @combine_pshufb_as_zext128(<32 x i8> %a0) {
 ; X32-LABEL: combine_pshufb_as_zext128:
 ; X32:       # BB#0:
 ; X32-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
-; X32-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
+; X32-NEXT:    vperm2i128 {{.*#+}} ymm0 = ymm0[0,1,0,1]
 ; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[15,14],zero,zero,zero,zero,zero,zero,ymm0[13,12],zero,zero,zero,zero,zero,zero,ymm0[31,30],zero,zero,zero,zero,zero,zero,ymm0[29,28],zero,zero,zero,zero,zero,zero
 ; X32-NEXT:    retl
 ;
@@ -516,17 +520,29 @@ define <4 x i64> @combine_pshufb_as_zext128(<32 x i8> %a0) {
 }
 
 define <4 x double> @combine_pshufb_as_vzmovl_64(<4 x double> %a0) {
-; X32-LABEL: combine_pshufb_as_vzmovl_64:
-; X32:       # BB#0:
-; X32-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
-; X32-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_pshufb_as_vzmovl_64:
-; X64:       # BB#0:
-; X64-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
-; X64-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
-; X64-NEXT:    retq
+; X32-AVX2-LABEL: combine_pshufb_as_vzmovl_64:
+; X32-AVX2:       # BB#0:
+; X32-AVX2-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; X32-AVX2-NEXT:    retl
+;
+; X32-AVX512-LABEL: combine_pshufb_as_vzmovl_64:
+; X32-AVX512:       # BB#0:
+; X32-AVX512-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X32-AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; X32-AVX512-NEXT:    retl
+;
+; X64-AVX2-LABEL: combine_pshufb_as_vzmovl_64:
+; X64-AVX2:       # BB#0:
+; X64-AVX2-NEXT:    vxorpd %ymm1, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: combine_pshufb_as_vzmovl_64:
+; X64-AVX512:       # BB#0:
+; X64-AVX512-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT:    vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; X64-AVX512-NEXT:    retq
   %1 = bitcast <4 x double> %a0 to <32 x i8>
   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %3 = bitcast <32 x i8> %2 to <4 x double>
@@ -534,17 +550,29 @@ define <4 x double> @combine_pshufb_as_vzmovl_64(<4 x double> %a0) {
 }
 
 define <8 x float> @combine_pshufb_as_vzmovl_32(<8 x float> %a0) {
-; X32-LABEL: combine_pshufb_as_vzmovl_32:
-; X32:       # BB#0:
-; X32-NEXT:    vxorps %ymm1, %ymm1, %ymm1
-; X32-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; X32-NEXT:    retl
-;
-; X64-LABEL: combine_pshufb_as_vzmovl_32:
-; X64:       # BB#0:
-; X64-NEXT:    vxorps %ymm1, %ymm1, %ymm1
-; X64-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
-; X64-NEXT:    retq
+; X32-AVX2-LABEL: combine_pshufb_as_vzmovl_32:
+; X32-AVX2:       # BB#0:
+; X32-AVX2-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; X32-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; X32-AVX2-NEXT:    retl
+;
+; X32-AVX512-LABEL: combine_pshufb_as_vzmovl_32:
+; X32-AVX512:       # BB#0:
+; X32-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X32-AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X32-AVX512-NEXT:    retl
+;
+; X64-AVX2-LABEL: combine_pshufb_as_vzmovl_32:
+; X64-AVX2:       # BB#0:
+; X64-AVX2-NEXT:    vxorps %ymm1, %ymm1, %ymm1
+; X64-AVX2-NEXT:    vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7]
+; X64-AVX2-NEXT:    retq
+;
+; X64-AVX512-LABEL: combine_pshufb_as_vzmovl_32:
+; X64-AVX512:       # BB#0:
+; X64-AVX512-NEXT:    vxorps %xmm1, %xmm1, %xmm1
+; X64-AVX512-NEXT:    vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-AVX512-NEXT:    retq
   %1 = bitcast <8 x float> %a0 to <32 x i8>
   %2 = call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %1, <32 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>)
   %3 = bitcast <32 x i8> %2 to <8 x float>
@@ -664,6 +692,51 @@ define <32 x i8> @combine_pshufb_not_as_pshufw(<32 x i8> %a0) {
   ret <32 x i8> %res1
 }
 
+define <32 x i8> @combine_pshufb_as_unpacklo_undef(<32 x i8> %a0) {
+; X32-LABEL: combine_pshufb_as_unpacklo_undef:
+; X32:       # BB#0:
+; X32-NEXT:    retl
+;
+; X64-LABEL: combine_pshufb_as_unpacklo_undef:
+; X64:       # BB#0:
+; X64-NEXT:    retq
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 undef, i8 0, i8 undef, i8 1, i8 undef, i8 2, i8 undef, i8 3, i8 undef, i8 4, i8 undef, i8 5, i8 undef, i8 6, i8 undef, i8 7, i8 undef, i8 16, i8 undef, i8 17, i8 undef, i8 18, i8 undef, i8 19, i8 undef, i8 20, i8 undef, i8 21, i8 undef, i8 22, i8 undef, i8 23>)
+  %2 = shufflevector <32 x i8> %1, <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14, i32 16, i32 16, i32 18, i32 18, i32 20, i32 20, i32 22, i32 22, i32 24, i32 24, i32 26, i32 26, i32 28, i32 28, i32 30, i32 30>
+  ret <32 x i8> %2
+}
+
+define <32 x i8> @combine_pshufb_as_unpacklo_zero(<32 x i8> %a0) {
+; X32-LABEL: combine_pshufb_as_unpacklo_zero:
+; X32:       # BB#0:
+; X32-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; X32-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; X32-NEXT:    retl
+;
+; X64-LABEL: combine_pshufb_as_unpacklo_zero:
+; X64:       # BB#0:
+; X64-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; X64-NEXT:    vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11]
+; X64-NEXT:    retq
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 0, i8 1, i8 -1, i8 -1, i8 2, i8 3, i8 -1, i8 -1, i8 4, i8 5, i8 -1, i8 -1, i8 6, i8 7, i8 -1, i8 -1, i8 16, i8 17, i8 -1, i8 -1, i8 18, i8 19, i8 -1, i8 -1, i8 20, i8 21, i8 -1, i8 -1, i8 22, i8 23, i8 -1, i8 -1>)
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @combine_pshufb_as_unpackhi_zero(<32 x i8> %a0) {
+; X32-LABEL: combine_pshufb_as_unpackhi_zero:
+; X32:       # BB#0:
+; X32-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; X32-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; X32-NEXT:    retl
+;
+; X64-LABEL: combine_pshufb_as_unpackhi_zero:
+; X64:       # BB#0:
+; X64-NEXT:    vpxor %ymm1, %ymm1, %ymm1
+; X64-NEXT:    vpunpckhbw {{.*#+}} ymm0 = ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15],ymm1[24],ymm0[24],ymm1[25],ymm0[25],ymm1[26],ymm0[26],ymm1[27],ymm0[27],ymm1[28],ymm0[28],ymm1[29],ymm0[29],ymm1[30],ymm0[30],ymm1[31],ymm0[31]
+; X64-NEXT:    retq
+  %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> %a0, <32 x i8> <i8 -1, i8 8, i8 -1, i8 9, i8 -1, i8 10, i8 -1, i8 11, i8 -1, i8 12, i8 -1, i8 13, i8 -1, i8 14, i8 -1, i8 15, i8 -1, i8 24, i8 -1, i8 25, i8 -1, i8 26, i8 -1, i8 27, i8 -1, i8 28, i8 -1, i8 29, i8 -1, i8 30, i8 -1, i8 31>)
+  ret <32 x i8> %1
+}
+
 define <32 x i8> @combine_psrlw_pshufb(<16 x i16> %a0) {
 ; X32-LABEL: combine_psrlw_pshufb:
 ; X32:       # BB#0:
@@ -712,6 +785,59 @@ define <32 x i8> @combine_psrlq_pshufb(<4 x i64> %a0) {
   ret <32 x i8> %3
 }
 
+define <32 x i8> @combine_unpack_unpack_pshufb(<32 x i8> %a0) {
+; X32-LABEL: combine_unpack_unpack_pshufb:
+; X32:       # BB#0:
+; X32-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
+; X32-NEXT:    retl
+;
+; X64-LABEL: combine_unpack_unpack_pshufb:
+; X64:       # BB#0:
+; X64-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,0,4,8,1,1,5,9,2,2,6,10,3,3,7,11,16,16,20,24,17,17,21,25,18,18,22,26,19,19,23,27]
+; X64-NEXT:    retq
+  %1 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19, i32 16, i32 17, i32 18, i32 19>
+  %2 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %3 = shufflevector <32 x i8> %a0, <32 x i8> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 24, i32 25, i32 26, i32 27, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %4 = shufflevector <32 x i8> %1, <32 x i8> %2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %5 = shufflevector <32 x i8> %1, <32 x i8> %3, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %6 = shufflevector <32 x i8> %4, <32 x i8> %5, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55>
+  ret <32 x i8> %6
+}
+
+define <16 x i8> @combine_pshufb_insertion_as_broadcast_v2i64(i64 %a0) {
+; X32-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
+; X32:       # BB#0:
+; X32-NEXT:    vpbroadcastq {{[0-9]+}}(%esp), %xmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: combine_pshufb_insertion_as_broadcast_v2i64:
+; X64:       # BB#0:
+; X64-NEXT:    vmovq %rdi, %xmm0
+; X64-NEXT:    vpbroadcastq %xmm0, %xmm0
+; X64-NEXT:    retq
+  %1 = insertelement <2 x i64> undef, i64 %a0, i32 0
+  %2 = bitcast <2 x i64> %1 to <16 x i8>
+  %3 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %2, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 0, i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7>)
+  ret <16 x i8> %3
+}
+
+define <8 x i32> @combine_permd_insertion_as_broadcast_v4i64(i64 %a0) {
+; X32-LABEL: combine_permd_insertion_as_broadcast_v4i64:
+; X32:       # BB#0:
+; X32-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %ymm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: combine_permd_insertion_as_broadcast_v4i64:
+; X64:       # BB#0:
+; X64-NEXT:    vmovq %rdi, %xmm0
+; X64-NEXT:    vpbroadcastq %xmm0, %ymm0
+; X64-NEXT:    retq
+  %1 = insertelement <4 x i64> undef, i64 %a0, i32 0
+  %2 = bitcast <4 x i64> %1 to <8 x i32>
+  %3 = tail call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %2, <8 x i32> <i32 0, i32 1, i32 0, i32 1, i32 0, i32 1, i32 0, i32 1>)
+  ret <8 x i32> %3
+}
+
 define <8 x i32> @constant_fold_permd() {
 ; X32-LABEL: constant_fold_permd:
 ; X32:       # BB#0:
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
index 687098f9abf3a8b6850d79f1289ba6a97ce5ede2..b68f609fc65dcf8059629a81a3858d883e5b3c9f 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll
@@ -51,7 +51,7 @@ define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x d
 ;
 ; X64-LABEL: combine_permvar_8f64_identity_mask:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
 ; X64-NEXT:    vpermpd %zmm0, %zmm2, %zmm1 {%k1}
 ; X64-NEXT:    vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
@@ -66,10 +66,6 @@ define <8 x double> @combine_permvar_8f64_identity_mask(<8 x double> %x0, <8 x d
 define <8 x i64> @combine_permvar_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
 ; X32-LABEL: combine_permvar_8i64_identity:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
-; X32-NEXT:    vpermq %zmm0, %zmm1, %zmm0
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,0,14,0,5,0,12,0,3,0,10,0,1,0,8,0]
-; X32-NEXT:    vpermq %zmm0, %zmm1, %zmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_permvar_8i64_identity:
@@ -93,7 +89,7 @@ define <8 x i64> @combine_permvar_8i64_identity_mask(<8 x i64> %x0, <8 x i64> %x
 ;
 ; X64-LABEL: combine_permvar_8i64_identity_mask:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
 ; X64-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
 ; X64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
@@ -130,7 +126,7 @@ define <8 x double> @combine_vpermt2var_8f64_identity_mask(<8 x double> %x0, <8
 ;
 ; X64-LABEL: combine_vpermt2var_8f64_identity_mask:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovapd {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
 ; X64-NEXT:    vpermi2pd %zmm1, %zmm0, %zmm2 {%k1} {z}
 ; X64-NEXT:    vmovapd {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
@@ -179,7 +175,7 @@ define <8 x double> @combine_vpermt2var_8f64_movddup_mask(<8 x double> %x0, <8 x
 ;
 ; X64-LABEL: combine_vpermt2var_8f64_movddup_mask:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6]
 ; X64-NEXT:    retq
   %res0 = call <8 x double> @llvm.x86.avx512.maskz.vpermt2var.pd.512(<8 x i64> <i64 0, i64 0, i64 2, i64 2, i64 4, i64 4, i64 6, i64 6>, <8 x double> %x0, <8 x double> %x1, i8 %m)
@@ -189,10 +185,6 @@ define <8 x double> @combine_vpermt2var_8f64_movddup_mask(<8 x double> %x0, <8 x
 define <8 x i64> @combine_vpermt2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
 ; X32-LABEL: combine_vpermt2var_8i64_identity:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,u,6,0,5,0,4,0,3,0,2,0,1,0,0,0>
-; X32-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <u,u,14,0,5,0,12,0,3,0,10,0,1,0,8,0>
-; X32-NEXT:    vpermi2q %zmm2, %zmm2, %zmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_8i64_identity:
@@ -215,7 +207,7 @@ define <8 x i64> @combine_vpermt2var_8i64_identity_mask(<8 x i64> %x0, <8 x i64>
 ;
 ; X64-LABEL: combine_vpermt2var_8i64_identity_mask:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [7,6,5,4,3,2,1,0]
 ; X64-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2 {%k1} {z}
 ; X64-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,14,5,12,3,10,1,8]
@@ -250,7 +242,7 @@ define <16 x float> @combine_vpermt2var_16f32_identity_mask(<16 x float> %x0, <1
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_identity_mask:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovaps {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X64-NEXT:    vpermi2ps %zmm1, %zmm0, %zmm2 {%k1} {z}
 ; X64-NEXT:    vmovaps {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
@@ -307,7 +299,7 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask(<16 x float> %x0, <1
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovaps {{.*#+}} zmm2 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
 ; X64-NEXT:    vpermt2ps %zmm1, %zmm2, %zmm0 {%k1} {z}
 ; X64-NEXT:    retq
@@ -327,7 +319,7 @@ define <16 x float> @combine_vpermt2var_16f32_vmovddup_mask_load(<16 x float> *%
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovddup_mask_load:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %esi, %k1
+; X64-NEXT:    kmovd %esi, %k1
 ; X64-NEXT:    vmovaps (%rdi), %zmm2
 ; X64-NEXT:    vmovaps {{.*#+}} zmm1 = [0,1,0,1,4,5,4,5,8,9,8,9,12,13,12,13]
 ; X64-NEXT:    vpermi2ps %zmm0, %zmm2, %zmm1 {%k1} {z}
@@ -375,7 +367,7 @@ define <16 x float> @combine_vpermt2var_16f32_vmovshdup_mask(<16 x float> %x0, <
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovshdup_mask:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15]
 ; X64-NEXT:    retq
   %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>, <16 x float> %x0, <16 x float> %x1, i16 %m)
@@ -419,7 +411,7 @@ define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask(<16 x float> %x0, <
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_mask:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; X64-NEXT:    retq
   %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 undef, i32 0, i32 undef, i32 2, i32 4, i32 4, i32 6, i32 6, i32 8, i32 8, i32 10, i32 10, i32 12, i32 12, i32 14, i32 14>, <16 x float> %x0, <16 x float> %x1, i16 %m)
@@ -435,7 +427,7 @@ define <16 x float> @combine_vpermt2var_16f32_vmovsldup_mask_load(<16 x float> *
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vmovsldup_mask_load:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %esi, %k1
+; X64-NEXT:    kmovd %esi, %k1
 ; X64-NEXT:    vmovsldup {{.*#+}} zmm0 {%k1} {z} = mem[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14]
 ; X64-NEXT:    retq
   %x0 = load <16 x float>, <16 x float> *%p0
@@ -480,7 +472,7 @@ define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask(<16 x float> %x0, <
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vpermilps_mask:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = zmm0[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; X64-NEXT:    retq
   %res0 = call <16 x float> @llvm.x86.avx512.maskz.vpermt2var.ps.512(<16 x i32> <i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 11, i32 10, i32 9, i32 8, i32 15, i32 14, i32 13, i32 12>, <16 x float> %x0, <16 x float> %x1, i16 %m)
@@ -496,7 +488,7 @@ define <16 x float> @combine_vpermt2var_16f32_vpermilps_mask_load(<16 x float> *
 ;
 ; X64-LABEL: combine_vpermt2var_16f32_vpermilps_mask_load:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %esi, %k1
+; X64-NEXT:    kmovd %esi, %k1
 ; X64-NEXT:    vpermilps {{.*#+}} zmm0 {%k1} {z} = mem[3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12]
 ; X64-NEXT:    retq
   %x0 = load <16 x float>, <16 x float> *%p0
@@ -528,7 +520,7 @@ define <16 x i32> @combine_vpermt2var_16i32_identity_mask(<16 x i32> %x0, <16 x
 ;
 ; X64-LABEL: combine_vpermt2var_16i32_identity_mask:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovdqa32 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X64-NEXT:    vpermi2d %zmm1, %zmm0, %zmm2 {%k1} {z}
 ; X64-NEXT:    vmovdqa32 {{.*#+}} zmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
@@ -648,8 +640,7 @@ define <16 x i32> @combine_permvar_as_vpbroadcastd512(<16 x i32> %x0) {
 define <8 x i64> @combine_permvar_as_vpbroadcastq512(<8 x i64> %x0) {
 ; X32-LABEL: combine_permvar_as_vpbroadcastq512:
 ; X32:       # BB#0:
-; X32-NEXT:    vpxord %zmm1, %zmm1, %zmm1
-; X32-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; X32-NEXT:    vbroadcastsd %xmm0, %zmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_permvar_as_vpbroadcastq512:
@@ -663,8 +654,7 @@ define <8 x i64> @combine_permvar_as_vpbroadcastq512(<8 x i64> %x0) {
 define <8 x i64> @combine_permvar_8i64_as_permq(<8 x i64> %x0, <8 x i64> %x1) {
 ; X32-LABEL: combine_permvar_8i64_as_permq:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = <3,0,2,0,1,0,u,u,u,u,6,0,5,0,4,0>
-; X32-NEXT:    vpermq %zmm0, %zmm1, %zmm0
+; X32-NEXT:    vpermq {{.*#+}} zmm0 = zmm0[3,2,1,0,7,6,5,4]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_permvar_8i64_as_permq:
@@ -679,14 +669,13 @@ define <8 x i64> @combine_permvar_8i64_as_permq_mask(<8 x i64> %x0, <8 x i64> %x
 ; X32:       # BB#0:
 ; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    kmovd %eax, %k1
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <3,0,2,0,1,0,u,u,u,u,6,0,5,0,4,0>
-; X32-NEXT:    vpermq %zmm0, %zmm2, %zmm1 {%k1}
+; X32-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
 ; X32-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_permvar_8i64_as_permq_mask:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vpermq {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
 ; X64-NEXT:    vmovdqa64 %zmm1, %zmm0
 ; X64-NEXT:    retq
@@ -718,7 +707,7 @@ define <8 x double> @combine_permvar_8f64_as_permpd_mask(<8 x double> %x0, <8 x
 ;
 ; X64-LABEL: combine_permvar_8f64_as_permpd_mask:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vpermpd {{.*#+}} zmm1 {%k1} = zmm0[3,2,1,0,7,6,5,4]
 ; X64-NEXT:    vmovapd %zmm1, %zmm0
 ; X64-NEXT:    retq
@@ -872,10 +861,6 @@ define <8 x double> @combine_vpermi2var_8f64_as_shufpd(<8 x double> %x0, <8 x do
 define <8 x i64> @combine_vpermi2var_8i64_identity(<8 x i64> %x0, <8 x i64> %x1) {
 ; X32-LABEL: combine_vpermi2var_8i64_identity:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <u,u,6,0,5,0,4,0,3,0,2,0,1,0,0,0>
-; X32-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = <u,u,14,0,5,0,12,0,3,0,10,0,1,0,8,0>
-; X32-NEXT:    vpermi2q %zmm2, %zmm2, %zmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermi2var_8i64_identity:
@@ -973,10 +958,8 @@ define <8 x double> @combine_vpermi2var_8f64_as_vpermpd(<8 x double> %x0, <8 x d
 define <8 x i64> @combine_vpermt2var_8i64_as_vpermq(<8 x i64> %x0, <8 x i64> %x1) {
 ; X32-LABEL: combine_vpermt2var_8i64_as_vpermq:
 ; X32:       # BB#0:
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [3,0,2,0,1,0,0,0,7,0,6,0,5,0,4,0]
-; X32-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
-; X32-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [12,0,5,0,14,0,7,0,8,0,1,0,10,0,3,0]
-; X32-NEXT:    vpermi2q %zmm2, %zmm2, %zmm0
+; X32-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,0,6,0,5,0,4,0,3,0,2,0,1,0,0,0]
+; X32-NEXT:    vpermq %zmm0, %zmm1, %zmm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: combine_vpermt2var_8i64_as_vpermq:
@@ -1133,3 +1116,18 @@ define <16 x float> @combine_vpermi2var_vpermvar_16f32_as_vperm2_zero(<16 x floa
   ret <16 x float> %res1
 }
 
+define <8 x i64> @combine_vpermvar_insertion_as_broadcast_v8i64(i64 %a0) {
+; X32-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64:
+; X32:       # BB#0:
+; X32-NEXT:    vbroadcastsd {{[0-9]+}}(%esp), %zmm0
+; X32-NEXT:    retl
+;
+; X64-LABEL: combine_vpermvar_insertion_as_broadcast_v8i64:
+; X64:       # BB#0:
+; X64-NEXT:    vmovq %rdi, %xmm0
+; X64-NEXT:    vpbroadcastq %xmm0, %zmm0
+; X64-NEXT:    retq
+  %1 = insertelement <8 x i64> undef, i64 %a0, i32 0
+  %2 = tail call <8 x i64> @llvm.x86.avx512.mask.permvar.di.512(<8 x i64> %1, <8 x i64> zeroinitializer, <8 x i64> undef, i8 -1)
+  ret <8 x i64> %2
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
index ab10ba32e60567e96f35bf0779c604bf3a7781ce..954dbe5edc63e0467372d044a8a6d70c26b99b24 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512bwvl.ll
@@ -30,7 +30,7 @@ define <16 x i16> @combine_vpermt2var_16i16_identity_mask(<16 x i16> %x0, <16 x
 ;
 ; X64-LABEL: combine_vpermt2var_16i16_identity_mask:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovdqu {{.*#+}} ymm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X64-NEXT:    vpermi2w %ymm1, %ymm0, %ymm2 {%k1} {z}
 ; X64-NEXT:    vmovdqu {{.*#+}} ymm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
diff --git a/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll b/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
index 2f58fa830d5db2fd51b55f8888f9c766bcdda55c..ad6b5ee0549466b716edd87093c6cd8e5d8cfef9 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll
@@ -45,7 +45,7 @@ define <16 x i8> @combine_vpermt2var_16i8_identity_mask(<16 x i8> %x0, <16 x i8>
 ;
 ; X64-LABEL: combine_vpermt2var_16i8_identity_mask:
 ; X64:       # BB#0:
-; X64-NEXT:    kmovw %edi, %k1
+; X64-NEXT:    kmovd %edi, %k1
 ; X64-NEXT:    vmovdqu {{.*#+}} xmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0]
 ; X64-NEXT:    vpermi2b %xmm1, %xmm0, %xmm2 {%k1} {z}
 ; X64-NEXT:    vmovdqu {{.*#+}} xmm0 = [15,30,13,28,11,26,9,24,7,22,5,20,3,18,1,16]
diff --git a/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
new file mode 100644
index 0000000000000000000000000000000000000000..29e2124a168c1cf31f708c6746183a9a0fd81da9
--- /dev/null
+++ b/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
@@ -0,0 +1,23 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512F
+;
+; Combine tests involving SSE41 target shuffles (BLEND,INSERTPS,MOVZX)
+
+declare <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8>, <16 x i8>)
+
+define <16 x i8> @combine_vpshufb_as_movzx(<16 x i8> %a0) {
+; SSE-LABEL: combine_vpshufb_as_movzx:
+; SSE:       # BB#0:
+; SSE-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_vpshufb_as_movzx:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX-NEXT:    retq
+  %res0 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 undef, i8 undef, i8 -1, i8 -1, i8 -1, i8 -1>)
+  ret <16 x i8> %res0
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
index 7e29a48d5cd589717ccb5daacdf52e7cac9cb32d..546b731260396f6246887ea18fe170ec4fde4f50 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -473,6 +473,58 @@ define <16 x i8> @combine_pshufb_as_unary_unpckhwd(<16 x i8> %a0) {
   ret <16 x i8> %1
 }
 
+define <8 x i16> @combine_pshufb_as_unpacklo_undef(<16 x i8> %a0) {
+; ALL-LABEL: combine_pshufb_as_unpacklo_undef:
+; ALL:       # BB#0:
+; ALL-NEXT:    retq
+  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 undef, i8 undef, i8 0, i8 1, i8 undef, i8 undef, i8 2, i8 3, i8 undef, i8 undef, i8 4, i8 5, i8 undef, i8 undef, i8 6, i8 7>)
+  %2 = bitcast <16 x i8> %1 to <8 x i16>
+  %3 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> <i32 0, i32 0, i32 2, i32 2, i32 4, i32 4, i32 6, i32 6>
+  ret <8 x i16> %3
+}
+
+define <16 x i8> @combine_pshufb_as_unpackhi_undef(<16 x i8> %a0) {
+; ALL-LABEL: combine_pshufb_as_unpackhi_undef:
+; ALL:       # BB#0:
+; ALL-NEXT:    retq
+  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 undef, i8 9, i8 undef, i8 10, i8 undef, i8 11, i8 undef, i8 12, i8 undef, i8 13, i8 undef, i8 14, i8 undef, i8 15, i8 undef>)
+  %2 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+  ret <16 x i8> %2
+}
+
+define <16 x i8> @combine_pshufb_as_unpacklo_zero(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_as_unpacklo_zero:
+; SSE:       # BB#0:
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; SSE-NEXT:    movdqa %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_pshufb_as_unpacklo_zero:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
+; AVX-NEXT:    retq
+  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 0, i8 1, i8 2, i8 3, i8 -1, i8 -1, i8 -1, i8 -1, i8 4, i8 5, i8 6, i8 7>)
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @combine_pshufb_as_unpackhi_zero(<16 x i8> %a0) {
+; SSE-LABEL: combine_pshufb_as_unpackhi_zero:
+; SSE:       # BB#0:
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: combine_pshufb_as_unpackhi_zero:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX-NEXT:    vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX-NEXT:    retq
+  %1 = call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %a0, <16 x i8> <i8 8, i8 -1, i8 9, i8 -1, i8 10, i8 -1, i8 11, i8 -1, i8 12, i8 -1, i8 13, i8 -1, i8 14, i8 -1, i8 15, i8 -1>)
+  ret <16 x i8> %1
+}
+
 define <16 x i8> @combine_psrlw_pshufb(<8 x i16> %a0) {
 ; SSE-LABEL: combine_psrlw_pshufb:
 ; SSE:       # BB#0:
@@ -552,6 +604,27 @@ define <16 x i8> @combine_unpckl_arg1_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
   ret <16 x i8> %2
 }
 
+define <8 x i16> @shuffle_combine_unpack_insert(<8 x i16> %a0) {
+; SSE-LABEL: shuffle_combine_unpack_insert:
+; SSE:       # BB#0:
+; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,8,9,8,9,10,11,10,11]
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: shuffle_combine_unpack_insert:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,8,9,8,9,10,11,10,11]
+; AVX-NEXT:    retq
+  %1 = extractelement <8 x i16> %a0, i32 2
+  %2 = extractelement <8 x i16> %a0, i32 4
+  %3 = insertelement <8 x i16> %a0, i16 %1, i32 4
+  %4 = insertelement <8 x i16> %a0, i16 %2, i32 2
+  %5 = shufflevector <8 x i16> %3, <8 x i16> %4, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  %6 = shufflevector <8 x i16> %5, <8 x i16> %3, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
+  %7 = shufflevector <8 x i16> %5, <8 x i16> %a0, <8 x i32> <i32 4, i32 12, i32 5, i32 13, i32 undef, i32 undef, i32 undef, i32 undef>
+  %8 = shufflevector <8 x i16> %6, <8 x i16> %7, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11>
+  ret <8 x i16> %8
+}
+
 define <16 x i8> @constant_fold_pshufb() {
 ; SSE-LABEL: constant_fold_pshufb:
 ; SSE:       # BB#0:
@@ -565,3 +638,38 @@ define <16 x i8> @constant_fold_pshufb() {
   %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> <i8 15, i8 14, i8 13, i8 12, i8 11, i8 10, i8 9, i8 8, i8 7, i8 6, i8 5, i8 4, i8 3, i8 2, i8 1, i8 0>, <16 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 undef, i8 undef, i8 -1, i8 -1, i8 15, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 7, i8 6>)
   ret <16 x i8> %1
 }
+
+; FIXME - unnecessary pshufb/broadcast being used - pshufb mask only needs lowest byte.
+define <16 x i8> @constant_fold_pshufb_2() {
+; SSE-LABEL: constant_fold_pshufb_2:
+; SSE:       # BB#0:
+; SSE-NEXT:    movl $2, %eax
+; SSE-NEXT:    movd %eax, %xmm0
+; SSE-NEXT:    pxor %xmm1, %xmm1
+; SSE-NEXT:    pshufb %xmm1, %xmm0
+; SSE-NEXT:    retq
+;
+; AVX1-LABEL: constant_fold_pshufb_2:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    movl $2, %eax
+; AVX1-NEXT:    vmovd %eax, %xmm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpshufb %xmm1, %xmm0, %xmm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: constant_fold_pshufb_2:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    movl $2, %eax
+; AVX2-NEXT:    vmovd %eax, %xmm0
+; AVX2-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: constant_fold_pshufb_2:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    movl $2, %eax
+; AVX512F-NEXT:    vmovd %eax, %xmm0
+; AVX512F-NEXT:    vpbroadcastb %xmm0, %xmm0
+; AVX512F-NEXT:    retq
+  %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> <i8 2, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>, <16 x i8> <i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef>)
+  ret <16 x i8> %1
+}
diff --git a/test/CodeGen/X86/vector-shuffle-combining-xop.ll b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
index 73bb5b1f46e8e6ef78211f6b0547b2db45c50b7c..a9dff916431654f50115a78ad6ee8aef318b76ee 100644
--- a/test/CodeGen/X86/vector-shuffle-combining-xop.ll
+++ b/test/CodeGen/X86/vector-shuffle-combining-xop.ll
@@ -318,6 +318,7 @@ define <4 x i32> @combine_vpperm_10zz32BA(<4 x i32> %a0, <4 x i32> %a1) {
   ret <4 x i32> %res3
 }
 
+; FIXME: Duplicated load in i686
 define void @buildvector_v4f32_0404(float %a, float %b, <4 x float>* %ptr) {
 ; X32-LABEL: buildvector_v4f32_0404:
 ; X32:       # BB#0:
@@ -341,6 +342,30 @@ define void @buildvector_v4f32_0404(float %a, float %b, <4 x float>* %ptr) {
   ret void
 }
 
+define void @buildvector_v4f32_07z6(float %a, <4 x float> %b, <4 x float>* %ptr) {
+; X32-LABEL: buildvector_v4f32_07z6:
+; X32:       # BB#0:
+; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X32-NEXT:    vpermil2ps {{.*#+}} xmm0 = xmm1[0],xmm0[3],zero,xmm0[2]
+; X32-NEXT:    vmovaps %xmm0, (%eax)
+; X32-NEXT:    retl
+;
+; X64-LABEL: buildvector_v4f32_07z6:
+; X64:       # BB#0:
+; X64-NEXT:    vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[3],zero,xmm1[2]
+; X64-NEXT:    vmovaps %xmm0, (%rdi)
+; X64-NEXT:    retq
+  %b2 = extractelement <4 x float> %b, i32 2
+  %b3 = extractelement <4 x float> %b, i32 3
+  %v0 = insertelement <4 x float> undef, float  %a, i32 0
+  %v1 = insertelement <4 x float> %v0,   float %b3, i32 1
+  %v2 = insertelement <4 x float> %v1,   float 0.0, i32 2
+  %v3 = insertelement <4 x float> %v2,   float %b2, i32 3
+  store <4 x float> %v3, <4 x float>* %ptr
+  ret void
+}
+
 define <2 x double> @constant_fold_vpermil2pd() {
 ; X32-LABEL: constant_fold_vpermil2pd:
 ; X32:       # BB#0:
@@ -416,16 +441,14 @@ define <4 x float> @PR31296(i8* %in) {
 ; X32:       # BB#0: # %entry
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X32-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X32-NEXT:    vmovaps {{.*#+}} xmm1 = <0,1,u,u>
-; X32-NEXT:    vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0,0,1]
+; X32-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,mem[0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: PR31296:
 ; X64:       # BB#0: # %entry
 ; X64-NEXT:    movl (%rdi), %eax
 ; X64-NEXT:    vmovq %rax, %xmm0
-; X64-NEXT:    vmovaps {{.*#+}} xmm1 = <0,1,u,u>
-; X64-NEXT:    vpermil2ps {{.*#+}} xmm0 = xmm0[0],xmm1[0,0,1]
+; X64-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,mem[0]
 ; X64-NEXT:    retq
 entry:
   %0 = getelementptr i8, i8* %in, i32 0
diff --git a/test/CodeGen/X86/vector-shuffle-masked.ll b/test/CodeGen/X86/vector-shuffle-masked.ll
index 3ae5029fe90582f492ad90cbfaa6f4ff01f78643..cedec449f6f426bd0d09b7dcdda21dfeb049f2f4 100644
--- a/test/CodeGen/X86/vector-shuffle-masked.ll
+++ b/test/CodeGen/X86/vector-shuffle-masked.ll
@@ -4,7 +4,7 @@
 define <4 x i32> @mask_shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_shuffle_v4i32_1234:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignd {{.*#+}} xmm2 {%k1} = xmm0[1,2,3],xmm1[0]
 ; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -18,7 +18,7 @@ define <4 x i32> @mask_shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b, <4 x i32>
 define <4 x i32> @maskz_shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
 ; CHECK-LABEL: maskz_shuffle_v4i32_1234:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignd {{.*#+}} xmm0 {%k1} {z} = xmm0[1,2,3],xmm1[0]
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
@@ -31,7 +31,7 @@ define <4 x i32> @maskz_shuffle_v4i32_1234(<4 x i32> %a, <4 x i32> %b, i8 %mask)
 define <4 x i32> @mask_shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b, <4 x i32> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_shuffle_v4i32_2345:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignd {{.*#+}} xmm2 {%k1} = xmm0[2,3],xmm1[0,1]
 ; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -45,7 +45,7 @@ define <4 x i32> @mask_shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b, <4 x i32>
 define <4 x i32> @maskz_shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b, i8 %mask) {
 ; CHECK-LABEL: maskz_shuffle_v4i32_2345:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignd {{.*#+}} xmm0 {%k1} {z} = xmm0[2,3],xmm1[0,1]
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 4, i32 5>
@@ -58,7 +58,7 @@ define <4 x i32> @maskz_shuffle_v4i32_2345(<4 x i32> %a, <4 x i32> %b, i8 %mask)
 define <2 x i64> @mask_shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b, <2 x i64> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_shuffle_v2i64_12:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignq {{.*#+}} xmm2 {%k1} = xmm0[1],xmm1[0]
 ; CHECK-NEXT:    vmovdqa %xmm2, %xmm0
 ; CHECK-NEXT:    retq
@@ -72,7 +72,7 @@ define <2 x i64> @mask_shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b, <2 x i64> %p
 define <2 x i64> @maskz_shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: maskz_shuffle_v2i64_12:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignq {{.*#+}} xmm0 {%k1} {z} = xmm0[1],xmm1[0]
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <2 x i64> %a, <2 x i64> %b, <2 x i32> <i32 1, i32 2>
@@ -85,7 +85,7 @@ define <2 x i64> @maskz_shuffle_v2i64_12(<2 x i64> %a, <2 x i64> %b, i8 %mask) {
 define <4 x i64> @mask_shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b, <4 x i64> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_shuffle_v4i64_1234:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignq {{.*#+}} ymm2 {%k1} = ymm0[1,2,3],ymm1[0]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -99,7 +99,7 @@ define <4 x i64> @mask_shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b, <4 x i64>
 define <4 x i64> @maskz_shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b, i8 %mask) {
 ; CHECK-LABEL: maskz_shuffle_v4i64_1234:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,3],ymm1[0]
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
@@ -112,7 +112,7 @@ define <4 x i64> @maskz_shuffle_v4i64_1234(<4 x i64> %a, <4 x i64> %b, i8 %mask)
 define <4 x i64> @mask_shuffle_v4i64_1230(<4 x i64> %a, <4 x i64> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_shuffle_v4i64_1230:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm1 {%k1} = ymm0[1,2,3,0]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -126,7 +126,7 @@ define <4 x i64> @mask_shuffle_v4i64_1230(<4 x i64> %a, <4 x i64> %passthru, i8
 define <4 x i64> @maskz_shuffle_v4i64_1230(<4 x i64> %a, i8 %mask) {
 ; CHECK-LABEL: maskz_shuffle_v4i64_1230:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,3,0]
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> <i32 1, i32 2, i32 3, i32 0>
@@ -139,7 +139,7 @@ define <4 x i64> @maskz_shuffle_v4i64_1230(<4 x i64> %a, i8 %mask) {
 define <8 x i32> @mask_shuffle_v8i32_12345678(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_shuffle_v8i32_12345678:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignd {{.*#+}} ymm2 {%k1} = ymm0[1,2,3,4,5,6,7],ymm1[0]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -152,7 +152,7 @@ define <8 x i32> @mask_shuffle_v8i32_12345678(<8 x i32> %a, <8 x i32> %b, <8 x i
 define <8 x i32> @maskz_shuffle_v8i32_12345678(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
 ; CHECK-LABEL: maskz_shuffle_v8i32_12345678:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,3,4,5,6,7],ymm1[0]
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>
@@ -164,7 +164,7 @@ define <8 x i32> @maskz_shuffle_v8i32_12345678(<8 x i32> %a, <8 x i32> %b, i8 %m
 define <8 x i32> @mask_shuffle_v8i32_23456789(<8 x i32> %a, <8 x i32> %b, <8 x i32> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_shuffle_v8i32_23456789:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignd {{.*#+}} ymm2 {%k1} = ymm0[2,3,4,5,6,7],ymm1[0,1]
 ; CHECK-NEXT:    vmovdqa %ymm2, %ymm0
 ; CHECK-NEXT:    retq
@@ -177,7 +177,7 @@ define <8 x i32> @mask_shuffle_v8i32_23456789(<8 x i32> %a, <8 x i32> %b, <8 x i
 define <8 x i32> @maskz_shuffle_v8i32_23456789(<8 x i32> %a, <8 x i32> %b, i8 %mask) {
 ; CHECK-LABEL: maskz_shuffle_v8i32_23456789:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignd {{.*#+}} ymm0 {%k1} {z} = ymm0[2,3,4,5,6,7],ymm1[0,1]
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9>
@@ -189,7 +189,7 @@ define <8 x i32> @maskz_shuffle_v8i32_23456789(<8 x i32> %a, <8 x i32> %b, i8 %m
 define <8 x i32> @mask_shuffle_v8i32_12345670(<8 x i32> %a, <8 x i32> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_shuffle_v8i32_12345670:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignd {{.*#+}} ymm1 {%k1} = ymm0[1,2,3,4,5,6,7,0]
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -202,7 +202,7 @@ define <8 x i32> @mask_shuffle_v8i32_12345670(<8 x i32> %a, <8 x i32> %passthru,
 define <8 x i32> @maskz_shuffle_v8i32_12345670(<8 x i32> %a, i8 %mask) {
 ; CHECK-LABEL: maskz_shuffle_v8i32_12345670:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    valignd {{.*#+}} ymm0 {%k1} {z} = ymm0[1,2,3,4,5,6,7,0]
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0>
@@ -215,7 +215,7 @@ define <8 x i32> @mask_shuffle_v8i32_23456701(<8 x i32> %a, <8 x i32> %passthru,
 ; CHECK-LABEL: mask_shuffle_v8i32_23456701:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0]
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1>
@@ -228,7 +228,7 @@ define <8 x i32> @maskz_shuffle_v8i32_23456701(<8 x i32> %a, i8 %mask) {
 ; CHECK-LABEL: maskz_shuffle_v8i32_23456701:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[1,2,3,0]
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vmovdqa32 %ymm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 0, i32 1>
@@ -240,9 +240,10 @@ define <8 x i32> @maskz_shuffle_v8i32_23456701(<8 x i32> %a, i8 %mask) {
 define <4 x i32> @mask_extract_v16i32_v4i32_0(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v16i32_v4i32_0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextracti32x4 $0, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -254,9 +255,10 @@ define <4 x i32> @mask_extract_v16i32_v4i32_0(<16 x i32> %a, <4 x i32> %passthru
 define <4 x i32> @mask_extract_v16i32_v4i32_1(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v16i32_v4i32_1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextracti32x4 $1, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -268,9 +270,10 @@ define <4 x i32> @mask_extract_v16i32_v4i32_1(<16 x i32> %a, <4 x i32> %passthru
 define <4 x i32> @mask_extract_v16i32_v4i32_2(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v16i32_v4i32_2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextracti32x4 $2, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -282,9 +285,10 @@ define <4 x i32> @mask_extract_v16i32_v4i32_2(<16 x i32> %a, <4 x i32> %passthru
 define <4 x i32> @mask_extract_v16i32_v4i32_3(<16 x i32> %a, <4 x i32> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v16i32_v4i32_3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextracti32x4 $3, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -296,9 +300,10 @@ define <4 x i32> @mask_extract_v16i32_v4i32_3(<16 x i32> %a, <4 x i32> %passthru
 define <4 x float> @mask_extract_v16f32_v4f32_0(<16 x float> %a, <4 x float> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v16f32_v4f32_0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextractf32x4 $0, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -310,9 +315,10 @@ define <4 x float> @mask_extract_v16f32_v4f32_0(<16 x float> %a, <4 x float> %pa
 define <4 x float> @mask_extract_v16f32_v4f32_1(<16 x float> %a, <4 x float> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v16f32_v4f32_1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextractf32x4 $1, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -324,9 +330,10 @@ define <4 x float> @mask_extract_v16f32_v4f32_1(<16 x float> %a, <4 x float> %pa
 define <4 x float> @mask_extract_v16f32_v4f32_2(<16 x float> %a, <4 x float> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v16f32_v4f32_2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextractf32x4 $2, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -338,9 +345,10 @@ define <4 x float> @mask_extract_v16f32_v4f32_2(<16 x float> %a, <4 x float> %pa
 define <4 x float> @mask_extract_v16f32_v4f32_3(<16 x float> %a, <4 x float> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v16f32_v4f32_3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextractf32x4 $3, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -352,7 +360,7 @@ define <4 x float> @mask_extract_v16f32_v4f32_3(<16 x float> %a, <4 x float> %pa
 define <8 x i32> @mask_extract_v16i32_v8i32_0(<16 x i32> %a, <8 x i32> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v16i32_v8i32_0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextracti32x8 $0, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -365,7 +373,7 @@ define <8 x i32> @mask_extract_v16i32_v8i32_0(<16 x i32> %a, <8 x i32> %passthru
 define <8 x i32> @mask_extract_v16i32_v8i32_1(<16 x i32> %a, <8 x i32> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v16i32_v8i32_1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextracti32x8 $1, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -378,7 +386,7 @@ define <8 x i32> @mask_extract_v16i32_v8i32_1(<16 x i32> %a, <8 x i32> %passthru
 define <8 x float> @mask_extract_v16f32_v8f32_0(<16 x float> %a, <8 x float> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v16f32_v8f32_0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextractf32x8 $0, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -391,7 +399,7 @@ define <8 x float> @mask_extract_v16f32_v8f32_0(<16 x float> %a, <8 x float> %pa
 define <8 x float> @mask_extract_v16f32_v8f32_1(<16 x float> %a, <8 x float> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v16f32_v8f32_1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextractf32x8 $1, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -404,9 +412,10 @@ define <8 x float> @mask_extract_v16f32_v8f32_1(<16 x float> %a, <8 x float> %pa
 define <2 x i64> @mask_extract_v8i64_v2i64_0(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v8i64_v2i64_0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextracti64x2 $0, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 0, i32 1>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -418,9 +427,10 @@ define <2 x i64> @mask_extract_v8i64_v2i64_0(<8 x i64> %a, <2 x i64> %passthru,
 define <2 x i64> @mask_extract_v8i64_v2i64_1(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v8i64_v2i64_1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextracti64x2 $1, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -432,9 +442,10 @@ define <2 x i64> @mask_extract_v8i64_v2i64_1(<8 x i64> %a, <2 x i64> %passthru,
 define <2 x i64> @mask_extract_v8i64_v2i64_2(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v8i64_v2i64_2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextracti64x2 $2, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 4, i32 5>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -446,9 +457,10 @@ define <2 x i64> @mask_extract_v8i64_v2i64_2(<8 x i64> %a, <2 x i64> %passthru,
 define <2 x i64> @mask_extract_v8i64_v2i64_3(<8 x i64> %a, <2 x i64> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v8i64_v2i64_3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextracti64x2 $3, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 6, i32 7>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -460,9 +472,10 @@ define <2 x i64> @mask_extract_v8i64_v2i64_3(<8 x i64> %a, <2 x i64> %passthru,
 define <2 x double> @mask_extract_v8f64_v2f64_0(<8 x double> %a, <2 x double> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v8f64_v2f64_0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextractf64x2 $0, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 0, i32 1>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -474,9 +487,10 @@ define <2 x double> @mask_extract_v8f64_v2f64_0(<8 x double> %a, <2 x double> %p
 define <2 x double> @mask_extract_v8f64_v2f64_1(<8 x double> %a, <2 x double> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v8f64_v2f64_1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextractf64x2 $1, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 2, i32 3>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -488,9 +502,10 @@ define <2 x double> @mask_extract_v8f64_v2f64_1(<8 x double> %a, <2 x double> %p
 define <2 x double> @mask_extract_v8f64_v2f64_2(<8 x double> %a, <2 x double> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v8f64_v2f64_2:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextractf64x2 $2, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 4, i32 5>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -502,9 +517,10 @@ define <2 x double> @mask_extract_v8f64_v2f64_2(<8 x double> %a, <2 x double> %p
 define <2 x double> @mask_extract_v8f64_v2f64_3(<8 x double> %a, <2 x double> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v8f64_v2f64_3:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextractf64x2 $3, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 6, i32 7>
   %mask.cast = bitcast i8 %mask to <8 x i1>
@@ -516,7 +532,7 @@ define <2 x double> @mask_extract_v8f64_v2f64_3(<8 x double> %a, <2 x double> %p
 define <4 x i64> @mask_extract_v8i64_v4i64_0(<8 x i64> %a, <4 x i64> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v8i64_v4i64_0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextracti64x4 $0, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -530,7 +546,7 @@ define <4 x i64> @mask_extract_v8i64_v4i64_0(<8 x i64> %a, <4 x i64> %passthru,
 define <4 x i64> @mask_extract_v8i64_v4i64_1(<8 x i64> %a, <4 x i64> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v8i64_v4i64_1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -544,7 +560,7 @@ define <4 x i64> @mask_extract_v8i64_v4i64_1(<8 x i64> %a, <4 x i64> %passthru,
 define <4 x double> @mask_extract_v8f64_v4f64_0(<8 x double> %a, <4 x double> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v8f64_v4f64_0:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextractf64x4 $0, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -558,7 +574,7 @@ define <4 x double> @mask_extract_v8f64_v4f64_0(<8 x double> %a, <4 x double> %p
 define <4 x double> @mask_extract_v8f64_v4f64_1(<8 x double> %a, <4 x double> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v8f64_v4f64_1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -572,7 +588,7 @@ define <4 x double> @mask_extract_v8f64_v4f64_1(<8 x double> %a, <4 x double> %p
 define <8 x i32> @mask_extract_v8i64_v8i32_1(<8 x i64> %a, <8 x i32> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v8i64_v8i32_1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextracti32x8 $1, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -586,7 +602,7 @@ define <8 x i32> @mask_extract_v8i64_v8i32_1(<8 x i64> %a, <8 x i32> %passthru,
 define <8 x float> @mask_extract_v8f64_v8f32_1(<8 x double> %a, <8 x float> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_extract_v8f64_v8f32_1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextractf32x8 $1, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovaps %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -600,9 +616,10 @@ define <8 x float> @mask_extract_v8f64_v8f32_1(<8 x double> %a, <8 x float> %pas
 define <4 x i32> @mask_cast_extract_v8i64_v4i32_1(<8 x i64> %a, <4 x i32> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_cast_extract_v8i64_v4i32_1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextracti32x4 $1, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <8 x i64> %a, <8 x i64> undef, <2 x i32> <i32 2, i32 3>
   %shuffle.cast = bitcast <2 x i64> %shuffle to <4 x i32>
@@ -615,9 +632,10 @@ define <4 x i32> @mask_cast_extract_v8i64_v4i32_1(<8 x i64> %a, <4 x i32> %passt
 define <4 x float> @mask_cast_extract_v8f64_v4f32_1(<8 x double> %a, <4 x float> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_cast_extract_v8f64_v4f32_1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextractf32x4 $1, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovaps %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <8 x double> %a, <8 x double> undef, <2 x i32> <i32 2, i32 3>
   %shuffle.cast = bitcast <2 x double> %shuffle to <4 x float>
@@ -630,7 +648,7 @@ define <4 x float> @mask_cast_extract_v8f64_v4f32_1(<8 x double> %a, <4 x float>
 define <4 x i64> @mask_cast_extract_v16i32_v4i64_1(<16 x i32> %a, <4 x i64> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_cast_extract_v16i32_v4i64_1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextracti64x4 $1, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -645,7 +663,7 @@ define <4 x i64> @mask_cast_extract_v16i32_v4i64_1(<16 x i32> %a, <4 x i64> %pas
 define <4 x double> @mask_cast_extract_v16f32_v4f64_1(<16 x float> %a, <4 x double> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_cast_extract_v16f32_v4f64_1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextractf64x4 $1, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT:    vmovapd %ymm1, %ymm0
 ; CHECK-NEXT:    retq
@@ -660,9 +678,10 @@ define <4 x double> @mask_cast_extract_v16f32_v4f64_1(<16 x float> %a, <4 x doub
 define <2 x i64> @mask_cast_extract_v16i32_v2i64_1(<16 x i32> %a, <2 x i64> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_cast_extract_v16i32_v2i64_1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextracti64x2 $1, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovdqa %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <16 x i32> %a, <16 x i32> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle.cast = bitcast <4 x i32> %shuffle to <2 x i64>
@@ -675,9 +694,10 @@ define <2 x i64> @mask_cast_extract_v16i32_v2i64_1(<16 x i32> %a, <2 x i64> %pas
 define <2 x double> @mask_cast_extract_v16f32_v2f64_1(<16 x float> %a, <2 x double> %passthru, i8 %mask) {
 ; CHECK-LABEL: mask_cast_extract_v16f32_v2f64_1:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %edi, %k1
+; CHECK-NEXT:    kmovw %edi, %k1
 ; CHECK-NEXT:    vextractf64x2 $1, %zmm0, %xmm1 {%k1}
 ; CHECK-NEXT:    vmovapd %xmm1, %xmm0
+; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq
   %shuffle = shufflevector <16 x float> %a, <16 x float> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
   %shuffle.cast = bitcast <4 x float> %shuffle to <2 x double>
@@ -690,7 +710,7 @@ define <2 x double> @mask_cast_extract_v16f32_v2f64_1(<16 x float> %a, <2 x doub
 define <2 x double> @broadcast_v4f32_0101_from_v2f32_mask(double* %x, <2 x double> %passthru, i8 %mask) {
 ; CHECK-LABEL: broadcast_v4f32_0101_from_v2f32_mask:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} = mem[0,0]
 ; CHECK-NEXT:    retq
   %q = load double, double* %x, align 1
@@ -705,7 +725,7 @@ define <2 x double> @broadcast_v4f32_0101_from_v2f32_mask(double* %x, <2 x doubl
 define <2 x double> @broadcast_v4f32_0101_from_v2f32_maskz(double* %x, i8 %mask) {
 ; CHECK-LABEL: broadcast_v4f32_0101_from_v2f32_maskz:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vmovddup {{.*#+}} xmm0 {%k1} {z} = mem[0,0]
 ; CHECK-NEXT:    retq
   %q = load double, double* %x, align 1
@@ -720,7 +740,7 @@ define <2 x double> @broadcast_v4f32_0101_from_v2f32_maskz(double* %x, i8 %mask)
 define <8 x float> @test_broadcast_2f64_8f32(<2 x double> *%p, i8 %mask) nounwind {
 ; CHECK-LABEL: test_broadcast_2f64_8f32:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vbroadcastf32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
  %1 = load <2 x double>, <2 x double> *%p
@@ -734,7 +754,7 @@ define <8 x float> @test_broadcast_2f64_8f32(<2 x double> *%p, i8 %mask) nounwin
 define <8 x i32> @test_broadcast_2i64_8i32(<2 x i64> *%p, i8 %mask) nounwind {
 ; CHECK-LABEL: test_broadcast_2i64_8i32:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vbroadcasti32x4 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
  %1 = load <2 x i64>, <2 x i64> *%p
@@ -804,7 +824,7 @@ define <16 x i32> @test_broadcast_4i64_16i32(<4 x i64> *%p, i16 %mask) nounwind
 define <4 x double> @test_broadcast_4f32_4f64(<4 x float> *%p, i8 %mask) nounwind {
 ; CHECK-LABEL: test_broadcast_4f32_4f64:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
  %1 = load <4 x float>, <4 x float> *%p
@@ -819,7 +839,7 @@ define <4 x double> @test_broadcast_4f32_4f64(<4 x float> *%p, i8 %mask) nounwin
 define <4 x i64> @test_broadcast_4i32_4i64(<4 x i32> *%p, i8 %mask) nounwind {
 ; CHECK-LABEL: test_broadcast_4i32_4i64:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} ymm0 {%k1} {z} = mem[0,1,0,1]
 ; CHECK-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32> *%p
@@ -834,7 +854,7 @@ define <4 x i64> @test_broadcast_4i32_4i64(<4 x i32> *%p, i8 %mask) nounwind {
 define <8 x double> @test_broadcast_4f32_8f64(<4 x float> *%p, i8 %mask) nounwind {
 ; CHECK-LABEL: test_broadcast_4f32_8f64:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vbroadcastf64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
  %1 = load <4 x float>, <4 x float> *%p
@@ -848,7 +868,7 @@ define <8 x double> @test_broadcast_4f32_8f64(<4 x float> *%p, i8 %mask) nounwin
 define <8 x i64> @test_broadcast_4i32_8i64(<4 x i32> *%p, i8 %mask) nounwind {
 ; CHECK-LABEL: test_broadcast_4i32_8i64:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vbroadcasti64x2 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT:    retq
  %1 = load <4 x i32>, <4 x i32> *%p
@@ -862,7 +882,7 @@ define <8 x i64> @test_broadcast_4i32_8i64(<4 x i32> *%p, i8 %mask) nounwind {
 define <8 x double> @test_broadcast_8f32_8f64(<8 x float> *%p, i8 %mask) nounwind {
 ; CHECK-LABEL: test_broadcast_8f32_8f64:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vbroadcastf64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
  %1 = load <8 x float>, <8 x float> *%p
@@ -876,7 +896,7 @@ define <8 x double> @test_broadcast_8f32_8f64(<8 x float> *%p, i8 %mask) nounwin
 define <8 x i64> @test_broadcast_8i32_8i64(<8 x i32> *%p, i8 %mask) nounwind {
 ; CHECK-LABEL: test_broadcast_8i32_8i64:
 ; CHECK:       # BB#0:
-; CHECK-NEXT:    kmovb %esi, %k1
+; CHECK-NEXT:    kmovw %esi, %k1
 ; CHECK-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 {%k1} {z} = mem[0,1,2,3,0,1,2,3]
 ; CHECK-NEXT:    retq
  %1 = load <8 x i32>, <8 x i32> *%p
diff --git a/test/CodeGen/X86/vector-shuffle-v1.ll b/test/CodeGen/X86/vector-shuffle-v1.ll
index 4312b67546d29869290ad8e3126d0a854358916f..4ec6b86247d52770a90c5e4a82c1e0e1b53127f6 100644
--- a/test/CodeGen/X86/vector-shuffle-v1.ll
+++ b/test/CodeGen/X86/vector-shuffle-v1.ll
@@ -35,11 +35,11 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) {
 ; VL_BW_DQ:       # BB#0:
 ; VL_BW_DQ-NEXT:    vpsllq $63, %xmm0, %xmm0
 ; VL_BW_DQ-NEXT:    vptestmq %xmm0, %xmm0, %k0
-; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
 ; VL_BW_DQ-NEXT:    movb $1, %al
-; VL_BW_DQ-NEXT:    kmovb %eax, %k0
+; VL_BW_DQ-NEXT:    kmovd %eax, %k1
+; VL_BW_DQ-NEXT:    vpmovm2q %k1, %xmm0
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm1
-; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7]
+; VL_BW_DQ-NEXT:    vpalignr {{.*#+}} xmm0 = xmm1[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
 ; VL_BW_DQ-NEXT:    vpmovq2m %xmm0, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %xmm0
 ; VL_BW_DQ-NEXT:    retq
@@ -78,6 +78,7 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; VL_BW_DQ-LABEL: shuf8i1_3_6_1_0_3_7_7_0:
@@ -88,6 +89,7 @@ define <8 x i1> @shuf8i1_3_6_1_0_3_7_7_0(<8 x i64> %a, <8 x i64> %b, <8 x i64> %
 ; VL_BW_DQ-NEXT:    vpermq %zmm0, %zmm1, %zmm0
 ; VL_BW_DQ-NEXT:    vpmovq2m %zmm0, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
+; VL_BW_DQ-NEXT:    vzeroupper
 ; VL_BW_DQ-NEXT:    retq
   %a2 = icmp eq <8 x i64> %a, %a1
   %b2 = icmp eq <8 x i64> %b, %b1
@@ -108,6 +110,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; VL_BW_DQ-LABEL: shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0:
@@ -120,6 +123,7 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(<16 x i32> %a, <1
 ; VL_BW_DQ-NEXT:    vpermi2d %zmm0, %zmm1, %zmm2
 ; VL_BW_DQ-NEXT:    vpmovd2m %zmm2, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2b %k0, %xmm0
+; VL_BW_DQ-NEXT:    vzeroupper
 ; VL_BW_DQ-NEXT:    retq
   %a2 = icmp eq <16 x i32> %a, %a1
   %b2 = icmp eq <16 x i32> %b, %b1
@@ -162,16 +166,18 @@ define <8 x i1> @shuf8i1_u_2_u_u_2_u_2_u(i8 %a) {
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
 ; AVX512F-NEXT:    vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; VL_BW_DQ-LABEL: shuf8i1_u_2_u_u_2_u_2_u:
 ; VL_BW_DQ:       # BB#0:
-; VL_BW_DQ-NEXT:    kmovb %edi, %k0
+; VL_BW_DQ-NEXT:    kmovd %edi, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
 ; VL_BW_DQ-NEXT:    vextracti64x2 $1, %zmm0, %xmm0
 ; VL_BW_DQ-NEXT:    vpbroadcastq %xmm0, %zmm0
 ; VL_BW_DQ-NEXT:    vpmovq2m %zmm0, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2w %k0, %xmm0
+; VL_BW_DQ-NEXT:    vzeroupper
 ; VL_BW_DQ-NEXT:    retq
   %b = bitcast i8 %a to <8 x i1>
   %c = shufflevector < 8 x i1> %b, <8 x i1>undef, <8 x i32> <i32 undef, i32 2, i32 undef, i32 undef, i32 2, i32 undef, i32 2, i32 undef>
@@ -189,17 +195,21 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) {
 ; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; VL_BW_DQ-LABEL: shuf8i1_10_2_9_u_3_u_2_u:
 ; VL_BW_DQ:       # BB#0:
-; VL_BW_DQ-NEXT:    kmovb %edi, %k0
+; VL_BW_DQ-NEXT:    kmovd %edi, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
 ; VL_BW_DQ-NEXT:    vpxord %zmm1, %zmm1, %zmm1
 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u>
 ; VL_BW_DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
 ; VL_BW_DQ-NEXT:    vpmovq2m %zmm2, %k0
-; VL_BW_DQ-NEXT:    kmovb %k0, %eax
+; VL_BW_DQ-NEXT:    kmovd %k0, %eax
+; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VL_BW_DQ-NEXT:    vzeroupper
 ; VL_BW_DQ-NEXT:    retq
   %b = bitcast i8 %a to <8 x i1>
   %c = shufflevector < 8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 10, i32 2, i32 9, i32 undef, i32 3, i32 undef, i32 2, i32 undef>
@@ -216,15 +226,19 @@ define i8 @shuf8i1_0_1_4_5_u_u_u_u(i8 %a) {
 ; AVX512F-NEXT:    vpsllq $63, %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; VL_BW_DQ-LABEL: shuf8i1_0_1_4_5_u_u_u_u:
 ; VL_BW_DQ:       # BB#0:
-; VL_BW_DQ-NEXT:    kmovb %edi, %k0
+; VL_BW_DQ-NEXT:    kmovd %edi, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
 ; VL_BW_DQ-NEXT:    vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,4,5,0,1,0,1]
 ; VL_BW_DQ-NEXT:    vpmovq2m %zmm0, %k0
-; VL_BW_DQ-NEXT:    kmovb %k0, %eax
+; VL_BW_DQ-NEXT:    kmovd %k0, %eax
+; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VL_BW_DQ-NEXT:    vzeroupper
 ; VL_BW_DQ-NEXT:    retq
   %b = bitcast i8 %a to <8 x i1>
   %c = shufflevector < 8 x i1> %b, <8 x i1> undef, <8 x i32> <i32 0, i32 1, i32 4, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -243,17 +257,21 @@ define i8 @shuf8i1_9_6_1_0_3_7_7_0(i8 %a) {
 ; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_0_3_7_7_0:
 ; VL_BW_DQ:       # BB#0:
-; VL_BW_DQ-NEXT:    kmovb %edi, %k0
+; VL_BW_DQ-NEXT:    kmovd %edi, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
 ; VL_BW_DQ-NEXT:    vpxord %zmm1, %zmm1, %zmm1
 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [8,6,1,0,3,7,7,0]
 ; VL_BW_DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
 ; VL_BW_DQ-NEXT:    vpmovq2m %zmm2, %k0
-; VL_BW_DQ-NEXT:    kmovb %k0, %eax
+; VL_BW_DQ-NEXT:    kmovd %k0, %eax
+; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VL_BW_DQ-NEXT:    vzeroupper
 ; VL_BW_DQ-NEXT:    retq
   %b = bitcast i8 %a to <8 x i1>
   %c = shufflevector <8 x i1> %b, <8 x i1> zeroinitializer, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
@@ -272,17 +290,21 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0(i8 %a) {
 ; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0:
 ; VL_BW_DQ:       # BB#0:
-; VL_BW_DQ-NEXT:    kmovb %edi, %k0
+; VL_BW_DQ-NEXT:    kmovd %edi, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm0
 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [9,1,2,10,4,5,6,7]
 ; VL_BW_DQ-NEXT:    vpxord %zmm2, %zmm2, %zmm2
 ; VL_BW_DQ-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
 ; VL_BW_DQ-NEXT:    vpmovq2m %zmm2, %k0
-; VL_BW_DQ-NEXT:    kmovb %k0, %eax
+; VL_BW_DQ-NEXT:    kmovd %k0, %eax
+; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VL_BW_DQ-NEXT:    vzeroupper
 ; VL_BW_DQ-NEXT:    retq
   %b = bitcast i8 %a to <8 x i1>
   %c = shufflevector <8 x i1> zeroinitializer, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 10, i32 3, i32 7, i32 7, i32 0>
@@ -303,19 +325,23 @@ define i8 @shuf8i1__9_6_1_10_3_7_7_1(i8 %a) {
 ; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; VL_BW_DQ-LABEL: shuf8i1__9_6_1_10_3_7_7_1:
 ; VL_BW_DQ:       # BB#0:
-; VL_BW_DQ-NEXT:    kmovb %edi, %k0
+; VL_BW_DQ-NEXT:    kmovd %edi, %k0
 ; VL_BW_DQ-NEXT:    movb $51, %al
-; VL_BW_DQ-NEXT:    kmovb %eax, %k1
+; VL_BW_DQ-NEXT:    kmovd %eax, %k1
 ; VL_BW_DQ-NEXT:    vpmovm2q %k1, %zmm0
 ; VL_BW_DQ-NEXT:    vpmovm2q %k0, %zmm1
 ; VL_BW_DQ-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [9,6,1,0,3,7,7,1]
 ; VL_BW_DQ-NEXT:    vpermi2q %zmm1, %zmm0, %zmm2
 ; VL_BW_DQ-NEXT:    vpmovq2m %zmm2, %k0
-; VL_BW_DQ-NEXT:    kmovb %k0, %eax
+; VL_BW_DQ-NEXT:    kmovd %k0, %eax
+; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VL_BW_DQ-NEXT:    vzeroupper
 ; VL_BW_DQ-NEXT:    retq
   %b = bitcast i8 %a to <8 x i1>
   %c = shufflevector <8 x i1> <i1 1, i1 1, i1 0, i1 0, i1 1, i1 1, i1 0, i1 0>, <8 x i1> %b, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 1>
@@ -336,6 +362,8 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
 ; AVX512F-NEXT:    vpsllq $63, %zmm2, %zmm0
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; VL_BW_DQ-LABEL: shuf8i1_9_6_1_10_3_7_7_0_all_ones:
@@ -347,7 +375,9 @@ define i8 @shuf8i1_9_6_1_10_3_7_7_0_all_ones(<8 x i1> %a) {
 ; VL_BW_DQ-NEXT:    vpternlogd $255, %zmm2, %zmm2, %zmm2
 ; VL_BW_DQ-NEXT:    vpermt2q %zmm0, %zmm1, %zmm2
 ; VL_BW_DQ-NEXT:    vpmovq2m %zmm2, %k0
-; VL_BW_DQ-NEXT:    kmovb %k0, %eax
+; VL_BW_DQ-NEXT:    kmovd %k0, %eax
+; VL_BW_DQ-NEXT:    # kill: %AL<def> %AL<kill> %EAX<kill>
+; VL_BW_DQ-NEXT:    vzeroupper
 ; VL_BW_DQ-NEXT:    retq
   %c = shufflevector <8 x i1> <i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1, i1 1>, <8 x i1> %a, <8 x i32> <i32 9, i32 6, i32 1, i32 0, i32 3, i32 7, i32 7, i32 0>
   %c1 = bitcast <8 x i1>%c to i8
@@ -364,15 +394,19 @@ define i16 @shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0(i16 %a) {
 ; AVX512F-NEXT:    vpslld $31, %zmm0, %zmm0
 ; AVX512F-NEXT:    vptestmd %zmm0, %zmm0, %k0
 ; AVX512F-NEXT:    kmovw %k0, %eax
+; AVX512F-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; VL_BW_DQ-LABEL: shuf16i1_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0_0:
 ; VL_BW_DQ:       # BB#0:
-; VL_BW_DQ-NEXT:    kmovw %edi, %k0
+; VL_BW_DQ-NEXT:    kmovd %edi, %k0
 ; VL_BW_DQ-NEXT:    vpmovm2d %k0, %zmm0
 ; VL_BW_DQ-NEXT:    vpbroadcastd %xmm0, %zmm0
 ; VL_BW_DQ-NEXT:    vpmovd2m %zmm0, %k0
-; VL_BW_DQ-NEXT:    kmovw %k0, %eax
+; VL_BW_DQ-NEXT:    kmovd %k0, %eax
+; VL_BW_DQ-NEXT:    # kill: %AX<def> %AX<kill> %EAX<kill>
+; VL_BW_DQ-NEXT:    vzeroupper
 ; VL_BW_DQ-NEXT:    retq
   %b = bitcast i16 %a to <16 x i1>
   %c = shufflevector < 16 x i1> %b, <16 x i1> undef, <16 x i32> zeroinitializer
@@ -413,6 +447,7 @@ define i64 @shuf64i1_zero(i64 %a) {
 ; AVX512F-NEXT:    orq %rcx, %rax
 ; AVX512F-NEXT:    movq %rbp, %rsp
 ; AVX512F-NEXT:    popq %rbp
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; VL_BW_DQ-LABEL: shuf64i1_zero:
@@ -422,6 +457,7 @@ define i64 @shuf64i1_zero(i64 %a) {
 ; VL_BW_DQ-NEXT:    vpbroadcastb %xmm0, %zmm0
 ; VL_BW_DQ-NEXT:    vpmovb2m %zmm0, %k0
 ; VL_BW_DQ-NEXT:    kmovq %k0, %rax
+; VL_BW_DQ-NEXT:    vzeroupper
 ; VL_BW_DQ-NEXT:    retq
   %b = bitcast i64 %a to <64 x i1>
   %c = shufflevector < 64 x i1> %b, <64 x i1> undef, <64 x i32> zeroinitializer
diff --git a/test/CodeGen/X86/vector-shuffle-variable-128.ll b/test/CodeGen/X86/vector-shuffle-variable-128.ll
index 5ee3908b02481b9ef206003dcfa26d0f0f5da55e..87fd4a7bf6b9b12b62fcede6edd433f981fd3647 100644
--- a/test/CodeGen/X86/vector-shuffle-variable-128.ll
+++ b/test/CodeGen/X86/vector-shuffle-variable-128.ll
@@ -42,8 +42,8 @@ define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1)
 ; SSE-NEXT:    andl $1, %edi
 ; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; SSE-NEXT:    andl $1, %esi
-; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; SSE-NEXT:    movq {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; SSE-NEXT:    punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSE-NEXT:    retq
 ;
@@ -56,7 +56,7 @@ define <2 x i64> @var_shuffle_v2i64_v2i64_xx_i64(<2 x i64> %x, i32 %i0, i32 %i1)
 ; AVX-NEXT:    andl $1, %esi
 ; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX-NEXT:    retq
   %x0 = extractelement <2 x i64> %x, i32 %i0
   %x1 = extractelement <2 x i64> %x, i32 %i1
@@ -79,10 +79,10 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3
 ; SSE2-NEXT:    andl $3, %ecx
 ; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE2-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
@@ -99,10 +99,10 @@ define <4 x float> @var_shuffle_v4f32_v4f32_xxxx_i32(<4 x float> %x, i32 %i0, i3
 ; SSSE3-NEXT:    andl $3, %ecx
 ; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT:    movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSSE3-NEXT:    unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    retq
 ;
@@ -164,10 +164,10 @@ define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i
 ; SSE2-NEXT:    andl $3, %ecx
 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
@@ -184,10 +184,10 @@ define <4 x i32> @var_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32 %i0, i32 %i
 ; SSSE3-NEXT:    andl $3, %ecx
 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    retq
 ;
@@ -255,29 +255,29 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
 ; SSE2-NEXT:    andl $7, %r10d
 ; SSE2-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    andl $7, %eax
-; SSE2-NEXT:    movzwl -24(%rsp,%r10,2), %r10d
 ; SSE2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; SSE2-NEXT:    movzwl -24(%rsp,%rdi,2), %edi
-; SSE2-NEXT:    movzwl -24(%rsp,%rsi,2), %esi
-; SSE2-NEXT:    movd %r10d, %xmm0
-; SSE2-NEXT:    movzwl -24(%rsp,%rdx,2), %edx
-; SSE2-NEXT:    movd %edx, %xmm1
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSE2-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
-; SSE2-NEXT:    movd %edi, %xmm0
-; SSE2-NEXT:    movzwl -24(%rsp,%r8,2), %edx
-; SSE2-NEXT:    movd %edx, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
 ; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    movd %ecx, %xmm2
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSE2-NEXT:    movd %esi, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSE2-NEXT:    movzwl -24(%rsp,%r9,2), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSE2-NEXT:    movzwl -24(%rsp,%r10,2), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    movzwl -24(%rsp,%rdx,2), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSE2-NEXT:    movzwl -24(%rsp,%r8,2), %eax
 ; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE2-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
@@ -299,29 +299,29 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
 ; SSSE3-NEXT:    andl $7, %r10d
 ; SSSE3-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
 ; SSSE3-NEXT:    andl $7, %eax
-; SSSE3-NEXT:    movzwl -24(%rsp,%r10,2), %r10d
 ; SSSE3-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; SSSE3-NEXT:    movzwl -24(%rsp,%rdi,2), %edi
-; SSSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %esi
-; SSSE3-NEXT:    movd %r10d, %xmm0
-; SSSE3-NEXT:    movzwl -24(%rsp,%rdx,2), %edx
-; SSSE3-NEXT:    movd %edx, %xmm1
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
-; SSSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
-; SSSE3-NEXT:    movd %edi, %xmm0
-; SSSE3-NEXT:    movzwl -24(%rsp,%r8,2), %edx
-; SSSE3-NEXT:    movd %edx, %xmm2
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm1
-; SSSE3-NEXT:    movd %ecx, %xmm2
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
-; SSSE3-NEXT:    movd %esi, %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
 ; SSSE3-NEXT:    movzwl -24(%rsp,%r9,2), %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
+; SSSE3-NEXT:    movd %eax, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
+; SSSE3-NEXT:    movzwl -24(%rsp,%r10,2), %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    movzwl -24(%rsp,%rdx,2), %eax
+; SSSE3-NEXT:    movd %eax, %xmm1
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
+; SSSE3-NEXT:    movzwl -24(%rsp,%r8,2), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm3
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSSE3-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
@@ -343,8 +343,6 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
 ; SSE41-NEXT:    andl $7, %r10d
 ; SSE41-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
 ; SSE41-NEXT:    andl $7, %eax
-; SSE41-NEXT:    movzwl -24(%rsp,%r10,2), %r10d
-; SSE41-NEXT:    movzwl -24(%rsp,%rax,2), %eax
 ; SSE41-NEXT:    movzwl -24(%rsp,%rdi,2), %edi
 ; SSE41-NEXT:    movd %edi, %xmm0
 ; SSE41-NEXT:    pinsrw $1, -24(%rsp,%rsi,2), %xmm0
@@ -352,8 +350,8 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
 ; SSE41-NEXT:    pinsrw $3, -24(%rsp,%rcx,2), %xmm0
 ; SSE41-NEXT:    pinsrw $4, -24(%rsp,%r8,2), %xmm0
 ; SSE41-NEXT:    pinsrw $5, -24(%rsp,%r9,2), %xmm0
-; SSE41-NEXT:    pinsrw $6, %r10d, %xmm0
-; SSE41-NEXT:    pinsrw $7, %eax, %xmm0
+; SSE41-NEXT:    pinsrw $6, -24(%rsp,%r10,2), %xmm0
+; SSE41-NEXT:    pinsrw $7, -24(%rsp,%rax,2), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: var_shuffle_v8i16_v8i16_xxxxxxxx_i16:
@@ -375,8 +373,6 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
 ; AVX-NEXT:    andl $7, %r10d
 ; AVX-NEXT:    movzwl {{[0-9]+}}(%rsp), %eax
 ; AVX-NEXT:    andl $7, %eax
-; AVX-NEXT:    movzwl -24(%rsp,%r10,2), %r10d
-; AVX-NEXT:    movzwl -24(%rsp,%rax,2), %eax
 ; AVX-NEXT:    movzwl -24(%rsp,%rdi,2), %edi
 ; AVX-NEXT:    vmovd %edi, %xmm0
 ; AVX-NEXT:    vpinsrw $1, -24(%rsp,%rsi,2), %xmm0, %xmm0
@@ -384,8 +380,8 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xxxxxxxx_i16(<8 x i16> %x, i16 %i0, i1
 ; AVX-NEXT:    vpinsrw $3, -24(%rsp,%rcx,2), %xmm0, %xmm0
 ; AVX-NEXT:    vpinsrw $4, -24(%rsp,%r8,2), %xmm0, %xmm0
 ; AVX-NEXT:    vpinsrw $5, -24(%rsp,%r9,2), %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $6, %r10d, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX-NEXT:    vpinsrw $6, -24(%rsp,%r10,2), %xmm0, %xmm0
+; AVX-NEXT:    vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x0 = extractelement <8 x i16> %x, i16 %i0
   %x1 = extractelement <8 x i16> %x, i16 %i1
@@ -416,80 +412,80 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
 ; SSE2-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
 ; SSE2-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
-; SSE2-NEXT:    andl $15, %r10d
-; SSE2-NEXT:    leaq -{{[0-9]+}}(%rsp), %r11
-; SSE2-NEXT:    movzbl (%r10,%r11), %eax
-; SSE2-NEXT:    movd %eax, %xmm15
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%r11), %eax
+; SSE2-NEXT:    leaq -{{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT:    movzbl (%rax,%r10), %eax
 ; SSE2-NEXT:    movd %eax, %xmm8
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%r11), %eax
+; SSE2-NEXT:    movzbl (%rax,%r10), %eax
+; SSE2-NEXT:    movd %eax, %xmm15
+; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSE2-NEXT:    andl $15, %eax
+; SSE2-NEXT:    movzbl (%rax,%r10), %eax
 ; SSE2-NEXT:    movd %eax, %xmm9
-; SSE2-NEXT:    andl $15, %edx
-; SSE2-NEXT:    movzbl (%rdx,%r11), %eax
+; SSE2-NEXT:    andl $15, %ecx
+; SSE2-NEXT:    movzbl (%rcx,%r10), %eax
 ; SSE2-NEXT:    movd %eax, %xmm3
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%r11), %eax
+; SSE2-NEXT:    movzbl (%rax,%r10), %eax
 ; SSE2-NEXT:    movd %eax, %xmm10
-; SSE2-NEXT:    andl $15, %edi
-; SSE2-NEXT:    movzbl (%rdi,%r11), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    andl $15, %r9d
+; SSE2-NEXT:    movzbl (%r9,%r10), %eax
+; SSE2-NEXT:    movd %eax, %xmm7
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%r11), %eax
+; SSE2-NEXT:    movzbl (%rax,%r10), %eax
 ; SSE2-NEXT:    movd %eax, %xmm11
-; SSE2-NEXT:    andl $15, %r8d
-; SSE2-NEXT:    movzbl (%r8,%r11), %eax
-; SSE2-NEXT:    movd %eax, %xmm7
+; SSE2-NEXT:    andl $15, %esi
+; SSE2-NEXT:    movzbl (%rsi,%r10), %eax
+; SSE2-NEXT:    movd %eax, %xmm6
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%r11), %eax
-; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    movzbl (%rax,%r10), %eax
+; SSE2-NEXT:    movd %eax, %xmm12
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%r11), %eax
-; SSE2-NEXT:    movd %eax, %xmm12
+; SSE2-NEXT:    movzbl (%rax,%r10), %eax
+; SSE2-NEXT:    movd %eax, %xmm5
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%r11), %eax
+; SSE2-NEXT:    movzbl (%rax,%r10), %eax
 ; SSE2-NEXT:    movd %eax, %xmm13
-; SSE2-NEXT:    andl $15, %ecx
-; SSE2-NEXT:    movzbl (%rcx,%r11), %eax
-; SSE2-NEXT:    movd %eax, %xmm6
+; SSE2-NEXT:    andl $15, %edx
+; SSE2-NEXT:    movzbl (%rdx,%r10), %eax
+; SSE2-NEXT:    movd %eax, %xmm4
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%r11), %eax
+; SSE2-NEXT:    movzbl (%rax,%r10), %eax
 ; SSE2-NEXT:    movd %eax, %xmm14
-; SSE2-NEXT:    andl $15, %esi
-; SSE2-NEXT:    movzbl (%rsi,%r11), %eax
-; SSE2-NEXT:    movd %eax, %xmm5
+; SSE2-NEXT:    andl $15, %r8d
+; SSE2-NEXT:    movzbl (%r8,%r10), %eax
+; SSE2-NEXT:    movd %eax, %xmm1
 ; SSE2-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%r11), %eax
-; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    andl $15, %r9d
-; SSE2-NEXT:    movzbl (%r9,%r11), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    movzbl (%rax,%r10), %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    andl $15, %edi
+; SSE2-NEXT:    movzbl (%rdi,%r10), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
@@ -501,89 +497,84 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
 ; SSSE3-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
 ; SSSE3-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
-; SSSE3-NEXT:    andl $15, %r10d
-; SSSE3-NEXT:    leaq -{{[0-9]+}}(%rsp), %r11
-; SSSE3-NEXT:    movzbl (%r10,%r11), %eax
-; SSSE3-NEXT:    movd %eax, %xmm15
 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
+; SSSE3-NEXT:    leaq -{{[0-9]+}}(%rsp), %r10
+; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm8
 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
+; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
+; SSSE3-NEXT:    movd %eax, %xmm15
+; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
+; SSSE3-NEXT:    andl $15, %eax
+; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm9
-; SSSE3-NEXT:    andl $15, %edx
-; SSSE3-NEXT:    movzbl (%rdx,%r11), %eax
+; SSSE3-NEXT:    andl $15, %ecx
+; SSSE3-NEXT:    movzbl (%rcx,%r10), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm3
 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
+; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm10
-; SSSE3-NEXT:    andl $15, %edi
-; SSSE3-NEXT:    movzbl (%rdi,%r11), %eax
-; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    andl $15, %r9d
+; SSSE3-NEXT:    movzbl (%r9,%r10), %eax
+; SSSE3-NEXT:    movd %eax, %xmm7
 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
+; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm11
-; SSSE3-NEXT:    andl $15, %r8d
-; SSSE3-NEXT:    movzbl (%r8,%r11), %eax
-; SSSE3-NEXT:    movd %eax, %xmm7
+; SSSE3-NEXT:    andl $15, %esi
+; SSSE3-NEXT:    movzbl (%rsi,%r10), %eax
+; SSSE3-NEXT:    movd %eax, %xmm6
 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
-; SSSE3-NEXT:    movd %eax, %xmm2
+; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
+; SSSE3-NEXT:    movd %eax, %xmm12
 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
-; SSSE3-NEXT:    movd %eax, %xmm12
+; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
+; SSSE3-NEXT:    movd %eax, %xmm5
 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
+; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm13
-; SSSE3-NEXT:    andl $15, %ecx
-; SSSE3-NEXT:    movzbl (%rcx,%r11), %eax
-; SSSE3-NEXT:    movd %eax, %xmm6
+; SSSE3-NEXT:    andl $15, %edx
+; SSSE3-NEXT:    movzbl (%rdx,%r10), %eax
+; SSSE3-NEXT:    movd %eax, %xmm4
 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
+; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm14
-; SSSE3-NEXT:    andl $15, %esi
-; SSSE3-NEXT:    movzbl (%rsi,%r11), %eax
-; SSSE3-NEXT:    movd %eax, %xmm5
+; SSSE3-NEXT:    andl $15, %r8d
+; SSSE3-NEXT:    movzbl (%r8,%r10), %eax
+; SSSE3-NEXT:    movd %eax, %xmm1
 ; SSSE3-NEXT:    movzbl {{[0-9]+}}(%rsp), %eax
 ; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%r11), %eax
-; SSSE3-NEXT:    movd %eax, %xmm4
-; SSSE3-NEXT:    andl $15, %r9d
-; SSSE3-NEXT:    movzbl (%r9,%r11), %eax
-; SSSE3-NEXT:    movd %eax, %xmm1
+; SSSE3-NEXT:    movzbl (%rax,%r10), %eax
+; SSSE3-NEXT:    movd %eax, %xmm2
+; SSSE3-NEXT:    andl $15, %edi
+; SSSE3-NEXT:    movzbl (%rdi,%r10), %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1],xmm0[2],xmm10[2],xmm0[3],xmm10[3],xmm0[4],xmm10[4],xmm0[5],xmm10[5],xmm0[6],xmm10[6],xmm0[7],xmm10[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm14[0],xmm5[1],xmm14[1],xmm5[2],xmm14[2],xmm5[3],xmm14[3],xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1],xmm5[2],xmm1[2],xmm5[3],xmm1[3],xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pushq %rbp
-; SSE41-NEXT:    pushq %r15
-; SSE41-NEXT:    pushq %r14
-; SSE41-NEXT:    pushq %r12
-; SSE41-NEXT:    pushq %rbx
 ; SSE41-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
 ; SSE41-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
 ; SSE41-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
@@ -591,74 +582,54 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
 ; SSE41-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
 ; SSE41-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; SSE41-NEXT:    andl $15, %edi
-; SSE41-NEXT:    andl $15, %esi
-; SSE41-NEXT:    andl $15, %edx
-; SSE41-NEXT:    andl $15, %ecx
-; SSE41-NEXT:    andl $15, %r8d
 ; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT:    andl $15, %r9d
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
-; SSE41-NEXT:    andl $15, %r10d
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
-; SSE41-NEXT:    andl $15, %r11d
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
-; SSE41-NEXT:    andl $15, %r14d
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
-; SSE41-NEXT:    andl $15, %r15d
 ; SSE41-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
 ; SSE41-NEXT:    movzbl (%rdi,%rax), %edi
 ; SSE41-NEXT:    movd %edi, %xmm0
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
-; SSE41-NEXT:    andl $15, %r12d
-; SSE41-NEXT:    pinsrb $1, (%rsi,%rax), %xmm0
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
 ; SSE41-NEXT:    andl $15, %esi
-; SSE41-NEXT:    pinsrb $2, (%rdx,%rax), %xmm0
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
+; SSE41-NEXT:    pinsrb $1, (%rsi,%rax), %xmm0
 ; SSE41-NEXT:    andl $15, %edx
-; SSE41-NEXT:    pinsrb $3, (%rcx,%rax), %xmm0
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; SSE41-NEXT:    pinsrb $2, (%rdx,%rax), %xmm0
 ; SSE41-NEXT:    andl $15, %ecx
+; SSE41-NEXT:    pinsrb $3, (%rcx,%rax), %xmm0
+; SSE41-NEXT:    andl $15, %r8d
 ; SSE41-NEXT:    pinsrb $4, (%r8,%rax), %xmm0
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
-; SSE41-NEXT:    andl $15, %ebx
+; SSE41-NEXT:    andl $15, %r9d
 ; SSE41-NEXT:    pinsrb $5, (%r9,%rax), %xmm0
-; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
-; SSE41-NEXT:    andl $15, %edi
-; SSE41-NEXT:    movzbl (%r10,%rax), %r8d
-; SSE41-NEXT:    movzbl (%r11,%rax), %r9d
-; SSE41-NEXT:    movzbl (%r14,%rax), %r10d
-; SSE41-NEXT:    movzbl (%r15,%rax), %r11d
-; SSE41-NEXT:    movzbl (%r12,%rax), %ebp
-; SSE41-NEXT:    movzbl (%rsi,%rax), %esi
-; SSE41-NEXT:    movzbl (%rdx,%rax), %edx
-; SSE41-NEXT:    movzbl (%rcx,%rax), %ecx
-; SSE41-NEXT:    movzbl (%rbx,%rax), %ebx
-; SSE41-NEXT:    movzbl (%rdi,%rax), %eax
-; SSE41-NEXT:    pinsrb $6, %r8d, %xmm0
-; SSE41-NEXT:    pinsrb $7, %r9d, %xmm0
-; SSE41-NEXT:    pinsrb $8, %r10d, %xmm0
-; SSE41-NEXT:    pinsrb $9, %r11d, %xmm0
-; SSE41-NEXT:    pinsrb $10, %ebp, %xmm0
-; SSE41-NEXT:    pinsrb $11, %esi, %xmm0
-; SSE41-NEXT:    pinsrb $12, %edx, %xmm0
-; SSE41-NEXT:    pinsrb $13, %ecx, %xmm0
-; SSE41-NEXT:    pinsrb $14, %ebx, %xmm0
-; SSE41-NEXT:    pinsrb $15, %eax, %xmm0
-; SSE41-NEXT:    popq %rbx
-; SSE41-NEXT:    popq %r12
-; SSE41-NEXT:    popq %r14
-; SSE41-NEXT:    popq %r15
-; SSE41-NEXT:    popq %rbp
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; SSE41-NEXT:    andl $15, %ecx
+; SSE41-NEXT:    pinsrb $6, (%rcx,%rax), %xmm0
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; SSE41-NEXT:    andl $15, %ecx
+; SSE41-NEXT:    pinsrb $7, (%rcx,%rax), %xmm0
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; SSE41-NEXT:    andl $15, %ecx
+; SSE41-NEXT:    pinsrb $8, (%rcx,%rax), %xmm0
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; SSE41-NEXT:    andl $15, %ecx
+; SSE41-NEXT:    pinsrb $9, (%rcx,%rax), %xmm0
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; SSE41-NEXT:    andl $15, %ecx
+; SSE41-NEXT:    pinsrb $10, (%rcx,%rax), %xmm0
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; SSE41-NEXT:    andl $15, %ecx
+; SSE41-NEXT:    pinsrb $11, (%rcx,%rax), %xmm0
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; SSE41-NEXT:    andl $15, %ecx
+; SSE41-NEXT:    pinsrb $12, (%rcx,%rax), %xmm0
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; SSE41-NEXT:    andl $15, %ecx
+; SSE41-NEXT:    pinsrb $13, (%rcx,%rax), %xmm0
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; SSE41-NEXT:    andl $15, %ecx
+; SSE41-NEXT:    pinsrb $14, (%rcx,%rax), %xmm0
+; SSE41-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; SSE41-NEXT:    andl $15, %ecx
+; SSE41-NEXT:    pinsrb $15, (%rcx,%rax), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
 ; AVX:       # BB#0:
-; AVX-NEXT:    pushq %rbp
-; AVX-NEXT:    pushq %r15
-; AVX-NEXT:    pushq %r14
-; AVX-NEXT:    pushq %r12
-; AVX-NEXT:    pushq %rbx
 ; AVX-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
 ; AVX-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
 ; AVX-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
@@ -666,65 +637,50 @@ define <16 x i8> @var_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8 %
 ; AVX-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
 ; AVX-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
 ; AVX-NEXT:    andl $15, %edi
-; AVX-NEXT:    andl $15, %esi
-; AVX-NEXT:    andl $15, %edx
-; AVX-NEXT:    andl $15, %ecx
-; AVX-NEXT:    andl $15, %r8d
 ; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT:    andl $15, %r9d
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
-; AVX-NEXT:    andl $15, %r10d
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
-; AVX-NEXT:    andl $15, %r11d
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %r14d
-; AVX-NEXT:    andl $15, %r14d
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %r15d
-; AVX-NEXT:    andl $15, %r15d
 ; AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rax
 ; AVX-NEXT:    movzbl (%rdi,%rax), %edi
 ; AVX-NEXT:    vmovd %edi, %xmm0
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %r12d
-; AVX-NEXT:    andl $15, %r12d
-; AVX-NEXT:    vpinsrb $1, (%rsi,%rax), %xmm0, %xmm0
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %esi
 ; AVX-NEXT:    andl $15, %esi
-; AVX-NEXT:    vpinsrb $2, (%rdx,%rax), %xmm0, %xmm0
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %edx
+; AVX-NEXT:    vpinsrb $1, (%rsi,%rax), %xmm0, %xmm0
 ; AVX-NEXT:    andl $15, %edx
-; AVX-NEXT:    vpinsrb $3, (%rcx,%rax), %xmm0, %xmm0
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX-NEXT:    vpinsrb $2, (%rdx,%rax), %xmm0, %xmm0
 ; AVX-NEXT:    andl $15, %ecx
+; AVX-NEXT:    vpinsrb $3, (%rcx,%rax), %xmm0, %xmm0
+; AVX-NEXT:    andl $15, %r8d
 ; AVX-NEXT:    vpinsrb $4, (%r8,%rax), %xmm0, %xmm0
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ebx
-; AVX-NEXT:    andl $15, %ebx
+; AVX-NEXT:    andl $15, %r9d
 ; AVX-NEXT:    vpinsrb $5, (%r9,%rax), %xmm0, %xmm0
-; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
-; AVX-NEXT:    andl $15, %edi
-; AVX-NEXT:    movzbl (%r10,%rax), %r8d
-; AVX-NEXT:    movzbl (%r11,%rax), %r9d
-; AVX-NEXT:    movzbl (%r14,%rax), %r10d
-; AVX-NEXT:    movzbl (%r15,%rax), %r11d
-; AVX-NEXT:    movzbl (%r12,%rax), %ebp
-; AVX-NEXT:    movzbl (%rsi,%rax), %esi
-; AVX-NEXT:    movzbl (%rdx,%rax), %edx
-; AVX-NEXT:    movzbl (%rcx,%rax), %ecx
-; AVX-NEXT:    movzbl (%rbx,%rax), %ebx
-; AVX-NEXT:    movzbl (%rdi,%rax), %eax
-; AVX-NEXT:    vpinsrb $6, %r8d, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $7, %r9d, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $8, %r10d, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $9, %r11d, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $10, %ebp, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $11, %esi, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $12, %edx, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $14, %ebx, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
-; AVX-NEXT:    popq %rbx
-; AVX-NEXT:    popq %r12
-; AVX-NEXT:    popq %r14
-; AVX-NEXT:    popq %r15
-; AVX-NEXT:    popq %rbp
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX-NEXT:    andl $15, %ecx
+; AVX-NEXT:    vpinsrb $6, (%rcx,%rax), %xmm0, %xmm0
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX-NEXT:    andl $15, %ecx
+; AVX-NEXT:    vpinsrb $7, (%rcx,%rax), %xmm0, %xmm0
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX-NEXT:    andl $15, %ecx
+; AVX-NEXT:    vpinsrb $8, (%rcx,%rax), %xmm0, %xmm0
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX-NEXT:    andl $15, %ecx
+; AVX-NEXT:    vpinsrb $9, (%rcx,%rax), %xmm0, %xmm0
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX-NEXT:    andl $15, %ecx
+; AVX-NEXT:    vpinsrb $10, (%rcx,%rax), %xmm0, %xmm0
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX-NEXT:    andl $15, %ecx
+; AVX-NEXT:    vpinsrb $11, (%rcx,%rax), %xmm0, %xmm0
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX-NEXT:    andl $15, %ecx
+; AVX-NEXT:    vpinsrb $12, (%rcx,%rax), %xmm0, %xmm0
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX-NEXT:    andl $15, %ecx
+; AVX-NEXT:    vpinsrb $13, (%rcx,%rax), %xmm0, %xmm0
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX-NEXT:    andl $15, %ecx
+; AVX-NEXT:    vpinsrb $14, (%rcx,%rax), %xmm0, %xmm0
+; AVX-NEXT:    movzbl {{[0-9]+}}(%rsp), %ecx
+; AVX-NEXT:    andl $15, %ecx
+; AVX-NEXT:    vpinsrb $15, (%rcx,%rax), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %x0  = extractelement <16 x i8> %x, i8 %i0
   %x1  = extractelement <16 x i8> %x, i8 %i1
@@ -779,11 +735,11 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi
 ; SSE2-NEXT:    andl $3, %esi
 ; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSE2-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
@@ -799,11 +755,11 @@ define <4 x i32> @mem_shuffle_v4i32_v4i32_xxxx_i32(<4 x i32> %x, i32* %i) nounwi
 ; SSSE3-NEXT:    andl $3, %esi
 ; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
 ; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSSE3-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; SSSE3-NEXT:    movd {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1]
-; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
+; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: mem_shuffle_v4i32_v4i32_xxxx_i32:
@@ -862,341 +818,281 @@ define <16 x i8> @mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8(<16 x i8> %x, i8*
 ; SSE2-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
 ; SSE2:       # BB#0:
 ; SSE2-NEXT:    movzbl (%rdi), %eax
-; SSE2-NEXT:    andl $15, %eax
 ; SSE2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movzbl 15(%rdi), %edx
+; SSE2-NEXT:    andl $15, %edx
 ; SSE2-NEXT:    leaq -{{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    movzbl (%rax,%rcx), %eax
-; SSE2-NEXT:    movd %eax, %xmm0
-; SSE2-NEXT:    movzbl 8(%rdi), %eax
-; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%rcx), %eax
-; SSE2-NEXT:    movd %eax, %xmm8
-; SSE2-NEXT:    movzbl 12(%rdi), %eax
-; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%rcx), %eax
-; SSE2-NEXT:    movd %eax, %xmm9
-; SSE2-NEXT:    movzbl 4(%rdi), %eax
-; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%rcx), %eax
-; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    movzbl 14(%rdi), %eax
-; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%rcx), %eax
-; SSE2-NEXT:    movd %eax, %xmm10
-; SSE2-NEXT:    movzbl 6(%rdi), %eax
-; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%rcx), %eax
-; SSE2-NEXT:    movd %eax, %xmm5
-; SSE2-NEXT:    movzbl 10(%rdi), %eax
-; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%rcx), %eax
-; SSE2-NEXT:    movd %eax, %xmm11
-; SSE2-NEXT:    movzbl 2(%rdi), %eax
-; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%rcx), %eax
-; SSE2-NEXT:    movd %eax, %xmm7
-; SSE2-NEXT:    movzbl 15(%rdi), %eax
-; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%rcx), %eax
-; SSE2-NEXT:    movd %eax, %xmm12
-; SSE2-NEXT:    movzbl 7(%rdi), %eax
-; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%rcx), %eax
-; SSE2-NEXT:    movd %eax, %xmm2
-; SSE2-NEXT:    movzbl 11(%rdi), %eax
-; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%rcx), %eax
-; SSE2-NEXT:    movd %eax, %xmm13
-; SSE2-NEXT:    movzbl 3(%rdi), %eax
-; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%rcx), %eax
-; SSE2-NEXT:    movd %eax, %xmm6
-; SSE2-NEXT:    movzbl 13(%rdi), %eax
-; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%rcx), %eax
-; SSE2-NEXT:    movd %eax, %xmm14
-; SSE2-NEXT:    movzbl 5(%rdi), %eax
-; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%rcx), %eax
-; SSE2-NEXT:    movd %eax, %xmm4
-; SSE2-NEXT:    movzbl 9(%rdi), %eax
-; SSE2-NEXT:    andl $15, %eax
-; SSE2-NEXT:    movzbl (%rax,%rcx), %eax
-; SSE2-NEXT:    movd %eax, %xmm15
-; SSE2-NEXT:    movzbl 1(%rdi), %eax
+; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSE2-NEXT:    movd %edx, %xmm8
+; SSE2-NEXT:    movzbl 7(%rdi), %edx
+; SSE2-NEXT:    andl $15, %edx
+; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSE2-NEXT:    movd %edx, %xmm15
+; SSE2-NEXT:    movzbl 11(%rdi), %edx
+; SSE2-NEXT:    andl $15, %edx
+; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSE2-NEXT:    movd %edx, %xmm9
+; SSE2-NEXT:    movzbl 3(%rdi), %edx
+; SSE2-NEXT:    andl $15, %edx
+; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSE2-NEXT:    movd %edx, %xmm3
+; SSE2-NEXT:    movzbl 13(%rdi), %edx
+; SSE2-NEXT:    andl $15, %edx
+; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSE2-NEXT:    movd %edx, %xmm10
+; SSE2-NEXT:    movzbl 5(%rdi), %edx
+; SSE2-NEXT:    andl $15, %edx
+; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSE2-NEXT:    movd %edx, %xmm7
+; SSE2-NEXT:    movzbl 9(%rdi), %edx
+; SSE2-NEXT:    andl $15, %edx
+; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSE2-NEXT:    movd %edx, %xmm11
+; SSE2-NEXT:    movzbl 1(%rdi), %edx
+; SSE2-NEXT:    andl $15, %edx
+; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSE2-NEXT:    movd %edx, %xmm6
+; SSE2-NEXT:    movzbl 14(%rdi), %edx
+; SSE2-NEXT:    andl $15, %edx
+; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSE2-NEXT:    movd %edx, %xmm12
+; SSE2-NEXT:    movzbl 6(%rdi), %edx
+; SSE2-NEXT:    andl $15, %edx
+; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSE2-NEXT:    movd %edx, %xmm5
+; SSE2-NEXT:    movzbl 10(%rdi), %edx
+; SSE2-NEXT:    andl $15, %edx
+; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSE2-NEXT:    movd %edx, %xmm13
+; SSE2-NEXT:    movzbl 2(%rdi), %edx
+; SSE2-NEXT:    andl $15, %edx
+; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSE2-NEXT:    movd %edx, %xmm4
+; SSE2-NEXT:    movzbl 12(%rdi), %edx
+; SSE2-NEXT:    andl $15, %edx
+; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSE2-NEXT:    movd %edx, %xmm14
+; SSE2-NEXT:    movzbl 4(%rdi), %edx
+; SSE2-NEXT:    andl $15, %edx
+; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSE2-NEXT:    movd %edx, %xmm1
+; SSE2-NEXT:    movzbl 8(%rdi), %edx
+; SSE2-NEXT:    andl $15, %edx
+; SSE2-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSE2-NEXT:    movd %edx, %xmm2
 ; SSE2-NEXT:    andl $15, %eax
 ; SSE2-NEXT:    movzbl (%rax,%rcx), %eax
-; SSE2-NEXT:    movd %eax, %xmm1
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
+; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
 ; SSSE3:       # BB#0:
 ; SSSE3-NEXT:    movzbl (%rdi), %eax
-; SSSE3-NEXT:    andl $15, %eax
 ; SSSE3-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
+; SSSE3-NEXT:    movzbl 15(%rdi), %edx
+; SSSE3-NEXT:    andl $15, %edx
 ; SSSE3-NEXT:    leaq -{{[0-9]+}}(%rsp), %rcx
-; SSSE3-NEXT:    movzbl (%rax,%rcx), %eax
-; SSSE3-NEXT:    movd %eax, %xmm0
-; SSSE3-NEXT:    movzbl 8(%rdi), %eax
-; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%rcx), %eax
-; SSSE3-NEXT:    movd %eax, %xmm8
-; SSSE3-NEXT:    movzbl 12(%rdi), %eax
-; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%rcx), %eax
-; SSSE3-NEXT:    movd %eax, %xmm9
-; SSSE3-NEXT:    movzbl 4(%rdi), %eax
-; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%rcx), %eax
-; SSSE3-NEXT:    movd %eax, %xmm3
-; SSSE3-NEXT:    movzbl 14(%rdi), %eax
-; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%rcx), %eax
-; SSSE3-NEXT:    movd %eax, %xmm10
-; SSSE3-NEXT:    movzbl 6(%rdi), %eax
-; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%rcx), %eax
-; SSSE3-NEXT:    movd %eax, %xmm5
-; SSSE3-NEXT:    movzbl 10(%rdi), %eax
-; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%rcx), %eax
-; SSSE3-NEXT:    movd %eax, %xmm11
-; SSSE3-NEXT:    movzbl 2(%rdi), %eax
-; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%rcx), %eax
-; SSSE3-NEXT:    movd %eax, %xmm7
-; SSSE3-NEXT:    movzbl 15(%rdi), %eax
-; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%rcx), %eax
-; SSSE3-NEXT:    movd %eax, %xmm12
-; SSSE3-NEXT:    movzbl 7(%rdi), %eax
-; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%rcx), %eax
-; SSSE3-NEXT:    movd %eax, %xmm2
-; SSSE3-NEXT:    movzbl 11(%rdi), %eax
-; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%rcx), %eax
-; SSSE3-NEXT:    movd %eax, %xmm13
-; SSSE3-NEXT:    movzbl 3(%rdi), %eax
-; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%rcx), %eax
-; SSSE3-NEXT:    movd %eax, %xmm6
-; SSSE3-NEXT:    movzbl 13(%rdi), %eax
-; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%rcx), %eax
-; SSSE3-NEXT:    movd %eax, %xmm14
-; SSSE3-NEXT:    movzbl 5(%rdi), %eax
-; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%rcx), %eax
-; SSSE3-NEXT:    movd %eax, %xmm4
-; SSSE3-NEXT:    movzbl 9(%rdi), %eax
-; SSSE3-NEXT:    andl $15, %eax
-; SSSE3-NEXT:    movzbl (%rax,%rcx), %eax
-; SSSE3-NEXT:    movd %eax, %xmm15
-; SSSE3-NEXT:    movzbl 1(%rdi), %eax
+; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSSE3-NEXT:    movd %edx, %xmm8
+; SSSE3-NEXT:    movzbl 7(%rdi), %edx
+; SSSE3-NEXT:    andl $15, %edx
+; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSSE3-NEXT:    movd %edx, %xmm15
+; SSSE3-NEXT:    movzbl 11(%rdi), %edx
+; SSSE3-NEXT:    andl $15, %edx
+; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSSE3-NEXT:    movd %edx, %xmm9
+; SSSE3-NEXT:    movzbl 3(%rdi), %edx
+; SSSE3-NEXT:    andl $15, %edx
+; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSSE3-NEXT:    movd %edx, %xmm3
+; SSSE3-NEXT:    movzbl 13(%rdi), %edx
+; SSSE3-NEXT:    andl $15, %edx
+; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSSE3-NEXT:    movd %edx, %xmm10
+; SSSE3-NEXT:    movzbl 5(%rdi), %edx
+; SSSE3-NEXT:    andl $15, %edx
+; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSSE3-NEXT:    movd %edx, %xmm7
+; SSSE3-NEXT:    movzbl 9(%rdi), %edx
+; SSSE3-NEXT:    andl $15, %edx
+; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSSE3-NEXT:    movd %edx, %xmm11
+; SSSE3-NEXT:    movzbl 1(%rdi), %edx
+; SSSE3-NEXT:    andl $15, %edx
+; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSSE3-NEXT:    movd %edx, %xmm6
+; SSSE3-NEXT:    movzbl 14(%rdi), %edx
+; SSSE3-NEXT:    andl $15, %edx
+; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSSE3-NEXT:    movd %edx, %xmm12
+; SSSE3-NEXT:    movzbl 6(%rdi), %edx
+; SSSE3-NEXT:    andl $15, %edx
+; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSSE3-NEXT:    movd %edx, %xmm5
+; SSSE3-NEXT:    movzbl 10(%rdi), %edx
+; SSSE3-NEXT:    andl $15, %edx
+; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSSE3-NEXT:    movd %edx, %xmm13
+; SSSE3-NEXT:    movzbl 2(%rdi), %edx
+; SSSE3-NEXT:    andl $15, %edx
+; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSSE3-NEXT:    movd %edx, %xmm4
+; SSSE3-NEXT:    movzbl 12(%rdi), %edx
+; SSSE3-NEXT:    andl $15, %edx
+; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSSE3-NEXT:    movd %edx, %xmm14
+; SSSE3-NEXT:    movzbl 4(%rdi), %edx
+; SSSE3-NEXT:    andl $15, %edx
+; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSSE3-NEXT:    movd %edx, %xmm1
+; SSSE3-NEXT:    movzbl 8(%rdi), %edx
+; SSSE3-NEXT:    andl $15, %edx
+; SSSE3-NEXT:    movzbl (%rdx,%rcx), %edx
+; SSSE3-NEXT:    movd %edx, %xmm2
 ; SSSE3-NEXT:    andl $15, %eax
 ; SSSE3-NEXT:    movzbl (%rax,%rcx), %eax
-; SSSE3-NEXT:    movd %eax, %xmm1
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7]
+; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3],xmm5[4],xmm10[4],xmm5[5],xmm10[5],xmm5[6],xmm10[6],xmm5[7],xmm10[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3],xmm7[4],xmm11[4],xmm7[5],xmm11[5],xmm7[6],xmm11[6],xmm7[7],xmm11[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3],xmm2[4],xmm12[4],xmm2[5],xmm12[5],xmm2[6],xmm12[6],xmm2[7],xmm12[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm13[0],xmm6[1],xmm13[1],xmm6[2],xmm13[2],xmm6[3],xmm13[3],xmm6[4],xmm13[4],xmm6[5],xmm13[5],xmm6[6],xmm13[6],xmm6[7],xmm13[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3],xmm1[4],xmm15[4],xmm1[5],xmm15[5],xmm1[6],xmm15[6],xmm1[7],xmm15[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3],xmm1[4],xmm6[4],xmm1[5],xmm6[5],xmm1[6],xmm6[6],xmm1[7],xmm6[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3],xmm5[4],xmm12[4],xmm5[5],xmm12[5],xmm5[6],xmm12[6],xmm5[7],xmm12[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3],xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm14[0],xmm1[1],xmm14[1],xmm1[2],xmm14[2],xmm1[3],xmm14[3],xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7]
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    pushq %rbp
-; SSE41-NEXT:    pushq %r15
-; SSE41-NEXT:    pushq %r14
-; SSE41-NEXT:    pushq %r13
-; SSE41-NEXT:    pushq %r12
-; SSE41-NEXT:    pushq %rbx
-; SSE41-NEXT:    movzbl (%rdi), %r11d
-; SSE41-NEXT:    andl $15, %r11d
+; SSE41-NEXT:    movzbl (%rdi), %eax
+; SSE41-NEXT:    andl $15, %eax
 ; SSE41-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE41-NEXT:    movzbl 1(%rdi), %r9d
-; SSE41-NEXT:    andl $15, %r9d
+; SSE41-NEXT:    leaq -{{[0-9]+}}(%rsp), %rcx
+; SSE41-NEXT:    movzbl (%rax,%rcx), %eax
+; SSE41-NEXT:    movd %eax, %xmm0
+; SSE41-NEXT:    movzbl 1(%rdi), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $1, (%rax,%rcx), %xmm0
 ; SSE41-NEXT:    movzbl 2(%rdi), %eax
 ; SSE41-NEXT:    andl $15, %eax
-; SSE41-NEXT:    movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; SSE41-NEXT:    pinsrb $2, (%rax,%rcx), %xmm0
 ; SSE41-NEXT:    movzbl 3(%rdi), %eax
 ; SSE41-NEXT:    andl $15, %eax
-; SSE41-NEXT:    movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; SSE41-NEXT:    movzbl 4(%rdi), %r14d
-; SSE41-NEXT:    andl $15, %r14d
-; SSE41-NEXT:    movzbl 5(%rdi), %r15d
-; SSE41-NEXT:    andl $15, %r15d
-; SSE41-NEXT:    movzbl 6(%rdi), %r12d
-; SSE41-NEXT:    andl $15, %r12d
-; SSE41-NEXT:    movzbl 7(%rdi), %r13d
-; SSE41-NEXT:    andl $15, %r13d
-; SSE41-NEXT:    movzbl 8(%rdi), %r8d
-; SSE41-NEXT:    andl $15, %r8d
+; SSE41-NEXT:    pinsrb $3, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    movzbl 4(%rdi), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $4, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    movzbl 5(%rdi), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $5, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    movzbl 6(%rdi), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $6, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    movzbl 7(%rdi), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $7, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    movzbl 8(%rdi), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $8, (%rax,%rcx), %xmm0
 ; SSE41-NEXT:    movzbl 9(%rdi), %eax
 ; SSE41-NEXT:    andl $15, %eax
-; SSE41-NEXT:    movzbl 10(%rdi), %ecx
-; SSE41-NEXT:    andl $15, %ecx
-; SSE41-NEXT:    movzbl 11(%rdi), %edx
-; SSE41-NEXT:    andl $15, %edx
-; SSE41-NEXT:    movzbl 12(%rdi), %esi
-; SSE41-NEXT:    andl $15, %esi
-; SSE41-NEXT:    leaq -{{[0-9]+}}(%rsp), %rbp
-; SSE41-NEXT:    movzbl (%r11,%rbp), %ebx
-; SSE41-NEXT:    movd %ebx, %xmm0
-; SSE41-NEXT:    movzbl 13(%rdi), %r11d
-; SSE41-NEXT:    andl $15, %r11d
-; SSE41-NEXT:    pinsrb $1, (%r9,%rbp), %xmm0
-; SSE41-NEXT:    movzbl 14(%rdi), %ebx
-; SSE41-NEXT:    andl $15, %ebx
-; SSE41-NEXT:    movzbl 15(%rdi), %edi
-; SSE41-NEXT:    andl $15, %edi
-; SSE41-NEXT:    movzbl (%rdi,%rbp), %r10d
-; SSE41-NEXT:    movzbl (%rbx,%rbp), %r9d
-; SSE41-NEXT:    movzbl (%r11,%rbp), %r11d
-; SSE41-NEXT:    movzbl (%rsi,%rbp), %esi
-; SSE41-NEXT:    movzbl (%rdx,%rbp), %edx
-; SSE41-NEXT:    movzbl (%rcx,%rbp), %ecx
-; SSE41-NEXT:    movzbl (%rax,%rbp), %eax
-; SSE41-NEXT:    movzbl (%r8,%rbp), %r8d
-; SSE41-NEXT:    movzbl (%r13,%rbp), %r13d
-; SSE41-NEXT:    movzbl (%r12,%rbp), %r12d
-; SSE41-NEXT:    movzbl (%r15,%rbp), %r15d
-; SSE41-NEXT:    movzbl (%r14,%rbp), %r14d
-; SSE41-NEXT:    movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload
-; SSE41-NEXT:    movzbl (%rdi,%rbp), %edi
-; SSE41-NEXT:    movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload
-; SSE41-NEXT:    movzbl (%rbx,%rbp), %ebp
-; SSE41-NEXT:    pinsrb $2, %ebp, %xmm0
-; SSE41-NEXT:    pinsrb $3, %edi, %xmm0
-; SSE41-NEXT:    pinsrb $4, %r14d, %xmm0
-; SSE41-NEXT:    pinsrb $5, %r15d, %xmm0
-; SSE41-NEXT:    pinsrb $6, %r12d, %xmm0
-; SSE41-NEXT:    pinsrb $7, %r13d, %xmm0
-; SSE41-NEXT:    pinsrb $8, %r8d, %xmm0
-; SSE41-NEXT:    pinsrb $9, %eax, %xmm0
-; SSE41-NEXT:    pinsrb $10, %ecx, %xmm0
-; SSE41-NEXT:    pinsrb $11, %edx, %xmm0
-; SSE41-NEXT:    pinsrb $12, %esi, %xmm0
-; SSE41-NEXT:    pinsrb $13, %r11d, %xmm0
-; SSE41-NEXT:    pinsrb $14, %r9d, %xmm0
-; SSE41-NEXT:    pinsrb $15, %r10d, %xmm0
-; SSE41-NEXT:    popq %rbx
-; SSE41-NEXT:    popq %r12
-; SSE41-NEXT:    popq %r13
-; SSE41-NEXT:    popq %r14
-; SSE41-NEXT:    popq %r15
-; SSE41-NEXT:    popq %rbp
+; SSE41-NEXT:    pinsrb $9, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    movzbl 10(%rdi), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $10, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    movzbl 11(%rdi), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $11, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    movzbl 12(%rdi), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $12, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    movzbl 13(%rdi), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $13, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    movzbl 14(%rdi), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $14, (%rax,%rcx), %xmm0
+; SSE41-NEXT:    movzbl 15(%rdi), %eax
+; SSE41-NEXT:    andl $15, %eax
+; SSE41-NEXT:    pinsrb $15, (%rax,%rcx), %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: mem_shuffle_v16i8_v16i8_xxxxxxxxxxxxxxxx_i8:
 ; AVX:       # BB#0:
-; AVX-NEXT:    pushq %rbp
-; AVX-NEXT:    pushq %r15
-; AVX-NEXT:    pushq %r14
-; AVX-NEXT:    pushq %r13
-; AVX-NEXT:    pushq %r12
-; AVX-NEXT:    pushq %rbx
-; AVX-NEXT:    movzbl (%rdi), %r11d
-; AVX-NEXT:    andl $15, %r11d
+; AVX-NEXT:    movzbl (%rdi), %eax
+; AVX-NEXT:    andl $15, %eax
 ; AVX-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
-; AVX-NEXT:    movzbl 1(%rdi), %r9d
-; AVX-NEXT:    andl $15, %r9d
+; AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    movzbl (%rax,%rcx), %eax
+; AVX-NEXT:    vmovd %eax, %xmm0
+; AVX-NEXT:    movzbl 1(%rdi), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $1, (%rax,%rcx), %xmm0, %xmm0
 ; AVX-NEXT:    movzbl 2(%rdi), %eax
 ; AVX-NEXT:    andl $15, %eax
-; AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
+; AVX-NEXT:    vpinsrb $2, (%rax,%rcx), %xmm0, %xmm0
 ; AVX-NEXT:    movzbl 3(%rdi), %eax
 ; AVX-NEXT:    andl $15, %eax
-; AVX-NEXT:    movq %rax, -{{[0-9]+}}(%rsp) # 8-byte Spill
-; AVX-NEXT:    movzbl 4(%rdi), %r14d
-; AVX-NEXT:    andl $15, %r14d
-; AVX-NEXT:    movzbl 5(%rdi), %r15d
-; AVX-NEXT:    andl $15, %r15d
-; AVX-NEXT:    movzbl 6(%rdi), %r12d
-; AVX-NEXT:    andl $15, %r12d
-; AVX-NEXT:    movzbl 7(%rdi), %r13d
-; AVX-NEXT:    andl $15, %r13d
-; AVX-NEXT:    movzbl 8(%rdi), %r8d
-; AVX-NEXT:    andl $15, %r8d
+; AVX-NEXT:    vpinsrb $3, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    movzbl 4(%rdi), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $4, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    movzbl 5(%rdi), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $5, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    movzbl 6(%rdi), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $6, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    movzbl 7(%rdi), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $7, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    movzbl 8(%rdi), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $8, (%rax,%rcx), %xmm0, %xmm0
 ; AVX-NEXT:    movzbl 9(%rdi), %eax
 ; AVX-NEXT:    andl $15, %eax
-; AVX-NEXT:    movzbl 10(%rdi), %ecx
-; AVX-NEXT:    andl $15, %ecx
-; AVX-NEXT:    movzbl 11(%rdi), %edx
-; AVX-NEXT:    andl $15, %edx
-; AVX-NEXT:    movzbl 12(%rdi), %esi
-; AVX-NEXT:    andl $15, %esi
-; AVX-NEXT:    leaq -{{[0-9]+}}(%rsp), %rbp
-; AVX-NEXT:    movzbl (%r11,%rbp), %ebx
-; AVX-NEXT:    vmovd %ebx, %xmm0
-; AVX-NEXT:    movzbl 13(%rdi), %r11d
-; AVX-NEXT:    andl $15, %r11d
-; AVX-NEXT:    vpinsrb $1, (%r9,%rbp), %xmm0, %xmm0
-; AVX-NEXT:    movzbl 14(%rdi), %ebx
-; AVX-NEXT:    andl $15, %ebx
-; AVX-NEXT:    movzbl 15(%rdi), %edi
-; AVX-NEXT:    andl $15, %edi
-; AVX-NEXT:    movzbl (%rdi,%rbp), %r10d
-; AVX-NEXT:    movzbl (%rbx,%rbp), %r9d
-; AVX-NEXT:    movzbl (%r11,%rbp), %r11d
-; AVX-NEXT:    movzbl (%rsi,%rbp), %esi
-; AVX-NEXT:    movzbl (%rdx,%rbp), %edx
-; AVX-NEXT:    movzbl (%rcx,%rbp), %ecx
-; AVX-NEXT:    movzbl (%rax,%rbp), %eax
-; AVX-NEXT:    movzbl (%r8,%rbp), %r8d
-; AVX-NEXT:    movzbl (%r13,%rbp), %r13d
-; AVX-NEXT:    movzbl (%r12,%rbp), %r12d
-; AVX-NEXT:    movzbl (%r15,%rbp), %r15d
-; AVX-NEXT:    movzbl (%r14,%rbp), %r14d
-; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rdi # 8-byte Reload
-; AVX-NEXT:    movzbl (%rdi,%rbp), %edi
-; AVX-NEXT:    movq -{{[0-9]+}}(%rsp), %rbx # 8-byte Reload
-; AVX-NEXT:    movzbl (%rbx,%rbp), %ebp
-; AVX-NEXT:    vpinsrb $2, %ebp, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $3, %edi, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $4, %r14d, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $5, %r15d, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $6, %r12d, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $7, %r13d, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $8, %r8d, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $9, %eax, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $11, %edx, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $12, %esi, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $13, %r11d, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $14, %r9d, %xmm0, %xmm0
-; AVX-NEXT:    vpinsrb $15, %r10d, %xmm0, %xmm0
-; AVX-NEXT:    popq %rbx
-; AVX-NEXT:    popq %r12
-; AVX-NEXT:    popq %r13
-; AVX-NEXT:    popq %r14
-; AVX-NEXT:    popq %r15
-; AVX-NEXT:    popq %rbp
+; AVX-NEXT:    vpinsrb $9, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    movzbl 10(%rdi), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $10, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    movzbl 11(%rdi), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $11, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    movzbl 12(%rdi), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $12, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    movzbl 13(%rdi), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $13, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    movzbl 14(%rdi), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $14, (%rax,%rcx), %xmm0, %xmm0
+; AVX-NEXT:    movzbl 15(%rdi), %eax
+; AVX-NEXT:    andl $15, %eax
+; AVX-NEXT:    vpinsrb $15, (%rax,%rcx), %xmm0, %xmm0
 ; AVX-NEXT:    retq
   %p0  = getelementptr inbounds i8, i8* %i, i64 0
   %p1  = getelementptr inbounds i8, i8* %i, i64 1
@@ -1331,26 +1227,26 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %
 ; SSE2-NEXT:    andl $7, %r8d
 ; SSE2-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
 ; SSE2-NEXT:    andl $7, %r9d
-; SSE2-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
-; SSE2-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
-; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
 ; SSE2-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    movd %eax, %xmm2
 ; SSE2-NEXT:    movzwl -24(%rsp,%r9,2), %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
 ; SSE2-NEXT:    movd %eax, %xmm3
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSE2-NEXT:    movzwl -40(%rsp,%rdi,2), %eax
-; SSE2-NEXT:    movzwl -40(%rsp,%rdx,2), %ecx
-; SSE2-NEXT:    movd %ecx, %xmm3
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSE2-NEXT:    movd %eax, %xmm0
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSE2-NEXT:    movzwl -40(%rsp,%rdx,2), %eax
+; SSE2-NEXT:    movd %eax, %xmm2
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; SSE2-NEXT:    movzwl -40(%rsp,%r8,2), %eax
 ; SSE2-NEXT:    movd %eax, %xmm1
+; SSE2-NEXT:    movzwl -40(%rsp,%rdi,2), %eax
+; SSE2-NEXT:    movd %eax, %xmm0
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; SSE2-NEXT:    retq
 ;
 ; SSSE3-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
@@ -1369,26 +1265,26 @@ define <8 x i16> @var_shuffle_v8i16_v8i16_xyxyxy00_i16(<8 x i16> %x, <8 x i16> %
 ; SSSE3-NEXT:    andl $7, %r8d
 ; SSSE3-NEXT:    movdqa %xmm1, -{{[0-9]+}}(%rsp)
 ; SSSE3-NEXT:    andl $7, %r9d
-; SSSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
-; SSSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm0
+; SSSE3-NEXT:    movzwl -24(%rsp,%rcx,2), %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
 ; SSSE3-NEXT:    pxor %xmm1, %xmm1
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT:    movd %eax, %xmm2
 ; SSSE3-NEXT:    movzwl -24(%rsp,%r9,2), %eax
+; SSSE3-NEXT:    movd %eax, %xmm2
+; SSSE3-NEXT:    movzwl -24(%rsp,%rsi,2), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm3
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3]
-; SSSE3-NEXT:    movzwl -40(%rsp,%rdi,2), %eax
-; SSSE3-NEXT:    movzwl -40(%rsp,%rdx,2), %ecx
-; SSSE3-NEXT:    movd %ecx, %xmm3
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; SSSE3-NEXT:    movd %eax, %xmm0
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3]
+; SSSE3-NEXT:    movzwl -40(%rsp,%rdx,2), %eax
+; SSSE3-NEXT:    movd %eax, %xmm2
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3]
 ; SSSE3-NEXT:    movzwl -40(%rsp,%r8,2), %eax
 ; SSSE3-NEXT:    movd %eax, %xmm1
+; SSSE3-NEXT:    movzwl -40(%rsp,%rdi,2), %eax
+; SSSE3-NEXT:    movd %eax, %xmm0
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3]
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; SSSE3-NEXT:    retq
 ;
 ; SSE41-LABEL: var_shuffle_v8i16_v8i16_xyxyxy00_i16:
diff --git a/test/CodeGen/X86/vector-shuffle-variable-256.ll b/test/CodeGen/X86/vector-shuffle-variable-256.ll
index 42b3c11d3d6bf817f6d8faa5ece65c50c31265b1..b076bc993ef8c2d49fdebd02ef4dea93e03771e0 100644
--- a/test/CodeGen/X86/vector-shuffle-variable-256.ll
+++ b/test/CodeGen/X86/vector-shuffle-variable-256.ll
@@ -1,4 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; NOTE: Assertions have been autogenerated by update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2
 
@@ -13,16 +14,16 @@ define <4 x double> @var_shuffle_v4f64_v4f64_xxxx_i64(<4 x double> %x, i64 %i0,
 ; ALL-NEXT:    movq %rsp, %rbp
 ; ALL-NEXT:    andq $-32, %rsp
 ; ALL-NEXT:    subq $64, %rsp
-; ALL-NEXT:    andl $3, %ecx
-; ALL-NEXT:    andl $3, %edx
 ; ALL-NEXT:    andl $3, %esi
 ; ALL-NEXT:    andl $3, %edi
+; ALL-NEXT:    andl $3, %ecx
+; ALL-NEXT:    andl $3, %edx
 ; ALL-NEXT:    vmovaps %ymm0, (%rsp)
 ; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; ALL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; ALL-NEXT:    movq %rbp, %rsp
 ; ALL-NEXT:    popq %rbp
 ; ALL-NEXT:    retq
@@ -68,16 +69,16 @@ define <4 x double> @var_shuffle_v4f64_v4f64_uxx0_i64(<4 x double> %x, i64 %i0,
 define <4 x double> @var_shuffle_v4f64_v2f64_xxxx_i64(<2 x double> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
 ; ALL-LABEL: var_shuffle_v4f64_v2f64_xxxx_i64:
 ; ALL:       # BB#0:
-; ALL-NEXT:    andl $1, %ecx
-; ALL-NEXT:    andl $1, %edx
 ; ALL-NEXT:    andl $1, %esi
 ; ALL-NEXT:    andl $1, %edi
+; ALL-NEXT:    andl $1, %ecx
+; ALL-NEXT:    andl $1, %edx
 ; ALL-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
 ; ALL-NEXT:    vmovsd {{.*#+}} xmm0 = mem[0],zero
 ; ALL-NEXT:    vmovhpd {{.*#+}} xmm0 = xmm0[0],mem[0]
 ; ALL-NEXT:    vmovsd {{.*#+}} xmm1 = mem[0],zero
 ; ALL-NEXT:    vmovhpd {{.*#+}} xmm1 = xmm1[0],mem[0]
-; ALL-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; ALL-NEXT:    retq
   %x0 = extractelement <2 x double> %x, i64 %i0
   %x1 = extractelement <2 x double> %x, i64 %i1
@@ -97,18 +98,18 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i
 ; AVX1-NEXT:    movq %rsp, %rbp
 ; AVX1-NEXT:    andq $-32, %rsp
 ; AVX1-NEXT:    subq $64, %rsp
-; AVX1-NEXT:    andl $3, %ecx
-; AVX1-NEXT:    andl $3, %edx
-; AVX1-NEXT:    andl $3, %esi
 ; AVX1-NEXT:    andl $3, %edi
+; AVX1-NEXT:    andl $3, %esi
+; AVX1-NEXT:    andl $3, %edx
+; AVX1-NEXT:    andl $3, %ecx
 ; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    movq %rbp, %rsp
 ; AVX1-NEXT:    popq %rbp
 ; AVX1-NEXT:    retq
@@ -119,18 +120,18 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64 %i0, i64 %i
 ; AVX2-NEXT:    movq %rsp, %rbp
 ; AVX2-NEXT:    andq $-32, %rsp
 ; AVX2-NEXT:    subq $64, %rsp
-; AVX2-NEXT:    andl $3, %ecx
-; AVX2-NEXT:    andl $3, %edx
-; AVX2-NEXT:    andl $3, %esi
 ; AVX2-NEXT:    andl $3, %edi
+; AVX2-NEXT:    andl $3, %esi
+; AVX2-NEXT:    andl $3, %edx
+; AVX2-NEXT:    andl $3, %ecx
 ; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX2-NEXT:    movq %rbp, %rsp
 ; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    retq
@@ -152,12 +153,12 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i
 ; AVX1-NEXT:    movq %rsp, %rbp
 ; AVX1-NEXT:    andq $-32, %rsp
 ; AVX1-NEXT:    subq $64, %rsp
-; AVX1-NEXT:    andl $3, %esi
 ; AVX1-NEXT:    andl $3, %edi
+; AVX1-NEXT:    andl $3, %esi
 ; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    movq %rbp, %rsp
@@ -170,12 +171,12 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i
 ; AVX2-NEXT:    movq %rsp, %rbp
 ; AVX2-NEXT:    andq $-32, %rsp
 ; AVX2-NEXT:    subq $64, %rsp
-; AVX2-NEXT:    andl $3, %esi
 ; AVX2-NEXT:    andl $3, %edi
+; AVX2-NEXT:    andl $3, %esi
 ; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-NEXT:    movq %rbp, %rsp
@@ -195,34 +196,34 @@ define <4 x i64> @var_shuffle_v4i64_v4i64_xx00_i64(<4 x i64> %x, i64 %i0, i64 %i
 define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i1, i64 %i2, i64 %i3) nounwind {
 ; AVX1-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    andl $1, %ecx
-; AVX1-NEXT:    andl $1, %edx
-; AVX1-NEXT:    andl $1, %esi
 ; AVX1-NEXT:    andl $1, %edi
+; AVX1-NEXT:    andl $1, %esi
+; AVX1-NEXT:    andl $1, %edx
+; AVX1-NEXT:    andl $1, %ecx
 ; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: var_shuffle_v4i64_v2i64_xxxx_i64:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    andl $1, %ecx
-; AVX2-NEXT:    andl $1, %edx
-; AVX2-NEXT:    andl $1, %esi
 ; AVX2-NEXT:    andl $1, %edi
+; AVX2-NEXT:    andl $1, %esi
+; AVX2-NEXT:    andl $1, %edx
+; AVX2-NEXT:    andl $1, %ecx
 ; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %x0 = extractelement <2 x i64> %x, i64 %i0
   %x1 = extractelement <2 x i64> %x, i64 %i1
@@ -236,70 +237,41 @@ define <4 x i64> @var_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64 %i0, i64 %i
 }
 
 define <8 x float> @var_shuffle_v8f32_v8f32_xxxxxxxx_i32(<8 x float> %x, i32 %i0, i32 %i1, i32 %i2, i32 %i3, i32 %i4, i32 %i5, i32 %i6, i32 %i7) nounwind {
-; AVX1-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
-; AVX1:       # BB#0:
-; AVX1-NEXT:    pushq %rbp
-; AVX1-NEXT:    movq %rsp, %rbp
-; AVX1-NEXT:    andq $-32, %rsp
-; AVX1-NEXT:    subq $64, %rsp
-; AVX1-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
-; AVX1-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
-; AVX1-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
-; AVX1-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
-; AVX1-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
-; AVX1-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
-; AVX1-NEXT:    andl $7, %edi
-; AVX1-NEXT:    andl $7, %esi
-; AVX1-NEXT:    andl $7, %edx
-; AVX1-NEXT:    andl $7, %ecx
-; AVX1-NEXT:    andl $7, %r8d
-; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
-; AVX1-NEXT:    andl $7, %r9d
-; AVX1-NEXT:    movl 16(%rbp), %r10d
-; AVX1-NEXT:    andl $7, %r10d
-; AVX1-NEXT:    movl 24(%rbp), %eax
-; AVX1-NEXT:    andl $7, %eax
-; AVX1-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
-; AVX1-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
-; AVX1-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT:    movq %rbp, %rsp
-; AVX1-NEXT:    popq %rbp
-; AVX1-NEXT:    retq
-;
-; AVX2-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
-; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovd %edi, %xmm1
-; AVX2-NEXT:    vpermps %ymm0, %ymm1, %ymm1
-; AVX2-NEXT:    vmovd %esi, %xmm2
-; AVX2-NEXT:    vpermps %ymm0, %ymm2, %ymm2
-; AVX2-NEXT:    vmovd %edx, %xmm3
-; AVX2-NEXT:    vpermps %ymm0, %ymm3, %ymm3
-; AVX2-NEXT:    vmovd %ecx, %xmm4
-; AVX2-NEXT:    vpermps %ymm0, %ymm4, %ymm4
-; AVX2-NEXT:    vmovd %r8d, %xmm5
-; AVX2-NEXT:    vpermps %ymm0, %ymm5, %ymm5
-; AVX2-NEXT:    vmovd %r9d, %xmm6
-; AVX2-NEXT:    vpermps %ymm0, %ymm6, %ymm6
-; AVX2-NEXT:    vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpermps %ymm0, %ymm7, %ymm7
-; AVX2-NEXT:    vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero
-; AVX2-NEXT:    vpermps %ymm0, %ymm8, %ymm0
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[2,3]
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm5 = xmm5[0,1],xmm7[0],xmm5[3]
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm0 = xmm5[0,1,2],xmm0[0]
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3]
-; AVX2-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
-; AVX2-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX2-NEXT:    retq
+; ALL-LABEL: var_shuffle_v8f32_v8f32_xxxxxxxx_i32:
+; ALL:       # BB#0:
+; ALL-NEXT:    pushq %rbp
+; ALL-NEXT:    movq %rsp, %rbp
+; ALL-NEXT:    andq $-32, %rsp
+; ALL-NEXT:    subq $64, %rsp
+; ALL-NEXT:    # kill: %R9D<def> %R9D<kill> %R9<def>
+; ALL-NEXT:    # kill: %R8D<def> %R8D<kill> %R8<def>
+; ALL-NEXT:    # kill: %ECX<def> %ECX<kill> %RCX<def>
+; ALL-NEXT:    # kill: %EDX<def> %EDX<kill> %RDX<def>
+; ALL-NEXT:    # kill: %ESI<def> %ESI<kill> %RSI<def>
+; ALL-NEXT:    # kill: %EDI<def> %EDI<kill> %RDI<def>
+; ALL-NEXT:    andl $7, %edi
+; ALL-NEXT:    andl $7, %esi
+; ALL-NEXT:    andl $7, %edx
+; ALL-NEXT:    andl $7, %ecx
+; ALL-NEXT:    andl $7, %r8d
+; ALL-NEXT:    vmovaps %ymm0, (%rsp)
+; ALL-NEXT:    andl $7, %r9d
+; ALL-NEXT:    movl 16(%rbp), %r10d
+; ALL-NEXT:    andl $7, %r10d
+; ALL-NEXT:    movl 24(%rbp), %eax
+; ALL-NEXT:    andl $7, %eax
+; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
+; ALL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; ALL-NEXT:    movq %rbp, %rsp
+; ALL-NEXT:    popq %rbp
+; ALL-NEXT:    retq
   %x0 = extractelement <8 x float> %x, i32 %i0
   %x1 = extractelement <8 x float> %x, i32 %i1
   %x2 = extractelement <8 x float> %x, i32 %i2
@@ -340,16 +312,14 @@ define <8 x float> @var_shuffle_v8f32_v4f32_xxxxxxxx_i32(<4 x float> %x, i32 %i0
 ; ALL-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; ALL-NEXT:    andl $3, %eax
 ; ALL-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
 ; ALL-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; ALL-NEXT:    vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
-; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1],mem[0],xmm2[3]
-; ALL-NEXT:    vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],mem[0]
-; ALL-NEXT:    vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; ALL-NEXT:    vinsertps {{.*#+}} xmm3 = xmm3[0],mem[0],xmm3[2,3]
-; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm3[0,1],xmm0[0],xmm3[3]
-; ALL-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
-; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3]
+; ALL-NEXT:    vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],mem[0]
+; ALL-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; ALL-NEXT:    retq
   %x0 = extractelement <4 x float> %x, i32 %i0
   %x1 = extractelement <4 x float> %x, i32 %i1
@@ -390,32 +360,25 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x,
 ; AVX1-NEXT:    vmovd %eax, %xmm0
 ; AVX1-NEXT:    movl 40(%rbp), %eax
 ; AVX1-NEXT:    andl $15, %eax
-; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX1-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
 ; AVX1-NEXT:    movl 48(%rbp), %eax
 ; AVX1-NEXT:    andl $15, %eax
-; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX1-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
 ; AVX1-NEXT:    movl 56(%rbp), %eax
 ; AVX1-NEXT:    andl $15, %eax
-; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX1-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
 ; AVX1-NEXT:    movl 64(%rbp), %eax
 ; AVX1-NEXT:    andl $15, %eax
-; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX1-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
 ; AVX1-NEXT:    movl 72(%rbp), %eax
 ; AVX1-NEXT:    andl $15, %eax
-; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX1-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
 ; AVX1-NEXT:    movl 80(%rbp), %eax
 ; AVX1-NEXT:    andl $15, %eax
-; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX1-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
 ; AVX1-NEXT:    movl 88(%rbp), %eax
 ; AVX1-NEXT:    andl $15, %eax
-; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX1-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
 ; AVX1-NEXT:    andl $15, %edi
 ; AVX1-NEXT:    movzwl (%rsp,%rdi,2), %eax
 ; AVX1-NEXT:    vmovd %eax, %xmm1
@@ -431,12 +394,10 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x,
 ; AVX1-NEXT:    vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1
 ; AVX1-NEXT:    movl 16(%rbp), %eax
 ; AVX1-NEXT:    andl $15, %eax
-; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX1-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT:    vpinsrw $6, (%rsp,%rax,2), %xmm1, %xmm1
 ; AVX1-NEXT:    movl 24(%rbp), %eax
 ; AVX1-NEXT:    andl $15, %eax
-; AVX1-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX1-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT:    vpinsrw $7, (%rsp,%rax,2), %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    movq %rbp, %rsp
 ; AVX1-NEXT:    popq %rbp
@@ -461,32 +422,25 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x,
 ; AVX2-NEXT:    vmovd %eax, %xmm0
 ; AVX2-NEXT:    movl 40(%rbp), %eax
 ; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX2-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $1, (%rsp,%rax,2), %xmm0, %xmm0
 ; AVX2-NEXT:    movl 48(%rbp), %eax
 ; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX2-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $2, (%rsp,%rax,2), %xmm0, %xmm0
 ; AVX2-NEXT:    movl 56(%rbp), %eax
 ; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX2-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $3, (%rsp,%rax,2), %xmm0, %xmm0
 ; AVX2-NEXT:    movl 64(%rbp), %eax
 ; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX2-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $4, (%rsp,%rax,2), %xmm0, %xmm0
 ; AVX2-NEXT:    movl 72(%rbp), %eax
 ; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX2-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $5, (%rsp,%rax,2), %xmm0, %xmm0
 ; AVX2-NEXT:    movl 80(%rbp), %eax
 ; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX2-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $6, (%rsp,%rax,2), %xmm0, %xmm0
 ; AVX2-NEXT:    movl 88(%rbp), %eax
 ; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX2-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $7, (%rsp,%rax,2), %xmm0, %xmm0
 ; AVX2-NEXT:    andl $15, %edi
 ; AVX2-NEXT:    movzwl (%rsp,%rdi,2), %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
@@ -502,12 +456,10 @@ define <16 x i16> @var_shuffle_v16i16_v16i16_xxxxxxxxxxxxxxxx_i16(<16 x i16> %x,
 ; AVX2-NEXT:    vpinsrw $5, (%rsp,%r9,2), %xmm1, %xmm1
 ; AVX2-NEXT:    movl 16(%rbp), %eax
 ; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX2-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    vpinsrw $6, (%rsp,%rax,2), %xmm1, %xmm1
 ; AVX2-NEXT:    movl 24(%rbp), %eax
 ; AVX2-NEXT:    andl $15, %eax
-; AVX2-NEXT:    movzwl (%rsp,%rax,2), %eax
-; AVX2-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    vpinsrw $7, (%rsp,%rax,2), %xmm1, %xmm1
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX2-NEXT:    movq %rbp, %rsp
 ; AVX2-NEXT:    popq %rbp
@@ -563,32 +515,25 @@ define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i
 ; AVX1-NEXT:    vmovd %eax, %xmm0
 ; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX1-NEXT:    andl $7, %eax
-; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; AVX1-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $1, -24(%rsp,%rax,2), %xmm0, %xmm0
 ; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX1-NEXT:    andl $7, %eax
-; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; AVX1-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $2, -24(%rsp,%rax,2), %xmm0, %xmm0
 ; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX1-NEXT:    andl $7, %eax
-; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; AVX1-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $3, -24(%rsp,%rax,2), %xmm0, %xmm0
 ; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX1-NEXT:    andl $7, %eax
-; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; AVX1-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $4, -24(%rsp,%rax,2), %xmm0, %xmm0
 ; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX1-NEXT:    andl $7, %eax
-; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; AVX1-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
 ; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX1-NEXT:    andl $7, %eax
-; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; AVX1-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0
 ; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX1-NEXT:    andl $7, %eax
-; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; AVX1-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX1-NEXT:    vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0
 ; AVX1-NEXT:    andl $7, %edi
 ; AVX1-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
 ; AVX1-NEXT:    vmovd %eax, %xmm1
@@ -604,12 +549,10 @@ define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i
 ; AVX1-NEXT:    vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1
 ; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX1-NEXT:    andl $7, %eax
-; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; AVX1-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
+; AVX1-NEXT:    vpinsrw $6, -24(%rsp,%rax,2), %xmm1, %xmm1
 ; AVX1-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX1-NEXT:    andl $7, %eax
-; AVX1-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; AVX1-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX1-NEXT:    vpinsrw $7, -24(%rsp,%rax,2), %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -628,32 +571,25 @@ define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i
 ; AVX2-NEXT:    vmovd %eax, %xmm0
 ; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX2-NEXT:    andl $7, %eax
-; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; AVX2-NEXT:    vpinsrw $1, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $1, -24(%rsp,%rax,2), %xmm0, %xmm0
 ; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX2-NEXT:    andl $7, %eax
-; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; AVX2-NEXT:    vpinsrw $2, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $2, -24(%rsp,%rax,2), %xmm0, %xmm0
 ; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX2-NEXT:    andl $7, %eax
-; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; AVX2-NEXT:    vpinsrw $3, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $3, -24(%rsp,%rax,2), %xmm0, %xmm0
 ; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX2-NEXT:    andl $7, %eax
-; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; AVX2-NEXT:    vpinsrw $4, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $4, -24(%rsp,%rax,2), %xmm0, %xmm0
 ; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX2-NEXT:    andl $7, %eax
-; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; AVX2-NEXT:    vpinsrw $5, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $5, -24(%rsp,%rax,2), %xmm0, %xmm0
 ; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX2-NEXT:    andl $7, %eax
-; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; AVX2-NEXT:    vpinsrw $6, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $6, -24(%rsp,%rax,2), %xmm0, %xmm0
 ; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX2-NEXT:    andl $7, %eax
-; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; AVX2-NEXT:    vpinsrw $7, %eax, %xmm0, %xmm0
+; AVX2-NEXT:    vpinsrw $7, -24(%rsp,%rax,2), %xmm0, %xmm0
 ; AVX2-NEXT:    andl $7, %edi
 ; AVX2-NEXT:    movzwl -24(%rsp,%rdi,2), %eax
 ; AVX2-NEXT:    vmovd %eax, %xmm1
@@ -669,12 +605,10 @@ define <16 x i16> @var_shuffle_v16i16_v8i16_xxxxxxxxxxxxxxxx_i16(<8 x i16> %x, i
 ; AVX2-NEXT:    vpinsrw $5, -24(%rsp,%r9,2), %xmm1, %xmm1
 ; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX2-NEXT:    andl $7, %eax
-; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; AVX2-NEXT:    vpinsrw $6, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    vpinsrw $6, -24(%rsp,%rax,2), %xmm1, %xmm1
 ; AVX2-NEXT:    movl {{[0-9]+}}(%rsp), %eax
 ; AVX2-NEXT:    andl $7, %eax
-; AVX2-NEXT:    movzwl -24(%rsp,%rax,2), %eax
-; AVX2-NEXT:    vpinsrw $7, %eax, %xmm1, %xmm1
+; AVX2-NEXT:    vpinsrw $7, -24(%rsp,%rax,2), %xmm1, %xmm1
 ; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %x0  = extractelement <8 x i16> %x, i32 %i0
@@ -734,11 +668,11 @@ define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwi
 ; AVX1-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    movq %rbp, %rsp
 ; AVX1-NEXT:    popq %rbp
 ; AVX1-NEXT:    retq
@@ -760,11 +694,11 @@ define <4 x i64> @mem_shuffle_v4i64_v4i64_xxxx_i64(<4 x i64> %x, i64* %i) nounwi
 ; AVX2-NEXT:    vmovaps %ymm0, (%rsp)
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX2-NEXT:    movq %rbp, %rsp
 ; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    retq
@@ -801,11 +735,11 @@ define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwi
 ; AVX1-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX1-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: mem_shuffle_v4i64_v2i64_xxxx_i64:
@@ -821,11 +755,11 @@ define <4 x i64> @mem_shuffle_v4i64_v2i64_xxxx_i64(<2 x i64> %x, i64* %i) nounwi
 ; AVX2-NEXT:    vmovaps %xmm0, -{{[0-9]+}}(%rsp)
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vmovq {{.*#+}} xmm1 = mem[0],zero
 ; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
 ; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX2-NEXT:    vmovq {{.*#+}} xmm2 = mem[0],zero
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
+; AVX2-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
 ; AVX2-NEXT:    retq
   %p0  = getelementptr inbounds i64, i64* %i, i32 0
   %p1  = getelementptr inbounds i64, i64* %i, i32 1
diff --git a/test/CodeGen/X86/vector-trunc-math.ll b/test/CodeGen/X86/vector-trunc-math.ll
index d6b2b72fb367e1650dbd2c93bcd8382e18e19a4e..ab34ad6a613cc1e69a20f88587cd1dda7bd26d28 100644
--- a/test/CodeGen/X86/vector-trunc-math.ll
+++ b/test/CodeGen/X86/vector-trunc-math.ll
@@ -42,6 +42,7 @@ define <4 x i32> @trunc_add_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = add <4 x i64> %a0, %a1
   %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -99,7 +100,7 @@ define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -109,6 +110,7 @@ define <8 x i16> @trunc_add_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpaddq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = add <8 x i64> %a0, %a1
   %2 = trunc <8 x i64> %1 to <8 x i16>
@@ -143,7 +145,7 @@ define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX2-LABEL: trunc_add_v8i32_v8i16:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -154,6 +156,7 @@ define <8 x i16> @trunc_add_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX512-NEXT:    vpaddd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = add <8 x i32> %a0, %a1
   %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -237,7 +240,7 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -262,6 +265,7 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_add_v16i64_v16i8:
@@ -272,6 +276,7 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_add_v16i64_v16i8:
@@ -282,6 +287,7 @@ define <16 x i8> @trunc_add_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = add <16 x i64> %a0, %a1
   %2 = trunc <16 x i64> %1 to <16 x i8>
@@ -330,7 +336,7 @@ define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpaddd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -346,6 +352,7 @@ define <16 x i8> @trunc_add_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpaddd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = add <16 x i32> %a0, %a1
   %2 = trunc <16 x i32> %1 to <16 x i8>
@@ -392,6 +399,7 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512F-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_add_v16i16_v16i8:
@@ -399,6 +407,7 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512BW-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_add_v16i16_v16i8:
@@ -406,6 +415,7 @@ define <16 x i8> @trunc_add_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512DQ-NEXT:    vpaddw %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = add <16 x i16> %a0, %a1
   %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -440,7 +450,7 @@ define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
 ; AVX2-LABEL: trunc_add_v8i32_v8i16_sext_8i8:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpmovsxbw %xmm0, %xmm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
@@ -452,6 +462,7 @@ define <8 x i16> @trunc_add_v8i32_v8i16_sext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
 ; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
 ; AVX512-NEXT:    vpmovsxbw %xmm0, %xmm0
 ; AVX512-NEXT:    vpaddw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %2 = sext <8 x i8> %1 to <8 x i32>
@@ -492,6 +503,7 @@ define <4 x i32> @trunc_add_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
 ; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT:    vpaddd {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = add <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
   %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -539,7 +551,7 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
@@ -549,6 +561,7 @@ define <8 x i16> @trunc_add_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
 ; AVX512-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = add <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
   %2 = trunc <8 x i64> %1 to <8 x i16>
@@ -579,7 +592,7 @@ define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
 ;
 ; AVX2-LABEL: trunc_add_const_v8i32_v8i16:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
@@ -590,6 +603,7 @@ define <8 x i16> @trunc_add_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
 ; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512-NEXT:    vpaddw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = add <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -621,22 +635,22 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX1-LABEL: trunc_add_const_v16i64_v16i8:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
@@ -651,7 +665,7 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -676,6 +690,7 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_add_const_v16i64_v16i8:
@@ -685,6 +700,7 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512BW-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_add_const_v16i64_v16i8:
@@ -694,6 +710,7 @@ define <16 x i8> @trunc_add_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = add <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   %2 = trunc <16 x i64> %1 to <16 x i8>
@@ -717,13 +734,13 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX1-LABEL: trunc_add_const_v16i32_v16i8:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
@@ -732,7 +749,7 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ;
 ; AVX2-LABEL: trunc_add_const_v16i32_v16i8:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -749,6 +766,7 @@ define <16 x i8> @trunc_add_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = add <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %2 = trunc <16 x i32> %1 to <16 x i8>
@@ -792,6 +810,7 @@ define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_add_const_v16i16_v16i8:
@@ -799,6 +818,7 @@ define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_add_const_v16i16_v16i8:
@@ -806,6 +826,7 @@ define <16 x i8> @trunc_add_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpaddb {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = add <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
   %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -848,6 +869,7 @@ define <4 x i32> @trunc_sub_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; AVX512-NEXT:    vpsubq %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = sub <4 x i64> %a0, %a1
   %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -905,7 +927,7 @@ define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -915,6 +937,7 @@ define <8 x i16> @trunc_sub_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpsubq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = sub <8 x i64> %a0, %a1
   %2 = trunc <8 x i64> %1 to <8 x i16>
@@ -949,7 +972,7 @@ define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX2-LABEL: trunc_sub_v8i32_v8i16:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -960,6 +983,7 @@ define <8 x i16> @trunc_sub_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX512-NEXT:    vpsubd %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = sub <8 x i32> %a0, %a1
   %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -1043,7 +1067,7 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -1068,6 +1092,7 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_sub_v16i64_v16i8:
@@ -1078,6 +1103,7 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_sub_v16i64_v16i8:
@@ -1088,6 +1114,7 @@ define <16 x i8> @trunc_sub_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = sub <16 x i64> %a0, %a1
   %2 = trunc <16 x i64> %1 to <16 x i8>
@@ -1136,7 +1163,7 @@ define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpsubd %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpsubd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -1152,6 +1179,7 @@ define <16 x i8> @trunc_sub_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpsubd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = sub <16 x i32> %a0, %a1
   %2 = trunc <16 x i32> %1 to <16 x i8>
@@ -1198,6 +1226,7 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512F-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_sub_v16i16_v16i8:
@@ -1205,6 +1234,7 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512BW-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_sub_v16i16_v16i8:
@@ -1212,6 +1242,7 @@ define <16 x i8> @trunc_sub_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512DQ-NEXT:    vpsubw %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = sub <16 x i16> %a0, %a1
   %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -1259,6 +1290,7 @@ define <4 x i32> @trunc_sub_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
 ; AVX512-NEXT:    vpsubq {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = sub <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
   %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -1320,7 +1352,7 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -1330,6 +1362,7 @@ define <8 x i16> @trunc_sub_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpsubq {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = sub <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
   %2 = trunc <8 x i64> %1 to <8 x i16>
@@ -1363,7 +1396,7 @@ define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
 ; AVX2-LABEL: trunc_sub_const_v8i32_v8i16:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpsubd {{.*}}(%rip), %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -1374,6 +1407,7 @@ define <8 x i16> @trunc_sub_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
 ; AVX512-NEXT:    vpsubd {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = sub <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -1459,7 +1493,7 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -1484,6 +1518,7 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_sub_const_v16i64_v16i8:
@@ -1494,6 +1529,7 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_sub_const_v16i64_v16i8:
@@ -1504,6 +1540,7 @@ define <16 x i8> @trunc_sub_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = sub <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   %2 = trunc <16 x i64> %1 to <16 x i8>
@@ -1550,7 +1587,7 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpsubd {{.*}}(%rip), %ymm0, %ymm0
 ; AVX2-NEXT:    vpsubd {{.*}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -1566,6 +1603,7 @@ define <16 x i8> @trunc_sub_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpsubd {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = sub <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %2 = trunc <16 x i32> %1 to <16 x i8>
@@ -1611,6 +1649,7 @@ define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512F-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_sub_const_v16i16_v16i8:
@@ -1618,6 +1657,7 @@ define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512BW-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_sub_const_v16i16_v16i8:
@@ -1625,6 +1665,7 @@ define <16 x i8> @trunc_sub_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512DQ-NEXT:    vpsubw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = sub <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
   %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -1688,6 +1729,7 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512F-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512F-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_mul_v4i64_v4i32:
@@ -1697,6 +1739,7 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512BW-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512BW-NEXT:    vpmulld %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_mul_v4i64_v4i32:
@@ -1706,6 +1749,7 @@ define <4 x i32> @trunc_mul_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512DQ-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = mul <4 x i64> %a0, %a1
   %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -1773,7 +1817,7 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm0 = ymm0[0,2,2,3,4,6,6,7]
@@ -1792,6 +1836,7 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; AVX512F-NEXT:    vpmovqw %zmm1, %xmm1
 ; AVX512F-NEXT:    vpmovqw %zmm0, %xmm0
 ; AVX512F-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_mul_v8i64_v8i16:
@@ -1799,12 +1844,14 @@ define <8 x i16> @trunc_mul_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; AVX512BW-NEXT:    vpmovqw %zmm1, %xmm1
 ; AVX512BW-NEXT:    vpmovqw %zmm0, %xmm0
 ; AVX512BW-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_mul_v8i64_v8i16:
 ; AVX512DQ:       # BB#0:
 ; AVX512DQ-NEXT:    vpmullq %zmm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = mul <8 x i64> %a0, %a1
   %2 = trunc <8 x i64> %1 to <8 x i16>
@@ -1851,7 +1898,7 @@ define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX2-LABEL: trunc_mul_v8i32_v8i16:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -1862,6 +1909,7 @@ define <8 x i16> @trunc_mul_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX512-NEXT:    vpmulld %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = mul <8 x i32> %a0, %a1
   %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -2083,7 +2131,7 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
 ; AVX2-NEXT:    vpmulld %xmm6, %xmm2, %xmm2
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm6 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -2116,6 +2164,7 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX512F-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_mul_v16i64_v16i8:
@@ -2128,6 +2177,7 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX512BW-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_mul_v16i64_v16i8:
@@ -2138,6 +2188,7 @@ define <16 x i8> @trunc_mul_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = mul <16 x i64> %a0, %a1
   %2 = trunc <16 x i64> %1 to <16 x i8>
@@ -2210,7 +2261,7 @@ define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpmulld %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpmulld %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -2226,6 +2277,7 @@ define <16 x i8> @trunc_mul_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpmulld %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = mul <16 x i32> %a0, %a1
   %2 = trunc <16 x i32> %1 to <16 x i8>
@@ -2272,6 +2324,7 @@ define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512F-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_mul_v16i16_v16i8:
@@ -2279,6 +2332,7 @@ define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512BW-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_mul_v16i16_v16i8:
@@ -2286,6 +2340,7 @@ define <16 x i8> @trunc_mul_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512DQ-NEXT:    vpmullw %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = mul <16 x i16> %a0, %a1
   %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -2320,7 +2375,7 @@ define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
 ; AVX2-LABEL: trunc_mul_v8i32_v8i16_zext_8i8:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm1[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
@@ -2332,6 +2387,7 @@ define <8 x i16> @trunc_mul_v8i32_v8i16_zext_8i8(<16 x i8> %a0, <8 x i32> %a1) {
 ; AVX512-NEXT:    vpmovdw %zmm1, %ymm1
 ; AVX512-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
 ; AVX512-NEXT:    vpmullw %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = shufflevector <16 x i8> %a0, <16 x i8> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %2 = zext <8 x i8> %1 to <8 x i32>
@@ -2387,6 +2443,7 @@ define <4 x i32> @trunc_mul_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
 ; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT:    vpmulld {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = mul <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
   %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -2434,7 +2491,7 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
@@ -2444,6 +2501,7 @@ define <8 x i16> @trunc_mul_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
 ; AVX512-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = mul <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
   %2 = trunc <8 x i64> %1 to <8 x i16>
@@ -2474,7 +2532,7 @@ define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
 ;
 ; AVX2-LABEL: trunc_mul_const_v8i32_v8i16:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
@@ -2485,6 +2543,7 @@ define <8 x i16> @trunc_mul_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
 ; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512-NEXT:    vpmullw {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = mul <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -2654,7 +2713,7 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
 ; AVX2-NEXT:    vpmulld {{.*}}(%rip), %xmm3, %xmm3
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -2681,6 +2740,7 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX512F-NEXT:    vpmulld {{.*}}(%rip), %ymm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_mul_const_v16i64_v16i8:
@@ -2691,6 +2751,7 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX512BW-NEXT:    vpmulld {{.*}}(%rip), %ymm1, %ymm1
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_mul_const_v16i64_v16i8:
@@ -2701,6 +2762,7 @@ define <16 x i8> @trunc_mul_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX512DQ-NEXT:    vpmulld {{.*}}(%rip), %ymm1, %ymm1
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = mul <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   %2 = trunc <16 x i64> %1 to <16 x i8>
@@ -2773,7 +2835,7 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ;
 ; AVX2-LABEL: trunc_mul_const_v16i32_v16i8:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vpmullw {{.*}}(%rip), %xmm1, %xmm1
@@ -2791,6 +2853,7 @@ define <16 x i8> @trunc_mul_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpmulld {{.*}}(%rip), %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = mul <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %2 = trunc <16 x i32> %1 to <16 x i8>
@@ -2836,6 +2899,7 @@ define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512F-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_mul_const_v16i16_v16i8:
@@ -2843,6 +2907,7 @@ define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512BW-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_mul_const_v16i16_v16i8:
@@ -2850,6 +2915,7 @@ define <16 x i8> @trunc_mul_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512DQ-NEXT:    vpmullw {{.*}}(%rip), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = mul <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
   %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -2890,6 +2956,7 @@ define <4 x i32> @trunc_and_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = and <4 x i64> %a0, %a1
   %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -2943,7 +3010,7 @@ define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -2953,6 +3020,7 @@ define <8 x i16> @trunc_and_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpandq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = and <8 x i64> %a0, %a1
   %2 = trunc <8 x i64> %1 to <8 x i16>
@@ -2985,7 +3053,7 @@ define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX2-LABEL: trunc_and_v8i32_v8i16:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpand %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -2996,6 +3064,7 @@ define <8 x i16> @trunc_and_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX512-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = and <8 x i32> %a0, %a1
   %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -3071,7 +3140,7 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -3096,6 +3165,7 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_and_v16i64_v16i8:
@@ -3106,6 +3176,7 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_and_v16i64_v16i8:
@@ -3116,6 +3187,7 @@ define <16 x i8> @trunc_and_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = and <16 x i64> %a0, %a1
   %2 = trunc <16 x i64> %1 to <16 x i8>
@@ -3160,7 +3232,7 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpand %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -3176,6 +3248,7 @@ define <16 x i8> @trunc_and_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpandd %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = and <16 x i32> %a0, %a1
   %2 = trunc <16 x i32> %1 to <16 x i8>
@@ -3220,6 +3293,7 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512F-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_and_v16i16_v16i8:
@@ -3227,6 +3301,7 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512BW-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_and_v16i16_v16i8:
@@ -3234,6 +3309,7 @@ define <16 x i8> @trunc_and_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512DQ-NEXT:    vpand %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = and <16 x i16> %a0, %a1
   %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -3272,6 +3348,7 @@ define <4 x i32> @trunc_and_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
 ; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = and <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
   %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -3319,7 +3396,7 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
@@ -3329,6 +3406,7 @@ define <8 x i16> @trunc_and_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
 ; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = and <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
   %2 = trunc <8 x i64> %1 to <8 x i16>
@@ -3359,7 +3437,7 @@ define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
 ;
 ; AVX2-LABEL: trunc_and_const_v8i32_v8i16:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
@@ -3370,6 +3448,7 @@ define <8 x i16> @trunc_and_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
 ; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = and <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -3401,22 +3480,22 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX1-LABEL: trunc_and_const_v16i64_v16i8:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
@@ -3431,7 +3510,7 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -3456,6 +3535,7 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_and_const_v16i64_v16i8:
@@ -3465,6 +3545,7 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_and_const_v16i64_v16i8:
@@ -3474,6 +3555,7 @@ define <16 x i8> @trunc_and_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = and <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   %2 = trunc <16 x i64> %1 to <16 x i8>
@@ -3497,13 +3579,13 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX1-LABEL: trunc_and_const_v16i32_v16i8:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
@@ -3512,7 +3594,7 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ;
 ; AVX2-LABEL: trunc_and_const_v16i32_v16i8:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -3529,6 +3611,7 @@ define <16 x i8> @trunc_and_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = and <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %2 = trunc <16 x i32> %1 to <16 x i8>
@@ -3572,6 +3655,7 @@ define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_and_const_v16i16_v16i8:
@@ -3579,6 +3663,7 @@ define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_and_const_v16i16_v16i8:
@@ -3586,6 +3671,7 @@ define <16 x i8> @trunc_and_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = and <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
   %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -3626,6 +3712,7 @@ define <4 x i32> @trunc_xor_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = xor <4 x i64> %a0, %a1
   %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -3679,7 +3766,7 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -3689,6 +3776,7 @@ define <8 x i16> @trunc_xor_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = xor <8 x i64> %a0, %a1
   %2 = trunc <8 x i64> %1 to <8 x i16>
@@ -3721,7 +3809,7 @@ define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX2-LABEL: trunc_xor_v8i32_v8i16:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -3732,6 +3820,7 @@ define <8 x i16> @trunc_xor_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = xor <8 x i32> %a0, %a1
   %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -3807,7 +3896,7 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -3832,6 +3921,7 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_xor_v16i64_v16i8:
@@ -3842,6 +3932,7 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_xor_v16i64_v16i8:
@@ -3852,6 +3943,7 @@ define <16 x i8> @trunc_xor_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwin
 ; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = xor <16 x i64> %a0, %a1
   %2 = trunc <16 x i64> %1 to <16 x i8>
@@ -3896,7 +3988,7 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpxor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpxor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -3912,6 +4004,7 @@ define <16 x i8> @trunc_xor_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwin
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpxord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = xor <16 x i32> %a0, %a1
   %2 = trunc <16 x i32> %1 to <16 x i8>
@@ -3956,6 +4049,7 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512F-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_xor_v16i16_v16i8:
@@ -3963,6 +4057,7 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512BW-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_xor_v16i16_v16i8:
@@ -3970,6 +4065,7 @@ define <16 x i8> @trunc_xor_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwin
 ; AVX512DQ-NEXT:    vpxor %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = xor <16 x i16> %a0, %a1
   %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -4008,6 +4104,7 @@ define <4 x i32> @trunc_xor_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
 ; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = xor <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
   %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -4055,7 +4152,7 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
@@ -4065,6 +4162,7 @@ define <8 x i16> @trunc_xor_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
 ; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = xor <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
   %2 = trunc <8 x i64> %1 to <8 x i16>
@@ -4095,7 +4193,7 @@ define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
 ;
 ; AVX2-LABEL: trunc_xor_const_v8i32_v8i16:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
@@ -4106,6 +4204,7 @@ define <8 x i16> @trunc_xor_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
 ; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = xor <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -4137,22 +4236,22 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX1-LABEL: trunc_xor_const_v16i64_v16i8:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
@@ -4167,7 +4266,7 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -4192,6 +4291,7 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_xor_const_v16i64_v16i8:
@@ -4201,6 +4301,7 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512BW-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_xor_const_v16i64_v16i8:
@@ -4210,6 +4311,7 @@ define <16 x i8> @trunc_xor_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = xor <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   %2 = trunc <16 x i64> %1 to <16 x i8>
@@ -4233,13 +4335,13 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX1-LABEL: trunc_xor_const_v16i32_v16i8:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
@@ -4248,7 +4350,7 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ;
 ; AVX2-LABEL: trunc_xor_const_v16i32_v16i8:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -4265,6 +4367,7 @@ define <16 x i8> @trunc_xor_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = xor <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %2 = trunc <16 x i32> %1 to <16 x i8>
@@ -4308,6 +4411,7 @@ define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_xor_const_v16i16_v16i8:
@@ -4315,6 +4419,7 @@ define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_xor_const_v16i16_v16i8:
@@ -4322,6 +4427,7 @@ define <16 x i8> @trunc_xor_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpxor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = xor <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
   %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -4362,6 +4468,7 @@ define <4 x i32> @trunc_or_v4i64_v4i32(<4 x i64> %a0, <4 x i64> %a1) nounwind {
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = or <4 x i64> %a0, %a1
   %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -4415,7 +4522,7 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -4425,6 +4532,7 @@ define <8 x i16> @trunc_or_v8i64_v8i16(<8 x i64> %a0, <8 x i64> %a1) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vporq %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = or <8 x i64> %a0, %a1
   %2 = trunc <8 x i64> %1 to <8 x i16>
@@ -4457,7 +4565,7 @@ define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX2-LABEL: trunc_or_v8i32_v8i16:
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -4468,6 +4576,7 @@ define <8 x i16> @trunc_or_v8i32_v8i16(<8 x i32> %a0, <8 x i32> %a1) nounwind {
 ; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = or <8 x i32> %a0, %a1
   %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -4543,7 +4652,7 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -4568,6 +4677,7 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind
 ; AVX512F-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_or_v16i64_v16i8:
@@ -4578,6 +4688,7 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind
 ; AVX512BW-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_or_v16i64_v16i8:
@@ -4588,6 +4699,7 @@ define <16 x i8> @trunc_or_v16i64_v16i8(<16 x i64> %a0, <16 x i64> %a1) nounwind
 ; AVX512DQ-NEXT:    vpmovqd %zmm1, %ymm1
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = or <16 x i64> %a0, %a1
   %2 = trunc <16 x i64> %1 to <16 x i8>
@@ -4632,7 +4744,7 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind
 ; AVX2:       # BB#0:
 ; AVX2-NEXT:    vpor %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpor %ymm3, %ymm1, %ymm1
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -4648,6 +4760,7 @@ define <16 x i8> @trunc_or_v16i32_v16i8(<16 x i32> %a0, <16 x i32> %a1) nounwind
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpord %zmm1, %zmm0, %zmm0
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = or <16 x i32> %a0, %a1
   %2 = trunc <16 x i32> %1 to <16 x i8>
@@ -4692,6 +4805,7 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind
 ; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_or_v16i16_v16i8:
@@ -4699,6 +4813,7 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind
 ; AVX512BW-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_or_v16i16_v16i8:
@@ -4706,6 +4821,7 @@ define <16 x i8> @trunc_or_v16i16_v16i8(<16 x i16> %a0, <16 x i16> %a1) nounwind
 ; AVX512DQ-NEXT:    vpor %ymm1, %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = or <16 x i16> %a0, %a1
   %2 = trunc <16 x i16> %1 to <16 x i8>
@@ -4744,6 +4860,7 @@ define <4 x i32> @trunc_or_const_v4i64_v4i32(<4 x i64> %a0) nounwind {
 ; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512-NEXT:    vpmovqd %zmm0, %ymm0
 ; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = or <4 x i64> %a0, <i64 0, i64 1, i64 2, i64 3>
   %2 = trunc <4 x i64> %1 to <4 x i32>
@@ -4791,7 +4908,7 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
@@ -4801,6 +4918,7 @@ define <8 x i16> @trunc_or_const_v8i64_v8i16(<8 x i64> %a0) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
 ; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = or <8 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7>
   %2 = trunc <8 x i64> %1 to <8 x i16>
@@ -4831,7 +4949,7 @@ define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
 ;
 ; AVX2-LABEL: trunc_or_const_v8i32_v8i16:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
@@ -4842,6 +4960,7 @@ define <8 x i16> @trunc_or_const_v8i32_v8i16(<8 x i32> %a0) nounwind {
 ; AVX512-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = or <8 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
   %2 = trunc <8 x i32> %1 to <8 x i16>
@@ -4873,22 +4992,22 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX1-LABEL: trunc_or_const_v16i64_v16i8:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpackuswb %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm4, %xmm4
-; AVX1-NEXT:    vandps %xmm5, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vpand %xmm5, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackuswb %xmm4, %xmm2, %xmm2
 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm1, %xmm1
+; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm5, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm3, %xmm3
-; AVX1-NEXT:    vandps %xmm5, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm5, %xmm3, %xmm3
+; AVX1-NEXT:    vpand %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
@@ -4903,7 +5022,7 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm3 = ymm3[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm3 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm3, %ymm2, %ymm2
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm4 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -4928,6 +5047,7 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_or_const_v16i64_v16i8:
@@ -4937,6 +5057,7 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX512BW-NEXT:    vinserti64x4 $1, %ymm1, %zmm0, %zmm0
 ; AVX512BW-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512BW-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_or_const_v16i64_v16i8:
@@ -4946,6 +5067,7 @@ define <16 x i8> @trunc_or_const_v16i64_v16i8(<16 x i64> %a0) nounwind {
 ; AVX512DQ-NEXT:    vinserti32x8 $1, %ymm1, %zmm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = or <16 x i64> %a0, <i64 0, i64 1, i64 2, i64 3, i64 4, i64 5, i64 6, i64 7, i64 8, i64 9, i64 10, i64 11, i64 12, i64 13, i64 14, i64 15>
   %2 = trunc <16 x i64> %1 to <16 x i8>
@@ -4969,13 +5091,13 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX1-LABEL: trunc_or_const_v16i32_v16i8:
 ; AVX1:       # BB#0:
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
@@ -4984,7 +5106,7 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ;
 ; AVX2-LABEL: trunc_or_const_v16i32_v16i8:
 ; AVX2:       # BB#0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -5001,6 +5123,7 @@ define <16 x i8> @trunc_or_const_v16i32_v16i8(<16 x i32> %a0) nounwind {
 ; AVX512:       # BB#0:
 ; AVX512-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
   %1 = or <16 x i32> %a0, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
   %2 = trunc <16 x i32> %1 to <16 x i8>
@@ -5044,6 +5167,7 @@ define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc_or_const_v16i16_v16i8:
@@ -5051,6 +5175,7 @@ define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512DQ-LABEL: trunc_or_const_v16i16_v16i8:
@@ -5058,6 +5183,7 @@ define <16 x i8> @trunc_or_const_v16i16_v16i8(<16 x i16> %a0) nounwind {
 ; AVX512DQ-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512DQ-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512DQ-NEXT:    vpor {{.*}}(%rip), %xmm0, %xmm0
+; AVX512DQ-NEXT:    vzeroupper
 ; AVX512DQ-NEXT:    retq
   %1 = or <16 x i16> %a0, <i16 0, i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8, i16 9, i16 10, i16 11, i16 12, i16 13, i16 14, i16 15>
   %2 = trunc <16 x i16> %1 to <16 x i8>
diff --git a/test/CodeGen/X86/vector-trunc.ll b/test/CodeGen/X86/vector-trunc.ll
index 2571a21ce21802f074ec34b484e0c036d87ccfd3..d39a90b066f5ecfa49b8dfb8d3a3ff864b1590b0 100644
--- a/test/CodeGen/X86/vector-trunc.ll
+++ b/test/CodeGen/X86/vector-trunc.ll
@@ -111,7 +111,7 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} ymm1 = ymm1[0,2,2,3,4,6,6,7]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -120,6 +120,7 @@ define <8 x i16> @trunc8i64_8i16(<8 x i64> %a) {
 ; AVX512-LABEL: trunc8i64_8i16:
 ; AVX512:       # BB#0: # %entry
 ; AVX512-NEXT:    vpmovqw %zmm0, %xmm0
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 entry:
   %0 = trunc <8 x i64> %a to <8 x i16>
@@ -144,13 +145,13 @@ define void @trunc8i64_8i8(<8 x i64> %a) {
 ; AVX1-LABEL: trunc8i64_8i8:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
-; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,0,0,0,0,255,0,0,0,0,0,0,0]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
@@ -175,6 +176,7 @@ define void @trunc8i64_8i8(<8 x i64> %a) {
 ; AVX512-LABEL: trunc8i64_8i8:
 ; AVX512:       # BB#0: # %entry
 ; AVX512-NEXT:    vpmovqb %zmm0, (%rax)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 entry:
   %0 = trunc <8 x i64> %a to <8 x i8>
@@ -220,7 +222,7 @@ define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
 ;
 ; AVX2-LABEL: trunc8i32_8i16:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
 ; AVX2-NEXT:    vzeroupper
@@ -231,11 +233,13 @@ define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
 ; AVX512F-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512F-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc8i32_8i16:
 ; AVX512VL:       # BB#0: # %entry
 ; AVX512VL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc8i32_8i16:
@@ -243,11 +247,13 @@ define <8 x i16> @trunc8i32_8i16(<8 x i32> %a) {
 ; AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512BW-NEXT:    # kill: %XMM0<def> %XMM0<kill> %YMM0<kill>
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc8i32_8i16:
 ; AVX512BWVL:       # BB#0: # %entry
 ; AVX512BWVL-NEXT:    vpmovdw %ymm0, %xmm0
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
 entry:
   %0 = trunc <8 x i32> %a to <8 x i16>
@@ -296,7 +302,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
 ;
 ; AVX2-LABEL: trunc8i32_8i8:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13],zero,zero,zero,zero,zero,zero,zero,zero,ymm0[16,17,20,21,24,25,28,29],zero,zero,zero,zero,zero,zero,zero,zero
+; AVX2-NEXT:    vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; AVX2-NEXT:    vmovq %xmm0, (%rax)
@@ -309,11 +315,13 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
 ; AVX512F-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512F-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; AVX512F-NEXT:    vmovq %xmm0, (%rax)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc8i32_8i8:
 ; AVX512VL:       # BB#0: # %entry
 ; AVX512VL-NEXT:    vpmovdb %ymm0, (%rax)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc8i32_8i8:
@@ -322,11 +330,13 @@ define void @trunc8i32_8i8(<8 x i32> %a) {
 ; AVX512BW-NEXT:    vpmovdw %zmm0, %ymm0
 ; AVX512BW-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT:    vmovq %xmm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc8i32_8i8:
 ; AVX512BWVL:       # BB#0: # %entry
 ; AVX512BWVL-NEXT:    vpmovdb %ymm0, (%rax)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
 entry:
   %0 = trunc <8 x i32> %a to <8 x i8>
@@ -398,7 +408,7 @@ define void @trunc16i32_16i16(<16 x i32> %a) {
 ;
 ; AVX2-LABEL: trunc16i32_16i16:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
@@ -411,6 +421,7 @@ define void @trunc16i32_16i16(<16 x i32> %a) {
 ; AVX512-LABEL: trunc16i32_16i16:
 ; AVX512:       # BB#0: # %entry
 ; AVX512-NEXT:    vpmovdw %zmm0, (%rax)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 entry:
   %0 = trunc <16 x i32> %a to <16 x i16>
@@ -435,13 +446,13 @@ define void @trunc16i32_16i8(<16 x i32> %a) {
 ; AVX1-LABEL: trunc16i32_16i8:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
-; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm1, %xmm1
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0]
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vandps %xmm3, %xmm0, %xmm0
+; AVX1-NEXT:    vpand %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vpand %xmm3, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT:    vpackuswb %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT:    vmovdqu %xmm0, (%rax)
@@ -450,7 +461,7 @@ define void @trunc16i32_16i8(<16 x i32> %a) {
 ;
 ; AVX2-LABEL: trunc16i32_16i8:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128,0,1,4,5,8,9,12,13,128,128,128,128,128,128,128,128]
+; AVX2-NEXT:    vmovdqa {{.*#+}} ymm2 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31]
 ; AVX2-NEXT:    vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
 ; AVX2-NEXT:    vmovdqa {{.*#+}} xmm3 = <0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u>
@@ -466,6 +477,7 @@ define void @trunc16i32_16i8(<16 x i32> %a) {
 ; AVX512-LABEL: trunc16i32_16i8:
 ; AVX512:       # BB#0: # %entry
 ; AVX512-NEXT:    vpmovdb %zmm0, (%rax)
+; AVX512-NEXT:    vzeroupper
 ; AVX512-NEXT:    retq
 entry:
   %0 = trunc <16 x i32> %a to <16 x i8>
@@ -529,6 +541,7 @@ define void @trunc16i16_16i8(<16 x i16> %a) {
 ; AVX512F-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512F-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512F-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc16i16_16i8:
@@ -536,6 +549,7 @@ define void @trunc16i16_16i8(<16 x i16> %a) {
 ; AVX512VL-NEXT:    vpmovsxwd %ymm0, %zmm0
 ; AVX512VL-NEXT:    vpmovdb %zmm0, %xmm0
 ; AVX512VL-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc16i16_16i8:
@@ -543,11 +557,13 @@ define void @trunc16i16_16i8(<16 x i16> %a) {
 ; AVX512BW-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
 ; AVX512BW-NEXT:    vpmovwb %zmm0, %ymm0
 ; AVX512BW-NEXT:    vmovdqu %xmm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc16i16_16i8:
 ; AVX512BWVL:       # BB#0: # %entry
 ; AVX512BWVL-NEXT:    vpmovwb %ymm0, (%rax)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
 entry:
   %0 = trunc <16 x i16> %a to <16 x i8>
@@ -635,6 +651,7 @@ define void @trunc32i16_32i8(<32 x i16> %a) {
 ; AVX512F-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512F-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc32i16_32i8:
@@ -645,16 +662,19 @@ define void @trunc32i16_32i8(<32 x i16> %a) {
 ; AVX512VL-NEXT:    vpmovdb %zmm1, %xmm1
 ; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512VL-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc32i16_32i8:
 ; AVX512BW:       # BB#0: # %entry
 ; AVX512BW-NEXT:    vpmovwb %zmm0, (%rax)
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc32i16_32i8:
 ; AVX512BWVL:       # BB#0: # %entry
 ; AVX512BWVL-NEXT:    vpmovwb %zmm0, (%rax)
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
 entry:
   %0 = trunc <32 x i16> %a to <32 x i8>
@@ -810,6 +830,7 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512F-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512F-NEXT:    vzeroupper
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: trunc2x4i64_8i16:
@@ -823,6 +844,7 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
 ; AVX512VL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512BW-LABEL: trunc2x4i64_8i16:
@@ -835,6 +857,7 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm1, %xmm1
 ; AVX512BW-NEXT:    vpshufb %xmm2, %xmm0, %xmm0
 ; AVX512BW-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BW-NEXT:    vzeroupper
 ; AVX512BW-NEXT:    retq
 ;
 ; AVX512BWVL-LABEL: trunc2x4i64_8i16:
@@ -848,6 +871,7 @@ define <8 x i16> @trunc2x4i64_8i16(<4 x i64> %a, <4 x i64> %b) {
 ; AVX512BWVL-NEXT:    vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7]
 ; AVX512BWVL-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; AVX512BWVL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX512BWVL-NEXT:    vzeroupper
 ; AVX512BWVL-NEXT:    retq
 entry:
   %0 = trunc <4 x i64> %a to <4 x i16>
diff --git a/test/CodeGen/X86/vector-tzcnt-128.ll b/test/CodeGen/X86/vector-tzcnt-128.ll
index bf32e672138ce0e9387b4cf6af13aa725a096212..56f634c4188fdd64058144d2caa58385a323bb1c 100644
--- a/test/CodeGen/X86/vector-tzcnt-128.ll
+++ b/test/CodeGen/X86/vector-tzcnt-128.ll
@@ -281,6 +281,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind {
 ; AVX512CD-NEXT:    vplzcntq %zmm0, %zmm0
 ; AVX512CD-NEXT:    vmovdqa {{.*#+}} xmm1 = [63,63]
 ; AVX512CD-NEXT:    vpsubq %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT:    vzeroupper
 ; AVX512CD-NEXT:    retq
 ;
 ; X32-SSE-LABEL: testv2i64u:
@@ -696,6 +697,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind {
 ; AVX512CD-NEXT:    vplzcntd %zmm0, %zmm0
 ; AVX512CD-NEXT:    vpbroadcastd {{.*}}(%rip), %xmm1
 ; AVX512CD-NEXT:    vpsubd %xmm0, %xmm1, %xmm0
+; AVX512CD-NEXT:    vzeroupper
 ; AVX512CD-NEXT:    retq
 ;
 ; X32-SSE-LABEL: testv4i32u:
@@ -1258,23 +1260,8 @@ define <2 x i64> @foldv2i64() nounwind {
 ;
 ; X32-SSE-LABEL: foldv2i64:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [256,0,4294967295,4294967295]
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    psubq %xmm0, %xmm2
-; X32-SSE-NEXT:    pand %xmm0, %xmm2
-; X32-SSE-NEXT:    psubq {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
-; X32-SSE-NEXT:    pand %xmm3, %xmm4
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
-; X32-SSE-NEXT:    pshufb %xmm4, %xmm5
-; X32-SSE-NEXT:    psrlw $4, %xmm2
-; X32-SSE-NEXT:    pand %xmm3, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
-; X32-SSE-NEXT:    paddb %xmm5, %xmm0
-; X32-SSE-NEXT:    psadbw %xmm1, %xmm0
+; X32-SSE-NEXT:    movl $8, %eax
+; X32-SSE-NEXT:    movd %eax, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0)
   ret <2 x i64> %out
@@ -1295,23 +1282,8 @@ define <2 x i64> @foldv2i64u() nounwind {
 ;
 ; X32-SSE-LABEL: foldv2i64u:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [256,0,4294967295,4294967295]
-; X32-SSE-NEXT:    pxor %xmm1, %xmm1
-; X32-SSE-NEXT:    pxor %xmm2, %xmm2
-; X32-SSE-NEXT:    psubq %xmm0, %xmm2
-; X32-SSE-NEXT:    pand %xmm0, %xmm2
-; X32-SSE-NEXT:    psubq {{\.LCPI.*}}, %xmm2
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-SSE-NEXT:    movdqa %xmm2, %xmm4
-; X32-SSE-NEXT:    pand %xmm3, %xmm4
-; X32-SSE-NEXT:    movdqa {{.*#+}} xmm0 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-SSE-NEXT:    movdqa %xmm0, %xmm5
-; X32-SSE-NEXT:    pshufb %xmm4, %xmm5
-; X32-SSE-NEXT:    psrlw $4, %xmm2
-; X32-SSE-NEXT:    pand %xmm3, %xmm2
-; X32-SSE-NEXT:    pshufb %xmm2, %xmm0
-; X32-SSE-NEXT:    paddb %xmm5, %xmm0
-; X32-SSE-NEXT:    psadbw %xmm1, %xmm0
+; X32-SSE-NEXT:    movl $8, %eax
+; X32-SSE-NEXT:    movd %eax, %xmm0
 ; X32-SSE-NEXT:    retl
   %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1)
   ret <2 x i64> %out
diff --git a/test/CodeGen/X86/vector-tzcnt-256.ll b/test/CodeGen/X86/vector-tzcnt-256.ll
index 0ced0c5b263f757f882745f56bd5b432e62b8725..a0b277ddd7327929cadac6aecea4aff3dd74d9a4 100644
--- a/test/CodeGen/X86/vector-tzcnt-256.ll
+++ b/test/CodeGen/X86/vector-tzcnt-256.ll
@@ -871,20 +871,7 @@ define <4 x i64> @foldv4i64() nounwind {
 ;
 ; X32-AVX-LABEL: foldv4i64:
 ; X32-AVX:       # BB#0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm0 = [256,0,4294967295,4294967295,0,0,255,0]
-; X32-AVX-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpsubq {{\.LCPI.*}}, %ymm0, %ymm0
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-AVX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; X32-AVX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0]
 ; X32-AVX-NEXT:    retl
   %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 0)
   ret <4 x i64> %out
@@ -898,20 +885,7 @@ define <4 x i64> @foldv4i64u() nounwind {
 ;
 ; X32-AVX-LABEL: foldv4i64u:
 ; X32-AVX:       # BB#0:
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm0 = [256,0,4294967295,4294967295,0,0,255,0]
-; X32-AVX-NEXT:    vpxor %ymm1, %ymm1, %ymm1
-; X32-AVX-NEXT:    vpsubq %ymm0, %ymm1, %ymm2
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpsubq {{\.LCPI.*}}, %ymm0, %ymm0
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm3
-; X32-AVX-NEXT:    vmovdqa {{.*#+}} ymm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; X32-AVX-NEXT:    vpshufb %ymm3, %ymm4, %ymm3
-; X32-AVX-NEXT:    vpsrlw $4, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpand %ymm2, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpshufb %ymm0, %ymm4, %ymm0
-; X32-AVX-NEXT:    vpaddb %ymm3, %ymm0, %ymm0
-; X32-AVX-NEXT:    vpsadbw %ymm1, %ymm0, %ymm0
+; X32-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [8,0,0,0,64,0,0,0]
 ; X32-AVX-NEXT:    retl
   %out = call <4 x i64> @llvm.cttz.v4i64(<4 x i64> <i64 256, i64 -1, i64 0, i64 255>, i1 -1)
   ret <4 x i64> %out
diff --git a/test/CodeGen/X86/vector-zext.ll b/test/CodeGen/X86/vector-zext.ll
index e1c3d8f7d083e1b11e3849f0c1c7583bd8ef3910..fe3523de357558e40417c4248d3ebc4b5f6de590 100644
--- a/test/CodeGen/X86/vector-zext.ll
+++ b/test/CodeGen/X86/vector-zext.ll
@@ -55,18 +55,18 @@ define <16 x i16> @zext_16i8_to_16i16(<16 x i8> %A) {
 ;
 ; SSE41-LABEL: zext_16i8_to_16i16:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: zext_16i8_to_16i16:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_16i8_to_16i16:
@@ -110,25 +110,27 @@ define <32 x i16> @zext_32i8_to_32i16(<32 x i8> %A) {
 ;
 ; SSE41-LABEL: zext_32i8_to_32i16:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    movdqa %xmm1, %xmm3
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pxor %xmm4, %xmm4
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
-; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero,xmm3[4],zero,xmm3[5],zero,xmm3[6],zero,xmm3[7],zero
-; SSE41-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: zext_32i8_to_32i16:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
-; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero,xmm2[4],zero,xmm2[5],zero,xmm2[6],zero,xmm2[7],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpunpckhbw {{.*#+}} xmm1 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovaps %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -242,14 +244,14 @@ entry:
 define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: zext_16i8_to_16i32:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
 ; SSE2-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
 ; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
@@ -257,14 +259,14 @@ define <16 x i32> @zext_16i8_to_16i32(<16 x i8> %A) nounwind uwtable readnone ss
 ;
 ; SSSE3-LABEL: zext_16i8_to_16i32:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
 ; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm1
 ; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3]
 ; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm2
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
 ; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
@@ -515,18 +517,18 @@ define <8 x i32> @zext_8i16_to_8i32(<8 x i16> %A) nounwind uwtable readnone ssp
 ;
 ; SSE41-LABEL: zext_8i16_to_8i32:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: zext_8i16_to_8i32:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_8i16_to_8i32:
@@ -570,25 +572,27 @@ define <16 x i32> @zext_16i16_to_16i32(<16 x i16> %A) nounwind uwtable readnone
 ;
 ; SSE41-LABEL: zext_16i16_to_16i32:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    movdqa %xmm1, %xmm3
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pxor %xmm4, %xmm4
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero,xmm3[2],zero,xmm3[3],zero
-; SSE41-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: zext_16i16_to_16i32:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm2 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero,xmm2[2],zero,xmm2[3],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovaps %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -694,14 +698,14 @@ entry:
 define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp {
 ; SSE2-LABEL: zext_8i16_to_8i64:
 ; SSE2:       # BB#0: # %entry
-; SSE2-NEXT:    movdqa %xmm0, %xmm1
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
 ; SSE2-NEXT:    pxor %xmm4, %xmm4
-; SSE2-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSE2-NEXT:    movdqa %xmm3, %xmm1
 ; SSE2-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
 ; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
 ; SSE2-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE2-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
 ; SSE2-NEXT:    movdqa %xmm3, %xmm2
 ; SSE2-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
 ; SSE2-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
@@ -709,14 +713,14 @@ define <8 x i64> @zext_8i16_to_8i64(<8 x i16> %A) nounwind uwtable readnone ssp
 ;
 ; SSSE3-LABEL: zext_8i16_to_8i64:
 ; SSSE3:       # BB#0: # %entry
-; SSSE3-NEXT:    movdqa %xmm0, %xmm1
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
 ; SSSE3-NEXT:    pxor %xmm4, %xmm4
-; SSSE3-NEXT:    pshufd {{.*#+}} xmm3 = xmm1[2,3,0,1]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm1
 ; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3]
 ; SSSE3-NEXT:    movdqa %xmm1, %xmm0
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
 ; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7]
 ; SSSE3-NEXT:    movdqa %xmm3, %xmm2
 ; SSSE3-NEXT:    punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1]
 ; SSSE3-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
@@ -812,18 +816,18 @@ define <4 x i64> @zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone ssp
 ;
 ; SSE41-LABEL: zext_4i32_to_4i64:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm2[2],xmm1[3],xmm2[3]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: zext_4i32_to_4i64:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_4i32_to_4i64:
@@ -867,25 +871,27 @@ define <8 x i64> @zext_8i32_to_8i64(<8 x i32> %A) nounwind uwtable readnone ssp
 ;
 ; SSE41-LABEL: zext_8i32_to_8i64:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    movdqa %xmm1, %xmm3
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pxor %xmm4, %xmm4
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm4[2],xmm1[3],xmm4[3]
-; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm3[0],zero,xmm3[1],zero
-; SSE41-NEXT:    punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
+; SSE41-NEXT:    movdqa %xmm5, %xmm0
+; SSE41-NEXT:    movdqa %xmm4, %xmm1
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: zext_8i32_to_8i64:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovaps %ymm2, %ymm0
 ; AVX1-NEXT:    retq
 ;
@@ -1523,20 +1529,20 @@ define <8 x i32> @zext_8i8_to_8i32(<8 x i8> %z) {
 ;
 ; SSE41-LABEL: zext_8i8_to_8i32:
 ; SSE41:       # BB#0: # %entry
-; SSE41-NEXT:    movdqa %xmm0, %xmm1
-; SSE41-NEXT:    pand {{.*}}(%rip), %xmm1
-; SSE41-NEXT:    pxor %xmm2, %xmm2
-; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
-; SSE41-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE41-NEXT:    pand {{.*}}(%rip), %xmm0
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; SSE41-NEXT:    movdqa %xmm2, %xmm0
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: zext_8i8_to_8i32:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vpand {{.*}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vpunpckhwd {{.*#+}} xmm1 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
 ; AVX1-NEXT:    vpmovzxwd {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: zext_8i8_to_8i32:
@@ -1630,11 +1636,10 @@ define <4 x i64> @shuf_zext_4i32_to_4i64(<4 x i32> %A) nounwind uwtable readnone
 ;
 ; AVX1-LABEL: shuf_zext_4i32_to_4i64:
 ; AVX1:       # BB#0: # %entry
-; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT:    vxorpd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vblendpd {{.*#+}} xmm0 = xmm2[0],xmm0[1]
-; AVX1-NEXT:    vpermilps {{.*#+}} xmm0 = xmm0[2,0,3,0]
-; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
+; AVX1-NEXT:    vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuf_zext_4i32_to_4i64:
@@ -2091,3 +2096,146 @@ entry:
   %Z = bitcast <8 x i32> %B to <4 x i64>
   ret <4 x i64> %Z
 }
+
+define <32 x i32> @zext_32i8_to_32i32(<32 x i8> %x) {
+; SSE2-LABEL: zext_32i8_to_32i32:
+; SSE2:       # BB#0:
+; SSE2-NEXT:    pxor %xmm2, %xmm2
+; SSE2-NEXT:    movdqa %xmm0, %xmm3
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT:    movdqa %xmm3, %xmm8
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSE2-NEXT:    movdqa %xmm0, %xmm5
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSE2-NEXT:    movdqa %xmm1, %xmm6
+; SSE2-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSE2-NEXT:    movdqa %xmm6, %xmm7
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSE2-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSE2-NEXT:    movdqa %xmm1, %xmm4
+; SSE2-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSE2-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSE2-NEXT:    movdqa %xmm1, 112(%rdi)
+; SSE2-NEXT:    movdqa %xmm4, 96(%rdi)
+; SSE2-NEXT:    movdqa %xmm6, 80(%rdi)
+; SSE2-NEXT:    movdqa %xmm7, 64(%rdi)
+; SSE2-NEXT:    movdqa %xmm0, 48(%rdi)
+; SSE2-NEXT:    movdqa %xmm5, 32(%rdi)
+; SSE2-NEXT:    movdqa %xmm3, 16(%rdi)
+; SSE2-NEXT:    movdqa %xmm8, (%rdi)
+; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    retq
+;
+; SSSE3-LABEL: zext_32i8_to_32i32:
+; SSSE3:       # BB#0:
+; SSSE3-NEXT:    pxor %xmm2, %xmm2
+; SSSE3-NEXT:    movdqa %xmm0, %xmm3
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSSE3-NEXT:    movdqa %xmm3, %xmm8
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
+; SSSE3-NEXT:    movdqa %xmm0, %xmm5
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm6
+; SSSE3-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3],xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSSE3-NEXT:    movdqa %xmm6, %xmm7
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7]
+; SSSE3-NEXT:    punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
+; SSSE3-NEXT:    movdqa %xmm1, %xmm4
+; SSSE3-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
+; SSSE3-NEXT:    punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
+; SSSE3-NEXT:    movdqa %xmm1, 112(%rdi)
+; SSSE3-NEXT:    movdqa %xmm4, 96(%rdi)
+; SSSE3-NEXT:    movdqa %xmm6, 80(%rdi)
+; SSSE3-NEXT:    movdqa %xmm7, 64(%rdi)
+; SSSE3-NEXT:    movdqa %xmm0, 48(%rdi)
+; SSSE3-NEXT:    movdqa %xmm5, 32(%rdi)
+; SSSE3-NEXT:    movdqa %xmm3, 16(%rdi)
+; SSSE3-NEXT:    movdqa %xmm8, (%rdi)
+; SSSE3-NEXT:    movq %rdi, %rax
+; SSSE3-NEXT:    retq
+;
+; SSE41-LABEL: zext_32i8_to_32i32:
+; SSE41:       # BB#0:
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm4 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero,xmm4[2],zero,zero,zero,xmm4[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm5 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm6 = xmm1[1,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm6 = xmm6[0],zero,zero,zero,xmm6[1],zero,zero,zero,xmm6[2],zero,zero,zero,xmm6[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm7 = xmm1[2,3,0,1]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm7 = xmm7[0],zero,zero,zero,xmm7[1],zero,zero,zero,xmm7[2],zero,zero,zero,xmm7[3],zero,zero,zero
+; SSE41-NEXT:    pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3]
+; SSE41-NEXT:    pmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; SSE41-NEXT:    movdqa %xmm1, 112(%rdi)
+; SSE41-NEXT:    movdqa %xmm7, 96(%rdi)
+; SSE41-NEXT:    movdqa %xmm6, 80(%rdi)
+; SSE41-NEXT:    movdqa %xmm5, 64(%rdi)
+; SSE41-NEXT:    movdqa %xmm0, 48(%rdi)
+; SSE41-NEXT:    movdqa %xmm4, 32(%rdi)
+; SSE41-NEXT:    movdqa %xmm3, 16(%rdi)
+; SSE41-NEXT:    movdqa %xmm2, (%rdi)
+; SSE41-NEXT:    movq %rdi, %rax
+; SSE41-NEXT:    retq
+;
+; AVX1-LABEL: zext_32i8_to_32i32:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm0[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm4
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm3[1,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm2
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm1
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm0 = xmm3[2,3,0,1]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[3,1,2,3]
+; AVX1-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm0, %ymm3
+; AVX1-NEXT:    vmovaps %ymm4, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: zext_32i8_to_32i32:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm2 = xmm1[2,3,0,1]
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero,xmm2[2],zero,zero,zero,xmm2[3],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm3 = xmm1[3,1,2,3]
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero
+; AVX2-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX2-NEXT:    vpand {{.*}}(%rip), %ymm2, %ymm3
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm4 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm2 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1]
+; AVX2-NEXT:    vpmovzxbd {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero
+; AVX2-NEXT:    vmovdqa %ymm4, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: zext_32i8_to_32i32:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT:    vextracti128 $1, %ymm0, %xmm0
+; AVX512-NEXT:    vpmovzxbd {{.*#+}} zmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero
+; AVX512-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512-NEXT:    retq
+  %res = zext <32 x i8>%x to <32 x i32>
+  ret <32 x i32> %res
+}
diff --git a/test/CodeGen/X86/vectorcall.ll b/test/CodeGen/X86/vectorcall.ll
index 6ba5e10dd21ebce2c68ae718edcc24d2592aebc4..598a339ee2f7e85777444c4e5892633a0b613556 100644
--- a/test/CodeGen/X86/vectorcall.ll
+++ b/test/CodeGen/X86/vectorcall.ll
@@ -103,7 +103,7 @@ entry:
 }
 ; CHECK-LABEL: test_mixed_1
 ; CHECK:       movaps	%xmm1, 16(%{{(e|r)}}sp)
-; CHECK:       movaps	16(%{{(e|r)}}sp), %xmm0
+; CHECK:       movaps	%xmm1, %xmm0
 ; CHECK:       ret{{q|l}}
 
 define x86_vectorcallcc <4 x float> @test_mixed_2(%struct.HVA4 inreg %a, %struct.HVA4* %b, <4 x float> %c) {
@@ -149,7 +149,7 @@ entry:
 }
 ; CHECK-LABEL: test_mixed_5
 ; CHECK:       movaps	%xmm5, 16(%{{(e|r)}}sp)
-; CHECK:       movaps	16(%{{(e|r)}}sp), %xmm0
+; CHECK:       movaps	%xmm5, %xmm0
 ; CHECK:       ret{{[ql]}}
 
 define x86_vectorcallcc %struct.HVA4 @test_mixed_6(%struct.HVA4 inreg %a, %struct.HVA4* %b) {
diff --git a/test/CodeGen/X86/viabs.ll b/test/CodeGen/X86/viabs.ll
index 61fb66bbcbbddd0b38fc5cfd589248aafb2f623d..34a9df1782a49c6922e8d41b03bc3970c5264d52 100644
--- a/test/CodeGen/X86/viabs.ll
+++ b/test/CodeGen/X86/viabs.ll
@@ -147,14 +147,10 @@ define <8 x i32> @test_abs_gt_v8i32(<8 x i32> %a) nounwind {
 ;
 ; AVX1-LABEL: test_abs_gt_v8i32:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpabsd %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpabsd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_abs_gt_v8i32:
@@ -193,14 +189,10 @@ define <8 x i32> @test_abs_ge_v8i32(<8 x i32> %a) nounwind {
 ;
 ; AVX1-LABEL: test_abs_ge_v8i32:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpabsd %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpabsd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_abs_ge_v8i32:
@@ -239,14 +231,10 @@ define <16 x i16> @test_abs_gt_v16i16(<16 x i16> %a) nounwind {
 ;
 ; AVX1-LABEL: test_abs_gt_v16i16:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpsraw $15, %xmm1, %xmm2
-; AVX1-NEXT:    vpaddw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm3
-; AVX1-NEXT:    vpaddw %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpabsw %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpabsw %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_abs_gt_v16i16:
@@ -285,15 +273,10 @@ define <32 x i8> @test_abs_lt_v32i8(<32 x i8> %a) nounwind {
 ;
 ; AVX1-LABEL: test_abs_lt_v32i8:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm2, %xmm3
-; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm2, %xmm2
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm4
-; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vpaddb %xmm2, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vxorps %ymm4, %ymm0, %ymm0
+; AVX1-NEXT:    vpabsb %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpabsb %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_abs_lt_v32i8:
@@ -332,14 +315,10 @@ define <8 x i32> @test_abs_le_v8i32(<8 x i32> %a) nounwind {
 ;
 ; AVX1-LABEL: test_abs_le_v8i32:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT:    vpaddd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm1
-; AVX1-NEXT:    vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    vpabsd %xmm0, %xmm1
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpabsd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_abs_le_v8i32:
@@ -388,22 +367,14 @@ define <16 x i32> @test_abs_le_16i32(<16 x i32> %a) nounwind {
 ;
 ; AVX1-LABEL: test_abs_le_16i32:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpsrad $31, %xmm0, %xmm4
-; AVX1-NEXT:    vpaddd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm3
-; AVX1-NEXT:    vpaddd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpsrad $31, %xmm1, %xmm4
-; AVX1-NEXT:    vpaddd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT:    vxorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT:    vpabsd %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpabsd %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vpabsd %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpabsd %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_abs_le_16i32:
@@ -450,9 +421,7 @@ define <2 x i64> @test_abs_ge_v2i64(<2 x i64> %a) nounwind {
 ;
 ; AVX512-LABEL: test_abs_ge_v2i64:
 ; AVX512:       # BB#0:
-; AVX512-NEXT:    vpsraq $63, %xmm0, %xmm1
-; AVX512-NEXT:    vpaddq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm0
+; AVX512-NEXT:    vpabsq %xmm0, %xmm0
 ; AVX512-NEXT:    retq
   %tmp1neg = sub <2 x i64> zeroinitializer, %a
   %b = icmp sge <2 x i64> %a, zeroinitializer
@@ -499,9 +468,7 @@ define <4 x i64> @test_abs_gt_v4i64(<4 x i64> %a) nounwind {
 ;
 ; AVX512-LABEL: test_abs_gt_v4i64:
 ; AVX512:       # BB#0:
-; AVX512-NEXT:    vpsraq $63, %ymm0, %ymm1
-; AVX512-NEXT:    vpaddq %ymm1, %ymm0, %ymm0
-; AVX512-NEXT:    vpxor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    vpabsq %ymm0, %ymm0
 ; AVX512-NEXT:    retq
   %tmp1neg = sub <4 x i64> zeroinitializer, %a
   %b = icmp sgt <4 x i64> %a, <i64 -1, i64 -1, i64 -1, i64 -1>
@@ -611,8 +578,8 @@ define <8 x i64> @test_abs_le_v8i64_fold(<8 x i64>* %a.ptr) nounwind {
 ;
 ; AVX1-LABEL: test_abs_le_v8i64_fold:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovups (%rdi), %ymm0
-; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
+; AVX1-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX1-NEXT:    vmovdqu 32(%rdi), %ymm1
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT:    vpsrad $31, %xmm2, %xmm3
 ; AVX1-NEXT:    vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
@@ -691,23 +658,14 @@ define <64 x i8> @test_abs_lt_v64i8(<64 x i8> %a) nounwind {
 ;
 ; AVX1-LABEL: test_abs_lt_v64i8:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm3, %xmm4
-; AVX1-NEXT:    vpcmpgtb %xmm0, %xmm3, %xmm5
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm5, %ymm6
-; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vxorps %ymm6, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm3, %xmm4
-; AVX1-NEXT:    vpcmpgtb %xmm1, %xmm3, %xmm3
-; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm3, %ymm5
-; AVX1-NEXT:    vpaddb %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vpaddb %xmm3, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT:    vxorps %ymm5, %ymm1, %ymm1
+; AVX1-NEXT:    vpabsb %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpabsb %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vpabsb %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpabsb %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_abs_lt_v64i8:
@@ -763,22 +721,14 @@ define <32 x i16> @test_abs_gt_v32i16(<32 x i16> %a) nounwind {
 ;
 ; AVX1-LABEL: test_abs_gt_v32i16:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vpsraw $15, %xmm2, %xmm3
-; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpsraw $15, %xmm0, %xmm4
-; AVX1-NEXT:    vpaddw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT:    vxorps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vpsraw $15, %xmm2, %xmm3
-; AVX1-NEXT:    vpaddw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT:    vpsraw $15, %xmm1, %xmm4
-; AVX1-NEXT:    vpaddw %xmm4, %xmm1, %xmm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT:    vxorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT:    vpabsw %xmm0, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm0
+; AVX1-NEXT:    vpabsw %xmm0, %xmm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX1-NEXT:    vpabsw %xmm1, %xmm2
+; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm1
+; AVX1-NEXT:    vpabsw %xmm1, %xmm1
+; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: test_abs_gt_v32i16:
diff --git a/test/CodeGen/X86/vselect-minmax.ll b/test/CodeGen/X86/vselect-minmax.ll
index 8e9f1d9809139df815de90a7404336cc128d02c2..5524eaf397c97a21426caacda5899c578638e686 100644
--- a/test/CodeGen/X86/vselect-minmax.ll
+++ b/test/CodeGen/X86/vselect-minmax.ll
@@ -4839,13 +4839,13 @@ define <8 x i64> @test121(<8 x i64> %a, <8 x i64> %b) {
 ; SSE4-NEXT:    pcmpgtq %xmm1, %xmm11
 ; SSE4-NEXT:    movdqa %xmm4, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm8, %xmm0
-; SSE4-NEXT:    blendvpd %xmm8, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
 ; SSE4-NEXT:    movdqa %xmm11, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm5
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
 ; SSE4-NEXT:    movdqa %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm6
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
 ; SSE4-NEXT:    movdqa %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm3, %xmm7
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    movapd %xmm5, %xmm1
 ; SSE4-NEXT:    movapd %xmm6, %xmm2
@@ -4983,13 +4983,13 @@ define <8 x i64> @test122(<8 x i64> %a, <8 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm12, %xmm11
 ; SSE4-NEXT:    pcmpgtq %xmm4, %xmm0
 ; SSE4-NEXT:    pxor %xmm12, %xmm0
-; SSE4-NEXT:    blendvpd %xmm8, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
 ; SSE4-NEXT:    movdqa %xmm11, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm5
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
 ; SSE4-NEXT:    movdqa %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm6
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
 ; SSE4-NEXT:    movdqa %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm3, %xmm7
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    movapd %xmm5, %xmm1
 ; SSE4-NEXT:    movapd %xmm6, %xmm2
@@ -5113,13 +5113,13 @@ define <8 x i64> @test123(<8 x i64> %a, <8 x i64> %b) {
 ; SSE4-NEXT:    movdqa %xmm1, %xmm11
 ; SSE4-NEXT:    pcmpgtq %xmm5, %xmm11
 ; SSE4-NEXT:    pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT:    blendvpd %xmm8, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
 ; SSE4-NEXT:    movdqa %xmm11, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm5
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
 ; SSE4-NEXT:    movdqa %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm6
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
 ; SSE4-NEXT:    movdqa %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm3, %xmm7
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    movapd %xmm5, %xmm1
 ; SSE4-NEXT:    movapd %xmm6, %xmm2
@@ -5259,13 +5259,13 @@ define <8 x i64> @test124(<8 x i64> %a, <8 x i64> %b) {
 ; SSE4-NEXT:    movdqa %xmm4, %xmm12
 ; SSE4-NEXT:    pcmpgtq %xmm8, %xmm12
 ; SSE4-NEXT:    pxor %xmm12, %xmm0
-; SSE4-NEXT:    blendvpd %xmm8, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
 ; SSE4-NEXT:    movdqa %xmm11, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm5
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
 ; SSE4-NEXT:    movdqa %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm6
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
 ; SSE4-NEXT:    movdqa %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm3, %xmm7
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    movapd %xmm5, %xmm1
 ; SSE4-NEXT:    movapd %xmm6, %xmm2
@@ -5402,13 +5402,13 @@ define <8 x i64> @test125(<8 x i64> %a, <8 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm0, %xmm12
 ; SSE4-NEXT:    pxor %xmm4, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm12, %xmm0
-; SSE4-NEXT:    blendvpd %xmm8, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
 ; SSE4-NEXT:    movdqa %xmm11, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm5
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
 ; SSE4-NEXT:    movdqa %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm6
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
 ; SSE4-NEXT:    movdqa %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm3, %xmm7
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    movapd %xmm5, %xmm1
 ; SSE4-NEXT:    movapd %xmm6, %xmm2
@@ -5418,22 +5418,22 @@ define <8 x i64> @test125(<8 x i64> %a, <8 x i64> %b) {
 ; AVX1-LABEL: test125:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
 ; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vxorps %xmm5, %xmm1, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm3, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm3, %xmm7
 ; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm7, %xmm6
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
 ; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm0, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm5
 ; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm5, %xmm5
 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
 ; AVX1-NEXT:    vblendvpd %ymm5, %ymm0, %ymm2, %ymm0
@@ -5573,13 +5573,13 @@ define <8 x i64> @test126(<8 x i64> %a, <8 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm9, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm13, %xmm0
 ; SSE4-NEXT:    pxor %xmm12, %xmm0
-; SSE4-NEXT:    blendvpd %xmm9, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm9, %xmm4
 ; SSE4-NEXT:    movdqa %xmm11, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm5
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
 ; SSE4-NEXT:    movdqa %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm6
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
 ; SSE4-NEXT:    movdqa %xmm8, %xmm0
-; SSE4-NEXT:    blendvpd %xmm3, %xmm7
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    movapd %xmm5, %xmm1
 ; SSE4-NEXT:    movapd %xmm6, %xmm2
@@ -5589,26 +5589,26 @@ define <8 x i64> @test126(<8 x i64> %a, <8 x i64> %b) {
 ; AVX1-LABEL: test126:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
 ; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm6, %xmm4
 ; AVX1-NEXT:    vpcmpeqd %xmm8, %xmm8, %xmm8
 ; AVX1-NEXT:    vpxor %xmm8, %xmm4, %xmm4
-; AVX1-NEXT:    vxorps %xmm5, %xmm3, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm1, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm3, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm6
 ; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm6, %xmm6
 ; AVX1-NEXT:    vpxor %xmm8, %xmm6, %xmm6
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
 ; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm7, %xmm6
 ; AVX1-NEXT:    vpxor %xmm8, %xmm6, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm2, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm0, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm5
 ; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm5, %xmm5
 ; AVX1-NEXT:    vpxor %xmm8, %xmm5, %xmm5
 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
@@ -5730,13 +5730,13 @@ define <8 x i64> @test127(<8 x i64> %a, <8 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm0, %xmm12
 ; SSE4-NEXT:    pxor %xmm8, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm12, %xmm0
-; SSE4-NEXT:    blendvpd %xmm8, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm8, %xmm4
 ; SSE4-NEXT:    movdqa %xmm11, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm5
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
 ; SSE4-NEXT:    movdqa %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm6
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
 ; SSE4-NEXT:    movdqa %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm3, %xmm7
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    movapd %xmm5, %xmm1
 ; SSE4-NEXT:    movapd %xmm6, %xmm2
@@ -5746,22 +5746,22 @@ define <8 x i64> @test127(<8 x i64> %a, <8 x i64> %b) {
 ; AVX1-LABEL: test127:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
 ; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vxorps %xmm5, %xmm3, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm1, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm3, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm7
 ; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm7, %xmm6
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
 ; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm2, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm0, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm5
 ; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm5, %xmm5
 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
 ; AVX1-NEXT:    vblendvpd %ymm5, %ymm0, %ymm2, %ymm0
@@ -5902,13 +5902,13 @@ define <8 x i64> @test128(<8 x i64> %a, <8 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm4, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm13, %xmm0
 ; SSE4-NEXT:    pxor %xmm12, %xmm0
-; SSE4-NEXT:    blendvpd %xmm9, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm9, %xmm4
 ; SSE4-NEXT:    movdqa %xmm11, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm5
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm5
 ; SSE4-NEXT:    movdqa %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm6
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm6
 ; SSE4-NEXT:    movdqa %xmm8, %xmm0
-; SSE4-NEXT:    blendvpd %xmm3, %xmm7
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm7
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    movapd %xmm5, %xmm1
 ; SSE4-NEXT:    movapd %xmm6, %xmm2
@@ -5918,26 +5918,26 @@ define <8 x i64> @test128(<8 x i64> %a, <8 x i64> %b) {
 ; AVX1-LABEL: test128:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
 ; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm6, %xmm4
 ; AVX1-NEXT:    vpcmpeqd %xmm8, %xmm8, %xmm8
 ; AVX1-NEXT:    vpxor %xmm8, %xmm4, %xmm4
-; AVX1-NEXT:    vxorps %xmm5, %xmm1, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm3, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm3, %xmm6
 ; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm6, %xmm6
 ; AVX1-NEXT:    vpxor %xmm8, %xmm6, %xmm6
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
 ; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm7, %xmm6
 ; AVX1-NEXT:    vpxor %xmm8, %xmm6, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm0, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm5
 ; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm5, %xmm5
 ; AVX1-NEXT:    vpxor %xmm8, %xmm5, %xmm5
 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
@@ -7562,13 +7562,13 @@ define <8 x i64> @test153(<8 x i64> %a, <8 x i64> %b) {
 ; SSE4-NEXT:    pcmpgtq %xmm1, %xmm11
 ; SSE4-NEXT:    movdqa %xmm4, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm8, %xmm0
-; SSE4-NEXT:    blendvpd %xmm4, %xmm8
+; SSE4-NEXT:    blendvpd %xmm0, %xmm4, %xmm8
 ; SSE4-NEXT:    movdqa %xmm11, %xmm0
-; SSE4-NEXT:    blendvpd %xmm5, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
 ; SSE4-NEXT:    movdqa %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm6, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm6, %xmm2
 ; SSE4-NEXT:    movdqa %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm7, %xmm3
+; SSE4-NEXT:    blendvpd %xmm0, %xmm7, %xmm3
 ; SSE4-NEXT:    movapd %xmm8, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -7703,13 +7703,13 @@ define <8 x i64> @test154(<8 x i64> %a, <8 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm12, %xmm11
 ; SSE4-NEXT:    pcmpgtq %xmm4, %xmm0
 ; SSE4-NEXT:    pxor %xmm12, %xmm0
-; SSE4-NEXT:    blendvpd %xmm4, %xmm8
+; SSE4-NEXT:    blendvpd %xmm0, %xmm4, %xmm8
 ; SSE4-NEXT:    movdqa %xmm11, %xmm0
-; SSE4-NEXT:    blendvpd %xmm5, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
 ; SSE4-NEXT:    movdqa %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm6, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm6, %xmm2
 ; SSE4-NEXT:    movdqa %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm7, %xmm3
+; SSE4-NEXT:    blendvpd %xmm0, %xmm7, %xmm3
 ; SSE4-NEXT:    movapd %xmm8, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -7834,13 +7834,13 @@ define <8 x i64> @test155(<8 x i64> %a, <8 x i64> %b) {
 ; SSE4-NEXT:    movdqa %xmm1, %xmm11
 ; SSE4-NEXT:    pcmpgtq %xmm5, %xmm11
 ; SSE4-NEXT:    pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT:    blendvpd %xmm4, %xmm8
+; SSE4-NEXT:    blendvpd %xmm0, %xmm4, %xmm8
 ; SSE4-NEXT:    movdqa %xmm11, %xmm0
-; SSE4-NEXT:    blendvpd %xmm5, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
 ; SSE4-NEXT:    movdqa %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm6, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm6, %xmm2
 ; SSE4-NEXT:    movdqa %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm7, %xmm3
+; SSE4-NEXT:    blendvpd %xmm0, %xmm7, %xmm3
 ; SSE4-NEXT:    movapd %xmm8, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -7977,13 +7977,13 @@ define <8 x i64> @test156(<8 x i64> %a, <8 x i64> %b) {
 ; SSE4-NEXT:    movdqa %xmm4, %xmm12
 ; SSE4-NEXT:    pcmpgtq %xmm8, %xmm12
 ; SSE4-NEXT:    pxor %xmm12, %xmm0
-; SSE4-NEXT:    blendvpd %xmm4, %xmm8
+; SSE4-NEXT:    blendvpd %xmm0, %xmm4, %xmm8
 ; SSE4-NEXT:    movdqa %xmm11, %xmm0
-; SSE4-NEXT:    blendvpd %xmm5, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
 ; SSE4-NEXT:    movdqa %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm6, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm6, %xmm2
 ; SSE4-NEXT:    movdqa %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm7, %xmm3
+; SSE4-NEXT:    blendvpd %xmm0, %xmm7, %xmm3
 ; SSE4-NEXT:    movapd %xmm8, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -8121,35 +8121,35 @@ define <8 x i64> @test157(<8 x i64> %a, <8 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm0, %xmm12
 ; SSE4-NEXT:    pxor %xmm4, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm12, %xmm0
-; SSE4-NEXT:    blendvpd %xmm4, %xmm8
+; SSE4-NEXT:    blendvpd %xmm0, %xmm4, %xmm8
 ; SSE4-NEXT:    movdqa %xmm11, %xmm0
-; SSE4-NEXT:    blendvpd %xmm5, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
 ; SSE4-NEXT:    movdqa %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm6, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm6, %xmm2
 ; SSE4-NEXT:    movdqa %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm7, %xmm3
+; SSE4-NEXT:    blendvpd %xmm0, %xmm7, %xmm3
 ; SSE4-NEXT:    movapd %xmm8, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test157:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
 ; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vxorps %xmm5, %xmm1, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm3, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm3, %xmm7
 ; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm7, %xmm6
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
 ; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm0, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm5
 ; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm5, %xmm5
 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
 ; AVX1-NEXT:    vblendvpd %ymm5, %ymm2, %ymm0, %ymm0
@@ -8289,39 +8289,39 @@ define <8 x i64> @test158(<8 x i64> %a, <8 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm8, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm13, %xmm0
 ; SSE4-NEXT:    pxor %xmm12, %xmm0
-; SSE4-NEXT:    blendvpd %xmm4, %xmm8
+; SSE4-NEXT:    blendvpd %xmm0, %xmm4, %xmm8
 ; SSE4-NEXT:    movdqa %xmm11, %xmm0
-; SSE4-NEXT:    blendvpd %xmm5, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
 ; SSE4-NEXT:    movdqa %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm6, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm6, %xmm2
 ; SSE4-NEXT:    movdqa %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm7, %xmm3
+; SSE4-NEXT:    blendvpd %xmm0, %xmm7, %xmm3
 ; SSE4-NEXT:    movapd %xmm8, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test158:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
 ; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm6, %xmm4
 ; AVX1-NEXT:    vpcmpeqd %xmm8, %xmm8, %xmm8
 ; AVX1-NEXT:    vpxor %xmm8, %xmm4, %xmm4
-; AVX1-NEXT:    vxorps %xmm5, %xmm3, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm1, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm3, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm6
 ; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm6, %xmm6
 ; AVX1-NEXT:    vpxor %xmm8, %xmm6, %xmm6
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
 ; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm7, %xmm6
 ; AVX1-NEXT:    vpxor %xmm8, %xmm6, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm2, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm0, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm5
 ; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm5, %xmm5
 ; AVX1-NEXT:    vpxor %xmm8, %xmm5, %xmm5
 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
@@ -8447,35 +8447,35 @@ define <8 x i64> @test159(<8 x i64> %a, <8 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm0, %xmm12
 ; SSE4-NEXT:    pxor %xmm8, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm12, %xmm0
-; SSE4-NEXT:    blendvpd %xmm4, %xmm8
+; SSE4-NEXT:    blendvpd %xmm0, %xmm4, %xmm8
 ; SSE4-NEXT:    movdqa %xmm11, %xmm0
-; SSE4-NEXT:    blendvpd %xmm5, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
 ; SSE4-NEXT:    movdqa %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm6, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm6, %xmm2
 ; SSE4-NEXT:    movdqa %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm7, %xmm3
+; SSE4-NEXT:    blendvpd %xmm0, %xmm7, %xmm3
 ; SSE4-NEXT:    movapd %xmm8, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test159:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
 ; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm6, %xmm4
-; AVX1-NEXT:    vxorps %xmm5, %xmm3, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm1, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm3, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm7
 ; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm7, %xmm6
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
 ; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm7, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm2, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm0, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm5
 ; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm5, %xmm5
 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
 ; AVX1-NEXT:    vblendvpd %ymm5, %ymm2, %ymm0, %ymm0
@@ -8616,39 +8616,39 @@ define <8 x i64> @test160(<8 x i64> %a, <8 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm4, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm13, %xmm0
 ; SSE4-NEXT:    pxor %xmm12, %xmm0
-; SSE4-NEXT:    blendvpd %xmm4, %xmm8
+; SSE4-NEXT:    blendvpd %xmm0, %xmm4, %xmm8
 ; SSE4-NEXT:    movdqa %xmm11, %xmm0
-; SSE4-NEXT:    blendvpd %xmm5, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm5, %xmm1
 ; SSE4-NEXT:    movdqa %xmm10, %xmm0
-; SSE4-NEXT:    blendvpd %xmm6, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm6, %xmm2
 ; SSE4-NEXT:    movdqa %xmm9, %xmm0
-; SSE4-NEXT:    blendvpd %xmm7, %xmm3
+; SSE4-NEXT:    blendvpd %xmm0, %xmm7, %xmm3
 ; SSE4-NEXT:    movapd %xmm8, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test160:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm5, %xmm4, %xmm4
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm5 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm5, %xmm4, %xmm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm3, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
 ; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm6, %xmm4
 ; AVX1-NEXT:    vpcmpeqd %xmm8, %xmm8, %xmm8
 ; AVX1-NEXT:    vpxor %xmm8, %xmm4, %xmm4
-; AVX1-NEXT:    vxorps %xmm5, %xmm1, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm3, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm1, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm3, %xmm6
 ; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm6, %xmm6
 ; AVX1-NEXT:    vpxor %xmm8, %xmm6, %xmm6
 ; AVX1-NEXT:    vinsertf128 $1, %xmm4, %ymm6, %ymm4
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm6, %xmm6
+; AVX1-NEXT:    vpxor %xmm5, %xmm6, %xmm6
 ; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm7, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm7, %xmm7
 ; AVX1-NEXT:    vpcmpgtq %xmm6, %xmm7, %xmm6
 ; AVX1-NEXT:    vpxor %xmm8, %xmm6, %xmm6
-; AVX1-NEXT:    vxorps %xmm5, %xmm0, %xmm7
-; AVX1-NEXT:    vxorps %xmm5, %xmm2, %xmm5
+; AVX1-NEXT:    vpxor %xmm5, %xmm0, %xmm7
+; AVX1-NEXT:    vpxor %xmm5, %xmm2, %xmm5
 ; AVX1-NEXT:    vpcmpgtq %xmm7, %xmm5, %xmm5
 ; AVX1-NEXT:    vpxor %xmm8, %xmm5, %xmm5
 ; AVX1-NEXT:    vinsertf128 $1, %xmm6, %ymm5, %ymm5
@@ -8724,9 +8724,9 @@ define <4 x i64> @test161(<4 x i64> %a, <4 x i64> %b) {
 ; SSE4-NEXT:    pcmpgtq %xmm1, %xmm5
 ; SSE4-NEXT:    movdqa %xmm2, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT:    blendvpd %xmm4, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm3
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE4-NEXT:    movapd %xmm2, %xmm0
 ; SSE4-NEXT:    movapd %xmm3, %xmm1
 ; SSE4-NEXT:    retq
@@ -8807,9 +8807,9 @@ define <4 x i64> @test162(<4 x i64> %a, <4 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm6, %xmm5
 ; SSE4-NEXT:    pcmpgtq %xmm2, %xmm0
 ; SSE4-NEXT:    pxor %xmm6, %xmm0
-; SSE4-NEXT:    blendvpd %xmm4, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm3
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE4-NEXT:    movapd %xmm2, %xmm0
 ; SSE4-NEXT:    movapd %xmm3, %xmm1
 ; SSE4-NEXT:    retq
@@ -8886,9 +8886,9 @@ define <4 x i64> @test163(<4 x i64> %a, <4 x i64> %b) {
 ; SSE4-NEXT:    movdqa %xmm1, %xmm5
 ; SSE4-NEXT:    pcmpgtq %xmm3, %xmm5
 ; SSE4-NEXT:    pcmpgtq %xmm2, %xmm0
-; SSE4-NEXT:    blendvpd %xmm4, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm3
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE4-NEXT:    movapd %xmm2, %xmm0
 ; SSE4-NEXT:    movapd %xmm3, %xmm1
 ; SSE4-NEXT:    retq
@@ -8970,9 +8970,9 @@ define <4 x i64> @test164(<4 x i64> %a, <4 x i64> %b) {
 ; SSE4-NEXT:    movdqa %xmm2, %xmm6
 ; SSE4-NEXT:    pcmpgtq %xmm4, %xmm6
 ; SSE4-NEXT:    pxor %xmm6, %xmm0
-; SSE4-NEXT:    blendvpd %xmm4, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm3
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE4-NEXT:    movapd %xmm2, %xmm0
 ; SSE4-NEXT:    movapd %xmm3, %xmm1
 ; SSE4-NEXT:    retq
@@ -9056,9 +9056,9 @@ define <4 x i64> @test165(<4 x i64> %a, <4 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm0, %xmm6
 ; SSE4-NEXT:    pxor %xmm2, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT:    blendvpd %xmm4, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm3
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE4-NEXT:    movapd %xmm2, %xmm0
 ; SSE4-NEXT:    movapd %xmm3, %xmm1
 ; SSE4-NEXT:    retq
@@ -9066,13 +9066,13 @@ define <4 x i64> @test165(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: test165:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vxorps %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vxorps %xmm3, %xmm0, %xmm4
-; AVX1-NEXT:    vxorps %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
@@ -9154,9 +9154,9 @@ define <4 x i64> @test166(<4 x i64> %a, <4 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm4, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm7, %xmm0
 ; SSE4-NEXT:    pxor %xmm6, %xmm0
-; SSE4-NEXT:    blendvpd %xmm4, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm3
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE4-NEXT:    movapd %xmm2, %xmm0
 ; SSE4-NEXT:    movapd %xmm3, %xmm1
 ; SSE4-NEXT:    retq
@@ -9164,15 +9164,15 @@ define <4 x i64> @test166(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: test166:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vxorps %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vxorps %xmm3, %xmm1, %xmm5
-; AVX1-NEXT:    vxorps %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm5
+; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
@@ -9248,9 +9248,9 @@ define <4 x i64> @test167(<4 x i64> %a, <4 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm0, %xmm6
 ; SSE4-NEXT:    pxor %xmm4, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT:    blendvpd %xmm4, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm3
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE4-NEXT:    movapd %xmm2, %xmm0
 ; SSE4-NEXT:    movapd %xmm3, %xmm1
 ; SSE4-NEXT:    retq
@@ -9258,13 +9258,13 @@ define <4 x i64> @test167(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: test167:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vxorps %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vxorps %xmm3, %xmm1, %xmm4
-; AVX1-NEXT:    vxorps %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
@@ -9346,9 +9346,9 @@ define <4 x i64> @test168(<4 x i64> %a, <4 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm2, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm7, %xmm0
 ; SSE4-NEXT:    pxor %xmm6, %xmm0
-; SSE4-NEXT:    blendvpd %xmm4, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm4, %xmm2
 ; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm3
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm3
 ; SSE4-NEXT:    movapd %xmm2, %xmm0
 ; SSE4-NEXT:    movapd %xmm3, %xmm1
 ; SSE4-NEXT:    retq
@@ -9356,15 +9356,15 @@ define <4 x i64> @test168(<4 x i64> %a, <4 x i64> %b) {
 ; AVX1-LABEL: test168:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vxorps %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vxorps %xmm3, %xmm0, %xmm5
-; AVX1-NEXT:    vxorps %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm5
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
@@ -9436,9 +9436,9 @@ define <4 x i64> @test169(<4 x i64> %a, <4 x i64> %b) {
 ; SSE4-NEXT:    pcmpgtq %xmm1, %xmm5
 ; SSE4-NEXT:    movdqa %xmm2, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm4, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
 ; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    blendvpd %xmm3, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -9518,9 +9518,9 @@ define <4 x i64> @test170(<4 x i64> %a, <4 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm6, %xmm5
 ; SSE4-NEXT:    pcmpgtq %xmm2, %xmm0
 ; SSE4-NEXT:    pxor %xmm6, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
 ; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    blendvpd %xmm3, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -9598,9 +9598,9 @@ define <4 x i64> @test171(<4 x i64> %a, <4 x i64> %b) {
 ; SSE4-NEXT:    movdqa %xmm1, %xmm5
 ; SSE4-NEXT:    pcmpgtq %xmm3, %xmm5
 ; SSE4-NEXT:    pcmpgtq %xmm2, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
 ; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    blendvpd %xmm3, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -9681,9 +9681,9 @@ define <4 x i64> @test172(<4 x i64> %a, <4 x i64> %b) {
 ; SSE4-NEXT:    movdqa %xmm2, %xmm6
 ; SSE4-NEXT:    pcmpgtq %xmm4, %xmm6
 ; SSE4-NEXT:    pxor %xmm6, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
 ; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    blendvpd %xmm3, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -9768,22 +9768,22 @@ define <4 x i64> @test173(<4 x i64> %a, <4 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm0, %xmm6
 ; SSE4-NEXT:    pxor %xmm2, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
 ; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    blendvpd %xmm3, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test173:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vxorps %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vxorps %xmm3, %xmm0, %xmm4
-; AVX1-NEXT:    vxorps %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
@@ -9865,24 +9865,24 @@ define <4 x i64> @test174(<4 x i64> %a, <4 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm4, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm7, %xmm0
 ; SSE4-NEXT:    pxor %xmm6, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
 ; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    blendvpd %xmm3, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test174:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vxorps %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vxorps %xmm3, %xmm1, %xmm5
-; AVX1-NEXT:    vxorps %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm5
+; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
@@ -9960,22 +9960,22 @@ define <4 x i64> @test175(<4 x i64> %a, <4 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm0, %xmm6
 ; SSE4-NEXT:    pxor %xmm4, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
 ; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    blendvpd %xmm3, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test175:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT:    vxorps %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT:    vxorps %xmm3, %xmm1, %xmm4
-; AVX1-NEXT:    vxorps %xmm3, %xmm0, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-NEXT:    vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
@@ -10057,24 +10057,24 @@ define <4 x i64> @test176(<4 x i64> %a, <4 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm2, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm7, %xmm0
 ; SSE4-NEXT:    pxor %xmm6, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm4
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm4
 ; SSE4-NEXT:    movdqa %xmm5, %xmm0
-; SSE4-NEXT:    blendvpd %xmm3, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm3, %xmm1
 ; SSE4-NEXT:    movapd %xmm4, %xmm0
 ; SSE4-NEXT:    retq
 ;
 ; AVX1-LABEL: test176:
 ; AVX1:       # BB#0: # %entry
 ; AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT:    vmovaps {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT:    vxorps %xmm3, %xmm2, %xmm2
+; AVX1-NEXT:    vmovdqa {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT:    vpxor %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT:    vxorps %xmm3, %xmm4, %xmm4
+; AVX1-NEXT:    vpxor %xmm3, %xmm4, %xmm4
 ; AVX1-NEXT:    vpcmpgtq %xmm2, %xmm4, %xmm2
 ; AVX1-NEXT:    vpcmpeqd %xmm4, %xmm4, %xmm4
 ; AVX1-NEXT:    vpxor %xmm4, %xmm2, %xmm2
-; AVX1-NEXT:    vxorps %xmm3, %xmm0, %xmm5
-; AVX1-NEXT:    vxorps %xmm3, %xmm1, %xmm3
+; AVX1-NEXT:    vpxor %xmm3, %xmm0, %xmm5
+; AVX1-NEXT:    vpxor %xmm3, %xmm1, %xmm3
 ; AVX1-NEXT:    vpcmpgtq %xmm5, %xmm3, %xmm3
 ; AVX1-NEXT:    vpxor %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
@@ -10127,7 +10127,7 @@ define <2 x i64> @test177(<2 x i64> %a, <2 x i64> %b) {
 ; SSE4-NEXT:    movdqa %xmm0, %xmm2
 ; SSE4-NEXT:    movdqa %xmm1, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm2, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE4-NEXT:    movapd %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -10182,7 +10182,7 @@ define <2 x i64> @test178(<2 x i64> %a, <2 x i64> %b) {
 ; SSE4-NEXT:    pcmpgtq %xmm1, %xmm0
 ; SSE4-NEXT:    pcmpeqd %xmm3, %xmm3
 ; SSE4-NEXT:    pxor %xmm3, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE4-NEXT:    movapd %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -10236,7 +10236,7 @@ define <2 x i64> @test179(<2 x i64> %a, <2 x i64> %b) {
 ; SSE4:       # BB#0: # %entry
 ; SSE4-NEXT:    movdqa %xmm0, %xmm2
 ; SSE4-NEXT:    pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE4-NEXT:    movapd %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -10292,7 +10292,7 @@ define <2 x i64> @test180(<2 x i64> %a, <2 x i64> %b) {
 ; SSE4-NEXT:    pcmpgtq %xmm2, %xmm3
 ; SSE4-NEXT:    pcmpeqd %xmm0, %xmm0
 ; SSE4-NEXT:    pxor %xmm3, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE4-NEXT:    movapd %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -10350,7 +10350,7 @@ define <2 x i64> @test181(<2 x i64> %a, <2 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm0, %xmm3
 ; SSE4-NEXT:    pxor %xmm1, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE4-NEXT:    movapd %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -10415,7 +10415,7 @@ define <2 x i64> @test182(<2 x i64> %a, <2 x i64> %b) {
 ; SSE4-NEXT:    pcmpgtq %xmm0, %xmm3
 ; SSE4-NEXT:    pcmpeqd %xmm0, %xmm0
 ; SSE4-NEXT:    pxor %xmm3, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE4-NEXT:    movapd %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -10479,7 +10479,7 @@ define <2 x i64> @test183(<2 x i64> %a, <2 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm0, %xmm3
 ; SSE4-NEXT:    pxor %xmm2, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE4-NEXT:    movapd %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -10543,7 +10543,7 @@ define <2 x i64> @test184(<2 x i64> %a, <2 x i64> %b) {
 ; SSE4-NEXT:    pcmpgtq %xmm0, %xmm3
 ; SSE4-NEXT:    pcmpeqd %xmm0, %xmm0
 ; SSE4-NEXT:    pxor %xmm3, %xmm0
-; SSE4-NEXT:    blendvpd %xmm2, %xmm1
+; SSE4-NEXT:    blendvpd %xmm0, %xmm2, %xmm1
 ; SSE4-NEXT:    movapd %xmm1, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -10605,7 +10605,7 @@ define <2 x i64> @test185(<2 x i64> %a, <2 x i64> %b) {
 ; SSE4-NEXT:    movdqa %xmm0, %xmm2
 ; SSE4-NEXT:    movdqa %xmm1, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm2, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE4-NEXT:    movapd %xmm2, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -10660,7 +10660,7 @@ define <2 x i64> @test186(<2 x i64> %a, <2 x i64> %b) {
 ; SSE4-NEXT:    pcmpgtq %xmm1, %xmm0
 ; SSE4-NEXT:    pcmpeqd %xmm3, %xmm3
 ; SSE4-NEXT:    pxor %xmm3, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE4-NEXT:    movapd %xmm2, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -10715,7 +10715,7 @@ define <2 x i64> @test187(<2 x i64> %a, <2 x i64> %b) {
 ; SSE4:       # BB#0: # %entry
 ; SSE4-NEXT:    movdqa %xmm0, %xmm2
 ; SSE4-NEXT:    pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE4-NEXT:    movapd %xmm2, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -10771,7 +10771,7 @@ define <2 x i64> @test188(<2 x i64> %a, <2 x i64> %b) {
 ; SSE4-NEXT:    pcmpgtq %xmm2, %xmm3
 ; SSE4-NEXT:    pcmpeqd %xmm0, %xmm0
 ; SSE4-NEXT:    pxor %xmm3, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE4-NEXT:    movapd %xmm2, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -10830,7 +10830,7 @@ define <2 x i64> @test189(<2 x i64> %a, <2 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm0, %xmm3
 ; SSE4-NEXT:    pxor %xmm1, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE4-NEXT:    movapd %xmm2, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -10895,7 +10895,7 @@ define <2 x i64> @test190(<2 x i64> %a, <2 x i64> %b) {
 ; SSE4-NEXT:    pcmpgtq %xmm0, %xmm3
 ; SSE4-NEXT:    pcmpeqd %xmm0, %xmm0
 ; SSE4-NEXT:    pxor %xmm3, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE4-NEXT:    movapd %xmm2, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -10960,7 +10960,7 @@ define <2 x i64> @test191(<2 x i64> %a, <2 x i64> %b) {
 ; SSE4-NEXT:    pxor %xmm0, %xmm3
 ; SSE4-NEXT:    pxor %xmm2, %xmm0
 ; SSE4-NEXT:    pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE4-NEXT:    movapd %xmm2, %xmm0
 ; SSE4-NEXT:    retq
 ;
@@ -11024,7 +11024,7 @@ define <2 x i64> @test192(<2 x i64> %a, <2 x i64> %b) {
 ; SSE4-NEXT:    pcmpgtq %xmm0, %xmm3
 ; SSE4-NEXT:    pcmpeqd %xmm0, %xmm0
 ; SSE4-NEXT:    pxor %xmm3, %xmm0
-; SSE4-NEXT:    blendvpd %xmm1, %xmm2
+; SSE4-NEXT:    blendvpd %xmm0, %xmm1, %xmm2
 ; SSE4-NEXT:    movapd %xmm2, %xmm0
 ; SSE4-NEXT:    retq
 ;
diff --git a/test/CodeGen/X86/vselect-pcmp.ll b/test/CodeGen/X86/vselect-pcmp.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d33fda4f49c2b5a1035787cdc8cb54726697efb2
--- /dev/null
+++ b/test/CodeGen/X86/vselect-pcmp.ll
@@ -0,0 +1,323 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx       | FileCheck %s --check-prefix=AVX --check-prefix=AVX12F --check-prefix=AVX12 --check-prefix=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2      | FileCheck %s --check-prefix=AVX --check-prefix=AVX12F --check-prefix=AVX12 --check-prefix=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f   | FileCheck %s --check-prefix=AVX --check-prefix=AVX12F --check-prefix=AVX512 --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512vl  | FileCheck %s --check-prefix=AVX                       --check-prefix=AVX512 --check-prefix=AVX512VL
+
+; The condition vector for BLENDV* only cares about the sign bit of each element.
+; So in these tests, if we generate BLENDV*, we should be able to remove the redundant cmp op.
+
+; Test 128-bit vectors for all legal element types.
+
+; FIXME: Some AVX-512 configurations below still emit the compare against zero; it should fold away too.
+
+define <16 x i8> @signbit_sel_v16i8(<16 x i8> %x, <16 x i8> %y, <16 x i8> %mask) {
+; AVX12-LABEL: signbit_sel_v16i8:
+; AVX12:       # BB#0:
+; AVX12-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX12-NEXT:    retq
+;
+; AVX512-LABEL: signbit_sel_v16i8:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512-NEXT:    vpcmpgtb %xmm2, %xmm3, %xmm2
+; AVX512-NEXT:    vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
+; AVX512-NEXT:    retq
+  %tr = icmp slt <16 x i8> %mask, zeroinitializer
+  %z = select <16 x i1> %tr, <16 x i8> %x, <16 x i8> %y
+  ret <16 x i8> %z
+}
+
+; 16-bit elements do not get a BLENDV here: the compare is kept and the select becomes and/andn/or.
+
+define <8 x i16> @signbit_sel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) {
+; AVX-LABEL: signbit_sel_v8i16:
+; AVX:       # BB#0:
+; AVX-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX-NEXT:    vpcmpgtw %xmm2, %xmm3, %xmm2
+; AVX-NEXT:    vpandn %xmm1, %xmm2, %xmm1
+; AVX-NEXT:    vpand %xmm2, %xmm0, %xmm0
+; AVX-NEXT:    vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT:    retq
+  %tr = icmp slt <8 x i16> %mask, zeroinitializer
+  %z = select <8 x i1> %tr, <8 x i16> %x, <8 x i16> %y
+  ret <8 x i16> %z
+}
+
+define <4 x i32> @signbit_sel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) {
+; AVX12F-LABEL: signbit_sel_v4i32:
+; AVX12F:       # BB#0:
+; AVX12F-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX12F-NEXT:    retq
+;
+; AVX512VL-LABEL: signbit_sel_v4i32:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpgtd %xmm2, %xmm3, %k1
+; AVX512VL-NEXT:    vpblendmd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %tr = icmp slt <4 x i32> %mask, zeroinitializer
+  %z = select <4 x i1> %tr, <4 x i32> %x, <4 x i32> %y
+  ret <4 x i32> %z
+}
+
+define <2 x i64> @signbit_sel_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) {
+; AVX12F-LABEL: signbit_sel_v2i64:
+; AVX12F:       # BB#0:
+; AVX12F-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX12F-NEXT:    retq
+;
+; AVX512VL-LABEL: signbit_sel_v2i64:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpgtq %xmm2, %xmm3, %k1
+; AVX512VL-NEXT:    vpblendmq %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %tr = icmp slt <2 x i64> %mask, zeroinitializer
+  %z = select <2 x i1> %tr, <2 x i64> %x, <2 x i64> %y
+  ret <2 x i64> %z
+}
+
+define <4 x float> @signbit_sel_v4f32(<4 x float> %x, <4 x float> %y, <4 x i32> %mask) {
+; AVX12F-LABEL: signbit_sel_v4f32:
+; AVX12F:       # BB#0:
+; AVX12F-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX12F-NEXT:    retq
+;
+; AVX512VL-LABEL: signbit_sel_v4f32:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpgtd %xmm2, %xmm3, %k1
+; AVX512VL-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %tr = icmp slt <4 x i32> %mask, zeroinitializer
+  %z = select <4 x i1> %tr, <4 x float> %x, <4 x float> %y
+  ret <4 x float> %z
+}
+
+define <2 x double> @signbit_sel_v2f64(<2 x double> %x, <2 x double> %y, <2 x i64> %mask) {
+; AVX12F-LABEL: signbit_sel_v2f64:
+; AVX12F:       # BB#0:
+; AVX12F-NEXT:    vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX12F-NEXT:    retq
+;
+; AVX512VL-LABEL: signbit_sel_v2f64:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpgtq %xmm2, %xmm3, %k1
+; AVX512VL-NEXT:    vblendmpd %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %tr = icmp slt <2 x i64> %mask, zeroinitializer
+  %z = select <2 x i1> %tr, <2 x double> %x, <2 x double> %y
+  ret <2 x double> %z
+}
+
+; Test 256-bit vectors to see differences between AVX1 and AVX2.
+
+define <32 x i8> @signbit_sel_v32i8(<32 x i8> %x, <32 x i8> %y, <32 x i8> %mask) {
+; AVX1-LABEL: signbit_sel_v32i8:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtb %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: signbit_sel_v32i8:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: signbit_sel_v32i8:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpxor %ymm3, %ymm3, %ymm3
+; AVX512-NEXT:    vpcmpgtb %ymm2, %ymm3, %ymm2
+; AVX512-NEXT:    vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512-NEXT:    retq
+  %tr = icmp slt <32 x i8> %mask, zeroinitializer
+  %z = select <32 x i1> %tr, <32 x i8> %x, <32 x i8> %y
+  ret <32 x i8> %z
+}
+
+; Same for 256-bit vectors: 16-bit elements get no BLENDV, so the compare plus and/andn/or remains.
+
+define <16 x i16> @signbit_sel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %mask) {
+; AVX1-LABEL: signbit_sel_v16i16:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT:    vpcmpgtw %xmm3, %xmm4, %xmm3
+; AVX1-NEXT:    vpcmpgtw %xmm2, %xmm4, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT:    vandnps %ymm1, %ymm2, %ymm1
+; AVX1-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT:    vorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: signbit_sel_v16i16:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpxor %ymm3, %ymm3, %ymm3
+; AVX2-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm2
+; AVX2-NEXT:    vpandn %ymm1, %ymm2, %ymm1
+; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX2-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: signbit_sel_v16i16:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpxor %ymm3, %ymm3, %ymm3
+; AVX512-NEXT:    vpcmpgtw %ymm2, %ymm3, %ymm2
+; AVX512-NEXT:    vpandn %ymm1, %ymm2, %ymm1
+; AVX512-NEXT:    vpand %ymm2, %ymm0, %ymm0
+; AVX512-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT:    retq
+  %tr = icmp slt <16 x i16> %mask, zeroinitializer
+  %z = select <16 x i1> %tr, <16 x i16> %x, <16 x i16> %y
+  ret <16 x i16> %z
+}
+
+define <8 x i32> @signbit_sel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %mask) {
+; AVX12-LABEL: signbit_sel_v8i32:
+; AVX12:       # BB#0:
+; AVX12-NEXT:    vblendvps %ymm2, %ymm0, %ymm1, %ymm0
+; AVX12-NEXT:    retq
+;
+; AVX512F-LABEL: signbit_sel_v8i32:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    # kill: %YMM2<def> %YMM2<kill> %ZMM2<def>
+; AVX512F-NEXT:    # kill: %YMM1<def> %YMM1<kill> %ZMM1<def>
+; AVX512F-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<def>
+; AVX512F-NEXT:    vpxor %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT:    vpcmpgtd %zmm2, %zmm3, %k1
+; AVX512F-NEXT:    vpblendmd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512F-NEXT:    # kill: %YMM0<def> %YMM0<kill> %ZMM0<kill>
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: signbit_sel_v8i32:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vpxor %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpcmpgtd %ymm2, %ymm3, %k1
+; AVX512VL-NEXT:    vpblendmd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %tr = icmp slt <8 x i32> %mask, zeroinitializer
+  %z = select <8 x i1> %tr, <8 x i32> %x, <8 x i32> %y
+  ret <8 x i32> %z
+}
+
+define <4 x i64> @signbit_sel_v4i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %mask) {
+; AVX12F-LABEL: signbit_sel_v4i64:
+; AVX12F:       # BB#0:
+; AVX12F-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX12F-NEXT:    retq
+;
+; AVX512VL-LABEL: signbit_sel_v4i64:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vpxor %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpcmpgtq %ymm2, %ymm3, %k1
+; AVX512VL-NEXT:    vpblendmq %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %tr = icmp slt <4 x i64> %mask, zeroinitializer
+  %z = select <4 x i1> %tr, <4 x i64> %x, <4 x i64> %y
+  ret <4 x i64> %z
+}
+
+define <4 x double> @signbit_sel_v4f64(<4 x double> %x, <4 x double> %y, <4 x i64> %mask) {
+; AVX12F-LABEL: signbit_sel_v4f64:
+; AVX12F:       # BB#0:
+; AVX12F-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX12F-NEXT:    retq
+;
+; AVX512VL-LABEL: signbit_sel_v4f64:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vpxor %ymm3, %ymm3, %ymm3
+; AVX512VL-NEXT:    vpcmpgtq %ymm2, %ymm3, %k1
+; AVX512VL-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %tr = icmp slt <4 x i64> %mask, zeroinitializer
+  %z = select <4 x i1> %tr, <4 x double> %x, <4 x double> %y
+  ret <4 x double> %z
+}
+
+; Try a condition with a different type than the select operands.
+
+define <4 x double> @signbit_sel_v4f64_small_mask(<4 x double> %x, <4 x double> %y, <4 x i32> %mask) {
+; AVX1-LABEL: signbit_sel_v4f64_small_mask:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm3
+; AVX1-NEXT:    vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1]
+; AVX1-NEXT:    vpmovsxdq %xmm2, %xmm2
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: signbit_sel_v4f64_small_mask:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vpmovsxdq %xmm2, %ymm2
+; AVX2-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: signbit_sel_v4f64_small_mask:
+; AVX512F:       # BB#0:
+; AVX512F-NEXT:    vpmovsxdq %xmm2, %ymm2
+; AVX512F-NEXT:    vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: signbit_sel_v4f64_small_mask:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VL-NEXT:    vpcmpgtd %xmm2, %xmm3, %k1
+; AVX512VL-NEXT:    vblendmpd %ymm0, %ymm1, %ymm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %tr = icmp slt <4 x i32> %mask, zeroinitializer
+  %z = select <4 x i1> %tr, <4 x double> %x, <4 x double> %y
+  ret <4 x double> %z
+}
+
+; Try a 512-bit vector to make sure AVX-512 is handled as expected.
+
+define <8 x double> @signbit_sel_v8f64(<8 x double> %x, <8 x double> %y, <8 x i64> %mask) {
+; AVX12-LABEL: signbit_sel_v8f64:
+; AVX12:       # BB#0:
+; AVX12-NEXT:    vblendvpd %ymm4, %ymm0, %ymm2, %ymm0
+; AVX12-NEXT:    vblendvpd %ymm5, %ymm1, %ymm3, %ymm1
+; AVX12-NEXT:    retq
+;
+; AVX512-LABEL: signbit_sel_v8f64:
+; AVX512:       # BB#0:
+; AVX512-NEXT:    vpxord %zmm3, %zmm3, %zmm3
+; AVX512-NEXT:    vpcmpgtq %zmm2, %zmm3, %k1
+; AVX512-NEXT:    vblendmpd %zmm0, %zmm1, %zmm0 {%k1}
+; AVX512-NEXT:    retq
+  %tr = icmp slt <8 x i64> %mask, zeroinitializer
+  %z = select <8 x i1> %tr, <8 x double> %x, <8 x double> %y
+  ret <8 x double> %z
+}
+
+; If we have a floating-point compare:
+; (1) Don't die.
+; (2) FIXME: If we don't care about signed-zero (and NaN?), the compare should still get folded.
+
+define <4 x float> @signbit_sel_v4f32_fcmp(<4 x float> %x, <4 x float> %y, <4 x float> %mask) #0 {
+; AVX12F-LABEL: signbit_sel_v4f32_fcmp:
+; AVX12F:       # BB#0:
+; AVX12F-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX12F-NEXT:    vcmpltps %xmm2, %xmm0, %xmm2
+; AVX12F-NEXT:    vblendvps %xmm2, %xmm0, %xmm1, %xmm0
+; AVX12F-NEXT:    retq
+;
+; AVX512VL-LABEL: signbit_sel_v4f32_fcmp:
+; AVX512VL:       # BB#0:
+; AVX512VL-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vcmpltps %xmm2, %xmm0, %k1
+; AVX512VL-NEXT:    vblendmps %xmm0, %xmm1, %xmm0 {%k1}
+; AVX512VL-NEXT:    retq
+  %cmp = fcmp olt <4 x float> %x, zeroinitializer
+  %sel = select <4 x i1> %cmp, <4 x float> %x, <4 x float> %y
+  ret <4 x float> %sel
+}
+
+attributes #0 = { "no-nans-fp-math"="true" }
diff --git a/test/CodeGen/X86/wide-integer-cmp.ll b/test/CodeGen/X86/wide-integer-cmp.ll
index c45a0541e6a7c32339fa93fe5875e2b59bf77a54..fbaf500e833332feebf728556cb924aad431df3a 100644
--- a/test/CodeGen/X86/wide-integer-cmp.ll
+++ b/test/CodeGen/X86/wide-integer-cmp.ll
@@ -1,7 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=i686-linux-gnu %s -o - | FileCheck %s
 
 
 define i32 @branch_eq(i64 %a, i64 %b) {
+; CHECK-LABEL: branch_eq:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    xorl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    xorl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    orl %ecx, %eax
+; CHECK-NEXT:    jne .LBB0_2
+; CHECK-NEXT:  # BB#1: # %bb1
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  .LBB0_2: # %bb2
+; CHECK-NEXT:    movl $2, %eax
+; CHECK-NEXT:    retl
 entry:
   %cmp = icmp eq i64 %a, %b
 	br i1 %cmp, label %bb1, label %bb2
@@ -9,22 +24,22 @@ bb1:
   ret i32 1
 bb2:
   ret i32 2
-
-; CHECK-LABEL: branch_eq:
-; CHECK: movl 4(%esp), [[LHSLo:%[a-z]+]]
-; CHECK: movl 8(%esp), [[LHSHi:%[a-z]+]]
-; CHECK: xorl 16(%esp), [[LHSHi]]
-; CHECK: xorl 12(%esp), [[LHSLo]]
-; CHECK: orl [[LHSHi]], [[LHSLo]]
-; CHECK: jne [[FALSE:.LBB[0-9_]+]]
-; CHECK: movl $1, %eax
-; CHECK: retl
-; CHECK: [[FALSE]]:
-; CHECK: movl $2, %eax
-; CHECK: retl
 }
 
 define i32 @branch_slt(i64 %a, i64 %b) {
+; CHECK-LABEL: branch_slt:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    jge .LBB1_2
+; CHECK-NEXT:  # BB#1: # %bb1
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  .LBB1_2: # %bb2
+; CHECK-NEXT:    movl $2, %eax
+; CHECK-NEXT:    retl
 entry:
   %cmp = icmp slt i64 %a, %b
 	br i1 %cmp, label %bb1, label %bb2
@@ -32,21 +47,22 @@ bb1:
   ret i32 1
 bb2:
   ret i32 2
-
-; CHECK-LABEL: branch_slt:
-; CHECK: movl 4(%esp), [[LHSLo:%[a-z]+]]
-; CHECK: movl 8(%esp), [[LHSHi:%[a-z]+]]
-; CHECK: cmpl 12(%esp), [[LHSLo]]
-; CHECK: sbbl 16(%esp), [[LHSHi]]
-; CHECK: jge [[FALSE:.LBB[0-9_]+]]
-; CHECK: movl $1, %eax
-; CHECK: retl
-; CHECK: [[FALSE]]:
-; CHECK: movl $2, %eax
-; CHECK: retl
 }
 
 define i32 @branch_ule(i64 %a, i64 %b) {
+; CHECK-LABEL: branch_ule:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    jb .LBB2_2
+; CHECK-NEXT:  # BB#1: # %bb1
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  .LBB2_2: # %bb2
+; CHECK-NEXT:    movl $2, %eax
+; CHECK-NEXT:    retl
 entry:
   %cmp = icmp ule i64 %a, %b
 	br i1 %cmp, label %bb1, label %bb2
@@ -54,36 +70,49 @@ bb1:
   ret i32 1
 bb2:
   ret i32 2
-
-; CHECK-LABEL: branch_ule:
-; CHECK: movl 12(%esp), [[RHSLo:%[a-z]+]]
-; CHECK: movl 16(%esp), [[RHSHi:%[a-z]+]]
-; CHECK: cmpl 4(%esp), [[RHSLo]]
-; CHECK: sbbl 8(%esp), [[RHSHi]]
-; CHECK: jb [[FALSE:.LBB[0-9_]+]]
-; CHECK: movl $1, %eax
-; CHECK: retl
-; CHECK: [[FALSE]]:
-; CHECK: movl $2, %eax
-; CHECK: retl
 }
 
 define i32 @set_gt(i64 %a, i64 %b) {
+; CHECK-LABEL: set_gt:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    setl %al
+; CHECK-NEXT:    movzbl %al, %eax
+; CHECK-NEXT:    retl
 entry:
   %cmp = icmp sgt i64 %a, %b
   %res = select i1 %cmp, i32 1, i32 0
   ret i32 %res
-
-; CHECK-LABEL: set_gt:
-; CHECK: movl 12(%esp), [[RHSLo:%[a-z]+]]
-; CHECK: movl 16(%esp), [[RHSHi:%[a-z]+]]
-; CHECK: cmpl 4(%esp), [[RHSLo]]
-; CHECK: sbbl 8(%esp), [[RHSHi]]
-; CHECK: setl %al
-; CHECK: retl
 }
 
 define i32 @test_wide(i128 %a, i128 %b) {
+; CHECK-LABEL: test_wide:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    pushl %esi
+; CHECK-NEXT:  .Lcfi0:
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:  .Lcfi1:
+; CHECK-NEXT:    .cfi_offset %esi, -8
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    cmpl {{[0-9]+}}(%esp), %edx
+; CHECK-NEXT:    sbbl {{[0-9]+}}(%esp), %esi
+; CHECK-NEXT:    sbbl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    sbbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    jge .LBB4_2
+; CHECK-NEXT:  # BB#1: # %bb1
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  .LBB4_2: # %bb2
+; CHECK-NEXT:    movl $2, %eax
+; CHECK-NEXT:    popl %esi
+; CHECK-NEXT:    retl
 entry:
   %cmp = icmp slt i128 %a, %b
 	br i1 %cmp, label %bb1, label %bb2
@@ -91,21 +120,22 @@ bb1:
   ret i32 1
 bb2:
   ret i32 2
-
-; CHECK-LABEL: test_wide:
-; CHECK: cmpl 24(%esp)
-; CHECK: sbbl 28(%esp)
-; CHECK: sbbl 32(%esp)
-; CHECK: sbbl 36(%esp)
-; CHECK: jge [[FALSE:.LBB[0-9_]+]]
-; CHECK: movl $1, %eax
-; CHECK: retl
-; CHECK: [[FALSE]]:
-; CHECK: movl $2, %eax
-; CHECK: retl
 }
 
+; The comparison of the low bits will be folded to a CARRY_FALSE node. Make
+; sure the code can handle that.
 define i32 @test_carry_false(i64 %a, i64 %b) {
+; CHECK-LABEL: test_carry_false:
+; CHECK:       # BB#0: # %entry
+; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; CHECK-NEXT:    jge .LBB5_2
+; CHECK-NEXT:  # BB#1: # %bb1
+; CHECK-NEXT:    movl $1, %eax
+; CHECK-NEXT:    retl
+; CHECK-NEXT:  .LBB5_2: # %bb2
+; CHECK-NEXT:    movl $2, %eax
+; CHECK-NEXT:    retl
 entry:
   %x = and i64 %a, -4294967296 ;0xffffffff00000000
   %y = and i64 %b, -4294967296
@@ -115,16 +145,4 @@ bb1:
   ret i32 1
 bb2:
   ret i32 2
-
-; The comparison of the low bits will be folded to a CARRY_FALSE node. Make
-; sure the code can handle that.
-; CHECK-LABEL: carry_false:
-; CHECK: movl 8(%esp), [[LHSHi:%[a-z]+]]
-; CHECK: cmpl 16(%esp), [[LHSHi]]
-; CHECK: jge [[FALSE:.LBB[0-9_]+]]
-; CHECK: movl $1, %eax
-; CHECK: retl
-; CHECK: [[FALSE]]:
-; CHECK: movl $2, %eax
-; CHECK: retl
 }
diff --git a/test/CodeGen/X86/widen_bitops-0.ll b/test/CodeGen/X86/widen_bitops-0.ll
index f8316d0e1ea2fc76abd10169845bfc7de53d8ec4..132a2fd928f238addc8cbd51f31ef19fa9a4c613 100644
--- a/test/CodeGen/X86/widen_bitops-0.ll
+++ b/test/CodeGen/X86/widen_bitops-0.ll
@@ -131,10 +131,10 @@ define i24 @or_i24_as_v8i3(i24 %a, i24 %b) nounwind {
 define <3 x i8> @and_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
 ; X32-SSE-LABEL: and_v3i8_as_i24:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pinsrb $0, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X32-SSE-NEXT:    pinsrb $4, {{[0-9]+}}(%esp), %xmm0
 ; X32-SSE-NEXT:    pinsrb $8, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    pinsrb $0, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32-SSE-NEXT:    pinsrb $4, {{[0-9]+}}(%esp), %xmm1
 ; X32-SSE-NEXT:    pinsrb $8, {{[0-9]+}}(%esp), %xmm1
 ; X32-SSE-NEXT:    pand %xmm0, %xmm1
@@ -172,10 +172,10 @@ define <3 x i8> @and_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
 define <3 x i8> @xor_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
 ; X32-SSE-LABEL: xor_v3i8_as_i24:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pinsrb $0, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X32-SSE-NEXT:    pinsrb $4, {{[0-9]+}}(%esp), %xmm0
 ; X32-SSE-NEXT:    pinsrb $8, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    pinsrb $0, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32-SSE-NEXT:    pinsrb $4, {{[0-9]+}}(%esp), %xmm1
 ; X32-SSE-NEXT:    pinsrb $8, {{[0-9]+}}(%esp), %xmm1
 ; X32-SSE-NEXT:    pxor %xmm0, %xmm1
@@ -213,10 +213,10 @@ define <3 x i8> @xor_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
 define <3 x i8> @or_v3i8_as_i24(<3 x i8> %a, <3 x i8> %b) nounwind {
 ; X32-SSE-LABEL: or_v3i8_as_i24:
 ; X32-SSE:       # BB#0:
-; X32-SSE-NEXT:    pinsrb $0, {{[0-9]+}}(%esp), %xmm0
+; X32-SSE-NEXT:    movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; X32-SSE-NEXT:    pinsrb $4, {{[0-9]+}}(%esp), %xmm0
 ; X32-SSE-NEXT:    pinsrb $8, {{[0-9]+}}(%esp), %xmm0
-; X32-SSE-NEXT:    pinsrb $0, {{[0-9]+}}(%esp), %xmm1
+; X32-SSE-NEXT:    movd {{.*#+}} xmm1 = mem[0],zero,zero,zero
 ; X32-SSE-NEXT:    pinsrb $4, {{[0-9]+}}(%esp), %xmm1
 ; X32-SSE-NEXT:    pinsrb $8, {{[0-9]+}}(%esp), %xmm1
 ; X32-SSE-NEXT:    por %xmm0, %xmm1
diff --git a/test/CodeGen/X86/widen_conv-1.ll b/test/CodeGen/X86/widen_conv-1.ll
index aa2cef4ff8140d850e90350bd2edfbc24634e014..a672e84fcde40f5ee01f0a2c2ec7688dc2c5fca6 100644
--- a/test/CodeGen/X86/widen_conv-1.ll
+++ b/test/CodeGen/X86/widen_conv-1.ll
@@ -38,7 +38,6 @@ define void @convert_v3i32_to_v3i8(<3 x i8>* %dst.addr, <3 x i32>* %src.addr) no
 ; X86-NEXT:    paddd {{\.LCPI.*}}, %xmm0
 ; X86-NEXT:    pextrb $8, %xmm0, 2(%eax)
 ; X86-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X86-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; X86-NEXT:    pextrw $0, %xmm0, (%eax)
 ; X86-NEXT:    popl %eax
 ; X86-NEXT:    retl
@@ -49,7 +48,6 @@ define void @convert_v3i32_to_v3i8(<3 x i8>* %dst.addr, <3 x i32>* %src.addr) no
 ; X64-NEXT:    paddd {{.*}}(%rip), %xmm0
 ; X64-NEXT:    pextrb $8, %xmm0, 2(%rdi)
 ; X64-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X64-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; X64-NEXT:    pextrw $0, %xmm0, (%rdi)
 ; X64-NEXT:    retq
 entry:
@@ -75,7 +73,6 @@ define void @convert_v5i16_to_v5i8(<5 x i8>* %dst.addr, <5 x i16>* %src.addr) no
 ; X86-NEXT:    paddw {{\.LCPI.*}}, %xmm0
 ; X86-NEXT:    pextrb $8, %xmm0, 4(%eax)
 ; X86-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; X86-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; X86-NEXT:    movd %xmm0, (%eax)
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
@@ -87,7 +84,6 @@ define void @convert_v5i16_to_v5i8(<5 x i8>* %dst.addr, <5 x i16>* %src.addr) no
 ; X64-NEXT:    paddw {{.*}}(%rip), %xmm0
 ; X64-NEXT:    pextrb $8, %xmm0, 4(%rdi)
 ; X64-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u]
-; X64-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; X64-NEXT:    movd %xmm0, (%rdi)
 ; X64-NEXT:    retq
 entry:
diff --git a/test/CodeGen/X86/widen_conv-3.ll b/test/CodeGen/X86/widen_conv-3.ll
index f2e29337e6ad0f4b5454a2fc17788d6f430e6715..504485440effffdd390524c838946942acfdd6b8 100644
--- a/test/CodeGen/X86/widen_conv-3.ll
+++ b/test/CodeGen/X86/widen_conv-3.ll
@@ -65,7 +65,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X86-SSE2-NEXT:    shll $8, %edx
 ; X86-SSE2-NEXT:    movzbl (%esp), %esi
 ; X86-SSE2-NEXT:    orl %edx, %esi
-; X86-SSE2-NEXT:    pinsrw $0, %esi, %xmm0
+; X86-SSE2-NEXT:    movd %esi, %xmm0
 ; X86-SSE2-NEXT:    movzbl 2(%ecx), %ecx
 ; X86-SSE2-NEXT:    pinsrw $1, %ecx, %xmm0
 ; X86-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -106,8 +106,6 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X64-SSE2:       # BB#0: # %entry
 ; X64-SSE2-NEXT:    movzwl (%rsi), %eax
 ; X64-SSE2-NEXT:    movd %rax, %xmm0
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; X64-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X64-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X64-SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
@@ -115,7 +113,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X64-SSE2-NEXT:    shll $8, %eax
 ; X64-SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; X64-SSE2-NEXT:    orl %eax, %ecx
-; X64-SSE2-NEXT:    pinsrw $0, %ecx, %xmm0
+; X64-SSE2-NEXT:    movd %ecx, %xmm0
 ; X64-SSE2-NEXT:    movzbl 2(%rsi), %eax
 ; X64-SSE2-NEXT:    pinsrw $1, %eax, %xmm0
 ; X64-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
@@ -132,8 +130,6 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X64-SSE42-NEXT:    movzbl 2(%rsi), %eax
 ; X64-SSE42-NEXT:    movzwl (%rsi), %ecx
 ; X64-SSE42-NEXT:    movd %rcx, %xmm0
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; X64-SSE42-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; X64-SSE42-NEXT:    pinsrd $2, %eax, %xmm0
 ; X64-SSE42-NEXT:    pslld $24, %xmm0
diff --git a/test/CodeGen/X86/widen_conv-4.ll b/test/CodeGen/X86/widen_conv-4.ll
index 90c4bbe6bb70578a36020fe21e4429512c5cbd62..ef56692e947cebff4cc3ad5dbb1c91aae0da734b 100644
--- a/test/CodeGen/X86/widen_conv-4.ll
+++ b/test/CodeGen/X86/widen_conv-4.ll
@@ -91,7 +91,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X86-SSE2-NEXT:    shll $8, %edx
 ; X86-SSE2-NEXT:    movzbl (%esp), %esi
 ; X86-SSE2-NEXT:    orl %edx, %esi
-; X86-SSE2-NEXT:    pinsrw $0, %esi, %xmm0
+; X86-SSE2-NEXT:    movd %esi, %xmm0
 ; X86-SSE2-NEXT:    movzbl 2(%ecx), %ecx
 ; X86-SSE2-NEXT:    pinsrw $1, %ecx, %xmm0
 ; X86-SSE2-NEXT:    pxor %xmm1, %xmm1
@@ -131,8 +131,6 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X64-SSE2:       # BB#0: # %entry
 ; X64-SSE2-NEXT:    movzwl (%rsi), %eax
 ; X64-SSE2-NEXT:    movd %rax, %xmm0
-; X64-SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE2-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; X64-SSE2-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; X64-SSE2-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
 ; X64-SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
@@ -140,7 +138,7 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X64-SSE2-NEXT:    shll $8, %eax
 ; X64-SSE2-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; X64-SSE2-NEXT:    orl %eax, %ecx
-; X64-SSE2-NEXT:    pinsrw $0, %ecx, %xmm0
+; X64-SSE2-NEXT:    movd %ecx, %xmm0
 ; X64-SSE2-NEXT:    movzbl 2(%rsi), %eax
 ; X64-SSE2-NEXT:    pinsrw $1, %eax, %xmm0
 ; X64-SSE2-NEXT:    pxor %xmm1, %xmm1
@@ -157,8 +155,6 @@ define void @convert_v3i8_to_v3f32(<3 x float>* %dst.addr, <3 x i8>* %src.addr)
 ; X64-SSE42-NEXT:    movzbl 2(%rsi), %eax
 ; X64-SSE42-NEXT:    movzwl (%rsi), %ecx
 ; X64-SSE42-NEXT:    movd %rcx, %xmm0
-; X64-SSE42-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
-; X64-SSE42-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; X64-SSE42-NEXT:    pmovzxbd {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero
 ; X64-SSE42-NEXT:    pinsrd $2, %eax, %xmm0
 ; X64-SSE42-NEXT:    pand {{.*}}(%rip), %xmm0
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index 61297cc11d32cc481f7a15570d0ebe510e296400..9fc0805b899cddb7638cdd48c0fd5a4b3e888e5f 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -164,8 +164,7 @@ define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp
 ; X86-NEXT:    paddd %xmm0, %xmm1
 ; X86-NEXT:    pextrw $4, %xmm1, 4(%eax)
 ; X86-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; X86-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
-; X86-NEXT:    movd %xmm0, (%eax)
+; X86-NEXT:    movd %xmm1, (%eax)
 ; X86-NEXT:    movl %ebp, %esp
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl $4
@@ -177,8 +176,7 @@ define void @add3i16(%i16vec3* nocapture sret %ret, %i16vec3* %ap, %i16vec3* %bp
 ; X64-NEXT:    paddd %xmm0, %xmm1
 ; X64-NEXT:    pextrw $4, %xmm1, 4(%rdi)
 ; X64-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
-; X64-NEXT:    pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero
-; X64-NEXT:    movd %xmm0, (%rdi)
+; X64-NEXT:    movd %xmm1, (%rdi)
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
 	%a = load %i16vec3, %i16vec3* %ap, align 16
@@ -301,8 +299,7 @@ define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) no
 ; X86-NEXT:    paddd %xmm0, %xmm1
 ; X86-NEXT:    pextrb $8, %xmm1, 2(%eax)
 ; X86-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X86-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; X86-NEXT:    pextrw $0, %xmm0, (%eax)
+; X86-NEXT:    pextrw $0, %xmm1, (%eax)
 ; X86-NEXT:    addl $12, %esp
 ; X86-NEXT:    retl $4
 ;
@@ -313,8 +310,7 @@ define void @add3i8(%i8vec3* nocapture sret %ret, %i8vec3* %ap, %i8vec3* %bp) no
 ; X64-NEXT:    paddd %xmm0, %xmm1
 ; X64-NEXT:    pextrb $8, %xmm1, 2(%rdi)
 ; X64-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X64-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
-; X64-NEXT:    pextrw $0, %xmm0, (%rdi)
+; X64-NEXT:    pextrw $0, %xmm1, (%rdi)
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
 	%a = load %i8vec3, %i8vec3* %ap, align 16
@@ -372,38 +368,36 @@ define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pa
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-NEXT:    movdqa {{.*#+}} xmm0 = [40606,0,158,0]
 ; X86-NEXT:    pextrw $0, %xmm0, (%edx)
 ; X86-NEXT:    movb $-98, 2(%edx)
-; X86-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X86-NEXT:    movdqa {{.*#+}} xmm0 = [257,0,1,0]
 ; X86-NEXT:    pextrw $0, %xmm0, (%ecx)
 ; X86-NEXT:    movb $1, 2(%ecx)
 ; X86-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X86-NEXT:    movdqa %xmm0, %xmm1
 ; X86-NEXT:    psrld $1, %xmm1
-; X86-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; X86-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; X86-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; X86-NEXT:    pextrb $8, %xmm1, 2(%eax)
-; X86-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X86-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; X86-NEXT:    pextrw $0, %xmm0, (%eax)
 ; X86-NEXT:    addl $16, %esp
 ; X86-NEXT:    retl $4
 ;
 ; X64-LABEL: rot:
 ; X64:       # BB#0: # %entry
-; X64-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-NEXT:    movdqa {{.*#+}} xmm0 = [40606,158]
 ; X64-NEXT:    pextrw $0, %xmm0, (%rsi)
 ; X64-NEXT:    movb $-98, 2(%rsi)
-; X64-NEXT:    pmovzxwq {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero
+; X64-NEXT:    movdqa {{.*#+}} xmm0 = [257,1]
 ; X64-NEXT:    pextrw $0, %xmm0, (%rdx)
 ; X64-NEXT:    movb $1, 2(%rdx)
 ; X64-NEXT:    pmovzxbd {{.*#+}} xmm0 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
 ; X64-NEXT:    movdqa %xmm0, %xmm1
 ; X64-NEXT:    psrld $1, %xmm1
-; X64-NEXT:    pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; X64-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1,2,3,4,5],xmm0[6,7]
+; X64-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; X64-NEXT:    pextrb $8, %xmm1, 2(%rdi)
-; X64-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
-; X64-NEXT:    pmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero
 ; X64-NEXT:    pextrw $0, %xmm0, (%rdi)
 ; X64-NEXT:    movq %rdi, %rax
 ; X64-NEXT:    retq
diff --git a/test/CodeGen/X86/widened-broadcast.ll b/test/CodeGen/X86/widened-broadcast.ll
index 5bce383d9bf18f6f9b206f928ae657d3ee180d20..900a7546f15b9d5cd24888b35d19831470755486 100644
--- a/test/CodeGen/X86/widened-broadcast.ll
+++ b/test/CodeGen/X86/widened-broadcast.ll
@@ -51,14 +51,12 @@ define <8 x float> @load_splat_8f32_4f32_01010101(<4 x float>* %ptr) nounwind uw
 ;
 ; AVX2-LABEL: load_splat_8f32_4f32_01010101:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_8f32_4f32_01010101:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <4 x float>, <4 x float>* %ptr
@@ -131,14 +129,12 @@ define <8 x i32> @load_splat_8i32_4i32_01010101(<4 x i32>* %ptr) nounwind uwtabl
 ;
 ; AVX2-LABEL: load_splat_8i32_4i32_01010101:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_8i32_4i32_01010101:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <4 x i32>, <4 x i32>* %ptr
@@ -242,14 +238,12 @@ define <16 x i16> @load_splat_16i16_8i16_0101010101010101(<8 x i16>* %ptr) nounw
 ;
 ; AVX2-LABEL: load_splat_16i16_8i16_0101010101010101:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_16i16_8i16_0101010101010101:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX512-NEXT:    vbroadcastss (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <8 x i16>, <8 x i16>* %ptr
@@ -272,14 +266,12 @@ define <16 x i16> @load_splat_16i16_8i16_0123012301230123(<8 x i16>* %ptr) nounw
 ;
 ; AVX2-LABEL: load_splat_16i16_8i16_0123012301230123:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_16i16_8i16_0123012301230123:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <8 x i16>, <8 x i16>* %ptr
@@ -442,14 +434,12 @@ define <32 x i8> @load_splat_32i8_16i8_01010101010101010101010101010101(<16 x i8
 ;
 ; AVX2-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX2-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX2-NEXT:    vpbroadcastw (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovdqa (%rdi), %xmm0
-; AVX512-NEXT:    vpbroadcastw %xmm0, %ymm0
+; AVX512-NEXT:    vpbroadcastw (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <16 x i8>, <16 x i8>* %ptr
@@ -472,14 +462,12 @@ define <32 x i8> @load_splat_32i8_16i8_01230123012301230123012301230123(<16 x i8
 ;
 ; AVX2-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX2-NEXT:    vbroadcastss (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_32i8_16i8_01230123012301230123012301230123:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vbroadcastss %xmm0, %ymm0
+; AVX512-NEXT:    vbroadcastss (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <16 x i8>, <16 x i8>* %ptr
@@ -502,14 +490,12 @@ define <32 x i8> @load_splat_32i8_16i8_01234567012345670123456701234567(<16 x i8
 ;
 ; AVX2-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
 ; AVX2:       # BB#0: # %entry
-; AVX2-NEXT:    vmovaps (%rdi), %xmm0
-; AVX2-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX2-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: load_splat_32i8_16i8_01234567012345670123456701234567:
 ; AVX512:       # BB#0: # %entry
-; AVX512-NEXT:    vmovaps (%rdi), %xmm0
-; AVX512-NEXT:    vbroadcastsd %xmm0, %ymm0
+; AVX512-NEXT:    vbroadcastsd (%rdi), %ymm0
 ; AVX512-NEXT:    retq
 entry:
   %ld = load <16 x i8>, <16 x i8>* %ptr
diff --git a/test/CodeGen/X86/win-alloca-expander.ll b/test/CodeGen/X86/win-alloca-expander.ll
index 45ca3b214ab87a7d81439b8f302bd55c8aad3b9e..4b6e3bb18e60142a2fe51880d3cbe7d95cc41647 100644
--- a/test/CodeGen/X86/win-alloca-expander.ll
+++ b/test/CodeGen/X86/win-alloca-expander.ll
@@ -115,34 +115,36 @@ define void @cfg(i1 %x, i1 %y) {
 ; Test that the blocks are analyzed in the correct order.
 ; CHECK-LABEL: cfg:
 entry:
-  br i1 %x, label %bb1, label %bb2
+  br i1 %x, label %bb1, label %bb3
 
 bb1:
   %p1 = alloca %struct.S
 ; CHECK: pushl %eax
 ; CHECK: subl $1020, %esp
-  br label %bb3
+  br label %bb4
+
 bb2:
-  %p2 = alloca %struct.T
+  %p5 = alloca %struct.T
 ; CHECK: pushl %eax
 ; CHECK: subl $2996, %esp
-  br label %bb3
+  call void @g(%struct.T* %p5)
+  ret void
 
 bb3:
-  br i1 %y, label %bb4, label %bb5
+  %p2 = alloca %struct.T
+; CHECK: pushl %eax
+; CHECK: subl $2996, %esp
+  br label %bb4
 
 bb4:
+  br i1 %y, label %bb5, label %bb2
+
+bb5:
   %p4 = alloca %struct.S
 ; CHECK: subl $1024, %esp
   call void @f(%struct.S* %p4)
   ret void
 
-bb5:
-  %p5 = alloca %struct.T
-; CHECK: pushl %eax
-; CHECK: subl $2996, %esp
-  call void @g(%struct.T* %p5)
-  ret void
 }
 
 
diff --git a/test/CodeGen/X86/win32-eh.ll b/test/CodeGen/X86/win32-eh.ll
index 88403c687403f0fec55bc1bc032b7c1400068731..de8464e4f8b8afb8c2928576f85a150509df07ce 100644
--- a/test/CodeGen/X86/win32-eh.ll
+++ b/test/CodeGen/X86/win32-eh.ll
@@ -27,23 +27,26 @@ catch:
 
 ; CHECK-LABEL: _use_except_handler3:
 ; CHECK: pushl %ebp
-; CHECK: movl %esp, %ebp
-; CHECK: pushl %ebx
-; CHECK: pushl %edi
-; CHECK: pushl %esi
-; CHECK: subl ${{[0-9]+}}, %esp
-; CHECK: movl $-1, -16(%ebp)
-; CHECK: movl $L__ehtable$use_except_handler3, -20(%ebp)
-; CHECK: leal -28(%ebp), %[[node:[^ ,]*]]
-; CHECK: movl $__except_handler3, -24(%ebp)
-; CHECK: movl %fs:0, %[[next:[^ ,]*]]
-; CHECK: movl %[[next]], -28(%ebp)
-; CHECK: movl %[[node]], %fs:0
-; CHECK: calll _may_throw_or_crash
+; CHECK-NEXT: movl %esp, %ebp
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: subl ${{[0-9]+}}, %esp
+; CHECK-NEXT: movl %esp, -36(%ebp)
+; CHECK-NEXT: movl $-1, -16(%ebp)
+; CHECK-NEXT: movl $L__ehtable$use_except_handler3, -20(%ebp)
+; CHECK-NEXT: leal -28(%ebp), %[[node:[^ ,]*]]
+; CHECK-NEXT: movl $__except_handler3, -24(%ebp)
+; CHECK-NEXT: movl %fs:0, %[[next:[^ ,]*]]
+; CHECK-NEXT: movl %[[next]], -28(%ebp)
+; CHECK-NEXT: movl %[[node]], %fs:0
+; CHECK-NEXT: movl $0, -16(%ebp)
+; CHECK-NEXT: calll _may_throw_or_crash
+
 ; CHECK: movl -28(%ebp), %[[next:[^ ,]*]]
-; CHECK: movl %[[next]], %fs:0
+; CHECK-NEXT: movl %[[next]], %fs:0
 ; CHECK: retl
-; CHECK: LBB1_2: # %catch{{$}}
+; CHECK-NEXT: LBB1_2: # %catch{{$}}
 
 ; CHECK: .section .xdata,"dr"
 ; CHECK-LABEL: L__ehtable$use_except_handler3:
@@ -66,23 +69,37 @@ catch:
 
 ; CHECK-LABEL: _use_except_handler4:
 ; CHECK: pushl %ebp
-; CHECK: movl %esp, %ebp
-; CHECK: subl ${{[0-9]+}}, %esp
-; CHECK: movl %esp, -36(%ebp)
-; CHECK: movl $-2, -16(%ebp)
-; CHECK: movl $L__ehtable$use_except_handler4, %[[lsda:[^ ,]*]]
-; CHECK: xorl ___security_cookie, %[[lsda]]
-; CHECK: movl %[[lsda]], -20(%ebp)
-; CHECK: leal -28(%ebp), %[[node:[^ ,]*]]
-; CHECK: movl $__except_handler4, -24(%ebp)
-; CHECK: movl %fs:0, %[[next:[^ ,]*]]
-; CHECK: movl %[[next]], -28(%ebp)
-; CHECK: movl %[[node]], %fs:0
-; CHECK: calll _may_throw_or_crash
+; CHECK-NEXT: movl %esp, %ebp
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: subl ${{[0-9]+}}, %esp
+; CHECK-NEXT: movl %ebp, %eax
+; CHECK-NEXT: movl %esp, -36(%ebp)
+; CHECK-NEXT: movl $-2, -16(%ebp)
+; CHECK-NEXT: movl $L__ehtable$use_except_handler4, %[[lsda:[^ ,]*]]
+; CHECK-NEXT: movl ___security_cookie, %[[seccookie:[^ ,]*]]
+; CHECK-NEXT: xorl %[[seccookie]], %[[lsda]]
+; CHECK-NEXT: movl %[[lsda]], -20(%ebp)
+; CHECK-NEXT: xorl %[[seccookie]], %[[tmp1:[^ ,]*]]
+; CHECK-NEXT: movl %[[tmp1]], -40(%ebp)
+; CHECK-NEXT: leal -28(%ebp), %[[node:[^ ,]*]]
+; CHECK-NEXT: movl $__except_handler4, -24(%ebp)
+; CHECK-NEXT: movl %fs:0, %[[next:[^ ,]*]]
+; CHECK-NEXT: movl %[[next]], -28(%ebp)
+; CHECK-NEXT: movl %[[node]], %fs:0
+; CHECK-NEXT: movl $0, -16(%ebp)
+; CHECK-NEXT: calll _may_throw_or_crash
+
 ; CHECK: movl -28(%ebp), %[[next:[^ ,]*]]
-; CHECK: movl %[[next]], %fs:0
-; CHECK: retl
-; CHECK: LBB2_2: # %catch{{$}}
+; CHECK-NEXT: movl %[[next]], %fs:0
+; CHECK-NEXT: addl $28, %esp
+; CHECK-NEXT: popl %esi
+; CHECK-NEXT: popl %edi
+; CHECK-NEXT: popl %ebx
+; CHECK-NEXT: popl %ebp
+; CHECK-NEXT: retl
+; CHECK-NEXT: LBB2_2: # %catch{{$}}
 
 ; CHECK: .section .xdata,"dr"
 ; CHECK-LABEL: L__ehtable$use_except_handler4:
@@ -109,26 +126,33 @@ catch:
 
 ; CHECK-LABEL: _use_except_handler4_ssp:
 ; CHECK: pushl %ebp
-; CHECK: movl %esp, %ebp
-; CHECK: subl ${{[0-9]+}}, %esp
-; CHECK: movl %ebp, %[[ehguard:[^ ,]*]]
-; CHECK: movl %esp, -36(%ebp)
-; CHECK: movl $-2, -16(%ebp)
-; CHECK: movl $L__ehtable$use_except_handler4_ssp, %[[lsda:[^ ,]*]]
-; CHECK: xorl ___security_cookie, %[[lsda]]
-; CHECK: movl %[[lsda]], -20(%ebp)
-; CHECK: xorl ___security_cookie, %[[ehguard]]
-; CHECK: movl %[[ehguard]], -40(%ebp)
-; CHECK: leal -28(%ebp), %[[node:[^ ,]*]]
-; CHECK: movl $__except_handler4, -24(%ebp)
-; CHECK: movl %fs:0, %[[next:[^ ,]*]]
-; CHECK: movl %[[next]], -28(%ebp)
-; CHECK: movl %[[node]], %fs:0
-; CHECK: calll _may_throw_or_crash
+; CHECK-NEXT: movl %esp, %ebp
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: subl ${{[0-9]+}}, %esp
+; CHECK-NEXT: movl %ebp, %[[ehguard:[^ ,]*]]
+; CHECK-NEXT: movl %esp, -36(%ebp)
+; CHECK-NEXT: movl $-2, -16(%ebp)
+; CHECK-NEXT: movl $L__ehtable$use_except_handler4_ssp, %[[lsda:[^ ,]*]]
+; CHECK-NEXT: movl ___security_cookie, %[[seccookie:[^ ,]*]]
+; CHECK-NEXT: xorl %[[seccookie]], %[[lsda]]
+; CHECK-NEXT: movl %[[lsda]], -20(%ebp)
+; CHECK-NEXT: xorl %[[seccookie]], %[[ehguard]]
+; CHECK-NEXT: movl %[[ehguard]], -40(%ebp)
+; CHECK-NEXT: leal -28(%ebp), %[[node:[^ ,]*]]
+; CHECK-NEXT: movl $__except_handler4, -24(%ebp)
+; CHECK-NEXT: movl %fs:0, %[[next:[^ ,]*]]
+; CHECK-NEXT: movl %[[next]], -28(%ebp)
+; CHECK-NEXT: movl %[[node]], %fs:0
+; CHECK-NEXT: movl $0, -16(%ebp)
+; CHECK-NEXT: calll _may_throw_or_crash
 ; CHECK: movl -28(%ebp), %[[next:[^ ,]*]]
-; CHECK: movl %[[next]], %fs:0   
+; CHECK-NEXT: movl %[[next]], %fs:0   
 ; CHECK: retl
-; CHECK: [[catch:[^ ,]*]]: # %catch{{$}}
+; CHECK-NEXT: [[catch:[^ ,]*]]: # %catch{{$}}
+
+
 
 ; CHECK: .section .xdata,"dr"
 ; CHECK-LABEL: L__ehtable$use_except_handler4_ssp:
@@ -155,23 +179,26 @@ catch:
 
 ; CHECK-LABEL: _use_CxxFrameHandler3:
 ; CHECK: pushl %ebp
-; CHECK: movl %esp, %ebp
-; CHECK: subl ${{[0-9]+}}, %esp
-; CHECK: movl %esp, -28(%ebp)
-; CHECK: movl $-1, -16(%ebp)
-; CHECK: leal -24(%ebp), %[[node:[^ ,]*]]
-; CHECK: movl $___ehhandler$use_CxxFrameHandler3, -20(%ebp)
-; CHECK: movl %fs:0, %[[next:[^ ,]*]]
-; CHECK: movl %[[next]], -24(%ebp)
-; CHECK: movl %[[node]], %fs:0
-; CHECK: movl $0, -16(%ebp)
-; CHECK: calll _may_throw_or_crash
+; CHECK-NEXT: movl %esp, %ebp
+; CHECK-NEXT: pushl %ebx
+; CHECK-NEXT: pushl %edi
+; CHECK-NEXT: pushl %esi
+; CHECK-NEXT: subl ${{[0-9]+}}, %esp
+; CHECK-NEXT: movl %esp, -28(%ebp)
+; CHECK-NEXT: movl $-1, -16(%ebp)
+; CHECK-NEXT: leal -24(%ebp), %[[node:[^ ,]*]]
+; CHECK-NEXT: movl $___ehhandler$use_CxxFrameHandler3, -20(%ebp)
+; CHECK-NEXT: movl %fs:0, %[[next:[^ ,]*]]
+; CHECK-NEXT: movl %[[next]], -24(%ebp)
+; CHECK-NEXT: movl %[[node]], %fs:0
+; CHECK-NEXT: movl $0, -16(%ebp)
+; CHECK-NEXT: calll _may_throw_or_crash
 ; CHECK: movl -24(%ebp), %[[next:[^ ,]*]]
-; CHECK: movl %[[next]], %fs:0
+; CHECK-NEXT: movl %[[next]], %fs:0
 ; CHECK: retl
 
 ; CHECK: .section .xdata,"dr"
-; CHECK: .p2align 2
+; CHECK-NEXT: .p2align 2
 ; CHECK-LABEL: L__ehtable$use_CxxFrameHandler3:
 ; CHECK-NEXT:  .long   429065506
 ; CHECK-NEXT:  .long   2
@@ -185,8 +212,8 @@ catch:
 
 ; CHECK-LABEL: ___ehhandler$use_CxxFrameHandler3:
 ; CHECK: movl $L__ehtable$use_CxxFrameHandler3, %eax
-; CHECK: jmp  ___CxxFrameHandler3 # TAILCALL
+; CHECK-NEXT: jmp  ___CxxFrameHandler3 # TAILCALL
 
 ; CHECK: .safeseh __except_handler3
-; CHECK: .safeseh __except_handler4
-; CHECK: .safeseh ___ehhandler$use_CxxFrameHandler3
+; CHECK-NEXT: .safeseh __except_handler4
+; CHECK-NEXT: .safeseh ___ehhandler$use_CxxFrameHandler3
diff --git a/test/CodeGen/X86/win64_eh_leaf2.ll b/test/CodeGen/X86/win64_eh_leaf2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a840d948518b13098d58f5653bd8859094003bf5
--- /dev/null
+++ b/test/CodeGen/X86/win64_eh_leaf2.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -O1 -mtriple=x86_64-pc-win32 | FileCheck %s
+
+; Neither of these functions need .seh_ directives. We used to crash.
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+declare i32 @__CxxFrameHandler3(...)
+
+define void @f1() uwtable nounwind personality i32 (...)* @__CxxFrameHandler3 {
+  ret void
+}
+
+; CHECK-LABEL: f1:
+; CHECK-NOT: .seh_
+
+define void @f2() uwtable {
+  ret void
+}
+
+; CHECK-LABEL: f2:
+; CHECK-NOT: .seh_
diff --git a/test/CodeGen/X86/x32-va_start.ll b/test/CodeGen/X86/x32-va_start.ll
index a48468880507eeaef8ddcc983658fd4fe0d04880..7202a3fb4cdcaf9553d83106dd09b6c0602f09a6 100644
--- a/test/CodeGen/X86/x32-va_start.ll
+++ b/test/CodeGen/X86/x32-va_start.ll
@@ -24,7 +24,7 @@ define i32 @foo(float %a, i8* nocapture readnone %fmt, ...) nounwind {
 entry:
   %ap = alloca [1 x %struct.__va_list_tag], align 16
   %0 = bitcast [1 x %struct.__va_list_tag]* %ap to i8*
-  call void @llvm.lifetime.start(i64 16, i8* %0) #2
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %0) #2
   call void @llvm.va_start(i8* %0)
 ; SSE: subl $72, %esp
 ; SSE: testb %al, %al
@@ -79,14 +79,14 @@ vaarg.end:                                        ; preds = %vaarg.in_mem, %vaar
   %vaarg.addr = bitcast i8* %vaarg.addr.in to i32*
   %4 = load i32, i32* %vaarg.addr, align 4
   call void @llvm.va_end(i8* %0)
-  call void @llvm.lifetime.end(i64 16, i8* %0) #2
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* %0) #2
   ret i32 %4
 ; SSE: movl ([[ADDR]]), %eax
 ; SSE: retq
 }
 
 ; Function Attrs: nounwind argmemonly
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
 
 ; Function Attrs: nounwind
 declare void @llvm.va_start(i8*) nounwind
@@ -95,5 +95,5 @@ declare void @llvm.va_start(i8*) nounwind
 declare void @llvm.va_end(i8*) nounwind
 
 ; Function Attrs: nounwind argmemonly
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
 
diff --git a/test/CodeGen/X86/x86-64-intrcc-nosse.ll b/test/CodeGen/X86/x86-64-intrcc-nosse.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ab84088c34447ff0c7057827722ac747f99ae5a5
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-intrcc-nosse.ll
@@ -0,0 +1,20 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=-sse < %s | FileCheck %s
+
+%struct.interrupt_frame = type { i64, i64, i64, i64, i64 }
+
+@llvm.used = appending global [1 x i8*] [i8* bitcast (void (%struct.interrupt_frame*, i64)* @test_isr_sse_clobbers to i8*)], section "llvm.metadata"
+
+; Clobbered SSE must not be saved when the target doesn't support SSE
+define x86_intrcc void @test_isr_sse_clobbers(%struct.interrupt_frame* %frame, i64 %ecode) {
+  ; CHECK-LABEL: test_isr_sse_clobbers:
+  ; CHECK:       # BB#0:
+  ; CHECK-NEXT:    pushq %rax
+  ; CHECK-NEXT:    cld
+  ; CHECK-NEXT:    #APP
+  ; CHECK-NEXT:    #NO_APP
+  ; CHECK-NEXT:    addq $16, %rsp
+  ; CHECK-NEXT:    iretq
+  call void asm sideeffect "", "~{xmm0},~{xmm6}"()
+  ret void
+}
diff --git a/test/CodeGen/X86/x86-64-intrcc.ll b/test/CodeGen/X86/x86-64-intrcc.ll
index 2bcf3cde478a659a28b6beda70fb52c920bc5390..c8bc9e716ce54d62ba4497a1307e60ea6fcedc66 100644
--- a/test/CodeGen/X86/x86-64-intrcc.ll
+++ b/test/CodeGen/X86/x86-64-intrcc.ll
@@ -30,22 +30,24 @@ define x86_intrcc void @test_isr_no_ecode(%struct.interrupt_frame* %frame) {
 define x86_intrcc void @test_isr_ecode(%struct.interrupt_frame* %frame, i64 %ecode) {
   ; CHECK-LABEL: test_isr_ecode
   ; CHECK: pushq %rax
+  ; CHECK: pushq %rax
   ; CHECK: pushq %rcx
-  ; CHECK: movq 16(%rsp), %rax
-  ; CHECK: movq 40(%rsp), %rcx
+  ; CHECK: movq 24(%rsp), %rax
+  ; CHECK: movq 48(%rsp), %rcx
   ; CHECK: popq %rcx
   ; CHECK: popq %rax
-  ; CHECK: addq $8, %rsp
+  ; CHECK: addq $16, %rsp
   ; CHECK: iretq
   ; CHECK0-LABEL: test_isr_ecode
   ; CHECK0: pushq %rax
+  ; CHECK0: pushq %rax
   ; CHECK0: pushq %rcx
-  ; CHECK0: movq 16(%rsp), %rax
-  ; CHECK0: leaq 24(%rsp), %rcx
+  ; CHECK0: movq 24(%rsp), %rax
+  ; CHECK0: leaq 32(%rsp), %rcx
   ; CHECK0: movq 16(%rcx), %rcx
   ; CHECK0: popq %rcx
   ; CHECK0: popq %rax
-  ; CHECK0: addq $8, %rsp
+  ; CHECK0: addq $16, %rsp
   ; CHECK0: iretq
   %pflags = getelementptr inbounds %struct.interrupt_frame, %struct.interrupt_frame* %frame, i32 0, i32 2
   %flags = load i64, i64* %pflags, align 4
@@ -58,6 +60,7 @@ define x86_intrcc void @test_isr_clobbers(%struct.interrupt_frame* %frame, i64 %
   call void asm sideeffect "", "~{rax},~{rbx},~{rbp},~{r11},~{xmm0}"()
   ; CHECK-LABEL: test_isr_clobbers
   ; CHECK-SSE-NEXT: pushq %rax
+  ; CHECK-SSE-NEXT: pushq %rax
   ; CHECK-SSE-NEXT; pushq %r11
   ; CHECK-SSE-NEXT: pushq %rbp
   ; CHECK-SSE-NEXT: pushq %rbx
@@ -80,7 +83,7 @@ define x86_intrcc void @test_isr_clobbers(%struct.interrupt_frame* %frame, i64 %
   ; CHECK0-SSE-NEXT: popq %rbp
   ; CHECK0-SSE-NEXT: popq %r11
   ; CHECK0-SSE-NEXT: popq %rax
-  ; CHECK0-SSE-NEXT: addq $8, %rsp
+  ; CHECK0-SSE-NEXT: addq $16, %rsp
   ; CHECK0-SSE-NEXT: iretq
   ret void
 }
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index 1fc1b43b0402edee76d5a1de5ee4e29b8bc33179..6fbec91e77a37c19658febc6b985d0d420802b1e 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -53,17 +53,29 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
 }
 
 define <4 x double> @load_factorf64_1(<16 x double>* %ptr) {
-; AVX-LABEL: load_factorf64_1:
-; AVX:       # BB#0:
-; AVX-NEXT:    vmovupd (%rdi), %ymm0
-; AVX-NEXT:    vmovupd 32(%rdi), %ymm1
-; AVX-NEXT:    vmovupd 64(%rdi), %ymm2
-; AVX-NEXT:    vmovupd 96(%rdi), %ymm3
-; AVX-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
-; AVX-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    retq
+; AVX1-LABEL: load_factorf64_1:
+; AVX1:       # BB#0:
+; AVX1-NEXT:    vmovups (%rdi), %ymm0
+; AVX1-NEXT:    vmovups 32(%rdi), %ymm1
+; AVX1-NEXT:    vmovups 64(%rdi), %ymm2
+; AVX1-NEXT:    vmovups 96(%rdi), %ymm3
+; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX1-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX1-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: load_factorf64_1:
+; AVX2:       # BB#0:
+; AVX2-NEXT:    vmovupd (%rdi), %ymm0
+; AVX2-NEXT:    vmovupd 32(%rdi), %ymm1
+; AVX2-NEXT:    vmovupd 64(%rdi), %ymm2
+; AVX2-NEXT:    vmovupd 96(%rdi), %ymm3
+; AVX2-NEXT:    vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX2-NEXT:    vinsertf128 $1, %xmm3, %ymm1, %ymm1
+; AVX2-NEXT:    vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX2-NEXT:    vmulpd %ymm0, %ymm0, %ymm0
+; AVX2-NEXT:    retq
   %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
   %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
   %strided.v3 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
diff --git a/test/CodeGen/X86/xaluo.ll b/test/CodeGen/X86/xaluo.ll
index 57e6b4a50429080444efc83d7d84d7684e570615..25fd21d80c6011ac33a1601668f7da6c66c1d6be 100644
--- a/test/CodeGen/X86/xaluo.ll
+++ b/test/CodeGen/X86/xaluo.ll
@@ -1,16 +1,35 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-darwin-unknown                             < %s | FileCheck %s --check-prefix=CHECK --check-prefix=SDAG
-; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort=1 < %s | FileCheck %s --check-prefix=CHECK --check-prefix=FAST
+; RUN: llc -mtriple=x86_64-darwin-unknown < %s | FileCheck %s --check-prefix=SDAG
+; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort=1 < %s | FileCheck %s --check-prefix=FAST
 ; RUN: llc -mtriple=x86_64-darwin-unknown -mcpu=knl < %s | FileCheck %s --check-prefix=KNL
+
 ;
 ; Get the actual value of the overflow bit.
 ;
 ; SADDO reg, reg
-define zeroext i1 @saddo.i8(i8 signext %v1, i8 signext %v2, i8* %res) {
-entry:
-; CHECK-LABEL: saddo.i8
-; CHECK:       addb %sil, %dil
-; CHECK-NEXT:  seto %al
+define zeroext i1 @saddoi8(i8 signext %v1, i8 signext %v2, i8* %res) {
+; SDAG-LABEL: saddoi8:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    addb %sil, %dil
+; SDAG-NEXT:    seto %al
+; SDAG-NEXT:    movb %dil, (%rdx)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: saddoi8:
+; FAST:       ## BB#0:
+; FAST-NEXT:    addb %sil, %dil
+; FAST-NEXT:    seto %al
+; FAST-NEXT:    movb %dil, (%rdx)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: saddoi8:
+; KNL:       ## BB#0:
+; KNL-NEXT:    addb %sil, %dil
+; KNL-NEXT:    seto %al
+; KNL-NEXT:    movb %dil, (%rdx)
+; KNL-NEXT:    retq
   %t = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %v1, i8 %v2)
   %val = extractvalue {i8, i1} %t, 0
   %obit = extractvalue {i8, i1} %t, 1
@@ -18,11 +37,29 @@ entry:
   ret i1 %obit
 }
 
-define zeroext i1 @saddo.i16(i16 %v1, i16 %v2, i16* %res) {
-entry:
-; CHECK-LABEL: saddo.i16
-; CHECK:       addw %si, %di
-; CHECK-NEXT:  seto %al
+define zeroext i1 @saddoi16(i16 %v1, i16 %v2, i16* %res) {
+; SDAG-LABEL: saddoi16:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    addw %si, %di
+; SDAG-NEXT:    seto %al
+; SDAG-NEXT:    movw %di, (%rdx)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: saddoi16:
+; FAST:       ## BB#0:
+; FAST-NEXT:    addw %si, %di
+; FAST-NEXT:    seto %al
+; FAST-NEXT:    movw %di, (%rdx)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: saddoi16:
+; KNL:       ## BB#0:
+; KNL-NEXT:    addw %si, %di
+; KNL-NEXT:    seto %al
+; KNL-NEXT:    movw %di, (%rdx)
+; KNL-NEXT:    retq
   %t = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %v1, i16 %v2)
   %val = extractvalue {i16, i1} %t, 0
   %obit = extractvalue {i16, i1} %t, 1
@@ -30,11 +67,29 @@ entry:
   ret i1 %obit
 }
 
-define zeroext i1 @saddo.i32(i32 %v1, i32 %v2, i32* %res) {
-entry:
-; CHECK-LABEL: saddo.i32
-; CHECK:       addl %esi, %edi
-; CHECK-NEXT:  seto %al
+define zeroext i1 @saddoi32(i32 %v1, i32 %v2, i32* %res) {
+; SDAG-LABEL: saddoi32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    addl %esi, %edi
+; SDAG-NEXT:    seto %al
+; SDAG-NEXT:    movl %edi, (%rdx)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: saddoi32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    addl %esi, %edi
+; FAST-NEXT:    seto %al
+; FAST-NEXT:    movl %edi, (%rdx)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: saddoi32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    addl %esi, %edi
+; KNL-NEXT:    seto %al
+; KNL-NEXT:    movl %edi, (%rdx)
+; KNL-NEXT:    retq
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -42,11 +97,29 @@ entry:
   ret i1 %obit
 }
 
-define zeroext i1 @saddo.i64(i64 %v1, i64 %v2, i64* %res) {
-entry:
-; CHECK-LABEL: saddo.i64
-; CHECK:       addq %rsi, %rdi
-; CHECK-NEXT:  seto %al
+define zeroext i1 @saddoi64(i64 %v1, i64 %v2, i64* %res) {
+; SDAG-LABEL: saddoi64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    addq %rsi, %rdi
+; SDAG-NEXT:    seto %al
+; SDAG-NEXT:    movq %rdi, (%rdx)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: saddoi64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    addq %rsi, %rdi
+; FAST-NEXT:    seto %al
+; FAST-NEXT:    movq %rdi, (%rdx)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: saddoi64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    addq %rsi, %rdi
+; KNL-NEXT:    seto %al
+; KNL-NEXT:    movq %rdi, (%rdx)
+; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -55,11 +128,29 @@ entry:
 }
 
 ; SADDO reg, 1 | INC
-define zeroext i1 @saddo.inc.i8(i8 %v1, i8* %res) {
-entry:
-; CHECK-LABEL: saddo.inc.i8
-; CHECK:       incb %dil
-; CHECK-NEXT:  seto %al
+define zeroext i1 @saddoinci8(i8 %v1, i8* %res) {
+; SDAG-LABEL: saddoinci8:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    incb %dil
+; SDAG-NEXT:    seto %al
+; SDAG-NEXT:    movb %dil, (%rsi)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: saddoinci8:
+; FAST:       ## BB#0:
+; FAST-NEXT:    incb %dil
+; FAST-NEXT:    seto %al
+; FAST-NEXT:    movb %dil, (%rsi)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: saddoinci8:
+; KNL:       ## BB#0:
+; KNL-NEXT:    incb %dil
+; KNL-NEXT:    seto %al
+; KNL-NEXT:    movb %dil, (%rsi)
+; KNL-NEXT:    retq
   %t = call {i8, i1} @llvm.sadd.with.overflow.i8(i8 %v1, i8 1)
   %val = extractvalue {i8, i1} %t, 0
   %obit = extractvalue {i8, i1} %t, 1
@@ -67,11 +158,29 @@ entry:
   ret i1 %obit
 }
 
-define zeroext i1 @saddo.inc.i16(i16 %v1, i16* %res) {
-entry:
-; CHECK-LABEL: saddo.inc.i16
-; CHECK:       incw %di
-; CHECK-NEXT:  seto %al
+define zeroext i1 @saddoinci16(i16 %v1, i16* %res) {
+; SDAG-LABEL: saddoinci16:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    incw %di
+; SDAG-NEXT:    seto %al
+; SDAG-NEXT:    movw %di, (%rsi)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: saddoinci16:
+; FAST:       ## BB#0:
+; FAST-NEXT:    incw %di
+; FAST-NEXT:    seto %al
+; FAST-NEXT:    movw %di, (%rsi)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: saddoinci16:
+; KNL:       ## BB#0:
+; KNL-NEXT:    incw %di
+; KNL-NEXT:    seto %al
+; KNL-NEXT:    movw %di, (%rsi)
+; KNL-NEXT:    retq
   %t = call {i16, i1} @llvm.sadd.with.overflow.i16(i16 %v1, i16 1)
   %val = extractvalue {i16, i1} %t, 0
   %obit = extractvalue {i16, i1} %t, 1
@@ -79,11 +188,29 @@ entry:
   ret i1 %obit
 }
 
-define zeroext i1 @saddo.inc.i32(i32 %v1, i32* %res) {
-entry:
-; CHECK-LABEL: saddo.inc.i32
-; CHECK:       incl %edi
-; CHECK-NEXT:  seto %al
+define zeroext i1 @saddoinci32(i32 %v1, i32* %res) {
+; SDAG-LABEL: saddoinci32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    incl %edi
+; SDAG-NEXT:    seto %al
+; SDAG-NEXT:    movl %edi, (%rsi)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: saddoinci32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    incl %edi
+; FAST-NEXT:    seto %al
+; FAST-NEXT:    movl %edi, (%rsi)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: saddoinci32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    incl %edi
+; KNL-NEXT:    seto %al
+; KNL-NEXT:    movl %edi, (%rsi)
+; KNL-NEXT:    retq
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 1)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -91,11 +218,29 @@ entry:
   ret i1 %obit
 }
 
-define zeroext i1 @saddo.inc.i64(i64 %v1, i64* %res) {
-entry:
-; CHECK-LABEL: saddo.inc.i64
-; CHECK:       incq %rdi
-; CHECK-NEXT:  seto %al
+define zeroext i1 @saddoinci64(i64 %v1, i64* %res) {
+; SDAG-LABEL: saddoinci64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    incq %rdi
+; SDAG-NEXT:    seto %al
+; SDAG-NEXT:    movq %rdi, (%rsi)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: saddoinci64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    incq %rdi
+; FAST-NEXT:    seto %al
+; FAST-NEXT:    movq %rdi, (%rsi)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: saddoinci64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    incq %rdi
+; KNL-NEXT:    seto %al
+; KNL-NEXT:    movq %rdi, (%rsi)
+; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 1)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -105,15 +250,31 @@ entry:
 
 ; SADDO reg, imm | imm, reg
 ; FIXME: DAG doesn't optimize immediates on the LHS.
-define zeroext i1 @saddo.i64imm1(i64 %v1, i64* %res) {
-entry:
-; SDAG-LABEL: saddo.i64imm1
-; SDAG:       mov
-; SDAG-NEXT:  addq
-; SDAG-NEXT:  seto
-; FAST-LABEL: saddo.i64imm1
-; FAST:       addq $2, %rdi
-; FAST-NEXT:  seto %al
+define zeroext i1 @saddoi64imm1(i64 %v1, i64* %res) {
+; SDAG-LABEL: saddoi64imm1:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movl $2, %ecx
+; SDAG-NEXT:    addq %rdi, %rcx
+; SDAG-NEXT:    seto %al
+; SDAG-NEXT:    movq %rcx, (%rsi)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: saddoi64imm1:
+; FAST:       ## BB#0:
+; FAST-NEXT:    addq $2, %rdi
+; FAST-NEXT:    seto %al
+; FAST-NEXT:    movq %rdi, (%rsi)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: saddoi64imm1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movl $2, %ecx
+; KNL-NEXT:    addq %rdi, %rcx
+; KNL-NEXT:    seto %al
+; KNL-NEXT:    movq %rcx, (%rsi)
+; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 2, i64 %v1)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -122,11 +283,29 @@ entry:
 }
 
 ; Check boundary conditions for large immediates.
-define zeroext i1 @saddo.i64imm2(i64 %v1, i64* %res) {
-entry:
-; CHECK-LABEL: saddo.i64imm2
-; CHECK:       addq $-2147483648, %rdi
-; CHECK-NEXT:  seto %al
+define zeroext i1 @saddoi64imm2(i64 %v1, i64* %res) {
+; SDAG-LABEL: saddoi64imm2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    addq $-2147483648, %rdi ## imm = 0x80000000
+; SDAG-NEXT:    seto %al
+; SDAG-NEXT:    movq %rdi, (%rsi)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: saddoi64imm2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    addq $-2147483648, %rdi ## imm = 0x80000000
+; FAST-NEXT:    seto %al
+; FAST-NEXT:    movq %rdi, (%rsi)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: saddoi64imm2:
+; KNL:       ## BB#0:
+; KNL-NEXT:    addq $-2147483648, %rdi ## imm = 0x80000000
+; KNL-NEXT:    seto %al
+; KNL-NEXT:    movq %rdi, (%rsi)
+; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -2147483648)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -134,12 +313,32 @@ entry:
   ret i1 %obit
 }
 
-define zeroext i1 @saddo.i64imm3(i64 %v1, i64* %res) {
-entry:
-; CHECK-LABEL: saddo.i64imm3
-; CHECK:       movabsq $-21474836489, %[[REG:[a-z]+]]
-; CHECK-NEXT:  addq %rdi, %[[REG]]
-; CHECK-NEXT:  seto
+define zeroext i1 @saddoi64imm3(i64 %v1, i64* %res) {
+; SDAG-LABEL: saddoi64imm3:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movabsq $-21474836489, %rcx ## imm = 0xFFFFFFFAFFFFFFF7
+; SDAG-NEXT:    addq %rdi, %rcx
+; SDAG-NEXT:    seto %al
+; SDAG-NEXT:    movq %rcx, (%rsi)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: saddoi64imm3:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movabsq $-21474836489, %rax ## imm = 0xFFFFFFFAFFFFFFF7
+; FAST-NEXT:    addq %rdi, %rax
+; FAST-NEXT:    seto %cl
+; FAST-NEXT:    movq %rax, (%rsi)
+; FAST-NEXT:    andb $1, %cl
+; FAST-NEXT:    movzbl %cl, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: saddoi64imm3:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movabsq $-21474836489, %rcx ## imm = 0xFFFFFFFAFFFFFFF7
+; KNL-NEXT:    addq %rdi, %rcx
+; KNL-NEXT:    seto %al
+; KNL-NEXT:    movq %rcx, (%rsi)
+; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 -21474836489)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -147,11 +346,29 @@ entry:
   ret i1 %obit
 }
 
-define zeroext i1 @saddo.i64imm4(i64 %v1, i64* %res) {
-entry:
-; CHECK-LABEL: saddo.i64imm4
-; CHECK:       addq $2147483647, %rdi
-; CHECK-NEXT:  seto
+define zeroext i1 @saddoi64imm4(i64 %v1, i64* %res) {
+; SDAG-LABEL: saddoi64imm4:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    addq $2147483647, %rdi ## imm = 0x7FFFFFFF
+; SDAG-NEXT:    seto %al
+; SDAG-NEXT:    movq %rdi, (%rsi)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: saddoi64imm4:
+; FAST:       ## BB#0:
+; FAST-NEXT:    addq $2147483647, %rdi ## imm = 0x7FFFFFFF
+; FAST-NEXT:    seto %al
+; FAST-NEXT:    movq %rdi, (%rsi)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: saddoi64imm4:
+; KNL:       ## BB#0:
+; KNL-NEXT:    addq $2147483647, %rdi ## imm = 0x7FFFFFFF
+; KNL-NEXT:    seto %al
+; KNL-NEXT:    movq %rdi, (%rsi)
+; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483647)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -159,12 +376,32 @@ entry:
   ret i1 %obit
 }
 
-define zeroext i1 @saddo.i64imm5(i64 %v1, i64* %res) {
-entry:
-; CHECK-LABEL: saddo.i64imm5
-; CHECK:       movl $2147483648
-; CHECK:       addq %rdi
-; CHECK-NEXT:  seto
+define zeroext i1 @saddoi64imm5(i64 %v1, i64* %res) {
+; SDAG-LABEL: saddoi64imm5:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movl $2147483648, %ecx ## imm = 0x80000000
+; SDAG-NEXT:    addq %rdi, %rcx
+; SDAG-NEXT:    seto %al
+; SDAG-NEXT:    movq %rcx, (%rsi)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: saddoi64imm5:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movl $2147483648, %eax ## imm = 0x80000000
+; FAST-NEXT:    addq %rdi, %rax
+; FAST-NEXT:    seto %cl
+; FAST-NEXT:    movq %rax, (%rsi)
+; FAST-NEXT:    andb $1, %cl
+; FAST-NEXT:    movzbl %cl, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: saddoi64imm5:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movl $2147483648, %ecx ## imm = 0x80000000
+; KNL-NEXT:    addq %rdi, %rcx
+; KNL-NEXT:    seto %al
+; KNL-NEXT:    movq %rcx, (%rsi)
+; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 2147483648)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -173,11 +410,29 @@ entry:
 }
 
 ; UADDO
-define zeroext i1 @uaddo.i32(i32 %v1, i32 %v2, i32* %res) {
-entry:
-; CHECK-LABEL: uaddo.i32
-; CHECK:       addl %esi, %edi
-; CHECK-NEXT:  setb %al
+define zeroext i1 @uaddoi32(i32 %v1, i32 %v2, i32* %res) {
+; SDAG-LABEL: uaddoi32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    addl %esi, %edi
+; SDAG-NEXT:    setb %al
+; SDAG-NEXT:    movl %edi, (%rdx)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: uaddoi32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    addl %esi, %edi
+; FAST-NEXT:    setb %al
+; FAST-NEXT:    movl %edi, (%rdx)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: uaddoi32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    addl %esi, %edi
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    movl %edi, (%rdx)
+; KNL-NEXT:    retq
   %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -185,11 +440,29 @@ entry:
   ret i1 %obit
 }
 
-define zeroext i1 @uaddo.i64(i64 %v1, i64 %v2, i64* %res) {
-entry:
-; CHECK-LABEL: uaddo.i64
-; CHECK:       addq %rsi, %rdi
-; CHECK-NEXT:  setb %al
+define zeroext i1 @uaddoi64(i64 %v1, i64 %v2, i64* %res) {
+; SDAG-LABEL: uaddoi64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    addq %rsi, %rdi
+; SDAG-NEXT:    setb %al
+; SDAG-NEXT:    movq %rdi, (%rdx)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: uaddoi64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    addq %rsi, %rdi
+; FAST-NEXT:    setb %al
+; FAST-NEXT:    movq %rdi, (%rdx)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: uaddoi64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    addq %rsi, %rdi
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    movq %rdi, (%rdx)
+; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -198,10 +471,29 @@ entry:
 }
 
 ; UADDO reg, 1 | NOT INC
-define zeroext i1 @uaddo.inc.i8(i8 %v1, i8* %res) {
-entry:
-; CHECK-LABEL: uaddo.inc.i8
-; CHECK-NOT:   incb %dil
+define zeroext i1 @uaddoinci8(i8 %v1, i8* %res) {
+; SDAG-LABEL: uaddoinci8:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    addb $1, %dil
+; SDAG-NEXT:    setb %al
+; SDAG-NEXT:    movb %dil, (%rsi)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: uaddoinci8:
+; FAST:       ## BB#0:
+; FAST-NEXT:    addb $1, %dil
+; FAST-NEXT:    setb %al
+; FAST-NEXT:    movb %dil, (%rsi)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: uaddoinci8:
+; KNL:       ## BB#0:
+; KNL-NEXT:    addb $1, %dil
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    movb %dil, (%rsi)
+; KNL-NEXT:    retq
   %t = call {i8, i1} @llvm.uadd.with.overflow.i8(i8 %v1, i8 1)
   %val = extractvalue {i8, i1} %t, 0
   %obit = extractvalue {i8, i1} %t, 1
@@ -209,10 +501,29 @@ entry:
   ret i1 %obit
 }
 
-define zeroext i1 @uaddo.inc.i16(i16 %v1, i16* %res) {
-entry:
-; CHECK-LABEL: uaddo.inc.i16
-; CHECK-NOT:   incw %di
+define zeroext i1 @uaddoinci16(i16 %v1, i16* %res) {
+; SDAG-LABEL: uaddoinci16:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    addw $1, %di
+; SDAG-NEXT:    setb %al
+; SDAG-NEXT:    movw %di, (%rsi)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: uaddoinci16:
+; FAST:       ## BB#0:
+; FAST-NEXT:    addw $1, %di
+; FAST-NEXT:    setb %al
+; FAST-NEXT:    movw %di, (%rsi)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: uaddoinci16:
+; KNL:       ## BB#0:
+; KNL-NEXT:    addw $1, %di
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    movw %di, (%rsi)
+; KNL-NEXT:    retq
   %t = call {i16, i1} @llvm.uadd.with.overflow.i16(i16 %v1, i16 1)
   %val = extractvalue {i16, i1} %t, 0
   %obit = extractvalue {i16, i1} %t, 1
@@ -220,10 +531,29 @@ entry:
   ret i1 %obit
 }
 
-define zeroext i1 @uaddo.inc.i32(i32 %v1, i32* %res) {
-entry:
-; CHECK-LABEL: uaddo.inc.i32
-; CHECK-NOT:   incl %edi
+define zeroext i1 @uaddoinci32(i32 %v1, i32* %res) {
+; SDAG-LABEL: uaddoinci32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    addl $1, %edi
+; SDAG-NEXT:    setb %al
+; SDAG-NEXT:    movl %edi, (%rsi)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: uaddoinci32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    addl $1, %edi
+; FAST-NEXT:    setb %al
+; FAST-NEXT:    movl %edi, (%rsi)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: uaddoinci32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    addl $1, %edi
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    movl %edi, (%rsi)
+; KNL-NEXT:    retq
   %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 1)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -231,10 +561,29 @@ entry:
   ret i1 %obit
 }
 
-define zeroext i1 @uaddo.inc.i64(i64 %v1, i64* %res) {
-entry:
-; CHECK-LABEL: uaddo.inc.i64
-; CHECK-NOT:   incq %rdi
+define zeroext i1 @uaddoinci64(i64 %v1, i64* %res) {
+; SDAG-LABEL: uaddoinci64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    addq $1, %rdi
+; SDAG-NEXT:    setb %al
+; SDAG-NEXT:    movq %rdi, (%rsi)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: uaddoinci64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    addq $1, %rdi
+; FAST-NEXT:    setb %al
+; FAST-NEXT:    movq %rdi, (%rsi)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: uaddoinci64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    addq $1, %rdi
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    movq %rdi, (%rsi)
+; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 1)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -243,11 +592,29 @@ entry:
 }
 
 ; SSUBO
-define zeroext i1 @ssubo.i32(i32 %v1, i32 %v2, i32* %res) {
-entry:
-; CHECK-LABEL: ssubo.i32
-; CHECK:       subl %esi, %edi
-; CHECK-NEXT:  seto %al
+define zeroext i1 @ssuboi32(i32 %v1, i32 %v2, i32* %res) {
+; SDAG-LABEL: ssuboi32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    subl %esi, %edi
+; SDAG-NEXT:    seto %al
+; SDAG-NEXT:    movl %edi, (%rdx)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: ssuboi32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    subl %esi, %edi
+; FAST-NEXT:    seto %al
+; FAST-NEXT:    movl %edi, (%rdx)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: ssuboi32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    subl %esi, %edi
+; KNL-NEXT:    seto %al
+; KNL-NEXT:    movl %edi, (%rdx)
+; KNL-NEXT:    retq
   %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -255,11 +622,29 @@ entry:
   ret i1 %obit
 }
 
-define zeroext i1 @ssubo.i64(i64 %v1, i64 %v2, i64* %res) {
-entry:
-; CHECK-LABEL: ssubo.i64
-; CHECK:       subq %rsi, %rdi
-; CHECK-NEXT:  seto %al
+define zeroext i1 @ssuboi64(i64 %v1, i64 %v2, i64* %res) {
+; SDAG-LABEL: ssuboi64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    subq %rsi, %rdi
+; SDAG-NEXT:    seto %al
+; SDAG-NEXT:    movq %rdi, (%rdx)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: ssuboi64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    subq %rsi, %rdi
+; FAST-NEXT:    seto %al
+; FAST-NEXT:    movq %rdi, (%rdx)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: ssuboi64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    subq %rsi, %rdi
+; KNL-NEXT:    seto %al
+; KNL-NEXT:    movq %rdi, (%rdx)
+; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -268,11 +653,29 @@ entry:
 }
 
 ; USUBO
-define zeroext i1 @usubo.i32(i32 %v1, i32 %v2, i32* %res) {
-entry:
-; CHECK-LABEL: usubo.i32
-; CHECK:       subl %esi, %edi
-; CHECK-NEXT:  setb %al
+define zeroext i1 @usuboi32(i32 %v1, i32 %v2, i32* %res) {
+; SDAG-LABEL: usuboi32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    subl %esi, %edi
+; SDAG-NEXT:    setb %al
+; SDAG-NEXT:    movl %edi, (%rdx)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: usuboi32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    subl %esi, %edi
+; FAST-NEXT:    setb %al
+; FAST-NEXT:    movl %edi, (%rdx)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: usuboi32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    subl %esi, %edi
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    movl %edi, (%rdx)
+; KNL-NEXT:    retq
   %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -280,11 +683,29 @@ entry:
   ret i1 %obit
 }
 
-define zeroext i1 @usubo.i64(i64 %v1, i64 %v2, i64* %res) {
-entry:
-; CHECK-LABEL: usubo.i64
-; CHECK:       subq %rsi, %rdi
-; CHECK-NEXT:  setb %al
+define zeroext i1 @usuboi64(i64 %v1, i64 %v2, i64* %res) {
+; SDAG-LABEL: usuboi64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    subq %rsi, %rdi
+; SDAG-NEXT:    setb %al
+; SDAG-NEXT:    movq %rdi, (%rdx)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: usuboi64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    subq %rsi, %rdi
+; FAST-NEXT:    setb %al
+; FAST-NEXT:    movq %rdi, (%rdx)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: usuboi64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    subq %rsi, %rdi
+; KNL-NEXT:    setb %al
+; KNL-NEXT:    movq %rdi, (%rdx)
+; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -292,250 +713,277 @@ entry:
   ret i1 %obit
 }
 
-; SMULO
-define zeroext i1 @smulo.i8(i8 %v1, i8 %v2, i8* %res) {
-entry:
-; CHECK-LABEL:   smulo.i8
-; CHECK:         movl %edi, %eax
-; CHECK-NEXT:    imulb %sil
-; CHECK-NEXT:    seto %cl
-  %t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)
-  %val = extractvalue {i8, i1} %t, 0
-  %obit = extractvalue {i8, i1} %t, 1
-  store i8 %val, i8* %res
-  ret i1 %obit
-}
-
-define zeroext i1 @smulo.i16(i16 %v1, i16 %v2, i16* %res) {
-entry:
-; CHECK-LABEL: smulo.i16
-; CHECK:       imulw %si, %di
-; CHECK-NEXT:  seto %al
-  %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2)
-  %val = extractvalue {i16, i1} %t, 0
-  %obit = extractvalue {i16, i1} %t, 1
-  store i16 %val, i16* %res
-  ret i1 %obit
-}
-
-define zeroext i1 @smulo.i32(i32 %v1, i32 %v2, i32* %res) {
-entry:
-; CHECK-LABEL: smulo.i32
-; CHECK:       imull %esi, %edi
-; CHECK-NEXT:  seto %al
-  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  store i32 %val, i32* %res
-  ret i1 %obit
-}
-
-define zeroext i1 @smulo.i64(i64 %v1, i64 %v2, i64* %res) {
-entry:
-; CHECK-LABEL: smulo.i64
-; CHECK:       imulq %rsi, %rdi
-; CHECK-NEXT:  seto %al
-  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  store i64 %val, i64* %res
-  ret i1 %obit
-}
-
-; UMULO
-define zeroext i1 @umulo.i8(i8 %v1, i8 %v2, i8* %res) {
-entry:
-; CHECK-LABEL:   umulo.i8
-; CHECK:         movl %edi, %eax
-; CHECK-NEXT:    mulb %sil
-; CHECK-NEXT:    seto %cl
-  %t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2)
-  %val = extractvalue {i8, i1} %t, 0
-  %obit = extractvalue {i8, i1} %t, 1
-  store i8 %val, i8* %res
-  ret i1 %obit
-}
-
-define zeroext i1 @umulo.i16(i16 %v1, i16 %v2, i16* %res) {
-entry:
-; CHECK-LABEL: umulo.i16
-; CHECK:       mulw %si
-; CHECK-NEXT:  seto
-  %t = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %v1, i16 %v2)
-  %val = extractvalue {i16, i1} %t, 0
-  %obit = extractvalue {i16, i1} %t, 1
-  store i16 %val, i16* %res
-  ret i1 %obit
-}
-
-define zeroext i1 @umulo.i32(i32 %v1, i32 %v2, i32* %res) {
-entry:
-; CHECK-LABEL: umulo.i32
-; CHECK:       mull %esi
-; CHECK-NEXT:  seto
-  %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  store i32 %val, i32* %res
-  ret i1 %obit
-}
-
-define zeroext i1 @umulo.i64(i64 %v1, i64 %v2, i64* %res) {
-entry:
-; CHECK-LABEL: umulo.i64
-; CHECK:       mulq %rsi
-; CHECK-NEXT:  seto
-  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  store i64 %val, i64* %res
-  ret i1 %obit
-}
-
 ;
 ; Check the use of the overflow bit in combination with a select instruction.
 ;
-define i32 @saddo.select.i32(i32 %v1, i32 %v2) {
-entry:
-; CHECK-LABEL: saddo.select.i32
-; CHECK:       addl   %esi, %eax
-; CHECK-NEXT:  cmovol %edi, %esi
+define i32 @saddoselecti32(i32 %v1, i32 %v2) {
+; SDAG-LABEL: saddoselecti32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movl %edi, %eax
+; SDAG-NEXT:    addl %esi, %eax
+; SDAG-NEXT:    cmovol %edi, %esi
+; SDAG-NEXT:    movl %esi, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: saddoselecti32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movl %edi, %eax
+; FAST-NEXT:    addl %esi, %eax
+; FAST-NEXT:    cmovol %edi, %esi
+; FAST-NEXT:    movl %esi, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: saddoselecti32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movl %edi, %eax
+; KNL-NEXT:    addl %esi, %eax
+; KNL-NEXT:    cmovol %edi, %esi
+; KNL-NEXT:    movl %esi, %eax
+; KNL-NEXT:    retq
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
   %ret = select i1 %obit, i32 %v1, i32 %v2
   ret i32 %ret
 }
 
-define i64 @saddo.select.i64(i64 %v1, i64 %v2) {
-entry:
-; CHECK-LABEL: saddo.select.i64
-; CHECK:       addq   %rsi, %rax
-; CHECK-NEXT:  cmovoq %rdi, %rsi
+define i64 @saddoselecti64(i64 %v1, i64 %v2) {
+; SDAG-LABEL: saddoselecti64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movq %rdi, %rax
+; SDAG-NEXT:    addq %rsi, %rax
+; SDAG-NEXT:    cmovoq %rdi, %rsi
+; SDAG-NEXT:    movq %rsi, %rax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: saddoselecti64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movq %rdi, %rax
+; FAST-NEXT:    addq %rsi, %rax
+; FAST-NEXT:    cmovoq %rdi, %rsi
+; FAST-NEXT:    movq %rsi, %rax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: saddoselecti64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movq %rdi, %rax
+; KNL-NEXT:    addq %rsi, %rax
+; KNL-NEXT:    cmovoq %rdi, %rsi
+; KNL-NEXT:    movq %rsi, %rax
+; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
   %ret = select i1 %obit, i64 %v1, i64 %v2
   ret i64 %ret
 }
 
-define i32 @uaddo.select.i32(i32 %v1, i32 %v2) {
-entry:
-; CHECK-LABEL: uaddo.select.i32
-; CHECK:       addl   %esi, %eax
-; CHECK-NEXT:  cmovbl %edi, %esi
+define i32 @uaddoselecti32(i32 %v1, i32 %v2) {
+; SDAG-LABEL: uaddoselecti32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movl %edi, %eax
+; SDAG-NEXT:    addl %esi, %eax
+; SDAG-NEXT:    cmovbl %edi, %esi
+; SDAG-NEXT:    movl %esi, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: uaddoselecti32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movl %edi, %eax
+; FAST-NEXT:    addl %esi, %eax
+; FAST-NEXT:    cmovbl %edi, %esi
+; FAST-NEXT:    movl %esi, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: uaddoselecti32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movl %edi, %eax
+; KNL-NEXT:    addl %esi, %eax
+; KNL-NEXT:    cmovbl %edi, %esi
+; KNL-NEXT:    movl %esi, %eax
+; KNL-NEXT:    retq
   %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
   %ret = select i1 %obit, i32 %v1, i32 %v2
   ret i32 %ret
 }
 
-define i64 @uaddo.select.i64(i64 %v1, i64 %v2) {
-entry:
-; CHECK-LABEL: uaddo.select.i64
-; CHECK:       addq   %rsi, %rax
-; CHECK-NEXT:  cmovbq %rdi, %rsi
+define i64 @uaddoselecti64(i64 %v1, i64 %v2) {
+; SDAG-LABEL: uaddoselecti64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movq %rdi, %rax
+; SDAG-NEXT:    addq %rsi, %rax
+; SDAG-NEXT:    cmovbq %rdi, %rsi
+; SDAG-NEXT:    movq %rsi, %rax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: uaddoselecti64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movq %rdi, %rax
+; FAST-NEXT:    addq %rsi, %rax
+; FAST-NEXT:    cmovbq %rdi, %rsi
+; FAST-NEXT:    movq %rsi, %rax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: uaddoselecti64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movq %rdi, %rax
+; KNL-NEXT:    addq %rsi, %rax
+; KNL-NEXT:    cmovbq %rdi, %rsi
+; KNL-NEXT:    movq %rsi, %rax
+; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
   %ret = select i1 %obit, i64 %v1, i64 %v2
   ret i64 %ret
 }
 
-define i32 @ssubo.select.i32(i32 %v1, i32 %v2) {
-entry:
-; CHECK-LABEL: ssubo.select.i32
-; CHECK:       cmpl   %esi, %edi
-; CHECK-NEXT:  cmovol %edi, %esi
+define i32 @ssuboselecti32(i32 %v1, i32 %v2) {
+; SDAG-LABEL: ssuboselecti32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpl %esi, %edi
+; SDAG-NEXT:    cmovol %edi, %esi
+; SDAG-NEXT:    movl %esi, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: ssuboselecti32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    cmpl %esi, %edi
+; FAST-NEXT:    cmovol %edi, %esi
+; FAST-NEXT:    movl %esi, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: ssuboselecti32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    cmpl %esi, %edi
+; KNL-NEXT:    cmovol %edi, %esi
+; KNL-NEXT:    movl %esi, %eax
+; KNL-NEXT:    retq
   %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
   %ret = select i1 %obit, i32 %v1, i32 %v2
   ret i32 %ret
 }
 
-define i64 @ssubo.select.i64(i64 %v1, i64 %v2) {
-entry:
-; CHECK-LABEL: ssubo.select.i64
-; CHECK:       cmpq   %rsi, %rdi
-; CHECK-NEXT:  cmovoq %rdi, %rsi
+define i64 @ssuboselecti64(i64 %v1, i64 %v2) {
+; SDAG-LABEL: ssuboselecti64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpq %rsi, %rdi
+; SDAG-NEXT:    cmovoq %rdi, %rsi
+; SDAG-NEXT:    movq %rsi, %rax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: ssuboselecti64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    cmpq %rsi, %rdi
+; FAST-NEXT:    cmovoq %rdi, %rsi
+; FAST-NEXT:    movq %rsi, %rax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: ssuboselecti64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    cmpq %rsi, %rdi
+; KNL-NEXT:    cmovoq %rdi, %rsi
+; KNL-NEXT:    movq %rsi, %rax
+; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
   %ret = select i1 %obit, i64 %v1, i64 %v2
   ret i64 %ret
 }
 
-define i32 @usubo.select.i32(i32 %v1, i32 %v2) {
-entry:
-; CHECK-LABEL: usubo.select.i32
-; CHECK:       cmpl   %esi, %edi
-; CHECK-NEXT:  cmovbl %edi, %esi
+define i32 @usuboselecti32(i32 %v1, i32 %v2) {
+; SDAG-LABEL: usuboselecti32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpl %esi, %edi
+; SDAG-NEXT:    cmovbl %edi, %esi
+; SDAG-NEXT:    movl %esi, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: usuboselecti32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    cmpl %esi, %edi
+; FAST-NEXT:    cmovbl %edi, %esi
+; FAST-NEXT:    movl %esi, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: usuboselecti32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    cmpl %esi, %edi
+; KNL-NEXT:    cmovbl %edi, %esi
+; KNL-NEXT:    movl %esi, %eax
+; KNL-NEXT:    retq
   %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
   %obit = extractvalue {i32, i1} %t, 1
   %ret = select i1 %obit, i32 %v1, i32 %v2
   ret i32 %ret
 }
 
-define i64 @usubo.select.i64(i64 %v1, i64 %v2) {
-entry:
-; CHECK-LABEL: usubo.select.i64
-; CHECK:       cmpq   %rsi, %rdi
-; CHECK-NEXT:  cmovbq %rdi, %rsi
+define i64 @usuboselecti64(i64 %v1, i64 %v2) {
+; SDAG-LABEL: usuboselecti64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpq %rsi, %rdi
+; SDAG-NEXT:    cmovbq %rdi, %rsi
+; SDAG-NEXT:    movq %rsi, %rax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: usuboselecti64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    cmpq %rsi, %rdi
+; FAST-NEXT:    cmovbq %rdi, %rsi
+; FAST-NEXT:    movq %rsi, %rax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: usuboselecti64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    cmpq %rsi, %rdi
+; KNL-NEXT:    cmovbq %rdi, %rsi
+; KNL-NEXT:    movq %rsi, %rax
+; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
   %obit = extractvalue {i64, i1} %t, 1
   %ret = select i1 %obit, i64 %v1, i64 %v2
   ret i64 %ret
 }
 
-define i32 @smulo.select.i32(i32 %v1, i32 %v2) {
-entry:
-; CHECK-LABEL: smulo.select.i32
-; CHECK:       imull  %esi, %eax
-; CHECK-NEXT:  cmovol %edi, %esi
-  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
-  %obit = extractvalue {i32, i1} %t, 1
-  %ret = select i1 %obit, i32 %v1, i32 %v2
-  ret i32 %ret
-}
-
-define i64 @smulo.select.i64(i64 %v1, i64 %v2) {
-entry:
-; CHECK-LABEL: smulo.select.i64
-; CHECK:       imulq  %rsi, %rax
-; CHECK-NEXT:  cmovoq %rdi, %rsi
-  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
-  %obit = extractvalue {i64, i1} %t, 1
-  %ret = select i1 %obit, i64 %v1, i64 %v2
-  ret i64 %ret
-}
-
-define i32 @umulo.select.i32(i32 %v1, i32 %v2) {
-entry:
-; CHECK-LABEL: umulo.select.i32
-; CHECK:       mull   %esi
-; CHECK-NEXT:  cmovol %edi, %esi
-  %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
-  %obit = extractvalue {i32, i1} %t, 1
-  %ret = select i1 %obit, i32 %v1, i32 %v2
-  ret i32 %ret
-}
-
-define i64 @umulo.select.i64(i64 %v1, i64 %v2) {
-entry:
-; CHECK-LABEL: umulo.select.i64
-; CHECK:       mulq   %rsi
-; CHECK-NEXT:  cmovoq %rdi, %rsi
-  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
-  %obit = extractvalue {i64, i1} %t, 1
-  %ret = select i1 %obit, i64 %v1, i64 %v2
-  ret i64 %ret
-}
-
-
 ;
 ; Check the use of the overflow bit in combination with a branch instruction.
 ;
-define zeroext i1 @saddo.br.i32(i32 %v1, i32 %v2) {
-entry:
-; CHECK-LABEL: saddo.br.i32
-; CHECK:       addl   %esi, %edi
-; CHECK-NEXT:  jo
+define zeroext i1 @saddobri32(i32 %v1, i32 %v2) {
+; SDAG-LABEL: saddobri32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    addl %esi, %edi
+; SDAG-NEXT:    jo LBB31_1
+; SDAG-NEXT:  ## BB#2: ## %continue
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+; SDAG-NEXT:  LBB31_1: ## %overflow
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: saddobri32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    addl %esi, %edi
+; FAST-NEXT:    jo LBB31_1
+; FAST-NEXT:  ## BB#2: ## %continue
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+; FAST-NEXT:  LBB31_1: ## %overflow
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: saddobri32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    addl %esi, %edi
+; KNL-NEXT:    jo LBB31_1
+; KNL-NEXT:  ## BB#2: ## %continue
+; KNL-NEXT:    movb $1, %al
+; KNL-NEXT:    retq
+; KNL-NEXT:  LBB31_1: ## %overflow
+; KNL-NEXT:    xorl %eax, %eax
+; KNL-NEXT:    retq
   %t = call {i32, i1} @llvm.sadd.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -548,11 +996,43 @@ continue:
   ret i1 true
 }
 
-define zeroext i1 @saddo.br.i64(i64 %v1, i64 %v2) {
-entry:
-; CHECK-LABEL: saddo.br.i64
-; CHECK:       addq   %rsi, %rdi
-; CHECK-NEXT:  jo
+define zeroext i1 @saddobri64(i64 %v1, i64 %v2) {
+; SDAG-LABEL: saddobri64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    addq %rsi, %rdi
+; SDAG-NEXT:    jo LBB32_1
+; SDAG-NEXT:  ## BB#2: ## %continue
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+; SDAG-NEXT:  LBB32_1: ## %overflow
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: saddobri64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    addq %rsi, %rdi
+; FAST-NEXT:    jo LBB32_1
+; FAST-NEXT:  ## BB#2: ## %continue
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+; FAST-NEXT:  LBB32_1: ## %overflow
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: saddobri64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    addq %rsi, %rdi
+; KNL-NEXT:    jo LBB32_1
+; KNL-NEXT:  ## BB#2: ## %continue
+; KNL-NEXT:    movb $1, %al
+; KNL-NEXT:    retq
+; KNL-NEXT:  LBB32_1: ## %overflow
+; KNL-NEXT:    xorl %eax, %eax
+; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.sadd.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -565,11 +1045,43 @@ continue:
   ret i1 true
 }
 
-define zeroext i1 @uaddo.br.i32(i32 %v1, i32 %v2) {
-entry:
-; CHECK-LABEL: uaddo.br.i32
-; CHECK:       addl   %esi, %edi
-; CHECK-NEXT:  jb
+define zeroext i1 @uaddobri32(i32 %v1, i32 %v2) {
+; SDAG-LABEL: uaddobri32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    addl %esi, %edi
+; SDAG-NEXT:    jb LBB33_1
+; SDAG-NEXT:  ## BB#2: ## %continue
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+; SDAG-NEXT:  LBB33_1: ## %overflow
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: uaddobri32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    addl %esi, %edi
+; FAST-NEXT:    jb LBB33_1
+; FAST-NEXT:  ## BB#2: ## %continue
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+; FAST-NEXT:  LBB33_1: ## %overflow
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: uaddobri32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    addl %esi, %edi
+; KNL-NEXT:    jb LBB33_1
+; KNL-NEXT:  ## BB#2: ## %continue
+; KNL-NEXT:    movb $1, %al
+; KNL-NEXT:    retq
+; KNL-NEXT:  LBB33_1: ## %overflow
+; KNL-NEXT:    xorl %eax, %eax
+; KNL-NEXT:    retq
   %t = call {i32, i1} @llvm.uadd.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -582,11 +1094,43 @@ continue:
   ret i1 true
 }
 
-define zeroext i1 @uaddo.br.i64(i64 %v1, i64 %v2) {
-entry:
-; CHECK-LABEL: uaddo.br.i64
-; CHECK:       addq   %rsi, %rdi
-; CHECK-NEXT:  jb
+define zeroext i1 @uaddobri64(i64 %v1, i64 %v2) {
+; SDAG-LABEL: uaddobri64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    addq %rsi, %rdi
+; SDAG-NEXT:    jb LBB34_1
+; SDAG-NEXT:  ## BB#2: ## %continue
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+; SDAG-NEXT:  LBB34_1: ## %overflow
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: uaddobri64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    addq %rsi, %rdi
+; FAST-NEXT:    jb LBB34_1
+; FAST-NEXT:  ## BB#2: ## %continue
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+; FAST-NEXT:  LBB34_1: ## %overflow
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: uaddobri64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    addq %rsi, %rdi
+; KNL-NEXT:    jb LBB34_1
+; KNL-NEXT:  ## BB#2: ## %continue
+; KNL-NEXT:    movb $1, %al
+; KNL-NEXT:    retq
+; KNL-NEXT:  LBB34_1: ## %overflow
+; KNL-NEXT:    xorl %eax, %eax
+; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -599,11 +1143,43 @@ continue:
   ret i1 true
 }
 
-define zeroext i1 @ssubo.br.i32(i32 %v1, i32 %v2) {
-entry:
-; CHECK-LABEL: ssubo.br.i32
-; CHECK:       cmpl   %esi, %edi
-; CHECK-NEXT:  jo
+define zeroext i1 @ssubobri32(i32 %v1, i32 %v2) {
+; SDAG-LABEL: ssubobri32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpl %esi, %edi
+; SDAG-NEXT:    jo LBB35_1
+; SDAG-NEXT:  ## BB#2: ## %continue
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+; SDAG-NEXT:  LBB35_1: ## %overflow
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: ssubobri32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    cmpl %esi, %edi
+; FAST-NEXT:    jo LBB35_1
+; FAST-NEXT:  ## BB#2: ## %continue
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+; FAST-NEXT:  LBB35_1: ## %overflow
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: ssubobri32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    cmpl %esi, %edi
+; KNL-NEXT:    jo LBB35_1
+; KNL-NEXT:  ## BB#2: ## %continue
+; KNL-NEXT:    movb $1, %al
+; KNL-NEXT:    retq
+; KNL-NEXT:  LBB35_1: ## %overflow
+; KNL-NEXT:    xorl %eax, %eax
+; KNL-NEXT:    retq
   %t = call {i32, i1} @llvm.ssub.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -616,11 +1192,43 @@ continue:
   ret i1 true
 }
 
-define zeroext i1 @ssubo.br.i64(i64 %v1, i64 %v2) {
-entry:
-; CHECK-LABEL: ssubo.br.i64
-; CHECK:       cmpq   %rsi, %rdi
-; CHECK-NEXT:  jo
+define zeroext i1 @ssubobri64(i64 %v1, i64 %v2) {
+; SDAG-LABEL: ssubobri64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpq %rsi, %rdi
+; SDAG-NEXT:    jo LBB36_1
+; SDAG-NEXT:  ## BB#2: ## %continue
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+; SDAG-NEXT:  LBB36_1: ## %overflow
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: ssubobri64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    cmpq %rsi, %rdi
+; FAST-NEXT:    jo LBB36_1
+; FAST-NEXT:  ## BB#2: ## %continue
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+; FAST-NEXT:  LBB36_1: ## %overflow
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: ssubobri64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    cmpq %rsi, %rdi
+; KNL-NEXT:    jo LBB36_1
+; KNL-NEXT:  ## BB#2: ## %continue
+; KNL-NEXT:    movb $1, %al
+; KNL-NEXT:    retq
+; KNL-NEXT:  LBB36_1: ## %overflow
+; KNL-NEXT:    xorl %eax, %eax
+; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.ssub.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -633,11 +1241,43 @@ continue:
   ret i1 true
 }
 
-define zeroext i1 @usubo.br.i32(i32 %v1, i32 %v2) {
-entry:
-; CHECK-LABEL: usubo.br.i32
-; CHECK:       cmpl   %esi, %edi
-; CHECK-NEXT:  jb
+define zeroext i1 @usubobri32(i32 %v1, i32 %v2) {
+; SDAG-LABEL: usubobri32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpl %esi, %edi
+; SDAG-NEXT:    jb LBB37_1
+; SDAG-NEXT:  ## BB#2: ## %continue
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+; SDAG-NEXT:  LBB37_1: ## %overflow
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: usubobri32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    cmpl %esi, %edi
+; FAST-NEXT:    jb LBB37_1
+; FAST-NEXT:  ## BB#2: ## %continue
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+; FAST-NEXT:  LBB37_1: ## %overflow
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: usubobri32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    cmpl %esi, %edi
+; KNL-NEXT:    jb LBB37_1
+; KNL-NEXT:  ## BB#2: ## %continue
+; KNL-NEXT:    movb $1, %al
+; KNL-NEXT:    retq
+; KNL-NEXT:  LBB37_1: ## %overflow
+; KNL-NEXT:    xorl %eax, %eax
+; KNL-NEXT:    retq
   %t = call {i32, i1} @llvm.usub.with.overflow.i32(i32 %v1, i32 %v2)
   %val = extractvalue {i32, i1} %t, 0
   %obit = extractvalue {i32, i1} %t, 1
@@ -650,11 +1290,43 @@ continue:
   ret i1 true
 }
 
-define zeroext i1 @usubo.br.i64(i64 %v1, i64 %v2) {
-entry:
-; CHECK-LABEL: usubo.br.i64
-; CHECK:       cmpq   %rsi, %rdi
-; CHECK-NEXT:  jb
+define zeroext i1 @usubobri64(i64 %v1, i64 %v2) {
+; SDAG-LABEL: usubobri64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    cmpq %rsi, %rdi
+; SDAG-NEXT:    jb LBB38_1
+; SDAG-NEXT:  ## BB#2: ## %continue
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+; SDAG-NEXT:  LBB38_1: ## %overflow
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: usubobri64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    cmpq %rsi, %rdi
+; FAST-NEXT:    jb LBB38_1
+; FAST-NEXT:  ## BB#2: ## %continue
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+; FAST-NEXT:  LBB38_1: ## %overflow
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: usubobri64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    cmpq %rsi, %rdi
+; KNL-NEXT:    jb LBB38_1
+; KNL-NEXT:  ## BB#2: ## %continue
+; KNL-NEXT:    movb $1, %al
+; KNL-NEXT:    retq
+; KNL-NEXT:  LBB38_1: ## %overflow
+; KNL-NEXT:    xorl %eax, %eax
+; KNL-NEXT:    retq
   %t = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v2)
   %val = extractvalue {i64, i1} %t, 0
   %obit = extractvalue {i64, i1} %t, 1
@@ -667,102 +1339,70 @@ continue:
   ret i1 true
 }
 
-define zeroext i1 @smulo.br.i32(i32 %v1, i32 %v2) {
-entry:
-; CHECK-LABEL: smulo.br.i32
-; CHECK:       imull  %esi, %edi
-; CHECK-NEXT:  jo
-  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  br i1 %obit, label %overflow, label %continue, !prof !0
-
-overflow:
-  ret i1 false
-
-continue:
-  ret i1 true
-}
-
-define zeroext i1 @smulo.br.i64(i64 %v1, i64 %v2) {
-entry:
-; CHECK-LABEL: smulo.br.i64
-; CHECK:       imulq  %rsi, %rdi
-; CHECK-NEXT:  jo
-  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  br i1 %obit, label %overflow, label %continue, !prof !0
-
-overflow:
-  ret i1 false
-
-continue:
-  ret i1 true
-}
-
-define zeroext i1 @umulo.br.i32(i32 %v1, i32 %v2) {
-entry:
-; CHECK-LABEL: umulo.br.i32
-; CHECK:       mull  %esi
-; CHECK-NEXT:  jo
-  %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
-  %val = extractvalue {i32, i1} %t, 0
-  %obit = extractvalue {i32, i1} %t, 1
-  br i1 %obit, label %overflow, label %continue, !prof !0
-
-overflow:
-  ret i1 false
-
-continue:
-  ret i1 true
-}
-
-define zeroext i1 @umulo.br.i64(i64 %v1, i64 %v2) {
-entry:
-; CHECK-LABEL: umulo.br.i64
-; CHECK:       mulq  %rsi
-; CHECK-NEXT:  jo
-  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
-  %val = extractvalue {i64, i1} %t, 0
-  %obit = extractvalue {i64, i1} %t, 1
-  br i1 %obit, label %overflow, label %continue, !prof !0
-
-overflow:
-  ret i1 false
-
-continue:
-  ret i1 true
+define {i64, i1} @uaddoovf(i64 %a, i64 %b) {
+; SDAG-LABEL: uaddoovf:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movzbl %dil, %ecx
+; SDAG-NEXT:    movzbl %sil, %eax
+; SDAG-NEXT:    addq %rcx, %rax
+; SDAG-NEXT:    xorl %edx, %edx
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: uaddoovf:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movzbl %dil, %ecx
+; FAST-NEXT:    movzbl %sil, %eax
+; FAST-NEXT:    addq %rcx, %rax
+; FAST-NEXT:    xorl %edx, %edx
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: uaddoovf:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movzbl %dil, %ecx
+; KNL-NEXT:    movzbl %sil, %eax
+; KNL-NEXT:    addq %rcx, %rax
+; KNL-NEXT:    xorl %edx, %edx
+; KNL-NEXT:    retq
+  %1 = and i64 %a, 255
+  %2 = and i64 %b, 255
+  %t = call {i64, i1} @llvm.uadd.with.overflow.i64(i64 %1, i64 %2)
+  ret {i64, i1} %t
 }
 
-define i1 @bug27873(i64 %c1, i1 %c2) {
-; CHECK-LABEL: bug27873:
-; CHECK:       ## BB#0:
-; CHECK-NEXT:    movl $160, %ecx
-; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    mulq %rcx
-; CHECK-NEXT:    seto %al
-; CHECK-NEXT:    orb %sil, %al
-; CHECK-NEXT:    retq
+define {i64, i1} @usuboovf(i64 %a, i64 %b) {
+; SDAG-LABEL: usuboovf:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    notq %rsi
+; SDAG-NEXT:    xorl %edx, %edx
+; SDAG-NEXT:    movq %rsi, %rax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: usuboovf:
+; FAST:       ## BB#0:
+; FAST-NEXT:    notq %rsi
+; FAST-NEXT:    xorl %edx, %edx
+; FAST-NEXT:    movq %rsi, %rax
+; FAST-NEXT:    retq
 ;
-; KNL-LABEL: bug27873:
+; KNL-LABEL: usuboovf:
 ; KNL:       ## BB#0:
-; KNL-NEXT:    andl $1, %esi
-; KNL-NEXT:    kmovw %esi, %k0
-; KNL-NEXT:    movl $160, %ecx
-; KNL-NEXT:    movq %rdi, %rax
-; KNL-NEXT:    mulq %rcx
-; KNL-NEXT:    seto %al
-; KNL-NEXT:    andl $1, %eax
-; KNL-NEXT:    kmovw %eax, %k1
-; KNL-NEXT:    korw %k1, %k0, %k0
-; KNL-NEXT:    kmovw %k0, %eax
-; KNL-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT:    notq %rsi
+; KNL-NEXT:    xorl %edx, %edx
+; KNL-NEXT:    movq %rsi, %rax
 ; KNL-NEXT:    retq
-  %mul = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c1, i64 160)
-  %mul.overflow = extractvalue { i64, i1 } %mul, 1
-  %x1 = or i1 %c2, %mul.overflow
-  ret i1 %x1
+  %t0 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %a, i64 %a)
+  %v0 = extractvalue {i64, i1} %t0, 0
+  %o0 = extractvalue {i64, i1} %t0, 1
+  %t1 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 -1, i64 %b)
+  %v1 = extractvalue {i64, i1} %t1, 0
+  %o1 = extractvalue {i64, i1} %t1, 1
+  %oo = or i1 %o0, %o1
+  %t2 = call {i64, i1} @llvm.usub.with.overflow.i64(i64 %v1, i64 %v0)
+  %v2 = extractvalue {i64, i1} %t2, 0
+  %o2 = extractvalue {i64, i1} %t2, 1
+  %ooo = or i1 %oo, %o2
+  %t = insertvalue {i64, i1} %t2, i1 %ooo, 1
+  ret {i64, i1} %t
 }
 
 declare {i8,  i1} @llvm.sadd.with.overflow.i8 (i8,  i8 ) nounwind readnone
@@ -777,13 +1417,5 @@ declare {i32, i1} @llvm.ssub.with.overflow.i32(i32, i32) nounwind readnone
 declare {i64, i1} @llvm.ssub.with.overflow.i64(i64, i64) nounwind readnone
 declare {i32, i1} @llvm.usub.with.overflow.i32(i32, i32) nounwind readnone
 declare {i64, i1} @llvm.usub.with.overflow.i64(i64, i64) nounwind readnone
-declare {i8,  i1} @llvm.smul.with.overflow.i8 (i8,  i8 ) nounwind readnone
-declare {i16, i1} @llvm.smul.with.overflow.i16(i16, i16) nounwind readnone
-declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone
-declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone
-declare {i8,  i1} @llvm.umul.with.overflow.i8 (i8,  i8 ) nounwind readnone
-declare {i16, i1} @llvm.umul.with.overflow.i16(i16, i16) nounwind readnone
-declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone
-declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
 
 !0 = !{!"branch_weights", i32 0, i32 2147483647}
diff --git a/test/CodeGen/X86/xmulo.ll b/test/CodeGen/X86/xmulo.ll
index 76a7e72ca961b07a0398f423ff6a43a5209b55da..aed305058f0b66a2c2899cef7362b785412879b3 100644
--- a/test/CodeGen/X86/xmulo.ll
+++ b/test/CodeGen/X86/xmulo.ll
@@ -1,50 +1,742 @@
-; RUN: llc %s -o - | FileCheck %s
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
-target triple = "i386-apple-macosx10.8.0"
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=x86_64-darwin-unknown < %s | FileCheck %s --check-prefix=SDAG
+; RUN: llc -mtriple=x86_64-darwin-unknown -fast-isel -fast-isel-abort=1 < %s | FileCheck %s --check-prefix=FAST
+; RUN: llc -mtriple=x86_64-darwin-unknown -mcpu=knl < %s | FileCheck %s --check-prefix=KNL
 
-declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
-declare i32 @printf(i8*, ...)
-
-@.str = private unnamed_addr constant [10 x i8] c"%llx, %d\0A\00", align 1
-
-define i32 @t1() nounwind {
-; CHECK-LABEL: t1:
-; CHECK:  pushl $0
-; CHECK:  pushl $0
-; CHECK:  pushl $72
-
-    %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 8)
-    %2 = extractvalue {i64, i1} %1, 0
-    %3 = extractvalue {i64, i1} %1, 1
-    %4 = zext i1 %3 to i32
-    %5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
-    ret i32 0
-}
-
-define i32 @t2() nounwind {
-; CHECK-LABEL: t2:
-; CHECK:  pushl $0
-; CHECK:  pushl $0
-; CHECK:  pushl $0
-
-    %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 0)
-    %2 = extractvalue {i64, i1} %1, 0
-    %3 = extractvalue {i64, i1} %1, 1
-    %4 = zext i1 %3 to i32
-    %5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
-    ret i32 0
-}
-
-define i32 @t3() nounwind {
-; CHECK-LABEL: t3:
-; CHECK:  pushl $1
-; CHECK:  pushl $-1
-; CHECK:  pushl $-9
-
-    %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 -1)
-    %2 = extractvalue {i64, i1} %1, 0
-    %3 = extractvalue {i64, i1} %1, 1
-    %4 = zext i1 %3 to i32
-    %5 = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0), i64 %2, i32 %4)
-    ret i32 0
+define {i64, i1} @t1() nounwind {
+; SDAG-LABEL: t1:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movl $8, %ecx
+; SDAG-NEXT:    movl $9, %eax
+; SDAG-NEXT:    mulq %rcx
+; SDAG-NEXT:    seto %dl
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: t1:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movl $8, %ecx
+; FAST-NEXT:    movl $9, %eax
+; FAST-NEXT:    mulq %rcx
+; FAST-NEXT:    seto %dl
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: t1:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movl $8, %ecx
+; KNL-NEXT:    movl $9, %eax
+; KNL-NEXT:    mulq %rcx
+; KNL-NEXT:    seto %dl
+; KNL-NEXT:    retq
+  %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 8)
+  ret {i64, i1} %1
+}
+
+define {i64, i1} @t2() nounwind {
+; SDAG-LABEL: t2:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    xorl %ecx, %ecx
+; SDAG-NEXT:    movl $9, %eax
+; SDAG-NEXT:    mulq %rcx
+; SDAG-NEXT:    seto %dl
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: t2:
+; FAST:       ## BB#0:
+; FAST-NEXT:    xorl %ecx, %ecx
+; FAST-NEXT:    movl $9, %eax
+; FAST-NEXT:    mulq %rcx
+; FAST-NEXT:    seto %dl
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: t2:
+; KNL:       ## BB#0:
+; KNL-NEXT:    xorl %ecx, %ecx
+; KNL-NEXT:    movl $9, %eax
+; KNL-NEXT:    mulq %rcx
+; KNL-NEXT:    seto %dl
+; KNL-NEXT:    retq
+  %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 0)
+  ret {i64, i1} %1
+}
+
+define {i64, i1} @t3() nounwind {
+; SDAG-LABEL: t3:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movq $-1, %rcx
+; SDAG-NEXT:    movl $9, %eax
+; SDAG-NEXT:    mulq %rcx
+; SDAG-NEXT:    seto %dl
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: t3:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movq $-1, %rcx
+; FAST-NEXT:    movl $9, %eax
+; FAST-NEXT:    mulq %rcx
+; FAST-NEXT:    seto %dl
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: t3:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movq $-1, %rcx
+; KNL-NEXT:    movl $9, %eax
+; KNL-NEXT:    mulq %rcx
+; KNL-NEXT:    seto %dl
+; KNL-NEXT:    retq
+  %1 = call {i64, i1} @llvm.umul.with.overflow.i64(i64 9, i64 -1)
+  ret {i64, i1} %1
+}
+
+; SMULO - signed multiply with overflow (@llvm.smul.with.overflow.*)
+define zeroext i1 @smuloi8(i8 %v1, i8 %v2, i8* %res) {
+; SDAG-LABEL: smuloi8:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movl %edi, %eax
+; SDAG-NEXT:    imulb %sil
+; SDAG-NEXT:    seto %cl
+; SDAG-NEXT:    movb %al, (%rdx)
+; SDAG-NEXT:    movl %ecx, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: smuloi8:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movl %edi, %eax
+; FAST-NEXT:    imulb %sil
+; FAST-NEXT:    seto %cl
+; FAST-NEXT:    movb %al, (%rdx)
+; FAST-NEXT:    andb $1, %cl
+; FAST-NEXT:    movzbl %cl, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: smuloi8:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movl %edi, %eax
+; KNL-NEXT:    imulb %sil
+; KNL-NEXT:    seto %cl
+; KNL-NEXT:    movb %al, (%rdx)
+; KNL-NEXT:    movl %ecx, %eax
+; KNL-NEXT:    retq
+  %t = call {i8, i1} @llvm.smul.with.overflow.i8(i8 %v1, i8 %v2)
+  %val = extractvalue {i8, i1} %t, 0
+  %obit = extractvalue {i8, i1} %t, 1
+  store i8 %val, i8* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @smuloi16(i16 %v1, i16 %v2, i16* %res) {
+; SDAG-LABEL: smuloi16:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    imulw %si, %di
+; SDAG-NEXT:    seto %al
+; SDAG-NEXT:    movw %di, (%rdx)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: smuloi16:
+; FAST:       ## BB#0:
+; FAST-NEXT:    imulw %si, %di
+; FAST-NEXT:    seto %al
+; FAST-NEXT:    movw %di, (%rdx)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: smuloi16:
+; KNL:       ## BB#0:
+; KNL-NEXT:    imulw %si, %di
+; KNL-NEXT:    seto %al
+; KNL-NEXT:    movw %di, (%rdx)
+; KNL-NEXT:    retq
+  %t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2)
+  %val = extractvalue {i16, i1} %t, 0
+  %obit = extractvalue {i16, i1} %t, 1
+  store i16 %val, i16* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @smuloi32(i32 %v1, i32 %v2, i32* %res) {
+; SDAG-LABEL: smuloi32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    imull %esi, %edi
+; SDAG-NEXT:    seto %al
+; SDAG-NEXT:    movl %edi, (%rdx)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: smuloi32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    imull %esi, %edi
+; FAST-NEXT:    seto %al
+; FAST-NEXT:    movl %edi, (%rdx)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: smuloi32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    imull %esi, %edi
+; KNL-NEXT:    seto %al
+; KNL-NEXT:    movl %edi, (%rdx)
+; KNL-NEXT:    retq
+  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @smuloi64(i64 %v1, i64 %v2, i64* %res) {
+; SDAG-LABEL: smuloi64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    imulq %rsi, %rdi
+; SDAG-NEXT:    seto %al
+; SDAG-NEXT:    movq %rdi, (%rdx)
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: smuloi64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    imulq %rsi, %rdi
+; FAST-NEXT:    seto %al
+; FAST-NEXT:    movq %rdi, (%rdx)
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: smuloi64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    imulq %rsi, %rdi
+; KNL-NEXT:    seto %al
+; KNL-NEXT:    movq %rdi, (%rdx)
+; KNL-NEXT:    retq
+  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+; UMULO - unsigned multiply with overflow (@llvm.umul.with.overflow.*)
+define zeroext i1 @umuloi8(i8 %v1, i8 %v2, i8* %res) {
+; SDAG-LABEL: umuloi8:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movl %edi, %eax
+; SDAG-NEXT:    mulb %sil
+; SDAG-NEXT:    seto %cl
+; SDAG-NEXT:    movb %al, (%rdx)
+; SDAG-NEXT:    movl %ecx, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: umuloi8:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movl %edi, %eax
+; FAST-NEXT:    mulb %sil
+; FAST-NEXT:    seto %cl
+; FAST-NEXT:    movb %al, (%rdx)
+; FAST-NEXT:    andb $1, %cl
+; FAST-NEXT:    movzbl %cl, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: umuloi8:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movl %edi, %eax
+; KNL-NEXT:    mulb %sil
+; KNL-NEXT:    seto %cl
+; KNL-NEXT:    movb %al, (%rdx)
+; KNL-NEXT:    movl %ecx, %eax
+; KNL-NEXT:    retq
+  %t = call {i8, i1} @llvm.umul.with.overflow.i8(i8 %v1, i8 %v2)
+  %val = extractvalue {i8, i1} %t, 0
+  %obit = extractvalue {i8, i1} %t, 1
+  store i8 %val, i8* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @umuloi16(i16 %v1, i16 %v2, i16* %res) {
+; SDAG-LABEL: umuloi16:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movq %rdx, %rcx
+; SDAG-NEXT:    movl %edi, %eax
+; SDAG-NEXT:    mulw %si
+; SDAG-NEXT:    seto %dl
+; SDAG-NEXT:    movw %ax, (%rcx)
+; SDAG-NEXT:    movl %edx, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: umuloi16:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movq %rdx, %rcx
+; FAST-NEXT:    movl %edi, %eax
+; FAST-NEXT:    mulw %si
+; FAST-NEXT:    seto %dl
+; FAST-NEXT:    movw %ax, (%rcx)
+; FAST-NEXT:    andb $1, %dl
+; FAST-NEXT:    movzbl %dl, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: umuloi16:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movq %rdx, %rcx
+; KNL-NEXT:    movl %edi, %eax
+; KNL-NEXT:    mulw %si
+; KNL-NEXT:    seto %dl
+; KNL-NEXT:    movw %ax, (%rcx)
+; KNL-NEXT:    movl %edx, %eax
+; KNL-NEXT:    retq
+  %t = call {i16, i1} @llvm.umul.with.overflow.i16(i16 %v1, i16 %v2)
+  %val = extractvalue {i16, i1} %t, 0
+  %obit = extractvalue {i16, i1} %t, 1
+  store i16 %val, i16* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @umuloi32(i32 %v1, i32 %v2, i32* %res) {
+; SDAG-LABEL: umuloi32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movq %rdx, %rcx
+; SDAG-NEXT:    movl %edi, %eax
+; SDAG-NEXT:    mull %esi
+; SDAG-NEXT:    seto %dl
+; SDAG-NEXT:    movl %eax, (%rcx)
+; SDAG-NEXT:    movl %edx, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: umuloi32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movq %rdx, %rcx
+; FAST-NEXT:    movl %edi, %eax
+; FAST-NEXT:    mull %esi
+; FAST-NEXT:    seto %dl
+; FAST-NEXT:    movl %eax, (%rcx)
+; FAST-NEXT:    andb $1, %dl
+; FAST-NEXT:    movzbl %dl, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: umuloi32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movq %rdx, %rcx
+; KNL-NEXT:    movl %edi, %eax
+; KNL-NEXT:    mull %esi
+; KNL-NEXT:    seto %dl
+; KNL-NEXT:    movl %eax, (%rcx)
+; KNL-NEXT:    movl %edx, %eax
+; KNL-NEXT:    retq
+  %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  store i32 %val, i32* %res
+  ret i1 %obit
+}
+
+define zeroext i1 @umuloi64(i64 %v1, i64 %v2, i64* %res) {
+; SDAG-LABEL: umuloi64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movq %rdx, %rcx
+; SDAG-NEXT:    movq %rdi, %rax
+; SDAG-NEXT:    mulq %rsi
+; SDAG-NEXT:    seto %dl
+; SDAG-NEXT:    movq %rax, (%rcx)
+; SDAG-NEXT:    movl %edx, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: umuloi64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movq %rdx, %rcx
+; FAST-NEXT:    movq %rdi, %rax
+; FAST-NEXT:    mulq %rsi
+; FAST-NEXT:    seto %dl
+; FAST-NEXT:    movq %rax, (%rcx)
+; FAST-NEXT:    andb $1, %dl
+; FAST-NEXT:    movzbl %dl, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: umuloi64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movq %rdx, %rcx
+; KNL-NEXT:    movq %rdi, %rax
+; KNL-NEXT:    mulq %rsi
+; KNL-NEXT:    seto %dl
+; KNL-NEXT:    movq %rax, (%rcx)
+; KNL-NEXT:    movl %edx, %eax
+; KNL-NEXT:    retq
+  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  store i64 %val, i64* %res
+  ret i1 %obit
+}
+
+;
+; Check the use of the overflow bit in combination with a select instruction.
+;
+define i32 @smuloselecti32(i32 %v1, i32 %v2) {
+; SDAG-LABEL: smuloselecti32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movl %edi, %eax
+; SDAG-NEXT:    imull %esi, %eax
+; SDAG-NEXT:    cmovol %edi, %esi
+; SDAG-NEXT:    movl %esi, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: smuloselecti32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movl %edi, %eax
+; FAST-NEXT:    imull %esi, %eax
+; FAST-NEXT:    cmovol %edi, %esi
+; FAST-NEXT:    movl %esi, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: smuloselecti32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movl %edi, %eax
+; KNL-NEXT:    imull %esi, %eax
+; KNL-NEXT:    cmovol %edi, %esi
+; KNL-NEXT:    movl %esi, %eax
+; KNL-NEXT:    retq
+  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+  %obit = extractvalue {i32, i1} %t, 1
+  %ret = select i1 %obit, i32 %v1, i32 %v2
+  ret i32 %ret
+}
+
+define i64 @smuloselecti64(i64 %v1, i64 %v2) {
+; SDAG-LABEL: smuloselecti64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movq %rdi, %rax
+; SDAG-NEXT:    imulq %rsi, %rax
+; SDAG-NEXT:    cmovoq %rdi, %rsi
+; SDAG-NEXT:    movq %rsi, %rax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: smuloselecti64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movq %rdi, %rax
+; FAST-NEXT:    imulq %rsi, %rax
+; FAST-NEXT:    cmovoq %rdi, %rsi
+; FAST-NEXT:    movq %rsi, %rax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: smuloselecti64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movq %rdi, %rax
+; KNL-NEXT:    imulq %rsi, %rax
+; KNL-NEXT:    cmovoq %rdi, %rsi
+; KNL-NEXT:    movq %rsi, %rax
+; KNL-NEXT:    retq
+  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+  %obit = extractvalue {i64, i1} %t, 1
+  %ret = select i1 %obit, i64 %v1, i64 %v2
+  ret i64 %ret
 }
+
+define i32 @umuloselecti32(i32 %v1, i32 %v2) {
+; SDAG-LABEL: umuloselecti32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movl %edi, %eax
+; SDAG-NEXT:    mull %esi
+; SDAG-NEXT:    cmovol %edi, %esi
+; SDAG-NEXT:    movl %esi, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: umuloselecti32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movl %edi, %eax
+; FAST-NEXT:    mull %esi
+; FAST-NEXT:    cmovol %edi, %esi
+; FAST-NEXT:    movl %esi, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: umuloselecti32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movl %edi, %eax
+; KNL-NEXT:    mull %esi
+; KNL-NEXT:    cmovol %edi, %esi
+; KNL-NEXT:    movl %esi, %eax
+; KNL-NEXT:    retq
+  %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+  %obit = extractvalue {i32, i1} %t, 1
+  %ret = select i1 %obit, i32 %v1, i32 %v2
+  ret i32 %ret
+}
+
+define i64 @umuloselecti64(i64 %v1, i64 %v2) {
+; SDAG-LABEL: umuloselecti64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movq %rdi, %rax
+; SDAG-NEXT:    mulq %rsi
+; SDAG-NEXT:    cmovoq %rdi, %rsi
+; SDAG-NEXT:    movq %rsi, %rax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: umuloselecti64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movq %rdi, %rax
+; FAST-NEXT:    mulq %rsi
+; FAST-NEXT:    cmovoq %rdi, %rsi
+; FAST-NEXT:    movq %rsi, %rax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: umuloselecti64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movq %rdi, %rax
+; KNL-NEXT:    mulq %rsi
+; KNL-NEXT:    cmovoq %rdi, %rsi
+; KNL-NEXT:    movq %rsi, %rax
+; KNL-NEXT:    retq
+  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+  %obit = extractvalue {i64, i1} %t, 1
+  %ret = select i1 %obit, i64 %v1, i64 %v2
+  ret i64 %ret
+}
+
+;
+; Check the use of the overflow bit in combination with a branch instruction.
+;
+define zeroext i1 @smulobri32(i32 %v1, i32 %v2) {
+; SDAG-LABEL: smulobri32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    imull %esi, %edi
+; SDAG-NEXT:    jo LBB15_1
+; SDAG-NEXT:  ## BB#2: ## %continue
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+; SDAG-NEXT:  LBB15_1: ## %overflow
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: smulobri32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    imull %esi, %edi
+; FAST-NEXT:    jo LBB15_1
+; FAST-NEXT:  ## BB#2: ## %continue
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+; FAST-NEXT:  LBB15_1: ## %overflow
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: smulobri32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    imull %esi, %edi
+; KNL-NEXT:    jo LBB15_1
+; KNL-NEXT:  ## BB#2: ## %continue
+; KNL-NEXT:    movb $1, %al
+; KNL-NEXT:    retq
+; KNL-NEXT:  LBB15_1: ## %overflow
+; KNL-NEXT:    xorl %eax, %eax
+; KNL-NEXT:    retq
+  %t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue, !prof !0
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
+; SDAG-LABEL: smulobri64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    imulq %rsi, %rdi
+; SDAG-NEXT:    jo LBB16_1
+; SDAG-NEXT:  ## BB#2: ## %continue
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+; SDAG-NEXT:  LBB16_1: ## %overflow
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: smulobri64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    imulq %rsi, %rdi
+; FAST-NEXT:    jo LBB16_1
+; FAST-NEXT:  ## BB#2: ## %continue
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+; FAST-NEXT:  LBB16_1: ## %overflow
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: smulobri64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    imulq %rsi, %rdi
+; KNL-NEXT:    jo LBB16_1
+; KNL-NEXT:  ## BB#2: ## %continue
+; KNL-NEXT:    movb $1, %al
+; KNL-NEXT:    retq
+; KNL-NEXT:  LBB16_1: ## %overflow
+; KNL-NEXT:    xorl %eax, %eax
+; KNL-NEXT:    retq
+  %t = call {i64, i1} @llvm.smul.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue, !prof !0
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define zeroext i1 @umulobri32(i32 %v1, i32 %v2) {
+; SDAG-LABEL: umulobri32:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movl %edi, %eax
+; SDAG-NEXT:    mull %esi
+; SDAG-NEXT:    jo LBB17_1
+; SDAG-NEXT:  ## BB#2: ## %continue
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+; SDAG-NEXT:  LBB17_1: ## %overflow
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: umulobri32:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movl %edi, %eax
+; FAST-NEXT:    mull %esi
+; FAST-NEXT:    jo LBB17_1
+; FAST-NEXT:  ## BB#2: ## %continue
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+; FAST-NEXT:  LBB17_1: ## %overflow
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: umulobri32:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movl %edi, %eax
+; KNL-NEXT:    mull %esi
+; KNL-NEXT:    jo LBB17_1
+; KNL-NEXT:  ## BB#2: ## %continue
+; KNL-NEXT:    movb $1, %al
+; KNL-NEXT:    retq
+; KNL-NEXT:  LBB17_1: ## %overflow
+; KNL-NEXT:    xorl %eax, %eax
+; KNL-NEXT:    retq
+  %t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
+  %val = extractvalue {i32, i1} %t, 0
+  %obit = extractvalue {i32, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue, !prof !0
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define zeroext i1 @umulobri64(i64 %v1, i64 %v2) {
+; SDAG-LABEL: umulobri64:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movq %rdi, %rax
+; SDAG-NEXT:    mulq %rsi
+; SDAG-NEXT:    jo LBB18_1
+; SDAG-NEXT:  ## BB#2: ## %continue
+; SDAG-NEXT:    movb $1, %al
+; SDAG-NEXT:    retq
+; SDAG-NEXT:  LBB18_1: ## %overflow
+; SDAG-NEXT:    xorl %eax, %eax
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: umulobri64:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movq %rdi, %rax
+; FAST-NEXT:    mulq %rsi
+; FAST-NEXT:    jo LBB18_1
+; FAST-NEXT:  ## BB#2: ## %continue
+; FAST-NEXT:    movb $1, %al
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+; FAST-NEXT:  LBB18_1: ## %overflow
+; FAST-NEXT:    xorl %eax, %eax
+; FAST-NEXT:    andb $1, %al
+; FAST-NEXT:    movzbl %al, %eax
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: umulobri64:
+; KNL:       ## BB#0:
+; KNL-NEXT:    movq %rdi, %rax
+; KNL-NEXT:    mulq %rsi
+; KNL-NEXT:    jo LBB18_1
+; KNL-NEXT:  ## BB#2: ## %continue
+; KNL-NEXT:    movb $1, %al
+; KNL-NEXT:    retq
+; KNL-NEXT:  LBB18_1: ## %overflow
+; KNL-NEXT:    xorl %eax, %eax
+; KNL-NEXT:    retq
+  %t = call {i64, i1} @llvm.umul.with.overflow.i64(i64 %v1, i64 %v2)
+  %val = extractvalue {i64, i1} %t, 0
+  %obit = extractvalue {i64, i1} %t, 1
+  br i1 %obit, label %overflow, label %continue, !prof !0
+
+overflow:
+  ret i1 false
+
+continue:
+  ret i1 true
+}
+
+define i1 @bug27873(i64 %c1, i1 %c2) {
+; SDAG-LABEL: bug27873:
+; SDAG:       ## BB#0:
+; SDAG-NEXT:    movl $160, %ecx
+; SDAG-NEXT:    movq %rdi, %rax
+; SDAG-NEXT:    mulq %rcx
+; SDAG-NEXT:    seto %al
+; SDAG-NEXT:    orb %sil, %al
+; SDAG-NEXT:    retq
+;
+; FAST-LABEL: bug27873:
+; FAST:       ## BB#0:
+; FAST-NEXT:    movl $160, %ecx
+; FAST-NEXT:    movq %rdi, %rax
+; FAST-NEXT:    mulq %rcx
+; FAST-NEXT:    seto %al
+; FAST-NEXT:    orb %sil, %al
+; FAST-NEXT:    retq
+;
+; KNL-LABEL: bug27873:
+; KNL:       ## BB#0:
+; KNL-NEXT:    andl $1, %esi
+; KNL-NEXT:    movl $160, %ecx
+; KNL-NEXT:    movq %rdi, %rax
+; KNL-NEXT:    mulq %rcx
+; KNL-NEXT:    kmovw %esi, %k0
+; KNL-NEXT:    seto %al
+; KNL-NEXT:    andl $1, %eax
+; KNL-NEXT:    kmovw %eax, %k1
+; KNL-NEXT:    korw %k1, %k0, %k0
+; KNL-NEXT:    kmovw %k0, %eax
+; KNL-NEXT:    ## kill: %AL<def> %AL<kill> %EAX<kill>
+; KNL-NEXT:    retq
+  %mul = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 %c1, i64 160)
+  %mul.overflow = extractvalue { i64, i1 } %mul, 1
+  %x1 = or i1 %c2, %mul.overflow
+  ret i1 %x1
+}
+
+declare {i8,  i1} @llvm.smul.with.overflow.i8 (i8,  i8 ) nounwind readnone
+declare {i16, i1} @llvm.smul.with.overflow.i16(i16, i16) nounwind readnone
+declare {i32, i1} @llvm.smul.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.smul.with.overflow.i64(i64, i64) nounwind readnone
+declare {i8,  i1} @llvm.umul.with.overflow.i8 (i8,  i8 ) nounwind readnone
+declare {i16, i1} @llvm.umul.with.overflow.i16(i16, i16) nounwind readnone
+declare {i32, i1} @llvm.umul.with.overflow.i32(i32, i32) nounwind readnone
+declare {i64, i1} @llvm.umul.with.overflow.i64(i64, i64) nounwind readnone
+
+!0 = !{!"branch_weights", i32 0, i32 2147483647}
diff --git a/test/CodeGen/X86/xop-intrinsics-fast-isel.ll b/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
index a9287e7d8c9170fcc40e935ebf96bf6e49bf28d9..a100a1425dd11137d974f9cb2d84a23c89755602 100644
--- a/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
+++ b/test/CodeGen/X86/xop-intrinsics-fast-isel.ll
@@ -499,12 +499,22 @@ declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind
 define <4 x i64> @test_mm256_cmov_si256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
 ; X32-LABEL: test_mm256_cmov_si256:
 ; X32:       # BB#0:
-; X32-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; X32-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; X32-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; X32-NEXT:    vxorps %ymm3, %ymm2, %ymm3
+; X32-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X32-NEXT:    vandps %ymm3, %ymm1, %ymm1
+; X32-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: test_mm256_cmov_si256:
 ; X64:       # BB#0:
-; X64-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; X64-NEXT:    vpcmpeqd %xmm3, %xmm3, %xmm3
+; X64-NEXT:    vinsertf128 $1, %xmm3, %ymm3, %ymm3
+; X64-NEXT:    vxorps %ymm3, %ymm2, %ymm3
+; X64-NEXT:    vandps %ymm2, %ymm0, %ymm0
+; X64-NEXT:    vandps %ymm3, %ymm1, %ymm1
+; X64-NEXT:    vorps %ymm1, %ymm0, %ymm0
 ; X64-NEXT:    retq
   %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2)
   ret <4 x i64> %res
diff --git a/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll b/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll
index 6fba72f2681b003dc891ca6851660277b2eb94d8..2369beffb6b0b97516faf4ef57d406d4aaeecd6c 100644
--- a/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll
+++ b/test/CodeGen/X86/xop-intrinsics-x86_64-upgrade.ll
@@ -725,3 +725,42 @@ define <8 x i16> @test_int_x86_xop_vpcomtruew(<8 x i16> %a0, <8 x i16> %a1) {
   ret <8 x i16> %res
 }
 declare <8 x i16> @llvm.x86.xop.vpcomtruew(<8 x i16>, <8 x i16>) nounwind readnone
+
+define <2 x i64> @test_int_x86_xop_vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpcmov:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %res = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) ;
+  ret <2 x i64> %res
+}
+declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone
+
+define <4 x i64> @test_int_x86_xop_vpcmov_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpcmov_256:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) ;
+  ret <4 x i64> %res
+}
+define <4 x i64> @test_int_x86_xop_vpcmov_256_mr(<4 x i64> %a0, <4 x i64>* %a1, <4 x i64> %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpcmov_256_mr:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov %ymm1, (%rdi), %ymm0, %ymm0
+; CHECK-NEXT:    retq
+  %vec = load <4 x i64>, <4 x i64>* %a1
+  %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %vec, <4 x i64> %a2) ;
+  ret <4 x i64> %res
+}
+define <4 x i64> @test_int_x86_xop_vpcmov_256_rm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64>* %a2) {
+; CHECK-LABEL: test_int_x86_xop_vpcmov_256_rm:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vpcmov (%rdi), %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    retq
+ %vec = load <4 x i64>, <4 x i64>* %a2
+ %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %vec) ;
+  ret <4 x i64> %res
+}
+declare <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64>, <4 x i64>, <4 x i64>) nounwind readnone
+
diff --git a/test/CodeGen/X86/xop-intrinsics-x86_64.ll b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
index bb6ef50cdc6c34ee2a2f66e72692533167d0d924..76286a26ffa9a061fd69122b1bdc146b09b90808 100644
--- a/test/CodeGen/X86/xop-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/xop-intrinsics-x86_64.ll
@@ -82,18 +82,23 @@ define <2 x i64> @test_int_x86_xop_vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpcmov %xmm2, %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT:    retq
-  %res = call <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64> %a0, <2 x i64> %a1, <2 x i64> %a2) ;
-  ret <2 x i64> %res
+  %1 = xor <2 x i64> %a2, <i64 -1, i64 -1>
+  %2 = and <2 x i64> %a0, %a2
+  %3 = and <2 x i64> %a1, %1
+  %4 = or <2 x i64> %2, %3
+  ret <2 x i64> %4
 }
-declare <2 x i64> @llvm.x86.xop.vpcmov(<2 x i64>, <2 x i64>, <2 x i64>) nounwind readnone
 
 define <4 x i64> @test_int_x86_xop_vpcmov_256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) {
 ; CHECK-LABEL: test_int_x86_xop_vpcmov_256:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpcmov %ymm2, %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
-  %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %a2) ;
-  ret <4 x i64> %res
+  %1 = xor <4 x i64> %a2, <i64 -1, i64 -1, i64 -1, i64 -1>
+  %2 = and <4 x i64> %a0, %a2
+  %3 = and <4 x i64> %a1, %1
+  %4 = or <4 x i64> %2, %3
+  ret <4 x i64> %4
 }
 define <4 x i64> @test_int_x86_xop_vpcmov_256_mr(<4 x i64> %a0, <4 x i64>* %a1, <4 x i64> %a2) {
 ; CHECK-LABEL: test_int_x86_xop_vpcmov_256_mr:
@@ -101,19 +106,24 @@ define <4 x i64> @test_int_x86_xop_vpcmov_256_mr(<4 x i64> %a0, <4 x i64>* %a1,
 ; CHECK-NEXT:    vpcmov %ymm1, (%rdi), %ymm0, %ymm0
 ; CHECK-NEXT:    retq
   %vec = load <4 x i64>, <4 x i64>* %a1
-  %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %vec, <4 x i64> %a2) ;
-  ret <4 x i64> %res
+  %1 = xor <4 x i64> %a2, <i64 -1, i64 -1, i64 -1, i64 -1>
+  %2 = and <4 x i64> %a0, %a2
+  %3 = and <4 x i64> %vec, %1
+  %4 = or <4 x i64> %2, %3
+  ret <4 x i64> %4
 }
 define <4 x i64> @test_int_x86_xop_vpcmov_256_rm(<4 x i64> %a0, <4 x i64> %a1, <4 x i64>* %a2) {
 ; CHECK-LABEL: test_int_x86_xop_vpcmov_256_rm:
 ; CHECK:       # BB#0:
 ; CHECK-NEXT:    vpcmov (%rdi), %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT:    retq
- %vec = load <4 x i64>, <4 x i64>* %a2
- %res = call <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64> %a0, <4 x i64> %a1, <4 x i64> %vec) ;
-  ret <4 x i64> %res
+  %vec = load <4 x i64>, <4 x i64>* %a2
+  %1 = xor <4 x i64> %vec, <i64 -1, i64 -1, i64 -1, i64 -1>
+  %2 = and <4 x i64> %a0, %vec
+  %3 = and <4 x i64> %a1, %1
+  %4 = or <4 x i64> %2, %3
+  ret <4 x i64> %4
 }
-declare <4 x i64> @llvm.x86.xop.vpcmov.256(<4 x i64>, <4 x i64>, <4 x i64>) nounwind readnone
 
 define <4 x i32> @test_int_x86_xop_vphaddbd(<16 x i8> %a0) {
 ; CHECK-LABEL: test_int_x86_xop_vphaddbd:
diff --git a/test/CodeGen/X86/xop-mask-comments.ll b/test/CodeGen/X86/xop-mask-comments.ll
index 14c18c311a6f3df9bf879fcac7909b72e3da8f36..4ba47380f89a198da12688dfa2eb062893a34da5 100644
--- a/test/CodeGen/X86/xop-mask-comments.ll
+++ b/test/CodeGen/X86/xop-mask-comments.ll
@@ -95,15 +95,19 @@ define <16 x i8> @vpperm_shuffle_general(<16 x i8> %a0, <16 x i8> %a1) {
 ; VPERMIL2
 ;
 
+; Note: _mm_permute2_pd shouldn't be used for constant shuffles as there will always
+; be a quicker (and smaller) alternative.
 define <2 x double> @vpermil2pd_21(<2 x double> %a0, <2 x double> %a1) {
 ; X32-LABEL: vpermil2pd_21:
 ; X32:       # BB#0:
-; X32-NEXT:    vpermil2pd {{.*#+}} xmm0 = zero,xmm0[0]
+; X32-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X32-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: vpermil2pd_21:
 ; X64:       # BB#0:
-; X64-NEXT:    vpermil2pd {{.*#+}} xmm0 = zero,xmm0[0]
+; X64-NEXT:    vxorpd %xmm1, %xmm1, %xmm1
+; X64-NEXT:    vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0]
 ; X64-NEXT:    retq
   %1 = call <2 x double> @llvm.x86.xop.vpermil2pd(<2 x double> %a0, <2 x double> %a1, <2 x i64> <i64 10, i64 1>, i8 2)
   ret <2 x double> %1
diff --git a/test/CodeGen/X86/xor-combine-debugloc.ll b/test/CodeGen/X86/xor-combine-debugloc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..21777c1c572fb95e768cd007cb4ef23bbe040f39
--- /dev/null
+++ b/test/CodeGen/X86/xor-combine-debugloc.ll
@@ -0,0 +1,69 @@
+; RUN: llc -stop-after=expand-isel-pseudos < %s | FileCheck %s
+;
+; Make sure that when the entry block of the IR below is lowered, the
+; instruction that implicitly defines %eflags has the same debug location as
+; the icmp instruction, and the branch instructions have the same debug
+; location as the br instruction.
+; 
+; CHECK:      [[DLOC1:![0-9]+]] = !DILocation(line: 5, column: 9, scope: !{{[0-9]+}})
+; CHECK:      [[DLOC2:![0-9]+]] = !DILocation(line: 5, column: 7, scope: !{{[0-9]+}})
+; CHECK-DAG:  [[VREG1:%[^ ]+]] = COPY %esi
+; CHECK-DAG:  [[VREG2:%[^ ]+]] = COPY %edi
+; CHECK:      SUB32rr [[VREG2]], [[VREG1]], implicit-def %eflags, debug-location [[DLOC1]]
+; CHECK-NEXT: JE_1{{.*}} implicit %eflags, debug-location [[DLOC2]]
+; CHECK-NEXT: JMP_1{{.*}} debug-location [[DLOC2]]
+
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: nounwind uwtable
+define i32 @foo(i32 %x, i32 %y) !dbg !4 {
+entry:
+  tail call void @llvm.dbg.value(metadata i32 %x, i64 0, metadata !9, metadata !11), !dbg !12
+  tail call void @llvm.dbg.value(metadata i32 %y, i64 0, metadata !10, metadata !11), !dbg !13
+  %cmp = icmp ne i32 %x, %y, !dbg !14
+  br i1 %cmp, label %if.then, label %if.else, !dbg !16
+
+if.then:                                          ; preds = %entry
+  %call = tail call i32 (...) @bar() #3, !dbg !17
+  br label %return, !dbg !18
+
+if.else:                                          ; preds = %entry
+  %call1 = tail call i32 (...) @baz() #3, !dbg !19
+  br label %return, !dbg !20
+
+return:                                           ; preds = %if.else, %if.then
+  %retval.0 = phi i32 [ %call, %if.then ], [ %call1, %if.else ]
+  ret i32 %retval.0, !dbg !21
+}
+
+declare i32 @bar(...) 
+declare i32 @baz(...) 
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, emissionKind: FullDebug)
+!1 = !DIFile(filename: "foo.c", directory: "b/")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 4, type: !5, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !8)
+!5 = !DISubroutineType(types: !6)
+!6 = !{!7, !7, !7}
+!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!8 = !{!9, !10}
+!9 = !DILocalVariable(name: "x", arg: 1, scope: !4, file: !1, line: 4, type: !7)
+!10 = !DILocalVariable(name: "y", arg: 2, scope: !4, file: !1, line: 4, type: !7)
+!11 = !DIExpression()
+!12 = !DILocation(line: 4, column: 13, scope: !4)
+!13 = !DILocation(line: 4, column: 20, scope: !4)
+!14 = !DILocation(line: 5, column: 9, scope: !15)
+!15 = distinct !DILexicalBlock(scope: !4, file: !1, line: 5, column: 7)
+!16 = !DILocation(line: 5, column: 7, scope: !4)
+!17 = !DILocation(line: 6, column: 12, scope: !15)
+!18 = !DILocation(line: 6, column: 5, scope: !15)
+!19 = !DILocation(line: 8, column: 12, scope: !15)
+!20 = !DILocation(line: 8, column: 5, scope: !15)
+!21 = !DILocation(line: 9, column: 1, scope: !4)
diff --git a/test/CodeGen/X86/xray-log-args.ll b/test/CodeGen/X86/xray-log-args.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a551868ffb4a7e2f9be1a14951f626348519c879
--- /dev/null
+++ b/test/CodeGen/X86/xray-log-args.ll
@@ -0,0 +1,35 @@
+; When argument logging is requested via "xray-log-args", emit the entry sled accordingly.
+
+; RUN: llc -filetype=asm -o - -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+; RUN: llc -filetype=asm -o - -mtriple=x86_64-darwin-unknown < %s | FileCheck %s
+
+define i32 @callee(i32 %arg) nounwind noinline uwtable "function-instrument"="xray-always" "xray-log-args"="1" {
+  ret i32 %arg
+}
+; CHECK-LABEL: Lxray_synthetic_0:
+; CHECK:	.quad	{{\.?}}Lxray_sled_0
+; CHECK:	.quad	{{_?}}callee
+; CHECK:	.byte	3
+; CHECK:	.byte	1
+; CHECK:	.{{(zero|space)}}	14
+; CHECK:	.quad	{{\.?}}Lxray_sled_1
+; CHECK:	.quad	{{_?}}callee
+; CHECK:	.byte	1
+; CHECK:	.byte	1
+; CHECK:	.{{(zero|space)}}	14
+
+define i32 @caller(i32 %arg) nounwind noinline uwtable "function-instrument"="xray-always" "xray-log-args"="1" {
+  %retval = tail call i32 @callee(i32 %arg)
+  ret i32 %retval
+}
+; CHECK-LABEL: Lxray_synthetic_1:
+; CHECK:	.quad	{{\.?}}Lxray_sled_2
+; CHECK:	.quad	{{_?}}caller
+; CHECK:	.byte	3
+; CHECK:	.byte	1
+; CHECK:	.{{(zero|space)}}	14
+; CHECK:	.quad	{{\.?}}Lxray_sled_3
+; CHECK:	.quad	{{_?}}caller
+; CHECK:	.byte	2
+; CHECK:	.byte	1
+; CHECK:	.{{(zero|space)}}	14
diff --git a/test/CodeGen/XCore/section-name.ll b/test/CodeGen/XCore/section-name.ll
new file mode 100644
index 0000000000000000000000000000000000000000..65161db34bea49e0bc92377c797a2f585bf894cd
--- /dev/null
+++ b/test/CodeGen/XCore/section-name.ll
@@ -0,0 +1,9 @@
+; RUN: not llc < %s -march=xcore 2>&1 | FileCheck %s
+
+@bar = internal global i32 zeroinitializer
+
+define void @".dp.bss"() {
+  ret void
+}
+
+; CHECK: LLVM ERROR: invalid symbol redefinition
diff --git a/test/CodeGen/XCore/varargs.ll b/test/CodeGen/XCore/varargs.ll
index 28c293390c59761f8e6911eeda45dd694fbef24a..2e364b275610cec0d3a3bf4ed6420bbc29e48c8c 100644
--- a/test/CodeGen/XCore/varargs.ll
+++ b/test/CodeGen/XCore/varargs.ll
@@ -26,10 +26,10 @@ entry:
 ; CHECK-LABEL: test_vararg
 ; CHECK: extsp 6
 ; CHECK: stw lr, sp[1]
+; CHECK: stw r3, sp[6]
 ; CHECK: stw r0, sp[3]
 ; CHECK: stw r1, sp[4]
 ; CHECK: stw r2, sp[5]
-; CHECK: stw r3, sp[6]
 ; CHECK: ldaw r0, sp[3]
 ; CHECK: stw r0, sp[2]
   %list = alloca i8*, align 4
diff --git a/test/DebugInfo/AArch64/asan-stack-vars.ll b/test/DebugInfo/AArch64/asan-stack-vars.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1dff15cb588b149f67303d9831c5e20d819efafb
--- /dev/null
+++ b/test/DebugInfo/AArch64/asan-stack-vars.ll
@@ -0,0 +1,326 @@
+; RUN: llc -O0 -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s
+;
+; Derived from (clang -O0 -g -fsanitize=address -fobjc-arc)
+;   @protocol NSObject
+;   @end
+;   @interface NSObject<NSObject>{}
+;   + (instancetype)alloc;
+;   @end
+;   struct CGSize {
+;     double width;
+;     double height;
+;   };
+;   typedef struct CGSize CGSize;
+;   @interface Object : NSObject
+;   - (instancetype)initWithSize:(CGSize)size;
+;   - (id)aMessage;
+;   @end            
+;   @implementation MyObject
+;   + (id)doWithSize:(CGSize)imageSize andObject:(id)object {
+;     return [object aMessage];
+;   }
+;   @end
+;
+; CHECK: .debug_info contents:
+; CHECK: DW_TAG_subprogram
+; CHECK-NEXT:   DW_AT_low_pc [DW_FORM_addr]     (0x0000000000000000)
+; CHECK-NEXT:   DW_AT_high_pc [DW_FORM_addr]    ([[FN_END:.*]])
+; CHECK: "_cmd"
+; CHECK: DW_TAG_formal_parameter
+; CHECK-NEXT: DW_AT_location {{.*}} ([[OFS:.*]])
+; CHECK-NEXT: DW_AT_name {{.*}}"imageSize"
+; No other "0x...: Beginning" line may appear before the entry ending at FN_END.
+; CHECK: .debug_loc contents:
+; CHECK: [[OFS]]: Beginning address offset: 0x0000000000000000
+; CHECK-NOT: 0x{{.*}}: Beginning
+; CHECK:          Ending address offset: [[FN_END]]
+
+; ModuleID = 'm.m'
+source_filename = "m.m"
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+target triple = "arm64-apple-ios"
+
+%0 = type opaque
+%struct._class_t = type { %struct._class_t*, %struct._class_t*, %struct._objc_cache*, i8* (i8*, i8*)**, %struct._class_ro_t* }
+%struct._objc_cache = type opaque
+%struct._class_ro_t = type { i32, i32, i32, i8*, i8*, %struct.__method_list_t*, %struct._objc_protocol_list*, %struct._ivar_list_t*, i8*, %struct._prop_list_t* }
+%struct.__method_list_t = type { i32, i32, [0 x %struct._objc_method] }
+%struct._objc_method = type { i8*, i8*, i8* }
+%struct._objc_protocol_list = type { i64, [0 x %struct._protocol_t*] }
+%struct._protocol_t = type { i8*, i8*, %struct._objc_protocol_list*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct.__method_list_t*, %struct._prop_list_t*, i32, i32, i8**, i8*, %struct._prop_list_t* }
+%struct._ivar_list_t = type { i32, i32, [0 x %struct._ivar_t] }
+%struct._ivar_t = type { i32*, i8*, i8*, i32, i32 }
+%struct._prop_list_t = type { i32, i32, [0 x %struct._prop_t] }
+%struct._prop_t = type { i8*, i8* }
+%struct.CGSize = type { double, double }
+
+@"OBJC_CLASS_$_Object" = external global %struct._class_t
+@"OBJC_CLASSLIST_REFERENCES_$_" = private global %struct._class_t* @"OBJC_CLASS_$_Object", section "__DATA, __objc_classrefs, regular, no_dead_strip", align 8
+@OBJC_METH_VAR_NAME_ = private unnamed_addr constant [6 x i8] c"alloc\00", section "__TEXT,__objc_methname,cstring_literals", align 1
+@OBJC_SELECTOR_REFERENCES_ = private externally_initialized global i8* getelementptr inbounds ([6 x i8], [6 x i8]* @OBJC_METH_VAR_NAME_, i32 0, i32 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip", align 8
+@OBJC_METH_VAR_NAME_.1 = private unnamed_addr constant [14 x i8] c"initWithSize:\00", section "__TEXT,__objc_methname,cstring_literals", align 1
+@OBJC_SELECTOR_REFERENCES_.2 = private externally_initialized global i8* getelementptr inbounds ([14 x i8], [14 x i8]* @OBJC_METH_VAR_NAME_.1, i32 0, i32 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip", align 8
+@OBJC_METH_VAR_NAME_.3 = private unnamed_addr constant [9 x i8] c"aMessage\00", section "__TEXT,__objc_methname,cstring_literals", align 1
+@OBJC_SELECTOR_REFERENCES_.4 = private externally_initialized global i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_METH_VAR_NAME_.3, i32 0, i32 0), section "__DATA, __objc_selrefs, literal_pointers, no_dead_strip", align 8
+@_objc_empty_cache = external global %struct._objc_cache
+@"OBJC_CLASS_$_MyObject" = global %struct._class_t { %struct._class_t* @"OBJC_METACLASS_$_MyObject", %struct._class_t* null, %struct._objc_cache* @_objc_empty_cache, i8* (i8*, i8*)** null, %struct._class_ro_t* @"\01l_OBJC_CLASS_RO_$_MyObject" }, section "__DATA, __objc_data", align 8
+@"OBJC_METACLASS_$_MyObject" = global %struct._class_t { %struct._class_t* @"OBJC_METACLASS_$_MyObject", %struct._class_t* @"OBJC_CLASS_$_MyObject", %struct._objc_cache* @_objc_empty_cache, i8* (i8*, i8*)** null, %struct._class_ro_t* @"\01l_OBJC_METACLASS_RO_$_MyObject" }, section "__DATA, __objc_data", align 8
+@OBJC_CLASS_NAME_ = private unnamed_addr constant [9 x i8] c"MyObject\00", section "__TEXT,__objc_classname,cstring_literals", align 1
+@OBJC_METH_VAR_NAME_.5 = private unnamed_addr constant [12 x i8] c"doWithSize:\00", section "__TEXT,__objc_methname,cstring_literals", align 1
+@OBJC_METH_VAR_TYPE_ = private unnamed_addr constant [21 x i8] c"@32@0:8{CGSize=dd}16\00", section "__TEXT,__objc_methtype,cstring_literals", align 1
+@"\01l_OBJC_$_CLASS_METHODS_MyObject" = private global { i32, i32, [1 x %struct._objc_method] } { i32 24, i32 1, [1 x %struct._objc_method] [%struct._objc_method { i8* getelementptr inbounds ([12 x i8], [12 x i8]* @OBJC_METH_VAR_NAME_.5, i32 0, i32 0), i8* getelementptr inbounds ([21 x i8], [21 x i8]* @OBJC_METH_VAR_TYPE_, i32 0, i32 0), i8* bitcast (i8* (i8*, i8*, [2 x double])* @"\01+[MyObject doWithSize:]" to i8*) }] }, section "__DATA, __objc_const", align 8
+@"\01l_OBJC_METACLASS_RO_$_MyObject" = private global %struct._class_ro_t { i32 131, i32 40, i32 40, i8* null, i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_CLASS_NAME_, i32 0, i32 0), %struct.__method_list_t* bitcast ({ i32, i32, [1 x %struct._objc_method] }* @"\01l_OBJC_$_CLASS_METHODS_MyObject" to %struct.__method_list_t*), %struct._objc_protocol_list* null, %struct._ivar_list_t* null, i8* null, %struct._prop_list_t* null }, section "__DATA, __objc_const", align 8
+@"\01l_OBJC_CLASS_RO_$_MyObject" = private global %struct._class_ro_t { i32 130, i32 0, i32 0, i8* null, i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_CLASS_NAME_, i32 0, i32 0), %struct.__method_list_t* null, %struct._objc_protocol_list* null, %struct._ivar_list_t* null, i8* null, %struct._prop_list_t* null }, section "__DATA, __objc_const", align 8
+@"OBJC_LABEL_CLASS_$" = private global [1 x i8*] [i8* bitcast (%struct._class_t* @"OBJC_CLASS_$_MyObject" to i8*)], section "__DATA, __objc_classlist, regular, no_dead_strip", align 8
+@llvm.compiler.used = appending global [12 x i8*] [i8* bitcast (%struct._class_t** @"OBJC_CLASSLIST_REFERENCES_$_" to i8*), i8* getelementptr inbounds ([6 x i8], [6 x i8]* @OBJC_METH_VAR_NAME_, i32 0, i32 0), i8* bitcast (i8** @OBJC_SELECTOR_REFERENCES_ to i8*), i8* getelementptr inbounds ([14 x i8], [14 x i8]* @OBJC_METH_VAR_NAME_.1, i32 0, i32 0), i8* bitcast (i8** @OBJC_SELECTOR_REFERENCES_.2 to i8*), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_METH_VAR_NAME_.3, i32 0, i32 0), i8* bitcast (i8** @OBJC_SELECTOR_REFERENCES_.4 to i8*), i8* getelementptr inbounds ([9 x i8], [9 x i8]* @OBJC_CLASS_NAME_, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8], [12 x i8]* @OBJC_METH_VAR_NAME_.5, i32 0, i32 0), i8* getelementptr inbounds ([21 x i8], [21 x i8]* @OBJC_METH_VAR_TYPE_, i32 0, i32 0), i8* bitcast ({ i32, i32, [1 x %struct._objc_method] }* @"\01l_OBJC_$_CLASS_METHODS_MyObject" to i8*), i8* bitcast ([1 x i8*]* @"OBJC_LABEL_CLASS_$" to i8*)], section "llvm.metadata"
+@llvm.global_ctors = appending global [1 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 1, void ()* @asan.module_ctor, i8* null }]
+@__asan_shadow_memory_dynamic_address = external global i64
+@__asan_gen_ = private unnamed_addr constant [34 x i8] c"2 32 16 9 imageSize 64 8 6 object\00", align 1
+
+; Function Attrs: noinline sanitize_address ssp uwtable
+define internal i8* @"\01+[MyObject doWithSize:]"(i8* %self, i8* %_cmd, [2 x double] %imageSize.coerce) #0 !dbg !14 {
+entry:
+  %0 = load i64, i64* @__asan_shadow_memory_dynamic_address
+  %self.addr = alloca i8*, align 8
+  %_cmd.addr = alloca i8*, align 8
+  %MyAlloca = alloca [96 x i8], align 32, !dbg !35
+  %1 = ptrtoint [96 x i8]* %MyAlloca to i64, !dbg !35
+  %2 = add i64 %1, 32, !dbg !35
+  %3 = inttoptr i64 %2 to %struct.CGSize*, !dbg !35
+  %4 = add i64 %1, 64, !dbg !35
+  %5 = inttoptr i64 %4 to %0**, !dbg !35
+  %6 = inttoptr i64 %1 to i64*, !dbg !35
+  store i64 1102416563, i64* %6, !dbg !35
+  %7 = add i64 %1, 8, !dbg !35
+  %8 = inttoptr i64 %7 to i64*, !dbg !35
+  store i64 ptrtoint ([34 x i8]* @__asan_gen_ to i64), i64* %8, !dbg !35
+  %9 = add i64 %1, 16, !dbg !35
+  %10 = inttoptr i64 %9 to i64*, !dbg !35
+  store i64 ptrtoint (i8* (i8*, i8*, [2 x double])* @"\01+[MyObject doWithSize:]" to i64), i64* %10, !dbg !35
+  %11 = lshr i64 %1, 3, !dbg !35
+  %12 = add i64 %11, %0, !dbg !35
+  %13 = add i64 %12, 0, !dbg !35
+  %14 = inttoptr i64 %13 to i64*, !dbg !35
+  store i64 -940689368107847183, i64* %14, align 1, !dbg !35
+  %15 = add i64 %12, 9, !dbg !35
+  %16 = inttoptr i64 %15 to i16*, !dbg !35
+  store i16 -3085, i16* %16, align 1, !dbg !35
+  %17 = add i64 %12, 11, !dbg !35
+  %18 = inttoptr i64 %17 to i8*, !dbg !35
+  store i8 -13, i8* %18, align 1, !dbg !35
+  call void @llvm.dbg.declare(metadata %struct.CGSize* %3, metadata !36, metadata !37), !dbg !38
+  call void @llvm.dbg.declare(metadata %0** %5, metadata !39, metadata !37), !dbg !45
+  %19 = bitcast %struct.CGSize* %3 to [2 x double]*
+  %20 = ptrtoint [2 x double]* %19 to i64
+  %21 = lshr i64 %20, 3
+  %22 = add i64 %21, %0
+  %23 = inttoptr i64 %22 to i16*
+  %24 = load i16, i16* %23
+  %25 = icmp ne i16 %24, 0
+  br i1 %25, label %26, label %27
+
+; <label>:26:                                     ; preds = %entry
+  call void @__asan_report_store16(i64 %20)
+  call void asm sideeffect "", ""()
+  unreachable
+
+; <label>:27:                                     ; preds = %entry
+  store [2 x double] %imageSize.coerce, [2 x double]* %19, align 8
+  store i8* %self, i8** %self.addr, align 8
+  call void @llvm.dbg.declare(metadata i8** %self.addr, metadata !46, metadata !48), !dbg !49
+  store i8* %_cmd, i8** %_cmd.addr, align 8
+  call void @llvm.dbg.declare(metadata i8** %_cmd.addr, metadata !50, metadata !48), !dbg !49
+  %28 = load %struct._class_t*, %struct._class_t** @"OBJC_CLASSLIST_REFERENCES_$_", align 8, !dbg !52
+  %29 = add i64 lshr (i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_ to i64), i64 3), %0, !dbg !52
+  %30 = inttoptr i64 %29 to i8*, !dbg !52
+  %31 = load i8, i8* %30, !dbg !52
+  %32 = icmp ne i8 %31, 0, !dbg !52
+  br i1 %32, label %33, label %34, !dbg !52
+
+; <label>:33:                                     ; preds = %27
+  call void @__asan_report_load8(i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_ to i64)), !dbg !52
+  call void asm sideeffect "", ""(), !dbg !52
+  unreachable, !dbg !52
+
+; <label>:34:                                     ; preds = %27
+  %35 = load i8*, i8** @OBJC_SELECTOR_REFERENCES_, align 8, !dbg !52, !invariant.load !2
+  %36 = bitcast %struct._class_t* %28 to i8*, !dbg !52
+  %call = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %36, i8* %35), !dbg !52
+  %37 = bitcast i8* %call to %0*, !dbg !52
+  %38 = add i64 lshr (i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.2 to i64), i64 3), %0, !dbg !53
+  %39 = inttoptr i64 %38 to i8*, !dbg !53
+  %40 = load i8, i8* %39, !dbg !53
+  %41 = icmp ne i8 %40, 0, !dbg !53
+  br i1 %41, label %42, label %43, !dbg !53
+
+; <label>:42:                                     ; preds = %34
+  call void @__asan_report_load8(i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.2 to i64)), !dbg !53
+  call void asm sideeffect "", ""(), !dbg !53
+  unreachable, !dbg !53
+
+; <label>:43:                                     ; preds = %34
+  %44 = load i8*, i8** @OBJC_SELECTOR_REFERENCES_.2, align 8, !dbg !53, !invariant.load !2
+  %45 = bitcast %0* %37 to i8*, !dbg !53
+  %46 = bitcast %struct.CGSize* %3 to [2 x double]*, !dbg !53
+  %47 = ptrtoint [2 x double]* %46 to i64, !dbg !53
+  %48 = lshr i64 %47, 3, !dbg !53
+  %49 = add i64 %48, %0, !dbg !53
+  %50 = inttoptr i64 %49 to i16*, !dbg !53
+  %51 = load i16, i16* %50, !dbg !53
+  %52 = icmp ne i16 %51, 0, !dbg !53
+  br i1 %52, label %53, label %54, !dbg !53
+
+; <label>:53:                                     ; preds = %43
+  call void @__asan_report_load16(i64 %47), !dbg !53
+  call void asm sideeffect "", ""(), !dbg !53
+  unreachable, !dbg !53
+
+; <label>:54:                                     ; preds = %43
+  %55 = load [2 x double], [2 x double]* %46, align 8, !dbg !53
+  %call1 = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*, [2 x double])*)(i8* %45, i8* %44, [2 x double] %55), !dbg !53
+  %56 = bitcast i8* %call1 to %0*, !dbg !53
+  %57 = ptrtoint %0** %5 to i64, !dbg !45
+  %58 = lshr i64 %57, 3, !dbg !45
+  %59 = add i64 %58, %0, !dbg !45
+  %60 = inttoptr i64 %59 to i8*, !dbg !45
+  %61 = load i8, i8* %60, !dbg !45
+  %62 = icmp ne i8 %61, 0, !dbg !45
+  br i1 %62, label %63, label %64, !dbg !45
+
+; <label>:63:                                     ; preds = %54
+  call void @__asan_report_store8(i64 %57), !dbg !45
+  call void asm sideeffect "", ""(), !dbg !45
+  unreachable, !dbg !45
+
+; <label>:64:                                     ; preds = %54
+  store %0* %56, %0** %5, align 8, !dbg !45
+  %65 = load %0*, %0** %5, align 8, !dbg !54
+  %66 = add i64 lshr (i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.4 to i64), i64 3), %0, !dbg !55
+  %67 = inttoptr i64 %66 to i8*, !dbg !55
+  %68 = load i8, i8* %67, !dbg !55
+  %69 = icmp ne i8 %68, 0, !dbg !55
+  br i1 %69, label %70, label %71, !dbg !55
+
+; <label>:70:                                     ; preds = %64
+  call void @__asan_report_load8(i64 ptrtoint (i8** @OBJC_SELECTOR_REFERENCES_.4 to i64)), !dbg !55
+  call void asm sideeffect "", ""(), !dbg !55
+  unreachable, !dbg !55
+
+; <label>:71:                                     ; preds = %64
+  %72 = load i8*, i8** @OBJC_SELECTOR_REFERENCES_.4, align 8, !dbg !55, !invariant.load !2
+  %73 = bitcast %0* %65 to i8*, !dbg !55
+  %call2 = call i8* bitcast (i8* (i8*, i8*, ...)* @objc_msgSend to i8* (i8*, i8*)*)(i8* %73, i8* %72), !dbg !55
+  call void asm sideeffect "mov\09fp, fp\09\09# marker for objc_retainAutoreleaseReturnValue", ""(), !dbg !55
+  %74 = call i8* @objc_retainAutoreleasedReturnValue(i8* %call2) #3, !dbg !55
+  %75 = bitcast %0** %5 to i8**, !dbg !56
+  call void @objc_storeStrong(i8** %75, i8* null) #3, !dbg !56
+  %76 = tail call i8* @objc_autoreleaseReturnValue(i8* %74) #3, !dbg !56
+  store i64 1172321806, i64* %6, !dbg !56
+  %77 = add i64 %12, 0, !dbg !56
+  %78 = inttoptr i64 %77 to i64*, !dbg !56
+  store i64 0, i64* %78, align 1, !dbg !56
+  %79 = add i64 %12, 9, !dbg !56
+  %80 = inttoptr i64 %79 to i16*, !dbg !56
+  store i16 0, i16* %80, align 1, !dbg !56
+  %81 = add i64 %12, 11, !dbg !56
+  %82 = inttoptr i64 %81 to i8*, !dbg !56
+  store i8 0, i8* %82, align 1, !dbg !56
+  ret i8* %76, !dbg !56
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nonlazybind
+declare i8* @objc_msgSend(i8*, i8*, ...) #2
+
+declare i8* @objc_retainAutoreleasedReturnValue(i8* returned)
+
+declare void @objc_storeStrong(i8**, i8*)
+
+declare i8* @objc_autoreleaseReturnValue(i8* returned)
+
+define internal void @asan.module_ctor() {
+  call void @__asan_init()
+  call void @__asan_version_mismatch_check_v8()
+  ret void
+}
+
+declare void @__asan_init()
+
+declare void @__asan_version_mismatch_check_v8()
+
+declare void @__asan_report_load8(i64)
+
+declare void @__asan_report_load16(i64)
+
+declare void @__asan_report_store8(i64)
+
+declare void @__asan_report_store16(i64)
+
+attributes #0 = { noinline sanitize_address ssp uwtable }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nonlazybind }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!5, !6, !7, !8, !9, !10, !11, !12}
+!llvm.ident = !{!13}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_ObjC, file: !1, producer: "clang version 5.0.0 (trunk 295779) (llvm/trunk 295777)", isOptimized: false, runtimeVersion: 2, emissionKind: FullDebug, enums: !2, retainedTypes: !3)
+!1 = !DIFile(filename: "m.m", directory: "/")
+!2 = !{}
+!3 = !{!4}
+!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "MyObject", scope: !1, file: !1, line: 15, flags: DIFlagObjcClassComplete, elements: !2, runtimeLang: DW_LANG_ObjC)
+!5 = !{i32 1, !"Objective-C Version", i32 2}
+!6 = !{i32 1, !"Objective-C Image Info Version", i32 0}
+!7 = !{i32 1, !"Objective-C Image Info Section", !"__DATA, __objc_imageinfo, regular, no_dead_strip"}
+!8 = !{i32 4, !"Objective-C Garbage Collection", i32 0}
+!9 = !{i32 1, !"Objective-C Class Properties", i32 64}
+!10 = !{i32 2, !"Dwarf Version", i32 2}
+!11 = !{i32 2, !"Debug Info Version", i32 3}
+!12 = !{i32 1, !"PIC Level", i32 2}
+!13 = !{!"clang version 5.0.0 (trunk 295779) (llvm/trunk 295777)"}
+!14 = distinct !DISubprogram(name: "+[MyObject doWithSize:]", scope: !1, file: !1, line: 16, type: !15, isLocal: true, isDefinition: true, scopeLine: 16, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!15 = !DISubroutineType(types: !16)
+!16 = !{!17, !24, !26, !29}
+!17 = !DIDerivedType(tag: DW_TAG_typedef, name: "id", file: !1, baseType: !18)
+!18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !19, size: 64)
+!19 = !DICompositeType(tag: DW_TAG_structure_type, name: "objc_object", file: !1, elements: !20)
+!20 = !{!21}
+!21 = !DIDerivedType(tag: DW_TAG_member, name: "isa", scope: !19, file: !1, baseType: !22, size: 64)
+!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !23, size: 64)
+!23 = !DICompositeType(tag: DW_TAG_structure_type, name: "objc_class", file: !1, flags: DIFlagFwdDecl)
+!24 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !25, flags: DIFlagArtificial | DIFlagObjectPointer)
+!25 = !DIDerivedType(tag: DW_TAG_typedef, name: "Class", file: !1, baseType: !22)
+!26 = !DIDerivedType(tag: DW_TAG_typedef, name: "SEL", file: !1, baseType: !27, flags: DIFlagArtificial)
+!27 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !28, size: 64)
+!28 = !DICompositeType(tag: DW_TAG_structure_type, name: "objc_selector", file: !1, flags: DIFlagFwdDecl)
+!29 = !DIDerivedType(tag: DW_TAG_typedef, name: "CGSize", file: !1, line: 10, baseType: !30)
+!30 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "CGSize", file: !1, line: 6, size: 128, elements: !31)
+!31 = !{!32, !34}
+!32 = !DIDerivedType(tag: DW_TAG_member, name: "width", scope: !30, file: !1, line: 7, baseType: !33, size: 64)
+!33 = !DIBasicType(name: "double", size: 64, encoding: DW_ATE_float)
+!34 = !DIDerivedType(tag: DW_TAG_member, name: "height", scope: !30, file: !1, line: 8, baseType: !33, size: 64, offset: 64)
+!35 = !DILocation(line: 16, scope: !14)
+!36 = !DILocalVariable(name: "imageSize", arg: 3, scope: !14, file: !1, line: 16, type: !29)
+!37 = !DIExpression(DW_OP_deref)
+!38 = !DILocation(line: 16, column: 26, scope: !14)
+!39 = !DILocalVariable(name: "object", scope: !14, file: !1, line: 17, type: !40)
+!40 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !41, size: 64)
+!41 = !DICompositeType(tag: DW_TAG_structure_type, name: "Object", scope: !1, file: !1, line: 11, elements: !42, runtimeLang: DW_LANG_ObjC)
+!42 = !{!43}
+!43 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !41, baseType: !44)
+!44 = !DICompositeType(tag: DW_TAG_structure_type, name: "NSObject", scope: !1, file: !1, line: 3, elements: !2, runtimeLang: DW_LANG_ObjC)
+!45 = !DILocation(line: 17, column: 11, scope: !14)
+!46 = !DILocalVariable(name: "self", arg: 1, scope: !14, type: !47, flags: DIFlagArtificial | DIFlagObjectPointer)
+!47 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !25)
+!48 = !DIExpression()
+!49 = !DILocation(line: 0, scope: !14)
+!50 = !DILocalVariable(name: "_cmd", arg: 2, scope: !14, type: !51, flags: DIFlagArtificial)
+!51 = !DIDerivedType(tag: DW_TAG_typedef, name: "SEL", file: !1, baseType: !27)
+!52 = !DILocation(line: 17, column: 21, scope: !14)
+!53 = !DILocation(line: 17, column: 20, scope: !14)
+!54 = !DILocation(line: 18, column: 11, scope: !14)
+!55 = !DILocation(line: 18, column: 10, scope: !14)
+!56 = !DILocation(line: 19, column: 1, scope: !14)
diff --git a/test/DebugInfo/AMDGPU/lit.local.cfg b/test/DebugInfo/AMDGPU/lit.local.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..2a665f06be72e5515ca6e27018facb35daa201be
--- /dev/null
+++ b/test/DebugInfo/AMDGPU/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AMDGPU' in config.root.targets:
+    config.unsupported = True
diff --git a/test/DebugInfo/AMDGPU/pointer-address-space-dwarf-v1.ll b/test/DebugInfo/AMDGPU/pointer-address-space-dwarf-v1.ll
new file mode 100644
index 0000000000000000000000000000000000000000..cbd5e7688a5a217b2a7a67b5feae991aac1b985d
--- /dev/null
+++ b/test/DebugInfo/AMDGPU/pointer-address-space-dwarf-v1.ll
@@ -0,0 +1,70 @@
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+
+; LLVM IR generated with the following command and OpenCL source:
+;
+; $clang -cl-std=CL2.0 -g -O0 -target amdgcn-amd-amdhsa -S -emit-llvm <path-to-file>
+;
+; kernel void kernel1() {
+;   global int *FuncVar0 = 0;
+;   constant int *FuncVar1 = 0;
+;   local int *FuncVar2 = 0;
+;   private int *FuncVar3 = 0;
+;   int *FuncVar4 = 0;
+; }
+
+; DW_AT_address_class is available since Dwarf Version 2.
+; CHECK-NOT: DW_AT_address_class
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+define amdgpu_kernel void @kernel1() #0 !dbg !7 {
+entry:
+  %FuncVar0 = alloca i32 addrspace(1)*, align 4
+  %FuncVar1 = alloca i32 addrspace(2)*, align 4
+  %FuncVar2 = alloca i32 addrspace(3)*, align 4
+  %FuncVar3 = alloca i32*, align 4
+  %FuncVar4 = alloca i32 addrspace(4)*, align 4
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %FuncVar0, metadata !10, metadata !13), !dbg !14
+  store i32 addrspace(1)* null, i32 addrspace(1)** %FuncVar0, align 4, !dbg !14
+  call void @llvm.dbg.declare(metadata i32 addrspace(2)** %FuncVar1, metadata !15, metadata !13), !dbg !16
+  store i32 addrspace(2)* null, i32 addrspace(2)** %FuncVar1, align 4, !dbg !16
+  call void @llvm.dbg.declare(metadata i32 addrspace(3)** %FuncVar2, metadata !17, metadata !13), !dbg !19
+  store i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*), i32 addrspace(3)** %FuncVar2, align 4, !dbg !19
+  call void @llvm.dbg.declare(metadata i32** %FuncVar3, metadata !20, metadata !13), !dbg !22
+  store i32* addrspacecast (i32 addrspace(4)* null to i32*), i32** %FuncVar3, align 4, !dbg !22
+  call void @llvm.dbg.declare(metadata i32 addrspace(4)** %FuncVar4, metadata !23, metadata !13), !dbg !24
+  store i32 addrspace(4)* null, i32 addrspace(4)** %FuncVar4, align 4, !dbg !24
+  ret void, !dbg !25
+}
+
+!llvm.dbg.cu = !{!0}
+!opencl.ocl.version = !{!3}
+!llvm.module.flags = !{!4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "pointer-address-space-dwarf-v1.cl", directory: "/some/random/directory")
+!2 = !{}
+!3 = !{i32 2, i32 0}
+!4 = !{i32 2, !"Dwarf Version", i32 1}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = !{!""}
+!7 = distinct !DISubprogram(name: "kernel1", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: false, unit: !0, variables: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null}
+!10 = !DILocalVariable(name: "FuncVar0", scope: !7, file: !1, line: 2, type: !11)
+!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64)
+!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!13 = !DIExpression()
+!14 = !DILocation(line: 2, column: 15, scope: !7)
+!15 = !DILocalVariable(name: "FuncVar1", scope: !7, file: !1, line: 3, type: !11)
+!16 = !DILocation(line: 3, column: 17, scope: !7)
+!17 = !DILocalVariable(name: "FuncVar2", scope: !7, file: !1, line: 4, type: !18)
+!18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 32, dwarfAddressSpace: 2)
+!19 = !DILocation(line: 4, column: 14, scope: !7)
+!20 = !DILocalVariable(name: "FuncVar3", scope: !7, file: !1, line: 5, type: !21)
+!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 32, dwarfAddressSpace: 1)
+!22 = !DILocation(line: 5, column: 16, scope: !7)
+!23 = !DILocalVariable(name: "FuncVar4", scope: !7, file: !1, line: 6, type: !11)
+!24 = !DILocation(line: 6, column: 8, scope: !7)
+!25 = !DILocation(line: 7, column: 1, scope: !7)
diff --git a/test/DebugInfo/AMDGPU/pointer-address-space.ll b/test/DebugInfo/AMDGPU/pointer-address-space.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a99d690935a190f6285c3ed88644f28581f7e53a
--- /dev/null
+++ b/test/DebugInfo/AMDGPU/pointer-address-space.ll
@@ -0,0 +1,104 @@
+; RUN: llc -O0 -mtriple=amdgcn--amdhsa -mcpu=fiji -verify-machineinstrs -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+
+; LLVM IR generated with the following command and OpenCL source:
+;
+; $clang -cl-std=CL2.0 -g -O0 -target amdgcn-amd-amdhsa -S -emit-llvm <path-to-file>
+;
+; kernel void kernel1() {
+;   global int *FuncVar0 = 0;
+;   constant int *FuncVar1 = 0;
+;   local int *FuncVar2 = 0;
+;   private int *FuncVar3 = 0;
+;   int *FuncVar4 = 0;
+; }
+
+; CHECK:      DW_AT_name {{.*}}"FuncVar0"
+; CHECK-NEXT: DW_AT_decl_file
+; CHECK-NEXT: DW_AT_decl_line
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4] (cu + 0x{{[a-f0-9]+}} => {0x[[NONE:[a-f0-9]+]]})
+
+; CHECK:      DW_AT_name {{.*}}"FuncVar1"
+; CHECK-NEXT: DW_AT_decl_file
+; CHECK-NEXT: DW_AT_decl_line
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4] (cu + 0x{{[a-f0-9]+}} => {0x[[NONE]]})
+
+; CHECK:      DW_AT_name {{.*}}"FuncVar2"
+; CHECK-NEXT: DW_AT_decl_file
+; CHECK-NEXT: DW_AT_decl_line
+; CHECK-NEXT:      DW_AT_type [DW_FORM_ref4] (cu + 0x{{[a-f0-9]+}} => {0x[[LOCAL:[a-f0-9]+]]})
+
+; CHECK:      DW_AT_name {{.*}}"FuncVar3"
+; CHECK-NEXT: DW_AT_decl_file
+; CHECK-NEXT: DW_AT_decl_line
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4] (cu + 0x{{[a-f0-9]+}} => {0x[[PRIVATE:[a-f0-9]+]]})
+
+; CHECK:      DW_AT_name {{.*}}"FuncVar4"
+; CHECK-NEXT: DW_AT_decl_file
+; CHECK-NEXT: DW_AT_decl_line
+; CHECK-NEXT: DW_AT_type [DW_FORM_ref4] (cu + 0x{{[a-f0-9]+}} => {0x[[NONE]]})
+
+; CHECK:      0x[[NONE]]: DW_TAG_pointer_type
+; CHECK-NEXT:               DW_AT_type
+; CHECK-NOT:                DW_AT_address_class
+
+; CHECK:      0x[[LOCAL]]: DW_TAG_pointer_type
+; CHECK-NEXT:                DW_AT_type
+; CHECK-NEXT:                DW_AT_address_class [DW_FORM_data4] (0x00000002)
+
+; CHECK:      0x[[PRIVATE]]: DW_TAG_pointer_type
+; CHECK-NEXT:                  DW_AT_type
+; CHECK-NEXT:                  DW_AT_address_class [DW_FORM_data4] (0x00000001)
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+define amdgpu_kernel void @kernel1() !dbg !7 {
+entry:
+  %FuncVar0 = alloca i32 addrspace(1)*, align 4
+  %FuncVar1 = alloca i32 addrspace(2)*, align 4
+  %FuncVar2 = alloca i32 addrspace(3)*, align 4
+  %FuncVar3 = alloca i32*, align 4
+  %FuncVar4 = alloca i32 addrspace(4)*, align 4
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %FuncVar0, metadata !10, metadata !13), !dbg !14
+  store i32 addrspace(1)* null, i32 addrspace(1)** %FuncVar0, align 4, !dbg !14
+  call void @llvm.dbg.declare(metadata i32 addrspace(2)** %FuncVar1, metadata !15, metadata !13), !dbg !16
+  store i32 addrspace(2)* null, i32 addrspace(2)** %FuncVar1, align 4, !dbg !16
+  call void @llvm.dbg.declare(metadata i32 addrspace(3)** %FuncVar2, metadata !17, metadata !13), !dbg !19
+  store i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*), i32 addrspace(3)** %FuncVar2, align 4, !dbg !19
+  call void @llvm.dbg.declare(metadata i32** %FuncVar3, metadata !20, metadata !13), !dbg !22
+  store i32* addrspacecast (i32 addrspace(4)* null to i32*), i32** %FuncVar3, align 4, !dbg !22
+  call void @llvm.dbg.declare(metadata i32 addrspace(4)** %FuncVar4, metadata !23, metadata !13), !dbg !24
+  store i32 addrspace(4)* null, i32 addrspace(4)** %FuncVar4, align 4, !dbg !24
+  ret void, !dbg !25
+}
+
+!llvm.dbg.cu = !{!0}
+!opencl.ocl.version = !{!3}
+!llvm.module.flags = !{!4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "pointer-address-space.ll", directory: "/some/random/directory")
+!2 = !{}
+!3 = !{i32 2, i32 0}
+!4 = !{i32 2, !"Dwarf Version", i32 2}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = !{!""}
+!7 = distinct !DISubprogram(name: "kernel1", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: false, unit: !0, variables: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null}
+!10 = !DILocalVariable(name: "FuncVar0", scope: !7, file: !1, line: 2, type: !11)
+!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 64)
+!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!13 = !DIExpression()
+!14 = !DILocation(line: 2, column: 15, scope: !7)
+!15 = !DILocalVariable(name: "FuncVar1", scope: !7, file: !1, line: 3, type: !11)
+!16 = !DILocation(line: 3, column: 17, scope: !7)
+!17 = !DILocalVariable(name: "FuncVar2", scope: !7, file: !1, line: 4, type: !18)
+!18 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 32, dwarfAddressSpace: 2)
+!19 = !DILocation(line: 4, column: 14, scope: !7)
+!20 = !DILocalVariable(name: "FuncVar3", scope: !7, file: !1, line: 5, type: !21)
+!21 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !12, size: 32, dwarfAddressSpace: 1)
+!22 = !DILocation(line: 5, column: 16, scope: !7)
+!23 = !DILocalVariable(name: "FuncVar4", scope: !7, file: !1, line: 6, type: !11)
+!24 = !DILocation(line: 6, column: 8, scope: !7)
+!25 = !DILocation(line: 7, column: 1, scope: !7)
diff --git a/test/DebugInfo/AMDGPU/variable-locations-dwarf-v1.ll b/test/DebugInfo/AMDGPU/variable-locations-dwarf-v1.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d04a8eb74656de94318d1e531dbbcabe6bf906d2
--- /dev/null
+++ b/test/DebugInfo/AMDGPU/variable-locations-dwarf-v1.ll
@@ -0,0 +1,92 @@
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+
+; LLVM IR generated with the following command and OpenCL source:
+;
+; $clang -cl-std=CL2.0 -g -O0 -target amdgcn-amd-amdhsa -S -emit-llvm <path-to-file>
+;
+; global int GlobA;
+; global int GlobB;
+;
+; kernel void kernel1(unsigned int ArgN, global int *ArgA, global int *ArgB) {
+;   ArgA[ArgN] += ArgB[ArgN];
+; }
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+; CHECK-NOT: DW_AT_location [DW_FORM_block1] (<0x05> 03 00 00 00 00 )
+@GlobA = common addrspace(1) global i32 0, align 4, !dbg !0
+; CHECK-NOT: DW_AT_location [DW_FORM_block1] (<0x05> 03 00 00 00 00 )
+@GlobB = common addrspace(1) global i32 0, align 4, !dbg !6
+
+define amdgpu_kernel void @kernel1(
+; CHECK-NOT: DW_AT_location [DW_FORM_block1] (<0x06> 91 04 10 01 16 18 )
+    i32 %ArgN,
+; CHECK-NOT: DW_AT_location [DW_FORM_block1] (<0x06> 91 08 10 01 16 18 )
+    i32 addrspace(1)* %ArgA,
+; CHECK-NOT: DW_AT_location [DW_FORM_block1] (<0x06> 91 10 10 01 16 18 )
+    i32 addrspace(1)* %ArgB) !dbg !13 {
+entry:
+  %ArgN.addr = alloca i32, align 4
+  %ArgA.addr = alloca i32 addrspace(1)*, align 4
+  %ArgB.addr = alloca i32 addrspace(1)*, align 4
+  store i32 %ArgN, i32* %ArgN.addr, align 4
+  call void @llvm.dbg.declare(metadata i32* %ArgN.addr, metadata !22, metadata !23), !dbg !24
+  store i32 addrspace(1)* %ArgA, i32 addrspace(1)** %ArgA.addr, align 4
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %ArgA.addr, metadata !25, metadata !23), !dbg !26
+  store i32 addrspace(1)* %ArgB, i32 addrspace(1)** %ArgB.addr, align 4
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %ArgB.addr, metadata !27, metadata !23), !dbg !28
+  %0 = load i32 addrspace(1)*, i32 addrspace(1)** %ArgB.addr, align 4, !dbg !29
+  %1 = load i32, i32* %ArgN.addr, align 4, !dbg !30
+  %idxprom = zext i32 %1 to i64, !dbg !29
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 %idxprom, !dbg !29
+  %2 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !29
+  %3 = load i32 addrspace(1)*, i32 addrspace(1)** %ArgA.addr, align 4, !dbg !31
+  %4 = load i32, i32* %ArgN.addr, align 4, !dbg !32
+  %idxprom1 = zext i32 %4 to i64, !dbg !31
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %3, i64 %idxprom1, !dbg !31
+  %5 = load i32, i32 addrspace(1)* %arrayidx2, align 4, !dbg !33
+  %add = add nsw i32 %5, %2, !dbg !33
+  store i32 %add, i32 addrspace(1)* %arrayidx2, align 4, !dbg !33
+  ret void, !dbg !34
+}
+
+!llvm.dbg.cu = !{!2}
+!opencl.ocl.version = !{!9}
+!llvm.module.flags = !{!10, !11}
+!llvm.ident = !{!12}
+
+!0 = !DIGlobalVariableExpression(var: !1)
+!1 = distinct !DIGlobalVariable(name: "GlobA", scope: !2, file: !3, line: 1, type: !8, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 5.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "variable-locations-dwarf-v1.cl", directory: "/some/random/directory")
+!4 = !{}
+!5 = !{!0, !6}
+!6 = !DIGlobalVariableExpression(var: !7)
+!7 = distinct !DIGlobalVariable(name: "GlobB", scope: !2, file: !3, line: 2, type: !8, isLocal: false, isDefinition: true)
+!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!9 = !{i32 2, i32 0}
+!10 = !{i32 2, !"Dwarf Version", i32 1}
+!11 = !{i32 2, !"Debug Info Version", i32 3}
+!12 = !{!"clang version 5.0.0"}
+!13 = distinct !DISubprogram(name: "kernel1", scope: !3, file: !3, line: 4, type: !14, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, unit: !2, variables: !4)
+!14 = !DISubroutineType(types: !15)
+!15 = !{null, !16, !17, !17}
+!16 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!17 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 64)
+!18 = !{i32 0, i32 1, i32 1}
+!19 = !{!"none", !"none", !"none"}
+!20 = !{!"uint", !"int*", !"int*"}
+!21 = !{!"", !"", !""}
+!22 = !DILocalVariable(name: "ArgN", arg: 1, scope: !13, file: !3, line: 4, type: !16)
+!23 = !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef)
+!24 = !DILocation(line: 4, column: 34, scope: !13)
+!25 = !DILocalVariable(name: "ArgA", arg: 2, scope: !13, file: !3, line: 4, type: !17)
+!26 = !DILocation(line: 4, column: 52, scope: !13)
+!27 = !DILocalVariable(name: "ArgB", arg: 3, scope: !13, file: !3, line: 4, type: !17)
+!28 = !DILocation(line: 4, column: 70, scope: !13)
+!29 = !DILocation(line: 5, column: 17, scope: !13)
+!30 = !DILocation(line: 5, column: 22, scope: !13)
+!31 = !DILocation(line: 5, column: 3, scope: !13)
+!32 = !DILocation(line: 5, column: 8, scope: !13)
+!33 = !DILocation(line: 5, column: 14, scope: !13)
+!34 = !DILocation(line: 6, column: 1, scope: !13)
diff --git a/test/DebugInfo/AMDGPU/variable-locations.ll b/test/DebugInfo/AMDGPU/variable-locations.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1aab40f946c6a9831fdd956386435e7e83c1eace
--- /dev/null
+++ b/test/DebugInfo/AMDGPU/variable-locations.ll
@@ -0,0 +1,111 @@
+; RUN: llc -O0 -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -verify-machineinstrs -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+
+; LLVM IR generated with the following command and OpenCL source:
+;
+; $clang -cl-std=CL2.0 -g -O0 -target amdgcn-amd-amdhsa -S -emit-llvm <path-to-file>
+;
+; global int GlobA;
+; global int GlobB;
+;
+; kernel void kernel1(unsigned int ArgN, global int *ArgA, global int *ArgB) {
+;   ArgA[ArgN] += ArgB[ArgN];
+; }
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+; CHECK: {{.*}}DW_TAG_variable
+; CHECK-NEXT: DW_AT_name {{.*}}"GlobA"
+; CHECK-NEXT: DW_AT_type
+; CHECK-NEXT: DW_AT_external
+; CHECK-NEXT: DW_AT_decl_file
+; CHECK-NEXT: DW_AT_decl_line
+; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (<0x05> 03 00 00 00 00 )
+@GlobA = common addrspace(1) global i32 0, align 4, !dbg !0
+
+; CHECK: {{.*}}DW_TAG_variable
+; CHECK-NEXT: DW_AT_name {{.*}}"GlobB"
+; CHECK-NEXT: DW_AT_type
+; CHECK-NEXT: DW_AT_external
+; CHECK-NEXT: DW_AT_decl_file
+; CHECK-NEXT: DW_AT_decl_line
+; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (<0x05> 03 00 00 00 00 )
+@GlobB = common addrspace(1) global i32 0, align 4, !dbg !6
+
+define amdgpu_kernel void @kernel1(
+; CHECK: {{.*}}DW_TAG_formal_parameter
+; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (<0x06> 91 04 10 01 16 18 )
+; CHECK-NEXT: DW_AT_name {{.*}}"ArgN"
+    i32 %ArgN,
+; CHECK: {{.*}}DW_TAG_formal_parameter
+; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (<0x06> 91 08 10 01 16 18 )
+; CHECK-NEXT: DW_AT_name {{.*}}"ArgA"
+    i32 addrspace(1)* %ArgA,
+; CHECK: {{.*}}DW_TAG_formal_parameter
+; CHECK-NEXT: DW_AT_location [DW_FORM_block1] (<0x06> 91 10 10 01 16 18 )
+; CHECK-NEXT: DW_AT_name {{.*}}"ArgB"
+    i32 addrspace(1)* %ArgB) !dbg !13 {
+entry:
+  %ArgN.addr = alloca i32, align 4
+  %ArgA.addr = alloca i32 addrspace(1)*, align 4
+  %ArgB.addr = alloca i32 addrspace(1)*, align 4
+  store i32 %ArgN, i32* %ArgN.addr, align 4
+  call void @llvm.dbg.declare(metadata i32* %ArgN.addr, metadata !22, metadata !23), !dbg !24
+  store i32 addrspace(1)* %ArgA, i32 addrspace(1)** %ArgA.addr, align 4
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %ArgA.addr, metadata !25, metadata !23), !dbg !26
+  store i32 addrspace(1)* %ArgB, i32 addrspace(1)** %ArgB.addr, align 4
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %ArgB.addr, metadata !27, metadata !23), !dbg !28
+  %0 = load i32 addrspace(1)*, i32 addrspace(1)** %ArgB.addr, align 4, !dbg !29
+  %1 = load i32, i32* %ArgN.addr, align 4, !dbg !30
+  %idxprom = zext i32 %1 to i64, !dbg !29
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %0, i64 %idxprom, !dbg !29
+  %2 = load i32, i32 addrspace(1)* %arrayidx, align 4, !dbg !29
+  %3 = load i32 addrspace(1)*, i32 addrspace(1)** %ArgA.addr, align 4, !dbg !31
+  %4 = load i32, i32* %ArgN.addr, align 4, !dbg !32
+  %idxprom1 = zext i32 %4 to i64, !dbg !31
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %3, i64 %idxprom1, !dbg !31
+  %5 = load i32, i32 addrspace(1)* %arrayidx2, align 4, !dbg !33
+  %add = add nsw i32 %5, %2, !dbg !33
+  store i32 %add, i32 addrspace(1)* %arrayidx2, align 4, !dbg !33
+  ret void, !dbg !34
+}
+
+!llvm.dbg.cu = !{!2}
+!opencl.ocl.version = !{!9}
+!llvm.module.flags = !{!10, !11}
+!llvm.ident = !{!12}
+
+!0 = !DIGlobalVariableExpression(var: !1)
+!1 = distinct !DIGlobalVariable(name: "GlobA", scope: !2, file: !3, line: 1, type: !8, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 5.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "variable-locations.cl", directory: "/some/random/directory")
+!4 = !{}
+!5 = !{!0, !6}
+!6 = !DIGlobalVariableExpression(var: !7)
+!7 = distinct !DIGlobalVariable(name: "GlobB", scope: !2, file: !3, line: 2, type: !8, isLocal: false, isDefinition: true)
+!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!9 = !{i32 2, i32 0}
+!10 = !{i32 2, !"Dwarf Version", i32 2}
+!11 = !{i32 2, !"Debug Info Version", i32 3}
+!12 = !{!"clang version 5.0.0"}
+!13 = distinct !DISubprogram(name: "kernel1", scope: !3, file: !3, line: 4, type: !14, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, unit: !2, variables: !4)
+!14 = !DISubroutineType(types: !15)
+!15 = !{null, !16, !17, !17}
+!16 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned)
+!17 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !8, size: 64)
+!18 = !{i32 0, i32 1, i32 1}
+!19 = !{!"none", !"none", !"none"}
+!20 = !{!"uint", !"int*", !"int*"}
+!21 = !{!"", !"", !""}
+!22 = !DILocalVariable(name: "ArgN", arg: 1, scope: !13, file: !3, line: 4, type: !16)
+!23 = !DIExpression(DW_OP_constu, 1, DW_OP_swap, DW_OP_xderef)
+!24 = !DILocation(line: 4, column: 34, scope: !13)
+!25 = !DILocalVariable(name: "ArgA", arg: 2, scope: !13, file: !3, line: 4, type: !17)
+!26 = !DILocation(line: 4, column: 52, scope: !13)
+!27 = !DILocalVariable(name: "ArgB", arg: 3, scope: !13, file: !3, line: 4, type: !17)
+!28 = !DILocation(line: 4, column: 70, scope: !13)
+!29 = !DILocation(line: 5, column: 17, scope: !13)
+!30 = !DILocation(line: 5, column: 22, scope: !13)
+!31 = !DILocation(line: 5, column: 3, scope: !13)
+!32 = !DILocation(line: 5, column: 8, scope: !13)
+!33 = !DILocation(line: 5, column: 14, scope: !13)
+!34 = !DILocation(line: 6, column: 1, scope: !13)
diff --git a/test/DebugInfo/ARM/s-super-register.ll b/test/DebugInfo/ARM/s-super-register.ll
index ef2bc9ac1ec3403c3b2796320823c6577678e45f..de0284a9a55710edda7735afc669ea341a09269e 100644
--- a/test/DebugInfo/ARM/s-super-register.ll
+++ b/test/DebugInfo/ARM/s-super-register.ll
@@ -5,9 +5,7 @@ target triple = "thumbv7-apple-macosx10.6.7"
 ; The S registers on ARM are expressed as pieces of their super-registers in DWARF.
 ;
 ; 0x90   DW_OP_regx of super-register
-; 0x93   DW_OP_piece
-; 0x9d   DW_OP_bit_piece
-; CHECK:            Location description: 90 {{.. .. ((93 ..)|(9d .. ..)) $}}
+; CHECK:            Location description: 90
 
 define void @_Z3foov() optsize ssp !dbg !1 {
 entry:
diff --git a/test/DebugInfo/COFF/array-odr-violation.ll b/test/DebugInfo/COFF/array-odr-violation.ll
new file mode 100644
index 0000000000000000000000000000000000000000..471c18f00afd4b3383d82f286607a7e263218458
--- /dev/null
+++ b/test/DebugInfo/COFF/array-odr-violation.ll
@@ -0,0 +1,100 @@
+; This tests that emitting CodeView arrays doesn't assert when an ODR violation
+; makes our array dimension size calculations inaccurate. (PR32383)
+
+; Here was the scenario:
+; $ cat a.cpp
+; typedef union YYSTYPE { int x; } YYSTYPE;
+; YYSTYPE a;
+; $ cat b.cpp
+; typedef union YYSTYPE { char x; } YYSTYPE;
+; void fn1() { YYSTYPE a[1]; }
+; $ clang-cl -c -Zi -flto a.cpp b.cpp
+; $ llvm-link a.obj b.obj -S -o t.ll  # This is the test case IR.
+; $ llc t.ll  # Used to assert
+
+; RUN: llc < %s | FileCheck %s
+
+; FIXME: sizeof(a) in the user program is 1, but we claim it is 4 because
+; sometimes the frontend lies to us. See array-types-advanced.ll for an example.
+;
+; CHECK: Array ({{.*}}) {
+; CHECK:   TypeLeafKind: LF_ARRAY (0x1503)
+; CHECK:   ElementType: YYSTYPE ({{.*}})
+; CHECK:   IndexType: unsigned __int64 (0x23)
+; CHECK:   SizeOf: 4
+; CHECK:   Name:
+; CHECK: }
+
+; sizeof(YYSTYPE) == 4
+; CHECK: Union ({{.*}}) {
+; CHECK:   TypeLeafKind: LF_UNION (0x1506)
+; CHECK:   MemberCount: 1
+; CHECK:   Properties [ (0x600)
+; CHECK:     HasUniqueName (0x200)
+; CHECK:     Sealed (0x400)
+; CHECK:   ]
+; CHECK:   FieldList: <field list>
+; CHECK:   SizeOf: 4
+; CHECK:   Name: YYSTYPE
+; CHECK:   LinkageName: .?ATYYSTYPE@@
+; CHECK: }
+
+; ModuleID = 'llvm-link'
+source_filename = "llvm-link"
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.10.24728"
+
+%union.YYSTYPE = type { i32 }
+%union.YYSTYPE.0 = type { i8 }
+
+@"\01?a@@3TYYSTYPE@@A" = global %union.YYSTYPE zeroinitializer, align 4, !dbg !0
+
+; Function Attrs: noinline nounwind sspstrong uwtable
+define void @"\01?fn1@@YAXXZ"() #0 !dbg !21 {
+entry:
+  %a = alloca [1 x %union.YYSTYPE.0], align 1
+  call void @llvm.dbg.declare(metadata [1 x %union.YYSTYPE.0]* %a, metadata !24, metadata !29), !dbg !30
+  ret void, !dbg !30
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+attributes #0 = { noinline nounwind sspstrong uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!2, !11}
+!llvm.ident = !{!13, !13}
+!llvm.module.flags = !{!14, !18, !19, !20}
+
+!0 = !DIGlobalVariableExpression(var: !1)
+!1 = distinct !DIGlobalVariable(name: "a", linkageName: "\01?a@@3TYYSTYPE@@A", scope: !2, file: !3, line: 2, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 5.0.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "a.cpp", directory: "C:\5Csrc\5Cllvm-project\5Cbuild", checksumkind: CSK_MD5, checksum: "c0005139aa3df153c30d8c6953390a4b")
+!4 = !{}
+!5 = !{!0}
+!6 = !DIDerivedType(tag: DW_TAG_typedef, name: "YYSTYPE", file: !3, line: 1, baseType: !7)
+!7 = distinct !DICompositeType(tag: DW_TAG_union_type, name: "YYSTYPE", file: !3, line: 1, size: 32, elements: !8, identifier: ".?ATYYSTYPE@@")
+!8 = !{!9}
+!9 = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: !7, file: !3, line: 1, baseType: !10, size: 32)
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !12, producer: "clang version 5.0.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4)
+!12 = !DIFile(filename: "b.cpp", directory: "C:\5Csrc\5Cllvm-project\5Cbuild", checksumkind: CSK_MD5, checksum: "9cfd390d8827beab36769147bb037abc")
+!13 = !{!"clang version 5.0.0 "}
+!14 = !{i32 6, !"Linker Options", !15}
+!15 = !{!16, !17}
+!16 = !{!"/DEFAULTLIB:libcmt.lib"}
+!17 = !{!"/DEFAULTLIB:oldnames.lib"}
+!18 = !{i32 2, !"CodeView", i32 1}
+!19 = !{i32 2, !"Debug Info Version", i32 3}
+!20 = !{i32 1, !"PIC Level", i32 2}
+!21 = distinct !DISubprogram(name: "fn1", linkageName: "\01?fn1@@YAXXZ", scope: !12, file: !12, line: 2, type: !22, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !11, variables: !4)
+!22 = !DISubroutineType(types: !23)
+!23 = !{null}
+!24 = !DILocalVariable(name: "a", scope: !21, file: !12, line: 2, type: !25)
+!25 = !DICompositeType(tag: DW_TAG_array_type, baseType: !26, size: 8, elements: !27)
+!26 = !DIDerivedType(tag: DW_TAG_typedef, name: "YYSTYPE", file: !12, line: 1, baseType: !7)
+!27 = !{!28}
+!28 = !DISubrange(count: 1)
+!29 = !DIExpression()
+!30 = !DILocation(line: 2, scope: !21)
diff --git a/test/DebugInfo/COFF/globals.ll b/test/DebugInfo/COFF/globals.ll
index e560e4f9806f19992b5ba91abd8816dae404ca86..aadf6ab557f9b1cf4b53b59f39fd24da7cc6ced4 100644
--- a/test/DebugInfo/COFF/globals.ll
+++ b/test/DebugInfo/COFF/globals.ll
@@ -96,7 +96,7 @@
 ; OBJ:   ]
 ; OBJ: ]
 ; OBJ: CodeViewDebugInfo [
-; OBJ:   Section: .debug$S (7)
+; OBJ:   Section: .debug$S (8)
 ; OBJ:   Magic: 0x4
 ; OBJ:   Subsection [
 ; OBJ:     SubSectionType: Symbols (0xF1)
diff --git a/test/DebugInfo/COFF/typedef.ll b/test/DebugInfo/COFF/typedef.ll
index 1c4fe7ac7610f3b923c9340dfda97ce41ece0082..cf4e3df257de6936fd1764161bc11dbc681e17ae 100644
--- a/test/DebugInfo/COFF/typedef.ll
+++ b/test/DebugInfo/COFF/typedef.ll
@@ -31,7 +31,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata)
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4}
 
-!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1)
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, emissionKind: FullDebug)
 !1 = !DIFile(filename: "-", directory: "/usr/local/google/home/majnemer/llvm/src")
 !3 = !{i32 2, !"CodeView", i32 1}
 !4 = !{i32 2, !"Debug Info Version", i32 3}
diff --git a/test/DebugInfo/COFF/types-data-members.ll b/test/DebugInfo/COFF/types-data-members.ll
index 9276b962ac8cf39b3229ccd16db31ee247097929..275af969a48e9a4b475fee6b66e66d56333d39c0 100644
--- a/test/DebugInfo/COFF/types-data-members.ll
+++ b/test/DebugInfo/COFF/types-data-members.ll
@@ -37,7 +37,7 @@
 ; $ clang t.cpp -S -emit-llvm -g -gcodeview -o t.ll
 
 ; CHECK: CodeViewTypes [
-; CHECK:   Section: .debug$T (10)
+; CHECK:   Section: .debug$T (8)
 ; CHECK:   Magic: 0x4
 ; CHECK:   ArgList (0x1000) {
 ; CHECK:     TypeLeafKind: LF_ARGLIST (0x1201)
diff --git a/test/DebugInfo/Generic/2010-01-05-DbgScope.ll b/test/DebugInfo/Generic/2010-01-05-DbgScope.ll
index 008fd8fbd637ee67b39325313c9cb0d39a145928..031d64e17f697f6fa240598b1759bcb9ed74692d 100644
--- a/test/DebugInfo/Generic/2010-01-05-DbgScope.ll
+++ b/test/DebugInfo/Generic/2010-01-05-DbgScope.ll
@@ -13,7 +13,7 @@ entry:
 
 !0 = !DILocation(line: 571, column: 3, scope: !1)
 !1 = distinct !DILexicalBlock(line: 1, column: 1, file: !11, scope: !2)
-!2 = distinct !DISubprogram(name: "foo", linkageName: "foo", line: 561, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, scope: !3, type: !4)
+!2 = distinct !DISubprogram(name: "foo", linkageName: "foo", file: !11, line: 561, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !3, scope: !3, type: !4)
 !3 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang 1.1", isOptimized: true, emissionKind: FullDebug, file: !11, enums: !12, retainedTypes: !12)
 !4 = !DISubroutineType(types: !5)
 !5 = !{!6}
diff --git a/test/DebugInfo/Generic/2010-06-29-InlinedFnLocalVar.ll b/test/DebugInfo/Generic/2010-06-29-InlinedFnLocalVar.ll
index 31d3487db7a67392fbb40bdf658941c32df3d8d2..0996cab00da2ae1b975f713389c925f70427d5a6 100644
--- a/test/DebugInfo/Generic/2010-06-29-InlinedFnLocalVar.ll
+++ b/test/DebugInfo/Generic/2010-06-29-InlinedFnLocalVar.ll
@@ -1,7 +1,7 @@
 ; RUN: %llc_dwarf -O2 %s -o - | FileCheck %s
 ; Check struct X for dead variable xyz from inlined function foo.
 
-; CHECK: section_info
+; CHECK: debug_info,
 ; CHECK:	DW_TAG_structure_type
 ; CHECK-NEXT:	DW_AT_name
 
diff --git a/test/DebugInfo/Generic/array.ll b/test/DebugInfo/Generic/array.ll
index 7b4ff7cb805a7f64c0c1d81c494ef78d4b15d5b2..c3c592885184371707fd7392a2e1ba6ec6f986a8 100644
--- a/test/DebugInfo/Generic/array.ll
+++ b/test/DebugInfo/Generic/array.ll
@@ -25,7 +25,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !7 = distinct !DILexicalBlock(line: 3, column: 12, file: !14, scope: !0)
 !8 = !DICompositeType(tag: DW_TAG_array_type, align: 32, file: !14, scope: !2, baseType: !5, elements: !9)
 !9 = !{!10}
-;CHECK: section_info:
+;CHECK: debug_info,
 ;CHECK: DW_TAG_subrange_type
 ;CHECK-NEXT: DW_AT_type
 ;CHECK-NOT: DW_AT_lower_bound
diff --git a/test/DebugInfo/Generic/debuginfofinder-inlined-cu.ll b/test/DebugInfo/Generic/debuginfofinder-inlined-cu.ll
new file mode 100644
index 0000000000000000000000000000000000000000..313e22d84f352e4f20c2d3a51abe98cdef664043
--- /dev/null
+++ b/test/DebugInfo/Generic/debuginfofinder-inlined-cu.ll
@@ -0,0 +1,31 @@
+; RUN: opt -analyze -module-debuginfo < %s | FileCheck %s
+
+; Verify that both compile units are found, even though one compile unit's
+; functions were entirely inlined into the other.
+;CHECK: Compile unit: DW_LANG_C99 from /tmp/test1.c
+;CHECK: Compile unit: DW_LANG_C99 from /tmp/test2.c
+;CHECK: Subprogram: f from /tmp/test1.c:1
+;CHECK: Subprogram: g from /tmp/test2.c:1
+
+define void @f() !dbg !4 {
+  ret void, !dbg !15
+}
+
+!llvm.dbg.cu = !{!0, !8}
+!llvm.module.flags = !{!13, !16}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.4 (192092)", isOptimized: false, emissionKind: FullDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "test1.c", directory: "/tmp")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "f", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, variables: !2)
+!5 = !DIFile(filename: "test1.c", directory: "/tmp")
+!6 = !DISubroutineType(types: !7)
+!7 = !{null}
+!8 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.4 (192092)", isOptimized: false, emissionKind: FullDebug, file: !9, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!9 = !DIFile(filename: "test2.c", directory: "/tmp")
+!11 = distinct !DISubprogram(name: "g", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !8, scopeLine: 1, file: !9, scope: !12, type: !6, variables: !2)
+!12 = !DIFile(filename: "test2.c", directory: "/tmp")
+!13 = !{i32 2, !"Dwarf Version", i32 4}
+!14 = !DILocation(line: 1, scope: !4)
+!15 = !DILocation(line: 1, scope: !11, inlinedAt: !14)
+!16 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/DebugInfo/Generic/gmlt.test b/test/DebugInfo/Generic/gmlt.test
index d098d1e9125331524dfba8f6f5aa673a0e8eed3a..e36a3bdea35534d6523a05655907b734612df359 100644
--- a/test/DebugInfo/Generic/gmlt.test
+++ b/test/DebugInfo/Generic/gmlt.test
@@ -1,6 +1,5 @@
 ; REQUIRES: object-emission
 ; RUN: %llc_dwarf -O0 -filetype=obj < %S/../Inputs/gmlt.ll | llvm-dwarfdump - | FileCheck %S/../Inputs/gmlt.ll
-; RUN: %llc_dwarf -O0 -filetype=obj -debug-info-for-profiling < %S/../Inputs/gmlt.ll | llvm-dwarfdump - | FileCheck %S/../Inputs/gmlt.ll --check-prefixes=PROFILING
 
 ; There's a darwin specific test in X86/gmlt, so it's okay to XFAIL this here.
 ; XFAIL: darwin
diff --git a/test/DebugInfo/Generic/gmlt_profiling.ll b/test/DebugInfo/Generic/gmlt_profiling.ll
new file mode 100644
index 0000000000000000000000000000000000000000..551959caa15e80a1c5c1a627bc31caee241819d6
--- /dev/null
+++ b/test/DebugInfo/Generic/gmlt_profiling.ll
@@ -0,0 +1,32 @@
+; REQUIRES: object-emission
+; RUN: %llc_dwarf -O0 -filetype=obj < %S/gmlt_profiling.ll | llvm-dwarfdump - | FileCheck %S/gmlt_profiling.ll
+
+; CHECK: .debug_info
+; CHECK: DW_TAG_subprogram
+; CHECK-NOT: DW_TAG
+; CHECK: DW_AT_name {{.*}} "f1"
+; With debug-info-for-profiling attribute, we need to emit decl_file and
+; decl_line of the subprogram.
+; CHECK-NEXT: DW_AT_decl_file
+; CHECK-NEXT: DW_AT_decl_line
+
+; Function Attrs: nounwind uwtable
+define void @_Z2f1v() !dbg !4 {
+entry:
+  ret void, !dbg !13
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!10, !11}
+!llvm.ident = !{!12}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.6.0 ", isOptimized: false, emissionKind: LineTablesOnly, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2, debugInfoForProfiling: true)
+!1 = !DIFile(filename: "gmlt.cpp", directory: "/tmp/dbginfo")
+!2 = !{}
+!4 = distinct !DISubprogram(name: "f1", line: 1, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 1, file: !1, scope: !5, type: !6, variables: !2)
+!5 = !DIFile(filename: "gmlt.cpp", directory: "/tmp/dbginfo")
+!6 = !DISubroutineType(types: !2)
+!10 = !{i32 2, !"Dwarf Version", i32 4}
+!11 = !{i32 2, !"Debug Info Version", i32 3}
+!12 = !{!"clang version 3.6.0 "}
+!13 = !DILocation(line: 1, column: 12, scope: !4)
diff --git a/test/DebugInfo/Generic/invalid.ll b/test/DebugInfo/Generic/invalid.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fdb68d9cca3f1bb9399e33285867f6ca25d1b24e
--- /dev/null
+++ b/test/DebugInfo/Generic/invalid.ll
@@ -0,0 +1,17 @@
+; RUN: not opt -verify %s 2>&1 | FileCheck %s
+
+; Make sure we emit this diagnostic only once (which means we don't visit the
+; same DISubprogram twice).
+; CHECK: subprogram definitions must have a compile unit
+; CHECK-NEXT: !3 = distinct !DISubprogram(name: "patatino", scope: null, isLocal: false, isDefinition: true, isOptimized: false)
+; CHECK-NOT: subprogram definitions must have a compile unit
+; CHECK-NOT: !3 = distinct !DISubprogram(name: "patatino", scope: null, isLocal: false, isDefinition: true, isOptimized: false)
+
+define void @tinkywinky() !dbg !3 { ret void }
+
+!llvm.module.flags = !{!4}
+!llvm.dbg.cu = !{!0}
+!0 = distinct !DICompileUnit(language: 12, file: !1)
+!1 = !DIFile(filename: "/home/davide", directory: "/home/davide")
+!3 = distinct !DISubprogram(name: "patatino", isDefinition: true)
+!4 = !{i32 2, !"Debug Info Version", i32 3}
diff --git a/test/DebugInfo/Generic/store-tail-merge.ll b/test/DebugInfo/Generic/store-tail-merge.ll
new file mode 100644
index 0000000000000000000000000000000000000000..624f30416e0fe749fb05ca277bfc629af35be8ed
--- /dev/null
+++ b/test/DebugInfo/Generic/store-tail-merge.ll
@@ -0,0 +1,72 @@
+; RUN: opt -instcombine -S < %s | FileCheck %s
+;
+; Generated with:
+;
+; clang -S -gmlt -emit-llvm test.c -o 1.ll
+; opt -sroa -S 1.ll -o test.ll
+;
+; extern int bar(int i);
+; extern int bar2(int i);
+;
+; int foo(int a, int *d) {
+;   if(a) {
+;       *d = bar(a);
+;   } else {
+;       *d = bar2(a);
+;   }
+;
+;   return a;
+; }
+;
+; CHECK:       define {{.*}}@foo
+; CHECK:       if.end:
+; CHECK-NEXT:  %storemerge = phi
+; This final check is the "real" test, verify no !dbg on the store.
+; CHECK-NEXT:  store i32 %storemerge{{.*}}, align 4{{$}}
+;
+; ModuleID = 'test1.ll'
+source_filename = "test.c"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline nounwind uwtable
+define i32 @foo(i32 %a, i32* %d) !dbg !6 {
+entry:
+  %tobool = icmp ne i32 %a, 0, !dbg !8
+  br i1 %tobool, label %if.then, label %if.else, !dbg !8
+
+if.then:                                          ; preds = %entry
+  %call = call i32 @bar(i32 %a), !dbg !9
+  store i32 %call, i32* %d, align 4, !dbg !10
+  br label %if.end, !dbg !11
+
+if.else:                                          ; preds = %entry
+  %call1 = call i32 @bar2(i32 %a), !dbg !12
+  store i32 %call1, i32* %d, align 4, !dbg !13
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret i32 %a, !dbg !14
+}
+
+declare i32 @bar(i32)
+
+declare i32 @bar2(i32)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "test.c", directory: "/home/probinson/projects/scratch")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 4, type: !7, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !2)
+!8 = !DILocation(line: 5, column: 6, scope: !6)
+!9 = !DILocation(line: 6, column: 12, scope: !6)
+!10 = !DILocation(line: 6, column: 10, scope: !6)
+!11 = !DILocation(line: 7, column: 3, scope: !6)
+!12 = !DILocation(line: 8, column: 12, scope: !6)
+!13 = !DILocation(line: 8, column: 10, scope: !6)
+!14 = !DILocation(line: 10, column: 3, scope: !6)
diff --git a/test/DebugInfo/Inputs/dwarfdump-header.elf-x86-64 b/test/DebugInfo/Inputs/dwarfdump-header.elf-x86-64
new file mode 100644
index 0000000000000000000000000000000000000000..447813419e3ee74e8b0241a2608db17b8acd7b24
Binary files /dev/null and b/test/DebugInfo/Inputs/dwarfdump-header.elf-x86-64 differ
diff --git a/test/DebugInfo/Inputs/dwarfdump-header.s b/test/DebugInfo/Inputs/dwarfdump-header.s
new file mode 100644
index 0000000000000000000000000000000000000000..ce51e987f38a95d32cee345f1f48af35c7c70a0d
--- /dev/null
+++ b/test/DebugInfo/Inputs/dwarfdump-header.s
@@ -0,0 +1,149 @@
+# Test object to verify dwarfdump handles v4 and v5 CU/TU headers.
+# We have a representative set of units: v4 CU, v5 CU, v4 TU, v5 split TU.
+#
+# To generate the test object:
+# llvm-mc -triple x86_64-unknown-linux dwarfdump-header.s -filetype=obj \
+#         -o dwarfdump-header.elf-x86-64
+
+        .section .debug_str,"MS",@progbits,1
+str_producer:
+        .asciz "Handmade DWARF producer"
+str_CU_4:
+        .asciz "V4_compile_unit"
+str_CU_5:
+        .asciz "V5_compile_unit"
+str_TU_4:
+        .asciz "V4_type_unit"
+
+        .section .debug_str.dwo,"MS",@progbits,1
+dwo_TU_5:
+        .asciz "V5_split_type_unit"
+
+# All CUs/TUs use the same abbrev section for simplicity.
+        .section .debug_abbrev,"",@progbits
+        .byte 0x01  # Abbrev code
+        .byte 0x11  # DW_TAG_compile_unit
+        .byte 0x00  # DW_CHILDREN_no
+        .byte 0x25  # DW_AT_producer
+        .byte 0x0e  # DW_FORM_strp
+        .byte 0x03  # DW_AT_name
+        .byte 0x0e  # DW_FORM_strp
+        .byte 0x00  # EOM(1)
+        .byte 0x00  # EOM(2)
+        .byte 0x02  # Abbrev code
+        .byte 0x41  # DW_TAG_type_unit
+        .byte 0x01  # DW_CHILDREN_yes
+        .byte 0x03  # DW_AT_name
+        .byte 0x0e  # DW_FORM_strp
+        .byte 0x00  # EOM(1)
+        .byte 0x00  # EOM(2)
+        .byte 0x03  # Abbrev code
+        .byte 0x13  # DW_TAG_structure_type
+        .byte 0x00  # DW_CHILDREN_no (no members)
+        .byte 0x03  # DW_AT_name
+        .byte 0x0e  # DW_FORM_strp
+        .byte 0x00  # EOM(1)
+        .byte 0x00  # EOM(2)
+        .byte 0x00  # EOM(3)
+
+# And a .dwo copy for the .dwo sections.
+        .section .debug_abbrev.dwo,"",@progbits
+        .byte 0x01  # Abbrev code
+        .byte 0x11  # DW_TAG_compile_unit
+        .byte 0x00  # DW_CHILDREN_no
+        .byte 0x25  # DW_AT_producer
+        .byte 0x0e  # DW_FORM_strp
+        .byte 0x03  # DW_AT_name
+        .byte 0x0e  # DW_FORM_strp
+        .byte 0x00  # EOM(1)
+        .byte 0x00  # EOM(2)
+        .byte 0x02  # Abbrev code
+        .byte 0x41  # DW_TAG_type_unit
+        .byte 0x01  # DW_CHILDREN_yes
+        .byte 0x03  # DW_AT_name
+        .byte 0x0e  # DW_FORM_strp
+        .byte 0x00  # EOM(1)
+        .byte 0x00  # EOM(2)
+        .byte 0x03  # Abbrev code
+        .byte 0x13  # DW_TAG_structure_type
+        .byte 0x00  # DW_CHILDREN_no (no members)
+        .byte 0x03  # DW_AT_name
+        .byte 0x0e  # DW_FORM_strp
+        .byte 0x00  # EOM(1)
+        .byte 0x00  # EOM(2)
+        .byte 0x00  # EOM(3)
+
+        .section .debug_info,"",@progbits
+
+# DWARF v4 CU header. V4 CU headers all look the same so we do only one.
+        .long  CU_4_end-CU_4_version  # Length of Unit
+CU_4_version:
+        .short 4               # DWARF version number
+        .long .debug_abbrev    # Offset Into Abbrev. Section
+        .byte 8                # Address Size (in bytes)
+# The compile-unit DIE, which has just DW_AT_producer and DW_AT_name.
+        .byte 1
+        .long str_producer
+        .long str_CU_4
+        .byte 0 # NULL
+CU_4_end:
+
+# DWARF v5 normal CU header.
+        .long  CU_5_end-CU_5_version  # Length of Unit
+CU_5_version:
+        .short 5               # DWARF version number
+        .byte 1                # DWARF Unit Type
+        .byte 8                # Address Size (in bytes)
+        .long .debug_abbrev    # Offset Into Abbrev. Section
+# The compile-unit DIE, which has just DW_AT_producer and DW_AT_name.
+        .byte 1
+        .long str_producer
+        .long str_CU_5
+        .byte 0 # NULL
+CU_5_end:
+
+        .section .debug_types,"",@progbits
+
+# DWARF v4 Type unit header. Normal/split are identical so we do only one.
+TU_4_start:
+        .long  TU_4_end-TU_4_version  # Length of Unit
+TU_4_version:
+        .short 4               # DWARF version number
+        .long .debug_abbrev    # Offset Into Abbrev. Section
+        .byte 8                # Address Size (in bytes)
+        .quad 0x0011223344556677 # Type Signature
+        .long TU_4_type-TU_4_start # Type offset
+# The type-unit DIE, which has a name.
+        .byte 2
+        .long str_TU_4
+# The type DIE, which has a name.
+TU_4_type:
+        .byte 3
+        .long str_TU_4
+        .byte 0 # NULL
+        .byte 0 # NULL
+TU_4_end:
+
+        .section .debug_types.dwo,"",@progbits
+# FIXME: DWARF v5 wants type units in .debug_info[.dwo] not .debug_types[.dwo].
+
+# DWARF v5 split type unit header.
+TU_split_5_start:
+        .long  TU_split_5_end-TU_split_5_version  # Length of Unit
+TU_split_5_version:
+        .short 5               # DWARF version number
+        .byte 6                # DWARF Unit Type
+        .byte 8                # Address Size (in bytes)
+        .long .debug_abbrev.dwo    # Offset Into Abbrev. Section
+        .quad 0x8899aabbccddeeff # Type Signature
+        .long TU_split_5_type-TU_split_5_start  # Type offset
+# The type-unit DIE, which has a name.
+        .byte 2
+        .long dwo_TU_5
+# The type DIE, which has a name.
+TU_split_5_type:
+        .byte 3
+        .long dwo_TU_5
+        .byte 0 # NULL
+        .byte 0 # NULL
+TU_split_5_end:
diff --git a/test/DebugInfo/Inputs/gmlt.ll b/test/DebugInfo/Inputs/gmlt.ll
index b6df50109ff026c819f7bb4f16ba3a7c7c8131ca..116cd75b811007cda3992046595333b1fb3a472d 100644
--- a/test/DebugInfo/Inputs/gmlt.ll
+++ b/test/DebugInfo/Inputs/gmlt.ll
@@ -76,13 +76,6 @@
 ; CHECK-NOT: {{DW_TAG|DW_AT}}
 ; CHECK: NULL
 
-; PROFILING: DW_TAG_subprogram
-; PROFILING: DW_AT_name {{.*}} "f1"
-; With -debug-info-for-profiling, we need to emit decl_file a-nd decl_line
-; of the subprogram.
-; PROFILING: DW_AT_decl_file
-; PROFILING: DW_AT_decl_line
-
 ; CHECK: .debug_ranges contents:
 
 ; ... some addresses (depends on platform (such as platforms with function
diff --git a/test/DebugInfo/MIR/ARM/split-superreg-complex.mir b/test/DebugInfo/MIR/ARM/split-superreg-complex.mir
new file mode 100644
index 0000000000000000000000000000000000000000..2e8d9977a649d3154d484f46d822c1c5b3e6c0ed
--- /dev/null
+++ b/test/DebugInfo/MIR/ARM/split-superreg-complex.mir
@@ -0,0 +1,122 @@
+# RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s
+#
+# This is an artificial example of a debug value residing in a composite
+# location with a complex expression. Because the semantics of applying a DWARF
+# expression to a composite location are ill-defined, the compiler should bail
+# out of emitting a location.
+#
+# CHECK: .debug_info contents:
+# CHECK: DW_TAG_variable
+# CHECK-NOT: DW_AT_location
+# CHECK: DW_TAG
+--- |
+  ; Generated from:
+  ; typedef float vec2 __attribute__((vector_size(16)));
+  ; vec2 v();
+  ; float f() {
+  ;   vec2 vec = v();
+  ;   return vec[0] + vec[1];
+  ; }
+
+  target datalayout = "e-m:o-p:32:32-f64:32:64-v64:32:64-v128:32:128-a:0:32-n32-S32"
+  target triple = "thumbv7s-apple-ios5.0.0"
+  
+  define float @f() local_unnamed_addr #0 !dbg !9 {
+  entry:
+    %call = tail call <4 x float> bitcast (<4 x float> (...)* @v to <4 x float> ()*)() #0, !dbg !19
+    tail call void @llvm.dbg.value(metadata <4 x float> %call, i64 0, metadata !14, metadata !20), !dbg !21
+    %vecext = extractelement <4 x float> %call, i32 0, !dbg !22
+    %vecext1 = extractelement <4 x float> %call, i32 1, !dbg !23
+    %add = fadd float %vecext, %vecext1, !dbg !24
+    ret float %add, !dbg !25
+  }
+  
+  declare arm_aapcs_vfpcc <4 x float> @v(...) local_unnamed_addr #0
+  declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #0
+  
+  attributes #0 = { nounwind readnone }
+  
+  !llvm.dbg.cu = !{!0}
+  !llvm.module.flags = !{!3, !4}
+  !llvm.ident = !{!8}
+  
+  !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 286322) (llvm/trunk 286305)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+  !1 = !DIFile(filename: "v.c", directory: "/")
+  !2 = !{}
+  !3 = !{i32 2, !"Dwarf Version", i32 2}
+  !4 = !{i32 2, !"Debug Info Version", i32 3}
+  !8 = !{!"clang version 4.0.0 (trunk 286322) (llvm/trunk 286305)"}
+  !9 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 3, type: !10, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: true, unit: !0, variables: !13)
+  !10 = !DISubroutineType(types: !11)
+  !11 = !{!12}
+  !12 = !DIBasicType(name: "float", size: 32, encoding: DW_ATE_float)
+  !13 = !{!14}
+  !14 = !DILocalVariable(name: "vec", scope: !9, file: !1, line: 4, type: !15)
+  !15 = !DIDerivedType(tag: DW_TAG_typedef, name: "vec2", file: !1, line: 1, baseType: !16)
+  !16 = !DICompositeType(tag: DW_TAG_array_type, baseType: !12, size: 128, flags: DIFlagVector, elements: !17)
+  !17 = !{!18}
+  !18 = !DISubrange(count: 4)
+  !19 = !DILocation(line: 4, column: 13, scope: !9)
+  !20 = !DIExpression(DW_OP_plus, 1, DW_OP_minus, 1)
+  !21 = !DILocation(line: 4, column: 7, scope: !9)
+  !22 = !DILocation(line: 5, column: 9, scope: !9)
+  !23 = !DILocation(line: 5, column: 18, scope: !9)
+  !24 = !DILocation(line: 5, column: 16, scope: !9)
+  !25 = !DILocation(line: 5, column: 2, scope: !9)
+
+...
+---
+name:            f
+alignment:       1
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+calleeSavedRegisters: [ '%lr', '%d8', '%d9', '%d10', '%d11', '%d12', '%d13', 
+                        '%d14', '%d15', '%q4', '%q5', '%q6', '%q7', '%r4', 
+                        '%r5', '%r6', '%r7', '%r8', '%r10', '%r11', '%s16', 
+                        '%s17', '%s18', '%s19', '%s20', '%s21', '%s22', 
+                        '%s23', '%s24', '%s25', '%s26', '%s27', '%s28', 
+                        '%s29', '%s30', '%s31', '%d8_d10', '%d9_d11', '%d10_d12', 
+                        '%d11_d13', '%d12_d14', '%d13_d15', '%q4_q5', '%q5_q6', 
+                        '%q6_q7', '%q4_q5_q6_q7', '%r4_r5', '%r6_r7', '%r10_r11', 
+                        '%d8_d9_d10', '%d9_d10_d11', '%d10_d11_d12', '%d11_d12_d13', 
+                        '%d12_d13_d14', '%d13_d14_d15', '%d8_d10_d12', 
+                        '%d9_d11_d13', '%d10_d12_d14', '%d11_d13_d15', 
+                        '%d8_d10_d12_d14', '%d9_d11_d13_d15', '%d9_d10', 
+                        '%d11_d12', '%d13_d14', '%d9_d10_d11_d12', '%d11_d12_d13_d14' ]
+frameInfo:       
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       4
+  offsetAdjustment: 0
+  maxAlignment:    4
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+stack:           
+  - { id: 0, type: spill-slot, offset: -4, size: 4, alignment: 4, callee-saved-register: '%lr' }
+body:             |
+  bb.0.entry:
+    liveins: %lr
+  
+    early-clobber %sp = frame-setup t2STR_PRE killed undef %lr, %sp, -4, 14, _
+    frame-setup CFI_INSTRUCTION def_cfa_offset 4
+    frame-setup CFI_INSTRUCTION offset %lr, -4
+    tBL 14, _, @v, csr_ios, implicit-def dead %lr, implicit %sp, implicit-def %sp, implicit-def %r0, implicit-def %r1, implicit-def %r2, implicit-def %r3, debug-location !19
+    %d1 = VMOVDRR killed %r2, killed %r3, 14, _, implicit-def %q0, debug-location !19
+    %d0 = VMOVDRR killed %r0, killed %r1, 14, _, implicit killed %q0, implicit-def %q0, debug-location !19
+    DBG_VALUE debug-use %q0, debug-use _, !14, !20, debug-location !21
+    %s4 = VMOVS %s1, 14, _, implicit-def %d2, debug-location !24
+    %d0 = VADDfd %d0, killed %d2, 14, _, implicit killed %q0, debug-location !24
+    %r0 = VMOVRS %s0, 14, _, implicit killed %d0, debug-location !25
+    %lr, %sp = t2LDR_POST %sp, 4, 14, _, debug-location !25
+    tBX_RET 14, _, implicit %r0, debug-location !25
+
+...
diff --git a/test/DebugInfo/MIR/X86/live-debug-values-spill.mir b/test/DebugInfo/MIR/X86/live-debug-values-spill.mir
new file mode 100644
index 0000000000000000000000000000000000000000..c0d0d70105645339deb10343555527304a063ddd
--- /dev/null
+++ b/test/DebugInfo/MIR/X86/live-debug-values-spill.mir
@@ -0,0 +1,468 @@
+# RUN: llc -run-pass=livedebugvalues -march=x86-64 -o - %s | FileCheck -check-prefix=GENERATE %s
+# RUN: llc -run-pass=livedebugvalues -march=x86-64 -o - %s | FileCheck -check-prefix=TERMINATE %s
+# 
+# Check that spills are recognized in the Live Debug Values pass and that
+# DBG_VALUE instructions are generated to keep track of spilled user
+# variables.
+# In addition, we check that the ranges of spilled debug values are properly
+# extended.
+#
+# Test case generated from:
+#
+# extern void use (int);
+# extern void set (int *, int *, int *);
+# 
+# int glob0, glob1, glob2, glob3, glob4, glob5;
+# 
+# void foo(int b0, int b1, int int0, int int1, int int2,
+#          int int3, int int4)
+# {
+#   int inta = glob0;
+#   int intb = glob1;
+#   int intc = glob2;
+#   int intd = glob3;
+#   int inte = glob4;
+#   int intf = glob5;
+#   int intg;
+# 
+#   if (b0)
+#     return;
+# 
+#   int0 += (int1 + int2 + int3) * int4;
+#   use(intf);
+#   use(inte);
+# 
+#   if (b1) {
+#     set(&inte, &intf, &intg);
+#     int0 = (int1 + int2 + int3) * int4;
+#     inta = (intb*inte + intc*inte + intd) * inte;
+#   }
+#   int0 += int4 * inta;
+#   use(int0);
+# }
+#
+#
+# Generated with 
+# clang -g -O2 -S -emit-llvm -fno-omit-frame-pointer spill1.c
+# llc -stop-after=funclet-layout < spill1.ll > spill1.mir                
+#
+# Make sure that we generated DBG_VALUE instructions for the spills
+# GENERATE:      bb.1.if.end:
+# GENERATE:      MOV32mr %rbp, 1, _, -48, _, killed %edx :: (store 4 into %stack.5)
+# GENERATE-NEXT: DBG_VALUE debug-use %rbp, -48, !26, !38
+# GENERATE:      MOV32mr %rbp, 1, _, -52, _, killed %r8d :: (store 4 into %stack.4)
+# GENERATE-NEXT: DBG_VALUE debug-use %rbp, -52, !32, !38
+# GENERATE:      MOV32mr %rbp, 1, _, -56, _, killed %esi :: (store 4 into %stack.3)
+# GENERATE-NEXT: DBG_VALUE debug-use %rbp, -56, !34, !38
+#
+# Check that the spill locations that are valid at the end of bb.1.if.end are
+# propagated to subsequent BBs.
+#
+# GENERATE:      bb.2.if.then4:
+# GENERATE-NOT:  bb.3:
+# GENERATE-DAG:  DBG_VALUE debug-use %rbp, -56, !34, !38
+# GENERATE-DAG:  DBG_VALUE debug-use %rbp, -52, !32, !38
+#
+# GENERATE:      bb.3:
+# GENERATE-NOT:  bb.4.if.end13:
+# GENERATE-DAG:  DBG_VALUE debug-use %rbp, -56, !34, !38
+# GENERATE-DAG:  DBG_VALUE debug-use %rbp, -52, !32, !38
+#
+# GENERATE:      bb.4.if.end13:
+# GENERATE-NOT:  bb.5.cleanup:
+# GENERATE-DAG:  DBG_VALUE debug-use %rbp, -56, !34, !38
+# GENERATE-DAG:  DBG_VALUE debug-use %rbp, -52, !32, !38
+# 
+# Check that the spill location rbp-48 (the variable int0) is not propagated 
+# because int0 is redefined within the same basic block.
+#
+# TERMINATE:     bb.2.if.then4:
+# TERMINATE-NOT: DBG_VALUE debug-use %rbp, -48, !26, !38
+--- |
+  ; ModuleID = '<stdin>'
+  source_filename = "spill1.c"
+  target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+  target triple = "x86_64-unknown-linux-gnu"
+  
+  @glob0 = common local_unnamed_addr global i32 0, align 4, !dbg !0
+  @glob1 = common local_unnamed_addr global i32 0, align 4, !dbg !6
+  @glob2 = common local_unnamed_addr global i32 0, align 4, !dbg !9
+  @glob3 = common local_unnamed_addr global i32 0, align 4, !dbg !11
+  @glob4 = common local_unnamed_addr global i32 0, align 4, !dbg !13
+  @glob5 = common local_unnamed_addr global i32 0, align 4, !dbg !15
+  
+  ; Function Attrs: nounwind uwtable
+  define void @foo(i32 %b0, i32 %b1, i32 %int0, i32 %int1, i32 %int2, i32 %int3, i32 %int4) local_unnamed_addr #0 !dbg !20 {
+  entry:
+    %inte = alloca i32, align 4
+    %intf = alloca i32, align 4
+    %intg = alloca i32, align 4
+    tail call void @llvm.dbg.value(metadata i32 %b0, i64 0, metadata !24, metadata !38), !dbg !39
+    tail call void @llvm.dbg.value(metadata i32 %b1, i64 0, metadata !25, metadata !38), !dbg !40
+    tail call void @llvm.dbg.value(metadata i32 %int0, i64 0, metadata !26, metadata !38), !dbg !41
+    tail call void @llvm.dbg.value(metadata i32 %int1, i64 0, metadata !27, metadata !38), !dbg !42
+    tail call void @llvm.dbg.value(metadata i32 %int2, i64 0, metadata !28, metadata !38), !dbg !43
+    tail call void @llvm.dbg.value(metadata i32 %int3, i64 0, metadata !29, metadata !38), !dbg !44
+    tail call void @llvm.dbg.value(metadata i32 %int4, i64 0, metadata !30, metadata !38), !dbg !45
+    %0 = load i32, i32* @glob0, align 4, !dbg !46, !tbaa !47
+    tail call void @llvm.dbg.value(metadata i32 %0, i64 0, metadata !31, metadata !38), !dbg !51
+    %1 = load i32, i32* @glob1, align 4, !dbg !52, !tbaa !47
+    tail call void @llvm.dbg.value(metadata i32 %1, i64 0, metadata !32, metadata !38), !dbg !53
+    %2 = load i32, i32* @glob2, align 4, !dbg !54, !tbaa !47
+    tail call void @llvm.dbg.value(metadata i32 %2, i64 0, metadata !33, metadata !38), !dbg !55
+    %3 = load i32, i32* @glob3, align 4, !dbg !56, !tbaa !47
+    tail call void @llvm.dbg.value(metadata i32 %3, i64 0, metadata !34, metadata !38), !dbg !57
+    %4 = bitcast i32* %inte to i8*, !dbg !58
+    call void @llvm.lifetime.start(i64 4, i8* nonnull %4) #4, !dbg !58
+    %5 = load i32, i32* @glob4, align 4, !dbg !59, !tbaa !47
+    tail call void @llvm.dbg.value(metadata i32 %5, i64 0, metadata !35, metadata !38), !dbg !60
+    tail call void @llvm.dbg.value(metadata i32 %5, i64 0, metadata !35, metadata !38), !dbg !60
+    store i32 %5, i32* %inte, align 4, !dbg !60, !tbaa !47
+    %6 = bitcast i32* %intf to i8*, !dbg !61
+    call void @llvm.lifetime.start(i64 4, i8* nonnull %6) #4, !dbg !61
+    %7 = load i32, i32* @glob5, align 4, !dbg !62, !tbaa !47
+    tail call void @llvm.dbg.value(metadata i32 %7, i64 0, metadata !36, metadata !38), !dbg !63
+    tail call void @llvm.dbg.value(metadata i32 %7, i64 0, metadata !36, metadata !38), !dbg !63
+    store i32 %7, i32* %intf, align 4, !dbg !63, !tbaa !47
+    %8 = bitcast i32* %intg to i8*, !dbg !64
+    call void @llvm.lifetime.start(i64 4, i8* nonnull %8) #4, !dbg !64
+    %tobool = icmp eq i32 %b0, 0, !dbg !65
+    br i1 %tobool, label %if.end, label %cleanup, !dbg !67
+  
+  if.end:                                           ; preds = %entry
+    %add = add nsw i32 %int2, %int1, !dbg !68
+    %add1 = add nsw i32 %add, %int3, !dbg !69
+    %mul = mul nsw i32 %add1, %int4, !dbg !70
+    call void @llvm.dbg.value(metadata i32 %mul, i64 0, metadata !26, metadata !38), !dbg !41
+    %add2 = add nsw i32 %mul, %int0, !dbg !71
+    tail call void @llvm.dbg.value(metadata i32 %add2, i64 0, metadata !26, metadata !38), !dbg !41
+    tail call void @use(i32 %7) #4, !dbg !72
+    tail call void @use(i32 %5) #4, !dbg !73
+    %tobool3 = icmp eq i32 %b1, 0, !dbg !74
+    br i1 %tobool3, label %if.end13, label %if.then4, !dbg !76
+  
+  if.then4:                                         ; preds = %if.end
+    tail call void @llvm.dbg.value(metadata i32* %inte, i64 0, metadata !35, metadata !77), !dbg !60
+    tail call void @llvm.dbg.value(metadata i32* %intf, i64 0, metadata !36, metadata !77), !dbg !63
+    tail call void @llvm.dbg.value(metadata i32* %intg, i64 0, metadata !37, metadata !77), !dbg !78
+    call void @set(i32* nonnull %inte, i32* nonnull %intf, i32* nonnull %intg) #4, !dbg !79
+    %9 = load i32, i32* %inte, align 4, !dbg !81, !tbaa !47
+    call void @llvm.dbg.value(metadata i32 %9, i64 0, metadata !35, metadata !38), !dbg !60
+    %mul833 = add i32 %2, %1, !dbg !82
+    %add10 = mul i32 %9, %mul833, !dbg !82
+    %add11 = add nsw i32 %add10, %3, !dbg !83
+    %mul12 = mul nsw i32 %add11, %9, !dbg !84
+    call void @llvm.dbg.value(metadata i32 %mul12, i64 0, metadata !31, metadata !38), !dbg !51
+    br label %if.end13, !dbg !85
+  
+  if.end13:                                         ; preds = %if.then4, %if.end
+    %inta.0 = phi i32 [ %mul12, %if.then4 ], [ %0, %if.end ]
+    %int0.addr.0 = phi i32 [ %mul, %if.then4 ], [ %add2, %if.end ]
+    call void @llvm.dbg.value(metadata i32 %inta.0, i64 0, metadata !31, metadata !38), !dbg !51
+    call void @llvm.dbg.value(metadata i32 %int0.addr.0, i64 0, metadata !26, metadata !38), !dbg !41
+    %mul14 = mul nsw i32 %inta.0, %int4, !dbg !86
+    %add15 = add nsw i32 %int0.addr.0, %mul14, !dbg !87
+    call void @llvm.dbg.value(metadata i32 %add15, i64 0, metadata !26, metadata !38), !dbg !41
+    call void @use(i32 %add15) #4, !dbg !88
+    br label %cleanup, !dbg !89
+  
+  cleanup:                                          ; preds = %if.end13, %entry
+    %10 = bitcast i32* %intg to i8*
+    %11 = bitcast i32* %intf to i8*
+    %12 = bitcast i32* %inte to i8*
+    call void @llvm.lifetime.end(i64 4, i8* nonnull %10) #4, !dbg !89
+    call void @llvm.lifetime.end(i64 4, i8* nonnull %11) #4, !dbg !89
+    call void @llvm.lifetime.end(i64 4, i8* nonnull %12) #4, !dbg !89
+    ret void, !dbg !90
+  }
+  
+  ; Function Attrs: argmemonly nounwind
+  declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+  
+  declare void @use(i32) local_unnamed_addr #2
+  
+  declare void @set(i32*, i32*, i32*) local_unnamed_addr #2
+  
+  ; Function Attrs: argmemonly nounwind
+  declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+  
+  ; Function Attrs: nounwind readnone
+  declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #3
+  
+  ; Function Attrs: nounwind
+  declare void @llvm.stackprotector(i8*, i8**) #4
+  
+  attributes #0 = { nounwind uwtable "no-frame-pointer-elim-non-leaf" }
+  attributes #1 = { argmemonly nounwind }
+  attributes #2 = { "no-frame-pointer-elim-non-leaf" }
+  attributes #3 = { nounwind readnone }
+  attributes #4 = { nounwind }
+  
+  !llvm.dbg.cu = !{!2}
+  !llvm.module.flags = !{!17, !18}
+  !llvm.ident = !{!19}
+  
+  !0 = !DIGlobalVariableExpression(var: !1)
+  !1 = distinct !DIGlobalVariable(name: "glob0", scope: !2, file: !3, line: 4, type: !8, isLocal: false, isDefinition: true)
+  !2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 5.0.0 (trunk 292962)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+  !3 = !DIFile(filename: "spill1.c", directory: "/home/test")
+  !4 = !{}
+  !5 = !{!0, !6, !9, !11, !13, !15}
+  !6 = !DIGlobalVariableExpression(var: !7)
+  !7 = distinct !DIGlobalVariable(name: "glob1", scope: !2, file: !3, line: 4, type: !8, isLocal: false, isDefinition: true)
+  !8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+  !9 = !DIGlobalVariableExpression(var: !10)
+  !10 = distinct !DIGlobalVariable(name: "glob2", scope: !2, file: !3, line: 4, type: !8, isLocal: false, isDefinition: true)
+  !11 = !DIGlobalVariableExpression(var: !12)
+  !12 = distinct !DIGlobalVariable(name: "glob3", scope: !2, file: !3, line: 4, type: !8, isLocal: false, isDefinition: true)
+  !13 = !DIGlobalVariableExpression(var: !14)
+  !14 = distinct !DIGlobalVariable(name: "glob4", scope: !2, file: !3, line: 4, type: !8, isLocal: false, isDefinition: true)
+  !15 = !DIGlobalVariableExpression(var: !16)
+  !16 = distinct !DIGlobalVariable(name: "glob5", scope: !2, file: !3, line: 4, type: !8, isLocal: false, isDefinition: true)
+  !17 = !{i32 2, !"Dwarf Version", i32 4}
+  !18 = !{i32 2, !"Debug Info Version", i32 3}
+  !19 = !{!"clang version 5.0.0 (trunk 292962)"}
+  !20 = distinct !DISubprogram(name: "foo", scope: !3, file: !3, line: 6, type: !21, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: true, unit: !2, variables: !23)
+  !21 = !DISubroutineType(types: !22)
+  !22 = !{null, !8, !8, !8, !8, !8, !8, !8}
+  !23 = !{!24, !25, !26, !27, !28, !29, !30, !31, !32, !33, !34, !35, !36, !37}
+  !24 = !DILocalVariable(name: "b0", arg: 1, scope: !20, file: !3, line: 6, type: !8)
+  !25 = !DILocalVariable(name: "b1", arg: 2, scope: !20, file: !3, line: 6, type: !8)
+  !26 = !DILocalVariable(name: "int0", arg: 3, scope: !20, file: !3, line: 6, type: !8)
+  !27 = !DILocalVariable(name: "int1", arg: 4, scope: !20, file: !3, line: 6, type: !8)
+  !28 = !DILocalVariable(name: "int2", arg: 5, scope: !20, file: !3, line: 6, type: !8)
+  !29 = !DILocalVariable(name: "int3", arg: 6, scope: !20, file: !3, line: 7, type: !8)
+  !30 = !DILocalVariable(name: "int4", arg: 7, scope: !20, file: !3, line: 7, type: !8)
+  !31 = !DILocalVariable(name: "inta", scope: !20, file: !3, line: 9, type: !8)
+  !32 = !DILocalVariable(name: "intb", scope: !20, file: !3, line: 10, type: !8)
+  !33 = !DILocalVariable(name: "intc", scope: !20, file: !3, line: 11, type: !8)
+  !34 = !DILocalVariable(name: "intd", scope: !20, file: !3, line: 12, type: !8)
+  !35 = !DILocalVariable(name: "inte", scope: !20, file: !3, line: 13, type: !8)
+  !36 = !DILocalVariable(name: "intf", scope: !20, file: !3, line: 14, type: !8)
+  !37 = !DILocalVariable(name: "intg", scope: !20, file: !3, line: 15, type: !8)
+  !38 = !DIExpression()
+  !39 = !DILocation(line: 6, column: 14, scope: !20)
+  !40 = !DILocation(line: 6, column: 22, scope: !20)
+  !41 = !DILocation(line: 6, column: 30, scope: !20)
+  !42 = !DILocation(line: 6, column: 40, scope: !20)
+  !43 = !DILocation(line: 6, column: 50, scope: !20)
+  !44 = !DILocation(line: 7, column: 14, scope: !20)
+  !45 = !DILocation(line: 7, column: 24, scope: !20)
+  !46 = !DILocation(line: 9, column: 14, scope: !20)
+  !47 = !{!48, !48, i64 0}
+  !48 = !{!"int", !49, i64 0}
+  !49 = !{!"omnipotent char", !50, i64 0}
+  !50 = !{!"Simple C/C++ TBAA"}
+  !51 = !DILocation(line: 9, column: 7, scope: !20)
+  !52 = !DILocation(line: 10, column: 14, scope: !20)
+  !53 = !DILocation(line: 10, column: 7, scope: !20)
+  !54 = !DILocation(line: 11, column: 14, scope: !20)
+  !55 = !DILocation(line: 11, column: 7, scope: !20)
+  !56 = !DILocation(line: 12, column: 14, scope: !20)
+  !57 = !DILocation(line: 12, column: 7, scope: !20)
+  !58 = !DILocation(line: 13, column: 3, scope: !20)
+  !59 = !DILocation(line: 13, column: 14, scope: !20)
+  !60 = !DILocation(line: 13, column: 7, scope: !20)
+  !61 = !DILocation(line: 14, column: 3, scope: !20)
+  !62 = !DILocation(line: 14, column: 14, scope: !20)
+  !63 = !DILocation(line: 14, column: 7, scope: !20)
+  !64 = !DILocation(line: 15, column: 3, scope: !20)
+  !65 = !DILocation(line: 17, column: 7, scope: !66)
+  !66 = distinct !DILexicalBlock(scope: !20, file: !3, line: 17, column: 7)
+  !67 = !DILocation(line: 17, column: 7, scope: !20)
+  !68 = !DILocation(line: 20, column: 17, scope: !20)
+  !69 = !DILocation(line: 20, column: 24, scope: !20)
+  !70 = !DILocation(line: 20, column: 32, scope: !20)
+  !71 = !DILocation(line: 20, column: 8, scope: !20)
+  !72 = !DILocation(line: 21, column: 3, scope: !20)
+  !73 = !DILocation(line: 22, column: 3, scope: !20)
+  !74 = !DILocation(line: 24, column: 7, scope: !75)
+  !75 = distinct !DILexicalBlock(scope: !20, file: !3, line: 24, column: 7)
+  !76 = !DILocation(line: 24, column: 7, scope: !20)
+  !77 = !DIExpression(DW_OP_deref)
+  !78 = !DILocation(line: 15, column: 7, scope: !20)
+  !79 = !DILocation(line: 25, column: 5, scope: !80)
+  !80 = distinct !DILexicalBlock(scope: !75, file: !3, line: 24, column: 11)
+  !81 = !DILocation(line: 27, column: 18, scope: !80)
+  !82 = !DILocation(line: 27, column: 23, scope: !80)
+  !83 = !DILocation(line: 27, column: 35, scope: !80)
+  !84 = !DILocation(line: 27, column: 43, scope: !80)
+  !85 = !DILocation(line: 28, column: 3, scope: !80)
+  !86 = !DILocation(line: 29, column: 16, scope: !20)
+  !87 = !DILocation(line: 29, column: 8, scope: !20)
+  !88 = !DILocation(line: 30, column: 3, scope: !20)
+  !89 = !DILocation(line: 31, column: 1, scope: !20)
+  !90 = !DILocation(line: 31, column: 1, scope: !91)
+  !91 = !DILexicalBlockFile(scope: !20, file: !3, discriminator: 2)
+
+...
+---
+name:            foo
+alignment:       4
+exposesReturnsTwice: false
+legalized:       false
+regBankSelected: false
+selected:        false
+tracksRegLiveness: true
+liveins:         
+  - { reg: '%edi' }
+  - { reg: '%esi' }
+  - { reg: '%edx' }
+  - { reg: '%ecx' }
+  - { reg: '%r8d' }
+  - { reg: '%r9d' }
+calleeSavedRegisters: [ '%bh', '%bl', '%bp', '%bpl', '%bx', '%ebp', '%ebx', 
+                        '%rbp', '%rbx', '%r12', '%r13', '%r14', '%r15', 
+                        '%r12b', '%r13b', '%r14b', '%r15b', '%r12d', '%r13d', 
+                        '%r14d', '%r15d', '%r12w', '%r13w', '%r14w', '%r15w' ]
+frameInfo:       
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap:     false
+  hasPatchPoint:   false
+  stackSize:       72
+  offsetAdjustment: -24
+  maxAlignment:    8
+  adjustsStack:    true
+  hasCalls:        true
+  maxCallFrameSize: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart:      false
+  hasMustTailInVarArgFunc: false
+fixedStack:      
+  - { id: 0, type: spill-slot, offset: -56, size: 8, alignment: 8, callee-saved-register: '%rbx' }
+  - { id: 1, type: spill-slot, offset: -48, size: 8, alignment: 16, callee-saved-register: '%r12' }
+  - { id: 2, type: spill-slot, offset: -40, size: 8, alignment: 8, callee-saved-register: '%r13' }
+  - { id: 3, type: spill-slot, offset: -32, size: 8, alignment: 16, callee-saved-register: '%r14' }
+  - { id: 4, type: spill-slot, offset: -24, size: 8, alignment: 8, callee-saved-register: '%r15' }
+  - { id: 5, type: spill-slot, offset: -16, size: 8, alignment: 16 }
+  - { id: 6, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false }
+stack:           
+  - { id: 0, name: inte, offset: -60, size: 4, alignment: 4 }
+  - { id: 1, name: intf, offset: -76, size: 4, alignment: 4 }
+  - { id: 2, name: intg, offset: -80, size: 4, alignment: 4 }
+  - { id: 3, type: spill-slot, offset: -72, size: 4, alignment: 4 }
+  - { id: 4, type: spill-slot, offset: -68, size: 4, alignment: 4 }
+  - { id: 5, type: spill-slot, offset: -64, size: 4, alignment: 4 }
+body:             |
+  bb.0.entry:
+    successors: %bb.1.if.end(0x30000000), %bb.5.cleanup(0x50000000)
+    liveins: %ecx, %edi, %edx, %esi, %r8d, %r9d, %r15, %r14, %r13, %r12, %rbx, %rbp
+  
+    frame-setup PUSH64r killed %rbp, implicit-def %rsp, implicit %rsp
+    CFI_INSTRUCTION def_cfa_offset 16
+    CFI_INSTRUCTION offset %rbp, -16
+    %rbp = frame-setup MOV64rr %rsp
+    CFI_INSTRUCTION def_cfa_register %rbp
+    frame-setup PUSH64r killed %r15, implicit-def %rsp, implicit %rsp
+    frame-setup PUSH64r killed %r14, implicit-def %rsp, implicit %rsp
+    frame-setup PUSH64r killed %r13, implicit-def %rsp, implicit %rsp
+    frame-setup PUSH64r killed %r12, implicit-def %rsp, implicit %rsp
+    frame-setup PUSH64r killed %rbx, implicit-def %rsp, implicit %rsp
+    %rsp = frame-setup SUB64ri8 %rsp, 24, implicit-def dead %eflags
+    CFI_INSTRUCTION offset %rbx, -56
+    CFI_INSTRUCTION offset %r12, -48
+    CFI_INSTRUCTION offset %r13, -40
+    CFI_INSTRUCTION offset %r14, -32
+    CFI_INSTRUCTION offset %r15, -24
+    DBG_VALUE debug-use %edi, debug-use _, !24, !38, debug-location !39
+    DBG_VALUE debug-use %esi, debug-use _, !25, !38, debug-location !40
+    DBG_VALUE debug-use %edx, debug-use _, !26, !38, debug-location !41
+    DBG_VALUE debug-use %ecx, debug-use _, !27, !38, debug-location !42
+    DBG_VALUE debug-use %r8d, debug-use _, !28, !38, debug-location !43
+    DBG_VALUE debug-use %r9d, debug-use _, !29, !38, debug-location !44
+    %r14d = MOV32rr %r8d
+    DBG_VALUE debug-use %r14d, debug-use _, !28, !38, debug-location !43
+    %r12d = MOV32rr %esi
+    DBG_VALUE debug-use %r12d, debug-use _, !25, !38, debug-location !40
+    %eax = MOV32rr %edi
+    DBG_VALUE debug-use %eax, debug-use _, !24, !38, debug-location !39
+    %r13d = MOV32rm %rip, 1, _, @glob0, _, debug-location !46 :: (dereferenceable load 4 from @glob0, !tbaa !47)
+    DBG_VALUE debug-use %r13d, debug-use _, !31, !38, debug-location !51
+    %r8d = MOV32rm %rip, 1, _, @glob1, _, debug-location !52 :: (dereferenceable load 4 from @glob1, !tbaa !47)
+    DBG_VALUE debug-use %r8d, debug-use _, !32, !38, debug-location !53
+    %r15d = MOV32rm %rip, 1, _, @glob2, _, debug-location !54 :: (dereferenceable load 4 from @glob2, !tbaa !47)
+    DBG_VALUE debug-use %r15d, debug-use _, !33, !38, debug-location !55
+    %esi = MOV32rm %rip, 1, _, @glob3, _, debug-location !56 :: (dereferenceable load 4 from @glob3, !tbaa !47)
+    DBG_VALUE debug-use %esi, debug-use _, !34, !38, debug-location !57
+    %ebx = MOV32rm %rip, 1, _, @glob4, _, debug-location !59 :: (dereferenceable load 4 from @glob4, !tbaa !47)
+    DBG_VALUE debug-use %ebx, debug-use _, !35, !38, debug-location !60
+    MOV32mr %rbp, 1, _, -44, _, %ebx, debug-location !60 :: (store 4 into %ir.inte, !tbaa !47)
+    %edi = MOV32rm %rip, 1, _, @glob5, _, debug-location !62 :: (dereferenceable load 4 from @glob5, !tbaa !47)
+    DBG_VALUE debug-use %edi, debug-use _, !36, !38, debug-location !63
+    MOV32mr %rbp, 1, _, -60, _, %edi, debug-location !63 :: (store 4 into %ir.intf, !tbaa !47)
+    TEST32rr killed %eax, %eax, implicit-def %eflags, debug-location !67
+    JNE_1 %bb.5.cleanup, implicit %eflags
+  
+  bb.1.if.end:
+    successors: %bb.2(0x30000000), %bb.3.if.then4(0x50000000)
+    liveins: %ebx, %ecx, %edi, %edx, %esi, %r8d, %r9d, %r12d, %r13d, %r14d, %r15d, %rbp
+  
+    MOV32mr %rbp, 1, _, -48, _, killed %edx :: (store 4 into %stack.5)
+    MOV32mr %rbp, 1, _, -52, _, killed %r8d :: (store 4 into %stack.4)
+    MOV32mr %rbp, 1, _, -56, _, killed %esi :: (store 4 into %stack.3)
+    DBG_VALUE debug-use _, debug-use _, !30, !38, debug-location !45
+    %r14d = ADD32rr killed %r14d, killed %ecx, implicit-def dead %eflags, debug-location !68
+    %r14d = ADD32rr killed %r14d, killed %r9d, implicit-def dead %eflags, debug-location !69
+    %r14d = IMUL32rm killed %r14d, %rbp, 1, _, 16, _, implicit-def dead %eflags, debug-location !70 :: (load 4 from %fixed-stack.6, align 16)
+    DBG_VALUE debug-use %r14d, debug-use _, !26, !38, debug-location !41
+    CALL64pcrel32 @use, csr_64, implicit %rsp, implicit %edi, implicit-def %rsp, debug-location !72
+    %edi = MOV32rr killed %ebx, debug-location !73
+    CALL64pcrel32 @use, csr_64, implicit %rsp, implicit %edi, implicit-def %rsp, debug-location !73
+    TEST32rr killed %r12d, %r12d, implicit-def %eflags, debug-location !74
+    JE_1 %bb.2, implicit %eflags
+  
+  bb.3.if.then4:
+    successors: %bb.4.if.end13(0x80000000)
+    liveins: %r14d, %r15d, %rbp
+  
+    %rdi = LEA64r %rbp, 1, _, -44, _
+    DBG_VALUE %rbp, -44, !35, !38, debug-location !60
+    %rsi = LEA64r %rbp, 1, _, -60, _
+    DBG_VALUE %rbp, -60, !36, !38, debug-location !63
+    %rdx = LEA64r %rbp, 1, _, -64, _
+    DBG_VALUE %rbp, -64, !37, !38, debug-location !78
+    CALL64pcrel32 @set, csr_64, implicit %rsp, implicit %rdi, implicit %rsi, implicit %rdx, implicit-def %rsp, debug-location !79
+    %eax = MOV32rm %rbp, 1, _, -44, _, debug-location !81 :: (dereferenceable load 4 from %ir.inte, !tbaa !47)
+    DBG_VALUE debug-use %eax, debug-use _, !35, !38, debug-location !60
+    %r15d = ADD32rm killed %r15d, %rbp, 1, _, -52, _, implicit-def dead %eflags, debug-location !82 :: (load 4 from %stack.4)
+    %r15d = IMUL32rr killed %r15d, %eax, implicit-def dead %eflags, debug-location !82
+    %r15d = ADD32rm killed %r15d, %rbp, 1, _, -56, _, implicit-def dead %eflags, debug-location !83 :: (load 4 from %stack.3)
+    %r15d = IMUL32rr killed %r15d, killed %eax, implicit-def dead %eflags, debug-location !84
+    DBG_VALUE debug-use %r15d, debug-use _, !31, !38, debug-location !51
+    %r13d = MOV32rr killed %r15d
+    DBG_VALUE debug-use %r13d, debug-use _, !31, !38, debug-location !51
+    JMP_1 %bb.4.if.end13
+  
+  bb.2:
+    successors: %bb.4.if.end13(0x80000000)
+    liveins: %r13d, %r14d, %rbp
+  
+    %r14d = ADD32rm killed %r14d, %rbp, 1, _, -48, _, implicit-def dead %eflags, debug-location !71 :: (load 4 from %stack.5)
+    DBG_VALUE debug-use %r14d, debug-use _, !26, !38, debug-location !41
+  
+  bb.4.if.end13:
+    successors: %bb.5.cleanup(0x80000000)
+    liveins: %r13d, %r14d, %rbp
+  
+    DBG_VALUE debug-use %r14d, debug-use _, !26, !38, debug-location !41
+    DBG_VALUE debug-use %r13d, debug-use _, !31, !38, debug-location !51
+    %r13d = IMUL32rm killed %r13d, %rbp, 1, _, 16, _, implicit-def dead %eflags, debug-location !86 :: (load 4 from %fixed-stack.6, align 16)
+    %r13d = ADD32rr killed %r13d, killed %r14d, implicit-def dead %eflags, debug-location !87
+    DBG_VALUE debug-use %r13d, debug-use _, !26, !38, debug-location !41
+    %edi = MOV32rr killed %r13d, debug-location !88
+    CALL64pcrel32 @use, csr_64, implicit %rsp, implicit %edi, implicit-def %rsp, debug-location !88
+  
+  bb.5.cleanup:
+    liveins: %rbp
+  
+    %rsp = ADD64ri8 %rsp, 24, implicit-def dead %eflags, debug-location !90
+    %rbx = POP64r implicit-def %rsp, implicit %rsp, debug-location !90
+    %r12 = POP64r implicit-def %rsp, implicit %rsp, debug-location !90
+    %r13 = POP64r implicit-def %rsp, implicit %rsp, debug-location !90
+    %r14 = POP64r implicit-def %rsp, implicit %rsp, debug-location !90
+    %r15 = POP64r implicit-def %rsp, implicit %rsp, debug-location !90
+    %rbp = POP64r implicit-def %rsp, implicit %rsp, debug-location !90
+    RETQ debug-location !90
+
+...
diff --git a/test/DebugInfo/Mips/InlinedFnLocalVar.ll b/test/DebugInfo/Mips/InlinedFnLocalVar.ll
index 51b319c3a5b5ad94ec8cf5ad24ce4ab9657304b1..cd5a03159ef27bbf70882d7147bdedcfeff034de 100644
--- a/test/DebugInfo/Mips/InlinedFnLocalVar.ll
+++ b/test/DebugInfo/Mips/InlinedFnLocalVar.ll
@@ -1,7 +1,7 @@
 ; RUN: llc -mtriple mips-linux-gnu -O2 %s -o - | FileCheck %s
 ; Check struct X for dead variable xyz from inlined function foo.
 
-; CHECK: section_info
+; CHECK: .section .debug_info,"",@0x7000001e
 ; CHECK:	DW_TAG_structure_type
 ; CHECK-NEXT:	info_string
 
diff --git a/test/DebugInfo/PDB/DIA/pdbdump-linenumbers.test b/test/DebugInfo/PDB/DIA/pdbdump-linenumbers.test
index 780e0db84665a00d455b3020eb4b3774472c92a9..2a596e4af1493cc04d3c4752cb3e807573f9ebfa 100644
--- a/test/DebugInfo/PDB/DIA/pdbdump-linenumbers.test
+++ b/test/DebugInfo/PDB/DIA/pdbdump-linenumbers.test
@@ -1,12 +1,14 @@
+; RUN: llvm-pdbdump pretty -lines %p/../Inputs/symbolformat.pdb | FileCheck --check-prefix=LINE_NUMS_FPO %s
 ; RUN: llvm-pdbdump pretty -lines %p/../Inputs/symbolformat.pdb | FileCheck --check-prefix=LINE_NUMS %s
 
-; LINE_NUMS: llvm\test\debuginfo\pdb\inputs\symbolformat-fpo.cpp
-; LINE_NUMS: Line 5, Address: [0x000011a0 - 0x000011a5] (6 bytes)
-; LINE_NUMS: Line 6, Address: [0x000011a6 - 0x000011a6] (1 bytes)
+; LINE_NUMS_FPO: llvm\test\debuginfo\pdb\inputs\symbolformat-fpo.cpp
+; LINE_NUMS_FPO: Line 5, Address: [0x000011a0 - 0x000011a5] (6 bytes)
+; LINE_NUMS_FPO: Line 6, Address: [0x000011a6 - 0x000011a6] (1 bytes)
+
 ; LINE_NUMS: llvm\test\debuginfo\pdb\inputs\symbolformat.cpp
 ; LINE_NUMS: Line 6, Address: [0x00001060 - 0x00001066] (7 bytes)
-; LINE_NUMS: Line 72, Address: [0x000010d0 - 0x000010d1] (2 bytes)
-; LINE_NUMS: Line 73, Address: [0x000010d2 - 0x000010d5] (4 bytes)
+; LINE_NUMS: Line 80, Address: [0x000010d0 - 0x000010d1] (2 bytes)
+; LINE_NUMS: Line 81, Address: [0x000010d2 - 0x000010d5] (4 bytes)
 ; LINE_NUMS: Line 28, Address: [0x00001170 - 0x0000117a] (11 bytes)
 ; LINE_NUMS: Line 21, Address: [0x00001180 - 0x0000118a] (11 bytes)
-; LINE_NUMS: Line 20, Address: [0x00001190 - 0x0000119a] (11 bytes)
\ No newline at end of file
+; LINE_NUMS: Line 20, Address: [0x00001190 - 0x0000119a] (11 bytes)
diff --git a/test/DebugInfo/PDB/DIA/pdbdump-symbol-format.test b/test/DebugInfo/PDB/DIA/pdbdump-symbol-format.test
index e729e8bc89449d8a78e522caac74a5ac5fc8f353..60a195346875d9f9ce8b0212d4a4ac928d12d01d 100644
--- a/test/DebugInfo/PDB/DIA/pdbdump-symbol-format.test
+++ b/test/DebugInfo/PDB/DIA/pdbdump-symbol-format.test
@@ -1,3 +1,4 @@
+; RUN: llvm-pdbdump pretty -symbols %p/../Inputs/symbolformat.pdb | FileCheck --check-prefix=SYM_FORMAT_FPO %s
 ; RUN: llvm-pdbdump pretty -symbols %p/../Inputs/symbolformat.pdb | FileCheck --check-prefix=SYM_FORMAT %s
 ; RUN: llvm-pdbdump pretty -types %p/../Inputs/symbolformat.pdb > %t.types
 ; RUN: FileCheck --check-prefix=TYPES_FORMAT %s < %t.types
@@ -7,9 +8,11 @@
 ; RUN: llvm-pdbdump pretty -globals %p/../Inputs/symbolformat.pdb | FileCheck --check-prefix=GLOBALS %s
 
 ; The format is func [0x<rva_start>+<prologue_length> - 0x<rva_end>-<epilogue_length>]
+; SYM_FORMAT_FPO: ---SYMBOLS---
+; SYM_FORMAT_FPO: symbolformat-fpo.obj
+; SYM_FORMAT_FPO: func [{{.*}}] (FPO) unsigned int __cdecl fpo_func(unsigned int n)
+
 ; SYM_FORMAT: ---SYMBOLS---
-; SYM_FORMAT: symbolformat-fpo.obj
-; SYM_FORMAT-DAG: func [{{.*}}] (FPO) unsigned int __cdecl fpo_func(unsigned int n)
 ; SYM_FORMAT: symbolformat.obj
 ; SYM_FORMAT-DAG: func [{{.*}}] (EBP) int __cdecl _purecall()
 ; SYM_FORMAT-DAG: func [{{.*}}] (EBP) int __cdecl main(int argc, char** argv)
@@ -30,7 +33,6 @@
 
 ; TYPES_1: Classes
 ; TYPES_1: struct A {
-; TYPES_1: public:
 ; TYPES_1: virtual void PureFunc() = 0
 ; TYPES_1: virtual void VirtualFunc()
 ; TYPES_1: void RegularFunc()
@@ -38,23 +40,31 @@
 
 ; TYPES_2: Classes
 ; TYPES_2: struct MemberTest {
-; TYPES_2: data +0x00 MemberTest::NestedEnum m_nested_enum
-; TYPES_2: data +0x04 int m_typedef
-; TYPES_2: data +0x08 bool m_bool
-; TYPES_2: data +0x09 char m_char
-; TYPES_2: data +0x0a wchar_t m_wchar_t
-; TYPES_2: data +0x0c int m_int
-; TYPES_2: data +0x10 unsigned int m_unsigned
-; TYPES_2: data +0x14 long m_long
-; TYPES_2: data +0x18 unsigned long m_unsigned_long
-; TYPES_2: data +0x20 __int64 m_int64
-; TYPES_2: data +0x28 unsigned __int64 m_unsigned_int64
-; TYPES_2: data +0x30 float m_float
-; TYPES_2: data +0x38 double m_double
-; TYPES_2: data +0x40 void (__cdecl *m_pfn_2_args)(int, double)
+; TYPES_2: data +0x00 [sizeof=4] MemberTest::NestedEnum m_nested_enum
+; TYPES_2: data +0x04 [sizeof=4] int m_typedef
+; TYPES_2: data +0x08 [sizeof=1] bool m_bool
+; TYPES_2: data +0x09 [sizeof=1] char m_char
+; TYPES_2: data +0x0a [sizeof=2] wchar_t m_wchar_t
+; TYPES_2: data +0x0c [sizeof=4] int m_int
+; TYPES_2: data +0x10 [sizeof=4] unsigned int m_unsigned
+; TYPES_2: data +0x14 [sizeof=4] long m_long
+; TYPES_2: data +0x18 [sizeof=4] unsigned long m_unsigned_long
+; TYPES_2: <padding> (4 bytes)
+; TYPES_2: data +0x20 [sizeof=8] __int64 m_int64
+; TYPES_2: data +0x28 [sizeof=8] unsigned __int64 m_unsigned_int64
+; TYPES_2: data +0x30 [sizeof=4] float m_float
+; TYPES_2: <padding> (4 bytes)
+; TYPES_2: data +0x38 [sizeof=8] double m_double
+; TYPES_2: data +0x40 [sizeof=4] void  (__cdecl * m_pfn_2_args)(int, double)
+; TYPES_2: data +0x44 [sizeof=24] int m_multidimensional_array[2][3]
 ; TYPES_2: }
 
 ; GLOBALS: ---GLOBALS---
 ; GLOBALS-DAG: func [{{.*}}] (FPO) unsigned int __cdecl fpo_func(unsigned int n)
-; GLOBALS-DAG: data [{{.*}}] static void* g_global_pointer
-; GLOBALS-DAG: data [{{.*}}] static int g_global_int
+; GLOBALS-DAG: data [{{.*}}, sizeof=4] static void* g_global_pointer
+; GLOBALS-DAG: data [{{.*}}, sizeof=4] static int g_global_int
+; GLOBALS-DAG: data [{{.*}}, sizeof=12] static int g_array[3]
+; GLOBALS-DAG: data [{{.*}}, sizeof=4] static int (* g_pointer_to_array)[3]
+; GLOBALS-DAG: data [{{.*}}, sizeof=4] static const int* g_pointer_to_const_int
+; GLOBALS-DAG: data [sizeof=4] int* const g_const_pointer_to_int = 0
+; GLOBALS-DAG: data [sizeof=4] const int* const g_const_pointer_to_const_int = 0
diff --git a/test/DebugInfo/PDB/Inputs/longname-truncation.yaml b/test/DebugInfo/PDB/Inputs/longname-truncation.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3d6639edc581d6fe531b6866a7b1c7f0f5922fba
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/longname-truncation.yaml
@@ -0,0 +1,26 @@
+---
+TpiStream:       
+  Version:         VC80
+  Records:         
+    - Kind:            LF_STRUCTURE
+      Class:           
+        MemberCount:     0
+        Options:         [ None, HasUniqueName ]
+        FieldList:       0
+        Name:            'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
+        UniqueName:      'bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb
bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb'
+        DerivationList:  0
+        VTableShape:     0
+        Size:            1
+
+    - Kind:            LF_STRUCTURE
+      Class:           
+        MemberCount:     0
+        Options:         [ None ]
+        FieldList:       0
+        Name:            'fffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff'
+        UniqueName:      ''
+        DerivationList:  0
+        VTableShape:     0
+        Size:            8
+...
diff --git a/test/DebugInfo/PDB/Inputs/one-symbol.yaml b/test/DebugInfo/PDB/Inputs/one-symbol.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..5728f05d490cb1daddaba493b949ce303135bb5e
--- /dev/null
+++ b/test/DebugInfo/PDB/Inputs/one-symbol.yaml
@@ -0,0 +1,11 @@
+---
+DbiStream:
+  Modules:
+    - Module:          one-symbol.yaml
+      Modi:
+        Records:
+          - Kind:            S_OBJNAME
+            ObjNameSym:
+              Signature:       0
+              ObjectName:      'c:\foo\one-symbol.yaml'
+...
diff --git a/test/DebugInfo/PDB/Inputs/symbolformat.cpp b/test/DebugInfo/PDB/Inputs/symbolformat.cpp
index 5479b717cd9dc7133c5835b8cb4d9253a1a12141..14b44ae3c316386ed18ea52383b6c43566cbee02 100644
--- a/test/DebugInfo/PDB/Inputs/symbolformat.cpp
+++ b/test/DebugInfo/PDB/Inputs/symbolformat.cpp
@@ -50,6 +50,7 @@ struct MemberTest {
   float m_float;
   double m_double;
   void (*m_pfn_2_args)(int, double);
+  int m_multidimensional_array[2][3];
 };
 
 typedef int IntType;
@@ -58,6 +59,13 @@ typedef A ClassAType;
 int g_global_int;
 void *g_global_pointer = nullptr;
 
+typedef int int_array[3];
+int_array g_array = { 1, 2, 3 };
+int_array *g_pointer_to_array = &g_array;
+const int *g_pointer_to_const_int = nullptr;
+int * const g_const_pointer_to_int = nullptr;
+const int * const g_const_pointer_to_const_int = nullptr;
+
 int main(int argc, char **argv) {
   // Force symbol references so the linker generates debug info
   B b;
diff --git a/test/DebugInfo/PDB/Inputs/symbolformat.pdb b/test/DebugInfo/PDB/Inputs/symbolformat.pdb
index 53d8a1b31a39c68c3f29c570f8dc87c179a26b6a..0e509f3a93c33fa65471692bb69155effebdb7d4 100644
Binary files a/test/DebugInfo/PDB/Inputs/symbolformat.pdb and b/test/DebugInfo/PDB/Inputs/symbolformat.pdb differ
diff --git a/test/DebugInfo/PDB/Native/pdb-native-compilands.test b/test/DebugInfo/PDB/Native/pdb-native-compilands.test
new file mode 100644
index 0000000000000000000000000000000000000000..38234d719e50df930b131e9c4c3bbc8a23c9b3df
--- /dev/null
+++ b/test/DebugInfo/PDB/Native/pdb-native-compilands.test
@@ -0,0 +1,65 @@
+; Test that the native PDB reader can enumerate the compilands.
+; RUN: llvm-pdbdump pretty -native -compilands %p/../Inputs/empty.pdb \
+; RUN:   | FileCheck -check-prefix=EMPTY %s
+; RUN: llvm-pdbdump pretty -native -compilands %p/../Inputs/big-read.pdb \
+; RUN:   | FileCheck -check-prefix=BIGREAD %s
+
+; Reference output was generated with the DIA reader to ensure that the
+; `-native` option produces identical output.  The paths output will have
+; backslashes even on non-Windows platforms because they are from PDBs built
+; on Windows.  The path prefixes have been elided because those may be
+; machine-specific.
+
+EMPTY:---COMPILANDS---
+EMPTY:  \llvm\test\DebugInfo\PDB\Inputs\empty.obj
+EMPTY:  * Linker *
+
+BIGREAD:---COMPILANDS---
+BIGREAD:  \llvm\test\tools\llvm-symbolizer\pdb\Inputs\test.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\_cpu_disp_.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\_initsect_.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\_sehprolg4_.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\_chandler4gs_.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\_secchk_.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\gs_cookie.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\gs_report.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\gs_support.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\checkcfg.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\guard_support.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\loadcfg.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\dyn_tls_dtor.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\dyn_tls_init.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\matherr_detection.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\ucrt_detection.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\argv_mode.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\commit_mode.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\default_local_stdio_options.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\denormal_control.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\env_mode.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\file_mode.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\invalid_parameter_handler.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\matherr.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\new_mode.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\thread_locale.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\tncleanup.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\exe_main.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\initializers.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\utility.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\ucrt_stubs.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\utility_desktop.obj
+BIGREAD:  f:\dd\vctools\crt\vcstartup\build\md\msvcrt_kernel32\obj1r\i386\default_precision.obj
+BIGREAD:  Import:KERNEL32.dll
+BIGREAD:  KERNEL32.dll
+BIGREAD:  Import:VCRUNTIME140.dll
+BIGREAD:  VCRUNTIME140.dll
+BIGREAD:  Import:api-ms-win-crt-stdio-l1-1-0.dll
+BIGREAD:  api-ms-win-crt-stdio-l1-1-0.dll
+BIGREAD:  Import:api-ms-win-crt-runtime-l1-1-0.dll
+BIGREAD:  api-ms-win-crt-runtime-l1-1-0.dll
+BIGREAD:  Import:api-ms-win-crt-math-l1-1-0.dll
+BIGREAD:  api-ms-win-crt-math-l1-1-0.dll
+BIGREAD:  Import:api-ms-win-crt-locale-l1-1-0.dll
+BIGREAD:  api-ms-win-crt-locale-l1-1-0.dll
+BIGREAD:  Import:api-ms-win-crt-heap-l1-1-0.dll
+BIGREAD:  api-ms-win-crt-heap-l1-1-0.dll
+BIGREAD:  * Linker *
diff --git a/test/DebugInfo/PDB/Native/pdb-native-summary.test b/test/DebugInfo/PDB/Native/pdb-native-summary.test
new file mode 100644
index 0000000000000000000000000000000000000000..bd32f198a390f3143c558e401855f332cbd3dd5e
--- /dev/null
+++ b/test/DebugInfo/PDB/Native/pdb-native-summary.test
@@ -0,0 +1,11 @@
+; Test that the native PDB reader gets the PDB summary correct.
+; RUN: llvm-pdbdump pretty -native -color-output=false %p/../Inputs/empty.pdb \
+; RUN:   | FileCheck -check-prefix=EMPTY %s
+
+; Reference output was generated with the DIA reader to ensure that the
+; `-native` option produces identical output.
+
+; EMPTY:  Size: 102400 bytes
+; EMPTY:  Guid: {0B355641-86A0-A249-896F-9988FAE52FF0}
+; EMPTY:  Age: 1
+; EMPTY:  Attributes: HasPrivateSymbols
diff --git a/test/DebugInfo/PDB/pdb-longname-truncation.test b/test/DebugInfo/PDB/pdb-longname-truncation.test
new file mode 100644
index 0000000000000000000000000000000000000000..2e0284fbe916ca2510114b4634c79af055f61c33
--- /dev/null
+++ b/test/DebugInfo/PDB/pdb-longname-truncation.test
@@ -0,0 +1,3 @@
+; For now just verify that this doesn't cause an error.  Later, when pdbdump
+; can do type lookup, we can verify that the name matches what we expect.
+; RUN: llvm-pdbdump yaml2pdb -pdb=%t.pdb %p/Inputs/longname-truncation.yaml
diff --git a/test/DebugInfo/PDB/pdb-minimal-construct.test b/test/DebugInfo/PDB/pdb-minimal-construct.test
new file mode 100644
index 0000000000000000000000000000000000000000..d75c51056c9fb8933372d6955b4521256d236c16
--- /dev/null
+++ b/test/DebugInfo/PDB/pdb-minimal-construct.test
@@ -0,0 +1,11 @@
+; This testcase verifies that we can produce a minimal PDB, while
+; serving as an example for how to construct a minimal PDB for other
+; testcases.  It takes as input a small fragment of hand-written yaml
+; that specifies nothing about the PDB other than a definition of one
+; symbol that it contains.  Then it produces a PDB, and uses the
+; resulting PDB to go back to yaml, and verify that the resulting yaml
+; is identical.
+
+; RUN: llvm-pdbdump yaml2pdb -pdb=%t.pdb %p/Inputs/one-symbol.yaml
+; RUN: llvm-pdbdump pdb2yaml -minimal -dbi-module-syms -no-file-headers %t.pdb > %t.pdb.yaml
+; RUN: diff -b %p/Inputs/one-symbol.yaml %t.pdb.yaml
diff --git a/test/DebugInfo/PDB/pdb-yaml-types.test b/test/DebugInfo/PDB/pdb-yaml-types.test
new file mode 100644
index 0000000000000000000000000000000000000000..b3108591271e48b1097701cfa8bd57bc38989649
--- /dev/null
+++ b/test/DebugInfo/PDB/pdb-yaml-types.test
@@ -0,0 +1,74 @@
+RUN: llvm-pdbdump pdb2yaml -tpi-stream %p/Inputs/big-read.pdb > %t.yaml
+RUN: FileCheck -check-prefix=YAML %s < %t.yaml
+RUN: llvm-pdbdump yaml2pdb %t.yaml -pdb %t.pdb
+RUN: llvm-pdbdump raw -tpi-records %t.pdb | FileCheck %s --check-prefix=PDB
+
+Only verify the beginning of the type stream.
+
+YAML:      TpiStream:       
+YAML-NEXT:   Version:         VC80
+YAML-NEXT:   Records:         
+YAML-NEXT:     - Kind:            LF_ARGLIST
+YAML-NEXT:       ArgList:         
+YAML-NEXT:         ArgIndices:      [  ]
+YAML-NEXT:     - Kind:            LF_PROCEDURE
+YAML-NEXT:       Procedure:       
+YAML-NEXT:         ReturnType:      3
+YAML-NEXT:         CallConv:        NearC
+YAML-NEXT:         Options:         [ None ]
+YAML-NEXT:         ParameterCount:  0
+YAML-NEXT:         ArgumentList:    4096
+YAML-NEXT:     - Kind:            LF_PROCEDURE
+YAML-NEXT:       Procedure:       
+YAML-NEXT:         ReturnType:      116
+YAML-NEXT:         CallConv:        NearC
+YAML-NEXT:         Options:         [ None ]
+YAML-NEXT:         ParameterCount:  0
+YAML-NEXT:         ArgumentList:    4096
+
+This test is mostly checking to make sure we include the type index offset
+table, and eventually hash codes. The type index offsets should be similar to
+those already present in big-read.pdb.
+
+PDB:      Type Info Stream (TPI) {
+PDB-NEXT:   TPI Version: 20040203
+PDB-NEXT:   Record count: 728
+PDB-NEXT:   Records [
+PDB-NEXT:     {
+PDB-NEXT:       ArgList (0x1000) {
+PDB-NEXT:         TypeLeafKind: LF_ARGLIST (0x1201)
+PDB-NEXT:         NumArgs: 0
+PDB-NEXT:         Arguments [
+PDB-NEXT:         ]
+PDB-NEXT:       }
+PDB-NEXT:     }
+PDB-NEXT:     {
+PDB-NEXT:       Procedure (0x1001) {
+PDB-NEXT:         TypeLeafKind: LF_PROCEDURE (0x1008)
+PDB-NEXT:         ReturnType: void (0x3)
+PDB-NEXT:         CallingConvention: NearC (0x0)
+PDB-NEXT:         FunctionOptions [ (0x0)
+PDB-NEXT:         ]
+PDB-NEXT:         NumParameters: 0
+PDB-NEXT:         ArgListType: () (0x1000)
+PDB-NEXT:       }
+PDB-NEXT:     }
+PDB-NEXT:     {
+PDB-NEXT:       Procedure (0x1002) {
+PDB-NEXT:         TypeLeafKind: LF_PROCEDURE (0x1008)
+PDB-NEXT:         ReturnType: int (0x74)
+PDB-NEXT:         CallingConvention: NearC (0x0)
+PDB-NEXT:         FunctionOptions [ (0x0)
+PDB-NEXT:         ]
+PDB-NEXT:         NumParameters: 0
+PDB-NEXT:         ArgListType: () (0x1000)
+PDB-NEXT:       }
+PDB-NEXT:     }
+...
+PDB:          TypeIndexOffsets [
+PDB-NEXT:       Index: 0x1000, Offset: 0
+PDB-NEXT:       Index: 0x106c, Offset: 8,116
+PDB-NEXT:       Index: 0x1118, Offset: 16,372
+PDB-NEXT:       Index: 0x11df, Offset: 24,564
+PDB-NEXT:       Index: 0x128e, Offset: 32,752
+PDB-NEXT:     ]
diff --git a/test/DebugInfo/PDB/pdbdump-headers.test b/test/DebugInfo/PDB/pdbdump-headers.test
index 5fb3ad9220a0d14e31c53f61e792eaec2785d00e..4152f0f9da00fcf494b4c250f2f9d8a8f423175f 100644
--- a/test/DebugInfo/PDB/pdbdump-headers.test
+++ b/test/DebugInfo/PDB/pdbdump-headers.test
@@ -71,6 +71,7 @@
 ; EMPTY-NEXT:   Signature: 0x54E507E2
 ; EMPTY-NEXT:   Age: 1
 ; EMPTY-NEXT:   Guid: {0B355641-86A0-A249-896F-9988FAE52FF0}
+; EMPTY-NEXT:   Features: 0x1
 ; EMPTY-NEXT:   Named Streams {
 ; EMPTY-NEXT:     /names: 13
 ; EMPTY-NEXT:     /LinkInfo: 5
@@ -163,7 +164,7 @@
 ; EMPTY-NEXT:     Record count: 15
 ; EMPTY-NEXT:     Records [
 ; EMPTY-NEXT:       {
-; EMPTY-NEXT:         UdtModSourceLine (0x104B) {
+; EMPTY-NEXT:         UdtModSourceLine (0x1000) {
 ; EMPTY-NEXT:           TypeLeafKind: LF_UDT_MOD_SRC_LINE (0x1607)
 ; EMPTY-NEXT:           UDT: __vc_attributes::threadingAttribute (0x100B)
 ; EMPTY-NEXT:           SourceFile: <unknown simple type> (0x1)
@@ -175,7 +176,7 @@
 ; EMPTY-NEXT:         )
 ; EMPTY-NEXT:       }
 ; EMPTY-NEXT:       {
-; EMPTY-NEXT:         UdtModSourceLine (0x104C) {
+; EMPTY-NEXT:         UdtModSourceLine (0x1001) {
 ; EMPTY-NEXT:           TypeLeafKind: LF_UDT_MOD_SRC_LINE (0x1607)
 ; EMPTY-NEXT:           UDT: __vc_attributes::event_receiverAttribute (0x1017)
 ; EMPTY-NEXT:           SourceFile: <unknown simple type> (0x1)
@@ -187,7 +188,7 @@
 ; EMPTY-NEXT:         )
 ; EMPTY-NEXT:       }
 ; EMPTY-NEXT:       {
-; EMPTY-NEXT:         UdtModSourceLine (0x104D) {
+; EMPTY-NEXT:         UdtModSourceLine (0x1002) {
 ; EMPTY-NEXT:           TypeLeafKind: LF_UDT_MOD_SRC_LINE (0x1607)
 ; EMPTY-NEXT:           UDT: __vc_attributes::aggregatableAttribute (0x1021)
 ; EMPTY-NEXT:           SourceFile: <unknown simple type> (0x1)
@@ -199,7 +200,7 @@
 ; EMPTY-NEXT:         )
 ; EMPTY-NEXT:       }
 ; EMPTY-NEXT:       {
-; EMPTY-NEXT:         UdtModSourceLine (0x104E) {
+; EMPTY-NEXT:         UdtModSourceLine (0x1003) {
 ; EMPTY-NEXT:           TypeLeafKind: LF_UDT_MOD_SRC_LINE (0x1607)
 ; EMPTY-NEXT:           UDT: __vc_attributes::event_sourceAttribute (0x102C)
 ; EMPTY-NEXT:           SourceFile: <unknown simple type> (0x1)
@@ -211,7 +212,7 @@
 ; EMPTY-NEXT:         )
 ; EMPTY-NEXT:       }
 ; EMPTY-NEXT:       {
-; EMPTY-NEXT:         UdtModSourceLine (0x104F) {
+; EMPTY-NEXT:         UdtModSourceLine (0x1004) {
 ; EMPTY-NEXT:           TypeLeafKind: LF_UDT_MOD_SRC_LINE (0x1607)
 ; EMPTY-NEXT:           UDT: __vc_attributes::moduleAttribute (0x103A)
 ; EMPTY-NEXT:           SourceFile: <unknown simple type> (0x1)
@@ -223,7 +224,7 @@
 ; EMPTY-NEXT:         )
 ; EMPTY-NEXT:       }
 ; EMPTY-NEXT:       {
-; EMPTY-NEXT:         UdtModSourceLine (0x1050) {
+; EMPTY-NEXT:         UdtModSourceLine (0x1005) {
 ; EMPTY-NEXT:           TypeLeafKind: LF_UDT_MOD_SRC_LINE (0x1607)
 ; EMPTY-NEXT:           UDT: __vc_attributes::helper_attributes::usageAttribute (0x1042)
 ; EMPTY-NEXT:           SourceFile: <unknown simple type> (0x1)
@@ -234,9 +235,141 @@
 ; EMPTY-NEXT:           0000: 42100000 01000000 6C000000 0100F2F1  |B.......l.......|
 ; EMPTY-NEXT:         )
 ; EMPTY-NEXT:       }
-; EMPTY:          TypeIndexOffsets [
-; EMPTY-NEXT:       Index: 0x1000, Offset: 0
+; EMPTY-NEXT:       {
+; EMPTY-NEXT:         UdtModSourceLine (0x1006) {
+; EMPTY-NEXT:           TypeLeafKind: LF_UDT_MOD_SRC_LINE (0x1607)
+; EMPTY-NEXT:           UDT: __vc_attributes::helper_attributes::v1_alttypeAttribute (0x104A)
+; EMPTY-NEXT:           SourceFile: <unknown simple type> (0x1)
+; EMPTY-NEXT:           LineNumber: 96
+; EMPTY-NEXT:           Module: 1
+; EMPTY-NEXT:         }
+; EMPTY-NEXT:         Bytes (
+; EMPTY-NEXT:           0000: 4A100000 01000000 60000000 0100F2F1  |J.......`.......|
+; EMPTY-NEXT:         )
+; EMPTY-NEXT:       }
+; EMPTY-NEXT:       {
+; EMPTY-NEXT:         StringId (0x1007) {
+; EMPTY-NEXT:           TypeLeafKind: LF_STRING_ID (0x1605)
+; EMPTY-NEXT:           Id: 0x0
+; EMPTY-NEXT:           StringData: d:\src\llvm\test\DebugInfo\PDB\Inputs
+; EMPTY-NEXT:         }
+; EMPTY-NEXT:         Bytes (
+; EMPTY-NEXT:           0000: 00000000 643A5C73 72635C6C 6C766D5C  |....d:\src\llvm\|
+; EMPTY-NEXT:           0010: 74657374 5C446562 7567496E 666F5C50  |test\DebugInfo\P|
+; EMPTY-NEXT:           0020: 44425C49 6E707574 7300F2F1           |DB\Inputs...|
+; EMPTY-NEXT:         )
+; EMPTY-NEXT:       }
+; EMPTY-NEXT:       {
+; EMPTY-NEXT:         StringId (0x1008) {
+; EMPTY-NEXT:           TypeLeafKind: LF_STRING_ID (0x1605)
+; EMPTY-NEXT:           Id: 0x0
+; EMPTY-NEXT:           StringData: C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\BIN\cl.exe
+; EMPTY-NEXT:         }
+; EMPTY-NEXT:         Bytes (
+; EMPTY-NEXT:           0000: 00000000 433A5C50 726F6772 616D2046  |....C:\Program F|
+; EMPTY-NEXT:           0010: 696C6573 20287838 36295C4D 6963726F  |iles (x86)\Micro|
+; EMPTY-NEXT:           0020: 736F6674 20566973 75616C20 53747564  |soft Visual Stud|
+; EMPTY-NEXT:           0030: 696F2031 322E305C 56435C42 494E5C63  |io 12.0\VC\BIN\c|
+; EMPTY-NEXT:           0040: 6C2E6578 6500F2F1                    |l.exe...|
+; EMPTY-NEXT:         )
+; EMPTY-NEXT:       }
+; EMPTY-NEXT:       {
+; EMPTY-NEXT:         StringId (0x1009) {
+; EMPTY-NEXT:           TypeLeafKind: LF_STRING_ID (0x1605)
+; EMPTY-NEXT:           Id: 0x0
+; EMPTY-NEXT:           StringData: empty.cpp
+; EMPTY-NEXT:         }
+; EMPTY-NEXT:         Bytes (
+; EMPTY-NEXT:           0000: 00000000 656D7074 792E6370 7000F2F1  |....empty.cpp...|
+; EMPTY-NEXT:         )
+; EMPTY-NEXT:       }
+; EMPTY-NEXT:       {
+; EMPTY-NEXT:         StringId (0x100A) {
+; EMPTY-NEXT:           TypeLeafKind: LF_STRING_ID (0x1605)
+; EMPTY-NEXT:           Id: 0x0
+; EMPTY-NEXT:           StringData: d:\src\llvm\test\DebugInfo\PDB\Inputs\vc120.pdb
+; EMPTY-NEXT:         }
+; EMPTY-NEXT:         Bytes (
+; EMPTY-NEXT:           0000: 00000000 643A5C73 72635C6C 6C766D5C  |....d:\src\llvm\|
+; EMPTY-NEXT:           0010: 74657374 5C446562 7567496E 666F5C50  |test\DebugInfo\P|
+; EMPTY-NEXT:           0020: 44425C49 6E707574 735C7663 3132302E  |DB\Inputs\vc120.|
+; EMPTY-NEXT:           0030: 70646200                             |pdb.|
+; EMPTY-NEXT:         )
+; EMPTY-NEXT:       }
+; EMPTY-NEXT:       {
+; EMPTY-NEXT:         StringId (0x100B) {
+; EMPTY-NEXT:           TypeLeafKind: LF_STRING_ID (0x1605)
+; EMPTY-NEXT:           Id: 0x0
+; EMPTY-NEXT:           StringData: -Zi -MT -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\INCLUDE" -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\ATLMFC\INCLUDE" -I"C:\Program Files (x86)\Windows Kits\8.1\include\shared" -I"C:\Program Files (x86)\Windows
+; EMPTY-NEXT:         }
+; EMPTY-NEXT:         Bytes (
+; EMPTY-NEXT:           0000: 00000000 2D5A6920 2D4D5420 2D492243  |....-Zi -MT -I"C|
+; EMPTY-NEXT:           0010: 3A5C5072 6F677261 6D204669 6C657320  |:\Program Files |
+; EMPTY-NEXT:           0020: 28783836 295C4D69 63726F73 6F667420  |(x86)\Microsoft |
+; EMPTY-NEXT:           0030: 56697375 616C2053 74756469 6F203132  |Visual Studio 12|
+; EMPTY-NEXT:           0040: 2E305C56 435C494E 434C5544 4522202D  |.0\VC\INCLUDE" -|
+; EMPTY-NEXT:           0050: 4922433A 5C50726F 6772616D 2046696C  |I"C:\Program Fil|
+; EMPTY-NEXT:           0060: 65732028 78383629 5C4D6963 726F736F  |es (x86)\Microso|
+; EMPTY-NEXT:           0070: 66742056 69737561 6C205374 7564696F  |ft Visual Studio|
+; EMPTY-NEXT:           0080: 2031322E 305C5643 5C41544C 4D46435C  | 12.0\VC\ATLMFC\|
+; EMPTY-NEXT:           0090: 494E434C 55444522 202D4922 433A5C50  |INCLUDE" -I"C:\P|
+; EMPTY-NEXT:           00A0: 726F6772 616D2046 696C6573 20287838  |rogram Files (x8|
+; EMPTY-NEXT:           00B0: 36295C57 696E646F 7773204B 6974735C  |6)\Windows Kits\|
+; EMPTY-NEXT:           00C0: 382E315C 696E636C 7564655C 73686172  |8.1\include\shar|
+; EMPTY-NEXT:           00D0: 65642220 2D492243 3A5C5072 6F677261  |ed" -I"C:\Progra|
+; EMPTY-NEXT:           00E0: 6D204669 6C657320 28783836 295C5769  |m Files (x86)\Wi|
+; EMPTY-NEXT:           00F0: 6E646F77 7300F2F1                    |ndows...|
+; EMPTY-NEXT:         )
+; EMPTY-NEXT:       }
+; EMPTY-NEXT:       {
+; EMPTY-NEXT:         StringList (0x100C) {
+; EMPTY-NEXT:           TypeLeafKind: LF_SUBSTR_LIST (0x1604)
+; EMPTY-NEXT:           NumStrings: 1
+; EMPTY-NEXT:           Strings [
+; EMPTY-NEXT:             String: __vc_attributes::threadingAttribute (0x100B)
+; EMPTY-NEXT:           ]
+; EMPTY-NEXT:         }
+; EMPTY-NEXT:         Bytes (
+; EMPTY-NEXT:           0000: 01000000 0B100000                    |........|
+; EMPTY-NEXT:         )
+; EMPTY-NEXT:       }
+; EMPTY-NEXT:       {
+; EMPTY-NEXT:         StringId (0x100D) {
+; EMPTY-NEXT:           TypeLeafKind: LF_STRING_ID (0x1605)
+; EMPTY-NEXT:           Id: "-Zi -MT -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\INCLUDE" -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\ATLMFC\INCLUDE" -I"C:\Program Files (x86)\Windows Kits\8.1\include\shared" -I"C:\Program Files (x86)\Windows" (0x100C)
+; EMPTY-NEXT:           StringData:  Kits\8.1\include\um" -I"C:\Program Files (x86)\Windows Kits\8.1\include\winrt" -TP -X
+; EMPTY-NEXT:         }
+; EMPTY-NEXT:         Bytes (
+; EMPTY-NEXT:           0000: 0C100000 204B6974 735C382E 315C696E  |.... Kits\8.1\in|
+; EMPTY-NEXT:           0010: 636C7564 655C756D 22202D49 22433A5C  |clude\um" -I"C:\|
+; EMPTY-NEXT:           0020: 50726F67 72616D20 46696C65 73202878  |Program Files (x|
+; EMPTY-NEXT:           0030: 3836295C 57696E64 6F777320 4B697473  |86)\Windows Kits|
+; EMPTY-NEXT:           0040: 5C382E31 5C696E63 6C756465 5C77696E  |\8.1\include\win|
+; EMPTY-NEXT:           0050: 72742220 2D545020 2D5800F1           |rt" -TP -X..|
+; EMPTY-NEXT:         )
+; EMPTY-NEXT:       }
+; EMPTY-NEXT:       {
+; EMPTY-NEXT:         BuildInfo (0x100E) {
+; EMPTY-NEXT:           TypeLeafKind: LF_BUILDINFO (0x1603)
+; EMPTY-NEXT:           NumArgs: 5
+; EMPTY-NEXT:           Arguments [
+; EMPTY-NEXT:             ArgType: d:\src\llvm\test\DebugInfo\PDB\Inputs (0x1007)
+; EMPTY-NEXT:             ArgType: C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\BIN\cl.exe (0x1008)
+; EMPTY-NEXT:             ArgType: empty.cpp (0x1009)
+; EMPTY-NEXT:             ArgType: d:\src\llvm\test\DebugInfo\PDB\Inputs\vc120.pdb (0x100A)
+; EMPTY-NEXT:             ArgType:  Kits\8.1\include\um" -I"C:\Program Files (x86)\Windows Kits\8.1\include\winrt" -TP -X (0x100D)
+; EMPTY-NEXT:           ]
+; EMPTY-NEXT:         }
+; EMPTY-NEXT:         Bytes (
+; EMPTY-NEXT:           0000: 05000710 00000810 00000910 00000A10  |................|
+; EMPTY-NEXT:           0010: 00000D10 0000F2F1                    |........|
+; EMPTY-NEXT:         )
+; EMPTY-NEXT:       }
+; EMPTY-NEXT:       TypeIndexOffsets [
+; EMPTY-NEXT:         Index: 0x1000, Offset: 0
+; EMPTY-NEXT:       ]
 ; EMPTY-NEXT:     ]
+; EMPTY-NEXT:   }
 ; EMPTY:      DBI Stream {
 ; EMPTY-NEXT:   Dbi Version: 19990903
 ; EMPTY-NEXT:   Age: 1
@@ -1003,13 +1136,14 @@
 ; ALL:   Signature: 0x54E507E2
 ; ALL:   Age: 1
 ; ALL:   Guid: {0B355641-86A0-A249-896F-9988FAE52FF0}
+; ALL:   Features: 0x1
 ; ALL: }
 ; ALL: Type Info Stream (IPI) {
 ; ALL:   IPI Version: 20040203
 ; ALL:   Record count: 15
 ; ALL:   Records [
 ; ALL:     {
-; ALL:       UdtModSourceLine (0x104B) {
+; ALL:       UdtModSourceLine (0x1000) {
 ; ALL:         TypeLeafKind: LF_UDT_MOD_SRC_LINE (0x1607)
 ; ALL:         UDT: __vc_attributes::threadingAttribute (0x100B)
 ; ALL:         SourceFile: <unknown simple type> (0x1)
@@ -1018,7 +1152,7 @@
 ; ALL:       }
 ; ALL:     }
 ; ALL:     {
-; ALL:       UdtModSourceLine (0x104C) {
+; ALL:       UdtModSourceLine (0x1001) {
 ; ALL:         TypeLeafKind: LF_UDT_MOD_SRC_LINE (0x1607)
 ; ALL:         UDT: __vc_attributes::event_receiverAttribute (0x1017)
 ; ALL:         SourceFile: <unknown simple type> (0x1)
@@ -1027,7 +1161,7 @@
 ; ALL:       }
 ; ALL:     }
 ; ALL:     {
-; ALL:       UdtModSourceLine (0x104D) {
+; ALL:       UdtModSourceLine (0x1002) {
 ; ALL:         TypeLeafKind: LF_UDT_MOD_SRC_LINE (0x1607)
 ; ALL:         UDT: __vc_attributes::aggregatableAttribute (0x1021)
 ; ALL:         SourceFile: <unknown simple type> (0x1)
@@ -1036,7 +1170,7 @@
 ; ALL:       }
 ; ALL:     }
 ; ALL:     {
-; ALL:       UdtModSourceLine (0x104E) {
+; ALL:       UdtModSourceLine (0x1003) {
 ; ALL:         TypeLeafKind: LF_UDT_MOD_SRC_LINE (0x1607)
 ; ALL:         UDT: __vc_attributes::event_sourceAttribute (0x102C)
 ; ALL:         SourceFile: <unknown simple type> (0x1)
@@ -1045,7 +1179,7 @@
 ; ALL:       }
 ; ALL:     }
 ; ALL:     {
-; ALL:       UdtModSourceLine (0x104F) {
+; ALL:       UdtModSourceLine (0x1004) {
 ; ALL:         TypeLeafKind: LF_UDT_MOD_SRC_LINE (0x1607)
 ; ALL:         UDT: __vc_attributes::moduleAttribute (0x103A)
 ; ALL:         SourceFile: <unknown simple type> (0x1)
@@ -1054,7 +1188,7 @@
 ; ALL:       }
 ; ALL:     }
 ; ALL:     {
-; ALL:       UdtModSourceLine (0x1050) {
+; ALL:       UdtModSourceLine (0x1005) {
 ; ALL:         TypeLeafKind: LF_UDT_MOD_SRC_LINE (0x1607)
 ; ALL:         UDT: __vc_attributes::helper_attributes::usageAttribute (0x1042)
 ; ALL:         SourceFile: <unknown simple type> (0x1)
@@ -1063,7 +1197,7 @@
 ; ALL:       }
 ; ALL:     }
 ; ALL:     {
-; ALL:       UdtModSourceLine (0x1051) {
+; ALL:       UdtModSourceLine (0x1006) {
 ; ALL:         TypeLeafKind: LF_UDT_MOD_SRC_LINE (0x1607)
 ; ALL:         UDT: __vc_attributes::helper_attributes::v1_alttypeAttribute (0x104A)
 ; ALL:         SourceFile: <unknown simple type> (0x1)
@@ -1072,66 +1206,66 @@
 ; ALL:       }
 ; ALL:     }
 ; ALL:     {
-; ALL:       StringId (0x1052) {
+; ALL:       StringId (0x1007) {
 ; ALL:         TypeLeafKind: LF_STRING_ID (0x1605)
 ; ALL:         Id: 0x0
 ; ALL:         StringData: d:\src\llvm\test\DebugInfo\PDB\Inputs
 ; ALL:       }
 ; ALL:     }
 ; ALL:     {
-; ALL:       StringId (0x1053) {
+; ALL:       StringId (0x1008) {
 ; ALL:         TypeLeafKind: LF_STRING_ID (0x1605)
 ; ALL:         Id: 0x0
 ; ALL:         StringData: C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\BIN\cl.exe
 ; ALL:       }
 ; ALL:     }
 ; ALL:     {
-; ALL:       StringId (0x1054) {
+; ALL:       StringId (0x1009) {
 ; ALL:         TypeLeafKind: LF_STRING_ID (0x1605)
 ; ALL:         Id: 0x0
 ; ALL:         StringData: empty.cpp
 ; ALL:       }
 ; ALL:     }
 ; ALL:     {
-; ALL:       StringId (0x1055) {
+; ALL:       StringId (0x100A) {
 ; ALL:         TypeLeafKind: LF_STRING_ID (0x1605)
 ; ALL:         Id: 0x0
 ; ALL:         StringData: d:\src\llvm\test\DebugInfo\PDB\Inputs\vc120.pdb
 ; ALL:       }
 ; ALL:     }
 ; ALL:     {
-; ALL:       StringId (0x1056) {
+; ALL:       StringId (0x100B) {
 ; ALL:         TypeLeafKind: LF_STRING_ID (0x1605)
 ; ALL:         Id: 0x0
 ; ALL:         StringData: -Zi -MT -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\INCLUDE" -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\ATLMFC\INCLUDE" -I"C:\Program Files (x86)\Windows Kits\8.1\include\shared" -I"C:\Program Files (x86)\Windows
 ; ALL:       }
 ; ALL:     }
 ; ALL:     {
-; ALL:       StringList (0x1057) {
+; ALL:       StringList (0x100C) {
 ; ALL:         TypeLeafKind: LF_SUBSTR_LIST (0x1604)
-; ALL:         NumArgs: 1
-; ALL:         Arguments [
-; ALL:           ArgType: __vc_attributes::threadingAttribute (0x100B)
+; ALL:         NumStrings: 1
+; ALL:         Strings [
+; ALL:           String: __vc_attributes::threadingAttribute (0x100B)
 ; ALL:         ]
 ; ALL:       }
 ; ALL:     }
 ; ALL:     {
-; ALL:       StringId (0x1058) {
+; ALL:       StringId (0x100D) {
 ; ALL:         TypeLeafKind: LF_STRING_ID (0x1605)
-; ALL:         Id: <field list> (0x100C)
+; ALL:         Id: "-Zi -MT -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\INCLUDE" -I"C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\ATLMFC\INCLUDE" -I"C:\Program Files (x86)\Windows Kits\8.1\include\shared" -I"C:\Program Files (x86)\Windows" (0x100C)
 ; ALL:         StringData:  Kits\8.1\include\um" -I"C:\Program Files (x86)\Windows Kits\8.1\include\winrt" -TP -X
 ; ALL:       }
 ; ALL:     }
 ; ALL:     {
-; ALL:       BuildInfo (0x1059) {
+; ALL:       BuildInfo (0x100E) {
 ; ALL:         TypeLeafKind: LF_BUILDINFO (0x1603)
 ; ALL:         NumArgs: 5
 ; ALL:         Arguments [
-; ALL:           ArgType: void __vc_attributes::threadingAttribute::(__vc_attributes::threadingAttribute::threading_e) (0x1007)
-; ALL:           ArgType: void __vc_attributes::threadingAttribute::() (0x1008)
-; ALL:           ArgType: 0x1009
-; ALL:           ArgType: <field list> (0x100A)
-; ALL:           ArgType: __vc_attributes::event_receiverAttribute::type_e (0x100D)
+; ALL:           ArgType: d:\src\llvm\test\DebugInfo\PDB\Inputs (0x1007)
+; ALL:           ArgType: C:\Program Files (x86)\Microsoft Visual Studio 12.0\VC\BIN\cl.exe (0x1008)
+; ALL:           ArgType: empty.cpp (0x1009)
+; ALL:           ArgType: d:\src\llvm\test\DebugInfo\PDB\Inputs\vc120.pdb (0x100A)
+; ALL:           ArgType:  Kits\8.1\include\um" -I"C:\Program Files (x86)\Windows Kits\8.1\include\winrt" -TP -X (0x100D)
 ; ALL:         ]
 ; ALL:       }
 ; ALL:     }
@@ -1701,6 +1835,7 @@
 ; BIG-NEXT:   Signature: 0x571FFE67
 ; BIG-NEXT:   Age: 1
 ; BIG-NEXT:   Guid: {880ECC89-DF81-0B4F-839C-58CBD052E937}
+; BIG-NEXT:   Features: 0x1
 ; BIG-NEXT:   Named Streams {
 ; BIG-NEXT:     /names: 13
 ; BIG-NEXT:     /LinkInfo: 5
diff --git a/test/DebugInfo/PDB/pdbdump-readwrite.test b/test/DebugInfo/PDB/pdbdump-readwrite.test
index 780612aa65d76b41d7019ec41d9998d8d013cad0..4756faf68c2d625715aa957b9b57317337bec7c8 100644
--- a/test/DebugInfo/PDB/pdbdump-readwrite.test
+++ b/test/DebugInfo/PDB/pdbdump-readwrite.test
@@ -27,6 +27,7 @@ CHECK-NEXT:   Version: 20000404
 CHECK-NEXT:   Signature: 0x54E507E2
 CHECK-NEXT:   Age: 1
 CHECK-NEXT:   Guid: {0B355641-86A0-A249-896F-9988FAE52FF0}
+CHECK-NEXT:   Features: 0x1
 CHECK-NEXT:   Named Streams {
 CHECK:          /names: 
 CHECK:        }
diff --git a/test/DebugInfo/PDB/pdbdump-yaml-types.test b/test/DebugInfo/PDB/pdbdump-yaml-types.test
index 25895f3de2f3fc6307f666d4613d928d7ee7c03c..7e6fcc1ca42010b5792a507b4d745fd15dd7d431 100644
--- a/test/DebugInfo/PDB/pdbdump-yaml-types.test
+++ b/test/DebugInfo/PDB/pdbdump-yaml-types.test
@@ -4,18 +4,14 @@
 YAML: ---
 YAML: MSF:             
 YAML:   SuperBlock:      
-YAML:     BlockSize:       4096
-YAML:     FreeBlockMap:    2
 YAML:     NumBlocks:       25
 YAML:     NumDirectoryBytes: 136
-YAML:     Unknown1:        0
 YAML:     BlockMapAddr:    24
 YAML:   NumDirectoryBlocks: 1
 YAML:   DirectoryBlocks: [ 23 ]
 YAML:   NumStreams:      0
 YAML:   FileSize:        102400
 YAML: TpiStream:       
-YAML:   Version:         VC80
 YAML:   Records:         
 YAML:     - Kind:            LF_ARGLIST
 YAML:       ArgList:         
diff --git a/test/DebugInfo/PDB/pdbdump-yaml.test b/test/DebugInfo/PDB/pdbdump-yaml.test
index e4cb5f5608be2970d6c272789252065dbba2848a..44025be5bca73378a986576d8d2bf993945fa563 100644
--- a/test/DebugInfo/PDB/pdbdump-yaml.test
+++ b/test/DebugInfo/PDB/pdbdump-yaml.test
@@ -44,6 +44,7 @@
 ; YAML-NEXT:   Age:             1
 ; YAML-NEXT:   Guid:            '{0B355641-86A0-A249-896F-9988FAE52FF0}'
 ; YAML-NEXT:   Signature:       1424295906
+; YAML-NEXT:   Features:        [ VC110 ]
 ; YAML-NEXT:   Version:         VC70
 ; YAML-NEXT: ...
 
diff --git a/test/DebugInfo/PowerPC/tls-fission.ll b/test/DebugInfo/PowerPC/tls-fission.ll
index 358fd5b32c498e461f231979160820e9e0b29bff..f456cbcb7146c01be122e095a74b2da59a2982f0 100644
--- a/test/DebugInfo/PowerPC/tls-fission.ll
+++ b/test/DebugInfo/PowerPC/tls-fission.ll
@@ -13,8 +13,7 @@
 ; DW_OP_GNU_push_tls_address
 ; CHECK-NEXT: .byte 224
 ; check that the expected TLS address description is the first thing in the debug_addr section
-; CHECK: debug_addr
-; CHECK-NEXT: .Laddr_sec:
+; CHECK: .section .debug_addr,"",@progbits
 ; CHECK-NEXT: .quad tls@DTPREL+32768
 
 source_filename = "test/DebugInfo/PowerPC/tls-fission.ll"
diff --git a/test/DebugInfo/WebAssembly/dbg-declare.ll b/test/DebugInfo/WebAssembly/dbg-declare.ll
index c48b9122a1aa0d6431c9e9e7283c619e6a7d2ee2..d0f172c6988c4c02cb8ca96cb4bd79709f014df2 100644
--- a/test/DebugInfo/WebAssembly/dbg-declare.ll
+++ b/test/DebugInfo/WebAssembly/dbg-declare.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=wasm32-unknown-unknown | FileCheck %s
-; RUN: llc < %s -verify-machineinstrs -mtriple=wasm32-unknown-unknown -fast-isel | FileCheck --check-prefix=CHECK-FAST %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=wasm32-unknown-unknown-wasm | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=wasm32-unknown-unknown-wasm -fast-isel | FileCheck --check-prefix=CHECK-FAST %s
 ; CHECK: #DEBUG_VALUE: decode:i <- [%vreg
 ; CHECK: #DEBUG_VALUE: decode:v <- [%vreg
 ; CHECK: DW_TAG_variable
@@ -9,7 +9,7 @@
 
 source_filename = "test/DebugInfo/WebAssembly/dbg-declare.ll"
 target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
-target triple = "wasm32-unknown-unknown"
+target triple = "wasm32-unknown-unknown-wasm"
 
 @key = external global [15 x i8], align 1
 
diff --git a/test/DebugInfo/X86/FrameIndexExprs.ll b/test/DebugInfo/X86/FrameIndexExprs.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8b2cefc4f8f89b90c7e37c3a759051375c676462
--- /dev/null
+++ b/test/DebugInfo/X86/FrameIndexExprs.ll
@@ -0,0 +1,85 @@
+; PR31381: An assertion in the DWARF backend when fragments in MMI slots are
+; sorted by largest offset first.
+; RUN: llc -mtriple=x86_64-apple-darwin -filetype=obj < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+; CHECK: DW_TAG_formal_parameter
+; CHECK: DW_TAG_formal_parameter
+; CHECK-NEXT: DW_AT_location [DW_FORM_exprloc]  (<0xa> 91 78 93 03 93 06 91 7d 93 03 )
+;           fbreg -8, piece 0x00000003, piece 0x00000006, fbreg -3, piece 0x00000003 
+; CHECK-NEXT: DW_AT_abstract_origin {{.*}}"p"
+source_filename = "bugpoint-reduced-simplified.ll"
+target triple = "x86_64-apple-darwin"
+
+@f = common local_unnamed_addr global i32 0, align 4, !dbg !0
+@h = common local_unnamed_addr global i32 0, align 4, !dbg !6
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #0
+
+define void @fn4() local_unnamed_addr !dbg !12 {
+entry:
+  %l1.sroa.7.i = alloca [3 x i8], align 1
+  tail call void @llvm.dbg.declare(metadata [3 x i8]* %l1.sroa.7.i, metadata !15, metadata !26), !dbg !27
+  %i.sroa.4.i = alloca [3 x i8], align 8
+  tail call void @llvm.dbg.declare(metadata [3 x i8]* %i.sroa.4.i, metadata !15, metadata !32), !dbg !27
+  %0 = load i32, i32* @h, align 4
+  br label %while.body.i.i, !dbg !33
+
+while.body.i.i:                                   ; preds = %while.body.i.i, %entry
+  br label %while.body.i.i, !dbg !34
+
+fn3.exit:                                         ; No predecessors!
+  %1 = load i32, i32* @f, align 4
+  %tobool.i = icmp eq i32 %1, 0
+  br label %while.body.i
+
+while.body.i:                                     ; preds = %if.end.i, %fn3.exit
+  br i1 %tobool.i, label %if.end.i, label %if.then.i
+
+if.then.i:                                        ; preds = %while.body.i
+  br label %if.end.i
+
+if.end.i:                                         ; preds = %if.then.i, %while.body.i
+  br label %while.body.i
+}
+
+attributes #0 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!9, !10, !11}
+
+!0 = !DIGlobalVariableExpression(var: !1)
+!1 = distinct !DIGlobalVariable(name: "f", scope: !2, file: !3, line: 8, type: !8, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "PR31381.c", directory: "/")
+!4 = !{}
+!5 = !{!0, !6}
+!6 = !DIGlobalVariableExpression(var: !7)
+!7 = distinct !DIGlobalVariable(name: "h", scope: !2, file: !3, line: 8, type: !8, isLocal: false, isDefinition: true)
+!8 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!9 = !{i32 2, !"Dwarf Version", i32 4}
+!10 = !{i32 2, !"Debug Info Version", i32 3}
+!11 = !{i32 1, !"PIC Level", i32 2}
+!12 = distinct !DISubprogram(name: "fn4", scope: !3, file: !3, line: 31, type: !13, isLocal: false, isDefinition: true, scopeLine: 32, isOptimized: true, unit: !2, variables: !4)
+!13 = !DISubroutineType(types: !14)
+!14 = !{null}
+!15 = !DILocalVariable(name: "p", arg: 1, scope: !16, file: !3, line: 19, type: !19)
+!16 = distinct !DISubprogram(name: "fn2", scope: !3, file: !3, line: 19, type: !17, isLocal: false, isDefinition: true, scopeLine: 20, flags: DIFlagPrototyped, isOptimized: true, unit: !2, variables: !25)
+!17 = !DISubroutineType(types: !18)
+!18 = !{null, !19}
+!19 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "S", file: !3, line: 1, size: 96, elements: !20)
+!20 = !{!21, !23, !24}
+!21 = !DIDerivedType(tag: DW_TAG_member, name: "b", scope: !19, file: !3, line: 4, baseType: !22, size: 8, offset: 24)
+!22 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char)
+!23 = !DIDerivedType(tag: DW_TAG_member, name: "c", scope: !19, file: !3, line: 5, baseType: !8, size: 32, offset: 32)
+!24 = !DIDerivedType(tag: DW_TAG_member, name: "d", scope: !19, file: !3, line: 6, baseType: !8, size: 6, offset: 64, flags: DIFlagBitField, extraData: i64 64)
+!25 = !{!15}
+!26 = !DIExpression(DW_OP_LLVM_fragment, 72, 24)
+!27 = !DILocation(line: 19, column: 20, scope: !16, inlinedAt: !28)
+!28 = distinct !DILocation(line: 27, column: 3, scope: !29, inlinedAt: !30)
+!29 = distinct !DISubprogram(name: "fn3", scope: !3, file: !3, line: 24, type: !13, isLocal: false, isDefinition: true, scopeLine: 25, isOptimized: true, unit: !2, variables: !4)
+!30 = distinct !DILocation(line: 34, column: 7, scope: !31)
+!31 = distinct !DILexicalBlock(scope: !12, file: !3, line: 33, column: 5)
+!32 = !DIExpression(DW_OP_LLVM_fragment, 0, 24)
+!33 = !DILocation(line: 22, column: 9, scope: !16, inlinedAt: !28)
+!34 = !DILocation(line: 21, column: 3, scope: !35, inlinedAt: !28)
+!35 = !DILexicalBlockFile(scope: !16, file: !3, discriminator: 2)
diff --git a/test/DebugInfo/X86/PR26148.ll b/test/DebugInfo/X86/PR26148.ll
index 1f66b7599faae2679088a8904867c9d5bae72197..69e7bbd213b4636dfbbe7bdff1b20c16eef0bf3e 100644
--- a/test/DebugInfo/X86/PR26148.ll
+++ b/test/DebugInfo/X86/PR26148.ll
@@ -19,7 +19,7 @@
 ; AS in 26163, we expect two ranges (as opposed to one), the first one being zero sized
 ;
 ;
-; CHECK: 0x00000025: Beginning address offset: 0x0000000000000004
+; CHECK:             Beginning address offset: 0x0000000000000004
 ; CHECK:                Ending address offset: 0x0000000000000004
 ; CHECK:                 Location description: 10 03 93 04 55 93 02
 ; constu 0x00000003, piece 0x00000004, rdi, piece 0x00000002
diff --git a/test/DebugInfo/X86/dbg-abstract-vars-g-gmlt.ll b/test/DebugInfo/X86/dbg-abstract-vars-g-gmlt.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a4e40a20da563b019ef4e757238f17c1f2137dff
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-abstract-vars-g-gmlt.ll
@@ -0,0 +1,105 @@
+; RUN: llc < %s -filetype=obj | llvm-dwarfdump - -debug-dump=info | FileCheck %s
+;
+; IR module created as follows:
+;   clang -emit-llvm -S db-abs-1.cpp -o db-abs-1.ll -g
+;   clang -emit-llvm -S db-abs-2.cpp -o db-abs-2.ll -gmlt
+;   llvm-link db-abs-1.ll db-abs-2.ll -S -o db-abs-3.ll
+; --- db-abs-1.cpp ---
+; void f1();
+; inline __attribute__((always_inline)) void f2(int) {
+;   f1();
+; }
+; void f3() {
+;   f2(0);
+; }
+; --- db-abs-2.cpp ---
+; void f() {
+; }
+; ---
+; The point is that f3() is compiled -g and we get an abstract variable for the
+; unnamed parameter to f2(); then f() is compiled -gmlt and it's okay to have
+; the abstract variable still there.
+; PR31437.
+;
+; (The 'always_inline' attribute forces f2() to be inlined even at -O0; the
+; 'inline' keyword means the non-inlined definition of f2() can be omitted from
+; the IR.  These are just tactics to simplify the generated test case.)
+;
+; Verify we see the formal parameter in the first compile-unit, and nothing in
+; the second compile-unit.
+;
+; CHECK:      DW_TAG_compile_unit
+; CHECK-NOT:  DW_TAG
+; CHECK:      DW_AT_name {{.*}} "db-abs-1.cpp"
+; CHECK-NOT:  NULL
+; CHECK:      DW_TAG_subprogram
+; CHECK-NEXT: DW_AT_linkage_name {{.*}} "_Z2f2i"
+; CHECK-NOT:  {{DW_TAG|NULL}}
+; CHECK:      DW_TAG_formal_parameter
+; CHECK-NOT:  DW_AT_name
+; CHECK:      {{DW_TAG|NULL}}
+; CHECK:      DW_TAG_inlined_subroutine
+; CHECK-NEXT: DW_AT_abstract_origin {{.*}} "_Z2f2i"
+
+; CHECK:      DW_TAG_compile_unit
+; CHECK-NOT:  DW_TAG
+
+; ModuleID = 'llvm-link'
+source_filename = "llvm-link"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: noinline uwtable
+define void @_Z2f3v() #0 !dbg !8 {
+entry:
+  %.addr.i = alloca i32, align 4
+  call void @llvm.dbg.declare(metadata i32* %.addr.i, metadata !11, metadata !16), !dbg !17
+  store i32 0, i32* %.addr.i, align 4
+  call void @_Z2f1v(), !dbg !19
+  ret void, !dbg !20
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+declare void @_Z2f1v() #2
+
+; Function Attrs: noinline nounwind uwtable
+define void @_Z1fv() #3 !dbg !21 {
+entry:
+  ret void, !dbg !23
+}
+
+attributes #0 = { noinline uwtable }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
+attributes #3 = { noinline nounwind uwtable }
+
+!llvm.dbg.cu = !{!0, !3}
+!llvm.ident = !{!5, !5}
+!llvm.module.flags = !{!6, !7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 293745)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "db-abs-1.cpp", directory: "/home/probinson/projects/scratch/pr31437")
+!2 = !{}
+!3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !4, producer: "clang version 5.0.0 (trunk 293745)", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!4 = !DIFile(filename: "db-abs-2.cpp", directory: "/home/probinson/projects/scratch/pr31437")
+!5 = !{!"clang version 5.0.0 (trunk 293745)"}
+!6 = !{i32 2, !"Dwarf Version", i32 4}
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!8 = distinct !DISubprogram(name: "f3", linkageName: "_Z2f3v", scope: !1, file: !1, line: 5, type: !9, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!9 = !DISubroutineType(types: !10)
+!10 = !{null}
+!11 = !DILocalVariable(arg: 1, scope: !12, file: !1, line: 2, type: !15)
+!12 = distinct !DISubprogram(name: "f2", linkageName: "_Z2f2i", scope: !1, file: !1, line: 2, type: !13, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!13 = !DISubroutineType(types: !14)
+!14 = !{null, !15}
+!15 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!16 = !DIExpression()
+!17 = !DILocation(line: 2, column: 50, scope: !12, inlinedAt: !18)
+!18 = distinct !DILocation(line: 6, column: 3, scope: !8)
+!19 = !DILocation(line: 3, column: 3, scope: !12, inlinedAt: !18)
+!20 = !DILocation(line: 7, column: 1, scope: !8)
+!21 = distinct !DISubprogram(name: "f", scope: !4, file: !4, line: 1, type: !22, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !3, variables: !2)
+!22 = !DISubroutineType(types: !2)
+!23 = !DILocation(line: 2, column: 1, scope: !21)
diff --git a/test/DebugInfo/X86/dbg-value-const-byref.ll b/test/DebugInfo/X86/dbg-value-const-byref.ll
index 40b9f726f31ef1a41c630a2a055e81e8bb6ecc8f..77e243702a8a2096814562dd3120d2254f1bba53 100644
--- a/test/DebugInfo/X86/dbg-value-const-byref.ll
+++ b/test/DebugInfo/X86/dbg-value-const-byref.ll
@@ -34,10 +34,10 @@
 ; CHECK: Beginning address offset: [[C1]]
 ; CHECK:    Ending address offset: [[C2:.*]]
 ; CHECK:     Location description: 11 07
-;        rax, piece 0x00000004
+;        rax
 ; CHECK: Beginning address offset: [[C2]]
 ; CHECK:    Ending address offset: [[R1:.*]]
-; CHECK:     Location description: 50 93 04
+; CHECK:     Location description: 50
 ;         rdi+0
 ; CHECK: Beginning address offset: [[R1]]
 ; CHECK:    Ending address offset: [[R2:.*]]
diff --git a/test/DebugInfo/X86/dbg-value-g-gmlt.ll b/test/DebugInfo/X86/dbg-value-g-gmlt.ll
new file mode 100644
index 0000000000000000000000000000000000000000..45b9b0a1686296bdd40ad6147c52ce0ed5c321cd
--- /dev/null
+++ b/test/DebugInfo/X86/dbg-value-g-gmlt.ll
@@ -0,0 +1,100 @@
+; RUN: llc < %s -filetype=obj | llvm-dwarfdump - -debug-dump=info | FileCheck %s
+;
+; IR module created as follows:
+;   clang -emit-llvm -S -O2 foo.cpp -o foo.ll -g
+;   clang -emit-llvm -S -O2 bar.cpp -o bar.ll -gmlt
+;   llvm-link foo.ll bar.ll -S -o linked.ll
+;   opt -std-link-opts linked.ll -S -o opt.ll
+; --- foo.cpp ---
+; void f();
+; void foo(int param) {
+;   if (param) f();
+; }
+; --- bar.cpp ---
+; void foo(int);
+; void bar() {
+;   foo(0);
+; }
+; ---
+; The point is that bar() is compiled -gmlt and calls foo() with a constant 0.
+; foo() is compiled -g and gets inlined into bar(); foo's body is then
+; optimized away, leaving only a dbg.value call describing the inlined copy
+; of 'param', which should be benign.
+; That is, the compile-unit for bar.cpp should have nothing in it.
+; PR31437.
+
+; foo.cpp's unit comes first; skip past it, second unit should be empty.
+; CHECK:     DW_TAG_compile_unit
+; CHECK-NOT: {{DW_TAG|NULL}}
+; CHECK:     DW_AT_name {{.*}} "foo.cpp"
+; CHECK:     DW_TAG_compile_unit
+; CHECK-NOT: DW_TAG
+
+; ModuleID = 'linked.ll'
+source_filename = "llvm-link"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; Function Attrs: uwtable
+define void @_Z3fooi(i32 %param) local_unnamed_addr #0 !dbg !8 {
+entry:
+  tail call void @llvm.dbg.value(metadata i32 %param, i64 0, metadata !13, metadata !14), !dbg !15
+  %tobool = icmp eq i32 %param, 0, !dbg !16
+  br i1 %tobool, label %if.end, label %if.then, !dbg !18
+
+if.then:                                          ; preds = %entry
+  tail call void @_Z1fv(), !dbg !19
+  br label %if.end, !dbg !19
+
+if.end:                                           ; preds = %if.then, %entry
+  ret void, !dbg !21
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+declare void @_Z1fv() local_unnamed_addr #2
+
+; Function Attrs: nounwind readnone uwtable
+define void @_Z3barv() local_unnamed_addr #3 !dbg !22 {
+entry:
+  tail call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !13, metadata !14), !dbg !24
+  ret void, !dbg !26
+}
+
+attributes #0 = { uwtable }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind }
+attributes #3 = { nounwind readnone uwtable }
+
+!llvm.dbg.cu = !{!0, !3}
+!llvm.ident = !{!5, !5}
+!llvm.module.flags = !{!6, !7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 293745)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "foo.cpp", directory: "/home/probinson/projects/scratch/pr31437")
+!2 = !{}
+!3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !4, producer: "clang version 5.0.0 (trunk 293745)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!4 = !DIFile(filename: "bar.cpp", directory: "/home/probinson/projects/scratch/pr31437")
+!5 = !{!"clang version 5.0.0 (trunk 293745)"}
+!6 = !{i32 2, !"Dwarf Version", i32 4}
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!8 = distinct !DISubprogram(name: "foo", linkageName: "_Z3fooi", scope: !1, file: !1, line: 2, type: !9, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !12)
+!9 = !DISubroutineType(types: !10)
+!10 = !{null, !11}
+!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!12 = !{!13}
+!13 = !DILocalVariable(name: "param", arg: 1, scope: !8, file: !1, line: 2, type: !11)
+!14 = !DIExpression()
+!15 = !DILocation(line: 2, column: 14, scope: !8)
+!16 = !DILocation(line: 3, column: 7, scope: !17)
+!17 = distinct !DILexicalBlock(scope: !8, file: !1, line: 3, column: 7)
+!18 = !DILocation(line: 3, column: 7, scope: !8)
+!19 = !DILocation(line: 3, column: 14, scope: !20)
+!20 = !DILexicalBlockFile(scope: !17, file: !1, discriminator: 1)
+!21 = !DILocation(line: 4, column: 1, scope: !8)
+!22 = distinct !DISubprogram(name: "bar", scope: !4, file: !4, line: 2, type: !23, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !3, variables: !2)
+!23 = !DISubroutineType(types: !2)
+!24 = !DILocation(line: 2, column: 14, scope: !8, inlinedAt: !25)
+!25 = distinct !DILocation(line: 3, column: 3, scope: !22)
+!26 = !DILocation(line: 4, column: 1, scope: !22)
diff --git a/test/DebugInfo/X86/dbg-value-regmask-clobber.ll b/test/DebugInfo/X86/dbg-value-regmask-clobber.ll
index 93543e5ed948c97b06447dbca639ceb68c40abc5..b958f080d02e5f4c8cbee8eac8fcaaa5867a43c6 100644
--- a/test/DebugInfo/X86/dbg-value-regmask-clobber.ll
+++ b/test/DebugInfo/X86/dbg-value-regmask-clobber.ll
@@ -16,10 +16,8 @@
 ; ASM: .Ldebug_loc1:
 ; ASM-NEXT: .quad   .Lfunc_begin0-.Lfunc_begin0
 ; ASM-NEXT: .quad   [[argc_range_end]]-.Lfunc_begin0
-; ASM-NEXT: .short  3                       # Loc expr size
+; ASM-NEXT: .short  1                       # Loc expr size
 ; ASM-NEXT: .byte   82                      # super-register DW_OP_reg2
-; ASM-NEXT: .byte   147                     # DW_OP_piece
-; ASM-NEXT: .byte   4                       # 4
 
 ; argc is the first formal parameter.
 ; DWARF: .debug_info contents:
@@ -30,7 +28,7 @@
 ; DWARF: .debug_loc contents:
 ; DWARF: [[argc_loc_offset]]: Beginning address offset: 0x0000000000000000
 ; DWARF-NEXT:                    Ending address offset: 0x0000000000000013
-; DWARF-NEXT:                     Location description: 52 93 04
+; DWARF-NEXT:                     Location description: 52
 
 ; ModuleID = 't.cpp'
 source_filename = "test/DebugInfo/X86/dbg-value-regmask-clobber.ll"
diff --git a/test/DebugInfo/X86/debug-info-producer-with-flags.ll b/test/DebugInfo/X86/debug-info-producer-with-flags.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c004c8a9067ba659c3ef8efdd361016610126b77
--- /dev/null
+++ b/test/DebugInfo/X86/debug-info-producer-with-flags.ll
@@ -0,0 +1,44 @@
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu %s -o %t -filetype=obj
+; RUN: llvm-dwarfdump -debug-dump=info %t | FileCheck %s
+;
+; Test the DW_AT_producer DWARF attribute.
+; When producer and flags are both given in DICompileUnit, set DW_AT_producer
+; to the two values combined.
+;
+; The test splits into two parts, this is LLVM part. The frontend part can be
+; found at llvm/tools/clang/test/Driver/debug-options.c.
+;
+; Generated and reduced from:
+; clang++ -g -grecord-gcc-switches test.cc -S -emit-llvm -o -
+;
+; test.cc:
+;   int main() {
+;     return 0;
+;   }
+
+; CHECK: DW_AT_producer
+; CHECK-SAME: "clang++ -g -grecord-gcc-switches test.cc -S -emit-llvm -o -"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @main() !dbg !6 {
+entry:
+  %retval = alloca i32, align 4
+  store i32 0, i32* %retval, align 4
+  ret i32 0, !dbg !10
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang++", isOptimized: false, flags: "-g -grecord-gcc-switches test.cc -S -emit-llvm -o -", runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "test.cc", directory: "d")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!"clang"}
+!6 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 4, type: !7, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !8)
+!8 = !{!9}
+!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!10 = !DILocation(line: 5, column: 3, scope: !6)
diff --git a/test/DebugInfo/X86/debug_and_nodebug_CUs.ll b/test/DebugInfo/X86/debug_and_nodebug_CUs.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8347893ccf35d55e9025855069dac71992ccec7b
--- /dev/null
+++ b/test/DebugInfo/X86/debug_and_nodebug_CUs.ll
@@ -0,0 +1,82 @@
+; Test to ensure that a module containing both a NoDebug CU and one with
+; debug is handled correctly.
+
+; LLVM IR was generated the following way:
+; $ cat a.cpp
+; void f1();
+; __attribute__((always_inline)) void f2() {
+;     f1();
+; }
+; void f3();
+; void f4() {
+;     f3();
+; }
+; $ cat b.cpp
+; void f2();
+; __attribute__((always_inline)) void f3() {
+;     f2();
+; }
+; $ clang++ -flto a.cpp -g -c
+; $ clang++ -flto b.cpp -Rpass=inline -c
+; $ llvm-link {a,b}.o -o - | opt -O2 - -o ab.bc
+; $ llvm-dis ab.bc
+
+; Ensure we can successfully generate assembly, and check that neither
+; "b.cpp" nor "f3" strings show up (which would be in the .debug_str
+; section if we had generated any lexical scopes and debug for them).
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu %s -o - | FileCheck %s
+; CHECK-NOT: .asciz  "b.cpp"
+; CHECK-NOT: .asciz  "f3"
+
+; ModuleID = 'debug_and_nodebug_CUs.bc'
+source_filename = "debug_and_nodebug_CUs.ll"
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @_Z2f2v() local_unnamed_addr !dbg !8 {
+entry:
+  tail call void @_Z2f1v(), !dbg !11
+  ret void, !dbg !12
+}
+
+declare void @_Z2f1v() local_unnamed_addr
+
+define void @_Z2f4v() local_unnamed_addr !dbg !13 {
+entry:
+  tail call void @_Z2f1v(), !dbg !14
+  ret void, !dbg !19
+}
+
+define void @_Z2f3v() local_unnamed_addr !dbg !16 {
+entry:
+  tail call void @_Z2f1v(), !dbg !20
+  ret void, !dbg !22
+}
+
+!llvm.dbg.cu = !{!0, !3}
+!llvm.ident = !{!5, !5}
+!llvm.module.flags = !{!6, !7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 294362) (llvm/trunk 294367)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "a.cpp", directory: ".")
+!2 = !{}
+!3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !4, producer: "clang version 5.0.0 (trunk 294362) (llvm/trunk 294367)", isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug, enums: !2)
+!4 = !DIFile(filename: "b.cpp", directory: ".")
+!5 = !{!"clang version 5.0.0 (trunk 294362) (llvm/trunk 294367)"}
+!6 = !{i32 2, !"Dwarf Version", i32 4}
+!7 = !{i32 2, !"Debug Info Version", i32 3}
+!8 = distinct !DISubprogram(name: "f2", linkageName: "_Z2f2v", scope: !1, file: !1, line: 2, type: !9, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!9 = !DISubroutineType(types: !10)
+!10 = !{null}
+!11 = !DILocation(line: 3, column: 3, scope: !8)
+!12 = !DILocation(line: 4, column: 1, scope: !8)
+!13 = distinct !DISubprogram(name: "f4", linkageName: "_Z2f4v", scope: !1, file: !1, line: 6, type: !9, isLocal: false, isDefinition: true, scopeLine: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!14 = !DILocation(line: 3, column: 3, scope: !8, inlinedAt: !15)
+!15 = distinct !DILocation(line: 3, column: 3, scope: !16, inlinedAt: !18)
+!16 = distinct !DISubprogram(name: "f3", scope: !4, file: !4, line: 2, type: !17, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !3, variables: !2)
+!17 = !DISubroutineType(types: !2)
+!18 = distinct !DILocation(line: 7, column: 3, scope: !13)
+!19 = !DILocation(line: 8, column: 1, scope: !13)
+!20 = !DILocation(line: 3, column: 3, scope: !8, inlinedAt: !21)
+!21 = distinct !DILocation(line: 3, column: 3, scope: !16)
+!22 = !DILocation(line: 4, column: 1, scope: !16)
diff --git a/test/DebugInfo/X86/default-subrange-array.ll b/test/DebugInfo/X86/default-subrange-array.ll
new file mode 100644
index 0000000000000000000000000000000000000000..564e195a36f67535366b31cd183aba075dcbebfc
--- /dev/null
+++ b/test/DebugInfo/X86/default-subrange-array.ll
@@ -0,0 +1,53 @@
+; RUN: llc -mtriple=x86_64-apple-darwin -O0 -filetype=obj -dwarf-version 4 \
+; RUN:     -o - < %s | llvm-dwarfdump - -debug-dump=info \
+; RUN:     | FileCheck %s -check-prefixes=CHECK,DWARF4
+; RUN: llc -mtriple=x86_64-apple-darwin -O0 -filetype=obj -dwarf-version 5 \
+; RUN:     -o - < %s | llvm-dwarfdump - -debug-dump=info \
+; RUN:     | FileCheck %s -check-prefixes=CHECK,DWARF5
+
+; Check that we can omit default array lower-bounds.
+; DW_LANG_C_plus_plus_11 is new in DWARF v5, so if we use that with
+; DWARF v4, we should get the DW_AT_lower_bound attribute.
+
+source_filename = "test/DebugInfo/X86/default-subrange-array.ll"
+
+%class.A = type { [42 x i32] }
+
+@a = global %class.A zeroinitializer, align 4, !dbg !0
+
+; CHECK:       DW_TAG_class_type
+; CHECK:         DW_TAG_member
+; CHECK-NEXT:      DW_AT_name {{.*}} "x"
+; CHECK-NEXT:      DW_AT_type [DW_FORM_ref4] {{.*}} => {[[ARRAY:0x[0-9a-f]+]]})
+
+; CHECK: [[ARRAY]]: DW_TAG_array_type
+; CHECK-NEXT:         DW_AT_type
+; CHECK:            DW_TAG_subrange_type
+; CHECK-NEXT:         DW_AT_type
+; DWARF4-NEXT:        DW_AT_lower_bound [DW_FORM_data1] (0x00)
+; CHECK-NEXT:         DW_AT_count [DW_FORM_data1]       (0x2a)
+; DWARF5-NOT:         DW_AT_lower_bound
+
+
+!llvm.dbg.cu = !{!14}
+!llvm.module.flags = !{!17}
+
+!0 = !DIGlobalVariableExpression(var: !1)
+!1 = !DIGlobalVariable(name: "a", scope: null, file: !2, line: 1, type: !3, isLocal: false, isDefinition: true)
+!2 = !DIFile(filename: "t.cpp", directory: "/Volumes/Sandbox/llvm")
+!3 = !DICompositeType(tag: DW_TAG_class_type, name: "A", file: !2, line: 1, align: 32, elements: !4)
+!4 = !{!5, !10}
+!5 = !DIDerivedType(tag: DW_TAG_member, name: "x", scope: !3, file: !2, line: 1, baseType: !6, flags: DIFlagPrivate)
+!6 = !DICompositeType(tag: DW_TAG_array_type, baseType: !7, align: 32, elements: !8)
+!7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
+!8 = !{!9}
+!9 = !DISubrange(count: 42, lowerBound: 0)
+!10 = !DISubprogram(name: "A", scope: !3, file: !2, line: 1, type: !11, isLocal: false, isDefinition: false, scopeLine: 1, virtualIndex: 6, flags: DIFlagArtificial | DIFlagPrototyped, isOptimized: false)
+!11 = !DISubroutineType(types: !12)
+!12 = !{null, !13}
+!13 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !3, size: 64, align: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!14 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_11, file: !2, producer: "clang version 3.3 (trunk 169136)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !15, retainedTypes: !15, globals: !16, imports: !15)
+!15 = !{}
+!16 = !{!0}
+!17 = !{i32 1, !"Debug Info Version", i32 3}
+
diff --git a/test/DebugInfo/X86/discriminator.ll b/test/DebugInfo/X86/discriminator.ll
index 49b2326ac7448bf49e17bc51513164d9afb6b6f1..a040137adec46b5f65d2f3b23c75763bd9be547e 100644
--- a/test/DebugInfo/X86/discriminator.ll
+++ b/test/DebugInfo/X86/discriminator.ll
@@ -59,4 +59,4 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 
 ; CHECK: Address            Line   Column File   ISA Discriminator Flags
 ; CHECK: ------------------ ------ ------ ------ --- ------------- -------------
-; CHECK: 0x0000000000000011      2      0      1   0            42 {{$}}
+; CHECK: 0x000000000000000a      2      0      1   0            42 {{$}}
diff --git a/test/DebugInfo/X86/dw_op_minus_direct.ll b/test/DebugInfo/X86/dw_op_minus_direct.ll
index a84c506b90a730bb93d38a4d3c0eb58626ae123f..29e07213abbb242cd11d777222c83cb4b32b23a4 100644
--- a/test/DebugInfo/X86/dw_op_minus_direct.ll
+++ b/test/DebugInfo/X86/dw_op_minus_direct.ll
@@ -8,8 +8,8 @@
 
 ; CHECK: Beginning address offset: 0x0000000000000000
 ; CHECK:    Ending address offset: 0x0000000000000004
-; CHECK:     Location description: 50 10 01 1c 93 04
-;                                  rax, constu 0x00000001, minus, piece 0x00000004
+; CHECK:     Location description: 50 10 ff ff ff ff 0f 1a 10 01 1c
+;                                  rax, constu 0xffffffff, and, constu 0x00000001, minus
 source_filename = "minus.c"
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.12.0"
diff --git a/test/DebugInfo/X86/externaltyperef.ll b/test/DebugInfo/X86/externaltyperef.ll
deleted file mode 100644
index 4cd7f8c00c77ab4040c74bc2a99182eba5287539..0000000000000000000000000000000000000000
--- a/test/DebugInfo/X86/externaltyperef.ll
+++ /dev/null
@@ -1,52 +0,0 @@
-; REQUIRES: object-emission
-; RUN: %llc_dwarf -filetype=obj -O0 < %s | llvm-dwarfdump -debug-dump=info - | FileCheck %s
-; Manually derived by externalizing the composite types from:
-;
-;   namespace N { class B; }
-;   using N::B;
-;   class A;
-;   A *a;
-;
-; Test the direct use of an external type.
-; CHECK: DW_TAG_variable
-; CHECK:   DW_AT_type [DW_FORM_ref4]	  {{.*}}{[[PTR:.*]]}
-; CHECK: [[PTR]]: DW_TAG_pointer_type
-; CHECK:   DW_AT_type [DW_FORM_ref4]  	  {{.*}}{[[A:.*]]}
-; CHECK: [[A]]: DW_TAG_class_type
-; CHECK:   DW_AT_declaration [DW_FORM_flag]	(0x01)
-; CHECK:   DW_AT_signature [DW_FORM_ref_sig8]	(0x4e834ea939695c24)
-; CHECK: [[B:.*]]: DW_TAG_class_type
-; CHECK:   DW_AT_declaration [DW_FORM_flag]	(0x01)
-; CHECK:   DW_AT_signature [DW_FORM_ref_sig8]	(0x942e51c7addda5f7)
-; CHECK:   DW_TAG_imported_declaration
-; CHECK:     DW_AT_import [DW_FORM_ref4]  {{.*}}[[B]]
-
-source_filename = "test/DebugInfo/X86/externaltyperef.ll"
-target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.10.0"
-
-%class.A = type opaque
-
-@a = global %class.A* null, align 8, !dbg !0
-
-!llvm.dbg.cu = !{!2}
-!llvm.module.flags = !{!12, !13, !14}
-!llvm.ident = !{!15}
-
-!0 = !DIGlobalVariableExpression(var: !1)
-!1 = !DIGlobalVariable(name: "a", scope: !2, file: !3, line: 2, type: !11, isLocal: false, isDefinition: true)
-!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 3.7.0 (trunk 242039) (llvm/trunk 242046)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !5, globals: !8, imports: !9)
-!3 = !DIFile(filename: "test.cpp", directory: "/")
-!4 = !{}
-!5 = !{!6, !7}
-!6 = !DICompositeType(tag: DW_TAG_class_type, name: "A", file: !3, flags: DIFlagExternalTypeRef, identifier: "_ZTS1A")
-!7 = !DICompositeType(tag: DW_TAG_class_type, name: "A", file: !3, flags: DIFlagExternalTypeRef, identifier: "_ZTSN1N1BE")
-!8 = !{!0}
-!9 = !{!10}
-!10 = !DIImportedEntity(tag: DW_TAG_imported_declaration, scope: !2, entity: !7, line: 4)
-!11 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !6, size: 64, align: 64)
-!12 = !{i32 2, !"Dwarf Version", i32 2}
-!13 = !{i32 2, !"Debug Info Version", i32 3}
-!14 = !{i32 1, !"PIC Level", i32 2}
-!15 = !{!"clang version 3.7.0 (trunk 242039) (llvm/trunk 242046)"}
-
diff --git a/test/DebugInfo/X86/fission-ranges.ll b/test/DebugInfo/X86/fission-ranges.ll
index 0dfb13ab66b74d474d5ae398b73fb4b916bc30b1..60d0f1777a430879a1f1cbf2922f5db78806c996 100644
--- a/test/DebugInfo/X86/fission-ranges.ll
+++ b/test/DebugInfo/X86/fission-ranges.ll
@@ -30,16 +30,16 @@
 ; CHECK-NEXT: {{^$}}
 ; CHECK-NEXT:   Beginning address index: 3
 ; CHECK-NEXT:                    Length: 25
-; CHECK-NEXT:      Location description: 50 93 04
+; CHECK-NEXT:      Location description: 50
 ; CHECK: [[E]]: Beginning address index: 4
 ; CHECK-NEXT:                    Length: 19
-; CHECK-NEXT:      Location description: 50 93 04
+; CHECK-NEXT:      Location description: 50
 ; CHECK: [[B]]: Beginning address index: 5
 ; CHECK-NEXT:                    Length: 17
-; CHECK-NEXT:      Location description: 50 93 04
+; CHECK-NEXT:      Location description: 50
 ; CHECK: [[D]]: Beginning address index: 6
 ; CHECK-NEXT:                    Length: 17
-; CHECK-NEXT:      Location description: 50 93 04
+; CHECK-NEXT:      Location description: 50
 
 ; Make sure we don't produce any relocations in any .dwo section (though in particular, debug_info.dwo)
 ; HDR-NOT: .rela.{{.*}}.dwo
diff --git a/test/DebugInfo/X86/gnu-public-names-tu.ll b/test/DebugInfo/X86/gnu-public-names-tu.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0b7647aa8c78db3a7769aa224a739dfb0386ae97
--- /dev/null
+++ b/test/DebugInfo/X86/gnu-public-names-tu.ll
@@ -0,0 +1,54 @@
+; RUN: llc -mtriple=x86_64-pc-linux-gnu -generate-type-units -generate-gnu-dwarf-pub-sections -filetype=obj < %s | llvm-dwarfdump - | FileCheck %s
+
+; Generated from:
+
+; namespace ns {
+; struct foo {
+; };
+; }
+; struct bar {
+;   ns::foo f;
+; };
+; bar b;
+
+; CHECK-LABEL: .debug_info contents:
+; CHECK: [[CU:0x[0-9a-f]+]]: DW_TAG_compile_unit
+; CHECK: [[BAR:0x[0-9a-f]+]]: DW_TAG_structure_type
+
+
+; CHECK-LABEL: .debug_gnu_pubnames contents:
+; CHECK-NEXT: length = {{.*}} version = 0x0002 unit_offset = 0x00000000 unit_size = {{.*}}
+; CHECK-NEXT: Offset     Linkage  Kind     Name
+; CHECK-NEXT: [[CU]]     EXTERNAL TYPE     "ns"
+; CHECK-NEXT: {{.*}}     EXTERNAL VARIABLE "b"
+
+; CHECK-LABEL: .debug_gnu_pubtypes contents:
+; CHECK-NEXT: length = {{.*}} version = 0x0002 unit_offset = 0x00000000 unit_size = {{.*}}
+; CHECK-NEXT: Offset     Linkage  Kind     Name
+; CHECK-NEXT: [[BAR]]    EXTERNAL TYPE     "bar"
+; CHECK-NEXT: [[CU]]     EXTERNAL TYPE     "ns::foo"
+
+%struct.bar = type { %"struct.ns::foo" }
+%"struct.ns::foo" = type { i8 }
+
+@b = global %struct.bar zeroinitializer, align 1, !dbg !0
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!11, !12}
+!llvm.ident = !{!13}
+
+!0 = !DIGlobalVariableExpression(var: !1)
+!1 = distinct !DIGlobalVariable(name: "b", scope: !2, file: !3, line: 8, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 5.0.0 (trunk 293904) (llvm/trunk 293908)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "type.cpp", directory: "/tmp/dbginfo")
+!4 = !{}
+!5 = !{!0}
+!6 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "bar", file: !3, line: 5, size: 8, elements: !7, identifier: "_ZTS3bar")
+!7 = !{!8}
+!8 = !DIDerivedType(tag: DW_TAG_member, name: "f", scope: !6, file: !3, line: 6, baseType: !9, size: 8)
+!9 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "foo", scope: !10, file: !3, line: 2, size: 8, elements: !4, identifier: "_ZTSN2ns3fooE")
+!10 = !DINamespace(name: "ns", scope: null, file: !3, line: 1)
+!11 = !{i32 2, !"Dwarf Version", i32 4}
+!12 = !{i32 2, !"Debug Info Version", i32 3}
+!13 = !{!"clang version 5.0.0 (trunk 293904) (llvm/trunk 293908)"}
+
diff --git a/test/DebugInfo/X86/ref_addr_relocation.ll b/test/DebugInfo/X86/ref_addr_relocation.ll
index 58fc236e6bbb6febc7c05043256d7575b04987ac..373ccfd2dea47964a3ed72287ea1b21bc08a0f25 100644
--- a/test/DebugInfo/X86/ref_addr_relocation.ll
+++ b/test/DebugInfo/X86/ref_addr_relocation.ll
@@ -46,7 +46,7 @@
 ; CHECK: DW_TAG_variable
 ; Make sure this is relocatable.
 ; and test that we don't create the labels to emit a correct COFF relocation
-; ELF-ASM: .quad .Lsection_info+[[TYPE]] # DW_AT_type
+; ELF-ASM: .quad .debug_info+[[TYPE]] # DW_AT_type
 ; COFF-ASM: .secrel32 .Lsection_info+[[TYPE]] # DW_AT_type
 ; DARWIN-ASM2: .quad [[TYPE]] ## DW_AT_type
 ; DARWIN-ASM4: .long [[TYPE]] ## DW_AT_type
diff --git a/test/DebugInfo/X86/single-dbg_value.ll b/test/DebugInfo/X86/single-dbg_value.ll
index 0275c37d24e7ff6589e3755b7302f62c4aeebba2..7f77e61092db339b01a9b25b0e5807aa1684260a 100644
--- a/test/DebugInfo/X86/single-dbg_value.ll
+++ b/test/DebugInfo/X86/single-dbg_value.ll
@@ -8,8 +8,8 @@
 ; CHECK-NEXT:   DW_AT_location [DW_FORM_data4]
 ; CHECK-NEXT:   DW_AT_name{{.*}}"a"
 ; CHECK: .debug_loc contents:
-;                               rax, piece 0x00000004
-; CHECK:  Location description: 50 93 04
+;                               rax
+; CHECK:  Location description: 50
 ; SANITY: DBG_VALUE
 ; SANITY-NOT: DBG_VALUE
 ; ModuleID = 'test.ll'
diff --git a/test/DebugInfo/X86/single-fi.ll b/test/DebugInfo/X86/single-fi.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1de4a3bac5955df10f306c9447217b5479f57fa4
--- /dev/null
+++ b/test/DebugInfo/X86/single-fi.ll
@@ -0,0 +1,40 @@
+; RUN: llc -mtriple=x86_64-apple-darwin -o - %s -filetype=obj \
+; RUN:   | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+; A single FI location. This used to trigger an assertion in debug libstdc++.
+; CHECK: DW_TAG_formal_parameter
+;                                          fbreg -8
+; CHECK-NEXT: DW_AT_location {{.*}} (<0x2> 91 78 )
+; CHECK-NEXT: DW_AT_name {{.*}} "dipsy"
+define void @tinkywinky(i8* %dipsy) !dbg !6 {
+entry:
+  %dipsy.addr = alloca i8*
+  store i8* %dipsy, i8** %dipsy.addr
+  call void @llvm.dbg.declare(metadata i8** %dipsy.addr, metadata !12, metadata
+!13), !dbg !14
+  ret void, !dbg !15
+}
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (trunk 297917) (llvm/trunk 297929)", isOptimized: false,
+runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "teletubbies.c", directory: "/home/davide/work/llvm/build-clang/bin")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!"clang version 5.0.0 (trunk 297917) (llvm/trunk 297929)"}
+!6 = distinct !DISubprogram(name: "tinkywinky", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 1, flags:
+DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !8)
+!8 = !{null, !9}
+!9 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !10, size: 64)
+!10 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !11)
+!11 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char)
+!12 = !DILocalVariable(name: "dipsy", arg: 1, scope: !6, file: !1, line: 1, type: !9)
+!13 = !DIExpression()
+!14 = !DILocation(line: 1, column: 29, scope: !6)
+!15 = !DILocation(line: 1, column: 37, scope: !6)
diff --git a/test/DebugInfo/X86/split-global.ll b/test/DebugInfo/X86/split-global.ll
index 536ed045b5d65bc88320385eaab4fa2dd0f031fb..3cdecdafc8d47c414b014a466127a400db135b57 100644
--- a/test/DebugInfo/X86/split-global.ll
+++ b/test/DebugInfo/X86/split-global.ll
@@ -30,7 +30,7 @@ target triple = "x86_64-apple-macosx10.12.0"
 @point.y = global i32 2, align 4, !dbg !13
 @point.x = global i32 1, align 4, !dbg !12
 
-@part_const.x = global i32 1, align 4, !dbg !15
+@part_const.x = global i32 1, align 4, !dbg !14
 
 !llvm.dbg.cu = !{!1}
 !llvm.module.flags = !{!10, !11}
diff --git a/test/DebugInfo/X86/stack-value-dwarf4.ll b/test/DebugInfo/X86/stack-value-dwarf4.ll
index 5f9213f3bdd4a59afa7fc791ab9acdb86e0e7a10..7ad7cceb7ff0d02090ee4564c2ecf4f624db773c 100644
--- a/test/DebugInfo/X86/stack-value-dwarf4.ll
+++ b/test/DebugInfo/X86/stack-value-dwarf4.ll
@@ -1,38 +1,42 @@
 ; RUN: llc -o - %s | FileCheck --check-prefix=CHECK-DWARF2 %s
 ; RUN: llc -dwarf-version=4 -o - %s | FileCheck --check-prefix=CHECK-DWARF4 %s
 
+; Exercise DW_OP_stack_value on global constants.
+
 target datalayout = "e-p:64:64"
 target triple = "x86_64-unknown-linux-gnu"
 
-; CHECK-DWARF2:      .byte	13                      # DW_AT_location
-; CHECK-DWARF2-NEXT: .byte	3
-; CHECK-DWARF2-NEXT: .quad	g
-; CHECK-DWARF2-NEXT: .byte	16
-; CHECK-DWARF2-NEXT: .byte	4
-; CHECK-DWARF2-NEXT: .byte	16
-; CHECK-DWARF2-NEXT: .byte	4
-
-; CHECK-DWARF4:      .byte	14                      # DW_AT_location
-; CHECK-DWARF4-NEXT: .byte	3
-; CHECK-DWARF4-NEXT: .quad	g
-; CHECK-DWARF4-NEXT: .byte	16
-; CHECK-DWARF4-NEXT: .byte	4
-; CHECK-DWARF4-NEXT: .byte	16
-; CHECK-DWARF4-NEXT: .byte	4
-; CHECK-DWARF4-NEXT: .byte	159
+; CHECK-DWARF2: .byte	8                       # DW_AT_location
+; CHECK-DWARF2-NEXT: .byte	16
+; CHECK-DWARF2-NEXT: .byte	4
+; CHECK-DWARF2-NEXT: .byte	147
+; CHECK-DWARF2-NEXT: .byte	2
+; CHECK-DWARF2-NEXT: .byte	16
+; CHECK-DWARF2-NEXT: .byte	0
+; CHECK-DWARF2-NEXT: .byte	147
+; CHECK-DWARF2-NEXT: .byte	2
 
-@g = global i32 0, !dbg !2
+; CHECK-DWARF4:       .byte	10                      # DW_AT_location
+; CHECK-DWARF4-NEXT:  .byte	16
+; CHECK-DWARF4-NEXT:  .byte	4
+; CHECK-DWARF4-NEXT:  .byte	159
+; CHECK-DWARF4-NEXT:  .byte	147
+; CHECK-DWARF4-NEXT:  .byte	2
+; CHECK-DWARF4-NEXT:  .byte	16
+; CHECK-DWARF4-NEXT:  .byte	0
+; CHECK-DWARF4-NEXT:  .byte	159
 
 !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang", file: !4, globals: !1, emissionKind: FullDebug)
-!1 = !{!2}
+!1 = !{!2, !10}
 !2 = !DIGlobalVariableExpression(var: !8, expr: !3)
-!3 = !DIExpression(DW_OP_constu, 4, DW_OP_constu, 4, DW_OP_stack_value)
+!3 = !DIExpression(DW_OP_constu, 4, DW_OP_stack_value, DW_OP_LLVM_fragment, 0, 16)
 !4 = !DIFile(filename: "<stdin>", directory: "/")
 !5 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
-
 !6 = !{i32 2, !"Dwarf Version", i32 2}
 !7 = !{i32 2, !"Debug Info Version", i32 3}
 !8 = distinct !DIGlobalVariable(name: "a", scope: null, isLocal: false, isDefinition: true, type: !5)
+!9 = !DIExpression(DW_OP_constu, 0, DW_OP_stack_value, DW_OP_LLVM_fragment, 16, 16)
+!10 = !DIGlobalVariableExpression(var: !8, expr: !9)
 
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!6, !7}
diff --git a/test/DebugInfo/X86/subreg.ll b/test/DebugInfo/X86/subreg.ll
index 5e837edfd2bf6debd913b5c6cef645bc0f4217d6..30c672396e4e96d3daf680de3f5cfea616919d97 100644
--- a/test/DebugInfo/X86/subreg.ll
+++ b/test/DebugInfo/X86/subreg.ll
@@ -4,8 +4,9 @@
 ; being in its superregister.
 
 ; CHECK: .byte   80                      # super-register DW_OP_reg0
-; CHECK-NEXT: .byte   147                # DW_OP_piece
-; CHECK-NEXT: .byte   2                  # 2
+; No need to emit a piece at offset 0.
+; CHECK-NOT: DW_OP_piece
+; CHECK-NOT: DW_OP_bit_piece
 
 define i16 @f(i16 signext %zzz) nounwind !dbg !1 {
 entry:
diff --git a/test/DebugInfo/X86/subregisters.ll b/test/DebugInfo/X86/subregisters.ll
index d40be0d9e3c1270fad79d7f9f2d7068c092e824d..99f7a10e443bf99f707d356873618c914e6fefae 100644
--- a/test/DebugInfo/X86/subregisters.ll
+++ b/test/DebugInfo/X86/subregisters.ll
@@ -2,7 +2,7 @@
 ; RUN: llvm-dwarfdump %t.o | FileCheck %s
 ;
 ; Test that on x86_64, the 32-bit subregister esi is emitted as
-; DW_OP_piece 32 of the 64-bit rsi.
+; the 64-bit super-register rsi, with no DW_OP_piece.
 ;
 ; rdar://problem/16015314
 ;
@@ -11,8 +11,8 @@
 ; CHECK-NEXT:  DW_AT_location [DW_FORM_data4]	(0x00000000)
 ; CHECK-NEXT:  DW_AT_name [DW_FORM_strp]{{.*}} "a"
 ; CHECK: .debug_loc contents:
-;                                    rsi, piece 0x00000004
-; CHECK:       Location description: 54 93 04 
+;                                    rsi
+; CHECK:       Location description: 54
 ;
 ; struct bar {
 ;   int a;
diff --git a/test/DebugInfo/X86/tls.ll b/test/DebugInfo/X86/tls.ll
index 19570d0e0c0acf9fee1bc7b900d2376ee480c89f..b6ea213dd74815bb680c6ea480f816c0114ffce5 100644
--- a/test/DebugInfo/X86/tls.ll
+++ b/test/DebugInfo/X86/tls.ll
@@ -78,7 +78,6 @@
 
 ; check that the expected TLS address description is the first thing in the debug_addr section
 ; FISSION: .section    .debug_addr
-; FISSION: addr_sec:
 ; FISSION-NEXT: .quad  tls@DTPOFF
 ; FISSION-NEXT: .quad  glbl
 ; FISSION-NOT: .quad  glbl
diff --git a/test/DebugInfo/dwarfdump-header.test b/test/DebugInfo/dwarfdump-header.test
new file mode 100644
index 0000000000000000000000000000000000000000..3947c8b438d2acd9dc8e41adca91947a42036250
--- /dev/null
+++ b/test/DebugInfo/dwarfdump-header.test
@@ -0,0 +1,29 @@
+RUN: llvm-dwarfdump %p/Inputs/dwarfdump-header.elf-x86-64 | FileCheck %s
+
+The input file is hand-coded assembler to generate all the units,
+so we're willing to make exact checks for offsets and such.
+
+CHECK-LABEL: .debug_info contents:
+
+The v4 CU header.
+
+CHECK: 0x00000000: Compile Unit: length = 0x00000011 version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000015)
+CHECK: 0x0000000b: DW_TAG_compile_unit
+
+The v5 normal CU header.
+
+CHECK: 0x00000015: Compile Unit: length = 0x00000012 version = 0x0005 unit_type = DW_UT_compile abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x0000002b)
+CHECK: 0x00000021: DW_TAG_compile_unit
+
+CHECK-LABEL: .debug_types contents:
+
+The v4 type unit header.
+
+CHECK: 0x00000000: Type Unit: length = 0x0000001f version = 0x0004 abbr_offset = 0x0000 addr_size = 0x08 name = 'V4_type_unit' type_signature = 0x0011223344556677 type_offset = 0x001c (next unit at 0x00000023)
+CHECK: 0x00000017: DW_TAG_type_unit
+
+FIXME: DWARF v5 wants type units in .debug_info[.dwo] not .debug_types[.dwo].
+CHECK: .debug_types.dwo contents:
+
+CHECK: 0x00000000: Type Unit: length = 0x00000020 version = 0x0005 unit_type = DW_UT_split_type abbr_offset = 0x0000 addr_size = 0x08 name = 'V5_split_type_unit' type_signature = 0x8899aabbccddeeff type_offset = 0x001d (next unit at 0x00000024)
+CHECK: 0x00000018: DW_TAG_type_unit
diff --git a/test/DebugInfo/strip-loop-metadata.ll b/test/DebugInfo/strip-loop-metadata.ll
index 1b1c9cac4e034b8288bbf67f882267650c4fa52b..e0d8cdfaf46956267ae838535975a6c4765e43f5 100644
--- a/test/DebugInfo/strip-loop-metadata.ll
+++ b/test/DebugInfo/strip-loop-metadata.ll
@@ -18,6 +18,7 @@ return:
 }
 
 declare void @_Z3barv()
+declare i1 @_Z3bazv()
 
 ; CHECK-LABEL: _Z5test2v
 ; CHECK: br {{.*}} !llvm.loop [[LOOP:![0-9]+]]
@@ -33,6 +34,42 @@ return:
   ret void, !dbg !21
 }
 
+; CHECK-LABEL: _Z5test3v
+define void @_Z5test3v() !dbg !22 {
+entry:
+  br label %while.body, !dbg !23
+
+while.body:
+  %c = call i1 @_Z3bazv()
+  br i1 %c, label %if, label %then
+
+if:
+  call void @_Z3barv(), !dbg !24
+; CHECK: br {{.*}} !llvm.loop [[LOOP2:![0-9]+]]
+  br label %while.body, !dbg !25, !llvm.loop !26
+
+then:
+; CHECK: br {{.*}} !llvm.loop [[LOOP2]]
+  br label %while.body, !dbg !25, !llvm.loop !26
+
+return:
+  ret void, !dbg !28
+}
+
+; CHECK-LABEL: _Z5test4v
+; CHECK-NOT: br {{.*}} !llvm.loop
+define void @_Z5test4v() !dbg !30 {
+entry:
+  br label %while.body, !dbg !31
+
+while.body:
+  call void @_Z3barv(), !dbg !32
+  br label %while.body, !dbg !33, !llvm.loop !34
+
+return:
+  ret void, !dbg !36
+}
+
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5}
 !llvm.ident = !{!6}
@@ -59,6 +96,21 @@ return:
 !19 = distinct !{!19, !16, !20}
 !20 = !{!"llvm.loop.unroll.enable"}
 !21 = !DILocation(line: 12, column: 1, scope: !15)
+!22 = distinct !DISubprogram(name: "test3", scope: !1, file: !1, line: 8, type: !8, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!23 = !DILocation(line: 8, column: 14, scope: !22)
+!24 = !DILocation(line: 11, column: 5, scope: !22)
+!25 = !DILocation(line: 10, column: 3, scope: !22)
+!26 = distinct !{!26, !23, !29, !27}
+!27 = !{!"llvm.loop.unroll.enable"}
+!28 = !DILocation(line: 12, column: 1, scope: !22)
+!29 = !DILocation(line: 12, column: 1, scope: !22)
+!30 = distinct !DISubprogram(name: "test4", scope: !1, file: !1, line: 8, type: !8, isLocal: false, isDefinition: true, scopeLine: 8, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!31 = !DILocation(line: 8, column: 14, scope: !30)
+!32 = !DILocation(line: 11, column: 5, scope: !30)
+!33 = !DILocation(line: 10, column: 3, scope: !30)
+!34 = distinct !{!34, !31, !35}
+!35 = !DILocation(line: 12, column: 1, scope: !30)
+!36 = !DILocation(line: 12, column: 1, scope: !30)
 
 ; CHECK-NOT: !DICompileUnit
 ; CHECK-NOT: !DIFile
@@ -68,4 +120,5 @@ return:
 ; CHECK-NOT: !DILexicalBlockFile
 ; CHECK: [[LOOP]] = distinct !{[[LOOP]], [[LOOP_UNROLL:![0-9]+]]}
 ; CHECK-NEXT: [[LOOP_UNROLL]] = !{!"llvm.loop.unroll.enable"}
+; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[LOOP_UNROLL]]}
 ; CHECK-NOT: !DILocation
diff --git a/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_PIC_relocations.s b/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_PIC_relocations.s
new file mode 100644
index 0000000000000000000000000000000000000000..ba00afc7ad99671eb76a7266066e79cb2f42365c
--- /dev/null
+++ b/test/ExecutionEngine/RuntimeDyld/AArch64/ELF_ARM64_PIC_relocations.s
@@ -0,0 +1,46 @@
+# RUN: llvm-mc -triple=arm64-none-linux-gnu -filetype=obj -o %T/pic-reloc.o %s
+# RUN: llvm-rtdyld -triple=arm64-none-linux-gnu -verify  -check=%s %T/pic-reloc.o \
+# RUN:    -map-section pic-reloc.o,.got=0x20000 -dummy-extern f=0x1234 -dummy-extern g=0x5678
+
+_s:
+  nop
+_a1:
+	adrp	x8, :got:f
+_a2:
+	adrp	x9, :got:g
+_a3:
+  adrp  x10, :got:_s
+_l1:
+  ldr x8, [x8, :got_lo12:f]
+_l2:
+  ldr x9, [x9, :got_lo12:g]
+_l3:
+  ldr x10, [x10, :got_lo12:_s]
+
+
+## We'll end up having two sections .text and .got,
+## each is located on the start of a memory page
+
+## Test that .got section has three entries pointing to f, g and _s
+# *{8}section_addr(pic-reloc.o, .got) = f
+# *{8}(section_addr(pic-reloc.o, .got) + 8) = g
+# *{8}(section_addr(pic-reloc.o, .got) + 16) = _s
+
+## Test that first adrp instruction really takes address of 
+## the .got section (_s label is on the start of a page)
+# rtdyld-check: _s + (((*{4}_a1)[30:29] + ((*{4}_a1)[23:5] << 2)) << 12) = section_addr(pic-reloc.o, .got)
+
+## Test that second adrp takes address of .got
+# rtdyld-check: _s + (((*{4}_a2)[30:29] + ((*{4}_a2)[23:5] << 2)) << 12) = section_addr(pic-reloc.o, .got)
+
+## Test that third adrp takes address of .got
+# rtdyld-check: _s + (((*{4}_a3)[30:29] + ((*{4}_a3)[23:5] << 2)) << 12) = section_addr(pic-reloc.o, .got)
+
+## Test that first ldr immediate value is 0 >> 3 = 0 (1st .got entry)
+# rtdyld-check: (*{4}_l1)[21:10] = 0
+
+## Test that second ldr immediate value is 8 >> 3 = 1 (2nd .got entry)
+# rtdyld-check: (*{4}_l2)[21:10] = 1
+
+## Test that third ldr immediate value is 16 >> 3 = 2 (3rd .got entry, addend is 0)
+# rtdyld-check: (*{4}_l3)[21:10] = 2
diff --git a/test/FileCheck/line-count.txt b/test/FileCheck/line-count.txt
index 6f91c2050bf8d855eacd0195dac24600d76f5d10..d39663e2dbad5f762cac4fab146319684ff75c8e 100644
--- a/test/FileCheck/line-count.txt
+++ b/test/FileCheck/line-count.txt
@@ -1,15 +1,15 @@
 ; RUN: FileCheck  -input-file %s %s
-2
-3 aaa
-4 bbb
-5 ccc
-6 CHECK: [[@LINE-3]] {{a}}aa
-7 CHECK: [[@LINE-3]] {{b}}bb
-8 CHECK: [[@LINE-3]] {{c}}cc
-9 foobar
-10 CHECK: [[@LINE-1]] {{foo}}bar
-11
-12 arst CHECK: [[@LINE]] {{a}}rst
-13
+; RUN: not FileCheck -check-prefix BAD -input-file %s %s
+3
+4 aaa
+5 bbb
+6 ccc
+7 CHECK: [[@LINE-3]] {{a}}aa
+8 CHECK: [[@LINE-3]] {{b}}bb
+9 CHECK: [[@LINE-3]] {{c}}cc
+10 foobar
+11 CHECK: [[@LINE-1]] {{foo}}bar
+12
+13 arst CHECK: [[@LINE]] {{a}}rst
 14
-
+15 BAD: [[@LINE:cant-have-regex]]
diff --git a/test/FileCheck/regex-scope.txt b/test/FileCheck/regex-scope.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e77f3f6513a87c38346252ce744fd2afa3a1aca8
--- /dev/null
+++ b/test/FileCheck/regex-scope.txt
@@ -0,0 +1,23 @@
+// RUN: FileCheck -check-prefix CHECK -input-file %s %s
+// RUN: FileCheck -check-prefixes CHECK,GLOBAL -input-file %s %s
+// RUN: FileCheck -check-prefixes CHECK,LOCAL -input-file %s %s
+// RUN: FileCheck -check-prefixes CHECK,GLOBAL --enable-var-scope -input-file %s %s
+// RUN: not FileCheck -check-prefixes CHECK,LOCAL --enable-var-scope -input-file %s %s
+
+local
+global
+; CHECK: [[LOCAL:loc.*]]
+; CHECK: [[$GLOBAL:glo.*]]
+
+local2
+global2
+; CHECK: [[LOCAL]]2
+; CHECK: [[$GLOBAL]]2
+
+barrier:
+; CHECK-LABEL: barrier
+
+local3
+global3
+; LOCAL: [[LOCAL]]3
+; GLOBAL: [[$GLOBAL]]3
diff --git a/test/Instrumentation/AddressSanitizer/basic.ll b/test/Instrumentation/AddressSanitizer/basic.ll
index 9c4d416a1eff6c9b327128659333e140b5de23bf..9827e7a6792b8bf6dad68acac48712c07ac74043 100644
--- a/test/Instrumentation/AddressSanitizer/basic.ll
+++ b/test/Instrumentation/AddressSanitizer/basic.ll
@@ -170,6 +170,32 @@ define void @memintr_test(i8* %a, i8* %b) nounwind uwtable sanitize_address {
 ; CHECK: __asan_memcpy
 ; CHECK: ret void
 
+; CHECK-LABEL: @test_swifterror
+; CHECK-NOT: __asan_report_load
+; CHECK: ret void
+define void @test_swifterror(i8** swifterror) sanitize_address {
+  %swifterror_ptr_value = load i8*, i8** %0
+  ret void
+}
+
+; CHECK-LABEL: @test_swifterror_2
+; CHECK-NOT: __asan_report_store
+; CHECK: ret void
+define void @test_swifterror_2(i8** swifterror) sanitize_address {
+  store i8* null, i8** %0
+  ret void
+}
+
+; CHECK-LABEL: @test_swifterror_3
+; CHECK-NOT: __asan_report_store
+; CHECK: ret void
+define void @test_swifterror_3() sanitize_address {
+  %swifterror_addr = alloca swifterror i8*
+  store i8* null, i8** %swifterror_addr
+  call void @test_swifterror_2(i8** swifterror %swifterror_addr)
+  ret void
+}
+
 ; CHECK: define internal void @asan.module_ctor()
 ; CHECK: call void @__asan_init()
 
diff --git a/test/Instrumentation/AddressSanitizer/freebsd.ll b/test/Instrumentation/AddressSanitizer/freebsd.ll
index 5178432d63aa8a5a532563471031006cf43fc284..f940b52b41f310b79c80cf86e1df8356d065c3d4 100644
--- a/test/Instrumentation/AddressSanitizer/freebsd.ll
+++ b/test/Instrumentation/AddressSanitizer/freebsd.ll
@@ -1,11 +1,11 @@
 ; RUN: opt < %s -asan -asan-module -S \
 ; RUN:     -mtriple=i386-unknown-freebsd \
-; RUN:     -default-data-layout="e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" | \
+; RUN:     -data-layout="e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" | \
 ; RUN:     FileCheck --check-prefix=CHECK-32 %s
 
 ; RUN: opt < %s -asan -asan-module -S \
 ; RUN:     -mtriple=x86_64-unknown-freebsd \
-; RUN:     -default-data-layout="e-m:e-i64:64-f80:128-n8:16:32:64-S128" | \
+; RUN:     -data-layout="e-m:e-i64:64-f80:128-n8:16:32:64-S128" | \
 ; RUN:     FileCheck --check-prefix=CHECK-64 %s
 
 define i32 @read_4_bytes(i32* %a) sanitize_address {
diff --git a/test/Instrumentation/AddressSanitizer/global_metadata_windows.ll b/test/Instrumentation/AddressSanitizer/global_metadata_windows.ll
index 686b506a96c4720b5a4250d080d8c3f1dc26f816..27cbd61ef81fa43570fab77bf0bfd5480215e62a 100644
--- a/test/Instrumentation/AddressSanitizer/global_metadata_windows.ll
+++ b/test/Instrumentation/AddressSanitizer/global_metadata_windows.ll
@@ -13,7 +13,7 @@ $mystr = comdat any
 
 ; CHECK: $dead_global = comdat noduplicates
 ; CHECK: @dead_global = local_unnamed_addr global { i32, [60 x i8] } { i32 42, [60 x i8] zeroinitializer }, comdat, align 32
-; CHECK: @__asan_global_dead_global = internal global { {{.*}} }, section ".ASAN$GL", comdat($dead_global), align 64
+; CHECK: @__asan_global_dead_global = private global { {{.*}} }, section ".ASAN$GL", comdat($dead_global), align 64
 
 @dead_global = local_unnamed_addr global i32 42, align 4
 @mystr = linkonce_odr unnamed_addr constant [5 x i8] c"main\00", comdat, align 1
diff --git a/test/Instrumentation/AddressSanitizer/instrument_load_then_store.ll b/test/Instrumentation/AddressSanitizer/instrument_load_then_store.ll
index 01a7a6610cafadb483f9d519e01ccfcfa9f91c4d..8341697ff48c99f2948637e06e08a9c3776b76d3 100644
--- a/test/Instrumentation/AddressSanitizer/instrument_load_then_store.ll
+++ b/test/Instrumentation/AddressSanitizer/instrument_load_then_store.ll
@@ -16,10 +16,10 @@ entry:
 ; OPT1: IncrementMe
 ; OPT1: __asan_report_
 ; OPT1-NOT: __asan_report_
-; OPT1: asan.module_ctor
+; OPT1: ret void
 
 ; Without optimizations we should see two calls to __asan_report_*
 ; OPT0: IncrementMe
 ; OPT0: __asan_report_
 ; OPT0: __asan_report_
-; OPT0: asan.module_ctor
+; OPT0: ret void
diff --git a/test/Instrumentation/AddressSanitizer/lifetime-throw.ll b/test/Instrumentation/AddressSanitizer/lifetime-throw.ll
index 6d0cbd9ad5aace366dccdf273d2af3cd6a396632..ff03d10c7c5d7de6ce8313475ad741d4b171bfda 100644
--- a/test/Instrumentation/AddressSanitizer/lifetime-throw.ll
+++ b/test/Instrumentation/AddressSanitizer/lifetime-throw.ll
@@ -23,7 +23,7 @@ entry:
   ; Poison memory in prologue: F1F1F1F1F8F3F3F3
   ; CHECK: store i64 -868082052615769615, i64* %{{[0-9]+}}
 
-  call void @llvm.lifetime.start(i64 4, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0)
   ; CHECK: store i8 4, i8* %{{[0-9]+}}
   ; CHECK-NEXT: @llvm.lifetime.start
 
@@ -37,7 +37,7 @@ lpad:
   %1 = landingpad { i8*, i32 }
           cleanup
   call void @_ZN3ABCD2Ev(%struct.ABC* nonnull %x)
-  call void @llvm.lifetime.end(i64 4, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %0)
   ; CHECK: store i8 -8, i8* %{{[0-9]+}}
   ; CHECK-NEXT: @llvm.lifetime.end
 
@@ -77,7 +77,7 @@ entry:
   ; Poison memory in prologue: F1F1F1F1F8F304F2
   ; CHECK: store i64 -935355671561244175, i64* %{{[0-9]+}}
 
-  call void @llvm.lifetime.start(i64 4, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0)
   ; CHECK: store i8 4, i8* %{{[0-9]+}}
   ; CHECK-NEXT: @llvm.lifetime.start
 
@@ -90,7 +90,7 @@ entry:
 ehcleanup:
   %2 = cleanuppad within none []
   call void @"\01??1ABC@@QEAA@XZ"(%struct.ABC* nonnull %x) [ "funclet"(token %2) ]
-  call void @llvm.lifetime.end(i64 4, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %0)
   ; CHECK: store i8 -8, i8* %{{[0-9]+}}
   ; CHECK-NEXT: @llvm.lifetime.end
 
@@ -104,8 +104,8 @@ unreachable:
 
 
 declare i32 @__gxx_personality_v0(...)
-declare void @llvm.lifetime.start(i64, i8* nocapture)
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 declare void @__cxa_throw(i8*, i8*, i8*) local_unnamed_addr
 declare i8* @__cxa_allocate_exception(i64) local_unnamed_addr
 declare void @_ZN3ABCD2Ev(%struct.ABC* %this) unnamed_addr
diff --git a/test/Instrumentation/AddressSanitizer/lifetime-uar-uas.ll b/test/Instrumentation/AddressSanitizer/lifetime-uar-uas.ll
index 93708e350fa38769e76133420f54135fb5cbcc8f..437b6a94185b8a535e3ade57cc704ce951faed38 100644
--- a/test/Instrumentation/AddressSanitizer/lifetime-uar-uas.ll
+++ b/test/Instrumentation/AddressSanitizer/lifetime-uar-uas.ll
@@ -6,8 +6,8 @@
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
 
 define i32 @basic_test() sanitize_address {
   ; CHECK-LABEL: define i32 @basic_test()
@@ -19,14 +19,14 @@ entry:
   ; Memory is poisoned in prologue: F1F1F1F104F3F8F2
   ; CHECK-UAS: store i64 -866676825215864335, i64* %{{[0-9]+}}
 
-  call void @llvm.lifetime.start(i64 1, i8* %c)
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %c)
   ; Memory is unpoisoned at llvm.lifetime.start: 01
   ; CHECK-UAS: store i8 1, i8* %{{[0-9]+}}
 
   store volatile i32 0, i32* %retval
   store volatile i8 0, i8* %c, align 1
 
-  call void @llvm.lifetime.end(i64 1, i8* %c)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %c)
   ; Memory is poisoned at llvm.lifetime.end: F8
   ; CHECK-UAS: store i8 -8, i8* %{{[0-9]+}}
 
diff --git a/test/Instrumentation/AddressSanitizer/lifetime.ll b/test/Instrumentation/AddressSanitizer/lifetime.ll
index be72124f3ab628a394022d1c1447fbb3a63ec313..b951afdc670f1925daaeaee31f23e6e5717b19c5 100644
--- a/test/Instrumentation/AddressSanitizer/lifetime.ll
+++ b/test/Instrumentation/AddressSanitizer/lifetime.ll
@@ -5,8 +5,8 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
 
 define void @lifetime_no_size() sanitize_address {
   ; CHECK-LABEL: define void @lifetime_no_size()
@@ -17,7 +17,7 @@ entry:
   ; Poison memory in prologue: F1F1F1F104F3F3F3
   ; CHECK: store i64 -868083100587789839, i64* %{{[0-9]+}}
 
-  call void @llvm.lifetime.start(i64 -1, i8* %i.ptr)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %i.ptr)
   ; Check that lifetime with no size are ignored.
   ; CHECK-NOT: store
   ; CHECK: call void @llvm.lifetime.start
@@ -25,7 +25,7 @@ entry:
   store volatile i8 0, i8* %i.ptr
   ; CHECK: store volatile
 
-  call void @llvm.lifetime.end(i64 -1, i8* %i.ptr)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %i.ptr)
   ; Check that lifetime with no size are ignored.
   ; CHECK-NOT: store
   ; CHECK: call void @llvm.lifetime.end
@@ -48,19 +48,19 @@ define void @lifetime() sanitize_address {
   ; CHECK: store i64 -868082052615769615, i64* %{{[0-9]+}}
 
   ; Memory is unpoisoned at llvm.lifetime.start
-  call void @llvm.lifetime.start(i64 3, i8* %i.ptr)
+  call void @llvm.lifetime.start.p0i8(i64 3, i8* %i.ptr)
   ; CHECK: store i8 4, i8* %{{[0-9]+}}
   ; CHECK-NEXT: llvm.lifetime.start
 
   store volatile i8 0, i8* %i.ptr
   ; CHECK: store volatile
 
-  call void @llvm.lifetime.end(i64 4, i8* %i.ptr)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %i.ptr)
   ; CHECK: store i8 -8, i8* %{{[0-9]+}}
   ; CHECK-NEXT: call void @llvm.lifetime.end
 
   ; Memory is poisoned at every call to llvm.lifetime.end
-  call void @llvm.lifetime.end(i64 2, i8* %i.ptr)
+  call void @llvm.lifetime.end.p0i8(i64 2, i8* %i.ptr)
   ; CHECK: store i8 -8, i8* %{{[0-9]+}}
   ; CHECK-NEXT: call void @llvm.lifetime.end
 
@@ -68,26 +68,26 @@ define void @lifetime() sanitize_address {
   %arr = alloca [10 x i32], align 16
   %arr.ptr = bitcast [10 x i32]* %arr to i8*
 
-  call void @llvm.lifetime.start(i64 40, i8* %arr.ptr)
+  call void @llvm.lifetime.start.p0i8(i64 40, i8* %arr.ptr)
   ; CHECK-DEFAULT: call void @__asan_unpoison_stack_memory(i64 %{{[^ ]+}}, i64 40)
   ; CHECK-NO-DYNAMIC-NOT: call void @__asan_unpoison_stack_memory(i64 %{{[^ ]+}}, i64 40)
 
   store volatile i8 0, i8* %arr.ptr
   ; CHECK: store volatile
 
-  call void @llvm.lifetime.end(i64 40, i8* %arr.ptr)
+  call void @llvm.lifetime.end.p0i8(i64 40, i8* %arr.ptr)
   ; CHECK-DEFAULT: call void @__asan_poison_stack_memory(i64 %{{[^ ]+}}, i64 40)
   ; CHECK-NO-DYNAMIC-NOT: call void @__asan_poison_stack_memory(i64 %{{[^ ]+}}, i64 40)
 
   ; One more lifetime start/end for the same variable %i.
-  call void @llvm.lifetime.start(i64 2, i8* %i.ptr)
+  call void @llvm.lifetime.start.p0i8(i64 2, i8* %i.ptr)
   ; CHECK: store i8 4, i8* %{{[0-9]+}}
   ; CHECK-NEXT: llvm.lifetime.start
 
   store volatile i8 0, i8* %i.ptr
   ; CHECK: store volatile
 
-  call void @llvm.lifetime.end(i64 4, i8* %i.ptr)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %i.ptr)
   ; CHECK: store i8 -8, i8* %{{[0-9]+}}
   ; CHECK-NEXT: llvm.lifetime.end
 
@@ -108,7 +108,7 @@ entry:
   ; Poison memory in prologue: F1F1F1F1F8F3F3F3
   ; CHECK: store i64 -868082052615769615, i64* %{{[0-9]+}}
 
-  call void @llvm.lifetime.start(i64 8, i8* %i.ptr)
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* %i.ptr)
   ; CHECK: store i8 0, i8* %{{[0-9]+}}
   ; CHECK-NEXT: llvm.lifetime.start
 
@@ -123,7 +123,7 @@ bb0:
 
 bb1:
   %i.phi = phi i8* [ %i.ptr, %entry ], [ %i.ptr2, %bb0 ]
-  call void @llvm.lifetime.end(i64 8, i8* %i.phi)
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* %i.phi)
   ; CHECK: store i8 -8, i8* %{{[0-9]+}}
   ; CHECK-NEXT: llvm.lifetime.end
 
@@ -147,14 +147,14 @@ entry:
   ; CHECK: store i64 -868082074056920077, i64* %{{[0-9]+}}
 
   %0 = getelementptr inbounds [1024 x i8], [1024 x i8]* %x, i64 0, i64 0
-  call void @llvm.lifetime.start(i64 1024, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 1024, i8* %0)
   ; CHECK: call void @__asan_set_shadow_00(i64 %{{[0-9]+}}, i64 128)
   ; CHECK-NEXT: call void @llvm.lifetime.start
 
   store i8* %0, i8** %d, align 8
   ; CHECK: store i8
 
-  call void @llvm.lifetime.end(i64 1024, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 1024, i8* %0)
   ; CHECK: call void @__asan_set_shadow_f8(i64 %{{[0-9]+}}, i64 128)
   ; CHECK-NEXT: call void @llvm.lifetime.end
 
@@ -172,12 +172,12 @@ entry:
   store i64 %a, i64* %a.addr, align 8
 
   %0 = bitcast [0 x i8]* %b to i8*
-  call void @llvm.lifetime.start(i64 0, i8* %0) #2
+  call void @llvm.lifetime.start.p0i8(i64 0, i8* %0) #2
   ; CHECK: %{{[0-9]+}} = bitcast
   ; CHECK-NEXT: call void @llvm.lifetime.start
 
   %1 = bitcast [0 x i8]* %b to i8*
-  call void @llvm.lifetime.end(i64 0, i8* %1) #2
+  call void @llvm.lifetime.end.p0i8(i64 0, i8* %1) #2
   ; CHECK-NEXT: %{{[0-9]+}} = bitcast
   ; CHECK-NEXT: call void @llvm.lifetime.end
 
diff --git a/test/Instrumentation/AddressSanitizer/ps4.ll b/test/Instrumentation/AddressSanitizer/ps4.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e160996866b4fe102785fe239b5c5317053e2a1e
--- /dev/null
+++ b/test/Instrumentation/AddressSanitizer/ps4.ll
@@ -0,0 +1,14 @@
+; RUN: opt < %s -asan -asan-module -S -mtriple=x86_64-scei-ps4 | FileCheck %s
+
+define i32 @read_4_bytes(i32* %a) sanitize_address {
+entry:
+  %tmp1 = load i32, i32* %a, align 4
+  ret i32 %tmp1
+}
+
+; CHECK: @read_4_bytes
+; CHECK-NOT: ret
+; Check for ASAN's Offset on the PS4 (2^40 or 0x10000000000)
+; CHECK: lshr {{.*}} 3
+; CHECK-NEXT: {{1099511627776}}
+; CHECK: ret
diff --git a/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime-be.ll b/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime-be.ll
index 059c49a3457cc2257f3248d040439688ca664443..569a67d6d35672dac732d643aa9dd91919d783fe 100644
--- a/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime-be.ll
+++ b/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime-be.ll
@@ -91,7 +91,7 @@ entry:
   ; CHECK-NEXT: %zz = getelementptr inbounds
 
 
-  call void @llvm.lifetime.start(i64 650, i8* %xx)
+  call void @llvm.lifetime.start.p0i8(i64 650, i8* %xx)
   ; 0000...
   ; ENTRY-UAS-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 4
   ; ENTRY-UAS-NEXT: call void @__asan_set_shadow_00(i64 [[OFFSET]], i64 81)
@@ -100,39 +100,39 @@ entry:
   ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to [[TYPE:i8]]*
   ; ENTRY-UAS-NEXT: store [[TYPE]] 2, [[TYPE]]* [[PTR]], align 1
 
-  ; CHECK-NEXT: call void @llvm.lifetime.start(i64 650, i8* %xx)
+  ; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 650, i8* %xx)
 
   call void @Foo(i8* %xx)
   ; CHECK-NEXT: call void @Foo(i8* %xx)
 
-  call void @llvm.lifetime.end(i64 650, i8* %xx)
+  call void @llvm.lifetime.end.p0i8(i64 650, i8* %xx)
   ; ENTRY-UAS-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 4
   ; ENTRY-UAS-NEXT: call void @__asan_set_shadow_f8(i64 [[OFFSET]], i64 82)
 
-  ; CHECK-NEXT: call void @llvm.lifetime.end(i64 650, i8* %xx)
+  ; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 650, i8* %xx)
 
 
-  call void @llvm.lifetime.start(i64 13, i8* %yy)
+  call void @llvm.lifetime.start.p0i8(i64 13, i8* %yy)
   ; 0005
   ; ENTRY-UAS-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 102
   ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to [[TYPE:i16]]*
   ; ENTRY-UAS-NEXT: store [[TYPE]] 5, [[TYPE]]* [[PTR]], align 1
 
-  ; CHECK-NEXT: call void @llvm.lifetime.start(i64 13, i8* %yy)
+  ; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 13, i8* %yy)
 
   call void @Foo(i8* %yy)
   ; CHECK-NEXT: call void @Foo(i8* %yy)
 
-  call void @llvm.lifetime.end(i64 13, i8* %yy)
+  call void @llvm.lifetime.end.p0i8(i64 13, i8* %yy)
   ; F8F8
   ; ENTRY-UAS-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 102
   ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to [[TYPE:i16]]*
   ; ENTRY-UAS-NEXT: store [[TYPE]] -1800, [[TYPE]]* [[PTR]], align 1
 
-  ; CHECK-NEXT: call void @llvm.lifetime.end(i64 13, i8* %yy)
+  ; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 13, i8* %yy)
 
 
-  call void @llvm.lifetime.start(i64 40, i8* %zz)
+  call void @llvm.lifetime.start.p0i8(i64 40, i8* %zz)
   ; 00000000
   ; ENTRY-UAS-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 106
   ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to [[TYPE:i32]]*
@@ -142,12 +142,12 @@ entry:
   ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to [[TYPE:i8]]*
   ; ENTRY-UAS-NEXT: store [[TYPE]] 0, [[TYPE]]* [[PTR]], align 1
 
-  ; CHECK-NEXT: call void @llvm.lifetime.start(i64 40, i8* %zz)
+  ; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 40, i8* %zz)
 
   call void @Foo(i8* %zz)
   ; CHECK-NEXT: call void @Foo(i8* %zz)
 
-  call void @llvm.lifetime.end(i64 40, i8* %zz)
+  call void @llvm.lifetime.end.p0i8(i64 40, i8* %zz)
   ; F8F8F8F8
   ; ENTRY-UAS-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 106
   ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to [[TYPE:i32]]*
@@ -157,7 +157,7 @@ entry:
   ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to [[TYPE:i8]]*
   ; ENTRY-UAS-NEXT: store [[TYPE]] -8, [[TYPE]]* [[PTR]], align 1
 
-  ; CHECK-NEXT: call void @llvm.lifetime.end(i64 40, i8* %zz)
+  ; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 40, i8* %zz)
 
   ; CHECK-LABEL: <label>
 
@@ -209,8 +209,8 @@ entry:
   ; CHECK: ret void
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture)
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 
 ; CHECK-ON: declare void @__asan_set_shadow_00(i64, i64)
 ; CHECK-ON: declare void @__asan_set_shadow_f1(i64, i64)
diff --git a/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime.ll b/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime.ll
index 48d3b0e53ccbf747e216404797e07f8d588f86e8..0799b03e455ea025e1b2fe6a57bc0df19c07d6bc 100644
--- a/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime.ll
+++ b/test/Instrumentation/AddressSanitizer/stack-poisoning-and-lifetime.ll
@@ -91,7 +91,7 @@ entry:
   ; CHECK-NEXT: %zz = getelementptr inbounds
 
 
-  call void @llvm.lifetime.start(i64 650, i8* %xx)
+  call void @llvm.lifetime.start.p0i8(i64 650, i8* %xx)
   ; 0000...
   ; ENTRY-UAS-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 4
   ; ENTRY-UAS-NEXT: call void @__asan_set_shadow_00(i64 [[OFFSET]], i64 81)
@@ -100,39 +100,39 @@ entry:
   ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to [[TYPE:i8]]*
   ; ENTRY-UAS-NEXT: store [[TYPE]] 2, [[TYPE]]* [[PTR]], align 1
 
-  ; CHECK-NEXT: call void @llvm.lifetime.start(i64 650, i8* %xx)
+  ; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 650, i8* %xx)
 
   call void @Foo(i8* %xx)
   ; CHECK-NEXT: call void @Foo(i8* %xx)
 
-  call void @llvm.lifetime.end(i64 650, i8* %xx)
+  call void @llvm.lifetime.end.p0i8(i64 650, i8* %xx)
   ; ENTRY-UAS-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 4
   ; ENTRY-UAS-NEXT: call void @__asan_set_shadow_f8(i64 [[OFFSET]], i64 82)
 
-  ; CHECK-NEXT: call void @llvm.lifetime.end(i64 650, i8* %xx)
+  ; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 650, i8* %xx)
 
 
-  call void @llvm.lifetime.start(i64 13, i8* %yy)
+  call void @llvm.lifetime.start.p0i8(i64 13, i8* %yy)
   ; 0005
   ; ENTRY-UAS-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 102
   ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to [[TYPE:i16]]*
   ; ENTRY-UAS-NEXT: store [[TYPE]] 1280, [[TYPE]]* [[PTR]], align 1
 
-  ; CHECK-NEXT: call void @llvm.lifetime.start(i64 13, i8* %yy)
+  ; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 13, i8* %yy)
 
   call void @Foo(i8* %yy)
   ; CHECK-NEXT: call void @Foo(i8* %yy)
 
-  call void @llvm.lifetime.end(i64 13, i8* %yy)
+  call void @llvm.lifetime.end.p0i8(i64 13, i8* %yy)
   ; F8F8
   ; ENTRY-UAS-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 102
   ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to [[TYPE:i16]]*
   ; ENTRY-UAS-NEXT: store [[TYPE]] -1800, [[TYPE]]* [[PTR]], align 1
 
-  ; CHECK-NEXT: call void @llvm.lifetime.end(i64 13, i8* %yy)
+  ; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 13, i8* %yy)
 
 
-  call void @llvm.lifetime.start(i64 40, i8* %zz)
+  call void @llvm.lifetime.start.p0i8(i64 40, i8* %zz)
   ; 00000000
   ; ENTRY-UAS-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 106
   ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to [[TYPE:i32]]*
@@ -142,12 +142,12 @@ entry:
   ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to [[TYPE:i8]]*
   ; ENTRY-UAS-NEXT: store [[TYPE]] 0, [[TYPE]]* [[PTR]], align 1
 
-  ; CHECK-NEXT: call void @llvm.lifetime.start(i64 40, i8* %zz)
+  ; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 40, i8* %zz)
 
   call void @Foo(i8* %zz)
   ; CHECK-NEXT: call void @Foo(i8* %zz)
 
-  call void @llvm.lifetime.end(i64 40, i8* %zz)
+  call void @llvm.lifetime.end.p0i8(i64 40, i8* %zz)
   ; F8F8F8F8
   ; ENTRY-UAS-NEXT: [[OFFSET:%[0-9]+]] = add i64 [[SHADOW_BASE]], 106
   ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to [[TYPE:i32]]*
@@ -157,7 +157,7 @@ entry:
   ; ENTRY-UAS-NEXT: [[PTR:%[0-9]+]] = inttoptr i64 [[OFFSET]] to [[TYPE:i8]]*
   ; ENTRY-UAS-NEXT: store [[TYPE]] -8, [[TYPE]]* [[PTR]], align 1
 
-  ; CHECK-NEXT: call void @llvm.lifetime.end(i64 40, i8* %zz)
+  ; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 40, i8* %zz)
 
   ; CHECK-LABEL: <label>
 
@@ -209,8 +209,8 @@ entry:
   ; CHECK: ret void
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture)
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 
 ; CHECK-ON: declare void @__asan_set_shadow_00(i64, i64)
 ; CHECK-ON: declare void @__asan_set_shadow_f1(i64, i64)
diff --git a/test/Instrumentation/AddressSanitizer/stack_layout.ll b/test/Instrumentation/AddressSanitizer/stack_layout.ll
index 96706f70c83b8a170d79917f02c7a04496e82434..4e756f9ab2f251ad4dcb85ad3d9ed0d2e643cfd4 100644
--- a/test/Instrumentation/AddressSanitizer/stack_layout.ll
+++ b/test/Instrumentation/AddressSanitizer/stack_layout.ll
@@ -9,8 +9,8 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 target triple = "x86_64-unknown-linux-gnu"
 
 declare void @Use(i8*)
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
 
 ; CHECK: private unnamed_addr constant{{.*}}3 32 10 3 XXX 64 20 3 YYY 128 30 3 ZZZ\0
 ; CHECK: private unnamed_addr constant{{.*}}3 32 5 3 AAA 64 55 3 BBB 160 555 3 CCC\0
@@ -87,13 +87,13 @@ define void @Func5() sanitize_address #0 !dbg !11 {
   %AAA = alloca i32, align 4  ; File is not the same as !11
   %BBB = alloca i32, align 4  ; File is the same as !11
   %BBB.ptr = bitcast i32* %BBB to i8*
-  call void @llvm.lifetime.start(i64 4, i8* nonnull %BBB.ptr), !dbg !12
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %BBB.ptr), !dbg !12
   store volatile i32 5, i32* %BBB, align 4
   %AAA.ptr = bitcast i32* %AAA to i8*
-  call void @llvm.lifetime.start(i64 4, i8* nonnull %AAA.ptr), !dbg !14
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* nonnull %AAA.ptr), !dbg !14
   store volatile i32 3, i32* %AAA, align 4
-  call void @llvm.lifetime.end(i64 4, i8* nonnull %AAA.ptr), !dbg !17
-  call void @llvm.lifetime.end(i64 4, i8* nonnull %BBB.ptr), !dbg !18
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %AAA.ptr), !dbg !17
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* nonnull %BBB.ptr), !dbg !18
   ret void
 }
 
diff --git a/test/Instrumentation/InstrProfiling/PR23499.ll b/test/Instrumentation/InstrProfiling/PR23499.ll
index 47c60fd802ab12fd9085ba47b67898386d69d772..8c4ef3712d0617f2435b7e46d38c7a55daa6c1e6 100644
--- a/test/Instrumentation/InstrProfiling/PR23499.ll
+++ b/test/Instrumentation/InstrProfiling/PR23499.ll
@@ -13,15 +13,15 @@ $_Z3barIvEvv = comdat any
 
 @__profn__Z3barIvEvv = linkonce_odr hidden constant [11 x i8] c"_Z3barIvEvv", align 1
 
-; CHECK: @__profn__Z3barIvEvv = private constant [11 x i8] c"_Z3barIvEvv", align 1
+; CHECK-NOT: __profn__Z3barIvEvv
 ; CHECK: @__profc__Z3barIvEvv = linkonce_odr hidden global [1 x i64] zeroinitializer, section "{{.*}}__llvm_prf_cnts", comdat($__profv__Z3barIvEvv), align 8
-; CHECK: @__profd__Z3barIvEvv = linkonce_odr hidden global { i64, i64, i64*, i8*, i8*, i32, [1 x i16] } { i64 4947693190065689389, i64 0, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc__Z3barIvEvv, i32 0, i32 0), i8*{{.*}}, i8* null, i32 1, [1 x i16] zeroinitializer }, section "{{.*}}__llvm_prf_data{{.*}}", comdat($__profv__Z3barIvEvv), align 8
+; CHECK: @__profd__Z3barIvEvv = linkonce_odr hidden global { i64, i64, i64*, i8*, i8*, i32, [2 x i16] } { i64 4947693190065689389, i64 0, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc__Z3barIvEvv, i32 0, i32 0), i8*{{.*}}, i8* null, i32 1, [2 x i16] zeroinitializer }, section "{{.*}}__llvm_prf_data{{.*}}", comdat($__profv__Z3barIvEvv), align 8
 ; CHECK: @__llvm_prf_nm = private constant [{{.*}} x i8] c"{{.*}}", section "{{.*}}__llvm_prf_names"
 
 
-; COFF: @__profn__Z3barIvEvv = private constant [11 x i8] c"_Z3barIvEvv", align 1
+; COFF-NOT: __profn__Z3barIvEvv
 ; COFF: @__profc__Z3barIvEvv = linkonce_odr hidden global [1 x i64] zeroinitializer, section "{{.*}}__llvm_prf_cnts", comdat, align 8
-; COFF: @__profd__Z3barIvEvv = linkonce_odr hidden global { i64, i64, i64*, i8*, i8*, i32, [1 x i16] } { i64 4947693190065689389, i64 0, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc__Z3barIvEvv, i32 0, i32 0), i8*{{.*}}, i8* null, i32 1, [1 x i16] zeroinitializer }, section "{{.*}}__llvm_prf_data{{.*}}", comdat($__profc__Z3barIvEvv), align 8
+; COFF: @__profd__Z3barIvEvv = linkonce_odr hidden global { i64, i64, i64*, i8*, i8*, i32, [2 x i16] } { i64 4947693190065689389, i64 0, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc__Z3barIvEvv, i32 0, i32 0), i8*{{.*}}, i8* null, i32 1, [2 x i16] zeroinitializer }, section "{{.*}}__llvm_prf_data{{.*}}", comdat($__profc__Z3barIvEvv), align 8
 
 
 declare void @llvm.instrprof.increment(i8*, i64, i32, i32) #1
diff --git a/test/Instrumentation/InstrProfiling/icall.ll b/test/Instrumentation/InstrProfiling/icall.ll
index 529ad9ce12f05553a6d8b4a90c6e423bafac6f82..d92de47421d406949814363e815951f1a77f683c 100644
--- a/test/Instrumentation/InstrProfiling/icall.ll
+++ b/test/Instrumentation/InstrProfiling/icall.ll
@@ -37,9 +37,9 @@ attributes #0 = { nounwind }
 ; DYN-NOT: @__profvp_foo
 ; DYN-NOT: @__llvm_prf_vnodes
 
-; STATIC: call void @__llvm_profile_instrument_target(i64 %3, i8* bitcast ({ i64, i64, i64*, i8*, i8*, i32, [1 x i16] }* @__profd_foo to i8*), i32 0)
-; STATIC-EXT: call void @__llvm_profile_instrument_target(i64 %3, i8* bitcast ({ i64, i64, i64*, i8*, i8*, i32, [1 x i16] }* @__profd_foo to i8*), i32 zeroext 0)
-; STATIC-SEXT: call void @__llvm_profile_instrument_target(i64 %3, i8* bitcast ({ i64, i64, i64*, i8*, i8*, i32, [1 x i16] }* @__profd_foo to i8*), i32 signext 0)
+; STATIC: call void @__llvm_profile_instrument_target(i64 %3, i8* bitcast ({ i64, i64, i64*, i8*, i8*, i32, [2 x i16] }* @__profd_foo to i8*), i32 0)
+; STATIC-EXT: call void @__llvm_profile_instrument_target(i64 %3, i8* bitcast ({ i64, i64, i64*, i8*, i8*, i32, [2 x i16] }* @__profd_foo to i8*), i32 zeroext 0)
+; STATIC-SEXT: call void @__llvm_profile_instrument_target(i64 %3, i8* bitcast ({ i64, i64, i64*, i8*, i8*, i32, [2 x i16] }* @__profd_foo to i8*), i32 signext 0)
 
 ; STATIC: declare void @__llvm_profile_instrument_target(i64, i8*, i32)
 ; STATIC-EXT: declare void @__llvm_profile_instrument_target(i64, i8*, i32 zeroext)
diff --git a/test/Instrumentation/InstrProfiling/platform.ll b/test/Instrumentation/InstrProfiling/platform.ll
index b731fc3e5ff5e490e5b026c22ed266b4299d84c0..c0c711054ff1ab4471d297483ee0283206c8dee7 100644
--- a/test/Instrumentation/InstrProfiling/platform.ll
+++ b/test/Instrumentation/InstrProfiling/platform.ll
@@ -12,8 +12,8 @@
 ; RUN: opt < %s -mtriple=x86_64-pc-solaris -passes=instrprof -S | FileCheck %s -check-prefix=SOLARIS
 
 @__profn_foo = hidden constant [3 x i8] c"foo"
-; MACHO: @__profn_foo = private constant [3 x i8] c"foo"
-; ELF: @__profn_foo = private constant [3 x i8] c"foo"
+; MACHO-NOT: __profn_foo
+; ELF-NOT: __profn_foo
 
 ; MACHO: @__profc_foo = hidden global [1 x i64] zeroinitializer, section "__DATA,__llvm_prf_cnts", align 8
 ; ELF: @__profc_foo = hidden global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", align 8
diff --git a/test/Instrumentation/InstrProfiling/profiling.ll b/test/Instrumentation/InstrProfiling/profiling.ll
index 508d3ef8dea00b563e93f70002b19c25509a3504..c4cc1d9ce4387e79a88a8f8164c419f379261d0d 100644
--- a/test/Instrumentation/InstrProfiling/profiling.ll
+++ b/test/Instrumentation/InstrProfiling/profiling.ll
@@ -4,11 +4,11 @@
 target triple = "x86_64-apple-macosx10.10.0"
 
 @__profn_foo = hidden constant [3 x i8] c"foo"
-; CHECK: @__profn_foo = private constant [3 x i8] c"foo"
+; CHECK-NOT: __profn_foo
 @__profn_bar = hidden constant [4 x i8] c"bar\00"
-; CHECK: @__profn_bar = private constant [4 x i8] c"bar\00"
+; CHECK-NOT: __profn_bar
 @__profn_baz = hidden constant [3 x i8] c"baz"
-; CHECK: @__profn_baz = private constant [3 x i8] c"baz"
+; CHECK-NOT: __profn_baz
 
 ; CHECK: @__profc_foo = hidden global [1 x i64] zeroinitializer, section "__DATA,__llvm_prf_cnts", align 8
 ; CHECK: @__profd_foo = hidden {{.*}}, section "__DATA,__llvm_prf_data,regular,live_support", align 8
diff --git a/test/Instrumentation/MemorySanitizer/AArch64/vararg.ll b/test/Instrumentation/MemorySanitizer/AArch64/vararg.ll
index 99b46045623941b675877daaa34d31e086b6a0de..18d2c3bfe4d8367f188dc75dbeb5c018073355e4 100644
--- a/test/Instrumentation/MemorySanitizer/AArch64/vararg.ll
+++ b/test/Instrumentation/MemorySanitizer/AArch64/vararg.ll
@@ -8,10 +8,10 @@ target triple = "aarch64-unknown-linux-gnu"
 define i32 @foo(i32 %guard, ...) {
   %vl = alloca %struct.__va_list, align 8
   %1 = bitcast %struct.__va_list* %vl to i8*
-  call void @llvm.lifetime.start(i64 32, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 32, i8* %1)
   call void @llvm.va_start(i8* %1)
   call void @llvm.va_end(i8* %1)
-  call void @llvm.lifetime.end(i64 32, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 32, i8* %1)
   ret i32 0
 }
 
@@ -46,10 +46,10 @@ define i32 @foo(i32 %guard, ...) {
 ; CHECK: [[STACK:%.*]] = getelementptr inbounds i8, i8* {{%.*}}, i32 192
 ; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* {{%.*}}, i8* [[STACK]], i64 {{%.*}}, i32 16, i1 false)
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 declare void @llvm.va_start(i8*) #2
 declare void @llvm.va_end(i8*) #2
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 define i32 @bar() {
   %1 = call i32 (i32, ...) @foo(i32 0, i32 1, i32 2, double 3.000000e+00, 
diff --git a/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64.ll b/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64.ll
index 71397f1db5a44f7a7c6e44021fba2cc1ca159825..46e840c607f97469e2a436017aae1a4b67500a1e 100644
--- a/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64.ll
+++ b/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64.ll
@@ -6,10 +6,10 @@ target triple = "mips64--linux"
 define i32 @foo(i32 %guard, ...) {
   %vl = alloca i8*, align 8
   %1 = bitcast i8** %vl to i8*
-  call void @llvm.lifetime.start(i64 32, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 32, i8* %1)
   call void @llvm.va_start(i8* %1)
   call void @llvm.va_end(i8* %1)
-  call void @llvm.lifetime.end(i64 32, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 32, i8* %1)
   ret i32 0
 }
 
@@ -23,10 +23,10 @@ define i32 @foo(i32 %guard, ...) {
 ; CHECK: [[STACK:%.*]] = bitcast {{.*}} @__msan_va_arg_tls to i8*
 ; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[C]], i8* [[STACK]], i64 [[B]], i32 8, i1 false)
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 declare void @llvm.va_start(i8*) #2
 declare void @llvm.va_end(i8*) #2
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 define i32 @bar() {
   %1 = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00)
diff --git a/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64el.ll b/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64el.ll
index 9931b13baacb28b92d1d80b15b434b27e6046bee..e0177b63d68d5321208e62f23b5754212c571e66 100644
--- a/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64el.ll
+++ b/test/Instrumentation/MemorySanitizer/Mips/vararg-mips64el.ll
@@ -6,10 +6,10 @@ target triple = "mips64el--linux"
 define i32 @foo(i32 %guard, ...) {
   %vl = alloca i8*, align 8
   %1 = bitcast i8** %vl to i8*
-  call void @llvm.lifetime.start(i64 32, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 32, i8* %1)
   call void @llvm.va_start(i8* %1)
   call void @llvm.va_end(i8* %1)
-  call void @llvm.lifetime.end(i64 32, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 32, i8* %1)
   ret i32 0
 }
 
@@ -23,10 +23,10 @@ define i32 @foo(i32 %guard, ...) {
 ; CHECK: [[STACK:%.*]] = bitcast {{.*}} @__msan_va_arg_tls to i8*
 ; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[C]], i8* [[STACK]], i64 [[B]], i32 8, i1 false)
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 declare void @llvm.va_start(i8*) #2
 declare void @llvm.va_end(i8*) #2
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 define i32 @bar() {
   %1 = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00)
diff --git a/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll b/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll
index 71f4b3466595d09048716b29d1b600c1a83ab9db..afc4b775de3526edfd13c615890825085c059581 100644
--- a/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll
+++ b/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64.ll
@@ -6,10 +6,10 @@ target triple = "powerpc64--linux"
 define i32 @foo(i32 %guard, ...) {
   %vl = alloca i8*, align 8
   %1 = bitcast i8** %vl to i8*
-  call void @llvm.lifetime.start(i64 32, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 32, i8* %1)
   call void @llvm.va_start(i8* %1)
   call void @llvm.va_end(i8* %1)
-  call void @llvm.lifetime.end(i64 32, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 32, i8* %1)
   ret i32 0
 }
 
@@ -23,10 +23,10 @@ define i32 @foo(i32 %guard, ...) {
 ; CHECK: [[STACK:%.*]] = bitcast {{.*}} @__msan_va_arg_tls to i8*
 ; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[C]], i8* [[STACK]], i64 [[B]], i32 8, i1 false)
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 declare void @llvm.va_start(i8*) #2
 declare void @llvm.va_end(i8*) #2
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 define i32 @bar() {
   %1 = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00)
diff --git a/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64le.ll b/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64le.ll
index 6e844dce549124d7e297b2423d936c26ca9aad50..1afe778ad79a59c92f3beb49ef3d47bb346f7876 100644
--- a/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64le.ll
+++ b/test/Instrumentation/MemorySanitizer/PowerPC/vararg-ppc64le.ll
@@ -6,10 +6,10 @@ target triple = "powerpc64le--linux"
 define i32 @foo(i32 %guard, ...) {
   %vl = alloca i8*, align 8
   %1 = bitcast i8** %vl to i8*
-  call void @llvm.lifetime.start(i64 32, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 32, i8* %1)
   call void @llvm.va_start(i8* %1)
   call void @llvm.va_end(i8* %1)
-  call void @llvm.lifetime.end(i64 32, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 32, i8* %1)
   ret i32 0
 }
 
@@ -23,10 +23,10 @@ define i32 @foo(i32 %guard, ...) {
 ; CHECK: [[STACK:%.*]] = bitcast {{.*}} @__msan_va_arg_tls to i8*
 ; CHECK: call void @llvm.memcpy.p0i8.p0i8.i64(i8* [[C]], i8* [[STACK]], i64 [[B]], i32 8, i1 false)
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 declare void @llvm.va_start(i8*) #2
 declare void @llvm.va_end(i8*) #2
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 define i32 @bar() {
   %1 = call i32 (i32, ...) @foo(i32 0, i32 1, i64 2, double 3.000000e+00)
diff --git a/test/Instrumentation/MemorySanitizer/alloca.ll b/test/Instrumentation/MemorySanitizer/alloca.ll
new file mode 100644
index 0000000000000000000000000000000000000000..57ee9120ae831a90566a1322e788561890c6fdc7
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/alloca.ll
@@ -0,0 +1,59 @@
+; RUN: opt < %s -msan -msan-check-access-address=0 -S | FileCheck %s --check-prefixes=CHECK,INLINE
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-poison-stack-with-call=1 -S | FileCheck %s --check-prefixes=CHECK,CALL
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-track-origins=1 -S | FileCheck %s --check-prefixes=CHECK,ORIGIN
+; RUN: opt < %s -msan -msan-check-access-address=0 -msan-track-origins=2 -S | FileCheck %s --check-prefixes=CHECK,ORIGIN
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @static() sanitize_memory {
+entry:
+  %x = alloca i32, align 4
+  ret void
+}
+
+; CHECK-LABEL: define void @static(
+; INLINE: call void @llvm.memset.p0i8.i64(i8* {{.*}}, i8 -1, i64 4, i32 4, i1 false)
+; CALL: call void @__msan_poison_stack(i8* {{.*}}, i64 4)
+; ORIGIN: call void @__msan_set_alloca_origin4(i8* {{.*}}, i64 4,
+; CHECK: ret void
+
+
+define void @dynamic() sanitize_memory {
+entry:
+  br label %l
+l:
+  %x = alloca i32, align 4
+  ret void
+}
+
+; CHECK-LABEL: define void @dynamic(
+; INLINE: call void @llvm.memset.p0i8.i64(i8* {{.*}}, i8 -1, i64 4, i32 4, i1 false)
+; CALL: call void @__msan_poison_stack(i8* {{.*}}, i64 4)
+; ORIGIN: call void @__msan_set_alloca_origin4(i8* {{.*}}, i64 4,
+; CHECK: ret void
+
+define void @array() sanitize_memory {
+entry:
+  %x = alloca i32, i64 5, align 4
+  ret void
+}
+
+; CHECK-LABEL: define void @array(
+; INLINE: call void @llvm.memset.p0i8.i64(i8* {{.*}}, i8 -1, i64 20, i32 4, i1 false)
+; CALL: call void @__msan_poison_stack(i8* {{.*}}, i64 20)
+; ORIGIN: call void @__msan_set_alloca_origin4(i8* {{.*}}, i64 20,
+; CHECK: ret void
+
+define void @array_non_const(i64 %cnt) sanitize_memory {
+entry:
+  %x = alloca i32, i64 %cnt, align 4
+  ret void
+}
+
+; CHECK-LABEL: define void @array_non_const(
+; CHECK: %[[A:.*]] = mul i64 4, %cnt
+; INLINE: call void @llvm.memset.p0i8.i64(i8* {{.*}}, i8 -1, i64 %[[A]], i32 4, i1 false)
+; CALL: call void @__msan_poison_stack(i8* {{.*}}, i64 %[[A]])
+; ORIGIN: call void @__msan_set_alloca_origin4(i8* {{.*}}, i64 %[[A]],
+; CHECK: ret void
diff --git a/test/Instrumentation/MemorySanitizer/csr.ll b/test/Instrumentation/MemorySanitizer/csr.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c4e3a3f7392005f7d6087d11ef2517f22122a7da
--- /dev/null
+++ b/test/Instrumentation/MemorySanitizer/csr.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -msan -msan-check-access-address=0 -S | FileCheck %s
+; RUN: opt < %s -msan -msan-check-access-address=1 -S | FileCheck %s --check-prefix=ADDR
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @llvm.x86.sse.stmxcsr(i8*)
+declare void @llvm.x86.sse.ldmxcsr(i8*)
+
+define void @getcsr(i32 *%p) sanitize_memory {
+entry:
+  %0 = bitcast i32* %p to i8*
+  call void @llvm.x86.sse.stmxcsr(i8* %0)
+  ret void
+}
+
+; CHECK-LABEL: @getcsr(
+; CHECK: store i32 0, i32*
+; CHECK: call void @llvm.x86.sse.stmxcsr(
+; CHECK: ret void
+
+; ADDR-LABEL: @getcsr(
+; ADDR: %[[A:.*]] = load i64, i64* getelementptr inbounds {{.*}} @__msan_param_tls, i32 0, i32 0), align 8
+; ADDR: %[[B:.*]] = icmp ne i64 %[[A]], 0
+; ADDR: br i1 %[[B]], label {{.*}}, label
+; ADDR: call void @__msan_warning_noreturn()
+; ADDR: call void @llvm.x86.sse.stmxcsr(
+; ADDR: ret void
+
+; Function Attrs: nounwind uwtable
+define void @setcsr(i32 *%p) sanitize_memory {
+entry:
+  %0 = bitcast i32* %p to i8*
+  call void @llvm.x86.sse.ldmxcsr(i8* %0)
+  ret void
+}
+
+; CHECK-LABEL: @setcsr(
+; CHECK: %[[A:.*]] = load i32, i32* %{{.*}}, align 1
+; CHECK: %[[B:.*]] = icmp ne i32 %[[A]], 0
+; CHECK: br i1 %[[B]], label {{.*}}, label
+; CHECK: call void @__msan_warning_noreturn()
+; CHECK: call void @llvm.x86.sse.ldmxcsr(
+; CHECK: ret void
+
+; ADDR-LABEL: @setcsr(
+; ADDR: %[[A:.*]] = load i64, i64* getelementptr inbounds {{.*}} @__msan_param_tls, i32 0, i32 0), align 8
+; ADDR: %[[B:.*]] = icmp ne i64 %[[A]], 0
+; ADDR: br i1 %[[B]], label {{.*}}, label
+; ADDR: call void @__msan_warning_noreturn()
+; ADDR: call void @llvm.x86.sse.ldmxcsr(
+; ADDR: ret void
diff --git a/test/Instrumentation/SanitizerCoverage/coverage.ll b/test/Instrumentation/SanitizerCoverage/coverage.ll
index 9dceceb2eefa4e939b9d6bb0f5ece1d88bf177a9..75a341da021c9d6d1e42d0a2e6bc75c465e7136e 100644
--- a/test/Instrumentation/SanitizerCoverage/coverage.ll
+++ b/test/Instrumentation/SanitizerCoverage/coverage.ll
@@ -7,7 +7,6 @@
 ; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-block-threshold=10 -S | FileCheck %s --check-prefix=CHECK3
 ; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -S | FileCheck %s --check-prefix=CHECK4
 ; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc  -S | FileCheck %s --check-prefix=CHECK_TRACE_PC
-; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc-guard  -S | FileCheck %s --check-prefix=CHECK_TRACE_PC
 ; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-8bit-counters=1  -S | FileCheck %s --check-prefix=CHECK-8BIT
 
 ; RUN: opt < %s -sancov -sanitizer-coverage-level=2 -sanitizer-coverage-block-threshold=10 \
diff --git a/test/Instrumentation/SanitizerCoverage/trace-pc-guard-comdat.ll b/test/Instrumentation/SanitizerCoverage/trace-pc-guard-comdat.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8ab5f4961b1b58f356b46b5a7c2705013a033e5b
--- /dev/null
+++ b/test/Instrumentation/SanitizerCoverage/trace-pc-guard-comdat.ll
@@ -0,0 +1,42 @@
+; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc-guard  -S | FileCheck %s --check-prefix=CHECK_TRACE_PC_GUARD
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-unknown-linux-gnu"
+define void @foo(i32* %a) sanitize_address {
+entry:
+  %tobool = icmp eq i32* %a, null
+  br i1 %tobool, label %if.end, label %if.then
+
+  if.then:                                          ; preds = %entry
+  store i32 0, i32* %a, align 4
+  br label %if.end
+
+  if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
+%struct.StructWithVptr = type { i32 (...)** }
+
+define void @CallViaVptr(%struct.StructWithVptr* %foo) uwtable sanitize_address {
+entry:
+  %0 = bitcast %struct.StructWithVptr* %foo to void (%struct.StructWithVptr*)***
+  %vtable = load void (%struct.StructWithVptr*)**, void (%struct.StructWithVptr*)*** %0, align 8
+  %1 = load void (%struct.StructWithVptr*)*, void (%struct.StructWithVptr*)** %vtable, align 8
+  tail call void %1(%struct.StructWithVptr* %foo)
+  tail call void %1(%struct.StructWithVptr* %foo)
+  tail call void asm sideeffect "", ""()
+  ret void
+}
+
+; CHECK_TRACE_PC_GUARD-LABEL: define void @foo
+; CHECK_TRACE_PC_GUARD: call void @__sanitizer_cov_trace_pc
+; CHECK_TRACE_PC_GUARD: call void asm sideeffect "", ""()
+; CHECK_TRACE_PC_GUARD: ret void
+
+; CHECK_TRACE_PC_GUARD-LABEL: define void @CallViaVptr
+; CHECK_TRACE_PC_GUARD: call void @__sanitizer_cov_trace_pc_indir
+; CHECK_TRACE_PC_GUARD: call void @__sanitizer_cov_trace_pc_indir
+; CHECK_TRACE_PC_GUARD: ret void
+
+; CHECK_TRACE_PC_GUARD-LABEL: define internal void @sancov.module_ctor() comdat
+
diff --git a/test/Instrumentation/SanitizerCoverage/trace-pc-guard-nocomdat.ll b/test/Instrumentation/SanitizerCoverage/trace-pc-guard-nocomdat.ll
new file mode 100644
index 0000000000000000000000000000000000000000..392ff8d2932707ccc41085b8a6765e07f8744c72
--- /dev/null
+++ b/test/Instrumentation/SanitizerCoverage/trace-pc-guard-nocomdat.ll
@@ -0,0 +1,42 @@
+; RUN: opt < %s -sancov -sanitizer-coverage-level=4 -sanitizer-coverage-trace-pc-guard  -S | FileCheck %s --check-prefix=CHECK_TRACE_PC_GUARD
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
+target triple = "x86_64-apple-darwin10.0.0"
+define void @foo(i32* %a) sanitize_address {
+entry:
+  %tobool = icmp eq i32* %a, null
+  br i1 %tobool, label %if.end, label %if.then
+
+  if.then:                                          ; preds = %entry
+  store i32 0, i32* %a, align 4
+  br label %if.end
+
+  if.end:                                           ; preds = %entry, %if.then
+  ret void
+}
+
+%struct.StructWithVptr = type { i32 (...)** }
+
+define void @CallViaVptr(%struct.StructWithVptr* %foo) uwtable sanitize_address {
+entry:
+  %0 = bitcast %struct.StructWithVptr* %foo to void (%struct.StructWithVptr*)***
+  %vtable = load void (%struct.StructWithVptr*)**, void (%struct.StructWithVptr*)*** %0, align 8
+  %1 = load void (%struct.StructWithVptr*)*, void (%struct.StructWithVptr*)** %vtable, align 8
+  tail call void %1(%struct.StructWithVptr* %foo)
+  tail call void %1(%struct.StructWithVptr* %foo)
+  tail call void asm sideeffect "", ""()
+  ret void
+}
+
+; CHECK_TRACE_PC_GUARD-LABEL: define void @foo
+; CHECK_TRACE_PC_GUARD: call void @__sanitizer_cov_trace_pc
+; CHECK_TRACE_PC_GUARD: call void asm sideeffect "", ""()
+; CHECK_TRACE_PC_GUARD: ret void
+
+; CHECK_TRACE_PC_GUARD-LABEL: define void @CallViaVptr
+; CHECK_TRACE_PC_GUARD: call void @__sanitizer_cov_trace_pc_indir
+; CHECK_TRACE_PC_GUARD: call void @__sanitizer_cov_trace_pc_indir
+; CHECK_TRACE_PC_GUARD: ret void
+
+; CHECK_TRACE_PC_GUARD-LABEL: define internal void @sancov.module_ctor() {
+
diff --git a/test/Instrumentation/SanitizerCoverage/tracing.ll b/test/Instrumentation/SanitizerCoverage/tracing.ll
index 49c2a1a635273193ae15da92ba9b5953ca27e453..9e153472eaba293a0212c9ada8d41ceba452b066 100644
--- a/test/Instrumentation/SanitizerCoverage/tracing.ll
+++ b/test/Instrumentation/SanitizerCoverage/tracing.ll
@@ -3,6 +3,7 @@
 ; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-experimental-tracing  -S | FileCheck %s --check-prefix=CHECK3
 ; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc  -S | FileCheck %s --check-prefix=CHECK_PC
 ; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard  -S | FileCheck %s --check-prefix=CHECK_PC_GUARD
+; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc-guard  -S -mtriple=x86_64-apple-macosx | FileCheck %s --check-prefix=CHECK_PC_GUARD_DARWIN
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-unknown-linux-gnu"
@@ -47,3 +48,11 @@ entry:
 ; CHECK_PC_GUARD-NOT: call void @__sanitizer_cov_trace_pc
 ; CHECK_PC_GUARD: ret void
 ; CHECK_PC_GUARD: call void @__sanitizer_cov_trace_pc_guard_init(i32* bitcast (i32** @__start___sancov_guards to i32*), i32* bitcast (i32** @__stop___sancov_guards to i32*))
+
+; CHECK_PC_GUARD_DARWIN-LABEL: define void @foo
+; CHECK_PC_GUARD_DARWIN: call void @__sanitizer_cov_trace_pc_guard
+; CHECK_PC_GUARD_DARWIN: call void @__sanitizer_cov_trace_pc_guard
+; CHECK_PC_GUARD_DARWIN: call void @__sanitizer_cov_trace_pc_guard
+; CHECK_PC_GUARD_DARWIN-NOT: call void @__sanitizer_cov_trace_pc
+; CHECK_PC_GUARD_DARWIN: ret void
+; CHECK_PC_GUARD_DARWIN: call void @__sanitizer_cov_trace_pc_guard_init(i32* bitcast (i32** @"\01section$start$__DATA$__sancov_guards" to i32*), i32* bitcast (i32** @"\01section$end$__DATA$__sancov_guards" to i32*))
diff --git a/test/Instrumentation/SanitizerCoverage/wineh.ll b/test/Instrumentation/SanitizerCoverage/wineh.ll
new file mode 100644
index 0000000000000000000000000000000000000000..87b44be5544f37c9c08f352bbe7d75734d0457d2
--- /dev/null
+++ b/test/Instrumentation/SanitizerCoverage/wineh.ll
@@ -0,0 +1,111 @@
+; RUN: opt < %s -sancov -sanitizer-coverage-level=3 -sanitizer-coverage-trace-pc -S | FileCheck %s --check-prefix=CHECK
+
+; Generated from this C++ source:
+; $ clang -O2 t.cpp -S -emit-llvm
+; void g();
+; struct Foo { Foo(); ~Foo(); };
+; int f() {
+;   Foo v;
+;   g();
+;   try {
+;     g();
+;   } catch (int e) {
+;     g();
+;   } catch (...) {
+;     g();
+;   }
+;   return 0;
+; }
+
+; FIXME: We need to do more than this. In particular, __sanitizer_cov callbacks
+; in funclets need token bundles.
+
+; CHECK-LABEL: define i32 @"\01?f@@YAHXZ"()
+; CHECK: catch.dispatch:
+; CHECK-NEXT: catchswitch within none [label %catch3, label %catch] unwind label %ehcleanup
+
+; ModuleID = 't.cpp'
+source_filename = "t.cpp"
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.10.24728"
+
+%rtti.TypeDescriptor2 = type { i8**, i8*, [3 x i8] }
+%struct.Foo = type { i8 }
+
+$"\01??_R0H@8" = comdat any
+
+@"\01??_7type_info@@6B@" = external constant i8*
+@"\01??_R0H@8" = linkonce_odr global %rtti.TypeDescriptor2 { i8** @"\01??_7type_info@@6B@", i8* null, [3 x i8] c".H\00" }, comdat
+
+; Function Attrs: uwtable
+define i32 @"\01?f@@YAHXZ"() local_unnamed_addr #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
+entry:
+  %v = alloca %struct.Foo, align 1
+  %e = alloca i32, align 4
+  %0 = getelementptr inbounds %struct.Foo, %struct.Foo* %v, i64 0, i32 0
+  call void @llvm.lifetime.start(i64 1, i8* nonnull %0) #4
+  %call = call %struct.Foo* @"\01??0Foo@@QEAA@XZ"(%struct.Foo* nonnull %v)
+  invoke void @"\01?g@@YAXXZ"()
+          to label %invoke.cont unwind label %ehcleanup
+
+invoke.cont:                                      ; preds = %entry
+  invoke void @"\01?g@@YAXXZ"()
+          to label %try.cont unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %invoke.cont
+  %1 = catchswitch within none [label %catch3, label %catch] unwind label %ehcleanup
+
+catch3:                                           ; preds = %catch.dispatch
+  %2 = catchpad within %1 [%rtti.TypeDescriptor2* @"\01??_R0H@8", i32 0, i32* %e]
+  invoke void @"\01?g@@YAXXZ"() [ "funclet"(token %2) ]
+          to label %invoke.cont4 unwind label %ehcleanup
+
+invoke.cont4:                                     ; preds = %catch3
+  catchret from %2 to label %try.cont
+
+try.cont:                                         ; preds = %invoke.cont, %invoke.cont2, %invoke.cont4
+  call void @"\01??1Foo@@QEAA@XZ"(%struct.Foo* nonnull %v) #4
+  call void @llvm.lifetime.end(i64 1, i8* nonnull %0) #4
+  ret i32 0
+
+catch:                                            ; preds = %catch.dispatch
+  %3 = catchpad within %1 [i8* null, i32 64, i8* null]
+  invoke void @"\01?g@@YAXXZ"() [ "funclet"(token %3) ]
+          to label %invoke.cont2 unwind label %ehcleanup
+
+invoke.cont2:                                     ; preds = %catch
+  catchret from %3 to label %try.cont
+
+ehcleanup:                                        ; preds = %catch3, %catch, %catch.dispatch, %entry
+  %4 = cleanuppad within none []
+  call void @"\01??1Foo@@QEAA@XZ"(%struct.Foo* nonnull %v) #4 [ "funclet"(token %4) ]
+  call void @llvm.lifetime.end(i64 1, i8* nonnull %0) #4
+  cleanupret from %4 unwind to caller
+}
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+
+declare %struct.Foo* @"\01??0Foo@@QEAA@XZ"(%struct.Foo* returned) unnamed_addr #2
+
+declare void @"\01?g@@YAXXZ"() local_unnamed_addr #2
+
+declare i32 @__CxxFrameHandler3(...)
+
+; Function Attrs: nounwind
+declare void @"\01??1Foo@@QEAA@XZ"(%struct.Foo*) unnamed_addr #3
+
+; Function Attrs: argmemonly nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+attributes #0 = { uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { argmemonly nounwind }
+attributes #2 = { "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #3 = { nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #4 = { nounwind }
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"PIC Level", i32 2}
+!1 = !{!"clang version 5.0.0 "}
diff --git a/test/Instrumentation/ThreadSanitizer/tsan_basic.ll b/test/Instrumentation/ThreadSanitizer/tsan_basic.ll
index 7e049c548f2232309726d6a6908941d1c5697e73..61ab98dc999725e9e536414dfd1dff82d4dd79c6 100644
--- a/test/Instrumentation/ThreadSanitizer/tsan_basic.ll
+++ b/test/Instrumentation/ThreadSanitizer/tsan_basic.ll
@@ -54,5 +54,29 @@ entry:
 ; CHECK: ret void
 }
 
+; CHECK-LABEL: @SwiftError
+; CHECK-NOT: __tsan_read
+; CHECK-NOT: __tsan_write
+; CHECK: ret
+define void @SwiftError(i8** swifterror) sanitize_thread {
+  %swifterror_ptr_value = load i8*, i8** %0
+  store i8* null, i8** %0
+  %swifterror_addr = alloca swifterror i8*
+  %swifterror_ptr_value_2 = load i8*, i8** %swifterror_addr
+  store i8* null, i8** %swifterror_addr
+  ret void
+}
+
+; CHECK-LABEL: @SwiftErrorCall
+; CHECK-NOT: __tsan_read
+; CHECK-NOT: __tsan_write
+; CHECK: ret
+define void @SwiftErrorCall(i8** swifterror) sanitize_thread {
+  %swifterror_addr = alloca swifterror i8*
+  store i8* null, i8** %0
+  call void @SwiftError(i8** %0)
+  ret void
+}
+
 ; CHECK: define internal void @tsan.module_ctor()
 ; CHECK: call void @__tsan_init()
diff --git a/test/LTO/Resolution/X86/Inputs/link-odr-availextern-ae.ll b/test/LTO/Resolution/X86/Inputs/link-odr-availextern-ae.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f2d180afc82dec86a7f0c7592df22d476b2bc27e
--- /dev/null
+++ b/test/LTO/Resolution/X86/Inputs/link-odr-availextern-ae.ll
@@ -0,0 +1,6 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define available_externally i32 @f() {
+  ret i32 2
+}
diff --git a/test/LTO/Resolution/X86/Inputs/link-odr-availextern-odr.ll b/test/LTO/Resolution/X86/Inputs/link-odr-availextern-odr.ll
new file mode 100644
index 0000000000000000000000000000000000000000..76e745a444fedc3e787f9bca3a4a8519ed1211fb
--- /dev/null
+++ b/test/LTO/Resolution/X86/Inputs/link-odr-availextern-odr.ll
@@ -0,0 +1,6 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define linkonce_odr i32 @f() {
+  ret i32 2
+}
diff --git a/test/LTO/Resolution/X86/alias.ll b/test/LTO/Resolution/X86/alias.ll
index 2056112e145da85b800c3c49e1f9f150f4c6f7d6..886eadcfc82e281b9ee66fc688c4bfc18b287a60 100644
--- a/test/LTO/Resolution/X86/alias.ll
+++ b/test/LTO/Resolution/X86/alias.ll
@@ -1,6 +1,6 @@
 ; RUN: llvm-as %s -o %t1.o
 ; RUN: llvm-as %p/Inputs/alias-1.ll -o %t2.o
-; RUN: llvm-lto2 -o %t3.o %t2.o %t1.o -r %t2.o,a,px -r %t1.o,a, -r %t1.o,b,px -save-temps
+; RUN: llvm-lto2 run -o %t3.o %t2.o %t1.o -r %t2.o,a,px -r %t1.o,a, -r %t1.o,b,px -save-temps
 ; RUN: llvm-dis < %t3.o.0.0.preopt.bc -o - | FileCheck %s
 ; RUN: FileCheck --check-prefix=RES %s < %t3.o.resolution.txt
 
diff --git a/test/LTO/Resolution/X86/asm-output.ll b/test/LTO/Resolution/X86/asm-output.ll
new file mode 100644
index 0000000000000000000000000000000000000000..41d293501dd10a29ce8ca0d12173618afa378977
--- /dev/null
+++ b/test/LTO/Resolution/X86/asm-output.ll
@@ -0,0 +1,19 @@
+; Test the ability to emit assembly code from the resolution-based LTO API
+;
+; RUN: llvm-as < %s > %t1.bc
+;
+; RUN: llvm-lto2 run -filetype=asm -r %t1.bc,main,px -o %t2 %t1.bc
+; RUN: FileCheck --check-prefix=ASM %s < %t2.0
+; RUN: llvm-lto2 run -filetype=obj -r %t1.bc,main,px -o %t2 %t1.bc
+; RUN: llvm-objdump -d %t2.0 | FileCheck --check-prefix=ASM %s
+;
+; ASM: main:
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i32 @main() {
+entry:
+  ret i32 23
+}
+
diff --git a/test/LTO/Resolution/X86/comdat.ll b/test/LTO/Resolution/X86/comdat.ll
index 5124b951bed47c78f8fddcc490d07757413b939d..60d082b3e0f7810c79b379e0308eb0d7a576bc82 100644
--- a/test/LTO/Resolution/X86/comdat.ll
+++ b/test/LTO/Resolution/X86/comdat.ll
@@ -1,6 +1,6 @@
 ; RUN: llvm-as %s -o %t.o
 ; RUN: llvm-as %p/Inputs/comdat.ll -o %t2.o
-; RUN: llvm-lto2 -save-temps -o %t3.o %t.o %t2.o \
+; RUN: llvm-lto2 run -save-temps -o %t3.o %t.o %t2.o \
 ; RUN:  -r=%t.o,f1,plx \
 ; RUN:  -r=%t.o,v1,px \
 ; RUN:  -r=%t.o,r11,px \
diff --git a/test/LTO/Resolution/X86/common2.ll b/test/LTO/Resolution/X86/common2.ll
index 3328d7c5ec36718dd1aa7ea507b2cc360a9d2eab..3cb0a992d9ac3b744d175f8029606a5123832d9c 100644
--- a/test/LTO/Resolution/X86/common2.ll
+++ b/test/LTO/Resolution/X86/common2.ll
@@ -4,7 +4,7 @@
 ; Test that the common merging (size + alignment) is properly handled
 
 ; Client marked the "large with little alignment" one as prevailing
-; RUN: llvm-lto2 %t1.bc %t2.bc -o %t.o -save-temps \
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.o -save-temps \
 ; RUN:  -r %t1.bc,v,x \
 ; RUN:  -r %t2.bc,v,px \
 ; RUN:  -r %t1.bc,foo,px \
@@ -12,7 +12,7 @@
 ; RUN: llvm-dis < %t.o.0.0.preopt.bc | FileCheck %s --check-prefix=LARGE-PREVAILED
 
 ; Same as before, but reversing the order of the inputs
-; RUN: llvm-lto2 %t2.bc %t1.bc -o %t.o -save-temps \
+; RUN: llvm-lto2 run %t2.bc %t1.bc -o %t.o -save-temps \
 ; RUN:  -r %t1.bc,v,x \
 ; RUN:  -r %t2.bc,v,px \
 ; RUN:  -r %t1.bc,foo,px \
@@ -20,7 +20,7 @@
 ; RUN: llvm-dis < %t.o.0.0.preopt.bc | FileCheck %s --check-prefix=LARGE-PREVAILED
 
 ; Client marked the "small with large alignment" one as prevailing
-; RUN: llvm-lto2 %t1.bc %t2.bc -o %t.o -save-temps \
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.o -save-temps \
 ; RUN:  -r %t1.bc,v,px \
 ; RUN:  -r %t2.bc,v,x \
 ; RUN:  -r %t1.bc,foo,px \
@@ -28,7 +28,7 @@
 ; RUN: llvm-dis < %t.o.0.0.preopt.bc | FileCheck %s --check-prefix=SMALL-PREVAILED
 
 ; Same as before, but reversing the order of the inputs
-; RUN: llvm-lto2 %t2.bc %t1.bc -o %t.o -save-temps \
+; RUN: llvm-lto2 run %t2.bc %t1.bc -o %t.o -save-temps \
 ; RUN:  -r %t1.bc,v,px \
 ; RUN:  -r %t2.bc,v,x \
 ; RUN:  -r %t1.bc,foo,px \
@@ -37,7 +37,7 @@
 
 
 ; Client didn't mark any as prevailing, we keep the first one we see as "external"
-; RUN: llvm-lto2 %t1.bc %t2.bc -o %t.o -save-temps \
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.o -save-temps \
 ; RUN:  -r %t1.bc,v,x \
 ; RUN:  -r %t2.bc,v,x \
 ; RUN:  -r %t1.bc,foo,px \
@@ -45,7 +45,7 @@
 ; RUN: llvm-dis < %t.o.0.0.preopt.bc | FileCheck  %s --check-prefix=NONE-PREVAILED1
 
 ; Same as before, but reversing the order of the inputs
-; RUN: llvm-lto2 %t2.bc %t1.bc -o %t.o -save-temps \
+; RUN: llvm-lto2 run %t2.bc %t1.bc -o %t.o -save-temps \
 ; RUN:  -r %t1.bc,v,x \
 ; RUN:  -r %t2.bc,v,x \
 ; RUN:  -r %t1.bc,foo,px \
@@ -55,7 +55,7 @@
 
 
 ; Client marked both as prevailing
-; RUN: llvm-lto2 %t1.bc %t2.bc -o %t.o -save-temps \
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.o -save-temps \
 ; RUN:  -r %t1.bc,v,px \
 ; RUN:  -r %t2.bc,v,px \
 ; RUN:  -r %t1.bc,foo,px \
@@ -63,7 +63,7 @@
 ; RUN: llvm-dis < %t.o.0.0.preopt.bc | FileCheck %s --check-prefix=BOTH-PREVAILED1
 
 ; Same as before, but reversing the order of the inputs
-; RUN: llvm-lto2 %t2.bc %t1.bc -o %t.o -save-temps \
+; RUN: llvm-lto2 run %t2.bc %t1.bc -o %t.o -save-temps \
 ; RUN:  -r %t1.bc,v,px \
 ; RUN:  -r %t2.bc,v,px \
 ; RUN:  -r %t1.bc,foo,px \
diff --git a/test/LTO/Resolution/X86/commons.ll b/test/LTO/Resolution/X86/commons.ll
index b3e504835afbda75481022235611900237813e16..28bf1ada4a862c810d48a5cc8edff9ddcc67a90b 100644
--- a/test/LTO/Resolution/X86/commons.ll
+++ b/test/LTO/Resolution/X86/commons.ll
@@ -1,6 +1,6 @@
 ; RUN: llvm-as -o %t1.bc %s
 ; RUN: llvm-as -o %t2.bc %p/Inputs/commons.ll
-; RUN: llvm-lto2 %t1.bc -r=%t1.bc,x,l %t2.bc -r=%t2.bc,x,pl -o %t.out -save-temps
+; RUN: llvm-lto2 run %t1.bc -r=%t1.bc,x,l %t2.bc -r=%t2.bc,x,pl -o %t.out -save-temps
 ; RUN: llvm-dis -o - %t.out.0.0.preopt.bc  | FileCheck %s
 
 ; A strong definition should override the common
diff --git a/test/LTO/Resolution/X86/diagnostic-handler-remarks-with-hotness.ll b/test/LTO/Resolution/X86/diagnostic-handler-remarks-with-hotness.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2469570c26b30f0397bb7d5fbf318894bf4b2a8f
--- /dev/null
+++ b/test/LTO/Resolution/X86/diagnostic-handler-remarks-with-hotness.ll
@@ -0,0 +1,37 @@
+; RUN: llvm-as < %s >%t.bc
+
+; RUN: rm -f %t.yaml
+; RUN: llvm-lto2 run -pass-remarks-output=%t.yaml \
+; RUN:           -pass-remarks-with-hotness \
+; RUN:           -r %t.bc,tinkywinky,p \
+; RUN:           -r %t.bc,patatino,px \
+; RUN:           -r %t.bc,main,px -o %t.o %t.bc
+; RUN: cat %t.yaml | FileCheck %s -check-prefix=YAML
+
+; YAML: --- !Passed
+; YAML-NEXT: Pass:            inline
+; YAML-NEXT: Name:            Inlined
+; YAML-NEXT: Function:        main
+; YAML-NEXT: Hotness:         300
+; YAML-NEXT: Args:
+; YAML-NEXT:   - Callee:          tinkywinky
+; YAML-NEXT:   - String:          ' inlined into '
+; YAML-NEXT:   - Caller:          main
+; YAML-NEXT: ...
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-scei-ps4"
+
+declare i32 @patatino()
+
+define i32 @tinkywinky() {
+  %a = call i32 @patatino()
+  ret i32 %a
+}
+
+define i32 @main() !prof !0 {
+  %i = call i32 @tinkywinky()
+  ret i32 %i
+}
+
+!0 = !{!"function_entry_count", i64 300}
diff --git a/test/LTO/Resolution/X86/diagnostic-handler-remarks.ll b/test/LTO/Resolution/X86/diagnostic-handler-remarks.ll
new file mode 100644
index 0000000000000000000000000000000000000000..eb1bca3670c6288e35c57d45064818ce070dee9e
--- /dev/null
+++ b/test/LTO/Resolution/X86/diagnostic-handler-remarks.ll
@@ -0,0 +1,33 @@
+; RUN: llvm-as < %s >%t.bc
+
+; RUN: rm -f %t.yaml
+; RUN: llvm-lto2 run -pass-remarks-output=%t.yaml \
+; RUN:           -r %t.bc,tinkywinky,p \
+; RUN:           -r %t.bc,patatino,px \
+; RUN:           -r %t.bc,main,px -o %t.o %t.bc
+; RUN: cat %t.yaml | FileCheck %s -check-prefix=YAML
+
+; YAML:      --- !Passed
+; YAML-NEXT: Pass:            inline
+; YAML-NEXT: Name:            Inlined
+; YAML-NEXT: Function:        main
+; YAML-NEXT: Args:
+; YAML-NEXT:   - Callee:          tinkywinky
+; YAML-NEXT:   - String:          ' inlined into '
+; YAML-NEXT:   - Caller:          main
+; YAML-NEXT: ...
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-scei-ps4"
+
+declare i32 @patatino()
+
+define i32 @tinkywinky() {
+  %a = call i32 @patatino()
+  ret i32 %a
+}
+
+define i32 @main() {
+  %i = call i32 @tinkywinky()
+  ret i32 %i
+}
diff --git a/test/LTO/Resolution/X86/empty-bitcode.test b/test/LTO/Resolution/X86/empty-bitcode.test
index c98c54499ef6d861c78892db86949df78ad0f7f8..c05c5e3824b6b6d3ba14c10ce6321a4d43eff6c8 100644
--- a/test/LTO/Resolution/X86/empty-bitcode.test
+++ b/test/LTO/Resolution/X86/empty-bitcode.test
@@ -1,3 +1,3 @@
 RUN: llvm-cat -o %t.o
-RUN: not llvm-lto2 -o %t2 %t.o 2>&1 | FileCheck %s
+RUN: not llvm-lto2 run -o %t2 %t.o 2>&1 | FileCheck %s
 CHECK: Bitcode file does not contain any modules
diff --git a/test/LTO/Resolution/X86/intrinsic.ll b/test/LTO/Resolution/X86/intrinsic.ll
index f785f8f4f7149c779f98d093aea98d8b0ac48804..dc287ace0f8791a5edc8566fd14f8212909299fc 100644
--- a/test/LTO/Resolution/X86/intrinsic.ll
+++ b/test/LTO/Resolution/X86/intrinsic.ll
@@ -1,6 +1,6 @@
 ; RUN: llvm-as %s -o %t1.o
 ; RUN: llvm-as %p/Inputs/intrinsic.ll -o %t2.o
-; RUN: llvm-lto2 -o %t3.o %t1.o %t2.o -r %t1.o,foo
+; RUN: llvm-lto2 run -o %t3.o %t1.o %t2.o -r %t1.o,foo
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/LTO/Resolution/X86/link-odr-availextern.ll b/test/LTO/Resolution/X86/link-odr-availextern.ll
new file mode 100644
index 0000000000000000000000000000000000000000..cc360338d6d1d0361114851ef769a04c28558f19
--- /dev/null
+++ b/test/LTO/Resolution/X86/link-odr-availextern.ll
@@ -0,0 +1,38 @@
+; Tests for correct behavior for non-prevailing resolutions in cases involving
+; *_odr and available_externally linkages.
+
+; RUN: llvm-as %s -o %t1
+; RUN: llvm-as %S/Inputs/link-odr-availextern-ae.ll -o %t2ae
+; RUN: llvm-as %S/Inputs/link-odr-availextern-odr.ll -o %t2odr
+
+; RUN: llvm-lto2 run -o %t3 %t1 %t2ae -r %t1,f,p -r %t2ae,f, -save-temps
+; RUN: llvm-dis < %t3.0.0.preopt.bc -o - | FileCheck --check-prefix=PREVAILING %s
+
+; RUN: llvm-lto2 run -o %t3 %t1 %t2odr -r %t1,f,p -r %t2odr,f, -save-temps
+; RUN: llvm-dis < %t3.0.0.preopt.bc -o - | FileCheck --check-prefix=PREVAILING %s
+
+; RUN: llvm-lto2 run -o %t3 %t2ae %t1 -r %t1,f,p -r %t2ae,f, -save-temps
+; RUN: llvm-dis < %t3.0.0.preopt.bc -o - | FileCheck --check-prefix=PREVAILING %s
+
+; RUN: llvm-lto2 run -o %t3 %t2odr %t1 -r %t1,f,p -r %t2odr,f, -save-temps
+; RUN: llvm-dis < %t3.0.0.preopt.bc -o - | FileCheck --check-prefix=PREVAILING %s
+
+; RUN: llvm-lto2 run -o %t3 %t2ae -r %t2ae,f, -save-temps
+; RUN: llvm-dis < %t3.0.0.preopt.bc -o - | FileCheck --check-prefix=NONPREVAILING %s
+
+; RUN: llvm-lto2 run -o %t3 %t2odr -r %t2odr,f, -save-temps
+; RUN: llvm-dis < %t3.0.0.preopt.bc -o - | FileCheck --check-prefix=NONPREVAILING %s
+
+; RUN: llvm-lto2 run -o %t3 %t2odr %t1 -r %t1,f, -r %t2odr,f, -save-temps
+; RUN: llvm-dis < %t3.0.0.preopt.bc -o - | FileCheck --check-prefix=NONPREVAILING %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; PREVAILING: define weak_odr i32 @f()
+; PREVAILING-NEXT: ret i32 1
+; NONPREVAILING: define available_externally i32 @f()
+; NONPREVAILING-NEXT: ret i32 2
+define linkonce_odr i32 @f() {
+  ret i32 1
+}
diff --git a/test/LTO/Resolution/X86/lowertypetests.ll b/test/LTO/Resolution/X86/lowertypetests.ll
index 3753689a699fe6b7363688f091f03213f6f524df..c84a786e66fc2f176aff4dbd30236a3a3f2eb2ef 100644
--- a/test/LTO/Resolution/X86/lowertypetests.ll
+++ b/test/LTO/Resolution/X86/lowertypetests.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -thinlto-bc -o %t %s
-; RUN: llvm-lto2 -r %t,f,plx -r %t,foo,lx -r %t,foo,plx -o %t1 %t
+; RUN: llvm-lto2 run -r %t,f,plx -r %t,foo,lx -r %t,foo,plx -o %t1 %t
 ; RUN: llvm-nm %t1.0 | FileCheck --check-prefix=MERGED %s
 ; RUN: llvm-nm %t1.1 | FileCheck %s
 
diff --git a/test/LTO/Resolution/X86/mixed_lto.ll b/test/LTO/Resolution/X86/mixed_lto.ll
index 02b15c61154487bbbb50589c6b4a74549e01a2c0..aa686a8114c960e403ed8e34b9e212f1adf2c82d 100644
--- a/test/LTO/Resolution/X86/mixed_lto.ll
+++ b/test/LTO/Resolution/X86/mixed_lto.ll
@@ -2,7 +2,7 @@
 ; RUN: opt %s -o %t1.o
 ; RUN: opt -module-summary %p/Inputs/mixed_lto.ll -o %t2.o
 
-; RUN: llvm-lto2 -o %t3.o %t2.o %t1.o -r %t2.o,main,px -r %t2.o,g, -r %t1.o,g,px
+; RUN: llvm-lto2 run -o %t3.o %t2.o %t1.o -r %t2.o,main,px -r %t2.o,g, -r %t1.o,g,px
 
 ; Task 0 is the regular LTO file (this file)
 ; RUN: llvm-nm %t3.o.0 | FileCheck %s --check-prefix=NM0
@@ -15,7 +15,7 @@
 
 ; Do the same test again, but with the regular and thin LTO modules in the same file.
 ; RUN: llvm-cat -b -o %t4.o %t2.o %t1.o
-; RUN: llvm-lto2 -o %t5.o %t4.o -r %t4.o,main,px -r %t4.o,g, -r %t4.o,g,px
+; RUN: llvm-lto2 run -o %t5.o %t4.o -r %t4.o,main,px -r %t4.o,g, -r %t4.o,g,px
 ; RUN: llvm-nm %t5.o.0 | FileCheck %s --check-prefix=NM0
 ; RUN: llvm-nm %t5.o.1 | FileCheck %s --check-prefix=NM1
 
diff --git a/test/LTO/Resolution/X86/multi-thinlto.ll b/test/LTO/Resolution/X86/multi-thinlto.ll
index 8af73a328a135358ffae6fbafe04d758854a0d98..06150e44a1670a51665c8b38a9bda89817521230 100644
--- a/test/LTO/Resolution/X86/multi-thinlto.ll
+++ b/test/LTO/Resolution/X86/multi-thinlto.ll
@@ -1,6 +1,6 @@
 ; RUN: opt -module-summary %s -o %t.o
 ; RUN: llvm-cat -b -o %t2.o %t.o %t.o
-; RUN: not llvm-lto2 -o %t3.o %t2.o 2>&1 | FileCheck %s
+; RUN: not llvm-lto2 run -o %t3.o %t2.o 2>&1 | FileCheck %s
 ; CHECK: Expected at most one ThinLTO module per bitcode file
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/test/LTO/X86/diagnostic-handler-remarks-with-hotness.ll b/test/LTO/X86/diagnostic-handler-remarks-with-hotness.ll
index e5d53c7774a15743b611c86933571a222f3198da..5d0a9b0a4e2211254b3d2fcb41486cb10533df24 100644
--- a/test/LTO/X86/diagnostic-handler-remarks-with-hotness.ll
+++ b/test/LTO/X86/diagnostic-handler-remarks-with-hotness.ll
@@ -2,9 +2,9 @@
 ; with -lto-pass-remarks-with-hotness.
 
 ; RUN: llvm-as < %s >%t.bc
+; RUN: rm -f %t.yaml
 ; RUN: llvm-lto -lto-pass-remarks-output=%t.yaml \
 ; RUN:          -lto-pass-remarks-with-hotness \
-; RUN:          -exported-symbol _func2 \
 ; RUN:          -exported-symbol _main -o %t.o %t.bc
 ; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
 
@@ -34,45 +34,4 @@ define i32 @main() !prof !0 {
   ret i32 %i
 }
 
-define i32 @func2(i32* %out, i32* %out2, i32* %A, i32* %B, i32* %C, i32* %D, i32* %E, i32* %F) {
-entry:
-  br label %for.body
-
-for.body:                                         ; preds = %for.body, %entry
-  %i.037 = phi i64 [ 0, %entry ], [ %inc, %for.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %A, i64 %i.037
-  %0 = load i32, i32* %arrayidx, align 4
-  %arrayidx1 = getelementptr inbounds i32, i32* %B, i64 %i.037
-  %1 = load i32, i32* %arrayidx1, align 4
-  %add = add nsw i32 %1, %0
-  %arrayidx2 = getelementptr inbounds i32, i32* %C, i64 %i.037
-  %2 = load i32, i32* %arrayidx2, align 4
-  %add3 = add nsw i32 %add, %2
-  %arrayidx4 = getelementptr inbounds i32, i32* %E, i64 %i.037
-  %3 = load i32, i32* %arrayidx4, align 4
-  %add5 = add nsw i32 %add3, %3
-  %arrayidx6 = getelementptr inbounds i32, i32* %F, i64 %i.037
-  %4 = load i32, i32* %arrayidx6, align 4
-  %add7 = add nsw i32 %add5, %4
-  %arrayidx8 = getelementptr inbounds i32, i32* %out, i64 %i.037
-  store i32 %add7, i32* %arrayidx8, align 4
-  %5 = load i32, i32* %arrayidx, align 4
-  %6 = load i32, i32* %arrayidx1, align 4
-  %add11 = add nsw i32 %6, %5
-  %7 = load i32, i32* %arrayidx2, align 4
-  %add13 = add nsw i32 %add11, %7
-  %8 = load i32, i32* %arrayidx4, align 4
-  %add15 = add nsw i32 %add13, %8
-  %9 = load i32, i32* %arrayidx6, align 4
-  %add17 = add nsw i32 %add15, %9
-  %arrayidx18 = getelementptr inbounds i32, i32* %out2, i64 %i.037
-  store i32 %add17, i32* %arrayidx18, align 4
-  %inc = add i64 %i.037, 1
-  %exitcond = icmp eq i64 %inc, 256
-  br i1 %exitcond, label %for.end, label %for.body
-
-for.end:                                          ; preds = %for.body
-  ret i32 undef
-}
-
 !0 = !{!"function_entry_count", i64 300}
diff --git a/test/LTO/X86/diagnostic-handler-remarks.ll b/test/LTO/X86/diagnostic-handler-remarks.ll
index 456bdb5419c888b5f658e89c3812e32047f65f1d..82627fd24ab49e913d06c791de30e57d07335700 100644
--- a/test/LTO/X86/diagnostic-handler-remarks.ll
+++ b/test/LTO/X86/diagnostic-handler-remarks.ll
@@ -28,6 +28,7 @@
 ; RUN: llvm-nm %t.o | FileCheck %s -check-prefix NM
 
 ; Optimization records are collected regardless of the diagnostic handler
+; RUN: rm -f %t.yaml
 ; RUN: llvm-lto -lto-pass-remarks-output=%t.yaml \
 ; RUN:          -exported-symbol _func2 \
 ; RUN:          -exported-symbol _main -o %t.o %t.bc 2>&1 | \
diff --git a/test/LTO/X86/remangle_intrinsics_tbaa.ll b/test/LTO/X86/remangle_intrinsics_tbaa.ll
index 189674b5b0688350ae6cbef3c22257c124114bde..cac72f4330b3afe4f107179b0bae41ecd19b74f6 100644
--- a/test/LTO/X86/remangle_intrinsics_tbaa.ll
+++ b/test/LTO/X86/remangle_intrinsics_tbaa.ll
@@ -3,7 +3,7 @@
 ; RUN: llvm-link -disable-lazy-loading %t2.bc %t1.bc -S | FileCheck %s
 
 ; Verify that we correctly rename the intrinsic and don't crash
-; CHECK: @llvm.masked.store.v4p0some_named_struct.0.p0v4p0some_named_struct.0
+; CHECK: @llvm.masked.store.v4p0s_some_named_struct.0s.p0v4p0s_some_named_struct.0s
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.11.0"
diff --git a/test/LTO/X86/strip-debug-info-no-call-loc.ll b/test/LTO/X86/strip-debug-info-no-call-loc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..39b8c40c3a4765ec6d3ba6ee7946affccfe009f4
--- /dev/null
+++ b/test/LTO/X86/strip-debug-info-no-call-loc.ll
@@ -0,0 +1,56 @@
+; RUN: llvm-as %s -disable-verify -o %t.bc
+; RUN: llvm-lto -lto-strip-invalid-debug-info=true \
+; RUN:     -exported-symbol f -exported-symbol _f \
+; RUN:     -o %t.o %t.bc 2>&1 | \
+; RUN:     FileCheck %s -allow-empty -check-prefix=CHECK-WARN
+; RUN: llvm-nm %t.o | FileCheck %s 
+
+; Check that missing debug locations on inlinable calls are a
+; recoverable error.
+
+; CHECK-WARN: Invalid debug info found, debug info will be stripped
+; CHECK: {{f$}}
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+define void @h() #0 !dbg !7 {
+entry:
+  call void (...) @i(), !dbg !9
+  ret void, !dbg !10
+}
+
+declare void @i(...) #1
+
+define void @g() #0 !dbg !11 {
+entry:
+; Manually removed !dbg.
+  call void @h()
+  ret void, !dbg !13
+}
+
+define void @f() #0 !dbg !14 {
+entry:
+  call void @g(), !dbg !15
+  ret void, !dbg !16
+}
+
+attributes #0 = { nounwind ssp uwtable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: false, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "test.c", directory: "/")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 2}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!7 = distinct !DISubprogram(name: "h", scope: !1, file: !1, line: 2, type: !8, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0, variables: !2)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 2, column: 12, scope: !7)
+!10 = !DILocation(line: 2, column: 17, scope: !7)
+!11 = distinct !DISubprogram(name: "g", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: false, unit: !0, variables: !2)
+!12 = !DILocation(line: 3, column: 12, scope: !11)
+!13 = !DILocation(line: 3, column: 17, scope: !11)
+!14 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !8, isLocal: false, isDefinition: true, scopeLine: 4, isOptimized: false, unit: !0, variables: !2)
+!15 = !DILocation(line: 4, column: 12, scope: !14)
+!16 = !DILocation(line: 4, column: 17, scope: !14)
diff --git a/test/LTO/X86/symver-asm.ll b/test/LTO/X86/symver-asm.ll
index 03dda2bedd96f0b76b769155b0bb6de096fdcd99..4841892724af5d608f30da954b46e9f570a7de93 100644
--- a/test/LTO/X86/symver-asm.ll
+++ b/test/LTO/X86/symver-asm.ll
@@ -1,16 +1,47 @@
 ; RUN: llvm-as < %s >%t1
-; RUN: llvm-lto -o %t2 %t1
+; RUN: llvm-lto -exported-symbol=io_cancel_0_4 -exported-symbol=io_cancel_weak_0_4 -exported-symbol=foo -o %t2 %t1
 ; RUN: llvm-nm %t2 | FileCheck %s
+; RUN: llvm-lto2 run -r %t1,io_cancel_0_4,plx -r %t1,io_cancel_0_4,plx -r %t1,io_cancel_local_0_4,plx -r %t1,io_cancel_weak_0_4,plx -r %t1,io_cancel_weak_0_4,plx -r %t1,io_cancel@@LIBAIO_0.4,plx -r %t1,io_cancel_weak@@LIBAIO_0.4,plx -r %t1,io_cancel_weak@@LIBAIO_0.4.1,plx -r %t1,foo,plx -r %t1,foo,plx -r %t1,foo@@VER1,plx -o %t3 %t1 -save-temps
+; RUN: llvm-nm %t3.0 | FileCheck %s
+; RUN: llvm-dis %t3.0.2.internalize.bc -o - | FileCheck %s --check-prefix=INTERN
 
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
 module asm ".symver io_cancel_0_4,io_cancel@@LIBAIO_0.4"
+module asm ".symver io_cancel_local_0_4,io_cancel_local@@LIBAIO_0.4"
+module asm ".symver io_cancel_weak_0_4,io_cancel_weak@@LIBAIO_0.4"
+; Ensure we handle the case of the same aliasee with two version aliases.
+module asm ".symver io_cancel_weak_0_4,io_cancel_weak@@LIBAIO_0.4.1"
+module asm ".symver foo,foo@@VER1"
+
+; Local values used in inline assembly must be listed in
+; @llvm.compiler.used so they aren't incorrectly DCE'd during module linking.
+@llvm.compiler.used = appending global [1 x i8*] [i8* bitcast (i32 ()* @io_cancel_local_0_4 to i8*)], section "llvm.metadata"
 
-; Even without -exported-symbol, io_cancel_0_4 should be noticed by LTOModule's
-; RecordStreamer, so it shouldn't get eliminated. However, the object file will
-; contain the aliased symver as well as the original.
 define i32 @io_cancel_0_4() {
-; CHECK: io_cancel@@LIBAIO_0.4
-; CHECK: io_cancel_0_4
+; CHECK-DAG: T io_cancel@@LIBAIO_0.4
+; CHECK-DAG: T io_cancel_0_4
+  ret i32 0
+}
+
+define internal i32 @io_cancel_local_0_4() {
+; INTERN: llvm.compiler.used {{.*}} @io_cancel_local_0_4
+; INTERN: define internal i32 @io_cancel_local_0_4()
+; CHECK-DAG: t io_cancel_local@@LIBAIO_0.4
+; CHECK-DAG: t io_cancel_local_0_4
+  ret i32 0
+}
+
+define weak i32 @io_cancel_weak_0_4() {
+; CHECK-DAG: W io_cancel_weak@@LIBAIO_0.4
+; CHECK-DAG: W io_cancel_weak@@LIBAIO_0.4.1
+; CHECK-DAG: W io_cancel_weak_0_4
+ret i32 0
+}
+
+define i32 @"\01foo"() {
+; CHECK-DAG: T foo@@VER1
+; CHECK-DAG: T foo
   ret i32 0
 }
diff --git a/test/LTO/X86/symver-asm2.ll b/test/LTO/X86/symver-asm2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..42d6e54bd06ae8caf4f1a4ffb817fa25eb47f6b5
--- /dev/null
+++ b/test/LTO/X86/symver-asm2.ll
@@ -0,0 +1,30 @@
+; Test to ensure symbol binding works correctly for symver directives,
+; when the aliased symbols are defined in inline assembly, including
+; cases when the symbol attributes are provided after the .symver
+; directive.
+
+; RUN: llvm-as < %s >%t1
+; RUN: llvm-lto -o %t2 %t1
+; RUN: llvm-nm %t2 | FileCheck %s
+; RUN: llvm-lto2 run -r %t1,_start,plx -r %t1,_start3,plx -r %t1,foo@@SOME_VERSION -r %t1,foo@SOME_VERSION3 -o %t3 %t1 -save-temps
+; RUN: llvm-nm %t3.0 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+module asm ".global _start"
+module asm "_start:"
+module asm "_start2:"
+module asm "_start3:"
+module asm ".symver _start, foo@@SOME_VERSION"
+module asm ".symver _start2, foo@SOME_VERSION2"
+module asm ".symver _start3, foo@SOME_VERSION3"
+module asm ".local _start2"
+module asm ".weak _start3"
+
+; CHECK-DAG: T _start
+; CHECK-DAG: t _start2
+; CHECK-DAG: W _start3
+; CHECK-DAG: T foo@@SOME_VERSION
+; CHECK-DAG: t foo@SOME_VERSION2
+; CHECK-DAG: W foo@SOME_VERSION3
diff --git a/test/Linker/2011-08-18-unique-class-type2.ll b/test/Linker/2011-08-18-unique-class-type2.ll
index f5cd6333b670c83b7c3a534550978c16dca02578..a933cc3fd7d8192e8c50a8a3ba730c693d471652 100644
--- a/test/Linker/2011-08-18-unique-class-type2.ll
+++ b/test/Linker/2011-08-18-unique-class-type2.ll
@@ -21,7 +21,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone
 !0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.0 (trunk 137954)", isOptimized: true, emissionKind: FullDebug, file: !16, enums: !2, retainedTypes: !2, globals: !2)
 !1 = !{!2}
 !2 = !{}
-!5 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barN2N11AE", line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scope: !6, type: !7)
+!5 = distinct !DISubprogram(name: "bar", linkageName: "_Z3barN2N11AE", file: !16, line: 4, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scope: !6, type: !7)
 !6 = !DIFile(filename: "n2.c", directory: "/private/tmp")
 !7 = !DISubroutineType(types: !8)
 !8 = !{null}
diff --git a/test/Linker/Inputs/linkage.d.ll b/test/Linker/Inputs/linkage.d.ll
new file mode 100644
index 0000000000000000000000000000000000000000..aaf010d3885597dca17e64c6940c418cffed1660
--- /dev/null
+++ b/test/Linker/Inputs/linkage.d.ll
@@ -0,0 +1,5 @@
+@Y = global i8 42
+
+define i64 @foo() { ret i64 7 }
+
+@llvm.used = appending global [2 x i8*] [i8* @Y, i8* bitcast (i64 ()* @foo to i8*)], section "llvm.metadata"
diff --git a/test/Linker/available_externally_a.ll b/test/Linker/available_externally_a.ll
index 3ae4ce29140afa80e7d3e74fbbc8be9d7b6da30d..7a000b6a4aa5962e759305c868e8893b7b6bed0d 100644
--- a/test/Linker/available_externally_a.ll
+++ b/test/Linker/available_externally_a.ll
@@ -1,5 +1,7 @@
 ; RUN: llvm-link %s %p/available_externally_b.ll -S -o - | FileCheck %s
+; RUN: llvm-link %s -S -o - | FileCheck --check-prefix=AE-ONLY %s
 
 @foo = available_externally unnamed_addr constant i32 0
 
 ; CHECK: @foo = hidden unnamed_addr constant i32 0
+; AE-ONLY-NOT: @foo
diff --git a/test/Linker/link-flags.ll b/test/Linker/link-flags.ll
index c901b699575a7511c2783a9d812dadda15bcade9..1a57e8aa4d28ec165b8ade1752e06578d1b65e22 100644
--- a/test/Linker/link-flags.ll
+++ b/test/Linker/link-flags.ll
@@ -2,12 +2,15 @@
 ; RUN: llvm-link -S -only-needed %S/Inputs/linkage.b.ll %S/Inputs/linkage.c.ll | FileCheck %s -check-prefix=B -check-prefix=C -check-prefix=CN
 ; RUN: llvm-link -S -internalize %S/Inputs/linkage.b.ll %S/Inputs/linkage.c.ll | FileCheck %s -check-prefix=B -check-prefix=CI
 ; RUN: llvm-link -S -internalize -only-needed %S/Inputs/linkage.b.ll %S/Inputs/linkage.c.ll | FileCheck %s -check-prefix=B -check-prefix=CN
+; RUN: llvm-link -S -internalize %S/Inputs/linkage.b.ll %S/Inputs/linkage.c.ll %S/Inputs/linkage.d.ll | FileCheck %s -check-prefix=B -check-prefix=DI
 
 C-LABEL: @X = global i32 5
 CI-LABEL: @X = internal global i32 5
 CU-LABEL:@U = global i32 6
 CI-LABEL:@U = internal global i32 6
 CN-NOT:@U
+DI-LABEL: @Y = global i8 42
+DI-LABEL: @llvm.used = appending global [2 x i8*] [i8* @Y, i8* bitcast (i64 ()* @foo to i8*)], section "llvm.metadata"
 
 B-LABEL: define void @bar() {
 
@@ -17,3 +20,6 @@ CI-LABEL: define internal i32 @foo()
 CU-LABEL:define i32 @unused() {
 CI-LABEL:define internal i32 @unused() {
 CN-NOT:@unused()
+
+DI-LABEL: define internal i32 @foo.6()
+DI-LABEL: define i64 @foo()
diff --git a/test/MC/AArch64/alias-addsubimm.s b/test/MC/AArch64/alias-addsubimm.s
index 75e0a185572e094a1d6f801017c2c5c6c08eb20f..5c1c4799828cab95738a306573a51f3ad1756437 100644
--- a/test/MC/AArch64/alias-addsubimm.s
+++ b/test/MC/AArch64/alias-addsubimm.s
@@ -1,19 +1,24 @@
 // RUN: llvm-mc -triple=aarch64-none-linux-gnu < %s | FileCheck %s
+// RUN: not llvm-mc -mattr=+no-neg-immediates -triple=aarch64-none-linux-gnu < %s 2>&1 | FileCheck %s --check-prefix=CHECK-NO-NEG-IMM
 
 // CHECK: sub w0, w2, #2, lsl #12
 // CHECK: sub w0, w2, #2, lsl #12
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         sub w0, w2, #2, lsl 12
         add w0, w2, #-2, lsl 12
 // CHECK: sub x1, x3, #2, lsl #12
 // CHECK: sub x1, x3, #2, lsl #12
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         sub x1, x3, #2, lsl 12
         add x1, x3, #-2, lsl 12
 // CHECK: sub x1, x3, #4
 // CHECK: sub x1, x3, #4
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         sub x1, x3, #4
         add x1, x3, #-4
 // CHECK: sub x1, x3, #4095
 // CHECK: sub x1, x3, #4095
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         sub x1, x3, #4095, lsl 0
         add x1, x3, #-4095, lsl 0
 // CHECK: sub x3, x4, #0
@@ -21,18 +26,22 @@
 
 // CHECK: add w0, w2, #2, lsl #12
 // CHECK: add w0, w2, #2, lsl #12
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         add w0, w2, #2, lsl 12
         sub w0, w2, #-2, lsl 12
 // CHECK: add x1, x3, #2, lsl #12
 // CHECK: add x1, x3, #2, lsl #12
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         add x1, x3, #2, lsl 12
         sub x1, x3, #-2, lsl 12
 // CHECK: add x1, x3, #4
 // CHECK: add x1, x3, #4
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         add x1, x3, #4
         sub x1, x3, #-4
 // CHECK: add x1, x3, #4095
 // CHECK: add x1, x3, #4095
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         add x1, x3, #4095, lsl 0
         sub x1, x3, #-4095, lsl 0
 // CHECK: add x2, x5, #0
@@ -40,18 +49,22 @@
 
 // CHECK: subs w0, w2, #2, lsl #12
 // CHECK: subs w0, w2, #2, lsl #12
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         subs w0, w2, #2, lsl 12
         adds w0, w2, #-2, lsl 12
 // CHECK: subs x1, x3, #2, lsl #12
 // CHECK: subs x1, x3, #2, lsl #12
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         subs x1, x3, #2, lsl 12
         adds x1, x3, #-2, lsl 12
 // CHECK: subs x1, x3, #4
 // CHECK: subs x1, x3, #4
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         subs x1, x3, #4
         adds x1, x3, #-4
 // CHECK: subs x1, x3, #4095
 // CHECK: subs x1, x3, #4095
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         subs x1, x3, #4095, lsl 0
         adds x1, x3, #-4095, lsl 0
 // CHECK: subs x3, x4, #0
@@ -59,18 +72,22 @@
 
 // CHECK: adds w0, w2, #2, lsl #12
 // CHECK: adds w0, w2, #2, lsl #12
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         adds w0, w2, #2, lsl 12
         subs w0, w2, #-2, lsl 12
 // CHECK: adds x1, x3, #2, lsl #12
 // CHECK: adds x1, x3, #2, lsl #12
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         adds x1, x3, #2, lsl 12
         subs x1, x3, #-2, lsl 12
 // CHECK: adds x1, x3, #4
 // CHECK: adds x1, x3, #4
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         adds x1, x3, #4
         subs x1, x3, #-4
 // CHECK: adds x1, x3, #4095
 // CHECK: adds x1, x3, #4095
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         adds x1, x3, #4095, lsl 0
         subs x1, x3, #-4095, lsl 0
 // CHECK: adds x2, x5, #0
@@ -78,17 +95,21 @@
 
 // CHECK: {{adds xzr,|cmn}} x5, #5
 // CHECK: {{adds xzr,|cmn}} x5, #5
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         cmn x5, #5
         cmp x5, #-5
 // CHECK: {{subs xzr,|cmp}} x6, #4095
 // CHECK: {{subs xzr,|cmp}} x6, #4095
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         cmp x6, #4095
         cmn x6, #-4095
 // CHECK: {{adds wzr,|cmn}} w7, #5
 // CHECK: {{adds wzr,|cmn}} w7, #5
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         cmn w7, #5
         cmp w7, #-5
 // CHECK: {{subs wzr,|cmp}} w8, #4095
 // CHECK: {{subs wzr,|cmp}} w8, #4095
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         cmp w8, #4095
         cmn w8, #-4095
diff --git a/test/MC/AArch64/alias-logicalimm.s b/test/MC/AArch64/alias-logicalimm.s
index 28ec40beac4d47f1c5efb9b0f563b197361d3fbe..427a06d6514fcdbcf5ced80a9055d908d204422f 100644
--- a/test/MC/AArch64/alias-logicalimm.s
+++ b/test/MC/AArch64/alias-logicalimm.s
@@ -1,41 +1,50 @@
 // RUN: llvm-mc -triple=aarch64-none-linux-gnu < %s | FileCheck %s
+// RUN: not llvm-mc -mattr=+no-neg-immediates -triple=aarch64-none-linux-gnu < %s 2>&1 | FileCheck %s --check-prefix=CHECK-NO-NEG-IMM
 
 // CHECK: and x0, x1, #0xfffffffffffffffd
 // CHECK: and x0, x1, #0xfffffffffffffffd
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         and x0, x1, #~2
         bic x0, x1, #2
 
 // CHECK: and w0, w1, #0xfffffffd
 // CHECK: and w0, w1, #0xfffffffd
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         and w0, w1, #~2
         bic w0, w1, #2
 
 // CHECK: ands x0, x1, #0xfffffffffffffffd
 // CHECK: ands x0, x1, #0xfffffffffffffffd
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         ands x0, x1, #~2
         bics x0, x1, #2
 
 // CHECK: ands w0, w1, #0xfffffffd
 // CHECK: ands w0, w1, #0xfffffffd
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         ands w0, w1, #~2
         bics w0, w1, #2
 
 // CHECK: orr x0, x1, #0xfffffffffffffffd
 // CHECK: orr x0, x1, #0xfffffffffffffffd
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         orr x0, x1, #~2
         orn x0, x1, #2
 
 // CHECK: orr w2, w1, #0xfffffffc
 // CHECK: orr w2, w1, #0xfffffffc
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         orr w2, w1, #~3
         orn w2, w1, #3
 
 // CHECK: eor x0, x1, #0xfffffffffffffffd
 // CHECK: eor x0, x1, #0xfffffffffffffffd
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         eor x0, x1, #~2
         eon x0, x1, #2
 
 // CHECK: eor w2, w1, #0xfffffffc
 // CHECK: eor w2, w1, #0xfffffffc
+// CHECK-NO-NEG-IMM: instruction requires: NegativeImmediates
         eor w2, w1, #~3
         eon w2, w1, #3
diff --git a/test/MC/AArch64/armv8.1a-lse.s b/test/MC/AArch64/armv8.1a-lse.s
new file mode 100644
index 0000000000000000000000000000000000000000..6143d0e138003a36a07b6f881582e4ded9a9ad40
--- /dev/null
+++ b/test/MC/AArch64/armv8.1a-lse.s
@@ -0,0 +1,5175 @@
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.1a,+lse -show-encoding < %s 2> %t | FileCheck %s
+// RUN: FileCheck -check-prefix=CHECK-ERROR < %t %s
+  .text
+
+  cas w0, w1, [x2]
+  cas w2, w3, [sp]
+  casa w0, w1, [x2]
+  casa w2, w3, [sp]
+  casl w0, w1, [x2]
+  casl w2, w3, [sp]
+  casal w0, w1, [x2]
+  casal w2, w3, [sp]
+  // CHECK: cas w0, w1, [x2]      // encoding: [0x41,0x7c,0xa0,0x88]
+  // CHECK: cas w2, w3, [sp]      // encoding: [0xe3,0x7f,0xa2,0x88]
+  // CHECK: casa w0, w1, [x2]     // encoding: [0x41,0x7c,0xe0,0x88]
+  // CHECK: casa w2, w3, [sp]     // encoding: [0xe3,0x7f,0xe2,0x88]
+  // CHECK: casl w0, w1, [x2]     // encoding: [0x41,0xfc,0xa0,0x88]
+  // CHECK: casl w2, w3, [sp]     // encoding: [0xe3,0xff,0xa2,0x88]
+  // CHECK: casal w0, w1, [x2]    // encoding: [0x41,0xfc,0xe0,0x88]
+  // CHECK: casal w2, w3, [sp]    // encoding: [0xe3,0xff,0xe2,0x88]
+
+  casb w0, w1, [x2]
+  casb w2, w3, [sp]
+  cash w0, w1, [x2]
+  cash w2, w3, [sp]
+  casab w0, w1, [x2]
+  casab w2, w3, [sp]
+  caslb w0, w1, [x2]
+  caslb w2, w3, [sp]
+  // CHECK: casb w0, w1, [x2]     // encoding: [0x41,0x7c,0xa0,0x08]
+  // CHECK: casb w2, w3, [sp]     // encoding: [0xe3,0x7f,0xa2,0x08]
+  // CHECK: cash w0, w1, [x2]     // encoding: [0x41,0x7c,0xa0,0x48]
+  // CHECK: cash w2, w3, [sp]     // encoding: [0xe3,0x7f,0xa2,0x48]
+  // CHECK: casab w0, w1, [x2]    // encoding: [0x41,0x7c,0xe0,0x08]
+  // CHECK: casab w2, w3, [sp]    // encoding: [0xe3,0x7f,0xe2,0x08]
+  // CHECK: caslb w0, w1, [x2]    // encoding: [0x41,0xfc,0xa0,0x08]
+  // CHECK: caslb w2, w3, [sp]    // encoding: [0xe3,0xff,0xa2,0x08]
+
+  casalb w0, w1, [x2]
+  casalb w2, w3, [sp]
+  casah w0, w1, [x2]
+  casah w2, w3, [sp]
+  caslh w0, w1, [x2]
+  caslh w2, w3, [sp]
+  casalh w0, w1, [x2]
+  casalh w2, w3, [sp]
+  // CHECK: casalb w0, w1, [x2]   // encoding: [0x41,0xfc,0xe0,0x08]
+  // CHECK: casalb w2, w3, [sp]   // encoding: [0xe3,0xff,0xe2,0x08]
+  // CHECK: casah w0, w1, [x2]    // encoding: [0x41,0x7c,0xe0,0x48]
+  // CHECK: casah w2, w3, [sp]    // encoding: [0xe3,0x7f,0xe2,0x48]
+  // CHECK: caslh w0, w1, [x2]    // encoding: [0x41,0xfc,0xa0,0x48]
+  // CHECK: caslh w2, w3, [sp]    // encoding: [0xe3,0xff,0xa2,0x48]
+  // CHECK: casalh w0, w1, [x2]   // encoding: [0x41,0xfc,0xe0,0x48]
+  // CHECK: casalh w2, w3, [sp]   // encoding: [0xe3,0xff,0xe2,0x48]
+
+  cas x0, x1, [x2]
+  cas x2, x3, [sp]
+  casa x0, x1, [x2]
+  casa x2, x3, [sp]
+  casl x0, x1, [x2]
+  casl x2, x3, [sp]
+  casal x0, x1, [x2]
+  casal x2, x3, [sp]
+  // CHECK: cas x0, x1, [x2]      // encoding: [0x41,0x7c,0xa0,0xc8]
+  // CHECK: cas x2, x3, [sp]      // encoding: [0xe3,0x7f,0xa2,0xc8]
+  // CHECK: casa x0, x1, [x2]     // encoding: [0x41,0x7c,0xe0,0xc8]
+  // CHECK: casa x2, x3, [sp]     // encoding: [0xe3,0x7f,0xe2,0xc8]
+  // CHECK: casl x0, x1, [x2]     // encoding: [0x41,0xfc,0xa0,0xc8]
+  // CHECK: casl x2, x3, [sp]     // encoding: [0xe3,0xff,0xa2,0xc8]
+  // CHECK: casal x0, x1, [x2]    // encoding: [0x41,0xfc,0xe0,0xc8]
+  // CHECK: casal x2, x3, [sp]    // encoding: [0xe3,0xff,0xe2,0xc8]
+
+  swp w0, w1, [x2]
+  swp w2, w3, [sp]
+  swpa w0, w1, [x2]
+  swpa w2, w3, [sp]
+  swpl w0, w1, [x2]
+  swpl w2, w3, [sp]
+  swpal w0, w1, [x2]
+  swpal w2, w3, [sp]
+  // CHECK: swp w0, w1, [x2]      // encoding: [0x41,0x80,0x20,0xb8]
+  // CHECK: swp w2, w3, [sp]      // encoding: [0xe3,0x83,0x22,0xb8]
+  // CHECK: swpa w0, w1, [x2]     // encoding: [0x41,0x80,0xa0,0xb8]
+  // CHECK: swpa w2, w3, [sp]     // encoding: [0xe3,0x83,0xa2,0xb8]
+  // CHECK: swpl w0, w1, [x2]     // encoding: [0x41,0x80,0x60,0xb8]
+  // CHECK: swpl w2, w3, [sp]     // encoding: [0xe3,0x83,0x62,0xb8]
+  // CHECK: swpal w0, w1, [x2]    // encoding: [0x41,0x80,0xe0,0xb8]
+  // CHECK: swpal w2, w3, [sp]    // encoding: [0xe3,0x83,0xe2,0xb8]
+
+  swpb w0, w1, [x2]
+  swpb w2, w3, [sp]
+  swph w0, w1, [x2]
+  swph w2, w3, [sp]
+  swpab w0, w1, [x2]
+  swpab w2, w3, [sp]
+  swplb w0, w1, [x2]
+  swplb w2, w3, [sp]
+  // CHECK: swpb w0, w1, [x2]     // encoding: [0x41,0x80,0x20,0x38]
+  // CHECK: swpb w2, w3, [sp]     // encoding: [0xe3,0x83,0x22,0x38]
+  // CHECK: swph w0, w1, [x2]     // encoding: [0x41,0x80,0x20,0x78]
+  // CHECK: swph w2, w3, [sp]     // encoding: [0xe3,0x83,0x22,0x78]
+  // CHECK: swpab w0, w1, [x2]    // encoding: [0x41,0x80,0xa0,0x38]
+  // CHECK: swpab w2, w3, [sp]    // encoding: [0xe3,0x83,0xa2,0x38]
+  // CHECK: swplb w0, w1, [x2]    // encoding: [0x41,0x80,0x60,0x38]
+  // CHECK: swplb w2, w3, [sp]    // encoding: [0xe3,0x83,0x62,0x38]
+
+  swpalb w0, w1, [x2]
+  swpalb w2, w3, [sp]
+  swpah w0, w1, [x2]
+  swpah w2, w3, [sp]
+  swplh w0, w1, [x2]
+  swplh w2, w3, [sp]
+  swpalh w0, w1, [x2]
+  swpalh w2, w3, [sp]
+  // CHECK: swpalb w0, w1, [x2]   // encoding: [0x41,0x80,0xe0,0x38]
+  // CHECK: swpalb w2, w3, [sp]   // encoding: [0xe3,0x83,0xe2,0x38]
+  // CHECK: swpah w0, w1, [x2]    // encoding: [0x41,0x80,0xa0,0x78]
+  // CHECK: swpah w2, w3, [sp]    // encoding: [0xe3,0x83,0xa2,0x78]
+  // CHECK: swplh w0, w1, [x2]    // encoding: [0x41,0x80,0x60,0x78]
+  // CHECK: swplh w2, w3, [sp]    // encoding: [0xe3,0x83,0x62,0x78]
+  // CHECK: swpalh w0, w1, [x2]   // encoding: [0x41,0x80,0xe0,0x78]
+  // CHECK: swpalh w2, w3, [sp]   // encoding: [0xe3,0x83,0xe2,0x78]
+
+  swp x0, x1, [x2]
+  swp x2, x3, [sp]
+  swpa x0, x1, [x2]
+  swpa x2, x3, [sp]
+  swpl x0, x1, [x2]
+  swpl x2, x3, [sp]
+  swpal x0, x1, [x2]
+  swpal x2, x3, [sp]
+  // CHECK: swp x0, x1, [x2]      // encoding: [0x41,0x80,0x20,0xf8]
+  // CHECK: swp x2, x3, [sp]      // encoding: [0xe3,0x83,0x22,0xf8]
+  // CHECK: swpa x0, x1, [x2]     // encoding: [0x41,0x80,0xa0,0xf8]
+  // CHECK: swpa x2, x3, [sp]     // encoding: [0xe3,0x83,0xa2,0xf8]
+  // CHECK: swpl x0, x1, [x2]     // encoding: [0x41,0x80,0x60,0xf8]
+  // CHECK: swpl x2, x3, [sp]     // encoding: [0xe3,0x83,0x62,0xf8]
+  // CHECK: swpal x0, x1, [x2]    // encoding: [0x41,0x80,0xe0,0xf8]
+  // CHECK: swpal x2, x3, [sp]    // encoding: [0xe3,0x83,0xe2,0xf8]
+
+  casp w0, w1, w2, w3, [x5]
+  casp w4, w5, w6, w7, [sp]
+  casp x0, x1, x2, x3, [x2]
+  casp x4, x5, x6, x7, [sp]
+  caspa w0, w1, w2, w3, [x5]
+  caspa w4, w5, w6, w7, [sp]
+  caspa x0, x1, x2, x3, [x2]
+  caspa x4, x5, x6, x7, [sp]
+  // CHECK: casp w0, w1, w2, w3, [x5]     // encoding: [0xa2,0x7c,0x20,0x08]
+  // CHECK: casp w4, w5, w6, w7, [sp]     // encoding: [0xe6,0x7f,0x24,0x08]
+  // CHECK: casp x0, x1, x2, x3, [x2]     // encoding: [0x42,0x7c,0x20,0x48]
+  // CHECK: casp x4, x5, x6, x7, [sp]     // encoding: [0xe6,0x7f,0x24,0x48]
+  // CHECK: caspa w0, w1, w2, w3, [x5]    // encoding: [0xa2,0x7c,0x60,0x08]
+  // CHECK: caspa w4, w5, w6, w7, [sp]    // encoding: [0xe6,0x7f,0x64,0x08]
+  // CHECK: caspa x0, x1, x2, x3, [x2]    // encoding: [0x42,0x7c,0x60,0x48]
+  // CHECK: caspa x4, x5, x6, x7, [sp]    // encoding: [0xe6,0x7f,0x64,0x48]
+
+  caspl w0, w1, w2, w3, [x5]
+  caspl w4, w5, w6, w7, [sp]
+  caspl x0, x1, x2, x3, [x2]
+  caspl x4, x5, x6, x7, [sp]
+  caspal w0, w1, w2, w3, [x5]
+  caspal w4, w5, w6, w7, [sp]
+  caspal x0, x1, x2, x3, [x2]
+  caspal x4, x5, x6, x7, [sp]
+  // CHECK: caspl w0, w1, w2, w3, [x5]    // encoding: [0xa2,0xfc,0x20,0x08]
+  // CHECK: caspl w4, w5, w6, w7, [sp]    // encoding: [0xe6,0xff,0x24,0x08]
+  // CHECK: caspl x0, x1, x2, x3, [x2]    // encoding: [0x42,0xfc,0x20,0x48]
+  // CHECK: caspl x4, x5, x6, x7, [sp]    // encoding: [0xe6,0xff,0x24,0x48]
+  // CHECK: caspal w0, w1, w2, w3, [x5]   // encoding: [0xa2,0xfc,0x60,0x08]
+  // CHECK: caspal w4, w5, w6, w7, [sp]   // encoding: [0xe6,0xff,0x64,0x08]
+  // CHECK: caspal x0, x1, x2, x3, [x2]   // encoding: [0x42,0xfc,0x60,0x48]
+  // CHECK: caspal x4, x5, x6, x7, [sp]   // encoding: [0xe6,0xff,0x64,0x48]
+
+  ldadd w0, w1, [x2]
+  ldadd w2, w3, [sp]
+  ldadda w0, w1, [x2]
+  ldadda w2, w3, [sp]
+  ldaddl w0, w1, [x2]
+  ldaddl w2, w3, [sp]
+  ldaddal w0, w1, [x2]
+  ldaddal w2, w3, [sp]
+  // CHECK: ldadd w0, w1, [x2]     // encoding: [0x41,0x00,0x20,0xb8]
+  // CHECK: ldadd w2, w3, [sp]     // encoding: [0xe3,0x03,0x22,0xb8]
+  // CHECK: ldadda w0, w1, [x2]    // encoding: [0x41,0x00,0xa0,0xb8]
+  // CHECK: ldadda w2, w3, [sp]    // encoding: [0xe3,0x03,0xa2,0xb8]
+  // CHECK: ldaddl w0, w1, [x2]    // encoding: [0x41,0x00,0x60,0xb8]
+  // CHECK: ldaddl w2, w3, [sp]    // encoding: [0xe3,0x03,0x62,0xb8]
+  // CHECK: ldaddal w0, w1, [x2]   // encoding: [0x41,0x00,0xe0,0xb8]
+  // CHECK: ldaddal w2, w3, [sp]   // encoding: [0xe3,0x03,0xe2,0xb8]
+
+  ldaddb w0, w1, [x2]
+  ldaddb w2, w3, [sp]
+  ldaddh w0, w1, [x2]
+  ldaddh w2, w3, [sp]
+  ldaddab w0, w1, [x2]
+  ldaddab w2, w3, [sp]
+  ldaddlb w0, w1, [x2]
+  ldaddlb w2, w3, [sp]
+  // CHECK: ldaddb w0, w1, [x2]       // encoding: [0x41,0x00,0x20,0x38]
+  // CHECK: ldaddb w2, w3, [sp]       // encoding: [0xe3,0x03,0x22,0x38]
+  // CHECK: ldaddh w0, w1, [x2]       // encoding: [0x41,0x00,0x20,0x78]
+  // CHECK: ldaddh w2, w3, [sp]       // encoding: [0xe3,0x03,0x22,0x78]
+  // CHECK: ldaddab w0, w1, [x2]      // encoding: [0x41,0x00,0xa0,0x38]
+  // CHECK: ldaddab w2, w3, [sp]      // encoding: [0xe3,0x03,0xa2,0x38]
+  // CHECK: ldaddlb w0, w1, [x2]      // encoding: [0x41,0x00,0x60,0x38]
+  // CHECK: ldaddlb w2, w3, [sp]      // encoding: [0xe3,0x03,0x62,0x38]
+
+  ldaddalb w0, w1, [x2]
+  ldaddalb w2, w3, [sp]
+  ldaddah w0, w1, [x2]
+  ldaddah w2, w3, [sp]
+  ldaddlh w0, w1, [x2]
+  ldaddlh w2, w3, [sp]
+  ldaddalh w0, w1, [x2]
+  ldaddalh w2, w3, [sp]
+  // CHECK: ldaddalb w0, w1, [x2]   // encoding: [0x41,0x00,0xe0,0x38]
+  // CHECK: ldaddalb w2, w3, [sp]   // encoding: [0xe3,0x03,0xe2,0x38]
+  // CHECK: ldaddah w0, w1, [x2]    // encoding: [0x41,0x00,0xa0,0x78]
+  // CHECK: ldaddah w2, w3, [sp]    // encoding: [0xe3,0x03,0xa2,0x78]
+  // CHECK: ldaddlh w0, w1, [x2]    // encoding: [0x41,0x00,0x60,0x78]
+  // CHECK: ldaddlh w2, w3, [sp]    // encoding: [0xe3,0x03,0x62,0x78]
+  // CHECK: ldaddalh w0, w1, [x2]   // encoding: [0x41,0x00,0xe0,0x78]
+  // CHECK: ldaddalh w2, w3, [sp]   // encoding: [0xe3,0x03,0xe2,0x78]
+
+  ldadd x0, x1, [x2]
+  ldadd x2, x3, [sp]
+  ldadda x0, x1, [x2]
+  ldadda x2, x3, [sp]
+  ldaddl x0, x1, [x2]
+  ldaddl x2, x3, [sp]
+  ldaddal x0, x1, [x2]
+  ldaddal x2, x3, [sp]
+  // CHECK: ldadd x0, x1, [x2]    // encoding: [0x41,0x00,0x20,0xf8]
+  // CHECK: ldadd x2, x3, [sp]    // encoding: [0xe3,0x03,0x22,0xf8]
+  // CHECK: ldadda x0, x1, [x2]   // encoding: [0x41,0x00,0xa0,0xf8]
+  // CHECK: ldadda x2, x3, [sp]   // encoding: [0xe3,0x03,0xa2,0xf8]
+  // CHECK: ldaddl x0, x1, [x2]   // encoding: [0x41,0x00,0x60,0xf8]
+  // CHECK: ldaddl x2, x3, [sp]   // encoding: [0xe3,0x03,0x62,0xf8]
+  // CHECK: ldaddal x0, x1, [x2]  // encoding: [0x41,0x00,0xe0,0xf8]
+  // CHECK: ldaddal x2, x3, [sp]  // encoding: [0xe3,0x03,0xe2,0xf8]
+
+  ldclr w0, w1, [x2]
+  ldclr w2, w3, [sp]
+  ldclra w0, w1, [x2]
+  ldclra w2, w3, [sp]
+  ldclrl w0, w1, [x2]
+  ldclrl w2, w3, [sp]
+  ldclral w0, w1, [x2]
+  ldclral w2, w3, [sp]
+  // CHECK: ldclr w0, w1, [x2]    // encoding: [0x41,0x10,0x20,0xb8]
+  // CHECK: ldclr w2, w3, [sp]    // encoding: [0xe3,0x13,0x22,0xb8]
+  // CHECK: ldclra w0, w1, [x2]   // encoding: [0x41,0x10,0xa0,0xb8]
+  // CHECK: ldclra w2, w3, [sp]   // encoding: [0xe3,0x13,0xa2,0xb8]
+  // CHECK: ldclrl w0, w1, [x2]   // encoding: [0x41,0x10,0x60,0xb8]
+  // CHECK: ldclrl w2, w3, [sp]   // encoding: [0xe3,0x13,0x62,0xb8]
+  // CHECK: ldclral w0, w1, [x2]  // encoding: [0x41,0x10,0xe0,0xb8]
+  // CHECK: ldclral w2, w3, [sp]  // encoding: [0xe3,0x13,0xe2,0xb8]
+
+  ldclrb w0, w1, [x2]
+  ldclrb w2, w3, [sp]
+  ldclrh w0, w1, [x2]
+  ldclrh w2, w3, [sp]
+  ldclrab w0, w1, [x2]
+  ldclrab w2, w3, [sp]
+  ldclrlb w0, w1, [x2]
+  ldclrlb w2, w3, [sp]
+  // CHECK: ldclrb w0, w1, [x2]    // encoding: [0x41,0x10,0x20,0x38]
+  // CHECK: ldclrb w2, w3, [sp]    // encoding: [0xe3,0x13,0x22,0x38]
+  // CHECK: ldclrh w0, w1, [x2]    // encoding: [0x41,0x10,0x20,0x78]
+  // CHECK: ldclrh w2, w3, [sp]    // encoding: [0xe3,0x13,0x22,0x78]
+  // CHECK: ldclrab w0, w1, [x2]   // encoding: [0x41,0x10,0xa0,0x38]
+  // CHECK: ldclrab w2, w3, [sp]   // encoding: [0xe3,0x13,0xa2,0x38]
+  // CHECK: ldclrlb w0, w1, [x2]   // encoding: [0x41,0x10,0x60,0x38]
+  // CHECK: ldclrlb w2, w3, [sp]   // encoding: [0xe3,0x13,0x62,0x38]
+
+  ldclralb w0, w1, [x2]
+  ldclralb w2, w3, [sp]
+  ldclrah w0, w1, [x2]
+  ldclrah w2, w3, [sp]
+  ldclrlh w0, w1, [x2]
+  ldclrlh w2, w3, [sp]
+  ldclralh w0, w1, [x2]
+  ldclralh w2, w3, [sp]
+  // CHECK: ldclralb w0, w1, [x2]   // encoding: [0x41,0x10,0xe0,0x38]
+  // CHECK: ldclralb w2, w3, [sp]   // encoding: [0xe3,0x13,0xe2,0x38]
+  // CHECK: ldclrah w0, w1, [x2]    // encoding: [0x41,0x10,0xa0,0x78]
+  // CHECK: ldclrah w2, w3, [sp]    // encoding: [0xe3,0x13,0xa2,0x78]
+  // CHECK: ldclrlh w0, w1, [x2]    // encoding: [0x41,0x10,0x60,0x78]
+  // CHECK: ldclrlh w2, w3, [sp]    // encoding: [0xe3,0x13,0x62,0x78]
+  // CHECK: ldclralh w0, w1, [x2]   // encoding: [0x41,0x10,0xe0,0x78]
+  // CHECK: ldclralh w2, w3, [sp]   // encoding: [0xe3,0x13,0xe2,0x78]
+
+  ldclr x0, x1, [x2]
+  ldclr x2, x3, [sp]
+  ldclra x0, x1, [x2]
+  ldclra x2, x3, [sp]
+  ldclrl x0, x1, [x2]
+  ldclrl x2, x3, [sp]
+  ldclral x0, x1, [x2]
+  ldclral x2, x3, [sp]
+  // CHECK: ldclr x0, x1, [x2]      // encoding: [0x41,0x10,0x20,0xf8]
+  // CHECK: ldclr x2, x3, [sp]      // encoding: [0xe3,0x13,0x22,0xf8]
+  // CHECK: ldclra x0, x1, [x2]     // encoding: [0x41,0x10,0xa0,0xf8]
+  // CHECK: ldclra x2, x3, [sp]     // encoding: [0xe3,0x13,0xa2,0xf8]
+  // CHECK: ldclrl x0, x1, [x2]     // encoding: [0x41,0x10,0x60,0xf8]
+  // CHECK: ldclrl x2, x3, [sp]     // encoding: [0xe3,0x13,0x62,0xf8]
+  // CHECK: ldclral x0, x1, [x2]    // encoding: [0x41,0x10,0xe0,0xf8]
+  // CHECK: ldclral x2, x3, [sp]    // encoding: [0xe3,0x13,0xe2,0xf8]
+
+  ldeor w0, w1, [x2]
+  ldeor w2, w3, [sp]
+  ldeora w0, w1, [x2]
+  ldeora w2, w3, [sp]
+  ldeorl w0, w1, [x2]
+  ldeorl w2, w3, [sp]
+  ldeoral w0, w1, [x2]
+  ldeoral w2, w3, [sp]
+  // CHECK: ldeor w0, w1, [x2]    // encoding: [0x41,0x20,0x20,0xb8]
+  // CHECK: ldeor w2, w3, [sp]    // encoding: [0xe3,0x23,0x22,0xb8]
+  // CHECK: ldeora w0, w1, [x2]   // encoding: [0x41,0x20,0xa0,0xb8]
+  // CHECK: ldeora w2, w3, [sp]   // encoding: [0xe3,0x23,0xa2,0xb8]
+  // CHECK: ldeorl w0, w1, [x2]   // encoding: [0x41,0x20,0x60,0xb8]
+  // CHECK: ldeorl w2, w3, [sp]   // encoding: [0xe3,0x23,0x62,0xb8]
+  // CHECK: ldeoral w0, w1, [x2]  // encoding: [0x41,0x20,0xe0,0xb8]
+  // CHECK: ldeoral w2, w3, [sp]  // encoding: [0xe3,0x23,0xe2,0xb8]
+
+  ldeorb w0, w1, [x2]
+  ldeorb w2, w3, [sp]
+  ldeorh w0, w1, [x2]
+  ldeorh w2, w3, [sp]
+  ldeorab w0, w1, [x2]
+  ldeorab w2, w3, [sp]
+  ldeorlb w0, w1, [x2]
+  ldeorlb w2, w3, [sp]
+  // CHECK: ldeorb w0, w1, [x2]    // encoding: [0x41,0x20,0x20,0x38]
+  // CHECK: ldeorb w2, w3, [sp]    // encoding: [0xe3,0x23,0x22,0x38]
+  // CHECK: ldeorh w0, w1, [x2]    // encoding: [0x41,0x20,0x20,0x78]
+  // CHECK: ldeorh w2, w3, [sp]    // encoding: [0xe3,0x23,0x22,0x78]
+  // CHECK: ldeorab w0, w1, [x2]   // encoding: [0x41,0x20,0xa0,0x38]
+  // CHECK: ldeorab w2, w3, [sp]   // encoding: [0xe3,0x23,0xa2,0x38]
+  // CHECK: ldeorlb w0, w1, [x2]   // encoding: [0x41,0x20,0x60,0x38]
+  // CHECK: ldeorlb w2, w3, [sp]   // encoding: [0xe3,0x23,0x62,0x38]
+
+  ldeoralb w0, w1, [x2]
+  ldeoralb w2, w3, [sp]
+  ldeorah w0, w1, [x2]
+  ldeorah w2, w3, [sp]
+  ldeorlh w0, w1, [x2]
+  ldeorlh w2, w3, [sp]
+  ldeoralh w0, w1, [x2]
+  ldeoralh w2, w3, [sp]
+  // CHECK: ldeoralb w0, w1, [x2]   // encoding: [0x41,0x20,0xe0,0x38]
+  // CHECK: ldeoralb w2, w3, [sp]   // encoding: [0xe3,0x23,0xe2,0x38]
+  // CHECK: ldeorah w0, w1, [x2]    // encoding: [0x41,0x20,0xa0,0x78]
+  // CHECK: ldeorah w2, w3, [sp]    // encoding: [0xe3,0x23,0xa2,0x78]
+  // CHECK: ldeorlh w0, w1, [x2]    // encoding: [0x41,0x20,0x60,0x78]
+  // CHECK: ldeorlh w2, w3, [sp]    // encoding: [0xe3,0x23,0x62,0x78]
+  // CHECK: ldeoralh w0, w1, [x2]   // encoding: [0x41,0x20,0xe0,0x78]
+  // CHECK: ldeoralh w2, w3, [sp]   // encoding: [0xe3,0x23,0xe2,0x78]
+
+  ldeor x0, x1, [x2]
+  ldeor x2, x3, [sp]
+  ldeora x0, x1, [x2]
+  ldeora x2, x3, [sp]
+  ldeorl x0, x1, [x2]
+  ldeorl x2, x3, [sp]
+  ldeoral x0, x1, [x2]
+  ldeoral x2, x3, [sp]
+  // CHECK: ldeor x0, x1, [x2]     // encoding: [0x41,0x20,0x20,0xf8]
+  // CHECK: ldeor x2, x3, [sp]     // encoding: [0xe3,0x23,0x22,0xf8]
+  // CHECK: ldeora x0, x1, [x2]    // encoding: [0x41,0x20,0xa0,0xf8]
+  // CHECK: ldeora x2, x3, [sp]    // encoding: [0xe3,0x23,0xa2,0xf8]
+  // CHECK: ldeorl x0, x1, [x2]    // encoding: [0x41,0x20,0x60,0xf8]
+  // CHECK: ldeorl x2, x3, [sp]    // encoding: [0xe3,0x23,0x62,0xf8]
+  // CHECK: ldeoral x0, x1, [x2]   // encoding: [0x41,0x20,0xe0,0xf8]
+  // CHECK: ldeoral x2, x3, [sp]   // encoding: [0xe3,0x23,0xe2,0xf8]
+
+  ldset w0, w1, [x2]
+  ldset w2, w3, [sp]
+  ldseta w0, w1, [x2]
+  ldseta w2, w3, [sp]
+  ldsetl w0, w1, [x2]
+  ldsetl w2, w3, [sp]
+  ldsetal w0, w1, [x2]
+  ldsetal w2, w3, [sp]
+  // CHECK: ldset w0, w1, [x2]      // encoding: [0x41,0x30,0x20,0xb8]
+  // CHECK: ldset w2, w3, [sp]      // encoding: [0xe3,0x33,0x22,0xb8]
+  // CHECK: ldseta w0, w1, [x2]     // encoding: [0x41,0x30,0xa0,0xb8]
+  // CHECK: ldseta w2, w3, [sp]     // encoding: [0xe3,0x33,0xa2,0xb8]
+  // CHECK: ldsetl w0, w1, [x2]     // encoding: [0x41,0x30,0x60,0xb8]
+  // CHECK: ldsetl w2, w3, [sp]     // encoding: [0xe3,0x33,0x62,0xb8]
+  // CHECK: ldsetal w0, w1, [x2]    // encoding: [0x41,0x30,0xe0,0xb8]
+  // CHECK: ldsetal w2, w3, [sp]    // encoding: [0xe3,0x33,0xe2,0xb8]
+
+  ldsetb w0, w1, [x2]
+  ldsetb w2, w3, [sp]
+  ldseth w0, w1, [x2]
+  ldseth w2, w3, [sp]
+  ldsetab w0, w1, [x2]
+  ldsetab w2, w3, [sp]
+  ldsetlb w0, w1, [x2]
+  ldsetlb w2, w3, [sp]
+  // CHECK: ldsetb w0, w1, [x2]     // encoding: [0x41,0x30,0x20,0x38]
+  // CHECK: ldsetb w2, w3, [sp]     // encoding: [0xe3,0x33,0x22,0x38]
+  // CHECK: ldseth w0, w1, [x2]     // encoding: [0x41,0x30,0x20,0x78]
+  // CHECK: ldseth w2, w3, [sp]     // encoding: [0xe3,0x33,0x22,0x78]
+  // CHECK: ldsetab w0, w1, [x2]    // encoding: [0x41,0x30,0xa0,0x38]
+  // CHECK: ldsetab w2, w3, [sp]    // encoding: [0xe3,0x33,0xa2,0x38]
+  // CHECK: ldsetlb w0, w1, [x2]    // encoding: [0x41,0x30,0x60,0x38]
+  // CHECK: ldsetlb w2, w3, [sp]    // encoding: [0xe3,0x33,0x62,0x38]
+
+  ldsetalb w0, w1, [x2]
+  ldsetalb w2, w3, [sp]
+  ldsetah w0, w1, [x2]
+  ldsetah w2, w3, [sp]
+  ldsetlh w0, w1, [x2]
+  ldsetlh w2, w3, [sp]
+  ldsetalh w0, w1, [x2]
+  ldsetalh w2, w3, [sp]
+  // CHECK: ldsetalb w0, w1, [x2]     // encoding: [0x41,0x30,0xe0,0x38]
+  // CHECK: ldsetalb w2, w3, [sp]     // encoding: [0xe3,0x33,0xe2,0x38]
+  // CHECK: ldsetah w0, w1, [x2]      // encoding: [0x41,0x30,0xa0,0x78]
+  // CHECK: ldsetah w2, w3, [sp]      // encoding: [0xe3,0x33,0xa2,0x78]
+  // CHECK: ldsetlh w0, w1, [x2]      // encoding: [0x41,0x30,0x60,0x78]
+  // CHECK: ldsetlh w2, w3, [sp]      // encoding: [0xe3,0x33,0x62,0x78]
+  // CHECK: ldsetalh w0, w1, [x2]     // encoding: [0x41,0x30,0xe0,0x78]
+  // CHECK: ldsetalh w2, w3, [sp]     // encoding: [0xe3,0x33,0xe2,0x78]
+
+  ldset x0, x1, [x2]
+  ldset x2, x3, [sp]
+  ldseta x0, x1, [x2]
+  ldseta x2, x3, [sp]
+  ldsetl x0, x1, [x2]
+  ldsetl x2, x3, [sp]
+  ldsetal x0, x1, [x2]
+  ldsetal x2, x3, [sp]
+  // CHECK: ldset x0, x1, [x2]     // encoding: [0x41,0x30,0x20,0xf8]
+  // CHECK: ldset x2, x3, [sp]     // encoding: [0xe3,0x33,0x22,0xf8]
+  // CHECK: ldseta x0, x1, [x2]    // encoding: [0x41,0x30,0xa0,0xf8]
+  // CHECK: ldseta x2, x3, [sp]    // encoding: [0xe3,0x33,0xa2,0xf8]
+  // CHECK: ldsetl x0, x1, [x2]    // encoding: [0x41,0x30,0x60,0xf8]
+  // CHECK: ldsetl x2, x3, [sp]    // encoding: [0xe3,0x33,0x62,0xf8]
+  // CHECK: ldsetal x0, x1, [x2]   // encoding: [0x41,0x30,0xe0,0xf8]
+  // CHECK: ldsetal x2, x3, [sp]   // encoding: [0xe3,0x33,0xe2,0xf8]
+
+  ldsmax w0, w1, [x2]
+  ldsmax w2, w3, [sp]
+  ldsmaxa w0, w1, [x2]
+  ldsmaxa w2, w3, [sp]
+  ldsmaxl w0, w1, [x2]
+  ldsmaxl w2, w3, [sp]
+  ldsmaxal w0, w1, [x2]
+  ldsmaxal w2, w3, [sp]
+  // CHECK: ldsmax w0, w1, [x2]     // encoding: [0x41,0x40,0x20,0xb8]
+  // CHECK: ldsmax w2, w3, [sp]     // encoding: [0xe3,0x43,0x22,0xb8]
+  // CHECK: ldsmaxa w0, w1, [x2]    // encoding: [0x41,0x40,0xa0,0xb8]
+  // CHECK: ldsmaxa w2, w3, [sp]    // encoding: [0xe3,0x43,0xa2,0xb8]
+  // CHECK: ldsmaxl w0, w1, [x2]    // encoding: [0x41,0x40,0x60,0xb8]
+  // CHECK: ldsmaxl w2, w3, [sp]    // encoding: [0xe3,0x43,0x62,0xb8]
+  // CHECK: ldsmaxal w0, w1, [x2]   // encoding: [0x41,0x40,0xe0,0xb8]
+  // CHECK: ldsmaxal w2, w3, [sp]   // encoding: [0xe3,0x43,0xe2,0xb8]
+
+  ldsmaxb w0, w1, [x2]
+  ldsmaxb w2, w3, [sp]
+  ldsmaxh w0, w1, [x2]
+  ldsmaxh w2, w3, [sp]
+  ldsmaxab w0, w1, [x2]
+  ldsmaxab w2, w3, [sp]
+  ldsmaxlb w0, w1, [x2]
+  ldsmaxlb w2, w3, [sp]
+  // CHECK: ldsmaxb w0, w1, [x2]     // encoding: [0x41,0x40,0x20,0x38]
+  // CHECK: ldsmaxb w2, w3, [sp]     // encoding: [0xe3,0x43,0x22,0x38]
+  // CHECK: ldsmaxh w0, w1, [x2]     // encoding: [0x41,0x40,0x20,0x78]
+  // CHECK: ldsmaxh w2, w3, [sp]     // encoding: [0xe3,0x43,0x22,0x78]
+  // CHECK: ldsmaxab w0, w1, [x2]    // encoding: [0x41,0x40,0xa0,0x38]
+  // CHECK: ldsmaxab w2, w3, [sp]    // encoding: [0xe3,0x43,0xa2,0x38]
+  // CHECK: ldsmaxlb w0, w1, [x2]    // encoding: [0x41,0x40,0x60,0x38]
+  // CHECK: ldsmaxlb w2, w3, [sp]    // encoding: [0xe3,0x43,0x62,0x38]
+
+  ldsmaxalb w0, w1, [x2]
+  ldsmaxalb w2, w3, [sp]
+  ldsmaxah w0, w1, [x2]
+  ldsmaxah w2, w3, [sp]
+  ldsmaxlh w0, w1, [x2]
+  ldsmaxlh w2, w3, [sp]
+  ldsmaxalh w0, w1, [x2]
+  ldsmaxalh w2, w3, [sp]
+  // CHECK: ldsmaxalb w0, w1, [x2]    // encoding: [0x41,0x40,0xe0,0x38]
+  // CHECK: ldsmaxalb w2, w3, [sp]    // encoding: [0xe3,0x43,0xe2,0x38]
+  // CHECK: ldsmaxah w0, w1, [x2]     // encoding: [0x41,0x40,0xa0,0x78]
+  // CHECK: ldsmaxah w2, w3, [sp]     // encoding: [0xe3,0x43,0xa2,0x78]
+  // CHECK: ldsmaxlh w0, w1, [x2]     // encoding: [0x41,0x40,0x60,0x78]
+  // CHECK: ldsmaxlh w2, w3, [sp]     // encoding: [0xe3,0x43,0x62,0x78]
+  // CHECK: ldsmaxalh w0, w1, [x2]    // encoding: [0x41,0x40,0xe0,0x78]
+  // CHECK: ldsmaxalh w2, w3, [sp]    // encoding: [0xe3,0x43,0xe2,0x78]
+
+  ldsmax x0, x1, [x2]
+  ldsmax x2, x3, [sp]
+  ldsmaxa x0, x1, [x2]
+  ldsmaxa x2, x3, [sp]
+  ldsmaxl x0, x1, [x2]
+  ldsmaxl x2, x3, [sp]
+  ldsmaxal x0, x1, [x2]
+  ldsmaxal x2, x3, [sp]
+  // CHECK: ldsmax x0, x1, [x2]     // encoding: [0x41,0x40,0x20,0xf8]
+  // CHECK: ldsmax x2, x3, [sp]     // encoding: [0xe3,0x43,0x22,0xf8]
+  // CHECK: ldsmaxa x0, x1, [x2]    // encoding: [0x41,0x40,0xa0,0xf8]
+  // CHECK: ldsmaxa x2, x3, [sp]    // encoding: [0xe3,0x43,0xa2,0xf8]
+  // CHECK: ldsmaxl x0, x1, [x2]    // encoding: [0x41,0x40,0x60,0xf8]
+  // CHECK: ldsmaxl x2, x3, [sp]    // encoding: [0xe3,0x43,0x62,0xf8]
+  // CHECK: ldsmaxal x0, x1, [x2]   // encoding: [0x41,0x40,0xe0,0xf8]
+  // CHECK: ldsmaxal x2, x3, [sp]   // encoding: [0xe3,0x43,0xe2,0xf8]
+
+  ldsmin w0, w1, [x2]
+  ldsmin w2, w3, [sp]
+  ldsmina w0, w1, [x2]
+  ldsmina w2, w3, [sp]
+  ldsminl w0, w1, [x2]
+  ldsminl w2, w3, [sp]
+  ldsminal w0, w1, [x2]
+  ldsminal w2, w3, [sp]
+  // CHECK: ldsmin w0, w1, [x2]     // encoding: [0x41,0x50,0x20,0xb8]
+  // CHECK: ldsmin w2, w3, [sp]     // encoding: [0xe3,0x53,0x22,0xb8]
+  // CHECK: ldsmina w0, w1, [x2]    // encoding: [0x41,0x50,0xa0,0xb8]
+  // CHECK: ldsmina w2, w3, [sp]    // encoding: [0xe3,0x53,0xa2,0xb8]
+  // CHECK: ldsminl w0, w1, [x2]    // encoding: [0x41,0x50,0x60,0xb8]
+  // CHECK: ldsminl w2, w3, [sp]    // encoding: [0xe3,0x53,0x62,0xb8]
+  // CHECK: ldsminal w0, w1, [x2]   // encoding: [0x41,0x50,0xe0,0xb8]
+  // CHECK: ldsminal w2, w3, [sp]   // encoding: [0xe3,0x53,0xe2,0xb8]
+
+  ldsminb w0, w1, [x2]
+  ldsminb w2, w3, [sp]
+  ldsminh w0, w1, [x2]
+  ldsminh w2, w3, [sp]
+  ldsminab w0, w1, [x2]
+  ldsminab w2, w3, [sp]
+  ldsminlb w0, w1, [x2]
+  ldsminlb w2, w3, [sp]
+  // CHECK: ldsminb w0, w1, [x2]      // encoding: [0x41,0x50,0x20,0x38]
+  // CHECK: ldsminb w2, w3, [sp]      // encoding: [0xe3,0x53,0x22,0x38]
+  // CHECK: ldsminh w0, w1, [x2]      // encoding: [0x41,0x50,0x20,0x78]
+  // CHECK: ldsminh w2, w3, [sp]      // encoding: [0xe3,0x53,0x22,0x78]
+  // CHECK: ldsminab w0, w1, [x2]     // encoding: [0x41,0x50,0xa0,0x38]
+  // CHECK: ldsminab w2, w3, [sp]     // encoding: [0xe3,0x53,0xa2,0x38]
+  // CHECK: ldsminlb w0, w1, [x2]     // encoding: [0x41,0x50,0x60,0x38]
+  // CHECK: ldsminlb w2, w3, [sp]     // encoding: [0xe3,0x53,0x62,0x38]
+
+  ldsminalb w0, w1, [x2]
+  ldsminalb w2, w3, [sp]
+  ldsminah w0, w1, [x2]
+  ldsminah w2, w3, [sp]
+  ldsminlh w0, w1, [x2]
+  ldsminlh w2, w3, [sp]
+  ldsminalh w0, w1, [x2]
+  ldsminalh w2, w3, [sp]
+  // CHECK: ldsminalb w0, w1, [x2]    // encoding: [0x41,0x50,0xe0,0x38]
+  // CHECK: ldsminalb w2, w3, [sp]    // encoding: [0xe3,0x53,0xe2,0x38]
+  // CHECK: ldsminah w0, w1, [x2]     // encoding: [0x41,0x50,0xa0,0x78]
+  // CHECK: ldsminah w2, w3, [sp]     // encoding: [0xe3,0x53,0xa2,0x78]
+  // CHECK: ldsminlh w0, w1, [x2]     // encoding: [0x41,0x50,0x60,0x78]
+  // CHECK: ldsminlh w2, w3, [sp]     // encoding: [0xe3,0x53,0x62,0x78]
+  // CHECK: ldsminalh w0, w1, [x2]    // encoding: [0x41,0x50,0xe0,0x78]
+  // CHECK: ldsminalh w2, w3, [sp]    // encoding: [0xe3,0x53,0xe2,0x78]
+
+  ldsmin x0, x1, [x2]
+  ldsmin x2, x3, [sp]
+  ldsmina x0, x1, [x2]
+  ldsmina x2, x3, [sp]
+  ldsminl x0, x1, [x2]
+  ldsminl x2, x3, [sp]
+  ldsminal x0, x1, [x2]
+  ldsminal x2, x3, [sp]
+  // CHECK: ldsmin x0, x1, [x2]     // encoding: [0x41,0x50,0x20,0xf8]
+  // CHECK: ldsmin x2, x3, [sp]     // encoding: [0xe3,0x53,0x22,0xf8]
+  // CHECK: ldsmina x0, x1, [x2]    // encoding: [0x41,0x50,0xa0,0xf8]
+  // CHECK: ldsmina x2, x3, [sp]    // encoding: [0xe3,0x53,0xa2,0xf8]
+  // CHECK: ldsminl x0, x1, [x2]    // encoding: [0x41,0x50,0x60,0xf8]
+  // CHECK: ldsminl x2, x3, [sp]    // encoding: [0xe3,0x53,0x62,0xf8]
+  // CHECK: ldsminal x0, x1, [x2]   // encoding: [0x41,0x50,0xe0,0xf8]
+  // CHECK: ldsminal x2, x3, [sp]   // encoding: [0xe3,0x53,0xe2,0xf8]
+
+  ldumax w0, w1, [x2]
+  ldumax w2, w3, [sp]
+  ldumaxa w0, w1, [x2]
+  ldumaxa w2, w3, [sp]
+  ldumaxl w0, w1, [x2]
+  ldumaxl w2, w3, [sp]
+  ldumaxal w0, w1, [x2]
+  ldumaxal w2, w3, [sp]
+  // CHECK: ldumax w0, w1, [x2]     // encoding: [0x41,0x60,0x20,0xb8]
+  // CHECK: ldumax w2, w3, [sp]     // encoding: [0xe3,0x63,0x22,0xb8]
+  // CHECK: ldumaxa w0, w1, [x2]    // encoding: [0x41,0x60,0xa0,0xb8]
+  // CHECK: ldumaxa w2, w3, [sp]    // encoding: [0xe3,0x63,0xa2,0xb8]
+  // CHECK: ldumaxl w0, w1, [x2]    // encoding: [0x41,0x60,0x60,0xb8]
+  // CHECK: ldumaxl w2, w3, [sp]    // encoding: [0xe3,0x63,0x62,0xb8]
+  // CHECK: ldumaxal w0, w1, [x2]   // encoding: [0x41,0x60,0xe0,0xb8]
+  // CHECK: ldumaxal w2, w3, [sp]   // encoding: [0xe3,0x63,0xe2,0xb8]
+
+  ldumaxb w0, w1, [x2]
+  ldumaxb w2, w3, [sp]
+  ldumaxh w0, w1, [x2]
+  ldumaxh w2, w3, [sp]
+  ldumaxab w0, w1, [x2]
+  ldumaxab w2, w3, [sp]
+  ldumaxlb w0, w1, [x2]
+  ldumaxlb w2, w3, [sp]
+  // CHECK: ldumaxb w0, w1, [x2]     // encoding: [0x41,0x60,0x20,0x38]
+  // CHECK: ldumaxb w2, w3, [sp]     // encoding: [0xe3,0x63,0x22,0x38]
+  // CHECK: ldumaxh w0, w1, [x2]     // encoding: [0x41,0x60,0x20,0x78]
+  // CHECK: ldumaxh w2, w3, [sp]     // encoding: [0xe3,0x63,0x22,0x78]
+  // CHECK: ldumaxab w0, w1, [x2]    // encoding: [0x41,0x60,0xa0,0x38]
+  // CHECK: ldumaxab w2, w3, [sp]    // encoding: [0xe3,0x63,0xa2,0x38]
+  // CHECK: ldumaxlb w0, w1, [x2]    // encoding: [0x41,0x60,0x60,0x38]
+  // CHECK: ldumaxlb w2, w3, [sp]    // encoding: [0xe3,0x63,0x62,0x38]
+
+  ldumaxalb w0, w1, [x2]
+  ldumaxalb w2, w3, [sp]
+  ldumaxah w0, w1, [x2]
+  ldumaxah w2, w3, [sp]
+  ldumaxlh w0, w1, [x2]
+  ldumaxlh w2, w3, [sp]
+  ldumaxalh w0, w1, [x2]
+  ldumaxalh w2, w3, [sp]
+  // CHECK: ldumaxalb w0, w1, [x2]    // encoding: [0x41,0x60,0xe0,0x38]
+  // CHECK: ldumaxalb w2, w3, [sp]    // encoding: [0xe3,0x63,0xe2,0x38]
+  // CHECK: ldumaxah w0, w1, [x2]     // encoding: [0x41,0x60,0xa0,0x78]
+  // CHECK: ldumaxah w2, w3, [sp]     // encoding: [0xe3,0x63,0xa2,0x78]
+  // CHECK: ldumaxlh w0, w1, [x2]     // encoding: [0x41,0x60,0x60,0x78]
+  // CHECK: ldumaxlh w2, w3, [sp]     // encoding: [0xe3,0x63,0x62,0x78]
+  // CHECK: ldumaxalh w0, w1, [x2]    // encoding: [0x41,0x60,0xe0,0x78]
+  // CHECK: ldumaxalh w2, w3, [sp]    // encoding: [0xe3,0x63,0xe2,0x78]
+
+  ldumax x0, x1, [x2]
+  ldumax x2, x3, [sp]
+  ldumaxa x0, x1, [x2]
+  ldumaxa x2, x3, [sp]
+  ldumaxl x0, x1, [x2]
+  ldumaxl x2, x3, [sp]
+  ldumaxal x0, x1, [x2]
+  ldumaxal x2, x3, [sp]
+  // CHECK: ldumax x0, x1, [x2]     // encoding: [0x41,0x60,0x20,0xf8]
+  // CHECK: ldumax x2, x3, [sp]     // encoding: [0xe3,0x63,0x22,0xf8]
+  // CHECK: ldumaxa x0, x1, [x2]    // encoding: [0x41,0x60,0xa0,0xf8]
+  // CHECK: ldumaxa x2, x3, [sp]    // encoding: [0xe3,0x63,0xa2,0xf8]
+  // CHECK: ldumaxl x0, x1, [x2]    // encoding: [0x41,0x60,0x60,0xf8]
+  // CHECK: ldumaxl x2, x3, [sp]    // encoding: [0xe3,0x63,0x62,0xf8]
+  // CHECK: ldumaxal x0, x1, [x2]   // encoding: [0x41,0x60,0xe0,0xf8]
+  // CHECK: ldumaxal x2, x3, [sp]   // encoding: [0xe3,0x63,0xe2,0xf8]
+
+  ldumin w0, w1, [x2]
+  ldumin w2, w3, [sp]
+  ldumina w0, w1, [x2]
+  ldumina w2, w3, [sp]
+  lduminl w0, w1, [x2]
+  lduminl w2, w3, [sp]
+  lduminal w0, w1, [x2]
+  lduminal w2, w3, [sp]
+  // CHECK: ldumin w0, w1, [x2]     // encoding: [0x41,0x70,0x20,0xb8]
+  // CHECK: ldumin w2, w3, [sp]     // encoding: [0xe3,0x73,0x22,0xb8]
+  // CHECK: ldumina w0, w1, [x2]    // encoding: [0x41,0x70,0xa0,0xb8]
+  // CHECK: ldumina w2, w3, [sp]    // encoding: [0xe3,0x73,0xa2,0xb8]
+  // CHECK: lduminl w0, w1, [x2]    // encoding: [0x41,0x70,0x60,0xb8]
+  // CHECK: lduminl w2, w3, [sp]    // encoding: [0xe3,0x73,0x62,0xb8]
+  // CHECK: lduminal w0, w1, [x2]   // encoding: [0x41,0x70,0xe0,0xb8]
+  // CHECK: lduminal w2, w3, [sp]   // encoding: [0xe3,0x73,0xe2,0xb8]
+
+  lduminb w0, w1, [x2]
+  lduminb w2, w3, [sp]
+  lduminh w0, w1, [x2]
+  lduminh w2, w3, [sp]
+  lduminab w0, w1, [x2]
+  lduminab w2, w3, [sp]
+  lduminlb w0, w1, [x2]
+  lduminlb w2, w3, [sp]
+  // CHECK: lduminb w0, w1, [x2]     // encoding: [0x41,0x70,0x20,0x38]
+  // CHECK: lduminb w2, w3, [sp]     // encoding: [0xe3,0x73,0x22,0x38]
+  // CHECK: lduminh w0, w1, [x2]     // encoding: [0x41,0x70,0x20,0x78]
+  // CHECK: lduminh w2, w3, [sp]     // encoding: [0xe3,0x73,0x22,0x78]
+  // CHECK: lduminab w0, w1, [x2]    // encoding: [0x41,0x70,0xa0,0x38]
+  // CHECK: lduminab w2, w3, [sp]    // encoding: [0xe3,0x73,0xa2,0x38]
+  // CHECK: lduminlb w0, w1, [x2]    // encoding: [0x41,0x70,0x60,0x38]
+  // CHECK: lduminlb w2, w3, [sp]    // encoding: [0xe3,0x73,0x62,0x38]
+
+  lduminalb w0, w1, [x2]
+  lduminalb w2, w3, [sp]
+  lduminah w0, w1, [x2]
+  lduminah w2, w3, [sp]
+  lduminlh w0, w1, [x2]
+  lduminlh w2, w3, [sp]
+  lduminalh w0, w1, [x2]
+  lduminalh w2, w3, [sp]
+  // CHECK: lduminalb w0, w1, [x2]    // encoding: [0x41,0x70,0xe0,0x38]
+  // CHECK: lduminalb w2, w3, [sp]    // encoding: [0xe3,0x73,0xe2,0x38]
+  // CHECK: lduminah w0, w1, [x2]     // encoding: [0x41,0x70,0xa0,0x78]
+  // CHECK: lduminah w2, w3, [sp]     // encoding: [0xe3,0x73,0xa2,0x78]
+  // CHECK: lduminlh w0, w1, [x2]     // encoding: [0x41,0x70,0x60,0x78]
+  // CHECK: lduminlh w2, w3, [sp]     // encoding: [0xe3,0x73,0x62,0x78]
+  // CHECK: lduminalh w0, w1, [x2]    // encoding: [0x41,0x70,0xe0,0x78]
+  // CHECK: lduminalh w2, w3, [sp]    // encoding: [0xe3,0x73,0xe2,0x78]
+
+  ldumin x0, x1, [x2]
+  ldumin x2, x3, [sp]
+  ldumina x0, x1, [x2]
+  ldumina x2, x3, [sp]
+  lduminl x0, x1, [x2]
+  lduminl x2, x3, [sp]
+  lduminal x0, x1, [x2]
+  lduminal x2, x3, [sp]
+  // CHECK: ldumin x0, x1, [x2]     // encoding: [0x41,0x70,0x20,0xf8]
+  // CHECK: ldumin x2, x3, [sp]     // encoding: [0xe3,0x73,0x22,0xf8]
+  // CHECK: ldumina x0, x1, [x2]    // encoding: [0x41,0x70,0xa0,0xf8]
+  // CHECK: ldumina x2, x3, [sp]    // encoding: [0xe3,0x73,0xa2,0xf8]
+  // CHECK: lduminl x0, x1, [x2]    // encoding: [0x41,0x70,0x60,0xf8]
+  // CHECK: lduminl x2, x3, [sp]    // encoding: [0xe3,0x73,0x62,0xf8]
+  // CHECK: lduminal x0, x1, [x2]   // encoding: [0x41,0x70,0xe0,0xf8]
+  // CHECK: lduminal x2, x3, [sp]   // encoding: [0xe3,0x73,0xe2,0xf8]
+
+  stadd w0, [x2]
+  stadd w2, [sp]
+  staddl w0, [x2]
+  staddl w2, [sp]
+  staddb w0, [x2]
+  staddb w2, [sp]
+  staddh w0, [x2]
+  staddh w2, [sp]
+  // CHECK: stadd w0, [x2]      // encoding: [0x5f,0x00,0x20,0xb8]
+  // CHECK: stadd w2, [sp]      // encoding: [0xff,0x03,0x22,0xb8]
+  // CHECK: staddl w0, [x2]     // encoding: [0x5f,0x00,0x60,0xb8]
+  // CHECK: staddl w2, [sp]     // encoding: [0xff,0x03,0x62,0xb8]
+  // CHECK: staddb w0, [x2]     // encoding: [0x5f,0x00,0x20,0x38]
+  // CHECK: staddb w2, [sp]     // encoding: [0xff,0x03,0x22,0x38]
+  // CHECK: staddh w0, [x2]     // encoding: [0x5f,0x00,0x20,0x78]
+  // CHECK: staddh w2, [sp]     // encoding: [0xff,0x03,0x22,0x78]
+
+  staddlb w0, [x2]
+  staddlb w2, [sp]
+  staddlh w0, [x2]
+  staddlh w2, [sp]
+  stadd x0, [x2]
+  stadd x2, [sp]
+  staddl x0, [x2]
+  staddl x2, [sp]
+  // CHECK: staddlb w0, [x2]    // encoding: [0x5f,0x00,0x60,0x38]
+  // CHECK: staddlb w2, [sp]    // encoding: [0xff,0x03,0x62,0x38]
+  // CHECK: staddlh w0, [x2]    // encoding: [0x5f,0x00,0x60,0x78]
+  // CHECK: staddlh w2, [sp]    // encoding: [0xff,0x03,0x62,0x78]
+  // CHECK: stadd x0, [x2]      // encoding: [0x5f,0x00,0x20,0xf8]
+  // CHECK: stadd x2, [sp]      // encoding: [0xff,0x03,0x22,0xf8]
+  // CHECK: staddl x0, [x2]     // encoding: [0x5f,0x00,0x60,0xf8]
+  // CHECK: staddl x2, [sp]     // encoding: [0xff,0x03,0x62,0xf8]
+
+  stclr w0, [x2]
+  stclr w2, [sp]
+  stclrl w0, [x2]
+  stclrl w2, [sp]
+  stclrb w0, [x2]
+  stclrb w2, [sp]
+  stclrh w0, [x2]
+  stclrh w2, [sp]
+  // CHECK: stclr w0, [x2]      // encoding: [0x5f,0x10,0x20,0xb8]
+  // CHECK: stclr w2, [sp]      // encoding: [0xff,0x13,0x22,0xb8]
+  // CHECK: stclrl w0, [x2]     // encoding: [0x5f,0x10,0x60,0xb8]
+  // CHECK: stclrl w2, [sp]     // encoding: [0xff,0x13,0x62,0xb8]
+  // CHECK: stclrb w0, [x2]     // encoding: [0x5f,0x10,0x20,0x38]
+  // CHECK: stclrb w2, [sp]     // encoding: [0xff,0x13,0x22,0x38]
+  // CHECK: stclrh w0, [x2]     // encoding: [0x5f,0x10,0x20,0x78]
+  // CHECK: stclrh w2, [sp]     // encoding: [0xff,0x13,0x22,0x78]
+
+  stclrlb w0, [x2]
+  stclrlb w2, [sp]
+  stclrlh w0, [x2]
+  stclrlh w2, [sp]
+  stclr x0, [x2]
+  stclr x2, [sp]
+  stclrl x0, [x2]
+  stclrl x2, [sp]
+  // CHECK: stclrlb w0, [x2]    // encoding: [0x5f,0x10,0x60,0x38]
+  // CHECK: stclrlb w2, [sp]    // encoding: [0xff,0x13,0x62,0x38]
+  // CHECK: stclrlh w0, [x2]    // encoding: [0x5f,0x10,0x60,0x78]
+  // CHECK: stclrlh w2, [sp]    // encoding: [0xff,0x13,0x62,0x78]
+  // CHECK: stclr x0, [x2]      // encoding: [0x5f,0x10,0x20,0xf8]
+  // CHECK: stclr x2, [sp]      // encoding: [0xff,0x13,0x22,0xf8]
+  // CHECK: stclrl x0, [x2]     // encoding: [0x5f,0x10,0x60,0xf8]
+  // CHECK: stclrl x2, [sp]     // encoding: [0xff,0x13,0x62,0xf8]
+
+  steor w0, [x2]
+  steor w2, [sp]
+  steorl w0, [x2]
+  steorl w2, [sp]
+  steorb w0, [x2]
+  steorb w2, [sp]
+  steorh w0, [x2]
+  steorh w2, [sp]
+  // CHECK: steor w0, [x2]      // encoding: [0x5f,0x20,0x20,0xb8]
+  // CHECK: steor w2, [sp]      // encoding: [0xff,0x23,0x22,0xb8]
+  // CHECK: steorl w0, [x2]     // encoding: [0x5f,0x20,0x60,0xb8]
+  // CHECK: steorl w2, [sp]     // encoding: [0xff,0x23,0x62,0xb8]
+  // CHECK: steorb w0, [x2]     // encoding: [0x5f,0x20,0x20,0x38]
+  // CHECK: steorb w2, [sp]     // encoding: [0xff,0x23,0x22,0x38]
+  // CHECK: steorh w0, [x2]     // encoding: [0x5f,0x20,0x20,0x78]
+  // CHECK: steorh w2, [sp]     // encoding: [0xff,0x23,0x22,0x78]
+
+  steorlb w0, [x2]
+  steorlb w2, [sp]
+  steorlh w0, [x2]
+  steorlh w2, [sp]
+  steor x0, [x2]
+  steor x2, [sp]
+  steorl x0, [x2]
+  steorl x2, [sp]
+  // CHECK: steorlb w0, [x2]    // encoding: [0x5f,0x20,0x60,0x38]
+  // CHECK: steorlb w2, [sp]    // encoding: [0xff,0x23,0x62,0x38]
+  // CHECK: steorlh w0, [x2]    // encoding: [0x5f,0x20,0x60,0x78]
+  // CHECK: steorlh w2, [sp]    // encoding: [0xff,0x23,0x62,0x78]
+  // CHECK: steor x0, [x2]      // encoding: [0x5f,0x20,0x20,0xf8]
+  // CHECK: steor x2, [sp]      // encoding: [0xff,0x23,0x22,0xf8]
+  // CHECK: steorl x0, [x2]     // encoding: [0x5f,0x20,0x60,0xf8]
+  // CHECK: steorl x2, [sp]     // encoding: [0xff,0x23,0x62,0xf8]
+
+  stset w0, [x2]
+  stset w2, [sp]
+  stsetl w0, [x2]
+  stsetl w2, [sp]
+  stsetb w0, [x2]
+  stsetb w2, [sp]
+  stseth w0, [x2]
+  stseth w2, [sp]
+  // CHECK: stset w0, [x2]      // encoding: [0x5f,0x30,0x20,0xb8]
+  // CHECK: stset w2, [sp]      // encoding: [0xff,0x33,0x22,0xb8]
+  // CHECK: stsetl w0, [x2]     // encoding: [0x5f,0x30,0x60,0xb8]
+  // CHECK: stsetl w2, [sp]     // encoding: [0xff,0x33,0x62,0xb8]
+  // CHECK: stsetb w0, [x2]     // encoding: [0x5f,0x30,0x20,0x38]
+  // CHECK: stsetb w2, [sp]     // encoding: [0xff,0x33,0x22,0x38]
+  // CHECK: stseth w0, [x2]     // encoding: [0x5f,0x30,0x20,0x78]
+  // CHECK: stseth w2, [sp]     // encoding: [0xff,0x33,0x22,0x78]
+
+  stsetlb w0, [x2]
+  stsetlb w2, [sp]
+  stsetlh w0, [x2]
+  stsetlh w2, [sp]
+  stset x0, [x2]
+  stset x2, [sp]
+  stsetl x0, [x2]
+  stsetl x2, [sp]
+  // CHECK: stsetlb w0, [x2]    // encoding: [0x5f,0x30,0x60,0x38]
+  // CHECK: stsetlb w2, [sp]    // encoding: [0xff,0x33,0x62,0x38]
+  // CHECK: stsetlh w0, [x2]    // encoding: [0x5f,0x30,0x60,0x78]
+  // CHECK: stsetlh w2, [sp]    // encoding: [0xff,0x33,0x62,0x78]
+  // CHECK: stset x0, [x2]      // encoding: [0x5f,0x30,0x20,0xf8]
+  // CHECK: stset x2, [sp]      // encoding: [0xff,0x33,0x22,0xf8]
+  // CHECK: stsetl x0, [x2]     // encoding: [0x5f,0x30,0x60,0xf8]
+  // CHECK: stsetl x2, [sp]     // encoding: [0xff,0x33,0x62,0xf8]
+
+  stsmax w0, [x2]
+  stsmax w2, [sp]
+  stsmaxl w0, [x2]
+  stsmaxl w2, [sp]
+  stsmaxb w0, [x2]
+  stsmaxb w2, [sp]
+  stsmaxh w0, [x2]
+  stsmaxh w2, [sp]
+  // CHECK: stsmax w0, [x2]     // encoding: [0x5f,0x40,0x20,0xb8]
+  // CHECK: stsmax w2, [sp]     // encoding: [0xff,0x43,0x22,0xb8]
+  // CHECK: stsmaxl w0, [x2]    // encoding: [0x5f,0x40,0x60,0xb8]
+  // CHECK: stsmaxl w2, [sp]    // encoding: [0xff,0x43,0x62,0xb8]
+  // CHECK: stsmaxb w0, [x2]    // encoding: [0x5f,0x40,0x20,0x38]
+  // CHECK: stsmaxb w2, [sp]    // encoding: [0xff,0x43,0x22,0x38]
+  // CHECK: stsmaxh w0, [x2]    // encoding: [0x5f,0x40,0x20,0x78]
+  // CHECK: stsmaxh w2, [sp]    // encoding: [0xff,0x43,0x22,0x78]
+
+  stsmaxlb w0, [x2]
+  stsmaxlb w2, [sp]
+  stsmaxlh w0, [x2]
+  stsmaxlh w2, [sp]
+  stsmax x0, [x2]
+  stsmax x2, [sp]
+  stsmaxl x0, [x2]
+  stsmaxl x2, [sp]
+  // CHECK: stsmaxlb w0, [x2]   // encoding: [0x5f,0x40,0x60,0x38]
+  // CHECK: stsmaxlb w2, [sp]   // encoding: [0xff,0x43,0x62,0x38]
+  // CHECK: stsmaxlh w0, [x2]   // encoding: [0x5f,0x40,0x60,0x78]
+  // CHECK: stsmaxlh w2, [sp]   // encoding: [0xff,0x43,0x62,0x78]
+  // CHECK: stsmax x0, [x2]     // encoding: [0x5f,0x40,0x20,0xf8]
+  // CHECK: stsmax x2, [sp]     // encoding: [0xff,0x43,0x22,0xf8]
+  // CHECK: stsmaxl x0, [x2]    // encoding: [0x5f,0x40,0x60,0xf8]
+  // CHECK: stsmaxl x2, [sp]    // encoding: [0xff,0x43,0x62,0xf8]
+
+  stsmin w0, [x2]
+  stsmin w2, [sp]
+  stsminl w0, [x2]
+  stsminl w2, [sp]
+  stsminb w0, [x2]
+  stsminb w2, [sp]
+  stsminh w0, [x2]
+  stsminh w2, [sp]
+  // CHECK: stsmin w0, [x2]     // encoding: [0x5f,0x50,0x20,0xb8]
+  // CHECK: stsmin w2, [sp]     // encoding: [0xff,0x53,0x22,0xb8]
+  // CHECK: stsminl w0, [x2]    // encoding: [0x5f,0x50,0x60,0xb8]
+  // CHECK: stsminl w2, [sp]    // encoding: [0xff,0x53,0x62,0xb8]
+  // CHECK: stsminb w0, [x2]    // encoding: [0x5f,0x50,0x20,0x38]
+  // CHECK: stsminb w2, [sp]    // encoding: [0xff,0x53,0x22,0x38]
+  // CHECK: stsminh w0, [x2]    // encoding: [0x5f,0x50,0x20,0x78]
+  // CHECK: stsminh w2, [sp]    // encoding: [0xff,0x53,0x22,0x78]
+
+  stsminlb w0, [x2]
+  stsminlb w2, [sp]
+  stsminlh w0, [x2]
+  stsminlh w2, [sp]
+  stsmin x0, [x2]
+  stsmin x2, [sp]
+  stsminl x0, [x2]
+  stsminl x2, [sp]
+  // CHECK: stsminlb w0, [x2]   // encoding: [0x5f,0x50,0x60,0x38]
+  // CHECK: stsminlb w2, [sp]   // encoding: [0xff,0x53,0x62,0x38]
+  // CHECK: stsminlh w0, [x2]   // encoding: [0x5f,0x50,0x60,0x78]
+  // CHECK: stsminlh w2, [sp]   // encoding: [0xff,0x53,0x62,0x78]
+  // CHECK: stsmin x0, [x2]     // encoding: [0x5f,0x50,0x20,0xf8]
+  // CHECK: stsmin x2, [sp]     // encoding: [0xff,0x53,0x22,0xf8]
+  // CHECK: stsminl x0, [x2]    // encoding: [0x5f,0x50,0x60,0xf8]
+  // CHECK: stsminl x2, [sp]    // encoding: [0xff,0x53,0x62,0xf8]
+
+  stumax w0, [x2]
+  stumax w2, [sp]
+  stumaxl w0, [x2]
+  stumaxl w2, [sp]
+  stumaxb w0, [x2]
+  stumaxb w2, [sp]
+  stumaxh w0, [x2]
+  stumaxh w2, [sp]
+  // CHECK: stumax w0, [x2]     // encoding: [0x5f,0x60,0x20,0xb8]
+  // CHECK: stumax w2, [sp]     // encoding: [0xff,0x63,0x22,0xb8]
+  // CHECK: stumaxl w0, [x2]    // encoding: [0x5f,0x60,0x60,0xb8]
+  // CHECK: stumaxl w2, [sp]    // encoding: [0xff,0x63,0x62,0xb8]
+  // CHECK: stumaxb w0, [x2]    // encoding: [0x5f,0x60,0x20,0x38]
+  // CHECK: stumaxb w2, [sp]    // encoding: [0xff,0x63,0x22,0x38]
+  // CHECK: stumaxh w0, [x2]    // encoding: [0x5f,0x60,0x20,0x78]
+  // CHECK: stumaxh w2, [sp]    // encoding: [0xff,0x63,0x22,0x78]
+
+  stumaxlb w0, [x2]
+  stumaxlb w2, [sp]
+  stumaxlh w0, [x2]
+  stumaxlh w2, [sp]
+  stumax x0, [x2]
+  stumax x2, [sp]
+  stumaxl x0, [x2]
+  stumaxl x2, [sp]
+  // CHECK: stumaxlb w0, [x2]   // encoding: [0x5f,0x60,0x60,0x38]
+  // CHECK: stumaxlb w2, [sp]   // encoding: [0xff,0x63,0x62,0x38]
+  // CHECK: stumaxlh w0, [x2]   // encoding: [0x5f,0x60,0x60,0x78]
+  // CHECK: stumaxlh w2, [sp]   // encoding: [0xff,0x63,0x62,0x78]
+  // CHECK: stumax x0, [x2]     // encoding: [0x5f,0x60,0x20,0xf8]
+  // CHECK: stumax x2, [sp]     // encoding: [0xff,0x63,0x22,0xf8]
+  // CHECK: stumaxl x0, [x2]    // encoding: [0x5f,0x60,0x60,0xf8]
+  // CHECK: stumaxl x2, [sp]    // encoding: [0xff,0x63,0x62,0xf8]
+
+  stumin w0, [x2]
+  stumin w2, [sp]
+  stuminl w0, [x2]
+  stuminl w2, [sp]
+  stuminb w0, [x2]
+  stuminb w2, [sp]
+  stuminh w0, [x2]
+  stuminh w2, [sp]
+  // CHECK: stumin w0, [x2]     // encoding: [0x5f,0x70,0x20,0xb8]
+  // CHECK: stumin w2, [sp]     // encoding: [0xff,0x73,0x22,0xb8]
+  // CHECK: stuminl w0, [x2]    // encoding: [0x5f,0x70,0x60,0xb8]
+  // CHECK: stuminl w2, [sp]    // encoding: [0xff,0x73,0x62,0xb8]
+  // CHECK: stuminb w0, [x2]    // encoding: [0x5f,0x70,0x20,0x38]
+  // CHECK: stuminb w2, [sp]    // encoding: [0xff,0x73,0x22,0x38]
+  // CHECK: stuminh w0, [x2]    // encoding: [0x5f,0x70,0x20,0x78]
+  // CHECK: stuminh w2, [sp]    // encoding: [0xff,0x73,0x22,0x78]
+
+  cas b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   cas b0, b1, [x2]
+  // CHECK-ERROR:       ^
+
+  cas b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   cas b2, b3, [sp]
+  // CHECK-ERROR:       ^
+
+  cas h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   cas h0, h1, [x2]
+  // CHECK-ERROR:       ^
+
+  cas h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   cas h2, h3, [sp]
+  // CHECK-ERROR:       ^
+
+  casa b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casa b0, b1, [x2]
+  // CHECK-ERROR:        ^
+
+  casa b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casa b2, b3, [sp]
+  // CHECK-ERROR:        ^
+
+  casa h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casa h0, h1, [x2]
+  // CHECK-ERROR:        ^
+
+  casa h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casa h2, h3, [sp]
+  // CHECK-ERROR:        ^
+
+  casb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casb b0, b1, [x2]
+  // CHECK-ERROR:        ^
+
+  casb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casb b2, b3, [sp]
+  // CHECK-ERROR:        ^
+
+  casb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casb h0, h1, [x2]
+  // CHECK-ERROR:        ^
+
+  casb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casb h2, h3, [sp]
+  // CHECK-ERROR:        ^
+
+  cash b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   cash b0, b1, [x2]
+  // CHECK-ERROR:        ^
+
+  cash b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   cash b2, b3, [sp]
+  // CHECK-ERROR:        ^
+
+  cash h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   cash h0, h1, [x2]
+  // CHECK-ERROR:        ^
+
+  cash h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   cash h2, h3, [sp]
+  // CHECK-ERROR:        ^
+
+  casah b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casah b0, b1, [x2]
+  // CHECK-ERROR:         ^
+
+  casah b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casah b2, b3, [sp]
+  // CHECK-ERROR:         ^
+
+  casah h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casah h0, h1, [x2]
+  // CHECK-ERROR:         ^
+
+  casah h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casah h2, h3, [sp]
+  // CHECK-ERROR:         ^
+
+  casalh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casalh b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  casalh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casalh b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  casalh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casalh h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  casalh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casalh h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+
+  casl b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casl b0, b1, [x2]
+  // CHECK-ERROR:        ^
+
+  casl b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casl b2, b3, [sp]
+  // CHECK-ERROR:        ^
+
+  casl h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casl h0, h1, [x2]
+  // CHECK-ERROR:        ^
+
+  casl h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casl h2, h3, [sp]
+  // CHECK-ERROR:        ^
+
+  caslb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   caslb b0, b1, [x2]
+  // CHECK-ERROR:         ^
+
+  caslb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   caslb b2, b3, [sp]
+  // CHECK-ERROR:         ^
+
+  caslb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   caslb h0, h1, [x2]
+  // CHECK-ERROR:         ^
+
+  caslb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   caslb h2, h3, [sp]
+  // CHECK-ERROR:         ^
+
+
+  casalb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casalb b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  casalb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casalb b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  casalb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casalb h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  casalb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casalb h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  casalh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casalh b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  casalh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casalh b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  casalh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casalh h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  casalh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casalh h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  cas v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   cas v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:       ^
+
+  casa v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casa v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:        ^
+
+  casl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:        ^
+
+  casal v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casal v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  casb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:        ^
+
+  casab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  caslb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   caslb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  casalb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casalb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  casah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  caslh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   caslh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  casalh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   casalh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  casp b0, b1, [x2]
+  // CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+  // CHECK-ERROR:   casp b0, b1, [x2]
+  // CHECK-ERROR:         ^
+
+  casp b2, b3, [sp]
+  // CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+  // CHECK-ERROR:   casp b2, b3, [sp]
+  // CHECK-ERROR:         ^
+
+  casp h0, h1, [x2]
+  // CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+  // CHECK-ERROR:   casp h0, h1, [x2]
+  // CHECK-ERROR:         ^
+
+  casp h2, h3, [sp]
+  // CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+  // CHECK-ERROR:   casp h2, h3, [sp]
+  // CHECK-ERROR:         ^
+
+  caspa b0, b1, [x2]
+  // CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+  // CHECK-ERROR:   caspa b0, b1, [x2]
+  // CHECK-ERROR:         ^
+
+  caspa b2, b3, [sp]
+  // CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+  // CHECK-ERROR:   caspa b2, b3, [sp]
+  // CHECK-ERROR:         ^
+
+  caspa h0, h1, [x2]
+  // CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+  // CHECK-ERROR:   caspa h0, h1, [x2]
+  // CHECK-ERROR:         ^
+
+  caspa h2, h3, [sp]
+  // CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+  // CHECK-ERROR:   caspa h2, h3, [sp]
+  // CHECK-ERROR:         ^
+
+  caspl b0, b1, [x2]
+  // CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+  // CHECK-ERROR:   caspl b0, b1, [x2]
+  // CHECK-ERROR:         ^
+
+  caspl b2, b3, [sp]
+  // CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+  // CHECK-ERROR:   caspl b2, b3, [sp]
+  // CHECK-ERROR:         ^
+
+  caspl h0, h1, [x2]
+  // CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+  // CHECK-ERROR:   caspl h0, h1, [x2]
+  // CHECK-ERROR:         ^
+
+  caspl h2, h3, [sp]
+  // CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+  // CHECK-ERROR:   caspl h2, h3, [sp]
+  // CHECK-ERROR:         ^
+
+  caspal b0, b1, [x2]
+  // CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+  // CHECK-ERROR:   caspal b0, b1, [x2]
+  // CHECK-ERROR:         ^
+
+  caspal b2, b3, [sp]
+  // CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+  // CHECK-ERROR:   caspal b2, b3, [sp]
+  // CHECK-ERROR:         ^
+
+  caspal h0, h1, [x2]
+  // CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+  // CHECK-ERROR:   caspal h0, h1, [x2]
+  // CHECK-ERROR:         ^
+
+  caspal h2, h3, [sp]
+  // CHECK-ERROR: error: expected first even register of a consecutive same-size even/odd register pair
+  // CHECK-ERROR:   caspal h2, h3, [sp]
+  // CHECK-ERROR:         ^
+
+  swp b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swp b0, b1, [x2]
+  // CHECK-ERROR:       ^
+
+  swp b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swp b2, b3, [sp]
+  // CHECK-ERROR:       ^
+
+  swpa b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpa b0, b1, [x2]
+  // CHECK-ERROR:        ^
+
+  swpa b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpa b2, b3, [sp]
+  // CHECK-ERROR:        ^
+
+  swpah b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpah b0, b1, [x2]
+  // CHECK-ERROR:         ^
+
+  swpah b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpah b2, b3, [sp]
+  // CHECK-ERROR:         ^
+
+  swpl b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpl b0, b1, [x2]
+  // CHECK-ERROR:        ^
+
+  swpl b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpl b2, b3, [sp]
+  // CHECK-ERROR:        ^
+
+  swpal b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpal b0, b1, [x2]
+  // CHECK-ERROR:         ^
+
+  swpal b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpal b2, b3, [sp]
+  // CHECK-ERROR:         ^
+
+  swpalb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpalb b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  swpalb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpalb b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  swpalh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpalh b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  swpalh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpalh b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  swpb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpb b0, b1, [x2]
+  // CHECK-ERROR:        ^
+
+  swpb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpb b2, b3, [sp]
+  // CHECK-ERROR:        ^
+
+  swpab b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpab b0, b1, [x2]
+  // CHECK-ERROR:         ^
+
+  swpab b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpab b2, b3, [sp]
+  // CHECK-ERROR:         ^
+
+  swpal b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpal b0, b1, [x2]
+  // CHECK-ERROR:         ^
+
+  swpal b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpal b2, b3, [sp]
+  // CHECK-ERROR:         ^
+
+  swpah b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpah b0, b1, [x2]
+  // CHECK-ERROR:         ^
+
+  swpah b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpah b2, b3, [sp]
+  // CHECK-ERROR:         ^
+
+  swpalh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpalh b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  swpalh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpalh b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  swpl b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpl b0, b1, [x2]
+  // CHECK-ERROR:        ^
+
+  swpl b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpl b2, b3, [sp]
+  // CHECK-ERROR:        ^
+
+  swplb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swplb b0, b1, [x2]
+  // CHECK-ERROR:         ^
+
+  swplb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swplb b2, b3, [sp]
+  // CHECK-ERROR:         ^
+
+  swpalb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpalb b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  swpalb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpalb b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  swph b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swph b0, b1, [x2]
+  // CHECK-ERROR:        ^
+
+  swph b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swph b2, b3, [sp]
+  // CHECK-ERROR:        ^
+
+  swp v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swp v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:       ^
+
+  swpa v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpa v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:        ^
+
+  swpah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  swpl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:        ^
+
+  swpal v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpal v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  swpalb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpalb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  swpalh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpalh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  swpb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:        ^
+
+  swpab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  swpal v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpal v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  swpah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  swpalh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpalh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  swpl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:        ^
+
+  swplb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swplb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  swpalb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swpalb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  swph v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   swph v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:        ^
+
+  ldadd b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldadd b0, b1, [x2]
+  // CHECK-ERROR:         ^
+
+  ldadd b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldadd b2, b3, [sp]
+  // CHECK-ERROR:         ^
+
+  ldadd h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldadd h0, h1, [x2]
+  // CHECK-ERROR:         ^
+
+  ldadd h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldadd h2, h3, [sp]
+  // CHECK-ERROR:         ^
+
+  ldadd v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldadd v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  ldadda b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldadda b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldadda b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldadda b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldadda h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldadda h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldadda h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldadda h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldadda v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldadda v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldaddl b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddl b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldaddl b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddl b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldaddl h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddl h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldaddl h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddl h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldaddl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldaddal b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddal b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldaddal b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddal b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldaddal h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddal h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldaddal h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddal h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldaddal v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddal v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldaddb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddb b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldaddb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddb b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldaddb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddb h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldaddb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddb h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldaddb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldaddh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddh b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldaddh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddh b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldaddh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddh h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldaddh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddh h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldaddh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldaddab b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddab b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldaddab b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddab b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldaddab h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddab h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldaddab h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddab h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldaddab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldaddlb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddlb b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldaddlb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddlb b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldaddlb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddlb h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldaddlb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddlb h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldaddlb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddlb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldaddalb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddalb b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldaddalb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddalb b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldaddalb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddalb h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldaddalb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddalb h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldaddalb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddalb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldaddah b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddah b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldaddah b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddah b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldaddah h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddah h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldaddah h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddah h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldaddah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldaddlh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddlh b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldaddlh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddlh b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldaddlh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddlh h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldaddlh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddlh h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldaddlh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddlh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldaddalh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddalh b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldaddalh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddalh b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldaddalh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddalh h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldaddalh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddalh h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldaddalh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldaddalh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldclr b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclr b0, b1, [x2]
+  // CHECK-ERROR:         ^
+
+  ldclr b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclr b2, b3, [sp]
+  // CHECK-ERROR:         ^
+
+  ldclr h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclr h0, h1, [x2]
+  // CHECK-ERROR:         ^
+
+  ldclr h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclr h2, h3, [sp]
+  // CHECK-ERROR:         ^
+
+  ldclr v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclr v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  ldclra b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclra b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldclra b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclra b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldclra h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclra h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldclra h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclra h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldclra v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclra v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldclra b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclra b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldclra b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclra b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldclra h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclra h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldclra h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclra h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldclra v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclra v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldclrl b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrl b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldclrl b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrl b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldclrl h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrl h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldclrl h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrl h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldclrl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldclral b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclral b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldclral b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclral b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldclral h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclral h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldclral h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclral h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldclral v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclral v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldclrb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrb b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldclrb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrb b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldclrb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrb h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldclrb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrb h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldclrb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldclrh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrh b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldclrh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrh b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldclrh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrh h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldclrh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrh h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldclrh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldclrab b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrab b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldclrab b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrab b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldclrab h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrab h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldclrab h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrab h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldclrab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldclrlb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrlb b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldclrlb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrlb b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldclrlb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrlb h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldclrlb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrlb h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldclrlb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrlb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldclralb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclralb b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldclralb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclralb b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldclralb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclralb h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldclralb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclralb h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldclralb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclralb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldclrah b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrah b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldclrah b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrah b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldclrah h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrah h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldclrah h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrah h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldclrah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldclrlh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrlh b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldclrlh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrlh b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldclrlh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrlh h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldclrlh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrlh h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldclrlh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclrlh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldclralh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclralh b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldclralh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclralh b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldclralh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclralh h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldclralh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclralh h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldclralh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldclralh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldeor b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeor b0, b1, [x2]
+  // CHECK-ERROR:         ^
+
+  ldeor b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeor b2, b3, [sp]
+  // CHECK-ERROR:         ^
+
+  ldeor h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeor h0, h1, [x2]
+  // CHECK-ERROR:         ^
+
+  ldeor h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeor h2, h3, [sp]
+  // CHECK-ERROR:         ^
+
+  ldeor v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeor v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  ldeora b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeora b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldeora b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeora b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldeora h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeora h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldeora h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeora h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldeora v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeora v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldeorl b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorl b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldeorl b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorl b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldeorl h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorl h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldeorl h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorl h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldeorl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldeoral b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeoral b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldeoral b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeoral b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldeoral h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeoral h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldeoral h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeoral h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldeoral v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeoral v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldeorb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorb b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldeorb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorb b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldeorb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorb h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldeorb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorb h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldeorb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldeorh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorh b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldeorh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorh b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldeorh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorh h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldeorh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorh h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldeorh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldeorab b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorab b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldeorab b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorab b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldeorab h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorab h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldeorab h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorab h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldeorab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldeorlb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorlb b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldeorlb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorlb b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldeorlb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorlb h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldeorlb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorlb h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldeorlb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorlb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldeoralb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeoralb b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldeoralb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeoralb b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldeoralb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeoralb h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldeoralb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeoralb h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldeoralb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeoralb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldeorah b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorah b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldeorah b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorah b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldeorah h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorah h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldeorah h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorah h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldeorah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldeorlh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorlh b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldeorlh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorlh b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldeorlh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorlh h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldeorlh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorlh h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldeorlh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeorlh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldeoralh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeoralh b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldeoralh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeoralh b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldeoralh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeoralh h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldeoralh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeoralh h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldeoralh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldeoralh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldset b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldset b0, b1, [x2]
+  // CHECK-ERROR:         ^
+
+  ldset b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldset b2, b3, [sp]
+  // CHECK-ERROR:         ^
+
+  ldset h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldset h0, h1, [x2]
+  // CHECK-ERROR:         ^
+
+  ldset h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldset h2, h3, [sp]
+  // CHECK-ERROR:         ^
+
+  ldset v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldset v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  ldseta b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldseta b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldseta b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldseta b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldseta h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldseta h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldseta h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldseta h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldseta v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldseta v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldsetl b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetl b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldsetl b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetl b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldsetl h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetl h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldsetl h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetl h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldsetl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldsetal b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetal b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsetal b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetal b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsetal h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetal h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsetal h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetal h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsetal v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetal v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldsetb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetb b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldsetb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetb b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldsetb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetb h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldsetb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetb h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldsetb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldseth b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldseth b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldseth b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldseth b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldseth h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldseth h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldseth h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldseth h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldseth v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldseth v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldsetab b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetab b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsetab b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetab b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsetab h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetab h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsetab h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetab h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsetab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldsetlb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetlb b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsetlb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetlb b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsetlb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetlb h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsetlb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetlb h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsetlb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetlb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldsetalb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetalb b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsetalb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetalb b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsetalb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetalb h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsetalb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetalb h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsetalb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetalb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldsetah b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetah b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsetah b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetah b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsetah h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetah h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsetah h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetah h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsetah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldsetlh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetlh b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsetlh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetlh b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsetlh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetlh h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsetlh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetlh h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsetlh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetlh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldsetalh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetalh b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsetalh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetalh b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsetalh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetalh h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsetalh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetalh h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsetalh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsetalh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldsmax b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmax b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldsmax b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmax b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldsmax h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmax h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldsmax h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmax h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldsmax v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmax v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldsmaxa b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxa b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsmaxa b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxa b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsmaxa h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxa h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsmaxa h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxa h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsmaxa v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxa v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldsmaxl b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxl b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsmaxl b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxl b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsmaxl h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxl h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsmaxl h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxl h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsmaxl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldsmaxal b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxal b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsmaxal b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxal b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsmaxal h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxal h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsmaxal h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxal h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsmaxal v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxal v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldsmaxb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxb b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsmaxb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxb b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsmaxb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxb h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsmaxb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxb h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsmaxb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldsmaxh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxh b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsmaxh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxh b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsmaxh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxh h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsmaxh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxh h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsmaxh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldsmaxab b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxab b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsmaxab b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxab b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsmaxab h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxab h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsmaxab h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxab h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsmaxab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldsmaxlb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxlb b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsmaxlb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxlb b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsmaxlb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxlb h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsmaxlb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxlb h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsmaxlb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxlb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldsmaxalb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxalb b0, b1, [x2]
+  // CHECK-ERROR:             ^
+
+  ldsmaxalb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxalb b2, b3, [sp]
+  // CHECK-ERROR:             ^
+
+  ldsmaxalb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxalb h0, h1, [x2]
+  // CHECK-ERROR:             ^
+
+  ldsmaxalb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxalb h2, h3, [sp]
+  // CHECK-ERROR:             ^
+
+  ldsmaxalb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxalb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:             ^
+
+  ldsmaxah b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxah b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsmaxah b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxah b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsmaxah h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxah h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsmaxah h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxah h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsmaxah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldsmaxlh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxlh b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsmaxlh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxlh b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsmaxlh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxlh h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsmaxlh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxlh h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsmaxlh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxlh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldsmaxalh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxalh b0, b1, [x2]
+  // CHECK-ERROR:             ^
+
+  ldsmaxalh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxalh b2, b3, [sp]
+  // CHECK-ERROR:             ^
+
+  ldsmaxalh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxalh h0, h1, [x2]
+  // CHECK-ERROR:             ^
+
+  ldsmaxalh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxalh h2, h3, [sp]
+  // CHECK-ERROR:             ^
+
+  ldsmaxalh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmaxalh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:             ^
+
+  ldsmin b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmin b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldsmin b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmin b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldsmin h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmin h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldsmin h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmin h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldsmin v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmin v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldsmina b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmina b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsmina b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmina b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsmina h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmina h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsmina h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmina h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsmina v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsmina v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldsminl b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminl b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsminl b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminl b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsminl h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminl h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsminl h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminl h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsminl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldsminal b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminal b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsminal b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminal b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsminal h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminal h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsminal h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminal h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsminal v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminal v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldsminb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminb b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsminb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminb b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsminb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminb h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsminb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminb h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsminb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldsminh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminh b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsminh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminh b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsminh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminh h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldsminh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminh h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldsminh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldsminab b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminab b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsminab b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminab b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsminab h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminab h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsminab h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminab h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsminab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldsminlb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminlb b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsminlb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminlb b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsminlb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminlb h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsminlb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminlb h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsminlb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminlb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldsminalb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminalb b0, b1, [x2]
+  // CHECK-ERROR:             ^
+
+  ldsminalb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminalb b2, b3, [sp]
+  // CHECK-ERROR:             ^
+
+  ldsminalb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminalb h0, h1, [x2]
+  // CHECK-ERROR:             ^
+
+  ldsminalb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminalb h2, h3, [sp]
+  // CHECK-ERROR:             ^
+
+  ldsminalb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminalb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:             ^
+
+  ldsminah b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminah b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsminah b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminah b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsminah h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminah h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsminah h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminah h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsminah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldsminlh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminlh b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsminlh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminlh b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsminlh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminlh h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldsminlh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminlh h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldsminlh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminlh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldsminalh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminalh b0, b1, [x2]
+  // CHECK-ERROR:             ^
+
+  ldsminalh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminalh b2, b3, [sp]
+  // CHECK-ERROR:             ^
+
+  ldsminalh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminalh h0, h1, [x2]
+  // CHECK-ERROR:             ^
+
+  ldsminalh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminalh h2, h3, [sp]
+  // CHECK-ERROR:             ^
+
+  ldsminalh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldsminalh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:             ^
+
+  ldumax b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumax b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldumax b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumax b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldumax h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumax h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldumax h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumax h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldumax v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumax v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldumaxa b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxa b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldumaxa b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxa b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldumaxa h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxa h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldumaxa h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxa h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldumaxa v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxa v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldumaxl b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxl b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldumaxl b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxl b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldumaxl h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxl h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldumaxl h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxl h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldumaxl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldumaxal b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxal b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldumaxal b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxal b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldumaxal h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxal h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldumaxal h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxal h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldumaxal v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxal v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldumaxb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxb b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldumaxb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxb b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldumaxb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxb h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldumaxb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxb h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldumaxb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldumaxh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxh b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldumaxh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxh b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldumaxh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxh h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldumaxh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxh h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldumaxh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  ldumaxab b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxab b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldumaxab b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxab b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldumaxab h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxab h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldumaxab h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxab h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldumaxab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldumaxlb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxlb b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldumaxlb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxlb b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldumaxlb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxlb h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldumaxlb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxlb h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldumaxlb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxlb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldumaxalb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxalb b0, b1, [x2]
+  // CHECK-ERROR:             ^
+
+  ldumaxalb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxalb b2, b3, [sp]
+  // CHECK-ERROR:             ^
+
+  ldumaxalb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxalb h0, h1, [x2]
+  // CHECK-ERROR:             ^
+
+  ldumaxalb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxalb h2, h3, [sp]
+  // CHECK-ERROR:             ^
+
+  ldumaxalb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxalb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:             ^
+
+  ldumaxah b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxah b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldumaxah b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxah b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldumaxah h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxah h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldumaxah h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxah h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldumaxah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldumaxlh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxlh b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldumaxlh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxlh b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldumaxlh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxlh h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  ldumaxlh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxlh h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  ldumaxlh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxlh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  ldumaxalh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxalh b0, b1, [x2]
+  // CHECK-ERROR:             ^
+
+  ldumaxalh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxalh b2, b3, [sp]
+  // CHECK-ERROR:             ^
+
+  ldumaxalh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxalh h0, h1, [x2]
+  // CHECK-ERROR:             ^
+
+  ldumaxalh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxalh h2, h3, [sp]
+  // CHECK-ERROR:             ^
+
+  ldumaxalh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumaxalh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:             ^
+
+  ldumin b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumin b0, b1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldumin b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumin b2, b3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldumin h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumin h0, h1, [x2]
+  // CHECK-ERROR:          ^
+
+  ldumin h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumin h2, h3, [sp]
+  // CHECK-ERROR:          ^
+
+  ldumin v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumin v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  ldumina b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumina b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldumina b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumina b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldumina h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumina h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  ldumina h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumina h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  ldumina v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   ldumina v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  lduminl b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminl b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  lduminl b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminl b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  lduminl h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminl h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  lduminl h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminl h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  lduminl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminl v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  lduminal b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminal b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  lduminal b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminal b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  lduminal h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminal h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  lduminal h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminal h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  lduminal v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminal v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  lduminb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminb b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  lduminb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminb b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  lduminb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminb h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  lduminb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminb h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  lduminb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  lduminh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminh b0, b1, [x2]
+  // CHECK-ERROR:           ^
+
+  lduminh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminh b2, b3, [sp]
+  // CHECK-ERROR:           ^
+
+  lduminh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminh h0, h1, [x2]
+  // CHECK-ERROR:           ^
+
+  lduminh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminh h2, h3, [sp]
+  // CHECK-ERROR:           ^
+
+  lduminh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  lduminab b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminab b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  lduminab b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminab b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  lduminab h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminab h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  lduminab h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminab h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  lduminab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminab v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  lduminlb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminlb b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  lduminlb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminlb b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  lduminlb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminlb h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  lduminlb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminlb h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  lduminlb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminlb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  lduminalb b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminalb b0, b1, [x2]
+  // CHECK-ERROR:             ^
+
+  lduminalb b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminalb b2, b3, [sp]
+  // CHECK-ERROR:             ^
+
+  lduminalb h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminalb h0, h1, [x2]
+  // CHECK-ERROR:             ^
+
+  lduminalb h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminalb h2, h3, [sp]
+  // CHECK-ERROR:             ^
+
+  lduminalb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminalb v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:             ^
+
+  lduminah b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminah b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  lduminah b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminah b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  lduminah h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminah h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  lduminah h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminah h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  lduminah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminah v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  lduminlh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminlh b0, b1, [x2]
+  // CHECK-ERROR:            ^
+
+  lduminlh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminlh b2, b3, [sp]
+  // CHECK-ERROR:            ^
+
+  lduminlh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminlh h0, h1, [x2]
+  // CHECK-ERROR:            ^
+
+  lduminlh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminlh h2, h3, [sp]
+  // CHECK-ERROR:            ^
+
+  lduminlh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminlh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  lduminalh b0, b1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminalh b0, b1, [x2]
+  // CHECK-ERROR:             ^
+
+  lduminalh b2, b3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminalh b2, b3, [sp]
+  // CHECK-ERROR:             ^
+
+  lduminalh h0, h1, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminalh h0, h1, [x2]
+  // CHECK-ERROR:             ^
+
+  lduminalh h2, h3, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminalh h2, h3, [sp]
+  // CHECK-ERROR:             ^
+
+  lduminalh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   lduminalh v0.4h, v1.4h, v2.4h
+  // CHECK-ERROR:             ^
+
+  stadd b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stadd b0, [x2]
+  // CHECK-ERROR:         ^
+
+  stadd b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stadd b2, [sp]
+  // CHECK-ERROR:         ^
+
+  stadd h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stadd h0, [x2]
+  // CHECK-ERROR:         ^
+
+  stadd h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stadd h2, [sp]
+  // CHECK-ERROR:         ^
+
+  stadd v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stadd v0.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  staddl b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddl b0, [x2]
+  // CHECK-ERROR:          ^
+
+  staddl b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddl b2, [sp]
+  // CHECK-ERROR:          ^
+
+  staddl h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddl h0, [x2]
+  // CHECK-ERROR:          ^
+
+  staddl h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddl h2, [sp]
+  // CHECK-ERROR:          ^
+
+  staddl v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddl v0.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  staddb b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddb b0, [x2]
+  // CHECK-ERROR:          ^
+
+  staddb b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddb b2, [sp]
+  // CHECK-ERROR:          ^
+
+  staddb h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddb h0, [x2]
+  // CHECK-ERROR:          ^
+
+  staddb h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddb h2, [sp]
+  // CHECK-ERROR:          ^
+
+  staddb v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddb v0.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  staddh b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddh b0, [x2]
+  // CHECK-ERROR:          ^
+
+  staddh b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddh b2, [sp]
+  // CHECK-ERROR:          ^
+
+  staddh h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddh h0, [x2]
+  // CHECK-ERROR:          ^
+
+  staddh h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddh h2, [sp]
+  // CHECK-ERROR:          ^
+
+  staddh v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddh v0.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  staddlb b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddlb b0, [x2]
+  // CHECK-ERROR:           ^
+
+  staddlb b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddlb b2, [sp]
+  // CHECK-ERROR:           ^
+
+  staddlb h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddlb h0, [x2]
+  // CHECK-ERROR:           ^
+
+  staddlb h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddlb h2, [sp]
+  // CHECK-ERROR:           ^
+
+  staddlb v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddlb v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  staddlh b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddlh b0, [x2]
+  // CHECK-ERROR:           ^
+
+  staddlh b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddlh b2, [sp]
+  // CHECK-ERROR:           ^
+
+  staddlh h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddlh h0, [x2]
+  // CHECK-ERROR:           ^
+
+  staddlh h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddlh h2, [sp]
+  // CHECK-ERROR:           ^
+
+  staddlh v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddlh v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  stadd b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stadd b0, [x2]
+  // CHECK-ERROR:         ^
+
+  stadd b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stadd b2, [sp]
+  // CHECK-ERROR:         ^
+
+  stadd h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stadd h0, [x2]
+  // CHECK-ERROR:         ^
+
+  stadd h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stadd h2, [sp]
+  // CHECK-ERROR:         ^
+
+  stadd v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stadd v0.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  staddl b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddl b0, [x2]
+  // CHECK-ERROR:          ^
+
+  staddl b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddl b2, [sp]
+  // CHECK-ERROR:          ^
+
+  staddl h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddl h0, [x2]
+  // CHECK-ERROR:          ^
+
+  staddl h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddl h2, [sp]
+  // CHECK-ERROR:          ^
+
+  staddl v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   staddl v0.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  stclr b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclr b0, [x2]
+  // CHECK-ERROR:         ^
+
+  stclr b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclr b2, [sp]
+  // CHECK-ERROR:         ^
+
+  stclr h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclr h0, [x2]
+  // CHECK-ERROR:         ^
+
+  stclr h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclr h2, [sp]
+  // CHECK-ERROR:         ^
+
+  stclr v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclr v0.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  stclrl b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrl b0, [x2]
+  // CHECK-ERROR:          ^
+
+  stclrl b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrl b2, [sp]
+  // CHECK-ERROR:          ^
+
+  stclrl h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrl h0, [x2]
+  // CHECK-ERROR:          ^
+
+  stclrl h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrl h2, [sp]
+  // CHECK-ERROR:          ^
+
+  stclrl v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrl v0.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  stclrb b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrb b0, [x2]
+  // CHECK-ERROR:          ^
+
+  stclrb b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrb b2, [sp]
+  // CHECK-ERROR:          ^
+
+  stclrb h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrb h0, [x2]
+  // CHECK-ERROR:          ^
+
+  stclrb h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrb h2, [sp]
+  // CHECK-ERROR:          ^
+
+  stclrb v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrb v0.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  stclrh b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrh b0, [x2]
+  // CHECK-ERROR:          ^
+
+  stclrh b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrh b2, [sp]
+  // CHECK-ERROR:          ^
+
+  stclrh h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrh h0, [x2]
+  // CHECK-ERROR:          ^
+
+  stclrh h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrh h2, [sp]
+  // CHECK-ERROR:          ^
+
+  stclrh v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrh v0.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  stclrlb b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrlb b0, [x2]
+  // CHECK-ERROR:           ^
+
+  stclrlb b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrlb b2, [sp]
+  // CHECK-ERROR:           ^
+
+  stclrlb h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrlb h0, [x2]
+  // CHECK-ERROR:           ^
+
+  stclrlb h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrlb h2, [sp]
+  // CHECK-ERROR:           ^
+
+  stclrlb v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrlb v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  stclrlh b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrlh b0, [x2]
+  // CHECK-ERROR:           ^
+
+  stclrlh b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrlh b2, [sp]
+  // CHECK-ERROR:           ^
+
+  stclrlh h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrlh h0, [x2]
+  // CHECK-ERROR:           ^
+
+  stclrlh h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrlh h2, [sp]
+  // CHECK-ERROR:           ^
+
+  stclrlh v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stclrlh v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  steor b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steor b0, [x2]
+  // CHECK-ERROR:         ^
+
+  steor b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steor b2, [sp]
+  // CHECK-ERROR:         ^
+
+  steor h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steor h0, [x2]
+  // CHECK-ERROR:         ^
+
+  steor h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steor h2, [sp]
+  // CHECK-ERROR:         ^
+
+  steor v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steor v0.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  steorl b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorl b0, [x2]
+  // CHECK-ERROR:          ^
+
+  steorl b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorl b2, [sp]
+  // CHECK-ERROR:          ^
+
+  steorl h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorl h0, [x2]
+  // CHECK-ERROR:          ^
+
+  steorl h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorl h2, [sp]
+  // CHECK-ERROR:          ^
+
+  steorl v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorl v0.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  steorb b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorb b0, [x2]
+  // CHECK-ERROR:          ^
+
+  steorb b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorb b2, [sp]
+  // CHECK-ERROR:          ^
+
+  steorb h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorb h0, [x2]
+  // CHECK-ERROR:          ^
+
+  steorb h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorb h2, [sp]
+  // CHECK-ERROR:          ^
+
+  steorb v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorb v0.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  steorh b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorh b0, [x2]
+  // CHECK-ERROR:          ^
+
+  steorh b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorh b2, [sp]
+  // CHECK-ERROR:          ^
+
+  steorh h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorh h0, [x2]
+  // CHECK-ERROR:          ^
+
+  steorh h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorh h2, [sp]
+  // CHECK-ERROR:          ^
+
+  steorh v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorh v0.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  steorlb b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorlb b0, [x2]
+  // CHECK-ERROR:           ^
+
+  steorlb b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorlb b2, [sp]
+  // CHECK-ERROR:           ^
+
+  steorlb h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorlb h0, [x2]
+  // CHECK-ERROR:           ^
+
+  steorlb h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorlb h2, [sp]
+  // CHECK-ERROR:           ^
+
+  steorlb v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorlb v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  steorlh b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorlh b0, [x2]
+  // CHECK-ERROR:           ^
+
+  steorlh b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorlh b2, [sp]
+  // CHECK-ERROR:           ^
+
+  steorlh h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorlh h0, [x2]
+  // CHECK-ERROR:           ^
+
+  steorlh h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorlh h2, [sp]
+  // CHECK-ERROR:           ^
+
+  steorlh v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   steorlh v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  stset b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stset b0, [x2]
+  // CHECK-ERROR:         ^
+
+  stset b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stset b2, [sp]
+  // CHECK-ERROR:         ^
+
+  stset h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stset h0, [x2]
+  // CHECK-ERROR:         ^
+
+  stset h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stset h2, [sp]
+  // CHECK-ERROR:         ^
+
+  stset v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stset v0.4h, v2.4h
+  // CHECK-ERROR:         ^
+
+  stsetl b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetl b0, [x2]
+  // CHECK-ERROR:          ^
+
+  stsetl b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetl b2, [sp]
+  // CHECK-ERROR:          ^
+
+  stsetl h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetl h0, [x2]
+  // CHECK-ERROR:          ^
+
+  stsetl h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetl h2, [sp]
+  // CHECK-ERROR:          ^
+
+  stsetl v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetl v0.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  stsetb b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetb b0, [x2]
+  // CHECK-ERROR:          ^
+
+  stsetb b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetb b2, [sp]
+  // CHECK-ERROR:          ^
+
+  stsetb h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetb h0, [x2]
+  // CHECK-ERROR:          ^
+
+  stsetb h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetb h2, [sp]
+  // CHECK-ERROR:          ^
+
+  stsetb v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetb v0.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  stseth b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stseth b0, [x2]
+  // CHECK-ERROR:          ^
+
+  stseth b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stseth b2, [sp]
+  // CHECK-ERROR:          ^
+
+  stseth h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stseth h0, [x2]
+  // CHECK-ERROR:          ^
+
+  stseth h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stseth h2, [sp]
+  // CHECK-ERROR:          ^
+
+  stseth v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stseth v0.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  stsetlb b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetlb b0, [x2]
+  // CHECK-ERROR:           ^
+
+  stsetlb b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetlb b2, [sp]
+  // CHECK-ERROR:           ^
+
+  stsetlb h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetlb h0, [x2]
+  // CHECK-ERROR:           ^
+
+  stsetlb h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetlb h2, [sp]
+  // CHECK-ERROR:           ^
+
+  stsetlb v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetlb v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  stsetlh b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetlh b0, [x2]
+  // CHECK-ERROR:           ^
+
+  stsetlh b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetlh b2, [sp]
+  // CHECK-ERROR:           ^
+
+  stsetlh h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetlh h0, [x2]
+  // CHECK-ERROR:           ^
+
+  stsetlh h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetlh h2, [sp]
+  // CHECK-ERROR:           ^
+
+  stsetlh v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsetlh v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  stsmax b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmax b0, [x2]
+  // CHECK-ERROR:          ^
+
+  stsmax b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmax b2, [sp]
+  // CHECK-ERROR:          ^
+
+  stsmax h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmax h0, [x2]
+  // CHECK-ERROR:          ^
+
+  stsmax h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmax h2, [sp]
+  // CHECK-ERROR:          ^
+
+  stsmax v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmax v0.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  stsmaxl b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxl b0, [x2]
+  // CHECK-ERROR:           ^
+
+  stsmaxl b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxl b2, [sp]
+  // CHECK-ERROR:           ^
+
+  stsmaxl h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxl h0, [x2]
+  // CHECK-ERROR:           ^
+
+  stsmaxl h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxl h2, [sp]
+  // CHECK-ERROR:           ^
+
+  stsmaxl v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxl v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  stsmaxb b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxb b0, [x2]
+  // CHECK-ERROR:           ^
+
+  stsmaxb b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxb b2, [sp]
+  // CHECK-ERROR:           ^
+
+  stsmaxb h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxb h0, [x2]
+  // CHECK-ERROR:           ^
+
+  stsmaxb h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxb h2, [sp]
+  // CHECK-ERROR:           ^
+
+  stsmaxb v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxb v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  stsmaxh b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxh b0, [x2]
+  // CHECK-ERROR:           ^
+
+  stsmaxh b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxh b2, [sp]
+  // CHECK-ERROR:           ^
+
+  stsmaxh h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxh h0, [x2]
+  // CHECK-ERROR:           ^
+
+  stsmaxh h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxh h2, [sp]
+  // CHECK-ERROR:           ^
+
+  stsmaxh v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxh v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  stsmaxlb b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxlb b0, [x2]
+  // CHECK-ERROR:            ^
+
+  stsmaxlb b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxlb b2, [sp]
+  // CHECK-ERROR:            ^
+
+  stsmaxlb h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxlb h0, [x2]
+  // CHECK-ERROR:            ^
+
+  stsmaxlb h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxlb h2, [sp]
+  // CHECK-ERROR:            ^
+
+  stsmaxlb v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxlb v0.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  stsmaxlh b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxlh b0, [x2]
+  // CHECK-ERROR:            ^
+
+  stsmaxlh b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxlh b2, [sp]
+  // CHECK-ERROR:            ^
+
+  stsmaxlh h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxlh h0, [x2]
+  // CHECK-ERROR:            ^
+
+  stsmaxlh h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxlh h2, [sp]
+  // CHECK-ERROR:            ^
+
+  stsmaxlh v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmaxlh v0.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  stsmin b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmin b0, [x2]
+  // CHECK-ERROR:          ^
+
+  stsmin b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmin b2, [sp]
+  // CHECK-ERROR:          ^
+
+  stsmin h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmin h0, [x2]
+  // CHECK-ERROR:          ^
+
+  stsmin h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmin h2, [sp]
+  // CHECK-ERROR:          ^
+
+  stsmin v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsmin v0.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  stsminl b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminl b0, [x2]
+  // CHECK-ERROR:           ^
+
+  stsminl b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminl b2, [sp]
+  // CHECK-ERROR:           ^
+
+  stsminl h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminl h0, [x2]
+  // CHECK-ERROR:           ^
+
+  stsminl h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminl h2, [sp]
+  // CHECK-ERROR:           ^
+
+  stsminl v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminl v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  stsminb b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminb b0, [x2]
+  // CHECK-ERROR:           ^
+
+  stsminb b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminb b2, [sp]
+  // CHECK-ERROR:           ^
+
+  stsminb h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminb h0, [x2]
+  // CHECK-ERROR:           ^
+
+  stsminb h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminb h2, [sp]
+  // CHECK-ERROR:           ^
+
+  stsminb v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminb v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  stsminh b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminh b0, [x2]
+  // CHECK-ERROR:           ^
+
+  stsminh b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminh b2, [sp]
+  // CHECK-ERROR:           ^
+
+  stsminh h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminh h0, [x2]
+  // CHECK-ERROR:           ^
+
+  stsminh h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminh h2, [sp]
+  // CHECK-ERROR:           ^
+
+  stsminh v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminh v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  stsminlb b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminlb b0, [x2]
+  // CHECK-ERROR:            ^
+
+  stsminlb b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminlb b2, [sp]
+  // CHECK-ERROR:            ^
+
+  stsminlb h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminlb h0, [x2]
+  // CHECK-ERROR:            ^
+
+  stsminlb h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminlb h2, [sp]
+  // CHECK-ERROR:            ^
+
+  stsminlb v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminlb v0.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  stsminlh b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminlh b0, [x2]
+  // CHECK-ERROR:            ^
+
+  stsminlh b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminlh b2, [sp]
+  // CHECK-ERROR:            ^
+
+  stsminlh h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminlh h0, [x2]
+  // CHECK-ERROR:            ^
+
+  stsminlh h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminlh h2, [sp]
+  // CHECK-ERROR:            ^
+
+  stsminlh v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stsminlh v0.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  stumax b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumax b0, [x2]
+  // CHECK-ERROR:          ^
+
+  stumax b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumax b2, [sp]
+  // CHECK-ERROR:          ^
+
+  stumax h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumax h0, [x2]
+  // CHECK-ERROR:          ^
+
+  stumax h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumax h2, [sp]
+  // CHECK-ERROR:          ^
+
+  stumax v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumax v0.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  stumaxl b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxl b0, [x2]
+  // CHECK-ERROR:           ^
+
+  stumaxl b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxl b2, [sp]
+  // CHECK-ERROR:           ^
+
+  stumaxl h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxl h0, [x2]
+  // CHECK-ERROR:           ^
+
+  stumaxl h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxl h2, [sp]
+  // CHECK-ERROR:           ^
+
+  stumaxl v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxl v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  stumaxb b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxb b0, [x2]
+  // CHECK-ERROR:           ^
+
+  stumaxb b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxb b2, [sp]
+  // CHECK-ERROR:           ^
+
+  stumaxb h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxb h0, [x2]
+  // CHECK-ERROR:           ^
+
+  stumaxb h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxb h2, [sp]
+  // CHECK-ERROR:           ^
+
+  stumaxb v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxb v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  stumaxh b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxh b0, [x2]
+  // CHECK-ERROR:           ^
+
+  stumaxh b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxh b2, [sp]
+  // CHECK-ERROR:           ^
+
+  stumaxh h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxh h0, [x2]
+  // CHECK-ERROR:           ^
+
+  stumaxh h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxh h2, [sp]
+  // CHECK-ERROR:           ^
+
+  stumaxh v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxh v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  stumaxlb b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxlb b0, [x2]
+  // CHECK-ERROR:            ^
+
+  stumaxlb b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxlb b2, [sp]
+  // CHECK-ERROR:            ^
+
+  stumaxlb h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxlb h0, [x2]
+  // CHECK-ERROR:            ^
+
+  stumaxlb h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxlb h2, [sp]
+  // CHECK-ERROR:            ^
+
+  stumaxlb v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxlb v0.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  stumaxlh b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxlh b0, [x2]
+  // CHECK-ERROR:            ^
+
+  stumaxlh b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxlh b2, [sp]
+  // CHECK-ERROR:            ^
+
+  stumaxlh h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxlh h0, [x2]
+  // CHECK-ERROR:            ^
+
+  stumaxlh h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxlh h2, [sp]
+  // CHECK-ERROR:            ^
+
+  stumaxlh v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumaxlh v0.4h, v2.4h
+  // CHECK-ERROR:            ^
+
+  stumin b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumin b0, [x2]
+  // CHECK-ERROR:          ^
+
+  stumin b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumin b2, [sp]
+  // CHECK-ERROR:          ^
+
+  stumin h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumin h0, [x2]
+  // CHECK-ERROR:          ^
+
+  stumin h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumin h2, [sp]
+  // CHECK-ERROR:          ^
+
+  stumin v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stumin v0.4h, v2.4h
+  // CHECK-ERROR:          ^
+
+  stuminl b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stuminl b0, [x2]
+  // CHECK-ERROR:           ^
+
+  stuminl b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stuminl b2, [sp]
+  // CHECK-ERROR:           ^
+
+  stuminl h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stuminl h0, [x2]
+  // CHECK-ERROR:           ^
+
+  stuminl h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stuminl h2, [sp]
+  // CHECK-ERROR:           ^
+
+  stuminl v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stuminl v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  stuminb b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stuminb b0, [x2]
+  // CHECK-ERROR:           ^
+
+  stuminb b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stuminb b2, [sp]
+  // CHECK-ERROR:           ^
+
+  stuminb h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stuminb h0, [x2]
+  // CHECK-ERROR:           ^
+
+  stuminb h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stuminb h2, [sp]
+  // CHECK-ERROR:           ^
+
+  stuminb v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stuminb v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
+  stuminh b0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stuminh b0, [x2]
+  // CHECK-ERROR:           ^
+
+  stuminh b2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stuminh b2, [sp]
+  // CHECK-ERROR:           ^
+
+  stuminh h0, [x2]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stuminh h0, [x2]
+  // CHECK-ERROR:           ^
+
+  stuminh h2, [sp]
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stuminh h2, [sp]
+  // CHECK-ERROR:           ^
+
+  stuminh v0.4h, v2.4h
+  // CHECK-ERROR: error: invalid operand for instruction
+  // CHECK-ERROR:   stuminh v0.4h, v2.4h
+  // CHECK-ERROR:           ^
+
diff --git a/test/MC/AArch64/error-location-post-layout.s b/test/MC/AArch64/error-location-post-layout.s
index 64e2951efba917db84be00f89045f0de731425a4..ac176ca9362f146e8a28b26f9da170e2169137a0 100644
--- a/test/MC/AArch64/error-location-post-layout.s
+++ b/test/MC/AArch64/error-location-post-layout.s
@@ -1,7 +1,7 @@
 // RUN: not llvm-mc -triple aarch64--none-eabi -filetype obj < %s -o /dev/null 2>&1 | FileCheck %s
 
   .set v1, -undef
-// CHECK: <unknown>:0: error: expression could not be evaluated
+// CHECK: 3:12: error: expression could not be evaluated
 
   .comm common, 4
   .set v3, common
diff --git a/test/MC/AArch64/label-arithmetic-diags-elf.s b/test/MC/AArch64/label-arithmetic-diags-elf.s
index 6e928bdf094c96f5bdfe74ebed84cd5f625fda3d..e9d92d591fac285ad52b92689ea600a0244c54d3 100644
--- a/test/MC/AArch64/label-arithmetic-diags-elf.s
+++ b/test/MC/AArch64/label-arithmetic-diags-elf.s
@@ -63,9 +63,9 @@ end_across_sec:
 
   add w0, w1, #(sec_y - sec_x)
   cmp w0, #(sec_y - sec_x)
-  // CHECK: error: symbol 'sec_x' can not be undefined in a subtraction expression
+  // CHECK: error: Cannot represent a difference across sections
   // CHECK-NEXT: add w0, w1, #(sec_y - sec_x)
   // CHECK-NEXT: ^
-  // CHECK: error: symbol 'sec_x' can not be undefined in a subtraction expression
+  // CHECK: error: Cannot represent a difference across sections
   // CHECK-NEXT: cmp w0, #(sec_y - sec_x)
   // CHECK-NEXT: ^
diff --git a/test/MC/AArch64/neon-diagnostics.s b/test/MC/AArch64/neon-diagnostics.s
index a51243dfb344c6510ef11ef98938795f07dd5fca..1172903b03425e4f3e895f657de3994d3dda8bb1 100644
--- a/test/MC/AArch64/neon-diagnostics.s
+++ b/test/MC/AArch64/neon-diagnostics.s
@@ -81,7 +81,7 @@
 // CHECK-ERROR: error: invalid operand for instruction
 // CHECK-ERROR:         and v0.8b, v1.16b, v2.8b
 // CHECK-ERROR:                       ^
-// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: immediate must be an integer in range [0, 255]
 // CHECK-ERROR:         orr v0.4h, v1.4h, v2.4h
 // CHECK-ERROR:                ^
 // CHECK-ERROR: error: invalid operand for instruction
@@ -152,10 +152,10 @@
       // invalid vector type (2s, 4s, 4h, 8h)
       movi v5.8b, #1, lsl #8
 
-// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: immediate must be an integer in range [0, 255]
 // CHECK-ERROR:          movi v0.2s, #-1
 // CHECK-ERROR:                      ^
-// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: immediate must be an integer in range [0, 255]
 // CHECK-ERROR:         mvni v1.4s, #256
 // CHECK-ERROR:                     ^
 // CHECK-ERROR: error: invalid operand for instruction
@@ -183,10 +183,10 @@
       // invalid vector type (2s, 4s)
       movi v5.4h, #31, msl #8
 
-// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: immediate must be an integer in range [0, 255]
 // CHECK-ERROR:         movi v0.2s, #-1, msl #8
 // CHECK-ERROR:                     ^
-// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: immediate must be an integer in range [0, 255]
 // CHECK-ERROR:         mvni v7.4s, #256, msl #16
 // CHECK-ERROR:                     ^
 // CHECK-ERROR: error: invalid operand for instruction
@@ -206,10 +206,10 @@
         movi v0.8b, #-1
         movi v1.16b, #256
 
-// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: immediate must be an integer in range [0, 255]
 // CHECK-ERROR:         movi v0.8b, #-1
 // CHECK-ERROR:                     ^
-// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR: error: immediate must be an integer in range [0, 255]
 // CHECK-ERROR:         movi v1.16b, #256
 // CHECK-ERROR:                      ^
 
diff --git a/test/MC/AArch64/nofp-crypto-diagnostic.s b/test/MC/AArch64/nofp-crypto-diagnostic.s
new file mode 100644
index 0000000000000000000000000000000000000000..36da8a83128deeb98aa68a264cb9498a1e26697a
--- /dev/null
+++ b/test/MC/AArch64/nofp-crypto-diagnostic.s
@@ -0,0 +1,8 @@
+// RUN: not llvm-mc  -triple aarch64-none-linux-gnu -mattr=+neon,+crypto,-fp-armv8 < %s 2> %t
+// RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s
+
+        sha1h s0, s1
+
+// CHECK-ERROR: error: instruction requires: crypto
+// CHECK-ERROR-NEXT:    sha1h s0, s1
+// CHECK-ERROR-NEXT:    ^
diff --git a/test/MC/AMDGPU/code-object-metadata-kernel-args.s b/test/MC/AMDGPU/code-object-metadata-kernel-args.s
new file mode 100644
index 0000000000000000000000000000000000000000..90915e61f99a40f00b92aa7cbb51454c68c5005b
--- /dev/null
+++ b/test/MC/AMDGPU/code-object-metadata-kernel-args.s
@@ -0,0 +1,68 @@
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX700 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX800 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX900 %s
+
+// CHECK:  .amdgpu_code_object_metadata
+// CHECK:    Version: [ 1, 0 ]
+// CHECK:    Printf: [ '1:1:4:%d\n', '2:1:8:%g\n' ]
+// CHECK:    Kernels:
+// CHECK:      - Name:            test_kernel
+// CHECK:        Language:        OpenCL C
+// CHECK:        LanguageVersion: [ 2, 0 ]
+// CHECK:        Args:
+// CHECK:          - Size:          1
+// CHECK:            Align:         1
+// CHECK:            ValueKind:     ByValue
+// CHECK:            ValueType:     I8
+// CHECK:            AccQual:       Default
+// CHECK:            TypeName:      char
+// CHECK:          - Size:          8
+// CHECK:            Align:         8
+// CHECK:            ValueKind:     HiddenGlobalOffsetX
+// CHECK:            ValueType:     I64
+// CHECK:          - Size:          8
+// CHECK:            Align:         8
+// CHECK:            ValueKind:     HiddenGlobalOffsetY
+// CHECK:            ValueType:     I64
+// CHECK:          - Size:          8
+// CHECK:            Align:         8
+// CHECK:            ValueKind:     HiddenGlobalOffsetZ
+// CHECK:            ValueType:     I64
+// CHECK:          - Size:          8
+// CHECK:            Align:         8
+// CHECK:            ValueKind:     HiddenPrintfBuffer
+// CHECK:            ValueType:     I8
+// CHECK:            AddrSpaceQual: Global
+// CHECK:  .end_amdgpu_code_object_metadata
+.amdgpu_code_object_metadata
+  Version: [ 1, 0 ]
+  Printf: [ '1:1:4:%d\n', '2:1:8:%g\n' ]
+  Kernels:
+    - Name:            test_kernel
+      Language:        OpenCL C
+      LanguageVersion: [ 2, 0 ]
+      Args:
+        - Size:          1
+          Align:         1
+          ValueKind:     ByValue
+          ValueType:     I8
+          AccQual:       Default
+          TypeName:      char
+        - Size:          8
+          Align:         8
+          ValueKind:     HiddenGlobalOffsetX
+          ValueType:     I64
+        - Size:          8
+          Align:         8
+          ValueKind:     HiddenGlobalOffsetY
+          ValueType:     I64
+        - Size:          8
+          Align:         8
+          ValueKind:     HiddenGlobalOffsetZ
+          ValueType:     I64
+        - Size:          8
+          Align:         8
+          ValueKind:     HiddenPrintfBuffer
+          ValueType:     I8
+          AddrSpaceQual: Global
+.end_amdgpu_code_object_metadata
diff --git a/test/MC/AMDGPU/code-object-metadata-kernel-attrs.s b/test/MC/AMDGPU/code-object-metadata-kernel-attrs.s
new file mode 100644
index 0000000000000000000000000000000000000000..9669fcf53939587fe9fb94f41b34dff603488c94
--- /dev/null
+++ b/test/MC/AMDGPU/code-object-metadata-kernel-attrs.s
@@ -0,0 +1,28 @@
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX700 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX800 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX900 %s
+
+// CHECK:  .amdgpu_code_object_metadata
+// CHECK:    Version: [ 1, 0 ]
+// CHECK:    Printf: [ '1:1:4:%d\n', '2:1:8:%g\n' ]
+// CHECK:    Kernels:
+// CHECK:      - Name:            test_kernel
+// CHECK:        Language:        OpenCL C
+// CHECK:        LanguageVersion: [ 2, 0 ]
+// CHECK:        Attrs:
+// CHECK:          ReqdWorkGroupSize: [ 1, 2, 4 ]
+// CHECK:          WorkGroupSizeHint: [ 8, 16, 32 ]
+// CHECK:          VecTypeHint:       int
+// CHECK:  .end_amdgpu_code_object_metadata
+.amdgpu_code_object_metadata
+  Version: [ 1, 0 ]
+  Printf: [ '1:1:4:%d\n', '2:1:8:%g\n' ]
+  Kernels:
+    - Name:            test_kernel
+      Language:        OpenCL C
+      LanguageVersion: [ 2, 0 ]
+      Attrs:
+        ReqdWorkGroupSize: [ 1, 2, 4 ]
+        WorkGroupSizeHint: [ 8, 16, 32 ]
+        VecTypeHint:       int
+.end_amdgpu_code_object_metadata
diff --git a/test/MC/AMDGPU/code-object-metadata-kernel-code-props.s b/test/MC/AMDGPU/code-object-metadata-kernel-code-props.s
new file mode 100644
index 0000000000000000000000000000000000000000..da4c8c1028d790a5925bc0142bb45ca67d737ee4
--- /dev/null
+++ b/test/MC/AMDGPU/code-object-metadata-kernel-code-props.s
@@ -0,0 +1,24 @@
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX700 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX800 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX900 %s
+
+// CHECK:  .amdgpu_code_object_metadata
+// CHECK:    Version: [ 1, 0 ]
+// CHECK:    Kernels:
+// CHECK:      - Name: test_kernel
+// CHECK:        CodeProps:
+// CHECK:          KernargSegmentSize:         24
+// CHECK:          WorkitemPrivateSegmentSize: 16
+// CHECK:          WavefrontNumSGPRs:          6
+// CHECK:          WorkitemNumVGPRs:           12
+.amdgpu_code_object_metadata
+  Version: [ 1, 0 ]
+  Printf: [ '1:1:4:%d\n', '2:1:8:%g\n' ]
+  Kernels:
+    - Name:            test_kernel
+      CodeProps:
+        KernargSegmentSize:         24
+        WorkitemPrivateSegmentSize: 16
+        WavefrontNumSGPRs:          6
+        WorkitemNumVGPRs:           12
+.end_amdgpu_code_object_metadata
diff --git a/test/MC/AMDGPU/code-object-metadata-kernel-debug-props.s b/test/MC/AMDGPU/code-object-metadata-kernel-debug-props.s
new file mode 100644
index 0000000000000000000000000000000000000000..4153737bf33a08304273e99c8ca9d7652d170edf
--- /dev/null
+++ b/test/MC/AMDGPU/code-object-metadata-kernel-debug-props.s
@@ -0,0 +1,26 @@
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX700 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX800 %s
+// RUN: llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -show-encoding %s | FileCheck --check-prefix=CHECK --check-prefix=GFX900 %s
+
+// CHECK:  .amdgpu_code_object_metadata
+// CHECK:    Version: [ 1, 0 ]
+// CHECK:    Kernels:
+// CHECK:      - Name: test_kernel
+// CHECK:        DebugProps:
+// CHECK:          DebuggerABIVersion:                [ 1, 0 ]
+// CHECK:          ReservedNumVGPRs:                  4
+// CHECK:          ReservedFirstVGPR:                 11
+// CHECK:          PrivateSegmentBufferSGPR:          0
+// CHECK:          WavefrontPrivateSegmentOffsetSGPR: 11
+.amdgpu_code_object_metadata
+  Version: [ 1, 0 ]
+  Printf: [ '1:1:4:%d\n', '2:1:8:%g\n' ]
+  Kernels:
+    - Name:            test_kernel
+      DebugProps:
+        DebuggerABIVersion:                [ 1, 0 ]
+        ReservedNumVGPRs:                  4
+        ReservedFirstVGPR:                 11
+        PrivateSegmentBufferSGPR:          0
+        WavefrontPrivateSegmentOffsetSGPR: 11
+.end_amdgpu_code_object_metadata
diff --git a/test/MC/AMDGPU/code-object-metadata-unknown-key.s b/test/MC/AMDGPU/code-object-metadata-unknown-key.s
new file mode 100644
index 0000000000000000000000000000000000000000..9add19f6e55ca1c9cabf9016f21c1e71f29f7cfd
--- /dev/null
+++ b/test/MC/AMDGPU/code-object-metadata-unknown-key.s
@@ -0,0 +1,41 @@
+// RUN: not llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx700 %s 2>&1 | FileCheck %s
+// RUN: not llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx800 %s 2>&1 | FileCheck %s
+// RUN: not llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 %s 2>&1 | FileCheck %s
+// RUN: not llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx700 -filetype=obj %s 2>&1 | FileCheck %s
+// RUN: not llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -filetype=obj %s 2>&1 | FileCheck %s
+// RUN: not llvm-mc -triple=amdgcn-amd-amdhsa -mcpu=gfx900 -filetype=obj %s 2>&1 | FileCheck %s
+
+// CHECK: error: unknown key 'UnknownKey'
+.amdgpu_code_object_metadata
+  UnknownKey: [ 2, 0 ]
+  Version: [ 1, 0 ]
+  Printf: [ '1:1:4:%d\n', '2:1:8:%g\n' ]
+  Kernels:
+    - Name:            test_kernel
+      Language:        OpenCL C
+      LanguageVersion: [ 2, 0 ]
+      Args:
+        - Size:          1
+          Align:         1
+          ValueKind:     ByValue
+          ValueType:     I8
+          AccQual:       Default
+          TypeName:      char
+        - Size:          8
+          Align:         8
+          ValueKind:     HiddenGlobalOffsetX
+          ValueType:     I64
+        - Size:          8
+          Align:         8
+          ValueKind:     HiddenGlobalOffsetY
+          ValueType:     I64
+        - Size:          8
+          Align:         8
+          ValueKind:     HiddenGlobalOffsetZ
+          ValueType:     I64
+        - Size:          8
+          Align:         8
+          ValueKind:     HiddenPrintfBuffer
+          ValueType:     I8
+          AddrSpaceQual: Global
+.end_amdgpu_code_object_metadata
diff --git a/test/MC/AMDGPU/ds.s b/test/MC/AMDGPU/ds.s
index 4b68a823dd22935ebea2e7e32615bc25eb468e77..bfa4a2f731147e479f1f01a301ea7814f37c8084 100644
--- a/test/MC/AMDGPU/ds.s
+++ b/test/MC/AMDGPU/ds.s
@@ -140,24 +140,32 @@ ds_max_f32 v2, v4
 // VI:   ds_max_f32 v2, v4 ; encoding: [0x00,0x00,0x26,0xd8,0x02,0x04,0x00,0x00]
 
 ds_gws_init v2 gds
-// SICI: ds_gws_init v2 gds ; encoding: [0x00,0x00,0x66,0xd8,0x02,0x00,0x00,0x00]
-// VI:   ds_gws_init v2 gds ; encoding: [0x00,0x00,0x33,0xd8,0x02,0x00,0x00,0x00]
+// SICI: ds_gws_init v2 gds ; encoding: [0x00,0x00,0x66,0xd8,0x00,0x02,0x00,0x00]
+// VI:   ds_gws_init v2 gds ; encoding: [0x00,0x00,0x33,0xd9,0x00,0x02,0x00,0x00]
 
-ds_gws_sema_v v2 gds
-// SICI: ds_gws_sema_v v2 gds ; encoding: [0x00,0x00,0x6a,0xd8,0x02,0x00,0x00,0x00]
-// VI:   ds_gws_sema_v v2 gds ; encoding: [0x00,0x00,0x35,0xd8,0x02,0x00,0x00,0x00]
+ds_gws_init v3 offset:12345 gds
+// SICI: ds_gws_init v3 offset:12345 gds ; encoding: [0x39,0x30,0x66,0xd8,0x00,0x03,0x00,0x00]
+// VI:   ds_gws_init v3 offset:12345 gds ; encoding: [0x39,0x30,0x33,0xd9,0x00,0x03,0x00,0x00]
+
+ds_gws_sema_v gds
+// SICI: ds_gws_sema_v gds ; encoding: [0x00,0x00,0x6a,0xd8,0x00,0x00,0x00,0x00]
+// VI:   ds_gws_sema_v gds ; encoding: [0x00,0x00,0x35,0xd9,0x00,0x00,0x00,0x00]
+
+ds_gws_sema_v offset:257 gds
+// SICI: ds_gws_sema_v offset:257 gds    ; encoding: [0x01,0x01,0x6a,0xd8,0x00,0x00,0x00,0x00]
+// VI:   ds_gws_sema_v offset:257 gds    ; encoding: [0x01,0x01,0x35,0xd9,0x00,0x00,0x00,0x00]
 
 ds_gws_sema_br v2 gds
-// SICI: ds_gws_sema_br v2 gds ; encoding: [0x00,0x00,0x6e,0xd8,0x02,0x00,0x00,0x00]
-// VI:   ds_gws_sema_br v2 gds ; encoding: [0x00,0x00,0x37,0xd8,0x02,0x00,0x00,0x00]
+// SICI: ds_gws_sema_br v2 gds ; encoding: [0x00,0x00,0x6e,0xd8,0x00,0x02,0x00,0x00]
+// VI:   ds_gws_sema_br v2 gds ; encoding: [0x00,0x00,0x37,0xd9,0x00,0x02,0x00,0x00]
 
-ds_gws_sema_p v2 gds
-// SICI: ds_gws_sema_p v2 gds ; encoding: [0x00,0x00,0x72,0xd8,0x02,0x00,0x00,0x00]
-// VI:   ds_gws_sema_p v2 gds ; encoding: [0x00,0x00,0x39,0xd8,0x02,0x00,0x00,0x00]
+ds_gws_sema_p gds
+// SICI: ds_gws_sema_p gds ; encoding: [0x00,0x00,0x72,0xd8,0x00,0x00,0x00,0x00]
+// VI:   ds_gws_sema_p gds ; encoding: [0x00,0x00,0x39,0xd9,0x00,0x00,0x00,0x00]
 
 ds_gws_barrier v2 gds
-// SICI: ds_gws_barrier v2 gds ; encoding: [0x00,0x00,0x76,0xd8,0x02,0x00,0x00,0x00]
-// VI:   ds_gws_barrier v2 gds ; encoding: [0x00,0x00,0x3b,0xd8,0x02,0x00,0x00,0x00]
+// SICI: ds_gws_barrier v2 gds ; encoding: [0x00,0x00,0x76,0xd8,0x00,0x02,0x00,0x00]
+// VI:   ds_gws_barrier v2 gds ; encoding: [0x00,0x00,0x3b,0xd9,0x00,0x02,0x00,0x00]
 
 ds_write_b8 v2, v4
 // SICI: ds_write_b8 v2, v4 ; encoding: [0x00,0x00,0x78,0xd8,0x02,0x04,0x00,0x00]
@@ -231,10 +239,18 @@ ds_wrxchg2_rtn_b32 v[8:9], v2, v4, v6
 // SICI: ds_wrxchg2_rtn_b32 v[8:9], v2, v4, v6 ; encoding: [0x00,0x00,0xb8,0xd8,0x02,0x04,0x06,0x08]
 // VI:   ds_wrxchg2_rtn_b32 v[8:9], v2, v4, v6 ; encoding: [0x00,0x00,0x5c,0xd8,0x02,0x04,0x06,0x08]
 
+ds_wrxchg2_rtn_b32 v[0:1], v0, v0, v0 offset0:127 offset1:255
+// SICI: ds_wrxchg2_rtn_b32 v[0:1], v0, v0, v0 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xb8,0xd8,0x00,0x00,0x00,0x00]
+// VI:   ds_wrxchg2_rtn_b32 v[0:1], v0, v0, v0 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5c,0xd8,0x00,0x00,0x00,0x00]
+
 ds_wrxchg2st64_rtn_b32 v[8:9] v2, v4, v6
 // SICI: ds_wrxchg2st64_rtn_b32 v[8:9], v2, v4, v6 ; encoding: [0x00,0x00,0xbc,0xd8,0x02,0x04,0x06,0x08]
 // VI:   ds_wrxchg2st64_rtn_b32 v[8:9], v2, v4, v6 ; encoding: [0x00,0x00,0x5e,0xd8,0x02,0x04,0x06,0x08]
 
+ds_wrxchg2st64_rtn_b32 v[0:1], v0, v255, v0 offset0:127 offset1:255
+// SICI: ds_wrxchg2st64_rtn_b32 v[0:1], v0, v255, v0 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xbc,0xd8,0x00,0xff,0x00,0x00]
+// VI:   ds_wrxchg2st64_rtn_b32 v[0:1], v0, v255, v0 offset0:127 offset1:255 ; encoding: [0x7f,0xff,0x5e,0xd8,0x00,0xff,0x00,0x00]
+
 ds_cmpst_rtn_b32 v8, v2, v4, v6
 // SICI: ds_cmpst_rtn_b32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0xc0,0xd8,0x02,0x04,0x06,0x08]
 // VI:   ds_cmpst_rtn_b32 v8, v2, v4, v6 ; encoding: [0x00,0x00,0x60,0xd8,0x02,0x04,0x06,0x08]
@@ -284,17 +300,17 @@ ds_read_u16 v8, v2
 // VI:   ds_read_u16 v8, v2 ; encoding: [0x00,0x00,0x78,0xd8,0x02,0x00,0x00,0x08]
 
 
-//ds_consume v8
-// FIXMESICI: ds_consume v8 ; encoding: [0x00,0x00,0xf4,0xd8,0x00,0x00,0x00,0x08]
-// FIXMEVI:   ds_consume v8 ; encoding: [0x00,0x00,0x7a,0xd8,0x00,0x00,0x00,0x08]
+ds_consume v8
+// SICI: ds_consume v8 ; encoding: [0x00,0x00,0xf4,0xd8,0x00,0x00,0x00,0x08]
+// VI:   ds_consume v8 ; encoding: [0x00,0x00,0x7a,0xd9,0x00,0x00,0x00,0x08]
 
-//ds_append v8
-// FIXMESICI: ds_append v8 ; encoding: [0x00,0x00,0xf8,0xd8,0x00,0x00,0x00,0x08]
-// FIXMEVI:   ds_append v8 ; encoding: [0x00,0x00,0x7c,0xd8,0x00,0x00,0x00,0x08]
+ds_append v8
+// SICI: ds_append v8 ; encoding: [0x00,0x00,0xf8,0xd8,0x00,0x00,0x00,0x08]
+// VI:   ds_append v8 ; encoding: [0x00,0x00,0x7c,0xd9,0x00,0x00,0x00,0x08]
 
-//ds_ordered_count v8, v2 gds
-// FIXMESICI: ds_ordered_count v8, v2 gds ; encoding: [0x00,0x00,0xfe,0xd8,0x02,0x00,0x00,0x08]
-// FIXMEVI:   ds_ordered_count v8, v2 gds ; encoding: [0x00,0x00,0x7f,0xd8,0x02,0x00,0x00,0x08]
+ds_ordered_count v8, v2 gds
+// SICI: ds_ordered_count v8, v2 gds ; encoding: [0x00,0x00,0xfe,0xd8,0x02,0x00,0x00,0x08]
+// VI:   ds_ordered_count v8, v2 gds ; encoding: [0x00,0x00,0x7f,0xd9,0x02,0x00,0x00,0x08]
 
 ds_add_u64 v2, v[4:5]
 // SICI: ds_add_u64 v2, v[4:5] ; encoding: [0x00,0x00,0x00,0xd9,0x02,0x04,0x00,0x00]
@@ -436,10 +452,18 @@ ds_wrxchg2_rtn_b64 v[8:11], v2, v[4:5], v[6:7]
 // SICI: ds_wrxchg2_rtn_b64 v[8:11], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xb8,0xd9,0x02,0x04,0x06,0x08]
 // VI:   ds_wrxchg2_rtn_b64 v[8:11], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xdc,0xd8,0x02,0x04,0x06,0x08]
 
+ds_wrxchg2_rtn_b64 v[0:3], v0, v[1:2], v[0:1] offset0:127 offset1:255
+// SICI: ds_wrxchg2_rtn_b64 v[0:3], v0, v[1:2], v[0:1] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xb8,0xd9,0x00,0x01,0x00,0x00]
+// VI:   ds_wrxchg2_rtn_b64 v[0:3], v0, v[1:2], v[0:1] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xdc,0xd8,0x00,0x01,0x00,0x00]
+
 ds_wrxchg2st64_rtn_b64 v[8:11], v2, v[4:5], v[6:7]
 // SICI: ds_wrxchg2st64_rtn_b64 v[8:11], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xbc,0xd9,0x02,0x04,0x06,0x08]
 // VI:   ds_wrxchg2st64_rtn_b64 v[8:11], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xde,0xd8,0x02,0x04,0x06,0x08]
 
+ds_wrxchg2st64_rtn_b64 v[0:3], v255, v[0:1], v[0:1] offset0:127 offset1:255
+// SICI: ds_wrxchg2st64_rtn_b64 v[0:3], v255, v[0:1], v[0:1] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xbc,0xd9,0xff,0x00,0x00,0x00]
+// VI:   ds_wrxchg2st64_rtn_b64 v[0:3], v255, v[0:1], v[0:1] offset0:127 offset1:255 ; encoding: [0x7f,0xff,0xde,0xd8,0xff,0x00,0x00,0x00]
+
 ds_cmpst_rtn_b64 v[8:9], v2, v[4:5], v[6:7]
 // SICI: ds_cmpst_rtn_b64 v[8:9], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xc0,0xd9,0x02,0x04,0x06,0x08]
 // VI:   ds_cmpst_rtn_b64 v[8:9], v2, v[4:5], v[6:7] ; encoding: [0x00,0x00,0xe0,0xd8,0x02,0x04,0x06,0x08]
@@ -468,3 +492,17 @@ ds_read2st64_b64 v[8:11], v2
 // SICI: ds_read2st64_b64 v[8:11], v2 ; encoding: [0x00,0x00,0xe0,0xd9,0x02,0x00,0x00,0x08]
 // VI:   ds_read2st64_b64 v[8:11], v2 ; encoding: [0x00,0x00,0xf0,0xd8,0x02,0x00,0x00,0x08]
 
+ds_read_b128 v[8:11], v2
+// NOSI: error: instruction not supported on this GPU
+// CI:   ds_read_b128 v[8:11], v2 ; encoding: [0x00,0x00,0xfc,0xdb,0x02,0x00,0x00,0x08]
+// VI:   ds_read_b128 v[8:11], v2 ; encoding: [0x00,0x00,0xfe,0xd9,0x02,0x00,0x00,0x08]
+
+ds_write_b128 v2, v[4:7]
+// NOSI: error: instruction not supported on this GPU
+// CI:   ds_write_b128 v2, v[4:7] ; encoding: [0x00,0x00,0x7c,0xdb,0x02,0x04,0x00,0x00]
+// VI:   ds_write_b128 v2, v[4:7] ; encoding: [0x00,0x00,0xbe,0xd9,0x02,0x04,0x00,0x00]
+
+ds_nop
+// NOSI: error: instruction not supported on this GPU
+// CI: ds_nop ; encoding: [0x00,0x00,0x50,0xd8,0x00,0x00,0x00,0x00]
+// VI: ds_nop ; encoding: [0x00,0x00,0x28,0xd8,0x00,0x00,0x00,0x00]
diff --git a/test/MC/AMDGPU/expressions.s b/test/MC/AMDGPU/expressions.s
index 9fc956628f1fdbef1e1d2b7fed5e7947904db1ed..e593bcd75610341c884004e92426ac4bb3cab7b3 100644
--- a/test/MC/AMDGPU/expressions.s
+++ b/test/MC/AMDGPU/expressions.s
@@ -11,7 +11,7 @@ s_mov_b32 s0, global
 
 // Use a token with the same name as a global
 ds_gws_init v2 gds
-// VI: ds_gws_init v2 gds ; encoding: [0x00,0x00,0x33,0xd8,0x02,0x00,0x00,0x00]
+// VI: ds_gws_init v2 gds ; encoding: [0x00,0x00,0x33,0xd9,0x00,0x02,0x00,0x00]
 
 // Use a global with the same name as a token
 s_mov_b32 s0, gds
diff --git a/test/MC/AMDGPU/gfx7_asm_all.s b/test/MC/AMDGPU/gfx7_asm_all.s
index 138cb94d1e70c4dc56f7cfa4235015060c0078d5..d1d864c3ffeba4f3afb05692301e82b06e38c9ef 100644
--- a/test/MC/AMDGPU/gfx7_asm_all.s
+++ b/test/MC/AMDGPU/gfx7_asm_all.s
@@ -1,2662 +1,2688 @@
 // RUN: llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s | FileCheck %s
 
-ds_add_u32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x00,0xd8,0x00,0x00,0x00,0x00]
+// *** GENERATED BY TESTGEN, DO NOT EDIT! ***
 
-ds_add_u32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x00,0xd8,0xff,0x00,0x00,0x00]
+ds_add_u32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x00,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_u32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x00,0xd8,0x00,0xff,0x00,0x00]
+ds_add_u32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x00,0xd8,0xff,0x02,0x00,0x00]
 
-ds_add_u32 v0, v0
-// CHECK: [0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0x00]
+ds_add_u32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x00,0xd8,0x01,0xff,0x00,0x00]
 
-ds_add_u32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0x00]
+ds_add_u32 v1, v2
+// CHECK: [0x00,0x00,0x00,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_u32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x00,0xd8,0x00,0x00,0x00,0x00]
+ds_add_u32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x00,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_u32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x02,0xd8,0x00,0x00,0x00,0x00]
+ds_add_u32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x00,0xd8,0x01,0x02,0x00,0x00]
 
-ds_sub_u32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x04,0xd8,0x00,0x00,0x00,0x00]
+ds_add_u32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x02,0xd8,0x01,0x02,0x00,0x00]
 
-ds_sub_u32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x04,0xd8,0xff,0x00,0x00,0x00]
+ds_sub_u32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x04,0xd8,0x01,0x02,0x00,0x00]
 
-ds_sub_u32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x04,0xd8,0x00,0xff,0x00,0x00]
+ds_sub_u32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x04,0xd8,0xff,0x02,0x00,0x00]
 
-ds_sub_u32 v0, v0
-// CHECK: [0x00,0x00,0x04,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_u32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x04,0xd8,0x01,0xff,0x00,0x00]
 
-ds_sub_u32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x04,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_u32 v1, v2
+// CHECK: [0x00,0x00,0x04,0xd8,0x01,0x02,0x00,0x00]
 
-ds_sub_u32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x04,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_u32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x04,0xd8,0x01,0x02,0x00,0x00]
 
-ds_sub_u32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x06,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_u32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x04,0xd8,0x01,0x02,0x00,0x00]
 
-ds_rsub_u32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x08,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_u32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x06,0xd8,0x01,0x02,0x00,0x00]
 
-ds_rsub_u32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x08,0xd8,0xff,0x00,0x00,0x00]
+ds_rsub_u32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x08,0xd8,0x01,0x02,0x00,0x00]
 
-ds_rsub_u32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x08,0xd8,0x00,0xff,0x00,0x00]
+ds_rsub_u32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x08,0xd8,0xff,0x02,0x00,0x00]
 
-ds_rsub_u32 v0, v0
-// CHECK: [0x00,0x00,0x08,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_u32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x08,0xd8,0x01,0xff,0x00,0x00]
 
-ds_rsub_u32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x08,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_u32 v1, v2
+// CHECK: [0x00,0x00,0x08,0xd8,0x01,0x02,0x00,0x00]
 
-ds_rsub_u32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x08,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_u32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x08,0xd8,0x01,0x02,0x00,0x00]
 
-ds_rsub_u32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x0a,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_u32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x08,0xd8,0x01,0x02,0x00,0x00]
 
-ds_inc_u32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x0c,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_u32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x0a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_inc_u32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x0c,0xd8,0xff,0x00,0x00,0x00]
+ds_inc_u32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x0c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_inc_u32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x0c,0xd8,0x00,0xff,0x00,0x00]
+ds_inc_u32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x0c,0xd8,0xff,0x02,0x00,0x00]
 
-ds_inc_u32 v0, v0
-// CHECK: [0x00,0x00,0x0c,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_u32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x0c,0xd8,0x01,0xff,0x00,0x00]
 
-ds_inc_u32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x0c,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_u32 v1, v2
+// CHECK: [0x00,0x00,0x0c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_inc_u32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x0c,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_u32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x0c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_inc_u32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x0e,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_u32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x0c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_dec_u32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x10,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_u32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x0e,0xd8,0x01,0x02,0x00,0x00]
 
-ds_dec_u32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x10,0xd8,0xff,0x00,0x00,0x00]
+ds_dec_u32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x10,0xd8,0x01,0x02,0x00,0x00]
 
-ds_dec_u32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x10,0xd8,0x00,0xff,0x00,0x00]
+ds_dec_u32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x10,0xd8,0xff,0x02,0x00,0x00]
 
-ds_dec_u32 v0, v0
-// CHECK: [0x00,0x00,0x10,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_u32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x10,0xd8,0x01,0xff,0x00,0x00]
 
-ds_dec_u32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x10,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_u32 v1, v2
+// CHECK: [0x00,0x00,0x10,0xd8,0x01,0x02,0x00,0x00]
 
-ds_dec_u32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x10,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_u32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x10,0xd8,0x01,0x02,0x00,0x00]
 
-ds_dec_u32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x12,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_u32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x10,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_i32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x14,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_u32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x12,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_i32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x14,0xd8,0xff,0x00,0x00,0x00]
+ds_min_i32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x14,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_i32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x14,0xd8,0x00,0xff,0x00,0x00]
+ds_min_i32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x14,0xd8,0xff,0x02,0x00,0x00]
 
-ds_min_i32 v0, v0
-// CHECK: [0x00,0x00,0x14,0xd8,0x00,0x00,0x00,0x00]
+ds_min_i32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x14,0xd8,0x01,0xff,0x00,0x00]
 
-ds_min_i32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x14,0xd8,0x00,0x00,0x00,0x00]
+ds_min_i32 v1, v2
+// CHECK: [0x00,0x00,0x14,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_i32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x14,0xd8,0x00,0x00,0x00,0x00]
+ds_min_i32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x14,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_i32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x16,0xd8,0x00,0x00,0x00,0x00]
+ds_min_i32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x14,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_i32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x18,0xd8,0x00,0x00,0x00,0x00]
+ds_min_i32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x16,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_i32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x18,0xd8,0xff,0x00,0x00,0x00]
+ds_max_i32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x18,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_i32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x18,0xd8,0x00,0xff,0x00,0x00]
+ds_max_i32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x18,0xd8,0xff,0x02,0x00,0x00]
 
-ds_max_i32 v0, v0
-// CHECK: [0x00,0x00,0x18,0xd8,0x00,0x00,0x00,0x00]
+ds_max_i32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x18,0xd8,0x01,0xff,0x00,0x00]
 
-ds_max_i32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x18,0xd8,0x00,0x00,0x00,0x00]
+ds_max_i32 v1, v2
+// CHECK: [0x00,0x00,0x18,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_i32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x18,0xd8,0x00,0x00,0x00,0x00]
+ds_max_i32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x18,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_i32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x1a,0xd8,0x00,0x00,0x00,0x00]
+ds_max_i32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x18,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_u32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x1c,0xd8,0x00,0x00,0x00,0x00]
+ds_max_i32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x1a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_u32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x1c,0xd8,0xff,0x00,0x00,0x00]
+ds_min_u32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x1c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_u32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x1c,0xd8,0x00,0xff,0x00,0x00]
+ds_min_u32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x1c,0xd8,0xff,0x02,0x00,0x00]
 
-ds_min_u32 v0, v0
-// CHECK: [0x00,0x00,0x1c,0xd8,0x00,0x00,0x00,0x00]
+ds_min_u32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x1c,0xd8,0x01,0xff,0x00,0x00]
 
-ds_min_u32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x1c,0xd8,0x00,0x00,0x00,0x00]
+ds_min_u32 v1, v2
+// CHECK: [0x00,0x00,0x1c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_u32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x1c,0xd8,0x00,0x00,0x00,0x00]
+ds_min_u32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x1c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_u32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x1e,0xd8,0x00,0x00,0x00,0x00]
+ds_min_u32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x1c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_u32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x20,0xd8,0x00,0x00,0x00,0x00]
+ds_min_u32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x1e,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_u32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x20,0xd8,0xff,0x00,0x00,0x00]
+ds_max_u32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x20,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_u32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x20,0xd8,0x00,0xff,0x00,0x00]
+ds_max_u32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x20,0xd8,0xff,0x02,0x00,0x00]
 
-ds_max_u32 v0, v0
-// CHECK: [0x00,0x00,0x20,0xd8,0x00,0x00,0x00,0x00]
+ds_max_u32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x20,0xd8,0x01,0xff,0x00,0x00]
 
-ds_max_u32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x20,0xd8,0x00,0x00,0x00,0x00]
+ds_max_u32 v1, v2
+// CHECK: [0x00,0x00,0x20,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_u32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x20,0xd8,0x00,0x00,0x00,0x00]
+ds_max_u32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x20,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_u32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x22,0xd8,0x00,0x00,0x00,0x00]
+ds_max_u32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x20,0xd8,0x01,0x02,0x00,0x00]
 
-ds_and_b32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x24,0xd8,0x00,0x00,0x00,0x00]
+ds_max_u32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x22,0xd8,0x01,0x02,0x00,0x00]
 
-ds_and_b32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x24,0xd8,0xff,0x00,0x00,0x00]
+ds_and_b32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x24,0xd8,0x01,0x02,0x00,0x00]
 
-ds_and_b32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x24,0xd8,0x00,0xff,0x00,0x00]
+ds_and_b32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x24,0xd8,0xff,0x02,0x00,0x00]
 
-ds_and_b32 v0, v0
-// CHECK: [0x00,0x00,0x24,0xd8,0x00,0x00,0x00,0x00]
+ds_and_b32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x24,0xd8,0x01,0xff,0x00,0x00]
 
-ds_and_b32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x24,0xd8,0x00,0x00,0x00,0x00]
+ds_and_b32 v1, v2
+// CHECK: [0x00,0x00,0x24,0xd8,0x01,0x02,0x00,0x00]
 
-ds_and_b32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x24,0xd8,0x00,0x00,0x00,0x00]
+ds_and_b32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x24,0xd8,0x01,0x02,0x00,0x00]
 
-ds_and_b32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x26,0xd8,0x00,0x00,0x00,0x00]
+ds_and_b32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x24,0xd8,0x01,0x02,0x00,0x00]
 
-ds_or_b32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x28,0xd8,0x00,0x00,0x00,0x00]
+ds_and_b32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x26,0xd8,0x01,0x02,0x00,0x00]
 
-ds_or_b32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x28,0xd8,0xff,0x00,0x00,0x00]
+ds_or_b32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x28,0xd8,0x01,0x02,0x00,0x00]
 
-ds_or_b32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x28,0xd8,0x00,0xff,0x00,0x00]
+ds_or_b32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x28,0xd8,0xff,0x02,0x00,0x00]
 
-ds_or_b32 v0, v0
-// CHECK: [0x00,0x00,0x28,0xd8,0x00,0x00,0x00,0x00]
+ds_or_b32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x28,0xd8,0x01,0xff,0x00,0x00]
 
-ds_or_b32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x28,0xd8,0x00,0x00,0x00,0x00]
+ds_or_b32 v1, v2
+// CHECK: [0x00,0x00,0x28,0xd8,0x01,0x02,0x00,0x00]
 
-ds_or_b32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x28,0xd8,0x00,0x00,0x00,0x00]
+ds_or_b32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x28,0xd8,0x01,0x02,0x00,0x00]
 
-ds_or_b32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x2a,0xd8,0x00,0x00,0x00,0x00]
+ds_or_b32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x28,0xd8,0x01,0x02,0x00,0x00]
 
-ds_xor_b32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x2c,0xd8,0x00,0x00,0x00,0x00]
+ds_or_b32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x2a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_xor_b32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x2c,0xd8,0xff,0x00,0x00,0x00]
+ds_xor_b32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x2c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_xor_b32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x2c,0xd8,0x00,0xff,0x00,0x00]
+ds_xor_b32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x2c,0xd8,0xff,0x02,0x00,0x00]
 
-ds_xor_b32 v0, v0
-// CHECK: [0x00,0x00,0x2c,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_b32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x2c,0xd8,0x01,0xff,0x00,0x00]
 
-ds_xor_b32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x2c,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_b32 v1, v2
+// CHECK: [0x00,0x00,0x2c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_xor_b32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x2c,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_b32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x2c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_xor_b32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x2e,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_b32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x2c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_mskor_b32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x30,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_b32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x2e,0xd8,0x01,0x02,0x00,0x00]
 
-ds_mskor_b32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x30,0xd8,0xff,0x00,0x00,0x00]
+ds_mskor_b32 v1, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x30,0xd8,0x01,0x02,0x03,0x00]
 
-ds_mskor_b32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x30,0xd8,0x00,0xff,0x00,0x00]
+ds_mskor_b32 v255, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x30,0xd8,0xff,0x02,0x03,0x00]
 
-ds_mskor_b32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x30,0xd8,0x00,0x00,0xff,0x00]
+ds_mskor_b32 v1, v255, v3 offset:65535
+// CHECK: [0xff,0xff,0x30,0xd8,0x01,0xff,0x03,0x00]
 
-ds_mskor_b32 v0, v0, v0
-// CHECK: [0x00,0x00,0x30,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_b32 v1, v2, v255 offset:65535
+// CHECK: [0xff,0xff,0x30,0xd8,0x01,0x02,0xff,0x00]
 
-ds_mskor_b32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x30,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_b32 v1, v2, v3
+// CHECK: [0x00,0x00,0x30,0xd8,0x01,0x02,0x03,0x00]
 
-ds_mskor_b32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x30,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_b32 v1, v2, v3 offset:0
+// CHECK: [0x00,0x00,0x30,0xd8,0x01,0x02,0x03,0x00]
 
-ds_mskor_b32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x32,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_b32 v1, v2, v3 offset:4
+// CHECK: [0x04,0x00,0x30,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write_b32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x34,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_b32 v1, v2, v3 offset:65535 gds
+// CHECK: [0xff,0xff,0x32,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write_b32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x34,0xd8,0xff,0x00,0x00,0x00]
+ds_write_b32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x34,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x34,0xd8,0x00,0xff,0x00,0x00]
+ds_write_b32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x34,0xd8,0xff,0x02,0x00,0x00]
 
-ds_write_b32 v0, v0
-// CHECK: [0x00,0x00,0x34,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x34,0xd8,0x01,0xff,0x00,0x00]
 
-ds_write_b32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x34,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b32 v1, v2
+// CHECK: [0x00,0x00,0x34,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x34,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x34,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x36,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x34,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write2_b32 v0, v0, v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x38,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x36,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write2_b32 v255, v0, v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x38,0xd8,0xff,0x00,0x00,0x00]
+ds_write2_b32 v1, v2, v3 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x38,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2_b32 v0, v255, v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x38,0xd8,0x00,0xff,0x00,0x00]
+ds_write2_b32 v255, v2, v3 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x38,0xd8,0xff,0x02,0x03,0x00]
 
-ds_write2_b32 v0, v0, v255 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x38,0xd8,0x00,0x00,0xff,0x00]
+ds_write2_b32 v1, v255, v3 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x38,0xd8,0x01,0xff,0x03,0x00]
 
-ds_write2_b32 v0, v0, v0 offset1:255
-// CHECK: [0x00,0xff,0x38,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b32 v1, v2, v255 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x38,0xd8,0x01,0x02,0xff,0x00]
 
-ds_write2_b32 v0, v0, v0 offset0:0 offset1:255
-// CHECK: [0x00,0xff,0x38,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b32 v1, v2, v3 offset1:255
+// CHECK: [0x00,0xff,0x38,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2_b32 v0, v0, v0 offset0:16 offset1:255
-// CHECK: [0x10,0xff,0x38,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b32 v1, v2, v3 offset0:0 offset1:255
+// CHECK: [0x00,0xff,0x38,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2_b32 v0, v0, v0 offset0:127
-// CHECK: [0x7f,0x00,0x38,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b32 v1, v2, v3 offset0:16 offset1:255
+// CHECK: [0x10,0xff,0x38,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2_b32 v0, v0, v0 offset0:127 offset1:0
-// CHECK: [0x7f,0x00,0x38,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b32 v1, v2, v3 offset0:127
+// CHECK: [0x7f,0x00,0x38,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2_b32 v0, v0, v0 offset0:127 offset1:1
-// CHECK: [0x7f,0x01,0x38,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b32 v1, v2, v3 offset0:127 offset1:0
+// CHECK: [0x7f,0x00,0x38,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2_b32 v0, v0, v0 offset0:127 offset1:255 gds
-// CHECK: [0x7f,0xff,0x3a,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b32 v1, v2, v3 offset0:127 offset1:1
+// CHECK: [0x7f,0x01,0x38,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b32 v0, v0, v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x3c,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b32 v1, v2, v3 offset0:127 offset1:255 gds
+// CHECK: [0x7f,0xff,0x3a,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b32 v255, v0, v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x3c,0xd8,0xff,0x00,0x00,0x00]
+ds_write2st64_b32 v1, v2, v3 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x3c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b32 v0, v255, v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x3c,0xd8,0x00,0xff,0x00,0x00]
+ds_write2st64_b32 v255, v2, v3 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x3c,0xd8,0xff,0x02,0x03,0x00]
 
-ds_write2st64_b32 v0, v0, v255 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x3c,0xd8,0x00,0x00,0xff,0x00]
+ds_write2st64_b32 v1, v255, v3 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x3c,0xd8,0x01,0xff,0x03,0x00]
 
-ds_write2st64_b32 v0, v0, v0 offset1:255
-// CHECK: [0x00,0xff,0x3c,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b32 v1, v2, v255 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x3c,0xd8,0x01,0x02,0xff,0x00]
 
-ds_write2st64_b32 v0, v0, v0 offset0:0 offset1:255
-// CHECK: [0x00,0xff,0x3c,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b32 v1, v2, v3 offset1:255
+// CHECK: [0x00,0xff,0x3c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b32 v0, v0, v0 offset0:16 offset1:255
-// CHECK: [0x10,0xff,0x3c,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b32 v1, v2, v3 offset0:0 offset1:255
+// CHECK: [0x00,0xff,0x3c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b32 v0, v0, v0 offset0:127
-// CHECK: [0x7f,0x00,0x3c,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b32 v1, v2, v3 offset0:16 offset1:255
+// CHECK: [0x10,0xff,0x3c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b32 v0, v0, v0 offset0:127 offset1:0
-// CHECK: [0x7f,0x00,0x3c,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b32 v1, v2, v3 offset0:127
+// CHECK: [0x7f,0x00,0x3c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b32 v0, v0, v0 offset0:127 offset1:1
-// CHECK: [0x7f,0x01,0x3c,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b32 v1, v2, v3 offset0:127 offset1:0
+// CHECK: [0x7f,0x00,0x3c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b32 v0, v0, v0 offset0:127 offset1:255 gds
-// CHECK: [0x7f,0xff,0x3e,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b32 v1, v2, v3 offset0:127 offset1:1
+// CHECK: [0x7f,0x01,0x3c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x40,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b32 v1, v2, v3 offset0:127 offset1:255 gds
+// CHECK: [0x7f,0xff,0x3e,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x40,0xd8,0xff,0x00,0x00,0x00]
+ds_cmpst_b32 v1, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x40,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x40,0xd8,0x00,0xff,0x00,0x00]
+ds_cmpst_b32 v255, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x40,0xd8,0xff,0x02,0x03,0x00]
 
-ds_cmpst_b32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x40,0xd8,0x00,0x00,0xff,0x00]
+ds_cmpst_b32 v1, v255, v3 offset:65535
+// CHECK: [0xff,0xff,0x40,0xd8,0x01,0xff,0x03,0x00]
 
-ds_cmpst_b32 v0, v0, v0
-// CHECK: [0x00,0x00,0x40,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_b32 v1, v2, v255 offset:65535
+// CHECK: [0xff,0xff,0x40,0xd8,0x01,0x02,0xff,0x00]
 
-ds_cmpst_b32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x40,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_b32 v1, v2, v3
+// CHECK: [0x00,0x00,0x40,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x40,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_b32 v1, v2, v3 offset:0
+// CHECK: [0x00,0x00,0x40,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x42,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_b32 v1, v2, v3 offset:4
+// CHECK: [0x04,0x00,0x40,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x44,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_b32 v1, v2, v3 offset:65535 gds
+// CHECK: [0xff,0xff,0x42,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x44,0xd8,0xff,0x00,0x00,0x00]
+ds_cmpst_f32 v1, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x44,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x44,0xd8,0x00,0xff,0x00,0x00]
+ds_cmpst_f32 v255, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x44,0xd8,0xff,0x02,0x03,0x00]
 
-ds_cmpst_f32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x44,0xd8,0x00,0x00,0xff,0x00]
+ds_cmpst_f32 v1, v255, v3 offset:65535
+// CHECK: [0xff,0xff,0x44,0xd8,0x01,0xff,0x03,0x00]
 
-ds_cmpst_f32 v0, v0, v0
-// CHECK: [0x00,0x00,0x44,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_f32 v1, v2, v255 offset:65535
+// CHECK: [0xff,0xff,0x44,0xd8,0x01,0x02,0xff,0x00]
 
-ds_cmpst_f32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x44,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_f32 v1, v2, v3
+// CHECK: [0x00,0x00,0x44,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x44,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_f32 v1, v2, v3 offset:0
+// CHECK: [0x00,0x00,0x44,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x46,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_f32 v1, v2, v3 offset:4
+// CHECK: [0x04,0x00,0x44,0xd8,0x01,0x02,0x03,0x00]
 
-ds_min_f32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x48,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_f32 v1, v2, v3 offset:65535 gds
+// CHECK: [0xff,0xff,0x46,0xd8,0x01,0x02,0x03,0x00]
 
-ds_min_f32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x48,0xd8,0xff,0x00,0x00,0x00]
+ds_min_f32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x48,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_f32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x48,0xd8,0x00,0xff,0x00,0x00]
+ds_min_f32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x48,0xd8,0xff,0x02,0x00,0x00]
 
-ds_min_f32 v0, v0
-// CHECK: [0x00,0x00,0x48,0xd8,0x00,0x00,0x00,0x00]
+ds_min_f32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x48,0xd8,0x01,0xff,0x00,0x00]
 
-ds_min_f32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x48,0xd8,0x00,0x00,0x00,0x00]
+ds_min_f32 v1, v2
+// CHECK: [0x00,0x00,0x48,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_f32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x48,0xd8,0x00,0x00,0x00,0x00]
+ds_min_f32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x48,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_f32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x4a,0xd8,0x00,0x00,0x00,0x00]
+ds_min_f32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x48,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_f32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x4c,0xd8,0x00,0x00,0x00,0x00]
+ds_min_f32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x4a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_f32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x4c,0xd8,0xff,0x00,0x00,0x00]
+ds_max_f32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x4c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_f32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x4c,0xd8,0x00,0xff,0x00,0x00]
+ds_max_f32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x4c,0xd8,0xff,0x02,0x00,0x00]
 
-ds_max_f32 v0, v0
-// CHECK: [0x00,0x00,0x4c,0xd8,0x00,0x00,0x00,0x00]
+ds_max_f32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x4c,0xd8,0x01,0xff,0x00,0x00]
 
-ds_max_f32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x4c,0xd8,0x00,0x00,0x00,0x00]
+ds_max_f32 v1, v2
+// CHECK: [0x00,0x00,0x4c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_f32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x4c,0xd8,0x00,0x00,0x00,0x00]
+ds_max_f32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x4c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_f32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x4e,0xd8,0x00,0x00,0x00,0x00]
+ds_max_f32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x4c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_gws_init v0 gds
-// CHECK: [0x00,0x00,0x66,0xd8,0x00,0x00,0x00,0x00]
+ds_max_f32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x4e,0xd8,0x01,0x02,0x00,0x00]
 
-ds_gws_sema_v v0 gds
+ds_gws_init v1 gds
+// CHECK: [0x00,0x00,0x66,0xd8,0x00,0x01,0x00,0x00]
+
+ds_gws_sema_v gds
 // CHECK: [0x00,0x00,0x6a,0xd8,0x00,0x00,0x00,0x00]
 
-ds_gws_sema_br v0 gds
-// CHECK: [0x00,0x00,0x6e,0xd8,0x00,0x00,0x00,0x00]
+ds_gws_sema_br v1 gds
+// CHECK: [0x00,0x00,0x6e,0xd8,0x00,0x01,0x00,0x00]
 
-ds_gws_sema_p v0 gds
+ds_gws_sema_p gds
 // CHECK: [0x00,0x00,0x72,0xd8,0x00,0x00,0x00,0x00]
 
-ds_gws_barrier v0 gds
-// CHECK: [0x00,0x00,0x76,0xd8,0x00,0x00,0x00,0x00]
+ds_gws_barrier v1 gds
+// CHECK: [0x00,0x00,0x76,0xd8,0x00,0x01,0x00,0x00]
+
+ds_gws_sema_release_all offset:65535 gds
+// CHECK: [0xff,0xff,0x62,0xd8,0x00,0x00,0x00,0x00]
 
-ds_write_b8 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x78,0xd8,0x00,0x00,0x00,0x00]
+ds_gws_sema_release_all gds
+// CHECK: [0x00,0x00,0x62,0xd8,0x00,0x00,0x00,0x00]
 
-ds_write_b8 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x78,0xd8,0xff,0x00,0x00,0x00]
+ds_write_b8 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x78,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b8 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x78,0xd8,0x00,0xff,0x00,0x00]
+ds_write_b8 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x78,0xd8,0xff,0x02,0x00,0x00]
 
-ds_write_b8 v0, v0
-// CHECK: [0x00,0x00,0x78,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b8 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x78,0xd8,0x01,0xff,0x00,0x00]
 
-ds_write_b8 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x78,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b8 v1, v2
+// CHECK: [0x00,0x00,0x78,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b8 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x78,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b8 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x78,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b8 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x7a,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b8 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x78,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b16 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x7c,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b8 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x7a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b16 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x7c,0xd8,0xff,0x00,0x00,0x00]
+ds_write_b16 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x7c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b16 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x7c,0xd8,0x00,0xff,0x00,0x00]
+ds_write_b16 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x7c,0xd8,0xff,0x02,0x00,0x00]
 
-ds_write_b16 v0, v0
-// CHECK: [0x00,0x00,0x7c,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b16 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x7c,0xd8,0x01,0xff,0x00,0x00]
 
-ds_write_b16 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x7c,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b16 v1, v2
+// CHECK: [0x00,0x00,0x7c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b16 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x7c,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b16 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x7c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b16 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x7e,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b16 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x7c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_rtn_u32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x80,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b16 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x7e,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_rtn_u32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x80,0xd8,0x00,0x00,0x00,0xff]
+ds_add_rtn_u32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x80,0xd8,0x01,0x02,0x00,0x05]
 
-ds_add_rtn_u32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x80,0xd8,0xff,0x00,0x00,0x00]
+ds_add_rtn_u32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x80,0xd8,0x01,0x02,0x00,0xff]
 
-ds_add_rtn_u32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x80,0xd8,0x00,0xff,0x00,0x00]
+ds_add_rtn_u32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x80,0xd8,0xff,0x02,0x00,0x05]
 
-ds_add_rtn_u32 v0, v0, v0
-// CHECK: [0x00,0x00,0x80,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_u32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x80,0xd8,0x01,0xff,0x00,0x05]
 
-ds_add_rtn_u32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x80,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_u32 v5, v1, v2
+// CHECK: [0x00,0x00,0x80,0xd8,0x01,0x02,0x00,0x05]
 
-ds_add_rtn_u32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x80,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_u32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x80,0xd8,0x01,0x02,0x00,0x05]
 
-ds_add_rtn_u32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x82,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_u32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x80,0xd8,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x84,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_u32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x82,0xd8,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x84,0xd8,0x00,0x00,0x00,0xff]
+ds_sub_rtn_u32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x84,0xd8,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x84,0xd8,0xff,0x00,0x00,0x00]
+ds_sub_rtn_u32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x84,0xd8,0x01,0x02,0x00,0xff]
 
-ds_sub_rtn_u32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x84,0xd8,0x00,0xff,0x00,0x00]
+ds_sub_rtn_u32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x84,0xd8,0xff,0x02,0x00,0x05]
 
-ds_sub_rtn_u32 v0, v0, v0
-// CHECK: [0x00,0x00,0x84,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x84,0xd8,0x01,0xff,0x00,0x05]
 
-ds_sub_rtn_u32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x84,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u32 v5, v1, v2
+// CHECK: [0x00,0x00,0x84,0xd8,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x84,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x84,0xd8,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x86,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x84,0xd8,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x88,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x86,0xd8,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x88,0xd8,0x00,0x00,0x00,0xff]
+ds_rsub_rtn_u32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x88,0xd8,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x88,0xd8,0xff,0x00,0x00,0x00]
+ds_rsub_rtn_u32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x88,0xd8,0x01,0x02,0x00,0xff]
 
-ds_rsub_rtn_u32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x88,0xd8,0x00,0xff,0x00,0x00]
+ds_rsub_rtn_u32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x88,0xd8,0xff,0x02,0x00,0x05]
 
-ds_rsub_rtn_u32 v0, v0, v0
-// CHECK: [0x00,0x00,0x88,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x88,0xd8,0x01,0xff,0x00,0x05]
 
-ds_rsub_rtn_u32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x88,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u32 v5, v1, v2
+// CHECK: [0x00,0x00,0x88,0xd8,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x88,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x88,0xd8,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x8a,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x88,0xd8,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x8c,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x8a,0xd8,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x8c,0xd8,0x00,0x00,0x00,0xff]
+ds_inc_rtn_u32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x8c,0xd8,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x8c,0xd8,0xff,0x00,0x00,0x00]
+ds_inc_rtn_u32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x8c,0xd8,0x01,0x02,0x00,0xff]
 
-ds_inc_rtn_u32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x8c,0xd8,0x00,0xff,0x00,0x00]
+ds_inc_rtn_u32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x8c,0xd8,0xff,0x02,0x00,0x05]
 
-ds_inc_rtn_u32 v0, v0, v0
-// CHECK: [0x00,0x00,0x8c,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x8c,0xd8,0x01,0xff,0x00,0x05]
 
-ds_inc_rtn_u32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x8c,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u32 v5, v1, v2
+// CHECK: [0x00,0x00,0x8c,0xd8,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x8c,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x8c,0xd8,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x8e,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x8c,0xd8,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x90,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x8e,0xd8,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x90,0xd8,0x00,0x00,0x00,0xff]
+ds_dec_rtn_u32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x90,0xd8,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x90,0xd8,0xff,0x00,0x00,0x00]
+ds_dec_rtn_u32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x90,0xd8,0x01,0x02,0x00,0xff]
 
-ds_dec_rtn_u32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x90,0xd8,0x00,0xff,0x00,0x00]
+ds_dec_rtn_u32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x90,0xd8,0xff,0x02,0x00,0x05]
 
-ds_dec_rtn_u32 v0, v0, v0
-// CHECK: [0x00,0x00,0x90,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x90,0xd8,0x01,0xff,0x00,0x05]
 
-ds_dec_rtn_u32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x90,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u32 v5, v1, v2
+// CHECK: [0x00,0x00,0x90,0xd8,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x90,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x90,0xd8,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x92,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x90,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x94,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x92,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x94,0xd8,0x00,0x00,0x00,0xff]
+ds_min_rtn_i32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x94,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x94,0xd8,0xff,0x00,0x00,0x00]
+ds_min_rtn_i32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x94,0xd8,0x01,0x02,0x00,0xff]
 
-ds_min_rtn_i32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x94,0xd8,0x00,0xff,0x00,0x00]
+ds_min_rtn_i32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x94,0xd8,0xff,0x02,0x00,0x05]
 
-ds_min_rtn_i32 v0, v0, v0
-// CHECK: [0x00,0x00,0x94,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_i32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x94,0xd8,0x01,0xff,0x00,0x05]
 
-ds_min_rtn_i32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x94,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_i32 v5, v1, v2
+// CHECK: [0x00,0x00,0x94,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x94,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_i32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x94,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x96,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_i32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x94,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x98,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_i32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x96,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x98,0xd8,0x00,0x00,0x00,0xff]
+ds_max_rtn_i32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x98,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x98,0xd8,0xff,0x00,0x00,0x00]
+ds_max_rtn_i32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x98,0xd8,0x01,0x02,0x00,0xff]
 
-ds_max_rtn_i32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x98,0xd8,0x00,0xff,0x00,0x00]
+ds_max_rtn_i32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x98,0xd8,0xff,0x02,0x00,0x05]
 
-ds_max_rtn_i32 v0, v0, v0
-// CHECK: [0x00,0x00,0x98,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_i32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x98,0xd8,0x01,0xff,0x00,0x05]
 
-ds_max_rtn_i32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x98,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_i32 v5, v1, v2
+// CHECK: [0x00,0x00,0x98,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x98,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_i32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x98,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x9a,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_i32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x98,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x9c,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_i32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x9a,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x9c,0xd8,0x00,0x00,0x00,0xff]
+ds_min_rtn_u32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x9c,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x9c,0xd8,0xff,0x00,0x00,0x00]
+ds_min_rtn_u32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x9c,0xd8,0x01,0x02,0x00,0xff]
 
-ds_min_rtn_u32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x9c,0xd8,0x00,0xff,0x00,0x00]
+ds_min_rtn_u32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x9c,0xd8,0xff,0x02,0x00,0x05]
 
-ds_min_rtn_u32 v0, v0, v0
-// CHECK: [0x00,0x00,0x9c,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_u32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x9c,0xd8,0x01,0xff,0x00,0x05]
 
-ds_min_rtn_u32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x9c,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_u32 v5, v1, v2
+// CHECK: [0x00,0x00,0x9c,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x9c,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_u32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x9c,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x9e,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_u32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x9c,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xa0,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_u32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x9e,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xa0,0xd8,0x00,0x00,0x00,0xff]
+ds_max_rtn_u32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0xa0,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0xa0,0xd8,0xff,0x00,0x00,0x00]
+ds_max_rtn_u32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0xa0,0xd8,0x01,0x02,0x00,0xff]
 
-ds_max_rtn_u32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0xa0,0xd8,0x00,0xff,0x00,0x00]
+ds_max_rtn_u32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0xa0,0xd8,0xff,0x02,0x00,0x05]
 
-ds_max_rtn_u32 v0, v0, v0
-// CHECK: [0x00,0x00,0xa0,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_u32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0xa0,0xd8,0x01,0xff,0x00,0x05]
 
-ds_max_rtn_u32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0xa0,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_u32 v5, v1, v2
+// CHECK: [0x00,0x00,0xa0,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0xa0,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_u32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0xa0,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xa2,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_u32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0xa0,0xd8,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xa4,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_u32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0xa2,0xd8,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xa4,0xd8,0x00,0x00,0x00,0xff]
+ds_and_rtn_b32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0xa4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0xa4,0xd8,0xff,0x00,0x00,0x00]
+ds_and_rtn_b32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0xa4,0xd8,0x01,0x02,0x00,0xff]
 
-ds_and_rtn_b32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0xa4,0xd8,0x00,0xff,0x00,0x00]
+ds_and_rtn_b32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0xa4,0xd8,0xff,0x02,0x00,0x05]
 
-ds_and_rtn_b32 v0, v0, v0
-// CHECK: [0x00,0x00,0xa4,0xd8,0x00,0x00,0x00,0x00]
+ds_and_rtn_b32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0xa4,0xd8,0x01,0xff,0x00,0x05]
 
-ds_and_rtn_b32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0xa4,0xd8,0x00,0x00,0x00,0x00]
+ds_and_rtn_b32 v5, v1, v2
+// CHECK: [0x00,0x00,0xa4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0xa4,0xd8,0x00,0x00,0x00,0x00]
+ds_and_rtn_b32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0xa4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xa6,0xd8,0x00,0x00,0x00,0x00]
+ds_and_rtn_b32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0xa4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xa8,0xd8,0x00,0x00,0x00,0x00]
+ds_and_rtn_b32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0xa6,0xd8,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xa8,0xd8,0x00,0x00,0x00,0xff]
+ds_or_rtn_b32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0xa8,0xd8,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0xa8,0xd8,0xff,0x00,0x00,0x00]
+ds_or_rtn_b32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0xa8,0xd8,0x01,0x02,0x00,0xff]
 
-ds_or_rtn_b32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0xa8,0xd8,0x00,0xff,0x00,0x00]
+ds_or_rtn_b32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0xa8,0xd8,0xff,0x02,0x00,0x05]
 
-ds_or_rtn_b32 v0, v0, v0
-// CHECK: [0x00,0x00,0xa8,0xd8,0x00,0x00,0x00,0x00]
+ds_or_rtn_b32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0xa8,0xd8,0x01,0xff,0x00,0x05]
 
-ds_or_rtn_b32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0xa8,0xd8,0x00,0x00,0x00,0x00]
+ds_or_rtn_b32 v5, v1, v2
+// CHECK: [0x00,0x00,0xa8,0xd8,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0xa8,0xd8,0x00,0x00,0x00,0x00]
+ds_or_rtn_b32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0xa8,0xd8,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xaa,0xd8,0x00,0x00,0x00,0x00]
+ds_or_rtn_b32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0xa8,0xd8,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xac,0xd8,0x00,0x00,0x00,0x00]
+ds_or_rtn_b32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0xaa,0xd8,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xac,0xd8,0x00,0x00,0x00,0xff]
+ds_xor_rtn_b32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0xac,0xd8,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0xac,0xd8,0xff,0x00,0x00,0x00]
+ds_xor_rtn_b32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0xac,0xd8,0x01,0x02,0x00,0xff]
 
-ds_xor_rtn_b32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0xac,0xd8,0x00,0xff,0x00,0x00]
+ds_xor_rtn_b32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0xac,0xd8,0xff,0x02,0x00,0x05]
 
-ds_xor_rtn_b32 v0, v0, v0
-// CHECK: [0x00,0x00,0xac,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0xac,0xd8,0x01,0xff,0x00,0x05]
 
-ds_xor_rtn_b32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0xac,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b32 v5, v1, v2
+// CHECK: [0x00,0x00,0xac,0xd8,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0xac,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0xac,0xd8,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xae,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0xac,0xd8,0x01,0x02,0x00,0x05]
 
-ds_mskor_rtn_b32 v0, v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xb0,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0xae,0xd8,0x01,0x02,0x00,0x05]
 
-ds_mskor_rtn_b32 v255, v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xb0,0xd8,0x00,0x00,0x00,0xff]
+ds_mskor_rtn_b32 v5, v1, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0xb0,0xd8,0x01,0x02,0x03,0x05]
 
-ds_mskor_rtn_b32 v0, v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xb0,0xd8,0xff,0x00,0x00,0x00]
+ds_mskor_rtn_b32 v255, v1, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0xb0,0xd8,0x01,0x02,0x03,0xff]
 
-ds_mskor_rtn_b32 v0, v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0xb0,0xd8,0x00,0xff,0x00,0x00]
+ds_mskor_rtn_b32 v5, v255, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0xb0,0xd8,0xff,0x02,0x03,0x05]
 
-ds_mskor_rtn_b32 v0, v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0xb0,0xd8,0x00,0x00,0xff,0x00]
+ds_mskor_rtn_b32 v5, v1, v255, v3 offset:65535
+// CHECK: [0xff,0xff,0xb0,0xd8,0x01,0xff,0x03,0x05]
 
-ds_mskor_rtn_b32 v0, v0, v0, v0
-// CHECK: [0x00,0x00,0xb0,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b32 v5, v1, v2, v255 offset:65535
+// CHECK: [0xff,0xff,0xb0,0xd8,0x01,0x02,0xff,0x05]
 
-ds_mskor_rtn_b32 v0, v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0xb0,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b32 v5, v1, v2, v3
+// CHECK: [0x00,0x00,0xb0,0xd8,0x01,0x02,0x03,0x05]
 
-ds_mskor_rtn_b32 v0, v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0xb0,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b32 v5, v1, v2, v3 offset:0
+// CHECK: [0x00,0x00,0xb0,0xd8,0x01,0x02,0x03,0x05]
 
-ds_mskor_rtn_b32 v0, v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xb2,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b32 v5, v1, v2, v3 offset:4
+// CHECK: [0x04,0x00,0xb0,0xd8,0x01,0x02,0x03,0x05]
 
-ds_wrxchg_rtn_b32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xb4,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b32 v5, v1, v2, v3 offset:65535 gds
+// CHECK: [0xff,0xff,0xb2,0xd8,0x01,0x02,0x03,0x05]
 
-ds_wrxchg_rtn_b32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xb4,0xd8,0x00,0x00,0x00,0xff]
+ds_wrxchg_rtn_b32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0xb4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_wrxchg_rtn_b32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0xb4,0xd8,0xff,0x00,0x00,0x00]
+ds_wrxchg_rtn_b32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0xb4,0xd8,0x01,0x02,0x00,0xff]
 
-ds_wrxchg_rtn_b32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0xb4,0xd8,0x00,0xff,0x00,0x00]
+ds_wrxchg_rtn_b32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0xb4,0xd8,0xff,0x02,0x00,0x05]
 
-ds_wrxchg_rtn_b32 v0, v0, v0
-// CHECK: [0x00,0x00,0xb4,0xd8,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0xb4,0xd8,0x01,0xff,0x00,0x05]
 
-ds_wrxchg_rtn_b32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0xb4,0xd8,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b32 v5, v1, v2
+// CHECK: [0x00,0x00,0xb4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_wrxchg_rtn_b32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0xb4,0xd8,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0xb4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_wrxchg_rtn_b32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xb6,0xd8,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0xb4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_cmpst_rtn_b32 v0, v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xc0,0xd8,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0xb6,0xd8,0x01,0x02,0x00,0x05]
 
-ds_cmpst_rtn_b32 v255, v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xc0,0xd8,0x00,0x00,0x00,0xff]
+ds_cmpst_rtn_b32 v5, v1, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0xc0,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_b32 v0, v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xc0,0xd8,0xff,0x00,0x00,0x00]
+ds_cmpst_rtn_b32 v255, v1, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0xc0,0xd8,0x01,0x02,0x03,0xff]
 
-ds_cmpst_rtn_b32 v0, v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0xc0,0xd8,0x00,0xff,0x00,0x00]
+ds_cmpst_rtn_b32 v5, v255, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0xc0,0xd8,0xff,0x02,0x03,0x05]
 
-ds_cmpst_rtn_b32 v0, v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0xc0,0xd8,0x00,0x00,0xff,0x00]
+ds_cmpst_rtn_b32 v5, v1, v255, v3 offset:65535
+// CHECK: [0xff,0xff,0xc0,0xd8,0x01,0xff,0x03,0x05]
 
-ds_cmpst_rtn_b32 v0, v0, v0, v0
-// CHECK: [0x00,0x00,0xc0,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b32 v5, v1, v2, v255 offset:65535
+// CHECK: [0xff,0xff,0xc0,0xd8,0x01,0x02,0xff,0x05]
 
-ds_cmpst_rtn_b32 v0, v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0xc0,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b32 v5, v1, v2, v3
+// CHECK: [0x00,0x00,0xc0,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_b32 v0, v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0xc0,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b32 v5, v1, v2, v3 offset:0
+// CHECK: [0x00,0x00,0xc0,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_b32 v0, v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xc2,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b32 v5, v1, v2, v3 offset:4
+// CHECK: [0x04,0x00,0xc0,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f32 v0, v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xc4,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b32 v5, v1, v2, v3 offset:65535 gds
+// CHECK: [0xff,0xff,0xc2,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f32 v255, v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xc4,0xd8,0x00,0x00,0x00,0xff]
+ds_cmpst_rtn_f32 v5, v1, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0xc4,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f32 v0, v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xc4,0xd8,0xff,0x00,0x00,0x00]
+ds_cmpst_rtn_f32 v255, v1, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0xc4,0xd8,0x01,0x02,0x03,0xff]
 
-ds_cmpst_rtn_f32 v0, v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0xc4,0xd8,0x00,0xff,0x00,0x00]
+ds_cmpst_rtn_f32 v5, v255, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0xc4,0xd8,0xff,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f32 v0, v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0xc4,0xd8,0x00,0x00,0xff,0x00]
+ds_cmpst_rtn_f32 v5, v1, v255, v3 offset:65535
+// CHECK: [0xff,0xff,0xc4,0xd8,0x01,0xff,0x03,0x05]
 
-ds_cmpst_rtn_f32 v0, v0, v0, v0
-// CHECK: [0x00,0x00,0xc4,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f32 v5, v1, v2, v255 offset:65535
+// CHECK: [0xff,0xff,0xc4,0xd8,0x01,0x02,0xff,0x05]
 
-ds_cmpst_rtn_f32 v0, v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0xc4,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f32 v5, v1, v2, v3
+// CHECK: [0x00,0x00,0xc4,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f32 v0, v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0xc4,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f32 v5, v1, v2, v3 offset:0
+// CHECK: [0x00,0x00,0xc4,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f32 v0, v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xc6,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f32 v5, v1, v2, v3 offset:4
+// CHECK: [0x04,0x00,0xc4,0xd8,0x01,0x02,0x03,0x05]
 
-ds_min_rtn_f32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xc8,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f32 v5, v1, v2, v3 offset:65535 gds
+// CHECK: [0xff,0xff,0xc6,0xd8,0x01,0x02,0x03,0x05]
 
-ds_min_rtn_f32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xc8,0xd8,0x00,0x00,0x00,0xff]
+ds_min_rtn_f32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0xc8,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_f32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0xc8,0xd8,0xff,0x00,0x00,0x00]
+ds_min_rtn_f32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0xc8,0xd8,0x01,0x02,0x00,0xff]
 
-ds_min_rtn_f32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0xc8,0xd8,0x00,0xff,0x00,0x00]
+ds_min_rtn_f32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0xc8,0xd8,0xff,0x02,0x00,0x05]
 
-ds_min_rtn_f32 v0, v0, v0
-// CHECK: [0x00,0x00,0xc8,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_f32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0xc8,0xd8,0x01,0xff,0x00,0x05]
 
-ds_min_rtn_f32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0xc8,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_f32 v5, v1, v2
+// CHECK: [0x00,0x00,0xc8,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_f32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0xc8,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_f32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0xc8,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_f32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xca,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_f32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0xc8,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xcc,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_f32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0xca,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xcc,0xd8,0x00,0x00,0x00,0xff]
+ds_max_rtn_f32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0xcc,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0xcc,0xd8,0xff,0x00,0x00,0x00]
+ds_max_rtn_f32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0xcc,0xd8,0x01,0x02,0x00,0xff]
 
-ds_max_rtn_f32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0xcc,0xd8,0x00,0xff,0x00,0x00]
+ds_max_rtn_f32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0xcc,0xd8,0xff,0x02,0x00,0x05]
 
-ds_max_rtn_f32 v0, v0, v0
-// CHECK: [0x00,0x00,0xcc,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_f32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0xcc,0xd8,0x01,0xff,0x00,0x05]
 
-ds_max_rtn_f32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0xcc,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_f32 v5, v1, v2
+// CHECK: [0x00,0x00,0xcc,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0xcc,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_f32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0xcc,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xce,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_f32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0xcc,0xd8,0x01,0x02,0x00,0x05]
 
-ds_swizzle_b32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xd4,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_f32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0xce,0xd8,0x01,0x02,0x00,0x05]
 
-ds_swizzle_b32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0xd4,0xd8,0x00,0x00,0x00,0xff]
+ds_swizzle_b32 v5, v1 offset:65535
+// CHECK: [0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0x05]
 
-ds_swizzle_b32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0xd4,0xd8,0xff,0x00,0x00,0x00]
+ds_swizzle_b32 v255, v1 offset:65535
+// CHECK: [0xff,0xff,0xd4,0xd8,0x01,0x00,0x00,0xff]
 
-ds_swizzle_b32 v0, v0
-// CHECK: [0x00,0x00,0xd4,0xd8,0x00,0x00,0x00,0x00]
+ds_swizzle_b32 v5, v255 offset:65535
+// CHECK: [0xff,0xff,0xd4,0xd8,0xff,0x00,0x00,0x05]
 
-ds_swizzle_b32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0xd4,0xd8,0x00,0x00,0x00,0x00]
+ds_swizzle_b32 v5, v1
+// CHECK: [0x00,0x00,0xd4,0xd8,0x01,0x00,0x00,0x05]
 
-ds_swizzle_b32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0xd4,0xd8,0x00,0x00,0x00,0x00]
+ds_swizzle_b32 v5, v1 offset:0
+// CHECK: [0x00,0x00,0xd4,0xd8,0x01,0x00,0x00,0x05]
 
-ds_swizzle_b32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xd6,0xd8,0x00,0x00,0x00,0x00]
+ds_swizzle_b32 v5, v1 offset:4
+// CHECK: [0x04,0x00,0xd4,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_b32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xd8,0xd8,0x00,0x00,0x00,0x00]
+ds_swizzle_b32 v5, v1 offset:65535 gds
+// CHECK: [0xff,0xff,0xd6,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_b32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0xd8,0xd8,0x00,0x00,0x00,0xff]
+ds_read_b32 v5, v1 offset:65535
+// CHECK: [0xff,0xff,0xd8,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_b32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0xd8,0xd8,0xff,0x00,0x00,0x00]
+ds_read_b32 v255, v1 offset:65535
+// CHECK: [0xff,0xff,0xd8,0xd8,0x01,0x00,0x00,0xff]
 
-ds_read_b32 v0, v0
-// CHECK: [0x00,0x00,0xd8,0xd8,0x00,0x00,0x00,0x00]
+ds_read_b32 v5, v255 offset:65535
+// CHECK: [0xff,0xff,0xd8,0xd8,0xff,0x00,0x00,0x05]
 
-ds_read_b32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0xd8,0xd8,0x00,0x00,0x00,0x00]
+ds_read_b32 v5, v1
+// CHECK: [0x00,0x00,0xd8,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_b32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0xd8,0xd8,0x00,0x00,0x00,0x00]
+ds_read_b32 v5, v1 offset:0
+// CHECK: [0x00,0x00,0xd8,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_b32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xda,0xd8,0x00,0x00,0x00,0x00]
+ds_read_b32 v5, v1 offset:4
+// CHECK: [0x04,0x00,0xd8,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b32 v[0:1], v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0xdc,0xd8,0x00,0x00,0x00,0x00]
+ds_read_b32 v5, v1 offset:65535 gds
+// CHECK: [0xff,0xff,0xda,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b32 v[254:255], v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0xdc,0xd8,0x00,0x00,0x00,0xfe]
+ds_read2_b32 v[5:6], v1 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0xdc,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b32 v[0:1], v255 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0xdc,0xd8,0xff,0x00,0x00,0x00]
+ds_read2_b32 v[254:255], v1 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0xdc,0xd8,0x01,0x00,0x00,0xfe]
 
-ds_read2_b32 v[0:1], v0 offset1:255
-// CHECK: [0x00,0xff,0xdc,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b32 v[5:6], v255 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0xdc,0xd8,0xff,0x00,0x00,0x05]
 
-ds_read2_b32 v[0:1], v0 offset0:0 offset1:255
-// CHECK: [0x00,0xff,0xdc,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b32 v[5:6], v1 offset1:255
+// CHECK: [0x00,0xff,0xdc,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b32 v[0:1], v0 offset0:16 offset1:255
-// CHECK: [0x10,0xff,0xdc,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b32 v[5:6], v1 offset0:0 offset1:255
+// CHECK: [0x00,0xff,0xdc,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b32 v[0:1], v0 offset0:127
-// CHECK: [0x7f,0x00,0xdc,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b32 v[5:6], v1 offset0:16 offset1:255
+// CHECK: [0x10,0xff,0xdc,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b32 v[0:1], v0 offset0:127 offset1:0
-// CHECK: [0x7f,0x00,0xdc,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b32 v[5:6], v1 offset0:127
+// CHECK: [0x7f,0x00,0xdc,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b32 v[0:1], v0 offset0:127 offset1:1
-// CHECK: [0x7f,0x01,0xdc,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b32 v[5:6], v1 offset0:127 offset1:0
+// CHECK: [0x7f,0x00,0xdc,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b32 v[0:1], v0 offset0:127 offset1:255 gds
-// CHECK: [0x7f,0xff,0xde,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b32 v[5:6], v1 offset0:127 offset1:1
+// CHECK: [0x7f,0x01,0xdc,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b32 v[0:1], v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0xe0,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b32 v[5:6], v1 offset0:127 offset1:255 gds
+// CHECK: [0x7f,0xff,0xde,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b32 v[254:255], v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0xe0,0xd8,0x00,0x00,0x00,0xfe]
+ds_read2st64_b32 v[5:6], v1 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0xe0,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b32 v[0:1], v255 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0xe0,0xd8,0xff,0x00,0x00,0x00]
+ds_read2st64_b32 v[254:255], v1 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0xe0,0xd8,0x01,0x00,0x00,0xfe]
 
-ds_read2st64_b32 v[0:1], v0 offset1:255
-// CHECK: [0x00,0xff,0xe0,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b32 v[5:6], v255 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0xe0,0xd8,0xff,0x00,0x00,0x05]
 
-ds_read2st64_b32 v[0:1], v0 offset0:0 offset1:255
-// CHECK: [0x00,0xff,0xe0,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b32 v[5:6], v1 offset1:255
+// CHECK: [0x00,0xff,0xe0,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b32 v[0:1], v0 offset0:16 offset1:255
-// CHECK: [0x10,0xff,0xe0,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b32 v[5:6], v1 offset0:0 offset1:255
+// CHECK: [0x00,0xff,0xe0,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b32 v[0:1], v0 offset0:127
-// CHECK: [0x7f,0x00,0xe0,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b32 v[5:6], v1 offset0:16 offset1:255
+// CHECK: [0x10,0xff,0xe0,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b32 v[0:1], v0 offset0:127 offset1:0
-// CHECK: [0x7f,0x00,0xe0,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b32 v[5:6], v1 offset0:127
+// CHECK: [0x7f,0x00,0xe0,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b32 v[0:1], v0 offset0:127 offset1:1
-// CHECK: [0x7f,0x01,0xe0,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b32 v[5:6], v1 offset0:127 offset1:0
+// CHECK: [0x7f,0x00,0xe0,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b32 v[0:1], v0 offset0:127 offset1:255 gds
-// CHECK: [0x7f,0xff,0xe2,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b32 v[5:6], v1 offset0:127 offset1:1
+// CHECK: [0x7f,0x01,0xe0,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i8 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xe4,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b32 v[5:6], v1 offset0:127 offset1:255 gds
+// CHECK: [0x7f,0xff,0xe2,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i8 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0xe4,0xd8,0x00,0x00,0x00,0xff]
+ds_read_i8 v5, v1 offset:65535
+// CHECK: [0xff,0xff,0xe4,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i8 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0xe4,0xd8,0xff,0x00,0x00,0x00]
+ds_read_i8 v255, v1 offset:65535
+// CHECK: [0xff,0xff,0xe4,0xd8,0x01,0x00,0x00,0xff]
 
-ds_read_i8 v0, v0
-// CHECK: [0x00,0x00,0xe4,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i8 v5, v255 offset:65535
+// CHECK: [0xff,0xff,0xe4,0xd8,0xff,0x00,0x00,0x05]
 
-ds_read_i8 v0, v0 offset:0
-// CHECK: [0x00,0x00,0xe4,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i8 v5, v1
+// CHECK: [0x00,0x00,0xe4,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i8 v0, v0 offset:4
-// CHECK: [0x04,0x00,0xe4,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i8 v5, v1 offset:0
+// CHECK: [0x00,0x00,0xe4,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i8 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xe6,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i8 v5, v1 offset:4
+// CHECK: [0x04,0x00,0xe4,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u8 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xe8,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i8 v5, v1 offset:65535 gds
+// CHECK: [0xff,0xff,0xe6,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u8 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0xe8,0xd8,0x00,0x00,0x00,0xff]
+ds_read_u8 v5, v1 offset:65535
+// CHECK: [0xff,0xff,0xe8,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u8 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0xe8,0xd8,0xff,0x00,0x00,0x00]
+ds_read_u8 v255, v1 offset:65535
+// CHECK: [0xff,0xff,0xe8,0xd8,0x01,0x00,0x00,0xff]
 
-ds_read_u8 v0, v0
-// CHECK: [0x00,0x00,0xe8,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u8 v5, v255 offset:65535
+// CHECK: [0xff,0xff,0xe8,0xd8,0xff,0x00,0x00,0x05]
 
-ds_read_u8 v0, v0 offset:0
-// CHECK: [0x00,0x00,0xe8,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u8 v5, v1
+// CHECK: [0x00,0x00,0xe8,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u8 v0, v0 offset:4
-// CHECK: [0x04,0x00,0xe8,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u8 v5, v1 offset:0
+// CHECK: [0x00,0x00,0xe8,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u8 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xea,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u8 v5, v1 offset:4
+// CHECK: [0x04,0x00,0xe8,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i16 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xec,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u8 v5, v1 offset:65535 gds
+// CHECK: [0xff,0xff,0xea,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i16 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0xec,0xd8,0x00,0x00,0x00,0xff]
+ds_read_i16 v5, v1 offset:65535
+// CHECK: [0xff,0xff,0xec,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i16 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0xec,0xd8,0xff,0x00,0x00,0x00]
+ds_read_i16 v255, v1 offset:65535
+// CHECK: [0xff,0xff,0xec,0xd8,0x01,0x00,0x00,0xff]
 
-ds_read_i16 v0, v0
-// CHECK: [0x00,0x00,0xec,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i16 v5, v255 offset:65535
+// CHECK: [0xff,0xff,0xec,0xd8,0xff,0x00,0x00,0x05]
 
-ds_read_i16 v0, v0 offset:0
-// CHECK: [0x00,0x00,0xec,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i16 v5, v1
+// CHECK: [0x00,0x00,0xec,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i16 v0, v0 offset:4
-// CHECK: [0x04,0x00,0xec,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i16 v5, v1 offset:0
+// CHECK: [0x00,0x00,0xec,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i16 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xee,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i16 v5, v1 offset:4
+// CHECK: [0x04,0x00,0xec,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u16 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0xf0,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i16 v5, v1 offset:65535 gds
+// CHECK: [0xff,0xff,0xee,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u16 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0xf0,0xd8,0x00,0x00,0x00,0xff]
+ds_read_u16 v5, v1 offset:65535
+// CHECK: [0xff,0xff,0xf0,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u16 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0xf0,0xd8,0xff,0x00,0x00,0x00]
+ds_read_u16 v255, v1 offset:65535
+// CHECK: [0xff,0xff,0xf0,0xd8,0x01,0x00,0x00,0xff]
 
-ds_read_u16 v0, v0
-// CHECK: [0x00,0x00,0xf0,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u16 v5, v255 offset:65535
+// CHECK: [0xff,0xff,0xf0,0xd8,0xff,0x00,0x00,0x05]
 
-ds_read_u16 v0, v0 offset:0
-// CHECK: [0x00,0x00,0xf0,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u16 v5, v1
+// CHECK: [0x00,0x00,0xf0,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u16 v0, v0 offset:4
-// CHECK: [0x04,0x00,0xf0,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u16 v5, v1 offset:0
+// CHECK: [0x00,0x00,0xf0,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u16 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xf2,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u16 v5, v1 offset:4
+// CHECK: [0x04,0x00,0xf0,0xd8,0x01,0x00,0x00,0x05]
 
-ds_consume v0 offset:65535
-// CHECK: [0xff,0xff,0xf4,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u16 v5, v1 offset:65535 gds
+// CHECK: [0xff,0xff,0xf2,0xd8,0x01,0x00,0x00,0x05]
+
+ds_consume v5 offset:65535
+// CHECK: [0xff,0xff,0xf4,0xd8,0x00,0x00,0x00,0x05]
 
 ds_consume v255 offset:65535
 // CHECK: [0xff,0xff,0xf4,0xd8,0x00,0x00,0x00,0xff]
 
-ds_consume v0
-// CHECK: [0x00,0x00,0xf4,0xd8,0x00,0x00,0x00,0x00]
+ds_consume v5
+// CHECK: [0x00,0x00,0xf4,0xd8,0x00,0x00,0x00,0x05]
 
-ds_consume v0 offset:0
-// CHECK: [0x00,0x00,0xf4,0xd8,0x00,0x00,0x00,0x00]
+ds_consume v5 offset:0
+// CHECK: [0x00,0x00,0xf4,0xd8,0x00,0x00,0x00,0x05]
 
-ds_consume v0 offset:4
-// CHECK: [0x04,0x00,0xf4,0xd8,0x00,0x00,0x00,0x00]
+ds_consume v5 offset:4
+// CHECK: [0x04,0x00,0xf4,0xd8,0x00,0x00,0x00,0x05]
 
-ds_consume v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xf6,0xd8,0x00,0x00,0x00,0x00]
+ds_consume v5 offset:65535 gds
+// CHECK: [0xff,0xff,0xf6,0xd8,0x00,0x00,0x00,0x05]
 
-ds_append v0 offset:65535
-// CHECK: [0xff,0xff,0xf8,0xd8,0x00,0x00,0x00,0x00]
+ds_append v5 offset:65535
+// CHECK: [0xff,0xff,0xf8,0xd8,0x00,0x00,0x00,0x05]
 
 ds_append v255 offset:65535
 // CHECK: [0xff,0xff,0xf8,0xd8,0x00,0x00,0x00,0xff]
 
-ds_append v0
-// CHECK: [0x00,0x00,0xf8,0xd8,0x00,0x00,0x00,0x00]
+ds_append v5
+// CHECK: [0x00,0x00,0xf8,0xd8,0x00,0x00,0x00,0x05]
 
-ds_append v0 offset:0
-// CHECK: [0x00,0x00,0xf8,0xd8,0x00,0x00,0x00,0x00]
+ds_append v5 offset:0
+// CHECK: [0x00,0x00,0xf8,0xd8,0x00,0x00,0x00,0x05]
 
-ds_append v0 offset:4
-// CHECK: [0x04,0x00,0xf8,0xd8,0x00,0x00,0x00,0x00]
+ds_append v5 offset:4
+// CHECK: [0x04,0x00,0xf8,0xd8,0x00,0x00,0x00,0x05]
 
-ds_append v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xfa,0xd8,0x00,0x00,0x00,0x00]
+ds_append v5 offset:65535 gds
+// CHECK: [0xff,0xff,0xfa,0xd8,0x00,0x00,0x00,0x05]
 
-ds_ordered_count v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xfe,0xd8,0x00,0x00,0x00,0x00]
+ds_ordered_count v5, v1 offset:65535 gds
+// CHECK: [0xff,0xff,0xfe,0xd8,0x01,0x00,0x00,0x05]
 
-ds_ordered_count v255, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xfe,0xd8,0x00,0x00,0x00,0xff]
+ds_ordered_count v255, v1 offset:65535 gds
+// CHECK: [0xff,0xff,0xfe,0xd8,0x01,0x00,0x00,0xff]
 
-ds_ordered_count v0, v255 offset:65535 gds
-// CHECK: [0xff,0xff,0xfe,0xd8,0xff,0x00,0x00,0x00]
+ds_ordered_count v5, v255 offset:65535 gds
+// CHECK: [0xff,0xff,0xfe,0xd8,0xff,0x00,0x00,0x05]
 
-ds_ordered_count v0, v0 gds
-// CHECK: [0x00,0x00,0xfe,0xd8,0x00,0x00,0x00,0x00]
+ds_ordered_count v5, v1 gds
+// CHECK: [0x00,0x00,0xfe,0xd8,0x01,0x00,0x00,0x05]
 
-ds_ordered_count v0, v0 offset:0 gds
-// CHECK: [0x00,0x00,0xfe,0xd8,0x00,0x00,0x00,0x00]
+ds_ordered_count v5, v1 offset:0 gds
+// CHECK: [0x00,0x00,0xfe,0xd8,0x01,0x00,0x00,0x05]
 
-ds_ordered_count v0, v0 offset:4 gds
-// CHECK: [0x04,0x00,0xfe,0xd8,0x00,0x00,0x00,0x00]
+ds_ordered_count v5, v1 offset:4 gds
+// CHECK: [0x04,0x00,0xfe,0xd8,0x01,0x00,0x00,0x05]
 
-ds_add_u64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x00,0xd9,0x00,0x00,0x00,0x00]
+ds_add_u64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x00,0xd9,0x01,0x02,0x00,0x00]
 
-ds_add_u64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x00,0xd9,0xff,0x00,0x00,0x00]
+ds_add_u64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x00,0xd9,0xff,0x02,0x00,0x00]
 
-ds_add_u64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x00,0xd9,0x00,0xfe,0x00,0x00]
+ds_add_u64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x00,0xd9,0x01,0xfe,0x00,0x00]
 
-ds_add_u64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x00,0xd9,0x00,0x00,0x00,0x00]
+ds_add_u64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x00,0xd9,0x01,0x02,0x00,0x00]
 
-ds_add_u64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x00,0xd9,0x00,0x00,0x00,0x00]
+ds_add_u64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x00,0xd9,0x01,0x02,0x00,0x00]
 
-ds_add_u64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x00,0xd9,0x00,0x00,0x00,0x00]
+ds_add_u64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x00,0xd9,0x01,0x02,0x00,0x00]
 
-ds_add_u64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x02,0xd9,0x00,0x00,0x00,0x00]
+ds_add_u64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x02,0xd9,0x01,0x02,0x00,0x00]
 
-ds_sub_u64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x04,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_u64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x04,0xd9,0x01,0x02,0x00,0x00]
 
-ds_sub_u64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x04,0xd9,0xff,0x00,0x00,0x00]
+ds_sub_u64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x04,0xd9,0xff,0x02,0x00,0x00]
 
-ds_sub_u64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x04,0xd9,0x00,0xfe,0x00,0x00]
+ds_sub_u64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x04,0xd9,0x01,0xfe,0x00,0x00]
 
-ds_sub_u64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x04,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_u64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x04,0xd9,0x01,0x02,0x00,0x00]
 
-ds_sub_u64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x04,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_u64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x04,0xd9,0x01,0x02,0x00,0x00]
 
-ds_sub_u64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x04,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_u64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x04,0xd9,0x01,0x02,0x00,0x00]
 
-ds_sub_u64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x06,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_u64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x06,0xd9,0x01,0x02,0x00,0x00]
 
-ds_rsub_u64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x08,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_u64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x08,0xd9,0x01,0x02,0x00,0x00]
 
-ds_rsub_u64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x08,0xd9,0xff,0x00,0x00,0x00]
+ds_rsub_u64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x08,0xd9,0xff,0x02,0x00,0x00]
 
-ds_rsub_u64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x08,0xd9,0x00,0xfe,0x00,0x00]
+ds_rsub_u64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x08,0xd9,0x01,0xfe,0x00,0x00]
 
-ds_rsub_u64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x08,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_u64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x08,0xd9,0x01,0x02,0x00,0x00]
 
-ds_rsub_u64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x08,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_u64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x08,0xd9,0x01,0x02,0x00,0x00]
 
-ds_rsub_u64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x08,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_u64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x08,0xd9,0x01,0x02,0x00,0x00]
 
-ds_rsub_u64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x0a,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_u64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x0a,0xd9,0x01,0x02,0x00,0x00]
 
-ds_inc_u64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x0c,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_u64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x0c,0xd9,0x01,0x02,0x00,0x00]
 
-ds_inc_u64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x0c,0xd9,0xff,0x00,0x00,0x00]
+ds_inc_u64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x0c,0xd9,0xff,0x02,0x00,0x00]
 
-ds_inc_u64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x0c,0xd9,0x00,0xfe,0x00,0x00]
+ds_inc_u64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x0c,0xd9,0x01,0xfe,0x00,0x00]
 
-ds_inc_u64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x0c,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_u64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x0c,0xd9,0x01,0x02,0x00,0x00]
 
-ds_inc_u64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x0c,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_u64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x0c,0xd9,0x01,0x02,0x00,0x00]
 
-ds_inc_u64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x0c,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_u64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x0c,0xd9,0x01,0x02,0x00,0x00]
 
-ds_inc_u64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x0e,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_u64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x0e,0xd9,0x01,0x02,0x00,0x00]
 
-ds_dec_u64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x10,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_u64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x10,0xd9,0x01,0x02,0x00,0x00]
 
-ds_dec_u64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x10,0xd9,0xff,0x00,0x00,0x00]
+ds_dec_u64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x10,0xd9,0xff,0x02,0x00,0x00]
 
-ds_dec_u64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x10,0xd9,0x00,0xfe,0x00,0x00]
+ds_dec_u64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x10,0xd9,0x01,0xfe,0x00,0x00]
 
-ds_dec_u64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x10,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_u64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x10,0xd9,0x01,0x02,0x00,0x00]
 
-ds_dec_u64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x10,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_u64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x10,0xd9,0x01,0x02,0x00,0x00]
 
-ds_dec_u64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x10,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_u64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x10,0xd9,0x01,0x02,0x00,0x00]
 
-ds_dec_u64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x12,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_u64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x12,0xd9,0x01,0x02,0x00,0x00]
 
-ds_min_i64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x14,0xd9,0x00,0x00,0x00,0x00]
+ds_min_i64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x14,0xd9,0x01,0x02,0x00,0x00]
 
-ds_min_i64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x14,0xd9,0xff,0x00,0x00,0x00]
+ds_min_i64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x14,0xd9,0xff,0x02,0x00,0x00]
 
-ds_min_i64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x14,0xd9,0x00,0xfe,0x00,0x00]
+ds_min_i64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x14,0xd9,0x01,0xfe,0x00,0x00]
 
-ds_min_i64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x14,0xd9,0x00,0x00,0x00,0x00]
+ds_min_i64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x14,0xd9,0x01,0x02,0x00,0x00]
 
-ds_min_i64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x14,0xd9,0x00,0x00,0x00,0x00]
+ds_min_i64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x14,0xd9,0x01,0x02,0x00,0x00]
 
-ds_min_i64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x14,0xd9,0x00,0x00,0x00,0x00]
+ds_min_i64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x14,0xd9,0x01,0x02,0x00,0x00]
 
-ds_min_i64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x16,0xd9,0x00,0x00,0x00,0x00]
+ds_min_i64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x16,0xd9,0x01,0x02,0x00,0x00]
 
-ds_max_i64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x18,0xd9,0x00,0x00,0x00,0x00]
+ds_max_i64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x18,0xd9,0x01,0x02,0x00,0x00]
 
-ds_max_i64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x18,0xd9,0xff,0x00,0x00,0x00]
+ds_max_i64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x18,0xd9,0xff,0x02,0x00,0x00]
 
-ds_max_i64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x18,0xd9,0x00,0xfe,0x00,0x00]
+ds_max_i64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x18,0xd9,0x01,0xfe,0x00,0x00]
 
-ds_max_i64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x18,0xd9,0x00,0x00,0x00,0x00]
+ds_max_i64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x18,0xd9,0x01,0x02,0x00,0x00]
 
-ds_max_i64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x18,0xd9,0x00,0x00,0x00,0x00]
+ds_max_i64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x18,0xd9,0x01,0x02,0x00,0x00]
 
-ds_max_i64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x18,0xd9,0x00,0x00,0x00,0x00]
+ds_max_i64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x18,0xd9,0x01,0x02,0x00,0x00]
 
-ds_max_i64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x1a,0xd9,0x00,0x00,0x00,0x00]
+ds_max_i64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x1a,0xd9,0x01,0x02,0x00,0x00]
 
-ds_min_u64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x1c,0xd9,0x00,0x00,0x00,0x00]
+ds_min_u64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x1c,0xd9,0x01,0x02,0x00,0x00]
 
-ds_min_u64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x1c,0xd9,0xff,0x00,0x00,0x00]
+ds_min_u64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x1c,0xd9,0xff,0x02,0x00,0x00]
 
-ds_min_u64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x1c,0xd9,0x00,0xfe,0x00,0x00]
+ds_min_u64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x1c,0xd9,0x01,0xfe,0x00,0x00]
 
-ds_min_u64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x1c,0xd9,0x00,0x00,0x00,0x00]
+ds_min_u64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x1c,0xd9,0x01,0x02,0x00,0x00]
 
-ds_min_u64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x1c,0xd9,0x00,0x00,0x00,0x00]
+ds_min_u64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x1c,0xd9,0x01,0x02,0x00,0x00]
 
-ds_min_u64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x1c,0xd9,0x00,0x00,0x00,0x00]
+ds_min_u64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x1c,0xd9,0x01,0x02,0x00,0x00]
 
-ds_min_u64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x1e,0xd9,0x00,0x00,0x00,0x00]
+ds_min_u64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x1e,0xd9,0x01,0x02,0x00,0x00]
 
-ds_max_u64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x20,0xd9,0x00,0x00,0x00,0x00]
+ds_max_u64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x20,0xd9,0x01,0x02,0x00,0x00]
 
-ds_max_u64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x20,0xd9,0xff,0x00,0x00,0x00]
+ds_max_u64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x20,0xd9,0xff,0x02,0x00,0x00]
 
-ds_max_u64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x20,0xd9,0x00,0xfe,0x00,0x00]
+ds_max_u64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x20,0xd9,0x01,0xfe,0x00,0x00]
 
-ds_max_u64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x20,0xd9,0x00,0x00,0x00,0x00]
+ds_max_u64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x20,0xd9,0x01,0x02,0x00,0x00]
 
-ds_max_u64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x20,0xd9,0x00,0x00,0x00,0x00]
+ds_max_u64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x20,0xd9,0x01,0x02,0x00,0x00]
 
-ds_max_u64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x20,0xd9,0x00,0x00,0x00,0x00]
+ds_max_u64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x20,0xd9,0x01,0x02,0x00,0x00]
 
-ds_max_u64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x22,0xd9,0x00,0x00,0x00,0x00]
+ds_max_u64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x22,0xd9,0x01,0x02,0x00,0x00]
 
-ds_and_b64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x24,0xd9,0x00,0x00,0x00,0x00]
+ds_and_b64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x24,0xd9,0x01,0x02,0x00,0x00]
 
-ds_and_b64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x24,0xd9,0xff,0x00,0x00,0x00]
+ds_and_b64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x24,0xd9,0xff,0x02,0x00,0x00]
 
-ds_and_b64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x24,0xd9,0x00,0xfe,0x00,0x00]
+ds_and_b64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x24,0xd9,0x01,0xfe,0x00,0x00]
 
-ds_and_b64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x24,0xd9,0x00,0x00,0x00,0x00]
+ds_and_b64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x24,0xd9,0x01,0x02,0x00,0x00]
 
-ds_and_b64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x24,0xd9,0x00,0x00,0x00,0x00]
+ds_and_b64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x24,0xd9,0x01,0x02,0x00,0x00]
 
-ds_and_b64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x24,0xd9,0x00,0x00,0x00,0x00]
+ds_and_b64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x24,0xd9,0x01,0x02,0x00,0x00]
 
-ds_and_b64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x26,0xd9,0x00,0x00,0x00,0x00]
+ds_and_b64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x26,0xd9,0x01,0x02,0x00,0x00]
 
-ds_or_b64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x28,0xd9,0x00,0x00,0x00,0x00]
+ds_or_b64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x28,0xd9,0x01,0x02,0x00,0x00]
 
-ds_or_b64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x28,0xd9,0xff,0x00,0x00,0x00]
+ds_or_b64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x28,0xd9,0xff,0x02,0x00,0x00]
 
-ds_or_b64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x28,0xd9,0x00,0xfe,0x00,0x00]
+ds_or_b64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x28,0xd9,0x01,0xfe,0x00,0x00]
 
-ds_or_b64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x28,0xd9,0x00,0x00,0x00,0x00]
+ds_or_b64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x28,0xd9,0x01,0x02,0x00,0x00]
 
-ds_or_b64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x28,0xd9,0x00,0x00,0x00,0x00]
+ds_or_b64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x28,0xd9,0x01,0x02,0x00,0x00]
 
-ds_or_b64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x28,0xd9,0x00,0x00,0x00,0x00]
+ds_or_b64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x28,0xd9,0x01,0x02,0x00,0x00]
 
-ds_or_b64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x2a,0xd9,0x00,0x00,0x00,0x00]
+ds_or_b64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x2a,0xd9,0x01,0x02,0x00,0x00]
 
-ds_xor_b64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x2c,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_b64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x2c,0xd9,0x01,0x02,0x00,0x00]
 
-ds_xor_b64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x2c,0xd9,0xff,0x00,0x00,0x00]
+ds_xor_b64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x2c,0xd9,0xff,0x02,0x00,0x00]
 
-ds_xor_b64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x2c,0xd9,0x00,0xfe,0x00,0x00]
+ds_xor_b64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x2c,0xd9,0x01,0xfe,0x00,0x00]
 
-ds_xor_b64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x2c,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_b64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x2c,0xd9,0x01,0x02,0x00,0x00]
 
-ds_xor_b64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x2c,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_b64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x2c,0xd9,0x01,0x02,0x00,0x00]
 
-ds_xor_b64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x2c,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_b64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x2c,0xd9,0x01,0x02,0x00,0x00]
 
-ds_xor_b64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x2e,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_b64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x2e,0xd9,0x01,0x02,0x00,0x00]
 
-ds_mskor_b64 v0, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x30,0xd9,0x00,0x00,0x00,0x00]
+ds_mskor_b64 v1, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0x30,0xd9,0x01,0x02,0x03,0x00]
 
-ds_mskor_b64 v255, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x30,0xd9,0xff,0x00,0x00,0x00]
+ds_mskor_b64 v255, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0x30,0xd9,0xff,0x02,0x03,0x00]
 
-ds_mskor_b64 v0, v[254:255], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x30,0xd9,0x00,0xfe,0x00,0x00]
+ds_mskor_b64 v1, v[254:255], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0x30,0xd9,0x01,0xfe,0x03,0x00]
 
-ds_mskor_b64 v0, v[0:1], v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x30,0xd9,0x00,0x00,0xfe,0x00]
+ds_mskor_b64 v1, v[2:3], v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x30,0xd9,0x01,0x02,0xfe,0x00]
 
-ds_mskor_b64 v0, v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x30,0xd9,0x00,0x00,0x00,0x00]
+ds_mskor_b64 v1, v[2:3], v[3:4]
+// CHECK: [0x00,0x00,0x30,0xd9,0x01,0x02,0x03,0x00]
 
-ds_mskor_b64 v0, v[0:1], v[0:1] offset:0
-// CHECK: [0x00,0x00,0x30,0xd9,0x00,0x00,0x00,0x00]
+ds_mskor_b64 v1, v[2:3], v[3:4] offset:0
+// CHECK: [0x00,0x00,0x30,0xd9,0x01,0x02,0x03,0x00]
 
-ds_mskor_b64 v0, v[0:1], v[0:1] offset:4
-// CHECK: [0x04,0x00,0x30,0xd9,0x00,0x00,0x00,0x00]
+ds_mskor_b64 v1, v[2:3], v[3:4] offset:4
+// CHECK: [0x04,0x00,0x30,0xd9,0x01,0x02,0x03,0x00]
 
-ds_mskor_b64 v0, v[0:1], v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x32,0xd9,0x00,0x00,0x00,0x00]
+ds_mskor_b64 v1, v[2:3], v[3:4] offset:65535 gds
+// CHECK: [0xff,0xff,0x32,0xd9,0x01,0x02,0x03,0x00]
 
-ds_write_b64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x34,0xd9,0x00,0x00,0x00,0x00]
+ds_write_b64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x34,0xd9,0x01,0x02,0x00,0x00]
 
-ds_write_b64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x34,0xd9,0xff,0x00,0x00,0x00]
+ds_write_b64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x34,0xd9,0xff,0x02,0x00,0x00]
 
-ds_write_b64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x34,0xd9,0x00,0xfe,0x00,0x00]
+ds_write_b64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x34,0xd9,0x01,0xfe,0x00,0x00]
 
-ds_write_b64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x34,0xd9,0x00,0x00,0x00,0x00]
+ds_write_b64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x34,0xd9,0x01,0x02,0x00,0x00]
 
-ds_write_b64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x34,0xd9,0x00,0x00,0x00,0x00]
+ds_write_b64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x34,0xd9,0x01,0x02,0x00,0x00]
 
-ds_write_b64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x34,0xd9,0x00,0x00,0x00,0x00]
+ds_write_b64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x34,0xd9,0x01,0x02,0x00,0x00]
 
-ds_write_b64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x36,0xd9,0x00,0x00,0x00,0x00]
+ds_write_b64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x36,0xd9,0x01,0x02,0x00,0x00]
 
-ds_write2_b64 v0, v[0:1], v[0:1] offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x38,0xd9,0x00,0x00,0x00,0x00]
+ds_write2_b64 v1, v[2:3], v[3:4] offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x38,0xd9,0x01,0x02,0x03,0x00]
 
-ds_write2_b64 v255, v[0:1], v[0:1] offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x38,0xd9,0xff,0x00,0x00,0x00]
+ds_write2_b64 v255, v[2:3], v[3:4] offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x38,0xd9,0xff,0x02,0x03,0x00]
 
-ds_write2_b64 v0, v[254:255], v[0:1] offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x38,0xd9,0x00,0xfe,0x00,0x00]
+ds_write2_b64 v1, v[254:255], v[3:4] offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x38,0xd9,0x01,0xfe,0x03,0x00]
 
-ds_write2_b64 v0, v[0:1], v[254:255] offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x38,0xd9,0x00,0x00,0xfe,0x00]
+ds_write2_b64 v1, v[2:3], v[254:255] offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x38,0xd9,0x01,0x02,0xfe,0x00]
 
-ds_write2_b64 v0, v[0:1], v[0:1] offset1:255
-// CHECK: [0x00,0xff,0x38,0xd9,0x00,0x00,0x00,0x00]
+ds_write2_b64 v1, v[2:3], v[3:4] offset1:255
+// CHECK: [0x00,0xff,0x38,0xd9,0x01,0x02,0x03,0x00]
 
-ds_write2_b64 v0, v[0:1], v[0:1] offset0:0 offset1:255
-// CHECK: [0x00,0xff,0x38,0xd9,0x00,0x00,0x00,0x00]
+ds_write2_b64 v1, v[2:3], v[3:4] offset0:0 offset1:255
+// CHECK: [0x00,0xff,0x38,0xd9,0x01,0x02,0x03,0x00]
 
-ds_write2_b64 v0, v[0:1], v[0:1] offset0:16 offset1:255
-// CHECK: [0x10,0xff,0x38,0xd9,0x00,0x00,0x00,0x00]
+ds_write2_b64 v1, v[2:3], v[3:4] offset0:16 offset1:255
+// CHECK: [0x10,0xff,0x38,0xd9,0x01,0x02,0x03,0x00]
 
-ds_write2_b64 v0, v[0:1], v[0:1] offset0:127
-// CHECK: [0x7f,0x00,0x38,0xd9,0x00,0x00,0x00,0x00]
+ds_write2_b64 v1, v[2:3], v[3:4] offset0:127
+// CHECK: [0x7f,0x00,0x38,0xd9,0x01,0x02,0x03,0x00]
 
-ds_write2_b64 v0, v[0:1], v[0:1] offset0:127 offset1:0
-// CHECK: [0x7f,0x00,0x38,0xd9,0x00,0x00,0x00,0x00]
+ds_write2_b64 v1, v[2:3], v[3:4] offset0:127 offset1:0
+// CHECK: [0x7f,0x00,0x38,0xd9,0x01,0x02,0x03,0x00]
 
-ds_write2_b64 v0, v[0:1], v[0:1] offset0:127 offset1:1
-// CHECK: [0x7f,0x01,0x38,0xd9,0x00,0x00,0x00,0x00]
+ds_write2_b64 v1, v[2:3], v[3:4] offset0:127 offset1:1
+// CHECK: [0x7f,0x01,0x38,0xd9,0x01,0x02,0x03,0x00]
 
-ds_write2_b64 v0, v[0:1], v[0:1] offset0:127 offset1:255 gds
-// CHECK: [0x7f,0xff,0x3a,0xd9,0x00,0x00,0x00,0x00]
+ds_write2_b64 v1, v[2:3], v[3:4] offset0:127 offset1:255 gds
+// CHECK: [0x7f,0xff,0x3a,0xd9,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b64 v0, v[0:1], v[0:1] offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x3c,0xd9,0x00,0x00,0x00,0x00]
+ds_write2st64_b64 v1, v[2:3], v[3:4] offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x3c,0xd9,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b64 v255, v[0:1], v[0:1] offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x3c,0xd9,0xff,0x00,0x00,0x00]
+ds_write2st64_b64 v255, v[2:3], v[3:4] offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x3c,0xd9,0xff,0x02,0x03,0x00]
 
-ds_write2st64_b64 v0, v[254:255], v[0:1] offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x3c,0xd9,0x00,0xfe,0x00,0x00]
+ds_write2st64_b64 v1, v[254:255], v[3:4] offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x3c,0xd9,0x01,0xfe,0x03,0x00]
 
-ds_write2st64_b64 v0, v[0:1], v[254:255] offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x3c,0xd9,0x00,0x00,0xfe,0x00]
+ds_write2st64_b64 v1, v[2:3], v[254:255] offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x3c,0xd9,0x01,0x02,0xfe,0x00]
 
-ds_write2st64_b64 v0, v[0:1], v[0:1] offset1:255
-// CHECK: [0x00,0xff,0x3c,0xd9,0x00,0x00,0x00,0x00]
+ds_write2st64_b64 v1, v[2:3], v[3:4] offset1:255
+// CHECK: [0x00,0xff,0x3c,0xd9,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b64 v0, v[0:1], v[0:1] offset0:0 offset1:255
-// CHECK: [0x00,0xff,0x3c,0xd9,0x00,0x00,0x00,0x00]
+ds_write2st64_b64 v1, v[2:3], v[3:4] offset0:0 offset1:255
+// CHECK: [0x00,0xff,0x3c,0xd9,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b64 v0, v[0:1], v[0:1] offset0:16 offset1:255
-// CHECK: [0x10,0xff,0x3c,0xd9,0x00,0x00,0x00,0x00]
+ds_write2st64_b64 v1, v[2:3], v[3:4] offset0:16 offset1:255
+// CHECK: [0x10,0xff,0x3c,0xd9,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b64 v0, v[0:1], v[0:1] offset0:127
-// CHECK: [0x7f,0x00,0x3c,0xd9,0x00,0x00,0x00,0x00]
+ds_write2st64_b64 v1, v[2:3], v[3:4] offset0:127
+// CHECK: [0x7f,0x00,0x3c,0xd9,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b64 v0, v[0:1], v[0:1] offset0:127 offset1:0
-// CHECK: [0x7f,0x00,0x3c,0xd9,0x00,0x00,0x00,0x00]
+ds_write2st64_b64 v1, v[2:3], v[3:4] offset0:127 offset1:0
+// CHECK: [0x7f,0x00,0x3c,0xd9,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b64 v0, v[0:1], v[0:1] offset0:127 offset1:1
-// CHECK: [0x7f,0x01,0x3c,0xd9,0x00,0x00,0x00,0x00]
+ds_write2st64_b64 v1, v[2:3], v[3:4] offset0:127 offset1:1
+// CHECK: [0x7f,0x01,0x3c,0xd9,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b64 v0, v[0:1], v[0:1] offset0:127 offset1:255 gds
-// CHECK: [0x7f,0xff,0x3e,0xd9,0x00,0x00,0x00,0x00]
+ds_write2st64_b64 v1, v[2:3], v[3:4] offset0:127 offset1:255 gds
+// CHECK: [0x7f,0xff,0x3e,0xd9,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b64 v0, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x40,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_b64 v1, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0x40,0xd9,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b64 v255, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x40,0xd9,0xff,0x00,0x00,0x00]
+ds_cmpst_b64 v255, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0x40,0xd9,0xff,0x02,0x03,0x00]
 
-ds_cmpst_b64 v0, v[254:255], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x40,0xd9,0x00,0xfe,0x00,0x00]
+ds_cmpst_b64 v1, v[254:255], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0x40,0xd9,0x01,0xfe,0x03,0x00]
 
-ds_cmpst_b64 v0, v[0:1], v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x40,0xd9,0x00,0x00,0xfe,0x00]
+ds_cmpst_b64 v1, v[2:3], v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x40,0xd9,0x01,0x02,0xfe,0x00]
 
-ds_cmpst_b64 v0, v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x40,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_b64 v1, v[2:3], v[3:4]
+// CHECK: [0x00,0x00,0x40,0xd9,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b64 v0, v[0:1], v[0:1] offset:0
-// CHECK: [0x00,0x00,0x40,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_b64 v1, v[2:3], v[3:4] offset:0
+// CHECK: [0x00,0x00,0x40,0xd9,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b64 v0, v[0:1], v[0:1] offset:4
-// CHECK: [0x04,0x00,0x40,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_b64 v1, v[2:3], v[3:4] offset:4
+// CHECK: [0x04,0x00,0x40,0xd9,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b64 v0, v[0:1], v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x42,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_b64 v1, v[2:3], v[3:4] offset:65535 gds
+// CHECK: [0xff,0xff,0x42,0xd9,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f64 v0, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x44,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_f64 v1, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0x44,0xd9,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f64 v255, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x44,0xd9,0xff,0x00,0x00,0x00]
+ds_cmpst_f64 v255, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0x44,0xd9,0xff,0x02,0x03,0x00]
 
-ds_cmpst_f64 v0, v[254:255], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x44,0xd9,0x00,0xfe,0x00,0x00]
+ds_cmpst_f64 v1, v[254:255], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0x44,0xd9,0x01,0xfe,0x03,0x00]
 
-ds_cmpst_f64 v0, v[0:1], v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x44,0xd9,0x00,0x00,0xfe,0x00]
+ds_cmpst_f64 v1, v[2:3], v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x44,0xd9,0x01,0x02,0xfe,0x00]
 
-ds_cmpst_f64 v0, v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x44,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_f64 v1, v[2:3], v[3:4]
+// CHECK: [0x00,0x00,0x44,0xd9,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f64 v0, v[0:1], v[0:1] offset:0
-// CHECK: [0x00,0x00,0x44,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_f64 v1, v[2:3], v[3:4] offset:0
+// CHECK: [0x00,0x00,0x44,0xd9,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f64 v0, v[0:1], v[0:1] offset:4
-// CHECK: [0x04,0x00,0x44,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_f64 v1, v[2:3], v[3:4] offset:4
+// CHECK: [0x04,0x00,0x44,0xd9,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f64 v0, v[0:1], v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x46,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_f64 v1, v[2:3], v[3:4] offset:65535 gds
+// CHECK: [0xff,0xff,0x46,0xd9,0x01,0x02,0x03,0x00]
 
-ds_min_f64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x48,0xd9,0x00,0x00,0x00,0x00]
+ds_min_f64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x48,0xd9,0x01,0x02,0x00,0x00]
 
-ds_min_f64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x48,0xd9,0xff,0x00,0x00,0x00]
+ds_min_f64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x48,0xd9,0xff,0x02,0x00,0x00]
 
-ds_min_f64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x48,0xd9,0x00,0xfe,0x00,0x00]
+ds_min_f64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x48,0xd9,0x01,0xfe,0x00,0x00]
 
-ds_min_f64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x48,0xd9,0x00,0x00,0x00,0x00]
+ds_min_f64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x48,0xd9,0x01,0x02,0x00,0x00]
 
-ds_min_f64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x48,0xd9,0x00,0x00,0x00,0x00]
+ds_min_f64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x48,0xd9,0x01,0x02,0x00,0x00]
 
-ds_min_f64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x48,0xd9,0x00,0x00,0x00,0x00]
+ds_min_f64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x48,0xd9,0x01,0x02,0x00,0x00]
 
-ds_min_f64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x4a,0xd9,0x00,0x00,0x00,0x00]
+ds_min_f64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x4a,0xd9,0x01,0x02,0x00,0x00]
 
-ds_max_f64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x4c,0xd9,0x00,0x00,0x00,0x00]
+ds_max_f64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x4c,0xd9,0x01,0x02,0x00,0x00]
 
-ds_max_f64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x4c,0xd9,0xff,0x00,0x00,0x00]
+ds_max_f64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x4c,0xd9,0xff,0x02,0x00,0x00]
 
-ds_max_f64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x4c,0xd9,0x00,0xfe,0x00,0x00]
+ds_max_f64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x4c,0xd9,0x01,0xfe,0x00,0x00]
 
-ds_max_f64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x4c,0xd9,0x00,0x00,0x00,0x00]
+ds_max_f64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x4c,0xd9,0x01,0x02,0x00,0x00]
 
-ds_max_f64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x4c,0xd9,0x00,0x00,0x00,0x00]
+ds_max_f64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x4c,0xd9,0x01,0x02,0x00,0x00]
 
-ds_max_f64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x4c,0xd9,0x00,0x00,0x00,0x00]
+ds_max_f64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x4c,0xd9,0x01,0x02,0x00,0x00]
 
-ds_max_f64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x4e,0xd9,0x00,0x00,0x00,0x00]
+ds_max_f64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x4e,0xd9,0x01,0x02,0x00,0x00]
 
-ds_add_rtn_u64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x80,0xd9,0x00,0x00,0x00,0x00]
+ds_add_rtn_u64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x80,0xd9,0x01,0x02,0x00,0x05]
 
-ds_add_rtn_u64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x80,0xd9,0x00,0x00,0x00,0xfe]
+ds_add_rtn_u64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x80,0xd9,0x01,0x02,0x00,0xfe]
 
-ds_add_rtn_u64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x80,0xd9,0xff,0x00,0x00,0x00]
+ds_add_rtn_u64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x80,0xd9,0xff,0x02,0x00,0x05]
 
-ds_add_rtn_u64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x80,0xd9,0x00,0xfe,0x00,0x00]
+ds_add_rtn_u64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x80,0xd9,0x01,0xfe,0x00,0x05]
 
-ds_add_rtn_u64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0x80,0xd9,0x00,0x00,0x00,0x00]
+ds_add_rtn_u64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0x80,0xd9,0x01,0x02,0x00,0x05]
 
-ds_add_rtn_u64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x80,0xd9,0x00,0x00,0x00,0x00]
+ds_add_rtn_u64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x80,0xd9,0x01,0x02,0x00,0x05]
 
-ds_add_rtn_u64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x80,0xd9,0x00,0x00,0x00,0x00]
+ds_add_rtn_u64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x80,0xd9,0x01,0x02,0x00,0x05]
 
-ds_add_rtn_u64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x82,0xd9,0x00,0x00,0x00,0x00]
+ds_add_rtn_u64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x82,0xd9,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x84,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x84,0xd9,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x84,0xd9,0x00,0x00,0x00,0xfe]
+ds_sub_rtn_u64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x84,0xd9,0x01,0x02,0x00,0xfe]
 
-ds_sub_rtn_u64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x84,0xd9,0xff,0x00,0x00,0x00]
+ds_sub_rtn_u64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x84,0xd9,0xff,0x02,0x00,0x05]
 
-ds_sub_rtn_u64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x84,0xd9,0x00,0xfe,0x00,0x00]
+ds_sub_rtn_u64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x84,0xd9,0x01,0xfe,0x00,0x05]
 
-ds_sub_rtn_u64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0x84,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0x84,0xd9,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x84,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x84,0xd9,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x84,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x84,0xd9,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x86,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x86,0xd9,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x88,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x88,0xd9,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x88,0xd9,0x00,0x00,0x00,0xfe]
+ds_rsub_rtn_u64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x88,0xd9,0x01,0x02,0x00,0xfe]
 
-ds_rsub_rtn_u64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x88,0xd9,0xff,0x00,0x00,0x00]
+ds_rsub_rtn_u64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x88,0xd9,0xff,0x02,0x00,0x05]
 
-ds_rsub_rtn_u64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x88,0xd9,0x00,0xfe,0x00,0x00]
+ds_rsub_rtn_u64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x88,0xd9,0x01,0xfe,0x00,0x05]
 
-ds_rsub_rtn_u64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0x88,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0x88,0xd9,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x88,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x88,0xd9,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x88,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x88,0xd9,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x8a,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x8a,0xd9,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x8c,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x8c,0xd9,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x8c,0xd9,0x00,0x00,0x00,0xfe]
+ds_inc_rtn_u64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x8c,0xd9,0x01,0x02,0x00,0xfe]
 
-ds_inc_rtn_u64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x8c,0xd9,0xff,0x00,0x00,0x00]
+ds_inc_rtn_u64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x8c,0xd9,0xff,0x02,0x00,0x05]
 
-ds_inc_rtn_u64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x8c,0xd9,0x00,0xfe,0x00,0x00]
+ds_inc_rtn_u64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x8c,0xd9,0x01,0xfe,0x00,0x05]
 
-ds_inc_rtn_u64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0x8c,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0x8c,0xd9,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x8c,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x8c,0xd9,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x8c,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x8c,0xd9,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x8e,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x8e,0xd9,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x90,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x90,0xd9,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x90,0xd9,0x00,0x00,0x00,0xfe]
+ds_dec_rtn_u64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x90,0xd9,0x01,0x02,0x00,0xfe]
 
-ds_dec_rtn_u64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x90,0xd9,0xff,0x00,0x00,0x00]
+ds_dec_rtn_u64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x90,0xd9,0xff,0x02,0x00,0x05]
 
-ds_dec_rtn_u64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x90,0xd9,0x00,0xfe,0x00,0x00]
+ds_dec_rtn_u64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x90,0xd9,0x01,0xfe,0x00,0x05]
 
-ds_dec_rtn_u64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0x90,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0x90,0xd9,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x90,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x90,0xd9,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x90,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x90,0xd9,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x92,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x92,0xd9,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x94,0xd9,0x00,0x00,0x00,0x00]
+ds_min_rtn_i64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x94,0xd9,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x94,0xd9,0x00,0x00,0x00,0xfe]
+ds_min_rtn_i64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x94,0xd9,0x01,0x02,0x00,0xfe]
 
-ds_min_rtn_i64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x94,0xd9,0xff,0x00,0x00,0x00]
+ds_min_rtn_i64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x94,0xd9,0xff,0x02,0x00,0x05]
 
-ds_min_rtn_i64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x94,0xd9,0x00,0xfe,0x00,0x00]
+ds_min_rtn_i64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x94,0xd9,0x01,0xfe,0x00,0x05]
 
-ds_min_rtn_i64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0x94,0xd9,0x00,0x00,0x00,0x00]
+ds_min_rtn_i64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0x94,0xd9,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x94,0xd9,0x00,0x00,0x00,0x00]
+ds_min_rtn_i64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x94,0xd9,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x94,0xd9,0x00,0x00,0x00,0x00]
+ds_min_rtn_i64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x94,0xd9,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x96,0xd9,0x00,0x00,0x00,0x00]
+ds_min_rtn_i64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x96,0xd9,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x98,0xd9,0x00,0x00,0x00,0x00]
+ds_max_rtn_i64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x98,0xd9,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x98,0xd9,0x00,0x00,0x00,0xfe]
+ds_max_rtn_i64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x98,0xd9,0x01,0x02,0x00,0xfe]
 
-ds_max_rtn_i64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x98,0xd9,0xff,0x00,0x00,0x00]
+ds_max_rtn_i64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x98,0xd9,0xff,0x02,0x00,0x05]
 
-ds_max_rtn_i64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x98,0xd9,0x00,0xfe,0x00,0x00]
+ds_max_rtn_i64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x98,0xd9,0x01,0xfe,0x00,0x05]
 
-ds_max_rtn_i64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0x98,0xd9,0x00,0x00,0x00,0x00]
+ds_max_rtn_i64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0x98,0xd9,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x98,0xd9,0x00,0x00,0x00,0x00]
+ds_max_rtn_i64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x98,0xd9,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x98,0xd9,0x00,0x00,0x00,0x00]
+ds_max_rtn_i64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x98,0xd9,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x9a,0xd9,0x00,0x00,0x00,0x00]
+ds_max_rtn_i64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x9a,0xd9,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x9c,0xd9,0x00,0x00,0x00,0x00]
+ds_min_rtn_u64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x9c,0xd9,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x9c,0xd9,0x00,0x00,0x00,0xfe]
+ds_min_rtn_u64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x9c,0xd9,0x01,0x02,0x00,0xfe]
 
-ds_min_rtn_u64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x9c,0xd9,0xff,0x00,0x00,0x00]
+ds_min_rtn_u64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x9c,0xd9,0xff,0x02,0x00,0x05]
 
-ds_min_rtn_u64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x9c,0xd9,0x00,0xfe,0x00,0x00]
+ds_min_rtn_u64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x9c,0xd9,0x01,0xfe,0x00,0x05]
 
-ds_min_rtn_u64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0x9c,0xd9,0x00,0x00,0x00,0x00]
+ds_min_rtn_u64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0x9c,0xd9,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x9c,0xd9,0x00,0x00,0x00,0x00]
+ds_min_rtn_u64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x9c,0xd9,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x9c,0xd9,0x00,0x00,0x00,0x00]
+ds_min_rtn_u64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x9c,0xd9,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x9e,0xd9,0x00,0x00,0x00,0x00]
+ds_min_rtn_u64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x9e,0xd9,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xa0,0xd9,0x00,0x00,0x00,0x00]
+ds_max_rtn_u64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xa0,0xd9,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xa0,0xd9,0x00,0x00,0x00,0xfe]
+ds_max_rtn_u64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xa0,0xd9,0x01,0x02,0x00,0xfe]
 
-ds_max_rtn_u64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xa0,0xd9,0xff,0x00,0x00,0x00]
+ds_max_rtn_u64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xa0,0xd9,0xff,0x02,0x00,0x05]
 
-ds_max_rtn_u64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xa0,0xd9,0x00,0xfe,0x00,0x00]
+ds_max_rtn_u64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xa0,0xd9,0x01,0xfe,0x00,0x05]
 
-ds_max_rtn_u64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xa0,0xd9,0x00,0x00,0x00,0x00]
+ds_max_rtn_u64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xa0,0xd9,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xa0,0xd9,0x00,0x00,0x00,0x00]
+ds_max_rtn_u64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xa0,0xd9,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xa0,0xd9,0x00,0x00,0x00,0x00]
+ds_max_rtn_u64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xa0,0xd9,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xa2,0xd9,0x00,0x00,0x00,0x00]
+ds_max_rtn_u64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xa2,0xd9,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xa4,0xd9,0x00,0x00,0x00,0x00]
+ds_and_rtn_b64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xa4,0xd9,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xa4,0xd9,0x00,0x00,0x00,0xfe]
+ds_and_rtn_b64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xa4,0xd9,0x01,0x02,0x00,0xfe]
 
-ds_and_rtn_b64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xa4,0xd9,0xff,0x00,0x00,0x00]
+ds_and_rtn_b64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xa4,0xd9,0xff,0x02,0x00,0x05]
 
-ds_and_rtn_b64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xa4,0xd9,0x00,0xfe,0x00,0x00]
+ds_and_rtn_b64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xa4,0xd9,0x01,0xfe,0x00,0x05]
 
-ds_and_rtn_b64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xa4,0xd9,0x00,0x00,0x00,0x00]
+ds_and_rtn_b64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xa4,0xd9,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xa4,0xd9,0x00,0x00,0x00,0x00]
+ds_and_rtn_b64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xa4,0xd9,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xa4,0xd9,0x00,0x00,0x00,0x00]
+ds_and_rtn_b64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xa4,0xd9,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xa6,0xd9,0x00,0x00,0x00,0x00]
+ds_and_rtn_b64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xa6,0xd9,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xa8,0xd9,0x00,0x00,0x00,0x00]
+ds_or_rtn_b64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xa8,0xd9,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xa8,0xd9,0x00,0x00,0x00,0xfe]
+ds_or_rtn_b64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xa8,0xd9,0x01,0x02,0x00,0xfe]
 
-ds_or_rtn_b64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xa8,0xd9,0xff,0x00,0x00,0x00]
+ds_or_rtn_b64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xa8,0xd9,0xff,0x02,0x00,0x05]
 
-ds_or_rtn_b64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xa8,0xd9,0x00,0xfe,0x00,0x00]
+ds_or_rtn_b64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xa8,0xd9,0x01,0xfe,0x00,0x05]
 
-ds_or_rtn_b64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xa8,0xd9,0x00,0x00,0x00,0x00]
+ds_or_rtn_b64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xa8,0xd9,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xa8,0xd9,0x00,0x00,0x00,0x00]
+ds_or_rtn_b64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xa8,0xd9,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xa8,0xd9,0x00,0x00,0x00,0x00]
+ds_or_rtn_b64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xa8,0xd9,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xaa,0xd9,0x00,0x00,0x00,0x00]
+ds_or_rtn_b64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xaa,0xd9,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xac,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xac,0xd9,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xac,0xd9,0x00,0x00,0x00,0xfe]
+ds_xor_rtn_b64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xac,0xd9,0x01,0x02,0x00,0xfe]
 
-ds_xor_rtn_b64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xac,0xd9,0xff,0x00,0x00,0x00]
+ds_xor_rtn_b64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xac,0xd9,0xff,0x02,0x00,0x05]
 
-ds_xor_rtn_b64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xac,0xd9,0x00,0xfe,0x00,0x00]
+ds_xor_rtn_b64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xac,0xd9,0x01,0xfe,0x00,0x05]
 
-ds_xor_rtn_b64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xac,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xac,0xd9,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xac,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xac,0xd9,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xac,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xac,0xd9,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xae,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xae,0xd9,0x01,0x02,0x00,0x05]
 
-ds_mskor_rtn_b64 v[0:1], v0, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xb0,0xd9,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b64 v[5:6], v1, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xb0,0xd9,0x01,0x02,0x03,0x05]
 
-ds_mskor_rtn_b64 v[254:255], v0, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xb0,0xd9,0x00,0x00,0x00,0xfe]
+ds_mskor_rtn_b64 v[254:255], v1, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xb0,0xd9,0x01,0x02,0x03,0xfe]
 
-ds_mskor_rtn_b64 v[0:1], v255, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xb0,0xd9,0xff,0x00,0x00,0x00]
+ds_mskor_rtn_b64 v[5:6], v255, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xb0,0xd9,0xff,0x02,0x03,0x05]
 
-ds_mskor_rtn_b64 v[0:1], v0, v[254:255], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xb0,0xd9,0x00,0xfe,0x00,0x00]
+ds_mskor_rtn_b64 v[5:6], v1, v[254:255], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xb0,0xd9,0x01,0xfe,0x03,0x05]
 
-ds_mskor_rtn_b64 v[0:1], v0, v[0:1], v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xb0,0xd9,0x00,0x00,0xfe,0x00]
+ds_mskor_rtn_b64 v[5:6], v1, v[2:3], v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xb0,0xd9,0x01,0x02,0xfe,0x05]
 
-ds_mskor_rtn_b64 v[0:1], v0, v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xb0,0xd9,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b64 v[5:6], v1, v[2:3], v[3:4]
+// CHECK: [0x00,0x00,0xb0,0xd9,0x01,0x02,0x03,0x05]
 
-ds_mskor_rtn_b64 v[0:1], v0, v[0:1], v[0:1] offset:0
-// CHECK: [0x00,0x00,0xb0,0xd9,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b64 v[5:6], v1, v[2:3], v[3:4] offset:0
+// CHECK: [0x00,0x00,0xb0,0xd9,0x01,0x02,0x03,0x05]
 
-ds_mskor_rtn_b64 v[0:1], v0, v[0:1], v[0:1] offset:4
-// CHECK: [0x04,0x00,0xb0,0xd9,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b64 v[5:6], v1, v[2:3], v[3:4] offset:4
+// CHECK: [0x04,0x00,0xb0,0xd9,0x01,0x02,0x03,0x05]
 
-ds_mskor_rtn_b64 v[0:1], v0, v[0:1], v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xb2,0xd9,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b64 v[5:6], v1, v[2:3], v[3:4] offset:65535 gds
+// CHECK: [0xff,0xff,0xb2,0xd9,0x01,0x02,0x03,0x05]
 
-ds_wrxchg_rtn_b64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xb4,0xd9,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xb4,0xd9,0x01,0x02,0x00,0x05]
 
-ds_wrxchg_rtn_b64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xb4,0xd9,0x00,0x00,0x00,0xfe]
+ds_wrxchg_rtn_b64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xb4,0xd9,0x01,0x02,0x00,0xfe]
 
-ds_wrxchg_rtn_b64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xb4,0xd9,0xff,0x00,0x00,0x00]
+ds_wrxchg_rtn_b64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xb4,0xd9,0xff,0x02,0x00,0x05]
 
-ds_wrxchg_rtn_b64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xb4,0xd9,0x00,0xfe,0x00,0x00]
+ds_wrxchg_rtn_b64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xb4,0xd9,0x01,0xfe,0x00,0x05]
 
-ds_wrxchg_rtn_b64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xb4,0xd9,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xb4,0xd9,0x01,0x02,0x00,0x05]
 
-ds_wrxchg_rtn_b64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xb4,0xd9,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xb4,0xd9,0x01,0x02,0x00,0x05]
 
-ds_wrxchg_rtn_b64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xb4,0xd9,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xb4,0xd9,0x01,0x02,0x00,0x05]
 
-ds_wrxchg_rtn_b64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xb6,0xd9,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xb6,0xd9,0x01,0x02,0x00,0x05]
 
-ds_cmpst_rtn_b64 v[0:1], v0, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc0,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b64 v[5:6], v1, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xc0,0xd9,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_b64 v[254:255], v0, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc0,0xd9,0x00,0x00,0x00,0xfe]
+ds_cmpst_rtn_b64 v[254:255], v1, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xc0,0xd9,0x01,0x02,0x03,0xfe]
 
-ds_cmpst_rtn_b64 v[0:1], v255, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc0,0xd9,0xff,0x00,0x00,0x00]
+ds_cmpst_rtn_b64 v[5:6], v255, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xc0,0xd9,0xff,0x02,0x03,0x05]
 
-ds_cmpst_rtn_b64 v[0:1], v0, v[254:255], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc0,0xd9,0x00,0xfe,0x00,0x00]
+ds_cmpst_rtn_b64 v[5:6], v1, v[254:255], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xc0,0xd9,0x01,0xfe,0x03,0x05]
 
-ds_cmpst_rtn_b64 v[0:1], v0, v[0:1], v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xc0,0xd9,0x00,0x00,0xfe,0x00]
+ds_cmpst_rtn_b64 v[5:6], v1, v[2:3], v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xc0,0xd9,0x01,0x02,0xfe,0x05]
 
-ds_cmpst_rtn_b64 v[0:1], v0, v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b64 v[5:6], v1, v[2:3], v[3:4]
+// CHECK: [0x00,0x00,0xc0,0xd9,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_b64 v[0:1], v0, v[0:1], v[0:1] offset:0
-// CHECK: [0x00,0x00,0xc0,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b64 v[5:6], v1, v[2:3], v[3:4] offset:0
+// CHECK: [0x00,0x00,0xc0,0xd9,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_b64 v[0:1], v0, v[0:1], v[0:1] offset:4
-// CHECK: [0x04,0x00,0xc0,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b64 v[5:6], v1, v[2:3], v[3:4] offset:4
+// CHECK: [0x04,0x00,0xc0,0xd9,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_b64 v[0:1], v0, v[0:1], v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xc2,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b64 v[5:6], v1, v[2:3], v[3:4] offset:65535 gds
+// CHECK: [0xff,0xff,0xc2,0xd9,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f64 v[0:1], v0, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc4,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f64 v[5:6], v1, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xc4,0xd9,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f64 v[254:255], v0, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc4,0xd9,0x00,0x00,0x00,0xfe]
+ds_cmpst_rtn_f64 v[254:255], v1, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xc4,0xd9,0x01,0x02,0x03,0xfe]
 
-ds_cmpst_rtn_f64 v[0:1], v255, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc4,0xd9,0xff,0x00,0x00,0x00]
+ds_cmpst_rtn_f64 v[5:6], v255, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xc4,0xd9,0xff,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f64 v[0:1], v0, v[254:255], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc4,0xd9,0x00,0xfe,0x00,0x00]
+ds_cmpst_rtn_f64 v[5:6], v1, v[254:255], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xc4,0xd9,0x01,0xfe,0x03,0x05]
 
-ds_cmpst_rtn_f64 v[0:1], v0, v[0:1], v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xc4,0xd9,0x00,0x00,0xfe,0x00]
+ds_cmpst_rtn_f64 v[5:6], v1, v[2:3], v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xc4,0xd9,0x01,0x02,0xfe,0x05]
 
-ds_cmpst_rtn_f64 v[0:1], v0, v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f64 v[5:6], v1, v[2:3], v[3:4]
+// CHECK: [0x00,0x00,0xc4,0xd9,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f64 v[0:1], v0, v[0:1], v[0:1] offset:0
-// CHECK: [0x00,0x00,0xc4,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f64 v[5:6], v1, v[2:3], v[3:4] offset:0
+// CHECK: [0x00,0x00,0xc4,0xd9,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f64 v[0:1], v0, v[0:1], v[0:1] offset:4
-// CHECK: [0x04,0x00,0xc4,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f64 v[5:6], v1, v[2:3], v[3:4] offset:4
+// CHECK: [0x04,0x00,0xc4,0xd9,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f64 v[0:1], v0, v[0:1], v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xc6,0xd9,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f64 v[5:6], v1, v[2:3], v[3:4] offset:65535 gds
+// CHECK: [0xff,0xff,0xc6,0xd9,0x01,0x02,0x03,0x05]
 
-ds_min_rtn_f64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc8,0xd9,0x00,0x00,0x00,0x00]
+ds_min_rtn_f64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xc8,0xd9,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_f64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc8,0xd9,0x00,0x00,0x00,0xfe]
+ds_min_rtn_f64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xc8,0xd9,0x01,0x02,0x00,0xfe]
 
-ds_min_rtn_f64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc8,0xd9,0xff,0x00,0x00,0x00]
+ds_min_rtn_f64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xc8,0xd9,0xff,0x02,0x00,0x05]
 
-ds_min_rtn_f64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xc8,0xd9,0x00,0xfe,0x00,0x00]
+ds_min_rtn_f64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xc8,0xd9,0x01,0xfe,0x00,0x05]
 
-ds_min_rtn_f64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd9,0x00,0x00,0x00,0x00]
+ds_min_rtn_f64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xc8,0xd9,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_f64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xc8,0xd9,0x00,0x00,0x00,0x00]
+ds_min_rtn_f64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xc8,0xd9,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_f64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xc8,0xd9,0x00,0x00,0x00,0x00]
+ds_min_rtn_f64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xc8,0xd9,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_f64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xca,0xd9,0x00,0x00,0x00,0x00]
+ds_min_rtn_f64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xca,0xd9,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xcc,0xd9,0x00,0x00,0x00,0x00]
+ds_max_rtn_f64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xcc,0xd9,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xcc,0xd9,0x00,0x00,0x00,0xfe]
+ds_max_rtn_f64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xcc,0xd9,0x01,0x02,0x00,0xfe]
 
-ds_max_rtn_f64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xcc,0xd9,0xff,0x00,0x00,0x00]
+ds_max_rtn_f64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xcc,0xd9,0xff,0x02,0x00,0x05]
 
-ds_max_rtn_f64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xcc,0xd9,0x00,0xfe,0x00,0x00]
+ds_max_rtn_f64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xcc,0xd9,0x01,0xfe,0x00,0x05]
 
-ds_max_rtn_f64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd9,0x00,0x00,0x00,0x00]
+ds_max_rtn_f64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xcc,0xd9,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xcc,0xd9,0x00,0x00,0x00,0x00]
+ds_max_rtn_f64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xcc,0xd9,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xcc,0xd9,0x00,0x00,0x00,0x00]
+ds_max_rtn_f64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xcc,0xd9,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xce,0xd9,0x00,0x00,0x00,0x00]
+ds_max_rtn_f64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xce,0xd9,0x01,0x02,0x00,0x05]
 
-ds_read_b64 v[0:1], v0 offset:65535
-// CHECK: [0xff,0xff,0xd8,0xd9,0x00,0x00,0x00,0x00]
+ds_read_b64 v[5:6], v1 offset:65535
+// CHECK: [0xff,0xff,0xd8,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read_b64 v[254:255], v0 offset:65535
-// CHECK: [0xff,0xff,0xd8,0xd9,0x00,0x00,0x00,0xfe]
+ds_read_b64 v[254:255], v1 offset:65535
+// CHECK: [0xff,0xff,0xd8,0xd9,0x01,0x00,0x00,0xfe]
 
-ds_read_b64 v[0:1], v255 offset:65535
-// CHECK: [0xff,0xff,0xd8,0xd9,0xff,0x00,0x00,0x00]
+ds_read_b64 v[5:6], v255 offset:65535
+// CHECK: [0xff,0xff,0xd8,0xd9,0xff,0x00,0x00,0x05]
 
-ds_read_b64 v[0:1], v0
-// CHECK: [0x00,0x00,0xd8,0xd9,0x00,0x00,0x00,0x00]
+ds_read_b64 v[5:6], v1
+// CHECK: [0x00,0x00,0xd8,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read_b64 v[0:1], v0 offset:0
-// CHECK: [0x00,0x00,0xd8,0xd9,0x00,0x00,0x00,0x00]
+ds_read_b64 v[5:6], v1 offset:0
+// CHECK: [0x00,0x00,0xd8,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read_b64 v[0:1], v0 offset:4
-// CHECK: [0x04,0x00,0xd8,0xd9,0x00,0x00,0x00,0x00]
+ds_read_b64 v[5:6], v1 offset:4
+// CHECK: [0x04,0x00,0xd8,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read_b64 v[0:1], v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xda,0xd9,0x00,0x00,0x00,0x00]
+ds_read_b64 v[5:6], v1 offset:65535 gds
+// CHECK: [0xff,0xff,0xda,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read2_b64 v[0:3], v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0xdc,0xd9,0x00,0x00,0x00,0x00]
+ds_read2_b64 v[5:8], v1 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0xdc,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read2_b64 v[252:255], v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0xdc,0xd9,0x00,0x00,0x00,0xfc]
+ds_read2_b64 v[252:255], v1 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0xdc,0xd9,0x01,0x00,0x00,0xfc]
 
-ds_read2_b64 v[0:3], v255 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0xdc,0xd9,0xff,0x00,0x00,0x00]
+ds_read2_b64 v[5:8], v255 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0xdc,0xd9,0xff,0x00,0x00,0x05]
 
-ds_read2_b64 v[0:3], v0 offset1:255
-// CHECK: [0x00,0xff,0xdc,0xd9,0x00,0x00,0x00,0x00]
+ds_read2_b64 v[5:8], v1 offset1:255
+// CHECK: [0x00,0xff,0xdc,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read2_b64 v[0:3], v0 offset0:0 offset1:255
-// CHECK: [0x00,0xff,0xdc,0xd9,0x00,0x00,0x00,0x00]
+ds_read2_b64 v[5:8], v1 offset0:0 offset1:255
+// CHECK: [0x00,0xff,0xdc,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read2_b64 v[0:3], v0 offset0:16 offset1:255
-// CHECK: [0x10,0xff,0xdc,0xd9,0x00,0x00,0x00,0x00]
+ds_read2_b64 v[5:8], v1 offset0:16 offset1:255
+// CHECK: [0x10,0xff,0xdc,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read2_b64 v[0:3], v0 offset0:127
-// CHECK: [0x7f,0x00,0xdc,0xd9,0x00,0x00,0x00,0x00]
+ds_read2_b64 v[5:8], v1 offset0:127
+// CHECK: [0x7f,0x00,0xdc,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read2_b64 v[0:3], v0 offset0:127 offset1:0
-// CHECK: [0x7f,0x00,0xdc,0xd9,0x00,0x00,0x00,0x00]
+ds_read2_b64 v[5:8], v1 offset0:127 offset1:0
+// CHECK: [0x7f,0x00,0xdc,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read2_b64 v[0:3], v0 offset0:127 offset1:1
-// CHECK: [0x7f,0x01,0xdc,0xd9,0x00,0x00,0x00,0x00]
+ds_read2_b64 v[5:8], v1 offset0:127 offset1:1
+// CHECK: [0x7f,0x01,0xdc,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read2_b64 v[0:3], v0 offset0:127 offset1:255 gds
-// CHECK: [0x7f,0xff,0xde,0xd9,0x00,0x00,0x00,0x00]
+ds_read2_b64 v[5:8], v1 offset0:127 offset1:255 gds
+// CHECK: [0x7f,0xff,0xde,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b64 v[0:3], v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0xe0,0xd9,0x00,0x00,0x00,0x00]
+ds_read2st64_b64 v[5:8], v1 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0xe0,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b64 v[252:255], v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0xe0,0xd9,0x00,0x00,0x00,0xfc]
+ds_read2st64_b64 v[252:255], v1 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0xe0,0xd9,0x01,0x00,0x00,0xfc]
 
-ds_read2st64_b64 v[0:3], v255 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0xe0,0xd9,0xff,0x00,0x00,0x00]
+ds_read2st64_b64 v[5:8], v255 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0xe0,0xd9,0xff,0x00,0x00,0x05]
 
-ds_read2st64_b64 v[0:3], v0 offset1:255
-// CHECK: [0x00,0xff,0xe0,0xd9,0x00,0x00,0x00,0x00]
+ds_read2st64_b64 v[5:8], v1 offset1:255
+// CHECK: [0x00,0xff,0xe0,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b64 v[0:3], v0 offset0:0 offset1:255
-// CHECK: [0x00,0xff,0xe0,0xd9,0x00,0x00,0x00,0x00]
+ds_read2st64_b64 v[5:8], v1 offset0:0 offset1:255
+// CHECK: [0x00,0xff,0xe0,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b64 v[0:3], v0 offset0:16 offset1:255
-// CHECK: [0x10,0xff,0xe0,0xd9,0x00,0x00,0x00,0x00]
+ds_read2st64_b64 v[5:8], v1 offset0:16 offset1:255
+// CHECK: [0x10,0xff,0xe0,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b64 v[0:3], v0 offset0:127
-// CHECK: [0x7f,0x00,0xe0,0xd9,0x00,0x00,0x00,0x00]
+ds_read2st64_b64 v[5:8], v1 offset0:127
+// CHECK: [0x7f,0x00,0xe0,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b64 v[0:3], v0 offset0:127 offset1:0
-// CHECK: [0x7f,0x00,0xe0,0xd9,0x00,0x00,0x00,0x00]
+ds_read2st64_b64 v[5:8], v1 offset0:127 offset1:0
+// CHECK: [0x7f,0x00,0xe0,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b64 v[0:3], v0 offset0:127 offset1:1
-// CHECK: [0x7f,0x01,0xe0,0xd9,0x00,0x00,0x00,0x00]
+ds_read2st64_b64 v[5:8], v1 offset0:127 offset1:1
+// CHECK: [0x7f,0x01,0xe0,0xd9,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b64 v[0:3], v0 offset0:127 offset1:255 gds
-// CHECK: [0x7f,0xff,0xe2,0xd9,0x00,0x00,0x00,0x00]
+ds_read2st64_b64 v[5:8], v1 offset0:127 offset1:255 gds
+// CHECK: [0x7f,0xff,0xe2,0xd9,0x01,0x00,0x00,0x05]
 
-ds_add_src2_u32 v0 offset:65535
-// CHECK: [0xff,0xff,0x00,0xda,0x00,0x00,0x00,0x00]
+ds_add_src2_u32 v1 offset:65535
+// CHECK: [0xff,0xff,0x00,0xda,0x01,0x00,0x00,0x00]
 
 ds_add_src2_u32 v255 offset:65535
 // CHECK: [0xff,0xff,0x00,0xda,0xff,0x00,0x00,0x00]
 
-ds_add_src2_u32 v0
-// CHECK: [0x00,0x00,0x00,0xda,0x00,0x00,0x00,0x00]
+ds_add_src2_u32 v1
+// CHECK: [0x00,0x00,0x00,0xda,0x01,0x00,0x00,0x00]
 
-ds_add_src2_u32 v0 offset:0
-// CHECK: [0x00,0x00,0x00,0xda,0x00,0x00,0x00,0x00]
+ds_add_src2_u32 v1 offset:0
+// CHECK: [0x00,0x00,0x00,0xda,0x01,0x00,0x00,0x00]
 
-ds_add_src2_u32 v0 offset:4
-// CHECK: [0x04,0x00,0x00,0xda,0x00,0x00,0x00,0x00]
+ds_add_src2_u32 v1 offset:4
+// CHECK: [0x04,0x00,0x00,0xda,0x01,0x00,0x00,0x00]
 
-ds_add_src2_u32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x02,0xda,0x00,0x00,0x00,0x00]
+ds_add_src2_u32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x02,0xda,0x01,0x00,0x00,0x00]
 
-ds_sub_src2_u32 v0 offset:65535
-// CHECK: [0xff,0xff,0x04,0xda,0x00,0x00,0x00,0x00]
+ds_sub_src2_u32 v1 offset:65535
+// CHECK: [0xff,0xff,0x04,0xda,0x01,0x00,0x00,0x00]
 
 ds_sub_src2_u32 v255 offset:65535
 // CHECK: [0xff,0xff,0x04,0xda,0xff,0x00,0x00,0x00]
 
-ds_sub_src2_u32 v0
-// CHECK: [0x00,0x00,0x04,0xda,0x00,0x00,0x00,0x00]
+ds_sub_src2_u32 v1
+// CHECK: [0x00,0x00,0x04,0xda,0x01,0x00,0x00,0x00]
 
-ds_sub_src2_u32 v0 offset:0
-// CHECK: [0x00,0x00,0x04,0xda,0x00,0x00,0x00,0x00]
+ds_sub_src2_u32 v1 offset:0
+// CHECK: [0x00,0x00,0x04,0xda,0x01,0x00,0x00,0x00]
 
-ds_sub_src2_u32 v0 offset:4
-// CHECK: [0x04,0x00,0x04,0xda,0x00,0x00,0x00,0x00]
+ds_sub_src2_u32 v1 offset:4
+// CHECK: [0x04,0x00,0x04,0xda,0x01,0x00,0x00,0x00]
 
-ds_sub_src2_u32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x06,0xda,0x00,0x00,0x00,0x00]
+ds_sub_src2_u32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x06,0xda,0x01,0x00,0x00,0x00]
 
-ds_rsub_src2_u32 v0 offset:65535
-// CHECK: [0xff,0xff,0x08,0xda,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u32 v1 offset:65535
+// CHECK: [0xff,0xff,0x08,0xda,0x01,0x00,0x00,0x00]
 
 ds_rsub_src2_u32 v255 offset:65535
 // CHECK: [0xff,0xff,0x08,0xda,0xff,0x00,0x00,0x00]
 
-ds_rsub_src2_u32 v0
-// CHECK: [0x00,0x00,0x08,0xda,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u32 v1
+// CHECK: [0x00,0x00,0x08,0xda,0x01,0x00,0x00,0x00]
 
-ds_rsub_src2_u32 v0 offset:0
-// CHECK: [0x00,0x00,0x08,0xda,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u32 v1 offset:0
+// CHECK: [0x00,0x00,0x08,0xda,0x01,0x00,0x00,0x00]
 
-ds_rsub_src2_u32 v0 offset:4
-// CHECK: [0x04,0x00,0x08,0xda,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u32 v1 offset:4
+// CHECK: [0x04,0x00,0x08,0xda,0x01,0x00,0x00,0x00]
 
-ds_rsub_src2_u32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x0a,0xda,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x0a,0xda,0x01,0x00,0x00,0x00]
 
-ds_inc_src2_u32 v0 offset:65535
-// CHECK: [0xff,0xff,0x0c,0xda,0x00,0x00,0x00,0x00]
+ds_inc_src2_u32 v1 offset:65535
+// CHECK: [0xff,0xff,0x0c,0xda,0x01,0x00,0x00,0x00]
 
 ds_inc_src2_u32 v255 offset:65535
 // CHECK: [0xff,0xff,0x0c,0xda,0xff,0x00,0x00,0x00]
 
-ds_inc_src2_u32 v0
-// CHECK: [0x00,0x00,0x0c,0xda,0x00,0x00,0x00,0x00]
+ds_inc_src2_u32 v1
+// CHECK: [0x00,0x00,0x0c,0xda,0x01,0x00,0x00,0x00]
 
-ds_inc_src2_u32 v0 offset:0
-// CHECK: [0x00,0x00,0x0c,0xda,0x00,0x00,0x00,0x00]
+ds_inc_src2_u32 v1 offset:0
+// CHECK: [0x00,0x00,0x0c,0xda,0x01,0x00,0x00,0x00]
 
-ds_inc_src2_u32 v0 offset:4
-// CHECK: [0x04,0x00,0x0c,0xda,0x00,0x00,0x00,0x00]
+ds_inc_src2_u32 v1 offset:4
+// CHECK: [0x04,0x00,0x0c,0xda,0x01,0x00,0x00,0x00]
 
-ds_inc_src2_u32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x0e,0xda,0x00,0x00,0x00,0x00]
+ds_inc_src2_u32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x0e,0xda,0x01,0x00,0x00,0x00]
 
-ds_dec_src2_u32 v0 offset:65535
-// CHECK: [0xff,0xff,0x10,0xda,0x00,0x00,0x00,0x00]
+ds_dec_src2_u32 v1 offset:65535
+// CHECK: [0xff,0xff,0x10,0xda,0x01,0x00,0x00,0x00]
 
 ds_dec_src2_u32 v255 offset:65535
 // CHECK: [0xff,0xff,0x10,0xda,0xff,0x00,0x00,0x00]
 
-ds_dec_src2_u32 v0
-// CHECK: [0x00,0x00,0x10,0xda,0x00,0x00,0x00,0x00]
+ds_dec_src2_u32 v1
+// CHECK: [0x00,0x00,0x10,0xda,0x01,0x00,0x00,0x00]
 
-ds_dec_src2_u32 v0 offset:0
-// CHECK: [0x00,0x00,0x10,0xda,0x00,0x00,0x00,0x00]
+ds_dec_src2_u32 v1 offset:0
+// CHECK: [0x00,0x00,0x10,0xda,0x01,0x00,0x00,0x00]
 
-ds_dec_src2_u32 v0 offset:4
-// CHECK: [0x04,0x00,0x10,0xda,0x00,0x00,0x00,0x00]
+ds_dec_src2_u32 v1 offset:4
+// CHECK: [0x04,0x00,0x10,0xda,0x01,0x00,0x00,0x00]
 
-ds_dec_src2_u32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x12,0xda,0x00,0x00,0x00,0x00]
+ds_dec_src2_u32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x12,0xda,0x01,0x00,0x00,0x00]
 
-ds_min_src2_i32 v0 offset:65535
-// CHECK: [0xff,0xff,0x14,0xda,0x00,0x00,0x00,0x00]
+ds_min_src2_i32 v1 offset:65535
+// CHECK: [0xff,0xff,0x14,0xda,0x01,0x00,0x00,0x00]
 
 ds_min_src2_i32 v255 offset:65535
 // CHECK: [0xff,0xff,0x14,0xda,0xff,0x00,0x00,0x00]
 
-ds_min_src2_i32 v0
-// CHECK: [0x00,0x00,0x14,0xda,0x00,0x00,0x00,0x00]
+ds_min_src2_i32 v1
+// CHECK: [0x00,0x00,0x14,0xda,0x01,0x00,0x00,0x00]
 
-ds_min_src2_i32 v0 offset:0
-// CHECK: [0x00,0x00,0x14,0xda,0x00,0x00,0x00,0x00]
+ds_min_src2_i32 v1 offset:0
+// CHECK: [0x00,0x00,0x14,0xda,0x01,0x00,0x00,0x00]
 
-ds_min_src2_i32 v0 offset:4
-// CHECK: [0x04,0x00,0x14,0xda,0x00,0x00,0x00,0x00]
+ds_min_src2_i32 v1 offset:4
+// CHECK: [0x04,0x00,0x14,0xda,0x01,0x00,0x00,0x00]
 
-ds_min_src2_i32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x16,0xda,0x00,0x00,0x00,0x00]
+ds_min_src2_i32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x16,0xda,0x01,0x00,0x00,0x00]
 
-ds_max_src2_i32 v0 offset:65535
-// CHECK: [0xff,0xff,0x18,0xda,0x00,0x00,0x00,0x00]
+ds_max_src2_i32 v1 offset:65535
+// CHECK: [0xff,0xff,0x18,0xda,0x01,0x00,0x00,0x00]
 
 ds_max_src2_i32 v255 offset:65535
 // CHECK: [0xff,0xff,0x18,0xda,0xff,0x00,0x00,0x00]
 
-ds_max_src2_i32 v0
-// CHECK: [0x00,0x00,0x18,0xda,0x00,0x00,0x00,0x00]
+ds_max_src2_i32 v1
+// CHECK: [0x00,0x00,0x18,0xda,0x01,0x00,0x00,0x00]
 
-ds_max_src2_i32 v0 offset:0
-// CHECK: [0x00,0x00,0x18,0xda,0x00,0x00,0x00,0x00]
+ds_max_src2_i32 v1 offset:0
+// CHECK: [0x00,0x00,0x18,0xda,0x01,0x00,0x00,0x00]
 
-ds_max_src2_i32 v0 offset:4
-// CHECK: [0x04,0x00,0x18,0xda,0x00,0x00,0x00,0x00]
+ds_max_src2_i32 v1 offset:4
+// CHECK: [0x04,0x00,0x18,0xda,0x01,0x00,0x00,0x00]
 
-ds_max_src2_i32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x1a,0xda,0x00,0x00,0x00,0x00]
+ds_max_src2_i32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x1a,0xda,0x01,0x00,0x00,0x00]
 
-ds_min_src2_u32 v0 offset:65535
-// CHECK: [0xff,0xff,0x1c,0xda,0x00,0x00,0x00,0x00]
+ds_min_src2_u32 v1 offset:65535
+// CHECK: [0xff,0xff,0x1c,0xda,0x01,0x00,0x00,0x00]
 
 ds_min_src2_u32 v255 offset:65535
 // CHECK: [0xff,0xff,0x1c,0xda,0xff,0x00,0x00,0x00]
 
-ds_min_src2_u32 v0
-// CHECK: [0x00,0x00,0x1c,0xda,0x00,0x00,0x00,0x00]
+ds_min_src2_u32 v1
+// CHECK: [0x00,0x00,0x1c,0xda,0x01,0x00,0x00,0x00]
 
-ds_min_src2_u32 v0 offset:0
-// CHECK: [0x00,0x00,0x1c,0xda,0x00,0x00,0x00,0x00]
+ds_min_src2_u32 v1 offset:0
+// CHECK: [0x00,0x00,0x1c,0xda,0x01,0x00,0x00,0x00]
 
-ds_min_src2_u32 v0 offset:4
-// CHECK: [0x04,0x00,0x1c,0xda,0x00,0x00,0x00,0x00]
+ds_min_src2_u32 v1 offset:4
+// CHECK: [0x04,0x00,0x1c,0xda,0x01,0x00,0x00,0x00]
 
-ds_min_src2_u32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x1e,0xda,0x00,0x00,0x00,0x00]
+ds_min_src2_u32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x1e,0xda,0x01,0x00,0x00,0x00]
 
-ds_max_src2_u32 v0 offset:65535
-// CHECK: [0xff,0xff,0x20,0xda,0x00,0x00,0x00,0x00]
+ds_max_src2_u32 v1 offset:65535
+// CHECK: [0xff,0xff,0x20,0xda,0x01,0x00,0x00,0x00]
 
 ds_max_src2_u32 v255 offset:65535
 // CHECK: [0xff,0xff,0x20,0xda,0xff,0x00,0x00,0x00]
 
-ds_max_src2_u32 v0
-// CHECK: [0x00,0x00,0x20,0xda,0x00,0x00,0x00,0x00]
+ds_max_src2_u32 v1
+// CHECK: [0x00,0x00,0x20,0xda,0x01,0x00,0x00,0x00]
 
-ds_max_src2_u32 v0 offset:0
-// CHECK: [0x00,0x00,0x20,0xda,0x00,0x00,0x00,0x00]
+ds_max_src2_u32 v1 offset:0
+// CHECK: [0x00,0x00,0x20,0xda,0x01,0x00,0x00,0x00]
 
-ds_max_src2_u32 v0 offset:4
-// CHECK: [0x04,0x00,0x20,0xda,0x00,0x00,0x00,0x00]
+ds_max_src2_u32 v1 offset:4
+// CHECK: [0x04,0x00,0x20,0xda,0x01,0x00,0x00,0x00]
 
-ds_max_src2_u32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x22,0xda,0x00,0x00,0x00,0x00]
+ds_max_src2_u32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x22,0xda,0x01,0x00,0x00,0x00]
 
-ds_or_src2_b32 v0 offset:65535
-// CHECK: [0xff,0xff,0x28,0xda,0x00,0x00,0x00,0x00]
+ds_or_src2_b32 v1 offset:65535
+// CHECK: [0xff,0xff,0x28,0xda,0x01,0x00,0x00,0x00]
 
 ds_or_src2_b32 v255 offset:65535
 // CHECK: [0xff,0xff,0x28,0xda,0xff,0x00,0x00,0x00]
 
-ds_or_src2_b32 v0
-// CHECK: [0x00,0x00,0x28,0xda,0x00,0x00,0x00,0x00]
+ds_or_src2_b32 v1
+// CHECK: [0x00,0x00,0x28,0xda,0x01,0x00,0x00,0x00]
 
-ds_or_src2_b32 v0 offset:0
-// CHECK: [0x00,0x00,0x28,0xda,0x00,0x00,0x00,0x00]
+ds_or_src2_b32 v1 offset:0
+// CHECK: [0x00,0x00,0x28,0xda,0x01,0x00,0x00,0x00]
 
-ds_or_src2_b32 v0 offset:4
-// CHECK: [0x04,0x00,0x28,0xda,0x00,0x00,0x00,0x00]
+ds_or_src2_b32 v1 offset:4
+// CHECK: [0x04,0x00,0x28,0xda,0x01,0x00,0x00,0x00]
 
-ds_or_src2_b32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x2a,0xda,0x00,0x00,0x00,0x00]
+ds_or_src2_b32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x2a,0xda,0x01,0x00,0x00,0x00]
 
-ds_xor_src2_b32 v0 offset:65535
-// CHECK: [0xff,0xff,0x2c,0xda,0x00,0x00,0x00,0x00]
+ds_xor_src2_b32 v1 offset:65535
+// CHECK: [0xff,0xff,0x2c,0xda,0x01,0x00,0x00,0x00]
 
 ds_xor_src2_b32 v255 offset:65535
 // CHECK: [0xff,0xff,0x2c,0xda,0xff,0x00,0x00,0x00]
 
-ds_xor_src2_b32 v0
-// CHECK: [0x00,0x00,0x2c,0xda,0x00,0x00,0x00,0x00]
+ds_xor_src2_b32 v1
+// CHECK: [0x00,0x00,0x2c,0xda,0x01,0x00,0x00,0x00]
 
-ds_xor_src2_b32 v0 offset:0
-// CHECK: [0x00,0x00,0x2c,0xda,0x00,0x00,0x00,0x00]
+ds_xor_src2_b32 v1 offset:0
+// CHECK: [0x00,0x00,0x2c,0xda,0x01,0x00,0x00,0x00]
 
-ds_xor_src2_b32 v0 offset:4
-// CHECK: [0x04,0x00,0x2c,0xda,0x00,0x00,0x00,0x00]
+ds_xor_src2_b32 v1 offset:4
+// CHECK: [0x04,0x00,0x2c,0xda,0x01,0x00,0x00,0x00]
 
-ds_xor_src2_b32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x2e,0xda,0x00,0x00,0x00,0x00]
+ds_xor_src2_b32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x2e,0xda,0x01,0x00,0x00,0x00]
 
-ds_write_src2_b32 v0
-// CHECK: [0x00,0x00,0x34,0xda,0x00,0x00,0x00,0x00]
+ds_write_src2_b32 v1
+// CHECK: [0x00,0x00,0x34,0xda,0x01,0x00,0x00,0x00]
 
-ds_min_src2_f32 v0 offset:65535
-// CHECK: [0xff,0xff,0x48,0xda,0x00,0x00,0x00,0x00]
+ds_min_src2_f32 v1 offset:65535
+// CHECK: [0xff,0xff,0x48,0xda,0x01,0x00,0x00,0x00]
 
 ds_min_src2_f32 v255 offset:65535
 // CHECK: [0xff,0xff,0x48,0xda,0xff,0x00,0x00,0x00]
 
-ds_min_src2_f32 v0
-// CHECK: [0x00,0x00,0x48,0xda,0x00,0x00,0x00,0x00]
+ds_min_src2_f32 v1
+// CHECK: [0x00,0x00,0x48,0xda,0x01,0x00,0x00,0x00]
 
-ds_min_src2_f32 v0 offset:0
-// CHECK: [0x00,0x00,0x48,0xda,0x00,0x00,0x00,0x00]
+ds_min_src2_f32 v1 offset:0
+// CHECK: [0x00,0x00,0x48,0xda,0x01,0x00,0x00,0x00]
 
-ds_min_src2_f32 v0 offset:4
-// CHECK: [0x04,0x00,0x48,0xda,0x00,0x00,0x00,0x00]
+ds_min_src2_f32 v1 offset:4
+// CHECK: [0x04,0x00,0x48,0xda,0x01,0x00,0x00,0x00]
 
-ds_min_src2_f32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x4a,0xda,0x00,0x00,0x00,0x00]
+ds_min_src2_f32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x4a,0xda,0x01,0x00,0x00,0x00]
 
-ds_max_src2_f32 v0 offset:65535
-// CHECK: [0xff,0xff,0x4c,0xda,0x00,0x00,0x00,0x00]
+ds_max_src2_f32 v1 offset:65535
+// CHECK: [0xff,0xff,0x4c,0xda,0x01,0x00,0x00,0x00]
 
 ds_max_src2_f32 v255 offset:65535
 // CHECK: [0xff,0xff,0x4c,0xda,0xff,0x00,0x00,0x00]
 
-ds_max_src2_f32 v0
-// CHECK: [0x00,0x00,0x4c,0xda,0x00,0x00,0x00,0x00]
+ds_max_src2_f32 v1
+// CHECK: [0x00,0x00,0x4c,0xda,0x01,0x00,0x00,0x00]
 
-ds_max_src2_f32 v0 offset:0
-// CHECK: [0x00,0x00,0x4c,0xda,0x00,0x00,0x00,0x00]
+ds_max_src2_f32 v1 offset:0
+// CHECK: [0x00,0x00,0x4c,0xda,0x01,0x00,0x00,0x00]
 
-ds_max_src2_f32 v0 offset:4
-// CHECK: [0x04,0x00,0x4c,0xda,0x00,0x00,0x00,0x00]
+ds_max_src2_f32 v1 offset:4
+// CHECK: [0x04,0x00,0x4c,0xda,0x01,0x00,0x00,0x00]
 
-ds_max_src2_f32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x4e,0xda,0x00,0x00,0x00,0x00]
+ds_max_src2_f32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x4e,0xda,0x01,0x00,0x00,0x00]
 
-ds_add_src2_u64 v0 offset:65535
-// CHECK: [0xff,0xff,0x00,0xdb,0x00,0x00,0x00,0x00]
+ds_add_src2_u64 v1 offset:65535
+// CHECK: [0xff,0xff,0x00,0xdb,0x01,0x00,0x00,0x00]
 
 ds_add_src2_u64 v255 offset:65535
 // CHECK: [0xff,0xff,0x00,0xdb,0xff,0x00,0x00,0x00]
 
-ds_add_src2_u64 v0
-// CHECK: [0x00,0x00,0x00,0xdb,0x00,0x00,0x00,0x00]
+ds_add_src2_u64 v1
+// CHECK: [0x00,0x00,0x00,0xdb,0x01,0x00,0x00,0x00]
 
-ds_add_src2_u64 v0 offset:0
-// CHECK: [0x00,0x00,0x00,0xdb,0x00,0x00,0x00,0x00]
+ds_add_src2_u64 v1 offset:0
+// CHECK: [0x00,0x00,0x00,0xdb,0x01,0x00,0x00,0x00]
 
-ds_add_src2_u64 v0 offset:4
-// CHECK: [0x04,0x00,0x00,0xdb,0x00,0x00,0x00,0x00]
+ds_add_src2_u64 v1 offset:4
+// CHECK: [0x04,0x00,0x00,0xdb,0x01,0x00,0x00,0x00]
 
-ds_add_src2_u64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x02,0xdb,0x00,0x00,0x00,0x00]
+ds_add_src2_u64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x02,0xdb,0x01,0x00,0x00,0x00]
 
-ds_sub_src2_u64 v0 offset:65535
-// CHECK: [0xff,0xff,0x04,0xdb,0x00,0x00,0x00,0x00]
+ds_sub_src2_u64 v1 offset:65535
+// CHECK: [0xff,0xff,0x04,0xdb,0x01,0x00,0x00,0x00]
 
 ds_sub_src2_u64 v255 offset:65535
 // CHECK: [0xff,0xff,0x04,0xdb,0xff,0x00,0x00,0x00]
 
-ds_sub_src2_u64 v0
-// CHECK: [0x00,0x00,0x04,0xdb,0x00,0x00,0x00,0x00]
+ds_sub_src2_u64 v1
+// CHECK: [0x00,0x00,0x04,0xdb,0x01,0x00,0x00,0x00]
 
-ds_sub_src2_u64 v0 offset:0
-// CHECK: [0x00,0x00,0x04,0xdb,0x00,0x00,0x00,0x00]
+ds_sub_src2_u64 v1 offset:0
+// CHECK: [0x00,0x00,0x04,0xdb,0x01,0x00,0x00,0x00]
 
-ds_sub_src2_u64 v0 offset:4
-// CHECK: [0x04,0x00,0x04,0xdb,0x00,0x00,0x00,0x00]
+ds_sub_src2_u64 v1 offset:4
+// CHECK: [0x04,0x00,0x04,0xdb,0x01,0x00,0x00,0x00]
 
-ds_sub_src2_u64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x06,0xdb,0x00,0x00,0x00,0x00]
+ds_sub_src2_u64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x06,0xdb,0x01,0x00,0x00,0x00]
 
-ds_rsub_src2_u64 v0 offset:65535
-// CHECK: [0xff,0xff,0x08,0xdb,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u64 v1 offset:65535
+// CHECK: [0xff,0xff,0x08,0xdb,0x01,0x00,0x00,0x00]
 
 ds_rsub_src2_u64 v255 offset:65535
 // CHECK: [0xff,0xff,0x08,0xdb,0xff,0x00,0x00,0x00]
 
-ds_rsub_src2_u64 v0
-// CHECK: [0x00,0x00,0x08,0xdb,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u64 v1
+// CHECK: [0x00,0x00,0x08,0xdb,0x01,0x00,0x00,0x00]
 
-ds_rsub_src2_u64 v0 offset:0
-// CHECK: [0x00,0x00,0x08,0xdb,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u64 v1 offset:0
+// CHECK: [0x00,0x00,0x08,0xdb,0x01,0x00,0x00,0x00]
 
-ds_rsub_src2_u64 v0 offset:4
-// CHECK: [0x04,0x00,0x08,0xdb,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u64 v1 offset:4
+// CHECK: [0x04,0x00,0x08,0xdb,0x01,0x00,0x00,0x00]
 
-ds_rsub_src2_u64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x0a,0xdb,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x0a,0xdb,0x01,0x00,0x00,0x00]
 
-ds_inc_src2_u64 v0 offset:65535
-// CHECK: [0xff,0xff,0x0c,0xdb,0x00,0x00,0x00,0x00]
+ds_inc_src2_u64 v1 offset:65535
+// CHECK: [0xff,0xff,0x0c,0xdb,0x01,0x00,0x00,0x00]
 
 ds_inc_src2_u64 v255 offset:65535
 // CHECK: [0xff,0xff,0x0c,0xdb,0xff,0x00,0x00,0x00]
 
-ds_inc_src2_u64 v0
-// CHECK: [0x00,0x00,0x0c,0xdb,0x00,0x00,0x00,0x00]
+ds_inc_src2_u64 v1
+// CHECK: [0x00,0x00,0x0c,0xdb,0x01,0x00,0x00,0x00]
 
-ds_inc_src2_u64 v0 offset:0
-// CHECK: [0x00,0x00,0x0c,0xdb,0x00,0x00,0x00,0x00]
+ds_inc_src2_u64 v1 offset:0
+// CHECK: [0x00,0x00,0x0c,0xdb,0x01,0x00,0x00,0x00]
 
-ds_inc_src2_u64 v0 offset:4
-// CHECK: [0x04,0x00,0x0c,0xdb,0x00,0x00,0x00,0x00]
+ds_inc_src2_u64 v1 offset:4
+// CHECK: [0x04,0x00,0x0c,0xdb,0x01,0x00,0x00,0x00]
 
-ds_inc_src2_u64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x0e,0xdb,0x00,0x00,0x00,0x00]
+ds_inc_src2_u64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x0e,0xdb,0x01,0x00,0x00,0x00]
 
-ds_dec_src2_u64 v0 offset:65535
-// CHECK: [0xff,0xff,0x10,0xdb,0x00,0x00,0x00,0x00]
+ds_dec_src2_u64 v1 offset:65535
+// CHECK: [0xff,0xff,0x10,0xdb,0x01,0x00,0x00,0x00]
 
 ds_dec_src2_u64 v255 offset:65535
 // CHECK: [0xff,0xff,0x10,0xdb,0xff,0x00,0x00,0x00]
 
-ds_dec_src2_u64 v0
-// CHECK: [0x00,0x00,0x10,0xdb,0x00,0x00,0x00,0x00]
+ds_dec_src2_u64 v1
+// CHECK: [0x00,0x00,0x10,0xdb,0x01,0x00,0x00,0x00]
 
-ds_dec_src2_u64 v0 offset:0
-// CHECK: [0x00,0x00,0x10,0xdb,0x00,0x00,0x00,0x00]
+ds_dec_src2_u64 v1 offset:0
+// CHECK: [0x00,0x00,0x10,0xdb,0x01,0x00,0x00,0x00]
 
-ds_dec_src2_u64 v0 offset:4
-// CHECK: [0x04,0x00,0x10,0xdb,0x00,0x00,0x00,0x00]
+ds_dec_src2_u64 v1 offset:4
+// CHECK: [0x04,0x00,0x10,0xdb,0x01,0x00,0x00,0x00]
 
-ds_dec_src2_u64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x12,0xdb,0x00,0x00,0x00,0x00]
+ds_dec_src2_u64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x12,0xdb,0x01,0x00,0x00,0x00]
 
-ds_min_src2_i64 v0 offset:65535
-// CHECK: [0xff,0xff,0x14,0xdb,0x00,0x00,0x00,0x00]
+ds_min_src2_i64 v1 offset:65535
+// CHECK: [0xff,0xff,0x14,0xdb,0x01,0x00,0x00,0x00]
 
 ds_min_src2_i64 v255 offset:65535
 // CHECK: [0xff,0xff,0x14,0xdb,0xff,0x00,0x00,0x00]
 
-ds_min_src2_i64 v0
-// CHECK: [0x00,0x00,0x14,0xdb,0x00,0x00,0x00,0x00]
+ds_min_src2_i64 v1
+// CHECK: [0x00,0x00,0x14,0xdb,0x01,0x00,0x00,0x00]
 
-ds_min_src2_i64 v0 offset:0
-// CHECK: [0x00,0x00,0x14,0xdb,0x00,0x00,0x00,0x00]
+ds_min_src2_i64 v1 offset:0
+// CHECK: [0x00,0x00,0x14,0xdb,0x01,0x00,0x00,0x00]
 
-ds_min_src2_i64 v0 offset:4
-// CHECK: [0x04,0x00,0x14,0xdb,0x00,0x00,0x00,0x00]
+ds_min_src2_i64 v1 offset:4
+// CHECK: [0x04,0x00,0x14,0xdb,0x01,0x00,0x00,0x00]
 
-ds_min_src2_i64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x16,0xdb,0x00,0x00,0x00,0x00]
+ds_min_src2_i64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x16,0xdb,0x01,0x00,0x00,0x00]
 
-ds_max_src2_i64 v0 offset:65535
-// CHECK: [0xff,0xff,0x18,0xdb,0x00,0x00,0x00,0x00]
+ds_max_src2_i64 v1 offset:65535
+// CHECK: [0xff,0xff,0x18,0xdb,0x01,0x00,0x00,0x00]
 
 ds_max_src2_i64 v255 offset:65535
 // CHECK: [0xff,0xff,0x18,0xdb,0xff,0x00,0x00,0x00]
 
-ds_max_src2_i64 v0
-// CHECK: [0x00,0x00,0x18,0xdb,0x00,0x00,0x00,0x00]
+ds_max_src2_i64 v1
+// CHECK: [0x00,0x00,0x18,0xdb,0x01,0x00,0x00,0x00]
 
-ds_max_src2_i64 v0 offset:0
-// CHECK: [0x00,0x00,0x18,0xdb,0x00,0x00,0x00,0x00]
+ds_max_src2_i64 v1 offset:0
+// CHECK: [0x00,0x00,0x18,0xdb,0x01,0x00,0x00,0x00]
 
-ds_max_src2_i64 v0 offset:4
-// CHECK: [0x04,0x00,0x18,0xdb,0x00,0x00,0x00,0x00]
+ds_max_src2_i64 v1 offset:4
+// CHECK: [0x04,0x00,0x18,0xdb,0x01,0x00,0x00,0x00]
 
-ds_max_src2_i64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x1a,0xdb,0x00,0x00,0x00,0x00]
+ds_max_src2_i64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x1a,0xdb,0x01,0x00,0x00,0x00]
 
-ds_min_src2_u64 v0 offset:65535
-// CHECK: [0xff,0xff,0x1c,0xdb,0x00,0x00,0x00,0x00]
+ds_min_src2_u64 v1 offset:65535
+// CHECK: [0xff,0xff,0x1c,0xdb,0x01,0x00,0x00,0x00]
 
 ds_min_src2_u64 v255 offset:65535
 // CHECK: [0xff,0xff,0x1c,0xdb,0xff,0x00,0x00,0x00]
 
-ds_min_src2_u64 v0
-// CHECK: [0x00,0x00,0x1c,0xdb,0x00,0x00,0x00,0x00]
+ds_min_src2_u64 v1
+// CHECK: [0x00,0x00,0x1c,0xdb,0x01,0x00,0x00,0x00]
 
-ds_min_src2_u64 v0 offset:0
-// CHECK: [0x00,0x00,0x1c,0xdb,0x00,0x00,0x00,0x00]
+ds_min_src2_u64 v1 offset:0
+// CHECK: [0x00,0x00,0x1c,0xdb,0x01,0x00,0x00,0x00]
 
-ds_min_src2_u64 v0 offset:4
-// CHECK: [0x04,0x00,0x1c,0xdb,0x00,0x00,0x00,0x00]
+ds_min_src2_u64 v1 offset:4
+// CHECK: [0x04,0x00,0x1c,0xdb,0x01,0x00,0x00,0x00]
 
-ds_min_src2_u64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x1e,0xdb,0x00,0x00,0x00,0x00]
+ds_min_src2_u64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x1e,0xdb,0x01,0x00,0x00,0x00]
 
-ds_max_src2_u64 v0 offset:65535
-// CHECK: [0xff,0xff,0x20,0xdb,0x00,0x00,0x00,0x00]
+ds_max_src2_u64 v1 offset:65535
+// CHECK: [0xff,0xff,0x20,0xdb,0x01,0x00,0x00,0x00]
 
 ds_max_src2_u64 v255 offset:65535
 // CHECK: [0xff,0xff,0x20,0xdb,0xff,0x00,0x00,0x00]
 
-ds_max_src2_u64 v0
-// CHECK: [0x00,0x00,0x20,0xdb,0x00,0x00,0x00,0x00]
+ds_max_src2_u64 v1
+// CHECK: [0x00,0x00,0x20,0xdb,0x01,0x00,0x00,0x00]
 
-ds_max_src2_u64 v0 offset:0
-// CHECK: [0x00,0x00,0x20,0xdb,0x00,0x00,0x00,0x00]
+ds_max_src2_u64 v1 offset:0
+// CHECK: [0x00,0x00,0x20,0xdb,0x01,0x00,0x00,0x00]
 
-ds_max_src2_u64 v0 offset:4
-// CHECK: [0x04,0x00,0x20,0xdb,0x00,0x00,0x00,0x00]
+ds_max_src2_u64 v1 offset:4
+// CHECK: [0x04,0x00,0x20,0xdb,0x01,0x00,0x00,0x00]
 
-ds_max_src2_u64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x22,0xdb,0x00,0x00,0x00,0x00]
+ds_max_src2_u64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x22,0xdb,0x01,0x00,0x00,0x00]
 
-ds_and_src2_b64 v0 offset:65535
-// CHECK: [0xff,0xff,0x24,0xdb,0x00,0x00,0x00,0x00]
+ds_and_src2_b64 v1 offset:65535
+// CHECK: [0xff,0xff,0x24,0xdb,0x01,0x00,0x00,0x00]
 
 ds_and_src2_b64 v255 offset:65535
 // CHECK: [0xff,0xff,0x24,0xdb,0xff,0x00,0x00,0x00]
 
-ds_and_src2_b64 v0
-// CHECK: [0x00,0x00,0x24,0xdb,0x00,0x00,0x00,0x00]
+ds_and_src2_b64 v1
+// CHECK: [0x00,0x00,0x24,0xdb,0x01,0x00,0x00,0x00]
 
-ds_and_src2_b64 v0 offset:0
-// CHECK: [0x00,0x00,0x24,0xdb,0x00,0x00,0x00,0x00]
+ds_and_src2_b64 v1 offset:0
+// CHECK: [0x00,0x00,0x24,0xdb,0x01,0x00,0x00,0x00]
 
-ds_and_src2_b64 v0 offset:4
-// CHECK: [0x04,0x00,0x24,0xdb,0x00,0x00,0x00,0x00]
+ds_and_src2_b64 v1 offset:4
+// CHECK: [0x04,0x00,0x24,0xdb,0x01,0x00,0x00,0x00]
 
-ds_and_src2_b64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x26,0xdb,0x00,0x00,0x00,0x00]
+ds_and_src2_b64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x26,0xdb,0x01,0x00,0x00,0x00]
 
-ds_or_src2_b64 v0 offset:65535
-// CHECK: [0xff,0xff,0x28,0xdb,0x00,0x00,0x00,0x00]
+ds_or_src2_b64 v1 offset:65535
+// CHECK: [0xff,0xff,0x28,0xdb,0x01,0x00,0x00,0x00]
 
 ds_or_src2_b64 v255 offset:65535
 // CHECK: [0xff,0xff,0x28,0xdb,0xff,0x00,0x00,0x00]
 
-ds_or_src2_b64 v0
-// CHECK: [0x00,0x00,0x28,0xdb,0x00,0x00,0x00,0x00]
+ds_or_src2_b64 v1
+// CHECK: [0x00,0x00,0x28,0xdb,0x01,0x00,0x00,0x00]
 
-ds_or_src2_b64 v0 offset:0
-// CHECK: [0x00,0x00,0x28,0xdb,0x00,0x00,0x00,0x00]
+ds_or_src2_b64 v1 offset:0
+// CHECK: [0x00,0x00,0x28,0xdb,0x01,0x00,0x00,0x00]
 
-ds_or_src2_b64 v0 offset:4
-// CHECK: [0x04,0x00,0x28,0xdb,0x00,0x00,0x00,0x00]
+ds_or_src2_b64 v1 offset:4
+// CHECK: [0x04,0x00,0x28,0xdb,0x01,0x00,0x00,0x00]
 
-ds_or_src2_b64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x2a,0xdb,0x00,0x00,0x00,0x00]
+ds_or_src2_b64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x2a,0xdb,0x01,0x00,0x00,0x00]
 
-ds_xor_src2_b64 v0 offset:65535
-// CHECK: [0xff,0xff,0x2c,0xdb,0x00,0x00,0x00,0x00]
+ds_xor_src2_b64 v1 offset:65535
+// CHECK: [0xff,0xff,0x2c,0xdb,0x01,0x00,0x00,0x00]
 
 ds_xor_src2_b64 v255 offset:65535
 // CHECK: [0xff,0xff,0x2c,0xdb,0xff,0x00,0x00,0x00]
 
-ds_xor_src2_b64 v0
-// CHECK: [0x00,0x00,0x2c,0xdb,0x00,0x00,0x00,0x00]
+ds_xor_src2_b64 v1
+// CHECK: [0x00,0x00,0x2c,0xdb,0x01,0x00,0x00,0x00]
 
-ds_xor_src2_b64 v0 offset:0
-// CHECK: [0x00,0x00,0x2c,0xdb,0x00,0x00,0x00,0x00]
+ds_xor_src2_b64 v1 offset:0
+// CHECK: [0x00,0x00,0x2c,0xdb,0x01,0x00,0x00,0x00]
 
-ds_xor_src2_b64 v0 offset:4
-// CHECK: [0x04,0x00,0x2c,0xdb,0x00,0x00,0x00,0x00]
+ds_xor_src2_b64 v1 offset:4
+// CHECK: [0x04,0x00,0x2c,0xdb,0x01,0x00,0x00,0x00]
 
-ds_xor_src2_b64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x2e,0xdb,0x00,0x00,0x00,0x00]
+ds_xor_src2_b64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x2e,0xdb,0x01,0x00,0x00,0x00]
 
-ds_write_src2_b64 v0
-// CHECK: [0x00,0x00,0x34,0xdb,0x00,0x00,0x00,0x00]
+ds_write_src2_b64 v1
+// CHECK: [0x00,0x00,0x34,0xdb,0x01,0x00,0x00,0x00]
 
-ds_min_src2_f64 v0 offset:65535
-// CHECK: [0xff,0xff,0x48,0xdb,0x00,0x00,0x00,0x00]
+ds_min_src2_f64 v1 offset:65535
+// CHECK: [0xff,0xff,0x48,0xdb,0x01,0x00,0x00,0x00]
 
 ds_min_src2_f64 v255 offset:65535
 // CHECK: [0xff,0xff,0x48,0xdb,0xff,0x00,0x00,0x00]
 
-ds_min_src2_f64 v0
-// CHECK: [0x00,0x00,0x48,0xdb,0x00,0x00,0x00,0x00]
+ds_min_src2_f64 v1
+// CHECK: [0x00,0x00,0x48,0xdb,0x01,0x00,0x00,0x00]
 
-ds_min_src2_f64 v0 offset:0
-// CHECK: [0x00,0x00,0x48,0xdb,0x00,0x00,0x00,0x00]
+ds_min_src2_f64 v1 offset:0
+// CHECK: [0x00,0x00,0x48,0xdb,0x01,0x00,0x00,0x00]
 
-ds_min_src2_f64 v0 offset:4
-// CHECK: [0x04,0x00,0x48,0xdb,0x00,0x00,0x00,0x00]
+ds_min_src2_f64 v1 offset:4
+// CHECK: [0x04,0x00,0x48,0xdb,0x01,0x00,0x00,0x00]
 
-ds_min_src2_f64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x4a,0xdb,0x00,0x00,0x00,0x00]
+ds_min_src2_f64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x4a,0xdb,0x01,0x00,0x00,0x00]
 
-ds_max_src2_f64 v0 offset:65535
-// CHECK: [0xff,0xff,0x4c,0xdb,0x00,0x00,0x00,0x00]
+ds_max_src2_f64 v1 offset:65535
+// CHECK: [0xff,0xff,0x4c,0xdb,0x01,0x00,0x00,0x00]
 
 ds_max_src2_f64 v255 offset:65535
 // CHECK: [0xff,0xff,0x4c,0xdb,0xff,0x00,0x00,0x00]
 
-ds_max_src2_f64 v0
-// CHECK: [0x00,0x00,0x4c,0xdb,0x00,0x00,0x00,0x00]
+ds_max_src2_f64 v1
+// CHECK: [0x00,0x00,0x4c,0xdb,0x01,0x00,0x00,0x00]
+
+ds_max_src2_f64 v1 offset:0
+// CHECK: [0x00,0x00,0x4c,0xdb,0x01,0x00,0x00,0x00]
+
+ds_max_src2_f64 v1 offset:4
+// CHECK: [0x04,0x00,0x4c,0xdb,0x01,0x00,0x00,0x00]
+
+ds_max_src2_f64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x4e,0xdb,0x01,0x00,0x00,0x00]
+
+ds_wrap_rtn_b32 v255, v1, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0xd0,0xd8,0x01,0x02,0x03,0xff]
+
+ds_wrap_rtn_b32 v255, v1, v2, v3 offset:65535 gds
+// CHECK: [0xff,0xff,0xd2,0xd8,0x01,0x02,0x03,0xff]
 
-ds_max_src2_f64 v0 offset:0
-// CHECK: [0x00,0x00,0x4c,0xdb,0x00,0x00,0x00,0x00]
+ds_wrap_rtn_b32 v255, v1, v2, v3
+// CHECK: [0x00,0x00,0xd0,0xd8,0x01,0x02,0x03,0xff]
 
-ds_max_src2_f64 v0 offset:4
-// CHECK: [0x04,0x00,0x4c,0xdb,0x00,0x00,0x00,0x00]
+ds_condxchg32_rtn_b64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xf8,0xd9,0x01,0x02,0x00,0x05]
 
-ds_max_src2_f64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x4e,0xdb,0x00,0x00,0x00,0x00]
+ds_condxchg32_rtn_b64 v[5:6], v1, v[2:3] gds
+// CHECK: [0x00,0x00,0xfa,0xd9,0x01,0x02,0x00,0x05]
+
+ds_condxchg32_rtn_b64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xf8,0xd9,0x01,0xfe,0x00,0x05]
 
 exp mrt0, v0, v0, v0, v0
 // CHECK: [0x0f,0x00,0x00,0xf8,0x00,0x00,0x00,0x00]
@@ -2733,6686 +2759,6686 @@ exp mrt0, off, off, off, off
 exp mrt0, v0, v0, v0, v0 vm
 // CHECK: [0x0f,0x10,0x00,0xf8,0x00,0x00,0x00,0x00]
 
-flat_load_ubyte v0, v[0:1]
-// CHECK: [0x00,0x00,0x20,0xdc,0x00,0x00,0x00,0x00]
+flat_load_ubyte v5, v[1:2]
+// CHECK: [0x00,0x00,0x20,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_ubyte v255, v[0:1]
-// CHECK: [0x00,0x00,0x20,0xdc,0x00,0x00,0x00,0xff]
+flat_load_ubyte v255, v[1:2]
+// CHECK: [0x00,0x00,0x20,0xdc,0x01,0x00,0x00,0xff]
 
-flat_load_ubyte v0, v[254:255]
-// CHECK: [0x00,0x00,0x20,0xdc,0xfe,0x00,0x00,0x00]
+flat_load_ubyte v5, v[254:255]
+// CHECK: [0x00,0x00,0x20,0xdc,0xfe,0x00,0x00,0x05]
 
-flat_load_ubyte v0, v[0:1] glc
-// CHECK: [0x00,0x00,0x21,0xdc,0x00,0x00,0x00,0x00]
+flat_load_ubyte v5, v[1:2] glc
+// CHECK: [0x00,0x00,0x21,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_ubyte v0, v[0:1] slc
-// CHECK: [0x00,0x00,0x22,0xdc,0x00,0x00,0x00,0x00]
+flat_load_ubyte v5, v[1:2] slc
+// CHECK: [0x00,0x00,0x22,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_sbyte v0, v[0:1]
-// CHECK: [0x00,0x00,0x24,0xdc,0x00,0x00,0x00,0x00]
+flat_load_sbyte v5, v[1:2]
+// CHECK: [0x00,0x00,0x24,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_sbyte v255, v[0:1]
-// CHECK: [0x00,0x00,0x24,0xdc,0x00,0x00,0x00,0xff]
+flat_load_sbyte v255, v[1:2]
+// CHECK: [0x00,0x00,0x24,0xdc,0x01,0x00,0x00,0xff]
 
-flat_load_sbyte v0, v[254:255]
-// CHECK: [0x00,0x00,0x24,0xdc,0xfe,0x00,0x00,0x00]
+flat_load_sbyte v5, v[254:255]
+// CHECK: [0x00,0x00,0x24,0xdc,0xfe,0x00,0x00,0x05]
 
-flat_load_sbyte v0, v[0:1] glc
-// CHECK: [0x00,0x00,0x25,0xdc,0x00,0x00,0x00,0x00]
+flat_load_sbyte v5, v[1:2] glc
+// CHECK: [0x00,0x00,0x25,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_sbyte v0, v[0:1] slc
-// CHECK: [0x00,0x00,0x26,0xdc,0x00,0x00,0x00,0x00]
+flat_load_sbyte v5, v[1:2] slc
+// CHECK: [0x00,0x00,0x26,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_ushort v0, v[0:1]
-// CHECK: [0x00,0x00,0x28,0xdc,0x00,0x00,0x00,0x00]
+flat_load_ushort v5, v[1:2]
+// CHECK: [0x00,0x00,0x28,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_ushort v255, v[0:1]
-// CHECK: [0x00,0x00,0x28,0xdc,0x00,0x00,0x00,0xff]
+flat_load_ushort v255, v[1:2]
+// CHECK: [0x00,0x00,0x28,0xdc,0x01,0x00,0x00,0xff]
 
-flat_load_ushort v0, v[254:255]
-// CHECK: [0x00,0x00,0x28,0xdc,0xfe,0x00,0x00,0x00]
+flat_load_ushort v5, v[254:255]
+// CHECK: [0x00,0x00,0x28,0xdc,0xfe,0x00,0x00,0x05]
 
-flat_load_ushort v0, v[0:1] glc
-// CHECK: [0x00,0x00,0x29,0xdc,0x00,0x00,0x00,0x00]
+flat_load_ushort v5, v[1:2] glc
+// CHECK: [0x00,0x00,0x29,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_ushort v0, v[0:1] slc
-// CHECK: [0x00,0x00,0x2a,0xdc,0x00,0x00,0x00,0x00]
+flat_load_ushort v5, v[1:2] slc
+// CHECK: [0x00,0x00,0x2a,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_sshort v0, v[0:1]
-// CHECK: [0x00,0x00,0x2c,0xdc,0x00,0x00,0x00,0x00]
+flat_load_sshort v5, v[1:2]
+// CHECK: [0x00,0x00,0x2c,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_sshort v255, v[0:1]
-// CHECK: [0x00,0x00,0x2c,0xdc,0x00,0x00,0x00,0xff]
+flat_load_sshort v255, v[1:2]
+// CHECK: [0x00,0x00,0x2c,0xdc,0x01,0x00,0x00,0xff]
 
-flat_load_sshort v0, v[254:255]
-// CHECK: [0x00,0x00,0x2c,0xdc,0xfe,0x00,0x00,0x00]
+flat_load_sshort v5, v[254:255]
+// CHECK: [0x00,0x00,0x2c,0xdc,0xfe,0x00,0x00,0x05]
 
-flat_load_sshort v0, v[0:1] glc
-// CHECK: [0x00,0x00,0x2d,0xdc,0x00,0x00,0x00,0x00]
+flat_load_sshort v5, v[1:2] glc
+// CHECK: [0x00,0x00,0x2d,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_sshort v0, v[0:1] slc
-// CHECK: [0x00,0x00,0x2e,0xdc,0x00,0x00,0x00,0x00]
+flat_load_sshort v5, v[1:2] slc
+// CHECK: [0x00,0x00,0x2e,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dword v0, v[0:1]
-// CHECK: [0x00,0x00,0x30,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dword v5, v[1:2]
+// CHECK: [0x00,0x00,0x30,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dword v255, v[0:1]
-// CHECK: [0x00,0x00,0x30,0xdc,0x00,0x00,0x00,0xff]
+flat_load_dword v255, v[1:2]
+// CHECK: [0x00,0x00,0x30,0xdc,0x01,0x00,0x00,0xff]
 
-flat_load_dword v0, v[254:255]
-// CHECK: [0x00,0x00,0x30,0xdc,0xfe,0x00,0x00,0x00]
+flat_load_dword v5, v[254:255]
+// CHECK: [0x00,0x00,0x30,0xdc,0xfe,0x00,0x00,0x05]
 
-flat_load_dword v0, v[0:1] glc
-// CHECK: [0x00,0x00,0x31,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dword v5, v[1:2] glc
+// CHECK: [0x00,0x00,0x31,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dword v0, v[0:1] slc
-// CHECK: [0x00,0x00,0x32,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dword v5, v[1:2] slc
+// CHECK: [0x00,0x00,0x32,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x34,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dwordx2 v[5:6], v[1:2]
+// CHECK: [0x00,0x00,0x34,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x34,0xdc,0x00,0x00,0x00,0xfe]
+flat_load_dwordx2 v[254:255], v[1:2]
+// CHECK: [0x00,0x00,0x34,0xdc,0x01,0x00,0x00,0xfe]
 
-flat_load_dwordx2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x34,0xdc,0xfe,0x00,0x00,0x00]
+flat_load_dwordx2 v[5:6], v[254:255]
+// CHECK: [0x00,0x00,0x34,0xdc,0xfe,0x00,0x00,0x05]
 
-flat_load_dwordx2 v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x35,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dwordx2 v[5:6], v[1:2] glc
+// CHECK: [0x00,0x00,0x35,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x36,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dwordx2 v[5:6], v[1:2] slc
+// CHECK: [0x00,0x00,0x36,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx4 v[0:3], v[0:1]
-// CHECK: [0x00,0x00,0x38,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dwordx4 v[5:8], v[1:2]
+// CHECK: [0x00,0x00,0x38,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx4 v[252:255], v[0:1]
-// CHECK: [0x00,0x00,0x38,0xdc,0x00,0x00,0x00,0xfc]
+flat_load_dwordx4 v[252:255], v[1:2]
+// CHECK: [0x00,0x00,0x38,0xdc,0x01,0x00,0x00,0xfc]
 
-flat_load_dwordx4 v[0:3], v[254:255]
-// CHECK: [0x00,0x00,0x38,0xdc,0xfe,0x00,0x00,0x00]
+flat_load_dwordx4 v[5:8], v[254:255]
+// CHECK: [0x00,0x00,0x38,0xdc,0xfe,0x00,0x00,0x05]
 
-flat_load_dwordx4 v[0:3], v[0:1] glc
-// CHECK: [0x00,0x00,0x39,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dwordx4 v[5:8], v[1:2] glc
+// CHECK: [0x00,0x00,0x39,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx4 v[0:3], v[0:1] slc
-// CHECK: [0x00,0x00,0x3a,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dwordx4 v[5:8], v[1:2] slc
+// CHECK: [0x00,0x00,0x3a,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx3 v[0:2], v[0:1]
-// CHECK: [0x00,0x00,0x3c,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dwordx3 v[5:7], v[1:2]
+// CHECK: [0x00,0x00,0x3c,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx3 v[253:255], v[0:1]
-// CHECK: [0x00,0x00,0x3c,0xdc,0x00,0x00,0x00,0xfd]
+flat_load_dwordx3 v[253:255], v[1:2]
+// CHECK: [0x00,0x00,0x3c,0xdc,0x01,0x00,0x00,0xfd]
 
-flat_load_dwordx3 v[0:2], v[254:255]
-// CHECK: [0x00,0x00,0x3c,0xdc,0xfe,0x00,0x00,0x00]
+flat_load_dwordx3 v[5:7], v[254:255]
+// CHECK: [0x00,0x00,0x3c,0xdc,0xfe,0x00,0x00,0x05]
 
-flat_load_dwordx3 v[0:2], v[0:1] glc
-// CHECK: [0x00,0x00,0x3d,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dwordx3 v[5:7], v[1:2] glc
+// CHECK: [0x00,0x00,0x3d,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx3 v[0:2], v[0:1] slc
-// CHECK: [0x00,0x00,0x3e,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dwordx3 v[5:7], v[1:2] slc
+// CHECK: [0x00,0x00,0x3e,0xdc,0x01,0x00,0x00,0x05]
 
-flat_store_byte v[0:1], v0
-// CHECK: [0x00,0x00,0x60,0xdc,0x00,0x00,0x00,0x00]
+flat_store_byte v[1:2], v2
+// CHECK: [0x00,0x00,0x60,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_byte v[254:255], v0
-// CHECK: [0x00,0x00,0x60,0xdc,0xfe,0x00,0x00,0x00]
+flat_store_byte v[254:255], v2
+// CHECK: [0x00,0x00,0x60,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_store_byte v[0:1], v255
-// CHECK: [0x00,0x00,0x60,0xdc,0x00,0xff,0x00,0x00]
+flat_store_byte v[1:2], v255
+// CHECK: [0x00,0x00,0x60,0xdc,0x01,0xff,0x00,0x00]
 
-flat_store_byte v[0:1], v0 glc
-// CHECK: [0x00,0x00,0x61,0xdc,0x00,0x00,0x00,0x00]
+flat_store_byte v[1:2], v2 glc
+// CHECK: [0x00,0x00,0x61,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_byte v[0:1], v0 slc
-// CHECK: [0x00,0x00,0x62,0xdc,0x00,0x00,0x00,0x00]
+flat_store_byte v[1:2], v2 slc
+// CHECK: [0x00,0x00,0x62,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_short v[0:1], v0
-// CHECK: [0x00,0x00,0x68,0xdc,0x00,0x00,0x00,0x00]
+flat_store_short v[1:2], v2
+// CHECK: [0x00,0x00,0x68,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_short v[254:255], v0
-// CHECK: [0x00,0x00,0x68,0xdc,0xfe,0x00,0x00,0x00]
+flat_store_short v[254:255], v2
+// CHECK: [0x00,0x00,0x68,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_store_short v[0:1], v255
-// CHECK: [0x00,0x00,0x68,0xdc,0x00,0xff,0x00,0x00]
+flat_store_short v[1:2], v255
+// CHECK: [0x00,0x00,0x68,0xdc,0x01,0xff,0x00,0x00]
 
-flat_store_short v[0:1], v0 glc
-// CHECK: [0x00,0x00,0x69,0xdc,0x00,0x00,0x00,0x00]
+flat_store_short v[1:2], v2 glc
+// CHECK: [0x00,0x00,0x69,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_short v[0:1], v0 slc
-// CHECK: [0x00,0x00,0x6a,0xdc,0x00,0x00,0x00,0x00]
+flat_store_short v[1:2], v2 slc
+// CHECK: [0x00,0x00,0x6a,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dword v[0:1], v0
-// CHECK: [0x00,0x00,0x70,0xdc,0x00,0x00,0x00,0x00]
+flat_store_dword v[1:2], v2
+// CHECK: [0x00,0x00,0x70,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dword v[254:255], v0
-// CHECK: [0x00,0x00,0x70,0xdc,0xfe,0x00,0x00,0x00]
+flat_store_dword v[254:255], v2
+// CHECK: [0x00,0x00,0x70,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_store_dword v[0:1], v255
-// CHECK: [0x00,0x00,0x70,0xdc,0x00,0xff,0x00,0x00]
+flat_store_dword v[1:2], v255
+// CHECK: [0x00,0x00,0x70,0xdc,0x01,0xff,0x00,0x00]
 
-flat_store_dword v[0:1], v0 glc
-// CHECK: [0x00,0x00,0x71,0xdc,0x00,0x00,0x00,0x00]
+flat_store_dword v[1:2], v2 glc
+// CHECK: [0x00,0x00,0x71,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dword v[0:1], v0 slc
-// CHECK: [0x00,0x00,0x72,0xdc,0x00,0x00,0x00,0x00]
+flat_store_dword v[1:2], v2 slc
+// CHECK: [0x00,0x00,0x72,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dwordx2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x74,0xdc,0x00,0x00,0x00,0x00]
+flat_store_dwordx2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x74,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dwordx2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x74,0xdc,0xfe,0x00,0x00,0x00]
+flat_store_dwordx2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x74,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_store_dwordx2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x74,0xdc,0x00,0xfe,0x00,0x00]
+flat_store_dwordx2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x74,0xdc,0x01,0xfe,0x00,0x00]
 
-flat_store_dwordx2 v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x75,0xdc,0x00,0x00,0x00,0x00]
+flat_store_dwordx2 v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x75,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dwordx2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x76,0xdc,0x00,0x00,0x00,0x00]
+flat_store_dwordx2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x76,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dwordx4 v[0:1], v[0:3]
-// CHECK: [0x00,0x00,0x78,0xdc,0x00,0x00,0x00,0x00]
+flat_store_dwordx4 v[1:2], v[2:5]
+// CHECK: [0x00,0x00,0x78,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dwordx4 v[254:255], v[0:3]
-// CHECK: [0x00,0x00,0x78,0xdc,0xfe,0x00,0x00,0x00]
+flat_store_dwordx4 v[254:255], v[2:5]
+// CHECK: [0x00,0x00,0x78,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_store_dwordx4 v[0:1], v[252:255]
-// CHECK: [0x00,0x00,0x78,0xdc,0x00,0xfc,0x00,0x00]
+flat_store_dwordx4 v[1:2], v[252:255]
+// CHECK: [0x00,0x00,0x78,0xdc,0x01,0xfc,0x00,0x00]
 
-flat_store_dwordx4 v[0:1], v[0:3] glc
-// CHECK: [0x00,0x00,0x79,0xdc,0x00,0x00,0x00,0x00]
+flat_store_dwordx4 v[1:2], v[2:5] glc
+// CHECK: [0x00,0x00,0x79,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dwordx4 v[0:1], v[0:3] slc
-// CHECK: [0x00,0x00,0x7a,0xdc,0x00,0x00,0x00,0x00]
+flat_store_dwordx4 v[1:2], v[2:5] slc
+// CHECK: [0x00,0x00,0x7a,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dwordx3 v[0:1], v[0:2]
-// CHECK: [0x00,0x00,0x7c,0xdc,0x00,0x00,0x00,0x00]
+flat_store_dwordx3 v[1:2], v[2:4]
+// CHECK: [0x00,0x00,0x7c,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dwordx3 v[254:255], v[0:2]
-// CHECK: [0x00,0x00,0x7c,0xdc,0xfe,0x00,0x00,0x00]
+flat_store_dwordx3 v[254:255], v[2:4]
+// CHECK: [0x00,0x00,0x7c,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_store_dwordx3 v[0:1], v[253:255]
-// CHECK: [0x00,0x00,0x7c,0xdc,0x00,0xfd,0x00,0x00]
+flat_store_dwordx3 v[1:2], v[253:255]
+// CHECK: [0x00,0x00,0x7c,0xdc,0x01,0xfd,0x00,0x00]
 
-flat_store_dwordx3 v[0:1], v[0:2] glc
-// CHECK: [0x00,0x00,0x7d,0xdc,0x00,0x00,0x00,0x00]
+flat_store_dwordx3 v[1:2], v[2:4] glc
+// CHECK: [0x00,0x00,0x7d,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dwordx3 v[0:1], v[0:2] slc
-// CHECK: [0x00,0x00,0x7e,0xdc,0x00,0x00,0x00,0x00]
+flat_store_dwordx3 v[1:2], v[2:4] slc
+// CHECK: [0x00,0x00,0x7e,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_swap v[0:1], v0
-// CHECK: [0x00,0x00,0xc0,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_swap v[1:2], v2
+// CHECK: [0x00,0x00,0xc0,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_swap v[254:255], v0
-// CHECK: [0x00,0x00,0xc0,0xdc,0xfe,0x00,0x00,0x00]
+flat_atomic_swap v[254:255], v2
+// CHECK: [0x00,0x00,0xc0,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_atomic_swap v[0:1], v255
-// CHECK: [0x00,0x00,0xc0,0xdc,0x00,0xff,0x00,0x00]
+flat_atomic_swap v[1:2], v255
+// CHECK: [0x00,0x00,0xc0,0xdc,0x01,0xff,0x00,0x00]
 
-flat_atomic_swap v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0xc1,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_swap v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0xc1,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_swap v[0:1], v0 slc
-// CHECK: [0x00,0x00,0xc2,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_swap v[1:2], v2 slc
+// CHECK: [0x00,0x00,0xc2,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_cmpswap v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc4,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_cmpswap v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0xc4,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_cmpswap v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0xc4,0xdc,0xfe,0x00,0x00,0x00]
+flat_atomic_cmpswap v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0xc4,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_atomic_cmpswap v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xc4,0xdc,0x00,0xfe,0x00,0x00]
+flat_atomic_cmpswap v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0xc4,0xdc,0x01,0xfe,0x00,0x00]
 
-flat_atomic_cmpswap v0, v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0xc5,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_cmpswap v0, v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0xc5,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_cmpswap v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0xc6,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_cmpswap v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0xc6,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_add v[0:1], v0
-// CHECK: [0x00,0x00,0xc8,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_add v[1:2], v2
+// CHECK: [0x00,0x00,0xc8,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_add v[254:255], v0
-// CHECK: [0x00,0x00,0xc8,0xdc,0xfe,0x00,0x00,0x00]
+flat_atomic_add v[254:255], v2
+// CHECK: [0x00,0x00,0xc8,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_atomic_add v[0:1], v255
-// CHECK: [0x00,0x00,0xc8,0xdc,0x00,0xff,0x00,0x00]
+flat_atomic_add v[1:2], v255
+// CHECK: [0x00,0x00,0xc8,0xdc,0x01,0xff,0x00,0x00]
 
-flat_atomic_add v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0xc9,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_add v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0xc9,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_add v[0:1], v0 slc
-// CHECK: [0x00,0x00,0xca,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_add v[1:2], v2 slc
+// CHECK: [0x00,0x00,0xca,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_sub v[0:1], v0
-// CHECK: [0x00,0x00,0xcc,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_sub v[1:2], v2
+// CHECK: [0x00,0x00,0xcc,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_sub v[254:255], v0
-// CHECK: [0x00,0x00,0xcc,0xdc,0xfe,0x00,0x00,0x00]
+flat_atomic_sub v[254:255], v2
+// CHECK: [0x00,0x00,0xcc,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_atomic_sub v[0:1], v255
-// CHECK: [0x00,0x00,0xcc,0xdc,0x00,0xff,0x00,0x00]
+flat_atomic_sub v[1:2], v255
+// CHECK: [0x00,0x00,0xcc,0xdc,0x01,0xff,0x00,0x00]
 
-flat_atomic_sub v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0xcd,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_sub v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0xcd,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_sub v[0:1], v0 slc
-// CHECK: [0x00,0x00,0xce,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_sub v[1:2], v2 slc
+// CHECK: [0x00,0x00,0xce,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_smin v[0:1], v0
-// CHECK: [0x00,0x00,0xd4,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_smin v[1:2], v2
+// CHECK: [0x00,0x00,0xd4,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_smin v[254:255], v0
-// CHECK: [0x00,0x00,0xd4,0xdc,0xfe,0x00,0x00,0x00]
+flat_atomic_smin v[254:255], v2
+// CHECK: [0x00,0x00,0xd4,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_atomic_smin v[0:1], v255
-// CHECK: [0x00,0x00,0xd4,0xdc,0x00,0xff,0x00,0x00]
+flat_atomic_smin v[1:2], v255
+// CHECK: [0x00,0x00,0xd4,0xdc,0x01,0xff,0x00,0x00]
 
-flat_atomic_smin v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0xd5,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_smin v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0xd5,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_smin v[0:1], v0 slc
-// CHECK: [0x00,0x00,0xd6,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_smin v[1:2], v2 slc
+// CHECK: [0x00,0x00,0xd6,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_umin v[0:1], v0
-// CHECK: [0x00,0x00,0xd8,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_umin v[1:2], v2
+// CHECK: [0x00,0x00,0xd8,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_umin v[254:255], v0
-// CHECK: [0x00,0x00,0xd8,0xdc,0xfe,0x00,0x00,0x00]
+flat_atomic_umin v[254:255], v2
+// CHECK: [0x00,0x00,0xd8,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_atomic_umin v[0:1], v255
-// CHECK: [0x00,0x00,0xd8,0xdc,0x00,0xff,0x00,0x00]
+flat_atomic_umin v[1:2], v255
+// CHECK: [0x00,0x00,0xd8,0xdc,0x01,0xff,0x00,0x00]
 
-flat_atomic_umin v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0xd9,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_umin v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0xd9,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_umin v[0:1], v0 slc
-// CHECK: [0x00,0x00,0xda,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_umin v[1:2], v2 slc
+// CHECK: [0x00,0x00,0xda,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_smax v[0:1], v0
-// CHECK: [0x00,0x00,0xdc,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_smax v[1:2], v2
+// CHECK: [0x00,0x00,0xdc,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_smax v[254:255], v0
-// CHECK: [0x00,0x00,0xdc,0xdc,0xfe,0x00,0x00,0x00]
+flat_atomic_smax v[254:255], v2
+// CHECK: [0x00,0x00,0xdc,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_atomic_smax v[0:1], v255
-// CHECK: [0x00,0x00,0xdc,0xdc,0x00,0xff,0x00,0x00]
+flat_atomic_smax v[1:2], v255
+// CHECK: [0x00,0x00,0xdc,0xdc,0x01,0xff,0x00,0x00]
 
-flat_atomic_smax v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0xdd,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_smax v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0xdd,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_smax v[0:1], v0 slc
-// CHECK: [0x00,0x00,0xde,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_smax v[1:2], v2 slc
+// CHECK: [0x00,0x00,0xde,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_umax v[0:1], v0
-// CHECK: [0x00,0x00,0xe0,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_umax v[1:2], v2
+// CHECK: [0x00,0x00,0xe0,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_umax v[254:255], v0
-// CHECK: [0x00,0x00,0xe0,0xdc,0xfe,0x00,0x00,0x00]
+flat_atomic_umax v[254:255], v2
+// CHECK: [0x00,0x00,0xe0,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_atomic_umax v[0:1], v255
-// CHECK: [0x00,0x00,0xe0,0xdc,0x00,0xff,0x00,0x00]
+flat_atomic_umax v[1:2], v255
+// CHECK: [0x00,0x00,0xe0,0xdc,0x01,0xff,0x00,0x00]
 
-flat_atomic_umax v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0xe1,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_umax v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0xe1,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_umax v[0:1], v0 slc
-// CHECK: [0x00,0x00,0xe2,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_umax v[1:2], v2 slc
+// CHECK: [0x00,0x00,0xe2,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_and v[0:1], v0
-// CHECK: [0x00,0x00,0xe4,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_and v[1:2], v2
+// CHECK: [0x00,0x00,0xe4,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_and v[254:255], v0
-// CHECK: [0x00,0x00,0xe4,0xdc,0xfe,0x00,0x00,0x00]
+flat_atomic_and v[254:255], v2
+// CHECK: [0x00,0x00,0xe4,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_atomic_and v[0:1], v255
-// CHECK: [0x00,0x00,0xe4,0xdc,0x00,0xff,0x00,0x00]
+flat_atomic_and v[1:2], v255
+// CHECK: [0x00,0x00,0xe4,0xdc,0x01,0xff,0x00,0x00]
 
-flat_atomic_and v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0xe5,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_and v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0xe5,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_and v[0:1], v0 slc
-// CHECK: [0x00,0x00,0xe6,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_and v[1:2], v2 slc
+// CHECK: [0x00,0x00,0xe6,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_or v[0:1], v0
-// CHECK: [0x00,0x00,0xe8,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_or v[1:2], v2
+// CHECK: [0x00,0x00,0xe8,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_or v[254:255], v0
-// CHECK: [0x00,0x00,0xe8,0xdc,0xfe,0x00,0x00,0x00]
+flat_atomic_or v[254:255], v2
+// CHECK: [0x00,0x00,0xe8,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_atomic_or v[0:1], v255
-// CHECK: [0x00,0x00,0xe8,0xdc,0x00,0xff,0x00,0x00]
+flat_atomic_or v[1:2], v255
+// CHECK: [0x00,0x00,0xe8,0xdc,0x01,0xff,0x00,0x00]
 
-flat_atomic_or v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0xe9,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_or v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0xe9,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_or v[0:1], v0 slc
-// CHECK: [0x00,0x00,0xea,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_or v[1:2], v2 slc
+// CHECK: [0x00,0x00,0xea,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_xor v[0:1], v0
-// CHECK: [0x00,0x00,0xec,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_xor v[1:2], v2
+// CHECK: [0x00,0x00,0xec,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_xor v[254:255], v0
-// CHECK: [0x00,0x00,0xec,0xdc,0xfe,0x00,0x00,0x00]
+flat_atomic_xor v[254:255], v2
+// CHECK: [0x00,0x00,0xec,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_atomic_xor v[0:1], v255
-// CHECK: [0x00,0x00,0xec,0xdc,0x00,0xff,0x00,0x00]
+flat_atomic_xor v[1:2], v255
+// CHECK: [0x00,0x00,0xec,0xdc,0x01,0xff,0x00,0x00]
 
-flat_atomic_xor v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0xed,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_xor v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0xed,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_xor v[0:1], v0 slc
-// CHECK: [0x00,0x00,0xee,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_xor v[1:2], v2 slc
+// CHECK: [0x00,0x00,0xee,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_inc v[0:1], v0
-// CHECK: [0x00,0x00,0xf0,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_inc v[1:2], v2
+// CHECK: [0x00,0x00,0xf0,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_inc v[254:255], v0
-// CHECK: [0x00,0x00,0xf0,0xdc,0xfe,0x00,0x00,0x00]
+flat_atomic_inc v[254:255], v2
+// CHECK: [0x00,0x00,0xf0,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_atomic_inc v[0:1], v255
-// CHECK: [0x00,0x00,0xf0,0xdc,0x00,0xff,0x00,0x00]
+flat_atomic_inc v[1:2], v255
+// CHECK: [0x00,0x00,0xf0,0xdc,0x01,0xff,0x00,0x00]
 
-flat_atomic_inc v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0xf1,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_inc v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0xf1,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_inc v[0:1], v0 slc
-// CHECK: [0x00,0x00,0xf2,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_inc v[1:2], v2 slc
+// CHECK: [0x00,0x00,0xf2,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_dec v[0:1], v0
-// CHECK: [0x00,0x00,0xf4,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_dec v[1:2], v2
+// CHECK: [0x00,0x00,0xf4,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_dec v[254:255], v0
-// CHECK: [0x00,0x00,0xf4,0xdc,0xfe,0x00,0x00,0x00]
+flat_atomic_dec v[254:255], v2
+// CHECK: [0x00,0x00,0xf4,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_atomic_dec v[0:1], v255
-// CHECK: [0x00,0x00,0xf4,0xdc,0x00,0xff,0x00,0x00]
+flat_atomic_dec v[1:2], v255
+// CHECK: [0x00,0x00,0xf4,0xdc,0x01,0xff,0x00,0x00]
 
-flat_atomic_dec v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0xf5,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_dec v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0xf5,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_dec v[0:1], v0 slc
-// CHECK: [0x00,0x00,0xf6,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_dec v[1:2], v2 slc
+// CHECK: [0x00,0x00,0xf6,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_fcmpswap v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf8,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_fcmpswap v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0xf8,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_fcmpswap v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0xf8,0xdc,0xfe,0x00,0x00,0x00]
+flat_atomic_fcmpswap v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0xf8,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_atomic_fcmpswap v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xf8,0xdc,0x00,0xfe,0x00,0x00]
+flat_atomic_fcmpswap v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0xf8,0xdc,0x01,0xfe,0x00,0x00]
 
-flat_atomic_fcmpswap v0, v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0xf9,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_fcmpswap v0, v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0xf9,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_fcmpswap v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0xfa,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_fcmpswap v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0xfa,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_fmin v[0:1], v0
-// CHECK: [0x00,0x00,0xfc,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_fmin v[1:2], v2
+// CHECK: [0x00,0x00,0xfc,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_fmin v[254:255], v0
-// CHECK: [0x00,0x00,0xfc,0xdc,0xfe,0x00,0x00,0x00]
+flat_atomic_fmin v[254:255], v2
+// CHECK: [0x00,0x00,0xfc,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_atomic_fmin v[0:1], v255
-// CHECK: [0x00,0x00,0xfc,0xdc,0x00,0xff,0x00,0x00]
+flat_atomic_fmin v[1:2], v255
+// CHECK: [0x00,0x00,0xfc,0xdc,0x01,0xff,0x00,0x00]
 
-flat_atomic_fmin v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0xfd,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_fmin v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0xfd,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_fmin v[0:1], v0 slc
-// CHECK: [0x00,0x00,0xfe,0xdc,0x00,0x00,0x00,0x00]
+flat_atomic_fmin v[1:2], v2 slc
+// CHECK: [0x00,0x00,0xfe,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_fmax v[0:1], v0
-// CHECK: [0x00,0x00,0x00,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_fmax v[1:2], v2
+// CHECK: [0x00,0x00,0x00,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_fmax v[254:255], v0
-// CHECK: [0x00,0x00,0x00,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_fmax v[254:255], v2
+// CHECK: [0x00,0x00,0x00,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_fmax v[0:1], v255
-// CHECK: [0x00,0x00,0x00,0xdd,0x00,0xff,0x00,0x00]
+flat_atomic_fmax v[1:2], v255
+// CHECK: [0x00,0x00,0x00,0xdd,0x01,0xff,0x00,0x00]
 
-flat_atomic_fmax v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0x01,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_fmax v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0x01,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_fmax v[0:1], v0 slc
-// CHECK: [0x00,0x00,0x02,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_fmax v[1:2], v2 slc
+// CHECK: [0x00,0x00,0x02,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_swap_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x40,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_swap_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x40,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_swap_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x40,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_swap_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x40,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_swap_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x40,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_swap_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x40,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_swap_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x41,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_swap_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x41,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_swap_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x42,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_swap_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x42,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_cmpswap_x2 v[0:1], v[0:3]
-// CHECK: [0x00,0x00,0x44,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_cmpswap_x2 v[1:2], v[2:5]
+// CHECK: [0x00,0x00,0x44,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_cmpswap_x2 v[254:255], v[0:3]
-// CHECK: [0x00,0x00,0x44,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_cmpswap_x2 v[254:255], v[2:5]
+// CHECK: [0x00,0x00,0x44,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_cmpswap_x2 v[0:1], v[252:255]
-// CHECK: [0x00,0x00,0x44,0xdd,0x00,0xfc,0x00,0x00]
+flat_atomic_cmpswap_x2 v[1:2], v[252:255]
+// CHECK: [0x00,0x00,0x44,0xdd,0x01,0xfc,0x00,0x00]
 
-flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[0:3] glc
-// CHECK: [0x00,0x00,0x45,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_cmpswap_x2 v[0:1], v[1:2], v[2:5] glc
+// CHECK: [0x00,0x00,0x45,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_cmpswap_x2 v[0:1], v[0:3] slc
-// CHECK: [0x00,0x00,0x46,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_cmpswap_x2 v[1:2], v[2:5] slc
+// CHECK: [0x00,0x00,0x46,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_add_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x48,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_add_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x48,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_add_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x48,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_add_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x48,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_add_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x48,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_add_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x48,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_add_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x49,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_add_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x49,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_add_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x4a,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_add_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x4a,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_sub_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x4c,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_sub_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x4c,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_sub_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x4c,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_sub_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x4c,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_sub_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x4c,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_sub_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x4c,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_sub_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x4d,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_sub_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x4d,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_sub_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x4e,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_sub_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x4e,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_smin_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x54,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_smin_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x54,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_smin_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x54,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_smin_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x54,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_smin_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x54,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_smin_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x54,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_smin_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x55,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_smin_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x55,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_smin_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x56,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_smin_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x56,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_umin_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x58,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_umin_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x58,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_umin_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x58,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_umin_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x58,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_umin_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x58,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_umin_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x58,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_umin_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x59,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_umin_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x59,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_umin_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x5a,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_umin_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x5a,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_smax_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x5c,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_smax_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x5c,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_smax_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x5c,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_smax_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x5c,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_smax_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x5c,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_smax_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x5c,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_smax_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x5d,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_smax_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x5d,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_smax_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x5e,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_smax_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x5e,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_umax_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x60,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_umax_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x60,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_umax_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x60,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_umax_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x60,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_umax_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x60,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_umax_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x60,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_umax_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x61,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_umax_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x61,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_umax_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x62,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_umax_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x62,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_and_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x64,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_and_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x64,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_and_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x64,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_and_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x64,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_and_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x64,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_and_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x64,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_and_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x65,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_and_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x65,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_and_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x66,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_and_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x66,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_or_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x68,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_or_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x68,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_or_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x68,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_or_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x68,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_or_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x68,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_or_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x68,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_or_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x69,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_or_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x69,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_or_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x6a,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_or_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x6a,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_xor_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x6c,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_xor_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x6c,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_xor_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x6c,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_xor_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x6c,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_xor_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x6c,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_xor_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x6c,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_xor_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x6d,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_xor_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x6d,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_xor_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x6e,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_xor_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x6e,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_inc_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x70,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_inc_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x70,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_inc_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x70,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_inc_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x70,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_inc_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x70,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_inc_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x70,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_inc_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x71,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_inc_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x71,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_inc_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x72,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_inc_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x72,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_dec_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x74,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_dec_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x74,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_dec_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x74,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_dec_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x74,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_dec_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x74,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_dec_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x74,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_dec_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x75,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_dec_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x75,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_dec_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x76,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_dec_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x76,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_fcmpswap_x2 v[0:1], v[0:3]
-// CHECK: [0x00,0x00,0x78,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_fcmpswap_x2 v[1:2], v[2:5]
+// CHECK: [0x00,0x00,0x78,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_fcmpswap_x2 v[254:255], v[0:3]
-// CHECK: [0x00,0x00,0x78,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_fcmpswap_x2 v[254:255], v[2:5]
+// CHECK: [0x00,0x00,0x78,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_fcmpswap_x2 v[0:1], v[252:255]
-// CHECK: [0x00,0x00,0x78,0xdd,0x00,0xfc,0x00,0x00]
+flat_atomic_fcmpswap_x2 v[1:2], v[252:255]
+// CHECK: [0x00,0x00,0x78,0xdd,0x01,0xfc,0x00,0x00]
 
-flat_atomic_fcmpswap_x2 v[0:1], v[0:1], v[0:3] glc
-// CHECK: [0x00,0x00,0x79,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_fcmpswap_x2 v[0:1], v[1:2], v[2:5] glc
+// CHECK: [0x00,0x00,0x79,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_fcmpswap_x2 v[0:1], v[0:3] slc
-// CHECK: [0x00,0x00,0x7a,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_fcmpswap_x2 v[1:2], v[2:5] slc
+// CHECK: [0x00,0x00,0x7a,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_fmin_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x7c,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_fmin_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x7c,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_fmin_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x7c,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_fmin_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x7c,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_fmin_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x7c,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_fmin_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x7c,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_fmin_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x7d,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_fmin_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x7d,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_fmin_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x7e,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_fmin_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x7e,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_fmax_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x80,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_fmax_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x80,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_fmax_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x80,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_fmax_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x80,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_fmax_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x80,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_fmax_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x80,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_fmax_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x81,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_fmax_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x81,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_fmax_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x82,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_fmax_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x82,0xdd,0x01,0x02,0x00,0x00]
 
-image_load v0, v[0:3], s[0:7] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf0,0x00,0x00,0x00,0x00]
+image_load v5, v[1:4], s[8:15] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load v252, v[0:3], s[0:7] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf0,0x00,0xfc,0x00,0x00]
+image_load v252, v[1:4], s[8:15] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf0,0x01,0xfc,0x02,0x00]
 
-image_load v0, v[252:255], s[0:7] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf0,0xfc,0x00,0x00,0x00]
+image_load v5, v[252:255], s[8:15] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf0,0xfc,0x05,0x02,0x00]
 
-image_load v0, v[0:3], s[4:11] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf0,0x00,0x00,0x01,0x00]
+image_load v5, v[1:4], s[12:19] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf0,0x01,0x05,0x03,0x00]
 
-image_load v0, v[0:3], s[96:103] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf0,0x00,0x00,0x18,0x00]
+image_load v5, v[1:4], s[96:103] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf0,0x01,0x05,0x18,0x00]
 
-image_load v0, v[0:3], s[0:7] dmask:0x2
-// CHECK: [0x00,0x02,0x00,0xf0,0x00,0x00,0x00,0x00]
+image_load v5, v[1:4], s[8:15] dmask:0x2
+// CHECK: [0x00,0x02,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load v[0:1], v[0:3], s[0:7] dmask:0x3
-// CHECK: [0x00,0x03,0x00,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:6], v[1:4], s[8:15] dmask:0x3
+// CHECK: [0x00,0x03,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load v0, v[0:3], s[0:7] dmask:0x4
-// CHECK: [0x00,0x04,0x00,0xf0,0x00,0x00,0x00,0x00]
+image_load v5, v[1:4], s[8:15] dmask:0x4
+// CHECK: [0x00,0x04,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load v[0:1], v[0:3], s[0:7] dmask:0x5
-// CHECK: [0x00,0x05,0x00,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:6], v[1:4], s[8:15] dmask:0x5
+// CHECK: [0x00,0x05,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load v[0:1], v[0:3], s[0:7] dmask:0x6
-// CHECK: [0x00,0x06,0x00,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:6], v[1:4], s[8:15] dmask:0x6
+// CHECK: [0x00,0x06,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load v[0:2], v[0:3], s[0:7] dmask:0x7
-// CHECK: [0x00,0x07,0x00,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:7], v[1:4], s[8:15] dmask:0x7
+// CHECK: [0x00,0x07,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load v0, v[0:3], s[0:7] dmask:0x8
-// CHECK: [0x00,0x08,0x00,0xf0,0x00,0x00,0x00,0x00]
+image_load v5, v[1:4], s[8:15] dmask:0x8
+// CHECK: [0x00,0x08,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load v[0:1], v[0:3], s[0:7] dmask:0x9
-// CHECK: [0x00,0x09,0x00,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:6], v[1:4], s[8:15] dmask:0x9
+// CHECK: [0x00,0x09,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load v[0:1], v[0:3], s[0:7] dmask:0xa
-// CHECK: [0x00,0x0a,0x00,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:6], v[1:4], s[8:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load v[0:2], v[0:3], s[0:7] dmask:0xb
-// CHECK: [0x00,0x0b,0x00,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:7], v[1:4], s[8:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load v[0:1], v[0:3], s[0:7] dmask:0xc
-// CHECK: [0x00,0x0c,0x00,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:6], v[1:4], s[8:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load v[0:2], v[0:3], s[0:7] dmask:0xd
-// CHECK: [0x00,0x0d,0x00,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:7], v[1:4], s[8:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load v[0:2], v[0:3], s[0:7] dmask:0xe
-// CHECK: [0x00,0x0e,0x00,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:7], v[1:4], s[8:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load v[0:3], v[0:3], s[0:7] dmask:0xf
-// CHECK: [0x00,0x0f,0x00,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:8], v[1:4], s[8:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load v0, v[0:3], s[0:7] dmask:0x0
-// CHECK: [0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0x00]
+image_load v5, v[1:4], s[8:15] dmask:0x0
+// CHECK: [0x00,0x00,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load v0, v[0:3], s[0:7] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x00,0xf0,0x00,0x00,0x00,0x00]
+image_load v5, v[1:4], s[8:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v0, v[0:3], s[0:7] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v5, v[1:4], s[8:15] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v252, v[0:3], s[0:7] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf0,0x00,0xfc,0x00,0x00]
+image_load_mip v252, v[1:4], s[8:15] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf0,0x01,0xfc,0x02,0x00]
 
-image_load_mip v0, v[252:255], s[0:7] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf0,0xfc,0x00,0x00,0x00]
+image_load_mip v5, v[252:255], s[8:15] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf0,0xfc,0x05,0x02,0x00]
 
-image_load_mip v0, v[0:3], s[4:11] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf0,0x00,0x00,0x01,0x00]
+image_load_mip v5, v[1:4], s[12:19] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf0,0x01,0x05,0x03,0x00]
 
-image_load_mip v0, v[0:3], s[96:103] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf0,0x00,0x00,0x18,0x00]
+image_load_mip v5, v[1:4], s[96:103] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf0,0x01,0x05,0x18,0x00]
 
-image_load_mip v0, v[0:3], s[0:7] dmask:0x2
-// CHECK: [0x00,0x02,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v5, v[1:4], s[8:15] dmask:0x2
+// CHECK: [0x00,0x02,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:1], v[0:3], s[0:7] dmask:0x3
-// CHECK: [0x00,0x03,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:6], v[1:4], s[8:15] dmask:0x3
+// CHECK: [0x00,0x03,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v0, v[0:3], s[0:7] dmask:0x4
-// CHECK: [0x00,0x04,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v5, v[1:4], s[8:15] dmask:0x4
+// CHECK: [0x00,0x04,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:1], v[0:3], s[0:7] dmask:0x5
-// CHECK: [0x00,0x05,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:6], v[1:4], s[8:15] dmask:0x5
+// CHECK: [0x00,0x05,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:1], v[0:3], s[0:7] dmask:0x6
-// CHECK: [0x00,0x06,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:6], v[1:4], s[8:15] dmask:0x6
+// CHECK: [0x00,0x06,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:2], v[0:3], s[0:7] dmask:0x7
-// CHECK: [0x00,0x07,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:7], v[1:4], s[8:15] dmask:0x7
+// CHECK: [0x00,0x07,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v0, v[0:3], s[0:7] dmask:0x8
-// CHECK: [0x00,0x08,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v5, v[1:4], s[8:15] dmask:0x8
+// CHECK: [0x00,0x08,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:1], v[0:3], s[0:7] dmask:0x9
-// CHECK: [0x00,0x09,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:6], v[1:4], s[8:15] dmask:0x9
+// CHECK: [0x00,0x09,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:1], v[0:3], s[0:7] dmask:0xa
-// CHECK: [0x00,0x0a,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:6], v[1:4], s[8:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:2], v[0:3], s[0:7] dmask:0xb
-// CHECK: [0x00,0x0b,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:7], v[1:4], s[8:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:1], v[0:3], s[0:7] dmask:0xc
-// CHECK: [0x00,0x0c,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:6], v[1:4], s[8:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:2], v[0:3], s[0:7] dmask:0xd
-// CHECK: [0x00,0x0d,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:7], v[1:4], s[8:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:2], v[0:3], s[0:7] dmask:0xe
-// CHECK: [0x00,0x0e,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:7], v[1:4], s[8:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf
-// CHECK: [0x00,0x0f,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:8], v[1:4], s[8:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v0, v[0:3], s[0:7] dmask:0x0
-// CHECK: [0x00,0x00,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v5, v[1:4], s[8:15] dmask:0x0
+// CHECK: [0x00,0x00,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v0, v[0:3], s[0:7] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v5, v[1:4], s[8:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_store v0, v[0:3], s[0:7] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_store v1, v[2:5], s[12:19] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store v252, v[0:3], s[0:7] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x20,0xf0,0x00,0xfc,0x00,0x00]
+image_store v252, v[2:5], s[12:19] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x20,0xf0,0x02,0xfc,0x03,0x00]
 
-image_store v0, v[252:255], s[0:7] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x20,0xf0,0xfc,0x00,0x00,0x00]
+image_store v1, v[252:255], s[12:19] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x20,0xf0,0xfc,0x01,0x03,0x00]
 
-image_store v0, v[0:3], s[4:11] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x20,0xf0,0x00,0x00,0x01,0x00]
+image_store v1, v[2:5], s[16:23] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x20,0xf0,0x02,0x01,0x04,0x00]
 
-image_store v0, v[0:3], s[96:103] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x20,0xf0,0x00,0x00,0x18,0x00]
+image_store v1, v[2:5], s[96:103] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x20,0xf0,0x02,0x01,0x18,0x00]
 
-image_store v0, v[0:3], s[0:7] dmask:0x2 unorm
-// CHECK: [0x00,0x12,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_store v1, v[2:5], s[12:19] dmask:0x2 unorm
+// CHECK: [0x00,0x12,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store v[0:1], v[0:3], s[0:7] dmask:0x3 unorm
-// CHECK: [0x00,0x13,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:2], v[2:5], s[12:19] dmask:0x3 unorm
+// CHECK: [0x00,0x13,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store v0, v[0:3], s[0:7] dmask:0x4 unorm
-// CHECK: [0x00,0x14,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_store v1, v[2:5], s[12:19] dmask:0x4 unorm
+// CHECK: [0x00,0x14,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store v[0:1], v[0:3], s[0:7] dmask:0x5 unorm
-// CHECK: [0x00,0x15,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:2], v[2:5], s[12:19] dmask:0x5 unorm
+// CHECK: [0x00,0x15,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store v[0:1], v[0:3], s[0:7] dmask:0x6 unorm
-// CHECK: [0x00,0x16,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:2], v[2:5], s[12:19] dmask:0x6 unorm
+// CHECK: [0x00,0x16,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store v[0:2], v[0:3], s[0:7] dmask:0x7 unorm
-// CHECK: [0x00,0x17,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:3], v[2:5], s[12:19] dmask:0x7 unorm
+// CHECK: [0x00,0x17,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store v0, v[0:3], s[0:7] dmask:0x8 unorm
-// CHECK: [0x00,0x18,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_store v1, v[2:5], s[12:19] dmask:0x8 unorm
+// CHECK: [0x00,0x18,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store v[0:1], v[0:3], s[0:7] dmask:0x9 unorm
-// CHECK: [0x00,0x19,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:2], v[2:5], s[12:19] dmask:0x9 unorm
+// CHECK: [0x00,0x19,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store v[0:1], v[0:3], s[0:7] dmask:0xa unorm
-// CHECK: [0x00,0x1a,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:2], v[2:5], s[12:19] dmask:0xa unorm
+// CHECK: [0x00,0x1a,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store v[0:2], v[0:3], s[0:7] dmask:0xb unorm
-// CHECK: [0x00,0x1b,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:3], v[2:5], s[12:19] dmask:0xb unorm
+// CHECK: [0x00,0x1b,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store v[0:1], v[0:3], s[0:7] dmask:0xc unorm
-// CHECK: [0x00,0x1c,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:2], v[2:5], s[12:19] dmask:0xc unorm
+// CHECK: [0x00,0x1c,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store v[0:2], v[0:3], s[0:7] dmask:0xd unorm
-// CHECK: [0x00,0x1d,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:3], v[2:5], s[12:19] dmask:0xd unorm
+// CHECK: [0x00,0x1d,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store v[0:2], v[0:3], s[0:7] dmask:0xe unorm
-// CHECK: [0x00,0x1e,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:3], v[2:5], s[12:19] dmask:0xe unorm
+// CHECK: [0x00,0x1e,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store v[0:3], v[0:3], s[0:7] dmask:0xf unorm
-// CHECK: [0x00,0x1f,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:4], v[2:5], s[12:19] dmask:0xf unorm
+// CHECK: [0x00,0x1f,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store v0, v[0:3], s[0:7] dmask:0x0 unorm
-// CHECK: [0x00,0x10,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_store v1, v[2:5], s[12:19] dmask:0x0 unorm
+// CHECK: [0x00,0x10,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store v0, v[0:3], s[0:7] dmask:0x1 unorm glc
-// CHECK: [0x00,0x31,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_store v1, v[2:5], s[12:19] dmask:0x1 unorm glc
+// CHECK: [0x00,0x31,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v0, v[0:3], s[0:7] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v1, v[2:5], s[12:19] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v252, v[0:3], s[0:7] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x24,0xf0,0x00,0xfc,0x00,0x00]
+image_store_mip v252, v[2:5], s[12:19] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x24,0xf0,0x02,0xfc,0x03,0x00]
 
-image_store_mip v0, v[252:255], s[0:7] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x24,0xf0,0xfc,0x00,0x00,0x00]
+image_store_mip v1, v[252:255], s[12:19] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x24,0xf0,0xfc,0x01,0x03,0x00]
 
-image_store_mip v0, v[0:3], s[4:11] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x24,0xf0,0x00,0x00,0x01,0x00]
+image_store_mip v1, v[2:5], s[16:23] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x24,0xf0,0x02,0x01,0x04,0x00]
 
-image_store_mip v0, v[0:3], s[96:103] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x24,0xf0,0x00,0x00,0x18,0x00]
+image_store_mip v1, v[2:5], s[96:103] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x24,0xf0,0x02,0x01,0x18,0x00]
 
-image_store_mip v0, v[0:3], s[0:7] dmask:0x2 unorm
-// CHECK: [0x00,0x12,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v1, v[2:5], s[12:19] dmask:0x2 unorm
+// CHECK: [0x00,0x12,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:1], v[0:3], s[0:7] dmask:0x3 unorm
-// CHECK: [0x00,0x13,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:2], v[2:5], s[12:19] dmask:0x3 unorm
+// CHECK: [0x00,0x13,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v0, v[0:3], s[0:7] dmask:0x4 unorm
-// CHECK: [0x00,0x14,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v1, v[2:5], s[12:19] dmask:0x4 unorm
+// CHECK: [0x00,0x14,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:1], v[0:3], s[0:7] dmask:0x5 unorm
-// CHECK: [0x00,0x15,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:2], v[2:5], s[12:19] dmask:0x5 unorm
+// CHECK: [0x00,0x15,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:1], v[0:3], s[0:7] dmask:0x6 unorm
-// CHECK: [0x00,0x16,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:2], v[2:5], s[12:19] dmask:0x6 unorm
+// CHECK: [0x00,0x16,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:2], v[0:3], s[0:7] dmask:0x7 unorm
-// CHECK: [0x00,0x17,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:3], v[2:5], s[12:19] dmask:0x7 unorm
+// CHECK: [0x00,0x17,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v0, v[0:3], s[0:7] dmask:0x8 unorm
-// CHECK: [0x00,0x18,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v1, v[2:5], s[12:19] dmask:0x8 unorm
+// CHECK: [0x00,0x18,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:1], v[0:3], s[0:7] dmask:0x9 unorm
-// CHECK: [0x00,0x19,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:2], v[2:5], s[12:19] dmask:0x9 unorm
+// CHECK: [0x00,0x19,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:1], v[0:3], s[0:7] dmask:0xa unorm
-// CHECK: [0x00,0x1a,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:2], v[2:5], s[12:19] dmask:0xa unorm
+// CHECK: [0x00,0x1a,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:2], v[0:3], s[0:7] dmask:0xb unorm
-// CHECK: [0x00,0x1b,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:3], v[2:5], s[12:19] dmask:0xb unorm
+// CHECK: [0x00,0x1b,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:1], v[0:3], s[0:7] dmask:0xc unorm
-// CHECK: [0x00,0x1c,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:2], v[2:5], s[12:19] dmask:0xc unorm
+// CHECK: [0x00,0x1c,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:2], v[0:3], s[0:7] dmask:0xd unorm
-// CHECK: [0x00,0x1d,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:3], v[2:5], s[12:19] dmask:0xd unorm
+// CHECK: [0x00,0x1d,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:2], v[0:3], s[0:7] dmask:0xe unorm
-// CHECK: [0x00,0x1e,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:3], v[2:5], s[12:19] dmask:0xe unorm
+// CHECK: [0x00,0x1e,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm
-// CHECK: [0x00,0x1f,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:4], v[2:5], s[12:19] dmask:0xf unorm
+// CHECK: [0x00,0x1f,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v0, v[0:3], s[0:7] dmask:0x0 unorm
-// CHECK: [0x00,0x10,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v1, v[2:5], s[12:19] dmask:0x0 unorm
+// CHECK: [0x00,0x10,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v0, v[0:3], s[0:7] dmask:0x1 unorm glc
-// CHECK: [0x00,0x31,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v1, v[2:5], s[12:19] dmask:0x1 unorm glc
+// CHECK: [0x00,0x31,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_get_resinfo v0, v[0:3], s[0:7] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v5, v[1:4], s[8:15] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_get_resinfo v252, v[0:3], s[0:7] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf0,0x00,0xfc,0x00,0x00]
+image_get_resinfo v252, v[1:4], s[8:15] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf0,0x01,0xfc,0x02,0x00]
 
-image_get_resinfo v0, v[252:255], s[0:7] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf0,0xfc,0x00,0x00,0x00]
+image_get_resinfo v5, v[252:255], s[8:15] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf0,0xfc,0x05,0x02,0x00]
 
-image_get_resinfo v0, v[0:3], s[4:11] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf0,0x00,0x00,0x01,0x00]
+image_get_resinfo v5, v[1:4], s[12:19] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf0,0x01,0x05,0x03,0x00]
 
-image_get_resinfo v0, v[0:3], s[96:103] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf0,0x00,0x00,0x18,0x00]
+image_get_resinfo v5, v[1:4], s[96:103] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf0,0x01,0x05,0x18,0x00]
 
-image_get_resinfo v0, v[0:3], s[0:7] dmask:0x2
-// CHECK: [0x00,0x02,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v5, v[1:4], s[8:15] dmask:0x2
+// CHECK: [0x00,0x02,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_get_resinfo v[0:1], v[0:3], s[0:7] dmask:0x3
-// CHECK: [0x00,0x03,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:6], v[1:4], s[8:15] dmask:0x3
+// CHECK: [0x00,0x03,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_get_resinfo v0, v[0:3], s[0:7] dmask:0x4
-// CHECK: [0x00,0x04,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v5, v[1:4], s[8:15] dmask:0x4
+// CHECK: [0x00,0x04,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_get_resinfo v[0:1], v[0:3], s[0:7] dmask:0x5
-// CHECK: [0x00,0x05,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:6], v[1:4], s[8:15] dmask:0x5
+// CHECK: [0x00,0x05,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_get_resinfo v[0:1], v[0:3], s[0:7] dmask:0x6
-// CHECK: [0x00,0x06,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:6], v[1:4], s[8:15] dmask:0x6
+// CHECK: [0x00,0x06,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_get_resinfo v[0:2], v[0:3], s[0:7] dmask:0x7
-// CHECK: [0x00,0x07,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:7], v[1:4], s[8:15] dmask:0x7
+// CHECK: [0x00,0x07,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_get_resinfo v0, v[0:3], s[0:7] dmask:0x8
-// CHECK: [0x00,0x08,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v5, v[1:4], s[8:15] dmask:0x8
+// CHECK: [0x00,0x08,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_get_resinfo v[0:1], v[0:3], s[0:7] dmask:0x9
-// CHECK: [0x00,0x09,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:6], v[1:4], s[8:15] dmask:0x9
+// CHECK: [0x00,0x09,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_get_resinfo v[0:1], v[0:3], s[0:7] dmask:0xa
-// CHECK: [0x00,0x0a,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:6], v[1:4], s[8:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_get_resinfo v[0:2], v[0:3], s[0:7] dmask:0xb
-// CHECK: [0x00,0x0b,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:7], v[1:4], s[8:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_get_resinfo v[0:1], v[0:3], s[0:7] dmask:0xc
-// CHECK: [0x00,0x0c,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:6], v[1:4], s[8:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_get_resinfo v[0:2], v[0:3], s[0:7] dmask:0xd
-// CHECK: [0x00,0x0d,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:7], v[1:4], s[8:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_get_resinfo v[0:2], v[0:3], s[0:7] dmask:0xe
-// CHECK: [0x00,0x0e,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:7], v[1:4], s[8:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_get_resinfo v[0:3], v[0:3], s[0:7] dmask:0xf
-// CHECK: [0x00,0x0f,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:8], v[1:4], s[8:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_get_resinfo v0, v[0:3], s[0:7] dmask:0x0
-// CHECK: [0x00,0x00,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v5, v[1:4], s[8:15] dmask:0x0
+// CHECK: [0x00,0x00,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_get_resinfo v0, v[0:3], s[0:7] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v5, v[1:4], s[8:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_sample v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_sample v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf0,0x00,0xfc,0x00,0x00]
+image_sample v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf0,0xfc,0x00,0x00,0x00]
+image_sample v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf0,0x00,0x00,0x01,0x00]
+image_sample v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample v0, v[0:3], s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf0,0x00,0x00,0x18,0x00]
+image_sample v5, v[1:4], s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf0,0x01,0x05,0x78,0x00]
 
-image_sample v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf0,0x00,0x00,0x20,0x00]
+image_sample v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample v0, v[0:3], s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf0,0x00,0x00,0x20,0x03]
+image_sample v5, v[1:4], s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf0,0x01,0x05,0x22,0x03]
 
-image_sample v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf0,0x00,0x00,0xc0,0x03]
+image_sample v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_sample v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_sample v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_sample v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_sample v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_sample v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x84,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_cl v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x84,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_cl v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x84,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_cl v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x84,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_cl v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x84,0xf0,0x00,0x00,0x01,0x00]
+image_sample_cl v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x84,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_cl v0, v[0:3], s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x84,0xf0,0x00,0x00,0x18,0x00]
+image_sample_cl v5, v[1:4], s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x84,0xf0,0x01,0x05,0x78,0x00]
 
-image_sample_cl v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x84,0xf0,0x00,0x00,0x20,0x00]
+image_sample_cl v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x84,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_cl v0, v[0:3], s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x84,0xf0,0x00,0x00,0x20,0x03]
+image_sample_cl v5, v[1:4], s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x84,0xf0,0x01,0x05,0x22,0x03]
 
-image_sample_cl v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x84,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_cl v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x84,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x90,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_l v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x90,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_l v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x90,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_l v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x90,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_l v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x90,0xf0,0x00,0x00,0x01,0x00]
+image_sample_l v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x90,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_l v0, v[0:3], s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x90,0xf0,0x00,0x00,0x18,0x00]
+image_sample_l v5, v[1:4], s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x90,0xf0,0x01,0x05,0x78,0x00]
 
-image_sample_l v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x90,0xf0,0x00,0x00,0x20,0x00]
+image_sample_l v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x90,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_l v0, v[0:3], s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x90,0xf0,0x00,0x00,0x20,0x03]
+image_sample_l v5, v[1:4], s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x90,0xf0,0x01,0x05,0x22,0x03]
 
-image_sample_l v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x90,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_l v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x90,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_l v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x94,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_b v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x94,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_b v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x94,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_b v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x94,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_b v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x94,0xf0,0x00,0x00,0x01,0x00]
+image_sample_b v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x94,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_b v0, v[0:3], s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x94,0xf0,0x00,0x00,0x18,0x00]
+image_sample_b v5, v[1:4], s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x94,0xf0,0x01,0x05,0x78,0x00]
 
-image_sample_b v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x94,0xf0,0x00,0x00,0x20,0x00]
+image_sample_b v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x94,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_b v0, v[0:3], s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x94,0xf0,0x00,0x00,0x20,0x03]
+image_sample_b v5, v[1:4], s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x94,0xf0,0x01,0x05,0x22,0x03]
 
-image_sample_b v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x94,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_b v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x94,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_b v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x98,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_b_cl v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x98,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_b_cl v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x98,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_b_cl v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x98,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_b_cl v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x98,0xf0,0x00,0x00,0x01,0x00]
+image_sample_b_cl v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x98,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_b_cl v0, v[0:3], s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x98,0xf0,0x00,0x00,0x18,0x00]
+image_sample_b_cl v5, v[1:4], s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x98,0xf0,0x01,0x05,0x78,0x00]
 
-image_sample_b_cl v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x98,0xf0,0x00,0x00,0x20,0x00]
+image_sample_b_cl v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x98,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_b_cl v0, v[0:3], s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x98,0xf0,0x00,0x00,0x20,0x03]
+image_sample_b_cl v5, v[1:4], s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x98,0xf0,0x01,0x05,0x22,0x03]
 
-image_sample_b_cl v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x98,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_b_cl v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x98,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x9c,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_lz v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x9c,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_lz v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x9c,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_lz v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x9c,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_lz v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x9c,0xf0,0x00,0x00,0x01,0x00]
+image_sample_lz v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x9c,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_lz v0, v[0:3], s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x9c,0xf0,0x00,0x00,0x18,0x00]
+image_sample_lz v5, v[1:4], s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x9c,0xf0,0x01,0x05,0x78,0x00]
 
-image_sample_lz v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x9c,0xf0,0x00,0x00,0x20,0x00]
+image_sample_lz v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x9c,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_lz v0, v[0:3], s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x9c,0xf0,0x00,0x00,0x20,0x03]
+image_sample_lz v5, v[1:4], s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x9c,0xf0,0x01,0x05,0x22,0x03]
 
-image_sample_lz v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x9c,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_lz v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x9c,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa0,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_c v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa0,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_c v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa0,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_c v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa0,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_c v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa0,0xf0,0x00,0x00,0x01,0x00]
+image_sample_c v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa0,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_c v0, v[0:3], s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa0,0xf0,0x00,0x00,0x18,0x00]
+image_sample_c v5, v[1:4], s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa0,0xf0,0x01,0x05,0x78,0x00]
 
-image_sample_c v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0xa0,0xf0,0x00,0x00,0x20,0x00]
+image_sample_c v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0xa0,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_c v0, v[0:3], s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0xa0,0xf0,0x00,0x00,0x20,0x03]
+image_sample_c v5, v[1:4], s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0xa0,0xf0,0x01,0x05,0x22,0x03]
 
-image_sample_c v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0xa0,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_c v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0xa0,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_c v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa4,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_c_cl v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa4,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_c_cl v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa4,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_c_cl v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa4,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_c_cl v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa4,0xf0,0x00,0x00,0x01,0x00]
+image_sample_c_cl v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa4,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_c_cl v0, v[0:3], s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa4,0xf0,0x00,0x00,0x18,0x00]
+image_sample_c_cl v5, v[1:4], s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa4,0xf0,0x01,0x05,0x78,0x00]
 
-image_sample_c_cl v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0xa4,0xf0,0x00,0x00,0x20,0x00]
+image_sample_c_cl v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0xa4,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_c_cl v0, v[0:3], s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0xa4,0xf0,0x00,0x00,0x20,0x03]
+image_sample_c_cl v5, v[1:4], s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0xa4,0xf0,0x01,0x05,0x22,0x03]
 
-image_sample_c_cl v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0xa4,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_c_cl v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0xa4,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_c_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa8,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_c_d v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa8,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_c_d v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa8,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_c_d v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa8,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_c_d v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa8,0xf0,0x00,0x00,0x01,0x00]
+image_sample_c_d v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_c_d v0, v[0:3], s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa8,0xf0,0x00,0x00,0x18,0x00]
+image_sample_c_d v5, v[1:4], s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x78,0x00]
 
-image_sample_c_d v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0xa8,0xf0,0x00,0x00,0x20,0x00]
+image_sample_c_d v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_c_d v0, v[0:3], s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0xa8,0xf0,0x00,0x00,0x20,0x03]
+image_sample_c_d v5, v[1:4], s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0x22,0x03]
 
-image_sample_c_d v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0xa8,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_c_d v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0xa8,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_c_d v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0xa8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0xa8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_d v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0xa8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0xa8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_d v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0xa8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_d v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0xa8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_d v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0xa8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0xa8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_d v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0xa8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_d v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0xa8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_d v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0xa8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_d v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0xa8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_d v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0xa8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_d v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0xa8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_d v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0xa8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_d v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0xa8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_d v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0xa8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb0,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_c_l v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb0,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_c_l v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb0,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_c_l v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb0,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_c_l v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb0,0xf0,0x00,0x00,0x01,0x00]
+image_sample_c_l v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb0,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_c_l v0, v[0:3], s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb0,0xf0,0x00,0x00,0x18,0x00]
+image_sample_c_l v5, v[1:4], s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb0,0xf0,0x01,0x05,0x78,0x00]
 
-image_sample_c_l v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0xb0,0xf0,0x00,0x00,0x20,0x00]
+image_sample_c_l v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0xb0,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_c_l v0, v[0:3], s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0xb0,0xf0,0x00,0x00,0x20,0x03]
+image_sample_c_l v5, v[1:4], s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0xb0,0xf0,0x01,0x05,0x22,0x03]
 
-image_sample_c_l v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0xb0,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_c_l v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0xb0,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_c_l v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb4,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_c_b v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb4,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_c_b v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb4,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_c_b v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb4,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_c_b v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb4,0xf0,0x00,0x00,0x01,0x00]
+image_sample_c_b v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb4,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_c_b v0, v[0:3], s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb4,0xf0,0x00,0x00,0x18,0x00]
+image_sample_c_b v5, v[1:4], s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb4,0xf0,0x01,0x05,0x78,0x00]
 
-image_sample_c_b v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0xb4,0xf0,0x00,0x00,0x20,0x00]
+image_sample_c_b v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0xb4,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_c_b v0, v[0:3], s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0xb4,0xf0,0x00,0x00,0x20,0x03]
+image_sample_c_b v5, v[1:4], s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0xb4,0xf0,0x01,0x05,0x22,0x03]
 
-image_sample_c_b v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0xb4,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_c_b v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0xb4,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_c_b v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb8,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_c_b_cl v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb8,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_c_b_cl v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb8,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_c_b_cl v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb8,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_c_b_cl v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb8,0xf0,0x00,0x00,0x01,0x00]
+image_sample_c_b_cl v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb8,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_c_b_cl v0, v[0:3], s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb8,0xf0,0x00,0x00,0x18,0x00]
+image_sample_c_b_cl v5, v[1:4], s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb8,0xf0,0x01,0x05,0x78,0x00]
 
-image_sample_c_b_cl v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0xb8,0xf0,0x00,0x00,0x20,0x00]
+image_sample_c_b_cl v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0xb8,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_c_b_cl v0, v[0:3], s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0xb8,0xf0,0x00,0x00,0x20,0x03]
+image_sample_c_b_cl v5, v[1:4], s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0xb8,0xf0,0x01,0x05,0x22,0x03]
 
-image_sample_c_b_cl v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0xb8,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_c_b_cl v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0xb8,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_c_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xbc,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_c_lz v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xbc,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_c_lz v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xbc,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_c_lz v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xbc,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_c_lz v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xbc,0xf0,0x00,0x00,0x01,0x00]
+image_sample_c_lz v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xbc,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_c_lz v0, v[0:3], s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xbc,0xf0,0x00,0x00,0x18,0x00]
+image_sample_c_lz v5, v[1:4], s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xbc,0xf0,0x01,0x05,0x78,0x00]
 
-image_sample_c_lz v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0xbc,0xf0,0x00,0x00,0x20,0x00]
+image_sample_c_lz v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0xbc,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_c_lz v0, v[0:3], s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0xbc,0xf0,0x00,0x00,0x20,0x03]
+image_sample_c_lz v5, v[1:4], s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0xbc,0xf0,0x01,0x05,0x22,0x03]
 
-image_sample_c_lz v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0xbc,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_c_lz v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0xbc,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_c_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4 v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4 v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf1,0xff,0x00,0x00,0x00]
+image_gather4 v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4 v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf1,0x00,0x00,0x01,0x00]
+image_gather4 v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4 v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf1,0x00,0x00,0x18,0x00]
+image_gather4 v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4 v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf1,0x00,0x00,0x20,0x00]
+image_gather4 v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4 v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf1,0x00,0x00,0x20,0x03]
+image_gather4 v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4 v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4 v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x00,0xf3,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x00,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x01,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x01,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x02,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x02,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_cl v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_cl v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_cl v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_cl v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_cl v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_cl v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_cl v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_cl v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_cl v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_cl v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x04,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x04,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x05,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x05,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x06,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x06,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x10,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_l v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x10,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_l v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x10,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_l v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x10,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x10,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_l v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x10,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_l v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x10,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_l v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x10,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x10,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_l v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x10,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x10,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_l v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x10,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_l v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x10,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_l v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x10,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x10,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x10,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x11,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x11,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x12,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x12,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x14,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_b v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x14,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_b v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x14,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_b v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x14,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x14,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_b v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x14,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_b v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x14,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_b v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x14,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x14,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_b v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x14,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x14,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_b v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x14,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_b v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x14,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_b v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x14,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x14,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x14,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x15,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x15,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x16,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x16,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x18,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_b_cl v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x18,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_b_cl v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x18,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_b_cl v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x18,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x18,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_b_cl v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x18,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_b_cl v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x18,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_b_cl v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x18,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x18,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x18,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x18,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_b_cl v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x18,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_b_cl v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x18,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_b_cl v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x18,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x18,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x18,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x19,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x19,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x1a,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x1a,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x1c,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_lz v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x1c,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_lz v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x1c,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_lz v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x1c,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x1c,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_lz v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x1c,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_lz v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x1c,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_lz v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x1c,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x1c,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_lz v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x1c,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x1c,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_lz v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x1c,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_lz v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x1c,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_lz v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x1c,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x1c,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x1c,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x1d,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x1d,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x1e,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x1e,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x20,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x20,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x20,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x20,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x20,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x20,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x20,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_c v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x20,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x20,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x20,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x20,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_c v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x20,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_c v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x20,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x20,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x20,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x20,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x21,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x21,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x22,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x22,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x24,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_cl v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x24,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_cl v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x24,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_cl v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x24,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x24,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_cl v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x24,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_cl v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x24,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_c_cl v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x24,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x24,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x24,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x24,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_c_cl v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x24,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_c_cl v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x24,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_cl v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x24,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x24,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x24,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x25,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x25,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x26,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x26,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x30,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_l v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x30,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_l v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x30,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_l v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x30,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x30,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_l v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x30,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_l v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x30,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_c_l v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x30,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x30,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x30,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x30,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_c_l v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x30,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_c_l v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x30,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_l v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x30,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x30,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x30,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x31,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x31,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x32,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x32,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x34,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_b v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x34,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_b v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x34,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_b v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x34,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x34,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_b v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x34,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_b v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x34,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_c_b v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x34,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x34,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x34,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x34,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_c_b v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x34,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_c_b v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x34,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_b v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x34,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x34,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x34,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x35,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x35,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x36,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x36,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_b_cl v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_b_cl v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_b_cl v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_b_cl v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_c_b_cl v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_c_b_cl v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_c_b_cl v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_b_cl v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x38,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x38,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x39,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x39,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x3a,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x3a,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x3c,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_lz v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x3c,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_lz v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x3c,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_lz v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x3c,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x3c,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_lz v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x3c,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_lz v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x3c,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_c_lz v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x3c,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x3c,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x3c,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x3c,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_c_lz v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x3c,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_c_lz v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x3c,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_lz v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x3c,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x3c,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x3c,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x3d,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x3d,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x3e,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x3e,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x40,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x40,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x40,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x40,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x40,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x40,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_o v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x40,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_o v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x40,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x40,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x40,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x40,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_o v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x40,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x40,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x40,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x40,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x40,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x41,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x41,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x42,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x42,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x44,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_cl_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x44,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_cl_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x44,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_cl_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x44,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x44,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_cl_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x44,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_cl_o v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x44,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_cl_o v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x44,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x44,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x44,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x44,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_cl_o v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x44,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_cl_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x44,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_cl_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x44,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x44,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x44,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x45,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x45,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x46,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x46,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x50,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_l_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x50,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_l_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x50,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_l_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x50,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x50,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_l_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x50,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_l_o v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x50,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_l_o v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x50,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x50,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x50,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x50,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_l_o v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x50,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_l_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x50,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_l_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x50,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x50,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x50,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x51,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x51,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x52,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x52,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x54,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_b_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x54,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_b_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x54,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_b_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x54,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x54,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_b_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x54,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_b_o v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x54,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_b_o v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x54,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x54,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x54,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x54,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_b_o v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x54,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_b_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x54,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_b_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x54,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x54,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x54,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x55,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x55,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x56,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x56,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x58,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_b_cl_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x58,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_b_cl_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x58,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x58,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x58,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_b_cl_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x58,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_b_cl_o v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x58,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_b_cl_o v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x58,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x58,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x58,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x58,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_b_cl_o v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x58,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_b_cl_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x58,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_b_cl_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x58,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x58,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x58,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x59,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x59,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x5a,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x5a,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x5c,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_lz_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x5c,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_lz_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x5c,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_lz_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x5c,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x5c,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_lz_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x5c,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_lz_o v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x5c,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_lz_o v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x5c,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x5c,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x5c,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x5c,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_lz_o v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x5c,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_lz_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x5c,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_lz_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x5c,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x5c,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x5c,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x5d,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x5d,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x5e,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x5e,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x60,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x60,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x60,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x60,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x60,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x60,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_o v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x60,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_c_o v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x60,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x60,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x60,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x60,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_c_o v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x60,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_c_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x60,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x60,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x60,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x60,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x61,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x61,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x62,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x62,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x64,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_cl_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x64,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_cl_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x64,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x64,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x64,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_cl_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x64,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_cl_o v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x64,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_c_cl_o v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x64,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x64,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x64,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x64,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_c_cl_o v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x64,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_c_cl_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x64,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_cl_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x64,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x64,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x64,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x65,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x65,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x66,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x66,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x70,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_l_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x70,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_l_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x70,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x70,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x70,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_l_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x70,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_l_o v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x70,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_c_l_o v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x70,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x70,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x70,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x70,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_c_l_o v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x70,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_c_l_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x70,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_l_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x70,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x70,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x70,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x71,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x71,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x72,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x72,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x74,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_b_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x74,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_b_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x74,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x74,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x74,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_b_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x74,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_b_o v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x74,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_c_b_o v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x74,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x74,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x74,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x74,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_c_b_o v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x74,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_c_b_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x74,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_b_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x74,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x74,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x74,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x75,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x75,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x76,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x76,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x78,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_b_cl_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x78,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x78,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x78,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x78,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_b_cl_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x78,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x78,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_c_b_cl_o v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x78,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x78,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x78,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x78,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x78,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x78,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_b_cl_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x78,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x78,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x78,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x79,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x79,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x7a,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x7a,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x7c,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_lz_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x7c,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_lz_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x7c,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x7c,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x7c,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_lz_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x7c,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_lz_o v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x7c,0xf1,0x00,0x00,0x18,0x00]
+image_gather4_c_lz_o v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x7c,0xf1,0x01,0x05,0x78,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x7c,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x7c,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x7c,0xf1,0x00,0x00,0x20,0x03]
+image_gather4_c_lz_o v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x7c,0xf1,0x01,0x05,0x22,0x03]
 
-image_gather4_c_lz_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x7c,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_lz_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x7c,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x7c,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x7c,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x7d,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x7d,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x7e,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x7e,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_get_lod v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf1,0x00,0xfc,0x00,0x00]
+image_get_lod v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf1,0x01,0xfc,0x62,0x00]
 
-image_get_lod v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf1,0xff,0x00,0x00,0x00]
+image_get_lod v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf1,0xff,0x05,0x62,0x00]
 
-image_get_lod v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf1,0x00,0x00,0x01,0x00]
+image_get_lod v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf1,0x01,0x05,0x63,0x00]
 
-image_get_lod v0, v0, s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf1,0x00,0x00,0x18,0x00]
+image_get_lod v5, v1, s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf1,0x01,0x05,0x78,0x00]
 
-image_get_lod v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf1,0x00,0x00,0x20,0x00]
+image_get_lod v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf1,0x01,0x05,0x82,0x00]
 
-image_get_lod v0, v0, s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf1,0x00,0x00,0x20,0x03]
+image_get_lod v5, v1, s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf1,0x01,0x05,0x22,0x03]
 
-image_get_lod v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf1,0x00,0x00,0xc0,0x03]
+image_get_lod v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf1,0x01,0x05,0xc2,0x03]
 
-image_get_lod v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_get_lod v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_get_lod v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_get_lod v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_get_lod v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_get_lod v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_get_lod v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_get_lod v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_get_lod v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_get_lod v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_get_lod v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_get_lod v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_get_lod v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_get_lod v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:3], v0, s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_get_lod v[5:8], v1, s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_get_lod v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_get_lod v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa8,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa8,0xf1,0x00,0xfc,0x00,0x00]
+image_sample_c_cd v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa8,0xf1,0x01,0xfc,0x62,0x00]
 
-image_sample_c_cd v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa8,0xf1,0xfc,0x00,0x00,0x00]
+image_sample_c_cd v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa8,0xf1,0xfc,0x05,0x62,0x00]
 
-image_sample_c_cd v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa8,0xf1,0x00,0x00,0x01,0x00]
+image_sample_c_cd v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x63,0x00]
 
-image_sample_c_cd v0, v[0:3], s[96:103], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa8,0xf1,0x00,0x00,0x18,0x00]
+image_sample_c_cd v5, v[1:4], s[96:103], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x78,0x00]
 
-image_sample_c_cd v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0xa8,0xf1,0x00,0x00,0x20,0x00]
+image_sample_c_cd v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x82,0x00]
 
-image_sample_c_cd v0, v[0:3], s[0:7], s[100:103] dmask:0x1
-// CHECK: [0x00,0x01,0xa8,0xf1,0x00,0x00,0x20,0x03]
+image_sample_c_cd v5, v[1:4], s[8:15], s[100:103] dmask:0x1
+// CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0x22,0x03]
 
-image_sample_c_cd v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0xa8,0xf1,0x00,0x00,0xc0,0x03]
+image_sample_c_cd v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0xa8,0xf1,0x01,0x05,0xc2,0x03]
 
-image_sample_c_cd v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0xa8,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0xa8,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_cd v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0xa8,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0xa8,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_cd v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0xa8,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_cd v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0xa8,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_cd v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0xa8,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0xa8,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_cd v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0xa8,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_cd v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0xa8,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_cd v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0xa8,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_cd v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0xa8,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_cd v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0xa8,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_cd v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0xa8,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_cd v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0xa8,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-image_sample_c_cd v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0xa8,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_cd v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0xa8,0xf1,0x01,0x05,0x62,0x00]
 
-buffer_load_format_x v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_x v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_x v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0xff,0x00,0x00]
+buffer_load_format_x v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0xff,0x02,0x03]
 
-buffer_load_format_x v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_format_x v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_format_x v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x19,0x00]
+buffer_load_format_x v5, off, s[100:103], s3 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x19,0x03]
 
-buffer_load_format_x v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_format_x v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_format_x v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x00,0x67]
+buffer_load_format_x v5, off, s[8:11], s103 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x02,0x67]
 
-buffer_load_format_x v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_format_x v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_format_x v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_format_x v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_format_x v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_format_x v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_format_x v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_format_x v5, off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_format_x v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_format_x v5, off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_format_x v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x00,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_x v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x00,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_x v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x00,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_x v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x00,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_x v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x00,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_x v5, v[0:1], s[8:11], s3 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x00,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_x v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x00,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_x v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x00,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_x v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x00,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_x v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x00,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_x v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x00,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_x v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x00,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_x v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x00,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_x v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x00,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_x v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x40,0x00]
+buffer_load_format_x v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x42,0x03]
 
-buffer_load_format_xy v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xy v[5:6], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xy v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0xfe,0x00,0x00]
+buffer_load_format_xy v[254:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0xfe,0x02,0x03]
 
-buffer_load_format_xy v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_format_xy v[5:6], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_format_xy v[0:1], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x19,0x00]
+buffer_load_format_xy v[5:6], off, s[100:103], s3 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x19,0x03]
 
-buffer_load_format_xy v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_format_xy v[5:6], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_format_xy v[0:1], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x00,0x67]
+buffer_load_format_xy v[5:6], off, s[8:11], s103 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x02,0x67]
 
-buffer_load_format_xy v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_format_xy v[5:6], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_format_xy v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_format_xy v[5:6], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_format_xy v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_format_xy v[5:6], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_format_xy v[0:1], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_format_xy v[5:6], off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_format_xy v[0:1], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_format_xy v[5:6], off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_format_xy v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x04,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xy v[5:6], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x04,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xy v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x04,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xy v[5:6], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x04,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xy v[0:1], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x04,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xy v[5:6], v[0:1], s[8:11], s3 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x04,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xy v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x04,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xy v[5:6], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x04,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xy v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x04,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xy v[5:6], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x04,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xy v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x04,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xy v[5:6], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x04,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xy v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x04,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xy v[5:6], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x04,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xy v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x40,0x00]
+buffer_load_format_xy v[5:6], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x42,0x03]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyz v[5:7], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyz v[253:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0xfd,0x00,0x00]
+buffer_load_format_xyz v[253:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0xfd,0x02,0x03]
 
-buffer_load_format_xyz v[0:2], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_format_xyz v[5:7], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_format_xyz v[0:2], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x19,0x00]
+buffer_load_format_xyz v[5:7], off, s[100:103], s3 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x19,0x03]
 
-buffer_load_format_xyz v[0:2], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_format_xyz v[5:7], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x00,0x67]
+buffer_load_format_xyz v[5:7], off, s[8:11], s103 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x02,0x67]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_format_xyz v[5:7], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_format_xyz v[5:7], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_format_xyz v[5:7], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_format_xyz v[5:7], off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_format_xyz v[5:7], off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_format_xyz v[0:2], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x08,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyz v[5:7], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x08,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyz v[0:2], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x08,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyz v[5:7], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x08,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyz v[0:2], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x08,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyz v[5:7], v[0:1], s[8:11], s3 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x08,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x08,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyz v[5:7], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x08,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x08,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyz v[5:7], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x08,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x08,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyz v[5:7], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x08,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x08,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyz v[5:7], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x08,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x40,0x00]
+buffer_load_format_xyz v[5:7], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x42,0x03]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyzw v[5:8], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyzw v[252:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0xfc,0x00,0x00]
+buffer_load_format_xyzw v[252:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0xfc,0x02,0x03]
 
-buffer_load_format_xyzw v[0:3], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_format_xyzw v[5:8], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_format_xyzw v[0:3], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x19,0x00]
+buffer_load_format_xyzw v[5:8], off, s[100:103], s3 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x19,0x03]
 
-buffer_load_format_xyzw v[0:3], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_format_xyzw v[5:8], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x00,0x67]
+buffer_load_format_xyzw v[5:8], off, s[8:11], s103 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x02,0x67]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_format_xyzw v[5:8], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_format_xyzw v[5:8], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_format_xyzw v[5:8], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_format_xyzw v[5:8], off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_format_xyzw v[5:8], off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_format_xyzw v[0:3], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x0c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyzw v[5:8], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x0c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyzw v[0:3], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x0c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyzw v[5:8], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x0c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x0c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyzw v[5:8], v[0:1], s[8:11], s3 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x0c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x0c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyzw v[5:8], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x0c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x0c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyzw v[5:8], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x0c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x0c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyzw v[5:8], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x0c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x0c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyzw v[5:8], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x0c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x40,0x00]
+buffer_load_format_xyzw v[5:8], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x42,0x03]
 
-buffer_store_format_x v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_x v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_x v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0xff,0x00,0x00]
+buffer_store_format_x v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_store_format_x v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x01,0x00]
+buffer_store_format_x v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_store_format_x v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x19,0x00]
+buffer_store_format_x v1, off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_store_format_x v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x1e,0x00]
+buffer_store_format_x v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_store_format_x v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x00,0x67]
+buffer_store_format_x v1, off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_store_format_x v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x00,0x7c]
+buffer_store_format_x v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_store_format_x v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x00,0x80]
+buffer_store_format_x v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_store_format_x v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x00,0xc1]
+buffer_store_format_x v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_store_format_x v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x00,0xf0]
+buffer_store_format_x v1, off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0xf0]
 
-buffer_store_format_x v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x00,0xf7]
+buffer_store_format_x v1, off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0xf7]
 
-buffer_store_format_x v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x10,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_x v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x10,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_x v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x10,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_x v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x10,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_x v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x10,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_x v1, v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x10,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_x v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x10,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_x v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0x10,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_x v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x10,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_x v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x10,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_x v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x10,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_x v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x10,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_x v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x10,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_x v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x10,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_x v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x40,0x00]
+buffer_store_format_x v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_store_format_xy v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xy v[1:2], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xy v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0xfe,0x00,0x00]
+buffer_store_format_xy v[254:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0xfe,0x03,0x04]
 
-buffer_store_format_xy v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x01,0x00]
+buffer_store_format_xy v[1:2], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_store_format_xy v[0:1], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x19,0x00]
+buffer_store_format_xy v[1:2], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_store_format_xy v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x1e,0x00]
+buffer_store_format_xy v[1:2], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_store_format_xy v[0:1], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x00,0x67]
+buffer_store_format_xy v[1:2], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_store_format_xy v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x00,0x7c]
+buffer_store_format_xy v[1:2], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_store_format_xy v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x00,0x80]
+buffer_store_format_xy v[1:2], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_store_format_xy v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x00,0xc1]
+buffer_store_format_xy v[1:2], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_store_format_xy v[0:1], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x00,0xf0]
+buffer_store_format_xy v[1:2], off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x03,0xf0]
 
-buffer_store_format_xy v[0:1], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x00,0xf7]
+buffer_store_format_xy v[1:2], off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x03,0xf7]
 
-buffer_store_format_xy v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x14,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xy v[1:2], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x14,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xy v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x14,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xy v[1:2], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x14,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xy v[0:1], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x14,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xy v[1:2], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x14,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xy v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x14,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xy v[1:2], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x14,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xy v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x14,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xy v[1:2], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x14,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xy v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x14,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xy v[1:2], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x14,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xy v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x14,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xy v[1:2], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x14,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xy v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x40,0x00]
+buffer_store_format_xy v[1:2], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyz v[1:3], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyz v[253:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0xfd,0x00,0x00]
+buffer_store_format_xyz v[253:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0xfd,0x03,0x04]
 
-buffer_store_format_xyz v[0:2], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x01,0x00]
+buffer_store_format_xyz v[1:3], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_store_format_xyz v[0:2], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x19,0x00]
+buffer_store_format_xyz v[1:3], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_store_format_xyz v[0:2], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x1e,0x00]
+buffer_store_format_xyz v[1:3], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x00,0x67]
+buffer_store_format_xyz v[1:3], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x00,0x7c]
+buffer_store_format_xyz v[1:3], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x00,0x80]
+buffer_store_format_xyz v[1:3], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x00,0xc1]
+buffer_store_format_xyz v[1:3], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x00,0xf0]
+buffer_store_format_xyz v[1:3], off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x03,0xf0]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x00,0xf7]
+buffer_store_format_xyz v[1:3], off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x03,0xf7]
 
-buffer_store_format_xyz v[0:2], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x18,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyz v[1:3], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x18,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyz v[0:2], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x18,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyz v[1:3], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x18,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyz v[0:2], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x18,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyz v[1:3], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x18,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x18,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyz v[1:3], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x18,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x18,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyz v[1:3], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x18,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x18,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyz v[1:3], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x18,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x18,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyz v[1:3], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x18,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x40,0x00]
+buffer_store_format_xyz v[1:3], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyzw v[1:4], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyzw v[252:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0xfc,0x00,0x00]
+buffer_store_format_xyzw v[252:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0xfc,0x03,0x04]
 
-buffer_store_format_xyzw v[0:3], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x01,0x00]
+buffer_store_format_xyzw v[1:4], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_store_format_xyzw v[0:3], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x19,0x00]
+buffer_store_format_xyzw v[1:4], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_store_format_xyzw v[0:3], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x1e,0x00]
+buffer_store_format_xyzw v[1:4], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x00,0x67]
+buffer_store_format_xyzw v[1:4], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x00,0x7c]
+buffer_store_format_xyzw v[1:4], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x00,0x80]
+buffer_store_format_xyzw v[1:4], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x00,0xc1]
+buffer_store_format_xyzw v[1:4], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x00,0xf0]
+buffer_store_format_xyzw v[1:4], off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x03,0xf0]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x00,0xf7]
+buffer_store_format_xyzw v[1:4], off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x03,0xf7]
 
-buffer_store_format_xyzw v[0:3], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x1c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyzw v[1:4], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x1c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyzw v[0:3], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x1c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyzw v[1:4], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x1c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyzw v[0:3], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x1c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyzw v[1:4], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x1c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x1c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyzw v[1:4], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x1c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x1c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyzw v[1:4], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x1c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x1c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyzw v[1:4], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x1c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x1c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyzw v[1:4], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x1c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x40,0x00]
+buffer_store_format_xyzw v[1:4], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_load_ubyte v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ubyte v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ubyte v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0xff,0x00,0x00]
+buffer_load_ubyte v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0xff,0x02,0x03]
 
-buffer_load_ubyte v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_ubyte v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_ubyte v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x00,0x19,0x00]
+buffer_load_ubyte v5, off, s[100:103], s3 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x19,0x03]
 
-buffer_load_ubyte v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_ubyte v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_ubyte v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x00,0x00,0x67]
+buffer_load_ubyte v5, off, s[8:11], s103 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0x67]
 
-buffer_load_ubyte v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_ubyte v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_ubyte v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_ubyte v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_ubyte v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_ubyte v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_ubyte v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_ubyte v5, off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_ubyte v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_ubyte v5, off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_ubyte v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x20,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ubyte v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x20,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ubyte v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x20,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ubyte v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x20,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ubyte v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x20,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ubyte v5, v[0:1], s[8:11], s3 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x20,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ubyte v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x20,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ubyte v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x20,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ubyte v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x20,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ubyte v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x20,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ubyte v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x20,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ubyte v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x20,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ubyte v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x20,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ubyte v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x20,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ubyte v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x00,0x40,0x00]
+buffer_load_ubyte v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x20,0xe0,0x00,0x05,0x42,0x03]
 
-buffer_load_sbyte v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sbyte v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sbyte v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0xff,0x00,0x00]
+buffer_load_sbyte v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0xff,0x02,0x03]
 
-buffer_load_sbyte v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_sbyte v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_sbyte v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x00,0x19,0x00]
+buffer_load_sbyte v5, off, s[100:103], s3 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x19,0x03]
 
-buffer_load_sbyte v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_sbyte v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_sbyte v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x00,0x00,0x67]
+buffer_load_sbyte v5, off, s[8:11], s103 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0x67]
 
-buffer_load_sbyte v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_sbyte v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_sbyte v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_sbyte v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_sbyte v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_sbyte v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_sbyte v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_sbyte v5, off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_sbyte v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_sbyte v5, off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_sbyte v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x24,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sbyte v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x24,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sbyte v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x24,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sbyte v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x24,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sbyte v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x24,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sbyte v5, v[0:1], s[8:11], s3 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x24,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sbyte v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x24,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sbyte v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x24,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sbyte v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x24,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sbyte v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x24,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sbyte v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x24,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sbyte v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x24,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sbyte v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x24,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sbyte v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x24,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sbyte v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x00,0x40,0x00]
+buffer_load_sbyte v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x24,0xe0,0x00,0x05,0x42,0x03]
 
-buffer_load_ushort v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ushort v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ushort v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0xff,0x00,0x00]
+buffer_load_ushort v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0xff,0x02,0x03]
 
-buffer_load_ushort v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_ushort v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_ushort v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x00,0x19,0x00]
+buffer_load_ushort v5, off, s[100:103], s3 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x19,0x03]
 
-buffer_load_ushort v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_ushort v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_ushort v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x00,0x00,0x67]
+buffer_load_ushort v5, off, s[8:11], s103 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0x67]
 
-buffer_load_ushort v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_ushort v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_ushort v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_ushort v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_ushort v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_ushort v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_ushort v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_ushort v5, off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_ushort v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_ushort v5, off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_ushort v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x28,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ushort v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x28,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ushort v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x28,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ushort v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x28,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ushort v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x28,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ushort v5, v[0:1], s[8:11], s3 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x28,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ushort v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x28,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ushort v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x28,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ushort v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x28,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ushort v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x28,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ushort v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x28,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ushort v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x28,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ushort v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x28,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ushort v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x28,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ushort v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x00,0x40,0x00]
+buffer_load_ushort v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x28,0xe0,0x00,0x05,0x42,0x03]
 
-buffer_load_sshort v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sshort v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sshort v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0xff,0x00,0x00]
+buffer_load_sshort v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0xff,0x02,0x03]
 
-buffer_load_sshort v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_sshort v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_sshort v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x00,0x19,0x00]
+buffer_load_sshort v5, off, s[100:103], s3 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x19,0x03]
 
-buffer_load_sshort v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_sshort v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_sshort v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x00,0x00,0x67]
+buffer_load_sshort v5, off, s[8:11], s103 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0x67]
 
-buffer_load_sshort v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_sshort v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_sshort v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_sshort v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_sshort v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_sshort v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_sshort v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_sshort v5, off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_sshort v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_sshort v5, off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_sshort v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x2c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sshort v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x2c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sshort v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x2c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sshort v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x2c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sshort v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x2c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sshort v5, v[0:1], s[8:11], s3 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x2c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sshort v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x2c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sshort v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x2c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sshort v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x2c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sshort v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x2c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sshort v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x2c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sshort v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x2c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sshort v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x2c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sshort v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x2c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sshort v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x00,0x40,0x00]
+buffer_load_sshort v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x2c,0xe0,0x00,0x05,0x42,0x03]
 
-buffer_load_dword v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dword v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dword v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0xff,0x00,0x00]
+buffer_load_dword v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0xff,0x02,0x03]
 
-buffer_load_dword v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_dword v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_dword v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x00,0x19,0x00]
+buffer_load_dword v5, off, s[100:103], s3 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x05,0x19,0x03]
 
-buffer_load_dword v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_dword v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_dword v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x00,0x00,0x67]
+buffer_load_dword v5, off, s[8:11], s103 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x05,0x02,0x67]
 
-buffer_load_dword v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_dword v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_dword v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_dword v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_dword v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_dword v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_dword v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_dword v5, off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_dword v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_dword v5, off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_dword v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x30,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dword v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x30,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dword v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x30,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dword v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x30,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dword v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x30,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dword v5, v[0:1], s[8:11], s3 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x30,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dword v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x30,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dword v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x30,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dword v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x30,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dword v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x30,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dword v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x30,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dword v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x30,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dword v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x30,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dword v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x30,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dword v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x00,0x40,0x00]
+buffer_load_dword v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x30,0xe0,0x00,0x05,0x42,0x03]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0xfe,0x00,0x00]
+buffer_load_dwordx2 v[254:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0xfe,0x02,0x03]
 
-buffer_load_dwordx2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_dwordx2 v[5:6], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_dwordx2 v[0:1], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x00,0x19,0x00]
+buffer_load_dwordx2 v[5:6], off, s[100:103], s3 offset:4095
+// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x05,0x19,0x03]
 
-buffer_load_dwordx2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_dwordx2 v[5:6], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x00,0x00,0x67]
+buffer_load_dwordx2 v[5:6], off, s[8:11], s103 offset:4095
+// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x05,0x02,0x67]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_dwordx2 v[5:6], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_dwordx2 v[5:6], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_dwordx2 v[5:6], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_dwordx2 v[5:6], off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_dwordx2 v[5:6], off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_dwordx2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x34,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx2 v[5:6], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x34,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x34,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx2 v[5:6], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x34,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx2 v[0:1], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x34,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx2 v[5:6], v[0:1], s[8:11], s3 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x34,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x34,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx2 v[5:6], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x34,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x34,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x34,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x34,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x34,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x34,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x34,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x00,0x40,0x00]
+buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x34,0xe0,0x00,0x05,0x42,0x03]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx4 v[252:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0xfc,0x00,0x00]
+buffer_load_dwordx4 v[252:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0xfc,0x02,0x03]
 
-buffer_load_dwordx4 v[0:3], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_dwordx4 v[5:8], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_dwordx4 v[0:3], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x00,0x19,0x00]
+buffer_load_dwordx4 v[5:8], off, s[100:103], s3 offset:4095
+// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x05,0x19,0x03]
 
-buffer_load_dwordx4 v[0:3], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_dwordx4 v[5:8], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x00,0x00,0x67]
+buffer_load_dwordx4 v[5:8], off, s[8:11], s103 offset:4095
+// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x05,0x02,0x67]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_dwordx4 v[5:8], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_dwordx4 v[5:8], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_dwordx4 v[5:8], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_dwordx4 v[5:8], off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_dwordx4 v[5:8], off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_dwordx4 v[0:3], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x38,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx4 v[5:8], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x38,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx4 v[0:3], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x38,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx4 v[5:8], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x38,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx4 v[0:3], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x38,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx4 v[5:8], v[0:1], s[8:11], s3 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x38,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x38,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx4 v[5:8], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x38,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x38,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x38,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x38,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x38,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x38,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x38,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x00,0x40,0x00]
+buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x38,0xe0,0x00,0x05,0x42,0x03]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx3 v[253:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0xfd,0x00,0x00]
+buffer_load_dwordx3 v[253:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0xfd,0x02,0x03]
 
-buffer_load_dwordx3 v[0:2], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_dwordx3 v[5:7], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_dwordx3 v[0:2], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x00,0x19,0x00]
+buffer_load_dwordx3 v[5:7], off, s[100:103], s3 offset:4095
+// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x05,0x19,0x03]
 
-buffer_load_dwordx3 v[0:2], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_dwordx3 v[5:7], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x00,0x00,0x67]
+buffer_load_dwordx3 v[5:7], off, s[8:11], s103 offset:4095
+// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x05,0x02,0x67]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_dwordx3 v[5:7], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_dwordx3 v[5:7], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_dwordx3 v[5:7], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_dwordx3 v[5:7], off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_dwordx3 v[5:7], off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_dwordx3 v[0:2], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x3c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx3 v[5:7], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x3c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx3 v[0:2], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x3c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx3 v[5:7], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x3c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx3 v[0:2], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x3c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx3 v[5:7], v[0:1], s[8:11], s3 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x3c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x3c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx3 v[5:7], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x3c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x3c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x3c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x3c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x3c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x3c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x3c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x00,0x40,0x00]
+buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x3c,0xe0,0x00,0x05,0x42,0x03]
 
-buffer_store_byte v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_byte v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_byte v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0xff,0x00,0x00]
+buffer_store_byte v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_store_byte v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x01,0x00]
+buffer_store_byte v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_store_byte v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x19,0x00]
+buffer_store_byte v1, off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_store_byte v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x1e,0x00]
+buffer_store_byte v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_store_byte v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x00,0x67]
+buffer_store_byte v1, off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_store_byte v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x00,0x7c]
+buffer_store_byte v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_store_byte v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x00,0x80]
+buffer_store_byte v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_store_byte v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x00,0xc1]
+buffer_store_byte v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_store_byte v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x00,0xf0]
+buffer_store_byte v1, off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0xf0]
 
-buffer_store_byte v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x00,0xf7]
+buffer_store_byte v1, off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0xf7]
 
-buffer_store_byte v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x60,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_byte v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x60,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_byte v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x60,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_byte v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x60,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_byte v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x60,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_byte v1, v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x60,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_byte v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x60,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_byte v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0x60,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_byte v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x60,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_byte v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x60,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_byte v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x60,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_byte v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x60,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_byte v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x60,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_byte v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x60,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_byte v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x40,0x00]
+buffer_store_byte v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_store_short v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_short v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_short v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0xff,0x00,0x00]
+buffer_store_short v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_store_short v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x01,0x00]
+buffer_store_short v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_store_short v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x19,0x00]
+buffer_store_short v1, off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_store_short v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x1e,0x00]
+buffer_store_short v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_store_short v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x00,0x67]
+buffer_store_short v1, off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_store_short v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x00,0x7c]
+buffer_store_short v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_store_short v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x00,0x80]
+buffer_store_short v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_store_short v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x00,0xc1]
+buffer_store_short v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_store_short v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x00,0xf0]
+buffer_store_short v1, off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0xf0]
 
-buffer_store_short v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x00,0xf7]
+buffer_store_short v1, off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0xf7]
 
-buffer_store_short v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x68,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_short v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x68,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_short v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x68,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_short v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x68,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_short v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x68,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_short v1, v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x68,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_short v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_short v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0x68,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_short v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_short v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x68,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_short v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x68,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_short v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x68,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_short v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x68,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_short v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x68,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_short v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x40,0x00]
+buffer_store_short v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_store_dword v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dword v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dword v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0xff,0x00,0x00]
+buffer_store_dword v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_store_dword v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x01,0x00]
+buffer_store_dword v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_store_dword v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x19,0x00]
+buffer_store_dword v1, off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_store_dword v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x1e,0x00]
+buffer_store_dword v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_store_dword v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x00,0x67]
+buffer_store_dword v1, off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_store_dword v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x00,0x7c]
+buffer_store_dword v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_store_dword v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x00,0x80]
+buffer_store_dword v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_store_dword v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x00,0xc1]
+buffer_store_dword v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_store_dword v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x00,0xf0]
+buffer_store_dword v1, off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0xf0]
 
-buffer_store_dword v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x00,0xf7]
+buffer_store_dword v1, off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0xf7]
 
-buffer_store_dword v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x70,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dword v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x70,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dword v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x70,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dword v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x70,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dword v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x70,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dword v1, v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x70,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dword v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dword v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0x70,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dword v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dword v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x70,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dword v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x70,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dword v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x70,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dword v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x70,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dword v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x70,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dword v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x40,0x00]
+buffer_store_dword v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0xfe,0x00,0x00]
+buffer_store_dwordx2 v[254:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0xfe,0x03,0x04]
 
-buffer_store_dwordx2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x01,0x00]
+buffer_store_dwordx2 v[1:2], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_store_dwordx2 v[0:1], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x19,0x00]
+buffer_store_dwordx2 v[1:2], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_store_dwordx2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x1e,0x00]
+buffer_store_dwordx2 v[1:2], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x00,0x67]
+buffer_store_dwordx2 v[1:2], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x00,0x7c]
+buffer_store_dwordx2 v[1:2], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x00,0x80]
+buffer_store_dwordx2 v[1:2], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x00,0xc1]
+buffer_store_dwordx2 v[1:2], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x00,0xf0]
+buffer_store_dwordx2 v[1:2], off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0xf0]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x00,0xf7]
+buffer_store_dwordx2 v[1:2], off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0xf7]
 
-buffer_store_dwordx2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x74,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx2 v[1:2], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x74,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x74,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx2 v[1:2], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x74,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx2 v[0:1], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x74,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx2 v[1:2], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x74,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x74,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx2 v[1:2], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x74,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x74,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x74,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x74,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x74,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x74,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x74,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x40,0x00]
+buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx4 v[252:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0xfc,0x00,0x00]
+buffer_store_dwordx4 v[252:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0xfc,0x03,0x04]
 
-buffer_store_dwordx4 v[0:3], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x01,0x00]
+buffer_store_dwordx4 v[1:4], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_store_dwordx4 v[0:3], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x19,0x00]
+buffer_store_dwordx4 v[1:4], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_store_dwordx4 v[0:3], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x1e,0x00]
+buffer_store_dwordx4 v[1:4], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x00,0x67]
+buffer_store_dwordx4 v[1:4], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x00,0x7c]
+buffer_store_dwordx4 v[1:4], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x00,0x80]
+buffer_store_dwordx4 v[1:4], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x00,0xc1]
+buffer_store_dwordx4 v[1:4], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x00,0xf0]
+buffer_store_dwordx4 v[1:4], off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x03,0xf0]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x00,0xf7]
+buffer_store_dwordx4 v[1:4], off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x03,0xf7]
 
-buffer_store_dwordx4 v[0:3], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x78,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx4 v[1:4], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x78,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx4 v[0:3], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x78,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx4 v[1:4], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x78,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx4 v[0:3], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x78,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx4 v[1:4], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x78,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x78,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx4 v[1:4], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x78,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x78,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x78,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x78,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x78,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x78,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x78,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x40,0x00]
+buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx3 v[253:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0xfd,0x00,0x00]
+buffer_store_dwordx3 v[253:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0xfd,0x03,0x04]
 
-buffer_store_dwordx3 v[0:2], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x01,0x00]
+buffer_store_dwordx3 v[1:3], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_store_dwordx3 v[0:2], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x19,0x00]
+buffer_store_dwordx3 v[1:3], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_store_dwordx3 v[0:2], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x1e,0x00]
+buffer_store_dwordx3 v[1:3], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x00,0x67]
+buffer_store_dwordx3 v[1:3], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x00,0x7c]
+buffer_store_dwordx3 v[1:3], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x00,0x80]
+buffer_store_dwordx3 v[1:3], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x00,0xc1]
+buffer_store_dwordx3 v[1:3], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x00,0xf0]
+buffer_store_dwordx3 v[1:3], off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x03,0xf0]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x00,0xf7]
+buffer_store_dwordx3 v[1:3], off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x03,0xf7]
 
-buffer_store_dwordx3 v[0:2], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x7c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx3 v[1:3], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x7c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx3 v[0:2], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x7c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx3 v[1:3], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x7c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx3 v[0:2], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x7c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx3 v[1:3], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x7c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x7c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx3 v[1:3], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x7c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x7c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x7c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x7c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x7c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x7c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x7c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x40,0x00]
+buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_atomic_swap v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_swap v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_swap v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0xff,0x00,0x00]
+buffer_atomic_swap v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_atomic_swap v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0x00,0x01,0x00]
+buffer_atomic_swap v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_atomic_swap v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0x00,0x19,0x00]
+buffer_atomic_swap v1, off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_atomic_swap v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0x00,0x1e,0x00]
+buffer_atomic_swap v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_swap v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0x00,0x00,0x67]
+buffer_atomic_swap v1, off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_atomic_swap v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0x00,0x00,0x7c]
+buffer_atomic_swap v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_swap v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0x00,0x00,0x80]
+buffer_atomic_swap v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_atomic_swap v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0x00,0x00,0xc1]
+buffer_atomic_swap v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_swap v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0xc0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_swap v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0xc0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_swap v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0xc0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_swap v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0xc0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_swap v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0xc0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_swap v1, v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0xc0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_swap v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0xc0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_swap v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0xc0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_swap v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0xc0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_swap v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0xc0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_swap v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0xc0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_swap v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0xc0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_swap v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0xc0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_swap v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0xc0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_swap v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0x00,0x40,0x00]
+buffer_atomic_swap v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0xc0,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap v[1:2], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_cmpswap v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0xfe,0x00,0x00]
+buffer_atomic_cmpswap v[254:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0xfe,0x03,0x04]
 
-buffer_atomic_cmpswap v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0x00,0x01,0x00]
+buffer_atomic_cmpswap v[1:2], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_atomic_cmpswap v[0:1], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0x00,0x19,0x00]
+buffer_atomic_cmpswap v[1:2], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_atomic_cmpswap v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0x00,0x1e,0x00]
+buffer_atomic_cmpswap v[1:2], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0x00,0x00,0x67]
+buffer_atomic_cmpswap v[1:2], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0x00,0x00,0x7c]
+buffer_atomic_cmpswap v[1:2], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0x00,0x00,0x80]
+buffer_atomic_cmpswap v[1:2], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0x00,0x00,0xc1]
+buffer_atomic_cmpswap v[1:2], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_cmpswap v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0xc4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap v[1:2], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0xc4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_cmpswap v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0xc4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap v[1:2], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0xc4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_cmpswap v[0:1], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0xc4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap v[1:2], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0xc4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0xc4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap v[1:2], off, s[12:15], s4
+// CHECK: [0x00,0x00,0xc4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0xc4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap v[1:2], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0xc4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0xc4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap v[1:2], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0xc4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0xc4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap v[1:2], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0xc4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0x00,0x40,0x00]
+buffer_atomic_cmpswap v[1:2], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0xc4,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_atomic_add v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_add v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_add v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0xff,0x00,0x00]
+buffer_atomic_add v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_atomic_add v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0x00,0x01,0x00]
+buffer_atomic_add v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_atomic_add v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0x00,0x19,0x00]
+buffer_atomic_add v1, off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_atomic_add v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0x00,0x1e,0x00]
+buffer_atomic_add v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_add v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0x00,0x00,0x67]
+buffer_atomic_add v1, off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_atomic_add v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0x00,0x00,0x7c]
+buffer_atomic_add v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_add v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0x00,0x00,0x80]
+buffer_atomic_add v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_atomic_add v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0x00,0x00,0xc1]
+buffer_atomic_add v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_add v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0xc8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_add v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0xc8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_add v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0xc8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_add v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0xc8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_add v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0xc8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_add v1, v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0xc8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_add v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0xc8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_add v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0xc8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_add v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0xc8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_add v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0xc8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_add v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0xc8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_add v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0xc8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_add v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0xc8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_add v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0xc8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_add v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0x00,0x40,0x00]
+buffer_atomic_add v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0xc8,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_atomic_sub v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_sub v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_sub v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0xff,0x00,0x00]
+buffer_atomic_sub v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_atomic_sub v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0x00,0x01,0x00]
+buffer_atomic_sub v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_atomic_sub v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0x00,0x19,0x00]
+buffer_atomic_sub v1, off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_atomic_sub v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0x00,0x1e,0x00]
+buffer_atomic_sub v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_sub v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0x00,0x00,0x67]
+buffer_atomic_sub v1, off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_atomic_sub v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0x00,0x00,0x7c]
+buffer_atomic_sub v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_sub v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0x00,0x00,0x80]
+buffer_atomic_sub v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_atomic_sub v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0x00,0x00,0xc1]
+buffer_atomic_sub v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_sub v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0xcc,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_sub v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0xcc,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_sub v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0xcc,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_sub v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0xcc,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_sub v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0xcc,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_sub v1, v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0xcc,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_sub v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0xcc,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_sub v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0xcc,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_sub v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0xcc,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_sub v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0xcc,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_sub v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0xcc,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_sub v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0xcc,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_sub v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0xcc,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_sub v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0xcc,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_sub v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0x00,0x40,0x00]
+buffer_atomic_sub v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0xcc,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_atomic_smin v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_smin v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smin v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0xff,0x00,0x00]
+buffer_atomic_smin v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_atomic_smin v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0x00,0x01,0x00]
+buffer_atomic_smin v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_atomic_smin v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0x00,0x19,0x00]
+buffer_atomic_smin v1, off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_atomic_smin v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0x00,0x1e,0x00]
+buffer_atomic_smin v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_smin v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0x00,0x00,0x67]
+buffer_atomic_smin v1, off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_atomic_smin v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0x00,0x00,0x7c]
+buffer_atomic_smin v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_smin v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0x00,0x00,0x80]
+buffer_atomic_smin v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_atomic_smin v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0x00,0x00,0xc1]
+buffer_atomic_smin v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_smin v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0xd4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_smin v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0xd4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smin v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0xd4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_smin v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0xd4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smin v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0xd4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_smin v1, v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0xd4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smin v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0xd4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_smin v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0xd4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smin v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0xd4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_smin v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0xd4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smin v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0xd4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_smin v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0xd4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smin v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0xd4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_smin v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0xd4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smin v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0x00,0x40,0x00]
+buffer_atomic_smin v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0xd4,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_atomic_umin v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_umin v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umin v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0xff,0x00,0x00]
+buffer_atomic_umin v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_atomic_umin v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0x00,0x01,0x00]
+buffer_atomic_umin v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_atomic_umin v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0x00,0x19,0x00]
+buffer_atomic_umin v1, off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_atomic_umin v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0x00,0x1e,0x00]
+buffer_atomic_umin v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_umin v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0x00,0x00,0x67]
+buffer_atomic_umin v1, off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_atomic_umin v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0x00,0x00,0x7c]
+buffer_atomic_umin v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_umin v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0x00,0x00,0x80]
+buffer_atomic_umin v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_atomic_umin v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0x00,0x00,0xc1]
+buffer_atomic_umin v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_umin v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0xd8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_umin v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0xd8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umin v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0xd8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_umin v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0xd8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umin v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0xd8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_umin v1, v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0xd8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umin v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0xd8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_umin v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0xd8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umin v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0xd8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_umin v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0xd8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umin v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0xd8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_umin v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0xd8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umin v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0xd8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_umin v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0xd8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umin v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0x00,0x40,0x00]
+buffer_atomic_umin v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0xd8,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_atomic_smax v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_smax v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smax v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0xff,0x00,0x00]
+buffer_atomic_smax v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_atomic_smax v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0x00,0x01,0x00]
+buffer_atomic_smax v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_atomic_smax v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0x00,0x19,0x00]
+buffer_atomic_smax v1, off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_atomic_smax v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0x00,0x1e,0x00]
+buffer_atomic_smax v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_smax v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0x00,0x00,0x67]
+buffer_atomic_smax v1, off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_atomic_smax v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0x00,0x00,0x7c]
+buffer_atomic_smax v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_smax v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0x00,0x00,0x80]
+buffer_atomic_smax v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_atomic_smax v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0x00,0x00,0xc1]
+buffer_atomic_smax v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_smax v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0xdc,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_smax v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0xdc,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smax v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0xdc,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_smax v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0xdc,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smax v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0xdc,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_smax v1, v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0xdc,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smax v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0xdc,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_smax v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0xdc,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smax v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0xdc,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_smax v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0xdc,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smax v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0xdc,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_smax v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0xdc,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smax v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0xdc,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_smax v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0xdc,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smax v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0x00,0x40,0x00]
+buffer_atomic_smax v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0xdc,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_atomic_umax v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_umax v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umax v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0xff,0x00,0x00]
+buffer_atomic_umax v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_atomic_umax v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0x00,0x01,0x00]
+buffer_atomic_umax v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_atomic_umax v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0x00,0x19,0x00]
+buffer_atomic_umax v1, off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_atomic_umax v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0x00,0x1e,0x00]
+buffer_atomic_umax v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_umax v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0x00,0x00,0x67]
+buffer_atomic_umax v1, off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_atomic_umax v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0x00,0x00,0x7c]
+buffer_atomic_umax v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_umax v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0x00,0x00,0x80]
+buffer_atomic_umax v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_atomic_umax v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0x00,0x00,0xc1]
+buffer_atomic_umax v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_umax v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0xe0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_umax v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0xe0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umax v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0xe0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_umax v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0xe0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umax v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0xe0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_umax v1, v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0xe0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umax v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0xe0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_umax v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0xe0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umax v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0xe0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_umax v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0xe0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umax v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0xe0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_umax v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0xe0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umax v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0xe0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_umax v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0xe0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umax v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0x00,0x40,0x00]
+buffer_atomic_umax v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0xe0,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_atomic_and v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_and v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_and v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0xff,0x00,0x00]
+buffer_atomic_and v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_atomic_and v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0x00,0x01,0x00]
+buffer_atomic_and v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_atomic_and v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0x00,0x19,0x00]
+buffer_atomic_and v1, off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_atomic_and v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0x00,0x1e,0x00]
+buffer_atomic_and v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_and v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0x00,0x00,0x67]
+buffer_atomic_and v1, off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_atomic_and v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0x00,0x00,0x7c]
+buffer_atomic_and v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_and v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0x00,0x00,0x80]
+buffer_atomic_and v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_atomic_and v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0x00,0x00,0xc1]
+buffer_atomic_and v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_and v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0xe4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_and v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0xe4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_and v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0xe4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_and v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0xe4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_and v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0xe4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_and v1, v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0xe4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_and v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0xe4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_and v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0xe4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_and v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0xe4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_and v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0xe4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_and v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0xe4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_and v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0xe4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_and v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0xe4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_and v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0xe4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_and v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0x00,0x40,0x00]
+buffer_atomic_and v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0xe4,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_atomic_or v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_or v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_or v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0xff,0x00,0x00]
+buffer_atomic_or v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_atomic_or v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0x00,0x01,0x00]
+buffer_atomic_or v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_atomic_or v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0x00,0x19,0x00]
+buffer_atomic_or v1, off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_atomic_or v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0x00,0x1e,0x00]
+buffer_atomic_or v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_or v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0x00,0x00,0x67]
+buffer_atomic_or v1, off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_atomic_or v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0x00,0x00,0x7c]
+buffer_atomic_or v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_or v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0x00,0x00,0x80]
+buffer_atomic_or v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_atomic_or v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0x00,0x00,0xc1]
+buffer_atomic_or v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_or v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0xe8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_or v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0xe8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_or v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0xe8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_or v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0xe8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_or v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0xe8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_or v1, v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0xe8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_or v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0xe8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_or v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0xe8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_or v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0xe8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_or v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0xe8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_or v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0xe8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_or v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0xe8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_or v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0xe8,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_or v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0xe8,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_or v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0x00,0x40,0x00]
+buffer_atomic_or v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0xe8,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_atomic_xor v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_xor v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_xor v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0xff,0x00,0x00]
+buffer_atomic_xor v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_atomic_xor v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0x00,0x01,0x00]
+buffer_atomic_xor v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_atomic_xor v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0x00,0x19,0x00]
+buffer_atomic_xor v1, off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_atomic_xor v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0x00,0x1e,0x00]
+buffer_atomic_xor v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_xor v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0x00,0x00,0x67]
+buffer_atomic_xor v1, off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_atomic_xor v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0x00,0x00,0x7c]
+buffer_atomic_xor v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_xor v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0x00,0x00,0x80]
+buffer_atomic_xor v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_atomic_xor v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0x00,0x00,0xc1]
+buffer_atomic_xor v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_xor v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0xec,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_xor v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0xec,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_xor v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0xec,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_xor v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0xec,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_xor v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0xec,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_xor v1, v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0xec,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_xor v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0xec,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_xor v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0xec,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_xor v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0xec,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_xor v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0xec,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_xor v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0xec,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_xor v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0xec,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_xor v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0xec,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_xor v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0xec,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_xor v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0x00,0x40,0x00]
+buffer_atomic_xor v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0xec,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_atomic_inc v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_inc v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_inc v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0xff,0x00,0x00]
+buffer_atomic_inc v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_atomic_inc v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0x00,0x01,0x00]
+buffer_atomic_inc v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_atomic_inc v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0x00,0x19,0x00]
+buffer_atomic_inc v1, off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_atomic_inc v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0x00,0x1e,0x00]
+buffer_atomic_inc v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_inc v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0x00,0x00,0x67]
+buffer_atomic_inc v1, off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_atomic_inc v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0x00,0x00,0x7c]
+buffer_atomic_inc v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_inc v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0x00,0x00,0x80]
+buffer_atomic_inc v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_atomic_inc v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0x00,0x00,0xc1]
+buffer_atomic_inc v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_inc v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0xf0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_inc v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0xf0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_inc v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0xf0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_inc v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0xf0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_inc v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0xf0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_inc v1, v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0xf0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_inc v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0xf0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_inc v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0xf0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_inc v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0xf0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_inc v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0xf0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_inc v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0xf0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_inc v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0xf0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_inc v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0xf0,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_inc v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0xf0,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_inc v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0x00,0x40,0x00]
+buffer_atomic_inc v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0xf0,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_atomic_dec v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_dec v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_dec v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0xff,0x00,0x00]
+buffer_atomic_dec v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_atomic_dec v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0x00,0x01,0x00]
+buffer_atomic_dec v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_atomic_dec v0, off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0x00,0x19,0x00]
+buffer_atomic_dec v1, off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0x01,0x19,0x04]
 
-buffer_atomic_dec v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0x00,0x1e,0x00]
+buffer_atomic_dec v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_dec v0, off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0x00,0x00,0x67]
+buffer_atomic_dec v1, off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0x01,0x03,0x67]
 
-buffer_atomic_dec v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0x00,0x00,0x7c]
+buffer_atomic_dec v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_dec v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0x00,0x00,0x80]
+buffer_atomic_dec v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_atomic_dec v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0x00,0x00,0xc1]
+buffer_atomic_dec v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_dec v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0xf4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_dec v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0xf4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_dec v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0xf4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_dec v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0xf4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_dec v0, v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0xf4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_dec v1, v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0xf4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_dec v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0xf4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_dec v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0xf4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_dec v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0xf4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_dec v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0xf4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_dec v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0xf4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_dec v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0xf4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_dec v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0xf4,0xe0,0x00,0x00,0x00,0x00]
+buffer_atomic_dec v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0xf4,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_atomic_dec v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0x00,0x40,0x00]
+buffer_atomic_dec v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0xf4,0xe0,0x00,0x01,0x43,0x04]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap_x2 v[1:2], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_swap_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_swap_x2 v[254:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0xfe,0x03,0x04]
 
-buffer_atomic_swap_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_swap_x2 v[1:2], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0x01,0x04,0x04]
 
-buffer_atomic_swap_x2 v[0:1], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0x00,0x19,0x00]
+buffer_atomic_swap_x2 v[1:2], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0x01,0x19,0x04]
 
-buffer_atomic_swap_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_swap_x2 v[1:2], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0x00,0x00,0x67]
+buffer_atomic_swap_x2 v[1:2], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0x01,0x03,0x67]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_swap_x2 v[1:2], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_swap_x2 v[1:2], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0x01,0x03,0x80]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_swap_x2 v[1:2], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_swap_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x40,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap_x2 v[1:2], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x40,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_swap_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x40,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap_x2 v[1:2], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x40,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_swap_x2 v[0:1], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x40,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap_x2 v[1:2], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x40,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x40,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap_x2 v[1:2], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x40,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x40,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap_x2 v[1:2], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x40,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x40,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap_x2 v[1:2], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x40,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x40,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap_x2 v[1:2], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x40,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0x00,0x40,0x00]
+buffer_atomic_swap_x2 v[1:2], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x40,0xe1,0x00,0x01,0x43,0x04]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap_x2 v[1:4], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_cmpswap_x2 v[252:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0xfc,0x00,0x00]
+buffer_atomic_cmpswap_x2 v[252:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0xfc,0x03,0x04]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_cmpswap_x2 v[1:4], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0x01,0x04,0x04]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0x00,0x19,0x00]
+buffer_atomic_cmpswap_x2 v[1:4], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0x01,0x19,0x04]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_cmpswap_x2 v[1:4], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0x00,0x00,0x67]
+buffer_atomic_cmpswap_x2 v[1:4], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0x01,0x03,0x67]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_cmpswap_x2 v[1:4], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_cmpswap_x2 v[1:4], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0x01,0x03,0x80]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_cmpswap_x2 v[1:4], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_cmpswap_x2 v[0:3], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x44,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap_x2 v[1:4], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x44,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_cmpswap_x2 v[0:3], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x44,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap_x2 v[1:4], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x44,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_cmpswap_x2 v[0:3], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x44,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap_x2 v[1:4], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x44,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x44,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap_x2 v[1:4], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x44,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x44,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap_x2 v[1:4], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x44,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x44,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap_x2 v[1:4], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x44,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x44,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap_x2 v[1:4], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x44,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0x00,0x40,0x00]
+buffer_atomic_cmpswap_x2 v[1:4], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x44,0xe1,0x00,0x01,0x43,0x04]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add_x2 v[1:2], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_add_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_add_x2 v[254:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0xfe,0x03,0x04]
 
-buffer_atomic_add_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_add_x2 v[1:2], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0x01,0x04,0x04]
 
-buffer_atomic_add_x2 v[0:1], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0x00,0x19,0x00]
+buffer_atomic_add_x2 v[1:2], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0x01,0x19,0x04]
 
-buffer_atomic_add_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_add_x2 v[1:2], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0x00,0x00,0x67]
+buffer_atomic_add_x2 v[1:2], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0x01,0x03,0x67]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_add_x2 v[1:2], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_add_x2 v[1:2], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0x01,0x03,0x80]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_add_x2 v[1:2], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_add_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x48,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add_x2 v[1:2], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x48,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_add_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x48,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add_x2 v[1:2], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x48,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_add_x2 v[0:1], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x48,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add_x2 v[1:2], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x48,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x48,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add_x2 v[1:2], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x48,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x48,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add_x2 v[1:2], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x48,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x48,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add_x2 v[1:2], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x48,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x48,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add_x2 v[1:2], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x48,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0x00,0x40,0x00]
+buffer_atomic_add_x2 v[1:2], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x48,0xe1,0x00,0x01,0x43,0x04]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub_x2 v[1:2], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_sub_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_sub_x2 v[254:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0xfe,0x03,0x04]
 
-buffer_atomic_sub_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_sub_x2 v[1:2], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0x01,0x04,0x04]
 
-buffer_atomic_sub_x2 v[0:1], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0x00,0x19,0x00]
+buffer_atomic_sub_x2 v[1:2], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0x01,0x19,0x04]
 
-buffer_atomic_sub_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_sub_x2 v[1:2], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0x00,0x00,0x67]
+buffer_atomic_sub_x2 v[1:2], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0x01,0x03,0x67]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_sub_x2 v[1:2], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_sub_x2 v[1:2], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0x01,0x03,0x80]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_sub_x2 v[1:2], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_sub_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x4c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub_x2 v[1:2], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x4c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_sub_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x4c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub_x2 v[1:2], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x4c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_sub_x2 v[0:1], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x4c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub_x2 v[1:2], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x4c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x4c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub_x2 v[1:2], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x4c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x4c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub_x2 v[1:2], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x4c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x4c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub_x2 v[1:2], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x4c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x4c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub_x2 v[1:2], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x4c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0x00,0x40,0x00]
+buffer_atomic_sub_x2 v[1:2], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x4c,0xe1,0x00,0x01,0x43,0x04]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin_x2 v[1:2], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smin_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_smin_x2 v[254:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0xfe,0x03,0x04]
 
-buffer_atomic_smin_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_smin_x2 v[1:2], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0x01,0x04,0x04]
 
-buffer_atomic_smin_x2 v[0:1], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0x00,0x19,0x00]
+buffer_atomic_smin_x2 v[1:2], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0x01,0x19,0x04]
 
-buffer_atomic_smin_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_smin_x2 v[1:2], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0x00,0x00,0x67]
+buffer_atomic_smin_x2 v[1:2], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0x01,0x03,0x67]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_smin_x2 v[1:2], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_smin_x2 v[1:2], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0x01,0x03,0x80]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_smin_x2 v[1:2], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_smin_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x54,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin_x2 v[1:2], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x54,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smin_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x54,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin_x2 v[1:2], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x54,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smin_x2 v[0:1], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x54,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin_x2 v[1:2], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x54,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x54,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin_x2 v[1:2], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x54,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x54,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin_x2 v[1:2], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x54,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x54,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin_x2 v[1:2], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x54,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x54,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin_x2 v[1:2], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x54,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0x00,0x40,0x00]
+buffer_atomic_smin_x2 v[1:2], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x54,0xe1,0x00,0x01,0x43,0x04]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin_x2 v[1:2], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umin_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_umin_x2 v[254:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0xfe,0x03,0x04]
 
-buffer_atomic_umin_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_umin_x2 v[1:2], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0x01,0x04,0x04]
 
-buffer_atomic_umin_x2 v[0:1], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0x00,0x19,0x00]
+buffer_atomic_umin_x2 v[1:2], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0x01,0x19,0x04]
 
-buffer_atomic_umin_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_umin_x2 v[1:2], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0x00,0x00,0x67]
+buffer_atomic_umin_x2 v[1:2], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0x01,0x03,0x67]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_umin_x2 v[1:2], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_umin_x2 v[1:2], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0x01,0x03,0x80]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_umin_x2 v[1:2], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_umin_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x58,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin_x2 v[1:2], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x58,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umin_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x58,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin_x2 v[1:2], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x58,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umin_x2 v[0:1], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x58,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin_x2 v[1:2], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x58,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x58,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin_x2 v[1:2], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x58,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x58,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin_x2 v[1:2], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x58,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x58,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin_x2 v[1:2], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x58,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x58,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin_x2 v[1:2], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x58,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0x00,0x40,0x00]
+buffer_atomic_umin_x2 v[1:2], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x58,0xe1,0x00,0x01,0x43,0x04]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax_x2 v[1:2], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smax_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_smax_x2 v[254:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0xfe,0x03,0x04]
 
-buffer_atomic_smax_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_smax_x2 v[1:2], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0x01,0x04,0x04]
 
-buffer_atomic_smax_x2 v[0:1], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0x00,0x19,0x00]
+buffer_atomic_smax_x2 v[1:2], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0x01,0x19,0x04]
 
-buffer_atomic_smax_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_smax_x2 v[1:2], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0x00,0x00,0x67]
+buffer_atomic_smax_x2 v[1:2], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0x01,0x03,0x67]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_smax_x2 v[1:2], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_smax_x2 v[1:2], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0x01,0x03,0x80]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_smax_x2 v[1:2], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_smax_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x5c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax_x2 v[1:2], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x5c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smax_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x5c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax_x2 v[1:2], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x5c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smax_x2 v[0:1], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x5c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax_x2 v[1:2], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x5c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x5c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax_x2 v[1:2], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x5c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x5c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax_x2 v[1:2], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x5c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x5c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax_x2 v[1:2], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x5c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x5c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax_x2 v[1:2], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x5c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0x00,0x40,0x00]
+buffer_atomic_smax_x2 v[1:2], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x5c,0xe1,0x00,0x01,0x43,0x04]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax_x2 v[1:2], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umax_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_umax_x2 v[254:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0xfe,0x03,0x04]
 
-buffer_atomic_umax_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_umax_x2 v[1:2], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0x01,0x04,0x04]
 
-buffer_atomic_umax_x2 v[0:1], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0x00,0x19,0x00]
+buffer_atomic_umax_x2 v[1:2], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0x01,0x19,0x04]
 
-buffer_atomic_umax_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_umax_x2 v[1:2], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0x00,0x00,0x67]
+buffer_atomic_umax_x2 v[1:2], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0x01,0x03,0x67]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_umax_x2 v[1:2], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_umax_x2 v[1:2], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0x01,0x03,0x80]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_umax_x2 v[1:2], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_umax_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x60,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax_x2 v[1:2], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x60,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umax_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x60,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax_x2 v[1:2], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x60,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umax_x2 v[0:1], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x60,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax_x2 v[1:2], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x60,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x60,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax_x2 v[1:2], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x60,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x60,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax_x2 v[1:2], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x60,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x60,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax_x2 v[1:2], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x60,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x60,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax_x2 v[1:2], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x60,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0x00,0x40,0x00]
+buffer_atomic_umax_x2 v[1:2], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x60,0xe1,0x00,0x01,0x43,0x04]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and_x2 v[1:2], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_and_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_and_x2 v[254:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0xfe,0x03,0x04]
 
-buffer_atomic_and_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_and_x2 v[1:2], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0x01,0x04,0x04]
 
-buffer_atomic_and_x2 v[0:1], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0x00,0x19,0x00]
+buffer_atomic_and_x2 v[1:2], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0x01,0x19,0x04]
 
-buffer_atomic_and_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_and_x2 v[1:2], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0x00,0x00,0x67]
+buffer_atomic_and_x2 v[1:2], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0x01,0x03,0x67]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_and_x2 v[1:2], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_and_x2 v[1:2], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0x01,0x03,0x80]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_and_x2 v[1:2], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_and_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x64,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and_x2 v[1:2], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x64,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_and_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x64,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and_x2 v[1:2], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x64,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_and_x2 v[0:1], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x64,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and_x2 v[1:2], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x64,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x64,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and_x2 v[1:2], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x64,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x64,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and_x2 v[1:2], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x64,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x64,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and_x2 v[1:2], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x64,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x64,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and_x2 v[1:2], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x64,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0x00,0x40,0x00]
+buffer_atomic_and_x2 v[1:2], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x64,0xe1,0x00,0x01,0x43,0x04]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or_x2 v[1:2], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_or_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_or_x2 v[254:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0xfe,0x03,0x04]
 
-buffer_atomic_or_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_or_x2 v[1:2], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0x01,0x04,0x04]
 
-buffer_atomic_or_x2 v[0:1], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0x00,0x19,0x00]
+buffer_atomic_or_x2 v[1:2], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0x01,0x19,0x04]
 
-buffer_atomic_or_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_or_x2 v[1:2], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0x00,0x00,0x67]
+buffer_atomic_or_x2 v[1:2], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0x01,0x03,0x67]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_or_x2 v[1:2], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_or_x2 v[1:2], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0x01,0x03,0x80]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_or_x2 v[1:2], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_or_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x68,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or_x2 v[1:2], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x68,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_or_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x68,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or_x2 v[1:2], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x68,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_or_x2 v[0:1], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x68,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or_x2 v[1:2], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x68,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x68,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or_x2 v[1:2], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x68,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x68,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or_x2 v[1:2], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x68,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x68,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or_x2 v[1:2], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x68,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x68,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or_x2 v[1:2], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x68,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0x00,0x40,0x00]
+buffer_atomic_or_x2 v[1:2], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x68,0xe1,0x00,0x01,0x43,0x04]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor_x2 v[1:2], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_xor_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_xor_x2 v[254:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0xfe,0x03,0x04]
 
-buffer_atomic_xor_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_xor_x2 v[1:2], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0x01,0x04,0x04]
 
-buffer_atomic_xor_x2 v[0:1], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0x00,0x19,0x00]
+buffer_atomic_xor_x2 v[1:2], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0x01,0x19,0x04]
 
-buffer_atomic_xor_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_xor_x2 v[1:2], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0x00,0x00,0x67]
+buffer_atomic_xor_x2 v[1:2], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0x01,0x03,0x67]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_xor_x2 v[1:2], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_xor_x2 v[1:2], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0x01,0x03,0x80]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_xor_x2 v[1:2], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_xor_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x6c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor_x2 v[1:2], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x6c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_xor_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x6c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor_x2 v[1:2], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x6c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_xor_x2 v[0:1], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x6c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor_x2 v[1:2], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x6c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x6c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor_x2 v[1:2], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x6c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x6c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor_x2 v[1:2], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x6c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x6c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor_x2 v[1:2], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x6c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x6c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor_x2 v[1:2], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x6c,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0x00,0x40,0x00]
+buffer_atomic_xor_x2 v[1:2], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x6c,0xe1,0x00,0x01,0x43,0x04]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc_x2 v[1:2], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_inc_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_inc_x2 v[254:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0xfe,0x03,0x04]
 
-buffer_atomic_inc_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_inc_x2 v[1:2], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0x01,0x04,0x04]
 
-buffer_atomic_inc_x2 v[0:1], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0x00,0x19,0x00]
+buffer_atomic_inc_x2 v[1:2], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0x01,0x19,0x04]
 
-buffer_atomic_inc_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_inc_x2 v[1:2], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0x00,0x00,0x67]
+buffer_atomic_inc_x2 v[1:2], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0x01,0x03,0x67]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_inc_x2 v[1:2], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_inc_x2 v[1:2], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0x01,0x03,0x80]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_inc_x2 v[1:2], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_inc_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x70,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc_x2 v[1:2], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x70,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_inc_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x70,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc_x2 v[1:2], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x70,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_inc_x2 v[0:1], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x70,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc_x2 v[1:2], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x70,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x70,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc_x2 v[1:2], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x70,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x70,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc_x2 v[1:2], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x70,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x70,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc_x2 v[1:2], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x70,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x70,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc_x2 v[1:2], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x70,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0x00,0x40,0x00]
+buffer_atomic_inc_x2 v[1:2], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x70,0xe1,0x00,0x01,0x43,0x04]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec_x2 v[1:2], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_dec_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_dec_x2 v[254:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0xfe,0x03,0x04]
 
-buffer_atomic_dec_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_dec_x2 v[1:2], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0x01,0x04,0x04]
 
-buffer_atomic_dec_x2 v[0:1], off, s[100:103], s0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0x00,0x19,0x00]
+buffer_atomic_dec_x2 v[1:2], off, s[100:103], s4 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0x01,0x19,0x04]
 
-buffer_atomic_dec_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_dec_x2 v[1:2], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0x01,0x1e,0x04]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], s103 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0x00,0x00,0x67]
+buffer_atomic_dec_x2 v[1:2], off, s[12:15], s103 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0x01,0x03,0x67]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_dec_x2 v[1:2], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0x01,0x03,0x7c]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_dec_x2 v[1:2], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0x01,0x03,0x80]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_dec_x2 v[1:2], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0x01,0x03,0xc1]
 
-buffer_atomic_dec_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x74,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec_x2 v[1:2], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x74,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_dec_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x74,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec_x2 v[1:2], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x74,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_dec_x2 v[0:1], v[0:1], s[0:3], s0 addr64 offset:4095
-// CHECK: [0xff,0x8f,0x74,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec_x2 v[1:2], v[0:1], s[12:15], s4 addr64 offset:4095
+// CHECK: [0xff,0x8f,0x74,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x74,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec_x2 v[1:2], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x74,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x74,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec_x2 v[1:2], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x74,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x74,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec_x2 v[1:2], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x74,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x74,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec_x2 v[1:2], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x74,0xe1,0x00,0x01,0x03,0x04]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0x00,0x40,0x00]
+buffer_atomic_dec_x2 v[1:2], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x74,0xe1,0x00,0x01,0x43,0x04]
 
 buffer_wbinvl1_vol
 // CHECK: [0x00,0x00,0xc0,0xe1,0x00,0x00,0x00,0x00]
@@ -9420,656 +9446,656 @@ buffer_wbinvl1_vol
 buffer_wbinvl1
 // CHECK: [0x00,0x00,0xc4,0xe1,0x00,0x00,0x00,0x00]
 
-s_load_dword s0, s[0:1], s0
-// CHECK: [0x00,0x00,0x00,0xc0]
+s_load_dword s5, s[2:3], s2
+// CHECK: [0x02,0x82,0x02,0xc0]
 
-s_load_dword s103, s[0:1], s0
-// CHECK: [0x00,0x80,0x33,0xc0]
+s_load_dword s103, s[2:3], s2
+// CHECK: [0x02,0x82,0x33,0xc0]
 
-s_load_dword vcc_lo, s[0:1], s0
-// CHECK: [0x00,0x00,0x35,0xc0]
+s_load_dword vcc_lo, s[2:3], s2
+// CHECK: [0x02,0x02,0x35,0xc0]
 
-s_load_dword vcc_hi, s[0:1], s0
-// CHECK: [0x00,0x80,0x35,0xc0]
+s_load_dword vcc_hi, s[2:3], s2
+// CHECK: [0x02,0x82,0x35,0xc0]
 
-s_load_dword s0, s[2:3], s0
-// CHECK: [0x00,0x02,0x00,0xc0]
+s_load_dword s5, s[4:5], s2
+// CHECK: [0x02,0x84,0x02,0xc0]
 
-s_load_dword s0, s[102:103], s0
-// CHECK: [0x00,0x66,0x00,0xc0]
+s_load_dword s5, s[102:103], s2
+// CHECK: [0x02,0xe6,0x02,0xc0]
 
-s_load_dword s0, flat_scratch, s0
-// CHECK: [0x00,0x68,0x00,0xc0]
+s_load_dword s5, flat_scratch, s2
+// CHECK: [0x02,0xe8,0x02,0xc0]
 
-s_load_dword s0, vcc, s0
-// CHECK: [0x00,0x6a,0x00,0xc0]
+s_load_dword s5, vcc, s2
+// CHECK: [0x02,0xea,0x02,0xc0]
 
-s_load_dword s0, tba, s0
-// CHECK: [0x00,0x6c,0x00,0xc0]
+s_load_dword s5, tba, s2
+// CHECK: [0x02,0xec,0x02,0xc0]
 
-s_load_dword s0, tma, s0
-// CHECK: [0x00,0x6e,0x00,0xc0]
+s_load_dword s5, tma, s2
+// CHECK: [0x02,0xee,0x02,0xc0]
 
-s_load_dword s0, ttmp[10:11], s0
-// CHECK: [0x00,0x7a,0x00,0xc0]
+s_load_dword s5, ttmp[10:11], s2
+// CHECK: [0x02,0xfa,0x02,0xc0]
 
-s_load_dword s0, s[0:1], s103
-// CHECK: [0x67,0x00,0x00,0xc0]
+s_load_dword s5, s[2:3], s103
+// CHECK: [0x67,0x82,0x02,0xc0]
 
-s_load_dword s0, s[0:1], flat_scratch_lo
-// CHECK: [0x68,0x00,0x00,0xc0]
+s_load_dword s5, s[2:3], flat_scratch_lo
+// CHECK: [0x68,0x82,0x02,0xc0]
 
-s_load_dword s0, s[0:1], flat_scratch_hi
-// CHECK: [0x69,0x00,0x00,0xc0]
+s_load_dword s5, s[2:3], flat_scratch_hi
+// CHECK: [0x69,0x82,0x02,0xc0]
 
-s_load_dword s0, s[0:1], vcc_lo
-// CHECK: [0x6a,0x00,0x00,0xc0]
+s_load_dword s5, s[2:3], vcc_lo
+// CHECK: [0x6a,0x82,0x02,0xc0]
 
-s_load_dword s0, s[0:1], vcc_hi
-// CHECK: [0x6b,0x00,0x00,0xc0]
+s_load_dword s5, s[2:3], vcc_hi
+// CHECK: [0x6b,0x82,0x02,0xc0]
 
-s_load_dword s0, s[0:1], tba_lo
-// CHECK: [0x6c,0x00,0x00,0xc0]
+s_load_dword s5, s[2:3], tba_lo
+// CHECK: [0x6c,0x82,0x02,0xc0]
 
-s_load_dword s0, s[0:1], tba_hi
-// CHECK: [0x6d,0x00,0x00,0xc0]
+s_load_dword s5, s[2:3], tba_hi
+// CHECK: [0x6d,0x82,0x02,0xc0]
 
-s_load_dword s0, s[0:1], tma_lo
-// CHECK: [0x6e,0x00,0x00,0xc0]
+s_load_dword s5, s[2:3], tma_lo
+// CHECK: [0x6e,0x82,0x02,0xc0]
 
-s_load_dword s0, s[0:1], tma_hi
-// CHECK: [0x6f,0x00,0x00,0xc0]
+s_load_dword s5, s[2:3], tma_hi
+// CHECK: [0x6f,0x82,0x02,0xc0]
 
-s_load_dword s0, s[0:1], ttmp11
-// CHECK: [0x7b,0x00,0x00,0xc0]
+s_load_dword s5, s[2:3], ttmp11
+// CHECK: [0x7b,0x82,0x02,0xc0]
 
-s_load_dword s0, s[0:1], 0xaf123456
-// CHECK: [0xff,0x00,0x00,0xc0,0x56,0x34,0x12,0xaf]
+s_load_dword s5, s[2:3], 0xaf123456
+// CHECK: [0xff,0x82,0x02,0xc0,0x56,0x34,0x12,0xaf]
 
-s_load_dword s0, s[0:1], 0x3f717273
-// CHECK: [0xff,0x00,0x00,0xc0,0x73,0x72,0x71,0x3f]
+s_load_dword s5, s[2:3], 0x3f717273
+// CHECK: [0xff,0x82,0x02,0xc0,0x73,0x72,0x71,0x3f]
 
-s_load_dword s0, s[0:1], 0x7f
-// CHECK: [0x7f,0x01,0x00,0xc0]
+s_load_dword s5, s[2:3], 0x7f
+// CHECK: [0x7f,0x83,0x02,0xc0]
 
-s_load_dwordx2 s[0:1], s[0:1], s0
-// CHECK: [0x00,0x00,0x40,0xc0]
+s_load_dwordx2 s[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0x45,0xc0]
 
-s_load_dwordx2 s[2:3], s[0:1], s0
-// CHECK: [0x00,0x00,0x41,0xc0]
+s_load_dwordx2 s[12:13], s[2:3], s2
+// CHECK: [0x02,0x02,0x46,0xc0]
 
-s_load_dwordx2 s[102:103], s[0:1], s0
-// CHECK: [0x00,0x00,0x73,0xc0]
+s_load_dwordx2 s[102:103], s[2:3], s2
+// CHECK: [0x02,0x02,0x73,0xc0]
 
-s_load_dwordx2 vcc, s[0:1], s0
-// CHECK: [0x00,0x00,0x75,0xc0]
+s_load_dwordx2 vcc, s[2:3], s2
+// CHECK: [0x02,0x02,0x75,0xc0]
 
-s_load_dwordx2 s[0:1], s[2:3], s0
-// CHECK: [0x00,0x02,0x40,0xc0]
+s_load_dwordx2 s[10:11], s[4:5], s2
+// CHECK: [0x02,0x04,0x45,0xc0]
 
-s_load_dwordx2 s[0:1], s[102:103], s0
-// CHECK: [0x00,0x66,0x40,0xc0]
+s_load_dwordx2 s[10:11], s[102:103], s2
+// CHECK: [0x02,0x66,0x45,0xc0]
 
-s_load_dwordx2 s[0:1], flat_scratch, s0
-// CHECK: [0x00,0x68,0x40,0xc0]
+s_load_dwordx2 s[10:11], flat_scratch, s2
+// CHECK: [0x02,0x68,0x45,0xc0]
 
-s_load_dwordx2 s[0:1], vcc, s0
-// CHECK: [0x00,0x6a,0x40,0xc0]
+s_load_dwordx2 s[10:11], vcc, s2
+// CHECK: [0x02,0x6a,0x45,0xc0]
 
-s_load_dwordx2 s[0:1], tba, s0
-// CHECK: [0x00,0x6c,0x40,0xc0]
+s_load_dwordx2 s[10:11], tba, s2
+// CHECK: [0x02,0x6c,0x45,0xc0]
 
-s_load_dwordx2 s[0:1], tma, s0
-// CHECK: [0x00,0x6e,0x40,0xc0]
+s_load_dwordx2 s[10:11], tma, s2
+// CHECK: [0x02,0x6e,0x45,0xc0]
 
-s_load_dwordx2 s[0:1], ttmp[10:11], s0
-// CHECK: [0x00,0x7a,0x40,0xc0]
+s_load_dwordx2 s[10:11], ttmp[10:11], s2
+// CHECK: [0x02,0x7a,0x45,0xc0]
 
-s_load_dwordx2 s[0:1], s[0:1], s103
-// CHECK: [0x67,0x00,0x40,0xc0]
+s_load_dwordx2 s[10:11], s[2:3], s103
+// CHECK: [0x67,0x02,0x45,0xc0]
 
-s_load_dwordx2 s[0:1], s[0:1], flat_scratch_lo
-// CHECK: [0x68,0x00,0x40,0xc0]
+s_load_dwordx2 s[10:11], s[2:3], flat_scratch_lo
+// CHECK: [0x68,0x02,0x45,0xc0]
 
-s_load_dwordx2 s[0:1], s[0:1], flat_scratch_hi
-// CHECK: [0x69,0x00,0x40,0xc0]
+s_load_dwordx2 s[10:11], s[2:3], flat_scratch_hi
+// CHECK: [0x69,0x02,0x45,0xc0]
 
-s_load_dwordx2 s[0:1], s[0:1], vcc_lo
-// CHECK: [0x6a,0x00,0x40,0xc0]
+s_load_dwordx2 s[10:11], s[2:3], vcc_lo
+// CHECK: [0x6a,0x02,0x45,0xc0]
 
-s_load_dwordx2 s[0:1], s[0:1], vcc_hi
-// CHECK: [0x6b,0x00,0x40,0xc0]
+s_load_dwordx2 s[10:11], s[2:3], vcc_hi
+// CHECK: [0x6b,0x02,0x45,0xc0]
 
-s_load_dwordx2 s[0:1], s[0:1], tba_lo
-// CHECK: [0x6c,0x00,0x40,0xc0]
+s_load_dwordx2 s[10:11], s[2:3], tba_lo
+// CHECK: [0x6c,0x02,0x45,0xc0]
 
-s_load_dwordx2 s[0:1], s[0:1], tba_hi
-// CHECK: [0x6d,0x00,0x40,0xc0]
+s_load_dwordx2 s[10:11], s[2:3], tba_hi
+// CHECK: [0x6d,0x02,0x45,0xc0]
 
-s_load_dwordx2 s[0:1], s[0:1], tma_lo
-// CHECK: [0x6e,0x00,0x40,0xc0]
+s_load_dwordx2 s[10:11], s[2:3], tma_lo
+// CHECK: [0x6e,0x02,0x45,0xc0]
 
-s_load_dwordx2 s[0:1], s[0:1], tma_hi
-// CHECK: [0x6f,0x00,0x40,0xc0]
+s_load_dwordx2 s[10:11], s[2:3], tma_hi
+// CHECK: [0x6f,0x02,0x45,0xc0]
 
-s_load_dwordx2 s[0:1], s[0:1], ttmp11
-// CHECK: [0x7b,0x00,0x40,0xc0]
+s_load_dwordx2 s[10:11], s[2:3], ttmp11
+// CHECK: [0x7b,0x02,0x45,0xc0]
 
-s_load_dwordx2 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0xff,0x00,0x40,0xc0,0x56,0x34,0x12,0xaf]
+s_load_dwordx2 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0xff,0x02,0x45,0xc0,0x56,0x34,0x12,0xaf]
 
-s_load_dwordx2 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0xff,0x00,0x40,0xc0,0x73,0x72,0x71,0x3f]
+s_load_dwordx2 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0xff,0x02,0x45,0xc0,0x73,0x72,0x71,0x3f]
 
-s_load_dwordx2 s[0:1], s[0:1], 0x7f
-// CHECK: [0x7f,0x01,0x40,0xc0]
+s_load_dwordx2 s[10:11], s[2:3], 0x7f
+// CHECK: [0x7f,0x03,0x45,0xc0]
 
-s_load_dwordx4 s[0:3], s[0:1], s0
-// CHECK: [0x00,0x00,0x80,0xc0]
+s_load_dwordx4 s[20:23], s[2:3], s2
+// CHECK: [0x02,0x02,0x8a,0xc0]
 
-s_load_dwordx4 s[4:7], s[0:1], s0
-// CHECK: [0x00,0x00,0x82,0xc0]
+s_load_dwordx4 s[24:27], s[2:3], s2
+// CHECK: [0x02,0x02,0x8c,0xc0]
 
-s_load_dwordx4 s[100:103], s[0:1], s0
-// CHECK: [0x00,0x00,0xb2,0xc0]
+s_load_dwordx4 s[100:103], s[2:3], s2
+// CHECK: [0x02,0x02,0xb2,0xc0]
 
-s_load_dwordx4 s[0:3], s[2:3], s0
-// CHECK: [0x00,0x02,0x80,0xc0]
+s_load_dwordx4 s[20:23], s[4:5], s2
+// CHECK: [0x02,0x04,0x8a,0xc0]
 
-s_load_dwordx4 s[0:3], s[102:103], s0
-// CHECK: [0x00,0x66,0x80,0xc0]
+s_load_dwordx4 s[20:23], s[102:103], s2
+// CHECK: [0x02,0x66,0x8a,0xc0]
 
-s_load_dwordx4 s[0:3], flat_scratch, s0
-// CHECK: [0x00,0x68,0x80,0xc0]
+s_load_dwordx4 s[20:23], flat_scratch, s2
+// CHECK: [0x02,0x68,0x8a,0xc0]
 
-s_load_dwordx4 s[0:3], vcc, s0
-// CHECK: [0x00,0x6a,0x80,0xc0]
+s_load_dwordx4 s[20:23], vcc, s2
+// CHECK: [0x02,0x6a,0x8a,0xc0]
 
-s_load_dwordx4 s[0:3], tba, s0
-// CHECK: [0x00,0x6c,0x80,0xc0]
+s_load_dwordx4 s[20:23], tba, s2
+// CHECK: [0x02,0x6c,0x8a,0xc0]
 
-s_load_dwordx4 s[0:3], tma, s0
-// CHECK: [0x00,0x6e,0x80,0xc0]
+s_load_dwordx4 s[20:23], tma, s2
+// CHECK: [0x02,0x6e,0x8a,0xc0]
 
-s_load_dwordx4 s[0:3], ttmp[10:11], s0
-// CHECK: [0x00,0x7a,0x80,0xc0]
+s_load_dwordx4 s[20:23], ttmp[10:11], s2
+// CHECK: [0x02,0x7a,0x8a,0xc0]
 
-s_load_dwordx4 s[0:3], s[0:1], s103
-// CHECK: [0x67,0x00,0x80,0xc0]
+s_load_dwordx4 s[20:23], s[2:3], s103
+// CHECK: [0x67,0x02,0x8a,0xc0]
 
-s_load_dwordx4 s[0:3], s[0:1], flat_scratch_lo
-// CHECK: [0x68,0x00,0x80,0xc0]
+s_load_dwordx4 s[20:23], s[2:3], flat_scratch_lo
+// CHECK: [0x68,0x02,0x8a,0xc0]
 
-s_load_dwordx4 s[0:3], s[0:1], flat_scratch_hi
-// CHECK: [0x69,0x00,0x80,0xc0]
+s_load_dwordx4 s[20:23], s[2:3], flat_scratch_hi
+// CHECK: [0x69,0x02,0x8a,0xc0]
 
-s_load_dwordx4 s[0:3], s[0:1], vcc_lo
-// CHECK: [0x6a,0x00,0x80,0xc0]
+s_load_dwordx4 s[20:23], s[2:3], vcc_lo
+// CHECK: [0x6a,0x02,0x8a,0xc0]
 
-s_load_dwordx4 s[0:3], s[0:1], vcc_hi
-// CHECK: [0x6b,0x00,0x80,0xc0]
+s_load_dwordx4 s[20:23], s[2:3], vcc_hi
+// CHECK: [0x6b,0x02,0x8a,0xc0]
 
-s_load_dwordx4 s[0:3], s[0:1], tba_lo
-// CHECK: [0x6c,0x00,0x80,0xc0]
+s_load_dwordx4 s[20:23], s[2:3], tba_lo
+// CHECK: [0x6c,0x02,0x8a,0xc0]
 
-s_load_dwordx4 s[0:3], s[0:1], tba_hi
-// CHECK: [0x6d,0x00,0x80,0xc0]
+s_load_dwordx4 s[20:23], s[2:3], tba_hi
+// CHECK: [0x6d,0x02,0x8a,0xc0]
 
-s_load_dwordx4 s[0:3], s[0:1], tma_lo
-// CHECK: [0x6e,0x00,0x80,0xc0]
+s_load_dwordx4 s[20:23], s[2:3], tma_lo
+// CHECK: [0x6e,0x02,0x8a,0xc0]
 
-s_load_dwordx4 s[0:3], s[0:1], tma_hi
-// CHECK: [0x6f,0x00,0x80,0xc0]
+s_load_dwordx4 s[20:23], s[2:3], tma_hi
+// CHECK: [0x6f,0x02,0x8a,0xc0]
 
-s_load_dwordx4 s[0:3], s[0:1], ttmp11
-// CHECK: [0x7b,0x00,0x80,0xc0]
+s_load_dwordx4 s[20:23], s[2:3], ttmp11
+// CHECK: [0x7b,0x02,0x8a,0xc0]
 
-s_load_dwordx4 s[0:3], s[0:1], 0xaf123456
-// CHECK: [0xff,0x00,0x80,0xc0,0x56,0x34,0x12,0xaf]
+s_load_dwordx4 s[20:23], s[2:3], 0xaf123456
+// CHECK: [0xff,0x02,0x8a,0xc0,0x56,0x34,0x12,0xaf]
 
-s_load_dwordx4 s[0:3], s[0:1], 0x3f717273
-// CHECK: [0xff,0x00,0x80,0xc0,0x73,0x72,0x71,0x3f]
+s_load_dwordx4 s[20:23], s[2:3], 0x3f717273
+// CHECK: [0xff,0x02,0x8a,0xc0,0x73,0x72,0x71,0x3f]
 
-s_load_dwordx4 s[0:3], s[0:1], 0x7f
-// CHECK: [0x7f,0x01,0x80,0xc0]
+s_load_dwordx4 s[20:23], s[2:3], 0x7f
+// CHECK: [0x7f,0x03,0x8a,0xc0]
 
-s_load_dwordx8 s[0:7], s[0:1], s0
-// CHECK: [0x00,0x00,0xc0,0xc0]
+s_load_dwordx8 s[20:27], s[2:3], s2
+// CHECK: [0x02,0x02,0xca,0xc0]
 
-s_load_dwordx8 s[4:11], s[0:1], s0
-// CHECK: [0x00,0x00,0xc2,0xc0]
+s_load_dwordx8 s[24:31], s[2:3], s2
+// CHECK: [0x02,0x02,0xcc,0xc0]
 
-s_load_dwordx8 s[96:103], s[0:1], s0
-// CHECK: [0x00,0x00,0xf0,0xc0]
+s_load_dwordx8 s[96:103], s[2:3], s2
+// CHECK: [0x02,0x02,0xf0,0xc0]
 
-s_load_dwordx8 s[0:7], s[2:3], s0
-// CHECK: [0x00,0x02,0xc0,0xc0]
+s_load_dwordx8 s[20:27], s[4:5], s2
+// CHECK: [0x02,0x04,0xca,0xc0]
 
-s_load_dwordx8 s[0:7], s[102:103], s0
-// CHECK: [0x00,0x66,0xc0,0xc0]
+s_load_dwordx8 s[20:27], s[102:103], s2
+// CHECK: [0x02,0x66,0xca,0xc0]
 
-s_load_dwordx8 s[0:7], flat_scratch, s0
-// CHECK: [0x00,0x68,0xc0,0xc0]
+s_load_dwordx8 s[20:27], flat_scratch, s2
+// CHECK: [0x02,0x68,0xca,0xc0]
 
-s_load_dwordx8 s[0:7], vcc, s0
-// CHECK: [0x00,0x6a,0xc0,0xc0]
+s_load_dwordx8 s[20:27], vcc, s2
+// CHECK: [0x02,0x6a,0xca,0xc0]
 
-s_load_dwordx8 s[0:7], tba, s0
-// CHECK: [0x00,0x6c,0xc0,0xc0]
+s_load_dwordx8 s[20:27], tba, s2
+// CHECK: [0x02,0x6c,0xca,0xc0]
 
-s_load_dwordx8 s[0:7], tma, s0
-// CHECK: [0x00,0x6e,0xc0,0xc0]
+s_load_dwordx8 s[20:27], tma, s2
+// CHECK: [0x02,0x6e,0xca,0xc0]
 
-s_load_dwordx8 s[0:7], ttmp[10:11], s0
-// CHECK: [0x00,0x7a,0xc0,0xc0]
+s_load_dwordx8 s[20:27], ttmp[10:11], s2
+// CHECK: [0x02,0x7a,0xca,0xc0]
 
-s_load_dwordx8 s[0:7], s[0:1], s103
-// CHECK: [0x67,0x00,0xc0,0xc0]
+s_load_dwordx8 s[20:27], s[2:3], s103
+// CHECK: [0x67,0x02,0xca,0xc0]
 
-s_load_dwordx8 s[0:7], s[0:1], flat_scratch_lo
-// CHECK: [0x68,0x00,0xc0,0xc0]
+s_load_dwordx8 s[20:27], s[2:3], flat_scratch_lo
+// CHECK: [0x68,0x02,0xca,0xc0]
 
-s_load_dwordx8 s[0:7], s[0:1], flat_scratch_hi
-// CHECK: [0x69,0x00,0xc0,0xc0]
+s_load_dwordx8 s[20:27], s[2:3], flat_scratch_hi
+// CHECK: [0x69,0x02,0xca,0xc0]
 
-s_load_dwordx8 s[0:7], s[0:1], vcc_lo
-// CHECK: [0x6a,0x00,0xc0,0xc0]
+s_load_dwordx8 s[20:27], s[2:3], vcc_lo
+// CHECK: [0x6a,0x02,0xca,0xc0]
 
-s_load_dwordx8 s[0:7], s[0:1], vcc_hi
-// CHECK: [0x6b,0x00,0xc0,0xc0]
+s_load_dwordx8 s[20:27], s[2:3], vcc_hi
+// CHECK: [0x6b,0x02,0xca,0xc0]
 
-s_load_dwordx8 s[0:7], s[0:1], tba_lo
-// CHECK: [0x6c,0x00,0xc0,0xc0]
+s_load_dwordx8 s[20:27], s[2:3], tba_lo
+// CHECK: [0x6c,0x02,0xca,0xc0]
 
-s_load_dwordx8 s[0:7], s[0:1], tba_hi
-// CHECK: [0x6d,0x00,0xc0,0xc0]
+s_load_dwordx8 s[20:27], s[2:3], tba_hi
+// CHECK: [0x6d,0x02,0xca,0xc0]
 
-s_load_dwordx8 s[0:7], s[0:1], tma_lo
-// CHECK: [0x6e,0x00,0xc0,0xc0]
+s_load_dwordx8 s[20:27], s[2:3], tma_lo
+// CHECK: [0x6e,0x02,0xca,0xc0]
 
-s_load_dwordx8 s[0:7], s[0:1], tma_hi
-// CHECK: [0x6f,0x00,0xc0,0xc0]
+s_load_dwordx8 s[20:27], s[2:3], tma_hi
+// CHECK: [0x6f,0x02,0xca,0xc0]
 
-s_load_dwordx8 s[0:7], s[0:1], ttmp11
-// CHECK: [0x7b,0x00,0xc0,0xc0]
+s_load_dwordx8 s[20:27], s[2:3], ttmp11
+// CHECK: [0x7b,0x02,0xca,0xc0]
 
-s_load_dwordx8 s[0:7], s[0:1], 0xaf123456
-// CHECK: [0xff,0x00,0xc0,0xc0,0x56,0x34,0x12,0xaf]
+s_load_dwordx8 s[20:27], s[2:3], 0xaf123456
+// CHECK: [0xff,0x02,0xca,0xc0,0x56,0x34,0x12,0xaf]
 
-s_load_dwordx8 s[0:7], s[0:1], 0x3f717273
-// CHECK: [0xff,0x00,0xc0,0xc0,0x73,0x72,0x71,0x3f]
+s_load_dwordx8 s[20:27], s[2:3], 0x3f717273
+// CHECK: [0xff,0x02,0xca,0xc0,0x73,0x72,0x71,0x3f]
 
-s_load_dwordx8 s[0:7], s[0:1], 0x7f
-// CHECK: [0x7f,0x01,0xc0,0xc0]
+s_load_dwordx8 s[20:27], s[2:3], 0x7f
+// CHECK: [0x7f,0x03,0xca,0xc0]
 
-s_load_dwordx16 s[0:15], s[0:1], s0
-// CHECK: [0x00,0x00,0x00,0xc1]
+s_load_dwordx16 s[20:35], s[2:3], s2
+// CHECK: [0x02,0x02,0x0a,0xc1]
 
-s_load_dwordx16 s[4:19], s[0:1], s0
-// CHECK: [0x00,0x00,0x02,0xc1]
+s_load_dwordx16 s[24:39], s[2:3], s2
+// CHECK: [0x02,0x02,0x0c,0xc1]
 
-s_load_dwordx16 s[88:103], s[0:1], s0
-// CHECK: [0x00,0x00,0x2c,0xc1]
+s_load_dwordx16 s[88:103], s[2:3], s2
+// CHECK: [0x02,0x02,0x2c,0xc1]
 
-s_load_dwordx16 s[0:15], s[2:3], s0
-// CHECK: [0x00,0x02,0x00,0xc1]
+s_load_dwordx16 s[20:35], s[4:5], s2
+// CHECK: [0x02,0x04,0x0a,0xc1]
 
-s_load_dwordx16 s[0:15], s[102:103], s0
-// CHECK: [0x00,0x66,0x00,0xc1]
+s_load_dwordx16 s[20:35], s[102:103], s2
+// CHECK: [0x02,0x66,0x0a,0xc1]
 
-s_load_dwordx16 s[0:15], flat_scratch, s0
-// CHECK: [0x00,0x68,0x00,0xc1]
+s_load_dwordx16 s[20:35], flat_scratch, s2
+// CHECK: [0x02,0x68,0x0a,0xc1]
 
-s_load_dwordx16 s[0:15], vcc, s0
-// CHECK: [0x00,0x6a,0x00,0xc1]
+s_load_dwordx16 s[20:35], vcc, s2
+// CHECK: [0x02,0x6a,0x0a,0xc1]
 
-s_load_dwordx16 s[0:15], tba, s0
-// CHECK: [0x00,0x6c,0x00,0xc1]
+s_load_dwordx16 s[20:35], tba, s2
+// CHECK: [0x02,0x6c,0x0a,0xc1]
 
-s_load_dwordx16 s[0:15], tma, s0
-// CHECK: [0x00,0x6e,0x00,0xc1]
+s_load_dwordx16 s[20:35], tma, s2
+// CHECK: [0x02,0x6e,0x0a,0xc1]
 
-s_load_dwordx16 s[0:15], ttmp[10:11], s0
-// CHECK: [0x00,0x7a,0x00,0xc1]
+s_load_dwordx16 s[20:35], ttmp[10:11], s2
+// CHECK: [0x02,0x7a,0x0a,0xc1]
 
-s_load_dwordx16 s[0:15], s[0:1], s103
-// CHECK: [0x67,0x00,0x00,0xc1]
+s_load_dwordx16 s[20:35], s[2:3], s103
+// CHECK: [0x67,0x02,0x0a,0xc1]
 
-s_load_dwordx16 s[0:15], s[0:1], flat_scratch_lo
-// CHECK: [0x68,0x00,0x00,0xc1]
+s_load_dwordx16 s[20:35], s[2:3], flat_scratch_lo
+// CHECK: [0x68,0x02,0x0a,0xc1]
 
-s_load_dwordx16 s[0:15], s[0:1], flat_scratch_hi
-// CHECK: [0x69,0x00,0x00,0xc1]
+s_load_dwordx16 s[20:35], s[2:3], flat_scratch_hi
+// CHECK: [0x69,0x02,0x0a,0xc1]
 
-s_load_dwordx16 s[0:15], s[0:1], vcc_lo
-// CHECK: [0x6a,0x00,0x00,0xc1]
+s_load_dwordx16 s[20:35], s[2:3], vcc_lo
+// CHECK: [0x6a,0x02,0x0a,0xc1]
 
-s_load_dwordx16 s[0:15], s[0:1], vcc_hi
-// CHECK: [0x6b,0x00,0x00,0xc1]
+s_load_dwordx16 s[20:35], s[2:3], vcc_hi
+// CHECK: [0x6b,0x02,0x0a,0xc1]
 
-s_load_dwordx16 s[0:15], s[0:1], tba_lo
-// CHECK: [0x6c,0x00,0x00,0xc1]
+s_load_dwordx16 s[20:35], s[2:3], tba_lo
+// CHECK: [0x6c,0x02,0x0a,0xc1]
 
-s_load_dwordx16 s[0:15], s[0:1], tba_hi
-// CHECK: [0x6d,0x00,0x00,0xc1]
+s_load_dwordx16 s[20:35], s[2:3], tba_hi
+// CHECK: [0x6d,0x02,0x0a,0xc1]
 
-s_load_dwordx16 s[0:15], s[0:1], tma_lo
-// CHECK: [0x6e,0x00,0x00,0xc1]
+s_load_dwordx16 s[20:35], s[2:3], tma_lo
+// CHECK: [0x6e,0x02,0x0a,0xc1]
 
-s_load_dwordx16 s[0:15], s[0:1], tma_hi
-// CHECK: [0x6f,0x00,0x00,0xc1]
+s_load_dwordx16 s[20:35], s[2:3], tma_hi
+// CHECK: [0x6f,0x02,0x0a,0xc1]
 
-s_load_dwordx16 s[0:15], s[0:1], ttmp11
-// CHECK: [0x7b,0x00,0x00,0xc1]
+s_load_dwordx16 s[20:35], s[2:3], ttmp11
+// CHECK: [0x7b,0x02,0x0a,0xc1]
 
-s_load_dwordx16 s[0:15], s[0:1], 0xaf123456
-// CHECK: [0xff,0x00,0x00,0xc1,0x56,0x34,0x12,0xaf]
+s_load_dwordx16 s[20:35], s[2:3], 0xaf123456
+// CHECK: [0xff,0x02,0x0a,0xc1,0x56,0x34,0x12,0xaf]
 
-s_load_dwordx16 s[0:15], s[0:1], 0x3f717273
-// CHECK: [0xff,0x00,0x00,0xc1,0x73,0x72,0x71,0x3f]
+s_load_dwordx16 s[20:35], s[2:3], 0x3f717273
+// CHECK: [0xff,0x02,0x0a,0xc1,0x73,0x72,0x71,0x3f]
 
-s_load_dwordx16 s[0:15], s[0:1], 0x7f
-// CHECK: [0x7f,0x01,0x00,0xc1]
+s_load_dwordx16 s[20:35], s[2:3], 0x7f
+// CHECK: [0x7f,0x03,0x0a,0xc1]
 
-s_buffer_load_dword s0, s[0:3], s0
-// CHECK: [0x00,0x00,0x00,0xc2]
+s_buffer_load_dword s5, s[4:7], s2
+// CHECK: [0x02,0x84,0x02,0xc2]
 
-s_buffer_load_dword s103, s[0:3], s0
-// CHECK: [0x00,0x80,0x33,0xc2]
+s_buffer_load_dword s103, s[4:7], s2
+// CHECK: [0x02,0x84,0x33,0xc2]
 
-s_buffer_load_dword vcc_lo, s[0:3], s0
-// CHECK: [0x00,0x00,0x35,0xc2]
+s_buffer_load_dword vcc_lo, s[4:7], s2
+// CHECK: [0x02,0x04,0x35,0xc2]
 
-s_buffer_load_dword vcc_hi, s[0:3], s0
-// CHECK: [0x00,0x80,0x35,0xc2]
+s_buffer_load_dword vcc_hi, s[4:7], s2
+// CHECK: [0x02,0x84,0x35,0xc2]
 
-s_buffer_load_dword s0, s[4:7], s0
-// CHECK: [0x00,0x04,0x00,0xc2]
+s_buffer_load_dword s5, s[8:11], s2
+// CHECK: [0x02,0x88,0x02,0xc2]
 
-s_buffer_load_dword s0, s[100:103], s0
-// CHECK: [0x00,0x64,0x00,0xc2]
+s_buffer_load_dword s5, s[100:103], s2
+// CHECK: [0x02,0xe4,0x02,0xc2]
 
-s_buffer_load_dword s0, ttmp[8:11], s0
-// CHECK: [0x00,0x78,0x00,0xc2]
+s_buffer_load_dword s5, ttmp[8:11], s2
+// CHECK: [0x02,0xf8,0x02,0xc2]
 
-s_buffer_load_dword s0, s[0:3], s103
-// CHECK: [0x67,0x00,0x00,0xc2]
+s_buffer_load_dword s5, s[4:7], s103
+// CHECK: [0x67,0x84,0x02,0xc2]
 
-s_buffer_load_dword s0, s[0:3], flat_scratch_lo
-// CHECK: [0x68,0x00,0x00,0xc2]
+s_buffer_load_dword s5, s[4:7], flat_scratch_lo
+// CHECK: [0x68,0x84,0x02,0xc2]
 
-s_buffer_load_dword s0, s[0:3], flat_scratch_hi
-// CHECK: [0x69,0x00,0x00,0xc2]
+s_buffer_load_dword s5, s[4:7], flat_scratch_hi
+// CHECK: [0x69,0x84,0x02,0xc2]
 
-s_buffer_load_dword s0, s[0:3], vcc_lo
-// CHECK: [0x6a,0x00,0x00,0xc2]
+s_buffer_load_dword s5, s[4:7], vcc_lo
+// CHECK: [0x6a,0x84,0x02,0xc2]
 
-s_buffer_load_dword s0, s[0:3], vcc_hi
-// CHECK: [0x6b,0x00,0x00,0xc2]
+s_buffer_load_dword s5, s[4:7], vcc_hi
+// CHECK: [0x6b,0x84,0x02,0xc2]
 
-s_buffer_load_dword s0, s[0:3], tba_lo
-// CHECK: [0x6c,0x00,0x00,0xc2]
+s_buffer_load_dword s5, s[4:7], tba_lo
+// CHECK: [0x6c,0x84,0x02,0xc2]
 
-s_buffer_load_dword s0, s[0:3], tba_hi
-// CHECK: [0x6d,0x00,0x00,0xc2]
+s_buffer_load_dword s5, s[4:7], tba_hi
+// CHECK: [0x6d,0x84,0x02,0xc2]
 
-s_buffer_load_dword s0, s[0:3], tma_lo
-// CHECK: [0x6e,0x00,0x00,0xc2]
+s_buffer_load_dword s5, s[4:7], tma_lo
+// CHECK: [0x6e,0x84,0x02,0xc2]
 
-s_buffer_load_dword s0, s[0:3], tma_hi
-// CHECK: [0x6f,0x00,0x00,0xc2]
+s_buffer_load_dword s5, s[4:7], tma_hi
+// CHECK: [0x6f,0x84,0x02,0xc2]
 
-s_buffer_load_dword s0, s[0:3], ttmp11
-// CHECK: [0x7b,0x00,0x00,0xc2]
+s_buffer_load_dword s5, s[4:7], ttmp11
+// CHECK: [0x7b,0x84,0x02,0xc2]
 
-s_buffer_load_dword s0, s[0:3], 0xaf123456
-// CHECK: [0xff,0x00,0x00,0xc2,0x56,0x34,0x12,0xaf]
+s_buffer_load_dword s5, s[4:7], 0xaf123456
+// CHECK: [0xff,0x84,0x02,0xc2,0x56,0x34,0x12,0xaf]
 
-s_buffer_load_dword s0, s[0:3], 0x3f717273
-// CHECK: [0xff,0x00,0x00,0xc2,0x73,0x72,0x71,0x3f]
+s_buffer_load_dword s5, s[4:7], 0x3f717273
+// CHECK: [0xff,0x84,0x02,0xc2,0x73,0x72,0x71,0x3f]
 
-s_buffer_load_dword s0, s[0:3], 0x7f
-// CHECK: [0x7f,0x01,0x00,0xc2]
+s_buffer_load_dword s5, s[4:7], 0x7f
+// CHECK: [0x7f,0x85,0x02,0xc2]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], s0
-// CHECK: [0x00,0x00,0x40,0xc2]
+s_buffer_load_dwordx2 s[10:11], s[4:7], s2
+// CHECK: [0x02,0x04,0x45,0xc2]
 
-s_buffer_load_dwordx2 s[2:3], s[0:3], s0
-// CHECK: [0x00,0x00,0x41,0xc2]
+s_buffer_load_dwordx2 s[12:13], s[4:7], s2
+// CHECK: [0x02,0x04,0x46,0xc2]
 
-s_buffer_load_dwordx2 s[102:103], s[0:3], s0
-// CHECK: [0x00,0x00,0x73,0xc2]
+s_buffer_load_dwordx2 s[102:103], s[4:7], s2
+// CHECK: [0x02,0x04,0x73,0xc2]
 
-s_buffer_load_dwordx2 vcc, s[0:3], s0
-// CHECK: [0x00,0x00,0x75,0xc2]
+s_buffer_load_dwordx2 vcc, s[4:7], s2
+// CHECK: [0x02,0x04,0x75,0xc2]
 
-s_buffer_load_dwordx2 s[0:1], s[4:7], s0
-// CHECK: [0x00,0x04,0x40,0xc2]
+s_buffer_load_dwordx2 s[10:11], s[8:11], s2
+// CHECK: [0x02,0x08,0x45,0xc2]
 
-s_buffer_load_dwordx2 s[0:1], s[100:103], s0
-// CHECK: [0x00,0x64,0x40,0xc2]
+s_buffer_load_dwordx2 s[10:11], s[100:103], s2
+// CHECK: [0x02,0x64,0x45,0xc2]
 
-s_buffer_load_dwordx2 s[0:1], ttmp[8:11], s0
-// CHECK: [0x00,0x78,0x40,0xc2]
+s_buffer_load_dwordx2 s[10:11], ttmp[8:11], s2
+// CHECK: [0x02,0x78,0x45,0xc2]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], s103
-// CHECK: [0x67,0x00,0x40,0xc2]
+s_buffer_load_dwordx2 s[10:11], s[4:7], s103
+// CHECK: [0x67,0x04,0x45,0xc2]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], flat_scratch_lo
-// CHECK: [0x68,0x00,0x40,0xc2]
+s_buffer_load_dwordx2 s[10:11], s[4:7], flat_scratch_lo
+// CHECK: [0x68,0x04,0x45,0xc2]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], flat_scratch_hi
-// CHECK: [0x69,0x00,0x40,0xc2]
+s_buffer_load_dwordx2 s[10:11], s[4:7], flat_scratch_hi
+// CHECK: [0x69,0x04,0x45,0xc2]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], vcc_lo
-// CHECK: [0x6a,0x00,0x40,0xc2]
+s_buffer_load_dwordx2 s[10:11], s[4:7], vcc_lo
+// CHECK: [0x6a,0x04,0x45,0xc2]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], vcc_hi
-// CHECK: [0x6b,0x00,0x40,0xc2]
+s_buffer_load_dwordx2 s[10:11], s[4:7], vcc_hi
+// CHECK: [0x6b,0x04,0x45,0xc2]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], tba_lo
-// CHECK: [0x6c,0x00,0x40,0xc2]
+s_buffer_load_dwordx2 s[10:11], s[4:7], tba_lo
+// CHECK: [0x6c,0x04,0x45,0xc2]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], tba_hi
-// CHECK: [0x6d,0x00,0x40,0xc2]
+s_buffer_load_dwordx2 s[10:11], s[4:7], tba_hi
+// CHECK: [0x6d,0x04,0x45,0xc2]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], tma_lo
-// CHECK: [0x6e,0x00,0x40,0xc2]
+s_buffer_load_dwordx2 s[10:11], s[4:7], tma_lo
+// CHECK: [0x6e,0x04,0x45,0xc2]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], tma_hi
-// CHECK: [0x6f,0x00,0x40,0xc2]
+s_buffer_load_dwordx2 s[10:11], s[4:7], tma_hi
+// CHECK: [0x6f,0x04,0x45,0xc2]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], ttmp11
-// CHECK: [0x7b,0x00,0x40,0xc2]
+s_buffer_load_dwordx2 s[10:11], s[4:7], ttmp11
+// CHECK: [0x7b,0x04,0x45,0xc2]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], 0xaf123456
-// CHECK: [0xff,0x00,0x40,0xc2,0x56,0x34,0x12,0xaf]
+s_buffer_load_dwordx2 s[10:11], s[4:7], 0xaf123456
+// CHECK: [0xff,0x04,0x45,0xc2,0x56,0x34,0x12,0xaf]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], 0x3f717273
-// CHECK: [0xff,0x00,0x40,0xc2,0x73,0x72,0x71,0x3f]
+s_buffer_load_dwordx2 s[10:11], s[4:7], 0x3f717273
+// CHECK: [0xff,0x04,0x45,0xc2,0x73,0x72,0x71,0x3f]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], 0x7f
-// CHECK: [0x7f,0x01,0x40,0xc2]
+s_buffer_load_dwordx2 s[10:11], s[4:7], 0x7f
+// CHECK: [0x7f,0x05,0x45,0xc2]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], s0
-// CHECK: [0x00,0x00,0x80,0xc2]
+s_buffer_load_dwordx4 s[20:23], s[4:7], s2
+// CHECK: [0x02,0x04,0x8a,0xc2]
 
-s_buffer_load_dwordx4 s[4:7], s[0:3], s0
-// CHECK: [0x00,0x00,0x82,0xc2]
+s_buffer_load_dwordx4 s[24:27], s[4:7], s2
+// CHECK: [0x02,0x04,0x8c,0xc2]
 
-s_buffer_load_dwordx4 s[100:103], s[0:3], s0
-// CHECK: [0x00,0x00,0xb2,0xc2]
+s_buffer_load_dwordx4 s[100:103], s[4:7], s2
+// CHECK: [0x02,0x04,0xb2,0xc2]
 
-s_buffer_load_dwordx4 s[0:3], s[4:7], s0
-// CHECK: [0x00,0x04,0x80,0xc2]
+s_buffer_load_dwordx4 s[20:23], s[8:11], s2
+// CHECK: [0x02,0x08,0x8a,0xc2]
 
-s_buffer_load_dwordx4 s[0:3], s[100:103], s0
-// CHECK: [0x00,0x64,0x80,0xc2]
+s_buffer_load_dwordx4 s[20:23], s[100:103], s2
+// CHECK: [0x02,0x64,0x8a,0xc2]
 
-s_buffer_load_dwordx4 s[0:3], ttmp[8:11], s0
-// CHECK: [0x00,0x78,0x80,0xc2]
+s_buffer_load_dwordx4 s[20:23], ttmp[8:11], s2
+// CHECK: [0x02,0x78,0x8a,0xc2]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], s103
-// CHECK: [0x67,0x00,0x80,0xc2]
+s_buffer_load_dwordx4 s[20:23], s[4:7], s103
+// CHECK: [0x67,0x04,0x8a,0xc2]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], flat_scratch_lo
-// CHECK: [0x68,0x00,0x80,0xc2]
+s_buffer_load_dwordx4 s[20:23], s[4:7], flat_scratch_lo
+// CHECK: [0x68,0x04,0x8a,0xc2]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], flat_scratch_hi
-// CHECK: [0x69,0x00,0x80,0xc2]
+s_buffer_load_dwordx4 s[20:23], s[4:7], flat_scratch_hi
+// CHECK: [0x69,0x04,0x8a,0xc2]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], vcc_lo
-// CHECK: [0x6a,0x00,0x80,0xc2]
+s_buffer_load_dwordx4 s[20:23], s[4:7], vcc_lo
+// CHECK: [0x6a,0x04,0x8a,0xc2]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], vcc_hi
-// CHECK: [0x6b,0x00,0x80,0xc2]
+s_buffer_load_dwordx4 s[20:23], s[4:7], vcc_hi
+// CHECK: [0x6b,0x04,0x8a,0xc2]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], tba_lo
-// CHECK: [0x6c,0x00,0x80,0xc2]
+s_buffer_load_dwordx4 s[20:23], s[4:7], tba_lo
+// CHECK: [0x6c,0x04,0x8a,0xc2]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], tba_hi
-// CHECK: [0x6d,0x00,0x80,0xc2]
+s_buffer_load_dwordx4 s[20:23], s[4:7], tba_hi
+// CHECK: [0x6d,0x04,0x8a,0xc2]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], tma_lo
-// CHECK: [0x6e,0x00,0x80,0xc2]
+s_buffer_load_dwordx4 s[20:23], s[4:7], tma_lo
+// CHECK: [0x6e,0x04,0x8a,0xc2]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], tma_hi
-// CHECK: [0x6f,0x00,0x80,0xc2]
+s_buffer_load_dwordx4 s[20:23], s[4:7], tma_hi
+// CHECK: [0x6f,0x04,0x8a,0xc2]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], ttmp11
-// CHECK: [0x7b,0x00,0x80,0xc2]
+s_buffer_load_dwordx4 s[20:23], s[4:7], ttmp11
+// CHECK: [0x7b,0x04,0x8a,0xc2]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], 0xaf123456
-// CHECK: [0xff,0x00,0x80,0xc2,0x56,0x34,0x12,0xaf]
+s_buffer_load_dwordx4 s[20:23], s[4:7], 0xaf123456
+// CHECK: [0xff,0x04,0x8a,0xc2,0x56,0x34,0x12,0xaf]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], 0x3f717273
-// CHECK: [0xff,0x00,0x80,0xc2,0x73,0x72,0x71,0x3f]
+s_buffer_load_dwordx4 s[20:23], s[4:7], 0x3f717273
+// CHECK: [0xff,0x04,0x8a,0xc2,0x73,0x72,0x71,0x3f]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], 0x7f
-// CHECK: [0x7f,0x01,0x80,0xc2]
+s_buffer_load_dwordx4 s[20:23], s[4:7], 0x7f
+// CHECK: [0x7f,0x05,0x8a,0xc2]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], s0
-// CHECK: [0x00,0x00,0xc0,0xc2]
+s_buffer_load_dwordx8 s[20:27], s[4:7], s2
+// CHECK: [0x02,0x04,0xca,0xc2]
 
-s_buffer_load_dwordx8 s[4:11], s[0:3], s0
-// CHECK: [0x00,0x00,0xc2,0xc2]
+s_buffer_load_dwordx8 s[24:31], s[4:7], s2
+// CHECK: [0x02,0x04,0xcc,0xc2]
 
-s_buffer_load_dwordx8 s[96:103], s[0:3], s0
-// CHECK: [0x00,0x00,0xf0,0xc2]
+s_buffer_load_dwordx8 s[96:103], s[4:7], s2
+// CHECK: [0x02,0x04,0xf0,0xc2]
 
-s_buffer_load_dwordx8 s[0:7], s[4:7], s0
-// CHECK: [0x00,0x04,0xc0,0xc2]
+s_buffer_load_dwordx8 s[20:27], s[8:11], s2
+// CHECK: [0x02,0x08,0xca,0xc2]
 
-s_buffer_load_dwordx8 s[0:7], s[100:103], s0
-// CHECK: [0x00,0x64,0xc0,0xc2]
+s_buffer_load_dwordx8 s[20:27], s[100:103], s2
+// CHECK: [0x02,0x64,0xca,0xc2]
 
-s_buffer_load_dwordx8 s[0:7], ttmp[8:11], s0
-// CHECK: [0x00,0x78,0xc0,0xc2]
+s_buffer_load_dwordx8 s[20:27], ttmp[8:11], s2
+// CHECK: [0x02,0x78,0xca,0xc2]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], s103
-// CHECK: [0x67,0x00,0xc0,0xc2]
+s_buffer_load_dwordx8 s[20:27], s[4:7], s103
+// CHECK: [0x67,0x04,0xca,0xc2]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], flat_scratch_lo
-// CHECK: [0x68,0x00,0xc0,0xc2]
+s_buffer_load_dwordx8 s[20:27], s[4:7], flat_scratch_lo
+// CHECK: [0x68,0x04,0xca,0xc2]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], flat_scratch_hi
-// CHECK: [0x69,0x00,0xc0,0xc2]
+s_buffer_load_dwordx8 s[20:27], s[4:7], flat_scratch_hi
+// CHECK: [0x69,0x04,0xca,0xc2]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], vcc_lo
-// CHECK: [0x6a,0x00,0xc0,0xc2]
+s_buffer_load_dwordx8 s[20:27], s[4:7], vcc_lo
+// CHECK: [0x6a,0x04,0xca,0xc2]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], vcc_hi
-// CHECK: [0x6b,0x00,0xc0,0xc2]
+s_buffer_load_dwordx8 s[20:27], s[4:7], vcc_hi
+// CHECK: [0x6b,0x04,0xca,0xc2]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], tba_lo
-// CHECK: [0x6c,0x00,0xc0,0xc2]
+s_buffer_load_dwordx8 s[20:27], s[4:7], tba_lo
+// CHECK: [0x6c,0x04,0xca,0xc2]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], tba_hi
-// CHECK: [0x6d,0x00,0xc0,0xc2]
+s_buffer_load_dwordx8 s[20:27], s[4:7], tba_hi
+// CHECK: [0x6d,0x04,0xca,0xc2]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], tma_lo
-// CHECK: [0x6e,0x00,0xc0,0xc2]
+s_buffer_load_dwordx8 s[20:27], s[4:7], tma_lo
+// CHECK: [0x6e,0x04,0xca,0xc2]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], tma_hi
-// CHECK: [0x6f,0x00,0xc0,0xc2]
+s_buffer_load_dwordx8 s[20:27], s[4:7], tma_hi
+// CHECK: [0x6f,0x04,0xca,0xc2]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], ttmp11
-// CHECK: [0x7b,0x00,0xc0,0xc2]
+s_buffer_load_dwordx8 s[20:27], s[4:7], ttmp11
+// CHECK: [0x7b,0x04,0xca,0xc2]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], 0xaf123456
-// CHECK: [0xff,0x00,0xc0,0xc2,0x56,0x34,0x12,0xaf]
+s_buffer_load_dwordx8 s[20:27], s[4:7], 0xaf123456
+// CHECK: [0xff,0x04,0xca,0xc2,0x56,0x34,0x12,0xaf]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], 0x3f717273
-// CHECK: [0xff,0x00,0xc0,0xc2,0x73,0x72,0x71,0x3f]
+s_buffer_load_dwordx8 s[20:27], s[4:7], 0x3f717273
+// CHECK: [0xff,0x04,0xca,0xc2,0x73,0x72,0x71,0x3f]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], 0x7f
-// CHECK: [0x7f,0x01,0xc0,0xc2]
+s_buffer_load_dwordx8 s[20:27], s[4:7], 0x7f
+// CHECK: [0x7f,0x05,0xca,0xc2]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], s0
-// CHECK: [0x00,0x00,0x00,0xc3]
+s_buffer_load_dwordx16 s[20:35], s[4:7], s2
+// CHECK: [0x02,0x04,0x0a,0xc3]
 
-s_buffer_load_dwordx16 s[4:19], s[0:3], s0
-// CHECK: [0x00,0x00,0x02,0xc3]
+s_buffer_load_dwordx16 s[24:39], s[4:7], s2
+// CHECK: [0x02,0x04,0x0c,0xc3]
 
-s_buffer_load_dwordx16 s[88:103], s[0:3], s0
-// CHECK: [0x00,0x00,0x2c,0xc3]
+s_buffer_load_dwordx16 s[88:103], s[4:7], s2
+// CHECK: [0x02,0x04,0x2c,0xc3]
 
-s_buffer_load_dwordx16 s[0:15], s[4:7], s0
-// CHECK: [0x00,0x04,0x00,0xc3]
+s_buffer_load_dwordx16 s[20:35], s[8:11], s2
+// CHECK: [0x02,0x08,0x0a,0xc3]
 
-s_buffer_load_dwordx16 s[0:15], s[100:103], s0
-// CHECK: [0x00,0x64,0x00,0xc3]
+s_buffer_load_dwordx16 s[20:35], s[100:103], s2
+// CHECK: [0x02,0x64,0x0a,0xc3]
 
-s_buffer_load_dwordx16 s[0:15], ttmp[8:11], s0
-// CHECK: [0x00,0x78,0x00,0xc3]
+s_buffer_load_dwordx16 s[20:35], ttmp[8:11], s2
+// CHECK: [0x02,0x78,0x0a,0xc3]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], s103
-// CHECK: [0x67,0x00,0x00,0xc3]
+s_buffer_load_dwordx16 s[20:35], s[4:7], s103
+// CHECK: [0x67,0x04,0x0a,0xc3]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], flat_scratch_lo
-// CHECK: [0x68,0x00,0x00,0xc3]
+s_buffer_load_dwordx16 s[20:35], s[4:7], flat_scratch_lo
+// CHECK: [0x68,0x04,0x0a,0xc3]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], flat_scratch_hi
-// CHECK: [0x69,0x00,0x00,0xc3]
+s_buffer_load_dwordx16 s[20:35], s[4:7], flat_scratch_hi
+// CHECK: [0x69,0x04,0x0a,0xc3]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], vcc_lo
-// CHECK: [0x6a,0x00,0x00,0xc3]
+s_buffer_load_dwordx16 s[20:35], s[4:7], vcc_lo
+// CHECK: [0x6a,0x04,0x0a,0xc3]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], vcc_hi
-// CHECK: [0x6b,0x00,0x00,0xc3]
+s_buffer_load_dwordx16 s[20:35], s[4:7], vcc_hi
+// CHECK: [0x6b,0x04,0x0a,0xc3]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], tba_lo
-// CHECK: [0x6c,0x00,0x00,0xc3]
+s_buffer_load_dwordx16 s[20:35], s[4:7], tba_lo
+// CHECK: [0x6c,0x04,0x0a,0xc3]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], tba_hi
-// CHECK: [0x6d,0x00,0x00,0xc3]
+s_buffer_load_dwordx16 s[20:35], s[4:7], tba_hi
+// CHECK: [0x6d,0x04,0x0a,0xc3]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], tma_lo
-// CHECK: [0x6e,0x00,0x00,0xc3]
+s_buffer_load_dwordx16 s[20:35], s[4:7], tma_lo
+// CHECK: [0x6e,0x04,0x0a,0xc3]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], tma_hi
-// CHECK: [0x6f,0x00,0x00,0xc3]
+s_buffer_load_dwordx16 s[20:35], s[4:7], tma_hi
+// CHECK: [0x6f,0x04,0x0a,0xc3]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], ttmp11
-// CHECK: [0x7b,0x00,0x00,0xc3]
+s_buffer_load_dwordx16 s[20:35], s[4:7], ttmp11
+// CHECK: [0x7b,0x04,0x0a,0xc3]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], 0xaf123456
-// CHECK: [0xff,0x00,0x00,0xc3,0x56,0x34,0x12,0xaf]
+s_buffer_load_dwordx16 s[20:35], s[4:7], 0xaf123456
+// CHECK: [0xff,0x04,0x0a,0xc3,0x56,0x34,0x12,0xaf]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], 0x3f717273
-// CHECK: [0xff,0x00,0x00,0xc3,0x73,0x72,0x71,0x3f]
+s_buffer_load_dwordx16 s[20:35], s[4:7], 0x3f717273
+// CHECK: [0xff,0x04,0x0a,0xc3,0x73,0x72,0x71,0x3f]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], 0x7f
-// CHECK: [0x7f,0x01,0x00,0xc3]
+s_buffer_load_dwordx16 s[20:35], s[4:7], 0x7f
+// CHECK: [0x7f,0x05,0x0a,0xc3]
 
 s_dcache_inv_vol
 // CHECK: [0x00,0x00,0x40,0xc7]
 
-s_memtime s[0:1]
-// CHECK: [0x00,0x00,0x80,0xc7]
+s_memtime s[10:11]
+// CHECK: [0x00,0x00,0x85,0xc7]
 
-s_memtime s[2:3]
-// CHECK: [0x00,0x00,0x81,0xc7]
+s_memtime s[12:13]
+// CHECK: [0x00,0x00,0x86,0xc7]
 
 s_memtime s[102:103]
 // CHECK: [0x00,0x00,0xb3,0xc7]
@@ -10080,2513 +10106,2501 @@ s_memtime vcc
 s_dcache_inv
 // CHECK: [0x00,0x00,0xc0,0xc7]
 
-s_mov_b32 s0, s0
-// CHECK: [0x00,0x03,0x80,0xbe]
-
-s_mov_b32 s103, s0
-// CHECK: [0x00,0x03,0xe7,0xbe]
-
-s_mov_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x03,0xe8,0xbe]
-
-s_mov_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x03,0xe9,0xbe]
+s_mov_b32 s5, s1
+// CHECK: [0x01,0x03,0x85,0xbe]
 
-s_mov_b32 vcc_lo, s0
-// CHECK: [0x00,0x03,0xea,0xbe]
+s_mov_b32 s103, s1
+// CHECK: [0x01,0x03,0xe7,0xbe]
 
-s_mov_b32 vcc_hi, s0
-// CHECK: [0x00,0x03,0xeb,0xbe]
+s_mov_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x03,0xe8,0xbe]
 
-s_mov_b32 tba_lo, s0
-// CHECK: [0x00,0x03,0xec,0xbe]
+s_mov_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x03,0xe9,0xbe]
 
-s_mov_b32 tba_hi, s0
-// CHECK: [0x00,0x03,0xed,0xbe]
+s_mov_b32 vcc_lo, s1
+// CHECK: [0x01,0x03,0xea,0xbe]
 
-s_mov_b32 tma_lo, s0
-// CHECK: [0x00,0x03,0xee,0xbe]
+s_mov_b32 vcc_hi, s1
+// CHECK: [0x01,0x03,0xeb,0xbe]
 
-s_mov_b32 tma_hi, s0
-// CHECK: [0x00,0x03,0xef,0xbe]
+s_mov_b32 tba_lo, s1
+// CHECK: [0x01,0x03,0xec,0xbe]
 
-s_mov_b32 ttmp11, s0
-// CHECK: [0x00,0x03,0xfb,0xbe]
+s_mov_b32 tba_hi, s1
+// CHECK: [0x01,0x03,0xed,0xbe]
 
-s_mov_b32 m0, s0
-// CHECK: [0x00,0x03,0xfc,0xbe]
+s_mov_b32 tma_lo, s1
+// CHECK: [0x01,0x03,0xee,0xbe]
 
-s_mov_b32 exec_lo, s0
-// CHECK: [0x00,0x03,0xfe,0xbe]
+s_mov_b32 tma_hi, s1
+// CHECK: [0x01,0x03,0xef,0xbe]
 
-s_mov_b32 exec_hi, s0
-// CHECK: [0x00,0x03,0xff,0xbe]
+s_mov_b32 ttmp11, s1
+// CHECK: [0x01,0x03,0xfb,0xbe]
 
-s_mov_b32 s0, s103
-// CHECK: [0x67,0x03,0x80,0xbe]
+s_mov_b32 m0, s1
+// CHECK: [0x01,0x03,0xfc,0xbe]
 
-s_mov_b32 s0, flat_scratch_lo
-// CHECK: [0x68,0x03,0x80,0xbe]
+s_mov_b32 exec_lo, s1
+// CHECK: [0x01,0x03,0xfe,0xbe]
 
-s_mov_b32 s0, flat_scratch_hi
-// CHECK: [0x69,0x03,0x80,0xbe]
+s_mov_b32 exec_hi, s1
+// CHECK: [0x01,0x03,0xff,0xbe]
 
-s_mov_b32 s0, vcc_lo
-// CHECK: [0x6a,0x03,0x80,0xbe]
+s_mov_b32 s5, s103
+// CHECK: [0x67,0x03,0x85,0xbe]
 
-s_mov_b32 s0, vcc_hi
-// CHECK: [0x6b,0x03,0x80,0xbe]
+s_mov_b32 s5, flat_scratch_lo
+// CHECK: [0x68,0x03,0x85,0xbe]
 
-s_mov_b32 s0, tba_lo
-// CHECK: [0x6c,0x03,0x80,0xbe]
+s_mov_b32 s5, flat_scratch_hi
+// CHECK: [0x69,0x03,0x85,0xbe]
 
-s_mov_b32 s0, tba_hi
-// CHECK: [0x6d,0x03,0x80,0xbe]
+s_mov_b32 s5, vcc_lo
+// CHECK: [0x6a,0x03,0x85,0xbe]
 
-s_mov_b32 s0, tma_lo
-// CHECK: [0x6e,0x03,0x80,0xbe]
+s_mov_b32 s5, vcc_hi
+// CHECK: [0x6b,0x03,0x85,0xbe]
 
-s_mov_b32 s0, tma_hi
-// CHECK: [0x6f,0x03,0x80,0xbe]
+s_mov_b32 s5, tba_lo
+// CHECK: [0x6c,0x03,0x85,0xbe]
 
-s_mov_b32 s0, ttmp11
-// CHECK: [0x7b,0x03,0x80,0xbe]
+s_mov_b32 s5, tba_hi
+// CHECK: [0x6d,0x03,0x85,0xbe]
 
-s_mov_b32 s0, m0
-// CHECK: [0x7c,0x03,0x80,0xbe]
+s_mov_b32 s5, tma_lo
+// CHECK: [0x6e,0x03,0x85,0xbe]
 
-s_mov_b32 s0, exec_lo
-// CHECK: [0x7e,0x03,0x80,0xbe]
+s_mov_b32 s5, tma_hi
+// CHECK: [0x6f,0x03,0x85,0xbe]
 
-s_mov_b32 s0, exec_hi
-// CHECK: [0x7f,0x03,0x80,0xbe]
+s_mov_b32 s5, ttmp11
+// CHECK: [0x7b,0x03,0x85,0xbe]
 
-s_mov_b32 s0, 0
-// CHECK: [0x80,0x03,0x80,0xbe]
+s_mov_b32 s5, m0
+// CHECK: [0x7c,0x03,0x85,0xbe]
 
-s_mov_b32 s0, -1
-// CHECK: [0xc1,0x03,0x80,0xbe]
+s_mov_b32 s5, exec_lo
+// CHECK: [0x7e,0x03,0x85,0xbe]
 
-s_mov_b32 s0, 0.5
-// CHECK: [0xf0,0x03,0x80,0xbe]
+s_mov_b32 s5, exec_hi
+// CHECK: [0x7f,0x03,0x85,0xbe]
 
-s_mov_b32 s0, -4.0
-// CHECK: [0xf7,0x03,0x80,0xbe]
+s_mov_b32 s5, 0
+// CHECK: [0x80,0x03,0x85,0xbe]
 
-s_mov_b32 s0, 0xaf123456
-// CHECK: [0xff,0x03,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_mov_b32 s5, -1
+// CHECK: [0xc1,0x03,0x85,0xbe]
 
-s_mov_b32 s0, 0x3f717273
-// CHECK: [0xff,0x03,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_mov_b32 s5, 0.5
+// CHECK: [0xf0,0x03,0x85,0xbe]
 
-s_mov_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x04,0x80,0xbe]
+s_mov_b32 s5, -4.0
+// CHECK: [0xf7,0x03,0x85,0xbe]
 
-s_mov_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x04,0x82,0xbe]
+s_mov_b32 s5, 0xaf123456
+// CHECK: [0xff,0x03,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_mov_b64 s[102:103], s[0:1]
-// CHECK: [0x00,0x04,0xe6,0xbe]
+s_mov_b32 s5, 0x3f717273
+// CHECK: [0xff,0x03,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_mov_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x04,0xe8,0xbe]
+s_mov_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x04,0x8a,0xbe]
 
-s_mov_b64 vcc, s[0:1]
-// CHECK: [0x00,0x04,0xea,0xbe]
+s_mov_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x04,0x8c,0xbe]
 
-s_mov_b64 tba, s[0:1]
-// CHECK: [0x00,0x04,0xec,0xbe]
+s_mov_b64 s[102:103], s[2:3]
+// CHECK: [0x02,0x04,0xe6,0xbe]
 
-s_mov_b64 tma, s[0:1]
-// CHECK: [0x00,0x04,0xee,0xbe]
+s_mov_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x04,0xe8,0xbe]
 
-s_mov_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x04,0xfa,0xbe]
+s_mov_b64 vcc, s[2:3]
+// CHECK: [0x02,0x04,0xea,0xbe]
 
-s_mov_b64 exec, s[0:1]
-// CHECK: [0x00,0x04,0xfe,0xbe]
+s_mov_b64 tba, s[2:3]
+// CHECK: [0x02,0x04,0xec,0xbe]
 
-s_mov_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x04,0x80,0xbe]
+s_mov_b64 tma, s[2:3]
+// CHECK: [0x02,0x04,0xee,0xbe]
 
-s_mov_b64 s[0:1], s[102:103]
-// CHECK: [0x66,0x04,0x80,0xbe]
+s_mov_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x04,0xfa,0xbe]
 
-s_mov_b64 s[0:1], flat_scratch
-// CHECK: [0x68,0x04,0x80,0xbe]
+s_mov_b64 exec, s[2:3]
+// CHECK: [0x02,0x04,0xfe,0xbe]
 
-s_mov_b64 s[0:1], vcc
-// CHECK: [0x6a,0x04,0x80,0xbe]
+s_mov_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x04,0x8a,0xbe]
 
-s_mov_b64 s[0:1], tba
-// CHECK: [0x6c,0x04,0x80,0xbe]
+s_mov_b64 s[10:11], s[102:103]
+// CHECK: [0x66,0x04,0x8a,0xbe]
 
-s_mov_b64 s[0:1], tma
-// CHECK: [0x6e,0x04,0x80,0xbe]
+s_mov_b64 s[10:11], flat_scratch
+// CHECK: [0x68,0x04,0x8a,0xbe]
 
-s_mov_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x04,0x80,0xbe]
+s_mov_b64 s[10:11], vcc
+// CHECK: [0x6a,0x04,0x8a,0xbe]
 
-s_mov_b64 s[0:1], exec
-// CHECK: [0x7e,0x04,0x80,0xbe]
+s_mov_b64 s[10:11], tba
+// CHECK: [0x6c,0x04,0x8a,0xbe]
 
-s_mov_b64 s[0:1], 0
-// CHECK: [0x80,0x04,0x80,0xbe]
+s_mov_b64 s[10:11], tma
+// CHECK: [0x6e,0x04,0x8a,0xbe]
 
-s_mov_b64 s[0:1], -1
-// CHECK: [0xc1,0x04,0x80,0xbe]
+s_mov_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x04,0x8a,0xbe]
 
-s_mov_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x04,0x80,0xbe]
+s_mov_b64 s[10:11], exec
+// CHECK: [0x7e,0x04,0x8a,0xbe]
 
-s_mov_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x04,0x80,0xbe]
+s_mov_b64 s[10:11], 0
+// CHECK: [0x80,0x04,0x8a,0xbe]
 
-s_mov_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x04,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_mov_b64 s[10:11], -1
+// CHECK: [0xc1,0x04,0x8a,0xbe]
 
-s_mov_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x04,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_mov_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x04,0x8a,0xbe]
 
-s_cmov_b32 s0, s0
-// CHECK: [0x00,0x05,0x80,0xbe]
+s_mov_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x04,0x8a,0xbe]
 
-s_cmov_b32 s103, s0
-// CHECK: [0x00,0x05,0xe7,0xbe]
+s_mov_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x04,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_cmov_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x05,0xe8,0xbe]
+s_mov_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x04,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_cmov_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x05,0xe9,0xbe]
+s_cmov_b32 s5, s1
+// CHECK: [0x01,0x05,0x85,0xbe]
 
-s_cmov_b32 vcc_lo, s0
-// CHECK: [0x00,0x05,0xea,0xbe]
+s_cmov_b32 s103, s1
+// CHECK: [0x01,0x05,0xe7,0xbe]
 
-s_cmov_b32 vcc_hi, s0
-// CHECK: [0x00,0x05,0xeb,0xbe]
+s_cmov_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x05,0xe8,0xbe]
 
-s_cmov_b32 tba_lo, s0
-// CHECK: [0x00,0x05,0xec,0xbe]
+s_cmov_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x05,0xe9,0xbe]
 
-s_cmov_b32 tba_hi, s0
-// CHECK: [0x00,0x05,0xed,0xbe]
+s_cmov_b32 vcc_lo, s1
+// CHECK: [0x01,0x05,0xea,0xbe]
 
-s_cmov_b32 tma_lo, s0
-// CHECK: [0x00,0x05,0xee,0xbe]
+s_cmov_b32 vcc_hi, s1
+// CHECK: [0x01,0x05,0xeb,0xbe]
 
-s_cmov_b32 tma_hi, s0
-// CHECK: [0x00,0x05,0xef,0xbe]
+s_cmov_b32 tba_lo, s1
+// CHECK: [0x01,0x05,0xec,0xbe]
 
-s_cmov_b32 ttmp11, s0
-// CHECK: [0x00,0x05,0xfb,0xbe]
+s_cmov_b32 tba_hi, s1
+// CHECK: [0x01,0x05,0xed,0xbe]
 
-s_cmov_b32 m0, s0
-// CHECK: [0x00,0x05,0xfc,0xbe]
+s_cmov_b32 tma_lo, s1
+// CHECK: [0x01,0x05,0xee,0xbe]
 
-s_cmov_b32 exec_lo, s0
-// CHECK: [0x00,0x05,0xfe,0xbe]
+s_cmov_b32 tma_hi, s1
+// CHECK: [0x01,0x05,0xef,0xbe]
 
-s_cmov_b32 exec_hi, s0
-// CHECK: [0x00,0x05,0xff,0xbe]
+s_cmov_b32 ttmp11, s1
+// CHECK: [0x01,0x05,0xfb,0xbe]
 
-s_cmov_b32 s0, s103
-// CHECK: [0x67,0x05,0x80,0xbe]
+s_cmov_b32 m0, s1
+// CHECK: [0x01,0x05,0xfc,0xbe]
 
-s_cmov_b32 s0, flat_scratch_lo
-// CHECK: [0x68,0x05,0x80,0xbe]
+s_cmov_b32 exec_lo, s1
+// CHECK: [0x01,0x05,0xfe,0xbe]
 
-s_cmov_b32 s0, flat_scratch_hi
-// CHECK: [0x69,0x05,0x80,0xbe]
+s_cmov_b32 exec_hi, s1
+// CHECK: [0x01,0x05,0xff,0xbe]
 
-s_cmov_b32 s0, vcc_lo
-// CHECK: [0x6a,0x05,0x80,0xbe]
+s_cmov_b32 s5, s103
+// CHECK: [0x67,0x05,0x85,0xbe]
 
-s_cmov_b32 s0, vcc_hi
-// CHECK: [0x6b,0x05,0x80,0xbe]
+s_cmov_b32 s5, flat_scratch_lo
+// CHECK: [0x68,0x05,0x85,0xbe]
 
-s_cmov_b32 s0, tba_lo
-// CHECK: [0x6c,0x05,0x80,0xbe]
+s_cmov_b32 s5, flat_scratch_hi
+// CHECK: [0x69,0x05,0x85,0xbe]
 
-s_cmov_b32 s0, tba_hi
-// CHECK: [0x6d,0x05,0x80,0xbe]
+s_cmov_b32 s5, vcc_lo
+// CHECK: [0x6a,0x05,0x85,0xbe]
 
-s_cmov_b32 s0, tma_lo
-// CHECK: [0x6e,0x05,0x80,0xbe]
+s_cmov_b32 s5, vcc_hi
+// CHECK: [0x6b,0x05,0x85,0xbe]
 
-s_cmov_b32 s0, tma_hi
-// CHECK: [0x6f,0x05,0x80,0xbe]
+s_cmov_b32 s5, tba_lo
+// CHECK: [0x6c,0x05,0x85,0xbe]
 
-s_cmov_b32 s0, ttmp11
-// CHECK: [0x7b,0x05,0x80,0xbe]
+s_cmov_b32 s5, tba_hi
+// CHECK: [0x6d,0x05,0x85,0xbe]
 
-s_cmov_b32 s0, m0
-// CHECK: [0x7c,0x05,0x80,0xbe]
+s_cmov_b32 s5, tma_lo
+// CHECK: [0x6e,0x05,0x85,0xbe]
 
-s_cmov_b32 s0, exec_lo
-// CHECK: [0x7e,0x05,0x80,0xbe]
+s_cmov_b32 s5, tma_hi
+// CHECK: [0x6f,0x05,0x85,0xbe]
 
-s_cmov_b32 s0, exec_hi
-// CHECK: [0x7f,0x05,0x80,0xbe]
+s_cmov_b32 s5, ttmp11
+// CHECK: [0x7b,0x05,0x85,0xbe]
 
-s_cmov_b32 s0, 0
-// CHECK: [0x80,0x05,0x80,0xbe]
+s_cmov_b32 s5, m0
+// CHECK: [0x7c,0x05,0x85,0xbe]
 
-s_cmov_b32 s0, -1
-// CHECK: [0xc1,0x05,0x80,0xbe]
+s_cmov_b32 s5, exec_lo
+// CHECK: [0x7e,0x05,0x85,0xbe]
 
-s_cmov_b32 s0, 0.5
-// CHECK: [0xf0,0x05,0x80,0xbe]
+s_cmov_b32 s5, exec_hi
+// CHECK: [0x7f,0x05,0x85,0xbe]
 
-s_cmov_b32 s0, -4.0
-// CHECK: [0xf7,0x05,0x80,0xbe]
+s_cmov_b32 s5, 0
+// CHECK: [0x80,0x05,0x85,0xbe]
 
-s_cmov_b32 s0, 0xaf123456
-// CHECK: [0xff,0x05,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_cmov_b32 s5, -1
+// CHECK: [0xc1,0x05,0x85,0xbe]
 
-s_cmov_b32 s0, 0x3f717273
-// CHECK: [0xff,0x05,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_cmov_b32 s5, 0.5
+// CHECK: [0xf0,0x05,0x85,0xbe]
 
-s_cmov_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x06,0x80,0xbe]
+s_cmov_b32 s5, -4.0
+// CHECK: [0xf7,0x05,0x85,0xbe]
 
-s_cmov_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x06,0x82,0xbe]
+s_cmov_b32 s5, 0xaf123456
+// CHECK: [0xff,0x05,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_cmov_b64 s[102:103], s[0:1]
-// CHECK: [0x00,0x06,0xe6,0xbe]
+s_cmov_b32 s5, 0x3f717273
+// CHECK: [0xff,0x05,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_cmov_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x06,0xe8,0xbe]
+s_cmov_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x06,0x8a,0xbe]
 
-s_cmov_b64 vcc, s[0:1]
-// CHECK: [0x00,0x06,0xea,0xbe]
+s_cmov_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x06,0x8c,0xbe]
 
-s_cmov_b64 tba, s[0:1]
-// CHECK: [0x00,0x06,0xec,0xbe]
+s_cmov_b64 s[102:103], s[2:3]
+// CHECK: [0x02,0x06,0xe6,0xbe]
 
-s_cmov_b64 tma, s[0:1]
-// CHECK: [0x00,0x06,0xee,0xbe]
+s_cmov_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x06,0xe8,0xbe]
 
-s_cmov_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x06,0xfa,0xbe]
+s_cmov_b64 vcc, s[2:3]
+// CHECK: [0x02,0x06,0xea,0xbe]
 
-s_cmov_b64 exec, s[0:1]
-// CHECK: [0x00,0x06,0xfe,0xbe]
+s_cmov_b64 tba, s[2:3]
+// CHECK: [0x02,0x06,0xec,0xbe]
 
-s_cmov_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x06,0x80,0xbe]
+s_cmov_b64 tma, s[2:3]
+// CHECK: [0x02,0x06,0xee,0xbe]
 
-s_cmov_b64 s[0:1], s[102:103]
-// CHECK: [0x66,0x06,0x80,0xbe]
+s_cmov_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x06,0xfa,0xbe]
 
-s_cmov_b64 s[0:1], flat_scratch
-// CHECK: [0x68,0x06,0x80,0xbe]
+s_cmov_b64 exec, s[2:3]
+// CHECK: [0x02,0x06,0xfe,0xbe]
 
-s_cmov_b64 s[0:1], vcc
-// CHECK: [0x6a,0x06,0x80,0xbe]
+s_cmov_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x06,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], tba
-// CHECK: [0x6c,0x06,0x80,0xbe]
+s_cmov_b64 s[10:11], s[102:103]
+// CHECK: [0x66,0x06,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], tma
-// CHECK: [0x6e,0x06,0x80,0xbe]
+s_cmov_b64 s[10:11], flat_scratch
+// CHECK: [0x68,0x06,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x06,0x80,0xbe]
+s_cmov_b64 s[10:11], vcc
+// CHECK: [0x6a,0x06,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], exec
-// CHECK: [0x7e,0x06,0x80,0xbe]
+s_cmov_b64 s[10:11], tba
+// CHECK: [0x6c,0x06,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], 0
-// CHECK: [0x80,0x06,0x80,0xbe]
+s_cmov_b64 s[10:11], tma
+// CHECK: [0x6e,0x06,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], -1
-// CHECK: [0xc1,0x06,0x80,0xbe]
+s_cmov_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x06,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x06,0x80,0xbe]
+s_cmov_b64 s[10:11], exec
+// CHECK: [0x7e,0x06,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x06,0x80,0xbe]
+s_cmov_b64 s[10:11], 0
+// CHECK: [0x80,0x06,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x06,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_cmov_b64 s[10:11], -1
+// CHECK: [0xc1,0x06,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x06,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_cmov_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x06,0x8a,0xbe]
 
-s_not_b32 s0, s0
-// CHECK: [0x00,0x07,0x80,0xbe]
+s_cmov_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x06,0x8a,0xbe]
 
-s_not_b32 s103, s0
-// CHECK: [0x00,0x07,0xe7,0xbe]
+s_cmov_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x06,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_not_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x07,0xe8,0xbe]
+s_cmov_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x06,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_not_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x07,0xe9,0xbe]
+s_not_b32 s5, s1
+// CHECK: [0x01,0x07,0x85,0xbe]
 
-s_not_b32 vcc_lo, s0
-// CHECK: [0x00,0x07,0xea,0xbe]
+s_not_b32 s103, s1
+// CHECK: [0x01,0x07,0xe7,0xbe]
 
-s_not_b32 vcc_hi, s0
-// CHECK: [0x00,0x07,0xeb,0xbe]
+s_not_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x07,0xe8,0xbe]
 
-s_not_b32 tba_lo, s0
-// CHECK: [0x00,0x07,0xec,0xbe]
+s_not_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x07,0xe9,0xbe]
 
-s_not_b32 tba_hi, s0
-// CHECK: [0x00,0x07,0xed,0xbe]
+s_not_b32 vcc_lo, s1
+// CHECK: [0x01,0x07,0xea,0xbe]
 
-s_not_b32 tma_lo, s0
-// CHECK: [0x00,0x07,0xee,0xbe]
+s_not_b32 vcc_hi, s1
+// CHECK: [0x01,0x07,0xeb,0xbe]
 
-s_not_b32 tma_hi, s0
-// CHECK: [0x00,0x07,0xef,0xbe]
+s_not_b32 tba_lo, s1
+// CHECK: [0x01,0x07,0xec,0xbe]
 
-s_not_b32 ttmp11, s0
-// CHECK: [0x00,0x07,0xfb,0xbe]
+s_not_b32 tba_hi, s1
+// CHECK: [0x01,0x07,0xed,0xbe]
 
-s_not_b32 m0, s0
-// CHECK: [0x00,0x07,0xfc,0xbe]
+s_not_b32 tma_lo, s1
+// CHECK: [0x01,0x07,0xee,0xbe]
 
-s_not_b32 exec_lo, s0
-// CHECK: [0x00,0x07,0xfe,0xbe]
+s_not_b32 tma_hi, s1
+// CHECK: [0x01,0x07,0xef,0xbe]
 
-s_not_b32 exec_hi, s0
-// CHECK: [0x00,0x07,0xff,0xbe]
+s_not_b32 ttmp11, s1
+// CHECK: [0x01,0x07,0xfb,0xbe]
 
-s_not_b32 s0, s103
-// CHECK: [0x67,0x07,0x80,0xbe]
+s_not_b32 m0, s1
+// CHECK: [0x01,0x07,0xfc,0xbe]
 
-s_not_b32 s0, flat_scratch_lo
-// CHECK: [0x68,0x07,0x80,0xbe]
+s_not_b32 exec_lo, s1
+// CHECK: [0x01,0x07,0xfe,0xbe]
 
-s_not_b32 s0, flat_scratch_hi
-// CHECK: [0x69,0x07,0x80,0xbe]
+s_not_b32 exec_hi, s1
+// CHECK: [0x01,0x07,0xff,0xbe]
 
-s_not_b32 s0, vcc_lo
-// CHECK: [0x6a,0x07,0x80,0xbe]
+s_not_b32 s5, s103
+// CHECK: [0x67,0x07,0x85,0xbe]
 
-s_not_b32 s0, vcc_hi
-// CHECK: [0x6b,0x07,0x80,0xbe]
+s_not_b32 s5, flat_scratch_lo
+// CHECK: [0x68,0x07,0x85,0xbe]
 
-s_not_b32 s0, tba_lo
-// CHECK: [0x6c,0x07,0x80,0xbe]
+s_not_b32 s5, flat_scratch_hi
+// CHECK: [0x69,0x07,0x85,0xbe]
 
-s_not_b32 s0, tba_hi
-// CHECK: [0x6d,0x07,0x80,0xbe]
+s_not_b32 s5, vcc_lo
+// CHECK: [0x6a,0x07,0x85,0xbe]
 
-s_not_b32 s0, tma_lo
-// CHECK: [0x6e,0x07,0x80,0xbe]
+s_not_b32 s5, vcc_hi
+// CHECK: [0x6b,0x07,0x85,0xbe]
 
-s_not_b32 s0, tma_hi
-// CHECK: [0x6f,0x07,0x80,0xbe]
+s_not_b32 s5, tba_lo
+// CHECK: [0x6c,0x07,0x85,0xbe]
 
-s_not_b32 s0, ttmp11
-// CHECK: [0x7b,0x07,0x80,0xbe]
+s_not_b32 s5, tba_hi
+// CHECK: [0x6d,0x07,0x85,0xbe]
 
-s_not_b32 s0, m0
-// CHECK: [0x7c,0x07,0x80,0xbe]
+s_not_b32 s5, tma_lo
+// CHECK: [0x6e,0x07,0x85,0xbe]
 
-s_not_b32 s0, exec_lo
-// CHECK: [0x7e,0x07,0x80,0xbe]
+s_not_b32 s5, tma_hi
+// CHECK: [0x6f,0x07,0x85,0xbe]
 
-s_not_b32 s0, exec_hi
-// CHECK: [0x7f,0x07,0x80,0xbe]
+s_not_b32 s5, ttmp11
+// CHECK: [0x7b,0x07,0x85,0xbe]
 
-s_not_b32 s0, 0
-// CHECK: [0x80,0x07,0x80,0xbe]
+s_not_b32 s5, m0
+// CHECK: [0x7c,0x07,0x85,0xbe]
 
-s_not_b32 s0, -1
-// CHECK: [0xc1,0x07,0x80,0xbe]
+s_not_b32 s5, exec_lo
+// CHECK: [0x7e,0x07,0x85,0xbe]
 
-s_not_b32 s0, 0.5
-// CHECK: [0xf0,0x07,0x80,0xbe]
+s_not_b32 s5, exec_hi
+// CHECK: [0x7f,0x07,0x85,0xbe]
 
-s_not_b32 s0, -4.0
-// CHECK: [0xf7,0x07,0x80,0xbe]
+s_not_b32 s5, 0
+// CHECK: [0x80,0x07,0x85,0xbe]
 
-s_not_b32 s0, 0xaf123456
-// CHECK: [0xff,0x07,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_not_b32 s5, -1
+// CHECK: [0xc1,0x07,0x85,0xbe]
 
-s_not_b32 s0, 0x3f717273
-// CHECK: [0xff,0x07,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_not_b32 s5, 0.5
+// CHECK: [0xf0,0x07,0x85,0xbe]
 
-s_not_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x08,0x80,0xbe]
+s_not_b32 s5, -4.0
+// CHECK: [0xf7,0x07,0x85,0xbe]
 
-s_not_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x08,0x82,0xbe]
+s_not_b32 s5, 0xaf123456
+// CHECK: [0xff,0x07,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_not_b64 s[102:103], s[0:1]
-// CHECK: [0x00,0x08,0xe6,0xbe]
+s_not_b32 s5, 0x3f717273
+// CHECK: [0xff,0x07,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_not_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x08,0xe8,0xbe]
+s_not_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x08,0x8a,0xbe]
 
-s_not_b64 vcc, s[0:1]
-// CHECK: [0x00,0x08,0xea,0xbe]
+s_not_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x08,0x8c,0xbe]
 
-s_not_b64 tba, s[0:1]
-// CHECK: [0x00,0x08,0xec,0xbe]
+s_not_b64 s[102:103], s[2:3]
+// CHECK: [0x02,0x08,0xe6,0xbe]
 
-s_not_b64 tma, s[0:1]
-// CHECK: [0x00,0x08,0xee,0xbe]
+s_not_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x08,0xe8,0xbe]
 
-s_not_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x08,0xfa,0xbe]
+s_not_b64 vcc, s[2:3]
+// CHECK: [0x02,0x08,0xea,0xbe]
 
-s_not_b64 exec, s[0:1]
-// CHECK: [0x00,0x08,0xfe,0xbe]
+s_not_b64 tba, s[2:3]
+// CHECK: [0x02,0x08,0xec,0xbe]
 
-s_not_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x08,0x80,0xbe]
+s_not_b64 tma, s[2:3]
+// CHECK: [0x02,0x08,0xee,0xbe]
 
-s_not_b64 s[0:1], s[102:103]
-// CHECK: [0x66,0x08,0x80,0xbe]
+s_not_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x08,0xfa,0xbe]
 
-s_not_b64 s[0:1], flat_scratch
-// CHECK: [0x68,0x08,0x80,0xbe]
+s_not_b64 exec, s[2:3]
+// CHECK: [0x02,0x08,0xfe,0xbe]
 
-s_not_b64 s[0:1], vcc
-// CHECK: [0x6a,0x08,0x80,0xbe]
+s_not_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x08,0x8a,0xbe]
 
-s_not_b64 s[0:1], tba
-// CHECK: [0x6c,0x08,0x80,0xbe]
+s_not_b64 s[10:11], s[102:103]
+// CHECK: [0x66,0x08,0x8a,0xbe]
 
-s_not_b64 s[0:1], tma
-// CHECK: [0x6e,0x08,0x80,0xbe]
+s_not_b64 s[10:11], flat_scratch
+// CHECK: [0x68,0x08,0x8a,0xbe]
 
-s_not_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x08,0x80,0xbe]
+s_not_b64 s[10:11], vcc
+// CHECK: [0x6a,0x08,0x8a,0xbe]
 
-s_not_b64 s[0:1], exec
-// CHECK: [0x7e,0x08,0x80,0xbe]
+s_not_b64 s[10:11], tba
+// CHECK: [0x6c,0x08,0x8a,0xbe]
 
-s_not_b64 s[0:1], 0
-// CHECK: [0x80,0x08,0x80,0xbe]
+s_not_b64 s[10:11], tma
+// CHECK: [0x6e,0x08,0x8a,0xbe]
 
-s_not_b64 s[0:1], -1
-// CHECK: [0xc1,0x08,0x80,0xbe]
+s_not_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x08,0x8a,0xbe]
 
-s_not_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x08,0x80,0xbe]
+s_not_b64 s[10:11], exec
+// CHECK: [0x7e,0x08,0x8a,0xbe]
 
-s_not_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x08,0x80,0xbe]
+s_not_b64 s[10:11], 0
+// CHECK: [0x80,0x08,0x8a,0xbe]
 
-s_not_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x08,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_not_b64 s[10:11], -1
+// CHECK: [0xc1,0x08,0x8a,0xbe]
 
-s_not_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x08,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_not_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x08,0x8a,0xbe]
 
-s_wqm_b32 s0, s0
-// CHECK: [0x00,0x09,0x80,0xbe]
+s_not_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x08,0x8a,0xbe]
 
-s_wqm_b32 s103, s0
-// CHECK: [0x00,0x09,0xe7,0xbe]
+s_not_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x08,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_wqm_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x09,0xe8,0xbe]
+s_not_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x08,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_wqm_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x09,0xe9,0xbe]
+s_wqm_b32 s5, s1
+// CHECK: [0x01,0x09,0x85,0xbe]
 
-s_wqm_b32 vcc_lo, s0
-// CHECK: [0x00,0x09,0xea,0xbe]
+s_wqm_b32 s103, s1
+// CHECK: [0x01,0x09,0xe7,0xbe]
 
-s_wqm_b32 vcc_hi, s0
-// CHECK: [0x00,0x09,0xeb,0xbe]
+s_wqm_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x09,0xe8,0xbe]
 
-s_wqm_b32 tba_lo, s0
-// CHECK: [0x00,0x09,0xec,0xbe]
+s_wqm_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x09,0xe9,0xbe]
 
-s_wqm_b32 tba_hi, s0
-// CHECK: [0x00,0x09,0xed,0xbe]
+s_wqm_b32 vcc_lo, s1
+// CHECK: [0x01,0x09,0xea,0xbe]
 
-s_wqm_b32 tma_lo, s0
-// CHECK: [0x00,0x09,0xee,0xbe]
+s_wqm_b32 vcc_hi, s1
+// CHECK: [0x01,0x09,0xeb,0xbe]
 
-s_wqm_b32 tma_hi, s0
-// CHECK: [0x00,0x09,0xef,0xbe]
+s_wqm_b32 tba_lo, s1
+// CHECK: [0x01,0x09,0xec,0xbe]
 
-s_wqm_b32 ttmp11, s0
-// CHECK: [0x00,0x09,0xfb,0xbe]
+s_wqm_b32 tba_hi, s1
+// CHECK: [0x01,0x09,0xed,0xbe]
 
-s_wqm_b32 m0, s0
-// CHECK: [0x00,0x09,0xfc,0xbe]
+s_wqm_b32 tma_lo, s1
+// CHECK: [0x01,0x09,0xee,0xbe]
 
-s_wqm_b32 exec_lo, s0
-// CHECK: [0x00,0x09,0xfe,0xbe]
+s_wqm_b32 tma_hi, s1
+// CHECK: [0x01,0x09,0xef,0xbe]
 
-s_wqm_b32 exec_hi, s0
-// CHECK: [0x00,0x09,0xff,0xbe]
+s_wqm_b32 ttmp11, s1
+// CHECK: [0x01,0x09,0xfb,0xbe]
 
-s_wqm_b32 s0, s103
-// CHECK: [0x67,0x09,0x80,0xbe]
+s_wqm_b32 m0, s1
+// CHECK: [0x01,0x09,0xfc,0xbe]
 
-s_wqm_b32 s0, flat_scratch_lo
-// CHECK: [0x68,0x09,0x80,0xbe]
+s_wqm_b32 exec_lo, s1
+// CHECK: [0x01,0x09,0xfe,0xbe]
 
-s_wqm_b32 s0, flat_scratch_hi
-// CHECK: [0x69,0x09,0x80,0xbe]
+s_wqm_b32 exec_hi, s1
+// CHECK: [0x01,0x09,0xff,0xbe]
 
-s_wqm_b32 s0, vcc_lo
-// CHECK: [0x6a,0x09,0x80,0xbe]
+s_wqm_b32 s5, s103
+// CHECK: [0x67,0x09,0x85,0xbe]
 
-s_wqm_b32 s0, vcc_hi
-// CHECK: [0x6b,0x09,0x80,0xbe]
+s_wqm_b32 s5, flat_scratch_lo
+// CHECK: [0x68,0x09,0x85,0xbe]
 
-s_wqm_b32 s0, tba_lo
-// CHECK: [0x6c,0x09,0x80,0xbe]
+s_wqm_b32 s5, flat_scratch_hi
+// CHECK: [0x69,0x09,0x85,0xbe]
 
-s_wqm_b32 s0, tba_hi
-// CHECK: [0x6d,0x09,0x80,0xbe]
+s_wqm_b32 s5, vcc_lo
+// CHECK: [0x6a,0x09,0x85,0xbe]
 
-s_wqm_b32 s0, tma_lo
-// CHECK: [0x6e,0x09,0x80,0xbe]
+s_wqm_b32 s5, vcc_hi
+// CHECK: [0x6b,0x09,0x85,0xbe]
 
-s_wqm_b32 s0, tma_hi
-// CHECK: [0x6f,0x09,0x80,0xbe]
+s_wqm_b32 s5, tba_lo
+// CHECK: [0x6c,0x09,0x85,0xbe]
 
-s_wqm_b32 s0, ttmp11
-// CHECK: [0x7b,0x09,0x80,0xbe]
+s_wqm_b32 s5, tba_hi
+// CHECK: [0x6d,0x09,0x85,0xbe]
 
-s_wqm_b32 s0, m0
-// CHECK: [0x7c,0x09,0x80,0xbe]
+s_wqm_b32 s5, tma_lo
+// CHECK: [0x6e,0x09,0x85,0xbe]
 
-s_wqm_b32 s0, exec_lo
-// CHECK: [0x7e,0x09,0x80,0xbe]
+s_wqm_b32 s5, tma_hi
+// CHECK: [0x6f,0x09,0x85,0xbe]
 
-s_wqm_b32 s0, exec_hi
-// CHECK: [0x7f,0x09,0x80,0xbe]
+s_wqm_b32 s5, ttmp11
+// CHECK: [0x7b,0x09,0x85,0xbe]
 
-s_wqm_b32 s0, 0
-// CHECK: [0x80,0x09,0x80,0xbe]
+s_wqm_b32 s5, m0
+// CHECK: [0x7c,0x09,0x85,0xbe]
 
-s_wqm_b32 s0, -1
-// CHECK: [0xc1,0x09,0x80,0xbe]
+s_wqm_b32 s5, exec_lo
+// CHECK: [0x7e,0x09,0x85,0xbe]
 
-s_wqm_b32 s0, 0.5
-// CHECK: [0xf0,0x09,0x80,0xbe]
+s_wqm_b32 s5, exec_hi
+// CHECK: [0x7f,0x09,0x85,0xbe]
 
-s_wqm_b32 s0, -4.0
-// CHECK: [0xf7,0x09,0x80,0xbe]
+s_wqm_b32 s5, 0
+// CHECK: [0x80,0x09,0x85,0xbe]
 
-s_wqm_b32 s0, 0xaf123456
-// CHECK: [0xff,0x09,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_wqm_b32 s5, -1
+// CHECK: [0xc1,0x09,0x85,0xbe]
 
-s_wqm_b32 s0, 0x3f717273
-// CHECK: [0xff,0x09,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_wqm_b32 s5, 0.5
+// CHECK: [0xf0,0x09,0x85,0xbe]
 
-s_wqm_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x0a,0x80,0xbe]
+s_wqm_b32 s5, -4.0
+// CHECK: [0xf7,0x09,0x85,0xbe]
 
-s_wqm_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x0a,0x82,0xbe]
+s_wqm_b32 s5, 0xaf123456
+// CHECK: [0xff,0x09,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_wqm_b64 s[102:103], s[0:1]
-// CHECK: [0x00,0x0a,0xe6,0xbe]
+s_wqm_b32 s5, 0x3f717273
+// CHECK: [0xff,0x09,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_wqm_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x0a,0xe8,0xbe]
+s_wqm_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x0a,0x8a,0xbe]
 
-s_wqm_b64 vcc, s[0:1]
-// CHECK: [0x00,0x0a,0xea,0xbe]
+s_wqm_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x0a,0x8c,0xbe]
 
-s_wqm_b64 tba, s[0:1]
-// CHECK: [0x00,0x0a,0xec,0xbe]
+s_wqm_b64 s[102:103], s[2:3]
+// CHECK: [0x02,0x0a,0xe6,0xbe]
 
-s_wqm_b64 tma, s[0:1]
-// CHECK: [0x00,0x0a,0xee,0xbe]
+s_wqm_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x0a,0xe8,0xbe]
 
-s_wqm_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x0a,0xfa,0xbe]
+s_wqm_b64 vcc, s[2:3]
+// CHECK: [0x02,0x0a,0xea,0xbe]
 
-s_wqm_b64 exec, s[0:1]
-// CHECK: [0x00,0x0a,0xfe,0xbe]
+s_wqm_b64 tba, s[2:3]
+// CHECK: [0x02,0x0a,0xec,0xbe]
 
-s_wqm_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x0a,0x80,0xbe]
+s_wqm_b64 tma, s[2:3]
+// CHECK: [0x02,0x0a,0xee,0xbe]
 
-s_wqm_b64 s[0:1], s[102:103]
-// CHECK: [0x66,0x0a,0x80,0xbe]
+s_wqm_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x0a,0xfa,0xbe]
 
-s_wqm_b64 s[0:1], flat_scratch
-// CHECK: [0x68,0x0a,0x80,0xbe]
+s_wqm_b64 exec, s[2:3]
+// CHECK: [0x02,0x0a,0xfe,0xbe]
 
-s_wqm_b64 s[0:1], vcc
-// CHECK: [0x6a,0x0a,0x80,0xbe]
+s_wqm_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x0a,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], tba
-// CHECK: [0x6c,0x0a,0x80,0xbe]
+s_wqm_b64 s[10:11], s[102:103]
+// CHECK: [0x66,0x0a,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], tma
-// CHECK: [0x6e,0x0a,0x80,0xbe]
+s_wqm_b64 s[10:11], flat_scratch
+// CHECK: [0x68,0x0a,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x0a,0x80,0xbe]
+s_wqm_b64 s[10:11], vcc
+// CHECK: [0x6a,0x0a,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], exec
-// CHECK: [0x7e,0x0a,0x80,0xbe]
+s_wqm_b64 s[10:11], tba
+// CHECK: [0x6c,0x0a,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], 0
-// CHECK: [0x80,0x0a,0x80,0xbe]
+s_wqm_b64 s[10:11], tma
+// CHECK: [0x6e,0x0a,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], -1
-// CHECK: [0xc1,0x0a,0x80,0xbe]
+s_wqm_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x0a,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x0a,0x80,0xbe]
+s_wqm_b64 s[10:11], exec
+// CHECK: [0x7e,0x0a,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x0a,0x80,0xbe]
+s_wqm_b64 s[10:11], 0
+// CHECK: [0x80,0x0a,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x0a,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_wqm_b64 s[10:11], -1
+// CHECK: [0xc1,0x0a,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x0a,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_wqm_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x0a,0x8a,0xbe]
 
-s_brev_b32 s0, s0
-// CHECK: [0x00,0x0b,0x80,0xbe]
+s_wqm_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x0a,0x8a,0xbe]
 
-s_brev_b32 s103, s0
-// CHECK: [0x00,0x0b,0xe7,0xbe]
+s_wqm_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x0a,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_brev_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x0b,0xe8,0xbe]
+s_wqm_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x0a,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_brev_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x0b,0xe9,0xbe]
+s_brev_b32 s5, s1
+// CHECK: [0x01,0x0b,0x85,0xbe]
 
-s_brev_b32 vcc_lo, s0
-// CHECK: [0x00,0x0b,0xea,0xbe]
+s_brev_b32 s103, s1
+// CHECK: [0x01,0x0b,0xe7,0xbe]
 
-s_brev_b32 vcc_hi, s0
-// CHECK: [0x00,0x0b,0xeb,0xbe]
+s_brev_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x0b,0xe8,0xbe]
 
-s_brev_b32 tba_lo, s0
-// CHECK: [0x00,0x0b,0xec,0xbe]
+s_brev_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x0b,0xe9,0xbe]
 
-s_brev_b32 tba_hi, s0
-// CHECK: [0x00,0x0b,0xed,0xbe]
+s_brev_b32 vcc_lo, s1
+// CHECK: [0x01,0x0b,0xea,0xbe]
 
-s_brev_b32 tma_lo, s0
-// CHECK: [0x00,0x0b,0xee,0xbe]
+s_brev_b32 vcc_hi, s1
+// CHECK: [0x01,0x0b,0xeb,0xbe]
 
-s_brev_b32 tma_hi, s0
-// CHECK: [0x00,0x0b,0xef,0xbe]
+s_brev_b32 tba_lo, s1
+// CHECK: [0x01,0x0b,0xec,0xbe]
 
-s_brev_b32 ttmp11, s0
-// CHECK: [0x00,0x0b,0xfb,0xbe]
+s_brev_b32 tba_hi, s1
+// CHECK: [0x01,0x0b,0xed,0xbe]
 
-s_brev_b32 m0, s0
-// CHECK: [0x00,0x0b,0xfc,0xbe]
+s_brev_b32 tma_lo, s1
+// CHECK: [0x01,0x0b,0xee,0xbe]
 
-s_brev_b32 exec_lo, s0
-// CHECK: [0x00,0x0b,0xfe,0xbe]
+s_brev_b32 tma_hi, s1
+// CHECK: [0x01,0x0b,0xef,0xbe]
 
-s_brev_b32 exec_hi, s0
-// CHECK: [0x00,0x0b,0xff,0xbe]
+s_brev_b32 ttmp11, s1
+// CHECK: [0x01,0x0b,0xfb,0xbe]
 
-s_brev_b32 s0, s103
-// CHECK: [0x67,0x0b,0x80,0xbe]
+s_brev_b32 m0, s1
+// CHECK: [0x01,0x0b,0xfc,0xbe]
 
-s_brev_b32 s0, flat_scratch_lo
-// CHECK: [0x68,0x0b,0x80,0xbe]
+s_brev_b32 exec_lo, s1
+// CHECK: [0x01,0x0b,0xfe,0xbe]
 
-s_brev_b32 s0, flat_scratch_hi
-// CHECK: [0x69,0x0b,0x80,0xbe]
+s_brev_b32 exec_hi, s1
+// CHECK: [0x01,0x0b,0xff,0xbe]
 
-s_brev_b32 s0, vcc_lo
-// CHECK: [0x6a,0x0b,0x80,0xbe]
+s_brev_b32 s5, s103
+// CHECK: [0x67,0x0b,0x85,0xbe]
 
-s_brev_b32 s0, vcc_hi
-// CHECK: [0x6b,0x0b,0x80,0xbe]
+s_brev_b32 s5, flat_scratch_lo
+// CHECK: [0x68,0x0b,0x85,0xbe]
 
-s_brev_b32 s0, tba_lo
-// CHECK: [0x6c,0x0b,0x80,0xbe]
+s_brev_b32 s5, flat_scratch_hi
+// CHECK: [0x69,0x0b,0x85,0xbe]
 
-s_brev_b32 s0, tba_hi
-// CHECK: [0x6d,0x0b,0x80,0xbe]
+s_brev_b32 s5, vcc_lo
+// CHECK: [0x6a,0x0b,0x85,0xbe]
 
-s_brev_b32 s0, tma_lo
-// CHECK: [0x6e,0x0b,0x80,0xbe]
+s_brev_b32 s5, vcc_hi
+// CHECK: [0x6b,0x0b,0x85,0xbe]
 
-s_brev_b32 s0, tma_hi
-// CHECK: [0x6f,0x0b,0x80,0xbe]
+s_brev_b32 s5, tba_lo
+// CHECK: [0x6c,0x0b,0x85,0xbe]
 
-s_brev_b32 s0, ttmp11
-// CHECK: [0x7b,0x0b,0x80,0xbe]
+s_brev_b32 s5, tba_hi
+// CHECK: [0x6d,0x0b,0x85,0xbe]
 
-s_brev_b32 s0, m0
-// CHECK: [0x7c,0x0b,0x80,0xbe]
+s_brev_b32 s5, tma_lo
+// CHECK: [0x6e,0x0b,0x85,0xbe]
 
-s_brev_b32 s0, exec_lo
-// CHECK: [0x7e,0x0b,0x80,0xbe]
+s_brev_b32 s5, tma_hi
+// CHECK: [0x6f,0x0b,0x85,0xbe]
 
-s_brev_b32 s0, exec_hi
-// CHECK: [0x7f,0x0b,0x80,0xbe]
+s_brev_b32 s5, ttmp11
+// CHECK: [0x7b,0x0b,0x85,0xbe]
 
-s_brev_b32 s0, 0
-// CHECK: [0x80,0x0b,0x80,0xbe]
+s_brev_b32 s5, m0
+// CHECK: [0x7c,0x0b,0x85,0xbe]
 
-s_brev_b32 s0, -1
-// CHECK: [0xc1,0x0b,0x80,0xbe]
+s_brev_b32 s5, exec_lo
+// CHECK: [0x7e,0x0b,0x85,0xbe]
 
-s_brev_b32 s0, 0.5
-// CHECK: [0xf0,0x0b,0x80,0xbe]
+s_brev_b32 s5, exec_hi
+// CHECK: [0x7f,0x0b,0x85,0xbe]
 
-s_brev_b32 s0, -4.0
-// CHECK: [0xf7,0x0b,0x80,0xbe]
+s_brev_b32 s5, 0
+// CHECK: [0x80,0x0b,0x85,0xbe]
 
-s_brev_b32 s0, 0xaf123456
-// CHECK: [0xff,0x0b,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_brev_b32 s5, -1
+// CHECK: [0xc1,0x0b,0x85,0xbe]
 
-s_brev_b32 s0, 0x3f717273
-// CHECK: [0xff,0x0b,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_brev_b32 s5, 0.5
+// CHECK: [0xf0,0x0b,0x85,0xbe]
 
-s_brev_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x0c,0x80,0xbe]
+s_brev_b32 s5, -4.0
+// CHECK: [0xf7,0x0b,0x85,0xbe]
 
-s_brev_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x0c,0x82,0xbe]
+s_brev_b32 s5, 0xaf123456
+// CHECK: [0xff,0x0b,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_brev_b64 s[102:103], s[0:1]
-// CHECK: [0x00,0x0c,0xe6,0xbe]
+s_brev_b32 s5, 0x3f717273
+// CHECK: [0xff,0x0b,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_brev_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x0c,0xe8,0xbe]
+s_brev_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x0c,0x8a,0xbe]
 
-s_brev_b64 vcc, s[0:1]
-// CHECK: [0x00,0x0c,0xea,0xbe]
+s_brev_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x0c,0x8c,0xbe]
 
-s_brev_b64 tba, s[0:1]
-// CHECK: [0x00,0x0c,0xec,0xbe]
+s_brev_b64 s[102:103], s[2:3]
+// CHECK: [0x02,0x0c,0xe6,0xbe]
 
-s_brev_b64 tma, s[0:1]
-// CHECK: [0x00,0x0c,0xee,0xbe]
+s_brev_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x0c,0xe8,0xbe]
 
-s_brev_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x0c,0xfa,0xbe]
+s_brev_b64 vcc, s[2:3]
+// CHECK: [0x02,0x0c,0xea,0xbe]
 
-s_brev_b64 exec, s[0:1]
-// CHECK: [0x00,0x0c,0xfe,0xbe]
+s_brev_b64 tba, s[2:3]
+// CHECK: [0x02,0x0c,0xec,0xbe]
 
-s_brev_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x0c,0x80,0xbe]
+s_brev_b64 tma, s[2:3]
+// CHECK: [0x02,0x0c,0xee,0xbe]
 
-s_brev_b64 s[0:1], s[102:103]
-// CHECK: [0x66,0x0c,0x80,0xbe]
+s_brev_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x0c,0xfa,0xbe]
 
-s_brev_b64 s[0:1], flat_scratch
-// CHECK: [0x68,0x0c,0x80,0xbe]
+s_brev_b64 exec, s[2:3]
+// CHECK: [0x02,0x0c,0xfe,0xbe]
 
-s_brev_b64 s[0:1], vcc
-// CHECK: [0x6a,0x0c,0x80,0xbe]
+s_brev_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x0c,0x8a,0xbe]
 
-s_brev_b64 s[0:1], tba
-// CHECK: [0x6c,0x0c,0x80,0xbe]
+s_brev_b64 s[10:11], s[102:103]
+// CHECK: [0x66,0x0c,0x8a,0xbe]
 
-s_brev_b64 s[0:1], tma
-// CHECK: [0x6e,0x0c,0x80,0xbe]
+s_brev_b64 s[10:11], flat_scratch
+// CHECK: [0x68,0x0c,0x8a,0xbe]
 
-s_brev_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x0c,0x80,0xbe]
+s_brev_b64 s[10:11], vcc
+// CHECK: [0x6a,0x0c,0x8a,0xbe]
 
-s_brev_b64 s[0:1], exec
-// CHECK: [0x7e,0x0c,0x80,0xbe]
+s_brev_b64 s[10:11], tba
+// CHECK: [0x6c,0x0c,0x8a,0xbe]
 
-s_brev_b64 s[0:1], 0
-// CHECK: [0x80,0x0c,0x80,0xbe]
+s_brev_b64 s[10:11], tma
+// CHECK: [0x6e,0x0c,0x8a,0xbe]
 
-s_brev_b64 s[0:1], -1
-// CHECK: [0xc1,0x0c,0x80,0xbe]
+s_brev_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x0c,0x8a,0xbe]
 
-s_brev_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x0c,0x80,0xbe]
+s_brev_b64 s[10:11], exec
+// CHECK: [0x7e,0x0c,0x8a,0xbe]
 
-s_brev_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x0c,0x80,0xbe]
+s_brev_b64 s[10:11], 0
+// CHECK: [0x80,0x0c,0x8a,0xbe]
 
-s_brev_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x0c,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_brev_b64 s[10:11], -1
+// CHECK: [0xc1,0x0c,0x8a,0xbe]
 
-s_brev_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x0c,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_brev_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x0c,0x8a,0xbe]
 
-s_bcnt0_i32_b32 s0, s0
-// CHECK: [0x00,0x0d,0x80,0xbe]
+s_brev_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x0c,0x8a,0xbe]
 
-s_bcnt0_i32_b32 s103, s0
-// CHECK: [0x00,0x0d,0xe7,0xbe]
+s_brev_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x0c,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_bcnt0_i32_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x0d,0xe8,0xbe]
+s_brev_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x0c,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_bcnt0_i32_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x0d,0xe9,0xbe]
+s_bcnt0_i32_b32 s5, s1
+// CHECK: [0x01,0x0d,0x85,0xbe]
 
-s_bcnt0_i32_b32 vcc_lo, s0
-// CHECK: [0x00,0x0d,0xea,0xbe]
+s_bcnt0_i32_b32 s103, s1
+// CHECK: [0x01,0x0d,0xe7,0xbe]
 
-s_bcnt0_i32_b32 vcc_hi, s0
-// CHECK: [0x00,0x0d,0xeb,0xbe]
+s_bcnt0_i32_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x0d,0xe8,0xbe]
 
-s_bcnt0_i32_b32 tba_lo, s0
-// CHECK: [0x00,0x0d,0xec,0xbe]
+s_bcnt0_i32_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x0d,0xe9,0xbe]
 
-s_bcnt0_i32_b32 tba_hi, s0
-// CHECK: [0x00,0x0d,0xed,0xbe]
+s_bcnt0_i32_b32 vcc_lo, s1
+// CHECK: [0x01,0x0d,0xea,0xbe]
 
-s_bcnt0_i32_b32 tma_lo, s0
-// CHECK: [0x00,0x0d,0xee,0xbe]
+s_bcnt0_i32_b32 vcc_hi, s1
+// CHECK: [0x01,0x0d,0xeb,0xbe]
 
-s_bcnt0_i32_b32 tma_hi, s0
-// CHECK: [0x00,0x0d,0xef,0xbe]
+s_bcnt0_i32_b32 tba_lo, s1
+// CHECK: [0x01,0x0d,0xec,0xbe]
 
-s_bcnt0_i32_b32 ttmp11, s0
-// CHECK: [0x00,0x0d,0xfb,0xbe]
+s_bcnt0_i32_b32 tba_hi, s1
+// CHECK: [0x01,0x0d,0xed,0xbe]
 
-s_bcnt0_i32_b32 m0, s0
-// CHECK: [0x00,0x0d,0xfc,0xbe]
+s_bcnt0_i32_b32 tma_lo, s1
+// CHECK: [0x01,0x0d,0xee,0xbe]
 
-s_bcnt0_i32_b32 exec_lo, s0
-// CHECK: [0x00,0x0d,0xfe,0xbe]
+s_bcnt0_i32_b32 tma_hi, s1
+// CHECK: [0x01,0x0d,0xef,0xbe]
 
-s_bcnt0_i32_b32 exec_hi, s0
-// CHECK: [0x00,0x0d,0xff,0xbe]
+s_bcnt0_i32_b32 ttmp11, s1
+// CHECK: [0x01,0x0d,0xfb,0xbe]
 
-s_bcnt0_i32_b32 s0, s103
-// CHECK: [0x67,0x0d,0x80,0xbe]
+s_bcnt0_i32_b32 m0, s1
+// CHECK: [0x01,0x0d,0xfc,0xbe]
 
-s_bcnt0_i32_b32 s0, flat_scratch_lo
-// CHECK: [0x68,0x0d,0x80,0xbe]
+s_bcnt0_i32_b32 exec_lo, s1
+// CHECK: [0x01,0x0d,0xfe,0xbe]
 
-s_bcnt0_i32_b32 s0, flat_scratch_hi
-// CHECK: [0x69,0x0d,0x80,0xbe]
+s_bcnt0_i32_b32 exec_hi, s1
+// CHECK: [0x01,0x0d,0xff,0xbe]
 
-s_bcnt0_i32_b32 s0, vcc_lo
-// CHECK: [0x6a,0x0d,0x80,0xbe]
+s_bcnt0_i32_b32 s5, s103
+// CHECK: [0x67,0x0d,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, vcc_hi
-// CHECK: [0x6b,0x0d,0x80,0xbe]
+s_bcnt0_i32_b32 s5, flat_scratch_lo
+// CHECK: [0x68,0x0d,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, tba_lo
-// CHECK: [0x6c,0x0d,0x80,0xbe]
+s_bcnt0_i32_b32 s5, flat_scratch_hi
+// CHECK: [0x69,0x0d,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, tba_hi
-// CHECK: [0x6d,0x0d,0x80,0xbe]
+s_bcnt0_i32_b32 s5, vcc_lo
+// CHECK: [0x6a,0x0d,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, tma_lo
-// CHECK: [0x6e,0x0d,0x80,0xbe]
+s_bcnt0_i32_b32 s5, vcc_hi
+// CHECK: [0x6b,0x0d,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, tma_hi
-// CHECK: [0x6f,0x0d,0x80,0xbe]
+s_bcnt0_i32_b32 s5, tba_lo
+// CHECK: [0x6c,0x0d,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, ttmp11
-// CHECK: [0x7b,0x0d,0x80,0xbe]
+s_bcnt0_i32_b32 s5, tba_hi
+// CHECK: [0x6d,0x0d,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, m0
-// CHECK: [0x7c,0x0d,0x80,0xbe]
+s_bcnt0_i32_b32 s5, tma_lo
+// CHECK: [0x6e,0x0d,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, exec_lo
-// CHECK: [0x7e,0x0d,0x80,0xbe]
+s_bcnt0_i32_b32 s5, tma_hi
+// CHECK: [0x6f,0x0d,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, exec_hi
-// CHECK: [0x7f,0x0d,0x80,0xbe]
+s_bcnt0_i32_b32 s5, ttmp11
+// CHECK: [0x7b,0x0d,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, 0
-// CHECK: [0x80,0x0d,0x80,0xbe]
+s_bcnt0_i32_b32 s5, m0
+// CHECK: [0x7c,0x0d,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, -1
-// CHECK: [0xc1,0x0d,0x80,0xbe]
+s_bcnt0_i32_b32 s5, exec_lo
+// CHECK: [0x7e,0x0d,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, 0.5
-// CHECK: [0xf0,0x0d,0x80,0xbe]
+s_bcnt0_i32_b32 s5, exec_hi
+// CHECK: [0x7f,0x0d,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, -4.0
-// CHECK: [0xf7,0x0d,0x80,0xbe]
+s_bcnt0_i32_b32 s5, 0
+// CHECK: [0x80,0x0d,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, 0xaf123456
-// CHECK: [0xff,0x0d,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_bcnt0_i32_b32 s5, -1
+// CHECK: [0xc1,0x0d,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, 0x3f717273
-// CHECK: [0xff,0x0d,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_bcnt0_i32_b32 s5, 0.5
+// CHECK: [0xf0,0x0d,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, s[0:1]
-// CHECK: [0x00,0x0e,0x80,0xbe]
+s_bcnt0_i32_b32 s5, -4.0
+// CHECK: [0xf7,0x0d,0x85,0xbe]
 
-s_bcnt0_i32_b64 s103, s[0:1]
-// CHECK: [0x00,0x0e,0xe7,0xbe]
+s_bcnt0_i32_b32 s5, 0xaf123456
+// CHECK: [0xff,0x0d,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_bcnt0_i32_b64 flat_scratch_lo, s[0:1]
-// CHECK: [0x00,0x0e,0xe8,0xbe]
+s_bcnt0_i32_b32 s5, 0x3f717273
+// CHECK: [0xff,0x0d,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_bcnt0_i32_b64 flat_scratch_hi, s[0:1]
-// CHECK: [0x00,0x0e,0xe9,0xbe]
+s_bcnt0_i32_b64 s5, s[2:3]
+// CHECK: [0x02,0x0e,0x85,0xbe]
 
-s_bcnt0_i32_b64 vcc_lo, s[0:1]
-// CHECK: [0x00,0x0e,0xea,0xbe]
+s_bcnt0_i32_b64 s103, s[2:3]
+// CHECK: [0x02,0x0e,0xe7,0xbe]
 
-s_bcnt0_i32_b64 vcc_hi, s[0:1]
-// CHECK: [0x00,0x0e,0xeb,0xbe]
+s_bcnt0_i32_b64 flat_scratch_lo, s[2:3]
+// CHECK: [0x02,0x0e,0xe8,0xbe]
 
-s_bcnt0_i32_b64 tba_lo, s[0:1]
-// CHECK: [0x00,0x0e,0xec,0xbe]
+s_bcnt0_i32_b64 flat_scratch_hi, s[2:3]
+// CHECK: [0x02,0x0e,0xe9,0xbe]
 
-s_bcnt0_i32_b64 tba_hi, s[0:1]
-// CHECK: [0x00,0x0e,0xed,0xbe]
+s_bcnt0_i32_b64 vcc_lo, s[2:3]
+// CHECK: [0x02,0x0e,0xea,0xbe]
 
-s_bcnt0_i32_b64 tma_lo, s[0:1]
-// CHECK: [0x00,0x0e,0xee,0xbe]
+s_bcnt0_i32_b64 vcc_hi, s[2:3]
+// CHECK: [0x02,0x0e,0xeb,0xbe]
 
-s_bcnt0_i32_b64 tma_hi, s[0:1]
-// CHECK: [0x00,0x0e,0xef,0xbe]
+s_bcnt0_i32_b64 tba_lo, s[2:3]
+// CHECK: [0x02,0x0e,0xec,0xbe]
 
-s_bcnt0_i32_b64 ttmp11, s[0:1]
-// CHECK: [0x00,0x0e,0xfb,0xbe]
+s_bcnt0_i32_b64 tba_hi, s[2:3]
+// CHECK: [0x02,0x0e,0xed,0xbe]
 
-s_bcnt0_i32_b64 m0, s[0:1]
-// CHECK: [0x00,0x0e,0xfc,0xbe]
+s_bcnt0_i32_b64 tma_lo, s[2:3]
+// CHECK: [0x02,0x0e,0xee,0xbe]
 
-s_bcnt0_i32_b64 exec_lo, s[0:1]
-// CHECK: [0x00,0x0e,0xfe,0xbe]
+s_bcnt0_i32_b64 tma_hi, s[2:3]
+// CHECK: [0x02,0x0e,0xef,0xbe]
 
-s_bcnt0_i32_b64 exec_hi, s[0:1]
-// CHECK: [0x00,0x0e,0xff,0xbe]
+s_bcnt0_i32_b64 ttmp11, s[2:3]
+// CHECK: [0x02,0x0e,0xfb,0xbe]
 
-s_bcnt0_i32_b64 s0, s[2:3]
-// CHECK: [0x02,0x0e,0x80,0xbe]
+s_bcnt0_i32_b64 m0, s[2:3]
+// CHECK: [0x02,0x0e,0xfc,0xbe]
 
-s_bcnt0_i32_b64 s0, s[102:103]
-// CHECK: [0x66,0x0e,0x80,0xbe]
+s_bcnt0_i32_b64 exec_lo, s[2:3]
+// CHECK: [0x02,0x0e,0xfe,0xbe]
 
-s_bcnt0_i32_b64 s0, flat_scratch
-// CHECK: [0x68,0x0e,0x80,0xbe]
+s_bcnt0_i32_b64 exec_hi, s[2:3]
+// CHECK: [0x02,0x0e,0xff,0xbe]
 
-s_bcnt0_i32_b64 s0, vcc
-// CHECK: [0x6a,0x0e,0x80,0xbe]
+s_bcnt0_i32_b64 s5, s[4:5]
+// CHECK: [0x04,0x0e,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, tba
-// CHECK: [0x6c,0x0e,0x80,0xbe]
+s_bcnt0_i32_b64 s5, s[102:103]
+// CHECK: [0x66,0x0e,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, tma
-// CHECK: [0x6e,0x0e,0x80,0xbe]
+s_bcnt0_i32_b64 s5, flat_scratch
+// CHECK: [0x68,0x0e,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, ttmp[10:11]
-// CHECK: [0x7a,0x0e,0x80,0xbe]
+s_bcnt0_i32_b64 s5, vcc
+// CHECK: [0x6a,0x0e,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, exec
-// CHECK: [0x7e,0x0e,0x80,0xbe]
+s_bcnt0_i32_b64 s5, tba
+// CHECK: [0x6c,0x0e,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, 0
-// CHECK: [0x80,0x0e,0x80,0xbe]
+s_bcnt0_i32_b64 s5, tma
+// CHECK: [0x6e,0x0e,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, -1
-// CHECK: [0xc1,0x0e,0x80,0xbe]
+s_bcnt0_i32_b64 s5, ttmp[10:11]
+// CHECK: [0x7a,0x0e,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, 0.5
-// CHECK: [0xf0,0x0e,0x80,0xbe]
+s_bcnt0_i32_b64 s5, exec
+// CHECK: [0x7e,0x0e,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, -4.0
-// CHECK: [0xf7,0x0e,0x80,0xbe]
+s_bcnt0_i32_b64 s5, 0
+// CHECK: [0x80,0x0e,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, 0xaf123456
-// CHECK: [0xff,0x0e,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_bcnt0_i32_b64 s5, -1
+// CHECK: [0xc1,0x0e,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, 0x3f717273
-// CHECK: [0xff,0x0e,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_bcnt0_i32_b64 s5, 0.5
+// CHECK: [0xf0,0x0e,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, s0
-// CHECK: [0x00,0x0f,0x80,0xbe]
+s_bcnt0_i32_b64 s5, -4.0
+// CHECK: [0xf7,0x0e,0x85,0xbe]
 
-s_bcnt1_i32_b32 s103, s0
-// CHECK: [0x00,0x0f,0xe7,0xbe]
+s_bcnt0_i32_b64 s5, 0xaf123456
+// CHECK: [0xff,0x0e,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_bcnt1_i32_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x0f,0xe8,0xbe]
+s_bcnt0_i32_b64 s5, 0x3f717273
+// CHECK: [0xff,0x0e,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_bcnt1_i32_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x0f,0xe9,0xbe]
+s_bcnt1_i32_b32 s5, s1
+// CHECK: [0x01,0x0f,0x85,0xbe]
 
-s_bcnt1_i32_b32 vcc_lo, s0
-// CHECK: [0x00,0x0f,0xea,0xbe]
+s_bcnt1_i32_b32 s103, s1
+// CHECK: [0x01,0x0f,0xe7,0xbe]
 
-s_bcnt1_i32_b32 vcc_hi, s0
-// CHECK: [0x00,0x0f,0xeb,0xbe]
+s_bcnt1_i32_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x0f,0xe8,0xbe]
 
-s_bcnt1_i32_b32 tba_lo, s0
-// CHECK: [0x00,0x0f,0xec,0xbe]
+s_bcnt1_i32_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x0f,0xe9,0xbe]
 
-s_bcnt1_i32_b32 tba_hi, s0
-// CHECK: [0x00,0x0f,0xed,0xbe]
+s_bcnt1_i32_b32 vcc_lo, s1
+// CHECK: [0x01,0x0f,0xea,0xbe]
 
-s_bcnt1_i32_b32 tma_lo, s0
-// CHECK: [0x00,0x0f,0xee,0xbe]
+s_bcnt1_i32_b32 vcc_hi, s1
+// CHECK: [0x01,0x0f,0xeb,0xbe]
 
-s_bcnt1_i32_b32 tma_hi, s0
-// CHECK: [0x00,0x0f,0xef,0xbe]
+s_bcnt1_i32_b32 tba_lo, s1
+// CHECK: [0x01,0x0f,0xec,0xbe]
 
-s_bcnt1_i32_b32 ttmp11, s0
-// CHECK: [0x00,0x0f,0xfb,0xbe]
+s_bcnt1_i32_b32 tba_hi, s1
+// CHECK: [0x01,0x0f,0xed,0xbe]
 
-s_bcnt1_i32_b32 m0, s0
-// CHECK: [0x00,0x0f,0xfc,0xbe]
+s_bcnt1_i32_b32 tma_lo, s1
+// CHECK: [0x01,0x0f,0xee,0xbe]
 
-s_bcnt1_i32_b32 exec_lo, s0
-// CHECK: [0x00,0x0f,0xfe,0xbe]
+s_bcnt1_i32_b32 tma_hi, s1
+// CHECK: [0x01,0x0f,0xef,0xbe]
 
-s_bcnt1_i32_b32 exec_hi, s0
-// CHECK: [0x00,0x0f,0xff,0xbe]
+s_bcnt1_i32_b32 ttmp11, s1
+// CHECK: [0x01,0x0f,0xfb,0xbe]
 
-s_bcnt1_i32_b32 s0, s103
-// CHECK: [0x67,0x0f,0x80,0xbe]
+s_bcnt1_i32_b32 m0, s1
+// CHECK: [0x01,0x0f,0xfc,0xbe]
 
-s_bcnt1_i32_b32 s0, flat_scratch_lo
-// CHECK: [0x68,0x0f,0x80,0xbe]
+s_bcnt1_i32_b32 exec_lo, s1
+// CHECK: [0x01,0x0f,0xfe,0xbe]
 
-s_bcnt1_i32_b32 s0, flat_scratch_hi
-// CHECK: [0x69,0x0f,0x80,0xbe]
+s_bcnt1_i32_b32 exec_hi, s1
+// CHECK: [0x01,0x0f,0xff,0xbe]
 
-s_bcnt1_i32_b32 s0, vcc_lo
-// CHECK: [0x6a,0x0f,0x80,0xbe]
+s_bcnt1_i32_b32 s5, s103
+// CHECK: [0x67,0x0f,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, vcc_hi
-// CHECK: [0x6b,0x0f,0x80,0xbe]
+s_bcnt1_i32_b32 s5, flat_scratch_lo
+// CHECK: [0x68,0x0f,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, tba_lo
-// CHECK: [0x6c,0x0f,0x80,0xbe]
+s_bcnt1_i32_b32 s5, flat_scratch_hi
+// CHECK: [0x69,0x0f,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, tba_hi
-// CHECK: [0x6d,0x0f,0x80,0xbe]
+s_bcnt1_i32_b32 s5, vcc_lo
+// CHECK: [0x6a,0x0f,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, tma_lo
-// CHECK: [0x6e,0x0f,0x80,0xbe]
+s_bcnt1_i32_b32 s5, vcc_hi
+// CHECK: [0x6b,0x0f,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, tma_hi
-// CHECK: [0x6f,0x0f,0x80,0xbe]
+s_bcnt1_i32_b32 s5, tba_lo
+// CHECK: [0x6c,0x0f,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, ttmp11
-// CHECK: [0x7b,0x0f,0x80,0xbe]
+s_bcnt1_i32_b32 s5, tba_hi
+// CHECK: [0x6d,0x0f,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, m0
-// CHECK: [0x7c,0x0f,0x80,0xbe]
+s_bcnt1_i32_b32 s5, tma_lo
+// CHECK: [0x6e,0x0f,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, exec_lo
-// CHECK: [0x7e,0x0f,0x80,0xbe]
+s_bcnt1_i32_b32 s5, tma_hi
+// CHECK: [0x6f,0x0f,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, exec_hi
-// CHECK: [0x7f,0x0f,0x80,0xbe]
+s_bcnt1_i32_b32 s5, ttmp11
+// CHECK: [0x7b,0x0f,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, 0
-// CHECK: [0x80,0x0f,0x80,0xbe]
+s_bcnt1_i32_b32 s5, m0
+// CHECK: [0x7c,0x0f,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, -1
-// CHECK: [0xc1,0x0f,0x80,0xbe]
+s_bcnt1_i32_b32 s5, exec_lo
+// CHECK: [0x7e,0x0f,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, 0.5
-// CHECK: [0xf0,0x0f,0x80,0xbe]
+s_bcnt1_i32_b32 s5, exec_hi
+// CHECK: [0x7f,0x0f,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, -4.0
-// CHECK: [0xf7,0x0f,0x80,0xbe]
+s_bcnt1_i32_b32 s5, 0
+// CHECK: [0x80,0x0f,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, 0xaf123456
-// CHECK: [0xff,0x0f,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_bcnt1_i32_b32 s5, -1
+// CHECK: [0xc1,0x0f,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, 0x3f717273
-// CHECK: [0xff,0x0f,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_bcnt1_i32_b32 s5, 0.5
+// CHECK: [0xf0,0x0f,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, s[0:1]
-// CHECK: [0x00,0x10,0x80,0xbe]
+s_bcnt1_i32_b32 s5, -4.0
+// CHECK: [0xf7,0x0f,0x85,0xbe]
 
-s_bcnt1_i32_b64 s103, s[0:1]
-// CHECK: [0x00,0x10,0xe7,0xbe]
+s_bcnt1_i32_b32 s5, 0xaf123456
+// CHECK: [0xff,0x0f,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_bcnt1_i32_b64 flat_scratch_lo, s[0:1]
-// CHECK: [0x00,0x10,0xe8,0xbe]
+s_bcnt1_i32_b32 s5, 0x3f717273
+// CHECK: [0xff,0x0f,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_bcnt1_i32_b64 flat_scratch_hi, s[0:1]
-// CHECK: [0x00,0x10,0xe9,0xbe]
+s_bcnt1_i32_b64 s5, s[2:3]
+// CHECK: [0x02,0x10,0x85,0xbe]
 
-s_bcnt1_i32_b64 vcc_lo, s[0:1]
-// CHECK: [0x00,0x10,0xea,0xbe]
+s_bcnt1_i32_b64 s103, s[2:3]
+// CHECK: [0x02,0x10,0xe7,0xbe]
 
-s_bcnt1_i32_b64 vcc_hi, s[0:1]
-// CHECK: [0x00,0x10,0xeb,0xbe]
+s_bcnt1_i32_b64 flat_scratch_lo, s[2:3]
+// CHECK: [0x02,0x10,0xe8,0xbe]
 
-s_bcnt1_i32_b64 tba_lo, s[0:1]
-// CHECK: [0x00,0x10,0xec,0xbe]
+s_bcnt1_i32_b64 flat_scratch_hi, s[2:3]
+// CHECK: [0x02,0x10,0xe9,0xbe]
 
-s_bcnt1_i32_b64 tba_hi, s[0:1]
-// CHECK: [0x00,0x10,0xed,0xbe]
+s_bcnt1_i32_b64 vcc_lo, s[2:3]
+// CHECK: [0x02,0x10,0xea,0xbe]
 
-s_bcnt1_i32_b64 tma_lo, s[0:1]
-// CHECK: [0x00,0x10,0xee,0xbe]
+s_bcnt1_i32_b64 vcc_hi, s[2:3]
+// CHECK: [0x02,0x10,0xeb,0xbe]
 
-s_bcnt1_i32_b64 tma_hi, s[0:1]
-// CHECK: [0x00,0x10,0xef,0xbe]
+s_bcnt1_i32_b64 tba_lo, s[2:3]
+// CHECK: [0x02,0x10,0xec,0xbe]
 
-s_bcnt1_i32_b64 ttmp11, s[0:1]
-// CHECK: [0x00,0x10,0xfb,0xbe]
+s_bcnt1_i32_b64 tba_hi, s[2:3]
+// CHECK: [0x02,0x10,0xed,0xbe]
 
-s_bcnt1_i32_b64 m0, s[0:1]
-// CHECK: [0x00,0x10,0xfc,0xbe]
+s_bcnt1_i32_b64 tma_lo, s[2:3]
+// CHECK: [0x02,0x10,0xee,0xbe]
 
-s_bcnt1_i32_b64 exec_lo, s[0:1]
-// CHECK: [0x00,0x10,0xfe,0xbe]
+s_bcnt1_i32_b64 tma_hi, s[2:3]
+// CHECK: [0x02,0x10,0xef,0xbe]
 
-s_bcnt1_i32_b64 exec_hi, s[0:1]
-// CHECK: [0x00,0x10,0xff,0xbe]
+s_bcnt1_i32_b64 ttmp11, s[2:3]
+// CHECK: [0x02,0x10,0xfb,0xbe]
 
-s_bcnt1_i32_b64 s0, s[2:3]
-// CHECK: [0x02,0x10,0x80,0xbe]
+s_bcnt1_i32_b64 m0, s[2:3]
+// CHECK: [0x02,0x10,0xfc,0xbe]
 
-s_bcnt1_i32_b64 s0, s[102:103]
-// CHECK: [0x66,0x10,0x80,0xbe]
+s_bcnt1_i32_b64 exec_lo, s[2:3]
+// CHECK: [0x02,0x10,0xfe,0xbe]
 
-s_bcnt1_i32_b64 s0, flat_scratch
-// CHECK: [0x68,0x10,0x80,0xbe]
+s_bcnt1_i32_b64 exec_hi, s[2:3]
+// CHECK: [0x02,0x10,0xff,0xbe]
 
-s_bcnt1_i32_b64 s0, vcc
-// CHECK: [0x6a,0x10,0x80,0xbe]
+s_bcnt1_i32_b64 s5, s[4:5]
+// CHECK: [0x04,0x10,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, tba
-// CHECK: [0x6c,0x10,0x80,0xbe]
+s_bcnt1_i32_b64 s5, s[102:103]
+// CHECK: [0x66,0x10,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, tma
-// CHECK: [0x6e,0x10,0x80,0xbe]
+s_bcnt1_i32_b64 s5, flat_scratch
+// CHECK: [0x68,0x10,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, ttmp[10:11]
-// CHECK: [0x7a,0x10,0x80,0xbe]
+s_bcnt1_i32_b64 s5, vcc
+// CHECK: [0x6a,0x10,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, exec
-// CHECK: [0x7e,0x10,0x80,0xbe]
+s_bcnt1_i32_b64 s5, tba
+// CHECK: [0x6c,0x10,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, 0
-// CHECK: [0x80,0x10,0x80,0xbe]
+s_bcnt1_i32_b64 s5, tma
+// CHECK: [0x6e,0x10,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, -1
-// CHECK: [0xc1,0x10,0x80,0xbe]
+s_bcnt1_i32_b64 s5, ttmp[10:11]
+// CHECK: [0x7a,0x10,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, 0.5
-// CHECK: [0xf0,0x10,0x80,0xbe]
+s_bcnt1_i32_b64 s5, exec
+// CHECK: [0x7e,0x10,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, -4.0
-// CHECK: [0xf7,0x10,0x80,0xbe]
+s_bcnt1_i32_b64 s5, 0
+// CHECK: [0x80,0x10,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, 0xaf123456
-// CHECK: [0xff,0x10,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_bcnt1_i32_b64 s5, -1
+// CHECK: [0xc1,0x10,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, 0x3f717273
-// CHECK: [0xff,0x10,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_bcnt1_i32_b64 s5, 0.5
+// CHECK: [0xf0,0x10,0x85,0xbe]
 
-s_ff0_i32_b32 s0, s0
-// CHECK: [0x00,0x11,0x80,0xbe]
+s_bcnt1_i32_b64 s5, -4.0
+// CHECK: [0xf7,0x10,0x85,0xbe]
 
-s_ff0_i32_b32 s103, s0
-// CHECK: [0x00,0x11,0xe7,0xbe]
+s_bcnt1_i32_b64 s5, 0xaf123456
+// CHECK: [0xff,0x10,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_ff0_i32_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x11,0xe8,0xbe]
+s_bcnt1_i32_b64 s5, 0x3f717273
+// CHECK: [0xff,0x10,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_ff0_i32_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x11,0xe9,0xbe]
+s_ff0_i32_b32 s5, s1
+// CHECK: [0x01,0x11,0x85,0xbe]
 
-s_ff0_i32_b32 vcc_lo, s0
-// CHECK: [0x00,0x11,0xea,0xbe]
+s_ff0_i32_b32 s103, s1
+// CHECK: [0x01,0x11,0xe7,0xbe]
 
-s_ff0_i32_b32 vcc_hi, s0
-// CHECK: [0x00,0x11,0xeb,0xbe]
+s_ff0_i32_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x11,0xe8,0xbe]
 
-s_ff0_i32_b32 tba_lo, s0
-// CHECK: [0x00,0x11,0xec,0xbe]
+s_ff0_i32_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x11,0xe9,0xbe]
 
-s_ff0_i32_b32 tba_hi, s0
-// CHECK: [0x00,0x11,0xed,0xbe]
+s_ff0_i32_b32 vcc_lo, s1
+// CHECK: [0x01,0x11,0xea,0xbe]
 
-s_ff0_i32_b32 tma_lo, s0
-// CHECK: [0x00,0x11,0xee,0xbe]
+s_ff0_i32_b32 vcc_hi, s1
+// CHECK: [0x01,0x11,0xeb,0xbe]
 
-s_ff0_i32_b32 tma_hi, s0
-// CHECK: [0x00,0x11,0xef,0xbe]
+s_ff0_i32_b32 tba_lo, s1
+// CHECK: [0x01,0x11,0xec,0xbe]
 
-s_ff0_i32_b32 ttmp11, s0
-// CHECK: [0x00,0x11,0xfb,0xbe]
+s_ff0_i32_b32 tba_hi, s1
+// CHECK: [0x01,0x11,0xed,0xbe]
 
-s_ff0_i32_b32 m0, s0
-// CHECK: [0x00,0x11,0xfc,0xbe]
+s_ff0_i32_b32 tma_lo, s1
+// CHECK: [0x01,0x11,0xee,0xbe]
 
-s_ff0_i32_b32 exec_lo, s0
-// CHECK: [0x00,0x11,0xfe,0xbe]
+s_ff0_i32_b32 tma_hi, s1
+// CHECK: [0x01,0x11,0xef,0xbe]
 
-s_ff0_i32_b32 exec_hi, s0
-// CHECK: [0x00,0x11,0xff,0xbe]
+s_ff0_i32_b32 ttmp11, s1
+// CHECK: [0x01,0x11,0xfb,0xbe]
 
-s_ff0_i32_b32 s0, s103
-// CHECK: [0x67,0x11,0x80,0xbe]
+s_ff0_i32_b32 m0, s1
+// CHECK: [0x01,0x11,0xfc,0xbe]
 
-s_ff0_i32_b32 s0, flat_scratch_lo
-// CHECK: [0x68,0x11,0x80,0xbe]
+s_ff0_i32_b32 exec_lo, s1
+// CHECK: [0x01,0x11,0xfe,0xbe]
 
-s_ff0_i32_b32 s0, flat_scratch_hi
-// CHECK: [0x69,0x11,0x80,0xbe]
+s_ff0_i32_b32 exec_hi, s1
+// CHECK: [0x01,0x11,0xff,0xbe]
 
-s_ff0_i32_b32 s0, vcc_lo
-// CHECK: [0x6a,0x11,0x80,0xbe]
+s_ff0_i32_b32 s5, s103
+// CHECK: [0x67,0x11,0x85,0xbe]
 
-s_ff0_i32_b32 s0, vcc_hi
-// CHECK: [0x6b,0x11,0x80,0xbe]
+s_ff0_i32_b32 s5, flat_scratch_lo
+// CHECK: [0x68,0x11,0x85,0xbe]
 
-s_ff0_i32_b32 s0, tba_lo
-// CHECK: [0x6c,0x11,0x80,0xbe]
+s_ff0_i32_b32 s5, flat_scratch_hi
+// CHECK: [0x69,0x11,0x85,0xbe]
 
-s_ff0_i32_b32 s0, tba_hi
-// CHECK: [0x6d,0x11,0x80,0xbe]
+s_ff0_i32_b32 s5, vcc_lo
+// CHECK: [0x6a,0x11,0x85,0xbe]
 
-s_ff0_i32_b32 s0, tma_lo
-// CHECK: [0x6e,0x11,0x80,0xbe]
+s_ff0_i32_b32 s5, vcc_hi
+// CHECK: [0x6b,0x11,0x85,0xbe]
 
-s_ff0_i32_b32 s0, tma_hi
-// CHECK: [0x6f,0x11,0x80,0xbe]
+s_ff0_i32_b32 s5, tba_lo
+// CHECK: [0x6c,0x11,0x85,0xbe]
 
-s_ff0_i32_b32 s0, ttmp11
-// CHECK: [0x7b,0x11,0x80,0xbe]
+s_ff0_i32_b32 s5, tba_hi
+// CHECK: [0x6d,0x11,0x85,0xbe]
 
-s_ff0_i32_b32 s0, m0
-// CHECK: [0x7c,0x11,0x80,0xbe]
+s_ff0_i32_b32 s5, tma_lo
+// CHECK: [0x6e,0x11,0x85,0xbe]
 
-s_ff0_i32_b32 s0, exec_lo
-// CHECK: [0x7e,0x11,0x80,0xbe]
+s_ff0_i32_b32 s5, tma_hi
+// CHECK: [0x6f,0x11,0x85,0xbe]
 
-s_ff0_i32_b32 s0, exec_hi
-// CHECK: [0x7f,0x11,0x80,0xbe]
+s_ff0_i32_b32 s5, ttmp11
+// CHECK: [0x7b,0x11,0x85,0xbe]
 
-s_ff0_i32_b32 s0, 0
-// CHECK: [0x80,0x11,0x80,0xbe]
+s_ff0_i32_b32 s5, m0
+// CHECK: [0x7c,0x11,0x85,0xbe]
 
-s_ff0_i32_b32 s0, -1
-// CHECK: [0xc1,0x11,0x80,0xbe]
+s_ff0_i32_b32 s5, exec_lo
+// CHECK: [0x7e,0x11,0x85,0xbe]
 
-s_ff0_i32_b32 s0, 0.5
-// CHECK: [0xf0,0x11,0x80,0xbe]
+s_ff0_i32_b32 s5, exec_hi
+// CHECK: [0x7f,0x11,0x85,0xbe]
 
-s_ff0_i32_b32 s0, -4.0
-// CHECK: [0xf7,0x11,0x80,0xbe]
+s_ff0_i32_b32 s5, 0
+// CHECK: [0x80,0x11,0x85,0xbe]
 
-s_ff0_i32_b32 s0, 0xaf123456
-// CHECK: [0xff,0x11,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_ff0_i32_b32 s5, -1
+// CHECK: [0xc1,0x11,0x85,0xbe]
 
-s_ff0_i32_b32 s0, 0x3f717273
-// CHECK: [0xff,0x11,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_ff0_i32_b32 s5, 0.5
+// CHECK: [0xf0,0x11,0x85,0xbe]
 
-s_ff0_i32_b64 s0, s[0:1]
-// CHECK: [0x00,0x12,0x80,0xbe]
+s_ff0_i32_b32 s5, -4.0
+// CHECK: [0xf7,0x11,0x85,0xbe]
 
-s_ff0_i32_b64 s103, s[0:1]
-// CHECK: [0x00,0x12,0xe7,0xbe]
+s_ff0_i32_b32 s5, 0xaf123456
+// CHECK: [0xff,0x11,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_ff0_i32_b64 flat_scratch_lo, s[0:1]
-// CHECK: [0x00,0x12,0xe8,0xbe]
+s_ff0_i32_b32 s5, 0x3f717273
+// CHECK: [0xff,0x11,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_ff0_i32_b64 flat_scratch_hi, s[0:1]
-// CHECK: [0x00,0x12,0xe9,0xbe]
+s_ff0_i32_b64 s5, s[2:3]
+// CHECK: [0x02,0x12,0x85,0xbe]
 
-s_ff0_i32_b64 vcc_lo, s[0:1]
-// CHECK: [0x00,0x12,0xea,0xbe]
+s_ff0_i32_b64 s103, s[2:3]
+// CHECK: [0x02,0x12,0xe7,0xbe]
 
-s_ff0_i32_b64 vcc_hi, s[0:1]
-// CHECK: [0x00,0x12,0xeb,0xbe]
+s_ff0_i32_b64 flat_scratch_lo, s[2:3]
+// CHECK: [0x02,0x12,0xe8,0xbe]
 
-s_ff0_i32_b64 tba_lo, s[0:1]
-// CHECK: [0x00,0x12,0xec,0xbe]
+s_ff0_i32_b64 flat_scratch_hi, s[2:3]
+// CHECK: [0x02,0x12,0xe9,0xbe]
 
-s_ff0_i32_b64 tba_hi, s[0:1]
-// CHECK: [0x00,0x12,0xed,0xbe]
+s_ff0_i32_b64 vcc_lo, s[2:3]
+// CHECK: [0x02,0x12,0xea,0xbe]
 
-s_ff0_i32_b64 tma_lo, s[0:1]
-// CHECK: [0x00,0x12,0xee,0xbe]
+s_ff0_i32_b64 vcc_hi, s[2:3]
+// CHECK: [0x02,0x12,0xeb,0xbe]
 
-s_ff0_i32_b64 tma_hi, s[0:1]
-// CHECK: [0x00,0x12,0xef,0xbe]
+s_ff0_i32_b64 tba_lo, s[2:3]
+// CHECK: [0x02,0x12,0xec,0xbe]
 
-s_ff0_i32_b64 ttmp11, s[0:1]
-// CHECK: [0x00,0x12,0xfb,0xbe]
+s_ff0_i32_b64 tba_hi, s[2:3]
+// CHECK: [0x02,0x12,0xed,0xbe]
 
-s_ff0_i32_b64 m0, s[0:1]
-// CHECK: [0x00,0x12,0xfc,0xbe]
+s_ff0_i32_b64 tma_lo, s[2:3]
+// CHECK: [0x02,0x12,0xee,0xbe]
 
-s_ff0_i32_b64 exec_lo, s[0:1]
-// CHECK: [0x00,0x12,0xfe,0xbe]
+s_ff0_i32_b64 tma_hi, s[2:3]
+// CHECK: [0x02,0x12,0xef,0xbe]
 
-s_ff0_i32_b64 exec_hi, s[0:1]
-// CHECK: [0x00,0x12,0xff,0xbe]
+s_ff0_i32_b64 ttmp11, s[2:3]
+// CHECK: [0x02,0x12,0xfb,0xbe]
 
-s_ff0_i32_b64 s0, s[2:3]
-// CHECK: [0x02,0x12,0x80,0xbe]
+s_ff0_i32_b64 m0, s[2:3]
+// CHECK: [0x02,0x12,0xfc,0xbe]
 
-s_ff0_i32_b64 s0, s[102:103]
-// CHECK: [0x66,0x12,0x80,0xbe]
+s_ff0_i32_b64 exec_lo, s[2:3]
+// CHECK: [0x02,0x12,0xfe,0xbe]
 
-s_ff0_i32_b64 s0, flat_scratch
-// CHECK: [0x68,0x12,0x80,0xbe]
+s_ff0_i32_b64 exec_hi, s[2:3]
+// CHECK: [0x02,0x12,0xff,0xbe]
 
-s_ff0_i32_b64 s0, vcc
-// CHECK: [0x6a,0x12,0x80,0xbe]
+s_ff0_i32_b64 s5, s[4:5]
+// CHECK: [0x04,0x12,0x85,0xbe]
 
-s_ff0_i32_b64 s0, tba
-// CHECK: [0x6c,0x12,0x80,0xbe]
+s_ff0_i32_b64 s5, s[102:103]
+// CHECK: [0x66,0x12,0x85,0xbe]
 
-s_ff0_i32_b64 s0, tma
-// CHECK: [0x6e,0x12,0x80,0xbe]
+s_ff0_i32_b64 s5, flat_scratch
+// CHECK: [0x68,0x12,0x85,0xbe]
 
-s_ff0_i32_b64 s0, ttmp[10:11]
-// CHECK: [0x7a,0x12,0x80,0xbe]
+s_ff0_i32_b64 s5, vcc
+// CHECK: [0x6a,0x12,0x85,0xbe]
 
-s_ff0_i32_b64 s0, exec
-// CHECK: [0x7e,0x12,0x80,0xbe]
+s_ff0_i32_b64 s5, tba
+// CHECK: [0x6c,0x12,0x85,0xbe]
 
-s_ff0_i32_b64 s0, 0
-// CHECK: [0x80,0x12,0x80,0xbe]
+s_ff0_i32_b64 s5, tma
+// CHECK: [0x6e,0x12,0x85,0xbe]
 
-s_ff0_i32_b64 s0, -1
-// CHECK: [0xc1,0x12,0x80,0xbe]
+s_ff0_i32_b64 s5, ttmp[10:11]
+// CHECK: [0x7a,0x12,0x85,0xbe]
 
-s_ff0_i32_b64 s0, 0.5
-// CHECK: [0xf0,0x12,0x80,0xbe]
+s_ff0_i32_b64 s5, exec
+// CHECK: [0x7e,0x12,0x85,0xbe]
 
-s_ff0_i32_b64 s0, -4.0
-// CHECK: [0xf7,0x12,0x80,0xbe]
+s_ff0_i32_b64 s5, 0
+// CHECK: [0x80,0x12,0x85,0xbe]
 
-s_ff0_i32_b64 s0, 0xaf123456
-// CHECK: [0xff,0x12,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_ff0_i32_b64 s5, -1
+// CHECK: [0xc1,0x12,0x85,0xbe]
 
-s_ff0_i32_b64 s0, 0x3f717273
-// CHECK: [0xff,0x12,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_ff0_i32_b64 s5, 0.5
+// CHECK: [0xf0,0x12,0x85,0xbe]
 
-s_ff1_i32_b32 s0, s0
-// CHECK: [0x00,0x13,0x80,0xbe]
+s_ff0_i32_b64 s5, -4.0
+// CHECK: [0xf7,0x12,0x85,0xbe]
 
-s_ff1_i32_b32 s103, s0
-// CHECK: [0x00,0x13,0xe7,0xbe]
+s_ff0_i32_b64 s5, 0xaf123456
+// CHECK: [0xff,0x12,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_ff1_i32_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x13,0xe8,0xbe]
+s_ff0_i32_b64 s5, 0x3f717273
+// CHECK: [0xff,0x12,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_ff1_i32_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x13,0xe9,0xbe]
+s_ff1_i32_b32 s5, s1
+// CHECK: [0x01,0x13,0x85,0xbe]
 
-s_ff1_i32_b32 vcc_lo, s0
-// CHECK: [0x00,0x13,0xea,0xbe]
+s_ff1_i32_b32 s103, s1
+// CHECK: [0x01,0x13,0xe7,0xbe]
 
-s_ff1_i32_b32 vcc_hi, s0
-// CHECK: [0x00,0x13,0xeb,0xbe]
+s_ff1_i32_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x13,0xe8,0xbe]
 
-s_ff1_i32_b32 tba_lo, s0
-// CHECK: [0x00,0x13,0xec,0xbe]
+s_ff1_i32_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x13,0xe9,0xbe]
 
-s_ff1_i32_b32 tba_hi, s0
-// CHECK: [0x00,0x13,0xed,0xbe]
+s_ff1_i32_b32 vcc_lo, s1
+// CHECK: [0x01,0x13,0xea,0xbe]
 
-s_ff1_i32_b32 tma_lo, s0
-// CHECK: [0x00,0x13,0xee,0xbe]
+s_ff1_i32_b32 vcc_hi, s1
+// CHECK: [0x01,0x13,0xeb,0xbe]
 
-s_ff1_i32_b32 tma_hi, s0
-// CHECK: [0x00,0x13,0xef,0xbe]
+s_ff1_i32_b32 tba_lo, s1
+// CHECK: [0x01,0x13,0xec,0xbe]
 
-s_ff1_i32_b32 ttmp11, s0
-// CHECK: [0x00,0x13,0xfb,0xbe]
+s_ff1_i32_b32 tba_hi, s1
+// CHECK: [0x01,0x13,0xed,0xbe]
 
-s_ff1_i32_b32 m0, s0
-// CHECK: [0x00,0x13,0xfc,0xbe]
+s_ff1_i32_b32 tma_lo, s1
+// CHECK: [0x01,0x13,0xee,0xbe]
 
-s_ff1_i32_b32 exec_lo, s0
-// CHECK: [0x00,0x13,0xfe,0xbe]
+s_ff1_i32_b32 tma_hi, s1
+// CHECK: [0x01,0x13,0xef,0xbe]
 
-s_ff1_i32_b32 exec_hi, s0
-// CHECK: [0x00,0x13,0xff,0xbe]
+s_ff1_i32_b32 ttmp11, s1
+// CHECK: [0x01,0x13,0xfb,0xbe]
 
-s_ff1_i32_b32 s0, s103
-// CHECK: [0x67,0x13,0x80,0xbe]
+s_ff1_i32_b32 m0, s1
+// CHECK: [0x01,0x13,0xfc,0xbe]
 
-s_ff1_i32_b32 s0, flat_scratch_lo
-// CHECK: [0x68,0x13,0x80,0xbe]
+s_ff1_i32_b32 exec_lo, s1
+// CHECK: [0x01,0x13,0xfe,0xbe]
 
-s_ff1_i32_b32 s0, flat_scratch_hi
-// CHECK: [0x69,0x13,0x80,0xbe]
+s_ff1_i32_b32 exec_hi, s1
+// CHECK: [0x01,0x13,0xff,0xbe]
 
-s_ff1_i32_b32 s0, vcc_lo
-// CHECK: [0x6a,0x13,0x80,0xbe]
+s_ff1_i32_b32 s5, s103
+// CHECK: [0x67,0x13,0x85,0xbe]
 
-s_ff1_i32_b32 s0, vcc_hi
-// CHECK: [0x6b,0x13,0x80,0xbe]
+s_ff1_i32_b32 s5, flat_scratch_lo
+// CHECK: [0x68,0x13,0x85,0xbe]
 
-s_ff1_i32_b32 s0, tba_lo
-// CHECK: [0x6c,0x13,0x80,0xbe]
+s_ff1_i32_b32 s5, flat_scratch_hi
+// CHECK: [0x69,0x13,0x85,0xbe]
 
-s_ff1_i32_b32 s0, tba_hi
-// CHECK: [0x6d,0x13,0x80,0xbe]
+s_ff1_i32_b32 s5, vcc_lo
+// CHECK: [0x6a,0x13,0x85,0xbe]
 
-s_ff1_i32_b32 s0, tma_lo
-// CHECK: [0x6e,0x13,0x80,0xbe]
+s_ff1_i32_b32 s5, vcc_hi
+// CHECK: [0x6b,0x13,0x85,0xbe]
 
-s_ff1_i32_b32 s0, tma_hi
-// CHECK: [0x6f,0x13,0x80,0xbe]
+s_ff1_i32_b32 s5, tba_lo
+// CHECK: [0x6c,0x13,0x85,0xbe]
 
-s_ff1_i32_b32 s0, ttmp11
-// CHECK: [0x7b,0x13,0x80,0xbe]
+s_ff1_i32_b32 s5, tba_hi
+// CHECK: [0x6d,0x13,0x85,0xbe]
 
-s_ff1_i32_b32 s0, m0
-// CHECK: [0x7c,0x13,0x80,0xbe]
+s_ff1_i32_b32 s5, tma_lo
+// CHECK: [0x6e,0x13,0x85,0xbe]
 
-s_ff1_i32_b32 s0, exec_lo
-// CHECK: [0x7e,0x13,0x80,0xbe]
+s_ff1_i32_b32 s5, tma_hi
+// CHECK: [0x6f,0x13,0x85,0xbe]
 
-s_ff1_i32_b32 s0, exec_hi
-// CHECK: [0x7f,0x13,0x80,0xbe]
+s_ff1_i32_b32 s5, ttmp11
+// CHECK: [0x7b,0x13,0x85,0xbe]
 
-s_ff1_i32_b32 s0, 0
-// CHECK: [0x80,0x13,0x80,0xbe]
+s_ff1_i32_b32 s5, m0
+// CHECK: [0x7c,0x13,0x85,0xbe]
 
-s_ff1_i32_b32 s0, -1
-// CHECK: [0xc1,0x13,0x80,0xbe]
+s_ff1_i32_b32 s5, exec_lo
+// CHECK: [0x7e,0x13,0x85,0xbe]
 
-s_ff1_i32_b32 s0, 0.5
-// CHECK: [0xf0,0x13,0x80,0xbe]
+s_ff1_i32_b32 s5, exec_hi
+// CHECK: [0x7f,0x13,0x85,0xbe]
 
-s_ff1_i32_b32 s0, -4.0
-// CHECK: [0xf7,0x13,0x80,0xbe]
+s_ff1_i32_b32 s5, 0
+// CHECK: [0x80,0x13,0x85,0xbe]
 
-s_ff1_i32_b32 s0, 0xaf123456
-// CHECK: [0xff,0x13,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_ff1_i32_b32 s5, -1
+// CHECK: [0xc1,0x13,0x85,0xbe]
 
-s_ff1_i32_b32 s0, 0x3f717273
-// CHECK: [0xff,0x13,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_ff1_i32_b32 s5, 0.5
+// CHECK: [0xf0,0x13,0x85,0xbe]
 
-s_ff1_i32_b64 s0, s[0:1]
-// CHECK: [0x00,0x14,0x80,0xbe]
+s_ff1_i32_b32 s5, -4.0
+// CHECK: [0xf7,0x13,0x85,0xbe]
 
-s_ff1_i32_b64 s103, s[0:1]
-// CHECK: [0x00,0x14,0xe7,0xbe]
+s_ff1_i32_b32 s5, 0xaf123456
+// CHECK: [0xff,0x13,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_ff1_i32_b64 flat_scratch_lo, s[0:1]
-// CHECK: [0x00,0x14,0xe8,0xbe]
+s_ff1_i32_b32 s5, 0x3f717273
+// CHECK: [0xff,0x13,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_ff1_i32_b64 flat_scratch_hi, s[0:1]
-// CHECK: [0x00,0x14,0xe9,0xbe]
+s_ff1_i32_b64 s5, s[2:3]
+// CHECK: [0x02,0x14,0x85,0xbe]
 
-s_ff1_i32_b64 vcc_lo, s[0:1]
-// CHECK: [0x00,0x14,0xea,0xbe]
+s_ff1_i32_b64 s103, s[2:3]
+// CHECK: [0x02,0x14,0xe7,0xbe]
 
-s_ff1_i32_b64 vcc_hi, s[0:1]
-// CHECK: [0x00,0x14,0xeb,0xbe]
+s_ff1_i32_b64 flat_scratch_lo, s[2:3]
+// CHECK: [0x02,0x14,0xe8,0xbe]
 
-s_ff1_i32_b64 tba_lo, s[0:1]
-// CHECK: [0x00,0x14,0xec,0xbe]
+s_ff1_i32_b64 flat_scratch_hi, s[2:3]
+// CHECK: [0x02,0x14,0xe9,0xbe]
 
-s_ff1_i32_b64 tba_hi, s[0:1]
-// CHECK: [0x00,0x14,0xed,0xbe]
+s_ff1_i32_b64 vcc_lo, s[2:3]
+// CHECK: [0x02,0x14,0xea,0xbe]
 
-s_ff1_i32_b64 tma_lo, s[0:1]
-// CHECK: [0x00,0x14,0xee,0xbe]
+s_ff1_i32_b64 vcc_hi, s[2:3]
+// CHECK: [0x02,0x14,0xeb,0xbe]
 
-s_ff1_i32_b64 tma_hi, s[0:1]
-// CHECK: [0x00,0x14,0xef,0xbe]
+s_ff1_i32_b64 tba_lo, s[2:3]
+// CHECK: [0x02,0x14,0xec,0xbe]
 
-s_ff1_i32_b64 ttmp11, s[0:1]
-// CHECK: [0x00,0x14,0xfb,0xbe]
+s_ff1_i32_b64 tba_hi, s[2:3]
+// CHECK: [0x02,0x14,0xed,0xbe]
 
-s_ff1_i32_b64 m0, s[0:1]
-// CHECK: [0x00,0x14,0xfc,0xbe]
+s_ff1_i32_b64 tma_lo, s[2:3]
+// CHECK: [0x02,0x14,0xee,0xbe]
 
-s_ff1_i32_b64 exec_lo, s[0:1]
-// CHECK: [0x00,0x14,0xfe,0xbe]
+s_ff1_i32_b64 tma_hi, s[2:3]
+// CHECK: [0x02,0x14,0xef,0xbe]
 
-s_ff1_i32_b64 exec_hi, s[0:1]
-// CHECK: [0x00,0x14,0xff,0xbe]
+s_ff1_i32_b64 ttmp11, s[2:3]
+// CHECK: [0x02,0x14,0xfb,0xbe]
 
-s_ff1_i32_b64 s0, s[2:3]
-// CHECK: [0x02,0x14,0x80,0xbe]
+s_ff1_i32_b64 m0, s[2:3]
+// CHECK: [0x02,0x14,0xfc,0xbe]
 
-s_ff1_i32_b64 s0, s[102:103]
-// CHECK: [0x66,0x14,0x80,0xbe]
+s_ff1_i32_b64 exec_lo, s[2:3]
+// CHECK: [0x02,0x14,0xfe,0xbe]
 
-s_ff1_i32_b64 s0, flat_scratch
-// CHECK: [0x68,0x14,0x80,0xbe]
+s_ff1_i32_b64 exec_hi, s[2:3]
+// CHECK: [0x02,0x14,0xff,0xbe]
 
-s_ff1_i32_b64 s0, vcc
-// CHECK: [0x6a,0x14,0x80,0xbe]
+s_ff1_i32_b64 s5, s[4:5]
+// CHECK: [0x04,0x14,0x85,0xbe]
 
-s_ff1_i32_b64 s0, tba
-// CHECK: [0x6c,0x14,0x80,0xbe]
+s_ff1_i32_b64 s5, s[102:103]
+// CHECK: [0x66,0x14,0x85,0xbe]
 
-s_ff1_i32_b64 s0, tma
-// CHECK: [0x6e,0x14,0x80,0xbe]
+s_ff1_i32_b64 s5, flat_scratch
+// CHECK: [0x68,0x14,0x85,0xbe]
 
-s_ff1_i32_b64 s0, ttmp[10:11]
-// CHECK: [0x7a,0x14,0x80,0xbe]
+s_ff1_i32_b64 s5, vcc
+// CHECK: [0x6a,0x14,0x85,0xbe]
 
-s_ff1_i32_b64 s0, exec
-// CHECK: [0x7e,0x14,0x80,0xbe]
+s_ff1_i32_b64 s5, tba
+// CHECK: [0x6c,0x14,0x85,0xbe]
 
-s_ff1_i32_b64 s0, 0
-// CHECK: [0x80,0x14,0x80,0xbe]
+s_ff1_i32_b64 s5, tma
+// CHECK: [0x6e,0x14,0x85,0xbe]
 
-s_ff1_i32_b64 s0, -1
-// CHECK: [0xc1,0x14,0x80,0xbe]
+s_ff1_i32_b64 s5, ttmp[10:11]
+// CHECK: [0x7a,0x14,0x85,0xbe]
 
-s_ff1_i32_b64 s0, 0.5
-// CHECK: [0xf0,0x14,0x80,0xbe]
+s_ff1_i32_b64 s5, exec
+// CHECK: [0x7e,0x14,0x85,0xbe]
 
-s_ff1_i32_b64 s0, -4.0
-// CHECK: [0xf7,0x14,0x80,0xbe]
+s_ff1_i32_b64 s5, 0
+// CHECK: [0x80,0x14,0x85,0xbe]
 
-s_ff1_i32_b64 s0, 0xaf123456
-// CHECK: [0xff,0x14,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_ff1_i32_b64 s5, -1
+// CHECK: [0xc1,0x14,0x85,0xbe]
 
-s_ff1_i32_b64 s0, 0x3f717273
-// CHECK: [0xff,0x14,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_ff1_i32_b64 s5, 0.5
+// CHECK: [0xf0,0x14,0x85,0xbe]
 
-s_flbit_i32_b32 s0, s0
-// CHECK: [0x00,0x15,0x80,0xbe]
+s_ff1_i32_b64 s5, -4.0
+// CHECK: [0xf7,0x14,0x85,0xbe]
 
-s_flbit_i32_b32 s103, s0
-// CHECK: [0x00,0x15,0xe7,0xbe]
+s_ff1_i32_b64 s5, 0xaf123456
+// CHECK: [0xff,0x14,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_flbit_i32_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x15,0xe8,0xbe]
+s_ff1_i32_b64 s5, 0x3f717273
+// CHECK: [0xff,0x14,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_flbit_i32_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x15,0xe9,0xbe]
+s_flbit_i32_b32 s5, s1
+// CHECK: [0x01,0x15,0x85,0xbe]
 
-s_flbit_i32_b32 vcc_lo, s0
-// CHECK: [0x00,0x15,0xea,0xbe]
+s_flbit_i32_b32 s103, s1
+// CHECK: [0x01,0x15,0xe7,0xbe]
 
-s_flbit_i32_b32 vcc_hi, s0
-// CHECK: [0x00,0x15,0xeb,0xbe]
+s_flbit_i32_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x15,0xe8,0xbe]
 
-s_flbit_i32_b32 tba_lo, s0
-// CHECK: [0x00,0x15,0xec,0xbe]
+s_flbit_i32_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x15,0xe9,0xbe]
 
-s_flbit_i32_b32 tba_hi, s0
-// CHECK: [0x00,0x15,0xed,0xbe]
+s_flbit_i32_b32 vcc_lo, s1
+// CHECK: [0x01,0x15,0xea,0xbe]
 
-s_flbit_i32_b32 tma_lo, s0
-// CHECK: [0x00,0x15,0xee,0xbe]
+s_flbit_i32_b32 vcc_hi, s1
+// CHECK: [0x01,0x15,0xeb,0xbe]
 
-s_flbit_i32_b32 tma_hi, s0
-// CHECK: [0x00,0x15,0xef,0xbe]
+s_flbit_i32_b32 tba_lo, s1
+// CHECK: [0x01,0x15,0xec,0xbe]
 
-s_flbit_i32_b32 ttmp11, s0
-// CHECK: [0x00,0x15,0xfb,0xbe]
+s_flbit_i32_b32 tba_hi, s1
+// CHECK: [0x01,0x15,0xed,0xbe]
 
-s_flbit_i32_b32 m0, s0
-// CHECK: [0x00,0x15,0xfc,0xbe]
+s_flbit_i32_b32 tma_lo, s1
+// CHECK: [0x01,0x15,0xee,0xbe]
 
-s_flbit_i32_b32 exec_lo, s0
-// CHECK: [0x00,0x15,0xfe,0xbe]
+s_flbit_i32_b32 tma_hi, s1
+// CHECK: [0x01,0x15,0xef,0xbe]
 
-s_flbit_i32_b32 exec_hi, s0
-// CHECK: [0x00,0x15,0xff,0xbe]
+s_flbit_i32_b32 ttmp11, s1
+// CHECK: [0x01,0x15,0xfb,0xbe]
 
-s_flbit_i32_b32 s0, s103
-// CHECK: [0x67,0x15,0x80,0xbe]
+s_flbit_i32_b32 m0, s1
+// CHECK: [0x01,0x15,0xfc,0xbe]
 
-s_flbit_i32_b32 s0, flat_scratch_lo
-// CHECK: [0x68,0x15,0x80,0xbe]
+s_flbit_i32_b32 exec_lo, s1
+// CHECK: [0x01,0x15,0xfe,0xbe]
 
-s_flbit_i32_b32 s0, flat_scratch_hi
-// CHECK: [0x69,0x15,0x80,0xbe]
+s_flbit_i32_b32 exec_hi, s1
+// CHECK: [0x01,0x15,0xff,0xbe]
 
-s_flbit_i32_b32 s0, vcc_lo
-// CHECK: [0x6a,0x15,0x80,0xbe]
+s_flbit_i32_b32 s5, s103
+// CHECK: [0x67,0x15,0x85,0xbe]
 
-s_flbit_i32_b32 s0, vcc_hi
-// CHECK: [0x6b,0x15,0x80,0xbe]
+s_flbit_i32_b32 s5, flat_scratch_lo
+// CHECK: [0x68,0x15,0x85,0xbe]
 
-s_flbit_i32_b32 s0, tba_lo
-// CHECK: [0x6c,0x15,0x80,0xbe]
+s_flbit_i32_b32 s5, flat_scratch_hi
+// CHECK: [0x69,0x15,0x85,0xbe]
 
-s_flbit_i32_b32 s0, tba_hi
-// CHECK: [0x6d,0x15,0x80,0xbe]
+s_flbit_i32_b32 s5, vcc_lo
+// CHECK: [0x6a,0x15,0x85,0xbe]
 
-s_flbit_i32_b32 s0, tma_lo
-// CHECK: [0x6e,0x15,0x80,0xbe]
+s_flbit_i32_b32 s5, vcc_hi
+// CHECK: [0x6b,0x15,0x85,0xbe]
 
-s_flbit_i32_b32 s0, tma_hi
-// CHECK: [0x6f,0x15,0x80,0xbe]
+s_flbit_i32_b32 s5, tba_lo
+// CHECK: [0x6c,0x15,0x85,0xbe]
 
-s_flbit_i32_b32 s0, ttmp11
-// CHECK: [0x7b,0x15,0x80,0xbe]
+s_flbit_i32_b32 s5, tba_hi
+// CHECK: [0x6d,0x15,0x85,0xbe]
 
-s_flbit_i32_b32 s0, m0
-// CHECK: [0x7c,0x15,0x80,0xbe]
+s_flbit_i32_b32 s5, tma_lo
+// CHECK: [0x6e,0x15,0x85,0xbe]
 
-s_flbit_i32_b32 s0, exec_lo
-// CHECK: [0x7e,0x15,0x80,0xbe]
+s_flbit_i32_b32 s5, tma_hi
+// CHECK: [0x6f,0x15,0x85,0xbe]
 
-s_flbit_i32_b32 s0, exec_hi
-// CHECK: [0x7f,0x15,0x80,0xbe]
+s_flbit_i32_b32 s5, ttmp11
+// CHECK: [0x7b,0x15,0x85,0xbe]
 
-s_flbit_i32_b32 s0, 0
-// CHECK: [0x80,0x15,0x80,0xbe]
+s_flbit_i32_b32 s5, m0
+// CHECK: [0x7c,0x15,0x85,0xbe]
 
-s_flbit_i32_b32 s0, -1
-// CHECK: [0xc1,0x15,0x80,0xbe]
+s_flbit_i32_b32 s5, exec_lo
+// CHECK: [0x7e,0x15,0x85,0xbe]
 
-s_flbit_i32_b32 s0, 0.5
-// CHECK: [0xf0,0x15,0x80,0xbe]
+s_flbit_i32_b32 s5, exec_hi
+// CHECK: [0x7f,0x15,0x85,0xbe]
 
-s_flbit_i32_b32 s0, -4.0
-// CHECK: [0xf7,0x15,0x80,0xbe]
+s_flbit_i32_b32 s5, 0
+// CHECK: [0x80,0x15,0x85,0xbe]
 
-s_flbit_i32_b32 s0, 0xaf123456
-// CHECK: [0xff,0x15,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_flbit_i32_b32 s5, -1
+// CHECK: [0xc1,0x15,0x85,0xbe]
 
-s_flbit_i32_b32 s0, 0x3f717273
-// CHECK: [0xff,0x15,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_flbit_i32_b32 s5, 0.5
+// CHECK: [0xf0,0x15,0x85,0xbe]
 
-s_flbit_i32_b64 s0, s[0:1]
-// CHECK: [0x00,0x16,0x80,0xbe]
+s_flbit_i32_b32 s5, -4.0
+// CHECK: [0xf7,0x15,0x85,0xbe]
 
-s_flbit_i32_b64 s103, s[0:1]
-// CHECK: [0x00,0x16,0xe7,0xbe]
+s_flbit_i32_b32 s5, 0xaf123456
+// CHECK: [0xff,0x15,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_flbit_i32_b64 flat_scratch_lo, s[0:1]
-// CHECK: [0x00,0x16,0xe8,0xbe]
+s_flbit_i32_b32 s5, 0x3f717273
+// CHECK: [0xff,0x15,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_flbit_i32_b64 flat_scratch_hi, s[0:1]
-// CHECK: [0x00,0x16,0xe9,0xbe]
+s_flbit_i32_b64 s5, s[2:3]
+// CHECK: [0x02,0x16,0x85,0xbe]
 
-s_flbit_i32_b64 vcc_lo, s[0:1]
-// CHECK: [0x00,0x16,0xea,0xbe]
+s_flbit_i32_b64 s103, s[2:3]
+// CHECK: [0x02,0x16,0xe7,0xbe]
 
-s_flbit_i32_b64 vcc_hi, s[0:1]
-// CHECK: [0x00,0x16,0xeb,0xbe]
+s_flbit_i32_b64 flat_scratch_lo, s[2:3]
+// CHECK: [0x02,0x16,0xe8,0xbe]
 
-s_flbit_i32_b64 tba_lo, s[0:1]
-// CHECK: [0x00,0x16,0xec,0xbe]
+s_flbit_i32_b64 flat_scratch_hi, s[2:3]
+// CHECK: [0x02,0x16,0xe9,0xbe]
 
-s_flbit_i32_b64 tba_hi, s[0:1]
-// CHECK: [0x00,0x16,0xed,0xbe]
+s_flbit_i32_b64 vcc_lo, s[2:3]
+// CHECK: [0x02,0x16,0xea,0xbe]
 
-s_flbit_i32_b64 tma_lo, s[0:1]
-// CHECK: [0x00,0x16,0xee,0xbe]
+s_flbit_i32_b64 vcc_hi, s[2:3]
+// CHECK: [0x02,0x16,0xeb,0xbe]
 
-s_flbit_i32_b64 tma_hi, s[0:1]
-// CHECK: [0x00,0x16,0xef,0xbe]
+s_flbit_i32_b64 tba_lo, s[2:3]
+// CHECK: [0x02,0x16,0xec,0xbe]
 
-s_flbit_i32_b64 ttmp11, s[0:1]
-// CHECK: [0x00,0x16,0xfb,0xbe]
+s_flbit_i32_b64 tba_hi, s[2:3]
+// CHECK: [0x02,0x16,0xed,0xbe]
 
-s_flbit_i32_b64 m0, s[0:1]
-// CHECK: [0x00,0x16,0xfc,0xbe]
+s_flbit_i32_b64 tma_lo, s[2:3]
+// CHECK: [0x02,0x16,0xee,0xbe]
 
-s_flbit_i32_b64 exec_lo, s[0:1]
-// CHECK: [0x00,0x16,0xfe,0xbe]
+s_flbit_i32_b64 tma_hi, s[2:3]
+// CHECK: [0x02,0x16,0xef,0xbe]
 
-s_flbit_i32_b64 exec_hi, s[0:1]
-// CHECK: [0x00,0x16,0xff,0xbe]
+s_flbit_i32_b64 ttmp11, s[2:3]
+// CHECK: [0x02,0x16,0xfb,0xbe]
 
-s_flbit_i32_b64 s0, s[2:3]
-// CHECK: [0x02,0x16,0x80,0xbe]
+s_flbit_i32_b64 m0, s[2:3]
+// CHECK: [0x02,0x16,0xfc,0xbe]
 
-s_flbit_i32_b64 s0, s[102:103]
-// CHECK: [0x66,0x16,0x80,0xbe]
+s_flbit_i32_b64 exec_lo, s[2:3]
+// CHECK: [0x02,0x16,0xfe,0xbe]
 
-s_flbit_i32_b64 s0, flat_scratch
-// CHECK: [0x68,0x16,0x80,0xbe]
+s_flbit_i32_b64 exec_hi, s[2:3]
+// CHECK: [0x02,0x16,0xff,0xbe]
 
-s_flbit_i32_b64 s0, vcc
-// CHECK: [0x6a,0x16,0x80,0xbe]
+s_flbit_i32_b64 s5, s[4:5]
+// CHECK: [0x04,0x16,0x85,0xbe]
 
-s_flbit_i32_b64 s0, tba
-// CHECK: [0x6c,0x16,0x80,0xbe]
+s_flbit_i32_b64 s5, s[102:103]
+// CHECK: [0x66,0x16,0x85,0xbe]
 
-s_flbit_i32_b64 s0, tma
-// CHECK: [0x6e,0x16,0x80,0xbe]
+s_flbit_i32_b64 s5, flat_scratch
+// CHECK: [0x68,0x16,0x85,0xbe]
 
-s_flbit_i32_b64 s0, ttmp[10:11]
-// CHECK: [0x7a,0x16,0x80,0xbe]
+s_flbit_i32_b64 s5, vcc
+// CHECK: [0x6a,0x16,0x85,0xbe]
 
-s_flbit_i32_b64 s0, exec
-// CHECK: [0x7e,0x16,0x80,0xbe]
+s_flbit_i32_b64 s5, tba
+// CHECK: [0x6c,0x16,0x85,0xbe]
 
-s_flbit_i32_b64 s0, 0
-// CHECK: [0x80,0x16,0x80,0xbe]
+s_flbit_i32_b64 s5, tma
+// CHECK: [0x6e,0x16,0x85,0xbe]
 
-s_flbit_i32_b64 s0, -1
-// CHECK: [0xc1,0x16,0x80,0xbe]
+s_flbit_i32_b64 s5, ttmp[10:11]
+// CHECK: [0x7a,0x16,0x85,0xbe]
 
-s_flbit_i32_b64 s0, 0.5
-// CHECK: [0xf0,0x16,0x80,0xbe]
+s_flbit_i32_b64 s5, exec
+// CHECK: [0x7e,0x16,0x85,0xbe]
 
-s_flbit_i32_b64 s0, -4.0
-// CHECK: [0xf7,0x16,0x80,0xbe]
+s_flbit_i32_b64 s5, 0
+// CHECK: [0x80,0x16,0x85,0xbe]
 
-s_flbit_i32_b64 s0, 0xaf123456
-// CHECK: [0xff,0x16,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_flbit_i32_b64 s5, -1
+// CHECK: [0xc1,0x16,0x85,0xbe]
 
-s_flbit_i32_b64 s0, 0x3f717273
-// CHECK: [0xff,0x16,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_flbit_i32_b64 s5, 0.5
+// CHECK: [0xf0,0x16,0x85,0xbe]
 
-s_flbit_i32 s0, s0
-// CHECK: [0x00,0x17,0x80,0xbe]
+s_flbit_i32_b64 s5, -4.0
+// CHECK: [0xf7,0x16,0x85,0xbe]
 
-s_flbit_i32 s103, s0
-// CHECK: [0x00,0x17,0xe7,0xbe]
+s_flbit_i32_b64 s5, 0xaf123456
+// CHECK: [0xff,0x16,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_flbit_i32 flat_scratch_lo, s0
-// CHECK: [0x00,0x17,0xe8,0xbe]
+s_flbit_i32_b64 s5, 0x3f717273
+// CHECK: [0xff,0x16,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_flbit_i32 flat_scratch_hi, s0
-// CHECK: [0x00,0x17,0xe9,0xbe]
+s_flbit_i32 s5, s1
+// CHECK: [0x01,0x17,0x85,0xbe]
 
-s_flbit_i32 vcc_lo, s0
-// CHECK: [0x00,0x17,0xea,0xbe]
+s_flbit_i32 s103, s1
+// CHECK: [0x01,0x17,0xe7,0xbe]
 
-s_flbit_i32 vcc_hi, s0
-// CHECK: [0x00,0x17,0xeb,0xbe]
+s_flbit_i32 flat_scratch_lo, s1
+// CHECK: [0x01,0x17,0xe8,0xbe]
 
-s_flbit_i32 tba_lo, s0
-// CHECK: [0x00,0x17,0xec,0xbe]
+s_flbit_i32 flat_scratch_hi, s1
+// CHECK: [0x01,0x17,0xe9,0xbe]
 
-s_flbit_i32 tba_hi, s0
-// CHECK: [0x00,0x17,0xed,0xbe]
+s_flbit_i32 vcc_lo, s1
+// CHECK: [0x01,0x17,0xea,0xbe]
 
-s_flbit_i32 tma_lo, s0
-// CHECK: [0x00,0x17,0xee,0xbe]
+s_flbit_i32 vcc_hi, s1
+// CHECK: [0x01,0x17,0xeb,0xbe]
 
-s_flbit_i32 tma_hi, s0
-// CHECK: [0x00,0x17,0xef,0xbe]
+s_flbit_i32 tba_lo, s1
+// CHECK: [0x01,0x17,0xec,0xbe]
 
-s_flbit_i32 ttmp11, s0
-// CHECK: [0x00,0x17,0xfb,0xbe]
+s_flbit_i32 tba_hi, s1
+// CHECK: [0x01,0x17,0xed,0xbe]
 
-s_flbit_i32 m0, s0
-// CHECK: [0x00,0x17,0xfc,0xbe]
+s_flbit_i32 tma_lo, s1
+// CHECK: [0x01,0x17,0xee,0xbe]
 
-s_flbit_i32 exec_lo, s0
-// CHECK: [0x00,0x17,0xfe,0xbe]
+s_flbit_i32 tma_hi, s1
+// CHECK: [0x01,0x17,0xef,0xbe]
 
-s_flbit_i32 exec_hi, s0
-// CHECK: [0x00,0x17,0xff,0xbe]
+s_flbit_i32 ttmp11, s1
+// CHECK: [0x01,0x17,0xfb,0xbe]
 
-s_flbit_i32 s0, s103
-// CHECK: [0x67,0x17,0x80,0xbe]
+s_flbit_i32 m0, s1
+// CHECK: [0x01,0x17,0xfc,0xbe]
 
-s_flbit_i32 s0, flat_scratch_lo
-// CHECK: [0x68,0x17,0x80,0xbe]
+s_flbit_i32 exec_lo, s1
+// CHECK: [0x01,0x17,0xfe,0xbe]
 
-s_flbit_i32 s0, flat_scratch_hi
-// CHECK: [0x69,0x17,0x80,0xbe]
+s_flbit_i32 exec_hi, s1
+// CHECK: [0x01,0x17,0xff,0xbe]
 
-s_flbit_i32 s0, vcc_lo
-// CHECK: [0x6a,0x17,0x80,0xbe]
+s_flbit_i32 s5, s103
+// CHECK: [0x67,0x17,0x85,0xbe]
 
-s_flbit_i32 s0, vcc_hi
-// CHECK: [0x6b,0x17,0x80,0xbe]
+s_flbit_i32 s5, flat_scratch_lo
+// CHECK: [0x68,0x17,0x85,0xbe]
 
-s_flbit_i32 s0, tba_lo
-// CHECK: [0x6c,0x17,0x80,0xbe]
+s_flbit_i32 s5, flat_scratch_hi
+// CHECK: [0x69,0x17,0x85,0xbe]
 
-s_flbit_i32 s0, tba_hi
-// CHECK: [0x6d,0x17,0x80,0xbe]
+s_flbit_i32 s5, vcc_lo
+// CHECK: [0x6a,0x17,0x85,0xbe]
 
-s_flbit_i32 s0, tma_lo
-// CHECK: [0x6e,0x17,0x80,0xbe]
+s_flbit_i32 s5, vcc_hi
+// CHECK: [0x6b,0x17,0x85,0xbe]
 
-s_flbit_i32 s0, tma_hi
-// CHECK: [0x6f,0x17,0x80,0xbe]
+s_flbit_i32 s5, tba_lo
+// CHECK: [0x6c,0x17,0x85,0xbe]
 
-s_flbit_i32 s0, ttmp11
-// CHECK: [0x7b,0x17,0x80,0xbe]
+s_flbit_i32 s5, tba_hi
+// CHECK: [0x6d,0x17,0x85,0xbe]
 
-s_flbit_i32 s0, m0
-// CHECK: [0x7c,0x17,0x80,0xbe]
+s_flbit_i32 s5, tma_lo
+// CHECK: [0x6e,0x17,0x85,0xbe]
 
-s_flbit_i32 s0, exec_lo
-// CHECK: [0x7e,0x17,0x80,0xbe]
+s_flbit_i32 s5, tma_hi
+// CHECK: [0x6f,0x17,0x85,0xbe]
 
-s_flbit_i32 s0, exec_hi
-// CHECK: [0x7f,0x17,0x80,0xbe]
+s_flbit_i32 s5, ttmp11
+// CHECK: [0x7b,0x17,0x85,0xbe]
 
-s_flbit_i32 s0, 0
-// CHECK: [0x80,0x17,0x80,0xbe]
+s_flbit_i32 s5, m0
+// CHECK: [0x7c,0x17,0x85,0xbe]
 
-s_flbit_i32 s0, -1
-// CHECK: [0xc1,0x17,0x80,0xbe]
+s_flbit_i32 s5, exec_lo
+// CHECK: [0x7e,0x17,0x85,0xbe]
 
-s_flbit_i32 s0, 0.5
-// CHECK: [0xf0,0x17,0x80,0xbe]
+s_flbit_i32 s5, exec_hi
+// CHECK: [0x7f,0x17,0x85,0xbe]
 
-s_flbit_i32 s0, -4.0
-// CHECK: [0xf7,0x17,0x80,0xbe]
+s_flbit_i32 s5, 0
+// CHECK: [0x80,0x17,0x85,0xbe]
 
-s_flbit_i32 s0, 0xaf123456
-// CHECK: [0xff,0x17,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_flbit_i32 s5, -1
+// CHECK: [0xc1,0x17,0x85,0xbe]
 
-s_flbit_i32 s0, 0x3f717273
-// CHECK: [0xff,0x17,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_flbit_i32 s5, 0.5
+// CHECK: [0xf0,0x17,0x85,0xbe]
 
-s_flbit_i32_i64 s0, s[0:1]
-// CHECK: [0x00,0x18,0x80,0xbe]
+s_flbit_i32 s5, -4.0
+// CHECK: [0xf7,0x17,0x85,0xbe]
 
-s_flbit_i32_i64 s103, s[0:1]
-// CHECK: [0x00,0x18,0xe7,0xbe]
+s_flbit_i32 s5, 0xaf123456
+// CHECK: [0xff,0x17,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_flbit_i32_i64 flat_scratch_lo, s[0:1]
-// CHECK: [0x00,0x18,0xe8,0xbe]
+s_flbit_i32 s5, 0x3f717273
+// CHECK: [0xff,0x17,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_flbit_i32_i64 flat_scratch_hi, s[0:1]
-// CHECK: [0x00,0x18,0xe9,0xbe]
+s_flbit_i32_i64 s5, s[2:3]
+// CHECK: [0x02,0x18,0x85,0xbe]
 
-s_flbit_i32_i64 vcc_lo, s[0:1]
-// CHECK: [0x00,0x18,0xea,0xbe]
+s_flbit_i32_i64 s103, s[2:3]
+// CHECK: [0x02,0x18,0xe7,0xbe]
 
-s_flbit_i32_i64 vcc_hi, s[0:1]
-// CHECK: [0x00,0x18,0xeb,0xbe]
+s_flbit_i32_i64 flat_scratch_lo, s[2:3]
+// CHECK: [0x02,0x18,0xe8,0xbe]
 
-s_flbit_i32_i64 tba_lo, s[0:1]
-// CHECK: [0x00,0x18,0xec,0xbe]
+s_flbit_i32_i64 flat_scratch_hi, s[2:3]
+// CHECK: [0x02,0x18,0xe9,0xbe]
 
-s_flbit_i32_i64 tba_hi, s[0:1]
-// CHECK: [0x00,0x18,0xed,0xbe]
+s_flbit_i32_i64 vcc_lo, s[2:3]
+// CHECK: [0x02,0x18,0xea,0xbe]
 
-s_flbit_i32_i64 tma_lo, s[0:1]
-// CHECK: [0x00,0x18,0xee,0xbe]
+s_flbit_i32_i64 vcc_hi, s[2:3]
+// CHECK: [0x02,0x18,0xeb,0xbe]
 
-s_flbit_i32_i64 tma_hi, s[0:1]
-// CHECK: [0x00,0x18,0xef,0xbe]
+s_flbit_i32_i64 tba_lo, s[2:3]
+// CHECK: [0x02,0x18,0xec,0xbe]
 
-s_flbit_i32_i64 ttmp11, s[0:1]
-// CHECK: [0x00,0x18,0xfb,0xbe]
+s_flbit_i32_i64 tba_hi, s[2:3]
+// CHECK: [0x02,0x18,0xed,0xbe]
 
-s_flbit_i32_i64 m0, s[0:1]
-// CHECK: [0x00,0x18,0xfc,0xbe]
+s_flbit_i32_i64 tma_lo, s[2:3]
+// CHECK: [0x02,0x18,0xee,0xbe]
 
-s_flbit_i32_i64 exec_lo, s[0:1]
-// CHECK: [0x00,0x18,0xfe,0xbe]
+s_flbit_i32_i64 tma_hi, s[2:3]
+// CHECK: [0x02,0x18,0xef,0xbe]
 
-s_flbit_i32_i64 exec_hi, s[0:1]
-// CHECK: [0x00,0x18,0xff,0xbe]
+s_flbit_i32_i64 ttmp11, s[2:3]
+// CHECK: [0x02,0x18,0xfb,0xbe]
 
-s_flbit_i32_i64 s0, s[2:3]
-// CHECK: [0x02,0x18,0x80,0xbe]
+s_flbit_i32_i64 m0, s[2:3]
+// CHECK: [0x02,0x18,0xfc,0xbe]
 
-s_flbit_i32_i64 s0, s[102:103]
-// CHECK: [0x66,0x18,0x80,0xbe]
+s_flbit_i32_i64 exec_lo, s[2:3]
+// CHECK: [0x02,0x18,0xfe,0xbe]
 
-s_flbit_i32_i64 s0, flat_scratch
-// CHECK: [0x68,0x18,0x80,0xbe]
+s_flbit_i32_i64 exec_hi, s[2:3]
+// CHECK: [0x02,0x18,0xff,0xbe]
 
-s_flbit_i32_i64 s0, vcc
-// CHECK: [0x6a,0x18,0x80,0xbe]
+s_flbit_i32_i64 s5, s[4:5]
+// CHECK: [0x04,0x18,0x85,0xbe]
 
-s_flbit_i32_i64 s0, tba
-// CHECK: [0x6c,0x18,0x80,0xbe]
+s_flbit_i32_i64 s5, s[102:103]
+// CHECK: [0x66,0x18,0x85,0xbe]
 
-s_flbit_i32_i64 s0, tma
-// CHECK: [0x6e,0x18,0x80,0xbe]
+s_flbit_i32_i64 s5, flat_scratch
+// CHECK: [0x68,0x18,0x85,0xbe]
 
-s_flbit_i32_i64 s0, ttmp[10:11]
-// CHECK: [0x7a,0x18,0x80,0xbe]
+s_flbit_i32_i64 s5, vcc
+// CHECK: [0x6a,0x18,0x85,0xbe]
 
-s_flbit_i32_i64 s0, exec
-// CHECK: [0x7e,0x18,0x80,0xbe]
+s_flbit_i32_i64 s5, tba
+// CHECK: [0x6c,0x18,0x85,0xbe]
 
-s_flbit_i32_i64 s0, 0
-// CHECK: [0x80,0x18,0x80,0xbe]
+s_flbit_i32_i64 s5, tma
+// CHECK: [0x6e,0x18,0x85,0xbe]
 
-s_flbit_i32_i64 s0, -1
-// CHECK: [0xc1,0x18,0x80,0xbe]
+s_flbit_i32_i64 s5, ttmp[10:11]
+// CHECK: [0x7a,0x18,0x85,0xbe]
 
-s_flbit_i32_i64 s0, 0.5
-// CHECK: [0xf0,0x18,0x80,0xbe]
+s_flbit_i32_i64 s5, exec
+// CHECK: [0x7e,0x18,0x85,0xbe]
 
-s_flbit_i32_i64 s0, -4.0
-// CHECK: [0xf7,0x18,0x80,0xbe]
+s_flbit_i32_i64 s5, 0
+// CHECK: [0x80,0x18,0x85,0xbe]
 
-s_flbit_i32_i64 s0, 0xaf123456
-// CHECK: [0xff,0x18,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_flbit_i32_i64 s5, -1
+// CHECK: [0xc1,0x18,0x85,0xbe]
 
-s_flbit_i32_i64 s0, 0x3f717273
-// CHECK: [0xff,0x18,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_flbit_i32_i64 s5, 0.5
+// CHECK: [0xf0,0x18,0x85,0xbe]
 
-s_sext_i32_i8 s0, s0
-// CHECK: [0x00,0x19,0x80,0xbe]
+s_flbit_i32_i64 s5, -4.0
+// CHECK: [0xf7,0x18,0x85,0xbe]
 
-s_sext_i32_i8 s103, s0
-// CHECK: [0x00,0x19,0xe7,0xbe]
+s_flbit_i32_i64 s5, 0xaf123456
+// CHECK: [0xff,0x18,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_sext_i32_i8 flat_scratch_lo, s0
-// CHECK: [0x00,0x19,0xe8,0xbe]
+s_flbit_i32_i64 s5, 0x3f717273
+// CHECK: [0xff,0x18,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_sext_i32_i8 flat_scratch_hi, s0
-// CHECK: [0x00,0x19,0xe9,0xbe]
+s_sext_i32_i8 s5, s1
+// CHECK: [0x01,0x19,0x85,0xbe]
 
-s_sext_i32_i8 vcc_lo, s0
-// CHECK: [0x00,0x19,0xea,0xbe]
+s_sext_i32_i8 s103, s1
+// CHECK: [0x01,0x19,0xe7,0xbe]
 
-s_sext_i32_i8 vcc_hi, s0
-// CHECK: [0x00,0x19,0xeb,0xbe]
+s_sext_i32_i8 flat_scratch_lo, s1
+// CHECK: [0x01,0x19,0xe8,0xbe]
 
-s_sext_i32_i8 tba_lo, s0
-// CHECK: [0x00,0x19,0xec,0xbe]
+s_sext_i32_i8 flat_scratch_hi, s1
+// CHECK: [0x01,0x19,0xe9,0xbe]
 
-s_sext_i32_i8 tba_hi, s0
-// CHECK: [0x00,0x19,0xed,0xbe]
+s_sext_i32_i8 vcc_lo, s1
+// CHECK: [0x01,0x19,0xea,0xbe]
 
-s_sext_i32_i8 tma_lo, s0
-// CHECK: [0x00,0x19,0xee,0xbe]
+s_sext_i32_i8 vcc_hi, s1
+// CHECK: [0x01,0x19,0xeb,0xbe]
 
-s_sext_i32_i8 tma_hi, s0
-// CHECK: [0x00,0x19,0xef,0xbe]
+s_sext_i32_i8 tba_lo, s1
+// CHECK: [0x01,0x19,0xec,0xbe]
 
-s_sext_i32_i8 ttmp11, s0
-// CHECK: [0x00,0x19,0xfb,0xbe]
+s_sext_i32_i8 tba_hi, s1
+// CHECK: [0x01,0x19,0xed,0xbe]
 
-s_sext_i32_i8 m0, s0
-// CHECK: [0x00,0x19,0xfc,0xbe]
+s_sext_i32_i8 tma_lo, s1
+// CHECK: [0x01,0x19,0xee,0xbe]
 
-s_sext_i32_i8 exec_lo, s0
-// CHECK: [0x00,0x19,0xfe,0xbe]
+s_sext_i32_i8 tma_hi, s1
+// CHECK: [0x01,0x19,0xef,0xbe]
 
-s_sext_i32_i8 exec_hi, s0
-// CHECK: [0x00,0x19,0xff,0xbe]
+s_sext_i32_i8 ttmp11, s1
+// CHECK: [0x01,0x19,0xfb,0xbe]
 
-s_sext_i32_i8 s0, s103
-// CHECK: [0x67,0x19,0x80,0xbe]
+s_sext_i32_i8 m0, s1
+// CHECK: [0x01,0x19,0xfc,0xbe]
 
-s_sext_i32_i8 s0, flat_scratch_lo
-// CHECK: [0x68,0x19,0x80,0xbe]
+s_sext_i32_i8 exec_lo, s1
+// CHECK: [0x01,0x19,0xfe,0xbe]
 
-s_sext_i32_i8 s0, flat_scratch_hi
-// CHECK: [0x69,0x19,0x80,0xbe]
+s_sext_i32_i8 exec_hi, s1
+// CHECK: [0x01,0x19,0xff,0xbe]
 
-s_sext_i32_i8 s0, vcc_lo
-// CHECK: [0x6a,0x19,0x80,0xbe]
+s_sext_i32_i8 s5, s103
+// CHECK: [0x67,0x19,0x85,0xbe]
 
-s_sext_i32_i8 s0, vcc_hi
-// CHECK: [0x6b,0x19,0x80,0xbe]
+s_sext_i32_i8 s5, flat_scratch_lo
+// CHECK: [0x68,0x19,0x85,0xbe]
 
-s_sext_i32_i8 s0, tba_lo
-// CHECK: [0x6c,0x19,0x80,0xbe]
+s_sext_i32_i8 s5, flat_scratch_hi
+// CHECK: [0x69,0x19,0x85,0xbe]
 
-s_sext_i32_i8 s0, tba_hi
-// CHECK: [0x6d,0x19,0x80,0xbe]
+s_sext_i32_i8 s5, vcc_lo
+// CHECK: [0x6a,0x19,0x85,0xbe]
 
-s_sext_i32_i8 s0, tma_lo
-// CHECK: [0x6e,0x19,0x80,0xbe]
+s_sext_i32_i8 s5, vcc_hi
+// CHECK: [0x6b,0x19,0x85,0xbe]
 
-s_sext_i32_i8 s0, tma_hi
-// CHECK: [0x6f,0x19,0x80,0xbe]
+s_sext_i32_i8 s5, tba_lo
+// CHECK: [0x6c,0x19,0x85,0xbe]
 
-s_sext_i32_i8 s0, ttmp11
-// CHECK: [0x7b,0x19,0x80,0xbe]
+s_sext_i32_i8 s5, tba_hi
+// CHECK: [0x6d,0x19,0x85,0xbe]
 
-s_sext_i32_i8 s0, m0
-// CHECK: [0x7c,0x19,0x80,0xbe]
+s_sext_i32_i8 s5, tma_lo
+// CHECK: [0x6e,0x19,0x85,0xbe]
 
-s_sext_i32_i8 s0, exec_lo
-// CHECK: [0x7e,0x19,0x80,0xbe]
+s_sext_i32_i8 s5, tma_hi
+// CHECK: [0x6f,0x19,0x85,0xbe]
 
-s_sext_i32_i8 s0, exec_hi
-// CHECK: [0x7f,0x19,0x80,0xbe]
+s_sext_i32_i8 s5, ttmp11
+// CHECK: [0x7b,0x19,0x85,0xbe]
 
-s_sext_i32_i8 s0, 0
-// CHECK: [0x80,0x19,0x80,0xbe]
+s_sext_i32_i8 s5, m0
+// CHECK: [0x7c,0x19,0x85,0xbe]
 
-s_sext_i32_i8 s0, -1
-// CHECK: [0xc1,0x19,0x80,0xbe]
+s_sext_i32_i8 s5, exec_lo
+// CHECK: [0x7e,0x19,0x85,0xbe]
 
-s_sext_i32_i8 s0, 0.5
-// CHECK: [0xf0,0x19,0x80,0xbe]
+s_sext_i32_i8 s5, exec_hi
+// CHECK: [0x7f,0x19,0x85,0xbe]
 
-s_sext_i32_i8 s0, -4.0
-// CHECK: [0xf7,0x19,0x80,0xbe]
+s_sext_i32_i8 s5, 0
+// CHECK: [0x80,0x19,0x85,0xbe]
 
-s_sext_i32_i8 s0, 0x71
-// CHECK: [0xff,0x19,0x80,0xbe,0x71,0x00,0x00,0x00]
+s_sext_i32_i8 s5, -1
+// CHECK: [0xc1,0x19,0x85,0xbe]
 
-s_sext_i32_i8 s0, 0xf0
-// CHECK: [0xff,0x19,0x80,0xbe,0xf0,0x00,0x00,0x00]
+s_sext_i32_i8 s5, 0x71
+// CHECK: [0xff,0x19,0x85,0xbe,0x71,0x00,0x00,0x00]
 
-s_sext_i32_i16 s0, s0
-// CHECK: [0x00,0x1a,0x80,0xbe]
+s_sext_i32_i8 s5, 0xf0
+// CHECK: [0xff,0x19,0x85,0xbe,0xf0,0x00,0x00,0x00]
 
-s_sext_i32_i16 s103, s0
-// CHECK: [0x00,0x1a,0xe7,0xbe]
+s_sext_i32_i16 s5, s1
+// CHECK: [0x01,0x1a,0x85,0xbe]
 
-s_sext_i32_i16 flat_scratch_lo, s0
-// CHECK: [0x00,0x1a,0xe8,0xbe]
+s_sext_i32_i16 s103, s1
+// CHECK: [0x01,0x1a,0xe7,0xbe]
 
-s_sext_i32_i16 flat_scratch_hi, s0
-// CHECK: [0x00,0x1a,0xe9,0xbe]
+s_sext_i32_i16 flat_scratch_lo, s1
+// CHECK: [0x01,0x1a,0xe8,0xbe]
 
-s_sext_i32_i16 vcc_lo, s0
-// CHECK: [0x00,0x1a,0xea,0xbe]
+s_sext_i32_i16 flat_scratch_hi, s1
+// CHECK: [0x01,0x1a,0xe9,0xbe]
 
-s_sext_i32_i16 vcc_hi, s0
-// CHECK: [0x00,0x1a,0xeb,0xbe]
+s_sext_i32_i16 vcc_lo, s1
+// CHECK: [0x01,0x1a,0xea,0xbe]
 
-s_sext_i32_i16 tba_lo, s0
-// CHECK: [0x00,0x1a,0xec,0xbe]
+s_sext_i32_i16 vcc_hi, s1
+// CHECK: [0x01,0x1a,0xeb,0xbe]
 
-s_sext_i32_i16 tba_hi, s0
-// CHECK: [0x00,0x1a,0xed,0xbe]
+s_sext_i32_i16 tba_lo, s1
+// CHECK: [0x01,0x1a,0xec,0xbe]
 
-s_sext_i32_i16 tma_lo, s0
-// CHECK: [0x00,0x1a,0xee,0xbe]
+s_sext_i32_i16 tba_hi, s1
+// CHECK: [0x01,0x1a,0xed,0xbe]
 
-s_sext_i32_i16 tma_hi, s0
-// CHECK: [0x00,0x1a,0xef,0xbe]
+s_sext_i32_i16 tma_lo, s1
+// CHECK: [0x01,0x1a,0xee,0xbe]
 
-s_sext_i32_i16 ttmp11, s0
-// CHECK: [0x00,0x1a,0xfb,0xbe]
+s_sext_i32_i16 tma_hi, s1
+// CHECK: [0x01,0x1a,0xef,0xbe]
 
-s_sext_i32_i16 m0, s0
-// CHECK: [0x00,0x1a,0xfc,0xbe]
+s_sext_i32_i16 ttmp11, s1
+// CHECK: [0x01,0x1a,0xfb,0xbe]
 
-s_sext_i32_i16 exec_lo, s0
-// CHECK: [0x00,0x1a,0xfe,0xbe]
+s_sext_i32_i16 m0, s1
+// CHECK: [0x01,0x1a,0xfc,0xbe]
 
-s_sext_i32_i16 exec_hi, s0
-// CHECK: [0x00,0x1a,0xff,0xbe]
+s_sext_i32_i16 exec_lo, s1
+// CHECK: [0x01,0x1a,0xfe,0xbe]
 
-s_sext_i32_i16 s0, s103
-// CHECK: [0x67,0x1a,0x80,0xbe]
+s_sext_i32_i16 exec_hi, s1
+// CHECK: [0x01,0x1a,0xff,0xbe]
 
-s_sext_i32_i16 s0, flat_scratch_lo
-// CHECK: [0x68,0x1a,0x80,0xbe]
+s_sext_i32_i16 s5, s103
+// CHECK: [0x67,0x1a,0x85,0xbe]
 
-s_sext_i32_i16 s0, flat_scratch_hi
-// CHECK: [0x69,0x1a,0x80,0xbe]
+s_sext_i32_i16 s5, flat_scratch_lo
+// CHECK: [0x68,0x1a,0x85,0xbe]
 
-s_sext_i32_i16 s0, vcc_lo
-// CHECK: [0x6a,0x1a,0x80,0xbe]
+s_sext_i32_i16 s5, flat_scratch_hi
+// CHECK: [0x69,0x1a,0x85,0xbe]
 
-s_sext_i32_i16 s0, vcc_hi
-// CHECK: [0x6b,0x1a,0x80,0xbe]
+s_sext_i32_i16 s5, vcc_lo
+// CHECK: [0x6a,0x1a,0x85,0xbe]
 
-s_sext_i32_i16 s0, tba_lo
-// CHECK: [0x6c,0x1a,0x80,0xbe]
+s_sext_i32_i16 s5, vcc_hi
+// CHECK: [0x6b,0x1a,0x85,0xbe]
 
-s_sext_i32_i16 s0, tba_hi
-// CHECK: [0x6d,0x1a,0x80,0xbe]
+s_sext_i32_i16 s5, tba_lo
+// CHECK: [0x6c,0x1a,0x85,0xbe]
 
-s_sext_i32_i16 s0, tma_lo
-// CHECK: [0x6e,0x1a,0x80,0xbe]
+s_sext_i32_i16 s5, tba_hi
+// CHECK: [0x6d,0x1a,0x85,0xbe]
 
-s_sext_i32_i16 s0, tma_hi
-// CHECK: [0x6f,0x1a,0x80,0xbe]
+s_sext_i32_i16 s5, tma_lo
+// CHECK: [0x6e,0x1a,0x85,0xbe]
 
-s_sext_i32_i16 s0, ttmp11
-// CHECK: [0x7b,0x1a,0x80,0xbe]
+s_sext_i32_i16 s5, tma_hi
+// CHECK: [0x6f,0x1a,0x85,0xbe]
 
-s_sext_i32_i16 s0, m0
-// CHECK: [0x7c,0x1a,0x80,0xbe]
+s_sext_i32_i16 s5, ttmp11
+// CHECK: [0x7b,0x1a,0x85,0xbe]
 
-s_sext_i32_i16 s0, exec_lo
-// CHECK: [0x7e,0x1a,0x80,0xbe]
+s_sext_i32_i16 s5, m0
+// CHECK: [0x7c,0x1a,0x85,0xbe]
 
-s_sext_i32_i16 s0, exec_hi
-// CHECK: [0x7f,0x1a,0x80,0xbe]
+s_sext_i32_i16 s5, exec_lo
+// CHECK: [0x7e,0x1a,0x85,0xbe]
 
-s_sext_i32_i16 s0, 0
-// CHECK: [0x80,0x1a,0x80,0xbe]
+s_sext_i32_i16 s5, exec_hi
+// CHECK: [0x7f,0x1a,0x85,0xbe]
 
-s_sext_i32_i16 s0, -1
-// CHECK: [0xc1,0x1a,0x80,0xbe]
+s_sext_i32_i16 s5, 0
+// CHECK: [0x80,0x1a,0x85,0xbe]
 
-s_sext_i32_i16 s0, 0.5
-// CHECK: [0xf0,0x1a,0x80,0xbe]
+s_sext_i32_i16 s5, -1
+// CHECK: [0xc1,0x1a,0x85,0xbe]
 
-s_sext_i32_i16 s0, -4.0
-// CHECK: [0xf7,0x1a,0x80,0xbe]
+s_sext_i32_i16 s5, 0xfe0b
+// CHECK: [0xff,0x1a,0x85,0xbe,0x0b,0xfe,0x00,0x00]
 
-s_sext_i32_i16 s0, 0xfe0b
-// CHECK: [0xff,0x1a,0x80,0xbe,0x0b,0xfe,0x00,0x00]
+s_sext_i32_i16 s5, 0x3456
+// CHECK: [0xff,0x1a,0x85,0xbe,0x56,0x34,0x00,0x00]
 
-s_sext_i32_i16 s0, 0x3456
-// CHECK: [0xff,0x1a,0x80,0xbe,0x56,0x34,0x00,0x00]
+s_bitset0_b32 s5, s1
+// CHECK: [0x01,0x1b,0x85,0xbe]
 
-s_bitset0_b32 s0, s0
-// CHECK: [0x00,0x1b,0x80,0xbe]
+s_bitset0_b32 s103, s1
+// CHECK: [0x01,0x1b,0xe7,0xbe]
 
-s_bitset0_b32 s103, s0
-// CHECK: [0x00,0x1b,0xe7,0xbe]
+s_bitset0_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x1b,0xe8,0xbe]
 
-s_bitset0_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x1b,0xe8,0xbe]
+s_bitset0_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x1b,0xe9,0xbe]
 
-s_bitset0_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x1b,0xe9,0xbe]
+s_bitset0_b32 vcc_lo, s1
+// CHECK: [0x01,0x1b,0xea,0xbe]
 
-s_bitset0_b32 vcc_lo, s0
-// CHECK: [0x00,0x1b,0xea,0xbe]
+s_bitset0_b32 vcc_hi, s1
+// CHECK: [0x01,0x1b,0xeb,0xbe]
 
-s_bitset0_b32 vcc_hi, s0
-// CHECK: [0x00,0x1b,0xeb,0xbe]
+s_bitset0_b32 tba_lo, s1
+// CHECK: [0x01,0x1b,0xec,0xbe]
 
-s_bitset0_b32 tba_lo, s0
-// CHECK: [0x00,0x1b,0xec,0xbe]
+s_bitset0_b32 tba_hi, s1
+// CHECK: [0x01,0x1b,0xed,0xbe]
 
-s_bitset0_b32 tba_hi, s0
-// CHECK: [0x00,0x1b,0xed,0xbe]
+s_bitset0_b32 tma_lo, s1
+// CHECK: [0x01,0x1b,0xee,0xbe]
 
-s_bitset0_b32 tma_lo, s0
-// CHECK: [0x00,0x1b,0xee,0xbe]
+s_bitset0_b32 tma_hi, s1
+// CHECK: [0x01,0x1b,0xef,0xbe]
 
-s_bitset0_b32 tma_hi, s0
-// CHECK: [0x00,0x1b,0xef,0xbe]
+s_bitset0_b32 ttmp11, s1
+// CHECK: [0x01,0x1b,0xfb,0xbe]
 
-s_bitset0_b32 ttmp11, s0
-// CHECK: [0x00,0x1b,0xfb,0xbe]
+s_bitset0_b32 m0, s1
+// CHECK: [0x01,0x1b,0xfc,0xbe]
 
-s_bitset0_b32 m0, s0
-// CHECK: [0x00,0x1b,0xfc,0xbe]
+s_bitset0_b32 exec_lo, s1
+// CHECK: [0x01,0x1b,0xfe,0xbe]
 
-s_bitset0_b32 exec_lo, s0
-// CHECK: [0x00,0x1b,0xfe,0xbe]
+s_bitset0_b32 exec_hi, s1
+// CHECK: [0x01,0x1b,0xff,0xbe]
 
-s_bitset0_b32 exec_hi, s0
-// CHECK: [0x00,0x1b,0xff,0xbe]
+s_bitset0_b32 s5, s103
+// CHECK: [0x67,0x1b,0x85,0xbe]
 
-s_bitset0_b32 s0, s103
-// CHECK: [0x67,0x1b,0x80,0xbe]
+s_bitset0_b32 s5, flat_scratch_lo
+// CHECK: [0x68,0x1b,0x85,0xbe]
 
-s_bitset0_b32 s0, flat_scratch_lo
-// CHECK: [0x68,0x1b,0x80,0xbe]
+s_bitset0_b32 s5, flat_scratch_hi
+// CHECK: [0x69,0x1b,0x85,0xbe]
 
-s_bitset0_b32 s0, flat_scratch_hi
-// CHECK: [0x69,0x1b,0x80,0xbe]
+s_bitset0_b32 s5, vcc_lo
+// CHECK: [0x6a,0x1b,0x85,0xbe]
 
-s_bitset0_b32 s0, vcc_lo
-// CHECK: [0x6a,0x1b,0x80,0xbe]
+s_bitset0_b32 s5, vcc_hi
+// CHECK: [0x6b,0x1b,0x85,0xbe]
 
-s_bitset0_b32 s0, vcc_hi
-// CHECK: [0x6b,0x1b,0x80,0xbe]
+s_bitset0_b32 s5, tba_lo
+// CHECK: [0x6c,0x1b,0x85,0xbe]
 
-s_bitset0_b32 s0, tba_lo
-// CHECK: [0x6c,0x1b,0x80,0xbe]
+s_bitset0_b32 s5, tba_hi
+// CHECK: [0x6d,0x1b,0x85,0xbe]
 
-s_bitset0_b32 s0, tba_hi
-// CHECK: [0x6d,0x1b,0x80,0xbe]
+s_bitset0_b32 s5, tma_lo
+// CHECK: [0x6e,0x1b,0x85,0xbe]
 
-s_bitset0_b32 s0, tma_lo
-// CHECK: [0x6e,0x1b,0x80,0xbe]
+s_bitset0_b32 s5, tma_hi
+// CHECK: [0x6f,0x1b,0x85,0xbe]
 
-s_bitset0_b32 s0, tma_hi
-// CHECK: [0x6f,0x1b,0x80,0xbe]
+s_bitset0_b32 s5, ttmp11
+// CHECK: [0x7b,0x1b,0x85,0xbe]
 
-s_bitset0_b32 s0, ttmp11
-// CHECK: [0x7b,0x1b,0x80,0xbe]
+s_bitset0_b32 s5, m0
+// CHECK: [0x7c,0x1b,0x85,0xbe]
 
-s_bitset0_b32 s0, m0
-// CHECK: [0x7c,0x1b,0x80,0xbe]
+s_bitset0_b32 s5, exec_lo
+// CHECK: [0x7e,0x1b,0x85,0xbe]
 
-s_bitset0_b32 s0, exec_lo
-// CHECK: [0x7e,0x1b,0x80,0xbe]
+s_bitset0_b32 s5, exec_hi
+// CHECK: [0x7f,0x1b,0x85,0xbe]
 
-s_bitset0_b32 s0, exec_hi
-// CHECK: [0x7f,0x1b,0x80,0xbe]
+s_bitset0_b32 s5, 0
+// CHECK: [0x80,0x1b,0x85,0xbe]
 
-s_bitset0_b32 s0, 0
-// CHECK: [0x80,0x1b,0x80,0xbe]
+s_bitset0_b32 s5, -1
+// CHECK: [0xc1,0x1b,0x85,0xbe]
 
-s_bitset0_b32 s0, -1
-// CHECK: [0xc1,0x1b,0x80,0xbe]
+s_bitset0_b32 s5, 0.5
+// CHECK: [0xf0,0x1b,0x85,0xbe]
 
-s_bitset0_b32 s0, 0.5
-// CHECK: [0xf0,0x1b,0x80,0xbe]
+s_bitset0_b32 s5, -4.0
+// CHECK: [0xf7,0x1b,0x85,0xbe]
 
-s_bitset0_b32 s0, -4.0
-// CHECK: [0xf7,0x1b,0x80,0xbe]
+s_bitset0_b32 s5, 0xaf123456
+// CHECK: [0xff,0x1b,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_bitset0_b32 s0, 0xaf123456
-// CHECK: [0xff,0x1b,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_bitset0_b32 s5, 0x3f717273
+// CHECK: [0xff,0x1b,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_bitset0_b32 s0, 0x3f717273
-// CHECK: [0xff,0x1b,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_bitset0_b64 s[10:11], s1
+// CHECK: [0x01,0x1c,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], s0
-// CHECK: [0x00,0x1c,0x80,0xbe]
+s_bitset0_b64 s[12:13], s1
+// CHECK: [0x01,0x1c,0x8c,0xbe]
 
-s_bitset0_b64 s[2:3], s0
-// CHECK: [0x00,0x1c,0x82,0xbe]
+s_bitset0_b64 s[102:103], s1
+// CHECK: [0x01,0x1c,0xe6,0xbe]
 
-s_bitset0_b64 s[102:103], s0
-// CHECK: [0x00,0x1c,0xe6,0xbe]
+s_bitset0_b64 flat_scratch, s1
+// CHECK: [0x01,0x1c,0xe8,0xbe]
 
-s_bitset0_b64 flat_scratch, s0
-// CHECK: [0x00,0x1c,0xe8,0xbe]
+s_bitset0_b64 vcc, s1
+// CHECK: [0x01,0x1c,0xea,0xbe]
 
-s_bitset0_b64 vcc, s0
-// CHECK: [0x00,0x1c,0xea,0xbe]
+s_bitset0_b64 tba, s1
+// CHECK: [0x01,0x1c,0xec,0xbe]
 
-s_bitset0_b64 tba, s0
-// CHECK: [0x00,0x1c,0xec,0xbe]
+s_bitset0_b64 tma, s1
+// CHECK: [0x01,0x1c,0xee,0xbe]
 
-s_bitset0_b64 tma, s0
-// CHECK: [0x00,0x1c,0xee,0xbe]
+s_bitset0_b64 ttmp[10:11], s1
+// CHECK: [0x01,0x1c,0xfa,0xbe]
 
-s_bitset0_b64 ttmp[10:11], s0
-// CHECK: [0x00,0x1c,0xfa,0xbe]
+s_bitset0_b64 exec, s1
+// CHECK: [0x01,0x1c,0xfe,0xbe]
 
-s_bitset0_b64 exec, s0
-// CHECK: [0x00,0x1c,0xfe,0xbe]
+s_bitset0_b64 s[10:11], s103
+// CHECK: [0x67,0x1c,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], s103
-// CHECK: [0x67,0x1c,0x80,0xbe]
+s_bitset0_b64 s[10:11], flat_scratch_lo
+// CHECK: [0x68,0x1c,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], flat_scratch_lo
-// CHECK: [0x68,0x1c,0x80,0xbe]
+s_bitset0_b64 s[10:11], flat_scratch_hi
+// CHECK: [0x69,0x1c,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], flat_scratch_hi
-// CHECK: [0x69,0x1c,0x80,0xbe]
+s_bitset0_b64 s[10:11], vcc_lo
+// CHECK: [0x6a,0x1c,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], vcc_lo
-// CHECK: [0x6a,0x1c,0x80,0xbe]
+s_bitset0_b64 s[10:11], vcc_hi
+// CHECK: [0x6b,0x1c,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], vcc_hi
-// CHECK: [0x6b,0x1c,0x80,0xbe]
+s_bitset0_b64 s[10:11], tba_lo
+// CHECK: [0x6c,0x1c,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], tba_lo
-// CHECK: [0x6c,0x1c,0x80,0xbe]
+s_bitset0_b64 s[10:11], tba_hi
+// CHECK: [0x6d,0x1c,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], tba_hi
-// CHECK: [0x6d,0x1c,0x80,0xbe]
+s_bitset0_b64 s[10:11], tma_lo
+// CHECK: [0x6e,0x1c,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], tma_lo
-// CHECK: [0x6e,0x1c,0x80,0xbe]
+s_bitset0_b64 s[10:11], tma_hi
+// CHECK: [0x6f,0x1c,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], tma_hi
-// CHECK: [0x6f,0x1c,0x80,0xbe]
+s_bitset0_b64 s[10:11], ttmp11
+// CHECK: [0x7b,0x1c,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], ttmp11
-// CHECK: [0x7b,0x1c,0x80,0xbe]
+s_bitset0_b64 s[10:11], m0
+// CHECK: [0x7c,0x1c,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], m0
-// CHECK: [0x7c,0x1c,0x80,0xbe]
+s_bitset0_b64 s[10:11], exec_lo
+// CHECK: [0x7e,0x1c,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], exec_lo
-// CHECK: [0x7e,0x1c,0x80,0xbe]
+s_bitset0_b64 s[10:11], exec_hi
+// CHECK: [0x7f,0x1c,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], exec_hi
-// CHECK: [0x7f,0x1c,0x80,0xbe]
+s_bitset0_b64 s[10:11], 0
+// CHECK: [0x80,0x1c,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], 0
-// CHECK: [0x80,0x1c,0x80,0xbe]
+s_bitset0_b64 s[10:11], -1
+// CHECK: [0xc1,0x1c,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], -1
-// CHECK: [0xc1,0x1c,0x80,0xbe]
+s_bitset0_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x1c,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x1c,0x80,0xbe]
+s_bitset0_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x1c,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x1c,0x80,0xbe]
+s_bitset0_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x1c,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_bitset0_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x1c,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_bitset0_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x1c,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_bitset0_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x1c,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_bitset1_b32 s5, s1
+// CHECK: [0x01,0x1d,0x85,0xbe]
 
-s_bitset1_b32 s0, s0
-// CHECK: [0x00,0x1d,0x80,0xbe]
+s_bitset1_b32 s103, s1
+// CHECK: [0x01,0x1d,0xe7,0xbe]
 
-s_bitset1_b32 s103, s0
-// CHECK: [0x00,0x1d,0xe7,0xbe]
+s_bitset1_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x1d,0xe8,0xbe]
 
-s_bitset1_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x1d,0xe8,0xbe]
+s_bitset1_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x1d,0xe9,0xbe]
 
-s_bitset1_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x1d,0xe9,0xbe]
+s_bitset1_b32 vcc_lo, s1
+// CHECK: [0x01,0x1d,0xea,0xbe]
 
-s_bitset1_b32 vcc_lo, s0
-// CHECK: [0x00,0x1d,0xea,0xbe]
+s_bitset1_b32 vcc_hi, s1
+// CHECK: [0x01,0x1d,0xeb,0xbe]
 
-s_bitset1_b32 vcc_hi, s0
-// CHECK: [0x00,0x1d,0xeb,0xbe]
+s_bitset1_b32 tba_lo, s1
+// CHECK: [0x01,0x1d,0xec,0xbe]
 
-s_bitset1_b32 tba_lo, s0
-// CHECK: [0x00,0x1d,0xec,0xbe]
+s_bitset1_b32 tba_hi, s1
+// CHECK: [0x01,0x1d,0xed,0xbe]
 
-s_bitset1_b32 tba_hi, s0
-// CHECK: [0x00,0x1d,0xed,0xbe]
+s_bitset1_b32 tma_lo, s1
+// CHECK: [0x01,0x1d,0xee,0xbe]
 
-s_bitset1_b32 tma_lo, s0
-// CHECK: [0x00,0x1d,0xee,0xbe]
+s_bitset1_b32 tma_hi, s1
+// CHECK: [0x01,0x1d,0xef,0xbe]
 
-s_bitset1_b32 tma_hi, s0
-// CHECK: [0x00,0x1d,0xef,0xbe]
+s_bitset1_b32 ttmp11, s1
+// CHECK: [0x01,0x1d,0xfb,0xbe]
 
-s_bitset1_b32 ttmp11, s0
-// CHECK: [0x00,0x1d,0xfb,0xbe]
+s_bitset1_b32 m0, s1
+// CHECK: [0x01,0x1d,0xfc,0xbe]
 
-s_bitset1_b32 m0, s0
-// CHECK: [0x00,0x1d,0xfc,0xbe]
+s_bitset1_b32 exec_lo, s1
+// CHECK: [0x01,0x1d,0xfe,0xbe]
 
-s_bitset1_b32 exec_lo, s0
-// CHECK: [0x00,0x1d,0xfe,0xbe]
+s_bitset1_b32 exec_hi, s1
+// CHECK: [0x01,0x1d,0xff,0xbe]
 
-s_bitset1_b32 exec_hi, s0
-// CHECK: [0x00,0x1d,0xff,0xbe]
+s_bitset1_b32 s5, s103
+// CHECK: [0x67,0x1d,0x85,0xbe]
 
-s_bitset1_b32 s0, s103
-// CHECK: [0x67,0x1d,0x80,0xbe]
+s_bitset1_b32 s5, flat_scratch_lo
+// CHECK: [0x68,0x1d,0x85,0xbe]
 
-s_bitset1_b32 s0, flat_scratch_lo
-// CHECK: [0x68,0x1d,0x80,0xbe]
+s_bitset1_b32 s5, flat_scratch_hi
+// CHECK: [0x69,0x1d,0x85,0xbe]
 
-s_bitset1_b32 s0, flat_scratch_hi
-// CHECK: [0x69,0x1d,0x80,0xbe]
+s_bitset1_b32 s5, vcc_lo
+// CHECK: [0x6a,0x1d,0x85,0xbe]
 
-s_bitset1_b32 s0, vcc_lo
-// CHECK: [0x6a,0x1d,0x80,0xbe]
+s_bitset1_b32 s5, vcc_hi
+// CHECK: [0x6b,0x1d,0x85,0xbe]
 
-s_bitset1_b32 s0, vcc_hi
-// CHECK: [0x6b,0x1d,0x80,0xbe]
+s_bitset1_b32 s5, tba_lo
+// CHECK: [0x6c,0x1d,0x85,0xbe]
 
-s_bitset1_b32 s0, tba_lo
-// CHECK: [0x6c,0x1d,0x80,0xbe]
+s_bitset1_b32 s5, tba_hi
+// CHECK: [0x6d,0x1d,0x85,0xbe]
 
-s_bitset1_b32 s0, tba_hi
-// CHECK: [0x6d,0x1d,0x80,0xbe]
+s_bitset1_b32 s5, tma_lo
+// CHECK: [0x6e,0x1d,0x85,0xbe]
 
-s_bitset1_b32 s0, tma_lo
-// CHECK: [0x6e,0x1d,0x80,0xbe]
+s_bitset1_b32 s5, tma_hi
+// CHECK: [0x6f,0x1d,0x85,0xbe]
 
-s_bitset1_b32 s0, tma_hi
-// CHECK: [0x6f,0x1d,0x80,0xbe]
+s_bitset1_b32 s5, ttmp11
+// CHECK: [0x7b,0x1d,0x85,0xbe]
 
-s_bitset1_b32 s0, ttmp11
-// CHECK: [0x7b,0x1d,0x80,0xbe]
+s_bitset1_b32 s5, m0
+// CHECK: [0x7c,0x1d,0x85,0xbe]
 
-s_bitset1_b32 s0, m0
-// CHECK: [0x7c,0x1d,0x80,0xbe]
+s_bitset1_b32 s5, exec_lo
+// CHECK: [0x7e,0x1d,0x85,0xbe]
 
-s_bitset1_b32 s0, exec_lo
-// CHECK: [0x7e,0x1d,0x80,0xbe]
+s_bitset1_b32 s5, exec_hi
+// CHECK: [0x7f,0x1d,0x85,0xbe]
 
-s_bitset1_b32 s0, exec_hi
-// CHECK: [0x7f,0x1d,0x80,0xbe]
+s_bitset1_b32 s5, 0
+// CHECK: [0x80,0x1d,0x85,0xbe]
 
-s_bitset1_b32 s0, 0
-// CHECK: [0x80,0x1d,0x80,0xbe]
+s_bitset1_b32 s5, -1
+// CHECK: [0xc1,0x1d,0x85,0xbe]
 
-s_bitset1_b32 s0, -1
-// CHECK: [0xc1,0x1d,0x80,0xbe]
+s_bitset1_b32 s5, 0.5
+// CHECK: [0xf0,0x1d,0x85,0xbe]
 
-s_bitset1_b32 s0, 0.5
-// CHECK: [0xf0,0x1d,0x80,0xbe]
+s_bitset1_b32 s5, -4.0
+// CHECK: [0xf7,0x1d,0x85,0xbe]
 
-s_bitset1_b32 s0, -4.0
-// CHECK: [0xf7,0x1d,0x80,0xbe]
+s_bitset1_b32 s5, 0xaf123456
+// CHECK: [0xff,0x1d,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_bitset1_b32 s0, 0xaf123456
-// CHECK: [0xff,0x1d,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_bitset1_b32 s5, 0x3f717273
+// CHECK: [0xff,0x1d,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_bitset1_b32 s0, 0x3f717273
-// CHECK: [0xff,0x1d,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_bitset1_b64 s[10:11], s1
+// CHECK: [0x01,0x1e,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], s0
-// CHECK: [0x00,0x1e,0x80,0xbe]
+s_bitset1_b64 s[12:13], s1
+// CHECK: [0x01,0x1e,0x8c,0xbe]
 
-s_bitset1_b64 s[2:3], s0
-// CHECK: [0x00,0x1e,0x82,0xbe]
+s_bitset1_b64 s[102:103], s1
+// CHECK: [0x01,0x1e,0xe6,0xbe]
 
-s_bitset1_b64 s[102:103], s0
-// CHECK: [0x00,0x1e,0xe6,0xbe]
+s_bitset1_b64 flat_scratch, s1
+// CHECK: [0x01,0x1e,0xe8,0xbe]
 
-s_bitset1_b64 flat_scratch, s0
-// CHECK: [0x00,0x1e,0xe8,0xbe]
+s_bitset1_b64 vcc, s1
+// CHECK: [0x01,0x1e,0xea,0xbe]
 
-s_bitset1_b64 vcc, s0
-// CHECK: [0x00,0x1e,0xea,0xbe]
+s_bitset1_b64 tba, s1
+// CHECK: [0x01,0x1e,0xec,0xbe]
 
-s_bitset1_b64 tba, s0
-// CHECK: [0x00,0x1e,0xec,0xbe]
+s_bitset1_b64 tma, s1
+// CHECK: [0x01,0x1e,0xee,0xbe]
 
-s_bitset1_b64 tma, s0
-// CHECK: [0x00,0x1e,0xee,0xbe]
+s_bitset1_b64 ttmp[10:11], s1
+// CHECK: [0x01,0x1e,0xfa,0xbe]
 
-s_bitset1_b64 ttmp[10:11], s0
-// CHECK: [0x00,0x1e,0xfa,0xbe]
+s_bitset1_b64 exec, s1
+// CHECK: [0x01,0x1e,0xfe,0xbe]
 
-s_bitset1_b64 exec, s0
-// CHECK: [0x00,0x1e,0xfe,0xbe]
+s_bitset1_b64 s[10:11], s103
+// CHECK: [0x67,0x1e,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], s103
-// CHECK: [0x67,0x1e,0x80,0xbe]
+s_bitset1_b64 s[10:11], flat_scratch_lo
+// CHECK: [0x68,0x1e,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], flat_scratch_lo
-// CHECK: [0x68,0x1e,0x80,0xbe]
+s_bitset1_b64 s[10:11], flat_scratch_hi
+// CHECK: [0x69,0x1e,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], flat_scratch_hi
-// CHECK: [0x69,0x1e,0x80,0xbe]
+s_bitset1_b64 s[10:11], vcc_lo
+// CHECK: [0x6a,0x1e,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], vcc_lo
-// CHECK: [0x6a,0x1e,0x80,0xbe]
+s_bitset1_b64 s[10:11], vcc_hi
+// CHECK: [0x6b,0x1e,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], vcc_hi
-// CHECK: [0x6b,0x1e,0x80,0xbe]
+s_bitset1_b64 s[10:11], tba_lo
+// CHECK: [0x6c,0x1e,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], tba_lo
-// CHECK: [0x6c,0x1e,0x80,0xbe]
+s_bitset1_b64 s[10:11], tba_hi
+// CHECK: [0x6d,0x1e,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], tba_hi
-// CHECK: [0x6d,0x1e,0x80,0xbe]
+s_bitset1_b64 s[10:11], tma_lo
+// CHECK: [0x6e,0x1e,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], tma_lo
-// CHECK: [0x6e,0x1e,0x80,0xbe]
+s_bitset1_b64 s[10:11], tma_hi
+// CHECK: [0x6f,0x1e,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], tma_hi
-// CHECK: [0x6f,0x1e,0x80,0xbe]
+s_bitset1_b64 s[10:11], ttmp11
+// CHECK: [0x7b,0x1e,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], ttmp11
-// CHECK: [0x7b,0x1e,0x80,0xbe]
+s_bitset1_b64 s[10:11], m0
+// CHECK: [0x7c,0x1e,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], m0
-// CHECK: [0x7c,0x1e,0x80,0xbe]
+s_bitset1_b64 s[10:11], exec_lo
+// CHECK: [0x7e,0x1e,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], exec_lo
-// CHECK: [0x7e,0x1e,0x80,0xbe]
+s_bitset1_b64 s[10:11], exec_hi
+// CHECK: [0x7f,0x1e,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], exec_hi
-// CHECK: [0x7f,0x1e,0x80,0xbe]
+s_bitset1_b64 s[10:11], 0
+// CHECK: [0x80,0x1e,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], 0
-// CHECK: [0x80,0x1e,0x80,0xbe]
+s_bitset1_b64 s[10:11], -1
+// CHECK: [0xc1,0x1e,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], -1
-// CHECK: [0xc1,0x1e,0x80,0xbe]
+s_bitset1_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x1e,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x1e,0x80,0xbe]
+s_bitset1_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x1e,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x1e,0x80,0xbe]
+s_bitset1_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x1e,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_bitset1_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x1e,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_bitset1_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x1e,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_bitset1_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x1e,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_getpc_b64 s[10:11]
+// CHECK: [0x00,0x1f,0x8a,0xbe]
 
-s_getpc_b64 s[0:1]
-// CHECK: [0x00,0x1f,0x80,0xbe]
-
-s_getpc_b64 s[2:3]
-// CHECK: [0x00,0x1f,0x82,0xbe]
+s_getpc_b64 s[12:13]
+// CHECK: [0x00,0x1f,0x8c,0xbe]
 
 s_getpc_b64 s[102:103]
 // CHECK: [0x00,0x1f,0xe6,0xbe]
@@ -12609,12 +12623,12 @@ s_getpc_b64 ttmp[10:11]
 s_getpc_b64 exec
 // CHECK: [0x00,0x1f,0xfe,0xbe]
 
-s_setpc_b64 s[0:1]
-// CHECK: [0x00,0x20,0x80,0xbe]
-
 s_setpc_b64 s[2:3]
 // CHECK: [0x02,0x20,0x80,0xbe]
 
+s_setpc_b64 s[4:5]
+// CHECK: [0x04,0x20,0x80,0xbe]
+
 s_setpc_b64 s[102:103]
 // CHECK: [0x66,0x20,0x80,0xbe]
 
@@ -12633,60 +12647,60 @@ s_setpc_b64 tma
 s_setpc_b64 ttmp[10:11]
 // CHECK: [0x7a,0x20,0x80,0xbe]
 
-s_swappc_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x21,0x80,0xbe]
-
-s_swappc_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x21,0x82,0xbe]
+s_swappc_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x21,0x8a,0xbe]
 
-s_swappc_b64 s[102:103], s[0:1]
-// CHECK: [0x00,0x21,0xe6,0xbe]
+s_swappc_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x21,0x8c,0xbe]
 
-s_swappc_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x21,0xe8,0xbe]
+s_swappc_b64 s[102:103], s[2:3]
+// CHECK: [0x02,0x21,0xe6,0xbe]
 
-s_swappc_b64 vcc, s[0:1]
-// CHECK: [0x00,0x21,0xea,0xbe]
+s_swappc_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x21,0xe8,0xbe]
 
-s_swappc_b64 tba, s[0:1]
-// CHECK: [0x00,0x21,0xec,0xbe]
+s_swappc_b64 vcc, s[2:3]
+// CHECK: [0x02,0x21,0xea,0xbe]
 
-s_swappc_b64 tma, s[0:1]
-// CHECK: [0x00,0x21,0xee,0xbe]
+s_swappc_b64 tba, s[2:3]
+// CHECK: [0x02,0x21,0xec,0xbe]
 
-s_swappc_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x21,0xfa,0xbe]
+s_swappc_b64 tma, s[2:3]
+// CHECK: [0x02,0x21,0xee,0xbe]
 
-s_swappc_b64 exec, s[0:1]
-// CHECK: [0x00,0x21,0xfe,0xbe]
+s_swappc_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x21,0xfa,0xbe]
 
-s_swappc_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x21,0x80,0xbe]
+s_swappc_b64 exec, s[2:3]
+// CHECK: [0x02,0x21,0xfe,0xbe]
 
-s_swappc_b64 s[0:1], s[102:103]
-// CHECK: [0x66,0x21,0x80,0xbe]
+s_swappc_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x21,0x8a,0xbe]
 
-s_swappc_b64 s[0:1], flat_scratch
-// CHECK: [0x68,0x21,0x80,0xbe]
+s_swappc_b64 s[10:11], s[102:103]
+// CHECK: [0x66,0x21,0x8a,0xbe]
 
-s_swappc_b64 s[0:1], vcc
-// CHECK: [0x6a,0x21,0x80,0xbe]
+s_swappc_b64 s[10:11], flat_scratch
+// CHECK: [0x68,0x21,0x8a,0xbe]
 
-s_swappc_b64 s[0:1], tba
-// CHECK: [0x6c,0x21,0x80,0xbe]
+s_swappc_b64 s[10:11], vcc
+// CHECK: [0x6a,0x21,0x8a,0xbe]
 
-s_swappc_b64 s[0:1], tma
-// CHECK: [0x6e,0x21,0x80,0xbe]
+s_swappc_b64 s[10:11], tba
+// CHECK: [0x6c,0x21,0x8a,0xbe]
 
-s_swappc_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x21,0x80,0xbe]
+s_swappc_b64 s[10:11], tma
+// CHECK: [0x6e,0x21,0x8a,0xbe]
 
-s_rfe_b64 s[0:1]
-// CHECK: [0x00,0x22,0x80,0xbe]
+s_swappc_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x21,0x8a,0xbe]
 
 s_rfe_b64 s[2:3]
 // CHECK: [0x02,0x22,0x80,0xbe]
 
+s_rfe_b64 s[4:5]
+// CHECK: [0x04,0x22,0x80,0xbe]
+
 s_rfe_b64 s[102:103]
 // CHECK: [0x66,0x22,0x80,0xbe]
 
@@ -12705,9161 +12719,9161 @@ s_rfe_b64 tma
 s_rfe_b64 ttmp[10:11]
 // CHECK: [0x7a,0x22,0x80,0xbe]
 
-s_and_saveexec_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x24,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x24,0x8a,0xbe]
 
-s_and_saveexec_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x24,0x82,0xbe]
+s_and_saveexec_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x24,0x8c,0xbe]
 
-s_and_saveexec_b64 s[102:103], s[0:1]
-// CHECK: [0x00,0x24,0xe6,0xbe]
+s_and_saveexec_b64 s[102:103], s[2:3]
+// CHECK: [0x02,0x24,0xe6,0xbe]
 
-s_and_saveexec_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x24,0xe8,0xbe]
+s_and_saveexec_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x24,0xe8,0xbe]
 
-s_and_saveexec_b64 vcc, s[0:1]
-// CHECK: [0x00,0x24,0xea,0xbe]
+s_and_saveexec_b64 vcc, s[2:3]
+// CHECK: [0x02,0x24,0xea,0xbe]
 
-s_and_saveexec_b64 tba, s[0:1]
-// CHECK: [0x00,0x24,0xec,0xbe]
+s_and_saveexec_b64 tba, s[2:3]
+// CHECK: [0x02,0x24,0xec,0xbe]
 
-s_and_saveexec_b64 tma, s[0:1]
-// CHECK: [0x00,0x24,0xee,0xbe]
+s_and_saveexec_b64 tma, s[2:3]
+// CHECK: [0x02,0x24,0xee,0xbe]
 
-s_and_saveexec_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x24,0xfa,0xbe]
+s_and_saveexec_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x24,0xfa,0xbe]
 
-s_and_saveexec_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x24,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x24,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], s[102:103]
-// CHECK: [0x66,0x24,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], s[102:103]
+// CHECK: [0x66,0x24,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], flat_scratch
-// CHECK: [0x68,0x24,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], flat_scratch
+// CHECK: [0x68,0x24,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], vcc
-// CHECK: [0x6a,0x24,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], vcc
+// CHECK: [0x6a,0x24,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], tba
-// CHECK: [0x6c,0x24,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], tba
+// CHECK: [0x6c,0x24,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], tma
-// CHECK: [0x6e,0x24,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], tma
+// CHECK: [0x6e,0x24,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x24,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x24,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], exec
-// CHECK: [0x7e,0x24,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], exec
+// CHECK: [0x7e,0x24,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], 0
-// CHECK: [0x80,0x24,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], 0
+// CHECK: [0x80,0x24,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], -1
-// CHECK: [0xc1,0x24,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], -1
+// CHECK: [0xc1,0x24,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x24,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x24,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x24,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x24,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x24,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_and_saveexec_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x24,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_and_saveexec_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x24,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_and_saveexec_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x24,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_or_saveexec_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x25,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x25,0x8a,0xbe]
 
-s_or_saveexec_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x25,0x82,0xbe]
+s_or_saveexec_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x25,0x8c,0xbe]
 
-s_or_saveexec_b64 s[102:103], s[0:1]
-// CHECK: [0x00,0x25,0xe6,0xbe]
+s_or_saveexec_b64 s[102:103], s[2:3]
+// CHECK: [0x02,0x25,0xe6,0xbe]
 
-s_or_saveexec_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x25,0xe8,0xbe]
+s_or_saveexec_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x25,0xe8,0xbe]
 
-s_or_saveexec_b64 vcc, s[0:1]
-// CHECK: [0x00,0x25,0xea,0xbe]
+s_or_saveexec_b64 vcc, s[2:3]
+// CHECK: [0x02,0x25,0xea,0xbe]
 
-s_or_saveexec_b64 tba, s[0:1]
-// CHECK: [0x00,0x25,0xec,0xbe]
+s_or_saveexec_b64 tba, s[2:3]
+// CHECK: [0x02,0x25,0xec,0xbe]
 
-s_or_saveexec_b64 tma, s[0:1]
-// CHECK: [0x00,0x25,0xee,0xbe]
+s_or_saveexec_b64 tma, s[2:3]
+// CHECK: [0x02,0x25,0xee,0xbe]
 
-s_or_saveexec_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x25,0xfa,0xbe]
+s_or_saveexec_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x25,0xfa,0xbe]
 
-s_or_saveexec_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x25,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x25,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], s[102:103]
-// CHECK: [0x66,0x25,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], s[102:103]
+// CHECK: [0x66,0x25,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], flat_scratch
-// CHECK: [0x68,0x25,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], flat_scratch
+// CHECK: [0x68,0x25,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], vcc
-// CHECK: [0x6a,0x25,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], vcc
+// CHECK: [0x6a,0x25,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], tba
-// CHECK: [0x6c,0x25,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], tba
+// CHECK: [0x6c,0x25,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], tma
-// CHECK: [0x6e,0x25,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], tma
+// CHECK: [0x6e,0x25,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x25,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x25,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], exec
-// CHECK: [0x7e,0x25,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], exec
+// CHECK: [0x7e,0x25,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], 0
-// CHECK: [0x80,0x25,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], 0
+// CHECK: [0x80,0x25,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], -1
-// CHECK: [0xc1,0x25,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], -1
+// CHECK: [0xc1,0x25,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x25,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x25,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x25,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x25,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x25,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_or_saveexec_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x25,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_or_saveexec_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x25,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_or_saveexec_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x25,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_xor_saveexec_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x26,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x26,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x26,0x82,0xbe]
+s_xor_saveexec_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x26,0x8c,0xbe]
 
-s_xor_saveexec_b64 s[102:103], s[0:1]
-// CHECK: [0x00,0x26,0xe6,0xbe]
+s_xor_saveexec_b64 s[102:103], s[2:3]
+// CHECK: [0x02,0x26,0xe6,0xbe]
 
-s_xor_saveexec_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x26,0xe8,0xbe]
+s_xor_saveexec_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x26,0xe8,0xbe]
 
-s_xor_saveexec_b64 vcc, s[0:1]
-// CHECK: [0x00,0x26,0xea,0xbe]
+s_xor_saveexec_b64 vcc, s[2:3]
+// CHECK: [0x02,0x26,0xea,0xbe]
 
-s_xor_saveexec_b64 tba, s[0:1]
-// CHECK: [0x00,0x26,0xec,0xbe]
+s_xor_saveexec_b64 tba, s[2:3]
+// CHECK: [0x02,0x26,0xec,0xbe]
 
-s_xor_saveexec_b64 tma, s[0:1]
-// CHECK: [0x00,0x26,0xee,0xbe]
+s_xor_saveexec_b64 tma, s[2:3]
+// CHECK: [0x02,0x26,0xee,0xbe]
 
-s_xor_saveexec_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x26,0xfa,0xbe]
+s_xor_saveexec_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x26,0xfa,0xbe]
 
-s_xor_saveexec_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x26,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x26,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], s[102:103]
-// CHECK: [0x66,0x26,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], s[102:103]
+// CHECK: [0x66,0x26,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], flat_scratch
-// CHECK: [0x68,0x26,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], flat_scratch
+// CHECK: [0x68,0x26,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], vcc
-// CHECK: [0x6a,0x26,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], vcc
+// CHECK: [0x6a,0x26,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], tba
-// CHECK: [0x6c,0x26,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], tba
+// CHECK: [0x6c,0x26,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], tma
-// CHECK: [0x6e,0x26,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], tma
+// CHECK: [0x6e,0x26,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x26,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x26,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], exec
-// CHECK: [0x7e,0x26,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], exec
+// CHECK: [0x7e,0x26,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], 0
-// CHECK: [0x80,0x26,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], 0
+// CHECK: [0x80,0x26,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], -1
-// CHECK: [0xc1,0x26,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], -1
+// CHECK: [0xc1,0x26,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x26,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x26,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x26,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x26,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x26,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_xor_saveexec_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x26,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_xor_saveexec_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x26,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_xor_saveexec_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x26,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_andn2_saveexec_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x27,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x27,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x27,0x82,0xbe]
+s_andn2_saveexec_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x27,0x8c,0xbe]
 
-s_andn2_saveexec_b64 s[102:103], s[0:1]
-// CHECK: [0x00,0x27,0xe6,0xbe]
+s_andn2_saveexec_b64 s[102:103], s[2:3]
+// CHECK: [0x02,0x27,0xe6,0xbe]
 
-s_andn2_saveexec_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x27,0xe8,0xbe]
+s_andn2_saveexec_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x27,0xe8,0xbe]
 
-s_andn2_saveexec_b64 vcc, s[0:1]
-// CHECK: [0x00,0x27,0xea,0xbe]
+s_andn2_saveexec_b64 vcc, s[2:3]
+// CHECK: [0x02,0x27,0xea,0xbe]
 
-s_andn2_saveexec_b64 tba, s[0:1]
-// CHECK: [0x00,0x27,0xec,0xbe]
+s_andn2_saveexec_b64 tba, s[2:3]
+// CHECK: [0x02,0x27,0xec,0xbe]
 
-s_andn2_saveexec_b64 tma, s[0:1]
-// CHECK: [0x00,0x27,0xee,0xbe]
+s_andn2_saveexec_b64 tma, s[2:3]
+// CHECK: [0x02,0x27,0xee,0xbe]
 
-s_andn2_saveexec_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x27,0xfa,0xbe]
+s_andn2_saveexec_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x27,0xfa,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x27,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x27,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], s[102:103]
-// CHECK: [0x66,0x27,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], s[102:103]
+// CHECK: [0x66,0x27,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], flat_scratch
-// CHECK: [0x68,0x27,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], flat_scratch
+// CHECK: [0x68,0x27,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], vcc
-// CHECK: [0x6a,0x27,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], vcc
+// CHECK: [0x6a,0x27,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], tba
-// CHECK: [0x6c,0x27,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], tba
+// CHECK: [0x6c,0x27,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], tma
-// CHECK: [0x6e,0x27,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], tma
+// CHECK: [0x6e,0x27,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x27,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x27,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], exec
-// CHECK: [0x7e,0x27,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], exec
+// CHECK: [0x7e,0x27,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], 0
-// CHECK: [0x80,0x27,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], 0
+// CHECK: [0x80,0x27,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], -1
-// CHECK: [0xc1,0x27,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], -1
+// CHECK: [0xc1,0x27,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x27,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x27,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x27,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x27,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x27,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_andn2_saveexec_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x27,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_andn2_saveexec_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x27,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_andn2_saveexec_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x27,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_orn2_saveexec_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x28,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x28,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x28,0x82,0xbe]
+s_orn2_saveexec_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x28,0x8c,0xbe]
 
-s_orn2_saveexec_b64 s[102:103], s[0:1]
-// CHECK: [0x00,0x28,0xe6,0xbe]
+s_orn2_saveexec_b64 s[102:103], s[2:3]
+// CHECK: [0x02,0x28,0xe6,0xbe]
 
-s_orn2_saveexec_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x28,0xe8,0xbe]
+s_orn2_saveexec_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x28,0xe8,0xbe]
 
-s_orn2_saveexec_b64 vcc, s[0:1]
-// CHECK: [0x00,0x28,0xea,0xbe]
+s_orn2_saveexec_b64 vcc, s[2:3]
+// CHECK: [0x02,0x28,0xea,0xbe]
 
-s_orn2_saveexec_b64 tba, s[0:1]
-// CHECK: [0x00,0x28,0xec,0xbe]
+s_orn2_saveexec_b64 tba, s[2:3]
+// CHECK: [0x02,0x28,0xec,0xbe]
 
-s_orn2_saveexec_b64 tma, s[0:1]
-// CHECK: [0x00,0x28,0xee,0xbe]
+s_orn2_saveexec_b64 tma, s[2:3]
+// CHECK: [0x02,0x28,0xee,0xbe]
 
-s_orn2_saveexec_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x28,0xfa,0xbe]
+s_orn2_saveexec_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x28,0xfa,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x28,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x28,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], s[102:103]
-// CHECK: [0x66,0x28,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], s[102:103]
+// CHECK: [0x66,0x28,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], flat_scratch
-// CHECK: [0x68,0x28,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], flat_scratch
+// CHECK: [0x68,0x28,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], vcc
-// CHECK: [0x6a,0x28,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], vcc
+// CHECK: [0x6a,0x28,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], tba
-// CHECK: [0x6c,0x28,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], tba
+// CHECK: [0x6c,0x28,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], tma
-// CHECK: [0x6e,0x28,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], tma
+// CHECK: [0x6e,0x28,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x28,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x28,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], exec
-// CHECK: [0x7e,0x28,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], exec
+// CHECK: [0x7e,0x28,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], 0
-// CHECK: [0x80,0x28,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], 0
+// CHECK: [0x80,0x28,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], -1
-// CHECK: [0xc1,0x28,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], -1
+// CHECK: [0xc1,0x28,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x28,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x28,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x28,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x28,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x28,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_orn2_saveexec_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x28,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_orn2_saveexec_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x28,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_orn2_saveexec_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x28,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_nand_saveexec_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x29,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x29,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x29,0x82,0xbe]
+s_nand_saveexec_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x29,0x8c,0xbe]
 
-s_nand_saveexec_b64 s[102:103], s[0:1]
-// CHECK: [0x00,0x29,0xe6,0xbe]
+s_nand_saveexec_b64 s[102:103], s[2:3]
+// CHECK: [0x02,0x29,0xe6,0xbe]
 
-s_nand_saveexec_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x29,0xe8,0xbe]
+s_nand_saveexec_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x29,0xe8,0xbe]
 
-s_nand_saveexec_b64 vcc, s[0:1]
-// CHECK: [0x00,0x29,0xea,0xbe]
+s_nand_saveexec_b64 vcc, s[2:3]
+// CHECK: [0x02,0x29,0xea,0xbe]
 
-s_nand_saveexec_b64 tba, s[0:1]
-// CHECK: [0x00,0x29,0xec,0xbe]
+s_nand_saveexec_b64 tba, s[2:3]
+// CHECK: [0x02,0x29,0xec,0xbe]
 
-s_nand_saveexec_b64 tma, s[0:1]
-// CHECK: [0x00,0x29,0xee,0xbe]
+s_nand_saveexec_b64 tma, s[2:3]
+// CHECK: [0x02,0x29,0xee,0xbe]
 
-s_nand_saveexec_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x29,0xfa,0xbe]
+s_nand_saveexec_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x29,0xfa,0xbe]
 
-s_nand_saveexec_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x29,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x29,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], s[102:103]
-// CHECK: [0x66,0x29,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], s[102:103]
+// CHECK: [0x66,0x29,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], flat_scratch
-// CHECK: [0x68,0x29,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], flat_scratch
+// CHECK: [0x68,0x29,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], vcc
-// CHECK: [0x6a,0x29,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], vcc
+// CHECK: [0x6a,0x29,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], tba
-// CHECK: [0x6c,0x29,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], tba
+// CHECK: [0x6c,0x29,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], tma
-// CHECK: [0x6e,0x29,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], tma
+// CHECK: [0x6e,0x29,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x29,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x29,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], exec
-// CHECK: [0x7e,0x29,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], exec
+// CHECK: [0x7e,0x29,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], 0
-// CHECK: [0x80,0x29,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], 0
+// CHECK: [0x80,0x29,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], -1
-// CHECK: [0xc1,0x29,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], -1
+// CHECK: [0xc1,0x29,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x29,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x29,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x29,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x29,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x29,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_nand_saveexec_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x29,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_nand_saveexec_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x29,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_nand_saveexec_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x29,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_nor_saveexec_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x2a,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x2a,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x2a,0x82,0xbe]
+s_nor_saveexec_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x2a,0x8c,0xbe]
 
-s_nor_saveexec_b64 s[102:103], s[0:1]
-// CHECK: [0x00,0x2a,0xe6,0xbe]
+s_nor_saveexec_b64 s[102:103], s[2:3]
+// CHECK: [0x02,0x2a,0xe6,0xbe]
 
-s_nor_saveexec_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x2a,0xe8,0xbe]
+s_nor_saveexec_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x2a,0xe8,0xbe]
 
-s_nor_saveexec_b64 vcc, s[0:1]
-// CHECK: [0x00,0x2a,0xea,0xbe]
+s_nor_saveexec_b64 vcc, s[2:3]
+// CHECK: [0x02,0x2a,0xea,0xbe]
 
-s_nor_saveexec_b64 tba, s[0:1]
-// CHECK: [0x00,0x2a,0xec,0xbe]
+s_nor_saveexec_b64 tba, s[2:3]
+// CHECK: [0x02,0x2a,0xec,0xbe]
 
-s_nor_saveexec_b64 tma, s[0:1]
-// CHECK: [0x00,0x2a,0xee,0xbe]
+s_nor_saveexec_b64 tma, s[2:3]
+// CHECK: [0x02,0x2a,0xee,0xbe]
 
-s_nor_saveexec_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x2a,0xfa,0xbe]
+s_nor_saveexec_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x2a,0xfa,0xbe]
 
-s_nor_saveexec_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x2a,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x2a,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], s[102:103]
-// CHECK: [0x66,0x2a,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], s[102:103]
+// CHECK: [0x66,0x2a,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], flat_scratch
-// CHECK: [0x68,0x2a,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], flat_scratch
+// CHECK: [0x68,0x2a,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], vcc
-// CHECK: [0x6a,0x2a,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], vcc
+// CHECK: [0x6a,0x2a,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], tba
-// CHECK: [0x6c,0x2a,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], tba
+// CHECK: [0x6c,0x2a,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], tma
-// CHECK: [0x6e,0x2a,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], tma
+// CHECK: [0x6e,0x2a,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x2a,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x2a,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], exec
-// CHECK: [0x7e,0x2a,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], exec
+// CHECK: [0x7e,0x2a,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], 0
-// CHECK: [0x80,0x2a,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], 0
+// CHECK: [0x80,0x2a,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], -1
-// CHECK: [0xc1,0x2a,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], -1
+// CHECK: [0xc1,0x2a,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x2a,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x2a,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x2a,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x2a,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x2a,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_nor_saveexec_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x2a,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_nor_saveexec_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x2a,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_nor_saveexec_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x2a,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_xnor_saveexec_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x2b,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x2b,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x2b,0x82,0xbe]
+s_xnor_saveexec_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x2b,0x8c,0xbe]
 
-s_xnor_saveexec_b64 s[102:103], s[0:1]
-// CHECK: [0x00,0x2b,0xe6,0xbe]
+s_xnor_saveexec_b64 s[102:103], s[2:3]
+// CHECK: [0x02,0x2b,0xe6,0xbe]
 
-s_xnor_saveexec_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x2b,0xe8,0xbe]
+s_xnor_saveexec_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x2b,0xe8,0xbe]
 
-s_xnor_saveexec_b64 vcc, s[0:1]
-// CHECK: [0x00,0x2b,0xea,0xbe]
+s_xnor_saveexec_b64 vcc, s[2:3]
+// CHECK: [0x02,0x2b,0xea,0xbe]
 
-s_xnor_saveexec_b64 tba, s[0:1]
-// CHECK: [0x00,0x2b,0xec,0xbe]
+s_xnor_saveexec_b64 tba, s[2:3]
+// CHECK: [0x02,0x2b,0xec,0xbe]
 
-s_xnor_saveexec_b64 tma, s[0:1]
-// CHECK: [0x00,0x2b,0xee,0xbe]
+s_xnor_saveexec_b64 tma, s[2:3]
+// CHECK: [0x02,0x2b,0xee,0xbe]
 
-s_xnor_saveexec_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x2b,0xfa,0xbe]
+s_xnor_saveexec_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x2b,0xfa,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x2b,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x2b,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], s[102:103]
-// CHECK: [0x66,0x2b,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], s[102:103]
+// CHECK: [0x66,0x2b,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], flat_scratch
-// CHECK: [0x68,0x2b,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], flat_scratch
+// CHECK: [0x68,0x2b,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], vcc
-// CHECK: [0x6a,0x2b,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], vcc
+// CHECK: [0x6a,0x2b,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], tba
-// CHECK: [0x6c,0x2b,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], tba
+// CHECK: [0x6c,0x2b,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], tma
-// CHECK: [0x6e,0x2b,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], tma
+// CHECK: [0x6e,0x2b,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x2b,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x2b,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], exec
-// CHECK: [0x7e,0x2b,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], exec
+// CHECK: [0x7e,0x2b,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], 0
-// CHECK: [0x80,0x2b,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], 0
+// CHECK: [0x80,0x2b,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], -1
-// CHECK: [0xc1,0x2b,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], -1
+// CHECK: [0xc1,0x2b,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x2b,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x2b,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x2b,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x2b,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x2b,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_xnor_saveexec_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x2b,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_xnor_saveexec_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x2b,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_xnor_saveexec_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x2b,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_quadmask_b32 s0, s0
-// CHECK: [0x00,0x2c,0x80,0xbe]
+s_quadmask_b32 s5, s1
+// CHECK: [0x01,0x2c,0x85,0xbe]
 
-s_quadmask_b32 s103, s0
-// CHECK: [0x00,0x2c,0xe7,0xbe]
+s_quadmask_b32 s103, s1
+// CHECK: [0x01,0x2c,0xe7,0xbe]
 
-s_quadmask_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x2c,0xe8,0xbe]
+s_quadmask_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x2c,0xe8,0xbe]
 
-s_quadmask_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x2c,0xe9,0xbe]
+s_quadmask_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x2c,0xe9,0xbe]
 
-s_quadmask_b32 vcc_lo, s0
-// CHECK: [0x00,0x2c,0xea,0xbe]
+s_quadmask_b32 vcc_lo, s1
+// CHECK: [0x01,0x2c,0xea,0xbe]
 
-s_quadmask_b32 vcc_hi, s0
-// CHECK: [0x00,0x2c,0xeb,0xbe]
+s_quadmask_b32 vcc_hi, s1
+// CHECK: [0x01,0x2c,0xeb,0xbe]
 
-s_quadmask_b32 tba_lo, s0
-// CHECK: [0x00,0x2c,0xec,0xbe]
+s_quadmask_b32 tba_lo, s1
+// CHECK: [0x01,0x2c,0xec,0xbe]
 
-s_quadmask_b32 tba_hi, s0
-// CHECK: [0x00,0x2c,0xed,0xbe]
+s_quadmask_b32 tba_hi, s1
+// CHECK: [0x01,0x2c,0xed,0xbe]
 
-s_quadmask_b32 tma_lo, s0
-// CHECK: [0x00,0x2c,0xee,0xbe]
+s_quadmask_b32 tma_lo, s1
+// CHECK: [0x01,0x2c,0xee,0xbe]
 
-s_quadmask_b32 tma_hi, s0
-// CHECK: [0x00,0x2c,0xef,0xbe]
+s_quadmask_b32 tma_hi, s1
+// CHECK: [0x01,0x2c,0xef,0xbe]
 
-s_quadmask_b32 ttmp11, s0
-// CHECK: [0x00,0x2c,0xfb,0xbe]
+s_quadmask_b32 ttmp11, s1
+// CHECK: [0x01,0x2c,0xfb,0xbe]
 
-s_quadmask_b32 m0, s0
-// CHECK: [0x00,0x2c,0xfc,0xbe]
+s_quadmask_b32 m0, s1
+// CHECK: [0x01,0x2c,0xfc,0xbe]
 
-s_quadmask_b32 exec_lo, s0
-// CHECK: [0x00,0x2c,0xfe,0xbe]
+s_quadmask_b32 exec_lo, s1
+// CHECK: [0x01,0x2c,0xfe,0xbe]
 
-s_quadmask_b32 exec_hi, s0
-// CHECK: [0x00,0x2c,0xff,0xbe]
+s_quadmask_b32 exec_hi, s1
+// CHECK: [0x01,0x2c,0xff,0xbe]
 
-s_quadmask_b32 s0, s103
-// CHECK: [0x67,0x2c,0x80,0xbe]
+s_quadmask_b32 s5, s103
+// CHECK: [0x67,0x2c,0x85,0xbe]
 
-s_quadmask_b32 s0, flat_scratch_lo
-// CHECK: [0x68,0x2c,0x80,0xbe]
+s_quadmask_b32 s5, flat_scratch_lo
+// CHECK: [0x68,0x2c,0x85,0xbe]
 
-s_quadmask_b32 s0, flat_scratch_hi
-// CHECK: [0x69,0x2c,0x80,0xbe]
+s_quadmask_b32 s5, flat_scratch_hi
+// CHECK: [0x69,0x2c,0x85,0xbe]
 
-s_quadmask_b32 s0, vcc_lo
-// CHECK: [0x6a,0x2c,0x80,0xbe]
+s_quadmask_b32 s5, vcc_lo
+// CHECK: [0x6a,0x2c,0x85,0xbe]
 
-s_quadmask_b32 s0, vcc_hi
-// CHECK: [0x6b,0x2c,0x80,0xbe]
+s_quadmask_b32 s5, vcc_hi
+// CHECK: [0x6b,0x2c,0x85,0xbe]
 
-s_quadmask_b32 s0, tba_lo
-// CHECK: [0x6c,0x2c,0x80,0xbe]
+s_quadmask_b32 s5, tba_lo
+// CHECK: [0x6c,0x2c,0x85,0xbe]
 
-s_quadmask_b32 s0, tba_hi
-// CHECK: [0x6d,0x2c,0x80,0xbe]
+s_quadmask_b32 s5, tba_hi
+// CHECK: [0x6d,0x2c,0x85,0xbe]
 
-s_quadmask_b32 s0, tma_lo
-// CHECK: [0x6e,0x2c,0x80,0xbe]
+s_quadmask_b32 s5, tma_lo
+// CHECK: [0x6e,0x2c,0x85,0xbe]
 
-s_quadmask_b32 s0, tma_hi
-// CHECK: [0x6f,0x2c,0x80,0xbe]
+s_quadmask_b32 s5, tma_hi
+// CHECK: [0x6f,0x2c,0x85,0xbe]
 
-s_quadmask_b32 s0, ttmp11
-// CHECK: [0x7b,0x2c,0x80,0xbe]
+s_quadmask_b32 s5, ttmp11
+// CHECK: [0x7b,0x2c,0x85,0xbe]
 
-s_quadmask_b32 s0, m0
-// CHECK: [0x7c,0x2c,0x80,0xbe]
+s_quadmask_b32 s5, m0
+// CHECK: [0x7c,0x2c,0x85,0xbe]
 
-s_quadmask_b32 s0, exec_lo
-// CHECK: [0x7e,0x2c,0x80,0xbe]
+s_quadmask_b32 s5, exec_lo
+// CHECK: [0x7e,0x2c,0x85,0xbe]
 
-s_quadmask_b32 s0, exec_hi
-// CHECK: [0x7f,0x2c,0x80,0xbe]
+s_quadmask_b32 s5, exec_hi
+// CHECK: [0x7f,0x2c,0x85,0xbe]
 
-s_quadmask_b32 s0, 0
-// CHECK: [0x80,0x2c,0x80,0xbe]
+s_quadmask_b32 s5, 0
+// CHECK: [0x80,0x2c,0x85,0xbe]
 
-s_quadmask_b32 s0, -1
-// CHECK: [0xc1,0x2c,0x80,0xbe]
+s_quadmask_b32 s5, -1
+// CHECK: [0xc1,0x2c,0x85,0xbe]
 
-s_quadmask_b32 s0, 0.5
-// CHECK: [0xf0,0x2c,0x80,0xbe]
+s_quadmask_b32 s5, 0.5
+// CHECK: [0xf0,0x2c,0x85,0xbe]
 
-s_quadmask_b32 s0, -4.0
-// CHECK: [0xf7,0x2c,0x80,0xbe]
+s_quadmask_b32 s5, -4.0
+// CHECK: [0xf7,0x2c,0x85,0xbe]
 
-s_quadmask_b32 s0, 0xaf123456
-// CHECK: [0xff,0x2c,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_quadmask_b32 s5, 0xaf123456
+// CHECK: [0xff,0x2c,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_quadmask_b32 s0, 0x3f717273
-// CHECK: [0xff,0x2c,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_quadmask_b32 s5, 0x3f717273
+// CHECK: [0xff,0x2c,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_quadmask_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x2d,0x80,0xbe]
+s_quadmask_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x2d,0x8a,0xbe]
 
-s_quadmask_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x2d,0x82,0xbe]
+s_quadmask_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x2d,0x8c,0xbe]
 
-s_quadmask_b64 s[102:103], s[0:1]
-// CHECK: [0x00,0x2d,0xe6,0xbe]
+s_quadmask_b64 s[102:103], s[2:3]
+// CHECK: [0x02,0x2d,0xe6,0xbe]
 
-s_quadmask_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x2d,0xe8,0xbe]
+s_quadmask_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x2d,0xe8,0xbe]
 
-s_quadmask_b64 vcc, s[0:1]
-// CHECK: [0x00,0x2d,0xea,0xbe]
+s_quadmask_b64 vcc, s[2:3]
+// CHECK: [0x02,0x2d,0xea,0xbe]
 
-s_quadmask_b64 tba, s[0:1]
-// CHECK: [0x00,0x2d,0xec,0xbe]
+s_quadmask_b64 tba, s[2:3]
+// CHECK: [0x02,0x2d,0xec,0xbe]
 
-s_quadmask_b64 tma, s[0:1]
-// CHECK: [0x00,0x2d,0xee,0xbe]
+s_quadmask_b64 tma, s[2:3]
+// CHECK: [0x02,0x2d,0xee,0xbe]
 
-s_quadmask_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x2d,0xfa,0xbe]
+s_quadmask_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x2d,0xfa,0xbe]
 
-s_quadmask_b64 exec, s[0:1]
-// CHECK: [0x00,0x2d,0xfe,0xbe]
+s_quadmask_b64 exec, s[2:3]
+// CHECK: [0x02,0x2d,0xfe,0xbe]
 
-s_quadmask_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x2d,0x80,0xbe]
+s_quadmask_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x2d,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], s[102:103]
-// CHECK: [0x66,0x2d,0x80,0xbe]
+s_quadmask_b64 s[10:11], s[102:103]
+// CHECK: [0x66,0x2d,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], flat_scratch
-// CHECK: [0x68,0x2d,0x80,0xbe]
+s_quadmask_b64 s[10:11], flat_scratch
+// CHECK: [0x68,0x2d,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], vcc
-// CHECK: [0x6a,0x2d,0x80,0xbe]
+s_quadmask_b64 s[10:11], vcc
+// CHECK: [0x6a,0x2d,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], tba
-// CHECK: [0x6c,0x2d,0x80,0xbe]
+s_quadmask_b64 s[10:11], tba
+// CHECK: [0x6c,0x2d,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], tma
-// CHECK: [0x6e,0x2d,0x80,0xbe]
+s_quadmask_b64 s[10:11], tma
+// CHECK: [0x6e,0x2d,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x2d,0x80,0xbe]
+s_quadmask_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x2d,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], exec
-// CHECK: [0x7e,0x2d,0x80,0xbe]
+s_quadmask_b64 s[10:11], exec
+// CHECK: [0x7e,0x2d,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], 0
-// CHECK: [0x80,0x2d,0x80,0xbe]
+s_quadmask_b64 s[10:11], 0
+// CHECK: [0x80,0x2d,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], -1
-// CHECK: [0xc1,0x2d,0x80,0xbe]
+s_quadmask_b64 s[10:11], -1
+// CHECK: [0xc1,0x2d,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x2d,0x80,0xbe]
+s_quadmask_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x2d,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x2d,0x80,0xbe]
+s_quadmask_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x2d,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x2d,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_quadmask_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x2d,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_quadmask_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x2d,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_quadmask_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x2d,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_movrels_b32 s0, s0
-// CHECK: [0x00,0x2e,0x80,0xbe]
+s_movrels_b32 s5, s1
+// CHECK: [0x01,0x2e,0x85,0xbe]
 
-s_movrels_b32 s103, s0
-// CHECK: [0x00,0x2e,0xe7,0xbe]
+s_movrels_b32 s103, s1
+// CHECK: [0x01,0x2e,0xe7,0xbe]
 
-s_movrels_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x2e,0xe8,0xbe]
+s_movrels_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x2e,0xe8,0xbe]
 
-s_movrels_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x2e,0xe9,0xbe]
+s_movrels_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x2e,0xe9,0xbe]
 
-s_movrels_b32 vcc_lo, s0
-// CHECK: [0x00,0x2e,0xea,0xbe]
+s_movrels_b32 vcc_lo, s1
+// CHECK: [0x01,0x2e,0xea,0xbe]
 
-s_movrels_b32 vcc_hi, s0
-// CHECK: [0x00,0x2e,0xeb,0xbe]
+s_movrels_b32 vcc_hi, s1
+// CHECK: [0x01,0x2e,0xeb,0xbe]
 
-s_movrels_b32 tba_lo, s0
-// CHECK: [0x00,0x2e,0xec,0xbe]
+s_movrels_b32 tba_lo, s1
+// CHECK: [0x01,0x2e,0xec,0xbe]
 
-s_movrels_b32 tba_hi, s0
-// CHECK: [0x00,0x2e,0xed,0xbe]
+s_movrels_b32 tba_hi, s1
+// CHECK: [0x01,0x2e,0xed,0xbe]
 
-s_movrels_b32 tma_lo, s0
-// CHECK: [0x00,0x2e,0xee,0xbe]
+s_movrels_b32 tma_lo, s1
+// CHECK: [0x01,0x2e,0xee,0xbe]
 
-s_movrels_b32 tma_hi, s0
-// CHECK: [0x00,0x2e,0xef,0xbe]
+s_movrels_b32 tma_hi, s1
+// CHECK: [0x01,0x2e,0xef,0xbe]
 
-s_movrels_b32 ttmp11, s0
-// CHECK: [0x00,0x2e,0xfb,0xbe]
+s_movrels_b32 ttmp11, s1
+// CHECK: [0x01,0x2e,0xfb,0xbe]
 
-s_movrels_b32 m0, s0
-// CHECK: [0x00,0x2e,0xfc,0xbe]
+s_movrels_b32 m0, s1
+// CHECK: [0x01,0x2e,0xfc,0xbe]
 
-s_movrels_b32 exec_lo, s0
-// CHECK: [0x00,0x2e,0xfe,0xbe]
+s_movrels_b32 exec_lo, s1
+// CHECK: [0x01,0x2e,0xfe,0xbe]
 
-s_movrels_b32 exec_hi, s0
-// CHECK: [0x00,0x2e,0xff,0xbe]
+s_movrels_b32 exec_hi, s1
+// CHECK: [0x01,0x2e,0xff,0xbe]
 
-s_movrels_b32 s0, s103
-// CHECK: [0x67,0x2e,0x80,0xbe]
+s_movrels_b32 s5, s103
+// CHECK: [0x67,0x2e,0x85,0xbe]
 
-s_movrels_b32 s0, flat_scratch_lo
-// CHECK: [0x68,0x2e,0x80,0xbe]
+s_movrels_b32 s5, flat_scratch_lo
+// CHECK: [0x68,0x2e,0x85,0xbe]
 
-s_movrels_b32 s0, flat_scratch_hi
-// CHECK: [0x69,0x2e,0x80,0xbe]
+s_movrels_b32 s5, flat_scratch_hi
+// CHECK: [0x69,0x2e,0x85,0xbe]
 
-s_movrels_b32 s0, vcc_lo
-// CHECK: [0x6a,0x2e,0x80,0xbe]
+s_movrels_b32 s5, vcc_lo
+// CHECK: [0x6a,0x2e,0x85,0xbe]
 
-s_movrels_b32 s0, vcc_hi
-// CHECK: [0x6b,0x2e,0x80,0xbe]
+s_movrels_b32 s5, vcc_hi
+// CHECK: [0x6b,0x2e,0x85,0xbe]
 
-s_movrels_b32 s0, tba_lo
-// CHECK: [0x6c,0x2e,0x80,0xbe]
+s_movrels_b32 s5, tba_lo
+// CHECK: [0x6c,0x2e,0x85,0xbe]
 
-s_movrels_b32 s0, tba_hi
-// CHECK: [0x6d,0x2e,0x80,0xbe]
+s_movrels_b32 s5, tba_hi
+// CHECK: [0x6d,0x2e,0x85,0xbe]
 
-s_movrels_b32 s0, tma_lo
-// CHECK: [0x6e,0x2e,0x80,0xbe]
+s_movrels_b32 s5, tma_lo
+// CHECK: [0x6e,0x2e,0x85,0xbe]
 
-s_movrels_b32 s0, tma_hi
-// CHECK: [0x6f,0x2e,0x80,0xbe]
+s_movrels_b32 s5, tma_hi
+// CHECK: [0x6f,0x2e,0x85,0xbe]
 
-s_movrels_b32 s0, ttmp11
-// CHECK: [0x7b,0x2e,0x80,0xbe]
+s_movrels_b32 s5, ttmp11
+// CHECK: [0x7b,0x2e,0x85,0xbe]
 
-s_movrels_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x2f,0x80,0xbe]
+s_movrels_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x2f,0x8a,0xbe]
 
-s_movrels_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x2f,0x82,0xbe]
+s_movrels_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x2f,0x8c,0xbe]
 
-s_movrels_b64 s[102:103], s[0:1]
-// CHECK: [0x00,0x2f,0xe6,0xbe]
+s_movrels_b64 s[102:103], s[2:3]
+// CHECK: [0x02,0x2f,0xe6,0xbe]
 
-s_movrels_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x2f,0xe8,0xbe]
+s_movrels_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x2f,0xe8,0xbe]
 
-s_movrels_b64 vcc, s[0:1]
-// CHECK: [0x00,0x2f,0xea,0xbe]
+s_movrels_b64 vcc, s[2:3]
+// CHECK: [0x02,0x2f,0xea,0xbe]
 
-s_movrels_b64 tba, s[0:1]
-// CHECK: [0x00,0x2f,0xec,0xbe]
+s_movrels_b64 tba, s[2:3]
+// CHECK: [0x02,0x2f,0xec,0xbe]
 
-s_movrels_b64 tma, s[0:1]
-// CHECK: [0x00,0x2f,0xee,0xbe]
+s_movrels_b64 tma, s[2:3]
+// CHECK: [0x02,0x2f,0xee,0xbe]
 
-s_movrels_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x2f,0xfa,0xbe]
+s_movrels_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x2f,0xfa,0xbe]
 
-s_movrels_b64 exec, s[0:1]
-// CHECK: [0x00,0x2f,0xfe,0xbe]
+s_movrels_b64 exec, s[2:3]
+// CHECK: [0x02,0x2f,0xfe,0xbe]
 
-s_movrels_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x2f,0x80,0xbe]
+s_movrels_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x2f,0x8a,0xbe]
 
-s_movrels_b64 s[0:1], s[102:103]
-// CHECK: [0x66,0x2f,0x80,0xbe]
+s_movrels_b64 s[10:11], s[102:103]
+// CHECK: [0x66,0x2f,0x8a,0xbe]
 
-s_movrels_b64 s[0:1], flat_scratch
-// CHECK: [0x68,0x2f,0x80,0xbe]
+s_movrels_b64 s[10:11], flat_scratch
+// CHECK: [0x68,0x2f,0x8a,0xbe]
 
-s_movrels_b64 s[0:1], vcc
-// CHECK: [0x6a,0x2f,0x80,0xbe]
+s_movrels_b64 s[10:11], vcc
+// CHECK: [0x6a,0x2f,0x8a,0xbe]
 
-s_movrels_b64 s[0:1], tba
-// CHECK: [0x6c,0x2f,0x80,0xbe]
+s_movrels_b64 s[10:11], tba
+// CHECK: [0x6c,0x2f,0x8a,0xbe]
 
-s_movrels_b64 s[0:1], tma
-// CHECK: [0x6e,0x2f,0x80,0xbe]
+s_movrels_b64 s[10:11], tma
+// CHECK: [0x6e,0x2f,0x8a,0xbe]
 
-s_movrels_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x2f,0x80,0xbe]
+s_movrels_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x2f,0x8a,0xbe]
 
-s_movreld_b32 s0, s0
-// CHECK: [0x00,0x30,0x80,0xbe]
+s_movreld_b32 s5, s1
+// CHECK: [0x01,0x30,0x85,0xbe]
 
-s_movreld_b32 s103, s0
-// CHECK: [0x00,0x30,0xe7,0xbe]
+s_movreld_b32 s103, s1
+// CHECK: [0x01,0x30,0xe7,0xbe]
 
-s_movreld_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x30,0xe8,0xbe]
+s_movreld_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x30,0xe8,0xbe]
 
-s_movreld_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x30,0xe9,0xbe]
+s_movreld_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x30,0xe9,0xbe]
 
-s_movreld_b32 vcc_lo, s0
-// CHECK: [0x00,0x30,0xea,0xbe]
+s_movreld_b32 vcc_lo, s1
+// CHECK: [0x01,0x30,0xea,0xbe]
 
-s_movreld_b32 vcc_hi, s0
-// CHECK: [0x00,0x30,0xeb,0xbe]
+s_movreld_b32 vcc_hi, s1
+// CHECK: [0x01,0x30,0xeb,0xbe]
 
-s_movreld_b32 tba_lo, s0
-// CHECK: [0x00,0x30,0xec,0xbe]
+s_movreld_b32 tba_lo, s1
+// CHECK: [0x01,0x30,0xec,0xbe]
 
-s_movreld_b32 tba_hi, s0
-// CHECK: [0x00,0x30,0xed,0xbe]
+s_movreld_b32 tba_hi, s1
+// CHECK: [0x01,0x30,0xed,0xbe]
 
-s_movreld_b32 tma_lo, s0
-// CHECK: [0x00,0x30,0xee,0xbe]
+s_movreld_b32 tma_lo, s1
+// CHECK: [0x01,0x30,0xee,0xbe]
 
-s_movreld_b32 tma_hi, s0
-// CHECK: [0x00,0x30,0xef,0xbe]
+s_movreld_b32 tma_hi, s1
+// CHECK: [0x01,0x30,0xef,0xbe]
 
-s_movreld_b32 ttmp11, s0
-// CHECK: [0x00,0x30,0xfb,0xbe]
+s_movreld_b32 ttmp11, s1
+// CHECK: [0x01,0x30,0xfb,0xbe]
 
-s_movreld_b32 s0, s103
-// CHECK: [0x67,0x30,0x80,0xbe]
+s_movreld_b32 s5, s103
+// CHECK: [0x67,0x30,0x85,0xbe]
 
-s_movreld_b32 s0, flat_scratch_lo
-// CHECK: [0x68,0x30,0x80,0xbe]
+s_movreld_b32 s5, flat_scratch_lo
+// CHECK: [0x68,0x30,0x85,0xbe]
 
-s_movreld_b32 s0, flat_scratch_hi
-// CHECK: [0x69,0x30,0x80,0xbe]
+s_movreld_b32 s5, flat_scratch_hi
+// CHECK: [0x69,0x30,0x85,0xbe]
 
-s_movreld_b32 s0, vcc_lo
-// CHECK: [0x6a,0x30,0x80,0xbe]
+s_movreld_b32 s5, vcc_lo
+// CHECK: [0x6a,0x30,0x85,0xbe]
 
-s_movreld_b32 s0, vcc_hi
-// CHECK: [0x6b,0x30,0x80,0xbe]
+s_movreld_b32 s5, vcc_hi
+// CHECK: [0x6b,0x30,0x85,0xbe]
 
-s_movreld_b32 s0, tba_lo
-// CHECK: [0x6c,0x30,0x80,0xbe]
+s_movreld_b32 s5, tba_lo
+// CHECK: [0x6c,0x30,0x85,0xbe]
 
-s_movreld_b32 s0, tba_hi
-// CHECK: [0x6d,0x30,0x80,0xbe]
+s_movreld_b32 s5, tba_hi
+// CHECK: [0x6d,0x30,0x85,0xbe]
 
-s_movreld_b32 s0, tma_lo
-// CHECK: [0x6e,0x30,0x80,0xbe]
+s_movreld_b32 s5, tma_lo
+// CHECK: [0x6e,0x30,0x85,0xbe]
 
-s_movreld_b32 s0, tma_hi
-// CHECK: [0x6f,0x30,0x80,0xbe]
+s_movreld_b32 s5, tma_hi
+// CHECK: [0x6f,0x30,0x85,0xbe]
 
-s_movreld_b32 s0, ttmp11
-// CHECK: [0x7b,0x30,0x80,0xbe]
+s_movreld_b32 s5, ttmp11
+// CHECK: [0x7b,0x30,0x85,0xbe]
 
-s_movreld_b32 s0, m0
-// CHECK: [0x7c,0x30,0x80,0xbe]
+s_movreld_b32 s5, m0
+// CHECK: [0x7c,0x30,0x85,0xbe]
 
-s_movreld_b32 s0, 0
-// CHECK: [0x80,0x30,0x80,0xbe]
+s_movreld_b32 s5, 0
+// CHECK: [0x80,0x30,0x85,0xbe]
 
-s_movreld_b32 s0, -1
-// CHECK: [0xc1,0x30,0x80,0xbe]
+s_movreld_b32 s5, -1
+// CHECK: [0xc1,0x30,0x85,0xbe]
 
-s_movreld_b32 s0, 0.5
-// CHECK: [0xf0,0x30,0x80,0xbe]
+s_movreld_b32 s5, 0.5
+// CHECK: [0xf0,0x30,0x85,0xbe]
 
-s_movreld_b32 s0, -4.0
-// CHECK: [0xf7,0x30,0x80,0xbe]
+s_movreld_b32 s5, -4.0
+// CHECK: [0xf7,0x30,0x85,0xbe]
 
-s_movreld_b32 s0, 0xaf123456
-// CHECK: [0xff,0x30,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_movreld_b32 s5, 0xaf123456
+// CHECK: [0xff,0x30,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_movreld_b32 s0, 0x3f717273
-// CHECK: [0xff,0x30,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_movreld_b32 s5, 0x3f717273
+// CHECK: [0xff,0x30,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_movreld_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x31,0x80,0xbe]
+s_movreld_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x31,0x8a,0xbe]
 
-s_movreld_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x31,0x82,0xbe]
+s_movreld_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x31,0x8c,0xbe]
 
-s_movreld_b64 s[102:103], s[0:1]
-// CHECK: [0x00,0x31,0xe6,0xbe]
+s_movreld_b64 s[102:103], s[2:3]
+// CHECK: [0x02,0x31,0xe6,0xbe]
 
-s_movreld_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x31,0xe8,0xbe]
+s_movreld_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x31,0xe8,0xbe]
 
-s_movreld_b64 vcc, s[0:1]
-// CHECK: [0x00,0x31,0xea,0xbe]
+s_movreld_b64 vcc, s[2:3]
+// CHECK: [0x02,0x31,0xea,0xbe]
 
-s_movreld_b64 tba, s[0:1]
-// CHECK: [0x00,0x31,0xec,0xbe]
+s_movreld_b64 tba, s[2:3]
+// CHECK: [0x02,0x31,0xec,0xbe]
 
-s_movreld_b64 tma, s[0:1]
-// CHECK: [0x00,0x31,0xee,0xbe]
+s_movreld_b64 tma, s[2:3]
+// CHECK: [0x02,0x31,0xee,0xbe]
 
-s_movreld_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x31,0xfa,0xbe]
+s_movreld_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x31,0xfa,0xbe]
 
-s_movreld_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x31,0x80,0xbe]
+s_movreld_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x31,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], s[102:103]
-// CHECK: [0x66,0x31,0x80,0xbe]
+s_movreld_b64 s[10:11], s[102:103]
+// CHECK: [0x66,0x31,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], flat_scratch
-// CHECK: [0x68,0x31,0x80,0xbe]
+s_movreld_b64 s[10:11], flat_scratch
+// CHECK: [0x68,0x31,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], vcc
-// CHECK: [0x6a,0x31,0x80,0xbe]
+s_movreld_b64 s[10:11], vcc
+// CHECK: [0x6a,0x31,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], tba
-// CHECK: [0x6c,0x31,0x80,0xbe]
+s_movreld_b64 s[10:11], tba
+// CHECK: [0x6c,0x31,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], tma
-// CHECK: [0x6e,0x31,0x80,0xbe]
+s_movreld_b64 s[10:11], tma
+// CHECK: [0x6e,0x31,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x31,0x80,0xbe]
+s_movreld_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x31,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], 0
-// CHECK: [0x80,0x31,0x80,0xbe]
+s_movreld_b64 s[10:11], 0
+// CHECK: [0x80,0x31,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], -1
-// CHECK: [0xc1,0x31,0x80,0xbe]
+s_movreld_b64 s[10:11], -1
+// CHECK: [0xc1,0x31,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x31,0x80,0xbe]
+s_movreld_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x31,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x31,0x80,0xbe]
+s_movreld_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x31,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x31,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_movreld_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x31,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_movreld_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x31,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_movreld_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x31,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_abs_i32 s0, s0
-// CHECK: [0x00,0x34,0x80,0xbe]
+s_abs_i32 s5, s1
+// CHECK: [0x01,0x34,0x85,0xbe]
 
-s_abs_i32 s103, s0
-// CHECK: [0x00,0x34,0xe7,0xbe]
+s_abs_i32 s103, s1
+// CHECK: [0x01,0x34,0xe7,0xbe]
 
-s_abs_i32 flat_scratch_lo, s0
-// CHECK: [0x00,0x34,0xe8,0xbe]
+s_abs_i32 flat_scratch_lo, s1
+// CHECK: [0x01,0x34,0xe8,0xbe]
 
-s_abs_i32 flat_scratch_hi, s0
-// CHECK: [0x00,0x34,0xe9,0xbe]
+s_abs_i32 flat_scratch_hi, s1
+// CHECK: [0x01,0x34,0xe9,0xbe]
 
-s_abs_i32 vcc_lo, s0
-// CHECK: [0x00,0x34,0xea,0xbe]
+s_abs_i32 vcc_lo, s1
+// CHECK: [0x01,0x34,0xea,0xbe]
 
-s_abs_i32 vcc_hi, s0
-// CHECK: [0x00,0x34,0xeb,0xbe]
+s_abs_i32 vcc_hi, s1
+// CHECK: [0x01,0x34,0xeb,0xbe]
 
-s_abs_i32 tba_lo, s0
-// CHECK: [0x00,0x34,0xec,0xbe]
+s_abs_i32 tba_lo, s1
+// CHECK: [0x01,0x34,0xec,0xbe]
 
-s_abs_i32 tba_hi, s0
-// CHECK: [0x00,0x34,0xed,0xbe]
+s_abs_i32 tba_hi, s1
+// CHECK: [0x01,0x34,0xed,0xbe]
 
-s_abs_i32 tma_lo, s0
-// CHECK: [0x00,0x34,0xee,0xbe]
+s_abs_i32 tma_lo, s1
+// CHECK: [0x01,0x34,0xee,0xbe]
 
-s_abs_i32 tma_hi, s0
-// CHECK: [0x00,0x34,0xef,0xbe]
+s_abs_i32 tma_hi, s1
+// CHECK: [0x01,0x34,0xef,0xbe]
 
-s_abs_i32 ttmp11, s0
-// CHECK: [0x00,0x34,0xfb,0xbe]
+s_abs_i32 ttmp11, s1
+// CHECK: [0x01,0x34,0xfb,0xbe]
 
-s_abs_i32 m0, s0
-// CHECK: [0x00,0x34,0xfc,0xbe]
+s_abs_i32 m0, s1
+// CHECK: [0x01,0x34,0xfc,0xbe]
 
-s_abs_i32 exec_lo, s0
-// CHECK: [0x00,0x34,0xfe,0xbe]
+s_abs_i32 exec_lo, s1
+// CHECK: [0x01,0x34,0xfe,0xbe]
 
-s_abs_i32 exec_hi, s0
-// CHECK: [0x00,0x34,0xff,0xbe]
+s_abs_i32 exec_hi, s1
+// CHECK: [0x01,0x34,0xff,0xbe]
 
-s_abs_i32 s0, s103
-// CHECK: [0x67,0x34,0x80,0xbe]
+s_abs_i32 s5, s103
+// CHECK: [0x67,0x34,0x85,0xbe]
 
-s_abs_i32 s0, flat_scratch_lo
-// CHECK: [0x68,0x34,0x80,0xbe]
+s_abs_i32 s5, flat_scratch_lo
+// CHECK: [0x68,0x34,0x85,0xbe]
 
-s_abs_i32 s0, flat_scratch_hi
-// CHECK: [0x69,0x34,0x80,0xbe]
+s_abs_i32 s5, flat_scratch_hi
+// CHECK: [0x69,0x34,0x85,0xbe]
 
-s_abs_i32 s0, vcc_lo
-// CHECK: [0x6a,0x34,0x80,0xbe]
+s_abs_i32 s5, vcc_lo
+// CHECK: [0x6a,0x34,0x85,0xbe]
 
-s_abs_i32 s0, vcc_hi
-// CHECK: [0x6b,0x34,0x80,0xbe]
+s_abs_i32 s5, vcc_hi
+// CHECK: [0x6b,0x34,0x85,0xbe]
 
-s_abs_i32 s0, tba_lo
-// CHECK: [0x6c,0x34,0x80,0xbe]
+s_abs_i32 s5, tba_lo
+// CHECK: [0x6c,0x34,0x85,0xbe]
 
-s_abs_i32 s0, tba_hi
-// CHECK: [0x6d,0x34,0x80,0xbe]
+s_abs_i32 s5, tba_hi
+// CHECK: [0x6d,0x34,0x85,0xbe]
 
-s_abs_i32 s0, tma_lo
-// CHECK: [0x6e,0x34,0x80,0xbe]
+s_abs_i32 s5, tma_lo
+// CHECK: [0x6e,0x34,0x85,0xbe]
 
-s_abs_i32 s0, tma_hi
-// CHECK: [0x6f,0x34,0x80,0xbe]
+s_abs_i32 s5, tma_hi
+// CHECK: [0x6f,0x34,0x85,0xbe]
 
-s_abs_i32 s0, ttmp11
-// CHECK: [0x7b,0x34,0x80,0xbe]
+s_abs_i32 s5, ttmp11
+// CHECK: [0x7b,0x34,0x85,0xbe]
 
-s_abs_i32 s0, m0
-// CHECK: [0x7c,0x34,0x80,0xbe]
+s_abs_i32 s5, m0
+// CHECK: [0x7c,0x34,0x85,0xbe]
 
-s_abs_i32 s0, exec_lo
-// CHECK: [0x7e,0x34,0x80,0xbe]
+s_abs_i32 s5, exec_lo
+// CHECK: [0x7e,0x34,0x85,0xbe]
 
-s_abs_i32 s0, exec_hi
-// CHECK: [0x7f,0x34,0x80,0xbe]
+s_abs_i32 s5, exec_hi
+// CHECK: [0x7f,0x34,0x85,0xbe]
 
-s_abs_i32 s0, 0
-// CHECK: [0x80,0x34,0x80,0xbe]
+s_abs_i32 s5, 0
+// CHECK: [0x80,0x34,0x85,0xbe]
 
-s_abs_i32 s0, -1
-// CHECK: [0xc1,0x34,0x80,0xbe]
+s_abs_i32 s5, -1
+// CHECK: [0xc1,0x34,0x85,0xbe]
 
-s_abs_i32 s0, 0.5
-// CHECK: [0xf0,0x34,0x80,0xbe]
+s_abs_i32 s5, 0.5
+// CHECK: [0xf0,0x34,0x85,0xbe]
 
-s_abs_i32 s0, -4.0
-// CHECK: [0xf7,0x34,0x80,0xbe]
+s_abs_i32 s5, -4.0
+// CHECK: [0xf7,0x34,0x85,0xbe]
 
-s_abs_i32 s0, 0xaf123456
-// CHECK: [0xff,0x34,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_abs_i32 s5, 0xaf123456
+// CHECK: [0xff,0x34,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_abs_i32 s0, 0x3f717273
-// CHECK: [0xff,0x34,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_abs_i32 s5, 0x3f717273
+// CHECK: [0xff,0x34,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_mov_fed_b32 s0, s0
-// CHECK: [0x00,0x35,0x80,0xbe]
+s_mov_fed_b32 s5, s1
+// CHECK: [0x01,0x35,0x85,0xbe]
 
-s_mov_fed_b32 s103, s0
-// CHECK: [0x00,0x35,0xe7,0xbe]
+s_mov_fed_b32 s103, s1
+// CHECK: [0x01,0x35,0xe7,0xbe]
 
-s_mov_fed_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x35,0xe8,0xbe]
+s_mov_fed_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x35,0xe8,0xbe]
 
-s_mov_fed_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x35,0xe9,0xbe]
+s_mov_fed_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x35,0xe9,0xbe]
 
-s_mov_fed_b32 vcc_lo, s0
-// CHECK: [0x00,0x35,0xea,0xbe]
+s_mov_fed_b32 vcc_lo, s1
+// CHECK: [0x01,0x35,0xea,0xbe]
 
-s_mov_fed_b32 vcc_hi, s0
-// CHECK: [0x00,0x35,0xeb,0xbe]
+s_mov_fed_b32 vcc_hi, s1
+// CHECK: [0x01,0x35,0xeb,0xbe]
 
-s_mov_fed_b32 tba_lo, s0
-// CHECK: [0x00,0x35,0xec,0xbe]
+s_mov_fed_b32 tba_lo, s1
+// CHECK: [0x01,0x35,0xec,0xbe]
 
-s_mov_fed_b32 tba_hi, s0
-// CHECK: [0x00,0x35,0xed,0xbe]
+s_mov_fed_b32 tba_hi, s1
+// CHECK: [0x01,0x35,0xed,0xbe]
 
-s_mov_fed_b32 tma_lo, s0
-// CHECK: [0x00,0x35,0xee,0xbe]
+s_mov_fed_b32 tma_lo, s1
+// CHECK: [0x01,0x35,0xee,0xbe]
 
-s_mov_fed_b32 tma_hi, s0
-// CHECK: [0x00,0x35,0xef,0xbe]
+s_mov_fed_b32 tma_hi, s1
+// CHECK: [0x01,0x35,0xef,0xbe]
 
-s_mov_fed_b32 ttmp11, s0
-// CHECK: [0x00,0x35,0xfb,0xbe]
+s_mov_fed_b32 ttmp11, s1
+// CHECK: [0x01,0x35,0xfb,0xbe]
 
-s_mov_fed_b32 m0, s0
-// CHECK: [0x00,0x35,0xfc,0xbe]
+s_mov_fed_b32 m0, s1
+// CHECK: [0x01,0x35,0xfc,0xbe]
 
-s_mov_fed_b32 exec_lo, s0
-// CHECK: [0x00,0x35,0xfe,0xbe]
+s_mov_fed_b32 exec_lo, s1
+// CHECK: [0x01,0x35,0xfe,0xbe]
 
-s_mov_fed_b32 exec_hi, s0
-// CHECK: [0x00,0x35,0xff,0xbe]
+s_mov_fed_b32 exec_hi, s1
+// CHECK: [0x01,0x35,0xff,0xbe]
 
-s_mov_fed_b32 s0, s103
-// CHECK: [0x67,0x35,0x80,0xbe]
+s_mov_fed_b32 s5, s103
+// CHECK: [0x67,0x35,0x85,0xbe]
 
-s_mov_fed_b32 s0, flat_scratch_lo
-// CHECK: [0x68,0x35,0x80,0xbe]
+s_mov_fed_b32 s5, flat_scratch_lo
+// CHECK: [0x68,0x35,0x85,0xbe]
 
-s_mov_fed_b32 s0, flat_scratch_hi
-// CHECK: [0x69,0x35,0x80,0xbe]
+s_mov_fed_b32 s5, flat_scratch_hi
+// CHECK: [0x69,0x35,0x85,0xbe]
 
-s_mov_fed_b32 s0, vcc_lo
-// CHECK: [0x6a,0x35,0x80,0xbe]
+s_mov_fed_b32 s5, vcc_lo
+// CHECK: [0x6a,0x35,0x85,0xbe]
 
-s_mov_fed_b32 s0, vcc_hi
-// CHECK: [0x6b,0x35,0x80,0xbe]
+s_mov_fed_b32 s5, vcc_hi
+// CHECK: [0x6b,0x35,0x85,0xbe]
 
-s_mov_fed_b32 s0, tba_lo
-// CHECK: [0x6c,0x35,0x80,0xbe]
+s_mov_fed_b32 s5, tba_lo
+// CHECK: [0x6c,0x35,0x85,0xbe]
 
-s_mov_fed_b32 s0, tba_hi
-// CHECK: [0x6d,0x35,0x80,0xbe]
+s_mov_fed_b32 s5, tba_hi
+// CHECK: [0x6d,0x35,0x85,0xbe]
 
-s_mov_fed_b32 s0, tma_lo
-// CHECK: [0x6e,0x35,0x80,0xbe]
+s_mov_fed_b32 s5, tma_lo
+// CHECK: [0x6e,0x35,0x85,0xbe]
 
-s_mov_fed_b32 s0, tma_hi
-// CHECK: [0x6f,0x35,0x80,0xbe]
+s_mov_fed_b32 s5, tma_hi
+// CHECK: [0x6f,0x35,0x85,0xbe]
 
-s_mov_fed_b32 s0, ttmp11
-// CHECK: [0x7b,0x35,0x80,0xbe]
+s_mov_fed_b32 s5, ttmp11
+// CHECK: [0x7b,0x35,0x85,0xbe]
 
-s_mov_fed_b32 s0, m0
-// CHECK: [0x7c,0x35,0x80,0xbe]
+s_mov_fed_b32 s5, m0
+// CHECK: [0x7c,0x35,0x85,0xbe]
 
-s_mov_fed_b32 s0, exec_lo
-// CHECK: [0x7e,0x35,0x80,0xbe]
+s_mov_fed_b32 s5, exec_lo
+// CHECK: [0x7e,0x35,0x85,0xbe]
 
-s_mov_fed_b32 s0, exec_hi
-// CHECK: [0x7f,0x35,0x80,0xbe]
+s_mov_fed_b32 s5, exec_hi
+// CHECK: [0x7f,0x35,0x85,0xbe]
 
-s_mov_fed_b32 s0, 0
-// CHECK: [0x80,0x35,0x80,0xbe]
+s_mov_fed_b32 s5, 0
+// CHECK: [0x80,0x35,0x85,0xbe]
 
-s_mov_fed_b32 s0, -1
-// CHECK: [0xc1,0x35,0x80,0xbe]
+s_mov_fed_b32 s5, -1
+// CHECK: [0xc1,0x35,0x85,0xbe]
 
-s_mov_fed_b32 s0, 0.5
-// CHECK: [0xf0,0x35,0x80,0xbe]
+s_mov_fed_b32 s5, 0.5
+// CHECK: [0xf0,0x35,0x85,0xbe]
 
-s_mov_fed_b32 s0, -4.0
-// CHECK: [0xf7,0x35,0x80,0xbe]
+s_mov_fed_b32 s5, -4.0
+// CHECK: [0xf7,0x35,0x85,0xbe]
 
-s_mov_fed_b32 s0, 0xaf123456
-// CHECK: [0xff,0x35,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_mov_fed_b32 s5, 0xaf123456
+// CHECK: [0xff,0x35,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_mov_fed_b32 s0, 0x3f717273
-// CHECK: [0xff,0x35,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_mov_fed_b32 s5, 0x3f717273
+// CHECK: [0xff,0x35,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_add_u32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x80]
+s_add_u32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x80]
 
-s_add_u32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x80]
+s_add_u32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x80]
 
-s_add_u32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x80]
+s_add_u32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x80]
 
-s_add_u32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x80]
+s_add_u32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x80]
 
-s_add_u32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x80]
+s_add_u32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x80]
 
-s_add_u32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x80]
+s_add_u32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x80]
 
-s_add_u32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x80]
+s_add_u32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x80]
 
-s_add_u32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x80]
+s_add_u32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x80]
 
-s_add_u32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x80]
+s_add_u32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x80]
 
-s_add_u32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x80]
+s_add_u32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x80]
 
-s_add_u32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x80]
+s_add_u32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x80]
 
-s_add_u32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x80]
+s_add_u32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x80]
 
-s_add_u32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x80]
+s_add_u32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x80]
 
-s_add_u32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x80]
+s_add_u32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x80]
 
-s_add_u32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x80]
+s_add_u32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x80]
 
-s_add_u32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x80]
+s_add_u32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x80]
 
-s_add_u32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x80]
+s_add_u32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x80]
 
-s_add_u32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x80]
+s_add_u32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x80]
 
-s_add_u32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x80]
+s_add_u32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x80]
 
-s_add_u32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x80]
+s_add_u32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x80]
 
-s_add_u32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x80]
+s_add_u32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x80]
 
-s_add_u32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x80]
+s_add_u32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x80]
 
-s_add_u32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x80]
+s_add_u32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x80]
 
-s_add_u32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x80]
+s_add_u32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x80]
 
-s_add_u32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x80]
+s_add_u32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x80]
 
-s_add_u32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x80]
+s_add_u32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x80]
 
-s_add_u32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x80]
+s_add_u32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x80]
 
-s_add_u32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x80]
+s_add_u32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x80]
 
-s_add_u32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x80]
+s_add_u32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x80]
 
-s_add_u32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x80]
+s_add_u32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x80]
 
-s_add_u32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x80]
+s_add_u32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x80]
 
-s_add_u32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x80,0x56,0x34,0x12,0xaf]
+s_add_u32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x80,0x56,0x34,0x12,0xaf]
 
-s_add_u32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x80,0x73,0x72,0x71,0x3f]
+s_add_u32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x80,0x73,0x72,0x71,0x3f]
 
-s_add_u32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x80]
+s_add_u32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x80]
 
-s_add_u32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x80]
+s_add_u32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x80]
 
-s_add_u32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x80]
+s_add_u32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x80]
 
-s_add_u32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x80]
+s_add_u32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x80]
 
-s_add_u32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x80]
+s_add_u32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x80]
 
-s_add_u32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x80]
+s_add_u32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x80]
 
-s_add_u32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x80]
+s_add_u32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x80]
 
-s_add_u32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x80]
+s_add_u32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x80]
 
-s_add_u32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x80]
+s_add_u32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x80]
 
-s_add_u32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x80]
+s_add_u32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x80]
 
-s_add_u32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x80]
+s_add_u32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x80]
 
-s_add_u32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x80]
+s_add_u32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x80]
 
-s_add_u32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x80]
+s_add_u32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x80]
 
-s_add_u32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x80]
+s_add_u32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x80]
 
-s_add_u32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x80]
+s_add_u32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x80]
 
-s_add_u32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x80]
+s_add_u32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x80]
 
-s_add_u32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x80]
+s_add_u32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x80]
 
-s_add_u32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x80,0x56,0x34,0x12,0xaf]
+s_add_u32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x80,0x56,0x34,0x12,0xaf]
 
-s_add_u32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x80,0x73,0x72,0x71,0x3f]
+s_add_u32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x80,0x73,0x72,0x71,0x3f]
 
-s_sub_u32 s0, s0, s0
-// CHECK: [0x00,0x00,0x80,0x80]
+s_sub_u32 s5, s1, s2
+// CHECK: [0x01,0x02,0x85,0x80]
 
-s_sub_u32 s103, s0, s0
-// CHECK: [0x00,0x00,0xe7,0x80]
+s_sub_u32 s103, s1, s2
+// CHECK: [0x01,0x02,0xe7,0x80]
 
-s_sub_u32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0xe8,0x80]
+s_sub_u32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0xe8,0x80]
 
-s_sub_u32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0xe9,0x80]
+s_sub_u32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0xe9,0x80]
 
-s_sub_u32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0xea,0x80]
+s_sub_u32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0xea,0x80]
 
-s_sub_u32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0xeb,0x80]
+s_sub_u32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0xeb,0x80]
 
-s_sub_u32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0xec,0x80]
+s_sub_u32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0xec,0x80]
 
-s_sub_u32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0xed,0x80]
+s_sub_u32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0xed,0x80]
 
-s_sub_u32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0xee,0x80]
+s_sub_u32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0xee,0x80]
 
-s_sub_u32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0xef,0x80]
+s_sub_u32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0xef,0x80]
 
-s_sub_u32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0xfb,0x80]
+s_sub_u32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0xfb,0x80]
 
-s_sub_u32 m0, s0, s0
-// CHECK: [0x00,0x00,0xfc,0x80]
+s_sub_u32 m0, s1, s2
+// CHECK: [0x01,0x02,0xfc,0x80]
 
-s_sub_u32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0xfe,0x80]
+s_sub_u32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0xfe,0x80]
 
-s_sub_u32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0xff,0x80]
+s_sub_u32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0xff,0x80]
 
-s_sub_u32 s0, s103, s0
-// CHECK: [0x67,0x00,0x80,0x80]
+s_sub_u32 s5, s103, s2
+// CHECK: [0x67,0x02,0x85,0x80]
 
-s_sub_u32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x80,0x80]
+s_sub_u32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x85,0x80]
 
-s_sub_u32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x80,0x80]
+s_sub_u32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x85,0x80]
 
-s_sub_u32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x80,0x80]
+s_sub_u32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x85,0x80]
 
-s_sub_u32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x80,0x80]
+s_sub_u32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x85,0x80]
 
-s_sub_u32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x80,0x80]
+s_sub_u32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x85,0x80]
 
-s_sub_u32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x80,0x80]
+s_sub_u32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x85,0x80]
 
-s_sub_u32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x80,0x80]
+s_sub_u32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x85,0x80]
 
-s_sub_u32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x80,0x80]
+s_sub_u32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x85,0x80]
 
-s_sub_u32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x80,0x80]
+s_sub_u32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x85,0x80]
 
-s_sub_u32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x80,0x80]
+s_sub_u32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x85,0x80]
 
-s_sub_u32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x80,0x80]
+s_sub_u32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x85,0x80]
 
-s_sub_u32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x80,0x80]
+s_sub_u32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x85,0x80]
 
-s_sub_u32 s0, 0, s0
-// CHECK: [0x80,0x00,0x80,0x80]
+s_sub_u32 s5, 0, s2
+// CHECK: [0x80,0x02,0x85,0x80]
 
-s_sub_u32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x80,0x80]
+s_sub_u32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x85,0x80]
 
-s_sub_u32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x80]
+s_sub_u32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x85,0x80]
 
-s_sub_u32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x80]
+s_sub_u32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x85,0x80]
 
-s_sub_u32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x80,0x56,0x34,0x12,0xaf]
+s_sub_u32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x85,0x80,0x56,0x34,0x12,0xaf]
 
-s_sub_u32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x80,0x73,0x72,0x71,0x3f]
+s_sub_u32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x85,0x80,0x73,0x72,0x71,0x3f]
 
-s_sub_u32 s0, s0, s103
-// CHECK: [0x00,0x67,0x80,0x80]
+s_sub_u32 s5, s1, s103
+// CHECK: [0x01,0x67,0x85,0x80]
 
-s_sub_u32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x80,0x80]
+s_sub_u32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x85,0x80]
 
-s_sub_u32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x80,0x80]
+s_sub_u32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x85,0x80]
 
-s_sub_u32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x80]
+s_sub_u32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x85,0x80]
 
-s_sub_u32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x80]
+s_sub_u32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x85,0x80]
 
-s_sub_u32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x80,0x80]
+s_sub_u32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x85,0x80]
 
-s_sub_u32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x80,0x80]
+s_sub_u32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x85,0x80]
 
-s_sub_u32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x80,0x80]
+s_sub_u32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x85,0x80]
 
-s_sub_u32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x80,0x80]
+s_sub_u32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x85,0x80]
 
-s_sub_u32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x80,0x80]
+s_sub_u32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x85,0x80]
 
-s_sub_u32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x80,0x80]
+s_sub_u32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x85,0x80]
 
-s_sub_u32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x80,0x80]
+s_sub_u32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x85,0x80]
 
-s_sub_u32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x80,0x80]
+s_sub_u32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x85,0x80]
 
-s_sub_u32 s0, s0, 0
-// CHECK: [0x00,0x80,0x80,0x80]
+s_sub_u32 s5, s1, 0
+// CHECK: [0x01,0x80,0x85,0x80]
 
-s_sub_u32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x80,0x80]
+s_sub_u32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x85,0x80]
 
-s_sub_u32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x80,0x80]
+s_sub_u32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x85,0x80]
 
-s_sub_u32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x80,0x80]
+s_sub_u32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x85,0x80]
 
-s_sub_u32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x80,0x56,0x34,0x12,0xaf]
+s_sub_u32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x85,0x80,0x56,0x34,0x12,0xaf]
 
-s_sub_u32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x80,0x73,0x72,0x71,0x3f]
+s_sub_u32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x85,0x80,0x73,0x72,0x71,0x3f]
 
-s_add_i32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x81]
+s_add_i32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x81]
 
-s_add_i32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x81]
+s_add_i32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x81]
 
-s_add_i32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x81]
+s_add_i32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x81]
 
-s_add_i32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x81]
+s_add_i32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x81]
 
-s_add_i32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x81]
+s_add_i32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x81]
 
-s_add_i32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x81]
+s_add_i32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x81]
 
-s_add_i32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x81]
+s_add_i32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x81]
 
-s_add_i32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x81]
+s_add_i32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x81]
 
-s_add_i32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x81]
+s_add_i32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x81]
 
-s_add_i32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x81]
+s_add_i32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x81]
 
-s_add_i32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x81]
+s_add_i32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x81]
 
-s_add_i32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x81]
+s_add_i32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x81]
 
-s_add_i32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x81]
+s_add_i32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x81]
 
-s_add_i32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x81]
+s_add_i32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x81]
 
-s_add_i32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x81]
+s_add_i32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x81]
 
-s_add_i32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x81]
+s_add_i32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x81]
 
-s_add_i32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x81]
+s_add_i32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x81]
 
-s_add_i32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x81]
+s_add_i32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x81]
 
-s_add_i32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x81]
+s_add_i32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x81]
 
-s_add_i32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x81]
+s_add_i32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x81]
 
-s_add_i32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x81]
+s_add_i32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x81]
 
-s_add_i32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x81]
+s_add_i32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x81]
 
-s_add_i32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x81]
+s_add_i32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x81]
 
-s_add_i32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x81]
+s_add_i32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x81]
 
-s_add_i32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x81]
+s_add_i32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x81]
 
-s_add_i32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x81]
+s_add_i32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x81]
 
-s_add_i32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x81]
+s_add_i32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x81]
 
-s_add_i32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x81]
+s_add_i32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x81]
 
-s_add_i32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x81]
+s_add_i32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x81]
 
-s_add_i32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x81]
+s_add_i32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x81]
 
-s_add_i32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x81]
+s_add_i32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x81]
 
-s_add_i32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x81,0x56,0x34,0x12,0xaf]
+s_add_i32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x81,0x56,0x34,0x12,0xaf]
 
-s_add_i32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x81,0x73,0x72,0x71,0x3f]
+s_add_i32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x81,0x73,0x72,0x71,0x3f]
 
-s_add_i32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x81]
+s_add_i32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x81]
 
-s_add_i32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x81]
+s_add_i32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x81]
 
-s_add_i32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x81]
+s_add_i32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x81]
 
-s_add_i32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x81]
+s_add_i32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x81]
 
-s_add_i32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x81]
+s_add_i32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x81]
 
-s_add_i32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x81]
+s_add_i32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x81]
 
-s_add_i32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x81]
+s_add_i32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x81]
 
-s_add_i32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x81]
+s_add_i32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x81]
 
-s_add_i32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x81]
+s_add_i32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x81]
 
-s_add_i32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x81]
+s_add_i32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x81]
 
-s_add_i32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x81]
+s_add_i32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x81]
 
-s_add_i32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x81]
+s_add_i32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x81]
 
-s_add_i32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x81]
+s_add_i32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x81]
 
-s_add_i32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x81]
+s_add_i32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x81]
 
-s_add_i32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x81]
+s_add_i32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x81]
 
-s_add_i32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x81]
+s_add_i32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x81]
 
-s_add_i32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x81]
+s_add_i32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x81]
 
-s_add_i32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x81,0x56,0x34,0x12,0xaf]
+s_add_i32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x81,0x56,0x34,0x12,0xaf]
 
-s_add_i32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x81,0x73,0x72,0x71,0x3f]
+s_add_i32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x81,0x73,0x72,0x71,0x3f]
 
-s_sub_i32 s0, s0, s0
-// CHECK: [0x00,0x00,0x80,0x81]
+s_sub_i32 s5, s1, s2
+// CHECK: [0x01,0x02,0x85,0x81]
 
-s_sub_i32 s103, s0, s0
-// CHECK: [0x00,0x00,0xe7,0x81]
+s_sub_i32 s103, s1, s2
+// CHECK: [0x01,0x02,0xe7,0x81]
 
-s_sub_i32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0xe8,0x81]
+s_sub_i32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0xe8,0x81]
 
-s_sub_i32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0xe9,0x81]
+s_sub_i32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0xe9,0x81]
 
-s_sub_i32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0xea,0x81]
+s_sub_i32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0xea,0x81]
 
-s_sub_i32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0xeb,0x81]
+s_sub_i32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0xeb,0x81]
 
-s_sub_i32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0xec,0x81]
+s_sub_i32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0xec,0x81]
 
-s_sub_i32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0xed,0x81]
+s_sub_i32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0xed,0x81]
 
-s_sub_i32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0xee,0x81]
+s_sub_i32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0xee,0x81]
 
-s_sub_i32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0xef,0x81]
+s_sub_i32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0xef,0x81]
 
-s_sub_i32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0xfb,0x81]
+s_sub_i32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0xfb,0x81]
 
-s_sub_i32 m0, s0, s0
-// CHECK: [0x00,0x00,0xfc,0x81]
+s_sub_i32 m0, s1, s2
+// CHECK: [0x01,0x02,0xfc,0x81]
 
-s_sub_i32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0xfe,0x81]
+s_sub_i32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0xfe,0x81]
 
-s_sub_i32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0xff,0x81]
+s_sub_i32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0xff,0x81]
 
-s_sub_i32 s0, s103, s0
-// CHECK: [0x67,0x00,0x80,0x81]
+s_sub_i32 s5, s103, s2
+// CHECK: [0x67,0x02,0x85,0x81]
 
-s_sub_i32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x80,0x81]
+s_sub_i32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x85,0x81]
 
-s_sub_i32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x80,0x81]
+s_sub_i32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x85,0x81]
 
-s_sub_i32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x80,0x81]
+s_sub_i32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x85,0x81]
 
-s_sub_i32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x80,0x81]
+s_sub_i32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x85,0x81]
 
-s_sub_i32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x80,0x81]
+s_sub_i32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x85,0x81]
 
-s_sub_i32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x80,0x81]
+s_sub_i32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x85,0x81]
 
-s_sub_i32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x80,0x81]
+s_sub_i32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x85,0x81]
 
-s_sub_i32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x80,0x81]
+s_sub_i32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x85,0x81]
 
-s_sub_i32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x80,0x81]
+s_sub_i32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x85,0x81]
 
-s_sub_i32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x80,0x81]
+s_sub_i32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x85,0x81]
 
-s_sub_i32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x80,0x81]
+s_sub_i32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x85,0x81]
 
-s_sub_i32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x80,0x81]
+s_sub_i32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x85,0x81]
 
-s_sub_i32 s0, 0, s0
-// CHECK: [0x80,0x00,0x80,0x81]
+s_sub_i32 s5, 0, s2
+// CHECK: [0x80,0x02,0x85,0x81]
 
-s_sub_i32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x80,0x81]
+s_sub_i32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x85,0x81]
 
-s_sub_i32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x81]
+s_sub_i32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x85,0x81]
 
-s_sub_i32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x81]
+s_sub_i32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x85,0x81]
 
-s_sub_i32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x81,0x56,0x34,0x12,0xaf]
+s_sub_i32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x85,0x81,0x56,0x34,0x12,0xaf]
 
-s_sub_i32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x81,0x73,0x72,0x71,0x3f]
+s_sub_i32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x85,0x81,0x73,0x72,0x71,0x3f]
 
-s_sub_i32 s0, s0, s103
-// CHECK: [0x00,0x67,0x80,0x81]
+s_sub_i32 s5, s1, s103
+// CHECK: [0x01,0x67,0x85,0x81]
 
-s_sub_i32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x80,0x81]
+s_sub_i32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x85,0x81]
 
-s_sub_i32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x80,0x81]
+s_sub_i32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x85,0x81]
 
-s_sub_i32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x81]
+s_sub_i32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x85,0x81]
 
-s_sub_i32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x81]
+s_sub_i32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x85,0x81]
 
-s_sub_i32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x80,0x81]
+s_sub_i32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x85,0x81]
 
-s_sub_i32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x80,0x81]
+s_sub_i32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x85,0x81]
 
-s_sub_i32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x80,0x81]
+s_sub_i32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x85,0x81]
 
-s_sub_i32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x80,0x81]
+s_sub_i32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x85,0x81]
 
-s_sub_i32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x80,0x81]
+s_sub_i32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x85,0x81]
 
-s_sub_i32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x80,0x81]
+s_sub_i32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x85,0x81]
 
-s_sub_i32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x80,0x81]
+s_sub_i32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x85,0x81]
 
-s_sub_i32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x80,0x81]
+s_sub_i32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x85,0x81]
 
-s_sub_i32 s0, s0, 0
-// CHECK: [0x00,0x80,0x80,0x81]
+s_sub_i32 s5, s1, 0
+// CHECK: [0x01,0x80,0x85,0x81]
 
-s_sub_i32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x80,0x81]
+s_sub_i32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x85,0x81]
 
-s_sub_i32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x80,0x81]
+s_sub_i32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x85,0x81]
 
-s_sub_i32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x80,0x81]
+s_sub_i32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x85,0x81]
 
-s_sub_i32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x81,0x56,0x34,0x12,0xaf]
+s_sub_i32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x85,0x81,0x56,0x34,0x12,0xaf]
 
-s_sub_i32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x81,0x73,0x72,0x71,0x3f]
+s_sub_i32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x85,0x81,0x73,0x72,0x71,0x3f]
 
-s_addc_u32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x82]
+s_addc_u32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x82]
 
-s_addc_u32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x82]
+s_addc_u32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x82]
 
-s_addc_u32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x82]
+s_addc_u32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x82]
 
-s_addc_u32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x82]
+s_addc_u32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x82]
 
-s_addc_u32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x82]
+s_addc_u32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x82]
 
-s_addc_u32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x82]
+s_addc_u32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x82]
 
-s_addc_u32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x82]
+s_addc_u32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x82]
 
-s_addc_u32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x82]
+s_addc_u32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x82]
 
-s_addc_u32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x82]
+s_addc_u32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x82]
 
-s_addc_u32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x82]
+s_addc_u32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x82]
 
-s_addc_u32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x82]
+s_addc_u32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x82]
 
-s_addc_u32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x82]
+s_addc_u32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x82]
 
-s_addc_u32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x82]
+s_addc_u32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x82]
 
-s_addc_u32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x82]
+s_addc_u32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x82]
 
-s_addc_u32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x82]
+s_addc_u32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x82]
 
-s_addc_u32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x82]
+s_addc_u32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x82]
 
-s_addc_u32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x82]
+s_addc_u32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x82]
 
-s_addc_u32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x82]
+s_addc_u32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x82]
 
-s_addc_u32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x82]
+s_addc_u32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x82]
 
-s_addc_u32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x82]
+s_addc_u32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x82]
 
-s_addc_u32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x82]
+s_addc_u32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x82]
 
-s_addc_u32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x82]
+s_addc_u32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x82]
 
-s_addc_u32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x82]
+s_addc_u32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x82]
 
-s_addc_u32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x82]
+s_addc_u32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x82]
 
-s_addc_u32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x82]
+s_addc_u32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x82]
 
-s_addc_u32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x82]
+s_addc_u32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x82]
 
-s_addc_u32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x82]
+s_addc_u32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x82]
 
-s_addc_u32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x82]
+s_addc_u32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x82]
 
-s_addc_u32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x82]
+s_addc_u32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x82]
 
-s_addc_u32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x82]
+s_addc_u32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x82]
 
-s_addc_u32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x82]
+s_addc_u32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x82]
 
-s_addc_u32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x82,0x56,0x34,0x12,0xaf]
+s_addc_u32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x82,0x56,0x34,0x12,0xaf]
 
-s_addc_u32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x82,0x73,0x72,0x71,0x3f]
+s_addc_u32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x82,0x73,0x72,0x71,0x3f]
 
-s_addc_u32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x82]
+s_addc_u32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x82]
 
-s_addc_u32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x82]
+s_addc_u32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x82]
 
-s_addc_u32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x82]
+s_addc_u32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x82]
 
-s_addc_u32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x82]
+s_addc_u32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x82]
 
-s_addc_u32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x82]
+s_addc_u32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x82]
 
-s_addc_u32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x82]
+s_addc_u32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x82]
 
-s_addc_u32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x82]
+s_addc_u32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x82]
 
-s_addc_u32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x82]
+s_addc_u32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x82]
 
-s_addc_u32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x82]
+s_addc_u32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x82]
 
-s_addc_u32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x82]
+s_addc_u32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x82]
 
-s_addc_u32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x82]
+s_addc_u32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x82]
 
-s_addc_u32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x82]
+s_addc_u32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x82]
 
-s_addc_u32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x82]
+s_addc_u32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x82]
 
-s_addc_u32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x82]
+s_addc_u32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x82]
 
-s_addc_u32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x82]
+s_addc_u32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x82]
 
-s_addc_u32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x82]
+s_addc_u32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x82]
 
-s_addc_u32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x82]
+s_addc_u32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x82]
 
-s_addc_u32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x82,0x56,0x34,0x12,0xaf]
+s_addc_u32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x82,0x56,0x34,0x12,0xaf]
 
-s_addc_u32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x82,0x73,0x72,0x71,0x3f]
+s_addc_u32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x82,0x73,0x72,0x71,0x3f]
 
-s_subb_u32 s0, s0, s0
-// CHECK: [0x00,0x00,0x80,0x82]
+s_subb_u32 s5, s1, s2
+// CHECK: [0x01,0x02,0x85,0x82]
 
-s_subb_u32 s103, s0, s0
-// CHECK: [0x00,0x00,0xe7,0x82]
+s_subb_u32 s103, s1, s2
+// CHECK: [0x01,0x02,0xe7,0x82]
 
-s_subb_u32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0xe8,0x82]
+s_subb_u32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0xe8,0x82]
 
-s_subb_u32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0xe9,0x82]
+s_subb_u32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0xe9,0x82]
 
-s_subb_u32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0xea,0x82]
+s_subb_u32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0xea,0x82]
 
-s_subb_u32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0xeb,0x82]
+s_subb_u32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0xeb,0x82]
 
-s_subb_u32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0xec,0x82]
+s_subb_u32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0xec,0x82]
 
-s_subb_u32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0xed,0x82]
+s_subb_u32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0xed,0x82]
 
-s_subb_u32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0xee,0x82]
+s_subb_u32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0xee,0x82]
 
-s_subb_u32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0xef,0x82]
+s_subb_u32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0xef,0x82]
 
-s_subb_u32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0xfb,0x82]
+s_subb_u32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0xfb,0x82]
 
-s_subb_u32 m0, s0, s0
-// CHECK: [0x00,0x00,0xfc,0x82]
+s_subb_u32 m0, s1, s2
+// CHECK: [0x01,0x02,0xfc,0x82]
 
-s_subb_u32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0xfe,0x82]
+s_subb_u32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0xfe,0x82]
 
-s_subb_u32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0xff,0x82]
+s_subb_u32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0xff,0x82]
 
-s_subb_u32 s0, s103, s0
-// CHECK: [0x67,0x00,0x80,0x82]
+s_subb_u32 s5, s103, s2
+// CHECK: [0x67,0x02,0x85,0x82]
 
-s_subb_u32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x80,0x82]
+s_subb_u32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x85,0x82]
 
-s_subb_u32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x80,0x82]
+s_subb_u32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x85,0x82]
 
-s_subb_u32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x80,0x82]
+s_subb_u32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x85,0x82]
 
-s_subb_u32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x80,0x82]
+s_subb_u32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x85,0x82]
 
-s_subb_u32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x80,0x82]
+s_subb_u32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x85,0x82]
 
-s_subb_u32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x80,0x82]
+s_subb_u32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x85,0x82]
 
-s_subb_u32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x80,0x82]
+s_subb_u32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x85,0x82]
 
-s_subb_u32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x80,0x82]
+s_subb_u32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x85,0x82]
 
-s_subb_u32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x80,0x82]
+s_subb_u32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x85,0x82]
 
-s_subb_u32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x80,0x82]
+s_subb_u32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x85,0x82]
 
-s_subb_u32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x80,0x82]
+s_subb_u32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x85,0x82]
 
-s_subb_u32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x80,0x82]
+s_subb_u32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x85,0x82]
 
-s_subb_u32 s0, 0, s0
-// CHECK: [0x80,0x00,0x80,0x82]
+s_subb_u32 s5, 0, s2
+// CHECK: [0x80,0x02,0x85,0x82]
 
-s_subb_u32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x80,0x82]
+s_subb_u32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x85,0x82]
 
-s_subb_u32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x82]
+s_subb_u32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x85,0x82]
 
-s_subb_u32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x82]
+s_subb_u32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x85,0x82]
 
-s_subb_u32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x82,0x56,0x34,0x12,0xaf]
+s_subb_u32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x85,0x82,0x56,0x34,0x12,0xaf]
 
-s_subb_u32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x82,0x73,0x72,0x71,0x3f]
+s_subb_u32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x85,0x82,0x73,0x72,0x71,0x3f]
 
-s_subb_u32 s0, s0, s103
-// CHECK: [0x00,0x67,0x80,0x82]
+s_subb_u32 s5, s1, s103
+// CHECK: [0x01,0x67,0x85,0x82]
 
-s_subb_u32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x80,0x82]
+s_subb_u32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x85,0x82]
 
-s_subb_u32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x80,0x82]
+s_subb_u32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x85,0x82]
 
-s_subb_u32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x82]
+s_subb_u32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x85,0x82]
 
-s_subb_u32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x82]
+s_subb_u32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x85,0x82]
 
-s_subb_u32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x80,0x82]
+s_subb_u32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x85,0x82]
 
-s_subb_u32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x80,0x82]
+s_subb_u32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x85,0x82]
 
-s_subb_u32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x80,0x82]
+s_subb_u32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x85,0x82]
 
-s_subb_u32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x80,0x82]
+s_subb_u32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x85,0x82]
 
-s_subb_u32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x80,0x82]
+s_subb_u32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x85,0x82]
 
-s_subb_u32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x80,0x82]
+s_subb_u32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x85,0x82]
 
-s_subb_u32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x80,0x82]
+s_subb_u32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x85,0x82]
 
-s_subb_u32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x80,0x82]
+s_subb_u32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x85,0x82]
 
-s_subb_u32 s0, s0, 0
-// CHECK: [0x00,0x80,0x80,0x82]
+s_subb_u32 s5, s1, 0
+// CHECK: [0x01,0x80,0x85,0x82]
 
-s_subb_u32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x80,0x82]
+s_subb_u32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x85,0x82]
 
-s_subb_u32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x80,0x82]
+s_subb_u32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x85,0x82]
 
-s_subb_u32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x80,0x82]
+s_subb_u32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x85,0x82]
 
-s_subb_u32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x82,0x56,0x34,0x12,0xaf]
+s_subb_u32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x85,0x82,0x56,0x34,0x12,0xaf]
 
-s_subb_u32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x82,0x73,0x72,0x71,0x3f]
+s_subb_u32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x85,0x82,0x73,0x72,0x71,0x3f]
 
-s_min_i32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x83]
+s_min_i32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x83]
 
-s_min_i32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x83]
+s_min_i32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x83]
 
-s_min_i32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x83]
+s_min_i32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x83]
 
-s_min_i32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x83]
+s_min_i32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x83]
 
-s_min_i32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x83]
+s_min_i32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x83]
 
-s_min_i32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x83]
+s_min_i32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x83]
 
-s_min_i32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x83]
+s_min_i32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x83]
 
-s_min_i32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x83]
+s_min_i32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x83]
 
-s_min_i32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x83]
+s_min_i32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x83]
 
-s_min_i32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x83]
+s_min_i32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x83]
 
-s_min_i32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x83]
+s_min_i32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x83]
 
-s_min_i32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x83]
+s_min_i32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x83]
 
-s_min_i32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x83]
+s_min_i32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x83]
 
-s_min_i32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x83]
+s_min_i32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x83]
 
-s_min_i32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x83]
+s_min_i32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x83]
 
-s_min_i32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x83]
+s_min_i32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x83]
 
-s_min_i32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x83]
+s_min_i32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x83]
 
-s_min_i32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x83]
+s_min_i32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x83]
 
-s_min_i32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x83]
+s_min_i32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x83]
 
-s_min_i32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x83]
+s_min_i32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x83]
 
-s_min_i32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x83]
+s_min_i32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x83]
 
-s_min_i32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x83]
+s_min_i32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x83]
 
-s_min_i32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x83]
+s_min_i32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x83]
 
-s_min_i32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x83]
+s_min_i32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x83]
 
-s_min_i32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x83]
+s_min_i32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x83]
 
-s_min_i32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x83]
+s_min_i32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x83]
 
-s_min_i32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x83]
+s_min_i32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x83]
 
-s_min_i32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x83]
+s_min_i32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x83]
 
-s_min_i32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x83]
+s_min_i32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x83]
 
-s_min_i32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x83]
+s_min_i32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x83]
 
-s_min_i32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x83]
+s_min_i32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x83]
 
-s_min_i32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x83,0x56,0x34,0x12,0xaf]
+s_min_i32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x83,0x56,0x34,0x12,0xaf]
 
-s_min_i32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x83,0x73,0x72,0x71,0x3f]
+s_min_i32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x83,0x73,0x72,0x71,0x3f]
 
-s_min_i32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x83]
+s_min_i32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x83]
 
-s_min_i32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x83]
+s_min_i32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x83]
 
-s_min_i32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x83]
+s_min_i32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x83]
 
-s_min_i32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x83]
+s_min_i32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x83]
 
-s_min_i32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x83]
+s_min_i32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x83]
 
-s_min_i32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x83]
+s_min_i32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x83]
 
-s_min_i32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x83]
+s_min_i32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x83]
 
-s_min_i32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x83]
+s_min_i32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x83]
 
-s_min_i32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x83]
+s_min_i32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x83]
 
-s_min_i32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x83]
+s_min_i32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x83]
 
-s_min_i32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x83]
+s_min_i32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x83]
 
-s_min_i32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x83]
+s_min_i32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x83]
 
-s_min_i32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x83]
+s_min_i32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x83]
 
-s_min_i32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x83]
+s_min_i32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x83]
 
-s_min_i32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x83]
+s_min_i32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x83]
 
-s_min_i32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x83]
+s_min_i32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x83]
 
-s_min_i32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x83]
+s_min_i32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x83]
 
-s_min_i32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x83,0x56,0x34,0x12,0xaf]
+s_min_i32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x83,0x56,0x34,0x12,0xaf]
 
-s_min_i32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x83,0x73,0x72,0x71,0x3f]
+s_min_i32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x83,0x73,0x72,0x71,0x3f]
 
-s_min_u32 s0, s0, s0
-// CHECK: [0x00,0x00,0x80,0x83]
+s_min_u32 s5, s1, s2
+// CHECK: [0x01,0x02,0x85,0x83]
 
-s_min_u32 s103, s0, s0
-// CHECK: [0x00,0x00,0xe7,0x83]
+s_min_u32 s103, s1, s2
+// CHECK: [0x01,0x02,0xe7,0x83]
 
-s_min_u32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0xe8,0x83]
+s_min_u32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0xe8,0x83]
 
-s_min_u32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0xe9,0x83]
+s_min_u32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0xe9,0x83]
 
-s_min_u32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0xea,0x83]
+s_min_u32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0xea,0x83]
 
-s_min_u32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0xeb,0x83]
+s_min_u32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0xeb,0x83]
 
-s_min_u32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0xec,0x83]
+s_min_u32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0xec,0x83]
 
-s_min_u32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0xed,0x83]
+s_min_u32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0xed,0x83]
 
-s_min_u32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0xee,0x83]
+s_min_u32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0xee,0x83]
 
-s_min_u32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0xef,0x83]
+s_min_u32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0xef,0x83]
 
-s_min_u32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0xfb,0x83]
+s_min_u32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0xfb,0x83]
 
-s_min_u32 m0, s0, s0
-// CHECK: [0x00,0x00,0xfc,0x83]
+s_min_u32 m0, s1, s2
+// CHECK: [0x01,0x02,0xfc,0x83]
 
-s_min_u32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0xfe,0x83]
+s_min_u32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0xfe,0x83]
 
-s_min_u32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0xff,0x83]
+s_min_u32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0xff,0x83]
 
-s_min_u32 s0, s103, s0
-// CHECK: [0x67,0x00,0x80,0x83]
+s_min_u32 s5, s103, s2
+// CHECK: [0x67,0x02,0x85,0x83]
 
-s_min_u32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x80,0x83]
+s_min_u32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x85,0x83]
 
-s_min_u32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x80,0x83]
+s_min_u32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x85,0x83]
 
-s_min_u32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x80,0x83]
+s_min_u32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x85,0x83]
 
-s_min_u32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x80,0x83]
+s_min_u32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x85,0x83]
 
-s_min_u32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x80,0x83]
+s_min_u32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x85,0x83]
 
-s_min_u32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x80,0x83]
+s_min_u32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x85,0x83]
 
-s_min_u32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x80,0x83]
+s_min_u32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x85,0x83]
 
-s_min_u32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x80,0x83]
+s_min_u32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x85,0x83]
 
-s_min_u32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x80,0x83]
+s_min_u32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x85,0x83]
 
-s_min_u32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x80,0x83]
+s_min_u32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x85,0x83]
 
-s_min_u32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x80,0x83]
+s_min_u32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x85,0x83]
 
-s_min_u32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x80,0x83]
+s_min_u32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x85,0x83]
 
-s_min_u32 s0, 0, s0
-// CHECK: [0x80,0x00,0x80,0x83]
+s_min_u32 s5, 0, s2
+// CHECK: [0x80,0x02,0x85,0x83]
 
-s_min_u32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x80,0x83]
+s_min_u32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x85,0x83]
 
-s_min_u32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x83]
+s_min_u32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x85,0x83]
 
-s_min_u32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x83]
+s_min_u32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x85,0x83]
 
-s_min_u32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x83,0x56,0x34,0x12,0xaf]
+s_min_u32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x85,0x83,0x56,0x34,0x12,0xaf]
 
-s_min_u32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x83,0x73,0x72,0x71,0x3f]
+s_min_u32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x85,0x83,0x73,0x72,0x71,0x3f]
 
-s_min_u32 s0, s0, s103
-// CHECK: [0x00,0x67,0x80,0x83]
+s_min_u32 s5, s1, s103
+// CHECK: [0x01,0x67,0x85,0x83]
 
-s_min_u32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x80,0x83]
+s_min_u32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x85,0x83]
 
-s_min_u32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x80,0x83]
+s_min_u32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x85,0x83]
 
-s_min_u32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x83]
+s_min_u32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x85,0x83]
 
-s_min_u32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x83]
+s_min_u32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x85,0x83]
 
-s_min_u32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x80,0x83]
+s_min_u32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x85,0x83]
 
-s_min_u32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x80,0x83]
+s_min_u32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x85,0x83]
 
-s_min_u32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x80,0x83]
+s_min_u32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x85,0x83]
 
-s_min_u32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x80,0x83]
+s_min_u32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x85,0x83]
 
-s_min_u32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x80,0x83]
+s_min_u32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x85,0x83]
 
-s_min_u32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x80,0x83]
+s_min_u32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x85,0x83]
 
-s_min_u32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x80,0x83]
+s_min_u32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x85,0x83]
 
-s_min_u32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x80,0x83]
+s_min_u32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x85,0x83]
 
-s_min_u32 s0, s0, 0
-// CHECK: [0x00,0x80,0x80,0x83]
+s_min_u32 s5, s1, 0
+// CHECK: [0x01,0x80,0x85,0x83]
 
-s_min_u32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x80,0x83]
+s_min_u32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x85,0x83]
 
-s_min_u32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x80,0x83]
+s_min_u32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x85,0x83]
 
-s_min_u32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x80,0x83]
+s_min_u32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x85,0x83]
 
-s_min_u32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x83,0x56,0x34,0x12,0xaf]
+s_min_u32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x85,0x83,0x56,0x34,0x12,0xaf]
 
-s_min_u32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x83,0x73,0x72,0x71,0x3f]
+s_min_u32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x85,0x83,0x73,0x72,0x71,0x3f]
 
-s_max_i32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x84]
+s_max_i32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x84]
 
-s_max_i32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x84]
+s_max_i32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x84]
 
-s_max_i32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x84]
+s_max_i32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x84]
 
-s_max_i32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x84]
+s_max_i32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x84]
 
-s_max_i32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x84]
+s_max_i32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x84]
 
-s_max_i32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x84]
+s_max_i32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x84]
 
-s_max_i32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x84]
+s_max_i32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x84]
 
-s_max_i32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x84]
+s_max_i32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x84]
 
-s_max_i32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x84]
+s_max_i32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x84]
 
-s_max_i32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x84]
+s_max_i32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x84]
 
-s_max_i32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x84]
+s_max_i32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x84]
 
-s_max_i32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x84]
+s_max_i32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x84]
 
-s_max_i32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x84]
+s_max_i32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x84]
 
-s_max_i32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x84]
+s_max_i32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x84]
 
-s_max_i32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x84]
+s_max_i32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x84]
 
-s_max_i32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x84]
+s_max_i32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x84]
 
-s_max_i32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x84]
+s_max_i32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x84]
 
-s_max_i32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x84]
+s_max_i32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x84]
 
-s_max_i32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x84]
+s_max_i32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x84]
 
-s_max_i32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x84]
+s_max_i32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x84]
 
-s_max_i32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x84]
+s_max_i32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x84]
 
-s_max_i32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x84]
+s_max_i32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x84]
 
-s_max_i32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x84]
+s_max_i32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x84]
 
-s_max_i32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x84]
+s_max_i32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x84]
 
-s_max_i32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x84]
+s_max_i32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x84]
 
-s_max_i32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x84]
+s_max_i32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x84]
 
-s_max_i32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x84]
+s_max_i32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x84]
 
-s_max_i32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x84]
+s_max_i32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x84]
 
-s_max_i32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x84]
+s_max_i32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x84]
 
-s_max_i32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x84]
+s_max_i32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x84]
 
-s_max_i32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x84]
+s_max_i32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x84]
 
-s_max_i32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x84,0x56,0x34,0x12,0xaf]
+s_max_i32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x84,0x56,0x34,0x12,0xaf]
 
-s_max_i32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x84,0x73,0x72,0x71,0x3f]
+s_max_i32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x84,0x73,0x72,0x71,0x3f]
 
-s_max_i32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x84]
+s_max_i32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x84]
 
-s_max_i32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x84]
+s_max_i32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x84]
 
-s_max_i32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x84]
+s_max_i32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x84]
 
-s_max_i32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x84]
+s_max_i32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x84]
 
-s_max_i32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x84]
+s_max_i32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x84]
 
-s_max_i32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x84]
+s_max_i32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x84]
 
-s_max_i32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x84]
+s_max_i32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x84]
 
-s_max_i32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x84]
+s_max_i32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x84]
 
-s_max_i32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x84]
+s_max_i32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x84]
 
-s_max_i32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x84]
+s_max_i32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x84]
 
-s_max_i32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x84]
+s_max_i32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x84]
 
-s_max_i32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x84]
+s_max_i32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x84]
 
-s_max_i32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x84]
+s_max_i32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x84]
 
-s_max_i32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x84]
+s_max_i32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x84]
 
-s_max_i32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x84]
+s_max_i32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x84]
 
-s_max_i32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x84]
+s_max_i32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x84]
 
-s_max_i32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x84]
+s_max_i32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x84]
 
-s_max_i32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x84,0x56,0x34,0x12,0xaf]
+s_max_i32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x84,0x56,0x34,0x12,0xaf]
 
-s_max_i32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x84,0x73,0x72,0x71,0x3f]
+s_max_i32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x84,0x73,0x72,0x71,0x3f]
 
-s_max_u32 s0, s0, s0
-// CHECK: [0x00,0x00,0x80,0x84]
+s_max_u32 s5, s1, s2
+// CHECK: [0x01,0x02,0x85,0x84]
 
-s_max_u32 s103, s0, s0
-// CHECK: [0x00,0x00,0xe7,0x84]
+s_max_u32 s103, s1, s2
+// CHECK: [0x01,0x02,0xe7,0x84]
 
-s_max_u32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0xe8,0x84]
+s_max_u32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0xe8,0x84]
 
-s_max_u32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0xe9,0x84]
+s_max_u32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0xe9,0x84]
 
-s_max_u32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0xea,0x84]
+s_max_u32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0xea,0x84]
 
-s_max_u32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0xeb,0x84]
+s_max_u32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0xeb,0x84]
 
-s_max_u32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0xec,0x84]
+s_max_u32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0xec,0x84]
 
-s_max_u32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0xed,0x84]
+s_max_u32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0xed,0x84]
 
-s_max_u32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0xee,0x84]
+s_max_u32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0xee,0x84]
 
-s_max_u32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0xef,0x84]
+s_max_u32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0xef,0x84]
 
-s_max_u32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0xfb,0x84]
+s_max_u32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0xfb,0x84]
 
-s_max_u32 m0, s0, s0
-// CHECK: [0x00,0x00,0xfc,0x84]
+s_max_u32 m0, s1, s2
+// CHECK: [0x01,0x02,0xfc,0x84]
 
-s_max_u32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0xfe,0x84]
+s_max_u32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0xfe,0x84]
 
-s_max_u32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0xff,0x84]
+s_max_u32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0xff,0x84]
 
-s_max_u32 s0, s103, s0
-// CHECK: [0x67,0x00,0x80,0x84]
+s_max_u32 s5, s103, s2
+// CHECK: [0x67,0x02,0x85,0x84]
 
-s_max_u32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x80,0x84]
+s_max_u32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x85,0x84]
 
-s_max_u32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x80,0x84]
+s_max_u32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x85,0x84]
 
-s_max_u32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x80,0x84]
+s_max_u32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x85,0x84]
 
-s_max_u32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x80,0x84]
+s_max_u32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x85,0x84]
 
-s_max_u32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x80,0x84]
+s_max_u32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x85,0x84]
 
-s_max_u32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x80,0x84]
+s_max_u32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x85,0x84]
 
-s_max_u32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x80,0x84]
+s_max_u32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x85,0x84]
 
-s_max_u32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x80,0x84]
+s_max_u32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x85,0x84]
 
-s_max_u32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x80,0x84]
+s_max_u32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x85,0x84]
 
-s_max_u32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x80,0x84]
+s_max_u32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x85,0x84]
 
-s_max_u32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x80,0x84]
+s_max_u32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x85,0x84]
 
-s_max_u32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x80,0x84]
+s_max_u32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x85,0x84]
 
-s_max_u32 s0, 0, s0
-// CHECK: [0x80,0x00,0x80,0x84]
+s_max_u32 s5, 0, s2
+// CHECK: [0x80,0x02,0x85,0x84]
 
-s_max_u32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x80,0x84]
+s_max_u32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x85,0x84]
 
-s_max_u32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x84]
+s_max_u32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x85,0x84]
 
-s_max_u32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x84]
+s_max_u32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x85,0x84]
 
-s_max_u32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x84,0x56,0x34,0x12,0xaf]
+s_max_u32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x85,0x84,0x56,0x34,0x12,0xaf]
 
-s_max_u32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x84,0x73,0x72,0x71,0x3f]
+s_max_u32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x85,0x84,0x73,0x72,0x71,0x3f]
 
-s_max_u32 s0, s0, s103
-// CHECK: [0x00,0x67,0x80,0x84]
+s_max_u32 s5, s1, s103
+// CHECK: [0x01,0x67,0x85,0x84]
 
-s_max_u32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x80,0x84]
+s_max_u32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x85,0x84]
 
-s_max_u32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x80,0x84]
+s_max_u32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x85,0x84]
 
-s_max_u32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x84]
+s_max_u32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x85,0x84]
 
-s_max_u32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x84]
+s_max_u32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x85,0x84]
 
-s_max_u32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x80,0x84]
+s_max_u32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x85,0x84]
 
-s_max_u32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x80,0x84]
+s_max_u32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x85,0x84]
 
-s_max_u32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x80,0x84]
+s_max_u32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x85,0x84]
 
-s_max_u32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x80,0x84]
+s_max_u32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x85,0x84]
 
-s_max_u32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x80,0x84]
+s_max_u32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x85,0x84]
 
-s_max_u32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x80,0x84]
+s_max_u32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x85,0x84]
 
-s_max_u32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x80,0x84]
+s_max_u32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x85,0x84]
 
-s_max_u32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x80,0x84]
+s_max_u32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x85,0x84]
 
-s_max_u32 s0, s0, 0
-// CHECK: [0x00,0x80,0x80,0x84]
+s_max_u32 s5, s1, 0
+// CHECK: [0x01,0x80,0x85,0x84]
 
-s_max_u32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x80,0x84]
+s_max_u32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x85,0x84]
 
-s_max_u32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x80,0x84]
+s_max_u32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x85,0x84]
 
-s_max_u32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x80,0x84]
+s_max_u32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x85,0x84]
 
-s_max_u32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x84,0x56,0x34,0x12,0xaf]
+s_max_u32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x85,0x84,0x56,0x34,0x12,0xaf]
 
-s_max_u32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x84,0x73,0x72,0x71,0x3f]
+s_max_u32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x85,0x84,0x73,0x72,0x71,0x3f]
 
-s_cselect_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x85]
+s_cselect_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x85]
 
-s_cselect_b32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x85]
+s_cselect_b32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x85]
 
-s_cselect_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x85]
+s_cselect_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x85]
 
-s_cselect_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x85]
+s_cselect_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x85]
 
-s_cselect_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x85]
+s_cselect_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x85]
 
-s_cselect_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x85]
+s_cselect_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x85]
 
-s_cselect_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x85]
+s_cselect_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x85]
 
-s_cselect_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x85]
+s_cselect_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x85]
 
-s_cselect_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x85]
+s_cselect_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x85]
 
-s_cselect_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x85]
+s_cselect_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x85]
 
-s_cselect_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x85]
+s_cselect_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x85]
 
-s_cselect_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x85]
+s_cselect_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x85]
 
-s_cselect_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x85]
+s_cselect_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x85]
 
-s_cselect_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x85]
+s_cselect_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x85]
 
-s_cselect_b32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x85]
+s_cselect_b32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x85]
 
-s_cselect_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x85]
+s_cselect_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x85]
 
-s_cselect_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x85]
+s_cselect_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x85]
 
-s_cselect_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x85]
+s_cselect_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x85]
 
-s_cselect_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x85]
+s_cselect_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x85]
 
-s_cselect_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x85]
+s_cselect_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x85]
 
-s_cselect_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x85]
+s_cselect_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x85]
 
-s_cselect_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x85]
+s_cselect_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x85]
 
-s_cselect_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x85]
+s_cselect_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x85]
 
-s_cselect_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x85]
+s_cselect_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x85]
 
-s_cselect_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x85]
+s_cselect_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x85]
 
-s_cselect_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x85]
+s_cselect_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x85]
 
-s_cselect_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x85]
+s_cselect_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x85]
 
-s_cselect_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x85]
+s_cselect_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x85]
 
-s_cselect_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x85]
+s_cselect_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x85]
 
-s_cselect_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x85]
+s_cselect_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x85]
 
-s_cselect_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x85]
+s_cselect_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x85]
 
-s_cselect_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x85,0x56,0x34,0x12,0xaf]
+s_cselect_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x85,0x56,0x34,0x12,0xaf]
 
-s_cselect_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x85,0x73,0x72,0x71,0x3f]
+s_cselect_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x85,0x73,0x72,0x71,0x3f]
 
-s_cselect_b32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x85]
+s_cselect_b32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x85]
 
-s_cselect_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x85]
+s_cselect_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x85]
 
-s_cselect_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x85]
+s_cselect_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x85]
 
-s_cselect_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x85]
+s_cselect_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x85]
 
-s_cselect_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x85]
+s_cselect_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x85]
 
-s_cselect_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x85]
+s_cselect_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x85]
 
-s_cselect_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x85]
+s_cselect_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x85]
 
-s_cselect_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x85]
+s_cselect_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x85]
 
-s_cselect_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x85]
+s_cselect_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x85]
 
-s_cselect_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x85]
+s_cselect_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x85]
 
-s_cselect_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x85]
+s_cselect_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x85]
 
-s_cselect_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x85]
+s_cselect_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x85]
 
-s_cselect_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x85]
+s_cselect_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x85]
 
-s_cselect_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x85]
+s_cselect_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x85]
 
-s_cselect_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x85]
+s_cselect_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x85]
 
-s_cselect_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x85]
+s_cselect_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x85]
 
-s_cselect_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x85]
+s_cselect_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x85]
 
-s_cselect_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x85,0x56,0x34,0x12,0xaf]
+s_cselect_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x85,0x56,0x34,0x12,0xaf]
 
-s_cselect_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x85,0x73,0x72,0x71,0x3f]
+s_cselect_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x85,0x73,0x72,0x71,0x3f]
 
-s_cselect_b64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8a,0x85]
 
-s_cselect_b64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0x85]
+s_cselect_b64 s[12:13], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8c,0x85]
 
-s_cselect_b64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0x85]
+s_cselect_b64 s[102:103], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe6,0x85]
 
-s_cselect_b64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe8,0x85]
+s_cselect_b64 flat_scratch, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe8,0x85]
 
-s_cselect_b64 vcc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0x85]
+s_cselect_b64 vcc, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xea,0x85]
 
-s_cselect_b64 tba, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0x85]
+s_cselect_b64 tba, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xec,0x85]
 
-s_cselect_b64 tma, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0x85]
+s_cselect_b64 tma, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xee,0x85]
 
-s_cselect_b64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0x85]
+s_cselect_b64 ttmp[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfa,0x85]
 
-s_cselect_b64 exec, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0x85]
+s_cselect_b64 exec, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfe,0x85]
 
-s_cselect_b64 s[0:1], s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[102:103], s[0:1]
-// CHECK: [0x66,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], s[102:103], s[4:5]
+// CHECK: [0x66,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], flat_scratch, s[0:1]
-// CHECK: [0x68,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], flat_scratch, s[4:5]
+// CHECK: [0x68,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], tba, s[4:5]
+// CHECK: [0x6c,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], tma, s[4:5]
+// CHECK: [0x6e,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], exec, s[4:5]
+// CHECK: [0x7e,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], 0, s[0:1]
-// CHECK: [0x80,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], 0, s[4:5]
+// CHECK: [0x80,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], -1, s[0:1]
-// CHECK: [0xc1,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], -1, s[4:5]
+// CHECK: [0xc1,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x85,0x56,0x34,0x12,0xaf]
+s_cselect_b64 s[10:11], 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x85,0x56,0x34,0x12,0xaf]
 
-s_cselect_b64 s[0:1], 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x85,0x73,0x72,0x71,0x3f]
+s_cselect_b64 s[10:11], 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x85,0x73,0x72,0x71,0x3f]
 
-s_cselect_b64 s[0:1], s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], s[102:103]
-// CHECK: [0x00,0x66,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], s[102:103]
+// CHECK: [0x02,0x66,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], flat_scratch
-// CHECK: [0x00,0x68,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], flat_scratch
+// CHECK: [0x02,0x68,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], vcc
+// CHECK: [0x02,0x6a,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], tba
+// CHECK: [0x02,0x6c,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], tma
+// CHECK: [0x02,0x6e,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], exec
+// CHECK: [0x02,0x7e,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x85,0x56,0x34,0x12,0xaf]
+s_cselect_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x85,0x56,0x34,0x12,0xaf]
 
-s_cselect_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x85,0x73,0x72,0x71,0x3f]
+s_cselect_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x85,0x73,0x72,0x71,0x3f]
 
-s_and_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x87]
+s_and_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x87]
 
-s_and_b32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x87]
+s_and_b32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x87]
 
-s_and_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x87]
+s_and_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x87]
 
-s_and_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x87]
+s_and_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x87]
 
-s_and_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x87]
+s_and_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x87]
 
-s_and_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x87]
+s_and_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x87]
 
-s_and_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x87]
+s_and_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x87]
 
-s_and_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x87]
+s_and_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x87]
 
-s_and_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x87]
+s_and_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x87]
 
-s_and_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x87]
+s_and_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x87]
 
-s_and_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x87]
+s_and_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x87]
 
-s_and_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x87]
+s_and_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x87]
 
-s_and_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x87]
+s_and_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x87]
 
-s_and_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x87]
+s_and_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x87]
 
-s_and_b32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x87]
+s_and_b32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x87]
 
-s_and_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x87]
+s_and_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x87]
 
-s_and_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x87]
+s_and_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x87]
 
-s_and_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x87]
+s_and_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x87]
 
-s_and_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x87]
+s_and_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x87]
 
-s_and_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x87]
+s_and_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x87]
 
-s_and_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x87]
+s_and_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x87]
 
-s_and_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x87]
+s_and_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x87]
 
-s_and_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x87]
+s_and_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x87]
 
-s_and_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x87]
+s_and_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x87]
 
-s_and_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x87]
+s_and_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x87]
 
-s_and_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x87]
+s_and_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x87]
 
-s_and_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x87]
+s_and_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x87]
 
-s_and_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x87]
+s_and_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x87]
 
-s_and_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x87]
+s_and_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x87]
 
-s_and_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x87]
+s_and_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x87]
 
-s_and_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x87]
+s_and_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x87]
 
-s_and_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x87,0x56,0x34,0x12,0xaf]
+s_and_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x87,0x56,0x34,0x12,0xaf]
 
-s_and_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x87,0x73,0x72,0x71,0x3f]
+s_and_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x87,0x73,0x72,0x71,0x3f]
 
-s_and_b32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x87]
+s_and_b32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x87]
 
-s_and_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x87]
+s_and_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x87]
 
-s_and_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x87]
+s_and_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x87]
 
-s_and_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x87]
+s_and_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x87]
 
-s_and_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x87]
+s_and_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x87]
 
-s_and_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x87]
+s_and_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x87]
 
-s_and_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x87]
+s_and_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x87]
 
-s_and_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x87]
+s_and_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x87]
 
-s_and_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x87]
+s_and_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x87]
 
-s_and_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x87]
+s_and_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x87]
 
-s_and_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x87]
+s_and_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x87]
 
-s_and_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x87]
+s_and_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x87]
 
-s_and_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x87]
+s_and_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x87]
 
-s_and_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x87]
+s_and_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x87]
 
-s_and_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x87]
+s_and_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x87]
 
-s_and_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x87]
+s_and_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x87]
 
-s_and_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x87]
+s_and_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x87]
 
-s_and_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x87,0x56,0x34,0x12,0xaf]
+s_and_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x87,0x56,0x34,0x12,0xaf]
 
-s_and_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x87,0x73,0x72,0x71,0x3f]
+s_and_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x87,0x73,0x72,0x71,0x3f]
 
-s_and_b64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x87]
+s_and_b64 s[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8a,0x87]
 
-s_and_b64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0x87]
+s_and_b64 s[12:13], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8c,0x87]
 
-s_and_b64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0x87]
+s_and_b64 s[102:103], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe6,0x87]
 
-s_and_b64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe8,0x87]
+s_and_b64 flat_scratch, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe8,0x87]
 
-s_and_b64 vcc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0x87]
+s_and_b64 vcc, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xea,0x87]
 
-s_and_b64 tba, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0x87]
+s_and_b64 tba, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xec,0x87]
 
-s_and_b64 tma, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0x87]
+s_and_b64 tma, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xee,0x87]
 
-s_and_b64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0x87]
+s_and_b64 ttmp[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfa,0x87]
 
-s_and_b64 exec, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0x87]
+s_and_b64 exec, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfe,0x87]
 
-s_and_b64 s[0:1], s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x87]
+s_and_b64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x8a,0x87]
 
-s_and_b64 s[0:1], s[102:103], s[0:1]
-// CHECK: [0x66,0x00,0x80,0x87]
+s_and_b64 s[10:11], s[102:103], s[4:5]
+// CHECK: [0x66,0x04,0x8a,0x87]
 
-s_and_b64 s[0:1], flat_scratch, s[0:1]
-// CHECK: [0x68,0x00,0x80,0x87]
+s_and_b64 s[10:11], flat_scratch, s[4:5]
+// CHECK: [0x68,0x04,0x8a,0x87]
 
-s_and_b64 s[0:1], vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x87]
+s_and_b64 s[10:11], vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x8a,0x87]
 
-s_and_b64 s[0:1], tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x87]
+s_and_b64 s[10:11], tba, s[4:5]
+// CHECK: [0x6c,0x04,0x8a,0x87]
 
-s_and_b64 s[0:1], tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x87]
+s_and_b64 s[10:11], tma, s[4:5]
+// CHECK: [0x6e,0x04,0x8a,0x87]
 
-s_and_b64 s[0:1], ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x87]
+s_and_b64 s[10:11], ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x8a,0x87]
 
-s_and_b64 s[0:1], exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x87]
+s_and_b64 s[10:11], exec, s[4:5]
+// CHECK: [0x7e,0x04,0x8a,0x87]
 
-s_and_b64 s[0:1], 0, s[0:1]
-// CHECK: [0x80,0x00,0x80,0x87]
+s_and_b64 s[10:11], 0, s[4:5]
+// CHECK: [0x80,0x04,0x8a,0x87]
 
-s_and_b64 s[0:1], -1, s[0:1]
-// CHECK: [0xc1,0x00,0x80,0x87]
+s_and_b64 s[10:11], -1, s[4:5]
+// CHECK: [0xc1,0x04,0x8a,0x87]
 
-s_and_b64 s[0:1], 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x80,0x87]
+s_and_b64 s[10:11], 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x8a,0x87]
 
-s_and_b64 s[0:1], -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x80,0x87]
+s_and_b64 s[10:11], -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x8a,0x87]
 
-s_and_b64 s[0:1], 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x87,0x56,0x34,0x12,0xaf]
+s_and_b64 s[10:11], 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x87,0x56,0x34,0x12,0xaf]
 
-s_and_b64 s[0:1], 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x87,0x73,0x72,0x71,0x3f]
+s_and_b64 s[10:11], 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x87,0x73,0x72,0x71,0x3f]
 
-s_and_b64 s[0:1], s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x87]
+s_and_b64 s[10:11], s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x8a,0x87]
 
-s_and_b64 s[0:1], s[0:1], s[102:103]
-// CHECK: [0x00,0x66,0x80,0x87]
+s_and_b64 s[10:11], s[2:3], s[102:103]
+// CHECK: [0x02,0x66,0x8a,0x87]
 
-s_and_b64 s[0:1], s[0:1], flat_scratch
-// CHECK: [0x00,0x68,0x80,0x87]
+s_and_b64 s[10:11], s[2:3], flat_scratch
+// CHECK: [0x02,0x68,0x8a,0x87]
 
-s_and_b64 s[0:1], s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x87]
+s_and_b64 s[10:11], s[2:3], vcc
+// CHECK: [0x02,0x6a,0x8a,0x87]
 
-s_and_b64 s[0:1], s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x87]
+s_and_b64 s[10:11], s[2:3], tba
+// CHECK: [0x02,0x6c,0x8a,0x87]
 
-s_and_b64 s[0:1], s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x87]
+s_and_b64 s[10:11], s[2:3], tma
+// CHECK: [0x02,0x6e,0x8a,0x87]
 
-s_and_b64 s[0:1], s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x87]
+s_and_b64 s[10:11], s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x8a,0x87]
 
-s_and_b64 s[0:1], s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x87]
+s_and_b64 s[10:11], s[2:3], exec
+// CHECK: [0x02,0x7e,0x8a,0x87]
 
-s_and_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x87]
+s_and_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x87]
 
-s_and_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x87]
+s_and_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x87]
 
-s_and_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x87]
+s_and_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x87]
 
-s_and_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x87]
+s_and_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x87]
 
-s_and_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x87,0x56,0x34,0x12,0xaf]
+s_and_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x87,0x56,0x34,0x12,0xaf]
 
-s_and_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x87,0x73,0x72,0x71,0x3f]
+s_and_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x87,0x73,0x72,0x71,0x3f]
 
-s_or_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x88]
+s_or_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x88]
 
-s_or_b32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x88]
+s_or_b32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x88]
 
-s_or_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x88]
+s_or_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x88]
 
-s_or_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x88]
+s_or_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x88]
 
-s_or_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x88]
+s_or_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x88]
 
-s_or_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x88]
+s_or_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x88]
 
-s_or_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x88]
+s_or_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x88]
 
-s_or_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x88]
+s_or_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x88]
 
-s_or_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x88]
+s_or_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x88]
 
-s_or_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x88]
+s_or_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x88]
 
-s_or_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x88]
+s_or_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x88]
 
-s_or_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x88]
+s_or_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x88]
 
-s_or_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x88]
+s_or_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x88]
 
-s_or_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x88]
+s_or_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x88]
 
-s_or_b32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x88]
+s_or_b32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x88]
 
-s_or_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x88]
+s_or_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x88]
 
-s_or_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x88]
+s_or_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x88]
 
-s_or_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x88]
+s_or_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x88]
 
-s_or_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x88]
+s_or_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x88]
 
-s_or_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x88]
+s_or_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x88]
 
-s_or_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x88]
+s_or_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x88]
 
-s_or_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x88]
+s_or_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x88]
 
-s_or_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x88]
+s_or_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x88]
 
-s_or_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x88]
+s_or_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x88]
 
-s_or_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x88]
+s_or_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x88]
 
-s_or_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x88]
+s_or_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x88]
 
-s_or_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x88]
+s_or_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x88]
 
-s_or_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x88]
+s_or_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x88]
 
-s_or_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x88]
+s_or_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x88]
 
-s_or_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x88]
+s_or_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x88]
 
-s_or_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x88]
+s_or_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x88]
 
-s_or_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x88,0x56,0x34,0x12,0xaf]
+s_or_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x88,0x56,0x34,0x12,0xaf]
 
-s_or_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x88,0x73,0x72,0x71,0x3f]
+s_or_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x88,0x73,0x72,0x71,0x3f]
 
-s_or_b32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x88]
+s_or_b32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x88]
 
-s_or_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x88]
+s_or_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x88]
 
-s_or_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x88]
+s_or_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x88]
 
-s_or_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x88]
+s_or_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x88]
 
-s_or_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x88]
+s_or_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x88]
 
-s_or_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x88]
+s_or_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x88]
 
-s_or_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x88]
+s_or_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x88]
 
-s_or_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x88]
+s_or_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x88]
 
-s_or_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x88]
+s_or_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x88]
 
-s_or_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x88]
+s_or_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x88]
 
-s_or_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x88]
+s_or_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x88]
 
-s_or_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x88]
+s_or_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x88]
 
-s_or_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x88]
+s_or_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x88]
 
-s_or_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x88]
+s_or_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x88]
 
-s_or_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x88]
+s_or_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x88]
 
-s_or_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x88]
+s_or_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x88]
 
-s_or_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x88]
+s_or_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x88]
 
-s_or_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x88,0x56,0x34,0x12,0xaf]
+s_or_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x88,0x56,0x34,0x12,0xaf]
 
-s_or_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x88,0x73,0x72,0x71,0x3f]
+s_or_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x88,0x73,0x72,0x71,0x3f]
 
-s_or_b64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x88]
+s_or_b64 s[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8a,0x88]
 
-s_or_b64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0x88]
+s_or_b64 s[12:13], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8c,0x88]
 
-s_or_b64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0x88]
+s_or_b64 s[102:103], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe6,0x88]
 
-s_or_b64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe8,0x88]
+s_or_b64 flat_scratch, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe8,0x88]
 
-s_or_b64 vcc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0x88]
+s_or_b64 vcc, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xea,0x88]
 
-s_or_b64 tba, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0x88]
+s_or_b64 tba, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xec,0x88]
 
-s_or_b64 tma, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0x88]
+s_or_b64 tma, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xee,0x88]
 
-s_or_b64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0x88]
+s_or_b64 ttmp[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfa,0x88]
 
-s_or_b64 exec, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0x88]
+s_or_b64 exec, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfe,0x88]
 
-s_or_b64 s[0:1], s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x88]
+s_or_b64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x8a,0x88]
 
-s_or_b64 s[0:1], s[102:103], s[0:1]
-// CHECK: [0x66,0x00,0x80,0x88]
+s_or_b64 s[10:11], s[102:103], s[4:5]
+// CHECK: [0x66,0x04,0x8a,0x88]
 
-s_or_b64 s[0:1], flat_scratch, s[0:1]
-// CHECK: [0x68,0x00,0x80,0x88]
+s_or_b64 s[10:11], flat_scratch, s[4:5]
+// CHECK: [0x68,0x04,0x8a,0x88]
 
-s_or_b64 s[0:1], vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x88]
+s_or_b64 s[10:11], vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x8a,0x88]
 
-s_or_b64 s[0:1], tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x88]
+s_or_b64 s[10:11], tba, s[4:5]
+// CHECK: [0x6c,0x04,0x8a,0x88]
 
-s_or_b64 s[0:1], tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x88]
+s_or_b64 s[10:11], tma, s[4:5]
+// CHECK: [0x6e,0x04,0x8a,0x88]
 
-s_or_b64 s[0:1], ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x88]
+s_or_b64 s[10:11], ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x8a,0x88]
 
-s_or_b64 s[0:1], exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x88]
+s_or_b64 s[10:11], exec, s[4:5]
+// CHECK: [0x7e,0x04,0x8a,0x88]
 
-s_or_b64 s[0:1], 0, s[0:1]
-// CHECK: [0x80,0x00,0x80,0x88]
+s_or_b64 s[10:11], 0, s[4:5]
+// CHECK: [0x80,0x04,0x8a,0x88]
 
-s_or_b64 s[0:1], -1, s[0:1]
-// CHECK: [0xc1,0x00,0x80,0x88]
+s_or_b64 s[10:11], -1, s[4:5]
+// CHECK: [0xc1,0x04,0x8a,0x88]
 
-s_or_b64 s[0:1], 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x80,0x88]
+s_or_b64 s[10:11], 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x8a,0x88]
 
-s_or_b64 s[0:1], -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x80,0x88]
+s_or_b64 s[10:11], -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x8a,0x88]
 
-s_or_b64 s[0:1], 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x88,0x56,0x34,0x12,0xaf]
+s_or_b64 s[10:11], 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x88,0x56,0x34,0x12,0xaf]
 
-s_or_b64 s[0:1], 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x88,0x73,0x72,0x71,0x3f]
+s_or_b64 s[10:11], 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x88,0x73,0x72,0x71,0x3f]
 
-s_or_b64 s[0:1], s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x88]
+s_or_b64 s[10:11], s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x8a,0x88]
 
-s_or_b64 s[0:1], s[0:1], s[102:103]
-// CHECK: [0x00,0x66,0x80,0x88]
+s_or_b64 s[10:11], s[2:3], s[102:103]
+// CHECK: [0x02,0x66,0x8a,0x88]
 
-s_or_b64 s[0:1], s[0:1], flat_scratch
-// CHECK: [0x00,0x68,0x80,0x88]
+s_or_b64 s[10:11], s[2:3], flat_scratch
+// CHECK: [0x02,0x68,0x8a,0x88]
 
-s_or_b64 s[0:1], s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x88]
+s_or_b64 s[10:11], s[2:3], vcc
+// CHECK: [0x02,0x6a,0x8a,0x88]
 
-s_or_b64 s[0:1], s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x88]
+s_or_b64 s[10:11], s[2:3], tba
+// CHECK: [0x02,0x6c,0x8a,0x88]
 
-s_or_b64 s[0:1], s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x88]
+s_or_b64 s[10:11], s[2:3], tma
+// CHECK: [0x02,0x6e,0x8a,0x88]
 
-s_or_b64 s[0:1], s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x88]
+s_or_b64 s[10:11], s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x8a,0x88]
 
-s_or_b64 s[0:1], s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x88]
+s_or_b64 s[10:11], s[2:3], exec
+// CHECK: [0x02,0x7e,0x8a,0x88]
 
-s_or_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x88]
+s_or_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x88]
 
-s_or_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x88]
+s_or_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x88]
 
-s_or_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x88]
+s_or_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x88]
 
-s_or_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x88]
+s_or_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x88]
 
-s_or_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x88,0x56,0x34,0x12,0xaf]
+s_or_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x88,0x56,0x34,0x12,0xaf]
 
-s_or_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x88,0x73,0x72,0x71,0x3f]
+s_or_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x88,0x73,0x72,0x71,0x3f]
 
-s_xor_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x89]
+s_xor_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x89]
 
-s_xor_b32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x89]
+s_xor_b32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x89]
 
-s_xor_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x89]
+s_xor_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x89]
 
-s_xor_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x89]
+s_xor_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x89]
 
-s_xor_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x89]
+s_xor_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x89]
 
-s_xor_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x89]
+s_xor_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x89]
 
-s_xor_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x89]
+s_xor_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x89]
 
-s_xor_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x89]
+s_xor_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x89]
 
-s_xor_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x89]
+s_xor_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x89]
 
-s_xor_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x89]
+s_xor_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x89]
 
-s_xor_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x89]
+s_xor_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x89]
 
-s_xor_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x89]
+s_xor_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x89]
 
-s_xor_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x89]
+s_xor_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x89]
 
-s_xor_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x89]
+s_xor_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x89]
 
-s_xor_b32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x89]
+s_xor_b32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x89]
 
-s_xor_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x89]
+s_xor_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x89]
 
-s_xor_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x89]
+s_xor_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x89]
 
-s_xor_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x89]
+s_xor_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x89]
 
-s_xor_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x89]
+s_xor_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x89]
 
-s_xor_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x89]
+s_xor_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x89]
 
-s_xor_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x89]
+s_xor_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x89]
 
-s_xor_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x89]
+s_xor_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x89]
 
-s_xor_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x89]
+s_xor_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x89]
 
-s_xor_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x89]
+s_xor_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x89]
 
-s_xor_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x89]
+s_xor_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x89]
 
-s_xor_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x89]
+s_xor_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x89]
 
-s_xor_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x89]
+s_xor_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x89]
 
-s_xor_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x89]
+s_xor_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x89]
 
-s_xor_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x89]
+s_xor_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x89]
 
-s_xor_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x89]
+s_xor_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x89]
 
-s_xor_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x89]
+s_xor_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x89]
 
-s_xor_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x89,0x56,0x34,0x12,0xaf]
+s_xor_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x89,0x56,0x34,0x12,0xaf]
 
-s_xor_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x89,0x73,0x72,0x71,0x3f]
+s_xor_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x89,0x73,0x72,0x71,0x3f]
 
-s_xor_b32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x89]
+s_xor_b32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x89]
 
-s_xor_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x89]
+s_xor_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x89]
 
-s_xor_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x89]
+s_xor_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x89]
 
-s_xor_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x89]
+s_xor_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x89]
 
-s_xor_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x89]
+s_xor_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x89]
 
-s_xor_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x89]
+s_xor_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x89]
 
-s_xor_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x89]
+s_xor_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x89]
 
-s_xor_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x89]
+s_xor_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x89]
 
-s_xor_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x89]
+s_xor_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x89]
 
-s_xor_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x89]
+s_xor_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x89]
 
-s_xor_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x89]
+s_xor_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x89]
 
-s_xor_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x89]
+s_xor_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x89]
 
-s_xor_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x89]
+s_xor_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x89]
 
-s_xor_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x89]
+s_xor_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x89]
 
-s_xor_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x89]
+s_xor_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x89]
 
-s_xor_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x89]
+s_xor_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x89]
 
-s_xor_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x89]
+s_xor_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x89]
 
-s_xor_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x89,0x56,0x34,0x12,0xaf]
+s_xor_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x89,0x56,0x34,0x12,0xaf]
 
-s_xor_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x89,0x73,0x72,0x71,0x3f]
+s_xor_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x89,0x73,0x72,0x71,0x3f]
 
-s_xor_b64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x89]
+s_xor_b64 s[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8a,0x89]
 
-s_xor_b64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0x89]
+s_xor_b64 s[12:13], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8c,0x89]
 
-s_xor_b64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0x89]
+s_xor_b64 s[102:103], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe6,0x89]
 
-s_xor_b64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe8,0x89]
+s_xor_b64 flat_scratch, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe8,0x89]
 
-s_xor_b64 vcc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0x89]
+s_xor_b64 vcc, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xea,0x89]
 
-s_xor_b64 tba, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0x89]
+s_xor_b64 tba, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xec,0x89]
 
-s_xor_b64 tma, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0x89]
+s_xor_b64 tma, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xee,0x89]
 
-s_xor_b64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0x89]
+s_xor_b64 ttmp[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfa,0x89]
 
-s_xor_b64 exec, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0x89]
+s_xor_b64 exec, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfe,0x89]
 
-s_xor_b64 s[0:1], s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x89]
+s_xor_b64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x8a,0x89]
 
-s_xor_b64 s[0:1], s[102:103], s[0:1]
-// CHECK: [0x66,0x00,0x80,0x89]
+s_xor_b64 s[10:11], s[102:103], s[4:5]
+// CHECK: [0x66,0x04,0x8a,0x89]
 
-s_xor_b64 s[0:1], flat_scratch, s[0:1]
-// CHECK: [0x68,0x00,0x80,0x89]
+s_xor_b64 s[10:11], flat_scratch, s[4:5]
+// CHECK: [0x68,0x04,0x8a,0x89]
 
-s_xor_b64 s[0:1], vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x89]
+s_xor_b64 s[10:11], vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x8a,0x89]
 
-s_xor_b64 s[0:1], tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x89]
+s_xor_b64 s[10:11], tba, s[4:5]
+// CHECK: [0x6c,0x04,0x8a,0x89]
 
-s_xor_b64 s[0:1], tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x89]
+s_xor_b64 s[10:11], tma, s[4:5]
+// CHECK: [0x6e,0x04,0x8a,0x89]
 
-s_xor_b64 s[0:1], ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x89]
+s_xor_b64 s[10:11], ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x8a,0x89]
 
-s_xor_b64 s[0:1], exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x89]
+s_xor_b64 s[10:11], exec, s[4:5]
+// CHECK: [0x7e,0x04,0x8a,0x89]
 
-s_xor_b64 s[0:1], 0, s[0:1]
-// CHECK: [0x80,0x00,0x80,0x89]
+s_xor_b64 s[10:11], 0, s[4:5]
+// CHECK: [0x80,0x04,0x8a,0x89]
 
-s_xor_b64 s[0:1], -1, s[0:1]
-// CHECK: [0xc1,0x00,0x80,0x89]
+s_xor_b64 s[10:11], -1, s[4:5]
+// CHECK: [0xc1,0x04,0x8a,0x89]
 
-s_xor_b64 s[0:1], 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x80,0x89]
+s_xor_b64 s[10:11], 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x8a,0x89]
 
-s_xor_b64 s[0:1], -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x80,0x89]
+s_xor_b64 s[10:11], -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x8a,0x89]
 
-s_xor_b64 s[0:1], 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x89,0x56,0x34,0x12,0xaf]
+s_xor_b64 s[10:11], 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x89,0x56,0x34,0x12,0xaf]
 
-s_xor_b64 s[0:1], 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x89,0x73,0x72,0x71,0x3f]
+s_xor_b64 s[10:11], 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x89,0x73,0x72,0x71,0x3f]
 
-s_xor_b64 s[0:1], s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x89]
+s_xor_b64 s[10:11], s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x8a,0x89]
 
-s_xor_b64 s[0:1], s[0:1], s[102:103]
-// CHECK: [0x00,0x66,0x80,0x89]
+s_xor_b64 s[10:11], s[2:3], s[102:103]
+// CHECK: [0x02,0x66,0x8a,0x89]
 
-s_xor_b64 s[0:1], s[0:1], flat_scratch
-// CHECK: [0x00,0x68,0x80,0x89]
+s_xor_b64 s[10:11], s[2:3], flat_scratch
+// CHECK: [0x02,0x68,0x8a,0x89]
 
-s_xor_b64 s[0:1], s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x89]
+s_xor_b64 s[10:11], s[2:3], vcc
+// CHECK: [0x02,0x6a,0x8a,0x89]
 
-s_xor_b64 s[0:1], s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x89]
+s_xor_b64 s[10:11], s[2:3], tba
+// CHECK: [0x02,0x6c,0x8a,0x89]
 
-s_xor_b64 s[0:1], s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x89]
+s_xor_b64 s[10:11], s[2:3], tma
+// CHECK: [0x02,0x6e,0x8a,0x89]
 
-s_xor_b64 s[0:1], s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x89]
+s_xor_b64 s[10:11], s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x8a,0x89]
 
-s_xor_b64 s[0:1], s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x89]
+s_xor_b64 s[10:11], s[2:3], exec
+// CHECK: [0x02,0x7e,0x8a,0x89]
 
-s_xor_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x89]
+s_xor_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x89]
 
-s_xor_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x89]
+s_xor_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x89]
 
-s_xor_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x89]
+s_xor_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x89]
 
-s_xor_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x89]
+s_xor_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x89]
 
-s_xor_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x89,0x56,0x34,0x12,0xaf]
+s_xor_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x89,0x56,0x34,0x12,0xaf]
 
-s_xor_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x89,0x73,0x72,0x71,0x3f]
+s_xor_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x89,0x73,0x72,0x71,0x3f]
 
-s_andn2_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x8a]
+s_andn2_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x8a]
 
-s_andn2_b32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x8a]
+s_andn2_b32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x8a]
 
-s_andn2_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x8a]
+s_andn2_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x8a]
 
-s_andn2_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x8a]
+s_andn2_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x8a]
 
-s_andn2_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x8a]
+s_andn2_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x8a]
 
-s_andn2_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x8a]
+s_andn2_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x8a]
 
-s_andn2_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x8a]
+s_andn2_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x8a]
 
-s_andn2_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x8a]
+s_andn2_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x8a]
 
-s_andn2_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x8a]
+s_andn2_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x8a]
 
-s_andn2_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x8a]
+s_andn2_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x8a]
 
-s_andn2_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x8a]
+s_andn2_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x8a]
 
-s_andn2_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x8a]
+s_andn2_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x8a]
 
-s_andn2_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x8a]
+s_andn2_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x8a]
 
-s_andn2_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x8a]
+s_andn2_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x8a]
 
-s_andn2_b32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x8a]
+s_andn2_b32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x8a]
 
-s_andn2_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x8a]
+s_andn2_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x8a]
 
-s_andn2_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x8a]
+s_andn2_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x8a]
 
-s_andn2_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x8a]
+s_andn2_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x8a]
 
-s_andn2_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x8a]
+s_andn2_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x8a]
 
-s_andn2_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x8a]
+s_andn2_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x8a]
 
-s_andn2_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x8a]
+s_andn2_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x8a]
 
-s_andn2_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x8a]
+s_andn2_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x8a]
 
-s_andn2_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x8a]
+s_andn2_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x8a]
 
-s_andn2_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x8a]
+s_andn2_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x8a]
 
-s_andn2_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x8a]
+s_andn2_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x8a]
 
-s_andn2_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x8a]
+s_andn2_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x8a]
 
-s_andn2_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x8a]
+s_andn2_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x8a]
 
-s_andn2_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x8a]
+s_andn2_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x8a]
 
-s_andn2_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x8a]
+s_andn2_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x8a]
 
-s_andn2_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x8a]
+s_andn2_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x8a]
 
-s_andn2_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x8a]
+s_andn2_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x8a]
 
-s_andn2_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x8a,0x56,0x34,0x12,0xaf]
+s_andn2_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x8a,0x56,0x34,0x12,0xaf]
 
-s_andn2_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x8a,0x73,0x72,0x71,0x3f]
+s_andn2_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x8a,0x73,0x72,0x71,0x3f]
 
-s_andn2_b32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x8a]
+s_andn2_b32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x8a]
 
-s_andn2_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x8a]
+s_andn2_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x8a]
 
-s_andn2_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x8a]
+s_andn2_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x8a]
 
-s_andn2_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x8a]
+s_andn2_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x8a]
 
-s_andn2_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x8a]
+s_andn2_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x8a]
 
-s_andn2_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x8a]
+s_andn2_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x8a]
 
-s_andn2_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x8a]
+s_andn2_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x8a]
 
-s_andn2_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x8a]
+s_andn2_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x8a]
 
-s_andn2_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x8a]
+s_andn2_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x8a]
 
-s_andn2_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x8a]
+s_andn2_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x8a]
 
-s_andn2_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x8a]
+s_andn2_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x8a]
 
-s_andn2_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x8a]
+s_andn2_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x8a]
 
-s_andn2_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x8a]
+s_andn2_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x8a]
 
-s_andn2_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x8a]
+s_andn2_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x8a]
 
-s_andn2_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x8a]
+s_andn2_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x8a]
 
-s_andn2_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x8a]
+s_andn2_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x8a]
 
-s_andn2_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x8a]
+s_andn2_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x8a]
 
-s_andn2_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x8a,0x56,0x34,0x12,0xaf]
+s_andn2_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x8a,0x56,0x34,0x12,0xaf]
 
-s_andn2_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x8a,0x73,0x72,0x71,0x3f]
+s_andn2_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x8a,0x73,0x72,0x71,0x3f]
 
-s_andn2_b64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x8a]
+s_andn2_b64 s[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8a,0x8a]
 
-s_andn2_b64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0x8a]
+s_andn2_b64 s[12:13], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8c,0x8a]
 
-s_andn2_b64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0x8a]
+s_andn2_b64 s[102:103], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe6,0x8a]
 
-s_andn2_b64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe8,0x8a]
+s_andn2_b64 flat_scratch, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe8,0x8a]
 
-s_andn2_b64 vcc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0x8a]
+s_andn2_b64 vcc, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xea,0x8a]
 
-s_andn2_b64 tba, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0x8a]
+s_andn2_b64 tba, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xec,0x8a]
 
-s_andn2_b64 tma, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0x8a]
+s_andn2_b64 tma, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xee,0x8a]
 
-s_andn2_b64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0x8a]
+s_andn2_b64 ttmp[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfa,0x8a]
 
-s_andn2_b64 exec, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0x8a]
+s_andn2_b64 exec, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfe,0x8a]
 
-s_andn2_b64 s[0:1], s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x8a]
+s_andn2_b64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], s[102:103], s[0:1]
-// CHECK: [0x66,0x00,0x80,0x8a]
+s_andn2_b64 s[10:11], s[102:103], s[4:5]
+// CHECK: [0x66,0x04,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], flat_scratch, s[0:1]
-// CHECK: [0x68,0x00,0x80,0x8a]
+s_andn2_b64 s[10:11], flat_scratch, s[4:5]
+// CHECK: [0x68,0x04,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x8a]
+s_andn2_b64 s[10:11], vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x8a]
+s_andn2_b64 s[10:11], tba, s[4:5]
+// CHECK: [0x6c,0x04,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x8a]
+s_andn2_b64 s[10:11], tma, s[4:5]
+// CHECK: [0x6e,0x04,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x8a]
+s_andn2_b64 s[10:11], ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x8a]
+s_andn2_b64 s[10:11], exec, s[4:5]
+// CHECK: [0x7e,0x04,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], 0, s[0:1]
-// CHECK: [0x80,0x00,0x80,0x8a]
+s_andn2_b64 s[10:11], 0, s[4:5]
+// CHECK: [0x80,0x04,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], -1, s[0:1]
-// CHECK: [0xc1,0x00,0x80,0x8a]
+s_andn2_b64 s[10:11], -1, s[4:5]
+// CHECK: [0xc1,0x04,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x80,0x8a]
+s_andn2_b64 s[10:11], 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x80,0x8a]
+s_andn2_b64 s[10:11], -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x8a,0x56,0x34,0x12,0xaf]
+s_andn2_b64 s[10:11], 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x8a,0x56,0x34,0x12,0xaf]
 
-s_andn2_b64 s[0:1], 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x8a,0x73,0x72,0x71,0x3f]
+s_andn2_b64 s[10:11], 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x8a,0x73,0x72,0x71,0x3f]
 
-s_andn2_b64 s[0:1], s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x8a]
+s_andn2_b64 s[10:11], s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], s[0:1], s[102:103]
-// CHECK: [0x00,0x66,0x80,0x8a]
+s_andn2_b64 s[10:11], s[2:3], s[102:103]
+// CHECK: [0x02,0x66,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], s[0:1], flat_scratch
-// CHECK: [0x00,0x68,0x80,0x8a]
+s_andn2_b64 s[10:11], s[2:3], flat_scratch
+// CHECK: [0x02,0x68,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x8a]
+s_andn2_b64 s[10:11], s[2:3], vcc
+// CHECK: [0x02,0x6a,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x8a]
+s_andn2_b64 s[10:11], s[2:3], tba
+// CHECK: [0x02,0x6c,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x8a]
+s_andn2_b64 s[10:11], s[2:3], tma
+// CHECK: [0x02,0x6e,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x8a]
+s_andn2_b64 s[10:11], s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x8a]
+s_andn2_b64 s[10:11], s[2:3], exec
+// CHECK: [0x02,0x7e,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x8a]
+s_andn2_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x8a]
+s_andn2_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x8a]
+s_andn2_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x8a]
+s_andn2_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x8a]
 
-s_andn2_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x8a,0x56,0x34,0x12,0xaf]
+s_andn2_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x8a,0x56,0x34,0x12,0xaf]
 
-s_andn2_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x8a,0x73,0x72,0x71,0x3f]
+s_andn2_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x8a,0x73,0x72,0x71,0x3f]
 
-s_orn2_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x8b]
+s_orn2_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x8b]
 
-s_orn2_b32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x8b]
+s_orn2_b32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x8b]
 
-s_orn2_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x8b]
+s_orn2_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x8b]
 
-s_orn2_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x8b]
+s_orn2_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x8b]
 
-s_orn2_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x8b]
+s_orn2_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x8b]
 
-s_orn2_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x8b]
+s_orn2_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x8b]
 
-s_orn2_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x8b]
+s_orn2_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x8b]
 
-s_orn2_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x8b]
+s_orn2_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x8b]
 
-s_orn2_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x8b]
+s_orn2_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x8b]
 
-s_orn2_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x8b]
+s_orn2_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x8b]
 
-s_orn2_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x8b]
+s_orn2_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x8b]
 
-s_orn2_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x8b]
+s_orn2_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x8b]
 
-s_orn2_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x8b]
+s_orn2_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x8b]
 
-s_orn2_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x8b]
+s_orn2_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x8b]
 
-s_orn2_b32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x8b]
+s_orn2_b32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x8b]
 
-s_orn2_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x8b]
+s_orn2_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x8b]
 
-s_orn2_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x8b]
+s_orn2_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x8b]
 
-s_orn2_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x8b]
+s_orn2_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x8b]
 
-s_orn2_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x8b]
+s_orn2_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x8b]
 
-s_orn2_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x8b]
+s_orn2_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x8b]
 
-s_orn2_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x8b]
+s_orn2_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x8b]
 
-s_orn2_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x8b]
+s_orn2_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x8b]
 
-s_orn2_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x8b]
+s_orn2_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x8b]
 
-s_orn2_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x8b]
+s_orn2_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x8b]
 
-s_orn2_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x8b]
+s_orn2_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x8b]
 
-s_orn2_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x8b]
+s_orn2_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x8b]
 
-s_orn2_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x8b]
+s_orn2_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x8b]
 
-s_orn2_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x8b]
+s_orn2_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x8b]
 
-s_orn2_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x8b]
+s_orn2_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x8b]
 
-s_orn2_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x8b]
+s_orn2_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x8b]
 
-s_orn2_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x8b]
+s_orn2_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x8b]
 
-s_orn2_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x8b,0x56,0x34,0x12,0xaf]
+s_orn2_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x8b,0x56,0x34,0x12,0xaf]
 
-s_orn2_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x8b,0x73,0x72,0x71,0x3f]
+s_orn2_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x8b,0x73,0x72,0x71,0x3f]
 
-s_orn2_b32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x8b]
+s_orn2_b32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x8b]
 
-s_orn2_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x8b]
+s_orn2_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x8b]
 
-s_orn2_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x8b]
+s_orn2_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x8b]
 
-s_orn2_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x8b]
+s_orn2_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x8b]
 
-s_orn2_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x8b]
+s_orn2_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x8b]
 
-s_orn2_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x8b]
+s_orn2_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x8b]
 
-s_orn2_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x8b]
+s_orn2_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x8b]
 
-s_orn2_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x8b]
+s_orn2_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x8b]
 
-s_orn2_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x8b]
+s_orn2_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x8b]
 
-s_orn2_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x8b]
+s_orn2_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x8b]
 
-s_orn2_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x8b]
+s_orn2_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x8b]
 
-s_orn2_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x8b]
+s_orn2_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x8b]
 
-s_orn2_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x8b]
+s_orn2_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x8b]
 
-s_orn2_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x8b]
+s_orn2_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x8b]
 
-s_orn2_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x8b]
+s_orn2_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x8b]
 
-s_orn2_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x8b]
+s_orn2_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x8b]
 
-s_orn2_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x8b]
+s_orn2_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x8b]
 
-s_orn2_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x8b,0x56,0x34,0x12,0xaf]
+s_orn2_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x8b,0x56,0x34,0x12,0xaf]
 
-s_orn2_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x8b,0x73,0x72,0x71,0x3f]
+s_orn2_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x8b,0x73,0x72,0x71,0x3f]
 
-s_orn2_b64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x8b]
+s_orn2_b64 s[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8a,0x8b]
 
-s_orn2_b64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0x8b]
+s_orn2_b64 s[12:13], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8c,0x8b]
 
-s_orn2_b64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0x8b]
+s_orn2_b64 s[102:103], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe6,0x8b]
 
-s_orn2_b64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe8,0x8b]
+s_orn2_b64 flat_scratch, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe8,0x8b]
 
-s_orn2_b64 vcc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0x8b]
+s_orn2_b64 vcc, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xea,0x8b]
 
-s_orn2_b64 tba, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0x8b]
+s_orn2_b64 tba, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xec,0x8b]
 
-s_orn2_b64 tma, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0x8b]
+s_orn2_b64 tma, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xee,0x8b]
 
-s_orn2_b64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0x8b]
+s_orn2_b64 ttmp[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfa,0x8b]
 
-s_orn2_b64 exec, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0x8b]
+s_orn2_b64 exec, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfe,0x8b]
 
-s_orn2_b64 s[0:1], s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x8b]
+s_orn2_b64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], s[102:103], s[0:1]
-// CHECK: [0x66,0x00,0x80,0x8b]
+s_orn2_b64 s[10:11], s[102:103], s[4:5]
+// CHECK: [0x66,0x04,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], flat_scratch, s[0:1]
-// CHECK: [0x68,0x00,0x80,0x8b]
+s_orn2_b64 s[10:11], flat_scratch, s[4:5]
+// CHECK: [0x68,0x04,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x8b]
+s_orn2_b64 s[10:11], vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x8b]
+s_orn2_b64 s[10:11], tba, s[4:5]
+// CHECK: [0x6c,0x04,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x8b]
+s_orn2_b64 s[10:11], tma, s[4:5]
+// CHECK: [0x6e,0x04,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x8b]
+s_orn2_b64 s[10:11], ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x8b]
+s_orn2_b64 s[10:11], exec, s[4:5]
+// CHECK: [0x7e,0x04,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], 0, s[0:1]
-// CHECK: [0x80,0x00,0x80,0x8b]
+s_orn2_b64 s[10:11], 0, s[4:5]
+// CHECK: [0x80,0x04,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], -1, s[0:1]
-// CHECK: [0xc1,0x00,0x80,0x8b]
+s_orn2_b64 s[10:11], -1, s[4:5]
+// CHECK: [0xc1,0x04,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x80,0x8b]
+s_orn2_b64 s[10:11], 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x80,0x8b]
+s_orn2_b64 s[10:11], -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x8b,0x56,0x34,0x12,0xaf]
+s_orn2_b64 s[10:11], 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x8b,0x56,0x34,0x12,0xaf]
 
-s_orn2_b64 s[0:1], 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x8b,0x73,0x72,0x71,0x3f]
+s_orn2_b64 s[10:11], 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x8b,0x73,0x72,0x71,0x3f]
 
-s_orn2_b64 s[0:1], s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x8b]
+s_orn2_b64 s[10:11], s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], s[0:1], s[102:103]
-// CHECK: [0x00,0x66,0x80,0x8b]
+s_orn2_b64 s[10:11], s[2:3], s[102:103]
+// CHECK: [0x02,0x66,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], s[0:1], flat_scratch
-// CHECK: [0x00,0x68,0x80,0x8b]
+s_orn2_b64 s[10:11], s[2:3], flat_scratch
+// CHECK: [0x02,0x68,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x8b]
+s_orn2_b64 s[10:11], s[2:3], vcc
+// CHECK: [0x02,0x6a,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x8b]
+s_orn2_b64 s[10:11], s[2:3], tba
+// CHECK: [0x02,0x6c,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x8b]
+s_orn2_b64 s[10:11], s[2:3], tma
+// CHECK: [0x02,0x6e,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x8b]
+s_orn2_b64 s[10:11], s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x8b]
+s_orn2_b64 s[10:11], s[2:3], exec
+// CHECK: [0x02,0x7e,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x8b]
+s_orn2_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x8b]
+s_orn2_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x8b]
+s_orn2_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x8b]
+s_orn2_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x8b]
 
-s_orn2_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x8b,0x56,0x34,0x12,0xaf]
+s_orn2_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x8b,0x56,0x34,0x12,0xaf]
 
-s_orn2_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x8b,0x73,0x72,0x71,0x3f]
+s_orn2_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x8b,0x73,0x72,0x71,0x3f]
 
-s_nand_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x8c]
+s_nand_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x8c]
 
-s_nand_b32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x8c]
+s_nand_b32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x8c]
 
-s_nand_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x8c]
+s_nand_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x8c]
 
-s_nand_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x8c]
+s_nand_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x8c]
 
-s_nand_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x8c]
+s_nand_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x8c]
 
-s_nand_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x8c]
+s_nand_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x8c]
 
-s_nand_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x8c]
+s_nand_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x8c]
 
-s_nand_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x8c]
+s_nand_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x8c]
 
-s_nand_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x8c]
+s_nand_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x8c]
 
-s_nand_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x8c]
+s_nand_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x8c]
 
-s_nand_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x8c]
+s_nand_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x8c]
 
-s_nand_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x8c]
+s_nand_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x8c]
 
-s_nand_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x8c]
+s_nand_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x8c]
 
-s_nand_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x8c]
+s_nand_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x8c]
 
-s_nand_b32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x8c]
+s_nand_b32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x8c]
 
-s_nand_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x8c]
+s_nand_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x8c]
 
-s_nand_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x8c]
+s_nand_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x8c]
 
-s_nand_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x8c]
+s_nand_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x8c]
 
-s_nand_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x8c]
+s_nand_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x8c]
 
-s_nand_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x8c]
+s_nand_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x8c]
 
-s_nand_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x8c]
+s_nand_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x8c]
 
-s_nand_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x8c]
+s_nand_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x8c]
 
-s_nand_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x8c]
+s_nand_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x8c]
 
-s_nand_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x8c]
+s_nand_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x8c]
 
-s_nand_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x8c]
+s_nand_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x8c]
 
-s_nand_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x8c]
+s_nand_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x8c]
 
-s_nand_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x8c]
+s_nand_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x8c]
 
-s_nand_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x8c]
+s_nand_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x8c]
 
-s_nand_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x8c]
+s_nand_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x8c]
 
-s_nand_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x8c]
+s_nand_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x8c]
 
-s_nand_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x8c]
+s_nand_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x8c]
 
-s_nand_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x8c,0x56,0x34,0x12,0xaf]
+s_nand_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x8c,0x56,0x34,0x12,0xaf]
 
-s_nand_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x8c,0x73,0x72,0x71,0x3f]
+s_nand_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x8c,0x73,0x72,0x71,0x3f]
 
-s_nand_b32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x8c]
+s_nand_b32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x8c]
 
-s_nand_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x8c]
+s_nand_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x8c]
 
-s_nand_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x8c]
+s_nand_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x8c]
 
-s_nand_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x8c]
+s_nand_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x8c]
 
-s_nand_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x8c]
+s_nand_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x8c]
 
-s_nand_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x8c]
+s_nand_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x8c]
 
-s_nand_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x8c]
+s_nand_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x8c]
 
-s_nand_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x8c]
+s_nand_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x8c]
 
-s_nand_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x8c]
+s_nand_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x8c]
 
-s_nand_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x8c]
+s_nand_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x8c]
 
-s_nand_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x8c]
+s_nand_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x8c]
 
-s_nand_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x8c]
+s_nand_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x8c]
 
-s_nand_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x8c]
+s_nand_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x8c]
 
-s_nand_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x8c]
+s_nand_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x8c]
 
-s_nand_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x8c]
+s_nand_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x8c]
 
-s_nand_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x8c]
+s_nand_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x8c]
 
-s_nand_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x8c]
+s_nand_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x8c]
 
-s_nand_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x8c,0x56,0x34,0x12,0xaf]
+s_nand_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x8c,0x56,0x34,0x12,0xaf]
 
-s_nand_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x8c,0x73,0x72,0x71,0x3f]
+s_nand_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x8c,0x73,0x72,0x71,0x3f]
 
-s_nand_b64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x8c]
+s_nand_b64 s[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8a,0x8c]
 
-s_nand_b64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0x8c]
+s_nand_b64 s[12:13], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8c,0x8c]
 
-s_nand_b64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0x8c]
+s_nand_b64 s[102:103], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe6,0x8c]
 
-s_nand_b64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe8,0x8c]
+s_nand_b64 flat_scratch, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe8,0x8c]
 
-s_nand_b64 vcc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0x8c]
+s_nand_b64 vcc, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xea,0x8c]
 
-s_nand_b64 tba, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0x8c]
+s_nand_b64 tba, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xec,0x8c]
 
-s_nand_b64 tma, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0x8c]
+s_nand_b64 tma, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xee,0x8c]
 
-s_nand_b64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0x8c]
+s_nand_b64 ttmp[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfa,0x8c]
 
-s_nand_b64 exec, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0x8c]
+s_nand_b64 exec, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfe,0x8c]
 
-s_nand_b64 s[0:1], s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x8c]
+s_nand_b64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x8a,0x8c]
 
-s_nand_b64 s[0:1], s[102:103], s[0:1]
-// CHECK: [0x66,0x00,0x80,0x8c]
+s_nand_b64 s[10:11], s[102:103], s[4:5]
+// CHECK: [0x66,0x04,0x8a,0x8c]
 
-s_nand_b64 s[0:1], flat_scratch, s[0:1]
-// CHECK: [0x68,0x00,0x80,0x8c]
+s_nand_b64 s[10:11], flat_scratch, s[4:5]
+// CHECK: [0x68,0x04,0x8a,0x8c]
 
-s_nand_b64 s[0:1], vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x8c]
+s_nand_b64 s[10:11], vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x8a,0x8c]
 
-s_nand_b64 s[0:1], tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x8c]
+s_nand_b64 s[10:11], tba, s[4:5]
+// CHECK: [0x6c,0x04,0x8a,0x8c]
 
-s_nand_b64 s[0:1], tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x8c]
+s_nand_b64 s[10:11], tma, s[4:5]
+// CHECK: [0x6e,0x04,0x8a,0x8c]
 
-s_nand_b64 s[0:1], ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x8c]
+s_nand_b64 s[10:11], ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x8a,0x8c]
 
-s_nand_b64 s[0:1], exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x8c]
+s_nand_b64 s[10:11], exec, s[4:5]
+// CHECK: [0x7e,0x04,0x8a,0x8c]
 
-s_nand_b64 s[0:1], 0, s[0:1]
-// CHECK: [0x80,0x00,0x80,0x8c]
+s_nand_b64 s[10:11], 0, s[4:5]
+// CHECK: [0x80,0x04,0x8a,0x8c]
 
-s_nand_b64 s[0:1], -1, s[0:1]
-// CHECK: [0xc1,0x00,0x80,0x8c]
+s_nand_b64 s[10:11], -1, s[4:5]
+// CHECK: [0xc1,0x04,0x8a,0x8c]
 
-s_nand_b64 s[0:1], 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x80,0x8c]
+s_nand_b64 s[10:11], 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x8a,0x8c]
 
-s_nand_b64 s[0:1], -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x80,0x8c]
+s_nand_b64 s[10:11], -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x8a,0x8c]
 
-s_nand_b64 s[0:1], 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x8c,0x56,0x34,0x12,0xaf]
+s_nand_b64 s[10:11], 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x8c,0x56,0x34,0x12,0xaf]
 
-s_nand_b64 s[0:1], 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x8c,0x73,0x72,0x71,0x3f]
+s_nand_b64 s[10:11], 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x8c,0x73,0x72,0x71,0x3f]
 
-s_nand_b64 s[0:1], s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x8c]
+s_nand_b64 s[10:11], s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x8a,0x8c]
 
-s_nand_b64 s[0:1], s[0:1], s[102:103]
-// CHECK: [0x00,0x66,0x80,0x8c]
+s_nand_b64 s[10:11], s[2:3], s[102:103]
+// CHECK: [0x02,0x66,0x8a,0x8c]
 
-s_nand_b64 s[0:1], s[0:1], flat_scratch
-// CHECK: [0x00,0x68,0x80,0x8c]
+s_nand_b64 s[10:11], s[2:3], flat_scratch
+// CHECK: [0x02,0x68,0x8a,0x8c]
 
-s_nand_b64 s[0:1], s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x8c]
+s_nand_b64 s[10:11], s[2:3], vcc
+// CHECK: [0x02,0x6a,0x8a,0x8c]
 
-s_nand_b64 s[0:1], s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x8c]
+s_nand_b64 s[10:11], s[2:3], tba
+// CHECK: [0x02,0x6c,0x8a,0x8c]
 
-s_nand_b64 s[0:1], s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x8c]
+s_nand_b64 s[10:11], s[2:3], tma
+// CHECK: [0x02,0x6e,0x8a,0x8c]
 
-s_nand_b64 s[0:1], s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x8c]
+s_nand_b64 s[10:11], s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x8a,0x8c]
 
-s_nand_b64 s[0:1], s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x8c]
+s_nand_b64 s[10:11], s[2:3], exec
+// CHECK: [0x02,0x7e,0x8a,0x8c]
 
-s_nand_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x8c]
+s_nand_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x8c]
 
-s_nand_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x8c]
+s_nand_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x8c]
 
-s_nand_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x8c]
+s_nand_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x8c]
 
-s_nand_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x8c]
+s_nand_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x8c]
 
-s_nand_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x8c,0x56,0x34,0x12,0xaf]
+s_nand_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x8c,0x56,0x34,0x12,0xaf]
 
-s_nand_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x8c,0x73,0x72,0x71,0x3f]
+s_nand_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x8c,0x73,0x72,0x71,0x3f]
 
-s_nor_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x8d]
+s_nor_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x8d]
 
-s_nor_b32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x8d]
+s_nor_b32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x8d]
 
-s_nor_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x8d]
+s_nor_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x8d]
 
-s_nor_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x8d]
+s_nor_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x8d]
 
-s_nor_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x8d]
+s_nor_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x8d]
 
-s_nor_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x8d]
+s_nor_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x8d]
 
-s_nor_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x8d]
+s_nor_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x8d]
 
-s_nor_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x8d]
+s_nor_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x8d]
 
-s_nor_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x8d]
+s_nor_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x8d]
 
-s_nor_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x8d]
+s_nor_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x8d]
 
-s_nor_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x8d]
+s_nor_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x8d]
 
-s_nor_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x8d]
+s_nor_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x8d]
 
-s_nor_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x8d]
+s_nor_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x8d]
 
-s_nor_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x8d]
+s_nor_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x8d]
 
-s_nor_b32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x8d]
+s_nor_b32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x8d]
 
-s_nor_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x8d]
+s_nor_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x8d]
 
-s_nor_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x8d]
+s_nor_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x8d]
 
-s_nor_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x8d]
+s_nor_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x8d]
 
-s_nor_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x8d]
+s_nor_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x8d]
 
-s_nor_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x8d]
+s_nor_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x8d]
 
-s_nor_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x8d]
+s_nor_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x8d]
 
-s_nor_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x8d]
+s_nor_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x8d]
 
-s_nor_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x8d]
+s_nor_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x8d]
 
-s_nor_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x8d]
+s_nor_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x8d]
 
-s_nor_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x8d]
+s_nor_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x8d]
 
-s_nor_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x8d]
+s_nor_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x8d]
 
-s_nor_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x8d]
+s_nor_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x8d]
 
-s_nor_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x8d]
+s_nor_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x8d]
 
-s_nor_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x8d]
+s_nor_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x8d]
 
-s_nor_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x8d]
+s_nor_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x8d]
 
-s_nor_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x8d]
+s_nor_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x8d]
 
-s_nor_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x8d,0x56,0x34,0x12,0xaf]
+s_nor_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x8d,0x56,0x34,0x12,0xaf]
 
-s_nor_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x8d,0x73,0x72,0x71,0x3f]
+s_nor_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x8d,0x73,0x72,0x71,0x3f]
 
-s_nor_b32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x8d]
+s_nor_b32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x8d]
 
-s_nor_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x8d]
+s_nor_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x8d]
 
-s_nor_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x8d]
+s_nor_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x8d]
 
-s_nor_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x8d]
+s_nor_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x8d]
 
-s_nor_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x8d]
+s_nor_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x8d]
 
-s_nor_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x8d]
+s_nor_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x8d]
 
-s_nor_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x8d]
+s_nor_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x8d]
 
-s_nor_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x8d]
+s_nor_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x8d]
 
-s_nor_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x8d]
+s_nor_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x8d]
 
-s_nor_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x8d]
+s_nor_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x8d]
 
-s_nor_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x8d]
+s_nor_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x8d]
 
-s_nor_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x8d]
+s_nor_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x8d]
 
-s_nor_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x8d]
+s_nor_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x8d]
 
-s_nor_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x8d]
+s_nor_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x8d]
 
-s_nor_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x8d]
+s_nor_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x8d]
 
-s_nor_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x8d]
+s_nor_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x8d]
 
-s_nor_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x8d]
+s_nor_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x8d]
 
-s_nor_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x8d,0x56,0x34,0x12,0xaf]
+s_nor_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x8d,0x56,0x34,0x12,0xaf]
 
-s_nor_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x8d,0x73,0x72,0x71,0x3f]
+s_nor_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x8d,0x73,0x72,0x71,0x3f]
 
-s_nor_b64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x8d]
+s_nor_b64 s[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8a,0x8d]
 
-s_nor_b64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0x8d]
+s_nor_b64 s[12:13], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8c,0x8d]
 
-s_nor_b64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0x8d]
+s_nor_b64 s[102:103], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe6,0x8d]
 
-s_nor_b64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe8,0x8d]
+s_nor_b64 flat_scratch, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe8,0x8d]
 
-s_nor_b64 vcc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0x8d]
+s_nor_b64 vcc, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xea,0x8d]
 
-s_nor_b64 tba, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0x8d]
+s_nor_b64 tba, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xec,0x8d]
 
-s_nor_b64 tma, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0x8d]
+s_nor_b64 tma, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xee,0x8d]
 
-s_nor_b64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0x8d]
+s_nor_b64 ttmp[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfa,0x8d]
 
-s_nor_b64 exec, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0x8d]
+s_nor_b64 exec, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfe,0x8d]
 
-s_nor_b64 s[0:1], s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x8d]
+s_nor_b64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x8a,0x8d]
 
-s_nor_b64 s[0:1], s[102:103], s[0:1]
-// CHECK: [0x66,0x00,0x80,0x8d]
+s_nor_b64 s[10:11], s[102:103], s[4:5]
+// CHECK: [0x66,0x04,0x8a,0x8d]
 
-s_nor_b64 s[0:1], flat_scratch, s[0:1]
-// CHECK: [0x68,0x00,0x80,0x8d]
+s_nor_b64 s[10:11], flat_scratch, s[4:5]
+// CHECK: [0x68,0x04,0x8a,0x8d]
 
-s_nor_b64 s[0:1], vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x8d]
+s_nor_b64 s[10:11], vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x8a,0x8d]
 
-s_nor_b64 s[0:1], tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x8d]
+s_nor_b64 s[10:11], tba, s[4:5]
+// CHECK: [0x6c,0x04,0x8a,0x8d]
 
-s_nor_b64 s[0:1], tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x8d]
+s_nor_b64 s[10:11], tma, s[4:5]
+// CHECK: [0x6e,0x04,0x8a,0x8d]
 
-s_nor_b64 s[0:1], ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x8d]
+s_nor_b64 s[10:11], ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x8a,0x8d]
 
-s_nor_b64 s[0:1], exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x8d]
+s_nor_b64 s[10:11], exec, s[4:5]
+// CHECK: [0x7e,0x04,0x8a,0x8d]
 
-s_nor_b64 s[0:1], 0, s[0:1]
-// CHECK: [0x80,0x00,0x80,0x8d]
+s_nor_b64 s[10:11], 0, s[4:5]
+// CHECK: [0x80,0x04,0x8a,0x8d]
 
-s_nor_b64 s[0:1], -1, s[0:1]
-// CHECK: [0xc1,0x00,0x80,0x8d]
+s_nor_b64 s[10:11], -1, s[4:5]
+// CHECK: [0xc1,0x04,0x8a,0x8d]
 
-s_nor_b64 s[0:1], 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x80,0x8d]
+s_nor_b64 s[10:11], 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x8a,0x8d]
 
-s_nor_b64 s[0:1], -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x80,0x8d]
+s_nor_b64 s[10:11], -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x8a,0x8d]
 
-s_nor_b64 s[0:1], 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x8d,0x56,0x34,0x12,0xaf]
+s_nor_b64 s[10:11], 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x8d,0x56,0x34,0x12,0xaf]
 
-s_nor_b64 s[0:1], 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x8d,0x73,0x72,0x71,0x3f]
+s_nor_b64 s[10:11], 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x8d,0x73,0x72,0x71,0x3f]
 
-s_nor_b64 s[0:1], s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x8d]
+s_nor_b64 s[10:11], s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x8a,0x8d]
 
-s_nor_b64 s[0:1], s[0:1], s[102:103]
-// CHECK: [0x00,0x66,0x80,0x8d]
+s_nor_b64 s[10:11], s[2:3], s[102:103]
+// CHECK: [0x02,0x66,0x8a,0x8d]
 
-s_nor_b64 s[0:1], s[0:1], flat_scratch
-// CHECK: [0x00,0x68,0x80,0x8d]
+s_nor_b64 s[10:11], s[2:3], flat_scratch
+// CHECK: [0x02,0x68,0x8a,0x8d]
 
-s_nor_b64 s[0:1], s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x8d]
+s_nor_b64 s[10:11], s[2:3], vcc
+// CHECK: [0x02,0x6a,0x8a,0x8d]
 
-s_nor_b64 s[0:1], s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x8d]
+s_nor_b64 s[10:11], s[2:3], tba
+// CHECK: [0x02,0x6c,0x8a,0x8d]
 
-s_nor_b64 s[0:1], s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x8d]
+s_nor_b64 s[10:11], s[2:3], tma
+// CHECK: [0x02,0x6e,0x8a,0x8d]
 
-s_nor_b64 s[0:1], s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x8d]
+s_nor_b64 s[10:11], s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x8a,0x8d]
 
-s_nor_b64 s[0:1], s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x8d]
+s_nor_b64 s[10:11], s[2:3], exec
+// CHECK: [0x02,0x7e,0x8a,0x8d]
 
-s_nor_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x8d]
+s_nor_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x8d]
 
-s_nor_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x8d]
+s_nor_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x8d]
 
-s_nor_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x8d]
+s_nor_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x8d]
 
-s_nor_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x8d]
+s_nor_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x8d]
 
-s_nor_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x8d,0x56,0x34,0x12,0xaf]
+s_nor_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x8d,0x56,0x34,0x12,0xaf]
 
-s_nor_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x8d,0x73,0x72,0x71,0x3f]
+s_nor_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x8d,0x73,0x72,0x71,0x3f]
 
-s_xnor_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x8e]
+s_xnor_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x8e]
 
-s_xnor_b32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x8e]
+s_xnor_b32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x8e]
 
-s_xnor_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x8e]
+s_xnor_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x8e]
 
-s_xnor_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x8e]
+s_xnor_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x8e]
 
-s_xnor_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x8e]
+s_xnor_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x8e]
 
-s_xnor_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x8e]
+s_xnor_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x8e]
 
-s_xnor_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x8e]
+s_xnor_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x8e]
 
-s_xnor_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x8e]
+s_xnor_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x8e]
 
-s_xnor_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x8e]
+s_xnor_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x8e]
 
-s_xnor_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x8e]
+s_xnor_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x8e]
 
-s_xnor_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x8e]
+s_xnor_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x8e]
 
-s_xnor_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x8e]
+s_xnor_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x8e]
 
-s_xnor_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x8e]
+s_xnor_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x8e]
 
-s_xnor_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x8e]
+s_xnor_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x8e]
 
-s_xnor_b32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x8e]
+s_xnor_b32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x8e]
 
-s_xnor_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x8e]
+s_xnor_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x8e]
 
-s_xnor_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x8e]
+s_xnor_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x8e]
 
-s_xnor_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x8e]
+s_xnor_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x8e]
 
-s_xnor_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x8e]
+s_xnor_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x8e]
 
-s_xnor_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x8e]
+s_xnor_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x8e]
 
-s_xnor_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x8e]
+s_xnor_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x8e]
 
-s_xnor_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x8e]
+s_xnor_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x8e]
 
-s_xnor_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x8e]
+s_xnor_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x8e]
 
-s_xnor_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x8e]
+s_xnor_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x8e]
 
-s_xnor_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x8e]
+s_xnor_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x8e]
 
-s_xnor_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x8e]
+s_xnor_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x8e]
 
-s_xnor_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x8e]
+s_xnor_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x8e]
 
-s_xnor_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x8e]
+s_xnor_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x8e]
 
-s_xnor_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x8e]
+s_xnor_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x8e]
 
-s_xnor_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x8e]
+s_xnor_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x8e]
 
-s_xnor_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x8e]
+s_xnor_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x8e]
 
-s_xnor_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x8e,0x56,0x34,0x12,0xaf]
+s_xnor_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x8e,0x56,0x34,0x12,0xaf]
 
-s_xnor_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x8e,0x73,0x72,0x71,0x3f]
+s_xnor_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x8e,0x73,0x72,0x71,0x3f]
 
-s_xnor_b32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x8e]
+s_xnor_b32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x8e]
 
-s_xnor_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x8e]
+s_xnor_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x8e]
 
-s_xnor_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x8e]
+s_xnor_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x8e]
 
-s_xnor_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x8e]
+s_xnor_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x8e]
 
-s_xnor_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x8e]
+s_xnor_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x8e]
 
-s_xnor_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x8e]
+s_xnor_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x8e]
 
-s_xnor_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x8e]
+s_xnor_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x8e]
 
-s_xnor_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x8e]
+s_xnor_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x8e]
 
-s_xnor_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x8e]
+s_xnor_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x8e]
 
-s_xnor_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x8e]
+s_xnor_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x8e]
 
-s_xnor_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x8e]
+s_xnor_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x8e]
 
-s_xnor_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x8e]
+s_xnor_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x8e]
 
-s_xnor_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x8e]
+s_xnor_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x8e]
 
-s_xnor_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x8e]
+s_xnor_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x8e]
 
-s_xnor_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x8e]
+s_xnor_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x8e]
 
-s_xnor_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x8e]
+s_xnor_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x8e]
 
-s_xnor_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x8e]
+s_xnor_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x8e]
 
-s_xnor_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x8e,0x56,0x34,0x12,0xaf]
+s_xnor_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x8e,0x56,0x34,0x12,0xaf]
 
-s_xnor_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x8e,0x73,0x72,0x71,0x3f]
+s_xnor_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x8e,0x73,0x72,0x71,0x3f]
 
-s_xnor_b64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x8e]
+s_xnor_b64 s[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8a,0x8e]
 
-s_xnor_b64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0x8e]
+s_xnor_b64 s[12:13], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8c,0x8e]
 
-s_xnor_b64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0x8e]
+s_xnor_b64 s[102:103], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe6,0x8e]
 
-s_xnor_b64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe8,0x8e]
+s_xnor_b64 flat_scratch, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe8,0x8e]
 
-s_xnor_b64 vcc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0x8e]
+s_xnor_b64 vcc, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xea,0x8e]
 
-s_xnor_b64 tba, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0x8e]
+s_xnor_b64 tba, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xec,0x8e]
 
-s_xnor_b64 tma, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0x8e]
+s_xnor_b64 tma, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xee,0x8e]
 
-s_xnor_b64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0x8e]
+s_xnor_b64 ttmp[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfa,0x8e]
 
-s_xnor_b64 exec, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0x8e]
+s_xnor_b64 exec, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfe,0x8e]
 
-s_xnor_b64 s[0:1], s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x8e]
+s_xnor_b64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], s[102:103], s[0:1]
-// CHECK: [0x66,0x00,0x80,0x8e]
+s_xnor_b64 s[10:11], s[102:103], s[4:5]
+// CHECK: [0x66,0x04,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], flat_scratch, s[0:1]
-// CHECK: [0x68,0x00,0x80,0x8e]
+s_xnor_b64 s[10:11], flat_scratch, s[4:5]
+// CHECK: [0x68,0x04,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x8e]
+s_xnor_b64 s[10:11], vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x8e]
+s_xnor_b64 s[10:11], tba, s[4:5]
+// CHECK: [0x6c,0x04,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x8e]
+s_xnor_b64 s[10:11], tma, s[4:5]
+// CHECK: [0x6e,0x04,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x8e]
+s_xnor_b64 s[10:11], ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x8e]
+s_xnor_b64 s[10:11], exec, s[4:5]
+// CHECK: [0x7e,0x04,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], 0, s[0:1]
-// CHECK: [0x80,0x00,0x80,0x8e]
+s_xnor_b64 s[10:11], 0, s[4:5]
+// CHECK: [0x80,0x04,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], -1, s[0:1]
-// CHECK: [0xc1,0x00,0x80,0x8e]
+s_xnor_b64 s[10:11], -1, s[4:5]
+// CHECK: [0xc1,0x04,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x80,0x8e]
+s_xnor_b64 s[10:11], 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x80,0x8e]
+s_xnor_b64 s[10:11], -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x8e,0x56,0x34,0x12,0xaf]
+s_xnor_b64 s[10:11], 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x8e,0x56,0x34,0x12,0xaf]
 
-s_xnor_b64 s[0:1], 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x8e,0x73,0x72,0x71,0x3f]
+s_xnor_b64 s[10:11], 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x8e,0x73,0x72,0x71,0x3f]
 
-s_xnor_b64 s[0:1], s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x8e]
+s_xnor_b64 s[10:11], s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], s[0:1], s[102:103]
-// CHECK: [0x00,0x66,0x80,0x8e]
+s_xnor_b64 s[10:11], s[2:3], s[102:103]
+// CHECK: [0x02,0x66,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], s[0:1], flat_scratch
-// CHECK: [0x00,0x68,0x80,0x8e]
+s_xnor_b64 s[10:11], s[2:3], flat_scratch
+// CHECK: [0x02,0x68,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x8e]
+s_xnor_b64 s[10:11], s[2:3], vcc
+// CHECK: [0x02,0x6a,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x8e]
+s_xnor_b64 s[10:11], s[2:3], tba
+// CHECK: [0x02,0x6c,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x8e]
+s_xnor_b64 s[10:11], s[2:3], tma
+// CHECK: [0x02,0x6e,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x8e]
+s_xnor_b64 s[10:11], s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x8e]
+s_xnor_b64 s[10:11], s[2:3], exec
+// CHECK: [0x02,0x7e,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x8e]
+s_xnor_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x8e]
+s_xnor_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x8e]
+s_xnor_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x8e]
+s_xnor_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x8e]
 
-s_xnor_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x8e,0x56,0x34,0x12,0xaf]
+s_xnor_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x8e,0x56,0x34,0x12,0xaf]
 
-s_xnor_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x8e,0x73,0x72,0x71,0x3f]
+s_xnor_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x8e,0x73,0x72,0x71,0x3f]
 
-s_lshl_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x8f]
+s_lshl_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x8f]
 
-s_lshl_b32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x8f]
+s_lshl_b32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x8f]
 
-s_lshl_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x8f]
+s_lshl_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x8f]
 
-s_lshl_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x8f]
+s_lshl_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x8f]
 
-s_lshl_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x8f]
+s_lshl_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x8f]
 
-s_lshl_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x8f]
+s_lshl_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x8f]
 
-s_lshl_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x8f]
+s_lshl_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x8f]
 
-s_lshl_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x8f]
+s_lshl_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x8f]
 
-s_lshl_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x8f]
+s_lshl_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x8f]
 
-s_lshl_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x8f]
+s_lshl_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x8f]
 
-s_lshl_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x8f]
+s_lshl_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x8f]
 
-s_lshl_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x8f]
+s_lshl_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x8f]
 
-s_lshl_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x8f]
+s_lshl_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x8f]
 
-s_lshl_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x8f]
+s_lshl_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x8f]
 
-s_lshl_b32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x8f]
+s_lshl_b32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x8f]
 
-s_lshl_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x8f]
+s_lshl_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x8f]
 
-s_lshl_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x8f]
+s_lshl_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x8f]
 
-s_lshl_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x8f]
+s_lshl_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x8f]
 
-s_lshl_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x8f]
+s_lshl_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x8f]
 
-s_lshl_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x8f]
+s_lshl_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x8f]
 
-s_lshl_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x8f]
+s_lshl_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x8f]
 
-s_lshl_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x8f]
+s_lshl_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x8f]
 
-s_lshl_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x8f]
+s_lshl_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x8f]
 
-s_lshl_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x8f]
+s_lshl_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x8f]
 
-s_lshl_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x8f]
+s_lshl_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x8f]
 
-s_lshl_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x8f]
+s_lshl_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x8f]
 
-s_lshl_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x8f]
+s_lshl_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x8f]
 
-s_lshl_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x8f]
+s_lshl_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x8f]
 
-s_lshl_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x8f]
+s_lshl_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x8f]
 
-s_lshl_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x8f]
+s_lshl_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x8f]
 
-s_lshl_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x8f]
+s_lshl_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x8f]
 
-s_lshl_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x8f,0x56,0x34,0x12,0xaf]
+s_lshl_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x8f,0x56,0x34,0x12,0xaf]
 
-s_lshl_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x8f,0x73,0x72,0x71,0x3f]
+s_lshl_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x8f,0x73,0x72,0x71,0x3f]
 
-s_lshl_b32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x8f]
+s_lshl_b32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x8f]
 
-s_lshl_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x8f]
+s_lshl_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x8f]
 
-s_lshl_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x8f]
+s_lshl_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x8f]
 
-s_lshl_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x8f]
+s_lshl_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x8f]
 
-s_lshl_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x8f]
+s_lshl_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x8f]
 
-s_lshl_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x8f]
+s_lshl_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x8f]
 
-s_lshl_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x8f]
+s_lshl_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x8f]
 
-s_lshl_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x8f]
+s_lshl_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x8f]
 
-s_lshl_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x8f]
+s_lshl_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x8f]
 
-s_lshl_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x8f]
+s_lshl_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x8f]
 
-s_lshl_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x8f]
+s_lshl_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x8f]
 
-s_lshl_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x8f]
+s_lshl_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x8f]
 
-s_lshl_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x8f]
+s_lshl_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x8f]
 
-s_lshl_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x8f]
+s_lshl_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x8f]
 
-s_lshl_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x8f]
+s_lshl_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x8f]
 
-s_lshl_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x8f]
+s_lshl_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x8f]
 
-s_lshl_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x8f]
+s_lshl_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x8f]
 
-s_lshl_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x8f,0x56,0x34,0x12,0xaf]
+s_lshl_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x8f,0x56,0x34,0x12,0xaf]
 
-s_lshl_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x8f,0x73,0x72,0x71,0x3f]
+s_lshl_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x8f,0x73,0x72,0x71,0x3f]
 
-s_lshl_b64 s[0:1], s[0:1], s0
-// CHECK: [0x00,0x00,0x80,0x8f]
+s_lshl_b64 s[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0x8a,0x8f]
 
-s_lshl_b64 s[2:3], s[0:1], s0
-// CHECK: [0x00,0x00,0x82,0x8f]
+s_lshl_b64 s[12:13], s[2:3], s2
+// CHECK: [0x02,0x02,0x8c,0x8f]
 
-s_lshl_b64 s[102:103], s[0:1], s0
-// CHECK: [0x00,0x00,0xe6,0x8f]
+s_lshl_b64 s[102:103], s[2:3], s2
+// CHECK: [0x02,0x02,0xe6,0x8f]
 
-s_lshl_b64 flat_scratch, s[0:1], s0
-// CHECK: [0x00,0x00,0xe8,0x8f]
+s_lshl_b64 flat_scratch, s[2:3], s2
+// CHECK: [0x02,0x02,0xe8,0x8f]
 
-s_lshl_b64 vcc, s[0:1], s0
-// CHECK: [0x00,0x00,0xea,0x8f]
+s_lshl_b64 vcc, s[2:3], s2
+// CHECK: [0x02,0x02,0xea,0x8f]
 
-s_lshl_b64 tba, s[0:1], s0
-// CHECK: [0x00,0x00,0xec,0x8f]
+s_lshl_b64 tba, s[2:3], s2
+// CHECK: [0x02,0x02,0xec,0x8f]
 
-s_lshl_b64 tma, s[0:1], s0
-// CHECK: [0x00,0x00,0xee,0x8f]
+s_lshl_b64 tma, s[2:3], s2
+// CHECK: [0x02,0x02,0xee,0x8f]
 
-s_lshl_b64 ttmp[10:11], s[0:1], s0
-// CHECK: [0x00,0x00,0xfa,0x8f]
+s_lshl_b64 ttmp[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0xfa,0x8f]
 
-s_lshl_b64 exec, s[0:1], s0
-// CHECK: [0x00,0x00,0xfe,0x8f]
+s_lshl_b64 exec, s[2:3], s2
+// CHECK: [0x02,0x02,0xfe,0x8f]
 
-s_lshl_b64 s[0:1], s[2:3], s0
-// CHECK: [0x02,0x00,0x80,0x8f]
+s_lshl_b64 s[10:11], s[4:5], s2
+// CHECK: [0x04,0x02,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], s[102:103], s0
-// CHECK: [0x66,0x00,0x80,0x8f]
+s_lshl_b64 s[10:11], s[102:103], s2
+// CHECK: [0x66,0x02,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], flat_scratch, s0
-// CHECK: [0x68,0x00,0x80,0x8f]
+s_lshl_b64 s[10:11], flat_scratch, s2
+// CHECK: [0x68,0x02,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], vcc, s0
-// CHECK: [0x6a,0x00,0x80,0x8f]
+s_lshl_b64 s[10:11], vcc, s2
+// CHECK: [0x6a,0x02,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], tba, s0
-// CHECK: [0x6c,0x00,0x80,0x8f]
+s_lshl_b64 s[10:11], tba, s2
+// CHECK: [0x6c,0x02,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], tma, s0
-// CHECK: [0x6e,0x00,0x80,0x8f]
+s_lshl_b64 s[10:11], tma, s2
+// CHECK: [0x6e,0x02,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], ttmp[10:11], s0
-// CHECK: [0x7a,0x00,0x80,0x8f]
+s_lshl_b64 s[10:11], ttmp[10:11], s2
+// CHECK: [0x7a,0x02,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], exec, s0
-// CHECK: [0x7e,0x00,0x80,0x8f]
+s_lshl_b64 s[10:11], exec, s2
+// CHECK: [0x7e,0x02,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], 0, s0
-// CHECK: [0x80,0x00,0x80,0x8f]
+s_lshl_b64 s[10:11], 0, s2
+// CHECK: [0x80,0x02,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], -1, s0
-// CHECK: [0xc1,0x00,0x80,0x8f]
+s_lshl_b64 s[10:11], -1, s2
+// CHECK: [0xc1,0x02,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x8f]
+s_lshl_b64 s[10:11], 0.5, s2
+// CHECK: [0xf0,0x02,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x8f]
+s_lshl_b64 s[10:11], -4.0, s2
+// CHECK: [0xf7,0x02,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x8f,0x56,0x34,0x12,0xaf]
+s_lshl_b64 s[10:11], 0xaf123456, s2
+// CHECK: [0xff,0x02,0x8a,0x8f,0x56,0x34,0x12,0xaf]
 
-s_lshl_b64 s[0:1], 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x8f,0x73,0x72,0x71,0x3f]
+s_lshl_b64 s[10:11], 0x3f717273, s2
+// CHECK: [0xff,0x02,0x8a,0x8f,0x73,0x72,0x71,0x3f]
 
-s_lshl_b64 s[0:1], s[0:1], s103
-// CHECK: [0x00,0x67,0x80,0x8f]
+s_lshl_b64 s[10:11], s[2:3], s103
+// CHECK: [0x02,0x67,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], s[0:1], flat_scratch_lo
-// CHECK: [0x00,0x68,0x80,0x8f]
+s_lshl_b64 s[10:11], s[2:3], flat_scratch_lo
+// CHECK: [0x02,0x68,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], s[0:1], flat_scratch_hi
-// CHECK: [0x00,0x69,0x80,0x8f]
+s_lshl_b64 s[10:11], s[2:3], flat_scratch_hi
+// CHECK: [0x02,0x69,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], s[0:1], vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x8f]
+s_lshl_b64 s[10:11], s[2:3], vcc_lo
+// CHECK: [0x02,0x6a,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], s[0:1], vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x8f]
+s_lshl_b64 s[10:11], s[2:3], vcc_hi
+// CHECK: [0x02,0x6b,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], s[0:1], tba_lo
-// CHECK: [0x00,0x6c,0x80,0x8f]
+s_lshl_b64 s[10:11], s[2:3], tba_lo
+// CHECK: [0x02,0x6c,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], s[0:1], tba_hi
-// CHECK: [0x00,0x6d,0x80,0x8f]
+s_lshl_b64 s[10:11], s[2:3], tba_hi
+// CHECK: [0x02,0x6d,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], s[0:1], tma_lo
-// CHECK: [0x00,0x6e,0x80,0x8f]
+s_lshl_b64 s[10:11], s[2:3], tma_lo
+// CHECK: [0x02,0x6e,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], s[0:1], tma_hi
-// CHECK: [0x00,0x6f,0x80,0x8f]
+s_lshl_b64 s[10:11], s[2:3], tma_hi
+// CHECK: [0x02,0x6f,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], s[0:1], ttmp11
-// CHECK: [0x00,0x7b,0x80,0x8f]
+s_lshl_b64 s[10:11], s[2:3], ttmp11
+// CHECK: [0x02,0x7b,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], s[0:1], m0
-// CHECK: [0x00,0x7c,0x80,0x8f]
+s_lshl_b64 s[10:11], s[2:3], m0
+// CHECK: [0x02,0x7c,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], s[0:1], exec_lo
-// CHECK: [0x00,0x7e,0x80,0x8f]
+s_lshl_b64 s[10:11], s[2:3], exec_lo
+// CHECK: [0x02,0x7e,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], s[0:1], exec_hi
-// CHECK: [0x00,0x7f,0x80,0x8f]
+s_lshl_b64 s[10:11], s[2:3], exec_hi
+// CHECK: [0x02,0x7f,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x8f]
+s_lshl_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x8f]
+s_lshl_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x8f]
+s_lshl_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x8f]
+s_lshl_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x8f]
 
-s_lshl_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x8f,0x56,0x34,0x12,0xaf]
+s_lshl_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x8f,0x56,0x34,0x12,0xaf]
 
-s_lshl_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x8f,0x73,0x72,0x71,0x3f]
+s_lshl_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x8f,0x73,0x72,0x71,0x3f]
 
-s_lshr_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x90]
+s_lshr_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x90]
 
-s_lshr_b32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x90]
+s_lshr_b32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x90]
 
-s_lshr_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x90]
+s_lshr_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x90]
 
-s_lshr_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x90]
+s_lshr_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x90]
 
-s_lshr_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x90]
+s_lshr_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x90]
 
-s_lshr_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x90]
+s_lshr_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x90]
 
-s_lshr_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x90]
+s_lshr_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x90]
 
-s_lshr_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x90]
+s_lshr_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x90]
 
-s_lshr_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x90]
+s_lshr_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x90]
 
-s_lshr_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x90]
+s_lshr_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x90]
 
-s_lshr_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x90]
+s_lshr_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x90]
 
-s_lshr_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x90]
+s_lshr_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x90]
 
-s_lshr_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x90]
+s_lshr_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x90]
 
-s_lshr_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x90]
+s_lshr_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x90]
 
-s_lshr_b32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x90]
+s_lshr_b32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x90]
 
-s_lshr_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x90]
+s_lshr_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x90]
 
-s_lshr_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x90]
+s_lshr_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x90]
 
-s_lshr_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x90]
+s_lshr_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x90]
 
-s_lshr_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x90]
+s_lshr_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x90]
 
-s_lshr_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x90]
+s_lshr_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x90]
 
-s_lshr_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x90]
+s_lshr_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x90]
 
-s_lshr_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x90]
+s_lshr_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x90]
 
-s_lshr_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x90]
+s_lshr_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x90]
 
-s_lshr_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x90]
+s_lshr_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x90]
 
-s_lshr_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x90]
+s_lshr_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x90]
 
-s_lshr_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x90]
+s_lshr_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x90]
 
-s_lshr_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x90]
+s_lshr_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x90]
 
-s_lshr_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x90]
+s_lshr_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x90]
 
-s_lshr_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x90]
+s_lshr_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x90]
 
-s_lshr_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x90]
+s_lshr_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x90]
 
-s_lshr_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x90]
+s_lshr_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x90]
 
-s_lshr_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x90,0x56,0x34,0x12,0xaf]
+s_lshr_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x90,0x56,0x34,0x12,0xaf]
 
-s_lshr_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x90,0x73,0x72,0x71,0x3f]
+s_lshr_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x90,0x73,0x72,0x71,0x3f]
 
-s_lshr_b32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x90]
+s_lshr_b32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x90]
 
-s_lshr_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x90]
+s_lshr_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x90]
 
-s_lshr_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x90]
+s_lshr_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x90]
 
-s_lshr_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x90]
+s_lshr_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x90]
 
-s_lshr_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x90]
+s_lshr_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x90]
 
-s_lshr_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x90]
+s_lshr_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x90]
 
-s_lshr_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x90]
+s_lshr_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x90]
 
-s_lshr_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x90]
+s_lshr_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x90]
 
-s_lshr_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x90]
+s_lshr_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x90]
 
-s_lshr_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x90]
+s_lshr_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x90]
 
-s_lshr_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x90]
+s_lshr_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x90]
 
-s_lshr_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x90]
+s_lshr_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x90]
 
-s_lshr_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x90]
+s_lshr_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x90]
 
-s_lshr_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x90]
+s_lshr_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x90]
 
-s_lshr_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x90]
+s_lshr_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x90]
 
-s_lshr_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x90]
+s_lshr_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x90]
 
-s_lshr_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x90]
+s_lshr_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x90]
 
-s_lshr_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x90,0x56,0x34,0x12,0xaf]
+s_lshr_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x90,0x56,0x34,0x12,0xaf]
 
-s_lshr_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x90,0x73,0x72,0x71,0x3f]
+s_lshr_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x90,0x73,0x72,0x71,0x3f]
 
-s_lshr_b64 s[0:1], s[0:1], s0
-// CHECK: [0x00,0x00,0x80,0x90]
+s_lshr_b64 s[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0x8a,0x90]
 
-s_lshr_b64 s[2:3], s[0:1], s0
-// CHECK: [0x00,0x00,0x82,0x90]
+s_lshr_b64 s[12:13], s[2:3], s2
+// CHECK: [0x02,0x02,0x8c,0x90]
 
-s_lshr_b64 s[102:103], s[0:1], s0
-// CHECK: [0x00,0x00,0xe6,0x90]
+s_lshr_b64 s[102:103], s[2:3], s2
+// CHECK: [0x02,0x02,0xe6,0x90]
 
-s_lshr_b64 flat_scratch, s[0:1], s0
-// CHECK: [0x00,0x00,0xe8,0x90]
+s_lshr_b64 flat_scratch, s[2:3], s2
+// CHECK: [0x02,0x02,0xe8,0x90]
 
-s_lshr_b64 vcc, s[0:1], s0
-// CHECK: [0x00,0x00,0xea,0x90]
+s_lshr_b64 vcc, s[2:3], s2
+// CHECK: [0x02,0x02,0xea,0x90]
 
-s_lshr_b64 tba, s[0:1], s0
-// CHECK: [0x00,0x00,0xec,0x90]
+s_lshr_b64 tba, s[2:3], s2
+// CHECK: [0x02,0x02,0xec,0x90]
 
-s_lshr_b64 tma, s[0:1], s0
-// CHECK: [0x00,0x00,0xee,0x90]
+s_lshr_b64 tma, s[2:3], s2
+// CHECK: [0x02,0x02,0xee,0x90]
 
-s_lshr_b64 ttmp[10:11], s[0:1], s0
-// CHECK: [0x00,0x00,0xfa,0x90]
+s_lshr_b64 ttmp[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0xfa,0x90]
 
-s_lshr_b64 exec, s[0:1], s0
-// CHECK: [0x00,0x00,0xfe,0x90]
+s_lshr_b64 exec, s[2:3], s2
+// CHECK: [0x02,0x02,0xfe,0x90]
 
-s_lshr_b64 s[0:1], s[2:3], s0
-// CHECK: [0x02,0x00,0x80,0x90]
+s_lshr_b64 s[10:11], s[4:5], s2
+// CHECK: [0x04,0x02,0x8a,0x90]
 
-s_lshr_b64 s[0:1], s[102:103], s0
-// CHECK: [0x66,0x00,0x80,0x90]
+s_lshr_b64 s[10:11], s[102:103], s2
+// CHECK: [0x66,0x02,0x8a,0x90]
 
-s_lshr_b64 s[0:1], flat_scratch, s0
-// CHECK: [0x68,0x00,0x80,0x90]
+s_lshr_b64 s[10:11], flat_scratch, s2
+// CHECK: [0x68,0x02,0x8a,0x90]
 
-s_lshr_b64 s[0:1], vcc, s0
-// CHECK: [0x6a,0x00,0x80,0x90]
+s_lshr_b64 s[10:11], vcc, s2
+// CHECK: [0x6a,0x02,0x8a,0x90]
 
-s_lshr_b64 s[0:1], tba, s0
-// CHECK: [0x6c,0x00,0x80,0x90]
+s_lshr_b64 s[10:11], tba, s2
+// CHECK: [0x6c,0x02,0x8a,0x90]
 
-s_lshr_b64 s[0:1], tma, s0
-// CHECK: [0x6e,0x00,0x80,0x90]
+s_lshr_b64 s[10:11], tma, s2
+// CHECK: [0x6e,0x02,0x8a,0x90]
 
-s_lshr_b64 s[0:1], ttmp[10:11], s0
-// CHECK: [0x7a,0x00,0x80,0x90]
+s_lshr_b64 s[10:11], ttmp[10:11], s2
+// CHECK: [0x7a,0x02,0x8a,0x90]
 
-s_lshr_b64 s[0:1], exec, s0
-// CHECK: [0x7e,0x00,0x80,0x90]
+s_lshr_b64 s[10:11], exec, s2
+// CHECK: [0x7e,0x02,0x8a,0x90]
 
-s_lshr_b64 s[0:1], 0, s0
-// CHECK: [0x80,0x00,0x80,0x90]
+s_lshr_b64 s[10:11], 0, s2
+// CHECK: [0x80,0x02,0x8a,0x90]
 
-s_lshr_b64 s[0:1], -1, s0
-// CHECK: [0xc1,0x00,0x80,0x90]
+s_lshr_b64 s[10:11], -1, s2
+// CHECK: [0xc1,0x02,0x8a,0x90]
 
-s_lshr_b64 s[0:1], 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x90]
+s_lshr_b64 s[10:11], 0.5, s2
+// CHECK: [0xf0,0x02,0x8a,0x90]
 
-s_lshr_b64 s[0:1], -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x90]
+s_lshr_b64 s[10:11], -4.0, s2
+// CHECK: [0xf7,0x02,0x8a,0x90]
 
-s_lshr_b64 s[0:1], 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x90,0x56,0x34,0x12,0xaf]
+s_lshr_b64 s[10:11], 0xaf123456, s2
+// CHECK: [0xff,0x02,0x8a,0x90,0x56,0x34,0x12,0xaf]
 
-s_lshr_b64 s[0:1], 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x90,0x73,0x72,0x71,0x3f]
+s_lshr_b64 s[10:11], 0x3f717273, s2
+// CHECK: [0xff,0x02,0x8a,0x90,0x73,0x72,0x71,0x3f]
 
-s_lshr_b64 s[0:1], s[0:1], s103
-// CHECK: [0x00,0x67,0x80,0x90]
+s_lshr_b64 s[10:11], s[2:3], s103
+// CHECK: [0x02,0x67,0x8a,0x90]
 
-s_lshr_b64 s[0:1], s[0:1], flat_scratch_lo
-// CHECK: [0x00,0x68,0x80,0x90]
+s_lshr_b64 s[10:11], s[2:3], flat_scratch_lo
+// CHECK: [0x02,0x68,0x8a,0x90]
 
-s_lshr_b64 s[0:1], s[0:1], flat_scratch_hi
-// CHECK: [0x00,0x69,0x80,0x90]
+s_lshr_b64 s[10:11], s[2:3], flat_scratch_hi
+// CHECK: [0x02,0x69,0x8a,0x90]
 
-s_lshr_b64 s[0:1], s[0:1], vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x90]
+s_lshr_b64 s[10:11], s[2:3], vcc_lo
+// CHECK: [0x02,0x6a,0x8a,0x90]
 
-s_lshr_b64 s[0:1], s[0:1], vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x90]
+s_lshr_b64 s[10:11], s[2:3], vcc_hi
+// CHECK: [0x02,0x6b,0x8a,0x90]
 
-s_lshr_b64 s[0:1], s[0:1], tba_lo
-// CHECK: [0x00,0x6c,0x80,0x90]
+s_lshr_b64 s[10:11], s[2:3], tba_lo
+// CHECK: [0x02,0x6c,0x8a,0x90]
 
-s_lshr_b64 s[0:1], s[0:1], tba_hi
-// CHECK: [0x00,0x6d,0x80,0x90]
+s_lshr_b64 s[10:11], s[2:3], tba_hi
+// CHECK: [0x02,0x6d,0x8a,0x90]
 
-s_lshr_b64 s[0:1], s[0:1], tma_lo
-// CHECK: [0x00,0x6e,0x80,0x90]
+s_lshr_b64 s[10:11], s[2:3], tma_lo
+// CHECK: [0x02,0x6e,0x8a,0x90]
 
-s_lshr_b64 s[0:1], s[0:1], tma_hi
-// CHECK: [0x00,0x6f,0x80,0x90]
+s_lshr_b64 s[10:11], s[2:3], tma_hi
+// CHECK: [0x02,0x6f,0x8a,0x90]
 
-s_lshr_b64 s[0:1], s[0:1], ttmp11
-// CHECK: [0x00,0x7b,0x80,0x90]
+s_lshr_b64 s[10:11], s[2:3], ttmp11
+// CHECK: [0x02,0x7b,0x8a,0x90]
 
-s_lshr_b64 s[0:1], s[0:1], m0
-// CHECK: [0x00,0x7c,0x80,0x90]
+s_lshr_b64 s[10:11], s[2:3], m0
+// CHECK: [0x02,0x7c,0x8a,0x90]
 
-s_lshr_b64 s[0:1], s[0:1], exec_lo
-// CHECK: [0x00,0x7e,0x80,0x90]
+s_lshr_b64 s[10:11], s[2:3], exec_lo
+// CHECK: [0x02,0x7e,0x8a,0x90]
 
-s_lshr_b64 s[0:1], s[0:1], exec_hi
-// CHECK: [0x00,0x7f,0x80,0x90]
+s_lshr_b64 s[10:11], s[2:3], exec_hi
+// CHECK: [0x02,0x7f,0x8a,0x90]
 
-s_lshr_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x90]
+s_lshr_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x90]
 
-s_lshr_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x90]
+s_lshr_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x90]
 
-s_lshr_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x90]
+s_lshr_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x90]
 
-s_lshr_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x90]
+s_lshr_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x90]
 
-s_lshr_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x90,0x56,0x34,0x12,0xaf]
+s_lshr_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x90,0x56,0x34,0x12,0xaf]
 
-s_lshr_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x90,0x73,0x72,0x71,0x3f]
+s_lshr_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x90,0x73,0x72,0x71,0x3f]
 
-s_ashr_i32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x91]
+s_ashr_i32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x91]
 
-s_ashr_i32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x91]
+s_ashr_i32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x91]
 
-s_ashr_i32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x91]
+s_ashr_i32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x91]
 
-s_ashr_i32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x91]
+s_ashr_i32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x91]
 
-s_ashr_i32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x91]
+s_ashr_i32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x91]
 
-s_ashr_i32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x91]
+s_ashr_i32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x91]
 
-s_ashr_i32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x91]
+s_ashr_i32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x91]
 
-s_ashr_i32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x91]
+s_ashr_i32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x91]
 
-s_ashr_i32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x91]
+s_ashr_i32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x91]
 
-s_ashr_i32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x91]
+s_ashr_i32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x91]
 
-s_ashr_i32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x91]
+s_ashr_i32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x91]
 
-s_ashr_i32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x91]
+s_ashr_i32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x91]
 
-s_ashr_i32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x91]
+s_ashr_i32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x91]
 
-s_ashr_i32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x91]
+s_ashr_i32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x91]
 
-s_ashr_i32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x91]
+s_ashr_i32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x91]
 
-s_ashr_i32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x91]
+s_ashr_i32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x91]
 
-s_ashr_i32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x91]
+s_ashr_i32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x91]
 
-s_ashr_i32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x91]
+s_ashr_i32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x91]
 
-s_ashr_i32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x91]
+s_ashr_i32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x91]
 
-s_ashr_i32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x91]
+s_ashr_i32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x91]
 
-s_ashr_i32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x91]
+s_ashr_i32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x91]
 
-s_ashr_i32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x91]
+s_ashr_i32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x91]
 
-s_ashr_i32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x91]
+s_ashr_i32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x91]
 
-s_ashr_i32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x91]
+s_ashr_i32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x91]
 
-s_ashr_i32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x91]
+s_ashr_i32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x91]
 
-s_ashr_i32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x91]
+s_ashr_i32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x91]
 
-s_ashr_i32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x91]
+s_ashr_i32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x91]
 
-s_ashr_i32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x91]
+s_ashr_i32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x91]
 
-s_ashr_i32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x91]
+s_ashr_i32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x91]
 
-s_ashr_i32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x91]
+s_ashr_i32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x91]
 
-s_ashr_i32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x91]
+s_ashr_i32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x91]
 
-s_ashr_i32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x91,0x56,0x34,0x12,0xaf]
+s_ashr_i32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x91,0x56,0x34,0x12,0xaf]
 
-s_ashr_i32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x91,0x73,0x72,0x71,0x3f]
+s_ashr_i32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x91,0x73,0x72,0x71,0x3f]
 
-s_ashr_i32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x91]
+s_ashr_i32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x91]
 
-s_ashr_i32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x91]
+s_ashr_i32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x91]
 
-s_ashr_i32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x91]
+s_ashr_i32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x91]
 
-s_ashr_i32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x91]
+s_ashr_i32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x91]
 
-s_ashr_i32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x91]
+s_ashr_i32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x91]
 
-s_ashr_i32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x91]
+s_ashr_i32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x91]
 
-s_ashr_i32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x91]
+s_ashr_i32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x91]
 
-s_ashr_i32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x91]
+s_ashr_i32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x91]
 
-s_ashr_i32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x91]
+s_ashr_i32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x91]
 
-s_ashr_i32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x91]
+s_ashr_i32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x91]
 
-s_ashr_i32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x91]
+s_ashr_i32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x91]
 
-s_ashr_i32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x91]
+s_ashr_i32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x91]
 
-s_ashr_i32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x91]
+s_ashr_i32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x91]
 
-s_ashr_i32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x91]
+s_ashr_i32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x91]
 
-s_ashr_i32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x91]
+s_ashr_i32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x91]
 
-s_ashr_i32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x91]
+s_ashr_i32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x91]
 
-s_ashr_i32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x91]
+s_ashr_i32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x91]
 
-s_ashr_i32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x91,0x56,0x34,0x12,0xaf]
+s_ashr_i32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x91,0x56,0x34,0x12,0xaf]
 
-s_ashr_i32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x91,0x73,0x72,0x71,0x3f]
+s_ashr_i32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x91,0x73,0x72,0x71,0x3f]
 
-s_ashr_i64 s[0:1], s[0:1], s0
-// CHECK: [0x00,0x00,0x80,0x91]
+s_ashr_i64 s[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0x8a,0x91]
 
-s_ashr_i64 s[2:3], s[0:1], s0
-// CHECK: [0x00,0x00,0x82,0x91]
+s_ashr_i64 s[12:13], s[2:3], s2
+// CHECK: [0x02,0x02,0x8c,0x91]
 
-s_ashr_i64 s[102:103], s[0:1], s0
-// CHECK: [0x00,0x00,0xe6,0x91]
+s_ashr_i64 s[102:103], s[2:3], s2
+// CHECK: [0x02,0x02,0xe6,0x91]
 
-s_ashr_i64 flat_scratch, s[0:1], s0
-// CHECK: [0x00,0x00,0xe8,0x91]
+s_ashr_i64 flat_scratch, s[2:3], s2
+// CHECK: [0x02,0x02,0xe8,0x91]
 
-s_ashr_i64 vcc, s[0:1], s0
-// CHECK: [0x00,0x00,0xea,0x91]
+s_ashr_i64 vcc, s[2:3], s2
+// CHECK: [0x02,0x02,0xea,0x91]
 
-s_ashr_i64 tba, s[0:1], s0
-// CHECK: [0x00,0x00,0xec,0x91]
+s_ashr_i64 tba, s[2:3], s2
+// CHECK: [0x02,0x02,0xec,0x91]
 
-s_ashr_i64 tma, s[0:1], s0
-// CHECK: [0x00,0x00,0xee,0x91]
+s_ashr_i64 tma, s[2:3], s2
+// CHECK: [0x02,0x02,0xee,0x91]
 
-s_ashr_i64 ttmp[10:11], s[0:1], s0
-// CHECK: [0x00,0x00,0xfa,0x91]
+s_ashr_i64 ttmp[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0xfa,0x91]
 
-s_ashr_i64 exec, s[0:1], s0
-// CHECK: [0x00,0x00,0xfe,0x91]
+s_ashr_i64 exec, s[2:3], s2
+// CHECK: [0x02,0x02,0xfe,0x91]
 
-s_ashr_i64 s[0:1], s[2:3], s0
-// CHECK: [0x02,0x00,0x80,0x91]
+s_ashr_i64 s[10:11], s[4:5], s2
+// CHECK: [0x04,0x02,0x8a,0x91]
 
-s_ashr_i64 s[0:1], s[102:103], s0
-// CHECK: [0x66,0x00,0x80,0x91]
+s_ashr_i64 s[10:11], s[102:103], s2
+// CHECK: [0x66,0x02,0x8a,0x91]
 
-s_ashr_i64 s[0:1], flat_scratch, s0
-// CHECK: [0x68,0x00,0x80,0x91]
+s_ashr_i64 s[10:11], flat_scratch, s2
+// CHECK: [0x68,0x02,0x8a,0x91]
 
-s_ashr_i64 s[0:1], vcc, s0
-// CHECK: [0x6a,0x00,0x80,0x91]
+s_ashr_i64 s[10:11], vcc, s2
+// CHECK: [0x6a,0x02,0x8a,0x91]
 
-s_ashr_i64 s[0:1], tba, s0
-// CHECK: [0x6c,0x00,0x80,0x91]
+s_ashr_i64 s[10:11], tba, s2
+// CHECK: [0x6c,0x02,0x8a,0x91]
 
-s_ashr_i64 s[0:1], tma, s0
-// CHECK: [0x6e,0x00,0x80,0x91]
+s_ashr_i64 s[10:11], tma, s2
+// CHECK: [0x6e,0x02,0x8a,0x91]
 
-s_ashr_i64 s[0:1], ttmp[10:11], s0
-// CHECK: [0x7a,0x00,0x80,0x91]
+s_ashr_i64 s[10:11], ttmp[10:11], s2
+// CHECK: [0x7a,0x02,0x8a,0x91]
 
-s_ashr_i64 s[0:1], exec, s0
-// CHECK: [0x7e,0x00,0x80,0x91]
+s_ashr_i64 s[10:11], exec, s2
+// CHECK: [0x7e,0x02,0x8a,0x91]
 
-s_ashr_i64 s[0:1], 0, s0
-// CHECK: [0x80,0x00,0x80,0x91]
+s_ashr_i64 s[10:11], 0, s2
+// CHECK: [0x80,0x02,0x8a,0x91]
 
-s_ashr_i64 s[0:1], -1, s0
-// CHECK: [0xc1,0x00,0x80,0x91]
+s_ashr_i64 s[10:11], -1, s2
+// CHECK: [0xc1,0x02,0x8a,0x91]
 
-s_ashr_i64 s[0:1], 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x91]
+s_ashr_i64 s[10:11], 0.5, s2
+// CHECK: [0xf0,0x02,0x8a,0x91]
 
-s_ashr_i64 s[0:1], -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x91]
+s_ashr_i64 s[10:11], -4.0, s2
+// CHECK: [0xf7,0x02,0x8a,0x91]
 
-s_ashr_i64 s[0:1], 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x91,0x56,0x34,0x12,0xaf]
+s_ashr_i64 s[10:11], 0xaf123456, s2
+// CHECK: [0xff,0x02,0x8a,0x91,0x56,0x34,0x12,0xaf]
 
-s_ashr_i64 s[0:1], 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x91,0x73,0x72,0x71,0x3f]
+s_ashr_i64 s[10:11], 0x3f717273, s2
+// CHECK: [0xff,0x02,0x8a,0x91,0x73,0x72,0x71,0x3f]
 
-s_ashr_i64 s[0:1], s[0:1], s103
-// CHECK: [0x00,0x67,0x80,0x91]
+s_ashr_i64 s[10:11], s[2:3], s103
+// CHECK: [0x02,0x67,0x8a,0x91]
 
-s_ashr_i64 s[0:1], s[0:1], flat_scratch_lo
-// CHECK: [0x00,0x68,0x80,0x91]
+s_ashr_i64 s[10:11], s[2:3], flat_scratch_lo
+// CHECK: [0x02,0x68,0x8a,0x91]
 
-s_ashr_i64 s[0:1], s[0:1], flat_scratch_hi
-// CHECK: [0x00,0x69,0x80,0x91]
+s_ashr_i64 s[10:11], s[2:3], flat_scratch_hi
+// CHECK: [0x02,0x69,0x8a,0x91]
 
-s_ashr_i64 s[0:1], s[0:1], vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x91]
+s_ashr_i64 s[10:11], s[2:3], vcc_lo
+// CHECK: [0x02,0x6a,0x8a,0x91]
 
-s_ashr_i64 s[0:1], s[0:1], vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x91]
+s_ashr_i64 s[10:11], s[2:3], vcc_hi
+// CHECK: [0x02,0x6b,0x8a,0x91]
 
-s_ashr_i64 s[0:1], s[0:1], tba_lo
-// CHECK: [0x00,0x6c,0x80,0x91]
+s_ashr_i64 s[10:11], s[2:3], tba_lo
+// CHECK: [0x02,0x6c,0x8a,0x91]
 
-s_ashr_i64 s[0:1], s[0:1], tba_hi
-// CHECK: [0x00,0x6d,0x80,0x91]
+s_ashr_i64 s[10:11], s[2:3], tba_hi
+// CHECK: [0x02,0x6d,0x8a,0x91]
 
-s_ashr_i64 s[0:1], s[0:1], tma_lo
-// CHECK: [0x00,0x6e,0x80,0x91]
+s_ashr_i64 s[10:11], s[2:3], tma_lo
+// CHECK: [0x02,0x6e,0x8a,0x91]
 
-s_ashr_i64 s[0:1], s[0:1], tma_hi
-// CHECK: [0x00,0x6f,0x80,0x91]
+s_ashr_i64 s[10:11], s[2:3], tma_hi
+// CHECK: [0x02,0x6f,0x8a,0x91]
 
-s_ashr_i64 s[0:1], s[0:1], ttmp11
-// CHECK: [0x00,0x7b,0x80,0x91]
+s_ashr_i64 s[10:11], s[2:3], ttmp11
+// CHECK: [0x02,0x7b,0x8a,0x91]
 
-s_ashr_i64 s[0:1], s[0:1], m0
-// CHECK: [0x00,0x7c,0x80,0x91]
+s_ashr_i64 s[10:11], s[2:3], m0
+// CHECK: [0x02,0x7c,0x8a,0x91]
 
-s_ashr_i64 s[0:1], s[0:1], exec_lo
-// CHECK: [0x00,0x7e,0x80,0x91]
+s_ashr_i64 s[10:11], s[2:3], exec_lo
+// CHECK: [0x02,0x7e,0x8a,0x91]
 
-s_ashr_i64 s[0:1], s[0:1], exec_hi
-// CHECK: [0x00,0x7f,0x80,0x91]
+s_ashr_i64 s[10:11], s[2:3], exec_hi
+// CHECK: [0x02,0x7f,0x8a,0x91]
 
-s_ashr_i64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x91]
+s_ashr_i64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x91]
 
-s_ashr_i64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x91]
+s_ashr_i64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x91]
 
-s_ashr_i64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x91]
+s_ashr_i64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x91]
 
-s_ashr_i64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x91]
+s_ashr_i64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x91]
 
-s_ashr_i64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x91,0x56,0x34,0x12,0xaf]
+s_ashr_i64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x91,0x56,0x34,0x12,0xaf]
 
-s_ashr_i64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x91,0x73,0x72,0x71,0x3f]
+s_ashr_i64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x91,0x73,0x72,0x71,0x3f]
 
-s_bfm_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x92]
+s_bfm_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x92]
 
-s_bfm_b32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x92]
+s_bfm_b32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x92]
 
-s_bfm_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x92]
+s_bfm_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x92]
 
-s_bfm_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x92]
+s_bfm_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x92]
 
-s_bfm_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x92]
+s_bfm_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x92]
 
-s_bfm_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x92]
+s_bfm_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x92]
 
-s_bfm_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x92]
+s_bfm_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x92]
 
-s_bfm_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x92]
+s_bfm_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x92]
 
-s_bfm_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x92]
+s_bfm_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x92]
 
-s_bfm_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x92]
+s_bfm_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x92]
 
-s_bfm_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x92]
+s_bfm_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x92]
 
-s_bfm_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x92]
+s_bfm_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x92]
 
-s_bfm_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x92]
+s_bfm_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x92]
 
-s_bfm_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x92]
+s_bfm_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x92]
 
-s_bfm_b32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x92]
+s_bfm_b32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x92]
 
-s_bfm_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x92]
+s_bfm_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x92]
 
-s_bfm_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x92]
+s_bfm_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x92]
 
-s_bfm_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x92]
+s_bfm_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x92]
 
-s_bfm_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x92]
+s_bfm_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x92]
 
-s_bfm_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x92]
+s_bfm_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x92]
 
-s_bfm_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x92]
+s_bfm_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x92]
 
-s_bfm_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x92]
+s_bfm_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x92]
 
-s_bfm_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x92]
+s_bfm_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x92]
 
-s_bfm_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x92]
+s_bfm_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x92]
 
-s_bfm_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x92]
+s_bfm_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x92]
 
-s_bfm_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x92]
+s_bfm_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x92]
 
-s_bfm_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x92]
+s_bfm_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x92]
 
-s_bfm_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x92]
+s_bfm_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x92]
 
-s_bfm_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x92]
+s_bfm_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x92]
 
-s_bfm_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x92]
+s_bfm_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x92]
 
-s_bfm_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x92]
+s_bfm_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x92]
 
-s_bfm_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x92,0x56,0x34,0x12,0xaf]
+s_bfm_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x92,0x56,0x34,0x12,0xaf]
 
-s_bfm_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x92,0x73,0x72,0x71,0x3f]
+s_bfm_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x92,0x73,0x72,0x71,0x3f]
 
-s_bfm_b32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x92]
+s_bfm_b32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x92]
 
-s_bfm_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x92]
+s_bfm_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x92]
 
-s_bfm_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x92]
+s_bfm_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x92]
 
-s_bfm_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x92]
+s_bfm_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x92]
 
-s_bfm_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x92]
+s_bfm_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x92]
 
-s_bfm_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x92]
+s_bfm_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x92]
 
-s_bfm_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x92]
+s_bfm_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x92]
 
-s_bfm_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x92]
+s_bfm_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x92]
 
-s_bfm_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x92]
+s_bfm_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x92]
 
-s_bfm_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x92]
+s_bfm_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x92]
 
-s_bfm_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x92]
+s_bfm_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x92]
 
-s_bfm_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x92]
+s_bfm_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x92]
 
-s_bfm_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x92]
+s_bfm_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x92]
 
-s_bfm_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x92]
+s_bfm_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x92]
 
-s_bfm_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x92]
+s_bfm_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x92]
 
-s_bfm_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x92]
+s_bfm_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x92]
 
-s_bfm_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x92]
+s_bfm_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x92]
 
-s_bfm_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x92,0x56,0x34,0x12,0xaf]
+s_bfm_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x92,0x56,0x34,0x12,0xaf]
 
-s_bfm_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x92,0x73,0x72,0x71,0x3f]
+s_bfm_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x92,0x73,0x72,0x71,0x3f]
 
-s_bfm_b64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x80,0x92]
+s_bfm_b64 s[10:11], s1, s2
+// CHECK: [0x01,0x02,0x8a,0x92]
 
-s_bfm_b64 s[2:3], s0, s0
-// CHECK: [0x00,0x00,0x82,0x92]
+s_bfm_b64 s[12:13], s1, s2
+// CHECK: [0x01,0x02,0x8c,0x92]
 
-s_bfm_b64 s[102:103], s0, s0
-// CHECK: [0x00,0x00,0xe6,0x92]
+s_bfm_b64 s[102:103], s1, s2
+// CHECK: [0x01,0x02,0xe6,0x92]
 
-s_bfm_b64 flat_scratch, s0, s0
-// CHECK: [0x00,0x00,0xe8,0x92]
+s_bfm_b64 flat_scratch, s1, s2
+// CHECK: [0x01,0x02,0xe8,0x92]
 
-s_bfm_b64 vcc, s0, s0
-// CHECK: [0x00,0x00,0xea,0x92]
+s_bfm_b64 vcc, s1, s2
+// CHECK: [0x01,0x02,0xea,0x92]
 
-s_bfm_b64 tba, s0, s0
-// CHECK: [0x00,0x00,0xec,0x92]
+s_bfm_b64 tba, s1, s2
+// CHECK: [0x01,0x02,0xec,0x92]
 
-s_bfm_b64 tma, s0, s0
-// CHECK: [0x00,0x00,0xee,0x92]
+s_bfm_b64 tma, s1, s2
+// CHECK: [0x01,0x02,0xee,0x92]
 
-s_bfm_b64 ttmp[10:11], s0, s0
-// CHECK: [0x00,0x00,0xfa,0x92]
+s_bfm_b64 ttmp[10:11], s1, s2
+// CHECK: [0x01,0x02,0xfa,0x92]
 
-s_bfm_b64 exec, s0, s0
-// CHECK: [0x00,0x00,0xfe,0x92]
+s_bfm_b64 exec, s1, s2
+// CHECK: [0x01,0x02,0xfe,0x92]
 
-s_bfm_b64 s[0:1], s103, s0
-// CHECK: [0x67,0x00,0x80,0x92]
+s_bfm_b64 s[10:11], s103, s2
+// CHECK: [0x67,0x02,0x8a,0x92]
 
-s_bfm_b64 s[0:1], flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x80,0x92]
+s_bfm_b64 s[10:11], flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x8a,0x92]
 
-s_bfm_b64 s[0:1], flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x80,0x92]
+s_bfm_b64 s[10:11], flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x8a,0x92]
 
-s_bfm_b64 s[0:1], vcc_lo, s0
-// CHECK: [0x6a,0x00,0x80,0x92]
+s_bfm_b64 s[10:11], vcc_lo, s2
+// CHECK: [0x6a,0x02,0x8a,0x92]
 
-s_bfm_b64 s[0:1], vcc_hi, s0
-// CHECK: [0x6b,0x00,0x80,0x92]
+s_bfm_b64 s[10:11], vcc_hi, s2
+// CHECK: [0x6b,0x02,0x8a,0x92]
 
-s_bfm_b64 s[0:1], tba_lo, s0
-// CHECK: [0x6c,0x00,0x80,0x92]
+s_bfm_b64 s[10:11], tba_lo, s2
+// CHECK: [0x6c,0x02,0x8a,0x92]
 
-s_bfm_b64 s[0:1], tba_hi, s0
-// CHECK: [0x6d,0x00,0x80,0x92]
+s_bfm_b64 s[10:11], tba_hi, s2
+// CHECK: [0x6d,0x02,0x8a,0x92]
 
-s_bfm_b64 s[0:1], tma_lo, s0
-// CHECK: [0x6e,0x00,0x80,0x92]
+s_bfm_b64 s[10:11], tma_lo, s2
+// CHECK: [0x6e,0x02,0x8a,0x92]
 
-s_bfm_b64 s[0:1], tma_hi, s0
-// CHECK: [0x6f,0x00,0x80,0x92]
+s_bfm_b64 s[10:11], tma_hi, s2
+// CHECK: [0x6f,0x02,0x8a,0x92]
 
-s_bfm_b64 s[0:1], ttmp11, s0
-// CHECK: [0x7b,0x00,0x80,0x92]
+s_bfm_b64 s[10:11], ttmp11, s2
+// CHECK: [0x7b,0x02,0x8a,0x92]
 
-s_bfm_b64 s[0:1], m0, s0
-// CHECK: [0x7c,0x00,0x80,0x92]
+s_bfm_b64 s[10:11], m0, s2
+// CHECK: [0x7c,0x02,0x8a,0x92]
 
-s_bfm_b64 s[0:1], exec_lo, s0
-// CHECK: [0x7e,0x00,0x80,0x92]
+s_bfm_b64 s[10:11], exec_lo, s2
+// CHECK: [0x7e,0x02,0x8a,0x92]
 
-s_bfm_b64 s[0:1], exec_hi, s0
-// CHECK: [0x7f,0x00,0x80,0x92]
+s_bfm_b64 s[10:11], exec_hi, s2
+// CHECK: [0x7f,0x02,0x8a,0x92]
 
-s_bfm_b64 s[0:1], 0, s0
-// CHECK: [0x80,0x00,0x80,0x92]
+s_bfm_b64 s[10:11], 0, s2
+// CHECK: [0x80,0x02,0x8a,0x92]
 
-s_bfm_b64 s[0:1], -1, s0
-// CHECK: [0xc1,0x00,0x80,0x92]
+s_bfm_b64 s[10:11], -1, s2
+// CHECK: [0xc1,0x02,0x8a,0x92]
 
-s_bfm_b64 s[0:1], 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x92]
+s_bfm_b64 s[10:11], 0.5, s2
+// CHECK: [0xf0,0x02,0x8a,0x92]
 
-s_bfm_b64 s[0:1], -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x92]
+s_bfm_b64 s[10:11], -4.0, s2
+// CHECK: [0xf7,0x02,0x8a,0x92]
 
-s_bfm_b64 s[0:1], 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x92,0x56,0x34,0x12,0xaf]
+s_bfm_b64 s[10:11], 0xaf123456, s2
+// CHECK: [0xff,0x02,0x8a,0x92,0x56,0x34,0x12,0xaf]
 
-s_bfm_b64 s[0:1], 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x92,0x73,0x72,0x71,0x3f]
+s_bfm_b64 s[10:11], 0x3f717273, s2
+// CHECK: [0xff,0x02,0x8a,0x92,0x73,0x72,0x71,0x3f]
 
-s_bfm_b64 s[0:1], s0, s103
-// CHECK: [0x00,0x67,0x80,0x92]
+s_bfm_b64 s[10:11], s1, s103
+// CHECK: [0x01,0x67,0x8a,0x92]
 
-s_bfm_b64 s[0:1], s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x80,0x92]
+s_bfm_b64 s[10:11], s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x8a,0x92]
 
-s_bfm_b64 s[0:1], s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x80,0x92]
+s_bfm_b64 s[10:11], s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x8a,0x92]
 
-s_bfm_b64 s[0:1], s0, vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x92]
+s_bfm_b64 s[10:11], s1, vcc_lo
+// CHECK: [0x01,0x6a,0x8a,0x92]
 
-s_bfm_b64 s[0:1], s0, vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x92]
+s_bfm_b64 s[10:11], s1, vcc_hi
+// CHECK: [0x01,0x6b,0x8a,0x92]
 
-s_bfm_b64 s[0:1], s0, tba_lo
-// CHECK: [0x00,0x6c,0x80,0x92]
+s_bfm_b64 s[10:11], s1, tba_lo
+// CHECK: [0x01,0x6c,0x8a,0x92]
 
-s_bfm_b64 s[0:1], s0, tba_hi
-// CHECK: [0x00,0x6d,0x80,0x92]
+s_bfm_b64 s[10:11], s1, tba_hi
+// CHECK: [0x01,0x6d,0x8a,0x92]
 
-s_bfm_b64 s[0:1], s0, tma_lo
-// CHECK: [0x00,0x6e,0x80,0x92]
+s_bfm_b64 s[10:11], s1, tma_lo
+// CHECK: [0x01,0x6e,0x8a,0x92]
 
-s_bfm_b64 s[0:1], s0, tma_hi
-// CHECK: [0x00,0x6f,0x80,0x92]
+s_bfm_b64 s[10:11], s1, tma_hi
+// CHECK: [0x01,0x6f,0x8a,0x92]
 
-s_bfm_b64 s[0:1], s0, ttmp11
-// CHECK: [0x00,0x7b,0x80,0x92]
+s_bfm_b64 s[10:11], s1, ttmp11
+// CHECK: [0x01,0x7b,0x8a,0x92]
 
-s_bfm_b64 s[0:1], s0, m0
-// CHECK: [0x00,0x7c,0x80,0x92]
+s_bfm_b64 s[10:11], s1, m0
+// CHECK: [0x01,0x7c,0x8a,0x92]
 
-s_bfm_b64 s[0:1], s0, exec_lo
-// CHECK: [0x00,0x7e,0x80,0x92]
+s_bfm_b64 s[10:11], s1, exec_lo
+// CHECK: [0x01,0x7e,0x8a,0x92]
 
-s_bfm_b64 s[0:1], s0, exec_hi
-// CHECK: [0x00,0x7f,0x80,0x92]
+s_bfm_b64 s[10:11], s1, exec_hi
+// CHECK: [0x01,0x7f,0x8a,0x92]
 
-s_bfm_b64 s[0:1], s0, 0
-// CHECK: [0x00,0x80,0x80,0x92]
+s_bfm_b64 s[10:11], s1, 0
+// CHECK: [0x01,0x80,0x8a,0x92]
 
-s_bfm_b64 s[0:1], s0, -1
-// CHECK: [0x00,0xc1,0x80,0x92]
+s_bfm_b64 s[10:11], s1, -1
+// CHECK: [0x01,0xc1,0x8a,0x92]
 
-s_bfm_b64 s[0:1], s0, 0.5
-// CHECK: [0x00,0xf0,0x80,0x92]
+s_bfm_b64 s[10:11], s1, 0.5
+// CHECK: [0x01,0xf0,0x8a,0x92]
 
-s_bfm_b64 s[0:1], s0, -4.0
-// CHECK: [0x00,0xf7,0x80,0x92]
+s_bfm_b64 s[10:11], s1, -4.0
+// CHECK: [0x01,0xf7,0x8a,0x92]
 
-s_bfm_b64 s[0:1], s0, 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x92,0x56,0x34,0x12,0xaf]
+s_bfm_b64 s[10:11], s1, 0xaf123456
+// CHECK: [0x01,0xff,0x8a,0x92,0x56,0x34,0x12,0xaf]
 
-s_bfm_b64 s[0:1], s0, 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x92,0x73,0x72,0x71,0x3f]
+s_bfm_b64 s[10:11], s1, 0x3f717273
+// CHECK: [0x01,0xff,0x8a,0x92,0x73,0x72,0x71,0x3f]
 
-s_mul_i32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x93]
+s_mul_i32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x93]
 
-s_mul_i32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x93]
+s_mul_i32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x93]
 
-s_mul_i32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x93]
+s_mul_i32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x93]
 
-s_mul_i32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x93]
+s_mul_i32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x93]
 
-s_mul_i32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x93]
+s_mul_i32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x93]
 
-s_mul_i32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x93]
+s_mul_i32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x93]
 
-s_mul_i32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x93]
+s_mul_i32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x93]
 
-s_mul_i32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x93]
+s_mul_i32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x93]
 
-s_mul_i32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x93]
+s_mul_i32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x93]
 
-s_mul_i32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x93]
+s_mul_i32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x93]
 
-s_mul_i32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x93]
+s_mul_i32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x93]
 
-s_mul_i32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x93]
+s_mul_i32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x93]
 
-s_mul_i32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x93]
+s_mul_i32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x93]
 
-s_mul_i32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x93]
+s_mul_i32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x93]
 
-s_mul_i32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x93]
+s_mul_i32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x93]
 
-s_mul_i32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x93]
+s_mul_i32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x93]
 
-s_mul_i32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x93]
+s_mul_i32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x93]
 
-s_mul_i32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x93]
+s_mul_i32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x93]
 
-s_mul_i32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x93]
+s_mul_i32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x93]
 
-s_mul_i32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x93]
+s_mul_i32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x93]
 
-s_mul_i32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x93]
+s_mul_i32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x93]
 
-s_mul_i32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x93]
+s_mul_i32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x93]
 
-s_mul_i32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x93]
+s_mul_i32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x93]
 
-s_mul_i32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x93]
+s_mul_i32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x93]
 
-s_mul_i32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x93]
+s_mul_i32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x93]
 
-s_mul_i32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x93]
+s_mul_i32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x93]
 
-s_mul_i32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x93]
+s_mul_i32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x93]
 
-s_mul_i32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x93]
+s_mul_i32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x93]
 
-s_mul_i32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x93]
+s_mul_i32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x93]
 
-s_mul_i32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x93]
+s_mul_i32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x93]
 
-s_mul_i32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x93]
+s_mul_i32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x93]
 
-s_mul_i32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x93,0x56,0x34,0x12,0xaf]
+s_mul_i32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x93,0x56,0x34,0x12,0xaf]
 
-s_mul_i32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x93,0x73,0x72,0x71,0x3f]
+s_mul_i32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x93,0x73,0x72,0x71,0x3f]
 
-s_mul_i32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x93]
+s_mul_i32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x93]
 
-s_mul_i32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x93]
+s_mul_i32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x93]
 
-s_mul_i32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x93]
+s_mul_i32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x93]
 
-s_mul_i32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x93]
+s_mul_i32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x93]
 
-s_mul_i32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x93]
+s_mul_i32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x93]
 
-s_mul_i32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x93]
+s_mul_i32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x93]
 
-s_mul_i32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x93]
+s_mul_i32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x93]
 
-s_mul_i32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x93]
+s_mul_i32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x93]
 
-s_mul_i32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x93]
+s_mul_i32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x93]
 
-s_mul_i32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x93]
+s_mul_i32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x93]
 
-s_mul_i32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x93]
+s_mul_i32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x93]
 
-s_mul_i32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x93]
+s_mul_i32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x93]
 
-s_mul_i32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x93]
+s_mul_i32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x93]
 
-s_mul_i32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x93]
+s_mul_i32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x93]
 
-s_mul_i32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x93]
+s_mul_i32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x93]
 
-s_mul_i32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x93]
+s_mul_i32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x93]
 
-s_mul_i32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x93]
+s_mul_i32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x93]
 
-s_mul_i32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x93,0x56,0x34,0x12,0xaf]
+s_mul_i32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x93,0x56,0x34,0x12,0xaf]
 
-s_mul_i32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x93,0x73,0x72,0x71,0x3f]
+s_mul_i32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x93,0x73,0x72,0x71,0x3f]
 
-s_bfe_u32 s0, s0, s0
-// CHECK: [0x00,0x00,0x80,0x93]
+s_bfe_u32 s5, s1, s2
+// CHECK: [0x01,0x02,0x85,0x93]
 
-s_bfe_u32 s103, s0, s0
-// CHECK: [0x00,0x00,0xe7,0x93]
+s_bfe_u32 s103, s1, s2
+// CHECK: [0x01,0x02,0xe7,0x93]
 
-s_bfe_u32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0xe8,0x93]
+s_bfe_u32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0xe8,0x93]
 
-s_bfe_u32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0xe9,0x93]
+s_bfe_u32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0xe9,0x93]
 
-s_bfe_u32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0xea,0x93]
+s_bfe_u32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0xea,0x93]
 
-s_bfe_u32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0xeb,0x93]
+s_bfe_u32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0xeb,0x93]
 
-s_bfe_u32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0xec,0x93]
+s_bfe_u32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0xec,0x93]
 
-s_bfe_u32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0xed,0x93]
+s_bfe_u32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0xed,0x93]
 
-s_bfe_u32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0xee,0x93]
+s_bfe_u32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0xee,0x93]
 
-s_bfe_u32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0xef,0x93]
+s_bfe_u32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0xef,0x93]
 
-s_bfe_u32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0xfb,0x93]
+s_bfe_u32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0xfb,0x93]
 
-s_bfe_u32 m0, s0, s0
-// CHECK: [0x00,0x00,0xfc,0x93]
+s_bfe_u32 m0, s1, s2
+// CHECK: [0x01,0x02,0xfc,0x93]
 
-s_bfe_u32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0xfe,0x93]
+s_bfe_u32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0xfe,0x93]
 
-s_bfe_u32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0xff,0x93]
+s_bfe_u32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0xff,0x93]
 
-s_bfe_u32 s0, s103, s0
-// CHECK: [0x67,0x00,0x80,0x93]
+s_bfe_u32 s5, s103, s2
+// CHECK: [0x67,0x02,0x85,0x93]
 
-s_bfe_u32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x80,0x93]
+s_bfe_u32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x85,0x93]
 
-s_bfe_u32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x80,0x93]
+s_bfe_u32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x85,0x93]
 
-s_bfe_u32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x80,0x93]
+s_bfe_u32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x85,0x93]
 
-s_bfe_u32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x80,0x93]
+s_bfe_u32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x85,0x93]
 
-s_bfe_u32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x80,0x93]
+s_bfe_u32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x85,0x93]
 
-s_bfe_u32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x80,0x93]
+s_bfe_u32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x85,0x93]
 
-s_bfe_u32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x80,0x93]
+s_bfe_u32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x85,0x93]
 
-s_bfe_u32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x80,0x93]
+s_bfe_u32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x85,0x93]
 
-s_bfe_u32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x80,0x93]
+s_bfe_u32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x85,0x93]
 
-s_bfe_u32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x80,0x93]
+s_bfe_u32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x85,0x93]
 
-s_bfe_u32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x80,0x93]
+s_bfe_u32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x85,0x93]
 
-s_bfe_u32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x80,0x93]
+s_bfe_u32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x85,0x93]
 
-s_bfe_u32 s0, 0, s0
-// CHECK: [0x80,0x00,0x80,0x93]
+s_bfe_u32 s5, 0, s2
+// CHECK: [0x80,0x02,0x85,0x93]
 
-s_bfe_u32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x80,0x93]
+s_bfe_u32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x85,0x93]
 
-s_bfe_u32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x93]
+s_bfe_u32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x85,0x93]
 
-s_bfe_u32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x93]
+s_bfe_u32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x85,0x93]
 
-s_bfe_u32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x93,0x56,0x34,0x12,0xaf]
+s_bfe_u32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x85,0x93,0x56,0x34,0x12,0xaf]
 
-s_bfe_u32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x93,0x73,0x72,0x71,0x3f]
+s_bfe_u32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x85,0x93,0x73,0x72,0x71,0x3f]
 
-s_bfe_u32 s0, s0, s103
-// CHECK: [0x00,0x67,0x80,0x93]
+s_bfe_u32 s5, s1, s103
+// CHECK: [0x01,0x67,0x85,0x93]
 
-s_bfe_u32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x80,0x93]
+s_bfe_u32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x85,0x93]
 
-s_bfe_u32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x80,0x93]
+s_bfe_u32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x85,0x93]
 
-s_bfe_u32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x93]
+s_bfe_u32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x85,0x93]
 
-s_bfe_u32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x93]
+s_bfe_u32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x85,0x93]
 
-s_bfe_u32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x80,0x93]
+s_bfe_u32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x85,0x93]
 
-s_bfe_u32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x80,0x93]
+s_bfe_u32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x85,0x93]
 
-s_bfe_u32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x80,0x93]
+s_bfe_u32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x85,0x93]
 
-s_bfe_u32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x80,0x93]
+s_bfe_u32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x85,0x93]
 
-s_bfe_u32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x80,0x93]
+s_bfe_u32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x85,0x93]
 
-s_bfe_u32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x80,0x93]
+s_bfe_u32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x85,0x93]
 
-s_bfe_u32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x80,0x93]
+s_bfe_u32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x85,0x93]
 
-s_bfe_u32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x80,0x93]
+s_bfe_u32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x85,0x93]
 
-s_bfe_u32 s0, s0, 0
-// CHECK: [0x00,0x80,0x80,0x93]
+s_bfe_u32 s5, s1, 0
+// CHECK: [0x01,0x80,0x85,0x93]
 
-s_bfe_u32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x80,0x93]
+s_bfe_u32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x85,0x93]
 
-s_bfe_u32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x80,0x93]
+s_bfe_u32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x85,0x93]
 
-s_bfe_u32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x80,0x93]
+s_bfe_u32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x85,0x93]
 
-s_bfe_u32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x93,0x56,0x34,0x12,0xaf]
+s_bfe_u32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x85,0x93,0x56,0x34,0x12,0xaf]
 
-s_bfe_u32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x93,0x73,0x72,0x71,0x3f]
+s_bfe_u32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x85,0x93,0x73,0x72,0x71,0x3f]
 
-s_bfe_i32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x94]
+s_bfe_i32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x94]
 
-s_bfe_i32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x94]
+s_bfe_i32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x94]
 
-s_bfe_i32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x94]
+s_bfe_i32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x94]
 
-s_bfe_i32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x94]
+s_bfe_i32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x94]
 
-s_bfe_i32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x94]
+s_bfe_i32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x94]
 
-s_bfe_i32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x94]
+s_bfe_i32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x94]
 
-s_bfe_i32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x94]
+s_bfe_i32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x94]
 
-s_bfe_i32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x94]
+s_bfe_i32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x94]
 
-s_bfe_i32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x94]
+s_bfe_i32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x94]
 
-s_bfe_i32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x94]
+s_bfe_i32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x94]
 
-s_bfe_i32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x94]
+s_bfe_i32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x94]
 
-s_bfe_i32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x94]
+s_bfe_i32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x94]
 
-s_bfe_i32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x94]
+s_bfe_i32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x94]
 
-s_bfe_i32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x94]
+s_bfe_i32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x94]
 
-s_bfe_i32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x94]
+s_bfe_i32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x94]
 
-s_bfe_i32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x94]
+s_bfe_i32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x94]
 
-s_bfe_i32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x94]
+s_bfe_i32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x94]
 
-s_bfe_i32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x94]
+s_bfe_i32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x94]
 
-s_bfe_i32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x94]
+s_bfe_i32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x94]
 
-s_bfe_i32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x94]
+s_bfe_i32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x94]
 
-s_bfe_i32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x94]
+s_bfe_i32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x94]
 
-s_bfe_i32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x94]
+s_bfe_i32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x94]
 
-s_bfe_i32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x94]
+s_bfe_i32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x94]
 
-s_bfe_i32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x94]
+s_bfe_i32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x94]
 
-s_bfe_i32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x94]
+s_bfe_i32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x94]
 
-s_bfe_i32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x94]
+s_bfe_i32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x94]
 
-s_bfe_i32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x94]
+s_bfe_i32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x94]
 
-s_bfe_i32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x94]
+s_bfe_i32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x94]
 
-s_bfe_i32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x94]
+s_bfe_i32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x94]
 
-s_bfe_i32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x94]
+s_bfe_i32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x94]
 
-s_bfe_i32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x94]
+s_bfe_i32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x94]
 
-s_bfe_i32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x94,0x56,0x34,0x12,0xaf]
+s_bfe_i32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x94,0x56,0x34,0x12,0xaf]
 
-s_bfe_i32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x94,0x73,0x72,0x71,0x3f]
+s_bfe_i32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x94,0x73,0x72,0x71,0x3f]
 
-s_bfe_i32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x94]
+s_bfe_i32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x94]
 
-s_bfe_i32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x94]
+s_bfe_i32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x94]
 
-s_bfe_i32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x94]
+s_bfe_i32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x94]
 
-s_bfe_i32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x94]
+s_bfe_i32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x94]
 
-s_bfe_i32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x94]
+s_bfe_i32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x94]
 
-s_bfe_i32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x94]
+s_bfe_i32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x94]
 
-s_bfe_i32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x94]
+s_bfe_i32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x94]
 
-s_bfe_i32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x94]
+s_bfe_i32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x94]
 
-s_bfe_i32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x94]
+s_bfe_i32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x94]
 
-s_bfe_i32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x94]
+s_bfe_i32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x94]
 
-s_bfe_i32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x94]
+s_bfe_i32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x94]
 
-s_bfe_i32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x94]
+s_bfe_i32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x94]
 
-s_bfe_i32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x94]
+s_bfe_i32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x94]
 
-s_bfe_i32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x94]
+s_bfe_i32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x94]
 
-s_bfe_i32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x94]
+s_bfe_i32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x94]
 
-s_bfe_i32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x94]
+s_bfe_i32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x94]
 
-s_bfe_i32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x94]
+s_bfe_i32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x94]
 
-s_bfe_i32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x94,0x56,0x34,0x12,0xaf]
+s_bfe_i32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x94,0x56,0x34,0x12,0xaf]
 
-s_bfe_i32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x94,0x73,0x72,0x71,0x3f]
+s_bfe_i32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x94,0x73,0x72,0x71,0x3f]
 
-s_bfe_u64 s[0:1], s[0:1], s0
-// CHECK: [0x00,0x00,0x80,0x94]
+s_bfe_u64 s[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0x8a,0x94]
 
-s_bfe_u64 s[2:3], s[0:1], s0
-// CHECK: [0x00,0x00,0x82,0x94]
+s_bfe_u64 s[12:13], s[2:3], s2
+// CHECK: [0x02,0x02,0x8c,0x94]
 
-s_bfe_u64 s[102:103], s[0:1], s0
-// CHECK: [0x00,0x00,0xe6,0x94]
+s_bfe_u64 s[102:103], s[2:3], s2
+// CHECK: [0x02,0x02,0xe6,0x94]
 
-s_bfe_u64 flat_scratch, s[0:1], s0
-// CHECK: [0x00,0x00,0xe8,0x94]
+s_bfe_u64 flat_scratch, s[2:3], s2
+// CHECK: [0x02,0x02,0xe8,0x94]
 
-s_bfe_u64 vcc, s[0:1], s0
-// CHECK: [0x00,0x00,0xea,0x94]
+s_bfe_u64 vcc, s[2:3], s2
+// CHECK: [0x02,0x02,0xea,0x94]
 
-s_bfe_u64 tba, s[0:1], s0
-// CHECK: [0x00,0x00,0xec,0x94]
+s_bfe_u64 tba, s[2:3], s2
+// CHECK: [0x02,0x02,0xec,0x94]
 
-s_bfe_u64 tma, s[0:1], s0
-// CHECK: [0x00,0x00,0xee,0x94]
+s_bfe_u64 tma, s[2:3], s2
+// CHECK: [0x02,0x02,0xee,0x94]
 
-s_bfe_u64 ttmp[10:11], s[0:1], s0
-// CHECK: [0x00,0x00,0xfa,0x94]
+s_bfe_u64 ttmp[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0xfa,0x94]
 
-s_bfe_u64 exec, s[0:1], s0
-// CHECK: [0x00,0x00,0xfe,0x94]
+s_bfe_u64 exec, s[2:3], s2
+// CHECK: [0x02,0x02,0xfe,0x94]
 
-s_bfe_u64 s[0:1], s[2:3], s0
-// CHECK: [0x02,0x00,0x80,0x94]
+s_bfe_u64 s[10:11], s[4:5], s2
+// CHECK: [0x04,0x02,0x8a,0x94]
 
-s_bfe_u64 s[0:1], s[102:103], s0
-// CHECK: [0x66,0x00,0x80,0x94]
+s_bfe_u64 s[10:11], s[102:103], s2
+// CHECK: [0x66,0x02,0x8a,0x94]
 
-s_bfe_u64 s[0:1], flat_scratch, s0
-// CHECK: [0x68,0x00,0x80,0x94]
+s_bfe_u64 s[10:11], flat_scratch, s2
+// CHECK: [0x68,0x02,0x8a,0x94]
 
-s_bfe_u64 s[0:1], vcc, s0
-// CHECK: [0x6a,0x00,0x80,0x94]
+s_bfe_u64 s[10:11], vcc, s2
+// CHECK: [0x6a,0x02,0x8a,0x94]
 
-s_bfe_u64 s[0:1], tba, s0
-// CHECK: [0x6c,0x00,0x80,0x94]
+s_bfe_u64 s[10:11], tba, s2
+// CHECK: [0x6c,0x02,0x8a,0x94]
 
-s_bfe_u64 s[0:1], tma, s0
-// CHECK: [0x6e,0x00,0x80,0x94]
+s_bfe_u64 s[10:11], tma, s2
+// CHECK: [0x6e,0x02,0x8a,0x94]
 
-s_bfe_u64 s[0:1], ttmp[10:11], s0
-// CHECK: [0x7a,0x00,0x80,0x94]
+s_bfe_u64 s[10:11], ttmp[10:11], s2
+// CHECK: [0x7a,0x02,0x8a,0x94]
 
-s_bfe_u64 s[0:1], exec, s0
-// CHECK: [0x7e,0x00,0x80,0x94]
+s_bfe_u64 s[10:11], exec, s2
+// CHECK: [0x7e,0x02,0x8a,0x94]
 
-s_bfe_u64 s[0:1], 0, s0
-// CHECK: [0x80,0x00,0x80,0x94]
+s_bfe_u64 s[10:11], 0, s2
+// CHECK: [0x80,0x02,0x8a,0x94]
 
-s_bfe_u64 s[0:1], -1, s0
-// CHECK: [0xc1,0x00,0x80,0x94]
+s_bfe_u64 s[10:11], -1, s2
+// CHECK: [0xc1,0x02,0x8a,0x94]
 
-s_bfe_u64 s[0:1], 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x94]
+s_bfe_u64 s[10:11], 0.5, s2
+// CHECK: [0xf0,0x02,0x8a,0x94]
 
-s_bfe_u64 s[0:1], -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x94]
+s_bfe_u64 s[10:11], -4.0, s2
+// CHECK: [0xf7,0x02,0x8a,0x94]
 
-s_bfe_u64 s[0:1], 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x94,0x56,0x34,0x12,0xaf]
+s_bfe_u64 s[10:11], 0xaf123456, s2
+// CHECK: [0xff,0x02,0x8a,0x94,0x56,0x34,0x12,0xaf]
 
-s_bfe_u64 s[0:1], 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x94,0x73,0x72,0x71,0x3f]
+s_bfe_u64 s[10:11], 0x3f717273, s2
+// CHECK: [0xff,0x02,0x8a,0x94,0x73,0x72,0x71,0x3f]
 
-s_bfe_u64 s[0:1], s[0:1], s103
-// CHECK: [0x00,0x67,0x80,0x94]
+s_bfe_u64 s[10:11], s[2:3], s103
+// CHECK: [0x02,0x67,0x8a,0x94]
 
-s_bfe_u64 s[0:1], s[0:1], flat_scratch_lo
-// CHECK: [0x00,0x68,0x80,0x94]
+s_bfe_u64 s[10:11], s[2:3], flat_scratch_lo
+// CHECK: [0x02,0x68,0x8a,0x94]
 
-s_bfe_u64 s[0:1], s[0:1], flat_scratch_hi
-// CHECK: [0x00,0x69,0x80,0x94]
+s_bfe_u64 s[10:11], s[2:3], flat_scratch_hi
+// CHECK: [0x02,0x69,0x8a,0x94]
 
-s_bfe_u64 s[0:1], s[0:1], vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x94]
+s_bfe_u64 s[10:11], s[2:3], vcc_lo
+// CHECK: [0x02,0x6a,0x8a,0x94]
 
-s_bfe_u64 s[0:1], s[0:1], vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x94]
+s_bfe_u64 s[10:11], s[2:3], vcc_hi
+// CHECK: [0x02,0x6b,0x8a,0x94]
 
-s_bfe_u64 s[0:1], s[0:1], tba_lo
-// CHECK: [0x00,0x6c,0x80,0x94]
+s_bfe_u64 s[10:11], s[2:3], tba_lo
+// CHECK: [0x02,0x6c,0x8a,0x94]
 
-s_bfe_u64 s[0:1], s[0:1], tba_hi
-// CHECK: [0x00,0x6d,0x80,0x94]
+s_bfe_u64 s[10:11], s[2:3], tba_hi
+// CHECK: [0x02,0x6d,0x8a,0x94]
 
-s_bfe_u64 s[0:1], s[0:1], tma_lo
-// CHECK: [0x00,0x6e,0x80,0x94]
+s_bfe_u64 s[10:11], s[2:3], tma_lo
+// CHECK: [0x02,0x6e,0x8a,0x94]
 
-s_bfe_u64 s[0:1], s[0:1], tma_hi
-// CHECK: [0x00,0x6f,0x80,0x94]
+s_bfe_u64 s[10:11], s[2:3], tma_hi
+// CHECK: [0x02,0x6f,0x8a,0x94]
 
-s_bfe_u64 s[0:1], s[0:1], ttmp11
-// CHECK: [0x00,0x7b,0x80,0x94]
+s_bfe_u64 s[10:11], s[2:3], ttmp11
+// CHECK: [0x02,0x7b,0x8a,0x94]
 
-s_bfe_u64 s[0:1], s[0:1], m0
-// CHECK: [0x00,0x7c,0x80,0x94]
+s_bfe_u64 s[10:11], s[2:3], m0
+// CHECK: [0x02,0x7c,0x8a,0x94]
 
-s_bfe_u64 s[0:1], s[0:1], exec_lo
-// CHECK: [0x00,0x7e,0x80,0x94]
+s_bfe_u64 s[10:11], s[2:3], exec_lo
+// CHECK: [0x02,0x7e,0x8a,0x94]
 
-s_bfe_u64 s[0:1], s[0:1], exec_hi
-// CHECK: [0x00,0x7f,0x80,0x94]
+s_bfe_u64 s[10:11], s[2:3], exec_hi
+// CHECK: [0x02,0x7f,0x8a,0x94]
 
-s_bfe_u64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x94]
+s_bfe_u64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x94]
 
-s_bfe_u64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x94]
+s_bfe_u64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x94]
 
-s_bfe_u64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x94]
+s_bfe_u64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x94]
 
-s_bfe_u64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x94]
+s_bfe_u64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x94]
 
-s_bfe_u64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x94,0x56,0x34,0x12,0xaf]
+s_bfe_u64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x94,0x56,0x34,0x12,0xaf]
 
-s_bfe_u64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x94,0x73,0x72,0x71,0x3f]
+s_bfe_u64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x94,0x73,0x72,0x71,0x3f]
 
-s_bfe_i64 s[0:1], s[0:1], s0
-// CHECK: [0x00,0x00,0x00,0x95]
+s_bfe_i64 s[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0x0a,0x95]
 
-s_bfe_i64 s[2:3], s[0:1], s0
-// CHECK: [0x00,0x00,0x02,0x95]
+s_bfe_i64 s[12:13], s[2:3], s2
+// CHECK: [0x02,0x02,0x0c,0x95]
 
-s_bfe_i64 s[102:103], s[0:1], s0
-// CHECK: [0x00,0x00,0x66,0x95]
+s_bfe_i64 s[102:103], s[2:3], s2
+// CHECK: [0x02,0x02,0x66,0x95]
 
-s_bfe_i64 flat_scratch, s[0:1], s0
-// CHECK: [0x00,0x00,0x68,0x95]
+s_bfe_i64 flat_scratch, s[2:3], s2
+// CHECK: [0x02,0x02,0x68,0x95]
 
-s_bfe_i64 vcc, s[0:1], s0
-// CHECK: [0x00,0x00,0x6a,0x95]
+s_bfe_i64 vcc, s[2:3], s2
+// CHECK: [0x02,0x02,0x6a,0x95]
 
-s_bfe_i64 tba, s[0:1], s0
-// CHECK: [0x00,0x00,0x6c,0x95]
+s_bfe_i64 tba, s[2:3], s2
+// CHECK: [0x02,0x02,0x6c,0x95]
 
-s_bfe_i64 tma, s[0:1], s0
-// CHECK: [0x00,0x00,0x6e,0x95]
+s_bfe_i64 tma, s[2:3], s2
+// CHECK: [0x02,0x02,0x6e,0x95]
 
-s_bfe_i64 ttmp[10:11], s[0:1], s0
-// CHECK: [0x00,0x00,0x7a,0x95]
+s_bfe_i64 ttmp[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0x7a,0x95]
 
-s_bfe_i64 exec, s[0:1], s0
-// CHECK: [0x00,0x00,0x7e,0x95]
+s_bfe_i64 exec, s[2:3], s2
+// CHECK: [0x02,0x02,0x7e,0x95]
 
-s_bfe_i64 s[0:1], s[2:3], s0
-// CHECK: [0x02,0x00,0x00,0x95]
+s_bfe_i64 s[10:11], s[4:5], s2
+// CHECK: [0x04,0x02,0x0a,0x95]
 
-s_bfe_i64 s[0:1], s[102:103], s0
-// CHECK: [0x66,0x00,0x00,0x95]
+s_bfe_i64 s[10:11], s[102:103], s2
+// CHECK: [0x66,0x02,0x0a,0x95]
 
-s_bfe_i64 s[0:1], flat_scratch, s0
-// CHECK: [0x68,0x00,0x00,0x95]
+s_bfe_i64 s[10:11], flat_scratch, s2
+// CHECK: [0x68,0x02,0x0a,0x95]
 
-s_bfe_i64 s[0:1], vcc, s0
-// CHECK: [0x6a,0x00,0x00,0x95]
+s_bfe_i64 s[10:11], vcc, s2
+// CHECK: [0x6a,0x02,0x0a,0x95]
 
-s_bfe_i64 s[0:1], tba, s0
-// CHECK: [0x6c,0x00,0x00,0x95]
+s_bfe_i64 s[10:11], tba, s2
+// CHECK: [0x6c,0x02,0x0a,0x95]
 
-s_bfe_i64 s[0:1], tma, s0
-// CHECK: [0x6e,0x00,0x00,0x95]
+s_bfe_i64 s[10:11], tma, s2
+// CHECK: [0x6e,0x02,0x0a,0x95]
 
-s_bfe_i64 s[0:1], ttmp[10:11], s0
-// CHECK: [0x7a,0x00,0x00,0x95]
+s_bfe_i64 s[10:11], ttmp[10:11], s2
+// CHECK: [0x7a,0x02,0x0a,0x95]
 
-s_bfe_i64 s[0:1], exec, s0
-// CHECK: [0x7e,0x00,0x00,0x95]
+s_bfe_i64 s[10:11], exec, s2
+// CHECK: [0x7e,0x02,0x0a,0x95]
 
-s_bfe_i64 s[0:1], 0, s0
-// CHECK: [0x80,0x00,0x00,0x95]
+s_bfe_i64 s[10:11], 0, s2
+// CHECK: [0x80,0x02,0x0a,0x95]
 
-s_bfe_i64 s[0:1], -1, s0
-// CHECK: [0xc1,0x00,0x00,0x95]
+s_bfe_i64 s[10:11], -1, s2
+// CHECK: [0xc1,0x02,0x0a,0x95]
 
-s_bfe_i64 s[0:1], 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x95]
+s_bfe_i64 s[10:11], 0.5, s2
+// CHECK: [0xf0,0x02,0x0a,0x95]
 
-s_bfe_i64 s[0:1], -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x95]
+s_bfe_i64 s[10:11], -4.0, s2
+// CHECK: [0xf7,0x02,0x0a,0x95]
 
-s_bfe_i64 s[0:1], 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x95,0x56,0x34,0x12,0xaf]
+s_bfe_i64 s[10:11], 0xaf123456, s2
+// CHECK: [0xff,0x02,0x0a,0x95,0x56,0x34,0x12,0xaf]
 
-s_bfe_i64 s[0:1], 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x95,0x73,0x72,0x71,0x3f]
+s_bfe_i64 s[10:11], 0x3f717273, s2
+// CHECK: [0xff,0x02,0x0a,0x95,0x73,0x72,0x71,0x3f]
 
-s_bfe_i64 s[0:1], s[0:1], s103
-// CHECK: [0x00,0x67,0x00,0x95]
+s_bfe_i64 s[10:11], s[2:3], s103
+// CHECK: [0x02,0x67,0x0a,0x95]
 
-s_bfe_i64 s[0:1], s[0:1], flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x95]
+s_bfe_i64 s[10:11], s[2:3], flat_scratch_lo
+// CHECK: [0x02,0x68,0x0a,0x95]
 
-s_bfe_i64 s[0:1], s[0:1], flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x95]
+s_bfe_i64 s[10:11], s[2:3], flat_scratch_hi
+// CHECK: [0x02,0x69,0x0a,0x95]
 
-s_bfe_i64 s[0:1], s[0:1], vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x95]
+s_bfe_i64 s[10:11], s[2:3], vcc_lo
+// CHECK: [0x02,0x6a,0x0a,0x95]
 
-s_bfe_i64 s[0:1], s[0:1], vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x95]
+s_bfe_i64 s[10:11], s[2:3], vcc_hi
+// CHECK: [0x02,0x6b,0x0a,0x95]
 
-s_bfe_i64 s[0:1], s[0:1], tba_lo
-// CHECK: [0x00,0x6c,0x00,0x95]
+s_bfe_i64 s[10:11], s[2:3], tba_lo
+// CHECK: [0x02,0x6c,0x0a,0x95]
 
-s_bfe_i64 s[0:1], s[0:1], tba_hi
-// CHECK: [0x00,0x6d,0x00,0x95]
+s_bfe_i64 s[10:11], s[2:3], tba_hi
+// CHECK: [0x02,0x6d,0x0a,0x95]
 
-s_bfe_i64 s[0:1], s[0:1], tma_lo
-// CHECK: [0x00,0x6e,0x00,0x95]
+s_bfe_i64 s[10:11], s[2:3], tma_lo
+// CHECK: [0x02,0x6e,0x0a,0x95]
 
-s_bfe_i64 s[0:1], s[0:1], tma_hi
-// CHECK: [0x00,0x6f,0x00,0x95]
+s_bfe_i64 s[10:11], s[2:3], tma_hi
+// CHECK: [0x02,0x6f,0x0a,0x95]
 
-s_bfe_i64 s[0:1], s[0:1], ttmp11
-// CHECK: [0x00,0x7b,0x00,0x95]
+s_bfe_i64 s[10:11], s[2:3], ttmp11
+// CHECK: [0x02,0x7b,0x0a,0x95]
 
-s_bfe_i64 s[0:1], s[0:1], m0
-// CHECK: [0x00,0x7c,0x00,0x95]
+s_bfe_i64 s[10:11], s[2:3], m0
+// CHECK: [0x02,0x7c,0x0a,0x95]
 
-s_bfe_i64 s[0:1], s[0:1], exec_lo
-// CHECK: [0x00,0x7e,0x00,0x95]
+s_bfe_i64 s[10:11], s[2:3], exec_lo
+// CHECK: [0x02,0x7e,0x0a,0x95]
 
-s_bfe_i64 s[0:1], s[0:1], exec_hi
-// CHECK: [0x00,0x7f,0x00,0x95]
+s_bfe_i64 s[10:11], s[2:3], exec_hi
+// CHECK: [0x02,0x7f,0x0a,0x95]
 
-s_bfe_i64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x00,0x95]
+s_bfe_i64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x0a,0x95]
 
-s_bfe_i64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x00,0x95]
+s_bfe_i64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x0a,0x95]
 
-s_bfe_i64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x00,0x95]
+s_bfe_i64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x0a,0x95]
 
-s_bfe_i64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x00,0x95]
+s_bfe_i64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x0a,0x95]
 
-s_bfe_i64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x95,0x56,0x34,0x12,0xaf]
+s_bfe_i64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x0a,0x95,0x56,0x34,0x12,0xaf]
 
-s_bfe_i64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x95,0x73,0x72,0x71,0x3f]
+s_bfe_i64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x0a,0x95,0x73,0x72,0x71,0x3f]
 
-s_cbranch_g_fork s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x95]
+s_cbranch_g_fork s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x80,0x95]
 
-s_cbranch_g_fork s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x95]
+s_cbranch_g_fork s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x80,0x95]
 
-s_cbranch_g_fork s[102:103], s[0:1]
-// CHECK: [0x66,0x00,0x80,0x95]
+s_cbranch_g_fork s[102:103], s[4:5]
+// CHECK: [0x66,0x04,0x80,0x95]
 
-s_cbranch_g_fork flat_scratch, s[0:1]
-// CHECK: [0x68,0x00,0x80,0x95]
+s_cbranch_g_fork flat_scratch, s[4:5]
+// CHECK: [0x68,0x04,0x80,0x95]
 
-s_cbranch_g_fork vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x95]
+s_cbranch_g_fork vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x80,0x95]
 
-s_cbranch_g_fork tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x95]
+s_cbranch_g_fork tba, s[4:5]
+// CHECK: [0x6c,0x04,0x80,0x95]
 
-s_cbranch_g_fork tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x95]
+s_cbranch_g_fork tma, s[4:5]
+// CHECK: [0x6e,0x04,0x80,0x95]
 
-s_cbranch_g_fork ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x95]
+s_cbranch_g_fork ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x80,0x95]
 
-s_cbranch_g_fork exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x95]
+s_cbranch_g_fork exec, s[4:5]
+// CHECK: [0x7e,0x04,0x80,0x95]
 
-s_cbranch_g_fork s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x95]
+s_cbranch_g_fork s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x80,0x95]
 
-s_cbranch_g_fork s[0:1], s[102:103]
-// CHECK: [0x00,0x66,0x80,0x95]
+s_cbranch_g_fork s[2:3], s[102:103]
+// CHECK: [0x02,0x66,0x80,0x95]
 
-s_cbranch_g_fork s[0:1], flat_scratch
-// CHECK: [0x00,0x68,0x80,0x95]
+s_cbranch_g_fork s[2:3], flat_scratch
+// CHECK: [0x02,0x68,0x80,0x95]
 
-s_cbranch_g_fork s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x95]
+s_cbranch_g_fork s[2:3], vcc
+// CHECK: [0x02,0x6a,0x80,0x95]
 
-s_cbranch_g_fork s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x95]
+s_cbranch_g_fork s[2:3], tba
+// CHECK: [0x02,0x6c,0x80,0x95]
 
-s_cbranch_g_fork s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x95]
+s_cbranch_g_fork s[2:3], tma
+// CHECK: [0x02,0x6e,0x80,0x95]
 
-s_cbranch_g_fork s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x95]
+s_cbranch_g_fork s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x80,0x95]
 
-s_cbranch_g_fork s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x95]
+s_cbranch_g_fork s[2:3], exec
+// CHECK: [0x02,0x7e,0x80,0x95]
 
-s_absdiff_i32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x96]
+s_absdiff_i32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x96]
 
-s_absdiff_i32 s103, s0, s0
-// CHECK: [0x00,0x00,0x67,0x96]
+s_absdiff_i32 s103, s1, s2
+// CHECK: [0x01,0x02,0x67,0x96]
 
-s_absdiff_i32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x68,0x96]
+s_absdiff_i32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x68,0x96]
 
-s_absdiff_i32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x69,0x96]
+s_absdiff_i32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x69,0x96]
 
-s_absdiff_i32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x96]
+s_absdiff_i32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x96]
 
-s_absdiff_i32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x96]
+s_absdiff_i32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x96]
 
-s_absdiff_i32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x96]
+s_absdiff_i32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x96]
 
-s_absdiff_i32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x96]
+s_absdiff_i32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x96]
 
-s_absdiff_i32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x96]
+s_absdiff_i32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x96]
 
-s_absdiff_i32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x96]
+s_absdiff_i32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x96]
 
-s_absdiff_i32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x96]
+s_absdiff_i32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x96]
 
-s_absdiff_i32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x96]
+s_absdiff_i32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x96]
 
-s_absdiff_i32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x96]
+s_absdiff_i32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x96]
 
-s_absdiff_i32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x96]
+s_absdiff_i32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x96]
 
-s_absdiff_i32 s0, s103, s0
-// CHECK: [0x67,0x00,0x00,0x96]
+s_absdiff_i32 s5, s103, s2
+// CHECK: [0x67,0x02,0x05,0x96]
 
-s_absdiff_i32 s0, flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0x96]
+s_absdiff_i32 s5, flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0x96]
 
-s_absdiff_i32 s0, flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0x96]
+s_absdiff_i32 s5, flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0x96]
 
-s_absdiff_i32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x96]
+s_absdiff_i32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x96]
 
-s_absdiff_i32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x96]
+s_absdiff_i32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x96]
 
-s_absdiff_i32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x96]
+s_absdiff_i32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x96]
 
-s_absdiff_i32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x96]
+s_absdiff_i32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x96]
 
-s_absdiff_i32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x96]
+s_absdiff_i32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x96]
 
-s_absdiff_i32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x96]
+s_absdiff_i32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x96]
 
-s_absdiff_i32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x96]
+s_absdiff_i32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x96]
 
-s_absdiff_i32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x96]
+s_absdiff_i32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x96]
 
-s_absdiff_i32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x96]
+s_absdiff_i32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x96]
 
-s_absdiff_i32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x96]
+s_absdiff_i32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x96]
 
-s_absdiff_i32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x96]
+s_absdiff_i32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x96]
 
-s_absdiff_i32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x96]
+s_absdiff_i32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x96]
 
-s_absdiff_i32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x96]
+s_absdiff_i32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x96]
 
-s_absdiff_i32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x96]
+s_absdiff_i32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x96]
 
-s_absdiff_i32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x96,0x56,0x34,0x12,0xaf]
+s_absdiff_i32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x96,0x56,0x34,0x12,0xaf]
 
-s_absdiff_i32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x96,0x73,0x72,0x71,0x3f]
+s_absdiff_i32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x96,0x73,0x72,0x71,0x3f]
 
-s_absdiff_i32 s0, s0, s103
-// CHECK: [0x00,0x67,0x00,0x96]
+s_absdiff_i32 s5, s1, s103
+// CHECK: [0x01,0x67,0x05,0x96]
 
-s_absdiff_i32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0x96]
+s_absdiff_i32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0x96]
 
-s_absdiff_i32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0x96]
+s_absdiff_i32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0x96]
 
-s_absdiff_i32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x96]
+s_absdiff_i32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x96]
 
-s_absdiff_i32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x96]
+s_absdiff_i32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x96]
 
-s_absdiff_i32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x96]
+s_absdiff_i32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x96]
 
-s_absdiff_i32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x96]
+s_absdiff_i32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x96]
 
-s_absdiff_i32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x96]
+s_absdiff_i32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x96]
 
-s_absdiff_i32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x96]
+s_absdiff_i32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x96]
 
-s_absdiff_i32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x96]
+s_absdiff_i32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x96]
 
-s_absdiff_i32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x96]
+s_absdiff_i32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x96]
 
-s_absdiff_i32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x96]
+s_absdiff_i32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x96]
 
-s_absdiff_i32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x96]
+s_absdiff_i32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x96]
 
-s_absdiff_i32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x96]
+s_absdiff_i32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x96]
 
-s_absdiff_i32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x96]
+s_absdiff_i32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x96]
 
-s_absdiff_i32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x96]
+s_absdiff_i32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x96]
 
-s_absdiff_i32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x96]
+s_absdiff_i32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x96]
 
-s_absdiff_i32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x96,0x56,0x34,0x12,0xaf]
+s_absdiff_i32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x96,0x56,0x34,0x12,0xaf]
 
-s_absdiff_i32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x96,0x73,0x72,0x71,0x3f]
+s_absdiff_i32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x96,0x73,0x72,0x71,0x3f]
 
-s_cmp_eq_i32 s0, s0
-// CHECK: [0x00,0x00,0x00,0xbf]
+s_cmp_eq_i32 s1, s2
+// CHECK: [0x01,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 s103, s0
-// CHECK: [0x67,0x00,0x00,0xbf]
+s_cmp_eq_i32 s103, s2
+// CHECK: [0x67,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x00,0xbf]
+s_cmp_eq_i32 flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x00,0xbf]
+s_cmp_eq_i32 flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0xbf]
+s_cmp_eq_i32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0xbf]
+s_cmp_eq_i32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0xbf]
+s_cmp_eq_i32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0xbf]
+s_cmp_eq_i32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0xbf]
+s_cmp_eq_i32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0xbf]
+s_cmp_eq_i32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0xbf]
+s_cmp_eq_i32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 m0, s0
-// CHECK: [0x7c,0x00,0x00,0xbf]
+s_cmp_eq_i32 m0, s2
+// CHECK: [0x7c,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0xbf]
+s_cmp_eq_i32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0xbf]
+s_cmp_eq_i32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 0, s0
-// CHECK: [0x80,0x00,0x00,0xbf]
+s_cmp_eq_i32 0, s2
+// CHECK: [0x80,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 -1, s0
-// CHECK: [0xc1,0x00,0x00,0xbf]
+s_cmp_eq_i32 -1, s2
+// CHECK: [0xc1,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0xbf]
+s_cmp_eq_i32 0.5, s2
+// CHECK: [0xf0,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0xbf]
+s_cmp_eq_i32 -4.0, s2
+// CHECK: [0xf7,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_eq_i32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x00,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_eq_i32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_eq_i32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x00,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_eq_i32 s0, s103
-// CHECK: [0x00,0x67,0x00,0xbf]
+s_cmp_eq_i32 s1, s103
+// CHECK: [0x01,0x67,0x00,0xbf]
 
-s_cmp_eq_i32 s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x00,0xbf]
+s_cmp_eq_i32 s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x00,0xbf]
 
-s_cmp_eq_i32 s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x00,0xbf]
+s_cmp_eq_i32 s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x00,0xbf]
 
-s_cmp_eq_i32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0xbf]
+s_cmp_eq_i32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x00,0xbf]
 
-s_cmp_eq_i32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0xbf]
+s_cmp_eq_i32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x00,0xbf]
 
-s_cmp_eq_i32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0xbf]
+s_cmp_eq_i32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x00,0xbf]
 
-s_cmp_eq_i32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0xbf]
+s_cmp_eq_i32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x00,0xbf]
 
-s_cmp_eq_i32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0xbf]
+s_cmp_eq_i32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x00,0xbf]
 
-s_cmp_eq_i32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0xbf]
+s_cmp_eq_i32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x00,0xbf]
 
-s_cmp_eq_i32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0xbf]
+s_cmp_eq_i32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x00,0xbf]
 
-s_cmp_eq_i32 s0, m0
-// CHECK: [0x00,0x7c,0x00,0xbf]
+s_cmp_eq_i32 s1, m0
+// CHECK: [0x01,0x7c,0x00,0xbf]
 
-s_cmp_eq_i32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0xbf]
+s_cmp_eq_i32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x00,0xbf]
 
-s_cmp_eq_i32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0xbf]
+s_cmp_eq_i32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x00,0xbf]
 
-s_cmp_eq_i32 s0, 0
-// CHECK: [0x00,0x80,0x00,0xbf]
+s_cmp_eq_i32 s1, 0
+// CHECK: [0x01,0x80,0x00,0xbf]
 
-s_cmp_eq_i32 s0, -1
-// CHECK: [0x00,0xc1,0x00,0xbf]
+s_cmp_eq_i32 s1, -1
+// CHECK: [0x01,0xc1,0x00,0xbf]
 
-s_cmp_eq_i32 s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0xbf]
+s_cmp_eq_i32 s1, 0.5
+// CHECK: [0x01,0xf0,0x00,0xbf]
 
-s_cmp_eq_i32 s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0xbf]
+s_cmp_eq_i32 s1, -4.0
+// CHECK: [0x01,0xf7,0x00,0xbf]
 
-s_cmp_eq_i32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_eq_i32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x00,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_eq_i32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_eq_i32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x00,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_lg_i32 s0, s0
-// CHECK: [0x00,0x00,0x01,0xbf]
+s_cmp_lg_i32 s1, s2
+// CHECK: [0x01,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 s103, s0
-// CHECK: [0x67,0x00,0x01,0xbf]
+s_cmp_lg_i32 s103, s2
+// CHECK: [0x67,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x01,0xbf]
+s_cmp_lg_i32 flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x01,0xbf]
+s_cmp_lg_i32 flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x01,0xbf]
+s_cmp_lg_i32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x01,0xbf]
+s_cmp_lg_i32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x01,0xbf]
+s_cmp_lg_i32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x01,0xbf]
+s_cmp_lg_i32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x01,0xbf]
+s_cmp_lg_i32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x01,0xbf]
+s_cmp_lg_i32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x01,0xbf]
+s_cmp_lg_i32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 m0, s0
-// CHECK: [0x7c,0x00,0x01,0xbf]
+s_cmp_lg_i32 m0, s2
+// CHECK: [0x7c,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x01,0xbf]
+s_cmp_lg_i32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x01,0xbf]
+s_cmp_lg_i32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 0, s0
-// CHECK: [0x80,0x00,0x01,0xbf]
+s_cmp_lg_i32 0, s2
+// CHECK: [0x80,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 -1, s0
-// CHECK: [0xc1,0x00,0x01,0xbf]
+s_cmp_lg_i32 -1, s2
+// CHECK: [0xc1,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 0.5, s0
-// CHECK: [0xf0,0x00,0x01,0xbf]
+s_cmp_lg_i32 0.5, s2
+// CHECK: [0xf0,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 -4.0, s0
-// CHECK: [0xf7,0x00,0x01,0xbf]
+s_cmp_lg_i32 -4.0, s2
+// CHECK: [0xf7,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x01,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_lg_i32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x01,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_lg_i32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x01,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_lg_i32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x01,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_lg_i32 s0, s103
-// CHECK: [0x00,0x67,0x01,0xbf]
+s_cmp_lg_i32 s1, s103
+// CHECK: [0x01,0x67,0x01,0xbf]
 
-s_cmp_lg_i32 s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x01,0xbf]
+s_cmp_lg_i32 s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x01,0xbf]
 
-s_cmp_lg_i32 s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x01,0xbf]
+s_cmp_lg_i32 s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x01,0xbf]
 
-s_cmp_lg_i32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x01,0xbf]
+s_cmp_lg_i32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x01,0xbf]
 
-s_cmp_lg_i32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x01,0xbf]
+s_cmp_lg_i32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x01,0xbf]
 
-s_cmp_lg_i32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x01,0xbf]
+s_cmp_lg_i32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x01,0xbf]
 
-s_cmp_lg_i32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x01,0xbf]
+s_cmp_lg_i32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x01,0xbf]
 
-s_cmp_lg_i32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x01,0xbf]
+s_cmp_lg_i32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x01,0xbf]
 
-s_cmp_lg_i32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x01,0xbf]
+s_cmp_lg_i32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x01,0xbf]
 
-s_cmp_lg_i32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x01,0xbf]
+s_cmp_lg_i32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x01,0xbf]
 
-s_cmp_lg_i32 s0, m0
-// CHECK: [0x00,0x7c,0x01,0xbf]
+s_cmp_lg_i32 s1, m0
+// CHECK: [0x01,0x7c,0x01,0xbf]
 
-s_cmp_lg_i32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x01,0xbf]
+s_cmp_lg_i32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x01,0xbf]
 
-s_cmp_lg_i32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x01,0xbf]
+s_cmp_lg_i32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x01,0xbf]
 
-s_cmp_lg_i32 s0, 0
-// CHECK: [0x00,0x80,0x01,0xbf]
+s_cmp_lg_i32 s1, 0
+// CHECK: [0x01,0x80,0x01,0xbf]
 
-s_cmp_lg_i32 s0, -1
-// CHECK: [0x00,0xc1,0x01,0xbf]
+s_cmp_lg_i32 s1, -1
+// CHECK: [0x01,0xc1,0x01,0xbf]
 
-s_cmp_lg_i32 s0, 0.5
-// CHECK: [0x00,0xf0,0x01,0xbf]
+s_cmp_lg_i32 s1, 0.5
+// CHECK: [0x01,0xf0,0x01,0xbf]
 
-s_cmp_lg_i32 s0, -4.0
-// CHECK: [0x00,0xf7,0x01,0xbf]
+s_cmp_lg_i32 s1, -4.0
+// CHECK: [0x01,0xf7,0x01,0xbf]
 
-s_cmp_lg_i32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x01,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_lg_i32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x01,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_lg_i32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x01,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_lg_i32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x01,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_gt_i32 s0, s0
-// CHECK: [0x00,0x00,0x02,0xbf]
+s_cmp_gt_i32 s1, s2
+// CHECK: [0x01,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 s103, s0
-// CHECK: [0x67,0x00,0x02,0xbf]
+s_cmp_gt_i32 s103, s2
+// CHECK: [0x67,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x02,0xbf]
+s_cmp_gt_i32 flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x02,0xbf]
+s_cmp_gt_i32 flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x02,0xbf]
+s_cmp_gt_i32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x02,0xbf]
+s_cmp_gt_i32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x02,0xbf]
+s_cmp_gt_i32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x02,0xbf]
+s_cmp_gt_i32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x02,0xbf]
+s_cmp_gt_i32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x02,0xbf]
+s_cmp_gt_i32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x02,0xbf]
+s_cmp_gt_i32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 m0, s0
-// CHECK: [0x7c,0x00,0x02,0xbf]
+s_cmp_gt_i32 m0, s2
+// CHECK: [0x7c,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x02,0xbf]
+s_cmp_gt_i32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x02,0xbf]
+s_cmp_gt_i32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 0, s0
-// CHECK: [0x80,0x00,0x02,0xbf]
+s_cmp_gt_i32 0, s2
+// CHECK: [0x80,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 -1, s0
-// CHECK: [0xc1,0x00,0x02,0xbf]
+s_cmp_gt_i32 -1, s2
+// CHECK: [0xc1,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 0.5, s0
-// CHECK: [0xf0,0x00,0x02,0xbf]
+s_cmp_gt_i32 0.5, s2
+// CHECK: [0xf0,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 -4.0, s0
-// CHECK: [0xf7,0x00,0x02,0xbf]
+s_cmp_gt_i32 -4.0, s2
+// CHECK: [0xf7,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x02,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_gt_i32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x02,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_gt_i32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x02,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_gt_i32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x02,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_gt_i32 s0, s103
-// CHECK: [0x00,0x67,0x02,0xbf]
+s_cmp_gt_i32 s1, s103
+// CHECK: [0x01,0x67,0x02,0xbf]
 
-s_cmp_gt_i32 s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x02,0xbf]
+s_cmp_gt_i32 s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x02,0xbf]
 
-s_cmp_gt_i32 s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x02,0xbf]
+s_cmp_gt_i32 s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x02,0xbf]
 
-s_cmp_gt_i32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x02,0xbf]
+s_cmp_gt_i32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x02,0xbf]
 
-s_cmp_gt_i32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x02,0xbf]
+s_cmp_gt_i32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x02,0xbf]
 
-s_cmp_gt_i32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x02,0xbf]
+s_cmp_gt_i32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x02,0xbf]
 
-s_cmp_gt_i32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x02,0xbf]
+s_cmp_gt_i32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x02,0xbf]
 
-s_cmp_gt_i32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x02,0xbf]
+s_cmp_gt_i32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x02,0xbf]
 
-s_cmp_gt_i32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x02,0xbf]
+s_cmp_gt_i32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x02,0xbf]
 
-s_cmp_gt_i32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x02,0xbf]
+s_cmp_gt_i32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x02,0xbf]
 
-s_cmp_gt_i32 s0, m0
-// CHECK: [0x00,0x7c,0x02,0xbf]
+s_cmp_gt_i32 s1, m0
+// CHECK: [0x01,0x7c,0x02,0xbf]
 
-s_cmp_gt_i32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x02,0xbf]
+s_cmp_gt_i32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x02,0xbf]
 
-s_cmp_gt_i32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x02,0xbf]
+s_cmp_gt_i32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x02,0xbf]
 
-s_cmp_gt_i32 s0, 0
-// CHECK: [0x00,0x80,0x02,0xbf]
+s_cmp_gt_i32 s1, 0
+// CHECK: [0x01,0x80,0x02,0xbf]
 
-s_cmp_gt_i32 s0, -1
-// CHECK: [0x00,0xc1,0x02,0xbf]
+s_cmp_gt_i32 s1, -1
+// CHECK: [0x01,0xc1,0x02,0xbf]
 
-s_cmp_gt_i32 s0, 0.5
-// CHECK: [0x00,0xf0,0x02,0xbf]
+s_cmp_gt_i32 s1, 0.5
+// CHECK: [0x01,0xf0,0x02,0xbf]
 
-s_cmp_gt_i32 s0, -4.0
-// CHECK: [0x00,0xf7,0x02,0xbf]
+s_cmp_gt_i32 s1, -4.0
+// CHECK: [0x01,0xf7,0x02,0xbf]
 
-s_cmp_gt_i32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x02,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_gt_i32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x02,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_gt_i32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x02,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_gt_i32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x02,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_ge_i32 s0, s0
-// CHECK: [0x00,0x00,0x03,0xbf]
+s_cmp_ge_i32 s1, s2
+// CHECK: [0x01,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 s103, s0
-// CHECK: [0x67,0x00,0x03,0xbf]
+s_cmp_ge_i32 s103, s2
+// CHECK: [0x67,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x03,0xbf]
+s_cmp_ge_i32 flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x03,0xbf]
+s_cmp_ge_i32 flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x03,0xbf]
+s_cmp_ge_i32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x03,0xbf]
+s_cmp_ge_i32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x03,0xbf]
+s_cmp_ge_i32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x03,0xbf]
+s_cmp_ge_i32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x03,0xbf]
+s_cmp_ge_i32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x03,0xbf]
+s_cmp_ge_i32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x03,0xbf]
+s_cmp_ge_i32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 m0, s0
-// CHECK: [0x7c,0x00,0x03,0xbf]
+s_cmp_ge_i32 m0, s2
+// CHECK: [0x7c,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x03,0xbf]
+s_cmp_ge_i32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x03,0xbf]
+s_cmp_ge_i32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 0, s0
-// CHECK: [0x80,0x00,0x03,0xbf]
+s_cmp_ge_i32 0, s2
+// CHECK: [0x80,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 -1, s0
-// CHECK: [0xc1,0x00,0x03,0xbf]
+s_cmp_ge_i32 -1, s2
+// CHECK: [0xc1,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 0.5, s0
-// CHECK: [0xf0,0x00,0x03,0xbf]
+s_cmp_ge_i32 0.5, s2
+// CHECK: [0xf0,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 -4.0, s0
-// CHECK: [0xf7,0x00,0x03,0xbf]
+s_cmp_ge_i32 -4.0, s2
+// CHECK: [0xf7,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x03,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_ge_i32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x03,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_ge_i32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x03,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_ge_i32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x03,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_ge_i32 s0, s103
-// CHECK: [0x00,0x67,0x03,0xbf]
+s_cmp_ge_i32 s1, s103
+// CHECK: [0x01,0x67,0x03,0xbf]
 
-s_cmp_ge_i32 s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x03,0xbf]
+s_cmp_ge_i32 s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x03,0xbf]
 
-s_cmp_ge_i32 s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x03,0xbf]
+s_cmp_ge_i32 s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x03,0xbf]
 
-s_cmp_ge_i32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x03,0xbf]
+s_cmp_ge_i32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x03,0xbf]
 
-s_cmp_ge_i32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x03,0xbf]
+s_cmp_ge_i32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x03,0xbf]
 
-s_cmp_ge_i32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x03,0xbf]
+s_cmp_ge_i32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x03,0xbf]
 
-s_cmp_ge_i32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x03,0xbf]
+s_cmp_ge_i32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x03,0xbf]
 
-s_cmp_ge_i32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x03,0xbf]
+s_cmp_ge_i32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x03,0xbf]
 
-s_cmp_ge_i32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x03,0xbf]
+s_cmp_ge_i32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x03,0xbf]
 
-s_cmp_ge_i32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x03,0xbf]
+s_cmp_ge_i32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x03,0xbf]
 
-s_cmp_ge_i32 s0, m0
-// CHECK: [0x00,0x7c,0x03,0xbf]
+s_cmp_ge_i32 s1, m0
+// CHECK: [0x01,0x7c,0x03,0xbf]
 
-s_cmp_ge_i32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x03,0xbf]
+s_cmp_ge_i32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x03,0xbf]
 
-s_cmp_ge_i32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x03,0xbf]
+s_cmp_ge_i32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x03,0xbf]
 
-s_cmp_ge_i32 s0, 0
-// CHECK: [0x00,0x80,0x03,0xbf]
+s_cmp_ge_i32 s1, 0
+// CHECK: [0x01,0x80,0x03,0xbf]
 
-s_cmp_ge_i32 s0, -1
-// CHECK: [0x00,0xc1,0x03,0xbf]
+s_cmp_ge_i32 s1, -1
+// CHECK: [0x01,0xc1,0x03,0xbf]
 
-s_cmp_ge_i32 s0, 0.5
-// CHECK: [0x00,0xf0,0x03,0xbf]
+s_cmp_ge_i32 s1, 0.5
+// CHECK: [0x01,0xf0,0x03,0xbf]
 
-s_cmp_ge_i32 s0, -4.0
-// CHECK: [0x00,0xf7,0x03,0xbf]
+s_cmp_ge_i32 s1, -4.0
+// CHECK: [0x01,0xf7,0x03,0xbf]
 
-s_cmp_ge_i32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x03,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_ge_i32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x03,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_ge_i32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x03,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_ge_i32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x03,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_lt_i32 s0, s0
-// CHECK: [0x00,0x00,0x04,0xbf]
+s_cmp_lt_i32 s1, s2
+// CHECK: [0x01,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 s103, s0
-// CHECK: [0x67,0x00,0x04,0xbf]
+s_cmp_lt_i32 s103, s2
+// CHECK: [0x67,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x04,0xbf]
+s_cmp_lt_i32 flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x04,0xbf]
+s_cmp_lt_i32 flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x04,0xbf]
+s_cmp_lt_i32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x04,0xbf]
+s_cmp_lt_i32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x04,0xbf]
+s_cmp_lt_i32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x04,0xbf]
+s_cmp_lt_i32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x04,0xbf]
+s_cmp_lt_i32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x04,0xbf]
+s_cmp_lt_i32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x04,0xbf]
+s_cmp_lt_i32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 m0, s0
-// CHECK: [0x7c,0x00,0x04,0xbf]
+s_cmp_lt_i32 m0, s2
+// CHECK: [0x7c,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x04,0xbf]
+s_cmp_lt_i32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x04,0xbf]
+s_cmp_lt_i32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 0, s0
-// CHECK: [0x80,0x00,0x04,0xbf]
+s_cmp_lt_i32 0, s2
+// CHECK: [0x80,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 -1, s0
-// CHECK: [0xc1,0x00,0x04,0xbf]
+s_cmp_lt_i32 -1, s2
+// CHECK: [0xc1,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 0.5, s0
-// CHECK: [0xf0,0x00,0x04,0xbf]
+s_cmp_lt_i32 0.5, s2
+// CHECK: [0xf0,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 -4.0, s0
-// CHECK: [0xf7,0x00,0x04,0xbf]
+s_cmp_lt_i32 -4.0, s2
+// CHECK: [0xf7,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x04,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_lt_i32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x04,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_lt_i32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x04,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_lt_i32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x04,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_lt_i32 s0, s103
-// CHECK: [0x00,0x67,0x04,0xbf]
+s_cmp_lt_i32 s1, s103
+// CHECK: [0x01,0x67,0x04,0xbf]
 
-s_cmp_lt_i32 s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x04,0xbf]
+s_cmp_lt_i32 s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x04,0xbf]
 
-s_cmp_lt_i32 s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x04,0xbf]
+s_cmp_lt_i32 s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x04,0xbf]
 
-s_cmp_lt_i32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x04,0xbf]
+s_cmp_lt_i32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x04,0xbf]
 
-s_cmp_lt_i32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x04,0xbf]
+s_cmp_lt_i32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x04,0xbf]
 
-s_cmp_lt_i32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x04,0xbf]
+s_cmp_lt_i32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x04,0xbf]
 
-s_cmp_lt_i32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x04,0xbf]
+s_cmp_lt_i32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x04,0xbf]
 
-s_cmp_lt_i32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x04,0xbf]
+s_cmp_lt_i32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x04,0xbf]
 
-s_cmp_lt_i32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x04,0xbf]
+s_cmp_lt_i32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x04,0xbf]
 
-s_cmp_lt_i32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x04,0xbf]
+s_cmp_lt_i32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x04,0xbf]
 
-s_cmp_lt_i32 s0, m0
-// CHECK: [0x00,0x7c,0x04,0xbf]
+s_cmp_lt_i32 s1, m0
+// CHECK: [0x01,0x7c,0x04,0xbf]
 
-s_cmp_lt_i32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x04,0xbf]
+s_cmp_lt_i32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x04,0xbf]
 
-s_cmp_lt_i32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x04,0xbf]
+s_cmp_lt_i32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x04,0xbf]
 
-s_cmp_lt_i32 s0, 0
-// CHECK: [0x00,0x80,0x04,0xbf]
+s_cmp_lt_i32 s1, 0
+// CHECK: [0x01,0x80,0x04,0xbf]
 
-s_cmp_lt_i32 s0, -1
-// CHECK: [0x00,0xc1,0x04,0xbf]
+s_cmp_lt_i32 s1, -1
+// CHECK: [0x01,0xc1,0x04,0xbf]
 
-s_cmp_lt_i32 s0, 0.5
-// CHECK: [0x00,0xf0,0x04,0xbf]
+s_cmp_lt_i32 s1, 0.5
+// CHECK: [0x01,0xf0,0x04,0xbf]
 
-s_cmp_lt_i32 s0, -4.0
-// CHECK: [0x00,0xf7,0x04,0xbf]
+s_cmp_lt_i32 s1, -4.0
+// CHECK: [0x01,0xf7,0x04,0xbf]
 
-s_cmp_lt_i32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x04,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_lt_i32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x04,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_lt_i32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x04,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_lt_i32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x04,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_le_i32 s0, s0
-// CHECK: [0x00,0x00,0x05,0xbf]
+s_cmp_le_i32 s1, s2
+// CHECK: [0x01,0x02,0x05,0xbf]
 
-s_cmp_le_i32 s103, s0
-// CHECK: [0x67,0x00,0x05,0xbf]
+s_cmp_le_i32 s103, s2
+// CHECK: [0x67,0x02,0x05,0xbf]
 
-s_cmp_le_i32 flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x05,0xbf]
+s_cmp_le_i32 flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x05,0xbf]
 
-s_cmp_le_i32 flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x05,0xbf]
+s_cmp_le_i32 flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x05,0xbf]
 
-s_cmp_le_i32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x05,0xbf]
+s_cmp_le_i32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0xbf]
 
-s_cmp_le_i32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x05,0xbf]
+s_cmp_le_i32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0xbf]
 
-s_cmp_le_i32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x05,0xbf]
+s_cmp_le_i32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0xbf]
 
-s_cmp_le_i32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x05,0xbf]
+s_cmp_le_i32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0xbf]
 
-s_cmp_le_i32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x05,0xbf]
+s_cmp_le_i32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0xbf]
 
-s_cmp_le_i32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x05,0xbf]
+s_cmp_le_i32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0xbf]
 
-s_cmp_le_i32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x05,0xbf]
+s_cmp_le_i32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0xbf]
 
-s_cmp_le_i32 m0, s0
-// CHECK: [0x7c,0x00,0x05,0xbf]
+s_cmp_le_i32 m0, s2
+// CHECK: [0x7c,0x02,0x05,0xbf]
 
-s_cmp_le_i32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x05,0xbf]
+s_cmp_le_i32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0xbf]
 
-s_cmp_le_i32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x05,0xbf]
+s_cmp_le_i32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0xbf]
 
-s_cmp_le_i32 0, s0
-// CHECK: [0x80,0x00,0x05,0xbf]
+s_cmp_le_i32 0, s2
+// CHECK: [0x80,0x02,0x05,0xbf]
 
-s_cmp_le_i32 -1, s0
-// CHECK: [0xc1,0x00,0x05,0xbf]
+s_cmp_le_i32 -1, s2
+// CHECK: [0xc1,0x02,0x05,0xbf]
 
-s_cmp_le_i32 0.5, s0
-// CHECK: [0xf0,0x00,0x05,0xbf]
+s_cmp_le_i32 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0xbf]
 
-s_cmp_le_i32 -4.0, s0
-// CHECK: [0xf7,0x00,0x05,0xbf]
+s_cmp_le_i32 -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0xbf]
 
-s_cmp_le_i32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x05,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_le_i32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_le_i32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x05,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_le_i32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_le_i32 s0, s103
-// CHECK: [0x00,0x67,0x05,0xbf]
+s_cmp_le_i32 s1, s103
+// CHECK: [0x01,0x67,0x05,0xbf]
 
-s_cmp_le_i32 s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x05,0xbf]
+s_cmp_le_i32 s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x05,0xbf]
 
-s_cmp_le_i32 s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x05,0xbf]
+s_cmp_le_i32 s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x05,0xbf]
 
-s_cmp_le_i32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x05,0xbf]
+s_cmp_le_i32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0xbf]
 
-s_cmp_le_i32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x05,0xbf]
+s_cmp_le_i32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0xbf]
 
-s_cmp_le_i32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x05,0xbf]
+s_cmp_le_i32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0xbf]
 
-s_cmp_le_i32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x05,0xbf]
+s_cmp_le_i32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0xbf]
 
-s_cmp_le_i32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x05,0xbf]
+s_cmp_le_i32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0xbf]
 
-s_cmp_le_i32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x05,0xbf]
+s_cmp_le_i32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0xbf]
 
-s_cmp_le_i32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x05,0xbf]
+s_cmp_le_i32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0xbf]
 
-s_cmp_le_i32 s0, m0
-// CHECK: [0x00,0x7c,0x05,0xbf]
+s_cmp_le_i32 s1, m0
+// CHECK: [0x01,0x7c,0x05,0xbf]
 
-s_cmp_le_i32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x05,0xbf]
+s_cmp_le_i32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0xbf]
 
-s_cmp_le_i32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x05,0xbf]
+s_cmp_le_i32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0xbf]
 
-s_cmp_le_i32 s0, 0
-// CHECK: [0x00,0x80,0x05,0xbf]
+s_cmp_le_i32 s1, 0
+// CHECK: [0x01,0x80,0x05,0xbf]
 
-s_cmp_le_i32 s0, -1
-// CHECK: [0x00,0xc1,0x05,0xbf]
+s_cmp_le_i32 s1, -1
+// CHECK: [0x01,0xc1,0x05,0xbf]
 
-s_cmp_le_i32 s0, 0.5
-// CHECK: [0x00,0xf0,0x05,0xbf]
+s_cmp_le_i32 s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0xbf]
 
-s_cmp_le_i32 s0, -4.0
-// CHECK: [0x00,0xf7,0x05,0xbf]
+s_cmp_le_i32 s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0xbf]
 
-s_cmp_le_i32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x05,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_le_i32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_le_i32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x05,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_le_i32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_eq_u32 s0, s0
-// CHECK: [0x00,0x00,0x06,0xbf]
+s_cmp_eq_u32 s1, s2
+// CHECK: [0x01,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 s103, s0
-// CHECK: [0x67,0x00,0x06,0xbf]
+s_cmp_eq_u32 s103, s2
+// CHECK: [0x67,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x06,0xbf]
+s_cmp_eq_u32 flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x06,0xbf]
+s_cmp_eq_u32 flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x06,0xbf]
+s_cmp_eq_u32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x06,0xbf]
+s_cmp_eq_u32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x06,0xbf]
+s_cmp_eq_u32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x06,0xbf]
+s_cmp_eq_u32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x06,0xbf]
+s_cmp_eq_u32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x06,0xbf]
+s_cmp_eq_u32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x06,0xbf]
+s_cmp_eq_u32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 m0, s0
-// CHECK: [0x7c,0x00,0x06,0xbf]
+s_cmp_eq_u32 m0, s2
+// CHECK: [0x7c,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x06,0xbf]
+s_cmp_eq_u32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x06,0xbf]
+s_cmp_eq_u32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 0, s0
-// CHECK: [0x80,0x00,0x06,0xbf]
+s_cmp_eq_u32 0, s2
+// CHECK: [0x80,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 -1, s0
-// CHECK: [0xc1,0x00,0x06,0xbf]
+s_cmp_eq_u32 -1, s2
+// CHECK: [0xc1,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 0.5, s0
-// CHECK: [0xf0,0x00,0x06,0xbf]
+s_cmp_eq_u32 0.5, s2
+// CHECK: [0xf0,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 -4.0, s0
-// CHECK: [0xf7,0x00,0x06,0xbf]
+s_cmp_eq_u32 -4.0, s2
+// CHECK: [0xf7,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x06,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_eq_u32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x06,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_eq_u32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x06,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_eq_u32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x06,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_eq_u32 s0, s103
-// CHECK: [0x00,0x67,0x06,0xbf]
+s_cmp_eq_u32 s1, s103
+// CHECK: [0x01,0x67,0x06,0xbf]
 
-s_cmp_eq_u32 s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x06,0xbf]
+s_cmp_eq_u32 s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x06,0xbf]
 
-s_cmp_eq_u32 s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x06,0xbf]
+s_cmp_eq_u32 s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x06,0xbf]
 
-s_cmp_eq_u32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x06,0xbf]
+s_cmp_eq_u32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x06,0xbf]
 
-s_cmp_eq_u32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x06,0xbf]
+s_cmp_eq_u32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x06,0xbf]
 
-s_cmp_eq_u32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x06,0xbf]
+s_cmp_eq_u32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x06,0xbf]
 
-s_cmp_eq_u32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x06,0xbf]
+s_cmp_eq_u32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x06,0xbf]
 
-s_cmp_eq_u32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x06,0xbf]
+s_cmp_eq_u32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x06,0xbf]
 
-s_cmp_eq_u32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x06,0xbf]
+s_cmp_eq_u32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x06,0xbf]
 
-s_cmp_eq_u32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x06,0xbf]
+s_cmp_eq_u32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x06,0xbf]
 
-s_cmp_eq_u32 s0, m0
-// CHECK: [0x00,0x7c,0x06,0xbf]
+s_cmp_eq_u32 s1, m0
+// CHECK: [0x01,0x7c,0x06,0xbf]
 
-s_cmp_eq_u32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x06,0xbf]
+s_cmp_eq_u32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x06,0xbf]
 
-s_cmp_eq_u32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x06,0xbf]
+s_cmp_eq_u32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x06,0xbf]
 
-s_cmp_eq_u32 s0, 0
-// CHECK: [0x00,0x80,0x06,0xbf]
+s_cmp_eq_u32 s1, 0
+// CHECK: [0x01,0x80,0x06,0xbf]
 
-s_cmp_eq_u32 s0, -1
-// CHECK: [0x00,0xc1,0x06,0xbf]
+s_cmp_eq_u32 s1, -1
+// CHECK: [0x01,0xc1,0x06,0xbf]
 
-s_cmp_eq_u32 s0, 0.5
-// CHECK: [0x00,0xf0,0x06,0xbf]
+s_cmp_eq_u32 s1, 0.5
+// CHECK: [0x01,0xf0,0x06,0xbf]
 
-s_cmp_eq_u32 s0, -4.0
-// CHECK: [0x00,0xf7,0x06,0xbf]
+s_cmp_eq_u32 s1, -4.0
+// CHECK: [0x01,0xf7,0x06,0xbf]
 
-s_cmp_eq_u32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x06,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_eq_u32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x06,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_eq_u32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x06,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_eq_u32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x06,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_lg_u32 s0, s0
-// CHECK: [0x00,0x00,0x07,0xbf]
+s_cmp_lg_u32 s1, s2
+// CHECK: [0x01,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 s103, s0
-// CHECK: [0x67,0x00,0x07,0xbf]
+s_cmp_lg_u32 s103, s2
+// CHECK: [0x67,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x07,0xbf]
+s_cmp_lg_u32 flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x07,0xbf]
+s_cmp_lg_u32 flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x07,0xbf]
+s_cmp_lg_u32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x07,0xbf]
+s_cmp_lg_u32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x07,0xbf]
+s_cmp_lg_u32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x07,0xbf]
+s_cmp_lg_u32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x07,0xbf]
+s_cmp_lg_u32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x07,0xbf]
+s_cmp_lg_u32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x07,0xbf]
+s_cmp_lg_u32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 m0, s0
-// CHECK: [0x7c,0x00,0x07,0xbf]
+s_cmp_lg_u32 m0, s2
+// CHECK: [0x7c,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x07,0xbf]
+s_cmp_lg_u32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x07,0xbf]
+s_cmp_lg_u32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 0, s0
-// CHECK: [0x80,0x00,0x07,0xbf]
+s_cmp_lg_u32 0, s2
+// CHECK: [0x80,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 -1, s0
-// CHECK: [0xc1,0x00,0x07,0xbf]
+s_cmp_lg_u32 -1, s2
+// CHECK: [0xc1,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 0.5, s0
-// CHECK: [0xf0,0x00,0x07,0xbf]
+s_cmp_lg_u32 0.5, s2
+// CHECK: [0xf0,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 -4.0, s0
-// CHECK: [0xf7,0x00,0x07,0xbf]
+s_cmp_lg_u32 -4.0, s2
+// CHECK: [0xf7,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x07,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_lg_u32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x07,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_lg_u32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x07,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_lg_u32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x07,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_lg_u32 s0, s103
-// CHECK: [0x00,0x67,0x07,0xbf]
+s_cmp_lg_u32 s1, s103
+// CHECK: [0x01,0x67,0x07,0xbf]
 
-s_cmp_lg_u32 s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x07,0xbf]
+s_cmp_lg_u32 s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x07,0xbf]
 
-s_cmp_lg_u32 s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x07,0xbf]
+s_cmp_lg_u32 s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x07,0xbf]
 
-s_cmp_lg_u32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x07,0xbf]
+s_cmp_lg_u32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x07,0xbf]
 
-s_cmp_lg_u32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x07,0xbf]
+s_cmp_lg_u32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x07,0xbf]
 
-s_cmp_lg_u32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x07,0xbf]
+s_cmp_lg_u32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x07,0xbf]
 
-s_cmp_lg_u32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x07,0xbf]
+s_cmp_lg_u32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x07,0xbf]
 
-s_cmp_lg_u32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x07,0xbf]
+s_cmp_lg_u32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x07,0xbf]
 
-s_cmp_lg_u32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x07,0xbf]
+s_cmp_lg_u32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x07,0xbf]
 
-s_cmp_lg_u32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x07,0xbf]
+s_cmp_lg_u32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x07,0xbf]
 
-s_cmp_lg_u32 s0, m0
-// CHECK: [0x00,0x7c,0x07,0xbf]
+s_cmp_lg_u32 s1, m0
+// CHECK: [0x01,0x7c,0x07,0xbf]
 
-s_cmp_lg_u32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x07,0xbf]
+s_cmp_lg_u32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x07,0xbf]
 
-s_cmp_lg_u32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x07,0xbf]
+s_cmp_lg_u32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x07,0xbf]
 
-s_cmp_lg_u32 s0, 0
-// CHECK: [0x00,0x80,0x07,0xbf]
+s_cmp_lg_u32 s1, 0
+// CHECK: [0x01,0x80,0x07,0xbf]
 
-s_cmp_lg_u32 s0, -1
-// CHECK: [0x00,0xc1,0x07,0xbf]
+s_cmp_lg_u32 s1, -1
+// CHECK: [0x01,0xc1,0x07,0xbf]
 
-s_cmp_lg_u32 s0, 0.5
-// CHECK: [0x00,0xf0,0x07,0xbf]
+s_cmp_lg_u32 s1, 0.5
+// CHECK: [0x01,0xf0,0x07,0xbf]
 
-s_cmp_lg_u32 s0, -4.0
-// CHECK: [0x00,0xf7,0x07,0xbf]
+s_cmp_lg_u32 s1, -4.0
+// CHECK: [0x01,0xf7,0x07,0xbf]
 
-s_cmp_lg_u32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x07,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_lg_u32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x07,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_lg_u32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x07,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_lg_u32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x07,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_gt_u32 s0, s0
-// CHECK: [0x00,0x00,0x08,0xbf]
+s_cmp_gt_u32 s1, s2
+// CHECK: [0x01,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 s103, s0
-// CHECK: [0x67,0x00,0x08,0xbf]
+s_cmp_gt_u32 s103, s2
+// CHECK: [0x67,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x08,0xbf]
+s_cmp_gt_u32 flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x08,0xbf]
+s_cmp_gt_u32 flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x08,0xbf]
+s_cmp_gt_u32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x08,0xbf]
+s_cmp_gt_u32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x08,0xbf]
+s_cmp_gt_u32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x08,0xbf]
+s_cmp_gt_u32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x08,0xbf]
+s_cmp_gt_u32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x08,0xbf]
+s_cmp_gt_u32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x08,0xbf]
+s_cmp_gt_u32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 m0, s0
-// CHECK: [0x7c,0x00,0x08,0xbf]
+s_cmp_gt_u32 m0, s2
+// CHECK: [0x7c,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x08,0xbf]
+s_cmp_gt_u32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x08,0xbf]
+s_cmp_gt_u32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 0, s0
-// CHECK: [0x80,0x00,0x08,0xbf]
+s_cmp_gt_u32 0, s2
+// CHECK: [0x80,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 -1, s0
-// CHECK: [0xc1,0x00,0x08,0xbf]
+s_cmp_gt_u32 -1, s2
+// CHECK: [0xc1,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 0.5, s0
-// CHECK: [0xf0,0x00,0x08,0xbf]
+s_cmp_gt_u32 0.5, s2
+// CHECK: [0xf0,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 -4.0, s0
-// CHECK: [0xf7,0x00,0x08,0xbf]
+s_cmp_gt_u32 -4.0, s2
+// CHECK: [0xf7,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x08,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_gt_u32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x08,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_gt_u32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x08,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_gt_u32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x08,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_gt_u32 s0, s103
-// CHECK: [0x00,0x67,0x08,0xbf]
+s_cmp_gt_u32 s1, s103
+// CHECK: [0x01,0x67,0x08,0xbf]
 
-s_cmp_gt_u32 s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x08,0xbf]
+s_cmp_gt_u32 s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x08,0xbf]
 
-s_cmp_gt_u32 s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x08,0xbf]
+s_cmp_gt_u32 s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x08,0xbf]
 
-s_cmp_gt_u32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x08,0xbf]
+s_cmp_gt_u32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x08,0xbf]
 
-s_cmp_gt_u32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x08,0xbf]
+s_cmp_gt_u32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x08,0xbf]
 
-s_cmp_gt_u32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x08,0xbf]
+s_cmp_gt_u32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x08,0xbf]
 
-s_cmp_gt_u32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x08,0xbf]
+s_cmp_gt_u32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x08,0xbf]
 
-s_cmp_gt_u32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x08,0xbf]
+s_cmp_gt_u32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x08,0xbf]
 
-s_cmp_gt_u32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x08,0xbf]
+s_cmp_gt_u32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x08,0xbf]
 
-s_cmp_gt_u32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x08,0xbf]
+s_cmp_gt_u32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x08,0xbf]
 
-s_cmp_gt_u32 s0, m0
-// CHECK: [0x00,0x7c,0x08,0xbf]
+s_cmp_gt_u32 s1, m0
+// CHECK: [0x01,0x7c,0x08,0xbf]
 
-s_cmp_gt_u32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x08,0xbf]
+s_cmp_gt_u32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x08,0xbf]
 
-s_cmp_gt_u32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x08,0xbf]
+s_cmp_gt_u32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x08,0xbf]
 
-s_cmp_gt_u32 s0, 0
-// CHECK: [0x00,0x80,0x08,0xbf]
+s_cmp_gt_u32 s1, 0
+// CHECK: [0x01,0x80,0x08,0xbf]
 
-s_cmp_gt_u32 s0, -1
-// CHECK: [0x00,0xc1,0x08,0xbf]
+s_cmp_gt_u32 s1, -1
+// CHECK: [0x01,0xc1,0x08,0xbf]
 
-s_cmp_gt_u32 s0, 0.5
-// CHECK: [0x00,0xf0,0x08,0xbf]
+s_cmp_gt_u32 s1, 0.5
+// CHECK: [0x01,0xf0,0x08,0xbf]
 
-s_cmp_gt_u32 s0, -4.0
-// CHECK: [0x00,0xf7,0x08,0xbf]
+s_cmp_gt_u32 s1, -4.0
+// CHECK: [0x01,0xf7,0x08,0xbf]
 
-s_cmp_gt_u32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x08,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_gt_u32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x08,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_gt_u32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x08,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_gt_u32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x08,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_ge_u32 s0, s0
-// CHECK: [0x00,0x00,0x09,0xbf]
+s_cmp_ge_u32 s1, s2
+// CHECK: [0x01,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 s103, s0
-// CHECK: [0x67,0x00,0x09,0xbf]
+s_cmp_ge_u32 s103, s2
+// CHECK: [0x67,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x09,0xbf]
+s_cmp_ge_u32 flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x09,0xbf]
+s_cmp_ge_u32 flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x09,0xbf]
+s_cmp_ge_u32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x09,0xbf]
+s_cmp_ge_u32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x09,0xbf]
+s_cmp_ge_u32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x09,0xbf]
+s_cmp_ge_u32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x09,0xbf]
+s_cmp_ge_u32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x09,0xbf]
+s_cmp_ge_u32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x09,0xbf]
+s_cmp_ge_u32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 m0, s0
-// CHECK: [0x7c,0x00,0x09,0xbf]
+s_cmp_ge_u32 m0, s2
+// CHECK: [0x7c,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x09,0xbf]
+s_cmp_ge_u32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x09,0xbf]
+s_cmp_ge_u32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 0, s0
-// CHECK: [0x80,0x00,0x09,0xbf]
+s_cmp_ge_u32 0, s2
+// CHECK: [0x80,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 -1, s0
-// CHECK: [0xc1,0x00,0x09,0xbf]
+s_cmp_ge_u32 -1, s2
+// CHECK: [0xc1,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 0.5, s0
-// CHECK: [0xf0,0x00,0x09,0xbf]
+s_cmp_ge_u32 0.5, s2
+// CHECK: [0xf0,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 -4.0, s0
-// CHECK: [0xf7,0x00,0x09,0xbf]
+s_cmp_ge_u32 -4.0, s2
+// CHECK: [0xf7,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x09,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_ge_u32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x09,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_ge_u32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x09,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_ge_u32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x09,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_ge_u32 s0, s103
-// CHECK: [0x00,0x67,0x09,0xbf]
+s_cmp_ge_u32 s1, s103
+// CHECK: [0x01,0x67,0x09,0xbf]
 
-s_cmp_ge_u32 s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x09,0xbf]
+s_cmp_ge_u32 s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x09,0xbf]
 
-s_cmp_ge_u32 s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x09,0xbf]
+s_cmp_ge_u32 s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x09,0xbf]
 
-s_cmp_ge_u32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x09,0xbf]
+s_cmp_ge_u32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x09,0xbf]
 
-s_cmp_ge_u32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x09,0xbf]
+s_cmp_ge_u32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x09,0xbf]
 
-s_cmp_ge_u32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x09,0xbf]
+s_cmp_ge_u32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x09,0xbf]
 
-s_cmp_ge_u32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x09,0xbf]
+s_cmp_ge_u32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x09,0xbf]
 
-s_cmp_ge_u32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x09,0xbf]
+s_cmp_ge_u32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x09,0xbf]
 
-s_cmp_ge_u32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x09,0xbf]
+s_cmp_ge_u32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x09,0xbf]
 
-s_cmp_ge_u32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x09,0xbf]
+s_cmp_ge_u32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x09,0xbf]
 
-s_cmp_ge_u32 s0, m0
-// CHECK: [0x00,0x7c,0x09,0xbf]
+s_cmp_ge_u32 s1, m0
+// CHECK: [0x01,0x7c,0x09,0xbf]
 
-s_cmp_ge_u32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x09,0xbf]
+s_cmp_ge_u32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x09,0xbf]
 
-s_cmp_ge_u32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x09,0xbf]
+s_cmp_ge_u32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x09,0xbf]
 
-s_cmp_ge_u32 s0, 0
-// CHECK: [0x00,0x80,0x09,0xbf]
+s_cmp_ge_u32 s1, 0
+// CHECK: [0x01,0x80,0x09,0xbf]
 
-s_cmp_ge_u32 s0, -1
-// CHECK: [0x00,0xc1,0x09,0xbf]
+s_cmp_ge_u32 s1, -1
+// CHECK: [0x01,0xc1,0x09,0xbf]
 
-s_cmp_ge_u32 s0, 0.5
-// CHECK: [0x00,0xf0,0x09,0xbf]
+s_cmp_ge_u32 s1, 0.5
+// CHECK: [0x01,0xf0,0x09,0xbf]
 
-s_cmp_ge_u32 s0, -4.0
-// CHECK: [0x00,0xf7,0x09,0xbf]
+s_cmp_ge_u32 s1, -4.0
+// CHECK: [0x01,0xf7,0x09,0xbf]
 
-s_cmp_ge_u32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x09,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_ge_u32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x09,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_ge_u32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x09,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_ge_u32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x09,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_lt_u32 s0, s0
-// CHECK: [0x00,0x00,0x0a,0xbf]
+s_cmp_lt_u32 s1, s2
+// CHECK: [0x01,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 s103, s0
-// CHECK: [0x67,0x00,0x0a,0xbf]
+s_cmp_lt_u32 s103, s2
+// CHECK: [0x67,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x0a,0xbf]
+s_cmp_lt_u32 flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x0a,0xbf]
+s_cmp_lt_u32 flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x0a,0xbf]
+s_cmp_lt_u32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x0a,0xbf]
+s_cmp_lt_u32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x0a,0xbf]
+s_cmp_lt_u32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x0a,0xbf]
+s_cmp_lt_u32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x0a,0xbf]
+s_cmp_lt_u32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x0a,0xbf]
+s_cmp_lt_u32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x0a,0xbf]
+s_cmp_lt_u32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 m0, s0
-// CHECK: [0x7c,0x00,0x0a,0xbf]
+s_cmp_lt_u32 m0, s2
+// CHECK: [0x7c,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x0a,0xbf]
+s_cmp_lt_u32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x0a,0xbf]
+s_cmp_lt_u32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 0, s0
-// CHECK: [0x80,0x00,0x0a,0xbf]
+s_cmp_lt_u32 0, s2
+// CHECK: [0x80,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 -1, s0
-// CHECK: [0xc1,0x00,0x0a,0xbf]
+s_cmp_lt_u32 -1, s2
+// CHECK: [0xc1,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 0.5, s0
-// CHECK: [0xf0,0x00,0x0a,0xbf]
+s_cmp_lt_u32 0.5, s2
+// CHECK: [0xf0,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 -4.0, s0
-// CHECK: [0xf7,0x00,0x0a,0xbf]
+s_cmp_lt_u32 -4.0, s2
+// CHECK: [0xf7,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x0a,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_lt_u32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x0a,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_lt_u32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x0a,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_lt_u32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x0a,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_lt_u32 s0, s103
-// CHECK: [0x00,0x67,0x0a,0xbf]
+s_cmp_lt_u32 s1, s103
+// CHECK: [0x01,0x67,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x0a,0xbf]
+s_cmp_lt_u32 s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x0a,0xbf]
+s_cmp_lt_u32 s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x0a,0xbf]
+s_cmp_lt_u32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x0a,0xbf]
+s_cmp_lt_u32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x0a,0xbf]
+s_cmp_lt_u32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x0a,0xbf]
+s_cmp_lt_u32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x0a,0xbf]
+s_cmp_lt_u32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x0a,0xbf]
+s_cmp_lt_u32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x0a,0xbf]
+s_cmp_lt_u32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, m0
-// CHECK: [0x00,0x7c,0x0a,0xbf]
+s_cmp_lt_u32 s1, m0
+// CHECK: [0x01,0x7c,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x0a,0xbf]
+s_cmp_lt_u32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x0a,0xbf]
+s_cmp_lt_u32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, 0
-// CHECK: [0x00,0x80,0x0a,0xbf]
+s_cmp_lt_u32 s1, 0
+// CHECK: [0x01,0x80,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, -1
-// CHECK: [0x00,0xc1,0x0a,0xbf]
+s_cmp_lt_u32 s1, -1
+// CHECK: [0x01,0xc1,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, 0.5
-// CHECK: [0x00,0xf0,0x0a,0xbf]
+s_cmp_lt_u32 s1, 0.5
+// CHECK: [0x01,0xf0,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, -4.0
-// CHECK: [0x00,0xf7,0x0a,0xbf]
+s_cmp_lt_u32 s1, -4.0
+// CHECK: [0x01,0xf7,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x0a,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_lt_u32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x0a,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_lt_u32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x0a,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_lt_u32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x0a,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_le_u32 s0, s0
-// CHECK: [0x00,0x00,0x0b,0xbf]
+s_cmp_le_u32 s1, s2
+// CHECK: [0x01,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 s103, s0
-// CHECK: [0x67,0x00,0x0b,0xbf]
+s_cmp_le_u32 s103, s2
+// CHECK: [0x67,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x0b,0xbf]
+s_cmp_le_u32 flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x0b,0xbf]
+s_cmp_le_u32 flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x0b,0xbf]
+s_cmp_le_u32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x0b,0xbf]
+s_cmp_le_u32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x0b,0xbf]
+s_cmp_le_u32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x0b,0xbf]
+s_cmp_le_u32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x0b,0xbf]
+s_cmp_le_u32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x0b,0xbf]
+s_cmp_le_u32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x0b,0xbf]
+s_cmp_le_u32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 m0, s0
-// CHECK: [0x7c,0x00,0x0b,0xbf]
+s_cmp_le_u32 m0, s2
+// CHECK: [0x7c,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x0b,0xbf]
+s_cmp_le_u32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x0b,0xbf]
+s_cmp_le_u32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 0, s0
-// CHECK: [0x80,0x00,0x0b,0xbf]
+s_cmp_le_u32 0, s2
+// CHECK: [0x80,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 -1, s0
-// CHECK: [0xc1,0x00,0x0b,0xbf]
+s_cmp_le_u32 -1, s2
+// CHECK: [0xc1,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 0.5, s0
-// CHECK: [0xf0,0x00,0x0b,0xbf]
+s_cmp_le_u32 0.5, s2
+// CHECK: [0xf0,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 -4.0, s0
-// CHECK: [0xf7,0x00,0x0b,0xbf]
+s_cmp_le_u32 -4.0, s2
+// CHECK: [0xf7,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x0b,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_le_u32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x0b,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_le_u32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x0b,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_le_u32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x0b,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_le_u32 s0, s103
-// CHECK: [0x00,0x67,0x0b,0xbf]
+s_cmp_le_u32 s1, s103
+// CHECK: [0x01,0x67,0x0b,0xbf]
 
-s_cmp_le_u32 s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x0b,0xbf]
+s_cmp_le_u32 s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x0b,0xbf]
 
-s_cmp_le_u32 s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x0b,0xbf]
+s_cmp_le_u32 s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x0b,0xbf]
 
-s_cmp_le_u32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x0b,0xbf]
+s_cmp_le_u32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x0b,0xbf]
 
-s_cmp_le_u32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x0b,0xbf]
+s_cmp_le_u32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x0b,0xbf]
 
-s_cmp_le_u32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x0b,0xbf]
+s_cmp_le_u32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x0b,0xbf]
 
-s_cmp_le_u32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x0b,0xbf]
+s_cmp_le_u32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x0b,0xbf]
 
-s_cmp_le_u32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x0b,0xbf]
+s_cmp_le_u32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x0b,0xbf]
 
-s_cmp_le_u32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x0b,0xbf]
+s_cmp_le_u32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x0b,0xbf]
 
-s_cmp_le_u32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x0b,0xbf]
+s_cmp_le_u32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x0b,0xbf]
 
-s_cmp_le_u32 s0, m0
-// CHECK: [0x00,0x7c,0x0b,0xbf]
+s_cmp_le_u32 s1, m0
+// CHECK: [0x01,0x7c,0x0b,0xbf]
 
-s_cmp_le_u32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x0b,0xbf]
+s_cmp_le_u32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x0b,0xbf]
 
-s_cmp_le_u32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x0b,0xbf]
+s_cmp_le_u32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x0b,0xbf]
 
-s_cmp_le_u32 s0, 0
-// CHECK: [0x00,0x80,0x0b,0xbf]
+s_cmp_le_u32 s1, 0
+// CHECK: [0x01,0x80,0x0b,0xbf]
 
-s_cmp_le_u32 s0, -1
-// CHECK: [0x00,0xc1,0x0b,0xbf]
+s_cmp_le_u32 s1, -1
+// CHECK: [0x01,0xc1,0x0b,0xbf]
 
-s_cmp_le_u32 s0, 0.5
-// CHECK: [0x00,0xf0,0x0b,0xbf]
+s_cmp_le_u32 s1, 0.5
+// CHECK: [0x01,0xf0,0x0b,0xbf]
 
-s_cmp_le_u32 s0, -4.0
-// CHECK: [0x00,0xf7,0x0b,0xbf]
+s_cmp_le_u32 s1, -4.0
+// CHECK: [0x01,0xf7,0x0b,0xbf]
 
-s_cmp_le_u32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x0b,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_le_u32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x0b,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_le_u32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x0b,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_le_u32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x0b,0xbf,0x73,0x72,0x71,0x3f]
 
-s_bitcmp0_b32 s0, s0
-// CHECK: [0x00,0x00,0x0c,0xbf]
+s_bitcmp0_b32 s1, s2
+// CHECK: [0x01,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 s103, s0
-// CHECK: [0x67,0x00,0x0c,0xbf]
+s_bitcmp0_b32 s103, s2
+// CHECK: [0x67,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x0c,0xbf]
+s_bitcmp0_b32 flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x0c,0xbf]
+s_bitcmp0_b32 flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x0c,0xbf]
+s_bitcmp0_b32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x0c,0xbf]
+s_bitcmp0_b32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x0c,0xbf]
+s_bitcmp0_b32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x0c,0xbf]
+s_bitcmp0_b32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x0c,0xbf]
+s_bitcmp0_b32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x0c,0xbf]
+s_bitcmp0_b32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x0c,0xbf]
+s_bitcmp0_b32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 m0, s0
-// CHECK: [0x7c,0x00,0x0c,0xbf]
+s_bitcmp0_b32 m0, s2
+// CHECK: [0x7c,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x0c,0xbf]
+s_bitcmp0_b32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x0c,0xbf]
+s_bitcmp0_b32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 0, s0
-// CHECK: [0x80,0x00,0x0c,0xbf]
+s_bitcmp0_b32 0, s2
+// CHECK: [0x80,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 -1, s0
-// CHECK: [0xc1,0x00,0x0c,0xbf]
+s_bitcmp0_b32 -1, s2
+// CHECK: [0xc1,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 0.5, s0
-// CHECK: [0xf0,0x00,0x0c,0xbf]
+s_bitcmp0_b32 0.5, s2
+// CHECK: [0xf0,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 -4.0, s0
-// CHECK: [0xf7,0x00,0x0c,0xbf]
+s_bitcmp0_b32 -4.0, s2
+// CHECK: [0xf7,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x0c,0xbf,0x56,0x34,0x12,0xaf]
+s_bitcmp0_b32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x0c,0xbf,0x56,0x34,0x12,0xaf]
 
-s_bitcmp0_b32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x0c,0xbf,0x73,0x72,0x71,0x3f]
+s_bitcmp0_b32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x0c,0xbf,0x73,0x72,0x71,0x3f]
 
-s_bitcmp0_b32 s0, s103
-// CHECK: [0x00,0x67,0x0c,0xbf]
+s_bitcmp0_b32 s1, s103
+// CHECK: [0x01,0x67,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x0c,0xbf]
+s_bitcmp0_b32 s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x0c,0xbf]
+s_bitcmp0_b32 s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x0c,0xbf]
+s_bitcmp0_b32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x0c,0xbf]
+s_bitcmp0_b32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x0c,0xbf]
+s_bitcmp0_b32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x0c,0xbf]
+s_bitcmp0_b32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x0c,0xbf]
+s_bitcmp0_b32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x0c,0xbf]
+s_bitcmp0_b32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x0c,0xbf]
+s_bitcmp0_b32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, m0
-// CHECK: [0x00,0x7c,0x0c,0xbf]
+s_bitcmp0_b32 s1, m0
+// CHECK: [0x01,0x7c,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x0c,0xbf]
+s_bitcmp0_b32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x0c,0xbf]
+s_bitcmp0_b32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, 0
-// CHECK: [0x00,0x80,0x0c,0xbf]
+s_bitcmp0_b32 s1, 0
+// CHECK: [0x01,0x80,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, -1
-// CHECK: [0x00,0xc1,0x0c,0xbf]
+s_bitcmp0_b32 s1, -1
+// CHECK: [0x01,0xc1,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, 0.5
-// CHECK: [0x00,0xf0,0x0c,0xbf]
+s_bitcmp0_b32 s1, 0.5
+// CHECK: [0x01,0xf0,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, -4.0
-// CHECK: [0x00,0xf7,0x0c,0xbf]
+s_bitcmp0_b32 s1, -4.0
+// CHECK: [0x01,0xf7,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x0c,0xbf,0x56,0x34,0x12,0xaf]
+s_bitcmp0_b32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x0c,0xbf,0x56,0x34,0x12,0xaf]
 
-s_bitcmp0_b32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x0c,0xbf,0x73,0x72,0x71,0x3f]
+s_bitcmp0_b32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x0c,0xbf,0x73,0x72,0x71,0x3f]
 
-s_bitcmp1_b32 s0, s0
-// CHECK: [0x00,0x00,0x0d,0xbf]
+s_bitcmp1_b32 s1, s2
+// CHECK: [0x01,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 s103, s0
-// CHECK: [0x67,0x00,0x0d,0xbf]
+s_bitcmp1_b32 s103, s2
+// CHECK: [0x67,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x0d,0xbf]
+s_bitcmp1_b32 flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x0d,0xbf]
+s_bitcmp1_b32 flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x0d,0xbf]
+s_bitcmp1_b32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x0d,0xbf]
+s_bitcmp1_b32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x0d,0xbf]
+s_bitcmp1_b32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x0d,0xbf]
+s_bitcmp1_b32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x0d,0xbf]
+s_bitcmp1_b32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x0d,0xbf]
+s_bitcmp1_b32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x0d,0xbf]
+s_bitcmp1_b32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 m0, s0
-// CHECK: [0x7c,0x00,0x0d,0xbf]
+s_bitcmp1_b32 m0, s2
+// CHECK: [0x7c,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x0d,0xbf]
+s_bitcmp1_b32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x0d,0xbf]
+s_bitcmp1_b32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 0, s0
-// CHECK: [0x80,0x00,0x0d,0xbf]
+s_bitcmp1_b32 0, s2
+// CHECK: [0x80,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 -1, s0
-// CHECK: [0xc1,0x00,0x0d,0xbf]
+s_bitcmp1_b32 -1, s2
+// CHECK: [0xc1,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 0.5, s0
-// CHECK: [0xf0,0x00,0x0d,0xbf]
+s_bitcmp1_b32 0.5, s2
+// CHECK: [0xf0,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 -4.0, s0
-// CHECK: [0xf7,0x00,0x0d,0xbf]
+s_bitcmp1_b32 -4.0, s2
+// CHECK: [0xf7,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x0d,0xbf,0x56,0x34,0x12,0xaf]
+s_bitcmp1_b32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x0d,0xbf,0x56,0x34,0x12,0xaf]
 
-s_bitcmp1_b32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x0d,0xbf,0x73,0x72,0x71,0x3f]
+s_bitcmp1_b32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x0d,0xbf,0x73,0x72,0x71,0x3f]
 
-s_bitcmp1_b32 s0, s103
-// CHECK: [0x00,0x67,0x0d,0xbf]
+s_bitcmp1_b32 s1, s103
+// CHECK: [0x01,0x67,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x0d,0xbf]
+s_bitcmp1_b32 s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x0d,0xbf]
+s_bitcmp1_b32 s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x0d,0xbf]
+s_bitcmp1_b32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x0d,0xbf]
+s_bitcmp1_b32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x0d,0xbf]
+s_bitcmp1_b32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x0d,0xbf]
+s_bitcmp1_b32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x0d,0xbf]
+s_bitcmp1_b32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x0d,0xbf]
+s_bitcmp1_b32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x0d,0xbf]
+s_bitcmp1_b32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, m0
-// CHECK: [0x00,0x7c,0x0d,0xbf]
+s_bitcmp1_b32 s1, m0
+// CHECK: [0x01,0x7c,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x0d,0xbf]
+s_bitcmp1_b32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x0d,0xbf]
+s_bitcmp1_b32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, 0
-// CHECK: [0x00,0x80,0x0d,0xbf]
+s_bitcmp1_b32 s1, 0
+// CHECK: [0x01,0x80,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, -1
-// CHECK: [0x00,0xc1,0x0d,0xbf]
+s_bitcmp1_b32 s1, -1
+// CHECK: [0x01,0xc1,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, 0.5
-// CHECK: [0x00,0xf0,0x0d,0xbf]
+s_bitcmp1_b32 s1, 0.5
+// CHECK: [0x01,0xf0,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, -4.0
-// CHECK: [0x00,0xf7,0x0d,0xbf]
+s_bitcmp1_b32 s1, -4.0
+// CHECK: [0x01,0xf7,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x0d,0xbf,0x56,0x34,0x12,0xaf]
+s_bitcmp1_b32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x0d,0xbf,0x56,0x34,0x12,0xaf]
 
-s_bitcmp1_b32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x0d,0xbf,0x73,0x72,0x71,0x3f]
+s_bitcmp1_b32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x0d,0xbf,0x73,0x72,0x71,0x3f]
 
-s_bitcmp0_b64 s[0:1], s0
-// CHECK: [0x00,0x00,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], s2
+// CHECK: [0x02,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 s[2:3], s0
-// CHECK: [0x02,0x00,0x0e,0xbf]
+s_bitcmp0_b64 s[4:5], s2
+// CHECK: [0x04,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 s[102:103], s0
-// CHECK: [0x66,0x00,0x0e,0xbf]
+s_bitcmp0_b64 s[102:103], s2
+// CHECK: [0x66,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 flat_scratch, s0
-// CHECK: [0x68,0x00,0x0e,0xbf]
+s_bitcmp0_b64 flat_scratch, s2
+// CHECK: [0x68,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 vcc, s0
-// CHECK: [0x6a,0x00,0x0e,0xbf]
+s_bitcmp0_b64 vcc, s2
+// CHECK: [0x6a,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 tba, s0
-// CHECK: [0x6c,0x00,0x0e,0xbf]
+s_bitcmp0_b64 tba, s2
+// CHECK: [0x6c,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 tma, s0
-// CHECK: [0x6e,0x00,0x0e,0xbf]
+s_bitcmp0_b64 tma, s2
+// CHECK: [0x6e,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 ttmp[10:11], s0
-// CHECK: [0x7a,0x00,0x0e,0xbf]
+s_bitcmp0_b64 ttmp[10:11], s2
+// CHECK: [0x7a,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 exec, s0
-// CHECK: [0x7e,0x00,0x0e,0xbf]
+s_bitcmp0_b64 exec, s2
+// CHECK: [0x7e,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 0, s0
-// CHECK: [0x80,0x00,0x0e,0xbf]
+s_bitcmp0_b64 0, s2
+// CHECK: [0x80,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 -1, s0
-// CHECK: [0xc1,0x00,0x0e,0xbf]
+s_bitcmp0_b64 -1, s2
+// CHECK: [0xc1,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 0.5, s0
-// CHECK: [0xf0,0x00,0x0e,0xbf]
+s_bitcmp0_b64 0.5, s2
+// CHECK: [0xf0,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 -4.0, s0
-// CHECK: [0xf7,0x00,0x0e,0xbf]
+s_bitcmp0_b64 -4.0, s2
+// CHECK: [0xf7,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 0xaf123456, s0
-// CHECK: [0xff,0x00,0x0e,0xbf,0x56,0x34,0x12,0xaf]
+s_bitcmp0_b64 0xaf123456, s2
+// CHECK: [0xff,0x02,0x0e,0xbf,0x56,0x34,0x12,0xaf]
 
-s_bitcmp0_b64 0x3f717273, s0
-// CHECK: [0xff,0x00,0x0e,0xbf,0x73,0x72,0x71,0x3f]
+s_bitcmp0_b64 0x3f717273, s2
+// CHECK: [0xff,0x02,0x0e,0xbf,0x73,0x72,0x71,0x3f]
 
-s_bitcmp0_b64 s[0:1], s103
-// CHECK: [0x00,0x67,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], s103
+// CHECK: [0x02,0x67,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], flat_scratch_lo
-// CHECK: [0x00,0x68,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], flat_scratch_lo
+// CHECK: [0x02,0x68,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], flat_scratch_hi
-// CHECK: [0x00,0x69,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], flat_scratch_hi
+// CHECK: [0x02,0x69,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], vcc_lo
-// CHECK: [0x00,0x6a,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], vcc_lo
+// CHECK: [0x02,0x6a,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], vcc_hi
-// CHECK: [0x00,0x6b,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], vcc_hi
+// CHECK: [0x02,0x6b,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], tba_lo
-// CHECK: [0x00,0x6c,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], tba_lo
+// CHECK: [0x02,0x6c,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], tba_hi
-// CHECK: [0x00,0x6d,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], tba_hi
+// CHECK: [0x02,0x6d,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], tma_lo
-// CHECK: [0x00,0x6e,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], tma_lo
+// CHECK: [0x02,0x6e,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], tma_hi
-// CHECK: [0x00,0x6f,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], tma_hi
+// CHECK: [0x02,0x6f,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], ttmp11
-// CHECK: [0x00,0x7b,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], ttmp11
+// CHECK: [0x02,0x7b,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], m0
-// CHECK: [0x00,0x7c,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], m0
+// CHECK: [0x02,0x7c,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], exec_lo
-// CHECK: [0x00,0x7e,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], exec_lo
+// CHECK: [0x02,0x7e,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], exec_hi
-// CHECK: [0x00,0x7f,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], exec_hi
+// CHECK: [0x02,0x7f,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], 0
-// CHECK: [0x00,0x80,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], 0
+// CHECK: [0x02,0x80,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], -1
-// CHECK: [0x00,0xc1,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], -1
+// CHECK: [0x02,0xc1,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x0e,0xbf,0x56,0x34,0x12,0xaf]
+s_bitcmp0_b64 s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x0e,0xbf,0x56,0x34,0x12,0xaf]
 
-s_bitcmp0_b64 s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x0e,0xbf,0x73,0x72,0x71,0x3f]
+s_bitcmp0_b64 s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x0e,0xbf,0x73,0x72,0x71,0x3f]
 
-s_bitcmp1_b64 s[0:1], s0
-// CHECK: [0x00,0x00,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], s2
+// CHECK: [0x02,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 s[2:3], s0
-// CHECK: [0x02,0x00,0x0f,0xbf]
+s_bitcmp1_b64 s[4:5], s2
+// CHECK: [0x04,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 s[102:103], s0
-// CHECK: [0x66,0x00,0x0f,0xbf]
+s_bitcmp1_b64 s[102:103], s2
+// CHECK: [0x66,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 flat_scratch, s0
-// CHECK: [0x68,0x00,0x0f,0xbf]
+s_bitcmp1_b64 flat_scratch, s2
+// CHECK: [0x68,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 vcc, s0
-// CHECK: [0x6a,0x00,0x0f,0xbf]
+s_bitcmp1_b64 vcc, s2
+// CHECK: [0x6a,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 tba, s0
-// CHECK: [0x6c,0x00,0x0f,0xbf]
+s_bitcmp1_b64 tba, s2
+// CHECK: [0x6c,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 tma, s0
-// CHECK: [0x6e,0x00,0x0f,0xbf]
+s_bitcmp1_b64 tma, s2
+// CHECK: [0x6e,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 ttmp[10:11], s0
-// CHECK: [0x7a,0x00,0x0f,0xbf]
+s_bitcmp1_b64 ttmp[10:11], s2
+// CHECK: [0x7a,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 exec, s0
-// CHECK: [0x7e,0x00,0x0f,0xbf]
+s_bitcmp1_b64 exec, s2
+// CHECK: [0x7e,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 0, s0
-// CHECK: [0x80,0x00,0x0f,0xbf]
+s_bitcmp1_b64 0, s2
+// CHECK: [0x80,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 -1, s0
-// CHECK: [0xc1,0x00,0x0f,0xbf]
+s_bitcmp1_b64 -1, s2
+// CHECK: [0xc1,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 0.5, s0
-// CHECK: [0xf0,0x00,0x0f,0xbf]
+s_bitcmp1_b64 0.5, s2
+// CHECK: [0xf0,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 -4.0, s0
-// CHECK: [0xf7,0x00,0x0f,0xbf]
+s_bitcmp1_b64 -4.0, s2
+// CHECK: [0xf7,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 0xaf123456, s0
-// CHECK: [0xff,0x00,0x0f,0xbf,0x56,0x34,0x12,0xaf]
+s_bitcmp1_b64 0xaf123456, s2
+// CHECK: [0xff,0x02,0x0f,0xbf,0x56,0x34,0x12,0xaf]
 
-s_bitcmp1_b64 0x3f717273, s0
-// CHECK: [0xff,0x00,0x0f,0xbf,0x73,0x72,0x71,0x3f]
+s_bitcmp1_b64 0x3f717273, s2
+// CHECK: [0xff,0x02,0x0f,0xbf,0x73,0x72,0x71,0x3f]
 
-s_bitcmp1_b64 s[0:1], s103
-// CHECK: [0x00,0x67,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], s103
+// CHECK: [0x02,0x67,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], flat_scratch_lo
-// CHECK: [0x00,0x68,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], flat_scratch_lo
+// CHECK: [0x02,0x68,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], flat_scratch_hi
-// CHECK: [0x00,0x69,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], flat_scratch_hi
+// CHECK: [0x02,0x69,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], vcc_lo
-// CHECK: [0x00,0x6a,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], vcc_lo
+// CHECK: [0x02,0x6a,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], vcc_hi
-// CHECK: [0x00,0x6b,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], vcc_hi
+// CHECK: [0x02,0x6b,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], tba_lo
-// CHECK: [0x00,0x6c,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], tba_lo
+// CHECK: [0x02,0x6c,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], tba_hi
-// CHECK: [0x00,0x6d,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], tba_hi
+// CHECK: [0x02,0x6d,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], tma_lo
-// CHECK: [0x00,0x6e,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], tma_lo
+// CHECK: [0x02,0x6e,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], tma_hi
-// CHECK: [0x00,0x6f,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], tma_hi
+// CHECK: [0x02,0x6f,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], ttmp11
-// CHECK: [0x00,0x7b,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], ttmp11
+// CHECK: [0x02,0x7b,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], m0
-// CHECK: [0x00,0x7c,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], m0
+// CHECK: [0x02,0x7c,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], exec_lo
-// CHECK: [0x00,0x7e,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], exec_lo
+// CHECK: [0x02,0x7e,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], exec_hi
-// CHECK: [0x00,0x7f,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], exec_hi
+// CHECK: [0x02,0x7f,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], 0
-// CHECK: [0x00,0x80,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], 0
+// CHECK: [0x02,0x80,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], -1
-// CHECK: [0x00,0xc1,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], -1
+// CHECK: [0x02,0xc1,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x0f,0xbf,0x56,0x34,0x12,0xaf]
+s_bitcmp1_b64 s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x0f,0xbf,0x56,0x34,0x12,0xaf]
 
-s_bitcmp1_b64 s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x0f,0xbf,0x73,0x72,0x71,0x3f]
+s_bitcmp1_b64 s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x0f,0xbf,0x73,0x72,0x71,0x3f]
 
-s_setvskip s0, s0
-// CHECK: [0x00,0x00,0x10,0xbf]
+s_setvskip s1, s2
+// CHECK: [0x01,0x02,0x10,0xbf]
 
-s_setvskip s103, s0
-// CHECK: [0x67,0x00,0x10,0xbf]
+s_setvskip s103, s2
+// CHECK: [0x67,0x02,0x10,0xbf]
 
-s_setvskip flat_scratch_lo, s0
-// CHECK: [0x68,0x00,0x10,0xbf]
+s_setvskip flat_scratch_lo, s2
+// CHECK: [0x68,0x02,0x10,0xbf]
 
-s_setvskip flat_scratch_hi, s0
-// CHECK: [0x69,0x00,0x10,0xbf]
+s_setvskip flat_scratch_hi, s2
+// CHECK: [0x69,0x02,0x10,0xbf]
 
-s_setvskip vcc_lo, s0
-// CHECK: [0x6a,0x00,0x10,0xbf]
+s_setvskip vcc_lo, s2
+// CHECK: [0x6a,0x02,0x10,0xbf]
 
-s_setvskip vcc_hi, s0
-// CHECK: [0x6b,0x00,0x10,0xbf]
+s_setvskip vcc_hi, s2
+// CHECK: [0x6b,0x02,0x10,0xbf]
 
-s_setvskip tba_lo, s0
-// CHECK: [0x6c,0x00,0x10,0xbf]
+s_setvskip tba_lo, s2
+// CHECK: [0x6c,0x02,0x10,0xbf]
 
-s_setvskip tba_hi, s0
-// CHECK: [0x6d,0x00,0x10,0xbf]
+s_setvskip tba_hi, s2
+// CHECK: [0x6d,0x02,0x10,0xbf]
 
-s_setvskip tma_lo, s0
-// CHECK: [0x6e,0x00,0x10,0xbf]
+s_setvskip tma_lo, s2
+// CHECK: [0x6e,0x02,0x10,0xbf]
 
-s_setvskip tma_hi, s0
-// CHECK: [0x6f,0x00,0x10,0xbf]
+s_setvskip tma_hi, s2
+// CHECK: [0x6f,0x02,0x10,0xbf]
 
-s_setvskip ttmp11, s0
-// CHECK: [0x7b,0x00,0x10,0xbf]
+s_setvskip ttmp11, s2
+// CHECK: [0x7b,0x02,0x10,0xbf]
 
-s_setvskip m0, s0
-// CHECK: [0x7c,0x00,0x10,0xbf]
+s_setvskip m0, s2
+// CHECK: [0x7c,0x02,0x10,0xbf]
 
-s_setvskip exec_lo, s0
-// CHECK: [0x7e,0x00,0x10,0xbf]
+s_setvskip exec_lo, s2
+// CHECK: [0x7e,0x02,0x10,0xbf]
 
-s_setvskip exec_hi, s0
-// CHECK: [0x7f,0x00,0x10,0xbf]
+s_setvskip exec_hi, s2
+// CHECK: [0x7f,0x02,0x10,0xbf]
 
-s_setvskip 0, s0
-// CHECK: [0x80,0x00,0x10,0xbf]
+s_setvskip 0, s2
+// CHECK: [0x80,0x02,0x10,0xbf]
 
-s_setvskip -1, s0
-// CHECK: [0xc1,0x00,0x10,0xbf]
+s_setvskip -1, s2
+// CHECK: [0xc1,0x02,0x10,0xbf]
 
-s_setvskip 0.5, s0
-// CHECK: [0xf0,0x00,0x10,0xbf]
+s_setvskip 0.5, s2
+// CHECK: [0xf0,0x02,0x10,0xbf]
 
-s_setvskip -4.0, s0
-// CHECK: [0xf7,0x00,0x10,0xbf]
+s_setvskip -4.0, s2
+// CHECK: [0xf7,0x02,0x10,0xbf]
 
-s_setvskip 0xaf123456, s0
-// CHECK: [0xff,0x00,0x10,0xbf,0x56,0x34,0x12,0xaf]
+s_setvskip 0xaf123456, s2
+// CHECK: [0xff,0x02,0x10,0xbf,0x56,0x34,0x12,0xaf]
 
-s_setvskip 0x3f717273, s0
-// CHECK: [0xff,0x00,0x10,0xbf,0x73,0x72,0x71,0x3f]
+s_setvskip 0x3f717273, s2
+// CHECK: [0xff,0x02,0x10,0xbf,0x73,0x72,0x71,0x3f]
 
-s_setvskip s0, s103
-// CHECK: [0x00,0x67,0x10,0xbf]
+s_setvskip s1, s103
+// CHECK: [0x01,0x67,0x10,0xbf]
 
-s_setvskip s0, flat_scratch_lo
-// CHECK: [0x00,0x68,0x10,0xbf]
+s_setvskip s1, flat_scratch_lo
+// CHECK: [0x01,0x68,0x10,0xbf]
 
-s_setvskip s0, flat_scratch_hi
-// CHECK: [0x00,0x69,0x10,0xbf]
+s_setvskip s1, flat_scratch_hi
+// CHECK: [0x01,0x69,0x10,0xbf]
 
-s_setvskip s0, vcc_lo
-// CHECK: [0x00,0x6a,0x10,0xbf]
+s_setvskip s1, vcc_lo
+// CHECK: [0x01,0x6a,0x10,0xbf]
 
-s_setvskip s0, vcc_hi
-// CHECK: [0x00,0x6b,0x10,0xbf]
+s_setvskip s1, vcc_hi
+// CHECK: [0x01,0x6b,0x10,0xbf]
 
-s_setvskip s0, tba_lo
-// CHECK: [0x00,0x6c,0x10,0xbf]
+s_setvskip s1, tba_lo
+// CHECK: [0x01,0x6c,0x10,0xbf]
 
-s_setvskip s0, tba_hi
-// CHECK: [0x00,0x6d,0x10,0xbf]
+s_setvskip s1, tba_hi
+// CHECK: [0x01,0x6d,0x10,0xbf]
 
-s_setvskip s0, tma_lo
-// CHECK: [0x00,0x6e,0x10,0xbf]
+s_setvskip s1, tma_lo
+// CHECK: [0x01,0x6e,0x10,0xbf]
 
-s_setvskip s0, tma_hi
-// CHECK: [0x00,0x6f,0x10,0xbf]
+s_setvskip s1, tma_hi
+// CHECK: [0x01,0x6f,0x10,0xbf]
 
-s_setvskip s0, ttmp11
-// CHECK: [0x00,0x7b,0x10,0xbf]
+s_setvskip s1, ttmp11
+// CHECK: [0x01,0x7b,0x10,0xbf]
 
-s_setvskip s0, m0
-// CHECK: [0x00,0x7c,0x10,0xbf]
+s_setvskip s1, m0
+// CHECK: [0x01,0x7c,0x10,0xbf]
 
-s_setvskip s0, exec_lo
-// CHECK: [0x00,0x7e,0x10,0xbf]
+s_setvskip s1, exec_lo
+// CHECK: [0x01,0x7e,0x10,0xbf]
 
-s_setvskip s0, exec_hi
-// CHECK: [0x00,0x7f,0x10,0xbf]
+s_setvskip s1, exec_hi
+// CHECK: [0x01,0x7f,0x10,0xbf]
 
-s_setvskip s0, 0
-// CHECK: [0x00,0x80,0x10,0xbf]
+s_setvskip s1, 0
+// CHECK: [0x01,0x80,0x10,0xbf]
 
-s_setvskip s0, -1
-// CHECK: [0x00,0xc1,0x10,0xbf]
+s_setvskip s1, -1
+// CHECK: [0x01,0xc1,0x10,0xbf]
 
-s_setvskip s0, 0.5
-// CHECK: [0x00,0xf0,0x10,0xbf]
+s_setvskip s1, 0.5
+// CHECK: [0x01,0xf0,0x10,0xbf]
 
-s_setvskip s0, -4.0
-// CHECK: [0x00,0xf7,0x10,0xbf]
+s_setvskip s1, -4.0
+// CHECK: [0x01,0xf7,0x10,0xbf]
 
-s_setvskip s0, 0xaf123456
-// CHECK: [0x00,0xff,0x10,0xbf,0x56,0x34,0x12,0xaf]
+s_setvskip s1, 0xaf123456
+// CHECK: [0x01,0xff,0x10,0xbf,0x56,0x34,0x12,0xaf]
 
-s_setvskip s0, 0x3f717273
-// CHECK: [0x00,0xff,0x10,0xbf,0x73,0x72,0x71,0x3f]
+s_setvskip s1, 0x3f717273
+// CHECK: [0x01,0xff,0x10,0xbf,0x73,0x72,0x71,0x3f]
 
-s_movk_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x00,0xb0]
+s_movk_i32 s5, 0x3141
+// CHECK: [0x41,0x31,0x05,0xb0]
 
 s_movk_i32 s103, 0x3141
 // CHECK: [0x41,0x31,0x67,0xb0]
@@ -21900,11 +21914,11 @@ s_movk_i32 exec_lo, 0x3141
 s_movk_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0x7f,0xb0]
 
-s_movk_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x00,0xb0]
+s_movk_i32 s5, 0xc1d1
+// CHECK: [0xd1,0xc1,0x05,0xb0]
 
-s_cmovk_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x00,0xb1]
+s_cmovk_i32 s5, 0x3141
+// CHECK: [0x41,0x31,0x05,0xb1]
 
 s_cmovk_i32 s103, 0x3141
 // CHECK: [0x41,0x31,0x67,0xb1]
@@ -21945,11 +21959,11 @@ s_cmovk_i32 exec_lo, 0x3141
 s_cmovk_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0x7f,0xb1]
 
-s_cmovk_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x00,0xb1]
+s_cmovk_i32 s5, 0xc1d1
+// CHECK: [0xd1,0xc1,0x05,0xb1]
 
-s_cmpk_eq_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x80,0xb1]
+s_cmpk_eq_i32 s1, 0x3141
+// CHECK: [0x41,0x31,0x81,0xb1]
 
 s_cmpk_eq_i32 s103, 0x3141
 // CHECK: [0x41,0x31,0xe7,0xb1]
@@ -21990,11 +22004,11 @@ s_cmpk_eq_i32 exec_lo, 0x3141
 s_cmpk_eq_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0xff,0xb1]
 
-s_cmpk_eq_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x80,0xb1]
+s_cmpk_eq_i32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x81,0xb1]
 
-s_cmpk_lg_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x00,0xb2]
+s_cmpk_lg_i32 s1, 0x3141
+// CHECK: [0x41,0x31,0x01,0xb2]
 
 s_cmpk_lg_i32 s103, 0x3141
 // CHECK: [0x41,0x31,0x67,0xb2]
@@ -22035,11 +22049,11 @@ s_cmpk_lg_i32 exec_lo, 0x3141
 s_cmpk_lg_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0x7f,0xb2]
 
-s_cmpk_lg_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x00,0xb2]
+s_cmpk_lg_i32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x01,0xb2]
 
-s_cmpk_gt_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x80,0xb2]
+s_cmpk_gt_i32 s1, 0x3141
+// CHECK: [0x41,0x31,0x81,0xb2]
 
 s_cmpk_gt_i32 s103, 0x3141
 // CHECK: [0x41,0x31,0xe7,0xb2]
@@ -22080,11 +22094,11 @@ s_cmpk_gt_i32 exec_lo, 0x3141
 s_cmpk_gt_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0xff,0xb2]
 
-s_cmpk_gt_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x80,0xb2]
+s_cmpk_gt_i32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x81,0xb2]
 
-s_cmpk_ge_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x00,0xb3]
+s_cmpk_ge_i32 s1, 0x3141
+// CHECK: [0x41,0x31,0x01,0xb3]
 
 s_cmpk_ge_i32 s103, 0x3141
 // CHECK: [0x41,0x31,0x67,0xb3]
@@ -22125,11 +22139,11 @@ s_cmpk_ge_i32 exec_lo, 0x3141
 s_cmpk_ge_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0x7f,0xb3]
 
-s_cmpk_ge_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x00,0xb3]
+s_cmpk_ge_i32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x01,0xb3]
 
-s_cmpk_lt_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x80,0xb3]
+s_cmpk_lt_i32 s1, 0x3141
+// CHECK: [0x41,0x31,0x81,0xb3]
 
 s_cmpk_lt_i32 s103, 0x3141
 // CHECK: [0x41,0x31,0xe7,0xb3]
@@ -22170,11 +22184,11 @@ s_cmpk_lt_i32 exec_lo, 0x3141
 s_cmpk_lt_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0xff,0xb3]
 
-s_cmpk_lt_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x80,0xb3]
+s_cmpk_lt_i32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x81,0xb3]
 
-s_cmpk_le_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x00,0xb4]
+s_cmpk_le_i32 s1, 0x3141
+// CHECK: [0x41,0x31,0x01,0xb4]
 
 s_cmpk_le_i32 s103, 0x3141
 // CHECK: [0x41,0x31,0x67,0xb4]
@@ -22215,11 +22229,11 @@ s_cmpk_le_i32 exec_lo, 0x3141
 s_cmpk_le_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0x7f,0xb4]
 
-s_cmpk_le_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x00,0xb4]
+s_cmpk_le_i32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x01,0xb4]
 
-s_cmpk_eq_u32 s0, 0x3141
-// CHECK: [0x41,0x31,0x80,0xb4]
+s_cmpk_eq_u32 s1, 0x3141
+// CHECK: [0x41,0x31,0x81,0xb4]
 
 s_cmpk_eq_u32 s103, 0x3141
 // CHECK: [0x41,0x31,0xe7,0xb4]
@@ -22260,11 +22274,11 @@ s_cmpk_eq_u32 exec_lo, 0x3141
 s_cmpk_eq_u32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0xff,0xb4]
 
-s_cmpk_eq_u32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x80,0xb4]
+s_cmpk_eq_u32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x81,0xb4]
 
-s_cmpk_lg_u32 s0, 0x3141
-// CHECK: [0x41,0x31,0x00,0xb5]
+s_cmpk_lg_u32 s1, 0x3141
+// CHECK: [0x41,0x31,0x01,0xb5]
 
 s_cmpk_lg_u32 s103, 0x3141
 // CHECK: [0x41,0x31,0x67,0xb5]
@@ -22305,11 +22319,11 @@ s_cmpk_lg_u32 exec_lo, 0x3141
 s_cmpk_lg_u32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0x7f,0xb5]
 
-s_cmpk_lg_u32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x00,0xb5]
+s_cmpk_lg_u32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x01,0xb5]
 
-s_cmpk_gt_u32 s0, 0x3141
-// CHECK: [0x41,0x31,0x80,0xb5]
+s_cmpk_gt_u32 s1, 0x3141
+// CHECK: [0x41,0x31,0x81,0xb5]
 
 s_cmpk_gt_u32 s103, 0x3141
 // CHECK: [0x41,0x31,0xe7,0xb5]
@@ -22350,11 +22364,11 @@ s_cmpk_gt_u32 exec_lo, 0x3141
 s_cmpk_gt_u32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0xff,0xb5]
 
-s_cmpk_gt_u32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x80,0xb5]
+s_cmpk_gt_u32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x81,0xb5]
 
-s_cmpk_ge_u32 s0, 0x3141
-// CHECK: [0x41,0x31,0x00,0xb6]
+s_cmpk_ge_u32 s1, 0x3141
+// CHECK: [0x41,0x31,0x01,0xb6]
 
 s_cmpk_ge_u32 s103, 0x3141
 // CHECK: [0x41,0x31,0x67,0xb6]
@@ -22395,11 +22409,11 @@ s_cmpk_ge_u32 exec_lo, 0x3141
 s_cmpk_ge_u32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0x7f,0xb6]
 
-s_cmpk_ge_u32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x00,0xb6]
+s_cmpk_ge_u32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x01,0xb6]
 
-s_cmpk_lt_u32 s0, 0x3141
-// CHECK: [0x41,0x31,0x80,0xb6]
+s_cmpk_lt_u32 s1, 0x3141
+// CHECK: [0x41,0x31,0x81,0xb6]
 
 s_cmpk_lt_u32 s103, 0x3141
 // CHECK: [0x41,0x31,0xe7,0xb6]
@@ -22440,11 +22454,11 @@ s_cmpk_lt_u32 exec_lo, 0x3141
 s_cmpk_lt_u32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0xff,0xb6]
 
-s_cmpk_lt_u32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x80,0xb6]
+s_cmpk_lt_u32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x81,0xb6]
 
-s_cmpk_le_u32 s0, 0x3141
-// CHECK: [0x41,0x31,0x00,0xb7]
+s_cmpk_le_u32 s1, 0x3141
+// CHECK: [0x41,0x31,0x01,0xb7]
 
 s_cmpk_le_u32 s103, 0x3141
 // CHECK: [0x41,0x31,0x67,0xb7]
@@ -22485,11 +22499,11 @@ s_cmpk_le_u32 exec_lo, 0x3141
 s_cmpk_le_u32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0x7f,0xb7]
 
-s_cmpk_le_u32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x00,0xb7]
+s_cmpk_le_u32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x01,0xb7]
 
-s_addk_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x80,0xb7]
+s_addk_i32 s5, 0x3141
+// CHECK: [0x41,0x31,0x85,0xb7]
 
 s_addk_i32 s103, 0x3141
 // CHECK: [0x41,0x31,0xe7,0xb7]
@@ -22530,11 +22544,11 @@ s_addk_i32 exec_lo, 0x3141
 s_addk_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0xff,0xb7]
 
-s_addk_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x80,0xb7]
+s_addk_i32 s5, 0xc1d1
+// CHECK: [0xd1,0xc1,0x85,0xb7]
 
-s_mulk_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x00,0xb8]
+s_mulk_i32 s5, 0x3141
+// CHECK: [0x41,0x31,0x05,0xb8]
 
 s_mulk_i32 s103, 0x3141
 // CHECK: [0x41,0x31,0x67,0xb8]
@@ -22575,15 +22589,15 @@ s_mulk_i32 exec_lo, 0x3141
 s_mulk_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0x7f,0xb8]
 
-s_mulk_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x00,0xb8]
-
-s_cbranch_i_fork s[0:1], 12609
-// CHECK: [0x41,0x31,0x80,0xb8]
+s_mulk_i32 s5, 0xc1d1
+// CHECK: [0xd1,0xc1,0x05,0xb8]
 
 s_cbranch_i_fork s[2:3], 12609
 // CHECK: [0x41,0x31,0x82,0xb8]
 
+s_cbranch_i_fork s[4:5], 12609
+// CHECK: [0x41,0x31,0x84,0xb8]
+
 s_cbranch_i_fork s[102:103], 12609
 // CHECK: [0x41,0x31,0xe6,0xb8]
 
@@ -22605,11 +22619,11 @@ s_cbranch_i_fork ttmp[10:11], 12609
 s_cbranch_i_fork exec, 12609
 // CHECK: [0x41,0x31,0xfe,0xb8]
 
-s_cbranch_i_fork s[0:1], 49617
-// CHECK: [0xd1,0xc1,0x80,0xb8]
+s_cbranch_i_fork s[2:3], 49617
+// CHECK: [0xd1,0xc1,0x82,0xb8]
 
-s_getreg_b32 s0, 0x3141
-// CHECK: [0x41,0x31,0x00,0xb9]
+s_getreg_b32 s5, 0x3141
+// CHECK: [0x41,0x31,0x05,0xb9]
 
 s_getreg_b32 s103, 0x3141
 // CHECK: [0x41,0x31,0x67,0xb9]
@@ -22650,14 +22664,14 @@ s_getreg_b32 exec_lo, 0x3141
 s_getreg_b32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0x7f,0xb9]
 
-s_getreg_b32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x00,0xb9]
+s_getreg_b32 s5, 0xc1d1
+// CHECK: [0xd1,0xc1,0x05,0xb9]
 
-s_setreg_b32 0x3141, s0
-// CHECK: [0x41,0x31,0x80,0xb9]
+s_setreg_b32 0x3141, s1
+// CHECK: [0x41,0x31,0x81,0xb9]
 
-s_setreg_b32 0xc1d1, s0
-// CHECK: [0xd1,0xc1,0x80,0xb9]
+s_setreg_b32 0xc1d1, s1
+// CHECK: [0xd1,0xc1,0x81,0xb9]
 
 s_setreg_b32 0x3141, s103
 // CHECK: [0x41,0x31,0xe7,0xb9]
@@ -22821,83 +22835,89 @@ s_decperflevel 0xc1d1
 s_ttracedata
 // CHECK: [0x00,0x00,0x96,0xbf]
 
-v_interp_p1_f32 v255, v0, attr0.x
-// CHECK: [0x00,0x00,0xfc,0xcb]
+v_interp_p1_f32 v5, v1, attr0.x
+// CHECK: [0x01,0x00,0x14,0xc8]
+
+v_interp_p1_f32 v255, v1, attr0.x
+// CHECK: [0x01,0x00,0xfc,0xcb]
 
-v_interp_p1_f32 v255, v0, attr1.x
-// CHECK: [0x00,0x04,0xfc,0xcb]
+v_interp_p1_f32 v5, v255, attr0.x
+// CHECK: [0xff,0x00,0x14,0xc8]
 
-v_interp_p1_f32 v255, v0, attr31.x
-// CHECK: [0x00,0x7c,0xfc,0xcb]
+v_interp_p1_f32 v5, v1, attr1.x
+// CHECK: [0x01,0x04,0x14,0xc8]
 
-v_interp_p1_f32 v255, v0, attr32.x
-// CHECK: [0x00,0x80,0xfc,0xcb]
+v_interp_p1_f32 v5, v1, attr31.x
+// CHECK: [0x01,0x7c,0x14,0xc8]
 
-v_interp_p1_f32 v255, v0, attr0.y
-// CHECK: [0x00,0x01,0xfc,0xcb]
+v_interp_p1_f32 v5, v1, attr32.x
+// CHECK: [0x01,0x80,0x14,0xc8]
 
-v_interp_p1_f32 v255, v0, attr0.z
-// CHECK: [0x00,0x02,0xfc,0xcb]
+v_interp_p1_f32 v5, v1, attr0.y
+// CHECK: [0x01,0x01,0x14,0xc8]
 
-v_interp_p1_f32 v255, v0, attr0.w
-// CHECK: [0x00,0x03,0xfc,0xcb]
+v_interp_p1_f32 v5, v1, attr0.z
+// CHECK: [0x01,0x02,0x14,0xc8]
 
-v_interp_p2_f32 v0, v0, attr0.x
-// CHECK: [0x00,0x00,0x01,0xc8]
+v_interp_p1_f32 v5, v1, attr0.w
+// CHECK: [0x01,0x03,0x14,0xc8]
 
-v_interp_p2_f32 v255, v0, attr0.x
-// CHECK: [0x00,0x00,0xfd,0xcb]
+v_interp_p2_f32 v5, v1, attr0.x
+// CHECK: [0x01,0x00,0x15,0xc8]
 
-v_interp_p2_f32 v0, v255, attr0.x
-// CHECK: [0xff,0x00,0x01,0xc8]
+v_interp_p2_f32 v255, v1, attr0.x
+// CHECK: [0x01,0x00,0xfd,0xcb]
 
-v_interp_p2_f32 v0, v0, attr1.x
-// CHECK: [0x00,0x04,0x01,0xc8]
+v_interp_p2_f32 v5, v255, attr0.x
+// CHECK: [0xff,0x00,0x15,0xc8]
 
-v_interp_p2_f32 v0, v0, attr31.x
-// CHECK: [0x00,0x7c,0x01,0xc8]
+v_interp_p2_f32 v5, v1, attr1.x
+// CHECK: [0x01,0x04,0x15,0xc8]
 
-v_interp_p2_f32 v0, v0, attr32.x
-// CHECK: [0x00,0x80,0x01,0xc8]
+v_interp_p2_f32 v5, v1, attr31.x
+// CHECK: [0x01,0x7c,0x15,0xc8]
 
-v_interp_p2_f32 v0, v0, attr0.y
-// CHECK: [0x00,0x01,0x01,0xc8]
+v_interp_p2_f32 v5, v1, attr32.x
+// CHECK: [0x01,0x80,0x15,0xc8]
 
-v_interp_p2_f32 v0, v0, attr0.z
-// CHECK: [0x00,0x02,0x01,0xc8]
+v_interp_p2_f32 v5, v1, attr0.y
+// CHECK: [0x01,0x01,0x15,0xc8]
 
-v_interp_p2_f32 v0, v0, attr0.w
-// CHECK: [0x00,0x03,0x01,0xc8]
+v_interp_p2_f32 v5, v1, attr0.z
+// CHECK: [0x01,0x02,0x15,0xc8]
 
-v_interp_mov_f32 v0, p10, attr0.x
-// CHECK: [0x00,0x00,0x02,0xc8]
+v_interp_p2_f32 v5, v1, attr0.w
+// CHECK: [0x01,0x03,0x15,0xc8]
+
+v_interp_mov_f32 v5, p10, attr0.x
+// CHECK: [0x00,0x00,0x16,0xc8]
 
 v_interp_mov_f32 v255, p10, attr0.x
 // CHECK: [0x00,0x00,0xfe,0xcb]
 
-v_interp_mov_f32 v0, p20, attr0.x
-// CHECK: [0x01,0x00,0x02,0xc8]
+v_interp_mov_f32 v5, p20, attr0.x
+// CHECK: [0x01,0x00,0x16,0xc8]
 
-v_interp_mov_f32 v0, p0, attr0.x
-// CHECK: [0x02,0x00,0x02,0xc8]
+v_interp_mov_f32 v5, p0, attr0.x
+// CHECK: [0x02,0x00,0x16,0xc8]
 
-v_interp_mov_f32 v0, p10, attr1.x
-// CHECK: [0x00,0x04,0x02,0xc8]
+v_interp_mov_f32 v5, p10, attr1.x
+// CHECK: [0x00,0x04,0x16,0xc8]
 
-v_interp_mov_f32 v0, p10, attr31.x
-// CHECK: [0x00,0x7c,0x02,0xc8]
+v_interp_mov_f32 v5, p10, attr31.x
+// CHECK: [0x00,0x7c,0x16,0xc8]
 
-v_interp_mov_f32 v0, p10, attr32.x
-// CHECK: [0x00,0x80,0x02,0xc8]
+v_interp_mov_f32 v5, p10, attr32.x
+// CHECK: [0x00,0x80,0x16,0xc8]
 
-v_interp_mov_f32 v0, p10, attr0.y
-// CHECK: [0x00,0x01,0x02,0xc8]
+v_interp_mov_f32 v5, p10, attr0.y
+// CHECK: [0x00,0x01,0x16,0xc8]
 
-v_interp_mov_f32 v0, p10, attr0.z
-// CHECK: [0x00,0x02,0x02,0xc8]
+v_interp_mov_f32 v5, p10, attr0.z
+// CHECK: [0x00,0x02,0x16,0xc8]
 
-v_interp_mov_f32 v0, p10, attr0.w
-// CHECK: [0x00,0x03,0x02,0xc8]
+v_interp_mov_f32 v5, p10, attr0.w
+// CHECK: [0x00,0x03,0x16,0xc8]
 
 v_nop
 // CHECK: [0x00,0x00,0x00,0x7e]
@@ -22905,7484 +22925,7475 @@ v_nop
 v_nop_e64
 // CHECK: [0x00,0x00,0x00,0xd3,0x00,0x00,0x00,0x00]
 
-v_mov_b32 v0, s0
-// CHECK: [0x00,0x02,0x00,0x7e]
-
-v_mov_b32 v255, s0
-// CHECK: [0x00,0x02,0xfe,0x7f]
-
-v_mov_b32 v0, s103
-// CHECK: [0x67,0x02,0x00,0x7e]
-
-v_mov_b32 v0, flat_scratch_lo
-// CHECK: [0x68,0x02,0x00,0x7e]
+v_mov_b32 v5, s1
+// CHECK: [0x01,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, flat_scratch_hi
-// CHECK: [0x69,0x02,0x00,0x7e]
+v_mov_b32 v255, s1
+// CHECK: [0x01,0x02,0xfe,0x7f]
 
-v_mov_b32 v0, vcc_lo
-// CHECK: [0x6a,0x02,0x00,0x7e]
+v_mov_b32 v5, s103
+// CHECK: [0x67,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, vcc_hi
-// CHECK: [0x6b,0x02,0x00,0x7e]
+v_mov_b32 v5, flat_scratch_lo
+// CHECK: [0x68,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, tba_lo
-// CHECK: [0x6c,0x02,0x00,0x7e]
+v_mov_b32 v5, flat_scratch_hi
+// CHECK: [0x69,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, tba_hi
-// CHECK: [0x6d,0x02,0x00,0x7e]
+v_mov_b32 v5, vcc_lo
+// CHECK: [0x6a,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, tma_lo
-// CHECK: [0x6e,0x02,0x00,0x7e]
+v_mov_b32 v5, vcc_hi
+// CHECK: [0x6b,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, tma_hi
-// CHECK: [0x6f,0x02,0x00,0x7e]
+v_mov_b32 v5, tba_lo
+// CHECK: [0x6c,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, ttmp11
-// CHECK: [0x7b,0x02,0x00,0x7e]
+v_mov_b32 v5, tba_hi
+// CHECK: [0x6d,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, m0
-// CHECK: [0x7c,0x02,0x00,0x7e]
+v_mov_b32 v5, tma_lo
+// CHECK: [0x6e,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, exec_lo
-// CHECK: [0x7e,0x02,0x00,0x7e]
+v_mov_b32 v5, tma_hi
+// CHECK: [0x6f,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, exec_hi
-// CHECK: [0x7f,0x02,0x00,0x7e]
+v_mov_b32 v5, ttmp11
+// CHECK: [0x7b,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, 0
-// CHECK: [0x80,0x02,0x00,0x7e]
+v_mov_b32 v5, m0
+// CHECK: [0x7c,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, -1
-// CHECK: [0xc1,0x02,0x00,0x7e]
+v_mov_b32 v5, exec_lo
+// CHECK: [0x7e,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, 0.5
-// CHECK: [0xf0,0x02,0x00,0x7e]
+v_mov_b32 v5, exec_hi
+// CHECK: [0x7f,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, -4.0
-// CHECK: [0xf7,0x02,0x00,0x7e]
+v_mov_b32 v5, 0
+// CHECK: [0x80,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, 0xaf123456
-// CHECK: [0xff,0x02,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_mov_b32 v5, -1
+// CHECK: [0xc1,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, 0x3f717273
-// CHECK: [0xff,0x02,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_mov_b32 v5, 0.5
+// CHECK: [0xf0,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, v0
-// CHECK: [0x00,0x03,0x00,0x7e]
+v_mov_b32 v5, -4.0
+// CHECK: [0xf7,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, v255
-// CHECK: [0xff,0x03,0x00,0x7e]
+v_mov_b32 v5, 0xaf123456
+// CHECK: [0xff,0x02,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_mov_b32_e64 v0, s0
-// CHECK: [0x00,0x00,0x02,0xd3,0x00,0x00,0x00,0x00]
+v_mov_b32 v5, 0x3f717273
+// CHECK: [0xff,0x02,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_mov_b32_e64 v255, s0
-// CHECK: [0xff,0x00,0x02,0xd3,0x00,0x00,0x00,0x00]
+v_mov_b32 v5, v1
+// CHECK: [0x01,0x03,0x0a,0x7e]
 
-v_mov_b32_e64 v0, s103
-// CHECK: [0x00,0x00,0x02,0xd3,0x67,0x00,0x00,0x00]
+v_mov_b32 v5, v255
+// CHECK: [0xff,0x03,0x0a,0x7e]
 
-v_mov_b32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x02,0xd3,0x68,0x00,0x00,0x00]
+v_mov_b32_e64 v5, s1
+// CHECK: [0x05,0x00,0x02,0xd3,0x01,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x02,0xd3,0x69,0x00,0x00,0x00]
+v_mov_b32_e64 v255, s1
+// CHECK: [0xff,0x00,0x02,0xd3,0x01,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x02,0xd3,0x6a,0x00,0x00,0x00]
+v_mov_b32_e64 v5, s103
+// CHECK: [0x05,0x00,0x02,0xd3,0x67,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x02,0xd3,0x6b,0x00,0x00,0x00]
+v_mov_b32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x02,0xd3,0x68,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x02,0xd3,0x6c,0x00,0x00,0x00]
+v_mov_b32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x02,0xd3,0x69,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x02,0xd3,0x6d,0x00,0x00,0x00]
+v_mov_b32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x02,0xd3,0x6a,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x02,0xd3,0x6e,0x00,0x00,0x00]
+v_mov_b32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x02,0xd3,0x6b,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x02,0xd3,0x6f,0x00,0x00,0x00]
+v_mov_b32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x02,0xd3,0x6c,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x02,0xd3,0x7b,0x00,0x00,0x00]
+v_mov_b32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x02,0xd3,0x6d,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, m0
-// CHECK: [0x00,0x00,0x02,0xd3,0x7c,0x00,0x00,0x00]
+v_mov_b32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x02,0xd3,0x6e,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x02,0xd3,0x7e,0x00,0x00,0x00]
+v_mov_b32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x02,0xd3,0x6f,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x02,0xd3,0x7f,0x00,0x00,0x00]
+v_mov_b32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x02,0xd3,0x7b,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, 0
-// CHECK: [0x00,0x00,0x02,0xd3,0x80,0x00,0x00,0x00]
+v_mov_b32_e64 v5, m0
+// CHECK: [0x05,0x00,0x02,0xd3,0x7c,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, -1
-// CHECK: [0x00,0x00,0x02,0xd3,0xc1,0x00,0x00,0x00]
+v_mov_b32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x02,0xd3,0x7e,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x02,0xd3,0xf0,0x00,0x00,0x00]
+v_mov_b32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x02,0xd3,0x7f,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x02,0xd3,0xf7,0x00,0x00,0x00]
+v_mov_b32_e64 v5, 0
+// CHECK: [0x05,0x00,0x02,0xd3,0x80,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, v0
-// CHECK: [0x00,0x00,0x02,0xd3,0x00,0x01,0x00,0x00]
+v_mov_b32_e64 v5, -1
+// CHECK: [0x05,0x00,0x02,0xd3,0xc1,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, v255
-// CHECK: [0x00,0x00,0x02,0xd3,0xff,0x01,0x00,0x00]
+v_mov_b32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x02,0xd3,0xf0,0x00,0x00,0x00]
 
-v_readfirstlane_b32 s0, v0
-// CHECK: [0x00,0x05,0x00,0x7e]
+v_mov_b32_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x02,0xd3,0xf7,0x00,0x00,0x00]
 
-v_readfirstlane_b32 s103, v0
-// CHECK: [0x00,0x05,0xce,0x7e]
+v_mov_b32_e64 v5, v1
+// CHECK: [0x05,0x00,0x02,0xd3,0x01,0x01,0x00,0x00]
 
-v_readfirstlane_b32 tba_lo, v0
-// CHECK: [0x00,0x05,0xd8,0x7e]
+v_mov_b32_e64 v5, v255
+// CHECK: [0x05,0x00,0x02,0xd3,0xff,0x01,0x00,0x00]
 
-v_readfirstlane_b32 tba_hi, v0
-// CHECK: [0x00,0x05,0xda,0x7e]
+v_readfirstlane_b32 s5, v1
+// CHECK: [0x01,0x05,0x0a,0x7e]
 
-v_readfirstlane_b32 tma_lo, v0
-// CHECK: [0x00,0x05,0xdc,0x7e]
+v_readfirstlane_b32 s103, v1
+// CHECK: [0x01,0x05,0xce,0x7e]
 
-v_readfirstlane_b32 tma_hi, v0
-// CHECK: [0x00,0x05,0xde,0x7e]
+v_readfirstlane_b32 tba_lo, v1
+// CHECK: [0x01,0x05,0xd8,0x7e]
 
-v_readfirstlane_b32 ttmp11, v0
-// CHECK: [0x00,0x05,0xf6,0x7e]
+v_readfirstlane_b32 tba_hi, v1
+// CHECK: [0x01,0x05,0xda,0x7e]
 
-v_readfirstlane_b32 s0, v255
-// CHECK: [0xff,0x05,0x00,0x7e]
+v_readfirstlane_b32 tma_lo, v1
+// CHECK: [0x01,0x05,0xdc,0x7e]
 
-v_cvt_i32_f64 v0, s[0:1]
-// CHECK: [0x00,0x06,0x00,0x7e]
+v_readfirstlane_b32 tma_hi, v1
+// CHECK: [0x01,0x05,0xde,0x7e]
 
-v_cvt_i32_f64 v255, s[0:1]
-// CHECK: [0x00,0x06,0xfe,0x7f]
+v_readfirstlane_b32 ttmp11, v1
+// CHECK: [0x01,0x05,0xf6,0x7e]
 
-v_cvt_i32_f64 v0, s[2:3]
-// CHECK: [0x02,0x06,0x00,0x7e]
+v_readfirstlane_b32 s5, v255
+// CHECK: [0xff,0x05,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, s[102:103]
-// CHECK: [0x66,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, s[2:3]
+// CHECK: [0x02,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, flat_scratch
-// CHECK: [0x68,0x06,0x00,0x7e]
+v_cvt_i32_f64 v255, s[2:3]
+// CHECK: [0x02,0x06,0xfe,0x7f]
 
-v_cvt_i32_f64 v0, vcc
-// CHECK: [0x6a,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, s[4:5]
+// CHECK: [0x04,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, tba
-// CHECK: [0x6c,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, s[102:103]
+// CHECK: [0x66,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, tma
-// CHECK: [0x6e,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, flat_scratch
+// CHECK: [0x68,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, ttmp[10:11]
-// CHECK: [0x7a,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, vcc
+// CHECK: [0x6a,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, exec
-// CHECK: [0x7e,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, tba
+// CHECK: [0x6c,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, 0
-// CHECK: [0x80,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, tma
+// CHECK: [0x6e,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, -1
-// CHECK: [0xc1,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, ttmp[10:11]
+// CHECK: [0x7a,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, 0.5
-// CHECK: [0xf0,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, exec
+// CHECK: [0x7e,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, -4.0
-// CHECK: [0xf7,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, 0
+// CHECK: [0x80,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, 0xaf123456
-// CHECK: [0xff,0x06,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_i32_f64 v5, -1
+// CHECK: [0xc1,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, 0x3f717273
-// CHECK: [0xff,0x06,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_i32_f64 v5, 0.5
+// CHECK: [0xf0,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, v[0:1]
-// CHECK: [0x00,0x07,0x00,0x7e]
+v_cvt_i32_f64 v5, -4.0
+// CHECK: [0xf7,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, v[254:255]
-// CHECK: [0xfe,0x07,0x00,0x7e]
+v_cvt_i32_f64 v5, 0xaf123456
+// CHECK: [0xff,0x06,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_i32_f64_e64 v0, s[0:1]
-// CHECK: [0x00,0x00,0x06,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_i32_f64 v5, 0x3f717273
+// CHECK: [0xff,0x06,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_i32_f64_e64 v255, s[0:1]
-// CHECK: [0xff,0x00,0x06,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_i32_f64 v5, v[1:2]
+// CHECK: [0x01,0x07,0x0a,0x7e]
 
-v_cvt_i32_f64_e64 v0, s[2:3]
-// CHECK: [0x00,0x00,0x06,0xd3,0x02,0x00,0x00,0x00]
+v_cvt_i32_f64 v5, v[254:255]
+// CHECK: [0xfe,0x07,0x0a,0x7e]
 
-v_cvt_i32_f64_e64 v0, s[102:103]
-// CHECK: [0x00,0x00,0x06,0xd3,0x66,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v5, s[2:3]
+// CHECK: [0x05,0x00,0x06,0xd3,0x02,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, flat_scratch
-// CHECK: [0x00,0x00,0x06,0xd3,0x68,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v255, s[2:3]
+// CHECK: [0xff,0x00,0x06,0xd3,0x02,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, vcc
-// CHECK: [0x00,0x00,0x06,0xd3,0x6a,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v5, s[4:5]
+// CHECK: [0x05,0x00,0x06,0xd3,0x04,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, tba
-// CHECK: [0x00,0x00,0x06,0xd3,0x6c,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v5, s[102:103]
+// CHECK: [0x05,0x00,0x06,0xd3,0x66,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, tma
-// CHECK: [0x00,0x00,0x06,0xd3,0x6e,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v5, flat_scratch
+// CHECK: [0x05,0x00,0x06,0xd3,0x68,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, ttmp[10:11]
-// CHECK: [0x00,0x00,0x06,0xd3,0x7a,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v5, vcc
+// CHECK: [0x05,0x00,0x06,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, exec
-// CHECK: [0x00,0x00,0x06,0xd3,0x7e,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v5, tba
+// CHECK: [0x05,0x00,0x06,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, scc
-// CHECK: [0x00,0x00,0x06,0xd3,0xfd,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v5, tma
+// CHECK: [0x05,0x00,0x06,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x06,0xd3,0x00,0x01,0x00,0x00]
+v_cvt_i32_f64_e64 v5, ttmp[10:11]
+// CHECK: [0x05,0x00,0x06,0xd3,0x7a,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, v[254:255]
-// CHECK: [0x00,0x00,0x06,0xd3,0xfe,0x01,0x00,0x00]
+v_cvt_i32_f64_e64 v5, exec
+// CHECK: [0x05,0x00,0x06,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, -s[0:1]
-// CHECK: [0x00,0x00,0x06,0xd3,0x00,0x00,0x00,0x20]
+v_cvt_i32_f64_e64 v5, scc
+// CHECK: [0x05,0x00,0x06,0xd3,0xfd,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, |s[0:1]|
-// CHECK: [0x00,0x01,0x06,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v5, v[1:2]
+// CHECK: [0x05,0x00,0x06,0xd3,0x01,0x01,0x00,0x00]
 
-v_cvt_f64_i32 v[0:1], s0
-// CHECK: [0x00,0x08,0x00,0x7e]
+v_cvt_i32_f64_e64 v5, v[254:255]
+// CHECK: [0x05,0x00,0x06,0xd3,0xfe,0x01,0x00,0x00]
 
-v_cvt_f64_i32 v[254:255], s0
-// CHECK: [0x00,0x08,0xfc,0x7f]
+v_cvt_i32_f64_e64 v5, -s[2:3]
+// CHECK: [0x05,0x00,0x06,0xd3,0x02,0x00,0x00,0x20]
 
-v_cvt_f64_i32 v[0:1], s103
-// CHECK: [0x67,0x08,0x00,0x7e]
+v_cvt_i32_f64_e64 v5, |s[2:3]|
+// CHECK: [0x05,0x01,0x06,0xd3,0x02,0x00,0x00,0x00]
 
-v_cvt_f64_i32 v[0:1], flat_scratch_lo
-// CHECK: [0x68,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], s1
+// CHECK: [0x01,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], flat_scratch_hi
-// CHECK: [0x69,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[254:255], s1
+// CHECK: [0x01,0x08,0xfc,0x7f]
 
-v_cvt_f64_i32 v[0:1], vcc_lo
-// CHECK: [0x6a,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], s103
+// CHECK: [0x67,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], vcc_hi
-// CHECK: [0x6b,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], flat_scratch_lo
+// CHECK: [0x68,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], tba_lo
-// CHECK: [0x6c,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], flat_scratch_hi
+// CHECK: [0x69,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], tba_hi
-// CHECK: [0x6d,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], vcc_lo
+// CHECK: [0x6a,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], tma_lo
-// CHECK: [0x6e,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], vcc_hi
+// CHECK: [0x6b,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], tma_hi
-// CHECK: [0x6f,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], tba_lo
+// CHECK: [0x6c,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], ttmp11
-// CHECK: [0x7b,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], tba_hi
+// CHECK: [0x6d,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], m0
-// CHECK: [0x7c,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], tma_lo
+// CHECK: [0x6e,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], exec_lo
-// CHECK: [0x7e,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], tma_hi
+// CHECK: [0x6f,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], exec_hi
-// CHECK: [0x7f,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], ttmp11
+// CHECK: [0x7b,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], 0
-// CHECK: [0x80,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], m0
+// CHECK: [0x7c,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], -1
-// CHECK: [0xc1,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], exec_lo
+// CHECK: [0x7e,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], 0.5
-// CHECK: [0xf0,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], exec_hi
+// CHECK: [0x7f,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], -4.0
-// CHECK: [0xf7,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], 0
+// CHECK: [0x80,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], 0xaf123456
-// CHECK: [0xff,0x08,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f64_i32 v[5:6], -1
+// CHECK: [0xc1,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], 0x3f717273
-// CHECK: [0xff,0x08,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f64_i32 v[5:6], 0.5
+// CHECK: [0xf0,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], v0
-// CHECK: [0x00,0x09,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], -4.0
+// CHECK: [0xf7,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], v255
-// CHECK: [0xff,0x09,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], 0xaf123456
+// CHECK: [0xff,0x08,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f64_i32_e64 v[0:1], s0
-// CHECK: [0x00,0x00,0x08,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f64_i32 v[5:6], 0x3f717273
+// CHECK: [0xff,0x08,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f64_i32_e64 v[254:255], s0
-// CHECK: [0xfe,0x00,0x08,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f64_i32 v[5:6], v1
+// CHECK: [0x01,0x09,0x0a,0x7e]
 
-v_cvt_f64_i32_e64 v[0:1], s103
-// CHECK: [0x00,0x00,0x08,0xd3,0x67,0x00,0x00,0x00]
+v_cvt_f64_i32 v[5:6], v255
+// CHECK: [0xff,0x09,0x0a,0x7e]
 
-v_cvt_f64_i32_e64 v[0:1], flat_scratch_lo
-// CHECK: [0x00,0x00,0x08,0xd3,0x68,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], s1
+// CHECK: [0x05,0x00,0x08,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], flat_scratch_hi
-// CHECK: [0x00,0x00,0x08,0xd3,0x69,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[254:255], s1
+// CHECK: [0xfe,0x00,0x08,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], vcc_lo
-// CHECK: [0x00,0x00,0x08,0xd3,0x6a,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], s103
+// CHECK: [0x05,0x00,0x08,0xd3,0x67,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], vcc_hi
-// CHECK: [0x00,0x00,0x08,0xd3,0x6b,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], flat_scratch_lo
+// CHECK: [0x05,0x00,0x08,0xd3,0x68,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], tba_lo
-// CHECK: [0x00,0x00,0x08,0xd3,0x6c,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], flat_scratch_hi
+// CHECK: [0x05,0x00,0x08,0xd3,0x69,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], tba_hi
-// CHECK: [0x00,0x00,0x08,0xd3,0x6d,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], vcc_lo
+// CHECK: [0x05,0x00,0x08,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], tma_lo
-// CHECK: [0x00,0x00,0x08,0xd3,0x6e,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], vcc_hi
+// CHECK: [0x05,0x00,0x08,0xd3,0x6b,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], tma_hi
-// CHECK: [0x00,0x00,0x08,0xd3,0x6f,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], tba_lo
+// CHECK: [0x05,0x00,0x08,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], ttmp11
-// CHECK: [0x00,0x00,0x08,0xd3,0x7b,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], tba_hi
+// CHECK: [0x05,0x00,0x08,0xd3,0x6d,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], m0
-// CHECK: [0x00,0x00,0x08,0xd3,0x7c,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], tma_lo
+// CHECK: [0x05,0x00,0x08,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], exec_lo
-// CHECK: [0x00,0x00,0x08,0xd3,0x7e,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], tma_hi
+// CHECK: [0x05,0x00,0x08,0xd3,0x6f,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], exec_hi
-// CHECK: [0x00,0x00,0x08,0xd3,0x7f,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], ttmp11
+// CHECK: [0x05,0x00,0x08,0xd3,0x7b,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], 0
-// CHECK: [0x00,0x00,0x08,0xd3,0x80,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], m0
+// CHECK: [0x05,0x00,0x08,0xd3,0x7c,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], -1
-// CHECK: [0x00,0x00,0x08,0xd3,0xc1,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], exec_lo
+// CHECK: [0x05,0x00,0x08,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], 0.5
-// CHECK: [0x00,0x00,0x08,0xd3,0xf0,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], exec_hi
+// CHECK: [0x05,0x00,0x08,0xd3,0x7f,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], -4.0
-// CHECK: [0x00,0x00,0x08,0xd3,0xf7,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], 0
+// CHECK: [0x05,0x00,0x08,0xd3,0x80,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], v0
-// CHECK: [0x00,0x00,0x08,0xd3,0x00,0x01,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], -1
+// CHECK: [0x05,0x00,0x08,0xd3,0xc1,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], v255
-// CHECK: [0x00,0x00,0x08,0xd3,0xff,0x01,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], 0.5
+// CHECK: [0x05,0x00,0x08,0xd3,0xf0,0x00,0x00,0x00]
 
-v_cvt_f32_i32 v0, s0
-// CHECK: [0x00,0x0a,0x00,0x7e]
+v_cvt_f64_i32_e64 v[5:6], -4.0
+// CHECK: [0x05,0x00,0x08,0xd3,0xf7,0x00,0x00,0x00]
 
-v_cvt_f32_i32 v255, s0
-// CHECK: [0x00,0x0a,0xfe,0x7f]
+v_cvt_f64_i32_e64 v[5:6], v1
+// CHECK: [0x05,0x00,0x08,0xd3,0x01,0x01,0x00,0x00]
 
-v_cvt_f32_i32 v0, s103
-// CHECK: [0x67,0x0a,0x00,0x7e]
+v_cvt_f64_i32_e64 v[5:6], v255
+// CHECK: [0x05,0x00,0x08,0xd3,0xff,0x01,0x00,0x00]
 
-v_cvt_f32_i32 v0, flat_scratch_lo
-// CHECK: [0x68,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, s1
+// CHECK: [0x01,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, flat_scratch_hi
-// CHECK: [0x69,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v255, s1
+// CHECK: [0x01,0x0a,0xfe,0x7f]
 
-v_cvt_f32_i32 v0, vcc_lo
-// CHECK: [0x6a,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, s103
+// CHECK: [0x67,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, vcc_hi
-// CHECK: [0x6b,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, flat_scratch_lo
+// CHECK: [0x68,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, tba_lo
-// CHECK: [0x6c,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, flat_scratch_hi
+// CHECK: [0x69,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, tba_hi
-// CHECK: [0x6d,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, vcc_lo
+// CHECK: [0x6a,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, tma_lo
-// CHECK: [0x6e,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, vcc_hi
+// CHECK: [0x6b,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, tma_hi
-// CHECK: [0x6f,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, tba_lo
+// CHECK: [0x6c,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, ttmp11
-// CHECK: [0x7b,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, tba_hi
+// CHECK: [0x6d,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, m0
-// CHECK: [0x7c,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, tma_lo
+// CHECK: [0x6e,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, exec_lo
-// CHECK: [0x7e,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, tma_hi
+// CHECK: [0x6f,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, exec_hi
-// CHECK: [0x7f,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, ttmp11
+// CHECK: [0x7b,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, 0
-// CHECK: [0x80,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, m0
+// CHECK: [0x7c,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, -1
-// CHECK: [0xc1,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, exec_lo
+// CHECK: [0x7e,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, 0.5
-// CHECK: [0xf0,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, exec_hi
+// CHECK: [0x7f,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, -4.0
-// CHECK: [0xf7,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, 0
+// CHECK: [0x80,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, 0xaf123456
-// CHECK: [0xff,0x0a,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f32_i32 v5, -1
+// CHECK: [0xc1,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, 0x3f717273
-// CHECK: [0xff,0x0a,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f32_i32 v5, 0.5
+// CHECK: [0xf0,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, v0
-// CHECK: [0x00,0x0b,0x00,0x7e]
+v_cvt_f32_i32 v5, -4.0
+// CHECK: [0xf7,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, v255
-// CHECK: [0xff,0x0b,0x00,0x7e]
+v_cvt_f32_i32 v5, 0xaf123456
+// CHECK: [0xff,0x0a,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f32_i32_e64 v0, s0
-// CHECK: [0x00,0x00,0x0a,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f32_i32 v5, 0x3f717273
+// CHECK: [0xff,0x0a,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f32_i32_e64 v255, s0
-// CHECK: [0xff,0x00,0x0a,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f32_i32 v5, v1
+// CHECK: [0x01,0x0b,0x0a,0x7e]
 
-v_cvt_f32_i32_e64 v0, s103
-// CHECK: [0x00,0x00,0x0a,0xd3,0x67,0x00,0x00,0x00]
+v_cvt_f32_i32 v5, v255
+// CHECK: [0xff,0x0b,0x0a,0x7e]
 
-v_cvt_f32_i32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x0a,0xd3,0x68,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, s1
+// CHECK: [0x05,0x00,0x0a,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x0a,0xd3,0x69,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v255, s1
+// CHECK: [0xff,0x00,0x0a,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x0a,0xd3,0x6a,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, s103
+// CHECK: [0x05,0x00,0x0a,0xd3,0x67,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x0a,0xd3,0x6b,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x0a,0xd3,0x68,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x0a,0xd3,0x6c,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x0a,0xd3,0x69,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x0a,0xd3,0x6d,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x0a,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x0a,0xd3,0x6e,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x0a,0xd3,0x6b,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x0a,0xd3,0x6f,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x0a,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x0a,0xd3,0x7b,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x0a,0xd3,0x6d,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, m0
-// CHECK: [0x00,0x00,0x0a,0xd3,0x7c,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x0a,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x0a,0xd3,0x7e,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x0a,0xd3,0x6f,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x0a,0xd3,0x7f,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x0a,0xd3,0x7b,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, 0
-// CHECK: [0x00,0x00,0x0a,0xd3,0x80,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, m0
+// CHECK: [0x05,0x00,0x0a,0xd3,0x7c,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, -1
-// CHECK: [0x00,0x00,0x0a,0xd3,0xc1,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x0a,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x0a,0xd3,0xf0,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x0a,0xd3,0x7f,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x0a,0xd3,0xf7,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, 0
+// CHECK: [0x05,0x00,0x0a,0xd3,0x80,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, v0
-// CHECK: [0x00,0x00,0x0a,0xd3,0x00,0x01,0x00,0x00]
+v_cvt_f32_i32_e64 v5, -1
+// CHECK: [0x05,0x00,0x0a,0xd3,0xc1,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, v255
-// CHECK: [0x00,0x00,0x0a,0xd3,0xff,0x01,0x00,0x00]
+v_cvt_f32_i32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x0a,0xd3,0xf0,0x00,0x00,0x00]
 
-v_cvt_f32_u32 v0, s0
-// CHECK: [0x00,0x0c,0x00,0x7e]
+v_cvt_f32_i32_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x0a,0xd3,0xf7,0x00,0x00,0x00]
 
-v_cvt_f32_u32 v255, s0
-// CHECK: [0x00,0x0c,0xfe,0x7f]
+v_cvt_f32_i32_e64 v5, v1
+// CHECK: [0x05,0x00,0x0a,0xd3,0x01,0x01,0x00,0x00]
 
-v_cvt_f32_u32 v0, s103
-// CHECK: [0x67,0x0c,0x00,0x7e]
+v_cvt_f32_i32_e64 v5, v255
+// CHECK: [0x05,0x00,0x0a,0xd3,0xff,0x01,0x00,0x00]
 
-v_cvt_f32_u32 v0, flat_scratch_lo
-// CHECK: [0x68,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, s1
+// CHECK: [0x01,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, flat_scratch_hi
-// CHECK: [0x69,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v255, s1
+// CHECK: [0x01,0x0c,0xfe,0x7f]
 
-v_cvt_f32_u32 v0, vcc_lo
-// CHECK: [0x6a,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, s103
+// CHECK: [0x67,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, vcc_hi
-// CHECK: [0x6b,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, flat_scratch_lo
+// CHECK: [0x68,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, tba_lo
-// CHECK: [0x6c,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, flat_scratch_hi
+// CHECK: [0x69,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, tba_hi
-// CHECK: [0x6d,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, vcc_lo
+// CHECK: [0x6a,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, tma_lo
-// CHECK: [0x6e,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, vcc_hi
+// CHECK: [0x6b,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, tma_hi
-// CHECK: [0x6f,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, tba_lo
+// CHECK: [0x6c,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, ttmp11
-// CHECK: [0x7b,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, tba_hi
+// CHECK: [0x6d,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, m0
-// CHECK: [0x7c,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, tma_lo
+// CHECK: [0x6e,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, exec_lo
-// CHECK: [0x7e,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, tma_hi
+// CHECK: [0x6f,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, exec_hi
-// CHECK: [0x7f,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, ttmp11
+// CHECK: [0x7b,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, 0
-// CHECK: [0x80,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, m0
+// CHECK: [0x7c,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, -1
-// CHECK: [0xc1,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, exec_lo
+// CHECK: [0x7e,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, 0.5
-// CHECK: [0xf0,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, exec_hi
+// CHECK: [0x7f,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, -4.0
-// CHECK: [0xf7,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, 0
+// CHECK: [0x80,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, 0xaf123456
-// CHECK: [0xff,0x0c,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f32_u32 v5, -1
+// CHECK: [0xc1,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, 0x3f717273
-// CHECK: [0xff,0x0c,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f32_u32 v5, 0.5
+// CHECK: [0xf0,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, v0
-// CHECK: [0x00,0x0d,0x00,0x7e]
+v_cvt_f32_u32 v5, -4.0
+// CHECK: [0xf7,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, v255
-// CHECK: [0xff,0x0d,0x00,0x7e]
+v_cvt_f32_u32 v5, 0xaf123456
+// CHECK: [0xff,0x0c,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f32_u32_e64 v0, s0
-// CHECK: [0x00,0x00,0x0c,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f32_u32 v5, 0x3f717273
+// CHECK: [0xff,0x0c,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f32_u32_e64 v255, s0
-// CHECK: [0xff,0x00,0x0c,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f32_u32 v5, v1
+// CHECK: [0x01,0x0d,0x0a,0x7e]
 
-v_cvt_f32_u32_e64 v0, s103
-// CHECK: [0x00,0x00,0x0c,0xd3,0x67,0x00,0x00,0x00]
+v_cvt_f32_u32 v5, v255
+// CHECK: [0xff,0x0d,0x0a,0x7e]
 
-v_cvt_f32_u32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x0c,0xd3,0x68,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, s1
+// CHECK: [0x05,0x00,0x0c,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x0c,0xd3,0x69,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v255, s1
+// CHECK: [0xff,0x00,0x0c,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x0c,0xd3,0x6a,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, s103
+// CHECK: [0x05,0x00,0x0c,0xd3,0x67,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x0c,0xd3,0x6b,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x0c,0xd3,0x68,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x0c,0xd3,0x6c,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x0c,0xd3,0x69,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x0c,0xd3,0x6d,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x0c,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x0c,0xd3,0x6e,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x0c,0xd3,0x6b,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x0c,0xd3,0x6f,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x0c,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x0c,0xd3,0x7b,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x0c,0xd3,0x6d,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, m0
-// CHECK: [0x00,0x00,0x0c,0xd3,0x7c,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x0c,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x0c,0xd3,0x7e,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x0c,0xd3,0x6f,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x0c,0xd3,0x7f,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x0c,0xd3,0x7b,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, 0
-// CHECK: [0x00,0x00,0x0c,0xd3,0x80,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, m0
+// CHECK: [0x05,0x00,0x0c,0xd3,0x7c,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, -1
-// CHECK: [0x00,0x00,0x0c,0xd3,0xc1,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x0c,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x0c,0xd3,0xf0,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x0c,0xd3,0x7f,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x0c,0xd3,0xf7,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, 0
+// CHECK: [0x05,0x00,0x0c,0xd3,0x80,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, v0
-// CHECK: [0x00,0x00,0x0c,0xd3,0x00,0x01,0x00,0x00]
+v_cvt_f32_u32_e64 v5, -1
+// CHECK: [0x05,0x00,0x0c,0xd3,0xc1,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, v255
-// CHECK: [0x00,0x00,0x0c,0xd3,0xff,0x01,0x00,0x00]
+v_cvt_f32_u32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x0c,0xd3,0xf0,0x00,0x00,0x00]
 
-v_cvt_u32_f32 v0, s0
-// CHECK: [0x00,0x0e,0x00,0x7e]
+v_cvt_f32_u32_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x0c,0xd3,0xf7,0x00,0x00,0x00]
 
-v_cvt_u32_f32 v255, s0
-// CHECK: [0x00,0x0e,0xfe,0x7f]
+v_cvt_f32_u32_e64 v5, v1
+// CHECK: [0x05,0x00,0x0c,0xd3,0x01,0x01,0x00,0x00]
 
-v_cvt_u32_f32 v0, s103
-// CHECK: [0x67,0x0e,0x00,0x7e]
+v_cvt_f32_u32_e64 v5, v255
+// CHECK: [0x05,0x00,0x0c,0xd3,0xff,0x01,0x00,0x00]
 
-v_cvt_u32_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, s1
+// CHECK: [0x01,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v255, s1
+// CHECK: [0x01,0x0e,0xfe,0x7f]
 
-v_cvt_u32_f32 v0, vcc_lo
-// CHECK: [0x6a,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, s103
+// CHECK: [0x67,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, vcc_hi
-// CHECK: [0x6b,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, tba_lo
-// CHECK: [0x6c,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, tba_hi
-// CHECK: [0x6d,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, vcc_lo
+// CHECK: [0x6a,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, tma_lo
-// CHECK: [0x6e,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, vcc_hi
+// CHECK: [0x6b,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, tma_hi
-// CHECK: [0x6f,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, tba_lo
+// CHECK: [0x6c,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, ttmp11
-// CHECK: [0x7b,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, tba_hi
+// CHECK: [0x6d,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, m0
-// CHECK: [0x7c,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, tma_lo
+// CHECK: [0x6e,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, exec_lo
-// CHECK: [0x7e,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, tma_hi
+// CHECK: [0x6f,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, exec_hi
-// CHECK: [0x7f,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, ttmp11
+// CHECK: [0x7b,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, 0
-// CHECK: [0x80,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, m0
+// CHECK: [0x7c,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, -1
-// CHECK: [0xc1,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, exec_lo
+// CHECK: [0x7e,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, 0.5
-// CHECK: [0xf0,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, exec_hi
+// CHECK: [0x7f,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, -4.0
-// CHECK: [0xf7,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, 0
+// CHECK: [0x80,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, 0xaf123456
-// CHECK: [0xff,0x0e,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_u32_f32 v5, -1
+// CHECK: [0xc1,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, 0x3f717273
-// CHECK: [0xff,0x0e,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_u32_f32 v5, 0.5
+// CHECK: [0xf0,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, v0
-// CHECK: [0x00,0x0f,0x00,0x7e]
+v_cvt_u32_f32 v5, -4.0
+// CHECK: [0xf7,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, v255
-// CHECK: [0xff,0x0f,0x00,0x7e]
+v_cvt_u32_f32 v5, 0xaf123456
+// CHECK: [0xff,0x0e,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_u32_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x0e,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_u32_f32 v5, 0x3f717273
+// CHECK: [0xff,0x0e,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_u32_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x0e,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_u32_f32 v5, v1
+// CHECK: [0x01,0x0f,0x0a,0x7e]
 
-v_cvt_u32_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x0e,0xd3,0x67,0x00,0x00,0x00]
+v_cvt_u32_f32 v5, v255
+// CHECK: [0xff,0x0f,0x0a,0x7e]
 
-v_cvt_u32_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x0e,0xd3,0x68,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x0e,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x0e,0xd3,0x69,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x0e,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x0e,0xd3,0x6a,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x0e,0xd3,0x67,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x0e,0xd3,0x6b,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x0e,0xd3,0x68,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x0e,0xd3,0x6c,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x0e,0xd3,0x69,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x0e,0xd3,0x6d,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x0e,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x0e,0xd3,0x6e,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x0e,0xd3,0x6b,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x0e,0xd3,0x6f,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x0e,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x0e,0xd3,0x7b,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x0e,0xd3,0x6d,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x0e,0xd3,0x7c,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x0e,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x0e,0xd3,0x7e,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x0e,0xd3,0x6f,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x0e,0xd3,0x7f,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x0e,0xd3,0x7b,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x0e,0xd3,0xfd,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x0e,0xd3,0x7c,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x0e,0xd3,0x00,0x01,0x00,0x00]
+v_cvt_u32_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x0e,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x0e,0xd3,0xff,0x01,0x00,0x00]
+v_cvt_u32_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x0e,0xd3,0x7f,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x0e,0xd3,0x00,0x00,0x00,0x20]
+v_cvt_u32_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x0e,0xd3,0xfd,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x0e,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x0e,0xd3,0x01,0x01,0x00,0x00]
 
-v_cvt_i32_f32 v0, s0
-// CHECK: [0x00,0x10,0x00,0x7e]
+v_cvt_u32_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x0e,0xd3,0xff,0x01,0x00,0x00]
 
-v_cvt_i32_f32 v255, s0
-// CHECK: [0x00,0x10,0xfe,0x7f]
+v_cvt_u32_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x0e,0xd3,0x01,0x00,0x00,0x20]
 
-v_cvt_i32_f32 v0, s103
-// CHECK: [0x67,0x10,0x00,0x7e]
+v_cvt_u32_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x0e,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_i32_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, s1
+// CHECK: [0x01,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x10,0x00,0x7e]
+v_cvt_i32_f32 v255, s1
+// CHECK: [0x01,0x10,0xfe,0x7f]
 
-v_cvt_i32_f32 v0, vcc_lo
-// CHECK: [0x6a,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, s103
+// CHECK: [0x67,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, vcc_hi
-// CHECK: [0x6b,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, tba_lo
-// CHECK: [0x6c,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, tba_hi
-// CHECK: [0x6d,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, vcc_lo
+// CHECK: [0x6a,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, tma_lo
-// CHECK: [0x6e,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, vcc_hi
+// CHECK: [0x6b,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, tma_hi
-// CHECK: [0x6f,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, tba_lo
+// CHECK: [0x6c,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, ttmp11
-// CHECK: [0x7b,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, tba_hi
+// CHECK: [0x6d,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, m0
-// CHECK: [0x7c,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, tma_lo
+// CHECK: [0x6e,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, exec_lo
-// CHECK: [0x7e,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, tma_hi
+// CHECK: [0x6f,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, exec_hi
-// CHECK: [0x7f,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, ttmp11
+// CHECK: [0x7b,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, 0
-// CHECK: [0x80,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, m0
+// CHECK: [0x7c,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, -1
-// CHECK: [0xc1,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, exec_lo
+// CHECK: [0x7e,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, 0.5
-// CHECK: [0xf0,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, exec_hi
+// CHECK: [0x7f,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, -4.0
-// CHECK: [0xf7,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, 0
+// CHECK: [0x80,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, 0xaf123456
-// CHECK: [0xff,0x10,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_i32_f32 v5, -1
+// CHECK: [0xc1,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, 0x3f717273
-// CHECK: [0xff,0x10,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_i32_f32 v5, 0.5
+// CHECK: [0xf0,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, v0
-// CHECK: [0x00,0x11,0x00,0x7e]
+v_cvt_i32_f32 v5, -4.0
+// CHECK: [0xf7,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, v255
-// CHECK: [0xff,0x11,0x00,0x7e]
+v_cvt_i32_f32 v5, 0xaf123456
+// CHECK: [0xff,0x10,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_i32_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x10,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_i32_f32 v5, 0x3f717273
+// CHECK: [0xff,0x10,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_i32_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x10,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_i32_f32 v5, v1
+// CHECK: [0x01,0x11,0x0a,0x7e]
 
-v_cvt_i32_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x10,0xd3,0x67,0x00,0x00,0x00]
+v_cvt_i32_f32 v5, v255
+// CHECK: [0xff,0x11,0x0a,0x7e]
 
-v_cvt_i32_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x10,0xd3,0x68,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x10,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x10,0xd3,0x69,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x10,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x10,0xd3,0x6a,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x10,0xd3,0x67,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x10,0xd3,0x6b,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x10,0xd3,0x68,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x10,0xd3,0x6c,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x10,0xd3,0x69,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x10,0xd3,0x6d,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x10,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x10,0xd3,0x6e,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x10,0xd3,0x6b,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x10,0xd3,0x6f,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x10,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x10,0xd3,0x7b,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x10,0xd3,0x6d,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x10,0xd3,0x7c,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x10,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x10,0xd3,0x7e,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x10,0xd3,0x6f,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x10,0xd3,0x7f,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x10,0xd3,0x7b,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x10,0xd3,0xfd,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x10,0xd3,0x7c,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x10,0xd3,0x00,0x01,0x00,0x00]
+v_cvt_i32_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x10,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x10,0xd3,0xff,0x01,0x00,0x00]
+v_cvt_i32_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x10,0xd3,0x7f,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x10,0xd3,0x00,0x00,0x00,0x20]
+v_cvt_i32_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x10,0xd3,0xfd,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x10,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x10,0xd3,0x01,0x01,0x00,0x00]
 
-v_mov_fed_b32 v0, s0
-// CHECK: [0x00,0x12,0x00,0x7e]
+v_cvt_i32_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x10,0xd3,0xff,0x01,0x00,0x00]
 
-v_mov_fed_b32 v255, s0
-// CHECK: [0x00,0x12,0xfe,0x7f]
+v_cvt_i32_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x10,0xd3,0x01,0x00,0x00,0x20]
 
-v_mov_fed_b32 v0, s103
-// CHECK: [0x67,0x12,0x00,0x7e]
+v_cvt_i32_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x10,0xd3,0x01,0x00,0x00,0x00]
 
-v_mov_fed_b32 v0, flat_scratch_lo
-// CHECK: [0x68,0x12,0x00,0x7e]
+v_mov_fed_b32 v5, s1
+// CHECK: [0x01,0x12,0x0a,0x7e]
 
-v_mov_fed_b32 v0, flat_scratch_hi
-// CHECK: [0x69,0x12,0x00,0x7e]
+v_mov_fed_b32 v255, s1
+// CHECK: [0x01,0x12,0xfe,0x7f]
 
-v_mov_fed_b32 v0, vcc_lo
-// CHECK: [0x6a,0x12,0x00,0x7e]
+v_mov_fed_b32 v5, s103
+// CHECK: [0x67,0x12,0x0a,0x7e]
 
-v_mov_fed_b32 v0, vcc_hi
-// CHECK: [0x6b,0x12,0x00,0x7e]
+v_mov_fed_b32 v5, flat_scratch_lo
+// CHECK: [0x68,0x12,0x0a,0x7e]
 
-v_mov_fed_b32 v0, tba_lo
-// CHECK: [0x6c,0x12,0x00,0x7e]
+v_mov_fed_b32 v5, flat_scratch_hi
+// CHECK: [0x69,0x12,0x0a,0x7e]
 
-v_mov_fed_b32 v0, tba_hi
-// CHECK: [0x6d,0x12,0x00,0x7e]
+v_mov_fed_b32 v5, vcc_lo
+// CHECK: [0x6a,0x12,0x0a,0x7e]
 
-v_mov_fed_b32 v0, tma_lo
-// CHECK: [0x6e,0x12,0x00,0x7e]
+v_mov_fed_b32 v5, vcc_hi
+// CHECK: [0x6b,0x12,0x0a,0x7e]
 
-v_mov_fed_b32 v0, tma_hi
-// CHECK: [0x6f,0x12,0x00,0x7e]
+v_mov_fed_b32 v5, tba_lo
+// CHECK: [0x6c,0x12,0x0a,0x7e]
 
-v_mov_fed_b32 v0, ttmp11
-// CHECK: [0x7b,0x12,0x00,0x7e]
+v_mov_fed_b32 v5, tba_hi
+// CHECK: [0x6d,0x12,0x0a,0x7e]
 
-v_mov_fed_b32 v0, m0
-// CHECK: [0x7c,0x12,0x00,0x7e]
+v_mov_fed_b32 v5, tma_lo
+// CHECK: [0x6e,0x12,0x0a,0x7e]
 
-v_mov_fed_b32 v0, exec_lo
-// CHECK: [0x7e,0x12,0x00,0x7e]
+v_mov_fed_b32 v5, tma_hi
+// CHECK: [0x6f,0x12,0x0a,0x7e]
 
-v_mov_fed_b32 v0, exec_hi
-// CHECK: [0x7f,0x12,0x00,0x7e]
+v_mov_fed_b32 v5, ttmp11
+// CHECK: [0x7b,0x12,0x0a,0x7e]
 
-v_mov_fed_b32 v0, 0
-// CHECK: [0x80,0x12,0x00,0x7e]
+v_mov_fed_b32 v5, m0
+// CHECK: [0x7c,0x12,0x0a,0x7e]
 
-v_mov_fed_b32 v0, -1
-// CHECK: [0xc1,0x12,0x00,0x7e]
+v_mov_fed_b32 v5, exec_lo
+// CHECK: [0x7e,0x12,0x0a,0x7e]
 
-v_mov_fed_b32 v0, 0.5
-// CHECK: [0xf0,0x12,0x00,0x7e]
+v_mov_fed_b32 v5, exec_hi
+// CHECK: [0x7f,0x12,0x0a,0x7e]
 
-v_mov_fed_b32 v0, -4.0
-// CHECK: [0xf7,0x12,0x00,0x7e]
+v_mov_fed_b32 v5, 0
+// CHECK: [0x80,0x12,0x0a,0x7e]
 
-v_mov_fed_b32 v0, 0xaf123456
-// CHECK: [0xff,0x12,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_mov_fed_b32 v5, -1
+// CHECK: [0xc1,0x12,0x0a,0x7e]
 
-v_mov_fed_b32 v0, 0x3f717273
-// CHECK: [0xff,0x12,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_mov_fed_b32 v5, 0.5
+// CHECK: [0xf0,0x12,0x0a,0x7e]
 
-v_mov_fed_b32 v0, v0
-// CHECK: [0x00,0x13,0x00,0x7e]
+v_mov_fed_b32 v5, -4.0
+// CHECK: [0xf7,0x12,0x0a,0x7e]
 
-v_mov_fed_b32 v0, v255
-// CHECK: [0xff,0x13,0x00,0x7e]
+v_mov_fed_b32 v5, 0xaf123456
+// CHECK: [0xff,0x12,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_mov_fed_b32_e64 v0, s0
-// CHECK: [0x00,0x00,0x12,0xd3,0x00,0x00,0x00,0x00]
+v_mov_fed_b32 v5, 0x3f717273
+// CHECK: [0xff,0x12,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_mov_fed_b32_e64 v255, s0
-// CHECK: [0xff,0x00,0x12,0xd3,0x00,0x00,0x00,0x00]
+v_mov_fed_b32 v5, v1
+// CHECK: [0x01,0x13,0x0a,0x7e]
 
-v_mov_fed_b32_e64 v0, s103
-// CHECK: [0x00,0x00,0x12,0xd3,0x67,0x00,0x00,0x00]
+v_mov_fed_b32 v5, v255
+// CHECK: [0xff,0x13,0x0a,0x7e]
 
-v_mov_fed_b32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x12,0xd3,0x68,0x00,0x00,0x00]
+v_mov_fed_b32_e64 v5, s1
+// CHECK: [0x05,0x00,0x12,0xd3,0x01,0x00,0x00,0x00]
 
-v_mov_fed_b32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x12,0xd3,0x69,0x00,0x00,0x00]
+v_mov_fed_b32_e64 v255, s1
+// CHECK: [0xff,0x00,0x12,0xd3,0x01,0x00,0x00,0x00]
 
-v_mov_fed_b32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x12,0xd3,0x6a,0x00,0x00,0x00]
+v_mov_fed_b32_e64 v5, s103
+// CHECK: [0x05,0x00,0x12,0xd3,0x67,0x00,0x00,0x00]
 
-v_mov_fed_b32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x12,0xd3,0x6b,0x00,0x00,0x00]
+v_mov_fed_b32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x12,0xd3,0x68,0x00,0x00,0x00]
 
-v_mov_fed_b32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x12,0xd3,0x6c,0x00,0x00,0x00]
+v_mov_fed_b32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x12,0xd3,0x69,0x00,0x00,0x00]
 
-v_mov_fed_b32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x12,0xd3,0x6d,0x00,0x00,0x00]
+v_mov_fed_b32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x12,0xd3,0x6a,0x00,0x00,0x00]
 
-v_mov_fed_b32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x12,0xd3,0x6e,0x00,0x00,0x00]
+v_mov_fed_b32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x12,0xd3,0x6b,0x00,0x00,0x00]
 
-v_mov_fed_b32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x12,0xd3,0x6f,0x00,0x00,0x00]
+v_mov_fed_b32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x12,0xd3,0x6c,0x00,0x00,0x00]
 
-v_mov_fed_b32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x12,0xd3,0x7b,0x00,0x00,0x00]
+v_mov_fed_b32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x12,0xd3,0x6d,0x00,0x00,0x00]
 
-v_mov_fed_b32_e64 v0, m0
-// CHECK: [0x00,0x00,0x12,0xd3,0x7c,0x00,0x00,0x00]
+v_mov_fed_b32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x12,0xd3,0x6e,0x00,0x00,0x00]
 
-v_mov_fed_b32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x12,0xd3,0x7e,0x00,0x00,0x00]
+v_mov_fed_b32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x12,0xd3,0x6f,0x00,0x00,0x00]
 
-v_mov_fed_b32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x12,0xd3,0x7f,0x00,0x00,0x00]
+v_mov_fed_b32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x12,0xd3,0x7b,0x00,0x00,0x00]
 
-v_mov_fed_b32_e64 v0, 0
-// CHECK: [0x00,0x00,0x12,0xd3,0x80,0x00,0x00,0x00]
+v_mov_fed_b32_e64 v5, m0
+// CHECK: [0x05,0x00,0x12,0xd3,0x7c,0x00,0x00,0x00]
 
-v_mov_fed_b32_e64 v0, -1
-// CHECK: [0x00,0x00,0x12,0xd3,0xc1,0x00,0x00,0x00]
+v_mov_fed_b32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x12,0xd3,0x7e,0x00,0x00,0x00]
 
-v_mov_fed_b32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x12,0xd3,0xf0,0x00,0x00,0x00]
+v_mov_fed_b32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x12,0xd3,0x7f,0x00,0x00,0x00]
 
-v_mov_fed_b32_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x12,0xd3,0xf7,0x00,0x00,0x00]
+v_mov_fed_b32_e64 v5, 0
+// CHECK: [0x05,0x00,0x12,0xd3,0x80,0x00,0x00,0x00]
 
-v_mov_fed_b32_e64 v0, v0
-// CHECK: [0x00,0x00,0x12,0xd3,0x00,0x01,0x00,0x00]
+v_mov_fed_b32_e64 v5, -1
+// CHECK: [0x05,0x00,0x12,0xd3,0xc1,0x00,0x00,0x00]
 
-v_mov_fed_b32_e64 v0, v255
-// CHECK: [0x00,0x00,0x12,0xd3,0xff,0x01,0x00,0x00]
+v_mov_fed_b32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x12,0xd3,0xf0,0x00,0x00,0x00]
 
-v_cvt_f16_f32 v0, s0
-// CHECK: [0x00,0x14,0x00,0x7e]
+v_mov_fed_b32_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x12,0xd3,0xf7,0x00,0x00,0x00]
 
-v_cvt_f16_f32 v255, s0
-// CHECK: [0x00,0x14,0xfe,0x7f]
+v_mov_fed_b32_e64 v5, v1
+// CHECK: [0x05,0x00,0x12,0xd3,0x01,0x01,0x00,0x00]
 
-v_cvt_f16_f32 v0, s103
-// CHECK: [0x67,0x14,0x00,0x7e]
+v_mov_fed_b32_e64 v5, v255
+// CHECK: [0x05,0x00,0x12,0xd3,0xff,0x01,0x00,0x00]
 
-v_cvt_f16_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, s1
+// CHECK: [0x01,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x14,0x00,0x7e]
+v_cvt_f16_f32 v255, s1
+// CHECK: [0x01,0x14,0xfe,0x7f]
 
-v_cvt_f16_f32 v0, vcc_lo
-// CHECK: [0x6a,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, s103
+// CHECK: [0x67,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, vcc_hi
-// CHECK: [0x6b,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, tba_lo
-// CHECK: [0x6c,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, tba_hi
-// CHECK: [0x6d,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, vcc_lo
+// CHECK: [0x6a,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, tma_lo
-// CHECK: [0x6e,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, vcc_hi
+// CHECK: [0x6b,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, tma_hi
-// CHECK: [0x6f,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, tba_lo
+// CHECK: [0x6c,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, ttmp11
-// CHECK: [0x7b,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, tba_hi
+// CHECK: [0x6d,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, m0
-// CHECK: [0x7c,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, tma_lo
+// CHECK: [0x6e,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, exec_lo
-// CHECK: [0x7e,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, tma_hi
+// CHECK: [0x6f,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, exec_hi
-// CHECK: [0x7f,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, ttmp11
+// CHECK: [0x7b,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, 0
-// CHECK: [0x80,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, m0
+// CHECK: [0x7c,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, -1
-// CHECK: [0xc1,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, exec_lo
+// CHECK: [0x7e,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, 0.5
-// CHECK: [0xf0,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, exec_hi
+// CHECK: [0x7f,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, -4.0
-// CHECK: [0xf7,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, 0
+// CHECK: [0x80,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, 0xaf123456
-// CHECK: [0xff,0x14,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f16_f32 v5, -1
+// CHECK: [0xc1,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, 0x3f717273
-// CHECK: [0xff,0x14,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f16_f32 v5, 0.5
+// CHECK: [0xf0,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, v0
-// CHECK: [0x00,0x15,0x00,0x7e]
+v_cvt_f16_f32 v5, -4.0
+// CHECK: [0xf7,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, v255
-// CHECK: [0xff,0x15,0x00,0x7e]
+v_cvt_f16_f32 v5, 0xaf123456
+// CHECK: [0xff,0x14,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f16_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x14,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f16_f32 v5, 0x3f717273
+// CHECK: [0xff,0x14,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f16_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x14,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f16_f32 v5, v1
+// CHECK: [0x01,0x15,0x0a,0x7e]
 
-v_cvt_f16_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x14,0xd3,0x67,0x00,0x00,0x00]
+v_cvt_f16_f32 v5, v255
+// CHECK: [0xff,0x15,0x0a,0x7e]
 
-v_cvt_f16_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x14,0xd3,0x68,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x14,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x14,0xd3,0x69,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x14,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x14,0xd3,0x6a,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x14,0xd3,0x67,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x14,0xd3,0x6b,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x14,0xd3,0x68,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x14,0xd3,0x6c,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x14,0xd3,0x69,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x14,0xd3,0x6d,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x14,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x14,0xd3,0x6e,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x14,0xd3,0x6b,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x14,0xd3,0x6f,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x14,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x14,0xd3,0x7b,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x14,0xd3,0x6d,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x14,0xd3,0x7c,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x14,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x14,0xd3,0x7e,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x14,0xd3,0x6f,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x14,0xd3,0x7f,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x14,0xd3,0x7b,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x14,0xd3,0xfd,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x14,0xd3,0x7c,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x14,0xd3,0x00,0x01,0x00,0x00]
+v_cvt_f16_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x14,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x14,0xd3,0xff,0x01,0x00,0x00]
+v_cvt_f16_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x14,0xd3,0x7f,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x14,0xd3,0x00,0x00,0x00,0x20]
+v_cvt_f16_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x14,0xd3,0xfd,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x14,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x14,0xd3,0x01,0x01,0x00,0x00]
 
-v_cvt_f32_f16 v0, s0
-// CHECK: [0x00,0x16,0x00,0x7e]
+v_cvt_f16_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x14,0xd3,0xff,0x01,0x00,0x00]
 
-v_cvt_f32_f16 v255, s0
-// CHECK: [0x00,0x16,0xfe,0x7f]
+v_cvt_f16_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x14,0xd3,0x01,0x00,0x00,0x20]
 
-v_cvt_f32_f16 v0, s103
-// CHECK: [0x67,0x16,0x00,0x7e]
+v_cvt_f16_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x14,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_f16 v0, flat_scratch_lo
-// CHECK: [0x68,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, s1
+// CHECK: [0x01,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, flat_scratch_hi
-// CHECK: [0x69,0x16,0x00,0x7e]
+v_cvt_f32_f16 v255, s1
+// CHECK: [0x01,0x16,0xfe,0x7f]
 
-v_cvt_f32_f16 v0, vcc_lo
-// CHECK: [0x6a,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, s103
+// CHECK: [0x67,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, vcc_hi
-// CHECK: [0x6b,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, flat_scratch_lo
+// CHECK: [0x68,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, tba_lo
-// CHECK: [0x6c,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, flat_scratch_hi
+// CHECK: [0x69,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, tba_hi
-// CHECK: [0x6d,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, vcc_lo
+// CHECK: [0x6a,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, tma_lo
-// CHECK: [0x6e,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, vcc_hi
+// CHECK: [0x6b,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, tma_hi
-// CHECK: [0x6f,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, tba_lo
+// CHECK: [0x6c,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, ttmp11
-// CHECK: [0x7b,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, tba_hi
+// CHECK: [0x6d,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, m0
-// CHECK: [0x7c,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, tma_lo
+// CHECK: [0x6e,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, exec_lo
-// CHECK: [0x7e,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, tma_hi
+// CHECK: [0x6f,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, exec_hi
-// CHECK: [0x7f,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, ttmp11
+// CHECK: [0x7b,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, 0
-// CHECK: [0x80,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, m0
+// CHECK: [0x7c,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, -1
-// CHECK: [0xc1,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, exec_lo
+// CHECK: [0x7e,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, 0.5
-// CHECK: [0xf0,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, exec_hi
+// CHECK: [0x7f,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, -4.0
-// CHECK: [0xf7,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, 0
+// CHECK: [0x80,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, 0xfe0b
-// CHECK: [0xff,0x16,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_cvt_f32_f16 v5, -1
+// CHECK: [0xc1,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, 0x3456
-// CHECK: [0xff,0x16,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_cvt_f32_f16 v5, v1
+// CHECK: [0x01,0x17,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, v0
-// CHECK: [0x00,0x17,0x00,0x7e]
+v_cvt_f32_f16 v5, v255
+// CHECK: [0xff,0x17,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, v255
-// CHECK: [0xff,0x17,0x00,0x7e]
+v_cvt_f32_f16_e64 v5, s1
+// CHECK: [0x05,0x00,0x16,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, s0
-// CHECK: [0x00,0x00,0x16,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v255, s1
+// CHECK: [0xff,0x00,0x16,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v255, s0
-// CHECK: [0xff,0x00,0x16,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, s103
+// CHECK: [0x05,0x00,0x16,0xd3,0x67,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, s103
-// CHECK: [0x00,0x00,0x16,0xd3,0x67,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x16,0xd3,0x68,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x16,0xd3,0x68,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x16,0xd3,0x69,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x16,0xd3,0x69,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x16,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x16,0xd3,0x6a,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x16,0xd3,0x6b,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x16,0xd3,0x6b,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x16,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x16,0xd3,0x6c,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x16,0xd3,0x6d,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x16,0xd3,0x6d,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x16,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x16,0xd3,0x6e,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x16,0xd3,0x6f,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x16,0xd3,0x6f,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x16,0xd3,0x7b,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x16,0xd3,0x7b,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, m0
+// CHECK: [0x05,0x00,0x16,0xd3,0x7c,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, m0
-// CHECK: [0x00,0x00,0x16,0xd3,0x7c,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x16,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x16,0xd3,0x7e,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x16,0xd3,0x7f,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x16,0xd3,0x7f,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, scc
+// CHECK: [0x05,0x00,0x16,0xd3,0xfd,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, 0
-// CHECK: [0x00,0x00,0x16,0xd3,0x80,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, v1
+// CHECK: [0x05,0x00,0x16,0xd3,0x01,0x01,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, -1
-// CHECK: [0x00,0x00,0x16,0xd3,0xc1,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, v255
+// CHECK: [0x05,0x00,0x16,0xd3,0xff,0x01,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x16,0xd3,0xf0,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, s1 clamp
+// CHECK: [0x05,0x08,0x16,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x16,0xd3,0xf7,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x16,0xd3,0x01,0x00,0x00,0x08]
 
-v_cvt_f32_f16_e64 v0, v0
-// CHECK: [0x00,0x00,0x16,0xd3,0x00,0x01,0x00,0x00]
+v_cvt_f32_f16_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x16,0xd3,0x01,0x00,0x00,0x10]
 
-v_cvt_f32_f16_e64 v0, v255
-// CHECK: [0x00,0x00,0x16,0xd3,0xff,0x01,0x00,0x00]
+v_cvt_f32_f16_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x16,0xd3,0x01,0x00,0x00,0x18]
 
-v_cvt_rpi_i32_f32 v0, s0
-// CHECK: [0x00,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, s1
+// CHECK: [0x01,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v255, s0
-// CHECK: [0x00,0x18,0xfe,0x7f]
+v_cvt_rpi_i32_f32 v255, s1
+// CHECK: [0x01,0x18,0xfe,0x7f]
 
-v_cvt_rpi_i32_f32 v0, s103
-// CHECK: [0x67,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, s103
+// CHECK: [0x67,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, vcc_lo
-// CHECK: [0x6a,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, vcc_lo
+// CHECK: [0x6a,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, vcc_hi
-// CHECK: [0x6b,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, vcc_hi
+// CHECK: [0x6b,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, tba_lo
-// CHECK: [0x6c,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, tba_lo
+// CHECK: [0x6c,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, tba_hi
-// CHECK: [0x6d,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, tba_hi
+// CHECK: [0x6d,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, tma_lo
-// CHECK: [0x6e,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, tma_lo
+// CHECK: [0x6e,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, tma_hi
-// CHECK: [0x6f,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, tma_hi
+// CHECK: [0x6f,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, ttmp11
-// CHECK: [0x7b,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, ttmp11
+// CHECK: [0x7b,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, m0
-// CHECK: [0x7c,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, m0
+// CHECK: [0x7c,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, exec_lo
-// CHECK: [0x7e,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, exec_lo
+// CHECK: [0x7e,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, exec_hi
-// CHECK: [0x7f,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, exec_hi
+// CHECK: [0x7f,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, 0
-// CHECK: [0x80,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, 0
+// CHECK: [0x80,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, -1
-// CHECK: [0xc1,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, -1
+// CHECK: [0xc1,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, 0.5
-// CHECK: [0xf0,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, 0.5
+// CHECK: [0xf0,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, -4.0
-// CHECK: [0xf7,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, -4.0
+// CHECK: [0xf7,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, 0xaf123456
-// CHECK: [0xff,0x18,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_rpi_i32_f32 v5, 0xaf123456
+// CHECK: [0xff,0x18,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_rpi_i32_f32 v0, 0x3f717273
-// CHECK: [0xff,0x18,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_rpi_i32_f32 v5, 0x3f717273
+// CHECK: [0xff,0x18,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_rpi_i32_f32 v0, v0
-// CHECK: [0x00,0x19,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, v1
+// CHECK: [0x01,0x19,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, v255
-// CHECK: [0xff,0x19,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, v255
+// CHECK: [0xff,0x19,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x18,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x18,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x18,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x18,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x18,0xd3,0x67,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x18,0xd3,0x67,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x18,0xd3,0x68,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x18,0xd3,0x68,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x18,0xd3,0x69,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x18,0xd3,0x69,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x18,0xd3,0x6a,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x18,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x18,0xd3,0x6b,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x18,0xd3,0x6b,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x18,0xd3,0x6c,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x18,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x18,0xd3,0x6d,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x18,0xd3,0x6d,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x18,0xd3,0x6e,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x18,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x18,0xd3,0x6f,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x18,0xd3,0x6f,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x18,0xd3,0x7b,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x18,0xd3,0x7b,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x18,0xd3,0x7c,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x18,0xd3,0x7c,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x18,0xd3,0x7e,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x18,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x18,0xd3,0x7f,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x18,0xd3,0x7f,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x18,0xd3,0xfd,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x18,0xd3,0xfd,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x18,0xd3,0x00,0x01,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x18,0xd3,0x01,0x01,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x18,0xd3,0xff,0x01,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x18,0xd3,0xff,0x01,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x18,0xd3,0x00,0x00,0x00,0x20]
+v_cvt_rpi_i32_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x18,0xd3,0x01,0x00,0x00,0x20]
 
-v_cvt_rpi_i32_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x18,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x18,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32 v0, s0
-// CHECK: [0x00,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, s1
+// CHECK: [0x01,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v255, s0
-// CHECK: [0x00,0x1a,0xfe,0x7f]
+v_cvt_flr_i32_f32 v255, s1
+// CHECK: [0x01,0x1a,0xfe,0x7f]
 
-v_cvt_flr_i32_f32 v0, s103
-// CHECK: [0x67,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, s103
+// CHECK: [0x67,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, vcc_lo
-// CHECK: [0x6a,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, vcc_lo
+// CHECK: [0x6a,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, vcc_hi
-// CHECK: [0x6b,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, vcc_hi
+// CHECK: [0x6b,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, tba_lo
-// CHECK: [0x6c,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, tba_lo
+// CHECK: [0x6c,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, tba_hi
-// CHECK: [0x6d,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, tba_hi
+// CHECK: [0x6d,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, tma_lo
-// CHECK: [0x6e,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, tma_lo
+// CHECK: [0x6e,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, tma_hi
-// CHECK: [0x6f,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, tma_hi
+// CHECK: [0x6f,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, ttmp11
-// CHECK: [0x7b,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, ttmp11
+// CHECK: [0x7b,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, m0
-// CHECK: [0x7c,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, m0
+// CHECK: [0x7c,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, exec_lo
-// CHECK: [0x7e,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, exec_lo
+// CHECK: [0x7e,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, exec_hi
-// CHECK: [0x7f,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, exec_hi
+// CHECK: [0x7f,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, 0
-// CHECK: [0x80,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, 0
+// CHECK: [0x80,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, -1
-// CHECK: [0xc1,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, -1
+// CHECK: [0xc1,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, 0.5
-// CHECK: [0xf0,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, 0.5
+// CHECK: [0xf0,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, -4.0
-// CHECK: [0xf7,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, -4.0
+// CHECK: [0xf7,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, 0xaf123456
-// CHECK: [0xff,0x1a,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_flr_i32_f32 v5, 0xaf123456
+// CHECK: [0xff,0x1a,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_flr_i32_f32 v0, 0x3f717273
-// CHECK: [0xff,0x1a,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_flr_i32_f32 v5, 0x3f717273
+// CHECK: [0xff,0x1a,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_flr_i32_f32 v0, v0
-// CHECK: [0x00,0x1b,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, v1
+// CHECK: [0x01,0x1b,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, v255
-// CHECK: [0xff,0x1b,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, v255
+// CHECK: [0xff,0x1b,0x0a,0x7e]
 
-v_cvt_flr_i32_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x1a,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x1a,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x1a,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x1a,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x1a,0xd3,0x67,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x1a,0xd3,0x67,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x1a,0xd3,0x68,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x1a,0xd3,0x68,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x1a,0xd3,0x69,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x1a,0xd3,0x69,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x1a,0xd3,0x6a,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x1a,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x1a,0xd3,0x6b,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x1a,0xd3,0x6b,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x1a,0xd3,0x6c,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x1a,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x1a,0xd3,0x6d,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x1a,0xd3,0x6d,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x1a,0xd3,0x6e,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x1a,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x1a,0xd3,0x6f,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x1a,0xd3,0x6f,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x1a,0xd3,0x7b,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x1a,0xd3,0x7b,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x1a,0xd3,0x7c,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x1a,0xd3,0x7c,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x1a,0xd3,0x7e,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x1a,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x1a,0xd3,0x7f,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x1a,0xd3,0x7f,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x1a,0xd3,0xfd,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x1a,0xd3,0xfd,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x1a,0xd3,0x00,0x01,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x1a,0xd3,0x01,0x01,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x1a,0xd3,0xff,0x01,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x1a,0xd3,0xff,0x01,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x1a,0xd3,0x00,0x00,0x00,0x20]
+v_cvt_flr_i32_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x1a,0xd3,0x01,0x00,0x00,0x20]
 
-v_cvt_flr_i32_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x1a,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x1a,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4 v0, s0
-// CHECK: [0x00,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, s1
+// CHECK: [0x01,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v255, s0
-// CHECK: [0x00,0x1c,0xfe,0x7f]
+v_cvt_off_f32_i4 v255, s1
+// CHECK: [0x01,0x1c,0xfe,0x7f]
 
-v_cvt_off_f32_i4 v0, s103
-// CHECK: [0x67,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, s103
+// CHECK: [0x67,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, flat_scratch_lo
-// CHECK: [0x68,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, flat_scratch_lo
+// CHECK: [0x68,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, flat_scratch_hi
-// CHECK: [0x69,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, flat_scratch_hi
+// CHECK: [0x69,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, vcc_lo
-// CHECK: [0x6a,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, vcc_lo
+// CHECK: [0x6a,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, vcc_hi
-// CHECK: [0x6b,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, vcc_hi
+// CHECK: [0x6b,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, tba_lo
-// CHECK: [0x6c,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, tba_lo
+// CHECK: [0x6c,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, tba_hi
-// CHECK: [0x6d,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, tba_hi
+// CHECK: [0x6d,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, tma_lo
-// CHECK: [0x6e,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, tma_lo
+// CHECK: [0x6e,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, tma_hi
-// CHECK: [0x6f,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, tma_hi
+// CHECK: [0x6f,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, ttmp11
-// CHECK: [0x7b,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, ttmp11
+// CHECK: [0x7b,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, m0
-// CHECK: [0x7c,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, m0
+// CHECK: [0x7c,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, exec_lo
-// CHECK: [0x7e,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, exec_lo
+// CHECK: [0x7e,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, exec_hi
-// CHECK: [0x7f,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, exec_hi
+// CHECK: [0x7f,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, 0
-// CHECK: [0x80,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, 0
+// CHECK: [0x80,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, -1
-// CHECK: [0xc1,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, -1
+// CHECK: [0xc1,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, 0.5
-// CHECK: [0xf0,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, 0.5
+// CHECK: [0xf0,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, -4.0
-// CHECK: [0xf7,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, -4.0
+// CHECK: [0xf7,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, 0x4f
-// CHECK: [0xff,0x1c,0x00,0x7e,0x4f,0x00,0x00,0x00]
+v_cvt_off_f32_i4 v5, 0x4f
+// CHECK: [0xff,0x1c,0x0a,0x7e,0x4f,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4 v0, 0x41
-// CHECK: [0xff,0x1c,0x00,0x7e,0x41,0x00,0x00,0x00]
+v_cvt_off_f32_i4 v5, 0x41
+// CHECK: [0xff,0x1c,0x0a,0x7e,0x41,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4 v0, v0
-// CHECK: [0x00,0x1d,0x00,0x7e]
+v_cvt_off_f32_i4 v5, v1
+// CHECK: [0x01,0x1d,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, v255
-// CHECK: [0xff,0x1d,0x00,0x7e]
+v_cvt_off_f32_i4 v5, v255
+// CHECK: [0xff,0x1d,0x0a,0x7e]
 
-v_cvt_off_f32_i4_e64 v0, s0
-// CHECK: [0x00,0x00,0x1c,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, s1
+// CHECK: [0x05,0x00,0x1c,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v255, s0
-// CHECK: [0xff,0x00,0x1c,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v255, s1
+// CHECK: [0xff,0x00,0x1c,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, s103
-// CHECK: [0x00,0x00,0x1c,0xd3,0x67,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, s103
+// CHECK: [0x05,0x00,0x1c,0xd3,0x67,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x1c,0xd3,0x68,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x1c,0xd3,0x68,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x1c,0xd3,0x69,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x1c,0xd3,0x69,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x1c,0xd3,0x6a,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x1c,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x1c,0xd3,0x6b,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x1c,0xd3,0x6b,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x1c,0xd3,0x6c,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x1c,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x1c,0xd3,0x6d,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x1c,0xd3,0x6d,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x1c,0xd3,0x6e,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x1c,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x1c,0xd3,0x6f,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x1c,0xd3,0x6f,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x1c,0xd3,0x7b,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x1c,0xd3,0x7b,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, m0
-// CHECK: [0x00,0x00,0x1c,0xd3,0x7c,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, m0
+// CHECK: [0x05,0x00,0x1c,0xd3,0x7c,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x1c,0xd3,0x7e,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x1c,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x1c,0xd3,0x7f,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x1c,0xd3,0x7f,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, 0
-// CHECK: [0x00,0x00,0x1c,0xd3,0x80,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, 0
+// CHECK: [0x05,0x00,0x1c,0xd3,0x80,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, -1
-// CHECK: [0x00,0x00,0x1c,0xd3,0xc1,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, -1
+// CHECK: [0x05,0x00,0x1c,0xd3,0xc1,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x1c,0xd3,0xf0,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x1c,0xd3,0xf0,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x1c,0xd3,0xf7,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x1c,0xd3,0xf7,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, v0
-// CHECK: [0x00,0x00,0x1c,0xd3,0x00,0x01,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, v1
+// CHECK: [0x05,0x00,0x1c,0xd3,0x01,0x01,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, v255
-// CHECK: [0x00,0x00,0x1c,0xd3,0xff,0x01,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, v255
+// CHECK: [0x05,0x00,0x1c,0xd3,0xff,0x01,0x00,0x00]
 
-v_cvt_f32_f64 v0, s[0:1]
-// CHECK: [0x00,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, s[2:3]
+// CHECK: [0x02,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v255, s[0:1]
-// CHECK: [0x00,0x1e,0xfe,0x7f]
+v_cvt_f32_f64 v255, s[2:3]
+// CHECK: [0x02,0x1e,0xfe,0x7f]
 
-v_cvt_f32_f64 v0, s[2:3]
-// CHECK: [0x02,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, s[4:5]
+// CHECK: [0x04,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, s[102:103]
-// CHECK: [0x66,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, s[102:103]
+// CHECK: [0x66,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, flat_scratch
-// CHECK: [0x68,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, flat_scratch
+// CHECK: [0x68,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, vcc
-// CHECK: [0x6a,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, vcc
+// CHECK: [0x6a,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, tba
-// CHECK: [0x6c,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, tba
+// CHECK: [0x6c,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, tma
-// CHECK: [0x6e,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, tma
+// CHECK: [0x6e,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, ttmp[10:11]
-// CHECK: [0x7a,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, ttmp[10:11]
+// CHECK: [0x7a,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, exec
-// CHECK: [0x7e,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, exec
+// CHECK: [0x7e,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, 0
-// CHECK: [0x80,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, 0
+// CHECK: [0x80,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, -1
-// CHECK: [0xc1,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, -1
+// CHECK: [0xc1,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, 0.5
-// CHECK: [0xf0,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, 0.5
+// CHECK: [0xf0,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, -4.0
-// CHECK: [0xf7,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, -4.0
+// CHECK: [0xf7,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, 0xaf123456
-// CHECK: [0xff,0x1e,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f32_f64 v5, 0xaf123456
+// CHECK: [0xff,0x1e,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f32_f64 v0, 0x3f717273
-// CHECK: [0xff,0x1e,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f32_f64 v5, 0x3f717273
+// CHECK: [0xff,0x1e,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f32_f64 v0, v[0:1]
-// CHECK: [0x00,0x1f,0x00,0x7e]
+v_cvt_f32_f64 v5, v[1:2]
+// CHECK: [0x01,0x1f,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, v[254:255]
-// CHECK: [0xfe,0x1f,0x00,0x7e]
+v_cvt_f32_f64 v5, v[254:255]
+// CHECK: [0xfe,0x1f,0x0a,0x7e]
 
-v_cvt_f32_f64_e64 v0, s[0:1]
-// CHECK: [0x00,0x00,0x1e,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, s[2:3]
+// CHECK: [0x05,0x00,0x1e,0xd3,0x02,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v255, s[0:1]
-// CHECK: [0xff,0x00,0x1e,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v255, s[2:3]
+// CHECK: [0xff,0x00,0x1e,0xd3,0x02,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, s[2:3]
-// CHECK: [0x00,0x00,0x1e,0xd3,0x02,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, s[4:5]
+// CHECK: [0x05,0x00,0x1e,0xd3,0x04,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, s[102:103]
-// CHECK: [0x00,0x00,0x1e,0xd3,0x66,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, s[102:103]
+// CHECK: [0x05,0x00,0x1e,0xd3,0x66,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, flat_scratch
-// CHECK: [0x00,0x00,0x1e,0xd3,0x68,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, flat_scratch
+// CHECK: [0x05,0x00,0x1e,0xd3,0x68,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, vcc
-// CHECK: [0x00,0x00,0x1e,0xd3,0x6a,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, vcc
+// CHECK: [0x05,0x00,0x1e,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, tba
-// CHECK: [0x00,0x00,0x1e,0xd3,0x6c,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, tba
+// CHECK: [0x05,0x00,0x1e,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, tma
-// CHECK: [0x00,0x00,0x1e,0xd3,0x6e,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, tma
+// CHECK: [0x05,0x00,0x1e,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, ttmp[10:11]
-// CHECK: [0x00,0x00,0x1e,0xd3,0x7a,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, ttmp[10:11]
+// CHECK: [0x05,0x00,0x1e,0xd3,0x7a,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, exec
-// CHECK: [0x00,0x00,0x1e,0xd3,0x7e,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, exec
+// CHECK: [0x05,0x00,0x1e,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, scc
-// CHECK: [0x00,0x00,0x1e,0xd3,0xfd,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, scc
+// CHECK: [0x05,0x00,0x1e,0xd3,0xfd,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x1e,0xd3,0x00,0x01,0x00,0x00]
+v_cvt_f32_f64_e64 v5, v[1:2]
+// CHECK: [0x05,0x00,0x1e,0xd3,0x01,0x01,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, v[254:255]
-// CHECK: [0x00,0x00,0x1e,0xd3,0xfe,0x01,0x00,0x00]
+v_cvt_f32_f64_e64 v5, v[254:255]
+// CHECK: [0x05,0x00,0x1e,0xd3,0xfe,0x01,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, -s[0:1]
-// CHECK: [0x00,0x00,0x1e,0xd3,0x00,0x00,0x00,0x20]
+v_cvt_f32_f64_e64 v5, -s[2:3]
+// CHECK: [0x05,0x00,0x1e,0xd3,0x02,0x00,0x00,0x20]
 
-v_cvt_f32_f64_e64 v0, |s[0:1]|
-// CHECK: [0x00,0x01,0x1e,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, |s[2:3]|
+// CHECK: [0x05,0x01,0x1e,0xd3,0x02,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, s[0:1] clamp
-// CHECK: [0x00,0x08,0x1e,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, s[2:3] clamp
+// CHECK: [0x05,0x08,0x1e,0xd3,0x02,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, s[0:1] mul:2
-// CHECK: [0x00,0x00,0x1e,0xd3,0x00,0x00,0x00,0x08]
+v_cvt_f32_f64_e64 v5, s[2:3] mul:2
+// CHECK: [0x05,0x00,0x1e,0xd3,0x02,0x00,0x00,0x08]
 
-v_cvt_f32_f64_e64 v0, s[0:1] mul:4
-// CHECK: [0x00,0x00,0x1e,0xd3,0x00,0x00,0x00,0x10]
+v_cvt_f32_f64_e64 v5, s[2:3] mul:4
+// CHECK: [0x05,0x00,0x1e,0xd3,0x02,0x00,0x00,0x10]
 
-v_cvt_f32_f64_e64 v0, s[0:1] div:2
-// CHECK: [0x00,0x00,0x1e,0xd3,0x00,0x00,0x00,0x18]
+v_cvt_f32_f64_e64 v5, s[2:3] div:2
+// CHECK: [0x05,0x00,0x1e,0xd3,0x02,0x00,0x00,0x18]
 
-v_cvt_f64_f32 v[0:1], s0
-// CHECK: [0x00,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], s1
+// CHECK: [0x01,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[254:255], s0
-// CHECK: [0x00,0x20,0xfc,0x7f]
+v_cvt_f64_f32 v[254:255], s1
+// CHECK: [0x01,0x20,0xfc,0x7f]
 
-v_cvt_f64_f32 v[0:1], s103
-// CHECK: [0x67,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], s103
+// CHECK: [0x67,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], flat_scratch_lo
-// CHECK: [0x68,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], flat_scratch_lo
+// CHECK: [0x68,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], flat_scratch_hi
-// CHECK: [0x69,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], flat_scratch_hi
+// CHECK: [0x69,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], vcc_lo
-// CHECK: [0x6a,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], vcc_lo
+// CHECK: [0x6a,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], vcc_hi
-// CHECK: [0x6b,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], vcc_hi
+// CHECK: [0x6b,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], tba_lo
-// CHECK: [0x6c,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], tba_lo
+// CHECK: [0x6c,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], tba_hi
-// CHECK: [0x6d,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], tba_hi
+// CHECK: [0x6d,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], tma_lo
-// CHECK: [0x6e,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], tma_lo
+// CHECK: [0x6e,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], tma_hi
-// CHECK: [0x6f,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], tma_hi
+// CHECK: [0x6f,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], ttmp11
-// CHECK: [0x7b,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], ttmp11
+// CHECK: [0x7b,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], m0
-// CHECK: [0x7c,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], m0
+// CHECK: [0x7c,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], exec_lo
-// CHECK: [0x7e,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], exec_lo
+// CHECK: [0x7e,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], exec_hi
-// CHECK: [0x7f,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], exec_hi
+// CHECK: [0x7f,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], 0
-// CHECK: [0x80,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], 0
+// CHECK: [0x80,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], -1
-// CHECK: [0xc1,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], -1
+// CHECK: [0xc1,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], 0.5
-// CHECK: [0xf0,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], 0.5
+// CHECK: [0xf0,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], -4.0
-// CHECK: [0xf7,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], -4.0
+// CHECK: [0xf7,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], 0xaf123456
-// CHECK: [0xff,0x20,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f64_f32 v[5:6], 0xaf123456
+// CHECK: [0xff,0x20,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f64_f32 v[0:1], 0x3f717273
-// CHECK: [0xff,0x20,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f64_f32 v[5:6], 0x3f717273
+// CHECK: [0xff,0x20,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f64_f32 v[0:1], v0
-// CHECK: [0x00,0x21,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], v1
+// CHECK: [0x01,0x21,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], v255
-// CHECK: [0xff,0x21,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], v255
+// CHECK: [0xff,0x21,0x0a,0x7e]
 
-v_cvt_f64_f32_e64 v[0:1], s0
-// CHECK: [0x00,0x00,0x20,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], s1
+// CHECK: [0x05,0x00,0x20,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[254:255], s0
-// CHECK: [0xfe,0x00,0x20,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[254:255], s1
+// CHECK: [0xfe,0x00,0x20,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], s103
-// CHECK: [0x00,0x00,0x20,0xd3,0x67,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], s103
+// CHECK: [0x05,0x00,0x20,0xd3,0x67,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], flat_scratch_lo
-// CHECK: [0x00,0x00,0x20,0xd3,0x68,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], flat_scratch_lo
+// CHECK: [0x05,0x00,0x20,0xd3,0x68,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], flat_scratch_hi
-// CHECK: [0x00,0x00,0x20,0xd3,0x69,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], flat_scratch_hi
+// CHECK: [0x05,0x00,0x20,0xd3,0x69,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], vcc_lo
-// CHECK: [0x00,0x00,0x20,0xd3,0x6a,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], vcc_lo
+// CHECK: [0x05,0x00,0x20,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], vcc_hi
-// CHECK: [0x00,0x00,0x20,0xd3,0x6b,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], vcc_hi
+// CHECK: [0x05,0x00,0x20,0xd3,0x6b,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], tba_lo
-// CHECK: [0x00,0x00,0x20,0xd3,0x6c,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], tba_lo
+// CHECK: [0x05,0x00,0x20,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], tba_hi
-// CHECK: [0x00,0x00,0x20,0xd3,0x6d,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], tba_hi
+// CHECK: [0x05,0x00,0x20,0xd3,0x6d,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], tma_lo
-// CHECK: [0x00,0x00,0x20,0xd3,0x6e,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], tma_lo
+// CHECK: [0x05,0x00,0x20,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], tma_hi
-// CHECK: [0x00,0x00,0x20,0xd3,0x6f,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], tma_hi
+// CHECK: [0x05,0x00,0x20,0xd3,0x6f,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], ttmp11
-// CHECK: [0x00,0x00,0x20,0xd3,0x7b,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], ttmp11
+// CHECK: [0x05,0x00,0x20,0xd3,0x7b,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], m0
-// CHECK: [0x00,0x00,0x20,0xd3,0x7c,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], m0
+// CHECK: [0x05,0x00,0x20,0xd3,0x7c,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], exec_lo
-// CHECK: [0x00,0x00,0x20,0xd3,0x7e,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], exec_lo
+// CHECK: [0x05,0x00,0x20,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], exec_hi
-// CHECK: [0x00,0x00,0x20,0xd3,0x7f,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], exec_hi
+// CHECK: [0x05,0x00,0x20,0xd3,0x7f,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x20,0xd3,0xfd,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x20,0xd3,0xfd,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], v0
-// CHECK: [0x00,0x00,0x20,0xd3,0x00,0x01,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], v1
+// CHECK: [0x05,0x00,0x20,0xd3,0x01,0x01,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], v255
-// CHECK: [0x00,0x00,0x20,0xd3,0xff,0x01,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], v255
+// CHECK: [0x05,0x00,0x20,0xd3,0xff,0x01,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], -s0
-// CHECK: [0x00,0x00,0x20,0xd3,0x00,0x00,0x00,0x20]
+v_cvt_f64_f32_e64 v[5:6], -s1
+// CHECK: [0x05,0x00,0x20,0xd3,0x01,0x00,0x00,0x20]
 
-v_cvt_f64_f32_e64 v[0:1], |s0|
-// CHECK: [0x00,0x01,0x20,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], |s1|
+// CHECK: [0x05,0x01,0x20,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], s0 clamp
-// CHECK: [0x00,0x08,0x20,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], s1 clamp
+// CHECK: [0x05,0x08,0x20,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], s0 mul:2
-// CHECK: [0x00,0x00,0x20,0xd3,0x00,0x00,0x00,0x08]
+v_cvt_f64_f32_e64 v[5:6], s1 mul:2
+// CHECK: [0x05,0x00,0x20,0xd3,0x01,0x00,0x00,0x08]
 
-v_cvt_f64_f32_e64 v[0:1], s0 mul:4
-// CHECK: [0x00,0x00,0x20,0xd3,0x00,0x00,0x00,0x10]
+v_cvt_f64_f32_e64 v[5:6], s1 mul:4
+// CHECK: [0x05,0x00,0x20,0xd3,0x01,0x00,0x00,0x10]
 
-v_cvt_f64_f32_e64 v[0:1], s0 div:2
-// CHECK: [0x00,0x00,0x20,0xd3,0x00,0x00,0x00,0x18]
+v_cvt_f64_f32_e64 v[5:6], s1 div:2
+// CHECK: [0x05,0x00,0x20,0xd3,0x01,0x00,0x00,0x18]
 
-v_cvt_f32_ubyte0 v0, s0
-// CHECK: [0x00,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, s1
+// CHECK: [0x01,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v255, s0
-// CHECK: [0x00,0x22,0xfe,0x7f]
+v_cvt_f32_ubyte0 v255, s1
+// CHECK: [0x01,0x22,0xfe,0x7f]
 
-v_cvt_f32_ubyte0 v0, s103
-// CHECK: [0x67,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, s103
+// CHECK: [0x67,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, flat_scratch_lo
-// CHECK: [0x68,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, flat_scratch_lo
+// CHECK: [0x68,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, flat_scratch_hi
-// CHECK: [0x69,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, flat_scratch_hi
+// CHECK: [0x69,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, vcc_lo
-// CHECK: [0x6a,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, vcc_lo
+// CHECK: [0x6a,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, vcc_hi
-// CHECK: [0x6b,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, vcc_hi
+// CHECK: [0x6b,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, tba_lo
-// CHECK: [0x6c,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, tba_lo
+// CHECK: [0x6c,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, tba_hi
-// CHECK: [0x6d,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, tba_hi
+// CHECK: [0x6d,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, tma_lo
-// CHECK: [0x6e,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, tma_lo
+// CHECK: [0x6e,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, tma_hi
-// CHECK: [0x6f,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, tma_hi
+// CHECK: [0x6f,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, ttmp11
-// CHECK: [0x7b,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, ttmp11
+// CHECK: [0x7b,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, m0
-// CHECK: [0x7c,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, m0
+// CHECK: [0x7c,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, exec_lo
-// CHECK: [0x7e,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, exec_lo
+// CHECK: [0x7e,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, exec_hi
-// CHECK: [0x7f,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, exec_hi
+// CHECK: [0x7f,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, 0
-// CHECK: [0x80,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, 0
+// CHECK: [0x80,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, -1
-// CHECK: [0xc1,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, -1
+// CHECK: [0xc1,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, 0.5
-// CHECK: [0xf0,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, 0.5
+// CHECK: [0xf0,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, -4.0
-// CHECK: [0xf7,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, -4.0
+// CHECK: [0xf7,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, 0xaf123456
-// CHECK: [0xff,0x22,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f32_ubyte0 v5, 0xaf123456
+// CHECK: [0xff,0x22,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f32_ubyte0 v0, 0x3f717273
-// CHECK: [0xff,0x22,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f32_ubyte0 v5, 0x3f717273
+// CHECK: [0xff,0x22,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f32_ubyte0 v0, v0
-// CHECK: [0x00,0x23,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, v1
+// CHECK: [0x01,0x23,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, v255
-// CHECK: [0xff,0x23,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, v255
+// CHECK: [0xff,0x23,0x0a,0x7e]
 
-v_cvt_f32_ubyte0_e64 v0, s0
-// CHECK: [0x00,0x00,0x22,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, s1
+// CHECK: [0x05,0x00,0x22,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v255, s0
-// CHECK: [0xff,0x00,0x22,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v255, s1
+// CHECK: [0xff,0x00,0x22,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, s103
-// CHECK: [0x00,0x00,0x22,0xd3,0x67,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, s103
+// CHECK: [0x05,0x00,0x22,0xd3,0x67,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x22,0xd3,0x68,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x22,0xd3,0x68,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x22,0xd3,0x69,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x22,0xd3,0x69,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x22,0xd3,0x6a,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x22,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x22,0xd3,0x6b,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x22,0xd3,0x6b,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x22,0xd3,0x6c,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x22,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x22,0xd3,0x6d,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x22,0xd3,0x6d,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x22,0xd3,0x6e,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x22,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x22,0xd3,0x6f,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x22,0xd3,0x6f,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x22,0xd3,0x7b,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x22,0xd3,0x7b,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, m0
-// CHECK: [0x00,0x00,0x22,0xd3,0x7c,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, m0
+// CHECK: [0x05,0x00,0x22,0xd3,0x7c,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x22,0xd3,0x7e,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x22,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x22,0xd3,0x7f,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x22,0xd3,0x7f,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, 0
-// CHECK: [0x00,0x00,0x22,0xd3,0x80,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, 0
+// CHECK: [0x05,0x00,0x22,0xd3,0x80,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, -1
-// CHECK: [0x00,0x00,0x22,0xd3,0xc1,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, -1
+// CHECK: [0x05,0x00,0x22,0xd3,0xc1,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x22,0xd3,0xf0,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x22,0xd3,0xf0,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x22,0xd3,0xf7,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x22,0xd3,0xf7,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, v0
-// CHECK: [0x00,0x00,0x22,0xd3,0x00,0x01,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, v1
+// CHECK: [0x05,0x00,0x22,0xd3,0x01,0x01,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, v255
-// CHECK: [0x00,0x00,0x22,0xd3,0xff,0x01,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, v255
+// CHECK: [0x05,0x00,0x22,0xd3,0xff,0x01,0x00,0x00]
 
-v_cvt_f32_ubyte1 v0, s0
-// CHECK: [0x00,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, s1
+// CHECK: [0x01,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v255, s0
-// CHECK: [0x00,0x24,0xfe,0x7f]
+v_cvt_f32_ubyte1 v255, s1
+// CHECK: [0x01,0x24,0xfe,0x7f]
 
-v_cvt_f32_ubyte1 v0, s103
-// CHECK: [0x67,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, s103
+// CHECK: [0x67,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, flat_scratch_lo
-// CHECK: [0x68,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, flat_scratch_lo
+// CHECK: [0x68,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, flat_scratch_hi
-// CHECK: [0x69,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, flat_scratch_hi
+// CHECK: [0x69,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, vcc_lo
-// CHECK: [0x6a,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, vcc_lo
+// CHECK: [0x6a,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, vcc_hi
-// CHECK: [0x6b,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, vcc_hi
+// CHECK: [0x6b,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, tba_lo
-// CHECK: [0x6c,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, tba_lo
+// CHECK: [0x6c,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, tba_hi
-// CHECK: [0x6d,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, tba_hi
+// CHECK: [0x6d,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, tma_lo
-// CHECK: [0x6e,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, tma_lo
+// CHECK: [0x6e,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, tma_hi
-// CHECK: [0x6f,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, tma_hi
+// CHECK: [0x6f,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, ttmp11
-// CHECK: [0x7b,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, ttmp11
+// CHECK: [0x7b,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, m0
-// CHECK: [0x7c,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, m0
+// CHECK: [0x7c,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, exec_lo
-// CHECK: [0x7e,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, exec_lo
+// CHECK: [0x7e,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, exec_hi
-// CHECK: [0x7f,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, exec_hi
+// CHECK: [0x7f,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, 0
-// CHECK: [0x80,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, 0
+// CHECK: [0x80,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, -1
-// CHECK: [0xc1,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, -1
+// CHECK: [0xc1,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, 0.5
-// CHECK: [0xf0,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, 0.5
+// CHECK: [0xf0,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, -4.0
-// CHECK: [0xf7,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, -4.0
+// CHECK: [0xf7,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, 0xaf123456
-// CHECK: [0xff,0x24,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f32_ubyte1 v5, 0xaf123456
+// CHECK: [0xff,0x24,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f32_ubyte1 v0, 0x3f717273
-// CHECK: [0xff,0x24,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f32_ubyte1 v5, 0x3f717273
+// CHECK: [0xff,0x24,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f32_ubyte1 v0, v0
-// CHECK: [0x00,0x25,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, v1
+// CHECK: [0x01,0x25,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, v255
-// CHECK: [0xff,0x25,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, v255
+// CHECK: [0xff,0x25,0x0a,0x7e]
 
-v_cvt_f32_ubyte1_e64 v0, s0
-// CHECK: [0x00,0x00,0x24,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, s1
+// CHECK: [0x05,0x00,0x24,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v255, s0
-// CHECK: [0xff,0x00,0x24,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v255, s1
+// CHECK: [0xff,0x00,0x24,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, s103
-// CHECK: [0x00,0x00,0x24,0xd3,0x67,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, s103
+// CHECK: [0x05,0x00,0x24,0xd3,0x67,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x24,0xd3,0x68,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x24,0xd3,0x68,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x24,0xd3,0x69,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x24,0xd3,0x69,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x24,0xd3,0x6a,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x24,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x24,0xd3,0x6b,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x24,0xd3,0x6b,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x24,0xd3,0x6c,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x24,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x24,0xd3,0x6d,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x24,0xd3,0x6d,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x24,0xd3,0x6e,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x24,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x24,0xd3,0x6f,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x24,0xd3,0x6f,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x24,0xd3,0x7b,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x24,0xd3,0x7b,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, m0
-// CHECK: [0x00,0x00,0x24,0xd3,0x7c,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, m0
+// CHECK: [0x05,0x00,0x24,0xd3,0x7c,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x24,0xd3,0x7e,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x24,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x24,0xd3,0x7f,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x24,0xd3,0x7f,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, 0
-// CHECK: [0x00,0x00,0x24,0xd3,0x80,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, 0
+// CHECK: [0x05,0x00,0x24,0xd3,0x80,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, -1
-// CHECK: [0x00,0x00,0x24,0xd3,0xc1,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, -1
+// CHECK: [0x05,0x00,0x24,0xd3,0xc1,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x24,0xd3,0xf0,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x24,0xd3,0xf0,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x24,0xd3,0xf7,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x24,0xd3,0xf7,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, v0
-// CHECK: [0x00,0x00,0x24,0xd3,0x00,0x01,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, v1
+// CHECK: [0x05,0x00,0x24,0xd3,0x01,0x01,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, v255
-// CHECK: [0x00,0x00,0x24,0xd3,0xff,0x01,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, v255
+// CHECK: [0x05,0x00,0x24,0xd3,0xff,0x01,0x00,0x00]
 
-v_cvt_f32_ubyte2 v0, s0
-// CHECK: [0x00,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, s1
+// CHECK: [0x01,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v255, s0
-// CHECK: [0x00,0x26,0xfe,0x7f]
+v_cvt_f32_ubyte2 v255, s1
+// CHECK: [0x01,0x26,0xfe,0x7f]
 
-v_cvt_f32_ubyte2 v0, s103
-// CHECK: [0x67,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, s103
+// CHECK: [0x67,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, flat_scratch_lo
-// CHECK: [0x68,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, flat_scratch_lo
+// CHECK: [0x68,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, flat_scratch_hi
-// CHECK: [0x69,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, flat_scratch_hi
+// CHECK: [0x69,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, vcc_lo
-// CHECK: [0x6a,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, vcc_lo
+// CHECK: [0x6a,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, vcc_hi
-// CHECK: [0x6b,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, vcc_hi
+// CHECK: [0x6b,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, tba_lo
-// CHECK: [0x6c,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, tba_lo
+// CHECK: [0x6c,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, tba_hi
-// CHECK: [0x6d,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, tba_hi
+// CHECK: [0x6d,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, tma_lo
-// CHECK: [0x6e,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, tma_lo
+// CHECK: [0x6e,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, tma_hi
-// CHECK: [0x6f,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, tma_hi
+// CHECK: [0x6f,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, ttmp11
-// CHECK: [0x7b,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, ttmp11
+// CHECK: [0x7b,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, m0
-// CHECK: [0x7c,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, m0
+// CHECK: [0x7c,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, exec_lo
-// CHECK: [0x7e,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, exec_lo
+// CHECK: [0x7e,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, exec_hi
-// CHECK: [0x7f,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, exec_hi
+// CHECK: [0x7f,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, 0
-// CHECK: [0x80,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, 0
+// CHECK: [0x80,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, -1
-// CHECK: [0xc1,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, -1
+// CHECK: [0xc1,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, 0.5
-// CHECK: [0xf0,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, 0.5
+// CHECK: [0xf0,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, -4.0
-// CHECK: [0xf7,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, -4.0
+// CHECK: [0xf7,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, 0xaf123456
-// CHECK: [0xff,0x26,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f32_ubyte2 v5, 0xaf123456
+// CHECK: [0xff,0x26,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f32_ubyte2 v0, 0x3f717273
-// CHECK: [0xff,0x26,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f32_ubyte2 v5, 0x3f717273
+// CHECK: [0xff,0x26,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f32_ubyte2 v0, v0
-// CHECK: [0x00,0x27,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, v1
+// CHECK: [0x01,0x27,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, v255
-// CHECK: [0xff,0x27,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, v255
+// CHECK: [0xff,0x27,0x0a,0x7e]
 
-v_cvt_f32_ubyte2_e64 v0, s0
-// CHECK: [0x00,0x00,0x26,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, s1
+// CHECK: [0x05,0x00,0x26,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v255, s0
-// CHECK: [0xff,0x00,0x26,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v255, s1
+// CHECK: [0xff,0x00,0x26,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, s103
-// CHECK: [0x00,0x00,0x26,0xd3,0x67,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, s103
+// CHECK: [0x05,0x00,0x26,0xd3,0x67,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x26,0xd3,0x68,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x26,0xd3,0x68,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x26,0xd3,0x69,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x26,0xd3,0x69,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x26,0xd3,0x6a,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x26,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x26,0xd3,0x6b,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x26,0xd3,0x6b,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x26,0xd3,0x6c,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x26,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x26,0xd3,0x6d,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x26,0xd3,0x6d,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x26,0xd3,0x6e,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x26,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x26,0xd3,0x6f,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x26,0xd3,0x6f,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x26,0xd3,0x7b,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x26,0xd3,0x7b,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, m0
-// CHECK: [0x00,0x00,0x26,0xd3,0x7c,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, m0
+// CHECK: [0x05,0x00,0x26,0xd3,0x7c,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x26,0xd3,0x7e,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x26,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x26,0xd3,0x7f,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x26,0xd3,0x7f,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, 0
-// CHECK: [0x00,0x00,0x26,0xd3,0x80,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, 0
+// CHECK: [0x05,0x00,0x26,0xd3,0x80,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, -1
-// CHECK: [0x00,0x00,0x26,0xd3,0xc1,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, -1
+// CHECK: [0x05,0x00,0x26,0xd3,0xc1,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x26,0xd3,0xf0,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x26,0xd3,0xf0,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x26,0xd3,0xf7,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x26,0xd3,0xf7,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, v0
-// CHECK: [0x00,0x00,0x26,0xd3,0x00,0x01,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, v1
+// CHECK: [0x05,0x00,0x26,0xd3,0x01,0x01,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, v255
-// CHECK: [0x00,0x00,0x26,0xd3,0xff,0x01,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, v255
+// CHECK: [0x05,0x00,0x26,0xd3,0xff,0x01,0x00,0x00]
 
-v_cvt_f32_ubyte3 v0, s0
-// CHECK: [0x00,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, s1
+// CHECK: [0x01,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v255, s0
-// CHECK: [0x00,0x28,0xfe,0x7f]
+v_cvt_f32_ubyte3 v255, s1
+// CHECK: [0x01,0x28,0xfe,0x7f]
 
-v_cvt_f32_ubyte3 v0, s103
-// CHECK: [0x67,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, s103
+// CHECK: [0x67,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, flat_scratch_lo
-// CHECK: [0x68,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, flat_scratch_lo
+// CHECK: [0x68,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, flat_scratch_hi
-// CHECK: [0x69,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, flat_scratch_hi
+// CHECK: [0x69,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, vcc_lo
-// CHECK: [0x6a,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, vcc_lo
+// CHECK: [0x6a,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, vcc_hi
-// CHECK: [0x6b,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, vcc_hi
+// CHECK: [0x6b,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, tba_lo
-// CHECK: [0x6c,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, tba_lo
+// CHECK: [0x6c,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, tba_hi
-// CHECK: [0x6d,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, tba_hi
+// CHECK: [0x6d,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, tma_lo
-// CHECK: [0x6e,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, tma_lo
+// CHECK: [0x6e,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, tma_hi
-// CHECK: [0x6f,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, tma_hi
+// CHECK: [0x6f,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, ttmp11
-// CHECK: [0x7b,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, ttmp11
+// CHECK: [0x7b,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, m0
-// CHECK: [0x7c,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, m0
+// CHECK: [0x7c,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, exec_lo
-// CHECK: [0x7e,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, exec_lo
+// CHECK: [0x7e,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, exec_hi
-// CHECK: [0x7f,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, exec_hi
+// CHECK: [0x7f,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, 0
-// CHECK: [0x80,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, 0
+// CHECK: [0x80,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, -1
-// CHECK: [0xc1,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, -1
+// CHECK: [0xc1,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, 0.5
-// CHECK: [0xf0,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, 0.5
+// CHECK: [0xf0,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, -4.0
-// CHECK: [0xf7,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, -4.0
+// CHECK: [0xf7,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, 0xaf123456
-// CHECK: [0xff,0x28,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f32_ubyte3 v5, 0xaf123456
+// CHECK: [0xff,0x28,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f32_ubyte3 v0, 0x3f717273
-// CHECK: [0xff,0x28,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f32_ubyte3 v5, 0x3f717273
+// CHECK: [0xff,0x28,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f32_ubyte3 v0, v0
-// CHECK: [0x00,0x29,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, v1
+// CHECK: [0x01,0x29,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, v255
-// CHECK: [0xff,0x29,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, v255
+// CHECK: [0xff,0x29,0x0a,0x7e]
 
-v_cvt_f32_ubyte3_e64 v0, s0
-// CHECK: [0x00,0x00,0x28,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, s1
+// CHECK: [0x05,0x00,0x28,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v255, s0
-// CHECK: [0xff,0x00,0x28,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v255, s1
+// CHECK: [0xff,0x00,0x28,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, s103
-// CHECK: [0x00,0x00,0x28,0xd3,0x67,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, s103
+// CHECK: [0x05,0x00,0x28,0xd3,0x67,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x28,0xd3,0x68,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x28,0xd3,0x68,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x28,0xd3,0x69,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x28,0xd3,0x69,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x28,0xd3,0x6a,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x28,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x28,0xd3,0x6b,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x28,0xd3,0x6b,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x28,0xd3,0x6c,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x28,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x28,0xd3,0x6d,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x28,0xd3,0x6d,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x28,0xd3,0x6e,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x28,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x28,0xd3,0x6f,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x28,0xd3,0x6f,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x28,0xd3,0x7b,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x28,0xd3,0x7b,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, m0
-// CHECK: [0x00,0x00,0x28,0xd3,0x7c,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, m0
+// CHECK: [0x05,0x00,0x28,0xd3,0x7c,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x28,0xd3,0x7e,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x28,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x28,0xd3,0x7f,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x28,0xd3,0x7f,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, 0
-// CHECK: [0x00,0x00,0x28,0xd3,0x80,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, 0
+// CHECK: [0x05,0x00,0x28,0xd3,0x80,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, -1
-// CHECK: [0x00,0x00,0x28,0xd3,0xc1,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, -1
+// CHECK: [0x05,0x00,0x28,0xd3,0xc1,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x28,0xd3,0xf0,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x28,0xd3,0xf0,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x28,0xd3,0xf7,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x28,0xd3,0xf7,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, v0
-// CHECK: [0x00,0x00,0x28,0xd3,0x00,0x01,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, v1
+// CHECK: [0x05,0x00,0x28,0xd3,0x01,0x01,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, v255
-// CHECK: [0x00,0x00,0x28,0xd3,0xff,0x01,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, v255
+// CHECK: [0x05,0x00,0x28,0xd3,0xff,0x01,0x00,0x00]
 
-v_cvt_u32_f64 v0, s[0:1]
-// CHECK: [0x00,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, s[2:3]
+// CHECK: [0x02,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v255, s[0:1]
-// CHECK: [0x00,0x2a,0xfe,0x7f]
+v_cvt_u32_f64 v255, s[2:3]
+// CHECK: [0x02,0x2a,0xfe,0x7f]
 
-v_cvt_u32_f64 v0, s[2:3]
-// CHECK: [0x02,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, s[4:5]
+// CHECK: [0x04,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, s[102:103]
-// CHECK: [0x66,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, s[102:103]
+// CHECK: [0x66,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, flat_scratch
-// CHECK: [0x68,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, flat_scratch
+// CHECK: [0x68,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, vcc
-// CHECK: [0x6a,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, vcc
+// CHECK: [0x6a,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, tba
-// CHECK: [0x6c,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, tba
+// CHECK: [0x6c,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, tma
-// CHECK: [0x6e,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, tma
+// CHECK: [0x6e,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, ttmp[10:11]
-// CHECK: [0x7a,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, ttmp[10:11]
+// CHECK: [0x7a,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, exec
-// CHECK: [0x7e,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, exec
+// CHECK: [0x7e,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, 0
-// CHECK: [0x80,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, 0
+// CHECK: [0x80,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, -1
-// CHECK: [0xc1,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, -1
+// CHECK: [0xc1,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, 0.5
-// CHECK: [0xf0,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, 0.5
+// CHECK: [0xf0,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, -4.0
-// CHECK: [0xf7,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, -4.0
+// CHECK: [0xf7,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, 0xaf123456
-// CHECK: [0xff,0x2a,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_u32_f64 v5, 0xaf123456
+// CHECK: [0xff,0x2a,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_u32_f64 v0, 0x3f717273
-// CHECK: [0xff,0x2a,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_u32_f64 v5, 0x3f717273
+// CHECK: [0xff,0x2a,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_u32_f64 v0, v[0:1]
-// CHECK: [0x00,0x2b,0x00,0x7e]
+v_cvt_u32_f64 v5, v[1:2]
+// CHECK: [0x01,0x2b,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, v[254:255]
-// CHECK: [0xfe,0x2b,0x00,0x7e]
+v_cvt_u32_f64 v5, v[254:255]
+// CHECK: [0xfe,0x2b,0x0a,0x7e]
 
-v_cvt_u32_f64_e64 v0, s[0:1]
-// CHECK: [0x00,0x00,0x2a,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, s[2:3]
+// CHECK: [0x05,0x00,0x2a,0xd3,0x02,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v255, s[0:1]
-// CHECK: [0xff,0x00,0x2a,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v255, s[2:3]
+// CHECK: [0xff,0x00,0x2a,0xd3,0x02,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, s[2:3]
-// CHECK: [0x00,0x00,0x2a,0xd3,0x02,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, s[4:5]
+// CHECK: [0x05,0x00,0x2a,0xd3,0x04,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, s[102:103]
-// CHECK: [0x00,0x00,0x2a,0xd3,0x66,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, s[102:103]
+// CHECK: [0x05,0x00,0x2a,0xd3,0x66,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, flat_scratch
-// CHECK: [0x00,0x00,0x2a,0xd3,0x68,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, flat_scratch
+// CHECK: [0x05,0x00,0x2a,0xd3,0x68,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, vcc
-// CHECK: [0x00,0x00,0x2a,0xd3,0x6a,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, vcc
+// CHECK: [0x05,0x00,0x2a,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, tba
-// CHECK: [0x00,0x00,0x2a,0xd3,0x6c,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, tba
+// CHECK: [0x05,0x00,0x2a,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, tma
-// CHECK: [0x00,0x00,0x2a,0xd3,0x6e,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, tma
+// CHECK: [0x05,0x00,0x2a,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, ttmp[10:11]
-// CHECK: [0x00,0x00,0x2a,0xd3,0x7a,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, ttmp[10:11]
+// CHECK: [0x05,0x00,0x2a,0xd3,0x7a,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, exec
-// CHECK: [0x00,0x00,0x2a,0xd3,0x7e,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, exec
+// CHECK: [0x05,0x00,0x2a,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, scc
-// CHECK: [0x00,0x00,0x2a,0xd3,0xfd,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, scc
+// CHECK: [0x05,0x00,0x2a,0xd3,0xfd,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x2a,0xd3,0x00,0x01,0x00,0x00]
+v_cvt_u32_f64_e64 v5, v[1:2]
+// CHECK: [0x05,0x00,0x2a,0xd3,0x01,0x01,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, v[254:255]
-// CHECK: [0x00,0x00,0x2a,0xd3,0xfe,0x01,0x00,0x00]
+v_cvt_u32_f64_e64 v5, v[254:255]
+// CHECK: [0x05,0x00,0x2a,0xd3,0xfe,0x01,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, -s[0:1]
-// CHECK: [0x00,0x00,0x2a,0xd3,0x00,0x00,0x00,0x20]
+v_cvt_u32_f64_e64 v5, -s[2:3]
+// CHECK: [0x05,0x00,0x2a,0xd3,0x02,0x00,0x00,0x20]
 
-v_cvt_u32_f64_e64 v0, |s[0:1]|
-// CHECK: [0x00,0x01,0x2a,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, |s[2:3]|
+// CHECK: [0x05,0x01,0x2a,0xd3,0x02,0x00,0x00,0x00]
 
-v_cvt_f64_u32 v[0:1], s0
-// CHECK: [0x00,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], s1
+// CHECK: [0x01,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[254:255], s0
-// CHECK: [0x00,0x2c,0xfc,0x7f]
+v_cvt_f64_u32 v[254:255], s1
+// CHECK: [0x01,0x2c,0xfc,0x7f]
 
-v_cvt_f64_u32 v[0:1], s103
-// CHECK: [0x67,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], s103
+// CHECK: [0x67,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], flat_scratch_lo
-// CHECK: [0x68,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], flat_scratch_lo
+// CHECK: [0x68,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], flat_scratch_hi
-// CHECK: [0x69,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], flat_scratch_hi
+// CHECK: [0x69,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], vcc_lo
-// CHECK: [0x6a,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], vcc_lo
+// CHECK: [0x6a,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], vcc_hi
-// CHECK: [0x6b,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], vcc_hi
+// CHECK: [0x6b,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], tba_lo
-// CHECK: [0x6c,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], tba_lo
+// CHECK: [0x6c,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], tba_hi
-// CHECK: [0x6d,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], tba_hi
+// CHECK: [0x6d,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], tma_lo
-// CHECK: [0x6e,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], tma_lo
+// CHECK: [0x6e,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], tma_hi
-// CHECK: [0x6f,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], tma_hi
+// CHECK: [0x6f,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], ttmp11
-// CHECK: [0x7b,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], ttmp11
+// CHECK: [0x7b,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], m0
-// CHECK: [0x7c,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], m0
+// CHECK: [0x7c,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], exec_lo
-// CHECK: [0x7e,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], exec_lo
+// CHECK: [0x7e,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], exec_hi
-// CHECK: [0x7f,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], exec_hi
+// CHECK: [0x7f,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], 0
-// CHECK: [0x80,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], 0
+// CHECK: [0x80,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], -1
-// CHECK: [0xc1,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], -1
+// CHECK: [0xc1,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], 0.5
-// CHECK: [0xf0,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], 0.5
+// CHECK: [0xf0,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], -4.0
-// CHECK: [0xf7,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], -4.0
+// CHECK: [0xf7,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], 0xaf123456
-// CHECK: [0xff,0x2c,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f64_u32 v[5:6], 0xaf123456
+// CHECK: [0xff,0x2c,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f64_u32 v[0:1], 0x3f717273
-// CHECK: [0xff,0x2c,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f64_u32 v[5:6], 0x3f717273
+// CHECK: [0xff,0x2c,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f64_u32 v[0:1], v0
-// CHECK: [0x00,0x2d,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], v1
+// CHECK: [0x01,0x2d,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], v255
-// CHECK: [0xff,0x2d,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], v255
+// CHECK: [0xff,0x2d,0x0a,0x7e]
 
-v_cvt_f64_u32_e64 v[0:1], s0
-// CHECK: [0x00,0x00,0x2c,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], s1
+// CHECK: [0x05,0x00,0x2c,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[254:255], s0
-// CHECK: [0xfe,0x00,0x2c,0xd3,0x00,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[254:255], s1
+// CHECK: [0xfe,0x00,0x2c,0xd3,0x01,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], s103
-// CHECK: [0x00,0x00,0x2c,0xd3,0x67,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], s103
+// CHECK: [0x05,0x00,0x2c,0xd3,0x67,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], flat_scratch_lo
-// CHECK: [0x00,0x00,0x2c,0xd3,0x68,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], flat_scratch_lo
+// CHECK: [0x05,0x00,0x2c,0xd3,0x68,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], flat_scratch_hi
-// CHECK: [0x00,0x00,0x2c,0xd3,0x69,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], flat_scratch_hi
+// CHECK: [0x05,0x00,0x2c,0xd3,0x69,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], vcc_lo
-// CHECK: [0x00,0x00,0x2c,0xd3,0x6a,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], vcc_lo
+// CHECK: [0x05,0x00,0x2c,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], vcc_hi
-// CHECK: [0x00,0x00,0x2c,0xd3,0x6b,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], vcc_hi
+// CHECK: [0x05,0x00,0x2c,0xd3,0x6b,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], tba_lo
-// CHECK: [0x00,0x00,0x2c,0xd3,0x6c,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], tba_lo
+// CHECK: [0x05,0x00,0x2c,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], tba_hi
-// CHECK: [0x00,0x00,0x2c,0xd3,0x6d,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], tba_hi
+// CHECK: [0x05,0x00,0x2c,0xd3,0x6d,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], tma_lo
-// CHECK: [0x00,0x00,0x2c,0xd3,0x6e,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], tma_lo
+// CHECK: [0x05,0x00,0x2c,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], tma_hi
-// CHECK: [0x00,0x00,0x2c,0xd3,0x6f,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], tma_hi
+// CHECK: [0x05,0x00,0x2c,0xd3,0x6f,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], ttmp11
-// CHECK: [0x00,0x00,0x2c,0xd3,0x7b,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], ttmp11
+// CHECK: [0x05,0x00,0x2c,0xd3,0x7b,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], m0
-// CHECK: [0x00,0x00,0x2c,0xd3,0x7c,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], m0
+// CHECK: [0x05,0x00,0x2c,0xd3,0x7c,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], exec_lo
-// CHECK: [0x00,0x00,0x2c,0xd3,0x7e,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], exec_lo
+// CHECK: [0x05,0x00,0x2c,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], exec_hi
-// CHECK: [0x00,0x00,0x2c,0xd3,0x7f,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], exec_hi
+// CHECK: [0x05,0x00,0x2c,0xd3,0x7f,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], 0
-// CHECK: [0x00,0x00,0x2c,0xd3,0x80,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], 0
+// CHECK: [0x05,0x00,0x2c,0xd3,0x80,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], -1
-// CHECK: [0x00,0x00,0x2c,0xd3,0xc1,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], -1
+// CHECK: [0x05,0x00,0x2c,0xd3,0xc1,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], 0.5
-// CHECK: [0x00,0x00,0x2c,0xd3,0xf0,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], 0.5
+// CHECK: [0x05,0x00,0x2c,0xd3,0xf0,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], -4.0
-// CHECK: [0x00,0x00,0x2c,0xd3,0xf7,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], -4.0
+// CHECK: [0x05,0x00,0x2c,0xd3,0xf7,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], v0
-// CHECK: [0x00,0x00,0x2c,0xd3,0x00,0x01,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], v1
+// CHECK: [0x05,0x00,0x2c,0xd3,0x01,0x01,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], v255
-// CHECK: [0x00,0x00,0x2c,0xd3,0xff,0x01,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], v255
+// CHECK: [0x05,0x00,0x2c,0xd3,0xff,0x01,0x00,0x00]
 
-v_trunc_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x2e,0xfc,0x7f]
+v_trunc_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x2e,0xfc,0x7f]
 
-v_trunc_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], s[102:103]
-// CHECK: [0x66,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], s[102:103]
+// CHECK: [0x66,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], flat_scratch
-// CHECK: [0x68,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], flat_scratch
+// CHECK: [0x68,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], vcc
-// CHECK: [0x6a,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], vcc
+// CHECK: [0x6a,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], tba
-// CHECK: [0x6c,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], tba
+// CHECK: [0x6c,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], tma
-// CHECK: [0x6e,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], tma
+// CHECK: [0x6e,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], exec
-// CHECK: [0x7e,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], exec
+// CHECK: [0x7e,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], 0
-// CHECK: [0x80,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], 0
+// CHECK: [0x80,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], -1
-// CHECK: [0xc1,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], -1
+// CHECK: [0xc1,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x2e,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_trunc_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x2e,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_trunc_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x2e,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_trunc_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x2e,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_trunc_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x2f,0x00,0x7e]
+v_trunc_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x2f,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x2f,0x00,0x7e]
+v_trunc_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x2f,0x0a,0x7e]
 
-v_trunc_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x2e,0xd3,0x00,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x2e,0xd3,0x02,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x2e,0xd3,0x00,0x00,0x00,0x00]
+v_trunc_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x2e,0xd3,0x02,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x2e,0xd3,0x02,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x2e,0xd3,0x04,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], s[102:103]
-// CHECK: [0x00,0x00,0x2e,0xd3,0x66,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], s[102:103]
+// CHECK: [0x05,0x00,0x2e,0xd3,0x66,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x2e,0xd3,0x68,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x2e,0xd3,0x68,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x2e,0xd3,0x6a,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x2e,0xd3,0x6a,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x2e,0xd3,0x6c,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x2e,0xd3,0x6c,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x2e,0xd3,0x6e,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x2e,0xd3,0x6e,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x2e,0xd3,0x7a,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x2e,0xd3,0x7a,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x2e,0xd3,0x7e,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x2e,0xd3,0x7e,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x2e,0xd3,0xfd,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x2e,0xd3,0xfd,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x2e,0xd3,0x00,0x01,0x00,0x00]
+v_trunc_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x2e,0xd3,0x01,0x01,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x2e,0xd3,0xfe,0x01,0x00,0x00]
+v_trunc_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x2e,0xd3,0xfe,0x01,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x2e,0xd3,0x00,0x00,0x00,0x20]
+v_trunc_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x2e,0xd3,0x02,0x00,0x00,0x20]
 
-v_trunc_f64_e64 v[0:1], |s[0:1]|
-// CHECK: [0x00,0x01,0x2e,0xd3,0x00,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], |s[2:3]|
+// CHECK: [0x05,0x01,0x2e,0xd3,0x02,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x08,0x2e,0xd3,0x00,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x08,0x2e,0xd3,0x02,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x2e,0xd3,0x00,0x00,0x00,0x08]
+v_trunc_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x2e,0xd3,0x02,0x00,0x00,0x08]
 
-v_trunc_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x2e,0xd3,0x00,0x00,0x00,0x10]
+v_trunc_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x2e,0xd3,0x02,0x00,0x00,0x10]
 
-v_trunc_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x2e,0xd3,0x00,0x00,0x00,0x18]
+v_trunc_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x2e,0xd3,0x02,0x00,0x00,0x18]
 
-v_ceil_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x30,0xfc,0x7f]
+v_ceil_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x30,0xfc,0x7f]
 
-v_ceil_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], s[102:103]
-// CHECK: [0x66,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], s[102:103]
+// CHECK: [0x66,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], flat_scratch
-// CHECK: [0x68,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], flat_scratch
+// CHECK: [0x68,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], vcc
-// CHECK: [0x6a,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], vcc
+// CHECK: [0x6a,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], tba
-// CHECK: [0x6c,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], tba
+// CHECK: [0x6c,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], tma
-// CHECK: [0x6e,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], tma
+// CHECK: [0x6e,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], exec
-// CHECK: [0x7e,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], exec
+// CHECK: [0x7e,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], 0
-// CHECK: [0x80,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], 0
+// CHECK: [0x80,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], -1
-// CHECK: [0xc1,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], -1
+// CHECK: [0xc1,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x30,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_ceil_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x30,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_ceil_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x30,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_ceil_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x30,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_ceil_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x31,0x00,0x7e]
+v_ceil_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x31,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x31,0x00,0x7e]
+v_ceil_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x31,0x0a,0x7e]
 
-v_ceil_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x30,0xd3,0x00,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x30,0xd3,0x02,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x30,0xd3,0x00,0x00,0x00,0x00]
+v_ceil_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x30,0xd3,0x02,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x30,0xd3,0x02,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x30,0xd3,0x04,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], s[102:103]
-// CHECK: [0x00,0x00,0x30,0xd3,0x66,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], s[102:103]
+// CHECK: [0x05,0x00,0x30,0xd3,0x66,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x30,0xd3,0x68,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x30,0xd3,0x68,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x30,0xd3,0x6a,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x30,0xd3,0x6a,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x30,0xd3,0x6c,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x30,0xd3,0x6c,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x30,0xd3,0x6e,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x30,0xd3,0x6e,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x30,0xd3,0x7a,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x30,0xd3,0x7a,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x30,0xd3,0x7e,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x30,0xd3,0x7e,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x30,0xd3,0xfd,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x30,0xd3,0xfd,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x30,0xd3,0x00,0x01,0x00,0x00]
+v_ceil_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x30,0xd3,0x01,0x01,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x30,0xd3,0xfe,0x01,0x00,0x00]
+v_ceil_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x30,0xd3,0xfe,0x01,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x30,0xd3,0x00,0x00,0x00,0x20]
+v_ceil_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x30,0xd3,0x02,0x00,0x00,0x20]
 
-v_ceil_f64_e64 v[0:1], |s[0:1]|
-// CHECK: [0x00,0x01,0x30,0xd3,0x00,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], |s[2:3]|
+// CHECK: [0x05,0x01,0x30,0xd3,0x02,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x08,0x30,0xd3,0x00,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x08,0x30,0xd3,0x02,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x30,0xd3,0x00,0x00,0x00,0x08]
+v_ceil_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x30,0xd3,0x02,0x00,0x00,0x08]
 
-v_ceil_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x30,0xd3,0x00,0x00,0x00,0x10]
+v_ceil_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x30,0xd3,0x02,0x00,0x00,0x10]
 
-v_ceil_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x30,0xd3,0x00,0x00,0x00,0x18]
+v_ceil_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x30,0xd3,0x02,0x00,0x00,0x18]
 
-v_rndne_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x32,0xfc,0x7f]
+v_rndne_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x32,0xfc,0x7f]
 
-v_rndne_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], s[102:103]
-// CHECK: [0x66,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], s[102:103]
+// CHECK: [0x66,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], flat_scratch
-// CHECK: [0x68,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], flat_scratch
+// CHECK: [0x68,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], vcc
-// CHECK: [0x6a,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], vcc
+// CHECK: [0x6a,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], tba
-// CHECK: [0x6c,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], tba
+// CHECK: [0x6c,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], tma
-// CHECK: [0x6e,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], tma
+// CHECK: [0x6e,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], exec
-// CHECK: [0x7e,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], exec
+// CHECK: [0x7e,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], 0
-// CHECK: [0x80,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], 0
+// CHECK: [0x80,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], -1
-// CHECK: [0xc1,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], -1
+// CHECK: [0xc1,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x32,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rndne_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x32,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rndne_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x32,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rndne_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x32,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rndne_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x33,0x00,0x7e]
+v_rndne_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x33,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x33,0x00,0x7e]
+v_rndne_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x33,0x0a,0x7e]
 
-v_rndne_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x32,0xd3,0x00,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x32,0xd3,0x02,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x32,0xd3,0x00,0x00,0x00,0x00]
+v_rndne_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x32,0xd3,0x02,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x32,0xd3,0x02,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x32,0xd3,0x04,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], s[102:103]
-// CHECK: [0x00,0x00,0x32,0xd3,0x66,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], s[102:103]
+// CHECK: [0x05,0x00,0x32,0xd3,0x66,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x32,0xd3,0x68,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x32,0xd3,0x68,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x32,0xd3,0x6a,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x32,0xd3,0x6a,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x32,0xd3,0x6c,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x32,0xd3,0x6c,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x32,0xd3,0x6e,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x32,0xd3,0x6e,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x32,0xd3,0x7a,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x32,0xd3,0x7a,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x32,0xd3,0x7e,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x32,0xd3,0x7e,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x32,0xd3,0xfd,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x32,0xd3,0xfd,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x32,0xd3,0x00,0x01,0x00,0x00]
+v_rndne_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x32,0xd3,0x01,0x01,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x32,0xd3,0xfe,0x01,0x00,0x00]
+v_rndne_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x32,0xd3,0xfe,0x01,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x32,0xd3,0x00,0x00,0x00,0x20]
+v_rndne_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x32,0xd3,0x02,0x00,0x00,0x20]
 
-v_rndne_f64_e64 v[0:1], |s[0:1]|
-// CHECK: [0x00,0x01,0x32,0xd3,0x00,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], |s[2:3]|
+// CHECK: [0x05,0x01,0x32,0xd3,0x02,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x08,0x32,0xd3,0x00,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x08,0x32,0xd3,0x02,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x32,0xd3,0x00,0x00,0x00,0x08]
+v_rndne_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x32,0xd3,0x02,0x00,0x00,0x08]
 
-v_rndne_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x32,0xd3,0x00,0x00,0x00,0x10]
+v_rndne_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x32,0xd3,0x02,0x00,0x00,0x10]
 
-v_rndne_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x32,0xd3,0x00,0x00,0x00,0x18]
+v_rndne_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x32,0xd3,0x02,0x00,0x00,0x18]
 
-v_floor_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x34,0x0a,0x7e]
 
-v_floor_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x34,0xfc,0x7f]
+v_floor_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x34,0xfc,0x7f]
 
-v_floor_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], s[102:103]
-// CHECK: [0x66,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], s[102:103]
+// CHECK: [0x66,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], flat_scratch
-// CHECK: [0x68,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], flat_scratch
+// CHECK: [0x68,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], vcc
-// CHECK: [0x6a,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], vcc
+// CHECK: [0x6a,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], tba
-// CHECK: [0x6c,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], tba
+// CHECK: [0x6c,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], tma
-// CHECK: [0x6e,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], tma
+// CHECK: [0x6e,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], exec
-// CHECK: [0x7e,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], exec
+// CHECK: [0x7e,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], 0
-// CHECK: [0x80,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], 0
+// CHECK: [0x80,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], -1
-// CHECK: [0xc1,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], -1
+// CHECK: [0xc1,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x34,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_floor_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x34,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_floor_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x34,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_floor_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x34,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_floor_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x35,0x00,0x7e]
+v_floor_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x35,0x0a,0x7e]
 
-v_floor_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x35,0x00,0x7e]
+v_floor_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x35,0x0a,0x7e]
 
-v_floor_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x34,0xd3,0x00,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x34,0xd3,0x02,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x34,0xd3,0x00,0x00,0x00,0x00]
+v_floor_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x34,0xd3,0x02,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x34,0xd3,0x02,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x34,0xd3,0x04,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], s[102:103]
-// CHECK: [0x00,0x00,0x34,0xd3,0x66,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], s[102:103]
+// CHECK: [0x05,0x00,0x34,0xd3,0x66,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x34,0xd3,0x68,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x34,0xd3,0x68,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x34,0xd3,0x6a,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x34,0xd3,0x6a,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x34,0xd3,0x6c,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x34,0xd3,0x6c,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x34,0xd3,0x6e,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x34,0xd3,0x6e,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x34,0xd3,0x7a,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x34,0xd3,0x7a,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x34,0xd3,0x7e,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x34,0xd3,0x7e,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x34,0xd3,0xfd,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x34,0xd3,0xfd,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x34,0xd3,0x00,0x01,0x00,0x00]
+v_floor_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x34,0xd3,0x01,0x01,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x34,0xd3,0xfe,0x01,0x00,0x00]
+v_floor_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x34,0xd3,0xfe,0x01,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x34,0xd3,0x00,0x00,0x00,0x20]
+v_floor_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x34,0xd3,0x02,0x00,0x00,0x20]
 
-v_floor_f64_e64 v[0:1], |s[0:1]|
-// CHECK: [0x00,0x01,0x34,0xd3,0x00,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], |s[2:3]|
+// CHECK: [0x05,0x01,0x34,0xd3,0x02,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x08,0x34,0xd3,0x00,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x08,0x34,0xd3,0x02,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x34,0xd3,0x00,0x00,0x00,0x08]
+v_floor_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x34,0xd3,0x02,0x00,0x00,0x08]
 
-v_floor_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x34,0xd3,0x00,0x00,0x00,0x10]
+v_floor_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x34,0xd3,0x02,0x00,0x00,0x10]
 
-v_floor_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x34,0xd3,0x00,0x00,0x00,0x18]
+v_floor_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x34,0xd3,0x02,0x00,0x00,0x18]
 
-v_fract_f32 v0, s0
-// CHECK: [0x00,0x40,0x00,0x7e]
+v_fract_f32 v5, s1
+// CHECK: [0x01,0x40,0x0a,0x7e]
 
-v_fract_f32 v255, s0
-// CHECK: [0x00,0x40,0xfe,0x7f]
+v_fract_f32 v255, s1
+// CHECK: [0x01,0x40,0xfe,0x7f]
 
-v_fract_f32 v0, s103
-// CHECK: [0x67,0x40,0x00,0x7e]
+v_fract_f32 v5, s103
+// CHECK: [0x67,0x40,0x0a,0x7e]
 
-v_fract_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x40,0x00,0x7e]
+v_fract_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x40,0x0a,0x7e]
 
-v_fract_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x40,0x00,0x7e]
+v_fract_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x40,0x0a,0x7e]
 
-v_fract_f32 v0, vcc_lo
-// CHECK: [0x6a,0x40,0x00,0x7e]
+v_fract_f32 v5, vcc_lo
+// CHECK: [0x6a,0x40,0x0a,0x7e]
 
-v_fract_f32 v0, vcc_hi
-// CHECK: [0x6b,0x40,0x00,0x7e]
+v_fract_f32 v5, vcc_hi
+// CHECK: [0x6b,0x40,0x0a,0x7e]
 
-v_fract_f32 v0, tba_lo
-// CHECK: [0x6c,0x40,0x00,0x7e]
+v_fract_f32 v5, tba_lo
+// CHECK: [0x6c,0x40,0x0a,0x7e]
 
-v_fract_f32 v0, tba_hi
-// CHECK: [0x6d,0x40,0x00,0x7e]
+v_fract_f32 v5, tba_hi
+// CHECK: [0x6d,0x40,0x0a,0x7e]
 
-v_fract_f32 v0, tma_lo
-// CHECK: [0x6e,0x40,0x00,0x7e]
+v_fract_f32 v5, tma_lo
+// CHECK: [0x6e,0x40,0x0a,0x7e]
 
-v_fract_f32 v0, tma_hi
-// CHECK: [0x6f,0x40,0x00,0x7e]
+v_fract_f32 v5, tma_hi
+// CHECK: [0x6f,0x40,0x0a,0x7e]
 
-v_fract_f32 v0, ttmp11
-// CHECK: [0x7b,0x40,0x00,0x7e]
+v_fract_f32 v5, ttmp11
+// CHECK: [0x7b,0x40,0x0a,0x7e]
 
-v_fract_f32 v0, m0
-// CHECK: [0x7c,0x40,0x00,0x7e]
+v_fract_f32 v5, m0
+// CHECK: [0x7c,0x40,0x0a,0x7e]
 
-v_fract_f32 v0, exec_lo
-// CHECK: [0x7e,0x40,0x00,0x7e]
+v_fract_f32 v5, exec_lo
+// CHECK: [0x7e,0x40,0x0a,0x7e]
 
-v_fract_f32 v0, exec_hi
-// CHECK: [0x7f,0x40,0x00,0x7e]
+v_fract_f32 v5, exec_hi
+// CHECK: [0x7f,0x40,0x0a,0x7e]
 
-v_fract_f32 v0, 0
-// CHECK: [0x80,0x40,0x00,0x7e]
+v_fract_f32 v5, 0
+// CHECK: [0x80,0x40,0x0a,0x7e]
 
-v_fract_f32 v0, -1
-// CHECK: [0xc1,0x40,0x00,0x7e]
+v_fract_f32 v5, -1
+// CHECK: [0xc1,0x40,0x0a,0x7e]
 
-v_fract_f32 v0, 0.5
-// CHECK: [0xf0,0x40,0x00,0x7e]
+v_fract_f32 v5, 0.5
+// CHECK: [0xf0,0x40,0x0a,0x7e]
 
-v_fract_f32 v0, -4.0
-// CHECK: [0xf7,0x40,0x00,0x7e]
+v_fract_f32 v5, -4.0
+// CHECK: [0xf7,0x40,0x0a,0x7e]
 
-v_fract_f32 v0, 0xaf123456
-// CHECK: [0xff,0x40,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_fract_f32 v5, 0xaf123456
+// CHECK: [0xff,0x40,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_fract_f32 v0, 0x3f717273
-// CHECK: [0xff,0x40,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_fract_f32 v5, 0x3f717273
+// CHECK: [0xff,0x40,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_fract_f32 v0, v0
-// CHECK: [0x00,0x41,0x00,0x7e]
+v_fract_f32 v5, v1
+// CHECK: [0x01,0x41,0x0a,0x7e]
 
-v_fract_f32 v0, v255
-// CHECK: [0xff,0x41,0x00,0x7e]
+v_fract_f32 v5, v255
+// CHECK: [0xff,0x41,0x0a,0x7e]
 
-v_fract_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x40,0xd3,0x00,0x00,0x00,0x00]
+v_fract_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x40,0xd3,0x01,0x00,0x00,0x00]
 
-v_fract_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x40,0xd3,0x00,0x00,0x00,0x00]
+v_fract_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x40,0xd3,0x01,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x40,0xd3,0x67,0x00,0x00,0x00]
+v_fract_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x40,0xd3,0x67,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x40,0xd3,0x68,0x00,0x00,0x00]
+v_fract_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x40,0xd3,0x68,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x40,0xd3,0x69,0x00,0x00,0x00]
+v_fract_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x40,0xd3,0x69,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x40,0xd3,0x6a,0x00,0x00,0x00]
+v_fract_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x40,0xd3,0x6a,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x40,0xd3,0x6b,0x00,0x00,0x00]
+v_fract_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x40,0xd3,0x6b,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x40,0xd3,0x6c,0x00,0x00,0x00]
+v_fract_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x40,0xd3,0x6c,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x40,0xd3,0x6d,0x00,0x00,0x00]
+v_fract_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x40,0xd3,0x6d,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x40,0xd3,0x6e,0x00,0x00,0x00]
+v_fract_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x40,0xd3,0x6e,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x40,0xd3,0x6f,0x00,0x00,0x00]
+v_fract_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x40,0xd3,0x6f,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x40,0xd3,0x7b,0x00,0x00,0x00]
+v_fract_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x40,0xd3,0x7b,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x40,0xd3,0x7c,0x00,0x00,0x00]
+v_fract_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x40,0xd3,0x7c,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x40,0xd3,0x7e,0x00,0x00,0x00]
+v_fract_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x40,0xd3,0x7e,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x40,0xd3,0x7f,0x00,0x00,0x00]
+v_fract_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x40,0xd3,0x7f,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x40,0xd3,0xfd,0x00,0x00,0x00]
+v_fract_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x40,0xd3,0xfd,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x40,0xd3,0x00,0x01,0x00,0x00]
+v_fract_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x40,0xd3,0x01,0x01,0x00,0x00]
 
-v_fract_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x40,0xd3,0xff,0x01,0x00,0x00]
+v_fract_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x40,0xd3,0xff,0x01,0x00,0x00]
 
-v_fract_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x40,0xd3,0x00,0x00,0x00,0x20]
+v_fract_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x40,0xd3,0x01,0x00,0x00,0x20]
 
-v_fract_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x40,0xd3,0x00,0x00,0x00,0x00]
+v_fract_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x40,0xd3,0x01,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x08,0x40,0xd3,0x00,0x00,0x00,0x00]
+v_fract_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x08,0x40,0xd3,0x01,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x40,0xd3,0x00,0x00,0x00,0x08]
+v_fract_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x40,0xd3,0x01,0x00,0x00,0x08]
 
-v_fract_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x40,0xd3,0x00,0x00,0x00,0x10]
+v_fract_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x40,0xd3,0x01,0x00,0x00,0x10]
 
-v_fract_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x40,0xd3,0x00,0x00,0x00,0x18]
+v_fract_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x40,0xd3,0x01,0x00,0x00,0x18]
 
-v_trunc_f32 v0, s0
-// CHECK: [0x00,0x42,0x00,0x7e]
+v_trunc_f32 v5, s1
+// CHECK: [0x01,0x42,0x0a,0x7e]
 
-v_trunc_f32 v255, s0
-// CHECK: [0x00,0x42,0xfe,0x7f]
+v_trunc_f32 v255, s1
+// CHECK: [0x01,0x42,0xfe,0x7f]
 
-v_trunc_f32 v0, s103
-// CHECK: [0x67,0x42,0x00,0x7e]
+v_trunc_f32 v5, s103
+// CHECK: [0x67,0x42,0x0a,0x7e]
 
-v_trunc_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x42,0x00,0x7e]
+v_trunc_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x42,0x0a,0x7e]
 
-v_trunc_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x42,0x00,0x7e]
+v_trunc_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x42,0x0a,0x7e]
 
-v_trunc_f32 v0, vcc_lo
-// CHECK: [0x6a,0x42,0x00,0x7e]
+v_trunc_f32 v5, vcc_lo
+// CHECK: [0x6a,0x42,0x0a,0x7e]
 
-v_trunc_f32 v0, vcc_hi
-// CHECK: [0x6b,0x42,0x00,0x7e]
+v_trunc_f32 v5, vcc_hi
+// CHECK: [0x6b,0x42,0x0a,0x7e]
 
-v_trunc_f32 v0, tba_lo
-// CHECK: [0x6c,0x42,0x00,0x7e]
+v_trunc_f32 v5, tba_lo
+// CHECK: [0x6c,0x42,0x0a,0x7e]
 
-v_trunc_f32 v0, tba_hi
-// CHECK: [0x6d,0x42,0x00,0x7e]
+v_trunc_f32 v5, tba_hi
+// CHECK: [0x6d,0x42,0x0a,0x7e]
 
-v_trunc_f32 v0, tma_lo
-// CHECK: [0x6e,0x42,0x00,0x7e]
+v_trunc_f32 v5, tma_lo
+// CHECK: [0x6e,0x42,0x0a,0x7e]
 
-v_trunc_f32 v0, tma_hi
-// CHECK: [0x6f,0x42,0x00,0x7e]
+v_trunc_f32 v5, tma_hi
+// CHECK: [0x6f,0x42,0x0a,0x7e]
 
-v_trunc_f32 v0, ttmp11
-// CHECK: [0x7b,0x42,0x00,0x7e]
+v_trunc_f32 v5, ttmp11
+// CHECK: [0x7b,0x42,0x0a,0x7e]
 
-v_trunc_f32 v0, m0
-// CHECK: [0x7c,0x42,0x00,0x7e]
+v_trunc_f32 v5, m0
+// CHECK: [0x7c,0x42,0x0a,0x7e]
 
-v_trunc_f32 v0, exec_lo
-// CHECK: [0x7e,0x42,0x00,0x7e]
+v_trunc_f32 v5, exec_lo
+// CHECK: [0x7e,0x42,0x0a,0x7e]
 
-v_trunc_f32 v0, exec_hi
-// CHECK: [0x7f,0x42,0x00,0x7e]
+v_trunc_f32 v5, exec_hi
+// CHECK: [0x7f,0x42,0x0a,0x7e]
 
-v_trunc_f32 v0, 0
-// CHECK: [0x80,0x42,0x00,0x7e]
+v_trunc_f32 v5, 0
+// CHECK: [0x80,0x42,0x0a,0x7e]
 
-v_trunc_f32 v0, -1
-// CHECK: [0xc1,0x42,0x00,0x7e]
+v_trunc_f32 v5, -1
+// CHECK: [0xc1,0x42,0x0a,0x7e]
 
-v_trunc_f32 v0, 0.5
-// CHECK: [0xf0,0x42,0x00,0x7e]
+v_trunc_f32 v5, 0.5
+// CHECK: [0xf0,0x42,0x0a,0x7e]
 
-v_trunc_f32 v0, -4.0
-// CHECK: [0xf7,0x42,0x00,0x7e]
+v_trunc_f32 v5, -4.0
+// CHECK: [0xf7,0x42,0x0a,0x7e]
 
-v_trunc_f32 v0, 0xaf123456
-// CHECK: [0xff,0x42,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_trunc_f32 v5, 0xaf123456
+// CHECK: [0xff,0x42,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_trunc_f32 v0, 0x3f717273
-// CHECK: [0xff,0x42,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_trunc_f32 v5, 0x3f717273
+// CHECK: [0xff,0x42,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_trunc_f32 v0, v0
-// CHECK: [0x00,0x43,0x00,0x7e]
+v_trunc_f32 v5, v1
+// CHECK: [0x01,0x43,0x0a,0x7e]
 
-v_trunc_f32 v0, v255
-// CHECK: [0xff,0x43,0x00,0x7e]
+v_trunc_f32 v5, v255
+// CHECK: [0xff,0x43,0x0a,0x7e]
 
-v_trunc_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x42,0xd3,0x00,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x42,0xd3,0x01,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x42,0xd3,0x00,0x00,0x00,0x00]
+v_trunc_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x42,0xd3,0x01,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x42,0xd3,0x67,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x42,0xd3,0x67,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x42,0xd3,0x68,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x42,0xd3,0x68,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x42,0xd3,0x69,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x42,0xd3,0x69,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x42,0xd3,0x6a,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x42,0xd3,0x6a,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x42,0xd3,0x6b,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x42,0xd3,0x6b,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x42,0xd3,0x6c,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x42,0xd3,0x6c,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x42,0xd3,0x6d,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x42,0xd3,0x6d,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x42,0xd3,0x6e,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x42,0xd3,0x6e,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x42,0xd3,0x6f,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x42,0xd3,0x6f,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x42,0xd3,0x7b,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x42,0xd3,0x7b,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x42,0xd3,0x7c,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x42,0xd3,0x7c,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x42,0xd3,0x7e,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x42,0xd3,0x7e,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x42,0xd3,0x7f,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x42,0xd3,0x7f,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x42,0xd3,0xfd,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x42,0xd3,0xfd,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x42,0xd3,0x00,0x01,0x00,0x00]
+v_trunc_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x42,0xd3,0x01,0x01,0x00,0x00]
 
-v_trunc_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x42,0xd3,0xff,0x01,0x00,0x00]
+v_trunc_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x42,0xd3,0xff,0x01,0x00,0x00]
 
-v_trunc_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x42,0xd3,0x00,0x00,0x00,0x20]
+v_trunc_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x42,0xd3,0x01,0x00,0x00,0x20]
 
-v_trunc_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x42,0xd3,0x00,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x42,0xd3,0x01,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x08,0x42,0xd3,0x00,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x08,0x42,0xd3,0x01,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x42,0xd3,0x00,0x00,0x00,0x08]
+v_trunc_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x42,0xd3,0x01,0x00,0x00,0x08]
 
-v_trunc_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x42,0xd3,0x00,0x00,0x00,0x10]
+v_trunc_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x42,0xd3,0x01,0x00,0x00,0x10]
 
-v_trunc_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x42,0xd3,0x00,0x00,0x00,0x18]
+v_trunc_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x42,0xd3,0x01,0x00,0x00,0x18]
 
-v_ceil_f32 v0, s0
-// CHECK: [0x00,0x44,0x00,0x7e]
+v_ceil_f32 v5, s1
+// CHECK: [0x01,0x44,0x0a,0x7e]
 
-v_ceil_f32 v255, s0
-// CHECK: [0x00,0x44,0xfe,0x7f]
+v_ceil_f32 v255, s1
+// CHECK: [0x01,0x44,0xfe,0x7f]
 
-v_ceil_f32 v0, s103
-// CHECK: [0x67,0x44,0x00,0x7e]
+v_ceil_f32 v5, s103
+// CHECK: [0x67,0x44,0x0a,0x7e]
 
-v_ceil_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x44,0x00,0x7e]
+v_ceil_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x44,0x0a,0x7e]
 
-v_ceil_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x44,0x00,0x7e]
+v_ceil_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x44,0x0a,0x7e]
 
-v_ceil_f32 v0, vcc_lo
-// CHECK: [0x6a,0x44,0x00,0x7e]
+v_ceil_f32 v5, vcc_lo
+// CHECK: [0x6a,0x44,0x0a,0x7e]
 
-v_ceil_f32 v0, vcc_hi
-// CHECK: [0x6b,0x44,0x00,0x7e]
+v_ceil_f32 v5, vcc_hi
+// CHECK: [0x6b,0x44,0x0a,0x7e]
 
-v_ceil_f32 v0, tba_lo
-// CHECK: [0x6c,0x44,0x00,0x7e]
+v_ceil_f32 v5, tba_lo
+// CHECK: [0x6c,0x44,0x0a,0x7e]
 
-v_ceil_f32 v0, tba_hi
-// CHECK: [0x6d,0x44,0x00,0x7e]
+v_ceil_f32 v5, tba_hi
+// CHECK: [0x6d,0x44,0x0a,0x7e]
 
-v_ceil_f32 v0, tma_lo
-// CHECK: [0x6e,0x44,0x00,0x7e]
+v_ceil_f32 v5, tma_lo
+// CHECK: [0x6e,0x44,0x0a,0x7e]
 
-v_ceil_f32 v0, tma_hi
-// CHECK: [0x6f,0x44,0x00,0x7e]
+v_ceil_f32 v5, tma_hi
+// CHECK: [0x6f,0x44,0x0a,0x7e]
 
-v_ceil_f32 v0, ttmp11
-// CHECK: [0x7b,0x44,0x00,0x7e]
+v_ceil_f32 v5, ttmp11
+// CHECK: [0x7b,0x44,0x0a,0x7e]
 
-v_ceil_f32 v0, m0
-// CHECK: [0x7c,0x44,0x00,0x7e]
+v_ceil_f32 v5, m0
+// CHECK: [0x7c,0x44,0x0a,0x7e]
 
-v_ceil_f32 v0, exec_lo
-// CHECK: [0x7e,0x44,0x00,0x7e]
+v_ceil_f32 v5, exec_lo
+// CHECK: [0x7e,0x44,0x0a,0x7e]
 
-v_ceil_f32 v0, exec_hi
-// CHECK: [0x7f,0x44,0x00,0x7e]
+v_ceil_f32 v5, exec_hi
+// CHECK: [0x7f,0x44,0x0a,0x7e]
 
-v_ceil_f32 v0, 0
-// CHECK: [0x80,0x44,0x00,0x7e]
+v_ceil_f32 v5, 0
+// CHECK: [0x80,0x44,0x0a,0x7e]
 
-v_ceil_f32 v0, -1
-// CHECK: [0xc1,0x44,0x00,0x7e]
+v_ceil_f32 v5, -1
+// CHECK: [0xc1,0x44,0x0a,0x7e]
 
-v_ceil_f32 v0, 0.5
-// CHECK: [0xf0,0x44,0x00,0x7e]
+v_ceil_f32 v5, 0.5
+// CHECK: [0xf0,0x44,0x0a,0x7e]
 
-v_ceil_f32 v0, -4.0
-// CHECK: [0xf7,0x44,0x00,0x7e]
+v_ceil_f32 v5, -4.0
+// CHECK: [0xf7,0x44,0x0a,0x7e]
 
-v_ceil_f32 v0, 0xaf123456
-// CHECK: [0xff,0x44,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_ceil_f32 v5, 0xaf123456
+// CHECK: [0xff,0x44,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_ceil_f32 v0, 0x3f717273
-// CHECK: [0xff,0x44,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_ceil_f32 v5, 0x3f717273
+// CHECK: [0xff,0x44,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_ceil_f32 v0, v0
-// CHECK: [0x00,0x45,0x00,0x7e]
+v_ceil_f32 v5, v1
+// CHECK: [0x01,0x45,0x0a,0x7e]
 
-v_ceil_f32 v0, v255
-// CHECK: [0xff,0x45,0x00,0x7e]
+v_ceil_f32 v5, v255
+// CHECK: [0xff,0x45,0x0a,0x7e]
 
-v_ceil_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x44,0xd3,0x00,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x44,0xd3,0x01,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x44,0xd3,0x00,0x00,0x00,0x00]
+v_ceil_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x44,0xd3,0x01,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x44,0xd3,0x67,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x44,0xd3,0x67,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x44,0xd3,0x68,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x44,0xd3,0x68,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x44,0xd3,0x69,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x44,0xd3,0x69,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x44,0xd3,0x6a,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x44,0xd3,0x6a,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x44,0xd3,0x6b,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x44,0xd3,0x6b,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x44,0xd3,0x6c,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x44,0xd3,0x6c,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x44,0xd3,0x6d,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x44,0xd3,0x6d,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x44,0xd3,0x6e,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x44,0xd3,0x6e,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x44,0xd3,0x6f,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x44,0xd3,0x6f,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x44,0xd3,0x7b,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x44,0xd3,0x7b,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x44,0xd3,0x7c,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x44,0xd3,0x7c,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x44,0xd3,0x7e,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x44,0xd3,0x7e,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x44,0xd3,0x7f,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x44,0xd3,0x7f,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x44,0xd3,0xfd,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x44,0xd3,0xfd,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x44,0xd3,0x00,0x01,0x00,0x00]
+v_ceil_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x44,0xd3,0x01,0x01,0x00,0x00]
 
-v_ceil_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x44,0xd3,0xff,0x01,0x00,0x00]
+v_ceil_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x44,0xd3,0xff,0x01,0x00,0x00]
 
-v_ceil_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x44,0xd3,0x00,0x00,0x00,0x20]
+v_ceil_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x44,0xd3,0x01,0x00,0x00,0x20]
 
-v_ceil_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x44,0xd3,0x00,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x44,0xd3,0x01,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x08,0x44,0xd3,0x00,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x08,0x44,0xd3,0x01,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x44,0xd3,0x00,0x00,0x00,0x08]
+v_ceil_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x44,0xd3,0x01,0x00,0x00,0x08]
 
-v_ceil_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x44,0xd3,0x00,0x00,0x00,0x10]
+v_ceil_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x44,0xd3,0x01,0x00,0x00,0x10]
 
-v_ceil_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x44,0xd3,0x00,0x00,0x00,0x18]
+v_ceil_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x44,0xd3,0x01,0x00,0x00,0x18]
 
-v_rndne_f32 v0, s0
-// CHECK: [0x00,0x46,0x00,0x7e]
+v_rndne_f32 v5, s1
+// CHECK: [0x01,0x46,0x0a,0x7e]
 
-v_rndne_f32 v255, s0
-// CHECK: [0x00,0x46,0xfe,0x7f]
+v_rndne_f32 v255, s1
+// CHECK: [0x01,0x46,0xfe,0x7f]
 
-v_rndne_f32 v0, s103
-// CHECK: [0x67,0x46,0x00,0x7e]
+v_rndne_f32 v5, s103
+// CHECK: [0x67,0x46,0x0a,0x7e]
 
-v_rndne_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x46,0x00,0x7e]
+v_rndne_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x46,0x0a,0x7e]
 
-v_rndne_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x46,0x00,0x7e]
+v_rndne_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x46,0x0a,0x7e]
 
-v_rndne_f32 v0, vcc_lo
-// CHECK: [0x6a,0x46,0x00,0x7e]
+v_rndne_f32 v5, vcc_lo
+// CHECK: [0x6a,0x46,0x0a,0x7e]
 
-v_rndne_f32 v0, vcc_hi
-// CHECK: [0x6b,0x46,0x00,0x7e]
+v_rndne_f32 v5, vcc_hi
+// CHECK: [0x6b,0x46,0x0a,0x7e]
 
-v_rndne_f32 v0, tba_lo
-// CHECK: [0x6c,0x46,0x00,0x7e]
+v_rndne_f32 v5, tba_lo
+// CHECK: [0x6c,0x46,0x0a,0x7e]
 
-v_rndne_f32 v0, tba_hi
-// CHECK: [0x6d,0x46,0x00,0x7e]
+v_rndne_f32 v5, tba_hi
+// CHECK: [0x6d,0x46,0x0a,0x7e]
 
-v_rndne_f32 v0, tma_lo
-// CHECK: [0x6e,0x46,0x00,0x7e]
+v_rndne_f32 v5, tma_lo
+// CHECK: [0x6e,0x46,0x0a,0x7e]
 
-v_rndne_f32 v0, tma_hi
-// CHECK: [0x6f,0x46,0x00,0x7e]
+v_rndne_f32 v5, tma_hi
+// CHECK: [0x6f,0x46,0x0a,0x7e]
 
-v_rndne_f32 v0, ttmp11
-// CHECK: [0x7b,0x46,0x00,0x7e]
+v_rndne_f32 v5, ttmp11
+// CHECK: [0x7b,0x46,0x0a,0x7e]
 
-v_rndne_f32 v0, m0
-// CHECK: [0x7c,0x46,0x00,0x7e]
+v_rndne_f32 v5, m0
+// CHECK: [0x7c,0x46,0x0a,0x7e]
 
-v_rndne_f32 v0, exec_lo
-// CHECK: [0x7e,0x46,0x00,0x7e]
+v_rndne_f32 v5, exec_lo
+// CHECK: [0x7e,0x46,0x0a,0x7e]
 
-v_rndne_f32 v0, exec_hi
-// CHECK: [0x7f,0x46,0x00,0x7e]
+v_rndne_f32 v5, exec_hi
+// CHECK: [0x7f,0x46,0x0a,0x7e]
 
-v_rndne_f32 v0, 0
-// CHECK: [0x80,0x46,0x00,0x7e]
+v_rndne_f32 v5, 0
+// CHECK: [0x80,0x46,0x0a,0x7e]
 
-v_rndne_f32 v0, -1
-// CHECK: [0xc1,0x46,0x00,0x7e]
+v_rndne_f32 v5, -1
+// CHECK: [0xc1,0x46,0x0a,0x7e]
 
-v_rndne_f32 v0, 0.5
-// CHECK: [0xf0,0x46,0x00,0x7e]
+v_rndne_f32 v5, 0.5
+// CHECK: [0xf0,0x46,0x0a,0x7e]
 
-v_rndne_f32 v0, -4.0
-// CHECK: [0xf7,0x46,0x00,0x7e]
+v_rndne_f32 v5, -4.0
+// CHECK: [0xf7,0x46,0x0a,0x7e]
 
-v_rndne_f32 v0, 0xaf123456
-// CHECK: [0xff,0x46,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rndne_f32 v5, 0xaf123456
+// CHECK: [0xff,0x46,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rndne_f32 v0, 0x3f717273
-// CHECK: [0xff,0x46,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rndne_f32 v5, 0x3f717273
+// CHECK: [0xff,0x46,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rndne_f32 v0, v0
-// CHECK: [0x00,0x47,0x00,0x7e]
+v_rndne_f32 v5, v1
+// CHECK: [0x01,0x47,0x0a,0x7e]
 
-v_rndne_f32 v0, v255
-// CHECK: [0xff,0x47,0x00,0x7e]
+v_rndne_f32 v5, v255
+// CHECK: [0xff,0x47,0x0a,0x7e]
 
-v_rndne_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x46,0xd3,0x00,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x46,0xd3,0x01,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x46,0xd3,0x00,0x00,0x00,0x00]
+v_rndne_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x46,0xd3,0x01,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x46,0xd3,0x67,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x46,0xd3,0x67,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x46,0xd3,0x68,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x46,0xd3,0x68,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x46,0xd3,0x69,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x46,0xd3,0x69,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x46,0xd3,0x6a,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x46,0xd3,0x6a,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x46,0xd3,0x6b,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x46,0xd3,0x6b,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x46,0xd3,0x6c,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x46,0xd3,0x6c,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x46,0xd3,0x6d,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x46,0xd3,0x6d,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x46,0xd3,0x6e,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x46,0xd3,0x6e,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x46,0xd3,0x6f,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x46,0xd3,0x6f,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x46,0xd3,0x7b,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x46,0xd3,0x7b,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x46,0xd3,0x7c,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x46,0xd3,0x7c,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x46,0xd3,0x7e,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x46,0xd3,0x7e,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x46,0xd3,0x7f,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x46,0xd3,0x7f,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x46,0xd3,0xfd,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x46,0xd3,0xfd,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x46,0xd3,0x00,0x01,0x00,0x00]
+v_rndne_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x46,0xd3,0x01,0x01,0x00,0x00]
 
-v_rndne_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x46,0xd3,0xff,0x01,0x00,0x00]
+v_rndne_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x46,0xd3,0xff,0x01,0x00,0x00]
 
-v_rndne_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x46,0xd3,0x00,0x00,0x00,0x20]
+v_rndne_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x46,0xd3,0x01,0x00,0x00,0x20]
 
-v_rndne_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x46,0xd3,0x00,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x46,0xd3,0x01,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x08,0x46,0xd3,0x00,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x08,0x46,0xd3,0x01,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x46,0xd3,0x00,0x00,0x00,0x08]
+v_rndne_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x46,0xd3,0x01,0x00,0x00,0x08]
 
-v_rndne_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x46,0xd3,0x00,0x00,0x00,0x10]
+v_rndne_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x46,0xd3,0x01,0x00,0x00,0x10]
 
-v_rndne_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x46,0xd3,0x00,0x00,0x00,0x18]
+v_rndne_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x46,0xd3,0x01,0x00,0x00,0x18]
 
-v_floor_f32 v0, s0
-// CHECK: [0x00,0x48,0x00,0x7e]
+v_floor_f32 v5, s1
+// CHECK: [0x01,0x48,0x0a,0x7e]
 
-v_floor_f32 v255, s0
-// CHECK: [0x00,0x48,0xfe,0x7f]
+v_floor_f32 v255, s1
+// CHECK: [0x01,0x48,0xfe,0x7f]
 
-v_floor_f32 v0, s103
-// CHECK: [0x67,0x48,0x00,0x7e]
+v_floor_f32 v5, s103
+// CHECK: [0x67,0x48,0x0a,0x7e]
 
-v_floor_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x48,0x00,0x7e]
+v_floor_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x48,0x0a,0x7e]
 
-v_floor_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x48,0x00,0x7e]
+v_floor_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x48,0x0a,0x7e]
 
-v_floor_f32 v0, vcc_lo
-// CHECK: [0x6a,0x48,0x00,0x7e]
+v_floor_f32 v5, vcc_lo
+// CHECK: [0x6a,0x48,0x0a,0x7e]
 
-v_floor_f32 v0, vcc_hi
-// CHECK: [0x6b,0x48,0x00,0x7e]
+v_floor_f32 v5, vcc_hi
+// CHECK: [0x6b,0x48,0x0a,0x7e]
 
-v_floor_f32 v0, tba_lo
-// CHECK: [0x6c,0x48,0x00,0x7e]
+v_floor_f32 v5, tba_lo
+// CHECK: [0x6c,0x48,0x0a,0x7e]
 
-v_floor_f32 v0, tba_hi
-// CHECK: [0x6d,0x48,0x00,0x7e]
+v_floor_f32 v5, tba_hi
+// CHECK: [0x6d,0x48,0x0a,0x7e]
 
-v_floor_f32 v0, tma_lo
-// CHECK: [0x6e,0x48,0x00,0x7e]
+v_floor_f32 v5, tma_lo
+// CHECK: [0x6e,0x48,0x0a,0x7e]
 
-v_floor_f32 v0, tma_hi
-// CHECK: [0x6f,0x48,0x00,0x7e]
+v_floor_f32 v5, tma_hi
+// CHECK: [0x6f,0x48,0x0a,0x7e]
 
-v_floor_f32 v0, ttmp11
-// CHECK: [0x7b,0x48,0x00,0x7e]
+v_floor_f32 v5, ttmp11
+// CHECK: [0x7b,0x48,0x0a,0x7e]
 
-v_floor_f32 v0, m0
-// CHECK: [0x7c,0x48,0x00,0x7e]
+v_floor_f32 v5, m0
+// CHECK: [0x7c,0x48,0x0a,0x7e]
 
-v_floor_f32 v0, exec_lo
-// CHECK: [0x7e,0x48,0x00,0x7e]
+v_floor_f32 v5, exec_lo
+// CHECK: [0x7e,0x48,0x0a,0x7e]
 
-v_floor_f32 v0, exec_hi
-// CHECK: [0x7f,0x48,0x00,0x7e]
+v_floor_f32 v5, exec_hi
+// CHECK: [0x7f,0x48,0x0a,0x7e]
 
-v_floor_f32 v0, 0
-// CHECK: [0x80,0x48,0x00,0x7e]
+v_floor_f32 v5, 0
+// CHECK: [0x80,0x48,0x0a,0x7e]
 
-v_floor_f32 v0, -1
-// CHECK: [0xc1,0x48,0x00,0x7e]
+v_floor_f32 v5, -1
+// CHECK: [0xc1,0x48,0x0a,0x7e]
 
-v_floor_f32 v0, 0.5
-// CHECK: [0xf0,0x48,0x00,0x7e]
+v_floor_f32 v5, 0.5
+// CHECK: [0xf0,0x48,0x0a,0x7e]
 
-v_floor_f32 v0, -4.0
-// CHECK: [0xf7,0x48,0x00,0x7e]
+v_floor_f32 v5, -4.0
+// CHECK: [0xf7,0x48,0x0a,0x7e]
 
-v_floor_f32 v0, 0xaf123456
-// CHECK: [0xff,0x48,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_floor_f32 v5, 0xaf123456
+// CHECK: [0xff,0x48,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_floor_f32 v0, 0x3f717273
-// CHECK: [0xff,0x48,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_floor_f32 v5, 0x3f717273
+// CHECK: [0xff,0x48,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_floor_f32 v0, v0
-// CHECK: [0x00,0x49,0x00,0x7e]
+v_floor_f32 v5, v1
+// CHECK: [0x01,0x49,0x0a,0x7e]
 
-v_floor_f32 v0, v255
-// CHECK: [0xff,0x49,0x00,0x7e]
+v_floor_f32 v5, v255
+// CHECK: [0xff,0x49,0x0a,0x7e]
 
-v_floor_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x48,0xd3,0x00,0x00,0x00,0x00]
+v_floor_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x48,0xd3,0x01,0x00,0x00,0x00]
 
-v_floor_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x48,0xd3,0x00,0x00,0x00,0x00]
+v_floor_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x48,0xd3,0x01,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x48,0xd3,0x67,0x00,0x00,0x00]
+v_floor_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x48,0xd3,0x67,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x48,0xd3,0x68,0x00,0x00,0x00]
+v_floor_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x48,0xd3,0x68,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x48,0xd3,0x69,0x00,0x00,0x00]
+v_floor_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x48,0xd3,0x69,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x48,0xd3,0x6a,0x00,0x00,0x00]
+v_floor_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x48,0xd3,0x6a,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x48,0xd3,0x6b,0x00,0x00,0x00]
+v_floor_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x48,0xd3,0x6b,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x48,0xd3,0x6c,0x00,0x00,0x00]
+v_floor_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x48,0xd3,0x6c,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x48,0xd3,0x6d,0x00,0x00,0x00]
+v_floor_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x48,0xd3,0x6d,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x48,0xd3,0x6e,0x00,0x00,0x00]
+v_floor_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x48,0xd3,0x6e,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x48,0xd3,0x6f,0x00,0x00,0x00]
+v_floor_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x48,0xd3,0x6f,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x48,0xd3,0x7b,0x00,0x00,0x00]
+v_floor_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x48,0xd3,0x7b,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x48,0xd3,0x7c,0x00,0x00,0x00]
+v_floor_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x48,0xd3,0x7c,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x48,0xd3,0x7e,0x00,0x00,0x00]
+v_floor_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x48,0xd3,0x7e,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x48,0xd3,0x7f,0x00,0x00,0x00]
+v_floor_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x48,0xd3,0x7f,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x48,0xd3,0xfd,0x00,0x00,0x00]
+v_floor_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x48,0xd3,0xfd,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x48,0xd3,0x00,0x01,0x00,0x00]
+v_floor_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x48,0xd3,0x01,0x01,0x00,0x00]
 
-v_floor_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x48,0xd3,0xff,0x01,0x00,0x00]
+v_floor_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x48,0xd3,0xff,0x01,0x00,0x00]
 
-v_floor_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x48,0xd3,0x00,0x00,0x00,0x20]
+v_floor_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x48,0xd3,0x01,0x00,0x00,0x20]
 
-v_floor_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x48,0xd3,0x00,0x00,0x00,0x00]
+v_floor_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x48,0xd3,0x01,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x08,0x48,0xd3,0x00,0x00,0x00,0x00]
+v_floor_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x08,0x48,0xd3,0x01,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x48,0xd3,0x00,0x00,0x00,0x08]
+v_floor_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x48,0xd3,0x01,0x00,0x00,0x08]
 
-v_floor_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x48,0xd3,0x00,0x00,0x00,0x10]
+v_floor_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x48,0xd3,0x01,0x00,0x00,0x10]
 
-v_floor_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x48,0xd3,0x00,0x00,0x00,0x18]
+v_floor_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x48,0xd3,0x01,0x00,0x00,0x18]
 
-v_exp_f32 v0, s0
-// CHECK: [0x00,0x4a,0x00,0x7e]
+v_exp_f32 v5, s1
+// CHECK: [0x01,0x4a,0x0a,0x7e]
 
-v_exp_f32 v255, s0
-// CHECK: [0x00,0x4a,0xfe,0x7f]
+v_exp_f32 v255, s1
+// CHECK: [0x01,0x4a,0xfe,0x7f]
 
-v_exp_f32 v0, s103
-// CHECK: [0x67,0x4a,0x00,0x7e]
+v_exp_f32 v5, s103
+// CHECK: [0x67,0x4a,0x0a,0x7e]
 
-v_exp_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x4a,0x00,0x7e]
+v_exp_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x4a,0x0a,0x7e]
 
-v_exp_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x4a,0x00,0x7e]
+v_exp_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x4a,0x0a,0x7e]
 
-v_exp_f32 v0, vcc_lo
-// CHECK: [0x6a,0x4a,0x00,0x7e]
+v_exp_f32 v5, vcc_lo
+// CHECK: [0x6a,0x4a,0x0a,0x7e]
 
-v_exp_f32 v0, vcc_hi
-// CHECK: [0x6b,0x4a,0x00,0x7e]
+v_exp_f32 v5, vcc_hi
+// CHECK: [0x6b,0x4a,0x0a,0x7e]
 
-v_exp_f32 v0, tba_lo
-// CHECK: [0x6c,0x4a,0x00,0x7e]
+v_exp_f32 v5, tba_lo
+// CHECK: [0x6c,0x4a,0x0a,0x7e]
 
-v_exp_f32 v0, tba_hi
-// CHECK: [0x6d,0x4a,0x00,0x7e]
+v_exp_f32 v5, tba_hi
+// CHECK: [0x6d,0x4a,0x0a,0x7e]
 
-v_exp_f32 v0, tma_lo
-// CHECK: [0x6e,0x4a,0x00,0x7e]
+v_exp_f32 v5, tma_lo
+// CHECK: [0x6e,0x4a,0x0a,0x7e]
 
-v_exp_f32 v0, tma_hi
-// CHECK: [0x6f,0x4a,0x00,0x7e]
+v_exp_f32 v5, tma_hi
+// CHECK: [0x6f,0x4a,0x0a,0x7e]
 
-v_exp_f32 v0, ttmp11
-// CHECK: [0x7b,0x4a,0x00,0x7e]
+v_exp_f32 v5, ttmp11
+// CHECK: [0x7b,0x4a,0x0a,0x7e]
 
-v_exp_f32 v0, m0
-// CHECK: [0x7c,0x4a,0x00,0x7e]
+v_exp_f32 v5, m0
+// CHECK: [0x7c,0x4a,0x0a,0x7e]
 
-v_exp_f32 v0, exec_lo
-// CHECK: [0x7e,0x4a,0x00,0x7e]
+v_exp_f32 v5, exec_lo
+// CHECK: [0x7e,0x4a,0x0a,0x7e]
 
-v_exp_f32 v0, exec_hi
-// CHECK: [0x7f,0x4a,0x00,0x7e]
+v_exp_f32 v5, exec_hi
+// CHECK: [0x7f,0x4a,0x0a,0x7e]
 
-v_exp_f32 v0, 0
-// CHECK: [0x80,0x4a,0x00,0x7e]
+v_exp_f32 v5, 0
+// CHECK: [0x80,0x4a,0x0a,0x7e]
 
-v_exp_f32 v0, -1
-// CHECK: [0xc1,0x4a,0x00,0x7e]
+v_exp_f32 v5, -1
+// CHECK: [0xc1,0x4a,0x0a,0x7e]
 
-v_exp_f32 v0, 0.5
-// CHECK: [0xf0,0x4a,0x00,0x7e]
+v_exp_f32 v5, 0.5
+// CHECK: [0xf0,0x4a,0x0a,0x7e]
 
-v_exp_f32 v0, -4.0
-// CHECK: [0xf7,0x4a,0x00,0x7e]
+v_exp_f32 v5, -4.0
+// CHECK: [0xf7,0x4a,0x0a,0x7e]
 
-v_exp_f32 v0, 0xaf123456
-// CHECK: [0xff,0x4a,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_exp_f32 v5, 0xaf123456
+// CHECK: [0xff,0x4a,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_exp_f32 v0, 0x3f717273
-// CHECK: [0xff,0x4a,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_exp_f32 v5, 0x3f717273
+// CHECK: [0xff,0x4a,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_exp_f32 v0, v0
-// CHECK: [0x00,0x4b,0x00,0x7e]
+v_exp_f32 v5, v1
+// CHECK: [0x01,0x4b,0x0a,0x7e]
 
-v_exp_f32 v0, v255
-// CHECK: [0xff,0x4b,0x00,0x7e]
+v_exp_f32 v5, v255
+// CHECK: [0xff,0x4b,0x0a,0x7e]
 
-v_exp_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x4a,0xd3,0x00,0x00,0x00,0x00]
+v_exp_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x4a,0xd3,0x01,0x00,0x00,0x00]
 
-v_exp_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x4a,0xd3,0x00,0x00,0x00,0x00]
+v_exp_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x4a,0xd3,0x01,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x4a,0xd3,0x67,0x00,0x00,0x00]
+v_exp_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x4a,0xd3,0x67,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x4a,0xd3,0x68,0x00,0x00,0x00]
+v_exp_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x4a,0xd3,0x68,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x4a,0xd3,0x69,0x00,0x00,0x00]
+v_exp_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x4a,0xd3,0x69,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x4a,0xd3,0x6a,0x00,0x00,0x00]
+v_exp_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x4a,0xd3,0x6a,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x4a,0xd3,0x6b,0x00,0x00,0x00]
+v_exp_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x4a,0xd3,0x6b,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x4a,0xd3,0x6c,0x00,0x00,0x00]
+v_exp_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x4a,0xd3,0x6c,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x4a,0xd3,0x6d,0x00,0x00,0x00]
+v_exp_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x4a,0xd3,0x6d,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x4a,0xd3,0x6e,0x00,0x00,0x00]
+v_exp_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x4a,0xd3,0x6e,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x4a,0xd3,0x6f,0x00,0x00,0x00]
+v_exp_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x4a,0xd3,0x6f,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x4a,0xd3,0x7b,0x00,0x00,0x00]
+v_exp_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x4a,0xd3,0x7b,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x4a,0xd3,0x7c,0x00,0x00,0x00]
+v_exp_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x4a,0xd3,0x7c,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x4a,0xd3,0x7e,0x00,0x00,0x00]
+v_exp_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x4a,0xd3,0x7e,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x4a,0xd3,0x7f,0x00,0x00,0x00]
+v_exp_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x4a,0xd3,0x7f,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, 0
-// CHECK: [0x00,0x00,0x4a,0xd3,0x80,0x00,0x00,0x00]
+v_exp_f32_e64 v5, 0
+// CHECK: [0x05,0x00,0x4a,0xd3,0x80,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x4a,0xd3,0xf0,0x00,0x00,0x00]
+v_exp_f32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x4a,0xd3,0xf0,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x4a,0xd3,0xfd,0x00,0x00,0x00]
+v_exp_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x4a,0xd3,0xfd,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x4a,0xd3,0x00,0x01,0x00,0x00]
+v_exp_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x4a,0xd3,0x01,0x01,0x00,0x00]
 
-v_exp_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x4a,0xd3,0xff,0x01,0x00,0x00]
+v_exp_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x4a,0xd3,0xff,0x01,0x00,0x00]
 
-v_exp_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x4a,0xd3,0x00,0x00,0x00,0x20]
+v_exp_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x4a,0xd3,0x01,0x00,0x00,0x20]
 
-v_exp_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x4a,0xd3,0x00,0x00,0x00,0x08]
+v_exp_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x4a,0xd3,0x01,0x00,0x00,0x08]
 
-v_exp_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x4a,0xd3,0x00,0x00,0x00,0x10]
+v_exp_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x4a,0xd3,0x01,0x00,0x00,0x10]
 
-v_exp_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x4a,0xd3,0x00,0x00,0x00,0x18]
+v_exp_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x4a,0xd3,0x01,0x00,0x00,0x18]
 
-v_log_clamp_f32 v0, s0
-// CHECK: [0x00,0x4c,0x00,0x7e]
+v_log_clamp_f32 v5, s1
+// CHECK: [0x01,0x4c,0x0a,0x7e]
 
-v_log_clamp_f32 v255, s0
-// CHECK: [0x00,0x4c,0xfe,0x7f]
+v_log_clamp_f32 v255, s1
+// CHECK: [0x01,0x4c,0xfe,0x7f]
 
-v_log_clamp_f32 v0, s103
-// CHECK: [0x67,0x4c,0x00,0x7e]
+v_log_clamp_f32 v5, s103
+// CHECK: [0x67,0x4c,0x0a,0x7e]
 
-v_log_clamp_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x4c,0x00,0x7e]
+v_log_clamp_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x4c,0x0a,0x7e]
 
-v_log_clamp_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x4c,0x00,0x7e]
+v_log_clamp_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x4c,0x0a,0x7e]
 
-v_log_clamp_f32 v0, vcc_lo
-// CHECK: [0x6a,0x4c,0x00,0x7e]
+v_log_clamp_f32 v5, vcc_lo
+// CHECK: [0x6a,0x4c,0x0a,0x7e]
 
-v_log_clamp_f32 v0, vcc_hi
-// CHECK: [0x6b,0x4c,0x00,0x7e]
+v_log_clamp_f32 v5, vcc_hi
+// CHECK: [0x6b,0x4c,0x0a,0x7e]
 
-v_log_clamp_f32 v0, tba_lo
-// CHECK: [0x6c,0x4c,0x00,0x7e]
+v_log_clamp_f32 v5, tba_lo
+// CHECK: [0x6c,0x4c,0x0a,0x7e]
 
-v_log_clamp_f32 v0, tba_hi
-// CHECK: [0x6d,0x4c,0x00,0x7e]
+v_log_clamp_f32 v5, tba_hi
+// CHECK: [0x6d,0x4c,0x0a,0x7e]
 
-v_log_clamp_f32 v0, tma_lo
-// CHECK: [0x6e,0x4c,0x00,0x7e]
+v_log_clamp_f32 v5, tma_lo
+// CHECK: [0x6e,0x4c,0x0a,0x7e]
 
-v_log_clamp_f32 v0, tma_hi
-// CHECK: [0x6f,0x4c,0x00,0x7e]
+v_log_clamp_f32 v5, tma_hi
+// CHECK: [0x6f,0x4c,0x0a,0x7e]
 
-v_log_clamp_f32 v0, ttmp11
-// CHECK: [0x7b,0x4c,0x00,0x7e]
+v_log_clamp_f32 v5, ttmp11
+// CHECK: [0x7b,0x4c,0x0a,0x7e]
 
-v_log_clamp_f32 v0, m0
-// CHECK: [0x7c,0x4c,0x00,0x7e]
+v_log_clamp_f32 v5, m0
+// CHECK: [0x7c,0x4c,0x0a,0x7e]
 
-v_log_clamp_f32 v0, exec_lo
-// CHECK: [0x7e,0x4c,0x00,0x7e]
+v_log_clamp_f32 v5, exec_lo
+// CHECK: [0x7e,0x4c,0x0a,0x7e]
 
-v_log_clamp_f32 v0, exec_hi
-// CHECK: [0x7f,0x4c,0x00,0x7e]
+v_log_clamp_f32 v5, exec_hi
+// CHECK: [0x7f,0x4c,0x0a,0x7e]
 
-v_log_clamp_f32 v0, 0
-// CHECK: [0x80,0x4c,0x00,0x7e]
+v_log_clamp_f32 v5, 0
+// CHECK: [0x80,0x4c,0x0a,0x7e]
 
-v_log_clamp_f32 v0, -1
-// CHECK: [0xc1,0x4c,0x00,0x7e]
+v_log_clamp_f32 v5, -1
+// CHECK: [0xc1,0x4c,0x0a,0x7e]
 
-v_log_clamp_f32 v0, 0.5
-// CHECK: [0xf0,0x4c,0x00,0x7e]
+v_log_clamp_f32 v5, 0.5
+// CHECK: [0xf0,0x4c,0x0a,0x7e]
 
-v_log_clamp_f32 v0, -4.0
-// CHECK: [0xf7,0x4c,0x00,0x7e]
+v_log_clamp_f32 v5, -4.0
+// CHECK: [0xf7,0x4c,0x0a,0x7e]
 
-v_log_clamp_f32 v0, 0xaf123456
-// CHECK: [0xff,0x4c,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_log_clamp_f32 v5, 0xaf123456
+// CHECK: [0xff,0x4c,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_log_clamp_f32 v0, 0x3f717273
-// CHECK: [0xff,0x4c,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_log_clamp_f32 v5, 0x3f717273
+// CHECK: [0xff,0x4c,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_log_clamp_f32 v0, v0
-// CHECK: [0x00,0x4d,0x00,0x7e]
+v_log_clamp_f32 v5, v1
+// CHECK: [0x01,0x4d,0x0a,0x7e]
 
-v_log_clamp_f32 v0, v255
-// CHECK: [0xff,0x4d,0x00,0x7e]
+v_log_clamp_f32 v5, v255
+// CHECK: [0xff,0x4d,0x0a,0x7e]
 
-v_log_clamp_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x4c,0xd3,0x00,0x00,0x00,0x00]
+v_log_clamp_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x4c,0xd3,0x01,0x00,0x00,0x00]
 
-v_log_clamp_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x4c,0xd3,0x00,0x00,0x00,0x00]
+v_log_clamp_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x4c,0xd3,0x01,0x00,0x00,0x00]
 
-v_log_clamp_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x4c,0xd3,0x67,0x00,0x00,0x00]
+v_log_clamp_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x4c,0xd3,0x67,0x00,0x00,0x00]
 
-v_log_clamp_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x4c,0xd3,0x68,0x00,0x00,0x00]
+v_log_clamp_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x4c,0xd3,0x68,0x00,0x00,0x00]
 
-v_log_clamp_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x4c,0xd3,0x69,0x00,0x00,0x00]
+v_log_clamp_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x4c,0xd3,0x69,0x00,0x00,0x00]
 
-v_log_clamp_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x4c,0xd3,0x6a,0x00,0x00,0x00]
+v_log_clamp_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x4c,0xd3,0x6a,0x00,0x00,0x00]
 
-v_log_clamp_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x4c,0xd3,0x6b,0x00,0x00,0x00]
+v_log_clamp_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x4c,0xd3,0x6b,0x00,0x00,0x00]
 
-v_log_clamp_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x4c,0xd3,0x6c,0x00,0x00,0x00]
+v_log_clamp_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x4c,0xd3,0x6c,0x00,0x00,0x00]
 
-v_log_clamp_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x4c,0xd3,0x6d,0x00,0x00,0x00]
+v_log_clamp_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x4c,0xd3,0x6d,0x00,0x00,0x00]
 
-v_log_clamp_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x4c,0xd3,0x6e,0x00,0x00,0x00]
+v_log_clamp_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x4c,0xd3,0x6e,0x00,0x00,0x00]
 
-v_log_clamp_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x4c,0xd3,0x6f,0x00,0x00,0x00]
+v_log_clamp_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x4c,0xd3,0x6f,0x00,0x00,0x00]
 
-v_log_clamp_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x4c,0xd3,0x7b,0x00,0x00,0x00]
+v_log_clamp_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x4c,0xd3,0x7b,0x00,0x00,0x00]
 
-v_log_clamp_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x4c,0xd3,0x7c,0x00,0x00,0x00]
+v_log_clamp_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x4c,0xd3,0x7c,0x00,0x00,0x00]
 
-v_log_clamp_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x4c,0xd3,0x7e,0x00,0x00,0x00]
+v_log_clamp_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x4c,0xd3,0x7e,0x00,0x00,0x00]
 
-v_log_clamp_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x4c,0xd3,0x7f,0x00,0x00,0x00]
+v_log_clamp_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x4c,0xd3,0x7f,0x00,0x00,0x00]
 
-v_log_clamp_f32_e64 v0, 0
-// CHECK: [0x00,0x00,0x4c,0xd3,0x80,0x00,0x00,0x00]
+v_log_clamp_f32_e64 v5, 0
+// CHECK: [0x05,0x00,0x4c,0xd3,0x80,0x00,0x00,0x00]
 
-v_log_clamp_f32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x4c,0xd3,0xf0,0x00,0x00,0x00]
+v_log_clamp_f32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x4c,0xd3,0xf0,0x00,0x00,0x00]
 
-v_log_clamp_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x4c,0xd3,0xfd,0x00,0x00,0x00]
+v_log_clamp_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x4c,0xd3,0xfd,0x00,0x00,0x00]
 
-v_log_clamp_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x4c,0xd3,0x00,0x01,0x00,0x00]
+v_log_clamp_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x4c,0xd3,0x01,0x01,0x00,0x00]
 
-v_log_clamp_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x4c,0xd3,0xff,0x01,0x00,0x00]
+v_log_clamp_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x4c,0xd3,0xff,0x01,0x00,0x00]
 
-v_log_clamp_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x4c,0xd3,0x00,0x00,0x00,0x20]
+v_log_clamp_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x4c,0xd3,0x01,0x00,0x00,0x20]
 
-v_log_clamp_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x4c,0xd3,0x00,0x00,0x00,0x08]
+v_log_clamp_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x4c,0xd3,0x01,0x00,0x00,0x08]
 
-v_log_clamp_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x4c,0xd3,0x00,0x00,0x00,0x10]
+v_log_clamp_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x4c,0xd3,0x01,0x00,0x00,0x10]
 
-v_log_clamp_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x4c,0xd3,0x00,0x00,0x00,0x18]
+v_log_clamp_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x4c,0xd3,0x01,0x00,0x00,0x18]
 
-v_log_f32 v0, s0
-// CHECK: [0x00,0x4e,0x00,0x7e]
+v_log_f32 v5, s1
+// CHECK: [0x01,0x4e,0x0a,0x7e]
 
-v_log_f32 v255, s0
-// CHECK: [0x00,0x4e,0xfe,0x7f]
+v_log_f32 v255, s1
+// CHECK: [0x01,0x4e,0xfe,0x7f]
 
-v_log_f32 v0, s103
-// CHECK: [0x67,0x4e,0x00,0x7e]
+v_log_f32 v5, s103
+// CHECK: [0x67,0x4e,0x0a,0x7e]
 
-v_log_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x4e,0x00,0x7e]
+v_log_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x4e,0x0a,0x7e]
 
-v_log_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x4e,0x00,0x7e]
+v_log_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x4e,0x0a,0x7e]
 
-v_log_f32 v0, vcc_lo
-// CHECK: [0x6a,0x4e,0x00,0x7e]
+v_log_f32 v5, vcc_lo
+// CHECK: [0x6a,0x4e,0x0a,0x7e]
 
-v_log_f32 v0, vcc_hi
-// CHECK: [0x6b,0x4e,0x00,0x7e]
+v_log_f32 v5, vcc_hi
+// CHECK: [0x6b,0x4e,0x0a,0x7e]
 
-v_log_f32 v0, tba_lo
-// CHECK: [0x6c,0x4e,0x00,0x7e]
+v_log_f32 v5, tba_lo
+// CHECK: [0x6c,0x4e,0x0a,0x7e]
 
-v_log_f32 v0, tba_hi
-// CHECK: [0x6d,0x4e,0x00,0x7e]
+v_log_f32 v5, tba_hi
+// CHECK: [0x6d,0x4e,0x0a,0x7e]
 
-v_log_f32 v0, tma_lo
-// CHECK: [0x6e,0x4e,0x00,0x7e]
+v_log_f32 v5, tma_lo
+// CHECK: [0x6e,0x4e,0x0a,0x7e]
 
-v_log_f32 v0, tma_hi
-// CHECK: [0x6f,0x4e,0x00,0x7e]
+v_log_f32 v5, tma_hi
+// CHECK: [0x6f,0x4e,0x0a,0x7e]
 
-v_log_f32 v0, ttmp11
-// CHECK: [0x7b,0x4e,0x00,0x7e]
+v_log_f32 v5, ttmp11
+// CHECK: [0x7b,0x4e,0x0a,0x7e]
 
-v_log_f32 v0, m0
-// CHECK: [0x7c,0x4e,0x00,0x7e]
+v_log_f32 v5, m0
+// CHECK: [0x7c,0x4e,0x0a,0x7e]
 
-v_log_f32 v0, exec_lo
-// CHECK: [0x7e,0x4e,0x00,0x7e]
+v_log_f32 v5, exec_lo
+// CHECK: [0x7e,0x4e,0x0a,0x7e]
 
-v_log_f32 v0, exec_hi
-// CHECK: [0x7f,0x4e,0x00,0x7e]
+v_log_f32 v5, exec_hi
+// CHECK: [0x7f,0x4e,0x0a,0x7e]
 
-v_log_f32 v0, 0
-// CHECK: [0x80,0x4e,0x00,0x7e]
+v_log_f32 v5, 0
+// CHECK: [0x80,0x4e,0x0a,0x7e]
 
-v_log_f32 v0, -1
-// CHECK: [0xc1,0x4e,0x00,0x7e]
+v_log_f32 v5, -1
+// CHECK: [0xc1,0x4e,0x0a,0x7e]
 
-v_log_f32 v0, 0.5
-// CHECK: [0xf0,0x4e,0x00,0x7e]
+v_log_f32 v5, 0.5
+// CHECK: [0xf0,0x4e,0x0a,0x7e]
 
-v_log_f32 v0, -4.0
-// CHECK: [0xf7,0x4e,0x00,0x7e]
+v_log_f32 v5, -4.0
+// CHECK: [0xf7,0x4e,0x0a,0x7e]
 
-v_log_f32 v0, 0xaf123456
-// CHECK: [0xff,0x4e,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_log_f32 v5, 0xaf123456
+// CHECK: [0xff,0x4e,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_log_f32 v0, 0x3f717273
-// CHECK: [0xff,0x4e,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_log_f32 v5, 0x3f717273
+// CHECK: [0xff,0x4e,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_log_f32 v0, v0
-// CHECK: [0x00,0x4f,0x00,0x7e]
+v_log_f32 v5, v1
+// CHECK: [0x01,0x4f,0x0a,0x7e]
 
-v_log_f32 v0, v255
-// CHECK: [0xff,0x4f,0x00,0x7e]
+v_log_f32 v5, v255
+// CHECK: [0xff,0x4f,0x0a,0x7e]
 
-v_log_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x4e,0xd3,0x00,0x00,0x00,0x00]
+v_log_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x4e,0xd3,0x01,0x00,0x00,0x00]
 
-v_log_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x4e,0xd3,0x00,0x00,0x00,0x00]
+v_log_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x4e,0xd3,0x01,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x4e,0xd3,0x67,0x00,0x00,0x00]
+v_log_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x4e,0xd3,0x67,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x4e,0xd3,0x68,0x00,0x00,0x00]
+v_log_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x4e,0xd3,0x68,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x4e,0xd3,0x69,0x00,0x00,0x00]
+v_log_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x4e,0xd3,0x69,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x4e,0xd3,0x6a,0x00,0x00,0x00]
+v_log_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x4e,0xd3,0x6a,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x4e,0xd3,0x6b,0x00,0x00,0x00]
+v_log_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x4e,0xd3,0x6b,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x4e,0xd3,0x6c,0x00,0x00,0x00]
+v_log_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x4e,0xd3,0x6c,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x4e,0xd3,0x6d,0x00,0x00,0x00]
+v_log_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x4e,0xd3,0x6d,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x4e,0xd3,0x6e,0x00,0x00,0x00]
+v_log_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x4e,0xd3,0x6e,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x4e,0xd3,0x6f,0x00,0x00,0x00]
+v_log_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x4e,0xd3,0x6f,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x4e,0xd3,0x7b,0x00,0x00,0x00]
+v_log_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x4e,0xd3,0x7b,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x4e,0xd3,0x7c,0x00,0x00,0x00]
+v_log_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x4e,0xd3,0x7c,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x4e,0xd3,0x7e,0x00,0x00,0x00]
+v_log_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x4e,0xd3,0x7e,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x4e,0xd3,0x7f,0x00,0x00,0x00]
+v_log_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x4e,0xd3,0x7f,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, 0
-// CHECK: [0x00,0x00,0x4e,0xd3,0x80,0x00,0x00,0x00]
+v_log_f32_e64 v5, 0
+// CHECK: [0x05,0x00,0x4e,0xd3,0x80,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x4e,0xd3,0xf0,0x00,0x00,0x00]
+v_log_f32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x4e,0xd3,0xf0,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x4e,0xd3,0xfd,0x00,0x00,0x00]
+v_log_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x4e,0xd3,0xfd,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x4e,0xd3,0x00,0x01,0x00,0x00]
+v_log_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x4e,0xd3,0x01,0x01,0x00,0x00]
 
-v_log_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x4e,0xd3,0xff,0x01,0x00,0x00]
+v_log_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x4e,0xd3,0xff,0x01,0x00,0x00]
 
-v_log_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x4e,0xd3,0x00,0x00,0x00,0x20]
+v_log_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x4e,0xd3,0x01,0x00,0x00,0x20]
 
-v_log_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x4e,0xd3,0x00,0x00,0x00,0x08]
+v_log_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x4e,0xd3,0x01,0x00,0x00,0x08]
 
-v_log_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x4e,0xd3,0x00,0x00,0x00,0x10]
+v_log_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x4e,0xd3,0x01,0x00,0x00,0x10]
 
-v_log_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x4e,0xd3,0x00,0x00,0x00,0x18]
+v_log_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x4e,0xd3,0x01,0x00,0x00,0x18]
 
-v_rcp_clamp_f32 v0, s0
-// CHECK: [0x00,0x50,0x00,0x7e]
+v_rcp_clamp_f32 v5, s1
+// CHECK: [0x01,0x50,0x0a,0x7e]
 
-v_rcp_clamp_f32 v255, s0
-// CHECK: [0x00,0x50,0xfe,0x7f]
+v_rcp_clamp_f32 v255, s1
+// CHECK: [0x01,0x50,0xfe,0x7f]
 
-v_rcp_clamp_f32 v0, s103
-// CHECK: [0x67,0x50,0x00,0x7e]
+v_rcp_clamp_f32 v5, s103
+// CHECK: [0x67,0x50,0x0a,0x7e]
 
-v_rcp_clamp_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x50,0x00,0x7e]
+v_rcp_clamp_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x50,0x0a,0x7e]
 
-v_rcp_clamp_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x50,0x00,0x7e]
+v_rcp_clamp_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x50,0x0a,0x7e]
 
-v_rcp_clamp_f32 v0, vcc_lo
-// CHECK: [0x6a,0x50,0x00,0x7e]
+v_rcp_clamp_f32 v5, vcc_lo
+// CHECK: [0x6a,0x50,0x0a,0x7e]
 
-v_rcp_clamp_f32 v0, vcc_hi
-// CHECK: [0x6b,0x50,0x00,0x7e]
+v_rcp_clamp_f32 v5, vcc_hi
+// CHECK: [0x6b,0x50,0x0a,0x7e]
 
-v_rcp_clamp_f32 v0, tba_lo
-// CHECK: [0x6c,0x50,0x00,0x7e]
+v_rcp_clamp_f32 v5, tba_lo
+// CHECK: [0x6c,0x50,0x0a,0x7e]
 
-v_rcp_clamp_f32 v0, tba_hi
-// CHECK: [0x6d,0x50,0x00,0x7e]
+v_rcp_clamp_f32 v5, tba_hi
+// CHECK: [0x6d,0x50,0x0a,0x7e]
 
-v_rcp_clamp_f32 v0, tma_lo
-// CHECK: [0x6e,0x50,0x00,0x7e]
+v_rcp_clamp_f32 v5, tma_lo
+// CHECK: [0x6e,0x50,0x0a,0x7e]
 
-v_rcp_clamp_f32 v0, tma_hi
-// CHECK: [0x6f,0x50,0x00,0x7e]
+v_rcp_clamp_f32 v5, tma_hi
+// CHECK: [0x6f,0x50,0x0a,0x7e]
 
-v_rcp_clamp_f32 v0, ttmp11
-// CHECK: [0x7b,0x50,0x00,0x7e]
+v_rcp_clamp_f32 v5, ttmp11
+// CHECK: [0x7b,0x50,0x0a,0x7e]
 
-v_rcp_clamp_f32 v0, m0
-// CHECK: [0x7c,0x50,0x00,0x7e]
+v_rcp_clamp_f32 v5, m0
+// CHECK: [0x7c,0x50,0x0a,0x7e]
 
-v_rcp_clamp_f32 v0, exec_lo
-// CHECK: [0x7e,0x50,0x00,0x7e]
+v_rcp_clamp_f32 v5, exec_lo
+// CHECK: [0x7e,0x50,0x0a,0x7e]
 
-v_rcp_clamp_f32 v0, exec_hi
-// CHECK: [0x7f,0x50,0x00,0x7e]
+v_rcp_clamp_f32 v5, exec_hi
+// CHECK: [0x7f,0x50,0x0a,0x7e]
 
-v_rcp_clamp_f32 v0, 0
-// CHECK: [0x80,0x50,0x00,0x7e]
+v_rcp_clamp_f32 v5, 0
+// CHECK: [0x80,0x50,0x0a,0x7e]
 
-v_rcp_clamp_f32 v0, -1
-// CHECK: [0xc1,0x50,0x00,0x7e]
+v_rcp_clamp_f32 v5, -1
+// CHECK: [0xc1,0x50,0x0a,0x7e]
 
-v_rcp_clamp_f32 v0, 0.5
-// CHECK: [0xf0,0x50,0x00,0x7e]
+v_rcp_clamp_f32 v5, 0.5
+// CHECK: [0xf0,0x50,0x0a,0x7e]
 
-v_rcp_clamp_f32 v0, -4.0
-// CHECK: [0xf7,0x50,0x00,0x7e]
+v_rcp_clamp_f32 v5, -4.0
+// CHECK: [0xf7,0x50,0x0a,0x7e]
 
-v_rcp_clamp_f32 v0, 0xaf123456
-// CHECK: [0xff,0x50,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rcp_clamp_f32 v5, 0xaf123456
+// CHECK: [0xff,0x50,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rcp_clamp_f32 v0, 0x3f717273
-// CHECK: [0xff,0x50,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rcp_clamp_f32 v5, 0x3f717273
+// CHECK: [0xff,0x50,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rcp_clamp_f32 v0, v0
-// CHECK: [0x00,0x51,0x00,0x7e]
+v_rcp_clamp_f32 v5, v1
+// CHECK: [0x01,0x51,0x0a,0x7e]
 
-v_rcp_clamp_f32 v0, v255
-// CHECK: [0xff,0x51,0x00,0x7e]
+v_rcp_clamp_f32 v5, v255
+// CHECK: [0xff,0x51,0x0a,0x7e]
 
-v_rcp_clamp_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x50,0xd3,0x00,0x00,0x00,0x00]
+v_rcp_clamp_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x50,0xd3,0x01,0x00,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x50,0xd3,0x00,0x00,0x00,0x00]
+v_rcp_clamp_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x50,0xd3,0x01,0x00,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x50,0xd3,0x67,0x00,0x00,0x00]
+v_rcp_clamp_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x50,0xd3,0x67,0x00,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x50,0xd3,0x68,0x00,0x00,0x00]
+v_rcp_clamp_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x50,0xd3,0x68,0x00,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x50,0xd3,0x69,0x00,0x00,0x00]
+v_rcp_clamp_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x50,0xd3,0x69,0x00,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x50,0xd3,0x6a,0x00,0x00,0x00]
+v_rcp_clamp_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x50,0xd3,0x6a,0x00,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x50,0xd3,0x6b,0x00,0x00,0x00]
+v_rcp_clamp_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x50,0xd3,0x6b,0x00,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x50,0xd3,0x6c,0x00,0x00,0x00]
+v_rcp_clamp_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x50,0xd3,0x6c,0x00,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x50,0xd3,0x6d,0x00,0x00,0x00]
+v_rcp_clamp_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x50,0xd3,0x6d,0x00,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x50,0xd3,0x6e,0x00,0x00,0x00]
+v_rcp_clamp_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x50,0xd3,0x6e,0x00,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x50,0xd3,0x6f,0x00,0x00,0x00]
+v_rcp_clamp_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x50,0xd3,0x6f,0x00,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x50,0xd3,0x7b,0x00,0x00,0x00]
+v_rcp_clamp_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x50,0xd3,0x7b,0x00,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x50,0xd3,0x7c,0x00,0x00,0x00]
+v_rcp_clamp_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x50,0xd3,0x7c,0x00,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x50,0xd3,0x7e,0x00,0x00,0x00]
+v_rcp_clamp_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x50,0xd3,0x7e,0x00,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x50,0xd3,0x7f,0x00,0x00,0x00]
+v_rcp_clamp_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x50,0xd3,0x7f,0x00,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v0, 0
-// CHECK: [0x00,0x00,0x50,0xd3,0x80,0x00,0x00,0x00]
+v_rcp_clamp_f32_e64 v5, 0
+// CHECK: [0x05,0x00,0x50,0xd3,0x80,0x00,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x50,0xd3,0xf0,0x00,0x00,0x00]
+v_rcp_clamp_f32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x50,0xd3,0xf0,0x00,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x50,0xd3,0xfd,0x00,0x00,0x00]
+v_rcp_clamp_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x50,0xd3,0xfd,0x00,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x50,0xd3,0x00,0x01,0x00,0x00]
+v_rcp_clamp_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x50,0xd3,0x01,0x01,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x50,0xd3,0xff,0x01,0x00,0x00]
+v_rcp_clamp_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x50,0xd3,0xff,0x01,0x00,0x00]
 
-v_rcp_clamp_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x50,0xd3,0x00,0x00,0x00,0x20]
+v_rcp_clamp_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x50,0xd3,0x01,0x00,0x00,0x20]
 
-v_rcp_clamp_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x50,0xd3,0x00,0x00,0x00,0x08]
+v_rcp_clamp_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x50,0xd3,0x01,0x00,0x00,0x08]
 
-v_rcp_clamp_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x50,0xd3,0x00,0x00,0x00,0x10]
+v_rcp_clamp_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x50,0xd3,0x01,0x00,0x00,0x10]
 
-v_rcp_clamp_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x50,0xd3,0x00,0x00,0x00,0x18]
+v_rcp_clamp_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x50,0xd3,0x01,0x00,0x00,0x18]
 
-v_rcp_legacy_f32 v0, s0
-// CHECK: [0x00,0x52,0x00,0x7e]
+v_rcp_legacy_f32 v5, s1
+// CHECK: [0x01,0x52,0x0a,0x7e]
 
-v_rcp_legacy_f32 v255, s0
-// CHECK: [0x00,0x52,0xfe,0x7f]
+v_rcp_legacy_f32 v255, s1
+// CHECK: [0x01,0x52,0xfe,0x7f]
 
-v_rcp_legacy_f32 v0, s103
-// CHECK: [0x67,0x52,0x00,0x7e]
+v_rcp_legacy_f32 v5, s103
+// CHECK: [0x67,0x52,0x0a,0x7e]
 
-v_rcp_legacy_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x52,0x00,0x7e]
+v_rcp_legacy_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x52,0x0a,0x7e]
 
-v_rcp_legacy_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x52,0x00,0x7e]
+v_rcp_legacy_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x52,0x0a,0x7e]
 
-v_rcp_legacy_f32 v0, vcc_lo
-// CHECK: [0x6a,0x52,0x00,0x7e]
+v_rcp_legacy_f32 v5, vcc_lo
+// CHECK: [0x6a,0x52,0x0a,0x7e]
 
-v_rcp_legacy_f32 v0, vcc_hi
-// CHECK: [0x6b,0x52,0x00,0x7e]
+v_rcp_legacy_f32 v5, vcc_hi
+// CHECK: [0x6b,0x52,0x0a,0x7e]
 
-v_rcp_legacy_f32 v0, tba_lo
-// CHECK: [0x6c,0x52,0x00,0x7e]
+v_rcp_legacy_f32 v5, tba_lo
+// CHECK: [0x6c,0x52,0x0a,0x7e]
 
-v_rcp_legacy_f32 v0, tba_hi
-// CHECK: [0x6d,0x52,0x00,0x7e]
+v_rcp_legacy_f32 v5, tba_hi
+// CHECK: [0x6d,0x52,0x0a,0x7e]
 
-v_rcp_legacy_f32 v0, tma_lo
-// CHECK: [0x6e,0x52,0x00,0x7e]
+v_rcp_legacy_f32 v5, tma_lo
+// CHECK: [0x6e,0x52,0x0a,0x7e]
 
-v_rcp_legacy_f32 v0, tma_hi
-// CHECK: [0x6f,0x52,0x00,0x7e]
+v_rcp_legacy_f32 v5, tma_hi
+// CHECK: [0x6f,0x52,0x0a,0x7e]
 
-v_rcp_legacy_f32 v0, ttmp11
-// CHECK: [0x7b,0x52,0x00,0x7e]
+v_rcp_legacy_f32 v5, ttmp11
+// CHECK: [0x7b,0x52,0x0a,0x7e]
 
-v_rcp_legacy_f32 v0, m0
-// CHECK: [0x7c,0x52,0x00,0x7e]
+v_rcp_legacy_f32 v5, m0
+// CHECK: [0x7c,0x52,0x0a,0x7e]
 
-v_rcp_legacy_f32 v0, exec_lo
-// CHECK: [0x7e,0x52,0x00,0x7e]
+v_rcp_legacy_f32 v5, exec_lo
+// CHECK: [0x7e,0x52,0x0a,0x7e]
 
-v_rcp_legacy_f32 v0, exec_hi
-// CHECK: [0x7f,0x52,0x00,0x7e]
+v_rcp_legacy_f32 v5, exec_hi
+// CHECK: [0x7f,0x52,0x0a,0x7e]
 
-v_rcp_legacy_f32 v0, 0
-// CHECK: [0x80,0x52,0x00,0x7e]
+v_rcp_legacy_f32 v5, 0
+// CHECK: [0x80,0x52,0x0a,0x7e]
 
-v_rcp_legacy_f32 v0, -1
-// CHECK: [0xc1,0x52,0x00,0x7e]
+v_rcp_legacy_f32 v5, -1
+// CHECK: [0xc1,0x52,0x0a,0x7e]
 
-v_rcp_legacy_f32 v0, 0.5
-// CHECK: [0xf0,0x52,0x00,0x7e]
+v_rcp_legacy_f32 v5, 0.5
+// CHECK: [0xf0,0x52,0x0a,0x7e]
 
-v_rcp_legacy_f32 v0, -4.0
-// CHECK: [0xf7,0x52,0x00,0x7e]
+v_rcp_legacy_f32 v5, -4.0
+// CHECK: [0xf7,0x52,0x0a,0x7e]
 
-v_rcp_legacy_f32 v0, 0xaf123456
-// CHECK: [0xff,0x52,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rcp_legacy_f32 v5, 0xaf123456
+// CHECK: [0xff,0x52,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rcp_legacy_f32 v0, 0x3f717273
-// CHECK: [0xff,0x52,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rcp_legacy_f32 v5, 0x3f717273
+// CHECK: [0xff,0x52,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rcp_legacy_f32 v0, v0
-// CHECK: [0x00,0x53,0x00,0x7e]
+v_rcp_legacy_f32 v5, v1
+// CHECK: [0x01,0x53,0x0a,0x7e]
 
-v_rcp_legacy_f32 v0, v255
-// CHECK: [0xff,0x53,0x00,0x7e]
+v_rcp_legacy_f32 v5, v255
+// CHECK: [0xff,0x53,0x0a,0x7e]
 
-v_rcp_legacy_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x52,0xd3,0x00,0x00,0x00,0x00]
+v_rcp_legacy_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x52,0xd3,0x01,0x00,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x52,0xd3,0x00,0x00,0x00,0x00]
+v_rcp_legacy_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x52,0xd3,0x01,0x00,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x52,0xd3,0x67,0x00,0x00,0x00]
+v_rcp_legacy_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x52,0xd3,0x67,0x00,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x52,0xd3,0x68,0x00,0x00,0x00]
+v_rcp_legacy_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x52,0xd3,0x68,0x00,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x52,0xd3,0x69,0x00,0x00,0x00]
+v_rcp_legacy_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x52,0xd3,0x69,0x00,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x52,0xd3,0x6a,0x00,0x00,0x00]
+v_rcp_legacy_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x52,0xd3,0x6a,0x00,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x52,0xd3,0x6b,0x00,0x00,0x00]
+v_rcp_legacy_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x52,0xd3,0x6b,0x00,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x52,0xd3,0x6c,0x00,0x00,0x00]
+v_rcp_legacy_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x52,0xd3,0x6c,0x00,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x52,0xd3,0x6d,0x00,0x00,0x00]
+v_rcp_legacy_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x52,0xd3,0x6d,0x00,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x52,0xd3,0x6e,0x00,0x00,0x00]
+v_rcp_legacy_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x52,0xd3,0x6e,0x00,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x52,0xd3,0x6f,0x00,0x00,0x00]
+v_rcp_legacy_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x52,0xd3,0x6f,0x00,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x52,0xd3,0x7b,0x00,0x00,0x00]
+v_rcp_legacy_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x52,0xd3,0x7b,0x00,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x52,0xd3,0x7c,0x00,0x00,0x00]
+v_rcp_legacy_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x52,0xd3,0x7c,0x00,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x52,0xd3,0x7e,0x00,0x00,0x00]
+v_rcp_legacy_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x52,0xd3,0x7e,0x00,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x52,0xd3,0x7f,0x00,0x00,0x00]
+v_rcp_legacy_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x52,0xd3,0x7f,0x00,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v0, 0
-// CHECK: [0x00,0x00,0x52,0xd3,0x80,0x00,0x00,0x00]
+v_rcp_legacy_f32_e64 v5, 0
+// CHECK: [0x05,0x00,0x52,0xd3,0x80,0x00,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x52,0xd3,0xf0,0x00,0x00,0x00]
+v_rcp_legacy_f32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x52,0xd3,0xf0,0x00,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x52,0xd3,0xfd,0x00,0x00,0x00]
+v_rcp_legacy_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x52,0xd3,0xfd,0x00,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x52,0xd3,0x00,0x01,0x00,0x00]
+v_rcp_legacy_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x52,0xd3,0x01,0x01,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x52,0xd3,0xff,0x01,0x00,0x00]
+v_rcp_legacy_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x52,0xd3,0xff,0x01,0x00,0x00]
 
-v_rcp_legacy_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x52,0xd3,0x00,0x00,0x00,0x20]
+v_rcp_legacy_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x52,0xd3,0x01,0x00,0x00,0x20]
 
-v_rcp_legacy_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x52,0xd3,0x00,0x00,0x00,0x08]
+v_rcp_legacy_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x52,0xd3,0x01,0x00,0x00,0x08]
 
-v_rcp_legacy_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x52,0xd3,0x00,0x00,0x00,0x10]
+v_rcp_legacy_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x52,0xd3,0x01,0x00,0x00,0x10]
 
-v_rcp_legacy_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x52,0xd3,0x00,0x00,0x00,0x18]
+v_rcp_legacy_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x52,0xd3,0x01,0x00,0x00,0x18]
 
-v_rcp_f32 v0, s0
-// CHECK: [0x00,0x54,0x00,0x7e]
+v_rcp_f32 v5, s1
+// CHECK: [0x01,0x54,0x0a,0x7e]
 
-v_rcp_f32 v255, s0
-// CHECK: [0x00,0x54,0xfe,0x7f]
+v_rcp_f32 v255, s1
+// CHECK: [0x01,0x54,0xfe,0x7f]
 
-v_rcp_f32 v0, s103
-// CHECK: [0x67,0x54,0x00,0x7e]
+v_rcp_f32 v5, s103
+// CHECK: [0x67,0x54,0x0a,0x7e]
 
-v_rcp_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x54,0x00,0x7e]
+v_rcp_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x54,0x0a,0x7e]
 
-v_rcp_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x54,0x00,0x7e]
+v_rcp_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x54,0x0a,0x7e]
 
-v_rcp_f32 v0, vcc_lo
-// CHECK: [0x6a,0x54,0x00,0x7e]
+v_rcp_f32 v5, vcc_lo
+// CHECK: [0x6a,0x54,0x0a,0x7e]
 
-v_rcp_f32 v0, vcc_hi
-// CHECK: [0x6b,0x54,0x00,0x7e]
+v_rcp_f32 v5, vcc_hi
+// CHECK: [0x6b,0x54,0x0a,0x7e]
 
-v_rcp_f32 v0, tba_lo
-// CHECK: [0x6c,0x54,0x00,0x7e]
+v_rcp_f32 v5, tba_lo
+// CHECK: [0x6c,0x54,0x0a,0x7e]
 
-v_rcp_f32 v0, tba_hi
-// CHECK: [0x6d,0x54,0x00,0x7e]
+v_rcp_f32 v5, tba_hi
+// CHECK: [0x6d,0x54,0x0a,0x7e]
 
-v_rcp_f32 v0, tma_lo
-// CHECK: [0x6e,0x54,0x00,0x7e]
+v_rcp_f32 v5, tma_lo
+// CHECK: [0x6e,0x54,0x0a,0x7e]
 
-v_rcp_f32 v0, tma_hi
-// CHECK: [0x6f,0x54,0x00,0x7e]
+v_rcp_f32 v5, tma_hi
+// CHECK: [0x6f,0x54,0x0a,0x7e]
 
-v_rcp_f32 v0, ttmp11
-// CHECK: [0x7b,0x54,0x00,0x7e]
+v_rcp_f32 v5, ttmp11
+// CHECK: [0x7b,0x54,0x0a,0x7e]
 
-v_rcp_f32 v0, m0
-// CHECK: [0x7c,0x54,0x00,0x7e]
+v_rcp_f32 v5, m0
+// CHECK: [0x7c,0x54,0x0a,0x7e]
 
-v_rcp_f32 v0, exec_lo
-// CHECK: [0x7e,0x54,0x00,0x7e]
+v_rcp_f32 v5, exec_lo
+// CHECK: [0x7e,0x54,0x0a,0x7e]
 
-v_rcp_f32 v0, exec_hi
-// CHECK: [0x7f,0x54,0x00,0x7e]
+v_rcp_f32 v5, exec_hi
+// CHECK: [0x7f,0x54,0x0a,0x7e]
 
-v_rcp_f32 v0, 0
-// CHECK: [0x80,0x54,0x00,0x7e]
+v_rcp_f32 v5, 0
+// CHECK: [0x80,0x54,0x0a,0x7e]
 
-v_rcp_f32 v0, -1
-// CHECK: [0xc1,0x54,0x00,0x7e]
+v_rcp_f32 v5, -1
+// CHECK: [0xc1,0x54,0x0a,0x7e]
 
-v_rcp_f32 v0, 0.5
-// CHECK: [0xf0,0x54,0x00,0x7e]
+v_rcp_f32 v5, 0.5
+// CHECK: [0xf0,0x54,0x0a,0x7e]
 
-v_rcp_f32 v0, -4.0
-// CHECK: [0xf7,0x54,0x00,0x7e]
+v_rcp_f32 v5, -4.0
+// CHECK: [0xf7,0x54,0x0a,0x7e]
 
-v_rcp_f32 v0, 0xaf123456
-// CHECK: [0xff,0x54,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rcp_f32 v5, 0xaf123456
+// CHECK: [0xff,0x54,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rcp_f32 v0, 0x3f717273
-// CHECK: [0xff,0x54,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rcp_f32 v5, 0x3f717273
+// CHECK: [0xff,0x54,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rcp_f32 v0, v0
-// CHECK: [0x00,0x55,0x00,0x7e]
+v_rcp_f32 v5, v1
+// CHECK: [0x01,0x55,0x0a,0x7e]
 
-v_rcp_f32 v0, v255
-// CHECK: [0xff,0x55,0x00,0x7e]
+v_rcp_f32 v5, v255
+// CHECK: [0xff,0x55,0x0a,0x7e]
 
-v_rcp_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x54,0xd3,0x00,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x54,0xd3,0x01,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x54,0xd3,0x00,0x00,0x00,0x00]
+v_rcp_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x54,0xd3,0x01,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x54,0xd3,0x67,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x54,0xd3,0x67,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x54,0xd3,0x68,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x54,0xd3,0x68,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x54,0xd3,0x69,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x54,0xd3,0x69,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x54,0xd3,0x6a,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x54,0xd3,0x6a,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x54,0xd3,0x6b,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x54,0xd3,0x6b,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x54,0xd3,0x6c,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x54,0xd3,0x6c,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x54,0xd3,0x6d,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x54,0xd3,0x6d,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x54,0xd3,0x6e,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x54,0xd3,0x6e,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x54,0xd3,0x6f,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x54,0xd3,0x6f,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x54,0xd3,0x7b,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x54,0xd3,0x7b,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x54,0xd3,0x7c,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x54,0xd3,0x7c,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x54,0xd3,0x7e,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x54,0xd3,0x7e,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x54,0xd3,0x7f,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x54,0xd3,0x7f,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, 0
-// CHECK: [0x00,0x00,0x54,0xd3,0x80,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, 0
+// CHECK: [0x05,0x00,0x54,0xd3,0x80,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x54,0xd3,0xf0,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x54,0xd3,0xf0,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x54,0xd3,0xfd,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x54,0xd3,0xfd,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x54,0xd3,0x00,0x01,0x00,0x00]
+v_rcp_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x54,0xd3,0x01,0x01,0x00,0x00]
 
-v_rcp_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x54,0xd3,0xff,0x01,0x00,0x00]
+v_rcp_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x54,0xd3,0xff,0x01,0x00,0x00]
 
-v_rcp_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x54,0xd3,0x00,0x00,0x00,0x20]
+v_rcp_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x54,0xd3,0x01,0x00,0x00,0x20]
 
-v_rcp_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x54,0xd3,0x00,0x00,0x00,0x08]
+v_rcp_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x54,0xd3,0x01,0x00,0x00,0x08]
 
-v_rcp_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x54,0xd3,0x00,0x00,0x00,0x10]
+v_rcp_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x54,0xd3,0x01,0x00,0x00,0x10]
 
-v_rcp_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x54,0xd3,0x00,0x00,0x00,0x18]
+v_rcp_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x54,0xd3,0x01,0x00,0x00,0x18]
 
-v_rcp_iflag_f32 v0, s0
-// CHECK: [0x00,0x56,0x00,0x7e]
+v_rcp_iflag_f32 v5, s1
+// CHECK: [0x01,0x56,0x0a,0x7e]
 
-v_rcp_iflag_f32 v255, s0
-// CHECK: [0x00,0x56,0xfe,0x7f]
+v_rcp_iflag_f32 v255, s1
+// CHECK: [0x01,0x56,0xfe,0x7f]
 
-v_rcp_iflag_f32 v0, s103
-// CHECK: [0x67,0x56,0x00,0x7e]
+v_rcp_iflag_f32 v5, s103
+// CHECK: [0x67,0x56,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x56,0x00,0x7e]
+v_rcp_iflag_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x56,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x56,0x00,0x7e]
+v_rcp_iflag_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x56,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, vcc_lo
-// CHECK: [0x6a,0x56,0x00,0x7e]
+v_rcp_iflag_f32 v5, vcc_lo
+// CHECK: [0x6a,0x56,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, vcc_hi
-// CHECK: [0x6b,0x56,0x00,0x7e]
+v_rcp_iflag_f32 v5, vcc_hi
+// CHECK: [0x6b,0x56,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, tba_lo
-// CHECK: [0x6c,0x56,0x00,0x7e]
+v_rcp_iflag_f32 v5, tba_lo
+// CHECK: [0x6c,0x56,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, tba_hi
-// CHECK: [0x6d,0x56,0x00,0x7e]
+v_rcp_iflag_f32 v5, tba_hi
+// CHECK: [0x6d,0x56,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, tma_lo
-// CHECK: [0x6e,0x56,0x00,0x7e]
+v_rcp_iflag_f32 v5, tma_lo
+// CHECK: [0x6e,0x56,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, tma_hi
-// CHECK: [0x6f,0x56,0x00,0x7e]
+v_rcp_iflag_f32 v5, tma_hi
+// CHECK: [0x6f,0x56,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, ttmp11
-// CHECK: [0x7b,0x56,0x00,0x7e]
+v_rcp_iflag_f32 v5, ttmp11
+// CHECK: [0x7b,0x56,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, m0
-// CHECK: [0x7c,0x56,0x00,0x7e]
+v_rcp_iflag_f32 v5, m0
+// CHECK: [0x7c,0x56,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, exec_lo
-// CHECK: [0x7e,0x56,0x00,0x7e]
+v_rcp_iflag_f32 v5, exec_lo
+// CHECK: [0x7e,0x56,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, exec_hi
-// CHECK: [0x7f,0x56,0x00,0x7e]
+v_rcp_iflag_f32 v5, exec_hi
+// CHECK: [0x7f,0x56,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, 0
-// CHECK: [0x80,0x56,0x00,0x7e]
+v_rcp_iflag_f32 v5, 0
+// CHECK: [0x80,0x56,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, -1
-// CHECK: [0xc1,0x56,0x00,0x7e]
+v_rcp_iflag_f32 v5, -1
+// CHECK: [0xc1,0x56,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, 0.5
-// CHECK: [0xf0,0x56,0x00,0x7e]
+v_rcp_iflag_f32 v5, 0.5
+// CHECK: [0xf0,0x56,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, -4.0
-// CHECK: [0xf7,0x56,0x00,0x7e]
+v_rcp_iflag_f32 v5, -4.0
+// CHECK: [0xf7,0x56,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, 0xaf123456
-// CHECK: [0xff,0x56,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rcp_iflag_f32 v5, 0xaf123456
+// CHECK: [0xff,0x56,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rcp_iflag_f32 v0, 0x3f717273
-// CHECK: [0xff,0x56,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rcp_iflag_f32 v5, 0x3f717273
+// CHECK: [0xff,0x56,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rcp_iflag_f32 v0, v0
-// CHECK: [0x00,0x57,0x00,0x7e]
+v_rcp_iflag_f32 v5, v1
+// CHECK: [0x01,0x57,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, v255
-// CHECK: [0xff,0x57,0x00,0x7e]
+v_rcp_iflag_f32 v5, v255
+// CHECK: [0xff,0x57,0x0a,0x7e]
 
-v_rcp_iflag_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x56,0xd3,0x00,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x56,0xd3,0x01,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x56,0xd3,0x00,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x56,0xd3,0x01,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x56,0xd3,0x67,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x56,0xd3,0x67,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x56,0xd3,0x68,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x56,0xd3,0x68,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x56,0xd3,0x69,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x56,0xd3,0x69,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x56,0xd3,0x6a,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x56,0xd3,0x6a,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x56,0xd3,0x6b,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x56,0xd3,0x6b,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x56,0xd3,0x6c,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x56,0xd3,0x6c,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x56,0xd3,0x6d,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x56,0xd3,0x6d,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x56,0xd3,0x6e,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x56,0xd3,0x6e,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x56,0xd3,0x6f,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x56,0xd3,0x6f,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x56,0xd3,0x7b,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x56,0xd3,0x7b,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x56,0xd3,0x7c,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x56,0xd3,0x7c,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x56,0xd3,0x7e,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x56,0xd3,0x7e,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x56,0xd3,0x7f,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x56,0xd3,0x7f,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x56,0xd3,0xfd,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x56,0xd3,0xfd,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x56,0xd3,0x00,0x01,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x56,0xd3,0x01,0x01,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x56,0xd3,0xff,0x01,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x56,0xd3,0xff,0x01,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x56,0xd3,0x00,0x00,0x00,0x20]
+v_rcp_iflag_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x56,0xd3,0x01,0x00,0x00,0x20]
 
-v_rcp_iflag_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x56,0xd3,0x00,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x56,0xd3,0x01,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x08,0x56,0xd3,0x00,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x08,0x56,0xd3,0x01,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x56,0xd3,0x00,0x00,0x00,0x08]
+v_rcp_iflag_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x56,0xd3,0x01,0x00,0x00,0x08]
 
-v_rcp_iflag_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x56,0xd3,0x00,0x00,0x00,0x10]
+v_rcp_iflag_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x56,0xd3,0x01,0x00,0x00,0x10]
 
-v_rcp_iflag_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x56,0xd3,0x00,0x00,0x00,0x18]
+v_rcp_iflag_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x56,0xd3,0x01,0x00,0x00,0x18]
 
-v_rsq_clamp_f32 v0, s0
-// CHECK: [0x00,0x58,0x00,0x7e]
+v_rsq_clamp_f32 v5, s1
+// CHECK: [0x01,0x58,0x0a,0x7e]
 
-v_rsq_clamp_f32 v255, s0
-// CHECK: [0x00,0x58,0xfe,0x7f]
+v_rsq_clamp_f32 v255, s1
+// CHECK: [0x01,0x58,0xfe,0x7f]
 
-v_rsq_clamp_f32 v0, s103
-// CHECK: [0x67,0x58,0x00,0x7e]
+v_rsq_clamp_f32 v5, s103
+// CHECK: [0x67,0x58,0x0a,0x7e]
 
-v_rsq_clamp_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x58,0x00,0x7e]
+v_rsq_clamp_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x58,0x0a,0x7e]
 
-v_rsq_clamp_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x58,0x00,0x7e]
+v_rsq_clamp_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x58,0x0a,0x7e]
 
-v_rsq_clamp_f32 v0, vcc_lo
-// CHECK: [0x6a,0x58,0x00,0x7e]
+v_rsq_clamp_f32 v5, vcc_lo
+// CHECK: [0x6a,0x58,0x0a,0x7e]
 
-v_rsq_clamp_f32 v0, vcc_hi
-// CHECK: [0x6b,0x58,0x00,0x7e]
+v_rsq_clamp_f32 v5, vcc_hi
+// CHECK: [0x6b,0x58,0x0a,0x7e]
 
-v_rsq_clamp_f32 v0, tba_lo
-// CHECK: [0x6c,0x58,0x00,0x7e]
+v_rsq_clamp_f32 v5, tba_lo
+// CHECK: [0x6c,0x58,0x0a,0x7e]
 
-v_rsq_clamp_f32 v0, tba_hi
-// CHECK: [0x6d,0x58,0x00,0x7e]
+v_rsq_clamp_f32 v5, tba_hi
+// CHECK: [0x6d,0x58,0x0a,0x7e]
 
-v_rsq_clamp_f32 v0, tma_lo
-// CHECK: [0x6e,0x58,0x00,0x7e]
+v_rsq_clamp_f32 v5, tma_lo
+// CHECK: [0x6e,0x58,0x0a,0x7e]
 
-v_rsq_clamp_f32 v0, tma_hi
-// CHECK: [0x6f,0x58,0x00,0x7e]
+v_rsq_clamp_f32 v5, tma_hi
+// CHECK: [0x6f,0x58,0x0a,0x7e]
 
-v_rsq_clamp_f32 v0, ttmp11
-// CHECK: [0x7b,0x58,0x00,0x7e]
+v_rsq_clamp_f32 v5, ttmp11
+// CHECK: [0x7b,0x58,0x0a,0x7e]
 
-v_rsq_clamp_f32 v0, m0
-// CHECK: [0x7c,0x58,0x00,0x7e]
+v_rsq_clamp_f32 v5, m0
+// CHECK: [0x7c,0x58,0x0a,0x7e]
 
-v_rsq_clamp_f32 v0, exec_lo
-// CHECK: [0x7e,0x58,0x00,0x7e]
+v_rsq_clamp_f32 v5, exec_lo
+// CHECK: [0x7e,0x58,0x0a,0x7e]
 
-v_rsq_clamp_f32 v0, exec_hi
-// CHECK: [0x7f,0x58,0x00,0x7e]
+v_rsq_clamp_f32 v5, exec_hi
+// CHECK: [0x7f,0x58,0x0a,0x7e]
 
-v_rsq_clamp_f32 v0, 0
-// CHECK: [0x80,0x58,0x00,0x7e]
+v_rsq_clamp_f32 v5, 0
+// CHECK: [0x80,0x58,0x0a,0x7e]
 
-v_rsq_clamp_f32 v0, -1
-// CHECK: [0xc1,0x58,0x00,0x7e]
+v_rsq_clamp_f32 v5, -1
+// CHECK: [0xc1,0x58,0x0a,0x7e]
 
-v_rsq_clamp_f32 v0, 0.5
-// CHECK: [0xf0,0x58,0x00,0x7e]
+v_rsq_clamp_f32 v5, 0.5
+// CHECK: [0xf0,0x58,0x0a,0x7e]
 
-v_rsq_clamp_f32 v0, -4.0
-// CHECK: [0xf7,0x58,0x00,0x7e]
+v_rsq_clamp_f32 v5, -4.0
+// CHECK: [0xf7,0x58,0x0a,0x7e]
 
-v_rsq_clamp_f32 v0, 0xaf123456
-// CHECK: [0xff,0x58,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rsq_clamp_f32 v5, 0xaf123456
+// CHECK: [0xff,0x58,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rsq_clamp_f32 v0, 0x3f717273
-// CHECK: [0xff,0x58,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rsq_clamp_f32 v5, 0x3f717273
+// CHECK: [0xff,0x58,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rsq_clamp_f32 v0, v0
-// CHECK: [0x00,0x59,0x00,0x7e]
+v_rsq_clamp_f32 v5, v1
+// CHECK: [0x01,0x59,0x0a,0x7e]
 
-v_rsq_clamp_f32 v0, v255
-// CHECK: [0xff,0x59,0x00,0x7e]
+v_rsq_clamp_f32 v5, v255
+// CHECK: [0xff,0x59,0x0a,0x7e]
 
-v_rsq_clamp_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x58,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_clamp_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x58,0xd3,0x01,0x00,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x58,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_clamp_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x58,0xd3,0x01,0x00,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x58,0xd3,0x67,0x00,0x00,0x00]
+v_rsq_clamp_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x58,0xd3,0x67,0x00,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x58,0xd3,0x68,0x00,0x00,0x00]
+v_rsq_clamp_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x58,0xd3,0x68,0x00,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x58,0xd3,0x69,0x00,0x00,0x00]
+v_rsq_clamp_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x58,0xd3,0x69,0x00,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x58,0xd3,0x6a,0x00,0x00,0x00]
+v_rsq_clamp_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x58,0xd3,0x6a,0x00,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x58,0xd3,0x6b,0x00,0x00,0x00]
+v_rsq_clamp_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x58,0xd3,0x6b,0x00,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x58,0xd3,0x6c,0x00,0x00,0x00]
+v_rsq_clamp_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x58,0xd3,0x6c,0x00,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x58,0xd3,0x6d,0x00,0x00,0x00]
+v_rsq_clamp_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x58,0xd3,0x6d,0x00,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x58,0xd3,0x6e,0x00,0x00,0x00]
+v_rsq_clamp_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x58,0xd3,0x6e,0x00,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x58,0xd3,0x6f,0x00,0x00,0x00]
+v_rsq_clamp_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x58,0xd3,0x6f,0x00,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x58,0xd3,0x7b,0x00,0x00,0x00]
+v_rsq_clamp_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x58,0xd3,0x7b,0x00,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x58,0xd3,0x7c,0x00,0x00,0x00]
+v_rsq_clamp_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x58,0xd3,0x7c,0x00,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x58,0xd3,0x7e,0x00,0x00,0x00]
+v_rsq_clamp_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x58,0xd3,0x7e,0x00,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x58,0xd3,0x7f,0x00,0x00,0x00]
+v_rsq_clamp_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x58,0xd3,0x7f,0x00,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x58,0xd3,0xfd,0x00,0x00,0x00]
+v_rsq_clamp_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x58,0xd3,0xfd,0x00,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x58,0xd3,0x00,0x01,0x00,0x00]
+v_rsq_clamp_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x58,0xd3,0x01,0x01,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x58,0xd3,0xff,0x01,0x00,0x00]
+v_rsq_clamp_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x58,0xd3,0xff,0x01,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x58,0xd3,0x00,0x00,0x00,0x20]
+v_rsq_clamp_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x58,0xd3,0x01,0x00,0x00,0x20]
 
-v_rsq_clamp_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x58,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_clamp_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x58,0xd3,0x01,0x00,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x08,0x58,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_clamp_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x08,0x58,0xd3,0x01,0x00,0x00,0x00]
 
-v_rsq_clamp_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x58,0xd3,0x00,0x00,0x00,0x08]
+v_rsq_clamp_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x58,0xd3,0x01,0x00,0x00,0x08]
 
-v_rsq_clamp_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x58,0xd3,0x00,0x00,0x00,0x10]
+v_rsq_clamp_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x58,0xd3,0x01,0x00,0x00,0x10]
 
-v_rsq_clamp_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x58,0xd3,0x00,0x00,0x00,0x18]
+v_rsq_clamp_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x58,0xd3,0x01,0x00,0x00,0x18]
 
-v_rsq_legacy_f32 v0, s0
-// CHECK: [0x00,0x5a,0x00,0x7e]
+v_rsq_legacy_f32 v5, s1
+// CHECK: [0x01,0x5a,0x0a,0x7e]
 
-v_rsq_legacy_f32 v255, s0
-// CHECK: [0x00,0x5a,0xfe,0x7f]
+v_rsq_legacy_f32 v255, s1
+// CHECK: [0x01,0x5a,0xfe,0x7f]
 
-v_rsq_legacy_f32 v0, s103
-// CHECK: [0x67,0x5a,0x00,0x7e]
+v_rsq_legacy_f32 v5, s103
+// CHECK: [0x67,0x5a,0x0a,0x7e]
 
-v_rsq_legacy_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x5a,0x00,0x7e]
+v_rsq_legacy_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x5a,0x0a,0x7e]
 
-v_rsq_legacy_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x5a,0x00,0x7e]
+v_rsq_legacy_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x5a,0x0a,0x7e]
 
-v_rsq_legacy_f32 v0, vcc_lo
-// CHECK: [0x6a,0x5a,0x00,0x7e]
+v_rsq_legacy_f32 v5, vcc_lo
+// CHECK: [0x6a,0x5a,0x0a,0x7e]
 
-v_rsq_legacy_f32 v0, vcc_hi
-// CHECK: [0x6b,0x5a,0x00,0x7e]
+v_rsq_legacy_f32 v5, vcc_hi
+// CHECK: [0x6b,0x5a,0x0a,0x7e]
 
-v_rsq_legacy_f32 v0, tba_lo
-// CHECK: [0x6c,0x5a,0x00,0x7e]
+v_rsq_legacy_f32 v5, tba_lo
+// CHECK: [0x6c,0x5a,0x0a,0x7e]
 
-v_rsq_legacy_f32 v0, tba_hi
-// CHECK: [0x6d,0x5a,0x00,0x7e]
+v_rsq_legacy_f32 v5, tba_hi
+// CHECK: [0x6d,0x5a,0x0a,0x7e]
 
-v_rsq_legacy_f32 v0, tma_lo
-// CHECK: [0x6e,0x5a,0x00,0x7e]
+v_rsq_legacy_f32 v5, tma_lo
+// CHECK: [0x6e,0x5a,0x0a,0x7e]
 
-v_rsq_legacy_f32 v0, tma_hi
-// CHECK: [0x6f,0x5a,0x00,0x7e]
+v_rsq_legacy_f32 v5, tma_hi
+// CHECK: [0x6f,0x5a,0x0a,0x7e]
 
-v_rsq_legacy_f32 v0, ttmp11
-// CHECK: [0x7b,0x5a,0x00,0x7e]
+v_rsq_legacy_f32 v5, ttmp11
+// CHECK: [0x7b,0x5a,0x0a,0x7e]
 
-v_rsq_legacy_f32 v0, m0
-// CHECK: [0x7c,0x5a,0x00,0x7e]
+v_rsq_legacy_f32 v5, m0
+// CHECK: [0x7c,0x5a,0x0a,0x7e]
 
-v_rsq_legacy_f32 v0, exec_lo
-// CHECK: [0x7e,0x5a,0x00,0x7e]
+v_rsq_legacy_f32 v5, exec_lo
+// CHECK: [0x7e,0x5a,0x0a,0x7e]
 
-v_rsq_legacy_f32 v0, exec_hi
-// CHECK: [0x7f,0x5a,0x00,0x7e]
+v_rsq_legacy_f32 v5, exec_hi
+// CHECK: [0x7f,0x5a,0x0a,0x7e]
 
-v_rsq_legacy_f32 v0, 0
-// CHECK: [0x80,0x5a,0x00,0x7e]
+v_rsq_legacy_f32 v5, 0
+// CHECK: [0x80,0x5a,0x0a,0x7e]
 
-v_rsq_legacy_f32 v0, -1
-// CHECK: [0xc1,0x5a,0x00,0x7e]
+v_rsq_legacy_f32 v5, -1
+// CHECK: [0xc1,0x5a,0x0a,0x7e]
 
-v_rsq_legacy_f32 v0, 0.5
-// CHECK: [0xf0,0x5a,0x00,0x7e]
+v_rsq_legacy_f32 v5, 0.5
+// CHECK: [0xf0,0x5a,0x0a,0x7e]
 
-v_rsq_legacy_f32 v0, -4.0
-// CHECK: [0xf7,0x5a,0x00,0x7e]
+v_rsq_legacy_f32 v5, -4.0
+// CHECK: [0xf7,0x5a,0x0a,0x7e]
 
-v_rsq_legacy_f32 v0, 0xaf123456
-// CHECK: [0xff,0x5a,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rsq_legacy_f32 v5, 0xaf123456
+// CHECK: [0xff,0x5a,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rsq_legacy_f32 v0, 0x3f717273
-// CHECK: [0xff,0x5a,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rsq_legacy_f32 v5, 0x3f717273
+// CHECK: [0xff,0x5a,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rsq_legacy_f32 v0, v0
-// CHECK: [0x00,0x5b,0x00,0x7e]
+v_rsq_legacy_f32 v5, v1
+// CHECK: [0x01,0x5b,0x0a,0x7e]
 
-v_rsq_legacy_f32 v0, v255
-// CHECK: [0xff,0x5b,0x00,0x7e]
+v_rsq_legacy_f32 v5, v255
+// CHECK: [0xff,0x5b,0x0a,0x7e]
 
-v_rsq_legacy_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x5a,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_legacy_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x5a,0xd3,0x01,0x00,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x5a,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_legacy_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x5a,0xd3,0x01,0x00,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x5a,0xd3,0x67,0x00,0x00,0x00]
+v_rsq_legacy_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x5a,0xd3,0x67,0x00,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x5a,0xd3,0x68,0x00,0x00,0x00]
+v_rsq_legacy_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x5a,0xd3,0x68,0x00,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x5a,0xd3,0x69,0x00,0x00,0x00]
+v_rsq_legacy_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x5a,0xd3,0x69,0x00,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x5a,0xd3,0x6a,0x00,0x00,0x00]
+v_rsq_legacy_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x5a,0xd3,0x6a,0x00,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x5a,0xd3,0x6b,0x00,0x00,0x00]
+v_rsq_legacy_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x5a,0xd3,0x6b,0x00,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x5a,0xd3,0x6c,0x00,0x00,0x00]
+v_rsq_legacy_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x5a,0xd3,0x6c,0x00,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x5a,0xd3,0x6d,0x00,0x00,0x00]
+v_rsq_legacy_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x5a,0xd3,0x6d,0x00,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x5a,0xd3,0x6e,0x00,0x00,0x00]
+v_rsq_legacy_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x5a,0xd3,0x6e,0x00,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x5a,0xd3,0x6f,0x00,0x00,0x00]
+v_rsq_legacy_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x5a,0xd3,0x6f,0x00,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x5a,0xd3,0x7b,0x00,0x00,0x00]
+v_rsq_legacy_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x5a,0xd3,0x7b,0x00,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x5a,0xd3,0x7c,0x00,0x00,0x00]
+v_rsq_legacy_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x5a,0xd3,0x7c,0x00,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x5a,0xd3,0x7e,0x00,0x00,0x00]
+v_rsq_legacy_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x5a,0xd3,0x7e,0x00,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x5a,0xd3,0x7f,0x00,0x00,0x00]
+v_rsq_legacy_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x5a,0xd3,0x7f,0x00,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x5a,0xd3,0xfd,0x00,0x00,0x00]
+v_rsq_legacy_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x5a,0xd3,0xfd,0x00,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x5a,0xd3,0x00,0x01,0x00,0x00]
+v_rsq_legacy_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x5a,0xd3,0x01,0x01,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x5a,0xd3,0xff,0x01,0x00,0x00]
+v_rsq_legacy_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x5a,0xd3,0xff,0x01,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x5a,0xd3,0x00,0x00,0x00,0x20]
+v_rsq_legacy_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x5a,0xd3,0x01,0x00,0x00,0x20]
 
-v_rsq_legacy_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x5a,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_legacy_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x5a,0xd3,0x01,0x00,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x08,0x5a,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_legacy_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x08,0x5a,0xd3,0x01,0x00,0x00,0x00]
 
-v_rsq_legacy_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x5a,0xd3,0x00,0x00,0x00,0x08]
+v_rsq_legacy_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x5a,0xd3,0x01,0x00,0x00,0x08]
 
-v_rsq_legacy_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x5a,0xd3,0x00,0x00,0x00,0x10]
+v_rsq_legacy_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x5a,0xd3,0x01,0x00,0x00,0x10]
 
-v_rsq_legacy_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x5a,0xd3,0x00,0x00,0x00,0x18]
+v_rsq_legacy_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x5a,0xd3,0x01,0x00,0x00,0x18]
 
-v_rsq_f32 v0, s0
-// CHECK: [0x00,0x5c,0x00,0x7e]
+v_rsq_f32 v5, s1
+// CHECK: [0x01,0x5c,0x0a,0x7e]
 
-v_rsq_f32 v255, s0
-// CHECK: [0x00,0x5c,0xfe,0x7f]
+v_rsq_f32 v255, s1
+// CHECK: [0x01,0x5c,0xfe,0x7f]
 
-v_rsq_f32 v0, s103
-// CHECK: [0x67,0x5c,0x00,0x7e]
+v_rsq_f32 v5, s103
+// CHECK: [0x67,0x5c,0x0a,0x7e]
 
-v_rsq_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x5c,0x00,0x7e]
+v_rsq_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x5c,0x0a,0x7e]
 
-v_rsq_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x5c,0x00,0x7e]
+v_rsq_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x5c,0x0a,0x7e]
 
-v_rsq_f32 v0, vcc_lo
-// CHECK: [0x6a,0x5c,0x00,0x7e]
+v_rsq_f32 v5, vcc_lo
+// CHECK: [0x6a,0x5c,0x0a,0x7e]
 
-v_rsq_f32 v0, vcc_hi
-// CHECK: [0x6b,0x5c,0x00,0x7e]
+v_rsq_f32 v5, vcc_hi
+// CHECK: [0x6b,0x5c,0x0a,0x7e]
 
-v_rsq_f32 v0, tba_lo
-// CHECK: [0x6c,0x5c,0x00,0x7e]
+v_rsq_f32 v5, tba_lo
+// CHECK: [0x6c,0x5c,0x0a,0x7e]
 
-v_rsq_f32 v0, tba_hi
-// CHECK: [0x6d,0x5c,0x00,0x7e]
+v_rsq_f32 v5, tba_hi
+// CHECK: [0x6d,0x5c,0x0a,0x7e]
 
-v_rsq_f32 v0, tma_lo
-// CHECK: [0x6e,0x5c,0x00,0x7e]
+v_rsq_f32 v5, tma_lo
+// CHECK: [0x6e,0x5c,0x0a,0x7e]
 
-v_rsq_f32 v0, tma_hi
-// CHECK: [0x6f,0x5c,0x00,0x7e]
+v_rsq_f32 v5, tma_hi
+// CHECK: [0x6f,0x5c,0x0a,0x7e]
 
-v_rsq_f32 v0, ttmp11
-// CHECK: [0x7b,0x5c,0x00,0x7e]
+v_rsq_f32 v5, ttmp11
+// CHECK: [0x7b,0x5c,0x0a,0x7e]
 
-v_rsq_f32 v0, m0
-// CHECK: [0x7c,0x5c,0x00,0x7e]
+v_rsq_f32 v5, m0
+// CHECK: [0x7c,0x5c,0x0a,0x7e]
 
-v_rsq_f32 v0, exec_lo
-// CHECK: [0x7e,0x5c,0x00,0x7e]
+v_rsq_f32 v5, exec_lo
+// CHECK: [0x7e,0x5c,0x0a,0x7e]
 
-v_rsq_f32 v0, exec_hi
-// CHECK: [0x7f,0x5c,0x00,0x7e]
+v_rsq_f32 v5, exec_hi
+// CHECK: [0x7f,0x5c,0x0a,0x7e]
 
-v_rsq_f32 v0, 0
-// CHECK: [0x80,0x5c,0x00,0x7e]
+v_rsq_f32 v5, 0
+// CHECK: [0x80,0x5c,0x0a,0x7e]
 
-v_rsq_f32 v0, -1
-// CHECK: [0xc1,0x5c,0x00,0x7e]
+v_rsq_f32 v5, -1
+// CHECK: [0xc1,0x5c,0x0a,0x7e]
 
-v_rsq_f32 v0, 0.5
-// CHECK: [0xf0,0x5c,0x00,0x7e]
+v_rsq_f32 v5, 0.5
+// CHECK: [0xf0,0x5c,0x0a,0x7e]
 
-v_rsq_f32 v0, -4.0
-// CHECK: [0xf7,0x5c,0x00,0x7e]
+v_rsq_f32 v5, -4.0
+// CHECK: [0xf7,0x5c,0x0a,0x7e]
 
-v_rsq_f32 v0, 0xaf123456
-// CHECK: [0xff,0x5c,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rsq_f32 v5, 0xaf123456
+// CHECK: [0xff,0x5c,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rsq_f32 v0, 0x3f717273
-// CHECK: [0xff,0x5c,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rsq_f32 v5, 0x3f717273
+// CHECK: [0xff,0x5c,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rsq_f32 v0, v0
-// CHECK: [0x00,0x5d,0x00,0x7e]
+v_rsq_f32 v5, v1
+// CHECK: [0x01,0x5d,0x0a,0x7e]
 
-v_rsq_f32 v0, v255
-// CHECK: [0xff,0x5d,0x00,0x7e]
+v_rsq_f32 v5, v255
+// CHECK: [0xff,0x5d,0x0a,0x7e]
 
-v_rsq_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x5c,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x5c,0xd3,0x01,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x5c,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x5c,0xd3,0x01,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x5c,0xd3,0x67,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x5c,0xd3,0x67,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x5c,0xd3,0x68,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x5c,0xd3,0x68,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x5c,0xd3,0x69,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x5c,0xd3,0x69,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x5c,0xd3,0x6a,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x5c,0xd3,0x6a,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x5c,0xd3,0x6b,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x5c,0xd3,0x6b,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x5c,0xd3,0x6c,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x5c,0xd3,0x6c,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x5c,0xd3,0x6d,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x5c,0xd3,0x6d,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x5c,0xd3,0x6e,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x5c,0xd3,0x6e,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x5c,0xd3,0x6f,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x5c,0xd3,0x6f,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x5c,0xd3,0x7b,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x5c,0xd3,0x7b,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x5c,0xd3,0x7c,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x5c,0xd3,0x7c,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x5c,0xd3,0x7e,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x5c,0xd3,0x7e,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x5c,0xd3,0x7f,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x5c,0xd3,0x7f,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x5c,0xd3,0xfd,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x5c,0xd3,0xfd,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x5c,0xd3,0x00,0x01,0x00,0x00]
+v_rsq_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x5c,0xd3,0x01,0x01,0x00,0x00]
 
-v_rsq_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x5c,0xd3,0xff,0x01,0x00,0x00]
+v_rsq_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x5c,0xd3,0xff,0x01,0x00,0x00]
 
-v_rsq_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x5c,0xd3,0x00,0x00,0x00,0x20]
+v_rsq_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x5c,0xd3,0x01,0x00,0x00,0x20]
 
-v_rsq_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x5c,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x5c,0xd3,0x01,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x08,0x5c,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x08,0x5c,0xd3,0x01,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x5c,0xd3,0x00,0x00,0x00,0x08]
+v_rsq_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x5c,0xd3,0x01,0x00,0x00,0x08]
 
-v_rsq_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x5c,0xd3,0x00,0x00,0x00,0x10]
+v_rsq_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x5c,0xd3,0x01,0x00,0x00,0x10]
 
-v_rsq_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x5c,0xd3,0x00,0x00,0x00,0x18]
+v_rsq_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x5c,0xd3,0x01,0x00,0x00,0x18]
 
-v_rcp_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x5e,0x00,0x7e]
+v_rcp_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x5e,0x0a,0x7e]
 
-v_rcp_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x5e,0xfc,0x7f]
+v_rcp_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x5e,0xfc,0x7f]
 
-v_rcp_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x5e,0x00,0x7e]
+v_rcp_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x5e,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], s[102:103]
-// CHECK: [0x66,0x5e,0x00,0x7e]
+v_rcp_f64 v[5:6], s[102:103]
+// CHECK: [0x66,0x5e,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], flat_scratch
-// CHECK: [0x68,0x5e,0x00,0x7e]
+v_rcp_f64 v[5:6], flat_scratch
+// CHECK: [0x68,0x5e,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], vcc
-// CHECK: [0x6a,0x5e,0x00,0x7e]
+v_rcp_f64 v[5:6], vcc
+// CHECK: [0x6a,0x5e,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], tba
-// CHECK: [0x6c,0x5e,0x00,0x7e]
+v_rcp_f64 v[5:6], tba
+// CHECK: [0x6c,0x5e,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], tma
-// CHECK: [0x6e,0x5e,0x00,0x7e]
+v_rcp_f64 v[5:6], tma
+// CHECK: [0x6e,0x5e,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x5e,0x00,0x7e]
+v_rcp_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x5e,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], exec
-// CHECK: [0x7e,0x5e,0x00,0x7e]
+v_rcp_f64 v[5:6], exec
+// CHECK: [0x7e,0x5e,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], 0
-// CHECK: [0x80,0x5e,0x00,0x7e]
+v_rcp_f64 v[5:6], 0
+// CHECK: [0x80,0x5e,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], -1
-// CHECK: [0xc1,0x5e,0x00,0x7e]
+v_rcp_f64 v[5:6], -1
+// CHECK: [0xc1,0x5e,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x5e,0x00,0x7e]
+v_rcp_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x5e,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x5e,0x00,0x7e]
+v_rcp_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x5e,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x5e,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rcp_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x5e,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rcp_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x5e,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rcp_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x5e,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rcp_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x5f,0x00,0x7e]
+v_rcp_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x5f,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x5f,0x00,0x7e]
+v_rcp_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x5f,0x0a,0x7e]
 
-v_rcp_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x5e,0xd3,0x00,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x5e,0xd3,0x02,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x5e,0xd3,0x00,0x00,0x00,0x00]
+v_rcp_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x5e,0xd3,0x02,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x5e,0xd3,0x02,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x5e,0xd3,0x04,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], s[102:103]
-// CHECK: [0x00,0x00,0x5e,0xd3,0x66,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], s[102:103]
+// CHECK: [0x05,0x00,0x5e,0xd3,0x66,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x5e,0xd3,0x68,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x5e,0xd3,0x68,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x5e,0xd3,0x6a,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x5e,0xd3,0x6a,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x5e,0xd3,0x6c,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x5e,0xd3,0x6c,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x5e,0xd3,0x6e,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x5e,0xd3,0x6e,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x5e,0xd3,0x7a,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x5e,0xd3,0x7a,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x5e,0xd3,0x7e,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x5e,0xd3,0x7e,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x5e,0xd3,0xfd,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x5e,0xd3,0xfd,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x5e,0xd3,0x00,0x01,0x00,0x00]
+v_rcp_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x5e,0xd3,0x01,0x01,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x5e,0xd3,0xfe,0x01,0x00,0x00]
+v_rcp_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x5e,0xd3,0xfe,0x01,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x5e,0xd3,0x00,0x00,0x00,0x20]
+v_rcp_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x5e,0xd3,0x02,0x00,0x00,0x20]
 
-v_rcp_f64_e64 v[0:1], |s[0:1]|
-// CHECK: [0x00,0x01,0x5e,0xd3,0x00,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], |s[2:3]|
+// CHECK: [0x05,0x01,0x5e,0xd3,0x02,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x08,0x5e,0xd3,0x00,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x08,0x5e,0xd3,0x02,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x5e,0xd3,0x00,0x00,0x00,0x08]
+v_rcp_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x5e,0xd3,0x02,0x00,0x00,0x08]
 
-v_rcp_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x5e,0xd3,0x00,0x00,0x00,0x10]
+v_rcp_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x5e,0xd3,0x02,0x00,0x00,0x10]
 
-v_rcp_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x5e,0xd3,0x00,0x00,0x00,0x18]
+v_rcp_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x5e,0xd3,0x02,0x00,0x00,0x18]
 
-v_rcp_clamp_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x60,0x00,0x7e]
+v_rcp_clamp_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x60,0x0a,0x7e]
 
-v_rcp_clamp_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x60,0xfc,0x7f]
+v_rcp_clamp_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x60,0xfc,0x7f]
 
-v_rcp_clamp_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x60,0x00,0x7e]
+v_rcp_clamp_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x60,0x0a,0x7e]
 
-v_rcp_clamp_f64 v[0:1], s[102:103]
-// CHECK: [0x66,0x60,0x00,0x7e]
+v_rcp_clamp_f64 v[5:6], s[102:103]
+// CHECK: [0x66,0x60,0x0a,0x7e]
 
-v_rcp_clamp_f64 v[0:1], flat_scratch
-// CHECK: [0x68,0x60,0x00,0x7e]
+v_rcp_clamp_f64 v[5:6], flat_scratch
+// CHECK: [0x68,0x60,0x0a,0x7e]
 
-v_rcp_clamp_f64 v[0:1], vcc
-// CHECK: [0x6a,0x60,0x00,0x7e]
+v_rcp_clamp_f64 v[5:6], vcc
+// CHECK: [0x6a,0x60,0x0a,0x7e]
 
-v_rcp_clamp_f64 v[0:1], tba
-// CHECK: [0x6c,0x60,0x00,0x7e]
+v_rcp_clamp_f64 v[5:6], tba
+// CHECK: [0x6c,0x60,0x0a,0x7e]
 
-v_rcp_clamp_f64 v[0:1], tma
-// CHECK: [0x6e,0x60,0x00,0x7e]
+v_rcp_clamp_f64 v[5:6], tma
+// CHECK: [0x6e,0x60,0x0a,0x7e]
 
-v_rcp_clamp_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x60,0x00,0x7e]
+v_rcp_clamp_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x60,0x0a,0x7e]
 
-v_rcp_clamp_f64 v[0:1], exec
-// CHECK: [0x7e,0x60,0x00,0x7e]
+v_rcp_clamp_f64 v[5:6], exec
+// CHECK: [0x7e,0x60,0x0a,0x7e]
 
-v_rcp_clamp_f64 v[0:1], 0
-// CHECK: [0x80,0x60,0x00,0x7e]
+v_rcp_clamp_f64 v[5:6], 0
+// CHECK: [0x80,0x60,0x0a,0x7e]
 
-v_rcp_clamp_f64 v[0:1], -1
-// CHECK: [0xc1,0x60,0x00,0x7e]
+v_rcp_clamp_f64 v[5:6], -1
+// CHECK: [0xc1,0x60,0x0a,0x7e]
 
-v_rcp_clamp_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x60,0x00,0x7e]
+v_rcp_clamp_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x60,0x0a,0x7e]
 
-v_rcp_clamp_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x60,0x00,0x7e]
+v_rcp_clamp_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x60,0x0a,0x7e]
 
-v_rcp_clamp_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x60,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rcp_clamp_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x60,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rcp_clamp_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x60,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rcp_clamp_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x60,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rcp_clamp_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x61,0x00,0x7e]
+v_rcp_clamp_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x61,0x0a,0x7e]
 
-v_rcp_clamp_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x61,0x00,0x7e]
+v_rcp_clamp_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x61,0x0a,0x7e]
 
-v_rcp_clamp_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd3,0x00,0x00,0x00,0x00]
+v_rcp_clamp_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x60,0xd3,0x02,0x00,0x00,0x00]
 
-v_rcp_clamp_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x60,0xd3,0x00,0x00,0x00,0x00]
+v_rcp_clamp_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x60,0xd3,0x02,0x00,0x00,0x00]
 
-v_rcp_clamp_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x60,0xd3,0x02,0x00,0x00,0x00]
+v_rcp_clamp_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x60,0xd3,0x04,0x00,0x00,0x00]
 
-v_rcp_clamp_f64_e64 v[0:1], s[102:103]
-// CHECK: [0x00,0x00,0x60,0xd3,0x66,0x00,0x00,0x00]
+v_rcp_clamp_f64_e64 v[5:6], s[102:103]
+// CHECK: [0x05,0x00,0x60,0xd3,0x66,0x00,0x00,0x00]
 
-v_rcp_clamp_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x60,0xd3,0x68,0x00,0x00,0x00]
+v_rcp_clamp_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x60,0xd3,0x68,0x00,0x00,0x00]
 
-v_rcp_clamp_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x60,0xd3,0x6a,0x00,0x00,0x00]
+v_rcp_clamp_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x60,0xd3,0x6a,0x00,0x00,0x00]
 
-v_rcp_clamp_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x60,0xd3,0x6c,0x00,0x00,0x00]
+v_rcp_clamp_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x60,0xd3,0x6c,0x00,0x00,0x00]
 
-v_rcp_clamp_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x60,0xd3,0x6e,0x00,0x00,0x00]
+v_rcp_clamp_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x60,0xd3,0x6e,0x00,0x00,0x00]
 
-v_rcp_clamp_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x60,0xd3,0x7a,0x00,0x00,0x00]
+v_rcp_clamp_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x60,0xd3,0x7a,0x00,0x00,0x00]
 
-v_rcp_clamp_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x60,0xd3,0x7e,0x00,0x00,0x00]
+v_rcp_clamp_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x60,0xd3,0x7e,0x00,0x00,0x00]
 
-v_rcp_clamp_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x60,0xd3,0xfd,0x00,0x00,0x00]
+v_rcp_clamp_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x60,0xd3,0xfd,0x00,0x00,0x00]
 
-v_rcp_clamp_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x60,0xd3,0x00,0x01,0x00,0x00]
+v_rcp_clamp_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x60,0xd3,0x01,0x01,0x00,0x00]
 
-v_rcp_clamp_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x60,0xd3,0xfe,0x01,0x00,0x00]
+v_rcp_clamp_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x60,0xd3,0xfe,0x01,0x00,0x00]
 
-v_rcp_clamp_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd3,0x00,0x00,0x00,0x20]
+v_rcp_clamp_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x60,0xd3,0x02,0x00,0x00,0x20]
 
-v_rcp_clamp_f64_e64 v[0:1], |s[0:1]|
-// CHECK: [0x00,0x01,0x60,0xd3,0x00,0x00,0x00,0x00]
+v_rcp_clamp_f64_e64 v[5:6], |s[2:3]|
+// CHECK: [0x05,0x01,0x60,0xd3,0x02,0x00,0x00,0x00]
 
-v_rcp_clamp_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x08,0x60,0xd3,0x00,0x00,0x00,0x00]
+v_rcp_clamp_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x08,0x60,0xd3,0x02,0x00,0x00,0x00]
 
-v_rcp_clamp_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x60,0xd3,0x00,0x00,0x00,0x08]
+v_rcp_clamp_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x60,0xd3,0x02,0x00,0x00,0x08]
 
-v_rcp_clamp_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x60,0xd3,0x00,0x00,0x00,0x10]
+v_rcp_clamp_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x60,0xd3,0x02,0x00,0x00,0x10]
 
-v_rcp_clamp_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x60,0xd3,0x00,0x00,0x00,0x18]
+v_rcp_clamp_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x60,0xd3,0x02,0x00,0x00,0x18]
 
-v_rsq_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x62,0x00,0x7e]
+v_rsq_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x62,0x0a,0x7e]
 
-v_rsq_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x62,0xfc,0x7f]
+v_rsq_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x62,0xfc,0x7f]
 
-v_rsq_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x62,0x00,0x7e]
+v_rsq_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x62,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], s[102:103]
-// CHECK: [0x66,0x62,0x00,0x7e]
+v_rsq_f64 v[5:6], s[102:103]
+// CHECK: [0x66,0x62,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], flat_scratch
-// CHECK: [0x68,0x62,0x00,0x7e]
+v_rsq_f64 v[5:6], flat_scratch
+// CHECK: [0x68,0x62,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], vcc
-// CHECK: [0x6a,0x62,0x00,0x7e]
+v_rsq_f64 v[5:6], vcc
+// CHECK: [0x6a,0x62,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], tba
-// CHECK: [0x6c,0x62,0x00,0x7e]
+v_rsq_f64 v[5:6], tba
+// CHECK: [0x6c,0x62,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], tma
-// CHECK: [0x6e,0x62,0x00,0x7e]
+v_rsq_f64 v[5:6], tma
+// CHECK: [0x6e,0x62,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x62,0x00,0x7e]
+v_rsq_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x62,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], exec
-// CHECK: [0x7e,0x62,0x00,0x7e]
+v_rsq_f64 v[5:6], exec
+// CHECK: [0x7e,0x62,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], 0
-// CHECK: [0x80,0x62,0x00,0x7e]
+v_rsq_f64 v[5:6], 0
+// CHECK: [0x80,0x62,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], -1
-// CHECK: [0xc1,0x62,0x00,0x7e]
+v_rsq_f64 v[5:6], -1
+// CHECK: [0xc1,0x62,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x62,0x00,0x7e]
+v_rsq_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x62,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x62,0x00,0x7e]
+v_rsq_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x62,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x62,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rsq_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x62,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rsq_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x62,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rsq_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x62,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rsq_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x63,0x00,0x7e]
+v_rsq_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x63,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x63,0x00,0x7e]
+v_rsq_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x63,0x0a,0x7e]
 
-v_rsq_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x62,0xd3,0x02,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x62,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x62,0xd3,0x02,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x62,0xd3,0x02,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x62,0xd3,0x04,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], s[102:103]
-// CHECK: [0x00,0x00,0x62,0xd3,0x66,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], s[102:103]
+// CHECK: [0x05,0x00,0x62,0xd3,0x66,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x62,0xd3,0x68,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x62,0xd3,0x68,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x62,0xd3,0x6a,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x62,0xd3,0x6a,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x62,0xd3,0x6c,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x62,0xd3,0x6c,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x62,0xd3,0x6e,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x62,0xd3,0x6e,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x62,0xd3,0x7a,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x62,0xd3,0x7a,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x62,0xd3,0x7e,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x62,0xd3,0x7e,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x62,0xd3,0xfd,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x62,0xd3,0xfd,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x62,0xd3,0x00,0x01,0x00,0x00]
+v_rsq_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x62,0xd3,0x01,0x01,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x62,0xd3,0xfe,0x01,0x00,0x00]
+v_rsq_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x62,0xd3,0xfe,0x01,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd3,0x00,0x00,0x00,0x20]
+v_rsq_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x62,0xd3,0x02,0x00,0x00,0x20]
 
-v_rsq_f64_e64 v[0:1], |s[0:1]|
-// CHECK: [0x00,0x01,0x62,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], |s[2:3]|
+// CHECK: [0x05,0x01,0x62,0xd3,0x02,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x08,0x62,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x08,0x62,0xd3,0x02,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x62,0xd3,0x00,0x00,0x00,0x08]
+v_rsq_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x62,0xd3,0x02,0x00,0x00,0x08]
 
-v_rsq_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x62,0xd3,0x00,0x00,0x00,0x10]
+v_rsq_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x62,0xd3,0x02,0x00,0x00,0x10]
 
-v_rsq_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x62,0xd3,0x00,0x00,0x00,0x18]
+v_rsq_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x62,0xd3,0x02,0x00,0x00,0x18]
 
-v_rsq_clamp_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x64,0x00,0x7e]
+v_rsq_clamp_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x64,0x0a,0x7e]
 
-v_rsq_clamp_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x64,0xfc,0x7f]
+v_rsq_clamp_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x64,0xfc,0x7f]
 
-v_rsq_clamp_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x64,0x00,0x7e]
+v_rsq_clamp_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x64,0x0a,0x7e]
 
-v_rsq_clamp_f64 v[0:1], s[102:103]
-// CHECK: [0x66,0x64,0x00,0x7e]
+v_rsq_clamp_f64 v[5:6], s[102:103]
+// CHECK: [0x66,0x64,0x0a,0x7e]
 
-v_rsq_clamp_f64 v[0:1], flat_scratch
-// CHECK: [0x68,0x64,0x00,0x7e]
+v_rsq_clamp_f64 v[5:6], flat_scratch
+// CHECK: [0x68,0x64,0x0a,0x7e]
 
-v_rsq_clamp_f64 v[0:1], vcc
-// CHECK: [0x6a,0x64,0x00,0x7e]
+v_rsq_clamp_f64 v[5:6], vcc
+// CHECK: [0x6a,0x64,0x0a,0x7e]
 
-v_rsq_clamp_f64 v[0:1], tba
-// CHECK: [0x6c,0x64,0x00,0x7e]
+v_rsq_clamp_f64 v[5:6], tba
+// CHECK: [0x6c,0x64,0x0a,0x7e]
 
-v_rsq_clamp_f64 v[0:1], tma
-// CHECK: [0x6e,0x64,0x00,0x7e]
+v_rsq_clamp_f64 v[5:6], tma
+// CHECK: [0x6e,0x64,0x0a,0x7e]
 
-v_rsq_clamp_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x64,0x00,0x7e]
+v_rsq_clamp_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x64,0x0a,0x7e]
 
-v_rsq_clamp_f64 v[0:1], exec
-// CHECK: [0x7e,0x64,0x00,0x7e]
+v_rsq_clamp_f64 v[5:6], exec
+// CHECK: [0x7e,0x64,0x0a,0x7e]
 
-v_rsq_clamp_f64 v[0:1], 0
-// CHECK: [0x80,0x64,0x00,0x7e]
+v_rsq_clamp_f64 v[5:6], 0
+// CHECK: [0x80,0x64,0x0a,0x7e]
 
-v_rsq_clamp_f64 v[0:1], -1
-// CHECK: [0xc1,0x64,0x00,0x7e]
+v_rsq_clamp_f64 v[5:6], -1
+// CHECK: [0xc1,0x64,0x0a,0x7e]
 
-v_rsq_clamp_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x64,0x00,0x7e]
+v_rsq_clamp_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x64,0x0a,0x7e]
 
-v_rsq_clamp_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x64,0x00,0x7e]
+v_rsq_clamp_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x64,0x0a,0x7e]
 
-v_rsq_clamp_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x64,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rsq_clamp_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x64,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rsq_clamp_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x64,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rsq_clamp_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x64,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rsq_clamp_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x65,0x00,0x7e]
+v_rsq_clamp_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x65,0x0a,0x7e]
 
-v_rsq_clamp_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x65,0x00,0x7e]
+v_rsq_clamp_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x65,0x0a,0x7e]
 
-v_rsq_clamp_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_clamp_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x64,0xd3,0x02,0x00,0x00,0x00]
 
-v_rsq_clamp_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x64,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_clamp_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x64,0xd3,0x02,0x00,0x00,0x00]
 
-v_rsq_clamp_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x64,0xd3,0x02,0x00,0x00,0x00]
+v_rsq_clamp_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x64,0xd3,0x04,0x00,0x00,0x00]
 
-v_rsq_clamp_f64_e64 v[0:1], s[102:103]
-// CHECK: [0x00,0x00,0x64,0xd3,0x66,0x00,0x00,0x00]
+v_rsq_clamp_f64_e64 v[5:6], s[102:103]
+// CHECK: [0x05,0x00,0x64,0xd3,0x66,0x00,0x00,0x00]
 
-v_rsq_clamp_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x64,0xd3,0x68,0x00,0x00,0x00]
+v_rsq_clamp_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x64,0xd3,0x68,0x00,0x00,0x00]
 
-v_rsq_clamp_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x64,0xd3,0x6a,0x00,0x00,0x00]
+v_rsq_clamp_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x64,0xd3,0x6a,0x00,0x00,0x00]
 
-v_rsq_clamp_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x64,0xd3,0x6c,0x00,0x00,0x00]
+v_rsq_clamp_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x64,0xd3,0x6c,0x00,0x00,0x00]
 
-v_rsq_clamp_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x64,0xd3,0x6e,0x00,0x00,0x00]
+v_rsq_clamp_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x64,0xd3,0x6e,0x00,0x00,0x00]
 
-v_rsq_clamp_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x64,0xd3,0x7a,0x00,0x00,0x00]
+v_rsq_clamp_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x64,0xd3,0x7a,0x00,0x00,0x00]
 
-v_rsq_clamp_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x64,0xd3,0x7e,0x00,0x00,0x00]
+v_rsq_clamp_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x64,0xd3,0x7e,0x00,0x00,0x00]
 
-v_rsq_clamp_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x64,0xd3,0xfd,0x00,0x00,0x00]
+v_rsq_clamp_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x64,0xd3,0xfd,0x00,0x00,0x00]
 
-v_rsq_clamp_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x64,0xd3,0x00,0x01,0x00,0x00]
+v_rsq_clamp_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x64,0xd3,0x01,0x01,0x00,0x00]
 
-v_rsq_clamp_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x64,0xd3,0xfe,0x01,0x00,0x00]
+v_rsq_clamp_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x64,0xd3,0xfe,0x01,0x00,0x00]
 
-v_rsq_clamp_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd3,0x00,0x00,0x00,0x20]
+v_rsq_clamp_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x64,0xd3,0x02,0x00,0x00,0x20]
 
-v_rsq_clamp_f64_e64 v[0:1], |s[0:1]|
-// CHECK: [0x00,0x01,0x64,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_clamp_f64_e64 v[5:6], |s[2:3]|
+// CHECK: [0x05,0x01,0x64,0xd3,0x02,0x00,0x00,0x00]
 
-v_rsq_clamp_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x08,0x64,0xd3,0x00,0x00,0x00,0x00]
+v_rsq_clamp_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x08,0x64,0xd3,0x02,0x00,0x00,0x00]
 
-v_rsq_clamp_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x64,0xd3,0x00,0x00,0x00,0x08]
+v_rsq_clamp_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x64,0xd3,0x02,0x00,0x00,0x08]
 
-v_rsq_clamp_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x64,0xd3,0x00,0x00,0x00,0x10]
+v_rsq_clamp_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x64,0xd3,0x02,0x00,0x00,0x10]
 
-v_rsq_clamp_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x64,0xd3,0x00,0x00,0x00,0x18]
+v_rsq_clamp_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x64,0xd3,0x02,0x00,0x00,0x18]
 
-v_sqrt_f32 v0, s0
-// CHECK: [0x00,0x66,0x00,0x7e]
+v_sqrt_f32 v5, s1
+// CHECK: [0x01,0x66,0x0a,0x7e]
 
-v_sqrt_f32 v255, s0
-// CHECK: [0x00,0x66,0xfe,0x7f]
+v_sqrt_f32 v255, s1
+// CHECK: [0x01,0x66,0xfe,0x7f]
 
-v_sqrt_f32 v0, s103
-// CHECK: [0x67,0x66,0x00,0x7e]
+v_sqrt_f32 v5, s103
+// CHECK: [0x67,0x66,0x0a,0x7e]
 
-v_sqrt_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x66,0x00,0x7e]
+v_sqrt_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x66,0x0a,0x7e]
 
-v_sqrt_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x66,0x00,0x7e]
+v_sqrt_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x66,0x0a,0x7e]
 
-v_sqrt_f32 v0, vcc_lo
-// CHECK: [0x6a,0x66,0x00,0x7e]
+v_sqrt_f32 v5, vcc_lo
+// CHECK: [0x6a,0x66,0x0a,0x7e]
 
-v_sqrt_f32 v0, vcc_hi
-// CHECK: [0x6b,0x66,0x00,0x7e]
+v_sqrt_f32 v5, vcc_hi
+// CHECK: [0x6b,0x66,0x0a,0x7e]
 
-v_sqrt_f32 v0, tba_lo
-// CHECK: [0x6c,0x66,0x00,0x7e]
+v_sqrt_f32 v5, tba_lo
+// CHECK: [0x6c,0x66,0x0a,0x7e]
 
-v_sqrt_f32 v0, tba_hi
-// CHECK: [0x6d,0x66,0x00,0x7e]
+v_sqrt_f32 v5, tba_hi
+// CHECK: [0x6d,0x66,0x0a,0x7e]
 
-v_sqrt_f32 v0, tma_lo
-// CHECK: [0x6e,0x66,0x00,0x7e]
+v_sqrt_f32 v5, tma_lo
+// CHECK: [0x6e,0x66,0x0a,0x7e]
 
-v_sqrt_f32 v0, tma_hi
-// CHECK: [0x6f,0x66,0x00,0x7e]
+v_sqrt_f32 v5, tma_hi
+// CHECK: [0x6f,0x66,0x0a,0x7e]
 
-v_sqrt_f32 v0, ttmp11
-// CHECK: [0x7b,0x66,0x00,0x7e]
+v_sqrt_f32 v5, ttmp11
+// CHECK: [0x7b,0x66,0x0a,0x7e]
 
-v_sqrt_f32 v0, m0
-// CHECK: [0x7c,0x66,0x00,0x7e]
+v_sqrt_f32 v5, m0
+// CHECK: [0x7c,0x66,0x0a,0x7e]
 
-v_sqrt_f32 v0, exec_lo
-// CHECK: [0x7e,0x66,0x00,0x7e]
+v_sqrt_f32 v5, exec_lo
+// CHECK: [0x7e,0x66,0x0a,0x7e]
 
-v_sqrt_f32 v0, exec_hi
-// CHECK: [0x7f,0x66,0x00,0x7e]
+v_sqrt_f32 v5, exec_hi
+// CHECK: [0x7f,0x66,0x0a,0x7e]
 
-v_sqrt_f32 v0, 0
-// CHECK: [0x80,0x66,0x00,0x7e]
+v_sqrt_f32 v5, 0
+// CHECK: [0x80,0x66,0x0a,0x7e]
 
-v_sqrt_f32 v0, -1
-// CHECK: [0xc1,0x66,0x00,0x7e]
+v_sqrt_f32 v5, -1
+// CHECK: [0xc1,0x66,0x0a,0x7e]
 
-v_sqrt_f32 v0, 0.5
-// CHECK: [0xf0,0x66,0x00,0x7e]
+v_sqrt_f32 v5, 0.5
+// CHECK: [0xf0,0x66,0x0a,0x7e]
 
-v_sqrt_f32 v0, -4.0
-// CHECK: [0xf7,0x66,0x00,0x7e]
+v_sqrt_f32 v5, -4.0
+// CHECK: [0xf7,0x66,0x0a,0x7e]
 
-v_sqrt_f32 v0, 0xaf123456
-// CHECK: [0xff,0x66,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_sqrt_f32 v5, 0xaf123456
+// CHECK: [0xff,0x66,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_sqrt_f32 v0, 0x3f717273
-// CHECK: [0xff,0x66,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_sqrt_f32 v5, 0x3f717273
+// CHECK: [0xff,0x66,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_sqrt_f32 v0, v0
-// CHECK: [0x00,0x67,0x00,0x7e]
+v_sqrt_f32 v5, v1
+// CHECK: [0x01,0x67,0x0a,0x7e]
 
-v_sqrt_f32 v0, v255
-// CHECK: [0xff,0x67,0x00,0x7e]
+v_sqrt_f32 v5, v255
+// CHECK: [0xff,0x67,0x0a,0x7e]
 
-v_sqrt_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x66,0xd3,0x00,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x66,0xd3,0x01,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x66,0xd3,0x00,0x00,0x00,0x00]
+v_sqrt_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x66,0xd3,0x01,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x66,0xd3,0x67,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x66,0xd3,0x67,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x66,0xd3,0x68,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x66,0xd3,0x68,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x66,0xd3,0x69,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x66,0xd3,0x69,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x66,0xd3,0x6a,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x66,0xd3,0x6a,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x66,0xd3,0x6b,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x66,0xd3,0x6b,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x66,0xd3,0x6c,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x66,0xd3,0x6c,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x66,0xd3,0x6d,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x66,0xd3,0x6d,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x66,0xd3,0x6e,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x66,0xd3,0x6e,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x66,0xd3,0x6f,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x66,0xd3,0x6f,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x66,0xd3,0x7b,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x66,0xd3,0x7b,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x66,0xd3,0x7c,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x66,0xd3,0x7c,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x66,0xd3,0x7e,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x66,0xd3,0x7e,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x66,0xd3,0x7f,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x66,0xd3,0x7f,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x66,0xd3,0xfd,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x66,0xd3,0xfd,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x66,0xd3,0x00,0x01,0x00,0x00]
+v_sqrt_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x66,0xd3,0x01,0x01,0x00,0x00]
 
-v_sqrt_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x66,0xd3,0xff,0x01,0x00,0x00]
+v_sqrt_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x66,0xd3,0xff,0x01,0x00,0x00]
 
-v_sqrt_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x66,0xd3,0x00,0x00,0x00,0x20]
+v_sqrt_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x66,0xd3,0x01,0x00,0x00,0x20]
 
-v_sqrt_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x66,0xd3,0x00,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x66,0xd3,0x01,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x08,0x66,0xd3,0x00,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x08,0x66,0xd3,0x01,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x66,0xd3,0x00,0x00,0x00,0x08]
+v_sqrt_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x66,0xd3,0x01,0x00,0x00,0x08]
 
-v_sqrt_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x66,0xd3,0x00,0x00,0x00,0x10]
+v_sqrt_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x66,0xd3,0x01,0x00,0x00,0x10]
 
-v_sqrt_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x66,0xd3,0x00,0x00,0x00,0x18]
+v_sqrt_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x66,0xd3,0x01,0x00,0x00,0x18]
 
-v_sqrt_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x68,0x00,0x7e]
+v_sqrt_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x68,0x0a,0x7e]
 
-v_sqrt_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x68,0xfc,0x7f]
+v_sqrt_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x68,0xfc,0x7f]
 
-v_sqrt_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x68,0x00,0x7e]
+v_sqrt_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x68,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], s[102:103]
-// CHECK: [0x66,0x68,0x00,0x7e]
+v_sqrt_f64 v[5:6], s[102:103]
+// CHECK: [0x66,0x68,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], flat_scratch
-// CHECK: [0x68,0x68,0x00,0x7e]
+v_sqrt_f64 v[5:6], flat_scratch
+// CHECK: [0x68,0x68,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], vcc
-// CHECK: [0x6a,0x68,0x00,0x7e]
+v_sqrt_f64 v[5:6], vcc
+// CHECK: [0x6a,0x68,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], tba
-// CHECK: [0x6c,0x68,0x00,0x7e]
+v_sqrt_f64 v[5:6], tba
+// CHECK: [0x6c,0x68,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], tma
-// CHECK: [0x6e,0x68,0x00,0x7e]
+v_sqrt_f64 v[5:6], tma
+// CHECK: [0x6e,0x68,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x68,0x00,0x7e]
+v_sqrt_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x68,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], exec
-// CHECK: [0x7e,0x68,0x00,0x7e]
+v_sqrt_f64 v[5:6], exec
+// CHECK: [0x7e,0x68,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], 0
-// CHECK: [0x80,0x68,0x00,0x7e]
+v_sqrt_f64 v[5:6], 0
+// CHECK: [0x80,0x68,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], -1
-// CHECK: [0xc1,0x68,0x00,0x7e]
+v_sqrt_f64 v[5:6], -1
+// CHECK: [0xc1,0x68,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x68,0x00,0x7e]
+v_sqrt_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x68,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x68,0x00,0x7e]
+v_sqrt_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x68,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x68,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_sqrt_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x68,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_sqrt_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x68,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_sqrt_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x68,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_sqrt_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x69,0x00,0x7e]
+v_sqrt_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x69,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x69,0x00,0x7e]
+v_sqrt_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x69,0x0a,0x7e]
 
-v_sqrt_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd3,0x00,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x68,0xd3,0x02,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x68,0xd3,0x00,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x68,0xd3,0x02,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x68,0xd3,0x02,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x68,0xd3,0x04,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], s[102:103]
-// CHECK: [0x00,0x00,0x68,0xd3,0x66,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], s[102:103]
+// CHECK: [0x05,0x00,0x68,0xd3,0x66,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x68,0xd3,0x68,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x68,0xd3,0x68,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x68,0xd3,0x6a,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x68,0xd3,0x6a,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x68,0xd3,0x6c,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x68,0xd3,0x6c,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x68,0xd3,0x6e,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x68,0xd3,0x6e,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x68,0xd3,0x7a,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x68,0xd3,0x7a,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x68,0xd3,0x7e,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x68,0xd3,0x7e,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x68,0xd3,0xfd,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x68,0xd3,0xfd,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x68,0xd3,0x00,0x01,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x68,0xd3,0x01,0x01,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x68,0xd3,0xfe,0x01,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x68,0xd3,0xfe,0x01,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd3,0x00,0x00,0x00,0x20]
+v_sqrt_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x68,0xd3,0x02,0x00,0x00,0x20]
 
-v_sqrt_f64_e64 v[0:1], |s[0:1]|
-// CHECK: [0x00,0x01,0x68,0xd3,0x00,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], |s[2:3]|
+// CHECK: [0x05,0x01,0x68,0xd3,0x02,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x08,0x68,0xd3,0x00,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x08,0x68,0xd3,0x02,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x68,0xd3,0x00,0x00,0x00,0x08]
+v_sqrt_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x68,0xd3,0x02,0x00,0x00,0x08]
 
-v_sqrt_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x68,0xd3,0x00,0x00,0x00,0x10]
+v_sqrt_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x68,0xd3,0x02,0x00,0x00,0x10]
 
-v_sqrt_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x68,0xd3,0x00,0x00,0x00,0x18]
+v_sqrt_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x68,0xd3,0x02,0x00,0x00,0x18]
 
-v_sin_f32 v0, s0
-// CHECK: [0x00,0x6a,0x00,0x7e]
+v_sin_f32 v5, s1
+// CHECK: [0x01,0x6a,0x0a,0x7e]
 
-v_sin_f32 v255, s0
-// CHECK: [0x00,0x6a,0xfe,0x7f]
+v_sin_f32 v255, s1
+// CHECK: [0x01,0x6a,0xfe,0x7f]
 
-v_sin_f32 v0, s103
-// CHECK: [0x67,0x6a,0x00,0x7e]
+v_sin_f32 v5, s103
+// CHECK: [0x67,0x6a,0x0a,0x7e]
 
-v_sin_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x6a,0x00,0x7e]
+v_sin_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x6a,0x0a,0x7e]
 
-v_sin_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x6a,0x00,0x7e]
+v_sin_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x6a,0x0a,0x7e]
 
-v_sin_f32 v0, vcc_lo
-// CHECK: [0x6a,0x6a,0x00,0x7e]
+v_sin_f32 v5, vcc_lo
+// CHECK: [0x6a,0x6a,0x0a,0x7e]
 
-v_sin_f32 v0, vcc_hi
-// CHECK: [0x6b,0x6a,0x00,0x7e]
+v_sin_f32 v5, vcc_hi
+// CHECK: [0x6b,0x6a,0x0a,0x7e]
 
-v_sin_f32 v0, tba_lo
-// CHECK: [0x6c,0x6a,0x00,0x7e]
+v_sin_f32 v5, tba_lo
+// CHECK: [0x6c,0x6a,0x0a,0x7e]
 
-v_sin_f32 v0, tba_hi
-// CHECK: [0x6d,0x6a,0x00,0x7e]
+v_sin_f32 v5, tba_hi
+// CHECK: [0x6d,0x6a,0x0a,0x7e]
 
-v_sin_f32 v0, tma_lo
-// CHECK: [0x6e,0x6a,0x00,0x7e]
+v_sin_f32 v5, tma_lo
+// CHECK: [0x6e,0x6a,0x0a,0x7e]
 
-v_sin_f32 v0, tma_hi
-// CHECK: [0x6f,0x6a,0x00,0x7e]
+v_sin_f32 v5, tma_hi
+// CHECK: [0x6f,0x6a,0x0a,0x7e]
 
-v_sin_f32 v0, ttmp11
-// CHECK: [0x7b,0x6a,0x00,0x7e]
+v_sin_f32 v5, ttmp11
+// CHECK: [0x7b,0x6a,0x0a,0x7e]
 
-v_sin_f32 v0, m0
-// CHECK: [0x7c,0x6a,0x00,0x7e]
+v_sin_f32 v5, m0
+// CHECK: [0x7c,0x6a,0x0a,0x7e]
 
-v_sin_f32 v0, exec_lo
-// CHECK: [0x7e,0x6a,0x00,0x7e]
+v_sin_f32 v5, exec_lo
+// CHECK: [0x7e,0x6a,0x0a,0x7e]
 
-v_sin_f32 v0, exec_hi
-// CHECK: [0x7f,0x6a,0x00,0x7e]
+v_sin_f32 v5, exec_hi
+// CHECK: [0x7f,0x6a,0x0a,0x7e]
 
-v_sin_f32 v0, 0
-// CHECK: [0x80,0x6a,0x00,0x7e]
+v_sin_f32 v5, 0
+// CHECK: [0x80,0x6a,0x0a,0x7e]
 
-v_sin_f32 v0, -1
-// CHECK: [0xc1,0x6a,0x00,0x7e]
+v_sin_f32 v5, -1
+// CHECK: [0xc1,0x6a,0x0a,0x7e]
 
-v_sin_f32 v0, 0.5
-// CHECK: [0xf0,0x6a,0x00,0x7e]
+v_sin_f32 v5, 0.5
+// CHECK: [0xf0,0x6a,0x0a,0x7e]
 
-v_sin_f32 v0, -4.0
-// CHECK: [0xf7,0x6a,0x00,0x7e]
+v_sin_f32 v5, -4.0
+// CHECK: [0xf7,0x6a,0x0a,0x7e]
 
-v_sin_f32 v0, 0xaf123456
-// CHECK: [0xff,0x6a,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_sin_f32 v5, 0xaf123456
+// CHECK: [0xff,0x6a,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_sin_f32 v0, 0x3f717273
-// CHECK: [0xff,0x6a,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_sin_f32 v5, 0x3f717273
+// CHECK: [0xff,0x6a,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_sin_f32 v0, v0
-// CHECK: [0x00,0x6b,0x00,0x7e]
+v_sin_f32 v5, v1
+// CHECK: [0x01,0x6b,0x0a,0x7e]
 
-v_sin_f32 v0, v255
-// CHECK: [0xff,0x6b,0x00,0x7e]
+v_sin_f32 v5, v255
+// CHECK: [0xff,0x6b,0x0a,0x7e]
 
-v_sin_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x6a,0xd3,0x00,0x00,0x00,0x00]
+v_sin_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x6a,0xd3,0x01,0x00,0x00,0x00]
 
-v_sin_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x6a,0xd3,0x00,0x00,0x00,0x00]
+v_sin_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x6a,0xd3,0x01,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x6a,0xd3,0x67,0x00,0x00,0x00]
+v_sin_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x6a,0xd3,0x67,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x6a,0xd3,0x68,0x00,0x00,0x00]
+v_sin_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x6a,0xd3,0x68,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x6a,0xd3,0x69,0x00,0x00,0x00]
+v_sin_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x6a,0xd3,0x69,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x6a,0xd3,0x6a,0x00,0x00,0x00]
+v_sin_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x6a,0xd3,0x6a,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x6a,0xd3,0x6b,0x00,0x00,0x00]
+v_sin_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x6a,0xd3,0x6b,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x6a,0xd3,0x6c,0x00,0x00,0x00]
+v_sin_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x6a,0xd3,0x6c,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x6a,0xd3,0x6d,0x00,0x00,0x00]
+v_sin_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x6a,0xd3,0x6d,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x6a,0xd3,0x6e,0x00,0x00,0x00]
+v_sin_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x6a,0xd3,0x6e,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x6a,0xd3,0x6f,0x00,0x00,0x00]
+v_sin_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x6a,0xd3,0x6f,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x6a,0xd3,0x7b,0x00,0x00,0x00]
+v_sin_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x6a,0xd3,0x7b,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x6a,0xd3,0x7c,0x00,0x00,0x00]
+v_sin_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x6a,0xd3,0x7c,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x6a,0xd3,0x7e,0x00,0x00,0x00]
+v_sin_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x6a,0xd3,0x7e,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x6a,0xd3,0x7f,0x00,0x00,0x00]
+v_sin_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x6a,0xd3,0x7f,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x6a,0xd3,0xfd,0x00,0x00,0x00]
+v_sin_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x6a,0xd3,0xfd,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x6a,0xd3,0x00,0x01,0x00,0x00]
+v_sin_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x6a,0xd3,0x01,0x01,0x00,0x00]
 
-v_sin_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x6a,0xd3,0xff,0x01,0x00,0x00]
+v_sin_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x6a,0xd3,0xff,0x01,0x00,0x00]
 
-v_sin_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x6a,0xd3,0x00,0x00,0x00,0x20]
+v_sin_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x6a,0xd3,0x01,0x00,0x00,0x20]
 
-v_sin_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x6a,0xd3,0x00,0x00,0x00,0x00]
+v_sin_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x6a,0xd3,0x01,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x08,0x6a,0xd3,0x00,0x00,0x00,0x00]
+v_sin_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x08,0x6a,0xd3,0x01,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x6a,0xd3,0x00,0x00,0x00,0x08]
+v_sin_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x6a,0xd3,0x01,0x00,0x00,0x08]
 
-v_sin_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x6a,0xd3,0x00,0x00,0x00,0x10]
+v_sin_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x6a,0xd3,0x01,0x00,0x00,0x10]
 
-v_sin_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x6a,0xd3,0x00,0x00,0x00,0x18]
+v_sin_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x6a,0xd3,0x01,0x00,0x00,0x18]
 
-v_cos_f32 v0, s0
-// CHECK: [0x00,0x6c,0x00,0x7e]
+v_cos_f32 v5, s1
+// CHECK: [0x01,0x6c,0x0a,0x7e]
 
-v_cos_f32 v255, s0
-// CHECK: [0x00,0x6c,0xfe,0x7f]
+v_cos_f32 v255, s1
+// CHECK: [0x01,0x6c,0xfe,0x7f]
 
-v_cos_f32 v0, s103
-// CHECK: [0x67,0x6c,0x00,0x7e]
+v_cos_f32 v5, s103
+// CHECK: [0x67,0x6c,0x0a,0x7e]
 
-v_cos_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x6c,0x00,0x7e]
+v_cos_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x6c,0x0a,0x7e]
 
-v_cos_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x6c,0x00,0x7e]
+v_cos_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x6c,0x0a,0x7e]
 
-v_cos_f32 v0, vcc_lo
-// CHECK: [0x6a,0x6c,0x00,0x7e]
+v_cos_f32 v5, vcc_lo
+// CHECK: [0x6a,0x6c,0x0a,0x7e]
 
-v_cos_f32 v0, vcc_hi
-// CHECK: [0x6b,0x6c,0x00,0x7e]
+v_cos_f32 v5, vcc_hi
+// CHECK: [0x6b,0x6c,0x0a,0x7e]
 
-v_cos_f32 v0, tba_lo
-// CHECK: [0x6c,0x6c,0x00,0x7e]
+v_cos_f32 v5, tba_lo
+// CHECK: [0x6c,0x6c,0x0a,0x7e]
 
-v_cos_f32 v0, tba_hi
-// CHECK: [0x6d,0x6c,0x00,0x7e]
+v_cos_f32 v5, tba_hi
+// CHECK: [0x6d,0x6c,0x0a,0x7e]
 
-v_cos_f32 v0, tma_lo
-// CHECK: [0x6e,0x6c,0x00,0x7e]
+v_cos_f32 v5, tma_lo
+// CHECK: [0x6e,0x6c,0x0a,0x7e]
 
-v_cos_f32 v0, tma_hi
-// CHECK: [0x6f,0x6c,0x00,0x7e]
+v_cos_f32 v5, tma_hi
+// CHECK: [0x6f,0x6c,0x0a,0x7e]
 
-v_cos_f32 v0, ttmp11
-// CHECK: [0x7b,0x6c,0x00,0x7e]
+v_cos_f32 v5, ttmp11
+// CHECK: [0x7b,0x6c,0x0a,0x7e]
 
-v_cos_f32 v0, m0
-// CHECK: [0x7c,0x6c,0x00,0x7e]
+v_cos_f32 v5, m0
+// CHECK: [0x7c,0x6c,0x0a,0x7e]
 
-v_cos_f32 v0, exec_lo
-// CHECK: [0x7e,0x6c,0x00,0x7e]
+v_cos_f32 v5, exec_lo
+// CHECK: [0x7e,0x6c,0x0a,0x7e]
 
-v_cos_f32 v0, exec_hi
-// CHECK: [0x7f,0x6c,0x00,0x7e]
+v_cos_f32 v5, exec_hi
+// CHECK: [0x7f,0x6c,0x0a,0x7e]
 
-v_cos_f32 v0, 0
-// CHECK: [0x80,0x6c,0x00,0x7e]
+v_cos_f32 v5, 0
+// CHECK: [0x80,0x6c,0x0a,0x7e]
 
-v_cos_f32 v0, -1
-// CHECK: [0xc1,0x6c,0x00,0x7e]
+v_cos_f32 v5, -1
+// CHECK: [0xc1,0x6c,0x0a,0x7e]
 
-v_cos_f32 v0, 0.5
-// CHECK: [0xf0,0x6c,0x00,0x7e]
+v_cos_f32 v5, 0.5
+// CHECK: [0xf0,0x6c,0x0a,0x7e]
 
-v_cos_f32 v0, -4.0
-// CHECK: [0xf7,0x6c,0x00,0x7e]
+v_cos_f32 v5, -4.0
+// CHECK: [0xf7,0x6c,0x0a,0x7e]
 
-v_cos_f32 v0, 0xaf123456
-// CHECK: [0xff,0x6c,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cos_f32 v5, 0xaf123456
+// CHECK: [0xff,0x6c,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cos_f32 v0, 0x3f717273
-// CHECK: [0xff,0x6c,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cos_f32 v5, 0x3f717273
+// CHECK: [0xff,0x6c,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cos_f32 v0, v0
-// CHECK: [0x00,0x6d,0x00,0x7e]
+v_cos_f32 v5, v1
+// CHECK: [0x01,0x6d,0x0a,0x7e]
 
-v_cos_f32 v0, v255
-// CHECK: [0xff,0x6d,0x00,0x7e]
+v_cos_f32 v5, v255
+// CHECK: [0xff,0x6d,0x0a,0x7e]
 
-v_cos_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x6c,0xd3,0x00,0x00,0x00,0x00]
+v_cos_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x6c,0xd3,0x01,0x00,0x00,0x00]
 
-v_cos_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x6c,0xd3,0x00,0x00,0x00,0x00]
+v_cos_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x6c,0xd3,0x01,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x6c,0xd3,0x67,0x00,0x00,0x00]
+v_cos_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x6c,0xd3,0x67,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x6c,0xd3,0x68,0x00,0x00,0x00]
+v_cos_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x6c,0xd3,0x68,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x6c,0xd3,0x69,0x00,0x00,0x00]
+v_cos_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x6c,0xd3,0x69,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x6c,0xd3,0x6a,0x00,0x00,0x00]
+v_cos_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x6c,0xd3,0x6a,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x6c,0xd3,0x6b,0x00,0x00,0x00]
+v_cos_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x6c,0xd3,0x6b,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x6c,0xd3,0x6c,0x00,0x00,0x00]
+v_cos_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x6c,0xd3,0x6c,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x6c,0xd3,0x6d,0x00,0x00,0x00]
+v_cos_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x6c,0xd3,0x6d,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x6c,0xd3,0x6e,0x00,0x00,0x00]
+v_cos_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x6c,0xd3,0x6e,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x6c,0xd3,0x6f,0x00,0x00,0x00]
+v_cos_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x6c,0xd3,0x6f,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x6c,0xd3,0x7b,0x00,0x00,0x00]
+v_cos_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x6c,0xd3,0x7b,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x6c,0xd3,0x7c,0x00,0x00,0x00]
+v_cos_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x6c,0xd3,0x7c,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x6c,0xd3,0x7e,0x00,0x00,0x00]
+v_cos_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x6c,0xd3,0x7e,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x6c,0xd3,0x7f,0x00,0x00,0x00]
+v_cos_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x6c,0xd3,0x7f,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x6c,0xd3,0xfd,0x00,0x00,0x00]
+v_cos_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x6c,0xd3,0xfd,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x6c,0xd3,0x00,0x01,0x00,0x00]
+v_cos_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x6c,0xd3,0x01,0x01,0x00,0x00]
 
-v_cos_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x6c,0xd3,0xff,0x01,0x00,0x00]
+v_cos_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x6c,0xd3,0xff,0x01,0x00,0x00]
 
-v_cos_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x6c,0xd3,0x00,0x00,0x00,0x20]
+v_cos_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x6c,0xd3,0x01,0x00,0x00,0x20]
 
-v_cos_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x6c,0xd3,0x00,0x00,0x00,0x00]
+v_cos_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x6c,0xd3,0x01,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x08,0x6c,0xd3,0x00,0x00,0x00,0x00]
+v_cos_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x08,0x6c,0xd3,0x01,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x6c,0xd3,0x00,0x00,0x00,0x08]
+v_cos_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x6c,0xd3,0x01,0x00,0x00,0x08]
 
-v_cos_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x6c,0xd3,0x00,0x00,0x00,0x10]
+v_cos_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x6c,0xd3,0x01,0x00,0x00,0x10]
 
-v_cos_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x6c,0xd3,0x00,0x00,0x00,0x18]
+v_cos_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x6c,0xd3,0x01,0x00,0x00,0x18]
 
-v_not_b32 v0, s0
-// CHECK: [0x00,0x6e,0x00,0x7e]
+v_not_b32 v5, s1
+// CHECK: [0x01,0x6e,0x0a,0x7e]
 
-v_not_b32 v255, s0
-// CHECK: [0x00,0x6e,0xfe,0x7f]
+v_not_b32 v255, s1
+// CHECK: [0x01,0x6e,0xfe,0x7f]
 
-v_not_b32 v0, s103
-// CHECK: [0x67,0x6e,0x00,0x7e]
+v_not_b32 v5, s103
+// CHECK: [0x67,0x6e,0x0a,0x7e]
 
-v_not_b32 v0, flat_scratch_lo
-// CHECK: [0x68,0x6e,0x00,0x7e]
+v_not_b32 v5, flat_scratch_lo
+// CHECK: [0x68,0x6e,0x0a,0x7e]
 
-v_not_b32 v0, flat_scratch_hi
-// CHECK: [0x69,0x6e,0x00,0x7e]
+v_not_b32 v5, flat_scratch_hi
+// CHECK: [0x69,0x6e,0x0a,0x7e]
 
-v_not_b32 v0, vcc_lo
-// CHECK: [0x6a,0x6e,0x00,0x7e]
+v_not_b32 v5, vcc_lo
+// CHECK: [0x6a,0x6e,0x0a,0x7e]
 
-v_not_b32 v0, vcc_hi
-// CHECK: [0x6b,0x6e,0x00,0x7e]
+v_not_b32 v5, vcc_hi
+// CHECK: [0x6b,0x6e,0x0a,0x7e]
 
-v_not_b32 v0, tba_lo
-// CHECK: [0x6c,0x6e,0x00,0x7e]
+v_not_b32 v5, tba_lo
+// CHECK: [0x6c,0x6e,0x0a,0x7e]
 
-v_not_b32 v0, tba_hi
-// CHECK: [0x6d,0x6e,0x00,0x7e]
+v_not_b32 v5, tba_hi
+// CHECK: [0x6d,0x6e,0x0a,0x7e]
 
-v_not_b32 v0, tma_lo
-// CHECK: [0x6e,0x6e,0x00,0x7e]
+v_not_b32 v5, tma_lo
+// CHECK: [0x6e,0x6e,0x0a,0x7e]
 
-v_not_b32 v0, tma_hi
-// CHECK: [0x6f,0x6e,0x00,0x7e]
+v_not_b32 v5, tma_hi
+// CHECK: [0x6f,0x6e,0x0a,0x7e]
 
-v_not_b32 v0, ttmp11
-// CHECK: [0x7b,0x6e,0x00,0x7e]
+v_not_b32 v5, ttmp11
+// CHECK: [0x7b,0x6e,0x0a,0x7e]
 
-v_not_b32 v0, m0
-// CHECK: [0x7c,0x6e,0x00,0x7e]
+v_not_b32 v5, m0
+// CHECK: [0x7c,0x6e,0x0a,0x7e]
 
-v_not_b32 v0, exec_lo
-// CHECK: [0x7e,0x6e,0x00,0x7e]
+v_not_b32 v5, exec_lo
+// CHECK: [0x7e,0x6e,0x0a,0x7e]
 
-v_not_b32 v0, exec_hi
-// CHECK: [0x7f,0x6e,0x00,0x7e]
+v_not_b32 v5, exec_hi
+// CHECK: [0x7f,0x6e,0x0a,0x7e]
 
-v_not_b32 v0, 0
-// CHECK: [0x80,0x6e,0x00,0x7e]
+v_not_b32 v5, 0
+// CHECK: [0x80,0x6e,0x0a,0x7e]
 
-v_not_b32 v0, -1
-// CHECK: [0xc1,0x6e,0x00,0x7e]
+v_not_b32 v5, -1
+// CHECK: [0xc1,0x6e,0x0a,0x7e]
 
-v_not_b32 v0, 0.5
-// CHECK: [0xf0,0x6e,0x00,0x7e]
+v_not_b32 v5, 0.5
+// CHECK: [0xf0,0x6e,0x0a,0x7e]
 
-v_not_b32 v0, -4.0
-// CHECK: [0xf7,0x6e,0x00,0x7e]
+v_not_b32 v5, -4.0
+// CHECK: [0xf7,0x6e,0x0a,0x7e]
 
-v_not_b32 v0, 0xaf123456
-// CHECK: [0xff,0x6e,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_not_b32 v5, 0xaf123456
+// CHECK: [0xff,0x6e,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_not_b32 v0, 0x3f717273
-// CHECK: [0xff,0x6e,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_not_b32 v5, 0x3f717273
+// CHECK: [0xff,0x6e,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_not_b32 v0, v0
-// CHECK: [0x00,0x6f,0x00,0x7e]
+v_not_b32 v5, v1
+// CHECK: [0x01,0x6f,0x0a,0x7e]
 
-v_not_b32 v0, v255
-// CHECK: [0xff,0x6f,0x00,0x7e]
+v_not_b32 v5, v255
+// CHECK: [0xff,0x6f,0x0a,0x7e]
 
-v_not_b32_e64 v0, s0
-// CHECK: [0x00,0x00,0x6e,0xd3,0x00,0x00,0x00,0x00]
+v_not_b32_e64 v5, s1
+// CHECK: [0x05,0x00,0x6e,0xd3,0x01,0x00,0x00,0x00]
 
-v_not_b32_e64 v255, s0
-// CHECK: [0xff,0x00,0x6e,0xd3,0x00,0x00,0x00,0x00]
+v_not_b32_e64 v255, s1
+// CHECK: [0xff,0x00,0x6e,0xd3,0x01,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, s103
-// CHECK: [0x00,0x00,0x6e,0xd3,0x67,0x00,0x00,0x00]
+v_not_b32_e64 v5, s103
+// CHECK: [0x05,0x00,0x6e,0xd3,0x67,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x6e,0xd3,0x68,0x00,0x00,0x00]
+v_not_b32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x6e,0xd3,0x68,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x6e,0xd3,0x69,0x00,0x00,0x00]
+v_not_b32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x6e,0xd3,0x69,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x6e,0xd3,0x6a,0x00,0x00,0x00]
+v_not_b32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x6e,0xd3,0x6a,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x6e,0xd3,0x6b,0x00,0x00,0x00]
+v_not_b32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x6e,0xd3,0x6b,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x6e,0xd3,0x6c,0x00,0x00,0x00]
+v_not_b32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x6e,0xd3,0x6c,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x6e,0xd3,0x6d,0x00,0x00,0x00]
+v_not_b32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x6e,0xd3,0x6d,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x6e,0xd3,0x6e,0x00,0x00,0x00]
+v_not_b32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x6e,0xd3,0x6e,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x6e,0xd3,0x6f,0x00,0x00,0x00]
+v_not_b32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x6e,0xd3,0x6f,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x6e,0xd3,0x7b,0x00,0x00,0x00]
+v_not_b32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x6e,0xd3,0x7b,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, m0
-// CHECK: [0x00,0x00,0x6e,0xd3,0x7c,0x00,0x00,0x00]
+v_not_b32_e64 v5, m0
+// CHECK: [0x05,0x00,0x6e,0xd3,0x7c,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x6e,0xd3,0x7e,0x00,0x00,0x00]
+v_not_b32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x6e,0xd3,0x7e,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x6e,0xd3,0x7f,0x00,0x00,0x00]
+v_not_b32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x6e,0xd3,0x7f,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, 0
-// CHECK: [0x00,0x00,0x6e,0xd3,0x80,0x00,0x00,0x00]
+v_not_b32_e64 v5, 0
+// CHECK: [0x05,0x00,0x6e,0xd3,0x80,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, -1
-// CHECK: [0x00,0x00,0x6e,0xd3,0xc1,0x00,0x00,0x00]
+v_not_b32_e64 v5, -1
+// CHECK: [0x05,0x00,0x6e,0xd3,0xc1,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x6e,0xd3,0xf0,0x00,0x00,0x00]
+v_not_b32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x6e,0xd3,0xf0,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x6e,0xd3,0xf7,0x00,0x00,0x00]
+v_not_b32_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x6e,0xd3,0xf7,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, v0
-// CHECK: [0x00,0x00,0x6e,0xd3,0x00,0x01,0x00,0x00]
+v_not_b32_e64 v5, v1
+// CHECK: [0x05,0x00,0x6e,0xd3,0x01,0x01,0x00,0x00]
 
-v_not_b32_e64 v0, v255
-// CHECK: [0x00,0x00,0x6e,0xd3,0xff,0x01,0x00,0x00]
+v_not_b32_e64 v5, v255
+// CHECK: [0x05,0x00,0x6e,0xd3,0xff,0x01,0x00,0x00]
 
-v_bfrev_b32 v0, s0
-// CHECK: [0x00,0x70,0x00,0x7e]
+v_bfrev_b32 v5, s1
+// CHECK: [0x01,0x70,0x0a,0x7e]
 
-v_bfrev_b32 v255, s0
-// CHECK: [0x00,0x70,0xfe,0x7f]
+v_bfrev_b32 v255, s1
+// CHECK: [0x01,0x70,0xfe,0x7f]
 
-v_bfrev_b32 v0, s103
-// CHECK: [0x67,0x70,0x00,0x7e]
+v_bfrev_b32 v5, s103
+// CHECK: [0x67,0x70,0x0a,0x7e]
 
-v_bfrev_b32 v0, flat_scratch_lo
-// CHECK: [0x68,0x70,0x00,0x7e]
+v_bfrev_b32 v5, flat_scratch_lo
+// CHECK: [0x68,0x70,0x0a,0x7e]
 
-v_bfrev_b32 v0, flat_scratch_hi
-// CHECK: [0x69,0x70,0x00,0x7e]
+v_bfrev_b32 v5, flat_scratch_hi
+// CHECK: [0x69,0x70,0x0a,0x7e]
 
-v_bfrev_b32 v0, vcc_lo
-// CHECK: [0x6a,0x70,0x00,0x7e]
+v_bfrev_b32 v5, vcc_lo
+// CHECK: [0x6a,0x70,0x0a,0x7e]
 
-v_bfrev_b32 v0, vcc_hi
-// CHECK: [0x6b,0x70,0x00,0x7e]
+v_bfrev_b32 v5, vcc_hi
+// CHECK: [0x6b,0x70,0x0a,0x7e]
 
-v_bfrev_b32 v0, tba_lo
-// CHECK: [0x6c,0x70,0x00,0x7e]
+v_bfrev_b32 v5, tba_lo
+// CHECK: [0x6c,0x70,0x0a,0x7e]
 
-v_bfrev_b32 v0, tba_hi
-// CHECK: [0x6d,0x70,0x00,0x7e]
+v_bfrev_b32 v5, tba_hi
+// CHECK: [0x6d,0x70,0x0a,0x7e]
 
-v_bfrev_b32 v0, tma_lo
-// CHECK: [0x6e,0x70,0x00,0x7e]
+v_bfrev_b32 v5, tma_lo
+// CHECK: [0x6e,0x70,0x0a,0x7e]
 
-v_bfrev_b32 v0, tma_hi
-// CHECK: [0x6f,0x70,0x00,0x7e]
+v_bfrev_b32 v5, tma_hi
+// CHECK: [0x6f,0x70,0x0a,0x7e]
 
-v_bfrev_b32 v0, ttmp11
-// CHECK: [0x7b,0x70,0x00,0x7e]
+v_bfrev_b32 v5, ttmp11
+// CHECK: [0x7b,0x70,0x0a,0x7e]
 
-v_bfrev_b32 v0, m0
-// CHECK: [0x7c,0x70,0x00,0x7e]
+v_bfrev_b32 v5, m0
+// CHECK: [0x7c,0x70,0x0a,0x7e]
 
-v_bfrev_b32 v0, exec_lo
-// CHECK: [0x7e,0x70,0x00,0x7e]
+v_bfrev_b32 v5, exec_lo
+// CHECK: [0x7e,0x70,0x0a,0x7e]
 
-v_bfrev_b32 v0, exec_hi
-// CHECK: [0x7f,0x70,0x00,0x7e]
+v_bfrev_b32 v5, exec_hi
+// CHECK: [0x7f,0x70,0x0a,0x7e]
 
-v_bfrev_b32 v0, 0
-// CHECK: [0x80,0x70,0x00,0x7e]
+v_bfrev_b32 v5, 0
+// CHECK: [0x80,0x70,0x0a,0x7e]
 
-v_bfrev_b32 v0, -1
-// CHECK: [0xc1,0x70,0x00,0x7e]
+v_bfrev_b32 v5, -1
+// CHECK: [0xc1,0x70,0x0a,0x7e]
 
-v_bfrev_b32 v0, 0.5
-// CHECK: [0xf0,0x70,0x00,0x7e]
+v_bfrev_b32 v5, 0.5
+// CHECK: [0xf0,0x70,0x0a,0x7e]
 
-v_bfrev_b32 v0, -4.0
-// CHECK: [0xf7,0x70,0x00,0x7e]
+v_bfrev_b32 v5, -4.0
+// CHECK: [0xf7,0x70,0x0a,0x7e]
 
-v_bfrev_b32 v0, 0xaf123456
-// CHECK: [0xff,0x70,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_bfrev_b32 v5, 0xaf123456
+// CHECK: [0xff,0x70,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_bfrev_b32 v0, 0x3f717273
-// CHECK: [0xff,0x70,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_bfrev_b32 v5, 0x3f717273
+// CHECK: [0xff,0x70,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_bfrev_b32 v0, v0
-// CHECK: [0x00,0x71,0x00,0x7e]
+v_bfrev_b32 v5, v1
+// CHECK: [0x01,0x71,0x0a,0x7e]
 
-v_bfrev_b32 v0, v255
-// CHECK: [0xff,0x71,0x00,0x7e]
+v_bfrev_b32 v5, v255
+// CHECK: [0xff,0x71,0x0a,0x7e]
 
-v_bfrev_b32_e64 v0, s0
-// CHECK: [0x00,0x00,0x70,0xd3,0x00,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, s1
+// CHECK: [0x05,0x00,0x70,0xd3,0x01,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v255, s0
-// CHECK: [0xff,0x00,0x70,0xd3,0x00,0x00,0x00,0x00]
+v_bfrev_b32_e64 v255, s1
+// CHECK: [0xff,0x00,0x70,0xd3,0x01,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, s103
-// CHECK: [0x00,0x00,0x70,0xd3,0x67,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, s103
+// CHECK: [0x05,0x00,0x70,0xd3,0x67,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x70,0xd3,0x68,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x70,0xd3,0x68,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x70,0xd3,0x69,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x70,0xd3,0x69,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x70,0xd3,0x6a,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x70,0xd3,0x6a,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x70,0xd3,0x6b,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x70,0xd3,0x6b,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x70,0xd3,0x6c,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x70,0xd3,0x6c,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x70,0xd3,0x6d,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x70,0xd3,0x6d,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x70,0xd3,0x6e,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x70,0xd3,0x6e,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x70,0xd3,0x6f,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x70,0xd3,0x6f,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x70,0xd3,0x7b,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x70,0xd3,0x7b,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, m0
-// CHECK: [0x00,0x00,0x70,0xd3,0x7c,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, m0
+// CHECK: [0x05,0x00,0x70,0xd3,0x7c,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x70,0xd3,0x7e,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x70,0xd3,0x7e,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x70,0xd3,0x7f,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x70,0xd3,0x7f,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, 0
-// CHECK: [0x00,0x00,0x70,0xd3,0x80,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, 0
+// CHECK: [0x05,0x00,0x70,0xd3,0x80,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, -1
-// CHECK: [0x00,0x00,0x70,0xd3,0xc1,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, -1
+// CHECK: [0x05,0x00,0x70,0xd3,0xc1,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x70,0xd3,0xf0,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x70,0xd3,0xf0,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x70,0xd3,0xf7,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x70,0xd3,0xf7,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, v0
-// CHECK: [0x00,0x00,0x70,0xd3,0x00,0x01,0x00,0x00]
+v_bfrev_b32_e64 v5, v1
+// CHECK: [0x05,0x00,0x70,0xd3,0x01,0x01,0x00,0x00]
 
-v_bfrev_b32_e64 v0, v255
-// CHECK: [0x00,0x00,0x70,0xd3,0xff,0x01,0x00,0x00]
+v_bfrev_b32_e64 v5, v255
+// CHECK: [0x05,0x00,0x70,0xd3,0xff,0x01,0x00,0x00]
 
-v_ffbh_u32 v0, s0
-// CHECK: [0x00,0x72,0x00,0x7e]
+v_ffbh_u32 v5, s1
+// CHECK: [0x01,0x72,0x0a,0x7e]
 
-v_ffbh_u32 v255, s0
-// CHECK: [0x00,0x72,0xfe,0x7f]
+v_ffbh_u32 v255, s1
+// CHECK: [0x01,0x72,0xfe,0x7f]
 
-v_ffbh_u32 v0, s103
-// CHECK: [0x67,0x72,0x00,0x7e]
+v_ffbh_u32 v5, s103
+// CHECK: [0x67,0x72,0x0a,0x7e]
 
-v_ffbh_u32 v0, flat_scratch_lo
-// CHECK: [0x68,0x72,0x00,0x7e]
+v_ffbh_u32 v5, flat_scratch_lo
+// CHECK: [0x68,0x72,0x0a,0x7e]
 
-v_ffbh_u32 v0, flat_scratch_hi
-// CHECK: [0x69,0x72,0x00,0x7e]
+v_ffbh_u32 v5, flat_scratch_hi
+// CHECK: [0x69,0x72,0x0a,0x7e]
 
-v_ffbh_u32 v0, vcc_lo
-// CHECK: [0x6a,0x72,0x00,0x7e]
+v_ffbh_u32 v5, vcc_lo
+// CHECK: [0x6a,0x72,0x0a,0x7e]
 
-v_ffbh_u32 v0, vcc_hi
-// CHECK: [0x6b,0x72,0x00,0x7e]
+v_ffbh_u32 v5, vcc_hi
+// CHECK: [0x6b,0x72,0x0a,0x7e]
 
-v_ffbh_u32 v0, tba_lo
-// CHECK: [0x6c,0x72,0x00,0x7e]
+v_ffbh_u32 v5, tba_lo
+// CHECK: [0x6c,0x72,0x0a,0x7e]
 
-v_ffbh_u32 v0, tba_hi
-// CHECK: [0x6d,0x72,0x00,0x7e]
+v_ffbh_u32 v5, tba_hi
+// CHECK: [0x6d,0x72,0x0a,0x7e]
 
-v_ffbh_u32 v0, tma_lo
-// CHECK: [0x6e,0x72,0x00,0x7e]
+v_ffbh_u32 v5, tma_lo
+// CHECK: [0x6e,0x72,0x0a,0x7e]
 
-v_ffbh_u32 v0, tma_hi
-// CHECK: [0x6f,0x72,0x00,0x7e]
+v_ffbh_u32 v5, tma_hi
+// CHECK: [0x6f,0x72,0x0a,0x7e]
 
-v_ffbh_u32 v0, ttmp11
-// CHECK: [0x7b,0x72,0x00,0x7e]
+v_ffbh_u32 v5, ttmp11
+// CHECK: [0x7b,0x72,0x0a,0x7e]
 
-v_ffbh_u32 v0, m0
-// CHECK: [0x7c,0x72,0x00,0x7e]
+v_ffbh_u32 v5, m0
+// CHECK: [0x7c,0x72,0x0a,0x7e]
 
-v_ffbh_u32 v0, exec_lo
-// CHECK: [0x7e,0x72,0x00,0x7e]
+v_ffbh_u32 v5, exec_lo
+// CHECK: [0x7e,0x72,0x0a,0x7e]
 
-v_ffbh_u32 v0, exec_hi
-// CHECK: [0x7f,0x72,0x00,0x7e]
+v_ffbh_u32 v5, exec_hi
+// CHECK: [0x7f,0x72,0x0a,0x7e]
 
-v_ffbh_u32 v0, 0
-// CHECK: [0x80,0x72,0x00,0x7e]
+v_ffbh_u32 v5, 0
+// CHECK: [0x80,0x72,0x0a,0x7e]
 
-v_ffbh_u32 v0, -1
-// CHECK: [0xc1,0x72,0x00,0x7e]
+v_ffbh_u32 v5, -1
+// CHECK: [0xc1,0x72,0x0a,0x7e]
 
-v_ffbh_u32 v0, 0.5
-// CHECK: [0xf0,0x72,0x00,0x7e]
+v_ffbh_u32 v5, 0.5
+// CHECK: [0xf0,0x72,0x0a,0x7e]
 
-v_ffbh_u32 v0, -4.0
-// CHECK: [0xf7,0x72,0x00,0x7e]
+v_ffbh_u32 v5, -4.0
+// CHECK: [0xf7,0x72,0x0a,0x7e]
 
-v_ffbh_u32 v0, 0xaf123456
-// CHECK: [0xff,0x72,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_ffbh_u32 v5, 0xaf123456
+// CHECK: [0xff,0x72,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_ffbh_u32 v0, 0x3f717273
-// CHECK: [0xff,0x72,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_ffbh_u32 v5, 0x3f717273
+// CHECK: [0xff,0x72,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_ffbh_u32 v0, v0
-// CHECK: [0x00,0x73,0x00,0x7e]
+v_ffbh_u32 v5, v1
+// CHECK: [0x01,0x73,0x0a,0x7e]
 
-v_ffbh_u32 v0, v255
-// CHECK: [0xff,0x73,0x00,0x7e]
+v_ffbh_u32 v5, v255
+// CHECK: [0xff,0x73,0x0a,0x7e]
 
-v_ffbh_u32_e64 v0, s0
-// CHECK: [0x00,0x00,0x72,0xd3,0x00,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, s1
+// CHECK: [0x05,0x00,0x72,0xd3,0x01,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v255, s0
-// CHECK: [0xff,0x00,0x72,0xd3,0x00,0x00,0x00,0x00]
+v_ffbh_u32_e64 v255, s1
+// CHECK: [0xff,0x00,0x72,0xd3,0x01,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, s103
-// CHECK: [0x00,0x00,0x72,0xd3,0x67,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, s103
+// CHECK: [0x05,0x00,0x72,0xd3,0x67,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x72,0xd3,0x68,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x72,0xd3,0x68,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x72,0xd3,0x69,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x72,0xd3,0x69,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x72,0xd3,0x6a,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x72,0xd3,0x6a,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x72,0xd3,0x6b,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x72,0xd3,0x6b,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x72,0xd3,0x6c,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x72,0xd3,0x6c,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x72,0xd3,0x6d,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x72,0xd3,0x6d,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x72,0xd3,0x6e,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x72,0xd3,0x6e,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x72,0xd3,0x6f,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x72,0xd3,0x6f,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x72,0xd3,0x7b,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x72,0xd3,0x7b,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, m0
-// CHECK: [0x00,0x00,0x72,0xd3,0x7c,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, m0
+// CHECK: [0x05,0x00,0x72,0xd3,0x7c,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x72,0xd3,0x7e,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x72,0xd3,0x7e,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x72,0xd3,0x7f,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x72,0xd3,0x7f,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, 0
-// CHECK: [0x00,0x00,0x72,0xd3,0x80,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, 0
+// CHECK: [0x05,0x00,0x72,0xd3,0x80,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, -1
-// CHECK: [0x00,0x00,0x72,0xd3,0xc1,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, -1
+// CHECK: [0x05,0x00,0x72,0xd3,0xc1,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x72,0xd3,0xf0,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x72,0xd3,0xf0,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x72,0xd3,0xf7,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x72,0xd3,0xf7,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, v0
-// CHECK: [0x00,0x00,0x72,0xd3,0x00,0x01,0x00,0x00]
+v_ffbh_u32_e64 v5, v1
+// CHECK: [0x05,0x00,0x72,0xd3,0x01,0x01,0x00,0x00]
 
-v_ffbh_u32_e64 v0, v255
-// CHECK: [0x00,0x00,0x72,0xd3,0xff,0x01,0x00,0x00]
+v_ffbh_u32_e64 v5, v255
+// CHECK: [0x05,0x00,0x72,0xd3,0xff,0x01,0x00,0x00]
 
-v_ffbl_b32 v0, s0
-// CHECK: [0x00,0x74,0x00,0x7e]
+v_ffbl_b32 v5, s1
+// CHECK: [0x01,0x74,0x0a,0x7e]
 
-v_ffbl_b32 v255, s0
-// CHECK: [0x00,0x74,0xfe,0x7f]
+v_ffbl_b32 v255, s1
+// CHECK: [0x01,0x74,0xfe,0x7f]
 
-v_ffbl_b32 v0, s103
-// CHECK: [0x67,0x74,0x00,0x7e]
+v_ffbl_b32 v5, s103
+// CHECK: [0x67,0x74,0x0a,0x7e]
 
-v_ffbl_b32 v0, flat_scratch_lo
-// CHECK: [0x68,0x74,0x00,0x7e]
+v_ffbl_b32 v5, flat_scratch_lo
+// CHECK: [0x68,0x74,0x0a,0x7e]
 
-v_ffbl_b32 v0, flat_scratch_hi
-// CHECK: [0x69,0x74,0x00,0x7e]
+v_ffbl_b32 v5, flat_scratch_hi
+// CHECK: [0x69,0x74,0x0a,0x7e]
 
-v_ffbl_b32 v0, vcc_lo
-// CHECK: [0x6a,0x74,0x00,0x7e]
+v_ffbl_b32 v5, vcc_lo
+// CHECK: [0x6a,0x74,0x0a,0x7e]
 
-v_ffbl_b32 v0, vcc_hi
-// CHECK: [0x6b,0x74,0x00,0x7e]
+v_ffbl_b32 v5, vcc_hi
+// CHECK: [0x6b,0x74,0x0a,0x7e]
 
-v_ffbl_b32 v0, tba_lo
-// CHECK: [0x6c,0x74,0x00,0x7e]
+v_ffbl_b32 v5, tba_lo
+// CHECK: [0x6c,0x74,0x0a,0x7e]
 
-v_ffbl_b32 v0, tba_hi
-// CHECK: [0x6d,0x74,0x00,0x7e]
+v_ffbl_b32 v5, tba_hi
+// CHECK: [0x6d,0x74,0x0a,0x7e]
 
-v_ffbl_b32 v0, tma_lo
-// CHECK: [0x6e,0x74,0x00,0x7e]
+v_ffbl_b32 v5, tma_lo
+// CHECK: [0x6e,0x74,0x0a,0x7e]
 
-v_ffbl_b32 v0, tma_hi
-// CHECK: [0x6f,0x74,0x00,0x7e]
+v_ffbl_b32 v5, tma_hi
+// CHECK: [0x6f,0x74,0x0a,0x7e]
 
-v_ffbl_b32 v0, ttmp11
-// CHECK: [0x7b,0x74,0x00,0x7e]
+v_ffbl_b32 v5, ttmp11
+// CHECK: [0x7b,0x74,0x0a,0x7e]
 
-v_ffbl_b32 v0, m0
-// CHECK: [0x7c,0x74,0x00,0x7e]
+v_ffbl_b32 v5, m0
+// CHECK: [0x7c,0x74,0x0a,0x7e]
 
-v_ffbl_b32 v0, exec_lo
-// CHECK: [0x7e,0x74,0x00,0x7e]
+v_ffbl_b32 v5, exec_lo
+// CHECK: [0x7e,0x74,0x0a,0x7e]
 
-v_ffbl_b32 v0, exec_hi
-// CHECK: [0x7f,0x74,0x00,0x7e]
+v_ffbl_b32 v5, exec_hi
+// CHECK: [0x7f,0x74,0x0a,0x7e]
 
-v_ffbl_b32 v0, 0
-// CHECK: [0x80,0x74,0x00,0x7e]
+v_ffbl_b32 v5, 0
+// CHECK: [0x80,0x74,0x0a,0x7e]
 
-v_ffbl_b32 v0, -1
-// CHECK: [0xc1,0x74,0x00,0x7e]
+v_ffbl_b32 v5, -1
+// CHECK: [0xc1,0x74,0x0a,0x7e]
 
-v_ffbl_b32 v0, 0.5
-// CHECK: [0xf0,0x74,0x00,0x7e]
+v_ffbl_b32 v5, 0.5
+// CHECK: [0xf0,0x74,0x0a,0x7e]
 
-v_ffbl_b32 v0, -4.0
-// CHECK: [0xf7,0x74,0x00,0x7e]
+v_ffbl_b32 v5, -4.0
+// CHECK: [0xf7,0x74,0x0a,0x7e]
 
-v_ffbl_b32 v0, 0xaf123456
-// CHECK: [0xff,0x74,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_ffbl_b32 v5, 0xaf123456
+// CHECK: [0xff,0x74,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_ffbl_b32 v0, 0x3f717273
-// CHECK: [0xff,0x74,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_ffbl_b32 v5, 0x3f717273
+// CHECK: [0xff,0x74,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_ffbl_b32 v0, v0
-// CHECK: [0x00,0x75,0x00,0x7e]
+v_ffbl_b32 v5, v1
+// CHECK: [0x01,0x75,0x0a,0x7e]
 
-v_ffbl_b32 v0, v255
-// CHECK: [0xff,0x75,0x00,0x7e]
+v_ffbl_b32 v5, v255
+// CHECK: [0xff,0x75,0x0a,0x7e]
 
-v_ffbl_b32_e64 v0, s0
-// CHECK: [0x00,0x00,0x74,0xd3,0x00,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, s1
+// CHECK: [0x05,0x00,0x74,0xd3,0x01,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v255, s0
-// CHECK: [0xff,0x00,0x74,0xd3,0x00,0x00,0x00,0x00]
+v_ffbl_b32_e64 v255, s1
+// CHECK: [0xff,0x00,0x74,0xd3,0x01,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, s103
-// CHECK: [0x00,0x00,0x74,0xd3,0x67,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, s103
+// CHECK: [0x05,0x00,0x74,0xd3,0x67,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x74,0xd3,0x68,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x74,0xd3,0x68,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x74,0xd3,0x69,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x74,0xd3,0x69,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x74,0xd3,0x6a,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x74,0xd3,0x6a,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x74,0xd3,0x6b,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x74,0xd3,0x6b,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x74,0xd3,0x6c,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x74,0xd3,0x6c,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x74,0xd3,0x6d,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x74,0xd3,0x6d,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x74,0xd3,0x6e,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x74,0xd3,0x6e,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x74,0xd3,0x6f,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x74,0xd3,0x6f,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x74,0xd3,0x7b,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x74,0xd3,0x7b,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, m0
-// CHECK: [0x00,0x00,0x74,0xd3,0x7c,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, m0
+// CHECK: [0x05,0x00,0x74,0xd3,0x7c,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x74,0xd3,0x7e,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x74,0xd3,0x7e,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x74,0xd3,0x7f,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x74,0xd3,0x7f,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, 0
-// CHECK: [0x00,0x00,0x74,0xd3,0x80,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, 0
+// CHECK: [0x05,0x00,0x74,0xd3,0x80,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, -1
-// CHECK: [0x00,0x00,0x74,0xd3,0xc1,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, -1
+// CHECK: [0x05,0x00,0x74,0xd3,0xc1,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x74,0xd3,0xf0,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x74,0xd3,0xf0,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x74,0xd3,0xf7,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x74,0xd3,0xf7,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, v0
-// CHECK: [0x00,0x00,0x74,0xd3,0x00,0x01,0x00,0x00]
+v_ffbl_b32_e64 v5, v1
+// CHECK: [0x05,0x00,0x74,0xd3,0x01,0x01,0x00,0x00]
 
-v_ffbl_b32_e64 v0, v255
-// CHECK: [0x00,0x00,0x74,0xd3,0xff,0x01,0x00,0x00]
+v_ffbl_b32_e64 v5, v255
+// CHECK: [0x05,0x00,0x74,0xd3,0xff,0x01,0x00,0x00]
 
-v_ffbh_i32 v0, s0
-// CHECK: [0x00,0x76,0x00,0x7e]
+v_ffbh_i32 v5, s1
+// CHECK: [0x01,0x76,0x0a,0x7e]
 
-v_ffbh_i32 v255, s0
-// CHECK: [0x00,0x76,0xfe,0x7f]
+v_ffbh_i32 v255, s1
+// CHECK: [0x01,0x76,0xfe,0x7f]
 
-v_ffbh_i32 v0, s103
-// CHECK: [0x67,0x76,0x00,0x7e]
+v_ffbh_i32 v5, s103
+// CHECK: [0x67,0x76,0x0a,0x7e]
 
-v_ffbh_i32 v0, flat_scratch_lo
-// CHECK: [0x68,0x76,0x00,0x7e]
+v_ffbh_i32 v5, flat_scratch_lo
+// CHECK: [0x68,0x76,0x0a,0x7e]
 
-v_ffbh_i32 v0, flat_scratch_hi
-// CHECK: [0x69,0x76,0x00,0x7e]
+v_ffbh_i32 v5, flat_scratch_hi
+// CHECK: [0x69,0x76,0x0a,0x7e]
 
-v_ffbh_i32 v0, vcc_lo
-// CHECK: [0x6a,0x76,0x00,0x7e]
+v_ffbh_i32 v5, vcc_lo
+// CHECK: [0x6a,0x76,0x0a,0x7e]
 
-v_ffbh_i32 v0, vcc_hi
-// CHECK: [0x6b,0x76,0x00,0x7e]
+v_ffbh_i32 v5, vcc_hi
+// CHECK: [0x6b,0x76,0x0a,0x7e]
 
-v_ffbh_i32 v0, tba_lo
-// CHECK: [0x6c,0x76,0x00,0x7e]
+v_ffbh_i32 v5, tba_lo
+// CHECK: [0x6c,0x76,0x0a,0x7e]
 
-v_ffbh_i32 v0, tba_hi
-// CHECK: [0x6d,0x76,0x00,0x7e]
+v_ffbh_i32 v5, tba_hi
+// CHECK: [0x6d,0x76,0x0a,0x7e]
 
-v_ffbh_i32 v0, tma_lo
-// CHECK: [0x6e,0x76,0x00,0x7e]
+v_ffbh_i32 v5, tma_lo
+// CHECK: [0x6e,0x76,0x0a,0x7e]
 
-v_ffbh_i32 v0, tma_hi
-// CHECK: [0x6f,0x76,0x00,0x7e]
+v_ffbh_i32 v5, tma_hi
+// CHECK: [0x6f,0x76,0x0a,0x7e]
 
-v_ffbh_i32 v0, ttmp11
-// CHECK: [0x7b,0x76,0x00,0x7e]
+v_ffbh_i32 v5, ttmp11
+// CHECK: [0x7b,0x76,0x0a,0x7e]
 
-v_ffbh_i32 v0, m0
-// CHECK: [0x7c,0x76,0x00,0x7e]
+v_ffbh_i32 v5, m0
+// CHECK: [0x7c,0x76,0x0a,0x7e]
 
-v_ffbh_i32 v0, exec_lo
-// CHECK: [0x7e,0x76,0x00,0x7e]
+v_ffbh_i32 v5, exec_lo
+// CHECK: [0x7e,0x76,0x0a,0x7e]
 
-v_ffbh_i32 v0, exec_hi
-// CHECK: [0x7f,0x76,0x00,0x7e]
+v_ffbh_i32 v5, exec_hi
+// CHECK: [0x7f,0x76,0x0a,0x7e]
 
-v_ffbh_i32 v0, 0
-// CHECK: [0x80,0x76,0x00,0x7e]
+v_ffbh_i32 v5, 0
+// CHECK: [0x80,0x76,0x0a,0x7e]
 
-v_ffbh_i32 v0, -1
-// CHECK: [0xc1,0x76,0x00,0x7e]
+v_ffbh_i32 v5, -1
+// CHECK: [0xc1,0x76,0x0a,0x7e]
 
-v_ffbh_i32 v0, 0.5
-// CHECK: [0xf0,0x76,0x00,0x7e]
+v_ffbh_i32 v5, 0.5
+// CHECK: [0xf0,0x76,0x0a,0x7e]
 
-v_ffbh_i32 v0, -4.0
-// CHECK: [0xf7,0x76,0x00,0x7e]
+v_ffbh_i32 v5, -4.0
+// CHECK: [0xf7,0x76,0x0a,0x7e]
 
-v_ffbh_i32 v0, 0xaf123456
-// CHECK: [0xff,0x76,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_ffbh_i32 v5, 0xaf123456
+// CHECK: [0xff,0x76,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_ffbh_i32 v0, 0x3f717273
-// CHECK: [0xff,0x76,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_ffbh_i32 v5, 0x3f717273
+// CHECK: [0xff,0x76,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_ffbh_i32 v0, v0
-// CHECK: [0x00,0x77,0x00,0x7e]
+v_ffbh_i32 v5, v1
+// CHECK: [0x01,0x77,0x0a,0x7e]
 
-v_ffbh_i32 v0, v255
-// CHECK: [0xff,0x77,0x00,0x7e]
+v_ffbh_i32 v5, v255
+// CHECK: [0xff,0x77,0x0a,0x7e]
 
-v_ffbh_i32_e64 v0, s0
-// CHECK: [0x00,0x00,0x76,0xd3,0x00,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, s1
+// CHECK: [0x05,0x00,0x76,0xd3,0x01,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v255, s0
-// CHECK: [0xff,0x00,0x76,0xd3,0x00,0x00,0x00,0x00]
+v_ffbh_i32_e64 v255, s1
+// CHECK: [0xff,0x00,0x76,0xd3,0x01,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, s103
-// CHECK: [0x00,0x00,0x76,0xd3,0x67,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, s103
+// CHECK: [0x05,0x00,0x76,0xd3,0x67,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x76,0xd3,0x68,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x76,0xd3,0x68,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x76,0xd3,0x69,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x76,0xd3,0x69,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x76,0xd3,0x6a,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x76,0xd3,0x6a,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x76,0xd3,0x6b,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x76,0xd3,0x6b,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x76,0xd3,0x6c,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x76,0xd3,0x6c,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x76,0xd3,0x6d,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x76,0xd3,0x6d,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x76,0xd3,0x6e,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x76,0xd3,0x6e,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x76,0xd3,0x6f,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x76,0xd3,0x6f,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x76,0xd3,0x7b,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x76,0xd3,0x7b,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, m0
-// CHECK: [0x00,0x00,0x76,0xd3,0x7c,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, m0
+// CHECK: [0x05,0x00,0x76,0xd3,0x7c,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x76,0xd3,0x7e,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x76,0xd3,0x7e,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x76,0xd3,0x7f,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x76,0xd3,0x7f,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, 0
-// CHECK: [0x00,0x00,0x76,0xd3,0x80,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, 0
+// CHECK: [0x05,0x00,0x76,0xd3,0x80,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, -1
-// CHECK: [0x00,0x00,0x76,0xd3,0xc1,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, -1
+// CHECK: [0x05,0x00,0x76,0xd3,0xc1,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x76,0xd3,0xf0,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x76,0xd3,0xf0,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x76,0xd3,0xf7,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x76,0xd3,0xf7,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, v0
-// CHECK: [0x00,0x00,0x76,0xd3,0x00,0x01,0x00,0x00]
+v_ffbh_i32_e64 v5, v1
+// CHECK: [0x05,0x00,0x76,0xd3,0x01,0x01,0x00,0x00]
 
-v_ffbh_i32_e64 v0, v255
-// CHECK: [0x00,0x00,0x76,0xd3,0xff,0x01,0x00,0x00]
+v_ffbh_i32_e64 v5, v255
+// CHECK: [0x05,0x00,0x76,0xd3,0xff,0x01,0x00,0x00]
 
-v_frexp_exp_i32_f64 v0, s[0:1]
-// CHECK: [0x00,0x78,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, s[2:3]
+// CHECK: [0x02,0x78,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v255, s[0:1]
-// CHECK: [0x00,0x78,0xfe,0x7f]
+v_frexp_exp_i32_f64 v255, s[2:3]
+// CHECK: [0x02,0x78,0xfe,0x7f]
 
-v_frexp_exp_i32_f64 v0, s[2:3]
-// CHECK: [0x02,0x78,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, s[4:5]
+// CHECK: [0x04,0x78,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, s[102:103]
-// CHECK: [0x66,0x78,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, s[102:103]
+// CHECK: [0x66,0x78,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, flat_scratch
-// CHECK: [0x68,0x78,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, flat_scratch
+// CHECK: [0x68,0x78,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, vcc
-// CHECK: [0x6a,0x78,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, vcc
+// CHECK: [0x6a,0x78,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, tba
-// CHECK: [0x6c,0x78,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, tba
+// CHECK: [0x6c,0x78,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, tma
-// CHECK: [0x6e,0x78,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, tma
+// CHECK: [0x6e,0x78,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, ttmp[10:11]
-// CHECK: [0x7a,0x78,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, ttmp[10:11]
+// CHECK: [0x7a,0x78,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, exec
-// CHECK: [0x7e,0x78,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, exec
+// CHECK: [0x7e,0x78,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, 0
-// CHECK: [0x80,0x78,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, 0
+// CHECK: [0x80,0x78,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, -1
-// CHECK: [0xc1,0x78,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, -1
+// CHECK: [0xc1,0x78,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, 0.5
-// CHECK: [0xf0,0x78,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, 0.5
+// CHECK: [0xf0,0x78,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, -4.0
-// CHECK: [0xf7,0x78,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, -4.0
+// CHECK: [0xf7,0x78,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, 0xaf123456
-// CHECK: [0xff,0x78,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_frexp_exp_i32_f64 v5, 0xaf123456
+// CHECK: [0xff,0x78,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_frexp_exp_i32_f64 v0, 0x3f717273
-// CHECK: [0xff,0x78,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_frexp_exp_i32_f64 v5, 0x3f717273
+// CHECK: [0xff,0x78,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_frexp_exp_i32_f64 v0, v[0:1]
-// CHECK: [0x00,0x79,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, v[1:2]
+// CHECK: [0x01,0x79,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, v[254:255]
-// CHECK: [0xfe,0x79,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, v[254:255]
+// CHECK: [0xfe,0x79,0x0a,0x7e]
 
-v_frexp_exp_i32_f64_e64 v0, s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd3,0x00,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, s[2:3]
+// CHECK: [0x05,0x00,0x78,0xd3,0x02,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v255, s[0:1]
-// CHECK: [0xff,0x00,0x78,0xd3,0x00,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v255, s[2:3]
+// CHECK: [0xff,0x00,0x78,0xd3,0x02,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, s[2:3]
-// CHECK: [0x00,0x00,0x78,0xd3,0x02,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, s[4:5]
+// CHECK: [0x05,0x00,0x78,0xd3,0x04,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, s[102:103]
-// CHECK: [0x00,0x00,0x78,0xd3,0x66,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, s[102:103]
+// CHECK: [0x05,0x00,0x78,0xd3,0x66,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, flat_scratch
-// CHECK: [0x00,0x00,0x78,0xd3,0x68,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, flat_scratch
+// CHECK: [0x05,0x00,0x78,0xd3,0x68,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, vcc
-// CHECK: [0x00,0x00,0x78,0xd3,0x6a,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, vcc
+// CHECK: [0x05,0x00,0x78,0xd3,0x6a,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, tba
-// CHECK: [0x00,0x00,0x78,0xd3,0x6c,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, tba
+// CHECK: [0x05,0x00,0x78,0xd3,0x6c,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, tma
-// CHECK: [0x00,0x00,0x78,0xd3,0x6e,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, tma
+// CHECK: [0x05,0x00,0x78,0xd3,0x6e,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, ttmp[10:11]
-// CHECK: [0x00,0x00,0x78,0xd3,0x7a,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, ttmp[10:11]
+// CHECK: [0x05,0x00,0x78,0xd3,0x7a,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, exec
-// CHECK: [0x00,0x00,0x78,0xd3,0x7e,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, exec
+// CHECK: [0x05,0x00,0x78,0xd3,0x7e,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, scc
-// CHECK: [0x00,0x00,0x78,0xd3,0xfd,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, scc
+// CHECK: [0x05,0x00,0x78,0xd3,0xfd,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x78,0xd3,0x00,0x01,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, v[1:2]
+// CHECK: [0x05,0x00,0x78,0xd3,0x01,0x01,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, v[254:255]
-// CHECK: [0x00,0x00,0x78,0xd3,0xfe,0x01,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, v[254:255]
+// CHECK: [0x05,0x00,0x78,0xd3,0xfe,0x01,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, -s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd3,0x00,0x00,0x00,0x20]
+v_frexp_exp_i32_f64_e64 v5, -s[2:3]
+// CHECK: [0x05,0x00,0x78,0xd3,0x02,0x00,0x00,0x20]
 
-v_frexp_exp_i32_f64_e64 v0, |s[0:1]|
-// CHECK: [0x00,0x01,0x78,0xd3,0x00,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, |s[2:3]|
+// CHECK: [0x05,0x01,0x78,0xd3,0x02,0x00,0x00,0x00]
 
-v_frexp_mant_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x7a,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x7a,0x0a,0x7e]
 
-v_frexp_mant_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x7a,0xfc,0x7f]
+v_frexp_mant_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x7a,0xfc,0x7f]
 
-v_frexp_mant_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x7a,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x7a,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], s[102:103]
-// CHECK: [0x66,0x7a,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], s[102:103]
+// CHECK: [0x66,0x7a,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], flat_scratch
-// CHECK: [0x68,0x7a,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], flat_scratch
+// CHECK: [0x68,0x7a,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], vcc
-// CHECK: [0x6a,0x7a,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], vcc
+// CHECK: [0x6a,0x7a,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], tba
-// CHECK: [0x6c,0x7a,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], tba
+// CHECK: [0x6c,0x7a,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], tma
-// CHECK: [0x6e,0x7a,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], tma
+// CHECK: [0x6e,0x7a,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x7a,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x7a,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], exec
-// CHECK: [0x7e,0x7a,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], exec
+// CHECK: [0x7e,0x7a,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], 0
-// CHECK: [0x80,0x7a,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], 0
+// CHECK: [0x80,0x7a,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], -1
-// CHECK: [0xc1,0x7a,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], -1
+// CHECK: [0xc1,0x7a,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x7a,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x7a,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x7a,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x7a,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x7a,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_frexp_mant_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x7a,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_frexp_mant_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x7a,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_frexp_mant_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x7a,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_frexp_mant_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x7b,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x7b,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x7b,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x7b,0x0a,0x7e]
 
-v_frexp_mant_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd3,0x00,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x7a,0xd3,0x02,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x7a,0xd3,0x00,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x7a,0xd3,0x02,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x7a,0xd3,0x02,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x7a,0xd3,0x04,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], s[102:103]
-// CHECK: [0x00,0x00,0x7a,0xd3,0x66,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], s[102:103]
+// CHECK: [0x05,0x00,0x7a,0xd3,0x66,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x7a,0xd3,0x68,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x7a,0xd3,0x68,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x7a,0xd3,0x6a,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x7a,0xd3,0x6a,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x7a,0xd3,0x6c,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x7a,0xd3,0x6c,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x7a,0xd3,0x6e,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x7a,0xd3,0x6e,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x7a,0xd3,0x7a,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x7a,0xd3,0x7a,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x7a,0xd3,0x7e,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x7a,0xd3,0x7e,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x7a,0xd3,0xfd,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x7a,0xd3,0xfd,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd3,0x00,0x01,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x7a,0xd3,0x01,0x01,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x7a,0xd3,0xfe,0x01,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x7a,0xd3,0xfe,0x01,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd3,0x00,0x00,0x00,0x20]
+v_frexp_mant_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x7a,0xd3,0x02,0x00,0x00,0x20]
 
-v_frexp_mant_f64_e64 v[0:1], |s[0:1]|
-// CHECK: [0x00,0x01,0x7a,0xd3,0x00,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], |s[2:3]|
+// CHECK: [0x05,0x01,0x7a,0xd3,0x02,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x08,0x7a,0xd3,0x00,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x08,0x7a,0xd3,0x02,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x7a,0xd3,0x00,0x00,0x00,0x08]
+v_frexp_mant_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x7a,0xd3,0x02,0x00,0x00,0x08]
 
-v_frexp_mant_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x7a,0xd3,0x00,0x00,0x00,0x10]
+v_frexp_mant_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x7a,0xd3,0x02,0x00,0x00,0x10]
 
-v_frexp_mant_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x7a,0xd3,0x00,0x00,0x00,0x18]
+v_frexp_mant_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x7a,0xd3,0x02,0x00,0x00,0x18]
 
-v_fract_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x7c,0x00,0x7e]
+v_fract_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x7c,0x0a,0x7e]
 
-v_fract_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x7c,0xfc,0x7f]
+v_fract_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x7c,0xfc,0x7f]
 
-v_fract_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x7c,0x00,0x7e]
+v_fract_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x7c,0x0a,0x7e]
 
-v_fract_f64 v[0:1], s[102:103]
-// CHECK: [0x66,0x7c,0x00,0x7e]
+v_fract_f64 v[5:6], s[102:103]
+// CHECK: [0x66,0x7c,0x0a,0x7e]
 
-v_fract_f64 v[0:1], flat_scratch
-// CHECK: [0x68,0x7c,0x00,0x7e]
+v_fract_f64 v[5:6], flat_scratch
+// CHECK: [0x68,0x7c,0x0a,0x7e]
 
-v_fract_f64 v[0:1], vcc
-// CHECK: [0x6a,0x7c,0x00,0x7e]
+v_fract_f64 v[5:6], vcc
+// CHECK: [0x6a,0x7c,0x0a,0x7e]
 
-v_fract_f64 v[0:1], tba
-// CHECK: [0x6c,0x7c,0x00,0x7e]
+v_fract_f64 v[5:6], tba
+// CHECK: [0x6c,0x7c,0x0a,0x7e]
 
-v_fract_f64 v[0:1], tma
-// CHECK: [0x6e,0x7c,0x00,0x7e]
+v_fract_f64 v[5:6], tma
+// CHECK: [0x6e,0x7c,0x0a,0x7e]
 
-v_fract_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x7c,0x00,0x7e]
+v_fract_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x7c,0x0a,0x7e]
 
-v_fract_f64 v[0:1], exec
-// CHECK: [0x7e,0x7c,0x00,0x7e]
+v_fract_f64 v[5:6], exec
+// CHECK: [0x7e,0x7c,0x0a,0x7e]
 
-v_fract_f64 v[0:1], 0
-// CHECK: [0x80,0x7c,0x00,0x7e]
+v_fract_f64 v[5:6], 0
+// CHECK: [0x80,0x7c,0x0a,0x7e]
 
-v_fract_f64 v[0:1], -1
-// CHECK: [0xc1,0x7c,0x00,0x7e]
+v_fract_f64 v[5:6], -1
+// CHECK: [0xc1,0x7c,0x0a,0x7e]
 
-v_fract_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x7c,0x00,0x7e]
+v_fract_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x7c,0x0a,0x7e]
 
-v_fract_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x7c,0x00,0x7e]
+v_fract_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x7c,0x0a,0x7e]
 
-v_fract_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x7c,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_fract_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x7c,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_fract_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x7c,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_fract_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x7c,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_fract_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x7d,0x00,0x7e]
+v_fract_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x7d,0x0a,0x7e]
 
-v_fract_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x7d,0x00,0x7e]
+v_fract_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x7d,0x0a,0x7e]
 
-v_fract_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd3,0x00,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x7c,0xd3,0x02,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x7c,0xd3,0x00,0x00,0x00,0x00]
+v_fract_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x7c,0xd3,0x02,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x7c,0xd3,0x02,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x7c,0xd3,0x04,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], s[102:103]
-// CHECK: [0x00,0x00,0x7c,0xd3,0x66,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], s[102:103]
+// CHECK: [0x05,0x00,0x7c,0xd3,0x66,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x7c,0xd3,0x68,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x7c,0xd3,0x68,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x7c,0xd3,0x6a,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x7c,0xd3,0x6a,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x7c,0xd3,0x6c,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x7c,0xd3,0x6c,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x7c,0xd3,0x6e,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x7c,0xd3,0x6e,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x7c,0xd3,0x7a,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x7c,0xd3,0x7a,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x7c,0xd3,0x7e,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x7c,0xd3,0x7e,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x7c,0xd3,0xfd,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x7c,0xd3,0xfd,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd3,0x00,0x01,0x00,0x00]
+v_fract_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x7c,0xd3,0x01,0x01,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x7c,0xd3,0xfe,0x01,0x00,0x00]
+v_fract_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x7c,0xd3,0xfe,0x01,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd3,0x00,0x00,0x00,0x20]
+v_fract_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x7c,0xd3,0x02,0x00,0x00,0x20]
 
-v_fract_f64_e64 v[0:1], |s[0:1]|
-// CHECK: [0x00,0x01,0x7c,0xd3,0x00,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], |s[2:3]|
+// CHECK: [0x05,0x01,0x7c,0xd3,0x02,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x08,0x7c,0xd3,0x00,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x08,0x7c,0xd3,0x02,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x7c,0xd3,0x00,0x00,0x00,0x08]
+v_fract_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x7c,0xd3,0x02,0x00,0x00,0x08]
 
-v_fract_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x7c,0xd3,0x00,0x00,0x00,0x10]
+v_fract_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x7c,0xd3,0x02,0x00,0x00,0x10]
 
-v_fract_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x7c,0xd3,0x00,0x00,0x00,0x18]
+v_fract_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x7c,0xd3,0x02,0x00,0x00,0x18]
 
-v_frexp_exp_i32_f32 v0, s0
-// CHECK: [0x00,0x7e,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, s1
+// CHECK: [0x01,0x7e,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v255, s0
-// CHECK: [0x00,0x7e,0xfe,0x7f]
+v_frexp_exp_i32_f32 v255, s1
+// CHECK: [0x01,0x7e,0xfe,0x7f]
 
-v_frexp_exp_i32_f32 v0, s103
-// CHECK: [0x67,0x7e,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, s103
+// CHECK: [0x67,0x7e,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x7e,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x7e,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x7e,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x7e,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, vcc_lo
-// CHECK: [0x6a,0x7e,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, vcc_lo
+// CHECK: [0x6a,0x7e,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, vcc_hi
-// CHECK: [0x6b,0x7e,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, vcc_hi
+// CHECK: [0x6b,0x7e,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, tba_lo
-// CHECK: [0x6c,0x7e,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, tba_lo
+// CHECK: [0x6c,0x7e,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, tba_hi
-// CHECK: [0x6d,0x7e,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, tba_hi
+// CHECK: [0x6d,0x7e,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, tma_lo
-// CHECK: [0x6e,0x7e,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, tma_lo
+// CHECK: [0x6e,0x7e,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, tma_hi
-// CHECK: [0x6f,0x7e,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, tma_hi
+// CHECK: [0x6f,0x7e,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, ttmp11
-// CHECK: [0x7b,0x7e,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, ttmp11
+// CHECK: [0x7b,0x7e,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, m0
-// CHECK: [0x7c,0x7e,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, m0
+// CHECK: [0x7c,0x7e,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, exec_lo
-// CHECK: [0x7e,0x7e,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, exec_lo
+// CHECK: [0x7e,0x7e,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, exec_hi
-// CHECK: [0x7f,0x7e,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, exec_hi
+// CHECK: [0x7f,0x7e,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, 0
-// CHECK: [0x80,0x7e,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, 0
+// CHECK: [0x80,0x7e,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, -1
-// CHECK: [0xc1,0x7e,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, -1
+// CHECK: [0xc1,0x7e,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, 0.5
-// CHECK: [0xf0,0x7e,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, 0.5
+// CHECK: [0xf0,0x7e,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, -4.0
-// CHECK: [0xf7,0x7e,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, -4.0
+// CHECK: [0xf7,0x7e,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, 0xaf123456
-// CHECK: [0xff,0x7e,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_frexp_exp_i32_f32 v5, 0xaf123456
+// CHECK: [0xff,0x7e,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_frexp_exp_i32_f32 v0, 0x3f717273
-// CHECK: [0xff,0x7e,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_frexp_exp_i32_f32 v5, 0x3f717273
+// CHECK: [0xff,0x7e,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_frexp_exp_i32_f32 v0, v0
-// CHECK: [0x00,0x7f,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, v1
+// CHECK: [0x01,0x7f,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, v255
-// CHECK: [0xff,0x7f,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, v255
+// CHECK: [0xff,0x7f,0x0a,0x7e]
 
-v_frexp_exp_i32_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x7e,0xd3,0x00,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x7e,0xd3,0x01,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x7e,0xd3,0x00,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x7e,0xd3,0x01,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x7e,0xd3,0x67,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x7e,0xd3,0x67,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x7e,0xd3,0x68,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x7e,0xd3,0x68,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x7e,0xd3,0x69,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x7e,0xd3,0x69,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x7e,0xd3,0x6a,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x7e,0xd3,0x6a,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x7e,0xd3,0x6b,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x7e,0xd3,0x6b,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x7e,0xd3,0x6c,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x7e,0xd3,0x6c,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x7e,0xd3,0x6d,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x7e,0xd3,0x6d,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x7e,0xd3,0x6e,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x7e,0xd3,0x6e,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x7e,0xd3,0x6f,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x7e,0xd3,0x6f,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x7e,0xd3,0x7b,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x7e,0xd3,0x7b,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x7e,0xd3,0x7c,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x7e,0xd3,0x7c,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x7e,0xd3,0x7e,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x7e,0xd3,0x7e,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x7e,0xd3,0x7f,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x7e,0xd3,0x7f,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, 0
-// CHECK: [0x00,0x00,0x7e,0xd3,0x80,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, 0
+// CHECK: [0x05,0x00,0x7e,0xd3,0x80,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x7e,0xd3,0xf0,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x7e,0xd3,0xf0,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x7e,0xd3,0xfd,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x7e,0xd3,0xfd,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x7e,0xd3,0x00,0x01,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x7e,0xd3,0x01,0x01,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x7e,0xd3,0xff,0x01,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x7e,0xd3,0xff,0x01,0x00,0x00]
 
-v_frexp_mant_f32 v0, s0
-// CHECK: [0x00,0x80,0x00,0x7e]
+v_frexp_mant_f32 v5, s1
+// CHECK: [0x01,0x80,0x0a,0x7e]
 
-v_frexp_mant_f32 v255, s0
-// CHECK: [0x00,0x80,0xfe,0x7f]
+v_frexp_mant_f32 v255, s1
+// CHECK: [0x01,0x80,0xfe,0x7f]
 
-v_frexp_mant_f32 v0, s103
-// CHECK: [0x67,0x80,0x00,0x7e]
+v_frexp_mant_f32 v5, s103
+// CHECK: [0x67,0x80,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x80,0x00,0x7e]
+v_frexp_mant_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x80,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x80,0x00,0x7e]
+v_frexp_mant_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x80,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, vcc_lo
-// CHECK: [0x6a,0x80,0x00,0x7e]
+v_frexp_mant_f32 v5, vcc_lo
+// CHECK: [0x6a,0x80,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, vcc_hi
-// CHECK: [0x6b,0x80,0x00,0x7e]
+v_frexp_mant_f32 v5, vcc_hi
+// CHECK: [0x6b,0x80,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, tba_lo
-// CHECK: [0x6c,0x80,0x00,0x7e]
+v_frexp_mant_f32 v5, tba_lo
+// CHECK: [0x6c,0x80,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, tba_hi
-// CHECK: [0x6d,0x80,0x00,0x7e]
+v_frexp_mant_f32 v5, tba_hi
+// CHECK: [0x6d,0x80,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, tma_lo
-// CHECK: [0x6e,0x80,0x00,0x7e]
+v_frexp_mant_f32 v5, tma_lo
+// CHECK: [0x6e,0x80,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, tma_hi
-// CHECK: [0x6f,0x80,0x00,0x7e]
+v_frexp_mant_f32 v5, tma_hi
+// CHECK: [0x6f,0x80,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, ttmp11
-// CHECK: [0x7b,0x80,0x00,0x7e]
+v_frexp_mant_f32 v5, ttmp11
+// CHECK: [0x7b,0x80,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, m0
-// CHECK: [0x7c,0x80,0x00,0x7e]
+v_frexp_mant_f32 v5, m0
+// CHECK: [0x7c,0x80,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, exec_lo
-// CHECK: [0x7e,0x80,0x00,0x7e]
+v_frexp_mant_f32 v5, exec_lo
+// CHECK: [0x7e,0x80,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, exec_hi
-// CHECK: [0x7f,0x80,0x00,0x7e]
+v_frexp_mant_f32 v5, exec_hi
+// CHECK: [0x7f,0x80,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, 0
-// CHECK: [0x80,0x80,0x00,0x7e]
+v_frexp_mant_f32 v5, 0
+// CHECK: [0x80,0x80,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, -1
-// CHECK: [0xc1,0x80,0x00,0x7e]
+v_frexp_mant_f32 v5, -1
+// CHECK: [0xc1,0x80,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, 0.5
-// CHECK: [0xf0,0x80,0x00,0x7e]
+v_frexp_mant_f32 v5, 0.5
+// CHECK: [0xf0,0x80,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, -4.0
-// CHECK: [0xf7,0x80,0x00,0x7e]
+v_frexp_mant_f32 v5, -4.0
+// CHECK: [0xf7,0x80,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, 0xaf123456
-// CHECK: [0xff,0x80,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_frexp_mant_f32 v5, 0xaf123456
+// CHECK: [0xff,0x80,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_frexp_mant_f32 v0, 0x3f717273
-// CHECK: [0xff,0x80,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_frexp_mant_f32 v5, 0x3f717273
+// CHECK: [0xff,0x80,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_frexp_mant_f32 v0, v0
-// CHECK: [0x00,0x81,0x00,0x7e]
+v_frexp_mant_f32 v5, v1
+// CHECK: [0x01,0x81,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, v255
-// CHECK: [0xff,0x81,0x00,0x7e]
+v_frexp_mant_f32 v5, v255
+// CHECK: [0xff,0x81,0x0a,0x7e]
 
-v_frexp_mant_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x80,0xd3,0x00,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x80,0xd3,0x01,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x80,0xd3,0x00,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x80,0xd3,0x01,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x80,0xd3,0x67,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x80,0xd3,0x67,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x80,0xd3,0x68,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x80,0xd3,0x68,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x80,0xd3,0x69,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x80,0xd3,0x69,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x80,0xd3,0x6a,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x80,0xd3,0x6a,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x80,0xd3,0x6b,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x80,0xd3,0x6b,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x80,0xd3,0x6c,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x80,0xd3,0x6c,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x80,0xd3,0x6d,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x80,0xd3,0x6d,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x80,0xd3,0x6e,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x80,0xd3,0x6e,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x80,0xd3,0x6f,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x80,0xd3,0x6f,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x80,0xd3,0x7b,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x80,0xd3,0x7b,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x80,0xd3,0x7c,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x80,0xd3,0x7c,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x80,0xd3,0x7e,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x80,0xd3,0x7e,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x80,0xd3,0x7f,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x80,0xd3,0x7f,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, 0
-// CHECK: [0x00,0x00,0x80,0xd3,0x80,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, 0
+// CHECK: [0x05,0x00,0x80,0xd3,0x80,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x80,0xd3,0xf0,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x80,0xd3,0xf0,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x80,0xd3,0xfd,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x80,0xd3,0xfd,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x80,0xd3,0x00,0x01,0x00,0x00]
+v_frexp_mant_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x80,0xd3,0x01,0x01,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x80,0xd3,0xff,0x01,0x00,0x00]
+v_frexp_mant_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x80,0xd3,0xff,0x01,0x00,0x00]
 
 v_clrexcp
 // CHECK: [0x00,0x82,0x00,0x7e]
@@ -30390,32912 +30401,38229 @@ v_clrexcp
 v_clrexcp_e64
 // CHECK: [0x00,0x00,0x82,0xd3,0x00,0x00,0x00,0x00]
 
-v_movreld_b32 v0, m0
-// CHECK: [0x7c,0x84,0x00,0x7e]
+v_movreld_b32 v5, m0
+// CHECK: [0x7c,0x84,0x0a,0x7e]
 
 v_movreld_b32 v255, m0
 // CHECK: [0x7c,0x84,0xfe,0x7f]
 
-v_movreld_b32 v0, 0
-// CHECK: [0x80,0x84,0x00,0x7e]
+v_movreld_b32 v5, 0
+// CHECK: [0x80,0x84,0x0a,0x7e]
 
-v_movreld_b32 v0, -1
-// CHECK: [0xc1,0x84,0x00,0x7e]
+v_movreld_b32 v5, -1
+// CHECK: [0xc1,0x84,0x0a,0x7e]
 
-v_movreld_b32 v0, 0.5
-// CHECK: [0xf0,0x84,0x00,0x7e]
+v_movreld_b32 v5, 0.5
+// CHECK: [0xf0,0x84,0x0a,0x7e]
 
-v_movreld_b32 v0, -4.0
-// CHECK: [0xf7,0x84,0x00,0x7e]
+v_movreld_b32 v5, -4.0
+// CHECK: [0xf7,0x84,0x0a,0x7e]
 
-v_movreld_b32 v0, v0
-// CHECK: [0x00,0x85,0x00,0x7e]
+v_movreld_b32 v5, v1
+// CHECK: [0x01,0x85,0x0a,0x7e]
 
-v_movreld_b32 v0, v255
-// CHECK: [0xff,0x85,0x00,0x7e]
+v_movreld_b32 v5, v255
+// CHECK: [0xff,0x85,0x0a,0x7e]
 
-v_movreld_b32_e64 v0, m0
-// CHECK: [0x00,0x00,0x84,0xd3,0x7c,0x00,0x00,0x00]
+v_movreld_b32_e64 v5, m0
+// CHECK: [0x05,0x00,0x84,0xd3,0x7c,0x00,0x00,0x00]
 
 v_movreld_b32_e64 v255, m0
 // CHECK: [0xff,0x00,0x84,0xd3,0x7c,0x00,0x00,0x00]
 
-v_movreld_b32_e64 v0, 0
-// CHECK: [0x00,0x00,0x84,0xd3,0x80,0x00,0x00,0x00]
+v_movreld_b32_e64 v5, 0
+// CHECK: [0x05,0x00,0x84,0xd3,0x80,0x00,0x00,0x00]
+
+v_movreld_b32_e64 v5, -1
+// CHECK: [0x05,0x00,0x84,0xd3,0xc1,0x00,0x00,0x00]
+
+v_movreld_b32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x84,0xd3,0xf0,0x00,0x00,0x00]
+
+v_movreld_b32_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x84,0xd3,0xf7,0x00,0x00,0x00]
+
+v_movreld_b32_e64 v5, v1
+// CHECK: [0x05,0x00,0x84,0xd3,0x01,0x01,0x00,0x00]
+
+v_movreld_b32_e64 v5, v255
+// CHECK: [0x05,0x00,0x84,0xd3,0xff,0x01,0x00,0x00]
+
+v_movrels_b32 v5, v1
+// CHECK: [0x01,0x87,0x0a,0x7e]
+
+v_movrels_b32 v255, v1
+// CHECK: [0x01,0x87,0xfe,0x7f]
+
+v_movrels_b32 v5, v255
+// CHECK: [0xff,0x87,0x0a,0x7e]
+
+v_movrels_b32_e64 v5, v1
+// CHECK: [0x05,0x00,0x86,0xd3,0x01,0x01,0x00,0x00]
+
+v_movrels_b32_e64 v255, v1
+// CHECK: [0xff,0x00,0x86,0xd3,0x01,0x01,0x00,0x00]
+
+v_movrels_b32_e64 v5, v255
+// CHECK: [0x05,0x00,0x86,0xd3,0xff,0x01,0x00,0x00]
+
+v_movrelsd_b32 v5, v1
+// CHECK: [0x01,0x89,0x0a,0x7e]
+
+v_movrelsd_b32 v255, v1
+// CHECK: [0x01,0x89,0xfe,0x7f]
+
+v_movrelsd_b32 v5, v255
+// CHECK: [0xff,0x89,0x0a,0x7e]
+
+v_movrelsd_b32_e64 v5, v1
+// CHECK: [0x05,0x00,0x88,0xd3,0x01,0x01,0x00,0x00]
+
+v_movrelsd_b32_e64 v255, v1
+// CHECK: [0xff,0x00,0x88,0xd3,0x01,0x01,0x00,0x00]
+
+v_movrelsd_b32_e64 v5, v255
+// CHECK: [0x05,0x00,0x88,0xd3,0xff,0x01,0x00,0x00]
+
+v_log_legacy_f32 v5, s1
+// CHECK: [0x01,0x8a,0x0a,0x7e]
+
+v_log_legacy_f32 v255, s1
+// CHECK: [0x01,0x8a,0xfe,0x7f]
+
+v_log_legacy_f32 v5, s103
+// CHECK: [0x67,0x8a,0x0a,0x7e]
+
+v_log_legacy_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x8a,0x0a,0x7e]
+
+v_log_legacy_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x8a,0x0a,0x7e]
+
+v_log_legacy_f32 v5, vcc_lo
+// CHECK: [0x6a,0x8a,0x0a,0x7e]
+
+v_log_legacy_f32 v5, vcc_hi
+// CHECK: [0x6b,0x8a,0x0a,0x7e]
+
+v_log_legacy_f32 v5, tba_lo
+// CHECK: [0x6c,0x8a,0x0a,0x7e]
+
+v_log_legacy_f32 v5, tba_hi
+// CHECK: [0x6d,0x8a,0x0a,0x7e]
+
+v_log_legacy_f32 v5, tma_lo
+// CHECK: [0x6e,0x8a,0x0a,0x7e]
+
+v_log_legacy_f32 v5, tma_hi
+// CHECK: [0x6f,0x8a,0x0a,0x7e]
+
+v_log_legacy_f32 v5, ttmp11
+// CHECK: [0x7b,0x8a,0x0a,0x7e]
+
+v_log_legacy_f32 v5, m0
+// CHECK: [0x7c,0x8a,0x0a,0x7e]
+
+v_log_legacy_f32 v5, exec_lo
+// CHECK: [0x7e,0x8a,0x0a,0x7e]
+
+v_log_legacy_f32 v5, exec_hi
+// CHECK: [0x7f,0x8a,0x0a,0x7e]
+
+v_log_legacy_f32 v5, 0
+// CHECK: [0x80,0x8a,0x0a,0x7e]
+
+v_log_legacy_f32 v5, -1
+// CHECK: [0xc1,0x8a,0x0a,0x7e]
+
+v_log_legacy_f32 v5, 0.5
+// CHECK: [0xf0,0x8a,0x0a,0x7e]
+
+v_log_legacy_f32 v5, -4.0
+// CHECK: [0xf7,0x8a,0x0a,0x7e]
+
+v_log_legacy_f32 v5, 0xaf123456
+// CHECK: [0xff,0x8a,0x0a,0x7e,0x56,0x34,0x12,0xaf]
+
+v_log_legacy_f32 v5, 0x3f717273
+// CHECK: [0xff,0x8a,0x0a,0x7e,0x73,0x72,0x71,0x3f]
+
+v_log_legacy_f32 v5, v1
+// CHECK: [0x01,0x8b,0x0a,0x7e]
+
+v_log_legacy_f32 v5, v255
+// CHECK: [0xff,0x8b,0x0a,0x7e]
+
+v_log_legacy_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x8a,0xd3,0x01,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x8a,0xd3,0x01,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x8a,0xd3,0x67,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x8a,0xd3,0x68,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x8a,0xd3,0x69,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x8a,0xd3,0x6a,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x8a,0xd3,0x6b,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x8a,0xd3,0x6c,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x8a,0xd3,0x6d,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x8a,0xd3,0x6e,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x8a,0xd3,0x6f,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x8a,0xd3,0x7b,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x8a,0xd3,0x7c,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x8a,0xd3,0x7e,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x8a,0xd3,0x7f,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x8a,0xd3,0xfd,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x8a,0xd3,0x01,0x01,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x8a,0xd3,0xff,0x01,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x8a,0xd3,0x01,0x00,0x00,0x20]
+
+v_log_legacy_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x8a,0xd3,0x01,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x08,0x8a,0xd3,0x01,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x8a,0xd3,0x01,0x00,0x00,0x08]
+
+v_log_legacy_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x8a,0xd3,0x01,0x00,0x00,0x10]
+
+v_log_legacy_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x8a,0xd3,0x01,0x00,0x00,0x18]
+
+v_exp_legacy_f32 v5, s1
+// CHECK: [0x01,0x8c,0x0a,0x7e]
+
+v_exp_legacy_f32 v255, s1
+// CHECK: [0x01,0x8c,0xfe,0x7f]
+
+v_exp_legacy_f32 v5, s103
+// CHECK: [0x67,0x8c,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, flat_scratch_lo
+// CHECK: [0x68,0x8c,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, flat_scratch_hi
+// CHECK: [0x69,0x8c,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, vcc_lo
+// CHECK: [0x6a,0x8c,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, vcc_hi
+// CHECK: [0x6b,0x8c,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, tba_lo
+// CHECK: [0x6c,0x8c,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, tba_hi
+// CHECK: [0x6d,0x8c,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, tma_lo
+// CHECK: [0x6e,0x8c,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, tma_hi
+// CHECK: [0x6f,0x8c,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, ttmp11
+// CHECK: [0x7b,0x8c,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, m0
+// CHECK: [0x7c,0x8c,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, exec_lo
+// CHECK: [0x7e,0x8c,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, exec_hi
+// CHECK: [0x7f,0x8c,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, 0
+// CHECK: [0x80,0x8c,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, -1
+// CHECK: [0xc1,0x8c,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, 0.5
+// CHECK: [0xf0,0x8c,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, -4.0
+// CHECK: [0xf7,0x8c,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, 0xaf123456
+// CHECK: [0xff,0x8c,0x0a,0x7e,0x56,0x34,0x12,0xaf]
+
+v_exp_legacy_f32 v5, 0x3f717273
+// CHECK: [0xff,0x8c,0x0a,0x7e,0x73,0x72,0x71,0x3f]
+
+v_exp_legacy_f32 v5, v1
+// CHECK: [0x01,0x8d,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, v255
+// CHECK: [0xff,0x8d,0x0a,0x7e]
+
+v_exp_legacy_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x8c,0xd3,0x01,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x8c,0xd3,0x01,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, s103
+// CHECK: [0x05,0x00,0x8c,0xd3,0x67,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x8c,0xd3,0x68,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x8c,0xd3,0x69,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x8c,0xd3,0x6a,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x8c,0xd3,0x6b,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x8c,0xd3,0x6c,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x8c,0xd3,0x6d,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x8c,0xd3,0x6e,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x8c,0xd3,0x6f,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x8c,0xd3,0x7b,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x8c,0xd3,0x7c,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x8c,0xd3,0x7e,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x8c,0xd3,0x7f,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x8c,0xd3,0xfd,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x8c,0xd3,0x01,0x01,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x8c,0xd3,0xff,0x01,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x8c,0xd3,0x01,0x00,0x00,0x20]
+
+v_exp_legacy_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x8c,0xd3,0x01,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x08,0x8c,0xd3,0x01,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x8c,0xd3,0x01,0x00,0x00,0x08]
+
+v_exp_legacy_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x8c,0xd3,0x01,0x00,0x00,0x10]
+
+v_exp_legacy_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x8c,0xd3,0x01,0x00,0x00,0x18]
+
+v_cndmask_b32 v5, 0, v2, vcc
+// CHECK: [0x80,0x04,0x0a,0x00]
+
+v_cndmask_b32 v255, 0, v2, vcc
+// CHECK: [0x80,0x04,0xfe,0x01]
+
+v_cndmask_b32 v5, -1, v2, vcc
+// CHECK: [0xc1,0x04,0x0a,0x00]
+
+v_cndmask_b32 v5, 0.5, v2, vcc
+// CHECK: [0xf0,0x04,0x0a,0x00]
+
+v_cndmask_b32 v5, -4.0, v2, vcc
+// CHECK: [0xf7,0x04,0x0a,0x00]
+
+v_cndmask_b32 v5, v1, v2, vcc
+// CHECK: [0x01,0x05,0x0a,0x00]
+
+v_cndmask_b32 v5, v255, v2, vcc
+// CHECK: [0xff,0x05,0x0a,0x00]
+
+v_cndmask_b32 v5, 0, v255, vcc
+// CHECK: [0x80,0xfe,0x0b,0x00]
+
+v_cndmask_b32_e64 v5, 0, 0, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd2,0x80,0x00,0x19,0x00]
+
+v_cndmask_b32_e64 v255, 0, 0, s[6:7]
+// CHECK: [0xff,0x00,0x00,0xd2,0x80,0x00,0x19,0x00]
+
+v_cndmask_b32_e64 v5, -1, 0, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd2,0xc1,0x00,0x19,0x00]
+
+v_cndmask_b32_e64 v5, 0.5, 0, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd2,0xf0,0x00,0x19,0x00]
+
+v_cndmask_b32_e64 v5, -4.0, 0, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd2,0xf7,0x00,0x19,0x00]
+
+v_cndmask_b32_e64 v5, v1, 0, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd2,0x01,0x01,0x19,0x00]
+
+v_cndmask_b32_e64 v5, v255, 0, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd2,0xff,0x01,0x19,0x00]
+
+v_cndmask_b32_e64 v5, 0, -1, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd2,0x80,0x82,0x19,0x00]
+
+v_cndmask_b32_e64 v5, 0, 0.5, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd2,0x80,0xe0,0x19,0x00]
+
+v_cndmask_b32_e64 v5, 0, -4.0, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd2,0x80,0xee,0x19,0x00]
+
+v_cndmask_b32_e64 v5, 0, v2, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd2,0x80,0x04,0x1a,0x00]
+
+v_cndmask_b32_e64 v5, 0, v255, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd2,0x80,0xfe,0x1b,0x00]
+
+v_cndmask_b32_e64 v5, 0, 0, s[8:9]
+// CHECK: [0x05,0x00,0x00,0xd2,0x80,0x00,0x21,0x00]
+
+v_cndmask_b32_e64 v5, 0, 0, s[102:103]
+// CHECK: [0x05,0x00,0x00,0xd2,0x80,0x00,0x99,0x01]
+
+v_cndmask_b32_e64 v5, 0, 0, flat_scratch
+// CHECK: [0x05,0x00,0x00,0xd2,0x80,0x00,0xa1,0x01]
+
+v_cndmask_b32_e64 v5, 0, 0, vcc
+// CHECK: [0x05,0x00,0x00,0xd2,0x80,0x00,0xa9,0x01]
+
+v_cndmask_b32_e64 v5, 0, 0, tba
+// CHECK: [0x05,0x00,0x00,0xd2,0x80,0x00,0xb1,0x01]
+
+v_cndmask_b32_e64 v5, 0, 0, tma
+// CHECK: [0x05,0x00,0x00,0xd2,0x80,0x00,0xb9,0x01]
+
+v_cndmask_b32_e64 v5, 0, 0, ttmp[10:11]
+// CHECK: [0x05,0x00,0x00,0xd2,0x80,0x00,0xe9,0x01]
+
+v_readlane_b32 s5, v1, s2
+// CHECK: [0x01,0x05,0x0a,0x02]
+
+v_readlane_b32 s103, v1, s2
+// CHECK: [0x01,0x05,0xce,0x02]
+
+v_readlane_b32 tba_lo, v1, s2
+// CHECK: [0x01,0x05,0xd8,0x02]
+
+v_readlane_b32 tba_hi, v1, s2
+// CHECK: [0x01,0x05,0xda,0x02]
+
+v_readlane_b32 tma_lo, v1, s2
+// CHECK: [0x01,0x05,0xdc,0x02]
+
+v_readlane_b32 tma_hi, v1, s2
+// CHECK: [0x01,0x05,0xde,0x02]
+
+v_readlane_b32 ttmp11, v1, s2
+// CHECK: [0x01,0x05,0xf6,0x02]
+
+v_readlane_b32 s5, v255, s2
+// CHECK: [0xff,0x05,0x0a,0x02]
+
+v_readlane_b32 s5, v1, s103
+// CHECK: [0x01,0xcf,0x0a,0x02]
+
+v_readlane_b32 s5, v1, flat_scratch_lo
+// CHECK: [0x01,0xd1,0x0a,0x02]
+
+v_readlane_b32 s5, v1, flat_scratch_hi
+// CHECK: [0x01,0xd3,0x0a,0x02]
+
+v_readlane_b32 s5, v1, vcc_lo
+// CHECK: [0x01,0xd5,0x0a,0x02]
+
+v_readlane_b32 s5, v1, vcc_hi
+// CHECK: [0x01,0xd7,0x0a,0x02]
+
+v_readlane_b32 s5, v1, tba_lo
+// CHECK: [0x01,0xd9,0x0a,0x02]
+
+v_readlane_b32 s5, v1, tba_hi
+// CHECK: [0x01,0xdb,0x0a,0x02]
+
+v_readlane_b32 s5, v1, tma_lo
+// CHECK: [0x01,0xdd,0x0a,0x02]
+
+v_readlane_b32 s5, v1, tma_hi
+// CHECK: [0x01,0xdf,0x0a,0x02]
+
+v_readlane_b32 s5, v1, ttmp11
+// CHECK: [0x01,0xf7,0x0a,0x02]
+
+v_readlane_b32 s5, v1, m0
+// CHECK: [0x01,0xf9,0x0a,0x02]
+
+v_readlane_b32 s5, v1, 0
+// CHECK: [0x01,0x01,0x0b,0x02]
+
+v_writelane_b32 v5, s1, 0
+// CHECK: [0x01,0x00,0x0b,0x04]
+
+v_writelane_b32 v255, s1, 0
+// CHECK: [0x01,0x00,0xff,0x05]
+
+v_writelane_b32 v5, s103, 0
+// CHECK: [0x67,0x00,0x0b,0x04]
+
+v_writelane_b32 v5, flat_scratch_lo, 0
+// CHECK: [0x68,0x00,0x0b,0x04]
+
+v_writelane_b32 v5, flat_scratch_hi, 0
+// CHECK: [0x69,0x00,0x0b,0x04]
+
+v_writelane_b32 v5, vcc_lo, 0
+// CHECK: [0x6a,0x00,0x0b,0x04]
+
+v_writelane_b32 v5, vcc_hi, 0
+// CHECK: [0x6b,0x00,0x0b,0x04]
+
+v_writelane_b32 v5, tba_lo, 0
+// CHECK: [0x6c,0x00,0x0b,0x04]
+
+v_writelane_b32 v5, tba_hi, 0
+// CHECK: [0x6d,0x00,0x0b,0x04]
+
+v_writelane_b32 v5, tma_lo, 0
+// CHECK: [0x6e,0x00,0x0b,0x04]
+
+v_writelane_b32 v5, tma_hi, 0
+// CHECK: [0x6f,0x00,0x0b,0x04]
+
+v_writelane_b32 v5, ttmp11, 0
+// CHECK: [0x7b,0x00,0x0b,0x04]
+
+v_writelane_b32 v5, m0, 0
+// CHECK: [0x7c,0x00,0x0b,0x04]
+
+v_writelane_b32 v5, exec_lo, 0
+// CHECK: [0x7e,0x00,0x0b,0x04]
+
+v_writelane_b32 v5, exec_hi, 0
+// CHECK: [0x7f,0x00,0x0b,0x04]
+
+v_add_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x06]
+
+v_add_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x07]
+
+v_add_f32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x06]
+
+v_add_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x06]
+
+v_add_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x06]
+
+v_add_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x06]
+
+v_add_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x06]
+
+v_add_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x06]
+
+v_add_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x06]
+
+v_add_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x06]
+
+v_add_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x06]
+
+v_add_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x06]
+
+v_add_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x06]
+
+v_add_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x06]
+
+v_add_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x06]
+
+v_add_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x06]
+
+v_add_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x06]
+
+v_add_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x06]
+
+v_add_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x06]
+
+v_add_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x06,0x56,0x34,0x12,0xaf]
+
+v_add_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x06,0x73,0x72,0x71,0x3f]
+
+v_add_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x06]
+
+v_add_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x06]
+
+v_add_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x06]
+
+v_add_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0x05,0x00,0x00]
+
+v_add_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x06,0xd2,0x01,0x05,0x00,0x00]
+
+v_add_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x06,0xd2,0xff,0x05,0x00,0x00]
+
+v_add_f32_e64 v5, v1, s103
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0xcf,0x00,0x00]
+
+v_add_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0xd1,0x00,0x00]
+
+v_add_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0xd3,0x00,0x00]
+
+v_add_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0xd5,0x00,0x00]
+
+v_add_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0xd7,0x00,0x00]
+
+v_add_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0xd9,0x00,0x00]
+
+v_add_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0xdb,0x00,0x00]
+
+v_add_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0xdd,0x00,0x00]
+
+v_add_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0xdf,0x00,0x00]
+
+v_add_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0xf7,0x00,0x00]
+
+v_add_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0xf9,0x00,0x00]
+
+v_add_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0xfd,0x00,0x00]
+
+v_add_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0xff,0x00,0x00]
+
+v_add_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0xfb,0x01,0x00]
+
+v_add_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0x05,0x02,0x00]
+
+v_add_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0xff,0x03,0x00]
+
+v_add_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0x05,0x00,0x20]
+
+v_add_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0x05,0x00,0x40]
+
+v_add_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0x05,0x00,0x60]
+
+v_add_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x06,0xd2,0x01,0x05,0x00,0x00]
+
+v_add_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x06,0xd2,0x01,0x05,0x00,0x00]
+
+v_add_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x06,0xd2,0x01,0x05,0x00,0x00]
+
+v_add_f32_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x08,0x06,0xd2,0x01,0x05,0x00,0x00]
+
+v_add_f32_e64 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0x05,0x00,0x08]
+
+v_add_f32_e64 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0x05,0x00,0x10]
+
+v_add_f32_e64 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x06,0xd2,0x01,0x05,0x00,0x18]
+
+v_sub_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x08]
+
+v_sub_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x09]
+
+v_sub_f32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x08]
+
+v_sub_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x08]
+
+v_sub_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x08]
+
+v_sub_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x08]
+
+v_sub_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x08]
+
+v_sub_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x08]
+
+v_sub_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x08]
+
+v_sub_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x08]
+
+v_sub_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x08]
+
+v_sub_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x08]
+
+v_sub_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x08]
+
+v_sub_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x08]
+
+v_sub_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x08]
+
+v_sub_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x08]
+
+v_sub_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x08]
+
+v_sub_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x08]
+
+v_sub_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x08]
+
+v_sub_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x08,0x56,0x34,0x12,0xaf]
+
+v_sub_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x08,0x73,0x72,0x71,0x3f]
+
+v_sub_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x08]
+
+v_sub_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x08]
+
+v_sub_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x08]
+
+v_sub_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0x05,0x00,0x00]
+
+v_sub_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x08,0xd2,0x01,0x05,0x00,0x00]
+
+v_sub_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x08,0xd2,0xff,0x05,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, s103
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0xcf,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0xd1,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0xd3,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0xd5,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0xd7,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0xd9,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0xdb,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0xdd,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0xdf,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0xf7,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0xf9,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0xfd,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0xff,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0xfb,0x01,0x00]
+
+v_sub_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0x05,0x02,0x00]
+
+v_sub_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0xff,0x03,0x00]
+
+v_sub_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0x05,0x00,0x20]
+
+v_sub_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0x05,0x00,0x40]
+
+v_sub_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0x05,0x00,0x60]
+
+v_sub_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x08,0xd2,0x01,0x05,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x08,0xd2,0x01,0x05,0x00,0x00]
+
+v_sub_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x08,0xd2,0x01,0x05,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x08,0x08,0xd2,0x01,0x05,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0x05,0x00,0x08]
+
+v_sub_f32_e64 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0x05,0x00,0x10]
+
+v_sub_f32_e64 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x08,0xd2,0x01,0x05,0x00,0x18]
+
+v_subrev_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x0a]
+
+v_subrev_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x0b]
+
+v_subrev_f32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x0a]
+
+v_subrev_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x0a]
+
+v_subrev_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x0a]
+
+v_subrev_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x0a]
+
+v_subrev_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x0a]
+
+v_subrev_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x0a]
+
+v_subrev_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x0a]
+
+v_subrev_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x0a]
+
+v_subrev_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x0a]
+
+v_subrev_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x0a]
+
+v_subrev_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x0a]
+
+v_subrev_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x0a]
+
+v_subrev_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x0a]
+
+v_subrev_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x0a]
+
+v_subrev_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x0a]
+
+v_subrev_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x0a]
+
+v_subrev_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x0a]
+
+v_subrev_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x0a,0x56,0x34,0x12,0xaf]
+
+v_subrev_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x0a,0x73,0x72,0x71,0x3f]
+
+v_subrev_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x0a]
+
+v_subrev_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x0a]
+
+v_subrev_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x0a]
+
+v_subrev_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0x05,0x00,0x00]
+
+v_subrev_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x0a,0xd2,0x01,0x05,0x00,0x00]
+
+v_subrev_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x0a,0xd2,0xff,0x05,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, s103
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0xcf,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0xd1,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0xd3,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0xd5,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0xd7,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0xd9,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0xdb,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0xdd,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0xdf,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0xf7,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0xf9,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0xfd,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0xff,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0xfb,0x01,0x00]
+
+v_subrev_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0x05,0x02,0x00]
+
+v_subrev_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0xff,0x03,0x00]
+
+v_subrev_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0x05,0x00,0x20]
+
+v_subrev_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0x05,0x00,0x40]
+
+v_subrev_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0x05,0x00,0x60]
+
+v_subrev_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x0a,0xd2,0x01,0x05,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x0a,0xd2,0x01,0x05,0x00,0x00]
+
+v_subrev_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x0a,0xd2,0x01,0x05,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x08,0x0a,0xd2,0x01,0x05,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0x05,0x00,0x08]
+
+v_subrev_f32_e64 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0x05,0x00,0x10]
+
+v_subrev_f32_e64 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x0a,0xd2,0x01,0x05,0x00,0x18]
+
+v_mac_legacy_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x0c]
+
+v_mac_legacy_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x0d]
+
+v_mac_legacy_f32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x0c]
+
+v_mac_legacy_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x0c]
+
+v_mac_legacy_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x0c]
+
+v_mac_legacy_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x0c]
+
+v_mac_legacy_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x0c]
+
+v_mac_legacy_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x0c]
+
+v_mac_legacy_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x0c]
+
+v_mac_legacy_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x0c]
+
+v_mac_legacy_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x0c]
+
+v_mac_legacy_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x0c]
+
+v_mac_legacy_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x0c]
+
+v_mac_legacy_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x0c]
+
+v_mac_legacy_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x0c]
+
+v_mac_legacy_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x0c]
+
+v_mac_legacy_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x0c]
+
+v_mac_legacy_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x0c]
+
+v_mac_legacy_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x0c]
+
+v_mac_legacy_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x0c,0x56,0x34,0x12,0xaf]
+
+v_mac_legacy_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x0c,0x73,0x72,0x71,0x3f]
+
+v_mac_legacy_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x0c]
+
+v_mac_legacy_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x0c]
+
+v_mac_legacy_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x0c]
+
+v_mac_legacy_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0x05,0x00,0x00]
+
+v_mac_legacy_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x0c,0xd2,0x01,0x05,0x00,0x00]
+
+v_mac_legacy_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x0c,0xd2,0xff,0x05,0x00,0x00]
+
+v_mac_legacy_f32_e64 v5, v1, s103
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0xcf,0x00,0x00]
+
+v_mac_legacy_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0xd1,0x00,0x00]
+
+v_mac_legacy_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0xd3,0x00,0x00]
+
+v_mac_legacy_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0xd5,0x00,0x00]
+
+v_mac_legacy_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0xd7,0x00,0x00]
+
+v_mac_legacy_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0xd9,0x00,0x00]
+
+v_mac_legacy_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0xdb,0x00,0x00]
+
+v_mac_legacy_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0xdd,0x00,0x00]
+
+v_mac_legacy_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0xdf,0x00,0x00]
+
+v_mac_legacy_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0xf7,0x00,0x00]
+
+v_mac_legacy_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0xf9,0x00,0x00]
+
+v_mac_legacy_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0xfd,0x00,0x00]
+
+v_mac_legacy_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0xff,0x00,0x00]
+
+v_mac_legacy_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0xfb,0x01,0x00]
+
+v_mac_legacy_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0x05,0x02,0x00]
+
+v_mac_legacy_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0xff,0x03,0x00]
+
+v_mac_legacy_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0x05,0x00,0x20]
+
+v_mac_legacy_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0x05,0x00,0x40]
+
+v_mac_legacy_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0x05,0x00,0x60]
+
+v_mac_legacy_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x0c,0xd2,0x01,0x05,0x00,0x00]
+
+v_mac_legacy_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x0c,0xd2,0x01,0x05,0x00,0x00]
+
+v_mac_legacy_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x0c,0xd2,0x01,0x05,0x00,0x00]
+
+v_mac_legacy_f32_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x08,0x0c,0xd2,0x01,0x05,0x00,0x00]
+
+v_mac_legacy_f32_e64 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0x05,0x00,0x08]
+
+v_mac_legacy_f32_e64 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0x05,0x00,0x10]
+
+v_mac_legacy_f32_e64 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x0c,0xd2,0x01,0x05,0x00,0x18]
+
+v_mul_legacy_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x0e]
+
+v_mul_legacy_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x0f]
+
+v_mul_legacy_f32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x0e]
+
+v_mul_legacy_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x0e]
+
+v_mul_legacy_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x0e]
+
+v_mul_legacy_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x0e]
+
+v_mul_legacy_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x0e]
+
+v_mul_legacy_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x0e]
+
+v_mul_legacy_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x0e]
+
+v_mul_legacy_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x0e]
+
+v_mul_legacy_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x0e]
+
+v_mul_legacy_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x0e]
+
+v_mul_legacy_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x0e]
+
+v_mul_legacy_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x0e]
+
+v_mul_legacy_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x0e]
+
+v_mul_legacy_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x0e]
+
+v_mul_legacy_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x0e]
+
+v_mul_legacy_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x0e]
+
+v_mul_legacy_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x0e]
+
+v_mul_legacy_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x0e,0x56,0x34,0x12,0xaf]
+
+v_mul_legacy_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x0e,0x73,0x72,0x71,0x3f]
+
+v_mul_legacy_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x0e]
+
+v_mul_legacy_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x0e]
+
+v_mul_legacy_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x0e]
+
+v_mul_legacy_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0x05,0x00,0x00]
+
+v_mul_legacy_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x0e,0xd2,0x01,0x05,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x0e,0xd2,0xff,0x05,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, s103
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0xcf,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0xd1,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0xd3,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0xd5,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0xd7,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0xd9,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0xdb,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0xdd,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0xdf,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0xf7,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0xf9,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0xfd,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0xff,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0xfb,0x01,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0x05,0x02,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0xff,0x03,0x00]
+
+v_mul_legacy_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0x05,0x00,0x20]
+
+v_mul_legacy_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0x05,0x00,0x40]
+
+v_mul_legacy_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0x05,0x00,0x60]
+
+v_mul_legacy_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x0e,0xd2,0x01,0x05,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x0e,0xd2,0x01,0x05,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x0e,0xd2,0x01,0x05,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x08,0x0e,0xd2,0x01,0x05,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0x05,0x00,0x08]
+
+v_mul_legacy_f32_e64 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0x05,0x00,0x10]
+
+v_mul_legacy_f32_e64 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x0e,0xd2,0x01,0x05,0x00,0x18]
+
+v_mul_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x10]
+
+v_mul_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x11]
+
+v_mul_f32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x10]
+
+v_mul_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x10]
+
+v_mul_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x10]
+
+v_mul_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x10]
+
+v_mul_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x10]
+
+v_mul_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x10]
+
+v_mul_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x10]
+
+v_mul_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x10]
+
+v_mul_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x10]
+
+v_mul_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x10]
+
+v_mul_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x10]
+
+v_mul_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x10]
+
+v_mul_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x10]
+
+v_mul_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x10]
+
+v_mul_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x10]
+
+v_mul_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x10]
+
+v_mul_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x10]
+
+v_mul_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x10,0x56,0x34,0x12,0xaf]
+
+v_mul_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x10,0x73,0x72,0x71,0x3f]
+
+v_mul_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x10]
+
+v_mul_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x10]
+
+v_mul_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x10]
+
+v_mul_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0x05,0x00,0x00]
+
+v_mul_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x10,0xd2,0x01,0x05,0x00,0x00]
+
+v_mul_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x10,0xd2,0xff,0x05,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, s103
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0xcf,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0xd1,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0xd3,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0xd5,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0xd7,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0xd9,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0xdb,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0xdd,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0xdf,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0xf7,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0xf9,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0xfd,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0xff,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0xfb,0x01,0x00]
+
+v_mul_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0x05,0x02,0x00]
+
+v_mul_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0xff,0x03,0x00]
+
+v_mul_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0x05,0x00,0x20]
+
+v_mul_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0x05,0x00,0x40]
+
+v_mul_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0x05,0x00,0x60]
+
+v_mul_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x10,0xd2,0x01,0x05,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x10,0xd2,0x01,0x05,0x00,0x00]
+
+v_mul_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x10,0xd2,0x01,0x05,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x08,0x10,0xd2,0x01,0x05,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0x05,0x00,0x08]
+
+v_mul_f32_e64 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0x05,0x00,0x10]
+
+v_mul_f32_e64 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x10,0xd2,0x01,0x05,0x00,0x18]
+
+v_mul_i32_i24 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x12]
+
+v_mul_i32_i24 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x13]
+
+v_mul_i32_i24 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x12]
+
+v_mul_i32_i24 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x12]
+
+v_mul_i32_i24 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x12]
+
+v_mul_i32_i24 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x12]
+
+v_mul_i32_i24 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x12]
+
+v_mul_i32_i24 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x12]
+
+v_mul_i32_i24 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x12]
+
+v_mul_i32_i24 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x12]
+
+v_mul_i32_i24 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x12]
+
+v_mul_i32_i24 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x12]
+
+v_mul_i32_i24 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x12]
+
+v_mul_i32_i24 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x12]
+
+v_mul_i32_i24 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x12]
+
+v_mul_i32_i24 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x12]
+
+v_mul_i32_i24 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x12]
+
+v_mul_i32_i24 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x12]
+
+v_mul_i32_i24 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x12]
+
+v_mul_i32_i24 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x12,0x56,0x34,0x12,0xaf]
+
+v_mul_i32_i24 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x12,0x73,0x72,0x71,0x3f]
+
+v_mul_i32_i24 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x12]
+
+v_mul_i32_i24 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x12]
+
+v_mul_i32_i24 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x12]
+
+v_mul_i32_i24_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0x04,0x00,0x00]
+
+v_mul_i32_i24_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x12,0xd2,0x80,0x04,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x12,0xd2,0xc1,0x04,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x12,0xd2,0xf0,0x04,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x12,0xd2,0xf7,0x04,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x12,0xd2,0x01,0x05,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x12,0xd2,0xff,0x05,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0xce,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0xd0,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0xd2,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0xd4,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0xd6,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0xd8,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0xda,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0xdc,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0xde,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0xf6,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0xf8,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0xfc,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0xfe,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0x00,0x01,0x00]
+
+v_mul_i32_i24_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0x82,0x01,0x00]
+
+v_mul_i32_i24_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0xe0,0x01,0x00]
+
+v_mul_i32_i24_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0xee,0x01,0x00]
+
+v_mul_i32_i24_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0x04,0x02,0x00]
+
+v_mul_i32_i24_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x12,0xd2,0x80,0xfe,0x03,0x00]
+
+v_mul_hi_i32_i24 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x14]
+
+v_mul_hi_i32_i24 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x15]
+
+v_mul_hi_i32_i24 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x14]
+
+v_mul_hi_i32_i24 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x14]
+
+v_mul_hi_i32_i24 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x14]
+
+v_mul_hi_i32_i24 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x14]
+
+v_mul_hi_i32_i24 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x14]
+
+v_mul_hi_i32_i24 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x14]
+
+v_mul_hi_i32_i24 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x14]
+
+v_mul_hi_i32_i24 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x14]
+
+v_mul_hi_i32_i24 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x14]
+
+v_mul_hi_i32_i24 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x14]
+
+v_mul_hi_i32_i24 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x14]
+
+v_mul_hi_i32_i24 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x14]
+
+v_mul_hi_i32_i24 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x14]
+
+v_mul_hi_i32_i24 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x14]
+
+v_mul_hi_i32_i24 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x14]
+
+v_mul_hi_i32_i24 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x14]
+
+v_mul_hi_i32_i24 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x14]
+
+v_mul_hi_i32_i24 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x14,0x56,0x34,0x12,0xaf]
+
+v_mul_hi_i32_i24 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x14,0x73,0x72,0x71,0x3f]
+
+v_mul_hi_i32_i24 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x14]
+
+v_mul_hi_i32_i24 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x14]
+
+v_mul_hi_i32_i24 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x14]
+
+v_mul_hi_i32_i24_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0x04,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x14,0xd2,0x80,0x04,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x14,0xd2,0xc1,0x04,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x14,0xd2,0xf0,0x04,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x14,0xd2,0xf7,0x04,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x14,0xd2,0x01,0x05,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x14,0xd2,0xff,0x05,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0xce,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0xd0,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0xd2,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0xd4,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0xd6,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0xd8,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0xda,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0xdc,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0xde,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0xf6,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0xf8,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0xfc,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0xfe,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0x00,0x01,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0x82,0x01,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0xe0,0x01,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0xee,0x01,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0x04,0x02,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x14,0xd2,0x80,0xfe,0x03,0x00]
+
+v_mul_u32_u24 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x16]
+
+v_mul_u32_u24 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x17]
+
+v_mul_u32_u24 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x16]
+
+v_mul_u32_u24 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x16]
+
+v_mul_u32_u24 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x16]
+
+v_mul_u32_u24 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x16]
+
+v_mul_u32_u24 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x16]
+
+v_mul_u32_u24 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x16]
+
+v_mul_u32_u24 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x16]
+
+v_mul_u32_u24 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x16]
+
+v_mul_u32_u24 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x16]
+
+v_mul_u32_u24 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x16]
+
+v_mul_u32_u24 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x16]
+
+v_mul_u32_u24 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x16]
+
+v_mul_u32_u24 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x16]
+
+v_mul_u32_u24 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x16]
+
+v_mul_u32_u24 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x16]
+
+v_mul_u32_u24 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x16]
+
+v_mul_u32_u24 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x16]
+
+v_mul_u32_u24 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x16,0x56,0x34,0x12,0xaf]
+
+v_mul_u32_u24 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x16,0x73,0x72,0x71,0x3f]
+
+v_mul_u32_u24 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x16]
+
+v_mul_u32_u24 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x16]
+
+v_mul_u32_u24 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x16]
+
+v_mul_u32_u24_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0x04,0x00,0x00]
+
+v_mul_u32_u24_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x16,0xd2,0x80,0x04,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x16,0xd2,0xc1,0x04,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x16,0xd2,0xf0,0x04,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x16,0xd2,0xf7,0x04,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x16,0xd2,0x01,0x05,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x16,0xd2,0xff,0x05,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0xce,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0xd0,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0xd2,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0xd4,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0xd6,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0xd8,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0xda,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0xdc,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0xde,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0xf6,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0xf8,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0xfc,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0xfe,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0x00,0x01,0x00]
+
+v_mul_u32_u24_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0x82,0x01,0x00]
+
+v_mul_u32_u24_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0xe0,0x01,0x00]
+
+v_mul_u32_u24_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0xee,0x01,0x00]
+
+v_mul_u32_u24_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0x04,0x02,0x00]
+
+v_mul_u32_u24_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x16,0xd2,0x80,0xfe,0x03,0x00]
+
+v_mul_hi_u32_u24 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x18]
+
+v_mul_hi_u32_u24 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x19]
+
+v_mul_hi_u32_u24 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x18]
+
+v_mul_hi_u32_u24 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x18]
+
+v_mul_hi_u32_u24 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x18]
+
+v_mul_hi_u32_u24 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x18]
+
+v_mul_hi_u32_u24 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x18]
+
+v_mul_hi_u32_u24 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x18]
+
+v_mul_hi_u32_u24 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x18]
+
+v_mul_hi_u32_u24 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x18]
+
+v_mul_hi_u32_u24 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x18]
+
+v_mul_hi_u32_u24 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x18]
+
+v_mul_hi_u32_u24 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x18]
+
+v_mul_hi_u32_u24 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x18]
+
+v_mul_hi_u32_u24 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x18]
+
+v_mul_hi_u32_u24 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x18]
+
+v_mul_hi_u32_u24 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x18]
+
+v_mul_hi_u32_u24 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x18]
+
+v_mul_hi_u32_u24 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x18]
+
+v_mul_hi_u32_u24 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x18,0x56,0x34,0x12,0xaf]
+
+v_mul_hi_u32_u24 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x18,0x73,0x72,0x71,0x3f]
+
+v_mul_hi_u32_u24 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x18]
+
+v_mul_hi_u32_u24 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x18]
+
+v_mul_hi_u32_u24 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x18]
+
+v_mul_hi_u32_u24_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0x04,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x18,0xd2,0x80,0x04,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x18,0xd2,0xc1,0x04,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x18,0xd2,0xf0,0x04,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x18,0xd2,0xf7,0x04,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x18,0xd2,0x01,0x05,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x18,0xd2,0xff,0x05,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0xce,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0xd0,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0xd2,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0xd4,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0xd6,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0xd8,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0xda,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0xdc,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0xde,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0xf6,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0xf8,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0xfc,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0xfe,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0x00,0x01,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0x82,0x01,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0xe0,0x01,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0xee,0x01,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0x04,0x02,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x18,0xd2,0x80,0xfe,0x03,0x00]
+
+v_min_legacy_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x1a]
+
+v_min_legacy_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x1b]
+
+v_min_legacy_f32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x1a]
+
+v_min_legacy_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x1a]
+
+v_min_legacy_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x1a]
+
+v_min_legacy_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x1a]
+
+v_min_legacy_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x1a]
+
+v_min_legacy_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x1a]
+
+v_min_legacy_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x1a]
+
+v_min_legacy_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x1a]
+
+v_min_legacy_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x1a]
+
+v_min_legacy_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x1a]
+
+v_min_legacy_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x1a]
+
+v_min_legacy_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x1a]
+
+v_min_legacy_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x1a]
+
+v_min_legacy_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x1a]
+
+v_min_legacy_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x1a]
+
+v_min_legacy_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x1a]
+
+v_min_legacy_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x1a]
+
+v_min_legacy_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x1a,0x56,0x34,0x12,0xaf]
+
+v_min_legacy_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x1a,0x73,0x72,0x71,0x3f]
+
+v_min_legacy_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x1a]
+
+v_min_legacy_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x1a]
+
+v_min_legacy_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x1a]
+
+v_min_legacy_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0x05,0x00,0x00]
+
+v_min_legacy_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x1a,0xd2,0x01,0x05,0x00,0x00]
+
+v_min_legacy_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x1a,0xd2,0xff,0x05,0x00,0x00]
+
+v_min_legacy_f32_e64 v5, v1, s103
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0xcf,0x00,0x00]
+
+v_min_legacy_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0xd1,0x00,0x00]
+
+v_min_legacy_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0xd3,0x00,0x00]
+
+v_min_legacy_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0xd5,0x00,0x00]
+
+v_min_legacy_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0xd7,0x00,0x00]
+
+v_min_legacy_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0xd9,0x00,0x00]
+
+v_min_legacy_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0xdb,0x00,0x00]
+
+v_min_legacy_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0xdd,0x00,0x00]
+
+v_min_legacy_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0xdf,0x00,0x00]
+
+v_min_legacy_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0xf7,0x00,0x00]
+
+v_min_legacy_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0xf9,0x00,0x00]
+
+v_min_legacy_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0xfd,0x00,0x00]
+
+v_min_legacy_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0xff,0x00,0x00]
+
+v_min_legacy_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0xfb,0x01,0x00]
+
+v_min_legacy_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0x05,0x02,0x00]
+
+v_min_legacy_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0xff,0x03,0x00]
+
+v_min_legacy_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0x05,0x00,0x20]
+
+v_min_legacy_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0x05,0x00,0x40]
+
+v_min_legacy_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0x05,0x00,0x60]
+
+v_min_legacy_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x1a,0xd2,0x01,0x05,0x00,0x00]
+
+v_min_legacy_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x1a,0xd2,0x01,0x05,0x00,0x00]
+
+v_min_legacy_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x1a,0xd2,0x01,0x05,0x00,0x00]
+
+v_min_legacy_f32_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x08,0x1a,0xd2,0x01,0x05,0x00,0x00]
+
+v_min_legacy_f32_e64 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0x05,0x00,0x08]
+
+v_min_legacy_f32_e64 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0x05,0x00,0x10]
+
+v_min_legacy_f32_e64 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x1a,0xd2,0x01,0x05,0x00,0x18]
+
+v_max_legacy_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x1c]
+
+v_max_legacy_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x1d]
+
+v_max_legacy_f32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x1c]
+
+v_max_legacy_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x1c]
+
+v_max_legacy_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x1c]
+
+v_max_legacy_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x1c]
+
+v_max_legacy_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x1c]
+
+v_max_legacy_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x1c]
+
+v_max_legacy_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x1c]
+
+v_max_legacy_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x1c]
+
+v_max_legacy_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x1c]
+
+v_max_legacy_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x1c]
+
+v_max_legacy_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x1c]
+
+v_max_legacy_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x1c]
+
+v_max_legacy_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x1c]
+
+v_max_legacy_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x1c]
+
+v_max_legacy_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x1c]
+
+v_max_legacy_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x1c]
+
+v_max_legacy_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x1c]
+
+v_max_legacy_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x1c,0x56,0x34,0x12,0xaf]
+
+v_max_legacy_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x1c,0x73,0x72,0x71,0x3f]
+
+v_max_legacy_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x1c]
+
+v_max_legacy_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x1c]
+
+v_max_legacy_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x1c]
+
+v_max_legacy_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0x05,0x00,0x00]
+
+v_max_legacy_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x1c,0xd2,0x01,0x05,0x00,0x00]
+
+v_max_legacy_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x1c,0xd2,0xff,0x05,0x00,0x00]
+
+v_max_legacy_f32_e64 v5, v1, s103
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0xcf,0x00,0x00]
+
+v_max_legacy_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0xd1,0x00,0x00]
+
+v_max_legacy_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0xd3,0x00,0x00]
+
+v_max_legacy_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0xd5,0x00,0x00]
+
+v_max_legacy_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0xd7,0x00,0x00]
+
+v_max_legacy_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0xd9,0x00,0x00]
+
+v_max_legacy_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0xdb,0x00,0x00]
+
+v_max_legacy_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0xdd,0x00,0x00]
+
+v_max_legacy_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0xdf,0x00,0x00]
+
+v_max_legacy_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0xf7,0x00,0x00]
+
+v_max_legacy_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0xf9,0x00,0x00]
+
+v_max_legacy_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0xfd,0x00,0x00]
+
+v_max_legacy_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0xff,0x00,0x00]
+
+v_max_legacy_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0xfb,0x01,0x00]
+
+v_max_legacy_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0x05,0x02,0x00]
+
+v_max_legacy_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0xff,0x03,0x00]
+
+v_max_legacy_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0x05,0x00,0x20]
+
+v_max_legacy_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0x05,0x00,0x40]
+
+v_max_legacy_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0x05,0x00,0x60]
+
+v_max_legacy_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x1c,0xd2,0x01,0x05,0x00,0x00]
+
+v_max_legacy_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x1c,0xd2,0x01,0x05,0x00,0x00]
+
+v_max_legacy_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x1c,0xd2,0x01,0x05,0x00,0x00]
+
+v_max_legacy_f32_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x08,0x1c,0xd2,0x01,0x05,0x00,0x00]
+
+v_max_legacy_f32_e64 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0x05,0x00,0x08]
+
+v_max_legacy_f32_e64 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0x05,0x00,0x10]
+
+v_max_legacy_f32_e64 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x1c,0xd2,0x01,0x05,0x00,0x18]
+
+v_min_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x1e]
+
+v_min_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x1f]
+
+v_min_f32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x1e]
+
+v_min_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x1e]
+
+v_min_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x1e]
+
+v_min_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x1e]
+
+v_min_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x1e]
+
+v_min_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x1e]
+
+v_min_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x1e]
+
+v_min_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x1e]
+
+v_min_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x1e]
+
+v_min_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x1e]
+
+v_min_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x1e]
+
+v_min_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x1e]
+
+v_min_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x1e]
+
+v_min_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x1e]
+
+v_min_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x1e]
+
+v_min_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x1e]
+
+v_min_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x1e]
+
+v_min_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x1e,0x56,0x34,0x12,0xaf]
+
+v_min_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x1e,0x73,0x72,0x71,0x3f]
+
+v_min_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x1e]
+
+v_min_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x1e]
+
+v_min_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x1e]
+
+v_min_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0x05,0x00,0x00]
+
+v_min_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x1e,0xd2,0x01,0x05,0x00,0x00]
+
+v_min_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x1e,0xd2,0xff,0x05,0x00,0x00]
+
+v_min_f32_e64 v5, v1, s103
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0xcf,0x00,0x00]
+
+v_min_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0xd1,0x00,0x00]
+
+v_min_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0xd3,0x00,0x00]
+
+v_min_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0xd5,0x00,0x00]
+
+v_min_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0xd7,0x00,0x00]
+
+v_min_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0xd9,0x00,0x00]
+
+v_min_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0xdb,0x00,0x00]
+
+v_min_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0xdd,0x00,0x00]
+
+v_min_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0xdf,0x00,0x00]
+
+v_min_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0xf7,0x00,0x00]
+
+v_min_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0xf9,0x00,0x00]
+
+v_min_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0xfd,0x00,0x00]
+
+v_min_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0xff,0x00,0x00]
+
+v_min_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0xfb,0x01,0x00]
+
+v_min_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0x05,0x02,0x00]
+
+v_min_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0xff,0x03,0x00]
+
+v_min_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0x05,0x00,0x20]
+
+v_min_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0x05,0x00,0x40]
+
+v_min_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0x05,0x00,0x60]
+
+v_min_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x1e,0xd2,0x01,0x05,0x00,0x00]
+
+v_min_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x1e,0xd2,0x01,0x05,0x00,0x00]
+
+v_min_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x1e,0xd2,0x01,0x05,0x00,0x00]
+
+v_min_f32_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x08,0x1e,0xd2,0x01,0x05,0x00,0x00]
+
+v_min_f32_e64 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0x05,0x00,0x08]
+
+v_min_f32_e64 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0x05,0x00,0x10]
+
+v_min_f32_e64 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x1e,0xd2,0x01,0x05,0x00,0x18]
+
+v_max_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x20]
+
+v_max_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x21]
+
+v_max_f32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x20]
+
+v_max_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x20]
+
+v_max_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x20]
+
+v_max_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x20]
+
+v_max_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x20]
+
+v_max_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x20]
+
+v_max_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x20]
+
+v_max_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x20]
+
+v_max_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x20]
+
+v_max_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x20]
+
+v_max_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x20]
+
+v_max_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x20]
+
+v_max_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x20]
+
+v_max_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x20]
+
+v_max_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x20]
+
+v_max_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x20]
+
+v_max_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x20]
+
+v_max_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x20,0x56,0x34,0x12,0xaf]
+
+v_max_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x20,0x73,0x72,0x71,0x3f]
+
+v_max_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x20]
+
+v_max_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x20]
+
+v_max_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x20]
+
+v_max_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0x05,0x00,0x00]
+
+v_max_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x20,0xd2,0x01,0x05,0x00,0x00]
+
+v_max_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x20,0xd2,0xff,0x05,0x00,0x00]
+
+v_max_f32_e64 v5, v1, s103
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0xcf,0x00,0x00]
+
+v_max_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0xd1,0x00,0x00]
+
+v_max_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0xd3,0x00,0x00]
+
+v_max_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0xd5,0x00,0x00]
+
+v_max_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0xd7,0x00,0x00]
+
+v_max_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0xd9,0x00,0x00]
+
+v_max_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0xdb,0x00,0x00]
+
+v_max_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0xdd,0x00,0x00]
+
+v_max_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0xdf,0x00,0x00]
+
+v_max_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0xf7,0x00,0x00]
+
+v_max_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0xf9,0x00,0x00]
+
+v_max_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0xfd,0x00,0x00]
+
+v_max_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0xff,0x00,0x00]
+
+v_max_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0xfb,0x01,0x00]
+
+v_max_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0x05,0x02,0x00]
+
+v_max_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0xff,0x03,0x00]
+
+v_max_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0x05,0x00,0x20]
+
+v_max_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0x05,0x00,0x40]
+
+v_max_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0x05,0x00,0x60]
+
+v_max_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x20,0xd2,0x01,0x05,0x00,0x00]
+
+v_max_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x20,0xd2,0x01,0x05,0x00,0x00]
+
+v_max_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x20,0xd2,0x01,0x05,0x00,0x00]
+
+v_max_f32_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x08,0x20,0xd2,0x01,0x05,0x00,0x00]
+
+v_max_f32_e64 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0x05,0x00,0x08]
+
+v_max_f32_e64 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0x05,0x00,0x10]
+
+v_max_f32_e64 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x20,0xd2,0x01,0x05,0x00,0x18]
+
+v_min_i32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x22]
+
+v_min_i32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x23]
+
+v_min_i32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x22]
+
+v_min_i32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x22]
+
+v_min_i32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x22]
+
+v_min_i32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x22]
+
+v_min_i32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x22]
+
+v_min_i32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x22]
+
+v_min_i32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x22]
+
+v_min_i32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x22]
+
+v_min_i32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x22]
+
+v_min_i32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x22]
+
+v_min_i32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x22]
+
+v_min_i32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x22]
+
+v_min_i32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x22]
+
+v_min_i32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x22]
+
+v_min_i32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x22]
+
+v_min_i32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x22]
+
+v_min_i32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x22]
+
+v_min_i32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x22,0x56,0x34,0x12,0xaf]
+
+v_min_i32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x22,0x73,0x72,0x71,0x3f]
+
+v_min_i32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x22]
+
+v_min_i32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x22]
+
+v_min_i32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x22]
+
+v_min_i32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0x04,0x00,0x00]
+
+v_min_i32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x22,0xd2,0x80,0x04,0x00,0x00]
+
+v_min_i32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x22,0xd2,0xc1,0x04,0x00,0x00]
+
+v_min_i32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x22,0xd2,0xf0,0x04,0x00,0x00]
+
+v_min_i32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x22,0xd2,0xf7,0x04,0x00,0x00]
+
+v_min_i32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x22,0xd2,0x01,0x05,0x00,0x00]
+
+v_min_i32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x22,0xd2,0xff,0x05,0x00,0x00]
+
+v_min_i32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0xce,0x00,0x00]
+
+v_min_i32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0xd0,0x00,0x00]
+
+v_min_i32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0xd2,0x00,0x00]
+
+v_min_i32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0xd4,0x00,0x00]
+
+v_min_i32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0xd6,0x00,0x00]
+
+v_min_i32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0xd8,0x00,0x00]
+
+v_min_i32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0xda,0x00,0x00]
+
+v_min_i32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0xdc,0x00,0x00]
+
+v_min_i32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0xde,0x00,0x00]
+
+v_min_i32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0xf6,0x00,0x00]
+
+v_min_i32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0xf8,0x00,0x00]
+
+v_min_i32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0xfc,0x00,0x00]
+
+v_min_i32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0xfe,0x00,0x00]
+
+v_min_i32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0x00,0x01,0x00]
+
+v_min_i32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0x82,0x01,0x00]
+
+v_min_i32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0xe0,0x01,0x00]
+
+v_min_i32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0xee,0x01,0x00]
+
+v_min_i32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0x04,0x02,0x00]
+
+v_min_i32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x22,0xd2,0x80,0xfe,0x03,0x00]
+
+v_max_i32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x24]
+
+v_max_i32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x25]
+
+v_max_i32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x24]
+
+v_max_i32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x24]
+
+v_max_i32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x24]
+
+v_max_i32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x24]
+
+v_max_i32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x24]
+
+v_max_i32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x24]
+
+v_max_i32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x24]
+
+v_max_i32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x24]
+
+v_max_i32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x24]
+
+v_max_i32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x24]
+
+v_max_i32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x24]
+
+v_max_i32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x24]
+
+v_max_i32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x24]
+
+v_max_i32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x24]
+
+v_max_i32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x24]
+
+v_max_i32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x24]
+
+v_max_i32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x24]
+
+v_max_i32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x24,0x56,0x34,0x12,0xaf]
+
+v_max_i32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x24,0x73,0x72,0x71,0x3f]
+
+v_max_i32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x24]
+
+v_max_i32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x24]
+
+v_max_i32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x24]
+
+v_max_i32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0x04,0x00,0x00]
+
+v_max_i32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x24,0xd2,0x80,0x04,0x00,0x00]
+
+v_max_i32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x24,0xd2,0xc1,0x04,0x00,0x00]
+
+v_max_i32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x24,0xd2,0xf0,0x04,0x00,0x00]
+
+v_max_i32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x24,0xd2,0xf7,0x04,0x00,0x00]
+
+v_max_i32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x24,0xd2,0x01,0x05,0x00,0x00]
+
+v_max_i32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x24,0xd2,0xff,0x05,0x00,0x00]
+
+v_max_i32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0xce,0x00,0x00]
+
+v_max_i32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0xd0,0x00,0x00]
+
+v_max_i32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0xd2,0x00,0x00]
+
+v_max_i32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0xd4,0x00,0x00]
+
+v_max_i32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0xd6,0x00,0x00]
+
+v_max_i32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0xd8,0x00,0x00]
+
+v_max_i32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0xda,0x00,0x00]
+
+v_max_i32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0xdc,0x00,0x00]
+
+v_max_i32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0xde,0x00,0x00]
+
+v_max_i32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0xf6,0x00,0x00]
+
+v_max_i32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0xf8,0x00,0x00]
+
+v_max_i32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0xfc,0x00,0x00]
+
+v_max_i32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0xfe,0x00,0x00]
+
+v_max_i32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0x00,0x01,0x00]
+
+v_max_i32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0x82,0x01,0x00]
+
+v_max_i32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0xe0,0x01,0x00]
+
+v_max_i32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0xee,0x01,0x00]
+
+v_max_i32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0x04,0x02,0x00]
+
+v_max_i32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x24,0xd2,0x80,0xfe,0x03,0x00]
+
+v_min_u32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x26]
+
+v_min_u32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x27]
+
+v_min_u32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x26]
+
+v_min_u32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x26]
+
+v_min_u32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x26]
+
+v_min_u32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x26]
+
+v_min_u32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x26]
+
+v_min_u32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x26]
+
+v_min_u32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x26]
+
+v_min_u32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x26]
+
+v_min_u32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x26]
+
+v_min_u32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x26]
+
+v_min_u32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x26]
+
+v_min_u32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x26]
+
+v_min_u32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x26]
+
+v_min_u32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x26]
+
+v_min_u32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x26]
+
+v_min_u32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x26]
+
+v_min_u32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x26]
+
+v_min_u32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x26,0x56,0x34,0x12,0xaf]
+
+v_min_u32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x26,0x73,0x72,0x71,0x3f]
+
+v_min_u32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x26]
+
+v_min_u32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x26]
+
+v_min_u32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x26]
+
+v_min_u32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0x04,0x00,0x00]
+
+v_min_u32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x26,0xd2,0x80,0x04,0x00,0x00]
+
+v_min_u32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x26,0xd2,0xc1,0x04,0x00,0x00]
+
+v_min_u32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x26,0xd2,0xf0,0x04,0x00,0x00]
+
+v_min_u32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x26,0xd2,0xf7,0x04,0x00,0x00]
+
+v_min_u32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x26,0xd2,0x01,0x05,0x00,0x00]
+
+v_min_u32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x26,0xd2,0xff,0x05,0x00,0x00]
+
+v_min_u32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0xce,0x00,0x00]
+
+v_min_u32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0xd0,0x00,0x00]
+
+v_min_u32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0xd2,0x00,0x00]
+
+v_min_u32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0xd4,0x00,0x00]
+
+v_min_u32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0xd6,0x00,0x00]
+
+v_min_u32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0xd8,0x00,0x00]
+
+v_min_u32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0xda,0x00,0x00]
+
+v_min_u32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0xdc,0x00,0x00]
+
+v_min_u32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0xde,0x00,0x00]
+
+v_min_u32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0xf6,0x00,0x00]
+
+v_min_u32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0xf8,0x00,0x00]
+
+v_min_u32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0xfc,0x00,0x00]
+
+v_min_u32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0xfe,0x00,0x00]
+
+v_min_u32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0x00,0x01,0x00]
+
+v_min_u32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0x82,0x01,0x00]
+
+v_min_u32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0xe0,0x01,0x00]
+
+v_min_u32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0xee,0x01,0x00]
+
+v_min_u32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0x04,0x02,0x00]
+
+v_min_u32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x26,0xd2,0x80,0xfe,0x03,0x00]
+
+v_max_u32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x28]
+
+v_max_u32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x29]
+
+v_max_u32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x28]
+
+v_max_u32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x28]
+
+v_max_u32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x28]
+
+v_max_u32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x28]
+
+v_max_u32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x28]
+
+v_max_u32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x28]
+
+v_max_u32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x28]
+
+v_max_u32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x28]
+
+v_max_u32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x28]
+
+v_max_u32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x28]
+
+v_max_u32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x28]
+
+v_max_u32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x28]
+
+v_max_u32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x28]
+
+v_max_u32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x28]
+
+v_max_u32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x28]
+
+v_max_u32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x28]
+
+v_max_u32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x28]
+
+v_max_u32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x28,0x56,0x34,0x12,0xaf]
+
+v_max_u32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x28,0x73,0x72,0x71,0x3f]
+
+v_max_u32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x28]
+
+v_max_u32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x28]
+
+v_max_u32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x28]
+
+v_max_u32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0x04,0x00,0x00]
+
+v_max_u32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x28,0xd2,0x80,0x04,0x00,0x00]
+
+v_max_u32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x28,0xd2,0xc1,0x04,0x00,0x00]
+
+v_max_u32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x28,0xd2,0xf0,0x04,0x00,0x00]
+
+v_max_u32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x28,0xd2,0xf7,0x04,0x00,0x00]
+
+v_max_u32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x28,0xd2,0x01,0x05,0x00,0x00]
+
+v_max_u32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x28,0xd2,0xff,0x05,0x00,0x00]
+
+v_max_u32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0xce,0x00,0x00]
+
+v_max_u32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0xd0,0x00,0x00]
+
+v_max_u32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0xd2,0x00,0x00]
+
+v_max_u32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0xd4,0x00,0x00]
+
+v_max_u32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0xd6,0x00,0x00]
+
+v_max_u32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0xd8,0x00,0x00]
+
+v_max_u32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0xda,0x00,0x00]
+
+v_max_u32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0xdc,0x00,0x00]
+
+v_max_u32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0xde,0x00,0x00]
+
+v_max_u32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0xf6,0x00,0x00]
+
+v_max_u32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0xf8,0x00,0x00]
+
+v_max_u32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0xfc,0x00,0x00]
+
+v_max_u32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0xfe,0x00,0x00]
+
+v_max_u32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0x00,0x01,0x00]
+
+v_max_u32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0x82,0x01,0x00]
+
+v_max_u32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0xe0,0x01,0x00]
+
+v_max_u32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0xee,0x01,0x00]
+
+v_max_u32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0x04,0x02,0x00]
+
+v_max_u32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x28,0xd2,0x80,0xfe,0x03,0x00]
+
+v_lshr_b32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x2a]
+
+v_lshr_b32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x2b]
+
+v_lshr_b32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x2a]
+
+v_lshr_b32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x2a]
+
+v_lshr_b32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x2a]
+
+v_lshr_b32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x2a]
+
+v_lshr_b32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x2a]
+
+v_lshr_b32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x2a]
+
+v_lshr_b32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x2a]
+
+v_lshr_b32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x2a]
+
+v_lshr_b32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x2a]
+
+v_lshr_b32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x2a]
+
+v_lshr_b32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x2a]
+
+v_lshr_b32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x2a]
+
+v_lshr_b32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x2a]
+
+v_lshr_b32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x2a]
+
+v_lshr_b32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x2a]
+
+v_lshr_b32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x2a]
+
+v_lshr_b32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x2a]
+
+v_lshr_b32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x2a,0x56,0x34,0x12,0xaf]
+
+v_lshr_b32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x2a,0x73,0x72,0x71,0x3f]
+
+v_lshr_b32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x2a]
+
+v_lshr_b32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x2a]
+
+v_lshr_b32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x2a]
+
+v_lshr_b32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0x04,0x00,0x00]
+
+v_lshr_b32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x2a,0xd2,0x80,0x04,0x00,0x00]
+
+v_lshr_b32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x2a,0xd2,0xc1,0x04,0x00,0x00]
+
+v_lshr_b32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x2a,0xd2,0xf0,0x04,0x00,0x00]
+
+v_lshr_b32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x2a,0xd2,0xf7,0x04,0x00,0x00]
+
+v_lshr_b32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x2a,0xd2,0x01,0x05,0x00,0x00]
+
+v_lshr_b32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x2a,0xd2,0xff,0x05,0x00,0x00]
+
+v_lshr_b32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0xce,0x00,0x00]
+
+v_lshr_b32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0xd0,0x00,0x00]
+
+v_lshr_b32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0xd2,0x00,0x00]
+
+v_lshr_b32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0xd4,0x00,0x00]
+
+v_lshr_b32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0xd6,0x00,0x00]
+
+v_lshr_b32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0xd8,0x00,0x00]
+
+v_lshr_b32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0xda,0x00,0x00]
+
+v_lshr_b32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0xdc,0x00,0x00]
+
+v_lshr_b32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0xde,0x00,0x00]
+
+v_lshr_b32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0xf6,0x00,0x00]
+
+v_lshr_b32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0xf8,0x00,0x00]
+
+v_lshr_b32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0xfc,0x00,0x00]
+
+v_lshr_b32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0xfe,0x00,0x00]
+
+v_lshr_b32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0x00,0x01,0x00]
+
+v_lshr_b32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0x82,0x01,0x00]
+
+v_lshr_b32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0xe0,0x01,0x00]
+
+v_lshr_b32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0xee,0x01,0x00]
+
+v_lshr_b32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0x04,0x02,0x00]
+
+v_lshr_b32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x2a,0xd2,0x80,0xfe,0x03,0x00]
+
+v_lshrrev_b32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x2c]
+
+v_lshrrev_b32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x2d]
+
+v_lshrrev_b32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x2c]
+
+v_lshrrev_b32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x2c]
+
+v_lshrrev_b32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x2c]
+
+v_lshrrev_b32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x2c]
+
+v_lshrrev_b32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x2c]
+
+v_lshrrev_b32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x2c]
+
+v_lshrrev_b32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x2c]
+
+v_lshrrev_b32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x2c]
+
+v_lshrrev_b32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x2c]
+
+v_lshrrev_b32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x2c]
+
+v_lshrrev_b32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x2c]
+
+v_lshrrev_b32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x2c]
+
+v_lshrrev_b32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x2c]
+
+v_lshrrev_b32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x2c]
+
+v_lshrrev_b32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x2c]
+
+v_lshrrev_b32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x2c]
+
+v_lshrrev_b32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x2c]
+
+v_lshrrev_b32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x2c,0x56,0x34,0x12,0xaf]
+
+v_lshrrev_b32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x2c,0x73,0x72,0x71,0x3f]
+
+v_lshrrev_b32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x2c]
+
+v_lshrrev_b32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x2c]
+
+v_lshrrev_b32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x2c]
+
+v_lshrrev_b32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0x04,0x00,0x00]
+
+v_lshrrev_b32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x2c,0xd2,0x80,0x04,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x2c,0xd2,0xc1,0x04,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x2c,0xd2,0xf0,0x04,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x2c,0xd2,0xf7,0x04,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x2c,0xd2,0x01,0x05,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x2c,0xd2,0xff,0x05,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0xce,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0xd0,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0xd2,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0xd4,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0xd6,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0xd8,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0xda,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0xdc,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0xde,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0xf6,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0xf8,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0xfc,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0xfe,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0x00,0x01,0x00]
+
+v_lshrrev_b32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0x82,0x01,0x00]
+
+v_lshrrev_b32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0xe0,0x01,0x00]
+
+v_lshrrev_b32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0xee,0x01,0x00]
+
+v_lshrrev_b32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0x04,0x02,0x00]
+
+v_lshrrev_b32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x2c,0xd2,0x80,0xfe,0x03,0x00]
+
+v_ashr_i32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x2e]
+
+v_ashr_i32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x2f]
+
+v_ashr_i32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x2e]
+
+v_ashr_i32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x2e]
+
+v_ashr_i32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x2e]
+
+v_ashr_i32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x2e]
+
+v_ashr_i32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x2e]
+
+v_ashr_i32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x2e]
+
+v_ashr_i32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x2e]
+
+v_ashr_i32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x2e]
+
+v_ashr_i32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x2e]
+
+v_ashr_i32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x2e]
+
+v_ashr_i32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x2e]
+
+v_ashr_i32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x2e]
+
+v_ashr_i32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x2e]
+
+v_ashr_i32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x2e]
+
+v_ashr_i32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x2e]
+
+v_ashr_i32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x2e]
+
+v_ashr_i32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x2e]
+
+v_ashr_i32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x2e,0x56,0x34,0x12,0xaf]
+
+v_ashr_i32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x2e,0x73,0x72,0x71,0x3f]
+
+v_ashr_i32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x2e]
+
+v_ashr_i32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x2e]
+
+v_ashr_i32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x2e]
+
+v_ashr_i32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0x04,0x00,0x00]
+
+v_ashr_i32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x2e,0xd2,0x80,0x04,0x00,0x00]
+
+v_ashr_i32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x2e,0xd2,0xc1,0x04,0x00,0x00]
+
+v_ashr_i32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x2e,0xd2,0xf0,0x04,0x00,0x00]
+
+v_ashr_i32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x2e,0xd2,0xf7,0x04,0x00,0x00]
+
+v_ashr_i32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x2e,0xd2,0x01,0x05,0x00,0x00]
+
+v_ashr_i32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x2e,0xd2,0xff,0x05,0x00,0x00]
+
+v_ashr_i32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0xce,0x00,0x00]
+
+v_ashr_i32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0xd0,0x00,0x00]
+
+v_ashr_i32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0xd2,0x00,0x00]
+
+v_ashr_i32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0xd4,0x00,0x00]
+
+v_ashr_i32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0xd6,0x00,0x00]
+
+v_ashr_i32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0xd8,0x00,0x00]
+
+v_ashr_i32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0xda,0x00,0x00]
+
+v_ashr_i32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0xdc,0x00,0x00]
+
+v_ashr_i32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0xde,0x00,0x00]
+
+v_ashr_i32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0xf6,0x00,0x00]
+
+v_ashr_i32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0xf8,0x00,0x00]
+
+v_ashr_i32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0xfc,0x00,0x00]
+
+v_ashr_i32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0xfe,0x00,0x00]
+
+v_ashr_i32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0x00,0x01,0x00]
+
+v_ashr_i32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0x82,0x01,0x00]
+
+v_ashr_i32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0xe0,0x01,0x00]
+
+v_ashr_i32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0xee,0x01,0x00]
+
+v_ashr_i32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0x04,0x02,0x00]
+
+v_ashr_i32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x2e,0xd2,0x80,0xfe,0x03,0x00]
+
+v_ashrrev_i32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x30]
+
+v_ashrrev_i32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x31]
+
+v_ashrrev_i32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x30]
+
+v_ashrrev_i32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x30]
+
+v_ashrrev_i32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x30]
+
+v_ashrrev_i32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x30]
+
+v_ashrrev_i32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x30]
+
+v_ashrrev_i32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x30]
+
+v_ashrrev_i32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x30]
+
+v_ashrrev_i32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x30]
+
+v_ashrrev_i32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x30]
+
+v_ashrrev_i32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x30]
+
+v_ashrrev_i32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x30]
+
+v_ashrrev_i32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x30]
+
+v_ashrrev_i32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x30]
+
+v_ashrrev_i32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x30]
+
+v_ashrrev_i32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x30]
+
+v_ashrrev_i32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x30]
+
+v_ashrrev_i32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x30]
+
+v_ashrrev_i32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x30,0x56,0x34,0x12,0xaf]
+
+v_ashrrev_i32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x30,0x73,0x72,0x71,0x3f]
+
+v_ashrrev_i32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x30]
+
+v_ashrrev_i32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x30]
+
+v_ashrrev_i32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x30]
+
+v_ashrrev_i32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0x04,0x00,0x00]
+
+v_ashrrev_i32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x30,0xd2,0x80,0x04,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x30,0xd2,0xc1,0x04,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x30,0xd2,0xf0,0x04,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x30,0xd2,0xf7,0x04,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x30,0xd2,0x01,0x05,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x30,0xd2,0xff,0x05,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0xce,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0xd0,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0xd2,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0xd4,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0xd6,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0xd8,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0xda,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0xdc,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0xde,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0xf6,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0xf8,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0xfc,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0xfe,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0x00,0x01,0x00]
+
+v_ashrrev_i32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0x82,0x01,0x00]
+
+v_ashrrev_i32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0xe0,0x01,0x00]
+
+v_ashrrev_i32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0xee,0x01,0x00]
+
+v_ashrrev_i32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0x04,0x02,0x00]
+
+v_ashrrev_i32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x30,0xd2,0x80,0xfe,0x03,0x00]
+
+v_lshl_b32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x32]
+
+v_lshl_b32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x33]
+
+v_lshl_b32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x32]
+
+v_lshl_b32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x32]
+
+v_lshl_b32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x32]
+
+v_lshl_b32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x32]
+
+v_lshl_b32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x32]
+
+v_lshl_b32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x32]
+
+v_lshl_b32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x32]
+
+v_lshl_b32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x32]
+
+v_lshl_b32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x32]
+
+v_lshl_b32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x32]
+
+v_lshl_b32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x32]
+
+v_lshl_b32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x32]
+
+v_lshl_b32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x32]
+
+v_lshl_b32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x32]
+
+v_lshl_b32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x32]
+
+v_lshl_b32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x32]
+
+v_lshl_b32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x32]
+
+v_lshl_b32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x32,0x56,0x34,0x12,0xaf]
+
+v_lshl_b32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x32,0x73,0x72,0x71,0x3f]
+
+v_lshl_b32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x32]
+
+v_lshl_b32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x32]
+
+v_lshl_b32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x32]
+
+v_lshl_b32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0x04,0x00,0x00]
+
+v_lshl_b32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x32,0xd2,0x80,0x04,0x00,0x00]
+
+v_lshl_b32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x32,0xd2,0xc1,0x04,0x00,0x00]
+
+v_lshl_b32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x32,0xd2,0xf0,0x04,0x00,0x00]
+
+v_lshl_b32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x32,0xd2,0xf7,0x04,0x00,0x00]
+
+v_lshl_b32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x32,0xd2,0x01,0x05,0x00,0x00]
+
+v_lshl_b32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x32,0xd2,0xff,0x05,0x00,0x00]
+
+v_lshl_b32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0xce,0x00,0x00]
+
+v_lshl_b32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0xd0,0x00,0x00]
+
+v_lshl_b32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0xd2,0x00,0x00]
+
+v_lshl_b32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0xd4,0x00,0x00]
+
+v_lshl_b32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0xd6,0x00,0x00]
+
+v_lshl_b32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0xd8,0x00,0x00]
+
+v_lshl_b32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0xda,0x00,0x00]
+
+v_lshl_b32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0xdc,0x00,0x00]
+
+v_lshl_b32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0xde,0x00,0x00]
+
+v_lshl_b32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0xf6,0x00,0x00]
+
+v_lshl_b32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0xf8,0x00,0x00]
+
+v_lshl_b32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0xfc,0x00,0x00]
+
+v_lshl_b32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0xfe,0x00,0x00]
+
+v_lshl_b32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0x00,0x01,0x00]
+
+v_lshl_b32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0x82,0x01,0x00]
+
+v_lshl_b32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0xe0,0x01,0x00]
+
+v_lshl_b32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0xee,0x01,0x00]
+
+v_lshl_b32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0x04,0x02,0x00]
+
+v_lshl_b32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x32,0xd2,0x80,0xfe,0x03,0x00]
+
+v_lshlrev_b32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x34]
+
+v_lshlrev_b32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x35]
+
+v_lshlrev_b32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x34]
+
+v_lshlrev_b32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x34]
+
+v_lshlrev_b32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x34]
+
+v_lshlrev_b32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x34]
+
+v_lshlrev_b32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x34]
+
+v_lshlrev_b32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x34]
+
+v_lshlrev_b32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x34]
+
+v_lshlrev_b32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x34]
+
+v_lshlrev_b32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x34]
+
+v_lshlrev_b32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x34]
+
+v_lshlrev_b32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x34]
+
+v_lshlrev_b32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x34]
+
+v_lshlrev_b32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x34]
+
+v_lshlrev_b32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x34]
+
+v_lshlrev_b32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x34]
+
+v_lshlrev_b32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x34]
+
+v_lshlrev_b32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x34]
+
+v_lshlrev_b32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x34,0x56,0x34,0x12,0xaf]
+
+v_lshlrev_b32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x34,0x73,0x72,0x71,0x3f]
+
+v_lshlrev_b32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x34]
+
+v_lshlrev_b32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x34]
+
+v_lshlrev_b32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x34]
+
+v_lshlrev_b32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0x04,0x00,0x00]
+
+v_lshlrev_b32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x34,0xd2,0x80,0x04,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x34,0xd2,0xc1,0x04,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x34,0xd2,0xf0,0x04,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x34,0xd2,0xf7,0x04,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x34,0xd2,0x01,0x05,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x34,0xd2,0xff,0x05,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0xce,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0xd0,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0xd2,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0xd4,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0xd6,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0xd8,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0xda,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0xdc,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0xde,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0xf6,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0xf8,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0xfc,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0xfe,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0x00,0x01,0x00]
+
+v_lshlrev_b32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0x82,0x01,0x00]
+
+v_lshlrev_b32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0xe0,0x01,0x00]
+
+v_lshlrev_b32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0xee,0x01,0x00]
+
+v_lshlrev_b32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0x04,0x02,0x00]
+
+v_lshlrev_b32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x34,0xd2,0x80,0xfe,0x03,0x00]
+
+v_and_b32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x36]
+
+v_and_b32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x37]
+
+v_and_b32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x36]
+
+v_and_b32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x36]
+
+v_and_b32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x36]
+
+v_and_b32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x36]
+
+v_and_b32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x36]
+
+v_and_b32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x36]
+
+v_and_b32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x36]
+
+v_and_b32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x36]
+
+v_and_b32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x36]
+
+v_and_b32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x36]
+
+v_and_b32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x36]
+
+v_and_b32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x36]
+
+v_and_b32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x36]
+
+v_and_b32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x36]
+
+v_and_b32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x36]
+
+v_and_b32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x36]
+
+v_and_b32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x36]
+
+v_and_b32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x36,0x56,0x34,0x12,0xaf]
+
+v_and_b32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x36,0x73,0x72,0x71,0x3f]
+
+v_and_b32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x36]
+
+v_and_b32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x36]
+
+v_and_b32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x36]
+
+v_and_b32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0x04,0x00,0x00]
+
+v_and_b32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x36,0xd2,0x80,0x04,0x00,0x00]
+
+v_and_b32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x36,0xd2,0xc1,0x04,0x00,0x00]
+
+v_and_b32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x36,0xd2,0xf0,0x04,0x00,0x00]
+
+v_and_b32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x36,0xd2,0xf7,0x04,0x00,0x00]
+
+v_and_b32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x36,0xd2,0x01,0x05,0x00,0x00]
+
+v_and_b32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x36,0xd2,0xff,0x05,0x00,0x00]
+
+v_and_b32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0xce,0x00,0x00]
+
+v_and_b32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0xd0,0x00,0x00]
+
+v_and_b32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0xd2,0x00,0x00]
+
+v_and_b32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0xd4,0x00,0x00]
+
+v_and_b32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0xd6,0x00,0x00]
+
+v_and_b32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0xd8,0x00,0x00]
+
+v_and_b32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0xda,0x00,0x00]
+
+v_and_b32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0xdc,0x00,0x00]
+
+v_and_b32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0xde,0x00,0x00]
+
+v_and_b32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0xf6,0x00,0x00]
+
+v_and_b32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0xf8,0x00,0x00]
+
+v_and_b32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0xfc,0x00,0x00]
+
+v_and_b32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0xfe,0x00,0x00]
+
+v_and_b32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0x00,0x01,0x00]
+
+v_and_b32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0x82,0x01,0x00]
+
+v_and_b32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0xe0,0x01,0x00]
+
+v_and_b32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0xee,0x01,0x00]
+
+v_and_b32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0x04,0x02,0x00]
+
+v_and_b32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x36,0xd2,0x80,0xfe,0x03,0x00]
+
+v_or_b32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x38]
+
+v_or_b32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x39]
+
+v_or_b32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x38]
+
+v_or_b32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x38]
+
+v_or_b32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x38]
+
+v_or_b32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x38]
+
+v_or_b32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x38]
+
+v_or_b32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x38]
+
+v_or_b32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x38]
+
+v_or_b32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x38]
+
+v_or_b32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x38]
+
+v_or_b32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x38]
+
+v_or_b32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x38]
+
+v_or_b32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x38]
+
+v_or_b32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x38]
+
+v_or_b32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x38]
+
+v_or_b32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x38]
+
+v_or_b32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x38]
+
+v_or_b32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x38]
+
+v_or_b32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x38,0x56,0x34,0x12,0xaf]
+
+v_or_b32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x38,0x73,0x72,0x71,0x3f]
+
+v_or_b32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x38]
+
+v_or_b32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x38]
+
+v_or_b32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x38]
+
+v_or_b32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0x04,0x00,0x00]
+
+v_or_b32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x38,0xd2,0x80,0x04,0x00,0x00]
+
+v_or_b32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x38,0xd2,0xc1,0x04,0x00,0x00]
+
+v_or_b32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x38,0xd2,0xf0,0x04,0x00,0x00]
+
+v_or_b32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x38,0xd2,0xf7,0x04,0x00,0x00]
+
+v_or_b32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x38,0xd2,0x01,0x05,0x00,0x00]
+
+v_or_b32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x38,0xd2,0xff,0x05,0x00,0x00]
+
+v_or_b32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0xce,0x00,0x00]
+
+v_or_b32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0xd0,0x00,0x00]
+
+v_or_b32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0xd2,0x00,0x00]
+
+v_or_b32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0xd4,0x00,0x00]
+
+v_or_b32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0xd6,0x00,0x00]
+
+v_or_b32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0xd8,0x00,0x00]
+
+v_or_b32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0xda,0x00,0x00]
+
+v_or_b32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0xdc,0x00,0x00]
+
+v_or_b32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0xde,0x00,0x00]
+
+v_or_b32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0xf6,0x00,0x00]
+
+v_or_b32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0xf8,0x00,0x00]
+
+v_or_b32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0xfc,0x00,0x00]
+
+v_or_b32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0xfe,0x00,0x00]
+
+v_or_b32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0x00,0x01,0x00]
+
+v_or_b32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0x82,0x01,0x00]
+
+v_or_b32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0xe0,0x01,0x00]
+
+v_or_b32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0xee,0x01,0x00]
+
+v_or_b32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0x04,0x02,0x00]
+
+v_or_b32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x38,0xd2,0x80,0xfe,0x03,0x00]
+
+v_xor_b32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x3a]
+
+v_xor_b32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x3b]
+
+v_xor_b32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x3a]
+
+v_xor_b32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x3a]
+
+v_xor_b32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x3a]
+
+v_xor_b32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x3a]
+
+v_xor_b32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x3a]
+
+v_xor_b32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x3a]
+
+v_xor_b32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x3a]
+
+v_xor_b32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x3a]
+
+v_xor_b32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x3a]
+
+v_xor_b32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x3a]
+
+v_xor_b32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x3a]
+
+v_xor_b32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x3a]
+
+v_xor_b32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x3a]
+
+v_xor_b32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x3a]
+
+v_xor_b32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x3a]
+
+v_xor_b32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x3a]
+
+v_xor_b32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x3a]
+
+v_xor_b32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x3a,0x56,0x34,0x12,0xaf]
+
+v_xor_b32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x3a,0x73,0x72,0x71,0x3f]
+
+v_xor_b32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x3a]
+
+v_xor_b32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x3a]
+
+v_xor_b32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x3a]
+
+v_xor_b32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0x04,0x00,0x00]
+
+v_xor_b32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x3a,0xd2,0x80,0x04,0x00,0x00]
+
+v_xor_b32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x3a,0xd2,0xc1,0x04,0x00,0x00]
+
+v_xor_b32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x3a,0xd2,0xf0,0x04,0x00,0x00]
+
+v_xor_b32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x3a,0xd2,0xf7,0x04,0x00,0x00]
+
+v_xor_b32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x3a,0xd2,0x01,0x05,0x00,0x00]
+
+v_xor_b32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x3a,0xd2,0xff,0x05,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0xce,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0xd0,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0xd2,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0xd4,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0xd6,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0xd8,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0xda,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0xdc,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0xde,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0xf6,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0xf8,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0xfc,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0xfe,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0x00,0x01,0x00]
+
+v_xor_b32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0x82,0x01,0x00]
+
+v_xor_b32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0xe0,0x01,0x00]
+
+v_xor_b32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0xee,0x01,0x00]
+
+v_xor_b32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0x04,0x02,0x00]
+
+v_xor_b32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x3a,0xd2,0x80,0xfe,0x03,0x00]
+
+v_bfm_b32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x3c]
+
+v_bfm_b32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x3d]
+
+v_bfm_b32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x3c]
+
+v_bfm_b32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x3c]
+
+v_bfm_b32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x3c]
+
+v_bfm_b32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x3c]
+
+v_bfm_b32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x3c]
+
+v_bfm_b32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x3c]
+
+v_bfm_b32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x3c]
+
+v_bfm_b32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x3c]
+
+v_bfm_b32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x3c]
+
+v_bfm_b32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x3c]
+
+v_bfm_b32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x3c]
+
+v_bfm_b32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x3c]
+
+v_bfm_b32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x3c]
+
+v_bfm_b32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x3c]
+
+v_bfm_b32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x3c]
+
+v_bfm_b32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x3c]
+
+v_bfm_b32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x3c]
+
+v_bfm_b32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x3c,0x56,0x34,0x12,0xaf]
+
+v_bfm_b32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x3c,0x73,0x72,0x71,0x3f]
+
+v_bfm_b32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x3c]
+
+v_bfm_b32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x3c]
+
+v_bfm_b32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x3c]
+
+v_bfm_b32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0x04,0x00,0x00]
+
+v_bfm_b32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x3c,0xd2,0x80,0x04,0x00,0x00]
+
+v_bfm_b32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x3c,0xd2,0xc1,0x04,0x00,0x00]
+
+v_bfm_b32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x3c,0xd2,0xf0,0x04,0x00,0x00]
+
+v_bfm_b32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x3c,0xd2,0xf7,0x04,0x00,0x00]
+
+v_bfm_b32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x3c,0xd2,0x01,0x05,0x00,0x00]
+
+v_bfm_b32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x3c,0xd2,0xff,0x05,0x00,0x00]
+
+v_bfm_b32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0xce,0x00,0x00]
+
+v_bfm_b32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0xd0,0x00,0x00]
+
+v_bfm_b32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0xd2,0x00,0x00]
+
+v_bfm_b32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0xd4,0x00,0x00]
+
+v_bfm_b32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0xd6,0x00,0x00]
+
+v_bfm_b32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0xd8,0x00,0x00]
+
+v_bfm_b32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0xda,0x00,0x00]
+
+v_bfm_b32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0xdc,0x00,0x00]
+
+v_bfm_b32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0xde,0x00,0x00]
+
+v_bfm_b32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0xf6,0x00,0x00]
+
+v_bfm_b32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0xf8,0x00,0x00]
+
+v_bfm_b32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0xfc,0x00,0x00]
+
+v_bfm_b32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0xfe,0x00,0x00]
+
+v_bfm_b32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0x00,0x01,0x00]
+
+v_bfm_b32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0x82,0x01,0x00]
+
+v_bfm_b32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0xe0,0x01,0x00]
+
+v_bfm_b32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0xee,0x01,0x00]
+
+v_bfm_b32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0x04,0x02,0x00]
+
+v_bfm_b32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x3c,0xd2,0x80,0xfe,0x03,0x00]
+
+v_mac_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x3e]
+
+v_mac_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x3f]
+
+v_mac_f32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x3e]
+
+v_mac_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x3e]
+
+v_mac_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x3e]
+
+v_mac_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x3e]
+
+v_mac_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x3e]
+
+v_mac_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x3e]
+
+v_mac_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x3e]
+
+v_mac_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x3e]
+
+v_mac_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x3e]
+
+v_mac_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x3e]
+
+v_mac_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x3e]
+
+v_mac_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x3e]
+
+v_mac_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x3e]
+
+v_mac_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x3e]
+
+v_mac_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x3e]
+
+v_mac_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x3e]
+
+v_mac_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x3e]
+
+v_mac_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x3e,0x56,0x34,0x12,0xaf]
+
+v_mac_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x3e,0x73,0x72,0x71,0x3f]
+
+v_mac_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x3e]
+
+v_mac_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x3e]
+
+v_mac_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x3e]
+
+v_mac_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0x05,0x00,0x00]
+
+v_mac_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x3e,0xd2,0x01,0x05,0x00,0x00]
+
+v_mac_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x3e,0xd2,0xff,0x05,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, s103
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0xcf,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0xd1,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0xd3,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0xd5,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0xd7,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0xd9,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0xdb,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0xdd,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0xdf,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0xf7,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0xf9,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0xfd,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0xff,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0xfb,0x01,0x00]
+
+v_mac_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0x05,0x02,0x00]
+
+v_mac_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0xff,0x03,0x00]
+
+v_mac_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0x05,0x00,0x20]
+
+v_mac_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0x05,0x00,0x40]
+
+v_mac_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0x05,0x00,0x60]
+
+v_mac_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x3e,0xd2,0x01,0x05,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x3e,0xd2,0x01,0x05,0x00,0x00]
+
+v_mac_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x3e,0xd2,0x01,0x05,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x08,0x3e,0xd2,0x01,0x05,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0x05,0x00,0x08]
+
+v_mac_f32_e64 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0x05,0x00,0x10]
+
+v_mac_f32_e64 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x3e,0xd2,0x01,0x05,0x00,0x18]
+
+v_madmk_f32 v5, 0, 0x11213141, v3
+// CHECK: [0x80,0x06,0x0a,0x40,0x41,0x31,0x21,0x11]
+
+v_madmk_f32 v255, 0, 0x11213141, v3
+// CHECK: [0x80,0x06,0xfe,0x41,0x41,0x31,0x21,0x11]
+
+v_madmk_f32 v5, -1, 0x11213141, v3
+// CHECK: [0xc1,0x06,0x0a,0x40,0x41,0x31,0x21,0x11]
+
+v_madmk_f32 v5, 0.5, 0x11213141, v3
+// CHECK: [0xf0,0x06,0x0a,0x40,0x41,0x31,0x21,0x11]
+
+v_madmk_f32 v5, -4.0, 0x11213141, v3
+// CHECK: [0xf7,0x06,0x0a,0x40,0x41,0x31,0x21,0x11]
+
+v_madmk_f32 v5, v1, 0x11213141, v3
+// CHECK: [0x01,0x07,0x0a,0x40,0x41,0x31,0x21,0x11]
+
+v_madmk_f32 v5, v255, 0x11213141, v3
+// CHECK: [0xff,0x07,0x0a,0x40,0x41,0x31,0x21,0x11]
+
+v_madmk_f32 v5, 0, 0xa1b1c1d1, v3
+// CHECK: [0x80,0x06,0x0a,0x40,0xd1,0xc1,0xb1,0xa1]
+
+v_madmk_f32 v5, 0, 0x11213141, v255
+// CHECK: [0x80,0xfe,0x0b,0x40,0x41,0x31,0x21,0x11]
+
+v_madak_f32 v5, 0, v2, 0x11213141
+// CHECK: [0x80,0x04,0x0a,0x42,0x41,0x31,0x21,0x11]
+
+v_madak_f32 v255, 0, v2, 0x11213141
+// CHECK: [0x80,0x04,0xfe,0x43,0x41,0x31,0x21,0x11]
+
+v_madak_f32 v5, -1, v2, 0x11213141
+// CHECK: [0xc1,0x04,0x0a,0x42,0x41,0x31,0x21,0x11]
+
+v_madak_f32 v5, 0.5, v2, 0x11213141
+// CHECK: [0xf0,0x04,0x0a,0x42,0x41,0x31,0x21,0x11]
+
+v_madak_f32 v5, -4.0, v2, 0x11213141
+// CHECK: [0xf7,0x04,0x0a,0x42,0x41,0x31,0x21,0x11]
+
+v_madak_f32 v5, v1, v2, 0x11213141
+// CHECK: [0x01,0x05,0x0a,0x42,0x41,0x31,0x21,0x11]
+
+v_madak_f32 v5, v255, v2, 0x11213141
+// CHECK: [0xff,0x05,0x0a,0x42,0x41,0x31,0x21,0x11]
+
+v_madak_f32 v5, 0, v255, 0x11213141
+// CHECK: [0x80,0xfe,0x0b,0x42,0x41,0x31,0x21,0x11]
+
+v_madak_f32 v5, 0, v2, 0xa1b1c1d1
+// CHECK: [0x80,0x04,0x0a,0x42,0xd1,0xc1,0xb1,0xa1]
+
+v_bcnt_u32_b32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x44]
+
+v_bcnt_u32_b32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x45]
+
+v_bcnt_u32_b32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x44]
+
+v_bcnt_u32_b32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x44]
+
+v_bcnt_u32_b32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x44]
+
+v_bcnt_u32_b32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x44]
+
+v_bcnt_u32_b32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x44]
+
+v_bcnt_u32_b32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x44]
+
+v_bcnt_u32_b32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x44]
+
+v_bcnt_u32_b32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x44]
+
+v_bcnt_u32_b32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x44]
+
+v_bcnt_u32_b32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x44]
+
+v_bcnt_u32_b32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x44]
+
+v_bcnt_u32_b32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x44]
+
+v_bcnt_u32_b32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x44]
+
+v_bcnt_u32_b32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x44]
+
+v_bcnt_u32_b32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x44]
+
+v_bcnt_u32_b32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x44]
+
+v_bcnt_u32_b32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x44]
+
+v_bcnt_u32_b32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x44,0x56,0x34,0x12,0xaf]
+
+v_bcnt_u32_b32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x44,0x73,0x72,0x71,0x3f]
+
+v_bcnt_u32_b32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x44]
+
+v_bcnt_u32_b32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x44]
+
+v_bcnt_u32_b32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x44]
+
+v_bcnt_u32_b32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0x04,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x44,0xd2,0x80,0x04,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x44,0xd2,0xc1,0x04,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x44,0xd2,0xf0,0x04,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x44,0xd2,0xf7,0x04,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x44,0xd2,0x01,0x05,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x44,0xd2,0xff,0x05,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0xce,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0xd0,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0xd2,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0xd4,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0xd6,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0xd8,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0xda,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0xdc,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0xde,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0xf6,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0xf8,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0xfc,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0xfe,0x00,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0x00,0x01,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0x82,0x01,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0xe0,0x01,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0xee,0x01,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0x04,0x02,0x00]
+
+v_bcnt_u32_b32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x44,0xd2,0x80,0xfe,0x03,0x00]
+
+v_mbcnt_lo_u32_b32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x47]
+
+v_mbcnt_lo_u32_b32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x46,0x56,0x34,0x12,0xaf]
+
+v_mbcnt_lo_u32_b32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x46,0x73,0x72,0x71,0x3f]
+
+v_mbcnt_lo_u32_b32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x46]
+
+v_mbcnt_lo_u32_b32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x46]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0x04,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x46,0xd2,0x80,0x04,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x46,0xd2,0xc1,0x04,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x46,0xd2,0xf0,0x04,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x46,0xd2,0xf7,0x04,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x46,0xd2,0x01,0x05,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x46,0xd2,0xff,0x05,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0xce,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0xd0,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0xd2,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0xd4,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0xd6,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0xd8,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0xda,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0xdc,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0xde,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0xf6,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0xf8,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0xfc,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0xfe,0x00,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0x00,0x01,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0x82,0x01,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0xe0,0x01,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0xee,0x01,0x00]
+
+v_mbcnt_lo_u32_b32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0x04,0x02,0x00]
 
-v_movreld_b32_e64 v0, -1
-// CHECK: [0x00,0x00,0x84,0xd3,0xc1,0x00,0x00,0x00]
+v_mbcnt_lo_u32_b32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x46,0xd2,0x80,0xfe,0x03,0x00]
 
-v_movreld_b32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x84,0xd3,0xf0,0x00,0x00,0x00]
+v_mbcnt_hi_u32_b32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x48]
 
-v_movreld_b32_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x84,0xd3,0xf7,0x00,0x00,0x00]
+v_mbcnt_hi_u32_b32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x49]
 
-v_movreld_b32_e64 v0, v0
-// CHECK: [0x00,0x00,0x84,0xd3,0x00,0x01,0x00,0x00]
+v_mbcnt_hi_u32_b32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x48]
 
-v_movreld_b32_e64 v0, v255
-// CHECK: [0x00,0x00,0x84,0xd3,0xff,0x01,0x00,0x00]
+v_mbcnt_hi_u32_b32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x48]
 
-v_movrels_b32 v0, v0
-// CHECK: [0x00,0x87,0x00,0x7e]
+v_mbcnt_hi_u32_b32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x48]
 
-v_movrels_b32 v255, v0
-// CHECK: [0x00,0x87,0xfe,0x7f]
+v_mbcnt_hi_u32_b32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x48]
 
-v_movrels_b32 v0, v255
-// CHECK: [0xff,0x87,0x00,0x7e]
+v_mbcnt_hi_u32_b32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x48]
 
-v_movrels_b32_e64 v0, v0
-// CHECK: [0x00,0x00,0x86,0xd3,0x00,0x01,0x00,0x00]
+v_mbcnt_hi_u32_b32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x48]
 
-v_movrels_b32_e64 v255, v0
-// CHECK: [0xff,0x00,0x86,0xd3,0x00,0x01,0x00,0x00]
+v_mbcnt_hi_u32_b32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x48]
 
-v_movrels_b32_e64 v0, v255
-// CHECK: [0x00,0x00,0x86,0xd3,0xff,0x01,0x00,0x00]
+v_mbcnt_hi_u32_b32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x48]
 
-v_movrelsd_b32 v0, v0
-// CHECK: [0x00,0x89,0x00,0x7e]
+v_mbcnt_hi_u32_b32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x48]
 
-v_movrelsd_b32 v255, v0
-// CHECK: [0x00,0x89,0xfe,0x7f]
+v_mbcnt_hi_u32_b32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x48]
 
-v_movrelsd_b32 v0, v255
-// CHECK: [0xff,0x89,0x00,0x7e]
+v_mbcnt_hi_u32_b32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x48]
 
-v_movrelsd_b32_e64 v0, v0
-// CHECK: [0x00,0x00,0x88,0xd3,0x00,0x01,0x00,0x00]
+v_mbcnt_hi_u32_b32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x48]
 
-v_movrelsd_b32_e64 v255, v0
-// CHECK: [0xff,0x00,0x88,0xd3,0x00,0x01,0x00,0x00]
+v_mbcnt_hi_u32_b32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x48]
 
-v_movrelsd_b32_e64 v0, v255
-// CHECK: [0x00,0x00,0x88,0xd3,0xff,0x01,0x00,0x00]
+v_mbcnt_hi_u32_b32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x48]
 
-v_log_legacy_f32 v0, s0
-// CHECK: [0x00,0x8a,0x00,0x7e]
+v_mbcnt_hi_u32_b32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x48]
 
-v_log_legacy_f32 v255, s0
-// CHECK: [0x00,0x8a,0xfe,0x7f]
+v_mbcnt_hi_u32_b32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x48]
 
-v_log_legacy_f32 v0, s103
-// CHECK: [0x67,0x8a,0x00,0x7e]
+v_mbcnt_hi_u32_b32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x48]
 
-v_log_legacy_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x8a,0x00,0x7e]
+v_mbcnt_hi_u32_b32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x48,0x56,0x34,0x12,0xaf]
 
-v_log_legacy_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x8a,0x00,0x7e]
+v_mbcnt_hi_u32_b32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x48,0x73,0x72,0x71,0x3f]
 
-v_log_legacy_f32 v0, vcc_lo
-// CHECK: [0x6a,0x8a,0x00,0x7e]
+v_mbcnt_hi_u32_b32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x48]
 
-v_log_legacy_f32 v0, vcc_hi
-// CHECK: [0x6b,0x8a,0x00,0x7e]
+v_mbcnt_hi_u32_b32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x48]
 
-v_log_legacy_f32 v0, tba_lo
-// CHECK: [0x6c,0x8a,0x00,0x7e]
+v_mbcnt_hi_u32_b32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x48]
 
-v_log_legacy_f32 v0, tba_hi
-// CHECK: [0x6d,0x8a,0x00,0x7e]
+v_mbcnt_hi_u32_b32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0x04,0x00,0x00]
 
-v_log_legacy_f32 v0, tma_lo
-// CHECK: [0x6e,0x8a,0x00,0x7e]
+v_mbcnt_hi_u32_b32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x48,0xd2,0x80,0x04,0x00,0x00]
 
-v_log_legacy_f32 v0, tma_hi
-// CHECK: [0x6f,0x8a,0x00,0x7e]
+v_mbcnt_hi_u32_b32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x48,0xd2,0xc1,0x04,0x00,0x00]
 
-v_log_legacy_f32 v0, ttmp11
-// CHECK: [0x7b,0x8a,0x00,0x7e]
+v_mbcnt_hi_u32_b32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x48,0xd2,0xf0,0x04,0x00,0x00]
 
-v_log_legacy_f32 v0, m0
-// CHECK: [0x7c,0x8a,0x00,0x7e]
+v_mbcnt_hi_u32_b32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x48,0xd2,0xf7,0x04,0x00,0x00]
 
-v_log_legacy_f32 v0, exec_lo
-// CHECK: [0x7e,0x8a,0x00,0x7e]
+v_mbcnt_hi_u32_b32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x48,0xd2,0x01,0x05,0x00,0x00]
 
-v_log_legacy_f32 v0, exec_hi
-// CHECK: [0x7f,0x8a,0x00,0x7e]
+v_mbcnt_hi_u32_b32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x48,0xd2,0xff,0x05,0x00,0x00]
 
-v_log_legacy_f32 v0, 0
-// CHECK: [0x80,0x8a,0x00,0x7e]
+v_mbcnt_hi_u32_b32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0xce,0x00,0x00]
 
-v_log_legacy_f32 v0, -1
-// CHECK: [0xc1,0x8a,0x00,0x7e]
+v_mbcnt_hi_u32_b32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0xd0,0x00,0x00]
 
-v_log_legacy_f32 v0, 0.5
-// CHECK: [0xf0,0x8a,0x00,0x7e]
+v_mbcnt_hi_u32_b32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0xd2,0x00,0x00]
 
-v_log_legacy_f32 v0, -4.0
-// CHECK: [0xf7,0x8a,0x00,0x7e]
+v_mbcnt_hi_u32_b32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0xd4,0x00,0x00]
 
-v_log_legacy_f32 v0, 0xaf123456
-// CHECK: [0xff,0x8a,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_mbcnt_hi_u32_b32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0xd6,0x00,0x00]
 
-v_log_legacy_f32 v0, 0x3f717273
-// CHECK: [0xff,0x8a,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_mbcnt_hi_u32_b32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0xd8,0x00,0x00]
 
-v_log_legacy_f32 v0, v0
-// CHECK: [0x00,0x8b,0x00,0x7e]
+v_mbcnt_hi_u32_b32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0xda,0x00,0x00]
 
-v_log_legacy_f32 v0, v255
-// CHECK: [0xff,0x8b,0x00,0x7e]
+v_mbcnt_hi_u32_b32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0xdc,0x00,0x00]
 
-v_log_legacy_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x8a,0xd3,0x00,0x00,0x00,0x00]
+v_mbcnt_hi_u32_b32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0xde,0x00,0x00]
 
-v_log_legacy_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x8a,0xd3,0x00,0x00,0x00,0x00]
+v_mbcnt_hi_u32_b32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0xf6,0x00,0x00]
 
-v_log_legacy_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x8a,0xd3,0x67,0x00,0x00,0x00]
+v_mbcnt_hi_u32_b32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0xf8,0x00,0x00]
 
-v_log_legacy_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x8a,0xd3,0x68,0x00,0x00,0x00]
+v_mbcnt_hi_u32_b32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0xfc,0x00,0x00]
 
-v_log_legacy_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x8a,0xd3,0x69,0x00,0x00,0x00]
+v_mbcnt_hi_u32_b32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0xfe,0x00,0x00]
 
-v_log_legacy_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x8a,0xd3,0x6a,0x00,0x00,0x00]
+v_mbcnt_hi_u32_b32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0x00,0x01,0x00]
 
-v_log_legacy_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x8a,0xd3,0x6b,0x00,0x00,0x00]
+v_mbcnt_hi_u32_b32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0x82,0x01,0x00]
 
-v_log_legacy_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x8a,0xd3,0x6c,0x00,0x00,0x00]
+v_mbcnt_hi_u32_b32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0xe0,0x01,0x00]
 
-v_log_legacy_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x8a,0xd3,0x6d,0x00,0x00,0x00]
+v_mbcnt_hi_u32_b32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0xee,0x01,0x00]
 
-v_log_legacy_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x8a,0xd3,0x6e,0x00,0x00,0x00]
+v_mbcnt_hi_u32_b32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0x04,0x02,0x00]
 
-v_log_legacy_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x8a,0xd3,0x6f,0x00,0x00,0x00]
+v_mbcnt_hi_u32_b32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x48,0xd2,0x80,0xfe,0x03,0x00]
 
-v_log_legacy_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x8a,0xd3,0x7b,0x00,0x00,0x00]
+v_add_i32 v5, vcc, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x4a]
 
-v_log_legacy_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x8a,0xd3,0x7c,0x00,0x00,0x00]
+v_add_i32 v255, vcc, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x4b]
 
-v_log_legacy_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x8a,0xd3,0x7e,0x00,0x00,0x00]
+v_add_i32 v5, vcc, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x4a]
 
-v_log_legacy_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x8a,0xd3,0x7f,0x00,0x00,0x00]
+v_add_i32 v5, vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x4a]
 
-v_log_legacy_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x8a,0xd3,0xfd,0x00,0x00,0x00]
+v_add_i32 v5, vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x4a]
 
-v_log_legacy_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x8a,0xd3,0x00,0x01,0x00,0x00]
+v_add_i32 v5, vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x4a]
 
-v_log_legacy_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x8a,0xd3,0xff,0x01,0x00,0x00]
+v_add_i32 v5, vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x4a]
 
-v_log_legacy_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x8a,0xd3,0x00,0x00,0x00,0x20]
+v_add_i32 v5, vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x4a]
 
-v_log_legacy_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x8a,0xd3,0x00,0x00,0x00,0x00]
+v_add_i32 v5, vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x4a]
 
-v_log_legacy_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x08,0x8a,0xd3,0x00,0x00,0x00,0x00]
+v_add_i32 v5, vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x4a]
 
-v_log_legacy_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x8a,0xd3,0x00,0x00,0x00,0x08]
+v_add_i32 v5, vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x4a]
 
-v_log_legacy_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x8a,0xd3,0x00,0x00,0x00,0x10]
+v_add_i32 v5, vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x4a]
 
-v_log_legacy_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x8a,0xd3,0x00,0x00,0x00,0x18]
+v_add_i32 v5, vcc, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x4a]
 
-v_exp_legacy_f32 v0, s0
-// CHECK: [0x00,0x8c,0x00,0x7e]
+v_add_i32 v5, vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x4a]
 
-v_exp_legacy_f32 v255, s0
-// CHECK: [0x00,0x8c,0xfe,0x7f]
+v_add_i32 v5, vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x4a]
 
-v_exp_legacy_f32 v0, s103
-// CHECK: [0x67,0x8c,0x00,0x7e]
+v_add_i32 v5, vcc, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x4a]
 
-v_exp_legacy_f32 v0, flat_scratch_lo
-// CHECK: [0x68,0x8c,0x00,0x7e]
+v_add_i32 v5, vcc, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x4a]
 
-v_exp_legacy_f32 v0, flat_scratch_hi
-// CHECK: [0x69,0x8c,0x00,0x7e]
+v_add_i32 v5, vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x4a]
 
-v_exp_legacy_f32 v0, vcc_lo
-// CHECK: [0x6a,0x8c,0x00,0x7e]
+v_add_i32 v5, vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x4a]
 
-v_exp_legacy_f32 v0, vcc_hi
-// CHECK: [0x6b,0x8c,0x00,0x7e]
+v_add_i32 v5, vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x4a,0x56,0x34,0x12,0xaf]
 
-v_exp_legacy_f32 v0, tba_lo
-// CHECK: [0x6c,0x8c,0x00,0x7e]
+v_add_i32 v5, vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x4a,0x73,0x72,0x71,0x3f]
 
-v_exp_legacy_f32 v0, tba_hi
-// CHECK: [0x6d,0x8c,0x00,0x7e]
+v_add_i32 v5, vcc, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x4a]
 
-v_exp_legacy_f32 v0, tma_lo
-// CHECK: [0x6e,0x8c,0x00,0x7e]
+v_add_i32 v5, vcc, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x4a]
 
-v_exp_legacy_f32 v0, tma_hi
-// CHECK: [0x6f,0x8c,0x00,0x7e]
+v_add_i32 v5, vcc, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x4a]
 
-v_exp_legacy_f32 v0, ttmp11
-// CHECK: [0x7b,0x8c,0x00,0x7e]
+v_add_i32_e64 v5, s[12:13], 0, s2
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0x04,0x00,0x00]
 
-v_exp_legacy_f32 v0, m0
-// CHECK: [0x7c,0x8c,0x00,0x7e]
+v_add_i32_e64 v255, s[12:13], 0, s2
+// CHECK: [0xff,0x0c,0x4a,0xd2,0x80,0x04,0x00,0x00]
 
-v_exp_legacy_f32 v0, exec_lo
-// CHECK: [0x7e,0x8c,0x00,0x7e]
+v_add_i32_e64 v5, s[14:15], 0, s2
+// CHECK: [0x05,0x0e,0x4a,0xd2,0x80,0x04,0x00,0x00]
 
-v_exp_legacy_f32 v0, exec_hi
-// CHECK: [0x7f,0x8c,0x00,0x7e]
+v_add_i32_e64 v5, s[102:103], 0, s2
+// CHECK: [0x05,0x66,0x4a,0xd2,0x80,0x04,0x00,0x00]
 
-v_exp_legacy_f32 v0, 0
-// CHECK: [0x80,0x8c,0x00,0x7e]
+v_add_i32_e64 v5, flat_scratch, 0, s2
+// CHECK: [0x05,0x68,0x4a,0xd2,0x80,0x04,0x00,0x00]
 
-v_exp_legacy_f32 v0, -1
-// CHECK: [0xc1,0x8c,0x00,0x7e]
+v_add_i32_e64 v5, vcc, 0, s2
+// CHECK: [0x05,0x6a,0x4a,0xd2,0x80,0x04,0x00,0x00]
 
-v_exp_legacy_f32 v0, 0.5
-// CHECK: [0xf0,0x8c,0x00,0x7e]
+v_add_i32_e64 v5, tba, 0, s2
+// CHECK: [0x05,0x6c,0x4a,0xd2,0x80,0x04,0x00,0x00]
 
-v_exp_legacy_f32 v0, -4.0
-// CHECK: [0xf7,0x8c,0x00,0x7e]
+v_add_i32_e64 v5, tma, 0, s2
+// CHECK: [0x05,0x6e,0x4a,0xd2,0x80,0x04,0x00,0x00]
 
-v_exp_legacy_f32 v0, 0xaf123456
-// CHECK: [0xff,0x8c,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_add_i32_e64 v5, ttmp[10:11], 0, s2
+// CHECK: [0x05,0x7a,0x4a,0xd2,0x80,0x04,0x00,0x00]
 
-v_exp_legacy_f32 v0, 0x3f717273
-// CHECK: [0xff,0x8c,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_add_i32_e64 v5, s[12:13], -1, s2
+// CHECK: [0x05,0x0c,0x4a,0xd2,0xc1,0x04,0x00,0x00]
 
-v_exp_legacy_f32 v0, v0
-// CHECK: [0x00,0x8d,0x00,0x7e]
+v_add_i32_e64 v5, s[12:13], 0.5, s2
+// CHECK: [0x05,0x0c,0x4a,0xd2,0xf0,0x04,0x00,0x00]
 
-v_exp_legacy_f32 v0, v255
-// CHECK: [0xff,0x8d,0x00,0x7e]
+v_add_i32_e64 v5, s[12:13], -4.0, s2
+// CHECK: [0x05,0x0c,0x4a,0xd2,0xf7,0x04,0x00,0x00]
 
-v_exp_legacy_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x8c,0xd3,0x00,0x00,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], v1, s2
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x01,0x05,0x00,0x00]
 
-v_exp_legacy_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x8c,0xd3,0x00,0x00,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], v255, s2
+// CHECK: [0x05,0x0c,0x4a,0xd2,0xff,0x05,0x00,0x00]
 
-v_exp_legacy_f32_e64 v0, s103
-// CHECK: [0x00,0x00,0x8c,0xd3,0x67,0x00,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], 0, s103
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0xce,0x00,0x00]
 
-v_exp_legacy_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x8c,0xd3,0x68,0x00,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], 0, flat_scratch_lo
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0xd0,0x00,0x00]
 
-v_exp_legacy_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x8c,0xd3,0x69,0x00,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], 0, flat_scratch_hi
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0xd2,0x00,0x00]
 
-v_exp_legacy_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x8c,0xd3,0x6a,0x00,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], 0, vcc_lo
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0xd4,0x00,0x00]
 
-v_exp_legacy_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x8c,0xd3,0x6b,0x00,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], 0, vcc_hi
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0xd6,0x00,0x00]
 
-v_exp_legacy_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x8c,0xd3,0x6c,0x00,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], 0, tba_lo
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0xd8,0x00,0x00]
 
-v_exp_legacy_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x8c,0xd3,0x6d,0x00,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], 0, tba_hi
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0xda,0x00,0x00]
 
-v_exp_legacy_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x8c,0xd3,0x6e,0x00,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], 0, tma_lo
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0xdc,0x00,0x00]
 
-v_exp_legacy_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x8c,0xd3,0x6f,0x00,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], 0, tma_hi
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0xde,0x00,0x00]
 
-v_exp_legacy_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x8c,0xd3,0x7b,0x00,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], 0, ttmp11
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0xf6,0x00,0x00]
 
-v_exp_legacy_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x8c,0xd3,0x7c,0x00,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], 0, m0
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0xf8,0x00,0x00]
 
-v_exp_legacy_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x8c,0xd3,0x7e,0x00,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], 0, exec_lo
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0xfc,0x00,0x00]
 
-v_exp_legacy_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x8c,0xd3,0x7f,0x00,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], 0, exec_hi
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0xfe,0x00,0x00]
 
-v_exp_legacy_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x8c,0xd3,0xfd,0x00,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], 0, 0
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0x00,0x01,0x00]
 
-v_exp_legacy_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x8c,0xd3,0x00,0x01,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], 0, -1
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0x82,0x01,0x00]
 
-v_exp_legacy_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x8c,0xd3,0xff,0x01,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], 0, 0.5
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0xe0,0x01,0x00]
 
-v_exp_legacy_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x8c,0xd3,0x00,0x00,0x00,0x20]
+v_add_i32_e64 v5, s[12:13], 0, -4.0
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0xee,0x01,0x00]
 
-v_exp_legacy_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x8c,0xd3,0x00,0x00,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], 0, v2
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0x04,0x02,0x00]
 
-v_exp_legacy_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x08,0x8c,0xd3,0x00,0x00,0x00,0x00]
+v_add_i32_e64 v5, s[12:13], 0, v255
+// CHECK: [0x05,0x0c,0x4a,0xd2,0x80,0xfe,0x03,0x00]
 
-v_exp_legacy_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x8c,0xd3,0x00,0x00,0x00,0x08]
+v_sub_i32 v5, vcc, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x4c]
 
-v_exp_legacy_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x8c,0xd3,0x00,0x00,0x00,0x10]
+v_sub_i32 v255, vcc, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x4d]
 
-v_exp_legacy_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x8c,0xd3,0x00,0x00,0x00,0x18]
+v_sub_i32 v5, vcc, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x4c]
 
-v_cndmask_b32 v0, vcc_lo, v0, vcc
-// CHECK: [0x6a,0x00,0x00,0x00]
+v_sub_i32 v5, vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x4c]
 
-v_cndmask_b32 v255, vcc_lo, v0, vcc
-// CHECK: [0x6a,0x00,0xfe,0x01]
+v_sub_i32 v5, vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x4c]
 
-v_cndmask_b32 v0, vcc_hi, v0, vcc
-// CHECK: [0x6b,0x00,0x00,0x00]
+v_sub_i32 v5, vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x4c]
 
-v_cndmask_b32 v0, 0, v0, vcc
-// CHECK: [0x80,0x00,0x00,0x00]
+v_sub_i32 v5, vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x4c]
 
-v_cndmask_b32 v0, -1, v0, vcc
-// CHECK: [0xc1,0x00,0x00,0x00]
+v_sub_i32 v5, vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x4c]
 
-v_cndmask_b32 v0, 0.5, v0, vcc
-// CHECK: [0xf0,0x00,0x00,0x00]
+v_sub_i32 v5, vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x4c]
 
-v_cndmask_b32 v0, -4.0, v0, vcc
-// CHECK: [0xf7,0x00,0x00,0x00]
+v_sub_i32 v5, vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x4c]
 
-v_cndmask_b32 v0, v0, v0, vcc
-// CHECK: [0x00,0x01,0x00,0x00]
+v_sub_i32 v5, vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x4c]
 
-v_cndmask_b32 v0, v255, v0, vcc
-// CHECK: [0xff,0x01,0x00,0x00]
+v_sub_i32 v5, vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x4c]
 
-v_cndmask_b32 v0, vcc_lo, v255, vcc
-// CHECK: [0x6a,0xfe,0x01,0x00]
+v_sub_i32 v5, vcc, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x4c]
 
-v_cndmask_b32_e64 v0, s0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd2,0x00,0x00,0x00,0x00]
+v_sub_i32 v5, vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x4c]
 
-v_cndmask_b32_e64 v255, s0, s0, s[0:1]
-// CHECK: [0xff,0x00,0x00,0xd2,0x00,0x00,0x00,0x00]
+v_sub_i32 v5, vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x4c]
 
-v_cndmask_b32_e64 v0, 0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd2,0x80,0x00,0x00,0x00]
+v_sub_i32 v5, vcc, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x4c]
 
-v_cndmask_b32_e64 v0, -1, s0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd2,0xc1,0x00,0x00,0x00]
+v_sub_i32 v5, vcc, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x4c]
 
-v_cndmask_b32_e64 v0, 0.5, s0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd2,0xf0,0x00,0x00,0x00]
+v_sub_i32 v5, vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x4c]
 
-v_cndmask_b32_e64 v0, -4.0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd2,0xf7,0x00,0x00,0x00]
+v_sub_i32 v5, vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x4c]
 
-v_cndmask_b32_e64 v0, v0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd2,0x00,0x01,0x00,0x00]
+v_sub_i32 v5, vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x4c,0x56,0x34,0x12,0xaf]
 
-v_cndmask_b32_e64 v0, v255, s0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd2,0xff,0x01,0x00,0x00]
+v_sub_i32 v5, vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x4c,0x73,0x72,0x71,0x3f]
 
-v_cndmask_b32_e64 v0, s0, 0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd2,0x00,0x00,0x01,0x00]
+v_sub_i32 v5, vcc, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x4c]
 
-v_cndmask_b32_e64 v0, s0, -1, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd2,0x00,0x82,0x01,0x00]
+v_sub_i32 v5, vcc, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x4c]
 
-v_cndmask_b32_e64 v0, s0, 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd2,0x00,0xe0,0x01,0x00]
+v_sub_i32 v5, vcc, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x4c]
 
-v_cndmask_b32_e64 v0, s0, -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd2,0x00,0xee,0x01,0x00]
+v_sub_i32_e64 v5, s[12:13], 0, s2
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0x04,0x00,0x00]
 
-v_cndmask_b32_e64 v0, s0, v0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd2,0x00,0x00,0x02,0x00]
+v_sub_i32_e64 v255, s[12:13], 0, s2
+// CHECK: [0xff,0x0c,0x4c,0xd2,0x80,0x04,0x00,0x00]
 
-v_cndmask_b32_e64 v0, s0, v255, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd2,0x00,0xfe,0x03,0x00]
+v_sub_i32_e64 v5, s[14:15], 0, s2
+// CHECK: [0x05,0x0e,0x4c,0xd2,0x80,0x04,0x00,0x00]
 
-v_readlane_b32 s0, v0, s0
-// CHECK: [0x00,0x01,0x00,0x02]
+v_sub_i32_e64 v5, s[102:103], 0, s2
+// CHECK: [0x05,0x66,0x4c,0xd2,0x80,0x04,0x00,0x00]
 
-v_readlane_b32 s103, v0, s0
-// CHECK: [0x00,0x01,0xce,0x02]
+v_sub_i32_e64 v5, flat_scratch, 0, s2
+// CHECK: [0x05,0x68,0x4c,0xd2,0x80,0x04,0x00,0x00]
 
-v_readlane_b32 tba_lo, v0, s0
-// CHECK: [0x00,0x01,0xd8,0x02]
+v_sub_i32_e64 v5, vcc, 0, s2
+// CHECK: [0x05,0x6a,0x4c,0xd2,0x80,0x04,0x00,0x00]
 
-v_readlane_b32 tba_hi, v0, s0
-// CHECK: [0x00,0x01,0xda,0x02]
+v_sub_i32_e64 v5, tba, 0, s2
+// CHECK: [0x05,0x6c,0x4c,0xd2,0x80,0x04,0x00,0x00]
 
-v_readlane_b32 tma_lo, v0, s0
-// CHECK: [0x00,0x01,0xdc,0x02]
+v_sub_i32_e64 v5, tma, 0, s2
+// CHECK: [0x05,0x6e,0x4c,0xd2,0x80,0x04,0x00,0x00]
 
-v_readlane_b32 tma_hi, v0, s0
-// CHECK: [0x00,0x01,0xde,0x02]
+v_sub_i32_e64 v5, ttmp[10:11], 0, s2
+// CHECK: [0x05,0x7a,0x4c,0xd2,0x80,0x04,0x00,0x00]
 
-v_readlane_b32 ttmp11, v0, s0
-// CHECK: [0x00,0x01,0xf6,0x02]
+v_sub_i32_e64 v5, s[12:13], -1, s2
+// CHECK: [0x05,0x0c,0x4c,0xd2,0xc1,0x04,0x00,0x00]
 
-v_readlane_b32 s0, v255, s0
-// CHECK: [0xff,0x01,0x00,0x02]
+v_sub_i32_e64 v5, s[12:13], 0.5, s2
+// CHECK: [0x05,0x0c,0x4c,0xd2,0xf0,0x04,0x00,0x00]
 
-v_readlane_b32 s0, v0, s103
-// CHECK: [0x00,0xcf,0x00,0x02]
+v_sub_i32_e64 v5, s[12:13], -4.0, s2
+// CHECK: [0x05,0x0c,0x4c,0xd2,0xf7,0x04,0x00,0x00]
 
-v_readlane_b32 s0, v0, flat_scratch_lo
-// CHECK: [0x00,0xd1,0x00,0x02]
+v_sub_i32_e64 v5, s[12:13], v1, s2
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x01,0x05,0x00,0x00]
 
-v_readlane_b32 s0, v0, flat_scratch_hi
-// CHECK: [0x00,0xd3,0x00,0x02]
+v_sub_i32_e64 v5, s[12:13], v255, s2
+// CHECK: [0x05,0x0c,0x4c,0xd2,0xff,0x05,0x00,0x00]
 
-v_readlane_b32 s0, v0, vcc_lo
-// CHECK: [0x00,0xd5,0x00,0x02]
+v_sub_i32_e64 v5, s[12:13], 0, s103
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0xce,0x00,0x00]
 
-v_readlane_b32 s0, v0, vcc_hi
-// CHECK: [0x00,0xd7,0x00,0x02]
+v_sub_i32_e64 v5, s[12:13], 0, flat_scratch_lo
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0xd0,0x00,0x00]
 
-v_readlane_b32 s0, v0, tba_lo
-// CHECK: [0x00,0xd9,0x00,0x02]
+v_sub_i32_e64 v5, s[12:13], 0, flat_scratch_hi
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0xd2,0x00,0x00]
 
-v_readlane_b32 s0, v0, tba_hi
-// CHECK: [0x00,0xdb,0x00,0x02]
+v_sub_i32_e64 v5, s[12:13], 0, vcc_lo
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0xd4,0x00,0x00]
 
-v_readlane_b32 s0, v0, tma_lo
-// CHECK: [0x00,0xdd,0x00,0x02]
+v_sub_i32_e64 v5, s[12:13], 0, vcc_hi
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0xd6,0x00,0x00]
 
-v_readlane_b32 s0, v0, tma_hi
-// CHECK: [0x00,0xdf,0x00,0x02]
+v_sub_i32_e64 v5, s[12:13], 0, tba_lo
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0xd8,0x00,0x00]
 
-v_readlane_b32 s0, v0, ttmp11
-// CHECK: [0x00,0xf7,0x00,0x02]
+v_sub_i32_e64 v5, s[12:13], 0, tba_hi
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0xda,0x00,0x00]
 
-v_readlane_b32 s0, v0, m0
-// CHECK: [0x00,0xf9,0x00,0x02]
+v_sub_i32_e64 v5, s[12:13], 0, tma_lo
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0xdc,0x00,0x00]
 
-v_readlane_b32 s0, v0, 0
-// CHECK: [0x00,0x01,0x01,0x02]
+v_sub_i32_e64 v5, s[12:13], 0, tma_hi
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0xde,0x00,0x00]
 
-v_writelane_b32 v0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x04]
+v_sub_i32_e64 v5, s[12:13], 0, ttmp11
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0xf6,0x00,0x00]
 
-v_writelane_b32 v255, s0, s0
-// CHECK: [0x00,0x00,0xfe,0x05]
+v_sub_i32_e64 v5, s[12:13], 0, m0
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0xf8,0x00,0x00]
 
-v_writelane_b32 v0, s0, 0
-// CHECK: [0x00,0x00,0x01,0x04]
+v_sub_i32_e64 v5, s[12:13], 0, exec_lo
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0xfc,0x00,0x00]
 
-v_add_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x06]
+v_sub_i32_e64 v5, s[12:13], 0, exec_hi
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0xfe,0x00,0x00]
 
-v_add_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x07]
+v_sub_i32_e64 v5, s[12:13], 0, 0
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0x00,0x01,0x00]
 
-v_add_f32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x06]
+v_sub_i32_e64 v5, s[12:13], 0, -1
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0x82,0x01,0x00]
 
-v_add_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x06]
+v_sub_i32_e64 v5, s[12:13], 0, 0.5
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0xe0,0x01,0x00]
 
-v_add_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x06]
+v_sub_i32_e64 v5, s[12:13], 0, -4.0
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0xee,0x01,0x00]
 
-v_add_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x06]
+v_sub_i32_e64 v5, s[12:13], 0, v2
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0x04,0x02,0x00]
 
-v_add_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x06]
+v_sub_i32_e64 v5, s[12:13], 0, v255
+// CHECK: [0x05,0x0c,0x4c,0xd2,0x80,0xfe,0x03,0x00]
 
-v_add_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x06]
+v_subrev_i32 v5, vcc, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x4e]
 
-v_add_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x06]
+v_subrev_i32 v255, vcc, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x4f]
 
-v_add_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x06]
+v_subrev_i32 v5, vcc, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x4e]
 
-v_add_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x06]
+v_subrev_i32 v5, vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x4e]
 
-v_add_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x06]
+v_subrev_i32 v5, vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x4e]
 
-v_add_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x06]
+v_subrev_i32 v5, vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x4e]
 
-v_add_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x06]
+v_subrev_i32 v5, vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x4e]
 
-v_add_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x06]
+v_subrev_i32 v5, vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x4e]
 
-v_add_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x06]
+v_subrev_i32 v5, vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x4e]
 
-v_add_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x06]
+v_subrev_i32 v5, vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x4e]
 
-v_add_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x06]
+v_subrev_i32 v5, vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x4e]
 
-v_add_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x06]
+v_subrev_i32 v5, vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x4e]
 
-v_add_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x06,0x56,0x34,0x12,0xaf]
+v_subrev_i32 v5, vcc, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x4e]
 
-v_add_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x06,0x73,0x72,0x71,0x3f]
+v_subrev_i32 v5, vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x4e]
 
-v_add_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x06]
+v_subrev_i32 v5, vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x4e]
 
-v_add_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x06]
+v_subrev_i32 v5, vcc, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x4e]
 
-v_add_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x06]
+v_subrev_i32 v5, vcc, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x4e]
 
-v_add_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x06,0xd2,0x00,0x00,0x00,0x00]
+v_subrev_i32 v5, vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x4e]
 
-v_add_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x06,0xd2,0x00,0x00,0x00,0x00]
+v_subrev_i32 v5, vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x4e]
 
-v_add_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x06,0xd2,0xfd,0x00,0x00,0x00]
+v_subrev_i32 v5, vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x4e,0x56,0x34,0x12,0xaf]
 
-v_add_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x06,0xd2,0x00,0x01,0x00,0x00]
+v_subrev_i32 v5, vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x4e,0x73,0x72,0x71,0x3f]
 
-v_add_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x06,0xd2,0xff,0x01,0x00,0x00]
+v_subrev_i32 v5, vcc, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x4e]
 
-v_add_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x06,0xd2,0x00,0xfa,0x01,0x00]
+v_subrev_i32 v5, vcc, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x4e]
 
-v_add_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x06,0xd2,0x00,0x00,0x02,0x00]
+v_subrev_i32 v5, vcc, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x4e]
 
-v_add_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x06,0xd2,0x00,0xfe,0x03,0x00]
+v_subrev_i32_e64 v5, s[12:13], 0, s2
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0x04,0x00,0x00]
 
-v_add_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x06,0xd2,0x00,0x00,0x00,0x20]
+v_subrev_i32_e64 v255, s[12:13], 0, s2
+// CHECK: [0xff,0x0c,0x4e,0xd2,0x80,0x04,0x00,0x00]
 
-v_add_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x06,0xd2,0x00,0x00,0x00,0x40]
+v_subrev_i32_e64 v5, s[14:15], 0, s2
+// CHECK: [0x05,0x0e,0x4e,0xd2,0x80,0x04,0x00,0x00]
 
-v_add_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x06,0xd2,0x00,0x00,0x00,0x60]
+v_subrev_i32_e64 v5, s[102:103], 0, s2
+// CHECK: [0x05,0x66,0x4e,0xd2,0x80,0x04,0x00,0x00]
 
-v_add_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x06,0xd2,0x00,0x00,0x00,0x00]
+v_subrev_i32_e64 v5, flat_scratch, 0, s2
+// CHECK: [0x05,0x68,0x4e,0xd2,0x80,0x04,0x00,0x00]
 
-v_add_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x06,0xd2,0x00,0x00,0x00,0x00]
+v_subrev_i32_e64 v5, vcc, 0, s2
+// CHECK: [0x05,0x6a,0x4e,0xd2,0x80,0x04,0x00,0x00]
 
-v_add_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x06,0xd2,0x00,0x00,0x00,0x00]
+v_subrev_i32_e64 v5, tba, 0, s2
+// CHECK: [0x05,0x6c,0x4e,0xd2,0x80,0x04,0x00,0x00]
 
-v_add_f32_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x08,0x06,0xd2,0x00,0x00,0x00,0x00]
+v_subrev_i32_e64 v5, tma, 0, s2
+// CHECK: [0x05,0x6e,0x4e,0xd2,0x80,0x04,0x00,0x00]
 
-v_add_f32_e64 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x06,0xd2,0x00,0x00,0x00,0x08]
+v_subrev_i32_e64 v5, ttmp[10:11], 0, s2
+// CHECK: [0x05,0x7a,0x4e,0xd2,0x80,0x04,0x00,0x00]
 
-v_add_f32_e64 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x06,0xd2,0x00,0x00,0x00,0x10]
+v_subrev_i32_e64 v5, s[12:13], -1, s2
+// CHECK: [0x05,0x0c,0x4e,0xd2,0xc1,0x04,0x00,0x00]
 
-v_add_f32_e64 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x06,0xd2,0x00,0x00,0x00,0x18]
+v_subrev_i32_e64 v5, s[12:13], 0.5, s2
+// CHECK: [0x05,0x0c,0x4e,0xd2,0xf0,0x04,0x00,0x00]
 
-v_sub_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x08]
+v_subrev_i32_e64 v5, s[12:13], -4.0, s2
+// CHECK: [0x05,0x0c,0x4e,0xd2,0xf7,0x04,0x00,0x00]
 
-v_sub_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x09]
+v_subrev_i32_e64 v5, s[12:13], v1, s2
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x01,0x05,0x00,0x00]
 
-v_sub_f32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x08]
+v_subrev_i32_e64 v5, s[12:13], v255, s2
+// CHECK: [0x05,0x0c,0x4e,0xd2,0xff,0x05,0x00,0x00]
 
-v_sub_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x08]
+v_subrev_i32_e64 v5, s[12:13], 0, s103
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0xce,0x00,0x00]
 
-v_sub_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x08]
+v_subrev_i32_e64 v5, s[12:13], 0, flat_scratch_lo
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0xd0,0x00,0x00]
 
-v_sub_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x08]
+v_subrev_i32_e64 v5, s[12:13], 0, flat_scratch_hi
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0xd2,0x00,0x00]
 
-v_sub_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x08]
+v_subrev_i32_e64 v5, s[12:13], 0, vcc_lo
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0xd4,0x00,0x00]
 
-v_sub_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x08]
+v_subrev_i32_e64 v5, s[12:13], 0, vcc_hi
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0xd6,0x00,0x00]
 
-v_sub_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x08]
+v_subrev_i32_e64 v5, s[12:13], 0, tba_lo
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0xd8,0x00,0x00]
 
-v_sub_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x08]
+v_subrev_i32_e64 v5, s[12:13], 0, tba_hi
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0xda,0x00,0x00]
 
-v_sub_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x08]
+v_subrev_i32_e64 v5, s[12:13], 0, tma_lo
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0xdc,0x00,0x00]
 
-v_sub_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x08]
+v_subrev_i32_e64 v5, s[12:13], 0, tma_hi
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0xde,0x00,0x00]
 
-v_sub_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x08]
+v_subrev_i32_e64 v5, s[12:13], 0, ttmp11
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0xf6,0x00,0x00]
 
-v_sub_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x08]
+v_subrev_i32_e64 v5, s[12:13], 0, m0
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0xf8,0x00,0x00]
 
-v_sub_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x08]
+v_subrev_i32_e64 v5, s[12:13], 0, exec_lo
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0xfc,0x00,0x00]
 
-v_sub_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x08]
+v_subrev_i32_e64 v5, s[12:13], 0, exec_hi
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0xfe,0x00,0x00]
 
-v_sub_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x08]
+v_subrev_i32_e64 v5, s[12:13], 0, 0
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0x00,0x01,0x00]
 
-v_sub_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x08]
+v_subrev_i32_e64 v5, s[12:13], 0, -1
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0x82,0x01,0x00]
 
-v_sub_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x08]
+v_subrev_i32_e64 v5, s[12:13], 0, 0.5
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0xe0,0x01,0x00]
 
-v_sub_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x08,0x56,0x34,0x12,0xaf]
+v_subrev_i32_e64 v5, s[12:13], 0, -4.0
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0xee,0x01,0x00]
 
-v_sub_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x08,0x73,0x72,0x71,0x3f]
+v_subrev_i32_e64 v5, s[12:13], 0, v2
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0x04,0x02,0x00]
 
-v_sub_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x08]
+v_subrev_i32_e64 v5, s[12:13], 0, v255
+// CHECK: [0x05,0x0c,0x4e,0xd2,0x80,0xfe,0x03,0x00]
 
-v_sub_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x08]
+v_addc_u32 v5, vcc, 0, v2, vcc
+// CHECK: [0x80,0x04,0x0a,0x50]
 
-v_sub_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x08]
+v_addc_u32 v255, vcc, 0, v2, vcc
+// CHECK: [0x80,0x04,0xfe,0x51]
 
-v_sub_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x08,0xd2,0x00,0x00,0x00,0x00]
+v_addc_u32 v5, vcc, -1, v2, vcc
+// CHECK: [0xc1,0x04,0x0a,0x50]
 
-v_sub_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x08,0xd2,0x00,0x00,0x00,0x00]
+v_addc_u32 v5, vcc, 0.5, v2, vcc
+// CHECK: [0xf0,0x04,0x0a,0x50]
 
-v_sub_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x08,0xd2,0xfd,0x00,0x00,0x00]
+v_addc_u32 v5, vcc, -4.0, v2, vcc
+// CHECK: [0xf7,0x04,0x0a,0x50]
 
-v_sub_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x08,0xd2,0x00,0x01,0x00,0x00]
+v_addc_u32 v5, vcc, v1, v2, vcc
+// CHECK: [0x01,0x05,0x0a,0x50]
 
-v_sub_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x08,0xd2,0xff,0x01,0x00,0x00]
+v_addc_u32 v5, vcc, v255, v2, vcc
+// CHECK: [0xff,0x05,0x0a,0x50]
 
-v_sub_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x08,0xd2,0x00,0xfa,0x01,0x00]
+v_addc_u32 v5, vcc, 0, v255, vcc
+// CHECK: [0x80,0xfe,0x0b,0x50]
 
-v_sub_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x08,0xd2,0x00,0x00,0x02,0x00]
+v_addc_u32_e64 v5, s[12:13], 0, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x50,0xd2,0x80,0x00,0x19,0x00]
 
-v_sub_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x08,0xd2,0x00,0xfe,0x03,0x00]
+v_addc_u32_e64 v255, s[12:13], 0, 0, s[6:7]
+// CHECK: [0xff,0x0c,0x50,0xd2,0x80,0x00,0x19,0x00]
 
-v_sub_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x08,0xd2,0x00,0x00,0x00,0x20]
+v_addc_u32_e64 v5, s[14:15], 0, 0, s[6:7]
+// CHECK: [0x05,0x0e,0x50,0xd2,0x80,0x00,0x19,0x00]
 
-v_sub_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x08,0xd2,0x00,0x00,0x00,0x40]
+v_addc_u32_e64 v5, s[102:103], 0, 0, s[6:7]
+// CHECK: [0x05,0x66,0x50,0xd2,0x80,0x00,0x19,0x00]
 
-v_sub_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x08,0xd2,0x00,0x00,0x00,0x60]
+v_addc_u32_e64 v5, flat_scratch, 0, 0, s[6:7]
+// CHECK: [0x05,0x68,0x50,0xd2,0x80,0x00,0x19,0x00]
 
-v_sub_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x08,0xd2,0x00,0x00,0x00,0x00]
+v_addc_u32_e64 v5, vcc, 0, 0, s[6:7]
+// CHECK: [0x05,0x6a,0x50,0xd2,0x80,0x00,0x19,0x00]
 
-v_sub_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x08,0xd2,0x00,0x00,0x00,0x00]
+v_addc_u32_e64 v5, tba, 0, 0, s[6:7]
+// CHECK: [0x05,0x6c,0x50,0xd2,0x80,0x00,0x19,0x00]
 
-v_sub_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x08,0xd2,0x00,0x00,0x00,0x00]
+v_addc_u32_e64 v5, tma, 0, 0, s[6:7]
+// CHECK: [0x05,0x6e,0x50,0xd2,0x80,0x00,0x19,0x00]
 
-v_sub_f32_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x08,0x08,0xd2,0x00,0x00,0x00,0x00]
+v_addc_u32_e64 v5, ttmp[10:11], 0, 0, s[6:7]
+// CHECK: [0x05,0x7a,0x50,0xd2,0x80,0x00,0x19,0x00]
 
-v_sub_f32_e64 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x08,0xd2,0x00,0x00,0x00,0x08]
+v_addc_u32_e64 v5, s[12:13], -1, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x50,0xd2,0xc1,0x00,0x19,0x00]
 
-v_sub_f32_e64 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x08,0xd2,0x00,0x00,0x00,0x10]
+v_addc_u32_e64 v5, s[12:13], 0.5, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x50,0xd2,0xf0,0x00,0x19,0x00]
 
-v_sub_f32_e64 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x08,0xd2,0x00,0x00,0x00,0x18]
+v_addc_u32_e64 v5, s[12:13], -4.0, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x50,0xd2,0xf7,0x00,0x19,0x00]
 
-v_subrev_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x0a]
+v_addc_u32_e64 v5, s[12:13], v1, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x50,0xd2,0x01,0x01,0x19,0x00]
 
-v_subrev_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x0b]
+v_addc_u32_e64 v5, s[12:13], v255, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x50,0xd2,0xff,0x01,0x19,0x00]
 
-v_subrev_f32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x0a]
+v_addc_u32_e64 v5, s[12:13], 0, -1, s[6:7]
+// CHECK: [0x05,0x0c,0x50,0xd2,0x80,0x82,0x19,0x00]
 
-v_subrev_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x0a]
+v_addc_u32_e64 v5, s[12:13], 0, 0.5, s[6:7]
+// CHECK: [0x05,0x0c,0x50,0xd2,0x80,0xe0,0x19,0x00]
 
-v_subrev_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x0a]
+v_addc_u32_e64 v5, s[12:13], 0, -4.0, s[6:7]
+// CHECK: [0x05,0x0c,0x50,0xd2,0x80,0xee,0x19,0x00]
 
-v_subrev_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x0a]
+v_addc_u32_e64 v5, s[12:13], 0, v2, s[6:7]
+// CHECK: [0x05,0x0c,0x50,0xd2,0x80,0x04,0x1a,0x00]
 
-v_subrev_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x0a]
+v_addc_u32_e64 v5, s[12:13], 0, v255, s[6:7]
+// CHECK: [0x05,0x0c,0x50,0xd2,0x80,0xfe,0x1b,0x00]
 
-v_subrev_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x0a]
+v_addc_u32_e64 v5, s[12:13], 0, 0, s[8:9]
+// CHECK: [0x05,0x0c,0x50,0xd2,0x80,0x00,0x21,0x00]
 
-v_subrev_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x0a]
+v_addc_u32_e64 v5, s[12:13], 0, 0, s[102:103]
+// CHECK: [0x05,0x0c,0x50,0xd2,0x80,0x00,0x99,0x01]
 
-v_subrev_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x0a]
+v_addc_u32_e64 v5, s[12:13], 0, 0, flat_scratch
+// CHECK: [0x05,0x0c,0x50,0xd2,0x80,0x00,0xa1,0x01]
 
-v_subrev_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x0a]
+v_addc_u32_e64 v5, s[12:13], 0, 0, vcc
+// CHECK: [0x05,0x0c,0x50,0xd2,0x80,0x00,0xa9,0x01]
 
-v_subrev_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x0a]
+v_addc_u32_e64 v5, s[12:13], 0, 0, tba
+// CHECK: [0x05,0x0c,0x50,0xd2,0x80,0x00,0xb1,0x01]
 
-v_subrev_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x0a]
+v_addc_u32_e64 v5, s[12:13], 0, 0, tma
+// CHECK: [0x05,0x0c,0x50,0xd2,0x80,0x00,0xb9,0x01]
 
-v_subrev_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x0a]
+v_addc_u32_e64 v5, s[12:13], 0, 0, ttmp[10:11]
+// CHECK: [0x05,0x0c,0x50,0xd2,0x80,0x00,0xe9,0x01]
 
-v_subrev_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x0a]
+v_subb_u32 v5, vcc, 0, v2, vcc
+// CHECK: [0x80,0x04,0x0a,0x52]
 
-v_subrev_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x0a]
+v_subb_u32 v255, vcc, 0, v2, vcc
+// CHECK: [0x80,0x04,0xfe,0x53]
 
-v_subrev_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x0a]
+v_subb_u32 v5, vcc, -1, v2, vcc
+// CHECK: [0xc1,0x04,0x0a,0x52]
 
-v_subrev_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x0a]
+v_subb_u32 v5, vcc, 0.5, v2, vcc
+// CHECK: [0xf0,0x04,0x0a,0x52]
 
-v_subrev_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x0a]
+v_subb_u32 v5, vcc, -4.0, v2, vcc
+// CHECK: [0xf7,0x04,0x0a,0x52]
 
-v_subrev_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x0a,0x56,0x34,0x12,0xaf]
+v_subb_u32 v5, vcc, v1, v2, vcc
+// CHECK: [0x01,0x05,0x0a,0x52]
 
-v_subrev_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x0a,0x73,0x72,0x71,0x3f]
+v_subb_u32 v5, vcc, v255, v2, vcc
+// CHECK: [0xff,0x05,0x0a,0x52]
 
-v_subrev_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x0a]
+v_subb_u32 v5, vcc, 0, v255, vcc
+// CHECK: [0x80,0xfe,0x0b,0x52]
 
-v_subrev_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x0a]
+v_subb_u32_e64 v5, s[12:13], 0, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x52,0xd2,0x80,0x00,0x19,0x00]
 
-v_subrev_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x0a]
+v_subb_u32_e64 v255, s[12:13], 0, 0, s[6:7]
+// CHECK: [0xff,0x0c,0x52,0xd2,0x80,0x00,0x19,0x00]
 
-v_subrev_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x0a,0xd2,0x00,0x00,0x00,0x00]
+v_subb_u32_e64 v5, s[14:15], 0, 0, s[6:7]
+// CHECK: [0x05,0x0e,0x52,0xd2,0x80,0x00,0x19,0x00]
 
-v_subrev_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x0a,0xd2,0x00,0x00,0x00,0x00]
+v_subb_u32_e64 v5, s[102:103], 0, 0, s[6:7]
+// CHECK: [0x05,0x66,0x52,0xd2,0x80,0x00,0x19,0x00]
 
-v_subrev_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x0a,0xd2,0xfd,0x00,0x00,0x00]
+v_subb_u32_e64 v5, flat_scratch, 0, 0, s[6:7]
+// CHECK: [0x05,0x68,0x52,0xd2,0x80,0x00,0x19,0x00]
 
-v_subrev_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x0a,0xd2,0x00,0x01,0x00,0x00]
+v_subb_u32_e64 v5, vcc, 0, 0, s[6:7]
+// CHECK: [0x05,0x6a,0x52,0xd2,0x80,0x00,0x19,0x00]
 
-v_subrev_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x0a,0xd2,0xff,0x01,0x00,0x00]
+v_subb_u32_e64 v5, tba, 0, 0, s[6:7]
+// CHECK: [0x05,0x6c,0x52,0xd2,0x80,0x00,0x19,0x00]
 
-v_subrev_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x0a,0xd2,0x00,0xfa,0x01,0x00]
+v_subb_u32_e64 v5, tma, 0, 0, s[6:7]
+// CHECK: [0x05,0x6e,0x52,0xd2,0x80,0x00,0x19,0x00]
 
-v_subrev_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x0a,0xd2,0x00,0x00,0x02,0x00]
+v_subb_u32_e64 v5, ttmp[10:11], 0, 0, s[6:7]
+// CHECK: [0x05,0x7a,0x52,0xd2,0x80,0x00,0x19,0x00]
 
-v_subrev_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x0a,0xd2,0x00,0xfe,0x03,0x00]
+v_subb_u32_e64 v5, s[12:13], -1, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x52,0xd2,0xc1,0x00,0x19,0x00]
 
-v_subrev_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x0a,0xd2,0x00,0x00,0x00,0x20]
+v_subb_u32_e64 v5, s[12:13], 0.5, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x52,0xd2,0xf0,0x00,0x19,0x00]
 
-v_subrev_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x0a,0xd2,0x00,0x00,0x00,0x40]
+v_subb_u32_e64 v5, s[12:13], -4.0, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x52,0xd2,0xf7,0x00,0x19,0x00]
 
-v_subrev_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x0a,0xd2,0x00,0x00,0x00,0x60]
+v_subb_u32_e64 v5, s[12:13], v1, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x52,0xd2,0x01,0x01,0x19,0x00]
 
-v_subrev_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x0a,0xd2,0x00,0x00,0x00,0x00]
+v_subb_u32_e64 v5, s[12:13], v255, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x52,0xd2,0xff,0x01,0x19,0x00]
 
-v_subrev_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x0a,0xd2,0x00,0x00,0x00,0x00]
+v_subb_u32_e64 v5, s[12:13], 0, -1, s[6:7]
+// CHECK: [0x05,0x0c,0x52,0xd2,0x80,0x82,0x19,0x00]
 
-v_subrev_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x0a,0xd2,0x00,0x00,0x00,0x00]
+v_subb_u32_e64 v5, s[12:13], 0, 0.5, s[6:7]
+// CHECK: [0x05,0x0c,0x52,0xd2,0x80,0xe0,0x19,0x00]
 
-v_subrev_f32_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x08,0x0a,0xd2,0x00,0x00,0x00,0x00]
+v_subb_u32_e64 v5, s[12:13], 0, -4.0, s[6:7]
+// CHECK: [0x05,0x0c,0x52,0xd2,0x80,0xee,0x19,0x00]
 
-v_subrev_f32_e64 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x0a,0xd2,0x00,0x00,0x00,0x08]
+v_subb_u32_e64 v5, s[12:13], 0, v2, s[6:7]
+// CHECK: [0x05,0x0c,0x52,0xd2,0x80,0x04,0x1a,0x00]
 
-v_subrev_f32_e64 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x0a,0xd2,0x00,0x00,0x00,0x10]
+v_subb_u32_e64 v5, s[12:13], 0, v255, s[6:7]
+// CHECK: [0x05,0x0c,0x52,0xd2,0x80,0xfe,0x1b,0x00]
 
-v_subrev_f32_e64 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x0a,0xd2,0x00,0x00,0x00,0x18]
+v_subb_u32_e64 v5, s[12:13], 0, 0, s[8:9]
+// CHECK: [0x05,0x0c,0x52,0xd2,0x80,0x00,0x21,0x00]
 
-v_mac_legacy_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x0c]
+v_subb_u32_e64 v5, s[12:13], 0, 0, s[102:103]
+// CHECK: [0x05,0x0c,0x52,0xd2,0x80,0x00,0x99,0x01]
 
-v_mac_legacy_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x0d]
+v_subb_u32_e64 v5, s[12:13], 0, 0, flat_scratch
+// CHECK: [0x05,0x0c,0x52,0xd2,0x80,0x00,0xa1,0x01]
 
-v_mac_legacy_f32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x0c]
+v_subb_u32_e64 v5, s[12:13], 0, 0, vcc
+// CHECK: [0x05,0x0c,0x52,0xd2,0x80,0x00,0xa9,0x01]
 
-v_mac_legacy_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x0c]
+v_subb_u32_e64 v5, s[12:13], 0, 0, tba
+// CHECK: [0x05,0x0c,0x52,0xd2,0x80,0x00,0xb1,0x01]
 
-v_mac_legacy_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x0c]
+v_subb_u32_e64 v5, s[12:13], 0, 0, tma
+// CHECK: [0x05,0x0c,0x52,0xd2,0x80,0x00,0xb9,0x01]
 
-v_mac_legacy_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x0c]
+v_subb_u32_e64 v5, s[12:13], 0, 0, ttmp[10:11]
+// CHECK: [0x05,0x0c,0x52,0xd2,0x80,0x00,0xe9,0x01]
 
-v_mac_legacy_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x0c]
+v_subbrev_u32 v5, vcc, 0, v2, vcc
+// CHECK: [0x80,0x04,0x0a,0x54]
 
-v_mac_legacy_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x0c]
+v_subbrev_u32 v255, vcc, 0, v2, vcc
+// CHECK: [0x80,0x04,0xfe,0x55]
 
-v_mac_legacy_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x0c]
+v_subbrev_u32 v5, vcc, -1, v2, vcc
+// CHECK: [0xc1,0x04,0x0a,0x54]
 
-v_mac_legacy_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x0c]
+v_subbrev_u32 v5, vcc, 0.5, v2, vcc
+// CHECK: [0xf0,0x04,0x0a,0x54]
 
-v_mac_legacy_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x0c]
+v_subbrev_u32 v5, vcc, -4.0, v2, vcc
+// CHECK: [0xf7,0x04,0x0a,0x54]
 
-v_mac_legacy_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x0c]
+v_subbrev_u32 v5, vcc, v1, v2, vcc
+// CHECK: [0x01,0x05,0x0a,0x54]
 
-v_mac_legacy_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x0c]
+v_subbrev_u32 v5, vcc, v255, v2, vcc
+// CHECK: [0xff,0x05,0x0a,0x54]
 
-v_mac_legacy_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x0c]
+v_subbrev_u32 v5, vcc, 0, v255, vcc
+// CHECK: [0x80,0xfe,0x0b,0x54]
 
-v_mac_legacy_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x0c]
+v_subbrev_u32_e64 v5, s[12:13], 0, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x54,0xd2,0x80,0x00,0x19,0x00]
 
-v_mac_legacy_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x0c]
+v_subbrev_u32_e64 v255, s[12:13], 0, 0, s[6:7]
+// CHECK: [0xff,0x0c,0x54,0xd2,0x80,0x00,0x19,0x00]
 
-v_mac_legacy_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x0c]
+v_subbrev_u32_e64 v5, s[14:15], 0, 0, s[6:7]
+// CHECK: [0x05,0x0e,0x54,0xd2,0x80,0x00,0x19,0x00]
 
-v_mac_legacy_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x0c]
+v_subbrev_u32_e64 v5, s[102:103], 0, 0, s[6:7]
+// CHECK: [0x05,0x66,0x54,0xd2,0x80,0x00,0x19,0x00]
 
-v_mac_legacy_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x0c]
+v_subbrev_u32_e64 v5, flat_scratch, 0, 0, s[6:7]
+// CHECK: [0x05,0x68,0x54,0xd2,0x80,0x00,0x19,0x00]
 
-v_mac_legacy_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x0c,0x56,0x34,0x12,0xaf]
+v_subbrev_u32_e64 v5, vcc, 0, 0, s[6:7]
+// CHECK: [0x05,0x6a,0x54,0xd2,0x80,0x00,0x19,0x00]
 
-v_mac_legacy_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x0c,0x73,0x72,0x71,0x3f]
+v_subbrev_u32_e64 v5, tba, 0, 0, s[6:7]
+// CHECK: [0x05,0x6c,0x54,0xd2,0x80,0x00,0x19,0x00]
 
-v_mac_legacy_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x0c]
+v_subbrev_u32_e64 v5, tma, 0, 0, s[6:7]
+// CHECK: [0x05,0x6e,0x54,0xd2,0x80,0x00,0x19,0x00]
 
-v_mac_legacy_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x0c]
+v_subbrev_u32_e64 v5, ttmp[10:11], 0, 0, s[6:7]
+// CHECK: [0x05,0x7a,0x54,0xd2,0x80,0x00,0x19,0x00]
 
-v_mac_legacy_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x0c]
+v_subbrev_u32_e64 v5, s[12:13], -1, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x54,0xd2,0xc1,0x00,0x19,0x00]
 
-v_mac_legacy_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x0c,0xd2,0x00,0x00,0x00,0x00]
+v_subbrev_u32_e64 v5, s[12:13], 0.5, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x54,0xd2,0xf0,0x00,0x19,0x00]
 
-v_mac_legacy_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x0c,0xd2,0x00,0x00,0x00,0x00]
+v_subbrev_u32_e64 v5, s[12:13], -4.0, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x54,0xd2,0xf7,0x00,0x19,0x00]
 
-v_mac_legacy_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x0c,0xd2,0xfd,0x00,0x00,0x00]
+v_subbrev_u32_e64 v5, s[12:13], v1, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x54,0xd2,0x01,0x01,0x19,0x00]
 
-v_mac_legacy_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x0c,0xd2,0x00,0x01,0x00,0x00]
+v_subbrev_u32_e64 v5, s[12:13], v255, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x54,0xd2,0xff,0x01,0x19,0x00]
 
-v_mac_legacy_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x0c,0xd2,0xff,0x01,0x00,0x00]
+v_subbrev_u32_e64 v5, s[12:13], 0, -1, s[6:7]
+// CHECK: [0x05,0x0c,0x54,0xd2,0x80,0x82,0x19,0x00]
 
-v_mac_legacy_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x0c,0xd2,0x00,0xfa,0x01,0x00]
+v_subbrev_u32_e64 v5, s[12:13], 0, 0.5, s[6:7]
+// CHECK: [0x05,0x0c,0x54,0xd2,0x80,0xe0,0x19,0x00]
 
-v_mac_legacy_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x0c,0xd2,0x00,0x00,0x02,0x00]
+v_subbrev_u32_e64 v5, s[12:13], 0, -4.0, s[6:7]
+// CHECK: [0x05,0x0c,0x54,0xd2,0x80,0xee,0x19,0x00]
 
-v_mac_legacy_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x0c,0xd2,0x00,0xfe,0x03,0x00]
+v_subbrev_u32_e64 v5, s[12:13], 0, v2, s[6:7]
+// CHECK: [0x05,0x0c,0x54,0xd2,0x80,0x04,0x1a,0x00]
 
-v_mac_legacy_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x0c,0xd2,0x00,0x00,0x00,0x20]
+v_subbrev_u32_e64 v5, s[12:13], 0, v255, s[6:7]
+// CHECK: [0x05,0x0c,0x54,0xd2,0x80,0xfe,0x1b,0x00]
 
-v_mac_legacy_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x0c,0xd2,0x00,0x00,0x00,0x40]
+v_subbrev_u32_e64 v5, s[12:13], 0, 0, s[8:9]
+// CHECK: [0x05,0x0c,0x54,0xd2,0x80,0x00,0x21,0x00]
 
-v_mac_legacy_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x0c,0xd2,0x00,0x00,0x00,0x60]
+v_subbrev_u32_e64 v5, s[12:13], 0, 0, s[102:103]
+// CHECK: [0x05,0x0c,0x54,0xd2,0x80,0x00,0x99,0x01]
 
-v_mac_legacy_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x0c,0xd2,0x00,0x00,0x00,0x00]
+v_subbrev_u32_e64 v5, s[12:13], 0, 0, flat_scratch
+// CHECK: [0x05,0x0c,0x54,0xd2,0x80,0x00,0xa1,0x01]
 
-v_mac_legacy_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x0c,0xd2,0x00,0x00,0x00,0x00]
+v_subbrev_u32_e64 v5, s[12:13], 0, 0, vcc
+// CHECK: [0x05,0x0c,0x54,0xd2,0x80,0x00,0xa9,0x01]
 
-v_mac_legacy_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x0c,0xd2,0x00,0x00,0x00,0x00]
+v_subbrev_u32_e64 v5, s[12:13], 0, 0, tba
+// CHECK: [0x05,0x0c,0x54,0xd2,0x80,0x00,0xb1,0x01]
 
-v_mac_legacy_f32_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x08,0x0c,0xd2,0x00,0x00,0x00,0x00]
+v_subbrev_u32_e64 v5, s[12:13], 0, 0, tma
+// CHECK: [0x05,0x0c,0x54,0xd2,0x80,0x00,0xb9,0x01]
 
-v_mac_legacy_f32_e64 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x0c,0xd2,0x00,0x00,0x00,0x08]
+v_subbrev_u32_e64 v5, s[12:13], 0, 0, ttmp[10:11]
+// CHECK: [0x05,0x0c,0x54,0xd2,0x80,0x00,0xe9,0x01]
 
-v_mac_legacy_f32_e64 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x0c,0xd2,0x00,0x00,0x00,0x10]
+v_ldexp_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x56]
 
-v_mac_legacy_f32_e64 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x0c,0xd2,0x00,0x00,0x00,0x18]
+v_ldexp_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x57]
 
-v_mul_legacy_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x0e]
+v_ldexp_f32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x56]
 
-v_mul_legacy_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x0f]
+v_ldexp_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x56]
 
-v_mul_legacy_f32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x0e]
+v_ldexp_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x56]
 
-v_mul_legacy_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x0e]
+v_ldexp_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x56]
 
-v_mul_legacy_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x0e]
+v_ldexp_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x56]
 
-v_mul_legacy_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x0e]
+v_ldexp_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x56]
 
-v_mul_legacy_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x0e]
+v_ldexp_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x56]
 
-v_mul_legacy_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x0e]
+v_ldexp_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x56]
 
-v_mul_legacy_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x0e]
+v_ldexp_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x56]
 
-v_mul_legacy_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x0e]
+v_ldexp_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x56]
 
-v_mul_legacy_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x0e]
+v_ldexp_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x56]
 
-v_mul_legacy_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x0e]
+v_ldexp_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x56]
 
-v_mul_legacy_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x0e]
+v_ldexp_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x56]
 
-v_mul_legacy_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x0e]
+v_ldexp_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x56]
 
-v_mul_legacy_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x0e]
+v_ldexp_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x56]
 
-v_mul_legacy_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x0e]
+v_ldexp_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x56]
 
-v_mul_legacy_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x0e]
+v_ldexp_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x56]
 
-v_mul_legacy_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x0e]
+v_ldexp_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x56,0x56,0x34,0x12,0xaf]
 
-v_mul_legacy_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x0e]
+v_ldexp_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x56,0x73,0x72,0x71,0x3f]
 
-v_mul_legacy_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x0e,0x56,0x34,0x12,0xaf]
+v_ldexp_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x56]
 
-v_mul_legacy_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x0e,0x73,0x72,0x71,0x3f]
+v_ldexp_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x56]
 
-v_mul_legacy_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x0e]
+v_ldexp_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x56]
 
-v_mul_legacy_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x0e]
+v_ldexp_f32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0x04,0x00,0x00]
 
-v_mul_legacy_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x0e]
+v_ldexp_f32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x56,0xd2,0x80,0x04,0x00,0x00]
 
-v_mul_legacy_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x0e,0xd2,0x00,0x00,0x00,0x00]
+v_ldexp_f32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x56,0xd2,0xf0,0x04,0x00,0x00]
 
-v_mul_legacy_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x0e,0xd2,0x00,0x00,0x00,0x00]
+v_ldexp_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x56,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_legacy_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x0e,0xd2,0xfd,0x00,0x00,0x00]
+v_ldexp_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x56,0xd2,0xff,0x05,0x00,0x00]
 
-v_mul_legacy_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x0e,0xd2,0x00,0x01,0x00,0x00]
+v_ldexp_f32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0xce,0x00,0x00]
 
-v_mul_legacy_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x0e,0xd2,0xff,0x01,0x00,0x00]
+v_ldexp_f32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0xd0,0x00,0x00]
 
-v_mul_legacy_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x0e,0xd2,0x00,0xfa,0x01,0x00]
+v_ldexp_f32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0xd2,0x00,0x00]
 
-v_mul_legacy_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x0e,0xd2,0x00,0x00,0x02,0x00]
+v_ldexp_f32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0xd4,0x00,0x00]
 
-v_mul_legacy_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x0e,0xd2,0x00,0xfe,0x03,0x00]
+v_ldexp_f32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0xd6,0x00,0x00]
 
-v_mul_legacy_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x0e,0xd2,0x00,0x00,0x00,0x20]
+v_ldexp_f32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0xd8,0x00,0x00]
 
-v_mul_legacy_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x0e,0xd2,0x00,0x00,0x00,0x40]
+v_ldexp_f32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0xda,0x00,0x00]
 
-v_mul_legacy_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x0e,0xd2,0x00,0x00,0x00,0x60]
+v_ldexp_f32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0xdc,0x00,0x00]
 
-v_mul_legacy_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x0e,0xd2,0x00,0x00,0x00,0x00]
+v_ldexp_f32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0xde,0x00,0x00]
 
-v_mul_legacy_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x0e,0xd2,0x00,0x00,0x00,0x00]
+v_ldexp_f32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0xf6,0x00,0x00]
 
-v_mul_legacy_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x0e,0xd2,0x00,0x00,0x00,0x00]
+v_ldexp_f32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0xf8,0x00,0x00]
 
-v_mul_legacy_f32_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x08,0x0e,0xd2,0x00,0x00,0x00,0x00]
+v_ldexp_f32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0xfc,0x00,0x00]
 
-v_mul_legacy_f32_e64 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x0e,0xd2,0x00,0x00,0x00,0x08]
+v_ldexp_f32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0xfe,0x00,0x00]
 
-v_mul_legacy_f32_e64 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x0e,0xd2,0x00,0x00,0x00,0x10]
+v_ldexp_f32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0x00,0x01,0x00]
 
-v_mul_legacy_f32_e64 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x0e,0xd2,0x00,0x00,0x00,0x18]
+v_ldexp_f32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0x82,0x01,0x00]
 
-v_mul_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x10]
+v_ldexp_f32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0xe0,0x01,0x00]
 
-v_mul_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x11]
+v_ldexp_f32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0xee,0x01,0x00]
 
-v_mul_f32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x10]
+v_ldexp_f32_e64 v5, 0, scc
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0xfa,0x01,0x00]
 
-v_mul_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x10]
+v_ldexp_f32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0x04,0x02,0x00]
 
-v_mul_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x10]
+v_ldexp_f32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x56,0xd2,0x80,0xfe,0x03,0x00]
 
-v_mul_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x10]
+v_cvt_pkaccum_u8_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x58]
 
-v_mul_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x10]
+v_cvt_pkaccum_u8_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x59]
 
-v_mul_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x10]
+v_cvt_pkaccum_u8_f32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x58]
 
-v_mul_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x10]
+v_cvt_pkaccum_u8_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x58]
 
-v_mul_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x10]
+v_cvt_pkaccum_u8_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x58]
 
-v_mul_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x10]
+v_cvt_pkaccum_u8_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x58]
 
-v_mul_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x10]
+v_cvt_pkaccum_u8_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x58]
 
-v_mul_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x10]
+v_cvt_pkaccum_u8_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x58]
 
-v_mul_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x10]
+v_cvt_pkaccum_u8_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x58]
 
-v_mul_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x10]
+v_cvt_pkaccum_u8_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x58]
 
-v_mul_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x10]
+v_cvt_pkaccum_u8_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x58]
 
-v_mul_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x10]
+v_cvt_pkaccum_u8_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x58]
 
-v_mul_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x10]
+v_cvt_pkaccum_u8_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x58]
 
-v_mul_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x10]
+v_cvt_pkaccum_u8_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x58]
 
-v_mul_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x10,0x56,0x34,0x12,0xaf]
+v_cvt_pkaccum_u8_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x58]
 
-v_mul_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x10,0x73,0x72,0x71,0x3f]
+v_cvt_pkaccum_u8_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x58]
 
-v_mul_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x10]
+v_cvt_pkaccum_u8_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x58]
 
-v_mul_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x10]
+v_cvt_pkaccum_u8_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x58]
 
-v_mul_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x10]
+v_cvt_pkaccum_u8_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x58]
 
-v_mul_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x10,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pkaccum_u8_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x58,0x56,0x34,0x12,0xaf]
 
-v_mul_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x10,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pkaccum_u8_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x58,0x73,0x72,0x71,0x3f]
 
-v_mul_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x10,0xd2,0xfd,0x00,0x00,0x00]
+v_cvt_pkaccum_u8_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x58]
 
-v_mul_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x10,0xd2,0x00,0x01,0x00,0x00]
+v_cvt_pkaccum_u8_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x58]
 
-v_mul_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x10,0xd2,0xff,0x01,0x00,0x00]
+v_cvt_pkaccum_u8_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x58]
 
-v_mul_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x10,0xd2,0x00,0xfa,0x01,0x00]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x10,0xd2,0x00,0x00,0x02,0x00]
+v_cvt_pkaccum_u8_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x58,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x10,0xd2,0x00,0xfe,0x03,0x00]
+v_cvt_pkaccum_u8_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x58,0xd2,0xff,0x05,0x00,0x00]
 
-v_mul_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x10,0xd2,0x00,0x00,0x00,0x20]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, s103
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0xcf,0x00,0x00]
 
-v_mul_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x10,0xd2,0x00,0x00,0x00,0x40]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0xd1,0x00,0x00]
 
-v_mul_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x10,0xd2,0x00,0x00,0x00,0x60]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0xd3,0x00,0x00]
 
-v_mul_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x10,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0xd5,0x00,0x00]
 
-v_mul_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x10,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0xd7,0x00,0x00]
 
-v_mul_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x10,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0xd9,0x00,0x00]
 
-v_mul_f32_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x08,0x10,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0xdb,0x00,0x00]
 
-v_mul_f32_e64 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x10,0xd2,0x00,0x00,0x00,0x08]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0xdd,0x00,0x00]
 
-v_mul_f32_e64 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x10,0xd2,0x00,0x00,0x00,0x10]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0xdf,0x00,0x00]
 
-v_mul_f32_e64 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x10,0xd2,0x00,0x00,0x00,0x18]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0xf7,0x00,0x00]
 
-v_mul_i32_i24 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x12]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0xf9,0x00,0x00]
 
-v_mul_i32_i24 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x13]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0xfd,0x00,0x00]
 
-v_mul_i32_i24 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x12]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0xff,0x00,0x00]
 
-v_mul_i32_i24 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x12]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, 0
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0x01,0x01,0x00]
 
-v_mul_i32_i24 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x12]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, -1
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0x83,0x01,0x00]
 
-v_mul_i32_i24 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x12]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, 0.5
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0xe1,0x01,0x00]
 
-v_mul_i32_i24 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x12]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, -4.0
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0xef,0x01,0x00]
 
-v_mul_i32_i24 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x12]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0xfb,0x01,0x00]
 
-v_mul_i32_i24 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x12]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0x05,0x02,0x00]
 
-v_mul_i32_i24 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x12]
+v_cvt_pkaccum_u8_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0xff,0x03,0x00]
 
-v_mul_i32_i24 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x12]
+v_cvt_pkaccum_u8_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x58,0xd2,0x01,0x05,0x00,0x20]
 
-v_mul_i32_i24 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x12]
+v_cvt_pkaccum_u8_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x58,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_i32_i24 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x12]
+v_cvt_pknorm_i16_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x5a]
 
-v_mul_i32_i24 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x12]
+v_cvt_pknorm_i16_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x5b]
 
-v_mul_i32_i24 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x12]
+v_cvt_pknorm_i16_f32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x5a]
 
-v_mul_i32_i24 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x12]
+v_cvt_pknorm_i16_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x5a]
 
-v_mul_i32_i24 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x12]
+v_cvt_pknorm_i16_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x5a]
 
-v_mul_i32_i24 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x12]
+v_cvt_pknorm_i16_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x5a]
 
-v_mul_i32_i24 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x12]
+v_cvt_pknorm_i16_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x5a]
 
-v_mul_i32_i24 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x12,0x56,0x34,0x12,0xaf]
+v_cvt_pknorm_i16_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x5a]
 
-v_mul_i32_i24 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x12,0x73,0x72,0x71,0x3f]
+v_cvt_pknorm_i16_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x5a]
 
-v_mul_i32_i24 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x12]
+v_cvt_pknorm_i16_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x5a]
 
-v_mul_i32_i24 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x12]
+v_cvt_pknorm_i16_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x5a]
 
-v_mul_i32_i24 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x12]
+v_cvt_pknorm_i16_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x5a]
 
-v_mul_i32_i24_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x12,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pknorm_i16_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x5a]
 
-v_mul_i32_i24_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x12,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pknorm_i16_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x5a]
 
-v_mul_i32_i24_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x12,0xd2,0x80,0x00,0x00,0x00]
+v_cvt_pknorm_i16_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x5a]
 
-v_mul_i32_i24_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x12,0xd2,0xc1,0x00,0x00,0x00]
+v_cvt_pknorm_i16_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x5a]
 
-v_mul_i32_i24_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x12,0xd2,0xf0,0x00,0x00,0x00]
+v_cvt_pknorm_i16_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x5a]
 
-v_mul_i32_i24_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x12,0xd2,0xf7,0x00,0x00,0x00]
+v_cvt_pknorm_i16_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x5a]
 
-v_mul_i32_i24_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x12,0xd2,0x00,0x01,0x00,0x00]
+v_cvt_pknorm_i16_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x5a]
 
-v_mul_i32_i24_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x12,0xd2,0xff,0x01,0x00,0x00]
+v_cvt_pknorm_i16_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x5a,0x56,0x34,0x12,0xaf]
 
-v_mul_i32_i24_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x12,0xd2,0x00,0x00,0x01,0x00]
+v_cvt_pknorm_i16_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x5a,0x73,0x72,0x71,0x3f]
 
-v_mul_i32_i24_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x12,0xd2,0x00,0x82,0x01,0x00]
+v_cvt_pknorm_i16_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x5a]
 
-v_mul_i32_i24_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x12,0xd2,0x00,0xe0,0x01,0x00]
+v_cvt_pknorm_i16_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x5a]
 
-v_mul_i32_i24_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x12,0xd2,0x00,0xee,0x01,0x00]
+v_cvt_pknorm_i16_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x5a]
 
-v_mul_i32_i24_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x12,0xd2,0x00,0x00,0x02,0x00]
+v_cvt_pknorm_i16_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_i32_i24_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x12,0xd2,0x00,0xfe,0x03,0x00]
+v_cvt_pknorm_i16_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x5a,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_hi_i32_i24 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x5a,0xd2,0xff,0x05,0x00,0x00]
 
-v_mul_hi_i32_i24 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x15]
+v_cvt_pknorm_i16_f32_e64 v5, v1, s103
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0xcf,0x00,0x00]
 
-v_mul_hi_i32_i24 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0xd1,0x00,0x00]
 
-v_mul_hi_i32_i24 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0xd3,0x00,0x00]
 
-v_mul_hi_i32_i24 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0xd5,0x00,0x00]
 
-v_mul_hi_i32_i24 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0xd7,0x00,0x00]
 
-v_mul_hi_i32_i24 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0xd9,0x00,0x00]
 
-v_mul_hi_i32_i24 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0xdb,0x00,0x00]
 
-v_mul_hi_i32_i24 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0xdd,0x00,0x00]
 
-v_mul_hi_i32_i24 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0xdf,0x00,0x00]
 
-v_mul_hi_i32_i24 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0xf7,0x00,0x00]
 
-v_mul_hi_i32_i24 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0xf9,0x00,0x00]
 
-v_mul_hi_i32_i24 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0xfd,0x00,0x00]
 
-v_mul_hi_i32_i24 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0xff,0x00,0x00]
 
-v_mul_hi_i32_i24 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0xfb,0x01,0x00]
 
-v_mul_hi_i32_i24 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0x05,0x02,0x00]
 
-v_mul_hi_i32_i24 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0xff,0x03,0x00]
 
-v_mul_hi_i32_i24 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0x05,0x00,0x20]
 
-v_mul_hi_i32_i24 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0x05,0x00,0x40]
 
-v_mul_hi_i32_i24 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x14,0x56,0x34,0x12,0xaf]
+v_cvt_pknorm_i16_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x5a,0xd2,0x01,0x05,0x00,0x60]
 
-v_mul_hi_i32_i24 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x14,0x73,0x72,0x71,0x3f]
+v_cvt_pknorm_i16_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x5a,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_hi_i32_i24 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x5a,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_hi_i32_i24 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x14]
+v_cvt_pknorm_i16_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x5a,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_hi_i32_i24 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x14]
+v_cvt_pknorm_u16_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x5c]
 
-v_mul_hi_i32_i24_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x14,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pknorm_u16_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x5d]
 
-v_mul_hi_i32_i24_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x14,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pknorm_u16_f32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x5c]
 
-v_mul_hi_i32_i24_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x14,0xd2,0x80,0x00,0x00,0x00]
+v_cvt_pknorm_u16_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x5c]
 
-v_mul_hi_i32_i24_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x14,0xd2,0xc1,0x00,0x00,0x00]
+v_cvt_pknorm_u16_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x5c]
 
-v_mul_hi_i32_i24_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x14,0xd2,0xf0,0x00,0x00,0x00]
+v_cvt_pknorm_u16_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x5c]
 
-v_mul_hi_i32_i24_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x14,0xd2,0xf7,0x00,0x00,0x00]
+v_cvt_pknorm_u16_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x5c]
 
-v_mul_hi_i32_i24_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x14,0xd2,0x00,0x01,0x00,0x00]
+v_cvt_pknorm_u16_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x5c]
 
-v_mul_hi_i32_i24_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x14,0xd2,0xff,0x01,0x00,0x00]
+v_cvt_pknorm_u16_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x5c]
 
-v_mul_hi_i32_i24_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x14,0xd2,0x00,0x00,0x01,0x00]
+v_cvt_pknorm_u16_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x5c]
 
-v_mul_hi_i32_i24_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x14,0xd2,0x00,0x82,0x01,0x00]
+v_cvt_pknorm_u16_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x5c]
 
-v_mul_hi_i32_i24_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x14,0xd2,0x00,0xe0,0x01,0x00]
+v_cvt_pknorm_u16_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x5c]
 
-v_mul_hi_i32_i24_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x14,0xd2,0x00,0xee,0x01,0x00]
+v_cvt_pknorm_u16_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x5c]
 
-v_mul_hi_i32_i24_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x14,0xd2,0x00,0x00,0x02,0x00]
+v_cvt_pknorm_u16_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x5c]
 
-v_mul_hi_i32_i24_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x14,0xd2,0x00,0xfe,0x03,0x00]
+v_cvt_pknorm_u16_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x5c]
 
-v_mul_u32_u24 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x16]
+v_cvt_pknorm_u16_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x5c]
 
-v_mul_u32_u24 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x17]
+v_cvt_pknorm_u16_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x5c]
 
-v_mul_u32_u24 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x16]
+v_cvt_pknorm_u16_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x5c]
 
-v_mul_u32_u24 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x16]
+v_cvt_pknorm_u16_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x5c]
 
-v_mul_u32_u24 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x16]
+v_cvt_pknorm_u16_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x5c,0x56,0x34,0x12,0xaf]
 
-v_mul_u32_u24 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x16]
+v_cvt_pknorm_u16_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x5c,0x73,0x72,0x71,0x3f]
 
-v_mul_u32_u24 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x16]
+v_cvt_pknorm_u16_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x5c]
 
-v_mul_u32_u24 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x16]
+v_cvt_pknorm_u16_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x5c]
 
-v_mul_u32_u24 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x16]
+v_cvt_pknorm_u16_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x5c]
 
-v_mul_u32_u24 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x16]
+v_cvt_pknorm_u16_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_u32_u24 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x16]
+v_cvt_pknorm_u16_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x5c,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_u32_u24 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x16]
+v_cvt_pknorm_u16_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x5c,0xd2,0xff,0x05,0x00,0x00]
 
-v_mul_u32_u24 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x16]
+v_cvt_pknorm_u16_f32_e64 v5, v1, s103
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0xcf,0x00,0x00]
 
-v_mul_u32_u24 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x16]
+v_cvt_pknorm_u16_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0xd1,0x00,0x00]
 
-v_mul_u32_u24 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x16]
+v_cvt_pknorm_u16_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0xd3,0x00,0x00]
 
-v_mul_u32_u24 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x16]
+v_cvt_pknorm_u16_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0xd5,0x00,0x00]
 
-v_mul_u32_u24 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x16]
+v_cvt_pknorm_u16_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0xd7,0x00,0x00]
 
-v_mul_u32_u24 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x16]
+v_cvt_pknorm_u16_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0xd9,0x00,0x00]
 
-v_mul_u32_u24 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x16]
+v_cvt_pknorm_u16_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0xdb,0x00,0x00]
 
-v_mul_u32_u24 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x16,0x56,0x34,0x12,0xaf]
+v_cvt_pknorm_u16_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0xdd,0x00,0x00]
 
-v_mul_u32_u24 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x16,0x73,0x72,0x71,0x3f]
+v_cvt_pknorm_u16_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0xdf,0x00,0x00]
 
-v_mul_u32_u24 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x16]
+v_cvt_pknorm_u16_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0xf7,0x00,0x00]
 
-v_mul_u32_u24 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x16]
+v_cvt_pknorm_u16_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0xf9,0x00,0x00]
 
-v_mul_u32_u24 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x16]
+v_cvt_pknorm_u16_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0xfd,0x00,0x00]
 
-v_mul_u32_u24_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x16,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pknorm_u16_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0xff,0x00,0x00]
 
-v_mul_u32_u24_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x16,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pknorm_u16_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0xfb,0x01,0x00]
 
-v_mul_u32_u24_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x16,0xd2,0x80,0x00,0x00,0x00]
+v_cvt_pknorm_u16_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0x05,0x02,0x00]
 
-v_mul_u32_u24_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x16,0xd2,0xc1,0x00,0x00,0x00]
+v_cvt_pknorm_u16_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0xff,0x03,0x00]
 
-v_mul_u32_u24_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x16,0xd2,0xf0,0x00,0x00,0x00]
+v_cvt_pknorm_u16_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0x05,0x00,0x20]
 
-v_mul_u32_u24_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x16,0xd2,0xf7,0x00,0x00,0x00]
+v_cvt_pknorm_u16_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0x05,0x00,0x40]
 
-v_mul_u32_u24_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x16,0xd2,0x00,0x01,0x00,0x00]
+v_cvt_pknorm_u16_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x5c,0xd2,0x01,0x05,0x00,0x60]
 
-v_mul_u32_u24_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x16,0xd2,0xff,0x01,0x00,0x00]
+v_cvt_pknorm_u16_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x5c,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_u32_u24_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x16,0xd2,0x00,0x00,0x01,0x00]
+v_cvt_pknorm_u16_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x5c,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_u32_u24_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x16,0xd2,0x00,0x82,0x01,0x00]
+v_cvt_pknorm_u16_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x5c,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_u32_u24_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x16,0xd2,0x00,0xe0,0x01,0x00]
+v_cvt_pkrtz_f16_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x5e]
 
-v_mul_u32_u24_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x16,0xd2,0x00,0xee,0x01,0x00]
+v_cvt_pkrtz_f16_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x5f]
 
-v_mul_u32_u24_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x16,0xd2,0x00,0x00,0x02,0x00]
+v_cvt_pkrtz_f16_f32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x5e]
 
-v_mul_u32_u24_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x16,0xd2,0x00,0xfe,0x03,0x00]
+v_cvt_pkrtz_f16_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x5e]
 
-v_mul_hi_u32_u24 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x18]
+v_cvt_pkrtz_f16_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x5e]
 
-v_mul_hi_u32_u24 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x19]
+v_cvt_pkrtz_f16_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x5e]
 
-v_mul_hi_u32_u24 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x18]
+v_cvt_pkrtz_f16_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x5e]
 
-v_mul_hi_u32_u24 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x18]
+v_cvt_pkrtz_f16_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x5e]
 
-v_mul_hi_u32_u24 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x18]
+v_cvt_pkrtz_f16_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x5e]
 
-v_mul_hi_u32_u24 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x18]
+v_cvt_pkrtz_f16_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x5e]
 
-v_mul_hi_u32_u24 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x18]
+v_cvt_pkrtz_f16_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x5e]
 
-v_mul_hi_u32_u24 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x18]
+v_cvt_pkrtz_f16_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x5e]
 
-v_mul_hi_u32_u24 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x18]
+v_cvt_pkrtz_f16_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x5e]
 
-v_mul_hi_u32_u24 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x18]
+v_cvt_pkrtz_f16_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x5e]
 
-v_mul_hi_u32_u24 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x18]
+v_cvt_pkrtz_f16_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x5e]
 
-v_mul_hi_u32_u24 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x18]
+v_cvt_pkrtz_f16_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x5e]
 
-v_mul_hi_u32_u24 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x18]
+v_cvt_pkrtz_f16_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x5e]
 
-v_mul_hi_u32_u24 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x18]
+v_cvt_pkrtz_f16_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x5e]
 
-v_mul_hi_u32_u24 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x18]
+v_cvt_pkrtz_f16_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x5e]
 
-v_mul_hi_u32_u24 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x18]
+v_cvt_pkrtz_f16_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x5e,0x56,0x34,0x12,0xaf]
 
-v_mul_hi_u32_u24 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x18]
+v_cvt_pkrtz_f16_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x5e,0x73,0x72,0x71,0x3f]
 
-v_mul_hi_u32_u24 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x18]
+v_cvt_pkrtz_f16_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x5e]
 
-v_mul_hi_u32_u24 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x18]
+v_cvt_pkrtz_f16_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x5e]
 
-v_mul_hi_u32_u24 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
+v_cvt_pkrtz_f16_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x5e]
 
-v_mul_hi_u32_u24 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x18,0x73,0x72,0x71,0x3f]
+v_cvt_pkrtz_f16_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_hi_u32_u24 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x18]
+v_cvt_pkrtz_f16_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x5e,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_hi_u32_u24 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x18]
+v_cvt_pkrtz_f16_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x5e,0xd2,0xff,0x05,0x00,0x00]
 
-v_mul_hi_u32_u24 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x18]
+v_cvt_pkrtz_f16_f32_e64 v5, v1, s103
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0xcf,0x00,0x00]
 
-v_mul_hi_u32_u24_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x18,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pkrtz_f16_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0xd1,0x00,0x00]
 
-v_mul_hi_u32_u24_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x18,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pkrtz_f16_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0xd3,0x00,0x00]
 
-v_mul_hi_u32_u24_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x18,0xd2,0x80,0x00,0x00,0x00]
+v_cvt_pkrtz_f16_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0xd5,0x00,0x00]
 
-v_mul_hi_u32_u24_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x18,0xd2,0xc1,0x00,0x00,0x00]
+v_cvt_pkrtz_f16_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0xd7,0x00,0x00]
 
-v_mul_hi_u32_u24_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x18,0xd2,0xf0,0x00,0x00,0x00]
+v_cvt_pkrtz_f16_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0xd9,0x00,0x00]
 
-v_mul_hi_u32_u24_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x18,0xd2,0xf7,0x00,0x00,0x00]
+v_cvt_pkrtz_f16_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0xdb,0x00,0x00]
 
-v_mul_hi_u32_u24_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x18,0xd2,0x00,0x01,0x00,0x00]
+v_cvt_pkrtz_f16_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0xdd,0x00,0x00]
 
-v_mul_hi_u32_u24_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x18,0xd2,0xff,0x01,0x00,0x00]
+v_cvt_pkrtz_f16_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0xdf,0x00,0x00]
 
-v_mul_hi_u32_u24_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x18,0xd2,0x00,0x00,0x01,0x00]
+v_cvt_pkrtz_f16_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0xf7,0x00,0x00]
 
-v_mul_hi_u32_u24_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x18,0xd2,0x00,0x82,0x01,0x00]
+v_cvt_pkrtz_f16_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0xf9,0x00,0x00]
 
-v_mul_hi_u32_u24_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x18,0xd2,0x00,0xe0,0x01,0x00]
+v_cvt_pkrtz_f16_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0xfd,0x00,0x00]
 
-v_mul_hi_u32_u24_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x18,0xd2,0x00,0xee,0x01,0x00]
+v_cvt_pkrtz_f16_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0xff,0x00,0x00]
 
-v_mul_hi_u32_u24_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x18,0xd2,0x00,0x00,0x02,0x00]
+v_cvt_pkrtz_f16_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0xfb,0x01,0x00]
 
-v_mul_hi_u32_u24_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x18,0xd2,0x00,0xfe,0x03,0x00]
+v_cvt_pkrtz_f16_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0x05,0x02,0x00]
 
-v_min_legacy_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x1a]
+v_cvt_pkrtz_f16_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0xff,0x03,0x00]
 
-v_min_legacy_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x1b]
+v_cvt_pkrtz_f16_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0x05,0x00,0x20]
 
-v_min_legacy_f32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x1a]
+v_cvt_pkrtz_f16_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0x05,0x00,0x40]
 
-v_min_legacy_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x1a]
+v_cvt_pkrtz_f16_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x5e,0xd2,0x01,0x05,0x00,0x60]
 
-v_min_legacy_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x1a]
+v_cvt_pkrtz_f16_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x5e,0xd2,0x01,0x05,0x00,0x00]
 
-v_min_legacy_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x1a]
+v_cvt_pkrtz_f16_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x5e,0xd2,0x01,0x05,0x00,0x00]
 
-v_min_legacy_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x1a]
+v_cvt_pkrtz_f16_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x5e,0xd2,0x01,0x05,0x00,0x00]
 
-v_min_legacy_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x1a]
+v_cvt_pk_u16_u32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x60]
 
-v_min_legacy_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x1a]
+v_cvt_pk_u16_u32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x61]
 
-v_min_legacy_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x1a]
+v_cvt_pk_u16_u32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x60]
 
-v_min_legacy_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x1a]
+v_cvt_pk_u16_u32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x60]
 
-v_min_legacy_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x1a]
+v_cvt_pk_u16_u32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x60]
 
-v_min_legacy_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x1a]
+v_cvt_pk_u16_u32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x60]
 
-v_min_legacy_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x1a]
+v_cvt_pk_u16_u32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x60]
 
-v_min_legacy_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x1a]
+v_cvt_pk_u16_u32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x60]
 
-v_min_legacy_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x1a]
+v_cvt_pk_u16_u32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x60]
 
-v_min_legacy_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x1a]
+v_cvt_pk_u16_u32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x60]
 
-v_min_legacy_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x1a]
+v_cvt_pk_u16_u32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x60]
 
-v_min_legacy_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x1a]
+v_cvt_pk_u16_u32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x60]
 
-v_min_legacy_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x1a,0x56,0x34,0x12,0xaf]
+v_cvt_pk_u16_u32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x60]
 
-v_min_legacy_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x1a,0x73,0x72,0x71,0x3f]
+v_cvt_pk_u16_u32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x60]
 
-v_min_legacy_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x1a]
+v_cvt_pk_u16_u32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x60]
 
-v_min_legacy_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x1a]
+v_cvt_pk_u16_u32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x60]
 
-v_min_legacy_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x1a]
+v_cvt_pk_u16_u32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x60]
 
-v_min_legacy_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x1a,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_u16_u32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x60]
 
-v_min_legacy_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x1a,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_u16_u32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x60]
 
-v_min_legacy_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x1a,0xd2,0xfd,0x00,0x00,0x00]
+v_cvt_pk_u16_u32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x60,0x56,0x34,0x12,0xaf]
 
-v_min_legacy_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x1a,0xd2,0x00,0x01,0x00,0x00]
+v_cvt_pk_u16_u32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x60,0x73,0x72,0x71,0x3f]
 
-v_min_legacy_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x1a,0xd2,0xff,0x01,0x00,0x00]
+v_cvt_pk_u16_u32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x60]
 
-v_min_legacy_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x1a,0xd2,0x00,0xfa,0x01,0x00]
+v_cvt_pk_u16_u32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x60]
 
-v_min_legacy_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x1a,0xd2,0x00,0x00,0x02,0x00]
+v_cvt_pk_u16_u32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x60]
 
-v_min_legacy_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x1a,0xd2,0x00,0xfe,0x03,0x00]
+v_cvt_pk_u16_u32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0x04,0x00,0x00]
 
-v_min_legacy_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x1a,0xd2,0x00,0x00,0x00,0x20]
+v_cvt_pk_u16_u32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x60,0xd2,0x80,0x04,0x00,0x00]
 
-v_min_legacy_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x1a,0xd2,0x00,0x00,0x00,0x40]
+v_cvt_pk_u16_u32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x60,0xd2,0xc1,0x04,0x00,0x00]
 
-v_min_legacy_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x1a,0xd2,0x00,0x00,0x00,0x60]
+v_cvt_pk_u16_u32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x60,0xd2,0xf0,0x04,0x00,0x00]
 
-v_min_legacy_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x1a,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_u16_u32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x60,0xd2,0xf7,0x04,0x00,0x00]
 
-v_min_legacy_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x1a,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_u16_u32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x60,0xd2,0x01,0x05,0x00,0x00]
 
-v_min_legacy_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x1a,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_u16_u32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x60,0xd2,0xff,0x05,0x00,0x00]
 
-v_min_legacy_f32_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x08,0x1a,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_u16_u32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0xce,0x00,0x00]
 
-v_min_legacy_f32_e64 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x1a,0xd2,0x00,0x00,0x00,0x08]
+v_cvt_pk_u16_u32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0xd0,0x00,0x00]
 
-v_min_legacy_f32_e64 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x1a,0xd2,0x00,0x00,0x00,0x10]
+v_cvt_pk_u16_u32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0xd2,0x00,0x00]
 
-v_min_legacy_f32_e64 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x1a,0xd2,0x00,0x00,0x00,0x18]
+v_cvt_pk_u16_u32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0xd4,0x00,0x00]
 
-v_max_legacy_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x1c]
+v_cvt_pk_u16_u32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0xd6,0x00,0x00]
 
-v_max_legacy_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x1d]
+v_cvt_pk_u16_u32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0xd8,0x00,0x00]
 
-v_max_legacy_f32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x1c]
+v_cvt_pk_u16_u32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0xda,0x00,0x00]
 
-v_max_legacy_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x1c]
+v_cvt_pk_u16_u32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0xdc,0x00,0x00]
 
-v_max_legacy_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x1c]
+v_cvt_pk_u16_u32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0xde,0x00,0x00]
 
-v_max_legacy_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x1c]
+v_cvt_pk_u16_u32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0xf6,0x00,0x00]
 
-v_max_legacy_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x1c]
+v_cvt_pk_u16_u32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0xf8,0x00,0x00]
 
-v_max_legacy_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x1c]
+v_cvt_pk_u16_u32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0xfc,0x00,0x00]
 
-v_max_legacy_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x1c]
+v_cvt_pk_u16_u32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0xfe,0x00,0x00]
 
-v_max_legacy_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x1c]
+v_cvt_pk_u16_u32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0x00,0x01,0x00]
 
-v_max_legacy_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x1c]
+v_cvt_pk_u16_u32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0x82,0x01,0x00]
 
-v_max_legacy_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x1c]
+v_cvt_pk_u16_u32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0xe0,0x01,0x00]
 
-v_max_legacy_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x1c]
+v_cvt_pk_u16_u32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0xee,0x01,0x00]
 
-v_max_legacy_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x1c]
+v_cvt_pk_u16_u32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0x04,0x02,0x00]
 
-v_max_legacy_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x1c]
+v_cvt_pk_u16_u32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x60,0xd2,0x80,0xfe,0x03,0x00]
 
-v_max_legacy_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x1c]
+v_cvt_pk_i16_i32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x62]
 
-v_max_legacy_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x1c]
+v_cvt_pk_i16_i32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x63]
 
-v_max_legacy_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x1c]
+v_cvt_pk_i16_i32 v5, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x62]
 
-v_max_legacy_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x1c]
+v_cvt_pk_i16_i32 v5, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x62]
 
-v_max_legacy_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x1c,0x56,0x34,0x12,0xaf]
+v_cvt_pk_i16_i32 v5, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x62]
 
-v_max_legacy_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x1c,0x73,0x72,0x71,0x3f]
+v_cvt_pk_i16_i32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x62]
 
-v_max_legacy_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x1c]
+v_cvt_pk_i16_i32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x62]
 
-v_max_legacy_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x1c]
+v_cvt_pk_i16_i32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x62]
 
-v_max_legacy_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x1c]
+v_cvt_pk_i16_i32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x62]
 
-v_max_legacy_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x1c,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_i16_i32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x62]
 
-v_max_legacy_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x1c,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_i16_i32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x62]
 
-v_max_legacy_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x1c,0xd2,0xfd,0x00,0x00,0x00]
+v_cvt_pk_i16_i32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x62]
 
-v_max_legacy_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x1c,0xd2,0x00,0x01,0x00,0x00]
+v_cvt_pk_i16_i32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x62]
 
-v_max_legacy_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x1c,0xd2,0xff,0x01,0x00,0x00]
+v_cvt_pk_i16_i32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x62]
 
-v_max_legacy_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x1c,0xd2,0x00,0xfa,0x01,0x00]
+v_cvt_pk_i16_i32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x62]
 
-v_max_legacy_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x1c,0xd2,0x00,0x00,0x02,0x00]
+v_cvt_pk_i16_i32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x62]
 
-v_max_legacy_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x1c,0xd2,0x00,0xfe,0x03,0x00]
+v_cvt_pk_i16_i32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x62]
 
-v_max_legacy_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x1c,0xd2,0x00,0x00,0x00,0x20]
+v_cvt_pk_i16_i32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x62]
 
-v_max_legacy_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x1c,0xd2,0x00,0x00,0x00,0x40]
+v_cvt_pk_i16_i32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x62]
 
-v_max_legacy_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x1c,0xd2,0x00,0x00,0x00,0x60]
+v_cvt_pk_i16_i32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x62,0x56,0x34,0x12,0xaf]
 
-v_max_legacy_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x1c,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_i16_i32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x62,0x73,0x72,0x71,0x3f]
 
-v_max_legacy_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x1c,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_i16_i32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x62]
 
-v_max_legacy_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x1c,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_i16_i32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x62]
 
-v_max_legacy_f32_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x08,0x1c,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_i16_i32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x62]
 
-v_max_legacy_f32_e64 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x1c,0xd2,0x00,0x00,0x00,0x08]
+v_cvt_pk_i16_i32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0x04,0x00,0x00]
 
-v_max_legacy_f32_e64 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x1c,0xd2,0x00,0x00,0x00,0x10]
+v_cvt_pk_i16_i32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x62,0xd2,0x80,0x04,0x00,0x00]
 
-v_max_legacy_f32_e64 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x1c,0xd2,0x00,0x00,0x00,0x18]
+v_cvt_pk_i16_i32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x62,0xd2,0xc1,0x04,0x00,0x00]
 
-v_min_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x62,0xd2,0xf0,0x04,0x00,0x00]
 
-v_min_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x1f]
+v_cvt_pk_i16_i32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x62,0xd2,0xf7,0x04,0x00,0x00]
 
-v_min_f32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x62,0xd2,0x01,0x05,0x00,0x00]
 
-v_min_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x62,0xd2,0xff,0x05,0x00,0x00]
 
-v_min_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, 0, s103
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0xce,0x00,0x00]
 
-v_min_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0xd0,0x00,0x00]
 
-v_min_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0xd2,0x00,0x00]
 
-v_min_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0xd4,0x00,0x00]
 
-v_min_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0xd6,0x00,0x00]
 
-v_min_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0xd8,0x00,0x00]
 
-v_min_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0xda,0x00,0x00]
 
-v_min_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0xdc,0x00,0x00]
 
-v_min_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0xde,0x00,0x00]
 
-v_min_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0xf6,0x00,0x00]
 
-v_min_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0xf8,0x00,0x00]
 
-v_min_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0xfc,0x00,0x00]
 
-v_min_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0xfe,0x00,0x00]
 
-v_min_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0x00,0x01,0x00]
 
-v_min_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0x82,0x01,0x00]
 
-v_min_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x1e,0x56,0x34,0x12,0xaf]
+v_cvt_pk_i16_i32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0xe0,0x01,0x00]
 
-v_min_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x1e,0x73,0x72,0x71,0x3f]
+v_cvt_pk_i16_i32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0xee,0x01,0x00]
 
-v_min_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0x04,0x02,0x00]
 
-v_min_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x1e]
+v_cvt_pk_i16_i32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x62,0xd2,0x80,0xfe,0x03,0x00]
 
-v_min_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x1e]
+v_mad_legacy_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x01,0x04,0x0e,0x04]
 
-v_min_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x1e,0xd2,0x00,0x00,0x00,0x00]
+v_mad_legacy_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0x80,0xd2,0x01,0x04,0x0e,0x04]
 
-v_min_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x1e,0xd2,0x00,0x00,0x00,0x00]
+v_mad_legacy_f32 v5, s103, v2, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x67,0x04,0x0e,0x04]
 
-v_min_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x1e,0xd2,0xfd,0x00,0x00,0x00]
+v_mad_legacy_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x68,0x04,0x0e,0x04]
 
-v_min_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x1e,0xd2,0x00,0x01,0x00,0x00]
+v_mad_legacy_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x69,0x04,0x0e,0x04]
 
-v_min_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x1e,0xd2,0xff,0x01,0x00,0x00]
+v_mad_legacy_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x6a,0x04,0x0e,0x04]
 
-v_min_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x1e,0xd2,0x00,0xfa,0x01,0x00]
+v_mad_legacy_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x6b,0x04,0x0e,0x04]
 
-v_min_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x1e,0xd2,0x00,0x00,0x02,0x00]
+v_mad_legacy_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x6c,0x04,0x0e,0x04]
 
-v_min_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x1e,0xd2,0x00,0xfe,0x03,0x00]
+v_mad_legacy_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x6d,0x04,0x0e,0x04]
 
-v_min_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x1e,0xd2,0x00,0x00,0x00,0x20]
+v_mad_legacy_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x6e,0x04,0x0e,0x04]
 
-v_min_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x1e,0xd2,0x00,0x00,0x00,0x40]
+v_mad_legacy_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x6f,0x04,0x0e,0x04]
 
-v_min_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x1e,0xd2,0x00,0x00,0x00,0x60]
+v_mad_legacy_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x7b,0x04,0x0e,0x04]
 
-v_min_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x1e,0xd2,0x00,0x00,0x00,0x00]
+v_mad_legacy_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x7c,0x04,0x0e,0x04]
 
-v_min_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x1e,0xd2,0x00,0x00,0x00,0x00]
+v_mad_legacy_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x7e,0x04,0x0e,0x04]
 
-v_min_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x1e,0xd2,0x00,0x00,0x00,0x00]
+v_mad_legacy_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x7f,0x04,0x0e,0x04]
 
-v_min_f32_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x08,0x1e,0xd2,0x00,0x00,0x00,0x00]
+v_mad_legacy_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0xfd,0x04,0x0e,0x04]
 
-v_min_f32_e64 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x1e,0xd2,0x00,0x00,0x00,0x08]
+v_mad_legacy_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x01,0x05,0x0e,0x04]
 
-v_min_f32_e64 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x1e,0xd2,0x00,0x00,0x00,0x10]
+v_mad_legacy_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0xff,0x05,0x0e,0x04]
 
-v_min_f32_e64 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x1e,0xd2,0x00,0x00,0x00,0x18]
+v_mad_legacy_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x01,0xfe,0x0f,0x04]
 
-v_max_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x20]
+v_mad_legacy_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0x80,0xd2,0x01,0x04,0xfe,0x07]
 
-v_max_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x21]
+v_mad_legacy_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x01,0x04,0x0e,0x24]
 
-v_max_f32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x20]
+v_mad_legacy_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x01,0x04,0x0e,0x44]
 
-v_max_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x20]
+v_mad_legacy_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x01,0x04,0x0e,0x84]
 
-v_max_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x20]
+v_mad_legacy_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0x80,0xd2,0x01,0x04,0x0e,0xe4]
 
-v_max_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x20]
+v_mad_legacy_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0x80,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x20]
+v_mad_legacy_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0x80,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x20]
+v_mad_legacy_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0x80,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x20]
+v_mad_legacy_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0x80,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x20]
+v_mad_legacy_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x08,0x80,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x20]
+v_mad_legacy_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0x80,0xd2,0x01,0x04,0x0e,0x0c]
 
-v_max_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x20]
+v_mad_legacy_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0x80,0xd2,0x01,0x04,0x0e,0x14]
 
-v_max_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x20]
+v_mad_legacy_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0x80,0xd2,0x01,0x04,0x0e,0x1c]
 
-v_max_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x20]
+v_mad_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x20]
+v_mad_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0x82,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x20]
+v_mad_f32 v5, s103, v2, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x67,0x04,0x0e,0x04]
 
-v_max_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x20]
+v_mad_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x68,0x04,0x0e,0x04]
 
-v_max_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x20]
+v_mad_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x69,0x04,0x0e,0x04]
 
-v_max_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x20]
+v_mad_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x6a,0x04,0x0e,0x04]
 
-v_max_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf]
+v_mad_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x6b,0x04,0x0e,0x04]
 
-v_max_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x20,0x73,0x72,0x71,0x3f]
+v_mad_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x6c,0x04,0x0e,0x04]
 
-v_max_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x20]
+v_mad_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x6d,0x04,0x0e,0x04]
 
-v_max_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x20]
+v_mad_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x6e,0x04,0x0e,0x04]
 
-v_max_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x20]
+v_mad_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x6f,0x04,0x0e,0x04]
 
-v_max_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x20,0xd2,0x00,0x00,0x00,0x00]
+v_mad_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x7b,0x04,0x0e,0x04]
 
-v_max_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x20,0xd2,0x00,0x00,0x00,0x00]
+v_mad_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x7c,0x04,0x0e,0x04]
 
-v_max_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x20,0xd2,0xfd,0x00,0x00,0x00]
+v_mad_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x7e,0x04,0x0e,0x04]
 
-v_max_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x20,0xd2,0x00,0x01,0x00,0x00]
+v_mad_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x7f,0x04,0x0e,0x04]
 
-v_max_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x20,0xd2,0xff,0x01,0x00,0x00]
+v_mad_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0xfd,0x04,0x0e,0x04]
 
-v_max_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x20,0xd2,0x00,0xfa,0x01,0x00]
+v_mad_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x01,0x05,0x0e,0x04]
 
-v_max_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x20,0xd2,0x00,0x00,0x02,0x00]
+v_mad_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0xff,0x05,0x0e,0x04]
 
-v_max_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x20,0xd2,0x00,0xfe,0x03,0x00]
+v_mad_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x01,0xfe,0x0f,0x04]
 
-v_max_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x20,0xd2,0x00,0x00,0x00,0x20]
+v_mad_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0x82,0xd2,0x01,0x04,0xfe,0x07]
 
-v_max_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x20,0xd2,0x00,0x00,0x00,0x40]
+v_mad_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x01,0x04,0x0e,0x24]
 
-v_max_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x20,0xd2,0x00,0x00,0x00,0x60]
+v_mad_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x01,0x04,0x0e,0x44]
 
-v_max_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x20,0xd2,0x00,0x00,0x00,0x00]
+v_mad_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x01,0x04,0x0e,0x84]
 
-v_max_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x20,0xd2,0x00,0x00,0x00,0x00]
+v_mad_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0x82,0xd2,0x01,0x04,0x0e,0xe4]
 
-v_max_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x20,0xd2,0x00,0x00,0x00,0x00]
+v_mad_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0x82,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_f32_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x08,0x20,0xd2,0x00,0x00,0x00,0x00]
+v_mad_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0x82,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_f32_e64 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x20,0xd2,0x00,0x00,0x00,0x08]
+v_mad_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0x82,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_f32_e64 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x20,0xd2,0x00,0x00,0x00,0x10]
+v_mad_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0x82,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_f32_e64 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x20,0xd2,0x00,0x00,0x00,0x18]
+v_mad_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x08,0x82,0xd2,0x01,0x04,0x0e,0x04]
 
-v_min_i32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x22]
+v_mad_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0x82,0xd2,0x01,0x04,0x0e,0x0c]
 
-v_min_i32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x23]
+v_mad_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0x82,0xd2,0x01,0x04,0x0e,0x14]
 
-v_min_i32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x22]
+v_mad_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0x82,0xd2,0x01,0x04,0x0e,0x1c]
 
-v_min_i32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x22]
+v_mad_i32_i24 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x01,0x00,0x01,0x02]
 
-v_min_i32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x22]
+v_mad_i32_i24 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0x84,0xd2,0x01,0x00,0x01,0x02]
 
-v_min_i32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x22]
+v_mad_i32_i24 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x67,0x00,0x01,0x02]
 
-v_min_i32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x22]
+v_mad_i32_i24 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x68,0x00,0x01,0x02]
 
-v_min_i32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x22]
+v_mad_i32_i24 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x69,0x00,0x01,0x02]
 
-v_min_i32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x22]
+v_mad_i32_i24 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x6a,0x00,0x01,0x02]
 
-v_min_i32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x22]
+v_mad_i32_i24 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x6b,0x00,0x01,0x02]
 
-v_min_i32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x22]
+v_mad_i32_i24 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x6c,0x00,0x01,0x02]
 
-v_min_i32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x22]
+v_mad_i32_i24 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x6d,0x00,0x01,0x02]
 
-v_min_i32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x22]
+v_mad_i32_i24 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x6e,0x00,0x01,0x02]
 
-v_min_i32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x22]
+v_mad_i32_i24 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x6f,0x00,0x01,0x02]
 
-v_min_i32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x22]
+v_mad_i32_i24 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x7b,0x00,0x01,0x02]
 
-v_min_i32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x22]
+v_mad_i32_i24 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x7c,0x00,0x01,0x02]
 
-v_min_i32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x22]
+v_mad_i32_i24 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x7e,0x00,0x01,0x02]
 
-v_min_i32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x22]
+v_mad_i32_i24 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x7f,0x00,0x01,0x02]
 
-v_min_i32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x22]
+v_mad_i32_i24 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0x00,0x01,0x02]
 
-v_min_i32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x22,0x56,0x34,0x12,0xaf]
+v_mad_i32_i24 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0xc1,0x00,0x01,0x02]
 
-v_min_i32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x22,0x73,0x72,0x71,0x3f]
+v_mad_i32_i24 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0xf0,0x00,0x01,0x02]
 
-v_min_i32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x22]
+v_mad_i32_i24 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0xf7,0x00,0x01,0x02]
 
-v_min_i32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x22]
+v_mad_i32_i24 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x01,0x01,0x01,0x02]
 
-v_min_i32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x22]
+v_mad_i32_i24 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0xff,0x01,0x01,0x02]
 
-v_min_i32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x22,0xd2,0x00,0x00,0x00,0x00]
+v_mad_i32_i24 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x01,0x82,0x01,0x02]
 
-v_min_i32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x22,0xd2,0x00,0x00,0x00,0x00]
+v_mad_i32_i24 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x01,0xe0,0x01,0x02]
 
-v_min_i32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x22,0xd2,0x80,0x00,0x00,0x00]
+v_mad_i32_i24 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x01,0xee,0x01,0x02]
 
-v_min_i32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x22,0xd2,0xc1,0x00,0x00,0x00]
+v_mad_i32_i24 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x01,0x04,0x02,0x02]
 
-v_min_i32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x22,0xd2,0xf0,0x00,0x00,0x00]
+v_mad_i32_i24 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x01,0xfe,0x03,0x02]
 
-v_min_i32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x22,0xd2,0xf7,0x00,0x00,0x00]
+v_mad_i32_i24 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0x84,0xd2,0x01,0x00,0x05,0x03]
 
-v_min_i32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x22,0xd2,0x00,0x01,0x00,0x00]
+v_mad_i32_i24 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0x84,0xd2,0x01,0x00,0xc1,0x03]
 
-v_min_i32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x22,0xd2,0xff,0x01,0x00,0x00]
+v_mad_i32_i24 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0x84,0xd2,0x01,0x00,0xdd,0x03]
 
-v_min_i32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x22,0xd2,0x00,0x00,0x01,0x00]
+v_mad_i32_i24 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0x84,0xd2,0x01,0x00,0x0d,0x04]
 
-v_min_i32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x22,0xd2,0x00,0x82,0x01,0x00]
+v_mad_i32_i24 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0x84,0xd2,0x01,0x00,0xfd,0x07]
 
-v_min_i32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x22,0xd2,0x00,0xe0,0x01,0x00]
+v_mad_u32_u24 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x01,0x00,0x01,0x02]
 
-v_min_i32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x22,0xd2,0x00,0xee,0x01,0x00]
+v_mad_u32_u24 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0x86,0xd2,0x01,0x00,0x01,0x02]
 
-v_min_i32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x22,0xd2,0x00,0x00,0x02,0x00]
+v_mad_u32_u24 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x67,0x00,0x01,0x02]
 
-v_min_i32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x22,0xd2,0x00,0xfe,0x03,0x00]
+v_mad_u32_u24 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x68,0x00,0x01,0x02]
 
-v_max_i32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x24]
+v_mad_u32_u24 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x69,0x00,0x01,0x02]
 
-v_max_i32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x25]
+v_mad_u32_u24 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x6a,0x00,0x01,0x02]
 
-v_max_i32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x24]
+v_mad_u32_u24 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x6b,0x00,0x01,0x02]
 
-v_max_i32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x24]
+v_mad_u32_u24 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x6c,0x00,0x01,0x02]
 
-v_max_i32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x24]
+v_mad_u32_u24 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x6d,0x00,0x01,0x02]
 
-v_max_i32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x24]
+v_mad_u32_u24 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x6e,0x00,0x01,0x02]
 
-v_max_i32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x24]
+v_mad_u32_u24 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x6f,0x00,0x01,0x02]
 
-v_max_i32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x24]
+v_mad_u32_u24 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x7b,0x00,0x01,0x02]
 
-v_max_i32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x24]
+v_mad_u32_u24 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x7c,0x00,0x01,0x02]
 
-v_max_i32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x24]
+v_mad_u32_u24 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x7e,0x00,0x01,0x02]
 
-v_max_i32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x24]
+v_mad_u32_u24 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x7f,0x00,0x01,0x02]
 
-v_max_i32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x24]
+v_mad_u32_u24 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0x00,0x01,0x02]
 
-v_max_i32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x24]
+v_mad_u32_u24 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0xc1,0x00,0x01,0x02]
 
-v_max_i32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x24]
+v_mad_u32_u24 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0xf0,0x00,0x01,0x02]
 
-v_max_i32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x24]
+v_mad_u32_u24 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0xf7,0x00,0x01,0x02]
 
-v_max_i32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x24]
+v_mad_u32_u24 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x01,0x01,0x01,0x02]
 
-v_max_i32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x24]
+v_mad_u32_u24 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0xff,0x01,0x01,0x02]
 
-v_max_i32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x24]
+v_mad_u32_u24 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x01,0x82,0x01,0x02]
 
-v_max_i32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x24]
+v_mad_u32_u24 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x01,0xe0,0x01,0x02]
 
-v_max_i32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x24,0x56,0x34,0x12,0xaf]
+v_mad_u32_u24 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x01,0xee,0x01,0x02]
 
-v_max_i32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x24,0x73,0x72,0x71,0x3f]
+v_mad_u32_u24 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x01,0x04,0x02,0x02]
 
-v_max_i32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x24]
+v_mad_u32_u24 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x01,0xfe,0x03,0x02]
 
-v_max_i32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x24]
+v_mad_u32_u24 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0x86,0xd2,0x01,0x00,0x05,0x03]
 
-v_max_i32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x24]
+v_mad_u32_u24 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0x86,0xd2,0x01,0x00,0xc1,0x03]
 
-v_max_i32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x24,0xd2,0x00,0x00,0x00,0x00]
+v_mad_u32_u24 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0x86,0xd2,0x01,0x00,0xdd,0x03]
 
-v_max_i32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x24,0xd2,0x00,0x00,0x00,0x00]
+v_mad_u32_u24 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0x86,0xd2,0x01,0x00,0x0d,0x04]
 
-v_max_i32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x24,0xd2,0x80,0x00,0x00,0x00]
+v_mad_u32_u24 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0x86,0xd2,0x01,0x00,0xfd,0x07]
 
-v_max_i32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x24,0xd2,0xc1,0x00,0x00,0x00]
+v_cubeid_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_i32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x24,0xd2,0xf0,0x00,0x00,0x00]
+v_cubeid_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0x88,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_i32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x24,0xd2,0xf7,0x00,0x00,0x00]
+v_cubeid_f32 v5, s103, v2, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x67,0x04,0x0e,0x04]
 
-v_max_i32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x24,0xd2,0x00,0x01,0x00,0x00]
+v_cubeid_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x68,0x04,0x0e,0x04]
 
-v_max_i32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x24,0xd2,0xff,0x01,0x00,0x00]
+v_cubeid_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x69,0x04,0x0e,0x04]
 
-v_max_i32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x24,0xd2,0x00,0x00,0x01,0x00]
+v_cubeid_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x6a,0x04,0x0e,0x04]
 
-v_max_i32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x24,0xd2,0x00,0x82,0x01,0x00]
+v_cubeid_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x6b,0x04,0x0e,0x04]
 
-v_max_i32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x24,0xd2,0x00,0xe0,0x01,0x00]
+v_cubeid_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x6c,0x04,0x0e,0x04]
 
-v_max_i32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x24,0xd2,0x00,0xee,0x01,0x00]
+v_cubeid_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x6d,0x04,0x0e,0x04]
 
-v_max_i32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x24,0xd2,0x00,0x00,0x02,0x00]
+v_cubeid_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x6e,0x04,0x0e,0x04]
 
-v_max_i32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x24,0xd2,0x00,0xfe,0x03,0x00]
+v_cubeid_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x6f,0x04,0x0e,0x04]
 
-v_min_u32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x26]
+v_cubeid_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x7b,0x04,0x0e,0x04]
 
-v_min_u32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x27]
+v_cubeid_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x7c,0x04,0x0e,0x04]
 
-v_min_u32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x26]
+v_cubeid_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x7e,0x04,0x0e,0x04]
 
-v_min_u32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x26]
+v_cubeid_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x7f,0x04,0x0e,0x04]
 
-v_min_u32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x26]
+v_cubeid_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0xfd,0x04,0x0e,0x04]
 
-v_min_u32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x26]
+v_cubeid_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0x05,0x0e,0x04]
 
-v_min_u32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x26]
+v_cubeid_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0xff,0x05,0x0e,0x04]
 
-v_min_u32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x26]
+v_cubeid_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0xfe,0x0f,0x04]
 
-v_min_u32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x26]
+v_cubeid_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0x04,0xfe,0x07]
 
-v_min_u32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x26]
+v_cubeid_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0x04,0x0e,0x24]
 
-v_min_u32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x26]
+v_cubeid_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0x04,0x0e,0x44]
 
-v_min_u32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x26]
+v_cubeid_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0x04,0x0e,0x84]
 
-v_min_u32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x26]
+v_cubeid_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0x04,0x0e,0xe4]
 
-v_min_u32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x26]
+v_cubeid_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0x88,0xd2,0x01,0x04,0x0e,0x04]
 
-v_min_u32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x26]
+v_cubeid_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0x88,0xd2,0x01,0x04,0x0e,0x04]
 
-v_min_u32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x26]
+v_cubeid_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0x88,0xd2,0x01,0x04,0x0e,0x04]
 
-v_min_u32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x26]
+v_cubeid_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0x88,0xd2,0x01,0x04,0x0e,0x04]
 
-v_min_u32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x26]
+v_cubeid_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x08,0x88,0xd2,0x01,0x04,0x0e,0x04]
 
-v_min_u32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x26]
+v_cubeid_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0x04,0x0e,0x0c]
 
-v_min_u32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x26,0x56,0x34,0x12,0xaf]
+v_cubeid_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0x04,0x0e,0x14]
 
-v_min_u32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x26,0x73,0x72,0x71,0x3f]
+v_cubeid_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0x04,0x0e,0x1c]
 
-v_min_u32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x26]
+v_cubesc_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x01,0x04,0x0e,0x04]
 
-v_min_u32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x26]
+v_cubesc_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0x8a,0xd2,0x01,0x04,0x0e,0x04]
 
-v_min_u32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x26]
+v_cubesc_f32 v5, s103, v2, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x67,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x26,0xd2,0x00,0x00,0x00,0x00]
+v_cubesc_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x68,0x04,0x0e,0x04]
 
-v_min_u32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x26,0xd2,0x00,0x00,0x00,0x00]
+v_cubesc_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x69,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x26,0xd2,0x80,0x00,0x00,0x00]
+v_cubesc_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x6a,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x26,0xd2,0xc1,0x00,0x00,0x00]
+v_cubesc_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x6b,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x26,0xd2,0xf0,0x00,0x00,0x00]
+v_cubesc_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x6c,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x26,0xd2,0xf7,0x00,0x00,0x00]
+v_cubesc_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x6d,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x26,0xd2,0x00,0x01,0x00,0x00]
+v_cubesc_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x6e,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x26,0xd2,0xff,0x01,0x00,0x00]
+v_cubesc_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x6f,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x26,0xd2,0x00,0x00,0x01,0x00]
+v_cubesc_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x7b,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x26,0xd2,0x00,0x82,0x01,0x00]
+v_cubesc_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x7c,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x26,0xd2,0x00,0xe0,0x01,0x00]
+v_cubesc_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x7e,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x26,0xd2,0x00,0xee,0x01,0x00]
+v_cubesc_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x7f,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x26,0xd2,0x00,0x00,0x02,0x00]
+v_cubesc_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0xfd,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x26,0xd2,0x00,0xfe,0x03,0x00]
+v_cubesc_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x01,0x05,0x0e,0x04]
 
-v_max_u32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x28]
+v_cubesc_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0xff,0x05,0x0e,0x04]
 
-v_max_u32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x29]
+v_cubesc_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x01,0xfe,0x0f,0x04]
 
-v_max_u32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x28]
+v_cubesc_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0x8a,0xd2,0x01,0x04,0xfe,0x07]
 
-v_max_u32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x28]
+v_cubesc_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x01,0x04,0x0e,0x24]
 
-v_max_u32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x28]
+v_cubesc_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x01,0x04,0x0e,0x44]
 
-v_max_u32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x28]
+v_cubesc_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x01,0x04,0x0e,0x84]
 
-v_max_u32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x28]
+v_cubesc_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0x8a,0xd2,0x01,0x04,0x0e,0xe4]
 
-v_max_u32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x28]
+v_cubesc_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0x8a,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_u32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x28]
+v_cubesc_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0x8a,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_u32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x28]
+v_cubesc_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0x8a,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_u32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x28]
+v_cubesc_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0x8a,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_u32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x28]
+v_cubesc_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x08,0x8a,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_u32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x28]
+v_cubesc_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0x8a,0xd2,0x01,0x04,0x0e,0x0c]
 
-v_max_u32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x28]
+v_cubesc_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0x8a,0xd2,0x01,0x04,0x0e,0x14]
 
-v_max_u32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x28]
+v_cubesc_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0x8a,0xd2,0x01,0x04,0x0e,0x1c]
 
-v_max_u32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x28]
+v_cubetc_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_u32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x28]
+v_cubetc_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0x8c,0xd2,0x01,0x04,0x0e,0x04]
 
-v_max_u32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x28]
+v_cubetc_f32 v5, s103, v2, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x67,0x04,0x0e,0x04]
 
-v_max_u32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x28]
+v_cubetc_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x68,0x04,0x0e,0x04]
 
-v_max_u32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x28,0x56,0x34,0x12,0xaf]
+v_cubetc_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x69,0x04,0x0e,0x04]
 
-v_max_u32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x28,0x73,0x72,0x71,0x3f]
+v_cubetc_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x6a,0x04,0x0e,0x04]
 
-v_max_u32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x28]
+v_cubetc_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x6b,0x04,0x0e,0x04]
 
-v_max_u32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x28]
+v_cubetc_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x6c,0x04,0x0e,0x04]
 
-v_max_u32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x28]
+v_cubetc_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x6d,0x04,0x0e,0x04]
 
-v_max_u32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x28,0xd2,0x00,0x00,0x00,0x00]
+v_cubetc_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x6e,0x04,0x0e,0x04]
 
-v_max_u32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x28,0xd2,0x00,0x00,0x00,0x00]
+v_cubetc_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x6f,0x04,0x0e,0x04]
 
-v_max_u32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x28,0xd2,0x80,0x00,0x00,0x00]
+v_cubetc_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x7b,0x04,0x0e,0x04]
 
-v_max_u32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x28,0xd2,0xc1,0x00,0x00,0x00]
+v_cubetc_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x7c,0x04,0x0e,0x04]
 
-v_max_u32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x28,0xd2,0xf0,0x00,0x00,0x00]
+v_cubetc_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x7e,0x04,0x0e,0x04]
 
-v_max_u32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x28,0xd2,0xf7,0x00,0x00,0x00]
+v_cubetc_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x7f,0x04,0x0e,0x04]
 
-v_max_u32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x28,0xd2,0x00,0x01,0x00,0x00]
+v_cubetc_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0xfd,0x04,0x0e,0x04]
 
-v_max_u32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x28,0xd2,0xff,0x01,0x00,0x00]
+v_cubetc_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x01,0x05,0x0e,0x04]
 
-v_max_u32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x28,0xd2,0x00,0x00,0x01,0x00]
+v_cubetc_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0xff,0x05,0x0e,0x04]
 
-v_max_u32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x28,0xd2,0x00,0x82,0x01,0x00]
+v_cubetc_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x01,0xfe,0x0f,0x04]
 
-v_max_u32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x28,0xd2,0x00,0xe0,0x01,0x00]
+v_cubetc_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0x8c,0xd2,0x01,0x04,0xfe,0x07]
 
-v_max_u32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x28,0xd2,0x00,0xee,0x01,0x00]
+v_cubetc_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x01,0x04,0x0e,0x24]
 
-v_max_u32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x28,0xd2,0x00,0x00,0x02,0x00]
+v_cubetc_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x01,0x04,0x0e,0x44]
 
-v_max_u32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x28,0xd2,0x00,0xfe,0x03,0x00]
+v_cubetc_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x01,0x04,0x0e,0x84]
 
-v_lshr_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x2a]
+v_cubetc_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0x8c,0xd2,0x01,0x04,0x0e,0xe4]
 
-v_lshr_b32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x2b]
+v_cubetc_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0x8c,0xd2,0x01,0x04,0x0e,0x04]
 
-v_lshr_b32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x2a]
+v_cubetc_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0x8c,0xd2,0x01,0x04,0x0e,0x04]
 
-v_lshr_b32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x2a]
+v_cubetc_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0x8c,0xd2,0x01,0x04,0x0e,0x04]
 
-v_lshr_b32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x2a]
+v_cubetc_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0x8c,0xd2,0x01,0x04,0x0e,0x04]
 
-v_lshr_b32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x2a]
+v_cubetc_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x08,0x8c,0xd2,0x01,0x04,0x0e,0x04]
 
-v_lshr_b32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x2a]
+v_cubetc_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0x8c,0xd2,0x01,0x04,0x0e,0x0c]
 
-v_lshr_b32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x2a]
+v_cubetc_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0x8c,0xd2,0x01,0x04,0x0e,0x14]
 
-v_lshr_b32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x2a]
+v_cubetc_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0x8c,0xd2,0x01,0x04,0x0e,0x1c]
 
-v_lshr_b32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x2a]
+v_cubema_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x01,0x04,0x0e,0x04]
 
-v_lshr_b32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x2a]
+v_cubema_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0x8e,0xd2,0x01,0x04,0x0e,0x04]
 
-v_lshr_b32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x2a]
+v_cubema_f32 v5, s103, v2, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x67,0x04,0x0e,0x04]
 
-v_lshr_b32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x2a]
+v_cubema_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x68,0x04,0x0e,0x04]
 
-v_lshr_b32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x2a]
+v_cubema_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x69,0x04,0x0e,0x04]
 
-v_lshr_b32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x2a]
+v_cubema_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x6a,0x04,0x0e,0x04]
 
-v_lshr_b32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x2a]
+v_cubema_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x6b,0x04,0x0e,0x04]
 
-v_lshr_b32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x2a]
+v_cubema_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x6c,0x04,0x0e,0x04]
 
-v_lshr_b32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x2a]
+v_cubema_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x6d,0x04,0x0e,0x04]
 
-v_lshr_b32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x2a]
+v_cubema_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x6e,0x04,0x0e,0x04]
 
-v_lshr_b32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x2a,0x56,0x34,0x12,0xaf]
+v_cubema_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x6f,0x04,0x0e,0x04]
 
-v_lshr_b32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x2a,0x73,0x72,0x71,0x3f]
+v_cubema_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x7b,0x04,0x0e,0x04]
 
-v_lshr_b32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x2a]
+v_cubema_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x7c,0x04,0x0e,0x04]
 
-v_lshr_b32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x2a]
+v_cubema_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x7e,0x04,0x0e,0x04]
 
-v_lshr_b32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x2a]
+v_cubema_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x7f,0x04,0x0e,0x04]
 
-v_lshr_b32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x2a,0xd2,0x00,0x00,0x00,0x00]
+v_cubema_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0xfd,0x04,0x0e,0x04]
 
-v_lshr_b32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x2a,0xd2,0x00,0x00,0x00,0x00]
+v_cubema_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x01,0x05,0x0e,0x04]
 
-v_lshr_b32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x2a,0xd2,0x80,0x00,0x00,0x00]
+v_cubema_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0xff,0x05,0x0e,0x04]
 
-v_lshr_b32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x2a,0xd2,0xc1,0x00,0x00,0x00]
+v_cubema_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x01,0xfe,0x0f,0x04]
 
-v_lshr_b32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x2a,0xd2,0xf0,0x00,0x00,0x00]
+v_cubema_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0x8e,0xd2,0x01,0x04,0xfe,0x07]
 
-v_lshr_b32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x2a,0xd2,0xf7,0x00,0x00,0x00]
+v_cubema_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x01,0x04,0x0e,0x24]
 
-v_lshr_b32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x2a,0xd2,0x00,0x01,0x00,0x00]
+v_cubema_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x01,0x04,0x0e,0x44]
 
-v_lshr_b32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x2a,0xd2,0xff,0x01,0x00,0x00]
+v_cubema_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x01,0x04,0x0e,0x84]
 
-v_lshr_b32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x2a,0xd2,0x00,0x00,0x01,0x00]
+v_cubema_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0x8e,0xd2,0x01,0x04,0x0e,0xe4]
 
-v_lshr_b32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x2a,0xd2,0x00,0x82,0x01,0x00]
+v_cubema_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0x8e,0xd2,0x01,0x04,0x0e,0x04]
 
-v_lshr_b32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x2a,0xd2,0x00,0xe0,0x01,0x00]
+v_cubema_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0x8e,0xd2,0x01,0x04,0x0e,0x04]
 
-v_lshr_b32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x2a,0xd2,0x00,0xee,0x01,0x00]
+v_cubema_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0x8e,0xd2,0x01,0x04,0x0e,0x04]
 
-v_lshr_b32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x2a,0xd2,0x00,0x00,0x02,0x00]
+v_cubema_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0x8e,0xd2,0x01,0x04,0x0e,0x04]
 
-v_lshr_b32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x2a,0xd2,0x00,0xfe,0x03,0x00]
+v_cubema_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x08,0x8e,0xd2,0x01,0x04,0x0e,0x04]
 
-v_lshrrev_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x2c]
+v_cubema_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0x8e,0xd2,0x01,0x04,0x0e,0x0c]
 
-v_lshrrev_b32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x2d]
+v_cubema_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0x8e,0xd2,0x01,0x04,0x0e,0x14]
 
-v_lshrrev_b32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x2c]
+v_cubema_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0x8e,0xd2,0x01,0x04,0x0e,0x1c]
 
-v_lshrrev_b32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x2c]
+v_bfe_u32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x01,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x2c]
+v_bfe_u32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0x90,0xd2,0x01,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x2c]
+v_bfe_u32 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x67,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x2c]
+v_bfe_u32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x68,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x2c]
+v_bfe_u32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x69,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x2c]
+v_bfe_u32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x6a,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x2c]
+v_bfe_u32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x6b,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x2c]
+v_bfe_u32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x6c,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x2c]
+v_bfe_u32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x6d,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x2c]
+v_bfe_u32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x6e,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x2c]
+v_bfe_u32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x6f,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x2c]
+v_bfe_u32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x7b,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x2c]
+v_bfe_u32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x7c,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x2c]
+v_bfe_u32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x7e,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x2c]
+v_bfe_u32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x7f,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x2c]
+v_bfe_u32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x80,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x2c,0x56,0x34,0x12,0xaf]
+v_bfe_u32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0xc1,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x2c,0x73,0x72,0x71,0x3f]
+v_bfe_u32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0xf0,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x2c]
+v_bfe_u32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0xf7,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x2c]
+v_bfe_u32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x01,0x01,0x01,0x02]
 
-v_lshrrev_b32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x2c]
+v_bfe_u32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0xff,0x01,0x01,0x02]
 
-v_lshrrev_b32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x2c,0xd2,0x00,0x00,0x00,0x00]
+v_bfe_u32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x01,0x82,0x01,0x02]
 
-v_lshrrev_b32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x2c,0xd2,0x00,0x00,0x00,0x00]
+v_bfe_u32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x01,0xe0,0x01,0x02]
 
-v_lshrrev_b32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x2c,0xd2,0x80,0x00,0x00,0x00]
+v_bfe_u32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x01,0xee,0x01,0x02]
 
-v_lshrrev_b32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x2c,0xd2,0xc1,0x00,0x00,0x00]
+v_bfe_u32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x01,0x04,0x02,0x02]
 
-v_lshrrev_b32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x2c,0xd2,0xf0,0x00,0x00,0x00]
+v_bfe_u32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x01,0xfe,0x03,0x02]
 
-v_lshrrev_b32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x2c,0xd2,0xf7,0x00,0x00,0x00]
+v_bfe_u32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0x90,0xd2,0x01,0x00,0x05,0x03]
 
-v_lshrrev_b32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x2c,0xd2,0x00,0x01,0x00,0x00]
+v_bfe_u32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0x90,0xd2,0x01,0x00,0xc1,0x03]
 
-v_lshrrev_b32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x2c,0xd2,0xff,0x01,0x00,0x00]
+v_bfe_u32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0x90,0xd2,0x01,0x00,0xdd,0x03]
 
-v_lshrrev_b32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x2c,0xd2,0x00,0x00,0x01,0x00]
+v_bfe_u32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0x90,0xd2,0x01,0x00,0x0d,0x04]
 
-v_lshrrev_b32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x2c,0xd2,0x00,0x82,0x01,0x00]
+v_bfe_u32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0x90,0xd2,0x01,0x00,0xfd,0x07]
 
-v_lshrrev_b32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x2c,0xd2,0x00,0xe0,0x01,0x00]
+v_bfe_i32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x01,0x00,0x01,0x02]
 
-v_lshrrev_b32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x2c,0xd2,0x00,0xee,0x01,0x00]
+v_bfe_i32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0x92,0xd2,0x01,0x00,0x01,0x02]
 
-v_lshrrev_b32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x2c,0xd2,0x00,0x00,0x02,0x00]
+v_bfe_i32 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x67,0x00,0x01,0x02]
 
-v_lshrrev_b32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x2c,0xd2,0x00,0xfe,0x03,0x00]
+v_bfe_i32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x68,0x00,0x01,0x02]
 
-v_ashr_i32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x2e]
+v_bfe_i32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x69,0x00,0x01,0x02]
 
-v_ashr_i32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x2f]
+v_bfe_i32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x6a,0x00,0x01,0x02]
 
-v_ashr_i32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x2e]
+v_bfe_i32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x6b,0x00,0x01,0x02]
 
-v_ashr_i32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x2e]
+v_bfe_i32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x6c,0x00,0x01,0x02]
 
-v_ashr_i32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x2e]
+v_bfe_i32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x6d,0x00,0x01,0x02]
 
-v_ashr_i32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x2e]
+v_bfe_i32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x6e,0x00,0x01,0x02]
 
-v_ashr_i32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x2e]
+v_bfe_i32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x6f,0x00,0x01,0x02]
 
-v_ashr_i32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x2e]
+v_bfe_i32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x7b,0x00,0x01,0x02]
 
-v_ashr_i32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x2e]
+v_bfe_i32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x7c,0x00,0x01,0x02]
 
-v_ashr_i32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x2e]
+v_bfe_i32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x7e,0x00,0x01,0x02]
 
-v_ashr_i32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x2e]
+v_bfe_i32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x7f,0x00,0x01,0x02]
 
-v_ashr_i32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x2e]
+v_bfe_i32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0x00,0x01,0x02]
 
-v_ashr_i32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x2e]
+v_bfe_i32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0xc1,0x00,0x01,0x02]
 
-v_ashr_i32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x2e]
+v_bfe_i32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0xf0,0x00,0x01,0x02]
 
-v_ashr_i32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x2e]
+v_bfe_i32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0xf7,0x00,0x01,0x02]
 
-v_ashr_i32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x2e]
+v_bfe_i32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x01,0x01,0x01,0x02]
 
-v_ashr_i32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x2e]
+v_bfe_i32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0xff,0x01,0x01,0x02]
 
-v_ashr_i32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x2e]
+v_bfe_i32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x01,0x82,0x01,0x02]
 
-v_ashr_i32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x2e]
+v_bfe_i32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x01,0xe0,0x01,0x02]
 
-v_ashr_i32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x2e,0x56,0x34,0x12,0xaf]
+v_bfe_i32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x01,0xee,0x01,0x02]
 
-v_ashr_i32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x2e,0x73,0x72,0x71,0x3f]
+v_bfe_i32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x01,0x04,0x02,0x02]
 
-v_ashr_i32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x2e]
+v_bfe_i32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x01,0xfe,0x03,0x02]
 
-v_ashr_i32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x2e]
+v_bfe_i32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0x92,0xd2,0x01,0x00,0x05,0x03]
 
-v_ashr_i32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x2e]
+v_bfe_i32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0x92,0xd2,0x01,0x00,0xc1,0x03]
 
-v_ashr_i32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x2e,0xd2,0x00,0x00,0x00,0x00]
+v_bfe_i32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0x92,0xd2,0x01,0x00,0xdd,0x03]
 
-v_ashr_i32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x2e,0xd2,0x00,0x00,0x00,0x00]
+v_bfe_i32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0x92,0xd2,0x01,0x00,0x0d,0x04]
 
-v_ashr_i32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x2e,0xd2,0x80,0x00,0x00,0x00]
+v_bfe_i32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0x92,0xd2,0x01,0x00,0xfd,0x07]
 
-v_ashr_i32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x2e,0xd2,0xc1,0x00,0x00,0x00]
+v_bfi_b32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0x00,0x01,0x02]
 
-v_ashr_i32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x2e,0xd2,0xf0,0x00,0x00,0x00]
+v_bfi_b32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0x94,0xd2,0x01,0x00,0x01,0x02]
 
-v_ashr_i32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x2e,0xd2,0xf7,0x00,0x00,0x00]
+v_bfi_b32 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x67,0x00,0x01,0x02]
 
-v_ashr_i32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x2e,0xd2,0x00,0x01,0x00,0x00]
+v_bfi_b32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x68,0x00,0x01,0x02]
 
-v_ashr_i32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x2e,0xd2,0xff,0x01,0x00,0x00]
+v_bfi_b32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x69,0x00,0x01,0x02]
 
-v_ashr_i32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x2e,0xd2,0x00,0x00,0x01,0x00]
+v_bfi_b32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x6a,0x00,0x01,0x02]
 
-v_ashr_i32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x2e,0xd2,0x00,0x82,0x01,0x00]
+v_bfi_b32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x6b,0x00,0x01,0x02]
 
-v_ashr_i32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x2e,0xd2,0x00,0xe0,0x01,0x00]
+v_bfi_b32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x6c,0x00,0x01,0x02]
 
-v_ashr_i32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x2e,0xd2,0x00,0xee,0x01,0x00]
+v_bfi_b32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x6d,0x00,0x01,0x02]
 
-v_ashr_i32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x2e,0xd2,0x00,0x00,0x02,0x00]
+v_bfi_b32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x6e,0x00,0x01,0x02]
 
-v_ashr_i32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x2e,0xd2,0x00,0xfe,0x03,0x00]
+v_bfi_b32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x6f,0x00,0x01,0x02]
 
-v_ashrrev_i32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x30]
+v_bfi_b32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x7b,0x00,0x01,0x02]
 
-v_ashrrev_i32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x31]
+v_bfi_b32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x7c,0x00,0x01,0x02]
 
-v_ashrrev_i32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x30]
+v_bfi_b32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x7e,0x00,0x01,0x02]
 
-v_ashrrev_i32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x30]
+v_bfi_b32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x7f,0x00,0x01,0x02]
 
-v_ashrrev_i32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x30]
+v_bfi_b32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x80,0x00,0x01,0x02]
 
-v_ashrrev_i32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x30]
+v_bfi_b32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0xc1,0x00,0x01,0x02]
 
-v_ashrrev_i32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x30]
+v_bfi_b32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0xf0,0x00,0x01,0x02]
 
-v_ashrrev_i32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x30]
+v_bfi_b32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0xf7,0x00,0x01,0x02]
 
-v_ashrrev_i32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x30]
+v_bfi_b32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0x01,0x01,0x02]
 
-v_ashrrev_i32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x30]
+v_bfi_b32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0xff,0x01,0x01,0x02]
 
-v_ashrrev_i32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x30]
+v_bfi_b32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0x82,0x01,0x02]
 
-v_ashrrev_i32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x30]
+v_bfi_b32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0xe0,0x01,0x02]
 
-v_ashrrev_i32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x30]
+v_bfi_b32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0xee,0x01,0x02]
 
-v_ashrrev_i32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x30]
+v_bfi_b32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0x04,0x02,0x02]
 
-v_ashrrev_i32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x30]
+v_bfi_b32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0xfe,0x03,0x02]
 
-v_ashrrev_i32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x30]
+v_bfi_b32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0x00,0x05,0x03]
 
-v_ashrrev_i32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x30]
+v_bfi_b32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0x00,0xc1,0x03]
 
-v_ashrrev_i32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x30]
+v_bfi_b32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0x00,0xdd,0x03]
 
-v_ashrrev_i32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x30]
+v_bfi_b32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0x00,0x0d,0x04]
 
-v_ashrrev_i32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x30,0x56,0x34,0x12,0xaf]
+v_bfi_b32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0x00,0xfd,0x07]
 
-v_ashrrev_i32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x30,0x73,0x72,0x71,0x3f]
+v_fma_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0x04,0x0e,0x04]
 
-v_ashrrev_i32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x30]
+v_fma_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0x96,0xd2,0x01,0x04,0x0e,0x04]
 
-v_ashrrev_i32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x30]
+v_fma_f32 v5, s103, v2, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x67,0x04,0x0e,0x04]
 
-v_ashrrev_i32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x30]
+v_fma_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x68,0x04,0x0e,0x04]
 
-v_ashrrev_i32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x30,0xd2,0x00,0x00,0x00,0x00]
+v_fma_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x69,0x04,0x0e,0x04]
 
-v_ashrrev_i32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x30,0xd2,0x00,0x00,0x00,0x00]
+v_fma_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x6a,0x04,0x0e,0x04]
 
-v_ashrrev_i32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x30,0xd2,0x80,0x00,0x00,0x00]
+v_fma_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x6b,0x04,0x0e,0x04]
 
-v_ashrrev_i32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x30,0xd2,0xc1,0x00,0x00,0x00]
+v_fma_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x6c,0x04,0x0e,0x04]
 
-v_ashrrev_i32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x30,0xd2,0xf0,0x00,0x00,0x00]
+v_fma_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x6d,0x04,0x0e,0x04]
 
-v_ashrrev_i32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x30,0xd2,0xf7,0x00,0x00,0x00]
+v_fma_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x6e,0x04,0x0e,0x04]
 
-v_ashrrev_i32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x30,0xd2,0x00,0x01,0x00,0x00]
+v_fma_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x6f,0x04,0x0e,0x04]
 
-v_ashrrev_i32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x30,0xd2,0xff,0x01,0x00,0x00]
+v_fma_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x7b,0x04,0x0e,0x04]
 
-v_ashrrev_i32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x30,0xd2,0x00,0x00,0x01,0x00]
+v_fma_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x7c,0x04,0x0e,0x04]
 
-v_ashrrev_i32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x30,0xd2,0x00,0x82,0x01,0x00]
+v_fma_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x7e,0x04,0x0e,0x04]
 
-v_ashrrev_i32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x30,0xd2,0x00,0xe0,0x01,0x00]
+v_fma_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x7f,0x04,0x0e,0x04]
 
-v_ashrrev_i32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x30,0xd2,0x00,0xee,0x01,0x00]
+v_fma_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0xfd,0x04,0x0e,0x04]
 
-v_ashrrev_i32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x30,0xd2,0x00,0x00,0x02,0x00]
+v_fma_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0x05,0x0e,0x04]
 
-v_ashrrev_i32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x30,0xd2,0x00,0xfe,0x03,0x00]
+v_fma_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0xff,0x05,0x0e,0x04]
 
-v_lshl_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x32]
+v_fma_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0xfe,0x0f,0x04]
 
-v_lshl_b32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x33]
+v_fma_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0x04,0xfe,0x07]
 
-v_lshl_b32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x32]
+v_fma_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0x04,0x0e,0x24]
 
-v_lshl_b32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x32]
+v_fma_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0x04,0x0e,0x44]
 
-v_lshl_b32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x32]
+v_fma_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0x04,0x0e,0x84]
 
-v_lshl_b32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x32]
+v_fma_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0x04,0x0e,0xe4]
 
-v_lshl_b32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x32]
+v_fma_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0x96,0xd2,0x01,0x04,0x0e,0x04]
 
-v_lshl_b32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x32]
+v_fma_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0x96,0xd2,0x01,0x04,0x0e,0x04]
 
-v_lshl_b32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x32]
+v_fma_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0x96,0xd2,0x01,0x04,0x0e,0x04]
 
-v_lshl_b32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x32]
+v_fma_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0x96,0xd2,0x01,0x04,0x0e,0x04]
 
-v_lshl_b32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x32]
+v_fma_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x08,0x96,0xd2,0x01,0x04,0x0e,0x04]
 
-v_lshl_b32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x32]
+v_fma_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0x04,0x0e,0x0c]
 
-v_lshl_b32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x32]
+v_fma_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0x04,0x0e,0x14]
 
-v_lshl_b32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x32]
+v_fma_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0x04,0x0e,0x1c]
 
-v_lshl_b32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x32]
+v_fma_f64 v[5:6], s[2:3], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0x98,0xd2,0x02,0x04,0x0e,0x04]
 
-v_lshl_b32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x32]
+v_fma_f64 v[254:255], s[2:3], v[2:3], v[3:4]
+// CHECK: [0xfe,0x00,0x98,0xd2,0x02,0x04,0x0e,0x04]
 
-v_lshl_b32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x32]
+v_fma_f64 v[5:6], s[4:5], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0x98,0xd2,0x04,0x04,0x0e,0x04]
 
-v_lshl_b32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x32]
+v_fma_f64 v[5:6], s[102:103], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0x98,0xd2,0x66,0x04,0x0e,0x04]
 
-v_lshl_b32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x32]
+v_fma_f64 v[5:6], flat_scratch, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0x98,0xd2,0x68,0x04,0x0e,0x04]
 
-v_lshl_b32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x32,0x56,0x34,0x12,0xaf]
+v_fma_f64 v[5:6], vcc, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0x98,0xd2,0x6a,0x04,0x0e,0x04]
 
-v_lshl_b32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x32,0x73,0x72,0x71,0x3f]
+v_fma_f64 v[5:6], tba, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0x98,0xd2,0x6c,0x04,0x0e,0x04]
 
-v_lshl_b32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x32]
+v_fma_f64 v[5:6], tma, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0x98,0xd2,0x6e,0x04,0x0e,0x04]
 
-v_lshl_b32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x32]
+v_fma_f64 v[5:6], ttmp[10:11], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0x98,0xd2,0x7a,0x04,0x0e,0x04]
 
-v_lshl_b32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x32]
+v_fma_f64 v[5:6], exec, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0x98,0xd2,0x7e,0x04,0x0e,0x04]
 
-v_lshl_b32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x32,0xd2,0x00,0x00,0x00,0x00]
+v_fma_f64 v[5:6], scc, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0x98,0xd2,0xfd,0x04,0x0e,0x04]
 
-v_lshl_b32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x32,0xd2,0x00,0x00,0x00,0x00]
+v_fma_f64 v[5:6], v[1:2], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0x98,0xd2,0x01,0x05,0x0e,0x04]
 
-v_lshl_b32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x32,0xd2,0x80,0x00,0x00,0x00]
+v_fma_f64 v[5:6], v[254:255], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0x98,0xd2,0xfe,0x05,0x0e,0x04]
 
-v_lshl_b32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x32,0xd2,0xc1,0x00,0x00,0x00]
+v_fma_f64 v[5:6], s[2:3], v[254:255], v[3:4]
+// CHECK: [0x05,0x00,0x98,0xd2,0x02,0xfc,0x0f,0x04]
 
-v_lshl_b32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x32,0xd2,0xf0,0x00,0x00,0x00]
+v_fma_f64 v[5:6], s[2:3], v[2:3], v[254:255]
+// CHECK: [0x05,0x00,0x98,0xd2,0x02,0x04,0xfa,0x07]
 
-v_lshl_b32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x32,0xd2,0xf7,0x00,0x00,0x00]
+v_fma_f64 v[5:6], -s[2:3], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0x98,0xd2,0x02,0x04,0x0e,0x24]
 
-v_lshl_b32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x32,0xd2,0x00,0x01,0x00,0x00]
+v_fma_f64 v[5:6], s[2:3], -v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0x98,0xd2,0x02,0x04,0x0e,0x44]
 
-v_lshl_b32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x32,0xd2,0xff,0x01,0x00,0x00]
+v_fma_f64 v[5:6], s[2:3], v[2:3], -v[3:4]
+// CHECK: [0x05,0x00,0x98,0xd2,0x02,0x04,0x0e,0x84]
 
-v_lshl_b32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x32,0xd2,0x00,0x00,0x01,0x00]
+v_fma_f64 v[5:6], -s[2:3], -v[2:3], -v[3:4]
+// CHECK: [0x05,0x00,0x98,0xd2,0x02,0x04,0x0e,0xe4]
 
-v_lshl_b32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x32,0xd2,0x00,0x82,0x01,0x00]
+v_fma_f64 v[5:6], |s[2:3]|, v[2:3], v[3:4]
+// CHECK: [0x05,0x01,0x98,0xd2,0x02,0x04,0x0e,0x04]
 
-v_lshl_b32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x32,0xd2,0x00,0xe0,0x01,0x00]
+v_fma_f64 v[5:6], s[2:3], |v[2:3]|, v[3:4]
+// CHECK: [0x05,0x02,0x98,0xd2,0x02,0x04,0x0e,0x04]
 
-v_lshl_b32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x32,0xd2,0x00,0xee,0x01,0x00]
+v_fma_f64 v[5:6], s[2:3], v[2:3], |v[3:4]|
+// CHECK: [0x05,0x04,0x98,0xd2,0x02,0x04,0x0e,0x04]
 
-v_lshl_b32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x32,0xd2,0x00,0x00,0x02,0x00]
+v_fma_f64 v[5:6], |s[2:3]|, |v[2:3]|, |v[3:4]|
+// CHECK: [0x05,0x07,0x98,0xd2,0x02,0x04,0x0e,0x04]
 
-v_lshl_b32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x32,0xd2,0x00,0xfe,0x03,0x00]
+v_fma_f64 v[5:6], s[2:3], v[2:3], v[3:4] clamp
+// CHECK: [0x05,0x08,0x98,0xd2,0x02,0x04,0x0e,0x04]
 
-v_lshlrev_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x34]
+v_fma_f64 v[5:6], s[2:3], v[2:3], v[3:4] mul:2
+// CHECK: [0x05,0x00,0x98,0xd2,0x02,0x04,0x0e,0x0c]
 
-v_lshlrev_b32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x35]
+v_fma_f64 v[5:6], s[2:3], v[2:3], v[3:4] mul:4
+// CHECK: [0x05,0x00,0x98,0xd2,0x02,0x04,0x0e,0x14]
 
-v_lshlrev_b32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x34]
+v_fma_f64 v[5:6], s[2:3], v[2:3], v[3:4] div:2
+// CHECK: [0x05,0x00,0x98,0xd2,0x02,0x04,0x0e,0x1c]
 
-v_lshlrev_b32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x34]
+v_lerp_u8 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0x01,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x34]
+v_lerp_u8 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0x9a,0xd2,0x01,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x34]
+v_lerp_u8 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0x67,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x34]
+v_lerp_u8 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0x68,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x34]
+v_lerp_u8 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0x69,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x34]
+v_lerp_u8 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0x6a,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x34]
+v_lerp_u8 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0x6b,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x34]
+v_lerp_u8 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0x6c,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x34]
+v_lerp_u8 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0x6d,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x34]
+v_lerp_u8 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0x6e,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x34]
+v_lerp_u8 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0x6f,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x34]
+v_lerp_u8 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0x7b,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x34]
+v_lerp_u8 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0x7c,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x34]
+v_lerp_u8 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0x7e,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x34]
+v_lerp_u8 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0x7f,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x34]
+v_lerp_u8 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0x80,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x34,0x56,0x34,0x12,0xaf]
+v_lerp_u8 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0xc1,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x34,0x73,0x72,0x71,0x3f]
+v_lerp_u8 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0x01,0x01,0x01,0x02]
 
-v_lshlrev_b32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x34]
+v_lerp_u8 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0xff,0x01,0x01,0x02]
 
-v_lshlrev_b32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x34]
+v_lerp_u8 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0x01,0x82,0x01,0x02]
 
-v_lshlrev_b32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x34]
+v_lerp_u8 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0x01,0x04,0x02,0x02]
 
-v_lshlrev_b32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x34,0xd2,0x00,0x00,0x00,0x00]
+v_lerp_u8 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0x9a,0xd2,0x01,0xfe,0x03,0x02]
 
-v_lshlrev_b32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x34,0xd2,0x00,0x00,0x00,0x00]
+v_lerp_u8 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0x9a,0xd2,0x01,0x00,0x05,0x03]
 
-v_lshlrev_b32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x34,0xd2,0x80,0x00,0x00,0x00]
+v_lerp_u8 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0x9a,0xd2,0x01,0x00,0x0d,0x04]
 
-v_lshlrev_b32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x34,0xd2,0xc1,0x00,0x00,0x00]
+v_lerp_u8 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0x9a,0xd2,0x01,0x00,0xfd,0x07]
 
-v_lshlrev_b32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x34,0xd2,0xf0,0x00,0x00,0x00]
+v_alignbit_b32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x01,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x34,0xd2,0xf7,0x00,0x00,0x00]
+v_alignbit_b32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0x9c,0xd2,0x01,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x34,0xd2,0x00,0x01,0x00,0x00]
+v_alignbit_b32 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x67,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x34,0xd2,0xff,0x01,0x00,0x00]
+v_alignbit_b32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x68,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x34,0xd2,0x00,0x00,0x01,0x00]
+v_alignbit_b32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x69,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x34,0xd2,0x00,0x82,0x01,0x00]
+v_alignbit_b32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x6a,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x34,0xd2,0x00,0xe0,0x01,0x00]
+v_alignbit_b32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x6b,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x34,0xd2,0x00,0xee,0x01,0x00]
+v_alignbit_b32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x6c,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x34,0xd2,0x00,0x00,0x02,0x00]
+v_alignbit_b32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x6d,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x34,0xd2,0x00,0xfe,0x03,0x00]
+v_alignbit_b32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x6e,0x00,0x01,0x02]
 
-v_and_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x36]
+v_alignbit_b32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x6f,0x00,0x01,0x02]
 
-v_and_b32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x37]
+v_alignbit_b32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x7b,0x00,0x01,0x02]
 
-v_and_b32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x36]
+v_alignbit_b32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x7c,0x00,0x01,0x02]
 
-v_and_b32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x36]
+v_alignbit_b32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x7e,0x00,0x01,0x02]
 
-v_and_b32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x36]
+v_alignbit_b32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x7f,0x00,0x01,0x02]
 
-v_and_b32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x36]
+v_alignbit_b32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x80,0x00,0x01,0x02]
 
-v_and_b32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x36]
+v_alignbit_b32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0xc1,0x00,0x01,0x02]
 
-v_and_b32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x36]
+v_alignbit_b32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0xf0,0x00,0x01,0x02]
 
-v_and_b32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x36]
+v_alignbit_b32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0xf7,0x00,0x01,0x02]
 
-v_and_b32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x36]
+v_alignbit_b32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x01,0x01,0x01,0x02]
 
-v_and_b32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x36]
+v_alignbit_b32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0xff,0x01,0x01,0x02]
 
-v_and_b32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x36]
+v_alignbit_b32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x01,0x82,0x01,0x02]
 
-v_and_b32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x36]
+v_alignbit_b32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x01,0xe0,0x01,0x02]
 
-v_and_b32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x36]
+v_alignbit_b32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x01,0xee,0x01,0x02]
 
-v_and_b32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x36]
+v_alignbit_b32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x01,0x04,0x02,0x02]
 
-v_and_b32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x36]
+v_alignbit_b32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x01,0xfe,0x03,0x02]
 
-v_and_b32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x36]
+v_alignbit_b32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0x9c,0xd2,0x01,0x00,0x05,0x03]
 
-v_and_b32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x36]
+v_alignbit_b32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0x9c,0xd2,0x01,0x00,0xc1,0x03]
 
-v_and_b32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x36]
+v_alignbit_b32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0x9c,0xd2,0x01,0x00,0xdd,0x03]
 
-v_and_b32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x36,0x56,0x34,0x12,0xaf]
+v_alignbit_b32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0x9c,0xd2,0x01,0x00,0x0d,0x04]
 
-v_and_b32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x36,0x73,0x72,0x71,0x3f]
+v_alignbit_b32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0x9c,0xd2,0x01,0x00,0xfd,0x07]
 
-v_and_b32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x36]
+v_alignbyte_b32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x01,0x00,0x01,0x02]
 
-v_and_b32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x36]
+v_alignbyte_b32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0x9e,0xd2,0x01,0x00,0x01,0x02]
 
-v_and_b32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x36]
+v_alignbyte_b32 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x67,0x00,0x01,0x02]
 
-v_and_b32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x36,0xd2,0x00,0x00,0x00,0x00]
+v_alignbyte_b32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x68,0x00,0x01,0x02]
 
-v_and_b32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x36,0xd2,0x00,0x00,0x00,0x00]
+v_alignbyte_b32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x69,0x00,0x01,0x02]
 
-v_and_b32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x36,0xd2,0x80,0x00,0x00,0x00]
+v_alignbyte_b32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x6a,0x00,0x01,0x02]
 
-v_and_b32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x36,0xd2,0xc1,0x00,0x00,0x00]
+v_alignbyte_b32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x6b,0x00,0x01,0x02]
 
-v_and_b32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x36,0xd2,0xf0,0x00,0x00,0x00]
+v_alignbyte_b32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x6c,0x00,0x01,0x02]
 
-v_and_b32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x36,0xd2,0xf7,0x00,0x00,0x00]
+v_alignbyte_b32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x6d,0x00,0x01,0x02]
 
-v_and_b32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x36,0xd2,0x00,0x01,0x00,0x00]
+v_alignbyte_b32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x6e,0x00,0x01,0x02]
 
-v_and_b32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x36,0xd2,0xff,0x01,0x00,0x00]
+v_alignbyte_b32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x6f,0x00,0x01,0x02]
 
-v_and_b32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x36,0xd2,0x00,0x00,0x01,0x00]
+v_alignbyte_b32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x7b,0x00,0x01,0x02]
 
-v_and_b32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x36,0xd2,0x00,0x82,0x01,0x00]
+v_alignbyte_b32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x7c,0x00,0x01,0x02]
 
-v_and_b32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x36,0xd2,0x00,0xe0,0x01,0x00]
+v_alignbyte_b32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x7e,0x00,0x01,0x02]
 
-v_and_b32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x36,0xd2,0x00,0xee,0x01,0x00]
+v_alignbyte_b32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x7f,0x00,0x01,0x02]
 
-v_and_b32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x36,0xd2,0x00,0x00,0x02,0x00]
+v_alignbyte_b32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x80,0x00,0x01,0x02]
 
-v_and_b32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x36,0xd2,0x00,0xfe,0x03,0x00]
+v_alignbyte_b32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0xc1,0x00,0x01,0x02]
 
-v_or_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x38]
+v_alignbyte_b32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0xf0,0x00,0x01,0x02]
 
-v_or_b32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x39]
+v_alignbyte_b32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0xf7,0x00,0x01,0x02]
 
-v_or_b32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x38]
+v_alignbyte_b32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x01,0x01,0x01,0x02]
 
-v_or_b32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x38]
+v_alignbyte_b32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0xff,0x01,0x01,0x02]
 
-v_or_b32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x38]
+v_alignbyte_b32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x01,0x82,0x01,0x02]
 
-v_or_b32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x38]
+v_alignbyte_b32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x01,0xe0,0x01,0x02]
 
-v_or_b32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x38]
+v_alignbyte_b32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x01,0xee,0x01,0x02]
 
-v_or_b32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x38]
+v_alignbyte_b32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x01,0x04,0x02,0x02]
 
-v_or_b32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x38]
+v_alignbyte_b32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x01,0xfe,0x03,0x02]
 
-v_or_b32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x38]
+v_alignbyte_b32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0x9e,0xd2,0x01,0x00,0x05,0x03]
 
-v_or_b32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x38]
+v_alignbyte_b32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0x9e,0xd2,0x01,0x00,0xc1,0x03]
 
-v_or_b32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x38]
+v_alignbyte_b32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0x9e,0xd2,0x01,0x00,0xdd,0x03]
 
-v_or_b32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x38]
+v_alignbyte_b32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0x9e,0xd2,0x01,0x00,0x0d,0x04]
 
-v_or_b32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x38]
+v_alignbyte_b32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0x9e,0xd2,0x01,0x00,0xfd,0x07]
 
-v_or_b32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x38]
+v_mullit_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x01,0x04,0x0e,0x04]
 
-v_or_b32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x38]
+v_mullit_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0xa0,0xd2,0x01,0x04,0x0e,0x04]
 
-v_or_b32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x38]
+v_mullit_f32 v5, s103, v2, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x67,0x04,0x0e,0x04]
 
-v_or_b32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x38]
+v_mullit_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x68,0x04,0x0e,0x04]
 
-v_or_b32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x38]
+v_mullit_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x69,0x04,0x0e,0x04]
 
-v_or_b32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x38,0x56,0x34,0x12,0xaf]
+v_mullit_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x6a,0x04,0x0e,0x04]
 
-v_or_b32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x38,0x73,0x72,0x71,0x3f]
+v_mullit_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x6b,0x04,0x0e,0x04]
 
-v_or_b32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x38]
+v_mullit_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x6c,0x04,0x0e,0x04]
 
-v_or_b32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x38]
+v_mullit_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x6d,0x04,0x0e,0x04]
 
-v_or_b32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x38]
+v_mullit_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x6e,0x04,0x0e,0x04]
 
-v_or_b32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x38,0xd2,0x00,0x00,0x00,0x00]
+v_mullit_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x6f,0x04,0x0e,0x04]
 
-v_or_b32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x38,0xd2,0x00,0x00,0x00,0x00]
+v_mullit_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x7b,0x04,0x0e,0x04]
 
-v_or_b32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x38,0xd2,0x80,0x00,0x00,0x00]
+v_mullit_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x7c,0x04,0x0e,0x04]
 
-v_or_b32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x38,0xd2,0xc1,0x00,0x00,0x00]
+v_mullit_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x7e,0x04,0x0e,0x04]
 
-v_or_b32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x38,0xd2,0xf0,0x00,0x00,0x00]
+v_mullit_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x7f,0x04,0x0e,0x04]
 
-v_or_b32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x38,0xd2,0xf7,0x00,0x00,0x00]
+v_mullit_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0xfd,0x04,0x0e,0x04]
 
-v_or_b32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x38,0xd2,0x00,0x01,0x00,0x00]
+v_mullit_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x01,0x05,0x0e,0x04]
 
-v_or_b32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x38,0xd2,0xff,0x01,0x00,0x00]
+v_mullit_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0xff,0x05,0x0e,0x04]
 
-v_or_b32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x38,0xd2,0x00,0x00,0x01,0x00]
+v_mullit_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x01,0xfe,0x0f,0x04]
 
-v_or_b32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x38,0xd2,0x00,0x82,0x01,0x00]
+v_mullit_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0xa0,0xd2,0x01,0x04,0xfe,0x07]
 
-v_or_b32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x38,0xd2,0x00,0xe0,0x01,0x00]
+v_mullit_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x01,0x04,0x0e,0x24]
 
-v_or_b32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x38,0xd2,0x00,0xee,0x01,0x00]
+v_mullit_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x01,0x04,0x0e,0x44]
 
-v_or_b32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x38,0xd2,0x00,0x00,0x02,0x00]
+v_mullit_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x01,0x04,0x0e,0x84]
 
-v_or_b32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x38,0xd2,0x00,0xfe,0x03,0x00]
+v_mullit_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0xa0,0xd2,0x01,0x04,0x0e,0xe4]
 
-v_xor_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x3a]
+v_mullit_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0xa0,0xd2,0x01,0x04,0x0e,0x04]
 
-v_xor_b32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x3b]
+v_mullit_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0xa0,0xd2,0x01,0x04,0x0e,0x04]
 
-v_xor_b32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x3a]
+v_mullit_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0xa0,0xd2,0x01,0x04,0x0e,0x04]
 
-v_xor_b32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x3a]
+v_mullit_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xa0,0xd2,0x01,0x04,0x0e,0x04]
 
-v_xor_b32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x3a]
+v_mullit_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x08,0xa0,0xd2,0x01,0x04,0x0e,0x04]
 
-v_xor_b32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x3a]
+v_mullit_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0xa0,0xd2,0x01,0x04,0x0e,0x0c]
 
-v_xor_b32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x3a]
+v_mullit_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0xa0,0xd2,0x01,0x04,0x0e,0x14]
 
-v_xor_b32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x3a]
+v_mullit_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0xa0,0xd2,0x01,0x04,0x0e,0x1c]
 
-v_xor_b32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x3a]
+v_min3_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x01,0x04,0x0e,0x04]
 
-v_xor_b32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x3a]
+v_min3_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0xa2,0xd2,0x01,0x04,0x0e,0x04]
 
-v_xor_b32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x3a]
+v_min3_f32 v5, s103, v2, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x67,0x04,0x0e,0x04]
 
-v_xor_b32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x3a]
+v_min3_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x68,0x04,0x0e,0x04]
 
-v_xor_b32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x3a]
+v_min3_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x69,0x04,0x0e,0x04]
 
-v_xor_b32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x3a]
+v_min3_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x6a,0x04,0x0e,0x04]
 
-v_xor_b32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x3a]
+v_min3_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x6b,0x04,0x0e,0x04]
 
-v_xor_b32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x3a]
+v_min3_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x6c,0x04,0x0e,0x04]
 
-v_xor_b32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x3a]
+v_min3_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x6d,0x04,0x0e,0x04]
 
-v_xor_b32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x3a]
+v_min3_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x6e,0x04,0x0e,0x04]
 
-v_xor_b32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x3a]
+v_min3_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x6f,0x04,0x0e,0x04]
 
-v_xor_b32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x3a,0x56,0x34,0x12,0xaf]
+v_min3_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x7b,0x04,0x0e,0x04]
 
-v_xor_b32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x3a,0x73,0x72,0x71,0x3f]
+v_min3_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x7c,0x04,0x0e,0x04]
 
-v_xor_b32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x3a]
+v_min3_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x7e,0x04,0x0e,0x04]
 
-v_xor_b32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x3a]
+v_min3_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x7f,0x04,0x0e,0x04]
 
-v_xor_b32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x3a]
+v_min3_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0xfd,0x04,0x0e,0x04]
 
-v_xor_b32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x3a,0xd2,0x00,0x00,0x00,0x00]
+v_min3_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x01,0x05,0x0e,0x04]
 
-v_xor_b32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x3a,0xd2,0x00,0x00,0x00,0x00]
+v_min3_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0xff,0x05,0x0e,0x04]
 
-v_xor_b32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x3a,0xd2,0x80,0x00,0x00,0x00]
+v_min3_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x01,0xfe,0x0f,0x04]
 
-v_xor_b32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x3a,0xd2,0xc1,0x00,0x00,0x00]
+v_min3_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0xa2,0xd2,0x01,0x04,0xfe,0x07]
 
-v_xor_b32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x3a,0xd2,0xf0,0x00,0x00,0x00]
+v_min3_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x01,0x04,0x0e,0x24]
 
-v_xor_b32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x3a,0xd2,0xf7,0x00,0x00,0x00]
+v_min3_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x01,0x04,0x0e,0x44]
 
-v_xor_b32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x3a,0xd2,0x00,0x01,0x00,0x00]
+v_min3_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x01,0x04,0x0e,0x84]
 
-v_xor_b32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x3a,0xd2,0xff,0x01,0x00,0x00]
+v_min3_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0xa2,0xd2,0x01,0x04,0x0e,0xe4]
 
-v_xor_b32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x3a,0xd2,0x00,0x00,0x01,0x00]
+v_min3_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0xa2,0xd2,0x01,0x04,0x0e,0x04]
 
-v_xor_b32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x3a,0xd2,0x00,0x82,0x01,0x00]
+v_min3_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0xa2,0xd2,0x01,0x04,0x0e,0x04]
 
-v_xor_b32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x3a,0xd2,0x00,0xe0,0x01,0x00]
+v_min3_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0xa2,0xd2,0x01,0x04,0x0e,0x04]
 
-v_xor_b32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x3a,0xd2,0x00,0xee,0x01,0x00]
+v_min3_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xa2,0xd2,0x01,0x04,0x0e,0x04]
 
-v_xor_b32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x3a,0xd2,0x00,0x00,0x02,0x00]
+v_min3_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x08,0xa2,0xd2,0x01,0x04,0x0e,0x04]
 
-v_xor_b32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x3a,0xd2,0x00,0xfe,0x03,0x00]
+v_min3_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0xa2,0xd2,0x01,0x04,0x0e,0x0c]
 
-v_bfm_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x3c]
+v_min3_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0xa2,0xd2,0x01,0x04,0x0e,0x14]
 
-v_bfm_b32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x3d]
+v_min3_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0xa2,0xd2,0x01,0x04,0x0e,0x1c]
 
-v_bfm_b32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x3c]
+v_min3_i32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x01,0x00,0x01,0x02]
 
-v_bfm_b32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x3c]
+v_min3_i32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xa4,0xd2,0x01,0x00,0x01,0x02]
 
-v_bfm_b32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x3c]
+v_min3_i32 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x67,0x00,0x01,0x02]
 
-v_bfm_b32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x3c]
+v_min3_i32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x68,0x00,0x01,0x02]
 
-v_bfm_b32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x3c]
+v_min3_i32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x69,0x00,0x01,0x02]
 
-v_bfm_b32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x3c]
+v_min3_i32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x6a,0x00,0x01,0x02]
 
-v_bfm_b32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x3c]
+v_min3_i32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x6b,0x00,0x01,0x02]
 
-v_bfm_b32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x3c]
+v_min3_i32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x6c,0x00,0x01,0x02]
 
-v_bfm_b32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x3c]
+v_min3_i32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x6d,0x00,0x01,0x02]
 
-v_bfm_b32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x3c]
+v_min3_i32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x6e,0x00,0x01,0x02]
 
-v_bfm_b32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x3c]
+v_min3_i32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x6f,0x00,0x01,0x02]
 
-v_bfm_b32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x3c]
+v_min3_i32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x7b,0x00,0x01,0x02]
 
-v_bfm_b32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x3c]
+v_min3_i32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x7c,0x00,0x01,0x02]
 
-v_bfm_b32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x3c]
+v_min3_i32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x7e,0x00,0x01,0x02]
 
-v_bfm_b32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x3c]
+v_min3_i32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x7f,0x00,0x01,0x02]
 
-v_bfm_b32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x3c]
+v_min3_i32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x80,0x00,0x01,0x02]
 
-v_bfm_b32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x3c]
+v_min3_i32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0xc1,0x00,0x01,0x02]
 
-v_bfm_b32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x3c,0x56,0x34,0x12,0xaf]
+v_min3_i32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0xf0,0x00,0x01,0x02]
 
-v_bfm_b32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x3c,0x73,0x72,0x71,0x3f]
+v_min3_i32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0xf7,0x00,0x01,0x02]
 
-v_bfm_b32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x3c]
+v_min3_i32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x01,0x01,0x01,0x02]
 
-v_bfm_b32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x3c]
+v_min3_i32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0xff,0x01,0x01,0x02]
 
-v_bfm_b32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x3c]
+v_min3_i32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x01,0x82,0x01,0x02]
 
-v_bfm_b32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x3c,0xd2,0x00,0x00,0x00,0x00]
+v_min3_i32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x01,0xe0,0x01,0x02]
 
-v_bfm_b32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x3c,0xd2,0x00,0x00,0x00,0x00]
+v_min3_i32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x01,0xee,0x01,0x02]
 
-v_bfm_b32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x3c,0xd2,0x80,0x00,0x00,0x00]
+v_min3_i32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x01,0x04,0x02,0x02]
 
-v_bfm_b32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x3c,0xd2,0xc1,0x00,0x00,0x00]
+v_min3_i32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x01,0xfe,0x03,0x02]
 
-v_bfm_b32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x3c,0xd2,0xf0,0x00,0x00,0x00]
+v_min3_i32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xa4,0xd2,0x01,0x00,0x05,0x03]
 
-v_bfm_b32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x3c,0xd2,0xf7,0x00,0x00,0x00]
+v_min3_i32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xa4,0xd2,0x01,0x00,0xc1,0x03]
 
-v_bfm_b32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x3c,0xd2,0x00,0x01,0x00,0x00]
+v_min3_i32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xa4,0xd2,0x01,0x00,0xdd,0x03]
 
-v_bfm_b32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x3c,0xd2,0xff,0x01,0x00,0x00]
+v_min3_i32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xa4,0xd2,0x01,0x00,0x0d,0x04]
 
-v_bfm_b32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x3c,0xd2,0x00,0x00,0x01,0x00]
+v_min3_i32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xa4,0xd2,0x01,0x00,0xfd,0x07]
 
-v_bfm_b32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x3c,0xd2,0x00,0x82,0x01,0x00]
+v_min3_u32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x01,0x00,0x01,0x02]
 
-v_bfm_b32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x3c,0xd2,0x00,0xe0,0x01,0x00]
+v_min3_u32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xa6,0xd2,0x01,0x00,0x01,0x02]
 
-v_bfm_b32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x3c,0xd2,0x00,0xee,0x01,0x00]
+v_min3_u32 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x67,0x00,0x01,0x02]
 
-v_bfm_b32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x3c,0xd2,0x00,0x00,0x02,0x00]
+v_min3_u32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x68,0x00,0x01,0x02]
 
-v_bfm_b32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x3c,0xd2,0x00,0xfe,0x03,0x00]
+v_min3_u32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x69,0x00,0x01,0x02]
 
-v_mac_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x3e]
+v_min3_u32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x6a,0x00,0x01,0x02]
 
-v_mac_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x3f]
+v_min3_u32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x6b,0x00,0x01,0x02]
 
-v_mac_f32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x3e]
+v_min3_u32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x6c,0x00,0x01,0x02]
 
-v_mac_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x3e]
+v_min3_u32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x6d,0x00,0x01,0x02]
 
-v_mac_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x3e]
+v_min3_u32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x6e,0x00,0x01,0x02]
 
-v_mac_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x3e]
+v_min3_u32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x6f,0x00,0x01,0x02]
 
-v_mac_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x3e]
+v_min3_u32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x7b,0x00,0x01,0x02]
 
-v_mac_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x3e]
+v_min3_u32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x7c,0x00,0x01,0x02]
 
-v_mac_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x3e]
+v_min3_u32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x7e,0x00,0x01,0x02]
 
-v_mac_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x3e]
+v_min3_u32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x7f,0x00,0x01,0x02]
 
-v_mac_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x3e]
+v_min3_u32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x80,0x00,0x01,0x02]
 
-v_mac_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x3e]
+v_min3_u32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0xc1,0x00,0x01,0x02]
 
-v_mac_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x3e]
+v_min3_u32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0xf0,0x00,0x01,0x02]
 
-v_mac_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x3e]
+v_min3_u32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0xf7,0x00,0x01,0x02]
 
-v_mac_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x3e]
+v_min3_u32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x01,0x01,0x01,0x02]
 
-v_mac_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x3e]
+v_min3_u32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0xff,0x01,0x01,0x02]
 
-v_mac_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x3e]
+v_min3_u32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x01,0x82,0x01,0x02]
 
-v_mac_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x3e]
+v_min3_u32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x01,0xe0,0x01,0x02]
 
-v_mac_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x3e]
+v_min3_u32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x01,0xee,0x01,0x02]
 
-v_mac_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x3e,0x56,0x34,0x12,0xaf]
+v_min3_u32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x01,0x04,0x02,0x02]
 
-v_mac_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x3e,0x73,0x72,0x71,0x3f]
+v_min3_u32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x01,0xfe,0x03,0x02]
 
-v_mac_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x3e]
+v_min3_u32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xa6,0xd2,0x01,0x00,0x05,0x03]
 
-v_mac_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x3e]
+v_min3_u32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xa6,0xd2,0x01,0x00,0xc1,0x03]
 
-v_mac_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x3e]
+v_min3_u32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xa6,0xd2,0x01,0x00,0xdd,0x03]
 
-v_mac_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x3e,0xd2,0x00,0x00,0x00,0x00]
+v_min3_u32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xa6,0xd2,0x01,0x00,0x0d,0x04]
 
-v_mac_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x3e,0xd2,0x00,0x00,0x00,0x00]
+v_min3_u32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xa6,0xd2,0x01,0x00,0xfd,0x07]
 
-v_mac_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x3e,0xd2,0xfd,0x00,0x00,0x00]
+v_max3_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x01,0x04,0x0e,0x04]
 
-v_mac_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x3e,0xd2,0x00,0x01,0x00,0x00]
+v_max3_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0xa8,0xd2,0x01,0x04,0x0e,0x04]
 
-v_mac_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x3e,0xd2,0xff,0x01,0x00,0x00]
+v_max3_f32 v5, s103, v2, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x67,0x04,0x0e,0x04]
 
-v_mac_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x3e,0xd2,0x00,0xfa,0x01,0x00]
+v_max3_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x68,0x04,0x0e,0x04]
 
-v_mac_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x3e,0xd2,0x00,0x00,0x02,0x00]
+v_max3_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x69,0x04,0x0e,0x04]
 
-v_mac_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x3e,0xd2,0x00,0xfe,0x03,0x00]
+v_max3_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x6a,0x04,0x0e,0x04]
 
-v_mac_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x3e,0xd2,0x00,0x00,0x00,0x20]
+v_max3_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x6b,0x04,0x0e,0x04]
 
-v_mac_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x3e,0xd2,0x00,0x00,0x00,0x40]
+v_max3_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x6c,0x04,0x0e,0x04]
 
-v_mac_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x3e,0xd2,0x00,0x00,0x00,0x60]
+v_max3_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x6d,0x04,0x0e,0x04]
 
-v_mac_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x3e,0xd2,0x00,0x00,0x00,0x00]
+v_max3_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x6e,0x04,0x0e,0x04]
 
-v_mac_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x3e,0xd2,0x00,0x00,0x00,0x00]
+v_max3_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x6f,0x04,0x0e,0x04]
 
-v_mac_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x3e,0xd2,0x00,0x00,0x00,0x00]
+v_max3_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x7b,0x04,0x0e,0x04]
 
-v_mac_f32_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x08,0x3e,0xd2,0x00,0x00,0x00,0x00]
+v_max3_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x7c,0x04,0x0e,0x04]
 
-v_mac_f32_e64 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x3e,0xd2,0x00,0x00,0x00,0x08]
+v_max3_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x7e,0x04,0x0e,0x04]
 
-v_mac_f32_e64 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x3e,0xd2,0x00,0x00,0x00,0x10]
+v_max3_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x7f,0x04,0x0e,0x04]
 
-v_mac_f32_e64 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x3e,0xd2,0x00,0x00,0x00,0x18]
+v_max3_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0xfd,0x04,0x0e,0x04]
 
-v_madmk_f32 v0, 0, 0x11213141, v0
-// CHECK: [0x80,0x00,0x00,0x40,0x41,0x31,0x21,0x11]
+v_max3_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x01,0x05,0x0e,0x04]
 
-v_madmk_f32 v255, 0, 0x11213141, v0
-// CHECK: [0x80,0x00,0xfe,0x41,0x41,0x31,0x21,0x11]
+v_max3_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0xff,0x05,0x0e,0x04]
 
-v_madmk_f32 v0, -1, 0x11213141, v0
-// CHECK: [0xc1,0x00,0x00,0x40,0x41,0x31,0x21,0x11]
+v_max3_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x01,0xfe,0x0f,0x04]
 
-v_madmk_f32 v0, 0.5, 0x11213141, v0
-// CHECK: [0xf0,0x00,0x00,0x40,0x41,0x31,0x21,0x11]
+v_max3_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0xa8,0xd2,0x01,0x04,0xfe,0x07]
 
-v_madmk_f32 v0, -4.0, 0x11213141, v0
-// CHECK: [0xf7,0x00,0x00,0x40,0x41,0x31,0x21,0x11]
+v_max3_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x01,0x04,0x0e,0x24]
 
-v_madmk_f32 v0, v0, 0x11213141, v0
-// CHECK: [0x00,0x01,0x00,0x40,0x41,0x31,0x21,0x11]
+v_max3_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x01,0x04,0x0e,0x44]
 
-v_madmk_f32 v0, v255, 0x11213141, v0
-// CHECK: [0xff,0x01,0x00,0x40,0x41,0x31,0x21,0x11]
+v_max3_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x01,0x04,0x0e,0x84]
 
-v_madmk_f32 v0, 0, 0xa1b1c1d1, v0
-// CHECK: [0x80,0x00,0x00,0x40,0xd1,0xc1,0xb1,0xa1]
+v_max3_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0xa8,0xd2,0x01,0x04,0x0e,0xe4]
 
-v_madmk_f32 v0, 0, 0x11213141, v255
-// CHECK: [0x80,0xfe,0x01,0x40,0x41,0x31,0x21,0x11]
+v_max3_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0xa8,0xd2,0x01,0x04,0x0e,0x04]
 
-v_madak_f32 v0, 0, v0, 0x11213141
-// CHECK: [0x80,0x00,0x00,0x42,0x41,0x31,0x21,0x11]
+v_max3_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0xa8,0xd2,0x01,0x04,0x0e,0x04]
 
-v_madak_f32 v255, 0, v0, 0x11213141
-// CHECK: [0x80,0x00,0xfe,0x43,0x41,0x31,0x21,0x11]
+v_max3_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0xa8,0xd2,0x01,0x04,0x0e,0x04]
 
-v_madak_f32 v0, -1, v0, 0x11213141
-// CHECK: [0xc1,0x00,0x00,0x42,0x41,0x31,0x21,0x11]
+v_max3_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xa8,0xd2,0x01,0x04,0x0e,0x04]
 
-v_madak_f32 v0, 0.5, v0, 0x11213141
-// CHECK: [0xf0,0x00,0x00,0x42,0x41,0x31,0x21,0x11]
+v_max3_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x08,0xa8,0xd2,0x01,0x04,0x0e,0x04]
 
-v_madak_f32 v0, -4.0, v0, 0x11213141
-// CHECK: [0xf7,0x00,0x00,0x42,0x41,0x31,0x21,0x11]
+v_max3_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0xa8,0xd2,0x01,0x04,0x0e,0x0c]
 
-v_madak_f32 v0, v0, v0, 0x11213141
-// CHECK: [0x00,0x01,0x00,0x42,0x41,0x31,0x21,0x11]
+v_max3_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0xa8,0xd2,0x01,0x04,0x0e,0x14]
 
-v_madak_f32 v0, v255, v0, 0x11213141
-// CHECK: [0xff,0x01,0x00,0x42,0x41,0x31,0x21,0x11]
+v_max3_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0xa8,0xd2,0x01,0x04,0x0e,0x1c]
 
-v_madak_f32 v0, 0, v255, 0x11213141
-// CHECK: [0x80,0xfe,0x01,0x42,0x41,0x31,0x21,0x11]
+v_max3_i32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x01,0x00,0x01,0x02]
 
-v_madak_f32 v0, 0, v0, 0xa1b1c1d1
-// CHECK: [0x80,0x00,0x00,0x42,0xd1,0xc1,0xb1,0xa1]
+v_max3_i32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xaa,0xd2,0x01,0x00,0x01,0x02]
 
-v_bcnt_u32_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x44]
+v_max3_i32 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x67,0x00,0x01,0x02]
 
-v_bcnt_u32_b32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x45]
+v_max3_i32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x68,0x00,0x01,0x02]
 
-v_bcnt_u32_b32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x44]
+v_max3_i32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x69,0x00,0x01,0x02]
 
-v_bcnt_u32_b32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x44]
+v_max3_i32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x6a,0x00,0x01,0x02]
 
-v_bcnt_u32_b32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x44]
+v_max3_i32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x6b,0x00,0x01,0x02]
 
-v_bcnt_u32_b32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x44]
+v_max3_i32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x6c,0x00,0x01,0x02]
 
-v_bcnt_u32_b32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x44]
+v_max3_i32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x6d,0x00,0x01,0x02]
 
-v_bcnt_u32_b32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x44]
+v_max3_i32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x6e,0x00,0x01,0x02]
 
-v_bcnt_u32_b32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x44]
+v_max3_i32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x6f,0x00,0x01,0x02]
 
-v_bcnt_u32_b32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x44]
+v_max3_i32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x7b,0x00,0x01,0x02]
 
-v_bcnt_u32_b32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x44]
+v_max3_i32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x7c,0x00,0x01,0x02]
 
-v_bcnt_u32_b32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x44]
+v_max3_i32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x7e,0x00,0x01,0x02]
 
-v_bcnt_u32_b32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x44]
+v_max3_i32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x7f,0x00,0x01,0x02]
 
-v_bcnt_u32_b32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x44]
+v_max3_i32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x80,0x00,0x01,0x02]
 
-v_bcnt_u32_b32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x44]
+v_max3_i32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0xc1,0x00,0x01,0x02]
 
-v_bcnt_u32_b32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x44]
+v_max3_i32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0xf0,0x00,0x01,0x02]
 
-v_bcnt_u32_b32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x44]
+v_max3_i32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0xf7,0x00,0x01,0x02]
 
-v_bcnt_u32_b32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x44]
+v_max3_i32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x01,0x01,0x01,0x02]
 
-v_bcnt_u32_b32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x44]
+v_max3_i32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0xff,0x01,0x01,0x02]
 
-v_bcnt_u32_b32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x44,0x56,0x34,0x12,0xaf]
+v_max3_i32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x01,0x82,0x01,0x02]
 
-v_bcnt_u32_b32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x44,0x73,0x72,0x71,0x3f]
+v_max3_i32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x01,0xe0,0x01,0x02]
 
-v_bcnt_u32_b32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x44]
+v_max3_i32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x01,0xee,0x01,0x02]
 
-v_bcnt_u32_b32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x44]
+v_max3_i32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x01,0x04,0x02,0x02]
 
-v_bcnt_u32_b32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x44]
+v_max3_i32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x01,0xfe,0x03,0x02]
 
-v_bcnt_u32_b32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x44,0xd2,0x00,0x00,0x00,0x00]
+v_max3_i32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xaa,0xd2,0x01,0x00,0x05,0x03]
 
-v_bcnt_u32_b32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x44,0xd2,0x00,0x00,0x00,0x00]
+v_max3_i32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xaa,0xd2,0x01,0x00,0xc1,0x03]
 
-v_bcnt_u32_b32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x44,0xd2,0x80,0x00,0x00,0x00]
+v_max3_i32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xaa,0xd2,0x01,0x00,0xdd,0x03]
 
-v_bcnt_u32_b32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x44,0xd2,0xc1,0x00,0x00,0x00]
+v_max3_i32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xaa,0xd2,0x01,0x00,0x0d,0x04]
 
-v_bcnt_u32_b32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x44,0xd2,0xf0,0x00,0x00,0x00]
+v_max3_i32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xaa,0xd2,0x01,0x00,0xfd,0x07]
 
-v_bcnt_u32_b32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x44,0xd2,0xf7,0x00,0x00,0x00]
+v_max3_u32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x01,0x00,0x01,0x02]
 
-v_bcnt_u32_b32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x44,0xd2,0x00,0x01,0x00,0x00]
+v_max3_u32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xac,0xd2,0x01,0x00,0x01,0x02]
 
-v_bcnt_u32_b32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x44,0xd2,0xff,0x01,0x00,0x00]
+v_max3_u32 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x67,0x00,0x01,0x02]
 
-v_bcnt_u32_b32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x44,0xd2,0x00,0x00,0x01,0x00]
+v_max3_u32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x68,0x00,0x01,0x02]
 
-v_bcnt_u32_b32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x44,0xd2,0x00,0x82,0x01,0x00]
+v_max3_u32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x69,0x00,0x01,0x02]
 
-v_bcnt_u32_b32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x44,0xd2,0x00,0xe0,0x01,0x00]
+v_max3_u32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x6a,0x00,0x01,0x02]
 
-v_bcnt_u32_b32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x44,0xd2,0x00,0xee,0x01,0x00]
+v_max3_u32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x6b,0x00,0x01,0x02]
 
-v_bcnt_u32_b32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x44,0xd2,0x00,0x00,0x02,0x00]
+v_max3_u32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x6c,0x00,0x01,0x02]
 
-v_bcnt_u32_b32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x44,0xd2,0x00,0xfe,0x03,0x00]
+v_max3_u32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x6d,0x00,0x01,0x02]
 
-v_mbcnt_lo_u32_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x46]
+v_max3_u32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x6e,0x00,0x01,0x02]
 
-v_mbcnt_lo_u32_b32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x47]
+v_max3_u32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x6f,0x00,0x01,0x02]
 
-v_mbcnt_lo_u32_b32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x46]
+v_max3_u32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x7b,0x00,0x01,0x02]
 
-v_mbcnt_lo_u32_b32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x46]
+v_max3_u32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x7c,0x00,0x01,0x02]
 
-v_mbcnt_lo_u32_b32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x46]
+v_max3_u32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x7e,0x00,0x01,0x02]
 
-v_mbcnt_lo_u32_b32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x46]
+v_max3_u32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x7f,0x00,0x01,0x02]
 
-v_mbcnt_lo_u32_b32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x46]
+v_max3_u32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x80,0x00,0x01,0x02]
 
-v_mbcnt_lo_u32_b32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x46]
+v_max3_u32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0xc1,0x00,0x01,0x02]
 
-v_mbcnt_lo_u32_b32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x46]
+v_max3_u32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0xf0,0x00,0x01,0x02]
 
-v_mbcnt_lo_u32_b32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x46]
+v_max3_u32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0xf7,0x00,0x01,0x02]
 
-v_mbcnt_lo_u32_b32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x46]
+v_max3_u32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x01,0x01,0x01,0x02]
 
-v_mbcnt_lo_u32_b32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x46]
+v_max3_u32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0xff,0x01,0x01,0x02]
 
-v_mbcnt_lo_u32_b32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x46]
+v_max3_u32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x01,0x82,0x01,0x02]
 
-v_mbcnt_lo_u32_b32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x46]
+v_max3_u32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x01,0xe0,0x01,0x02]
 
-v_mbcnt_lo_u32_b32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x46]
+v_max3_u32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x01,0xee,0x01,0x02]
 
-v_mbcnt_lo_u32_b32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x46]
+v_max3_u32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x01,0x04,0x02,0x02]
 
-v_mbcnt_lo_u32_b32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x46]
+v_max3_u32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xac,0xd2,0x01,0xfe,0x03,0x02]
 
-v_mbcnt_lo_u32_b32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x46]
+v_max3_u32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xac,0xd2,0x01,0x00,0x05,0x03]
 
-v_mbcnt_lo_u32_b32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x46]
+v_max3_u32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xac,0xd2,0x01,0x00,0xc1,0x03]
 
-v_mbcnt_lo_u32_b32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x46,0x56,0x34,0x12,0xaf]
+v_max3_u32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xac,0xd2,0x01,0x00,0xdd,0x03]
 
-v_mbcnt_lo_u32_b32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x46,0x73,0x72,0x71,0x3f]
+v_max3_u32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xac,0xd2,0x01,0x00,0x0d,0x04]
 
-v_mbcnt_lo_u32_b32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x46]
+v_max3_u32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xac,0xd2,0x01,0x00,0xfd,0x07]
 
-v_mbcnt_lo_u32_b32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x46]
+v_med3_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x01,0x04,0x0e,0x04]
 
-v_mbcnt_lo_u32_b32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x46]
+v_med3_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0xae,0xd2,0x01,0x04,0x0e,0x04]
 
-v_mbcnt_lo_u32_b32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x46,0xd2,0x00,0x00,0x00,0x00]
+v_med3_f32 v5, s103, v2, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x67,0x04,0x0e,0x04]
 
-v_mbcnt_lo_u32_b32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x46,0xd2,0x00,0x00,0x00,0x00]
+v_med3_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x68,0x04,0x0e,0x04]
 
-v_mbcnt_lo_u32_b32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x46,0xd2,0x80,0x00,0x00,0x00]
+v_med3_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x69,0x04,0x0e,0x04]
 
-v_mbcnt_lo_u32_b32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x46,0xd2,0xc1,0x00,0x00,0x00]
+v_med3_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x6a,0x04,0x0e,0x04]
 
-v_mbcnt_lo_u32_b32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x46,0xd2,0xf0,0x00,0x00,0x00]
+v_med3_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x6b,0x04,0x0e,0x04]
 
-v_mbcnt_lo_u32_b32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x46,0xd2,0xf7,0x00,0x00,0x00]
+v_med3_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x6c,0x04,0x0e,0x04]
 
-v_mbcnt_lo_u32_b32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x46,0xd2,0x00,0x01,0x00,0x00]
+v_med3_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x6d,0x04,0x0e,0x04]
 
-v_mbcnt_lo_u32_b32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x46,0xd2,0xff,0x01,0x00,0x00]
+v_med3_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x6e,0x04,0x0e,0x04]
 
-v_mbcnt_lo_u32_b32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x46,0xd2,0x00,0x00,0x01,0x00]
+v_med3_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x6f,0x04,0x0e,0x04]
 
-v_mbcnt_lo_u32_b32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x46,0xd2,0x00,0x82,0x01,0x00]
+v_med3_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x7b,0x04,0x0e,0x04]
 
-v_mbcnt_lo_u32_b32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x46,0xd2,0x00,0xe0,0x01,0x00]
+v_med3_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x7c,0x04,0x0e,0x04]
 
-v_mbcnt_lo_u32_b32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x46,0xd2,0x00,0xee,0x01,0x00]
+v_med3_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x7e,0x04,0x0e,0x04]
 
-v_mbcnt_lo_u32_b32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x46,0xd2,0x00,0x00,0x02,0x00]
+v_med3_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x7f,0x04,0x0e,0x04]
 
-v_mbcnt_lo_u32_b32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x46,0xd2,0x00,0xfe,0x03,0x00]
+v_med3_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0xfd,0x04,0x0e,0x04]
 
-v_mbcnt_hi_u32_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x48]
+v_med3_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x01,0x05,0x0e,0x04]
 
-v_mbcnt_hi_u32_b32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x49]
+v_med3_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0xff,0x05,0x0e,0x04]
 
-v_mbcnt_hi_u32_b32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x48]
+v_med3_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x01,0xfe,0x0f,0x04]
 
-v_mbcnt_hi_u32_b32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x48]
+v_med3_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0xae,0xd2,0x01,0x04,0xfe,0x07]
 
-v_mbcnt_hi_u32_b32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x48]
+v_med3_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x01,0x04,0x0e,0x24]
 
-v_mbcnt_hi_u32_b32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x48]
+v_med3_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x01,0x04,0x0e,0x44]
 
-v_mbcnt_hi_u32_b32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x48]
+v_med3_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x01,0x04,0x0e,0x84]
 
-v_mbcnt_hi_u32_b32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x48]
+v_med3_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0xae,0xd2,0x01,0x04,0x0e,0xe4]
 
-v_mbcnt_hi_u32_b32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x48]
+v_med3_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0xae,0xd2,0x01,0x04,0x0e,0x04]
 
-v_mbcnt_hi_u32_b32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x48]
+v_med3_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0xae,0xd2,0x01,0x04,0x0e,0x04]
 
-v_mbcnt_hi_u32_b32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x48]
+v_med3_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0xae,0xd2,0x01,0x04,0x0e,0x04]
 
-v_mbcnt_hi_u32_b32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x48]
+v_med3_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xae,0xd2,0x01,0x04,0x0e,0x04]
 
-v_mbcnt_hi_u32_b32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x48]
+v_med3_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x08,0xae,0xd2,0x01,0x04,0x0e,0x04]
 
-v_mbcnt_hi_u32_b32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x48]
+v_med3_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0xae,0xd2,0x01,0x04,0x0e,0x0c]
 
-v_mbcnt_hi_u32_b32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x48]
+v_med3_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0xae,0xd2,0x01,0x04,0x0e,0x14]
 
-v_mbcnt_hi_u32_b32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x48]
+v_med3_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0xae,0xd2,0x01,0x04,0x0e,0x1c]
 
-v_mbcnt_hi_u32_b32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x48]
+v_med3_i32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x01,0x00,0x01,0x02]
 
-v_mbcnt_hi_u32_b32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x48]
+v_med3_i32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xb0,0xd2,0x01,0x00,0x01,0x02]
 
-v_mbcnt_hi_u32_b32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x48]
+v_med3_i32 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x67,0x00,0x01,0x02]
 
-v_mbcnt_hi_u32_b32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x48,0x56,0x34,0x12,0xaf]
+v_med3_i32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x68,0x00,0x01,0x02]
 
-v_mbcnt_hi_u32_b32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x48,0x73,0x72,0x71,0x3f]
+v_med3_i32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x69,0x00,0x01,0x02]
 
-v_mbcnt_hi_u32_b32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x48]
+v_med3_i32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x6a,0x00,0x01,0x02]
 
-v_mbcnt_hi_u32_b32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x48]
+v_med3_i32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x6b,0x00,0x01,0x02]
 
-v_mbcnt_hi_u32_b32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x48]
+v_med3_i32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x6c,0x00,0x01,0x02]
 
-v_mbcnt_hi_u32_b32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x48,0xd2,0x00,0x00,0x00,0x00]
+v_med3_i32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x6d,0x00,0x01,0x02]
 
-v_mbcnt_hi_u32_b32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x48,0xd2,0x00,0x00,0x00,0x00]
+v_med3_i32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x6e,0x00,0x01,0x02]
 
-v_mbcnt_hi_u32_b32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x48,0xd2,0x80,0x00,0x00,0x00]
+v_med3_i32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x6f,0x00,0x01,0x02]
 
-v_mbcnt_hi_u32_b32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x48,0xd2,0xc1,0x00,0x00,0x00]
+v_med3_i32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x7b,0x00,0x01,0x02]
 
-v_mbcnt_hi_u32_b32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x48,0xd2,0xf0,0x00,0x00,0x00]
+v_med3_i32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x7c,0x00,0x01,0x02]
 
-v_mbcnt_hi_u32_b32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x48,0xd2,0xf7,0x00,0x00,0x00]
+v_med3_i32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x7e,0x00,0x01,0x02]
 
-v_mbcnt_hi_u32_b32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x48,0xd2,0x00,0x01,0x00,0x00]
+v_med3_i32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x7f,0x00,0x01,0x02]
 
-v_mbcnt_hi_u32_b32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x48,0xd2,0xff,0x01,0x00,0x00]
+v_med3_i32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x80,0x00,0x01,0x02]
 
-v_mbcnt_hi_u32_b32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x48,0xd2,0x00,0x00,0x01,0x00]
+v_med3_i32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0xc1,0x00,0x01,0x02]
 
-v_mbcnt_hi_u32_b32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x48,0xd2,0x00,0x82,0x01,0x00]
+v_med3_i32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0xf0,0x00,0x01,0x02]
 
-v_mbcnt_hi_u32_b32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x48,0xd2,0x00,0xe0,0x01,0x00]
+v_med3_i32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0xf7,0x00,0x01,0x02]
 
-v_mbcnt_hi_u32_b32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x48,0xd2,0x00,0xee,0x01,0x00]
+v_med3_i32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x01,0x01,0x01,0x02]
 
-v_mbcnt_hi_u32_b32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x48,0xd2,0x00,0x00,0x02,0x00]
+v_med3_i32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0xff,0x01,0x01,0x02]
 
-v_mbcnt_hi_u32_b32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x48,0xd2,0x00,0xfe,0x03,0x00]
+v_med3_i32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x01,0x82,0x01,0x02]
 
-v_add_i32 v0, vcc, s0, v0
-// CHECK: [0x00,0x00,0x00,0x4a]
+v_med3_i32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x01,0xe0,0x01,0x02]
 
-v_add_i32 v255, vcc, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x4b]
+v_med3_i32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x01,0xee,0x01,0x02]
 
-v_add_i32 v0, vcc, s103, v0
-// CHECK: [0x67,0x00,0x00,0x4a]
+v_med3_i32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x01,0x04,0x02,0x02]
 
-v_add_i32 v0, vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x4a]
+v_med3_i32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x01,0xfe,0x03,0x02]
 
-v_add_i32 v0, vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x4a]
+v_med3_i32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xb0,0xd2,0x01,0x00,0x05,0x03]
 
-v_add_i32 v0, vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x4a]
+v_med3_i32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xb0,0xd2,0x01,0x00,0xc1,0x03]
 
-v_add_i32 v0, vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x4a]
+v_med3_i32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xb0,0xd2,0x01,0x00,0xdd,0x03]
 
-v_add_i32 v0, vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x4a]
+v_med3_i32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xb0,0xd2,0x01,0x00,0x0d,0x04]
 
-v_add_i32 v0, vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x4a]
+v_med3_i32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xb0,0xd2,0x01,0x00,0xfd,0x07]
 
-v_add_i32 v0, vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x4a]
+v_med3_u32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x01,0x00,0x01,0x02]
 
-v_add_i32 v0, vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x4a]
+v_med3_u32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xb2,0xd2,0x01,0x00,0x01,0x02]
 
-v_add_i32 v0, vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x4a]
+v_med3_u32 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x67,0x00,0x01,0x02]
 
-v_add_i32 v0, vcc, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x4a]
+v_med3_u32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x68,0x00,0x01,0x02]
 
-v_add_i32 v0, vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x4a]
+v_med3_u32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x69,0x00,0x01,0x02]
 
-v_add_i32 v0, vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x4a]
+v_med3_u32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x6a,0x00,0x01,0x02]
 
-v_add_i32 v0, vcc, 0, v0
-// CHECK: [0x80,0x00,0x00,0x4a]
+v_med3_u32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x6b,0x00,0x01,0x02]
 
-v_add_i32 v0, vcc, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x4a]
+v_med3_u32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x6c,0x00,0x01,0x02]
 
-v_add_i32 v0, vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x4a]
+v_med3_u32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x6d,0x00,0x01,0x02]
 
-v_add_i32 v0, vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x4a]
+v_med3_u32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x6e,0x00,0x01,0x02]
 
-v_add_i32 v0, vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x4a,0x56,0x34,0x12,0xaf]
+v_med3_u32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x6f,0x00,0x01,0x02]
 
-v_add_i32 v0, vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x4a,0x73,0x72,0x71,0x3f]
+v_med3_u32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x7b,0x00,0x01,0x02]
 
-v_add_i32 v0, vcc, v0, v0
-// CHECK: [0x00,0x01,0x00,0x4a]
+v_med3_u32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x7c,0x00,0x01,0x02]
 
-v_add_i32 v0, vcc, v255, v0
-// CHECK: [0xff,0x01,0x00,0x4a]
+v_med3_u32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x7e,0x00,0x01,0x02]
 
-v_add_i32 v0, vcc, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x4a]
+v_med3_u32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x7f,0x00,0x01,0x02]
 
-v_add_i32_e64 v0, s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x4a,0xd2,0x00,0x00,0x00,0x00]
+v_med3_u32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x80,0x00,0x01,0x02]
 
-v_add_i32_e64 v255, s[0:1], s0, s0
-// CHECK: [0xff,0x00,0x4a,0xd2,0x00,0x00,0x00,0x00]
+v_med3_u32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0xc1,0x00,0x01,0x02]
 
-v_add_i32_e64 v0, s[2:3], s0, s0
-// CHECK: [0x00,0x02,0x4a,0xd2,0x00,0x00,0x00,0x00]
+v_med3_u32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0xf0,0x00,0x01,0x02]
 
-v_add_i32_e64 v0, s[102:103], s0, s0
-// CHECK: [0x00,0x66,0x4a,0xd2,0x00,0x00,0x00,0x00]
+v_med3_u32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0xf7,0x00,0x01,0x02]
 
-v_add_i32_e64 v0, flat_scratch, s0, s0
-// CHECK: [0x00,0x68,0x4a,0xd2,0x00,0x00,0x00,0x00]
+v_med3_u32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x01,0x01,0x01,0x02]
 
-v_add_i32_e64 v0, vcc, s0, s0
-// CHECK: [0x00,0x6a,0x4a,0xd2,0x00,0x00,0x00,0x00]
+v_med3_u32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0xff,0x01,0x01,0x02]
 
-v_add_i32_e64 v0, tba, s0, s0
-// CHECK: [0x00,0x6c,0x4a,0xd2,0x00,0x00,0x00,0x00]
+v_med3_u32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x01,0x82,0x01,0x02]
 
-v_add_i32_e64 v0, tma, s0, s0
-// CHECK: [0x00,0x6e,0x4a,0xd2,0x00,0x00,0x00,0x00]
+v_med3_u32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x01,0xe0,0x01,0x02]
 
-v_add_i32_e64 v0, ttmp[10:11], s0, s0
-// CHECK: [0x00,0x7a,0x4a,0xd2,0x00,0x00,0x00,0x00]
+v_med3_u32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x01,0xee,0x01,0x02]
 
-v_add_i32_e64 v0, s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x4a,0xd2,0x80,0x00,0x00,0x00]
+v_med3_u32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x01,0x04,0x02,0x02]
 
-v_add_i32_e64 v0, s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x4a,0xd2,0xc1,0x00,0x00,0x00]
+v_med3_u32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x01,0xfe,0x03,0x02]
 
-v_add_i32_e64 v0, s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x4a,0xd2,0xf0,0x00,0x00,0x00]
+v_med3_u32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xb2,0xd2,0x01,0x00,0x05,0x03]
 
-v_add_i32_e64 v0, s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x4a,0xd2,0xf7,0x00,0x00,0x00]
+v_med3_u32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xb2,0xd2,0x01,0x00,0xc1,0x03]
 
-v_add_i32_e64 v0, s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x4a,0xd2,0x00,0x01,0x00,0x00]
+v_med3_u32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xb2,0xd2,0x01,0x00,0xdd,0x03]
 
-v_add_i32_e64 v0, s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x4a,0xd2,0xff,0x01,0x00,0x00]
+v_med3_u32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xb2,0xd2,0x01,0x00,0x0d,0x04]
 
-v_add_i32_e64 v0, s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x4a,0xd2,0x00,0x00,0x01,0x00]
+v_med3_u32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xb2,0xd2,0x01,0x00,0xfd,0x07]
 
-v_add_i32_e64 v0, s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x4a,0xd2,0x00,0x82,0x01,0x00]
+v_sad_u8 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x01,0x00,0x01,0x02]
 
-v_add_i32_e64 v0, s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x4a,0xd2,0x00,0xe0,0x01,0x00]
+v_sad_u8 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xb4,0xd2,0x01,0x00,0x01,0x02]
 
-v_add_i32_e64 v0, s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x4a,0xd2,0x00,0xee,0x01,0x00]
+v_sad_u8 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x67,0x00,0x01,0x02]
 
-v_add_i32_e64 v0, s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x4a,0xd2,0x00,0x00,0x02,0x00]
+v_sad_u8 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x68,0x00,0x01,0x02]
 
-v_add_i32_e64 v0, s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x4a,0xd2,0x00,0xfe,0x03,0x00]
+v_sad_u8 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x69,0x00,0x01,0x02]
 
-v_sub_i32 v0, vcc, s0, v0
-// CHECK: [0x00,0x00,0x00,0x4c]
+v_sad_u8 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x6a,0x00,0x01,0x02]
 
-v_sub_i32 v255, vcc, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x4d]
+v_sad_u8 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x6b,0x00,0x01,0x02]
 
-v_sub_i32 v0, vcc, s103, v0
-// CHECK: [0x67,0x00,0x00,0x4c]
+v_sad_u8 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x6c,0x00,0x01,0x02]
 
-v_sub_i32 v0, vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x4c]
+v_sad_u8 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x6d,0x00,0x01,0x02]
 
-v_sub_i32 v0, vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x4c]
+v_sad_u8 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x6e,0x00,0x01,0x02]
 
-v_sub_i32 v0, vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x4c]
+v_sad_u8 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x6f,0x00,0x01,0x02]
 
-v_sub_i32 v0, vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x4c]
+v_sad_u8 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x7b,0x00,0x01,0x02]
 
-v_sub_i32 v0, vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x4c]
+v_sad_u8 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x7c,0x00,0x01,0x02]
 
-v_sub_i32 v0, vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x4c]
+v_sad_u8 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x7e,0x00,0x01,0x02]
 
-v_sub_i32 v0, vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x4c]
+v_sad_u8 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x7f,0x00,0x01,0x02]
 
-v_sub_i32 v0, vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x4c]
+v_sad_u8 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x80,0x00,0x01,0x02]
 
-v_sub_i32 v0, vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x4c]
+v_sad_u8 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0xc1,0x00,0x01,0x02]
 
-v_sub_i32 v0, vcc, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x4c]
+v_sad_u8 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x01,0x01,0x01,0x02]
 
-v_sub_i32 v0, vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x4c]
+v_sad_u8 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0xff,0x01,0x01,0x02]
 
-v_sub_i32 v0, vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x4c]
+v_sad_u8 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x01,0x82,0x01,0x02]
 
-v_sub_i32 v0, vcc, 0, v0
-// CHECK: [0x80,0x00,0x00,0x4c]
+v_sad_u8 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x01,0x04,0x02,0x02]
 
-v_sub_i32 v0, vcc, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x4c]
+v_sad_u8 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x01,0xfe,0x03,0x02]
 
-v_sub_i32 v0, vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x4c]
+v_sad_u8 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xb4,0xd2,0x01,0x00,0x05,0x03]
 
-v_sub_i32 v0, vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x4c]
+v_sad_u8 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xb4,0xd2,0x01,0x00,0xc1,0x03]
 
-v_sub_i32 v0, vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x4c,0x56,0x34,0x12,0xaf]
+v_sad_u8 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xb4,0xd2,0x01,0x00,0xdd,0x03]
 
-v_sub_i32 v0, vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x4c,0x73,0x72,0x71,0x3f]
+v_sad_u8 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xb4,0xd2,0x01,0x00,0x0d,0x04]
 
-v_sub_i32 v0, vcc, v0, v0
-// CHECK: [0x00,0x01,0x00,0x4c]
+v_sad_u8 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xb4,0xd2,0x01,0x00,0xfd,0x07]
 
-v_sub_i32 v0, vcc, v255, v0
-// CHECK: [0xff,0x01,0x00,0x4c]
+v_sad_hi_u8 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x01,0x00,0x01,0x02]
 
-v_sub_i32 v0, vcc, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x4c]
+v_sad_hi_u8 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xb6,0xd2,0x01,0x00,0x01,0x02]
 
-v_sub_i32_e64 v0, s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x4c,0xd2,0x00,0x00,0x00,0x00]
+v_sad_hi_u8 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x67,0x00,0x01,0x02]
 
-v_sub_i32_e64 v255, s[0:1], s0, s0
-// CHECK: [0xff,0x00,0x4c,0xd2,0x00,0x00,0x00,0x00]
+v_sad_hi_u8 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x68,0x00,0x01,0x02]
 
-v_sub_i32_e64 v0, s[2:3], s0, s0
-// CHECK: [0x00,0x02,0x4c,0xd2,0x00,0x00,0x00,0x00]
+v_sad_hi_u8 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x69,0x00,0x01,0x02]
 
-v_sub_i32_e64 v0, s[102:103], s0, s0
-// CHECK: [0x00,0x66,0x4c,0xd2,0x00,0x00,0x00,0x00]
+v_sad_hi_u8 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x6a,0x00,0x01,0x02]
 
-v_sub_i32_e64 v0, flat_scratch, s0, s0
-// CHECK: [0x00,0x68,0x4c,0xd2,0x00,0x00,0x00,0x00]
+v_sad_hi_u8 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x6b,0x00,0x01,0x02]
 
-v_sub_i32_e64 v0, vcc, s0, s0
-// CHECK: [0x00,0x6a,0x4c,0xd2,0x00,0x00,0x00,0x00]
+v_sad_hi_u8 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x6c,0x00,0x01,0x02]
 
-v_sub_i32_e64 v0, tba, s0, s0
-// CHECK: [0x00,0x6c,0x4c,0xd2,0x00,0x00,0x00,0x00]
+v_sad_hi_u8 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x6d,0x00,0x01,0x02]
 
-v_sub_i32_e64 v0, tma, s0, s0
-// CHECK: [0x00,0x6e,0x4c,0xd2,0x00,0x00,0x00,0x00]
+v_sad_hi_u8 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x6e,0x00,0x01,0x02]
 
-v_sub_i32_e64 v0, ttmp[10:11], s0, s0
-// CHECK: [0x00,0x7a,0x4c,0xd2,0x00,0x00,0x00,0x00]
+v_sad_hi_u8 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x6f,0x00,0x01,0x02]
 
-v_sub_i32_e64 v0, s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x4c,0xd2,0x80,0x00,0x00,0x00]
+v_sad_hi_u8 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x7b,0x00,0x01,0x02]
 
-v_sub_i32_e64 v0, s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x4c,0xd2,0xc1,0x00,0x00,0x00]
+v_sad_hi_u8 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x7c,0x00,0x01,0x02]
 
-v_sub_i32_e64 v0, s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x4c,0xd2,0xf0,0x00,0x00,0x00]
+v_sad_hi_u8 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x7e,0x00,0x01,0x02]
 
-v_sub_i32_e64 v0, s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x4c,0xd2,0xf7,0x00,0x00,0x00]
+v_sad_hi_u8 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x7f,0x00,0x01,0x02]
 
-v_sub_i32_e64 v0, s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x4c,0xd2,0x00,0x01,0x00,0x00]
+v_sad_hi_u8 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x80,0x00,0x01,0x02]
 
-v_sub_i32_e64 v0, s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x4c,0xd2,0xff,0x01,0x00,0x00]
+v_sad_hi_u8 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0xc1,0x00,0x01,0x02]
 
-v_sub_i32_e64 v0, s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x4c,0xd2,0x00,0x00,0x01,0x00]
+v_sad_hi_u8 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x01,0x01,0x01,0x02]
 
-v_sub_i32_e64 v0, s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x4c,0xd2,0x00,0x82,0x01,0x00]
+v_sad_hi_u8 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0xff,0x01,0x01,0x02]
 
-v_sub_i32_e64 v0, s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x4c,0xd2,0x00,0xe0,0x01,0x00]
+v_sad_hi_u8 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x01,0x82,0x01,0x02]
 
-v_sub_i32_e64 v0, s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x4c,0xd2,0x00,0xee,0x01,0x00]
+v_sad_hi_u8 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x01,0x04,0x02,0x02]
 
-v_sub_i32_e64 v0, s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x4c,0xd2,0x00,0x00,0x02,0x00]
+v_sad_hi_u8 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x01,0xfe,0x03,0x02]
 
-v_sub_i32_e64 v0, s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x4c,0xd2,0x00,0xfe,0x03,0x00]
+v_sad_hi_u8 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xb6,0xd2,0x01,0x00,0x05,0x03]
 
-v_subrev_i32 v0, vcc, s0, v0
-// CHECK: [0x00,0x00,0x00,0x4e]
+v_sad_hi_u8 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xb6,0xd2,0x01,0x00,0xc1,0x03]
 
-v_subrev_i32 v255, vcc, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x4f]
+v_sad_hi_u8 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xb6,0xd2,0x01,0x00,0xdd,0x03]
 
-v_subrev_i32 v0, vcc, s103, v0
-// CHECK: [0x67,0x00,0x00,0x4e]
+v_sad_hi_u8 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xb6,0xd2,0x01,0x00,0x0d,0x04]
 
-v_subrev_i32 v0, vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x4e]
+v_sad_hi_u8 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xb6,0xd2,0x01,0x00,0xfd,0x07]
 
-v_subrev_i32 v0, vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x4e]
+v_sad_u16 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x01,0x00,0x01,0x02]
 
-v_subrev_i32 v0, vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x4e]
+v_sad_u16 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xb8,0xd2,0x01,0x00,0x01,0x02]
 
-v_subrev_i32 v0, vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x4e]
+v_sad_u16 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x67,0x00,0x01,0x02]
 
-v_subrev_i32 v0, vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x4e]
+v_sad_u16 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x68,0x00,0x01,0x02]
 
-v_subrev_i32 v0, vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x4e]
+v_sad_u16 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x69,0x00,0x01,0x02]
 
-v_subrev_i32 v0, vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x4e]
+v_sad_u16 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x6a,0x00,0x01,0x02]
 
-v_subrev_i32 v0, vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x4e]
+v_sad_u16 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x6b,0x00,0x01,0x02]
 
-v_subrev_i32 v0, vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x4e]
+v_sad_u16 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x6c,0x00,0x01,0x02]
 
-v_subrev_i32 v0, vcc, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x4e]
+v_sad_u16 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x6d,0x00,0x01,0x02]
 
-v_subrev_i32 v0, vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x4e]
+v_sad_u16 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x6e,0x00,0x01,0x02]
 
-v_subrev_i32 v0, vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x4e]
+v_sad_u16 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x6f,0x00,0x01,0x02]
 
-v_subrev_i32 v0, vcc, 0, v0
-// CHECK: [0x80,0x00,0x00,0x4e]
+v_sad_u16 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x7b,0x00,0x01,0x02]
 
-v_subrev_i32 v0, vcc, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x4e]
+v_sad_u16 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x7c,0x00,0x01,0x02]
 
-v_subrev_i32 v0, vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x4e]
+v_sad_u16 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x7e,0x00,0x01,0x02]
 
-v_subrev_i32 v0, vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x4e]
+v_sad_u16 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x7f,0x00,0x01,0x02]
 
-v_subrev_i32 v0, vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x4e,0x56,0x34,0x12,0xaf]
+v_sad_u16 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x80,0x00,0x01,0x02]
 
-v_subrev_i32 v0, vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x4e,0x73,0x72,0x71,0x3f]
+v_sad_u16 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0xc1,0x00,0x01,0x02]
 
-v_subrev_i32 v0, vcc, v0, v0
-// CHECK: [0x00,0x01,0x00,0x4e]
+v_sad_u16 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x01,0x01,0x01,0x02]
 
-v_subrev_i32 v0, vcc, v255, v0
-// CHECK: [0xff,0x01,0x00,0x4e]
+v_sad_u16 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0xff,0x01,0x01,0x02]
 
-v_subrev_i32 v0, vcc, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x4e]
+v_sad_u16 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x01,0x82,0x01,0x02]
 
-v_subrev_i32_e64 v0, s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x4e,0xd2,0x00,0x00,0x00,0x00]
+v_sad_u16 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x01,0x04,0x02,0x02]
 
-v_subrev_i32_e64 v255, s[0:1], s0, s0
-// CHECK: [0xff,0x00,0x4e,0xd2,0x00,0x00,0x00,0x00]
+v_sad_u16 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x01,0xfe,0x03,0x02]
 
-v_subrev_i32_e64 v0, s[2:3], s0, s0
-// CHECK: [0x00,0x02,0x4e,0xd2,0x00,0x00,0x00,0x00]
+v_sad_u16 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xb8,0xd2,0x01,0x00,0x05,0x03]
 
-v_subrev_i32_e64 v0, s[102:103], s0, s0
-// CHECK: [0x00,0x66,0x4e,0xd2,0x00,0x00,0x00,0x00]
+v_sad_u16 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xb8,0xd2,0x01,0x00,0xc1,0x03]
 
-v_subrev_i32_e64 v0, flat_scratch, s0, s0
-// CHECK: [0x00,0x68,0x4e,0xd2,0x00,0x00,0x00,0x00]
+v_sad_u16 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xb8,0xd2,0x01,0x00,0xdd,0x03]
 
-v_subrev_i32_e64 v0, vcc, s0, s0
-// CHECK: [0x00,0x6a,0x4e,0xd2,0x00,0x00,0x00,0x00]
+v_sad_u16 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xb8,0xd2,0x01,0x00,0x0d,0x04]
 
-v_subrev_i32_e64 v0, tba, s0, s0
-// CHECK: [0x00,0x6c,0x4e,0xd2,0x00,0x00,0x00,0x00]
+v_sad_u16 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xb8,0xd2,0x01,0x00,0xfd,0x07]
 
-v_subrev_i32_e64 v0, tma, s0, s0
-// CHECK: [0x00,0x6e,0x4e,0xd2,0x00,0x00,0x00,0x00]
+v_sad_u32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x01,0x00,0x01,0x02]
 
-v_subrev_i32_e64 v0, ttmp[10:11], s0, s0
-// CHECK: [0x00,0x7a,0x4e,0xd2,0x00,0x00,0x00,0x00]
+v_sad_u32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xba,0xd2,0x01,0x00,0x01,0x02]
 
-v_subrev_i32_e64 v0, s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x4e,0xd2,0x80,0x00,0x00,0x00]
+v_sad_u32 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x67,0x00,0x01,0x02]
 
-v_subrev_i32_e64 v0, s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x4e,0xd2,0xc1,0x00,0x00,0x00]
+v_sad_u32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x68,0x00,0x01,0x02]
 
-v_subrev_i32_e64 v0, s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x4e,0xd2,0xf0,0x00,0x00,0x00]
+v_sad_u32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x69,0x00,0x01,0x02]
 
-v_subrev_i32_e64 v0, s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x4e,0xd2,0xf7,0x00,0x00,0x00]
+v_sad_u32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x6a,0x00,0x01,0x02]
 
-v_subrev_i32_e64 v0, s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x4e,0xd2,0x00,0x01,0x00,0x00]
+v_sad_u32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x6b,0x00,0x01,0x02]
 
-v_subrev_i32_e64 v0, s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x4e,0xd2,0xff,0x01,0x00,0x00]
+v_sad_u32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x6c,0x00,0x01,0x02]
 
-v_subrev_i32_e64 v0, s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x4e,0xd2,0x00,0x00,0x01,0x00]
+v_sad_u32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x6d,0x00,0x01,0x02]
 
-v_subrev_i32_e64 v0, s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x4e,0xd2,0x00,0x82,0x01,0x00]
+v_sad_u32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x6e,0x00,0x01,0x02]
 
-v_subrev_i32_e64 v0, s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x4e,0xd2,0x00,0xe0,0x01,0x00]
+v_sad_u32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x6f,0x00,0x01,0x02]
 
-v_subrev_i32_e64 v0, s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x4e,0xd2,0x00,0xee,0x01,0x00]
+v_sad_u32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x7b,0x00,0x01,0x02]
 
-v_subrev_i32_e64 v0, s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x4e,0xd2,0x00,0x00,0x02,0x00]
+v_sad_u32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x7c,0x00,0x01,0x02]
 
-v_subrev_i32_e64 v0, s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x4e,0xd2,0x00,0xfe,0x03,0x00]
+v_sad_u32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x7e,0x00,0x01,0x02]
 
-v_addc_u32 v0, vcc, vcc_lo, v0, vcc
-// CHECK: [0x6a,0x00,0x00,0x50]
+v_sad_u32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x7f,0x00,0x01,0x02]
 
-v_addc_u32 v255, vcc, vcc_lo, v0, vcc
-// CHECK: [0x6a,0x00,0xfe,0x51]
+v_sad_u32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x80,0x00,0x01,0x02]
 
-v_addc_u32 v0, vcc, vcc_hi, v0, vcc
-// CHECK: [0x6b,0x00,0x00,0x50]
+v_sad_u32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0xc1,0x00,0x01,0x02]
 
-v_addc_u32 v0, vcc, 0, v0, vcc
-// CHECK: [0x80,0x00,0x00,0x50]
+v_sad_u32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0xf0,0x00,0x01,0x02]
 
-v_addc_u32 v0, vcc, -1, v0, vcc
-// CHECK: [0xc1,0x00,0x00,0x50]
+v_sad_u32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0xf7,0x00,0x01,0x02]
 
-v_addc_u32 v0, vcc, 0.5, v0, vcc
-// CHECK: [0xf0,0x00,0x00,0x50]
+v_sad_u32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x01,0x01,0x01,0x02]
 
-v_addc_u32 v0, vcc, -4.0, v0, vcc
-// CHECK: [0xf7,0x00,0x00,0x50]
+v_sad_u32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0xff,0x01,0x01,0x02]
 
-v_addc_u32 v0, vcc, v0, v0, vcc
-// CHECK: [0x00,0x01,0x00,0x50]
+v_sad_u32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x01,0x82,0x01,0x02]
 
-v_addc_u32 v0, vcc, v255, v0, vcc
-// CHECK: [0xff,0x01,0x00,0x50]
+v_sad_u32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x01,0xe0,0x01,0x02]
 
-v_addc_u32 v0, vcc, vcc_lo, v255, vcc
-// CHECK: [0x6a,0xfe,0x01,0x50]
+v_sad_u32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x01,0xee,0x01,0x02]
 
-v_addc_u32_e64 v0, s[0:1], s0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd2,0x00,0x00,0x00,0x00]
+v_sad_u32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x01,0x04,0x02,0x02]
 
-v_addc_u32_e64 v255, s[0:1], s0, s0, s[0:1]
-// CHECK: [0xff,0x00,0x50,0xd2,0x00,0x00,0x00,0x00]
+v_sad_u32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xba,0xd2,0x01,0xfe,0x03,0x02]
 
-v_addc_u32_e64 v0, s[2:3], s0, s0, s[0:1]
-// CHECK: [0x00,0x02,0x50,0xd2,0x00,0x00,0x00,0x00]
+v_sad_u32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xba,0xd2,0x01,0x00,0x05,0x03]
 
-v_addc_u32_e64 v0, s[102:103], s0, s0, s[0:1]
-// CHECK: [0x00,0x66,0x50,0xd2,0x00,0x00,0x00,0x00]
+v_sad_u32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xba,0xd2,0x01,0x00,0xc1,0x03]
 
-v_addc_u32_e64 v0, flat_scratch, s0, s0, s[0:1]
-// CHECK: [0x00,0x68,0x50,0xd2,0x00,0x00,0x00,0x00]
+v_sad_u32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xba,0xd2,0x01,0x00,0xdd,0x03]
 
-v_addc_u32_e64 v0, vcc, s0, s0, s[0:1]
-// CHECK: [0x00,0x6a,0x50,0xd2,0x00,0x00,0x00,0x00]
+v_sad_u32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xba,0xd2,0x01,0x00,0x0d,0x04]
 
-v_addc_u32_e64 v0, tba, s0, s0, s[0:1]
-// CHECK: [0x00,0x6c,0x50,0xd2,0x00,0x00,0x00,0x00]
+v_sad_u32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xba,0xd2,0x01,0x00,0xfd,0x07]
 
-v_addc_u32_e64 v0, tma, s0, s0, s[0:1]
-// CHECK: [0x00,0x6e,0x50,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_u8_f32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x01,0x00,0x01,0x02]
 
-v_addc_u32_e64 v0, ttmp[10:11], s0, s0, s[0:1]
-// CHECK: [0x00,0x7a,0x50,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_u8_f32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xbc,0xd2,0x01,0x00,0x01,0x02]
 
-v_addc_u32_e64 v0, s[0:1], 0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd2,0x80,0x00,0x00,0x00]
+v_cvt_pk_u8_f32 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x67,0x00,0x01,0x02]
 
-v_addc_u32_e64 v0, s[0:1], -1, s0, s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd2,0xc1,0x00,0x00,0x00]
+v_cvt_pk_u8_f32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x68,0x00,0x01,0x02]
 
-v_addc_u32_e64 v0, s[0:1], 0.5, s0, s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd2,0xf0,0x00,0x00,0x00]
+v_cvt_pk_u8_f32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x69,0x00,0x01,0x02]
 
-v_addc_u32_e64 v0, s[0:1], -4.0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd2,0xf7,0x00,0x00,0x00]
+v_cvt_pk_u8_f32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x6a,0x00,0x01,0x02]
 
-v_addc_u32_e64 v0, s[0:1], v0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd2,0x00,0x01,0x00,0x00]
+v_cvt_pk_u8_f32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x6b,0x00,0x01,0x02]
 
-v_addc_u32_e64 v0, s[0:1], v255, s0, s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd2,0xff,0x01,0x00,0x00]
+v_cvt_pk_u8_f32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x6c,0x00,0x01,0x02]
 
-v_addc_u32_e64 v0, s[0:1], s0, 0, s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd2,0x00,0x00,0x01,0x00]
+v_cvt_pk_u8_f32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x6d,0x00,0x01,0x02]
 
-v_addc_u32_e64 v0, s[0:1], s0, -1, s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd2,0x00,0x82,0x01,0x00]
+v_cvt_pk_u8_f32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x6e,0x00,0x01,0x02]
 
-v_addc_u32_e64 v0, s[0:1], s0, 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd2,0x00,0xe0,0x01,0x00]
+v_cvt_pk_u8_f32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x6f,0x00,0x01,0x02]
 
-v_addc_u32_e64 v0, s[0:1], s0, -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd2,0x00,0xee,0x01,0x00]
+v_cvt_pk_u8_f32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x7b,0x00,0x01,0x02]
 
-v_addc_u32_e64 v0, s[0:1], s0, v0, s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd2,0x00,0x00,0x02,0x00]
+v_cvt_pk_u8_f32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x7c,0x00,0x01,0x02]
 
-v_addc_u32_e64 v0, s[0:1], s0, v255, s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd2,0x00,0xfe,0x03,0x00]
+v_cvt_pk_u8_f32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x7e,0x00,0x01,0x02]
 
-v_subb_u32 v0, vcc, vcc_lo, v0, vcc
-// CHECK: [0x6a,0x00,0x00,0x52]
+v_cvt_pk_u8_f32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x7f,0x00,0x01,0x02]
 
-v_subb_u32 v255, vcc, vcc_lo, v0, vcc
-// CHECK: [0x6a,0x00,0xfe,0x53]
+v_cvt_pk_u8_f32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x80,0x00,0x01,0x02]
 
-v_subb_u32 v0, vcc, vcc_hi, v0, vcc
-// CHECK: [0x6b,0x00,0x00,0x52]
+v_cvt_pk_u8_f32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0xf0,0x00,0x01,0x02]
 
-v_subb_u32 v0, vcc, 0, v0, vcc
-// CHECK: [0x80,0x00,0x00,0x52]
+v_cvt_pk_u8_f32 v5, scc, 0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0xfd,0x00,0x01,0x02]
 
-v_subb_u32 v0, vcc, -1, v0, vcc
-// CHECK: [0xc1,0x00,0x00,0x52]
+v_cvt_pk_u8_f32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x01,0x01,0x01,0x02]
 
-v_subb_u32 v0, vcc, 0.5, v0, vcc
-// CHECK: [0xf0,0x00,0x00,0x52]
+v_cvt_pk_u8_f32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0xff,0x01,0x01,0x02]
 
-v_subb_u32 v0, vcc, -4.0, v0, vcc
-// CHECK: [0xf7,0x00,0x00,0x52]
+v_cvt_pk_u8_f32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x01,0x82,0x01,0x02]
 
-v_subb_u32 v0, vcc, v0, v0, vcc
-// CHECK: [0x00,0x01,0x00,0x52]
+v_cvt_pk_u8_f32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x01,0xe0,0x01,0x02]
 
-v_subb_u32 v0, vcc, v255, v0, vcc
-// CHECK: [0xff,0x01,0x00,0x52]
+v_cvt_pk_u8_f32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x01,0xee,0x01,0x02]
 
-v_subb_u32 v0, vcc, vcc_lo, v255, vcc
-// CHECK: [0x6a,0xfe,0x01,0x52]
+v_cvt_pk_u8_f32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x01,0x04,0x02,0x02]
 
-v_subb_u32_e64 v0, s[0:1], s0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_u8_f32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x01,0xfe,0x03,0x02]
 
-v_subb_u32_e64 v255, s[0:1], s0, s0, s[0:1]
-// CHECK: [0xff,0x00,0x52,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_u8_f32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xbc,0xd2,0x01,0x00,0x05,0x03]
 
-v_subb_u32_e64 v0, s[2:3], s0, s0, s[0:1]
-// CHECK: [0x00,0x02,0x52,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_u8_f32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xbc,0xd2,0x01,0x00,0xc1,0x03]
 
-v_subb_u32_e64 v0, s[102:103], s0, s0, s[0:1]
-// CHECK: [0x00,0x66,0x52,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_u8_f32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xbc,0xd2,0x01,0x00,0xdd,0x03]
 
-v_subb_u32_e64 v0, flat_scratch, s0, s0, s[0:1]
-// CHECK: [0x00,0x68,0x52,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_u8_f32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xbc,0xd2,0x01,0x00,0x0d,0x04]
 
-v_subb_u32_e64 v0, vcc, s0, s0, s[0:1]
-// CHECK: [0x00,0x6a,0x52,0xd2,0x00,0x00,0x00,0x00]
+v_cvt_pk_u8_f32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xbc,0xd2,0x01,0x00,0xfd,0x07]
 
-v_subb_u32_e64 v0, tba, s0, s0, s[0:1]
-// CHECK: [0x00,0x6c,0x52,0xd2,0x00,0x00,0x00,0x00]
+v_div_fixup_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x01,0x04,0x0e,0x04]
 
-v_subb_u32_e64 v0, tma, s0, s0, s[0:1]
-// CHECK: [0x00,0x6e,0x52,0xd2,0x00,0x00,0x00,0x00]
+v_div_fixup_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0xbe,0xd2,0x01,0x04,0x0e,0x04]
 
-v_subb_u32_e64 v0, ttmp[10:11], s0, s0, s[0:1]
-// CHECK: [0x00,0x7a,0x52,0xd2,0x00,0x00,0x00,0x00]
+v_div_fixup_f32 v5, s103, v2, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x67,0x04,0x0e,0x04]
 
-v_subb_u32_e64 v0, s[0:1], 0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd2,0x80,0x00,0x00,0x00]
+v_div_fixup_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x68,0x04,0x0e,0x04]
 
-v_subb_u32_e64 v0, s[0:1], -1, s0, s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd2,0xc1,0x00,0x00,0x00]
+v_div_fixup_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x69,0x04,0x0e,0x04]
 
-v_subb_u32_e64 v0, s[0:1], 0.5, s0, s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd2,0xf0,0x00,0x00,0x00]
+v_div_fixup_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x6a,0x04,0x0e,0x04]
 
-v_subb_u32_e64 v0, s[0:1], -4.0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd2,0xf7,0x00,0x00,0x00]
+v_div_fixup_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x6b,0x04,0x0e,0x04]
 
-v_subb_u32_e64 v0, s[0:1], v0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd2,0x00,0x01,0x00,0x00]
+v_div_fixup_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x6c,0x04,0x0e,0x04]
 
-v_subb_u32_e64 v0, s[0:1], v255, s0, s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd2,0xff,0x01,0x00,0x00]
+v_div_fixup_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x6d,0x04,0x0e,0x04]
 
-v_subb_u32_e64 v0, s[0:1], s0, 0, s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd2,0x00,0x00,0x01,0x00]
+v_div_fixup_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x6e,0x04,0x0e,0x04]
 
-v_subb_u32_e64 v0, s[0:1], s0, -1, s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd2,0x00,0x82,0x01,0x00]
+v_div_fixup_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x6f,0x04,0x0e,0x04]
 
-v_subb_u32_e64 v0, s[0:1], s0, 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd2,0x00,0xe0,0x01,0x00]
+v_div_fixup_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x7b,0x04,0x0e,0x04]
 
-v_subb_u32_e64 v0, s[0:1], s0, -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd2,0x00,0xee,0x01,0x00]
+v_div_fixup_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x7c,0x04,0x0e,0x04]
 
-v_subb_u32_e64 v0, s[0:1], s0, v0, s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd2,0x00,0x00,0x02,0x00]
+v_div_fixup_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x7e,0x04,0x0e,0x04]
 
-v_subb_u32_e64 v0, s[0:1], s0, v255, s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd2,0x00,0xfe,0x03,0x00]
+v_div_fixup_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x7f,0x04,0x0e,0x04]
 
-v_subbrev_u32 v0, vcc, vcc_lo, v0, vcc
-// CHECK: [0x6a,0x00,0x00,0x54]
+v_div_fixup_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0xfd,0x04,0x0e,0x04]
 
-v_subbrev_u32 v255, vcc, vcc_lo, v0, vcc
-// CHECK: [0x6a,0x00,0xfe,0x55]
+v_div_fixup_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x01,0x05,0x0e,0x04]
 
-v_subbrev_u32 v0, vcc, vcc_hi, v0, vcc
-// CHECK: [0x6b,0x00,0x00,0x54]
+v_div_fixup_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0xff,0x05,0x0e,0x04]
 
-v_subbrev_u32 v0, vcc, 0, v0, vcc
-// CHECK: [0x80,0x00,0x00,0x54]
+v_div_fixup_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x01,0xfe,0x0f,0x04]
 
-v_subbrev_u32 v0, vcc, -1, v0, vcc
-// CHECK: [0xc1,0x00,0x00,0x54]
+v_div_fixup_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0xbe,0xd2,0x01,0x04,0xfe,0x07]
 
-v_subbrev_u32 v0, vcc, 0.5, v0, vcc
-// CHECK: [0xf0,0x00,0x00,0x54]
+v_div_fixup_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x01,0x04,0x0e,0x24]
 
-v_subbrev_u32 v0, vcc, -4.0, v0, vcc
-// CHECK: [0xf7,0x00,0x00,0x54]
+v_div_fixup_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x01,0x04,0x0e,0x44]
 
-v_subbrev_u32 v0, vcc, v0, v0, vcc
-// CHECK: [0x00,0x01,0x00,0x54]
+v_div_fixup_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x01,0x04,0x0e,0x84]
 
-v_subbrev_u32 v0, vcc, v255, v0, vcc
-// CHECK: [0xff,0x01,0x00,0x54]
+v_div_fixup_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0xbe,0xd2,0x01,0x04,0x0e,0xe4]
 
-v_subbrev_u32 v0, vcc, vcc_lo, v255, vcc
-// CHECK: [0x6a,0xfe,0x01,0x54]
+v_div_fixup_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0xbe,0xd2,0x01,0x04,0x0e,0x04]
 
-v_subbrev_u32_e64 v0, s[0:1], s0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd2,0x00,0x00,0x00,0x00]
+v_div_fixup_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0xbe,0xd2,0x01,0x04,0x0e,0x04]
 
-v_subbrev_u32_e64 v255, s[0:1], s0, s0, s[0:1]
-// CHECK: [0xff,0x00,0x54,0xd2,0x00,0x00,0x00,0x00]
+v_div_fixup_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0xbe,0xd2,0x01,0x04,0x0e,0x04]
 
-v_subbrev_u32_e64 v0, s[2:3], s0, s0, s[0:1]
-// CHECK: [0x00,0x02,0x54,0xd2,0x00,0x00,0x00,0x00]
+v_div_fixup_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xbe,0xd2,0x01,0x04,0x0e,0x04]
 
-v_subbrev_u32_e64 v0, s[102:103], s0, s0, s[0:1]
-// CHECK: [0x00,0x66,0x54,0xd2,0x00,0x00,0x00,0x00]
+v_div_fixup_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x08,0xbe,0xd2,0x01,0x04,0x0e,0x04]
 
-v_subbrev_u32_e64 v0, flat_scratch, s0, s0, s[0:1]
-// CHECK: [0x00,0x68,0x54,0xd2,0x00,0x00,0x00,0x00]
+v_div_fixup_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0xbe,0xd2,0x01,0x04,0x0e,0x0c]
 
-v_subbrev_u32_e64 v0, vcc, s0, s0, s[0:1]
-// CHECK: [0x00,0x6a,0x54,0xd2,0x00,0x00,0x00,0x00]
+v_div_fixup_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0xbe,0xd2,0x01,0x04,0x0e,0x14]
 
-v_subbrev_u32_e64 v0, tba, s0, s0, s[0:1]
-// CHECK: [0x00,0x6c,0x54,0xd2,0x00,0x00,0x00,0x00]
+v_div_fixup_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0xbe,0xd2,0x01,0x04,0x0e,0x1c]
 
-v_subbrev_u32_e64 v0, tma, s0, s0, s[0:1]
-// CHECK: [0x00,0x6e,0x54,0xd2,0x00,0x00,0x00,0x00]
+v_div_fixup_f64 v[5:6], s[2:3], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xc0,0xd2,0x02,0x04,0x0e,0x04]
 
-v_subbrev_u32_e64 v0, ttmp[10:11], s0, s0, s[0:1]
-// CHECK: [0x00,0x7a,0x54,0xd2,0x00,0x00,0x00,0x00]
+v_div_fixup_f64 v[254:255], s[2:3], v[2:3], v[3:4]
+// CHECK: [0xfe,0x00,0xc0,0xd2,0x02,0x04,0x0e,0x04]
 
-v_subbrev_u32_e64 v0, s[0:1], 0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd2,0x80,0x00,0x00,0x00]
+v_div_fixup_f64 v[5:6], s[4:5], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xc0,0xd2,0x04,0x04,0x0e,0x04]
 
-v_subbrev_u32_e64 v0, s[0:1], -1, s0, s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd2,0xc1,0x00,0x00,0x00]
+v_div_fixup_f64 v[5:6], s[102:103], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xc0,0xd2,0x66,0x04,0x0e,0x04]
 
-v_subbrev_u32_e64 v0, s[0:1], 0.5, s0, s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd2,0xf0,0x00,0x00,0x00]
+v_div_fixup_f64 v[5:6], flat_scratch, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xc0,0xd2,0x68,0x04,0x0e,0x04]
 
-v_subbrev_u32_e64 v0, s[0:1], -4.0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd2,0xf7,0x00,0x00,0x00]
+v_div_fixup_f64 v[5:6], vcc, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xc0,0xd2,0x6a,0x04,0x0e,0x04]
 
-v_subbrev_u32_e64 v0, s[0:1], v0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd2,0x00,0x01,0x00,0x00]
+v_div_fixup_f64 v[5:6], tba, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xc0,0xd2,0x6c,0x04,0x0e,0x04]
 
-v_subbrev_u32_e64 v0, s[0:1], v255, s0, s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd2,0xff,0x01,0x00,0x00]
+v_div_fixup_f64 v[5:6], tma, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xc0,0xd2,0x6e,0x04,0x0e,0x04]
 
-v_subbrev_u32_e64 v0, s[0:1], s0, 0, s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd2,0x00,0x00,0x01,0x00]
+v_div_fixup_f64 v[5:6], ttmp[10:11], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xc0,0xd2,0x7a,0x04,0x0e,0x04]
 
-v_subbrev_u32_e64 v0, s[0:1], s0, -1, s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd2,0x00,0x82,0x01,0x00]
+v_div_fixup_f64 v[5:6], exec, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xc0,0xd2,0x7e,0x04,0x0e,0x04]
 
-v_subbrev_u32_e64 v0, s[0:1], s0, 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd2,0x00,0xe0,0x01,0x00]
+v_div_fixup_f64 v[5:6], scc, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xc0,0xd2,0xfd,0x04,0x0e,0x04]
 
-v_subbrev_u32_e64 v0, s[0:1], s0, -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd2,0x00,0xee,0x01,0x00]
+v_div_fixup_f64 v[5:6], v[1:2], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xc0,0xd2,0x01,0x05,0x0e,0x04]
 
-v_subbrev_u32_e64 v0, s[0:1], s0, v0, s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd2,0x00,0x00,0x02,0x00]
+v_div_fixup_f64 v[5:6], v[254:255], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xc0,0xd2,0xfe,0x05,0x0e,0x04]
 
-v_subbrev_u32_e64 v0, s[0:1], s0, v255, s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd2,0x00,0xfe,0x03,0x00]
+v_div_fixup_f64 v[5:6], s[2:3], v[254:255], v[3:4]
+// CHECK: [0x05,0x00,0xc0,0xd2,0x02,0xfc,0x0f,0x04]
 
-v_ldexp_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x56]
+v_div_fixup_f64 v[5:6], s[2:3], v[2:3], v[254:255]
+// CHECK: [0x05,0x00,0xc0,0xd2,0x02,0x04,0xfa,0x07]
 
-v_ldexp_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x57]
+v_div_fixup_f64 v[5:6], -s[2:3], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xc0,0xd2,0x02,0x04,0x0e,0x24]
 
-v_ldexp_f32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x56]
+v_div_fixup_f64 v[5:6], s[2:3], -v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xc0,0xd2,0x02,0x04,0x0e,0x44]
 
-v_ldexp_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x56]
+v_div_fixup_f64 v[5:6], s[2:3], v[2:3], -v[3:4]
+// CHECK: [0x05,0x00,0xc0,0xd2,0x02,0x04,0x0e,0x84]
 
-v_ldexp_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x56]
+v_div_fixup_f64 v[5:6], -s[2:3], -v[2:3], -v[3:4]
+// CHECK: [0x05,0x00,0xc0,0xd2,0x02,0x04,0x0e,0xe4]
 
-v_ldexp_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x56]
+v_div_fixup_f64 v[5:6], |s[2:3]|, v[2:3], v[3:4]
+// CHECK: [0x05,0x01,0xc0,0xd2,0x02,0x04,0x0e,0x04]
 
-v_ldexp_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x56]
+v_div_fixup_f64 v[5:6], s[2:3], |v[2:3]|, v[3:4]
+// CHECK: [0x05,0x02,0xc0,0xd2,0x02,0x04,0x0e,0x04]
 
-v_ldexp_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x56]
+v_div_fixup_f64 v[5:6], s[2:3], v[2:3], |v[3:4]|
+// CHECK: [0x05,0x04,0xc0,0xd2,0x02,0x04,0x0e,0x04]
 
-v_ldexp_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x56]
+v_div_fixup_f64 v[5:6], |s[2:3]|, |v[2:3]|, |v[3:4]|
+// CHECK: [0x05,0x07,0xc0,0xd2,0x02,0x04,0x0e,0x04]
 
-v_ldexp_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x56]
+v_div_fixup_f64 v[5:6], s[2:3], v[2:3], v[3:4] clamp
+// CHECK: [0x05,0x08,0xc0,0xd2,0x02,0x04,0x0e,0x04]
 
-v_ldexp_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x56]
+v_div_fixup_f64 v[5:6], s[2:3], v[2:3], v[3:4] mul:2
+// CHECK: [0x05,0x00,0xc0,0xd2,0x02,0x04,0x0e,0x0c]
 
-v_ldexp_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x56]
+v_div_fixup_f64 v[5:6], s[2:3], v[2:3], v[3:4] mul:4
+// CHECK: [0x05,0x00,0xc0,0xd2,0x02,0x04,0x0e,0x14]
 
-v_ldexp_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x56]
+v_div_fixup_f64 v[5:6], s[2:3], v[2:3], v[3:4] div:2
+// CHECK: [0x05,0x00,0xc0,0xd2,0x02,0x04,0x0e,0x1c]
 
-v_ldexp_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x56]
+v_lshl_b64 v[5:6], 0, s2
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0x04,0x00,0x00]
 
-v_ldexp_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x56]
+v_lshl_b64 v[254:255], 0, s2
+// CHECK: [0xfe,0x00,0xc2,0xd2,0x80,0x04,0x00,0x00]
 
-v_ldexp_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x56]
+v_lshl_b64 v[5:6], -1, s2
+// CHECK: [0x05,0x00,0xc2,0xd2,0xc1,0x04,0x00,0x00]
 
-v_ldexp_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x56]
+v_lshl_b64 v[5:6], 0.5, s2
+// CHECK: [0x05,0x00,0xc2,0xd2,0xf0,0x04,0x00,0x00]
 
-v_ldexp_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x56]
+v_lshl_b64 v[5:6], -4.0, s2
+// CHECK: [0x05,0x00,0xc2,0xd2,0xf7,0x04,0x00,0x00]
 
-v_ldexp_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x56]
+v_lshl_b64 v[5:6], v[1:2], s2
+// CHECK: [0x05,0x00,0xc2,0xd2,0x01,0x05,0x00,0x00]
 
-v_ldexp_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x56,0x56,0x34,0x12,0xaf]
+v_lshl_b64 v[5:6], v[254:255], s2
+// CHECK: [0x05,0x00,0xc2,0xd2,0xfe,0x05,0x00,0x00]
 
-v_ldexp_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x56,0x73,0x72,0x71,0x3f]
+v_lshl_b64 v[5:6], 0, s103
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0xce,0x00,0x00]
 
-v_ldexp_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x56]
+v_lshl_b64 v[5:6], 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0xd0,0x00,0x00]
 
-v_ldexp_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x56]
+v_lshl_b64 v[5:6], 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0xd2,0x00,0x00]
 
-v_ldexp_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x56]
+v_lshl_b64 v[5:6], 0, vcc_lo
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0xd4,0x00,0x00]
 
-v_ldexp_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x56,0xd2,0x00,0x00,0x00,0x00]
+v_lshl_b64 v[5:6], 0, vcc_hi
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0xd6,0x00,0x00]
 
-v_ldexp_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x56,0xd2,0x00,0x00,0x00,0x00]
+v_lshl_b64 v[5:6], 0, tba_lo
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0xd8,0x00,0x00]
 
-v_ldexp_f32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x56,0xd2,0x80,0x00,0x00,0x00]
+v_lshl_b64 v[5:6], 0, tba_hi
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0xda,0x00,0x00]
 
-v_ldexp_f32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x56,0xd2,0xf0,0x00,0x00,0x00]
+v_lshl_b64 v[5:6], 0, tma_lo
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0xdc,0x00,0x00]
 
-v_ldexp_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x56,0xd2,0xfd,0x00,0x00,0x00]
+v_lshl_b64 v[5:6], 0, tma_hi
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0xde,0x00,0x00]
 
-v_ldexp_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x56,0xd2,0x00,0x01,0x00,0x00]
+v_lshl_b64 v[5:6], 0, ttmp11
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0xf6,0x00,0x00]
 
-v_ldexp_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x56,0xd2,0xff,0x01,0x00,0x00]
+v_lshl_b64 v[5:6], 0, m0
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0xf8,0x00,0x00]
 
-v_ldexp_f32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x56,0xd2,0x00,0x00,0x01,0x00]
+v_lshl_b64 v[5:6], 0, exec_lo
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0xfc,0x00,0x00]
 
-v_ldexp_f32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x56,0xd2,0x00,0x82,0x01,0x00]
+v_lshl_b64 v[5:6], 0, exec_hi
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0xfe,0x00,0x00]
 
-v_ldexp_f32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x56,0xd2,0x00,0xe0,0x01,0x00]
+v_lshl_b64 v[5:6], 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0x00,0x01,0x00]
 
-v_ldexp_f32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x56,0xd2,0x00,0xee,0x01,0x00]
+v_lshl_b64 v[5:6], 0, -1
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0x82,0x01,0x00]
 
-v_ldexp_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x56,0xd2,0x00,0xfa,0x01,0x00]
+v_lshl_b64 v[5:6], 0, 0.5
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0xe0,0x01,0x00]
 
-v_ldexp_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x56,0xd2,0x00,0x00,0x02,0x00]
+v_lshl_b64 v[5:6], 0, -4.0
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0xee,0x01,0x00]
 
-v_ldexp_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x56,0xd2,0x00,0xfe,0x03,0x00]
+v_lshl_b64 v[5:6], 0, v2
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0x04,0x02,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x58]
+v_lshl_b64 v[5:6], 0, v255
+// CHECK: [0x05,0x00,0xc2,0xd2,0x80,0xfe,0x03,0x00]
 
-v_cvt_pkaccum_u8_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x59]
+v_lshr_b64 v[5:6], 0, s2
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0x04,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x58]
+v_lshr_b64 v[254:255], 0, s2
+// CHECK: [0xfe,0x00,0xc4,0xd2,0x80,0x04,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x58]
+v_lshr_b64 v[5:6], -1, s2
+// CHECK: [0x05,0x00,0xc4,0xd2,0xc1,0x04,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x58]
+v_lshr_b64 v[5:6], 0.5, s2
+// CHECK: [0x05,0x00,0xc4,0xd2,0xf0,0x04,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x58]
+v_lshr_b64 v[5:6], -4.0, s2
+// CHECK: [0x05,0x00,0xc4,0xd2,0xf7,0x04,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x58]
+v_lshr_b64 v[5:6], v[1:2], s2
+// CHECK: [0x05,0x00,0xc4,0xd2,0x01,0x05,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x58]
+v_lshr_b64 v[5:6], v[254:255], s2
+// CHECK: [0x05,0x00,0xc4,0xd2,0xfe,0x05,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x58]
+v_lshr_b64 v[5:6], 0, s103
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0xce,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x58]
+v_lshr_b64 v[5:6], 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0xd0,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x58]
+v_lshr_b64 v[5:6], 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0xd2,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x58]
+v_lshr_b64 v[5:6], 0, vcc_lo
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0xd4,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x58]
+v_lshr_b64 v[5:6], 0, vcc_hi
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0xd6,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x58]
+v_lshr_b64 v[5:6], 0, tba_lo
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0xd8,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x58]
+v_lshr_b64 v[5:6], 0, tba_hi
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0xda,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x58]
+v_lshr_b64 v[5:6], 0, tma_lo
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0xdc,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x58]
+v_lshr_b64 v[5:6], 0, tma_hi
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0xde,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x58]
+v_lshr_b64 v[5:6], 0, ttmp11
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0xf6,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x58]
+v_lshr_b64 v[5:6], 0, m0
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0xf8,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x58,0x56,0x34,0x12,0xaf]
+v_lshr_b64 v[5:6], 0, exec_lo
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0xfc,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x58,0x73,0x72,0x71,0x3f]
+v_lshr_b64 v[5:6], 0, exec_hi
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0xfe,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x58]
+v_lshr_b64 v[5:6], 0, 0
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0x00,0x01,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x58]
+v_lshr_b64 v[5:6], 0, -1
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0x82,0x01,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x58]
+v_lshr_b64 v[5:6], 0, 0.5
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0xe0,0x01,0x00]
 
-v_cvt_pkaccum_u8_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x58,0xd2,0x00,0x00,0x00,0x00]
+v_lshr_b64 v[5:6], 0, -4.0
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0xee,0x01,0x00]
 
-v_cvt_pkaccum_u8_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x58,0xd2,0x00,0x00,0x00,0x00]
+v_lshr_b64 v[5:6], 0, v2
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0x04,0x02,0x00]
 
-v_cvt_pkaccum_u8_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x58,0xd2,0xfd,0x00,0x00,0x00]
+v_lshr_b64 v[5:6], 0, v255
+// CHECK: [0x05,0x00,0xc4,0xd2,0x80,0xfe,0x03,0x00]
 
-v_cvt_pkaccum_u8_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x58,0xd2,0x00,0x01,0x00,0x00]
+v_ashr_i64 v[5:6], 0, s2
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0x04,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x58,0xd2,0xff,0x01,0x00,0x00]
+v_ashr_i64 v[254:255], 0, s2
+// CHECK: [0xfe,0x00,0xc6,0xd2,0x80,0x04,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x58,0xd2,0x00,0x00,0x01,0x00]
+v_ashr_i64 v[5:6], -1, s2
+// CHECK: [0x05,0x00,0xc6,0xd2,0xc1,0x04,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x58,0xd2,0x00,0x82,0x01,0x00]
+v_ashr_i64 v[5:6], 0.5, s2
+// CHECK: [0x05,0x00,0xc6,0xd2,0xf0,0x04,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x58,0xd2,0x00,0xe0,0x01,0x00]
+v_ashr_i64 v[5:6], -4.0, s2
+// CHECK: [0x05,0x00,0xc6,0xd2,0xf7,0x04,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x58,0xd2,0x00,0xee,0x01,0x00]
+v_ashr_i64 v[5:6], v[1:2], s2
+// CHECK: [0x05,0x00,0xc6,0xd2,0x01,0x05,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x58,0xd2,0x00,0xfa,0x01,0x00]
+v_ashr_i64 v[5:6], v[254:255], s2
+// CHECK: [0x05,0x00,0xc6,0xd2,0xfe,0x05,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x58,0xd2,0x00,0x00,0x02,0x00]
+v_ashr_i64 v[5:6], 0, s103
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0xce,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x58,0xd2,0x00,0xfe,0x03,0x00]
+v_ashr_i64 v[5:6], 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0xd0,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x58,0xd2,0x00,0x00,0x00,0x20]
+v_ashr_i64 v[5:6], 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0xd2,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x58,0xd2,0x00,0x00,0x00,0x00]
+v_ashr_i64 v[5:6], 0, vcc_lo
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0xd4,0x00,0x00]
 
-v_cvt_pknorm_i16_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x5a]
+v_ashr_i64 v[5:6], 0, vcc_hi
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0xd6,0x00,0x00]
 
-v_cvt_pknorm_i16_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x5b]
+v_ashr_i64 v[5:6], 0, tba_lo
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0xd8,0x00,0x00]
 
-v_cvt_pknorm_i16_f32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x5a]
+v_ashr_i64 v[5:6], 0, tba_hi
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0xda,0x00,0x00]
 
-v_cvt_pknorm_i16_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x5a]
+v_ashr_i64 v[5:6], 0, tma_lo
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0xdc,0x00,0x00]
 
-v_cvt_pknorm_i16_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x5a]
+v_ashr_i64 v[5:6], 0, tma_hi
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0xde,0x00,0x00]
 
-v_cvt_pknorm_i16_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x5a]
+v_ashr_i64 v[5:6], 0, ttmp11
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0xf6,0x00,0x00]
 
-v_cvt_pknorm_i16_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x5a]
+v_ashr_i64 v[5:6], 0, m0
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0xf8,0x00,0x00]
 
-v_cvt_pknorm_i16_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x5a]
+v_ashr_i64 v[5:6], 0, exec_lo
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0xfc,0x00,0x00]
 
-v_cvt_pknorm_i16_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x5a]
+v_ashr_i64 v[5:6], 0, exec_hi
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0xfe,0x00,0x00]
 
-v_cvt_pknorm_i16_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x5a]
+v_ashr_i64 v[5:6], 0, 0
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0x00,0x01,0x00]
 
-v_cvt_pknorm_i16_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x5a]
+v_ashr_i64 v[5:6], 0, -1
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0x82,0x01,0x00]
 
-v_cvt_pknorm_i16_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x5a]
+v_ashr_i64 v[5:6], 0, 0.5
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0xe0,0x01,0x00]
 
-v_cvt_pknorm_i16_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x5a]
+v_ashr_i64 v[5:6], 0, -4.0
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0xee,0x01,0x00]
 
-v_cvt_pknorm_i16_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x5a]
+v_ashr_i64 v[5:6], 0, v2
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0x04,0x02,0x00]
 
-v_cvt_pknorm_i16_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x5a]
+v_ashr_i64 v[5:6], 0, v255
+// CHECK: [0x05,0x00,0xc6,0xd2,0x80,0xfe,0x03,0x00]
 
-v_cvt_pknorm_i16_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x5a]
+v_add_f64 v[5:6], s[4:5], s[4:5]
+// CHECK: [0x05,0x00,0xc8,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_i16_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x5a]
+v_add_f64 v[254:255], s[4:5], s[4:5]
+// CHECK: [0xfe,0x00,0xc8,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_i16_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x5a]
+v_add_f64 v[5:6], v[1:2], s[4:5]
+// CHECK: [0x05,0x00,0xc8,0xd2,0x01,0x09,0x00,0x00]
 
-v_cvt_pknorm_i16_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x5a]
+v_add_f64 v[5:6], v[254:255], s[4:5]
+// CHECK: [0x05,0x00,0xc8,0xd2,0xfe,0x09,0x00,0x00]
 
-v_cvt_pknorm_i16_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x5a,0x56,0x34,0x12,0xaf]
+v_add_f64 v[5:6], s[4:5], v[2:3]
+// CHECK: [0x05,0x00,0xc8,0xd2,0x04,0x04,0x02,0x00]
 
-v_cvt_pknorm_i16_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x5a,0x73,0x72,0x71,0x3f]
+v_add_f64 v[5:6], s[4:5], v[254:255]
+// CHECK: [0x05,0x00,0xc8,0xd2,0x04,0xfc,0x03,0x00]
 
-v_cvt_pknorm_i16_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x5a]
+v_add_f64 v[5:6], -s[4:5], s[4:5]
+// CHECK: [0x05,0x00,0xc8,0xd2,0x04,0x08,0x00,0x20]
 
-v_cvt_pknorm_i16_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x5a]
+v_add_f64 v[5:6], s[4:5], -s[4:5]
+// CHECK: [0x05,0x00,0xc8,0xd2,0x04,0x08,0x00,0x40]
 
-v_cvt_pknorm_i16_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x5a]
+v_add_f64 v[5:6], -s[4:5], -s[4:5]
+// CHECK: [0x05,0x00,0xc8,0xd2,0x04,0x08,0x00,0x60]
 
-v_cvt_pknorm_i16_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x5a,0xd2,0x00,0x00,0x00,0x00]
+v_add_f64 v[5:6], |s[4:5]|, s[4:5]
+// CHECK: [0x05,0x01,0xc8,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_i16_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x5a,0xd2,0x00,0x00,0x00,0x00]
+v_add_f64 v[5:6], s[4:5], |s[4:5]|
+// CHECK: [0x05,0x02,0xc8,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_i16_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x5a,0xd2,0xfd,0x00,0x00,0x00]
+v_add_f64 v[5:6], |s[4:5]|, |s[4:5]|
+// CHECK: [0x05,0x03,0xc8,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_i16_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x5a,0xd2,0x00,0x01,0x00,0x00]
+v_add_f64 v[5:6], s[4:5], s[4:5] clamp
+// CHECK: [0x05,0x08,0xc8,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_i16_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x5a,0xd2,0xff,0x01,0x00,0x00]
+v_add_f64 v[5:6], s[4:5], s[4:5] mul:2
+// CHECK: [0x05,0x00,0xc8,0xd2,0x04,0x08,0x00,0x08]
 
-v_cvt_pknorm_i16_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x5a,0xd2,0x00,0xfa,0x01,0x00]
+v_add_f64 v[5:6], s[4:5], s[4:5] mul:4
+// CHECK: [0x05,0x00,0xc8,0xd2,0x04,0x08,0x00,0x10]
 
-v_cvt_pknorm_i16_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x5a,0xd2,0x00,0x00,0x02,0x00]
+v_add_f64 v[5:6], s[4:5], s[4:5] div:2
+// CHECK: [0x05,0x00,0xc8,0xd2,0x04,0x08,0x00,0x18]
 
-v_cvt_pknorm_i16_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x5a,0xd2,0x00,0xfe,0x03,0x00]
+v_mul_f64 v[5:6], s[4:5], s[4:5]
+// CHECK: [0x05,0x00,0xca,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_i16_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x5a,0xd2,0x00,0x00,0x00,0x20]
+v_mul_f64 v[254:255], s[4:5], s[4:5]
+// CHECK: [0xfe,0x00,0xca,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_i16_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x5a,0xd2,0x00,0x00,0x00,0x40]
+v_mul_f64 v[5:6], v[1:2], s[4:5]
+// CHECK: [0x05,0x00,0xca,0xd2,0x01,0x09,0x00,0x00]
 
-v_cvt_pknorm_i16_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x5a,0xd2,0x00,0x00,0x00,0x60]
+v_mul_f64 v[5:6], v[254:255], s[4:5]
+// CHECK: [0x05,0x00,0xca,0xd2,0xfe,0x09,0x00,0x00]
 
-v_cvt_pknorm_i16_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x5a,0xd2,0x00,0x00,0x00,0x00]
+v_mul_f64 v[5:6], s[4:5], v[2:3]
+// CHECK: [0x05,0x00,0xca,0xd2,0x04,0x04,0x02,0x00]
 
-v_cvt_pknorm_i16_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x5a,0xd2,0x00,0x00,0x00,0x00]
+v_mul_f64 v[5:6], s[4:5], v[254:255]
+// CHECK: [0x05,0x00,0xca,0xd2,0x04,0xfc,0x03,0x00]
 
-v_cvt_pknorm_i16_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x5a,0xd2,0x00,0x00,0x00,0x00]
+v_mul_f64 v[5:6], -s[4:5], s[4:5]
+// CHECK: [0x05,0x00,0xca,0xd2,0x04,0x08,0x00,0x20]
 
-v_cvt_pknorm_u16_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x5c]
+v_mul_f64 v[5:6], s[4:5], -s[4:5]
+// CHECK: [0x05,0x00,0xca,0xd2,0x04,0x08,0x00,0x40]
 
-v_cvt_pknorm_u16_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x5d]
+v_mul_f64 v[5:6], -s[4:5], -s[4:5]
+// CHECK: [0x05,0x00,0xca,0xd2,0x04,0x08,0x00,0x60]
 
-v_cvt_pknorm_u16_f32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x5c]
+v_mul_f64 v[5:6], |s[4:5]|, s[4:5]
+// CHECK: [0x05,0x01,0xca,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_u16_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x5c]
+v_mul_f64 v[5:6], s[4:5], |s[4:5]|
+// CHECK: [0x05,0x02,0xca,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_u16_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x5c]
+v_mul_f64 v[5:6], |s[4:5]|, |s[4:5]|
+// CHECK: [0x05,0x03,0xca,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_u16_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x5c]
+v_mul_f64 v[5:6], s[4:5], s[4:5] clamp
+// CHECK: [0x05,0x08,0xca,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_u16_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x5c]
+v_mul_f64 v[5:6], s[4:5], s[4:5] mul:2
+// CHECK: [0x05,0x00,0xca,0xd2,0x04,0x08,0x00,0x08]
 
-v_cvt_pknorm_u16_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x5c]
+v_mul_f64 v[5:6], s[4:5], s[4:5] mul:4
+// CHECK: [0x05,0x00,0xca,0xd2,0x04,0x08,0x00,0x10]
 
-v_cvt_pknorm_u16_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x5c]
+v_mul_f64 v[5:6], s[4:5], s[4:5] div:2
+// CHECK: [0x05,0x00,0xca,0xd2,0x04,0x08,0x00,0x18]
 
-v_cvt_pknorm_u16_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x5c]
+v_min_f64 v[5:6], s[4:5], s[4:5]
+// CHECK: [0x05,0x00,0xcc,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_u16_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x5c]
+v_min_f64 v[254:255], s[4:5], s[4:5]
+// CHECK: [0xfe,0x00,0xcc,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_u16_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x5c]
+v_min_f64 v[5:6], v[1:2], s[4:5]
+// CHECK: [0x05,0x00,0xcc,0xd2,0x01,0x09,0x00,0x00]
 
-v_cvt_pknorm_u16_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x5c]
+v_min_f64 v[5:6], v[254:255], s[4:5]
+// CHECK: [0x05,0x00,0xcc,0xd2,0xfe,0x09,0x00,0x00]
 
-v_cvt_pknorm_u16_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x5c]
+v_min_f64 v[5:6], s[4:5], v[2:3]
+// CHECK: [0x05,0x00,0xcc,0xd2,0x04,0x04,0x02,0x00]
 
-v_cvt_pknorm_u16_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x5c]
+v_min_f64 v[5:6], s[4:5], v[254:255]
+// CHECK: [0x05,0x00,0xcc,0xd2,0x04,0xfc,0x03,0x00]
 
-v_cvt_pknorm_u16_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x5c]
+v_min_f64 v[5:6], -s[4:5], s[4:5]
+// CHECK: [0x05,0x00,0xcc,0xd2,0x04,0x08,0x00,0x20]
 
-v_cvt_pknorm_u16_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x5c]
+v_min_f64 v[5:6], s[4:5], -s[4:5]
+// CHECK: [0x05,0x00,0xcc,0xd2,0x04,0x08,0x00,0x40]
 
-v_cvt_pknorm_u16_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x5c]
+v_min_f64 v[5:6], -s[4:5], -s[4:5]
+// CHECK: [0x05,0x00,0xcc,0xd2,0x04,0x08,0x00,0x60]
 
-v_cvt_pknorm_u16_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x5c]
+v_min_f64 v[5:6], |s[4:5]|, s[4:5]
+// CHECK: [0x05,0x01,0xcc,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_u16_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x5c,0x56,0x34,0x12,0xaf]
+v_min_f64 v[5:6], s[4:5], |s[4:5]|
+// CHECK: [0x05,0x02,0xcc,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_u16_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x5c,0x73,0x72,0x71,0x3f]
+v_min_f64 v[5:6], |s[4:5]|, |s[4:5]|
+// CHECK: [0x05,0x03,0xcc,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_u16_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x5c]
+v_min_f64 v[5:6], s[4:5], s[4:5] clamp
+// CHECK: [0x05,0x08,0xcc,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_u16_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x5c]
+v_min_f64 v[5:6], s[4:5], s[4:5] mul:2
+// CHECK: [0x05,0x00,0xcc,0xd2,0x04,0x08,0x00,0x08]
 
-v_cvt_pknorm_u16_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x5c]
+v_min_f64 v[5:6], s[4:5], s[4:5] mul:4
+// CHECK: [0x05,0x00,0xcc,0xd2,0x04,0x08,0x00,0x10]
 
-v_cvt_pknorm_u16_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x5c,0xd2,0x00,0x00,0x00,0x00]
+v_min_f64 v[5:6], s[4:5], s[4:5] div:2
+// CHECK: [0x05,0x00,0xcc,0xd2,0x04,0x08,0x00,0x18]
 
-v_cvt_pknorm_u16_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x5c,0xd2,0x00,0x00,0x00,0x00]
+v_max_f64 v[5:6], s[4:5], s[4:5]
+// CHECK: [0x05,0x00,0xce,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_u16_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x5c,0xd2,0xfd,0x00,0x00,0x00]
+v_max_f64 v[254:255], s[4:5], s[4:5]
+// CHECK: [0xfe,0x00,0xce,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_u16_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x5c,0xd2,0x00,0x01,0x00,0x00]
+v_max_f64 v[5:6], v[1:2], s[4:5]
+// CHECK: [0x05,0x00,0xce,0xd2,0x01,0x09,0x00,0x00]
 
-v_cvt_pknorm_u16_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x5c,0xd2,0xff,0x01,0x00,0x00]
+v_max_f64 v[5:6], v[254:255], s[4:5]
+// CHECK: [0x05,0x00,0xce,0xd2,0xfe,0x09,0x00,0x00]
 
-v_cvt_pknorm_u16_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x5c,0xd2,0x00,0xfa,0x01,0x00]
+v_max_f64 v[5:6], s[4:5], v[2:3]
+// CHECK: [0x05,0x00,0xce,0xd2,0x04,0x04,0x02,0x00]
 
-v_cvt_pknorm_u16_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x5c,0xd2,0x00,0x00,0x02,0x00]
+v_max_f64 v[5:6], s[4:5], v[254:255]
+// CHECK: [0x05,0x00,0xce,0xd2,0x04,0xfc,0x03,0x00]
 
-v_cvt_pknorm_u16_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x5c,0xd2,0x00,0xfe,0x03,0x00]
+v_max_f64 v[5:6], -s[4:5], s[4:5]
+// CHECK: [0x05,0x00,0xce,0xd2,0x04,0x08,0x00,0x20]
 
-v_cvt_pknorm_u16_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x5c,0xd2,0x00,0x00,0x00,0x20]
+v_max_f64 v[5:6], s[4:5], -s[4:5]
+// CHECK: [0x05,0x00,0xce,0xd2,0x04,0x08,0x00,0x40]
 
-v_cvt_pknorm_u16_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x5c,0xd2,0x00,0x00,0x00,0x40]
+v_max_f64 v[5:6], -s[4:5], -s[4:5]
+// CHECK: [0x05,0x00,0xce,0xd2,0x04,0x08,0x00,0x60]
 
-v_cvt_pknorm_u16_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x5c,0xd2,0x00,0x00,0x00,0x60]
+v_max_f64 v[5:6], |s[4:5]|, s[4:5]
+// CHECK: [0x05,0x01,0xce,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_u16_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x5c,0xd2,0x00,0x00,0x00,0x00]
+v_max_f64 v[5:6], s[4:5], |s[4:5]|
+// CHECK: [0x05,0x02,0xce,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_u16_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x5c,0xd2,0x00,0x00,0x00,0x00]
+v_max_f64 v[5:6], |s[4:5]|, |s[4:5]|
+// CHECK: [0x05,0x03,0xce,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pknorm_u16_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x5c,0xd2,0x00,0x00,0x00,0x00]
+v_max_f64 v[5:6], s[4:5], s[4:5] clamp
+// CHECK: [0x05,0x08,0xce,0xd2,0x04,0x08,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x5e]
+v_max_f64 v[5:6], s[4:5], s[4:5] mul:2
+// CHECK: [0x05,0x00,0xce,0xd2,0x04,0x08,0x00,0x08]
 
-v_cvt_pkrtz_f16_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x5f]
+v_max_f64 v[5:6], s[4:5], s[4:5] mul:4
+// CHECK: [0x05,0x00,0xce,0xd2,0x04,0x08,0x00,0x10]
 
-v_cvt_pkrtz_f16_f32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x5e]
+v_max_f64 v[5:6], s[4:5], s[4:5] div:2
+// CHECK: [0x05,0x00,0xce,0xd2,0x04,0x08,0x00,0x18]
 
-v_cvt_pkrtz_f16_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x5e]
+v_ldexp_f64 v[5:6], 0, s2
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0x04,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x5e]
+v_ldexp_f64 v[254:255], 0, s2
+// CHECK: [0xfe,0x00,0xd0,0xd2,0x80,0x04,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x5e]
+v_ldexp_f64 v[5:6], 0.5, s2
+// CHECK: [0x05,0x00,0xd0,0xd2,0xf0,0x04,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x5e]
+v_ldexp_f64 v[5:6], v[1:2], s2
+// CHECK: [0x05,0x00,0xd0,0xd2,0x01,0x05,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x5e]
+v_ldexp_f64 v[5:6], v[254:255], s2
+// CHECK: [0x05,0x00,0xd0,0xd2,0xfe,0x05,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x5e]
+v_ldexp_f64 v[5:6], 0, s103
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0xce,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x5e]
+v_ldexp_f64 v[5:6], 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0xd0,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x5e]
+v_ldexp_f64 v[5:6], 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0xd2,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x5e]
+v_ldexp_f64 v[5:6], 0, vcc_lo
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0xd4,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x5e]
+v_ldexp_f64 v[5:6], 0, vcc_hi
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0xd6,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x5e]
+v_ldexp_f64 v[5:6], 0, tba_lo
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0xd8,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x5e]
+v_ldexp_f64 v[5:6], 0, tba_hi
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0xda,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x5e]
+v_ldexp_f64 v[5:6], 0, tma_lo
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0xdc,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x5e]
+v_ldexp_f64 v[5:6], 0, tma_hi
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0xde,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x5e]
+v_ldexp_f64 v[5:6], 0, ttmp11
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0xf6,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x5e]
+v_ldexp_f64 v[5:6], 0, m0
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0xf8,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x5e,0x56,0x34,0x12,0xaf]
+v_ldexp_f64 v[5:6], 0, exec_lo
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0xfc,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x5e,0x73,0x72,0x71,0x3f]
+v_ldexp_f64 v[5:6], 0, exec_hi
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0xfe,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x5e]
+v_ldexp_f64 v[5:6], 0, 0
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0x00,0x01,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x5e]
+v_ldexp_f64 v[5:6], 0, -1
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0x82,0x01,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x5e]
+v_ldexp_f64 v[5:6], 0, 0.5
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0xe0,0x01,0x00]
 
-v_cvt_pkrtz_f16_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x5e,0xd2,0x00,0x00,0x00,0x00]
+v_ldexp_f64 v[5:6], 0, -4.0
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0xee,0x01,0x00]
 
-v_cvt_pkrtz_f16_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x5e,0xd2,0x00,0x00,0x00,0x00]
+v_ldexp_f64 v[5:6], 0, scc
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0xfa,0x01,0x00]
 
-v_cvt_pkrtz_f16_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x5e,0xd2,0xfd,0x00,0x00,0x00]
+v_ldexp_f64 v[5:6], 0, v2
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0x04,0x02,0x00]
 
-v_cvt_pkrtz_f16_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x5e,0xd2,0x00,0x01,0x00,0x00]
+v_ldexp_f64 v[5:6], 0, v255
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0xfe,0x03,0x00]
 
-v_cvt_pkrtz_f16_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x5e,0xd2,0xff,0x01,0x00,0x00]
+v_ldexp_f64 v[5:6], 0, s2 clamp
+// CHECK: [0x05,0x08,0xd0,0xd2,0x80,0x04,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x5e,0xd2,0x00,0xfa,0x01,0x00]
+v_ldexp_f64 v[5:6], 0, s2 mul:2
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0x04,0x00,0x08]
 
-v_cvt_pkrtz_f16_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x5e,0xd2,0x00,0x00,0x02,0x00]
+v_ldexp_f64 v[5:6], 0, s2 mul:4
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0x04,0x00,0x10]
 
-v_cvt_pkrtz_f16_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x5e,0xd2,0x00,0xfe,0x03,0x00]
+v_ldexp_f64 v[5:6], 0, s2 div:2
+// CHECK: [0x05,0x00,0xd0,0xd2,0x80,0x04,0x00,0x18]
 
-v_cvt_pkrtz_f16_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x5e,0xd2,0x00,0x00,0x00,0x20]
+v_mul_lo_u32 v5, 0, s2
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0x04,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x5e,0xd2,0x00,0x00,0x00,0x40]
+v_mul_lo_u32 v255, 0, s2
+// CHECK: [0xff,0x00,0xd2,0xd2,0x80,0x04,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x5e,0xd2,0x00,0x00,0x00,0x60]
+v_mul_lo_u32 v5, -1, s2
+// CHECK: [0x05,0x00,0xd2,0xd2,0xc1,0x04,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x5e,0xd2,0x00,0x00,0x00,0x00]
+v_mul_lo_u32 v5, 0.5, s2
+// CHECK: [0x05,0x00,0xd2,0xd2,0xf0,0x04,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x5e,0xd2,0x00,0x00,0x00,0x00]
+v_mul_lo_u32 v5, -4.0, s2
+// CHECK: [0x05,0x00,0xd2,0xd2,0xf7,0x04,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x5e,0xd2,0x00,0x00,0x00,0x00]
+v_mul_lo_u32 v5, v1, s2
+// CHECK: [0x05,0x00,0xd2,0xd2,0x01,0x05,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x60]
+v_mul_lo_u32 v5, v255, s2
+// CHECK: [0x05,0x00,0xd2,0xd2,0xff,0x05,0x00,0x00]
 
-v_cvt_pk_u16_u32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x61]
+v_mul_lo_u32 v5, 0, s103
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0xce,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x60]
+v_mul_lo_u32 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0xd0,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x60]
+v_mul_lo_u32 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0xd2,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x60]
+v_mul_lo_u32 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0xd4,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x60]
+v_mul_lo_u32 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0xd6,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x60]
+v_mul_lo_u32 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0xd8,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x60]
+v_mul_lo_u32 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0xda,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x60]
+v_mul_lo_u32 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0xdc,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x60]
+v_mul_lo_u32 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0xde,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x60]
+v_mul_lo_u32 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0xf6,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x60]
+v_mul_lo_u32 v5, 0, m0
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0xf8,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x60]
+v_mul_lo_u32 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0xfc,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x60]
+v_mul_lo_u32 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0xfe,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x60]
+v_mul_lo_u32 v5, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0x00,0x01,0x00]
 
-v_cvt_pk_u16_u32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x60]
+v_mul_lo_u32 v5, 0, -1
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0x82,0x01,0x00]
 
-v_cvt_pk_u16_u32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x60]
+v_mul_lo_u32 v5, 0, 0.5
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0xe0,0x01,0x00]
 
-v_cvt_pk_u16_u32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x60]
+v_mul_lo_u32 v5, 0, -4.0
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0xee,0x01,0x00]
 
-v_cvt_pk_u16_u32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x60]
+v_mul_lo_u32 v5, 0, v2
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0x04,0x02,0x00]
 
-v_cvt_pk_u16_u32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x60,0x56,0x34,0x12,0xaf]
+v_mul_lo_u32 v5, 0, v255
+// CHECK: [0x05,0x00,0xd2,0xd2,0x80,0xfe,0x03,0x00]
 
-v_cvt_pk_u16_u32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x60,0x73,0x72,0x71,0x3f]
+v_mul_hi_u32 v5, 0, s2
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0x04,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x60]
+v_mul_hi_u32 v255, 0, s2
+// CHECK: [0xff,0x00,0xd4,0xd2,0x80,0x04,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x60]
+v_mul_hi_u32 v5, -1, s2
+// CHECK: [0x05,0x00,0xd4,0xd2,0xc1,0x04,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x60]
+v_mul_hi_u32 v5, 0.5, s2
+// CHECK: [0x05,0x00,0xd4,0xd2,0xf0,0x04,0x00,0x00]
 
-v_cvt_pk_u16_u32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x60,0xd2,0x00,0x00,0x00,0x00]
+v_mul_hi_u32 v5, -4.0, s2
+// CHECK: [0x05,0x00,0xd4,0xd2,0xf7,0x04,0x00,0x00]
 
-v_cvt_pk_u16_u32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x60,0xd2,0x00,0x00,0x00,0x00]
+v_mul_hi_u32 v5, v1, s2
+// CHECK: [0x05,0x00,0xd4,0xd2,0x01,0x05,0x00,0x00]
 
-v_cvt_pk_u16_u32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x60,0xd2,0x80,0x00,0x00,0x00]
+v_mul_hi_u32 v5, v255, s2
+// CHECK: [0x05,0x00,0xd4,0xd2,0xff,0x05,0x00,0x00]
 
-v_cvt_pk_u16_u32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x60,0xd2,0xc1,0x00,0x00,0x00]
+v_mul_hi_u32 v5, 0, s103
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0xce,0x00,0x00]
 
-v_cvt_pk_u16_u32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x60,0xd2,0xf0,0x00,0x00,0x00]
+v_mul_hi_u32 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0xd0,0x00,0x00]
 
-v_cvt_pk_u16_u32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x60,0xd2,0xf7,0x00,0x00,0x00]
+v_mul_hi_u32 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0xd2,0x00,0x00]
 
-v_cvt_pk_u16_u32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x60,0xd2,0x00,0x01,0x00,0x00]
+v_mul_hi_u32 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0xd4,0x00,0x00]
 
-v_cvt_pk_u16_u32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x60,0xd2,0xff,0x01,0x00,0x00]
+v_mul_hi_u32 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0xd6,0x00,0x00]
 
-v_cvt_pk_u16_u32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x60,0xd2,0x00,0x00,0x01,0x00]
+v_mul_hi_u32 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0xd8,0x00,0x00]
 
-v_cvt_pk_u16_u32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x60,0xd2,0x00,0x82,0x01,0x00]
+v_mul_hi_u32 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0xda,0x00,0x00]
 
-v_cvt_pk_u16_u32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x60,0xd2,0x00,0xe0,0x01,0x00]
+v_mul_hi_u32 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0xdc,0x00,0x00]
 
-v_cvt_pk_u16_u32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x60,0xd2,0x00,0xee,0x01,0x00]
+v_mul_hi_u32 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0xde,0x00,0x00]
 
-v_cvt_pk_u16_u32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x60,0xd2,0x00,0x00,0x02,0x00]
+v_mul_hi_u32 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0xf6,0x00,0x00]
 
-v_cvt_pk_u16_u32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x60,0xd2,0x00,0xfe,0x03,0x00]
+v_mul_hi_u32 v5, 0, m0
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0xf8,0x00,0x00]
 
-v_cvt_pk_i16_i32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x62]
+v_mul_hi_u32 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0xfc,0x00,0x00]
 
-v_cvt_pk_i16_i32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x63]
+v_mul_hi_u32 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0xfe,0x00,0x00]
 
-v_cvt_pk_i16_i32 v0, s103, v0
-// CHECK: [0x67,0x00,0x00,0x62]
+v_mul_hi_u32 v5, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0x00,0x01,0x00]
 
-v_cvt_pk_i16_i32 v0, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x62]
+v_mul_hi_u32 v5, 0, -1
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0x82,0x01,0x00]
 
-v_cvt_pk_i16_i32 v0, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x62]
+v_mul_hi_u32 v5, 0, 0.5
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0xe0,0x01,0x00]
 
-v_cvt_pk_i16_i32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x62]
+v_mul_hi_u32 v5, 0, -4.0
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0xee,0x01,0x00]
 
-v_cvt_pk_i16_i32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x62]
+v_mul_hi_u32 v5, 0, v2
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0x04,0x02,0x00]
 
-v_cvt_pk_i16_i32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x62]
+v_mul_hi_u32 v5, 0, v255
+// CHECK: [0x05,0x00,0xd4,0xd2,0x80,0xfe,0x03,0x00]
 
-v_cvt_pk_i16_i32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x62]
+v_mul_lo_i32 v5, 0, s2
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0x04,0x00,0x00]
 
-v_cvt_pk_i16_i32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x62]
+v_mul_lo_i32 v255, 0, s2
+// CHECK: [0xff,0x00,0xd6,0xd2,0x80,0x04,0x00,0x00]
 
-v_cvt_pk_i16_i32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x62]
+v_mul_lo_i32 v5, -1, s2
+// CHECK: [0x05,0x00,0xd6,0xd2,0xc1,0x04,0x00,0x00]
 
-v_cvt_pk_i16_i32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x62]
+v_mul_lo_i32 v5, 0.5, s2
+// CHECK: [0x05,0x00,0xd6,0xd2,0xf0,0x04,0x00,0x00]
 
-v_cvt_pk_i16_i32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x62]
+v_mul_lo_i32 v5, -4.0, s2
+// CHECK: [0x05,0x00,0xd6,0xd2,0xf7,0x04,0x00,0x00]
 
-v_cvt_pk_i16_i32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x62]
+v_mul_lo_i32 v5, v1, s2
+// CHECK: [0x05,0x00,0xd6,0xd2,0x01,0x05,0x00,0x00]
 
-v_cvt_pk_i16_i32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x62]
+v_mul_lo_i32 v5, v255, s2
+// CHECK: [0x05,0x00,0xd6,0xd2,0xff,0x05,0x00,0x00]
 
-v_cvt_pk_i16_i32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x62]
+v_mul_lo_i32 v5, 0, s103
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0xce,0x00,0x00]
 
-v_cvt_pk_i16_i32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x62]
+v_mul_lo_i32 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0xd0,0x00,0x00]
 
-v_cvt_pk_i16_i32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x62]
+v_mul_lo_i32 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0xd2,0x00,0x00]
 
-v_cvt_pk_i16_i32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x62]
+v_mul_lo_i32 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0xd4,0x00,0x00]
 
-v_cvt_pk_i16_i32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x62,0x56,0x34,0x12,0xaf]
+v_mul_lo_i32 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0xd6,0x00,0x00]
 
-v_cvt_pk_i16_i32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x62,0x73,0x72,0x71,0x3f]
+v_mul_lo_i32 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0xd8,0x00,0x00]
 
-v_cvt_pk_i16_i32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x62]
+v_mul_lo_i32 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0xda,0x00,0x00]
 
-v_cvt_pk_i16_i32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x62]
+v_mul_lo_i32 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0xdc,0x00,0x00]
 
-v_cvt_pk_i16_i32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x62]
+v_mul_lo_i32 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0xde,0x00,0x00]
 
-v_cvt_pk_i16_i32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x62,0xd2,0x00,0x00,0x00,0x00]
+v_mul_lo_i32 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0xf6,0x00,0x00]
 
-v_cvt_pk_i16_i32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x62,0xd2,0x00,0x00,0x00,0x00]
+v_mul_lo_i32 v5, 0, m0
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0xf8,0x00,0x00]
 
-v_cvt_pk_i16_i32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x62,0xd2,0x80,0x00,0x00,0x00]
+v_mul_lo_i32 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0xfc,0x00,0x00]
 
-v_cvt_pk_i16_i32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x62,0xd2,0xc1,0x00,0x00,0x00]
+v_mul_lo_i32 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0xfe,0x00,0x00]
 
-v_cvt_pk_i16_i32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x62,0xd2,0xf0,0x00,0x00,0x00]
+v_mul_lo_i32 v5, 0, 0
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0x00,0x01,0x00]
 
-v_cvt_pk_i16_i32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x62,0xd2,0xf7,0x00,0x00,0x00]
+v_mul_lo_i32 v5, 0, -1
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0x82,0x01,0x00]
 
-v_cvt_pk_i16_i32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x62,0xd2,0x00,0x01,0x00,0x00]
+v_mul_lo_i32 v5, 0, 0.5
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0xe0,0x01,0x00]
 
-v_cvt_pk_i16_i32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x62,0xd2,0xff,0x01,0x00,0x00]
+v_mul_lo_i32 v5, 0, -4.0
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0xee,0x01,0x00]
 
-v_cvt_pk_i16_i32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x62,0xd2,0x00,0x00,0x01,0x00]
+v_mul_lo_i32 v5, 0, v2
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0x04,0x02,0x00]
 
-v_cvt_pk_i16_i32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x62,0xd2,0x00,0x82,0x01,0x00]
+v_mul_lo_i32 v5, 0, v255
+// CHECK: [0x05,0x00,0xd6,0xd2,0x80,0xfe,0x03,0x00]
 
-v_cvt_pk_i16_i32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x62,0xd2,0x00,0xe0,0x01,0x00]
+v_mul_hi_i32 v5, 0, s2
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0x04,0x00,0x00]
 
-v_cvt_pk_i16_i32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x62,0xd2,0x00,0xee,0x01,0x00]
+v_mul_hi_i32 v255, 0, s2
+// CHECK: [0xff,0x00,0xd8,0xd2,0x80,0x04,0x00,0x00]
 
-v_cvt_pk_i16_i32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x62,0xd2,0x00,0x00,0x02,0x00]
+v_mul_hi_i32 v5, -1, s2
+// CHECK: [0x05,0x00,0xd8,0xd2,0xc1,0x04,0x00,0x00]
 
-v_cvt_pk_i16_i32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x62,0xd2,0x00,0xfe,0x03,0x00]
+v_mul_hi_i32 v5, 0.5, s2
+// CHECK: [0x05,0x00,0xd8,0xd2,0xf0,0x04,0x00,0x00]
 
-v_mad_legacy_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0x00,0x00]
+v_mul_hi_i32 v5, -4.0, s2
+// CHECK: [0x05,0x00,0xd8,0xd2,0xf7,0x04,0x00,0x00]
 
-v_mad_legacy_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0x80,0xd2,0x00,0x00,0x00,0x00]
+v_mul_hi_i32 v5, v1, s2
+// CHECK: [0x05,0x00,0xd8,0xd2,0x01,0x05,0x00,0x00]
 
-v_mad_legacy_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0x80,0xd2,0xfd,0x00,0x00,0x00]
+v_mul_hi_i32 v5, v255, s2
+// CHECK: [0x05,0x00,0xd8,0xd2,0xff,0x05,0x00,0x00]
 
-v_mad_legacy_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x01,0x00,0x00]
+v_mul_hi_i32 v5, 0, s103
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0xce,0x00,0x00]
 
-v_mad_legacy_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0x80,0xd2,0xff,0x01,0x00,0x00]
+v_mul_hi_i32 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0xd0,0x00,0x00]
 
-v_mad_legacy_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0xfa,0x01,0x00]
+v_mul_hi_i32 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0xd2,0x00,0x00]
 
-v_mad_legacy_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0x02,0x00]
+v_mul_hi_i32 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0xd4,0x00,0x00]
 
-v_mad_legacy_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0xfe,0x03,0x00]
+v_mul_hi_i32 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0xd6,0x00,0x00]
 
-v_mad_legacy_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0xf4,0x03]
+v_mul_hi_i32 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0xd8,0x00,0x00]
 
-v_mad_legacy_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0x00,0x04]
+v_mul_hi_i32 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0xda,0x00,0x00]
 
-v_mad_legacy_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0xfc,0x07]
+v_mul_hi_i32 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0xdc,0x00,0x00]
 
-v_mad_legacy_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0x00,0x20]
+v_mul_hi_i32 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0xde,0x00,0x00]
 
-v_mad_legacy_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0x00,0x40]
+v_mul_hi_i32 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0xf6,0x00,0x00]
 
-v_mad_legacy_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0x00,0x80]
+v_mul_hi_i32 v5, 0, m0
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0xf8,0x00,0x00]
 
-v_mad_legacy_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0x00,0xe0]
+v_mul_hi_i32 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0xfc,0x00,0x00]
 
-v_mad_legacy_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0x80,0xd2,0x00,0x00,0x00,0x00]
+v_mul_hi_i32 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0xfe,0x00,0x00]
 
-v_mad_legacy_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0x80,0xd2,0x00,0x00,0x00,0x00]
+v_mul_hi_i32 v5, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0x00,0x01,0x00]
 
-v_mad_legacy_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0x80,0xd2,0x00,0x00,0x00,0x00]
+v_mul_hi_i32 v5, 0, -1
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0x82,0x01,0x00]
 
-v_mad_legacy_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0x80,0xd2,0x00,0x00,0x00,0x00]
+v_mul_hi_i32 v5, 0, 0.5
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0xe0,0x01,0x00]
 
-v_mad_legacy_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x08,0x80,0xd2,0x00,0x00,0x00,0x00]
+v_mul_hi_i32 v5, 0, -4.0
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0xee,0x01,0x00]
 
-v_mad_legacy_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0x00,0x08]
+v_mul_hi_i32 v5, 0, v2
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0x04,0x02,0x00]
 
-v_mad_legacy_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0x00,0x10]
+v_mul_hi_i32 v5, 0, v255
+// CHECK: [0x05,0x00,0xd8,0xd2,0x80,0xfe,0x03,0x00]
 
-v_mad_legacy_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0x00,0x18]
+v_div_scale_f32 v5, vcc, s1, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x01,0x00,0x01,0x02]
 
-v_mad_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0x00,0x00]
+v_div_scale_f32 v255, vcc, s1, 0, 0
+// CHECK: [0xff,0x6a,0xda,0xd2,0x01,0x00,0x01,0x02]
 
-v_mad_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0x82,0xd2,0x00,0x00,0x00,0x00]
+v_div_scale_f32 v5, vcc, s103, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x67,0x00,0x01,0x02]
 
-v_mad_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0x82,0xd2,0xfd,0x00,0x00,0x00]
+v_div_scale_f32 v5, vcc, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x68,0x00,0x01,0x02]
 
-v_mad_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x01,0x00,0x00]
+v_div_scale_f32 v5, vcc, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x69,0x00,0x01,0x02]
 
-v_mad_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0x82,0xd2,0xff,0x01,0x00,0x00]
+v_div_scale_f32 v5, vcc, vcc_lo, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x6a,0x00,0x01,0x02]
 
-v_mad_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0xfa,0x01,0x00]
+v_div_scale_f32 v5, vcc, vcc_hi, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x6b,0x00,0x01,0x02]
 
-v_mad_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0x02,0x00]
+v_div_scale_f32 v5, vcc, tba_lo, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x6c,0x00,0x01,0x02]
 
-v_mad_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0xfe,0x03,0x00]
+v_div_scale_f32 v5, vcc, tba_hi, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x6d,0x00,0x01,0x02]
 
-v_mad_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0xf4,0x03]
+v_div_scale_f32 v5, vcc, tma_lo, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x6e,0x00,0x01,0x02]
 
-v_mad_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0x00,0x04]
+v_div_scale_f32 v5, vcc, tma_hi, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x6f,0x00,0x01,0x02]
 
-v_mad_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0xfc,0x07]
+v_div_scale_f32 v5, vcc, ttmp11, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x7b,0x00,0x01,0x02]
 
-v_mad_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0x00,0x20]
+v_div_scale_f32 v5, vcc, m0, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x7c,0x00,0x01,0x02]
 
-v_mad_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0x00,0x40]
+v_div_scale_f32 v5, vcc, exec_lo, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x7e,0x00,0x01,0x02]
 
-v_mad_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0x00,0x80]
+v_div_scale_f32 v5, vcc, exec_hi, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x7f,0x00,0x01,0x02]
 
-v_mad_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0x00,0xe0]
+v_div_scale_f32 v5, vcc, 0, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x80,0x00,0x01,0x02]
 
-v_mad_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0x82,0xd2,0x00,0x00,0x00,0x00]
+v_div_scale_f32 v5, vcc, -1, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0xc1,0x00,0x01,0x02]
 
-v_mad_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0x82,0xd2,0x00,0x00,0x00,0x00]
+v_div_scale_f32 v5, vcc, 0.5, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0xf0,0x00,0x01,0x02]
 
-v_mad_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0x82,0xd2,0x00,0x00,0x00,0x00]
+v_div_scale_f32 v5, vcc, -4.0, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0xf7,0x00,0x01,0x02]
 
-v_mad_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0x82,0xd2,0x00,0x00,0x00,0x00]
+v_div_scale_f32 v5, vcc, v1, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x01,0x01,0x01,0x02]
 
-v_mad_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x08,0x82,0xd2,0x00,0x00,0x00,0x00]
+v_div_scale_f32 v5, vcc, v255, 0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0xff,0x01,0x01,0x02]
 
-v_mad_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0x00,0x08]
+v_div_scale_f32 v5, vcc, s1, -1, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x01,0x82,0x01,0x02]
 
-v_mad_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0x00,0x10]
+v_div_scale_f32 v5, vcc, s1, 0.5, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x01,0xe0,0x01,0x02]
 
-v_mad_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0x00,0x18]
+v_div_scale_f32 v5, vcc, s1, -4.0, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x01,0xee,0x01,0x02]
 
-v_mad_i32_i24 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x00,0x00,0x00]
+v_div_scale_f32 v5, vcc, s1, v2, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x01,0x04,0x02,0x02]
 
-v_mad_i32_i24 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0x84,0xd2,0x00,0x00,0x00,0x00]
+v_div_scale_f32 v5, vcc, s1, v255, 0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x01,0xfe,0x03,0x02]
 
-v_mad_i32_i24 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0x84,0xd2,0x80,0x00,0x00,0x00]
+v_div_scale_f32 v5, vcc, s1, 0, -1
+// CHECK: [0x05,0x6a,0xda,0xd2,0x01,0x00,0x05,0x03]
 
-v_mad_i32_i24 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0x84,0xd2,0xc1,0x00,0x00,0x00]
+v_div_scale_f32 v5, vcc, s1, 0, 0.5
+// CHECK: [0x05,0x6a,0xda,0xd2,0x01,0x00,0xc1,0x03]
 
-v_mad_i32_i24 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0x84,0xd2,0xf0,0x00,0x00,0x00]
+v_div_scale_f32 v5, vcc, s1, 0, -4.0
+// CHECK: [0x05,0x6a,0xda,0xd2,0x01,0x00,0xdd,0x03]
 
-v_mad_i32_i24 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0x84,0xd2,0xf7,0x00,0x00,0x00]
+v_div_scale_f32 v5, vcc, s1, 0, v3
+// CHECK: [0x05,0x6a,0xda,0xd2,0x01,0x00,0x0d,0x04]
 
-v_mad_i32_i24 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x01,0x00,0x00]
+v_div_scale_f32 v5, vcc, s1, 0, v255
+// CHECK: [0x05,0x6a,0xda,0xd2,0x01,0x00,0xfd,0x07]
 
-v_mad_i32_i24 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0x84,0xd2,0xff,0x01,0x00,0x00]
+v_div_scale_f64 v[5:6], vcc, s[2:3], 0, 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x02,0x00,0x01,0x02]
 
-v_mad_i32_i24 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x00,0x01,0x00]
+v_div_scale_f64 v[254:255], vcc, s[2:3], 0, 0
+// CHECK: [0xfe,0x6a,0xdc,0xd2,0x02,0x00,0x01,0x02]
 
-v_mad_i32_i24 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x82,0x01,0x00]
+v_div_scale_f64 v[5:6], vcc, s[4:5], 0, 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x04,0x00,0x01,0x02]
 
-v_mad_i32_i24 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0xe0,0x01,0x00]
+v_div_scale_f64 v[5:6], vcc, s[102:103], 0, 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x66,0x00,0x01,0x02]
 
-v_mad_i32_i24 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0xee,0x01,0x00]
+v_div_scale_f64 v[5:6], vcc, flat_scratch, 0, 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x68,0x00,0x01,0x02]
 
-v_mad_i32_i24 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x00,0x02,0x00]
+v_div_scale_f64 v[5:6], vcc, vcc, 0, 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x6a,0x00,0x01,0x02]
 
-v_mad_i32_i24 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0xfe,0x03,0x00]
+v_div_scale_f64 v[5:6], vcc, tba, 0, 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x6c,0x00,0x01,0x02]
 
-v_mad_i32_i24 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x00,0x00,0x02]
+v_div_scale_f64 v[5:6], vcc, tma, 0, 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x6e,0x00,0x01,0x02]
 
-v_mad_i32_i24 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x00,0x04,0x03]
+v_div_scale_f64 v[5:6], vcc, ttmp[10:11], 0, 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x7a,0x00,0x01,0x02]
 
-v_mad_i32_i24 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x00,0xc0,0x03]
+v_div_scale_f64 v[5:6], vcc, exec, 0, 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x7e,0x00,0x01,0x02]
 
-v_mad_i32_i24 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x00,0xdc,0x03]
+v_div_scale_f64 v[5:6], vcc, 0, 0, 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x80,0x00,0x01,0x02]
 
-v_mad_i32_i24 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x00,0x00,0x04]
+v_div_scale_f64 v[5:6], vcc, -1, 0, 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0xc1,0x00,0x01,0x02]
 
-v_mad_i32_i24 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x00,0xfc,0x07]
+v_div_scale_f64 v[5:6], vcc, 0.5, 0, 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0xf0,0x00,0x01,0x02]
 
-v_mad_u32_u24 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0x00,0x00,0x00]
+v_div_scale_f64 v[5:6], vcc, -4.0, 0, 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0xf7,0x00,0x01,0x02]
 
-v_mad_u32_u24 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0x86,0xd2,0x00,0x00,0x00,0x00]
+v_div_scale_f64 v[5:6], vcc, v[1:2], 0, 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x01,0x01,0x01,0x02]
 
-v_mad_u32_u24 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0x80,0x00,0x00,0x00]
+v_div_scale_f64 v[5:6], vcc, v[254:255], 0, 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0xfe,0x01,0x01,0x02]
 
-v_mad_u32_u24 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0xc1,0x00,0x00,0x00]
+v_div_scale_f64 v[5:6], vcc, s[2:3], -1, 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x02,0x82,0x01,0x02]
 
-v_mad_u32_u24 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0xf0,0x00,0x00,0x00]
+v_div_scale_f64 v[5:6], vcc, s[2:3], 0.5, 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x02,0xe0,0x01,0x02]
 
-v_mad_u32_u24 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0xf7,0x00,0x00,0x00]
+v_div_scale_f64 v[5:6], vcc, s[2:3], -4.0, 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x02,0xee,0x01,0x02]
 
-v_mad_u32_u24 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0x01,0x00,0x00]
+v_div_scale_f64 v[5:6], vcc, s[2:3], v[2:3], 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x02,0x04,0x02,0x02]
 
-v_mad_u32_u24 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0xff,0x01,0x00,0x00]
+v_div_scale_f64 v[5:6], vcc, s[2:3], v[254:255], 0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x02,0xfc,0x03,0x02]
 
-v_mad_u32_u24 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0x00,0x01,0x00]
+v_div_scale_f64 v[5:6], vcc, s[2:3], 0, -1
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x02,0x00,0x05,0x03]
 
-v_mad_u32_u24 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0x82,0x01,0x00]
+v_div_scale_f64 v[5:6], vcc, s[2:3], 0, 0.5
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x02,0x00,0xc1,0x03]
 
-v_mad_u32_u24 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0xe0,0x01,0x00]
+v_div_scale_f64 v[5:6], vcc, s[2:3], 0, -4.0
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x02,0x00,0xdd,0x03]
 
-v_mad_u32_u24 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0xee,0x01,0x00]
+v_div_scale_f64 v[5:6], vcc, s[2:3], 0, v[3:4]
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x02,0x00,0x0d,0x04]
 
-v_mad_u32_u24 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0x00,0x02,0x00]
+v_div_scale_f64 v[5:6], vcc, s[2:3], 0, v[254:255]
+// CHECK: [0x05,0x6a,0xdc,0xd2,0x02,0x00,0xf9,0x07]
 
-v_mad_u32_u24 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0xfe,0x03,0x00]
+v_div_fmas_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd2,0x01,0x05,0x0e,0x04]
 
-v_mad_u32_u24 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0x00,0x00,0x02]
+v_div_fmas_f32 v255, v1, v2, v3
+// CHECK: [0xff,0x00,0xde,0xd2,0x01,0x05,0x0e,0x04]
 
-v_mad_u32_u24 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0x00,0x04,0x03]
+v_div_fmas_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd2,0xff,0x05,0x0e,0x04]
 
-v_mad_u32_u24 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0x00,0xc0,0x03]
+v_div_fmas_f32 v5, v1, v255, v3
+// CHECK: [0x05,0x00,0xde,0xd2,0x01,0xff,0x0f,0x04]
 
-v_mad_u32_u24 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0x00,0xdc,0x03]
+v_div_fmas_f32 v5, v1, v2, v255
+// CHECK: [0x05,0x00,0xde,0xd2,0x01,0x05,0xfe,0x07]
 
-v_mad_u32_u24 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0x00,0x00,0x04]
+v_div_fmas_f32 v5, -v1, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd2,0x01,0x05,0x0e,0x24]
 
-v_mad_u32_u24 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0x00,0xfc,0x07]
+v_div_fmas_f32 v5, v1, -v2, v3
+// CHECK: [0x05,0x00,0xde,0xd2,0x01,0x05,0x0e,0x44]
 
-v_cubeid_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x00,0x00,0x00]
+v_div_fmas_f32 v5, v1, v2, -v3
+// CHECK: [0x05,0x00,0xde,0xd2,0x01,0x05,0x0e,0x84]
 
-v_cubeid_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0x88,0xd2,0x00,0x00,0x00,0x00]
+v_div_fmas_f32 v5, -v1, -v2, -v3
+// CHECK: [0x05,0x00,0xde,0xd2,0x01,0x05,0x0e,0xe4]
 
-v_cubeid_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0x88,0xd2,0xfd,0x00,0x00,0x00]
+v_div_fmas_f32 v5, |v1|, v2, v3
+// CHECK: [0x05,0x01,0xde,0xd2,0x01,0x05,0x0e,0x04]
 
-v_cubeid_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x01,0x00,0x00]
+v_div_fmas_f32 v5, v1, |v2|, v3
+// CHECK: [0x05,0x02,0xde,0xd2,0x01,0x05,0x0e,0x04]
 
-v_cubeid_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0x88,0xd2,0xff,0x01,0x00,0x00]
+v_div_fmas_f32 v5, v1, v2, |v3|
+// CHECK: [0x05,0x04,0xde,0xd2,0x01,0x05,0x0e,0x04]
 
-v_cubeid_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0xfa,0x01,0x00]
+v_div_fmas_f32 v5, |v1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xde,0xd2,0x01,0x05,0x0e,0x04]
 
-v_cubeid_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x00,0x02,0x00]
+v_div_fmas_f32 v5, v1, v2, v3 clamp
+// CHECK: [0x05,0x08,0xde,0xd2,0x01,0x05,0x0e,0x04]
 
-v_cubeid_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0xfe,0x03,0x00]
+v_div_fmas_f32 v5, v1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0xde,0xd2,0x01,0x05,0x0e,0x0c]
 
-v_cubeid_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x00,0xf4,0x03]
+v_div_fmas_f32 v5, v1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0xde,0xd2,0x01,0x05,0x0e,0x14]
 
-v_cubeid_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x00,0x00,0x04]
+v_div_fmas_f32 v5, v1, v2, v3 div:2
+// CHECK: [0x05,0x00,0xde,0xd2,0x01,0x05,0x0e,0x1c]
 
-v_cubeid_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x00,0xfc,0x07]
+v_div_fmas_f64 v[5:6], vcc, vcc, vcc
+// CHECK: [0x05,0x00,0xe0,0xd2,0x6a,0xd4,0xa8,0x01]
 
-v_cubeid_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x00,0x00,0x20]
+v_div_fmas_f64 v[254:255], vcc, vcc, vcc
+// CHECK: [0xfe,0x00,0xe0,0xd2,0x6a,0xd4,0xa8,0x01]
 
-v_cubeid_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x00,0x00,0x40]
+v_div_fmas_f64 v[5:6], v[1:2], vcc, vcc
+// CHECK: [0x05,0x00,0xe0,0xd2,0x01,0xd5,0xa8,0x01]
 
-v_cubeid_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x00,0x00,0x80]
+v_div_fmas_f64 v[5:6], v[254:255], vcc, vcc
+// CHECK: [0x05,0x00,0xe0,0xd2,0xfe,0xd5,0xa8,0x01]
 
-v_cubeid_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x00,0x00,0xe0]
+v_div_fmas_f64 v[5:6], vcc, v[2:3], vcc
+// CHECK: [0x05,0x00,0xe0,0xd2,0x6a,0x04,0xaa,0x01]
 
-v_cubeid_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0x88,0xd2,0x00,0x00,0x00,0x00]
+v_div_fmas_f64 v[5:6], vcc, v[254:255], vcc
+// CHECK: [0x05,0x00,0xe0,0xd2,0x6a,0xfc,0xab,0x01]
 
-v_cubeid_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0x88,0xd2,0x00,0x00,0x00,0x00]
+v_div_fmas_f64 v[5:6], vcc, vcc, v[3:4]
+// CHECK: [0x05,0x00,0xe0,0xd2,0x6a,0xd4,0x0c,0x04]
 
-v_cubeid_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0x88,0xd2,0x00,0x00,0x00,0x00]
+v_div_fmas_f64 v[5:6], vcc, vcc, v[254:255]
+// CHECK: [0x05,0x00,0xe0,0xd2,0x6a,0xd4,0xf8,0x07]
 
-v_cubeid_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0x88,0xd2,0x00,0x00,0x00,0x00]
+v_div_fmas_f64 v[5:6], -vcc, vcc, vcc
+// CHECK: [0x05,0x00,0xe0,0xd2,0x6a,0xd4,0xa8,0x21]
 
-v_cubeid_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x08,0x88,0xd2,0x00,0x00,0x00,0x00]
+v_div_fmas_f64 v[5:6], vcc, -vcc, vcc
+// CHECK: [0x05,0x00,0xe0,0xd2,0x6a,0xd4,0xa8,0x41]
 
-v_cubeid_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x00,0x00,0x08]
+v_div_fmas_f64 v[5:6], vcc, vcc, -vcc
+// CHECK: [0x05,0x00,0xe0,0xd2,0x6a,0xd4,0xa8,0x81]
 
-v_cubeid_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x00,0x00,0x10]
+v_div_fmas_f64 v[5:6], -vcc, -vcc, -vcc
+// CHECK: [0x05,0x00,0xe0,0xd2,0x6a,0xd4,0xa8,0xe1]
 
-v_cubeid_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x00,0x00,0x18]
+v_div_fmas_f64 v[5:6], |vcc|, vcc, vcc
+// CHECK: [0x05,0x01,0xe0,0xd2,0x6a,0xd4,0xa8,0x01]
 
-v_cubesc_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0x8a,0xd2,0x00,0x00,0x00,0x00]
+v_div_fmas_f64 v[5:6], vcc, |vcc|, vcc
+// CHECK: [0x05,0x02,0xe0,0xd2,0x6a,0xd4,0xa8,0x01]
 
-v_cubesc_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0x8a,0xd2,0x00,0x00,0x00,0x00]
+v_div_fmas_f64 v[5:6], vcc, vcc, |vcc|
+// CHECK: [0x05,0x04,0xe0,0xd2,0x6a,0xd4,0xa8,0x01]
 
-v_cubesc_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0x8a,0xd2,0xfd,0x00,0x00,0x00]
+v_div_fmas_f64 v[5:6], |vcc|, |vcc|, |vcc|
+// CHECK: [0x05,0x07,0xe0,0xd2,0x6a,0xd4,0xa8,0x01]
 
-v_cubesc_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0x8a,0xd2,0x00,0x01,0x00,0x00]
+v_div_fmas_f64 v[5:6], vcc, vcc, vcc clamp
+// CHECK: [0x05,0x08,0xe0,0xd2,0x6a,0xd4,0xa8,0x01]
 
-v_cubesc_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0x8a,0xd2,0xff,0x01,0x00,0x00]
+v_div_fmas_f64 v[5:6], vcc, vcc, vcc mul:2
+// CHECK: [0x05,0x00,0xe0,0xd2,0x6a,0xd4,0xa8,0x09]
 
-v_cubesc_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0x8a,0xd2,0x00,0xfa,0x01,0x00]
+v_div_fmas_f64 v[5:6], vcc, vcc, vcc mul:4
+// CHECK: [0x05,0x00,0xe0,0xd2,0x6a,0xd4,0xa8,0x11]
 
-v_cubesc_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0x8a,0xd2,0x00,0x00,0x02,0x00]
+v_div_fmas_f64 v[5:6], vcc, vcc, vcc div:2
+// CHECK: [0x05,0x00,0xe0,0xd2,0x6a,0xd4,0xa8,0x19]
 
-v_cubesc_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0x8a,0xd2,0x00,0xfe,0x03,0x00]
+v_msad_u8 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0x01,0x00,0x01,0x02]
 
-v_cubesc_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0x8a,0xd2,0x00,0x00,0xf4,0x03]
+v_msad_u8 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xe2,0xd2,0x01,0x00,0x01,0x02]
 
-v_cubesc_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0x8a,0xd2,0x00,0x00,0x00,0x04]
+v_msad_u8 v5, s103, 0, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0x67,0x00,0x01,0x02]
 
-v_cubesc_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0x8a,0xd2,0x00,0x00,0xfc,0x07]
+v_msad_u8 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0x68,0x00,0x01,0x02]
 
-v_cubesc_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0x8a,0xd2,0x00,0x00,0x00,0x20]
+v_msad_u8 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0x69,0x00,0x01,0x02]
 
-v_cubesc_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0x8a,0xd2,0x00,0x00,0x00,0x40]
+v_msad_u8 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0x6a,0x00,0x01,0x02]
 
-v_cubesc_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0x8a,0xd2,0x00,0x00,0x00,0x80]
+v_msad_u8 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0x6b,0x00,0x01,0x02]
 
-v_cubesc_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0x8a,0xd2,0x00,0x00,0x00,0xe0]
+v_msad_u8 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0x6c,0x00,0x01,0x02]
 
-v_cubesc_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0x8a,0xd2,0x00,0x00,0x00,0x00]
+v_msad_u8 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0x6d,0x00,0x01,0x02]
 
-v_cubesc_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0x8a,0xd2,0x00,0x00,0x00,0x00]
+v_msad_u8 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0x6e,0x00,0x01,0x02]
 
-v_cubesc_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0x8a,0xd2,0x00,0x00,0x00,0x00]
+v_msad_u8 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0x6f,0x00,0x01,0x02]
 
-v_cubesc_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0x8a,0xd2,0x00,0x00,0x00,0x00]
+v_msad_u8 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0x7b,0x00,0x01,0x02]
 
-v_cubesc_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x08,0x8a,0xd2,0x00,0x00,0x00,0x00]
+v_msad_u8 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0x7c,0x00,0x01,0x02]
 
-v_cubesc_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x8a,0xd2,0x00,0x00,0x00,0x08]
+v_msad_u8 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0x7e,0x00,0x01,0x02]
 
-v_cubesc_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x8a,0xd2,0x00,0x00,0x00,0x10]
+v_msad_u8 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0x7f,0x00,0x01,0x02]
 
-v_cubesc_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x8a,0xd2,0x00,0x00,0x00,0x18]
+v_msad_u8 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0x80,0x00,0x01,0x02]
 
-v_cubetc_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0x00,0x00,0x00]
+v_msad_u8 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0xc1,0x00,0x01,0x02]
 
-v_cubetc_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0x8c,0xd2,0x00,0x00,0x00,0x00]
+v_msad_u8 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0x01,0x01,0x01,0x02]
 
-v_cubetc_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0x8c,0xd2,0xfd,0x00,0x00,0x00]
+v_msad_u8 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0xff,0x01,0x01,0x02]
 
-v_cubetc_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0x01,0x00,0x00]
+v_msad_u8 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0x01,0x82,0x01,0x02]
 
-v_cubetc_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0x8c,0xd2,0xff,0x01,0x00,0x00]
+v_msad_u8 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0x01,0x04,0x02,0x02]
 
-v_cubetc_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0xfa,0x01,0x00]
+v_msad_u8 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xe2,0xd2,0x01,0xfe,0x03,0x02]
 
-v_cubetc_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0x00,0x02,0x00]
+v_msad_u8 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xe2,0xd2,0x01,0x00,0x05,0x03]
 
-v_cubetc_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0xfe,0x03,0x00]
+v_msad_u8 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xe2,0xd2,0x01,0x00,0x0d,0x04]
 
-v_cubetc_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0x00,0xf4,0x03]
+v_msad_u8 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xe2,0xd2,0x01,0x00,0xfd,0x07]
 
-v_cubetc_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0x00,0x00,0x04]
+v_qsad_pk_u16_u8 v[5:6], s[2:3], 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd2,0x02,0x00,0x01,0x02]
 
-v_cubetc_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0x00,0xfc,0x07]
+v_qsad_pk_u16_u8 v[254:255], s[2:3], 0, 0
+// CHECK: [0xfe,0x00,0xe4,0xd2,0x02,0x00,0x01,0x02]
 
-v_cubetc_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0x00,0x00,0x20]
+v_qsad_pk_u16_u8 v[5:6], s[4:5], 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd2,0x04,0x00,0x01,0x02]
 
-v_cubetc_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0x00,0x00,0x40]
+v_qsad_pk_u16_u8 v[5:6], s[102:103], 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd2,0x66,0x00,0x01,0x02]
 
-v_cubetc_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0x00,0x00,0x80]
+v_qsad_pk_u16_u8 v[5:6], flat_scratch, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd2,0x68,0x00,0x01,0x02]
 
-v_cubetc_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0x00,0x00,0xe0]
+v_qsad_pk_u16_u8 v[5:6], vcc, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd2,0x6a,0x00,0x01,0x02]
 
-v_cubetc_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0x8c,0xd2,0x00,0x00,0x00,0x00]
+v_qsad_pk_u16_u8 v[5:6], tba, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd2,0x6c,0x00,0x01,0x02]
 
-v_cubetc_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0x8c,0xd2,0x00,0x00,0x00,0x00]
+v_qsad_pk_u16_u8 v[5:6], tma, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd2,0x6e,0x00,0x01,0x02]
 
-v_cubetc_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0x8c,0xd2,0x00,0x00,0x00,0x00]
+v_qsad_pk_u16_u8 v[5:6], ttmp[10:11], 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd2,0x7a,0x00,0x01,0x02]
 
-v_cubetc_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0x8c,0xd2,0x00,0x00,0x00,0x00]
+v_qsad_pk_u16_u8 v[5:6], exec, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd2,0x7e,0x00,0x01,0x02]
 
-v_cubetc_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x08,0x8c,0xd2,0x00,0x00,0x00,0x00]
+v_qsad_pk_u16_u8 v[5:6], 0, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd2,0x80,0x00,0x01,0x02]
 
-v_cubetc_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0x00,0x00,0x08]
+v_qsad_pk_u16_u8 v[5:6], -1, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd2,0xc1,0x00,0x01,0x02]
 
-v_cubetc_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0x00,0x00,0x10]
+v_qsad_pk_u16_u8 v[5:6], v[1:2], 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd2,0x01,0x01,0x01,0x02]
 
-v_cubetc_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0x00,0x00,0x18]
+v_qsad_pk_u16_u8 v[5:6], v[254:255], 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd2,0xfe,0x01,0x01,0x02]
 
-v_cubema_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0x8e,0xd2,0x00,0x00,0x00,0x00]
+v_qsad_pk_u16_u8 v[5:6], s[2:3], -1, 0
+// CHECK: [0x05,0x00,0xe4,0xd2,0x02,0x82,0x01,0x02]
 
-v_cubema_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0x8e,0xd2,0x00,0x00,0x00,0x00]
+v_qsad_pk_u16_u8 v[5:6], s[2:3], v2, 0
+// CHECK: [0x05,0x00,0xe4,0xd2,0x02,0x04,0x02,0x02]
 
-v_cubema_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0x8e,0xd2,0xfd,0x00,0x00,0x00]
+v_qsad_pk_u16_u8 v[5:6], s[2:3], v255, 0
+// CHECK: [0x05,0x00,0xe4,0xd2,0x02,0xfe,0x03,0x02]
 
-v_cubema_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0x8e,0xd2,0x00,0x01,0x00,0x00]
+v_qsad_pk_u16_u8 v[5:6], s[2:3], 0, -1
+// CHECK: [0x05,0x00,0xe4,0xd2,0x02,0x00,0x05,0x03]
 
-v_cubema_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0x8e,0xd2,0xff,0x01,0x00,0x00]
+v_qsad_pk_u16_u8 v[5:6], s[2:3], 0, v[3:4]
+// CHECK: [0x05,0x00,0xe4,0xd2,0x02,0x00,0x0d,0x04]
 
-v_cubema_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0x8e,0xd2,0x00,0xfa,0x01,0x00]
+v_qsad_pk_u16_u8 v[5:6], s[2:3], 0, v[254:255]
+// CHECK: [0x05,0x00,0xe4,0xd2,0x02,0x00,0xf9,0x07]
 
-v_cubema_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0x8e,0xd2,0x00,0x00,0x02,0x00]
+v_mqsad_pk_u16_u8 v[5:6], s[2:3], 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd2,0x02,0x00,0x01,0x02]
 
-v_cubema_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0x8e,0xd2,0x00,0xfe,0x03,0x00]
+v_mqsad_pk_u16_u8 v[254:255], s[2:3], 0, 0
+// CHECK: [0xfe,0x00,0xe6,0xd2,0x02,0x00,0x01,0x02]
 
-v_cubema_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0x8e,0xd2,0x00,0x00,0xf4,0x03]
+v_mqsad_pk_u16_u8 v[5:6], s[4:5], 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd2,0x04,0x00,0x01,0x02]
 
-v_cubema_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0x8e,0xd2,0x00,0x00,0x00,0x04]
+v_mqsad_pk_u16_u8 v[5:6], s[102:103], 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd2,0x66,0x00,0x01,0x02]
 
-v_cubema_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0x8e,0xd2,0x00,0x00,0xfc,0x07]
+v_mqsad_pk_u16_u8 v[5:6], flat_scratch, 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd2,0x68,0x00,0x01,0x02]
 
-v_cubema_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0x8e,0xd2,0x00,0x00,0x00,0x20]
+v_mqsad_pk_u16_u8 v[5:6], vcc, 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd2,0x6a,0x00,0x01,0x02]
 
-v_cubema_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0x8e,0xd2,0x00,0x00,0x00,0x40]
+v_mqsad_pk_u16_u8 v[5:6], tba, 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd2,0x6c,0x00,0x01,0x02]
 
-v_cubema_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0x8e,0xd2,0x00,0x00,0x00,0x80]
+v_mqsad_pk_u16_u8 v[5:6], tma, 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd2,0x6e,0x00,0x01,0x02]
 
-v_cubema_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0x8e,0xd2,0x00,0x00,0x00,0xe0]
+v_mqsad_pk_u16_u8 v[5:6], ttmp[10:11], 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd2,0x7a,0x00,0x01,0x02]
 
-v_cubema_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0x8e,0xd2,0x00,0x00,0x00,0x00]
+v_mqsad_pk_u16_u8 v[5:6], exec, 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd2,0x7e,0x00,0x01,0x02]
 
-v_cubema_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0x8e,0xd2,0x00,0x00,0x00,0x00]
+v_mqsad_pk_u16_u8 v[5:6], 0, 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd2,0x80,0x00,0x01,0x02]
 
-v_cubema_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0x8e,0xd2,0x00,0x00,0x00,0x00]
+v_mqsad_pk_u16_u8 v[5:6], -1, 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd2,0xc1,0x00,0x01,0x02]
 
-v_cubema_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0x8e,0xd2,0x00,0x00,0x00,0x00]
+v_mqsad_pk_u16_u8 v[5:6], v[1:2], 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd2,0x01,0x01,0x01,0x02]
 
-v_cubema_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x08,0x8e,0xd2,0x00,0x00,0x00,0x00]
+v_mqsad_pk_u16_u8 v[5:6], v[254:255], 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd2,0xfe,0x01,0x01,0x02]
 
-v_cubema_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x8e,0xd2,0x00,0x00,0x00,0x08]
+v_mqsad_pk_u16_u8 v[5:6], s[2:3], -1, 0
+// CHECK: [0x05,0x00,0xe6,0xd2,0x02,0x82,0x01,0x02]
 
-v_cubema_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x8e,0xd2,0x00,0x00,0x00,0x10]
+v_mqsad_pk_u16_u8 v[5:6], s[2:3], v2, 0
+// CHECK: [0x05,0x00,0xe6,0xd2,0x02,0x04,0x02,0x02]
 
-v_cubema_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x8e,0xd2,0x00,0x00,0x00,0x18]
+v_mqsad_pk_u16_u8 v[5:6], s[2:3], v255, 0
+// CHECK: [0x05,0x00,0xe6,0xd2,0x02,0xfe,0x03,0x02]
 
-v_bfe_u32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0x00,0x00,0x00]
+v_mqsad_pk_u16_u8 v[5:6], s[2:3], 0, -1
+// CHECK: [0x05,0x00,0xe6,0xd2,0x02,0x00,0x05,0x03]
 
-v_bfe_u32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0x90,0xd2,0x00,0x00,0x00,0x00]
+v_mqsad_pk_u16_u8 v[5:6], s[2:3], 0, v[3:4]
+// CHECK: [0x05,0x00,0xe6,0xd2,0x02,0x00,0x0d,0x04]
 
-v_bfe_u32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0x90,0xd2,0x80,0x00,0x00,0x00]
+v_mqsad_pk_u16_u8 v[5:6], s[2:3], 0, v[254:255]
+// CHECK: [0x05,0x00,0xe6,0xd2,0x02,0x00,0xf9,0x07]
 
-v_bfe_u32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0x90,0xd2,0xc1,0x00,0x00,0x00]
+v_trig_preop_f64 v[5:6], 0, s2
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0x04,0x00,0x00]
 
-v_bfe_u32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0x90,0xd2,0xf0,0x00,0x00,0x00]
+v_trig_preop_f64 v[254:255], 0, s2
+// CHECK: [0xfe,0x00,0xe8,0xd2,0x80,0x04,0x00,0x00]
 
-v_bfe_u32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0x90,0xd2,0xf7,0x00,0x00,0x00]
+v_trig_preop_f64 v[5:6], 0.5, s2
+// CHECK: [0x05,0x00,0xe8,0xd2,0xf0,0x04,0x00,0x00]
 
-v_bfe_u32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0x01,0x00,0x00]
+v_trig_preop_f64 v[5:6], v[1:2], s2
+// CHECK: [0x05,0x00,0xe8,0xd2,0x01,0x05,0x00,0x00]
 
-v_bfe_u32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0x90,0xd2,0xff,0x01,0x00,0x00]
+v_trig_preop_f64 v[5:6], v[254:255], s2
+// CHECK: [0x05,0x00,0xe8,0xd2,0xfe,0x05,0x00,0x00]
 
-v_bfe_u32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0x00,0x01,0x00]
+v_trig_preop_f64 v[5:6], 0, s103
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0xce,0x00,0x00]
 
-v_bfe_u32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0x82,0x01,0x00]
+v_trig_preop_f64 v[5:6], 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0xd0,0x00,0x00]
 
-v_bfe_u32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0xe0,0x01,0x00]
+v_trig_preop_f64 v[5:6], 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0xd2,0x00,0x00]
 
-v_bfe_u32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0xee,0x01,0x00]
+v_trig_preop_f64 v[5:6], 0, vcc_lo
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0xd4,0x00,0x00]
 
-v_bfe_u32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0x00,0x02,0x00]
+v_trig_preop_f64 v[5:6], 0, vcc_hi
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0xd6,0x00,0x00]
 
-v_bfe_u32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0xfe,0x03,0x00]
+v_trig_preop_f64 v[5:6], 0, tba_lo
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0xd8,0x00,0x00]
 
-v_bfe_u32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0x00,0x00,0x02]
+v_trig_preop_f64 v[5:6], 0, tba_hi
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0xda,0x00,0x00]
 
-v_bfe_u32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0x00,0x04,0x03]
+v_trig_preop_f64 v[5:6], 0, tma_lo
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0xdc,0x00,0x00]
 
-v_bfe_u32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0x00,0xc0,0x03]
+v_trig_preop_f64 v[5:6], 0, tma_hi
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0xde,0x00,0x00]
 
-v_bfe_u32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0x00,0xdc,0x03]
+v_trig_preop_f64 v[5:6], 0, ttmp11
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0xf6,0x00,0x00]
 
-v_bfe_u32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0x00,0x00,0x04]
+v_trig_preop_f64 v[5:6], 0, m0
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0xf8,0x00,0x00]
 
-v_bfe_u32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0x00,0xfc,0x07]
+v_trig_preop_f64 v[5:6], 0, exec_lo
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0xfc,0x00,0x00]
 
-v_bfe_i32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x00,0x00,0x00]
+v_trig_preop_f64 v[5:6], 0, exec_hi
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0xfe,0x00,0x00]
 
-v_bfe_i32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0x92,0xd2,0x00,0x00,0x00,0x00]
+v_trig_preop_f64 v[5:6], 0, 0
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0x00,0x01,0x00]
 
-v_bfe_i32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0x92,0xd2,0x80,0x00,0x00,0x00]
+v_trig_preop_f64 v[5:6], 0, -1
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0x82,0x01,0x00]
 
-v_bfe_i32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0x92,0xd2,0xc1,0x00,0x00,0x00]
+v_trig_preop_f64 v[5:6], 0, 0.5
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0xe0,0x01,0x00]
 
-v_bfe_i32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0x92,0xd2,0xf0,0x00,0x00,0x00]
+v_trig_preop_f64 v[5:6], 0, -4.0
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0xee,0x01,0x00]
 
-v_bfe_i32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0x92,0xd2,0xf7,0x00,0x00,0x00]
+v_trig_preop_f64 v[5:6], 0, scc
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0xfa,0x01,0x00]
 
-v_bfe_i32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x01,0x00,0x00]
+v_trig_preop_f64 v[5:6], 0, v2
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0x04,0x02,0x00]
 
-v_bfe_i32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0x92,0xd2,0xff,0x01,0x00,0x00]
+v_trig_preop_f64 v[5:6], 0, v255
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0xfe,0x03,0x00]
 
-v_bfe_i32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x00,0x01,0x00]
+v_trig_preop_f64 v[5:6], 0, s2 clamp
+// CHECK: [0x05,0x08,0xe8,0xd2,0x80,0x04,0x00,0x00]
 
-v_bfe_i32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x82,0x01,0x00]
+v_trig_preop_f64 v[5:6], 0, s2 mul:2
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0x04,0x00,0x08]
 
-v_bfe_i32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0xe0,0x01,0x00]
+v_trig_preop_f64 v[5:6], 0, s2 mul:4
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0x04,0x00,0x10]
 
-v_bfe_i32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0xee,0x01,0x00]
+v_trig_preop_f64 v[5:6], 0, s2 div:2
+// CHECK: [0x05,0x00,0xe8,0xd2,0x80,0x04,0x00,0x18]
 
-v_bfe_i32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_f_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x00,0x7c]
 
-v_bfe_i32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_f_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x00,0x7c]
 
-v_bfe_i32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_f_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x00,0x7c]
 
-v_bfe_i32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x00,0x04,0x03]
+v_cmp_f_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x00,0x7c]
 
-v_bfe_i32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_f_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x00,0x7c]
 
-v_bfe_i32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x00,0xdc,0x03]
+v_cmp_f_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x00,0x7c]
 
-v_bfe_i32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_f_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x00,0x7c]
 
-v_bfe_i32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_f_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x00,0x7c]
 
-v_bfi_b32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_f_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x00,0x7c]
 
-v_bfi_b32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0x94,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_f_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x00,0x7c]
 
-v_bfi_b32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0x94,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_f_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x00,0x7c]
 
-v_bfi_b32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0x94,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_f_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x00,0x7c]
 
-v_bfi_b32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0x94,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_f_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x00,0x7c]
 
-v_bfi_b32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0x94,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_f_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x00,0x7c]
 
-v_bfi_b32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_f_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x00,0x7c]
 
-v_bfi_b32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0x94,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_f_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x00,0x7c]
 
-v_bfi_b32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_f_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x00,0x7c]
 
-v_bfi_b32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_f_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x00,0x7c]
 
-v_bfi_b32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_f_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x00,0x7c,0x56,0x34,0x12,0xaf]
 
-v_bfi_b32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_f_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x00,0x7c,0x73,0x72,0x71,0x3f]
 
-v_bfi_b32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_f_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x00,0x7c]
 
-v_bfi_b32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_f_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x00,0x7c]
 
-v_bfi_b32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_f_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x01,0x7c]
 
-v_bfi_b32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0x00,0x04,0x03]
+v_cmp_f_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0x04,0x00,0x00]
 
-v_bfi_b32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_f_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x00,0xd0,0x80,0x04,0x00,0x00]
 
-v_bfi_b32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0x00,0xdc,0x03]
+v_cmp_f_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x00,0xd0,0x80,0x04,0x00,0x00]
 
-v_bfi_b32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_f_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x00,0xd0,0x80,0x04,0x00,0x00]
 
-v_bfi_b32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_f_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x00,0xd0,0x80,0x04,0x00,0x00]
 
-v_fma_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_f_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x00,0xd0,0x80,0x04,0x00,0x00]
 
-v_fma_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0x96,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_f_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x00,0xd0,0x80,0x04,0x00,0x00]
 
-v_fma_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0x96,0xd2,0xfd,0x00,0x00,0x00]
+v_cmp_f_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x00,0xd0,0x80,0x04,0x00,0x00]
 
-v_fma_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x00,0xd0,0xf0,0x04,0x00,0x00]
 
-v_fma_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0x96,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x00,0xd0,0x01,0x05,0x00,0x00]
 
-v_fma_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0xfa,0x01,0x00]
+v_cmp_f_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x00,0xd0,0xff,0x05,0x00,0x00]
 
-v_fma_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0xce,0x00,0x00]
 
-v_fma_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0xd0,0x00,0x00]
 
-v_fma_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0x00,0xf4,0x03]
+v_cmp_f_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0xd2,0x00,0x00]
 
-v_fma_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_f_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0xd4,0x00,0x00]
 
-v_fma_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_f_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0xd6,0x00,0x00]
 
-v_fma_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0x00,0x00,0x20]
+v_cmp_f_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0xd8,0x00,0x00]
 
-v_fma_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0x00,0x00,0x40]
+v_cmp_f_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0xda,0x00,0x00]
 
-v_fma_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0x00,0x00,0x80]
+v_cmp_f_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0xdc,0x00,0x00]
 
-v_fma_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0x00,0x00,0xe0]
+v_cmp_f_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0xde,0x00,0x00]
 
-v_fma_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0x96,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0xf6,0x00,0x00]
 
-v_fma_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0x96,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0xf8,0x00,0x00]
 
-v_fma_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0x96,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0xfc,0x00,0x00]
 
-v_fma_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0x96,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0xfe,0x00,0x00]
 
-v_fma_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x08,0x96,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0x00,0x01,0x00]
 
-v_fma_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0x00,0x00,0x08]
+v_cmp_f_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0xe0,0x01,0x00]
 
-v_fma_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0x00,0x00,0x10]
+v_cmp_f_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0xfa,0x01,0x00]
 
-v_fma_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0x00,0x00,0x18]
+v_cmp_f_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0x04,0x02,0x00]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0xfe,0x03,0x00]
 
-v_fma_f64 v[254:255], s[0:1], s[0:1], s[0:1]
-// CHECK: [0xfe,0x00,0x98,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x00,0xd0,0x80,0x04,0x00,0x40]
 
-v_fma_f64 v[0:1], scc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x98,0xd2,0xfd,0x00,0x00,0x00]
+v_cmp_lt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x02,0x7c]
 
-v_fma_f64 v[0:1], v[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_lt_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x02,0x7c]
 
-v_fma_f64 v[0:1], v[254:255], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x98,0xd2,0xfe,0x01,0x00,0x00]
+v_cmp_lt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x02,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0xfa,0x01,0x00]
+v_cmp_lt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x02,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_lt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x02,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0xfc,0x03,0x00]
+v_cmp_lt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x02,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0x00,0xf4,0x03]
+v_cmp_lt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x02,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_lt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x02,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0x00,0xf8,0x07]
+v_cmp_lt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x02,0x7c]
 
-v_fma_f64 v[0:1], -s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0x00,0x00,0x20]
+v_cmp_lt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x02,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0x00,0x00,0x40]
+v_cmp_lt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x02,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0x00,0x00,0x80]
+v_cmp_lt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x02,0x7c]
 
-v_fma_f64 v[0:1], -s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0x00,0x00,0xe0]
+v_cmp_lt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x02,0x7c]
 
-v_fma_f64 v[0:1], |s[0:1]|, s[0:1], s[0:1]
-// CHECK: [0x00,0x01,0x98,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_lt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x02,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], |s[0:1]|, s[0:1]
-// CHECK: [0x00,0x02,0x98,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_lt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x02,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], |s[0:1]|
-// CHECK: [0x00,0x04,0x98,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_lt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x02,0x7c]
 
-v_fma_f64 v[0:1], |s[0:1]|, |s[0:1]|, |s[0:1]|
-// CHECK: [0x00,0x07,0x98,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_lt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x02,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x08,0x98,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_lt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x02,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0x00,0x00,0x08]
+v_cmp_lt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x02,0x7c,0x56,0x34,0x12,0xaf]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0x00,0x00,0x10]
+v_cmp_lt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x02,0x7c,0x73,0x72,0x71,0x3f]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0x00,0x00,0x18]
+v_cmp_lt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x02,0x7c]
 
-v_lerp_u8 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0x9a,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_lt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x02,0x7c]
 
-v_lerp_u8 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0x9a,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_lt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x03,0x7c]
 
-v_lerp_u8 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0x9a,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0x04,0x00,0x00]
 
-v_lerp_u8 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0x9a,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x02,0xd0,0x80,0x04,0x00,0x00]
 
-v_lerp_u8 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0x9a,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x02,0xd0,0x80,0x04,0x00,0x00]
 
-v_lerp_u8 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0x9a,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x02,0xd0,0x80,0x04,0x00,0x00]
 
-v_lerp_u8 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0x9a,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_lt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x02,0xd0,0x80,0x04,0x00,0x00]
 
-v_lerp_u8 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0x9a,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_lt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x02,0xd0,0x80,0x04,0x00,0x00]
 
-v_lerp_u8 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0x9a,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_lt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x02,0xd0,0x80,0x04,0x00,0x00]
 
-v_lerp_u8 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0x9a,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_lt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x02,0xd0,0x80,0x04,0x00,0x00]
 
-v_lerp_u8 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0x9a,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x02,0xd0,0xf0,0x04,0x00,0x00]
 
-v_lerp_u8 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0x9a,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_lt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x02,0xd0,0x01,0x05,0x00,0x00]
 
-v_lerp_u8 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0x9a,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_lt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x02,0xd0,0xff,0x05,0x00,0x00]
 
-v_lerp_u8 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0x9a,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0xce,0x00,0x00]
 
-v_lerp_u8 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0x9a,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_lt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0xd0,0x00,0x00]
 
-v_lerp_u8 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0x9a,0xd2,0x00,0x00,0x04,0x03]
+v_cmp_lt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0xd2,0x00,0x00]
 
-v_lerp_u8 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0x9a,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_lt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0xd4,0x00,0x00]
 
-v_lerp_u8 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0x9a,0xd2,0x00,0x00,0xdc,0x03]
+v_cmp_lt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0xd6,0x00,0x00]
 
-v_lerp_u8 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0x9a,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_lt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0xd8,0x00,0x00]
 
-v_lerp_u8 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0x9a,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_lt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0xda,0x00,0x00]
 
-v_alignbit_b32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0x9c,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0xdc,0x00,0x00]
 
-v_alignbit_b32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0x9c,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0xde,0x00,0x00]
 
-v_alignbit_b32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0x9c,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0xf6,0x00,0x00]
 
-v_alignbit_b32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0x9c,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0xf8,0x00,0x00]
 
-v_alignbit_b32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0x9c,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0xfc,0x00,0x00]
 
-v_alignbit_b32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0x9c,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0xfe,0x00,0x00]
 
-v_alignbit_b32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0x9c,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0x00,0x01,0x00]
 
-v_alignbit_b32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0x9c,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0xe0,0x01,0x00]
 
-v_alignbit_b32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0x9c,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0xfa,0x01,0x00]
 
-v_alignbit_b32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0x9c,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0x04,0x02,0x00]
 
-v_alignbit_b32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0x9c,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0xfe,0x03,0x00]
 
-v_alignbit_b32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0x9c,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x02,0xd0,0x80,0x04,0x00,0x40]
 
-v_alignbit_b32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0x9c,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_eq_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x04,0x7c]
 
-v_alignbit_b32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0x9c,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_eq_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x04,0x7c]
 
-v_alignbit_b32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0x9c,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_eq_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x04,0x7c]
 
-v_alignbit_b32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0x9c,0xd2,0x00,0x00,0x04,0x03]
+v_cmp_eq_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x04,0x7c]
 
-v_alignbit_b32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0x9c,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_eq_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x04,0x7c]
 
-v_alignbit_b32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0x9c,0xd2,0x00,0x00,0xdc,0x03]
+v_cmp_eq_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x04,0x7c]
 
-v_alignbit_b32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0x9c,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_eq_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x04,0x7c]
 
-v_alignbit_b32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0x9c,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_eq_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x04,0x7c]
 
-v_alignbyte_b32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0x9e,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x04,0x7c]
 
-v_alignbyte_b32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0x9e,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x04,0x7c]
 
-v_alignbyte_b32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0x9e,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_eq_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x04,0x7c]
 
-v_alignbyte_b32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0x9e,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_eq_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x04,0x7c]
 
-v_alignbyte_b32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0x9e,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_eq_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x04,0x7c]
 
-v_alignbyte_b32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0x9e,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_eq_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x04,0x7c]
 
-v_alignbyte_b32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0x9e,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_eq_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x04,0x7c]
 
-v_alignbyte_b32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0x9e,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_eq_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x04,0x7c]
 
-v_alignbyte_b32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0x9e,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_eq_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x04,0x7c]
 
-v_alignbyte_b32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0x9e,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_eq_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x04,0x7c]
 
-v_alignbyte_b32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0x9e,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_eq_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x04,0x7c,0x56,0x34,0x12,0xaf]
 
-v_alignbyte_b32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0x9e,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_eq_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x04,0x7c,0x73,0x72,0x71,0x3f]
 
-v_alignbyte_b32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0x9e,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_eq_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x04,0x7c]
 
-v_alignbyte_b32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0x9e,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_eq_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x04,0x7c]
 
-v_alignbyte_b32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0x9e,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_eq_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x05,0x7c]
 
-v_alignbyte_b32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0x9e,0xd2,0x00,0x00,0x04,0x03]
+v_cmp_eq_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0x04,0x00,0x00]
 
-v_alignbyte_b32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0x9e,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_eq_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x04,0xd0,0x80,0x04,0x00,0x00]
 
-v_alignbyte_b32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0x9e,0xd2,0x00,0x00,0xdc,0x03]
+v_cmp_eq_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x04,0xd0,0x80,0x04,0x00,0x00]
 
-v_alignbyte_b32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0x9e,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_eq_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x04,0xd0,0x80,0x04,0x00,0x00]
 
-v_alignbyte_b32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0x9e,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_eq_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x04,0xd0,0x80,0x04,0x00,0x00]
 
-v_mullit_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xa0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x04,0xd0,0x80,0x04,0x00,0x00]
 
-v_mullit_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xa0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x04,0xd0,0x80,0x04,0x00,0x00]
 
-v_mullit_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xa0,0xd2,0xfd,0x00,0x00,0x00]
+v_cmp_eq_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x04,0xd0,0x80,0x04,0x00,0x00]
 
-v_mullit_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xa0,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_eq_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x04,0xd0,0xf0,0x04,0x00,0x00]
 
-v_mullit_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xa0,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_eq_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x04,0xd0,0x01,0x05,0x00,0x00]
 
-v_mullit_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xa0,0xd2,0x00,0xfa,0x01,0x00]
+v_cmp_eq_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x04,0xd0,0xff,0x05,0x00,0x00]
 
-v_mullit_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xa0,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_eq_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0xce,0x00,0x00]
 
-v_mullit_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xa0,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_eq_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0xd0,0x00,0x00]
 
-v_mullit_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xa0,0xd2,0x00,0x00,0xf4,0x03]
+v_cmp_eq_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0xd2,0x00,0x00]
 
-v_mullit_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xa0,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_eq_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0xd4,0x00,0x00]
 
-v_mullit_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xa0,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_eq_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0xd6,0x00,0x00]
 
-v_mullit_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xa0,0xd2,0x00,0x00,0x00,0x20]
+v_cmp_eq_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0xd8,0x00,0x00]
 
-v_mullit_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xa0,0xd2,0x00,0x00,0x00,0x40]
+v_cmp_eq_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0xda,0x00,0x00]
 
-v_mullit_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xa0,0xd2,0x00,0x00,0x00,0x80]
+v_cmp_eq_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0xdc,0x00,0x00]
 
-v_mullit_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xa0,0xd2,0x00,0x00,0x00,0xe0]
+v_cmp_eq_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0xde,0x00,0x00]
 
-v_mullit_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xa0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0xf6,0x00,0x00]
 
-v_mullit_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xa0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0xf8,0x00,0x00]
 
-v_mullit_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xa0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0xfc,0x00,0x00]
 
-v_mullit_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xa0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0xfe,0x00,0x00]
 
-v_mullit_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x08,0xa0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0x00,0x01,0x00]
 
-v_mullit_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0xa0,0xd2,0x00,0x00,0x00,0x08]
+v_cmp_eq_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0xe0,0x01,0x00]
 
-v_mullit_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0xa0,0xd2,0x00,0x00,0x00,0x10]
+v_cmp_eq_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0xfa,0x01,0x00]
 
-v_mullit_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0xa0,0xd2,0x00,0x00,0x00,0x18]
+v_cmp_eq_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0x04,0x02,0x00]
 
-v_min3_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xa2,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0xfe,0x03,0x00]
 
-v_min3_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xa2,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x04,0xd0,0x80,0x04,0x00,0x40]
 
-v_min3_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xa2,0xd2,0xfd,0x00,0x00,0x00]
+v_cmp_le_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x06,0x7c]
 
-v_min3_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xa2,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_le_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x06,0x7c]
 
-v_min3_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xa2,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_le_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x06,0x7c]
 
-v_min3_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xa2,0xd2,0x00,0xfa,0x01,0x00]
+v_cmp_le_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x06,0x7c]
 
-v_min3_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xa2,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_le_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x06,0x7c]
 
-v_min3_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xa2,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_le_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x06,0x7c]
 
-v_min3_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xa2,0xd2,0x00,0x00,0xf4,0x03]
+v_cmp_le_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x06,0x7c]
 
-v_min3_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xa2,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_le_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x06,0x7c]
 
-v_min3_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xa2,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_le_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x06,0x7c]
 
-v_min3_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xa2,0xd2,0x00,0x00,0x00,0x20]
+v_cmp_le_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x06,0x7c]
 
-v_min3_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xa2,0xd2,0x00,0x00,0x00,0x40]
+v_cmp_le_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x06,0x7c]
 
-v_min3_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xa2,0xd2,0x00,0x00,0x00,0x80]
+v_cmp_le_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x06,0x7c]
 
-v_min3_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xa2,0xd2,0x00,0x00,0x00,0xe0]
+v_cmp_le_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x06,0x7c]
 
-v_min3_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xa2,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_le_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x06,0x7c]
 
-v_min3_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xa2,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_le_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x06,0x7c]
 
-v_min3_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xa2,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_le_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x06,0x7c]
 
-v_min3_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xa2,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_le_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x06,0x7c]
 
-v_min3_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x08,0xa2,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_le_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x06,0x7c]
 
-v_min3_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0xa2,0xd2,0x00,0x00,0x00,0x08]
+v_cmp_le_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x06,0x7c,0x56,0x34,0x12,0xaf]
 
-v_min3_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0xa2,0xd2,0x00,0x00,0x00,0x10]
+v_cmp_le_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x06,0x7c,0x73,0x72,0x71,0x3f]
 
-v_min3_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0xa2,0xd2,0x00,0x00,0x00,0x18]
+v_cmp_le_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x06,0x7c]
 
-v_min3_i32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xa4,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_le_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x06,0x7c]
 
-v_min3_i32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xa4,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_le_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x07,0x7c]
 
-v_min3_i32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xa4,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_i32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xa4,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_le_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x06,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_i32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xa4,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_le_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x06,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_i32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xa4,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_le_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x06,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_i32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xa4,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_le_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x06,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_i32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xa4,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_le_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x06,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_i32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xa4,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_le_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x06,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_i32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xa4,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_le_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x06,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_i32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xa4,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_le_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x06,0xd0,0xf0,0x04,0x00,0x00]
 
-v_min3_i32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xa4,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_le_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x06,0xd0,0x01,0x05,0x00,0x00]
 
-v_min3_i32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xa4,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_le_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x06,0xd0,0xff,0x05,0x00,0x00]
 
-v_min3_i32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xa4,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0xce,0x00,0x00]
 
-v_min3_i32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xa4,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_le_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0xd0,0x00,0x00]
 
-v_min3_i32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xa4,0xd2,0x00,0x00,0x04,0x03]
+v_cmp_le_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0xd2,0x00,0x00]
 
-v_min3_i32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xa4,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_le_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0xd4,0x00,0x00]
 
-v_min3_i32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xa4,0xd2,0x00,0x00,0xdc,0x03]
+v_cmp_le_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0xd6,0x00,0x00]
 
-v_min3_i32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xa4,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_le_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0xd8,0x00,0x00]
 
-v_min3_i32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xa4,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_le_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0xda,0x00,0x00]
 
-v_min3_u32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xa6,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0xdc,0x00,0x00]
 
-v_min3_u32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xa6,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0xde,0x00,0x00]
 
-v_min3_u32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xa6,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0xf6,0x00,0x00]
 
-v_min3_u32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xa6,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0xf8,0x00,0x00]
 
-v_min3_u32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xa6,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0xfc,0x00,0x00]
 
-v_min3_u32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xa6,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0xfe,0x00,0x00]
 
-v_min3_u32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xa6,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0x00,0x01,0x00]
 
-v_min3_u32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xa6,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0xe0,0x01,0x00]
 
-v_min3_u32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xa6,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0xfa,0x01,0x00]
 
-v_min3_u32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xa6,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0x04,0x02,0x00]
 
-v_min3_u32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xa6,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0xfe,0x03,0x00]
 
-v_min3_u32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xa6,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x06,0xd0,0x80,0x04,0x00,0x40]
 
-v_min3_u32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xa6,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_gt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x08,0x7c]
 
-v_min3_u32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xa6,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_gt_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x08,0x7c]
 
-v_min3_u32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xa6,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_gt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x08,0x7c]
 
-v_min3_u32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xa6,0xd2,0x00,0x00,0x04,0x03]
+v_cmp_gt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x08,0x7c]
 
-v_min3_u32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xa6,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_gt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x08,0x7c]
 
-v_min3_u32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xa6,0xd2,0x00,0x00,0xdc,0x03]
+v_cmp_gt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x08,0x7c]
 
-v_min3_u32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xa6,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_gt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x08,0x7c]
 
-v_min3_u32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xa6,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_gt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x08,0x7c]
 
-v_max3_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xa8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x08,0x7c]
 
-v_max3_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xa8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x08,0x7c]
 
-v_max3_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xa8,0xd2,0xfd,0x00,0x00,0x00]
+v_cmp_gt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x08,0x7c]
 
-v_max3_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xa8,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_gt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x08,0x7c]
 
-v_max3_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xa8,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_gt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x08,0x7c]
 
-v_max3_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xa8,0xd2,0x00,0xfa,0x01,0x00]
+v_cmp_gt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x08,0x7c]
 
-v_max3_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xa8,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_gt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x08,0x7c]
 
-v_max3_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xa8,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_gt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x08,0x7c]
 
-v_max3_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xa8,0xd2,0x00,0x00,0xf4,0x03]
+v_cmp_gt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x08,0x7c]
 
-v_max3_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xa8,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_gt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x08,0x7c]
 
-v_max3_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xa8,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_gt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x08,0x7c,0x56,0x34,0x12,0xaf]
 
-v_max3_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xa8,0xd2,0x00,0x00,0x00,0x20]
+v_cmp_gt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x08,0x7c,0x73,0x72,0x71,0x3f]
 
-v_max3_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xa8,0xd2,0x00,0x00,0x00,0x40]
+v_cmp_gt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x08,0x7c]
 
-v_max3_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xa8,0xd2,0x00,0x00,0x00,0x80]
+v_cmp_gt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x08,0x7c]
 
-v_max3_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xa8,0xd2,0x00,0x00,0x00,0xe0]
+v_cmp_gt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x09,0x7c]
 
-v_max3_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xa8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0x04,0x00,0x00]
 
-v_max3_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xa8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x08,0xd0,0x80,0x04,0x00,0x00]
 
-v_max3_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xa8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x08,0xd0,0x80,0x04,0x00,0x00]
 
-v_max3_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xa8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x08,0xd0,0x80,0x04,0x00,0x00]
 
-v_max3_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x08,0xa8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x08,0xd0,0x80,0x04,0x00,0x00]
 
-v_max3_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0xa8,0xd2,0x00,0x00,0x00,0x08]
+v_cmp_gt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x08,0xd0,0x80,0x04,0x00,0x00]
 
-v_max3_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0xa8,0xd2,0x00,0x00,0x00,0x10]
+v_cmp_gt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x08,0xd0,0x80,0x04,0x00,0x00]
 
-v_max3_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0xa8,0xd2,0x00,0x00,0x00,0x18]
+v_cmp_gt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x08,0xd0,0x80,0x04,0x00,0x00]
 
-v_max3_i32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xaa,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x08,0xd0,0xf0,0x04,0x00,0x00]
 
-v_max3_i32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xaa,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x08,0xd0,0x01,0x05,0x00,0x00]
 
-v_max3_i32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xaa,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x08,0xd0,0xff,0x05,0x00,0x00]
 
-v_max3_i32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xaa,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0xce,0x00,0x00]
 
-v_max3_i32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xaa,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0xd0,0x00,0x00]
 
-v_max3_i32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xaa,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0xd2,0x00,0x00]
 
-v_max3_i32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xaa,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0xd4,0x00,0x00]
 
-v_max3_i32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xaa,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0xd6,0x00,0x00]
 
-v_max3_i32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xaa,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0xd8,0x00,0x00]
 
-v_max3_i32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xaa,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0xda,0x00,0x00]
 
-v_max3_i32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xaa,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0xdc,0x00,0x00]
 
-v_max3_i32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xaa,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0xde,0x00,0x00]
 
-v_max3_i32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xaa,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0xf6,0x00,0x00]
 
-v_max3_i32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xaa,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0xf8,0x00,0x00]
 
-v_max3_i32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xaa,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_gt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0xfc,0x00,0x00]
 
-v_max3_i32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xaa,0xd2,0x00,0x00,0x04,0x03]
+v_cmp_gt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0xfe,0x00,0x00]
 
-v_max3_i32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xaa,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_gt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0x00,0x01,0x00]
 
-v_max3_i32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xaa,0xd2,0x00,0x00,0xdc,0x03]
+v_cmp_gt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0xe0,0x01,0x00]
 
-v_max3_i32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xaa,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_gt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0xfa,0x01,0x00]
 
-v_max3_i32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xaa,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_gt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0x04,0x02,0x00]
 
-v_max3_u32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xac,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0xfe,0x03,0x00]
 
-v_max3_u32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xac,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x08,0xd0,0x80,0x04,0x00,0x40]
 
-v_max3_u32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xac,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_lg_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x7c]
 
-v_max3_u32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xac,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_lg_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x7c]
 
-v_max3_u32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xac,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_lg_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x7c]
 
-v_max3_u32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xac,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_lg_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x7c]
 
-v_max3_u32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xac,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_lg_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x7c]
 
-v_max3_u32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xac,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_lg_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x7c]
 
-v_max3_u32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xac,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_lg_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x7c]
 
-v_max3_u32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xac,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_lg_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x7c]
 
-v_max3_u32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xac,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_lg_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x7c]
 
-v_max3_u32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xac,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_lg_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x7c]
 
-v_max3_u32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xac,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_lg_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x7c]
 
-v_max3_u32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xac,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_lg_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x7c]
 
-v_max3_u32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xac,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_lg_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x7c]
 
-v_max3_u32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xac,0xd2,0x00,0x00,0x04,0x03]
+v_cmp_lg_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x7c]
 
-v_max3_u32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xac,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_lg_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x7c]
 
-v_max3_u32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xac,0xd2,0x00,0x00,0xdc,0x03]
+v_cmp_lg_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x7c]
 
-v_max3_u32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xac,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_lg_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x7c]
 
-v_max3_u32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xac,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_lg_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x7c]
 
-v_med3_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xae,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_lg_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x7c,0x56,0x34,0x12,0xaf]
 
-v_med3_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xae,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_lg_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x7c,0x73,0x72,0x71,0x3f]
 
-v_med3_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xae,0xd2,0xfd,0x00,0x00,0x00]
+v_cmp_lg_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x7c]
 
-v_med3_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xae,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_lg_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x7c]
 
-v_med3_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xae,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_lg_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x7c]
 
-v_med3_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xae,0xd2,0x00,0xfa,0x01,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xae,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_lg_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x0a,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xae,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_lg_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x0a,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xae,0xd2,0x00,0x00,0xf4,0x03]
+v_cmp_lg_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x0a,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xae,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_lg_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x0a,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xae,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_lg_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x0a,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xae,0xd2,0x00,0x00,0x00,0x20]
+v_cmp_lg_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x0a,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xae,0xd2,0x00,0x00,0x00,0x40]
+v_cmp_lg_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x0a,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xae,0xd2,0x00,0x00,0x00,0x80]
+v_cmp_lg_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x0a,0xd0,0xf0,0x04,0x00,0x00]
 
-v_med3_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xae,0xd2,0x00,0x00,0x00,0xe0]
+v_cmp_lg_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x01,0x05,0x00,0x00]
 
-v_med3_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xae,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x0a,0xd0,0xff,0x05,0x00,0x00]
 
-v_med3_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xae,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0xce,0x00,0x00]
 
-v_med3_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xae,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0xd0,0x00,0x00]
 
-v_med3_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xae,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0xd2,0x00,0x00]
 
-v_med3_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x08,0xae,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0xd4,0x00,0x00]
 
-v_med3_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0xae,0xd2,0x00,0x00,0x00,0x08]
+v_cmp_lg_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0xd6,0x00,0x00]
 
-v_med3_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0xae,0xd2,0x00,0x00,0x00,0x10]
+v_cmp_lg_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0xd8,0x00,0x00]
 
-v_med3_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0xae,0xd2,0x00,0x00,0x00,0x18]
+v_cmp_lg_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0xda,0x00,0x00]
 
-v_med3_i32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xb0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0xdc,0x00,0x00]
 
-v_med3_i32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xb0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0xde,0x00,0x00]
 
-v_med3_i32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xb0,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0xf6,0x00,0x00]
 
-v_med3_i32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xb0,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0xf8,0x00,0x00]
 
-v_med3_i32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xb0,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0xfc,0x00,0x00]
 
-v_med3_i32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xb0,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0xfe,0x00,0x00]
 
-v_med3_i32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xb0,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0x00,0x01,0x00]
 
-v_med3_i32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xb0,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0xe0,0x01,0x00]
 
-v_med3_i32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xb0,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0xfa,0x01,0x00]
 
-v_med3_i32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xb0,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0x04,0x02,0x00]
 
-v_med3_i32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xb0,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0xfe,0x03,0x00]
 
-v_med3_i32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xb0,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x0a,0xd0,0x80,0x04,0x00,0x40]
 
-v_med3_i32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xb0,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_ge_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x0c,0x7c]
 
-v_med3_i32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xb0,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_ge_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x0c,0x7c]
 
-v_med3_i32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xb0,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_ge_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0c,0x7c]
 
-v_med3_i32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xb0,0xd2,0x00,0x00,0x04,0x03]
+v_cmp_ge_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0c,0x7c]
 
-v_med3_i32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xb0,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_ge_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0c,0x7c]
 
-v_med3_i32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xb0,0xd2,0x00,0x00,0xdc,0x03]
+v_cmp_ge_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0c,0x7c]
 
-v_med3_i32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xb0,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_ge_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0c,0x7c]
 
-v_med3_i32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xb0,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_ge_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0c,0x7c]
 
-v_med3_u32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xb2,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ge_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0c,0x7c]
 
-v_med3_u32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xb2,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ge_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0c,0x7c]
 
-v_med3_u32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xb2,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_ge_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0c,0x7c]
 
-v_med3_u32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xb2,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_ge_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x0c,0x7c]
 
-v_med3_u32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xb2,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_ge_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0c,0x7c]
 
-v_med3_u32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xb2,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_ge_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0c,0x7c]
 
-v_med3_u32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xb2,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_ge_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x0c,0x7c]
 
-v_med3_u32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xb2,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_ge_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x0c,0x7c]
 
-v_med3_u32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xb2,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_ge_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x0c,0x7c]
 
-v_med3_u32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xb2,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_ge_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x0c,0x7c]
 
-v_med3_u32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xb2,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_ge_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0c,0x7c,0x56,0x34,0x12,0xaf]
 
-v_med3_u32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xb2,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_ge_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0c,0x7c,0x73,0x72,0x71,0x3f]
 
-v_med3_u32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xb2,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_ge_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x0c,0x7c]
 
-v_med3_u32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xb2,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_ge_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x0c,0x7c]
 
-v_med3_u32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xb2,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_ge_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x0d,0x7c]
 
-v_med3_u32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xb2,0xd2,0x00,0x00,0x04,0x03]
+v_cmp_ge_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_u32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xb2,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_ge_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x0c,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_u32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xb2,0xd2,0x00,0x00,0xdc,0x03]
+v_cmp_ge_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x0c,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_u32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xb2,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_ge_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x0c,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_u32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xb2,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_ge_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x0c,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_u8 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xb4,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ge_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x0c,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_u8 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xb4,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ge_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x0c,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_u8 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xb4,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_ge_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x0c,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_u8 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xb4,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_ge_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x0c,0xd0,0xf0,0x04,0x00,0x00]
 
-v_sad_u8 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xb4,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_ge_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x01,0x05,0x00,0x00]
 
-v_sad_u8 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xb4,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_ge_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x0c,0xd0,0xff,0x05,0x00,0x00]
 
-v_sad_u8 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xb4,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_ge_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0xce,0x00,0x00]
 
-v_sad_u8 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xb4,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_ge_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0xd0,0x00,0x00]
 
-v_sad_u8 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xb4,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_ge_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0xd2,0x00,0x00]
 
-v_sad_u8 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xb4,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_ge_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0xd4,0x00,0x00]
 
-v_sad_u8 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xb4,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_ge_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0xd6,0x00,0x00]
 
-v_sad_u8 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xb4,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_ge_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0xd8,0x00,0x00]
 
-v_sad_u8 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xb4,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_ge_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0xda,0x00,0x00]
 
-v_sad_u8 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xb4,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_ge_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0xdc,0x00,0x00]
 
-v_sad_u8 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xb4,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_ge_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0xde,0x00,0x00]
 
-v_sad_u8 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xb4,0xd2,0x00,0x00,0x04,0x03]
+v_cmp_ge_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0xf6,0x00,0x00]
 
-v_sad_u8 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xb4,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_ge_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0xf8,0x00,0x00]
 
-v_sad_u8 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xb4,0xd2,0x00,0x00,0xdc,0x03]
+v_cmp_ge_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0xfc,0x00,0x00]
 
-v_sad_u8 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xb4,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_ge_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0xfe,0x00,0x00]
 
-v_sad_u8 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xb4,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_ge_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0x00,0x01,0x00]
 
-v_sad_hi_u8 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xb6,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ge_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0xe0,0x01,0x00]
 
-v_sad_hi_u8 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xb6,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ge_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0xfa,0x01,0x00]
 
-v_sad_hi_u8 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xb6,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_ge_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0x04,0x02,0x00]
 
-v_sad_hi_u8 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xb6,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_ge_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0xfe,0x03,0x00]
 
-v_sad_hi_u8 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xb6,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_ge_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x0c,0xd0,0x80,0x04,0x00,0x40]
 
-v_sad_hi_u8 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xb6,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_o_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x0e,0x7c]
 
-v_sad_hi_u8 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xb6,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_o_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x0e,0x7c]
 
-v_sad_hi_u8 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xb6,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_o_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0e,0x7c]
 
-v_sad_hi_u8 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xb6,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_o_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0e,0x7c]
 
-v_sad_hi_u8 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xb6,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_o_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0e,0x7c]
 
-v_sad_hi_u8 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xb6,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_o_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0e,0x7c]
 
-v_sad_hi_u8 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xb6,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_o_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0e,0x7c]
 
-v_sad_hi_u8 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xb6,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_o_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0e,0x7c]
 
-v_sad_hi_u8 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xb6,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_o_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0e,0x7c]
 
-v_sad_hi_u8 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xb6,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_o_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0e,0x7c]
 
-v_sad_hi_u8 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xb6,0xd2,0x00,0x00,0x04,0x03]
+v_cmp_o_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0e,0x7c]
 
-v_sad_hi_u8 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xb6,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_o_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x0e,0x7c]
 
-v_sad_hi_u8 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xb6,0xd2,0x00,0x00,0xdc,0x03]
+v_cmp_o_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0e,0x7c]
 
-v_sad_hi_u8 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xb6,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_o_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0e,0x7c]
 
-v_sad_hi_u8 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xb6,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_o_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x0e,0x7c]
 
-v_sad_u16 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xb8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_o_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x0e,0x7c]
 
-v_sad_u16 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xb8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_o_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x0e,0x7c]
 
-v_sad_u16 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xb8,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_o_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x0e,0x7c]
 
-v_sad_u16 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xb8,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_o_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0e,0x7c,0x56,0x34,0x12,0xaf]
 
-v_sad_u16 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xb8,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_o_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0e,0x7c,0x73,0x72,0x71,0x3f]
 
-v_sad_u16 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xb8,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_o_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x0e,0x7c]
 
-v_sad_u16 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xb8,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_o_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x0e,0x7c]
 
-v_sad_u16 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xb8,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_o_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x0f,0x7c]
 
-v_sad_u16 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xb8,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_u16 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xb8,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_o_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x0e,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_u16 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xb8,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_o_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x0e,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_u16 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xb8,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_o_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x0e,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_u16 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xb8,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_o_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x0e,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_u16 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xb8,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_o_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x0e,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_u16 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xb8,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_o_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x0e,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_u16 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xb8,0xd2,0x00,0x00,0x04,0x03]
+v_cmp_o_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x0e,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_u16 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xb8,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_o_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x0e,0xd0,0xf0,0x04,0x00,0x00]
 
-v_sad_u16 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xb8,0xd2,0x00,0x00,0xdc,0x03]
+v_cmp_o_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x01,0x05,0x00,0x00]
 
-v_sad_u16 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xb8,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_o_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x0e,0xd0,0xff,0x05,0x00,0x00]
 
-v_sad_u16 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xb8,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_o_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0xce,0x00,0x00]
 
-v_sad_u32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xba,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0xd0,0x00,0x00]
 
-v_sad_u32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xba,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0xd2,0x00,0x00]
 
-v_sad_u32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xba,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0xd4,0x00,0x00]
 
-v_sad_u32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xba,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0xd6,0x00,0x00]
 
-v_sad_u32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xba,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0xd8,0x00,0x00]
 
-v_sad_u32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xba,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0xda,0x00,0x00]
 
-v_sad_u32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xba,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0xdc,0x00,0x00]
 
-v_sad_u32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xba,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0xde,0x00,0x00]
 
-v_sad_u32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xba,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0xf6,0x00,0x00]
 
-v_sad_u32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xba,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0xf8,0x00,0x00]
 
-v_sad_u32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xba,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0xfc,0x00,0x00]
 
-v_sad_u32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xba,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0xfe,0x00,0x00]
 
-v_sad_u32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xba,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0x00,0x01,0x00]
 
-v_sad_u32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xba,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0xe0,0x01,0x00]
 
-v_sad_u32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xba,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_o_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0xfa,0x01,0x00]
 
-v_sad_u32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xba,0xd2,0x00,0x00,0x04,0x03]
+v_cmp_o_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0x04,0x02,0x00]
 
-v_sad_u32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xba,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_o_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0xfe,0x03,0x00]
 
-v_sad_u32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xba,0xd2,0x00,0x00,0xdc,0x03]
+v_cmp_o_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x0e,0xd0,0x80,0x04,0x00,0x40]
 
-v_sad_u32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xba,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_u_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x10,0x7c]
 
-v_sad_u32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xba,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_u_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x10,0x7c]
 
-v_cvt_pk_u8_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xbc,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_u_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x10,0x7c]
 
-v_cvt_pk_u8_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xbc,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_u_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x10,0x7c]
 
-v_cvt_pk_u8_f32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xbc,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_u_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x10,0x7c]
 
-v_cvt_pk_u8_f32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xbc,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_u_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x10,0x7c]
 
-v_cvt_pk_u8_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xbc,0xd2,0xfd,0x00,0x00,0x00]
+v_cmp_u_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x10,0x7c]
 
-v_cvt_pk_u8_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xbc,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_u_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x10,0x7c]
 
-v_cvt_pk_u8_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xbc,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_u_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x10,0x7c]
 
-v_cvt_pk_u8_f32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xbc,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_u_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x10,0x7c]
 
-v_cvt_pk_u8_f32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xbc,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_u_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x10,0x7c]
 
-v_cvt_pk_u8_f32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xbc,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_u_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x10,0x7c]
 
-v_cvt_pk_u8_f32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xbc,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_u_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x10,0x7c]
 
-v_cvt_pk_u8_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xbc,0xd2,0x00,0xfa,0x01,0x00]
+v_cmp_u_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x10,0x7c]
 
-v_cvt_pk_u8_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xbc,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_u_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x10,0x7c]
 
-v_cvt_pk_u8_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xbc,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_u_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x10,0x7c]
 
-v_cvt_pk_u8_f32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xbc,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_u_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x10,0x7c]
 
-v_cvt_pk_u8_f32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xbc,0xd2,0x00,0x00,0x04,0x03]
+v_cmp_u_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x10,0x7c]
 
-v_cvt_pk_u8_f32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xbc,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_u_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x10,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cvt_pk_u8_f32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xbc,0xd2,0x00,0x00,0xdc,0x03]
+v_cmp_u_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x10,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cvt_pk_u8_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xbc,0xd2,0x00,0x00,0xf4,0x03]
+v_cmp_u_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x10,0x7c]
 
-v_cvt_pk_u8_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xbc,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_u_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x10,0x7c]
 
-v_cvt_pk_u8_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xbc,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_u_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x11,0x7c]
 
-v_div_fixup_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xbe,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fixup_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xbe,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x10,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fixup_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xbe,0xd2,0xfd,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x10,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fixup_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xbe,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_u_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x10,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fixup_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xbe,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_u_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x10,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fixup_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xbe,0xd2,0x00,0xfa,0x01,0x00]
+v_cmp_u_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x10,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fixup_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xbe,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_u_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x10,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fixup_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xbe,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_u_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x10,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fixup_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xbe,0xd2,0x00,0x00,0xf4,0x03]
+v_cmp_u_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x10,0xd0,0xf0,0x04,0x00,0x00]
 
-v_div_fixup_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xbe,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_u_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x10,0xd0,0x01,0x05,0x00,0x00]
 
-v_div_fixup_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xbe,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_u_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x10,0xd0,0xff,0x05,0x00,0x00]
 
-v_div_fixup_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xbe,0xd2,0x00,0x00,0x00,0x20]
+v_cmp_u_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0xce,0x00,0x00]
 
-v_div_fixup_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xbe,0xd2,0x00,0x00,0x00,0x40]
+v_cmp_u_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0xd0,0x00,0x00]
 
-v_div_fixup_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xbe,0xd2,0x00,0x00,0x00,0x80]
+v_cmp_u_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0xd2,0x00,0x00]
 
-v_div_fixup_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xbe,0xd2,0x00,0x00,0x00,0xe0]
+v_cmp_u_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0xd4,0x00,0x00]
 
-v_div_fixup_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xbe,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0xd6,0x00,0x00]
 
-v_div_fixup_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xbe,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0xd8,0x00,0x00]
 
-v_div_fixup_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xbe,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0xda,0x00,0x00]
 
-v_div_fixup_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xbe,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0xdc,0x00,0x00]
 
-v_div_fixup_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x08,0xbe,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0xde,0x00,0x00]
 
-v_div_fixup_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0xbe,0xd2,0x00,0x00,0x00,0x08]
+v_cmp_u_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0xf6,0x00,0x00]
 
-v_div_fixup_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0xbe,0xd2,0x00,0x00,0x00,0x10]
+v_cmp_u_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0xf8,0x00,0x00]
 
-v_div_fixup_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0xbe,0xd2,0x00,0x00,0x00,0x18]
+v_cmp_u_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0xfc,0x00,0x00]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0xfe,0x00,0x00]
 
-v_div_fixup_f64 v[254:255], s[0:1], s[0:1], s[0:1]
-// CHECK: [0xfe,0x00,0xc0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0x00,0x01,0x00]
 
-v_div_fixup_f64 v[0:1], scc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd2,0xfd,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0xe0,0x01,0x00]
 
-v_div_fixup_f64 v[0:1], v[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0xfa,0x01,0x00]
 
-v_div_fixup_f64 v[0:1], v[254:255], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd2,0xfe,0x01,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0x04,0x02,0x00]
 
-v_div_fixup_f64 v[0:1], s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd2,0x00,0xfa,0x01,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0xfe,0x03,0x00]
 
-v_div_fixup_f64 v[0:1], s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x10,0xd0,0x80,0x04,0x00,0x40]
 
-v_div_fixup_f64 v[0:1], s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd2,0x00,0xfc,0x03,0x00]
+v_cmp_nge_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x12,0x7c]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xc0,0xd2,0x00,0x00,0xf4,0x03]
+v_cmp_nge_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x12,0x7c]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_nge_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x12,0x7c]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xc0,0xd2,0x00,0x00,0xf8,0x07]
+v_cmp_nge_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x12,0x7c]
 
-v_div_fixup_f64 v[0:1], -s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd2,0x00,0x00,0x00,0x20]
+v_cmp_nge_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x12,0x7c]
 
-v_div_fixup_f64 v[0:1], s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd2,0x00,0x00,0x00,0x40]
+v_cmp_nge_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x12,0x7c]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd2,0x00,0x00,0x00,0x80]
+v_cmp_nge_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x12,0x7c]
 
-v_div_fixup_f64 v[0:1], -s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd2,0x00,0x00,0x00,0xe0]
+v_cmp_nge_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x12,0x7c]
 
-v_div_fixup_f64 v[0:1], |s[0:1]|, s[0:1], s[0:1]
-// CHECK: [0x00,0x01,0xc0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x12,0x7c]
 
-v_div_fixup_f64 v[0:1], s[0:1], |s[0:1]|, s[0:1]
-// CHECK: [0x00,0x02,0xc0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x12,0x7c]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], |s[0:1]|
-// CHECK: [0x00,0x04,0xc0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x12,0x7c]
 
-v_div_fixup_f64 v[0:1], |s[0:1]|, |s[0:1]|, |s[0:1]|
-// CHECK: [0x00,0x07,0xc0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x12,0x7c]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x08,0xc0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x12,0x7c]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0xc0,0xd2,0x00,0x00,0x00,0x08]
+v_cmp_nge_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x12,0x7c]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0xc0,0xd2,0x00,0x00,0x00,0x10]
+v_cmp_nge_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x12,0x7c]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0xc0,0xd2,0x00,0x00,0x00,0x18]
+v_cmp_nge_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x12,0x7c]
 
-v_lshl_b64 v[0:1], s[0:1], s0
-// CHECK: [0x00,0x00,0xc2,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x12,0x7c]
 
-v_lshl_b64 v[254:255], s[0:1], s0
-// CHECK: [0xfe,0x00,0xc2,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x12,0x7c]
 
-v_lshl_b64 v[0:1], 0, s0
-// CHECK: [0x00,0x00,0xc2,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_nge_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x12,0x7c,0x56,0x34,0x12,0xaf]
 
-v_lshl_b64 v[0:1], -1, s0
-// CHECK: [0x00,0x00,0xc2,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_nge_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x12,0x7c,0x73,0x72,0x71,0x3f]
 
-v_lshl_b64 v[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xc2,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_nge_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x12,0x7c]
 
-v_lshl_b64 v[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xc2,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_nge_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x12,0x7c]
 
-v_lshl_b64 v[0:1], v[0:1], s0
-// CHECK: [0x00,0x00,0xc2,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_nge_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x13,0x7c]
 
-v_lshl_b64 v[0:1], v[254:255], s0
-// CHECK: [0x00,0x00,0xc2,0xd2,0xfe,0x01,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0x04,0x00,0x00]
 
-v_lshl_b64 v[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xc2,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_nge_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x12,0xd0,0x80,0x04,0x00,0x00]
 
-v_lshl_b64 v[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xc2,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_nge_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x12,0xd0,0x80,0x04,0x00,0x00]
 
-v_lshl_b64 v[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xc2,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_nge_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x12,0xd0,0x80,0x04,0x00,0x00]
 
-v_lshl_b64 v[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xc2,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_nge_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x12,0xd0,0x80,0x04,0x00,0x00]
 
-v_lshl_b64 v[0:1], s[0:1], v0
-// CHECK: [0x00,0x00,0xc2,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_nge_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x12,0xd0,0x80,0x04,0x00,0x00]
 
-v_lshl_b64 v[0:1], s[0:1], v255
-// CHECK: [0x00,0x00,0xc2,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_nge_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x12,0xd0,0x80,0x04,0x00,0x00]
 
-v_lshr_b64 v[0:1], s[0:1], s0
-// CHECK: [0x00,0x00,0xc4,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x12,0xd0,0x80,0x04,0x00,0x00]
 
-v_lshr_b64 v[254:255], s[0:1], s0
-// CHECK: [0xfe,0x00,0xc4,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x12,0xd0,0xf0,0x04,0x00,0x00]
 
-v_lshr_b64 v[0:1], 0, s0
-// CHECK: [0x00,0x00,0xc4,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x12,0xd0,0x01,0x05,0x00,0x00]
 
-v_lshr_b64 v[0:1], -1, s0
-// CHECK: [0x00,0x00,0xc4,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x12,0xd0,0xff,0x05,0x00,0x00]
 
-v_lshr_b64 v[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xc4,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0xce,0x00,0x00]
 
-v_lshr_b64 v[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xc4,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0xd0,0x00,0x00]
 
-v_lshr_b64 v[0:1], v[0:1], s0
-// CHECK: [0x00,0x00,0xc4,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0xd2,0x00,0x00]
 
-v_lshr_b64 v[0:1], v[254:255], s0
-// CHECK: [0x00,0x00,0xc4,0xd2,0xfe,0x01,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0xd4,0x00,0x00]
 
-v_lshr_b64 v[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xc4,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0xd6,0x00,0x00]
 
-v_lshr_b64 v[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xc4,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0xd8,0x00,0x00]
 
-v_lshr_b64 v[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xc4,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0xda,0x00,0x00]
 
-v_lshr_b64 v[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xc4,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0xdc,0x00,0x00]
 
-v_lshr_b64 v[0:1], s[0:1], v0
-// CHECK: [0x00,0x00,0xc4,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0xde,0x00,0x00]
 
-v_lshr_b64 v[0:1], s[0:1], v255
-// CHECK: [0x00,0x00,0xc4,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0xf6,0x00,0x00]
 
-v_ashr_i64 v[0:1], s[0:1], s0
-// CHECK: [0x00,0x00,0xc6,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0xf8,0x00,0x00]
 
-v_ashr_i64 v[254:255], s[0:1], s0
-// CHECK: [0xfe,0x00,0xc6,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0xfc,0x00,0x00]
 
-v_ashr_i64 v[0:1], 0, s0
-// CHECK: [0x00,0x00,0xc6,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0xfe,0x00,0x00]
 
-v_ashr_i64 v[0:1], -1, s0
-// CHECK: [0x00,0x00,0xc6,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0x00,0x01,0x00]
 
-v_ashr_i64 v[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xc6,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0xe0,0x01,0x00]
 
-v_ashr_i64 v[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xc6,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0xfa,0x01,0x00]
 
-v_ashr_i64 v[0:1], v[0:1], s0
-// CHECK: [0x00,0x00,0xc6,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0x04,0x02,0x00]
 
-v_ashr_i64 v[0:1], v[254:255], s0
-// CHECK: [0x00,0x00,0xc6,0xd2,0xfe,0x01,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0xfe,0x03,0x00]
 
-v_ashr_i64 v[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xc6,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x12,0xd0,0x80,0x04,0x00,0x40]
 
-v_ashr_i64 v[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xc6,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_nlg_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x14,0x7c]
 
-v_ashr_i64 v[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xc6,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_nlg_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x14,0x7c]
 
-v_ashr_i64 v[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xc6,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_nlg_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x14,0x7c]
 
-v_ashr_i64 v[0:1], s[0:1], v0
-// CHECK: [0x00,0x00,0xc6,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_nlg_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x14,0x7c]
 
-v_ashr_i64 v[0:1], s[0:1], v255
-// CHECK: [0x00,0x00,0xc6,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_nlg_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x14,0x7c]
 
-v_add_f64 v[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x14,0x7c]
 
-v_add_f64 v[254:255], s[0:1], s[0:1]
-// CHECK: [0xfe,0x00,0xc8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x14,0x7c]
 
-v_add_f64 v[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd2,0xfd,0x00,0x00,0x00]
+v_cmp_nlg_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x14,0x7c]
 
-v_add_f64 v[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_nlg_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x14,0x7c]
 
-v_add_f64 v[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd2,0xfe,0x01,0x00,0x00]
+v_cmp_nlg_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x14,0x7c]
 
-v_add_f64 v[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xc8,0xd2,0x00,0xfa,0x01,0x00]
+v_cmp_nlg_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x14,0x7c]
 
-v_add_f64 v[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_nlg_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x14,0x7c]
 
-v_add_f64 v[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xc8,0xd2,0x00,0xfc,0x03,0x00]
+v_cmp_nlg_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x14,0x7c]
 
-v_add_f64 v[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd2,0x00,0x00,0x00,0x20]
+v_cmp_nlg_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x14,0x7c]
 
-v_add_f64 v[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd2,0x00,0x00,0x00,0x40]
+v_cmp_nlg_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x14,0x7c]
 
-v_add_f64 v[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd2,0x00,0x00,0x00,0x60]
+v_cmp_nlg_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x14,0x7c]
 
-v_add_f64 v[0:1], |s[0:1]|, s[0:1]
-// CHECK: [0x00,0x01,0xc8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x14,0x7c]
 
-v_add_f64 v[0:1], s[0:1], |s[0:1]|
-// CHECK: [0x00,0x02,0xc8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x14,0x7c]
 
-v_add_f64 v[0:1], |s[0:1]|, |s[0:1]|
-// CHECK: [0x00,0x03,0xc8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x14,0x7c,0x56,0x34,0x12,0xaf]
 
-v_add_f64 v[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x08,0xc8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x14,0x7c,0x73,0x72,0x71,0x3f]
 
-v_add_f64 v[0:1], s[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0xc8,0xd2,0x00,0x00,0x00,0x08]
+v_cmp_nlg_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x14,0x7c]
 
-v_add_f64 v[0:1], s[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0xc8,0xd2,0x00,0x00,0x00,0x10]
+v_cmp_nlg_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x14,0x7c]
 
-v_add_f64 v[0:1], s[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0xc8,0xd2,0x00,0x00,0x00,0x18]
+v_cmp_nlg_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x15,0x7c]
 
-v_mul_f64 v[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_f64 v[254:255], s[0:1], s[0:1]
-// CHECK: [0xfe,0x00,0xca,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x14,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_f64 v[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd2,0xfd,0x00,0x00,0x00]
+v_cmp_nlg_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x14,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_f64 v[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_nlg_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x14,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_f64 v[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd2,0xfe,0x01,0x00,0x00]
+v_cmp_nlg_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x14,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_f64 v[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xca,0xd2,0x00,0xfa,0x01,0x00]
+v_cmp_nlg_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x14,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_f64 v[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xca,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_nlg_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x14,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_f64 v[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xca,0xd2,0x00,0xfc,0x03,0x00]
+v_cmp_nlg_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x14,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_f64 v[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd2,0x00,0x00,0x00,0x20]
+v_cmp_nlg_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x14,0xd0,0xf0,0x04,0x00,0x00]
 
-v_mul_f64 v[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd2,0x00,0x00,0x00,0x40]
+v_cmp_nlg_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x14,0xd0,0x01,0x05,0x00,0x00]
 
-v_mul_f64 v[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd2,0x00,0x00,0x00,0x60]
+v_cmp_nlg_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x14,0xd0,0xff,0x05,0x00,0x00]
 
-v_mul_f64 v[0:1], |s[0:1]|, s[0:1]
-// CHECK: [0x00,0x01,0xca,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0xce,0x00,0x00]
 
-v_mul_f64 v[0:1], s[0:1], |s[0:1]|
-// CHECK: [0x00,0x02,0xca,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0xd0,0x00,0x00]
 
-v_mul_f64 v[0:1], |s[0:1]|, |s[0:1]|
-// CHECK: [0x00,0x03,0xca,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0xd2,0x00,0x00]
 
-v_mul_f64 v[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x08,0xca,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0xd4,0x00,0x00]
 
-v_mul_f64 v[0:1], s[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0xca,0xd2,0x00,0x00,0x00,0x08]
+v_cmp_nlg_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0xd6,0x00,0x00]
 
-v_mul_f64 v[0:1], s[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0xca,0xd2,0x00,0x00,0x00,0x10]
+v_cmp_nlg_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0xd8,0x00,0x00]
 
-v_mul_f64 v[0:1], s[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0xca,0xd2,0x00,0x00,0x00,0x18]
+v_cmp_nlg_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0xda,0x00,0x00]
 
-v_min_f64 v[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0xdc,0x00,0x00]
 
-v_min_f64 v[254:255], s[0:1], s[0:1]
-// CHECK: [0xfe,0x00,0xcc,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0xde,0x00,0x00]
 
-v_min_f64 v[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd2,0xfd,0x00,0x00,0x00]
+v_cmp_nlg_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0xf6,0x00,0x00]
 
-v_min_f64 v[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_nlg_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0xf8,0x00,0x00]
 
-v_min_f64 v[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd2,0xfe,0x01,0x00,0x00]
+v_cmp_nlg_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0xfc,0x00,0x00]
 
-v_min_f64 v[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xcc,0xd2,0x00,0xfa,0x01,0x00]
+v_cmp_nlg_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0xfe,0x00,0x00]
 
-v_min_f64 v[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_nlg_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0x00,0x01,0x00]
 
-v_min_f64 v[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xcc,0xd2,0x00,0xfc,0x03,0x00]
+v_cmp_nlg_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0xe0,0x01,0x00]
 
-v_min_f64 v[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd2,0x00,0x00,0x00,0x20]
+v_cmp_nlg_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0xfa,0x01,0x00]
 
-v_min_f64 v[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd2,0x00,0x00,0x00,0x40]
+v_cmp_nlg_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0x04,0x02,0x00]
 
-v_min_f64 v[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd2,0x00,0x00,0x00,0x60]
+v_cmp_nlg_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0xfe,0x03,0x00]
 
-v_min_f64 v[0:1], |s[0:1]|, s[0:1]
-// CHECK: [0x00,0x01,0xcc,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x14,0xd0,0x80,0x04,0x00,0x40]
 
-v_min_f64 v[0:1], s[0:1], |s[0:1]|
-// CHECK: [0x00,0x02,0xcc,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x16,0x7c]
 
-v_min_f64 v[0:1], |s[0:1]|, |s[0:1]|
-// CHECK: [0x00,0x03,0xcc,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x16,0x7c]
 
-v_min_f64 v[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x08,0xcc,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x16,0x7c]
 
-v_min_f64 v[0:1], s[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0xcc,0xd2,0x00,0x00,0x00,0x08]
+v_cmp_ngt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x16,0x7c]
 
-v_min_f64 v[0:1], s[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0xcc,0xd2,0x00,0x00,0x00,0x10]
+v_cmp_ngt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x16,0x7c]
 
-v_min_f64 v[0:1], s[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0xcc,0xd2,0x00,0x00,0x00,0x18]
+v_cmp_ngt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x16,0x7c]
 
-v_max_f64 v[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x16,0x7c]
 
-v_max_f64 v[254:255], s[0:1], s[0:1]
-// CHECK: [0xfe,0x00,0xce,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x16,0x7c]
 
-v_max_f64 v[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd2,0xfd,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x16,0x7c]
 
-v_max_f64 v[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_ngt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x16,0x7c]
 
-v_max_f64 v[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd2,0xfe,0x01,0x00,0x00]
+v_cmp_ngt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x16,0x7c]
 
-v_max_f64 v[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xce,0xd2,0x00,0xfa,0x01,0x00]
+v_cmp_ngt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x16,0x7c]
 
-v_max_f64 v[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xce,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_ngt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x16,0x7c]
 
-v_max_f64 v[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xce,0xd2,0x00,0xfc,0x03,0x00]
+v_cmp_ngt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x16,0x7c]
 
-v_max_f64 v[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd2,0x00,0x00,0x00,0x20]
+v_cmp_ngt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x16,0x7c]
 
-v_max_f64 v[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd2,0x00,0x00,0x00,0x40]
+v_cmp_ngt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x16,0x7c]
 
-v_max_f64 v[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd2,0x00,0x00,0x00,0x60]
+v_cmp_ngt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x16,0x7c]
 
-v_max_f64 v[0:1], |s[0:1]|, s[0:1]
-// CHECK: [0x00,0x01,0xce,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x16,0x7c]
 
-v_max_f64 v[0:1], s[0:1], |s[0:1]|
-// CHECK: [0x00,0x02,0xce,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x16,0x7c,0x56,0x34,0x12,0xaf]
 
-v_max_f64 v[0:1], |s[0:1]|, |s[0:1]|
-// CHECK: [0x00,0x03,0xce,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x16,0x7c,0x73,0x72,0x71,0x3f]
 
-v_max_f64 v[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x08,0xce,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x16,0x7c]
 
-v_max_f64 v[0:1], s[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0xce,0xd2,0x00,0x00,0x00,0x08]
+v_cmp_ngt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x16,0x7c]
 
-v_max_f64 v[0:1], s[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0xce,0xd2,0x00,0x00,0x00,0x10]
+v_cmp_ngt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x17,0x7c]
 
-v_max_f64 v[0:1], s[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0xce,0xd2,0x00,0x00,0x00,0x18]
+v_cmp_ngt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0x04,0x00,0x00]
 
-v_ldexp_f64 v[0:1], s[0:1], s0
-// CHECK: [0x00,0x00,0xd0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x16,0xd0,0x80,0x04,0x00,0x00]
 
-v_ldexp_f64 v[254:255], s[0:1], s0
-// CHECK: [0xfe,0x00,0xd0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x16,0xd0,0x80,0x04,0x00,0x00]
 
-v_ldexp_f64 v[0:1], scc, s0
-// CHECK: [0x00,0x00,0xd0,0xd2,0xfd,0x00,0x00,0x00]
+v_cmp_ngt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x16,0xd0,0x80,0x04,0x00,0x00]
 
-v_ldexp_f64 v[0:1], v[0:1], s0
-// CHECK: [0x00,0x00,0xd0,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_ngt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x16,0xd0,0x80,0x04,0x00,0x00]
 
-v_ldexp_f64 v[0:1], v[254:255], s0
-// CHECK: [0x00,0x00,0xd0,0xd2,0xfe,0x01,0x00,0x00]
+v_cmp_ngt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x16,0xd0,0x80,0x04,0x00,0x00]
 
-v_ldexp_f64 v[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xd0,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_ngt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x16,0xd0,0x80,0x04,0x00,0x00]
 
-v_ldexp_f64 v[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xd0,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_ngt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x16,0xd0,0x80,0x04,0x00,0x00]
 
-v_ldexp_f64 v[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xd0,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x16,0xd0,0xf0,0x04,0x00,0x00]
 
-v_ldexp_f64 v[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xd0,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_ngt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x16,0xd0,0x01,0x05,0x00,0x00]
 
-v_ldexp_f64 v[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xd0,0xd2,0x00,0xfa,0x01,0x00]
+v_cmp_ngt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x16,0xd0,0xff,0x05,0x00,0x00]
 
-v_ldexp_f64 v[0:1], s[0:1], v0
-// CHECK: [0x00,0x00,0xd0,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0xce,0x00,0x00]
 
-v_ldexp_f64 v[0:1], s[0:1], v255
-// CHECK: [0x00,0x00,0xd0,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0xd0,0x00,0x00]
 
-v_ldexp_f64 v[0:1], -s[0:1], s0
-// CHECK: [0x00,0x00,0xd0,0xd2,0x00,0x00,0x00,0x20]
+v_cmp_ngt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0xd2,0x00,0x00]
 
-v_ldexp_f64 v[0:1], |s[0:1]|, s0
-// CHECK: [0x00,0x01,0xd0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0xd4,0x00,0x00]
 
-v_ldexp_f64 v[0:1], s[0:1], s0 clamp
-// CHECK: [0x00,0x08,0xd0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0xd6,0x00,0x00]
 
-v_ldexp_f64 v[0:1], s[0:1], s0 mul:2
-// CHECK: [0x00,0x00,0xd0,0xd2,0x00,0x00,0x00,0x08]
+v_cmp_ngt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0xd8,0x00,0x00]
 
-v_ldexp_f64 v[0:1], s[0:1], s0 mul:4
-// CHECK: [0x00,0x00,0xd0,0xd2,0x00,0x00,0x00,0x10]
+v_cmp_ngt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0xda,0x00,0x00]
 
-v_ldexp_f64 v[0:1], s[0:1], s0 div:2
-// CHECK: [0x00,0x00,0xd0,0xd2,0x00,0x00,0x00,0x18]
+v_cmp_ngt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0xdc,0x00,0x00]
 
-v_mul_lo_u32 v0, s0, s0
-// CHECK: [0x00,0x00,0xd2,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0xde,0x00,0x00]
 
-v_mul_lo_u32 v255, s0, s0
-// CHECK: [0xff,0x00,0xd2,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0xf6,0x00,0x00]
 
-v_mul_lo_u32 v0, 0, s0
-// CHECK: [0x00,0x00,0xd2,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0xf8,0x00,0x00]
 
-v_mul_lo_u32 v0, -1, s0
-// CHECK: [0x00,0x00,0xd2,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0xfc,0x00,0x00]
 
-v_mul_lo_u32 v0, 0.5, s0
-// CHECK: [0x00,0x00,0xd2,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0xfe,0x00,0x00]
 
-v_mul_lo_u32 v0, -4.0, s0
-// CHECK: [0x00,0x00,0xd2,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0x00,0x01,0x00]
 
-v_mul_lo_u32 v0, v0, s0
-// CHECK: [0x00,0x00,0xd2,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0xe0,0x01,0x00]
 
-v_mul_lo_u32 v0, v255, s0
-// CHECK: [0x00,0x00,0xd2,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0xfa,0x01,0x00]
 
-v_mul_lo_u32 v0, s0, 0
-// CHECK: [0x00,0x00,0xd2,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0x04,0x02,0x00]
 
-v_mul_lo_u32 v0, s0, -1
-// CHECK: [0x00,0x00,0xd2,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0xfe,0x03,0x00]
 
-v_mul_lo_u32 v0, s0, 0.5
-// CHECK: [0x00,0x00,0xd2,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x16,0xd0,0x80,0x04,0x00,0x40]
 
-v_mul_lo_u32 v0, s0, -4.0
-// CHECK: [0x00,0x00,0xd2,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_nle_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x18,0x7c]
 
-v_mul_lo_u32 v0, s0, v0
-// CHECK: [0x00,0x00,0xd2,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_nle_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x18,0x7c]
 
-v_mul_lo_u32 v0, s0, v255
-// CHECK: [0x00,0x00,0xd2,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_nle_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x18,0x7c]
 
-v_mul_hi_u32 v0, s0, s0
-// CHECK: [0x00,0x00,0xd4,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nle_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x18,0x7c]
 
-v_mul_hi_u32 v255, s0, s0
-// CHECK: [0xff,0x00,0xd4,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nle_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x18,0x7c]
 
-v_mul_hi_u32 v0, 0, s0
-// CHECK: [0x00,0x00,0xd4,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_nle_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x18,0x7c]
 
-v_mul_hi_u32 v0, -1, s0
-// CHECK: [0x00,0x00,0xd4,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_nle_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x18,0x7c]
 
-v_mul_hi_u32 v0, 0.5, s0
-// CHECK: [0x00,0x00,0xd4,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_nle_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x18,0x7c]
 
-v_mul_hi_u32 v0, -4.0, s0
-// CHECK: [0x00,0x00,0xd4,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_nle_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x18,0x7c]
 
-v_mul_hi_u32 v0, v0, s0
-// CHECK: [0x00,0x00,0xd4,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_nle_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x18,0x7c]
 
-v_mul_hi_u32 v0, v255, s0
-// CHECK: [0x00,0x00,0xd4,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_nle_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x18,0x7c]
 
-v_mul_hi_u32 v0, s0, 0
-// CHECK: [0x00,0x00,0xd4,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_nle_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x18,0x7c]
 
-v_mul_hi_u32 v0, s0, -1
-// CHECK: [0x00,0x00,0xd4,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_nle_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x18,0x7c]
 
-v_mul_hi_u32 v0, s0, 0.5
-// CHECK: [0x00,0x00,0xd4,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_nle_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x18,0x7c]
 
-v_mul_hi_u32 v0, s0, -4.0
-// CHECK: [0x00,0x00,0xd4,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_nle_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x18,0x7c]
 
-v_mul_hi_u32 v0, s0, v0
-// CHECK: [0x00,0x00,0xd4,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_nle_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x18,0x7c]
 
-v_mul_hi_u32 v0, s0, v255
-// CHECK: [0x00,0x00,0xd4,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_nle_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x18,0x7c]
 
-v_mul_lo_i32 v0, s0, s0
-// CHECK: [0x00,0x00,0xd6,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nle_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x18,0x7c]
 
-v_mul_lo_i32 v255, s0, s0
-// CHECK: [0xff,0x00,0xd6,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nle_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x18,0x7c,0x56,0x34,0x12,0xaf]
 
-v_mul_lo_i32 v0, 0, s0
-// CHECK: [0x00,0x00,0xd6,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_nle_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x18,0x7c,0x73,0x72,0x71,0x3f]
 
-v_mul_lo_i32 v0, -1, s0
-// CHECK: [0x00,0x00,0xd6,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_nle_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x18,0x7c]
 
-v_mul_lo_i32 v0, 0.5, s0
-// CHECK: [0x00,0x00,0xd6,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_nle_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x18,0x7c]
 
-v_mul_lo_i32 v0, -4.0, s0
-// CHECK: [0x00,0x00,0xd6,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_nle_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x19,0x7c]
 
-v_mul_lo_i32 v0, v0, s0
-// CHECK: [0x00,0x00,0xd6,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_lo_i32 v0, v255, s0
-// CHECK: [0x00,0x00,0xd6,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_nle_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x18,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_lo_i32 v0, s0, 0
-// CHECK: [0x00,0x00,0xd6,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_nle_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x18,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_lo_i32 v0, s0, -1
-// CHECK: [0x00,0x00,0xd6,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_nle_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x18,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_lo_i32 v0, s0, 0.5
-// CHECK: [0x00,0x00,0xd6,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_nle_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x18,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_lo_i32 v0, s0, -4.0
-// CHECK: [0x00,0x00,0xd6,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_nle_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x18,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_lo_i32 v0, s0, v0
-// CHECK: [0x00,0x00,0xd6,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_nle_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x18,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_lo_i32 v0, s0, v255
-// CHECK: [0x00,0x00,0xd6,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_nle_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x18,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_hi_i32 v0, s0, s0
-// CHECK: [0x00,0x00,0xd8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x18,0xd0,0xf0,0x04,0x00,0x00]
 
-v_mul_hi_i32 v255, s0, s0
-// CHECK: [0xff,0x00,0xd8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x18,0xd0,0x01,0x05,0x00,0x00]
 
-v_mul_hi_i32 v0, 0, s0
-// CHECK: [0x00,0x00,0xd8,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x18,0xd0,0xff,0x05,0x00,0x00]
 
-v_mul_hi_i32 v0, -1, s0
-// CHECK: [0x00,0x00,0xd8,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0xce,0x00,0x00]
 
-v_mul_hi_i32 v0, 0.5, s0
-// CHECK: [0x00,0x00,0xd8,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0xd0,0x00,0x00]
 
-v_mul_hi_i32 v0, -4.0, s0
-// CHECK: [0x00,0x00,0xd8,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0xd2,0x00,0x00]
 
-v_mul_hi_i32 v0, v0, s0
-// CHECK: [0x00,0x00,0xd8,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0xd4,0x00,0x00]
 
-v_mul_hi_i32 v0, v255, s0
-// CHECK: [0x00,0x00,0xd8,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0xd6,0x00,0x00]
 
-v_mul_hi_i32 v0, s0, 0
-// CHECK: [0x00,0x00,0xd8,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0xd8,0x00,0x00]
 
-v_mul_hi_i32 v0, s0, -1
-// CHECK: [0x00,0x00,0xd8,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0xda,0x00,0x00]
 
-v_mul_hi_i32 v0, s0, 0.5
-// CHECK: [0x00,0x00,0xd8,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0xdc,0x00,0x00]
 
-v_mul_hi_i32 v0, s0, -4.0
-// CHECK: [0x00,0x00,0xd8,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0xde,0x00,0x00]
 
-v_mul_hi_i32 v0, s0, v0
-// CHECK: [0x00,0x00,0xd8,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0xf6,0x00,0x00]
 
-v_mul_hi_i32 v0, s0, v255
-// CHECK: [0x00,0x00,0xd8,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0xf8,0x00,0x00]
 
-v_div_scale_f32 v0, vcc, s0, s0, s0
-// CHECK: [0x00,0x6a,0xda,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0xfc,0x00,0x00]
 
-v_div_scale_f32 v255, vcc, s0, s0, s0
-// CHECK: [0xff,0x6a,0xda,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0xfe,0x00,0x00]
 
-v_div_scale_f32 v0, vcc, 0, s0, s0
-// CHECK: [0x00,0x6a,0xda,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0x00,0x01,0x00]
 
-v_div_scale_f32 v0, vcc, 0.5, s0, s0
-// CHECK: [0x00,0x6a,0xda,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0xe0,0x01,0x00]
 
-v_div_scale_f32 v0, vcc, v0, s0, s0
-// CHECK: [0x00,0x6a,0xda,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0xfa,0x01,0x00]
 
-v_div_scale_f32 v0, vcc, v255, s0, s0
-// CHECK: [0x00,0x6a,0xda,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0x04,0x02,0x00]
 
-v_div_scale_f32 v0, vcc, s0, 0, s0
-// CHECK: [0x00,0x6a,0xda,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0xfe,0x03,0x00]
 
-v_div_scale_f32 v0, vcc, s0, 0.5, s0
-// CHECK: [0x00,0x6a,0xda,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x18,0xd0,0x80,0x04,0x00,0x40]
 
-v_div_scale_f32 v0, vcc, s0, v0, s0
-// CHECK: [0x00,0x6a,0xda,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_neq_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x1a,0x7c]
 
-v_div_scale_f32 v0, vcc, s0, v255, s0
-// CHECK: [0x00,0x6a,0xda,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_neq_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x1a,0x7c]
 
-v_div_scale_f32 v0, vcc, s0, s0, 0
-// CHECK: [0x00,0x6a,0xda,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_neq_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x1a,0x7c]
 
-v_div_scale_f32 v0, vcc, s0, s0, 0.5
-// CHECK: [0x00,0x6a,0xda,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_neq_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x1a,0x7c]
 
-v_div_scale_f32 v0, vcc, s0, s0, v0
-// CHECK: [0x00,0x6a,0xda,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_neq_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x1a,0x7c]
 
-v_div_scale_f32 v0, vcc, s0, s0, v255
-// CHECK: [0x00,0x6a,0xda,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_neq_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x1a,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x6a,0xdc,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x1a,0x7c]
 
-v_div_scale_f64 v[254:255], vcc, s[0:1], s[0:1], s[0:1]
-// CHECK: [0xfe,0x6a,0xdc,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x1a,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, 0, s[0:1], s[0:1]
-// CHECK: [0x00,0x6a,0xdc,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_neq_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x1a,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, 0.5, s[0:1], s[0:1]
-// CHECK: [0x00,0x6a,0xdc,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_neq_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x1a,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, v[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x6a,0xdc,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_neq_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x1a,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, v[254:255], s[0:1], s[0:1]
-// CHECK: [0x00,0x6a,0xdc,0xd2,0xfe,0x01,0x00,0x00]
+v_cmp_neq_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x1a,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x6a,0xdc,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_neq_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x1a,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x6a,0xdc,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_neq_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x1a,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x6a,0xdc,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_neq_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x1a,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x6a,0xdc,0xd2,0x00,0xfc,0x03,0x00]
+v_cmp_neq_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x1a,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, s[0:1], s[0:1], 0
-// CHECK: [0x00,0x6a,0xdc,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_neq_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x1a,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x6a,0xdc,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_neq_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x1a,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x6a,0xdc,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_neq_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x1a,0x7c,0x56,0x34,0x12,0xaf]
 
-v_div_scale_f64 v[0:1], vcc, s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x6a,0xdc,0xd2,0x00,0x00,0xf8,0x07]
+v_cmp_neq_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x1a,0x7c,0x73,0x72,0x71,0x3f]
 
-v_div_fmas_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xde,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x1a,0x7c]
 
-v_div_fmas_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xde,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x1a,0x7c]
 
-v_div_fmas_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xde,0xd2,0xfd,0x00,0x00,0x00]
+v_cmp_neq_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x1b,0x7c]
 
-v_div_fmas_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xde,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fmas_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xde,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_neq_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x1a,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xde,0xd2,0x00,0xfa,0x01,0x00]
+v_cmp_neq_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x1a,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xde,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_neq_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x1a,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xde,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_neq_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x1a,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xde,0xd2,0x00,0x00,0xf4,0x03]
+v_cmp_neq_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x1a,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xde,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_neq_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x1a,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xde,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_neq_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x1a,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fmas_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xde,0xd2,0x00,0x00,0x00,0x20]
+v_cmp_neq_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x1a,0xd0,0xf0,0x04,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xde,0xd2,0x00,0x00,0x00,0x40]
+v_cmp_neq_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x01,0x05,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xde,0xd2,0x00,0x00,0x00,0x80]
+v_cmp_neq_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x1a,0xd0,0xff,0x05,0x00,0x00]
 
-v_div_fmas_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xde,0xd2,0x00,0x00,0x00,0xe0]
+v_cmp_neq_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0xce,0x00,0x00]
 
-v_div_fmas_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xde,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0xd0,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xde,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0xd2,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xde,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0xd4,0x00,0x00]
 
-v_div_fmas_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xde,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0xd6,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x08,0xde,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0xd8,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0xde,0xd2,0x00,0x00,0x00,0x08]
+v_cmp_neq_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0xda,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0xde,0xd2,0x00,0x00,0x00,0x10]
+v_cmp_neq_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0xdc,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0xde,0xd2,0x00,0x00,0x00,0x18]
+v_cmp_neq_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0xde,0x00,0x00]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0xf6,0x00,0x00]
 
-v_div_fmas_f64 v[254:255], s[0:1], s[0:1], s[0:1]
-// CHECK: [0xfe,0x00,0xe0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0xf8,0x00,0x00]
 
-v_div_fmas_f64 v[0:1], scc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd2,0xfd,0x00,0x00,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0xfc,0x00,0x00]
 
-v_div_fmas_f64 v[0:1], v[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0xfe,0x00,0x00]
 
-v_div_fmas_f64 v[0:1], v[254:255], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd2,0xfe,0x01,0x00,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0x00,0x01,0x00]
 
-v_div_fmas_f64 v[0:1], s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd2,0x00,0xfa,0x01,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0xe0,0x01,0x00]
 
-v_div_fmas_f64 v[0:1], s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0xfa,0x01,0x00]
 
-v_div_fmas_f64 v[0:1], s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd2,0x00,0xfc,0x03,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0x04,0x02,0x00]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xe0,0xd2,0x00,0x00,0xf4,0x03]
+v_cmp_neq_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0xfe,0x03,0x00]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_neq_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x1a,0xd0,0x80,0x04,0x00,0x40]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe0,0xd2,0x00,0x00,0xf8,0x07]
+v_cmp_nlt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x1c,0x7c]
 
-v_div_fmas_f64 v[0:1], -s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd2,0x00,0x00,0x00,0x20]
+v_cmp_nlt_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x1c,0x7c]
 
-v_div_fmas_f64 v[0:1], s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd2,0x00,0x00,0x00,0x40]
+v_cmp_nlt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x1c,0x7c]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd2,0x00,0x00,0x00,0x80]
+v_cmp_nlt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x1c,0x7c]
 
-v_div_fmas_f64 v[0:1], -s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd2,0x00,0x00,0x00,0xe0]
+v_cmp_nlt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x1c,0x7c]
 
-v_div_fmas_f64 v[0:1], |s[0:1]|, s[0:1], s[0:1]
-// CHECK: [0x00,0x01,0xe0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x1c,0x7c]
 
-v_div_fmas_f64 v[0:1], s[0:1], |s[0:1]|, s[0:1]
-// CHECK: [0x00,0x02,0xe0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x1c,0x7c]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], |s[0:1]|
-// CHECK: [0x00,0x04,0xe0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x1c,0x7c]
 
-v_div_fmas_f64 v[0:1], |s[0:1]|, |s[0:1]|, |s[0:1]|
-// CHECK: [0x00,0x07,0xe0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x1c,0x7c]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x08,0xe0,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x1c,0x7c]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0xe0,0xd2,0x00,0x00,0x00,0x08]
+v_cmp_nlt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x1c,0x7c]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0xe0,0xd2,0x00,0x00,0x00,0x10]
+v_cmp_nlt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x1c,0x7c]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0xe0,0xd2,0x00,0x00,0x00,0x18]
+v_cmp_nlt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x1c,0x7c]
 
-v_msad_u8 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xe2,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x1c,0x7c]
 
-v_msad_u8 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xe2,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x1c,0x7c]
 
-v_msad_u8 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xe2,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x1c,0x7c]
 
-v_msad_u8 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xe2,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x1c,0x7c]
 
-v_msad_u8 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xe2,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x1c,0x7c]
 
-v_msad_u8 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xe2,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x1c,0x7c,0x56,0x34,0x12,0xaf]
 
-v_msad_u8 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xe2,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_nlt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x1c,0x7c,0x73,0x72,0x71,0x3f]
 
-v_msad_u8 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xe2,0xd2,0xff,0x01,0x00,0x00]
+v_cmp_nlt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x1c,0x7c]
 
-v_msad_u8 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xe2,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_nlt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x1c,0x7c]
 
-v_msad_u8 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xe2,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_nlt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x1d,0x7c]
 
-v_msad_u8 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xe2,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_nlt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0x04,0x00,0x00]
 
-v_msad_u8 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xe2,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_nlt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x1c,0xd0,0x80,0x04,0x00,0x00]
 
-v_msad_u8 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xe2,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_nlt_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x1c,0xd0,0x80,0x04,0x00,0x00]
 
-v_msad_u8 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xe2,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_nlt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x1c,0xd0,0x80,0x04,0x00,0x00]
 
-v_msad_u8 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xe2,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_nlt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x1c,0xd0,0x80,0x04,0x00,0x00]
 
-v_msad_u8 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xe2,0xd2,0x00,0x00,0x04,0x03]
+v_cmp_nlt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x1c,0xd0,0x80,0x04,0x00,0x00]
 
-v_msad_u8 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xe2,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_nlt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x1c,0xd0,0x80,0x04,0x00,0x00]
 
-v_msad_u8 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xe2,0xd2,0x00,0x00,0xdc,0x03]
+v_cmp_nlt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x1c,0xd0,0x80,0x04,0x00,0x00]
 
-v_msad_u8 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xe2,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_nlt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x1c,0xd0,0xf0,0x04,0x00,0x00]
 
-v_msad_u8 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xe2,0xd2,0x00,0x00,0xfc,0x07]
+v_cmp_nlt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x01,0x05,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], s0, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x1c,0xd0,0xff,0x05,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[254:255], s[0:1], s0, s[0:1]
-// CHECK: [0xfe,0x00,0xe4,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0xce,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], 0, s0, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_nlt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0xd0,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], -1, s0, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_nlt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0xd2,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], 0.5, s0, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_nlt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0xd4,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], -4.0, s0, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_nlt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0xd6,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], v[0:1], s0, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_nlt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0xd8,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], v[254:255], s0, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd2,0xfe,0x01,0x00,0x00]
+v_cmp_nlt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0xda,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_nlt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0xdc,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_nlt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0xde,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_nlt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0xf6,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_nlt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0xf8,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], v0, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_nlt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0xfc,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], v255, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_nlt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0xfe,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xe4,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_nlt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0x00,0x01,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xe4,0xd2,0x00,0x00,0x04,0x03]
+v_cmp_nlt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0xe0,0x01,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xe4,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_nlt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0xfa,0x01,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xe4,0xd2,0x00,0x00,0xdc,0x03]
+v_cmp_nlt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0x04,0x02,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], s0, v[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_nlt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0xfe,0x03,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], s0, v[254:255]
-// CHECK: [0x00,0x00,0xe4,0xd2,0x00,0x00,0xf8,0x07]
+v_cmp_nlt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x1c,0xd0,0x80,0x04,0x00,0x40]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], s0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x1e,0x7c]
 
-v_mqsad_pk_u16_u8 v[254:255], s[0:1], s0, s[0:1]
-// CHECK: [0xfe,0x00,0xe6,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x1e,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], 0, s0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd2,0x80,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x1e,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], -1, s0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd2,0xc1,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x1e,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], 0.5, s0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd2,0xf0,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x1e,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], -4.0, s0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd2,0xf7,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x1e,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], v[0:1], s0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_tru_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x1e,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], v[254:255], s0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd2,0xfe,0x01,0x00,0x00]
+v_cmp_tru_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x1e,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_tru_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x1e,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_tru_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x1e,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_tru_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x1e,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_tru_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x1e,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], v0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_tru_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x1e,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], v255, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_tru_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x1e,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xe6,0xd2,0x00,0x00,0x00,0x02]
+v_cmp_tru_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x1e,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xe6,0xd2,0x00,0x00,0x04,0x03]
+v_cmp_tru_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x1e,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xe6,0xd2,0x00,0x00,0xc0,0x03]
+v_cmp_tru_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x1e,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xe6,0xd2,0x00,0x00,0xdc,0x03]
+v_cmp_tru_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x1e,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], s0, v[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd2,0x00,0x00,0x00,0x04]
+v_cmp_tru_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x1e,0x7c,0x56,0x34,0x12,0xaf]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], s0, v[254:255]
-// CHECK: [0x00,0x00,0xe6,0xd2,0x00,0x00,0xf8,0x07]
+v_cmp_tru_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x1e,0x7c,0x73,0x72,0x71,0x3f]
 
-v_trig_preop_f64 v[0:1], s[0:1], s0
-// CHECK: [0x00,0x00,0xe8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x1e,0x7c]
 
-v_trig_preop_f64 v[254:255], s[0:1], s0
-// CHECK: [0xfe,0x00,0xe8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x1e,0x7c]
 
-v_trig_preop_f64 v[0:1], scc, s0
-// CHECK: [0x00,0x00,0xe8,0xd2,0xfd,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x1f,0x7c]
 
-v_trig_preop_f64 v[0:1], v[0:1], s0
-// CHECK: [0x00,0x00,0xe8,0xd2,0x00,0x01,0x00,0x00]
+v_cmp_tru_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0x04,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], v[254:255], s0
-// CHECK: [0x00,0x00,0xe8,0xd2,0xfe,0x01,0x00,0x00]
+v_cmp_tru_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x1e,0xd0,0x80,0x04,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe8,0xd2,0x00,0x00,0x01,0x00]
+v_cmp_tru_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x1e,0xd0,0x80,0x04,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xe8,0xd2,0x00,0x82,0x01,0x00]
+v_cmp_tru_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x1e,0xd0,0x80,0x04,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe8,0xd2,0x00,0xe0,0x01,0x00]
+v_cmp_tru_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x1e,0xd0,0x80,0x04,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xe8,0xd2,0x00,0xee,0x01,0x00]
+v_cmp_tru_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x1e,0xd0,0x80,0x04,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xe8,0xd2,0x00,0xfa,0x01,0x00]
+v_cmp_tru_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x1e,0xd0,0x80,0x04,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], v0
-// CHECK: [0x00,0x00,0xe8,0xd2,0x00,0x00,0x02,0x00]
+v_cmp_tru_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x1e,0xd0,0x80,0x04,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], v255
-// CHECK: [0x00,0x00,0xe8,0xd2,0x00,0xfe,0x03,0x00]
+v_cmp_tru_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x1e,0xd0,0xf0,0x04,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], -s[0:1], s0
-// CHECK: [0x00,0x00,0xe8,0xd2,0x00,0x00,0x00,0x20]
+v_cmp_tru_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x01,0x05,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], |s[0:1]|, s0
-// CHECK: [0x00,0x01,0xe8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x1e,0xd0,0xff,0x05,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], s0 clamp
-// CHECK: [0x00,0x08,0xe8,0xd2,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0xce,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], s0 mul:2
-// CHECK: [0x00,0x00,0xe8,0xd2,0x00,0x00,0x00,0x08]
+v_cmp_tru_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0xd0,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], s0 mul:4
-// CHECK: [0x00,0x00,0xe8,0xd2,0x00,0x00,0x00,0x10]
+v_cmp_tru_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0xd2,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], s0 div:2
-// CHECK: [0x00,0x00,0xe8,0xd2,0x00,0x00,0x00,0x18]
+v_cmp_tru_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_f_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x00,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_f_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x00,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_f_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_f_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_f_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_f_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_f_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_f_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_f_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_f_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_f_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_f_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_f_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_f_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_f_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x00,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x1e,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_f_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x7c]
+v_cmpx_f_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x20,0x7c]
 
-v_cmp_f_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x7c]
+v_cmpx_f_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x20,0x7c]
 
-v_cmp_f_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x7c]
+v_cmpx_f_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x20,0x7c]
 
-v_cmp_f_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_f_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x20,0x7c]
 
-v_cmp_f_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_f_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x20,0x7c]
 
-v_cmp_f_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x00,0x7c]
+v_cmpx_f_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x20,0x7c]
 
-v_cmp_f_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x00,0x7c]
+v_cmpx_f_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x20,0x7c]
 
-v_cmp_f_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x7c]
+v_cmpx_f_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x20,0x7c]
 
-v_cmp_f_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x00,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x20,0x7c]
 
-v_cmp_f_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x00,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x20,0x7c]
 
-v_cmp_f_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x00,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x20,0x7c]
 
-v_cmp_f_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x00,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x20,0x7c]
 
-v_cmp_f_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x00,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x20,0x7c]
 
-v_cmp_f_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x00,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x20,0x7c]
 
-v_cmp_f_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x00,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x20,0x7c]
 
-v_cmp_f_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x00,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x20,0x7c]
 
-v_cmp_f_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x00,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_f_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x20,0x7c]
 
-v_cmp_f_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x00,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_f_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x20,0x7c]
 
-v_cmp_f_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x00,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_f_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x20,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_f_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x00,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_f_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x20,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_f_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x00,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_f_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x20,0x7c]
 
-v_cmp_f_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x00,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_f_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x20,0x7c]
 
-v_cmp_f_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x00,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_f_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x21,0x7c]
 
-v_cmp_f_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x00,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_f_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x00,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_f_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x20,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x00,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_f_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x20,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x00,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_f_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x20,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x00,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_f_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x20,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x00,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_f_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x20,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x02,0x7c]
+v_cmpx_f_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x20,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x02,0x7c]
+v_cmpx_f_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x20,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x02,0x7c]
+v_cmpx_f_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x20,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_lt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x02,0x7c]
+v_cmpx_f_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x20,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_lt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x02,0x7c]
+v_cmpx_f_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x20,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_lt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x02,0x7c]
+v_cmpx_f_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_lt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x02,0x7c]
+v_cmpx_f_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmp_lt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x02,0x7c]
+v_cmpx_f_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmp_lt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x02,0x7c]
+v_cmpx_f_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_lt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x02,0x7c]
+v_cmpx_f_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_lt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x02,0x7c]
+v_cmpx_f_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_lt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x02,0x7c]
+v_cmpx_f_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_lt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x02,0x7c]
+v_cmpx_f_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_lt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x02,0x7c]
+v_cmpx_f_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_lt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x02,0x7c]
+v_cmpx_f_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_lt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x02,0x7c]
+v_cmpx_f_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_lt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x02,0x7c]
+v_cmpx_f_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_lt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x02,0x7c]
+v_cmpx_f_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_lt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x02,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_f_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_lt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x02,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_f_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_lt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x02,0x7c]
+v_cmpx_f_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_lt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x02,0x7c]
+v_cmpx_f_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_lt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x03,0x7c]
+v_cmpx_f_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_lt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x02,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_lt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x02,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x22,0x7c]
 
-v_cmp_lt_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x02,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x22,0x7c]
 
-v_cmp_lt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x02,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x22,0x7c]
 
-v_cmp_lt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x02,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x22,0x7c]
 
-v_cmp_lt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x02,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x22,0x7c]
 
-v_cmp_lt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x02,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x22,0x7c]
 
-v_cmp_lt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x02,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x22,0x7c]
 
-v_cmp_lt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x02,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_lt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x22,0x7c]
 
-v_cmp_lt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x02,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_lt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x22,0x7c]
 
-v_cmp_lt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x02,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_lt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x22,0x7c]
 
-v_cmp_lt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x02,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_lt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x22,0x7c]
 
-v_cmp_lt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x02,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_lt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x22,0x7c]
 
-v_cmp_lt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x02,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_lt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x22,0x7c]
 
-v_cmp_lt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x02,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_lt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x22,0x7c]
 
-v_cmp_lt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x02,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_lt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x22,0x7c]
 
-v_cmp_lt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x02,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_lt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x22,0x7c]
 
-v_cmp_lt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x02,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_lt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x22,0x7c]
 
-v_cmp_lt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x02,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_lt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x22,0x7c]
 
-v_cmp_lt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x02,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_lt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x22,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_lt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x02,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_lt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x22,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_eq_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x04,0x7c]
+v_cmpx_lt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x22,0x7c]
 
-v_cmp_eq_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x04,0x7c]
+v_cmpx_lt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x22,0x7c]
 
-v_cmp_eq_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x04,0x7c]
+v_cmpx_lt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x23,0x7c]
 
-v_cmp_eq_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x04,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x04,0x7c]
+v_cmpx_lt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x22,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x04,0x7c]
+v_cmpx_lt_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x22,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x04,0x7c]
+v_cmpx_lt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x22,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x04,0x7c]
+v_cmpx_lt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x22,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x04,0x7c]
+v_cmpx_lt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x22,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x04,0x7c]
+v_cmpx_lt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x22,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x04,0x7c]
+v_cmpx_lt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x22,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x04,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x22,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_eq_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x04,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x22,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_eq_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x04,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x22,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_eq_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x04,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_eq_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x04,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmp_eq_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x04,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmp_eq_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x04,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_eq_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x04,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_lt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_eq_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x04,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_lt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_eq_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x04,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_eq_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x04,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_eq_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x05,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_eq_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x04,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_eq_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x04,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_eq_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x04,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_eq_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x04,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_eq_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x04,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_eq_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x04,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_eq_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x04,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_eq_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x04,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_eq_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x04,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_eq_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x04,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_eq_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x04,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_eq_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x24,0x7c]
 
-v_cmp_eq_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x04,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_eq_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x24,0x7c]
 
-v_cmp_eq_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x04,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_eq_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x24,0x7c]
 
-v_cmp_eq_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x04,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_eq_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x24,0x7c]
 
-v_cmp_eq_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x04,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_eq_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x24,0x7c]
 
-v_cmp_eq_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x04,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_eq_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x24,0x7c]
 
-v_cmp_eq_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x04,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_eq_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x24,0x7c]
 
-v_cmp_eq_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x04,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_eq_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x24,0x7c]
 
-v_cmp_eq_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x04,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_eq_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x24,0x7c]
 
-v_cmp_eq_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x04,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_eq_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x24,0x7c]
 
-v_cmp_eq_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x04,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_eq_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x24,0x7c]
 
-v_cmp_le_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x06,0x7c]
+v_cmpx_eq_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x24,0x7c]
 
-v_cmp_le_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x06,0x7c]
+v_cmpx_eq_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x24,0x7c]
 
-v_cmp_le_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x06,0x7c]
+v_cmpx_eq_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x24,0x7c]
 
-v_cmp_le_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x06,0x7c]
+v_cmpx_eq_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x24,0x7c]
 
-v_cmp_le_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x06,0x7c]
+v_cmpx_eq_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x24,0x7c]
 
-v_cmp_le_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x06,0x7c]
+v_cmpx_eq_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x24,0x7c]
 
-v_cmp_le_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x06,0x7c]
+v_cmpx_eq_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x24,0x7c]
 
-v_cmp_le_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x06,0x7c]
+v_cmpx_eq_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x24,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_le_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x06,0x7c]
+v_cmpx_eq_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x24,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_le_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x06,0x7c]
+v_cmpx_eq_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x24,0x7c]
 
-v_cmp_le_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x06,0x7c]
+v_cmpx_eq_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x24,0x7c]
 
-v_cmp_le_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x06,0x7c]
+v_cmpx_eq_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x25,0x7c]
 
-v_cmp_le_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x06,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x06,0x7c]
+v_cmpx_eq_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x24,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x06,0x7c]
+v_cmpx_eq_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x24,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x06,0x7c]
+v_cmpx_eq_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x24,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x06,0x7c]
+v_cmpx_eq_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x24,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x06,0x7c]
+v_cmpx_eq_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x24,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x06,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_eq_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x24,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x06,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_eq_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x24,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x06,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x24,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_le_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x06,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x24,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_le_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x07,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x24,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_le_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x06,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_le_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x06,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmp_le_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x06,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmp_le_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x06,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_le_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x06,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_le_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x06,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_le_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x06,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_le_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x06,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_le_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x06,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_le_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x06,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_le_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x06,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_le_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x06,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_le_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x06,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_le_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x06,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_le_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x06,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_le_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x06,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_le_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x06,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_le_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x06,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_le_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x06,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_eq_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_le_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x06,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_le_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x26,0x7c]
 
-v_cmp_le_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x06,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_le_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x26,0x7c]
 
-v_cmp_gt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x08,0x7c]
+v_cmpx_le_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x26,0x7c]
 
-v_cmp_gt_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x08,0x7c]
+v_cmpx_le_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x26,0x7c]
 
-v_cmp_gt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x08,0x7c]
+v_cmpx_le_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x26,0x7c]
 
-v_cmp_gt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x08,0x7c]
+v_cmpx_le_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x26,0x7c]
 
-v_cmp_gt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x08,0x7c]
+v_cmpx_le_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x26,0x7c]
 
-v_cmp_gt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x08,0x7c]
+v_cmpx_le_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x26,0x7c]
 
-v_cmp_gt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x08,0x7c]
+v_cmpx_le_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x26,0x7c]
 
-v_cmp_gt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x08,0x7c]
+v_cmpx_le_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x26,0x7c]
 
-v_cmp_gt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x08,0x7c]
+v_cmpx_le_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x26,0x7c]
 
-v_cmp_gt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x08,0x7c]
+v_cmpx_le_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x26,0x7c]
 
-v_cmp_gt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x08,0x7c]
+v_cmpx_le_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x26,0x7c]
 
-v_cmp_gt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x08,0x7c]
+v_cmpx_le_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x26,0x7c]
 
-v_cmp_gt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x08,0x7c]
+v_cmpx_le_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x26,0x7c]
 
-v_cmp_gt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x08,0x7c]
+v_cmpx_le_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x26,0x7c]
 
-v_cmp_gt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x08,0x7c]
+v_cmpx_le_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x26,0x7c]
 
-v_cmp_gt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x08,0x7c]
+v_cmpx_le_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x26,0x7c]
 
-v_cmp_gt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x08,0x7c]
+v_cmpx_le_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x26,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_gt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x08,0x7c]
+v_cmpx_le_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x26,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_gt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x08,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_le_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x26,0x7c]
 
-v_cmp_gt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x08,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_le_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x26,0x7c]
 
-v_cmp_gt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x08,0x7c]
+v_cmpx_le_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x27,0x7c]
 
-v_cmp_gt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x08,0x7c]
+v_cmpx_le_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x09,0x7c]
+v_cmpx_le_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x26,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x08,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x26,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x08,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x26,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x08,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x26,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x08,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x26,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x08,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x26,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x08,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x26,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x08,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x26,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_gt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x08,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x26,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_gt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x08,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x26,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_gt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x08,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_gt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x08,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmp_gt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x08,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmp_gt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x08,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_gt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x08,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_gt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x08,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_gt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x08,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_gt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x08,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_gt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x08,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_gt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x08,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_le_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_gt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x08,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_le_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_gt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x08,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_le_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_lg_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x0a,0x7c]
+v_cmpx_le_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_lg_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x0a,0x7c]
+v_cmpx_le_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_lg_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x0a,0x7c]
+v_cmpx_le_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_lg_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x0a,0x7c]
+v_cmpx_le_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_lg_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x0a,0x7c]
+v_cmpx_le_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_lg_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x0a,0x7c]
+v_cmpx_le_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_lg_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x0a,0x7c]
+v_cmpx_le_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_lg_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x0a,0x7c]
+v_cmpx_gt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x28,0x7c]
 
-v_cmp_lg_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x0a,0x7c]
+v_cmpx_gt_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x28,0x7c]
 
-v_cmp_lg_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x0a,0x7c]
+v_cmpx_gt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x28,0x7c]
 
-v_cmp_lg_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x0a,0x7c]
+v_cmpx_gt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x28,0x7c]
 
-v_cmp_lg_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x0a,0x7c]
+v_cmpx_gt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x28,0x7c]
 
-v_cmp_lg_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x0a,0x7c]
+v_cmpx_gt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x28,0x7c]
 
-v_cmp_lg_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x0a,0x7c]
+v_cmpx_gt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x28,0x7c]
 
-v_cmp_lg_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x0a,0x7c]
+v_cmpx_gt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x28,0x7c]
 
-v_cmp_lg_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x0a,0x7c]
+v_cmpx_gt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x28,0x7c]
 
-v_cmp_lg_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x0a,0x7c]
+v_cmpx_gt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x28,0x7c]
 
-v_cmp_lg_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x0a,0x7c]
+v_cmpx_gt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x28,0x7c]
 
-v_cmp_lg_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x0a,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_gt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x28,0x7c]
 
-v_cmp_lg_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x0a,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_gt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x28,0x7c]
 
-v_cmp_lg_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x0a,0x7c]
+v_cmpx_gt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x28,0x7c]
 
-v_cmp_lg_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x0a,0x7c]
+v_cmpx_gt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x28,0x7c]
 
-v_cmp_lg_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x0b,0x7c]
+v_cmpx_gt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x28,0x7c]
 
-v_cmp_lg_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x0a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x28,0x7c]
 
-v_cmp_lg_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x0a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x28,0x7c]
 
-v_cmp_lg_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x0a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x28,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_lg_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x0a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x28,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_lg_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x0a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x28,0x7c]
 
-v_cmp_lg_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x0a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x28,0x7c]
 
-v_cmp_lg_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x0a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x29,0x7c]
 
-v_cmp_lg_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x0a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x0a,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_gt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x28,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x0a,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_gt_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x28,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x0a,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_gt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x28,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x0a,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_gt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x28,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x0a,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_gt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x28,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x0a,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_gt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x28,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x0a,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_gt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x28,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x0a,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_gt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x28,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_lg_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x0a,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_gt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x28,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_lg_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x0a,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_gt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x28,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_lg_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x0a,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_gt_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_lg_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x0a,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_gt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmp_lg_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x0a,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_gt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmp_ge_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x0c,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_ge_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x0c,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_ge_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x0c,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_ge_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x0c,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_ge_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x0c,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_ge_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x0c,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_ge_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x0c,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_ge_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x0c,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_ge_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x0c,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_ge_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x0c,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_ge_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x0c,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_ge_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x0c,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_ge_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x0c,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_ge_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x0c,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_ge_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x0c,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_ge_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x0c,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_ge_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x0c,0x7c]
+v_cmpx_lg_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x2a,0x7c]
 
-v_cmp_ge_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x0c,0x7c]
+v_cmpx_lg_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x2a,0x7c]
 
-v_cmp_ge_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x0c,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_lg_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x2a,0x7c]
 
-v_cmp_ge_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x0c,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_lg_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x2a,0x7c]
 
-v_cmp_ge_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x0c,0x7c]
+v_cmpx_lg_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x2a,0x7c]
 
-v_cmp_ge_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x0c,0x7c]
+v_cmpx_lg_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x2a,0x7c]
 
-v_cmp_ge_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x0d,0x7c]
+v_cmpx_lg_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x2a,0x7c]
 
-v_cmp_ge_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x0c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x2a,0x7c]
 
-v_cmp_ge_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x0c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x2a,0x7c]
 
-v_cmp_ge_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x0c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x2a,0x7c]
 
-v_cmp_ge_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x0c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x2a,0x7c]
 
-v_cmp_ge_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x0c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x2a,0x7c]
 
-v_cmp_ge_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x0c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x2a,0x7c]
 
-v_cmp_ge_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x0c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x2a,0x7c]
 
-v_cmp_ge_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x0c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x2a,0x7c]
 
-v_cmp_ge_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x0c,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_lg_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x2a,0x7c]
 
-v_cmp_ge_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x0c,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_lg_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x2a,0x7c]
 
-v_cmp_ge_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x0c,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_lg_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x2a,0x7c]
 
-v_cmp_ge_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x0c,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_lg_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x2a,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_ge_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x0c,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_lg_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x2a,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_ge_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x0c,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_lg_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x2a,0x7c]
 
-v_cmp_ge_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x0c,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_lg_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x2a,0x7c]
 
-v_cmp_ge_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x0c,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_lg_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x2b,0x7c]
 
-v_cmp_ge_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x0c,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_lg_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x0c,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_lg_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x2a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x0c,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_lg_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x2a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x0c,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_lg_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x2a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x0c,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_lg_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x2a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x0e,0x7c]
+v_cmpx_lg_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x2a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x0e,0x7c]
+v_cmpx_lg_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x2a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x0e,0x7c]
+v_cmpx_lg_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x2a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x0e,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x2a,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_o_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x0e,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_o_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x0e,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x2a,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_o_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x0e,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_o_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x0e,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmp_o_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x0e,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmp_o_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x0e,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_o_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x0e,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_o_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x0e,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_o_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x0e,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_o_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x0e,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_o_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x0e,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_o_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x0e,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_o_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x0e,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_o_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x0e,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_o_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x0e,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_lg_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_o_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x0e,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_lg_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_o_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x0e,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_o_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x0e,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_o_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x0f,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_o_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x0e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_o_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x0e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_o_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x0e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x2c,0x7c]
 
-v_cmp_o_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x0e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x2c,0x7c]
 
-v_cmp_o_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x0e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x2c,0x7c]
 
-v_cmp_o_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x0e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x2c,0x7c]
 
-v_cmp_o_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x0e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x2c,0x7c]
 
-v_cmp_o_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x0e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x2c,0x7c]
 
-v_cmp_o_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x0e,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ge_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x2c,0x7c]
 
-v_cmp_o_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x0e,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_ge_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x2c,0x7c]
 
-v_cmp_o_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x0e,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_ge_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x2c,0x7c]
 
-v_cmp_o_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x0e,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ge_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x2c,0x7c]
 
-v_cmp_o_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x0e,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_ge_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x2c,0x7c]
 
-v_cmp_o_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x0e,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ge_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x2c,0x7c]
 
-v_cmp_o_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x0e,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_ge_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x2c,0x7c]
 
-v_cmp_o_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x0e,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_ge_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x2c,0x7c]
 
-v_cmp_o_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x0e,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ge_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x2c,0x7c]
 
-v_cmp_o_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x0e,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_ge_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x2c,0x7c]
 
-v_cmp_o_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x0e,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_ge_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x2c,0x7c]
 
-v_cmp_o_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x0e,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_ge_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x2c,0x7c]
 
-v_cmp_o_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x0e,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_ge_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x2c,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_u_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x10,0x7c]
+v_cmpx_ge_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x2c,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_u_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x10,0x7c]
+v_cmpx_ge_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x2c,0x7c]
 
-v_cmp_u_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x10,0x7c]
+v_cmpx_ge_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x2c,0x7c]
 
-v_cmp_u_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x10,0x7c]
+v_cmpx_ge_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x2d,0x7c]
 
-v_cmp_u_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x10,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x10,0x7c]
+v_cmpx_ge_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x2c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x10,0x7c]
+v_cmpx_ge_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x2c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x10,0x7c]
+v_cmpx_ge_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x2c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x10,0x7c]
+v_cmpx_ge_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x2c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x10,0x7c]
+v_cmpx_ge_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x2c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x10,0x7c]
+v_cmpx_ge_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x2c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x10,0x7c]
+v_cmpx_ge_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x2c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x10,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x2c,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_u_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x10,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_u_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x10,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x2c,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_u_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x10,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_u_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x10,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmp_u_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x10,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmp_u_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x10,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_ge_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_u_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x10,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_ge_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_u_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x10,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_u_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x10,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_u_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x11,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_u_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x10,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_u_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x10,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_u_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x10,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_u_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x10,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_u_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x10,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_u_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x10,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_u_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x10,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_u_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x10,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_u_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x10,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ge_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_u_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x10,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_ge_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_u_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x10,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_ge_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_u_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x10,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_o_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x2e,0x7c]
 
-v_cmp_u_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x10,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_o_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x2e,0x7c]
 
-v_cmp_u_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x10,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_o_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x2e,0x7c]
 
-v_cmp_u_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x10,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_o_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x2e,0x7c]
 
-v_cmp_u_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x10,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_o_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x2e,0x7c]
 
-v_cmp_u_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x10,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_o_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x2e,0x7c]
 
-v_cmp_u_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x10,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_o_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x2e,0x7c]
 
-v_cmp_u_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x10,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_o_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x2e,0x7c]
 
-v_cmp_u_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x10,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_o_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x2e,0x7c]
 
-v_cmp_u_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x10,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_o_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x2e,0x7c]
 
-v_cmp_nge_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x12,0x7c]
+v_cmpx_o_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x2e,0x7c]
 
-v_cmp_nge_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x12,0x7c]
+v_cmpx_o_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x2e,0x7c]
 
-v_cmp_nge_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x12,0x7c]
+v_cmpx_o_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x2e,0x7c]
 
-v_cmp_nge_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x12,0x7c]
+v_cmpx_o_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x2e,0x7c]
 
-v_cmp_nge_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x12,0x7c]
+v_cmpx_o_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x2e,0x7c]
 
-v_cmp_nge_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x12,0x7c]
+v_cmpx_o_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x2e,0x7c]
 
-v_cmp_nge_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x12,0x7c]
+v_cmpx_o_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x2e,0x7c]
 
-v_cmp_nge_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x12,0x7c]
+v_cmpx_o_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x2e,0x7c]
 
-v_cmp_nge_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x12,0x7c]
+v_cmpx_o_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x2e,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_nge_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x12,0x7c]
+v_cmpx_o_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x2e,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_nge_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x12,0x7c]
+v_cmpx_o_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x2e,0x7c]
 
-v_cmp_nge_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x12,0x7c]
+v_cmpx_o_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x2e,0x7c]
 
-v_cmp_nge_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x12,0x7c]
+v_cmpx_o_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x2f,0x7c]
 
-v_cmp_nge_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x12,0x7c]
+v_cmpx_o_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x12,0x7c]
+v_cmpx_o_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x2e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x12,0x7c]
+v_cmpx_o_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x2e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x12,0x7c]
+v_cmpx_o_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x2e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x12,0x7c]
+v_cmpx_o_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x2e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x12,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_o_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x2e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x12,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_o_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x2e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x12,0x7c]
+v_cmpx_o_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x2e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x12,0x7c]
+v_cmpx_o_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x2e,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_nge_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x13,0x7c]
+v_cmpx_o_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x12,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x2e,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_nge_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x12,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_nge_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x12,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmp_nge_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x12,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmp_nge_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x12,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_nge_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x12,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_nge_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x12,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_nge_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x12,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x12,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x12,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x12,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x12,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x12,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x12,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x12,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x12,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x12,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x12,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x12,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_o_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x12,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_o_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_nge_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x12,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_u_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x14,0x7c]
+v_cmpx_u_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x14,0x7c]
+v_cmpx_u_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x14,0x7c]
+v_cmpx_u_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x14,0x7c]
+v_cmpx_u_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x14,0x7c]
+v_cmpx_u_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x14,0x7c]
+v_cmpx_u_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x14,0x7c]
+v_cmpx_u_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x14,0x7c]
+v_cmpx_u_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x14,0x7c]
+v_cmpx_u_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x14,0x7c]
+v_cmpx_u_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x14,0x7c]
+v_cmpx_u_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x14,0x7c]
+v_cmpx_u_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x14,0x7c]
+v_cmpx_u_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x14,0x7c]
+v_cmpx_u_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x14,0x7c]
+v_cmpx_u_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x14,0x7c]
+v_cmpx_u_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x14,0x7c]
+v_cmpx_u_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x14,0x7c]
+v_cmpx_u_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x30,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_nlg_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x14,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_u_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x30,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_nlg_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x14,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_u_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x14,0x7c]
+v_cmpx_u_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x30,0x7c]
 
-v_cmp_nlg_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x14,0x7c]
+v_cmpx_u_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x31,0x7c]
 
-v_cmp_nlg_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x15,0x7c]
+v_cmpx_u_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x14,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x30,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x14,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x30,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x14,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x30,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlg_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x14,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x30,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlg_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x14,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x30,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlg_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x14,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x30,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlg_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x14,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x30,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlg_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x14,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x30,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x14,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x30,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x14,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x30,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x14,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x14,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x14,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x14,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x14,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x14,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x14,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x14,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x14,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_u_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x14,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_u_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x14,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_u_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_ngt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x16,0x7c]
+v_cmpx_u_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_ngt_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x16,0x7c]
+v_cmpx_u_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_ngt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x16,0x7c]
+v_cmpx_u_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_ngt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x16,0x7c]
+v_cmpx_u_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_ngt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x16,0x7c]
+v_cmpx_u_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_ngt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x16,0x7c]
+v_cmpx_u_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_ngt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x16,0x7c]
+v_cmpx_u_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_ngt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x16,0x7c]
+v_cmpx_u_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_ngt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x16,0x7c]
+v_cmpx_nge_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x32,0x7c]
 
-v_cmp_ngt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x16,0x7c]
+v_cmpx_nge_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x32,0x7c]
 
-v_cmp_ngt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x16,0x7c]
+v_cmpx_nge_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x32,0x7c]
 
-v_cmp_ngt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x16,0x7c]
+v_cmpx_nge_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x32,0x7c]
 
-v_cmp_ngt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x16,0x7c]
+v_cmpx_nge_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x32,0x7c]
 
-v_cmp_ngt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x16,0x7c]
+v_cmpx_nge_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x32,0x7c]
 
-v_cmp_ngt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x16,0x7c]
+v_cmpx_nge_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x32,0x7c]
 
-v_cmp_ngt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x16,0x7c]
+v_cmpx_nge_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x32,0x7c]
 
-v_cmp_ngt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x16,0x7c]
+v_cmpx_nge_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x32,0x7c]
 
-v_cmp_ngt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x16,0x7c]
+v_cmpx_nge_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x32,0x7c]
 
-v_cmp_ngt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x16,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_nge_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x32,0x7c]
 
-v_cmp_ngt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x16,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_nge_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x32,0x7c]
 
-v_cmp_ngt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x16,0x7c]
+v_cmpx_nge_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x32,0x7c]
 
-v_cmp_ngt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x16,0x7c]
+v_cmpx_nge_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x32,0x7c]
 
-v_cmp_ngt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x17,0x7c]
+v_cmpx_nge_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x32,0x7c]
 
-v_cmp_ngt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x16,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x32,0x7c]
 
-v_cmp_ngt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x16,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x32,0x7c]
 
-v_cmp_ngt_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x16,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x32,0x7c]
 
-v_cmp_ngt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x16,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x32,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_ngt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x16,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x32,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_ngt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x16,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x32,0x7c]
 
-v_cmp_ngt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x16,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x32,0x7c]
 
-v_cmp_ngt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x16,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x33,0x7c]
 
-v_cmp_ngt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x16,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_nge_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x16,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_nge_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x32,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x16,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_nge_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x32,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x16,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_nge_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x32,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x16,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_nge_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x32,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x16,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_nge_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x32,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x16,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_nge_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x32,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x16,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_nge_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x32,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x16,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_nge_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x32,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x16,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_nge_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x32,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x16,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_nge_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x32,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x16,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_nge_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x16,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_nge_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmp_nle_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x18,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmp_nle_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x18,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_nle_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x18,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_nle_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x18,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_nle_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x18,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_nle_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x18,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_nle_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x18,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_nle_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x18,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_nle_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x18,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_nle_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x18,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_nle_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x18,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_nle_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x18,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_nle_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x18,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_nle_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x18,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_nle_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x18,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_nle_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x18,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_nle_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x18,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_nle_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x18,0x7c]
+v_cmpx_nlg_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x34,0x7c]
 
-v_cmp_nle_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x18,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_nlg_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x34,0x7c]
 
-v_cmp_nle_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x18,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_nlg_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x34,0x7c]
 
-v_cmp_nle_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x18,0x7c]
+v_cmpx_nlg_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x34,0x7c]
 
-v_cmp_nle_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x18,0x7c]
+v_cmpx_nlg_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x34,0x7c]
 
-v_cmp_nle_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x19,0x7c]
+v_cmpx_nlg_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x34,0x7c]
 
-v_cmp_nle_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x18,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x34,0x7c]
 
-v_cmp_nle_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x18,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x34,0x7c]
 
-v_cmp_nle_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x18,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x34,0x7c]
 
-v_cmp_nle_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x18,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x34,0x7c]
 
-v_cmp_nle_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x18,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x34,0x7c]
 
-v_cmp_nle_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x18,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x34,0x7c]
 
-v_cmp_nle_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x18,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x34,0x7c]
 
-v_cmp_nle_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x18,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x34,0x7c]
 
-v_cmp_nle_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x18,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_nlg_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x34,0x7c]
 
-v_cmp_nle_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x18,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_nlg_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x34,0x7c]
 
-v_cmp_nle_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x18,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_nlg_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x34,0x7c]
 
-v_cmp_nle_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x18,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_nlg_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x34,0x7c]
 
-v_cmp_nle_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x18,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_nlg_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x34,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_nle_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x18,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_nlg_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x34,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_nle_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x18,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_nlg_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x34,0x7c]
 
-v_cmp_nle_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x18,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_nlg_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x34,0x7c]
 
-v_cmp_nle_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x18,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_nlg_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x35,0x7c]
 
-v_cmp_nle_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x18,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_nlg_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x18,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_nlg_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x34,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x18,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_nlg_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x34,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x18,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_nlg_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x34,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x34,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x34,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x34,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x34,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x34,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_neq_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x34,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_neq_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x34,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_neq_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_neq_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmp_neq_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmp_neq_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_neq_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_neq_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_neq_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_neq_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_neq_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_neq_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_neq_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_neq_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x1a,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_nlg_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_neq_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x1a,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_nlg_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_neq_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_neq_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x1a,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_neq_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x1b,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_neq_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x1a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_neq_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x1a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_neq_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x1a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_neq_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x1a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x36,0x7c]
 
-v_cmp_neq_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x1a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x36,0x7c]
 
-v_cmp_neq_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x1a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x36,0x7c]
 
-v_cmp_neq_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x1a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x36,0x7c]
 
-v_cmp_neq_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x1a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x36,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x1a,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ngt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x36,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x1a,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_ngt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x36,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x1a,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_ngt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x36,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x1a,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ngt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x36,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x1a,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_ngt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x36,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x1a,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ngt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x36,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x1a,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_ngt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x36,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x1a,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_ngt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x36,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x1a,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ngt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x36,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x1a,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_ngt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x36,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x1a,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_ngt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x36,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x1a,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_ngt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x36,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x1a,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_ngt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x36,0x7c]
 
-v_cmp_nlt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x1c,0x7c]
+v_cmpx_ngt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x36,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_nlt_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x1c,0x7c]
+v_cmpx_ngt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x36,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_nlt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x1c,0x7c]
+v_cmpx_ngt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x36,0x7c]
 
-v_cmp_nlt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x1c,0x7c]
+v_cmpx_ngt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x36,0x7c]
 
-v_cmp_nlt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x1c,0x7c]
+v_cmpx_ngt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x37,0x7c]
 
-v_cmp_nlt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x1c,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x1c,0x7c]
+v_cmpx_ngt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x36,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x1c,0x7c]
+v_cmpx_ngt_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x36,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x1c,0x7c]
+v_cmpx_ngt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x36,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x1c,0x7c]
+v_cmpx_ngt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x36,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x1c,0x7c]
+v_cmpx_ngt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x36,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x1c,0x7c]
+v_cmpx_ngt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x36,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x1c,0x7c]
+v_cmpx_ngt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x36,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x1c,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x36,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x1c,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x36,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x1c,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x36,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x1c,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x1c,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x1c,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_ngt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x1c,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_ngt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x1c,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x1c,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x1d,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_nlt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x1c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_nlt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x1c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_nlt_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x1c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_nlt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x1c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_nlt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x1c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_nlt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x1c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_nlt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x1c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_nlt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x1c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_nlt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x1c,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ngt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_nlt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x1c,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_ngt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_nlt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x1c,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_ngt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_nlt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x1c,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ngt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_nlt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x1c,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_nle_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x38,0x7c]
 
-v_cmp_nlt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x1c,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_nle_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x38,0x7c]
 
-v_cmp_nlt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x1c,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_nle_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x38,0x7c]
 
-v_cmp_nlt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x1c,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_nle_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x38,0x7c]
 
-v_cmp_nlt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x1c,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_nle_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x38,0x7c]
 
-v_cmp_nlt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x1c,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_nle_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x38,0x7c]
 
-v_cmp_nlt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x1c,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_nle_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x38,0x7c]
 
-v_cmp_nlt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x1c,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_nle_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x38,0x7c]
 
-v_cmp_nlt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x1c,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_nle_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x38,0x7c]
 
-v_cmp_tru_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x1e,0x7c]
+v_cmpx_nle_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x38,0x7c]
 
-v_cmp_tru_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x1e,0x7c]
+v_cmpx_nle_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x38,0x7c]
 
-v_cmp_tru_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x1e,0x7c]
+v_cmpx_nle_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x38,0x7c]
 
-v_cmp_tru_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x1e,0x7c]
+v_cmpx_nle_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x38,0x7c]
 
-v_cmp_tru_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x1e,0x7c]
+v_cmpx_nle_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x38,0x7c]
 
-v_cmp_tru_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x1e,0x7c]
+v_cmpx_nle_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x38,0x7c]
 
-v_cmp_tru_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x1e,0x7c]
+v_cmpx_nle_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x38,0x7c]
 
-v_cmp_tru_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x1e,0x7c]
+v_cmpx_nle_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x38,0x7c]
 
-v_cmp_tru_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x1e,0x7c]
+v_cmpx_nle_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x38,0x7c]
 
-v_cmp_tru_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x1e,0x7c]
+v_cmpx_nle_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x38,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_tru_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x1e,0x7c]
+v_cmpx_nle_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x38,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_tru_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x1e,0x7c]
+v_cmpx_nle_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x38,0x7c]
 
-v_cmp_tru_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x1e,0x7c]
+v_cmpx_nle_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x38,0x7c]
 
-v_cmp_tru_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x1e,0x7c]
+v_cmpx_nle_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x39,0x7c]
 
-v_cmp_tru_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x1e,0x7c]
+v_cmpx_nle_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_tru_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x1e,0x7c]
+v_cmpx_nle_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x38,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_tru_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x1e,0x7c]
+v_cmpx_nle_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x38,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_tru_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x1e,0x7c]
+v_cmpx_nle_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x38,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_tru_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x1e,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_nle_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x38,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_tru_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x1e,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_nle_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x38,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_tru_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x1e,0x7c]
+v_cmpx_nle_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x38,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_tru_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x1e,0x7c]
+v_cmpx_nle_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x38,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_tru_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x1f,0x7c]
+v_cmpx_nle_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x38,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x1e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x38,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x1e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x38,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x1e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_tru_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x1e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmp_tru_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x1e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmp_tru_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x1e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_tru_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x1e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_tru_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x1e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x1e,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x1e,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x1e,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x1e,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x1e,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x1e,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x1e,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x1e,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x1e,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x1e,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x1e,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_nle_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x1e,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_nle_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x1e,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_nle_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_f_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x20,0x7c]
+v_cmpx_neq_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x20,0x7c]
+v_cmpx_neq_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x20,0x7c]
+v_cmpx_neq_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x20,0x7c]
+v_cmpx_neq_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x20,0x7c]
+v_cmpx_neq_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x20,0x7c]
+v_cmpx_neq_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x20,0x7c]
+v_cmpx_neq_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x20,0x7c]
+v_cmpx_neq_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x20,0x7c]
+v_cmpx_neq_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x20,0x7c]
+v_cmpx_neq_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x20,0x7c]
+v_cmpx_neq_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x20,0x7c]
+v_cmpx_neq_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x20,0x7c]
+v_cmpx_neq_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x20,0x7c]
+v_cmpx_neq_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x20,0x7c]
+v_cmpx_neq_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x20,0x7c]
+v_cmpx_neq_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x20,0x7c]
+v_cmpx_neq_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x20,0x7c]
+v_cmpx_neq_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x20,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_neq_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x3a,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_f_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x20,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_neq_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x3a,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_f_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x20,0x7c]
+v_cmpx_neq_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x20,0x7c]
+v_cmpx_neq_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x3a,0x7c]
 
-v_cmpx_f_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x21,0x7c]
+v_cmpx_neq_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x3b,0x7c]
 
-v_cmpx_f_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x20,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x20,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x3a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x20,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x3a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x20,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x3a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x20,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x3a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x20,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x3a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x20,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x3a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x20,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x3a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x20,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x3a,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x20,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x20,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x3a,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x20,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x20,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x20,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x20,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x20,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x20,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x20,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x20,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_neq_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x20,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_neq_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x20,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_neq_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_lt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x22,0x7c]
+v_cmpx_neq_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_lt_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x22,0x7c]
+v_cmpx_neq_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_lt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x22,0x7c]
+v_cmpx_neq_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_lt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x22,0x7c]
+v_cmpx_neq_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_lt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x22,0x7c]
+v_cmpx_neq_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_lt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x22,0x7c]
+v_cmpx_neq_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_lt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x22,0x7c]
+v_cmpx_neq_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_lt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x22,0x7c]
+v_cmpx_neq_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_lt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x22,0x7c]
+v_cmpx_neq_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_lt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x22,0x7c]
+v_cmpx_nlt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x3c,0x7c]
 
-v_cmpx_lt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x22,0x7c]
+v_cmpx_nlt_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x3c,0x7c]
 
-v_cmpx_lt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x22,0x7c]
+v_cmpx_nlt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x3c,0x7c]
 
-v_cmpx_lt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x22,0x7c]
+v_cmpx_nlt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x3c,0x7c]
 
-v_cmpx_lt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x22,0x7c]
+v_cmpx_nlt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x3c,0x7c]
 
-v_cmpx_lt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x22,0x7c]
+v_cmpx_nlt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x3c,0x7c]
 
-v_cmpx_lt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x22,0x7c]
+v_cmpx_nlt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x3c,0x7c]
 
-v_cmpx_lt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x22,0x7c]
+v_cmpx_nlt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x3c,0x7c]
 
-v_cmpx_lt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x22,0x7c]
+v_cmpx_nlt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x3c,0x7c]
 
-v_cmpx_lt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x22,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_nlt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x3c,0x7c]
 
-v_cmpx_lt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x22,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_nlt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x3c,0x7c]
 
-v_cmpx_lt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x22,0x7c]
+v_cmpx_nlt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x3c,0x7c]
 
-v_cmpx_lt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x22,0x7c]
+v_cmpx_nlt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x3c,0x7c]
 
-v_cmpx_lt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x23,0x7c]
+v_cmpx_nlt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x3c,0x7c]
 
-v_cmpx_lt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x22,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x3c,0x7c]
 
-v_cmpx_lt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x22,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x3c,0x7c]
 
-v_cmpx_lt_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x22,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x3c,0x7c]
 
-v_cmpx_lt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x22,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x3c,0x7c]
 
-v_cmpx_lt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x22,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x3c,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_lt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x22,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x3c,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_lt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x22,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x3c,0x7c]
 
-v_cmpx_lt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x22,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x3c,0x7c]
 
-v_cmpx_lt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x22,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_nlt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x3d,0x7c]
 
-v_cmpx_lt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x22,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_nlt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x22,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_nlt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x3c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x22,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_nlt_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x3c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x22,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_nlt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x3c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x22,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_nlt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x3c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x22,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_nlt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x3c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x22,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_nlt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x3c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x22,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_nlt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x3c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x22,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_nlt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x3c,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_lt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x22,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_nlt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_lt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x22,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_nlt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x3c,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_lt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x22,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_nlt_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x24,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x24,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x24,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x24,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x24,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x24,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x24,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x24,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x24,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x24,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x24,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x24,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x24,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_eq_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x24,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_eq_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x24,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_eq_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x24,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_eq_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x24,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_eq_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x24,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_eq_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x24,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_tru_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x3e,0x7c]
 
-v_cmpx_eq_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x24,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_tru_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x3e,0x7c]
 
-v_cmpx_eq_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x24,0x7c]
+v_cmpx_tru_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x3e,0x7c]
 
-v_cmpx_eq_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x24,0x7c]
+v_cmpx_tru_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x3e,0x7c]
 
-v_cmpx_eq_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x25,0x7c]
+v_cmpx_tru_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x3e,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x24,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x3e,0x7c]
 
-v_cmpx_eq_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x24,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x3e,0x7c]
 
-v_cmpx_eq_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x24,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x3e,0x7c]
 
-v_cmpx_eq_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x24,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x3e,0x7c]
 
-v_cmpx_eq_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x24,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x3e,0x7c]
 
-v_cmpx_eq_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x24,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x3e,0x7c]
 
-v_cmpx_eq_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x24,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x3e,0x7c]
 
-v_cmpx_eq_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x24,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x3e,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x24,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_tru_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x3e,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x24,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_tru_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x3e,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x24,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_tru_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x3e,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x24,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_tru_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x3e,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x24,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_tru_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x3e,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x24,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_tru_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x3e,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_eq_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x24,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_tru_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x3e,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_eq_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x24,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_tru_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x3e,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x24,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_tru_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x3e,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x24,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_tru_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x3f,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x24,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_tru_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x24,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_tru_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x3e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x24,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_tru_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x3e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x26,0x7c]
+v_cmpx_tru_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x3e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x26,0x7c]
+v_cmpx_tru_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x3e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x26,0x7c]
+v_cmpx_tru_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x3e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x26,0x7c]
+v_cmpx_tru_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x3e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x26,0x7c]
+v_cmpx_tru_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x3e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x26,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x3e,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_le_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x26,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_le_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x26,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x3e,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_le_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x26,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_le_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x26,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpx_le_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x26,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpx_le_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x26,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_le_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x26,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_le_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x26,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_le_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x26,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_le_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x26,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_le_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x26,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_le_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x26,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_le_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x26,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_tru_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_le_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x26,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_tru_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_le_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x26,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_le_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x26,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_le_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x27,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_le_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x26,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_le_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x26,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_le_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x26,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_le_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x26,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_le_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x26,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x40,0x7c]
 
-v_cmpx_le_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x26,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x40,0x7c]
 
-v_cmpx_le_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x26,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x40,0x7c]
 
-v_cmpx_le_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x26,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x40,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x26,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_f_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x40,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x26,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_f_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x40,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x26,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_f_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x40,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x26,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_f_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x40,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x26,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_f_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x40,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x26,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_f_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x40,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x26,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_f_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x40,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x26,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_f_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x40,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x26,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_f_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x40,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x26,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_f_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x40,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_le_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x26,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_f_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x40,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_le_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x26,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_f_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x40,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x26,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_f_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x40,0x7c]
 
-v_cmpx_gt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x28,0x7c]
+v_cmp_f_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x41,0x7c]
 
-v_cmpx_gt_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x28,0x7c]
+v_cmp_f_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x40,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x28,0x7c]
+v_cmp_f_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x40,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x28,0x7c]
+v_cmp_f_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x40,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x28,0x7c]
+v_cmp_f_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x40,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x28,0x7c]
+v_cmp_f_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x40,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x28,0x7c]
+v_cmp_f_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x40,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x28,0x7c]
+v_cmp_f_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x40,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x28,0x7c]
+v_cmp_f_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x40,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x28,0x7c]
+v_cmp_f_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x28,0x7c]
+v_cmp_f_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x40,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x28,0x7c]
+v_cmp_f_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x40,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x28,0x7c]
+v_cmp_f_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x40,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x28,0x7c]
+v_cmp_f_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x40,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_gt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x28,0x7c]
+v_cmp_f_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x40,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_gt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x28,0x7c]
+v_cmp_f_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x40,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_gt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x28,0x7c]
+v_cmp_f_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x40,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_gt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x28,0x7c]
+v_cmp_f_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x40,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_gt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x28,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_f_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x40,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_gt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x28,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_f_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x40,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_gt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x28,0x7c]
+v_cmp_lt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x42,0x7c]
 
-v_cmpx_gt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x28,0x7c]
+v_cmp_lt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x42,0x7c]
 
-v_cmpx_gt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x29,0x7c]
+v_cmp_lt_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x42,0x7c]
 
-v_cmpx_gt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x28,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x42,0x7c]
 
-v_cmpx_gt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x28,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x42,0x7c]
 
-v_cmpx_gt_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x28,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x42,0x7c]
 
-v_cmpx_gt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x28,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x42,0x7c]
 
-v_cmpx_gt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x28,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x42,0x7c]
 
-v_cmpx_gt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x28,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x42,0x7c]
 
-v_cmpx_gt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x28,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x42,0x7c]
 
-v_cmpx_gt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x28,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x42,0x7c]
 
-v_cmpx_gt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x28,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_lt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x42,0x7c]
 
-v_cmpx_gt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x28,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_lt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x42,0x7c]
 
-v_cmpx_gt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x28,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_lt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x42,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_gt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x28,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_lt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x42,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_gt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x28,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_lt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x42,0x7c]
 
-v_cmpx_gt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x28,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_lt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x42,0x7c]
 
-v_cmpx_gt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x28,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_lt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x43,0x7c]
 
-v_cmpx_gt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x28,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_lt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x42,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x28,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_lt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x42,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x28,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_lt_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x42,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x28,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_lt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x42,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x28,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_lt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x42,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x28,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_lt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x42,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x2a,0x7c]
+v_cmp_lt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x42,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x2a,0x7c]
+v_cmp_lt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x42,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x2a,0x7c]
+v_cmp_lt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_lg_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x2a,0x7c]
+v_cmp_lt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x42,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_lg_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x2a,0x7c]
+v_cmp_lt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x42,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_lg_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x2a,0x7c]
+v_cmp_lt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x42,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_lg_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x2a,0x7c]
+v_cmp_lt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x42,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_lg_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x2a,0x7c]
+v_cmp_lt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x42,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_lg_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x2a,0x7c]
+v_cmp_lt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x42,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_lg_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x2a,0x7c]
+v_cmp_lt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x42,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_lg_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x2a,0x7c]
+v_cmp_lt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x42,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_lg_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x2a,0x7c]
+v_cmp_lt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x42,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_lg_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x2a,0x7c]
+v_cmp_lt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x42,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_lg_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x2a,0x7c]
+v_cmp_eq_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x44,0x7c]
 
-v_cmpx_lg_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x2a,0x7c]
+v_cmp_eq_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x44,0x7c]
 
-v_cmpx_lg_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x2a,0x7c]
+v_cmp_eq_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x44,0x7c]
 
-v_cmpx_lg_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x2a,0x7c]
+v_cmp_eq_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x44,0x7c]
 
-v_cmpx_lg_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x2a,0x7c]
+v_cmp_eq_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x44,0x7c]
 
-v_cmpx_lg_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x2a,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_eq_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x44,0x7c]
 
-v_cmpx_lg_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x2a,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_eq_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x44,0x7c]
 
-v_cmpx_lg_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x2a,0x7c]
+v_cmp_eq_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x44,0x7c]
 
-v_cmpx_lg_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x2a,0x7c]
+v_cmp_eq_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x44,0x7c]
 
-v_cmpx_lg_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x2b,0x7c]
+v_cmp_eq_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x44,0x7c]
 
-v_cmpx_lg_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x2a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x44,0x7c]
 
-v_cmpx_lg_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x2a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x44,0x7c]
 
-v_cmpx_lg_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x2a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x44,0x7c]
 
-v_cmpx_lg_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x2a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x44,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_lg_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x2a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x44,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_lg_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x2a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x44,0x7c]
 
-v_cmpx_lg_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x2a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x44,0x7c]
 
-v_cmpx_lg_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x2a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x45,0x7c]
 
-v_cmpx_lg_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x2a,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_eq_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x44,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x2a,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_eq_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x44,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x2a,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_eq_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x44,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x2a,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_eq_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x44,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x2a,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_eq_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x44,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x2a,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_eq_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x44,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x2a,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_eq_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x44,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x2a,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_eq_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x44,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x2a,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_eq_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x2a,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_eq_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x44,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x2a,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_eq_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x44,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x2a,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_eq_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x44,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x2a,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_eq_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x44,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_ge_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x2c,0x7c]
+v_cmp_eq_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x44,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_ge_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x2c,0x7c]
+v_cmp_eq_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x44,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_ge_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x2c,0x7c]
+v_cmp_eq_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x44,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_ge_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x2c,0x7c]
+v_cmp_eq_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x44,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_ge_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x2c,0x7c]
+v_cmp_eq_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x44,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_ge_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x2c,0x7c]
+v_cmp_eq_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x44,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_ge_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x2c,0x7c]
+v_cmp_le_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x46,0x7c]
 
-v_cmpx_ge_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x2c,0x7c]
+v_cmp_le_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x46,0x7c]
 
-v_cmpx_ge_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x2c,0x7c]
+v_cmp_le_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x46,0x7c]
 
-v_cmpx_ge_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x2c,0x7c]
+v_cmp_le_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x46,0x7c]
 
-v_cmpx_ge_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x2c,0x7c]
+v_cmp_le_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x46,0x7c]
 
-v_cmpx_ge_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x2c,0x7c]
+v_cmp_le_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x46,0x7c]
 
-v_cmpx_ge_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x2c,0x7c]
+v_cmp_le_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x46,0x7c]
 
-v_cmpx_ge_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x2c,0x7c]
+v_cmp_le_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x46,0x7c]
 
-v_cmpx_ge_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x2c,0x7c]
+v_cmp_le_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x46,0x7c]
 
-v_cmpx_ge_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x2c,0x7c]
+v_cmp_le_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x46,0x7c]
 
-v_cmpx_ge_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x2c,0x7c]
+v_cmp_le_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x46,0x7c]
 
-v_cmpx_ge_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x2c,0x7c]
+v_cmp_le_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x46,0x7c]
 
-v_cmpx_ge_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x2c,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_le_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x46,0x7c]
 
-v_cmpx_ge_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x2c,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_le_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x46,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ge_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x2c,0x7c]
+v_cmp_le_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x46,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ge_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x2c,0x7c]
+v_cmp_le_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x46,0x7c]
 
-v_cmpx_ge_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x2d,0x7c]
+v_cmp_le_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x46,0x7c]
 
-v_cmpx_ge_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x2c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x47,0x7c]
 
-v_cmpx_ge_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x2c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x46,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x2c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x46,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x2c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x46,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x2c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x46,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x2c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x46,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x2c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x46,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x2c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x46,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x2c,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_le_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x46,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x2c,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_le_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_ge_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x2c,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_le_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x46,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_ge_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x2c,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_le_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x46,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_ge_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x2c,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_le_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x46,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_ge_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x2c,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_le_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x46,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_ge_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x2c,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_le_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x46,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_ge_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x2c,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_le_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x46,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_ge_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x2c,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_le_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x46,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_ge_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x2c,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_le_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x46,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_ge_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x2c,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_le_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x46,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_ge_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x2c,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_le_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x46,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_ge_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x2c,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_gt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x48,0x7c]
 
-v_cmpx_o_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x2e,0x7c]
+v_cmp_gt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x48,0x7c]
 
-v_cmpx_o_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x2e,0x7c]
+v_cmp_gt_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x48,0x7c]
 
-v_cmpx_o_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x2e,0x7c]
+v_cmp_gt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x48,0x7c]
 
-v_cmpx_o_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x2e,0x7c]
+v_cmp_gt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x48,0x7c]
 
-v_cmpx_o_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x2e,0x7c]
+v_cmp_gt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x48,0x7c]
 
-v_cmpx_o_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x2e,0x7c]
+v_cmp_gt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x48,0x7c]
 
-v_cmpx_o_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x2e,0x7c]
+v_cmp_gt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x48,0x7c]
 
-v_cmpx_o_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x2e,0x7c]
+v_cmp_gt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x48,0x7c]
 
-v_cmpx_o_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x2e,0x7c]
+v_cmp_gt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x48,0x7c]
 
-v_cmpx_o_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x2e,0x7c]
+v_cmp_gt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x48,0x7c]
 
-v_cmpx_o_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x2e,0x7c]
+v_cmp_gt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x48,0x7c]
 
-v_cmpx_o_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x2e,0x7c]
+v_cmp_gt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x48,0x7c]
 
-v_cmpx_o_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x2e,0x7c]
+v_cmp_gt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x48,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_o_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x2e,0x7c]
+v_cmp_gt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x48,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_o_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x2e,0x7c]
+v_cmp_gt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x48,0x7c]
 
-v_cmpx_o_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x2e,0x7c]
+v_cmp_gt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x48,0x7c]
 
-v_cmpx_o_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x2e,0x7c]
+v_cmp_gt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x49,0x7c]
 
-v_cmpx_o_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x2e,0x7c]
+v_cmp_gt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x48,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_o_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x2e,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_gt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x48,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_o_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x2e,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_gt_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x48,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_o_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x2e,0x7c]
+v_cmp_gt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x48,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_o_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x2e,0x7c]
+v_cmp_gt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x48,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_o_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x2f,0x7c]
+v_cmp_gt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x48,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_o_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x2e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x48,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_o_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x2e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x48,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_o_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x2e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_o_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x2e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x48,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_o_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x2e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x48,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_o_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x2e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x48,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_o_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x2e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x48,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_o_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x2e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x48,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_o_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x2e,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_gt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x48,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_o_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x2e,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_gt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x48,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_o_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x2e,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_gt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x48,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_o_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x2e,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_gt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x48,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_o_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x2e,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_gt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x48,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_o_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x2e,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_lg_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x4a,0x7c]
 
-v_cmpx_o_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x2e,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_lg_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x4a,0x7c]
 
-v_cmpx_o_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x2e,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_lg_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x4a,0x7c]
 
-v_cmpx_o_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x2e,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_lg_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x4a,0x7c]
 
-v_cmpx_o_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x2e,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_lg_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x4a,0x7c]
 
-v_cmpx_o_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x2e,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_lg_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x4a,0x7c]
 
-v_cmpx_o_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x2e,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_lg_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x4a,0x7c]
 
-v_cmpx_o_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x2e,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_lg_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x4a,0x7c]
 
-v_cmpx_u_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x30,0x7c]
+v_cmp_lg_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x4a,0x7c]
 
-v_cmpx_u_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x30,0x7c]
+v_cmp_lg_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x4a,0x7c]
 
-v_cmpx_u_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x30,0x7c]
+v_cmp_lg_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x4a,0x7c]
 
-v_cmpx_u_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x30,0x7c]
+v_cmp_lg_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x4a,0x7c]
 
-v_cmpx_u_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x30,0x7c]
+v_cmp_lg_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x4a,0x7c]
 
-v_cmpx_u_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x30,0x7c]
+v_cmp_lg_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x4a,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_u_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x30,0x7c]
+v_cmp_lg_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x4a,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_u_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x30,0x7c]
+v_cmp_lg_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x4a,0x7c]
 
-v_cmpx_u_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x30,0x7c]
+v_cmp_lg_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x4a,0x7c]
 
-v_cmpx_u_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x30,0x7c]
+v_cmp_lg_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x4b,0x7c]
 
-v_cmpx_u_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x30,0x7c]
+v_cmp_lg_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x30,0x7c]
+v_cmp_lg_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x4a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x30,0x7c]
+v_cmp_lg_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x4a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x30,0x7c]
+v_cmp_lg_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x4a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x30,0x7c]
+v_cmp_lg_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x4a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x30,0x7c]
+v_cmp_lg_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x4a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x30,0x7c]
+v_cmp_lg_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x4a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x30,0x7c]
+v_cmp_lg_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x4a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x30,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_lg_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x30,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_lg_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x4a,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x30,0x7c]
+v_cmp_lg_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_u_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x30,0x7c]
+v_cmp_lg_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x4a,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_u_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x31,0x7c]
+v_cmp_lg_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_u_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x30,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_u_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x30,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_u_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x30,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_u_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x30,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_u_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x30,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_u_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x30,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_u_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x30,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x4c,0x7c]
 
-v_cmpx_u_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x30,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x4c,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x30,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ge_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x4c,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x30,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_ge_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x4c,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x30,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_ge_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x4c,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x30,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ge_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x4c,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x30,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_ge_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x4c,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x30,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ge_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x4c,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x30,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_ge_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x4c,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x30,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_ge_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x4c,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x30,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ge_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x4c,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x30,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_ge_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x4c,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x30,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_ge_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x4c,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x30,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_ge_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x4c,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_u_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x30,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_ge_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x4c,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_nge_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x32,0x7c]
+v_cmp_ge_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x4c,0x7c]
 
-v_cmpx_nge_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x32,0x7c]
+v_cmp_ge_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x4c,0x7c]
 
-v_cmpx_nge_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x32,0x7c]
+v_cmp_ge_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x4d,0x7c]
 
-v_cmpx_nge_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x32,0x7c]
+v_cmp_ge_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x32,0x7c]
+v_cmp_ge_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x4c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x32,0x7c]
+v_cmp_ge_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x4c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x32,0x7c]
+v_cmp_ge_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x4c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x32,0x7c]
+v_cmp_ge_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x4c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x32,0x7c]
+v_cmp_ge_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x4c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x32,0x7c]
+v_cmp_ge_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x4c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x32,0x7c]
+v_cmp_ge_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x4c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x32,0x7c]
+v_cmp_ge_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x32,0x7c]
+v_cmp_ge_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x4c,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x32,0x7c]
+v_cmp_ge_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x32,0x7c]
+v_cmp_ge_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x4c,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x32,0x7c]
+v_cmp_ge_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_nge_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x32,0x7c]
+v_cmp_ge_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_nge_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x32,0x7c]
+v_cmp_ge_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_nge_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x32,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_ge_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_nge_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x32,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_ge_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_nge_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x32,0x7c]
+v_cmp_ge_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_nge_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x32,0x7c]
+v_cmp_ge_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_nge_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x33,0x7c]
+v_cmp_o_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x4e,0x7c]
 
-v_cmpx_nge_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x32,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x4e,0x7c]
 
-v_cmpx_nge_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x32,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x4e,0x7c]
 
-v_cmpx_nge_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x32,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x4e,0x7c]
 
-v_cmpx_nge_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x32,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x4e,0x7c]
 
-v_cmpx_nge_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x32,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x4e,0x7c]
 
-v_cmpx_nge_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x32,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x4e,0x7c]
 
-v_cmpx_nge_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x32,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x4e,0x7c]
 
-v_cmpx_nge_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x32,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x4e,0x7c]
 
-v_cmpx_nge_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x32,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_o_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x4e,0x7c]
 
-v_cmpx_nge_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x32,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_o_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x4e,0x7c]
 
-v_cmpx_nge_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x32,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_o_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x4e,0x7c]
 
-v_cmpx_nge_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x32,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_o_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x4e,0x7c]
 
-v_cmpx_nge_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x32,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_o_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x4e,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_nge_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x32,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_o_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x4e,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_nge_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x32,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_o_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x4e,0x7c]
 
-v_cmpx_nge_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x32,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_o_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x4e,0x7c]
 
-v_cmpx_nge_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x32,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_o_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x4f,0x7c]
 
-v_cmpx_nge_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x32,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_o_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x32,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_o_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x4e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x32,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_o_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x4e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x32,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_o_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x4e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x34,0x7c]
+v_cmp_o_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x4e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x34,0x7c]
+v_cmp_o_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x4e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x34,0x7c]
+v_cmp_o_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x4e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x34,0x7c]
+v_cmp_o_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x4e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x34,0x7c]
+v_cmp_o_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x34,0x7c]
+v_cmp_o_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x4e,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x34,0x7c]
+v_cmp_o_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_nlg_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x34,0x7c]
+v_cmp_o_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x4e,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_nlg_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x34,0x7c]
+v_cmp_o_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_nlg_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x34,0x7c]
+v_cmp_o_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_nlg_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x34,0x7c]
+v_cmp_o_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_nlg_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x34,0x7c]
+v_cmp_o_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_nlg_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x34,0x7c]
+v_cmp_o_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_nlg_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x34,0x7c]
+v_cmp_o_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_nlg_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x34,0x7c]
+v_cmp_o_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_nlg_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x34,0x7c]
+v_cmp_u_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x50,0x7c]
 
-v_cmpx_nlg_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x34,0x7c]
+v_cmp_u_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x50,0x7c]
 
-v_cmpx_nlg_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x34,0x7c]
+v_cmp_u_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x50,0x7c]
 
-v_cmpx_nlg_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x34,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_u_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x50,0x7c]
 
-v_cmpx_nlg_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x34,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_u_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x50,0x7c]
 
-v_cmpx_nlg_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x34,0x7c]
+v_cmp_u_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x50,0x7c]
 
-v_cmpx_nlg_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x34,0x7c]
+v_cmp_u_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x50,0x7c]
 
-v_cmpx_nlg_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x35,0x7c]
+v_cmp_u_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x50,0x7c]
 
-v_cmpx_nlg_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x34,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x50,0x7c]
 
-v_cmpx_nlg_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x34,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x50,0x7c]
 
-v_cmpx_nlg_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x34,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x50,0x7c]
 
-v_cmpx_nlg_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x34,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x50,0x7c]
 
-v_cmpx_nlg_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x34,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x50,0x7c]
 
-v_cmpx_nlg_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x34,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x50,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_nlg_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x34,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x50,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_nlg_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x34,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x50,0x7c]
 
-v_cmpx_nlg_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x34,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_u_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x50,0x7c]
 
-v_cmpx_nlg_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x34,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_u_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x51,0x7c]
 
-v_cmpx_nlg_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x34,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_u_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x50,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x34,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_u_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x50,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x34,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_u_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x50,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x34,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_u_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x50,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x34,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_u_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x50,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x34,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_u_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x50,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x34,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_u_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x50,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x34,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_u_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x50,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x34,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_u_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x34,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_u_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x50,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x34,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_u_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x50,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_ngt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x36,0x7c]
+v_cmp_u_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x50,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_ngt_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x36,0x7c]
+v_cmp_u_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x50,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_ngt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x36,0x7c]
+v_cmp_u_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x50,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_ngt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x36,0x7c]
+v_cmp_u_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x50,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_ngt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x36,0x7c]
+v_cmp_u_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x50,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_ngt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x36,0x7c]
+v_cmp_u_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x50,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_ngt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x36,0x7c]
+v_cmp_u_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x50,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_ngt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x36,0x7c]
+v_cmp_u_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x50,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_ngt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x36,0x7c]
+v_cmp_nge_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x52,0x7c]
 
-v_cmpx_ngt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x36,0x7c]
+v_cmp_nge_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x52,0x7c]
 
-v_cmpx_ngt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x36,0x7c]
+v_cmp_nge_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x52,0x7c]
 
-v_cmpx_ngt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x36,0x7c]
+v_cmp_nge_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x52,0x7c]
 
-v_cmpx_ngt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x36,0x7c]
+v_cmp_nge_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x52,0x7c]
 
-v_cmpx_ngt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x36,0x7c]
+v_cmp_nge_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x52,0x7c]
 
-v_cmpx_ngt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x36,0x7c]
+v_cmp_nge_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x52,0x7c]
 
-v_cmpx_ngt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x36,0x7c]
+v_cmp_nge_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x52,0x7c]
 
-v_cmpx_ngt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x36,0x7c]
+v_cmp_nge_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x52,0x7c]
 
-v_cmpx_ngt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x36,0x7c]
+v_cmp_nge_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x52,0x7c]
 
-v_cmpx_ngt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x36,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_nge_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x52,0x7c]
 
-v_cmpx_ngt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x36,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_nge_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x52,0x7c]
 
-v_cmpx_ngt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x36,0x7c]
+v_cmp_nge_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x52,0x7c]
 
-v_cmpx_ngt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x36,0x7c]
+v_cmp_nge_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x52,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ngt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x37,0x7c]
+v_cmp_nge_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x52,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ngt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x36,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x52,0x7c]
 
-v_cmpx_ngt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x36,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x52,0x7c]
 
-v_cmpx_ngt_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x36,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x53,0x7c]
 
-v_cmpx_ngt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x36,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x52,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x36,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x52,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x36,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x52,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x36,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x52,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x36,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x52,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x36,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_nge_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x52,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x36,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_nge_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x52,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x36,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_nge_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x52,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x36,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_nge_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x36,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_nge_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x52,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x36,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_nge_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x52,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x36,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_nge_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x52,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x36,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_nge_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x52,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x36,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_nge_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x52,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x36,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_nge_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x52,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x36,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_nge_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x52,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x36,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_nge_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x52,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_ngt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x36,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_nge_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x52,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_nle_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x38,0x7c]
+v_cmp_nge_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x52,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_nle_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x38,0x7c]
+v_cmp_nlg_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x54,0x7c]
 
-v_cmpx_nle_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x38,0x7c]
+v_cmp_nlg_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x54,0x7c]
 
-v_cmpx_nle_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x38,0x7c]
+v_cmp_nlg_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x54,0x7c]
 
-v_cmpx_nle_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x38,0x7c]
+v_cmp_nlg_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x54,0x7c]
 
-v_cmpx_nle_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x38,0x7c]
+v_cmp_nlg_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x54,0x7c]
 
-v_cmpx_nle_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x38,0x7c]
+v_cmp_nlg_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x54,0x7c]
 
-v_cmpx_nle_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x38,0x7c]
+v_cmp_nlg_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x54,0x7c]
 
-v_cmpx_nle_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x38,0x7c]
+v_cmp_nlg_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x54,0x7c]
 
-v_cmpx_nle_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x38,0x7c]
+v_cmp_nlg_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x54,0x7c]
 
-v_cmpx_nle_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x38,0x7c]
+v_cmp_nlg_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x54,0x7c]
 
-v_cmpx_nle_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x38,0x7c]
+v_cmp_nlg_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x54,0x7c]
 
-v_cmpx_nle_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x38,0x7c]
+v_cmp_nlg_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x54,0x7c]
 
-v_cmpx_nle_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x38,0x7c]
+v_cmp_nlg_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x54,0x7c]
 
-v_cmpx_nle_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x38,0x7c]
+v_cmp_nlg_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x54,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_nle_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x38,0x7c]
+v_cmp_nlg_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x54,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_nle_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x38,0x7c]
+v_cmp_nlg_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x54,0x7c]
 
-v_cmpx_nle_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x38,0x7c]
+v_cmp_nlg_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x54,0x7c]
 
-v_cmpx_nle_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x38,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_nlg_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x55,0x7c]
 
-v_cmpx_nle_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x38,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_nlg_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x54,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nle_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x38,0x7c]
+v_cmp_nlg_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x54,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nle_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x38,0x7c]
+v_cmp_nlg_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x54,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nle_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x39,0x7c]
+v_cmp_nlg_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x54,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nle_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x38,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x54,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nle_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x38,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x54,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nle_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x38,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x54,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nle_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x38,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x54,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nle_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x38,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_nle_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x38,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x54,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_nle_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x38,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x54,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_nle_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x38,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x54,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_nle_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x38,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_nlg_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x54,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_nle_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x38,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_nlg_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x54,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_nle_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x38,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_nlg_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x54,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_nle_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x38,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_nlg_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x54,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_nle_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x38,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_nlg_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x54,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_nle_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x38,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_nlg_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x54,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_nle_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x38,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_nlg_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x54,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_nle_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x38,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_ngt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x56,0x7c]
 
-v_cmpx_nle_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x38,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ngt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x56,0x7c]
 
-v_cmpx_nle_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x38,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_ngt_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x56,0x7c]
 
-v_cmpx_nle_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x38,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_ngt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x56,0x7c]
 
-v_cmpx_nle_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x38,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_ngt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x56,0x7c]
 
-v_cmpx_nle_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x38,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_ngt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x56,0x7c]
 
-v_cmpx_neq_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x3a,0x7c]
+v_cmp_ngt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x56,0x7c]
 
-v_cmpx_neq_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x3a,0x7c]
+v_cmp_ngt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x56,0x7c]
 
-v_cmpx_neq_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x3a,0x7c]
+v_cmp_ngt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x56,0x7c]
 
-v_cmpx_neq_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x3a,0x7c]
+v_cmp_ngt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x56,0x7c]
 
-v_cmpx_neq_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x3a,0x7c]
+v_cmp_ngt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x56,0x7c]
 
-v_cmpx_neq_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x3a,0x7c]
+v_cmp_ngt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x56,0x7c]
 
-v_cmpx_neq_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x3a,0x7c]
+v_cmp_ngt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x56,0x7c]
 
-v_cmpx_neq_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x3a,0x7c]
+v_cmp_ngt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x56,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_neq_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x3a,0x7c]
+v_cmp_ngt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x56,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_neq_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x3a,0x7c]
+v_cmp_ngt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x56,0x7c]
 
-v_cmpx_neq_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x3a,0x7c]
+v_cmp_ngt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x56,0x7c]
 
-v_cmpx_neq_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x3a,0x7c]
+v_cmp_ngt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x57,0x7c]
 
-v_cmpx_neq_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x3a,0x7c]
+v_cmp_ngt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x56,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x3a,0x7c]
+v_cmp_ngt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x56,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x3a,0x7c]
+v_cmp_ngt_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x56,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x3a,0x7c]
+v_cmp_ngt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x56,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x3a,0x7c]
+v_cmp_ngt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x56,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x3a,0x7c]
+v_cmp_ngt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x56,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x3a,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_ngt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x56,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x3a,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_ngt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x56,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x3a,0x7c]
+v_cmp_ngt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x3a,0x7c]
+v_cmp_ngt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x56,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x3b,0x7c]
+v_cmp_ngt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x56,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_neq_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x3a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x56,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_neq_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x3a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x56,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_neq_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x3a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x56,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_neq_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x3a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x56,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_neq_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x3a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x56,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_neq_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x3a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x56,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_neq_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x3a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x56,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_neq_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x3a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x56,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_neq_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x3a,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_nle_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x58,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x3a,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_nle_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x58,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x3a,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_nle_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x58,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x3a,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_nle_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x58,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x3a,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_nle_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x58,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x3a,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_nle_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x58,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x3a,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_nle_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x58,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x3a,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_nle_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x58,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x3a,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_nle_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x58,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x3a,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_nle_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x58,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x3a,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_nle_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x58,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x3a,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_nle_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x58,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x3a,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_nle_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x58,0x7c]
 
-v_cmpx_nlt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x3c,0x7c]
+v_cmp_nle_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x58,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_nlt_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x3c,0x7c]
+v_cmp_nle_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x58,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_nlt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x3c,0x7c]
+v_cmp_nle_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x58,0x7c]
 
-v_cmpx_nlt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x3c,0x7c]
+v_cmp_nle_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x58,0x7c]
 
-v_cmpx_nlt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x3c,0x7c]
+v_cmp_nle_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x59,0x7c]
 
-v_cmpx_nlt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x3c,0x7c]
+v_cmp_nle_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x58,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x3c,0x7c]
+v_cmp_nle_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x58,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x3c,0x7c]
+v_cmp_nle_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x58,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x3c,0x7c]
+v_cmp_nle_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x58,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x3c,0x7c]
+v_cmp_nle_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x58,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x3c,0x7c]
+v_cmp_nle_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x58,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x3c,0x7c]
+v_cmp_nle_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x58,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x3c,0x7c]
+v_cmp_nle_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x58,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x3c,0x7c]
+v_cmp_nle_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x3c,0x7c]
+v_cmp_nle_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x58,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x3c,0x7c]
+v_cmp_nle_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x58,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x3c,0x7c]
+v_cmp_nle_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x58,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x3c,0x7c]
+v_cmp_nle_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x58,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_nlt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x3c,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_nle_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x58,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_nlt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x3c,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_nle_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x58,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_nlt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x3c,0x7c]
+v_cmp_nle_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x58,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_nlt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x3c,0x7c]
+v_cmp_nle_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x58,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_nlt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x3d,0x7c]
+v_cmp_nle_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x58,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_nlt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x3c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x58,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_nlt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x3c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x5a,0x7c]
 
-v_cmpx_nlt_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x3c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x5a,0x7c]
 
-v_cmpx_nlt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x3c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x5a,0x7c]
 
-v_cmpx_nlt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x3c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x5a,0x7c]
 
-v_cmpx_nlt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x3c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x5a,0x7c]
 
-v_cmpx_nlt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x3c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x5a,0x7c]
 
-v_cmpx_nlt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x3c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x5a,0x7c]
 
-v_cmpx_nlt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x3c,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x5a,0x7c]
 
-v_cmpx_nlt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x3c,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x5a,0x7c]
 
-v_cmpx_nlt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x3c,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x5a,0x7c]
 
-v_cmpx_nlt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x3c,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_neq_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x5a,0x7c]
 
-v_cmpx_nlt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x3c,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_neq_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x5a,0x7c]
 
-v_cmpx_nlt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x3c,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_neq_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x5a,0x7c]
 
-v_cmpx_nlt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x3c,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_neq_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x5a,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_nlt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x3c,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_neq_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x5a,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_nlt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x3c,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_neq_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x5a,0x7c]
 
-v_cmpx_nlt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x3c,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_neq_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x5a,0x7c]
 
-v_cmpx_nlt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x3c,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_neq_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x5b,0x7c]
 
-v_cmpx_nlt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x3c,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_neq_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x3c,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_neq_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x5a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x3e,0x7c]
+v_cmp_neq_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x5a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x3e,0x7c]
+v_cmp_neq_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x5a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x3e,0x7c]
+v_cmp_neq_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x5a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x3e,0x7c]
+v_cmp_neq_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x5a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x3e,0x7c]
+v_cmp_neq_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x5a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x3e,0x7c]
+v_cmp_neq_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x5a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x3e,0x7c]
+v_cmp_neq_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x3e,0x7c]
+v_cmp_neq_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x5a,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x3e,0x7c]
+v_cmp_neq_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x3e,0x7c]
+v_cmp_neq_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x5a,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x3e,0x7c]
+v_cmp_neq_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_tru_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x3e,0x7c]
+v_cmp_neq_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_tru_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x3e,0x7c]
+v_cmp_neq_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_tru_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x3e,0x7c]
+v_cmp_neq_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_tru_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x3e,0x7c]
+v_cmp_neq_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_tru_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x3e,0x7c]
+v_cmp_neq_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_tru_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x3e,0x7c]
+v_cmp_neq_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_tru_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x3e,0x7c]
+v_cmp_nlt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x5c,0x7c]
 
-v_cmpx_tru_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x3e,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_nlt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x5c,0x7c]
 
-v_cmpx_tru_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x3e,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_nlt_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x5c,0x7c]
 
-v_cmpx_tru_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x3e,0x7c]
+v_cmp_nlt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x5c,0x7c]
 
-v_cmpx_tru_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x3e,0x7c]
+v_cmp_nlt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x5c,0x7c]
 
-v_cmpx_tru_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x3f,0x7c]
+v_cmp_nlt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x5c,0x7c]
 
-v_cmpx_tru_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x3e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x5c,0x7c]
 
-v_cmpx_tru_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x3e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x5c,0x7c]
 
-v_cmpx_tru_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x3e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x5c,0x7c]
 
-v_cmpx_tru_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x3e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x5c,0x7c]
 
-v_cmpx_tru_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x3e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x5c,0x7c]
 
-v_cmpx_tru_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x3e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x5c,0x7c]
 
-v_cmpx_tru_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x3e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x5c,0x7c]
 
-v_cmpx_tru_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x3e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x5c,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_tru_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x3e,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_nlt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x5c,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_tru_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x3e,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_nlt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x5c,0x7c]
 
-v_cmpx_tru_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x3e,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_nlt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x5c,0x7c]
 
-v_cmpx_tru_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x3e,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_nlt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x5d,0x7c]
 
-v_cmpx_tru_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x3e,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_nlt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x3e,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_nlt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x5c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x3e,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_nlt_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x5c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x3e,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_nlt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x5c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x3e,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_nlt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x5c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x3e,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_nlt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x5c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x3e,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_nlt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x5c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x3e,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_nlt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x5c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x3e,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_nlt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_f_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x40,0x7c]
+v_cmp_nlt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x5c,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_f_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x40,0x7c]
+v_cmp_nlt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_f_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x40,0x7c]
+v_cmp_nlt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x5c,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_f_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x40,0x7c]
+v_cmp_nlt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_f_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x40,0x7c]
+v_cmp_nlt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_f_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x40,0x7c]
+v_cmp_nlt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_f_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x40,0x7c]
+v_cmp_nlt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_f_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x40,0x7c]
+v_cmp_nlt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_f_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x40,0x7c]
+v_cmp_nlt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_f_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x40,0x7c]
+v_cmp_nlt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_f_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x40,0x7c]
+v_cmp_tru_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x5e,0x7c]
 
-v_cmp_f_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x40,0x7c]
+v_cmp_tru_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x5e,0x7c]
 
-v_cmp_f_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x40,0x7c]
+v_cmp_tru_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x5e,0x7c]
 
-v_cmp_f_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x40,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_tru_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x5e,0x7c]
 
-v_cmp_f_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x40,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_tru_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x5e,0x7c]
 
-v_cmp_f_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x40,0x7c]
+v_cmp_tru_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x5e,0x7c]
 
-v_cmp_f_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x40,0x7c]
+v_cmp_tru_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x5e,0x7c]
 
-v_cmp_f_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x41,0x7c]
+v_cmp_tru_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x5e,0x7c]
 
-v_cmp_f_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x5e,0x7c]
 
-v_cmp_f_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x40,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x5e,0x7c]
 
-v_cmp_f_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x40,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x5e,0x7c]
 
-v_cmp_f_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x40,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x5e,0x7c]
 
-v_cmp_f_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x40,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x5e,0x7c]
 
-v_cmp_f_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x40,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x5e,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_f_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x40,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x5e,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_f_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x40,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x5e,0x7c]
 
-v_cmp_f_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x40,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_tru_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x5e,0x7c]
 
-v_cmp_f_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x40,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_tru_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x5f,0x7c]
 
-v_cmp_f_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x40,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_tru_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_tru_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x5e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x40,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_tru_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x5e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_tru_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x5e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_tru_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x5e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_tru_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x5e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_tru_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x5e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_tru_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x5e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_tru_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_tru_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x5e,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_tru_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_lt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x42,0x7c]
+v_cmp_tru_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x5e,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_lt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x42,0x7c]
+v_cmp_tru_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_lt_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x42,0x7c]
+v_cmp_tru_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_lt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x42,0x7c]
+v_cmp_tru_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_lt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x42,0x7c]
+v_cmp_tru_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_lt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x42,0x7c]
+v_cmp_tru_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_lt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x42,0x7c]
+v_cmp_tru_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_lt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x42,0x7c]
+v_cmp_tru_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_lt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x42,0x7c]
+v_cmpx_f_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x60,0x7c]
 
-v_cmp_lt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x42,0x7c]
+v_cmpx_f_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x60,0x7c]
 
-v_cmp_lt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x42,0x7c]
+v_cmpx_f_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x60,0x7c]
 
-v_cmp_lt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x42,0x7c]
+v_cmpx_f_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x60,0x7c]
 
-v_cmp_lt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x42,0x7c]
+v_cmpx_f_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x60,0x7c]
 
-v_cmp_lt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x42,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_f_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x60,0x7c]
 
-v_cmp_lt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x42,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_f_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x60,0x7c]
 
-v_cmp_lt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x42,0x7c]
+v_cmpx_f_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x60,0x7c]
 
-v_cmp_lt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x42,0x7c]
+v_cmpx_f_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x60,0x7c]
 
-v_cmp_lt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x43,0x7c]
+v_cmpx_f_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x60,0x7c]
 
-v_cmp_lt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x60,0x7c]
 
-v_cmp_lt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x42,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x60,0x7c]
 
-v_cmp_lt_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x42,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x60,0x7c]
 
-v_cmp_lt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x42,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x60,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_lt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x42,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x60,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_lt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x42,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x60,0x7c]
 
-v_cmp_lt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x42,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x60,0x7c]
 
-v_cmp_lt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x42,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x61,0x7c]
 
-v_cmp_lt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x42,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_f_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x42,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_f_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x60,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x42,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_f_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x60,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_f_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x60,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x42,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_f_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x60,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_f_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x60,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_f_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x60,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_f_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x60,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_f_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_f_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_f_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_f_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_f_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x60,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_eq_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x44,0x7c]
+v_cmpx_f_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x60,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_eq_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x44,0x7c]
+v_cmpx_f_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x60,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_eq_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x44,0x7c]
+v_cmpx_f_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x60,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_eq_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x44,0x7c]
+v_cmpx_f_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_eq_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x44,0x7c]
+v_cmpx_f_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_eq_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x44,0x7c]
+v_cmpx_f_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_eq_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x44,0x7c]
+v_cmpx_lt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x62,0x7c]
 
-v_cmp_eq_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x44,0x7c]
+v_cmpx_lt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x62,0x7c]
 
-v_cmp_eq_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x44,0x7c]
+v_cmpx_lt_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x62,0x7c]
 
-v_cmp_eq_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x44,0x7c]
+v_cmpx_lt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x62,0x7c]
 
-v_cmp_eq_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x44,0x7c]
+v_cmpx_lt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x62,0x7c]
 
-v_cmp_eq_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x44,0x7c]
+v_cmpx_lt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x62,0x7c]
 
-v_cmp_eq_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x44,0x7c]
+v_cmpx_lt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x62,0x7c]
 
-v_cmp_eq_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x44,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_lt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x62,0x7c]
 
-v_cmp_eq_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x44,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_lt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x62,0x7c]
 
-v_cmp_eq_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x44,0x7c]
+v_cmpx_lt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x62,0x7c]
 
-v_cmp_eq_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x44,0x7c]
+v_cmpx_lt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x62,0x7c]
 
-v_cmp_eq_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x45,0x7c]
+v_cmpx_lt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x62,0x7c]
 
-v_cmp_eq_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x62,0x7c]
 
-v_cmp_eq_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x44,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x62,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_eq_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x44,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x62,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_eq_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x44,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x62,0x7c]
 
-v_cmp_eq_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x44,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x62,0x7c]
 
-v_cmp_eq_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x44,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x63,0x7c]
 
-v_cmp_eq_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x44,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x44,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x62,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x44,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_lt_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x62,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x44,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_lt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x62,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x44,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_lt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x62,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_lt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x62,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x44,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_lt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x62,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_lt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x62,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_lt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_lt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_lt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_lt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_lt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x62,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_lt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x62,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_lt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x62,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_le_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x46,0x7c]
+v_cmpx_lt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x62,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_le_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x46,0x7c]
+v_cmpx_lt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_le_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x46,0x7c]
+v_cmpx_lt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_le_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x46,0x7c]
+v_cmpx_lt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_le_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x46,0x7c]
+v_cmpx_eq_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x64,0x7c]
 
-v_cmp_le_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x46,0x7c]
+v_cmpx_eq_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x64,0x7c]
 
-v_cmp_le_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x46,0x7c]
+v_cmpx_eq_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x64,0x7c]
 
-v_cmp_le_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x46,0x7c]
+v_cmpx_eq_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x64,0x7c]
 
-v_cmp_le_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x46,0x7c]
+v_cmpx_eq_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x64,0x7c]
 
-v_cmp_le_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x46,0x7c]
+v_cmpx_eq_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x64,0x7c]
 
-v_cmp_le_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x46,0x7c]
+v_cmpx_eq_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x64,0x7c]
 
-v_cmp_le_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x46,0x7c]
+v_cmpx_eq_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x64,0x7c]
 
-v_cmp_le_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x46,0x7c]
+v_cmpx_eq_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x64,0x7c]
 
-v_cmp_le_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x46,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_eq_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x64,0x7c]
 
-v_cmp_le_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x46,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_eq_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x64,0x7c]
 
-v_cmp_le_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x46,0x7c]
+v_cmpx_eq_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x64,0x7c]
 
-v_cmp_le_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x46,0x7c]
+v_cmpx_eq_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x64,0x7c]
 
-v_cmp_le_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x47,0x7c]
+v_cmpx_eq_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x64,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_le_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x64,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_le_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x46,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x64,0x7c]
 
-v_cmp_le_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x46,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x64,0x7c]
 
-v_cmp_le_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x46,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x65,0x7c]
 
-v_cmp_le_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x46,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x46,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x64,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x46,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x64,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x46,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x64,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x46,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_eq_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x64,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x46,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_eq_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x64,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x46,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_eq_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x64,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_eq_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x64,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x46,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_eq_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_eq_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_eq_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_le_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_eq_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_le_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_eq_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x64,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_le_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_eq_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x64,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_le_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_eq_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x64,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_le_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_eq_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x64,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_le_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_eq_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_gt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x48,0x7c]
+v_cmpx_eq_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_gt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x48,0x7c]
+v_cmpx_eq_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_gt_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x48,0x7c]
+v_cmpx_le_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x66,0x7c]
 
-v_cmp_gt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x48,0x7c]
+v_cmpx_le_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x66,0x7c]
 
-v_cmp_gt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x48,0x7c]
+v_cmpx_le_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x66,0x7c]
 
-v_cmp_gt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x48,0x7c]
+v_cmpx_le_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x66,0x7c]
 
-v_cmp_gt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x48,0x7c]
+v_cmpx_le_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x66,0x7c]
 
-v_cmp_gt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x48,0x7c]
+v_cmpx_le_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x66,0x7c]
 
-v_cmp_gt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x48,0x7c]
+v_cmpx_le_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x66,0x7c]
 
-v_cmp_gt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x48,0x7c]
+v_cmpx_le_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x66,0x7c]
 
-v_cmp_gt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x48,0x7c]
+v_cmpx_le_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x66,0x7c]
 
-v_cmp_gt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x48,0x7c]
+v_cmpx_le_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x66,0x7c]
 
-v_cmp_gt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x48,0x7c]
+v_cmpx_le_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x66,0x7c]
 
-v_cmp_gt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x48,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_le_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x66,0x7c]
 
-v_cmp_gt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x48,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_le_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x66,0x7c]
 
-v_cmp_gt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x48,0x7c]
+v_cmpx_le_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x66,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_gt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x48,0x7c]
+v_cmpx_le_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x66,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_gt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x49,0x7c]
+v_cmpx_le_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x66,0x7c]
 
-v_cmp_gt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x66,0x7c]
 
-v_cmp_gt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x48,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x67,0x7c]
 
-v_cmp_gt_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x48,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x48,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x66,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x48,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x66,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x48,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x66,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x48,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x66,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x48,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x66,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x48,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_le_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x66,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x48,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_le_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x66,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x48,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_le_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_le_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x48,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_le_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_le_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_le_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x66,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_le_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x66,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_le_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x66,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_le_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x66,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_le_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_gt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_le_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_gt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_le_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_lg_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x4a,0x7c]
+v_cmpx_gt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x68,0x7c]
 
-v_cmp_lg_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x4a,0x7c]
+v_cmpx_gt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x68,0x7c]
 
-v_cmp_lg_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x4a,0x7c]
+v_cmpx_gt_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x68,0x7c]
 
-v_cmp_lg_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x4a,0x7c]
+v_cmpx_gt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x68,0x7c]
 
-v_cmp_lg_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x4a,0x7c]
+v_cmpx_gt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x68,0x7c]
 
-v_cmp_lg_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x4a,0x7c]
+v_cmpx_gt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x68,0x7c]
 
-v_cmp_lg_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x4a,0x7c]
+v_cmpx_gt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x68,0x7c]
 
-v_cmp_lg_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x4a,0x7c]
+v_cmpx_gt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x68,0x7c]
 
-v_cmp_lg_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x4a,0x7c]
+v_cmpx_gt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x68,0x7c]
 
-v_cmp_lg_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x4a,0x7c]
+v_cmpx_gt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x68,0x7c]
 
-v_cmp_lg_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x4a,0x7c]
+v_cmpx_gt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x68,0x7c]
 
-v_cmp_lg_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x4a,0x7c]
+v_cmpx_gt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x68,0x7c]
 
-v_cmp_lg_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x4a,0x7c]
+v_cmpx_gt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x68,0x7c]
 
-v_cmp_lg_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x4a,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_gt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x68,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_lg_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x4a,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_gt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x68,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_lg_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x4a,0x7c]
+v_cmpx_gt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x68,0x7c]
 
-v_cmp_lg_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x4a,0x7c]
+v_cmpx_gt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x68,0x7c]
 
-v_cmp_lg_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x4b,0x7c]
+v_cmpx_gt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x69,0x7c]
 
-v_cmp_lg_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x4a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x68,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x4a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x68,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lg_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x4a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x68,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lg_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x4a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x68,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lg_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x4a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x68,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lg_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x4a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x68,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lg_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x4a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x68,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x4a,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_gt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x4a,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_gt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x4a,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_gt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_gt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x4a,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_gt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x68,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_gt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x68,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_gt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x68,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_gt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x68,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_gt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_lg_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_gt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_lg_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_gt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_lg_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_lg_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x6a,0x7c]
 
-v_cmp_lg_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_lg_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x6a,0x7c]
 
-v_cmp_ge_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x4c,0x7c]
+v_cmpx_lg_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x6a,0x7c]
 
-v_cmp_ge_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x4c,0x7c]
+v_cmpx_lg_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x6a,0x7c]
 
-v_cmp_ge_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x4c,0x7c]
+v_cmpx_lg_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x6a,0x7c]
 
-v_cmp_ge_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x4c,0x7c]
+v_cmpx_lg_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x6a,0x7c]
 
-v_cmp_ge_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x4c,0x7c]
+v_cmpx_lg_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x6a,0x7c]
 
-v_cmp_ge_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x4c,0x7c]
+v_cmpx_lg_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x6a,0x7c]
 
-v_cmp_ge_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x4c,0x7c]
+v_cmpx_lg_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x6a,0x7c]
 
-v_cmp_ge_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x4c,0x7c]
+v_cmpx_lg_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x6a,0x7c]
 
-v_cmp_ge_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x4c,0x7c]
+v_cmpx_lg_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x6a,0x7c]
 
-v_cmp_ge_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x4c,0x7c]
+v_cmpx_lg_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x6a,0x7c]
 
-v_cmp_ge_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x4c,0x7c]
+v_cmpx_lg_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x6a,0x7c]
 
-v_cmp_ge_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x4c,0x7c]
+v_cmpx_lg_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x6a,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_ge_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x4c,0x7c]
+v_cmpx_lg_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x6a,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_ge_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x4c,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_lg_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x6a,0x7c]
 
-v_cmp_ge_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x4c,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_lg_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x6a,0x7c]
 
-v_cmp_ge_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x4c,0x7c]
+v_cmpx_lg_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x6b,0x7c]
 
-v_cmp_ge_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x4c,0x7c]
+v_cmpx_lg_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x4d,0x7c]
+v_cmpx_lg_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x6a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x6a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x4c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x6a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x4c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x6a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x4c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x6a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x4c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x6a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x4c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x6a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x4c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_ge_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x4c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_ge_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x4c,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_ge_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x4c,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_ge_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x4c,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_ge_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_lg_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_ge_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x4c,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_lg_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_ge_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_lg_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_ge_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_lg_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_ge_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_lg_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_ge_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_lg_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_ge_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_ge_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x6c,0x7c]
 
-v_cmp_ge_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_ge_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x6c,0x7c]
 
-v_cmp_ge_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_ge_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x6c,0x7c]
 
-v_cmp_ge_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_ge_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x6c,0x7c]
 
-v_cmp_o_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x4e,0x7c]
+v_cmpx_ge_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x6c,0x7c]
 
-v_cmp_o_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x4e,0x7c]
+v_cmpx_ge_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x6c,0x7c]
 
-v_cmp_o_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x4e,0x7c]
+v_cmpx_ge_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x6c,0x7c]
 
-v_cmp_o_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x4e,0x7c]
+v_cmpx_ge_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x6c,0x7c]
 
-v_cmp_o_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x4e,0x7c]
+v_cmpx_ge_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x6c,0x7c]
 
-v_cmp_o_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x4e,0x7c]
+v_cmpx_ge_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x6c,0x7c]
 
-v_cmp_o_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x4e,0x7c]
+v_cmpx_ge_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x6c,0x7c]
 
-v_cmp_o_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x4e,0x7c]
+v_cmpx_ge_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x6c,0x7c]
 
-v_cmp_o_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x4e,0x7c]
+v_cmpx_ge_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x6c,0x7c]
 
-v_cmp_o_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x4e,0x7c]
+v_cmpx_ge_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x6c,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_o_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x4e,0x7c]
+v_cmpx_ge_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x6c,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_o_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x4e,0x7c]
+v_cmpx_ge_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x6c,0x7c]
 
-v_cmp_o_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x4e,0x7c]
+v_cmpx_ge_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x6c,0x7c]
 
-v_cmp_o_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x4e,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_ge_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x6d,0x7c]
 
-v_cmp_o_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x4e,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_ge_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_o_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x4e,0x7c]
+v_cmpx_ge_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x6c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_o_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x4e,0x7c]
+v_cmpx_ge_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x6c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_o_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x4f,0x7c]
+v_cmpx_ge_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x6c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_o_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x6c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_o_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x4e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x6c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_o_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x4e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x6c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_o_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x4e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x6c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_o_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x4e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_o_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x4e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_o_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x4e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_o_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x4e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_o_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x4e,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_o_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x4e,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_o_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x4e,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_o_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_o_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x4e,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_o_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ge_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_o_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_ge_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_o_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_o_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x6e,0x7c]
 
-v_cmp_o_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_o_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x6e,0x7c]
 
-v_cmp_o_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_o_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x6e,0x7c]
 
-v_cmp_o_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_o_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x6e,0x7c]
 
-v_cmp_o_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_o_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x6e,0x7c]
 
-v_cmp_o_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_o_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x6e,0x7c]
 
-v_cmp_u_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x50,0x7c]
+v_cmpx_o_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x6e,0x7c]
 
-v_cmp_u_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x50,0x7c]
+v_cmpx_o_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x6e,0x7c]
 
-v_cmp_u_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x50,0x7c]
+v_cmpx_o_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x6e,0x7c]
 
-v_cmp_u_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x50,0x7c]
+v_cmpx_o_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x6e,0x7c]
 
-v_cmp_u_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x50,0x7c]
+v_cmpx_o_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x6e,0x7c]
 
-v_cmp_u_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x50,0x7c]
+v_cmpx_o_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x6e,0x7c]
 
-v_cmp_u_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x50,0x7c]
+v_cmpx_o_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x6e,0x7c]
 
-v_cmp_u_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x50,0x7c]
+v_cmpx_o_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x6e,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_u_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x50,0x7c]
+v_cmpx_o_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x6e,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_u_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x50,0x7c]
+v_cmpx_o_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x6e,0x7c]
 
-v_cmp_u_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x50,0x7c]
+v_cmpx_o_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x6e,0x7c]
 
-v_cmp_u_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x50,0x7c]
+v_cmpx_o_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x6f,0x7c]
 
-v_cmp_u_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x50,0x7c]
+v_cmpx_o_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_u_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x50,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_o_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x6e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_u_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x50,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_o_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x6e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_u_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x50,0x7c]
+v_cmpx_o_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x6e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_u_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x50,0x7c]
+v_cmpx_o_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x6e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_u_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x51,0x7c]
+v_cmpx_o_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x6e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_u_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x6e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_u_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x50,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x6e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_u_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x50,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_u_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x50,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_u_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x50,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_u_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x50,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_u_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x50,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_u_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x50,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_u_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_o_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_u_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_o_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_u_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_o_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_u_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_o_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_u_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_o_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_u_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_u_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x70,0x7c]
 
-v_cmp_u_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_u_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x70,0x7c]
 
-v_cmp_u_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_u_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x70,0x7c]
 
-v_cmp_u_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_u_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x70,0x7c]
 
-v_cmp_u_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_u_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x70,0x7c]
 
-v_cmp_u_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_u_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x70,0x7c]
 
-v_cmp_u_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_u_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x70,0x7c]
 
-v_cmp_u_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_u_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x70,0x7c]
 
-v_cmp_nge_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x52,0x7c]
+v_cmpx_u_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x70,0x7c]
 
-v_cmp_nge_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x52,0x7c]
+v_cmpx_u_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x70,0x7c]
 
-v_cmp_nge_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x52,0x7c]
+v_cmpx_u_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x70,0x7c]
 
-v_cmp_nge_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x52,0x7c]
+v_cmpx_u_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x70,0x7c]
 
-v_cmp_nge_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x52,0x7c]
+v_cmpx_u_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x70,0x7c]
 
-v_cmp_nge_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x52,0x7c]
+v_cmpx_u_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x70,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_nge_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x52,0x7c]
+v_cmpx_u_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x70,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_nge_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x52,0x7c]
+v_cmpx_u_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x70,0x7c]
 
-v_cmp_nge_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x52,0x7c]
+v_cmpx_u_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x70,0x7c]
 
-v_cmp_nge_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x52,0x7c]
+v_cmpx_u_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x71,0x7c]
 
-v_cmp_nge_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x52,0x7c]
+v_cmpx_u_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x70,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nge_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x52,0x7c]
+v_cmpx_u_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x70,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nge_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x52,0x7c]
+v_cmpx_u_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x70,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nge_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x52,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_u_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x70,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nge_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x52,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_u_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x70,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nge_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x52,0x7c]
+v_cmpx_u_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x70,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nge_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x52,0x7c]
+v_cmpx_u_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x70,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nge_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x53,0x7c]
+v_cmpx_u_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x70,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nge_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x70,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_nge_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x52,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x70,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_nge_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x52,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x70,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_nge_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x52,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x70,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_nge_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x52,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x70,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_nge_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x52,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x70,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_nge_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x52,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x70,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_nge_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x52,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x70,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_nge_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_u_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x70,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_nge_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_u_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x70,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_nge_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_u_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x70,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_nge_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_nge_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x72,0x7c]
 
-v_cmp_nge_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_nge_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x72,0x7c]
 
-v_cmp_nge_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_nge_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x72,0x7c]
 
-v_cmp_nge_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_nge_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x72,0x7c]
 
-v_cmp_nge_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_nge_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x72,0x7c]
 
-v_cmp_nge_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_nge_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x72,0x7c]
 
-v_cmp_nge_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_nge_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x72,0x7c]
 
-v_cmp_nge_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_nge_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x72,0x7c]
 
-v_cmp_nge_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_nge_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x72,0x7c]
 
-v_cmp_nge_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_nge_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x72,0x7c]
 
-v_cmp_nlg_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x54,0x7c]
+v_cmpx_nge_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x72,0x7c]
 
-v_cmp_nlg_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x54,0x7c]
+v_cmpx_nge_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x72,0x7c]
 
-v_cmp_nlg_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x54,0x7c]
+v_cmpx_nge_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x72,0x7c]
 
-v_cmp_nlg_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x54,0x7c]
+v_cmpx_nge_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x72,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_nlg_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x54,0x7c]
+v_cmpx_nge_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x72,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_nlg_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x54,0x7c]
+v_cmpx_nge_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x72,0x7c]
 
-v_cmp_nlg_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x54,0x7c]
+v_cmpx_nge_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x72,0x7c]
 
-v_cmp_nlg_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x54,0x7c]
+v_cmpx_nge_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x73,0x7c]
 
-v_cmp_nlg_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x54,0x7c]
+v_cmpx_nge_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x72,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x54,0x7c]
+v_cmpx_nge_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x72,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x54,0x7c]
+v_cmpx_nge_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x72,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x54,0x7c]
+v_cmpx_nge_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x72,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x54,0x7c]
+v_cmpx_nge_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x72,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x54,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_nge_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x72,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x54,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_nge_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x72,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x54,0x7c]
+v_cmpx_nge_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x72,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x54,0x7c]
+v_cmpx_nge_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x72,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x55,0x7c]
+v_cmpx_nge_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x72,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_nlg_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x72,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_nlg_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x54,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x72,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_nlg_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x54,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x72,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_nlg_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x54,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x72,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_nlg_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x54,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x72,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_nlg_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x54,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x72,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_nlg_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x54,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x72,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_nlg_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x54,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x72,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_nlg_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_nge_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x72,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_nlg_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_nlg_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x74,0x7c]
 
-v_cmp_nlg_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_nlg_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x74,0x7c]
 
-v_cmp_nlg_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_nlg_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x74,0x7c]
 
-v_cmp_nlg_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_nlg_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x74,0x7c]
 
-v_cmp_nlg_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_nlg_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x74,0x7c]
 
-v_cmp_nlg_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_nlg_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x74,0x7c]
 
-v_cmp_nlg_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_nlg_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x74,0x7c]
 
-v_cmp_nlg_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_nlg_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x74,0x7c]
 
-v_cmp_nlg_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_nlg_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x74,0x7c]
 
-v_cmp_nlg_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_nlg_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x74,0x7c]
 
-v_cmp_nlg_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_nlg_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x74,0x7c]
 
-v_cmp_nlg_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_nlg_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x74,0x7c]
 
-v_cmp_ngt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x56,0x7c]
+v_cmpx_nlg_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x74,0x7c]
 
-v_cmp_ngt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x56,0x7c]
+v_cmpx_nlg_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x74,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_ngt_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x56,0x7c]
+v_cmpx_nlg_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x74,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_ngt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x56,0x7c]
+v_cmpx_nlg_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x74,0x7c]
 
-v_cmp_ngt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x56,0x7c]
+v_cmpx_nlg_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x74,0x7c]
 
-v_cmp_ngt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x56,0x7c]
+v_cmpx_nlg_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x75,0x7c]
 
-v_cmp_ngt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x56,0x7c]
+v_cmpx_nlg_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x74,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x56,0x7c]
+v_cmpx_nlg_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x74,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x56,0x7c]
+v_cmpx_nlg_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x74,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x56,0x7c]
+v_cmpx_nlg_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x74,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x56,0x7c]
+v_cmpx_nlg_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x74,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x56,0x7c]
+v_cmpx_nlg_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x74,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x56,0x7c]
+v_cmpx_nlg_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x74,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x56,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_nlg_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x74,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x56,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_nlg_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x74,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x56,0x7c]
+v_cmpx_nlg_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x74,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x56,0x7c]
+v_cmpx_nlg_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x74,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x57,0x7c]
+v_cmpx_nlg_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x74,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_ngt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x74,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_ngt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x56,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x74,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_ngt_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x56,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x74,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_ngt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x56,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x74,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_ngt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x56,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x74,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_ngt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x56,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x74,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_ngt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x56,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x74,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_ngt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x56,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x76,0x7c]
 
-v_cmp_ngt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x56,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ngt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x76,0x7c]
 
-v_cmp_ngt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x56,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_ngt_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x76,0x7c]
 
-v_cmp_ngt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x56,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_ngt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x76,0x7c]
 
-v_cmp_ngt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ngt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x76,0x7c]
 
-v_cmp_ngt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x56,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_ngt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x76,0x7c]
 
-v_cmp_ngt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ngt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x76,0x7c]
 
-v_cmp_ngt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_ngt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x76,0x7c]
 
-v_cmp_ngt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_ngt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x76,0x7c]
 
-v_cmp_ngt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ngt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x76,0x7c]
 
-v_cmp_ngt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_ngt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x76,0x7c]
 
-v_cmp_ngt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_ngt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x76,0x7c]
 
-v_cmp_ngt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_ngt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x76,0x7c]
 
-v_cmp_ngt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_ngt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x76,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_nle_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x58,0x7c]
+v_cmpx_ngt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x76,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_nle_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x58,0x7c]
+v_cmpx_ngt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x76,0x7c]
 
-v_cmp_nle_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x58,0x7c]
+v_cmpx_ngt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x76,0x7c]
 
-v_cmp_nle_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x58,0x7c]
+v_cmpx_ngt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x77,0x7c]
 
-v_cmp_nle_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x58,0x7c]
+v_cmpx_ngt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x76,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nle_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x58,0x7c]
+v_cmpx_ngt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x76,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nle_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x58,0x7c]
+v_cmpx_ngt_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x76,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nle_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x58,0x7c]
+v_cmpx_ngt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x76,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nle_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x58,0x7c]
+v_cmpx_ngt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x76,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nle_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x58,0x7c]
+v_cmpx_ngt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x76,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nle_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x58,0x7c]
+v_cmpx_ngt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x76,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nle_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x58,0x7c]
+v_cmpx_ngt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x76,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nle_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x58,0x7c]
+v_cmpx_ngt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x76,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_nle_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x58,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_ngt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x76,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_nle_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x58,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_ngt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x76,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_nle_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x58,0x7c]
+v_cmpx_ngt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x76,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_nle_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x58,0x7c]
+v_cmpx_ngt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x76,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_nle_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x59,0x7c]
+v_cmpx_ngt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x76,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_nle_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x76,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_nle_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x58,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x76,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_nle_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x58,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x76,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_nle_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x58,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x76,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_nle_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x58,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x76,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_nle_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x58,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x78,0x7c]
 
-v_cmp_nle_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x58,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x78,0x7c]
 
-v_cmp_nle_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x58,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x78,0x7c]
 
-v_cmp_nle_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x58,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_nle_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x78,0x7c]
 
-v_cmp_nle_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x58,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_nle_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x78,0x7c]
 
-v_cmp_nle_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x58,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_nle_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x78,0x7c]
 
-v_cmp_nle_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_nle_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x78,0x7c]
 
-v_cmp_nle_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x58,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_nle_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x78,0x7c]
 
-v_cmp_nle_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_nle_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x78,0x7c]
 
-v_cmp_nle_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_nle_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x78,0x7c]
 
-v_cmp_nle_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_nle_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x78,0x7c]
 
-v_cmp_nle_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_nle_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x78,0x7c]
 
-v_cmp_nle_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_nle_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x78,0x7c]
 
-v_cmp_nle_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_nle_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x78,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_nle_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_nle_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x78,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_nle_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_nle_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x78,0x7c]
 
-v_cmp_neq_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x5a,0x7c]
+v_cmpx_nle_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x78,0x7c]
 
-v_cmp_neq_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x5a,0x7c]
+v_cmpx_nle_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x79,0x7c]
 
-v_cmp_neq_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x5a,0x7c]
+v_cmpx_nle_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x78,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_neq_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x5a,0x7c]
+v_cmpx_nle_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x78,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_neq_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x5a,0x7c]
+v_cmpx_nle_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x78,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_neq_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x5a,0x7c]
+v_cmpx_nle_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x78,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_neq_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x5a,0x7c]
+v_cmpx_nle_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x78,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_neq_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x5a,0x7c]
+v_cmpx_nle_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x78,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_neq_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x5a,0x7c]
+v_cmpx_nle_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x78,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_neq_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x5a,0x7c]
+v_cmpx_nle_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x78,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_neq_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x5a,0x7c]
+v_cmpx_nle_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x78,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_neq_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x5a,0x7c]
+v_cmpx_nle_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x78,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_neq_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x5a,0x7c]
+v_cmpx_nle_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x78,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_neq_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x5a,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_nle_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x78,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_neq_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x5a,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_nle_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x78,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_neq_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x5a,0x7c]
+v_cmpx_nle_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x78,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_neq_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x5a,0x7c]
+v_cmpx_nle_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x78,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_neq_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x5b,0x7c]
+v_cmpx_nle_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x78,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_neq_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x78,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_neq_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x5a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x78,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_neq_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x5a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x78,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_neq_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x5a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x7a,0x7c]
 
-v_cmp_neq_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x5a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x7a,0x7c]
 
-v_cmp_neq_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x5a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x7a,0x7c]
 
-v_cmp_neq_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x5a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x7a,0x7c]
 
-v_cmp_neq_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x5a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x7a,0x7c]
 
-v_cmp_neq_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x5a,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_neq_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x7a,0x7c]
 
-v_cmp_neq_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x5a,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_neq_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x7a,0x7c]
 
-v_cmp_neq_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x5a,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_neq_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x7a,0x7c]
 
-v_cmp_neq_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_neq_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x7a,0x7c]
 
-v_cmp_neq_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x5a,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_neq_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x7a,0x7c]
 
-v_cmp_neq_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_neq_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x7a,0x7c]
 
-v_cmp_neq_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_neq_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x7a,0x7c]
 
-v_cmp_neq_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_neq_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x7a,0x7c]
 
-v_cmp_neq_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_neq_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x7a,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_neq_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_neq_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x7a,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_neq_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_neq_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x7a,0x7c]
 
-v_cmp_neq_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_neq_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x7a,0x7c]
 
-v_cmp_neq_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_neq_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x7b,0x7c]
 
-v_cmp_nlt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x5c,0x7c]
+v_cmpx_neq_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x5c,0x7c]
+v_cmpx_neq_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x7a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x5c,0x7c]
+v_cmpx_neq_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x7a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x5c,0x7c]
+v_cmpx_neq_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x7a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x5c,0x7c]
+v_cmpx_neq_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x7a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x5c,0x7c]
+v_cmpx_neq_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x7a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x5c,0x7c]
+v_cmpx_neq_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x7a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x5c,0x7c]
+v_cmpx_neq_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x7a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x5c,0x7c]
+v_cmpx_neq_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x5c,0x7c]
+v_cmpx_neq_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x5c,0x7c]
+v_cmpx_neq_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x5c,0x7c]
+v_cmpx_neq_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x5c,0x7c]
+v_cmpx_neq_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_nlt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x5c,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_neq_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_nlt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x5c,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_neq_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_nlt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x5c,0x7c]
+v_cmpx_neq_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_nlt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x5c,0x7c]
+v_cmpx_neq_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_nlt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x5d,0x7c]
+v_cmpx_neq_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_nlt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_nlt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x5c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x7c,0x7c]
 
-v_cmp_nlt_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x5c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x7c,0x7c]
 
-v_cmp_nlt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x5c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x7c,0x7c]
 
-v_cmp_nlt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x5c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x7c,0x7c]
 
-v_cmp_nlt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x5c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x7c,0x7c]
 
-v_cmp_nlt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x5c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x7c,0x7c]
 
-v_cmp_nlt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x5c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x7c,0x7c]
 
-v_cmp_nlt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x5c,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_nlt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x7c,0x7c]
 
-v_cmp_nlt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x5c,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_nlt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x7c,0x7c]
 
-v_cmp_nlt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x5c,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_nlt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x7c,0x7c]
 
-v_cmp_nlt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_nlt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x7c,0x7c]
 
-v_cmp_nlt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x5c,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_nlt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x7c,0x7c]
 
-v_cmp_nlt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_nlt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x7c,0x7c]
 
-v_cmp_nlt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_nlt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x7c,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_nlt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_nlt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x7c,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_nlt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_nlt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x7c,0x7c]
 
-v_cmp_nlt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_nlt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x7c,0x7c]
 
-v_cmp_nlt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_nlt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x7d,0x7c]
 
-v_cmp_nlt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_nlt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_nlt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x7c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x5e,0x7c]
+v_cmpx_nlt_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x7c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x5e,0x7c]
+v_cmpx_nlt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x7c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x5e,0x7c]
+v_cmpx_nlt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x7c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x5e,0x7c]
+v_cmpx_nlt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x7c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x5e,0x7c]
+v_cmpx_nlt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x7c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x5e,0x7c]
+v_cmpx_nlt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x7c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x5e,0x7c]
+v_cmpx_nlt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_tru_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x5e,0x7c]
+v_cmpx_nlt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_tru_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x5e,0x7c]
+v_cmpx_nlt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_tru_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x5e,0x7c]
+v_cmpx_nlt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_tru_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x5e,0x7c]
+v_cmpx_nlt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_tru_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x5e,0x7c]
+v_cmpx_nlt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_tru_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x5e,0x7c]
+v_cmpx_nlt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_tru_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x5e,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_nlt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_tru_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x5e,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_nlt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_tru_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x5e,0x7c]
+v_cmpx_nlt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_tru_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x5e,0x7c]
+v_cmpx_nlt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_tru_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x5f,0x7c]
+v_cmpx_tru_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x7e,0x7c]
 
-v_cmp_tru_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x7e,0x7c]
 
-v_cmp_tru_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x5e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x7e,0x7c]
 
-v_cmp_tru_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x5e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x7e,0x7c]
 
-v_cmp_tru_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x5e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x7e,0x7c]
 
-v_cmp_tru_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x5e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x7e,0x7c]
 
-v_cmp_tru_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x5e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x7e,0x7c]
 
-v_cmp_tru_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x5e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x7e,0x7c]
 
-v_cmp_tru_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x5e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x7e,0x7c]
 
-v_cmp_tru_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x5e,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_tru_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x7e,0x7c]
 
-v_cmp_tru_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x5e,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_tru_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x7e,0x7c]
 
-v_cmp_tru_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x5e,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_tru_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x7e,0x7c]
 
-v_cmp_tru_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_tru_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x7e,0x7c]
 
-v_cmp_tru_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x5e,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_tru_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x7e,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_tru_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_tru_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x7e,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_tru_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_tru_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x7e,0x7c]
 
-v_cmp_tru_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_tru_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x7e,0x7c]
 
-v_cmp_tru_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_tru_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x7f,0x7c]
 
-v_cmp_tru_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_tru_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_tru_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x7e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_tru_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x7e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_tru_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x7e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x60,0x7c]
+v_cmpx_tru_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x7e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x60,0x7c]
+v_cmpx_tru_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x7e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x60,0x7c]
+v_cmpx_tru_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x7e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x60,0x7c]
+v_cmpx_tru_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x7e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x60,0x7c]
+v_cmpx_tru_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_f_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x60,0x7c]
+v_cmpx_tru_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_f_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x60,0x7c]
+v_cmpx_tru_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_f_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x60,0x7c]
+v_cmpx_tru_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_f_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x60,0x7c]
+v_cmpx_tru_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_f_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x60,0x7c]
+v_cmpx_tru_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_f_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x60,0x7c]
+v_cmpx_tru_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_f_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x60,0x7c]
+v_cmpx_tru_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_f_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x60,0x7c]
+v_cmpx_tru_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_f_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x60,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_tru_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_f_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x60,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_tru_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_f_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x60,0x7c]
+v_cmps_f_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x80,0x7c]
 
-v_cmpx_f_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x60,0x7c]
+v_cmps_f_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x80,0x7c]
 
-v_cmpx_f_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x61,0x7c]
+v_cmps_f_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x80,0x7c]
 
-v_cmpx_f_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_f_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x80,0x7c]
 
-v_cmpx_f_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x60,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_f_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x80,0x7c]
 
-v_cmpx_f_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x60,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_f_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x80,0x7c]
 
-v_cmpx_f_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x60,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_f_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x80,0x7c]
 
-v_cmpx_f_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x60,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_f_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x80,0x7c]
 
-v_cmpx_f_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x60,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_f_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x80,0x7c]
 
-v_cmpx_f_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x60,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_f_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x80,0x7c]
 
-v_cmpx_f_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x60,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_f_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x80,0x7c]
 
-v_cmpx_f_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_f_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x80,0x7c]
 
-v_cmpx_f_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_f_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x80,0x7c]
 
-v_cmpx_f_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_f_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x80,0x7c]
 
-v_cmpx_f_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_f_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x80,0x7c]
 
-v_cmpx_f_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_f_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x80,0x7c]
 
-v_cmpx_f_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_f_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x80,0x7c]
 
-v_cmpx_f_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_f_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x80,0x7c]
 
-v_cmpx_f_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_f_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x80,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_f_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_f_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x80,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_f_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_f_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x80,0x7c]
 
-v_cmpx_f_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_f_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x80,0x7c]
 
-v_cmpx_f_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_f_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x81,0x7c]
 
-v_cmpx_f_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_f_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x62,0x7c]
+v_cmps_f_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x80,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x62,0x7c]
+v_cmps_f_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x80,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x62,0x7c]
+v_cmps_f_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x80,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x62,0x7c]
+v_cmps_f_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x80,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x62,0x7c]
+v_cmps_f_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x80,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x62,0x7c]
+v_cmps_f_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x80,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x62,0x7c]
+v_cmps_f_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x80,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x62,0x7c]
+v_cmps_f_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x80,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x62,0x7c]
+v_cmps_f_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x80,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x62,0x7c]
+v_cmps_f_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x80,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x62,0x7c]
+v_cmps_f_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x62,0x7c]
+v_cmps_f_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x62,0x7c]
+v_cmps_f_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x62,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_f_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x62,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_f_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x62,0x7c]
+v_cmps_f_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x62,0x7c]
+v_cmps_f_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x63,0x7c]
+v_cmps_f_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_lt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_f_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_lt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x62,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_f_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_lt_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x62,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_f_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_lt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x62,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_f_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_lt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x62,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_f_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_lt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x62,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_f_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_lt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x62,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_f_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_lt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x62,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_f_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_lt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_f_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_lt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_f_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_lt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_f_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x80,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_lt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_lt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x82,0x7c]
 
-v_cmpx_lt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_lt_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x82,0x7c]
 
-v_cmpx_lt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_lt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x82,0x7c]
 
-v_cmpx_lt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_lt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x82,0x7c]
 
-v_cmpx_lt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_lt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x82,0x7c]
 
-v_cmpx_lt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_lt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x82,0x7c]
 
-v_cmpx_lt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_lt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x82,0x7c]
 
-v_cmpx_lt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_lt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x82,0x7c]
 
-v_cmpx_lt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_lt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x82,0x7c]
 
-v_cmpx_lt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_lt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x82,0x7c]
 
-v_cmpx_eq_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x64,0x7c]
+v_cmps_lt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x82,0x7c]
 
-v_cmpx_eq_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x64,0x7c]
+v_cmps_lt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x82,0x7c]
 
-v_cmpx_eq_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x64,0x7c]
+v_cmps_lt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x82,0x7c]
 
-v_cmpx_eq_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x64,0x7c]
+v_cmps_lt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x82,0x7c]
 
-v_cmpx_eq_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x64,0x7c]
+v_cmps_lt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x82,0x7c]
 
-v_cmpx_eq_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x64,0x7c]
+v_cmps_lt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x82,0x7c]
 
-v_cmpx_eq_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x64,0x7c]
+v_cmps_lt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x82,0x7c]
 
-v_cmpx_eq_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x64,0x7c]
+v_cmps_lt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x82,0x7c]
 
-v_cmpx_eq_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x64,0x7c]
+v_cmps_lt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x82,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_eq_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x64,0x7c]
+v_cmps_lt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x82,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_eq_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x64,0x7c]
+v_cmps_lt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x82,0x7c]
 
-v_cmpx_eq_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x64,0x7c]
+v_cmps_lt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x82,0x7c]
 
-v_cmpx_eq_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x64,0x7c]
+v_cmps_lt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x83,0x7c]
 
-v_cmpx_eq_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x64,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_lt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x64,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_lt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x82,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x64,0x7c]
+v_cmps_lt_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x82,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x64,0x7c]
+v_cmps_lt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x82,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x65,0x7c]
+v_cmps_lt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x82,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x82,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x64,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x82,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x64,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x82,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x64,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x82,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_eq_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x64,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x82,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_eq_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x64,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x82,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_eq_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x64,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lt_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_eq_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x64,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpx_eq_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_lt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpx_eq_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_lt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_eq_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_lt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_eq_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_lt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_eq_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_lt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_eq_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_lt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_eq_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_lt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_eq_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_lt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_eq_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_lt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_eq_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_lt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_eq_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_lt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_eq_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_lt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_eq_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_lt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_le_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x66,0x7c]
+v_cmps_lt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_le_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x66,0x7c]
+v_cmps_lt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_le_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x66,0x7c]
+v_cmps_lt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_le_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x66,0x7c]
+v_cmps_lt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x82,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_le_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x66,0x7c]
+v_cmps_eq_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x84,0x7c]
 
-v_cmpx_le_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x66,0x7c]
+v_cmps_eq_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x84,0x7c]
 
-v_cmpx_le_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x66,0x7c]
+v_cmps_eq_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x84,0x7c]
 
-v_cmpx_le_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x66,0x7c]
+v_cmps_eq_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x84,0x7c]
 
-v_cmpx_le_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x66,0x7c]
+v_cmps_eq_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x84,0x7c]
 
-v_cmpx_le_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x66,0x7c]
+v_cmps_eq_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x84,0x7c]
 
-v_cmpx_le_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x66,0x7c]
+v_cmps_eq_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x84,0x7c]
 
-v_cmpx_le_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x66,0x7c]
+v_cmps_eq_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x84,0x7c]
 
-v_cmpx_le_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x66,0x7c]
+v_cmps_eq_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x84,0x7c]
 
-v_cmpx_le_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x66,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_eq_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x84,0x7c]
 
-v_cmpx_le_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x66,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_eq_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x84,0x7c]
 
-v_cmpx_le_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x66,0x7c]
+v_cmps_eq_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x84,0x7c]
 
-v_cmpx_le_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x66,0x7c]
+v_cmps_eq_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x84,0x7c]
 
-v_cmpx_le_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x67,0x7c]
+v_cmps_eq_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x84,0x7c]
 
-v_cmpx_le_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_eq_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x84,0x7c]
 
-v_cmpx_le_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x66,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_eq_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x84,0x7c]
 
-v_cmpx_le_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x66,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_eq_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x84,0x7c]
 
-v_cmpx_le_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x66,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_eq_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x84,0x7c]
 
-v_cmpx_le_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x66,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_eq_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x84,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_le_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x66,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_eq_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x84,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_le_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x66,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_eq_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x84,0x7c]
 
-v_cmpx_le_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x66,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_eq_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x84,0x7c]
 
-v_cmpx_le_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_eq_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x85,0x7c]
 
-v_cmpx_le_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_eq_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_eq_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x84,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_eq_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x84,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_eq_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x84,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_eq_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x84,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_eq_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x84,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_eq_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x84,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_eq_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x84,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_eq_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x84,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_eq_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x84,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_eq_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x84,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_eq_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_gt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x68,0x7c]
+v_cmps_eq_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpx_gt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x68,0x7c]
+v_cmps_eq_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpx_gt_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x68,0x7c]
+v_cmps_eq_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_gt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x68,0x7c]
+v_cmps_eq_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_gt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x68,0x7c]
+v_cmps_eq_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_gt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x68,0x7c]
+v_cmps_eq_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_gt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x68,0x7c]
+v_cmps_eq_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_gt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x68,0x7c]
+v_cmps_eq_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_gt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x68,0x7c]
+v_cmps_eq_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_gt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x68,0x7c]
+v_cmps_eq_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_gt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x68,0x7c]
+v_cmps_eq_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_gt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x68,0x7c]
+v_cmps_eq_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_gt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x68,0x7c]
+v_cmps_eq_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_gt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x68,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_eq_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_gt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x68,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_eq_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_gt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x68,0x7c]
+v_cmps_eq_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_gt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x68,0x7c]
+v_cmps_eq_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_gt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x69,0x7c]
+v_cmps_eq_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x84,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_gt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x86,0x7c]
 
-v_cmpx_gt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x68,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x86,0x7c]
 
-v_cmpx_gt_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x68,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x86,0x7c]
 
-v_cmpx_gt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x68,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x86,0x7c]
 
-v_cmpx_gt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x68,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x86,0x7c]
 
-v_cmpx_gt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x68,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x86,0x7c]
 
-v_cmpx_gt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x68,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x86,0x7c]
 
-v_cmpx_gt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x68,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x86,0x7c]
 
-v_cmpx_gt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_le_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x86,0x7c]
 
-v_cmpx_gt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_le_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x86,0x7c]
 
-v_cmpx_gt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_le_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x86,0x7c]
 
-v_cmpx_gt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_le_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x86,0x7c]
 
-v_cmpx_gt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_le_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x86,0x7c]
 
-v_cmpx_gt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_le_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x86,0x7c]
 
-v_cmpx_gt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_le_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x86,0x7c]
 
-v_cmpx_gt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_le_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x86,0x7c]
 
-v_cmpx_gt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_le_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x86,0x7c]
 
-v_cmpx_gt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_le_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x86,0x7c]
 
-v_cmpx_gt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_le_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x86,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_gt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_le_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x86,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_gt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_le_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x86,0x7c]
 
-v_cmpx_lg_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x6a,0x7c]
+v_cmps_le_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x86,0x7c]
 
-v_cmpx_lg_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x6a,0x7c]
+v_cmps_le_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x87,0x7c]
 
-v_cmpx_lg_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x6a,0x7c]
+v_cmps_le_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x6a,0x7c]
+v_cmps_le_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x86,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x6a,0x7c]
+v_cmps_le_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x86,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x6a,0x7c]
+v_cmps_le_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x86,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x6a,0x7c]
+v_cmps_le_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x86,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x6a,0x7c]
+v_cmps_le_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x86,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x6a,0x7c]
+v_cmps_le_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x86,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x6a,0x7c]
+v_cmps_le_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x86,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x6a,0x7c]
+v_cmps_le_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x86,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x6a,0x7c]
+v_cmps_le_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x86,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x6a,0x7c]
+v_cmps_le_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x86,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x6a,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_le_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x6a,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_le_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x6a,0x7c]
+v_cmps_le_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x6a,0x7c]
+v_cmps_le_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x6b,0x7c]
+v_cmps_le_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_lg_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_lg_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x6a,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_lg_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x6a,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_lg_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x6a,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_lg_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x6a,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_lg_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x6a,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_lg_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x6a,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_lg_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x6a,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_lg_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_le_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_lg_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_le_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_lg_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_le_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_lg_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_le_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_lg_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_le_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_lg_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_le_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x86,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_lg_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_gt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x88,0x7c]
 
-v_cmpx_lg_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_gt_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x88,0x7c]
 
-v_cmpx_lg_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_gt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x88,0x7c]
 
-v_cmpx_lg_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_gt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x88,0x7c]
 
-v_cmpx_lg_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_gt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x88,0x7c]
 
-v_cmpx_lg_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_gt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x88,0x7c]
 
-v_cmpx_lg_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_gt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x88,0x7c]
 
-v_cmpx_ge_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x6c,0x7c]
+v_cmps_gt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x88,0x7c]
 
-v_cmpx_ge_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x6c,0x7c]
+v_cmps_gt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x88,0x7c]
 
-v_cmpx_ge_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x6c,0x7c]
+v_cmps_gt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x88,0x7c]
 
-v_cmpx_ge_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x6c,0x7c]
+v_cmps_gt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x88,0x7c]
 
-v_cmpx_ge_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x6c,0x7c]
+v_cmps_gt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x88,0x7c]
 
-v_cmpx_ge_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x6c,0x7c]
+v_cmps_gt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x88,0x7c]
 
-v_cmpx_ge_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x6c,0x7c]
+v_cmps_gt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x88,0x7c]
 
-v_cmpx_ge_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x6c,0x7c]
+v_cmps_gt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x88,0x7c]
 
-v_cmpx_ge_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x6c,0x7c]
+v_cmps_gt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x88,0x7c]
 
-v_cmpx_ge_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x6c,0x7c]
+v_cmps_gt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x88,0x7c]
 
-v_cmpx_ge_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x6c,0x7c]
+v_cmps_gt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x88,0x7c]
 
-v_cmpx_ge_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x6c,0x7c]
+v_cmps_gt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x88,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ge_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x6c,0x7c]
+v_cmps_gt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x88,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ge_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x6c,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_gt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x88,0x7c]
 
-v_cmpx_ge_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x6c,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_gt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x88,0x7c]
 
-v_cmpx_ge_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x6c,0x7c]
+v_cmps_gt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x89,0x7c]
 
-v_cmpx_ge_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x6c,0x7c]
+v_cmps_gt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x6d,0x7c]
+v_cmps_gt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x88,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_gt_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x88,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x6c,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_gt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x88,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x6c,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_gt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x88,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x6c,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_gt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x88,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x6c,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_gt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x88,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x6c,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_gt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x88,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x6c,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_gt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x88,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_ge_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x6c,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_gt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x88,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_gt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x88,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_gt_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_gt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_gt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_gt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_gt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_gt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_gt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_gt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_gt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_gt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_gt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_gt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_o_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x6e,0x7c]
+v_cmps_gt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_o_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x6e,0x7c]
+v_cmps_gt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_o_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x6e,0x7c]
+v_cmps_gt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_o_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x6e,0x7c]
+v_cmps_gt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_o_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x6e,0x7c]
+v_cmps_gt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_o_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x6e,0x7c]
+v_cmps_gt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_o_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x6e,0x7c]
+v_cmps_gt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x88,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_o_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x6e,0x7c]
+v_cmps_lg_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x8a,0x7c]
 
-v_cmpx_o_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x6e,0x7c]
+v_cmps_lg_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x8a,0x7c]
 
-v_cmpx_o_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x6e,0x7c]
+v_cmps_lg_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x8a,0x7c]
 
-v_cmpx_o_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x6e,0x7c]
+v_cmps_lg_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x8a,0x7c]
 
-v_cmpx_o_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x6e,0x7c]
+v_cmps_lg_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x8a,0x7c]
 
-v_cmpx_o_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x6e,0x7c]
+v_cmps_lg_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x8a,0x7c]
 
-v_cmpx_o_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x6e,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_lg_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x8a,0x7c]
 
-v_cmpx_o_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x6e,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_lg_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x8a,0x7c]
 
-v_cmpx_o_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x6e,0x7c]
+v_cmps_lg_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x8a,0x7c]
 
-v_cmpx_o_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x6e,0x7c]
+v_cmps_lg_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x8a,0x7c]
 
-v_cmpx_o_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x6f,0x7c]
+v_cmps_lg_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x8a,0x7c]
 
-v_cmpx_o_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lg_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x8a,0x7c]
 
-v_cmpx_o_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x6e,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lg_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x8a,0x7c]
 
-v_cmpx_o_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x6e,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lg_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x8a,0x7c]
 
-v_cmpx_o_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x6e,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lg_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x8a,0x7c]
 
-v_cmpx_o_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x6e,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lg_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x8a,0x7c]
 
-v_cmpx_o_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x6e,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lg_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x8a,0x7c]
 
-v_cmpx_o_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x6e,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lg_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x8a,0x7c]
 
-v_cmpx_o_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x6e,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lg_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x8a,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_o_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_lg_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x8a,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_o_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_lg_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x8a,0x7c]
 
-v_cmpx_o_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_lg_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x8a,0x7c]
 
-v_cmpx_o_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_lg_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x8b,0x7c]
 
-v_cmpx_o_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_lg_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_lg_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x8a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_lg_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x8a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_lg_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x8a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_lg_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x8a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_lg_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x8a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_lg_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x8a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_lg_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x8a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_lg_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x8a,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_u_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x70,0x7c]
+v_cmps_lg_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_u_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x70,0x7c]
+v_cmps_lg_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x8a,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_u_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x70,0x7c]
+v_cmps_lg_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_u_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x70,0x7c]
+v_cmps_lg_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpx_u_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x70,0x7c]
+v_cmps_lg_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpx_u_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x70,0x7c]
+v_cmps_lg_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_u_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x70,0x7c]
+v_cmps_lg_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_u_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x70,0x7c]
+v_cmps_lg_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_u_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x70,0x7c]
+v_cmps_lg_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_u_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x70,0x7c]
+v_cmps_lg_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_u_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x70,0x7c]
+v_cmps_lg_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_u_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x70,0x7c]
+v_cmps_lg_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_u_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x70,0x7c]
+v_cmps_lg_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_u_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x70,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_lg_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_u_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x70,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_lg_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_u_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x70,0x7c]
+v_cmps_lg_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_u_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x70,0x7c]
+v_cmps_lg_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_u_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x71,0x7c]
+v_cmps_lg_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_u_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lg_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_u_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x70,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lg_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_u_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x70,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lg_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x8a,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_u_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x70,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ge_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x8c,0x7c]
 
-v_cmpx_u_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x70,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ge_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x8c,0x7c]
 
-v_cmpx_u_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x70,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ge_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x8c,0x7c]
 
-v_cmpx_u_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x70,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ge_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x8c,0x7c]
 
-v_cmpx_u_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x70,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ge_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x8c,0x7c]
 
-v_cmpx_u_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_ge_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x8c,0x7c]
 
-v_cmpx_u_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_ge_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x8c,0x7c]
 
-v_cmpx_u_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_ge_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x8c,0x7c]
 
-v_cmpx_u_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_ge_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x8c,0x7c]
 
-v_cmpx_u_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_ge_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x8c,0x7c]
 
-v_cmpx_u_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_ge_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x8c,0x7c]
 
-v_cmpx_u_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_ge_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x8c,0x7c]
 
-v_cmpx_u_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_ge_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x8c,0x7c]
 
-v_cmpx_u_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_ge_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x8c,0x7c]
 
-v_cmpx_u_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_ge_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x8c,0x7c]
 
-v_cmpx_u_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_ge_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x8c,0x7c]
 
-v_cmpx_u_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_ge_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x8c,0x7c]
 
-v_cmpx_u_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_ge_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x8c,0x7c]
 
-v_cmpx_nge_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x72,0x7c]
+v_cmps_ge_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x8c,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_nge_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x72,0x7c]
+v_cmps_ge_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x8c,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_nge_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x72,0x7c]
+v_cmps_ge_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x8c,0x7c]
 
-v_cmpx_nge_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x72,0x7c]
+v_cmps_ge_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x8c,0x7c]
 
-v_cmpx_nge_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x72,0x7c]
+v_cmps_ge_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x8d,0x7c]
 
-v_cmpx_nge_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x72,0x7c]
+v_cmps_ge_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x72,0x7c]
+v_cmps_ge_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x8c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x72,0x7c]
+v_cmps_ge_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x8c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x72,0x7c]
+v_cmps_ge_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x8c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x72,0x7c]
+v_cmps_ge_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x8c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x72,0x7c]
+v_cmps_ge_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x8c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x72,0x7c]
+v_cmps_ge_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x8c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x72,0x7c]
+v_cmps_ge_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x8c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x72,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_ge_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x8c,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x72,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_ge_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x72,0x7c]
+v_cmps_ge_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x8c,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x72,0x7c]
+v_cmps_ge_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x73,0x7c]
+v_cmps_ge_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpx_nge_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ge_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpx_nge_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x72,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ge_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_nge_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x72,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ge_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_nge_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x72,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ge_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_nge_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x72,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ge_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_nge_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x72,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ge_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_nge_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x72,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ge_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_nge_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x72,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ge_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_nge_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_ge_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_nge_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_ge_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_nge_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_ge_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_nge_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_ge_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_nge_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_ge_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_nge_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_ge_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_nge_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_ge_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_nge_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_ge_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_nge_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_ge_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x8c,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_nge_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_o_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x8e,0x7c]
 
-v_cmpx_nge_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_o_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x8e,0x7c]
 
-v_cmpx_nge_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_o_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x8e,0x7c]
 
-v_cmpx_nge_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_o_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x8e,0x7c]
 
-v_cmpx_nlg_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x74,0x7c]
+v_cmps_o_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x8e,0x7c]
 
-v_cmpx_nlg_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x74,0x7c]
+v_cmps_o_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x8e,0x7c]
 
-v_cmpx_nlg_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x74,0x7c]
+v_cmps_o_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x8e,0x7c]
 
-v_cmpx_nlg_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x74,0x7c]
+v_cmps_o_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x8e,0x7c]
 
-v_cmpx_nlg_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x74,0x7c]
+v_cmps_o_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x8e,0x7c]
 
-v_cmpx_nlg_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x74,0x7c]
+v_cmps_o_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x8e,0x7c]
 
-v_cmpx_nlg_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x74,0x7c]
+v_cmps_o_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x8e,0x7c]
 
-v_cmpx_nlg_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x74,0x7c]
+v_cmps_o_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x8e,0x7c]
 
-v_cmpx_nlg_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x74,0x7c]
+v_cmps_o_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x8e,0x7c]
 
-v_cmpx_nlg_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x74,0x7c]
+v_cmps_o_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x8e,0x7c]
 
-v_cmpx_nlg_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x74,0x7c]
+v_cmps_o_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x8e,0x7c]
 
-v_cmpx_nlg_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x74,0x7c]
+v_cmps_o_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x8e,0x7c]
 
-v_cmpx_nlg_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x74,0x7c]
+v_cmps_o_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x8e,0x7c]
 
-v_cmpx_nlg_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x74,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_o_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x8e,0x7c]
 
-v_cmpx_nlg_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x74,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_o_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x8e,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_nlg_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x74,0x7c]
+v_cmps_o_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x8e,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_nlg_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x74,0x7c]
+v_cmps_o_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x8e,0x7c]
 
-v_cmpx_nlg_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x75,0x7c]
+v_cmps_o_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x8e,0x7c]
 
-v_cmpx_nlg_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_o_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x8f,0x7c]
 
-v_cmpx_nlg_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x74,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_o_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x74,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_o_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x8e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x74,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_o_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x8e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x74,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_o_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x8e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x74,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_o_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x8e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x74,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_o_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x8e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x74,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_o_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x8e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_o_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x8e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_o_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x8e,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_o_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_o_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x8e,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_o_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_o_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_o_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_o_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_o_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_o_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_o_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_o_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_o_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_ngt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x76,0x7c]
+v_cmps_o_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_ngt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x76,0x7c]
+v_cmps_o_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_ngt_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x76,0x7c]
+v_cmps_o_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_ngt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x76,0x7c]
+v_cmps_o_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_ngt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x76,0x7c]
+v_cmps_o_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_ngt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x76,0x7c]
+v_cmps_o_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_ngt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x76,0x7c]
+v_cmps_o_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_ngt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x76,0x7c]
+v_cmps_o_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_ngt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x76,0x7c]
+v_cmps_o_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_ngt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x76,0x7c]
+v_cmps_o_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x8e,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_ngt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x76,0x7c]
+v_cmps_u_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x90,0x7c]
 
-v_cmpx_ngt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x76,0x7c]
+v_cmps_u_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x90,0x7c]
 
-v_cmpx_ngt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x76,0x7c]
+v_cmps_u_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x90,0x7c]
 
-v_cmpx_ngt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x76,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_u_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x90,0x7c]
 
-v_cmpx_ngt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x76,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_u_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x90,0x7c]
 
-v_cmpx_ngt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x76,0x7c]
+v_cmps_u_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x90,0x7c]
 
-v_cmpx_ngt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x76,0x7c]
+v_cmps_u_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x90,0x7c]
 
-v_cmpx_ngt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x77,0x7c]
+v_cmps_u_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x90,0x7c]
 
-v_cmpx_ngt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x90,0x7c]
 
-v_cmpx_ngt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x76,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x90,0x7c]
 
-v_cmpx_ngt_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x76,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x90,0x7c]
 
-v_cmpx_ngt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x76,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x90,0x7c]
 
-v_cmpx_ngt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x76,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x90,0x7c]
 
-v_cmpx_ngt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x76,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x90,0x7c]
 
-v_cmpx_ngt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x76,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x90,0x7c]
 
-v_cmpx_ngt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x76,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x90,0x7c]
 
-v_cmpx_ngt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_u_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x90,0x7c]
 
-v_cmpx_ngt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_u_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x90,0x7c]
 
-v_cmpx_ngt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_u_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x90,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ngt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_u_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x90,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ngt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_u_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x90,0x7c]
 
-v_cmpx_ngt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_u_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x90,0x7c]
 
-v_cmpx_ngt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_u_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x91,0x7c]
 
-v_cmpx_ngt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_u_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_u_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x90,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_u_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x90,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_u_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x90,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_u_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x90,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_u_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x90,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x78,0x7c]
+v_cmps_u_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x90,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x78,0x7c]
+v_cmps_u_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x90,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x78,0x7c]
+v_cmps_u_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x90,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x78,0x7c]
+v_cmps_u_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x90,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x78,0x7c]
+v_cmps_u_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x90,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x78,0x7c]
+v_cmps_u_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x78,0x7c]
+v_cmps_u_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x78,0x7c]
+v_cmps_u_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x78,0x7c]
+v_cmps_u_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x78,0x7c]
+v_cmps_u_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x78,0x7c]
+v_cmps_u_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x78,0x7c]
+v_cmps_u_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x78,0x7c]
+v_cmps_u_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x78,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_u_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x78,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_u_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x78,0x7c]
+v_cmps_u_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x78,0x7c]
+v_cmps_u_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x79,0x7c]
+v_cmps_u_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_nle_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_nle_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x78,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_nle_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x78,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_nle_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x78,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_nle_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x78,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_nle_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x78,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x90,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_nle_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x78,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nge_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x92,0x7c]
 
-v_cmpx_nle_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x78,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nge_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x92,0x7c]
 
-v_cmpx_nle_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_nge_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x92,0x7c]
 
-v_cmpx_nle_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_nge_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x92,0x7c]
 
-v_cmpx_nle_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_nge_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x92,0x7c]
 
-v_cmpx_nle_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_nge_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x92,0x7c]
 
-v_cmpx_nle_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_nge_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x92,0x7c]
 
-v_cmpx_nle_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_nge_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x92,0x7c]
 
-v_cmpx_nle_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_nge_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x92,0x7c]
 
-v_cmpx_nle_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_nge_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x92,0x7c]
 
-v_cmpx_nle_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_nge_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x92,0x7c]
 
-v_cmpx_nle_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_nge_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x92,0x7c]
 
-v_cmpx_nle_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_nge_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x92,0x7c]
 
-v_cmpx_nle_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_nge_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x92,0x7c]
 
-v_cmpx_nle_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_nge_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x92,0x7c]
 
-v_cmpx_neq_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x7a,0x7c]
+v_cmps_nge_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x92,0x7c]
 
-v_cmpx_neq_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x7a,0x7c]
+v_cmps_nge_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x92,0x7c]
 
-v_cmpx_neq_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x7a,0x7c]
+v_cmps_nge_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x92,0x7c]
 
-v_cmpx_neq_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x7a,0x7c]
+v_cmps_nge_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x92,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_neq_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x7a,0x7c]
+v_cmps_nge_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x92,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_neq_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x7a,0x7c]
+v_cmps_nge_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x92,0x7c]
 
-v_cmpx_neq_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x7a,0x7c]
+v_cmps_nge_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x92,0x7c]
 
-v_cmpx_neq_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x7a,0x7c]
+v_cmps_nge_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x93,0x7c]
 
-v_cmpx_neq_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x7a,0x7c]
+v_cmps_nge_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x7a,0x7c]
+v_cmps_nge_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x92,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x7a,0x7c]
+v_cmps_nge_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x92,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x7a,0x7c]
+v_cmps_nge_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x92,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x7a,0x7c]
+v_cmps_nge_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x92,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x7a,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_nge_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x92,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x7a,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_nge_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x92,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x7a,0x7c]
+v_cmps_nge_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x92,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x7a,0x7c]
+v_cmps_nge_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x92,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x7b,0x7c]
+v_cmps_nge_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x92,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_neq_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nge_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x92,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_neq_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x7a,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nge_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_neq_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x7a,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nge_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpx_neq_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x7a,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nge_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpx_neq_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x7a,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nge_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_neq_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x7a,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nge_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_neq_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x7a,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nge_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_neq_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x7a,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nge_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_neq_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_nge_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_neq_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_nge_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_neq_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_nge_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_neq_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_nge_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_neq_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_nge_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_neq_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_nge_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_neq_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_nge_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_neq_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_nge_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_neq_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_nge_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_neq_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_nge_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_neq_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_nge_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_neq_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_nge_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x92,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_neq_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_nlg_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x94,0x7c]
 
-v_cmpx_nlt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x7c,0x7c]
+v_cmps_nlg_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x94,0x7c]
 
-v_cmpx_nlt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x7c,0x7c]
+v_cmps_nlg_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x94,0x7c]
 
-v_cmpx_nlt_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x7c,0x7c]
+v_cmps_nlg_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x94,0x7c]
 
-v_cmpx_nlt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x7c,0x7c]
+v_cmps_nlg_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x94,0x7c]
 
-v_cmpx_nlt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x7c,0x7c]
+v_cmps_nlg_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x94,0x7c]
 
-v_cmpx_nlt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x7c,0x7c]
+v_cmps_nlg_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x94,0x7c]
 
-v_cmpx_nlt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x7c,0x7c]
+v_cmps_nlg_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x94,0x7c]
 
-v_cmpx_nlt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x7c,0x7c]
+v_cmps_nlg_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x94,0x7c]
 
-v_cmpx_nlt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x7c,0x7c]
+v_cmps_nlg_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x94,0x7c]
 
-v_cmpx_nlt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x7c,0x7c]
+v_cmps_nlg_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x94,0x7c]
 
-v_cmpx_nlt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x7c,0x7c]
+v_cmps_nlg_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x94,0x7c]
 
-v_cmpx_nlt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x7c,0x7c]
+v_cmps_nlg_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x94,0x7c]
 
-v_cmpx_nlt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x7c,0x7c]
+v_cmps_nlg_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x94,0x7c]
 
-v_cmpx_nlt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x7c,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_nlg_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x94,0x7c]
 
-v_cmpx_nlt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x7c,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_nlg_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x94,0x7c]
 
-v_cmpx_nlt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x7c,0x7c]
+v_cmps_nlg_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x94,0x7c]
 
-v_cmpx_nlt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x7c,0x7c]
+v_cmps_nlg_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x94,0x7c]
 
-v_cmpx_nlt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x7d,0x7c]
+v_cmps_nlg_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x94,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_nlt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlg_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x94,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_nlt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x7c,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlg_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x94,0x7c]
 
-v_cmpx_nlt_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x7c,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlg_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x94,0x7c]
 
-v_cmpx_nlt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x7c,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlg_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x95,0x7c]
 
-v_cmpx_nlt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x7c,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlg_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x7c,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlg_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x94,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x7c,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlg_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x94,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x7c,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlg_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x94,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_nlg_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x94,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_nlg_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x94,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_nlg_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x94,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_nlg_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x94,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_nlg_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x94,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_nlg_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x94,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_nlg_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x94,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_nlg_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_nlg_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_nlg_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_nlg_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_nlg_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_nlg_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_tru_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x7e,0x7c]
+v_cmps_nlg_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_tru_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x7e,0x7c]
+v_cmps_nlg_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_tru_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x7e,0x7c]
+v_cmps_nlg_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_tru_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x7e,0x7c]
+v_cmps_nlg_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_tru_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x7e,0x7c]
+v_cmps_nlg_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_tru_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x7e,0x7c]
+v_cmps_nlg_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_tru_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x7e,0x7c]
+v_cmps_nlg_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_tru_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x7e,0x7c]
+v_cmps_nlg_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_tru_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x7e,0x7c]
+v_cmps_nlg_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_tru_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x7e,0x7c]
+v_cmps_nlg_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_tru_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x7e,0x7c]
+v_cmps_nlg_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_tru_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x7e,0x7c]
+v_cmps_nlg_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_tru_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x7e,0x7c]
+v_cmps_nlg_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x94,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_tru_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x7e,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_ngt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x96,0x7c]
 
-v_cmpx_tru_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x7e,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_ngt_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x96,0x7c]
 
-v_cmpx_tru_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x7e,0x7c]
+v_cmps_ngt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x96,0x7c]
 
-v_cmpx_tru_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x7e,0x7c]
+v_cmps_ngt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x96,0x7c]
 
-v_cmpx_tru_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x7f,0x7c]
+v_cmps_ngt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x96,0x7c]
 
-v_cmpx_tru_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x96,0x7c]
 
-v_cmpx_tru_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x7e,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x96,0x7c]
 
-v_cmpx_tru_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x7e,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x96,0x7c]
 
-v_cmpx_tru_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x7e,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x96,0x7c]
 
-v_cmpx_tru_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x7e,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x96,0x7c]
 
-v_cmpx_tru_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x7e,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x96,0x7c]
 
-v_cmpx_tru_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x7e,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x96,0x7c]
 
-v_cmpx_tru_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x7e,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x96,0x7c]
 
-v_cmpx_tru_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_ngt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x96,0x7c]
 
-v_cmpx_tru_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_ngt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x96,0x7c]
 
-v_cmpx_tru_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_ngt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x96,0x7c]
 
-v_cmpx_tru_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_ngt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x96,0x7c]
 
-v_cmpx_tru_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_ngt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x96,0x7c]
 
-v_cmpx_tru_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_ngt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x96,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_tru_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_ngt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x96,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_tru_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_ngt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x96,0x7c]
 
-v_cmpx_tru_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_ngt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x96,0x7c]
 
-v_cmpx_tru_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_ngt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x97,0x7c]
 
-v_cmpx_tru_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_ngt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_tru_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_ngt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x96,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_tru_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_ngt_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x96,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_f_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x80,0x7c]
+v_cmps_ngt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x96,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_f_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x80,0x7c]
+v_cmps_ngt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x96,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_f_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x80,0x7c]
+v_cmps_ngt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x96,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_f_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x80,0x7c]
+v_cmps_ngt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x96,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_f_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x80,0x7c]
+v_cmps_ngt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x96,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_f_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x80,0x7c]
+v_cmps_ngt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x96,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmps_f_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x80,0x7c]
+v_cmps_ngt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x96,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmps_f_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x80,0x7c]
+v_cmps_ngt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x96,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmps_f_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x80,0x7c]
+v_cmps_ngt_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmps_f_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x80,0x7c]
+v_cmps_ngt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmps_f_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x80,0x7c]
+v_cmps_ngt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmps_f_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x80,0x7c]
+v_cmps_ngt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmps_f_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x80,0x7c]
+v_cmps_ngt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmps_f_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x80,0x7c]
+v_cmps_ngt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmps_f_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x80,0x7c]
+v_cmps_ngt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmps_f_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x80,0x7c]
+v_cmps_ngt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmps_f_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x80,0x7c]
+v_cmps_ngt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmps_f_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x80,0x7c]
+v_cmps_ngt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmps_f_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x80,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_ngt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmps_f_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x80,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_ngt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmps_f_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x80,0x7c]
+v_cmps_ngt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmps_f_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x80,0x7c]
+v_cmps_ngt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmps_f_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x81,0x7c]
+v_cmps_ngt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmps_f_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x80,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmps_f_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x80,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmps_f_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x80,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmps_f_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x80,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x96,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmps_f_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x80,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x98,0x7c]
 
-v_cmps_f_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x80,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x98,0x7c]
 
-v_cmps_f_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x80,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x98,0x7c]
 
-v_cmps_f_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x80,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x98,0x7c]
 
-v_cmps_f_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x80,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_nle_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x98,0x7c]
 
-v_cmps_f_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x80,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_nle_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x98,0x7c]
 
-v_cmps_f_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x80,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_nle_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x98,0x7c]
 
-v_cmps_f_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x80,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_nle_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x98,0x7c]
 
-v_cmps_f_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x80,0xd0,0xff,0x01,0x00,0x00]
+v_cmps_nle_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x98,0x7c]
 
-v_cmps_f_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x80,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_nle_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x98,0x7c]
 
-v_cmps_f_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x80,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_nle_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x98,0x7c]
 
-v_cmps_f_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x80,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_nle_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x98,0x7c]
 
-v_cmps_f_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x80,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_nle_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x98,0x7c]
 
-v_cmps_f_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x80,0xd0,0x00,0xfe,0x03,0x00]
+v_cmps_nle_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x98,0x7c]
 
-v_cmps_f_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x80,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_nle_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x98,0x7c]
 
-v_cmps_f_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x80,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_nle_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x98,0x7c]
 
-v_cmps_f_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x80,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_nle_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x98,0x7c]
 
-v_cmps_lt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x82,0x7c]
+v_cmps_nle_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x98,0x7c]
 
-v_cmps_lt_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x82,0x7c]
+v_cmps_nle_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x98,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_lt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x82,0x7c]
+v_cmps_nle_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x98,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_lt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x82,0x7c]
+v_cmps_nle_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x98,0x7c]
 
-v_cmps_lt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x82,0x7c]
+v_cmps_nle_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x98,0x7c]
 
-v_cmps_lt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x82,0x7c]
+v_cmps_nle_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x99,0x7c]
 
-v_cmps_lt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x82,0x7c]
+v_cmps_nle_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_lt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x82,0x7c]
+v_cmps_nle_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x98,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_lt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x82,0x7c]
+v_cmps_nle_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x98,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_lt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x82,0x7c]
+v_cmps_nle_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x98,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_lt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x82,0x7c]
+v_cmps_nle_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x98,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_lt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x82,0x7c]
+v_cmps_nle_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x98,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_lt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x82,0x7c]
+v_cmps_nle_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x98,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_lt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x82,0x7c]
+v_cmps_nle_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x98,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_lt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x82,0x7c]
+v_cmps_nle_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x98,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmps_lt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x82,0x7c]
+v_cmps_nle_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x98,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmps_lt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x82,0x7c]
+v_cmps_nle_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x98,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmps_lt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x82,0x7c]
+v_cmps_nle_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmps_lt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x82,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_nle_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmps_lt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x82,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_nle_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmps_lt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x82,0x7c]
+v_cmps_nle_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmps_lt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x82,0x7c]
+v_cmps_nle_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmps_lt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x83,0x7c]
+v_cmps_nle_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmps_lt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x82,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmps_lt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x82,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmps_lt_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x82,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmps_lt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x82,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmps_lt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x82,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmps_lt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x82,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmps_lt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x82,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmps_lt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x82,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmps_lt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x82,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_nle_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmps_lt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x82,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_nle_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmps_lt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x82,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_nle_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmps_lt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x82,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_nle_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmps_lt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x82,0xd0,0xff,0x01,0x00,0x00]
+v_cmps_nle_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x98,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmps_lt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x82,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_neq_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x9a,0x7c]
 
-v_cmps_lt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x82,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_neq_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x9a,0x7c]
 
-v_cmps_lt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x82,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_neq_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x9a,0x7c]
 
-v_cmps_lt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x82,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_neq_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x9a,0x7c]
 
-v_cmps_lt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x82,0xd0,0x00,0xfe,0x03,0x00]
+v_cmps_neq_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x9a,0x7c]
 
-v_cmps_lt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x82,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_neq_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x9a,0x7c]
 
-v_cmps_lt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x82,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_neq_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x9a,0x7c]
 
-v_cmps_lt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x82,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_neq_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x9a,0x7c]
 
-v_cmps_eq_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x84,0x7c]
+v_cmps_neq_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x9a,0x7c]
 
-v_cmps_eq_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x84,0x7c]
+v_cmps_neq_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x9a,0x7c]
 
-v_cmps_eq_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x84,0x7c]
+v_cmps_neq_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x9a,0x7c]
 
-v_cmps_eq_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x84,0x7c]
+v_cmps_neq_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x9a,0x7c]
 
-v_cmps_eq_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x84,0x7c]
+v_cmps_neq_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x9a,0x7c]
 
-v_cmps_eq_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x84,0x7c]
+v_cmps_neq_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x9a,0x7c]
 
-v_cmps_eq_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x84,0x7c]
+v_cmps_neq_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x9a,0x7c]
 
-v_cmps_eq_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x84,0x7c]
+v_cmps_neq_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x9a,0x7c]
 
-v_cmps_eq_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x84,0x7c]
+v_cmps_neq_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x9a,0x7c]
 
-v_cmps_eq_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x84,0x7c]
+v_cmps_neq_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x9a,0x7c]
 
-v_cmps_eq_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x84,0x7c]
+v_cmps_neq_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x9a,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_eq_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x84,0x7c]
+v_cmps_neq_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x9a,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_eq_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x84,0x7c]
+v_cmps_neq_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x9a,0x7c]
 
-v_cmps_eq_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x84,0x7c]
+v_cmps_neq_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x9a,0x7c]
 
-v_cmps_eq_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x84,0x7c]
+v_cmps_neq_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x9b,0x7c]
 
-v_cmps_eq_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x84,0x7c]
+v_cmps_neq_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_eq_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x84,0x7c]
+v_cmps_neq_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x9a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_eq_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x84,0x7c]
+v_cmps_neq_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x9a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_eq_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x84,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_neq_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x9a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_eq_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x84,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_neq_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x9a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_eq_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x84,0x7c]
+v_cmps_neq_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x9a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_eq_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x84,0x7c]
+v_cmps_neq_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x9a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_eq_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x85,0x7c]
+v_cmps_neq_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x9a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_eq_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x84,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_neq_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x9a,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmps_eq_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x84,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_neq_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmps_eq_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x84,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_neq_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x9a,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmps_eq_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x84,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_neq_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmps_eq_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x84,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_neq_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmps_eq_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x84,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_neq_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmps_eq_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x84,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_neq_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmps_eq_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x84,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_neq_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmps_eq_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x84,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_neq_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmps_eq_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x84,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_neq_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmps_eq_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x84,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_neq_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmps_eq_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x84,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_neq_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmps_eq_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x84,0xd0,0xff,0x01,0x00,0x00]
+v_cmps_neq_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmps_eq_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x84,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_neq_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmps_eq_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x84,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_neq_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmps_eq_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x84,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_neq_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmps_eq_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x84,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_neq_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmps_eq_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x84,0xd0,0x00,0xfe,0x03,0x00]
+v_cmps_neq_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmps_eq_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x84,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_neq_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmps_eq_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x84,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_neq_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmps_eq_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x84,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_neq_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmps_le_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x86,0x7c]
+v_cmps_neq_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x9a,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmps_le_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x86,0x7c]
+v_cmps_nlt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x9c,0x7c]
 
-v_cmps_le_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x86,0x7c]
+v_cmps_nlt_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x9c,0x7c]
 
-v_cmps_le_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x86,0x7c]
+v_cmps_nlt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x9c,0x7c]
 
-v_cmps_le_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x86,0x7c]
+v_cmps_nlt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x9c,0x7c]
 
-v_cmps_le_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x86,0x7c]
+v_cmps_nlt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x9c,0x7c]
 
-v_cmps_le_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x86,0x7c]
+v_cmps_nlt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x9c,0x7c]
 
-v_cmps_le_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x86,0x7c]
+v_cmps_nlt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x9c,0x7c]
 
-v_cmps_le_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x86,0x7c]
+v_cmps_nlt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x9c,0x7c]
 
-v_cmps_le_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x86,0x7c]
+v_cmps_nlt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x9c,0x7c]
 
-v_cmps_le_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x86,0x7c]
+v_cmps_nlt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x9c,0x7c]
 
-v_cmps_le_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x86,0x7c]
+v_cmps_nlt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x9c,0x7c]
 
-v_cmps_le_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x86,0x7c]
+v_cmps_nlt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x9c,0x7c]
 
-v_cmps_le_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x86,0x7c]
+v_cmps_nlt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x9c,0x7c]
 
-v_cmps_le_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x86,0x7c]
+v_cmps_nlt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x9c,0x7c]
 
-v_cmps_le_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x86,0x7c]
+v_cmps_nlt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x9c,0x7c]
 
-v_cmps_le_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x86,0x7c]
+v_cmps_nlt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x9c,0x7c]
 
-v_cmps_le_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x86,0x7c]
+v_cmps_nlt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x9c,0x7c]
 
-v_cmps_le_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x86,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_nlt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x9c,0x7c]
 
-v_cmps_le_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x86,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_nlt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x9c,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_le_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x86,0x7c]
+v_cmps_nlt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x9c,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_le_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x86,0x7c]
+v_cmps_nlt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x9c,0x7c]
 
-v_cmps_le_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x87,0x7c]
+v_cmps_nlt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x9c,0x7c]
 
-v_cmps_le_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x86,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x9d,0x7c]
 
-v_cmps_le_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x86,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_le_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x86,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x9c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_le_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x86,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlt_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x9c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_le_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x86,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x9c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_le_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x86,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x9c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_le_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x86,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x9c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_le_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x86,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x9c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_le_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x86,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_nlt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x9c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_le_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x86,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_nlt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x9c,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmps_le_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x86,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_nlt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmps_le_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x86,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_nlt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x9c,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmps_le_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x86,0xd0,0xff,0x01,0x00,0x00]
+v_cmps_nlt_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmps_le_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x86,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_nlt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmps_le_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x86,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_nlt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmps_le_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x86,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_nlt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmps_le_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x86,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_nlt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmps_le_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x86,0xd0,0x00,0xfe,0x03,0x00]
+v_cmps_nlt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmps_le_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x86,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_nlt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmps_le_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x86,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_nlt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmps_le_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x86,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_nlt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmps_gt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x88,0x7c]
+v_cmps_nlt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmps_gt_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x88,0x7c]
+v_cmps_nlt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmps_gt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x88,0x7c]
+v_cmps_nlt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmps_gt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x88,0x7c]
+v_cmps_nlt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmps_gt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x88,0x7c]
+v_cmps_nlt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmps_gt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x88,0x7c]
+v_cmps_nlt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmps_gt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x88,0x7c]
+v_cmps_nlt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmps_gt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x88,0x7c]
+v_cmps_nlt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmps_gt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x88,0x7c]
+v_cmps_nlt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmps_gt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x88,0x7c]
+v_cmps_nlt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x9c,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmps_gt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x88,0x7c]
+v_cmps_tru_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x9e,0x7c]
 
-v_cmps_gt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x88,0x7c]
+v_cmps_tru_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x9e,0x7c]
 
-v_cmps_gt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x88,0x7c]
+v_cmps_tru_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x9e,0x7c]
 
-v_cmps_gt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x88,0x7c]
+v_cmps_tru_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x9e,0x7c]
 
-v_cmps_gt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x88,0x7c]
+v_cmps_tru_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x9e,0x7c]
 
-v_cmps_gt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x88,0x7c]
+v_cmps_tru_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x9e,0x7c]
 
-v_cmps_gt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x88,0x7c]
+v_cmps_tru_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x9e,0x7c]
 
-v_cmps_gt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x88,0x7c]
+v_cmps_tru_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x9e,0x7c]
 
-v_cmps_gt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x88,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_tru_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x9e,0x7c]
 
-v_cmps_gt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x88,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_tru_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x9e,0x7c]
 
-v_cmps_gt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x88,0x7c]
+v_cmps_tru_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x9e,0x7c]
 
-v_cmps_gt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x88,0x7c]
+v_cmps_tru_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x9e,0x7c]
 
-v_cmps_gt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x89,0x7c]
+v_cmps_tru_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x9e,0x7c]
 
-v_cmps_gt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x88,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_tru_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x9e,0x7c]
 
-v_cmps_gt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x88,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_tru_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x9e,0x7c]
 
-v_cmps_gt_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x88,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_tru_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x9e,0x7c]
 
-v_cmps_gt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x88,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_tru_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x9e,0x7c]
 
-v_cmps_gt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x88,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_tru_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x9e,0x7c]
 
-v_cmps_gt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x88,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_tru_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x9e,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_gt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x88,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_tru_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x9e,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_gt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x88,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_tru_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x9e,0x7c]
 
-v_cmps_gt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x88,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_tru_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x9e,0x7c]
 
-v_cmps_gt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x88,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_tru_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x9f,0x7c]
 
-v_cmps_gt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x88,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_tru_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_gt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x88,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_tru_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x9e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_gt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x88,0xd0,0xff,0x01,0x00,0x00]
+v_cmps_tru_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x9e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_gt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x88,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_tru_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x9e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_gt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x88,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_tru_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x9e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_gt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x88,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_tru_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x9e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_gt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x88,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_tru_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x9e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_gt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x88,0xd0,0x00,0xfe,0x03,0x00]
+v_cmps_tru_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x9e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_gt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x88,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_tru_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x9e,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmps_gt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x88,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_tru_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmps_gt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x88,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_tru_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x9e,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmps_lg_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x8a,0x7c]
+v_cmps_tru_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmps_lg_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x8a,0x7c]
+v_cmps_tru_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmps_lg_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x8a,0x7c]
+v_cmps_tru_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmps_lg_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x8a,0x7c]
+v_cmps_tru_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmps_lg_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x8a,0x7c]
+v_cmps_tru_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmps_lg_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x8a,0x7c]
+v_cmps_tru_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmps_lg_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x8a,0x7c]
+v_cmps_tru_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmps_lg_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x8a,0x7c]
+v_cmps_tru_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmps_lg_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x8a,0x7c]
+v_cmps_tru_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmps_lg_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x8a,0x7c]
+v_cmps_tru_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmps_lg_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x8a,0x7c]
+v_cmps_tru_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmps_lg_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x8a,0x7c]
+v_cmps_tru_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmps_lg_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x8a,0x7c]
+v_cmps_tru_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmps_lg_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x8a,0x7c]
+v_cmps_tru_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmps_lg_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x8a,0x7c]
+v_cmps_tru_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmps_lg_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x8a,0x7c]
+v_cmps_tru_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmps_lg_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x8a,0x7c]
+v_cmps_tru_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmps_lg_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x8a,0x7c]
+v_cmps_tru_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmps_lg_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x8a,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_tru_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x9e,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmps_lg_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x8a,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_f_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa0,0x7c]
 
-v_cmps_lg_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x8a,0x7c]
+v_cmpsx_f_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xa0,0x7c]
 
-v_cmps_lg_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x8a,0x7c]
+v_cmpsx_f_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xa0,0x7c]
 
-v_cmps_lg_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x8b,0x7c]
+v_cmpsx_f_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xa0,0x7c]
 
-v_cmps_lg_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x8a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa0,0x7c]
 
-v_cmps_lg_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x8a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa0,0x7c]
 
-v_cmps_lg_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x8a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa0,0x7c]
 
-v_cmps_lg_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x8a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa0,0x7c]
 
-v_cmps_lg_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x8a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa0,0x7c]
 
-v_cmps_lg_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x8a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa0,0x7c]
 
-v_cmps_lg_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x8a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa0,0x7c]
 
-v_cmps_lg_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x8a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa0,0x7c]
 
-v_cmps_lg_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x8a,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_f_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa0,0x7c]
 
-v_cmps_lg_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x8a,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_f_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa0,0x7c]
 
-v_cmps_lg_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x8a,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_f_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa0,0x7c]
 
-v_cmps_lg_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x8a,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_f_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa0,0x7c]
 
-v_cmps_lg_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x8a,0xd0,0xff,0x01,0x00,0x00]
+v_cmpsx_f_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa0,0x7c]
 
-v_cmps_lg_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x8a,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_f_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa0,0x7c]
 
-v_cmps_lg_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x8a,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_f_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa0,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_lg_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x8a,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_f_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa0,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_lg_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x8a,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_f_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa0,0x7c]
 
-v_cmps_lg_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x8a,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpsx_f_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa0,0x7c]
 
-v_cmps_lg_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x8a,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_f_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa1,0x7c]
 
-v_cmps_lg_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x8a,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_f_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_lg_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x8a,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_f_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_ge_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x8c,0x7c]
+v_cmpsx_f_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xa0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_ge_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x8c,0x7c]
+v_cmpsx_f_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xa0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_ge_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x8c,0x7c]
+v_cmpsx_f_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_ge_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x8c,0x7c]
+v_cmpsx_f_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_ge_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x8c,0x7c]
+v_cmpsx_f_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_ge_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x8c,0x7c]
+v_cmpsx_f_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_ge_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x8c,0x7c]
+v_cmpsx_f_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa0,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmps_ge_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x8c,0x7c]
+v_cmpsx_f_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmps_ge_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x8c,0x7c]
+v_cmpsx_f_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa0,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmps_ge_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x8c,0x7c]
+v_cmpsx_f_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmps_ge_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x8c,0x7c]
+v_cmpsx_f_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmps_ge_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x8c,0x7c]
+v_cmpsx_f_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmps_ge_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x8c,0x7c]
+v_cmpsx_f_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmps_ge_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x8c,0x7c]
+v_cmpsx_f_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmps_ge_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x8c,0x7c]
+v_cmpsx_f_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmps_ge_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x8c,0x7c]
+v_cmpsx_f_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmps_ge_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x8c,0x7c]
+v_cmpsx_f_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmps_ge_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x8c,0x7c]
+v_cmpsx_f_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmps_ge_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x8c,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_f_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmps_ge_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x8c,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_f_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmps_ge_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x8c,0x7c]
+v_cmpsx_f_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmps_ge_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x8c,0x7c]
+v_cmpsx_f_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmps_ge_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x8d,0x7c]
+v_cmpsx_f_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmps_ge_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x8c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmps_ge_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x8c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmps_ge_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x8c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmps_ge_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x8c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmps_ge_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x8c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmps_ge_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x8c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa2,0x7c]
 
-v_cmps_ge_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x8c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lt_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xa2,0x7c]
 
-v_cmps_ge_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x8c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xa2,0x7c]
 
-v_cmps_ge_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x8c,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_lt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xa2,0x7c]
 
-v_cmps_ge_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x8c,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_lt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa2,0x7c]
 
-v_cmps_ge_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x8c,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_lt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa2,0x7c]
 
-v_cmps_ge_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x8c,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_lt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa2,0x7c]
 
-v_cmps_ge_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x8c,0xd0,0xff,0x01,0x00,0x00]
+v_cmpsx_lt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa2,0x7c]
 
-v_cmps_ge_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x8c,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_lt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa2,0x7c]
 
-v_cmps_ge_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x8c,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_lt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa2,0x7c]
 
-v_cmps_ge_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x8c,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_lt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa2,0x7c]
 
-v_cmps_ge_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x8c,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_lt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa2,0x7c]
 
-v_cmps_ge_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x8c,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpsx_lt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa2,0x7c]
 
-v_cmps_ge_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x8c,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_lt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa2,0x7c]
 
-v_cmps_ge_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x8c,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_lt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa2,0x7c]
 
-v_cmps_ge_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x8c,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_lt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa2,0x7c]
 
-v_cmps_o_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x8e,0x7c]
+v_cmpsx_lt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa2,0x7c]
 
-v_cmps_o_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x8e,0x7c]
+v_cmpsx_lt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa2,0x7c]
 
-v_cmps_o_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x8e,0x7c]
+v_cmpsx_lt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa2,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_o_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x8e,0x7c]
+v_cmpsx_lt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa2,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_o_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x8e,0x7c]
+v_cmpsx_lt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa2,0x7c]
 
-v_cmps_o_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x8e,0x7c]
+v_cmpsx_lt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa2,0x7c]
 
-v_cmps_o_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x8e,0x7c]
+v_cmpsx_lt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa3,0x7c]
 
-v_cmps_o_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x8e,0x7c]
+v_cmpsx_lt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_o_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x8e,0x7c]
+v_cmpsx_lt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_o_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x8e,0x7c]
+v_cmpsx_lt_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xa2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_o_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x8e,0x7c]
+v_cmpsx_lt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xa2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_o_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x8e,0x7c]
+v_cmpsx_lt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_o_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x8e,0x7c]
+v_cmpsx_lt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_o_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x8e,0x7c]
+v_cmpsx_lt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_o_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x8e,0x7c]
+v_cmpsx_lt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_o_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x8e,0x7c]
+v_cmpsx_lt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa2,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmps_o_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x8e,0x7c]
+v_cmpsx_lt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmps_o_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x8e,0x7c]
+v_cmpsx_lt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa2,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmps_o_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x8e,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_lt_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmps_o_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x8e,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_lt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmps_o_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x8e,0x7c]
+v_cmpsx_lt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmps_o_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x8e,0x7c]
+v_cmpsx_lt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmps_o_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x8f,0x7c]
+v_cmpsx_lt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmps_o_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x8e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmps_o_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x8e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmps_o_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x8e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmps_o_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x8e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmps_o_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x8e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmps_o_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x8e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmps_o_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x8e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmps_o_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x8e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmps_o_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x8e,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_lt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmps_o_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x8e,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_lt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmps_o_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x8e,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_lt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmps_o_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x8e,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_lt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmps_o_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x8e,0xd0,0xff,0x01,0x00,0x00]
+v_cmpsx_lt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmps_o_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x8e,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_lt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmps_o_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x8e,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_eq_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa4,0x7c]
 
-v_cmps_o_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x8e,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_eq_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xa4,0x7c]
 
-v_cmps_o_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x8e,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_eq_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xa4,0x7c]
 
-v_cmps_o_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x8e,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpsx_eq_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xa4,0x7c]
 
-v_cmps_o_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x8e,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_eq_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa4,0x7c]
 
-v_cmps_o_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x8e,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_eq_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa4,0x7c]
 
-v_cmps_o_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x8e,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_eq_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa4,0x7c]
 
-v_cmps_u_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x90,0x7c]
+v_cmpsx_eq_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa4,0x7c]
 
-v_cmps_u_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x90,0x7c]
+v_cmpsx_eq_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa4,0x7c]
 
-v_cmps_u_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x90,0x7c]
+v_cmpsx_eq_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa4,0x7c]
 
-v_cmps_u_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x90,0x7c]
+v_cmpsx_eq_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa4,0x7c]
 
-v_cmps_u_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x90,0x7c]
+v_cmpsx_eq_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa4,0x7c]
 
-v_cmps_u_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x90,0x7c]
+v_cmpsx_eq_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa4,0x7c]
 
-v_cmps_u_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x90,0x7c]
+v_cmpsx_eq_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa4,0x7c]
 
-v_cmps_u_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x90,0x7c]
+v_cmpsx_eq_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa4,0x7c]
 
-v_cmps_u_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x90,0x7c]
+v_cmpsx_eq_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa4,0x7c]
 
-v_cmps_u_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x90,0x7c]
+v_cmpsx_eq_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa4,0x7c]
 
-v_cmps_u_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x90,0x7c]
+v_cmpsx_eq_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa4,0x7c]
 
-v_cmps_u_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x90,0x7c]
+v_cmpsx_eq_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa4,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_u_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x90,0x7c]
+v_cmpsx_eq_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa4,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_u_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x90,0x7c]
+v_cmpsx_eq_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa4,0x7c]
 
-v_cmps_u_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x90,0x7c]
+v_cmpsx_eq_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa4,0x7c]
 
-v_cmps_u_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x90,0x7c]
+v_cmpsx_eq_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa5,0x7c]
 
-v_cmps_u_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x90,0x7c]
+v_cmpsx_eq_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_u_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x90,0x7c]
+v_cmpsx_eq_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_u_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x90,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_eq_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xa4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_u_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x90,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_eq_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xa4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_u_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x90,0x7c]
+v_cmpsx_eq_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_u_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x90,0x7c]
+v_cmpsx_eq_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_u_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x91,0x7c]
+v_cmpsx_eq_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_u_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x90,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_eq_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_u_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x90,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_eq_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa4,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmps_u_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x90,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_eq_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmps_u_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x90,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_eq_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa4,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmps_u_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x90,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_eq_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmps_u_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x90,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_eq_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmps_u_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x90,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_eq_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmps_u_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x90,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_eq_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmps_u_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x90,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_eq_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmps_u_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x90,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_eq_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmps_u_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x90,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_eq_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmps_u_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x90,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_eq_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmps_u_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x90,0xd0,0xff,0x01,0x00,0x00]
+v_cmpsx_eq_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmps_u_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x90,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_eq_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmps_u_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x90,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_eq_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmps_u_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x90,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_eq_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmps_u_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x90,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_eq_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmps_u_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x90,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpsx_eq_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmps_u_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x90,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_eq_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmps_u_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x90,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_eq_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmps_u_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x90,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_eq_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmps_nge_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x92,0x7c]
+v_cmpsx_eq_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmps_nge_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x92,0x7c]
+v_cmpsx_eq_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmps_nge_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x92,0x7c]
+v_cmpsx_le_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa6,0x7c]
 
-v_cmps_nge_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x92,0x7c]
+v_cmpsx_le_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xa6,0x7c]
 
-v_cmps_nge_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x92,0x7c]
+v_cmpsx_le_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xa6,0x7c]
 
-v_cmps_nge_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x92,0x7c]
+v_cmpsx_le_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xa6,0x7c]
 
-v_cmps_nge_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x92,0x7c]
+v_cmpsx_le_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa6,0x7c]
 
-v_cmps_nge_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x92,0x7c]
+v_cmpsx_le_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa6,0x7c]
 
-v_cmps_nge_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x92,0x7c]
+v_cmpsx_le_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa6,0x7c]
 
-v_cmps_nge_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x92,0x7c]
+v_cmpsx_le_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa6,0x7c]
 
-v_cmps_nge_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x92,0x7c]
+v_cmpsx_le_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa6,0x7c]
 
-v_cmps_nge_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x92,0x7c]
+v_cmpsx_le_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa6,0x7c]
 
-v_cmps_nge_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x92,0x7c]
+v_cmpsx_le_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa6,0x7c]
 
-v_cmps_nge_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x92,0x7c]
+v_cmpsx_le_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa6,0x7c]
 
-v_cmps_nge_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x92,0x7c]
+v_cmpsx_le_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa6,0x7c]
 
-v_cmps_nge_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x92,0x7c]
+v_cmpsx_le_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa6,0x7c]
 
-v_cmps_nge_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x92,0x7c]
+v_cmpsx_le_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa6,0x7c]
 
-v_cmps_nge_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x92,0x7c]
+v_cmpsx_le_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa6,0x7c]
 
-v_cmps_nge_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x92,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_le_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa6,0x7c]
 
-v_cmps_nge_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x92,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_le_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa6,0x7c]
 
-v_cmps_nge_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x92,0x7c]
+v_cmpsx_le_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa6,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_nge_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x92,0x7c]
+v_cmpsx_le_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa6,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_nge_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x93,0x7c]
+v_cmpsx_le_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa6,0x7c]
 
-v_cmps_nge_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x92,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_le_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa6,0x7c]
 
-v_cmps_nge_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x92,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_le_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa7,0x7c]
 
-v_cmps_nge_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x92,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_le_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nge_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x92,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_le_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nge_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x92,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_le_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xa6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nge_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x92,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_le_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xa6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nge_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x92,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_le_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nge_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x92,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_le_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nge_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x92,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_le_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nge_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x92,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_le_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nge_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x92,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_le_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa6,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmps_nge_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x92,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_le_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmps_nge_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x92,0xd0,0xff,0x01,0x00,0x00]
+v_cmpsx_le_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa6,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmps_nge_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x92,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_le_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmps_nge_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x92,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_le_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmps_nge_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x92,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_le_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmps_nge_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x92,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_le_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmps_nge_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x92,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpsx_le_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmps_nge_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x92,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_le_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmps_nge_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x92,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_le_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmps_nge_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x92,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_le_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmps_nlg_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x94,0x7c]
+v_cmpsx_le_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmps_nlg_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x94,0x7c]
+v_cmpsx_le_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmps_nlg_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x94,0x7c]
+v_cmpsx_le_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmps_nlg_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x94,0x7c]
+v_cmpsx_le_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmps_nlg_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x94,0x7c]
+v_cmpsx_le_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmps_nlg_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x94,0x7c]
+v_cmpsx_le_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmps_nlg_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x94,0x7c]
+v_cmpsx_le_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmps_nlg_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x94,0x7c]
+v_cmpsx_le_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmps_nlg_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x94,0x7c]
+v_cmpsx_le_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmps_nlg_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x94,0x7c]
+v_cmpsx_le_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmps_nlg_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x94,0x7c]
+v_cmpsx_le_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmps_nlg_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x94,0x7c]
+v_cmpsx_gt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa8,0x7c]
 
-v_cmps_nlg_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x94,0x7c]
+v_cmpsx_gt_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xa8,0x7c]
 
-v_cmps_nlg_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x94,0x7c]
+v_cmpsx_gt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xa8,0x7c]
 
-v_cmps_nlg_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x94,0x7c]
+v_cmpsx_gt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xa8,0x7c]
 
-v_cmps_nlg_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x94,0x7c]
+v_cmpsx_gt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa8,0x7c]
 
-v_cmps_nlg_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x94,0x7c]
+v_cmpsx_gt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa8,0x7c]
 
-v_cmps_nlg_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x94,0x7c]
+v_cmpsx_gt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa8,0x7c]
 
-v_cmps_nlg_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x94,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_gt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa8,0x7c]
 
-v_cmps_nlg_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x94,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_gt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa8,0x7c]
 
-v_cmps_nlg_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x94,0x7c]
+v_cmpsx_gt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa8,0x7c]
 
-v_cmps_nlg_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x94,0x7c]
+v_cmpsx_gt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa8,0x7c]
 
-v_cmps_nlg_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x95,0x7c]
+v_cmpsx_gt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa8,0x7c]
 
-v_cmps_nlg_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x94,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_gt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa8,0x7c]
 
-v_cmps_nlg_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x94,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_gt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa8,0x7c]
 
-v_cmps_nlg_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x94,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_gt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa8,0x7c]
 
-v_cmps_nlg_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x94,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_gt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa8,0x7c]
 
-v_cmps_nlg_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x94,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_gt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa8,0x7c]
 
-v_cmps_nlg_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x94,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_gt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa8,0x7c]
 
-v_cmps_nlg_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x94,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_gt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa8,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_nlg_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x94,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_gt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa8,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_nlg_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x94,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_gt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa8,0x7c]
 
-v_cmps_nlg_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x94,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_gt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa8,0x7c]
 
-v_cmps_nlg_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x94,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_gt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa9,0x7c]
 
-v_cmps_nlg_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x94,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_gt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nlg_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x94,0xd0,0xff,0x01,0x00,0x00]
+v_cmpsx_gt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nlg_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x94,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_gt_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xa8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nlg_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x94,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_gt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xa8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nlg_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x94,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_gt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nlg_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x94,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_gt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nlg_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x94,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpsx_gt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nlg_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x94,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_gt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nlg_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x94,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_gt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa8,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmps_nlg_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x94,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_gt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmps_ngt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x96,0x7c]
+v_cmpsx_gt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa8,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmps_ngt_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x96,0x7c]
+v_cmpsx_gt_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmps_ngt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x96,0x7c]
+v_cmpsx_gt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmps_ngt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x96,0x7c]
+v_cmpsx_gt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmps_ngt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x96,0x7c]
+v_cmpsx_gt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmps_ngt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x96,0x7c]
+v_cmpsx_gt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmps_ngt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x96,0x7c]
+v_cmpsx_gt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmps_ngt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x96,0x7c]
+v_cmpsx_gt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmps_ngt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x96,0x7c]
+v_cmpsx_gt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmps_ngt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x96,0x7c]
+v_cmpsx_gt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmps_ngt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x96,0x7c]
+v_cmpsx_gt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmps_ngt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x96,0x7c]
+v_cmpsx_gt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmps_ngt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x96,0x7c]
+v_cmpsx_gt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmps_ngt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x96,0x7c]
+v_cmpsx_gt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmps_ngt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x96,0x7c]
+v_cmpsx_gt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmps_ngt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x96,0x7c]
+v_cmpsx_gt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmps_ngt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x96,0x7c]
+v_cmpsx_gt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmps_ngt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x96,0x7c]
+v_cmpsx_gt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmps_ngt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x96,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_gt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmps_ngt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x96,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_gt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmps_ngt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x96,0x7c]
+v_cmpsx_lg_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xaa,0x7c]
 
-v_cmps_ngt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x96,0x7c]
+v_cmpsx_lg_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xaa,0x7c]
 
-v_cmps_ngt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x97,0x7c]
+v_cmpsx_lg_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xaa,0x7c]
 
-v_cmps_ngt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x96,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xaa,0x7c]
 
-v_cmps_ngt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x96,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xaa,0x7c]
 
-v_cmps_ngt_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x96,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xaa,0x7c]
 
-v_cmps_ngt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x96,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xaa,0x7c]
 
-v_cmps_ngt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x96,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xaa,0x7c]
 
-v_cmps_ngt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x96,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xaa,0x7c]
 
-v_cmps_ngt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x96,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xaa,0x7c]
 
-v_cmps_ngt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x96,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xaa,0x7c]
 
-v_cmps_ngt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x96,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_lg_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xaa,0x7c]
 
-v_cmps_ngt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x96,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_lg_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xaa,0x7c]
 
-v_cmps_ngt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x96,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_lg_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xaa,0x7c]
 
-v_cmps_ngt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x96,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_lg_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xaa,0x7c]
 
-v_cmps_ngt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x96,0xd0,0xff,0x01,0x00,0x00]
+v_cmpsx_lg_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xaa,0x7c]
 
-v_cmps_ngt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x96,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_lg_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xaa,0x7c]
 
-v_cmps_ngt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x96,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_lg_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xaa,0x7c]
 
-v_cmps_ngt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x96,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_lg_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xaa,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_ngt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x96,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_lg_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xaa,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_ngt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x96,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpsx_lg_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xaa,0x7c]
 
-v_cmps_ngt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x96,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_lg_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xaa,0x7c]
 
-v_cmps_ngt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x96,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_lg_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xab,0x7c]
 
-v_cmps_ngt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x96,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_lg_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nle_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x98,0x7c]
+v_cmpsx_lg_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xaa,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nle_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x98,0x7c]
+v_cmpsx_lg_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xaa,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nle_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x98,0x7c]
+v_cmpsx_lg_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xaa,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nle_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x98,0x7c]
+v_cmpsx_lg_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xaa,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nle_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x98,0x7c]
+v_cmpsx_lg_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xaa,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nle_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x98,0x7c]
+v_cmpsx_lg_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xaa,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nle_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x98,0x7c]
+v_cmpsx_lg_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xaa,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nle_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x98,0x7c]
+v_cmpsx_lg_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xaa,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmps_nle_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x98,0x7c]
+v_cmpsx_lg_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmps_nle_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x98,0x7c]
+v_cmpsx_lg_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xaa,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmps_nle_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x98,0x7c]
+v_cmpsx_lg_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmps_nle_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x98,0x7c]
+v_cmpsx_lg_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmps_nle_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x98,0x7c]
+v_cmpsx_lg_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmps_nle_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x98,0x7c]
+v_cmpsx_lg_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmps_nle_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x98,0x7c]
+v_cmpsx_lg_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmps_nle_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x98,0x7c]
+v_cmpsx_lg_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmps_nle_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x98,0x7c]
+v_cmpsx_lg_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmps_nle_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x98,0x7c]
+v_cmpsx_lg_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmps_nle_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x98,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_lg_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmps_nle_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x98,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_lg_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmps_nle_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x98,0x7c]
+v_cmpsx_lg_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmps_nle_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x98,0x7c]
+v_cmpsx_lg_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmps_nle_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x99,0x7c]
+v_cmpsx_lg_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmps_nle_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x98,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmps_nle_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x98,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmps_nle_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x98,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmps_nle_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x98,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmps_nle_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x98,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmps_nle_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x98,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmps_nle_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x98,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ge_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xac,0x7c]
 
-v_cmps_nle_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x98,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ge_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xac,0x7c]
 
-v_cmps_nle_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x98,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_ge_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xac,0x7c]
 
-v_cmps_nle_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x98,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_ge_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xac,0x7c]
 
-v_cmps_nle_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x98,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_ge_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xac,0x7c]
 
-v_cmps_nle_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x98,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_ge_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xac,0x7c]
 
-v_cmps_nle_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x98,0xd0,0xff,0x01,0x00,0x00]
+v_cmpsx_ge_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xac,0x7c]
 
-v_cmps_nle_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x98,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_ge_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xac,0x7c]
 
-v_cmps_nle_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x98,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_ge_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xac,0x7c]
 
-v_cmps_nle_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x98,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_ge_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xac,0x7c]
 
-v_cmps_nle_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x98,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_ge_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xac,0x7c]
 
-v_cmps_nle_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x98,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpsx_ge_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xac,0x7c]
 
-v_cmps_nle_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x98,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_ge_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xac,0x7c]
 
-v_cmps_nle_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x98,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_ge_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xac,0x7c]
 
-v_cmps_nle_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x98,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_ge_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xac,0x7c]
 
-v_cmps_neq_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x9a,0x7c]
+v_cmpsx_ge_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xac,0x7c]
 
-v_cmps_neq_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x9a,0x7c]
+v_cmpsx_ge_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xac,0x7c]
 
-v_cmps_neq_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x9a,0x7c]
+v_cmpsx_ge_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xac,0x7c]
 
-v_cmps_neq_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x9a,0x7c]
+v_cmpsx_ge_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xac,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_neq_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x9a,0x7c]
+v_cmpsx_ge_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xac,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_neq_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x9a,0x7c]
+v_cmpsx_ge_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xac,0x7c]
 
-v_cmps_neq_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x9a,0x7c]
+v_cmpsx_ge_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xac,0x7c]
 
-v_cmps_neq_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x9a,0x7c]
+v_cmpsx_ge_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xad,0x7c]
 
-v_cmps_neq_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x9a,0x7c]
+v_cmpsx_ge_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_neq_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x9a,0x7c]
+v_cmpsx_ge_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xac,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_neq_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x9a,0x7c]
+v_cmpsx_ge_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xac,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_neq_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x9a,0x7c]
+v_cmpsx_ge_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xac,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_neq_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x9a,0x7c]
+v_cmpsx_ge_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xac,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_neq_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x9a,0x7c]
+v_cmpsx_ge_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xac,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_neq_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x9a,0x7c]
+v_cmpsx_ge_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xac,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_neq_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x9a,0x7c]
+v_cmpsx_ge_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xac,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_neq_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x9a,0x7c]
+v_cmpsx_ge_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xac,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmps_neq_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x9a,0x7c]
+v_cmpsx_ge_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xac,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmps_neq_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x9a,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_ge_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xac,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmps_neq_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x9a,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_ge_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmps_neq_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x9a,0x7c]
+v_cmpsx_ge_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmps_neq_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x9a,0x7c]
+v_cmpsx_ge_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmps_neq_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x9b,0x7c]
+v_cmpsx_ge_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmps_neq_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x9a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ge_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmps_neq_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x9a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ge_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmps_neq_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x9a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ge_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmps_neq_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x9a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ge_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmps_neq_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x9a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ge_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmps_neq_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x9a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ge_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmps_neq_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x9a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ge_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmps_neq_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x9a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ge_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmps_neq_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x9a,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_ge_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmps_neq_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x9a,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_ge_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmps_neq_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x9a,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_ge_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmps_neq_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x9a,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_ge_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmps_neq_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x9a,0xd0,0xff,0x01,0x00,0x00]
+v_cmpsx_ge_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmps_neq_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x9a,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_ge_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmps_neq_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x9a,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_ge_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmps_neq_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x9a,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_o_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xae,0x7c]
 
-v_cmps_neq_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x9a,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_o_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xae,0x7c]
 
-v_cmps_neq_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x9a,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpsx_o_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xae,0x7c]
 
-v_cmps_neq_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x9a,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_o_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xae,0x7c]
 
-v_cmps_neq_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x9a,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_o_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xae,0x7c]
 
-v_cmps_neq_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x9a,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_o_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xae,0x7c]
 
-v_cmps_nlt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x9c,0x7c]
+v_cmpsx_o_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xae,0x7c]
 
-v_cmps_nlt_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x9c,0x7c]
+v_cmpsx_o_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xae,0x7c]
 
-v_cmps_nlt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x9c,0x7c]
+v_cmpsx_o_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xae,0x7c]
 
-v_cmps_nlt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x9c,0x7c]
+v_cmpsx_o_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xae,0x7c]
 
-v_cmps_nlt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x9c,0x7c]
+v_cmpsx_o_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xae,0x7c]
 
-v_cmps_nlt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x9c,0x7c]
+v_cmpsx_o_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xae,0x7c]
 
-v_cmps_nlt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x9c,0x7c]
+v_cmpsx_o_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xae,0x7c]
 
-v_cmps_nlt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x9c,0x7c]
+v_cmpsx_o_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xae,0x7c]
 
-v_cmps_nlt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x9c,0x7c]
+v_cmpsx_o_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xae,0x7c]
 
-v_cmps_nlt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x9c,0x7c]
+v_cmpsx_o_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xae,0x7c]
 
-v_cmps_nlt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x9c,0x7c]
+v_cmpsx_o_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xae,0x7c]
 
-v_cmps_nlt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x9c,0x7c]
+v_cmpsx_o_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xae,0x7c]
 
-v_cmps_nlt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x9c,0x7c]
+v_cmpsx_o_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xae,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_nlt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x9c,0x7c]
+v_cmpsx_o_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xae,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_nlt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x9c,0x7c]
+v_cmpsx_o_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xae,0x7c]
 
-v_cmps_nlt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x9c,0x7c]
+v_cmpsx_o_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xae,0x7c]
 
-v_cmps_nlt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x9c,0x7c]
+v_cmpsx_o_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xaf,0x7c]
 
-v_cmps_nlt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x9c,0x7c]
+v_cmpsx_o_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nlt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x9c,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_o_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xae,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nlt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x9c,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_o_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xae,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nlt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x9c,0x7c]
+v_cmpsx_o_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xae,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nlt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x9c,0x7c]
+v_cmpsx_o_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xae,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nlt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x9d,0x7c]
+v_cmpsx_o_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xae,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nlt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x9c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_o_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xae,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nlt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x9c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_o_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xae,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_nlt_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x9c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_o_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xae,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmps_nlt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x9c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_o_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xae,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmps_nlt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x9c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_o_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xae,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmps_nlt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x9c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_o_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmps_nlt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x9c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_o_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmps_nlt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x9c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_o_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmps_nlt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x9c,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_o_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmps_nlt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x9c,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_o_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmps_nlt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x9c,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_o_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmps_nlt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x9c,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_o_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmps_nlt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x9c,0xd0,0xff,0x01,0x00,0x00]
+v_cmpsx_o_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmps_nlt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x9c,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_o_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmps_nlt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x9c,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_o_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmps_nlt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x9c,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_o_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmps_nlt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x9c,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_o_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmps_nlt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x9c,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpsx_o_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmps_nlt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x9c,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_o_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmps_nlt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x9c,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_o_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmps_nlt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x9c,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_o_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmps_tru_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x9e,0x7c]
+v_cmpsx_o_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmps_tru_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x9e,0x7c]
+v_cmpsx_o_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmps_tru_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x9e,0x7c]
+v_cmpsx_o_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmps_tru_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x9e,0x7c]
+v_cmpsx_u_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xb0,0x7c]
 
-v_cmps_tru_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x9e,0x7c]
+v_cmpsx_u_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xb0,0x7c]
 
-v_cmps_tru_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x9e,0x7c]
+v_cmpsx_u_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xb0,0x7c]
 
-v_cmps_tru_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x9e,0x7c]
+v_cmpsx_u_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xb0,0x7c]
 
-v_cmps_tru_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x9e,0x7c]
+v_cmpsx_u_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xb0,0x7c]
 
-v_cmps_tru_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x9e,0x7c]
+v_cmpsx_u_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xb0,0x7c]
 
-v_cmps_tru_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x9e,0x7c]
+v_cmpsx_u_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xb0,0x7c]
 
-v_cmps_tru_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x9e,0x7c]
+v_cmpsx_u_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xb0,0x7c]
 
-v_cmps_tru_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x9e,0x7c]
+v_cmpsx_u_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xb0,0x7c]
 
-v_cmps_tru_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x9e,0x7c]
+v_cmpsx_u_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xb0,0x7c]
 
-v_cmps_tru_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x9e,0x7c]
+v_cmpsx_u_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xb0,0x7c]
 
-v_cmps_tru_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x9e,0x7c]
+v_cmpsx_u_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xb0,0x7c]
 
-v_cmps_tru_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x9e,0x7c]
+v_cmpsx_u_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xb0,0x7c]
 
-v_cmps_tru_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x9e,0x7c]
+v_cmpsx_u_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xb0,0x7c]
 
-v_cmps_tru_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x9e,0x7c]
+v_cmpsx_u_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xb0,0x7c]
 
-v_cmps_tru_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x9e,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_u_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xb0,0x7c]
 
-v_cmps_tru_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x9e,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_u_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xb0,0x7c]
 
-v_cmps_tru_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x9e,0x7c]
+v_cmpsx_u_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xb0,0x7c]
 
-v_cmps_tru_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x9e,0x7c]
+v_cmpsx_u_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xb0,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_tru_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x9f,0x7c]
+v_cmpsx_u_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xb0,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_tru_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x9e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_u_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xb0,0x7c]
 
-v_cmps_tru_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x9e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_u_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xb0,0x7c]
 
-v_cmps_tru_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x9e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_u_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xb1,0x7c]
 
-v_cmps_tru_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x9e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_u_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_tru_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x9e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_u_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xb0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_tru_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x9e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_u_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xb0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_tru_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x9e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_u_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xb0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_tru_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x9e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_u_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xb0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_tru_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x9e,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_u_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xb0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_tru_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x9e,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_u_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xb0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_tru_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x9e,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_u_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xb0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmps_tru_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x9e,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_u_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xb0,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmps_tru_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x9e,0xd0,0xff,0x01,0x00,0x00]
+v_cmpsx_u_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmps_tru_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x9e,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_u_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xb0,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmps_tru_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x9e,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_u_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmps_tru_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x9e,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_u_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmps_tru_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x9e,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_u_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmps_tru_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x9e,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpsx_u_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmps_tru_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x9e,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_u_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmps_tru_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x9e,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_u_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmps_tru_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x9e,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_u_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpsx_f_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa0,0x7c]
+v_cmpsx_u_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpsx_f_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xa0,0x7c]
+v_cmpsx_u_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpsx_f_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xa0,0x7c]
+v_cmpsx_u_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpsx_f_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xa0,0x7c]
+v_cmpsx_u_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpsx_f_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa0,0x7c]
+v_cmpsx_u_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpsx_f_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa0,0x7c]
+v_cmpsx_u_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpsx_f_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa0,0x7c]
+v_cmpsx_u_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpsx_f_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa0,0x7c]
+v_cmpsx_u_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpsx_f_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa0,0x7c]
+v_cmpsx_u_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpsx_f_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa0,0x7c]
+v_cmpsx_u_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpsx_f_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa0,0x7c]
+v_cmpsx_u_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpsx_f_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa0,0x7c]
+v_cmpsx_u_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpsx_f_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa0,0x7c]
+v_cmpsx_nge_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xb2,0x7c]
 
-v_cmpsx_f_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa0,0x7c]
+v_cmpsx_nge_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xb2,0x7c]
 
-v_cmpsx_f_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa0,0x7c]
+v_cmpsx_nge_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xb2,0x7c]
 
-v_cmpsx_f_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa0,0x7c]
+v_cmpsx_nge_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xb2,0x7c]
 
-v_cmpsx_f_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa0,0x7c]
+v_cmpsx_nge_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xb2,0x7c]
 
-v_cmpsx_f_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa0,0x7c]
+v_cmpsx_nge_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xb2,0x7c]
 
-v_cmpsx_f_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa0,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_nge_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xb2,0x7c]
 
-v_cmpsx_f_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa0,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_nge_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xb2,0x7c]
 
-v_cmpsx_f_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa0,0x7c]
+v_cmpsx_nge_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xb2,0x7c]
 
-v_cmpsx_f_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa0,0x7c]
+v_cmpsx_nge_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xb2,0x7c]
 
-v_cmpsx_f_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa1,0x7c]
+v_cmpsx_nge_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xb2,0x7c]
 
-v_cmpsx_f_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nge_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xb2,0x7c]
 
-v_cmpsx_f_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nge_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xb2,0x7c]
 
-v_cmpsx_f_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xa0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nge_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xb2,0x7c]
 
-v_cmpsx_f_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xa0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nge_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xb2,0x7c]
 
-v_cmpsx_f_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nge_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xb2,0x7c]
 
-v_cmpsx_f_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nge_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xb2,0x7c]
 
-v_cmpsx_f_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nge_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xb2,0x7c]
 
-v_cmpsx_f_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nge_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xb2,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_f_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa0,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_nge_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xb2,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_f_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xa0,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_nge_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xb2,0x7c]
 
-v_cmpsx_f_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0xa0,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_nge_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xb2,0x7c]
 
-v_cmpsx_f_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa0,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_nge_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xb3,0x7c]
 
-v_cmpsx_f_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa0,0xd0,0xff,0x01,0x00,0x00]
+v_cmpsx_nge_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_f_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa0,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_nge_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xb2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_f_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xa0,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_nge_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xb2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_f_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0xa0,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_nge_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xb2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_f_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa0,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_nge_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xb2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_f_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa0,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpsx_nge_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xb2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_f_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0xa0,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_nge_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xb2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_f_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0xa0,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_nge_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xb2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_f_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0xa0,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_nge_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xb2,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpsx_lt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa2,0x7c]
+v_cmpsx_nge_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpsx_lt_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xa2,0x7c]
+v_cmpsx_nge_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xb2,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpsx_lt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xa2,0x7c]
+v_cmpsx_nge_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpsx_lt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xa2,0x7c]
+v_cmpsx_nge_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpsx_lt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa2,0x7c]
+v_cmpsx_nge_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpsx_lt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa2,0x7c]
+v_cmpsx_nge_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpsx_lt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa2,0x7c]
+v_cmpsx_nge_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpsx_lt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa2,0x7c]
+v_cmpsx_nge_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpsx_lt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa2,0x7c]
+v_cmpsx_nge_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpsx_lt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa2,0x7c]
+v_cmpsx_nge_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpsx_lt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa2,0x7c]
+v_cmpsx_nge_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpsx_lt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa2,0x7c]
+v_cmpsx_nge_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpsx_lt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa2,0x7c]
+v_cmpsx_nge_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpsx_lt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa2,0x7c]
+v_cmpsx_nge_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpsx_lt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa2,0x7c]
+v_cmpsx_nge_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpsx_lt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa2,0x7c]
+v_cmpsx_nge_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpsx_lt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa2,0x7c]
+v_cmpsx_nge_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpsx_lt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa2,0x7c]
+v_cmpsx_nge_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpsx_lt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa2,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_nge_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpsx_lt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa2,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_nge_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpsx_lt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa2,0x7c]
+v_cmpsx_nge_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpsx_lt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa2,0x7c]
+v_cmpsx_nlg_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xb4,0x7c]
 
-v_cmpsx_lt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa3,0x7c]
+v_cmpsx_nlg_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xb4,0x7c]
 
-v_cmpsx_lt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xb4,0x7c]
 
-v_cmpsx_lt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xb4,0x7c]
 
-v_cmpsx_lt_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xa2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xb4,0x7c]
 
-v_cmpsx_lt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xa2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xb4,0x7c]
 
-v_cmpsx_lt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xb4,0x7c]
 
-v_cmpsx_lt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xb4,0x7c]
 
-v_cmpsx_lt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xb4,0x7c]
 
-v_cmpsx_lt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xb4,0x7c]
 
-v_cmpsx_lt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa2,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_nlg_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xb4,0x7c]
 
-v_cmpsx_lt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xa2,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_nlg_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xb4,0x7c]
 
-v_cmpsx_lt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0xa2,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_nlg_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xb4,0x7c]
 
-v_cmpsx_lt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa2,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_nlg_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xb4,0x7c]
 
-v_cmpsx_lt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa2,0xd0,0xff,0x01,0x00,0x00]
+v_cmpsx_nlg_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xb4,0x7c]
 
-v_cmpsx_lt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa2,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_nlg_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xb4,0x7c]
 
-v_cmpsx_lt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xa2,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_nlg_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xb4,0x7c]
 
-v_cmpsx_lt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0xa2,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_nlg_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xb4,0x7c]
 
-v_cmpsx_lt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa2,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_nlg_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xb4,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_lt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa2,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpsx_nlg_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xb4,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_lt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0xa2,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_nlg_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xb4,0x7c]
 
-v_cmpsx_lt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0xa2,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_nlg_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xb4,0x7c]
 
-v_cmpsx_lt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0xa2,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_nlg_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xb5,0x7c]
 
-v_cmpsx_eq_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xb4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xb4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xb4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xb4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xb4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xb4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xb4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xb4,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xb4,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa4,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa4,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa4,0x7c]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpsx_eq_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa5,0x7c]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpsx_eq_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpsx_eq_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpsx_eq_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xa4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpsx_eq_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xa4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpsx_eq_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpsx_eq_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpsx_eq_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpsx_eq_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ngt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xb6,0x7c]
 
-v_cmpsx_eq_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa4,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_ngt_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xb6,0x7c]
 
-v_cmpsx_eq_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xa4,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_ngt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xb6,0x7c]
 
-v_cmpsx_eq_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0xa4,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_ngt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xb6,0x7c]
 
-v_cmpsx_eq_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa4,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_ngt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xb6,0x7c]
 
-v_cmpsx_eq_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa4,0xd0,0xff,0x01,0x00,0x00]
+v_cmpsx_ngt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xb6,0x7c]
 
-v_cmpsx_eq_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa4,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_ngt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xb6,0x7c]
 
-v_cmpsx_eq_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xa4,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_ngt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xb6,0x7c]
 
-v_cmpsx_eq_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0xa4,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_ngt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xb6,0x7c]
 
-v_cmpsx_eq_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa4,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_ngt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xb6,0x7c]
 
-v_cmpsx_eq_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa4,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpsx_ngt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xb6,0x7c]
 
-v_cmpsx_eq_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0xa4,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_ngt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xb6,0x7c]
 
-v_cmpsx_eq_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0xa4,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_ngt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xb6,0x7c]
 
-v_cmpsx_eq_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0xa4,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_ngt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xb6,0x7c]
 
-v_cmpsx_le_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa6,0x7c]
+v_cmpsx_ngt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xb6,0x7c]
 
-v_cmpsx_le_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xa6,0x7c]
+v_cmpsx_ngt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xb6,0x7c]
 
-v_cmpsx_le_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xa6,0x7c]
+v_cmpsx_ngt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xb6,0x7c]
 
-v_cmpsx_le_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xa6,0x7c]
+v_cmpsx_ngt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xb6,0x7c]
 
-v_cmpsx_le_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa6,0x7c]
+v_cmpsx_ngt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xb6,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_le_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa6,0x7c]
+v_cmpsx_ngt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xb6,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_le_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa6,0x7c]
+v_cmpsx_ngt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xb6,0x7c]
 
-v_cmpsx_le_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa6,0x7c]
+v_cmpsx_ngt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xb6,0x7c]
 
-v_cmpsx_le_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa6,0x7c]
+v_cmpsx_ngt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xb7,0x7c]
 
-v_cmpsx_le_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa6,0x7c]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_le_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa6,0x7c]
+v_cmpsx_ngt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xb6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_le_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa6,0x7c]
+v_cmpsx_ngt_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xb6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_le_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa6,0x7c]
+v_cmpsx_ngt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xb6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_le_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa6,0x7c]
+v_cmpsx_ngt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xb6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_le_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa6,0x7c]
+v_cmpsx_ngt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xb6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_le_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa6,0x7c]
+v_cmpsx_ngt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xb6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_le_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa6,0x7c]
+v_cmpsx_ngt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xb6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_le_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa6,0x7c]
+v_cmpsx_ngt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xb6,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpsx_le_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa6,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_ngt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpsx_le_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa6,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_ngt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xb6,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpsx_le_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa6,0x7c]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpsx_le_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa6,0x7c]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpsx_le_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa7,0x7c]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpsx_le_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpsx_le_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpsx_le_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xa6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpsx_le_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xa6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpsx_le_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpsx_le_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpsx_le_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpsx_le_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpsx_le_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa6,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpsx_le_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xa6,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpsx_le_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0xa6,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpsx_le_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa6,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpsx_le_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa6,0xd0,0xff,0x01,0x00,0x00]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpsx_le_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa6,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpsx_le_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xa6,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpsx_le_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0xa6,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_ngt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpsx_le_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa6,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_nle_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xb8,0x7c]
 
-v_cmpsx_le_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa6,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpsx_nle_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xb8,0x7c]
 
-v_cmpsx_le_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0xa6,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_nle_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xb8,0x7c]
 
-v_cmpsx_le_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0xa6,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_nle_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xb8,0x7c]
 
-v_cmpsx_le_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0xa6,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_nle_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xb8,0x7c]
 
-v_cmpsx_gt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa8,0x7c]
+v_cmpsx_nle_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xb8,0x7c]
 
-v_cmpsx_gt_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xa8,0x7c]
+v_cmpsx_nle_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xb8,0x7c]
 
-v_cmpsx_gt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xa8,0x7c]
+v_cmpsx_nle_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xb8,0x7c]
 
-v_cmpsx_gt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xa8,0x7c]
+v_cmpsx_nle_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xb8,0x7c]
 
-v_cmpsx_gt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa8,0x7c]
+v_cmpsx_nle_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xb8,0x7c]
 
-v_cmpsx_gt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa8,0x7c]
+v_cmpsx_nle_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xb8,0x7c]
 
-v_cmpsx_gt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa8,0x7c]
+v_cmpsx_nle_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xb8,0x7c]
 
-v_cmpsx_gt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa8,0x7c]
+v_cmpsx_nle_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xb8,0x7c]
 
-v_cmpsx_gt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa8,0x7c]
+v_cmpsx_nle_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xb8,0x7c]
 
-v_cmpsx_gt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa8,0x7c]
+v_cmpsx_nle_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xb8,0x7c]
 
-v_cmpsx_gt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa8,0x7c]
+v_cmpsx_nle_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xb8,0x7c]
 
-v_cmpsx_gt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa8,0x7c]
+v_cmpsx_nle_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xb8,0x7c]
 
-v_cmpsx_gt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa8,0x7c]
+v_cmpsx_nle_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xb8,0x7c]
 
-v_cmpsx_gt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa8,0x7c]
+v_cmpsx_nle_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xb8,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_gt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa8,0x7c]
+v_cmpsx_nle_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xb8,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_gt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa8,0x7c]
+v_cmpsx_nle_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xb8,0x7c]
 
-v_cmpsx_gt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa8,0x7c]
+v_cmpsx_nle_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xb8,0x7c]
 
-v_cmpsx_gt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa8,0x7c]
+v_cmpsx_nle_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xb9,0x7c]
 
-v_cmpsx_gt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa8,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_nle_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_gt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa8,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_nle_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xb8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_gt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa8,0x7c]
+v_cmpsx_nle_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xb8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_gt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa8,0x7c]
+v_cmpsx_nle_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xb8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_gt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa9,0x7c]
+v_cmpsx_nle_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xb8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nle_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xb8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nle_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xb8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xa8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nle_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xb8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xa8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nle_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xb8,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nle_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nle_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xb8,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nle_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nle_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa8,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_nle_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xa8,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_nle_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0xa8,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_nle_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa8,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_nle_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa8,0xd0,0xff,0x01,0x00,0x00]
+v_cmpsx_nle_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa8,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_nle_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xa8,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_nle_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0xa8,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_nle_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa8,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_nle_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa8,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpsx_nle_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0xa8,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_nle_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpsx_gt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0xa8,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_nle_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpsx_gt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0xa8,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_nle_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpsx_lg_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xaa,0x7c]
+v_cmpsx_nle_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpsx_lg_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xaa,0x7c]
+v_cmpsx_nle_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpsx_lg_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xaa,0x7c]
+v_cmpsx_nle_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpsx_lg_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xaa,0x7c]
+v_cmpsx_nle_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpsx_lg_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xaa,0x7c]
+v_cmpsx_neq_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xba,0x7c]
 
-v_cmpsx_lg_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xaa,0x7c]
+v_cmpsx_neq_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xba,0x7c]
 
-v_cmpsx_lg_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xaa,0x7c]
+v_cmpsx_neq_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xba,0x7c]
 
-v_cmpsx_lg_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xaa,0x7c]
+v_cmpsx_neq_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xba,0x7c]
 
-v_cmpsx_lg_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xaa,0x7c]
+v_cmpsx_neq_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xba,0x7c]
 
-v_cmpsx_lg_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xaa,0x7c]
+v_cmpsx_neq_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xba,0x7c]
 
-v_cmpsx_lg_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xaa,0x7c]
+v_cmpsx_neq_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xba,0x7c]
 
-v_cmpsx_lg_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xaa,0x7c]
+v_cmpsx_neq_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xba,0x7c]
 
-v_cmpsx_lg_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xaa,0x7c]
+v_cmpsx_neq_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xba,0x7c]
 
-v_cmpsx_lg_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xaa,0x7c]
+v_cmpsx_neq_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xba,0x7c]
 
-v_cmpsx_lg_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xaa,0x7c]
+v_cmpsx_neq_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xba,0x7c]
 
-v_cmpsx_lg_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xaa,0x7c]
+v_cmpsx_neq_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xba,0x7c]
 
-v_cmpsx_lg_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xaa,0x7c]
+v_cmpsx_neq_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xba,0x7c]
 
-v_cmpsx_lg_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xaa,0x7c]
+v_cmpsx_neq_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xba,0x7c]
 
-v_cmpsx_lg_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xaa,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_neq_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xba,0x7c]
 
-v_cmpsx_lg_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xaa,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_neq_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xba,0x7c]
 
-v_cmpsx_lg_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xaa,0x7c]
+v_cmpsx_neq_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xba,0x7c]
 
-v_cmpsx_lg_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xaa,0x7c]
+v_cmpsx_neq_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xba,0x7c]
 
-v_cmpsx_lg_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xab,0x7c]
+v_cmpsx_neq_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xba,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_lg_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xaa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_neq_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xba,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_lg_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xaa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_neq_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xba,0x7c]
 
-v_cmpsx_lg_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xaa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_neq_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xba,0x7c]
 
-v_cmpsx_lg_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xaa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_neq_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xbb,0x7c]
 
-v_cmpsx_lg_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xaa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_neq_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_lg_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xaa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_neq_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xba,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_lg_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xaa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_neq_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xba,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_lg_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xaa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_neq_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xba,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_lg_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xaa,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_neq_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xba,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_lg_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xaa,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_neq_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xba,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_lg_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0xaa,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_neq_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xba,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_lg_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xaa,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_neq_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xba,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_lg_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xaa,0xd0,0xff,0x01,0x00,0x00]
+v_cmpsx_neq_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xba,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpsx_lg_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xaa,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_neq_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xba,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpsx_lg_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xaa,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_neq_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xba,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpsx_lg_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0xaa,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_neq_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpsx_lg_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xaa,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_neq_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpsx_lg_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xaa,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpsx_neq_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpsx_lg_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0xaa,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_neq_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpsx_lg_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0xaa,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_neq_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpsx_lg_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0xaa,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_neq_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpsx_ge_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xac,0x7c]
+v_cmpsx_neq_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpsx_ge_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xac,0x7c]
+v_cmpsx_neq_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpsx_ge_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xac,0x7c]
+v_cmpsx_neq_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpsx_ge_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xac,0x7c]
+v_cmpsx_neq_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpsx_ge_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xac,0x7c]
+v_cmpsx_neq_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpsx_ge_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xac,0x7c]
+v_cmpsx_neq_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpsx_ge_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xac,0x7c]
+v_cmpsx_neq_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpsx_ge_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xac,0x7c]
+v_cmpsx_neq_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpsx_ge_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xac,0x7c]
+v_cmpsx_neq_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpsx_ge_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xac,0x7c]
+v_cmpsx_neq_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpsx_ge_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xac,0x7c]
+v_cmpsx_neq_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpsx_ge_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xac,0x7c]
+v_cmpsx_neq_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpsx_ge_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xac,0x7c]
+v_cmpsx_neq_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpsx_ge_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xac,0x7c]
+v_cmpsx_nlt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xbc,0x7c]
 
-v_cmpsx_ge_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xac,0x7c]
+v_cmpsx_nlt_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xbc,0x7c]
 
-v_cmpsx_ge_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xac,0x7c]
+v_cmpsx_nlt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xbc,0x7c]
 
-v_cmpsx_ge_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xac,0x7c]
+v_cmpsx_nlt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xbc,0x7c]
 
-v_cmpsx_ge_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xac,0x7c]
+v_cmpsx_nlt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xbc,0x7c]
 
-v_cmpsx_ge_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xac,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_nlt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xbc,0x7c]
 
-v_cmpsx_ge_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xac,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_nlt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xbc,0x7c]
 
-v_cmpsx_ge_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xac,0x7c]
+v_cmpsx_nlt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xbc,0x7c]
 
-v_cmpsx_ge_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xac,0x7c]
+v_cmpsx_nlt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xbc,0x7c]
 
-v_cmpsx_ge_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xad,0x7c]
+v_cmpsx_nlt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xbc,0x7c]
 
-v_cmpsx_ge_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xac,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xbc,0x7c]
 
-v_cmpsx_ge_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xac,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xbc,0x7c]
 
-v_cmpsx_ge_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xac,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xbc,0x7c]
 
-v_cmpsx_ge_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xac,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xbc,0x7c]
 
-v_cmpsx_ge_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xac,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xbc,0x7c]
 
-v_cmpsx_ge_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xac,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xbc,0x7c]
 
-v_cmpsx_ge_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xac,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xbc,0x7c]
 
-v_cmpsx_ge_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xac,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xbc,0x7c]
 
-v_cmpsx_ge_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xac,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_nlt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xbc,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_ge_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xac,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_nlt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xbc,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_ge_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0xac,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_nlt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xbc,0x7c]
 
-v_cmpsx_ge_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xac,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_nlt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xbc,0x7c]
 
-v_cmpsx_ge_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xac,0xd0,0xff,0x01,0x00,0x00]
+v_cmpsx_nlt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xbd,0x7c]
 
-v_cmpsx_ge_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xac,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ge_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xac,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_nlt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xbc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ge_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0xac,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_nlt_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xbc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ge_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xac,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_nlt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xbc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ge_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xac,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpsx_nlt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xbc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ge_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0xac,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_nlt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xbc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ge_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0xac,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_nlt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xbc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ge_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0xac,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_nlt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xbc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_o_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xbc,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpsx_o_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpsx_o_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xbc,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpsx_o_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpsx_o_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpsx_o_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpsx_o_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpsx_o_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpsx_o_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpsx_o_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpsx_o_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpsx_o_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpsx_o_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpsx_o_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpsx_o_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpsx_o_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpsx_o_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpsx_o_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpsx_o_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xae,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpsx_o_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xae,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpsx_o_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpsx_o_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xae,0x7c]
+v_cmpsx_nlt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpsx_o_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xaf,0x7c]
+v_cmpsx_tru_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xbe,0x7c]
 
-v_cmpsx_o_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xae,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xbe,0x7c]
 
-v_cmpsx_o_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xae,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xbe,0x7c]
 
-v_cmpsx_o_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xae,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xbe,0x7c]
 
-v_cmpsx_o_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xae,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xbe,0x7c]
 
-v_cmpsx_o_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xae,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xbe,0x7c]
 
-v_cmpsx_o_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xae,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xbe,0x7c]
 
-v_cmpsx_o_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xae,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xbe,0x7c]
 
-v_cmpsx_o_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xae,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xbe,0x7c]
 
-v_cmpsx_o_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xae,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_tru_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xbe,0x7c]
 
-v_cmpsx_o_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xae,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_tru_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xbe,0x7c]
 
-v_cmpsx_o_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0xae,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_tru_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xbe,0x7c]
 
-v_cmpsx_o_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xae,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_tru_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xbe,0x7c]
 
-v_cmpsx_o_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xae,0xd0,0xff,0x01,0x00,0x00]
+v_cmpsx_tru_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xbe,0x7c]
 
-v_cmpsx_o_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xae,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_tru_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xbe,0x7c]
 
-v_cmpsx_o_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xae,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_tru_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xbe,0x7c]
 
-v_cmpsx_o_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0xae,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_tru_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xbe,0x7c]
 
-v_cmpsx_o_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xae,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_tru_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xbe,0x7c]
 
-v_cmpsx_o_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xae,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpsx_tru_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xbe,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_o_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0xae,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_tru_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xbe,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_o_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0xae,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_tru_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xbe,0x7c]
 
-v_cmpsx_o_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0xae,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_tru_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xbe,0x7c]
 
-v_cmpsx_u_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xb0,0x7c]
+v_cmpsx_tru_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xbf,0x7c]
 
-v_cmpsx_u_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xb0,0x7c]
+v_cmpsx_tru_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xb0,0x7c]
+v_cmpsx_tru_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xbe,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xb0,0x7c]
+v_cmpsx_tru_f32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xbe,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xb0,0x7c]
+v_cmpsx_tru_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xbe,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xb0,0x7c]
+v_cmpsx_tru_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xbe,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xb0,0x7c]
+v_cmpsx_tru_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xbe,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xb0,0x7c]
+v_cmpsx_tru_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xbe,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xb0,0x7c]
+v_cmpsx_tru_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xbe,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xb0,0x7c]
+v_cmpsx_tru_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xbe,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xb0,0x7c]
+v_cmpsx_tru_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xb0,0x7c]
+v_cmpsx_tru_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xbe,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xb0,0x7c]
+v_cmpsx_tru_f32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xb0,0x7c]
+v_cmpsx_tru_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xd0,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xb0,0x7c]
+v_cmpsx_tru_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xd2,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xb0,0x7c]
+v_cmpsx_tru_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xb0,0x7c]
+v_cmpsx_tru_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xb0,0x7c]
+v_cmpsx_tru_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xb0,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_tru_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xb0,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_tru_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xb0,0x7c]
+v_cmpsx_tru_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xb0,0x7c]
+v_cmpsx_tru_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpsx_u_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xb1,0x7c]
+v_cmpsx_tru_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpsx_u_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xb0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpsx_u_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xb0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpsx_u_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xb0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpsx_u_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xb0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpsx_u_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xb0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpsx_u_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xb0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpsx_u_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xb0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpsx_u_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xb0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpsx_u_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xb0,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_f_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc0,0x7c]
 
-v_cmpsx_u_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xb0,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_f_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc0,0x7c]
 
-v_cmpsx_u_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0xb0,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_f_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xc0,0x7c]
 
-v_cmpsx_u_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xb0,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_f_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xc0,0x7c]
 
-v_cmpsx_u_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xb0,0xd0,0xff,0x01,0x00,0x00]
+v_cmps_f_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc0,0x7c]
 
-v_cmpsx_u_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xb0,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_f_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc0,0x7c]
 
-v_cmpsx_u_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xb0,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_f_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc0,0x7c]
 
-v_cmpsx_u_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0xb0,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_f_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc0,0x7c]
 
-v_cmpsx_u_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xb0,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_f_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc0,0x7c]
 
-v_cmpsx_u_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xb0,0xd0,0x00,0xfe,0x03,0x00]
+v_cmps_f_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc0,0x7c]
 
-v_cmpsx_u_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0xb0,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_f_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc0,0x7c]
 
-v_cmpsx_u_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0xb0,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_f_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc0,0x7c]
 
-v_cmpsx_u_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0xb0,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_f_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc0,0x7c]
 
-v_cmpsx_nge_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xb2,0x7c]
+v_cmps_f_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc0,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_nge_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xb2,0x7c]
+v_cmps_f_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc0,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_nge_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xb2,0x7c]
+v_cmps_f_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc0,0x7c]
 
-v_cmpsx_nge_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xb2,0x7c]
+v_cmps_f_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc0,0x7c]
 
-v_cmpsx_nge_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xb2,0x7c]
+v_cmps_f_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc1,0x7c]
 
-v_cmpsx_nge_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xb2,0x7c]
+v_cmps_f_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nge_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xb2,0x7c]
+v_cmps_f_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xc0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nge_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xb2,0x7c]
+v_cmps_f_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xc0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nge_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xb2,0x7c]
+v_cmps_f_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xc0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nge_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xb2,0x7c]
+v_cmps_f_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xc0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nge_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xb2,0x7c]
+v_cmps_f_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xc0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nge_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xb2,0x7c]
+v_cmps_f_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xc0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nge_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xb2,0x7c]
+v_cmps_f_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xc0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nge_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xb2,0x7c]
+v_cmps_f_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpsx_nge_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xb2,0x7c]
+v_cmps_f_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xc0,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpsx_nge_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xb2,0x7c]
+v_cmps_f_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpsx_nge_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xb2,0x7c]
+v_cmps_f_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xc0,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpsx_nge_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xb2,0x7c]
+v_cmps_f_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpsx_nge_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xb2,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_f_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpsx_nge_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xb2,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_f_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpsx_nge_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xb2,0x7c]
+v_cmps_f_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpsx_nge_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xb2,0x7c]
+v_cmps_f_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpsx_nge_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xb3,0x7c]
+v_cmps_f_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpsx_nge_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xb2,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_f_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpsx_nge_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xb2,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc2,0x7c]
 
-v_cmpsx_nge_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xb2,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc2,0x7c]
 
-v_cmpsx_nge_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xb2,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lt_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xc2,0x7c]
 
-v_cmpsx_nge_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xb2,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xc2,0x7c]
 
-v_cmpsx_nge_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xb2,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc2,0x7c]
 
-v_cmpsx_nge_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xb2,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc2,0x7c]
 
-v_cmpsx_nge_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xb2,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc2,0x7c]
 
-v_cmpsx_nge_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xb2,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_lt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc2,0x7c]
 
-v_cmpsx_nge_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xb2,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_lt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc2,0x7c]
 
-v_cmpsx_nge_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0xb2,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_lt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc2,0x7c]
 
-v_cmpsx_nge_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xb2,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_lt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc2,0x7c]
 
-v_cmpsx_nge_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xb2,0xd0,0xff,0x01,0x00,0x00]
+v_cmps_lt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc2,0x7c]
 
-v_cmpsx_nge_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xb2,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_lt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc2,0x7c]
 
-v_cmpsx_nge_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xb2,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_lt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc2,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_nge_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0xb2,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_lt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc2,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_nge_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xb2,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_lt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc2,0x7c]
 
-v_cmpsx_nge_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xb2,0xd0,0x00,0xfe,0x03,0x00]
+v_cmps_lt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc2,0x7c]
 
-v_cmpsx_nge_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0xb2,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_lt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc3,0x7c]
 
-v_cmpsx_nge_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0xb2,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_lt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nge_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0xb2,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_lt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xc2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlg_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xb4,0x7c]
+v_cmps_lt_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xc2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlg_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xb4,0x7c]
+v_cmps_lt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xc2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlg_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xb4,0x7c]
+v_cmps_lt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xc2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlg_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xb4,0x7c]
+v_cmps_lt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xc2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlg_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xb4,0x7c]
+v_cmps_lt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xc2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlg_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xb4,0x7c]
+v_cmps_lt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xc2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlg_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xb4,0x7c]
+v_cmps_lt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpsx_nlg_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xb4,0x7c]
+v_cmps_lt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xc2,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpsx_nlg_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xb4,0x7c]
+v_cmps_lt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpsx_nlg_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xb4,0x7c]
+v_cmps_lt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xc2,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpsx_nlg_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xb4,0x7c]
+v_cmps_lt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpsx_nlg_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xb4,0x7c]
+v_cmps_lt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpsx_nlg_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xb4,0x7c]
+v_cmps_lt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpsx_nlg_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xb4,0x7c]
+v_cmps_lt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpsx_nlg_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xb4,0x7c]
+v_cmps_lt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpsx_nlg_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xb4,0x7c]
+v_cmps_lt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpsx_nlg_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xb4,0x7c]
+v_cmps_lt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpsx_nlg_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xb4,0x7c]
+v_cmps_eq_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc4,0x7c]
 
-v_cmpsx_nlg_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xb4,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_eq_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc4,0x7c]
 
-v_cmpsx_nlg_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xb4,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_eq_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xc4,0x7c]
 
-v_cmpsx_nlg_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xb4,0x7c]
+v_cmps_eq_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xc4,0x7c]
 
-v_cmpsx_nlg_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xb4,0x7c]
+v_cmps_eq_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc4,0x7c]
 
-v_cmpsx_nlg_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xb5,0x7c]
+v_cmps_eq_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc4,0x7c]
 
-v_cmpsx_nlg_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xb4,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_eq_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc4,0x7c]
 
-v_cmpsx_nlg_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xb4,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_eq_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc4,0x7c]
 
-v_cmpsx_nlg_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xb4,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_eq_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc4,0x7c]
 
-v_cmpsx_nlg_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xb4,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_eq_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc4,0x7c]
 
-v_cmpsx_nlg_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xb4,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_eq_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc4,0x7c]
 
-v_cmpsx_nlg_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xb4,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_eq_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc4,0x7c]
 
-v_cmpsx_nlg_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xb4,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_eq_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc4,0x7c]
 
-v_cmpsx_nlg_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xb4,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_eq_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc4,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_nlg_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xb4,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_eq_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc4,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_nlg_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xb4,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_eq_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc4,0x7c]
 
-v_cmpsx_nlg_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0xb4,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_eq_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc4,0x7c]
 
-v_cmpsx_nlg_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xb4,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_eq_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc5,0x7c]
 
-v_cmpsx_nlg_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xb4,0xd0,0xff,0x01,0x00,0x00]
+v_cmps_eq_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlg_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xb4,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_eq_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xc4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlg_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xb4,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_eq_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xc4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlg_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0xb4,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_eq_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xc4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlg_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xb4,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_eq_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xc4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlg_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xb4,0xd0,0x00,0xfe,0x03,0x00]
+v_cmps_eq_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xc4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlg_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0xb4,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_eq_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xc4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlg_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0xb4,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_eq_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xc4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlg_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0xb4,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_eq_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpsx_ngt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xb6,0x7c]
+v_cmps_eq_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xc4,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpsx_ngt_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xb6,0x7c]
+v_cmps_eq_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpsx_ngt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xb6,0x7c]
+v_cmps_eq_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xc4,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpsx_ngt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xb6,0x7c]
+v_cmps_eq_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpsx_ngt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xb6,0x7c]
+v_cmps_eq_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpsx_ngt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xb6,0x7c]
+v_cmps_eq_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpsx_ngt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xb6,0x7c]
+v_cmps_eq_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpsx_ngt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xb6,0x7c]
+v_cmps_eq_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpsx_ngt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xb6,0x7c]
+v_cmps_eq_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpsx_ngt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xb6,0x7c]
+v_cmps_eq_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpsx_ngt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xb6,0x7c]
+v_cmps_le_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc6,0x7c]
 
-v_cmpsx_ngt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xb6,0x7c]
+v_cmps_le_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc6,0x7c]
 
-v_cmpsx_ngt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xb6,0x7c]
+v_cmps_le_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xc6,0x7c]
 
-v_cmpsx_ngt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xb6,0x7c]
+v_cmps_le_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xc6,0x7c]
 
-v_cmpsx_ngt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xb6,0x7c]
+v_cmps_le_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc6,0x7c]
 
-v_cmpsx_ngt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xb6,0x7c]
+v_cmps_le_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc6,0x7c]
 
-v_cmpsx_ngt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xb6,0x7c]
+v_cmps_le_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc6,0x7c]
 
-v_cmpsx_ngt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xb6,0x7c]
+v_cmps_le_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc6,0x7c]
 
-v_cmpsx_ngt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xb6,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_le_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc6,0x7c]
 
-v_cmpsx_ngt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xb6,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_le_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc6,0x7c]
 
-v_cmpsx_ngt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xb6,0x7c]
+v_cmps_le_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc6,0x7c]
 
-v_cmpsx_ngt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xb6,0x7c]
+v_cmps_le_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc6,0x7c]
 
-v_cmpsx_ngt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xb7,0x7c]
+v_cmps_le_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc6,0x7c]
 
-v_cmpsx_ngt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xb6,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc6,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_ngt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xb6,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc6,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_ngt_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xb6,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc6,0x7c]
 
-v_cmpsx_ngt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xb6,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc6,0x7c]
 
-v_cmpsx_ngt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xb6,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc7,0x7c]
 
-v_cmpsx_ngt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xb6,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_ngt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xb6,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xc6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_ngt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xb6,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_le_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xc6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_ngt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xb6,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_le_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xc6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_ngt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xb6,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_le_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xc6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_ngt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0xb6,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_le_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xc6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_ngt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xb6,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_le_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xc6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_ngt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xb6,0xd0,0xff,0x01,0x00,0x00]
+v_cmps_le_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xc6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_ngt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xb6,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_le_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpsx_ngt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xb6,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_le_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xc6,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpsx_ngt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0xb6,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_le_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpsx_ngt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xb6,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_le_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xc6,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpsx_ngt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xb6,0xd0,0x00,0xfe,0x03,0x00]
+v_cmps_le_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpsx_ngt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0xb6,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_le_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpsx_ngt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0xb6,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_le_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpsx_ngt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0xb6,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_le_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpsx_nle_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xb8,0x7c]
+v_cmps_le_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpsx_nle_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xb8,0x7c]
+v_cmps_le_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpsx_nle_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xb8,0x7c]
+v_cmps_le_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpsx_nle_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xb8,0x7c]
+v_cmps_gt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc8,0x7c]
 
-v_cmpsx_nle_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xb8,0x7c]
+v_cmps_gt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc8,0x7c]
 
-v_cmpsx_nle_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xb8,0x7c]
+v_cmps_gt_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xc8,0x7c]
 
-v_cmpsx_nle_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xb8,0x7c]
+v_cmps_gt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xc8,0x7c]
 
-v_cmpsx_nle_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xb8,0x7c]
+v_cmps_gt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc8,0x7c]
 
-v_cmpsx_nle_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xb8,0x7c]
+v_cmps_gt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc8,0x7c]
 
-v_cmpsx_nle_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xb8,0x7c]
+v_cmps_gt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc8,0x7c]
 
-v_cmpsx_nle_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xb8,0x7c]
+v_cmps_gt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc8,0x7c]
 
-v_cmpsx_nle_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xb8,0x7c]
+v_cmps_gt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc8,0x7c]
 
-v_cmpsx_nle_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xb8,0x7c]
+v_cmps_gt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc8,0x7c]
 
-v_cmpsx_nle_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xb8,0x7c]
+v_cmps_gt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc8,0x7c]
 
-v_cmpsx_nle_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xb8,0x7c]
+v_cmps_gt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc8,0x7c]
 
-v_cmpsx_nle_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xb8,0x7c]
+v_cmps_gt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc8,0x7c]
 
-v_cmpsx_nle_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xb8,0x7c]
+v_cmps_gt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc8,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_nle_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xb8,0x7c]
+v_cmps_gt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc8,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_nle_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xb8,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_gt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc8,0x7c]
 
-v_cmpsx_nle_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xb8,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_gt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc8,0x7c]
 
-v_cmpsx_nle_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xb8,0x7c]
+v_cmps_gt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc9,0x7c]
 
-v_cmpsx_nle_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xb8,0x7c]
+v_cmps_gt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nle_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xb9,0x7c]
+v_cmps_gt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xc8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nle_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xb8,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_gt_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xc8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nle_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xb8,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_gt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xc8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nle_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xb8,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_gt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xc8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nle_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xb8,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_gt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xc8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nle_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xb8,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_gt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xc8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nle_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xb8,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_gt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xc8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nle_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xb8,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_gt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpsx_nle_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xb8,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_gt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xc8,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpsx_nle_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xb8,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_gt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpsx_nle_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xb8,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_gt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xc8,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpsx_nle_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0xb8,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_gt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpsx_nle_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xb8,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_gt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpsx_nle_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xb8,0xd0,0xff,0x01,0x00,0x00]
+v_cmps_gt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpsx_nle_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xb8,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_gt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpsx_nle_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xb8,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_gt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpsx_nle_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0xb8,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_gt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpsx_nle_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xb8,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_gt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpsx_nle_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xb8,0xd0,0x00,0xfe,0x03,0x00]
+v_cmps_lg_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xca,0x7c]
 
-v_cmpsx_nle_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0xb8,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_lg_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xca,0x7c]
 
-v_cmpsx_nle_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0xb8,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_lg_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xca,0x7c]
 
-v_cmpsx_nle_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0xb8,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_lg_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xca,0x7c]
 
-v_cmpsx_neq_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xba,0x7c]
+v_cmps_lg_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xca,0x7c]
 
-v_cmpsx_neq_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xba,0x7c]
+v_cmps_lg_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xca,0x7c]
 
-v_cmpsx_neq_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xba,0x7c]
+v_cmps_lg_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xca,0x7c]
 
-v_cmpsx_neq_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xba,0x7c]
+v_cmps_lg_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xca,0x7c]
 
-v_cmpsx_neq_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xba,0x7c]
+v_cmps_lg_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xca,0x7c]
 
-v_cmpsx_neq_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xba,0x7c]
+v_cmps_lg_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xca,0x7c]
 
-v_cmpsx_neq_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xba,0x7c]
+v_cmps_lg_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xca,0x7c]
 
-v_cmpsx_neq_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xba,0x7c]
+v_cmps_lg_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xca,0x7c]
 
-v_cmpsx_neq_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xba,0x7c]
+v_cmps_lg_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xca,0x7c]
 
-v_cmpsx_neq_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xba,0x7c]
+v_cmps_lg_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xca,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_neq_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xba,0x7c]
+v_cmps_lg_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xca,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_neq_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xba,0x7c]
+v_cmps_lg_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xca,0x7c]
 
-v_cmpsx_neq_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xba,0x7c]
+v_cmps_lg_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xca,0x7c]
 
-v_cmpsx_neq_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xba,0x7c]
+v_cmps_lg_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xcb,0x7c]
 
-v_cmpsx_neq_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xba,0x7c]
+v_cmps_lg_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xca,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_neq_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xba,0x7c]
+v_cmps_lg_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xca,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_neq_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xba,0x7c]
+v_cmps_lg_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xca,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_neq_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xba,0x7c]
+v_cmps_lg_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xca,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_neq_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xba,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_lg_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xca,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_neq_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xba,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_lg_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xca,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_neq_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xba,0x7c]
+v_cmps_lg_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xca,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_neq_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xba,0x7c]
+v_cmps_lg_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xca,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_neq_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xbb,0x7c]
+v_cmps_lg_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpsx_neq_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xba,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lg_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xca,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpsx_neq_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xba,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lg_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xca,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpsx_neq_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xba,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lg_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xca,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpsx_neq_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xba,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lg_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xca,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpsx_neq_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xba,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lg_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xca,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpsx_neq_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xba,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lg_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xca,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpsx_neq_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xba,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lg_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xca,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpsx_neq_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xba,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_lg_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xca,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpsx_neq_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xba,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_lg_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xca,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpsx_neq_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xba,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_lg_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xca,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpsx_neq_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0xba,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_ge_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xcc,0x7c]
 
-v_cmpsx_neq_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xba,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_ge_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xcc,0x7c]
 
-v_cmpsx_neq_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xba,0xd0,0xff,0x01,0x00,0x00]
+v_cmps_ge_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xcc,0x7c]
 
-v_cmpsx_neq_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xba,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_ge_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xcc,0x7c]
 
-v_cmpsx_neq_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xba,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_ge_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xcc,0x7c]
 
-v_cmpsx_neq_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0xba,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_ge_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xcc,0x7c]
 
-v_cmpsx_neq_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xba,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_ge_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xcc,0x7c]
 
-v_cmpsx_neq_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xba,0xd0,0x00,0xfe,0x03,0x00]
+v_cmps_ge_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xcc,0x7c]
 
-v_cmpsx_neq_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0xba,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_ge_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xcc,0x7c]
 
-v_cmpsx_neq_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0xba,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_ge_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xcc,0x7c]
 
-v_cmpsx_neq_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0xba,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_ge_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xcc,0x7c]
 
-v_cmpsx_nlt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xbc,0x7c]
+v_cmps_ge_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xcc,0x7c]
 
-v_cmpsx_nlt_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xbc,0x7c]
+v_cmps_ge_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xcc,0x7c]
 
-v_cmpsx_nlt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xbc,0x7c]
+v_cmps_ge_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xcc,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_nlt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xbc,0x7c]
+v_cmps_ge_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xcc,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_nlt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xbc,0x7c]
+v_cmps_ge_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xcc,0x7c]
 
-v_cmpsx_nlt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xbc,0x7c]
+v_cmps_ge_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xcc,0x7c]
 
-v_cmpsx_nlt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xbc,0x7c]
+v_cmps_ge_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xcd,0x7c]
 
-v_cmpsx_nlt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xbc,0x7c]
+v_cmps_ge_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xbc,0x7c]
+v_cmps_ge_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xcc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xbc,0x7c]
+v_cmps_ge_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xcc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xbc,0x7c]
+v_cmps_ge_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xcc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xbc,0x7c]
+v_cmps_ge_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xcc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xbc,0x7c]
+v_cmps_ge_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xcc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xbc,0x7c]
+v_cmps_ge_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xcc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xbc,0x7c]
+v_cmps_ge_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xcc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_nlt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xbc,0x7c]
+v_cmps_ge_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpsx_nlt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xbc,0x7c]
+v_cmps_ge_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xcc,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpsx_nlt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xbc,0x7c]
+v_cmps_ge_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpsx_nlt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xbc,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_ge_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xcc,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpsx_nlt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xbc,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_ge_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpsx_nlt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xbc,0x7c]
+v_cmps_ge_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpsx_nlt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xbc,0x7c]
+v_cmps_ge_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpsx_nlt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xbd,0x7c]
+v_cmps_ge_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpsx_nlt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xbc,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ge_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpsx_nlt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xbc,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ge_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpsx_nlt_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xbc,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ge_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpsx_nlt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xbc,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_o_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xce,0x7c]
 
-v_cmpsx_nlt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xbc,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_o_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xce,0x7c]
 
-v_cmpsx_nlt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xbc,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_o_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xce,0x7c]
 
-v_cmpsx_nlt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xbc,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_o_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xce,0x7c]
 
-v_cmpsx_nlt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xbc,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_o_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xce,0x7c]
 
-v_cmpsx_nlt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xbc,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_o_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xce,0x7c]
 
-v_cmpsx_nlt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xbc,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_o_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xce,0x7c]
 
-v_cmpsx_nlt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0xbc,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_o_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xce,0x7c]
 
-v_cmpsx_nlt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xbc,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_o_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xce,0x7c]
 
-v_cmpsx_nlt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xbc,0xd0,0xff,0x01,0x00,0x00]
+v_cmps_o_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xce,0x7c]
 
-v_cmpsx_nlt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xbc,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_o_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xce,0x7c]
 
-v_cmpsx_nlt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xbc,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_o_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xce,0x7c]
 
-v_cmpsx_nlt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0xbc,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_o_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xce,0x7c]
 
-v_cmpsx_nlt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xbc,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_o_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xce,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_nlt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xbc,0xd0,0x00,0xfe,0x03,0x00]
+v_cmps_o_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xce,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_nlt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0xbc,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_o_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xce,0x7c]
 
-v_cmpsx_nlt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0xbc,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_o_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xce,0x7c]
 
-v_cmpsx_nlt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0xbc,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_o_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xcf,0x7c]
 
-v_cmpsx_tru_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xbe,0x7c]
+v_cmps_o_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xce,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_tru_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xbe,0x7c]
+v_cmps_o_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xce,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_tru_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xbe,0x7c]
+v_cmps_o_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xce,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_tru_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xbe,0x7c]
+v_cmps_o_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xce,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_tru_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xbe,0x7c]
+v_cmps_o_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xce,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_tru_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xbe,0x7c]
+v_cmps_o_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xce,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_tru_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xbe,0x7c]
+v_cmps_o_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xce,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_tru_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xbe,0x7c]
+v_cmps_o_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xce,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_tru_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xbe,0x7c]
+v_cmps_o_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpsx_tru_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xbe,0x7c]
+v_cmps_o_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xce,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpsx_tru_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xbe,0x7c]
+v_cmps_o_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xce,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpsx_tru_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xbe,0x7c]
+v_cmps_o_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xce,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpsx_tru_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xbe,0x7c]
+v_cmps_o_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xce,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpsx_tru_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xbe,0x7c]
+v_cmps_o_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xce,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpsx_tru_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xbe,0x7c]
+v_cmps_o_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xce,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpsx_tru_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xbe,0x7c]
+v_cmps_o_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xce,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpsx_tru_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xbe,0x7c]
+v_cmps_o_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xce,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpsx_tru_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xbe,0x7c]
+v_cmps_o_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xce,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpsx_tru_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xbe,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_o_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xce,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpsx_tru_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xbe,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_u_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xd0,0x7c]
 
-v_cmpsx_tru_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xbe,0x7c]
+v_cmps_u_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xd0,0x7c]
 
-v_cmpsx_tru_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xbe,0x7c]
+v_cmps_u_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xd0,0x7c]
 
-v_cmpsx_tru_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xbf,0x7c]
+v_cmps_u_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xd0,0x7c]
 
-v_cmpsx_tru_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xbe,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xd0,0x7c]
 
-v_cmpsx_tru_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xbe,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xd0,0x7c]
 
-v_cmpsx_tru_f32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xbe,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xd0,0x7c]
 
-v_cmpsx_tru_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xbe,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xd0,0x7c]
 
-v_cmpsx_tru_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xbe,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xd0,0x7c]
 
-v_cmpsx_tru_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xbe,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xd0,0x7c]
 
-v_cmpsx_tru_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xbe,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xd0,0x7c]
 
-v_cmpsx_tru_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xbe,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_u_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xd0,0x7c]
 
-v_cmpsx_tru_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xbe,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_u_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xd0,0x7c]
 
-v_cmpsx_tru_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xbe,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_u_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xd0,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_tru_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0xbe,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_u_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xd0,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_tru_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xbe,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_u_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xd0,0x7c]
 
-v_cmpsx_tru_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xbe,0xd0,0xff,0x01,0x00,0x00]
+v_cmps_u_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xd0,0x7c]
 
-v_cmpsx_tru_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xbe,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_u_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xd1,0x7c]
 
-v_cmpsx_tru_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xbe,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_u_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_tru_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0xbe,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_u_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xd0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_tru_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xbe,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_u_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xd0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_tru_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xbe,0xd0,0x00,0xfe,0x03,0x00]
+v_cmps_u_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xd0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_tru_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0xbe,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_u_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xd0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_tru_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0xbe,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_u_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xd0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_tru_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0xbe,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_u_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xd0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_f_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc0,0x7c]
+v_cmps_u_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xd0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_f_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc0,0x7c]
+v_cmps_u_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmps_f_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xc0,0x7c]
+v_cmps_u_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xd0,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmps_f_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xc0,0x7c]
+v_cmps_u_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmps_f_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc0,0x7c]
+v_cmps_u_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xd0,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmps_f_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc0,0x7c]
+v_cmps_u_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmps_f_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc0,0x7c]
+v_cmps_u_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmps_f_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc0,0x7c]
+v_cmps_u_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmps_f_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc0,0x7c]
+v_cmps_u_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmps_f_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc0,0x7c]
+v_cmps_u_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmps_f_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc0,0x7c]
+v_cmps_u_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmps_f_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc0,0x7c]
+v_cmps_u_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmps_f_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc0,0x7c]
+v_cmps_nge_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xd2,0x7c]
 
-v_cmps_f_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc0,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_nge_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xd2,0x7c]
 
-v_cmps_f_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc0,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_nge_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xd2,0x7c]
 
-v_cmps_f_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc0,0x7c]
+v_cmps_nge_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xd2,0x7c]
 
-v_cmps_f_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc0,0x7c]
+v_cmps_nge_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xd2,0x7c]
 
-v_cmps_f_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc1,0x7c]
+v_cmps_nge_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xd2,0x7c]
 
-v_cmps_f_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nge_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xd2,0x7c]
 
-v_cmps_f_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xc0,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nge_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xd2,0x7c]
 
-v_cmps_f_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xc0,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nge_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xd2,0x7c]
 
-v_cmps_f_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xc0,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nge_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xd2,0x7c]
 
-v_cmps_f_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xc0,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nge_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xd2,0x7c]
 
-v_cmps_f_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xc0,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nge_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xd2,0x7c]
 
-v_cmps_f_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xc0,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nge_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xd2,0x7c]
 
-v_cmps_f_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xc0,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nge_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xd2,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_f_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_nge_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xd2,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_f_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_nge_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xd2,0x7c]
 
-v_cmps_f_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_nge_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xd2,0x7c]
 
-v_cmps_f_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_nge_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xd3,0x7c]
 
-v_cmps_f_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_nge_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_f_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xc0,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_nge_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xd2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_f_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xc0,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_nge_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xd2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_f_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xc0,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_nge_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xd2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_f_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_nge_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xd2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_f_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xc0,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_nge_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xd2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_f_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_nge_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xd2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_f_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_nge_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xd2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_f_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_nge_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmps_lt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc2,0x7c]
+v_cmps_nge_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xd2,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmps_lt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc2,0x7c]
+v_cmps_nge_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmps_lt_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xc2,0x7c]
+v_cmps_nge_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xd2,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmps_lt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xc2,0x7c]
+v_cmps_nge_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmps_lt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc2,0x7c]
+v_cmps_nge_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmps_lt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc2,0x7c]
+v_cmps_nge_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmps_lt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc2,0x7c]
+v_cmps_nge_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmps_lt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc2,0x7c]
+v_cmps_nge_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmps_lt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc2,0x7c]
+v_cmps_nge_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmps_lt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc2,0x7c]
+v_cmps_nge_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmps_lt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc2,0x7c]
+v_cmps_nlg_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xd4,0x7c]
 
-v_cmps_lt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc2,0x7c]
+v_cmps_nlg_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xd4,0x7c]
 
-v_cmps_lt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc2,0x7c]
+v_cmps_nlg_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xd4,0x7c]
 
-v_cmps_lt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc2,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_nlg_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xd4,0x7c]
 
-v_cmps_lt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc2,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_nlg_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xd4,0x7c]
 
-v_cmps_lt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc2,0x7c]
+v_cmps_nlg_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xd4,0x7c]
 
-v_cmps_lt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc2,0x7c]
+v_cmps_nlg_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xd4,0x7c]
 
-v_cmps_lt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc3,0x7c]
+v_cmps_nlg_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xd4,0x7c]
 
-v_cmps_lt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc2,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlg_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xd4,0x7c]
 
-v_cmps_lt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xc2,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlg_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xd4,0x7c]
 
-v_cmps_lt_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xc2,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlg_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xd4,0x7c]
 
-v_cmps_lt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xc2,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlg_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xd4,0x7c]
 
-v_cmps_lt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xc2,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlg_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xd4,0x7c]
 
-v_cmps_lt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xc2,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlg_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xd4,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_lt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xc2,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlg_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xd4,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_lt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xc2,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlg_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xd4,0x7c]
 
-v_cmps_lt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xc2,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_nlg_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xd4,0x7c]
 
-v_cmps_lt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xc2,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_nlg_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xd5,0x7c]
 
-v_cmps_lt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xc2,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_nlg_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_lt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc2,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_nlg_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xd4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_lt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xc2,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_nlg_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xd4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_lt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xc2,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_nlg_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xd4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_lt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xc2,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_nlg_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xd4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_lt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xc2,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_nlg_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xd4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_lt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc2,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_nlg_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xd4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_lt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xc2,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_nlg_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xd4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_lt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc2,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_nlg_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmps_lt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xc2,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_nlg_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xd4,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmps_lt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xc2,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_nlg_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmps_eq_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc4,0x7c]
+v_cmps_nlg_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xd4,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmps_eq_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc4,0x7c]
+v_cmps_nlg_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmps_eq_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xc4,0x7c]
+v_cmps_nlg_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmps_eq_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xc4,0x7c]
+v_cmps_nlg_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmps_eq_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc4,0x7c]
+v_cmps_nlg_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmps_eq_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc4,0x7c]
+v_cmps_nlg_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmps_eq_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc4,0x7c]
+v_cmps_nlg_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmps_eq_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc4,0x7c]
+v_cmps_nlg_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmps_eq_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc4,0x7c]
+v_cmps_ngt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xd6,0x7c]
 
-v_cmps_eq_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc4,0x7c]
+v_cmps_ngt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xd6,0x7c]
 
-v_cmps_eq_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc4,0x7c]
+v_cmps_ngt_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xd6,0x7c]
 
-v_cmps_eq_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc4,0x7c]
+v_cmps_ngt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xd6,0x7c]
 
-v_cmps_eq_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc4,0x7c]
+v_cmps_ngt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xd6,0x7c]
 
-v_cmps_eq_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc4,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_ngt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xd6,0x7c]
 
-v_cmps_eq_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc4,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_ngt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xd6,0x7c]
 
-v_cmps_eq_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc4,0x7c]
+v_cmps_ngt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xd6,0x7c]
 
-v_cmps_eq_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc4,0x7c]
+v_cmps_ngt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xd6,0x7c]
 
-v_cmps_eq_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc5,0x7c]
+v_cmps_ngt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xd6,0x7c]
 
-v_cmps_eq_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xd6,0x7c]
 
-v_cmps_eq_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xc4,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xd6,0x7c]
 
-v_cmps_eq_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xc4,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xd6,0x7c]
 
-v_cmps_eq_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xc4,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xd6,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_eq_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xc4,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xd6,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_eq_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xc4,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xd6,0x7c]
 
-v_cmps_eq_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xc4,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xd6,0x7c]
 
-v_cmps_eq_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xc4,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_ngt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xd7,0x7c]
 
-v_cmps_eq_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_ngt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_eq_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_ngt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xd6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_eq_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_ngt_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xd6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_eq_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_ngt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xd6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_eq_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_ngt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xd6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_eq_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xc4,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_ngt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xd6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_eq_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xc4,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_ngt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xd6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_eq_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xc4,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_ngt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xd6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_eq_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_ngt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmps_eq_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xc4,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_ngt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xd6,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmps_eq_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_ngt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmps_eq_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_ngt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xd6,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmps_eq_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_ngt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmps_le_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc6,0x7c]
+v_cmps_ngt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmps_le_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc6,0x7c]
+v_cmps_ngt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmps_le_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xc6,0x7c]
+v_cmps_ngt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmps_le_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xc6,0x7c]
+v_cmps_ngt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmps_le_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc6,0x7c]
+v_cmps_ngt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmps_le_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc6,0x7c]
+v_cmps_ngt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmps_le_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc6,0x7c]
+v_cmps_nle_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xd8,0x7c]
 
-v_cmps_le_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc6,0x7c]
+v_cmps_nle_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xd8,0x7c]
 
-v_cmps_le_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc6,0x7c]
+v_cmps_nle_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xd8,0x7c]
 
-v_cmps_le_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc6,0x7c]
+v_cmps_nle_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xd8,0x7c]
 
-v_cmps_le_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc6,0x7c]
+v_cmps_nle_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xd8,0x7c]
 
-v_cmps_le_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc6,0x7c]
+v_cmps_nle_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xd8,0x7c]
 
-v_cmps_le_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc6,0x7c]
+v_cmps_nle_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xd8,0x7c]
 
-v_cmps_le_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc6,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_nle_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xd8,0x7c]
 
-v_cmps_le_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc6,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_nle_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xd8,0x7c]
 
-v_cmps_le_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc6,0x7c]
+v_cmps_nle_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xd8,0x7c]
 
-v_cmps_le_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc6,0x7c]
+v_cmps_nle_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xd8,0x7c]
 
-v_cmps_le_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc7,0x7c]
+v_cmps_nle_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xd8,0x7c]
 
-v_cmps_le_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc6,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xd8,0x7c]
 
-v_cmps_le_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xc6,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xd8,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_le_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xc6,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xd8,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_le_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xc6,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xd8,0x7c]
 
-v_cmps_le_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xc6,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xd8,0x7c]
 
-v_cmps_le_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xc6,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xd9,0x7c]
 
-v_cmps_le_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xc6,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_le_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xc6,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nle_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xd8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_le_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xc6,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_nle_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xd8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_le_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xc6,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_nle_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xd8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_le_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xc6,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_nle_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xd8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_le_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc6,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_nle_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xd8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_le_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xc6,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_nle_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xd8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_le_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xc6,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_nle_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xd8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_le_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xc6,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_nle_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmps_le_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xc6,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_nle_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xd8,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmps_le_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc6,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_nle_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmps_le_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xc6,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_nle_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xd8,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmps_le_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc6,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_nle_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmps_le_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xc6,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_nle_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmps_le_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xc6,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_nle_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmps_gt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc8,0x7c]
+v_cmps_nle_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmps_gt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc8,0x7c]
+v_cmps_nle_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmps_gt_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xc8,0x7c]
+v_cmps_nle_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmps_gt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xc8,0x7c]
+v_cmps_nle_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmps_gt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc8,0x7c]
+v_cmps_neq_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xda,0x7c]
 
-v_cmps_gt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc8,0x7c]
+v_cmps_neq_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xda,0x7c]
 
-v_cmps_gt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc8,0x7c]
+v_cmps_neq_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xda,0x7c]
 
-v_cmps_gt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc8,0x7c]
+v_cmps_neq_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xda,0x7c]
 
-v_cmps_gt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc8,0x7c]
+v_cmps_neq_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xda,0x7c]
 
-v_cmps_gt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc8,0x7c]
+v_cmps_neq_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xda,0x7c]
 
-v_cmps_gt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc8,0x7c]
+v_cmps_neq_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xda,0x7c]
 
-v_cmps_gt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc8,0x7c]
+v_cmps_neq_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xda,0x7c]
 
-v_cmps_gt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc8,0x7c]
+v_cmps_neq_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xda,0x7c]
 
-v_cmps_gt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc8,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_neq_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xda,0x7c]
 
-v_cmps_gt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc8,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_neq_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xda,0x7c]
 
-v_cmps_gt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc8,0x7c]
+v_cmps_neq_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xda,0x7c]
 
-v_cmps_gt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc8,0x7c]
+v_cmps_neq_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xda,0x7c]
 
-v_cmps_gt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc9,0x7c]
+v_cmps_neq_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xda,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_gt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_neq_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xda,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_gt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xc8,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_neq_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xda,0x7c]
 
-v_cmps_gt_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xc8,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_neq_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xda,0x7c]
 
-v_cmps_gt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xc8,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_neq_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xdb,0x7c]
 
-v_cmps_gt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xc8,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_neq_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xda,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_gt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xc8,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_neq_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xda,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_gt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xc8,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_neq_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xda,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_gt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xc8,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_neq_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xda,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_gt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_neq_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xda,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_gt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_neq_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xda,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_gt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_neq_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xda,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_gt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_neq_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xda,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_gt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_neq_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmps_gt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xc8,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_neq_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xda,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmps_gt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xc8,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_neq_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xda,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmps_gt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xc8,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_neq_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xda,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmps_gt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_neq_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xda,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmps_gt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xc8,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_neq_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xda,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmps_gt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_neq_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xda,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmps_gt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_neq_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xda,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmps_gt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_neq_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xda,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmps_lg_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xca,0x7c]
+v_cmps_neq_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xda,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmps_lg_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xca,0x7c]
+v_cmps_neq_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xda,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmps_lg_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xca,0x7c]
+v_cmps_nlt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xdc,0x7c]
 
-v_cmps_lg_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xca,0x7c]
+v_cmps_nlt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xdc,0x7c]
 
-v_cmps_lg_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xca,0x7c]
+v_cmps_nlt_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xdc,0x7c]
 
-v_cmps_lg_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xca,0x7c]
+v_cmps_nlt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xdc,0x7c]
 
-v_cmps_lg_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xca,0x7c]
+v_cmps_nlt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xdc,0x7c]
 
-v_cmps_lg_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xca,0x7c]
+v_cmps_nlt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xdc,0x7c]
 
-v_cmps_lg_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xca,0x7c]
+v_cmps_nlt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xdc,0x7c]
 
-v_cmps_lg_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xca,0x7c]
+v_cmps_nlt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xdc,0x7c]
 
-v_cmps_lg_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xca,0x7c]
+v_cmps_nlt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xdc,0x7c]
 
-v_cmps_lg_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xca,0x7c]
+v_cmps_nlt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xdc,0x7c]
 
-v_cmps_lg_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xca,0x7c]
+v_cmps_nlt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xdc,0x7c]
 
-v_cmps_lg_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xca,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_nlt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xdc,0x7c]
 
-v_cmps_lg_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xca,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_nlt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xdc,0x7c]
 
-v_cmps_lg_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xca,0x7c]
+v_cmps_nlt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xdc,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_lg_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xca,0x7c]
+v_cmps_nlt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xdc,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_lg_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xcb,0x7c]
+v_cmps_nlt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xdc,0x7c]
 
-v_cmps_lg_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xdc,0x7c]
 
-v_cmps_lg_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xca,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xdd,0x7c]
 
-v_cmps_lg_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xca,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_lg_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xca,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xdc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_lg_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xca,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlt_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xdc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_lg_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xca,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xdc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_lg_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xca,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xdc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_lg_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xca,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_nlt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xdc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_lg_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_nlt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xdc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_lg_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_nlt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xdc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_lg_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_nlt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmps_lg_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_nlt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xdc,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmps_lg_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_nlt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmps_lg_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xca,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_nlt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xdc,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmps_lg_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xca,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_nlt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmps_lg_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xca,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_nlt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmps_lg_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xca,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_nlt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmps_lg_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xca,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_nlt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmps_lg_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_nlt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmps_lg_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd0,0x00,0x00,0x00,0x40]
+v_cmps_nlt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmps_lg_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd0,0x00,0x00,0x00,0x60]
+v_cmps_nlt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmps_ge_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xcc,0x7c]
+v_cmps_tru_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xde,0x7c]
 
-v_cmps_ge_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xcc,0x7c]
+v_cmps_tru_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xde,0x7c]
 
-v_cmps_ge_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xcc,0x7c]
+v_cmps_tru_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xde,0x7c]
 
-v_cmps_ge_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xcc,0x7c]
+v_cmps_tru_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xde,0x7c]
 
-v_cmps_ge_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xcc,0x7c]
+v_cmps_tru_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xde,0x7c]
 
-v_cmps_ge_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xcc,0x7c]
+v_cmps_tru_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xde,0x7c]
 
-v_cmps_ge_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xcc,0x7c]
+v_cmps_tru_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xde,0x7c]
 
-v_cmps_ge_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xcc,0x7c]
+v_cmps_tru_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xde,0x7c]
 
-v_cmps_ge_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xcc,0x7c]
+v_cmps_tru_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xde,0x7c]
 
-v_cmps_ge_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xcc,0x7c]
+v_cmps_tru_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xde,0x7c]
 
-v_cmps_ge_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xcc,0x7c]
+v_cmps_tru_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xde,0x7c]
 
-v_cmps_ge_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xcc,0x7c]
+v_cmps_tru_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xde,0x7c]
 
-v_cmps_ge_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xcc,0x7c]
+v_cmps_tru_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xde,0x7c]
 
-v_cmps_ge_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xcc,0x7c,0x56,0x34,0x12,0xaf]
+v_cmps_tru_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xde,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_ge_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xcc,0x7c,0x73,0x72,0x71,0x3f]
+v_cmps_tru_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xde,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_ge_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xcc,0x7c]
+v_cmps_tru_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xde,0x7c]
 
-v_cmps_ge_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xcc,0x7c]
+v_cmps_tru_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xde,0x7c]
 
-v_cmps_ge_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xcd,0x7c]
+v_cmps_tru_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xdf,0x7c]
 
-v_cmps_ge_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_tru_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xde,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_ge_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xcc,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_tru_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xde,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_ge_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xcc,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_tru_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xde,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_ge_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xcc,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_tru_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xde,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_ge_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xcc,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_tru_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xde,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_ge_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xcc,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_tru_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xde,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_ge_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xcc,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_tru_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xde,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_ge_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xcc,0xd0,0x00,0x00,0x00,0x00]
+v_cmps_tru_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xde,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_ge_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd0,0x80,0x00,0x00,0x00]
+v_cmps_tru_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmps_ge_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd0,0xf0,0x00,0x00,0x00]
+v_cmps_tru_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xde,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmps_ge_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd0,0xfd,0x00,0x00,0x00]
+v_cmps_tru_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xde,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmps_ge_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd0,0x00,0x01,0x00,0x00]
+v_cmps_tru_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xde,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmps_ge_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd0,0xfe,0x01,0x00,0x00]
+v_cmps_tru_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xde,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmps_ge_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xcc,0xd0,0x00,0x00,0x01,0x00]
+v_cmps_tru_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xde,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmps_ge_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xcc,0xd0,0x00,0xe0,0x01,0x00]
+v_cmps_tru_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xde,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmps_ge_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xcc,0xd0,0x00,0xfa,0x01,0x00]
+v_cmps_tru_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xde,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmps_ge_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd0,0x00,0x00,0x02,0x00]
+v_cmps_tru_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xde,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmps_ge_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xcc,0xd0,0x00,0xfc,0x03,0x00]
+v_cmps_tru_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xde,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmps_ge_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd0,0x00,0x00,0x00,0x20]
+v_cmps_tru_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xde,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmps_ge_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_f_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe0,0x7c]
 
-v_cmps_ge_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_f_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe0,0x7c]
 
-v_cmps_o_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xce,0x7c]
+v_cmpsx_f_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xe0,0x7c]
 
-v_cmps_o_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xce,0x7c]
+v_cmpsx_f_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xe0,0x7c]
 
-v_cmps_o_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xce,0x7c]
+v_cmpsx_f_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe0,0x7c]
 
-v_cmps_o_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xce,0x7c]
+v_cmpsx_f_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe0,0x7c]
 
-v_cmps_o_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xce,0x7c]
+v_cmpsx_f_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe0,0x7c]
 
-v_cmps_o_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xce,0x7c]
+v_cmpsx_f_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe0,0x7c]
 
-v_cmps_o_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xce,0x7c]
+v_cmpsx_f_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe0,0x7c]
 
-v_cmps_o_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xce,0x7c]
+v_cmpsx_f_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe0,0x7c]
 
-v_cmps_o_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xce,0x7c]
+v_cmpsx_f_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe0,0x7c]
 
-v_cmps_o_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xce,0x7c]
+v_cmpsx_f_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe0,0x7c]
 
-v_cmps_o_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xce,0x7c]
+v_cmpsx_f_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe0,0x7c]
 
-v_cmps_o_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xce,0x7c]
+v_cmpsx_f_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe0,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_o_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xce,0x7c]
+v_cmpsx_f_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe0,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_o_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xce,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_f_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe0,0x7c]
 
-v_cmps_o_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xce,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_f_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe0,0x7c]
 
-v_cmps_o_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xce,0x7c]
+v_cmpsx_f_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe1,0x7c]
 
-v_cmps_o_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xce,0x7c]
+v_cmpsx_f_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_o_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xcf,0x7c]
+v_cmpsx_f_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_o_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_o_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xce,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xe0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_o_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xce,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_o_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xce,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_o_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xce,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_o_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xce,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_o_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xce,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmps_o_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xce,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_f_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmps_o_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_f_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmps_o_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_f_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmps_o_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_f_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe0,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmps_o_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_f_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe0,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmps_o_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpsx_f_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe0,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmps_o_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xce,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_f_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe0,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmps_o_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xce,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_f_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmps_o_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xce,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_f_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmps_o_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xce,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_f_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmps_o_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xce,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpsx_lt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe2,0x7c]
 
-v_cmps_o_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_lt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe2,0x7c]
 
-v_cmps_o_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_lt_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xe2,0x7c]
 
-v_cmps_o_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_lt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xe2,0x7c]
 
-v_cmps_u_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd0,0x7c]
+v_cmpsx_lt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe2,0x7c]
 
-v_cmps_u_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xd0,0x7c]
+v_cmpsx_lt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe2,0x7c]
 
-v_cmps_u_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xd0,0x7c]
+v_cmpsx_lt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe2,0x7c]
 
-v_cmps_u_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xd0,0x7c]
+v_cmpsx_lt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe2,0x7c]
 
-v_cmps_u_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xd0,0x7c]
+v_cmpsx_lt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe2,0x7c]
 
-v_cmps_u_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xd0,0x7c]
+v_cmpsx_lt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe2,0x7c]
 
-v_cmps_u_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xd0,0x7c]
+v_cmpsx_lt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe2,0x7c]
 
-v_cmps_u_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xd0,0x7c]
+v_cmpsx_lt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe2,0x7c]
 
-v_cmps_u_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xd0,0x7c]
+v_cmpsx_lt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe2,0x7c]
 
-v_cmps_u_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xd0,0x7c]
+v_cmpsx_lt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe2,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_u_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xd0,0x7c]
+v_cmpsx_lt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe2,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_u_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xd0,0x7c]
+v_cmpsx_lt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe2,0x7c]
 
-v_cmps_u_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xd0,0x7c]
+v_cmpsx_lt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe2,0x7c]
 
-v_cmps_u_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xd0,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_lt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe3,0x7c]
 
-v_cmps_u_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xd0,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_lt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_u_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xd0,0x7c]
+v_cmpsx_lt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_u_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xd0,0x7c]
+v_cmpsx_lt_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_u_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xd1,0x7c]
+v_cmpsx_lt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xe2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_u_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xd0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_u_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xd0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_u_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xd0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_u_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xd0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_u_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xd0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmps_u_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xd0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmps_u_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xd0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmps_u_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xd0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmps_u_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xd0,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_lt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe2,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmps_u_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xd0,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_lt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe2,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmps_u_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xd0,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_lt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe2,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmps_u_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xd0,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_lt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe2,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmps_u_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xd0,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpsx_lt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmps_u_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xd0,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_lt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmps_u_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xd0,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_lt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmps_u_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xd0,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_eq_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe4,0x7c]
 
-v_cmps_u_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd0,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_eq_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe4,0x7c]
 
-v_cmps_u_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xd0,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpsx_eq_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xe4,0x7c]
 
-v_cmps_u_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xd0,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_eq_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xe4,0x7c]
 
-v_cmps_u_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xd0,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_eq_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe4,0x7c]
 
-v_cmps_u_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xd0,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_eq_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe4,0x7c]
 
-v_cmps_nge_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd2,0x7c]
+v_cmpsx_eq_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe4,0x7c]
 
-v_cmps_nge_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xd2,0x7c]
+v_cmpsx_eq_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe4,0x7c]
 
-v_cmps_nge_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xd2,0x7c]
+v_cmpsx_eq_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe4,0x7c]
 
-v_cmps_nge_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xd2,0x7c]
+v_cmpsx_eq_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe4,0x7c]
 
-v_cmps_nge_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xd2,0x7c]
+v_cmpsx_eq_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe4,0x7c]
 
-v_cmps_nge_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xd2,0x7c]
+v_cmpsx_eq_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe4,0x7c]
 
-v_cmps_nge_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xd2,0x7c]
+v_cmpsx_eq_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe4,0x7c]
 
-v_cmps_nge_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xd2,0x7c]
+v_cmpsx_eq_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe4,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_nge_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xd2,0x7c]
+v_cmpsx_eq_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe4,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_nge_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xd2,0x7c]
+v_cmpsx_eq_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe4,0x7c]
 
-v_cmps_nge_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xd2,0x7c]
+v_cmpsx_eq_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe4,0x7c]
 
-v_cmps_nge_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xd2,0x7c]
+v_cmpsx_eq_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe5,0x7c]
 
-v_cmps_nge_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xd2,0x7c]
+v_cmpsx_eq_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nge_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xd2,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_eq_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nge_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xd2,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_eq_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nge_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xd2,0x7c]
+v_cmpsx_eq_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xe4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nge_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xd2,0x7c]
+v_cmpsx_eq_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nge_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xd3,0x7c]
+v_cmpsx_eq_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nge_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xd2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_eq_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nge_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xd2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_eq_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nge_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xd2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_eq_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmps_nge_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xd2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_eq_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmps_nge_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xd2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_eq_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmps_nge_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xd2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_eq_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmps_nge_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xd2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_eq_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe4,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmps_nge_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xd2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_eq_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe4,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmps_nge_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xd2,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_eq_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe4,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmps_nge_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xd2,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_eq_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe4,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmps_nge_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xd2,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_eq_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmps_nge_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xd2,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_eq_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmps_nge_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xd2,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpsx_eq_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmps_nge_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xd2,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_le_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe6,0x7c]
 
-v_cmps_nge_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xd2,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_le_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe6,0x7c]
 
-v_cmps_nge_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xd2,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_le_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xe6,0x7c]
 
-v_cmps_nge_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd2,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_le_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xe6,0x7c]
 
-v_cmps_nge_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xd2,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpsx_le_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe6,0x7c]
 
-v_cmps_nge_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xd2,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_le_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe6,0x7c]
 
-v_cmps_nge_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xd2,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_le_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe6,0x7c]
 
-v_cmps_nge_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xd2,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_le_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe6,0x7c]
 
-v_cmps_nlg_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd4,0x7c]
+v_cmpsx_le_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe6,0x7c]
 
-v_cmps_nlg_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xd4,0x7c]
+v_cmpsx_le_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe6,0x7c]
 
-v_cmps_nlg_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xd4,0x7c]
+v_cmpsx_le_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe6,0x7c]
 
-v_cmps_nlg_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xd4,0x7c]
+v_cmpsx_le_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe6,0x7c]
 
-v_cmps_nlg_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xd4,0x7c]
+v_cmpsx_le_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe6,0x7c]
 
-v_cmps_nlg_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xd4,0x7c]
+v_cmpsx_le_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe6,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_nlg_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xd4,0x7c]
+v_cmpsx_le_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe6,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_nlg_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xd4,0x7c]
+v_cmpsx_le_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe6,0x7c]
 
-v_cmps_nlg_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xd4,0x7c]
+v_cmpsx_le_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe6,0x7c]
 
-v_cmps_nlg_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xd4,0x7c]
+v_cmpsx_le_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe7,0x7c]
 
-v_cmps_nlg_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xd4,0x7c]
+v_cmpsx_le_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nlg_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xd4,0x7c]
+v_cmpsx_le_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nlg_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xd4,0x7c]
+v_cmpsx_le_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nlg_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xd4,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_le_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xe6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nlg_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xd4,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_le_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nlg_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xd4,0x7c]
+v_cmpsx_le_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nlg_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xd4,0x7c]
+v_cmpsx_le_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nlg_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xd5,0x7c]
+v_cmpsx_le_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nlg_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xd4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_le_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmps_nlg_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xd4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_le_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmps_nlg_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xd4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_le_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmps_nlg_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xd4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_le_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmps_nlg_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xd4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_le_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe6,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmps_nlg_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xd4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_le_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe6,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmps_nlg_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xd4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_le_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe6,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmps_nlg_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xd4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_le_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe6,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmps_nlg_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xd4,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_le_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmps_nlg_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xd4,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_le_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmps_nlg_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xd4,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_le_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmps_nlg_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xd4,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_gt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe8,0x7c]
 
-v_cmps_nlg_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xd4,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpsx_gt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe8,0x7c]
 
-v_cmps_nlg_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xd4,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_gt_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xe8,0x7c]
 
-v_cmps_nlg_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xd4,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_gt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xe8,0x7c]
 
-v_cmps_nlg_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xd4,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_gt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe8,0x7c]
 
-v_cmps_nlg_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd4,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_gt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe8,0x7c]
 
-v_cmps_nlg_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xd4,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpsx_gt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe8,0x7c]
 
-v_cmps_nlg_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xd4,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_gt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe8,0x7c]
 
-v_cmps_nlg_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xd4,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_gt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe8,0x7c]
 
-v_cmps_nlg_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xd4,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_gt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe8,0x7c]
 
-v_cmps_ngt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd6,0x7c]
+v_cmpsx_gt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe8,0x7c]
 
-v_cmps_ngt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xd6,0x7c]
+v_cmpsx_gt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe8,0x7c]
 
-v_cmps_ngt_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xd6,0x7c]
+v_cmpsx_gt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe8,0x7c]
 
-v_cmps_ngt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xd6,0x7c]
+v_cmpsx_gt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe8,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_ngt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xd6,0x7c]
+v_cmpsx_gt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe8,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_ngt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xd6,0x7c]
+v_cmpsx_gt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe8,0x7c]
 
-v_cmps_ngt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xd6,0x7c]
+v_cmpsx_gt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe8,0x7c]
 
-v_cmps_ngt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xd6,0x7c]
+v_cmpsx_gt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe9,0x7c]
 
-v_cmps_ngt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xd6,0x7c]
+v_cmpsx_gt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_ngt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xd6,0x7c]
+v_cmpsx_gt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_ngt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xd6,0x7c]
+v_cmpsx_gt_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_ngt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xd6,0x7c]
+v_cmpsx_gt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xe8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_ngt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xd6,0x7c]
+v_cmpsx_gt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_ngt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xd6,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_gt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_ngt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xd6,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_gt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_ngt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xd6,0x7c]
+v_cmpsx_gt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_ngt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xd6,0x7c]
+v_cmpsx_gt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmps_ngt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xd7,0x7c]
+v_cmpsx_gt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmps_ngt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xd6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_gt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmps_ngt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xd6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_gt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmps_ngt_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xd6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_gt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe8,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmps_ngt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xd6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_gt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe8,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmps_ngt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xd6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_gt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe8,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmps_ngt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xd6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_gt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe8,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmps_ngt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xd6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_gt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmps_ngt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xd6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_gt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmps_ngt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xd6,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_gt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmps_ngt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xd6,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_lg_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xea,0x7c]
 
-v_cmps_ngt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xd6,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_lg_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xea,0x7c]
 
-v_cmps_ngt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xd6,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_lg_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xea,0x7c]
 
-v_cmps_ngt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xd6,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpsx_lg_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xea,0x7c]
 
-v_cmps_ngt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xd6,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_lg_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xea,0x7c]
 
-v_cmps_ngt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xd6,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_lg_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xea,0x7c]
 
-v_cmps_ngt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xd6,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_lg_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xea,0x7c]
 
-v_cmps_ngt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd6,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_lg_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xea,0x7c]
 
-v_cmps_ngt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xd6,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpsx_lg_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xea,0x7c]
 
-v_cmps_ngt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xd6,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_lg_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xea,0x7c]
 
-v_cmps_ngt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xd6,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_lg_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xea,0x7c]
 
-v_cmps_ngt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xd6,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_lg_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xea,0x7c]
 
-v_cmps_nle_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd8,0x7c]
+v_cmpsx_lg_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xea,0x7c]
 
-v_cmps_nle_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xd8,0x7c]
+v_cmpsx_lg_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xea,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_nle_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xd8,0x7c]
+v_cmpsx_lg_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xea,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_nle_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xd8,0x7c]
+v_cmpsx_lg_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xea,0x7c]
 
-v_cmps_nle_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xd8,0x7c]
+v_cmpsx_lg_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xea,0x7c]
 
-v_cmps_nle_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xd8,0x7c]
+v_cmpsx_lg_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xeb,0x7c]
 
-v_cmps_nle_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xd8,0x7c]
+v_cmpsx_lg_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nle_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xd8,0x7c]
+v_cmpsx_lg_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xea,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nle_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xd8,0x7c]
+v_cmpsx_lg_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xea,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nle_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xd8,0x7c]
+v_cmpsx_lg_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xea,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nle_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xd8,0x7c]
+v_cmpsx_lg_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xea,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nle_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xd8,0x7c]
+v_cmpsx_lg_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xea,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nle_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xd8,0x7c]
+v_cmpsx_lg_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xea,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nle_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xd8,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_lg_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xea,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nle_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xd8,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_lg_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmps_nle_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xd8,0x7c]
+v_cmpsx_lg_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmps_nle_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xd8,0x7c]
+v_cmpsx_lg_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmps_nle_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xd9,0x7c]
+v_cmpsx_lg_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmps_nle_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xd8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xea,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmps_nle_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xd8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xea,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmps_nle_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xd8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xea,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmps_nle_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xd8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xea,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmps_nle_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xd8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmps_nle_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xd8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmps_nle_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xd8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_lg_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmps_nle_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xd8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ge_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xec,0x7c]
 
-v_cmps_nle_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xd8,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_ge_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xec,0x7c]
 
-v_cmps_nle_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xd8,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_ge_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xec,0x7c]
 
-v_cmps_nle_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xd8,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_ge_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xec,0x7c]
 
-v_cmps_nle_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xd8,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_ge_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xec,0x7c]
 
-v_cmps_nle_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xd8,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpsx_ge_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xec,0x7c]
 
-v_cmps_nle_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xd8,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_ge_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xec,0x7c]
 
-v_cmps_nle_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xd8,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_ge_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xec,0x7c]
 
-v_cmps_nle_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xd8,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_ge_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xec,0x7c]
 
-v_cmps_nle_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd8,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_ge_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xec,0x7c]
 
-v_cmps_nle_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xd8,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpsx_ge_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xec,0x7c]
 
-v_cmps_nle_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xd8,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_ge_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xec,0x7c]
 
-v_cmps_nle_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xd8,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_ge_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xec,0x7c]
 
-v_cmps_nle_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xd8,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_ge_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xec,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_neq_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xda,0x7c]
+v_cmpsx_ge_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xec,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_neq_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xda,0x7c]
+v_cmpsx_ge_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xec,0x7c]
 
-v_cmps_neq_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xda,0x7c]
+v_cmpsx_ge_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xec,0x7c]
 
-v_cmps_neq_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xda,0x7c]
+v_cmpsx_ge_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xed,0x7c]
 
-v_cmps_neq_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xda,0x7c]
+v_cmpsx_ge_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_neq_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xda,0x7c]
+v_cmpsx_ge_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xec,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_neq_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xda,0x7c]
+v_cmpsx_ge_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xec,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_neq_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xda,0x7c]
+v_cmpsx_ge_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xec,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_neq_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xda,0x7c]
+v_cmpsx_ge_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xec,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_neq_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xda,0x7c]
+v_cmpsx_ge_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xec,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_neq_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xda,0x7c]
+v_cmpsx_ge_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xec,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_neq_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xda,0x7c]
+v_cmpsx_ge_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xec,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_neq_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xda,0x7c]
+v_cmpsx_ge_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmps_neq_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xda,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_ge_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmps_neq_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xda,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_ge_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmps_neq_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xda,0x7c]
+v_cmpsx_ge_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmps_neq_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xda,0x7c]
+v_cmpsx_ge_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xec,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmps_neq_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xdb,0x7c]
+v_cmpsx_ge_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xec,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmps_neq_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xda,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ge_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xec,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmps_neq_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xda,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ge_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xec,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmps_neq_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xda,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ge_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmps_neq_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xda,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ge_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmps_neq_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xda,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ge_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmps_neq_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xda,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_o_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xee,0x7c]
 
-v_cmps_neq_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xda,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_o_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xee,0x7c]
 
-v_cmps_neq_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xda,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_o_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xee,0x7c]
 
-v_cmps_neq_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xda,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_o_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xee,0x7c]
 
-v_cmps_neq_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xda,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_o_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xee,0x7c]
 
-v_cmps_neq_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xda,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_o_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xee,0x7c]
 
-v_cmps_neq_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xda,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_o_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xee,0x7c]
 
-v_cmps_neq_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xda,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpsx_o_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xee,0x7c]
 
-v_cmps_neq_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xda,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_o_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xee,0x7c]
 
-v_cmps_neq_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xda,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_o_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xee,0x7c]
 
-v_cmps_neq_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xda,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_o_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xee,0x7c]
 
-v_cmps_neq_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xda,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_o_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xee,0x7c]
 
-v_cmps_neq_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xda,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpsx_o_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xee,0x7c]
 
-v_cmps_neq_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xda,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_o_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xee,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_neq_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xda,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_o_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xee,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_neq_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xda,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_o_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xee,0x7c]
 
-v_cmps_nlt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xdc,0x7c]
+v_cmpsx_o_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xee,0x7c]
 
-v_cmps_nlt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xdc,0x7c]
+v_cmpsx_o_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xef,0x7c]
 
-v_cmps_nlt_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xdc,0x7c]
+v_cmpsx_o_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nlt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xdc,0x7c]
+v_cmpsx_o_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xee,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nlt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xdc,0x7c]
+v_cmpsx_o_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xee,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nlt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xdc,0x7c]
+v_cmpsx_o_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xee,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nlt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xdc,0x7c]
+v_cmpsx_o_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xee,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nlt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xdc,0x7c]
+v_cmpsx_o_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xee,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nlt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xdc,0x7c]
+v_cmpsx_o_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xee,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nlt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xdc,0x7c]
+v_cmpsx_o_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xee,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_nlt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xdc,0x7c]
+v_cmpsx_o_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmps_nlt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xdc,0x7c]
+v_cmpsx_o_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmps_nlt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xdc,0x7c]
+v_cmpsx_o_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmps_nlt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xdc,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_o_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmps_nlt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xdc,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_o_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xee,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmps_nlt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xdc,0x7c]
+v_cmpsx_o_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xee,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmps_nlt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xdc,0x7c]
+v_cmpsx_o_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xee,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmps_nlt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xdd,0x7c]
+v_cmpsx_o_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xee,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmps_nlt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xdc,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_o_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmps_nlt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xdc,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_o_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmps_nlt_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xdc,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_o_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmps_nlt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xdc,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_u_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xf0,0x7c]
 
-v_cmps_nlt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xdc,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_u_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xf0,0x7c]
 
-v_cmps_nlt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xdc,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_u_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xf0,0x7c]
 
-v_cmps_nlt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xdc,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_u_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xf0,0x7c]
 
-v_cmps_nlt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xdc,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_u_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xf0,0x7c]
 
-v_cmps_nlt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xdc,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_u_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xf0,0x7c]
 
-v_cmps_nlt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xdc,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_u_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xf0,0x7c]
 
-v_cmps_nlt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xdc,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_u_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xf0,0x7c]
 
-v_cmps_nlt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xdc,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_u_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xf0,0x7c]
 
-v_cmps_nlt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xdc,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpsx_u_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xf0,0x7c]
 
-v_cmps_nlt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xdc,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_u_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xf0,0x7c]
 
-v_cmps_nlt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xdc,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_u_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xf0,0x7c]
 
-v_cmps_nlt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xdc,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_u_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xf0,0x7c]
 
-v_cmps_nlt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xdc,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_u_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xf0,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_nlt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xdc,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpsx_u_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xf0,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_nlt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xdc,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_u_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xf0,0x7c]
 
-v_cmps_nlt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xdc,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_u_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xf0,0x7c]
 
-v_cmps_nlt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xdc,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_u_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xf1,0x7c]
 
-v_cmps_tru_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xde,0x7c]
+v_cmpsx_u_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_tru_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xde,0x7c]
+v_cmpsx_u_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xf0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_tru_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xde,0x7c]
+v_cmpsx_u_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xf0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_tru_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xde,0x7c]
+v_cmpsx_u_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xf0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_tru_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xde,0x7c]
+v_cmpsx_u_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xf0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_tru_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xde,0x7c]
+v_cmpsx_u_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xf0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_tru_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xde,0x7c]
+v_cmpsx_u_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xf0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_tru_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xde,0x7c]
+v_cmpsx_u_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xf0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_tru_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xde,0x7c]
+v_cmpsx_u_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xf0,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmps_tru_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xde,0x7c]
+v_cmpsx_u_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xf0,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmps_tru_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xde,0x7c]
+v_cmpsx_u_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xf0,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmps_tru_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xde,0x7c]
+v_cmpsx_u_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xf0,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmps_tru_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xde,0x7c]
+v_cmpsx_u_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xf0,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmps_tru_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xde,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_u_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xf0,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmps_tru_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xde,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_u_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xf0,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmps_tru_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xde,0x7c]
+v_cmpsx_u_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xf0,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmps_tru_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xde,0x7c]
+v_cmpsx_u_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf0,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmps_tru_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xdf,0x7c]
+v_cmpsx_u_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xf0,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmps_tru_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xde,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_u_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xf0,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmps_tru_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xde,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nge_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xf2,0x7c]
 
-v_cmps_tru_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xde,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nge_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xf2,0x7c]
 
-v_cmps_tru_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xde,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nge_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xf2,0x7c]
 
-v_cmps_tru_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xde,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nge_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xf2,0x7c]
 
-v_cmps_tru_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xde,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nge_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xf2,0x7c]
 
-v_cmps_tru_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xde,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nge_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xf2,0x7c]
 
-v_cmps_tru_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xde,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nge_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xf2,0x7c]
 
-v_cmps_tru_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xde,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_nge_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xf2,0x7c]
 
-v_cmps_tru_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xde,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_nge_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xf2,0x7c]
 
-v_cmps_tru_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xde,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_nge_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xf2,0x7c]
 
-v_cmps_tru_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xde,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_nge_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xf2,0x7c]
 
-v_cmps_tru_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xde,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpsx_nge_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xf2,0x7c]
 
-v_cmps_tru_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xde,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_nge_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xf2,0x7c]
 
-v_cmps_tru_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xde,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_nge_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xf2,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmps_tru_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xde,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_nge_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xf2,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmps_tru_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xde,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_nge_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xf2,0x7c]
 
-v_cmps_tru_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xde,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpsx_nge_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xf2,0x7c]
 
-v_cmps_tru_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xde,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_nge_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xf3,0x7c]
 
-v_cmps_tru_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xde,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_nge_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmps_tru_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xde,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_nge_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xf2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_f_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe0,0x7c]
+v_cmpsx_nge_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xf2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_f_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe0,0x7c]
+v_cmpsx_nge_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xf2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_f_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xe0,0x7c]
+v_cmpsx_nge_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xf2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_f_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xe0,0x7c]
+v_cmpsx_nge_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xf2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_f_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe0,0x7c]
+v_cmpsx_nge_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xf2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_f_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe0,0x7c]
+v_cmpsx_nge_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xf2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_f_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe0,0x7c]
+v_cmpsx_nge_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xf2,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpsx_f_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe0,0x7c]
+v_cmpsx_nge_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xf2,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpsx_f_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe0,0x7c]
+v_cmpsx_nge_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xf2,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpsx_f_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe0,0x7c]
+v_cmpsx_nge_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xf2,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpsx_f_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe0,0x7c]
+v_cmpsx_nge_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xf2,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpsx_f_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe0,0x7c]
+v_cmpsx_nge_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xf2,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpsx_f_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe0,0x7c]
+v_cmpsx_nge_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xf2,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpsx_f_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe0,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_nge_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xf2,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpsx_f_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe0,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_nge_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf2,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpsx_f_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe0,0x7c]
+v_cmpsx_nge_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xf2,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpsx_f_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe0,0x7c]
+v_cmpsx_nge_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xf2,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpsx_f_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe1,0x7c]
+v_cmpsx_nlg_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xf4,0x7c]
 
-v_cmpsx_f_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xf4,0x7c]
 
-v_cmpsx_f_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xf4,0x7c]
 
-v_cmpsx_f_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xf4,0x7c]
 
-v_cmpsx_f_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xe0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xf4,0x7c]
 
-v_cmpsx_f_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xf4,0x7c]
 
-v_cmpsx_f_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xf4,0x7c]
 
-v_cmpsx_f_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xf4,0x7c]
 
-v_cmpsx_f_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlg_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xf4,0x7c]
 
-v_cmpsx_f_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_nlg_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xf4,0x7c]
 
-v_cmpsx_f_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_nlg_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xf4,0x7c]
 
-v_cmpsx_f_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_nlg_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xf4,0x7c]
 
-v_cmpsx_f_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_nlg_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xf4,0x7c]
 
-v_cmpsx_f_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpsx_nlg_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xf4,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_f_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe0,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_nlg_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xf4,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_f_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe0,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_nlg_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xf4,0x7c]
 
-v_cmpsx_f_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xe0,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_nlg_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xf4,0x7c]
 
-v_cmpsx_f_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_nlg_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xf5,0x7c]
 
-v_cmpsx_f_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe0,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpsx_nlg_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_f_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_nlg_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xf4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_f_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_nlg_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xf4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_f_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_nlg_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xf4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_lt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe2,0x7c]
+v_cmpsx_nlg_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xf4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_lt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe2,0x7c]
+v_cmpsx_nlg_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xf4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_lt_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xe2,0x7c]
+v_cmpsx_nlg_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xf4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_lt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xe2,0x7c]
+v_cmpsx_nlg_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xf4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_lt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe2,0x7c]
+v_cmpsx_nlg_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xf4,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpsx_lt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe2,0x7c]
+v_cmpsx_nlg_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xf4,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpsx_lt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe2,0x7c]
+v_cmpsx_nlg_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xf4,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpsx_lt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe2,0x7c]
+v_cmpsx_nlg_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xf4,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpsx_lt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe2,0x7c]
+v_cmpsx_nlg_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xf4,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpsx_lt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe2,0x7c]
+v_cmpsx_nlg_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xf4,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpsx_lt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe2,0x7c]
+v_cmpsx_nlg_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xf4,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpsx_lt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe2,0x7c]
+v_cmpsx_nlg_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xf4,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpsx_lt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe2,0x7c]
+v_cmpsx_nlg_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf4,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpsx_lt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe2,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_nlg_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xf4,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpsx_lt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe2,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_nlg_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xf4,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpsx_lt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe2,0x7c]
+v_cmpsx_ngt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xf6,0x7c]
 
-v_cmpsx_lt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe2,0x7c]
+v_cmpsx_ngt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xf6,0x7c]
 
-v_cmpsx_lt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe3,0x7c]
+v_cmpsx_ngt_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xf6,0x7c]
 
-v_cmpsx_lt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ngt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xf6,0x7c]
 
-v_cmpsx_lt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ngt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xf6,0x7c]
 
-v_cmpsx_lt_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ngt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xf6,0x7c]
 
-v_cmpsx_lt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xe2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ngt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xf6,0x7c]
 
-v_cmpsx_lt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ngt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xf6,0x7c]
 
-v_cmpsx_lt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ngt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xf6,0x7c]
 
-v_cmpsx_lt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ngt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xf6,0x7c]
 
-v_cmpsx_lt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_ngt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xf6,0x7c]
 
-v_cmpsx_lt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_ngt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xf6,0x7c]
 
-v_cmpsx_lt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_ngt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xf6,0x7c]
 
-v_cmpsx_lt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_ngt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xf6,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_lt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_ngt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xf6,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_lt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpsx_ngt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xf6,0x7c]
 
-v_cmpsx_lt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe2,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_ngt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xf6,0x7c]
 
-v_cmpsx_lt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe2,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_ngt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xf7,0x7c]
 
-v_cmpsx_lt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xe2,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_ngt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_lt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_ngt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xf6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_lt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe2,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpsx_ngt_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xf6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_lt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_ngt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xf6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_lt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_ngt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xf6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_lt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_ngt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xf6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_eq_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe4,0x7c]
+v_cmpsx_ngt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xf6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_eq_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe4,0x7c]
+v_cmpsx_ngt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xf6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_eq_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xe4,0x7c]
+v_cmpsx_ngt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xf6,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpsx_eq_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xe4,0x7c]
+v_cmpsx_ngt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xf6,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpsx_eq_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe4,0x7c]
+v_cmpsx_ngt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xf6,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpsx_eq_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe4,0x7c]
+v_cmpsx_ngt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xf6,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpsx_eq_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe4,0x7c]
+v_cmpsx_ngt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xf6,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpsx_eq_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe4,0x7c]
+v_cmpsx_ngt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xf6,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpsx_eq_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe4,0x7c]
+v_cmpsx_ngt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xf6,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpsx_eq_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe4,0x7c]
+v_cmpsx_ngt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xf6,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpsx_eq_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe4,0x7c]
+v_cmpsx_ngt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf6,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpsx_eq_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe4,0x7c]
+v_cmpsx_ngt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xf6,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpsx_eq_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe4,0x7c]
+v_cmpsx_ngt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xf6,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpsx_eq_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe4,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_nle_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xf8,0x7c]
 
-v_cmpsx_eq_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe4,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_nle_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xf8,0x7c]
 
-v_cmpsx_eq_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe4,0x7c]
+v_cmpsx_nle_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xf8,0x7c]
 
-v_cmpsx_eq_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe4,0x7c]
+v_cmpsx_nle_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xf8,0x7c]
 
-v_cmpsx_eq_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe5,0x7c]
+v_cmpsx_nle_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xf8,0x7c]
 
-v_cmpsx_eq_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nle_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xf8,0x7c]
 
-v_cmpsx_eq_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nle_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xf8,0x7c]
 
-v_cmpsx_eq_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nle_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xf8,0x7c]
 
-v_cmpsx_eq_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xe4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nle_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xf8,0x7c]
 
-v_cmpsx_eq_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nle_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xf8,0x7c]
 
-v_cmpsx_eq_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nle_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xf8,0x7c]
 
-v_cmpsx_eq_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nle_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xf8,0x7c]
 
-v_cmpsx_eq_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nle_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xf8,0x7c]
 
-v_cmpsx_eq_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_nle_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xf8,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_eq_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_nle_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xf8,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_eq_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_nle_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xf8,0x7c]
 
-v_cmpsx_eq_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_nle_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xf8,0x7c]
 
-v_cmpsx_eq_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpsx_nle_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xf9,0x7c]
 
-v_cmpsx_eq_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe4,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_nle_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_eq_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe4,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_nle_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xf8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_eq_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xe4,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_nle_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xf8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_eq_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_nle_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xf8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_eq_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe4,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpsx_nle_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xf8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_eq_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_nle_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xf8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_eq_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_nle_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xf8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_eq_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_nle_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xf8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_le_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe6,0x7c]
+v_cmpsx_nle_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xf8,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpsx_le_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe6,0x7c]
+v_cmpsx_nle_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xf8,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpsx_le_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xe6,0x7c]
+v_cmpsx_nle_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xf8,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpsx_le_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xe6,0x7c]
+v_cmpsx_nle_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xf8,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpsx_le_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe6,0x7c]
+v_cmpsx_nle_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xf8,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpsx_le_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe6,0x7c]
+v_cmpsx_nle_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xf8,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpsx_le_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe6,0x7c]
+v_cmpsx_nle_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xf8,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpsx_le_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe6,0x7c]
+v_cmpsx_nle_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xf8,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpsx_le_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe6,0x7c]
+v_cmpsx_nle_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf8,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpsx_le_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe6,0x7c]
+v_cmpsx_nle_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xf8,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpsx_le_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe6,0x7c]
+v_cmpsx_nle_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xf8,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpsx_le_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe6,0x7c]
+v_cmpsx_neq_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xfa,0x7c]
 
-v_cmpsx_le_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe6,0x7c]
+v_cmpsx_neq_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xfa,0x7c]
 
-v_cmpsx_le_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe6,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_neq_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xfa,0x7c]
 
-v_cmpsx_le_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe6,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_neq_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xfa,0x7c]
 
-v_cmpsx_le_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe6,0x7c]
+v_cmpsx_neq_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xfa,0x7c]
 
-v_cmpsx_le_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe6,0x7c]
+v_cmpsx_neq_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xfa,0x7c]
 
-v_cmpsx_le_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe7,0x7c]
+v_cmpsx_neq_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xfa,0x7c]
 
-v_cmpsx_le_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_neq_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xfa,0x7c]
 
-v_cmpsx_le_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_neq_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xfa,0x7c]
 
-v_cmpsx_le_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_neq_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xfa,0x7c]
 
-v_cmpsx_le_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xe6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_neq_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xfa,0x7c]
 
-v_cmpsx_le_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_neq_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xfa,0x7c]
 
-v_cmpsx_le_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_neq_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xfa,0x7c]
 
-v_cmpsx_le_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_neq_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xfa,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_le_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_neq_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xfa,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_le_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_neq_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xfa,0x7c]
 
-v_cmpsx_le_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_neq_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xfa,0x7c]
 
-v_cmpsx_le_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_neq_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xfb,0x7c]
 
-v_cmpsx_le_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_neq_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xfa,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_le_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpsx_neq_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xfa,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_le_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe6,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_neq_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xfa,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_le_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe6,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_neq_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xfa,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_le_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xe6,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_neq_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xfa,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_le_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_neq_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xfa,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_le_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe6,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpsx_neq_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xfa,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_le_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_neq_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xfa,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_le_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_neq_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xfa,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpsx_le_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_neq_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xfa,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpsx_gt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe8,0x7c]
+v_cmpsx_neq_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xfa,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpsx_gt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe8,0x7c]
+v_cmpsx_neq_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xfa,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpsx_gt_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xe8,0x7c]
+v_cmpsx_neq_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xfa,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpsx_gt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xe8,0x7c]
+v_cmpsx_neq_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xfa,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpsx_gt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe8,0x7c]
+v_cmpsx_neq_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xfa,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpsx_gt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe8,0x7c]
+v_cmpsx_neq_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xfa,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpsx_gt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe8,0x7c]
+v_cmpsx_neq_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xfa,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpsx_gt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe8,0x7c]
+v_cmpsx_neq_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xfa,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpsx_gt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe8,0x7c]
+v_cmpsx_neq_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xfa,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpsx_gt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe8,0x7c]
+v_cmpsx_nlt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xfc,0x7c]
 
-v_cmpsx_gt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe8,0x7c]
+v_cmpsx_nlt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xfc,0x7c]
 
-v_cmpsx_gt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe8,0x7c]
+v_cmpsx_nlt_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xfc,0x7c]
 
-v_cmpsx_gt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe8,0x7c]
+v_cmpsx_nlt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xfc,0x7c]
 
-v_cmpsx_gt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe8,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_nlt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xfc,0x7c]
 
-v_cmpsx_gt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe8,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_nlt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xfc,0x7c]
 
-v_cmpsx_gt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe8,0x7c]
+v_cmpsx_nlt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xfc,0x7c]
 
-v_cmpsx_gt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe8,0x7c]
+v_cmpsx_nlt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xfc,0x7c]
 
-v_cmpsx_gt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe9,0x7c]
+v_cmpsx_nlt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xfc,0x7c]
 
-v_cmpsx_gt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xfc,0x7c]
 
-v_cmpsx_gt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xfc,0x7c]
 
-v_cmpsx_gt_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xfc,0x7c]
 
-v_cmpsx_gt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xe8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xfc,0x7c]
 
-v_cmpsx_gt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xfc,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_gt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xfc,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_gt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xfc,0x7c]
 
-v_cmpsx_gt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_nlt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xfc,0x7c]
 
-v_cmpsx_gt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_nlt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xfd,0x7c]
 
-v_cmpsx_gt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_nlt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xfc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_gt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_nlt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xfc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_gt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_nlt_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xfc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_gt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpsx_nlt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xfc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_gt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe8,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_nlt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xfc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_gt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe8,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_nlt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xfc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_gt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xe8,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_nlt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xfc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_gt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_nlt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xfc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_gt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe8,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpsx_nlt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xfc,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpsx_gt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_nlt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xfc,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpsx_gt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_nlt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xfc,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpsx_gt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_nlt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xfc,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpsx_lg_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xea,0x7c]
+v_cmpsx_nlt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xfc,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpsx_lg_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xea,0x7c]
+v_cmpsx_nlt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xfc,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpsx_lg_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xea,0x7c]
+v_cmpsx_nlt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xfc,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpsx_lg_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xea,0x7c]
+v_cmpsx_nlt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xfc,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpsx_lg_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xea,0x7c]
+v_cmpsx_nlt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xfc,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpsx_lg_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xea,0x7c]
+v_cmpsx_nlt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xfc,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpsx_lg_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xea,0x7c]
+v_cmpsx_nlt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xfc,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpsx_lg_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xea,0x7c]
+v_cmpsx_tru_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xfe,0x7c]
 
-v_cmpsx_lg_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xea,0x7c]
+v_cmpsx_tru_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xfe,0x7c]
 
-v_cmpsx_lg_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xea,0x7c]
+v_cmpsx_tru_f64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xfe,0x7c]
 
-v_cmpsx_lg_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xea,0x7c]
+v_cmpsx_tru_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xfe,0x7c]
 
-v_cmpsx_lg_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xea,0x7c]
+v_cmpsx_tru_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xfe,0x7c]
 
-v_cmpsx_lg_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xea,0x7c]
+v_cmpsx_tru_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xfe,0x7c]
 
-v_cmpsx_lg_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xea,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpsx_tru_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xfe,0x7c]
 
-v_cmpsx_lg_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xea,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpsx_tru_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xfe,0x7c]
 
-v_cmpsx_lg_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xea,0x7c]
+v_cmpsx_tru_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xfe,0x7c]
 
-v_cmpsx_lg_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xea,0x7c]
+v_cmpsx_tru_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xfe,0x7c]
 
-v_cmpsx_lg_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xeb,0x7c]
+v_cmpsx_tru_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xfe,0x7c]
 
-v_cmpsx_lg_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xfe,0x7c]
 
-v_cmpsx_lg_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xea,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xfe,0x7c]
 
-v_cmpsx_lg_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xea,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xfe,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_lg_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xea,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xfe,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_lg_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xea,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xfe,0x7c]
 
-v_cmpsx_lg_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xea,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xfe,0x7c]
 
-v_cmpsx_lg_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xea,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xff,0x7c]
 
-v_cmpsx_lg_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xea,0xd0,0x00,0x00,0x00,0x00]
+v_cmpsx_tru_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xfe,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_lg_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd0,0x80,0x00,0x00,0x00]
+v_cmpsx_tru_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xfe,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_lg_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpsx_tru_f64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xfe,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_lg_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpsx_tru_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xfe,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_lg_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd0,0x00,0x01,0x00,0x00]
+v_cmpsx_tru_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xfe,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_lg_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpsx_tru_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xfe,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_lg_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xea,0xd0,0x00,0x00,0x01,0x00]
+v_cmpsx_tru_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xfe,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_lg_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xea,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpsx_tru_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xfe,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpsx_lg_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xea,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpsx_tru_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xfe,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpsx_lg_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xea,0xd0,0x00,0x00,0x02,0x00]
+v_cmpsx_tru_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xfe,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpsx_lg_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xea,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpsx_tru_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xfe,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpsx_lg_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd0,0x00,0x00,0x00,0x20]
+v_cmpsx_tru_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xfe,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpsx_lg_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd0,0x00,0x00,0x00,0x40]
+v_cmpsx_tru_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xfe,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpsx_lg_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd0,0x00,0x00,0x00,0x60]
+v_cmpsx_tru_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xfe,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpsx_ge_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xec,0x7c]
+v_cmpsx_tru_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xfe,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpsx_ge_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xec,0x7c]
+v_cmpsx_tru_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xfe,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpsx_ge_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xec,0x7c]
+v_cmpsx_tru_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xfe,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpsx_ge_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xec,0x7c]
+v_cmpsx_tru_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xfe,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpsx_ge_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xec,0x7c]
+v_cmpsx_tru_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0xfe,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpsx_ge_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xec,0x7c]
+v_cmp_f_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x00,0x7d]
 
-v_cmpsx_ge_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xec,0x7c]
+v_cmp_f_i32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x00,0x7d]
 
-v_cmpsx_ge_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xec,0x7c]
+v_cmp_f_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x00,0x7d]
 
-v_cmpsx_ge_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xec,0x7c]
+v_cmp_f_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x00,0x7d]
 
-v_cmpsx_ge_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xec,0x7c]
+v_cmp_f_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x00,0x7d]
 
-v_cmpsx_ge_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xec,0x7c]
+v_cmp_f_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x00,0x7d]
 
-v_cmpsx_ge_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xec,0x7c]
+v_cmp_f_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x00,0x7d]
 
-v_cmpsx_ge_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xec,0x7c]
+v_cmp_f_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x00,0x7d]
 
-v_cmpsx_ge_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xec,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_f_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x00,0x7d]
 
-v_cmpsx_ge_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xec,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_f_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x00,0x7d]
 
-v_cmpsx_ge_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xec,0x7c]
+v_cmp_f_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x00,0x7d]
 
-v_cmpsx_ge_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xec,0x7c]
+v_cmp_f_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x00,0x7d]
 
-v_cmpsx_ge_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xed,0x7c]
+v_cmp_f_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x00,0x7d]
 
-v_cmpsx_ge_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x00,0x7d]
 
-v_cmpsx_ge_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xec,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x00,0x7d]
 
-v_cmpsx_ge_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xec,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x00,0x7d]
 
-v_cmpsx_ge_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xec,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x00,0x7d]
 
-v_cmpsx_ge_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xec,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x00,0x7d]
 
-v_cmpsx_ge_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xec,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x00,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_ge_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xec,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x00,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_ge_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xec,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x00,0x7d]
 
-v_cmpsx_ge_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x00,0x7d]
 
-v_cmpsx_ge_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x01,0x7d]
 
-v_cmpsx_ge_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_f_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ge_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_f_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x00,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ge_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_f_i32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x00,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ge_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xec,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_f_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x00,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ge_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xec,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_f_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x00,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ge_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xec,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_f_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x00,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ge_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xec,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_f_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x00,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ge_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xec,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_f_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x00,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ge_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_f_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x00,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmpsx_ge_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_f_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x00,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmpsx_ge_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_f_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x00,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmpsx_o_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xee,0x7c]
+v_cmp_f_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x00,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmpsx_o_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xee,0x7c]
+v_cmp_f_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x00,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmpsx_o_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xee,0x7c]
+v_cmp_f_i32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmpsx_o_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xee,0x7c]
+v_cmp_f_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmpsx_o_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xee,0x7c]
+v_cmp_f_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmpsx_o_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xee,0x7c]
+v_cmp_f_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmpsx_o_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xee,0x7c]
+v_cmp_f_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmpsx_o_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xee,0x7c]
+v_cmp_f_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpsx_o_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xee,0x7c]
+v_cmp_f_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpsx_o_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xee,0x7c]
+v_cmp_f_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpsx_o_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xee,0x7c]
+v_cmp_f_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpsx_o_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xee,0x7c]
+v_cmp_f_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpsx_o_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xee,0x7c]
+v_cmp_f_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpsx_o_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xee,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_f_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpsx_o_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xee,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_f_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpsx_o_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xee,0x7c]
+v_cmp_f_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpsx_o_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xee,0x7c]
+v_cmp_f_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpsx_o_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xef,0x7c]
+v_cmp_f_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpsx_o_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpsx_o_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xee,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpsx_o_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xee,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x00,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmpsx_o_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xee,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x02,0x7d]
 
-v_cmpsx_o_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xee,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x02,0x7d]
 
-v_cmpsx_o_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xee,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x02,0x7d]
 
-v_cmpsx_o_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xee,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x02,0x7d]
 
-v_cmpsx_o_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xee,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x02,0x7d]
 
-v_cmpsx_o_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_lt_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x02,0x7d]
 
-v_cmpsx_o_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_lt_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x02,0x7d]
 
-v_cmpsx_o_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_lt_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x02,0x7d]
 
-v_cmpsx_o_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_lt_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x02,0x7d]
 
-v_cmpsx_o_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_lt_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x02,0x7d]
 
-v_cmpsx_o_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xee,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_lt_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x02,0x7d]
 
-v_cmpsx_o_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xee,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_lt_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x02,0x7d]
 
-v_cmpsx_o_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xee,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_lt_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x02,0x7d]
 
-v_cmpsx_o_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xee,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_lt_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x02,0x7d]
 
-v_cmpsx_o_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xee,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_lt_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x02,0x7d]
 
-v_cmpsx_o_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_lt_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x02,0x7d]
 
-v_cmpsx_o_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_lt_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x02,0x7d]
 
-v_cmpsx_o_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_lt_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x02,0x7d]
 
-v_cmpsx_u_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf0,0x7c]
+v_cmp_lt_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x02,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_u_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xf0,0x7c]
+v_cmp_lt_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x02,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_u_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xf0,0x7c]
+v_cmp_lt_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x02,0x7d]
 
-v_cmpsx_u_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xf0,0x7c]
+v_cmp_lt_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x02,0x7d]
 
-v_cmpsx_u_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xf0,0x7c]
+v_cmp_lt_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x03,0x7d]
 
-v_cmpsx_u_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xf0,0x7c]
+v_cmp_lt_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_u_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xf0,0x7c]
+v_cmp_lt_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x02,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_u_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xf0,0x7c]
+v_cmp_lt_i32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x02,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_u_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xf0,0x7c]
+v_cmp_lt_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x02,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_u_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xf0,0x7c]
+v_cmp_lt_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x02,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_u_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xf0,0x7c]
+v_cmp_lt_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x02,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_u_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xf0,0x7c]
+v_cmp_lt_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x02,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_u_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xf0,0x7c]
+v_cmp_lt_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x02,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_u_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xf0,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_lt_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x02,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmpsx_u_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xf0,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_lt_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x02,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmpsx_u_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xf0,0x7c]
+v_cmp_lt_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x02,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmpsx_u_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xf0,0x7c]
+v_cmp_lt_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x02,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmpsx_u_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xf1,0x7c]
+v_cmp_lt_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x02,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmpsx_u_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmpsx_u_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xf0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmpsx_u_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xf0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmpsx_u_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xf0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmpsx_u_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xf0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmpsx_u_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xf0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpsx_u_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xf0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpsx_u_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xf0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpsx_u_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xf0,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpsx_u_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xf0,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpsx_u_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xf0,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpsx_u_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf0,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpsx_u_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xf0,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpsx_u_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xf0,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpsx_u_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xf0,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpsx_u_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xf0,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpsx_u_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf0,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpsx_u_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xf0,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpsx_u_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf0,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_lt_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x02,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmpsx_u_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xf0,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_eq_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x04,0x7d]
 
-v_cmpsx_u_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xf0,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_eq_i32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x04,0x7d]
 
-v_cmpsx_nge_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf2,0x7c]
+v_cmp_eq_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x04,0x7d]
 
-v_cmpsx_nge_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xf2,0x7c]
+v_cmp_eq_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x04,0x7d]
 
-v_cmpsx_nge_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xf2,0x7c]
+v_cmp_eq_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x04,0x7d]
 
-v_cmpsx_nge_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xf2,0x7c]
+v_cmp_eq_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x04,0x7d]
 
-v_cmpsx_nge_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xf2,0x7c]
+v_cmp_eq_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x04,0x7d]
 
-v_cmpsx_nge_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xf2,0x7c]
+v_cmp_eq_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x04,0x7d]
 
-v_cmpsx_nge_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xf2,0x7c]
+v_cmp_eq_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x04,0x7d]
 
-v_cmpsx_nge_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xf2,0x7c]
+v_cmp_eq_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x04,0x7d]
 
-v_cmpsx_nge_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xf2,0x7c]
+v_cmp_eq_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x04,0x7d]
 
-v_cmpsx_nge_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xf2,0x7c]
+v_cmp_eq_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x04,0x7d]
 
-v_cmpsx_nge_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xf2,0x7c]
+v_cmp_eq_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x04,0x7d]
 
-v_cmpsx_nge_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xf2,0x7c]
+v_cmp_eq_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x04,0x7d]
 
-v_cmpsx_nge_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xf2,0x7c]
+v_cmp_eq_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x04,0x7d]
 
-v_cmpsx_nge_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xf2,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_eq_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x04,0x7d]
 
-v_cmpsx_nge_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xf2,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_eq_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x04,0x7d]
 
-v_cmpsx_nge_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xf2,0x7c]
+v_cmp_eq_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x04,0x7d]
 
-v_cmpsx_nge_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xf2,0x7c]
+v_cmp_eq_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x04,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_nge_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xf3,0x7c]
+v_cmp_eq_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x04,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_nge_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x04,0x7d]
 
-v_cmpsx_nge_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xf2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x04,0x7d]
 
-v_cmpsx_nge_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xf2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x05,0x7d]
 
-v_cmpsx_nge_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xf2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_nge_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xf2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x04,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_nge_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xf2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x04,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_nge_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xf2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x04,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_nge_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xf2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x04,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_nge_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xf2,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_eq_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x04,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_nge_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xf2,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_eq_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x04,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_nge_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xf2,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_eq_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x04,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_nge_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf2,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_eq_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x04,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmpsx_nge_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xf2,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_eq_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x04,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmpsx_nge_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xf2,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_eq_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x04,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmpsx_nge_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xf2,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_eq_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x04,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmpsx_nge_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xf2,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_eq_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x04,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmpsx_nge_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf2,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_eq_i32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmpsx_nge_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xf2,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_eq_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmpsx_nge_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf2,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_eq_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmpsx_nge_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xf2,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_eq_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmpsx_nge_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xf2,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_eq_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmpsx_nlg_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf4,0x7c]
+v_cmp_eq_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpsx_nlg_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xf4,0x7c]
+v_cmp_eq_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpsx_nlg_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xf4,0x7c]
+v_cmp_eq_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpsx_nlg_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xf4,0x7c]
+v_cmp_eq_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpsx_nlg_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xf4,0x7c]
+v_cmp_eq_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpsx_nlg_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xf4,0x7c]
+v_cmp_eq_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpsx_nlg_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xf4,0x7c]
+v_cmp_eq_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpsx_nlg_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xf4,0x7c]
+v_cmp_eq_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpsx_nlg_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xf4,0x7c]
+v_cmp_eq_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpsx_nlg_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xf4,0x7c]
+v_cmp_eq_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpsx_nlg_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xf4,0x7c]
+v_cmp_eq_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpsx_nlg_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xf4,0x7c]
+v_cmp_eq_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpsx_nlg_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xf4,0x7c]
+v_cmp_eq_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpsx_nlg_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xf4,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_eq_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x04,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmpsx_nlg_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xf4,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_le_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x06,0x7d]
 
-v_cmpsx_nlg_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xf4,0x7c]
+v_cmp_le_i32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x06,0x7d]
 
-v_cmpsx_nlg_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xf4,0x7c]
+v_cmp_le_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x06,0x7d]
 
-v_cmpsx_nlg_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xf5,0x7c]
+v_cmp_le_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x06,0x7d]
 
-v_cmpsx_nlg_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x06,0x7d]
 
-v_cmpsx_nlg_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xf4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x06,0x7d]
 
-v_cmpsx_nlg_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xf4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x06,0x7d]
 
-v_cmpsx_nlg_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xf4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x06,0x7d]
 
-v_cmpsx_nlg_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xf4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x06,0x7d]
 
-v_cmpsx_nlg_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xf4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x06,0x7d]
 
-v_cmpsx_nlg_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xf4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x06,0x7d]
 
-v_cmpsx_nlg_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xf4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x06,0x7d]
 
-v_cmpsx_nlg_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xf4,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x06,0x7d]
 
-v_cmpsx_nlg_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xf4,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x06,0x7d]
 
-v_cmpsx_nlg_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xf4,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x06,0x7d]
 
-v_cmpsx_nlg_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf4,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_le_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x06,0x7d]
 
-v_cmpsx_nlg_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xf4,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_le_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x06,0x7d]
 
-v_cmpsx_nlg_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xf4,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_le_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x06,0x7d]
 
-v_cmpsx_nlg_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xf4,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_le_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x06,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_nlg_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xf4,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_le_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x06,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_nlg_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf4,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_le_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x06,0x7d]
 
-v_cmpsx_nlg_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xf4,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_le_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x06,0x7d]
 
-v_cmpsx_nlg_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf4,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_le_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x07,0x7d]
 
-v_cmpsx_nlg_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xf4,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_le_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_nlg_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xf4,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_le_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x06,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ngt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf6,0x7c]
+v_cmp_le_i32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x06,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ngt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xf6,0x7c]
+v_cmp_le_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x06,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ngt_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xf6,0x7c]
+v_cmp_le_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x06,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ngt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xf6,0x7c]
+v_cmp_le_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x06,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ngt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xf6,0x7c]
+v_cmp_le_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x06,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ngt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xf6,0x7c]
+v_cmp_le_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x06,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_ngt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xf6,0x7c]
+v_cmp_le_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x06,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmpsx_ngt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xf6,0x7c]
+v_cmp_le_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x06,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmpsx_ngt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xf6,0x7c]
+v_cmp_le_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x06,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmpsx_ngt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xf6,0x7c]
+v_cmp_le_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x06,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmpsx_ngt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xf6,0x7c]
+v_cmp_le_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x06,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmpsx_ngt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xf6,0x7c]
+v_cmp_le_i32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmpsx_ngt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xf6,0x7c]
+v_cmp_le_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmpsx_ngt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xf6,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_le_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmpsx_ngt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xf6,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_le_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmpsx_ngt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xf6,0x7c]
+v_cmp_le_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmpsx_ngt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xf6,0x7c]
+v_cmp_le_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpsx_ngt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xf7,0x7c]
+v_cmp_le_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpsx_ngt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpsx_ngt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xf6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpsx_ngt_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xf6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpsx_ngt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xf6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpsx_ngt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xf6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpsx_ngt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xf6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpsx_ngt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xf6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpsx_ngt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xf6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpsx_ngt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xf6,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpsx_ngt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xf6,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpsx_ngt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xf6,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpsx_ngt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf6,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x06,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmpsx_ngt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xf6,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_gt_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x08,0x7d]
 
-v_cmpsx_ngt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xf6,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_gt_i32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x08,0x7d]
 
-v_cmpsx_ngt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xf6,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_gt_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x08,0x7d]
 
-v_cmpsx_ngt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xf6,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_gt_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x08,0x7d]
 
-v_cmpsx_ngt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf6,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_gt_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x08,0x7d]
 
-v_cmpsx_ngt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xf6,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_gt_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x08,0x7d]
 
-v_cmpsx_ngt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf6,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_gt_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x08,0x7d]
 
-v_cmpsx_ngt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xf6,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_gt_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x08,0x7d]
 
-v_cmpsx_ngt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xf6,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_gt_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x08,0x7d]
 
-v_cmpsx_nle_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf8,0x7c]
+v_cmp_gt_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x08,0x7d]
 
-v_cmpsx_nle_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xf8,0x7c]
+v_cmp_gt_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x08,0x7d]
 
-v_cmpsx_nle_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xf8,0x7c]
+v_cmp_gt_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x08,0x7d]
 
-v_cmpsx_nle_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xf8,0x7c]
+v_cmp_gt_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x08,0x7d]
 
-v_cmpsx_nle_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xf8,0x7c]
+v_cmp_gt_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x08,0x7d]
 
-v_cmpsx_nle_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xf8,0x7c]
+v_cmp_gt_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x08,0x7d]
 
-v_cmpsx_nle_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xf8,0x7c]
+v_cmp_gt_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x08,0x7d]
 
-v_cmpsx_nle_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xf8,0x7c]
+v_cmp_gt_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x08,0x7d]
 
-v_cmpsx_nle_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xf8,0x7c]
+v_cmp_gt_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x08,0x7d]
 
-v_cmpsx_nle_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xf8,0x7c]
+v_cmp_gt_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x08,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_nle_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xf8,0x7c]
+v_cmp_gt_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x08,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_nle_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xf8,0x7c]
+v_cmp_gt_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x08,0x7d]
 
-v_cmpsx_nle_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xf8,0x7c]
+v_cmp_gt_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x08,0x7d]
 
-v_cmpsx_nle_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xf8,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_gt_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x09,0x7d]
 
-v_cmpsx_nle_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xf8,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_gt_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_nle_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xf8,0x7c]
+v_cmp_gt_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x08,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_nle_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xf8,0x7c]
+v_cmp_gt_i32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x08,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_nle_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xf9,0x7c]
+v_cmp_gt_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x08,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x08,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xf8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x08,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xf8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x08,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xf8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x08,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xf8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x08,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xf8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x08,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xf8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x08,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xf8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x08,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xf8,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x08,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xf8,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xf8,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf8,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_gt_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xf8,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_gt_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xf8,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_gt_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xf8,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_gt_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xf8,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_gt_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf8,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_gt_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xf8,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_gt_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf8,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_gt_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xf8,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_gt_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpsx_nle_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xf8,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_gt_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpsx_neq_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xfa,0x7c]
+v_cmp_gt_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpsx_neq_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xfa,0x7c]
+v_cmp_gt_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpsx_neq_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xfa,0x7c]
+v_cmp_gt_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpsx_neq_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xfa,0x7c]
+v_cmp_gt_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpsx_neq_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xfa,0x7c]
+v_cmp_gt_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpsx_neq_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xfa,0x7c]
+v_cmp_gt_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpsx_neq_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xfa,0x7c]
+v_cmp_gt_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x08,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmpsx_neq_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xfa,0x7c]
+v_cmp_ne_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x7d]
 
-v_cmpsx_neq_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xfa,0x7c]
+v_cmp_ne_i32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x0a,0x7d]
 
-v_cmpsx_neq_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xfa,0x7c]
+v_cmp_ne_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0a,0x7d]
 
-v_cmpsx_neq_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xfa,0x7c]
+v_cmp_ne_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0a,0x7d]
 
-v_cmpsx_neq_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xfa,0x7c]
+v_cmp_ne_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x7d]
 
-v_cmpsx_neq_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xfa,0x7c]
+v_cmp_ne_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x7d]
 
-v_cmpsx_neq_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xfa,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_ne_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x7d]
 
-v_cmpsx_neq_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xfa,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_ne_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x7d]
 
-v_cmpsx_neq_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xfa,0x7c]
+v_cmp_ne_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x7d]
 
-v_cmpsx_neq_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xfa,0x7c]
+v_cmp_ne_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x7d]
 
-v_cmpsx_neq_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xfb,0x7c]
+v_cmp_ne_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x7d]
 
-v_cmpsx_neq_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x7d]
 
-v_cmpsx_neq_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xfa,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x7d]
 
-v_cmpsx_neq_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xfa,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x7d]
 
-v_cmpsx_neq_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xfa,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x7d]
 
-v_cmpsx_neq_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xfa,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x7d]
 
-v_cmpsx_neq_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xfa,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x7d]
 
-v_cmpsx_neq_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xfa,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x7d]
 
-v_cmpsx_neq_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xfa,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_neq_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xfa,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_neq_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xfa,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x7d]
 
-v_cmpsx_neq_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xfa,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x7d]
 
-v_cmpsx_neq_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ne_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x7d]
 
-v_cmpsx_neq_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_ne_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_neq_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xfa,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ne_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x0a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_neq_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xfa,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_ne_i32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x0a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_neq_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xfa,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_ne_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x0a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_neq_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xfa,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ne_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x0a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_neq_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xfa,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_ne_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x0a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_neq_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_ne_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x0a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_neq_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xfa,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_ne_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x0a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_neq_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xfa,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_ne_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x0a,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmpsx_nlt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xfc,0x7c]
+v_cmp_ne_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x0a,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmpsx_nlt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xfc,0x7c]
+v_cmp_ne_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x0a,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmpsx_nlt_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xfc,0x7c]
+v_cmp_ne_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmpsx_nlt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xfc,0x7c]
+v_cmp_ne_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x0a,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmpsx_nlt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xfc,0x7c]
+v_cmp_ne_i32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmpsx_nlt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xfc,0x7c]
+v_cmp_ne_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmpsx_nlt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xfc,0x7c]
+v_cmp_ne_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmpsx_nlt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xfc,0x7c]
+v_cmp_ne_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmpsx_nlt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xfc,0x7c]
+v_cmp_ne_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmpsx_nlt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xfc,0x7c]
+v_cmp_ne_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpsx_nlt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xfc,0x7c]
+v_cmp_ne_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpsx_nlt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xfc,0x7c]
+v_cmp_ne_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpsx_nlt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xfc,0x7c]
+v_cmp_ne_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpsx_nlt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xfc,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_ne_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpsx_nlt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xfc,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_ne_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpsx_nlt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xfc,0x7c]
+v_cmp_ne_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpsx_nlt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xfc,0x7c]
+v_cmp_ne_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpsx_nlt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xfd,0x7c]
+v_cmp_ne_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpsx_nlt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpsx_nlt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xfc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpsx_nlt_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xfc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpsx_nlt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xfc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpsx_nlt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xfc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x0a,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmpsx_nlt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xfc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x0c,0x7d]
 
-v_cmpsx_nlt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xfc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x0c,0x7d]
 
-v_cmpsx_nlt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xfc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0c,0x7d]
 
-v_cmpsx_nlt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xfc,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ge_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0c,0x7d]
 
-v_cmpsx_nlt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xfc,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_ge_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0c,0x7d]
 
-v_cmpsx_nlt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xfc,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_ge_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0c,0x7d]
 
-v_cmpsx_nlt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfc,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ge_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0c,0x7d]
 
-v_cmpsx_nlt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xfc,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_ge_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0c,0x7d]
 
-v_cmpsx_nlt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xfc,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ge_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0c,0x7d]
 
-v_cmpsx_nlt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xfc,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_ge_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0c,0x7d]
 
-v_cmpsx_nlt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xfc,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_ge_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0c,0x7d]
 
-v_cmpsx_nlt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xfc,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ge_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x0c,0x7d]
 
-v_cmpsx_nlt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xfc,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_ge_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0c,0x7d]
 
-v_cmpsx_nlt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfc,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_ge_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0c,0x7d]
 
-v_cmpsx_nlt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xfc,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_ge_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x0c,0x7d]
 
-v_cmpsx_nlt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xfc,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_ge_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x0c,0x7d]
 
-v_cmpsx_tru_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xfe,0x7c]
+v_cmp_ge_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x0c,0x7d]
 
-v_cmpsx_tru_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xfe,0x7c]
+v_cmp_ge_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x0c,0x7d]
 
-v_cmpsx_tru_f64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xfe,0x7c]
+v_cmp_ge_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0c,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpsx_tru_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xfe,0x7c]
+v_cmp_ge_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0c,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpsx_tru_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xfe,0x7c]
+v_cmp_ge_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x0c,0x7d]
 
-v_cmpsx_tru_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xfe,0x7c]
+v_cmp_ge_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x0c,0x7d]
 
-v_cmpsx_tru_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xfe,0x7c]
+v_cmp_ge_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x0d,0x7d]
 
-v_cmpsx_tru_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xfe,0x7c]
+v_cmp_ge_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_tru_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xfe,0x7c]
+v_cmp_ge_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x0c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_tru_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xfe,0x7c]
+v_cmp_ge_i32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x0c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_tru_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xfe,0x7c]
+v_cmp_ge_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x0c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_tru_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xfe,0x7c]
+v_cmp_ge_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x0c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_tru_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xfe,0x7c]
+v_cmp_ge_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x0c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_tru_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xfe,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_ge_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x0c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_tru_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xfe,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_ge_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x0c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpsx_tru_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xfe,0x7c]
+v_cmp_ge_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x0c,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmpsx_tru_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xfe,0x7c]
+v_cmp_ge_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x0c,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmpsx_tru_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xff,0x7c]
+v_cmp_ge_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x0c,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmpsx_tru_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmpsx_tru_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xfe,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x0c,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmpsx_tru_f64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xfe,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmpsx_tru_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xfe,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmpsx_tru_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xfe,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmpsx_tru_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xfe,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmpsx_tru_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xfe,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmpsx_tru_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xfe,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpsx_tru_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xfe,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpsx_tru_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xfe,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpsx_tru_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xfe,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpsx_tru_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpsx_tru_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpsx_tru_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xfe,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpsx_tru_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xfe,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpsx_tru_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xfe,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpsx_tru_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xfe,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpsx_tru_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xfe,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpsx_tru_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_ge_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpsx_tru_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xfe,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_ge_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpsx_tru_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xfe,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_ge_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x0c,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmp_f_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x00,0x7d]
+v_cmp_t_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x00,0x7d]
+v_cmp_t_i32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x00,0x7d]
+v_cmp_t_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x00,0x7d]
+v_cmp_t_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x7d]
+v_cmp_t_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x7d]
+v_cmp_t_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x7d]
+v_cmp_t_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x7d]
+v_cmp_t_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x7d]
+v_cmp_t_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x7d]
+v_cmp_t_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x7d]
+v_cmp_t_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x7d]
+v_cmp_t_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x7d]
+v_cmp_t_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x7d]
+v_cmp_t_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x00,0x7d]
+v_cmp_t_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x7d]
+v_cmp_t_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x7d]
+v_cmp_t_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x7d]
+v_cmp_t_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_t_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0e,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_f_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_t_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0e,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_f_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x00,0x7d]
+v_cmp_t_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x00,0x7d]
+v_cmp_t_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x0e,0x7d]
 
-v_cmp_f_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x7d]
+v_cmp_t_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x0f,0x7d]
 
-v_cmp_f_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x00,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_f_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x00,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x0e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_f_i32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x00,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_i32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x0e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_f_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x00,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x0e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_f_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x00,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x0e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_f_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x00,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x0e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_f_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x00,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x0e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_f_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x00,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x0e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_f_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x00,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_t_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x0e,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmp_f_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x00,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_t_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x0e,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmp_f_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x00,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_t_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x0e,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmp_f_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x00,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_t_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmp_f_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x00,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_t_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x0e,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmp_f_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x00,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_t_i32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmp_f_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x00,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_t_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmp_f_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x00,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_t_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmp_f_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x00,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_t_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmp_f_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x00,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_t_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmp_f_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x00,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_t_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmp_f_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x00,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_t_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmp_lt_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x02,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmp_lt_i32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x02,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmp_lt_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x02,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmp_lt_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x02,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmp_lt_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x02,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmp_lt_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x02,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmp_lt_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x02,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmp_lt_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x02,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmp_lt_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x02,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmp_lt_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x02,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmp_lt_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x02,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmp_lt_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x02,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x0e,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmp_lt_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x02,0x7d]
+v_cmpx_f_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x20,0x7d]
 
-v_cmp_lt_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x02,0x7d]
+v_cmpx_f_i32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x20,0x7d]
 
-v_cmp_lt_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x02,0x7d]
+v_cmpx_f_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x20,0x7d]
 
-v_cmp_lt_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x02,0x7d]
+v_cmpx_f_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x20,0x7d]
 
-v_cmp_lt_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x02,0x7d]
+v_cmpx_f_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x20,0x7d]
 
-v_cmp_lt_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x02,0x7d]
+v_cmpx_f_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x20,0x7d]
 
-v_cmp_lt_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x02,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_f_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x20,0x7d]
 
-v_cmp_lt_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x02,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_f_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x20,0x7d]
 
-v_cmp_lt_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x02,0x7d]
+v_cmpx_f_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x20,0x7d]
 
-v_cmp_lt_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x02,0x7d]
+v_cmpx_f_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x20,0x7d]
 
-v_cmp_lt_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x03,0x7d]
+v_cmpx_f_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x20,0x7d]
 
-v_cmp_lt_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x02,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x20,0x7d]
 
-v_cmp_lt_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x02,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x20,0x7d]
 
-v_cmp_lt_i32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x02,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x20,0x7d]
 
-v_cmp_lt_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x02,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x20,0x7d]
 
-v_cmp_lt_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x02,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x20,0x7d]
 
-v_cmp_lt_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x02,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x20,0x7d]
 
-v_cmp_lt_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x02,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x20,0x7d]
 
-v_cmp_lt_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x02,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x20,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_lt_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x02,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x20,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_lt_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x02,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x20,0x7d]
 
-v_cmp_lt_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x02,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x20,0x7d]
 
-v_cmp_lt_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x02,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x21,0x7d]
 
-v_cmp_lt_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x02,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_f_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x02,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_f_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x20,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x02,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_f_i32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x20,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x02,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_f_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x20,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x02,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_f_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x20,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x02,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_f_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x20,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x02,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_f_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x20,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x02,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_f_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x20,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x20,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmp_eq_i32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x20,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmp_eq_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x20,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmp_eq_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x20,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmp_eq_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x20,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmp_eq_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmp_eq_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmp_eq_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmp_eq_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmp_eq_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmp_eq_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmp_eq_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmp_eq_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmp_eq_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmp_eq_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmp_eq_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmp_eq_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmp_eq_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmp_eq_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x04,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_f_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmp_eq_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x04,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_f_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmp_eq_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmp_eq_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x04,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmp_eq_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x05,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmp_eq_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x04,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x20,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmp_eq_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x04,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x22,0x7d]
 
-v_cmp_eq_i32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x04,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x22,0x7d]
 
-v_cmp_eq_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x04,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x22,0x7d]
 
-v_cmp_eq_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x04,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x22,0x7d]
 
-v_cmp_eq_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x04,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x22,0x7d]
 
-v_cmp_eq_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x04,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x22,0x7d]
 
-v_cmp_eq_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x04,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x22,0x7d]
 
-v_cmp_eq_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x04,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x22,0x7d]
 
-v_cmp_eq_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x04,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x22,0x7d]
 
-v_cmp_eq_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x04,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x22,0x7d]
 
-v_cmp_eq_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x04,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x22,0x7d]
 
-v_cmp_eq_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x04,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_lt_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x22,0x7d]
 
-v_cmp_eq_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x04,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_lt_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x22,0x7d]
 
-v_cmp_eq_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x04,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_lt_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x22,0x7d]
 
-v_cmp_eq_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x04,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_lt_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x22,0x7d]
 
-v_cmp_eq_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x04,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_lt_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x22,0x7d]
 
-v_cmp_eq_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x04,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_lt_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x22,0x7d]
 
-v_cmp_eq_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x04,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_lt_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x22,0x7d]
 
-v_cmp_eq_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x04,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_lt_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x22,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_le_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x06,0x7d]
+v_cmpx_lt_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x22,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_le_i32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x06,0x7d]
+v_cmpx_lt_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x22,0x7d]
 
-v_cmp_le_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x06,0x7d]
+v_cmpx_lt_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x22,0x7d]
 
-v_cmp_le_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x06,0x7d]
+v_cmpx_lt_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x23,0x7d]
 
-v_cmp_le_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x06,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_le_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x06,0x7d]
+v_cmpx_lt_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x22,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_le_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x06,0x7d]
+v_cmpx_lt_i32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x22,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_le_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x06,0x7d]
+v_cmpx_lt_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x22,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_le_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x06,0x7d]
+v_cmpx_lt_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x22,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_le_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x06,0x7d]
+v_cmpx_lt_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x22,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_le_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x06,0x7d]
+v_cmpx_lt_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x22,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_le_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x06,0x7d]
+v_cmpx_lt_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x22,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_le_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x06,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x22,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmp_le_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x06,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x22,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmp_le_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x06,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x22,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmp_le_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x06,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x22,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmp_le_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x06,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x22,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmp_le_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x06,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmp_le_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x06,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_lt_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmp_le_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x06,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_lt_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmp_le_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x06,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmp_le_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x06,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmp_le_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x07,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmp_le_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x06,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmp_le_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x06,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmp_le_i32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x06,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmp_le_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x06,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmp_le_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x06,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmp_le_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x06,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmp_le_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x06,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmp_le_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x06,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmp_le_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x06,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmp_le_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x06,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmp_le_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x06,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmp_le_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x06,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmp_le_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x06,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x22,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmp_le_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x06,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_eq_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x24,0x7d]
 
-v_cmp_le_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x06,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_eq_i32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x24,0x7d]
 
-v_cmp_le_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x06,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_eq_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x24,0x7d]
 
-v_cmp_le_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x06,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_eq_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x24,0x7d]
 
-v_cmp_le_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x06,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_eq_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x24,0x7d]
 
-v_cmp_le_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x06,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_eq_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x24,0x7d]
 
-v_cmp_le_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x06,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_eq_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x24,0x7d]
 
-v_cmp_gt_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x08,0x7d]
+v_cmpx_eq_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x24,0x7d]
 
-v_cmp_gt_i32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x08,0x7d]
+v_cmpx_eq_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x24,0x7d]
 
-v_cmp_gt_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x08,0x7d]
+v_cmpx_eq_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x24,0x7d]
 
-v_cmp_gt_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x08,0x7d]
+v_cmpx_eq_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x24,0x7d]
 
-v_cmp_gt_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x08,0x7d]
+v_cmpx_eq_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x24,0x7d]
 
-v_cmp_gt_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x08,0x7d]
+v_cmpx_eq_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x24,0x7d]
 
-v_cmp_gt_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x08,0x7d]
+v_cmpx_eq_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x24,0x7d]
 
-v_cmp_gt_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x08,0x7d]
+v_cmpx_eq_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x24,0x7d]
 
-v_cmp_gt_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x08,0x7d]
+v_cmpx_eq_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x24,0x7d]
 
-v_cmp_gt_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x08,0x7d]
+v_cmpx_eq_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x24,0x7d]
 
-v_cmp_gt_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x08,0x7d]
+v_cmpx_eq_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x24,0x7d]
 
-v_cmp_gt_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x08,0x7d]
+v_cmpx_eq_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x24,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_gt_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x08,0x7d]
+v_cmpx_eq_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x24,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_gt_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x08,0x7d]
+v_cmpx_eq_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x24,0x7d]
 
-v_cmp_gt_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x08,0x7d]
+v_cmpx_eq_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x24,0x7d]
 
-v_cmp_gt_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x08,0x7d]
+v_cmpx_eq_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x25,0x7d]
 
-v_cmp_gt_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x08,0x7d]
+v_cmpx_eq_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x08,0x7d]
+v_cmpx_eq_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x24,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x08,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_eq_i32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x24,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x08,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_eq_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x24,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x08,0x7d]
+v_cmpx_eq_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x24,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x08,0x7d]
+v_cmpx_eq_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x24,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x09,0x7d]
+v_cmpx_eq_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x24,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x08,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x24,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x08,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x24,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmp_gt_i32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x08,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x24,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmp_gt_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x08,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x24,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmp_gt_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x08,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x24,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmp_gt_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x08,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x24,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmp_gt_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x08,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmp_gt_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x08,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmp_gt_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x08,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmp_gt_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x08,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmp_gt_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x08,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmp_gt_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x08,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmp_gt_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x08,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmp_gt_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x08,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmp_gt_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x08,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmp_gt_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x08,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmp_gt_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x08,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmp_gt_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x08,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmp_gt_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x08,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmp_gt_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x08,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmp_ne_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x0a,0x7d]
+v_cmpx_eq_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmp_ne_i32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x0a,0x7d]
+v_cmpx_eq_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmp_ne_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x0a,0x7d]
+v_cmpx_eq_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmp_ne_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x0a,0x7d]
+v_cmpx_eq_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmp_ne_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x0a,0x7d]
+v_cmpx_eq_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x24,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmp_ne_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x0a,0x7d]
+v_cmpx_le_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x26,0x7d]
 
-v_cmp_ne_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x0a,0x7d]
+v_cmpx_le_i32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x26,0x7d]
 
-v_cmp_ne_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x0a,0x7d]
+v_cmpx_le_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x26,0x7d]
 
-v_cmp_ne_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x0a,0x7d]
+v_cmpx_le_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x26,0x7d]
 
-v_cmp_ne_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x0a,0x7d]
+v_cmpx_le_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x26,0x7d]
 
-v_cmp_ne_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x0a,0x7d]
+v_cmpx_le_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x26,0x7d]
 
-v_cmp_ne_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x0a,0x7d]
+v_cmpx_le_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x26,0x7d]
 
-v_cmp_ne_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x0a,0x7d]
+v_cmpx_le_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x26,0x7d]
 
-v_cmp_ne_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x0a,0x7d]
+v_cmpx_le_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x26,0x7d]
 
-v_cmp_ne_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x0a,0x7d]
+v_cmpx_le_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x26,0x7d]
 
-v_cmp_ne_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x0a,0x7d]
+v_cmpx_le_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x26,0x7d]
 
-v_cmp_ne_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x0a,0x7d]
+v_cmpx_le_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x26,0x7d]
 
-v_cmp_ne_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x0a,0x7d]
+v_cmpx_le_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x26,0x7d]
 
-v_cmp_ne_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x0a,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_le_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x26,0x7d]
 
-v_cmp_ne_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x0a,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_le_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x26,0x7d]
 
-v_cmp_ne_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x0a,0x7d]
+v_cmpx_le_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x26,0x7d]
 
-v_cmp_ne_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x0a,0x7d]
+v_cmpx_le_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x26,0x7d]
 
-v_cmp_ne_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x0b,0x7d]
+v_cmpx_le_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x26,0x7d]
 
-v_cmp_ne_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x0a,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x26,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_ne_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x0a,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x26,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_ne_i32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x0a,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x26,0x7d]
 
-v_cmp_ne_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x0a,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x26,0x7d]
 
-v_cmp_ne_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x0a,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x27,0x7d]
 
-v_cmp_ne_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x0a,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x0a,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x26,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x0a,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_i32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x26,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x0a,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_le_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x26,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x0a,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_le_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x26,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x0a,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_le_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x26,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x0a,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_le_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x26,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x0a,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_le_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x26,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x0a,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_le_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x26,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x0a,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_le_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x26,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x0a,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_le_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x26,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x0a,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_le_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x26,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x0a,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_le_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x26,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x0a,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_le_i32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x0a,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_le_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmp_ge_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x0c,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmp_ge_i32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x0c,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmp_ge_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x0c,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmp_ge_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x0c,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmp_ge_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x0c,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmp_ge_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x0c,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmp_ge_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x0c,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmp_ge_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x0c,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmp_ge_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x0c,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmp_ge_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x0c,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmp_ge_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x0c,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmp_ge_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x0c,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmp_ge_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x0c,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmp_ge_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x0c,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmp_ge_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x0c,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmp_ge_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x0c,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmp_ge_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x0c,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x26,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmp_ge_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x0c,0x7d]
+v_cmpx_gt_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x28,0x7d]
 
-v_cmp_ge_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x0c,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_gt_i32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x28,0x7d]
 
-v_cmp_ge_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x0c,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_gt_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x28,0x7d]
 
-v_cmp_ge_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x0c,0x7d]
+v_cmpx_gt_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x28,0x7d]
 
-v_cmp_ge_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x0c,0x7d]
+v_cmpx_gt_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x28,0x7d]
 
-v_cmp_ge_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x0d,0x7d]
+v_cmpx_gt_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x28,0x7d]
 
-v_cmp_ge_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x0c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x28,0x7d]
 
-v_cmp_ge_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x0c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x28,0x7d]
 
-v_cmp_ge_i32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x0c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x28,0x7d]
 
-v_cmp_ge_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x0c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x28,0x7d]
 
-v_cmp_ge_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x0c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x28,0x7d]
 
-v_cmp_ge_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x0c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x28,0x7d]
 
-v_cmp_ge_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x0c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x28,0x7d]
 
-v_cmp_ge_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x0c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x28,0x7d]
 
-v_cmp_ge_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x0c,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x28,0x7d]
 
-v_cmp_ge_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x0c,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x28,0x7d]
 
-v_cmp_ge_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x0c,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x28,0x7d]
 
-v_cmp_ge_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x0c,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x28,0x7d]
 
-v_cmp_ge_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x0c,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_gt_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x28,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_ge_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x0c,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_gt_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x28,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_ge_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x0c,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_gt_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x28,0x7d]
 
-v_cmp_ge_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x0c,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_gt_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x28,0x7d]
 
-v_cmp_ge_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x0c,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_gt_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x29,0x7d]
 
-v_cmp_ge_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x0c,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_gt_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x0c,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_gt_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x28,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x0c,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_gt_i32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x28,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_t_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x0e,0x7d]
+v_cmpx_gt_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x28,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_t_i32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x0e,0x7d]
+v_cmpx_gt_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x28,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_t_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x0e,0x7d]
+v_cmpx_gt_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x28,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_t_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x0e,0x7d]
+v_cmpx_gt_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x28,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_t_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x0e,0x7d]
+v_cmpx_gt_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x28,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_t_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x0e,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x28,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmp_t_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x0e,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x28,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmp_t_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x0e,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x28,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmp_t_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x0e,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x28,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmp_t_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x0e,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x28,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmp_t_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x0e,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmp_t_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x0e,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmp_t_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x0e,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmp_t_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x0e,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmp_t_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x0e,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmp_t_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x0e,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmp_t_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x0e,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmp_t_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x0e,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmp_t_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x0e,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_gt_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmp_t_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x0e,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_gt_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmp_t_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x0e,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmp_t_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x0e,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmp_t_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x0f,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmp_t_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x0e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmp_t_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x0e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmp_t_i32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x0e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmp_t_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x0e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmp_t_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x0e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmp_t_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x0e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x28,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmp_t_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x0e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x2a,0x7d]
 
-v_cmp_t_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x0e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x2a,0x7d]
 
-v_cmp_t_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x0e,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_ne_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x2a,0x7d]
 
-v_cmp_t_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x0e,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_ne_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x2a,0x7d]
 
-v_cmp_t_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x0e,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_ne_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x2a,0x7d]
 
-v_cmp_t_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x0e,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_ne_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x2a,0x7d]
 
-v_cmp_t_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x0e,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_ne_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x2a,0x7d]
 
-v_cmp_t_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x0e,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_ne_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x2a,0x7d]
 
-v_cmp_t_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x0e,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_ne_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x2a,0x7d]
 
-v_cmp_t_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x0e,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_ne_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x2a,0x7d]
 
-v_cmp_t_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x0e,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_ne_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x2a,0x7d]
 
-v_cmp_t_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x0e,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_ne_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x2a,0x7d]
 
-v_cmp_t_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x0e,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_ne_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x2a,0x7d]
 
-v_cmp_t_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x0e,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_ne_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x2a,0x7d]
 
-v_cmpx_f_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x20,0x7d]
+v_cmpx_ne_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x2a,0x7d]
 
-v_cmpx_f_i32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x20,0x7d]
+v_cmpx_ne_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x2a,0x7d]
 
-v_cmpx_f_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x20,0x7d]
+v_cmpx_ne_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x2a,0x7d]
 
-v_cmpx_f_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x20,0x7d]
+v_cmpx_ne_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x2a,0x7d]
 
-v_cmpx_f_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x20,0x7d]
+v_cmpx_ne_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x2a,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_f_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x20,0x7d]
+v_cmpx_ne_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x2a,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_f_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x20,0x7d]
+v_cmpx_ne_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x2a,0x7d]
 
-v_cmpx_f_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x20,0x7d]
+v_cmpx_ne_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x2a,0x7d]
 
-v_cmpx_f_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x20,0x7d]
+v_cmpx_ne_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x2b,0x7d]
 
-v_cmpx_f_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x20,0x7d]
+v_cmpx_ne_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x20,0x7d]
+v_cmpx_ne_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x2a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x20,0x7d]
+v_cmpx_ne_i32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x2a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x20,0x7d]
+v_cmpx_ne_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x2a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x20,0x7d]
+v_cmpx_ne_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x2a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x20,0x7d]
+v_cmpx_ne_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x2a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x20,0x7d]
+v_cmpx_ne_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x2a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x20,0x7d]
+v_cmpx_ne_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x2a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x20,0x7d]
+v_cmpx_ne_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x2a,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmpx_f_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x20,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_ne_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x2a,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmpx_f_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x20,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_ne_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x2a,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmpx_f_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x20,0x7d]
+v_cmpx_ne_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmpx_f_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x20,0x7d]
+v_cmpx_ne_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x2a,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmpx_f_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x21,0x7d]
+v_cmpx_ne_i32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x20,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x20,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x20,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmpx_f_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x20,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmpx_f_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x20,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpx_f_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x20,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpx_f_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x20,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpx_f_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x20,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x20,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x20,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x20,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x20,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x20,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x20,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x20,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x20,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x20,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x20,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x2a,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x20,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_ge_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x2c,0x7d]
 
-v_cmpx_f_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x20,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_ge_i32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x2c,0x7d]
 
-v_cmpx_lt_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x22,0x7d]
+v_cmpx_ge_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x2c,0x7d]
 
-v_cmpx_lt_i32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x22,0x7d]
+v_cmpx_ge_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x2c,0x7d]
 
-v_cmpx_lt_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x22,0x7d]
+v_cmpx_ge_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x2c,0x7d]
 
-v_cmpx_lt_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x22,0x7d]
+v_cmpx_ge_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x2c,0x7d]
 
-v_cmpx_lt_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x22,0x7d]
+v_cmpx_ge_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x2c,0x7d]
 
-v_cmpx_lt_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x22,0x7d]
+v_cmpx_ge_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x2c,0x7d]
 
-v_cmpx_lt_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x22,0x7d]
+v_cmpx_ge_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x2c,0x7d]
 
-v_cmpx_lt_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x22,0x7d]
+v_cmpx_ge_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x2c,0x7d]
 
-v_cmpx_lt_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x22,0x7d]
+v_cmpx_ge_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x2c,0x7d]
 
-v_cmpx_lt_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x22,0x7d]
+v_cmpx_ge_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x2c,0x7d]
 
-v_cmpx_lt_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x22,0x7d]
+v_cmpx_ge_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x2c,0x7d]
 
-v_cmpx_lt_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x22,0x7d]
+v_cmpx_ge_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x2c,0x7d]
 
-v_cmpx_lt_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x22,0x7d]
+v_cmpx_ge_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x2c,0x7d]
 
-v_cmpx_lt_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x22,0x7d]
+v_cmpx_ge_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x2c,0x7d]
 
-v_cmpx_lt_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x22,0x7d]
+v_cmpx_ge_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x2c,0x7d]
 
-v_cmpx_lt_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x22,0x7d]
+v_cmpx_ge_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x2c,0x7d]
 
-v_cmpx_lt_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x22,0x7d]
+v_cmpx_ge_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x2c,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_lt_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x22,0x7d]
+v_cmpx_ge_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x2c,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_lt_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x22,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_ge_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x2c,0x7d]
 
-v_cmpx_lt_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x22,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_ge_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x2c,0x7d]
 
-v_cmpx_lt_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x22,0x7d]
+v_cmpx_ge_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x2d,0x7d]
 
-v_cmpx_lt_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x22,0x7d]
+v_cmpx_ge_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x23,0x7d]
+v_cmpx_ge_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x2c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x22,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x2c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x22,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x2c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_i32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x22,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x2c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x22,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x2c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x22,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x2c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x22,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x2c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x22,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x2c,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmpx_lt_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x22,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x2c,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmpx_lt_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x22,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x2c,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmpx_lt_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x22,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmpx_lt_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x22,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x2c,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmpx_lt_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x22,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmpx_lt_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x22,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmpx_lt_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x22,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmpx_lt_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x22,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmpx_lt_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x22,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmpx_lt_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x22,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpx_lt_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x22,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpx_lt_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x22,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpx_lt_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x22,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x24,0x7d]
+v_cmpx_ge_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x24,0x7d]
+v_cmpx_ge_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x24,0x7d]
+v_cmpx_ge_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x24,0x7d]
+v_cmpx_ge_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x24,0x7d]
+v_cmpx_ge_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpx_eq_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x24,0x7d]
+v_cmpx_ge_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpx_eq_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x24,0x7d]
+v_cmpx_ge_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpx_eq_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x24,0x7d]
+v_cmpx_ge_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpx_eq_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x24,0x7d]
+v_cmpx_ge_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpx_eq_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x24,0x7d]
+v_cmpx_ge_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x2c,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmpx_eq_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x24,0x7d]
+v_cmpx_t_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x2e,0x7d]
 
-v_cmpx_eq_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x24,0x7d]
+v_cmpx_t_i32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x2e,0x7d]
 
-v_cmpx_eq_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x24,0x7d]
+v_cmpx_t_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x2e,0x7d]
 
-v_cmpx_eq_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x24,0x7d]
+v_cmpx_t_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x2e,0x7d]
 
-v_cmpx_eq_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x24,0x7d]
+v_cmpx_t_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x2e,0x7d]
 
-v_cmpx_eq_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x24,0x7d]
+v_cmpx_t_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x2e,0x7d]
 
-v_cmpx_eq_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x24,0x7d]
+v_cmpx_t_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x2e,0x7d]
 
-v_cmpx_eq_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x24,0x7d]
+v_cmpx_t_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x2e,0x7d]
 
-v_cmpx_eq_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x24,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_t_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x2e,0x7d]
 
-v_cmpx_eq_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x24,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_t_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x2e,0x7d]
 
-v_cmpx_eq_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x24,0x7d]
+v_cmpx_t_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x2e,0x7d]
 
-v_cmpx_eq_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x24,0x7d]
+v_cmpx_t_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x2e,0x7d]
 
-v_cmpx_eq_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x25,0x7d]
+v_cmpx_t_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x2e,0x7d]
 
-v_cmpx_eq_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x24,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x2e,0x7d]
 
-v_cmpx_eq_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x24,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x2e,0x7d]
 
-v_cmpx_eq_i32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x24,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x2e,0x7d]
 
-v_cmpx_eq_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x24,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x2e,0x7d]
 
-v_cmpx_eq_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x24,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x2e,0x7d]
 
-v_cmpx_eq_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x24,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x2e,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_eq_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x24,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x2e,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_eq_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x24,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x2e,0x7d]
 
-v_cmpx_eq_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x24,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_t_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x2e,0x7d]
 
-v_cmpx_eq_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x24,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_t_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x2f,0x7d]
 
-v_cmpx_eq_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x24,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_t_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x24,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_t_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x2e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x24,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_t_i32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x2e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x24,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_t_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x2e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x24,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_t_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x2e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x24,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_t_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x2e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x24,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_t_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x2e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x24,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_t_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x2e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x24,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_t_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x2e,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmpx_eq_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x24,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_t_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x2e,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmpx_le_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x2e,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmpx_le_i32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmpx_le_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x2e,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmpx_le_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmpx_le_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmpx_le_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmpx_le_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmpx_le_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmpx_le_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpx_le_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpx_le_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpx_le_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpx_le_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpx_le_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpx_le_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpx_le_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpx_le_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpx_le_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpx_le_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x26,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_t_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpx_le_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x26,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_t_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpx_le_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpx_le_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x26,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x2e,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmpx_le_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x27,0x7d]
+v_cmp_f_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x40,0x7d]
 
-v_cmpx_le_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x26,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x40,0x7d]
 
-v_cmpx_le_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x26,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_i64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x40,0x7d]
 
-v_cmpx_le_i32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x26,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x40,0x7d]
 
-v_cmpx_le_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x26,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x40,0x7d]
 
-v_cmpx_le_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x26,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x40,0x7d]
 
-v_cmpx_le_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x26,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x40,0x7d]
 
-v_cmpx_le_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x26,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x40,0x7d]
 
-v_cmpx_le_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x26,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x40,0x7d]
 
-v_cmpx_le_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x26,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_f_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x40,0x7d]
 
-v_cmpx_le_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x26,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_f_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x40,0x7d]
 
-v_cmpx_le_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x26,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_f_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x40,0x7d]
 
-v_cmpx_le_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x26,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_f_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x40,0x7d]
 
-v_cmpx_le_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x26,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_f_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x40,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_le_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x26,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_f_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x40,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_le_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x26,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_f_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x40,0x7d]
 
-v_cmpx_le_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x26,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_f_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x40,0x7d]
 
-v_cmpx_le_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x26,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_f_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x41,0x7d]
 
-v_cmpx_le_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x26,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_f_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x40,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x26,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_f_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x40,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x26,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_f_i64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x40,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x28,0x7d]
+v_cmp_f_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x40,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x28,0x7d]
+v_cmp_f_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x40,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x28,0x7d]
+v_cmp_f_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x40,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x28,0x7d]
+v_cmp_f_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x40,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x28,0x7d]
+v_cmp_f_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x40,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x28,0x7d]
+v_cmp_f_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x40,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmpx_gt_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x28,0x7d]
+v_cmp_f_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0x40,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmpx_gt_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x28,0x7d]
+v_cmp_f_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x40,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmpx_gt_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x28,0x7d]
+v_cmp_f_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0x40,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmpx_gt_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x28,0x7d]
+v_cmp_f_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x40,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmpx_gt_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x28,0x7d]
+v_cmp_f_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x40,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmpx_gt_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x28,0x7d]
+v_cmp_f_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x40,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmpx_gt_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x28,0x7d]
+v_cmp_f_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0x40,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmpx_gt_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x28,0x7d]
+v_cmp_f_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x40,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmpx_gt_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x28,0x7d]
+v_cmp_f_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0x40,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmpx_gt_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x28,0x7d]
+v_cmp_f_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x40,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmpx_gt_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x28,0x7d]
+v_cmp_f_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x40,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmpx_gt_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x28,0x7d]
+v_cmp_lt_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x42,0x7d]
 
-v_cmpx_gt_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x28,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_lt_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x42,0x7d]
 
-v_cmpx_gt_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x28,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_lt_i64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x42,0x7d]
 
-v_cmpx_gt_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x28,0x7d]
+v_cmp_lt_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x42,0x7d]
 
-v_cmpx_gt_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x28,0x7d]
+v_cmp_lt_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x42,0x7d]
 
-v_cmpx_gt_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x29,0x7d]
+v_cmp_lt_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x42,0x7d]
 
-v_cmpx_gt_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x28,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x42,0x7d]
 
-v_cmpx_gt_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x28,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x42,0x7d]
 
-v_cmpx_gt_i32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x28,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x42,0x7d]
 
-v_cmpx_gt_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x28,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x42,0x7d]
 
-v_cmpx_gt_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x28,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x42,0x7d]
 
-v_cmpx_gt_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x28,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x42,0x7d]
 
-v_cmpx_gt_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x28,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x42,0x7d]
 
-v_cmpx_gt_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x28,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x42,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_gt_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x28,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x42,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_gt_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x28,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x42,0x7d]
 
-v_cmpx_gt_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x28,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x42,0x7d]
 
-v_cmpx_gt_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x28,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x43,0x7d]
 
-v_cmpx_gt_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x28,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_lt_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x42,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x28,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_lt_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x42,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x28,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_lt_i64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x42,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x28,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_lt_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x42,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x28,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_lt_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x42,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x28,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_lt_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x42,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x28,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_lt_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x42,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x28,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_lt_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x42,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x2a,0x7d]
+v_cmp_lt_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x42,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmpx_ne_i32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x2a,0x7d]
+v_cmp_lt_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0x42,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmpx_ne_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x2a,0x7d]
+v_cmp_lt_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x42,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmpx_ne_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x2a,0x7d]
+v_cmp_lt_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0x42,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmpx_ne_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x2a,0x7d]
+v_cmp_lt_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x42,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmpx_ne_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x2a,0x7d]
+v_cmp_lt_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x42,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmpx_ne_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x2a,0x7d]
+v_cmp_lt_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x42,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmpx_ne_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x2a,0x7d]
+v_cmp_lt_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0x42,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmpx_ne_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x2a,0x7d]
+v_cmp_lt_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x42,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmpx_ne_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x2a,0x7d]
+v_cmp_lt_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0x42,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmpx_ne_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x2a,0x7d]
+v_cmp_lt_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x42,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmpx_ne_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x2a,0x7d]
+v_cmp_lt_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x42,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmpx_ne_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x2a,0x7d]
+v_cmp_eq_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x44,0x7d]
 
-v_cmpx_ne_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x2a,0x7d]
+v_cmp_eq_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x44,0x7d]
 
-v_cmpx_ne_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x2a,0x7d]
+v_cmp_eq_i64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x44,0x7d]
 
-v_cmpx_ne_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x2a,0x7d]
+v_cmp_eq_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x44,0x7d]
 
-v_cmpx_ne_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x2a,0x7d]
+v_cmp_eq_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x44,0x7d]
 
-v_cmpx_ne_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x2a,0x7d]
+v_cmp_eq_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x44,0x7d]
 
-v_cmpx_ne_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x2a,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_eq_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x44,0x7d]
 
-v_cmpx_ne_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x2a,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_eq_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x44,0x7d]
 
-v_cmpx_ne_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x2a,0x7d]
+v_cmp_eq_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x44,0x7d]
 
-v_cmpx_ne_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x2a,0x7d]
+v_cmp_eq_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x44,0x7d]
 
-v_cmpx_ne_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x2b,0x7d]
+v_cmp_eq_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x44,0x7d]
 
-v_cmpx_ne_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x2a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x44,0x7d]
 
-v_cmpx_ne_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x2a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x44,0x7d]
 
-v_cmpx_ne_i32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x2a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x44,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ne_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x2a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x44,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ne_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x2a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x44,0x7d]
 
-v_cmpx_ne_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x2a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x44,0x7d]
 
-v_cmpx_ne_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x2a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x45,0x7d]
 
-v_cmpx_ne_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x2a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x44,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x2a,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_eq_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x44,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x2a,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_eq_i64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x44,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x2a,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_eq_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x44,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x2a,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_eq_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x44,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x2a,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_eq_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x44,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x2a,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_eq_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x44,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x2a,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_eq_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x44,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x2a,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_eq_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x44,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmpx_ne_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x2a,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_eq_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0x44,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmpx_ne_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x2a,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_eq_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x44,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmpx_ne_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x2a,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_eq_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0x44,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmpx_ne_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x2a,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_eq_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x44,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x2c,0x7d]
+v_cmp_eq_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x44,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x2c,0x7d]
+v_cmp_eq_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x44,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmpx_ge_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x2c,0x7d]
+v_cmp_eq_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0x44,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmpx_ge_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x2c,0x7d]
+v_cmp_eq_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x44,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmpx_ge_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x2c,0x7d]
+v_cmp_eq_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0x44,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmpx_ge_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x2c,0x7d]
+v_cmp_eq_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x44,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmpx_ge_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x2c,0x7d]
+v_cmp_eq_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x44,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmpx_ge_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x2c,0x7d]
+v_cmp_le_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x46,0x7d]
 
-v_cmpx_ge_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x2c,0x7d]
+v_cmp_le_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x46,0x7d]
 
-v_cmpx_ge_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x2c,0x7d]
+v_cmp_le_i64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x46,0x7d]
 
-v_cmpx_ge_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x2c,0x7d]
+v_cmp_le_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x46,0x7d]
 
-v_cmpx_ge_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x2c,0x7d]
+v_cmp_le_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x46,0x7d]
 
-v_cmpx_ge_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x2c,0x7d]
+v_cmp_le_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x46,0x7d]
 
-v_cmpx_ge_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x2c,0x7d]
+v_cmp_le_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x46,0x7d]
 
-v_cmpx_ge_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x2c,0x7d]
+v_cmp_le_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x46,0x7d]
 
-v_cmpx_ge_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x2c,0x7d]
+v_cmp_le_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x46,0x7d]
 
-v_cmpx_ge_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x2c,0x7d]
+v_cmp_le_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x46,0x7d]
 
-v_cmpx_ge_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x2c,0x7d]
+v_cmp_le_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x46,0x7d]
 
-v_cmpx_ge_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x2c,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_le_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x46,0x7d]
 
-v_cmpx_ge_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x2c,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_le_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x46,0x7d]
 
-v_cmpx_ge_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x2c,0x7d]
+v_cmp_le_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x46,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ge_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x2c,0x7d]
+v_cmp_le_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x46,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ge_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x2d,0x7d]
+v_cmp_le_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x46,0x7d]
 
-v_cmpx_ge_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x2c,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x46,0x7d]
 
-v_cmpx_ge_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x2c,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x47,0x7d]
 
-v_cmpx_ge_i32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x2c,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x46,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x2c,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x46,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x2c,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_i64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x46,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x2c,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x46,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x2c,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x46,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x2c,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x46,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x2c,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_le_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x46,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x2c,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_le_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x46,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x2c,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_le_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x46,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmpx_ge_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x2c,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_le_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0x46,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmpx_ge_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x2c,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_le_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x46,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmpx_ge_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x2c,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_le_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0x46,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmpx_ge_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x2c,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_le_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x46,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmpx_ge_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x2c,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_le_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x46,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmpx_ge_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x2c,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_le_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x46,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmpx_ge_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x2c,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_le_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0x46,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmpx_ge_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x2c,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_le_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x46,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmpx_ge_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x2c,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_le_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0x46,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmpx_t_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x2e,0x7d]
+v_cmp_le_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x46,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmpx_t_i32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x2e,0x7d]
+v_cmp_le_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x46,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmpx_t_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x2e,0x7d]
+v_cmp_gt_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x48,0x7d]
 
-v_cmpx_t_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x2e,0x7d]
+v_cmp_gt_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x48,0x7d]
 
-v_cmpx_t_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x2e,0x7d]
+v_cmp_gt_i64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x48,0x7d]
 
-v_cmpx_t_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x2e,0x7d]
+v_cmp_gt_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x48,0x7d]
 
-v_cmpx_t_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x2e,0x7d]
+v_cmp_gt_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x48,0x7d]
 
-v_cmpx_t_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x2e,0x7d]
+v_cmp_gt_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x48,0x7d]
 
-v_cmpx_t_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x2e,0x7d]
+v_cmp_gt_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x48,0x7d]
 
-v_cmpx_t_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x2e,0x7d]
+v_cmp_gt_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x48,0x7d]
 
-v_cmpx_t_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x2e,0x7d]
+v_cmp_gt_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x48,0x7d]
 
-v_cmpx_t_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x2e,0x7d]
+v_cmp_gt_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x48,0x7d]
 
-v_cmpx_t_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x2e,0x7d]
+v_cmp_gt_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x48,0x7d]
 
-v_cmpx_t_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x2e,0x7d]
+v_cmp_gt_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x48,0x7d]
 
-v_cmpx_t_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x2e,0x7d]
+v_cmp_gt_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x48,0x7d]
 
-v_cmpx_t_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x2e,0x7d]
+v_cmp_gt_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x48,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_t_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x2e,0x7d]
+v_cmp_gt_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x48,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_t_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x2e,0x7d]
+v_cmp_gt_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x48,0x7d]
 
-v_cmpx_t_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x2e,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_gt_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x48,0x7d]
 
-v_cmpx_t_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x2e,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_gt_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x49,0x7d]
 
-v_cmpx_t_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x2e,0x7d]
+v_cmp_gt_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x48,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x2e,0x7d]
+v_cmp_gt_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x48,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x2f,0x7d]
+v_cmp_gt_i64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x48,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x2e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x48,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x2e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x48,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_i32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x2e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x48,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x2e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x48,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x2e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x48,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x2e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x48,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmpx_t_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x2e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0x48,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmpx_t_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x2e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x48,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmpx_t_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x2e,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0x48,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmpx_t_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x2e,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x48,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmpx_t_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x2e,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x48,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmpx_t_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x2e,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x48,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmpx_t_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x2e,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_gt_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0x48,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmpx_t_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x2e,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_gt_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x48,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmpx_t_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x2e,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_gt_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0x48,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmpx_t_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x2e,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_gt_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x48,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmpx_t_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x2e,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_gt_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x48,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmpx_t_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x2e,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_ne_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x4a,0x7d]
 
-v_cmpx_t_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x2e,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_ne_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x4a,0x7d]
 
-v_cmpx_t_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x2e,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_ne_i64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x4a,0x7d]
 
-v_cmp_f_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x40,0x7d]
+v_cmp_ne_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x4a,0x7d]
 
-v_cmp_f_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x40,0x7d]
+v_cmp_ne_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x4a,0x7d]
 
-v_cmp_f_i64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x40,0x7d]
+v_cmp_ne_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x4a,0x7d]
 
-v_cmp_f_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x40,0x7d]
+v_cmp_ne_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x4a,0x7d]
 
-v_cmp_f_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x40,0x7d]
+v_cmp_ne_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x4a,0x7d]
 
-v_cmp_f_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x40,0x7d]
+v_cmp_ne_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x4a,0x7d]
 
-v_cmp_f_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x40,0x7d]
+v_cmp_ne_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x4a,0x7d]
 
-v_cmp_f_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x40,0x7d]
+v_cmp_ne_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x4a,0x7d]
 
-v_cmp_f_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x40,0x7d]
+v_cmp_ne_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x4a,0x7d]
 
-v_cmp_f_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x40,0x7d]
+v_cmp_ne_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x4a,0x7d]
 
-v_cmp_f_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x40,0x7d]
+v_cmp_ne_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x4a,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_f_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x40,0x7d]
+v_cmp_ne_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x4a,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_f_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x40,0x7d]
+v_cmp_ne_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x4a,0x7d]
 
-v_cmp_f_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x40,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_ne_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x4a,0x7d]
 
-v_cmp_f_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x40,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_ne_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x4b,0x7d]
 
-v_cmp_f_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x40,0x7d]
+v_cmp_ne_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x4a,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_f_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x40,0x7d]
+v_cmp_ne_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x4a,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_f_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x41,0x7d]
+v_cmp_ne_i64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x4a,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x40,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x4a,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x40,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x4a,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x40,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x4a,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x40,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x4a,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x40,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x4a,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x40,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x4a,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x40,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0x4a,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x40,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x4a,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x40,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0x4a,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0x40,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x4a,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmp_f_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x40,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x4a,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmp_f_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x40,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x4a,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmp_f_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x40,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_ne_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0x4a,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmp_f_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x40,0xd1,0xfe,0x01,0x00,0x00]
+v_cmp_ne_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x4a,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmp_f_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x40,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_ne_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0x4a,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmp_f_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0x40,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_ne_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x4a,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmp_f_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x40,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_ne_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x4a,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmp_f_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0x40,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_ge_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x4c,0x7d]
 
-v_cmp_f_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x40,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_ge_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x4c,0x7d]
 
-v_cmp_f_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x40,0xd1,0x00,0xfc,0x03,0x00]
+v_cmp_ge_i64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x4c,0x7d]
 
-v_cmp_lt_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x42,0x7d]
+v_cmp_ge_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x4c,0x7d]
 
-v_cmp_lt_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x42,0x7d]
+v_cmp_ge_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x4c,0x7d]
 
-v_cmp_lt_i64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x42,0x7d]
+v_cmp_ge_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x4c,0x7d]
 
-v_cmp_lt_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x42,0x7d]
+v_cmp_ge_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x4c,0x7d]
 
-v_cmp_lt_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x42,0x7d]
+v_cmp_ge_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x4c,0x7d]
 
-v_cmp_lt_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x42,0x7d]
+v_cmp_ge_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x4c,0x7d]
 
-v_cmp_lt_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x42,0x7d]
+v_cmp_ge_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x4c,0x7d]
 
-v_cmp_lt_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x42,0x7d]
+v_cmp_ge_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x4c,0x7d]
 
-v_cmp_lt_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x42,0x7d]
+v_cmp_ge_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x4c,0x7d]
 
-v_cmp_lt_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x42,0x7d]
+v_cmp_ge_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x4c,0x7d]
 
-v_cmp_lt_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x42,0x7d]
+v_cmp_ge_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x4c,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_lt_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x42,0x7d]
+v_cmp_ge_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x4c,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_lt_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x42,0x7d]
+v_cmp_ge_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x4c,0x7d]
 
-v_cmp_lt_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x42,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_ge_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x4c,0x7d]
 
-v_cmp_lt_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x42,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_ge_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x4d,0x7d]
 
-v_cmp_lt_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x42,0x7d]
+v_cmp_ge_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x4c,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x42,0x7d]
+v_cmp_ge_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x4c,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x43,0x7d]
+v_cmp_ge_i64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x4c,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x42,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x4c,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x42,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x4c,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x42,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x4c,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x42,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x4c,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x42,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x4c,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x42,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x4c,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x42,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0x4c,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x42,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x4c,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x42,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0x4c,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0x42,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x4c,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x42,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x4c,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x42,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x4c,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x42,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_ge_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0x4c,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x42,0xd1,0xfe,0x01,0x00,0x00]
+v_cmp_ge_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x4c,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x42,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_ge_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0x4c,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0x42,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_ge_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x4c,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x42,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_ge_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x4c,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0x42,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_t_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x4e,0x7d]
 
-v_cmp_lt_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x42,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_t_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x4e,0x7d]
 
-v_cmp_lt_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x42,0xd1,0x00,0xfc,0x03,0x00]
+v_cmp_t_i64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x4e,0x7d]
 
-v_cmp_eq_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x44,0x7d]
+v_cmp_t_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x4e,0x7d]
 
-v_cmp_eq_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x44,0x7d]
+v_cmp_t_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x4e,0x7d]
 
-v_cmp_eq_i64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x44,0x7d]
+v_cmp_t_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x4e,0x7d]
 
-v_cmp_eq_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x44,0x7d]
+v_cmp_t_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x4e,0x7d]
 
-v_cmp_eq_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x44,0x7d]
+v_cmp_t_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x4e,0x7d]
 
-v_cmp_eq_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x44,0x7d]
+v_cmp_t_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x4e,0x7d]
 
-v_cmp_eq_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x44,0x7d]
+v_cmp_t_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x4e,0x7d]
 
-v_cmp_eq_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x44,0x7d]
+v_cmp_t_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x4e,0x7d]
 
-v_cmp_eq_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x44,0x7d]
+v_cmp_t_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x4e,0x7d]
 
-v_cmp_eq_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x44,0x7d]
+v_cmp_t_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x4e,0x7d]
 
-v_cmp_eq_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x44,0x7d]
+v_cmp_t_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x4e,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_eq_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x44,0x7d]
+v_cmp_t_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x4e,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_eq_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x44,0x7d]
+v_cmp_t_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x4e,0x7d]
 
-v_cmp_eq_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x44,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_t_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x4e,0x7d]
 
-v_cmp_eq_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x44,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_t_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x4f,0x7d]
 
-v_cmp_eq_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x44,0x7d]
+v_cmp_t_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x4e,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x44,0x7d]
+v_cmp_t_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x4e,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x45,0x7d]
+v_cmp_t_i64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x4e,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x44,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x4e,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x44,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x4e,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x44,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x4e,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x44,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x4e,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x44,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x4e,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x44,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x4e,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x44,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0x4e,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x44,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x4e,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x44,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_t_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0x4e,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0x44,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_t_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x4e,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x44,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_t_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x4e,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x44,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_t_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x4e,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x44,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_t_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0x4e,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x44,0xd1,0xfe,0x01,0x00,0x00]
+v_cmp_t_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x4e,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x44,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_t_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0x4e,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0x44,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_t_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x4e,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x44,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_t_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x4e,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0x44,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_f_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x60,0x7d]
 
-v_cmp_eq_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x44,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_f_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x60,0x7d]
 
-v_cmp_eq_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x44,0xd1,0x00,0xfc,0x03,0x00]
+v_cmpx_f_i64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x60,0x7d]
 
-v_cmp_le_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x46,0x7d]
+v_cmpx_f_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x60,0x7d]
 
-v_cmp_le_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x46,0x7d]
+v_cmpx_f_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x60,0x7d]
 
-v_cmp_le_i64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x46,0x7d]
+v_cmpx_f_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x60,0x7d]
 
-v_cmp_le_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x46,0x7d]
+v_cmpx_f_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x60,0x7d]
 
-v_cmp_le_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x46,0x7d]
+v_cmpx_f_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x60,0x7d]
 
-v_cmp_le_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x46,0x7d]
+v_cmpx_f_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x60,0x7d]
 
-v_cmp_le_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x46,0x7d]
+v_cmpx_f_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x60,0x7d]
 
-v_cmp_le_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x46,0x7d]
+v_cmpx_f_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x60,0x7d]
 
-v_cmp_le_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x46,0x7d]
+v_cmpx_f_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x60,0x7d]
 
-v_cmp_le_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x46,0x7d]
+v_cmpx_f_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x60,0x7d]
 
-v_cmp_le_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x46,0x7d]
+v_cmpx_f_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x60,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_le_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x46,0x7d]
+v_cmpx_f_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x60,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_le_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x46,0x7d]
+v_cmpx_f_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x60,0x7d]
 
-v_cmp_le_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x46,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_f_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x60,0x7d]
 
-v_cmp_le_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x46,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_f_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x61,0x7d]
 
-v_cmp_le_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x46,0x7d]
+v_cmpx_f_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_le_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x46,0x7d]
+v_cmpx_f_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x60,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_le_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x47,0x7d]
+v_cmpx_f_i64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x60,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x46,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x60,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x46,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x60,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x46,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x60,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x46,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x60,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x46,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x60,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x46,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x46,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x46,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x46,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0x46,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmp_le_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x46,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmp_le_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x46,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x60,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmp_le_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x46,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_f_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0x60,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmp_le_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x46,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_f_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x60,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmp_le_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x46,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_f_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0x60,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmp_le_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0x46,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_f_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x60,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmp_le_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x46,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_f_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x60,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmp_le_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0x46,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_lt_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x62,0x7d]
 
-v_cmp_le_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x46,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_lt_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x62,0x7d]
 
-v_cmp_le_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x46,0xd1,0x00,0xfc,0x03,0x00]
+v_cmpx_lt_i64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x62,0x7d]
 
-v_cmp_gt_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x48,0x7d]
+v_cmpx_lt_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x62,0x7d]
 
-v_cmp_gt_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x48,0x7d]
+v_cmpx_lt_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x62,0x7d]
 
-v_cmp_gt_i64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x48,0x7d]
+v_cmpx_lt_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x62,0x7d]
 
-v_cmp_gt_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x48,0x7d]
+v_cmpx_lt_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x62,0x7d]
 
-v_cmp_gt_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x48,0x7d]
+v_cmpx_lt_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x62,0x7d]
 
-v_cmp_gt_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x48,0x7d]
+v_cmpx_lt_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x62,0x7d]
 
-v_cmp_gt_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x48,0x7d]
+v_cmpx_lt_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x62,0x7d]
 
-v_cmp_gt_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x48,0x7d]
+v_cmpx_lt_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x62,0x7d]
 
-v_cmp_gt_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x48,0x7d]
+v_cmpx_lt_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x62,0x7d]
 
-v_cmp_gt_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x48,0x7d]
+v_cmpx_lt_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x62,0x7d]
 
-v_cmp_gt_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x48,0x7d]
+v_cmpx_lt_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x62,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_gt_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x48,0x7d]
+v_cmpx_lt_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x62,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_gt_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x48,0x7d]
+v_cmpx_lt_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x62,0x7d]
 
-v_cmp_gt_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x48,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_lt_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x62,0x7d]
 
-v_cmp_gt_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x48,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_lt_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x63,0x7d]
 
-v_cmp_gt_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x48,0x7d]
+v_cmpx_lt_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x48,0x7d]
+v_cmpx_lt_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x62,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x49,0x7d]
+v_cmpx_lt_i64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x62,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x48,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x62,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x48,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x62,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x48,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x62,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x48,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x62,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x48,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x62,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x48,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x48,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x48,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x48,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0x48,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x48,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x48,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x62,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x48,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_lt_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0x62,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x48,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_lt_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x62,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x48,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_lt_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0x62,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0x48,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_lt_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x62,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x48,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_lt_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x62,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0x48,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_eq_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x64,0x7d]
 
-v_cmp_gt_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x48,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_eq_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x64,0x7d]
 
-v_cmp_gt_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x48,0xd1,0x00,0xfc,0x03,0x00]
+v_cmpx_eq_i64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x64,0x7d]
 
-v_cmp_ne_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x4a,0x7d]
+v_cmpx_eq_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x64,0x7d]
 
-v_cmp_ne_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x4a,0x7d]
+v_cmpx_eq_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x64,0x7d]
 
-v_cmp_ne_i64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x4a,0x7d]
+v_cmpx_eq_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x64,0x7d]
 
-v_cmp_ne_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x4a,0x7d]
+v_cmpx_eq_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x64,0x7d]
 
-v_cmp_ne_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x4a,0x7d]
+v_cmpx_eq_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x64,0x7d]
 
-v_cmp_ne_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x4a,0x7d]
+v_cmpx_eq_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x64,0x7d]
 
-v_cmp_ne_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x4a,0x7d]
+v_cmpx_eq_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x64,0x7d]
 
-v_cmp_ne_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x4a,0x7d]
+v_cmpx_eq_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x64,0x7d]
 
-v_cmp_ne_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x4a,0x7d]
+v_cmpx_eq_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x64,0x7d]
 
-v_cmp_ne_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x4a,0x7d]
+v_cmpx_eq_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x64,0x7d]
 
-v_cmp_ne_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x4a,0x7d]
+v_cmpx_eq_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x64,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_ne_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x4a,0x7d]
+v_cmpx_eq_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x64,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_ne_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x4a,0x7d]
+v_cmpx_eq_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x64,0x7d]
 
-v_cmp_ne_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x4a,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_eq_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x64,0x7d]
 
-v_cmp_ne_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x4a,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_eq_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x65,0x7d]
 
-v_cmp_ne_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x4a,0x7d]
+v_cmpx_eq_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x4a,0x7d]
+v_cmpx_eq_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x64,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x4b,0x7d]
+v_cmpx_eq_i64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x64,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x4a,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x64,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x4a,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x64,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x4a,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x64,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x4a,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x64,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x4a,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x64,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x4a,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x4a,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x4a,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x4a,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0x4a,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x4a,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x4a,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x64,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x4a,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_eq_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0x64,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x4a,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_eq_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x64,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x4a,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_eq_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0x64,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0x4a,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_eq_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x64,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x4a,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_eq_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x64,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0x4a,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_le_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x66,0x7d]
 
-v_cmp_ne_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x4a,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_le_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x66,0x7d]
 
-v_cmp_ne_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x4a,0xd1,0x00,0xfc,0x03,0x00]
+v_cmpx_le_i64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x66,0x7d]
 
-v_cmp_ge_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x4c,0x7d]
+v_cmpx_le_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x66,0x7d]
 
-v_cmp_ge_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x4c,0x7d]
+v_cmpx_le_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x66,0x7d]
 
-v_cmp_ge_i64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x4c,0x7d]
+v_cmpx_le_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x66,0x7d]
 
-v_cmp_ge_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x4c,0x7d]
+v_cmpx_le_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x66,0x7d]
 
-v_cmp_ge_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x4c,0x7d]
+v_cmpx_le_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x66,0x7d]
 
-v_cmp_ge_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x4c,0x7d]
+v_cmpx_le_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x66,0x7d]
 
-v_cmp_ge_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x4c,0x7d]
+v_cmpx_le_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x66,0x7d]
 
-v_cmp_ge_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x4c,0x7d]
+v_cmpx_le_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x66,0x7d]
 
-v_cmp_ge_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x4c,0x7d]
+v_cmpx_le_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x66,0x7d]
 
-v_cmp_ge_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x4c,0x7d]
+v_cmpx_le_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x66,0x7d]
 
-v_cmp_ge_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x4c,0x7d]
+v_cmpx_le_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x66,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_ge_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x4c,0x7d]
+v_cmpx_le_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x66,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_ge_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x4c,0x7d]
+v_cmpx_le_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x66,0x7d]
 
-v_cmp_ge_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x4c,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_le_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x66,0x7d]
 
-v_cmp_ge_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x4c,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_le_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x67,0x7d]
 
-v_cmp_ge_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x4c,0x7d]
+v_cmpx_le_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x4c,0x7d]
+v_cmpx_le_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x66,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x4d,0x7d]
+v_cmpx_le_i64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x66,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x4c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x66,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x4c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x66,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x4c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x66,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x4c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x66,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x4c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x66,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x4c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x4c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x4c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x4c,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0x4c,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x4c,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x4c,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x66,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x4c,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_le_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0x66,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x4c,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_le_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x66,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x4c,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_le_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0x66,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0x4c,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_le_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x66,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x4c,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_le_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x66,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0x4c,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_gt_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x68,0x7d]
 
-v_cmp_ge_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x4c,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_gt_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x68,0x7d]
 
-v_cmp_ge_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x4c,0xd1,0x00,0xfc,0x03,0x00]
+v_cmpx_gt_i64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x68,0x7d]
 
-v_cmp_t_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x4e,0x7d]
+v_cmpx_gt_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x68,0x7d]
 
-v_cmp_t_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x4e,0x7d]
+v_cmpx_gt_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x68,0x7d]
 
-v_cmp_t_i64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x4e,0x7d]
+v_cmpx_gt_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x68,0x7d]
 
-v_cmp_t_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x4e,0x7d]
+v_cmpx_gt_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x68,0x7d]
 
-v_cmp_t_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x4e,0x7d]
+v_cmpx_gt_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x68,0x7d]
 
-v_cmp_t_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x4e,0x7d]
+v_cmpx_gt_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x68,0x7d]
 
-v_cmp_t_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x4e,0x7d]
+v_cmpx_gt_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x68,0x7d]
 
-v_cmp_t_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x4e,0x7d]
+v_cmpx_gt_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x68,0x7d]
 
-v_cmp_t_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x4e,0x7d]
+v_cmpx_gt_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x68,0x7d]
 
-v_cmp_t_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x4e,0x7d]
+v_cmpx_gt_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x68,0x7d]
 
-v_cmp_t_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x4e,0x7d]
+v_cmpx_gt_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x68,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_t_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x4e,0x7d]
+v_cmpx_gt_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x68,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_t_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x4e,0x7d]
+v_cmpx_gt_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x68,0x7d]
 
-v_cmp_t_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x4e,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_gt_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x68,0x7d]
 
-v_cmp_t_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x4e,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_gt_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x69,0x7d]
 
-v_cmp_t_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x4e,0x7d]
+v_cmpx_gt_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_t_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x4e,0x7d]
+v_cmpx_gt_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x68,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_t_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x4f,0x7d]
+v_cmpx_gt_i64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x68,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x4e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x68,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x4e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x68,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x4e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x68,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x4e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x68,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x4e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x68,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x4e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x4e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x4e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x4e,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0x4e,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmp_t_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x4e,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmp_t_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x4e,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x68,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmp_t_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x4e,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_gt_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0x68,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmp_t_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x4e,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_gt_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x68,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmp_t_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x4e,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_gt_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0x68,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmp_t_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0x4e,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_gt_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x68,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmp_t_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x4e,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_gt_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x68,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmp_t_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0x4e,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_ne_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x6a,0x7d]
 
-v_cmp_t_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x4e,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_ne_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x6a,0x7d]
 
-v_cmp_t_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x4e,0xd1,0x00,0xfc,0x03,0x00]
+v_cmpx_ne_i64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x6a,0x7d]
 
-v_cmpx_f_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x60,0x7d]
+v_cmpx_ne_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x6a,0x7d]
 
-v_cmpx_f_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x60,0x7d]
+v_cmpx_ne_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x6a,0x7d]
 
-v_cmpx_f_i64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x60,0x7d]
+v_cmpx_ne_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x6a,0x7d]
 
-v_cmpx_f_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x60,0x7d]
+v_cmpx_ne_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x6a,0x7d]
 
-v_cmpx_f_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x60,0x7d]
+v_cmpx_ne_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x6a,0x7d]
 
-v_cmpx_f_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x60,0x7d]
+v_cmpx_ne_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x6a,0x7d]
 
-v_cmpx_f_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x60,0x7d]
+v_cmpx_ne_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x6a,0x7d]
 
-v_cmpx_f_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x60,0x7d]
+v_cmpx_ne_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x6a,0x7d]
 
-v_cmpx_f_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x60,0x7d]
+v_cmpx_ne_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x6a,0x7d]
 
-v_cmpx_f_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x60,0x7d]
+v_cmpx_ne_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x6a,0x7d]
 
-v_cmpx_f_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x60,0x7d]
+v_cmpx_ne_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x6a,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_f_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x60,0x7d]
+v_cmpx_ne_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x6a,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_f_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x60,0x7d]
+v_cmpx_ne_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x6a,0x7d]
 
-v_cmpx_f_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x60,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_ne_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x6a,0x7d]
 
-v_cmpx_f_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x60,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_ne_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x6b,0x7d]
 
-v_cmpx_f_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x60,0x7d]
+v_cmpx_ne_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x60,0x7d]
+v_cmpx_ne_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x6a,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x61,0x7d]
+v_cmpx_ne_i64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x6a,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x6a,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x60,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x6a,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x60,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x6a,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x60,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x6a,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x60,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x6a,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x60,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x60,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x60,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x6a,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_ne_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0x6a,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_ne_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x6a,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x60,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_ne_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0x6a,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0x60,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_ne_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x6a,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x60,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_ne_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x6a,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0x60,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_ge_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x6c,0x7d]
 
-v_cmpx_f_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x60,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_ge_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x6c,0x7d]
 
-v_cmpx_f_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x60,0xd1,0x00,0xfc,0x03,0x00]
+v_cmpx_ge_i64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x6c,0x7d]
 
-v_cmpx_lt_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x62,0x7d]
+v_cmpx_ge_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x6c,0x7d]
 
-v_cmpx_lt_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x62,0x7d]
+v_cmpx_ge_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x6c,0x7d]
 
-v_cmpx_lt_i64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x62,0x7d]
+v_cmpx_ge_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x6c,0x7d]
 
-v_cmpx_lt_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x62,0x7d]
+v_cmpx_ge_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x6c,0x7d]
 
-v_cmpx_lt_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x62,0x7d]
+v_cmpx_ge_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x6c,0x7d]
 
-v_cmpx_lt_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x62,0x7d]
+v_cmpx_ge_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x6c,0x7d]
 
-v_cmpx_lt_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x62,0x7d]
+v_cmpx_ge_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x6c,0x7d]
 
-v_cmpx_lt_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x62,0x7d]
+v_cmpx_ge_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x6c,0x7d]
 
-v_cmpx_lt_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x62,0x7d]
+v_cmpx_ge_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x6c,0x7d]
 
-v_cmpx_lt_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x62,0x7d]
+v_cmpx_ge_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x6c,0x7d]
 
-v_cmpx_lt_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x62,0x7d]
+v_cmpx_ge_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x6c,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_lt_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x62,0x7d]
+v_cmpx_ge_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x6c,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_lt_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x62,0x7d]
+v_cmpx_ge_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x6c,0x7d]
 
-v_cmpx_lt_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x62,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_ge_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x6c,0x7d]
 
-v_cmpx_lt_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x62,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_ge_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x6d,0x7d]
 
-v_cmpx_lt_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x62,0x7d]
+v_cmpx_ge_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x62,0x7d]
+v_cmpx_ge_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x6c,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x63,0x7d]
+v_cmpx_ge_i64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x6c,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x6c,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x62,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x6c,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x62,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x6c,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x62,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x6c,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x62,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x6c,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x62,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x62,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x62,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x6c,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_ge_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0x6c,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_ge_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x6c,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x62,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_ge_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0x6c,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0x62,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_ge_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x6c,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x62,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_ge_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x6c,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0x62,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_t_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0x6e,0x7d]
 
-v_cmpx_lt_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x62,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_t_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0x6e,0x7d]
 
-v_cmpx_lt_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x62,0xd1,0x00,0xfc,0x03,0x00]
+v_cmpx_t_i64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0x6e,0x7d]
 
-v_cmpx_eq_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x64,0x7d]
+v_cmpx_t_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0x6e,0x7d]
 
-v_cmpx_eq_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x64,0x7d]
+v_cmpx_t_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0x6e,0x7d]
 
-v_cmpx_eq_i64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x64,0x7d]
+v_cmpx_t_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0x6e,0x7d]
 
-v_cmpx_eq_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x64,0x7d]
+v_cmpx_t_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0x6e,0x7d]
 
-v_cmpx_eq_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x64,0x7d]
+v_cmpx_t_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0x6e,0x7d]
 
-v_cmpx_eq_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x64,0x7d]
+v_cmpx_t_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0x6e,0x7d]
 
-v_cmpx_eq_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x64,0x7d]
+v_cmpx_t_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0x6e,0x7d]
 
-v_cmpx_eq_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x64,0x7d]
+v_cmpx_t_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0x6e,0x7d]
 
-v_cmpx_eq_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x64,0x7d]
+v_cmpx_t_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0x6e,0x7d]
 
-v_cmpx_eq_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x64,0x7d]
+v_cmpx_t_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0x6e,0x7d]
 
-v_cmpx_eq_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x64,0x7d]
+v_cmpx_t_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0x6e,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_eq_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x64,0x7d]
+v_cmpx_t_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0x6e,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_eq_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x64,0x7d]
+v_cmpx_t_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0x6e,0x7d]
 
-v_cmpx_eq_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x64,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_t_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0x6e,0x7d]
 
-v_cmpx_eq_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x64,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_t_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0x6f,0x7d]
 
-v_cmpx_eq_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x64,0x7d]
+v_cmpx_t_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x64,0x7d]
+v_cmpx_t_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x6e,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x65,0x7d]
+v_cmpx_t_i64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x6e,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0x6e,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x64,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x6e,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x64,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x6e,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x64,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x6e,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x64,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x6e,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x64,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x64,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x64,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x6e,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_t_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0x6e,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_t_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x6e,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x64,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_t_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0x6e,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0x64,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_t_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x6e,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x64,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_t_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x6e,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0x64,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_f_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x80,0x7d]
 
-v_cmpx_eq_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x64,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_f_u32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x80,0x7d]
 
-v_cmpx_eq_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x64,0xd1,0x00,0xfc,0x03,0x00]
+v_cmp_f_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x80,0x7d]
 
-v_cmpx_le_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x66,0x7d]
+v_cmp_f_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x80,0x7d]
 
-v_cmpx_le_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x66,0x7d]
+v_cmp_f_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x80,0x7d]
 
-v_cmpx_le_i64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x66,0x7d]
+v_cmp_f_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x80,0x7d]
 
-v_cmpx_le_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x66,0x7d]
+v_cmp_f_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x80,0x7d]
 
-v_cmpx_le_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x66,0x7d]
+v_cmp_f_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x80,0x7d]
 
-v_cmpx_le_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x66,0x7d]
+v_cmp_f_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x80,0x7d]
 
-v_cmpx_le_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x66,0x7d]
+v_cmp_f_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x80,0x7d]
 
-v_cmpx_le_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x66,0x7d]
+v_cmp_f_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x80,0x7d]
 
-v_cmpx_le_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x66,0x7d]
+v_cmp_f_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x80,0x7d]
 
-v_cmpx_le_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x66,0x7d]
+v_cmp_f_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x80,0x7d]
 
-v_cmpx_le_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x66,0x7d]
+v_cmp_f_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x80,0x7d]
 
-v_cmpx_le_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x66,0x7d]
+v_cmp_f_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x80,0x7d]
 
-v_cmpx_le_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x66,0x7d]
+v_cmp_f_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x80,0x7d]
 
-v_cmpx_le_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x66,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_f_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x80,0x7d]
 
-v_cmpx_le_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x66,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_f_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x80,0x7d]
 
-v_cmpx_le_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x66,0x7d]
+v_cmp_f_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x80,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_le_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x66,0x7d]
+v_cmp_f_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x80,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_le_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x67,0x7d]
+v_cmp_f_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x80,0x7d]
 
-v_cmpx_le_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x80,0x7d]
 
-v_cmpx_le_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x66,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x81,0x7d]
 
-v_cmpx_le_i64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x66,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x66,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x80,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x66,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_u32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x80,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x66,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x80,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x66,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x80,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x66,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x80,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_f_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x80,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_f_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x80,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_f_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x80,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_f_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x80,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_f_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x80,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd1,0xfe,0x01,0x00,0x00]
+v_cmp_f_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x80,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x66,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_f_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x80,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0x66,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x66,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0x66,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x66,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x66,0xd1,0x00,0xfc,0x03,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmpx_gt_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x68,0x7d]
+v_cmp_f_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpx_gt_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x68,0x7d]
+v_cmp_f_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpx_gt_i64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x68,0x7d]
+v_cmp_f_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpx_gt_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x68,0x7d]
+v_cmp_f_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpx_gt_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x68,0x7d]
+v_cmp_f_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpx_gt_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x68,0x7d]
+v_cmp_f_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpx_gt_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x68,0x7d]
+v_cmp_f_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpx_gt_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x68,0x7d]
+v_cmp_f_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpx_gt_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x68,0x7d]
+v_cmp_f_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpx_gt_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x68,0x7d]
+v_cmp_f_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpx_gt_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x68,0x7d]
+v_cmp_f_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpx_gt_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x68,0x7d]
+v_cmp_f_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpx_gt_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x68,0x7d]
+v_cmp_f_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpx_gt_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x68,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_f_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x80,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmpx_gt_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x68,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_lt_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x82,0x7d]
 
-v_cmpx_gt_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x68,0x7d]
+v_cmp_lt_u32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x82,0x7d]
 
-v_cmpx_gt_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x68,0x7d]
+v_cmp_lt_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x82,0x7d]
 
-v_cmpx_gt_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x69,0x7d]
+v_cmp_lt_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x82,0x7d]
 
-v_cmpx_gt_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x82,0x7d]
 
-v_cmpx_gt_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x68,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x82,0x7d]
 
-v_cmpx_gt_i64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x68,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x82,0x7d]
 
-v_cmpx_gt_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x68,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x82,0x7d]
 
-v_cmpx_gt_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x68,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x82,0x7d]
 
-v_cmpx_gt_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x68,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x82,0x7d]
 
-v_cmpx_gt_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x68,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x82,0x7d]
 
-v_cmpx_gt_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x68,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x82,0x7d]
 
-v_cmpx_gt_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_lt_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x82,0x7d]
 
-v_cmpx_gt_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_lt_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x82,0x7d]
 
-v_cmpx_gt_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_lt_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x82,0x7d]
 
-v_cmpx_gt_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_lt_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x82,0x7d]
 
-v_cmpx_gt_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_lt_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x82,0x7d]
 
-v_cmpx_gt_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd1,0xfe,0x01,0x00,0x00]
+v_cmp_lt_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x82,0x7d]
 
-v_cmpx_gt_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x68,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_lt_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x82,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_gt_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0x68,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_lt_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x82,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_gt_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x68,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_lt_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x82,0x7d]
 
-v_cmpx_gt_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0x68,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_lt_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x82,0x7d]
 
-v_cmpx_gt_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x68,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_lt_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x83,0x7d]
 
-v_cmpx_gt_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x68,0xd1,0x00,0xfc,0x03,0x00]
+v_cmp_lt_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x6a,0x7d]
+v_cmp_lt_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x82,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x6a,0x7d]
+v_cmp_lt_u32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x82,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_i64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x6a,0x7d]
+v_cmp_lt_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x82,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x6a,0x7d]
+v_cmp_lt_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x82,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x6a,0x7d]
+v_cmp_lt_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x82,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x6a,0x7d]
+v_cmp_lt_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x82,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x6a,0x7d]
+v_cmp_lt_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x82,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x6a,0x7d]
+v_cmp_lt_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x82,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmpx_ne_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x6a,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x82,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmpx_ne_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x6a,0x7d]
+v_cmp_lt_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x82,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmpx_ne_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x6a,0x7d]
+v_cmp_lt_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x82,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmpx_ne_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x6a,0x7d]
+v_cmp_lt_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x82,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmpx_ne_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x6a,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmpx_ne_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x6a,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_lt_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmpx_ne_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x6a,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_lt_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmpx_ne_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x6a,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmpx_ne_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x6a,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmpx_ne_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x6b,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpx_ne_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpx_ne_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x6a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpx_ne_i64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x6a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpx_ne_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x6a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpx_ne_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x6a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpx_ne_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x6a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpx_ne_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x6a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpx_ne_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x6a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpx_ne_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_lt_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpx_ne_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_lt_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpx_ne_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_lt_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpx_ne_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_lt_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpx_ne_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_lt_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x82,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmpx_ne_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd1,0xfe,0x01,0x00,0x00]
+v_cmp_eq_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x84,0x7d]
 
-v_cmpx_ne_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x6a,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_eq_u32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x84,0x7d]
 
-v_cmpx_ne_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0x6a,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_eq_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x84,0x7d]
 
-v_cmpx_ne_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x6a,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_eq_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x84,0x7d]
 
-v_cmpx_ne_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0x6a,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_eq_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x84,0x7d]
 
-v_cmpx_ne_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_eq_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x84,0x7d]
 
-v_cmpx_ne_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x6a,0xd1,0x00,0xfc,0x03,0x00]
+v_cmp_eq_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x84,0x7d]
 
-v_cmpx_ge_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x6c,0x7d]
+v_cmp_eq_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x84,0x7d]
 
-v_cmpx_ge_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x6c,0x7d]
+v_cmp_eq_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x84,0x7d]
 
-v_cmpx_ge_i64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x6c,0x7d]
+v_cmp_eq_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x84,0x7d]
 
-v_cmpx_ge_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x6c,0x7d]
+v_cmp_eq_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x84,0x7d]
 
-v_cmpx_ge_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x6c,0x7d]
+v_cmp_eq_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x84,0x7d]
 
-v_cmpx_ge_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x6c,0x7d]
+v_cmp_eq_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x84,0x7d]
 
-v_cmpx_ge_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x6c,0x7d]
+v_cmp_eq_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x84,0x7d]
 
-v_cmpx_ge_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x6c,0x7d]
+v_cmp_eq_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x84,0x7d]
 
-v_cmpx_ge_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x6c,0x7d]
+v_cmp_eq_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x84,0x7d]
 
-v_cmpx_ge_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x6c,0x7d]
+v_cmp_eq_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x84,0x7d]
 
-v_cmpx_ge_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x6c,0x7d]
+v_cmp_eq_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x84,0x7d]
 
-v_cmpx_ge_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x6c,0x7d]
+v_cmp_eq_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x84,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ge_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x6c,0x7d]
+v_cmp_eq_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x84,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ge_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x6c,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_eq_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x84,0x7d]
 
-v_cmpx_ge_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x6c,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_eq_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x84,0x7d]
 
-v_cmpx_ge_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x6c,0x7d]
+v_cmp_eq_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x85,0x7d]
 
-v_cmpx_ge_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x6c,0x7d]
+v_cmp_eq_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x6d,0x7d]
+v_cmp_eq_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x84,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x84,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x6c,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x84,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x6c,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x84,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x6c,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x84,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x6c,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x84,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x6c,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x84,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x6c,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x84,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmpx_ge_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x6c,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x84,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x84,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x84,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x84,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd1,0xfe,0x01,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x6c,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0x6c,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x6c,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0x6c,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x6c,0xd1,0x00,0xfc,0x03,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpx_t_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x6e,0x7d]
+v_cmp_eq_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpx_t_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0x6e,0x7d]
+v_cmp_eq_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpx_t_i64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0x6e,0x7d]
+v_cmp_eq_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpx_t_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0x6e,0x7d]
+v_cmp_eq_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpx_t_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0x6e,0x7d]
+v_cmp_eq_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpx_t_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0x6e,0x7d]
+v_cmp_eq_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpx_t_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0x6e,0x7d]
+v_cmp_eq_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpx_t_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0x6e,0x7d]
+v_cmp_eq_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpx_t_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0x6e,0x7d]
+v_cmp_eq_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpx_t_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0x6e,0x7d]
+v_cmp_eq_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x84,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmpx_t_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0x6e,0x7d]
+v_cmp_le_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x86,0x7d]
 
-v_cmpx_t_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0x6e,0x7d]
+v_cmp_le_u32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x86,0x7d]
 
-v_cmpx_t_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0x6e,0x7d]
+v_cmp_le_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x86,0x7d]
 
-v_cmpx_t_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0x6e,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_le_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x86,0x7d]
 
-v_cmpx_t_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0x6e,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_le_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x86,0x7d]
 
-v_cmpx_t_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0x6e,0x7d]
+v_cmp_le_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x86,0x7d]
 
-v_cmpx_t_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0x6e,0x7d]
+v_cmp_le_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x86,0x7d]
 
-v_cmpx_t_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0x6f,0x7d]
+v_cmp_le_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x86,0x7d]
 
-v_cmpx_t_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x86,0x7d]
 
-v_cmpx_t_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x6e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x86,0x7d]
 
-v_cmpx_t_i64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x6e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x86,0x7d]
 
-v_cmpx_t_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0x6e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x86,0x7d]
 
-v_cmpx_t_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x6e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x86,0x7d]
 
-v_cmpx_t_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x6e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x86,0x7d]
 
-v_cmpx_t_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x6e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x86,0x7d]
 
-v_cmpx_t_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x6e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x86,0x7d]
 
-v_cmpx_t_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_le_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x86,0x7d]
 
-v_cmpx_t_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_le_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x86,0x7d]
 
-v_cmpx_t_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_le_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x86,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_t_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_le_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x86,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_t_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_le_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x86,0x7d]
 
-v_cmpx_t_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd1,0xfe,0x01,0x00,0x00]
+v_cmp_le_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x86,0x7d]
 
-v_cmpx_t_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x6e,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_le_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x87,0x7d]
 
-v_cmpx_t_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0x6e,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x6e,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_le_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x86,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0x6e,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_le_u32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x86,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_le_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x86,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x6e,0xd1,0x00,0xfc,0x03,0x00]
+v_cmp_le_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x86,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_f_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x80,0x7d]
+v_cmp_le_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x86,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_f_u32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x80,0x7d]
+v_cmp_le_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x86,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_f_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x80,0x7d]
+v_cmp_le_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x86,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_f_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x80,0x7d]
+v_cmp_le_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x86,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmp_f_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x80,0x7d]
+v_cmp_le_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x86,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmp_f_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x80,0x7d]
+v_cmp_le_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x86,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmp_f_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x80,0x7d]
+v_cmp_le_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x86,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmp_f_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x80,0x7d]
+v_cmp_le_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x86,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmp_f_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x80,0x7d]
+v_cmp_le_u32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmp_f_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x80,0x7d]
+v_cmp_le_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmp_f_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x80,0x7d]
+v_cmp_le_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmp_f_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x80,0x7d]
+v_cmp_le_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmp_f_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x80,0x7d]
+v_cmp_le_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmp_f_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x80,0x7d]
+v_cmp_le_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmp_f_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x80,0x7d]
+v_cmp_le_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmp_f_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x80,0x7d]
+v_cmp_le_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmp_f_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x80,0x7d]
+v_cmp_le_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmp_f_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x80,0x7d]
+v_cmp_le_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmp_f_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x80,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_le_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmp_f_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x80,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_le_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmp_f_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x80,0x7d]
+v_cmp_le_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmp_f_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x80,0x7d]
+v_cmp_le_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmp_f_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x81,0x7d]
+v_cmp_le_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmp_f_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x80,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmp_f_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x80,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmp_f_u32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x80,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmp_f_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x80,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x86,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmp_f_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x80,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x88,0x7d]
 
-v_cmp_f_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x80,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x88,0x7d]
 
-v_cmp_f_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x80,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x88,0x7d]
 
-v_cmp_f_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x80,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x88,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x80,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_gt_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x88,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x80,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_gt_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x88,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x80,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_gt_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x88,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x80,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_gt_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x88,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x80,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_gt_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x88,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x80,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_gt_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x88,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x80,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_gt_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x88,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x80,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_gt_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x88,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x80,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_gt_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x88,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x80,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_gt_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x88,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x80,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_gt_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x88,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x80,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_gt_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x88,0x7d]
 
-v_cmp_lt_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x82,0x7d]
+v_cmp_gt_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x88,0x7d]
 
-v_cmp_lt_u32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x82,0x7d]
+v_cmp_gt_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x88,0x7d]
 
-v_cmp_lt_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x82,0x7d]
+v_cmp_gt_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x88,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_lt_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x82,0x7d]
+v_cmp_gt_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x88,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_lt_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x82,0x7d]
+v_cmp_gt_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x88,0x7d]
 
-v_cmp_lt_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x82,0x7d]
+v_cmp_gt_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x88,0x7d]
 
-v_cmp_lt_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x82,0x7d]
+v_cmp_gt_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x89,0x7d]
 
-v_cmp_lt_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x82,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x82,0x7d]
+v_cmp_gt_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x88,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x82,0x7d]
+v_cmp_gt_u32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x88,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x82,0x7d]
+v_cmp_gt_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x88,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x82,0x7d]
+v_cmp_gt_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x88,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x82,0x7d]
+v_cmp_gt_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x88,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x82,0x7d]
+v_cmp_gt_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x88,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x82,0x7d]
+v_cmp_gt_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x88,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x82,0x7d]
+v_cmp_gt_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x88,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x82,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x88,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x82,0x7d]
+v_cmp_gt_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x88,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x82,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_gt_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x88,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmp_lt_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x82,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_gt_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x88,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmp_lt_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x82,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmp_lt_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x82,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmp_lt_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x83,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmp_lt_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x82,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmp_lt_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x82,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmp_lt_u32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x82,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmp_lt_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x82,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmp_lt_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x82,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmp_lt_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x82,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmp_lt_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x82,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmp_lt_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x82,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmp_lt_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x82,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmp_lt_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x82,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmp_lt_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x82,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmp_lt_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x82,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmp_lt_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x82,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmp_lt_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x82,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmp_lt_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x82,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmp_lt_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x82,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x88,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmp_lt_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x82,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_ne_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x8a,0x7d]
 
-v_cmp_lt_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x82,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_ne_u32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x8a,0x7d]
 
-v_cmp_lt_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x82,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_ne_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x8a,0x7d]
 
-v_cmp_lt_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x82,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_ne_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x8a,0x7d]
 
-v_cmp_eq_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x84,0x7d]
+v_cmp_ne_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x8a,0x7d]
 
-v_cmp_eq_u32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x84,0x7d]
+v_cmp_ne_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x8a,0x7d]
 
-v_cmp_eq_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x84,0x7d]
+v_cmp_ne_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x8a,0x7d]
 
-v_cmp_eq_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x84,0x7d]
+v_cmp_ne_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x8a,0x7d]
 
-v_cmp_eq_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x84,0x7d]
+v_cmp_ne_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x8a,0x7d]
 
-v_cmp_eq_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x84,0x7d]
+v_cmp_ne_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x8a,0x7d]
 
-v_cmp_eq_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x84,0x7d]
+v_cmp_ne_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x8a,0x7d]
 
-v_cmp_eq_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x84,0x7d]
+v_cmp_ne_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x8a,0x7d]
 
-v_cmp_eq_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x84,0x7d]
+v_cmp_ne_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x8a,0x7d]
 
-v_cmp_eq_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x84,0x7d]
+v_cmp_ne_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x8a,0x7d]
 
-v_cmp_eq_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x84,0x7d]
+v_cmp_ne_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x8a,0x7d]
 
-v_cmp_eq_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x84,0x7d]
+v_cmp_ne_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x8a,0x7d]
 
-v_cmp_eq_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x84,0x7d]
+v_cmp_ne_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x8a,0x7d]
 
-v_cmp_eq_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x84,0x7d]
+v_cmp_ne_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x8a,0x7d]
 
-v_cmp_eq_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x84,0x7d]
+v_cmp_ne_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x8a,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_eq_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x84,0x7d]
+v_cmp_ne_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x8a,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_eq_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x84,0x7d]
+v_cmp_ne_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x8a,0x7d]
 
-v_cmp_eq_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x84,0x7d]
+v_cmp_ne_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x8a,0x7d]
 
-v_cmp_eq_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x84,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_ne_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x8b,0x7d]
 
-v_cmp_eq_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x84,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_ne_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x84,0x7d]
+v_cmp_ne_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x8a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x84,0x7d]
+v_cmp_ne_u32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x8a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x85,0x7d]
+v_cmp_ne_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x8a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x84,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x8a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x84,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x8a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x84,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x8a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x84,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x8a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x84,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x8a,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmp_eq_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x84,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x8a,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmp_eq_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x84,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x8a,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmp_eq_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x84,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x84,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_ne_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x8a,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x84,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_ne_u32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x84,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_ne_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x84,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_ne_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x84,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_ne_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x84,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_ne_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x84,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_ne_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x84,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_ne_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x84,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_ne_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x84,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_ne_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x84,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_ne_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x84,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_ne_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmp_le_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x86,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmp_le_u32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x86,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmp_le_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x86,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmp_le_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x86,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmp_le_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x86,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmp_le_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x86,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmp_le_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x86,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmp_le_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x86,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x8a,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmp_le_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x86,0x7d]
+v_cmp_ge_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x8c,0x7d]
 
-v_cmp_le_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x86,0x7d]
+v_cmp_ge_u32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x8c,0x7d]
 
-v_cmp_le_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x86,0x7d]
+v_cmp_ge_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x8c,0x7d]
 
-v_cmp_le_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x86,0x7d]
+v_cmp_ge_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x8c,0x7d]
 
-v_cmp_le_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x86,0x7d]
+v_cmp_ge_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x8c,0x7d]
 
-v_cmp_le_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x86,0x7d]
+v_cmp_ge_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x8c,0x7d]
 
-v_cmp_le_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x86,0x7d]
+v_cmp_ge_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x8c,0x7d]
 
-v_cmp_le_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x86,0x7d]
+v_cmp_ge_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x8c,0x7d]
 
-v_cmp_le_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x86,0x7d]
+v_cmp_ge_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x8c,0x7d]
 
-v_cmp_le_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x86,0x7d]
+v_cmp_ge_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x8c,0x7d]
 
-v_cmp_le_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x86,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_ge_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x8c,0x7d]
 
-v_cmp_le_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x86,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_ge_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x8c,0x7d]
 
-v_cmp_le_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x86,0x7d]
+v_cmp_ge_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x8c,0x7d]
 
-v_cmp_le_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x86,0x7d]
+v_cmp_ge_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x8c,0x7d]
 
-v_cmp_le_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x87,0x7d]
+v_cmp_ge_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x8c,0x7d]
 
-v_cmp_le_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x86,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x8c,0x7d]
 
-v_cmp_le_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x86,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x8c,0x7d]
 
-v_cmp_le_u32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x86,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x8c,0x7d]
 
-v_cmp_le_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x86,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x8c,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_le_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x86,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x8c,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_le_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x86,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x8c,0x7d]
 
-v_cmp_le_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x86,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x8c,0x7d]
 
-v_cmp_le_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x86,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x8d,0x7d]
 
-v_cmp_le_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x86,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_ge_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x86,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_ge_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x8c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x86,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_ge_u32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x8c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x86,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_ge_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x8c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x86,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_ge_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x8c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x86,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_ge_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x8c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x86,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_ge_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x8c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x86,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_ge_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x8c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x86,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_ge_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x8c,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x86,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_ge_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x8c,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x86,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_ge_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x8c,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x86,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_ge_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmp_gt_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x88,0x7d]
+v_cmp_ge_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x8c,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmp_gt_u32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x88,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmp_gt_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x88,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmp_gt_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x88,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmp_gt_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x88,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmp_gt_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x88,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmp_gt_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x88,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmp_gt_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x88,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmp_gt_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x88,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmp_gt_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x88,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmp_gt_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x88,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmp_gt_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x88,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmp_gt_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x88,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmp_gt_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x88,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmp_gt_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x88,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmp_gt_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x88,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmp_gt_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x88,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmp_gt_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x88,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmp_gt_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x88,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_ge_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmp_gt_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x88,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_ge_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x8c,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmp_gt_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x88,0x7d]
+v_cmp_t_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x8e,0x7d]
 
-v_cmp_gt_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x88,0x7d]
+v_cmp_t_u32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x8e,0x7d]
 
-v_cmp_gt_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x89,0x7d]
+v_cmp_t_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x8e,0x7d]
 
-v_cmp_gt_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x88,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x8e,0x7d]
 
-v_cmp_gt_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x88,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x8e,0x7d]
 
-v_cmp_gt_u32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x88,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x8e,0x7d]
 
-v_cmp_gt_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x88,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x8e,0x7d]
 
-v_cmp_gt_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x88,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x8e,0x7d]
 
-v_cmp_gt_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x88,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x8e,0x7d]
 
-v_cmp_gt_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x88,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x8e,0x7d]
 
-v_cmp_gt_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x88,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x8e,0x7d]
 
-v_cmp_gt_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x88,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_t_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x8e,0x7d]
 
-v_cmp_gt_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x88,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_t_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x8e,0x7d]
 
-v_cmp_gt_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x88,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_t_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x8e,0x7d]
 
-v_cmp_gt_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x88,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_t_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x8e,0x7d]
 
-v_cmp_gt_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x88,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_t_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x8e,0x7d]
 
-v_cmp_gt_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x88,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_t_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x8e,0x7d]
 
-v_cmp_gt_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x88,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_t_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x8e,0x7d]
 
-v_cmp_gt_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x88,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_t_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x8e,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_gt_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x88,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_t_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x8e,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_gt_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x88,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_t_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x8e,0x7d]
 
-v_cmp_gt_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x88,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_t_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x8e,0x7d]
 
-v_cmp_gt_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x88,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_t_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x8f,0x7d]
 
-v_cmp_ne_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x8a,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_u32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x8a,0x7d]
+v_cmp_t_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x8e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x8a,0x7d]
+v_cmp_t_u32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0x8e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x8a,0x7d]
+v_cmp_t_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0x8e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x8a,0x7d]
+v_cmp_t_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x8e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x8a,0x7d]
+v_cmp_t_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x8e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x8a,0x7d]
+v_cmp_t_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x8e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x8a,0x7d]
+v_cmp_t_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x8e,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x8a,0x7d]
+v_cmp_t_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0x8e,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmp_ne_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x8a,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x8e,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmp_ne_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x8a,0x7d]
+v_cmp_t_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0x8e,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmp_ne_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x8a,0x7d]
+v_cmp_t_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmp_ne_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x8a,0x7d]
+v_cmp_t_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x8e,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmp_ne_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x8a,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmp_ne_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x8a,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmp_ne_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x8a,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmp_ne_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x8a,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmp_ne_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x8a,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmp_ne_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x8a,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_t_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmp_ne_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x8a,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_t_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmp_ne_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x8a,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmp_ne_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x8a,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmp_ne_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x8b,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmp_ne_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x8a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmp_ne_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x8a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmp_ne_u32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x8a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmp_ne_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x8a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmp_ne_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x8a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmp_ne_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x8a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmp_ne_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x8a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmp_ne_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x8a,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmp_ne_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x8a,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_t_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x8e,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmp_ne_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x8a,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_f_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa0,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x8a,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_f_u32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xa0,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x8a,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_f_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xa0,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x8a,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_f_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xa0,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x8a,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_f_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa0,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x8a,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_f_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa0,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x8a,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_f_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa0,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x8a,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_f_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa0,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x8a,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_f_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa0,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x8a,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_f_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa0,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x8a,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_f_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa0,0x7d]
 
-v_cmp_ge_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x8c,0x7d]
+v_cmpx_f_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa0,0x7d]
 
-v_cmp_ge_u32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x8c,0x7d]
+v_cmpx_f_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa0,0x7d]
 
-v_cmp_ge_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x8c,0x7d]
+v_cmpx_f_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa0,0x7d]
 
-v_cmp_ge_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x8c,0x7d]
+v_cmpx_f_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa0,0x7d]
 
-v_cmp_ge_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x8c,0x7d]
+v_cmpx_f_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa0,0x7d]
 
-v_cmp_ge_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x8c,0x7d]
+v_cmpx_f_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa0,0x7d]
 
-v_cmp_ge_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x8c,0x7d]
+v_cmpx_f_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa0,0x7d]
 
-v_cmp_ge_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x8c,0x7d]
+v_cmpx_f_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa0,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_ge_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x8c,0x7d]
+v_cmpx_f_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa0,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_ge_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x8c,0x7d]
+v_cmpx_f_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa0,0x7d]
 
-v_cmp_ge_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x8c,0x7d]
+v_cmpx_f_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa0,0x7d]
 
-v_cmp_ge_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x8c,0x7d]
+v_cmpx_f_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa1,0x7d]
 
-v_cmp_ge_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x8c,0x7d]
+v_cmpx_f_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x8c,0x7d]
+v_cmpx_f_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa0,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x8c,0x7d]
+v_cmpx_f_u32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xa0,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x8c,0x7d]
+v_cmpx_f_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xa0,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x8c,0x7d]
+v_cmpx_f_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa0,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x8c,0x7d]
+v_cmpx_f_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa0,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x8c,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_f_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa0,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x8c,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_f_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa0,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x8c,0x7d]
+v_cmpx_f_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xa0,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x8c,0x7d]
+v_cmpx_f_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa0,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x8d,0x7d]
+v_cmpx_f_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xa0,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x8c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmp_ge_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x8c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa0,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmp_ge_u32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x8c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmp_ge_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x8c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmp_ge_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x8c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmp_ge_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x8c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmp_ge_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x8c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmp_ge_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x8c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x8c,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x8c,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x8c,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x8c,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x8c,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x8c,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x8c,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x8c,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x8c,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x8c,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x8c,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x8c,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmp_t_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x8e,0x7d]
+v_cmpx_f_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa0,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmp_t_u32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x8e,0x7d]
+v_cmpx_lt_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa2,0x7d]
 
-v_cmp_t_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x8e,0x7d]
+v_cmpx_lt_u32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xa2,0x7d]
 
-v_cmp_t_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x8e,0x7d]
+v_cmpx_lt_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xa2,0x7d]
 
-v_cmp_t_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x8e,0x7d]
+v_cmpx_lt_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xa2,0x7d]
 
-v_cmp_t_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x8e,0x7d]
+v_cmpx_lt_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa2,0x7d]
 
-v_cmp_t_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x8e,0x7d]
+v_cmpx_lt_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa2,0x7d]
 
-v_cmp_t_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x8e,0x7d]
+v_cmpx_lt_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa2,0x7d]
 
-v_cmp_t_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x8e,0x7d]
+v_cmpx_lt_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa2,0x7d]
 
-v_cmp_t_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x8e,0x7d]
+v_cmpx_lt_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa2,0x7d]
 
-v_cmp_t_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x8e,0x7d]
+v_cmpx_lt_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa2,0x7d]
 
-v_cmp_t_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x8e,0x7d]
+v_cmpx_lt_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa2,0x7d]
 
-v_cmp_t_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x8e,0x7d]
+v_cmpx_lt_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa2,0x7d]
 
-v_cmp_t_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x8e,0x7d]
+v_cmpx_lt_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa2,0x7d]
 
-v_cmp_t_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x8e,0x7d]
+v_cmpx_lt_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa2,0x7d]
 
-v_cmp_t_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x8e,0x7d]
+v_cmpx_lt_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa2,0x7d]
 
-v_cmp_t_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x8e,0x7d]
+v_cmpx_lt_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa2,0x7d]
 
-v_cmp_t_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x8e,0x7d]
+v_cmpx_lt_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa2,0x7d]
 
-v_cmp_t_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x8e,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_lt_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa2,0x7d]
 
-v_cmp_t_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x8e,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_lt_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa2,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_t_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x8e,0x7d]
+v_cmpx_lt_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa2,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_t_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x8e,0x7d]
+v_cmpx_lt_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa2,0x7d]
 
-v_cmp_t_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x8f,0x7d]
+v_cmpx_lt_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa2,0x7d]
 
-v_cmp_t_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x8e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa3,0x7d]
 
-v_cmp_t_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x8e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0x8e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa2,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0x8e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xa2,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x8e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xa2,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x8e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa2,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x8e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa2,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x8e,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa2,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x8e,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa2,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x8e,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xa2,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x8e,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa2,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x8e,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xa2,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x8e,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x8e,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa2,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x8e,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x8e,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x8e,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x8e,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x8e,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x8e,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpx_f_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa0,0x7d]
+v_cmpx_lt_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpx_f_u32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xa0,0x7d]
+v_cmpx_lt_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpx_f_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xa0,0x7d]
+v_cmpx_lt_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpx_f_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xa0,0x7d]
+v_cmpx_lt_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpx_f_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa0,0x7d]
+v_cmpx_lt_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpx_f_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa0,0x7d]
+v_cmpx_lt_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpx_f_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa0,0x7d]
+v_cmpx_lt_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpx_f_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa0,0x7d]
+v_cmpx_lt_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpx_f_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa0,0x7d]
+v_cmpx_lt_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpx_f_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa0,0x7d]
+v_cmpx_lt_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpx_f_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa0,0x7d]
+v_cmpx_lt_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpx_f_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa0,0x7d]
+v_cmpx_lt_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpx_f_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa0,0x7d]
+v_cmpx_lt_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa2,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmpx_f_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa0,0x7d]
+v_cmpx_eq_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa4,0x7d]
 
-v_cmpx_f_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa0,0x7d]
+v_cmpx_eq_u32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xa4,0x7d]
 
-v_cmpx_f_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa0,0x7d]
+v_cmpx_eq_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xa4,0x7d]
 
-v_cmpx_f_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa0,0x7d]
+v_cmpx_eq_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xa4,0x7d]
 
-v_cmpx_f_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa0,0x7d]
+v_cmpx_eq_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa4,0x7d]
 
-v_cmpx_f_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa0,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_eq_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa4,0x7d]
 
-v_cmpx_f_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa0,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_eq_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa4,0x7d]
 
-v_cmpx_f_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa0,0x7d]
+v_cmpx_eq_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa4,0x7d]
 
-v_cmpx_f_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa0,0x7d]
+v_cmpx_eq_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa4,0x7d]
 
-v_cmpx_f_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa1,0x7d]
+v_cmpx_eq_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa4,0x7d]
 
-v_cmpx_f_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa4,0x7d]
 
-v_cmpx_f_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa4,0x7d]
 
-v_cmpx_f_u32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xa0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa4,0x7d]
 
-v_cmpx_f_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xa0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa4,0x7d]
 
-v_cmpx_f_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa4,0x7d]
 
-v_cmpx_f_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa4,0x7d]
 
-v_cmpx_f_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa4,0x7d]
 
-v_cmpx_f_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa4,0x7d]
 
-v_cmpx_f_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa0,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_eq_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa4,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_f_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xa0,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_eq_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa4,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_f_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xa0,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_eq_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa4,0x7d]
 
-v_cmpx_f_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xa0,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_eq_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa4,0x7d]
 
-v_cmpx_f_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa0,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_eq_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa5,0x7d]
 
-v_cmpx_f_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa0,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_eq_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa0,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_eq_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa4,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xa0,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_eq_u32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xa4,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xa0,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_eq_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xa4,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xa0,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_eq_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa4,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa0,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_eq_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa4,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa0,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_eq_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa4,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa2,0x7d]
+v_cmpx_eq_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa4,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xa2,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xa4,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xa2,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa4,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xa2,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xa4,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa2,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa2,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa4,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa2,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa2,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa2,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa2,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa2,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa2,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa2,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa2,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa2,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa2,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa2,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa2,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa2,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_eq_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpx_lt_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa2,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_eq_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpx_lt_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa2,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpx_lt_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa2,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpx_lt_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa3,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpx_lt_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpx_lt_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa4,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmpx_lt_u32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xa2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa6,0x7d]
 
-v_cmpx_lt_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xa2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xa6,0x7d]
 
-v_cmpx_lt_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xa6,0x7d]
 
-v_cmpx_lt_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xa6,0x7d]
 
-v_cmpx_lt_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa6,0x7d]
 
-v_cmpx_lt_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa6,0x7d]
 
-v_cmpx_lt_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa2,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa6,0x7d]
 
-v_cmpx_lt_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xa2,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa6,0x7d]
 
-v_cmpx_lt_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xa2,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa6,0x7d]
 
-v_cmpx_lt_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xa2,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa6,0x7d]
 
-v_cmpx_lt_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa2,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_le_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa6,0x7d]
 
-v_cmpx_lt_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa2,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_le_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa6,0x7d]
 
-v_cmpx_lt_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa2,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_le_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa6,0x7d]
 
-v_cmpx_lt_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xa2,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_le_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa6,0x7d]
 
-v_cmpx_lt_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xa2,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_le_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa6,0x7d]
 
-v_cmpx_lt_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xa2,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_le_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa6,0x7d]
 
-v_cmpx_lt_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa2,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_le_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa6,0x7d]
 
-v_cmpx_lt_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa2,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_le_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa6,0x7d]
 
-v_cmpx_eq_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa4,0x7d]
+v_cmpx_le_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa6,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_eq_u32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xa4,0x7d]
+v_cmpx_le_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa6,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_eq_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xa4,0x7d]
+v_cmpx_le_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa6,0x7d]
 
-v_cmpx_eq_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xa4,0x7d]
+v_cmpx_le_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa6,0x7d]
 
-v_cmpx_eq_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa4,0x7d]
+v_cmpx_le_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa7,0x7d]
 
-v_cmpx_eq_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa4,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa4,0x7d]
+v_cmpx_le_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa6,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa4,0x7d]
+v_cmpx_le_u32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xa6,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa4,0x7d]
+v_cmpx_le_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xa6,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa4,0x7d]
+v_cmpx_le_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa6,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa4,0x7d]
+v_cmpx_le_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa6,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa4,0x7d]
+v_cmpx_le_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa6,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa4,0x7d]
+v_cmpx_le_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa6,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa4,0x7d]
+v_cmpx_le_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xa6,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa4,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa6,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa4,0x7d]
+v_cmpx_le_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xa6,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa4,0x7d]
+v_cmpx_le_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa4,0x7d]
+v_cmpx_le_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa6,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa4,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_le_u32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa4,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_le_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa4,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa4,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa5,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmpx_eq_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa4,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpx_eq_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa4,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpx_eq_u32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xa4,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpx_eq_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xa4,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpx_eq_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa4,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpx_eq_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa4,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpx_eq_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa4,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpx_eq_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa4,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpx_eq_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa4,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_le_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpx_eq_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xa4,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_le_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpx_eq_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xa4,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_le_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpx_eq_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xa4,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_le_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpx_eq_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa4,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_le_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpx_eq_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa4,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_le_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa6,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmpx_eq_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa4,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_gt_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa8,0x7d]
 
-v_cmpx_eq_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xa4,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_gt_u32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xa8,0x7d]
 
-v_cmpx_eq_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xa4,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_gt_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xa8,0x7d]
 
-v_cmpx_eq_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xa4,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_gt_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xa8,0x7d]
 
-v_cmpx_eq_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa4,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_gt_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa8,0x7d]
 
-v_cmpx_eq_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa4,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_gt_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa8,0x7d]
 
-v_cmpx_le_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa6,0x7d]
+v_cmpx_gt_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa8,0x7d]
 
-v_cmpx_le_u32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xa6,0x7d]
+v_cmpx_gt_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa8,0x7d]
 
-v_cmpx_le_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xa6,0x7d]
+v_cmpx_gt_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa8,0x7d]
 
-v_cmpx_le_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xa6,0x7d]
+v_cmpx_gt_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa8,0x7d]
 
-v_cmpx_le_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa6,0x7d]
+v_cmpx_gt_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa8,0x7d]
 
-v_cmpx_le_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa6,0x7d]
+v_cmpx_gt_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa8,0x7d]
 
-v_cmpx_le_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa6,0x7d]
+v_cmpx_gt_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa8,0x7d]
 
-v_cmpx_le_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa6,0x7d]
+v_cmpx_gt_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa8,0x7d]
 
-v_cmpx_le_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa6,0x7d]
+v_cmpx_gt_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa8,0x7d]
 
-v_cmpx_le_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa6,0x7d]
+v_cmpx_gt_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa8,0x7d]
 
-v_cmpx_le_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa6,0x7d]
+v_cmpx_gt_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa8,0x7d]
 
-v_cmpx_le_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa6,0x7d]
+v_cmpx_gt_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa8,0x7d]
 
-v_cmpx_le_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa6,0x7d]
+v_cmpx_gt_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa8,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_le_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa6,0x7d]
+v_cmpx_gt_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa8,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_le_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa6,0x7d]
+v_cmpx_gt_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa8,0x7d]
 
-v_cmpx_le_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa6,0x7d]
+v_cmpx_gt_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa8,0x7d]
 
-v_cmpx_le_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa6,0x7d]
+v_cmpx_gt_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa9,0x7d]
 
-v_cmpx_le_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa6,0x7d]
+v_cmpx_gt_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa6,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_gt_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa8,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa6,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_gt_u32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xa8,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa6,0x7d]
+v_cmpx_gt_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xa8,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa6,0x7d]
+v_cmpx_gt_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa8,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa7,0x7d]
+v_cmpx_gt_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa8,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa6,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa8,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa6,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa8,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_u32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xa6,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xa8,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmpx_le_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xa6,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa8,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmpx_le_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa6,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xa8,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmpx_le_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa6,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmpx_le_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa6,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa8,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmpx_le_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa6,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmpx_le_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa6,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmpx_le_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xa6,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmpx_le_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xa6,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmpx_le_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xa6,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmpx_le_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa6,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpx_le_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa6,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpx_le_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa6,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpx_le_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xa6,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpx_le_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xa6,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpx_le_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xa6,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpx_le_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa6,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpx_le_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa6,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpx_gt_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa8,0x7d]
+v_cmpx_gt_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpx_gt_u32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xa8,0x7d]
+v_cmpx_gt_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpx_gt_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xa8,0x7d]
+v_cmpx_gt_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpx_gt_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xa8,0x7d]
+v_cmpx_gt_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpx_gt_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa8,0x7d]
+v_cmpx_gt_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpx_gt_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa8,0x7d]
+v_cmpx_gt_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa8,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmpx_gt_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa8,0x7d]
+v_cmpx_ne_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xaa,0x7d]
 
-v_cmpx_gt_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa8,0x7d]
+v_cmpx_ne_u32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xaa,0x7d]
 
-v_cmpx_gt_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa8,0x7d]
+v_cmpx_ne_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xaa,0x7d]
 
-v_cmpx_gt_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa8,0x7d]
+v_cmpx_ne_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xaa,0x7d]
 
-v_cmpx_gt_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa8,0x7d]
+v_cmpx_ne_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xaa,0x7d]
 
-v_cmpx_gt_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa8,0x7d]
+v_cmpx_ne_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xaa,0x7d]
 
-v_cmpx_gt_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa8,0x7d]
+v_cmpx_ne_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xaa,0x7d]
 
-v_cmpx_gt_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa8,0x7d]
+v_cmpx_ne_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xaa,0x7d]
 
-v_cmpx_gt_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa8,0x7d]
+v_cmpx_ne_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xaa,0x7d]
 
-v_cmpx_gt_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa8,0x7d]
+v_cmpx_ne_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xaa,0x7d]
 
-v_cmpx_gt_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa8,0x7d]
+v_cmpx_ne_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xaa,0x7d]
 
-v_cmpx_gt_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa8,0x7d]
+v_cmpx_ne_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xaa,0x7d]
 
-v_cmpx_gt_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa8,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_ne_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xaa,0x7d]
 
-v_cmpx_gt_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa8,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_ne_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xaa,0x7d]
 
-v_cmpx_gt_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa8,0x7d]
+v_cmpx_ne_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xaa,0x7d]
 
-v_cmpx_gt_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa8,0x7d]
+v_cmpx_ne_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xaa,0x7d]
 
-v_cmpx_gt_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa9,0x7d]
+v_cmpx_ne_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xaa,0x7d]
 
-v_cmpx_gt_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa8,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xaa,0x7d]
 
-v_cmpx_gt_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa8,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xaa,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_gt_u32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xa8,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xaa,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_gt_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xa8,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xaa,0x7d]
 
-v_cmpx_gt_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa8,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xaa,0x7d]
 
-v_cmpx_gt_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa8,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xab,0x7d]
 
-v_cmpx_gt_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa8,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa8,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xaa,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa8,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_ne_u32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xaa,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xa8,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_ne_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xaa,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xa8,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_ne_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xaa,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xa8,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_ne_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xaa,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa8,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_ne_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xaa,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa8,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_ne_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xaa,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa8,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_ne_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xaa,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmpx_gt_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xa8,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_ne_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xaa,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmpx_gt_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xa8,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_ne_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xaa,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmpx_gt_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xa8,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_ne_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmpx_gt_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa8,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_ne_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xaa,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmpx_gt_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa8,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_ne_u32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xaa,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xaa,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xaa,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xaa,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xaa,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xaa,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xaa,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xaa,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xaa,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xaa,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xaa,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xaa,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xaa,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpx_ne_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xaa,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpx_ne_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xaa,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpx_ne_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xaa,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpx_ne_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xaa,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpx_ne_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xaa,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xaa,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmpx_ne_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xaa,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_ge_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xac,0x7d]
 
-v_cmpx_ne_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xaa,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_ge_u32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xac,0x7d]
 
-v_cmpx_ne_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xaa,0x7d]
+v_cmpx_ge_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xac,0x7d]
 
-v_cmpx_ne_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xaa,0x7d]
+v_cmpx_ge_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xac,0x7d]
 
-v_cmpx_ne_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xab,0x7d]
+v_cmpx_ge_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xac,0x7d]
 
-v_cmpx_ne_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xaa,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xac,0x7d]
 
-v_cmpx_ne_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xaa,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xac,0x7d]
 
-v_cmpx_ne_u32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xaa,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xac,0x7d]
 
-v_cmpx_ne_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xaa,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xac,0x7d]
 
-v_cmpx_ne_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xaa,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xac,0x7d]
 
-v_cmpx_ne_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xaa,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xac,0x7d]
 
-v_cmpx_ne_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xaa,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xac,0x7d]
 
-v_cmpx_ne_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xaa,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xac,0x7d]
 
-v_cmpx_ne_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xaa,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_ge_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xac,0x7d]
 
-v_cmpx_ne_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xaa,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_ge_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xac,0x7d]
 
-v_cmpx_ne_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xaa,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_ge_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xac,0x7d]
 
-v_cmpx_ne_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xaa,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_ge_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xac,0x7d]
 
-v_cmpx_ne_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xaa,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_ge_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xac,0x7d]
 
-v_cmpx_ne_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xaa,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_ge_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xac,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ne_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xaa,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_ge_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xac,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ne_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xaa,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_ge_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xac,0x7d]
 
-v_cmpx_ne_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xaa,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_ge_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xac,0x7d]
 
-v_cmpx_ne_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xaa,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_ge_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xad,0x7d]
 
-v_cmpx_ne_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xaa,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_ge_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xaa,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_ge_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xac,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xac,0x7d]
+v_cmpx_ge_u32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xac,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xac,0x7d]
+v_cmpx_ge_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xac,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xac,0x7d]
+v_cmpx_ge_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xac,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xac,0x7d]
+v_cmpx_ge_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xac,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xac,0x7d]
+v_cmpx_ge_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xac,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xac,0x7d]
+v_cmpx_ge_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xac,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xac,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xac,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xac,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xac,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xac,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xac,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xac,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xac,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xac,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xac,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xac,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xac,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xac,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xac,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xac,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xac,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xac,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xac,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_ge_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xac,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_ge_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xac,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xac,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xad,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpx_ge_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xac,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpx_ge_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xac,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpx_ge_u32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xac,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpx_ge_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xac,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpx_ge_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xac,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpx_ge_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xac,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpx_ge_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xac,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xac,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmpx_ge_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xac,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xae,0x7d]
 
-v_cmpx_ge_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xac,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_t_u32 vcc, s103, v2
+// CHECK: [0x67,0x04,0xae,0x7d]
 
-v_cmpx_ge_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xac,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_t_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0xae,0x7d]
 
-v_cmpx_ge_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xac,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_t_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0xae,0x7d]
 
-v_cmpx_ge_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xac,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_t_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xae,0x7d]
 
-v_cmpx_ge_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xac,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_t_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xae,0x7d]
 
-v_cmpx_ge_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xac,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_t_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xae,0x7d]
 
-v_cmpx_ge_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xac,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_t_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xae,0x7d]
 
-v_cmpx_ge_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xac,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_t_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xae,0x7d]
 
-v_cmpx_ge_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xac,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_t_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xae,0x7d]
 
-v_cmpx_ge_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xac,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_t_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xae,0x7d]
 
-v_cmpx_ge_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xac,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_t_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xae,0x7d]
 
-v_cmpx_ge_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xac,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_t_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xae,0x7d]
 
-v_cmpx_t_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xae,0x7d]
+v_cmpx_t_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xae,0x7d]
 
-v_cmpx_t_u32 vcc, s103, v0
-// CHECK: [0x67,0x00,0xae,0x7d]
+v_cmpx_t_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xae,0x7d]
 
-v_cmpx_t_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0xae,0x7d]
+v_cmpx_t_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xae,0x7d]
 
-v_cmpx_t_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0xae,0x7d]
+v_cmpx_t_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xae,0x7d]
 
-v_cmpx_t_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xae,0x7d]
+v_cmpx_t_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xae,0x7d]
 
-v_cmpx_t_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xae,0x7d]
+v_cmpx_t_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xae,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_t_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xae,0x7d]
+v_cmpx_t_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xae,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_t_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xae,0x7d]
+v_cmpx_t_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xae,0x7d]
 
-v_cmpx_t_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xae,0x7d]
+v_cmpx_t_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xae,0x7d]
 
-v_cmpx_t_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xae,0x7d]
+v_cmpx_t_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xaf,0x7d]
 
-v_cmpx_t_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xae,0x7d]
+v_cmpx_t_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xae,0x7d]
+v_cmpx_t_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xae,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xae,0x7d]
+v_cmpx_t_u32_e64 s[102:103], 0, s2
+// CHECK: [0x66,0x00,0xae,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xae,0x7d]
+v_cmpx_t_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x68,0x00,0xae,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xae,0x7d]
+v_cmpx_t_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xae,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xae,0x7d]
+v_cmpx_t_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xae,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xae,0x7d]
+v_cmpx_t_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xae,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xae,0x7d]
+v_cmpx_t_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xae,0xd1,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xae,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_t_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xae,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cmpx_t_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xae,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_t_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xae,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cmpx_t_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xae,0x7d]
+v_cmpx_t_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xae,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cmpx_t_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xae,0x7d]
+v_cmpx_t_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xae,0xd1,0x01,0x05,0x00,0x00]
 
-v_cmpx_t_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xaf,0x7d]
+v_cmpx_t_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xae,0xd1,0xff,0x05,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xae,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, s103
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0xce,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xae,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0xd0,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[102:103], s0, s0
-// CHECK: [0x66,0x00,0xae,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0xd2,0x00,0x00]
 
-v_cmpx_t_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x68,0x00,0xae,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cmpx_t_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xae,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cmpx_t_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xae,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cmpx_t_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xae,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0xda,0x00,0x00]
 
-v_cmpx_t_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xae,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xae,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0xde,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xae,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xae,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xae,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xae,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xae,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0x00,0x01,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xae,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0x82,0x01,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xae,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xae,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0xee,0x01,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xae,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0x04,0x02,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xae,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xae,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xae,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_f_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc0,0x7d]
 
-v_cmp_f_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc0,0x7d]
+v_cmp_f_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc0,0x7d]
 
-v_cmp_f_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc0,0x7d]
+v_cmp_f_u64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xc0,0x7d]
 
-v_cmp_f_u64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xc0,0x7d]
+v_cmp_f_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xc0,0x7d]
 
-v_cmp_f_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xc0,0x7d]
+v_cmp_f_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc0,0x7d]
 
-v_cmp_f_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc0,0x7d]
+v_cmp_f_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc0,0x7d]
 
-v_cmp_f_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc0,0x7d]
+v_cmp_f_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc0,0x7d]
 
-v_cmp_f_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc0,0x7d]
+v_cmp_f_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc0,0x7d]
 
-v_cmp_f_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc0,0x7d]
+v_cmp_f_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc0,0x7d]
 
-v_cmp_f_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc0,0x7d]
+v_cmp_f_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc0,0x7d]
 
-v_cmp_f_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc0,0x7d]
+v_cmp_f_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc0,0x7d]
 
-v_cmp_f_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc0,0x7d]
+v_cmp_f_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc0,0x7d]
 
-v_cmp_f_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc0,0x7d]
+v_cmp_f_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc0,0x7d]
 
-v_cmp_f_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc0,0x7d]
+v_cmp_f_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc0,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_f_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc0,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_f_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc0,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_f_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc0,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_f_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc0,0x7d]
 
-v_cmp_f_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc0,0x7d]
+v_cmp_f_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc0,0x7d]
 
-v_cmp_f_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc0,0x7d]
+v_cmp_f_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc1,0x7d]
 
-v_cmp_f_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc1,0x7d]
+v_cmp_f_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xc0,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xc0,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xc0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_u64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xc0,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xc0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xc0,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xc0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xc0,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xc0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xc0,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xc0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xc0,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xc0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xc0,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xc0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xc0,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_f_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xc0,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_f_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xc0,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_f_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xc0,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_f_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xc0,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmp_f_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_f_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xc0,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmp_f_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd1,0xfe,0x01,0x00,0x00]
+v_cmp_f_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xc0,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmp_f_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_f_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xc0,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmp_f_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_f_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xc0,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmp_f_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_f_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xc0,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmp_f_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_f_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xc0,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmp_f_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_f_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xc0,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmp_f_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0xfc,0x03,0x00]
+v_cmp_lt_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc2,0x7d]
 
-v_cmp_lt_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc2,0x7d]
+v_cmp_lt_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc2,0x7d]
 
-v_cmp_lt_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc2,0x7d]
+v_cmp_lt_u64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xc2,0x7d]
 
-v_cmp_lt_u64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xc2,0x7d]
+v_cmp_lt_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xc2,0x7d]
 
-v_cmp_lt_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xc2,0x7d]
+v_cmp_lt_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc2,0x7d]
 
-v_cmp_lt_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc2,0x7d]
+v_cmp_lt_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc2,0x7d]
 
-v_cmp_lt_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc2,0x7d]
+v_cmp_lt_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc2,0x7d]
 
-v_cmp_lt_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc2,0x7d]
+v_cmp_lt_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc2,0x7d]
 
-v_cmp_lt_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc2,0x7d]
+v_cmp_lt_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc2,0x7d]
 
-v_cmp_lt_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc2,0x7d]
+v_cmp_lt_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc2,0x7d]
 
-v_cmp_lt_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc2,0x7d]
+v_cmp_lt_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc2,0x7d]
 
-v_cmp_lt_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc2,0x7d]
+v_cmp_lt_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc2,0x7d]
 
-v_cmp_lt_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc2,0x7d]
+v_cmp_lt_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc2,0x7d]
 
-v_cmp_lt_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc2,0x7d]
+v_cmp_lt_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc2,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_lt_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc2,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_lt_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc2,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_lt_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc2,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_lt_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc2,0x7d]
 
-v_cmp_lt_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc2,0x7d]
+v_cmp_lt_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc2,0x7d]
 
-v_cmp_lt_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc2,0x7d]
+v_cmp_lt_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc3,0x7d]
 
-v_cmp_lt_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc3,0x7d]
+v_cmp_lt_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xc2,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xc2,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xc2,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xc2,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xc2,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xc2,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xc2,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xc2,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xc2,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xc2,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xc2,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xc2,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xc2,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xc2,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xc2,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xc2,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xc2,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xc2,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xc2,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xc2,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xc2,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xc2,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xc2,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xc2,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_lt_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xc2,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xc2,0xd1,0xfe,0x01,0x00,0x00]
+v_cmp_lt_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xc2,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_lt_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xc2,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_lt_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xc2,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_lt_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xc2,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_lt_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xc2,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_lt_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xc2,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0xfc,0x03,0x00]
+v_cmp_eq_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc4,0x7d]
 
-v_cmp_eq_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc4,0x7d]
+v_cmp_eq_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc4,0x7d]
 
-v_cmp_eq_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc4,0x7d]
+v_cmp_eq_u64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xc4,0x7d]
 
-v_cmp_eq_u64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xc4,0x7d]
+v_cmp_eq_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xc4,0x7d]
 
-v_cmp_eq_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xc4,0x7d]
+v_cmp_eq_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc4,0x7d]
 
-v_cmp_eq_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc4,0x7d]
+v_cmp_eq_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc4,0x7d]
 
-v_cmp_eq_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc4,0x7d]
+v_cmp_eq_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc4,0x7d]
 
-v_cmp_eq_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc4,0x7d]
+v_cmp_eq_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc4,0x7d]
 
-v_cmp_eq_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc4,0x7d]
+v_cmp_eq_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc4,0x7d]
 
-v_cmp_eq_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc4,0x7d]
+v_cmp_eq_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc4,0x7d]
 
-v_cmp_eq_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc4,0x7d]
+v_cmp_eq_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc4,0x7d]
 
-v_cmp_eq_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc4,0x7d]
+v_cmp_eq_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc4,0x7d]
 
-v_cmp_eq_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc4,0x7d]
+v_cmp_eq_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc4,0x7d]
 
-v_cmp_eq_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc4,0x7d]
+v_cmp_eq_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc4,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_eq_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc4,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_eq_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc4,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_eq_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc4,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_eq_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc4,0x7d]
 
-v_cmp_eq_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc4,0x7d]
+v_cmp_eq_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc4,0x7d]
 
-v_cmp_eq_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc4,0x7d]
+v_cmp_eq_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc5,0x7d]
 
-v_cmp_eq_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc5,0x7d]
+v_cmp_eq_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xc4,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xc4,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xc4,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xc4,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xc4,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xc4,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xc4,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xc4,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xc4,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xc4,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xc4,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xc4,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xc4,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xc4,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xc4,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xc4,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xc4,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xc4,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xc4,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xc4,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_eq_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xc4,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd1,0xfe,0x01,0x00,0x00]
+v_cmp_eq_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xc4,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_eq_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xc4,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_eq_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xc4,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_eq_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xc4,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_eq_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xc4,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_eq_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xc4,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0xfc,0x03,0x00]
+v_cmp_le_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc6,0x7d]
 
-v_cmp_le_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc6,0x7d]
+v_cmp_le_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc6,0x7d]
 
-v_cmp_le_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc6,0x7d]
+v_cmp_le_u64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xc6,0x7d]
 
-v_cmp_le_u64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xc6,0x7d]
+v_cmp_le_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xc6,0x7d]
 
-v_cmp_le_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xc6,0x7d]
+v_cmp_le_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc6,0x7d]
 
-v_cmp_le_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc6,0x7d]
+v_cmp_le_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc6,0x7d]
 
-v_cmp_le_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc6,0x7d]
+v_cmp_le_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc6,0x7d]
 
-v_cmp_le_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc6,0x7d]
+v_cmp_le_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc6,0x7d]
 
-v_cmp_le_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc6,0x7d]
+v_cmp_le_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc6,0x7d]
 
-v_cmp_le_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc6,0x7d]
+v_cmp_le_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc6,0x7d]
 
-v_cmp_le_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc6,0x7d]
+v_cmp_le_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc6,0x7d]
 
-v_cmp_le_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc6,0x7d]
+v_cmp_le_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc6,0x7d]
 
-v_cmp_le_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc6,0x7d]
+v_cmp_le_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc6,0x7d]
 
-v_cmp_le_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc6,0x7d]
+v_cmp_le_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc6,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_le_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc6,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_le_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc6,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_le_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc6,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_le_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc6,0x7d]
 
-v_cmp_le_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc6,0x7d]
+v_cmp_le_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc6,0x7d]
 
-v_cmp_le_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc6,0x7d]
+v_cmp_le_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc7,0x7d]
 
-v_cmp_le_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc7,0x7d]
+v_cmp_le_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xc6,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xc6,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xc6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xc6,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xc6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xc6,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xc6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xc6,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xc6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xc6,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xc6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xc6,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xc6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xc6,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xc6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xc6,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xc6,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_le_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xc6,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xc6,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_le_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xc6,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xc6,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_le_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xc6,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xc6,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_le_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xc6,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmp_le_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_le_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xc6,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmp_le_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xc6,0xd1,0xfe,0x01,0x00,0x00]
+v_cmp_le_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xc6,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmp_le_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_le_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xc6,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmp_le_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_le_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xc6,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmp_le_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_le_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xc6,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmp_le_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_le_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xc6,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmp_le_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_le_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xc6,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmp_le_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0xfc,0x03,0x00]
+v_cmp_gt_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc8,0x7d]
 
-v_cmp_gt_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc8,0x7d]
+v_cmp_gt_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc8,0x7d]
 
-v_cmp_gt_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc8,0x7d]
+v_cmp_gt_u64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xc8,0x7d]
 
-v_cmp_gt_u64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xc8,0x7d]
+v_cmp_gt_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xc8,0x7d]
 
-v_cmp_gt_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xc8,0x7d]
+v_cmp_gt_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc8,0x7d]
 
-v_cmp_gt_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc8,0x7d]
+v_cmp_gt_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc8,0x7d]
 
-v_cmp_gt_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc8,0x7d]
+v_cmp_gt_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc8,0x7d]
 
-v_cmp_gt_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc8,0x7d]
+v_cmp_gt_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc8,0x7d]
 
-v_cmp_gt_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc8,0x7d]
+v_cmp_gt_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc8,0x7d]
 
-v_cmp_gt_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc8,0x7d]
+v_cmp_gt_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc8,0x7d]
 
-v_cmp_gt_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc8,0x7d]
+v_cmp_gt_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc8,0x7d]
 
-v_cmp_gt_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc8,0x7d]
+v_cmp_gt_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc8,0x7d]
 
-v_cmp_gt_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc8,0x7d]
+v_cmp_gt_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc8,0x7d]
 
-v_cmp_gt_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc8,0x7d]
+v_cmp_gt_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc8,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_gt_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc8,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_gt_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc8,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_gt_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc8,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_gt_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc8,0x7d]
 
-v_cmp_gt_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc8,0x7d]
+v_cmp_gt_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc8,0x7d]
 
-v_cmp_gt_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc8,0x7d]
+v_cmp_gt_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc9,0x7d]
 
-v_cmp_gt_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc9,0x7d]
+v_cmp_gt_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xc8,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xc8,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xc8,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xc8,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xc8,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xc8,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xc8,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xc8,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xc8,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xc8,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xc8,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xc8,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xc8,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xc8,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xc8,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xc8,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xc8,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xc8,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xc8,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xc8,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_gt_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xc8,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd1,0xfe,0x01,0x00,0x00]
+v_cmp_gt_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xc8,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_gt_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xc8,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_gt_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xc8,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_gt_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xc8,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_gt_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xc8,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_gt_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xc8,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0xfc,0x03,0x00]
+v_cmp_ne_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xca,0x7d]
 
-v_cmp_ne_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xca,0x7d]
+v_cmp_ne_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xca,0x7d]
 
-v_cmp_ne_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xca,0x7d]
+v_cmp_ne_u64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xca,0x7d]
 
-v_cmp_ne_u64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xca,0x7d]
+v_cmp_ne_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xca,0x7d]
 
-v_cmp_ne_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xca,0x7d]
+v_cmp_ne_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xca,0x7d]
 
-v_cmp_ne_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xca,0x7d]
+v_cmp_ne_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xca,0x7d]
 
-v_cmp_ne_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xca,0x7d]
+v_cmp_ne_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xca,0x7d]
 
-v_cmp_ne_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xca,0x7d]
+v_cmp_ne_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xca,0x7d]
 
-v_cmp_ne_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xca,0x7d]
+v_cmp_ne_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xca,0x7d]
 
-v_cmp_ne_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xca,0x7d]
+v_cmp_ne_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xca,0x7d]
 
-v_cmp_ne_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xca,0x7d]
+v_cmp_ne_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xca,0x7d]
 
-v_cmp_ne_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xca,0x7d]
+v_cmp_ne_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xca,0x7d]
 
-v_cmp_ne_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xca,0x7d]
+v_cmp_ne_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xca,0x7d]
 
-v_cmp_ne_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xca,0x7d]
+v_cmp_ne_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xca,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_ne_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xca,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_ne_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xca,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_ne_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xca,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_ne_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xca,0x7d]
 
-v_cmp_ne_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xca,0x7d]
+v_cmp_ne_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xca,0x7d]
 
-v_cmp_ne_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xca,0x7d]
+v_cmp_ne_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xcb,0x7d]
 
-v_cmp_ne_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xcb,0x7d]
+v_cmp_ne_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xca,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xca,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xca,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xca,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xca,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xca,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xca,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xca,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xca,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xca,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xca,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xca,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xca,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xca,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xca,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xca,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xca,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xca,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xca,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xca,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_ne_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xca,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xca,0xd1,0xfe,0x01,0x00,0x00]
+v_cmp_ne_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xca,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_ne_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xca,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_ne_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xca,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_ne_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xca,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_ne_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xca,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_ne_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xca,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0xfc,0x03,0x00]
+v_cmp_ge_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xcc,0x7d]
 
-v_cmp_ge_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xcc,0x7d]
+v_cmp_ge_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xcc,0x7d]
 
-v_cmp_ge_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xcc,0x7d]
+v_cmp_ge_u64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xcc,0x7d]
 
-v_cmp_ge_u64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xcc,0x7d]
+v_cmp_ge_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xcc,0x7d]
 
-v_cmp_ge_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xcc,0x7d]
+v_cmp_ge_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xcc,0x7d]
 
-v_cmp_ge_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xcc,0x7d]
+v_cmp_ge_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xcc,0x7d]
 
-v_cmp_ge_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xcc,0x7d]
+v_cmp_ge_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xcc,0x7d]
 
-v_cmp_ge_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xcc,0x7d]
+v_cmp_ge_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xcc,0x7d]
 
-v_cmp_ge_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xcc,0x7d]
+v_cmp_ge_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xcc,0x7d]
 
-v_cmp_ge_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xcc,0x7d]
+v_cmp_ge_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xcc,0x7d]
 
-v_cmp_ge_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xcc,0x7d]
+v_cmp_ge_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xcc,0x7d]
 
-v_cmp_ge_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xcc,0x7d]
+v_cmp_ge_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xcc,0x7d]
 
-v_cmp_ge_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xcc,0x7d]
+v_cmp_ge_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xcc,0x7d]
 
-v_cmp_ge_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xcc,0x7d]
+v_cmp_ge_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xcc,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_ge_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xcc,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_ge_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xcc,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_ge_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xcc,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_ge_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xcc,0x7d]
 
-v_cmp_ge_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xcc,0x7d]
+v_cmp_ge_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xcc,0x7d]
 
-v_cmp_ge_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xcc,0x7d]
+v_cmp_ge_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xcd,0x7d]
 
-v_cmp_ge_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xcd,0x7d]
+v_cmp_ge_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xcc,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xcc,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xcc,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xcc,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xcc,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xcc,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xcc,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xcc,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xcc,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xcc,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xcc,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xcc,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xcc,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xcc,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xcc,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xcc,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xcc,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xcc,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xcc,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xcc,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_ge_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xcc,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0xfe,0x01,0x00,0x00]
+v_cmp_ge_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xcc,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_ge_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xcc,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_ge_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xcc,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_ge_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xcc,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_ge_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xcc,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_ge_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xcc,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0xfc,0x03,0x00]
+v_cmp_t_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xce,0x7d]
 
-v_cmp_t_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xce,0x7d]
+v_cmp_t_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xce,0x7d]
 
-v_cmp_t_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xce,0x7d]
+v_cmp_t_u64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xce,0x7d]
 
-v_cmp_t_u64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xce,0x7d]
+v_cmp_t_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xce,0x7d]
 
-v_cmp_t_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xce,0x7d]
+v_cmp_t_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xce,0x7d]
 
-v_cmp_t_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xce,0x7d]
+v_cmp_t_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xce,0x7d]
 
-v_cmp_t_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xce,0x7d]
+v_cmp_t_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xce,0x7d]
 
-v_cmp_t_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xce,0x7d]
+v_cmp_t_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xce,0x7d]
 
-v_cmp_t_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xce,0x7d]
+v_cmp_t_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xce,0x7d]
 
-v_cmp_t_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xce,0x7d]
+v_cmp_t_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xce,0x7d]
 
-v_cmp_t_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xce,0x7d]
+v_cmp_t_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xce,0x7d]
 
-v_cmp_t_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xce,0x7d]
+v_cmp_t_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xce,0x7d]
 
-v_cmp_t_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xce,0x7d]
+v_cmp_t_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xce,0x7d]
 
-v_cmp_t_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xce,0x7d]
+v_cmp_t_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xce,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_t_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xce,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_t_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xce,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_t_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xce,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_t_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xce,0x7d]
 
-v_cmp_t_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xce,0x7d]
+v_cmp_t_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xce,0x7d]
 
-v_cmp_t_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xce,0x7d]
+v_cmp_t_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xcf,0x7d]
 
-v_cmp_t_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xcf,0x7d]
+v_cmp_t_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xce,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xce,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xce,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xce,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xce,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xce,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xce,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xce,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xce,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xce,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xce,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xce,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xce,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xce,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xce,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_t_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xce,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_t_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xce,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_t_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xce,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_t_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xce,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_t_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xce,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmp_t_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_t_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xce,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmp_t_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xce,0xd1,0xfe,0x01,0x00,0x00]
+v_cmp_t_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xce,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmp_t_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_t_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xce,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmp_t_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_t_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xce,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmp_t_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_t_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xce,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmp_t_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_t_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xce,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmp_t_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_t_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xce,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmp_t_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0xfc,0x03,0x00]
+v_cmpx_f_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe0,0x7d]
 
-v_cmpx_f_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe0,0x7d]
+v_cmpx_f_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe0,0x7d]
 
-v_cmpx_f_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe0,0x7d]
+v_cmpx_f_u64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xe0,0x7d]
 
-v_cmpx_f_u64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xe0,0x7d]
+v_cmpx_f_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xe0,0x7d]
 
-v_cmpx_f_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xe0,0x7d]
+v_cmpx_f_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe0,0x7d]
 
-v_cmpx_f_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe0,0x7d]
+v_cmpx_f_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe0,0x7d]
 
-v_cmpx_f_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe0,0x7d]
+v_cmpx_f_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe0,0x7d]
 
-v_cmpx_f_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe0,0x7d]
+v_cmpx_f_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe0,0x7d]
 
-v_cmpx_f_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe0,0x7d]
+v_cmpx_f_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe0,0x7d]
 
-v_cmpx_f_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe0,0x7d]
+v_cmpx_f_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe0,0x7d]
 
-v_cmpx_f_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe0,0x7d]
+v_cmpx_f_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe0,0x7d]
 
-v_cmpx_f_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe0,0x7d]
+v_cmpx_f_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe0,0x7d]
 
-v_cmpx_f_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe0,0x7d]
+v_cmpx_f_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe0,0x7d]
 
-v_cmpx_f_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe0,0x7d]
+v_cmpx_f_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe0,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_f_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe0,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_f_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe0,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_f_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe0,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_f_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe0,0x7d]
 
-v_cmpx_f_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe0,0x7d]
+v_cmpx_f_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe0,0x7d]
 
-v_cmpx_f_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe0,0x7d]
+v_cmpx_f_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe1,0x7d]
 
-v_cmpx_f_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe1,0x7d]
+v_cmpx_f_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe0,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe0,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xe0,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xe0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe0,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe0,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe0,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe0,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_f_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_f_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe0,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe0,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_f_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xe0,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xe0,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_f_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe0,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe0,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_f_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xe0,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xe0,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_f_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe0,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_f_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe0,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe0,0xd1,0x00,0xfc,0x03,0x00]
+v_cmpx_lt_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe2,0x7d]
 
-v_cmpx_lt_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe2,0x7d]
+v_cmpx_lt_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe2,0x7d]
 
-v_cmpx_lt_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe2,0x7d]
+v_cmpx_lt_u64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xe2,0x7d]
 
-v_cmpx_lt_u64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xe2,0x7d]
+v_cmpx_lt_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xe2,0x7d]
 
-v_cmpx_lt_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xe2,0x7d]
+v_cmpx_lt_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe2,0x7d]
 
-v_cmpx_lt_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe2,0x7d]
+v_cmpx_lt_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe2,0x7d]
 
-v_cmpx_lt_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe2,0x7d]
+v_cmpx_lt_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe2,0x7d]
 
-v_cmpx_lt_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe2,0x7d]
+v_cmpx_lt_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe2,0x7d]
 
-v_cmpx_lt_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe2,0x7d]
+v_cmpx_lt_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe2,0x7d]
 
-v_cmpx_lt_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe2,0x7d]
+v_cmpx_lt_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe2,0x7d]
 
-v_cmpx_lt_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe2,0x7d]
+v_cmpx_lt_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe2,0x7d]
 
-v_cmpx_lt_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe2,0x7d]
+v_cmpx_lt_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe2,0x7d]
 
-v_cmpx_lt_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe2,0x7d]
+v_cmpx_lt_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe2,0x7d]
 
-v_cmpx_lt_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe2,0x7d]
+v_cmpx_lt_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe2,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_lt_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe2,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_lt_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe2,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_lt_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe2,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_lt_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe2,0x7d]
 
-v_cmpx_lt_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe2,0x7d]
+v_cmpx_lt_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe2,0x7d]
 
-v_cmpx_lt_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe2,0x7d]
+v_cmpx_lt_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe3,0x7d]
 
-v_cmpx_lt_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe3,0x7d]
+v_cmpx_lt_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe2,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe2,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xe2,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xe2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe2,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe2,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe2,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe2,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_lt_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_lt_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe2,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_lt_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xe2,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_lt_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe2,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_lt_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xe2,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_lt_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe2,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_lt_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe2,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0xfc,0x03,0x00]
+v_cmpx_eq_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe4,0x7d]
 
-v_cmpx_eq_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe4,0x7d]
+v_cmpx_eq_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe4,0x7d]
 
-v_cmpx_eq_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe4,0x7d]
+v_cmpx_eq_u64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xe4,0x7d]
 
-v_cmpx_eq_u64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xe4,0x7d]
+v_cmpx_eq_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xe4,0x7d]
 
-v_cmpx_eq_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xe4,0x7d]
+v_cmpx_eq_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe4,0x7d]
 
-v_cmpx_eq_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe4,0x7d]
+v_cmpx_eq_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe4,0x7d]
 
-v_cmpx_eq_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe4,0x7d]
+v_cmpx_eq_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe4,0x7d]
 
-v_cmpx_eq_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe4,0x7d]
+v_cmpx_eq_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe4,0x7d]
 
-v_cmpx_eq_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe4,0x7d]
+v_cmpx_eq_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe4,0x7d]
 
-v_cmpx_eq_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe4,0x7d]
+v_cmpx_eq_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe4,0x7d]
 
-v_cmpx_eq_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe4,0x7d]
+v_cmpx_eq_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe4,0x7d]
 
-v_cmpx_eq_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe4,0x7d]
+v_cmpx_eq_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe4,0x7d]
 
-v_cmpx_eq_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe4,0x7d]
+v_cmpx_eq_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe4,0x7d]
 
-v_cmpx_eq_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe4,0x7d]
+v_cmpx_eq_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe4,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_eq_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe4,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_eq_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe4,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_eq_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe4,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_eq_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe4,0x7d]
 
-v_cmpx_eq_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe4,0x7d]
+v_cmpx_eq_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe4,0x7d]
 
-v_cmpx_eq_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe4,0x7d]
+v_cmpx_eq_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe5,0x7d]
 
-v_cmpx_eq_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe5,0x7d]
+v_cmpx_eq_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe4,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe4,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe4,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe4,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xe4,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xe4,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe4,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe4,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe4,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe4,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe4,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe4,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe4,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe4,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_eq_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_eq_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe4,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_eq_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xe4,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_eq_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe4,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_eq_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xe4,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_eq_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe4,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_eq_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe4,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0xfc,0x03,0x00]
+v_cmpx_le_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe6,0x7d]
 
-v_cmpx_le_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe6,0x7d]
+v_cmpx_le_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe6,0x7d]
 
-v_cmpx_le_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe6,0x7d]
+v_cmpx_le_u64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xe6,0x7d]
 
-v_cmpx_le_u64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xe6,0x7d]
+v_cmpx_le_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xe6,0x7d]
 
-v_cmpx_le_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xe6,0x7d]
+v_cmpx_le_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe6,0x7d]
 
-v_cmpx_le_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe6,0x7d]
+v_cmpx_le_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe6,0x7d]
 
-v_cmpx_le_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe6,0x7d]
+v_cmpx_le_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe6,0x7d]
 
-v_cmpx_le_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe6,0x7d]
+v_cmpx_le_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe6,0x7d]
 
-v_cmpx_le_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe6,0x7d]
+v_cmpx_le_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe6,0x7d]
 
-v_cmpx_le_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe6,0x7d]
+v_cmpx_le_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe6,0x7d]
 
-v_cmpx_le_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe6,0x7d]
+v_cmpx_le_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe6,0x7d]
 
-v_cmpx_le_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe6,0x7d]
+v_cmpx_le_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe6,0x7d]
 
-v_cmpx_le_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe6,0x7d]
+v_cmpx_le_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe6,0x7d]
 
-v_cmpx_le_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe6,0x7d]
+v_cmpx_le_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe6,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_le_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe6,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_le_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe6,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_le_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe6,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_le_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe6,0x7d]
 
-v_cmpx_le_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe6,0x7d]
+v_cmpx_le_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe6,0x7d]
 
-v_cmpx_le_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe6,0x7d]
+v_cmpx_le_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe7,0x7d]
 
-v_cmpx_le_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe7,0x7d]
+v_cmpx_le_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe6,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe6,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe6,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_u64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe6,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xe6,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xe6,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe6,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe6,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe6,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe6,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe6,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe6,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe6,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe6,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_le_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_le_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe6,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_le_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xe6,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_le_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe6,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_le_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xe6,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_le_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe6,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_le_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe6,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0xfc,0x03,0x00]
+v_cmpx_gt_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe8,0x7d]
 
-v_cmpx_gt_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe8,0x7d]
+v_cmpx_gt_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe8,0x7d]
 
-v_cmpx_gt_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe8,0x7d]
+v_cmpx_gt_u64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xe8,0x7d]
 
-v_cmpx_gt_u64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xe8,0x7d]
+v_cmpx_gt_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xe8,0x7d]
 
-v_cmpx_gt_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xe8,0x7d]
+v_cmpx_gt_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe8,0x7d]
 
-v_cmpx_gt_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe8,0x7d]
+v_cmpx_gt_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe8,0x7d]
 
-v_cmpx_gt_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe8,0x7d]
+v_cmpx_gt_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe8,0x7d]
 
-v_cmpx_gt_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe8,0x7d]
+v_cmpx_gt_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe8,0x7d]
 
-v_cmpx_gt_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe8,0x7d]
+v_cmpx_gt_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe8,0x7d]
 
-v_cmpx_gt_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe8,0x7d]
+v_cmpx_gt_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe8,0x7d]
 
-v_cmpx_gt_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe8,0x7d]
+v_cmpx_gt_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe8,0x7d]
 
-v_cmpx_gt_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe8,0x7d]
+v_cmpx_gt_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe8,0x7d]
 
-v_cmpx_gt_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe8,0x7d]
+v_cmpx_gt_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe8,0x7d]
 
-v_cmpx_gt_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe8,0x7d]
+v_cmpx_gt_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe8,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_gt_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe8,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_gt_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe8,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_gt_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe8,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_gt_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe8,0x7d]
 
-v_cmpx_gt_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe8,0x7d]
+v_cmpx_gt_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe8,0x7d]
 
-v_cmpx_gt_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe8,0x7d]
+v_cmpx_gt_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe9,0x7d]
 
-v_cmpx_gt_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe9,0x7d]
+v_cmpx_gt_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe8,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe8,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe8,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_u64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe8,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xe8,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xe8,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe8,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe8,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe8,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe8,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe8,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe8,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe8,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe8,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmpx_gt_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmpx_gt_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmpx_gt_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmpx_gt_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmpx_gt_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_gt_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmpx_gt_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_gt_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe8,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmpx_gt_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe8,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_gt_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xe8,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmpx_gt_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xe8,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_gt_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe8,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmpx_gt_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe8,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_gt_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xe8,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmpx_gt_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xe8,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_gt_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe8,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmpx_gt_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_gt_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe8,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmpx_gt_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe8,0xd1,0x00,0xfc,0x03,0x00]
+v_cmpx_ne_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xea,0x7d]
 
-v_cmpx_ne_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xea,0x7d]
+v_cmpx_ne_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xea,0x7d]
 
-v_cmpx_ne_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xea,0x7d]
+v_cmpx_ne_u64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xea,0x7d]
 
-v_cmpx_ne_u64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xea,0x7d]
+v_cmpx_ne_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xea,0x7d]
 
-v_cmpx_ne_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xea,0x7d]
+v_cmpx_ne_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xea,0x7d]
 
-v_cmpx_ne_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xea,0x7d]
+v_cmpx_ne_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xea,0x7d]
 
-v_cmpx_ne_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xea,0x7d]
+v_cmpx_ne_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xea,0x7d]
 
-v_cmpx_ne_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xea,0x7d]
+v_cmpx_ne_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xea,0x7d]
 
-v_cmpx_ne_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xea,0x7d]
+v_cmpx_ne_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xea,0x7d]
 
-v_cmpx_ne_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xea,0x7d]
+v_cmpx_ne_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xea,0x7d]
 
-v_cmpx_ne_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xea,0x7d]
+v_cmpx_ne_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xea,0x7d]
 
-v_cmpx_ne_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xea,0x7d]
+v_cmpx_ne_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xea,0x7d]
 
-v_cmpx_ne_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xea,0x7d]
+v_cmpx_ne_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xea,0x7d]
 
-v_cmpx_ne_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xea,0x7d]
+v_cmpx_ne_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xea,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ne_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xea,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_ne_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xea,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ne_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xea,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_ne_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xea,0x7d]
 
-v_cmpx_ne_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xea,0x7d]
+v_cmpx_ne_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xea,0x7d]
 
-v_cmpx_ne_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xea,0x7d]
+v_cmpx_ne_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xeb,0x7d]
 
-v_cmpx_ne_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xeb,0x7d]
+v_cmpx_ne_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xea,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xea,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xea,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xea,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xea,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xea,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xea,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xea,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xea,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xea,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xea,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xea,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xea,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xea,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmpx_ne_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmpx_ne_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmpx_ne_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmpx_ne_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmpx_ne_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_ne_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmpx_ne_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_ne_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xea,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmpx_ne_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_ne_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xea,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmpx_ne_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_ne_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xea,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmpx_ne_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_ne_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xea,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmpx_ne_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_ne_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xea,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmpx_ne_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_ne_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xea,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmpx_ne_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0xfc,0x03,0x00]
+v_cmpx_ge_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xec,0x7d]
 
-v_cmpx_ge_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xec,0x7d]
+v_cmpx_ge_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xec,0x7d]
 
-v_cmpx_ge_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xec,0x7d]
+v_cmpx_ge_u64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xec,0x7d]
 
-v_cmpx_ge_u64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xec,0x7d]
+v_cmpx_ge_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xec,0x7d]
 
-v_cmpx_ge_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xec,0x7d]
+v_cmpx_ge_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xec,0x7d]
 
-v_cmpx_ge_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xec,0x7d]
+v_cmpx_ge_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xec,0x7d]
 
-v_cmpx_ge_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xec,0x7d]
+v_cmpx_ge_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xec,0x7d]
 
-v_cmpx_ge_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xec,0x7d]
+v_cmpx_ge_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xec,0x7d]
 
-v_cmpx_ge_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xec,0x7d]
+v_cmpx_ge_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xec,0x7d]
 
-v_cmpx_ge_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xec,0x7d]
+v_cmpx_ge_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xec,0x7d]
 
-v_cmpx_ge_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xec,0x7d]
+v_cmpx_ge_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xec,0x7d]
 
-v_cmpx_ge_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xec,0x7d]
+v_cmpx_ge_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xec,0x7d]
 
-v_cmpx_ge_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xec,0x7d]
+v_cmpx_ge_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xec,0x7d]
 
-v_cmpx_ge_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xec,0x7d]
+v_cmpx_ge_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xec,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ge_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xec,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_ge_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xec,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ge_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xec,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_ge_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xec,0x7d]
 
-v_cmpx_ge_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xec,0x7d]
+v_cmpx_ge_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xec,0x7d]
 
-v_cmpx_ge_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xec,0x7d]
+v_cmpx_ge_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xed,0x7d]
 
-v_cmpx_ge_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xed,0x7d]
+v_cmpx_ge_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xec,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xec,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xec,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_u64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xec,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xec,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xec,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xec,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xec,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xec,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xec,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xec,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xec,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xec,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xec,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmpx_ge_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmpx_ge_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmpx_ge_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmpx_ge_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmpx_ge_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_ge_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmpx_ge_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_ge_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xec,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmpx_ge_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_ge_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xec,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmpx_ge_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_ge_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xec,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmpx_ge_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_ge_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xec,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmpx_ge_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_ge_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xec,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmpx_ge_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_ge_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xec,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmpx_ge_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0xfc,0x03,0x00]
+v_cmpx_t_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xee,0x7d]
 
-v_cmpx_t_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xee,0x7d]
+v_cmpx_t_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xee,0x7d]
 
-v_cmpx_t_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xee,0x7d]
+v_cmpx_t_u64 vcc, s[102:103], v[2:3]
+// CHECK: [0x66,0x04,0xee,0x7d]
 
-v_cmpx_t_u64 vcc, s[102:103], v[0:1]
-// CHECK: [0x66,0x00,0xee,0x7d]
+v_cmpx_t_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x68,0x04,0xee,0x7d]
 
-v_cmpx_t_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x68,0x00,0xee,0x7d]
+v_cmpx_t_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xee,0x7d]
 
-v_cmpx_t_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xee,0x7d]
+v_cmpx_t_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xee,0x7d]
 
-v_cmpx_t_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xee,0x7d]
+v_cmpx_t_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xee,0x7d]
 
-v_cmpx_t_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xee,0x7d]
+v_cmpx_t_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xee,0x7d]
 
-v_cmpx_t_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xee,0x7d]
+v_cmpx_t_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xee,0x7d]
 
-v_cmpx_t_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xee,0x7d]
+v_cmpx_t_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xee,0x7d]
 
-v_cmpx_t_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xee,0x7d]
+v_cmpx_t_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xee,0x7d]
 
-v_cmpx_t_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xee,0x7d]
+v_cmpx_t_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xee,0x7d]
 
-v_cmpx_t_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xee,0x7d]
+v_cmpx_t_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xee,0x7d]
 
-v_cmpx_t_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xee,0x7d]
+v_cmpx_t_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xee,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_t_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xee,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_t_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xee,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_t_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xee,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_t_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xee,0x7d]
 
-v_cmpx_t_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xee,0x7d]
+v_cmpx_t_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xee,0x7d]
 
-v_cmpx_t_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xee,0x7d]
+v_cmpx_t_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xef,0x7d]
 
-v_cmpx_t_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xef,0x7d]
+v_cmpx_t_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xee,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xee,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 s[102:103], s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xee,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_u64_e64 s[102:103], s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xee,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x68,0x00,0xee,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x68,0x00,0xee,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xee,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xee,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xee,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xee,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xee,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xee,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xee,0xd1,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xee,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd1,0x80,0x08,0x00,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd1,0xc1,0x08,0x00,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd1,0xf0,0x08,0x00,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd1,0xf7,0x08,0x00,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd1,0x01,0x09,0x00,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_t_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd1,0xfe,0x09,0x00,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_t_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xee,0xd1,0x04,0x00,0x01,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_t_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xee,0xd1,0x04,0x82,0x01,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_t_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xee,0xd1,0x04,0xe0,0x01,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_t_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xee,0xd1,0x04,0xee,0x01,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_t_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xee,0xd1,0x04,0x04,0x02,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_t_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xee,0xd1,0x04,0xfc,0x03,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0xfc,0x03,0x00]
+v_cmp_class_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x10,0x7d]
 
-v_cmp_class_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x10,0x7d]
+v_cmp_class_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x10,0x7d]
 
-v_cmp_class_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x10,0x7d]
+v_cmp_class_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x10,0x7d]
 
-v_cmp_class_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x10,0x7d]
+v_cmp_class_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x10,0x7d]
 
-v_cmp_class_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x10,0x7d]
+v_cmp_class_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x10,0x7d]
 
-v_cmp_class_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x10,0x7d]
+v_cmp_class_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x10,0x7d]
 
-v_cmp_class_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x10,0x7d]
+v_cmp_class_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x10,0x7d]
 
-v_cmp_class_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x10,0x7d]
+v_cmp_class_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x10,0x7d]
 
-v_cmp_class_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x10,0x7d]
+v_cmp_class_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x10,0x7d]
 
-v_cmp_class_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x10,0x7d]
+v_cmp_class_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x10,0x7d]
 
-v_cmp_class_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x10,0x7d]
+v_cmp_class_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x10,0x7d]
 
-v_cmp_class_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x10,0x7d]
+v_cmp_class_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x10,0x7d]
 
-v_cmp_class_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x10,0x7d]
+v_cmp_class_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x10,0x7d]
 
-v_cmp_class_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x10,0x7d]
+v_cmp_class_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x10,0x7d]
 
-v_cmp_class_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x10,0x7d]
+v_cmp_class_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x10,0x7d]
 
-v_cmp_class_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x10,0x7d]
+v_cmp_class_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x10,0x7d]
 
-v_cmp_class_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x10,0x7d]
+v_cmp_class_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x10,0x7d]
 
-v_cmp_class_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x10,0x7d]
+v_cmp_class_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x10,0x7d]
 
-v_cmp_class_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x10,0x7d]
+v_cmp_class_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x10,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_class_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x10,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_class_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x10,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_class_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x10,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_class_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x10,0x7d]
 
-v_cmp_class_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x10,0x7d]
+v_cmp_class_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x10,0x7d]
 
-v_cmp_class_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x10,0x7d]
+v_cmp_class_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x11,0x7d]
 
-v_cmp_class_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x11,0x7d]
+v_cmpx_class_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x30,0x7d]
+v_cmpx_class_f32 vcc, s103, v2
+// CHECK: [0x67,0x04,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, s103, v0
-// CHECK: [0x67,0x00,0x30,0x7d]
+v_cmpx_class_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x68,0x04,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x68,0x00,0x30,0x7d]
+v_cmpx_class_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x69,0x04,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x69,0x00,0x30,0x7d]
+v_cmpx_class_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x30,0x7d]
+v_cmpx_class_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x30,0x7d]
+v_cmpx_class_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x30,0x7d]
+v_cmpx_class_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x30,0x7d]
+v_cmpx_class_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x30,0x7d]
+v_cmpx_class_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x30,0x7d]
+v_cmpx_class_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x30,0x7d]
+v_cmpx_class_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x30,0x7d]
+v_cmpx_class_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x30,0x7d]
+v_cmpx_class_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x30,0x7d]
+v_cmpx_class_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x30,0x7d]
+v_cmpx_class_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x30,0x7d]
+v_cmpx_class_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x30,0x7d]
+v_cmpx_class_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x30,0x7d]
+v_cmpx_class_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x30,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_class_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x30,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_class_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x30,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_class_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x30,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_class_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x30,0x7d]
+v_cmpx_class_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x30,0x7d]
 
-v_cmpx_class_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x30,0x7d]
+v_cmpx_class_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x31,0x7d]
 
-v_cmpx_class_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x31,0x7d]
+v_cmp_class_f64 vcc, s[2:3], v2
+// CHECK: [0x02,0x04,0x50,0x7d]
 
-v_cmp_class_f64 vcc, s[0:1], v0
-// CHECK: [0x00,0x00,0x50,0x7d]
+v_cmp_class_f64 vcc, s[4:5], v2
+// CHECK: [0x04,0x04,0x50,0x7d]
 
-v_cmp_class_f64 vcc, s[2:3], v0
-// CHECK: [0x02,0x00,0x50,0x7d]
+v_cmp_class_f64 vcc, s[102:103], v2
+// CHECK: [0x66,0x04,0x50,0x7d]
 
-v_cmp_class_f64 vcc, s[102:103], v0
-// CHECK: [0x66,0x00,0x50,0x7d]
+v_cmp_class_f64 vcc, flat_scratch, v2
+// CHECK: [0x68,0x04,0x50,0x7d]
 
-v_cmp_class_f64 vcc, flat_scratch, v0
-// CHECK: [0x68,0x00,0x50,0x7d]
+v_cmp_class_f64 vcc, vcc, v2
+// CHECK: [0x6a,0x04,0x50,0x7d]
 
-v_cmp_class_f64 vcc, vcc, v0
-// CHECK: [0x6a,0x00,0x50,0x7d]
+v_cmp_class_f64 vcc, tba, v2
+// CHECK: [0x6c,0x04,0x50,0x7d]
 
-v_cmp_class_f64 vcc, tba, v0
-// CHECK: [0x6c,0x00,0x50,0x7d]
+v_cmp_class_f64 vcc, tma, v2
+// CHECK: [0x6e,0x04,0x50,0x7d]
 
-v_cmp_class_f64 vcc, tma, v0
-// CHECK: [0x6e,0x00,0x50,0x7d]
+v_cmp_class_f64 vcc, ttmp[10:11], v2
+// CHECK: [0x7a,0x04,0x50,0x7d]
 
-v_cmp_class_f64 vcc, ttmp[10:11], v0
-// CHECK: [0x7a,0x00,0x50,0x7d]
+v_cmp_class_f64 vcc, exec, v2
+// CHECK: [0x7e,0x04,0x50,0x7d]
 
-v_cmp_class_f64 vcc, exec, v0
-// CHECK: [0x7e,0x00,0x50,0x7d]
+v_cmp_class_f64 vcc, 0, v2
+// CHECK: [0x80,0x04,0x50,0x7d]
 
-v_cmp_class_f64 vcc, 0, v0
-// CHECK: [0x80,0x00,0x50,0x7d]
+v_cmp_class_f64 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x50,0x7d]
 
-v_cmp_class_f64 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x50,0x7d]
+v_cmp_class_f64 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x50,0x7d]
 
-v_cmp_class_f64 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x50,0x7d]
+v_cmp_class_f64 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x50,0x7d]
 
-v_cmp_class_f64 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x50,0x7d]
+v_cmp_class_f64 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x50,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_class_f64 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x50,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_class_f64 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x50,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_class_f64 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x50,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_class_f64 vcc, v[1:2], v2
+// CHECK: [0x01,0x05,0x50,0x7d]
 
-v_cmp_class_f64 vcc, v[0:1], v0
-// CHECK: [0x00,0x01,0x50,0x7d]
+v_cmp_class_f64 vcc, v[254:255], v2
+// CHECK: [0xfe,0x05,0x50,0x7d]
 
-v_cmp_class_f64 vcc, v[254:255], v0
-// CHECK: [0xfe,0x01,0x50,0x7d]
+v_cmp_class_f64 vcc, s[2:3], v255
+// CHECK: [0x02,0xfe,0x51,0x7d]
 
-v_cmp_class_f64 vcc, s[0:1], v255
-// CHECK: [0x00,0xfe,0x51,0x7d]
+v_cmpx_class_f64 vcc, s[2:3], v2
+// CHECK: [0x02,0x04,0x70,0x7d]
 
-v_cmpx_class_f64 vcc, s[0:1], v0
-// CHECK: [0x00,0x00,0x70,0x7d]
+v_cmpx_class_f64 vcc, s[4:5], v2
+// CHECK: [0x04,0x04,0x70,0x7d]
 
-v_cmpx_class_f64 vcc, s[2:3], v0
-// CHECK: [0x02,0x00,0x70,0x7d]
+v_cmpx_class_f64 vcc, s[102:103], v2
+// CHECK: [0x66,0x04,0x70,0x7d]
 
-v_cmpx_class_f64 vcc, s[102:103], v0
-// CHECK: [0x66,0x00,0x70,0x7d]
+v_cmpx_class_f64 vcc, flat_scratch, v2
+// CHECK: [0x68,0x04,0x70,0x7d]
 
-v_cmpx_class_f64 vcc, flat_scratch, v0
-// CHECK: [0x68,0x00,0x70,0x7d]
+v_cmpx_class_f64 vcc, vcc, v2
+// CHECK: [0x6a,0x04,0x70,0x7d]
 
-v_cmpx_class_f64 vcc, vcc, v0
-// CHECK: [0x6a,0x00,0x70,0x7d]
+v_cmpx_class_f64 vcc, tba, v2
+// CHECK: [0x6c,0x04,0x70,0x7d]
 
-v_cmpx_class_f64 vcc, tba, v0
-// CHECK: [0x6c,0x00,0x70,0x7d]
+v_cmpx_class_f64 vcc, tma, v2
+// CHECK: [0x6e,0x04,0x70,0x7d]
 
-v_cmpx_class_f64 vcc, tma, v0
-// CHECK: [0x6e,0x00,0x70,0x7d]
+v_cmpx_class_f64 vcc, ttmp[10:11], v2
+// CHECK: [0x7a,0x04,0x70,0x7d]
 
-v_cmpx_class_f64 vcc, ttmp[10:11], v0
-// CHECK: [0x7a,0x00,0x70,0x7d]
+v_cmpx_class_f64 vcc, exec, v2
+// CHECK: [0x7e,0x04,0x70,0x7d]
 
-v_cmpx_class_f64 vcc, exec, v0
-// CHECK: [0x7e,0x00,0x70,0x7d]
+v_cmpx_class_f64 vcc, 0, v2
+// CHECK: [0x80,0x04,0x70,0x7d]
 
-v_cmpx_class_f64 vcc, 0, v0
-// CHECK: [0x80,0x00,0x70,0x7d]
+v_cmpx_class_f64 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x70,0x7d]
 
-v_cmpx_class_f64 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x70,0x7d]
+v_cmpx_class_f64 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x70,0x7d]
 
-v_cmpx_class_f64 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x70,0x7d]
+v_cmpx_class_f64 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x70,0x7d]
 
-v_cmpx_class_f64 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x70,0x7d]
+v_cmpx_class_f64 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x70,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_class_f64 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x70,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_class_f64 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x70,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_class_f64 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x70,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_class_f64 vcc, v[1:2], v2
+// CHECK: [0x01,0x05,0x70,0x7d]
 
-v_cmpx_class_f64 vcc, v[0:1], v0
-// CHECK: [0x00,0x01,0x70,0x7d]
+v_cmpx_class_f64 vcc, v[254:255], v2
+// CHECK: [0xfe,0x05,0x70,0x7d]
 
-v_cmpx_class_f64 vcc, v[254:255], v0
-// CHECK: [0xfe,0x01,0x70,0x7d]
+v_cmpx_class_f64 vcc, s[2:3], v255
+// CHECK: [0x02,0xfe,0x71,0x7d]
 
-v_cmpx_class_f64 vcc, s[0:1], v255
-// CHECK: [0x00,0xfe,0x71,0x7d]
diff --git a/test/MC/AMDGPU/gfx8_asm_all.s b/test/MC/AMDGPU/gfx8_asm_all.s
index 30e7eeeae12530cef476fd9a8114147c1ce62c34..0a0d42c208f91501adfe07bc79d22732ccb604e0 100644
--- a/test/MC/AMDGPU/gfx8_asm_all.s
+++ b/test/MC/AMDGPU/gfx8_asm_all.s
@@ -1,7735 +1,7896 @@
 // RUN: llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s
 
-ds_add_u32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x00,0xd8,0x00,0x00,0x00,0x00]
+// *** GENERATED BY TESTGEN, DO NOT EDIT! ***
 
-ds_add_u32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x00,0xd8,0xff,0x00,0x00,0x00]
+ds_add_u32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x00,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_u32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x00,0xd8,0x00,0xff,0x00,0x00]
+ds_add_u32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x00,0xd8,0xff,0x02,0x00,0x00]
 
-ds_add_u32 v0, v0
-// CHECK: [0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0x00]
+ds_add_u32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x00,0xd8,0x01,0xff,0x00,0x00]
 
-ds_add_u32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x00,0xd8,0x00,0x00,0x00,0x00]
+ds_add_u32 v1, v2
+// CHECK: [0x00,0x00,0x00,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_u32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x00,0xd8,0x00,0x00,0x00,0x00]
+ds_add_u32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x00,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_u32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x01,0xd8,0x00,0x00,0x00,0x00]
+ds_add_u32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x00,0xd8,0x01,0x02,0x00,0x00]
 
-ds_sub_u32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x02,0xd8,0x00,0x00,0x00,0x00]
+ds_add_u32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x01,0xd8,0x01,0x02,0x00,0x00]
 
-ds_sub_u32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x02,0xd8,0xff,0x00,0x00,0x00]
+ds_sub_u32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x02,0xd8,0x01,0x02,0x00,0x00]
 
-ds_sub_u32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x02,0xd8,0x00,0xff,0x00,0x00]
+ds_sub_u32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x02,0xd8,0xff,0x02,0x00,0x00]
 
-ds_sub_u32 v0, v0
-// CHECK: [0x00,0x00,0x02,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_u32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x02,0xd8,0x01,0xff,0x00,0x00]
 
-ds_sub_u32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x02,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_u32 v1, v2
+// CHECK: [0x00,0x00,0x02,0xd8,0x01,0x02,0x00,0x00]
 
-ds_sub_u32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x02,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_u32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x02,0xd8,0x01,0x02,0x00,0x00]
 
-ds_sub_u32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x03,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_u32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x02,0xd8,0x01,0x02,0x00,0x00]
 
-ds_rsub_u32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x04,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_u32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x03,0xd8,0x01,0x02,0x00,0x00]
 
-ds_rsub_u32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x04,0xd8,0xff,0x00,0x00,0x00]
+ds_rsub_u32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x04,0xd8,0x01,0x02,0x00,0x00]
 
-ds_rsub_u32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x04,0xd8,0x00,0xff,0x00,0x00]
+ds_rsub_u32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x04,0xd8,0xff,0x02,0x00,0x00]
 
-ds_rsub_u32 v0, v0
-// CHECK: [0x00,0x00,0x04,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_u32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x04,0xd8,0x01,0xff,0x00,0x00]
 
-ds_rsub_u32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x04,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_u32 v1, v2
+// CHECK: [0x00,0x00,0x04,0xd8,0x01,0x02,0x00,0x00]
 
-ds_rsub_u32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x04,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_u32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x04,0xd8,0x01,0x02,0x00,0x00]
 
-ds_rsub_u32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x05,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_u32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x04,0xd8,0x01,0x02,0x00,0x00]
 
-ds_inc_u32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x06,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_u32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x05,0xd8,0x01,0x02,0x00,0x00]
 
-ds_inc_u32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x06,0xd8,0xff,0x00,0x00,0x00]
+ds_inc_u32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x06,0xd8,0x01,0x02,0x00,0x00]
 
-ds_inc_u32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x06,0xd8,0x00,0xff,0x00,0x00]
+ds_inc_u32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x06,0xd8,0xff,0x02,0x00,0x00]
 
-ds_inc_u32 v0, v0
-// CHECK: [0x00,0x00,0x06,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_u32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x06,0xd8,0x01,0xff,0x00,0x00]
 
-ds_inc_u32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x06,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_u32 v1, v2
+// CHECK: [0x00,0x00,0x06,0xd8,0x01,0x02,0x00,0x00]
 
-ds_inc_u32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x06,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_u32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x06,0xd8,0x01,0x02,0x00,0x00]
 
-ds_inc_u32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x07,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_u32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x06,0xd8,0x01,0x02,0x00,0x00]
 
-ds_dec_u32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x08,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_u32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x07,0xd8,0x01,0x02,0x00,0x00]
 
-ds_dec_u32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x08,0xd8,0xff,0x00,0x00,0x00]
+ds_dec_u32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x08,0xd8,0x01,0x02,0x00,0x00]
 
-ds_dec_u32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x08,0xd8,0x00,0xff,0x00,0x00]
+ds_dec_u32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x08,0xd8,0xff,0x02,0x00,0x00]
 
-ds_dec_u32 v0, v0
-// CHECK: [0x00,0x00,0x08,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_u32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x08,0xd8,0x01,0xff,0x00,0x00]
 
-ds_dec_u32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x08,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_u32 v1, v2
+// CHECK: [0x00,0x00,0x08,0xd8,0x01,0x02,0x00,0x00]
 
-ds_dec_u32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x08,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_u32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x08,0xd8,0x01,0x02,0x00,0x00]
 
-ds_dec_u32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x09,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_u32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x08,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_i32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x0a,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_u32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x09,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_i32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x0a,0xd8,0xff,0x00,0x00,0x00]
+ds_min_i32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x0a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_i32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x0a,0xd8,0x00,0xff,0x00,0x00]
+ds_min_i32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x0a,0xd8,0xff,0x02,0x00,0x00]
 
-ds_min_i32 v0, v0
-// CHECK: [0x00,0x00,0x0a,0xd8,0x00,0x00,0x00,0x00]
+ds_min_i32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x0a,0xd8,0x01,0xff,0x00,0x00]
 
-ds_min_i32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x0a,0xd8,0x00,0x00,0x00,0x00]
+ds_min_i32 v1, v2
+// CHECK: [0x00,0x00,0x0a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_i32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x0a,0xd8,0x00,0x00,0x00,0x00]
+ds_min_i32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x0a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_i32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x0b,0xd8,0x00,0x00,0x00,0x00]
+ds_min_i32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x0a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_i32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x0c,0xd8,0x00,0x00,0x00,0x00]
+ds_min_i32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x0b,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_i32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x0c,0xd8,0xff,0x00,0x00,0x00]
+ds_max_i32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x0c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_i32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x0c,0xd8,0x00,0xff,0x00,0x00]
+ds_max_i32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x0c,0xd8,0xff,0x02,0x00,0x00]
 
-ds_max_i32 v0, v0
-// CHECK: [0x00,0x00,0x0c,0xd8,0x00,0x00,0x00,0x00]
+ds_max_i32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x0c,0xd8,0x01,0xff,0x00,0x00]
 
-ds_max_i32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x0c,0xd8,0x00,0x00,0x00,0x00]
+ds_max_i32 v1, v2
+// CHECK: [0x00,0x00,0x0c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_i32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x0c,0xd8,0x00,0x00,0x00,0x00]
+ds_max_i32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x0c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_i32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x0d,0xd8,0x00,0x00,0x00,0x00]
+ds_max_i32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x0c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_u32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x0e,0xd8,0x00,0x00,0x00,0x00]
+ds_max_i32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x0d,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_u32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x0e,0xd8,0xff,0x00,0x00,0x00]
+ds_min_u32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x0e,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_u32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x0e,0xd8,0x00,0xff,0x00,0x00]
+ds_min_u32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x0e,0xd8,0xff,0x02,0x00,0x00]
 
-ds_min_u32 v0, v0
-// CHECK: [0x00,0x00,0x0e,0xd8,0x00,0x00,0x00,0x00]
+ds_min_u32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x0e,0xd8,0x01,0xff,0x00,0x00]
 
-ds_min_u32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x0e,0xd8,0x00,0x00,0x00,0x00]
+ds_min_u32 v1, v2
+// CHECK: [0x00,0x00,0x0e,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_u32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x0e,0xd8,0x00,0x00,0x00,0x00]
+ds_min_u32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x0e,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_u32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x0f,0xd8,0x00,0x00,0x00,0x00]
+ds_min_u32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x0e,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_u32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x10,0xd8,0x00,0x00,0x00,0x00]
+ds_min_u32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x0f,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_u32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x10,0xd8,0xff,0x00,0x00,0x00]
+ds_max_u32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x10,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_u32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x10,0xd8,0x00,0xff,0x00,0x00]
+ds_max_u32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x10,0xd8,0xff,0x02,0x00,0x00]
 
-ds_max_u32 v0, v0
-// CHECK: [0x00,0x00,0x10,0xd8,0x00,0x00,0x00,0x00]
+ds_max_u32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x10,0xd8,0x01,0xff,0x00,0x00]
 
-ds_max_u32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x10,0xd8,0x00,0x00,0x00,0x00]
+ds_max_u32 v1, v2
+// CHECK: [0x00,0x00,0x10,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_u32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x10,0xd8,0x00,0x00,0x00,0x00]
+ds_max_u32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x10,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_u32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x11,0xd8,0x00,0x00,0x00,0x00]
+ds_max_u32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x10,0xd8,0x01,0x02,0x00,0x00]
 
-ds_and_b32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x12,0xd8,0x00,0x00,0x00,0x00]
+ds_max_u32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x11,0xd8,0x01,0x02,0x00,0x00]
 
-ds_and_b32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x12,0xd8,0xff,0x00,0x00,0x00]
+ds_and_b32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x12,0xd8,0x01,0x02,0x00,0x00]
 
-ds_and_b32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x12,0xd8,0x00,0xff,0x00,0x00]
+ds_and_b32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x12,0xd8,0xff,0x02,0x00,0x00]
 
-ds_and_b32 v0, v0
-// CHECK: [0x00,0x00,0x12,0xd8,0x00,0x00,0x00,0x00]
+ds_and_b32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x12,0xd8,0x01,0xff,0x00,0x00]
 
-ds_and_b32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x12,0xd8,0x00,0x00,0x00,0x00]
+ds_and_b32 v1, v2
+// CHECK: [0x00,0x00,0x12,0xd8,0x01,0x02,0x00,0x00]
 
-ds_and_b32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x12,0xd8,0x00,0x00,0x00,0x00]
+ds_and_b32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x12,0xd8,0x01,0x02,0x00,0x00]
 
-ds_and_b32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x13,0xd8,0x00,0x00,0x00,0x00]
+ds_and_b32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x12,0xd8,0x01,0x02,0x00,0x00]
 
-ds_or_b32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x14,0xd8,0x00,0x00,0x00,0x00]
+ds_and_b32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x13,0xd8,0x01,0x02,0x00,0x00]
 
-ds_or_b32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x14,0xd8,0xff,0x00,0x00,0x00]
+ds_or_b32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x14,0xd8,0x01,0x02,0x00,0x00]
 
-ds_or_b32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x14,0xd8,0x00,0xff,0x00,0x00]
+ds_or_b32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x14,0xd8,0xff,0x02,0x00,0x00]
 
-ds_or_b32 v0, v0
-// CHECK: [0x00,0x00,0x14,0xd8,0x00,0x00,0x00,0x00]
+ds_or_b32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x14,0xd8,0x01,0xff,0x00,0x00]
 
-ds_or_b32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x14,0xd8,0x00,0x00,0x00,0x00]
+ds_or_b32 v1, v2
+// CHECK: [0x00,0x00,0x14,0xd8,0x01,0x02,0x00,0x00]
 
-ds_or_b32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x14,0xd8,0x00,0x00,0x00,0x00]
+ds_or_b32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x14,0xd8,0x01,0x02,0x00,0x00]
 
-ds_or_b32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x15,0xd8,0x00,0x00,0x00,0x00]
+ds_or_b32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x14,0xd8,0x01,0x02,0x00,0x00]
 
-ds_xor_b32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x16,0xd8,0x00,0x00,0x00,0x00]
+ds_or_b32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x15,0xd8,0x01,0x02,0x00,0x00]
 
-ds_xor_b32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x16,0xd8,0xff,0x00,0x00,0x00]
+ds_xor_b32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x16,0xd8,0x01,0x02,0x00,0x00]
 
-ds_xor_b32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x16,0xd8,0x00,0xff,0x00,0x00]
+ds_xor_b32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x16,0xd8,0xff,0x02,0x00,0x00]
 
-ds_xor_b32 v0, v0
-// CHECK: [0x00,0x00,0x16,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_b32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x16,0xd8,0x01,0xff,0x00,0x00]
 
-ds_xor_b32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x16,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_b32 v1, v2
+// CHECK: [0x00,0x00,0x16,0xd8,0x01,0x02,0x00,0x00]
 
-ds_xor_b32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x16,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_b32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x16,0xd8,0x01,0x02,0x00,0x00]
 
-ds_xor_b32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x17,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_b32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x16,0xd8,0x01,0x02,0x00,0x00]
 
-ds_mskor_b32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x18,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_b32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x17,0xd8,0x01,0x02,0x00,0x00]
 
-ds_mskor_b32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x18,0xd8,0xff,0x00,0x00,0x00]
+ds_mskor_b32 v1, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x18,0xd8,0x01,0x02,0x03,0x00]
 
-ds_mskor_b32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x18,0xd8,0x00,0xff,0x00,0x00]
+ds_mskor_b32 v255, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x18,0xd8,0xff,0x02,0x03,0x00]
 
-ds_mskor_b32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x18,0xd8,0x00,0x00,0xff,0x00]
+ds_mskor_b32 v1, v255, v3 offset:65535
+// CHECK: [0xff,0xff,0x18,0xd8,0x01,0xff,0x03,0x00]
 
-ds_mskor_b32 v0, v0, v0
-// CHECK: [0x00,0x00,0x18,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_b32 v1, v2, v255 offset:65535
+// CHECK: [0xff,0xff,0x18,0xd8,0x01,0x02,0xff,0x00]
 
-ds_mskor_b32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x18,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_b32 v1, v2, v3
+// CHECK: [0x00,0x00,0x18,0xd8,0x01,0x02,0x03,0x00]
 
-ds_mskor_b32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x18,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_b32 v1, v2, v3 offset:0
+// CHECK: [0x00,0x00,0x18,0xd8,0x01,0x02,0x03,0x00]
 
-ds_mskor_b32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x19,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_b32 v1, v2, v3 offset:4
+// CHECK: [0x04,0x00,0x18,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write_b32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x1a,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_b32 v1, v2, v3 offset:65535 gds
+// CHECK: [0xff,0xff,0x19,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write_b32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x1a,0xd8,0xff,0x00,0x00,0x00]
+ds_write_b32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x1a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x1a,0xd8,0x00,0xff,0x00,0x00]
+ds_write_b32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x1a,0xd8,0xff,0x02,0x00,0x00]
 
-ds_write_b32 v0, v0
-// CHECK: [0x00,0x00,0x1a,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x1a,0xd8,0x01,0xff,0x00,0x00]
 
-ds_write_b32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x1a,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b32 v1, v2
+// CHECK: [0x00,0x00,0x1a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x1a,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x1a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x1b,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x1a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write2_b32 v0, v0, v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x1c,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x1b,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write2_b32 v255, v0, v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x1c,0xd8,0xff,0x00,0x00,0x00]
+ds_write2_b32 v1, v2, v3 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x1c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2_b32 v0, v255, v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x1c,0xd8,0x00,0xff,0x00,0x00]
+ds_write2_b32 v255, v2, v3 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x1c,0xd8,0xff,0x02,0x03,0x00]
 
-ds_write2_b32 v0, v0, v255 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x1c,0xd8,0x00,0x00,0xff,0x00]
+ds_write2_b32 v1, v255, v3 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x1c,0xd8,0x01,0xff,0x03,0x00]
 
-ds_write2_b32 v0, v0, v0 offset1:255
-// CHECK: [0x00,0xff,0x1c,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b32 v1, v2, v255 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x1c,0xd8,0x01,0x02,0xff,0x00]
 
-ds_write2_b32 v0, v0, v0 offset0:0 offset1:255
-// CHECK: [0x00,0xff,0x1c,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b32 v1, v2, v3 offset1:255
+// CHECK: [0x00,0xff,0x1c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2_b32 v0, v0, v0 offset0:16 offset1:255
-// CHECK: [0x10,0xff,0x1c,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b32 v1, v2, v3 offset0:0 offset1:255
+// CHECK: [0x00,0xff,0x1c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2_b32 v0, v0, v0 offset0:127
-// CHECK: [0x7f,0x00,0x1c,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b32 v1, v2, v3 offset0:16 offset1:255
+// CHECK: [0x10,0xff,0x1c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2_b32 v0, v0, v0 offset0:127 offset1:0
-// CHECK: [0x7f,0x00,0x1c,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b32 v1, v2, v3 offset0:127
+// CHECK: [0x7f,0x00,0x1c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2_b32 v0, v0, v0 offset0:127 offset1:1
-// CHECK: [0x7f,0x01,0x1c,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b32 v1, v2, v3 offset0:127 offset1:0
+// CHECK: [0x7f,0x00,0x1c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2_b32 v0, v0, v0 offset0:127 offset1:255 gds
-// CHECK: [0x7f,0xff,0x1d,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b32 v1, v2, v3 offset0:127 offset1:1
+// CHECK: [0x7f,0x01,0x1c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b32 v0, v0, v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x1e,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b32 v1, v2, v3 offset0:127 offset1:255 gds
+// CHECK: [0x7f,0xff,0x1d,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b32 v255, v0, v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x1e,0xd8,0xff,0x00,0x00,0x00]
+ds_write2st64_b32 v1, v2, v3 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x1e,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b32 v0, v255, v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x1e,0xd8,0x00,0xff,0x00,0x00]
+ds_write2st64_b32 v255, v2, v3 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x1e,0xd8,0xff,0x02,0x03,0x00]
 
-ds_write2st64_b32 v0, v0, v255 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x1e,0xd8,0x00,0x00,0xff,0x00]
+ds_write2st64_b32 v1, v255, v3 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x1e,0xd8,0x01,0xff,0x03,0x00]
 
-ds_write2st64_b32 v0, v0, v0 offset1:255
-// CHECK: [0x00,0xff,0x1e,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b32 v1, v2, v255 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x1e,0xd8,0x01,0x02,0xff,0x00]
 
-ds_write2st64_b32 v0, v0, v0 offset0:0 offset1:255
-// CHECK: [0x00,0xff,0x1e,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b32 v1, v2, v3 offset1:255
+// CHECK: [0x00,0xff,0x1e,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b32 v0, v0, v0 offset0:16 offset1:255
-// CHECK: [0x10,0xff,0x1e,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b32 v1, v2, v3 offset0:0 offset1:255
+// CHECK: [0x00,0xff,0x1e,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b32 v0, v0, v0 offset0:127
-// CHECK: [0x7f,0x00,0x1e,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b32 v1, v2, v3 offset0:16 offset1:255
+// CHECK: [0x10,0xff,0x1e,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b32 v0, v0, v0 offset0:127 offset1:0
-// CHECK: [0x7f,0x00,0x1e,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b32 v1, v2, v3 offset0:127
+// CHECK: [0x7f,0x00,0x1e,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b32 v0, v0, v0 offset0:127 offset1:1
-// CHECK: [0x7f,0x01,0x1e,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b32 v1, v2, v3 offset0:127 offset1:0
+// CHECK: [0x7f,0x00,0x1e,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b32 v0, v0, v0 offset0:127 offset1:255 gds
-// CHECK: [0x7f,0xff,0x1f,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b32 v1, v2, v3 offset0:127 offset1:1
+// CHECK: [0x7f,0x01,0x1e,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x20,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b32 v1, v2, v3 offset0:127 offset1:255 gds
+// CHECK: [0x7f,0xff,0x1f,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x20,0xd8,0xff,0x00,0x00,0x00]
+ds_cmpst_b32 v1, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x20,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x20,0xd8,0x00,0xff,0x00,0x00]
+ds_cmpst_b32 v255, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x20,0xd8,0xff,0x02,0x03,0x00]
 
-ds_cmpst_b32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x20,0xd8,0x00,0x00,0xff,0x00]
+ds_cmpst_b32 v1, v255, v3 offset:65535
+// CHECK: [0xff,0xff,0x20,0xd8,0x01,0xff,0x03,0x00]
 
-ds_cmpst_b32 v0, v0, v0
-// CHECK: [0x00,0x00,0x20,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_b32 v1, v2, v255 offset:65535
+// CHECK: [0xff,0xff,0x20,0xd8,0x01,0x02,0xff,0x00]
 
-ds_cmpst_b32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x20,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_b32 v1, v2, v3
+// CHECK: [0x00,0x00,0x20,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x20,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_b32 v1, v2, v3 offset:0
+// CHECK: [0x00,0x00,0x20,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x21,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_b32 v1, v2, v3 offset:4
+// CHECK: [0x04,0x00,0x20,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x22,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_b32 v1, v2, v3 offset:65535 gds
+// CHECK: [0xff,0xff,0x21,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x22,0xd8,0xff,0x00,0x00,0x00]
+ds_cmpst_f32 v1, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x22,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x22,0xd8,0x00,0xff,0x00,0x00]
+ds_cmpst_f32 v255, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x22,0xd8,0xff,0x02,0x03,0x00]
 
-ds_cmpst_f32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x22,0xd8,0x00,0x00,0xff,0x00]
+ds_cmpst_f32 v1, v255, v3 offset:65535
+// CHECK: [0xff,0xff,0x22,0xd8,0x01,0xff,0x03,0x00]
 
-ds_cmpst_f32 v0, v0, v0
-// CHECK: [0x00,0x00,0x22,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_f32 v1, v2, v255 offset:65535
+// CHECK: [0xff,0xff,0x22,0xd8,0x01,0x02,0xff,0x00]
 
-ds_cmpst_f32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x22,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_f32 v1, v2, v3
+// CHECK: [0x00,0x00,0x22,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x22,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_f32 v1, v2, v3 offset:0
+// CHECK: [0x00,0x00,0x22,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x23,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_f32 v1, v2, v3 offset:4
+// CHECK: [0x04,0x00,0x22,0xd8,0x01,0x02,0x03,0x00]
 
-ds_min_f32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x24,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_f32 v1, v2, v3 offset:65535 gds
+// CHECK: [0xff,0xff,0x23,0xd8,0x01,0x02,0x03,0x00]
 
-ds_min_f32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x24,0xd8,0xff,0x00,0x00,0x00]
+ds_min_f32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x24,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_f32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x24,0xd8,0x00,0xff,0x00,0x00]
+ds_min_f32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x24,0xd8,0xff,0x02,0x00,0x00]
 
-ds_min_f32 v0, v0
-// CHECK: [0x00,0x00,0x24,0xd8,0x00,0x00,0x00,0x00]
+ds_min_f32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x24,0xd8,0x01,0xff,0x00,0x00]
 
-ds_min_f32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x24,0xd8,0x00,0x00,0x00,0x00]
+ds_min_f32 v1, v2
+// CHECK: [0x00,0x00,0x24,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_f32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x24,0xd8,0x00,0x00,0x00,0x00]
+ds_min_f32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x24,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_f32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x25,0xd8,0x00,0x00,0x00,0x00]
+ds_min_f32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x24,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_f32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x26,0xd8,0x00,0x00,0x00,0x00]
+ds_min_f32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x25,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_f32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x26,0xd8,0xff,0x00,0x00,0x00]
+ds_max_f32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x26,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_f32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x26,0xd8,0x00,0xff,0x00,0x00]
+ds_max_f32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x26,0xd8,0xff,0x02,0x00,0x00]
 
-ds_max_f32 v0, v0
-// CHECK: [0x00,0x00,0x26,0xd8,0x00,0x00,0x00,0x00]
+ds_max_f32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x26,0xd8,0x01,0xff,0x00,0x00]
 
-ds_max_f32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x26,0xd8,0x00,0x00,0x00,0x00]
+ds_max_f32 v1, v2
+// CHECK: [0x00,0x00,0x26,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_f32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x26,0xd8,0x00,0x00,0x00,0x00]
+ds_max_f32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x26,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_f32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x27,0xd8,0x00,0x00,0x00,0x00]
+ds_max_f32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x26,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_f32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x2a,0xd8,0x00,0x00,0x00,0x00]
+ds_max_f32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x27,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_f32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x2a,0xd8,0xff,0x00,0x00,0x00]
+ds_add_f32 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x2a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_f32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x2a,0xd8,0x00,0xff,0x00,0x00]
+ds_add_f32 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x2a,0xd8,0xff,0x02,0x00,0x00]
 
-ds_add_f32 v0, v0
-// CHECK: [0x00,0x00,0x2a,0xd8,0x00,0x00,0x00,0x00]
+ds_add_f32 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x2a,0xd8,0x01,0xff,0x00,0x00]
 
-ds_add_f32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x2a,0xd8,0x00,0x00,0x00,0x00]
+ds_add_f32 v1, v2
+// CHECK: [0x00,0x00,0x2a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_f32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x2a,0xd8,0x00,0x00,0x00,0x00]
+ds_add_f32 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x2a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_f32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x2b,0xd8,0x00,0x00,0x00,0x00]
+ds_add_f32 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x2a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b8 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x3c,0xd8,0x00,0x00,0x00,0x00]
+ds_add_f32 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x2b,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b8 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x3c,0xd8,0xff,0x00,0x00,0x00]
+ds_write_b8 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x3c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b8 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x3c,0xd8,0x00,0xff,0x00,0x00]
+ds_write_b8 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x3c,0xd8,0xff,0x02,0x00,0x00]
 
-ds_write_b8 v0, v0
-// CHECK: [0x00,0x00,0x3c,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b8 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x3c,0xd8,0x01,0xff,0x00,0x00]
 
-ds_write_b8 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x3c,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b8 v1, v2
+// CHECK: [0x00,0x00,0x3c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b8 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x3c,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b8 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x3c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b8 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x3d,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b8 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x3c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b16 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x3e,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b8 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x3d,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b16 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x3e,0xd8,0xff,0x00,0x00,0x00]
+ds_write_b16 v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x3e,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b16 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x3e,0xd8,0x00,0xff,0x00,0x00]
+ds_write_b16 v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x3e,0xd8,0xff,0x02,0x00,0x00]
 
-ds_write_b16 v0, v0
-// CHECK: [0x00,0x00,0x3e,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b16 v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x3e,0xd8,0x01,0xff,0x00,0x00]
 
-ds_write_b16 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x3e,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b16 v1, v2
+// CHECK: [0x00,0x00,0x3e,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b16 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x3e,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b16 v1, v2 offset:0
+// CHECK: [0x00,0x00,0x3e,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b16 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x3f,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b16 v1, v2 offset:4
+// CHECK: [0x04,0x00,0x3e,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_rtn_u32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x40,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b16 v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x3f,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_rtn_u32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x40,0xd8,0x00,0x00,0x00,0xff]
+ds_add_rtn_u32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x40,0xd8,0x01,0x02,0x00,0x05]
 
-ds_add_rtn_u32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x40,0xd8,0xff,0x00,0x00,0x00]
+ds_add_rtn_u32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x40,0xd8,0x01,0x02,0x00,0xff]
 
-ds_add_rtn_u32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x40,0xd8,0x00,0xff,0x00,0x00]
+ds_add_rtn_u32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x40,0xd8,0xff,0x02,0x00,0x05]
 
-ds_add_rtn_u32 v0, v0, v0
-// CHECK: [0x00,0x00,0x40,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_u32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x40,0xd8,0x01,0xff,0x00,0x05]
 
-ds_add_rtn_u32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x40,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_u32 v5, v1, v2
+// CHECK: [0x00,0x00,0x40,0xd8,0x01,0x02,0x00,0x05]
 
-ds_add_rtn_u32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x40,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_u32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x40,0xd8,0x01,0x02,0x00,0x05]
 
-ds_add_rtn_u32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x41,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_u32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x40,0xd8,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x42,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_u32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x41,0xd8,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x42,0xd8,0x00,0x00,0x00,0xff]
+ds_sub_rtn_u32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x42,0xd8,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x42,0xd8,0xff,0x00,0x00,0x00]
+ds_sub_rtn_u32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x42,0xd8,0x01,0x02,0x00,0xff]
 
-ds_sub_rtn_u32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x42,0xd8,0x00,0xff,0x00,0x00]
+ds_sub_rtn_u32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x42,0xd8,0xff,0x02,0x00,0x05]
 
-ds_sub_rtn_u32 v0, v0, v0
-// CHECK: [0x00,0x00,0x42,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x42,0xd8,0x01,0xff,0x00,0x05]
 
-ds_sub_rtn_u32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x42,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u32 v5, v1, v2
+// CHECK: [0x00,0x00,0x42,0xd8,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x42,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x42,0xd8,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x43,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x42,0xd8,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x44,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x43,0xd8,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x44,0xd8,0x00,0x00,0x00,0xff]
+ds_rsub_rtn_u32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x44,0xd8,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x44,0xd8,0xff,0x00,0x00,0x00]
+ds_rsub_rtn_u32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x44,0xd8,0x01,0x02,0x00,0xff]
 
-ds_rsub_rtn_u32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x44,0xd8,0x00,0xff,0x00,0x00]
+ds_rsub_rtn_u32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x44,0xd8,0xff,0x02,0x00,0x05]
 
-ds_rsub_rtn_u32 v0, v0, v0
-// CHECK: [0x00,0x00,0x44,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x44,0xd8,0x01,0xff,0x00,0x05]
 
-ds_rsub_rtn_u32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x44,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u32 v5, v1, v2
+// CHECK: [0x00,0x00,0x44,0xd8,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x44,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x44,0xd8,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x45,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x44,0xd8,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x46,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x45,0xd8,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x46,0xd8,0x00,0x00,0x00,0xff]
+ds_inc_rtn_u32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x46,0xd8,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x46,0xd8,0xff,0x00,0x00,0x00]
+ds_inc_rtn_u32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x46,0xd8,0x01,0x02,0x00,0xff]
 
-ds_inc_rtn_u32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x46,0xd8,0x00,0xff,0x00,0x00]
+ds_inc_rtn_u32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x46,0xd8,0xff,0x02,0x00,0x05]
 
-ds_inc_rtn_u32 v0, v0, v0
-// CHECK: [0x00,0x00,0x46,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x46,0xd8,0x01,0xff,0x00,0x05]
 
-ds_inc_rtn_u32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x46,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u32 v5, v1, v2
+// CHECK: [0x00,0x00,0x46,0xd8,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x46,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x46,0xd8,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x47,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x46,0xd8,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x48,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x47,0xd8,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x48,0xd8,0x00,0x00,0x00,0xff]
+ds_dec_rtn_u32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x48,0xd8,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x48,0xd8,0xff,0x00,0x00,0x00]
+ds_dec_rtn_u32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x48,0xd8,0x01,0x02,0x00,0xff]
 
-ds_dec_rtn_u32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x48,0xd8,0x00,0xff,0x00,0x00]
+ds_dec_rtn_u32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x48,0xd8,0xff,0x02,0x00,0x05]
 
-ds_dec_rtn_u32 v0, v0, v0
-// CHECK: [0x00,0x00,0x48,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x48,0xd8,0x01,0xff,0x00,0x05]
 
-ds_dec_rtn_u32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x48,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u32 v5, v1, v2
+// CHECK: [0x00,0x00,0x48,0xd8,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x48,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x48,0xd8,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x49,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x48,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x4a,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x49,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x4a,0xd8,0x00,0x00,0x00,0xff]
+ds_min_rtn_i32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x4a,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x4a,0xd8,0xff,0x00,0x00,0x00]
+ds_min_rtn_i32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x4a,0xd8,0x01,0x02,0x00,0xff]
 
-ds_min_rtn_i32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x4a,0xd8,0x00,0xff,0x00,0x00]
+ds_min_rtn_i32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x4a,0xd8,0xff,0x02,0x00,0x05]
 
-ds_min_rtn_i32 v0, v0, v0
-// CHECK: [0x00,0x00,0x4a,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_i32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x4a,0xd8,0x01,0xff,0x00,0x05]
 
-ds_min_rtn_i32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x4a,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_i32 v5, v1, v2
+// CHECK: [0x00,0x00,0x4a,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x4a,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_i32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x4a,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x4b,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_i32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x4a,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x4c,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_i32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x4b,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x4c,0xd8,0x00,0x00,0x00,0xff]
+ds_max_rtn_i32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x4c,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x4c,0xd8,0xff,0x00,0x00,0x00]
+ds_max_rtn_i32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x4c,0xd8,0x01,0x02,0x00,0xff]
 
-ds_max_rtn_i32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x4c,0xd8,0x00,0xff,0x00,0x00]
+ds_max_rtn_i32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x4c,0xd8,0xff,0x02,0x00,0x05]
 
-ds_max_rtn_i32 v0, v0, v0
-// CHECK: [0x00,0x00,0x4c,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_i32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x4c,0xd8,0x01,0xff,0x00,0x05]
 
-ds_max_rtn_i32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x4c,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_i32 v5, v1, v2
+// CHECK: [0x00,0x00,0x4c,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x4c,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_i32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x4c,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x4d,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_i32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x4c,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x4e,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_i32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x4d,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x4e,0xd8,0x00,0x00,0x00,0xff]
+ds_min_rtn_u32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x4e,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x4e,0xd8,0xff,0x00,0x00,0x00]
+ds_min_rtn_u32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x4e,0xd8,0x01,0x02,0x00,0xff]
 
-ds_min_rtn_u32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x4e,0xd8,0x00,0xff,0x00,0x00]
+ds_min_rtn_u32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x4e,0xd8,0xff,0x02,0x00,0x05]
 
-ds_min_rtn_u32 v0, v0, v0
-// CHECK: [0x00,0x00,0x4e,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_u32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x4e,0xd8,0x01,0xff,0x00,0x05]
 
-ds_min_rtn_u32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x4e,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_u32 v5, v1, v2
+// CHECK: [0x00,0x00,0x4e,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x4e,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_u32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x4e,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x4f,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_u32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x4e,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x50,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_u32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x4f,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x50,0xd8,0x00,0x00,0x00,0xff]
+ds_max_rtn_u32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x50,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x50,0xd8,0xff,0x00,0x00,0x00]
+ds_max_rtn_u32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x50,0xd8,0x01,0x02,0x00,0xff]
 
-ds_max_rtn_u32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x50,0xd8,0x00,0xff,0x00,0x00]
+ds_max_rtn_u32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x50,0xd8,0xff,0x02,0x00,0x05]
 
-ds_max_rtn_u32 v0, v0, v0
-// CHECK: [0x00,0x00,0x50,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_u32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x50,0xd8,0x01,0xff,0x00,0x05]
 
-ds_max_rtn_u32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x50,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_u32 v5, v1, v2
+// CHECK: [0x00,0x00,0x50,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x50,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_u32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x50,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x51,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_u32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x50,0xd8,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x52,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_u32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x51,0xd8,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x52,0xd8,0x00,0x00,0x00,0xff]
+ds_and_rtn_b32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x52,0xd8,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x52,0xd8,0xff,0x00,0x00,0x00]
+ds_and_rtn_b32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x52,0xd8,0x01,0x02,0x00,0xff]
 
-ds_and_rtn_b32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x52,0xd8,0x00,0xff,0x00,0x00]
+ds_and_rtn_b32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x52,0xd8,0xff,0x02,0x00,0x05]
 
-ds_and_rtn_b32 v0, v0, v0
-// CHECK: [0x00,0x00,0x52,0xd8,0x00,0x00,0x00,0x00]
+ds_and_rtn_b32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x52,0xd8,0x01,0xff,0x00,0x05]
 
-ds_and_rtn_b32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x52,0xd8,0x00,0x00,0x00,0x00]
+ds_and_rtn_b32 v5, v1, v2
+// CHECK: [0x00,0x00,0x52,0xd8,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x52,0xd8,0x00,0x00,0x00,0x00]
+ds_and_rtn_b32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x52,0xd8,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x53,0xd8,0x00,0x00,0x00,0x00]
+ds_and_rtn_b32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x52,0xd8,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x54,0xd8,0x00,0x00,0x00,0x00]
+ds_and_rtn_b32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x53,0xd8,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x54,0xd8,0x00,0x00,0x00,0xff]
+ds_or_rtn_b32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x54,0xd8,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x54,0xd8,0xff,0x00,0x00,0x00]
+ds_or_rtn_b32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x54,0xd8,0x01,0x02,0x00,0xff]
 
-ds_or_rtn_b32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x54,0xd8,0x00,0xff,0x00,0x00]
+ds_or_rtn_b32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x54,0xd8,0xff,0x02,0x00,0x05]
 
-ds_or_rtn_b32 v0, v0, v0
-// CHECK: [0x00,0x00,0x54,0xd8,0x00,0x00,0x00,0x00]
+ds_or_rtn_b32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x54,0xd8,0x01,0xff,0x00,0x05]
 
-ds_or_rtn_b32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x54,0xd8,0x00,0x00,0x00,0x00]
+ds_or_rtn_b32 v5, v1, v2
+// CHECK: [0x00,0x00,0x54,0xd8,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x54,0xd8,0x00,0x00,0x00,0x00]
+ds_or_rtn_b32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x54,0xd8,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x55,0xd8,0x00,0x00,0x00,0x00]
+ds_or_rtn_b32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x54,0xd8,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x56,0xd8,0x00,0x00,0x00,0x00]
+ds_or_rtn_b32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x55,0xd8,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x56,0xd8,0x00,0x00,0x00,0xff]
+ds_xor_rtn_b32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x56,0xd8,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x56,0xd8,0xff,0x00,0x00,0x00]
+ds_xor_rtn_b32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x56,0xd8,0x01,0x02,0x00,0xff]
 
-ds_xor_rtn_b32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x56,0xd8,0x00,0xff,0x00,0x00]
+ds_xor_rtn_b32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x56,0xd8,0xff,0x02,0x00,0x05]
 
-ds_xor_rtn_b32 v0, v0, v0
-// CHECK: [0x00,0x00,0x56,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x56,0xd8,0x01,0xff,0x00,0x05]
 
-ds_xor_rtn_b32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x56,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b32 v5, v1, v2
+// CHECK: [0x00,0x00,0x56,0xd8,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x56,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x56,0xd8,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x57,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x56,0xd8,0x01,0x02,0x00,0x05]
 
-ds_mskor_rtn_b32 v0, v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x58,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x57,0xd8,0x01,0x02,0x00,0x05]
 
-ds_mskor_rtn_b32 v255, v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x58,0xd8,0x00,0x00,0x00,0xff]
+ds_mskor_rtn_b32 v5, v1, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x58,0xd8,0x01,0x02,0x03,0x05]
 
-ds_mskor_rtn_b32 v0, v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x58,0xd8,0xff,0x00,0x00,0x00]
+ds_mskor_rtn_b32 v255, v1, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x58,0xd8,0x01,0x02,0x03,0xff]
 
-ds_mskor_rtn_b32 v0, v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x58,0xd8,0x00,0xff,0x00,0x00]
+ds_mskor_rtn_b32 v5, v255, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x58,0xd8,0xff,0x02,0x03,0x05]
 
-ds_mskor_rtn_b32 v0, v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x58,0xd8,0x00,0x00,0xff,0x00]
+ds_mskor_rtn_b32 v5, v1, v255, v3 offset:65535
+// CHECK: [0xff,0xff,0x58,0xd8,0x01,0xff,0x03,0x05]
 
-ds_mskor_rtn_b32 v0, v0, v0, v0
-// CHECK: [0x00,0x00,0x58,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b32 v5, v1, v2, v255 offset:65535
+// CHECK: [0xff,0xff,0x58,0xd8,0x01,0x02,0xff,0x05]
 
-ds_mskor_rtn_b32 v0, v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x58,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b32 v5, v1, v2, v3
+// CHECK: [0x00,0x00,0x58,0xd8,0x01,0x02,0x03,0x05]
 
-ds_mskor_rtn_b32 v0, v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x58,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b32 v5, v1, v2, v3 offset:0
+// CHECK: [0x00,0x00,0x58,0xd8,0x01,0x02,0x03,0x05]
 
-ds_mskor_rtn_b32 v0, v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x59,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b32 v5, v1, v2, v3 offset:4
+// CHECK: [0x04,0x00,0x58,0xd8,0x01,0x02,0x03,0x05]
 
-ds_wrxchg_rtn_b32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x5a,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b32 v5, v1, v2, v3 offset:65535 gds
+// CHECK: [0xff,0xff,0x59,0xd8,0x01,0x02,0x03,0x05]
 
-ds_wrxchg_rtn_b32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x5a,0xd8,0x00,0x00,0x00,0xff]
+ds_wrxchg_rtn_b32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x5a,0xd8,0x01,0x02,0x00,0x05]
 
-ds_wrxchg_rtn_b32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x5a,0xd8,0xff,0x00,0x00,0x00]
+ds_wrxchg_rtn_b32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x5a,0xd8,0x01,0x02,0x00,0xff]
 
-ds_wrxchg_rtn_b32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x5a,0xd8,0x00,0xff,0x00,0x00]
+ds_wrxchg_rtn_b32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x5a,0xd8,0xff,0x02,0x00,0x05]
 
-ds_wrxchg_rtn_b32 v0, v0, v0
-// CHECK: [0x00,0x00,0x5a,0xd8,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x5a,0xd8,0x01,0xff,0x00,0x05]
 
-ds_wrxchg_rtn_b32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x5a,0xd8,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b32 v5, v1, v2
+// CHECK: [0x00,0x00,0x5a,0xd8,0x01,0x02,0x00,0x05]
 
-ds_wrxchg_rtn_b32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x5a,0xd8,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x5a,0xd8,0x01,0x02,0x00,0x05]
 
-ds_wrxchg_rtn_b32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x5b,0xd8,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x5a,0xd8,0x01,0x02,0x00,0x05]
 
-ds_cmpst_rtn_b32 v0, v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x60,0xd8,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x5b,0xd8,0x01,0x02,0x00,0x05]
 
-ds_cmpst_rtn_b32 v255, v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x60,0xd8,0x00,0x00,0x00,0xff]
+ds_cmpst_rtn_b32 v5, v1, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x60,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_b32 v0, v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x60,0xd8,0xff,0x00,0x00,0x00]
+ds_cmpst_rtn_b32 v255, v1, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x60,0xd8,0x01,0x02,0x03,0xff]
 
-ds_cmpst_rtn_b32 v0, v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x60,0xd8,0x00,0xff,0x00,0x00]
+ds_cmpst_rtn_b32 v5, v255, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x60,0xd8,0xff,0x02,0x03,0x05]
 
-ds_cmpst_rtn_b32 v0, v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x60,0xd8,0x00,0x00,0xff,0x00]
+ds_cmpst_rtn_b32 v5, v1, v255, v3 offset:65535
+// CHECK: [0xff,0xff,0x60,0xd8,0x01,0xff,0x03,0x05]
 
-ds_cmpst_rtn_b32 v0, v0, v0, v0
-// CHECK: [0x00,0x00,0x60,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b32 v5, v1, v2, v255 offset:65535
+// CHECK: [0xff,0xff,0x60,0xd8,0x01,0x02,0xff,0x05]
 
-ds_cmpst_rtn_b32 v0, v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x60,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b32 v5, v1, v2, v3
+// CHECK: [0x00,0x00,0x60,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_b32 v0, v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x60,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b32 v5, v1, v2, v3 offset:0
+// CHECK: [0x00,0x00,0x60,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_b32 v0, v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x61,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b32 v5, v1, v2, v3 offset:4
+// CHECK: [0x04,0x00,0x60,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f32 v0, v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x62,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b32 v5, v1, v2, v3 offset:65535 gds
+// CHECK: [0xff,0xff,0x61,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f32 v255, v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x62,0xd8,0x00,0x00,0x00,0xff]
+ds_cmpst_rtn_f32 v5, v1, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x62,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f32 v0, v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x62,0xd8,0xff,0x00,0x00,0x00]
+ds_cmpst_rtn_f32 v255, v1, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x62,0xd8,0x01,0x02,0x03,0xff]
 
-ds_cmpst_rtn_f32 v0, v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x62,0xd8,0x00,0xff,0x00,0x00]
+ds_cmpst_rtn_f32 v5, v255, v2, v3 offset:65535
+// CHECK: [0xff,0xff,0x62,0xd8,0xff,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f32 v0, v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x62,0xd8,0x00,0x00,0xff,0x00]
+ds_cmpst_rtn_f32 v5, v1, v255, v3 offset:65535
+// CHECK: [0xff,0xff,0x62,0xd8,0x01,0xff,0x03,0x05]
 
-ds_cmpst_rtn_f32 v0, v0, v0, v0
-// CHECK: [0x00,0x00,0x62,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f32 v5, v1, v2, v255 offset:65535
+// CHECK: [0xff,0xff,0x62,0xd8,0x01,0x02,0xff,0x05]
 
-ds_cmpst_rtn_f32 v0, v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x62,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f32 v5, v1, v2, v3
+// CHECK: [0x00,0x00,0x62,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f32 v0, v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x62,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f32 v5, v1, v2, v3 offset:0
+// CHECK: [0x00,0x00,0x62,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f32 v0, v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x63,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f32 v5, v1, v2, v3 offset:4
+// CHECK: [0x04,0x00,0x62,0xd8,0x01,0x02,0x03,0x05]
 
-ds_min_rtn_f32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x64,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f32 v5, v1, v2, v3 offset:65535 gds
+// CHECK: [0xff,0xff,0x63,0xd8,0x01,0x02,0x03,0x05]
 
-ds_min_rtn_f32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x64,0xd8,0x00,0x00,0x00,0xff]
+ds_min_rtn_f32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x64,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_f32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x64,0xd8,0xff,0x00,0x00,0x00]
+ds_min_rtn_f32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x64,0xd8,0x01,0x02,0x00,0xff]
 
-ds_min_rtn_f32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x64,0xd8,0x00,0xff,0x00,0x00]
+ds_min_rtn_f32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x64,0xd8,0xff,0x02,0x00,0x05]
 
-ds_min_rtn_f32 v0, v0, v0
-// CHECK: [0x00,0x00,0x64,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_f32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x64,0xd8,0x01,0xff,0x00,0x05]
 
-ds_min_rtn_f32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x64,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_f32 v5, v1, v2
+// CHECK: [0x00,0x00,0x64,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_f32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x64,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_f32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x64,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_f32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x65,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_f32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x64,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x66,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_f32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x65,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x66,0xd8,0x00,0x00,0x00,0xff]
+ds_max_rtn_f32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x66,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x66,0xd8,0xff,0x00,0x00,0x00]
+ds_max_rtn_f32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x66,0xd8,0x01,0x02,0x00,0xff]
 
-ds_max_rtn_f32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x66,0xd8,0x00,0xff,0x00,0x00]
+ds_max_rtn_f32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x66,0xd8,0xff,0x02,0x00,0x05]
 
-ds_max_rtn_f32 v0, v0, v0
-// CHECK: [0x00,0x00,0x66,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_f32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x66,0xd8,0x01,0xff,0x00,0x05]
 
-ds_max_rtn_f32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x66,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_f32 v5, v1, v2
+// CHECK: [0x00,0x00,0x66,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x66,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_f32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x66,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x67,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_f32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x66,0xd8,0x01,0x02,0x00,0x05]
 
-ds_add_rtn_f32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x6a,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_f32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x67,0xd8,0x01,0x02,0x00,0x05]
 
-ds_add_rtn_f32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x6a,0xd8,0x00,0x00,0x00,0xff]
+ds_add_rtn_f32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x6a,0xd8,0x01,0x02,0x00,0x05]
 
-ds_add_rtn_f32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x6a,0xd8,0xff,0x00,0x00,0x00]
+ds_add_rtn_f32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x6a,0xd8,0x01,0x02,0x00,0xff]
 
-ds_add_rtn_f32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x6a,0xd8,0x00,0xff,0x00,0x00]
+ds_add_rtn_f32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x6a,0xd8,0xff,0x02,0x00,0x05]
 
-ds_add_rtn_f32 v0, v0, v0
-// CHECK: [0x00,0x00,0x6a,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_f32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x6a,0xd8,0x01,0xff,0x00,0x05]
 
-ds_add_rtn_f32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x6a,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_f32 v5, v1, v2
+// CHECK: [0x00,0x00,0x6a,0xd8,0x01,0x02,0x00,0x05]
 
-ds_add_rtn_f32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x6a,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_f32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x6a,0xd8,0x01,0x02,0x00,0x05]
 
-ds_add_rtn_f32 v0, v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x6b,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_f32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x6a,0xd8,0x01,0x02,0x00,0x05]
 
-ds_read_b32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x6c,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_f32 v5, v1, v2 offset:65535 gds
+// CHECK: [0xff,0xff,0x6b,0xd8,0x01,0x02,0x00,0x05]
 
-ds_read_b32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x6c,0xd8,0x00,0x00,0x00,0xff]
+ds_read_b32 v5, v1 offset:65535
+// CHECK: [0xff,0xff,0x6c,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_b32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x6c,0xd8,0xff,0x00,0x00,0x00]
+ds_read_b32 v255, v1 offset:65535
+// CHECK: [0xff,0xff,0x6c,0xd8,0x01,0x00,0x00,0xff]
 
-ds_read_b32 v0, v0
-// CHECK: [0x00,0x00,0x6c,0xd8,0x00,0x00,0x00,0x00]
+ds_read_b32 v5, v255 offset:65535
+// CHECK: [0xff,0xff,0x6c,0xd8,0xff,0x00,0x00,0x05]
 
-ds_read_b32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x6c,0xd8,0x00,0x00,0x00,0x00]
+ds_read_b32 v5, v1
+// CHECK: [0x00,0x00,0x6c,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_b32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x6c,0xd8,0x00,0x00,0x00,0x00]
+ds_read_b32 v5, v1 offset:0
+// CHECK: [0x00,0x00,0x6c,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_b32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x6d,0xd8,0x00,0x00,0x00,0x00]
+ds_read_b32 v5, v1 offset:4
+// CHECK: [0x04,0x00,0x6c,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b32 v[0:1], v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x6e,0xd8,0x00,0x00,0x00,0x00]
+ds_read_b32 v5, v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x6d,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b32 v[254:255], v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x6e,0xd8,0x00,0x00,0x00,0xfe]
+ds_read2_b32 v[5:6], v1 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x6e,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b32 v[0:1], v255 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x6e,0xd8,0xff,0x00,0x00,0x00]
+ds_read2_b32 v[254:255], v1 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x6e,0xd8,0x01,0x00,0x00,0xfe]
 
-ds_read2_b32 v[0:1], v0 offset1:255
-// CHECK: [0x00,0xff,0x6e,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b32 v[5:6], v255 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x6e,0xd8,0xff,0x00,0x00,0x05]
 
-ds_read2_b32 v[0:1], v0 offset0:0 offset1:255
-// CHECK: [0x00,0xff,0x6e,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b32 v[5:6], v1 offset1:255
+// CHECK: [0x00,0xff,0x6e,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b32 v[0:1], v0 offset0:16 offset1:255
-// CHECK: [0x10,0xff,0x6e,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b32 v[5:6], v1 offset0:0 offset1:255
+// CHECK: [0x00,0xff,0x6e,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b32 v[0:1], v0 offset0:127
-// CHECK: [0x7f,0x00,0x6e,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b32 v[5:6], v1 offset0:16 offset1:255
+// CHECK: [0x10,0xff,0x6e,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b32 v[0:1], v0 offset0:127 offset1:0
-// CHECK: [0x7f,0x00,0x6e,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b32 v[5:6], v1 offset0:127
+// CHECK: [0x7f,0x00,0x6e,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b32 v[0:1], v0 offset0:127 offset1:1
-// CHECK: [0x7f,0x01,0x6e,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b32 v[5:6], v1 offset0:127 offset1:0
+// CHECK: [0x7f,0x00,0x6e,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b32 v[0:1], v0 offset0:127 offset1:255 gds
-// CHECK: [0x7f,0xff,0x6f,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b32 v[5:6], v1 offset0:127 offset1:1
+// CHECK: [0x7f,0x01,0x6e,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b32 v[0:1], v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x70,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b32 v[5:6], v1 offset0:127 offset1:255 gds
+// CHECK: [0x7f,0xff,0x6f,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b32 v[254:255], v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x70,0xd8,0x00,0x00,0x00,0xfe]
+ds_read2st64_b32 v[5:6], v1 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x70,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b32 v[0:1], v255 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x70,0xd8,0xff,0x00,0x00,0x00]
+ds_read2st64_b32 v[254:255], v1 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x70,0xd8,0x01,0x00,0x00,0xfe]
 
-ds_read2st64_b32 v[0:1], v0 offset1:255
-// CHECK: [0x00,0xff,0x70,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b32 v[5:6], v255 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x70,0xd8,0xff,0x00,0x00,0x05]
 
-ds_read2st64_b32 v[0:1], v0 offset0:0 offset1:255
-// CHECK: [0x00,0xff,0x70,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b32 v[5:6], v1 offset1:255
+// CHECK: [0x00,0xff,0x70,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b32 v[0:1], v0 offset0:16 offset1:255
-// CHECK: [0x10,0xff,0x70,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b32 v[5:6], v1 offset0:0 offset1:255
+// CHECK: [0x00,0xff,0x70,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b32 v[0:1], v0 offset0:127
-// CHECK: [0x7f,0x00,0x70,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b32 v[5:6], v1 offset0:16 offset1:255
+// CHECK: [0x10,0xff,0x70,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b32 v[0:1], v0 offset0:127 offset1:0
-// CHECK: [0x7f,0x00,0x70,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b32 v[5:6], v1 offset0:127
+// CHECK: [0x7f,0x00,0x70,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b32 v[0:1], v0 offset0:127 offset1:1
-// CHECK: [0x7f,0x01,0x70,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b32 v[5:6], v1 offset0:127 offset1:0
+// CHECK: [0x7f,0x00,0x70,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b32 v[0:1], v0 offset0:127 offset1:255 gds
-// CHECK: [0x7f,0xff,0x71,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b32 v[5:6], v1 offset0:127 offset1:1
+// CHECK: [0x7f,0x01,0x70,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i8 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x72,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b32 v[5:6], v1 offset0:127 offset1:255 gds
+// CHECK: [0x7f,0xff,0x71,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i8 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x72,0xd8,0x00,0x00,0x00,0xff]
+ds_read_i8 v5, v1 offset:65535
+// CHECK: [0xff,0xff,0x72,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i8 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x72,0xd8,0xff,0x00,0x00,0x00]
+ds_read_i8 v255, v1 offset:65535
+// CHECK: [0xff,0xff,0x72,0xd8,0x01,0x00,0x00,0xff]
 
-ds_read_i8 v0, v0
-// CHECK: [0x00,0x00,0x72,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i8 v5, v255 offset:65535
+// CHECK: [0xff,0xff,0x72,0xd8,0xff,0x00,0x00,0x05]
 
-ds_read_i8 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x72,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i8 v5, v1
+// CHECK: [0x00,0x00,0x72,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i8 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x72,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i8 v5, v1 offset:0
+// CHECK: [0x00,0x00,0x72,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i8 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x73,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i8 v5, v1 offset:4
+// CHECK: [0x04,0x00,0x72,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u8 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x74,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i8 v5, v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x73,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u8 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x74,0xd8,0x00,0x00,0x00,0xff]
+ds_read_u8 v5, v1 offset:65535
+// CHECK: [0xff,0xff,0x74,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u8 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x74,0xd8,0xff,0x00,0x00,0x00]
+ds_read_u8 v255, v1 offset:65535
+// CHECK: [0xff,0xff,0x74,0xd8,0x01,0x00,0x00,0xff]
 
-ds_read_u8 v0, v0
-// CHECK: [0x00,0x00,0x74,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u8 v5, v255 offset:65535
+// CHECK: [0xff,0xff,0x74,0xd8,0xff,0x00,0x00,0x05]
 
-ds_read_u8 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x74,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u8 v5, v1
+// CHECK: [0x00,0x00,0x74,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u8 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x74,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u8 v5, v1 offset:0
+// CHECK: [0x00,0x00,0x74,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u8 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x75,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u8 v5, v1 offset:4
+// CHECK: [0x04,0x00,0x74,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i16 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x76,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u8 v5, v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x75,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i16 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x76,0xd8,0x00,0x00,0x00,0xff]
+ds_read_i16 v5, v1 offset:65535
+// CHECK: [0xff,0xff,0x76,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i16 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x76,0xd8,0xff,0x00,0x00,0x00]
+ds_read_i16 v255, v1 offset:65535
+// CHECK: [0xff,0xff,0x76,0xd8,0x01,0x00,0x00,0xff]
 
-ds_read_i16 v0, v0
-// CHECK: [0x00,0x00,0x76,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i16 v5, v255 offset:65535
+// CHECK: [0xff,0xff,0x76,0xd8,0xff,0x00,0x00,0x05]
 
-ds_read_i16 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x76,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i16 v5, v1
+// CHECK: [0x00,0x00,0x76,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i16 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x76,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i16 v5, v1 offset:0
+// CHECK: [0x00,0x00,0x76,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_i16 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x77,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i16 v5, v1 offset:4
+// CHECK: [0x04,0x00,0x76,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u16 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x78,0xd8,0x00,0x00,0x00,0x00]
+ds_read_i16 v5, v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x77,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u16 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x78,0xd8,0x00,0x00,0x00,0xff]
+ds_read_u16 v5, v1 offset:65535
+// CHECK: [0xff,0xff,0x78,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u16 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x78,0xd8,0xff,0x00,0x00,0x00]
+ds_read_u16 v255, v1 offset:65535
+// CHECK: [0xff,0xff,0x78,0xd8,0x01,0x00,0x00,0xff]
 
-ds_read_u16 v0, v0
-// CHECK: [0x00,0x00,0x78,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u16 v5, v255 offset:65535
+// CHECK: [0xff,0xff,0x78,0xd8,0xff,0x00,0x00,0x05]
 
-ds_read_u16 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x78,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u16 v5, v1
+// CHECK: [0x00,0x00,0x78,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u16 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x78,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u16 v5, v1 offset:0
+// CHECK: [0x00,0x00,0x78,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_u16 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x79,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u16 v5, v1 offset:4
+// CHECK: [0x04,0x00,0x78,0xd8,0x01,0x00,0x00,0x05]
 
-ds_swizzle_b32 v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x7a,0xd8,0x00,0x00,0x00,0x00]
+ds_read_u16 v5, v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x79,0xd8,0x01,0x00,0x00,0x05]
 
-ds_swizzle_b32 v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x7a,0xd8,0x00,0x00,0x00,0xff]
+ds_swizzle_b32 v5, v1 offset:65535
+// CHECK: [0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0x05]
 
-ds_swizzle_b32 v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x7a,0xd8,0xff,0x00,0x00,0x00]
+ds_swizzle_b32 v255, v1 offset:65535
+// CHECK: [0xff,0xff,0x7a,0xd8,0x01,0x00,0x00,0xff]
 
-ds_swizzle_b32 v0, v0
-// CHECK: [0x00,0x00,0x7a,0xd8,0x00,0x00,0x00,0x00]
+ds_swizzle_b32 v5, v255 offset:65535
+// CHECK: [0xff,0xff,0x7a,0xd8,0xff,0x00,0x00,0x05]
 
-ds_swizzle_b32 v0, v0 offset:0
-// CHECK: [0x00,0x00,0x7a,0xd8,0x00,0x00,0x00,0x00]
+ds_swizzle_b32 v5, v1
+// CHECK: [0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05]
 
-ds_swizzle_b32 v0, v0 offset:4
-// CHECK: [0x04,0x00,0x7a,0xd8,0x00,0x00,0x00,0x00]
+ds_swizzle_b32 v5, v1 offset:0
+// CHECK: [0x00,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05]
 
-ds_swizzle_b32 v0, v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x7b,0xd8,0x00,0x00,0x00,0x00]
+ds_swizzle_b32 v5, v1 offset:4
+// CHECK: [0x04,0x00,0x7a,0xd8,0x01,0x00,0x00,0x05]
 
-ds_permute_b32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x7c,0xd8,0x00,0x00,0x00,0x00]
+ds_swizzle_b32 v5, v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x7b,0xd8,0x01,0x00,0x00,0x05]
 
-ds_permute_b32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x7c,0xd8,0x00,0x00,0x00,0xff]
+ds_permute_b32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x7c,0xd8,0x01,0x02,0x00,0x05]
 
-ds_permute_b32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x7c,0xd8,0xff,0x00,0x00,0x00]
+ds_permute_b32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x7c,0xd8,0x01,0x02,0x00,0xff]
 
-ds_permute_b32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x7c,0xd8,0x00,0xff,0x00,0x00]
+ds_permute_b32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x7c,0xd8,0xff,0x02,0x00,0x05]
 
-ds_permute_b32 v0, v0, v0
-// CHECK: [0x00,0x00,0x7c,0xd8,0x00,0x00,0x00,0x00]
+ds_permute_b32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x7c,0xd8,0x01,0xff,0x00,0x05]
 
-ds_permute_b32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x7c,0xd8,0x00,0x00,0x00,0x00]
+ds_permute_b32 v5, v1, v2
+// CHECK: [0x00,0x00,0x7c,0xd8,0x01,0x02,0x00,0x05]
 
-ds_permute_b32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x7c,0xd8,0x00,0x00,0x00,0x00]
+ds_permute_b32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x7c,0xd8,0x01,0x02,0x00,0x05]
 
-ds_bpermute_b32 v0, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x7e,0xd8,0x00,0x00,0x00,0x00]
+ds_permute_b32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x7c,0xd8,0x01,0x02,0x00,0x05]
 
-ds_bpermute_b32 v255, v0, v0 offset:65535
-// CHECK: [0xff,0xff,0x7e,0xd8,0x00,0x00,0x00,0xff]
+ds_bpermute_b32 v5, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x7e,0xd8,0x01,0x02,0x00,0x05]
 
-ds_bpermute_b32 v0, v255, v0 offset:65535
-// CHECK: [0xff,0xff,0x7e,0xd8,0xff,0x00,0x00,0x00]
+ds_bpermute_b32 v255, v1, v2 offset:65535
+// CHECK: [0xff,0xff,0x7e,0xd8,0x01,0x02,0x00,0xff]
 
-ds_bpermute_b32 v0, v0, v255 offset:65535
-// CHECK: [0xff,0xff,0x7e,0xd8,0x00,0xff,0x00,0x00]
+ds_bpermute_b32 v5, v255, v2 offset:65535
+// CHECK: [0xff,0xff,0x7e,0xd8,0xff,0x02,0x00,0x05]
 
-ds_bpermute_b32 v0, v0, v0
-// CHECK: [0x00,0x00,0x7e,0xd8,0x00,0x00,0x00,0x00]
+ds_bpermute_b32 v5, v1, v255 offset:65535
+// CHECK: [0xff,0xff,0x7e,0xd8,0x01,0xff,0x00,0x05]
 
-ds_bpermute_b32 v0, v0, v0 offset:0
-// CHECK: [0x00,0x00,0x7e,0xd8,0x00,0x00,0x00,0x00]
+ds_bpermute_b32 v5, v1, v2
+// CHECK: [0x00,0x00,0x7e,0xd8,0x01,0x02,0x00,0x05]
 
-ds_bpermute_b32 v0, v0, v0 offset:4
-// CHECK: [0x04,0x00,0x7e,0xd8,0x00,0x00,0x00,0x00]
+ds_bpermute_b32 v5, v1, v2 offset:0
+// CHECK: [0x00,0x00,0x7e,0xd8,0x01,0x02,0x00,0x05]
 
-ds_add_u64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x80,0xd8,0x00,0x00,0x00,0x00]
+ds_bpermute_b32 v5, v1, v2 offset:4
+// CHECK: [0x04,0x00,0x7e,0xd8,0x01,0x02,0x00,0x05]
 
-ds_add_u64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x80,0xd8,0xff,0x00,0x00,0x00]
+ds_add_u64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x80,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_u64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x80,0xd8,0x00,0xfe,0x00,0x00]
+ds_add_u64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x80,0xd8,0xff,0x02,0x00,0x00]
 
-ds_add_u64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x80,0xd8,0x00,0x00,0x00,0x00]
+ds_add_u64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x80,0xd8,0x01,0xfe,0x00,0x00]
 
-ds_add_u64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x80,0xd8,0x00,0x00,0x00,0x00]
+ds_add_u64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x80,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_u64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x80,0xd8,0x00,0x00,0x00,0x00]
+ds_add_u64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x80,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_u64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x81,0xd8,0x00,0x00,0x00,0x00]
+ds_add_u64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x80,0xd8,0x01,0x02,0x00,0x00]
 
-ds_sub_u64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x82,0xd8,0x00,0x00,0x00,0x00]
+ds_add_u64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x81,0xd8,0x01,0x02,0x00,0x00]
 
-ds_sub_u64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x82,0xd8,0xff,0x00,0x00,0x00]
+ds_sub_u64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x82,0xd8,0x01,0x02,0x00,0x00]
 
-ds_sub_u64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x82,0xd8,0x00,0xfe,0x00,0x00]
+ds_sub_u64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x82,0xd8,0xff,0x02,0x00,0x00]
 
-ds_sub_u64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x82,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_u64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x82,0xd8,0x01,0xfe,0x00,0x00]
 
-ds_sub_u64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x82,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_u64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x82,0xd8,0x01,0x02,0x00,0x00]
 
-ds_sub_u64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x82,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_u64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x82,0xd8,0x01,0x02,0x00,0x00]
 
-ds_sub_u64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x83,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_u64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x82,0xd8,0x01,0x02,0x00,0x00]
 
-ds_rsub_u64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x84,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_u64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x83,0xd8,0x01,0x02,0x00,0x00]
 
-ds_rsub_u64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x84,0xd8,0xff,0x00,0x00,0x00]
+ds_rsub_u64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x84,0xd8,0x01,0x02,0x00,0x00]
 
-ds_rsub_u64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x84,0xd8,0x00,0xfe,0x00,0x00]
+ds_rsub_u64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x84,0xd8,0xff,0x02,0x00,0x00]
 
-ds_rsub_u64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x84,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_u64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x84,0xd8,0x01,0xfe,0x00,0x00]
 
-ds_rsub_u64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x84,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_u64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x84,0xd8,0x01,0x02,0x00,0x00]
 
-ds_rsub_u64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x84,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_u64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x84,0xd8,0x01,0x02,0x00,0x00]
 
-ds_rsub_u64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x85,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_u64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x84,0xd8,0x01,0x02,0x00,0x00]
 
-ds_inc_u64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x86,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_u64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x85,0xd8,0x01,0x02,0x00,0x00]
 
-ds_inc_u64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x86,0xd8,0xff,0x00,0x00,0x00]
+ds_inc_u64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x86,0xd8,0x01,0x02,0x00,0x00]
 
-ds_inc_u64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x86,0xd8,0x00,0xfe,0x00,0x00]
+ds_inc_u64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x86,0xd8,0xff,0x02,0x00,0x00]
 
-ds_inc_u64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x86,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_u64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x86,0xd8,0x01,0xfe,0x00,0x00]
 
-ds_inc_u64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x86,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_u64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x86,0xd8,0x01,0x02,0x00,0x00]
 
-ds_inc_u64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x86,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_u64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x86,0xd8,0x01,0x02,0x00,0x00]
 
-ds_inc_u64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x87,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_u64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x86,0xd8,0x01,0x02,0x00,0x00]
 
-ds_dec_u64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x88,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_u64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x87,0xd8,0x01,0x02,0x00,0x00]
 
-ds_dec_u64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x88,0xd8,0xff,0x00,0x00,0x00]
+ds_dec_u64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x88,0xd8,0x01,0x02,0x00,0x00]
 
-ds_dec_u64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x88,0xd8,0x00,0xfe,0x00,0x00]
+ds_dec_u64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x88,0xd8,0xff,0x02,0x00,0x00]
 
-ds_dec_u64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x88,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_u64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x88,0xd8,0x01,0xfe,0x00,0x00]
 
-ds_dec_u64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x88,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_u64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x88,0xd8,0x01,0x02,0x00,0x00]
 
-ds_dec_u64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x88,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_u64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x88,0xd8,0x01,0x02,0x00,0x00]
 
-ds_dec_u64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x89,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_u64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x88,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_i64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x8a,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_u64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x89,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_i64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x8a,0xd8,0xff,0x00,0x00,0x00]
+ds_min_i64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x8a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_i64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x8a,0xd8,0x00,0xfe,0x00,0x00]
+ds_min_i64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x8a,0xd8,0xff,0x02,0x00,0x00]
 
-ds_min_i64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x8a,0xd8,0x00,0x00,0x00,0x00]
+ds_min_i64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x8a,0xd8,0x01,0xfe,0x00,0x00]
 
-ds_min_i64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x8a,0xd8,0x00,0x00,0x00,0x00]
+ds_min_i64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x8a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_i64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x8a,0xd8,0x00,0x00,0x00,0x00]
+ds_min_i64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x8a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_i64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x8b,0xd8,0x00,0x00,0x00,0x00]
+ds_min_i64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x8a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_i64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x8c,0xd8,0x00,0x00,0x00,0x00]
+ds_min_i64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x8b,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_i64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x8c,0xd8,0xff,0x00,0x00,0x00]
+ds_max_i64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x8c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_i64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x8c,0xd8,0x00,0xfe,0x00,0x00]
+ds_max_i64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x8c,0xd8,0xff,0x02,0x00,0x00]
 
-ds_max_i64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x8c,0xd8,0x00,0x00,0x00,0x00]
+ds_max_i64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x8c,0xd8,0x01,0xfe,0x00,0x00]
 
-ds_max_i64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x8c,0xd8,0x00,0x00,0x00,0x00]
+ds_max_i64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x8c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_i64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x8c,0xd8,0x00,0x00,0x00,0x00]
+ds_max_i64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x8c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_i64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x8d,0xd8,0x00,0x00,0x00,0x00]
+ds_max_i64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x8c,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_u64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x8e,0xd8,0x00,0x00,0x00,0x00]
+ds_max_i64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x8d,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_u64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x8e,0xd8,0xff,0x00,0x00,0x00]
+ds_min_u64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x8e,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_u64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x8e,0xd8,0x00,0xfe,0x00,0x00]
+ds_min_u64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x8e,0xd8,0xff,0x02,0x00,0x00]
 
-ds_min_u64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x8e,0xd8,0x00,0x00,0x00,0x00]
+ds_min_u64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x8e,0xd8,0x01,0xfe,0x00,0x00]
 
-ds_min_u64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x8e,0xd8,0x00,0x00,0x00,0x00]
+ds_min_u64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x8e,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_u64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x8e,0xd8,0x00,0x00,0x00,0x00]
+ds_min_u64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x8e,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_u64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x8f,0xd8,0x00,0x00,0x00,0x00]
+ds_min_u64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x8e,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_u64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x90,0xd8,0x00,0x00,0x00,0x00]
+ds_min_u64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x8f,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_u64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x90,0xd8,0xff,0x00,0x00,0x00]
+ds_max_u64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x90,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_u64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x90,0xd8,0x00,0xfe,0x00,0x00]
+ds_max_u64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x90,0xd8,0xff,0x02,0x00,0x00]
 
-ds_max_u64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x90,0xd8,0x00,0x00,0x00,0x00]
+ds_max_u64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x90,0xd8,0x01,0xfe,0x00,0x00]
 
-ds_max_u64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x90,0xd8,0x00,0x00,0x00,0x00]
+ds_max_u64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x90,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_u64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x90,0xd8,0x00,0x00,0x00,0x00]
+ds_max_u64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x90,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_u64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x91,0xd8,0x00,0x00,0x00,0x00]
+ds_max_u64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x90,0xd8,0x01,0x02,0x00,0x00]
 
-ds_and_b64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x92,0xd8,0x00,0x00,0x00,0x00]
+ds_max_u64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x91,0xd8,0x01,0x02,0x00,0x00]
 
-ds_and_b64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x92,0xd8,0xff,0x00,0x00,0x00]
+ds_and_b64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x92,0xd8,0x01,0x02,0x00,0x00]
 
-ds_and_b64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x92,0xd8,0x00,0xfe,0x00,0x00]
+ds_and_b64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x92,0xd8,0xff,0x02,0x00,0x00]
 
-ds_and_b64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x92,0xd8,0x00,0x00,0x00,0x00]
+ds_and_b64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x92,0xd8,0x01,0xfe,0x00,0x00]
 
-ds_and_b64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x92,0xd8,0x00,0x00,0x00,0x00]
+ds_and_b64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x92,0xd8,0x01,0x02,0x00,0x00]
 
-ds_and_b64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x92,0xd8,0x00,0x00,0x00,0x00]
+ds_and_b64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x92,0xd8,0x01,0x02,0x00,0x00]
 
-ds_and_b64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x93,0xd8,0x00,0x00,0x00,0x00]
+ds_and_b64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x92,0xd8,0x01,0x02,0x00,0x00]
 
-ds_or_b64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x94,0xd8,0x00,0x00,0x00,0x00]
+ds_and_b64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x93,0xd8,0x01,0x02,0x00,0x00]
 
-ds_or_b64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x94,0xd8,0xff,0x00,0x00,0x00]
+ds_or_b64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x94,0xd8,0x01,0x02,0x00,0x00]
 
-ds_or_b64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x94,0xd8,0x00,0xfe,0x00,0x00]
+ds_or_b64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x94,0xd8,0xff,0x02,0x00,0x00]
 
-ds_or_b64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x94,0xd8,0x00,0x00,0x00,0x00]
+ds_or_b64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x94,0xd8,0x01,0xfe,0x00,0x00]
 
-ds_or_b64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x94,0xd8,0x00,0x00,0x00,0x00]
+ds_or_b64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x94,0xd8,0x01,0x02,0x00,0x00]
 
-ds_or_b64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x94,0xd8,0x00,0x00,0x00,0x00]
+ds_or_b64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x94,0xd8,0x01,0x02,0x00,0x00]
 
-ds_or_b64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x95,0xd8,0x00,0x00,0x00,0x00]
+ds_or_b64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x94,0xd8,0x01,0x02,0x00,0x00]
 
-ds_xor_b64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x96,0xd8,0x00,0x00,0x00,0x00]
+ds_or_b64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x95,0xd8,0x01,0x02,0x00,0x00]
 
-ds_xor_b64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x96,0xd8,0xff,0x00,0x00,0x00]
+ds_xor_b64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x96,0xd8,0x01,0x02,0x00,0x00]
 
-ds_xor_b64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x96,0xd8,0x00,0xfe,0x00,0x00]
+ds_xor_b64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x96,0xd8,0xff,0x02,0x00,0x00]
 
-ds_xor_b64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x96,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_b64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x96,0xd8,0x01,0xfe,0x00,0x00]
 
-ds_xor_b64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x96,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_b64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x96,0xd8,0x01,0x02,0x00,0x00]
 
-ds_xor_b64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x96,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_b64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x96,0xd8,0x01,0x02,0x00,0x00]
 
-ds_xor_b64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x97,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_b64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x96,0xd8,0x01,0x02,0x00,0x00]
 
-ds_mskor_b64 v0, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x98,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_b64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x97,0xd8,0x01,0x02,0x00,0x00]
 
-ds_mskor_b64 v255, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x98,0xd8,0xff,0x00,0x00,0x00]
+ds_mskor_b64 v1, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0x98,0xd8,0x01,0x02,0x03,0x00]
 
-ds_mskor_b64 v0, v[254:255], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x98,0xd8,0x00,0xfe,0x00,0x00]
+ds_mskor_b64 v255, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0x98,0xd8,0xff,0x02,0x03,0x00]
 
-ds_mskor_b64 v0, v[0:1], v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x98,0xd8,0x00,0x00,0xfe,0x00]
+ds_mskor_b64 v1, v[254:255], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0x98,0xd8,0x01,0xfe,0x03,0x00]
 
-ds_mskor_b64 v0, v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x98,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_b64 v1, v[2:3], v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x98,0xd8,0x01,0x02,0xfe,0x00]
 
-ds_mskor_b64 v0, v[0:1], v[0:1] offset:0
-// CHECK: [0x00,0x00,0x98,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_b64 v1, v[2:3], v[3:4]
+// CHECK: [0x00,0x00,0x98,0xd8,0x01,0x02,0x03,0x00]
 
-ds_mskor_b64 v0, v[0:1], v[0:1] offset:4
-// CHECK: [0x04,0x00,0x98,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_b64 v1, v[2:3], v[3:4] offset:0
+// CHECK: [0x00,0x00,0x98,0xd8,0x01,0x02,0x03,0x00]
 
-ds_mskor_b64 v0, v[0:1], v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x99,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_b64 v1, v[2:3], v[3:4] offset:4
+// CHECK: [0x04,0x00,0x98,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write_b64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x9a,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_b64 v1, v[2:3], v[3:4] offset:65535 gds
+// CHECK: [0xff,0xff,0x99,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write_b64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0x9a,0xd8,0xff,0x00,0x00,0x00]
+ds_write_b64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x9a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0x9a,0xd8,0x00,0xfe,0x00,0x00]
+ds_write_b64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0x9a,0xd8,0xff,0x02,0x00,0x00]
 
-ds_write_b64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x9a,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0x9a,0xd8,0x01,0xfe,0x00,0x00]
 
-ds_write_b64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0x9a,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b64 v1, v[2:3]
+// CHECK: [0x00,0x00,0x9a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0x9a,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0x9a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write_b64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0x9b,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0x9a,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write2_b64 v0, v[0:1], v[0:1] offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x9c,0xd8,0x00,0x00,0x00,0x00]
+ds_write_b64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0x9b,0xd8,0x01,0x02,0x00,0x00]
 
-ds_write2_b64 v255, v[0:1], v[0:1] offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x9c,0xd8,0xff,0x00,0x00,0x00]
+ds_write2_b64 v1, v[2:3], v[3:4] offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x9c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2_b64 v0, v[254:255], v[0:1] offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x9c,0xd8,0x00,0xfe,0x00,0x00]
+ds_write2_b64 v255, v[2:3], v[3:4] offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x9c,0xd8,0xff,0x02,0x03,0x00]
 
-ds_write2_b64 v0, v[0:1], v[254:255] offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x9c,0xd8,0x00,0x00,0xfe,0x00]
+ds_write2_b64 v1, v[254:255], v[3:4] offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x9c,0xd8,0x01,0xfe,0x03,0x00]
 
-ds_write2_b64 v0, v[0:1], v[0:1] offset1:255
-// CHECK: [0x00,0xff,0x9c,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b64 v1, v[2:3], v[254:255] offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x9c,0xd8,0x01,0x02,0xfe,0x00]
 
-ds_write2_b64 v0, v[0:1], v[0:1] offset0:0 offset1:255
-// CHECK: [0x00,0xff,0x9c,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b64 v1, v[2:3], v[3:4] offset1:255
+// CHECK: [0x00,0xff,0x9c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2_b64 v0, v[0:1], v[0:1] offset0:16 offset1:255
-// CHECK: [0x10,0xff,0x9c,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b64 v1, v[2:3], v[3:4] offset0:0 offset1:255
+// CHECK: [0x00,0xff,0x9c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2_b64 v0, v[0:1], v[0:1] offset0:127
-// CHECK: [0x7f,0x00,0x9c,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b64 v1, v[2:3], v[3:4] offset0:16 offset1:255
+// CHECK: [0x10,0xff,0x9c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2_b64 v0, v[0:1], v[0:1] offset0:127 offset1:0
-// CHECK: [0x7f,0x00,0x9c,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b64 v1, v[2:3], v[3:4] offset0:127
+// CHECK: [0x7f,0x00,0x9c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2_b64 v0, v[0:1], v[0:1] offset0:127 offset1:1
-// CHECK: [0x7f,0x01,0x9c,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b64 v1, v[2:3], v[3:4] offset0:127 offset1:0
+// CHECK: [0x7f,0x00,0x9c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2_b64 v0, v[0:1], v[0:1] offset0:127 offset1:255 gds
-// CHECK: [0x7f,0xff,0x9d,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b64 v1, v[2:3], v[3:4] offset0:127 offset1:1
+// CHECK: [0x7f,0x01,0x9c,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b64 v0, v[0:1], v[0:1] offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x9e,0xd8,0x00,0x00,0x00,0x00]
+ds_write2_b64 v1, v[2:3], v[3:4] offset0:127 offset1:255 gds
+// CHECK: [0x7f,0xff,0x9d,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b64 v255, v[0:1], v[0:1] offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x9e,0xd8,0xff,0x00,0x00,0x00]
+ds_write2st64_b64 v1, v[2:3], v[3:4] offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x9e,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b64 v0, v[254:255], v[0:1] offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x9e,0xd8,0x00,0xfe,0x00,0x00]
+ds_write2st64_b64 v255, v[2:3], v[3:4] offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x9e,0xd8,0xff,0x02,0x03,0x00]
 
-ds_write2st64_b64 v0, v[0:1], v[254:255] offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0x9e,0xd8,0x00,0x00,0xfe,0x00]
+ds_write2st64_b64 v1, v[254:255], v[3:4] offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x9e,0xd8,0x01,0xfe,0x03,0x00]
 
-ds_write2st64_b64 v0, v[0:1], v[0:1] offset1:255
-// CHECK: [0x00,0xff,0x9e,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b64 v1, v[2:3], v[254:255] offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0x9e,0xd8,0x01,0x02,0xfe,0x00]
 
-ds_write2st64_b64 v0, v[0:1], v[0:1] offset0:0 offset1:255
-// CHECK: [0x00,0xff,0x9e,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b64 v1, v[2:3], v[3:4] offset1:255
+// CHECK: [0x00,0xff,0x9e,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b64 v0, v[0:1], v[0:1] offset0:16 offset1:255
-// CHECK: [0x10,0xff,0x9e,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b64 v1, v[2:3], v[3:4] offset0:0 offset1:255
+// CHECK: [0x00,0xff,0x9e,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b64 v0, v[0:1], v[0:1] offset0:127
-// CHECK: [0x7f,0x00,0x9e,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b64 v1, v[2:3], v[3:4] offset0:16 offset1:255
+// CHECK: [0x10,0xff,0x9e,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b64 v0, v[0:1], v[0:1] offset0:127 offset1:0
-// CHECK: [0x7f,0x00,0x9e,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b64 v1, v[2:3], v[3:4] offset0:127
+// CHECK: [0x7f,0x00,0x9e,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b64 v0, v[0:1], v[0:1] offset0:127 offset1:1
-// CHECK: [0x7f,0x01,0x9e,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b64 v1, v[2:3], v[3:4] offset0:127 offset1:0
+// CHECK: [0x7f,0x00,0x9e,0xd8,0x01,0x02,0x03,0x00]
 
-ds_write2st64_b64 v0, v[0:1], v[0:1] offset0:127 offset1:255 gds
-// CHECK: [0x7f,0xff,0x9f,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b64 v1, v[2:3], v[3:4] offset0:127 offset1:1
+// CHECK: [0x7f,0x01,0x9e,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b64 v0, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xa0,0xd8,0x00,0x00,0x00,0x00]
+ds_write2st64_b64 v1, v[2:3], v[3:4] offset0:127 offset1:255 gds
+// CHECK: [0x7f,0xff,0x9f,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b64 v255, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xa0,0xd8,0xff,0x00,0x00,0x00]
+ds_cmpst_b64 v1, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xa0,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b64 v0, v[254:255], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xa0,0xd8,0x00,0xfe,0x00,0x00]
+ds_cmpst_b64 v255, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xa0,0xd8,0xff,0x02,0x03,0x00]
 
-ds_cmpst_b64 v0, v[0:1], v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xa0,0xd8,0x00,0x00,0xfe,0x00]
+ds_cmpst_b64 v1, v[254:255], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xa0,0xd8,0x01,0xfe,0x03,0x00]
 
-ds_cmpst_b64 v0, v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xa0,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_b64 v1, v[2:3], v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xa0,0xd8,0x01,0x02,0xfe,0x00]
 
-ds_cmpst_b64 v0, v[0:1], v[0:1] offset:0
-// CHECK: [0x00,0x00,0xa0,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_b64 v1, v[2:3], v[3:4]
+// CHECK: [0x00,0x00,0xa0,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b64 v0, v[0:1], v[0:1] offset:4
-// CHECK: [0x04,0x00,0xa0,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_b64 v1, v[2:3], v[3:4] offset:0
+// CHECK: [0x00,0x00,0xa0,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_b64 v0, v[0:1], v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xa1,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_b64 v1, v[2:3], v[3:4] offset:4
+// CHECK: [0x04,0x00,0xa0,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f64 v0, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xa2,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_b64 v1, v[2:3], v[3:4] offset:65535 gds
+// CHECK: [0xff,0xff,0xa1,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f64 v255, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xa2,0xd8,0xff,0x00,0x00,0x00]
+ds_cmpst_f64 v1, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xa2,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f64 v0, v[254:255], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xa2,0xd8,0x00,0xfe,0x00,0x00]
+ds_cmpst_f64 v255, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xa2,0xd8,0xff,0x02,0x03,0x00]
 
-ds_cmpst_f64 v0, v[0:1], v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xa2,0xd8,0x00,0x00,0xfe,0x00]
+ds_cmpst_f64 v1, v[254:255], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xa2,0xd8,0x01,0xfe,0x03,0x00]
 
-ds_cmpst_f64 v0, v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xa2,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_f64 v1, v[2:3], v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xa2,0xd8,0x01,0x02,0xfe,0x00]
 
-ds_cmpst_f64 v0, v[0:1], v[0:1] offset:0
-// CHECK: [0x00,0x00,0xa2,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_f64 v1, v[2:3], v[3:4]
+// CHECK: [0x00,0x00,0xa2,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f64 v0, v[0:1], v[0:1] offset:4
-// CHECK: [0x04,0x00,0xa2,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_f64 v1, v[2:3], v[3:4] offset:0
+// CHECK: [0x00,0x00,0xa2,0xd8,0x01,0x02,0x03,0x00]
 
-ds_cmpst_f64 v0, v[0:1], v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xa3,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_f64 v1, v[2:3], v[3:4] offset:4
+// CHECK: [0x04,0x00,0xa2,0xd8,0x01,0x02,0x03,0x00]
 
-ds_min_f64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xa4,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_f64 v1, v[2:3], v[3:4] offset:65535 gds
+// CHECK: [0xff,0xff,0xa3,0xd8,0x01,0x02,0x03,0x00]
 
-ds_min_f64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xa4,0xd8,0xff,0x00,0x00,0x00]
+ds_min_f64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xa4,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_f64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xa4,0xd8,0x00,0xfe,0x00,0x00]
+ds_min_f64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xa4,0xd8,0xff,0x02,0x00,0x00]
 
-ds_min_f64 v0, v[0:1]
-// CHECK: [0x00,0x00,0xa4,0xd8,0x00,0x00,0x00,0x00]
+ds_min_f64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xa4,0xd8,0x01,0xfe,0x00,0x00]
 
-ds_min_f64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xa4,0xd8,0x00,0x00,0x00,0x00]
+ds_min_f64 v1, v[2:3]
+// CHECK: [0x00,0x00,0xa4,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_f64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xa4,0xd8,0x00,0x00,0x00,0x00]
+ds_min_f64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xa4,0xd8,0x01,0x02,0x00,0x00]
 
-ds_min_f64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xa5,0xd8,0x00,0x00,0x00,0x00]
+ds_min_f64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xa4,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_f64 v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xa6,0xd8,0x00,0x00,0x00,0x00]
+ds_min_f64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xa5,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_f64 v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xa6,0xd8,0xff,0x00,0x00,0x00]
+ds_max_f64 v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xa6,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_f64 v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xa6,0xd8,0x00,0xfe,0x00,0x00]
+ds_max_f64 v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xa6,0xd8,0xff,0x02,0x00,0x00]
 
-ds_max_f64 v0, v[0:1]
-// CHECK: [0x00,0x00,0xa6,0xd8,0x00,0x00,0x00,0x00]
+ds_max_f64 v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xa6,0xd8,0x01,0xfe,0x00,0x00]
 
-ds_max_f64 v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xa6,0xd8,0x00,0x00,0x00,0x00]
+ds_max_f64 v1, v[2:3]
+// CHECK: [0x00,0x00,0xa6,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_f64 v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xa6,0xd8,0x00,0x00,0x00,0x00]
+ds_max_f64 v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xa6,0xd8,0x01,0x02,0x00,0x00]
 
-ds_max_f64 v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xa7,0xd8,0x00,0x00,0x00,0x00]
+ds_max_f64 v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xa6,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_rtn_u64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc0,0xd8,0x00,0x00,0x00,0x00]
+ds_max_f64 v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xa7,0xd8,0x01,0x02,0x00,0x00]
 
-ds_add_rtn_u64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc0,0xd8,0x00,0x00,0x00,0xfe]
+ds_add_rtn_u64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xc0,0xd8,0x01,0x02,0x00,0x05]
 
-ds_add_rtn_u64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc0,0xd8,0xff,0x00,0x00,0x00]
+ds_add_rtn_u64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xc0,0xd8,0x01,0x02,0x00,0xfe]
 
-ds_add_rtn_u64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xc0,0xd8,0x00,0xfe,0x00,0x00]
+ds_add_rtn_u64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xc0,0xd8,0xff,0x02,0x00,0x05]
 
-ds_add_rtn_u64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xc0,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_u64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xc0,0xd8,0x01,0xfe,0x00,0x05]
 
-ds_add_rtn_u64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xc0,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_u64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xc0,0xd8,0x01,0x02,0x00,0x05]
 
-ds_add_rtn_u64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xc0,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_u64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xc0,0xd8,0x01,0x02,0x00,0x05]
 
-ds_add_rtn_u64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xc1,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_u64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xc0,0xd8,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc2,0xd8,0x00,0x00,0x00,0x00]
+ds_add_rtn_u64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xc1,0xd8,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc2,0xd8,0x00,0x00,0x00,0xfe]
+ds_sub_rtn_u64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xc2,0xd8,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc2,0xd8,0xff,0x00,0x00,0x00]
+ds_sub_rtn_u64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xc2,0xd8,0x01,0x02,0x00,0xfe]
 
-ds_sub_rtn_u64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xc2,0xd8,0x00,0xfe,0x00,0x00]
+ds_sub_rtn_u64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xc2,0xd8,0xff,0x02,0x00,0x05]
 
-ds_sub_rtn_u64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xc2,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xc2,0xd8,0x01,0xfe,0x00,0x05]
 
-ds_sub_rtn_u64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xc2,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xc2,0xd8,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xc2,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xc2,0xd8,0x01,0x02,0x00,0x05]
 
-ds_sub_rtn_u64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xc3,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xc2,0xd8,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc4,0xd8,0x00,0x00,0x00,0x00]
+ds_sub_rtn_u64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xc3,0xd8,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc4,0xd8,0x00,0x00,0x00,0xfe]
+ds_rsub_rtn_u64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xc4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc4,0xd8,0xff,0x00,0x00,0x00]
+ds_rsub_rtn_u64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xc4,0xd8,0x01,0x02,0x00,0xfe]
 
-ds_rsub_rtn_u64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xc4,0xd8,0x00,0xfe,0x00,0x00]
+ds_rsub_rtn_u64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xc4,0xd8,0xff,0x02,0x00,0x05]
 
-ds_rsub_rtn_u64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xc4,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xc4,0xd8,0x01,0xfe,0x00,0x05]
 
-ds_rsub_rtn_u64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xc4,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xc4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xc4,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xc4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_rsub_rtn_u64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xc5,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xc4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc6,0xd8,0x00,0x00,0x00,0x00]
+ds_rsub_rtn_u64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xc5,0xd8,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc6,0xd8,0x00,0x00,0x00,0xfe]
+ds_inc_rtn_u64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xc6,0xd8,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc6,0xd8,0xff,0x00,0x00,0x00]
+ds_inc_rtn_u64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xc6,0xd8,0x01,0x02,0x00,0xfe]
 
-ds_inc_rtn_u64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xc6,0xd8,0x00,0xfe,0x00,0x00]
+ds_inc_rtn_u64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xc6,0xd8,0xff,0x02,0x00,0x05]
 
-ds_inc_rtn_u64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xc6,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xc6,0xd8,0x01,0xfe,0x00,0x05]
 
-ds_inc_rtn_u64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xc6,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xc6,0xd8,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xc6,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xc6,0xd8,0x01,0x02,0x00,0x05]
 
-ds_inc_rtn_u64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xc7,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xc6,0xd8,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc8,0xd8,0x00,0x00,0x00,0x00]
+ds_inc_rtn_u64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xc7,0xd8,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc8,0xd8,0x00,0x00,0x00,0xfe]
+ds_dec_rtn_u64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xc8,0xd8,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xc8,0xd8,0xff,0x00,0x00,0x00]
+ds_dec_rtn_u64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xc8,0xd8,0x01,0x02,0x00,0xfe]
 
-ds_dec_rtn_u64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xc8,0xd8,0x00,0xfe,0x00,0x00]
+ds_dec_rtn_u64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xc8,0xd8,0xff,0x02,0x00,0x05]
 
-ds_dec_rtn_u64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xc8,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xc8,0xd8,0x01,0xfe,0x00,0x05]
 
-ds_dec_rtn_u64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xc8,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xc8,0xd8,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xc8,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xc8,0xd8,0x01,0x02,0x00,0x05]
 
-ds_dec_rtn_u64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xc9,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xc8,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xca,0xd8,0x00,0x00,0x00,0x00]
+ds_dec_rtn_u64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xc9,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xca,0xd8,0x00,0x00,0x00,0xfe]
+ds_min_rtn_i64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xca,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xca,0xd8,0xff,0x00,0x00,0x00]
+ds_min_rtn_i64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xca,0xd8,0x01,0x02,0x00,0xfe]
 
-ds_min_rtn_i64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xca,0xd8,0x00,0xfe,0x00,0x00]
+ds_min_rtn_i64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xca,0xd8,0xff,0x02,0x00,0x05]
 
-ds_min_rtn_i64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xca,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_i64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xca,0xd8,0x01,0xfe,0x00,0x05]
 
-ds_min_rtn_i64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xca,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_i64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xca,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xca,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_i64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xca,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_i64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xcb,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_i64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xca,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xcc,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_i64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xcb,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xcc,0xd8,0x00,0x00,0x00,0xfe]
+ds_max_rtn_i64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xcc,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xcc,0xd8,0xff,0x00,0x00,0x00]
+ds_max_rtn_i64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xcc,0xd8,0x01,0x02,0x00,0xfe]
 
-ds_max_rtn_i64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xcc,0xd8,0x00,0xfe,0x00,0x00]
+ds_max_rtn_i64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xcc,0xd8,0xff,0x02,0x00,0x05]
 
-ds_max_rtn_i64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_i64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xcc,0xd8,0x01,0xfe,0x00,0x05]
 
-ds_max_rtn_i64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xcc,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_i64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xcc,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xcc,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_i64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xcc,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_i64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xcd,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_i64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xcc,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xce,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_i64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xcd,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xce,0xd8,0x00,0x00,0x00,0xfe]
+ds_min_rtn_u64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xce,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xce,0xd8,0xff,0x00,0x00,0x00]
+ds_min_rtn_u64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xce,0xd8,0x01,0x02,0x00,0xfe]
 
-ds_min_rtn_u64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xce,0xd8,0x00,0xfe,0x00,0x00]
+ds_min_rtn_u64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xce,0xd8,0xff,0x02,0x00,0x05]
 
-ds_min_rtn_u64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xce,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_u64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xce,0xd8,0x01,0xfe,0x00,0x05]
 
-ds_min_rtn_u64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xce,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_u64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xce,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xce,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_u64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xce,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_u64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xcf,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_u64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xce,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xd0,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_u64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xcf,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xd0,0xd8,0x00,0x00,0x00,0xfe]
+ds_max_rtn_u64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xd0,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xd0,0xd8,0xff,0x00,0x00,0x00]
+ds_max_rtn_u64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xd0,0xd8,0x01,0x02,0x00,0xfe]
 
-ds_max_rtn_u64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xd0,0xd8,0x00,0xfe,0x00,0x00]
+ds_max_rtn_u64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xd0,0xd8,0xff,0x02,0x00,0x05]
 
-ds_max_rtn_u64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xd0,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_u64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xd0,0xd8,0x01,0xfe,0x00,0x05]
 
-ds_max_rtn_u64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xd0,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_u64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xd0,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xd0,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_u64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xd0,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_u64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xd1,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_u64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xd0,0xd8,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xd2,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_u64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xd1,0xd8,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xd2,0xd8,0x00,0x00,0x00,0xfe]
+ds_and_rtn_b64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xd2,0xd8,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xd2,0xd8,0xff,0x00,0x00,0x00]
+ds_and_rtn_b64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xd2,0xd8,0x01,0x02,0x00,0xfe]
 
-ds_and_rtn_b64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xd2,0xd8,0x00,0xfe,0x00,0x00]
+ds_and_rtn_b64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xd2,0xd8,0xff,0x02,0x00,0x05]
 
-ds_and_rtn_b64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xd2,0xd8,0x00,0x00,0x00,0x00]
+ds_and_rtn_b64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xd2,0xd8,0x01,0xfe,0x00,0x05]
 
-ds_and_rtn_b64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xd2,0xd8,0x00,0x00,0x00,0x00]
+ds_and_rtn_b64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xd2,0xd8,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xd2,0xd8,0x00,0x00,0x00,0x00]
+ds_and_rtn_b64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xd2,0xd8,0x01,0x02,0x00,0x05]
 
-ds_and_rtn_b64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xd3,0xd8,0x00,0x00,0x00,0x00]
+ds_and_rtn_b64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xd2,0xd8,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xd4,0xd8,0x00,0x00,0x00,0x00]
+ds_and_rtn_b64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xd3,0xd8,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xd4,0xd8,0x00,0x00,0x00,0xfe]
+ds_or_rtn_b64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xd4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xd4,0xd8,0xff,0x00,0x00,0x00]
+ds_or_rtn_b64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xd4,0xd8,0x01,0x02,0x00,0xfe]
 
-ds_or_rtn_b64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xd4,0xd8,0x00,0xfe,0x00,0x00]
+ds_or_rtn_b64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xd4,0xd8,0xff,0x02,0x00,0x05]
 
-ds_or_rtn_b64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xd4,0xd8,0x00,0x00,0x00,0x00]
+ds_or_rtn_b64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xd4,0xd8,0x01,0xfe,0x00,0x05]
 
-ds_or_rtn_b64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xd4,0xd8,0x00,0x00,0x00,0x00]
+ds_or_rtn_b64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xd4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xd4,0xd8,0x00,0x00,0x00,0x00]
+ds_or_rtn_b64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xd4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_or_rtn_b64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xd5,0xd8,0x00,0x00,0x00,0x00]
+ds_or_rtn_b64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xd4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xd6,0xd8,0x00,0x00,0x00,0x00]
+ds_or_rtn_b64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xd5,0xd8,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xd6,0xd8,0x00,0x00,0x00,0xfe]
+ds_xor_rtn_b64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xd6,0xd8,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xd6,0xd8,0xff,0x00,0x00,0x00]
+ds_xor_rtn_b64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xd6,0xd8,0x01,0x02,0x00,0xfe]
 
-ds_xor_rtn_b64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xd6,0xd8,0x00,0xfe,0x00,0x00]
+ds_xor_rtn_b64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xd6,0xd8,0xff,0x02,0x00,0x05]
 
-ds_xor_rtn_b64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xd6,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xd6,0xd8,0x01,0xfe,0x00,0x05]
 
-ds_xor_rtn_b64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xd6,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xd6,0xd8,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xd6,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xd6,0xd8,0x01,0x02,0x00,0x05]
 
-ds_xor_rtn_b64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xd7,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xd6,0xd8,0x01,0x02,0x00,0x05]
 
-ds_mskor_rtn_b64 v[0:1], v0, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xd8,0xd8,0x00,0x00,0x00,0x00]
+ds_xor_rtn_b64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xd7,0xd8,0x01,0x02,0x00,0x05]
 
-ds_mskor_rtn_b64 v[254:255], v0, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xd8,0xd8,0x00,0x00,0x00,0xfe]
+ds_mskor_rtn_b64 v[5:6], v1, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xd8,0xd8,0x01,0x02,0x03,0x05]
 
-ds_mskor_rtn_b64 v[0:1], v255, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xd8,0xd8,0xff,0x00,0x00,0x00]
+ds_mskor_rtn_b64 v[254:255], v1, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xd8,0xd8,0x01,0x02,0x03,0xfe]
 
-ds_mskor_rtn_b64 v[0:1], v0, v[254:255], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xd8,0xd8,0x00,0xfe,0x00,0x00]
+ds_mskor_rtn_b64 v[5:6], v255, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xd8,0xd8,0xff,0x02,0x03,0x05]
 
-ds_mskor_rtn_b64 v[0:1], v0, v[0:1], v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xd8,0xd8,0x00,0x00,0xfe,0x00]
+ds_mskor_rtn_b64 v[5:6], v1, v[254:255], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xd8,0xd8,0x01,0xfe,0x03,0x05]
 
-ds_mskor_rtn_b64 v[0:1], v0, v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd8,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b64 v[5:6], v1, v[2:3], v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xd8,0xd8,0x01,0x02,0xfe,0x05]
 
-ds_mskor_rtn_b64 v[0:1], v0, v[0:1], v[0:1] offset:0
-// CHECK: [0x00,0x00,0xd8,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b64 v[5:6], v1, v[2:3], v[3:4]
+// CHECK: [0x00,0x00,0xd8,0xd8,0x01,0x02,0x03,0x05]
 
-ds_mskor_rtn_b64 v[0:1], v0, v[0:1], v[0:1] offset:4
-// CHECK: [0x04,0x00,0xd8,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b64 v[5:6], v1, v[2:3], v[3:4] offset:0
+// CHECK: [0x00,0x00,0xd8,0xd8,0x01,0x02,0x03,0x05]
 
-ds_mskor_rtn_b64 v[0:1], v0, v[0:1], v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xd9,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b64 v[5:6], v1, v[2:3], v[3:4] offset:4
+// CHECK: [0x04,0x00,0xd8,0xd8,0x01,0x02,0x03,0x05]
 
-ds_wrxchg_rtn_b64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xda,0xd8,0x00,0x00,0x00,0x00]
+ds_mskor_rtn_b64 v[5:6], v1, v[2:3], v[3:4] offset:65535 gds
+// CHECK: [0xff,0xff,0xd9,0xd8,0x01,0x02,0x03,0x05]
 
-ds_wrxchg_rtn_b64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xda,0xd8,0x00,0x00,0x00,0xfe]
+ds_wrxchg_rtn_b64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xda,0xd8,0x01,0x02,0x00,0x05]
 
-ds_wrxchg_rtn_b64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xda,0xd8,0xff,0x00,0x00,0x00]
+ds_wrxchg_rtn_b64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xda,0xd8,0x01,0x02,0x00,0xfe]
 
-ds_wrxchg_rtn_b64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xda,0xd8,0x00,0xfe,0x00,0x00]
+ds_wrxchg_rtn_b64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xda,0xd8,0xff,0x02,0x00,0x05]
 
-ds_wrxchg_rtn_b64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xda,0xd8,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xda,0xd8,0x01,0xfe,0x00,0x05]
 
-ds_wrxchg_rtn_b64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xda,0xd8,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xda,0xd8,0x01,0x02,0x00,0x05]
 
-ds_wrxchg_rtn_b64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xda,0xd8,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xda,0xd8,0x01,0x02,0x00,0x05]
 
-ds_wrxchg_rtn_b64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xdb,0xd8,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xda,0xd8,0x01,0x02,0x00,0x05]
 
-ds_cmpst_rtn_b64 v[0:1], v0, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xe0,0xd8,0x00,0x00,0x00,0x00]
+ds_wrxchg_rtn_b64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xdb,0xd8,0x01,0x02,0x00,0x05]
 
-ds_cmpst_rtn_b64 v[254:255], v0, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xe0,0xd8,0x00,0x00,0x00,0xfe]
+ds_cmpst_rtn_b64 v[5:6], v1, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xe0,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_b64 v[0:1], v255, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xe0,0xd8,0xff,0x00,0x00,0x00]
+ds_cmpst_rtn_b64 v[254:255], v1, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xe0,0xd8,0x01,0x02,0x03,0xfe]
 
-ds_cmpst_rtn_b64 v[0:1], v0, v[254:255], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xe0,0xd8,0x00,0xfe,0x00,0x00]
+ds_cmpst_rtn_b64 v[5:6], v255, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xe0,0xd8,0xff,0x02,0x03,0x05]
 
-ds_cmpst_rtn_b64 v[0:1], v0, v[0:1], v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xe0,0xd8,0x00,0x00,0xfe,0x00]
+ds_cmpst_rtn_b64 v[5:6], v1, v[254:255], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xe0,0xd8,0x01,0xfe,0x03,0x05]
 
-ds_cmpst_rtn_b64 v[0:1], v0, v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b64 v[5:6], v1, v[2:3], v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xe0,0xd8,0x01,0x02,0xfe,0x05]
 
-ds_cmpst_rtn_b64 v[0:1], v0, v[0:1], v[0:1] offset:0
-// CHECK: [0x00,0x00,0xe0,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b64 v[5:6], v1, v[2:3], v[3:4]
+// CHECK: [0x00,0x00,0xe0,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_b64 v[0:1], v0, v[0:1], v[0:1] offset:4
-// CHECK: [0x04,0x00,0xe0,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b64 v[5:6], v1, v[2:3], v[3:4] offset:0
+// CHECK: [0x00,0x00,0xe0,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_b64 v[0:1], v0, v[0:1], v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xe1,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b64 v[5:6], v1, v[2:3], v[3:4] offset:4
+// CHECK: [0x04,0x00,0xe0,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f64 v[0:1], v0, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xe2,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_b64 v[5:6], v1, v[2:3], v[3:4] offset:65535 gds
+// CHECK: [0xff,0xff,0xe1,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f64 v[254:255], v0, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xe2,0xd8,0x00,0x00,0x00,0xfe]
+ds_cmpst_rtn_f64 v[5:6], v1, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xe2,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f64 v[0:1], v255, v[0:1], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xe2,0xd8,0xff,0x00,0x00,0x00]
+ds_cmpst_rtn_f64 v[254:255], v1, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xe2,0xd8,0x01,0x02,0x03,0xfe]
 
-ds_cmpst_rtn_f64 v[0:1], v0, v[254:255], v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xe2,0xd8,0x00,0xfe,0x00,0x00]
+ds_cmpst_rtn_f64 v[5:6], v255, v[2:3], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xe2,0xd8,0xff,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f64 v[0:1], v0, v[0:1], v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xe2,0xd8,0x00,0x00,0xfe,0x00]
+ds_cmpst_rtn_f64 v[5:6], v1, v[254:255], v[3:4] offset:65535
+// CHECK: [0xff,0xff,0xe2,0xd8,0x01,0xfe,0x03,0x05]
 
-ds_cmpst_rtn_f64 v[0:1], v0, v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f64 v[5:6], v1, v[2:3], v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xe2,0xd8,0x01,0x02,0xfe,0x05]
 
-ds_cmpst_rtn_f64 v[0:1], v0, v[0:1], v[0:1] offset:0
-// CHECK: [0x00,0x00,0xe2,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f64 v[5:6], v1, v[2:3], v[3:4]
+// CHECK: [0x00,0x00,0xe2,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f64 v[0:1], v0, v[0:1], v[0:1] offset:4
-// CHECK: [0x04,0x00,0xe2,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f64 v[5:6], v1, v[2:3], v[3:4] offset:0
+// CHECK: [0x00,0x00,0xe2,0xd8,0x01,0x02,0x03,0x05]
 
-ds_cmpst_rtn_f64 v[0:1], v0, v[0:1], v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xe3,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f64 v[5:6], v1, v[2:3], v[3:4] offset:4
+// CHECK: [0x04,0x00,0xe2,0xd8,0x01,0x02,0x03,0x05]
 
-ds_min_rtn_f64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xe4,0xd8,0x00,0x00,0x00,0x00]
+ds_cmpst_rtn_f64 v[5:6], v1, v[2:3], v[3:4] offset:65535 gds
+// CHECK: [0xff,0xff,0xe3,0xd8,0x01,0x02,0x03,0x05]
 
-ds_min_rtn_f64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xe4,0xd8,0x00,0x00,0x00,0xfe]
+ds_min_rtn_f64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xe4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_f64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xe4,0xd8,0xff,0x00,0x00,0x00]
+ds_min_rtn_f64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xe4,0xd8,0x01,0x02,0x00,0xfe]
 
-ds_min_rtn_f64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xe4,0xd8,0x00,0xfe,0x00,0x00]
+ds_min_rtn_f64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xe4,0xd8,0xff,0x02,0x00,0x05]
 
-ds_min_rtn_f64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_f64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xe4,0xd8,0x01,0xfe,0x00,0x05]
 
-ds_min_rtn_f64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xe4,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_f64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xe4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_f64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xe4,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_f64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xe4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_min_rtn_f64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xe5,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_f64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xe4,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f64 v[0:1], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xe6,0xd8,0x00,0x00,0x00,0x00]
+ds_min_rtn_f64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xe5,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f64 v[254:255], v0, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xe6,0xd8,0x00,0x00,0x00,0xfe]
+ds_max_rtn_f64 v[5:6], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xe6,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f64 v[0:1], v255, v[0:1] offset:65535
-// CHECK: [0xff,0xff,0xe6,0xd8,0xff,0x00,0x00,0x00]
+ds_max_rtn_f64 v[254:255], v1, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xe6,0xd8,0x01,0x02,0x00,0xfe]
 
-ds_max_rtn_f64 v[0:1], v0, v[254:255] offset:65535
-// CHECK: [0xff,0xff,0xe6,0xd8,0x00,0xfe,0x00,0x00]
+ds_max_rtn_f64 v[5:6], v255, v[2:3] offset:65535
+// CHECK: [0xff,0xff,0xe6,0xd8,0xff,0x02,0x00,0x05]
 
-ds_max_rtn_f64 v[0:1], v0, v[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_f64 v[5:6], v1, v[254:255] offset:65535
+// CHECK: [0xff,0xff,0xe6,0xd8,0x01,0xfe,0x00,0x05]
 
-ds_max_rtn_f64 v[0:1], v0, v[0:1] offset:0
-// CHECK: [0x00,0x00,0xe6,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_f64 v[5:6], v1, v[2:3]
+// CHECK: [0x00,0x00,0xe6,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f64 v[0:1], v0, v[0:1] offset:4
-// CHECK: [0x04,0x00,0xe6,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_f64 v[5:6], v1, v[2:3] offset:0
+// CHECK: [0x00,0x00,0xe6,0xd8,0x01,0x02,0x00,0x05]
 
-ds_max_rtn_f64 v[0:1], v0, v[0:1] offset:65535 gds
-// CHECK: [0xff,0xff,0xe7,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_f64 v[5:6], v1, v[2:3] offset:4
+// CHECK: [0x04,0x00,0xe6,0xd8,0x01,0x02,0x00,0x05]
 
-ds_read_b64 v[0:1], v0 offset:65535
-// CHECK: [0xff,0xff,0xec,0xd8,0x00,0x00,0x00,0x00]
+ds_max_rtn_f64 v[5:6], v1, v[2:3] offset:65535 gds
+// CHECK: [0xff,0xff,0xe7,0xd8,0x01,0x02,0x00,0x05]
 
-ds_read_b64 v[254:255], v0 offset:65535
-// CHECK: [0xff,0xff,0xec,0xd8,0x00,0x00,0x00,0xfe]
+ds_read_b64 v[5:6], v1 offset:65535
+// CHECK: [0xff,0xff,0xec,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_b64 v[0:1], v255 offset:65535
-// CHECK: [0xff,0xff,0xec,0xd8,0xff,0x00,0x00,0x00]
+ds_read_b64 v[254:255], v1 offset:65535
+// CHECK: [0xff,0xff,0xec,0xd8,0x01,0x00,0x00,0xfe]
 
-ds_read_b64 v[0:1], v0
-// CHECK: [0x00,0x00,0xec,0xd8,0x00,0x00,0x00,0x00]
+ds_read_b64 v[5:6], v255 offset:65535
+// CHECK: [0xff,0xff,0xec,0xd8,0xff,0x00,0x00,0x05]
 
-ds_read_b64 v[0:1], v0 offset:0
-// CHECK: [0x00,0x00,0xec,0xd8,0x00,0x00,0x00,0x00]
+ds_read_b64 v[5:6], v1
+// CHECK: [0x00,0x00,0xec,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_b64 v[0:1], v0 offset:4
-// CHECK: [0x04,0x00,0xec,0xd8,0x00,0x00,0x00,0x00]
+ds_read_b64 v[5:6], v1 offset:0
+// CHECK: [0x00,0x00,0xec,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read_b64 v[0:1], v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xed,0xd8,0x00,0x00,0x00,0x00]
+ds_read_b64 v[5:6], v1 offset:4
+// CHECK: [0x04,0x00,0xec,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b64 v[0:3], v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0xee,0xd8,0x00,0x00,0x00,0x00]
+ds_read_b64 v[5:6], v1 offset:65535 gds
+// CHECK: [0xff,0xff,0xed,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b64 v[252:255], v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0xee,0xd8,0x00,0x00,0x00,0xfc]
+ds_read2_b64 v[5:8], v1 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0xee,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b64 v[0:3], v255 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0xee,0xd8,0xff,0x00,0x00,0x00]
+ds_read2_b64 v[252:255], v1 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0xee,0xd8,0x01,0x00,0x00,0xfc]
 
-ds_read2_b64 v[0:3], v0 offset1:255
-// CHECK: [0x00,0xff,0xee,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b64 v[5:8], v255 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0xee,0xd8,0xff,0x00,0x00,0x05]
 
-ds_read2_b64 v[0:3], v0 offset0:0 offset1:255
-// CHECK: [0x00,0xff,0xee,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b64 v[5:8], v1 offset1:255
+// CHECK: [0x00,0xff,0xee,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b64 v[0:3], v0 offset0:16 offset1:255
-// CHECK: [0x10,0xff,0xee,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b64 v[5:8], v1 offset0:0 offset1:255
+// CHECK: [0x00,0xff,0xee,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b64 v[0:3], v0 offset0:127
-// CHECK: [0x7f,0x00,0xee,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b64 v[5:8], v1 offset0:16 offset1:255
+// CHECK: [0x10,0xff,0xee,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b64 v[0:3], v0 offset0:127 offset1:0
-// CHECK: [0x7f,0x00,0xee,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b64 v[5:8], v1 offset0:127
+// CHECK: [0x7f,0x00,0xee,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b64 v[0:3], v0 offset0:127 offset1:1
-// CHECK: [0x7f,0x01,0xee,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b64 v[5:8], v1 offset0:127 offset1:0
+// CHECK: [0x7f,0x00,0xee,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2_b64 v[0:3], v0 offset0:127 offset1:255 gds
-// CHECK: [0x7f,0xff,0xef,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b64 v[5:8], v1 offset0:127 offset1:1
+// CHECK: [0x7f,0x01,0xee,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b64 v[0:3], v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0xf0,0xd8,0x00,0x00,0x00,0x00]
+ds_read2_b64 v[5:8], v1 offset0:127 offset1:255 gds
+// CHECK: [0x7f,0xff,0xef,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b64 v[252:255], v0 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0xf0,0xd8,0x00,0x00,0x00,0xfc]
+ds_read2st64_b64 v[5:8], v1 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0xf0,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b64 v[0:3], v255 offset0:127 offset1:255
-// CHECK: [0x7f,0xff,0xf0,0xd8,0xff,0x00,0x00,0x00]
+ds_read2st64_b64 v[252:255], v1 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0xf0,0xd8,0x01,0x00,0x00,0xfc]
 
-ds_read2st64_b64 v[0:3], v0 offset1:255
-// CHECK: [0x00,0xff,0xf0,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b64 v[5:8], v255 offset0:127 offset1:255
+// CHECK: [0x7f,0xff,0xf0,0xd8,0xff,0x00,0x00,0x05]
 
-ds_read2st64_b64 v[0:3], v0 offset0:0 offset1:255
-// CHECK: [0x00,0xff,0xf0,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b64 v[5:8], v1 offset1:255
+// CHECK: [0x00,0xff,0xf0,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b64 v[0:3], v0 offset0:16 offset1:255
-// CHECK: [0x10,0xff,0xf0,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b64 v[5:8], v1 offset0:0 offset1:255
+// CHECK: [0x00,0xff,0xf0,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b64 v[0:3], v0 offset0:127
-// CHECK: [0x7f,0x00,0xf0,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b64 v[5:8], v1 offset0:16 offset1:255
+// CHECK: [0x10,0xff,0xf0,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b64 v[0:3], v0 offset0:127 offset1:0
-// CHECK: [0x7f,0x00,0xf0,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b64 v[5:8], v1 offset0:127
+// CHECK: [0x7f,0x00,0xf0,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b64 v[0:3], v0 offset0:127 offset1:1
-// CHECK: [0x7f,0x01,0xf0,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b64 v[5:8], v1 offset0:127 offset1:0
+// CHECK: [0x7f,0x00,0xf0,0xd8,0x01,0x00,0x00,0x05]
 
-ds_read2st64_b64 v[0:3], v0 offset0:127 offset1:255 gds
-// CHECK: [0x7f,0xff,0xf1,0xd8,0x00,0x00,0x00,0x00]
+ds_read2st64_b64 v[5:8], v1 offset0:127 offset1:1
+// CHECK: [0x7f,0x01,0xf0,0xd8,0x01,0x00,0x00,0x05]
 
-ds_add_src2_u32 v0 offset:65535
-// CHECK: [0xff,0xff,0x00,0xd9,0x00,0x00,0x00,0x00]
+ds_read2st64_b64 v[5:8], v1 offset0:127 offset1:255 gds
+// CHECK: [0x7f,0xff,0xf1,0xd8,0x01,0x00,0x00,0x05]
+
+ds_add_src2_u32 v1 offset:65535
+// CHECK: [0xff,0xff,0x00,0xd9,0x01,0x00,0x00,0x00]
 
 ds_add_src2_u32 v255 offset:65535
 // CHECK: [0xff,0xff,0x00,0xd9,0xff,0x00,0x00,0x00]
 
-ds_add_src2_u32 v0
-// CHECK: [0x00,0x00,0x00,0xd9,0x00,0x00,0x00,0x00]
+ds_add_src2_u32 v1
+// CHECK: [0x00,0x00,0x00,0xd9,0x01,0x00,0x00,0x00]
 
-ds_add_src2_u32 v0 offset:0
-// CHECK: [0x00,0x00,0x00,0xd9,0x00,0x00,0x00,0x00]
+ds_add_src2_u32 v1 offset:0
+// CHECK: [0x00,0x00,0x00,0xd9,0x01,0x00,0x00,0x00]
 
-ds_add_src2_u32 v0 offset:4
-// CHECK: [0x04,0x00,0x00,0xd9,0x00,0x00,0x00,0x00]
+ds_add_src2_u32 v1 offset:4
+// CHECK: [0x04,0x00,0x00,0xd9,0x01,0x00,0x00,0x00]
 
-ds_add_src2_u32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x01,0xd9,0x00,0x00,0x00,0x00]
+ds_add_src2_u32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x01,0xd9,0x01,0x00,0x00,0x00]
 
-ds_sub_src2_u32 v0 offset:65535
-// CHECK: [0xff,0xff,0x02,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_src2_u32 v1 offset:65535
+// CHECK: [0xff,0xff,0x02,0xd9,0x01,0x00,0x00,0x00]
 
 ds_sub_src2_u32 v255 offset:65535
 // CHECK: [0xff,0xff,0x02,0xd9,0xff,0x00,0x00,0x00]
 
-ds_sub_src2_u32 v0
-// CHECK: [0x00,0x00,0x02,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_src2_u32 v1
+// CHECK: [0x00,0x00,0x02,0xd9,0x01,0x00,0x00,0x00]
 
-ds_sub_src2_u32 v0 offset:0
-// CHECK: [0x00,0x00,0x02,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_src2_u32 v1 offset:0
+// CHECK: [0x00,0x00,0x02,0xd9,0x01,0x00,0x00,0x00]
 
-ds_sub_src2_u32 v0 offset:4
-// CHECK: [0x04,0x00,0x02,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_src2_u32 v1 offset:4
+// CHECK: [0x04,0x00,0x02,0xd9,0x01,0x00,0x00,0x00]
 
-ds_sub_src2_u32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x03,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_src2_u32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x03,0xd9,0x01,0x00,0x00,0x00]
 
-ds_rsub_src2_u32 v0 offset:65535
-// CHECK: [0xff,0xff,0x04,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u32 v1 offset:65535
+// CHECK: [0xff,0xff,0x04,0xd9,0x01,0x00,0x00,0x00]
 
 ds_rsub_src2_u32 v255 offset:65535
 // CHECK: [0xff,0xff,0x04,0xd9,0xff,0x00,0x00,0x00]
 
-ds_rsub_src2_u32 v0
-// CHECK: [0x00,0x00,0x04,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u32 v1
+// CHECK: [0x00,0x00,0x04,0xd9,0x01,0x00,0x00,0x00]
 
-ds_rsub_src2_u32 v0 offset:0
-// CHECK: [0x00,0x00,0x04,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u32 v1 offset:0
+// CHECK: [0x00,0x00,0x04,0xd9,0x01,0x00,0x00,0x00]
 
-ds_rsub_src2_u32 v0 offset:4
-// CHECK: [0x04,0x00,0x04,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u32 v1 offset:4
+// CHECK: [0x04,0x00,0x04,0xd9,0x01,0x00,0x00,0x00]
 
-ds_rsub_src2_u32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x05,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x05,0xd9,0x01,0x00,0x00,0x00]
 
-ds_inc_src2_u32 v0 offset:65535
-// CHECK: [0xff,0xff,0x06,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_src2_u32 v1 offset:65535
+// CHECK: [0xff,0xff,0x06,0xd9,0x01,0x00,0x00,0x00]
 
 ds_inc_src2_u32 v255 offset:65535
 // CHECK: [0xff,0xff,0x06,0xd9,0xff,0x00,0x00,0x00]
 
-ds_inc_src2_u32 v0
-// CHECK: [0x00,0x00,0x06,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_src2_u32 v1
+// CHECK: [0x00,0x00,0x06,0xd9,0x01,0x00,0x00,0x00]
 
-ds_inc_src2_u32 v0 offset:0
-// CHECK: [0x00,0x00,0x06,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_src2_u32 v1 offset:0
+// CHECK: [0x00,0x00,0x06,0xd9,0x01,0x00,0x00,0x00]
 
-ds_inc_src2_u32 v0 offset:4
-// CHECK: [0x04,0x00,0x06,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_src2_u32 v1 offset:4
+// CHECK: [0x04,0x00,0x06,0xd9,0x01,0x00,0x00,0x00]
 
-ds_inc_src2_u32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x07,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_src2_u32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x07,0xd9,0x01,0x00,0x00,0x00]
 
-ds_dec_src2_u32 v0 offset:65535
-// CHECK: [0xff,0xff,0x08,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_src2_u32 v1 offset:65535
+// CHECK: [0xff,0xff,0x08,0xd9,0x01,0x00,0x00,0x00]
 
 ds_dec_src2_u32 v255 offset:65535
 // CHECK: [0xff,0xff,0x08,0xd9,0xff,0x00,0x00,0x00]
 
-ds_dec_src2_u32 v0
-// CHECK: [0x00,0x00,0x08,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_src2_u32 v1
+// CHECK: [0x00,0x00,0x08,0xd9,0x01,0x00,0x00,0x00]
 
-ds_dec_src2_u32 v0 offset:0
-// CHECK: [0x00,0x00,0x08,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_src2_u32 v1 offset:0
+// CHECK: [0x00,0x00,0x08,0xd9,0x01,0x00,0x00,0x00]
 
-ds_dec_src2_u32 v0 offset:4
-// CHECK: [0x04,0x00,0x08,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_src2_u32 v1 offset:4
+// CHECK: [0x04,0x00,0x08,0xd9,0x01,0x00,0x00,0x00]
 
-ds_dec_src2_u32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x09,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_src2_u32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x09,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_i32 v0 offset:65535
-// CHECK: [0xff,0xff,0x0a,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_i32 v1 offset:65535
+// CHECK: [0xff,0xff,0x0a,0xd9,0x01,0x00,0x00,0x00]
 
 ds_min_src2_i32 v255 offset:65535
 // CHECK: [0xff,0xff,0x0a,0xd9,0xff,0x00,0x00,0x00]
 
-ds_min_src2_i32 v0
-// CHECK: [0x00,0x00,0x0a,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_i32 v1
+// CHECK: [0x00,0x00,0x0a,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_i32 v0 offset:0
-// CHECK: [0x00,0x00,0x0a,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_i32 v1 offset:0
+// CHECK: [0x00,0x00,0x0a,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_i32 v0 offset:4
-// CHECK: [0x04,0x00,0x0a,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_i32 v1 offset:4
+// CHECK: [0x04,0x00,0x0a,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_i32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x0b,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_i32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x0b,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_i32 v0 offset:65535
-// CHECK: [0xff,0xff,0x0c,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_i32 v1 offset:65535
+// CHECK: [0xff,0xff,0x0c,0xd9,0x01,0x00,0x00,0x00]
 
 ds_max_src2_i32 v255 offset:65535
 // CHECK: [0xff,0xff,0x0c,0xd9,0xff,0x00,0x00,0x00]
 
-ds_max_src2_i32 v0
-// CHECK: [0x00,0x00,0x0c,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_i32 v1
+// CHECK: [0x00,0x00,0x0c,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_i32 v0 offset:0
-// CHECK: [0x00,0x00,0x0c,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_i32 v1 offset:0
+// CHECK: [0x00,0x00,0x0c,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_i32 v0 offset:4
-// CHECK: [0x04,0x00,0x0c,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_i32 v1 offset:4
+// CHECK: [0x04,0x00,0x0c,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_i32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x0d,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_i32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x0d,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_u32 v0 offset:65535
-// CHECK: [0xff,0xff,0x0e,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_u32 v1 offset:65535
+// CHECK: [0xff,0xff,0x0e,0xd9,0x01,0x00,0x00,0x00]
 
 ds_min_src2_u32 v255 offset:65535
 // CHECK: [0xff,0xff,0x0e,0xd9,0xff,0x00,0x00,0x00]
 
-ds_min_src2_u32 v0
-// CHECK: [0x00,0x00,0x0e,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_u32 v1
+// CHECK: [0x00,0x00,0x0e,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_u32 v0 offset:0
-// CHECK: [0x00,0x00,0x0e,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_u32 v1 offset:0
+// CHECK: [0x00,0x00,0x0e,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_u32 v0 offset:4
-// CHECK: [0x04,0x00,0x0e,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_u32 v1 offset:4
+// CHECK: [0x04,0x00,0x0e,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_u32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x0f,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_u32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x0f,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_u32 v0 offset:65535
-// CHECK: [0xff,0xff,0x10,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_u32 v1 offset:65535
+// CHECK: [0xff,0xff,0x10,0xd9,0x01,0x00,0x00,0x00]
 
 ds_max_src2_u32 v255 offset:65535
 // CHECK: [0xff,0xff,0x10,0xd9,0xff,0x00,0x00,0x00]
 
-ds_max_src2_u32 v0
-// CHECK: [0x00,0x00,0x10,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_u32 v1
+// CHECK: [0x00,0x00,0x10,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_u32 v0 offset:0
-// CHECK: [0x00,0x00,0x10,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_u32 v1 offset:0
+// CHECK: [0x00,0x00,0x10,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_u32 v0 offset:4
-// CHECK: [0x04,0x00,0x10,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_u32 v1 offset:4
+// CHECK: [0x04,0x00,0x10,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_u32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x11,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_u32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x11,0xd9,0x01,0x00,0x00,0x00]
 
-ds_or_src2_b32 v0 offset:65535
-// CHECK: [0xff,0xff,0x14,0xd9,0x00,0x00,0x00,0x00]
+ds_or_src2_b32 v1 offset:65535
+// CHECK: [0xff,0xff,0x14,0xd9,0x01,0x00,0x00,0x00]
 
 ds_or_src2_b32 v255 offset:65535
 // CHECK: [0xff,0xff,0x14,0xd9,0xff,0x00,0x00,0x00]
 
-ds_or_src2_b32 v0
-// CHECK: [0x00,0x00,0x14,0xd9,0x00,0x00,0x00,0x00]
+ds_or_src2_b32 v1
+// CHECK: [0x00,0x00,0x14,0xd9,0x01,0x00,0x00,0x00]
 
-ds_or_src2_b32 v0 offset:0
-// CHECK: [0x00,0x00,0x14,0xd9,0x00,0x00,0x00,0x00]
+ds_or_src2_b32 v1 offset:0
+// CHECK: [0x00,0x00,0x14,0xd9,0x01,0x00,0x00,0x00]
 
-ds_or_src2_b32 v0 offset:4
-// CHECK: [0x04,0x00,0x14,0xd9,0x00,0x00,0x00,0x00]
+ds_or_src2_b32 v1 offset:4
+// CHECK: [0x04,0x00,0x14,0xd9,0x01,0x00,0x00,0x00]
 
-ds_or_src2_b32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x15,0xd9,0x00,0x00,0x00,0x00]
+ds_or_src2_b32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x15,0xd9,0x01,0x00,0x00,0x00]
 
-ds_xor_src2_b32 v0 offset:65535
-// CHECK: [0xff,0xff,0x16,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_src2_b32 v1 offset:65535
+// CHECK: [0xff,0xff,0x16,0xd9,0x01,0x00,0x00,0x00]
 
 ds_xor_src2_b32 v255 offset:65535
 // CHECK: [0xff,0xff,0x16,0xd9,0xff,0x00,0x00,0x00]
 
-ds_xor_src2_b32 v0
-// CHECK: [0x00,0x00,0x16,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_src2_b32 v1
+// CHECK: [0x00,0x00,0x16,0xd9,0x01,0x00,0x00,0x00]
 
-ds_xor_src2_b32 v0 offset:0
-// CHECK: [0x00,0x00,0x16,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_src2_b32 v1 offset:0
+// CHECK: [0x00,0x00,0x16,0xd9,0x01,0x00,0x00,0x00]
 
-ds_xor_src2_b32 v0 offset:4
-// CHECK: [0x04,0x00,0x16,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_src2_b32 v1 offset:4
+// CHECK: [0x04,0x00,0x16,0xd9,0x01,0x00,0x00,0x00]
 
-ds_xor_src2_b32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x17,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_src2_b32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x17,0xd9,0x01,0x00,0x00,0x00]
 
-ds_write_src2_b32 v0
-// CHECK: [0x00,0x00,0x1a,0xd9,0x00,0x00,0x00,0x00]
+ds_write_src2_b32 v1
+// CHECK: [0x00,0x00,0x1a,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_f32 v0 offset:65535
-// CHECK: [0xff,0xff,0x24,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_f32 v1 offset:65535
+// CHECK: [0xff,0xff,0x24,0xd9,0x01,0x00,0x00,0x00]
 
 ds_min_src2_f32 v255 offset:65535
 // CHECK: [0xff,0xff,0x24,0xd9,0xff,0x00,0x00,0x00]
 
-ds_min_src2_f32 v0
-// CHECK: [0x00,0x00,0x24,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_f32 v1
+// CHECK: [0x00,0x00,0x24,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_f32 v0 offset:0
-// CHECK: [0x00,0x00,0x24,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_f32 v1 offset:0
+// CHECK: [0x00,0x00,0x24,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_f32 v0 offset:4
-// CHECK: [0x04,0x00,0x24,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_f32 v1 offset:4
+// CHECK: [0x04,0x00,0x24,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_f32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x25,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_f32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x25,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_f32 v0 offset:65535
-// CHECK: [0xff,0xff,0x26,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_f32 v1 offset:65535
+// CHECK: [0xff,0xff,0x26,0xd9,0x01,0x00,0x00,0x00]
 
 ds_max_src2_f32 v255 offset:65535
 // CHECK: [0xff,0xff,0x26,0xd9,0xff,0x00,0x00,0x00]
 
-ds_max_src2_f32 v0
-// CHECK: [0x00,0x00,0x26,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_f32 v1
+// CHECK: [0x00,0x00,0x26,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_f32 v0 offset:0
-// CHECK: [0x00,0x00,0x26,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_f32 v1 offset:0
+// CHECK: [0x00,0x00,0x26,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_f32 v0 offset:4
-// CHECK: [0x04,0x00,0x26,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_f32 v1 offset:4
+// CHECK: [0x04,0x00,0x26,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_f32 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x27,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_f32 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x27,0xd9,0x01,0x00,0x00,0x00]
 
-ds_add_src2_u64 v0 offset:65535
-// CHECK: [0xff,0xff,0x80,0xd9,0x00,0x00,0x00,0x00]
+ds_add_src2_u64 v1 offset:65535
+// CHECK: [0xff,0xff,0x80,0xd9,0x01,0x00,0x00,0x00]
 
 ds_add_src2_u64 v255 offset:65535
 // CHECK: [0xff,0xff,0x80,0xd9,0xff,0x00,0x00,0x00]
 
-ds_add_src2_u64 v0
-// CHECK: [0x00,0x00,0x80,0xd9,0x00,0x00,0x00,0x00]
+ds_add_src2_u64 v1
+// CHECK: [0x00,0x00,0x80,0xd9,0x01,0x00,0x00,0x00]
 
-ds_add_src2_u64 v0 offset:0
-// CHECK: [0x00,0x00,0x80,0xd9,0x00,0x00,0x00,0x00]
+ds_add_src2_u64 v1 offset:0
+// CHECK: [0x00,0x00,0x80,0xd9,0x01,0x00,0x00,0x00]
 
-ds_add_src2_u64 v0 offset:4
-// CHECK: [0x04,0x00,0x80,0xd9,0x00,0x00,0x00,0x00]
+ds_add_src2_u64 v1 offset:4
+// CHECK: [0x04,0x00,0x80,0xd9,0x01,0x00,0x00,0x00]
 
-ds_add_src2_u64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x81,0xd9,0x00,0x00,0x00,0x00]
+ds_add_src2_u64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x81,0xd9,0x01,0x00,0x00,0x00]
 
-ds_sub_src2_u64 v0 offset:65535
-// CHECK: [0xff,0xff,0x82,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_src2_u64 v1 offset:65535
+// CHECK: [0xff,0xff,0x82,0xd9,0x01,0x00,0x00,0x00]
 
 ds_sub_src2_u64 v255 offset:65535
 // CHECK: [0xff,0xff,0x82,0xd9,0xff,0x00,0x00,0x00]
 
-ds_sub_src2_u64 v0
-// CHECK: [0x00,0x00,0x82,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_src2_u64 v1
+// CHECK: [0x00,0x00,0x82,0xd9,0x01,0x00,0x00,0x00]
 
-ds_sub_src2_u64 v0 offset:0
-// CHECK: [0x00,0x00,0x82,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_src2_u64 v1 offset:0
+// CHECK: [0x00,0x00,0x82,0xd9,0x01,0x00,0x00,0x00]
 
-ds_sub_src2_u64 v0 offset:4
-// CHECK: [0x04,0x00,0x82,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_src2_u64 v1 offset:4
+// CHECK: [0x04,0x00,0x82,0xd9,0x01,0x00,0x00,0x00]
 
-ds_sub_src2_u64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x83,0xd9,0x00,0x00,0x00,0x00]
+ds_sub_src2_u64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x83,0xd9,0x01,0x00,0x00,0x00]
 
-ds_rsub_src2_u64 v0 offset:65535
-// CHECK: [0xff,0xff,0x84,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u64 v1 offset:65535
+// CHECK: [0xff,0xff,0x84,0xd9,0x01,0x00,0x00,0x00]
 
 ds_rsub_src2_u64 v255 offset:65535
 // CHECK: [0xff,0xff,0x84,0xd9,0xff,0x00,0x00,0x00]
 
-ds_rsub_src2_u64 v0
-// CHECK: [0x00,0x00,0x84,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u64 v1
+// CHECK: [0x00,0x00,0x84,0xd9,0x01,0x00,0x00,0x00]
 
-ds_rsub_src2_u64 v0 offset:0
-// CHECK: [0x00,0x00,0x84,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u64 v1 offset:0
+// CHECK: [0x00,0x00,0x84,0xd9,0x01,0x00,0x00,0x00]
 
-ds_rsub_src2_u64 v0 offset:4
-// CHECK: [0x04,0x00,0x84,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u64 v1 offset:4
+// CHECK: [0x04,0x00,0x84,0xd9,0x01,0x00,0x00,0x00]
 
-ds_rsub_src2_u64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x85,0xd9,0x00,0x00,0x00,0x00]
+ds_rsub_src2_u64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x85,0xd9,0x01,0x00,0x00,0x00]
 
-ds_inc_src2_u64 v0 offset:65535
-// CHECK: [0xff,0xff,0x86,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_src2_u64 v1 offset:65535
+// CHECK: [0xff,0xff,0x86,0xd9,0x01,0x00,0x00,0x00]
 
 ds_inc_src2_u64 v255 offset:65535
 // CHECK: [0xff,0xff,0x86,0xd9,0xff,0x00,0x00,0x00]
 
-ds_inc_src2_u64 v0
-// CHECK: [0x00,0x00,0x86,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_src2_u64 v1
+// CHECK: [0x00,0x00,0x86,0xd9,0x01,0x00,0x00,0x00]
 
-ds_inc_src2_u64 v0 offset:0
-// CHECK: [0x00,0x00,0x86,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_src2_u64 v1 offset:0
+// CHECK: [0x00,0x00,0x86,0xd9,0x01,0x00,0x00,0x00]
 
-ds_inc_src2_u64 v0 offset:4
-// CHECK: [0x04,0x00,0x86,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_src2_u64 v1 offset:4
+// CHECK: [0x04,0x00,0x86,0xd9,0x01,0x00,0x00,0x00]
 
-ds_inc_src2_u64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x87,0xd9,0x00,0x00,0x00,0x00]
+ds_inc_src2_u64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x87,0xd9,0x01,0x00,0x00,0x00]
 
-ds_dec_src2_u64 v0 offset:65535
-// CHECK: [0xff,0xff,0x88,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_src2_u64 v1 offset:65535
+// CHECK: [0xff,0xff,0x88,0xd9,0x01,0x00,0x00,0x00]
 
 ds_dec_src2_u64 v255 offset:65535
 // CHECK: [0xff,0xff,0x88,0xd9,0xff,0x00,0x00,0x00]
 
-ds_dec_src2_u64 v0
-// CHECK: [0x00,0x00,0x88,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_src2_u64 v1
+// CHECK: [0x00,0x00,0x88,0xd9,0x01,0x00,0x00,0x00]
 
-ds_dec_src2_u64 v0 offset:0
-// CHECK: [0x00,0x00,0x88,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_src2_u64 v1 offset:0
+// CHECK: [0x00,0x00,0x88,0xd9,0x01,0x00,0x00,0x00]
 
-ds_dec_src2_u64 v0 offset:4
-// CHECK: [0x04,0x00,0x88,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_src2_u64 v1 offset:4
+// CHECK: [0x04,0x00,0x88,0xd9,0x01,0x00,0x00,0x00]
 
-ds_dec_src2_u64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x89,0xd9,0x00,0x00,0x00,0x00]
+ds_dec_src2_u64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x89,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_i64 v0 offset:65535
-// CHECK: [0xff,0xff,0x8a,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_i64 v1 offset:65535
+// CHECK: [0xff,0xff,0x8a,0xd9,0x01,0x00,0x00,0x00]
 
 ds_min_src2_i64 v255 offset:65535
 // CHECK: [0xff,0xff,0x8a,0xd9,0xff,0x00,0x00,0x00]
 
-ds_min_src2_i64 v0
-// CHECK: [0x00,0x00,0x8a,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_i64 v1
+// CHECK: [0x00,0x00,0x8a,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_i64 v0 offset:0
-// CHECK: [0x00,0x00,0x8a,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_i64 v1 offset:0
+// CHECK: [0x00,0x00,0x8a,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_i64 v0 offset:4
-// CHECK: [0x04,0x00,0x8a,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_i64 v1 offset:4
+// CHECK: [0x04,0x00,0x8a,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_i64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x8b,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_i64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x8b,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_i64 v0 offset:65535
-// CHECK: [0xff,0xff,0x8c,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_i64 v1 offset:65535
+// CHECK: [0xff,0xff,0x8c,0xd9,0x01,0x00,0x00,0x00]
 
 ds_max_src2_i64 v255 offset:65535
 // CHECK: [0xff,0xff,0x8c,0xd9,0xff,0x00,0x00,0x00]
 
-ds_max_src2_i64 v0
-// CHECK: [0x00,0x00,0x8c,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_i64 v1
+// CHECK: [0x00,0x00,0x8c,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_i64 v0 offset:0
-// CHECK: [0x00,0x00,0x8c,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_i64 v1 offset:0
+// CHECK: [0x00,0x00,0x8c,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_i64 v0 offset:4
-// CHECK: [0x04,0x00,0x8c,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_i64 v1 offset:4
+// CHECK: [0x04,0x00,0x8c,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_i64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x8d,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_i64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x8d,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_u64 v0 offset:65535
-// CHECK: [0xff,0xff,0x8e,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_u64 v1 offset:65535
+// CHECK: [0xff,0xff,0x8e,0xd9,0x01,0x00,0x00,0x00]
 
 ds_min_src2_u64 v255 offset:65535
 // CHECK: [0xff,0xff,0x8e,0xd9,0xff,0x00,0x00,0x00]
 
-ds_min_src2_u64 v0
-// CHECK: [0x00,0x00,0x8e,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_u64 v1
+// CHECK: [0x00,0x00,0x8e,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_u64 v0 offset:0
-// CHECK: [0x00,0x00,0x8e,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_u64 v1 offset:0
+// CHECK: [0x00,0x00,0x8e,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_u64 v0 offset:4
-// CHECK: [0x04,0x00,0x8e,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_u64 v1 offset:4
+// CHECK: [0x04,0x00,0x8e,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_u64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x8f,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_u64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x8f,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_u64 v0 offset:65535
-// CHECK: [0xff,0xff,0x90,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_u64 v1 offset:65535
+// CHECK: [0xff,0xff,0x90,0xd9,0x01,0x00,0x00,0x00]
 
 ds_max_src2_u64 v255 offset:65535
 // CHECK: [0xff,0xff,0x90,0xd9,0xff,0x00,0x00,0x00]
 
-ds_max_src2_u64 v0
-// CHECK: [0x00,0x00,0x90,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_u64 v1
+// CHECK: [0x00,0x00,0x90,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_u64 v0 offset:0
-// CHECK: [0x00,0x00,0x90,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_u64 v1 offset:0
+// CHECK: [0x00,0x00,0x90,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_u64 v0 offset:4
-// CHECK: [0x04,0x00,0x90,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_u64 v1 offset:4
+// CHECK: [0x04,0x00,0x90,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_u64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x91,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_u64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x91,0xd9,0x01,0x00,0x00,0x00]
 
-ds_and_src2_b64 v0 offset:65535
-// CHECK: [0xff,0xff,0x92,0xd9,0x00,0x00,0x00,0x00]
+ds_and_src2_b64 v1 offset:65535
+// CHECK: [0xff,0xff,0x92,0xd9,0x01,0x00,0x00,0x00]
 
 ds_and_src2_b64 v255 offset:65535
 // CHECK: [0xff,0xff,0x92,0xd9,0xff,0x00,0x00,0x00]
 
-ds_and_src2_b64 v0
-// CHECK: [0x00,0x00,0x92,0xd9,0x00,0x00,0x00,0x00]
+ds_and_src2_b64 v1
+// CHECK: [0x00,0x00,0x92,0xd9,0x01,0x00,0x00,0x00]
 
-ds_and_src2_b64 v0 offset:0
-// CHECK: [0x00,0x00,0x92,0xd9,0x00,0x00,0x00,0x00]
+ds_and_src2_b64 v1 offset:0
+// CHECK: [0x00,0x00,0x92,0xd9,0x01,0x00,0x00,0x00]
 
-ds_and_src2_b64 v0 offset:4
-// CHECK: [0x04,0x00,0x92,0xd9,0x00,0x00,0x00,0x00]
+ds_and_src2_b64 v1 offset:4
+// CHECK: [0x04,0x00,0x92,0xd9,0x01,0x00,0x00,0x00]
 
-ds_and_src2_b64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x93,0xd9,0x00,0x00,0x00,0x00]
+ds_and_src2_b64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x93,0xd9,0x01,0x00,0x00,0x00]
 
-ds_or_src2_b64 v0 offset:65535
-// CHECK: [0xff,0xff,0x94,0xd9,0x00,0x00,0x00,0x00]
+ds_or_src2_b64 v1 offset:65535
+// CHECK: [0xff,0xff,0x94,0xd9,0x01,0x00,0x00,0x00]
 
 ds_or_src2_b64 v255 offset:65535
 // CHECK: [0xff,0xff,0x94,0xd9,0xff,0x00,0x00,0x00]
 
-ds_or_src2_b64 v0
-// CHECK: [0x00,0x00,0x94,0xd9,0x00,0x00,0x00,0x00]
+ds_or_src2_b64 v1
+// CHECK: [0x00,0x00,0x94,0xd9,0x01,0x00,0x00,0x00]
 
-ds_or_src2_b64 v0 offset:0
-// CHECK: [0x00,0x00,0x94,0xd9,0x00,0x00,0x00,0x00]
+ds_or_src2_b64 v1 offset:0
+// CHECK: [0x00,0x00,0x94,0xd9,0x01,0x00,0x00,0x00]
 
-ds_or_src2_b64 v0 offset:4
-// CHECK: [0x04,0x00,0x94,0xd9,0x00,0x00,0x00,0x00]
+ds_or_src2_b64 v1 offset:4
+// CHECK: [0x04,0x00,0x94,0xd9,0x01,0x00,0x00,0x00]
 
-ds_or_src2_b64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x95,0xd9,0x00,0x00,0x00,0x00]
+ds_or_src2_b64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x95,0xd9,0x01,0x00,0x00,0x00]
 
-ds_xor_src2_b64 v0 offset:65535
-// CHECK: [0xff,0xff,0x96,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_src2_b64 v1 offset:65535
+// CHECK: [0xff,0xff,0x96,0xd9,0x01,0x00,0x00,0x00]
 
 ds_xor_src2_b64 v255 offset:65535
 // CHECK: [0xff,0xff,0x96,0xd9,0xff,0x00,0x00,0x00]
 
-ds_xor_src2_b64 v0
-// CHECK: [0x00,0x00,0x96,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_src2_b64 v1
+// CHECK: [0x00,0x00,0x96,0xd9,0x01,0x00,0x00,0x00]
 
-ds_xor_src2_b64 v0 offset:0
-// CHECK: [0x00,0x00,0x96,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_src2_b64 v1 offset:0
+// CHECK: [0x00,0x00,0x96,0xd9,0x01,0x00,0x00,0x00]
 
-ds_xor_src2_b64 v0 offset:4
-// CHECK: [0x04,0x00,0x96,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_src2_b64 v1 offset:4
+// CHECK: [0x04,0x00,0x96,0xd9,0x01,0x00,0x00,0x00]
 
-ds_xor_src2_b64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0x97,0xd9,0x00,0x00,0x00,0x00]
+ds_xor_src2_b64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0x97,0xd9,0x01,0x00,0x00,0x00]
 
-ds_write_src2_b64 v0
-// CHECK: [0x00,0x00,0x9a,0xd9,0x00,0x00,0x00,0x00]
+ds_write_src2_b64 v1
+// CHECK: [0x00,0x00,0x9a,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_f64 v0 offset:65535
-// CHECK: [0xff,0xff,0xa4,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_f64 v1 offset:65535
+// CHECK: [0xff,0xff,0xa4,0xd9,0x01,0x00,0x00,0x00]
 
 ds_min_src2_f64 v255 offset:65535
 // CHECK: [0xff,0xff,0xa4,0xd9,0xff,0x00,0x00,0x00]
 
-ds_min_src2_f64 v0
-// CHECK: [0x00,0x00,0xa4,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_f64 v1
+// CHECK: [0x00,0x00,0xa4,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_f64 v0 offset:0
-// CHECK: [0x00,0x00,0xa4,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_f64 v1 offset:0
+// CHECK: [0x00,0x00,0xa4,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_f64 v0 offset:4
-// CHECK: [0x04,0x00,0xa4,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_f64 v1 offset:4
+// CHECK: [0x04,0x00,0xa4,0xd9,0x01,0x00,0x00,0x00]
 
-ds_min_src2_f64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xa5,0xd9,0x00,0x00,0x00,0x00]
+ds_min_src2_f64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0xa5,0xd9,0x01,0x00,0x00,0x00]
 
-ds_max_src2_f64 v0 offset:65535
-// CHECK: [0xff,0xff,0xa6,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_f64 v1 offset:65535
+// CHECK: [0xff,0xff,0xa6,0xd9,0x01,0x00,0x00,0x00]
 
 ds_max_src2_f64 v255 offset:65535
 // CHECK: [0xff,0xff,0xa6,0xd9,0xff,0x00,0x00,0x00]
 
-ds_max_src2_f64 v0
-// CHECK: [0x00,0x00,0xa6,0xd9,0x00,0x00,0x00,0x00]
+ds_max_src2_f64 v1
+// CHECK: [0x00,0x00,0xa6,0xd9,0x01,0x00,0x00,0x00]
+
+ds_max_src2_f64 v1 offset:0
+// CHECK: [0x00,0x00,0xa6,0xd9,0x01,0x00,0x00,0x00]
+
+ds_max_src2_f64 v1 offset:4
+// CHECK: [0x04,0x00,0xa6,0xd9,0x01,0x00,0x00,0x00]
+
+ds_max_src2_f64 v1 offset:65535 gds
+// CHECK: [0xff,0xff,0xa7,0xd9,0x01,0x00,0x00,0x00]
+
+ds_and_src2_b32 v1
+// CHECK: [0x00,0x00,0x12,0xd9,0x01,0x00,0x00,0x00]
+
+ds_and_src2_b32 v1 gds
+// CHECK: [0x00,0x00,0x13,0xd9,0x01,0x00,0x00,0x00]
+
+ds_and_src2_b32 v255 offset:65535
+// CHECK: [0xff,0xff,0x12,0xd9,0xff,0x00,0x00,0x00]
+
+ds_append v5
+// CHECK: [0x00,0x00,0x7c,0xd9,0x00,0x00,0x00,0x05]
+
+ds_append v5 gds
+// CHECK: [0x00,0x00,0x7d,0xd9,0x00,0x00,0x00,0x05]
+
+ds_append v255 offset:65535
+// CHECK: [0xff,0xff,0x7c,0xd9,0x00,0x00,0x00,0xff]
+
+ds_consume v5
+// CHECK: [0x00,0x00,0x7a,0xd9,0x00,0x00,0x00,0x05]
+
+ds_consume v5 gds
+// CHECK: [0x00,0x00,0x7b,0xd9,0x00,0x00,0x00,0x05]
+
+ds_consume v255 offset:65535
+// CHECK: [0xff,0xff,0x7a,0xd9,0x00,0x00,0x00,0xff]
+
+ds_ordered_count v5, v1 gds
+// CHECK: [0x00,0x00,0x7f,0xd9,0x01,0x00,0x00,0x05]
+
+ds_ordered_count v5, v255 offset:65535 gds
+// CHECK: [0xff,0xff,0x7f,0xd9,0xff,0x00,0x00,0x05]
+
+ds_ordered_count v5, v255 gds
+// CHECK: [0x00,0x00,0x7f,0xd9,0xff,0x00,0x00,0x05]
+
+ds_gws_barrier v1 gds
+// CHECK: [0x00,0x00,0x3b,0xd9,0x00,0x01,0x00,0x00]
+
+ds_gws_barrier v255 offset:65535 gds
+// CHECK: [0xff,0xff,0x3b,0xd9,0x00,0xff,0x00,0x00]
+
+ds_gws_init v1 gds
+// CHECK: [0x00,0x00,0x33,0xd9,0x00,0x01,0x00,0x00]
+
+ds_gws_init v255 offset:65535 gds
+// CHECK: [0xff,0xff,0x33,0xd9,0x00,0xff,0x00,0x00]
+
+ds_gws_sema_br v1 gds
+// CHECK: [0x00,0x00,0x37,0xd9,0x00,0x01,0x00,0x00]
+
+ds_gws_sema_br v255 offset:65535 gds
+// CHECK: [0xff,0xff,0x37,0xd9,0x00,0xff,0x00,0x00]
+
+ds_gws_sema_p offset:65535 gds
+// CHECK: [0xff,0xff,0x39,0xd9,0x00,0x00,0x00,0x00]
+
+ds_gws_sema_p gds
+// CHECK: [0x00,0x00,0x39,0xd9,0x00,0x00,0x00,0x00]
+
+ds_gws_sema_release_all offset:65535 gds
+// CHECK: [0xff,0xff,0x31,0xd9,0x00,0x00,0x00,0x00]
+
+ds_gws_sema_release_all gds
+// CHECK: [0x00,0x00,0x31,0xd9,0x00,0x00,0x00,0x00]
+
+ds_gws_sema_v offset:65535 gds
+// CHECK: [0xff,0xff,0x35,0xd9,0x00,0x00,0x00,0x00]
+
+ds_gws_sema_v gds
+// CHECK: [0x00,0x00,0x35,0xd9,0x00,0x00,0x00,0x00]
+
+ds_wrap_rtn_b32 v5, v255, v2, v3 gds
+// CHECK: [0x00,0x00,0x69,0xd8,0xff,0x02,0x03,0x05]
+
+ds_wrap_rtn_b32 v5, v255, v2, v255 offset:65535
+// CHECK: [0xff,0xff,0x68,0xd8,0xff,0x02,0xff,0x05]
+
+ds_condxchg32_rtn_b64 v[5:6], v1, v[254:255] offset:65535 gds
+// CHECK: [0xff,0xff,0xfd,0xd8,0x01,0xfe,0x00,0x05]
+
+ds_condxchg32_rtn_b64 v[5:6], v1, v[254:255]
+// CHECK: [0x00,0x00,0xfc,0xd8,0x01,0xfe,0x00,0x05]
 
-ds_max_src2_f64 v0 offset:0
-// CHECK: [0x00,0x00,0xa6,0xd9,0x00,0x00,0x00,0x00]
+exp mrt0, v0, v0, v0, v0
+// CHECK: [0x0f,0x00,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-ds_max_src2_f64 v0 offset:4
-// CHECK: [0x04,0x00,0xa6,0xd9,0x00,0x00,0x00,0x00]
+exp mrtz, v0, v0, v0, v0
+// CHECK: [0x8f,0x00,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-ds_max_src2_f64 v0 offset:65535 gds
-// CHECK: [0xff,0xff,0xa7,0xd9,0x00,0x00,0x00,0x00]
+exp null, v0, v0, v0, v0
+// CHECK: [0x9f,0x00,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-flat_load_ubyte v0, v[0:1]
-// CHECK: [0x00,0x00,0x40,0xdc,0x00,0x00,0x00,0x00]
+exp pos0, v0, v0, v0, v0
+// CHECK: [0xcf,0x00,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-flat_load_ubyte v255, v[0:1]
-// CHECK: [0x00,0x00,0x40,0xdc,0x00,0x00,0x00,0xff]
+exp param0, v0, v0, v0, v0
+// CHECK: [0x0f,0x02,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-flat_load_ubyte v0, v[254:255]
-// CHECK: [0x00,0x00,0x40,0xdc,0xfe,0x00,0x00,0x00]
+exp mrt0, v255, v0, v0, v0
+// CHECK: [0x0f,0x00,0x00,0xc4,0xff,0x00,0x00,0x00]
 
-flat_load_ubyte v0, v[0:1] glc
-// CHECK: [0x00,0x00,0x41,0xdc,0x00,0x00,0x00,0x00]
+exp mrt0, v0, v255, v0, v0
+// CHECK: [0x0f,0x00,0x00,0xc4,0x00,0xff,0x00,0x00]
 
-flat_load_ubyte v0, v[0:1] slc
-// CHECK: [0x00,0x00,0x42,0xdc,0x00,0x00,0x00,0x00]
+exp mrt0, v0, v0, v255, v0
+// CHECK: [0x0f,0x00,0x00,0xc4,0x00,0x00,0xff,0x00]
 
-flat_load_sbyte v0, v[0:1]
-// CHECK: [0x00,0x00,0x44,0xdc,0x00,0x00,0x00,0x00]
+exp mrt0, v0, v0, v0, v255
+// CHECK: [0x0f,0x00,0x00,0xc4,0x00,0x00,0x00,0xff]
 
-flat_load_sbyte v255, v[0:1]
-// CHECK: [0x00,0x00,0x44,0xdc,0x00,0x00,0x00,0xff]
+exp mrt0, v0, off, off, off
+// CHECK: [0x01,0x00,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-flat_load_sbyte v0, v[254:255]
-// CHECK: [0x00,0x00,0x44,0xdc,0xfe,0x00,0x00,0x00]
+exp mrt0, off, v0, off, off
+// CHECK: [0x02,0x00,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-flat_load_sbyte v0, v[0:1] glc
-// CHECK: [0x00,0x00,0x45,0xdc,0x00,0x00,0x00,0x00]
+exp mrt0, v0, v0, off, off
+// CHECK: [0x03,0x00,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-flat_load_sbyte v0, v[0:1] slc
-// CHECK: [0x00,0x00,0x46,0xdc,0x00,0x00,0x00,0x00]
+exp mrt0, off, off, v0, off
+// CHECK: [0x04,0x00,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-flat_load_ushort v0, v[0:1]
-// CHECK: [0x00,0x00,0x48,0xdc,0x00,0x00,0x00,0x00]
+exp mrt0, v0, off, v0, off
+// CHECK: [0x05,0x00,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-flat_load_ushort v255, v[0:1]
-// CHECK: [0x00,0x00,0x48,0xdc,0x00,0x00,0x00,0xff]
+exp mrt0, off, v0, v0, off
+// CHECK: [0x06,0x00,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-flat_load_ushort v0, v[254:255]
-// CHECK: [0x00,0x00,0x48,0xdc,0xfe,0x00,0x00,0x00]
+exp mrt0, v0, v0, v0, off
+// CHECK: [0x07,0x00,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-flat_load_ushort v0, v[0:1] glc
-// CHECK: [0x00,0x00,0x49,0xdc,0x00,0x00,0x00,0x00]
+exp mrt0, off, off, off, v0
+// CHECK: [0x08,0x00,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-flat_load_ushort v0, v[0:1] slc
-// CHECK: [0x00,0x00,0x4a,0xdc,0x00,0x00,0x00,0x00]
+exp mrt0, v0, off, off, v0
+// CHECK: [0x09,0x00,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-flat_load_sshort v0, v[0:1]
-// CHECK: [0x00,0x00,0x4c,0xdc,0x00,0x00,0x00,0x00]
+exp mrt0, off, v0, off, v0
+// CHECK: [0x0a,0x00,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-flat_load_sshort v255, v[0:1]
-// CHECK: [0x00,0x00,0x4c,0xdc,0x00,0x00,0x00,0xff]
+exp mrt0, v0, v0, off, v0
+// CHECK: [0x0b,0x00,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-flat_load_sshort v0, v[254:255]
-// CHECK: [0x00,0x00,0x4c,0xdc,0xfe,0x00,0x00,0x00]
+exp mrt0, off, off, v0, v0
+// CHECK: [0x0c,0x00,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-flat_load_sshort v0, v[0:1] glc
-// CHECK: [0x00,0x00,0x4d,0xdc,0x00,0x00,0x00,0x00]
+exp mrt0, v0, off, v0, v0
+// CHECK: [0x0d,0x00,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-flat_load_sshort v0, v[0:1] slc
-// CHECK: [0x00,0x00,0x4e,0xdc,0x00,0x00,0x00,0x00]
+exp mrt0, off, v0, v0, v0
+// CHECK: [0x0e,0x00,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-flat_load_dword v0, v[0:1]
-// CHECK: [0x00,0x00,0x50,0xdc,0x00,0x00,0x00,0x00]
+exp mrt0, off, off, off, off
+// CHECK: [0x00,0x00,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-flat_load_dword v255, v[0:1]
-// CHECK: [0x00,0x00,0x50,0xdc,0x00,0x00,0x00,0xff]
+exp mrt0, v0, v0, v0, v0 vm
+// CHECK: [0x0f,0x10,0x00,0xc4,0x00,0x00,0x00,0x00]
 
-flat_load_dword v0, v[254:255]
-// CHECK: [0x00,0x00,0x50,0xdc,0xfe,0x00,0x00,0x00]
+flat_load_ubyte v5, v[1:2]
+// CHECK: [0x00,0x00,0x40,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dword v0, v[0:1] glc
-// CHECK: [0x00,0x00,0x51,0xdc,0x00,0x00,0x00,0x00]
+flat_load_ubyte v255, v[1:2]
+// CHECK: [0x00,0x00,0x40,0xdc,0x01,0x00,0x00,0xff]
 
-flat_load_dword v0, v[0:1] slc
-// CHECK: [0x00,0x00,0x52,0xdc,0x00,0x00,0x00,0x00]
+flat_load_ubyte v5, v[254:255]
+// CHECK: [0x00,0x00,0x40,0xdc,0xfe,0x00,0x00,0x05]
 
-flat_load_dwordx2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x54,0xdc,0x00,0x00,0x00,0x00]
+flat_load_ubyte v5, v[1:2] glc
+// CHECK: [0x00,0x00,0x41,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x54,0xdc,0x00,0x00,0x00,0xfe]
+flat_load_ubyte v5, v[1:2] slc
+// CHECK: [0x00,0x00,0x42,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x54,0xdc,0xfe,0x00,0x00,0x00]
+flat_load_sbyte v5, v[1:2]
+// CHECK: [0x00,0x00,0x44,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx2 v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x55,0xdc,0x00,0x00,0x00,0x00]
+flat_load_sbyte v255, v[1:2]
+// CHECK: [0x00,0x00,0x44,0xdc,0x01,0x00,0x00,0xff]
 
-flat_load_dwordx2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x56,0xdc,0x00,0x00,0x00,0x00]
+flat_load_sbyte v5, v[254:255]
+// CHECK: [0x00,0x00,0x44,0xdc,0xfe,0x00,0x00,0x05]
 
-flat_load_dwordx3 v[0:2], v[0:1]
-// CHECK: [0x00,0x00,0x58,0xdc,0x00,0x00,0x00,0x00]
+flat_load_sbyte v5, v[1:2] glc
+// CHECK: [0x00,0x00,0x45,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx3 v[253:255], v[0:1]
-// CHECK: [0x00,0x00,0x58,0xdc,0x00,0x00,0x00,0xfd]
+flat_load_sbyte v5, v[1:2] slc
+// CHECK: [0x00,0x00,0x46,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx3 v[0:2], v[254:255]
-// CHECK: [0x00,0x00,0x58,0xdc,0xfe,0x00,0x00,0x00]
+flat_load_ushort v5, v[1:2]
+// CHECK: [0x00,0x00,0x48,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx3 v[0:2], v[0:1] glc
-// CHECK: [0x00,0x00,0x59,0xdc,0x00,0x00,0x00,0x00]
+flat_load_ushort v255, v[1:2]
+// CHECK: [0x00,0x00,0x48,0xdc,0x01,0x00,0x00,0xff]
 
-flat_load_dwordx3 v[0:2], v[0:1] slc
-// CHECK: [0x00,0x00,0x5a,0xdc,0x00,0x00,0x00,0x00]
+flat_load_ushort v5, v[254:255]
+// CHECK: [0x00,0x00,0x48,0xdc,0xfe,0x00,0x00,0x05]
 
-flat_load_dwordx4 v[0:3], v[0:1]
-// CHECK: [0x00,0x00,0x5c,0xdc,0x00,0x00,0x00,0x00]
+flat_load_ushort v5, v[1:2] glc
+// CHECK: [0x00,0x00,0x49,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx4 v[252:255], v[0:1]
-// CHECK: [0x00,0x00,0x5c,0xdc,0x00,0x00,0x00,0xfc]
+flat_load_ushort v5, v[1:2] slc
+// CHECK: [0x00,0x00,0x4a,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx4 v[0:3], v[254:255]
-// CHECK: [0x00,0x00,0x5c,0xdc,0xfe,0x00,0x00,0x00]
+flat_load_sshort v5, v[1:2]
+// CHECK: [0x00,0x00,0x4c,0xdc,0x01,0x00,0x00,0x05]
 
-flat_load_dwordx4 v[0:3], v[0:1] glc
-// CHECK: [0x00,0x00,0x5d,0xdc,0x00,0x00,0x00,0x00]
+flat_load_sshort v255, v[1:2]
+// CHECK: [0x00,0x00,0x4c,0xdc,0x01,0x00,0x00,0xff]
 
-flat_load_dwordx4 v[0:3], v[0:1] slc
-// CHECK: [0x00,0x00,0x5e,0xdc,0x00,0x00,0x00,0x00]
+flat_load_sshort v5, v[254:255]
+// CHECK: [0x00,0x00,0x4c,0xdc,0xfe,0x00,0x00,0x05]
 
-flat_store_byte v[0:1], v0
-// CHECK: [0x00,0x00,0x60,0xdc,0x00,0x00,0x00,0x00]
+flat_load_sshort v5, v[1:2] glc
+// CHECK: [0x00,0x00,0x4d,0xdc,0x01,0x00,0x00,0x05]
 
-flat_store_byte v[254:255], v0
-// CHECK: [0x00,0x00,0x60,0xdc,0xfe,0x00,0x00,0x00]
+flat_load_sshort v5, v[1:2] slc
+// CHECK: [0x00,0x00,0x4e,0xdc,0x01,0x00,0x00,0x05]
 
-flat_store_byte v[0:1], v255
-// CHECK: [0x00,0x00,0x60,0xdc,0x00,0xff,0x00,0x00]
+flat_load_dword v5, v[1:2]
+// CHECK: [0x00,0x00,0x50,0xdc,0x01,0x00,0x00,0x05]
 
-flat_store_byte v[0:1], v0 glc
-// CHECK: [0x00,0x00,0x61,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dword v255, v[1:2]
+// CHECK: [0x00,0x00,0x50,0xdc,0x01,0x00,0x00,0xff]
 
-flat_store_byte v[0:1], v0 slc
-// CHECK: [0x00,0x00,0x62,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dword v5, v[254:255]
+// CHECK: [0x00,0x00,0x50,0xdc,0xfe,0x00,0x00,0x05]
 
-flat_store_short v[0:1], v0
-// CHECK: [0x00,0x00,0x68,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dword v5, v[1:2] glc
+// CHECK: [0x00,0x00,0x51,0xdc,0x01,0x00,0x00,0x05]
 
-flat_store_short v[254:255], v0
-// CHECK: [0x00,0x00,0x68,0xdc,0xfe,0x00,0x00,0x00]
+flat_load_dword v5, v[1:2] slc
+// CHECK: [0x00,0x00,0x52,0xdc,0x01,0x00,0x00,0x05]
 
-flat_store_short v[0:1], v255
-// CHECK: [0x00,0x00,0x68,0xdc,0x00,0xff,0x00,0x00]
+flat_load_dwordx2 v[5:6], v[1:2]
+// CHECK: [0x00,0x00,0x54,0xdc,0x01,0x00,0x00,0x05]
 
-flat_store_short v[0:1], v0 glc
-// CHECK: [0x00,0x00,0x69,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dwordx2 v[254:255], v[1:2]
+// CHECK: [0x00,0x00,0x54,0xdc,0x01,0x00,0x00,0xfe]
 
-flat_store_short v[0:1], v0 slc
-// CHECK: [0x00,0x00,0x6a,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dwordx2 v[5:6], v[254:255]
+// CHECK: [0x00,0x00,0x54,0xdc,0xfe,0x00,0x00,0x05]
 
-flat_store_dword v[0:1], v0
-// CHECK: [0x00,0x00,0x70,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dwordx2 v[5:6], v[1:2] glc
+// CHECK: [0x00,0x00,0x55,0xdc,0x01,0x00,0x00,0x05]
 
-flat_store_dword v[254:255], v0
-// CHECK: [0x00,0x00,0x70,0xdc,0xfe,0x00,0x00,0x00]
+flat_load_dwordx2 v[5:6], v[1:2] slc
+// CHECK: [0x00,0x00,0x56,0xdc,0x01,0x00,0x00,0x05]
 
-flat_store_dword v[0:1], v255
-// CHECK: [0x00,0x00,0x70,0xdc,0x00,0xff,0x00,0x00]
+flat_load_dwordx3 v[5:7], v[1:2]
+// CHECK: [0x00,0x00,0x58,0xdc,0x01,0x00,0x00,0x05]
 
-flat_store_dword v[0:1], v0 glc
-// CHECK: [0x00,0x00,0x71,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dwordx3 v[253:255], v[1:2]
+// CHECK: [0x00,0x00,0x58,0xdc,0x01,0x00,0x00,0xfd]
 
-flat_store_dword v[0:1], v0 slc
-// CHECK: [0x00,0x00,0x72,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dwordx3 v[5:7], v[254:255]
+// CHECK: [0x00,0x00,0x58,0xdc,0xfe,0x00,0x00,0x05]
 
-flat_store_dwordx2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x74,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dwordx3 v[5:7], v[1:2] glc
+// CHECK: [0x00,0x00,0x59,0xdc,0x01,0x00,0x00,0x05]
 
-flat_store_dwordx2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x74,0xdc,0xfe,0x00,0x00,0x00]
+flat_load_dwordx3 v[5:7], v[1:2] slc
+// CHECK: [0x00,0x00,0x5a,0xdc,0x01,0x00,0x00,0x05]
 
-flat_store_dwordx2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x74,0xdc,0x00,0xfe,0x00,0x00]
+flat_load_dwordx4 v[5:8], v[1:2]
+// CHECK: [0x00,0x00,0x5c,0xdc,0x01,0x00,0x00,0x05]
 
-flat_store_dwordx2 v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x75,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dwordx4 v[252:255], v[1:2]
+// CHECK: [0x00,0x00,0x5c,0xdc,0x01,0x00,0x00,0xfc]
 
-flat_store_dwordx2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x76,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dwordx4 v[5:8], v[254:255]
+// CHECK: [0x00,0x00,0x5c,0xdc,0xfe,0x00,0x00,0x05]
 
-flat_store_dwordx3 v[0:1], v[0:2]
-// CHECK: [0x00,0x00,0x78,0xdc,0x00,0x00,0x00,0x00]
+flat_load_dwordx4 v[5:8], v[1:2] glc
+// CHECK: [0x00,0x00,0x5d,0xdc,0x01,0x00,0x00,0x05]
 
-flat_store_dwordx3 v[254:255], v[0:2]
-// CHECK: [0x00,0x00,0x78,0xdc,0xfe,0x00,0x00,0x00]
+flat_load_dwordx4 v[5:8], v[1:2] slc
+// CHECK: [0x00,0x00,0x5e,0xdc,0x01,0x00,0x00,0x05]
 
-flat_store_dwordx3 v[0:1], v[253:255]
-// CHECK: [0x00,0x00,0x78,0xdc,0x00,0xfd,0x00,0x00]
+flat_store_byte v[1:2], v2
+// CHECK: [0x00,0x00,0x60,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dwordx3 v[0:1], v[0:2] glc
-// CHECK: [0x00,0x00,0x79,0xdc,0x00,0x00,0x00,0x00]
+flat_store_byte v[254:255], v2
+// CHECK: [0x00,0x00,0x60,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_store_dwordx3 v[0:1], v[0:2] slc
-// CHECK: [0x00,0x00,0x7a,0xdc,0x00,0x00,0x00,0x00]
+flat_store_byte v[1:2], v255
+// CHECK: [0x00,0x00,0x60,0xdc,0x01,0xff,0x00,0x00]
 
-flat_store_dwordx4 v[0:1], v[0:3]
-// CHECK: [0x00,0x00,0x7c,0xdc,0x00,0x00,0x00,0x00]
+flat_store_byte v[1:2], v2 glc
+// CHECK: [0x00,0x00,0x61,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dwordx4 v[254:255], v[0:3]
-// CHECK: [0x00,0x00,0x7c,0xdc,0xfe,0x00,0x00,0x00]
+flat_store_byte v[1:2], v2 slc
+// CHECK: [0x00,0x00,0x62,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dwordx4 v[0:1], v[252:255]
-// CHECK: [0x00,0x00,0x7c,0xdc,0x00,0xfc,0x00,0x00]
+flat_store_short v[1:2], v2
+// CHECK: [0x00,0x00,0x68,0xdc,0x01,0x02,0x00,0x00]
 
-flat_store_dwordx4 v[0:1], v[0:3] glc
-// CHECK: [0x00,0x00,0x7d,0xdc,0x00,0x00,0x00,0x00]
+flat_store_short v[254:255], v2
+// CHECK: [0x00,0x00,0x68,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_store_dwordx4 v[0:1], v[0:3] slc
-// CHECK: [0x00,0x00,0x7e,0xdc,0x00,0x00,0x00,0x00]
+flat_store_short v[1:2], v255
+// CHECK: [0x00,0x00,0x68,0xdc,0x01,0xff,0x00,0x00]
 
-flat_atomic_swap v[0:1], v0
-// CHECK: [0x00,0x00,0x00,0xdd,0x00,0x00,0x00,0x00]
+flat_store_short v[1:2], v2 glc
+// CHECK: [0x00,0x00,0x69,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_swap v[254:255], v0
-// CHECK: [0x00,0x00,0x00,0xdd,0xfe,0x00,0x00,0x00]
+flat_store_short v[1:2], v2 slc
+// CHECK: [0x00,0x00,0x6a,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_swap v[0:1], v255
-// CHECK: [0x00,0x00,0x00,0xdd,0x00,0xff,0x00,0x00]
+flat_store_dword v[1:2], v2
+// CHECK: [0x00,0x00,0x70,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_swap v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0x01,0xdd,0x00,0x00,0x00,0x00]
+flat_store_dword v[254:255], v2
+// CHECK: [0x00,0x00,0x70,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_atomic_swap v[0:1], v0 slc
-// CHECK: [0x00,0x00,0x02,0xdd,0x00,0x00,0x00,0x00]
+flat_store_dword v[1:2], v255
+// CHECK: [0x00,0x00,0x70,0xdc,0x01,0xff,0x00,0x00]
 
-flat_atomic_cmpswap v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x04,0xdd,0x00,0x00,0x00,0x00]
+flat_store_dword v[1:2], v2 glc
+// CHECK: [0x00,0x00,0x71,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_cmpswap v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x04,0xdd,0xfe,0x00,0x00,0x00]
+flat_store_dword v[1:2], v2 slc
+// CHECK: [0x00,0x00,0x72,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_cmpswap v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x04,0xdd,0x00,0xfe,0x00,0x00]
+flat_store_dwordx2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x74,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_cmpswap v0, v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x05,0xdd,0x00,0x00,0x00,0x00]
+flat_store_dwordx2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x74,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_atomic_cmpswap v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x06,0xdd,0x00,0x00,0x00,0x00]
+flat_store_dwordx2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x74,0xdc,0x01,0xfe,0x00,0x00]
 
-flat_atomic_add v[0:1], v0
-// CHECK: [0x00,0x00,0x08,0xdd,0x00,0x00,0x00,0x00]
+flat_store_dwordx2 v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x75,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_add v[254:255], v0
-// CHECK: [0x00,0x00,0x08,0xdd,0xfe,0x00,0x00,0x00]
+flat_store_dwordx2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x76,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_add v[0:1], v255
-// CHECK: [0x00,0x00,0x08,0xdd,0x00,0xff,0x00,0x00]
+flat_store_dwordx3 v[1:2], v[2:4]
+// CHECK: [0x00,0x00,0x78,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_add v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0x09,0xdd,0x00,0x00,0x00,0x00]
+flat_store_dwordx3 v[254:255], v[2:4]
+// CHECK: [0x00,0x00,0x78,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_atomic_add v[0:1], v0 slc
-// CHECK: [0x00,0x00,0x0a,0xdd,0x00,0x00,0x00,0x00]
+flat_store_dwordx3 v[1:2], v[253:255]
+// CHECK: [0x00,0x00,0x78,0xdc,0x01,0xfd,0x00,0x00]
 
-flat_atomic_sub v[0:1], v0
-// CHECK: [0x00,0x00,0x0c,0xdd,0x00,0x00,0x00,0x00]
+flat_store_dwordx3 v[1:2], v[2:4] glc
+// CHECK: [0x00,0x00,0x79,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_sub v[254:255], v0
-// CHECK: [0x00,0x00,0x0c,0xdd,0xfe,0x00,0x00,0x00]
+flat_store_dwordx3 v[1:2], v[2:4] slc
+// CHECK: [0x00,0x00,0x7a,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_sub v[0:1], v255
-// CHECK: [0x00,0x00,0x0c,0xdd,0x00,0xff,0x00,0x00]
+flat_store_dwordx4 v[1:2], v[2:5]
+// CHECK: [0x00,0x00,0x7c,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_sub v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0x0d,0xdd,0x00,0x00,0x00,0x00]
+flat_store_dwordx4 v[254:255], v[2:5]
+// CHECK: [0x00,0x00,0x7c,0xdc,0xfe,0x02,0x00,0x00]
 
-flat_atomic_sub v[0:1], v0 slc
-// CHECK: [0x00,0x00,0x0e,0xdd,0x00,0x00,0x00,0x00]
+flat_store_dwordx4 v[1:2], v[252:255]
+// CHECK: [0x00,0x00,0x7c,0xdc,0x01,0xfc,0x00,0x00]
 
-flat_atomic_smin v[0:1], v0
-// CHECK: [0x00,0x00,0x10,0xdd,0x00,0x00,0x00,0x00]
+flat_store_dwordx4 v[1:2], v[2:5] glc
+// CHECK: [0x00,0x00,0x7d,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_smin v[254:255], v0
-// CHECK: [0x00,0x00,0x10,0xdd,0xfe,0x00,0x00,0x00]
+flat_store_dwordx4 v[1:2], v[2:5] slc
+// CHECK: [0x00,0x00,0x7e,0xdc,0x01,0x02,0x00,0x00]
 
-flat_atomic_smin v[0:1], v255
-// CHECK: [0x00,0x00,0x10,0xdd,0x00,0xff,0x00,0x00]
+flat_atomic_swap v[1:2], v2
+// CHECK: [0x00,0x00,0x00,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_smin v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0x11,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_swap v[254:255], v2
+// CHECK: [0x00,0x00,0x00,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_smin v[0:1], v0 slc
-// CHECK: [0x00,0x00,0x12,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_swap v[1:2], v255
+// CHECK: [0x00,0x00,0x00,0xdd,0x01,0xff,0x00,0x00]
 
-flat_atomic_umin v[0:1], v0
-// CHECK: [0x00,0x00,0x14,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_swap v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0x01,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_umin v[254:255], v0
-// CHECK: [0x00,0x00,0x14,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_swap v[1:2], v2 slc
+// CHECK: [0x00,0x00,0x02,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_umin v[0:1], v255
-// CHECK: [0x00,0x00,0x14,0xdd,0x00,0xff,0x00,0x00]
+flat_atomic_cmpswap v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x04,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_umin v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0x15,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_cmpswap v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x04,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_umin v[0:1], v0 slc
-// CHECK: [0x00,0x00,0x16,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_cmpswap v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x04,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_smax v[0:1], v0
-// CHECK: [0x00,0x00,0x18,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_cmpswap v0, v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x05,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_smax v[254:255], v0
-// CHECK: [0x00,0x00,0x18,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_cmpswap v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x06,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_smax v[0:1], v255
-// CHECK: [0x00,0x00,0x18,0xdd,0x00,0xff,0x00,0x00]
+flat_atomic_add v[1:2], v2
+// CHECK: [0x00,0x00,0x08,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_smax v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0x19,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_add v[254:255], v2
+// CHECK: [0x00,0x00,0x08,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_smax v[0:1], v0 slc
-// CHECK: [0x00,0x00,0x1a,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_add v[1:2], v255
+// CHECK: [0x00,0x00,0x08,0xdd,0x01,0xff,0x00,0x00]
 
-flat_atomic_umax v[0:1], v0
-// CHECK: [0x00,0x00,0x1c,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_add v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0x09,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_umax v[254:255], v0
-// CHECK: [0x00,0x00,0x1c,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_add v[1:2], v2 slc
+// CHECK: [0x00,0x00,0x0a,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_umax v[0:1], v255
-// CHECK: [0x00,0x00,0x1c,0xdd,0x00,0xff,0x00,0x00]
+flat_atomic_sub v[1:2], v2
+// CHECK: [0x00,0x00,0x0c,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_umax v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0x1d,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_sub v[254:255], v2
+// CHECK: [0x00,0x00,0x0c,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_umax v[0:1], v0 slc
-// CHECK: [0x00,0x00,0x1e,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_sub v[1:2], v255
+// CHECK: [0x00,0x00,0x0c,0xdd,0x01,0xff,0x00,0x00]
 
-flat_atomic_and v[0:1], v0
-// CHECK: [0x00,0x00,0x20,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_sub v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0x0d,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_and v[254:255], v0
-// CHECK: [0x00,0x00,0x20,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_sub v[1:2], v2 slc
+// CHECK: [0x00,0x00,0x0e,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_and v[0:1], v255
-// CHECK: [0x00,0x00,0x20,0xdd,0x00,0xff,0x00,0x00]
+flat_atomic_smin v[1:2], v2
+// CHECK: [0x00,0x00,0x10,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_and v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0x21,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_smin v[254:255], v2
+// CHECK: [0x00,0x00,0x10,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_and v[0:1], v0 slc
-// CHECK: [0x00,0x00,0x22,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_smin v[1:2], v255
+// CHECK: [0x00,0x00,0x10,0xdd,0x01,0xff,0x00,0x00]
 
-flat_atomic_or v[0:1], v0
-// CHECK: [0x00,0x00,0x24,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_smin v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0x11,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_or v[254:255], v0
-// CHECK: [0x00,0x00,0x24,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_smin v[1:2], v2 slc
+// CHECK: [0x00,0x00,0x12,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_or v[0:1], v255
-// CHECK: [0x00,0x00,0x24,0xdd,0x00,0xff,0x00,0x00]
+flat_atomic_umin v[1:2], v2
+// CHECK: [0x00,0x00,0x14,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_or v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0x25,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_umin v[254:255], v2
+// CHECK: [0x00,0x00,0x14,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_or v[0:1], v0 slc
-// CHECK: [0x00,0x00,0x26,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_umin v[1:2], v255
+// CHECK: [0x00,0x00,0x14,0xdd,0x01,0xff,0x00,0x00]
 
-flat_atomic_xor v[0:1], v0
-// CHECK: [0x00,0x00,0x28,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_umin v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0x15,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_xor v[254:255], v0
-// CHECK: [0x00,0x00,0x28,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_umin v[1:2], v2 slc
+// CHECK: [0x00,0x00,0x16,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_xor v[0:1], v255
-// CHECK: [0x00,0x00,0x28,0xdd,0x00,0xff,0x00,0x00]
+flat_atomic_smax v[1:2], v2
+// CHECK: [0x00,0x00,0x18,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_xor v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0x29,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_smax v[254:255], v2
+// CHECK: [0x00,0x00,0x18,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_xor v[0:1], v0 slc
-// CHECK: [0x00,0x00,0x2a,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_smax v[1:2], v255
+// CHECK: [0x00,0x00,0x18,0xdd,0x01,0xff,0x00,0x00]
 
-flat_atomic_inc v[0:1], v0
-// CHECK: [0x00,0x00,0x2c,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_smax v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0x19,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_inc v[254:255], v0
-// CHECK: [0x00,0x00,0x2c,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_smax v[1:2], v2 slc
+// CHECK: [0x00,0x00,0x1a,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_inc v[0:1], v255
-// CHECK: [0x00,0x00,0x2c,0xdd,0x00,0xff,0x00,0x00]
+flat_atomic_umax v[1:2], v2
+// CHECK: [0x00,0x00,0x1c,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_inc v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0x2d,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_umax v[254:255], v2
+// CHECK: [0x00,0x00,0x1c,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_inc v[0:1], v0 slc
-// CHECK: [0x00,0x00,0x2e,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_umax v[1:2], v255
+// CHECK: [0x00,0x00,0x1c,0xdd,0x01,0xff,0x00,0x00]
 
-flat_atomic_dec v[0:1], v0
-// CHECK: [0x00,0x00,0x30,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_umax v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0x1d,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_dec v[254:255], v0
-// CHECK: [0x00,0x00,0x30,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_umax v[1:2], v2 slc
+// CHECK: [0x00,0x00,0x1e,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_dec v[0:1], v255
-// CHECK: [0x00,0x00,0x30,0xdd,0x00,0xff,0x00,0x00]
+flat_atomic_and v[1:2], v2
+// CHECK: [0x00,0x00,0x20,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_dec v0, v[0:1], v0 glc
-// CHECK: [0x00,0x00,0x31,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_and v[254:255], v2
+// CHECK: [0x00,0x00,0x20,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_dec v[0:1], v0 slc
-// CHECK: [0x00,0x00,0x32,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_and v[1:2], v255
+// CHECK: [0x00,0x00,0x20,0xdd,0x01,0xff,0x00,0x00]
 
-flat_atomic_swap_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x80,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_and v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0x21,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_swap_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x80,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_and v[1:2], v2 slc
+// CHECK: [0x00,0x00,0x22,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_swap_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x80,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_or v[1:2], v2
+// CHECK: [0x00,0x00,0x24,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_swap_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x81,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_or v[254:255], v2
+// CHECK: [0x00,0x00,0x24,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_swap_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x82,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_or v[1:2], v255
+// CHECK: [0x00,0x00,0x24,0xdd,0x01,0xff,0x00,0x00]
 
-flat_atomic_cmpswap_x2 v[0:1], v[0:3]
-// CHECK: [0x00,0x00,0x84,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_or v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0x25,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_cmpswap_x2 v[254:255], v[0:3]
-// CHECK: [0x00,0x00,0x84,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_or v[1:2], v2 slc
+// CHECK: [0x00,0x00,0x26,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_cmpswap_x2 v[0:1], v[252:255]
-// CHECK: [0x00,0x00,0x84,0xdd,0x00,0xfc,0x00,0x00]
+flat_atomic_xor v[1:2], v2
+// CHECK: [0x00,0x00,0x28,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_cmpswap_x2 v[0:1], v[0:1], v[0:3] glc
-// CHECK: [0x00,0x00,0x85,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_xor v[254:255], v2
+// CHECK: [0x00,0x00,0x28,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_cmpswap_x2 v[0:1], v[0:3] slc
-// CHECK: [0x00,0x00,0x86,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_xor v[1:2], v255
+// CHECK: [0x00,0x00,0x28,0xdd,0x01,0xff,0x00,0x00]
 
-flat_atomic_add_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x88,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_xor v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0x29,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_add_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x88,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_xor v[1:2], v2 slc
+// CHECK: [0x00,0x00,0x2a,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_add_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x88,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_inc v[1:2], v2
+// CHECK: [0x00,0x00,0x2c,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_add_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x89,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_inc v[254:255], v2
+// CHECK: [0x00,0x00,0x2c,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_add_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x8a,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_inc v[1:2], v255
+// CHECK: [0x00,0x00,0x2c,0xdd,0x01,0xff,0x00,0x00]
 
-flat_atomic_sub_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x8c,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_inc v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0x2d,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_sub_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x8c,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_inc v[1:2], v2 slc
+// CHECK: [0x00,0x00,0x2e,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_sub_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x8c,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_dec v[1:2], v2
+// CHECK: [0x00,0x00,0x30,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_sub_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x8d,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_dec v[254:255], v2
+// CHECK: [0x00,0x00,0x30,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_sub_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x8e,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_dec v[1:2], v255
+// CHECK: [0x00,0x00,0x30,0xdd,0x01,0xff,0x00,0x00]
 
-flat_atomic_smin_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x90,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_dec v0, v[1:2], v2 glc
+// CHECK: [0x00,0x00,0x31,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_smin_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x90,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_dec v[1:2], v2 slc
+// CHECK: [0x00,0x00,0x32,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_smin_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x90,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_swap_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x80,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_smin_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x91,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_swap_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x80,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_smin_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x92,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_swap_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x80,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_umin_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x94,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_swap_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x81,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_umin_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x94,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_swap_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x82,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_umin_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x94,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_cmpswap_x2 v[1:2], v[2:5]
+// CHECK: [0x00,0x00,0x84,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_umin_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x95,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_cmpswap_x2 v[254:255], v[2:5]
+// CHECK: [0x00,0x00,0x84,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_umin_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x96,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_cmpswap_x2 v[1:2], v[252:255]
+// CHECK: [0x00,0x00,0x84,0xdd,0x01,0xfc,0x00,0x00]
 
-flat_atomic_smax_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x98,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_cmpswap_x2 v[0:1], v[1:2], v[2:5] glc
+// CHECK: [0x00,0x00,0x85,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_smax_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x98,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_cmpswap_x2 v[1:2], v[2:5] slc
+// CHECK: [0x00,0x00,0x86,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_smax_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x98,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_add_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x88,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_smax_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x99,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_add_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x88,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_smax_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x9a,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_add_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x88,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_umax_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x9c,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_add_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x89,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_umax_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0x9c,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_add_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x8a,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_umax_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x9c,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_sub_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x8c,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_umax_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0x9d,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_sub_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x8c,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_umax_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0x9e,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_sub_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x8c,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_and_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xa0,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_sub_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x8d,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_and_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0xa0,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_sub_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x8e,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_and_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xa0,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_smin_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x90,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_and_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0xa1,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_smin_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x90,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_and_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0xa2,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_smin_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x90,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_or_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xa4,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_smin_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x91,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_or_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0xa4,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_smin_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x92,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_or_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xa4,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_umin_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x94,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_or_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0xa5,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_umin_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x94,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_or_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0xa6,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_umin_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x94,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_xor_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xa8,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_umin_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x95,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_xor_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0xa8,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_umin_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x96,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_xor_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xa8,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_smax_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x98,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_xor_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0xa9,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_smax_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x98,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_xor_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0xaa,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_smax_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x98,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_inc_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xac,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_smax_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x99,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_inc_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0xac,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_smax_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x9a,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_inc_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xac,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_umax_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0x9c,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_inc_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0xad,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_umax_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0x9c,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_inc_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0xae,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_umax_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0x9c,0xdd,0x01,0xfe,0x00,0x00]
 
-flat_atomic_dec_x2 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xb0,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_umax_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0x9d,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_dec_x2 v[254:255], v[0:1]
-// CHECK: [0x00,0x00,0xb0,0xdd,0xfe,0x00,0x00,0x00]
+flat_atomic_umax_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0x9e,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_dec_x2 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xb0,0xdd,0x00,0xfe,0x00,0x00]
+flat_atomic_and_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0xa0,0xdd,0x01,0x02,0x00,0x00]
 
-flat_atomic_dec_x2 v[0:1], v[0:1], v[0:1] glc
-// CHECK: [0x00,0x00,0xb1,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_and_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0xa0,0xdd,0xfe,0x02,0x00,0x00]
 
-flat_atomic_dec_x2 v[0:1], v[0:1] slc
-// CHECK: [0x00,0x00,0xb2,0xdd,0x00,0x00,0x00,0x00]
+flat_atomic_and_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0xa0,0xdd,0x01,0xfe,0x00,0x00]
 
-image_load v0, v[0:3], s[0:7] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf0,0x00,0x00,0x00,0x00]
+flat_atomic_and_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0xa1,0xdd,0x01,0x02,0x00,0x00]
 
-image_load v252, v[0:3], s[0:7] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf0,0x00,0xfc,0x00,0x00]
+flat_atomic_and_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0xa2,0xdd,0x01,0x02,0x00,0x00]
 
-image_load v0, v[252:255], s[0:7] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf0,0xfc,0x00,0x00,0x00]
+flat_atomic_or_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0xa4,0xdd,0x01,0x02,0x00,0x00]
 
-image_load v0, v[0:3], s[4:11] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf0,0x00,0x00,0x01,0x00]
+flat_atomic_or_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0xa4,0xdd,0xfe,0x02,0x00,0x00]
 
-image_load v0, v[0:3], s[92:99] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf0,0x00,0x00,0x17,0x00]
+flat_atomic_or_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0xa4,0xdd,0x01,0xfe,0x00,0x00]
 
-image_load v0, v[0:3], s[0:7] dmask:0x2
-// CHECK: [0x00,0x02,0x00,0xf0,0x00,0x00,0x00,0x00]
+flat_atomic_or_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0xa5,0xdd,0x01,0x02,0x00,0x00]
 
-image_load v[0:1], v[0:3], s[0:7] dmask:0x3
-// CHECK: [0x00,0x03,0x00,0xf0,0x00,0x00,0x00,0x00]
+flat_atomic_or_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0xa6,0xdd,0x01,0x02,0x00,0x00]
 
-image_load v0, v[0:3], s[0:7] dmask:0x4
-// CHECK: [0x00,0x04,0x00,0xf0,0x00,0x00,0x00,0x00]
+flat_atomic_xor_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0xa8,0xdd,0x01,0x02,0x00,0x00]
 
-image_load v[0:1], v[0:3], s[0:7] dmask:0x5
-// CHECK: [0x00,0x05,0x00,0xf0,0x00,0x00,0x00,0x00]
+flat_atomic_xor_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0xa8,0xdd,0xfe,0x02,0x00,0x00]
 
-image_load v[0:1], v[0:3], s[0:7] dmask:0x6
-// CHECK: [0x00,0x06,0x00,0xf0,0x00,0x00,0x00,0x00]
+flat_atomic_xor_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0xa8,0xdd,0x01,0xfe,0x00,0x00]
 
-image_load v[0:2], v[0:3], s[0:7] dmask:0x7
-// CHECK: [0x00,0x07,0x00,0xf0,0x00,0x00,0x00,0x00]
+flat_atomic_xor_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0xa9,0xdd,0x01,0x02,0x00,0x00]
 
-image_load v0, v[0:3], s[0:7] dmask:0x8
-// CHECK: [0x00,0x08,0x00,0xf0,0x00,0x00,0x00,0x00]
+flat_atomic_xor_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0xaa,0xdd,0x01,0x02,0x00,0x00]
 
-image_load v[0:1], v[0:3], s[0:7] dmask:0x9
-// CHECK: [0x00,0x09,0x00,0xf0,0x00,0x00,0x00,0x00]
+flat_atomic_inc_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0xac,0xdd,0x01,0x02,0x00,0x00]
 
-image_load v[0:1], v[0:3], s[0:7] dmask:0xa
-// CHECK: [0x00,0x0a,0x00,0xf0,0x00,0x00,0x00,0x00]
+flat_atomic_inc_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0xac,0xdd,0xfe,0x02,0x00,0x00]
 
-image_load v[0:2], v[0:3], s[0:7] dmask:0xb
-// CHECK: [0x00,0x0b,0x00,0xf0,0x00,0x00,0x00,0x00]
+flat_atomic_inc_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0xac,0xdd,0x01,0xfe,0x00,0x00]
 
-image_load v[0:1], v[0:3], s[0:7] dmask:0xc
-// CHECK: [0x00,0x0c,0x00,0xf0,0x00,0x00,0x00,0x00]
+flat_atomic_inc_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0xad,0xdd,0x01,0x02,0x00,0x00]
 
-image_load v[0:2], v[0:3], s[0:7] dmask:0xd
-// CHECK: [0x00,0x0d,0x00,0xf0,0x00,0x00,0x00,0x00]
+flat_atomic_inc_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0xae,0xdd,0x01,0x02,0x00,0x00]
 
-image_load v[0:2], v[0:3], s[0:7] dmask:0xe
-// CHECK: [0x00,0x0e,0x00,0xf0,0x00,0x00,0x00,0x00]
+flat_atomic_dec_x2 v[1:2], v[2:3]
+// CHECK: [0x00,0x00,0xb0,0xdd,0x01,0x02,0x00,0x00]
 
-image_load v[0:3], v[0:3], s[0:7] dmask:0xf
-// CHECK: [0x00,0x0f,0x00,0xf0,0x00,0x00,0x00,0x00]
+flat_atomic_dec_x2 v[254:255], v[2:3]
+// CHECK: [0x00,0x00,0xb0,0xdd,0xfe,0x02,0x00,0x00]
 
-image_load v0, v[0:3], s[0:7] dmask:0x0
-// CHECK: [0x00,0x00,0x00,0xf0,0x00,0x00,0x00,0x00]
+flat_atomic_dec_x2 v[1:2], v[254:255]
+// CHECK: [0x00,0x00,0xb0,0xdd,0x01,0xfe,0x00,0x00]
 
-image_load v0, v[0:3], s[0:7] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x00,0xf0,0x00,0x00,0x00,0x00]
+flat_atomic_dec_x2 v[0:1], v[1:2], v[2:3] glc
+// CHECK: [0x00,0x00,0xb1,0xdd,0x01,0x02,0x00,0x00]
 
-image_load_mip v0, v[0:3], s[0:7] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf0,0x00,0x00,0x00,0x00]
+flat_atomic_dec_x2 v[1:2], v[2:3] slc
+// CHECK: [0x00,0x00,0xb2,0xdd,0x01,0x02,0x00,0x00]
 
-image_load_mip v252, v[0:3], s[0:7] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf0,0x00,0xfc,0x00,0x00]
+image_load v5, v[1:4], s[8:15] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v0, v[252:255], s[0:7] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf0,0xfc,0x00,0x00,0x00]
+image_load v252, v[1:4], s[8:15] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf0,0x01,0xfc,0x02,0x00]
 
-image_load_mip v0, v[0:3], s[4:11] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf0,0x00,0x00,0x01,0x00]
+image_load v5, v[252:255], s[8:15] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf0,0xfc,0x05,0x02,0x00]
 
-image_load_mip v0, v[0:3], s[92:99] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf0,0x00,0x00,0x17,0x00]
+image_load v5, v[1:4], s[12:19] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf0,0x01,0x05,0x03,0x00]
 
-image_load_mip v0, v[0:3], s[0:7] dmask:0x2
-// CHECK: [0x00,0x02,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load v5, v[1:4], s[92:99] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf0,0x01,0x05,0x17,0x00]
 
-image_load_mip v[0:1], v[0:3], s[0:7] dmask:0x3
-// CHECK: [0x00,0x03,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load v5, v[1:4], s[8:15] dmask:0x2
+// CHECK: [0x00,0x02,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v0, v[0:3], s[0:7] dmask:0x4
-// CHECK: [0x00,0x04,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:6], v[1:4], s[8:15] dmask:0x3
+// CHECK: [0x00,0x03,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:1], v[0:3], s[0:7] dmask:0x5
-// CHECK: [0x00,0x05,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load v5, v[1:4], s[8:15] dmask:0x4
+// CHECK: [0x00,0x04,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:1], v[0:3], s[0:7] dmask:0x6
-// CHECK: [0x00,0x06,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:6], v[1:4], s[8:15] dmask:0x5
+// CHECK: [0x00,0x05,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:2], v[0:3], s[0:7] dmask:0x7
-// CHECK: [0x00,0x07,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:6], v[1:4], s[8:15] dmask:0x6
+// CHECK: [0x00,0x06,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v0, v[0:3], s[0:7] dmask:0x8
-// CHECK: [0x00,0x08,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:7], v[1:4], s[8:15] dmask:0x7
+// CHECK: [0x00,0x07,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:1], v[0:3], s[0:7] dmask:0x9
-// CHECK: [0x00,0x09,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load v5, v[1:4], s[8:15] dmask:0x8
+// CHECK: [0x00,0x08,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:1], v[0:3], s[0:7] dmask:0xa
-// CHECK: [0x00,0x0a,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:6], v[1:4], s[8:15] dmask:0x9
+// CHECK: [0x00,0x09,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:2], v[0:3], s[0:7] dmask:0xb
-// CHECK: [0x00,0x0b,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:6], v[1:4], s[8:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:1], v[0:3], s[0:7] dmask:0xc
-// CHECK: [0x00,0x0c,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:7], v[1:4], s[8:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:2], v[0:3], s[0:7] dmask:0xd
-// CHECK: [0x00,0x0d,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:6], v[1:4], s[8:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:2], v[0:3], s[0:7] dmask:0xe
-// CHECK: [0x00,0x0e,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:7], v[1:4], s[8:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v[0:3], v[0:3], s[0:7] dmask:0xf
-// CHECK: [0x00,0x0f,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:7], v[1:4], s[8:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v0, v[0:3], s[0:7] dmask:0x0
-// CHECK: [0x00,0x00,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load v[5:8], v[1:4], s[8:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_load_mip v0, v[0:3], s[0:7] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x04,0xf0,0x00,0x00,0x00,0x00]
+image_load v5, v[1:4], s[8:15] dmask:0x0
+// CHECK: [0x00,0x00,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_store v0, v[0:3], s[0:7] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_load v5, v[1:4], s[8:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x00,0xf0,0x01,0x05,0x02,0x00]
 
-image_store v252, v[0:3], s[0:7] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x20,0xf0,0x00,0xfc,0x00,0x00]
+image_load_mip v5, v[1:4], s[8:15] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_store v0, v[252:255], s[0:7] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x20,0xf0,0xfc,0x00,0x00,0x00]
+image_load_mip v252, v[1:4], s[8:15] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf0,0x01,0xfc,0x02,0x00]
 
-image_store v0, v[0:3], s[4:11] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x20,0xf0,0x00,0x00,0x01,0x00]
+image_load_mip v5, v[252:255], s[8:15] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf0,0xfc,0x05,0x02,0x00]
 
-image_store v0, v[0:3], s[92:99] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x20,0xf0,0x00,0x00,0x17,0x00]
+image_load_mip v5, v[1:4], s[12:19] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf0,0x01,0x05,0x03,0x00]
 
-image_store v0, v[0:3], s[0:7] dmask:0x2 unorm
-// CHECK: [0x00,0x12,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v5, v[1:4], s[92:99] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf0,0x01,0x05,0x17,0x00]
 
-image_store v[0:1], v[0:3], s[0:7] dmask:0x3 unorm
-// CHECK: [0x00,0x13,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v5, v[1:4], s[8:15] dmask:0x2
+// CHECK: [0x00,0x02,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_store v0, v[0:3], s[0:7] dmask:0x4 unorm
-// CHECK: [0x00,0x14,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:6], v[1:4], s[8:15] dmask:0x3
+// CHECK: [0x00,0x03,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_store v[0:1], v[0:3], s[0:7] dmask:0x5 unorm
-// CHECK: [0x00,0x15,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v5, v[1:4], s[8:15] dmask:0x4
+// CHECK: [0x00,0x04,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_store v[0:1], v[0:3], s[0:7] dmask:0x6 unorm
-// CHECK: [0x00,0x16,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:6], v[1:4], s[8:15] dmask:0x5
+// CHECK: [0x00,0x05,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_store v[0:2], v[0:3], s[0:7] dmask:0x7 unorm
-// CHECK: [0x00,0x17,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:6], v[1:4], s[8:15] dmask:0x6
+// CHECK: [0x00,0x06,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_store v0, v[0:3], s[0:7] dmask:0x8 unorm
-// CHECK: [0x00,0x18,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:7], v[1:4], s[8:15] dmask:0x7
+// CHECK: [0x00,0x07,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_store v[0:1], v[0:3], s[0:7] dmask:0x9 unorm
-// CHECK: [0x00,0x19,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v5, v[1:4], s[8:15] dmask:0x8
+// CHECK: [0x00,0x08,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_store v[0:1], v[0:3], s[0:7] dmask:0xa unorm
-// CHECK: [0x00,0x1a,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:6], v[1:4], s[8:15] dmask:0x9
+// CHECK: [0x00,0x09,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_store v[0:2], v[0:3], s[0:7] dmask:0xb unorm
-// CHECK: [0x00,0x1b,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:6], v[1:4], s[8:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_store v[0:1], v[0:3], s[0:7] dmask:0xc unorm
-// CHECK: [0x00,0x1c,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:7], v[1:4], s[8:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_store v[0:2], v[0:3], s[0:7] dmask:0xd unorm
-// CHECK: [0x00,0x1d,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:6], v[1:4], s[8:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_store v[0:2], v[0:3], s[0:7] dmask:0xe unorm
-// CHECK: [0x00,0x1e,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:7], v[1:4], s[8:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_store v[0:3], v[0:3], s[0:7] dmask:0xf unorm
-// CHECK: [0x00,0x1f,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:7], v[1:4], s[8:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_store v0, v[0:3], s[0:7] dmask:0x0 unorm
-// CHECK: [0x00,0x10,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v[5:8], v[1:4], s[8:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_store v0, v[0:3], s[0:7] dmask:0x1 unorm glc
-// CHECK: [0x00,0x31,0x20,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v5, v[1:4], s[8:15] dmask:0x0
+// CHECK: [0x00,0x00,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_store_mip v0, v[0:3], s[0:7] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_load_mip v5, v[1:4], s[8:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x04,0xf0,0x01,0x05,0x02,0x00]
 
-image_store_mip v252, v[0:3], s[0:7] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x24,0xf0,0x00,0xfc,0x00,0x00]
+image_store v1, v[2:5], s[12:19] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v0, v[252:255], s[0:7] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x24,0xf0,0xfc,0x00,0x00,0x00]
+image_store v252, v[2:5], s[12:19] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x20,0xf0,0x02,0xfc,0x03,0x00]
 
-image_store_mip v0, v[0:3], s[4:11] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x24,0xf0,0x00,0x00,0x01,0x00]
+image_store v1, v[252:255], s[12:19] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x20,0xf0,0xfc,0x01,0x03,0x00]
 
-image_store_mip v0, v[0:3], s[92:99] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x24,0xf0,0x00,0x00,0x17,0x00]
+image_store v1, v[2:5], s[16:23] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x20,0xf0,0x02,0x01,0x04,0x00]
 
-image_store_mip v0, v[0:3], s[0:7] dmask:0x2 unorm
-// CHECK: [0x00,0x12,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store v1, v[2:5], s[92:99] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x20,0xf0,0x02,0x01,0x17,0x00]
 
-image_store_mip v[0:1], v[0:3], s[0:7] dmask:0x3 unorm
-// CHECK: [0x00,0x13,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store v1, v[2:5], s[12:19] dmask:0x2 unorm
+// CHECK: [0x00,0x12,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v0, v[0:3], s[0:7] dmask:0x4 unorm
-// CHECK: [0x00,0x14,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:2], v[2:5], s[12:19] dmask:0x3 unorm
+// CHECK: [0x00,0x13,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:1], v[0:3], s[0:7] dmask:0x5 unorm
-// CHECK: [0x00,0x15,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store v1, v[2:5], s[12:19] dmask:0x4 unorm
+// CHECK: [0x00,0x14,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:1], v[0:3], s[0:7] dmask:0x6 unorm
-// CHECK: [0x00,0x16,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:2], v[2:5], s[12:19] dmask:0x5 unorm
+// CHECK: [0x00,0x15,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:2], v[0:3], s[0:7] dmask:0x7 unorm
-// CHECK: [0x00,0x17,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:2], v[2:5], s[12:19] dmask:0x6 unorm
+// CHECK: [0x00,0x16,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v0, v[0:3], s[0:7] dmask:0x8 unorm
-// CHECK: [0x00,0x18,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:3], v[2:5], s[12:19] dmask:0x7 unorm
+// CHECK: [0x00,0x17,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:1], v[0:3], s[0:7] dmask:0x9 unorm
-// CHECK: [0x00,0x19,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store v1, v[2:5], s[12:19] dmask:0x8 unorm
+// CHECK: [0x00,0x18,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:1], v[0:3], s[0:7] dmask:0xa unorm
-// CHECK: [0x00,0x1a,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:2], v[2:5], s[12:19] dmask:0x9 unorm
+// CHECK: [0x00,0x19,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:2], v[0:3], s[0:7] dmask:0xb unorm
-// CHECK: [0x00,0x1b,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:2], v[2:5], s[12:19] dmask:0xa unorm
+// CHECK: [0x00,0x1a,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:1], v[0:3], s[0:7] dmask:0xc unorm
-// CHECK: [0x00,0x1c,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:3], v[2:5], s[12:19] dmask:0xb unorm
+// CHECK: [0x00,0x1b,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:2], v[0:3], s[0:7] dmask:0xd unorm
-// CHECK: [0x00,0x1d,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:2], v[2:5], s[12:19] dmask:0xc unorm
+// CHECK: [0x00,0x1c,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:2], v[0:3], s[0:7] dmask:0xe unorm
-// CHECK: [0x00,0x1e,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:3], v[2:5], s[12:19] dmask:0xd unorm
+// CHECK: [0x00,0x1d,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v[0:3], v[0:3], s[0:7] dmask:0xf unorm
-// CHECK: [0x00,0x1f,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:3], v[2:5], s[12:19] dmask:0xe unorm
+// CHECK: [0x00,0x1e,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v0, v[0:3], s[0:7] dmask:0x0 unorm
-// CHECK: [0x00,0x10,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store v[1:4], v[2:5], s[12:19] dmask:0xf unorm
+// CHECK: [0x00,0x1f,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_store_mip v0, v[0:3], s[0:7] dmask:0x1 unorm glc
-// CHECK: [0x00,0x31,0x24,0xf0,0x00,0x00,0x00,0x00]
+image_store v1, v[2:5], s[12:19] dmask:0x0 unorm
+// CHECK: [0x00,0x10,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_get_resinfo v0, v[0:3], s[0:7] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_store v1, v[2:5], s[12:19] dmask:0x1 unorm glc
+// CHECK: [0x00,0x31,0x20,0xf0,0x02,0x01,0x03,0x00]
 
-image_get_resinfo v252, v[0:3], s[0:7] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf0,0x00,0xfc,0x00,0x00]
+image_store_mip v1, v[2:5], s[12:19] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_get_resinfo v0, v[252:255], s[0:7] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf0,0xfc,0x00,0x00,0x00]
+image_store_mip v252, v[2:5], s[12:19] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x24,0xf0,0x02,0xfc,0x03,0x00]
 
-image_get_resinfo v0, v[0:3], s[4:11] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf0,0x00,0x00,0x01,0x00]
+image_store_mip v1, v[252:255], s[12:19] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x24,0xf0,0xfc,0x01,0x03,0x00]
 
-image_get_resinfo v0, v[0:3], s[92:99] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf0,0x00,0x00,0x17,0x00]
+image_store_mip v1, v[2:5], s[16:23] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x24,0xf0,0x02,0x01,0x04,0x00]
 
-image_get_resinfo v0, v[0:3], s[0:7] dmask:0x2
-// CHECK: [0x00,0x02,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v1, v[2:5], s[92:99] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x24,0xf0,0x02,0x01,0x17,0x00]
 
-image_get_resinfo v[0:1], v[0:3], s[0:7] dmask:0x3
-// CHECK: [0x00,0x03,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v1, v[2:5], s[12:19] dmask:0x2 unorm
+// CHECK: [0x00,0x12,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_get_resinfo v0, v[0:3], s[0:7] dmask:0x4
-// CHECK: [0x00,0x04,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:2], v[2:5], s[12:19] dmask:0x3 unorm
+// CHECK: [0x00,0x13,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_get_resinfo v[0:1], v[0:3], s[0:7] dmask:0x5
-// CHECK: [0x00,0x05,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v1, v[2:5], s[12:19] dmask:0x4 unorm
+// CHECK: [0x00,0x14,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_get_resinfo v[0:1], v[0:3], s[0:7] dmask:0x6
-// CHECK: [0x00,0x06,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:2], v[2:5], s[12:19] dmask:0x5 unorm
+// CHECK: [0x00,0x15,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_get_resinfo v[0:2], v[0:3], s[0:7] dmask:0x7
-// CHECK: [0x00,0x07,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:2], v[2:5], s[12:19] dmask:0x6 unorm
+// CHECK: [0x00,0x16,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_get_resinfo v0, v[0:3], s[0:7] dmask:0x8
-// CHECK: [0x00,0x08,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:3], v[2:5], s[12:19] dmask:0x7 unorm
+// CHECK: [0x00,0x17,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_get_resinfo v[0:1], v[0:3], s[0:7] dmask:0x9
-// CHECK: [0x00,0x09,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v1, v[2:5], s[12:19] dmask:0x8 unorm
+// CHECK: [0x00,0x18,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_get_resinfo v[0:1], v[0:3], s[0:7] dmask:0xa
-// CHECK: [0x00,0x0a,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:2], v[2:5], s[12:19] dmask:0x9 unorm
+// CHECK: [0x00,0x19,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_get_resinfo v[0:2], v[0:3], s[0:7] dmask:0xb
-// CHECK: [0x00,0x0b,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:2], v[2:5], s[12:19] dmask:0xa unorm
+// CHECK: [0x00,0x1a,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_get_resinfo v[0:1], v[0:3], s[0:7] dmask:0xc
-// CHECK: [0x00,0x0c,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:3], v[2:5], s[12:19] dmask:0xb unorm
+// CHECK: [0x00,0x1b,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_get_resinfo v[0:2], v[0:3], s[0:7] dmask:0xd
-// CHECK: [0x00,0x0d,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:2], v[2:5], s[12:19] dmask:0xc unorm
+// CHECK: [0x00,0x1c,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_get_resinfo v[0:2], v[0:3], s[0:7] dmask:0xe
-// CHECK: [0x00,0x0e,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:3], v[2:5], s[12:19] dmask:0xd unorm
+// CHECK: [0x00,0x1d,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_get_resinfo v[0:3], v[0:3], s[0:7] dmask:0xf
-// CHECK: [0x00,0x0f,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:3], v[2:5], s[12:19] dmask:0xe unorm
+// CHECK: [0x00,0x1e,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_get_resinfo v0, v[0:3], s[0:7] dmask:0x0
-// CHECK: [0x00,0x00,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v[1:4], v[2:5], s[12:19] dmask:0xf unorm
+// CHECK: [0x00,0x1f,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_get_resinfo v0, v[0:3], s[0:7] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x38,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v1, v[2:5], s[12:19] dmask:0x0 unorm
+// CHECK: [0x00,0x10,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_sample v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_store_mip v1, v[2:5], s[12:19] dmask:0x1 unorm glc
+// CHECK: [0x00,0x31,0x24,0xf0,0x02,0x01,0x03,0x00]
 
-image_sample v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf0,0x00,0xfc,0x00,0x00]
+image_get_resinfo v5, v[1:4], s[8:15] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_sample v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf0,0xfc,0x00,0x00,0x00]
+image_get_resinfo v252, v[1:4], s[8:15] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf0,0x01,0xfc,0x02,0x00]
 
-image_sample v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf0,0x00,0x00,0x01,0x00]
+image_get_resinfo v5, v[252:255], s[8:15] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf0,0xfc,0x05,0x02,0x00]
 
-image_sample v0, v[0:3], s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf0,0x00,0x00,0x17,0x00]
+image_get_resinfo v5, v[1:4], s[12:19] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf0,0x01,0x05,0x03,0x00]
 
-image_sample v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf0,0x00,0x00,0x20,0x00]
+image_get_resinfo v5, v[1:4], s[92:99] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf0,0x01,0x05,0x17,0x00]
 
-image_sample v0, v[0:3], s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf0,0x00,0x00,0x00,0x03]
+image_get_resinfo v5, v[1:4], s[8:15] dmask:0x2
+// CHECK: [0x00,0x02,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_sample v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf0,0x00,0x00,0xc0,0x03]
+image_get_resinfo v[5:6], v[1:4], s[8:15] dmask:0x3
+// CHECK: [0x00,0x03,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_sample v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v5, v[1:4], s[8:15] dmask:0x4
+// CHECK: [0x00,0x04,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_sample v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:6], v[1:4], s[8:15] dmask:0x5
+// CHECK: [0x00,0x05,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_sample v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:6], v[1:4], s[8:15] dmask:0x6
+// CHECK: [0x00,0x06,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_sample v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:7], v[1:4], s[8:15] dmask:0x7
+// CHECK: [0x00,0x07,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_sample v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v5, v[1:4], s[8:15] dmask:0x8
+// CHECK: [0x00,0x08,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_sample v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:6], v[1:4], s[8:15] dmask:0x9
+// CHECK: [0x00,0x09,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_sample v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:6], v[1:4], s[8:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_sample v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:7], v[1:4], s[8:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_sample v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:6], v[1:4], s[8:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_sample v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:7], v[1:4], s[8:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_sample v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:7], v[1:4], s[8:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_sample v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v[5:8], v[1:4], s[8:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_sample v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v5, v[1:4], s[8:15] dmask:0x0
+// CHECK: [0x00,0x00,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_sample v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_get_resinfo v5, v[1:4], s[8:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x38,0xf0,0x01,0x05,0x02,0x00]
 
-image_sample v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_sample v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x80,0xf0,0x00,0x00,0x00,0x00]
+image_sample v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_cl v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x84,0xf0,0x00,0xfc,0x00,0x00]
+image_sample v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_cl v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x84,0xf0,0xfc,0x00,0x00,0x00]
+image_sample v5, v[1:4], s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf0,0x01,0x05,0x77,0x00]
 
-image_sample_cl v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x84,0xf0,0x00,0x00,0x01,0x00]
+image_sample v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_cl v0, v[0:3], s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x84,0xf0,0x00,0x00,0x17,0x00]
+image_sample v5, v[1:4], s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf0,0x01,0x05,0x02,0x03]
 
-image_sample_cl v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x84,0xf0,0x00,0x00,0x20,0x00]
+image_sample v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_cl v0, v[0:3], s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x84,0xf0,0x00,0x00,0x00,0x03]
+image_sample v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x84,0xf0,0x00,0x00,0xc0,0x03]
+image_sample v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x80,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x84,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x84,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_l v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x84,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_l v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x90,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_cl v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x84,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_l v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x90,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_cl v5, v[1:4], s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x84,0xf0,0x01,0x05,0x77,0x00]
 
-image_sample_l v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x90,0xf0,0x00,0x00,0x01,0x00]
+image_sample_cl v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x84,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_l v0, v[0:3], s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x90,0xf0,0x00,0x00,0x17,0x00]
+image_sample_cl v5, v[1:4], s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x84,0xf0,0x01,0x05,0x02,0x03]
 
-image_sample_l v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x90,0xf0,0x00,0x00,0x20,0x00]
+image_sample_cl v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x84,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_l v0, v[0:3], s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x90,0xf0,0x00,0x00,0x00,0x03]
+image_sample_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x90,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x84,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_l v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x90,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x90,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_b v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x90,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_b v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x94,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_l v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x90,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_b v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x94,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_l v5, v[1:4], s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x90,0xf0,0x01,0x05,0x77,0x00]
 
-image_sample_b v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x94,0xf0,0x00,0x00,0x01,0x00]
+image_sample_l v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x90,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_b v0, v[0:3], s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x94,0xf0,0x00,0x00,0x17,0x00]
+image_sample_l v5, v[1:4], s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x90,0xf0,0x01,0x05,0x02,0x03]
 
-image_sample_b v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x94,0xf0,0x00,0x00,0x20,0x00]
+image_sample_l v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x90,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_b v0, v[0:3], s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x94,0xf0,0x00,0x00,0x00,0x03]
+image_sample_l v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x94,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_l v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x90,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x94,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x94,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x94,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_b_cl v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x98,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_b v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x94,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_b_cl v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x98,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_b v5, v[1:4], s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x94,0xf0,0x01,0x05,0x77,0x00]
 
-image_sample_b_cl v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x98,0xf0,0x00,0x00,0x01,0x00]
+image_sample_b v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x94,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_b_cl v0, v[0:3], s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x98,0xf0,0x00,0x00,0x17,0x00]
+image_sample_b v5, v[1:4], s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x94,0xf0,0x01,0x05,0x02,0x03]
 
-image_sample_b_cl v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x98,0xf0,0x00,0x00,0x20,0x00]
+image_sample_b v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x94,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_b_cl v0, v[0:3], s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x98,0xf0,0x00,0x00,0x00,0x03]
+image_sample_b v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x98,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x94,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x98,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x98,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x98,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_lz v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x9c,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_b_cl v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x98,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_lz v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x9c,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_b_cl v5, v[1:4], s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x98,0xf0,0x01,0x05,0x77,0x00]
 
-image_sample_lz v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x9c,0xf0,0x00,0x00,0x01,0x00]
+image_sample_b_cl v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x98,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_lz v0, v[0:3], s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x9c,0xf0,0x00,0x00,0x17,0x00]
+image_sample_b_cl v5, v[1:4], s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x98,0xf0,0x01,0x05,0x02,0x03]
 
-image_sample_lz v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x9c,0xf0,0x00,0x00,0x20,0x00]
+image_sample_b_cl v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x98,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_lz v0, v[0:3], s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x9c,0xf0,0x00,0x00,0x00,0x03]
+image_sample_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x9c,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x98,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x9c,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x9c,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_c v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x9c,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_c v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa0,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_lz v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x9c,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_c v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa0,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_lz v5, v[1:4], s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x9c,0xf0,0x01,0x05,0x77,0x00]
 
-image_sample_c v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa0,0xf0,0x00,0x00,0x01,0x00]
+image_sample_lz v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x9c,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_c v0, v[0:3], s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa0,0xf0,0x00,0x00,0x17,0x00]
+image_sample_lz v5, v[1:4], s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x9c,0xf0,0x01,0x05,0x02,0x03]
 
-image_sample_c v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0xa0,0xf0,0x00,0x00,0x20,0x00]
+image_sample_lz v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x9c,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_c v0, v[0:3], s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0xa0,0xf0,0x00,0x00,0x00,0x03]
+image_sample_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0xa0,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x9c,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0xa0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa0,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_c_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa0,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_c_cl v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa4,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_c v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa0,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_c_cl v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa4,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_c v5, v[1:4], s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa0,0xf0,0x01,0x05,0x77,0x00]
 
-image_sample_c_cl v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa4,0xf0,0x00,0x00,0x01,0x00]
+image_sample_c v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0xa0,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_c_cl v0, v[0:3], s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xa4,0xf0,0x00,0x00,0x17,0x00]
+image_sample_c v5, v[1:4], s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0xa0,0xf0,0x01,0x05,0x02,0x03]
 
-image_sample_c_cl v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0xa4,0xf0,0x00,0x00,0x20,0x00]
+image_sample_c v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0xa0,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_c_cl v0, v[0:3], s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0xa4,0xf0,0x00,0x00,0x00,0x03]
+image_sample_c v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0xa4,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_c v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0xa0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0xa4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa4,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_c_l v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa4,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_c_l v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb0,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_c_cl v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa4,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_c_l v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb0,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_c_cl v5, v[1:4], s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xa4,0xf0,0x01,0x05,0x77,0x00]
 
-image_sample_c_l v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb0,0xf0,0x00,0x00,0x01,0x00]
+image_sample_c_cl v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0xa4,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_c_l v0, v[0:3], s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb0,0xf0,0x00,0x00,0x17,0x00]
+image_sample_c_cl v5, v[1:4], s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0xa4,0xf0,0x01,0x05,0x02,0x03]
 
-image_sample_c_l v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0xb0,0xf0,0x00,0x00,0x20,0x00]
+image_sample_c_cl v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0xa4,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_c_l v0, v[0:3], s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0xb0,0xf0,0x00,0x00,0x00,0x03]
+image_sample_c_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0xb0,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_c_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0xa4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_l v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0xb0,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb0,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_c_b v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb0,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_c_b v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb4,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_c_l v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb0,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_c_b v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb4,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_c_l v5, v[1:4], s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb0,0xf0,0x01,0x05,0x77,0x00]
 
-image_sample_c_b v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb4,0xf0,0x00,0x00,0x01,0x00]
+image_sample_c_l v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0xb0,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_c_b v0, v[0:3], s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb4,0xf0,0x00,0x00,0x17,0x00]
+image_sample_c_l v5, v[1:4], s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0xb0,0xf0,0x01,0x05,0x02,0x03]
 
-image_sample_c_b v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0xb4,0xf0,0x00,0x00,0x20,0x00]
+image_sample_c_l v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0xb0,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_c_b v0, v[0:3], s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0xb4,0xf0,0x00,0x00,0x00,0x03]
+image_sample_c_l v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0xb4,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_c_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_l v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0xb0,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0xb4,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb4,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_c_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb4,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_c_b_cl v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb8,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_c_b v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb4,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_c_b_cl v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb8,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_c_b v5, v[1:4], s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb4,0xf0,0x01,0x05,0x77,0x00]
 
-image_sample_c_b_cl v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb8,0xf0,0x00,0x00,0x01,0x00]
+image_sample_c_b v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0xb4,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_c_b_cl v0, v[0:3], s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xb8,0xf0,0x00,0x00,0x17,0x00]
+image_sample_c_b v5, v[1:4], s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0xb4,0xf0,0x01,0x05,0x02,0x03]
 
-image_sample_c_b_cl v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0xb8,0xf0,0x00,0x00,0x20,0x00]
+image_sample_c_b v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0xb4,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_c_b_cl v0, v[0:3], s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0xb8,0xf0,0x00,0x00,0x00,0x03]
+image_sample_c_b v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0xb8,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_c_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0xb4,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_b_cl v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0xb8,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb8,0xf0,0x01,0xfc,0x62,0x00]
 
-image_sample_c_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb8,0xf0,0xfc,0x05,0x62,0x00]
 
-image_sample_c_lz v252, v[0:3], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xbc,0xf0,0x00,0xfc,0x00,0x00]
+image_sample_c_b_cl v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb8,0xf0,0x01,0x05,0x63,0x00]
 
-image_sample_c_lz v0, v[252:255], s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xbc,0xf0,0xfc,0x00,0x00,0x00]
+image_sample_c_b_cl v5, v[1:4], s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xb8,0xf0,0x01,0x05,0x77,0x00]
 
-image_sample_c_lz v0, v[0:3], s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xbc,0xf0,0x00,0x00,0x01,0x00]
+image_sample_c_b_cl v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0xb8,0xf0,0x01,0x05,0x82,0x00]
 
-image_sample_c_lz v0, v[0:3], s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0xbc,0xf0,0x00,0x00,0x17,0x00]
+image_sample_c_b_cl v5, v[1:4], s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0xb8,0xf0,0x01,0x05,0x02,0x03]
 
-image_sample_c_lz v0, v[0:3], s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0xbc,0xf0,0x00,0x00,0x20,0x00]
+image_sample_c_b_cl v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0xb8,0xf0,0x01,0x05,0xc2,0x03]
 
-image_sample_c_lz v0, v[0:3], s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0xbc,0xf0,0x00,0x00,0x00,0x03]
+image_sample_c_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v0, v[0:3], s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0xbc,0xf0,0x00,0x00,0xc0,0x03]
+image_sample_c_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:2], v[0:3], s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:1], v[0:3], s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:2], v[0:3], s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v[0:3], v[0:3], s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_b_cl v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0xb8,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_sample_c_lz v0, v[0:3], s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0xbc,0xf0,0x00,0x00,0x00,0x00]
+image_sample_c_lz v252, v[1:4], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xbc,0xf0,0x01,0xfc,0x62,0x00]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_lz v5, v[252:255], s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xbc,0xf0,0xfc,0x05,0x62,0x00]
 
-image_gather4 v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf1,0x00,0xfc,0x00,0x00]
+image_sample_c_lz v5, v[1:4], s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xbc,0xf0,0x01,0x05,0x63,0x00]
 
-image_gather4 v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf1,0xff,0x00,0x00,0x00]
+image_sample_c_lz v5, v[1:4], s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0xbc,0xf0,0x01,0x05,0x77,0x00]
 
-image_gather4 v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf1,0x00,0x00,0x01,0x00]
+image_sample_c_lz v5, v[1:4], s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0xbc,0xf0,0x01,0x05,0x82,0x00]
 
-image_gather4 v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf1,0x00,0x00,0x17,0x00]
+image_sample_c_lz v5, v[1:4], s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0xbc,0xf0,0x01,0x05,0x02,0x03]
 
-image_gather4 v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf1,0x00,0x00,0x20,0x00]
+image_sample_c_lz v5, v[1:4], s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0xbc,0xf0,0x01,0x05,0xc2,0x03]
 
-image_gather4 v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf1,0x00,0x00,0x00,0x03]
+image_sample_c_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_gather4 v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x00,0xf1,0x00,0x00,0xc0,0x03]
+image_sample_c_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:7], v[1:4], s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:6], v[1:4], s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:7], v[1:4], s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_lz v[5:8], v[1:4], s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_gather4 v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_sample_c_lz v5, v[1:4], s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0xbc,0xf0,0x01,0x05,0x62,0x00]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x00,0xf3,0x00,0x00,0x00,0x00]
+image_gather4 v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4 v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x01,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x02,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4 v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x00,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_cl v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4 v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x00,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_cl v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf1,0xff,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf1,0x00,0x00,0x01,0x00]
+image_gather4 v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf1,0x00,0x00,0x17,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf1,0x00,0x00,0x20,0x00]
+image_gather4 v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf1,0x00,0x00,0x00,0x03]
+image_gather4 v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x04,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4 v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x00,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x01,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x02,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4 v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x00,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x04,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x05,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x06,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_cl v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x04,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_l v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x10,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x04,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_l v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x10,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x10,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x10,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x10,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x10,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x10,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_cl v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x04,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x05,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x06,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x04,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x10,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x10,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x10,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_l v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x11,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x10,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x12,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x10,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_l v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x10,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x10,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x10,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_b v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x14,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x10,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_b v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x14,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x14,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_l v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x14,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x14,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_l v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x14,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_l v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x14,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_l v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x10,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x11,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x12,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x10,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x14,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x14,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x14,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_b v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x15,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x14,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x16,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x14,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_b v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x14,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x14,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x14,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_b_cl v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x18,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x14,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_b_cl v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x18,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x18,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_b v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x18,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x18,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_b v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x18,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_b v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x18,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_b v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x14,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x15,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x16,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x14,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x18,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x18,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x18,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x19,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x18,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x1a,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x18,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x18,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x18,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x18,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_lz v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x1c,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x18,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_lz v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x1c,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x1c,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x1c,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x1c,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x1c,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x1c,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_b_cl v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x18,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x19,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x1a,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x18,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x1c,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x1c,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x1c,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x1d,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x1c,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x1e,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x1c,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_lz v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x1c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x1c,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x1c,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_c v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x20,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x1c,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x20,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x20,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x20,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x20,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x20,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x20,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_lz v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x1c,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x1d,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x1e,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x1c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x20,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x20,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x20,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x21,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x20,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x22,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x20,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_c v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x20,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x20,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x20,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_c_cl v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x24,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x20,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_cl v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x24,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x24,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x24,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x24,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x24,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_c v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x24,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x20,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x21,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x22,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x20,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x24,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x24,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x24,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x25,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x24,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x26,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x24,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_c_cl v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x24,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x24,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x24,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_c_l v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x30,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x24,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_l v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x30,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x30,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x30,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x30,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x30,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_c_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x30,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_cl v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x24,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x25,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x26,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x24,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x30,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x30,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x30,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_l v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x31,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x30,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x32,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x30,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_c_l v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x30,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x30,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x30,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_c_b v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x34,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x30,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_b v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x34,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x34,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_l v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x34,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x34,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_l v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x34,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_c_l v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x34,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_l v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x30,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x31,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x32,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x30,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x34,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x34,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x34,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_b v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x35,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x34,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x36,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x34,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_c_b v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x34,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x34,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x34,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_c_b_cl v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x34,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_b_cl v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_b v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_b v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_c_b v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x38,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_b v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x34,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x35,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x36,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x34,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x38,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_b_cl v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x39,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x3a,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_c_b_cl v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x38,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_c_lz v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x3c,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x38,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_lz v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x3c,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x3c,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x3c,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x3c,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x3c,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_c_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x3c,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_b_cl v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x38,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x39,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x3a,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x38,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x3c,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x3c,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x3c,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_lz v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x3d,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x3c,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x3e,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x3c,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_c_lz v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x3c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x3c,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x3c,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x40,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x3c,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x40,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x40,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x40,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x40,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x40,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_c_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x40,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_lz v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x3c,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x3d,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x3e,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x3c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x40,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x40,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x40,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x41,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x40,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x42,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x40,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x40,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x40,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x40,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_cl_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x44,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x40,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_cl_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x44,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x44,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x44,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x44,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x44,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x44,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x40,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x41,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x42,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x40,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x44,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x44,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x44,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x45,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x44,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x46,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x44,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x44,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x44,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x44,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_l_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x50,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x44,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_l_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x50,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x50,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x50,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x50,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x50,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x50,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x44,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x45,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x46,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x44,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x50,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x50,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x50,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x51,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x50,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x52,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x50,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x50,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x50,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x50,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_b_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x54,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x50,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_b_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x54,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x54,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x54,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x54,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x54,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x54,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_l_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x50,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x51,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x52,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x50,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x54,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x54,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x54,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x55,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x54,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x56,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x54,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x54,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x54,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x54,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_b_cl_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x58,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x54,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_b_cl_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x58,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x58,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x58,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x58,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x58,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x58,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_b_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x54,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x55,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x56,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x54,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x58,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x58,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x58,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x59,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x58,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x5a,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x58,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x58,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x58,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x58,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_lz_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x5c,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x58,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_lz_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x5c,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x5c,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x5c,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x5c,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x5c,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x5c,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_b_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x58,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x59,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x5a,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x58,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x5c,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x5c,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x5c,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x5d,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x5c,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x5e,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x5c,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x5c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x5c,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x5c,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_c_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x60,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x5c,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x60,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x60,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x60,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x60,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x60,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x60,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_lz_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x5c,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x5d,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x5e,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x5c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x60,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x60,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x60,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x61,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x60,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x62,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x60,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_c_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x60,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x60,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x60,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_c_cl_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x64,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x60,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_cl_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x64,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x64,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x64,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x64,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x64,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_c_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x64,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x60,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x61,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x62,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x60,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x64,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x64,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x64,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x65,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x64,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x66,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x64,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_c_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x64,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x64,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x64,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_c_l_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x70,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x64,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_l_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x70,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x70,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x70,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x70,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x70,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_c_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x70,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x64,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x65,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x66,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x64,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x70,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x70,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x70,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_l_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x71,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x70,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x72,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x70,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_c_l_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x70,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x70,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x70,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_c_b_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x74,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x70,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_b_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x74,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x74,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x74,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x74,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x74,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_c_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x74,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_l_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x70,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x71,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x72,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_l_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x70,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x74,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x74,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x74,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_b_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x75,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x74,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x76,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x74,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_c_b_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x74,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x74,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x74,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_c_b_cl_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x78,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x74,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_b_cl_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x78,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x78,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x78,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x78,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x78,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_c_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x78,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_b_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x74,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x75,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x76,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x74,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x78,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x78,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x78,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_b_cl_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x79,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x78,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x7a,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x78,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_c_b_cl_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x78,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x78,0xf1,0x01,0x05,0x82,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x78,0xf1,0x01,0x05,0x02,0x03]
 
-image_gather4_c_lz_o v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x7c,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x78,0xf1,0x01,0x05,0xc2,0x03]
 
-image_gather4_c_lz_o v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x7c,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x7c,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x7c,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x7c,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x7c,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_c_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x7c,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_b_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x78,0xf3,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x79,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x7a,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_b_cl_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x78,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 glc
-// CHECK: [0x00,0x21,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x7c,0xf1,0x01,0xfc,0x62,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 slc
-// CHECK: [0x00,0x01,0x7c,0xf3,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x7c,0xf1,0xff,0x05,0x62,0x00]
 
-image_gather4_c_lz_o v[0:1], v0, s[0:7], s[0:3] dmask:0x1 tfe
-// CHECK: [0x00,0x01,0x7d,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x7c,0xf1,0x01,0x05,0x63,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 lwe
-// CHECK: [0x00,0x01,0x7e,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x7c,0xf1,0x01,0x05,0x77,0x00]
 
-image_gather4_c_lz_o v0, v0, s[0:7], s[0:3] dmask:0x1 da
-// CHECK: [0x00,0x41,0x7c,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x7c,0xf1,0x01,0x05,0x82,0x00]
 
-image_get_lod v0, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x7c,0xf1,0x01,0x05,0x02,0x03]
 
-image_get_lod v252, v0, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf1,0x00,0xfc,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x7c,0xf1,0x01,0x05,0xc2,0x03]
 
-image_get_lod v0, v255, s[0:7], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf1,0xff,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v0, v0, s[4:11], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf1,0x00,0x00,0x01,0x00]
+image_gather4_c_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v0, v0, s[92:99], s[0:3] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf1,0x00,0x00,0x17,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v0, v0, s[0:7], s[4:7] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf1,0x00,0x00,0x20,0x00]
+image_gather4_c_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v0, v0, s[0:7], s[96:99] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf1,0x00,0x00,0x00,0x03]
+image_gather4_c_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v0, v0, s[0:7], ttmp[8:11] dmask:0x1
-// CHECK: [0x00,0x01,0x80,0xf1,0x00,0x00,0xc0,0x03]
+image_gather4_c_lz_o v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v0, v0, s[0:7], s[0:3] dmask:0x2
-// CHECK: [0x00,0x02,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:1], v0, s[0:7], s[0:3] dmask:0x3
-// CHECK: [0x00,0x03,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v0, v0, s[0:7], s[0:3] dmask:0x4
-// CHECK: [0x00,0x04,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:1], v0, s[0:7], s[0:3] dmask:0x5
-// CHECK: [0x00,0x05,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:1], v0, s[0:7], s[0:3] dmask:0x6
-// CHECK: [0x00,0x06,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:2], v0, s[0:7], s[0:3] dmask:0x7
-// CHECK: [0x00,0x07,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v0, v0, s[0:7], s[0:3] dmask:0x8
-// CHECK: [0x00,0x08,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:1], v0, s[0:7], s[0:3] dmask:0x9
-// CHECK: [0x00,0x09,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:1], v0, s[0:7], s[0:3] dmask:0xa
-// CHECK: [0x00,0x0a,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:2], v0, s[0:7], s[0:3] dmask:0xb
-// CHECK: [0x00,0x0b,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 glc
+// CHECK: [0x00,0x21,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:1], v0, s[0:7], s[0:3] dmask:0xc
-// CHECK: [0x00,0x0c,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 slc
+// CHECK: [0x00,0x01,0x7c,0xf3,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:2], v0, s[0:7], s[0:3] dmask:0xd
-// CHECK: [0x00,0x0d,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v[5:6], v1, s[8:15], s[12:15] dmask:0x1 tfe
+// CHECK: [0x00,0x01,0x7d,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:2], v0, s[0:7], s[0:3] dmask:0xe
-// CHECK: [0x00,0x0e,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 lwe
+// CHECK: [0x00,0x01,0x7e,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v[0:3], v0, s[0:7], s[0:3] dmask:0xf
-// CHECK: [0x00,0x0f,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_gather4_c_lz_o v5, v1, s[8:15], s[12:15] dmask:0x1 da
+// CHECK: [0x00,0x41,0x7c,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v0, v0, s[0:7], s[0:3] dmask:0x0
-// CHECK: [0x00,0x00,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_get_lod v5, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-image_get_lod v0, v0, s[0:7], s[0:3] dmask:0x1 unorm
-// CHECK: [0x00,0x11,0x80,0xf1,0x00,0x00,0x00,0x00]
+image_get_lod v252, v1, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf1,0x01,0xfc,0x62,0x00]
 
-buffer_load_format_x v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x00,0x00]
+image_get_lod v5, v255, s[8:15], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf1,0xff,0x05,0x62,0x00]
 
-buffer_load_format_x v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0xff,0x00,0x00]
+image_get_lod v5, v1, s[12:19], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf1,0x01,0x05,0x63,0x00]
 
-buffer_load_format_x v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x01,0x00]
+image_get_lod v5, v1, s[92:99], s[12:15] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf1,0x01,0x05,0x77,0x00]
 
-buffer_load_format_x v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x18,0x00]
+image_get_lod v5, v1, s[8:15], s[16:19] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf1,0x01,0x05,0x82,0x00]
 
-buffer_load_format_x v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x1e,0x00]
+image_get_lod v5, v1, s[8:15], s[96:99] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf1,0x01,0x05,0x02,0x03]
 
-buffer_load_format_x v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x00,0x65]
+image_get_lod v5, v1, s[8:15], ttmp[8:11] dmask:0x1
+// CHECK: [0x00,0x01,0x80,0xf1,0x01,0x05,0xc2,0x03]
 
-buffer_load_format_x v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x00,0x7c]
+image_get_lod v5, v1, s[8:15], s[12:15] dmask:0x2
+// CHECK: [0x00,0x02,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-buffer_load_format_x v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x00,0x80]
+image_get_lod v[5:6], v1, s[8:15], s[12:15] dmask:0x3
+// CHECK: [0x00,0x03,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-buffer_load_format_x v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x00,0xc1]
+image_get_lod v5, v1, s[8:15], s[12:15] dmask:0x4
+// CHECK: [0x00,0x04,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-buffer_load_format_x v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x00,0xf0]
+image_get_lod v[5:6], v1, s[8:15], s[12:15] dmask:0x5
+// CHECK: [0x00,0x05,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-buffer_load_format_x v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x00,0x00,0xf7]
+image_get_lod v[5:6], v1, s[8:15], s[12:15] dmask:0x6
+// CHECK: [0x00,0x06,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-buffer_load_format_x v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x00,0xe0,0x00,0x00,0x00,0x00]
+image_get_lod v[5:7], v1, s[8:15], s[12:15] dmask:0x7
+// CHECK: [0x00,0x07,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-buffer_load_format_x v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x00,0xe0,0x00,0x00,0x00,0x00]
+image_get_lod v5, v1, s[8:15], s[12:15] dmask:0x8
+// CHECK: [0x00,0x08,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-buffer_load_format_x v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x00,0xe0,0x00,0x00,0x00,0x00]
+image_get_lod v[5:6], v1, s[8:15], s[12:15] dmask:0x9
+// CHECK: [0x00,0x09,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-buffer_load_format_x v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x00,0xe0,0x00,0x00,0x00,0x00]
+image_get_lod v[5:6], v1, s[8:15], s[12:15] dmask:0xa
+// CHECK: [0x00,0x0a,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-buffer_load_format_x v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x00,0xe0,0x00,0x00,0x00,0x00]
+image_get_lod v[5:7], v1, s[8:15], s[12:15] dmask:0xb
+// CHECK: [0x00,0x0b,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-buffer_load_format_x v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x00,0xe0,0x00,0x00,0x00,0x00]
+image_get_lod v[5:6], v1, s[8:15], s[12:15] dmask:0xc
+// CHECK: [0x00,0x0c,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-buffer_load_format_x v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x02,0xe0,0x00,0x00,0x00,0x00]
+image_get_lod v[5:7], v1, s[8:15], s[12:15] dmask:0xd
+// CHECK: [0x00,0x0d,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-buffer_load_format_xy v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x00,0x00]
+image_get_lod v[5:7], v1, s[8:15], s[12:15] dmask:0xe
+// CHECK: [0x00,0x0e,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-buffer_load_format_xy v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0xfe,0x00,0x00]
+image_get_lod v[5:8], v1, s[8:15], s[12:15] dmask:0xf
+// CHECK: [0x00,0x0f,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-buffer_load_format_xy v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x01,0x00]
+image_get_lod v5, v1, s[8:15], s[12:15] dmask:0x0
+// CHECK: [0x00,0x00,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-buffer_load_format_xy v[0:1], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x18,0x00]
+image_get_lod v5, v1, s[8:15], s[12:15] dmask:0x1 unorm
+// CHECK: [0x00,0x11,0x80,0xf1,0x01,0x05,0x62,0x00]
 
-buffer_load_format_xy v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_format_x v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xy v[0:1], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x00,0x65]
+buffer_load_format_x v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0xff,0x02,0x03]
 
-buffer_load_format_xy v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_format_x v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_format_xy v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_format_x v5, off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x18,0x03]
 
-buffer_load_format_xy v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_format_x v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_format_xy v[0:1], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_format_x v5, off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x02,0x65]
 
-buffer_load_format_xy v[0:1], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_format_x v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_format_xy v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x04,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_x v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_format_xy v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x04,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_x v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_format_xy v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x04,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_x v5, off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_format_xy v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x04,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_x v5, off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_format_xy v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x04,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_x v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x00,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xy v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x04,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_x v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x00,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xy v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x06,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_x v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x00,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_x v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x00,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyz v[253:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0xfd,0x00,0x00]
+buffer_load_format_x v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x00,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyz v[0:2], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_format_x v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x00,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyz v[0:2], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x18,0x00]
+buffer_load_format_x v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x02,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyz v[0:2], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_format_xy v[5:6], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x00,0x65]
+buffer_load_format_xy v[254:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0xfe,0x02,0x03]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_format_xy v[5:6], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_format_xy v[5:6], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x18,0x03]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_format_xy v[5:6], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_format_xy v[5:6], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x02,0x65]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_format_xy v[5:6], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_format_xyz v[0:2], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x08,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xy v[5:6], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_format_xyz v[0:2], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x08,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xy v[5:6], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x08,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xy v[5:6], off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x08,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xy v[5:6], off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x08,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xy v[5:6], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x04,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x08,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xy v[5:6], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x04,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyz v[0:2], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x0a,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xy v[5:6], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x04,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xy v[5:6], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x04,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyzw v[252:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0xfc,0x00,0x00]
+buffer_load_format_xy v[5:6], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x04,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyzw v[0:3], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_format_xy v[5:6], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x04,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyzw v[0:3], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x18,0x00]
+buffer_load_format_xy v[5:6], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x06,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyzw v[0:3], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_format_xyz v[5:7], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x00,0x65]
+buffer_load_format_xyz v[253:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0xfd,0x02,0x03]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_format_xyz v[5:7], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_format_xyz v[5:7], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x18,0x03]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_format_xyz v[5:7], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_format_xyz v[5:7], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x02,0x65]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_format_xyz v[5:7], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_format_xyzw v[0:3], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x0c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyz v[5:7], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_format_xyzw v[0:3], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x0c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyz v[5:7], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x0c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyz v[5:7], off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x0c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyz v[5:7], off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x0c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyz v[5:7], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x08,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x0c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyz v[5:7], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x08,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_format_xyzw v[0:3], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x0e,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyz v[5:7], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x08,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_format_x v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyz v[5:7], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x08,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_format_x v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0xff,0x00,0x00]
+buffer_load_format_xyz v[5:7], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x08,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_format_x v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_format_xyz v[5:7], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x08,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_format_x v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x18,0x00]
+buffer_load_format_xyz v[5:7], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x0a,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_format_x v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_format_xyzw v[5:8], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_format_x v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x00,0x65]
+buffer_load_format_xyzw v[252:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0xfc,0x02,0x03]
 
-buffer_store_format_x v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_format_xyzw v[5:8], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_store_format_x v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_format_xyzw v[5:8], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x18,0x03]
 
-buffer_store_format_x v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_format_xyzw v[5:8], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_store_format_x v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_format_xyzw v[5:8], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x02,0x65]
 
-buffer_store_format_x v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_format_xyzw v[5:8], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_store_format_x v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x10,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyzw v[5:8], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_store_format_x v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x10,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyzw v[5:8], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_store_format_x v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x10,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyzw v[5:8], off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_store_format_x v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x10,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyzw v[5:8], off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_store_format_x v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x10,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyzw v[5:8], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x0c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_format_x v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x10,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyzw v[5:8], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x0c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_format_x v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x12,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyzw v[5:8], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x0c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_format_xy v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_format_xyzw v[5:8], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x0c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_format_xy v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0xfe,0x00,0x00]
+buffer_load_format_xyzw v[5:8], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x0c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_format_xy v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_format_xyzw v[5:8], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x0c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_format_xy v[0:1], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x18,0x00]
+buffer_load_format_xyzw v[5:8], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x0e,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_format_xy v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x1e,0x00]
+buffer_store_format_x v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xy v[0:1], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x00,0x65]
+buffer_store_format_x v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_store_format_xy v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x00,0x7c]
+buffer_store_format_x v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_store_format_xy v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x00,0x80]
+buffer_store_format_x v1, off, s[96:99], s4 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x18,0x04]
 
-buffer_store_format_xy v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x00,0xc1]
+buffer_store_format_x v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_store_format_xy v[0:1], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x00,0xf0]
+buffer_store_format_x v1, off, s[12:15], s101 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0x65]
 
-buffer_store_format_xy v[0:1], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x00,0x00,0xf7]
+buffer_store_format_x v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_store_format_xy v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x14,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_x v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_store_format_xy v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x14,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_x v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_store_format_xy v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x14,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_x v1, off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0xf0]
 
-buffer_store_format_xy v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x14,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_x v1, off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe0,0x00,0x01,0x03,0xf7]
 
-buffer_store_format_xy v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x14,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_x v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x10,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xy v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x14,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_x v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x10,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xy v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x16,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_x v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0x10,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_x v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x10,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyz v[253:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0xfd,0x00,0x00]
+buffer_store_format_x v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x10,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyz v[0:2], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x01,0x00]
+buffer_store_format_x v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x10,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyz v[0:2], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x18,0x00]
+buffer_store_format_x v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x12,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyz v[0:2], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x1e,0x00]
+buffer_store_format_xy v[1:2], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x00,0x65]
+buffer_store_format_xy v[254:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0xfe,0x03,0x04]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x00,0x7c]
+buffer_store_format_xy v[1:2], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x00,0x80]
+buffer_store_format_xy v[1:2], off, s[96:99], s4 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x18,0x04]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x00,0xc1]
+buffer_store_format_xy v[1:2], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x00,0xf0]
+buffer_store_format_xy v[1:2], off, s[12:15], s101 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x03,0x65]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x00,0x00,0xf7]
+buffer_store_format_xy v[1:2], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_store_format_xyz v[0:2], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x18,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xy v[1:2], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_store_format_xyz v[0:2], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x18,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xy v[1:2], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x18,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xy v[1:2], off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x03,0xf0]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x18,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xy v[1:2], off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe0,0x00,0x01,0x03,0xf7]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x18,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xy v[1:2], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x14,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x18,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xy v[1:2], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x14,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyz v[0:2], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x1a,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xy v[1:2], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x14,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xy v[1:2], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x14,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyzw v[252:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0xfc,0x00,0x00]
+buffer_store_format_xy v[1:2], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x14,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyzw v[0:3], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x01,0x00]
+buffer_store_format_xy v[1:2], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x14,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyzw v[0:3], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x18,0x00]
+buffer_store_format_xy v[1:2], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x16,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyzw v[0:3], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x1e,0x00]
+buffer_store_format_xyz v[1:3], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x00,0x65]
+buffer_store_format_xyz v[253:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0xfd,0x03,0x04]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x00,0x7c]
+buffer_store_format_xyz v[1:3], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x00,0x80]
+buffer_store_format_xyz v[1:3], off, s[96:99], s4 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x18,0x04]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x00,0xc1]
+buffer_store_format_xyz v[1:3], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x00,0xf0]
+buffer_store_format_xyz v[1:3], off, s[12:15], s101 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x03,0x65]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x00,0x00,0xf7]
+buffer_store_format_xyz v[1:3], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_store_format_xyzw v[0:3], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x1c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyz v[1:3], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_store_format_xyzw v[0:3], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x1c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyz v[1:3], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x1c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyz v[1:3], off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x03,0xf0]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x1c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyz v[1:3], off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe0,0x00,0x01,0x03,0xf7]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x1c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyz v[1:3], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x18,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x1c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyz v[1:3], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x18,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_format_xyzw v[0:3], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x1e,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyz v[1:3], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x18,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_load_ubyte v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyz v[1:3], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x18,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_load_ubyte v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0xff,0x00,0x00]
+buffer_store_format_xyz v[1:3], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x18,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_load_ubyte v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x00,0x01,0x00]
+buffer_store_format_xyz v[1:3], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x18,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_load_ubyte v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x00,0x18,0x00]
+buffer_store_format_xyz v[1:3], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x1a,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_load_ubyte v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x00,0x1e,0x00]
+buffer_store_format_xyzw v[1:4], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_load_ubyte v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x00,0x00,0x65]
+buffer_store_format_xyzw v[252:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0xfc,0x03,0x04]
 
-buffer_load_ubyte v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x00,0x00,0x7c]
+buffer_store_format_xyzw v[1:4], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_load_ubyte v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x00,0x00,0x80]
+buffer_store_format_xyzw v[1:4], off, s[96:99], s4 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x18,0x04]
 
-buffer_load_ubyte v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x00,0x00,0xc1]
+buffer_store_format_xyzw v[1:4], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_load_ubyte v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x00,0x00,0xf0]
+buffer_store_format_xyzw v[1:4], off, s[12:15], s101 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x03,0x65]
 
-buffer_load_ubyte v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x00,0x00,0xf7]
+buffer_store_format_xyzw v[1:4], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_load_ubyte v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x40,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyzw v[1:4], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_load_ubyte v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x40,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyzw v[1:4], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_load_ubyte v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x40,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyzw v[1:4], off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x03,0xf0]
 
-buffer_load_ubyte v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x40,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyzw v[1:4], off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe0,0x00,0x01,0x03,0xf7]
 
-buffer_load_ubyte v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x40,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyzw v[1:4], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x1c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_load_ubyte v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x40,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyzw v[1:4], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x1c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_load_ubyte v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x42,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyzw v[1:4], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x1c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_load_sbyte v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_format_xyzw v[1:4], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x1c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_load_sbyte v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0xff,0x00,0x00]
+buffer_store_format_xyzw v[1:4], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x1c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_load_sbyte v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x00,0x01,0x00]
+buffer_store_format_xyzw v[1:4], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x1c,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_load_sbyte v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x00,0x18,0x00]
+buffer_store_format_xyzw v[1:4], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x1e,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_load_sbyte v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_ubyte v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sbyte v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x00,0x00,0x65]
+buffer_load_ubyte v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0xff,0x02,0x03]
 
-buffer_load_sbyte v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_ubyte v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_sbyte v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_ubyte v5, off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x18,0x03]
 
-buffer_load_sbyte v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_ubyte v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_sbyte v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_ubyte v5, off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0x65]
 
-buffer_load_sbyte v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_ubyte v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_sbyte v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x44,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ubyte v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_sbyte v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x44,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ubyte v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_sbyte v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x44,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ubyte v5, off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_sbyte v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x44,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ubyte v5, off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x40,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_sbyte v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x44,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ubyte v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x40,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sbyte v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x44,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ubyte v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x40,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sbyte v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x46,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ubyte v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x40,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ushort v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ubyte v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x40,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ushort v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0xff,0x00,0x00]
+buffer_load_ubyte v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x40,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ushort v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_ubyte v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x40,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ushort v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x00,0x18,0x00]
+buffer_load_ubyte v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x42,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ushort v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_sbyte v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ushort v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x00,0x00,0x65]
+buffer_load_sbyte v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0xff,0x02,0x03]
 
-buffer_load_ushort v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_sbyte v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_ushort v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_sbyte v5, off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x18,0x03]
 
-buffer_load_ushort v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_sbyte v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_ushort v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_sbyte v5, off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0x65]
 
-buffer_load_ushort v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_sbyte v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_ushort v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x48,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sbyte v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_ushort v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x48,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sbyte v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_ushort v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x48,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sbyte v5, off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_ushort v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x48,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sbyte v5, off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x44,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_ushort v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x48,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sbyte v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x44,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ushort v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x48,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sbyte v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x44,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_ushort v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x4a,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sbyte v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x44,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sshort v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sbyte v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x44,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sshort v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0xff,0x00,0x00]
+buffer_load_sbyte v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x44,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sshort v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_sbyte v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x44,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sshort v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x00,0x18,0x00]
+buffer_load_sbyte v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x46,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sshort v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_ushort v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sshort v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x00,0x00,0x65]
+buffer_load_ushort v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0xff,0x02,0x03]
 
-buffer_load_sshort v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_ushort v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_sshort v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_ushort v5, off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x18,0x03]
 
-buffer_load_sshort v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_ushort v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_sshort v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_ushort v5, off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0x65]
 
-buffer_load_sshort v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_ushort v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_sshort v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x4c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ushort v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_sshort v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x4c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ushort v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_sshort v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x4c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ushort v5, off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_sshort v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x4c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ushort v5, off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x48,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_sshort v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x4c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ushort v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x48,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sshort v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x4c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ushort v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x48,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_sshort v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x4e,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ushort v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x48,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dword v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_ushort v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x48,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dword v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0xff,0x00,0x00]
+buffer_load_ushort v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x48,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dword v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_ushort v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x48,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dword v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x00,0x18,0x00]
+buffer_load_ushort v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x4a,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dword v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_sshort v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dword v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x00,0x00,0x65]
+buffer_load_sshort v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0xff,0x02,0x03]
 
-buffer_load_dword v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_sshort v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_dword v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_sshort v5, off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x18,0x03]
 
-buffer_load_dword v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_sshort v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_dword v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_sshort v5, off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0x65]
 
-buffer_load_dword v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_sshort v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_dword v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x50,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sshort v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_dword v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x50,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sshort v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_dword v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x50,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sshort v5, off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_dword v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x50,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sshort v5, off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x4c,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_dword v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x50,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sshort v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x4c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dword v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x50,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sshort v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x4c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dword v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x52,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sshort v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x4c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_sshort v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x4c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0xfe,0x00,0x00]
+buffer_load_sshort v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x4c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_sshort v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x4c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx2 v[0:1], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x00,0x18,0x00]
+buffer_load_sshort v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x4e,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_dword v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x00,0x00,0x65]
+buffer_load_dword v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0xff,0x02,0x03]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_dword v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_dword v5, off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x18,0x03]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_dword v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_dword v5, off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0x65]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_dword v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_dwordx2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x54,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dword v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_dwordx2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x54,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dword v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x54,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dword v5, off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x54,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dword v5, off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x50,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x54,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dword v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x50,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x54,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dword v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x50,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x56,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dword v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x50,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dword v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x50,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx3 v[253:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0xfd,0x00,0x00]
+buffer_load_dword v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x50,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx3 v[0:2], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_dword v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x50,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx3 v[0:2], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x00,0x18,0x00]
+buffer_load_dword v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x52,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx3 v[0:2], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x00,0x00,0x65]
+buffer_load_dwordx2 v[254:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0xfe,0x02,0x03]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_dwordx2 v[5:6], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_dwordx2 v[5:6], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x18,0x03]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_dwordx2 v[5:6], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_dwordx2 v[5:6], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0x65]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_dwordx2 v[5:6], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_dwordx3 v[0:2], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x58,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx2 v[5:6], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_dwordx3 v[0:2], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x58,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx2 v[5:6], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x58,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx2 v[5:6], off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x58,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx2 v[5:6], off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x54,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x58,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx2 v[5:6], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x54,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x58,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx2 v[5:6], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x54,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx3 v[0:2], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x5a,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx2 v[5:6], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x54,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x54,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx4 v[252:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0xfc,0x00,0x00]
+buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x54,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx4 v[0:3], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x54,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx4 v[0:3], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x00,0x18,0x00]
+buffer_load_dwordx2 v[5:6], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x56,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx4 v[0:3], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x00,0x00,0x65]
+buffer_load_dwordx3 v[253:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0xfd,0x02,0x03]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_dwordx3 v[5:7], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_dwordx3 v[5:7], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x18,0x03]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_dwordx3 v[5:7], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_dwordx3 v[5:7], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0x65]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_dwordx3 v[5:7], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_load_dwordx4 v[0:3], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x5c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx3 v[5:7], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_load_dwordx4 v[0:3], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x5c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx3 v[5:7], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x5c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx3 v[5:7], off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x5c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx3 v[5:7], off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x58,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x5c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx3 v[5:7], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x58,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x5c,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx3 v[5:7], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x58,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_load_dwordx4 v[0:3], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x5e,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx3 v[5:7], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x58,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_byte v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x58,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_byte v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0xff,0x00,0x00]
+buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x58,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_byte v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x58,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_byte v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x18,0x00]
+buffer_load_dwordx3 v[5:7], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x5a,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_byte v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x1e,0x00]
+buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_byte v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x00,0x65]
+buffer_load_dwordx4 v[252:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0xfc,0x02,0x03]
 
-buffer_store_byte v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x00,0x7c]
+buffer_load_dwordx4 v[5:8], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x03,0x03]
 
-buffer_store_byte v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x00,0x80]
+buffer_load_dwordx4 v[5:8], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x18,0x03]
 
-buffer_store_byte v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x00,0xc1]
+buffer_load_dwordx4 v[5:8], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x1e,0x03]
 
-buffer_store_byte v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x00,0xf0]
+buffer_load_dwordx4 v[5:8], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0x65]
 
-buffer_store_byte v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x00,0x00,0xf7]
+buffer_load_dwordx4 v[5:8], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0x7c]
 
-buffer_store_byte v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x60,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx4 v[5:8], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0x80]
 
-buffer_store_byte v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x60,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx4 v[5:8], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0xc1]
 
-buffer_store_byte v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x60,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx4 v[5:8], off, s[8:11], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0xf0]
 
-buffer_store_byte v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x60,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx4 v[5:8], off, s[8:11], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x5c,0xe0,0x00,0x05,0x02,0xf7]
 
-buffer_store_byte v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x60,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx4 v[5:8], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x5c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_byte v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x60,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx4 v[5:8], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x5c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_byte v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x62,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx4 v[5:8], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x5c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_short v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x00,0x00]
+buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x5c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_short v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0xff,0x00,0x00]
+buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x5c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_short v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x01,0x00]
+buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x5c,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_short v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x18,0x00]
+buffer_load_dwordx4 v[5:8], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x5e,0xe0,0x00,0x05,0x02,0x03]
 
-buffer_store_short v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x1e,0x00]
+buffer_store_byte v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_short v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x00,0x65]
+buffer_store_byte v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_store_short v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x00,0x7c]
+buffer_store_byte v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_store_short v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x00,0x80]
+buffer_store_byte v1, off, s[96:99], s4 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x18,0x04]
 
-buffer_store_short v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x00,0xc1]
+buffer_store_byte v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_store_short v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x00,0xf0]
+buffer_store_byte v1, off, s[12:15], s101 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x65]
 
-buffer_store_short v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x00,0x00,0xf7]
+buffer_store_byte v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_store_short v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x68,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_byte v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_store_short v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x68,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_byte v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_store_short v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_byte v1, off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0xf0]
 
-buffer_store_short v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x68,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_byte v1, off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x60,0xe0,0x00,0x01,0x03,0xf7]
 
-buffer_store_short v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x68,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_byte v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x60,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_short v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x68,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_byte v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x60,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_short v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x6a,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_byte v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0x60,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dword v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_byte v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x60,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dword v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0xff,0x00,0x00]
+buffer_store_byte v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x60,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dword v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x01,0x00]
+buffer_store_byte v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x60,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dword v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x18,0x00]
+buffer_store_byte v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x62,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dword v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x1e,0x00]
+buffer_store_short v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dword v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x00,0x65]
+buffer_store_short v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_store_dword v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x00,0x7c]
+buffer_store_short v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_store_dword v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x00,0x80]
+buffer_store_short v1, off, s[96:99], s4 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x18,0x04]
 
-buffer_store_dword v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x00,0xc1]
+buffer_store_short v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_store_dword v0, off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x00,0xf0]
+buffer_store_short v1, off, s[12:15], s101 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x65]
 
-buffer_store_dword v0, off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x00,0x00,0xf7]
+buffer_store_short v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_store_dword v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x70,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_short v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_store_dword v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x70,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_short v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_store_dword v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_short v1, off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0xf0]
 
-buffer_store_dword v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x70,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_short v1, off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x68,0xe0,0x00,0x01,0x03,0xf7]
 
-buffer_store_dword v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x70,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_short v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x68,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dword v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x70,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_short v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x68,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dword v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x72,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_short v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0x68,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_short v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x68,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0xfe,0x00,0x00]
+buffer_store_short v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x68,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x01,0x00]
+buffer_store_short v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x68,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx2 v[0:1], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x18,0x00]
+buffer_store_short v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x6a,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x1e,0x00]
+buffer_store_dword v1, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x00,0x65]
+buffer_store_dword v255, off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0xff,0x03,0x04]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x00,0x7c]
+buffer_store_dword v1, off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x00,0x80]
+buffer_store_dword v1, off, s[96:99], s4 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x18,0x04]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x00,0xc1]
+buffer_store_dword v1, off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x00,0xf0]
+buffer_store_dword v1, off, s[12:15], s101 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x65]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x00,0x00,0xf7]
+buffer_store_dword v1, off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_store_dwordx2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x74,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dword v1, off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_store_dwordx2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x74,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dword v1, off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x74,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dword v1, off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0xf0]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x74,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dword v1, off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x70,0xe0,0x00,0x01,0x03,0xf7]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x74,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dword v1, v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x70,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x74,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dword v1, v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x70,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x76,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dword v1, off, s[12:15], s4
+// CHECK: [0x00,0x00,0x70,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dword v1, off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x70,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx3 v[253:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0xfd,0x00,0x00]
+buffer_store_dword v1, off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x70,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx3 v[0:2], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x01,0x00]
+buffer_store_dword v1, off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x70,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx3 v[0:2], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x18,0x00]
+buffer_store_dword v1, off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x72,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx3 v[0:2], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x1e,0x00]
+buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x00,0x65]
+buffer_store_dwordx2 v[254:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0xfe,0x03,0x04]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x00,0x7c]
+buffer_store_dwordx2 v[1:2], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x00,0x80]
+buffer_store_dwordx2 v[1:2], off, s[96:99], s4 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x18,0x04]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x00,0xc1]
+buffer_store_dwordx2 v[1:2], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x00,0xf0]
+buffer_store_dwordx2 v[1:2], off, s[12:15], s101 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x65]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x00,0x00,0xf7]
+buffer_store_dwordx2 v[1:2], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_store_dwordx3 v[0:2], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x78,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx2 v[1:2], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_store_dwordx3 v[0:2], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x78,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx2 v[1:2], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x78,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx2 v[1:2], off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0xf0]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x78,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx2 v[1:2], off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x74,0xe0,0x00,0x01,0x03,0xf7]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x78,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx2 v[1:2], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x74,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x78,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx2 v[1:2], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x74,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx3 v[0:2], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x7a,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx2 v[1:2], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x74,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x74,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx4 v[252:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0xfc,0x00,0x00]
+buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x74,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx4 v[0:3], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x01,0x00]
+buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x74,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx4 v[0:3], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x18,0x00]
+buffer_store_dwordx2 v[1:2], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x76,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx4 v[0:3], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x1e,0x00]
+buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x00,0x65]
+buffer_store_dwordx3 v[253:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0xfd,0x03,0x04]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x00,0x7c]
+buffer_store_dwordx3 v[1:3], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x04,0x04]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x00,0x80]
+buffer_store_dwordx3 v[1:3], off, s[96:99], s4 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x18,0x04]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x00,0xc1]
+buffer_store_dwordx3 v[1:3], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x1e,0x04]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], 0.5 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x00,0xf0]
+buffer_store_dwordx3 v[1:3], off, s[12:15], s101 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x03,0x65]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], -4.0 offset:4095
-// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x00,0x00,0xf7]
+buffer_store_dwordx3 v[1:3], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x03,0x7c]
 
-buffer_store_dwordx4 v[0:3], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x7c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx3 v[1:3], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x03,0x80]
 
-buffer_store_dwordx4 v[0:3], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x7c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx3 v[1:3], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x03,0xc1]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x7c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx3 v[1:3], off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x03,0xf0]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x7c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx3 v[1:3], off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x78,0xe0,0x00,0x01,0x03,0xf7]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x7c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx3 v[1:3], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x78,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x7c,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx3 v[1:3], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x78,0xe0,0x00,0x01,0x03,0x04]
 
-buffer_store_dwordx4 v[0:3], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x7e,0xe0,0x00,0x00,0x00,0x00]
+buffer_store_dwordx3 v[1:3], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x78,0xe0,0x00,0x01,0x03,0x04]
+
+buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x78,0xe0,0x00,0x01,0x03,0x04]
+
+buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x78,0xe0,0x00,0x01,0x03,0x04]
+
+buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x78,0xe0,0x00,0x01,0x03,0x04]
+
+buffer_store_dwordx3 v[1:3], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x7a,0xe0,0x00,0x01,0x03,0x04]
+
+buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x03,0x04]
+
+buffer_store_dwordx4 v[252:255], off, s[12:15], s4 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0xfc,0x03,0x04]
+
+buffer_store_dwordx4 v[1:4], off, s[16:19], s4 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x04,0x04]
+
+buffer_store_dwordx4 v[1:4], off, s[96:99], s4 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x18,0x04]
+
+buffer_store_dwordx4 v[1:4], off, ttmp[8:11], s4 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x1e,0x04]
+
+buffer_store_dwordx4 v[1:4], off, s[12:15], s101 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x03,0x65]
+
+buffer_store_dwordx4 v[1:4], off, s[12:15], m0 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x03,0x7c]
+
+buffer_store_dwordx4 v[1:4], off, s[12:15], 0 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x03,0x80]
+
+buffer_store_dwordx4 v[1:4], off, s[12:15], -1 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x03,0xc1]
+
+buffer_store_dwordx4 v[1:4], off, s[12:15], 0.5 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x03,0xf0]
+
+buffer_store_dwordx4 v[1:4], off, s[12:15], -4.0 offset:4095
+// CHECK: [0xff,0x0f,0x7c,0xe0,0x00,0x01,0x03,0xf7]
+
+buffer_store_dwordx4 v[1:4], v0, s[12:15], s4 idxen offset:4095
+// CHECK: [0xff,0x2f,0x7c,0xe0,0x00,0x01,0x03,0x04]
+
+buffer_store_dwordx4 v[1:4], v0, s[12:15], s4 offen offset:4095
+// CHECK: [0xff,0x1f,0x7c,0xe0,0x00,0x01,0x03,0x04]
+
+buffer_store_dwordx4 v[1:4], off, s[12:15], s4
+// CHECK: [0x00,0x00,0x7c,0xe0,0x00,0x01,0x03,0x04]
+
+buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:0
+// CHECK: [0x00,0x00,0x7c,0xe0,0x00,0x01,0x03,0x04]
+
+buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:7
+// CHECK: [0x07,0x00,0x7c,0xe0,0x00,0x01,0x03,0x04]
+
+buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:4095 glc
+// CHECK: [0xff,0x4f,0x7c,0xe0,0x00,0x01,0x03,0x04]
+
+buffer_store_dwordx4 v[1:4], off, s[12:15], s4 offset:4095 slc
+// CHECK: [0xff,0x0f,0x7e,0xe0,0x00,0x01,0x03,0x04]
 
 buffer_wbinvl1
 // CHECK: [0x00,0x00,0xf8,0xe0,0x00,0x00,0x00,0x00]
@@ -7737,2231 +7898,2231 @@ buffer_wbinvl1
 buffer_wbinvl1_vol
 // CHECK: [0x00,0x00,0xfc,0xe0,0x00,0x00,0x00,0x00]
 
-buffer_atomic_swap v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_swap v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe1,0x00,0xff,0x00,0x00]
+buffer_atomic_swap v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe1,0x00,0xff,0x02,0x03]
 
-buffer_atomic_swap v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_swap v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_swap v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_swap v5, off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_swap v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_swap v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_swap v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_swap v5, off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_swap v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_swap v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_swap v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_swap v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_swap v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x00,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_swap v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x00,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_swap v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x00,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x00,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_swap v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x00,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x00,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_swap v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x00,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_swap v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x00,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x00,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_swap v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x00,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x00,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_swap v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x00,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x00,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_swap v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x02,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x02,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_cmpswap v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_cmpswap v[254:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe1,0x00,0xfe,0x02,0x03]
 
-buffer_atomic_cmpswap v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_cmpswap v[5:6], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_cmpswap v[0:1], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_cmpswap v[5:6], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_cmpswap v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_cmpswap v[5:6], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_cmpswap v[5:6], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_cmpswap v[5:6], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_cmpswap v[5:6], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x04,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_cmpswap v[5:6], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x04,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_cmpswap v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x04,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap v[5:6], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x04,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_cmpswap v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x04,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap v[5:6], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x04,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x04,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap v[5:6], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x04,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x04,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x04,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x04,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x04,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x04,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x04,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_cmpswap v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x06,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap v[5:6], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x06,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_add v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_add v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe1,0x00,0xff,0x00,0x00]
+buffer_atomic_add v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe1,0x00,0xff,0x02,0x03]
 
-buffer_atomic_add v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_add v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_add v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_add v5, off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_add v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_add v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_add v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_add v5, off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_add v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_add v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_add v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_add v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_add v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x08,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_add v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x08,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_add v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x08,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x08,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_add v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x08,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x08,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_add v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x08,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x08,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_add v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x08,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x08,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_add v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x08,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x08,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_add v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x08,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x08,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_add v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x0a,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x0a,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_sub v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_sub v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe1,0x00,0xff,0x00,0x00]
+buffer_atomic_sub v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe1,0x00,0xff,0x02,0x03]
 
-buffer_atomic_sub v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_sub v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_sub v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_sub v5, off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_sub v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_sub v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_sub v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_sub v5, off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_sub v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_sub v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_sub v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_sub v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_sub v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x0c,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_sub v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x0c,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_sub v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x0c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x0c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_sub v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x0c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x0c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_sub v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x0c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x0c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_sub v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x0c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x0c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_sub v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x0c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x0c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_sub v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x0c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x0c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_sub v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x0e,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x0e,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smin v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smin v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe1,0x00,0xff,0x00,0x00]
+buffer_atomic_smin v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe1,0x00,0xff,0x02,0x03]
 
-buffer_atomic_smin v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_smin v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_smin v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_smin v5, off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_smin v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_smin v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_smin v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_smin v5, off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_smin v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_smin v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_smin v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_smin v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_smin v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x10,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_smin v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x10,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_smin v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x10,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x10,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smin v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x10,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x10,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smin v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x10,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x10,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smin v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x10,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x10,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smin v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x10,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x10,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smin v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x10,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x10,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smin v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x12,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x12,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umin v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umin v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe1,0x00,0xff,0x00,0x00]
+buffer_atomic_umin v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe1,0x00,0xff,0x02,0x03]
 
-buffer_atomic_umin v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_umin v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_umin v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_umin v5, off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_umin v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_umin v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_umin v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_umin v5, off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_umin v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_umin v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_umin v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_umin v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_umin v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x14,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_umin v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x14,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_umin v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x14,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x14,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umin v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x14,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x14,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umin v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x14,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x14,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umin v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x14,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x14,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umin v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x14,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x14,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umin v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x14,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x14,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umin v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x16,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x16,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smax v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smax v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe1,0x00,0xff,0x00,0x00]
+buffer_atomic_smax v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe1,0x00,0xff,0x02,0x03]
 
-buffer_atomic_smax v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_smax v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_smax v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_smax v5, off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_smax v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_smax v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_smax v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_smax v5, off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_smax v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_smax v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_smax v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_smax v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_smax v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x18,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_smax v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x18,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_smax v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x18,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x18,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smax v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x18,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x18,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smax v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x18,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x18,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smax v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x18,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x18,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smax v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x18,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x18,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smax v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x18,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x18,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smax v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x1a,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x1a,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umax v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umax v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe1,0x00,0xff,0x00,0x00]
+buffer_atomic_umax v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe1,0x00,0xff,0x02,0x03]
 
-buffer_atomic_umax v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_umax v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_umax v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_umax v5, off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_umax v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_umax v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_umax v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_umax v5, off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_umax v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_umax v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_umax v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_umax v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_umax v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x1c,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_umax v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x1c,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_umax v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x1c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x1c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umax v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x1c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x1c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umax v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x1c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x1c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umax v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x1c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x1c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umax v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x1c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x1c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umax v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x1c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x1c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umax v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x1e,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x1e,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_and v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_and v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe1,0x00,0xff,0x00,0x00]
+buffer_atomic_and v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe1,0x00,0xff,0x02,0x03]
 
-buffer_atomic_and v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_and v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_and v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_and v5, off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_and v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_and v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_and v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_and v5, off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_and v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_and v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_and v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_and v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_and v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x20,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_and v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x20,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_and v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x20,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x20,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_and v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x20,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x20,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_and v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x20,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x20,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_and v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x20,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x20,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_and v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x20,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x20,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_and v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x20,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x20,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_and v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x22,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x22,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_or v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_or v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe1,0x00,0xff,0x00,0x00]
+buffer_atomic_or v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe1,0x00,0xff,0x02,0x03]
 
-buffer_atomic_or v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_or v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_or v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_or v5, off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_or v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_or v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_or v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_or v5, off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_or v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_or v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_or v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_or v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_or v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x24,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_or v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x24,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_or v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x24,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x24,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_or v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x24,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x24,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_or v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x24,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x24,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_or v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x24,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x24,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_or v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x24,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x24,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_or v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x24,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x24,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_or v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x26,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x26,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_xor v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_xor v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe1,0x00,0xff,0x00,0x00]
+buffer_atomic_xor v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe1,0x00,0xff,0x02,0x03]
 
-buffer_atomic_xor v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_xor v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_xor v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_xor v5, off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_xor v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_xor v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_xor v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_xor v5, off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_xor v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_xor v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_xor v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_xor v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_xor v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x28,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_xor v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x28,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_xor v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x28,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x28,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_xor v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x28,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x28,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_xor v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x28,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x28,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_xor v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x28,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x28,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_xor v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x28,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x28,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_xor v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x28,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x28,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_xor v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x2a,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x2a,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_inc v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_inc v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe1,0x00,0xff,0x00,0x00]
+buffer_atomic_inc v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe1,0x00,0xff,0x02,0x03]
 
-buffer_atomic_inc v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_inc v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_inc v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_inc v5, off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_inc v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_inc v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_inc v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_inc v5, off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_inc v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_inc v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_inc v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_inc v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_inc v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x2c,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_inc v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x2c,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_inc v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x2c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x2c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_inc v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x2c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x2c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_inc v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x2c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x2c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_inc v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x2c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x2c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_inc v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x2c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x2c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_inc v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x2c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x2c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_inc v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x2e,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x2e,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_dec v0, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec v5, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_dec v255, off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe1,0x00,0xff,0x00,0x00]
+buffer_atomic_dec v255, off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe1,0x00,0xff,0x02,0x03]
 
-buffer_atomic_dec v0, off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_dec v5, off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_dec v0, off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_dec v5, off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_dec v0, off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_dec v5, off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_dec v0, off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_dec v5, off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_dec v0, off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_dec v5, off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_dec v0, off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_dec v5, off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_dec v0, off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x30,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_dec v5, off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x30,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_dec v0, v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x30,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec v5, v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x30,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_dec v0, v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x30,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec v5, v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x30,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_dec v0, off, s[0:3], s0
-// CHECK: [0x00,0x00,0x30,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec v5, off, s[8:11], s3
+// CHECK: [0x00,0x00,0x30,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_dec v0, off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x30,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec v5, off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x30,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_dec v0, off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x30,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec v5, off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x30,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_dec v0, off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x30,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec v5, off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x30,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_dec v0, off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x32,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec v5, off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x32,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x80,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x80,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_swap_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x80,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_swap_x2 v[254:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x80,0xe1,0x00,0xfe,0x02,0x03]
 
-buffer_atomic_swap_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x80,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_swap_x2 v[5:6], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x80,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_swap_x2 v[0:1], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x80,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_swap_x2 v[5:6], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x80,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_swap_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x80,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_swap_x2 v[5:6], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x80,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x80,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_swap_x2 v[5:6], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x80,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x80,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_swap_x2 v[5:6], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x80,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x80,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_swap_x2 v[5:6], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x80,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x80,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_swap_x2 v[5:6], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x80,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_swap_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x80,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x80,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_swap_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x80,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x80,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x80,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x80,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x80,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x80,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x80,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x80,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x80,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x80,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_swap_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x82,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_swap_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x82,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x84,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x84,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_cmpswap_x2 v[252:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x84,0xe1,0x00,0xfc,0x00,0x00]
+buffer_atomic_cmpswap_x2 v[252:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x84,0xe1,0x00,0xfc,0x02,0x03]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x84,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_cmpswap_x2 v[5:8], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x84,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x84,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_cmpswap_x2 v[5:8], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x84,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x84,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_cmpswap_x2 v[5:8], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x84,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x84,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x84,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x84,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x84,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x84,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x84,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x84,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x84,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_cmpswap_x2 v[0:3], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x84,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap_x2 v[5:8], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x84,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_cmpswap_x2 v[0:3], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x84,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap_x2 v[5:8], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x84,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x84,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x84,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x84,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x84,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x84,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x84,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x84,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x84,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_cmpswap_x2 v[0:3], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x86,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_cmpswap_x2 v[5:8], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x86,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x88,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x88,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_add_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x88,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_add_x2 v[254:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x88,0xe1,0x00,0xfe,0x02,0x03]
 
-buffer_atomic_add_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x88,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_add_x2 v[5:6], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x88,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_add_x2 v[0:1], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x88,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_add_x2 v[5:6], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x88,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_add_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x88,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_add_x2 v[5:6], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x88,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x88,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_add_x2 v[5:6], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x88,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x88,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_add_x2 v[5:6], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x88,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x88,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_add_x2 v[5:6], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x88,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x88,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_add_x2 v[5:6], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x88,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_add_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x88,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x88,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_add_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x88,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x88,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x88,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add_x2 v[5:6], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x88,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x88,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x88,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x88,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x88,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x88,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x88,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_add_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x8a,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_add_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x8a,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x8c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x8c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_sub_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x8c,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_sub_x2 v[254:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x8c,0xe1,0x00,0xfe,0x02,0x03]
 
-buffer_atomic_sub_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x8c,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_sub_x2 v[5:6], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x8c,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_sub_x2 v[0:1], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x8c,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_sub_x2 v[5:6], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x8c,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_sub_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x8c,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_sub_x2 v[5:6], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x8c,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x8c,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_sub_x2 v[5:6], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x8c,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x8c,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_sub_x2 v[5:6], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x8c,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x8c,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_sub_x2 v[5:6], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x8c,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x8c,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_sub_x2 v[5:6], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x8c,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_sub_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x8c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x8c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_sub_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x8c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x8c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x8c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x8c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x8c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x8c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x8c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x8c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x8c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x8c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_sub_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x8e,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_sub_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x8e,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x90,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x90,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smin_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x90,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_smin_x2 v[254:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x90,0xe1,0x00,0xfe,0x02,0x03]
 
-buffer_atomic_smin_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x90,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_smin_x2 v[5:6], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x90,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_smin_x2 v[0:1], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x90,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_smin_x2 v[5:6], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x90,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_smin_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x90,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_smin_x2 v[5:6], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x90,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x90,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_smin_x2 v[5:6], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x90,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x90,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_smin_x2 v[5:6], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x90,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x90,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_smin_x2 v[5:6], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x90,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x90,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_smin_x2 v[5:6], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x90,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_smin_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x90,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x90,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smin_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x90,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x90,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x90,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x90,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x90,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x90,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x90,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x90,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x90,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x90,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smin_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x92,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smin_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x92,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x94,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x94,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umin_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x94,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_umin_x2 v[254:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x94,0xe1,0x00,0xfe,0x02,0x03]
 
-buffer_atomic_umin_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x94,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_umin_x2 v[5:6], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x94,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_umin_x2 v[0:1], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x94,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_umin_x2 v[5:6], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x94,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_umin_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x94,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_umin_x2 v[5:6], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x94,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x94,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_umin_x2 v[5:6], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x94,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x94,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_umin_x2 v[5:6], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x94,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x94,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_umin_x2 v[5:6], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x94,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x94,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_umin_x2 v[5:6], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x94,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_umin_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x94,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x94,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umin_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x94,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x94,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x94,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x94,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x94,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x94,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x94,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x94,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x94,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x94,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umin_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x96,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umin_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x96,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x98,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x98,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smax_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x98,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_smax_x2 v[254:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x98,0xe1,0x00,0xfe,0x02,0x03]
 
-buffer_atomic_smax_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x98,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_smax_x2 v[5:6], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x98,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_smax_x2 v[0:1], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x98,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_smax_x2 v[5:6], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x98,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_smax_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x98,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_smax_x2 v[5:6], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x98,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x98,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_smax_x2 v[5:6], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x98,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x98,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_smax_x2 v[5:6], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x98,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x98,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_smax_x2 v[5:6], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x98,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x98,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_smax_x2 v[5:6], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x98,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_smax_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x98,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x98,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smax_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x98,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x98,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x98,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x98,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x98,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x98,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x98,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x98,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x98,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x98,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_smax_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x9a,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_smax_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x9a,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x9c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x9c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umax_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0x9c,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_umax_x2 v[254:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x9c,0xe1,0x00,0xfe,0x02,0x03]
 
-buffer_atomic_umax_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0x9c,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_umax_x2 v[5:6], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0x9c,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_umax_x2 v[0:1], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0x9c,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_umax_x2 v[5:6], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0x9c,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_umax_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0x9c,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_umax_x2 v[5:6], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0x9c,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0x9c,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_umax_x2 v[5:6], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0x9c,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0x9c,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_umax_x2 v[5:6], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0x9c,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0x9c,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_umax_x2 v[5:6], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0x9c,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0x9c,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_umax_x2 v[5:6], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0x9c,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_umax_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0x9c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0x9c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umax_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0x9c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0x9c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0x9c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3
+// CHECK: [0x00,0x00,0x9c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0x9c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0x9c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0x9c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0x9c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0x9c,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0x9c,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_umax_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0x9e,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_umax_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0x9e,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xa0,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0xa0,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_and_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xa0,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_and_x2 v[254:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0xa0,0xe1,0x00,0xfe,0x02,0x03]
 
-buffer_atomic_and_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0xa0,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_and_x2 v[5:6], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0xa0,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_and_x2 v[0:1], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0xa0,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_and_x2 v[5:6], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0xa0,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_and_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0xa0,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_and_x2 v[5:6], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0xa0,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0xa0,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_and_x2 v[5:6], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0xa0,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0xa0,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_and_x2 v[5:6], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0xa0,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0xa0,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_and_x2 v[5:6], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0xa0,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0xa0,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_and_x2 v[5:6], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0xa0,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_and_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0xa0,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0xa0,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_and_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0xa0,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0xa0,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0xa0,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and_x2 v[5:6], off, s[8:11], s3
+// CHECK: [0x00,0x00,0xa0,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0xa0,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0xa0,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0xa0,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0xa0,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0xa0,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0xa0,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_and_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0xa2,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_and_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0xa2,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xa4,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0xa4,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_or_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xa4,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_or_x2 v[254:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0xa4,0xe1,0x00,0xfe,0x02,0x03]
 
-buffer_atomic_or_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0xa4,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_or_x2 v[5:6], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0xa4,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_or_x2 v[0:1], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0xa4,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_or_x2 v[5:6], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0xa4,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_or_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0xa4,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_or_x2 v[5:6], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0xa4,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0xa4,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_or_x2 v[5:6], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0xa4,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0xa4,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_or_x2 v[5:6], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0xa4,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0xa4,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_or_x2 v[5:6], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0xa4,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0xa4,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_or_x2 v[5:6], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0xa4,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_or_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0xa4,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0xa4,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_or_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0xa4,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0xa4,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0xa4,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or_x2 v[5:6], off, s[8:11], s3
+// CHECK: [0x00,0x00,0xa4,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0xa4,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0xa4,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0xa4,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0xa4,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0xa4,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0xa4,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_or_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0xa6,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_or_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0xa6,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xa8,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0xa8,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_xor_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xa8,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_xor_x2 v[254:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0xa8,0xe1,0x00,0xfe,0x02,0x03]
 
-buffer_atomic_xor_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0xa8,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_xor_x2 v[5:6], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0xa8,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_xor_x2 v[0:1], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0xa8,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_xor_x2 v[5:6], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0xa8,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_xor_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0xa8,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_xor_x2 v[5:6], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0xa8,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0xa8,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_xor_x2 v[5:6], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0xa8,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0xa8,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_xor_x2 v[5:6], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0xa8,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0xa8,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_xor_x2 v[5:6], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0xa8,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0xa8,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_xor_x2 v[5:6], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0xa8,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_xor_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0xa8,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0xa8,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_xor_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0xa8,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0xa8,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0xa8,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3
+// CHECK: [0x00,0x00,0xa8,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0xa8,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0xa8,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0xa8,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0xa8,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0xa8,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0xa8,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_xor_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0xaa,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_xor_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0xaa,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xac,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0xac,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_inc_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xac,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_inc_x2 v[254:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0xac,0xe1,0x00,0xfe,0x02,0x03]
 
-buffer_atomic_inc_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0xac,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_inc_x2 v[5:6], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0xac,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_inc_x2 v[0:1], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0xac,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_inc_x2 v[5:6], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0xac,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_inc_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0xac,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_inc_x2 v[5:6], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0xac,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0xac,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_inc_x2 v[5:6], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0xac,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0xac,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_inc_x2 v[5:6], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0xac,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0xac,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_inc_x2 v[5:6], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0xac,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0xac,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_inc_x2 v[5:6], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0xac,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_inc_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0xac,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0xac,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_inc_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0xac,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0xac,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0xac,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3
+// CHECK: [0x00,0x00,0xac,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0xac,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0xac,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0xac,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0xac,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0xac,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0xac,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_inc_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0xae,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_inc_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0xae,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xb0,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0xb0,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_dec_x2 v[254:255], off, s[0:3], s0 offset:4095
-// CHECK: [0xff,0x0f,0xb0,0xe1,0x00,0xfe,0x00,0x00]
+buffer_atomic_dec_x2 v[254:255], off, s[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0xb0,0xe1,0x00,0xfe,0x02,0x03]
 
-buffer_atomic_dec_x2 v[0:1], off, s[4:7], s0 offset:4095
-// CHECK: [0xff,0x0f,0xb0,0xe1,0x00,0x00,0x01,0x00]
+buffer_atomic_dec_x2 v[5:6], off, s[12:15], s3 offset:4095
+// CHECK: [0xff,0x0f,0xb0,0xe1,0x00,0x05,0x03,0x03]
 
-buffer_atomic_dec_x2 v[0:1], off, s[96:99], s0 offset:4095
-// CHECK: [0xff,0x0f,0xb0,0xe1,0x00,0x00,0x18,0x00]
+buffer_atomic_dec_x2 v[5:6], off, s[96:99], s3 offset:4095
+// CHECK: [0xff,0x0f,0xb0,0xe1,0x00,0x05,0x18,0x03]
 
-buffer_atomic_dec_x2 v[0:1], off, ttmp[8:11], s0 offset:4095
-// CHECK: [0xff,0x0f,0xb0,0xe1,0x00,0x00,0x1e,0x00]
+buffer_atomic_dec_x2 v[5:6], off, ttmp[8:11], s3 offset:4095
+// CHECK: [0xff,0x0f,0xb0,0xe1,0x00,0x05,0x1e,0x03]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], s101 offset:4095
-// CHECK: [0xff,0x0f,0xb0,0xe1,0x00,0x00,0x00,0x65]
+buffer_atomic_dec_x2 v[5:6], off, s[8:11], s101 offset:4095
+// CHECK: [0xff,0x0f,0xb0,0xe1,0x00,0x05,0x02,0x65]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], m0 offset:4095
-// CHECK: [0xff,0x0f,0xb0,0xe1,0x00,0x00,0x00,0x7c]
+buffer_atomic_dec_x2 v[5:6], off, s[8:11], m0 offset:4095
+// CHECK: [0xff,0x0f,0xb0,0xe1,0x00,0x05,0x02,0x7c]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], 0 offset:4095
-// CHECK: [0xff,0x0f,0xb0,0xe1,0x00,0x00,0x00,0x80]
+buffer_atomic_dec_x2 v[5:6], off, s[8:11], 0 offset:4095
+// CHECK: [0xff,0x0f,0xb0,0xe1,0x00,0x05,0x02,0x80]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], -1 offset:4095
-// CHECK: [0xff,0x0f,0xb0,0xe1,0x00,0x00,0x00,0xc1]
+buffer_atomic_dec_x2 v[5:6], off, s[8:11], -1 offset:4095
+// CHECK: [0xff,0x0f,0xb0,0xe1,0x00,0x05,0x02,0xc1]
 
-buffer_atomic_dec_x2 v[0:1], v0, s[0:3], s0 idxen offset:4095
-// CHECK: [0xff,0x2f,0xb0,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec_x2 v[5:6], v0, s[8:11], s3 idxen offset:4095
+// CHECK: [0xff,0x2f,0xb0,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_dec_x2 v[0:1], v0, s[0:3], s0 offen offset:4095
-// CHECK: [0xff,0x1f,0xb0,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec_x2 v[5:6], v0, s[8:11], s3 offen offset:4095
+// CHECK: [0xff,0x1f,0xb0,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], s0
-// CHECK: [0x00,0x00,0xb0,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3
+// CHECK: [0x00,0x00,0xb0,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], s0 offset:0
-// CHECK: [0x00,0x00,0xb0,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:0
+// CHECK: [0x00,0x00,0xb0,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], s0 offset:7
-// CHECK: [0x07,0x00,0xb0,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:7
+// CHECK: [0x07,0x00,0xb0,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], s0 offset:4095 glc
-// CHECK: [0xff,0x4f,0xb0,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:4095 glc
+// CHECK: [0xff,0x4f,0xb0,0xe1,0x00,0x05,0x02,0x03]
 
-buffer_atomic_dec_x2 v[0:1], off, s[0:3], s0 offset:4095 slc
-// CHECK: [0xff,0x0f,0xb2,0xe1,0x00,0x00,0x00,0x00]
+buffer_atomic_dec_x2 v[5:6], off, s[8:11], s3 offset:4095 slc
+// CHECK: [0xff,0x0f,0xb2,0xe1,0x00,0x05,0x02,0x03]
 
-s_load_dword s0, s[0:1], s0
-// CHECK: [0x00,0x00,0x00,0xc0,0x00,0x00,0x00,0x00]
+s_load_dword s5, s[2:3], s2
+// CHECK: [0x41,0x01,0x00,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dword s101, s[0:1], s0
-// CHECK: [0x40,0x19,0x00,0xc0,0x00,0x00,0x00,0x00]
+s_load_dword s101, s[2:3], s2
+// CHECK: [0x41,0x19,0x00,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dword flat_scratch_lo, s[0:1], s0
-// CHECK: [0x80,0x19,0x00,0xc0,0x00,0x00,0x00,0x00]
+s_load_dword flat_scratch_lo, s[2:3], s2
+// CHECK: [0x81,0x19,0x00,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dword flat_scratch_hi, s[0:1], s0
-// CHECK: [0xc0,0x19,0x00,0xc0,0x00,0x00,0x00,0x00]
+s_load_dword flat_scratch_hi, s[2:3], s2
+// CHECK: [0xc1,0x19,0x00,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dword vcc_lo, s[0:1], s0
-// CHECK: [0x80,0x1a,0x00,0xc0,0x00,0x00,0x00,0x00]
+s_load_dword vcc_lo, s[2:3], s2
+// CHECK: [0x81,0x1a,0x00,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dword vcc_hi, s[0:1], s0
-// CHECK: [0xc0,0x1a,0x00,0xc0,0x00,0x00,0x00,0x00]
+s_load_dword vcc_hi, s[2:3], s2
+// CHECK: [0xc1,0x1a,0x00,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dword tba_lo, s[0:1], s0
-// CHECK: [0x00,0x1b,0x00,0xc0,0x00,0x00,0x00,0x00]
+s_load_dword tba_lo, s[2:3], s2
+// CHECK: [0x01,0x1b,0x00,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dword tba_hi, s[0:1], s0
-// CHECK: [0x40,0x1b,0x00,0xc0,0x00,0x00,0x00,0x00]
+s_load_dword tba_hi, s[2:3], s2
+// CHECK: [0x41,0x1b,0x00,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dword tma_lo, s[0:1], s0
-// CHECK: [0x80,0x1b,0x00,0xc0,0x00,0x00,0x00,0x00]
+s_load_dword tma_lo, s[2:3], s2
+// CHECK: [0x81,0x1b,0x00,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dword tma_hi, s[0:1], s0
-// CHECK: [0xc0,0x1b,0x00,0xc0,0x00,0x00,0x00,0x00]
+s_load_dword tma_hi, s[2:3], s2
+// CHECK: [0xc1,0x1b,0x00,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dword ttmp11, s[0:1], s0
-// CHECK: [0xc0,0x1e,0x00,0xc0,0x00,0x00,0x00,0x00]
+s_load_dword ttmp11, s[2:3], s2
+// CHECK: [0xc1,0x1e,0x00,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dword s0, s[2:3], s0
-// CHECK: [0x01,0x00,0x00,0xc0,0x00,0x00,0x00,0x00]
+s_load_dword s5, s[4:5], s2
+// CHECK: [0x42,0x01,0x00,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dword s0, s[100:101], s0
-// CHECK: [0x32,0x00,0x00,0xc0,0x00,0x00,0x00,0x00]
+s_load_dword s5, s[100:101], s2
+// CHECK: [0x72,0x01,0x00,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dword s0, flat_scratch, s0
-// CHECK: [0x33,0x00,0x00,0xc0,0x00,0x00,0x00,0x00]
+s_load_dword s5, flat_scratch, s2
+// CHECK: [0x73,0x01,0x00,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dword s0, vcc, s0
-// CHECK: [0x35,0x00,0x00,0xc0,0x00,0x00,0x00,0x00]
+s_load_dword s5, vcc, s2
+// CHECK: [0x75,0x01,0x00,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dword s0, tba, s0
-// CHECK: [0x36,0x00,0x00,0xc0,0x00,0x00,0x00,0x00]
+s_load_dword s5, tba, s2
+// CHECK: [0x76,0x01,0x00,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dword s0, tma, s0
-// CHECK: [0x37,0x00,0x00,0xc0,0x00,0x00,0x00,0x00]
+s_load_dword s5, tma, s2
+// CHECK: [0x77,0x01,0x00,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dword s0, ttmp[10:11], s0
-// CHECK: [0x3d,0x00,0x00,0xc0,0x00,0x00,0x00,0x00]
+s_load_dword s5, ttmp[10:11], s2
+// CHECK: [0x7d,0x01,0x00,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dword s0, s[0:1], s101
-// CHECK: [0x00,0x00,0x00,0xc0,0x65,0x00,0x00,0x00]
+s_load_dword s5, s[2:3], s101
+// CHECK: [0x41,0x01,0x00,0xc0,0x65,0x00,0x00,0x00]
 
-s_load_dword s0, s[0:1], flat_scratch_lo
-// CHECK: [0x00,0x00,0x00,0xc0,0x66,0x00,0x00,0x00]
+s_load_dword s5, s[2:3], flat_scratch_lo
+// CHECK: [0x41,0x01,0x00,0xc0,0x66,0x00,0x00,0x00]
 
-s_load_dword s0, s[0:1], flat_scratch_hi
-// CHECK: [0x00,0x00,0x00,0xc0,0x67,0x00,0x00,0x00]
+s_load_dword s5, s[2:3], flat_scratch_hi
+// CHECK: [0x41,0x01,0x00,0xc0,0x67,0x00,0x00,0x00]
 
-s_load_dword s0, s[0:1], vcc_lo
-// CHECK: [0x00,0x00,0x00,0xc0,0x6a,0x00,0x00,0x00]
+s_load_dword s5, s[2:3], vcc_lo
+// CHECK: [0x41,0x01,0x00,0xc0,0x6a,0x00,0x00,0x00]
 
-s_load_dword s0, s[0:1], vcc_hi
-// CHECK: [0x00,0x00,0x00,0xc0,0x6b,0x00,0x00,0x00]
+s_load_dword s5, s[2:3], vcc_hi
+// CHECK: [0x41,0x01,0x00,0xc0,0x6b,0x00,0x00,0x00]
 
-s_load_dword s0, s[0:1], tba_lo
-// CHECK: [0x00,0x00,0x00,0xc0,0x6c,0x00,0x00,0x00]
+s_load_dword s5, s[2:3], tba_lo
+// CHECK: [0x41,0x01,0x00,0xc0,0x6c,0x00,0x00,0x00]
 
-s_load_dword s0, s[0:1], tba_hi
-// CHECK: [0x00,0x00,0x00,0xc0,0x6d,0x00,0x00,0x00]
+s_load_dword s5, s[2:3], tba_hi
+// CHECK: [0x41,0x01,0x00,0xc0,0x6d,0x00,0x00,0x00]
 
-s_load_dword s0, s[0:1], tma_lo
-// CHECK: [0x00,0x00,0x00,0xc0,0x6e,0x00,0x00,0x00]
+s_load_dword s5, s[2:3], tma_lo
+// CHECK: [0x41,0x01,0x00,0xc0,0x6e,0x00,0x00,0x00]
 
-s_load_dword s0, s[0:1], tma_hi
-// CHECK: [0x00,0x00,0x00,0xc0,0x6f,0x00,0x00,0x00]
+s_load_dword s5, s[2:3], tma_hi
+// CHECK: [0x41,0x01,0x00,0xc0,0x6f,0x00,0x00,0x00]
 
-s_load_dword s0, s[0:1], ttmp11
-// CHECK: [0x00,0x00,0x00,0xc0,0x7b,0x00,0x00,0x00]
+s_load_dword s5, s[2:3], ttmp11
+// CHECK: [0x41,0x01,0x00,0xc0,0x7b,0x00,0x00,0x00]
 
-s_load_dword s0, s[0:1], m0
-// CHECK: [0x00,0x00,0x00,0xc0,0x7c,0x00,0x00,0x00]
+s_load_dword s5, s[2:3], m0
+// CHECK: [0x41,0x01,0x00,0xc0,0x7c,0x00,0x00,0x00]
 
-s_load_dword s0, s[0:1], 0x7ffff
-// CHECK: [0x00,0x00,0x02,0xc0,0xff,0xff,0x07,0x00]
+s_load_dword s5, s[2:3], 0x7ffff
+// CHECK: [0x41,0x01,0x02,0xc0,0xff,0xff,0x07,0x00]
 
-s_load_dword s0, s[0:1], s0 glc
-// CHECK: [0x00,0x00,0x01,0xc0,0x00,0x00,0x00,0x00]
+s_load_dword s5, s[2:3], s2 glc
+// CHECK: [0x41,0x01,0x01,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], s[0:1], s0
-// CHECK: [0x00,0x00,0x04,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], s[2:3], s2
+// CHECK: [0x81,0x02,0x04,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx2 s[2:3], s[0:1], s0
-// CHECK: [0x80,0x00,0x04,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx2 s[12:13], s[2:3], s2
+// CHECK: [0x01,0x03,0x04,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx2 s[100:101], s[0:1], s0
-// CHECK: [0x00,0x19,0x04,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx2 s[100:101], s[2:3], s2
+// CHECK: [0x01,0x19,0x04,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx2 flat_scratch, s[0:1], s0
-// CHECK: [0x80,0x19,0x04,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx2 flat_scratch, s[2:3], s2
+// CHECK: [0x81,0x19,0x04,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx2 vcc, s[0:1], s0
-// CHECK: [0x80,0x1a,0x04,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx2 vcc, s[2:3], s2
+// CHECK: [0x81,0x1a,0x04,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx2 tba, s[0:1], s0
-// CHECK: [0x00,0x1b,0x04,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx2 tba, s[2:3], s2
+// CHECK: [0x01,0x1b,0x04,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx2 tma, s[0:1], s0
-// CHECK: [0x80,0x1b,0x04,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx2 tma, s[2:3], s2
+// CHECK: [0x81,0x1b,0x04,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx2 ttmp[10:11], s[0:1], s0
-// CHECK: [0x80,0x1e,0x04,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx2 ttmp[10:11], s[2:3], s2
+// CHECK: [0x81,0x1e,0x04,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], s[2:3], s0
-// CHECK: [0x01,0x00,0x04,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], s[4:5], s2
+// CHECK: [0x82,0x02,0x04,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], s[100:101], s0
-// CHECK: [0x32,0x00,0x04,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], s[100:101], s2
+// CHECK: [0xb2,0x02,0x04,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], flat_scratch, s0
-// CHECK: [0x33,0x00,0x04,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], flat_scratch, s2
+// CHECK: [0xb3,0x02,0x04,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], vcc, s0
-// CHECK: [0x35,0x00,0x04,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], vcc, s2
+// CHECK: [0xb5,0x02,0x04,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], tba, s0
-// CHECK: [0x36,0x00,0x04,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], tba, s2
+// CHECK: [0xb6,0x02,0x04,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], tma, s0
-// CHECK: [0x37,0x00,0x04,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], tma, s2
+// CHECK: [0xb7,0x02,0x04,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], ttmp[10:11], s0
-// CHECK: [0x3d,0x00,0x04,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], ttmp[10:11], s2
+// CHECK: [0xbd,0x02,0x04,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], s[0:1], s101
-// CHECK: [0x00,0x00,0x04,0xc0,0x65,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], s[2:3], s101
+// CHECK: [0x81,0x02,0x04,0xc0,0x65,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], s[0:1], flat_scratch_lo
-// CHECK: [0x00,0x00,0x04,0xc0,0x66,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], s[2:3], flat_scratch_lo
+// CHECK: [0x81,0x02,0x04,0xc0,0x66,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], s[0:1], flat_scratch_hi
-// CHECK: [0x00,0x00,0x04,0xc0,0x67,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], s[2:3], flat_scratch_hi
+// CHECK: [0x81,0x02,0x04,0xc0,0x67,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], s[0:1], vcc_lo
-// CHECK: [0x00,0x00,0x04,0xc0,0x6a,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], s[2:3], vcc_lo
+// CHECK: [0x81,0x02,0x04,0xc0,0x6a,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], s[0:1], vcc_hi
-// CHECK: [0x00,0x00,0x04,0xc0,0x6b,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], s[2:3], vcc_hi
+// CHECK: [0x81,0x02,0x04,0xc0,0x6b,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], s[0:1], tba_lo
-// CHECK: [0x00,0x00,0x04,0xc0,0x6c,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], s[2:3], tba_lo
+// CHECK: [0x81,0x02,0x04,0xc0,0x6c,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], s[0:1], tba_hi
-// CHECK: [0x00,0x00,0x04,0xc0,0x6d,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], s[2:3], tba_hi
+// CHECK: [0x81,0x02,0x04,0xc0,0x6d,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], s[0:1], tma_lo
-// CHECK: [0x00,0x00,0x04,0xc0,0x6e,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], s[2:3], tma_lo
+// CHECK: [0x81,0x02,0x04,0xc0,0x6e,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], s[0:1], tma_hi
-// CHECK: [0x00,0x00,0x04,0xc0,0x6f,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], s[2:3], tma_hi
+// CHECK: [0x81,0x02,0x04,0xc0,0x6f,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], s[0:1], ttmp11
-// CHECK: [0x00,0x00,0x04,0xc0,0x7b,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], s[2:3], ttmp11
+// CHECK: [0x81,0x02,0x04,0xc0,0x7b,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], s[0:1], m0
-// CHECK: [0x00,0x00,0x04,0xc0,0x7c,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], s[2:3], m0
+// CHECK: [0x81,0x02,0x04,0xc0,0x7c,0x00,0x00,0x00]
 
-s_load_dwordx2 s[0:1], s[0:1], 0x7ffff
-// CHECK: [0x00,0x00,0x06,0xc0,0xff,0xff,0x07,0x00]
+s_load_dwordx2 s[10:11], s[2:3], 0x7ffff
+// CHECK: [0x81,0x02,0x06,0xc0,0xff,0xff,0x07,0x00]
 
-s_load_dwordx2 s[0:1], s[0:1], s0 glc
-// CHECK: [0x00,0x00,0x05,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx2 s[10:11], s[2:3], s2 glc
+// CHECK: [0x81,0x02,0x05,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], s[0:1], s0
-// CHECK: [0x00,0x00,0x08,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], s[2:3], s2
+// CHECK: [0x01,0x05,0x08,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx4 s[4:7], s[0:1], s0
-// CHECK: [0x00,0x01,0x08,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx4 s[24:27], s[2:3], s2
+// CHECK: [0x01,0x06,0x08,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx4 s[96:99], s[0:1], s0
-// CHECK: [0x00,0x18,0x08,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx4 s[96:99], s[2:3], s2
+// CHECK: [0x01,0x18,0x08,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx4 ttmp[8:11], s[0:1], s0
-// CHECK: [0x00,0x1e,0x08,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx4 ttmp[8:11], s[2:3], s2
+// CHECK: [0x01,0x1e,0x08,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], s[2:3], s0
-// CHECK: [0x01,0x00,0x08,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], s[4:5], s2
+// CHECK: [0x02,0x05,0x08,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], s[100:101], s0
-// CHECK: [0x32,0x00,0x08,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], s[100:101], s2
+// CHECK: [0x32,0x05,0x08,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], flat_scratch, s0
-// CHECK: [0x33,0x00,0x08,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], flat_scratch, s2
+// CHECK: [0x33,0x05,0x08,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], vcc, s0
-// CHECK: [0x35,0x00,0x08,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], vcc, s2
+// CHECK: [0x35,0x05,0x08,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], tba, s0
-// CHECK: [0x36,0x00,0x08,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], tba, s2
+// CHECK: [0x36,0x05,0x08,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], tma, s0
-// CHECK: [0x37,0x00,0x08,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], tma, s2
+// CHECK: [0x37,0x05,0x08,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], ttmp[10:11], s0
-// CHECK: [0x3d,0x00,0x08,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], ttmp[10:11], s2
+// CHECK: [0x3d,0x05,0x08,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], s[0:1], s101
-// CHECK: [0x00,0x00,0x08,0xc0,0x65,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], s[2:3], s101
+// CHECK: [0x01,0x05,0x08,0xc0,0x65,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], s[0:1], flat_scratch_lo
-// CHECK: [0x00,0x00,0x08,0xc0,0x66,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], s[2:3], flat_scratch_lo
+// CHECK: [0x01,0x05,0x08,0xc0,0x66,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], s[0:1], flat_scratch_hi
-// CHECK: [0x00,0x00,0x08,0xc0,0x67,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], s[2:3], flat_scratch_hi
+// CHECK: [0x01,0x05,0x08,0xc0,0x67,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], s[0:1], vcc_lo
-// CHECK: [0x00,0x00,0x08,0xc0,0x6a,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], s[2:3], vcc_lo
+// CHECK: [0x01,0x05,0x08,0xc0,0x6a,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], s[0:1], vcc_hi
-// CHECK: [0x00,0x00,0x08,0xc0,0x6b,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], s[2:3], vcc_hi
+// CHECK: [0x01,0x05,0x08,0xc0,0x6b,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], s[0:1], tba_lo
-// CHECK: [0x00,0x00,0x08,0xc0,0x6c,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], s[2:3], tba_lo
+// CHECK: [0x01,0x05,0x08,0xc0,0x6c,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], s[0:1], tba_hi
-// CHECK: [0x00,0x00,0x08,0xc0,0x6d,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], s[2:3], tba_hi
+// CHECK: [0x01,0x05,0x08,0xc0,0x6d,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], s[0:1], tma_lo
-// CHECK: [0x00,0x00,0x08,0xc0,0x6e,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], s[2:3], tma_lo
+// CHECK: [0x01,0x05,0x08,0xc0,0x6e,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], s[0:1], tma_hi
-// CHECK: [0x00,0x00,0x08,0xc0,0x6f,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], s[2:3], tma_hi
+// CHECK: [0x01,0x05,0x08,0xc0,0x6f,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], s[0:1], ttmp11
-// CHECK: [0x00,0x00,0x08,0xc0,0x7b,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], s[2:3], ttmp11
+// CHECK: [0x01,0x05,0x08,0xc0,0x7b,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], s[0:1], m0
-// CHECK: [0x00,0x00,0x08,0xc0,0x7c,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], s[2:3], m0
+// CHECK: [0x01,0x05,0x08,0xc0,0x7c,0x00,0x00,0x00]
 
-s_load_dwordx4 s[0:3], s[0:1], 0x7ffff
-// CHECK: [0x00,0x00,0x0a,0xc0,0xff,0xff,0x07,0x00]
+s_load_dwordx4 s[20:23], s[2:3], 0x7ffff
+// CHECK: [0x01,0x05,0x0a,0xc0,0xff,0xff,0x07,0x00]
 
-s_load_dwordx4 s[0:3], s[0:1], s0 glc
-// CHECK: [0x00,0x00,0x09,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx4 s[20:23], s[2:3], s2 glc
+// CHECK: [0x01,0x05,0x09,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], s[0:1], s0
-// CHECK: [0x00,0x00,0x0c,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], s[2:3], s2
+// CHECK: [0x01,0x05,0x0c,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx8 s[4:11], s[0:1], s0
-// CHECK: [0x00,0x01,0x0c,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx8 s[24:31], s[2:3], s2
+// CHECK: [0x01,0x06,0x0c,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx8 s[92:99], s[0:1], s0
-// CHECK: [0x00,0x17,0x0c,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx8 s[92:99], s[2:3], s2
+// CHECK: [0x01,0x17,0x0c,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], s[2:3], s0
-// CHECK: [0x01,0x00,0x0c,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], s[4:5], s2
+// CHECK: [0x02,0x05,0x0c,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], s[100:101], s0
-// CHECK: [0x32,0x00,0x0c,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], s[100:101], s2
+// CHECK: [0x32,0x05,0x0c,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], flat_scratch, s0
-// CHECK: [0x33,0x00,0x0c,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], flat_scratch, s2
+// CHECK: [0x33,0x05,0x0c,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], vcc, s0
-// CHECK: [0x35,0x00,0x0c,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], vcc, s2
+// CHECK: [0x35,0x05,0x0c,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], tba, s0
-// CHECK: [0x36,0x00,0x0c,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], tba, s2
+// CHECK: [0x36,0x05,0x0c,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], tma, s0
-// CHECK: [0x37,0x00,0x0c,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], tma, s2
+// CHECK: [0x37,0x05,0x0c,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], ttmp[10:11], s0
-// CHECK: [0x3d,0x00,0x0c,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], ttmp[10:11], s2
+// CHECK: [0x3d,0x05,0x0c,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], s[0:1], s101
-// CHECK: [0x00,0x00,0x0c,0xc0,0x65,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], s[2:3], s101
+// CHECK: [0x01,0x05,0x0c,0xc0,0x65,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], s[0:1], flat_scratch_lo
-// CHECK: [0x00,0x00,0x0c,0xc0,0x66,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], s[2:3], flat_scratch_lo
+// CHECK: [0x01,0x05,0x0c,0xc0,0x66,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], s[0:1], flat_scratch_hi
-// CHECK: [0x00,0x00,0x0c,0xc0,0x67,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], s[2:3], flat_scratch_hi
+// CHECK: [0x01,0x05,0x0c,0xc0,0x67,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], s[0:1], vcc_lo
-// CHECK: [0x00,0x00,0x0c,0xc0,0x6a,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], s[2:3], vcc_lo
+// CHECK: [0x01,0x05,0x0c,0xc0,0x6a,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], s[0:1], vcc_hi
-// CHECK: [0x00,0x00,0x0c,0xc0,0x6b,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], s[2:3], vcc_hi
+// CHECK: [0x01,0x05,0x0c,0xc0,0x6b,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], s[0:1], tba_lo
-// CHECK: [0x00,0x00,0x0c,0xc0,0x6c,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], s[2:3], tba_lo
+// CHECK: [0x01,0x05,0x0c,0xc0,0x6c,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], s[0:1], tba_hi
-// CHECK: [0x00,0x00,0x0c,0xc0,0x6d,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], s[2:3], tba_hi
+// CHECK: [0x01,0x05,0x0c,0xc0,0x6d,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], s[0:1], tma_lo
-// CHECK: [0x00,0x00,0x0c,0xc0,0x6e,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], s[2:3], tma_lo
+// CHECK: [0x01,0x05,0x0c,0xc0,0x6e,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], s[0:1], tma_hi
-// CHECK: [0x00,0x00,0x0c,0xc0,0x6f,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], s[2:3], tma_hi
+// CHECK: [0x01,0x05,0x0c,0xc0,0x6f,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], s[0:1], ttmp11
-// CHECK: [0x00,0x00,0x0c,0xc0,0x7b,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], s[2:3], ttmp11
+// CHECK: [0x01,0x05,0x0c,0xc0,0x7b,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], s[0:1], m0
-// CHECK: [0x00,0x00,0x0c,0xc0,0x7c,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], s[2:3], m0
+// CHECK: [0x01,0x05,0x0c,0xc0,0x7c,0x00,0x00,0x00]
 
-s_load_dwordx8 s[0:7], s[0:1], 0x7ffff
-// CHECK: [0x00,0x00,0x0e,0xc0,0xff,0xff,0x07,0x00]
+s_load_dwordx8 s[20:27], s[2:3], 0x7ffff
+// CHECK: [0x01,0x05,0x0e,0xc0,0xff,0xff,0x07,0x00]
 
-s_load_dwordx8 s[0:7], s[0:1], s0 glc
-// CHECK: [0x00,0x00,0x0d,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx8 s[20:27], s[2:3], s2 glc
+// CHECK: [0x01,0x05,0x0d,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], s[0:1], s0
-// CHECK: [0x00,0x00,0x10,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], s[2:3], s2
+// CHECK: [0x01,0x05,0x10,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx16 s[4:19], s[0:1], s0
-// CHECK: [0x00,0x01,0x10,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx16 s[24:39], s[2:3], s2
+// CHECK: [0x01,0x06,0x10,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx16 s[84:99], s[0:1], s0
-// CHECK: [0x00,0x15,0x10,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx16 s[84:99], s[2:3], s2
+// CHECK: [0x01,0x15,0x10,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], s[2:3], s0
-// CHECK: [0x01,0x00,0x10,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], s[4:5], s2
+// CHECK: [0x02,0x05,0x10,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], s[100:101], s0
-// CHECK: [0x32,0x00,0x10,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], s[100:101], s2
+// CHECK: [0x32,0x05,0x10,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], flat_scratch, s0
-// CHECK: [0x33,0x00,0x10,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], flat_scratch, s2
+// CHECK: [0x33,0x05,0x10,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], vcc, s0
-// CHECK: [0x35,0x00,0x10,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], vcc, s2
+// CHECK: [0x35,0x05,0x10,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], tba, s0
-// CHECK: [0x36,0x00,0x10,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], tba, s2
+// CHECK: [0x36,0x05,0x10,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], tma, s0
-// CHECK: [0x37,0x00,0x10,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], tma, s2
+// CHECK: [0x37,0x05,0x10,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], ttmp[10:11], s0
-// CHECK: [0x3d,0x00,0x10,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], ttmp[10:11], s2
+// CHECK: [0x3d,0x05,0x10,0xc0,0x02,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], s[0:1], s101
-// CHECK: [0x00,0x00,0x10,0xc0,0x65,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], s[2:3], s101
+// CHECK: [0x01,0x05,0x10,0xc0,0x65,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], s[0:1], flat_scratch_lo
-// CHECK: [0x00,0x00,0x10,0xc0,0x66,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], s[2:3], flat_scratch_lo
+// CHECK: [0x01,0x05,0x10,0xc0,0x66,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], s[0:1], flat_scratch_hi
-// CHECK: [0x00,0x00,0x10,0xc0,0x67,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], s[2:3], flat_scratch_hi
+// CHECK: [0x01,0x05,0x10,0xc0,0x67,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], s[0:1], vcc_lo
-// CHECK: [0x00,0x00,0x10,0xc0,0x6a,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], s[2:3], vcc_lo
+// CHECK: [0x01,0x05,0x10,0xc0,0x6a,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], s[0:1], vcc_hi
-// CHECK: [0x00,0x00,0x10,0xc0,0x6b,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], s[2:3], vcc_hi
+// CHECK: [0x01,0x05,0x10,0xc0,0x6b,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], s[0:1], tba_lo
-// CHECK: [0x00,0x00,0x10,0xc0,0x6c,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], s[2:3], tba_lo
+// CHECK: [0x01,0x05,0x10,0xc0,0x6c,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], s[0:1], tba_hi
-// CHECK: [0x00,0x00,0x10,0xc0,0x6d,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], s[2:3], tba_hi
+// CHECK: [0x01,0x05,0x10,0xc0,0x6d,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], s[0:1], tma_lo
-// CHECK: [0x00,0x00,0x10,0xc0,0x6e,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], s[2:3], tma_lo
+// CHECK: [0x01,0x05,0x10,0xc0,0x6e,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], s[0:1], tma_hi
-// CHECK: [0x00,0x00,0x10,0xc0,0x6f,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], s[2:3], tma_hi
+// CHECK: [0x01,0x05,0x10,0xc0,0x6f,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], s[0:1], ttmp11
-// CHECK: [0x00,0x00,0x10,0xc0,0x7b,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], s[2:3], ttmp11
+// CHECK: [0x01,0x05,0x10,0xc0,0x7b,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], s[0:1], m0
-// CHECK: [0x00,0x00,0x10,0xc0,0x7c,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], s[2:3], m0
+// CHECK: [0x01,0x05,0x10,0xc0,0x7c,0x00,0x00,0x00]
 
-s_load_dwordx16 s[0:15], s[0:1], 0x7ffff
-// CHECK: [0x00,0x00,0x12,0xc0,0xff,0xff,0x07,0x00]
+s_load_dwordx16 s[20:35], s[2:3], 0x7ffff
+// CHECK: [0x01,0x05,0x12,0xc0,0xff,0xff,0x07,0x00]
 
-s_load_dwordx16 s[0:15], s[0:1], s0 glc
-// CHECK: [0x00,0x00,0x11,0xc0,0x00,0x00,0x00,0x00]
+s_load_dwordx16 s[20:35], s[2:3], s2 glc
+// CHECK: [0x01,0x05,0x11,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dword s0, s[0:3], s0
-// CHECK: [0x00,0x00,0x20,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dword s5, s[4:7], s2
+// CHECK: [0x42,0x01,0x20,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dword s101, s[0:3], s0
-// CHECK: [0x40,0x19,0x20,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dword s101, s[4:7], s2
+// CHECK: [0x42,0x19,0x20,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dword flat_scratch_lo, s[0:3], s0
-// CHECK: [0x80,0x19,0x20,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dword flat_scratch_lo, s[4:7], s2
+// CHECK: [0x82,0x19,0x20,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dword flat_scratch_hi, s[0:3], s0
-// CHECK: [0xc0,0x19,0x20,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dword flat_scratch_hi, s[4:7], s2
+// CHECK: [0xc2,0x19,0x20,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dword vcc_lo, s[0:3], s0
-// CHECK: [0x80,0x1a,0x20,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dword vcc_lo, s[4:7], s2
+// CHECK: [0x82,0x1a,0x20,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dword vcc_hi, s[0:3], s0
-// CHECK: [0xc0,0x1a,0x20,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dword vcc_hi, s[4:7], s2
+// CHECK: [0xc2,0x1a,0x20,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dword tba_lo, s[0:3], s0
-// CHECK: [0x00,0x1b,0x20,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dword tba_lo, s[4:7], s2
+// CHECK: [0x02,0x1b,0x20,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dword tba_hi, s[0:3], s0
-// CHECK: [0x40,0x1b,0x20,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dword tba_hi, s[4:7], s2
+// CHECK: [0x42,0x1b,0x20,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dword tma_lo, s[0:3], s0
-// CHECK: [0x80,0x1b,0x20,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dword tma_lo, s[4:7], s2
+// CHECK: [0x82,0x1b,0x20,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dword tma_hi, s[0:3], s0
-// CHECK: [0xc0,0x1b,0x20,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dword tma_hi, s[4:7], s2
+// CHECK: [0xc2,0x1b,0x20,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dword ttmp11, s[0:3], s0
-// CHECK: [0xc0,0x1e,0x20,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dword ttmp11, s[4:7], s2
+// CHECK: [0xc2,0x1e,0x20,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dword s0, s[4:7], s0
-// CHECK: [0x02,0x00,0x20,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dword s5, s[8:11], s2
+// CHECK: [0x44,0x01,0x20,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dword s0, s[96:99], s0
-// CHECK: [0x30,0x00,0x20,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dword s5, s[96:99], s2
+// CHECK: [0x70,0x01,0x20,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dword s0, ttmp[8:11], s0
-// CHECK: [0x3c,0x00,0x20,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dword s5, ttmp[8:11], s2
+// CHECK: [0x7c,0x01,0x20,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dword s0, s[0:3], s101
-// CHECK: [0x00,0x00,0x20,0xc0,0x65,0x00,0x00,0x00]
+s_buffer_load_dword s5, s[4:7], s101
+// CHECK: [0x42,0x01,0x20,0xc0,0x65,0x00,0x00,0x00]
 
-s_buffer_load_dword s0, s[0:3], flat_scratch_lo
-// CHECK: [0x00,0x00,0x20,0xc0,0x66,0x00,0x00,0x00]
+s_buffer_load_dword s5, s[4:7], flat_scratch_lo
+// CHECK: [0x42,0x01,0x20,0xc0,0x66,0x00,0x00,0x00]
 
-s_buffer_load_dword s0, s[0:3], flat_scratch_hi
-// CHECK: [0x00,0x00,0x20,0xc0,0x67,0x00,0x00,0x00]
+s_buffer_load_dword s5, s[4:7], flat_scratch_hi
+// CHECK: [0x42,0x01,0x20,0xc0,0x67,0x00,0x00,0x00]
 
-s_buffer_load_dword s0, s[0:3], vcc_lo
-// CHECK: [0x00,0x00,0x20,0xc0,0x6a,0x00,0x00,0x00]
+s_buffer_load_dword s5, s[4:7], vcc_lo
+// CHECK: [0x42,0x01,0x20,0xc0,0x6a,0x00,0x00,0x00]
 
-s_buffer_load_dword s0, s[0:3], vcc_hi
-// CHECK: [0x00,0x00,0x20,0xc0,0x6b,0x00,0x00,0x00]
+s_buffer_load_dword s5, s[4:7], vcc_hi
+// CHECK: [0x42,0x01,0x20,0xc0,0x6b,0x00,0x00,0x00]
 
-s_buffer_load_dword s0, s[0:3], tba_lo
-// CHECK: [0x00,0x00,0x20,0xc0,0x6c,0x00,0x00,0x00]
+s_buffer_load_dword s5, s[4:7], tba_lo
+// CHECK: [0x42,0x01,0x20,0xc0,0x6c,0x00,0x00,0x00]
 
-s_buffer_load_dword s0, s[0:3], tba_hi
-// CHECK: [0x00,0x00,0x20,0xc0,0x6d,0x00,0x00,0x00]
+s_buffer_load_dword s5, s[4:7], tba_hi
+// CHECK: [0x42,0x01,0x20,0xc0,0x6d,0x00,0x00,0x00]
 
-s_buffer_load_dword s0, s[0:3], tma_lo
-// CHECK: [0x00,0x00,0x20,0xc0,0x6e,0x00,0x00,0x00]
+s_buffer_load_dword s5, s[4:7], tma_lo
+// CHECK: [0x42,0x01,0x20,0xc0,0x6e,0x00,0x00,0x00]
 
-s_buffer_load_dword s0, s[0:3], tma_hi
-// CHECK: [0x00,0x00,0x20,0xc0,0x6f,0x00,0x00,0x00]
+s_buffer_load_dword s5, s[4:7], tma_hi
+// CHECK: [0x42,0x01,0x20,0xc0,0x6f,0x00,0x00,0x00]
 
-s_buffer_load_dword s0, s[0:3], ttmp11
-// CHECK: [0x00,0x00,0x20,0xc0,0x7b,0x00,0x00,0x00]
+s_buffer_load_dword s5, s[4:7], ttmp11
+// CHECK: [0x42,0x01,0x20,0xc0,0x7b,0x00,0x00,0x00]
 
-s_buffer_load_dword s0, s[0:3], m0
-// CHECK: [0x00,0x00,0x20,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_load_dword s5, s[4:7], m0
+// CHECK: [0x42,0x01,0x20,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_load_dword s0, s[0:3], 0x7ffff
-// CHECK: [0x00,0x00,0x22,0xc0,0xff,0xff,0x07,0x00]
+s_buffer_load_dword s5, s[4:7], 0x7ffff
+// CHECK: [0x42,0x01,0x22,0xc0,0xff,0xff,0x07,0x00]
 
-s_buffer_load_dword s0, s[0:3], s0 glc
-// CHECK: [0x00,0x00,0x21,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dword s5, s[4:7], s2 glc
+// CHECK: [0x42,0x01,0x21,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], s0
-// CHECK: [0x00,0x00,0x24,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx2 s[10:11], s[4:7], s2
+// CHECK: [0x82,0x02,0x24,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 s[2:3], s[0:3], s0
-// CHECK: [0x80,0x00,0x24,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx2 s[12:13], s[4:7], s2
+// CHECK: [0x02,0x03,0x24,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 s[100:101], s[0:3], s0
-// CHECK: [0x00,0x19,0x24,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx2 s[100:101], s[4:7], s2
+// CHECK: [0x02,0x19,0x24,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 flat_scratch, s[0:3], s0
-// CHECK: [0x80,0x19,0x24,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx2 flat_scratch, s[4:7], s2
+// CHECK: [0x82,0x19,0x24,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 vcc, s[0:3], s0
-// CHECK: [0x80,0x1a,0x24,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx2 vcc, s[4:7], s2
+// CHECK: [0x82,0x1a,0x24,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 tba, s[0:3], s0
-// CHECK: [0x00,0x1b,0x24,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx2 tba, s[4:7], s2
+// CHECK: [0x02,0x1b,0x24,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 tma, s[0:3], s0
-// CHECK: [0x80,0x1b,0x24,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx2 tma, s[4:7], s2
+// CHECK: [0x82,0x1b,0x24,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 ttmp[10:11], s[0:3], s0
-// CHECK: [0x80,0x1e,0x24,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx2 ttmp[10:11], s[4:7], s2
+// CHECK: [0x82,0x1e,0x24,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 s[0:1], s[4:7], s0
-// CHECK: [0x02,0x00,0x24,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx2 s[10:11], s[8:11], s2
+// CHECK: [0x84,0x02,0x24,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 s[0:1], s[96:99], s0
-// CHECK: [0x30,0x00,0x24,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx2 s[10:11], s[96:99], s2
+// CHECK: [0xb0,0x02,0x24,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 s[0:1], ttmp[8:11], s0
-// CHECK: [0x3c,0x00,0x24,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx2 s[10:11], ttmp[8:11], s2
+// CHECK: [0xbc,0x02,0x24,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], s101
-// CHECK: [0x00,0x00,0x24,0xc0,0x65,0x00,0x00,0x00]
+s_buffer_load_dwordx2 s[10:11], s[4:7], s101
+// CHECK: [0x82,0x02,0x24,0xc0,0x65,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], flat_scratch_lo
-// CHECK: [0x00,0x00,0x24,0xc0,0x66,0x00,0x00,0x00]
+s_buffer_load_dwordx2 s[10:11], s[4:7], flat_scratch_lo
+// CHECK: [0x82,0x02,0x24,0xc0,0x66,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], flat_scratch_hi
-// CHECK: [0x00,0x00,0x24,0xc0,0x67,0x00,0x00,0x00]
+s_buffer_load_dwordx2 s[10:11], s[4:7], flat_scratch_hi
+// CHECK: [0x82,0x02,0x24,0xc0,0x67,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], vcc_lo
-// CHECK: [0x00,0x00,0x24,0xc0,0x6a,0x00,0x00,0x00]
+s_buffer_load_dwordx2 s[10:11], s[4:7], vcc_lo
+// CHECK: [0x82,0x02,0x24,0xc0,0x6a,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], vcc_hi
-// CHECK: [0x00,0x00,0x24,0xc0,0x6b,0x00,0x00,0x00]
+s_buffer_load_dwordx2 s[10:11], s[4:7], vcc_hi
+// CHECK: [0x82,0x02,0x24,0xc0,0x6b,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], tba_lo
-// CHECK: [0x00,0x00,0x24,0xc0,0x6c,0x00,0x00,0x00]
+s_buffer_load_dwordx2 s[10:11], s[4:7], tba_lo
+// CHECK: [0x82,0x02,0x24,0xc0,0x6c,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], tba_hi
-// CHECK: [0x00,0x00,0x24,0xc0,0x6d,0x00,0x00,0x00]
+s_buffer_load_dwordx2 s[10:11], s[4:7], tba_hi
+// CHECK: [0x82,0x02,0x24,0xc0,0x6d,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], tma_lo
-// CHECK: [0x00,0x00,0x24,0xc0,0x6e,0x00,0x00,0x00]
+s_buffer_load_dwordx2 s[10:11], s[4:7], tma_lo
+// CHECK: [0x82,0x02,0x24,0xc0,0x6e,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], tma_hi
-// CHECK: [0x00,0x00,0x24,0xc0,0x6f,0x00,0x00,0x00]
+s_buffer_load_dwordx2 s[10:11], s[4:7], tma_hi
+// CHECK: [0x82,0x02,0x24,0xc0,0x6f,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], ttmp11
-// CHECK: [0x00,0x00,0x24,0xc0,0x7b,0x00,0x00,0x00]
+s_buffer_load_dwordx2 s[10:11], s[4:7], ttmp11
+// CHECK: [0x82,0x02,0x24,0xc0,0x7b,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], m0
-// CHECK: [0x00,0x00,0x24,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_load_dwordx2 s[10:11], s[4:7], m0
+// CHECK: [0x82,0x02,0x24,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], 0x7ffff
-// CHECK: [0x00,0x00,0x26,0xc0,0xff,0xff,0x07,0x00]
+s_buffer_load_dwordx2 s[10:11], s[4:7], 0x7ffff
+// CHECK: [0x82,0x02,0x26,0xc0,0xff,0xff,0x07,0x00]
 
-s_buffer_load_dwordx2 s[0:1], s[0:3], s0 glc
-// CHECK: [0x00,0x00,0x25,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx2 s[10:11], s[4:7], s2 glc
+// CHECK: [0x82,0x02,0x25,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], s0
-// CHECK: [0x00,0x00,0x28,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx4 s[20:23], s[4:7], s2
+// CHECK: [0x02,0x05,0x28,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx4 s[4:7], s[0:3], s0
-// CHECK: [0x00,0x01,0x28,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx4 s[24:27], s[4:7], s2
+// CHECK: [0x02,0x06,0x28,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx4 s[96:99], s[0:3], s0
-// CHECK: [0x00,0x18,0x28,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx4 s[96:99], s[4:7], s2
+// CHECK: [0x02,0x18,0x28,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx4 ttmp[8:11], s[0:3], s0
-// CHECK: [0x00,0x1e,0x28,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx4 ttmp[8:11], s[4:7], s2
+// CHECK: [0x02,0x1e,0x28,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx4 s[0:3], s[4:7], s0
-// CHECK: [0x02,0x00,0x28,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx4 s[20:23], s[8:11], s2
+// CHECK: [0x04,0x05,0x28,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx4 s[0:3], s[96:99], s0
-// CHECK: [0x30,0x00,0x28,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx4 s[20:23], s[96:99], s2
+// CHECK: [0x30,0x05,0x28,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx4 s[0:3], ttmp[8:11], s0
-// CHECK: [0x3c,0x00,0x28,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx4 s[20:23], ttmp[8:11], s2
+// CHECK: [0x3c,0x05,0x28,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], s101
-// CHECK: [0x00,0x00,0x28,0xc0,0x65,0x00,0x00,0x00]
+s_buffer_load_dwordx4 s[20:23], s[4:7], s101
+// CHECK: [0x02,0x05,0x28,0xc0,0x65,0x00,0x00,0x00]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], flat_scratch_lo
-// CHECK: [0x00,0x00,0x28,0xc0,0x66,0x00,0x00,0x00]
+s_buffer_load_dwordx4 s[20:23], s[4:7], flat_scratch_lo
+// CHECK: [0x02,0x05,0x28,0xc0,0x66,0x00,0x00,0x00]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], flat_scratch_hi
-// CHECK: [0x00,0x00,0x28,0xc0,0x67,0x00,0x00,0x00]
+s_buffer_load_dwordx4 s[20:23], s[4:7], flat_scratch_hi
+// CHECK: [0x02,0x05,0x28,0xc0,0x67,0x00,0x00,0x00]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], vcc_lo
-// CHECK: [0x00,0x00,0x28,0xc0,0x6a,0x00,0x00,0x00]
+s_buffer_load_dwordx4 s[20:23], s[4:7], vcc_lo
+// CHECK: [0x02,0x05,0x28,0xc0,0x6a,0x00,0x00,0x00]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], vcc_hi
-// CHECK: [0x00,0x00,0x28,0xc0,0x6b,0x00,0x00,0x00]
+s_buffer_load_dwordx4 s[20:23], s[4:7], vcc_hi
+// CHECK: [0x02,0x05,0x28,0xc0,0x6b,0x00,0x00,0x00]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], tba_lo
-// CHECK: [0x00,0x00,0x28,0xc0,0x6c,0x00,0x00,0x00]
+s_buffer_load_dwordx4 s[20:23], s[4:7], tba_lo
+// CHECK: [0x02,0x05,0x28,0xc0,0x6c,0x00,0x00,0x00]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], tba_hi
-// CHECK: [0x00,0x00,0x28,0xc0,0x6d,0x00,0x00,0x00]
+s_buffer_load_dwordx4 s[20:23], s[4:7], tba_hi
+// CHECK: [0x02,0x05,0x28,0xc0,0x6d,0x00,0x00,0x00]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], tma_lo
-// CHECK: [0x00,0x00,0x28,0xc0,0x6e,0x00,0x00,0x00]
+s_buffer_load_dwordx4 s[20:23], s[4:7], tma_lo
+// CHECK: [0x02,0x05,0x28,0xc0,0x6e,0x00,0x00,0x00]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], tma_hi
-// CHECK: [0x00,0x00,0x28,0xc0,0x6f,0x00,0x00,0x00]
+s_buffer_load_dwordx4 s[20:23], s[4:7], tma_hi
+// CHECK: [0x02,0x05,0x28,0xc0,0x6f,0x00,0x00,0x00]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], ttmp11
-// CHECK: [0x00,0x00,0x28,0xc0,0x7b,0x00,0x00,0x00]
+s_buffer_load_dwordx4 s[20:23], s[4:7], ttmp11
+// CHECK: [0x02,0x05,0x28,0xc0,0x7b,0x00,0x00,0x00]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], m0
-// CHECK: [0x00,0x00,0x28,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_load_dwordx4 s[20:23], s[4:7], m0
+// CHECK: [0x02,0x05,0x28,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], 0x7ffff
-// CHECK: [0x00,0x00,0x2a,0xc0,0xff,0xff,0x07,0x00]
+s_buffer_load_dwordx4 s[20:23], s[4:7], 0x7ffff
+// CHECK: [0x02,0x05,0x2a,0xc0,0xff,0xff,0x07,0x00]
 
-s_buffer_load_dwordx4 s[0:3], s[0:3], s0 glc
-// CHECK: [0x00,0x00,0x29,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx4 s[20:23], s[4:7], s2 glc
+// CHECK: [0x02,0x05,0x29,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], s0
-// CHECK: [0x00,0x00,0x2c,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx8 s[20:27], s[4:7], s2
+// CHECK: [0x02,0x05,0x2c,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx8 s[4:11], s[0:3], s0
-// CHECK: [0x00,0x01,0x2c,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx8 s[24:31], s[4:7], s2
+// CHECK: [0x02,0x06,0x2c,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx8 s[92:99], s[0:3], s0
-// CHECK: [0x00,0x17,0x2c,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx8 s[92:99], s[4:7], s2
+// CHECK: [0x02,0x17,0x2c,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx8 s[0:7], s[4:7], s0
-// CHECK: [0x02,0x00,0x2c,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx8 s[20:27], s[8:11], s2
+// CHECK: [0x04,0x05,0x2c,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx8 s[0:7], s[96:99], s0
-// CHECK: [0x30,0x00,0x2c,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx8 s[20:27], s[96:99], s2
+// CHECK: [0x30,0x05,0x2c,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx8 s[0:7], ttmp[8:11], s0
-// CHECK: [0x3c,0x00,0x2c,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx8 s[20:27], ttmp[8:11], s2
+// CHECK: [0x3c,0x05,0x2c,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], s101
-// CHECK: [0x00,0x00,0x2c,0xc0,0x65,0x00,0x00,0x00]
+s_buffer_load_dwordx8 s[20:27], s[4:7], s101
+// CHECK: [0x02,0x05,0x2c,0xc0,0x65,0x00,0x00,0x00]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], flat_scratch_lo
-// CHECK: [0x00,0x00,0x2c,0xc0,0x66,0x00,0x00,0x00]
+s_buffer_load_dwordx8 s[20:27], s[4:7], flat_scratch_lo
+// CHECK: [0x02,0x05,0x2c,0xc0,0x66,0x00,0x00,0x00]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], flat_scratch_hi
-// CHECK: [0x00,0x00,0x2c,0xc0,0x67,0x00,0x00,0x00]
+s_buffer_load_dwordx8 s[20:27], s[4:7], flat_scratch_hi
+// CHECK: [0x02,0x05,0x2c,0xc0,0x67,0x00,0x00,0x00]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], vcc_lo
-// CHECK: [0x00,0x00,0x2c,0xc0,0x6a,0x00,0x00,0x00]
+s_buffer_load_dwordx8 s[20:27], s[4:7], vcc_lo
+// CHECK: [0x02,0x05,0x2c,0xc0,0x6a,0x00,0x00,0x00]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], vcc_hi
-// CHECK: [0x00,0x00,0x2c,0xc0,0x6b,0x00,0x00,0x00]
+s_buffer_load_dwordx8 s[20:27], s[4:7], vcc_hi
+// CHECK: [0x02,0x05,0x2c,0xc0,0x6b,0x00,0x00,0x00]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], tba_lo
-// CHECK: [0x00,0x00,0x2c,0xc0,0x6c,0x00,0x00,0x00]
+s_buffer_load_dwordx8 s[20:27], s[4:7], tba_lo
+// CHECK: [0x02,0x05,0x2c,0xc0,0x6c,0x00,0x00,0x00]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], tba_hi
-// CHECK: [0x00,0x00,0x2c,0xc0,0x6d,0x00,0x00,0x00]
+s_buffer_load_dwordx8 s[20:27], s[4:7], tba_hi
+// CHECK: [0x02,0x05,0x2c,0xc0,0x6d,0x00,0x00,0x00]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], tma_lo
-// CHECK: [0x00,0x00,0x2c,0xc0,0x6e,0x00,0x00,0x00]
+s_buffer_load_dwordx8 s[20:27], s[4:7], tma_lo
+// CHECK: [0x02,0x05,0x2c,0xc0,0x6e,0x00,0x00,0x00]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], tma_hi
-// CHECK: [0x00,0x00,0x2c,0xc0,0x6f,0x00,0x00,0x00]
+s_buffer_load_dwordx8 s[20:27], s[4:7], tma_hi
+// CHECK: [0x02,0x05,0x2c,0xc0,0x6f,0x00,0x00,0x00]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], ttmp11
-// CHECK: [0x00,0x00,0x2c,0xc0,0x7b,0x00,0x00,0x00]
+s_buffer_load_dwordx8 s[20:27], s[4:7], ttmp11
+// CHECK: [0x02,0x05,0x2c,0xc0,0x7b,0x00,0x00,0x00]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], m0
-// CHECK: [0x00,0x00,0x2c,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_load_dwordx8 s[20:27], s[4:7], m0
+// CHECK: [0x02,0x05,0x2c,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], 0x7ffff
-// CHECK: [0x00,0x00,0x2e,0xc0,0xff,0xff,0x07,0x00]
+s_buffer_load_dwordx8 s[20:27], s[4:7], 0x7ffff
+// CHECK: [0x02,0x05,0x2e,0xc0,0xff,0xff,0x07,0x00]
 
-s_buffer_load_dwordx8 s[0:7], s[0:3], s0 glc
-// CHECK: [0x00,0x00,0x2d,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx8 s[20:27], s[4:7], s2 glc
+// CHECK: [0x02,0x05,0x2d,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], s0
-// CHECK: [0x00,0x00,0x30,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx16 s[20:35], s[4:7], s2
+// CHECK: [0x02,0x05,0x30,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx16 s[4:19], s[0:3], s0
-// CHECK: [0x00,0x01,0x30,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx16 s[24:39], s[4:7], s2
+// CHECK: [0x02,0x06,0x30,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx16 s[84:99], s[0:3], s0
-// CHECK: [0x00,0x15,0x30,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx16 s[84:99], s[4:7], s2
+// CHECK: [0x02,0x15,0x30,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx16 s[0:15], s[4:7], s0
-// CHECK: [0x02,0x00,0x30,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx16 s[20:35], s[8:11], s2
+// CHECK: [0x04,0x05,0x30,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx16 s[0:15], s[96:99], s0
-// CHECK: [0x30,0x00,0x30,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx16 s[20:35], s[96:99], s2
+// CHECK: [0x30,0x05,0x30,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx16 s[0:15], ttmp[8:11], s0
-// CHECK: [0x3c,0x00,0x30,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx16 s[20:35], ttmp[8:11], s2
+// CHECK: [0x3c,0x05,0x30,0xc0,0x02,0x00,0x00,0x00]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], s101
-// CHECK: [0x00,0x00,0x30,0xc0,0x65,0x00,0x00,0x00]
+s_buffer_load_dwordx16 s[20:35], s[4:7], s101
+// CHECK: [0x02,0x05,0x30,0xc0,0x65,0x00,0x00,0x00]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], flat_scratch_lo
-// CHECK: [0x00,0x00,0x30,0xc0,0x66,0x00,0x00,0x00]
+s_buffer_load_dwordx16 s[20:35], s[4:7], flat_scratch_lo
+// CHECK: [0x02,0x05,0x30,0xc0,0x66,0x00,0x00,0x00]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], flat_scratch_hi
-// CHECK: [0x00,0x00,0x30,0xc0,0x67,0x00,0x00,0x00]
+s_buffer_load_dwordx16 s[20:35], s[4:7], flat_scratch_hi
+// CHECK: [0x02,0x05,0x30,0xc0,0x67,0x00,0x00,0x00]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], vcc_lo
-// CHECK: [0x00,0x00,0x30,0xc0,0x6a,0x00,0x00,0x00]
+s_buffer_load_dwordx16 s[20:35], s[4:7], vcc_lo
+// CHECK: [0x02,0x05,0x30,0xc0,0x6a,0x00,0x00,0x00]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], vcc_hi
-// CHECK: [0x00,0x00,0x30,0xc0,0x6b,0x00,0x00,0x00]
+s_buffer_load_dwordx16 s[20:35], s[4:7], vcc_hi
+// CHECK: [0x02,0x05,0x30,0xc0,0x6b,0x00,0x00,0x00]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], tba_lo
-// CHECK: [0x00,0x00,0x30,0xc0,0x6c,0x00,0x00,0x00]
+s_buffer_load_dwordx16 s[20:35], s[4:7], tba_lo
+// CHECK: [0x02,0x05,0x30,0xc0,0x6c,0x00,0x00,0x00]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], tba_hi
-// CHECK: [0x00,0x00,0x30,0xc0,0x6d,0x00,0x00,0x00]
+s_buffer_load_dwordx16 s[20:35], s[4:7], tba_hi
+// CHECK: [0x02,0x05,0x30,0xc0,0x6d,0x00,0x00,0x00]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], tma_lo
-// CHECK: [0x00,0x00,0x30,0xc0,0x6e,0x00,0x00,0x00]
+s_buffer_load_dwordx16 s[20:35], s[4:7], tma_lo
+// CHECK: [0x02,0x05,0x30,0xc0,0x6e,0x00,0x00,0x00]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], tma_hi
-// CHECK: [0x00,0x00,0x30,0xc0,0x6f,0x00,0x00,0x00]
+s_buffer_load_dwordx16 s[20:35], s[4:7], tma_hi
+// CHECK: [0x02,0x05,0x30,0xc0,0x6f,0x00,0x00,0x00]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], ttmp11
-// CHECK: [0x00,0x00,0x30,0xc0,0x7b,0x00,0x00,0x00]
+s_buffer_load_dwordx16 s[20:35], s[4:7], ttmp11
+// CHECK: [0x02,0x05,0x30,0xc0,0x7b,0x00,0x00,0x00]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], m0
-// CHECK: [0x00,0x00,0x30,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_load_dwordx16 s[20:35], s[4:7], m0
+// CHECK: [0x02,0x05,0x30,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], 0x7ffff
-// CHECK: [0x00,0x00,0x32,0xc0,0xff,0xff,0x07,0x00]
+s_buffer_load_dwordx16 s[20:35], s[4:7], 0x7ffff
+// CHECK: [0x02,0x05,0x32,0xc0,0xff,0xff,0x07,0x00]
 
-s_buffer_load_dwordx16 s[0:15], s[0:3], s0 glc
-// CHECK: [0x00,0x00,0x31,0xc0,0x00,0x00,0x00,0x00]
+s_buffer_load_dwordx16 s[20:35], s[4:7], s2 glc
+// CHECK: [0x02,0x05,0x31,0xc0,0x02,0x00,0x00,0x00]
 
-s_store_dword s0, s[0:1], m0
-// CHECK: [0x00,0x00,0x40,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dword s1, s[4:5], m0
+// CHECK: [0x42,0x00,0x40,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dword s101, s[0:1], m0
-// CHECK: [0x40,0x19,0x40,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dword s101, s[4:5], m0
+// CHECK: [0x42,0x19,0x40,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dword flat_scratch_lo, s[0:1], m0
-// CHECK: [0x80,0x19,0x40,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dword flat_scratch_lo, s[4:5], m0
+// CHECK: [0x82,0x19,0x40,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dword flat_scratch_hi, s[0:1], m0
-// CHECK: [0xc0,0x19,0x40,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dword flat_scratch_hi, s[4:5], m0
+// CHECK: [0xc2,0x19,0x40,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dword vcc_lo, s[0:1], m0
-// CHECK: [0x80,0x1a,0x40,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dword vcc_lo, s[4:5], m0
+// CHECK: [0x82,0x1a,0x40,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dword vcc_hi, s[0:1], m0
-// CHECK: [0xc0,0x1a,0x40,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dword vcc_hi, s[4:5], m0
+// CHECK: [0xc2,0x1a,0x40,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dword tba_lo, s[0:1], m0
-// CHECK: [0x00,0x1b,0x40,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dword tba_lo, s[4:5], m0
+// CHECK: [0x02,0x1b,0x40,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dword tba_hi, s[0:1], m0
-// CHECK: [0x40,0x1b,0x40,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dword tba_hi, s[4:5], m0
+// CHECK: [0x42,0x1b,0x40,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dword tma_lo, s[0:1], m0
-// CHECK: [0x80,0x1b,0x40,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dword tma_lo, s[4:5], m0
+// CHECK: [0x82,0x1b,0x40,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dword tma_hi, s[0:1], m0
-// CHECK: [0xc0,0x1b,0x40,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dword tma_hi, s[4:5], m0
+// CHECK: [0xc2,0x1b,0x40,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dword ttmp11, s[0:1], m0
-// CHECK: [0xc0,0x1e,0x40,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dword ttmp11, s[4:5], m0
+// CHECK: [0xc2,0x1e,0x40,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dword s0, s[2:3], m0
-// CHECK: [0x01,0x00,0x40,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dword s1, s[6:7], m0
+// CHECK: [0x43,0x00,0x40,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dword s0, s[100:101], m0
-// CHECK: [0x32,0x00,0x40,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dword s1, s[100:101], m0
+// CHECK: [0x72,0x00,0x40,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dword s0, flat_scratch, m0
-// CHECK: [0x33,0x00,0x40,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dword s1, flat_scratch, m0
+// CHECK: [0x73,0x00,0x40,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dword s0, vcc, m0
-// CHECK: [0x35,0x00,0x40,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dword s1, vcc, m0
+// CHECK: [0x75,0x00,0x40,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dword s0, tba, m0
-// CHECK: [0x36,0x00,0x40,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dword s1, tba, m0
+// CHECK: [0x76,0x00,0x40,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dword s0, tma, m0
-// CHECK: [0x37,0x00,0x40,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dword s1, tma, m0
+// CHECK: [0x77,0x00,0x40,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dword s0, ttmp[10:11], m0
-// CHECK: [0x3d,0x00,0x40,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dword s1, ttmp[10:11], m0
+// CHECK: [0x7d,0x00,0x40,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dword s0, s[0:1], 0x7ffff
-// CHECK: [0x00,0x00,0x42,0xc0,0xff,0xff,0x07,0x00]
+s_store_dword s1, s[4:5], 0x7ffff
+// CHECK: [0x42,0x00,0x42,0xc0,0xff,0xff,0x07,0x00]
 
-s_store_dword s0, s[0:1], m0 glc
-// CHECK: [0x00,0x00,0x41,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dword s1, s[4:5], m0 glc
+// CHECK: [0x42,0x00,0x41,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx2 s[0:1], s[0:1], m0
-// CHECK: [0x00,0x00,0x44,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx2 s[2:3], s[4:5], m0
+// CHECK: [0x82,0x00,0x44,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx2 s[2:3], s[0:1], m0
-// CHECK: [0x80,0x00,0x44,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx2 s[4:5], s[4:5], m0
+// CHECK: [0x02,0x01,0x44,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx2 s[100:101], s[0:1], m0
-// CHECK: [0x00,0x19,0x44,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx2 s[100:101], s[4:5], m0
+// CHECK: [0x02,0x19,0x44,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx2 flat_scratch, s[0:1], m0
-// CHECK: [0x80,0x19,0x44,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx2 flat_scratch, s[4:5], m0
+// CHECK: [0x82,0x19,0x44,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx2 vcc, s[0:1], m0
-// CHECK: [0x80,0x1a,0x44,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx2 vcc, s[4:5], m0
+// CHECK: [0x82,0x1a,0x44,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx2 tba, s[0:1], m0
-// CHECK: [0x00,0x1b,0x44,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx2 tba, s[4:5], m0
+// CHECK: [0x02,0x1b,0x44,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx2 tma, s[0:1], m0
-// CHECK: [0x80,0x1b,0x44,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx2 tma, s[4:5], m0
+// CHECK: [0x82,0x1b,0x44,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx2 ttmp[10:11], s[0:1], m0
-// CHECK: [0x80,0x1e,0x44,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx2 ttmp[10:11], s[4:5], m0
+// CHECK: [0x82,0x1e,0x44,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx2 s[0:1], s[2:3], m0
-// CHECK: [0x01,0x00,0x44,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx2 s[2:3], s[6:7], m0
+// CHECK: [0x83,0x00,0x44,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx2 s[0:1], s[100:101], m0
-// CHECK: [0x32,0x00,0x44,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx2 s[2:3], s[100:101], m0
+// CHECK: [0xb2,0x00,0x44,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx2 s[0:1], flat_scratch, m0
-// CHECK: [0x33,0x00,0x44,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx2 s[2:3], flat_scratch, m0
+// CHECK: [0xb3,0x00,0x44,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx2 s[0:1], vcc, m0
-// CHECK: [0x35,0x00,0x44,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx2 s[2:3], vcc, m0
+// CHECK: [0xb5,0x00,0x44,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx2 s[0:1], tba, m0
-// CHECK: [0x36,0x00,0x44,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx2 s[2:3], tba, m0
+// CHECK: [0xb6,0x00,0x44,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx2 s[0:1], tma, m0
-// CHECK: [0x37,0x00,0x44,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx2 s[2:3], tma, m0
+// CHECK: [0xb7,0x00,0x44,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx2 s[0:1], ttmp[10:11], m0
-// CHECK: [0x3d,0x00,0x44,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx2 s[2:3], ttmp[10:11], m0
+// CHECK: [0xbd,0x00,0x44,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx2 s[0:1], s[0:1], 0x7ffff
-// CHECK: [0x00,0x00,0x46,0xc0,0xff,0xff,0x07,0x00]
+s_store_dwordx2 s[2:3], s[4:5], 0x7ffff
+// CHECK: [0x82,0x00,0x46,0xc0,0xff,0xff,0x07,0x00]
 
-s_store_dwordx2 s[0:1], s[0:1], m0 glc
-// CHECK: [0x00,0x00,0x45,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx2 s[2:3], s[4:5], m0 glc
+// CHECK: [0x82,0x00,0x45,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx4 s[0:3], s[0:1], m0
-// CHECK: [0x00,0x00,0x48,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx4 s[4:7], s[4:5], m0
+// CHECK: [0x02,0x01,0x48,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx4 s[4:7], s[0:1], m0
-// CHECK: [0x00,0x01,0x48,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx4 s[8:11], s[4:5], m0
+// CHECK: [0x02,0x02,0x48,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx4 s[96:99], s[0:1], m0
-// CHECK: [0x00,0x18,0x48,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx4 s[96:99], s[4:5], m0
+// CHECK: [0x02,0x18,0x48,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx4 ttmp[8:11], s[0:1], m0
-// CHECK: [0x00,0x1e,0x48,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx4 ttmp[8:11], s[4:5], m0
+// CHECK: [0x02,0x1e,0x48,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx4 s[0:3], s[2:3], m0
-// CHECK: [0x01,0x00,0x48,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx4 s[4:7], s[6:7], m0
+// CHECK: [0x03,0x01,0x48,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx4 s[0:3], s[100:101], m0
-// CHECK: [0x32,0x00,0x48,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx4 s[4:7], s[100:101], m0
+// CHECK: [0x32,0x01,0x48,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx4 s[0:3], flat_scratch, m0
-// CHECK: [0x33,0x00,0x48,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx4 s[4:7], flat_scratch, m0
+// CHECK: [0x33,0x01,0x48,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx4 s[0:3], vcc, m0
-// CHECK: [0x35,0x00,0x48,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx4 s[4:7], vcc, m0
+// CHECK: [0x35,0x01,0x48,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx4 s[0:3], tba, m0
-// CHECK: [0x36,0x00,0x48,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx4 s[4:7], tba, m0
+// CHECK: [0x36,0x01,0x48,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx4 s[0:3], tma, m0
-// CHECK: [0x37,0x00,0x48,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx4 s[4:7], tma, m0
+// CHECK: [0x37,0x01,0x48,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx4 s[0:3], ttmp[10:11], m0
-// CHECK: [0x3d,0x00,0x48,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx4 s[4:7], ttmp[10:11], m0
+// CHECK: [0x3d,0x01,0x48,0xc0,0x7c,0x00,0x00,0x00]
 
-s_store_dwordx4 s[0:3], s[0:1], 0x7ffff
-// CHECK: [0x00,0x00,0x4a,0xc0,0xff,0xff,0x07,0x00]
+s_store_dwordx4 s[4:7], s[4:5], 0x7ffff
+// CHECK: [0x02,0x01,0x4a,0xc0,0xff,0xff,0x07,0x00]
 
-s_store_dwordx4 s[0:3], s[0:1], m0 glc
-// CHECK: [0x00,0x00,0x49,0xc0,0x7c,0x00,0x00,0x00]
+s_store_dwordx4 s[4:7], s[4:5], m0 glc
+// CHECK: [0x02,0x01,0x49,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dword s0, s[0:3], m0
-// CHECK: [0x00,0x00,0x60,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dword s1, s[8:11], m0
+// CHECK: [0x44,0x00,0x60,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dword s101, s[0:3], m0
-// CHECK: [0x40,0x19,0x60,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dword s101, s[8:11], m0
+// CHECK: [0x44,0x19,0x60,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dword flat_scratch_lo, s[0:3], m0
-// CHECK: [0x80,0x19,0x60,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dword flat_scratch_lo, s[8:11], m0
+// CHECK: [0x84,0x19,0x60,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dword flat_scratch_hi, s[0:3], m0
-// CHECK: [0xc0,0x19,0x60,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dword flat_scratch_hi, s[8:11], m0
+// CHECK: [0xc4,0x19,0x60,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dword vcc_lo, s[0:3], m0
-// CHECK: [0x80,0x1a,0x60,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dword vcc_lo, s[8:11], m0
+// CHECK: [0x84,0x1a,0x60,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dword vcc_hi, s[0:3], m0
-// CHECK: [0xc0,0x1a,0x60,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dword vcc_hi, s[8:11], m0
+// CHECK: [0xc4,0x1a,0x60,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dword tba_lo, s[0:3], m0
-// CHECK: [0x00,0x1b,0x60,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dword tba_lo, s[8:11], m0
+// CHECK: [0x04,0x1b,0x60,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dword tba_hi, s[0:3], m0
-// CHECK: [0x40,0x1b,0x60,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dword tba_hi, s[8:11], m0
+// CHECK: [0x44,0x1b,0x60,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dword tma_lo, s[0:3], m0
-// CHECK: [0x80,0x1b,0x60,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dword tma_lo, s[8:11], m0
+// CHECK: [0x84,0x1b,0x60,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dword tma_hi, s[0:3], m0
-// CHECK: [0xc0,0x1b,0x60,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dword tma_hi, s[8:11], m0
+// CHECK: [0xc4,0x1b,0x60,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dword ttmp11, s[0:3], m0
-// CHECK: [0xc0,0x1e,0x60,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dword ttmp11, s[8:11], m0
+// CHECK: [0xc4,0x1e,0x60,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dword s0, s[4:7], m0
-// CHECK: [0x02,0x00,0x60,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dword s1, s[12:15], m0
+// CHECK: [0x46,0x00,0x60,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dword s0, s[96:99], m0
-// CHECK: [0x30,0x00,0x60,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dword s1, s[96:99], m0
+// CHECK: [0x70,0x00,0x60,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dword s0, ttmp[8:11], m0
-// CHECK: [0x3c,0x00,0x60,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dword s1, ttmp[8:11], m0
+// CHECK: [0x7c,0x00,0x60,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dword s0, s[0:3], 0x7ffff
-// CHECK: [0x00,0x00,0x62,0xc0,0xff,0xff,0x07,0x00]
+s_buffer_store_dword s1, s[8:11], 0x7ffff
+// CHECK: [0x44,0x00,0x62,0xc0,0xff,0xff,0x07,0x00]
 
-s_buffer_store_dword s0, s[0:3], m0 glc
-// CHECK: [0x00,0x00,0x61,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dword s1, s[8:11], m0 glc
+// CHECK: [0x44,0x00,0x61,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx2 s[0:1], s[0:3], m0
-// CHECK: [0x00,0x00,0x64,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx2 s[2:3], s[8:11], m0
+// CHECK: [0x84,0x00,0x64,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx2 s[2:3], s[0:3], m0
-// CHECK: [0x80,0x00,0x64,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx2 s[4:5], s[8:11], m0
+// CHECK: [0x04,0x01,0x64,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx2 s[100:101], s[0:3], m0
-// CHECK: [0x00,0x19,0x64,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx2 s[100:101], s[8:11], m0
+// CHECK: [0x04,0x19,0x64,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx2 flat_scratch, s[0:3], m0
-// CHECK: [0x80,0x19,0x64,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx2 flat_scratch, s[8:11], m0
+// CHECK: [0x84,0x19,0x64,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx2 vcc, s[0:3], m0
-// CHECK: [0x80,0x1a,0x64,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx2 vcc, s[8:11], m0
+// CHECK: [0x84,0x1a,0x64,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx2 tba, s[0:3], m0
-// CHECK: [0x00,0x1b,0x64,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx2 tba, s[8:11], m0
+// CHECK: [0x04,0x1b,0x64,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx2 tma, s[0:3], m0
-// CHECK: [0x80,0x1b,0x64,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx2 tma, s[8:11], m0
+// CHECK: [0x84,0x1b,0x64,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx2 ttmp[10:11], s[0:3], m0
-// CHECK: [0x80,0x1e,0x64,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx2 ttmp[10:11], s[8:11], m0
+// CHECK: [0x84,0x1e,0x64,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx2 s[0:1], s[4:7], m0
-// CHECK: [0x02,0x00,0x64,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx2 s[2:3], s[12:15], m0
+// CHECK: [0x86,0x00,0x64,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx2 s[0:1], s[96:99], m0
-// CHECK: [0x30,0x00,0x64,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx2 s[2:3], s[96:99], m0
+// CHECK: [0xb0,0x00,0x64,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx2 s[0:1], ttmp[8:11], m0
-// CHECK: [0x3c,0x00,0x64,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx2 s[2:3], ttmp[8:11], m0
+// CHECK: [0xbc,0x00,0x64,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx2 s[0:1], s[0:3], 0x7ffff
-// CHECK: [0x00,0x00,0x66,0xc0,0xff,0xff,0x07,0x00]
+s_buffer_store_dwordx2 s[2:3], s[8:11], 0x7ffff
+// CHECK: [0x84,0x00,0x66,0xc0,0xff,0xff,0x07,0x00]
 
-s_buffer_store_dwordx2 s[0:1], s[0:3], m0 glc
-// CHECK: [0x00,0x00,0x65,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx2 s[2:3], s[8:11], m0 glc
+// CHECK: [0x84,0x00,0x65,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx4 s[0:3], s[0:3], m0
-// CHECK: [0x00,0x00,0x68,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx4 s[4:7], s[8:11], m0
+// CHECK: [0x04,0x01,0x68,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx4 s[4:7], s[0:3], m0
-// CHECK: [0x00,0x01,0x68,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx4 s[8:11], s[8:11], m0
+// CHECK: [0x04,0x02,0x68,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx4 s[96:99], s[0:3], m0
-// CHECK: [0x00,0x18,0x68,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx4 s[96:99], s[8:11], m0
+// CHECK: [0x04,0x18,0x68,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx4 ttmp[8:11], s[0:3], m0
-// CHECK: [0x00,0x1e,0x68,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx4 ttmp[8:11], s[8:11], m0
+// CHECK: [0x04,0x1e,0x68,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx4 s[0:3], s[4:7], m0
-// CHECK: [0x02,0x00,0x68,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx4 s[4:7], s[12:15], m0
+// CHECK: [0x06,0x01,0x68,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx4 s[0:3], s[96:99], m0
-// CHECK: [0x30,0x00,0x68,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx4 s[4:7], s[96:99], m0
+// CHECK: [0x30,0x01,0x68,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx4 s[0:3], ttmp[8:11], m0
-// CHECK: [0x3c,0x00,0x68,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx4 s[4:7], ttmp[8:11], m0
+// CHECK: [0x3c,0x01,0x68,0xc0,0x7c,0x00,0x00,0x00]
 
-s_buffer_store_dwordx4 s[0:3], s[0:3], 0x7ffff
-// CHECK: [0x00,0x00,0x6a,0xc0,0xff,0xff,0x07,0x00]
+s_buffer_store_dwordx4 s[4:7], s[8:11], 0x7ffff
+// CHECK: [0x04,0x01,0x6a,0xc0,0xff,0xff,0x07,0x00]
 
-s_buffer_store_dwordx4 s[0:3], s[0:3], m0 glc
-// CHECK: [0x00,0x00,0x69,0xc0,0x7c,0x00,0x00,0x00]
+s_buffer_store_dwordx4 s[4:7], s[8:11], m0 glc
+// CHECK: [0x04,0x01,0x69,0xc0,0x7c,0x00,0x00,0x00]
 
 s_dcache_inv
 // CHECK: [0x00,0x00,0x80,0xc0,0x00,0x00,0x00,0x00]
@@ -9975,11 +10136,11 @@ s_dcache_inv_vol
 s_dcache_wb_vol
 // CHECK: [0x00,0x00,0x8c,0xc0,0x00,0x00,0x00,0x00]
 
-s_memtime s[0:1]
-// CHECK: [0x00,0x00,0x90,0xc0,0x00,0x00,0x00,0x00]
+s_memtime s[10:11]
+// CHECK: [0x80,0x02,0x90,0xc0,0x00,0x00,0x00,0x00]
 
-s_memtime s[2:3]
-// CHECK: [0x80,0x00,0x90,0xc0,0x00,0x00,0x00,0x00]
+s_memtime s[12:13]
+// CHECK: [0x00,0x03,0x90,0xc0,0x00,0x00,0x00,0x00]
 
 s_memtime s[100:101]
 // CHECK: [0x00,0x19,0x90,0xc0,0x00,0x00,0x00,0x00]
@@ -9999,11 +10160,11 @@ s_memtime tma
 s_memtime ttmp[10:11]
 // CHECK: [0x80,0x1e,0x90,0xc0,0x00,0x00,0x00,0x00]
 
-s_memrealtime s[0:1]
-// CHECK: [0x00,0x00,0x94,0xc0,0x00,0x00,0x00,0x00]
+s_memrealtime s[10:11]
+// CHECK: [0x80,0x02,0x94,0xc0,0x00,0x00,0x00,0x00]
 
-s_memrealtime s[2:3]
-// CHECK: [0x80,0x00,0x94,0xc0,0x00,0x00,0x00,0x00]
+s_memrealtime s[12:13]
+// CHECK: [0x00,0x03,0x94,0xc0,0x00,0x00,0x00,0x00]
 
 s_memrealtime s[100:101]
 // CHECK: [0x00,0x19,0x94,0xc0,0x00,0x00,0x00,0x00]
@@ -10023,2513 +10184,2513 @@ s_memrealtime tma
 s_memrealtime ttmp[10:11]
 // CHECK: [0x80,0x1e,0x94,0xc0,0x00,0x00,0x00,0x00]
 
-s_mov_b32 s0, s0
-// CHECK: [0x00,0x00,0x80,0xbe]
+s_mov_b32 s5, s1
+// CHECK: [0x01,0x00,0x85,0xbe]
 
-s_mov_b32 s101, s0
-// CHECK: [0x00,0x00,0xe5,0xbe]
+s_mov_b32 s101, s1
+// CHECK: [0x01,0x00,0xe5,0xbe]
 
-s_mov_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x00,0xe6,0xbe]
+s_mov_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x00,0xe6,0xbe]
 
-s_mov_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x00,0xe7,0xbe]
+s_mov_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x00,0xe7,0xbe]
 
-s_mov_b32 vcc_lo, s0
-// CHECK: [0x00,0x00,0xea,0xbe]
+s_mov_b32 vcc_lo, s1
+// CHECK: [0x01,0x00,0xea,0xbe]
 
-s_mov_b32 vcc_hi, s0
-// CHECK: [0x00,0x00,0xeb,0xbe]
+s_mov_b32 vcc_hi, s1
+// CHECK: [0x01,0x00,0xeb,0xbe]
 
-s_mov_b32 tba_lo, s0
-// CHECK: [0x00,0x00,0xec,0xbe]
+s_mov_b32 tba_lo, s1
+// CHECK: [0x01,0x00,0xec,0xbe]
 
-s_mov_b32 tba_hi, s0
-// CHECK: [0x00,0x00,0xed,0xbe]
+s_mov_b32 tba_hi, s1
+// CHECK: [0x01,0x00,0xed,0xbe]
 
-s_mov_b32 tma_lo, s0
-// CHECK: [0x00,0x00,0xee,0xbe]
+s_mov_b32 tma_lo, s1
+// CHECK: [0x01,0x00,0xee,0xbe]
 
-s_mov_b32 tma_hi, s0
-// CHECK: [0x00,0x00,0xef,0xbe]
+s_mov_b32 tma_hi, s1
+// CHECK: [0x01,0x00,0xef,0xbe]
 
-s_mov_b32 ttmp11, s0
-// CHECK: [0x00,0x00,0xfb,0xbe]
+s_mov_b32 ttmp11, s1
+// CHECK: [0x01,0x00,0xfb,0xbe]
 
-s_mov_b32 m0, s0
-// CHECK: [0x00,0x00,0xfc,0xbe]
+s_mov_b32 m0, s1
+// CHECK: [0x01,0x00,0xfc,0xbe]
 
-s_mov_b32 exec_lo, s0
-// CHECK: [0x00,0x00,0xfe,0xbe]
+s_mov_b32 exec_lo, s1
+// CHECK: [0x01,0x00,0xfe,0xbe]
 
-s_mov_b32 exec_hi, s0
-// CHECK: [0x00,0x00,0xff,0xbe]
+s_mov_b32 exec_hi, s1
+// CHECK: [0x01,0x00,0xff,0xbe]
 
-s_mov_b32 s0, s101
-// CHECK: [0x65,0x00,0x80,0xbe]
+s_mov_b32 s5, s101
+// CHECK: [0x65,0x00,0x85,0xbe]
 
-s_mov_b32 s0, flat_scratch_lo
-// CHECK: [0x66,0x00,0x80,0xbe]
+s_mov_b32 s5, flat_scratch_lo
+// CHECK: [0x66,0x00,0x85,0xbe]
 
-s_mov_b32 s0, flat_scratch_hi
-// CHECK: [0x67,0x00,0x80,0xbe]
+s_mov_b32 s5, flat_scratch_hi
+// CHECK: [0x67,0x00,0x85,0xbe]
 
-s_mov_b32 s0, vcc_lo
-// CHECK: [0x6a,0x00,0x80,0xbe]
+s_mov_b32 s5, vcc_lo
+// CHECK: [0x6a,0x00,0x85,0xbe]
 
-s_mov_b32 s0, vcc_hi
-// CHECK: [0x6b,0x00,0x80,0xbe]
+s_mov_b32 s5, vcc_hi
+// CHECK: [0x6b,0x00,0x85,0xbe]
 
-s_mov_b32 s0, tba_lo
-// CHECK: [0x6c,0x00,0x80,0xbe]
+s_mov_b32 s5, tba_lo
+// CHECK: [0x6c,0x00,0x85,0xbe]
 
-s_mov_b32 s0, tba_hi
-// CHECK: [0x6d,0x00,0x80,0xbe]
+s_mov_b32 s5, tba_hi
+// CHECK: [0x6d,0x00,0x85,0xbe]
 
-s_mov_b32 s0, tma_lo
-// CHECK: [0x6e,0x00,0x80,0xbe]
+s_mov_b32 s5, tma_lo
+// CHECK: [0x6e,0x00,0x85,0xbe]
 
-s_mov_b32 s0, tma_hi
-// CHECK: [0x6f,0x00,0x80,0xbe]
+s_mov_b32 s5, tma_hi
+// CHECK: [0x6f,0x00,0x85,0xbe]
 
-s_mov_b32 s0, ttmp11
-// CHECK: [0x7b,0x00,0x80,0xbe]
+s_mov_b32 s5, ttmp11
+// CHECK: [0x7b,0x00,0x85,0xbe]
 
-s_mov_b32 s0, m0
-// CHECK: [0x7c,0x00,0x80,0xbe]
+s_mov_b32 s5, m0
+// CHECK: [0x7c,0x00,0x85,0xbe]
 
-s_mov_b32 s0, exec_lo
-// CHECK: [0x7e,0x00,0x80,0xbe]
+s_mov_b32 s5, exec_lo
+// CHECK: [0x7e,0x00,0x85,0xbe]
 
-s_mov_b32 s0, exec_hi
-// CHECK: [0x7f,0x00,0x80,0xbe]
+s_mov_b32 s5, exec_hi
+// CHECK: [0x7f,0x00,0x85,0xbe]
 
-s_mov_b32 s0, 0
-// CHECK: [0x80,0x00,0x80,0xbe]
+s_mov_b32 s5, 0
+// CHECK: [0x80,0x00,0x85,0xbe]
 
-s_mov_b32 s0, -1
-// CHECK: [0xc1,0x00,0x80,0xbe]
+s_mov_b32 s5, -1
+// CHECK: [0xc1,0x00,0x85,0xbe]
 
-s_mov_b32 s0, 0.5
-// CHECK: [0xf0,0x00,0x80,0xbe]
+s_mov_b32 s5, 0.5
+// CHECK: [0xf0,0x00,0x85,0xbe]
 
-s_mov_b32 s0, -4.0
-// CHECK: [0xf7,0x00,0x80,0xbe]
+s_mov_b32 s5, -4.0
+// CHECK: [0xf7,0x00,0x85,0xbe]
 
-s_mov_b32 s0, 0xaf123456
-// CHECK: [0xff,0x00,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_mov_b32 s5, 0xaf123456
+// CHECK: [0xff,0x00,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_mov_b32 s0, 0x3f717273
-// CHECK: [0xff,0x00,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_mov_b32 s5, 0x3f717273
+// CHECK: [0xff,0x00,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_mov_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x01,0x80,0xbe]
+s_mov_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x01,0x8a,0xbe]
 
-s_mov_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x01,0x82,0xbe]
+s_mov_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x01,0x8c,0xbe]
 
-s_mov_b64 s[100:101], s[0:1]
-// CHECK: [0x00,0x01,0xe4,0xbe]
+s_mov_b64 s[100:101], s[2:3]
+// CHECK: [0x02,0x01,0xe4,0xbe]
 
-s_mov_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x01,0xe6,0xbe]
+s_mov_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x01,0xe6,0xbe]
 
-s_mov_b64 vcc, s[0:1]
-// CHECK: [0x00,0x01,0xea,0xbe]
+s_mov_b64 vcc, s[2:3]
+// CHECK: [0x02,0x01,0xea,0xbe]
 
-s_mov_b64 tba, s[0:1]
-// CHECK: [0x00,0x01,0xec,0xbe]
+s_mov_b64 tba, s[2:3]
+// CHECK: [0x02,0x01,0xec,0xbe]
 
-s_mov_b64 tma, s[0:1]
-// CHECK: [0x00,0x01,0xee,0xbe]
+s_mov_b64 tma, s[2:3]
+// CHECK: [0x02,0x01,0xee,0xbe]
 
-s_mov_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x01,0xfa,0xbe]
+s_mov_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x01,0xfa,0xbe]
 
-s_mov_b64 exec, s[0:1]
-// CHECK: [0x00,0x01,0xfe,0xbe]
+s_mov_b64 exec, s[2:3]
+// CHECK: [0x02,0x01,0xfe,0xbe]
 
-s_mov_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x01,0x80,0xbe]
+s_mov_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x01,0x8a,0xbe]
 
-s_mov_b64 s[0:1], s[100:101]
-// CHECK: [0x64,0x01,0x80,0xbe]
+s_mov_b64 s[10:11], s[100:101]
+// CHECK: [0x64,0x01,0x8a,0xbe]
 
-s_mov_b64 s[0:1], flat_scratch
-// CHECK: [0x66,0x01,0x80,0xbe]
+s_mov_b64 s[10:11], flat_scratch
+// CHECK: [0x66,0x01,0x8a,0xbe]
 
-s_mov_b64 s[0:1], vcc
-// CHECK: [0x6a,0x01,0x80,0xbe]
+s_mov_b64 s[10:11], vcc
+// CHECK: [0x6a,0x01,0x8a,0xbe]
 
-s_mov_b64 s[0:1], tba
-// CHECK: [0x6c,0x01,0x80,0xbe]
+s_mov_b64 s[10:11], tba
+// CHECK: [0x6c,0x01,0x8a,0xbe]
 
-s_mov_b64 s[0:1], tma
-// CHECK: [0x6e,0x01,0x80,0xbe]
+s_mov_b64 s[10:11], tma
+// CHECK: [0x6e,0x01,0x8a,0xbe]
 
-s_mov_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x01,0x80,0xbe]
+s_mov_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x01,0x8a,0xbe]
 
-s_mov_b64 s[0:1], exec
-// CHECK: [0x7e,0x01,0x80,0xbe]
+s_mov_b64 s[10:11], exec
+// CHECK: [0x7e,0x01,0x8a,0xbe]
 
-s_mov_b64 s[0:1], 0
-// CHECK: [0x80,0x01,0x80,0xbe]
+s_mov_b64 s[10:11], 0
+// CHECK: [0x80,0x01,0x8a,0xbe]
 
-s_mov_b64 s[0:1], -1
-// CHECK: [0xc1,0x01,0x80,0xbe]
+s_mov_b64 s[10:11], -1
+// CHECK: [0xc1,0x01,0x8a,0xbe]
 
-s_mov_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x01,0x80,0xbe]
+s_mov_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x01,0x8a,0xbe]
 
-s_mov_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x01,0x80,0xbe]
+s_mov_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x01,0x8a,0xbe]
 
-s_mov_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x01,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_mov_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x01,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_mov_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x01,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_mov_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x01,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_cmov_b32 s0, s0
-// CHECK: [0x00,0x02,0x80,0xbe]
+s_cmov_b32 s5, s1
+// CHECK: [0x01,0x02,0x85,0xbe]
 
-s_cmov_b32 s101, s0
-// CHECK: [0x00,0x02,0xe5,0xbe]
+s_cmov_b32 s101, s1
+// CHECK: [0x01,0x02,0xe5,0xbe]
 
-s_cmov_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x02,0xe6,0xbe]
+s_cmov_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x02,0xe6,0xbe]
 
-s_cmov_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x02,0xe7,0xbe]
+s_cmov_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x02,0xe7,0xbe]
 
-s_cmov_b32 vcc_lo, s0
-// CHECK: [0x00,0x02,0xea,0xbe]
+s_cmov_b32 vcc_lo, s1
+// CHECK: [0x01,0x02,0xea,0xbe]
 
-s_cmov_b32 vcc_hi, s0
-// CHECK: [0x00,0x02,0xeb,0xbe]
+s_cmov_b32 vcc_hi, s1
+// CHECK: [0x01,0x02,0xeb,0xbe]
 
-s_cmov_b32 tba_lo, s0
-// CHECK: [0x00,0x02,0xec,0xbe]
+s_cmov_b32 tba_lo, s1
+// CHECK: [0x01,0x02,0xec,0xbe]
 
-s_cmov_b32 tba_hi, s0
-// CHECK: [0x00,0x02,0xed,0xbe]
+s_cmov_b32 tba_hi, s1
+// CHECK: [0x01,0x02,0xed,0xbe]
 
-s_cmov_b32 tma_lo, s0
-// CHECK: [0x00,0x02,0xee,0xbe]
+s_cmov_b32 tma_lo, s1
+// CHECK: [0x01,0x02,0xee,0xbe]
 
-s_cmov_b32 tma_hi, s0
-// CHECK: [0x00,0x02,0xef,0xbe]
+s_cmov_b32 tma_hi, s1
+// CHECK: [0x01,0x02,0xef,0xbe]
 
-s_cmov_b32 ttmp11, s0
-// CHECK: [0x00,0x02,0xfb,0xbe]
+s_cmov_b32 ttmp11, s1
+// CHECK: [0x01,0x02,0xfb,0xbe]
 
-s_cmov_b32 m0, s0
-// CHECK: [0x00,0x02,0xfc,0xbe]
+s_cmov_b32 m0, s1
+// CHECK: [0x01,0x02,0xfc,0xbe]
 
-s_cmov_b32 exec_lo, s0
-// CHECK: [0x00,0x02,0xfe,0xbe]
+s_cmov_b32 exec_lo, s1
+// CHECK: [0x01,0x02,0xfe,0xbe]
 
-s_cmov_b32 exec_hi, s0
-// CHECK: [0x00,0x02,0xff,0xbe]
+s_cmov_b32 exec_hi, s1
+// CHECK: [0x01,0x02,0xff,0xbe]
 
-s_cmov_b32 s0, s101
-// CHECK: [0x65,0x02,0x80,0xbe]
+s_cmov_b32 s5, s101
+// CHECK: [0x65,0x02,0x85,0xbe]
 
-s_cmov_b32 s0, flat_scratch_lo
-// CHECK: [0x66,0x02,0x80,0xbe]
+s_cmov_b32 s5, flat_scratch_lo
+// CHECK: [0x66,0x02,0x85,0xbe]
 
-s_cmov_b32 s0, flat_scratch_hi
-// CHECK: [0x67,0x02,0x80,0xbe]
+s_cmov_b32 s5, flat_scratch_hi
+// CHECK: [0x67,0x02,0x85,0xbe]
 
-s_cmov_b32 s0, vcc_lo
-// CHECK: [0x6a,0x02,0x80,0xbe]
+s_cmov_b32 s5, vcc_lo
+// CHECK: [0x6a,0x02,0x85,0xbe]
 
-s_cmov_b32 s0, vcc_hi
-// CHECK: [0x6b,0x02,0x80,0xbe]
+s_cmov_b32 s5, vcc_hi
+// CHECK: [0x6b,0x02,0x85,0xbe]
 
-s_cmov_b32 s0, tba_lo
-// CHECK: [0x6c,0x02,0x80,0xbe]
+s_cmov_b32 s5, tba_lo
+// CHECK: [0x6c,0x02,0x85,0xbe]
 
-s_cmov_b32 s0, tba_hi
-// CHECK: [0x6d,0x02,0x80,0xbe]
+s_cmov_b32 s5, tba_hi
+// CHECK: [0x6d,0x02,0x85,0xbe]
 
-s_cmov_b32 s0, tma_lo
-// CHECK: [0x6e,0x02,0x80,0xbe]
+s_cmov_b32 s5, tma_lo
+// CHECK: [0x6e,0x02,0x85,0xbe]
 
-s_cmov_b32 s0, tma_hi
-// CHECK: [0x6f,0x02,0x80,0xbe]
+s_cmov_b32 s5, tma_hi
+// CHECK: [0x6f,0x02,0x85,0xbe]
 
-s_cmov_b32 s0, ttmp11
-// CHECK: [0x7b,0x02,0x80,0xbe]
+s_cmov_b32 s5, ttmp11
+// CHECK: [0x7b,0x02,0x85,0xbe]
 
-s_cmov_b32 s0, m0
-// CHECK: [0x7c,0x02,0x80,0xbe]
+s_cmov_b32 s5, m0
+// CHECK: [0x7c,0x02,0x85,0xbe]
 
-s_cmov_b32 s0, exec_lo
-// CHECK: [0x7e,0x02,0x80,0xbe]
+s_cmov_b32 s5, exec_lo
+// CHECK: [0x7e,0x02,0x85,0xbe]
 
-s_cmov_b32 s0, exec_hi
-// CHECK: [0x7f,0x02,0x80,0xbe]
+s_cmov_b32 s5, exec_hi
+// CHECK: [0x7f,0x02,0x85,0xbe]
 
-s_cmov_b32 s0, 0
-// CHECK: [0x80,0x02,0x80,0xbe]
+s_cmov_b32 s5, 0
+// CHECK: [0x80,0x02,0x85,0xbe]
 
-s_cmov_b32 s0, -1
-// CHECK: [0xc1,0x02,0x80,0xbe]
+s_cmov_b32 s5, -1
+// CHECK: [0xc1,0x02,0x85,0xbe]
 
-s_cmov_b32 s0, 0.5
-// CHECK: [0xf0,0x02,0x80,0xbe]
+s_cmov_b32 s5, 0.5
+// CHECK: [0xf0,0x02,0x85,0xbe]
 
-s_cmov_b32 s0, -4.0
-// CHECK: [0xf7,0x02,0x80,0xbe]
+s_cmov_b32 s5, -4.0
+// CHECK: [0xf7,0x02,0x85,0xbe]
 
-s_cmov_b32 s0, 0xaf123456
-// CHECK: [0xff,0x02,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_cmov_b32 s5, 0xaf123456
+// CHECK: [0xff,0x02,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_cmov_b32 s0, 0x3f717273
-// CHECK: [0xff,0x02,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_cmov_b32 s5, 0x3f717273
+// CHECK: [0xff,0x02,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_cmov_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x03,0x80,0xbe]
+s_cmov_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x03,0x8a,0xbe]
 
-s_cmov_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x03,0x82,0xbe]
+s_cmov_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x03,0x8c,0xbe]
 
-s_cmov_b64 s[100:101], s[0:1]
-// CHECK: [0x00,0x03,0xe4,0xbe]
+s_cmov_b64 s[100:101], s[2:3]
+// CHECK: [0x02,0x03,0xe4,0xbe]
 
-s_cmov_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x03,0xe6,0xbe]
+s_cmov_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x03,0xe6,0xbe]
 
-s_cmov_b64 vcc, s[0:1]
-// CHECK: [0x00,0x03,0xea,0xbe]
+s_cmov_b64 vcc, s[2:3]
+// CHECK: [0x02,0x03,0xea,0xbe]
 
-s_cmov_b64 tba, s[0:1]
-// CHECK: [0x00,0x03,0xec,0xbe]
+s_cmov_b64 tba, s[2:3]
+// CHECK: [0x02,0x03,0xec,0xbe]
 
-s_cmov_b64 tma, s[0:1]
-// CHECK: [0x00,0x03,0xee,0xbe]
+s_cmov_b64 tma, s[2:3]
+// CHECK: [0x02,0x03,0xee,0xbe]
 
-s_cmov_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x03,0xfa,0xbe]
+s_cmov_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x03,0xfa,0xbe]
 
-s_cmov_b64 exec, s[0:1]
-// CHECK: [0x00,0x03,0xfe,0xbe]
+s_cmov_b64 exec, s[2:3]
+// CHECK: [0x02,0x03,0xfe,0xbe]
 
-s_cmov_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x03,0x80,0xbe]
+s_cmov_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x03,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], s[100:101]
-// CHECK: [0x64,0x03,0x80,0xbe]
+s_cmov_b64 s[10:11], s[100:101]
+// CHECK: [0x64,0x03,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], flat_scratch
-// CHECK: [0x66,0x03,0x80,0xbe]
+s_cmov_b64 s[10:11], flat_scratch
+// CHECK: [0x66,0x03,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], vcc
-// CHECK: [0x6a,0x03,0x80,0xbe]
+s_cmov_b64 s[10:11], vcc
+// CHECK: [0x6a,0x03,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], tba
-// CHECK: [0x6c,0x03,0x80,0xbe]
+s_cmov_b64 s[10:11], tba
+// CHECK: [0x6c,0x03,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], tma
-// CHECK: [0x6e,0x03,0x80,0xbe]
+s_cmov_b64 s[10:11], tma
+// CHECK: [0x6e,0x03,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x03,0x80,0xbe]
+s_cmov_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x03,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], exec
-// CHECK: [0x7e,0x03,0x80,0xbe]
+s_cmov_b64 s[10:11], exec
+// CHECK: [0x7e,0x03,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], 0
-// CHECK: [0x80,0x03,0x80,0xbe]
+s_cmov_b64 s[10:11], 0
+// CHECK: [0x80,0x03,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], -1
-// CHECK: [0xc1,0x03,0x80,0xbe]
+s_cmov_b64 s[10:11], -1
+// CHECK: [0xc1,0x03,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x03,0x80,0xbe]
+s_cmov_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x03,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x03,0x80,0xbe]
+s_cmov_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x03,0x8a,0xbe]
 
-s_cmov_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x03,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_cmov_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x03,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_cmov_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x03,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_cmov_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x03,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_not_b32 s0, s0
-// CHECK: [0x00,0x04,0x80,0xbe]
+s_not_b32 s5, s1
+// CHECK: [0x01,0x04,0x85,0xbe]
 
-s_not_b32 s101, s0
-// CHECK: [0x00,0x04,0xe5,0xbe]
+s_not_b32 s101, s1
+// CHECK: [0x01,0x04,0xe5,0xbe]
 
-s_not_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x04,0xe6,0xbe]
+s_not_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x04,0xe6,0xbe]
 
-s_not_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x04,0xe7,0xbe]
+s_not_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x04,0xe7,0xbe]
 
-s_not_b32 vcc_lo, s0
-// CHECK: [0x00,0x04,0xea,0xbe]
+s_not_b32 vcc_lo, s1
+// CHECK: [0x01,0x04,0xea,0xbe]
 
-s_not_b32 vcc_hi, s0
-// CHECK: [0x00,0x04,0xeb,0xbe]
+s_not_b32 vcc_hi, s1
+// CHECK: [0x01,0x04,0xeb,0xbe]
 
-s_not_b32 tba_lo, s0
-// CHECK: [0x00,0x04,0xec,0xbe]
+s_not_b32 tba_lo, s1
+// CHECK: [0x01,0x04,0xec,0xbe]
 
-s_not_b32 tba_hi, s0
-// CHECK: [0x00,0x04,0xed,0xbe]
+s_not_b32 tba_hi, s1
+// CHECK: [0x01,0x04,0xed,0xbe]
 
-s_not_b32 tma_lo, s0
-// CHECK: [0x00,0x04,0xee,0xbe]
+s_not_b32 tma_lo, s1
+// CHECK: [0x01,0x04,0xee,0xbe]
 
-s_not_b32 tma_hi, s0
-// CHECK: [0x00,0x04,0xef,0xbe]
+s_not_b32 tma_hi, s1
+// CHECK: [0x01,0x04,0xef,0xbe]
 
-s_not_b32 ttmp11, s0
-// CHECK: [0x00,0x04,0xfb,0xbe]
+s_not_b32 ttmp11, s1
+// CHECK: [0x01,0x04,0xfb,0xbe]
 
-s_not_b32 m0, s0
-// CHECK: [0x00,0x04,0xfc,0xbe]
+s_not_b32 m0, s1
+// CHECK: [0x01,0x04,0xfc,0xbe]
 
-s_not_b32 exec_lo, s0
-// CHECK: [0x00,0x04,0xfe,0xbe]
+s_not_b32 exec_lo, s1
+// CHECK: [0x01,0x04,0xfe,0xbe]
 
-s_not_b32 exec_hi, s0
-// CHECK: [0x00,0x04,0xff,0xbe]
+s_not_b32 exec_hi, s1
+// CHECK: [0x01,0x04,0xff,0xbe]
 
-s_not_b32 s0, s101
-// CHECK: [0x65,0x04,0x80,0xbe]
+s_not_b32 s5, s101
+// CHECK: [0x65,0x04,0x85,0xbe]
 
-s_not_b32 s0, flat_scratch_lo
-// CHECK: [0x66,0x04,0x80,0xbe]
+s_not_b32 s5, flat_scratch_lo
+// CHECK: [0x66,0x04,0x85,0xbe]
 
-s_not_b32 s0, flat_scratch_hi
-// CHECK: [0x67,0x04,0x80,0xbe]
+s_not_b32 s5, flat_scratch_hi
+// CHECK: [0x67,0x04,0x85,0xbe]
 
-s_not_b32 s0, vcc_lo
-// CHECK: [0x6a,0x04,0x80,0xbe]
+s_not_b32 s5, vcc_lo
+// CHECK: [0x6a,0x04,0x85,0xbe]
 
-s_not_b32 s0, vcc_hi
-// CHECK: [0x6b,0x04,0x80,0xbe]
+s_not_b32 s5, vcc_hi
+// CHECK: [0x6b,0x04,0x85,0xbe]
 
-s_not_b32 s0, tba_lo
-// CHECK: [0x6c,0x04,0x80,0xbe]
+s_not_b32 s5, tba_lo
+// CHECK: [0x6c,0x04,0x85,0xbe]
 
-s_not_b32 s0, tba_hi
-// CHECK: [0x6d,0x04,0x80,0xbe]
+s_not_b32 s5, tba_hi
+// CHECK: [0x6d,0x04,0x85,0xbe]
 
-s_not_b32 s0, tma_lo
-// CHECK: [0x6e,0x04,0x80,0xbe]
+s_not_b32 s5, tma_lo
+// CHECK: [0x6e,0x04,0x85,0xbe]
 
-s_not_b32 s0, tma_hi
-// CHECK: [0x6f,0x04,0x80,0xbe]
+s_not_b32 s5, tma_hi
+// CHECK: [0x6f,0x04,0x85,0xbe]
 
-s_not_b32 s0, ttmp11
-// CHECK: [0x7b,0x04,0x80,0xbe]
+s_not_b32 s5, ttmp11
+// CHECK: [0x7b,0x04,0x85,0xbe]
 
-s_not_b32 s0, m0
-// CHECK: [0x7c,0x04,0x80,0xbe]
+s_not_b32 s5, m0
+// CHECK: [0x7c,0x04,0x85,0xbe]
 
-s_not_b32 s0, exec_lo
-// CHECK: [0x7e,0x04,0x80,0xbe]
+s_not_b32 s5, exec_lo
+// CHECK: [0x7e,0x04,0x85,0xbe]
 
-s_not_b32 s0, exec_hi
-// CHECK: [0x7f,0x04,0x80,0xbe]
+s_not_b32 s5, exec_hi
+// CHECK: [0x7f,0x04,0x85,0xbe]
 
-s_not_b32 s0, 0
-// CHECK: [0x80,0x04,0x80,0xbe]
+s_not_b32 s5, 0
+// CHECK: [0x80,0x04,0x85,0xbe]
 
-s_not_b32 s0, -1
-// CHECK: [0xc1,0x04,0x80,0xbe]
+s_not_b32 s5, -1
+// CHECK: [0xc1,0x04,0x85,0xbe]
 
-s_not_b32 s0, 0.5
-// CHECK: [0xf0,0x04,0x80,0xbe]
+s_not_b32 s5, 0.5
+// CHECK: [0xf0,0x04,0x85,0xbe]
 
-s_not_b32 s0, -4.0
-// CHECK: [0xf7,0x04,0x80,0xbe]
+s_not_b32 s5, -4.0
+// CHECK: [0xf7,0x04,0x85,0xbe]
 
-s_not_b32 s0, 0xaf123456
-// CHECK: [0xff,0x04,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_not_b32 s5, 0xaf123456
+// CHECK: [0xff,0x04,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_not_b32 s0, 0x3f717273
-// CHECK: [0xff,0x04,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_not_b32 s5, 0x3f717273
+// CHECK: [0xff,0x04,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_not_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x05,0x80,0xbe]
+s_not_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x05,0x8a,0xbe]
 
-s_not_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x05,0x82,0xbe]
+s_not_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x05,0x8c,0xbe]
 
-s_not_b64 s[100:101], s[0:1]
-// CHECK: [0x00,0x05,0xe4,0xbe]
+s_not_b64 s[100:101], s[2:3]
+// CHECK: [0x02,0x05,0xe4,0xbe]
 
-s_not_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x05,0xe6,0xbe]
+s_not_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x05,0xe6,0xbe]
 
-s_not_b64 vcc, s[0:1]
-// CHECK: [0x00,0x05,0xea,0xbe]
+s_not_b64 vcc, s[2:3]
+// CHECK: [0x02,0x05,0xea,0xbe]
 
-s_not_b64 tba, s[0:1]
-// CHECK: [0x00,0x05,0xec,0xbe]
+s_not_b64 tba, s[2:3]
+// CHECK: [0x02,0x05,0xec,0xbe]
 
-s_not_b64 tma, s[0:1]
-// CHECK: [0x00,0x05,0xee,0xbe]
+s_not_b64 tma, s[2:3]
+// CHECK: [0x02,0x05,0xee,0xbe]
 
-s_not_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x05,0xfa,0xbe]
+s_not_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x05,0xfa,0xbe]
 
-s_not_b64 exec, s[0:1]
-// CHECK: [0x00,0x05,0xfe,0xbe]
+s_not_b64 exec, s[2:3]
+// CHECK: [0x02,0x05,0xfe,0xbe]
 
-s_not_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x05,0x80,0xbe]
+s_not_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x05,0x8a,0xbe]
 
-s_not_b64 s[0:1], s[100:101]
-// CHECK: [0x64,0x05,0x80,0xbe]
+s_not_b64 s[10:11], s[100:101]
+// CHECK: [0x64,0x05,0x8a,0xbe]
 
-s_not_b64 s[0:1], flat_scratch
-// CHECK: [0x66,0x05,0x80,0xbe]
+s_not_b64 s[10:11], flat_scratch
+// CHECK: [0x66,0x05,0x8a,0xbe]
 
-s_not_b64 s[0:1], vcc
-// CHECK: [0x6a,0x05,0x80,0xbe]
+s_not_b64 s[10:11], vcc
+// CHECK: [0x6a,0x05,0x8a,0xbe]
 
-s_not_b64 s[0:1], tba
-// CHECK: [0x6c,0x05,0x80,0xbe]
+s_not_b64 s[10:11], tba
+// CHECK: [0x6c,0x05,0x8a,0xbe]
 
-s_not_b64 s[0:1], tma
-// CHECK: [0x6e,0x05,0x80,0xbe]
+s_not_b64 s[10:11], tma
+// CHECK: [0x6e,0x05,0x8a,0xbe]
 
-s_not_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x05,0x80,0xbe]
+s_not_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x05,0x8a,0xbe]
 
-s_not_b64 s[0:1], exec
-// CHECK: [0x7e,0x05,0x80,0xbe]
+s_not_b64 s[10:11], exec
+// CHECK: [0x7e,0x05,0x8a,0xbe]
 
-s_not_b64 s[0:1], 0
-// CHECK: [0x80,0x05,0x80,0xbe]
+s_not_b64 s[10:11], 0
+// CHECK: [0x80,0x05,0x8a,0xbe]
 
-s_not_b64 s[0:1], -1
-// CHECK: [0xc1,0x05,0x80,0xbe]
+s_not_b64 s[10:11], -1
+// CHECK: [0xc1,0x05,0x8a,0xbe]
 
-s_not_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x05,0x80,0xbe]
+s_not_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x05,0x8a,0xbe]
 
-s_not_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x05,0x80,0xbe]
+s_not_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x05,0x8a,0xbe]
 
-s_not_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x05,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_not_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x05,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_not_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x05,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_not_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x05,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_wqm_b32 s0, s0
-// CHECK: [0x00,0x06,0x80,0xbe]
+s_wqm_b32 s5, s1
+// CHECK: [0x01,0x06,0x85,0xbe]
 
-s_wqm_b32 s101, s0
-// CHECK: [0x00,0x06,0xe5,0xbe]
+s_wqm_b32 s101, s1
+// CHECK: [0x01,0x06,0xe5,0xbe]
 
-s_wqm_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x06,0xe6,0xbe]
+s_wqm_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x06,0xe6,0xbe]
 
-s_wqm_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x06,0xe7,0xbe]
+s_wqm_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x06,0xe7,0xbe]
 
-s_wqm_b32 vcc_lo, s0
-// CHECK: [0x00,0x06,0xea,0xbe]
+s_wqm_b32 vcc_lo, s1
+// CHECK: [0x01,0x06,0xea,0xbe]
 
-s_wqm_b32 vcc_hi, s0
-// CHECK: [0x00,0x06,0xeb,0xbe]
+s_wqm_b32 vcc_hi, s1
+// CHECK: [0x01,0x06,0xeb,0xbe]
 
-s_wqm_b32 tba_lo, s0
-// CHECK: [0x00,0x06,0xec,0xbe]
+s_wqm_b32 tba_lo, s1
+// CHECK: [0x01,0x06,0xec,0xbe]
 
-s_wqm_b32 tba_hi, s0
-// CHECK: [0x00,0x06,0xed,0xbe]
+s_wqm_b32 tba_hi, s1
+// CHECK: [0x01,0x06,0xed,0xbe]
 
-s_wqm_b32 tma_lo, s0
-// CHECK: [0x00,0x06,0xee,0xbe]
+s_wqm_b32 tma_lo, s1
+// CHECK: [0x01,0x06,0xee,0xbe]
 
-s_wqm_b32 tma_hi, s0
-// CHECK: [0x00,0x06,0xef,0xbe]
+s_wqm_b32 tma_hi, s1
+// CHECK: [0x01,0x06,0xef,0xbe]
 
-s_wqm_b32 ttmp11, s0
-// CHECK: [0x00,0x06,0xfb,0xbe]
+s_wqm_b32 ttmp11, s1
+// CHECK: [0x01,0x06,0xfb,0xbe]
 
-s_wqm_b32 m0, s0
-// CHECK: [0x00,0x06,0xfc,0xbe]
+s_wqm_b32 m0, s1
+// CHECK: [0x01,0x06,0xfc,0xbe]
 
-s_wqm_b32 exec_lo, s0
-// CHECK: [0x00,0x06,0xfe,0xbe]
+s_wqm_b32 exec_lo, s1
+// CHECK: [0x01,0x06,0xfe,0xbe]
 
-s_wqm_b32 exec_hi, s0
-// CHECK: [0x00,0x06,0xff,0xbe]
+s_wqm_b32 exec_hi, s1
+// CHECK: [0x01,0x06,0xff,0xbe]
 
-s_wqm_b32 s0, s101
-// CHECK: [0x65,0x06,0x80,0xbe]
+s_wqm_b32 s5, s101
+// CHECK: [0x65,0x06,0x85,0xbe]
 
-s_wqm_b32 s0, flat_scratch_lo
-// CHECK: [0x66,0x06,0x80,0xbe]
+s_wqm_b32 s5, flat_scratch_lo
+// CHECK: [0x66,0x06,0x85,0xbe]
 
-s_wqm_b32 s0, flat_scratch_hi
-// CHECK: [0x67,0x06,0x80,0xbe]
+s_wqm_b32 s5, flat_scratch_hi
+// CHECK: [0x67,0x06,0x85,0xbe]
 
-s_wqm_b32 s0, vcc_lo
-// CHECK: [0x6a,0x06,0x80,0xbe]
+s_wqm_b32 s5, vcc_lo
+// CHECK: [0x6a,0x06,0x85,0xbe]
 
-s_wqm_b32 s0, vcc_hi
-// CHECK: [0x6b,0x06,0x80,0xbe]
+s_wqm_b32 s5, vcc_hi
+// CHECK: [0x6b,0x06,0x85,0xbe]
 
-s_wqm_b32 s0, tba_lo
-// CHECK: [0x6c,0x06,0x80,0xbe]
+s_wqm_b32 s5, tba_lo
+// CHECK: [0x6c,0x06,0x85,0xbe]
 
-s_wqm_b32 s0, tba_hi
-// CHECK: [0x6d,0x06,0x80,0xbe]
+s_wqm_b32 s5, tba_hi
+// CHECK: [0x6d,0x06,0x85,0xbe]
 
-s_wqm_b32 s0, tma_lo
-// CHECK: [0x6e,0x06,0x80,0xbe]
+s_wqm_b32 s5, tma_lo
+// CHECK: [0x6e,0x06,0x85,0xbe]
 
-s_wqm_b32 s0, tma_hi
-// CHECK: [0x6f,0x06,0x80,0xbe]
+s_wqm_b32 s5, tma_hi
+// CHECK: [0x6f,0x06,0x85,0xbe]
 
-s_wqm_b32 s0, ttmp11
-// CHECK: [0x7b,0x06,0x80,0xbe]
+s_wqm_b32 s5, ttmp11
+// CHECK: [0x7b,0x06,0x85,0xbe]
 
-s_wqm_b32 s0, m0
-// CHECK: [0x7c,0x06,0x80,0xbe]
+s_wqm_b32 s5, m0
+// CHECK: [0x7c,0x06,0x85,0xbe]
 
-s_wqm_b32 s0, exec_lo
-// CHECK: [0x7e,0x06,0x80,0xbe]
+s_wqm_b32 s5, exec_lo
+// CHECK: [0x7e,0x06,0x85,0xbe]
 
-s_wqm_b32 s0, exec_hi
-// CHECK: [0x7f,0x06,0x80,0xbe]
+s_wqm_b32 s5, exec_hi
+// CHECK: [0x7f,0x06,0x85,0xbe]
 
-s_wqm_b32 s0, 0
-// CHECK: [0x80,0x06,0x80,0xbe]
+s_wqm_b32 s5, 0
+// CHECK: [0x80,0x06,0x85,0xbe]
 
-s_wqm_b32 s0, -1
-// CHECK: [0xc1,0x06,0x80,0xbe]
+s_wqm_b32 s5, -1
+// CHECK: [0xc1,0x06,0x85,0xbe]
 
-s_wqm_b32 s0, 0.5
-// CHECK: [0xf0,0x06,0x80,0xbe]
+s_wqm_b32 s5, 0.5
+// CHECK: [0xf0,0x06,0x85,0xbe]
 
-s_wqm_b32 s0, -4.0
-// CHECK: [0xf7,0x06,0x80,0xbe]
+s_wqm_b32 s5, -4.0
+// CHECK: [0xf7,0x06,0x85,0xbe]
 
-s_wqm_b32 s0, 0xaf123456
-// CHECK: [0xff,0x06,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_wqm_b32 s5, 0xaf123456
+// CHECK: [0xff,0x06,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_wqm_b32 s0, 0x3f717273
-// CHECK: [0xff,0x06,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_wqm_b32 s5, 0x3f717273
+// CHECK: [0xff,0x06,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_wqm_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x07,0x80,0xbe]
+s_wqm_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x07,0x8a,0xbe]
 
-s_wqm_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x07,0x82,0xbe]
+s_wqm_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x07,0x8c,0xbe]
 
-s_wqm_b64 s[100:101], s[0:1]
-// CHECK: [0x00,0x07,0xe4,0xbe]
+s_wqm_b64 s[100:101], s[2:3]
+// CHECK: [0x02,0x07,0xe4,0xbe]
 
-s_wqm_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x07,0xe6,0xbe]
+s_wqm_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x07,0xe6,0xbe]
 
-s_wqm_b64 vcc, s[0:1]
-// CHECK: [0x00,0x07,0xea,0xbe]
+s_wqm_b64 vcc, s[2:3]
+// CHECK: [0x02,0x07,0xea,0xbe]
 
-s_wqm_b64 tba, s[0:1]
-// CHECK: [0x00,0x07,0xec,0xbe]
+s_wqm_b64 tba, s[2:3]
+// CHECK: [0x02,0x07,0xec,0xbe]
 
-s_wqm_b64 tma, s[0:1]
-// CHECK: [0x00,0x07,0xee,0xbe]
+s_wqm_b64 tma, s[2:3]
+// CHECK: [0x02,0x07,0xee,0xbe]
 
-s_wqm_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x07,0xfa,0xbe]
+s_wqm_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x07,0xfa,0xbe]
 
-s_wqm_b64 exec, s[0:1]
-// CHECK: [0x00,0x07,0xfe,0xbe]
+s_wqm_b64 exec, s[2:3]
+// CHECK: [0x02,0x07,0xfe,0xbe]
 
-s_wqm_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x07,0x80,0xbe]
+s_wqm_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x07,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], s[100:101]
-// CHECK: [0x64,0x07,0x80,0xbe]
+s_wqm_b64 s[10:11], s[100:101]
+// CHECK: [0x64,0x07,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], flat_scratch
-// CHECK: [0x66,0x07,0x80,0xbe]
+s_wqm_b64 s[10:11], flat_scratch
+// CHECK: [0x66,0x07,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], vcc
-// CHECK: [0x6a,0x07,0x80,0xbe]
+s_wqm_b64 s[10:11], vcc
+// CHECK: [0x6a,0x07,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], tba
-// CHECK: [0x6c,0x07,0x80,0xbe]
+s_wqm_b64 s[10:11], tba
+// CHECK: [0x6c,0x07,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], tma
-// CHECK: [0x6e,0x07,0x80,0xbe]
+s_wqm_b64 s[10:11], tma
+// CHECK: [0x6e,0x07,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x07,0x80,0xbe]
+s_wqm_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x07,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], exec
-// CHECK: [0x7e,0x07,0x80,0xbe]
+s_wqm_b64 s[10:11], exec
+// CHECK: [0x7e,0x07,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], 0
-// CHECK: [0x80,0x07,0x80,0xbe]
+s_wqm_b64 s[10:11], 0
+// CHECK: [0x80,0x07,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], -1
-// CHECK: [0xc1,0x07,0x80,0xbe]
+s_wqm_b64 s[10:11], -1
+// CHECK: [0xc1,0x07,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x07,0x80,0xbe]
+s_wqm_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x07,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x07,0x80,0xbe]
+s_wqm_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x07,0x8a,0xbe]
 
-s_wqm_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x07,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_wqm_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x07,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_wqm_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x07,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_wqm_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x07,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_brev_b32 s0, s0
-// CHECK: [0x00,0x08,0x80,0xbe]
+s_brev_b32 s5, s1
+// CHECK: [0x01,0x08,0x85,0xbe]
 
-s_brev_b32 s101, s0
-// CHECK: [0x00,0x08,0xe5,0xbe]
+s_brev_b32 s101, s1
+// CHECK: [0x01,0x08,0xe5,0xbe]
 
-s_brev_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x08,0xe6,0xbe]
+s_brev_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x08,0xe6,0xbe]
 
-s_brev_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x08,0xe7,0xbe]
+s_brev_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x08,0xe7,0xbe]
 
-s_brev_b32 vcc_lo, s0
-// CHECK: [0x00,0x08,0xea,0xbe]
+s_brev_b32 vcc_lo, s1
+// CHECK: [0x01,0x08,0xea,0xbe]
 
-s_brev_b32 vcc_hi, s0
-// CHECK: [0x00,0x08,0xeb,0xbe]
+s_brev_b32 vcc_hi, s1
+// CHECK: [0x01,0x08,0xeb,0xbe]
 
-s_brev_b32 tba_lo, s0
-// CHECK: [0x00,0x08,0xec,0xbe]
+s_brev_b32 tba_lo, s1
+// CHECK: [0x01,0x08,0xec,0xbe]
 
-s_brev_b32 tba_hi, s0
-// CHECK: [0x00,0x08,0xed,0xbe]
+s_brev_b32 tba_hi, s1
+// CHECK: [0x01,0x08,0xed,0xbe]
 
-s_brev_b32 tma_lo, s0
-// CHECK: [0x00,0x08,0xee,0xbe]
+s_brev_b32 tma_lo, s1
+// CHECK: [0x01,0x08,0xee,0xbe]
 
-s_brev_b32 tma_hi, s0
-// CHECK: [0x00,0x08,0xef,0xbe]
+s_brev_b32 tma_hi, s1
+// CHECK: [0x01,0x08,0xef,0xbe]
 
-s_brev_b32 ttmp11, s0
-// CHECK: [0x00,0x08,0xfb,0xbe]
+s_brev_b32 ttmp11, s1
+// CHECK: [0x01,0x08,0xfb,0xbe]
 
-s_brev_b32 m0, s0
-// CHECK: [0x00,0x08,0xfc,0xbe]
+s_brev_b32 m0, s1
+// CHECK: [0x01,0x08,0xfc,0xbe]
 
-s_brev_b32 exec_lo, s0
-// CHECK: [0x00,0x08,0xfe,0xbe]
+s_brev_b32 exec_lo, s1
+// CHECK: [0x01,0x08,0xfe,0xbe]
 
-s_brev_b32 exec_hi, s0
-// CHECK: [0x00,0x08,0xff,0xbe]
+s_brev_b32 exec_hi, s1
+// CHECK: [0x01,0x08,0xff,0xbe]
 
-s_brev_b32 s0, s101
-// CHECK: [0x65,0x08,0x80,0xbe]
+s_brev_b32 s5, s101
+// CHECK: [0x65,0x08,0x85,0xbe]
 
-s_brev_b32 s0, flat_scratch_lo
-// CHECK: [0x66,0x08,0x80,0xbe]
+s_brev_b32 s5, flat_scratch_lo
+// CHECK: [0x66,0x08,0x85,0xbe]
 
-s_brev_b32 s0, flat_scratch_hi
-// CHECK: [0x67,0x08,0x80,0xbe]
+s_brev_b32 s5, flat_scratch_hi
+// CHECK: [0x67,0x08,0x85,0xbe]
 
-s_brev_b32 s0, vcc_lo
-// CHECK: [0x6a,0x08,0x80,0xbe]
+s_brev_b32 s5, vcc_lo
+// CHECK: [0x6a,0x08,0x85,0xbe]
 
-s_brev_b32 s0, vcc_hi
-// CHECK: [0x6b,0x08,0x80,0xbe]
+s_brev_b32 s5, vcc_hi
+// CHECK: [0x6b,0x08,0x85,0xbe]
 
-s_brev_b32 s0, tba_lo
-// CHECK: [0x6c,0x08,0x80,0xbe]
+s_brev_b32 s5, tba_lo
+// CHECK: [0x6c,0x08,0x85,0xbe]
 
-s_brev_b32 s0, tba_hi
-// CHECK: [0x6d,0x08,0x80,0xbe]
+s_brev_b32 s5, tba_hi
+// CHECK: [0x6d,0x08,0x85,0xbe]
 
-s_brev_b32 s0, tma_lo
-// CHECK: [0x6e,0x08,0x80,0xbe]
+s_brev_b32 s5, tma_lo
+// CHECK: [0x6e,0x08,0x85,0xbe]
 
-s_brev_b32 s0, tma_hi
-// CHECK: [0x6f,0x08,0x80,0xbe]
+s_brev_b32 s5, tma_hi
+// CHECK: [0x6f,0x08,0x85,0xbe]
 
-s_brev_b32 s0, ttmp11
-// CHECK: [0x7b,0x08,0x80,0xbe]
+s_brev_b32 s5, ttmp11
+// CHECK: [0x7b,0x08,0x85,0xbe]
 
-s_brev_b32 s0, m0
-// CHECK: [0x7c,0x08,0x80,0xbe]
+s_brev_b32 s5, m0
+// CHECK: [0x7c,0x08,0x85,0xbe]
 
-s_brev_b32 s0, exec_lo
-// CHECK: [0x7e,0x08,0x80,0xbe]
+s_brev_b32 s5, exec_lo
+// CHECK: [0x7e,0x08,0x85,0xbe]
 
-s_brev_b32 s0, exec_hi
-// CHECK: [0x7f,0x08,0x80,0xbe]
+s_brev_b32 s5, exec_hi
+// CHECK: [0x7f,0x08,0x85,0xbe]
 
-s_brev_b32 s0, 0
-// CHECK: [0x80,0x08,0x80,0xbe]
+s_brev_b32 s5, 0
+// CHECK: [0x80,0x08,0x85,0xbe]
 
-s_brev_b32 s0, -1
-// CHECK: [0xc1,0x08,0x80,0xbe]
+s_brev_b32 s5, -1
+// CHECK: [0xc1,0x08,0x85,0xbe]
 
-s_brev_b32 s0, 0.5
-// CHECK: [0xf0,0x08,0x80,0xbe]
+s_brev_b32 s5, 0.5
+// CHECK: [0xf0,0x08,0x85,0xbe]
 
-s_brev_b32 s0, -4.0
-// CHECK: [0xf7,0x08,0x80,0xbe]
+s_brev_b32 s5, -4.0
+// CHECK: [0xf7,0x08,0x85,0xbe]
 
-s_brev_b32 s0, 0xaf123456
-// CHECK: [0xff,0x08,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_brev_b32 s5, 0xaf123456
+// CHECK: [0xff,0x08,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_brev_b32 s0, 0x3f717273
-// CHECK: [0xff,0x08,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_brev_b32 s5, 0x3f717273
+// CHECK: [0xff,0x08,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_brev_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x09,0x80,0xbe]
+s_brev_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x09,0x8a,0xbe]
 
-s_brev_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x09,0x82,0xbe]
+s_brev_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x09,0x8c,0xbe]
 
-s_brev_b64 s[100:101], s[0:1]
-// CHECK: [0x00,0x09,0xe4,0xbe]
+s_brev_b64 s[100:101], s[2:3]
+// CHECK: [0x02,0x09,0xe4,0xbe]
 
-s_brev_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x09,0xe6,0xbe]
+s_brev_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x09,0xe6,0xbe]
 
-s_brev_b64 vcc, s[0:1]
-// CHECK: [0x00,0x09,0xea,0xbe]
+s_brev_b64 vcc, s[2:3]
+// CHECK: [0x02,0x09,0xea,0xbe]
 
-s_brev_b64 tba, s[0:1]
-// CHECK: [0x00,0x09,0xec,0xbe]
+s_brev_b64 tba, s[2:3]
+// CHECK: [0x02,0x09,0xec,0xbe]
 
-s_brev_b64 tma, s[0:1]
-// CHECK: [0x00,0x09,0xee,0xbe]
+s_brev_b64 tma, s[2:3]
+// CHECK: [0x02,0x09,0xee,0xbe]
 
-s_brev_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x09,0xfa,0xbe]
+s_brev_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x09,0xfa,0xbe]
 
-s_brev_b64 exec, s[0:1]
-// CHECK: [0x00,0x09,0xfe,0xbe]
+s_brev_b64 exec, s[2:3]
+// CHECK: [0x02,0x09,0xfe,0xbe]
 
-s_brev_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x09,0x80,0xbe]
+s_brev_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x09,0x8a,0xbe]
 
-s_brev_b64 s[0:1], s[100:101]
-// CHECK: [0x64,0x09,0x80,0xbe]
+s_brev_b64 s[10:11], s[100:101]
+// CHECK: [0x64,0x09,0x8a,0xbe]
 
-s_brev_b64 s[0:1], flat_scratch
-// CHECK: [0x66,0x09,0x80,0xbe]
+s_brev_b64 s[10:11], flat_scratch
+// CHECK: [0x66,0x09,0x8a,0xbe]
 
-s_brev_b64 s[0:1], vcc
-// CHECK: [0x6a,0x09,0x80,0xbe]
+s_brev_b64 s[10:11], vcc
+// CHECK: [0x6a,0x09,0x8a,0xbe]
 
-s_brev_b64 s[0:1], tba
-// CHECK: [0x6c,0x09,0x80,0xbe]
+s_brev_b64 s[10:11], tba
+// CHECK: [0x6c,0x09,0x8a,0xbe]
 
-s_brev_b64 s[0:1], tma
-// CHECK: [0x6e,0x09,0x80,0xbe]
+s_brev_b64 s[10:11], tma
+// CHECK: [0x6e,0x09,0x8a,0xbe]
 
-s_brev_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x09,0x80,0xbe]
+s_brev_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x09,0x8a,0xbe]
 
-s_brev_b64 s[0:1], exec
-// CHECK: [0x7e,0x09,0x80,0xbe]
+s_brev_b64 s[10:11], exec
+// CHECK: [0x7e,0x09,0x8a,0xbe]
 
-s_brev_b64 s[0:1], 0
-// CHECK: [0x80,0x09,0x80,0xbe]
+s_brev_b64 s[10:11], 0
+// CHECK: [0x80,0x09,0x8a,0xbe]
 
-s_brev_b64 s[0:1], -1
-// CHECK: [0xc1,0x09,0x80,0xbe]
+s_brev_b64 s[10:11], -1
+// CHECK: [0xc1,0x09,0x8a,0xbe]
 
-s_brev_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x09,0x80,0xbe]
+s_brev_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x09,0x8a,0xbe]
 
-s_brev_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x09,0x80,0xbe]
+s_brev_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x09,0x8a,0xbe]
 
-s_brev_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x09,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_brev_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x09,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_brev_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x09,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_brev_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x09,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_bcnt0_i32_b32 s0, s0
-// CHECK: [0x00,0x0a,0x80,0xbe]
+s_bcnt0_i32_b32 s5, s1
+// CHECK: [0x01,0x0a,0x85,0xbe]
 
-s_bcnt0_i32_b32 s101, s0
-// CHECK: [0x00,0x0a,0xe5,0xbe]
+s_bcnt0_i32_b32 s101, s1
+// CHECK: [0x01,0x0a,0xe5,0xbe]
 
-s_bcnt0_i32_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x0a,0xe6,0xbe]
+s_bcnt0_i32_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x0a,0xe6,0xbe]
 
-s_bcnt0_i32_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x0a,0xe7,0xbe]
+s_bcnt0_i32_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x0a,0xe7,0xbe]
 
-s_bcnt0_i32_b32 vcc_lo, s0
-// CHECK: [0x00,0x0a,0xea,0xbe]
+s_bcnt0_i32_b32 vcc_lo, s1
+// CHECK: [0x01,0x0a,0xea,0xbe]
 
-s_bcnt0_i32_b32 vcc_hi, s0
-// CHECK: [0x00,0x0a,0xeb,0xbe]
+s_bcnt0_i32_b32 vcc_hi, s1
+// CHECK: [0x01,0x0a,0xeb,0xbe]
 
-s_bcnt0_i32_b32 tba_lo, s0
-// CHECK: [0x00,0x0a,0xec,0xbe]
+s_bcnt0_i32_b32 tba_lo, s1
+// CHECK: [0x01,0x0a,0xec,0xbe]
 
-s_bcnt0_i32_b32 tba_hi, s0
-// CHECK: [0x00,0x0a,0xed,0xbe]
+s_bcnt0_i32_b32 tba_hi, s1
+// CHECK: [0x01,0x0a,0xed,0xbe]
 
-s_bcnt0_i32_b32 tma_lo, s0
-// CHECK: [0x00,0x0a,0xee,0xbe]
+s_bcnt0_i32_b32 tma_lo, s1
+// CHECK: [0x01,0x0a,0xee,0xbe]
 
-s_bcnt0_i32_b32 tma_hi, s0
-// CHECK: [0x00,0x0a,0xef,0xbe]
+s_bcnt0_i32_b32 tma_hi, s1
+// CHECK: [0x01,0x0a,0xef,0xbe]
 
-s_bcnt0_i32_b32 ttmp11, s0
-// CHECK: [0x00,0x0a,0xfb,0xbe]
+s_bcnt0_i32_b32 ttmp11, s1
+// CHECK: [0x01,0x0a,0xfb,0xbe]
 
-s_bcnt0_i32_b32 m0, s0
-// CHECK: [0x00,0x0a,0xfc,0xbe]
+s_bcnt0_i32_b32 m0, s1
+// CHECK: [0x01,0x0a,0xfc,0xbe]
 
-s_bcnt0_i32_b32 exec_lo, s0
-// CHECK: [0x00,0x0a,0xfe,0xbe]
+s_bcnt0_i32_b32 exec_lo, s1
+// CHECK: [0x01,0x0a,0xfe,0xbe]
 
-s_bcnt0_i32_b32 exec_hi, s0
-// CHECK: [0x00,0x0a,0xff,0xbe]
+s_bcnt0_i32_b32 exec_hi, s1
+// CHECK: [0x01,0x0a,0xff,0xbe]
 
-s_bcnt0_i32_b32 s0, s101
-// CHECK: [0x65,0x0a,0x80,0xbe]
+s_bcnt0_i32_b32 s5, s101
+// CHECK: [0x65,0x0a,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, flat_scratch_lo
-// CHECK: [0x66,0x0a,0x80,0xbe]
+s_bcnt0_i32_b32 s5, flat_scratch_lo
+// CHECK: [0x66,0x0a,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, flat_scratch_hi
-// CHECK: [0x67,0x0a,0x80,0xbe]
+s_bcnt0_i32_b32 s5, flat_scratch_hi
+// CHECK: [0x67,0x0a,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, vcc_lo
-// CHECK: [0x6a,0x0a,0x80,0xbe]
+s_bcnt0_i32_b32 s5, vcc_lo
+// CHECK: [0x6a,0x0a,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, vcc_hi
-// CHECK: [0x6b,0x0a,0x80,0xbe]
+s_bcnt0_i32_b32 s5, vcc_hi
+// CHECK: [0x6b,0x0a,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, tba_lo
-// CHECK: [0x6c,0x0a,0x80,0xbe]
+s_bcnt0_i32_b32 s5, tba_lo
+// CHECK: [0x6c,0x0a,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, tba_hi
-// CHECK: [0x6d,0x0a,0x80,0xbe]
+s_bcnt0_i32_b32 s5, tba_hi
+// CHECK: [0x6d,0x0a,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, tma_lo
-// CHECK: [0x6e,0x0a,0x80,0xbe]
+s_bcnt0_i32_b32 s5, tma_lo
+// CHECK: [0x6e,0x0a,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, tma_hi
-// CHECK: [0x6f,0x0a,0x80,0xbe]
+s_bcnt0_i32_b32 s5, tma_hi
+// CHECK: [0x6f,0x0a,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, ttmp11
-// CHECK: [0x7b,0x0a,0x80,0xbe]
+s_bcnt0_i32_b32 s5, ttmp11
+// CHECK: [0x7b,0x0a,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, m0
-// CHECK: [0x7c,0x0a,0x80,0xbe]
+s_bcnt0_i32_b32 s5, m0
+// CHECK: [0x7c,0x0a,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, exec_lo
-// CHECK: [0x7e,0x0a,0x80,0xbe]
+s_bcnt0_i32_b32 s5, exec_lo
+// CHECK: [0x7e,0x0a,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, exec_hi
-// CHECK: [0x7f,0x0a,0x80,0xbe]
+s_bcnt0_i32_b32 s5, exec_hi
+// CHECK: [0x7f,0x0a,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, 0
-// CHECK: [0x80,0x0a,0x80,0xbe]
+s_bcnt0_i32_b32 s5, 0
+// CHECK: [0x80,0x0a,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, -1
-// CHECK: [0xc1,0x0a,0x80,0xbe]
+s_bcnt0_i32_b32 s5, -1
+// CHECK: [0xc1,0x0a,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, 0.5
-// CHECK: [0xf0,0x0a,0x80,0xbe]
+s_bcnt0_i32_b32 s5, 0.5
+// CHECK: [0xf0,0x0a,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, -4.0
-// CHECK: [0xf7,0x0a,0x80,0xbe]
+s_bcnt0_i32_b32 s5, -4.0
+// CHECK: [0xf7,0x0a,0x85,0xbe]
 
-s_bcnt0_i32_b32 s0, 0xaf123456
-// CHECK: [0xff,0x0a,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_bcnt0_i32_b32 s5, 0xaf123456
+// CHECK: [0xff,0x0a,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_bcnt0_i32_b32 s0, 0x3f717273
-// CHECK: [0xff,0x0a,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_bcnt0_i32_b32 s5, 0x3f717273
+// CHECK: [0xff,0x0a,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_bcnt0_i32_b64 s0, s[0:1]
-// CHECK: [0x00,0x0b,0x80,0xbe]
+s_bcnt0_i32_b64 s5, s[2:3]
+// CHECK: [0x02,0x0b,0x85,0xbe]
 
-s_bcnt0_i32_b64 s101, s[0:1]
-// CHECK: [0x00,0x0b,0xe5,0xbe]
+s_bcnt0_i32_b64 s101, s[2:3]
+// CHECK: [0x02,0x0b,0xe5,0xbe]
 
-s_bcnt0_i32_b64 flat_scratch_lo, s[0:1]
-// CHECK: [0x00,0x0b,0xe6,0xbe]
+s_bcnt0_i32_b64 flat_scratch_lo, s[2:3]
+// CHECK: [0x02,0x0b,0xe6,0xbe]
 
-s_bcnt0_i32_b64 flat_scratch_hi, s[0:1]
-// CHECK: [0x00,0x0b,0xe7,0xbe]
+s_bcnt0_i32_b64 flat_scratch_hi, s[2:3]
+// CHECK: [0x02,0x0b,0xe7,0xbe]
 
-s_bcnt0_i32_b64 vcc_lo, s[0:1]
-// CHECK: [0x00,0x0b,0xea,0xbe]
+s_bcnt0_i32_b64 vcc_lo, s[2:3]
+// CHECK: [0x02,0x0b,0xea,0xbe]
 
-s_bcnt0_i32_b64 vcc_hi, s[0:1]
-// CHECK: [0x00,0x0b,0xeb,0xbe]
+s_bcnt0_i32_b64 vcc_hi, s[2:3]
+// CHECK: [0x02,0x0b,0xeb,0xbe]
 
-s_bcnt0_i32_b64 tba_lo, s[0:1]
-// CHECK: [0x00,0x0b,0xec,0xbe]
+s_bcnt0_i32_b64 tba_lo, s[2:3]
+// CHECK: [0x02,0x0b,0xec,0xbe]
 
-s_bcnt0_i32_b64 tba_hi, s[0:1]
-// CHECK: [0x00,0x0b,0xed,0xbe]
+s_bcnt0_i32_b64 tba_hi, s[2:3]
+// CHECK: [0x02,0x0b,0xed,0xbe]
 
-s_bcnt0_i32_b64 tma_lo, s[0:1]
-// CHECK: [0x00,0x0b,0xee,0xbe]
+s_bcnt0_i32_b64 tma_lo, s[2:3]
+// CHECK: [0x02,0x0b,0xee,0xbe]
 
-s_bcnt0_i32_b64 tma_hi, s[0:1]
-// CHECK: [0x00,0x0b,0xef,0xbe]
+s_bcnt0_i32_b64 tma_hi, s[2:3]
+// CHECK: [0x02,0x0b,0xef,0xbe]
 
-s_bcnt0_i32_b64 ttmp11, s[0:1]
-// CHECK: [0x00,0x0b,0xfb,0xbe]
+s_bcnt0_i32_b64 ttmp11, s[2:3]
+// CHECK: [0x02,0x0b,0xfb,0xbe]
 
-s_bcnt0_i32_b64 m0, s[0:1]
-// CHECK: [0x00,0x0b,0xfc,0xbe]
+s_bcnt0_i32_b64 m0, s[2:3]
+// CHECK: [0x02,0x0b,0xfc,0xbe]
 
-s_bcnt0_i32_b64 exec_lo, s[0:1]
-// CHECK: [0x00,0x0b,0xfe,0xbe]
+s_bcnt0_i32_b64 exec_lo, s[2:3]
+// CHECK: [0x02,0x0b,0xfe,0xbe]
 
-s_bcnt0_i32_b64 exec_hi, s[0:1]
-// CHECK: [0x00,0x0b,0xff,0xbe]
+s_bcnt0_i32_b64 exec_hi, s[2:3]
+// CHECK: [0x02,0x0b,0xff,0xbe]
 
-s_bcnt0_i32_b64 s0, s[2:3]
-// CHECK: [0x02,0x0b,0x80,0xbe]
+s_bcnt0_i32_b64 s5, s[4:5]
+// CHECK: [0x04,0x0b,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, s[100:101]
-// CHECK: [0x64,0x0b,0x80,0xbe]
+s_bcnt0_i32_b64 s5, s[100:101]
+// CHECK: [0x64,0x0b,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, flat_scratch
-// CHECK: [0x66,0x0b,0x80,0xbe]
+s_bcnt0_i32_b64 s5, flat_scratch
+// CHECK: [0x66,0x0b,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, vcc
-// CHECK: [0x6a,0x0b,0x80,0xbe]
+s_bcnt0_i32_b64 s5, vcc
+// CHECK: [0x6a,0x0b,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, tba
-// CHECK: [0x6c,0x0b,0x80,0xbe]
+s_bcnt0_i32_b64 s5, tba
+// CHECK: [0x6c,0x0b,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, tma
-// CHECK: [0x6e,0x0b,0x80,0xbe]
+s_bcnt0_i32_b64 s5, tma
+// CHECK: [0x6e,0x0b,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, ttmp[10:11]
-// CHECK: [0x7a,0x0b,0x80,0xbe]
+s_bcnt0_i32_b64 s5, ttmp[10:11]
+// CHECK: [0x7a,0x0b,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, exec
-// CHECK: [0x7e,0x0b,0x80,0xbe]
+s_bcnt0_i32_b64 s5, exec
+// CHECK: [0x7e,0x0b,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, 0
-// CHECK: [0x80,0x0b,0x80,0xbe]
+s_bcnt0_i32_b64 s5, 0
+// CHECK: [0x80,0x0b,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, -1
-// CHECK: [0xc1,0x0b,0x80,0xbe]
+s_bcnt0_i32_b64 s5, -1
+// CHECK: [0xc1,0x0b,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, 0.5
-// CHECK: [0xf0,0x0b,0x80,0xbe]
+s_bcnt0_i32_b64 s5, 0.5
+// CHECK: [0xf0,0x0b,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, -4.0
-// CHECK: [0xf7,0x0b,0x80,0xbe]
+s_bcnt0_i32_b64 s5, -4.0
+// CHECK: [0xf7,0x0b,0x85,0xbe]
 
-s_bcnt0_i32_b64 s0, 0xaf123456
-// CHECK: [0xff,0x0b,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_bcnt0_i32_b64 s5, 0xaf123456
+// CHECK: [0xff,0x0b,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_bcnt0_i32_b64 s0, 0x3f717273
-// CHECK: [0xff,0x0b,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_bcnt0_i32_b64 s5, 0x3f717273
+// CHECK: [0xff,0x0b,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_bcnt1_i32_b32 s0, s0
-// CHECK: [0x00,0x0c,0x80,0xbe]
+s_bcnt1_i32_b32 s5, s1
+// CHECK: [0x01,0x0c,0x85,0xbe]
 
-s_bcnt1_i32_b32 s101, s0
-// CHECK: [0x00,0x0c,0xe5,0xbe]
+s_bcnt1_i32_b32 s101, s1
+// CHECK: [0x01,0x0c,0xe5,0xbe]
 
-s_bcnt1_i32_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x0c,0xe6,0xbe]
+s_bcnt1_i32_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x0c,0xe6,0xbe]
 
-s_bcnt1_i32_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x0c,0xe7,0xbe]
+s_bcnt1_i32_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x0c,0xe7,0xbe]
 
-s_bcnt1_i32_b32 vcc_lo, s0
-// CHECK: [0x00,0x0c,0xea,0xbe]
+s_bcnt1_i32_b32 vcc_lo, s1
+// CHECK: [0x01,0x0c,0xea,0xbe]
 
-s_bcnt1_i32_b32 vcc_hi, s0
-// CHECK: [0x00,0x0c,0xeb,0xbe]
+s_bcnt1_i32_b32 vcc_hi, s1
+// CHECK: [0x01,0x0c,0xeb,0xbe]
 
-s_bcnt1_i32_b32 tba_lo, s0
-// CHECK: [0x00,0x0c,0xec,0xbe]
+s_bcnt1_i32_b32 tba_lo, s1
+// CHECK: [0x01,0x0c,0xec,0xbe]
 
-s_bcnt1_i32_b32 tba_hi, s0
-// CHECK: [0x00,0x0c,0xed,0xbe]
+s_bcnt1_i32_b32 tba_hi, s1
+// CHECK: [0x01,0x0c,0xed,0xbe]
 
-s_bcnt1_i32_b32 tma_lo, s0
-// CHECK: [0x00,0x0c,0xee,0xbe]
+s_bcnt1_i32_b32 tma_lo, s1
+// CHECK: [0x01,0x0c,0xee,0xbe]
 
-s_bcnt1_i32_b32 tma_hi, s0
-// CHECK: [0x00,0x0c,0xef,0xbe]
+s_bcnt1_i32_b32 tma_hi, s1
+// CHECK: [0x01,0x0c,0xef,0xbe]
 
-s_bcnt1_i32_b32 ttmp11, s0
-// CHECK: [0x00,0x0c,0xfb,0xbe]
+s_bcnt1_i32_b32 ttmp11, s1
+// CHECK: [0x01,0x0c,0xfb,0xbe]
 
-s_bcnt1_i32_b32 m0, s0
-// CHECK: [0x00,0x0c,0xfc,0xbe]
+s_bcnt1_i32_b32 m0, s1
+// CHECK: [0x01,0x0c,0xfc,0xbe]
 
-s_bcnt1_i32_b32 exec_lo, s0
-// CHECK: [0x00,0x0c,0xfe,0xbe]
+s_bcnt1_i32_b32 exec_lo, s1
+// CHECK: [0x01,0x0c,0xfe,0xbe]
 
-s_bcnt1_i32_b32 exec_hi, s0
-// CHECK: [0x00,0x0c,0xff,0xbe]
+s_bcnt1_i32_b32 exec_hi, s1
+// CHECK: [0x01,0x0c,0xff,0xbe]
 
-s_bcnt1_i32_b32 s0, s101
-// CHECK: [0x65,0x0c,0x80,0xbe]
+s_bcnt1_i32_b32 s5, s101
+// CHECK: [0x65,0x0c,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, flat_scratch_lo
-// CHECK: [0x66,0x0c,0x80,0xbe]
+s_bcnt1_i32_b32 s5, flat_scratch_lo
+// CHECK: [0x66,0x0c,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, flat_scratch_hi
-// CHECK: [0x67,0x0c,0x80,0xbe]
+s_bcnt1_i32_b32 s5, flat_scratch_hi
+// CHECK: [0x67,0x0c,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, vcc_lo
-// CHECK: [0x6a,0x0c,0x80,0xbe]
+s_bcnt1_i32_b32 s5, vcc_lo
+// CHECK: [0x6a,0x0c,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, vcc_hi
-// CHECK: [0x6b,0x0c,0x80,0xbe]
+s_bcnt1_i32_b32 s5, vcc_hi
+// CHECK: [0x6b,0x0c,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, tba_lo
-// CHECK: [0x6c,0x0c,0x80,0xbe]
+s_bcnt1_i32_b32 s5, tba_lo
+// CHECK: [0x6c,0x0c,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, tba_hi
-// CHECK: [0x6d,0x0c,0x80,0xbe]
+s_bcnt1_i32_b32 s5, tba_hi
+// CHECK: [0x6d,0x0c,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, tma_lo
-// CHECK: [0x6e,0x0c,0x80,0xbe]
+s_bcnt1_i32_b32 s5, tma_lo
+// CHECK: [0x6e,0x0c,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, tma_hi
-// CHECK: [0x6f,0x0c,0x80,0xbe]
+s_bcnt1_i32_b32 s5, tma_hi
+// CHECK: [0x6f,0x0c,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, ttmp11
-// CHECK: [0x7b,0x0c,0x80,0xbe]
+s_bcnt1_i32_b32 s5, ttmp11
+// CHECK: [0x7b,0x0c,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, m0
-// CHECK: [0x7c,0x0c,0x80,0xbe]
+s_bcnt1_i32_b32 s5, m0
+// CHECK: [0x7c,0x0c,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, exec_lo
-// CHECK: [0x7e,0x0c,0x80,0xbe]
+s_bcnt1_i32_b32 s5, exec_lo
+// CHECK: [0x7e,0x0c,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, exec_hi
-// CHECK: [0x7f,0x0c,0x80,0xbe]
+s_bcnt1_i32_b32 s5, exec_hi
+// CHECK: [0x7f,0x0c,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, 0
-// CHECK: [0x80,0x0c,0x80,0xbe]
+s_bcnt1_i32_b32 s5, 0
+// CHECK: [0x80,0x0c,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, -1
-// CHECK: [0xc1,0x0c,0x80,0xbe]
+s_bcnt1_i32_b32 s5, -1
+// CHECK: [0xc1,0x0c,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, 0.5
-// CHECK: [0xf0,0x0c,0x80,0xbe]
+s_bcnt1_i32_b32 s5, 0.5
+// CHECK: [0xf0,0x0c,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, -4.0
-// CHECK: [0xf7,0x0c,0x80,0xbe]
+s_bcnt1_i32_b32 s5, -4.0
+// CHECK: [0xf7,0x0c,0x85,0xbe]
 
-s_bcnt1_i32_b32 s0, 0xaf123456
-// CHECK: [0xff,0x0c,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_bcnt1_i32_b32 s5, 0xaf123456
+// CHECK: [0xff,0x0c,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_bcnt1_i32_b32 s0, 0x3f717273
-// CHECK: [0xff,0x0c,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_bcnt1_i32_b32 s5, 0x3f717273
+// CHECK: [0xff,0x0c,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_bcnt1_i32_b64 s0, s[0:1]
-// CHECK: [0x00,0x0d,0x80,0xbe]
+s_bcnt1_i32_b64 s5, s[2:3]
+// CHECK: [0x02,0x0d,0x85,0xbe]
 
-s_bcnt1_i32_b64 s101, s[0:1]
-// CHECK: [0x00,0x0d,0xe5,0xbe]
+s_bcnt1_i32_b64 s101, s[2:3]
+// CHECK: [0x02,0x0d,0xe5,0xbe]
 
-s_bcnt1_i32_b64 flat_scratch_lo, s[0:1]
-// CHECK: [0x00,0x0d,0xe6,0xbe]
+s_bcnt1_i32_b64 flat_scratch_lo, s[2:3]
+// CHECK: [0x02,0x0d,0xe6,0xbe]
 
-s_bcnt1_i32_b64 flat_scratch_hi, s[0:1]
-// CHECK: [0x00,0x0d,0xe7,0xbe]
+s_bcnt1_i32_b64 flat_scratch_hi, s[2:3]
+// CHECK: [0x02,0x0d,0xe7,0xbe]
 
-s_bcnt1_i32_b64 vcc_lo, s[0:1]
-// CHECK: [0x00,0x0d,0xea,0xbe]
+s_bcnt1_i32_b64 vcc_lo, s[2:3]
+// CHECK: [0x02,0x0d,0xea,0xbe]
 
-s_bcnt1_i32_b64 vcc_hi, s[0:1]
-// CHECK: [0x00,0x0d,0xeb,0xbe]
+s_bcnt1_i32_b64 vcc_hi, s[2:3]
+// CHECK: [0x02,0x0d,0xeb,0xbe]
 
-s_bcnt1_i32_b64 tba_lo, s[0:1]
-// CHECK: [0x00,0x0d,0xec,0xbe]
+s_bcnt1_i32_b64 tba_lo, s[2:3]
+// CHECK: [0x02,0x0d,0xec,0xbe]
 
-s_bcnt1_i32_b64 tba_hi, s[0:1]
-// CHECK: [0x00,0x0d,0xed,0xbe]
+s_bcnt1_i32_b64 tba_hi, s[2:3]
+// CHECK: [0x02,0x0d,0xed,0xbe]
 
-s_bcnt1_i32_b64 tma_lo, s[0:1]
-// CHECK: [0x00,0x0d,0xee,0xbe]
+s_bcnt1_i32_b64 tma_lo, s[2:3]
+// CHECK: [0x02,0x0d,0xee,0xbe]
 
-s_bcnt1_i32_b64 tma_hi, s[0:1]
-// CHECK: [0x00,0x0d,0xef,0xbe]
+s_bcnt1_i32_b64 tma_hi, s[2:3]
+// CHECK: [0x02,0x0d,0xef,0xbe]
 
-s_bcnt1_i32_b64 ttmp11, s[0:1]
-// CHECK: [0x00,0x0d,0xfb,0xbe]
+s_bcnt1_i32_b64 ttmp11, s[2:3]
+// CHECK: [0x02,0x0d,0xfb,0xbe]
 
-s_bcnt1_i32_b64 m0, s[0:1]
-// CHECK: [0x00,0x0d,0xfc,0xbe]
+s_bcnt1_i32_b64 m0, s[2:3]
+// CHECK: [0x02,0x0d,0xfc,0xbe]
 
-s_bcnt1_i32_b64 exec_lo, s[0:1]
-// CHECK: [0x00,0x0d,0xfe,0xbe]
+s_bcnt1_i32_b64 exec_lo, s[2:3]
+// CHECK: [0x02,0x0d,0xfe,0xbe]
 
-s_bcnt1_i32_b64 exec_hi, s[0:1]
-// CHECK: [0x00,0x0d,0xff,0xbe]
+s_bcnt1_i32_b64 exec_hi, s[2:3]
+// CHECK: [0x02,0x0d,0xff,0xbe]
 
-s_bcnt1_i32_b64 s0, s[2:3]
-// CHECK: [0x02,0x0d,0x80,0xbe]
+s_bcnt1_i32_b64 s5, s[4:5]
+// CHECK: [0x04,0x0d,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, s[100:101]
-// CHECK: [0x64,0x0d,0x80,0xbe]
+s_bcnt1_i32_b64 s5, s[100:101]
+// CHECK: [0x64,0x0d,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, flat_scratch
-// CHECK: [0x66,0x0d,0x80,0xbe]
+s_bcnt1_i32_b64 s5, flat_scratch
+// CHECK: [0x66,0x0d,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, vcc
-// CHECK: [0x6a,0x0d,0x80,0xbe]
+s_bcnt1_i32_b64 s5, vcc
+// CHECK: [0x6a,0x0d,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, tba
-// CHECK: [0x6c,0x0d,0x80,0xbe]
+s_bcnt1_i32_b64 s5, tba
+// CHECK: [0x6c,0x0d,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, tma
-// CHECK: [0x6e,0x0d,0x80,0xbe]
+s_bcnt1_i32_b64 s5, tma
+// CHECK: [0x6e,0x0d,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, ttmp[10:11]
-// CHECK: [0x7a,0x0d,0x80,0xbe]
+s_bcnt1_i32_b64 s5, ttmp[10:11]
+// CHECK: [0x7a,0x0d,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, exec
-// CHECK: [0x7e,0x0d,0x80,0xbe]
+s_bcnt1_i32_b64 s5, exec
+// CHECK: [0x7e,0x0d,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, 0
-// CHECK: [0x80,0x0d,0x80,0xbe]
+s_bcnt1_i32_b64 s5, 0
+// CHECK: [0x80,0x0d,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, -1
-// CHECK: [0xc1,0x0d,0x80,0xbe]
+s_bcnt1_i32_b64 s5, -1
+// CHECK: [0xc1,0x0d,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, 0.5
-// CHECK: [0xf0,0x0d,0x80,0xbe]
+s_bcnt1_i32_b64 s5, 0.5
+// CHECK: [0xf0,0x0d,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, -4.0
-// CHECK: [0xf7,0x0d,0x80,0xbe]
+s_bcnt1_i32_b64 s5, -4.0
+// CHECK: [0xf7,0x0d,0x85,0xbe]
 
-s_bcnt1_i32_b64 s0, 0xaf123456
-// CHECK: [0xff,0x0d,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_bcnt1_i32_b64 s5, 0xaf123456
+// CHECK: [0xff,0x0d,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_bcnt1_i32_b64 s0, 0x3f717273
-// CHECK: [0xff,0x0d,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_bcnt1_i32_b64 s5, 0x3f717273
+// CHECK: [0xff,0x0d,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_ff0_i32_b32 s0, s0
-// CHECK: [0x00,0x0e,0x80,0xbe]
+s_ff0_i32_b32 s5, s1
+// CHECK: [0x01,0x0e,0x85,0xbe]
 
-s_ff0_i32_b32 s101, s0
-// CHECK: [0x00,0x0e,0xe5,0xbe]
+s_ff0_i32_b32 s101, s1
+// CHECK: [0x01,0x0e,0xe5,0xbe]
 
-s_ff0_i32_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x0e,0xe6,0xbe]
+s_ff0_i32_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x0e,0xe6,0xbe]
 
-s_ff0_i32_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x0e,0xe7,0xbe]
+s_ff0_i32_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x0e,0xe7,0xbe]
 
-s_ff0_i32_b32 vcc_lo, s0
-// CHECK: [0x00,0x0e,0xea,0xbe]
+s_ff0_i32_b32 vcc_lo, s1
+// CHECK: [0x01,0x0e,0xea,0xbe]
 
-s_ff0_i32_b32 vcc_hi, s0
-// CHECK: [0x00,0x0e,0xeb,0xbe]
+s_ff0_i32_b32 vcc_hi, s1
+// CHECK: [0x01,0x0e,0xeb,0xbe]
 
-s_ff0_i32_b32 tba_lo, s0
-// CHECK: [0x00,0x0e,0xec,0xbe]
+s_ff0_i32_b32 tba_lo, s1
+// CHECK: [0x01,0x0e,0xec,0xbe]
 
-s_ff0_i32_b32 tba_hi, s0
-// CHECK: [0x00,0x0e,0xed,0xbe]
+s_ff0_i32_b32 tba_hi, s1
+// CHECK: [0x01,0x0e,0xed,0xbe]
 
-s_ff0_i32_b32 tma_lo, s0
-// CHECK: [0x00,0x0e,0xee,0xbe]
+s_ff0_i32_b32 tma_lo, s1
+// CHECK: [0x01,0x0e,0xee,0xbe]
 
-s_ff0_i32_b32 tma_hi, s0
-// CHECK: [0x00,0x0e,0xef,0xbe]
+s_ff0_i32_b32 tma_hi, s1
+// CHECK: [0x01,0x0e,0xef,0xbe]
 
-s_ff0_i32_b32 ttmp11, s0
-// CHECK: [0x00,0x0e,0xfb,0xbe]
+s_ff0_i32_b32 ttmp11, s1
+// CHECK: [0x01,0x0e,0xfb,0xbe]
 
-s_ff0_i32_b32 m0, s0
-// CHECK: [0x00,0x0e,0xfc,0xbe]
+s_ff0_i32_b32 m0, s1
+// CHECK: [0x01,0x0e,0xfc,0xbe]
 
-s_ff0_i32_b32 exec_lo, s0
-// CHECK: [0x00,0x0e,0xfe,0xbe]
+s_ff0_i32_b32 exec_lo, s1
+// CHECK: [0x01,0x0e,0xfe,0xbe]
 
-s_ff0_i32_b32 exec_hi, s0
-// CHECK: [0x00,0x0e,0xff,0xbe]
+s_ff0_i32_b32 exec_hi, s1
+// CHECK: [0x01,0x0e,0xff,0xbe]
 
-s_ff0_i32_b32 s0, s101
-// CHECK: [0x65,0x0e,0x80,0xbe]
+s_ff0_i32_b32 s5, s101
+// CHECK: [0x65,0x0e,0x85,0xbe]
 
-s_ff0_i32_b32 s0, flat_scratch_lo
-// CHECK: [0x66,0x0e,0x80,0xbe]
+s_ff0_i32_b32 s5, flat_scratch_lo
+// CHECK: [0x66,0x0e,0x85,0xbe]
 
-s_ff0_i32_b32 s0, flat_scratch_hi
-// CHECK: [0x67,0x0e,0x80,0xbe]
+s_ff0_i32_b32 s5, flat_scratch_hi
+// CHECK: [0x67,0x0e,0x85,0xbe]
 
-s_ff0_i32_b32 s0, vcc_lo
-// CHECK: [0x6a,0x0e,0x80,0xbe]
+s_ff0_i32_b32 s5, vcc_lo
+// CHECK: [0x6a,0x0e,0x85,0xbe]
 
-s_ff0_i32_b32 s0, vcc_hi
-// CHECK: [0x6b,0x0e,0x80,0xbe]
+s_ff0_i32_b32 s5, vcc_hi
+// CHECK: [0x6b,0x0e,0x85,0xbe]
 
-s_ff0_i32_b32 s0, tba_lo
-// CHECK: [0x6c,0x0e,0x80,0xbe]
+s_ff0_i32_b32 s5, tba_lo
+// CHECK: [0x6c,0x0e,0x85,0xbe]
 
-s_ff0_i32_b32 s0, tba_hi
-// CHECK: [0x6d,0x0e,0x80,0xbe]
+s_ff0_i32_b32 s5, tba_hi
+// CHECK: [0x6d,0x0e,0x85,0xbe]
 
-s_ff0_i32_b32 s0, tma_lo
-// CHECK: [0x6e,0x0e,0x80,0xbe]
+s_ff0_i32_b32 s5, tma_lo
+// CHECK: [0x6e,0x0e,0x85,0xbe]
 
-s_ff0_i32_b32 s0, tma_hi
-// CHECK: [0x6f,0x0e,0x80,0xbe]
+s_ff0_i32_b32 s5, tma_hi
+// CHECK: [0x6f,0x0e,0x85,0xbe]
 
-s_ff0_i32_b32 s0, ttmp11
-// CHECK: [0x7b,0x0e,0x80,0xbe]
+s_ff0_i32_b32 s5, ttmp11
+// CHECK: [0x7b,0x0e,0x85,0xbe]
 
-s_ff0_i32_b32 s0, m0
-// CHECK: [0x7c,0x0e,0x80,0xbe]
+s_ff0_i32_b32 s5, m0
+// CHECK: [0x7c,0x0e,0x85,0xbe]
 
-s_ff0_i32_b32 s0, exec_lo
-// CHECK: [0x7e,0x0e,0x80,0xbe]
+s_ff0_i32_b32 s5, exec_lo
+// CHECK: [0x7e,0x0e,0x85,0xbe]
 
-s_ff0_i32_b32 s0, exec_hi
-// CHECK: [0x7f,0x0e,0x80,0xbe]
+s_ff0_i32_b32 s5, exec_hi
+// CHECK: [0x7f,0x0e,0x85,0xbe]
 
-s_ff0_i32_b32 s0, 0
-// CHECK: [0x80,0x0e,0x80,0xbe]
+s_ff0_i32_b32 s5, 0
+// CHECK: [0x80,0x0e,0x85,0xbe]
 
-s_ff0_i32_b32 s0, -1
-// CHECK: [0xc1,0x0e,0x80,0xbe]
+s_ff0_i32_b32 s5, -1
+// CHECK: [0xc1,0x0e,0x85,0xbe]
 
-s_ff0_i32_b32 s0, 0.5
-// CHECK: [0xf0,0x0e,0x80,0xbe]
+s_ff0_i32_b32 s5, 0.5
+// CHECK: [0xf0,0x0e,0x85,0xbe]
 
-s_ff0_i32_b32 s0, -4.0
-// CHECK: [0xf7,0x0e,0x80,0xbe]
+s_ff0_i32_b32 s5, -4.0
+// CHECK: [0xf7,0x0e,0x85,0xbe]
 
-s_ff0_i32_b32 s0, 0xaf123456
-// CHECK: [0xff,0x0e,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_ff0_i32_b32 s5, 0xaf123456
+// CHECK: [0xff,0x0e,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_ff0_i32_b32 s0, 0x3f717273
-// CHECK: [0xff,0x0e,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_ff0_i32_b32 s5, 0x3f717273
+// CHECK: [0xff,0x0e,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_ff0_i32_b64 s0, s[0:1]
-// CHECK: [0x00,0x0f,0x80,0xbe]
+s_ff0_i32_b64 s5, s[2:3]
+// CHECK: [0x02,0x0f,0x85,0xbe]
 
-s_ff0_i32_b64 s101, s[0:1]
-// CHECK: [0x00,0x0f,0xe5,0xbe]
+s_ff0_i32_b64 s101, s[2:3]
+// CHECK: [0x02,0x0f,0xe5,0xbe]
 
-s_ff0_i32_b64 flat_scratch_lo, s[0:1]
-// CHECK: [0x00,0x0f,0xe6,0xbe]
+s_ff0_i32_b64 flat_scratch_lo, s[2:3]
+// CHECK: [0x02,0x0f,0xe6,0xbe]
 
-s_ff0_i32_b64 flat_scratch_hi, s[0:1]
-// CHECK: [0x00,0x0f,0xe7,0xbe]
+s_ff0_i32_b64 flat_scratch_hi, s[2:3]
+// CHECK: [0x02,0x0f,0xe7,0xbe]
 
-s_ff0_i32_b64 vcc_lo, s[0:1]
-// CHECK: [0x00,0x0f,0xea,0xbe]
+s_ff0_i32_b64 vcc_lo, s[2:3]
+// CHECK: [0x02,0x0f,0xea,0xbe]
 
-s_ff0_i32_b64 vcc_hi, s[0:1]
-// CHECK: [0x00,0x0f,0xeb,0xbe]
+s_ff0_i32_b64 vcc_hi, s[2:3]
+// CHECK: [0x02,0x0f,0xeb,0xbe]
 
-s_ff0_i32_b64 tba_lo, s[0:1]
-// CHECK: [0x00,0x0f,0xec,0xbe]
+s_ff0_i32_b64 tba_lo, s[2:3]
+// CHECK: [0x02,0x0f,0xec,0xbe]
 
-s_ff0_i32_b64 tba_hi, s[0:1]
-// CHECK: [0x00,0x0f,0xed,0xbe]
+s_ff0_i32_b64 tba_hi, s[2:3]
+// CHECK: [0x02,0x0f,0xed,0xbe]
 
-s_ff0_i32_b64 tma_lo, s[0:1]
-// CHECK: [0x00,0x0f,0xee,0xbe]
+s_ff0_i32_b64 tma_lo, s[2:3]
+// CHECK: [0x02,0x0f,0xee,0xbe]
 
-s_ff0_i32_b64 tma_hi, s[0:1]
-// CHECK: [0x00,0x0f,0xef,0xbe]
+s_ff0_i32_b64 tma_hi, s[2:3]
+// CHECK: [0x02,0x0f,0xef,0xbe]
 
-s_ff0_i32_b64 ttmp11, s[0:1]
-// CHECK: [0x00,0x0f,0xfb,0xbe]
+s_ff0_i32_b64 ttmp11, s[2:3]
+// CHECK: [0x02,0x0f,0xfb,0xbe]
 
-s_ff0_i32_b64 m0, s[0:1]
-// CHECK: [0x00,0x0f,0xfc,0xbe]
+s_ff0_i32_b64 m0, s[2:3]
+// CHECK: [0x02,0x0f,0xfc,0xbe]
 
-s_ff0_i32_b64 exec_lo, s[0:1]
-// CHECK: [0x00,0x0f,0xfe,0xbe]
+s_ff0_i32_b64 exec_lo, s[2:3]
+// CHECK: [0x02,0x0f,0xfe,0xbe]
 
-s_ff0_i32_b64 exec_hi, s[0:1]
-// CHECK: [0x00,0x0f,0xff,0xbe]
+s_ff0_i32_b64 exec_hi, s[2:3]
+// CHECK: [0x02,0x0f,0xff,0xbe]
 
-s_ff0_i32_b64 s0, s[2:3]
-// CHECK: [0x02,0x0f,0x80,0xbe]
+s_ff0_i32_b64 s5, s[4:5]
+// CHECK: [0x04,0x0f,0x85,0xbe]
 
-s_ff0_i32_b64 s0, s[100:101]
-// CHECK: [0x64,0x0f,0x80,0xbe]
+s_ff0_i32_b64 s5, s[100:101]
+// CHECK: [0x64,0x0f,0x85,0xbe]
 
-s_ff0_i32_b64 s0, flat_scratch
-// CHECK: [0x66,0x0f,0x80,0xbe]
+s_ff0_i32_b64 s5, flat_scratch
+// CHECK: [0x66,0x0f,0x85,0xbe]
 
-s_ff0_i32_b64 s0, vcc
-// CHECK: [0x6a,0x0f,0x80,0xbe]
+s_ff0_i32_b64 s5, vcc
+// CHECK: [0x6a,0x0f,0x85,0xbe]
 
-s_ff0_i32_b64 s0, tba
-// CHECK: [0x6c,0x0f,0x80,0xbe]
+s_ff0_i32_b64 s5, tba
+// CHECK: [0x6c,0x0f,0x85,0xbe]
 
-s_ff0_i32_b64 s0, tma
-// CHECK: [0x6e,0x0f,0x80,0xbe]
+s_ff0_i32_b64 s5, tma
+// CHECK: [0x6e,0x0f,0x85,0xbe]
 
-s_ff0_i32_b64 s0, ttmp[10:11]
-// CHECK: [0x7a,0x0f,0x80,0xbe]
+s_ff0_i32_b64 s5, ttmp[10:11]
+// CHECK: [0x7a,0x0f,0x85,0xbe]
 
-s_ff0_i32_b64 s0, exec
-// CHECK: [0x7e,0x0f,0x80,0xbe]
+s_ff0_i32_b64 s5, exec
+// CHECK: [0x7e,0x0f,0x85,0xbe]
 
-s_ff0_i32_b64 s0, 0
-// CHECK: [0x80,0x0f,0x80,0xbe]
+s_ff0_i32_b64 s5, 0
+// CHECK: [0x80,0x0f,0x85,0xbe]
 
-s_ff0_i32_b64 s0, -1
-// CHECK: [0xc1,0x0f,0x80,0xbe]
+s_ff0_i32_b64 s5, -1
+// CHECK: [0xc1,0x0f,0x85,0xbe]
 
-s_ff0_i32_b64 s0, 0.5
-// CHECK: [0xf0,0x0f,0x80,0xbe]
+s_ff0_i32_b64 s5, 0.5
+// CHECK: [0xf0,0x0f,0x85,0xbe]
 
-s_ff0_i32_b64 s0, -4.0
-// CHECK: [0xf7,0x0f,0x80,0xbe]
+s_ff0_i32_b64 s5, -4.0
+// CHECK: [0xf7,0x0f,0x85,0xbe]
 
-s_ff0_i32_b64 s0, 0xaf123456
-// CHECK: [0xff,0x0f,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_ff0_i32_b64 s5, 0xaf123456
+// CHECK: [0xff,0x0f,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_ff0_i32_b64 s0, 0x3f717273
-// CHECK: [0xff,0x0f,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_ff0_i32_b64 s5, 0x3f717273
+// CHECK: [0xff,0x0f,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_ff1_i32_b32 s0, s0
-// CHECK: [0x00,0x10,0x80,0xbe]
+s_ff1_i32_b32 s5, s1
+// CHECK: [0x01,0x10,0x85,0xbe]
 
-s_ff1_i32_b32 s101, s0
-// CHECK: [0x00,0x10,0xe5,0xbe]
+s_ff1_i32_b32 s101, s1
+// CHECK: [0x01,0x10,0xe5,0xbe]
 
-s_ff1_i32_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x10,0xe6,0xbe]
+s_ff1_i32_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x10,0xe6,0xbe]
 
-s_ff1_i32_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x10,0xe7,0xbe]
+s_ff1_i32_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x10,0xe7,0xbe]
 
-s_ff1_i32_b32 vcc_lo, s0
-// CHECK: [0x00,0x10,0xea,0xbe]
+s_ff1_i32_b32 vcc_lo, s1
+// CHECK: [0x01,0x10,0xea,0xbe]
 
-s_ff1_i32_b32 vcc_hi, s0
-// CHECK: [0x00,0x10,0xeb,0xbe]
+s_ff1_i32_b32 vcc_hi, s1
+// CHECK: [0x01,0x10,0xeb,0xbe]
 
-s_ff1_i32_b32 tba_lo, s0
-// CHECK: [0x00,0x10,0xec,0xbe]
+s_ff1_i32_b32 tba_lo, s1
+// CHECK: [0x01,0x10,0xec,0xbe]
 
-s_ff1_i32_b32 tba_hi, s0
-// CHECK: [0x00,0x10,0xed,0xbe]
+s_ff1_i32_b32 tba_hi, s1
+// CHECK: [0x01,0x10,0xed,0xbe]
 
-s_ff1_i32_b32 tma_lo, s0
-// CHECK: [0x00,0x10,0xee,0xbe]
+s_ff1_i32_b32 tma_lo, s1
+// CHECK: [0x01,0x10,0xee,0xbe]
 
-s_ff1_i32_b32 tma_hi, s0
-// CHECK: [0x00,0x10,0xef,0xbe]
+s_ff1_i32_b32 tma_hi, s1
+// CHECK: [0x01,0x10,0xef,0xbe]
 
-s_ff1_i32_b32 ttmp11, s0
-// CHECK: [0x00,0x10,0xfb,0xbe]
+s_ff1_i32_b32 ttmp11, s1
+// CHECK: [0x01,0x10,0xfb,0xbe]
 
-s_ff1_i32_b32 m0, s0
-// CHECK: [0x00,0x10,0xfc,0xbe]
+s_ff1_i32_b32 m0, s1
+// CHECK: [0x01,0x10,0xfc,0xbe]
 
-s_ff1_i32_b32 exec_lo, s0
-// CHECK: [0x00,0x10,0xfe,0xbe]
+s_ff1_i32_b32 exec_lo, s1
+// CHECK: [0x01,0x10,0xfe,0xbe]
 
-s_ff1_i32_b32 exec_hi, s0
-// CHECK: [0x00,0x10,0xff,0xbe]
+s_ff1_i32_b32 exec_hi, s1
+// CHECK: [0x01,0x10,0xff,0xbe]
 
-s_ff1_i32_b32 s0, s101
-// CHECK: [0x65,0x10,0x80,0xbe]
+s_ff1_i32_b32 s5, s101
+// CHECK: [0x65,0x10,0x85,0xbe]
 
-s_ff1_i32_b32 s0, flat_scratch_lo
-// CHECK: [0x66,0x10,0x80,0xbe]
+s_ff1_i32_b32 s5, flat_scratch_lo
+// CHECK: [0x66,0x10,0x85,0xbe]
 
-s_ff1_i32_b32 s0, flat_scratch_hi
-// CHECK: [0x67,0x10,0x80,0xbe]
+s_ff1_i32_b32 s5, flat_scratch_hi
+// CHECK: [0x67,0x10,0x85,0xbe]
 
-s_ff1_i32_b32 s0, vcc_lo
-// CHECK: [0x6a,0x10,0x80,0xbe]
+s_ff1_i32_b32 s5, vcc_lo
+// CHECK: [0x6a,0x10,0x85,0xbe]
 
-s_ff1_i32_b32 s0, vcc_hi
-// CHECK: [0x6b,0x10,0x80,0xbe]
+s_ff1_i32_b32 s5, vcc_hi
+// CHECK: [0x6b,0x10,0x85,0xbe]
 
-s_ff1_i32_b32 s0, tba_lo
-// CHECK: [0x6c,0x10,0x80,0xbe]
+s_ff1_i32_b32 s5, tba_lo
+// CHECK: [0x6c,0x10,0x85,0xbe]
 
-s_ff1_i32_b32 s0, tba_hi
-// CHECK: [0x6d,0x10,0x80,0xbe]
+s_ff1_i32_b32 s5, tba_hi
+// CHECK: [0x6d,0x10,0x85,0xbe]
 
-s_ff1_i32_b32 s0, tma_lo
-// CHECK: [0x6e,0x10,0x80,0xbe]
+s_ff1_i32_b32 s5, tma_lo
+// CHECK: [0x6e,0x10,0x85,0xbe]
 
-s_ff1_i32_b32 s0, tma_hi
-// CHECK: [0x6f,0x10,0x80,0xbe]
+s_ff1_i32_b32 s5, tma_hi
+// CHECK: [0x6f,0x10,0x85,0xbe]
 
-s_ff1_i32_b32 s0, ttmp11
-// CHECK: [0x7b,0x10,0x80,0xbe]
+s_ff1_i32_b32 s5, ttmp11
+// CHECK: [0x7b,0x10,0x85,0xbe]
 
-s_ff1_i32_b32 s0, m0
-// CHECK: [0x7c,0x10,0x80,0xbe]
+s_ff1_i32_b32 s5, m0
+// CHECK: [0x7c,0x10,0x85,0xbe]
 
-s_ff1_i32_b32 s0, exec_lo
-// CHECK: [0x7e,0x10,0x80,0xbe]
+s_ff1_i32_b32 s5, exec_lo
+// CHECK: [0x7e,0x10,0x85,0xbe]
 
-s_ff1_i32_b32 s0, exec_hi
-// CHECK: [0x7f,0x10,0x80,0xbe]
+s_ff1_i32_b32 s5, exec_hi
+// CHECK: [0x7f,0x10,0x85,0xbe]
 
-s_ff1_i32_b32 s0, 0
-// CHECK: [0x80,0x10,0x80,0xbe]
+s_ff1_i32_b32 s5, 0
+// CHECK: [0x80,0x10,0x85,0xbe]
 
-s_ff1_i32_b32 s0, -1
-// CHECK: [0xc1,0x10,0x80,0xbe]
+s_ff1_i32_b32 s5, -1
+// CHECK: [0xc1,0x10,0x85,0xbe]
 
-s_ff1_i32_b32 s0, 0.5
-// CHECK: [0xf0,0x10,0x80,0xbe]
+s_ff1_i32_b32 s5, 0.5
+// CHECK: [0xf0,0x10,0x85,0xbe]
 
-s_ff1_i32_b32 s0, -4.0
-// CHECK: [0xf7,0x10,0x80,0xbe]
+s_ff1_i32_b32 s5, -4.0
+// CHECK: [0xf7,0x10,0x85,0xbe]
 
-s_ff1_i32_b32 s0, 0xaf123456
-// CHECK: [0xff,0x10,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_ff1_i32_b32 s5, 0xaf123456
+// CHECK: [0xff,0x10,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_ff1_i32_b32 s0, 0x3f717273
-// CHECK: [0xff,0x10,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_ff1_i32_b32 s5, 0x3f717273
+// CHECK: [0xff,0x10,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_ff1_i32_b64 s0, s[0:1]
-// CHECK: [0x00,0x11,0x80,0xbe]
+s_ff1_i32_b64 s5, s[2:3]
+// CHECK: [0x02,0x11,0x85,0xbe]
 
-s_ff1_i32_b64 s101, s[0:1]
-// CHECK: [0x00,0x11,0xe5,0xbe]
+s_ff1_i32_b64 s101, s[2:3]
+// CHECK: [0x02,0x11,0xe5,0xbe]
 
-s_ff1_i32_b64 flat_scratch_lo, s[0:1]
-// CHECK: [0x00,0x11,0xe6,0xbe]
+s_ff1_i32_b64 flat_scratch_lo, s[2:3]
+// CHECK: [0x02,0x11,0xe6,0xbe]
 
-s_ff1_i32_b64 flat_scratch_hi, s[0:1]
-// CHECK: [0x00,0x11,0xe7,0xbe]
+s_ff1_i32_b64 flat_scratch_hi, s[2:3]
+// CHECK: [0x02,0x11,0xe7,0xbe]
 
-s_ff1_i32_b64 vcc_lo, s[0:1]
-// CHECK: [0x00,0x11,0xea,0xbe]
+s_ff1_i32_b64 vcc_lo, s[2:3]
+// CHECK: [0x02,0x11,0xea,0xbe]
 
-s_ff1_i32_b64 vcc_hi, s[0:1]
-// CHECK: [0x00,0x11,0xeb,0xbe]
+s_ff1_i32_b64 vcc_hi, s[2:3]
+// CHECK: [0x02,0x11,0xeb,0xbe]
 
-s_ff1_i32_b64 tba_lo, s[0:1]
-// CHECK: [0x00,0x11,0xec,0xbe]
+s_ff1_i32_b64 tba_lo, s[2:3]
+// CHECK: [0x02,0x11,0xec,0xbe]
 
-s_ff1_i32_b64 tba_hi, s[0:1]
-// CHECK: [0x00,0x11,0xed,0xbe]
+s_ff1_i32_b64 tba_hi, s[2:3]
+// CHECK: [0x02,0x11,0xed,0xbe]
 
-s_ff1_i32_b64 tma_lo, s[0:1]
-// CHECK: [0x00,0x11,0xee,0xbe]
+s_ff1_i32_b64 tma_lo, s[2:3]
+// CHECK: [0x02,0x11,0xee,0xbe]
 
-s_ff1_i32_b64 tma_hi, s[0:1]
-// CHECK: [0x00,0x11,0xef,0xbe]
+s_ff1_i32_b64 tma_hi, s[2:3]
+// CHECK: [0x02,0x11,0xef,0xbe]
 
-s_ff1_i32_b64 ttmp11, s[0:1]
-// CHECK: [0x00,0x11,0xfb,0xbe]
+s_ff1_i32_b64 ttmp11, s[2:3]
+// CHECK: [0x02,0x11,0xfb,0xbe]
 
-s_ff1_i32_b64 m0, s[0:1]
-// CHECK: [0x00,0x11,0xfc,0xbe]
+s_ff1_i32_b64 m0, s[2:3]
+// CHECK: [0x02,0x11,0xfc,0xbe]
 
-s_ff1_i32_b64 exec_lo, s[0:1]
-// CHECK: [0x00,0x11,0xfe,0xbe]
+s_ff1_i32_b64 exec_lo, s[2:3]
+// CHECK: [0x02,0x11,0xfe,0xbe]
 
-s_ff1_i32_b64 exec_hi, s[0:1]
-// CHECK: [0x00,0x11,0xff,0xbe]
+s_ff1_i32_b64 exec_hi, s[2:3]
+// CHECK: [0x02,0x11,0xff,0xbe]
 
-s_ff1_i32_b64 s0, s[2:3]
-// CHECK: [0x02,0x11,0x80,0xbe]
+s_ff1_i32_b64 s5, s[4:5]
+// CHECK: [0x04,0x11,0x85,0xbe]
 
-s_ff1_i32_b64 s0, s[100:101]
-// CHECK: [0x64,0x11,0x80,0xbe]
+s_ff1_i32_b64 s5, s[100:101]
+// CHECK: [0x64,0x11,0x85,0xbe]
 
-s_ff1_i32_b64 s0, flat_scratch
-// CHECK: [0x66,0x11,0x80,0xbe]
+s_ff1_i32_b64 s5, flat_scratch
+// CHECK: [0x66,0x11,0x85,0xbe]
 
-s_ff1_i32_b64 s0, vcc
-// CHECK: [0x6a,0x11,0x80,0xbe]
+s_ff1_i32_b64 s5, vcc
+// CHECK: [0x6a,0x11,0x85,0xbe]
 
-s_ff1_i32_b64 s0, tba
-// CHECK: [0x6c,0x11,0x80,0xbe]
+s_ff1_i32_b64 s5, tba
+// CHECK: [0x6c,0x11,0x85,0xbe]
 
-s_ff1_i32_b64 s0, tma
-// CHECK: [0x6e,0x11,0x80,0xbe]
+s_ff1_i32_b64 s5, tma
+// CHECK: [0x6e,0x11,0x85,0xbe]
 
-s_ff1_i32_b64 s0, ttmp[10:11]
-// CHECK: [0x7a,0x11,0x80,0xbe]
+s_ff1_i32_b64 s5, ttmp[10:11]
+// CHECK: [0x7a,0x11,0x85,0xbe]
 
-s_ff1_i32_b64 s0, exec
-// CHECK: [0x7e,0x11,0x80,0xbe]
+s_ff1_i32_b64 s5, exec
+// CHECK: [0x7e,0x11,0x85,0xbe]
 
-s_ff1_i32_b64 s0, 0
-// CHECK: [0x80,0x11,0x80,0xbe]
+s_ff1_i32_b64 s5, 0
+// CHECK: [0x80,0x11,0x85,0xbe]
 
-s_ff1_i32_b64 s0, -1
-// CHECK: [0xc1,0x11,0x80,0xbe]
+s_ff1_i32_b64 s5, -1
+// CHECK: [0xc1,0x11,0x85,0xbe]
 
-s_ff1_i32_b64 s0, 0.5
-// CHECK: [0xf0,0x11,0x80,0xbe]
+s_ff1_i32_b64 s5, 0.5
+// CHECK: [0xf0,0x11,0x85,0xbe]
 
-s_ff1_i32_b64 s0, -4.0
-// CHECK: [0xf7,0x11,0x80,0xbe]
+s_ff1_i32_b64 s5, -4.0
+// CHECK: [0xf7,0x11,0x85,0xbe]
 
-s_ff1_i32_b64 s0, 0xaf123456
-// CHECK: [0xff,0x11,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_ff1_i32_b64 s5, 0xaf123456
+// CHECK: [0xff,0x11,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_ff1_i32_b64 s0, 0x3f717273
-// CHECK: [0xff,0x11,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_ff1_i32_b64 s5, 0x3f717273
+// CHECK: [0xff,0x11,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_flbit_i32_b32 s0, s0
-// CHECK: [0x00,0x12,0x80,0xbe]
+s_flbit_i32_b32 s5, s1
+// CHECK: [0x01,0x12,0x85,0xbe]
 
-s_flbit_i32_b32 s101, s0
-// CHECK: [0x00,0x12,0xe5,0xbe]
+s_flbit_i32_b32 s101, s1
+// CHECK: [0x01,0x12,0xe5,0xbe]
 
-s_flbit_i32_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x12,0xe6,0xbe]
+s_flbit_i32_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x12,0xe6,0xbe]
 
-s_flbit_i32_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x12,0xe7,0xbe]
+s_flbit_i32_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x12,0xe7,0xbe]
 
-s_flbit_i32_b32 vcc_lo, s0
-// CHECK: [0x00,0x12,0xea,0xbe]
+s_flbit_i32_b32 vcc_lo, s1
+// CHECK: [0x01,0x12,0xea,0xbe]
 
-s_flbit_i32_b32 vcc_hi, s0
-// CHECK: [0x00,0x12,0xeb,0xbe]
+s_flbit_i32_b32 vcc_hi, s1
+// CHECK: [0x01,0x12,0xeb,0xbe]
 
-s_flbit_i32_b32 tba_lo, s0
-// CHECK: [0x00,0x12,0xec,0xbe]
+s_flbit_i32_b32 tba_lo, s1
+// CHECK: [0x01,0x12,0xec,0xbe]
 
-s_flbit_i32_b32 tba_hi, s0
-// CHECK: [0x00,0x12,0xed,0xbe]
+s_flbit_i32_b32 tba_hi, s1
+// CHECK: [0x01,0x12,0xed,0xbe]
 
-s_flbit_i32_b32 tma_lo, s0
-// CHECK: [0x00,0x12,0xee,0xbe]
+s_flbit_i32_b32 tma_lo, s1
+// CHECK: [0x01,0x12,0xee,0xbe]
 
-s_flbit_i32_b32 tma_hi, s0
-// CHECK: [0x00,0x12,0xef,0xbe]
+s_flbit_i32_b32 tma_hi, s1
+// CHECK: [0x01,0x12,0xef,0xbe]
 
-s_flbit_i32_b32 ttmp11, s0
-// CHECK: [0x00,0x12,0xfb,0xbe]
+s_flbit_i32_b32 ttmp11, s1
+// CHECK: [0x01,0x12,0xfb,0xbe]
 
-s_flbit_i32_b32 m0, s0
-// CHECK: [0x00,0x12,0xfc,0xbe]
+s_flbit_i32_b32 m0, s1
+// CHECK: [0x01,0x12,0xfc,0xbe]
 
-s_flbit_i32_b32 exec_lo, s0
-// CHECK: [0x00,0x12,0xfe,0xbe]
+s_flbit_i32_b32 exec_lo, s1
+// CHECK: [0x01,0x12,0xfe,0xbe]
 
-s_flbit_i32_b32 exec_hi, s0
-// CHECK: [0x00,0x12,0xff,0xbe]
+s_flbit_i32_b32 exec_hi, s1
+// CHECK: [0x01,0x12,0xff,0xbe]
 
-s_flbit_i32_b32 s0, s101
-// CHECK: [0x65,0x12,0x80,0xbe]
+s_flbit_i32_b32 s5, s101
+// CHECK: [0x65,0x12,0x85,0xbe]
 
-s_flbit_i32_b32 s0, flat_scratch_lo
-// CHECK: [0x66,0x12,0x80,0xbe]
+s_flbit_i32_b32 s5, flat_scratch_lo
+// CHECK: [0x66,0x12,0x85,0xbe]
 
-s_flbit_i32_b32 s0, flat_scratch_hi
-// CHECK: [0x67,0x12,0x80,0xbe]
+s_flbit_i32_b32 s5, flat_scratch_hi
+// CHECK: [0x67,0x12,0x85,0xbe]
 
-s_flbit_i32_b32 s0, vcc_lo
-// CHECK: [0x6a,0x12,0x80,0xbe]
+s_flbit_i32_b32 s5, vcc_lo
+// CHECK: [0x6a,0x12,0x85,0xbe]
 
-s_flbit_i32_b32 s0, vcc_hi
-// CHECK: [0x6b,0x12,0x80,0xbe]
+s_flbit_i32_b32 s5, vcc_hi
+// CHECK: [0x6b,0x12,0x85,0xbe]
 
-s_flbit_i32_b32 s0, tba_lo
-// CHECK: [0x6c,0x12,0x80,0xbe]
+s_flbit_i32_b32 s5, tba_lo
+// CHECK: [0x6c,0x12,0x85,0xbe]
 
-s_flbit_i32_b32 s0, tba_hi
-// CHECK: [0x6d,0x12,0x80,0xbe]
+s_flbit_i32_b32 s5, tba_hi
+// CHECK: [0x6d,0x12,0x85,0xbe]
 
-s_flbit_i32_b32 s0, tma_lo
-// CHECK: [0x6e,0x12,0x80,0xbe]
+s_flbit_i32_b32 s5, tma_lo
+// CHECK: [0x6e,0x12,0x85,0xbe]
 
-s_flbit_i32_b32 s0, tma_hi
-// CHECK: [0x6f,0x12,0x80,0xbe]
+s_flbit_i32_b32 s5, tma_hi
+// CHECK: [0x6f,0x12,0x85,0xbe]
 
-s_flbit_i32_b32 s0, ttmp11
-// CHECK: [0x7b,0x12,0x80,0xbe]
+s_flbit_i32_b32 s5, ttmp11
+// CHECK: [0x7b,0x12,0x85,0xbe]
 
-s_flbit_i32_b32 s0, m0
-// CHECK: [0x7c,0x12,0x80,0xbe]
+s_flbit_i32_b32 s5, m0
+// CHECK: [0x7c,0x12,0x85,0xbe]
 
-s_flbit_i32_b32 s0, exec_lo
-// CHECK: [0x7e,0x12,0x80,0xbe]
+s_flbit_i32_b32 s5, exec_lo
+// CHECK: [0x7e,0x12,0x85,0xbe]
 
-s_flbit_i32_b32 s0, exec_hi
-// CHECK: [0x7f,0x12,0x80,0xbe]
+s_flbit_i32_b32 s5, exec_hi
+// CHECK: [0x7f,0x12,0x85,0xbe]
 
-s_flbit_i32_b32 s0, 0
-// CHECK: [0x80,0x12,0x80,0xbe]
+s_flbit_i32_b32 s5, 0
+// CHECK: [0x80,0x12,0x85,0xbe]
 
-s_flbit_i32_b32 s0, -1
-// CHECK: [0xc1,0x12,0x80,0xbe]
+s_flbit_i32_b32 s5, -1
+// CHECK: [0xc1,0x12,0x85,0xbe]
 
-s_flbit_i32_b32 s0, 0.5
-// CHECK: [0xf0,0x12,0x80,0xbe]
+s_flbit_i32_b32 s5, 0.5
+// CHECK: [0xf0,0x12,0x85,0xbe]
 
-s_flbit_i32_b32 s0, -4.0
-// CHECK: [0xf7,0x12,0x80,0xbe]
+s_flbit_i32_b32 s5, -4.0
+// CHECK: [0xf7,0x12,0x85,0xbe]
 
-s_flbit_i32_b32 s0, 0xaf123456
-// CHECK: [0xff,0x12,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_flbit_i32_b32 s5, 0xaf123456
+// CHECK: [0xff,0x12,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_flbit_i32_b32 s0, 0x3f717273
-// CHECK: [0xff,0x12,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_flbit_i32_b32 s5, 0x3f717273
+// CHECK: [0xff,0x12,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_flbit_i32_b64 s0, s[0:1]
-// CHECK: [0x00,0x13,0x80,0xbe]
+s_flbit_i32_b64 s5, s[2:3]
+// CHECK: [0x02,0x13,0x85,0xbe]
 
-s_flbit_i32_b64 s101, s[0:1]
-// CHECK: [0x00,0x13,0xe5,0xbe]
+s_flbit_i32_b64 s101, s[2:3]
+// CHECK: [0x02,0x13,0xe5,0xbe]
 
-s_flbit_i32_b64 flat_scratch_lo, s[0:1]
-// CHECK: [0x00,0x13,0xe6,0xbe]
+s_flbit_i32_b64 flat_scratch_lo, s[2:3]
+// CHECK: [0x02,0x13,0xe6,0xbe]
 
-s_flbit_i32_b64 flat_scratch_hi, s[0:1]
-// CHECK: [0x00,0x13,0xe7,0xbe]
+s_flbit_i32_b64 flat_scratch_hi, s[2:3]
+// CHECK: [0x02,0x13,0xe7,0xbe]
 
-s_flbit_i32_b64 vcc_lo, s[0:1]
-// CHECK: [0x00,0x13,0xea,0xbe]
+s_flbit_i32_b64 vcc_lo, s[2:3]
+// CHECK: [0x02,0x13,0xea,0xbe]
 
-s_flbit_i32_b64 vcc_hi, s[0:1]
-// CHECK: [0x00,0x13,0xeb,0xbe]
+s_flbit_i32_b64 vcc_hi, s[2:3]
+// CHECK: [0x02,0x13,0xeb,0xbe]
 
-s_flbit_i32_b64 tba_lo, s[0:1]
-// CHECK: [0x00,0x13,0xec,0xbe]
+s_flbit_i32_b64 tba_lo, s[2:3]
+// CHECK: [0x02,0x13,0xec,0xbe]
 
-s_flbit_i32_b64 tba_hi, s[0:1]
-// CHECK: [0x00,0x13,0xed,0xbe]
+s_flbit_i32_b64 tba_hi, s[2:3]
+// CHECK: [0x02,0x13,0xed,0xbe]
 
-s_flbit_i32_b64 tma_lo, s[0:1]
-// CHECK: [0x00,0x13,0xee,0xbe]
+s_flbit_i32_b64 tma_lo, s[2:3]
+// CHECK: [0x02,0x13,0xee,0xbe]
 
-s_flbit_i32_b64 tma_hi, s[0:1]
-// CHECK: [0x00,0x13,0xef,0xbe]
+s_flbit_i32_b64 tma_hi, s[2:3]
+// CHECK: [0x02,0x13,0xef,0xbe]
 
-s_flbit_i32_b64 ttmp11, s[0:1]
-// CHECK: [0x00,0x13,0xfb,0xbe]
+s_flbit_i32_b64 ttmp11, s[2:3]
+// CHECK: [0x02,0x13,0xfb,0xbe]
 
-s_flbit_i32_b64 m0, s[0:1]
-// CHECK: [0x00,0x13,0xfc,0xbe]
+s_flbit_i32_b64 m0, s[2:3]
+// CHECK: [0x02,0x13,0xfc,0xbe]
 
-s_flbit_i32_b64 exec_lo, s[0:1]
-// CHECK: [0x00,0x13,0xfe,0xbe]
+s_flbit_i32_b64 exec_lo, s[2:3]
+// CHECK: [0x02,0x13,0xfe,0xbe]
 
-s_flbit_i32_b64 exec_hi, s[0:1]
-// CHECK: [0x00,0x13,0xff,0xbe]
+s_flbit_i32_b64 exec_hi, s[2:3]
+// CHECK: [0x02,0x13,0xff,0xbe]
 
-s_flbit_i32_b64 s0, s[2:3]
-// CHECK: [0x02,0x13,0x80,0xbe]
+s_flbit_i32_b64 s5, s[4:5]
+// CHECK: [0x04,0x13,0x85,0xbe]
 
-s_flbit_i32_b64 s0, s[100:101]
-// CHECK: [0x64,0x13,0x80,0xbe]
+s_flbit_i32_b64 s5, s[100:101]
+// CHECK: [0x64,0x13,0x85,0xbe]
 
-s_flbit_i32_b64 s0, flat_scratch
-// CHECK: [0x66,0x13,0x80,0xbe]
+s_flbit_i32_b64 s5, flat_scratch
+// CHECK: [0x66,0x13,0x85,0xbe]
 
-s_flbit_i32_b64 s0, vcc
-// CHECK: [0x6a,0x13,0x80,0xbe]
+s_flbit_i32_b64 s5, vcc
+// CHECK: [0x6a,0x13,0x85,0xbe]
 
-s_flbit_i32_b64 s0, tba
-// CHECK: [0x6c,0x13,0x80,0xbe]
+s_flbit_i32_b64 s5, tba
+// CHECK: [0x6c,0x13,0x85,0xbe]
 
-s_flbit_i32_b64 s0, tma
-// CHECK: [0x6e,0x13,0x80,0xbe]
+s_flbit_i32_b64 s5, tma
+// CHECK: [0x6e,0x13,0x85,0xbe]
 
-s_flbit_i32_b64 s0, ttmp[10:11]
-// CHECK: [0x7a,0x13,0x80,0xbe]
+s_flbit_i32_b64 s5, ttmp[10:11]
+// CHECK: [0x7a,0x13,0x85,0xbe]
 
-s_flbit_i32_b64 s0, exec
-// CHECK: [0x7e,0x13,0x80,0xbe]
+s_flbit_i32_b64 s5, exec
+// CHECK: [0x7e,0x13,0x85,0xbe]
 
-s_flbit_i32_b64 s0, 0
-// CHECK: [0x80,0x13,0x80,0xbe]
+s_flbit_i32_b64 s5, 0
+// CHECK: [0x80,0x13,0x85,0xbe]
 
-s_flbit_i32_b64 s0, -1
-// CHECK: [0xc1,0x13,0x80,0xbe]
+s_flbit_i32_b64 s5, -1
+// CHECK: [0xc1,0x13,0x85,0xbe]
 
-s_flbit_i32_b64 s0, 0.5
-// CHECK: [0xf0,0x13,0x80,0xbe]
+s_flbit_i32_b64 s5, 0.5
+// CHECK: [0xf0,0x13,0x85,0xbe]
 
-s_flbit_i32_b64 s0, -4.0
-// CHECK: [0xf7,0x13,0x80,0xbe]
+s_flbit_i32_b64 s5, -4.0
+// CHECK: [0xf7,0x13,0x85,0xbe]
 
-s_flbit_i32_b64 s0, 0xaf123456
-// CHECK: [0xff,0x13,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_flbit_i32_b64 s5, 0xaf123456
+// CHECK: [0xff,0x13,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_flbit_i32_b64 s0, 0x3f717273
-// CHECK: [0xff,0x13,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_flbit_i32_b64 s5, 0x3f717273
+// CHECK: [0xff,0x13,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_flbit_i32 s0, s0
-// CHECK: [0x00,0x14,0x80,0xbe]
+s_flbit_i32 s5, s1
+// CHECK: [0x01,0x14,0x85,0xbe]
 
-s_flbit_i32 s101, s0
-// CHECK: [0x00,0x14,0xe5,0xbe]
+s_flbit_i32 s101, s1
+// CHECK: [0x01,0x14,0xe5,0xbe]
 
-s_flbit_i32 flat_scratch_lo, s0
-// CHECK: [0x00,0x14,0xe6,0xbe]
+s_flbit_i32 flat_scratch_lo, s1
+// CHECK: [0x01,0x14,0xe6,0xbe]
 
-s_flbit_i32 flat_scratch_hi, s0
-// CHECK: [0x00,0x14,0xe7,0xbe]
+s_flbit_i32 flat_scratch_hi, s1
+// CHECK: [0x01,0x14,0xe7,0xbe]
 
-s_flbit_i32 vcc_lo, s0
-// CHECK: [0x00,0x14,0xea,0xbe]
+s_flbit_i32 vcc_lo, s1
+// CHECK: [0x01,0x14,0xea,0xbe]
 
-s_flbit_i32 vcc_hi, s0
-// CHECK: [0x00,0x14,0xeb,0xbe]
+s_flbit_i32 vcc_hi, s1
+// CHECK: [0x01,0x14,0xeb,0xbe]
 
-s_flbit_i32 tba_lo, s0
-// CHECK: [0x00,0x14,0xec,0xbe]
+s_flbit_i32 tba_lo, s1
+// CHECK: [0x01,0x14,0xec,0xbe]
 
-s_flbit_i32 tba_hi, s0
-// CHECK: [0x00,0x14,0xed,0xbe]
+s_flbit_i32 tba_hi, s1
+// CHECK: [0x01,0x14,0xed,0xbe]
 
-s_flbit_i32 tma_lo, s0
-// CHECK: [0x00,0x14,0xee,0xbe]
+s_flbit_i32 tma_lo, s1
+// CHECK: [0x01,0x14,0xee,0xbe]
 
-s_flbit_i32 tma_hi, s0
-// CHECK: [0x00,0x14,0xef,0xbe]
+s_flbit_i32 tma_hi, s1
+// CHECK: [0x01,0x14,0xef,0xbe]
 
-s_flbit_i32 ttmp11, s0
-// CHECK: [0x00,0x14,0xfb,0xbe]
+s_flbit_i32 ttmp11, s1
+// CHECK: [0x01,0x14,0xfb,0xbe]
 
-s_flbit_i32 m0, s0
-// CHECK: [0x00,0x14,0xfc,0xbe]
+s_flbit_i32 m0, s1
+// CHECK: [0x01,0x14,0xfc,0xbe]
 
-s_flbit_i32 exec_lo, s0
-// CHECK: [0x00,0x14,0xfe,0xbe]
+s_flbit_i32 exec_lo, s1
+// CHECK: [0x01,0x14,0xfe,0xbe]
 
-s_flbit_i32 exec_hi, s0
-// CHECK: [0x00,0x14,0xff,0xbe]
+s_flbit_i32 exec_hi, s1
+// CHECK: [0x01,0x14,0xff,0xbe]
 
-s_flbit_i32 s0, s101
-// CHECK: [0x65,0x14,0x80,0xbe]
+s_flbit_i32 s5, s101
+// CHECK: [0x65,0x14,0x85,0xbe]
 
-s_flbit_i32 s0, flat_scratch_lo
-// CHECK: [0x66,0x14,0x80,0xbe]
+s_flbit_i32 s5, flat_scratch_lo
+// CHECK: [0x66,0x14,0x85,0xbe]
 
-s_flbit_i32 s0, flat_scratch_hi
-// CHECK: [0x67,0x14,0x80,0xbe]
+s_flbit_i32 s5, flat_scratch_hi
+// CHECK: [0x67,0x14,0x85,0xbe]
 
-s_flbit_i32 s0, vcc_lo
-// CHECK: [0x6a,0x14,0x80,0xbe]
+s_flbit_i32 s5, vcc_lo
+// CHECK: [0x6a,0x14,0x85,0xbe]
 
-s_flbit_i32 s0, vcc_hi
-// CHECK: [0x6b,0x14,0x80,0xbe]
+s_flbit_i32 s5, vcc_hi
+// CHECK: [0x6b,0x14,0x85,0xbe]
 
-s_flbit_i32 s0, tba_lo
-// CHECK: [0x6c,0x14,0x80,0xbe]
+s_flbit_i32 s5, tba_lo
+// CHECK: [0x6c,0x14,0x85,0xbe]
 
-s_flbit_i32 s0, tba_hi
-// CHECK: [0x6d,0x14,0x80,0xbe]
+s_flbit_i32 s5, tba_hi
+// CHECK: [0x6d,0x14,0x85,0xbe]
 
-s_flbit_i32 s0, tma_lo
-// CHECK: [0x6e,0x14,0x80,0xbe]
+s_flbit_i32 s5, tma_lo
+// CHECK: [0x6e,0x14,0x85,0xbe]
 
-s_flbit_i32 s0, tma_hi
-// CHECK: [0x6f,0x14,0x80,0xbe]
+s_flbit_i32 s5, tma_hi
+// CHECK: [0x6f,0x14,0x85,0xbe]
 
-s_flbit_i32 s0, ttmp11
-// CHECK: [0x7b,0x14,0x80,0xbe]
+s_flbit_i32 s5, ttmp11
+// CHECK: [0x7b,0x14,0x85,0xbe]
 
-s_flbit_i32 s0, m0
-// CHECK: [0x7c,0x14,0x80,0xbe]
+s_flbit_i32 s5, m0
+// CHECK: [0x7c,0x14,0x85,0xbe]
 
-s_flbit_i32 s0, exec_lo
-// CHECK: [0x7e,0x14,0x80,0xbe]
+s_flbit_i32 s5, exec_lo
+// CHECK: [0x7e,0x14,0x85,0xbe]
 
-s_flbit_i32 s0, exec_hi
-// CHECK: [0x7f,0x14,0x80,0xbe]
+s_flbit_i32 s5, exec_hi
+// CHECK: [0x7f,0x14,0x85,0xbe]
 
-s_flbit_i32 s0, 0
-// CHECK: [0x80,0x14,0x80,0xbe]
+s_flbit_i32 s5, 0
+// CHECK: [0x80,0x14,0x85,0xbe]
 
-s_flbit_i32 s0, -1
-// CHECK: [0xc1,0x14,0x80,0xbe]
+s_flbit_i32 s5, -1
+// CHECK: [0xc1,0x14,0x85,0xbe]
 
-s_flbit_i32 s0, 0.5
-// CHECK: [0xf0,0x14,0x80,0xbe]
+s_flbit_i32 s5, 0.5
+// CHECK: [0xf0,0x14,0x85,0xbe]
 
-s_flbit_i32 s0, -4.0
-// CHECK: [0xf7,0x14,0x80,0xbe]
+s_flbit_i32 s5, -4.0
+// CHECK: [0xf7,0x14,0x85,0xbe]
 
-s_flbit_i32 s0, 0xaf123456
-// CHECK: [0xff,0x14,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_flbit_i32 s5, 0xaf123456
+// CHECK: [0xff,0x14,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_flbit_i32 s0, 0x3f717273
-// CHECK: [0xff,0x14,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_flbit_i32 s5, 0x3f717273
+// CHECK: [0xff,0x14,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_flbit_i32_i64 s0, s[0:1]
-// CHECK: [0x00,0x15,0x80,0xbe]
+s_flbit_i32_i64 s5, s[2:3]
+// CHECK: [0x02,0x15,0x85,0xbe]
 
-s_flbit_i32_i64 s101, s[0:1]
-// CHECK: [0x00,0x15,0xe5,0xbe]
+s_flbit_i32_i64 s101, s[2:3]
+// CHECK: [0x02,0x15,0xe5,0xbe]
 
-s_flbit_i32_i64 flat_scratch_lo, s[0:1]
-// CHECK: [0x00,0x15,0xe6,0xbe]
+s_flbit_i32_i64 flat_scratch_lo, s[2:3]
+// CHECK: [0x02,0x15,0xe6,0xbe]
 
-s_flbit_i32_i64 flat_scratch_hi, s[0:1]
-// CHECK: [0x00,0x15,0xe7,0xbe]
+s_flbit_i32_i64 flat_scratch_hi, s[2:3]
+// CHECK: [0x02,0x15,0xe7,0xbe]
 
-s_flbit_i32_i64 vcc_lo, s[0:1]
-// CHECK: [0x00,0x15,0xea,0xbe]
+s_flbit_i32_i64 vcc_lo, s[2:3]
+// CHECK: [0x02,0x15,0xea,0xbe]
 
-s_flbit_i32_i64 vcc_hi, s[0:1]
-// CHECK: [0x00,0x15,0xeb,0xbe]
+s_flbit_i32_i64 vcc_hi, s[2:3]
+// CHECK: [0x02,0x15,0xeb,0xbe]
 
-s_flbit_i32_i64 tba_lo, s[0:1]
-// CHECK: [0x00,0x15,0xec,0xbe]
+s_flbit_i32_i64 tba_lo, s[2:3]
+// CHECK: [0x02,0x15,0xec,0xbe]
 
-s_flbit_i32_i64 tba_hi, s[0:1]
-// CHECK: [0x00,0x15,0xed,0xbe]
+s_flbit_i32_i64 tba_hi, s[2:3]
+// CHECK: [0x02,0x15,0xed,0xbe]
 
-s_flbit_i32_i64 tma_lo, s[0:1]
-// CHECK: [0x00,0x15,0xee,0xbe]
+s_flbit_i32_i64 tma_lo, s[2:3]
+// CHECK: [0x02,0x15,0xee,0xbe]
 
-s_flbit_i32_i64 tma_hi, s[0:1]
-// CHECK: [0x00,0x15,0xef,0xbe]
+s_flbit_i32_i64 tma_hi, s[2:3]
+// CHECK: [0x02,0x15,0xef,0xbe]
 
-s_flbit_i32_i64 ttmp11, s[0:1]
-// CHECK: [0x00,0x15,0xfb,0xbe]
+s_flbit_i32_i64 ttmp11, s[2:3]
+// CHECK: [0x02,0x15,0xfb,0xbe]
 
-s_flbit_i32_i64 m0, s[0:1]
-// CHECK: [0x00,0x15,0xfc,0xbe]
+s_flbit_i32_i64 m0, s[2:3]
+// CHECK: [0x02,0x15,0xfc,0xbe]
 
-s_flbit_i32_i64 exec_lo, s[0:1]
-// CHECK: [0x00,0x15,0xfe,0xbe]
+s_flbit_i32_i64 exec_lo, s[2:3]
+// CHECK: [0x02,0x15,0xfe,0xbe]
 
-s_flbit_i32_i64 exec_hi, s[0:1]
-// CHECK: [0x00,0x15,0xff,0xbe]
+s_flbit_i32_i64 exec_hi, s[2:3]
+// CHECK: [0x02,0x15,0xff,0xbe]
 
-s_flbit_i32_i64 s0, s[2:3]
-// CHECK: [0x02,0x15,0x80,0xbe]
+s_flbit_i32_i64 s5, s[4:5]
+// CHECK: [0x04,0x15,0x85,0xbe]
 
-s_flbit_i32_i64 s0, s[100:101]
-// CHECK: [0x64,0x15,0x80,0xbe]
+s_flbit_i32_i64 s5, s[100:101]
+// CHECK: [0x64,0x15,0x85,0xbe]
 
-s_flbit_i32_i64 s0, flat_scratch
-// CHECK: [0x66,0x15,0x80,0xbe]
+s_flbit_i32_i64 s5, flat_scratch
+// CHECK: [0x66,0x15,0x85,0xbe]
 
-s_flbit_i32_i64 s0, vcc
-// CHECK: [0x6a,0x15,0x80,0xbe]
+s_flbit_i32_i64 s5, vcc
+// CHECK: [0x6a,0x15,0x85,0xbe]
 
-s_flbit_i32_i64 s0, tba
-// CHECK: [0x6c,0x15,0x80,0xbe]
+s_flbit_i32_i64 s5, tba
+// CHECK: [0x6c,0x15,0x85,0xbe]
 
-s_flbit_i32_i64 s0, tma
-// CHECK: [0x6e,0x15,0x80,0xbe]
+s_flbit_i32_i64 s5, tma
+// CHECK: [0x6e,0x15,0x85,0xbe]
 
-s_flbit_i32_i64 s0, ttmp[10:11]
-// CHECK: [0x7a,0x15,0x80,0xbe]
+s_flbit_i32_i64 s5, ttmp[10:11]
+// CHECK: [0x7a,0x15,0x85,0xbe]
 
-s_flbit_i32_i64 s0, exec
-// CHECK: [0x7e,0x15,0x80,0xbe]
+s_flbit_i32_i64 s5, exec
+// CHECK: [0x7e,0x15,0x85,0xbe]
 
-s_flbit_i32_i64 s0, 0
-// CHECK: [0x80,0x15,0x80,0xbe]
+s_flbit_i32_i64 s5, 0
+// CHECK: [0x80,0x15,0x85,0xbe]
 
-s_flbit_i32_i64 s0, -1
-// CHECK: [0xc1,0x15,0x80,0xbe]
+s_flbit_i32_i64 s5, -1
+// CHECK: [0xc1,0x15,0x85,0xbe]
 
-s_flbit_i32_i64 s0, 0.5
-// CHECK: [0xf0,0x15,0x80,0xbe]
+s_flbit_i32_i64 s5, 0.5
+// CHECK: [0xf0,0x15,0x85,0xbe]
 
-s_flbit_i32_i64 s0, -4.0
-// CHECK: [0xf7,0x15,0x80,0xbe]
+s_flbit_i32_i64 s5, -4.0
+// CHECK: [0xf7,0x15,0x85,0xbe]
 
-s_flbit_i32_i64 s0, 0xaf123456
-// CHECK: [0xff,0x15,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_flbit_i32_i64 s5, 0xaf123456
+// CHECK: [0xff,0x15,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_flbit_i32_i64 s0, 0x3f717273
-// CHECK: [0xff,0x15,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_flbit_i32_i64 s5, 0x3f717273
+// CHECK: [0xff,0x15,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_sext_i32_i8 s0, s0
-// CHECK: [0x00,0x16,0x80,0xbe]
+s_sext_i32_i8 s5, s1
+// CHECK: [0x01,0x16,0x85,0xbe]
 
-s_sext_i32_i8 s101, s0
-// CHECK: [0x00,0x16,0xe5,0xbe]
+s_sext_i32_i8 s101, s1
+// CHECK: [0x01,0x16,0xe5,0xbe]
 
-s_sext_i32_i8 flat_scratch_lo, s0
-// CHECK: [0x00,0x16,0xe6,0xbe]
+s_sext_i32_i8 flat_scratch_lo, s1
+// CHECK: [0x01,0x16,0xe6,0xbe]
 
-s_sext_i32_i8 flat_scratch_hi, s0
-// CHECK: [0x00,0x16,0xe7,0xbe]
+s_sext_i32_i8 flat_scratch_hi, s1
+// CHECK: [0x01,0x16,0xe7,0xbe]
 
-s_sext_i32_i8 vcc_lo, s0
-// CHECK: [0x00,0x16,0xea,0xbe]
+s_sext_i32_i8 vcc_lo, s1
+// CHECK: [0x01,0x16,0xea,0xbe]
 
-s_sext_i32_i8 vcc_hi, s0
-// CHECK: [0x00,0x16,0xeb,0xbe]
+s_sext_i32_i8 vcc_hi, s1
+// CHECK: [0x01,0x16,0xeb,0xbe]
 
-s_sext_i32_i8 tba_lo, s0
-// CHECK: [0x00,0x16,0xec,0xbe]
+s_sext_i32_i8 tba_lo, s1
+// CHECK: [0x01,0x16,0xec,0xbe]
 
-s_sext_i32_i8 tba_hi, s0
-// CHECK: [0x00,0x16,0xed,0xbe]
+s_sext_i32_i8 tba_hi, s1
+// CHECK: [0x01,0x16,0xed,0xbe]
 
-s_sext_i32_i8 tma_lo, s0
-// CHECK: [0x00,0x16,0xee,0xbe]
+s_sext_i32_i8 tma_lo, s1
+// CHECK: [0x01,0x16,0xee,0xbe]
 
-s_sext_i32_i8 tma_hi, s0
-// CHECK: [0x00,0x16,0xef,0xbe]
+s_sext_i32_i8 tma_hi, s1
+// CHECK: [0x01,0x16,0xef,0xbe]
 
-s_sext_i32_i8 ttmp11, s0
-// CHECK: [0x00,0x16,0xfb,0xbe]
+s_sext_i32_i8 ttmp11, s1
+// CHECK: [0x01,0x16,0xfb,0xbe]
 
-s_sext_i32_i8 m0, s0
-// CHECK: [0x00,0x16,0xfc,0xbe]
+s_sext_i32_i8 m0, s1
+// CHECK: [0x01,0x16,0xfc,0xbe]
 
-s_sext_i32_i8 exec_lo, s0
-// CHECK: [0x00,0x16,0xfe,0xbe]
+s_sext_i32_i8 exec_lo, s1
+// CHECK: [0x01,0x16,0xfe,0xbe]
 
-s_sext_i32_i8 exec_hi, s0
-// CHECK: [0x00,0x16,0xff,0xbe]
+s_sext_i32_i8 exec_hi, s1
+// CHECK: [0x01,0x16,0xff,0xbe]
 
-s_sext_i32_i8 s0, s101
-// CHECK: [0x65,0x16,0x80,0xbe]
+s_sext_i32_i8 s5, s101
+// CHECK: [0x65,0x16,0x85,0xbe]
 
-s_sext_i32_i8 s0, flat_scratch_lo
-// CHECK: [0x66,0x16,0x80,0xbe]
+s_sext_i32_i8 s5, flat_scratch_lo
+// CHECK: [0x66,0x16,0x85,0xbe]
 
-s_sext_i32_i8 s0, flat_scratch_hi
-// CHECK: [0x67,0x16,0x80,0xbe]
+s_sext_i32_i8 s5, flat_scratch_hi
+// CHECK: [0x67,0x16,0x85,0xbe]
 
-s_sext_i32_i8 s0, vcc_lo
-// CHECK: [0x6a,0x16,0x80,0xbe]
+s_sext_i32_i8 s5, vcc_lo
+// CHECK: [0x6a,0x16,0x85,0xbe]
 
-s_sext_i32_i8 s0, vcc_hi
-// CHECK: [0x6b,0x16,0x80,0xbe]
+s_sext_i32_i8 s5, vcc_hi
+// CHECK: [0x6b,0x16,0x85,0xbe]
 
-s_sext_i32_i8 s0, tba_lo
-// CHECK: [0x6c,0x16,0x80,0xbe]
+s_sext_i32_i8 s5, tba_lo
+// CHECK: [0x6c,0x16,0x85,0xbe]
 
-s_sext_i32_i8 s0, tba_hi
-// CHECK: [0x6d,0x16,0x80,0xbe]
+s_sext_i32_i8 s5, tba_hi
+// CHECK: [0x6d,0x16,0x85,0xbe]
 
-s_sext_i32_i8 s0, tma_lo
-// CHECK: [0x6e,0x16,0x80,0xbe]
+s_sext_i32_i8 s5, tma_lo
+// CHECK: [0x6e,0x16,0x85,0xbe]
 
-s_sext_i32_i8 s0, tma_hi
-// CHECK: [0x6f,0x16,0x80,0xbe]
+s_sext_i32_i8 s5, tma_hi
+// CHECK: [0x6f,0x16,0x85,0xbe]
 
-s_sext_i32_i8 s0, ttmp11
-// CHECK: [0x7b,0x16,0x80,0xbe]
+s_sext_i32_i8 s5, ttmp11
+// CHECK: [0x7b,0x16,0x85,0xbe]
 
-s_sext_i32_i8 s0, m0
-// CHECK: [0x7c,0x16,0x80,0xbe]
+s_sext_i32_i8 s5, m0
+// CHECK: [0x7c,0x16,0x85,0xbe]
 
-s_sext_i32_i8 s0, exec_lo
-// CHECK: [0x7e,0x16,0x80,0xbe]
+s_sext_i32_i8 s5, exec_lo
+// CHECK: [0x7e,0x16,0x85,0xbe]
 
-s_sext_i32_i8 s0, exec_hi
-// CHECK: [0x7f,0x16,0x80,0xbe]
+s_sext_i32_i8 s5, exec_hi
+// CHECK: [0x7f,0x16,0x85,0xbe]
 
-s_sext_i32_i8 s0, 0
-// CHECK: [0x80,0x16,0x80,0xbe]
+s_sext_i32_i8 s5, 0
+// CHECK: [0x80,0x16,0x85,0xbe]
 
-s_sext_i32_i8 s0, -1
-// CHECK: [0xc1,0x16,0x80,0xbe]
+s_sext_i32_i8 s5, -1
+// CHECK: [0xc1,0x16,0x85,0xbe]
 
-s_sext_i32_i8 s0, 0.5
-// CHECK: [0xf0,0x16,0x80,0xbe]
+s_sext_i32_i8 s5, 0.5
+// CHECK: [0xf0,0x16,0x85,0xbe]
 
-s_sext_i32_i8 s0, -4.0
-// CHECK: [0xf7,0x16,0x80,0xbe]
+s_sext_i32_i8 s5, -4.0
+// CHECK: [0xf7,0x16,0x85,0xbe]
 
-s_sext_i32_i8 s0, 0x71
-// CHECK: [0xff,0x16,0x80,0xbe,0x71,0x00,0x00,0x00]
+s_sext_i32_i8 s5, 0x71
+// CHECK: [0xff,0x16,0x85,0xbe,0x71,0x00,0x00,0x00]
 
-s_sext_i32_i8 s0, 0xf0
-// CHECK: [0xff,0x16,0x80,0xbe,0xf0,0x00,0x00,0x00]
+s_sext_i32_i8 s5, 0xf0
+// CHECK: [0xff,0x16,0x85,0xbe,0xf0,0x00,0x00,0x00]
 
-s_sext_i32_i16 s0, s0
-// CHECK: [0x00,0x17,0x80,0xbe]
+s_sext_i32_i16 s5, s1
+// CHECK: [0x01,0x17,0x85,0xbe]
 
-s_sext_i32_i16 s101, s0
-// CHECK: [0x00,0x17,0xe5,0xbe]
+s_sext_i32_i16 s101, s1
+// CHECK: [0x01,0x17,0xe5,0xbe]
 
-s_sext_i32_i16 flat_scratch_lo, s0
-// CHECK: [0x00,0x17,0xe6,0xbe]
+s_sext_i32_i16 flat_scratch_lo, s1
+// CHECK: [0x01,0x17,0xe6,0xbe]
 
-s_sext_i32_i16 flat_scratch_hi, s0
-// CHECK: [0x00,0x17,0xe7,0xbe]
+s_sext_i32_i16 flat_scratch_hi, s1
+// CHECK: [0x01,0x17,0xe7,0xbe]
 
-s_sext_i32_i16 vcc_lo, s0
-// CHECK: [0x00,0x17,0xea,0xbe]
+s_sext_i32_i16 vcc_lo, s1
+// CHECK: [0x01,0x17,0xea,0xbe]
 
-s_sext_i32_i16 vcc_hi, s0
-// CHECK: [0x00,0x17,0xeb,0xbe]
+s_sext_i32_i16 vcc_hi, s1
+// CHECK: [0x01,0x17,0xeb,0xbe]
 
-s_sext_i32_i16 tba_lo, s0
-// CHECK: [0x00,0x17,0xec,0xbe]
+s_sext_i32_i16 tba_lo, s1
+// CHECK: [0x01,0x17,0xec,0xbe]
 
-s_sext_i32_i16 tba_hi, s0
-// CHECK: [0x00,0x17,0xed,0xbe]
+s_sext_i32_i16 tba_hi, s1
+// CHECK: [0x01,0x17,0xed,0xbe]
 
-s_sext_i32_i16 tma_lo, s0
-// CHECK: [0x00,0x17,0xee,0xbe]
+s_sext_i32_i16 tma_lo, s1
+// CHECK: [0x01,0x17,0xee,0xbe]
 
-s_sext_i32_i16 tma_hi, s0
-// CHECK: [0x00,0x17,0xef,0xbe]
+s_sext_i32_i16 tma_hi, s1
+// CHECK: [0x01,0x17,0xef,0xbe]
 
-s_sext_i32_i16 ttmp11, s0
-// CHECK: [0x00,0x17,0xfb,0xbe]
+s_sext_i32_i16 ttmp11, s1
+// CHECK: [0x01,0x17,0xfb,0xbe]
 
-s_sext_i32_i16 m0, s0
-// CHECK: [0x00,0x17,0xfc,0xbe]
+s_sext_i32_i16 m0, s1
+// CHECK: [0x01,0x17,0xfc,0xbe]
 
-s_sext_i32_i16 exec_lo, s0
-// CHECK: [0x00,0x17,0xfe,0xbe]
+s_sext_i32_i16 exec_lo, s1
+// CHECK: [0x01,0x17,0xfe,0xbe]
 
-s_sext_i32_i16 exec_hi, s0
-// CHECK: [0x00,0x17,0xff,0xbe]
+s_sext_i32_i16 exec_hi, s1
+// CHECK: [0x01,0x17,0xff,0xbe]
 
-s_sext_i32_i16 s0, s101
-// CHECK: [0x65,0x17,0x80,0xbe]
+s_sext_i32_i16 s5, s101
+// CHECK: [0x65,0x17,0x85,0xbe]
 
-s_sext_i32_i16 s0, flat_scratch_lo
-// CHECK: [0x66,0x17,0x80,0xbe]
+s_sext_i32_i16 s5, flat_scratch_lo
+// CHECK: [0x66,0x17,0x85,0xbe]
 
-s_sext_i32_i16 s0, flat_scratch_hi
-// CHECK: [0x67,0x17,0x80,0xbe]
+s_sext_i32_i16 s5, flat_scratch_hi
+// CHECK: [0x67,0x17,0x85,0xbe]
 
-s_sext_i32_i16 s0, vcc_lo
-// CHECK: [0x6a,0x17,0x80,0xbe]
+s_sext_i32_i16 s5, vcc_lo
+// CHECK: [0x6a,0x17,0x85,0xbe]
 
-s_sext_i32_i16 s0, vcc_hi
-// CHECK: [0x6b,0x17,0x80,0xbe]
+s_sext_i32_i16 s5, vcc_hi
+// CHECK: [0x6b,0x17,0x85,0xbe]
 
-s_sext_i32_i16 s0, tba_lo
-// CHECK: [0x6c,0x17,0x80,0xbe]
+s_sext_i32_i16 s5, tba_lo
+// CHECK: [0x6c,0x17,0x85,0xbe]
 
-s_sext_i32_i16 s0, tba_hi
-// CHECK: [0x6d,0x17,0x80,0xbe]
+s_sext_i32_i16 s5, tba_hi
+// CHECK: [0x6d,0x17,0x85,0xbe]
 
-s_sext_i32_i16 s0, tma_lo
-// CHECK: [0x6e,0x17,0x80,0xbe]
+s_sext_i32_i16 s5, tma_lo
+// CHECK: [0x6e,0x17,0x85,0xbe]
 
-s_sext_i32_i16 s0, tma_hi
-// CHECK: [0x6f,0x17,0x80,0xbe]
+s_sext_i32_i16 s5, tma_hi
+// CHECK: [0x6f,0x17,0x85,0xbe]
 
-s_sext_i32_i16 s0, ttmp11
-// CHECK: [0x7b,0x17,0x80,0xbe]
+s_sext_i32_i16 s5, ttmp11
+// CHECK: [0x7b,0x17,0x85,0xbe]
 
-s_sext_i32_i16 s0, m0
-// CHECK: [0x7c,0x17,0x80,0xbe]
+s_sext_i32_i16 s5, m0
+// CHECK: [0x7c,0x17,0x85,0xbe]
 
-s_sext_i32_i16 s0, exec_lo
-// CHECK: [0x7e,0x17,0x80,0xbe]
+s_sext_i32_i16 s5, exec_lo
+// CHECK: [0x7e,0x17,0x85,0xbe]
 
-s_sext_i32_i16 s0, exec_hi
-// CHECK: [0x7f,0x17,0x80,0xbe]
+s_sext_i32_i16 s5, exec_hi
+// CHECK: [0x7f,0x17,0x85,0xbe]
 
-s_sext_i32_i16 s0, 0
-// CHECK: [0x80,0x17,0x80,0xbe]
+s_sext_i32_i16 s5, 0
+// CHECK: [0x80,0x17,0x85,0xbe]
 
-s_sext_i32_i16 s0, -1
-// CHECK: [0xc1,0x17,0x80,0xbe]
+s_sext_i32_i16 s5, -1
+// CHECK: [0xc1,0x17,0x85,0xbe]
 
-s_sext_i32_i16 s0, 0.5
-// CHECK: [0xf0,0x17,0x80,0xbe]
+s_sext_i32_i16 s5, 0.5
+// CHECK: [0xf0,0x17,0x85,0xbe]
 
-s_sext_i32_i16 s0, -4.0
-// CHECK: [0xf7,0x17,0x80,0xbe]
+s_sext_i32_i16 s5, -4.0
+// CHECK: [0xf7,0x17,0x85,0xbe]
 
-s_sext_i32_i16 s0, 0xaf123456
-// CHECK: [0xff,0x17,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_sext_i32_i16 s5, 0xaf123456
+// CHECK: [0xff,0x17,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_sext_i32_i16 s0, 0x3f717273
-// CHECK: [0xff,0x17,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_sext_i32_i16 s5, 0x3f717273
+// CHECK: [0xff,0x17,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_bitset0_b32 s0, s0
-// CHECK: [0x00,0x18,0x80,0xbe]
+s_bitset0_b32 s5, s1
+// CHECK: [0x01,0x18,0x85,0xbe]
 
-s_bitset0_b32 s101, s0
-// CHECK: [0x00,0x18,0xe5,0xbe]
+s_bitset0_b32 s101, s1
+// CHECK: [0x01,0x18,0xe5,0xbe]
 
-s_bitset0_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x18,0xe6,0xbe]
+s_bitset0_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x18,0xe6,0xbe]
 
-s_bitset0_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x18,0xe7,0xbe]
+s_bitset0_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x18,0xe7,0xbe]
 
-s_bitset0_b32 vcc_lo, s0
-// CHECK: [0x00,0x18,0xea,0xbe]
+s_bitset0_b32 vcc_lo, s1
+// CHECK: [0x01,0x18,0xea,0xbe]
 
-s_bitset0_b32 vcc_hi, s0
-// CHECK: [0x00,0x18,0xeb,0xbe]
+s_bitset0_b32 vcc_hi, s1
+// CHECK: [0x01,0x18,0xeb,0xbe]
 
-s_bitset0_b32 tba_lo, s0
-// CHECK: [0x00,0x18,0xec,0xbe]
+s_bitset0_b32 tba_lo, s1
+// CHECK: [0x01,0x18,0xec,0xbe]
 
-s_bitset0_b32 tba_hi, s0
-// CHECK: [0x00,0x18,0xed,0xbe]
+s_bitset0_b32 tba_hi, s1
+// CHECK: [0x01,0x18,0xed,0xbe]
 
-s_bitset0_b32 tma_lo, s0
-// CHECK: [0x00,0x18,0xee,0xbe]
+s_bitset0_b32 tma_lo, s1
+// CHECK: [0x01,0x18,0xee,0xbe]
 
-s_bitset0_b32 tma_hi, s0
-// CHECK: [0x00,0x18,0xef,0xbe]
+s_bitset0_b32 tma_hi, s1
+// CHECK: [0x01,0x18,0xef,0xbe]
 
-s_bitset0_b32 ttmp11, s0
-// CHECK: [0x00,0x18,0xfb,0xbe]
+s_bitset0_b32 ttmp11, s1
+// CHECK: [0x01,0x18,0xfb,0xbe]
 
-s_bitset0_b32 m0, s0
-// CHECK: [0x00,0x18,0xfc,0xbe]
+s_bitset0_b32 m0, s1
+// CHECK: [0x01,0x18,0xfc,0xbe]
 
-s_bitset0_b32 exec_lo, s0
-// CHECK: [0x00,0x18,0xfe,0xbe]
+s_bitset0_b32 exec_lo, s1
+// CHECK: [0x01,0x18,0xfe,0xbe]
 
-s_bitset0_b32 exec_hi, s0
-// CHECK: [0x00,0x18,0xff,0xbe]
+s_bitset0_b32 exec_hi, s1
+// CHECK: [0x01,0x18,0xff,0xbe]
 
-s_bitset0_b32 s0, s101
-// CHECK: [0x65,0x18,0x80,0xbe]
+s_bitset0_b32 s5, s101
+// CHECK: [0x65,0x18,0x85,0xbe]
 
-s_bitset0_b32 s0, flat_scratch_lo
-// CHECK: [0x66,0x18,0x80,0xbe]
+s_bitset0_b32 s5, flat_scratch_lo
+// CHECK: [0x66,0x18,0x85,0xbe]
 
-s_bitset0_b32 s0, flat_scratch_hi
-// CHECK: [0x67,0x18,0x80,0xbe]
+s_bitset0_b32 s5, flat_scratch_hi
+// CHECK: [0x67,0x18,0x85,0xbe]
 
-s_bitset0_b32 s0, vcc_lo
-// CHECK: [0x6a,0x18,0x80,0xbe]
+s_bitset0_b32 s5, vcc_lo
+// CHECK: [0x6a,0x18,0x85,0xbe]
 
-s_bitset0_b32 s0, vcc_hi
-// CHECK: [0x6b,0x18,0x80,0xbe]
+s_bitset0_b32 s5, vcc_hi
+// CHECK: [0x6b,0x18,0x85,0xbe]
 
-s_bitset0_b32 s0, tba_lo
-// CHECK: [0x6c,0x18,0x80,0xbe]
+s_bitset0_b32 s5, tba_lo
+// CHECK: [0x6c,0x18,0x85,0xbe]
 
-s_bitset0_b32 s0, tba_hi
-// CHECK: [0x6d,0x18,0x80,0xbe]
+s_bitset0_b32 s5, tba_hi
+// CHECK: [0x6d,0x18,0x85,0xbe]
 
-s_bitset0_b32 s0, tma_lo
-// CHECK: [0x6e,0x18,0x80,0xbe]
+s_bitset0_b32 s5, tma_lo
+// CHECK: [0x6e,0x18,0x85,0xbe]
 
-s_bitset0_b32 s0, tma_hi
-// CHECK: [0x6f,0x18,0x80,0xbe]
+s_bitset0_b32 s5, tma_hi
+// CHECK: [0x6f,0x18,0x85,0xbe]
 
-s_bitset0_b32 s0, ttmp11
-// CHECK: [0x7b,0x18,0x80,0xbe]
+s_bitset0_b32 s5, ttmp11
+// CHECK: [0x7b,0x18,0x85,0xbe]
 
-s_bitset0_b32 s0, m0
-// CHECK: [0x7c,0x18,0x80,0xbe]
+s_bitset0_b32 s5, m0
+// CHECK: [0x7c,0x18,0x85,0xbe]
 
-s_bitset0_b32 s0, exec_lo
-// CHECK: [0x7e,0x18,0x80,0xbe]
+s_bitset0_b32 s5, exec_lo
+// CHECK: [0x7e,0x18,0x85,0xbe]
 
-s_bitset0_b32 s0, exec_hi
-// CHECK: [0x7f,0x18,0x80,0xbe]
+s_bitset0_b32 s5, exec_hi
+// CHECK: [0x7f,0x18,0x85,0xbe]
 
-s_bitset0_b32 s0, 0
-// CHECK: [0x80,0x18,0x80,0xbe]
+s_bitset0_b32 s5, 0
+// CHECK: [0x80,0x18,0x85,0xbe]
 
-s_bitset0_b32 s0, -1
-// CHECK: [0xc1,0x18,0x80,0xbe]
+s_bitset0_b32 s5, -1
+// CHECK: [0xc1,0x18,0x85,0xbe]
 
-s_bitset0_b32 s0, 0.5
-// CHECK: [0xf0,0x18,0x80,0xbe]
+s_bitset0_b32 s5, 0.5
+// CHECK: [0xf0,0x18,0x85,0xbe]
 
-s_bitset0_b32 s0, -4.0
-// CHECK: [0xf7,0x18,0x80,0xbe]
+s_bitset0_b32 s5, -4.0
+// CHECK: [0xf7,0x18,0x85,0xbe]
 
-s_bitset0_b32 s0, 0xaf123456
-// CHECK: [0xff,0x18,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_bitset0_b32 s5, 0xaf123456
+// CHECK: [0xff,0x18,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_bitset0_b32 s0, 0x3f717273
-// CHECK: [0xff,0x18,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_bitset0_b32 s5, 0x3f717273
+// CHECK: [0xff,0x18,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_bitset0_b64 s[0:1], s0
-// CHECK: [0x00,0x19,0x80,0xbe]
+s_bitset0_b64 s[10:11], s1
+// CHECK: [0x01,0x19,0x8a,0xbe]
 
-s_bitset0_b64 s[2:3], s0
-// CHECK: [0x00,0x19,0x82,0xbe]
+s_bitset0_b64 s[12:13], s1
+// CHECK: [0x01,0x19,0x8c,0xbe]
 
-s_bitset0_b64 s[100:101], s0
-// CHECK: [0x00,0x19,0xe4,0xbe]
+s_bitset0_b64 s[100:101], s1
+// CHECK: [0x01,0x19,0xe4,0xbe]
 
-s_bitset0_b64 flat_scratch, s0
-// CHECK: [0x00,0x19,0xe6,0xbe]
+s_bitset0_b64 flat_scratch, s1
+// CHECK: [0x01,0x19,0xe6,0xbe]
 
-s_bitset0_b64 vcc, s0
-// CHECK: [0x00,0x19,0xea,0xbe]
+s_bitset0_b64 vcc, s1
+// CHECK: [0x01,0x19,0xea,0xbe]
 
-s_bitset0_b64 tba, s0
-// CHECK: [0x00,0x19,0xec,0xbe]
+s_bitset0_b64 tba, s1
+// CHECK: [0x01,0x19,0xec,0xbe]
 
-s_bitset0_b64 tma, s0
-// CHECK: [0x00,0x19,0xee,0xbe]
+s_bitset0_b64 tma, s1
+// CHECK: [0x01,0x19,0xee,0xbe]
 
-s_bitset0_b64 ttmp[10:11], s0
-// CHECK: [0x00,0x19,0xfa,0xbe]
+s_bitset0_b64 ttmp[10:11], s1
+// CHECK: [0x01,0x19,0xfa,0xbe]
 
-s_bitset0_b64 exec, s0
-// CHECK: [0x00,0x19,0xfe,0xbe]
+s_bitset0_b64 exec, s1
+// CHECK: [0x01,0x19,0xfe,0xbe]
 
-s_bitset0_b64 s[0:1], s101
-// CHECK: [0x65,0x19,0x80,0xbe]
+s_bitset0_b64 s[10:11], s101
+// CHECK: [0x65,0x19,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], flat_scratch_lo
-// CHECK: [0x66,0x19,0x80,0xbe]
+s_bitset0_b64 s[10:11], flat_scratch_lo
+// CHECK: [0x66,0x19,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], flat_scratch_hi
-// CHECK: [0x67,0x19,0x80,0xbe]
+s_bitset0_b64 s[10:11], flat_scratch_hi
+// CHECK: [0x67,0x19,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], vcc_lo
-// CHECK: [0x6a,0x19,0x80,0xbe]
+s_bitset0_b64 s[10:11], vcc_lo
+// CHECK: [0x6a,0x19,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], vcc_hi
-// CHECK: [0x6b,0x19,0x80,0xbe]
+s_bitset0_b64 s[10:11], vcc_hi
+// CHECK: [0x6b,0x19,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], tba_lo
-// CHECK: [0x6c,0x19,0x80,0xbe]
+s_bitset0_b64 s[10:11], tba_lo
+// CHECK: [0x6c,0x19,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], tba_hi
-// CHECK: [0x6d,0x19,0x80,0xbe]
+s_bitset0_b64 s[10:11], tba_hi
+// CHECK: [0x6d,0x19,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], tma_lo
-// CHECK: [0x6e,0x19,0x80,0xbe]
+s_bitset0_b64 s[10:11], tma_lo
+// CHECK: [0x6e,0x19,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], tma_hi
-// CHECK: [0x6f,0x19,0x80,0xbe]
+s_bitset0_b64 s[10:11], tma_hi
+// CHECK: [0x6f,0x19,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], ttmp11
-// CHECK: [0x7b,0x19,0x80,0xbe]
+s_bitset0_b64 s[10:11], ttmp11
+// CHECK: [0x7b,0x19,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], m0
-// CHECK: [0x7c,0x19,0x80,0xbe]
+s_bitset0_b64 s[10:11], m0
+// CHECK: [0x7c,0x19,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], exec_lo
-// CHECK: [0x7e,0x19,0x80,0xbe]
+s_bitset0_b64 s[10:11], exec_lo
+// CHECK: [0x7e,0x19,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], exec_hi
-// CHECK: [0x7f,0x19,0x80,0xbe]
+s_bitset0_b64 s[10:11], exec_hi
+// CHECK: [0x7f,0x19,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], 0
-// CHECK: [0x80,0x19,0x80,0xbe]
+s_bitset0_b64 s[10:11], 0
+// CHECK: [0x80,0x19,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], -1
-// CHECK: [0xc1,0x19,0x80,0xbe]
+s_bitset0_b64 s[10:11], -1
+// CHECK: [0xc1,0x19,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x19,0x80,0xbe]
+s_bitset0_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x19,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x19,0x80,0xbe]
+s_bitset0_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x19,0x8a,0xbe]
 
-s_bitset0_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x19,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_bitset0_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x19,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_bitset0_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x19,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_bitset0_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x19,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_bitset1_b32 s0, s0
-// CHECK: [0x00,0x1a,0x80,0xbe]
+s_bitset1_b32 s5, s1
+// CHECK: [0x01,0x1a,0x85,0xbe]
 
-s_bitset1_b32 s101, s0
-// CHECK: [0x00,0x1a,0xe5,0xbe]
+s_bitset1_b32 s101, s1
+// CHECK: [0x01,0x1a,0xe5,0xbe]
 
-s_bitset1_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x1a,0xe6,0xbe]
+s_bitset1_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x1a,0xe6,0xbe]
 
-s_bitset1_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x1a,0xe7,0xbe]
+s_bitset1_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x1a,0xe7,0xbe]
 
-s_bitset1_b32 vcc_lo, s0
-// CHECK: [0x00,0x1a,0xea,0xbe]
+s_bitset1_b32 vcc_lo, s1
+// CHECK: [0x01,0x1a,0xea,0xbe]
 
-s_bitset1_b32 vcc_hi, s0
-// CHECK: [0x00,0x1a,0xeb,0xbe]
+s_bitset1_b32 vcc_hi, s1
+// CHECK: [0x01,0x1a,0xeb,0xbe]
 
-s_bitset1_b32 tba_lo, s0
-// CHECK: [0x00,0x1a,0xec,0xbe]
+s_bitset1_b32 tba_lo, s1
+// CHECK: [0x01,0x1a,0xec,0xbe]
 
-s_bitset1_b32 tba_hi, s0
-// CHECK: [0x00,0x1a,0xed,0xbe]
+s_bitset1_b32 tba_hi, s1
+// CHECK: [0x01,0x1a,0xed,0xbe]
 
-s_bitset1_b32 tma_lo, s0
-// CHECK: [0x00,0x1a,0xee,0xbe]
+s_bitset1_b32 tma_lo, s1
+// CHECK: [0x01,0x1a,0xee,0xbe]
 
-s_bitset1_b32 tma_hi, s0
-// CHECK: [0x00,0x1a,0xef,0xbe]
+s_bitset1_b32 tma_hi, s1
+// CHECK: [0x01,0x1a,0xef,0xbe]
 
-s_bitset1_b32 ttmp11, s0
-// CHECK: [0x00,0x1a,0xfb,0xbe]
+s_bitset1_b32 ttmp11, s1
+// CHECK: [0x01,0x1a,0xfb,0xbe]
 
-s_bitset1_b32 m0, s0
-// CHECK: [0x00,0x1a,0xfc,0xbe]
+s_bitset1_b32 m0, s1
+// CHECK: [0x01,0x1a,0xfc,0xbe]
 
-s_bitset1_b32 exec_lo, s0
-// CHECK: [0x00,0x1a,0xfe,0xbe]
+s_bitset1_b32 exec_lo, s1
+// CHECK: [0x01,0x1a,0xfe,0xbe]
 
-s_bitset1_b32 exec_hi, s0
-// CHECK: [0x00,0x1a,0xff,0xbe]
+s_bitset1_b32 exec_hi, s1
+// CHECK: [0x01,0x1a,0xff,0xbe]
 
-s_bitset1_b32 s0, s101
-// CHECK: [0x65,0x1a,0x80,0xbe]
+s_bitset1_b32 s5, s101
+// CHECK: [0x65,0x1a,0x85,0xbe]
 
-s_bitset1_b32 s0, flat_scratch_lo
-// CHECK: [0x66,0x1a,0x80,0xbe]
+s_bitset1_b32 s5, flat_scratch_lo
+// CHECK: [0x66,0x1a,0x85,0xbe]
 
-s_bitset1_b32 s0, flat_scratch_hi
-// CHECK: [0x67,0x1a,0x80,0xbe]
+s_bitset1_b32 s5, flat_scratch_hi
+// CHECK: [0x67,0x1a,0x85,0xbe]
 
-s_bitset1_b32 s0, vcc_lo
-// CHECK: [0x6a,0x1a,0x80,0xbe]
+s_bitset1_b32 s5, vcc_lo
+// CHECK: [0x6a,0x1a,0x85,0xbe]
 
-s_bitset1_b32 s0, vcc_hi
-// CHECK: [0x6b,0x1a,0x80,0xbe]
+s_bitset1_b32 s5, vcc_hi
+// CHECK: [0x6b,0x1a,0x85,0xbe]
 
-s_bitset1_b32 s0, tba_lo
-// CHECK: [0x6c,0x1a,0x80,0xbe]
+s_bitset1_b32 s5, tba_lo
+// CHECK: [0x6c,0x1a,0x85,0xbe]
 
-s_bitset1_b32 s0, tba_hi
-// CHECK: [0x6d,0x1a,0x80,0xbe]
+s_bitset1_b32 s5, tba_hi
+// CHECK: [0x6d,0x1a,0x85,0xbe]
 
-s_bitset1_b32 s0, tma_lo
-// CHECK: [0x6e,0x1a,0x80,0xbe]
+s_bitset1_b32 s5, tma_lo
+// CHECK: [0x6e,0x1a,0x85,0xbe]
 
-s_bitset1_b32 s0, tma_hi
-// CHECK: [0x6f,0x1a,0x80,0xbe]
+s_bitset1_b32 s5, tma_hi
+// CHECK: [0x6f,0x1a,0x85,0xbe]
 
-s_bitset1_b32 s0, ttmp11
-// CHECK: [0x7b,0x1a,0x80,0xbe]
+s_bitset1_b32 s5, ttmp11
+// CHECK: [0x7b,0x1a,0x85,0xbe]
 
-s_bitset1_b32 s0, m0
-// CHECK: [0x7c,0x1a,0x80,0xbe]
+s_bitset1_b32 s5, m0
+// CHECK: [0x7c,0x1a,0x85,0xbe]
 
-s_bitset1_b32 s0, exec_lo
-// CHECK: [0x7e,0x1a,0x80,0xbe]
+s_bitset1_b32 s5, exec_lo
+// CHECK: [0x7e,0x1a,0x85,0xbe]
 
-s_bitset1_b32 s0, exec_hi
-// CHECK: [0x7f,0x1a,0x80,0xbe]
+s_bitset1_b32 s5, exec_hi
+// CHECK: [0x7f,0x1a,0x85,0xbe]
 
-s_bitset1_b32 s0, 0
-// CHECK: [0x80,0x1a,0x80,0xbe]
+s_bitset1_b32 s5, 0
+// CHECK: [0x80,0x1a,0x85,0xbe]
 
-s_bitset1_b32 s0, -1
-// CHECK: [0xc1,0x1a,0x80,0xbe]
+s_bitset1_b32 s5, -1
+// CHECK: [0xc1,0x1a,0x85,0xbe]
 
-s_bitset1_b32 s0, 0.5
-// CHECK: [0xf0,0x1a,0x80,0xbe]
+s_bitset1_b32 s5, 0.5
+// CHECK: [0xf0,0x1a,0x85,0xbe]
 
-s_bitset1_b32 s0, -4.0
-// CHECK: [0xf7,0x1a,0x80,0xbe]
+s_bitset1_b32 s5, -4.0
+// CHECK: [0xf7,0x1a,0x85,0xbe]
 
-s_bitset1_b32 s0, 0xaf123456
-// CHECK: [0xff,0x1a,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_bitset1_b32 s5, 0xaf123456
+// CHECK: [0xff,0x1a,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_bitset1_b32 s0, 0x3f717273
-// CHECK: [0xff,0x1a,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_bitset1_b32 s5, 0x3f717273
+// CHECK: [0xff,0x1a,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_bitset1_b64 s[0:1], s0
-// CHECK: [0x00,0x1b,0x80,0xbe]
+s_bitset1_b64 s[10:11], s1
+// CHECK: [0x01,0x1b,0x8a,0xbe]
 
-s_bitset1_b64 s[2:3], s0
-// CHECK: [0x00,0x1b,0x82,0xbe]
+s_bitset1_b64 s[12:13], s1
+// CHECK: [0x01,0x1b,0x8c,0xbe]
 
-s_bitset1_b64 s[100:101], s0
-// CHECK: [0x00,0x1b,0xe4,0xbe]
+s_bitset1_b64 s[100:101], s1
+// CHECK: [0x01,0x1b,0xe4,0xbe]
 
-s_bitset1_b64 flat_scratch, s0
-// CHECK: [0x00,0x1b,0xe6,0xbe]
+s_bitset1_b64 flat_scratch, s1
+// CHECK: [0x01,0x1b,0xe6,0xbe]
 
-s_bitset1_b64 vcc, s0
-// CHECK: [0x00,0x1b,0xea,0xbe]
+s_bitset1_b64 vcc, s1
+// CHECK: [0x01,0x1b,0xea,0xbe]
 
-s_bitset1_b64 tba, s0
-// CHECK: [0x00,0x1b,0xec,0xbe]
+s_bitset1_b64 tba, s1
+// CHECK: [0x01,0x1b,0xec,0xbe]
 
-s_bitset1_b64 tma, s0
-// CHECK: [0x00,0x1b,0xee,0xbe]
+s_bitset1_b64 tma, s1
+// CHECK: [0x01,0x1b,0xee,0xbe]
 
-s_bitset1_b64 ttmp[10:11], s0
-// CHECK: [0x00,0x1b,0xfa,0xbe]
+s_bitset1_b64 ttmp[10:11], s1
+// CHECK: [0x01,0x1b,0xfa,0xbe]
 
-s_bitset1_b64 exec, s0
-// CHECK: [0x00,0x1b,0xfe,0xbe]
+s_bitset1_b64 exec, s1
+// CHECK: [0x01,0x1b,0xfe,0xbe]
 
-s_bitset1_b64 s[0:1], s101
-// CHECK: [0x65,0x1b,0x80,0xbe]
+s_bitset1_b64 s[10:11], s101
+// CHECK: [0x65,0x1b,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], flat_scratch_lo
-// CHECK: [0x66,0x1b,0x80,0xbe]
+s_bitset1_b64 s[10:11], flat_scratch_lo
+// CHECK: [0x66,0x1b,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], flat_scratch_hi
-// CHECK: [0x67,0x1b,0x80,0xbe]
+s_bitset1_b64 s[10:11], flat_scratch_hi
+// CHECK: [0x67,0x1b,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], vcc_lo
-// CHECK: [0x6a,0x1b,0x80,0xbe]
+s_bitset1_b64 s[10:11], vcc_lo
+// CHECK: [0x6a,0x1b,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], vcc_hi
-// CHECK: [0x6b,0x1b,0x80,0xbe]
+s_bitset1_b64 s[10:11], vcc_hi
+// CHECK: [0x6b,0x1b,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], tba_lo
-// CHECK: [0x6c,0x1b,0x80,0xbe]
+s_bitset1_b64 s[10:11], tba_lo
+// CHECK: [0x6c,0x1b,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], tba_hi
-// CHECK: [0x6d,0x1b,0x80,0xbe]
+s_bitset1_b64 s[10:11], tba_hi
+// CHECK: [0x6d,0x1b,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], tma_lo
-// CHECK: [0x6e,0x1b,0x80,0xbe]
+s_bitset1_b64 s[10:11], tma_lo
+// CHECK: [0x6e,0x1b,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], tma_hi
-// CHECK: [0x6f,0x1b,0x80,0xbe]
+s_bitset1_b64 s[10:11], tma_hi
+// CHECK: [0x6f,0x1b,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], ttmp11
-// CHECK: [0x7b,0x1b,0x80,0xbe]
+s_bitset1_b64 s[10:11], ttmp11
+// CHECK: [0x7b,0x1b,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], m0
-// CHECK: [0x7c,0x1b,0x80,0xbe]
+s_bitset1_b64 s[10:11], m0
+// CHECK: [0x7c,0x1b,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], exec_lo
-// CHECK: [0x7e,0x1b,0x80,0xbe]
+s_bitset1_b64 s[10:11], exec_lo
+// CHECK: [0x7e,0x1b,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], exec_hi
-// CHECK: [0x7f,0x1b,0x80,0xbe]
+s_bitset1_b64 s[10:11], exec_hi
+// CHECK: [0x7f,0x1b,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], 0
-// CHECK: [0x80,0x1b,0x80,0xbe]
+s_bitset1_b64 s[10:11], 0
+// CHECK: [0x80,0x1b,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], -1
-// CHECK: [0xc1,0x1b,0x80,0xbe]
+s_bitset1_b64 s[10:11], -1
+// CHECK: [0xc1,0x1b,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x1b,0x80,0xbe]
+s_bitset1_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x1b,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x1b,0x80,0xbe]
+s_bitset1_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x1b,0x8a,0xbe]
 
-s_bitset1_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x1b,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_bitset1_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x1b,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_bitset1_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x1b,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_bitset1_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x1b,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_getpc_b64 s[0:1]
-// CHECK: [0x00,0x1c,0x80,0xbe]
+s_getpc_b64 s[10:11]
+// CHECK: [0x00,0x1c,0x8a,0xbe]
 
-s_getpc_b64 s[2:3]
-// CHECK: [0x00,0x1c,0x82,0xbe]
+s_getpc_b64 s[12:13]
+// CHECK: [0x00,0x1c,0x8c,0xbe]
 
 s_getpc_b64 s[100:101]
 // CHECK: [0x00,0x1c,0xe4,0xbe]
@@ -12552,12 +12713,12 @@ s_getpc_b64 ttmp[10:11]
 s_getpc_b64 exec
 // CHECK: [0x00,0x1c,0xfe,0xbe]
 
-s_setpc_b64 s[0:1]
-// CHECK: [0x00,0x1d,0x80,0xbe]
-
 s_setpc_b64 s[2:3]
 // CHECK: [0x02,0x1d,0x80,0xbe]
 
+s_setpc_b64 s[4:5]
+// CHECK: [0x04,0x1d,0x80,0xbe]
+
 s_setpc_b64 s[100:101]
 // CHECK: [0x64,0x1d,0x80,0xbe]
 
@@ -12576,60 +12737,60 @@ s_setpc_b64 tma
 s_setpc_b64 ttmp[10:11]
 // CHECK: [0x7a,0x1d,0x80,0xbe]
 
-s_swappc_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x1e,0x80,0xbe]
+s_swappc_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x1e,0x8a,0xbe]
 
-s_swappc_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x1e,0x82,0xbe]
+s_swappc_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x1e,0x8c,0xbe]
 
-s_swappc_b64 s[100:101], s[0:1]
-// CHECK: [0x00,0x1e,0xe4,0xbe]
+s_swappc_b64 s[100:101], s[2:3]
+// CHECK: [0x02,0x1e,0xe4,0xbe]
 
-s_swappc_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x1e,0xe6,0xbe]
+s_swappc_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x1e,0xe6,0xbe]
 
-s_swappc_b64 vcc, s[0:1]
-// CHECK: [0x00,0x1e,0xea,0xbe]
+s_swappc_b64 vcc, s[2:3]
+// CHECK: [0x02,0x1e,0xea,0xbe]
 
-s_swappc_b64 tba, s[0:1]
-// CHECK: [0x00,0x1e,0xec,0xbe]
+s_swappc_b64 tba, s[2:3]
+// CHECK: [0x02,0x1e,0xec,0xbe]
 
-s_swappc_b64 tma, s[0:1]
-// CHECK: [0x00,0x1e,0xee,0xbe]
+s_swappc_b64 tma, s[2:3]
+// CHECK: [0x02,0x1e,0xee,0xbe]
 
-s_swappc_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x1e,0xfa,0xbe]
+s_swappc_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x1e,0xfa,0xbe]
 
-s_swappc_b64 exec, s[0:1]
-// CHECK: [0x00,0x1e,0xfe,0xbe]
+s_swappc_b64 exec, s[2:3]
+// CHECK: [0x02,0x1e,0xfe,0xbe]
 
-s_swappc_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x1e,0x80,0xbe]
+s_swappc_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x1e,0x8a,0xbe]
 
-s_swappc_b64 s[0:1], s[100:101]
-// CHECK: [0x64,0x1e,0x80,0xbe]
+s_swappc_b64 s[10:11], s[100:101]
+// CHECK: [0x64,0x1e,0x8a,0xbe]
 
-s_swappc_b64 s[0:1], flat_scratch
-// CHECK: [0x66,0x1e,0x80,0xbe]
+s_swappc_b64 s[10:11], flat_scratch
+// CHECK: [0x66,0x1e,0x8a,0xbe]
 
-s_swappc_b64 s[0:1], vcc
-// CHECK: [0x6a,0x1e,0x80,0xbe]
+s_swappc_b64 s[10:11], vcc
+// CHECK: [0x6a,0x1e,0x8a,0xbe]
 
-s_swappc_b64 s[0:1], tba
-// CHECK: [0x6c,0x1e,0x80,0xbe]
+s_swappc_b64 s[10:11], tba
+// CHECK: [0x6c,0x1e,0x8a,0xbe]
 
-s_swappc_b64 s[0:1], tma
-// CHECK: [0x6e,0x1e,0x80,0xbe]
+s_swappc_b64 s[10:11], tma
+// CHECK: [0x6e,0x1e,0x8a,0xbe]
 
-s_swappc_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x1e,0x80,0xbe]
-
-s_rfe_b64 s[0:1]
-// CHECK: [0x00,0x1f,0x80,0xbe]
+s_swappc_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x1e,0x8a,0xbe]
 
 s_rfe_b64 s[2:3]
 // CHECK: [0x02,0x1f,0x80,0xbe]
 
+s_rfe_b64 s[4:5]
+// CHECK: [0x04,0x1f,0x80,0xbe]
+
 s_rfe_b64 s[100:101]
 // CHECK: [0x64,0x1f,0x80,0xbe]
 
@@ -12648,1169 +12809,1169 @@ s_rfe_b64 tma
 s_rfe_b64 ttmp[10:11]
 // CHECK: [0x7a,0x1f,0x80,0xbe]
 
-s_and_saveexec_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x20,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x20,0x8a,0xbe]
 
-s_and_saveexec_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x20,0x82,0xbe]
+s_and_saveexec_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x20,0x8c,0xbe]
 
-s_and_saveexec_b64 s[100:101], s[0:1]
-// CHECK: [0x00,0x20,0xe4,0xbe]
+s_and_saveexec_b64 s[100:101], s[2:3]
+// CHECK: [0x02,0x20,0xe4,0xbe]
 
-s_and_saveexec_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x20,0xe6,0xbe]
+s_and_saveexec_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x20,0xe6,0xbe]
 
-s_and_saveexec_b64 vcc, s[0:1]
-// CHECK: [0x00,0x20,0xea,0xbe]
+s_and_saveexec_b64 vcc, s[2:3]
+// CHECK: [0x02,0x20,0xea,0xbe]
 
-s_and_saveexec_b64 tba, s[0:1]
-// CHECK: [0x00,0x20,0xec,0xbe]
+s_and_saveexec_b64 tba, s[2:3]
+// CHECK: [0x02,0x20,0xec,0xbe]
 
-s_and_saveexec_b64 tma, s[0:1]
-// CHECK: [0x00,0x20,0xee,0xbe]
+s_and_saveexec_b64 tma, s[2:3]
+// CHECK: [0x02,0x20,0xee,0xbe]
 
-s_and_saveexec_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x20,0xfa,0xbe]
+s_and_saveexec_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x20,0xfa,0xbe]
 
-s_and_saveexec_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x20,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x20,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], s[100:101]
-// CHECK: [0x64,0x20,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], s[100:101]
+// CHECK: [0x64,0x20,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], flat_scratch
-// CHECK: [0x66,0x20,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], flat_scratch
+// CHECK: [0x66,0x20,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], vcc
-// CHECK: [0x6a,0x20,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], vcc
+// CHECK: [0x6a,0x20,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], tba
-// CHECK: [0x6c,0x20,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], tba
+// CHECK: [0x6c,0x20,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], tma
-// CHECK: [0x6e,0x20,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], tma
+// CHECK: [0x6e,0x20,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x20,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x20,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], exec
-// CHECK: [0x7e,0x20,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], exec
+// CHECK: [0x7e,0x20,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], 0
-// CHECK: [0x80,0x20,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], 0
+// CHECK: [0x80,0x20,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], -1
-// CHECK: [0xc1,0x20,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], -1
+// CHECK: [0xc1,0x20,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x20,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x20,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x20,0x80,0xbe]
+s_and_saveexec_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x20,0x8a,0xbe]
 
-s_and_saveexec_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x20,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_and_saveexec_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x20,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_and_saveexec_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x20,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_and_saveexec_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x20,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_or_saveexec_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x21,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x21,0x8a,0xbe]
 
-s_or_saveexec_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x21,0x82,0xbe]
+s_or_saveexec_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x21,0x8c,0xbe]
 
-s_or_saveexec_b64 s[100:101], s[0:1]
-// CHECK: [0x00,0x21,0xe4,0xbe]
+s_or_saveexec_b64 s[100:101], s[2:3]
+// CHECK: [0x02,0x21,0xe4,0xbe]
 
-s_or_saveexec_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x21,0xe6,0xbe]
+s_or_saveexec_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x21,0xe6,0xbe]
 
-s_or_saveexec_b64 vcc, s[0:1]
-// CHECK: [0x00,0x21,0xea,0xbe]
+s_or_saveexec_b64 vcc, s[2:3]
+// CHECK: [0x02,0x21,0xea,0xbe]
 
-s_or_saveexec_b64 tba, s[0:1]
-// CHECK: [0x00,0x21,0xec,0xbe]
+s_or_saveexec_b64 tba, s[2:3]
+// CHECK: [0x02,0x21,0xec,0xbe]
 
-s_or_saveexec_b64 tma, s[0:1]
-// CHECK: [0x00,0x21,0xee,0xbe]
+s_or_saveexec_b64 tma, s[2:3]
+// CHECK: [0x02,0x21,0xee,0xbe]
 
-s_or_saveexec_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x21,0xfa,0xbe]
+s_or_saveexec_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x21,0xfa,0xbe]
 
-s_or_saveexec_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x21,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x21,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], s[100:101]
-// CHECK: [0x64,0x21,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], s[100:101]
+// CHECK: [0x64,0x21,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], flat_scratch
-// CHECK: [0x66,0x21,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], flat_scratch
+// CHECK: [0x66,0x21,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], vcc
-// CHECK: [0x6a,0x21,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], vcc
+// CHECK: [0x6a,0x21,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], tba
-// CHECK: [0x6c,0x21,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], tba
+// CHECK: [0x6c,0x21,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], tma
-// CHECK: [0x6e,0x21,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], tma
+// CHECK: [0x6e,0x21,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x21,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x21,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], exec
-// CHECK: [0x7e,0x21,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], exec
+// CHECK: [0x7e,0x21,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], 0
-// CHECK: [0x80,0x21,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], 0
+// CHECK: [0x80,0x21,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], -1
-// CHECK: [0xc1,0x21,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], -1
+// CHECK: [0xc1,0x21,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x21,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x21,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x21,0x80,0xbe]
+s_or_saveexec_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x21,0x8a,0xbe]
 
-s_or_saveexec_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x21,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_or_saveexec_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x21,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_or_saveexec_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x21,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_or_saveexec_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x21,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_xor_saveexec_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x22,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x22,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x22,0x82,0xbe]
+s_xor_saveexec_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x22,0x8c,0xbe]
 
-s_xor_saveexec_b64 s[100:101], s[0:1]
-// CHECK: [0x00,0x22,0xe4,0xbe]
+s_xor_saveexec_b64 s[100:101], s[2:3]
+// CHECK: [0x02,0x22,0xe4,0xbe]
 
-s_xor_saveexec_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x22,0xe6,0xbe]
+s_xor_saveexec_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x22,0xe6,0xbe]
 
-s_xor_saveexec_b64 vcc, s[0:1]
-// CHECK: [0x00,0x22,0xea,0xbe]
+s_xor_saveexec_b64 vcc, s[2:3]
+// CHECK: [0x02,0x22,0xea,0xbe]
 
-s_xor_saveexec_b64 tba, s[0:1]
-// CHECK: [0x00,0x22,0xec,0xbe]
+s_xor_saveexec_b64 tba, s[2:3]
+// CHECK: [0x02,0x22,0xec,0xbe]
 
-s_xor_saveexec_b64 tma, s[0:1]
-// CHECK: [0x00,0x22,0xee,0xbe]
+s_xor_saveexec_b64 tma, s[2:3]
+// CHECK: [0x02,0x22,0xee,0xbe]
 
-s_xor_saveexec_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x22,0xfa,0xbe]
+s_xor_saveexec_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x22,0xfa,0xbe]
 
-s_xor_saveexec_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x22,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x22,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], s[100:101]
-// CHECK: [0x64,0x22,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], s[100:101]
+// CHECK: [0x64,0x22,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], flat_scratch
-// CHECK: [0x66,0x22,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], flat_scratch
+// CHECK: [0x66,0x22,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], vcc
-// CHECK: [0x6a,0x22,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], vcc
+// CHECK: [0x6a,0x22,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], tba
-// CHECK: [0x6c,0x22,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], tba
+// CHECK: [0x6c,0x22,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], tma
-// CHECK: [0x6e,0x22,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], tma
+// CHECK: [0x6e,0x22,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x22,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x22,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], exec
-// CHECK: [0x7e,0x22,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], exec
+// CHECK: [0x7e,0x22,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], 0
-// CHECK: [0x80,0x22,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], 0
+// CHECK: [0x80,0x22,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], -1
-// CHECK: [0xc1,0x22,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], -1
+// CHECK: [0xc1,0x22,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x22,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x22,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x22,0x80,0xbe]
+s_xor_saveexec_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x22,0x8a,0xbe]
 
-s_xor_saveexec_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x22,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_xor_saveexec_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x22,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_xor_saveexec_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x22,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_xor_saveexec_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x22,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_andn2_saveexec_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x23,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x23,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x23,0x82,0xbe]
+s_andn2_saveexec_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x23,0x8c,0xbe]
 
-s_andn2_saveexec_b64 s[100:101], s[0:1]
-// CHECK: [0x00,0x23,0xe4,0xbe]
+s_andn2_saveexec_b64 s[100:101], s[2:3]
+// CHECK: [0x02,0x23,0xe4,0xbe]
 
-s_andn2_saveexec_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x23,0xe6,0xbe]
+s_andn2_saveexec_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x23,0xe6,0xbe]
 
-s_andn2_saveexec_b64 vcc, s[0:1]
-// CHECK: [0x00,0x23,0xea,0xbe]
+s_andn2_saveexec_b64 vcc, s[2:3]
+// CHECK: [0x02,0x23,0xea,0xbe]
 
-s_andn2_saveexec_b64 tba, s[0:1]
-// CHECK: [0x00,0x23,0xec,0xbe]
+s_andn2_saveexec_b64 tba, s[2:3]
+// CHECK: [0x02,0x23,0xec,0xbe]
 
-s_andn2_saveexec_b64 tma, s[0:1]
-// CHECK: [0x00,0x23,0xee,0xbe]
+s_andn2_saveexec_b64 tma, s[2:3]
+// CHECK: [0x02,0x23,0xee,0xbe]
 
-s_andn2_saveexec_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x23,0xfa,0xbe]
+s_andn2_saveexec_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x23,0xfa,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x23,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x23,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], s[100:101]
-// CHECK: [0x64,0x23,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], s[100:101]
+// CHECK: [0x64,0x23,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], flat_scratch
-// CHECK: [0x66,0x23,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], flat_scratch
+// CHECK: [0x66,0x23,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], vcc
-// CHECK: [0x6a,0x23,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], vcc
+// CHECK: [0x6a,0x23,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], tba
-// CHECK: [0x6c,0x23,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], tba
+// CHECK: [0x6c,0x23,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], tma
-// CHECK: [0x6e,0x23,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], tma
+// CHECK: [0x6e,0x23,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x23,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x23,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], exec
-// CHECK: [0x7e,0x23,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], exec
+// CHECK: [0x7e,0x23,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], 0
-// CHECK: [0x80,0x23,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], 0
+// CHECK: [0x80,0x23,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], -1
-// CHECK: [0xc1,0x23,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], -1
+// CHECK: [0xc1,0x23,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x23,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x23,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x23,0x80,0xbe]
+s_andn2_saveexec_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x23,0x8a,0xbe]
 
-s_andn2_saveexec_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x23,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_andn2_saveexec_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x23,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_andn2_saveexec_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x23,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_andn2_saveexec_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x23,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_orn2_saveexec_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x24,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x24,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x24,0x82,0xbe]
+s_orn2_saveexec_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x24,0x8c,0xbe]
 
-s_orn2_saveexec_b64 s[100:101], s[0:1]
-// CHECK: [0x00,0x24,0xe4,0xbe]
+s_orn2_saveexec_b64 s[100:101], s[2:3]
+// CHECK: [0x02,0x24,0xe4,0xbe]
 
-s_orn2_saveexec_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x24,0xe6,0xbe]
+s_orn2_saveexec_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x24,0xe6,0xbe]
 
-s_orn2_saveexec_b64 vcc, s[0:1]
-// CHECK: [0x00,0x24,0xea,0xbe]
+s_orn2_saveexec_b64 vcc, s[2:3]
+// CHECK: [0x02,0x24,0xea,0xbe]
 
-s_orn2_saveexec_b64 tba, s[0:1]
-// CHECK: [0x00,0x24,0xec,0xbe]
+s_orn2_saveexec_b64 tba, s[2:3]
+// CHECK: [0x02,0x24,0xec,0xbe]
 
-s_orn2_saveexec_b64 tma, s[0:1]
-// CHECK: [0x00,0x24,0xee,0xbe]
+s_orn2_saveexec_b64 tma, s[2:3]
+// CHECK: [0x02,0x24,0xee,0xbe]
 
-s_orn2_saveexec_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x24,0xfa,0xbe]
+s_orn2_saveexec_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x24,0xfa,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x24,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x24,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], s[100:101]
-// CHECK: [0x64,0x24,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], s[100:101]
+// CHECK: [0x64,0x24,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], flat_scratch
-// CHECK: [0x66,0x24,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], flat_scratch
+// CHECK: [0x66,0x24,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], vcc
-// CHECK: [0x6a,0x24,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], vcc
+// CHECK: [0x6a,0x24,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], tba
-// CHECK: [0x6c,0x24,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], tba
+// CHECK: [0x6c,0x24,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], tma
-// CHECK: [0x6e,0x24,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], tma
+// CHECK: [0x6e,0x24,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x24,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x24,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], exec
-// CHECK: [0x7e,0x24,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], exec
+// CHECK: [0x7e,0x24,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], 0
-// CHECK: [0x80,0x24,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], 0
+// CHECK: [0x80,0x24,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], -1
-// CHECK: [0xc1,0x24,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], -1
+// CHECK: [0xc1,0x24,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x24,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x24,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x24,0x80,0xbe]
+s_orn2_saveexec_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x24,0x8a,0xbe]
 
-s_orn2_saveexec_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x24,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_orn2_saveexec_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x24,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_orn2_saveexec_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x24,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_orn2_saveexec_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x24,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_nand_saveexec_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x25,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x25,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x25,0x82,0xbe]
+s_nand_saveexec_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x25,0x8c,0xbe]
 
-s_nand_saveexec_b64 s[100:101], s[0:1]
-// CHECK: [0x00,0x25,0xe4,0xbe]
+s_nand_saveexec_b64 s[100:101], s[2:3]
+// CHECK: [0x02,0x25,0xe4,0xbe]
 
-s_nand_saveexec_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x25,0xe6,0xbe]
+s_nand_saveexec_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x25,0xe6,0xbe]
 
-s_nand_saveexec_b64 vcc, s[0:1]
-// CHECK: [0x00,0x25,0xea,0xbe]
+s_nand_saveexec_b64 vcc, s[2:3]
+// CHECK: [0x02,0x25,0xea,0xbe]
 
-s_nand_saveexec_b64 tba, s[0:1]
-// CHECK: [0x00,0x25,0xec,0xbe]
+s_nand_saveexec_b64 tba, s[2:3]
+// CHECK: [0x02,0x25,0xec,0xbe]
 
-s_nand_saveexec_b64 tma, s[0:1]
-// CHECK: [0x00,0x25,0xee,0xbe]
+s_nand_saveexec_b64 tma, s[2:3]
+// CHECK: [0x02,0x25,0xee,0xbe]
 
-s_nand_saveexec_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x25,0xfa,0xbe]
+s_nand_saveexec_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x25,0xfa,0xbe]
 
-s_nand_saveexec_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x25,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x25,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], s[100:101]
-// CHECK: [0x64,0x25,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], s[100:101]
+// CHECK: [0x64,0x25,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], flat_scratch
-// CHECK: [0x66,0x25,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], flat_scratch
+// CHECK: [0x66,0x25,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], vcc
-// CHECK: [0x6a,0x25,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], vcc
+// CHECK: [0x6a,0x25,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], tba
-// CHECK: [0x6c,0x25,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], tba
+// CHECK: [0x6c,0x25,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], tma
-// CHECK: [0x6e,0x25,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], tma
+// CHECK: [0x6e,0x25,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x25,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x25,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], exec
-// CHECK: [0x7e,0x25,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], exec
+// CHECK: [0x7e,0x25,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], 0
-// CHECK: [0x80,0x25,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], 0
+// CHECK: [0x80,0x25,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], -1
-// CHECK: [0xc1,0x25,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], -1
+// CHECK: [0xc1,0x25,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x25,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x25,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x25,0x80,0xbe]
+s_nand_saveexec_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x25,0x8a,0xbe]
 
-s_nand_saveexec_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x25,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_nand_saveexec_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x25,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_nand_saveexec_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x25,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_nand_saveexec_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x25,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_nor_saveexec_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x26,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x26,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x26,0x82,0xbe]
+s_nor_saveexec_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x26,0x8c,0xbe]
 
-s_nor_saveexec_b64 s[100:101], s[0:1]
-// CHECK: [0x00,0x26,0xe4,0xbe]
+s_nor_saveexec_b64 s[100:101], s[2:3]
+// CHECK: [0x02,0x26,0xe4,0xbe]
 
-s_nor_saveexec_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x26,0xe6,0xbe]
+s_nor_saveexec_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x26,0xe6,0xbe]
 
-s_nor_saveexec_b64 vcc, s[0:1]
-// CHECK: [0x00,0x26,0xea,0xbe]
+s_nor_saveexec_b64 vcc, s[2:3]
+// CHECK: [0x02,0x26,0xea,0xbe]
 
-s_nor_saveexec_b64 tba, s[0:1]
-// CHECK: [0x00,0x26,0xec,0xbe]
+s_nor_saveexec_b64 tba, s[2:3]
+// CHECK: [0x02,0x26,0xec,0xbe]
 
-s_nor_saveexec_b64 tma, s[0:1]
-// CHECK: [0x00,0x26,0xee,0xbe]
+s_nor_saveexec_b64 tma, s[2:3]
+// CHECK: [0x02,0x26,0xee,0xbe]
 
-s_nor_saveexec_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x26,0xfa,0xbe]
+s_nor_saveexec_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x26,0xfa,0xbe]
 
-s_nor_saveexec_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x26,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x26,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], s[100:101]
-// CHECK: [0x64,0x26,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], s[100:101]
+// CHECK: [0x64,0x26,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], flat_scratch
-// CHECK: [0x66,0x26,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], flat_scratch
+// CHECK: [0x66,0x26,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], vcc
-// CHECK: [0x6a,0x26,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], vcc
+// CHECK: [0x6a,0x26,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], tba
-// CHECK: [0x6c,0x26,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], tba
+// CHECK: [0x6c,0x26,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], tma
-// CHECK: [0x6e,0x26,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], tma
+// CHECK: [0x6e,0x26,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x26,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x26,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], exec
-// CHECK: [0x7e,0x26,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], exec
+// CHECK: [0x7e,0x26,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], 0
-// CHECK: [0x80,0x26,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], 0
+// CHECK: [0x80,0x26,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], -1
-// CHECK: [0xc1,0x26,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], -1
+// CHECK: [0xc1,0x26,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x26,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x26,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x26,0x80,0xbe]
+s_nor_saveexec_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x26,0x8a,0xbe]
 
-s_nor_saveexec_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x26,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_nor_saveexec_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x26,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_nor_saveexec_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x26,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_nor_saveexec_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x26,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_xnor_saveexec_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x27,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x27,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x27,0x82,0xbe]
+s_xnor_saveexec_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x27,0x8c,0xbe]
 
-s_xnor_saveexec_b64 s[100:101], s[0:1]
-// CHECK: [0x00,0x27,0xe4,0xbe]
+s_xnor_saveexec_b64 s[100:101], s[2:3]
+// CHECK: [0x02,0x27,0xe4,0xbe]
 
-s_xnor_saveexec_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x27,0xe6,0xbe]
+s_xnor_saveexec_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x27,0xe6,0xbe]
 
-s_xnor_saveexec_b64 vcc, s[0:1]
-// CHECK: [0x00,0x27,0xea,0xbe]
+s_xnor_saveexec_b64 vcc, s[2:3]
+// CHECK: [0x02,0x27,0xea,0xbe]
 
-s_xnor_saveexec_b64 tba, s[0:1]
-// CHECK: [0x00,0x27,0xec,0xbe]
+s_xnor_saveexec_b64 tba, s[2:3]
+// CHECK: [0x02,0x27,0xec,0xbe]
 
-s_xnor_saveexec_b64 tma, s[0:1]
-// CHECK: [0x00,0x27,0xee,0xbe]
+s_xnor_saveexec_b64 tma, s[2:3]
+// CHECK: [0x02,0x27,0xee,0xbe]
 
-s_xnor_saveexec_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x27,0xfa,0xbe]
+s_xnor_saveexec_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x27,0xfa,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x27,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x27,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], s[100:101]
-// CHECK: [0x64,0x27,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], s[100:101]
+// CHECK: [0x64,0x27,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], flat_scratch
-// CHECK: [0x66,0x27,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], flat_scratch
+// CHECK: [0x66,0x27,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], vcc
-// CHECK: [0x6a,0x27,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], vcc
+// CHECK: [0x6a,0x27,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], tba
-// CHECK: [0x6c,0x27,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], tba
+// CHECK: [0x6c,0x27,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], tma
-// CHECK: [0x6e,0x27,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], tma
+// CHECK: [0x6e,0x27,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x27,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x27,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], exec
-// CHECK: [0x7e,0x27,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], exec
+// CHECK: [0x7e,0x27,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], 0
-// CHECK: [0x80,0x27,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], 0
+// CHECK: [0x80,0x27,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], -1
-// CHECK: [0xc1,0x27,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], -1
+// CHECK: [0xc1,0x27,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x27,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x27,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x27,0x80,0xbe]
+s_xnor_saveexec_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x27,0x8a,0xbe]
 
-s_xnor_saveexec_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x27,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_xnor_saveexec_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x27,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_xnor_saveexec_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x27,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_xnor_saveexec_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x27,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_quadmask_b32 s0, s0
-// CHECK: [0x00,0x28,0x80,0xbe]
+s_quadmask_b32 s5, s1
+// CHECK: [0x01,0x28,0x85,0xbe]
 
-s_quadmask_b32 s101, s0
-// CHECK: [0x00,0x28,0xe5,0xbe]
+s_quadmask_b32 s101, s1
+// CHECK: [0x01,0x28,0xe5,0xbe]
 
-s_quadmask_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x28,0xe6,0xbe]
+s_quadmask_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x28,0xe6,0xbe]
 
-s_quadmask_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x28,0xe7,0xbe]
+s_quadmask_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x28,0xe7,0xbe]
 
-s_quadmask_b32 vcc_lo, s0
-// CHECK: [0x00,0x28,0xea,0xbe]
+s_quadmask_b32 vcc_lo, s1
+// CHECK: [0x01,0x28,0xea,0xbe]
 
-s_quadmask_b32 vcc_hi, s0
-// CHECK: [0x00,0x28,0xeb,0xbe]
+s_quadmask_b32 vcc_hi, s1
+// CHECK: [0x01,0x28,0xeb,0xbe]
 
-s_quadmask_b32 tba_lo, s0
-// CHECK: [0x00,0x28,0xec,0xbe]
+s_quadmask_b32 tba_lo, s1
+// CHECK: [0x01,0x28,0xec,0xbe]
 
-s_quadmask_b32 tba_hi, s0
-// CHECK: [0x00,0x28,0xed,0xbe]
+s_quadmask_b32 tba_hi, s1
+// CHECK: [0x01,0x28,0xed,0xbe]
 
-s_quadmask_b32 tma_lo, s0
-// CHECK: [0x00,0x28,0xee,0xbe]
+s_quadmask_b32 tma_lo, s1
+// CHECK: [0x01,0x28,0xee,0xbe]
 
-s_quadmask_b32 tma_hi, s0
-// CHECK: [0x00,0x28,0xef,0xbe]
+s_quadmask_b32 tma_hi, s1
+// CHECK: [0x01,0x28,0xef,0xbe]
 
-s_quadmask_b32 ttmp11, s0
-// CHECK: [0x00,0x28,0xfb,0xbe]
+s_quadmask_b32 ttmp11, s1
+// CHECK: [0x01,0x28,0xfb,0xbe]
 
-s_quadmask_b32 m0, s0
-// CHECK: [0x00,0x28,0xfc,0xbe]
+s_quadmask_b32 m0, s1
+// CHECK: [0x01,0x28,0xfc,0xbe]
 
-s_quadmask_b32 exec_lo, s0
-// CHECK: [0x00,0x28,0xfe,0xbe]
+s_quadmask_b32 exec_lo, s1
+// CHECK: [0x01,0x28,0xfe,0xbe]
 
-s_quadmask_b32 exec_hi, s0
-// CHECK: [0x00,0x28,0xff,0xbe]
+s_quadmask_b32 exec_hi, s1
+// CHECK: [0x01,0x28,0xff,0xbe]
 
-s_quadmask_b32 s0, s101
-// CHECK: [0x65,0x28,0x80,0xbe]
+s_quadmask_b32 s5, s101
+// CHECK: [0x65,0x28,0x85,0xbe]
 
-s_quadmask_b32 s0, flat_scratch_lo
-// CHECK: [0x66,0x28,0x80,0xbe]
+s_quadmask_b32 s5, flat_scratch_lo
+// CHECK: [0x66,0x28,0x85,0xbe]
 
-s_quadmask_b32 s0, flat_scratch_hi
-// CHECK: [0x67,0x28,0x80,0xbe]
+s_quadmask_b32 s5, flat_scratch_hi
+// CHECK: [0x67,0x28,0x85,0xbe]
 
-s_quadmask_b32 s0, vcc_lo
-// CHECK: [0x6a,0x28,0x80,0xbe]
+s_quadmask_b32 s5, vcc_lo
+// CHECK: [0x6a,0x28,0x85,0xbe]
 
-s_quadmask_b32 s0, vcc_hi
-// CHECK: [0x6b,0x28,0x80,0xbe]
+s_quadmask_b32 s5, vcc_hi
+// CHECK: [0x6b,0x28,0x85,0xbe]
 
-s_quadmask_b32 s0, tba_lo
-// CHECK: [0x6c,0x28,0x80,0xbe]
+s_quadmask_b32 s5, tba_lo
+// CHECK: [0x6c,0x28,0x85,0xbe]
 
-s_quadmask_b32 s0, tba_hi
-// CHECK: [0x6d,0x28,0x80,0xbe]
+s_quadmask_b32 s5, tba_hi
+// CHECK: [0x6d,0x28,0x85,0xbe]
 
-s_quadmask_b32 s0, tma_lo
-// CHECK: [0x6e,0x28,0x80,0xbe]
+s_quadmask_b32 s5, tma_lo
+// CHECK: [0x6e,0x28,0x85,0xbe]
 
-s_quadmask_b32 s0, tma_hi
-// CHECK: [0x6f,0x28,0x80,0xbe]
+s_quadmask_b32 s5, tma_hi
+// CHECK: [0x6f,0x28,0x85,0xbe]
 
-s_quadmask_b32 s0, ttmp11
-// CHECK: [0x7b,0x28,0x80,0xbe]
+s_quadmask_b32 s5, ttmp11
+// CHECK: [0x7b,0x28,0x85,0xbe]
 
-s_quadmask_b32 s0, m0
-// CHECK: [0x7c,0x28,0x80,0xbe]
+s_quadmask_b32 s5, m0
+// CHECK: [0x7c,0x28,0x85,0xbe]
 
-s_quadmask_b32 s0, exec_lo
-// CHECK: [0x7e,0x28,0x80,0xbe]
+s_quadmask_b32 s5, exec_lo
+// CHECK: [0x7e,0x28,0x85,0xbe]
 
-s_quadmask_b32 s0, exec_hi
-// CHECK: [0x7f,0x28,0x80,0xbe]
+s_quadmask_b32 s5, exec_hi
+// CHECK: [0x7f,0x28,0x85,0xbe]
 
-s_quadmask_b32 s0, 0
-// CHECK: [0x80,0x28,0x80,0xbe]
+s_quadmask_b32 s5, 0
+// CHECK: [0x80,0x28,0x85,0xbe]
 
-s_quadmask_b32 s0, -1
-// CHECK: [0xc1,0x28,0x80,0xbe]
+s_quadmask_b32 s5, -1
+// CHECK: [0xc1,0x28,0x85,0xbe]
 
-s_quadmask_b32 s0, 0.5
-// CHECK: [0xf0,0x28,0x80,0xbe]
+s_quadmask_b32 s5, 0.5
+// CHECK: [0xf0,0x28,0x85,0xbe]
 
-s_quadmask_b32 s0, -4.0
-// CHECK: [0xf7,0x28,0x80,0xbe]
+s_quadmask_b32 s5, -4.0
+// CHECK: [0xf7,0x28,0x85,0xbe]
 
-s_quadmask_b32 s0, 0xaf123456
-// CHECK: [0xff,0x28,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_quadmask_b32 s5, 0xaf123456
+// CHECK: [0xff,0x28,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_quadmask_b32 s0, 0x3f717273
-// CHECK: [0xff,0x28,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_quadmask_b32 s5, 0x3f717273
+// CHECK: [0xff,0x28,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_quadmask_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x29,0x80,0xbe]
+s_quadmask_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x29,0x8a,0xbe]
 
-s_quadmask_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x29,0x82,0xbe]
+s_quadmask_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x29,0x8c,0xbe]
 
-s_quadmask_b64 s[100:101], s[0:1]
-// CHECK: [0x00,0x29,0xe4,0xbe]
+s_quadmask_b64 s[100:101], s[2:3]
+// CHECK: [0x02,0x29,0xe4,0xbe]
 
-s_quadmask_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x29,0xe6,0xbe]
+s_quadmask_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x29,0xe6,0xbe]
 
-s_quadmask_b64 vcc, s[0:1]
-// CHECK: [0x00,0x29,0xea,0xbe]
+s_quadmask_b64 vcc, s[2:3]
+// CHECK: [0x02,0x29,0xea,0xbe]
 
-s_quadmask_b64 tba, s[0:1]
-// CHECK: [0x00,0x29,0xec,0xbe]
+s_quadmask_b64 tba, s[2:3]
+// CHECK: [0x02,0x29,0xec,0xbe]
 
-s_quadmask_b64 tma, s[0:1]
-// CHECK: [0x00,0x29,0xee,0xbe]
+s_quadmask_b64 tma, s[2:3]
+// CHECK: [0x02,0x29,0xee,0xbe]
 
-s_quadmask_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x29,0xfa,0xbe]
+s_quadmask_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x29,0xfa,0xbe]
 
-s_quadmask_b64 exec, s[0:1]
-// CHECK: [0x00,0x29,0xfe,0xbe]
+s_quadmask_b64 exec, s[2:3]
+// CHECK: [0x02,0x29,0xfe,0xbe]
 
-s_quadmask_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x29,0x80,0xbe]
+s_quadmask_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x29,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], s[100:101]
-// CHECK: [0x64,0x29,0x80,0xbe]
+s_quadmask_b64 s[10:11], s[100:101]
+// CHECK: [0x64,0x29,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], flat_scratch
-// CHECK: [0x66,0x29,0x80,0xbe]
+s_quadmask_b64 s[10:11], flat_scratch
+// CHECK: [0x66,0x29,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], vcc
-// CHECK: [0x6a,0x29,0x80,0xbe]
+s_quadmask_b64 s[10:11], vcc
+// CHECK: [0x6a,0x29,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], tba
-// CHECK: [0x6c,0x29,0x80,0xbe]
+s_quadmask_b64 s[10:11], tba
+// CHECK: [0x6c,0x29,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], tma
-// CHECK: [0x6e,0x29,0x80,0xbe]
+s_quadmask_b64 s[10:11], tma
+// CHECK: [0x6e,0x29,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x29,0x80,0xbe]
+s_quadmask_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x29,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], exec
-// CHECK: [0x7e,0x29,0x80,0xbe]
+s_quadmask_b64 s[10:11], exec
+// CHECK: [0x7e,0x29,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], 0
-// CHECK: [0x80,0x29,0x80,0xbe]
+s_quadmask_b64 s[10:11], 0
+// CHECK: [0x80,0x29,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], -1
-// CHECK: [0xc1,0x29,0x80,0xbe]
+s_quadmask_b64 s[10:11], -1
+// CHECK: [0xc1,0x29,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x29,0x80,0xbe]
+s_quadmask_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x29,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x29,0x80,0xbe]
+s_quadmask_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x29,0x8a,0xbe]
 
-s_quadmask_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x29,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_quadmask_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x29,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_quadmask_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x29,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_quadmask_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x29,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_movrels_b32 s0, s0
-// CHECK: [0x00,0x2a,0x80,0xbe]
+s_movrels_b32 s5, s1
+// CHECK: [0x01,0x2a,0x85,0xbe]
 
-s_movrels_b32 s101, s0
-// CHECK: [0x00,0x2a,0xe5,0xbe]
+s_movrels_b32 s101, s1
+// CHECK: [0x01,0x2a,0xe5,0xbe]
 
-s_movrels_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x2a,0xe6,0xbe]
+s_movrels_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x2a,0xe6,0xbe]
 
-s_movrels_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x2a,0xe7,0xbe]
+s_movrels_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x2a,0xe7,0xbe]
 
-s_movrels_b32 vcc_lo, s0
-// CHECK: [0x00,0x2a,0xea,0xbe]
+s_movrels_b32 vcc_lo, s1
+// CHECK: [0x01,0x2a,0xea,0xbe]
 
-s_movrels_b32 vcc_hi, s0
-// CHECK: [0x00,0x2a,0xeb,0xbe]
+s_movrels_b32 vcc_hi, s1
+// CHECK: [0x01,0x2a,0xeb,0xbe]
 
-s_movrels_b32 tba_lo, s0
-// CHECK: [0x00,0x2a,0xec,0xbe]
+s_movrels_b32 tba_lo, s1
+// CHECK: [0x01,0x2a,0xec,0xbe]
 
-s_movrels_b32 tba_hi, s0
-// CHECK: [0x00,0x2a,0xed,0xbe]
+s_movrels_b32 tba_hi, s1
+// CHECK: [0x01,0x2a,0xed,0xbe]
 
-s_movrels_b32 tma_lo, s0
-// CHECK: [0x00,0x2a,0xee,0xbe]
+s_movrels_b32 tma_lo, s1
+// CHECK: [0x01,0x2a,0xee,0xbe]
 
-s_movrels_b32 tma_hi, s0
-// CHECK: [0x00,0x2a,0xef,0xbe]
+s_movrels_b32 tma_hi, s1
+// CHECK: [0x01,0x2a,0xef,0xbe]
 
-s_movrels_b32 ttmp11, s0
-// CHECK: [0x00,0x2a,0xfb,0xbe]
+s_movrels_b32 ttmp11, s1
+// CHECK: [0x01,0x2a,0xfb,0xbe]
 
-s_movrels_b32 m0, s0
-// CHECK: [0x00,0x2a,0xfc,0xbe]
+s_movrels_b32 m0, s1
+// CHECK: [0x01,0x2a,0xfc,0xbe]
 
-s_movrels_b32 exec_lo, s0
-// CHECK: [0x00,0x2a,0xfe,0xbe]
+s_movrels_b32 exec_lo, s1
+// CHECK: [0x01,0x2a,0xfe,0xbe]
 
-s_movrels_b32 exec_hi, s0
-// CHECK: [0x00,0x2a,0xff,0xbe]
+s_movrels_b32 exec_hi, s1
+// CHECK: [0x01,0x2a,0xff,0xbe]
 
-s_movrels_b32 s0, s101
-// CHECK: [0x65,0x2a,0x80,0xbe]
+s_movrels_b32 s5, s101
+// CHECK: [0x65,0x2a,0x85,0xbe]
 
-s_movrels_b32 s0, flat_scratch_lo
-// CHECK: [0x66,0x2a,0x80,0xbe]
+s_movrels_b32 s5, flat_scratch_lo
+// CHECK: [0x66,0x2a,0x85,0xbe]
 
-s_movrels_b32 s0, flat_scratch_hi
-// CHECK: [0x67,0x2a,0x80,0xbe]
+s_movrels_b32 s5, flat_scratch_hi
+// CHECK: [0x67,0x2a,0x85,0xbe]
 
-s_movrels_b32 s0, vcc_lo
-// CHECK: [0x6a,0x2a,0x80,0xbe]
+s_movrels_b32 s5, vcc_lo
+// CHECK: [0x6a,0x2a,0x85,0xbe]
 
-s_movrels_b32 s0, vcc_hi
-// CHECK: [0x6b,0x2a,0x80,0xbe]
+s_movrels_b32 s5, vcc_hi
+// CHECK: [0x6b,0x2a,0x85,0xbe]
 
-s_movrels_b32 s0, tba_lo
-// CHECK: [0x6c,0x2a,0x80,0xbe]
+s_movrels_b32 s5, tba_lo
+// CHECK: [0x6c,0x2a,0x85,0xbe]
 
-s_movrels_b32 s0, tba_hi
-// CHECK: [0x6d,0x2a,0x80,0xbe]
+s_movrels_b32 s5, tba_hi
+// CHECK: [0x6d,0x2a,0x85,0xbe]
 
-s_movrels_b32 s0, tma_lo
-// CHECK: [0x6e,0x2a,0x80,0xbe]
+s_movrels_b32 s5, tma_lo
+// CHECK: [0x6e,0x2a,0x85,0xbe]
 
-s_movrels_b32 s0, tma_hi
-// CHECK: [0x6f,0x2a,0x80,0xbe]
+s_movrels_b32 s5, tma_hi
+// CHECK: [0x6f,0x2a,0x85,0xbe]
 
-s_movrels_b32 s0, ttmp11
-// CHECK: [0x7b,0x2a,0x80,0xbe]
+s_movrels_b32 s5, ttmp11
+// CHECK: [0x7b,0x2a,0x85,0xbe]
 
-s_movrels_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x2b,0x80,0xbe]
+s_movrels_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x2b,0x8a,0xbe]
 
-s_movrels_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x2b,0x82,0xbe]
+s_movrels_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x2b,0x8c,0xbe]
 
-s_movrels_b64 s[100:101], s[0:1]
-// CHECK: [0x00,0x2b,0xe4,0xbe]
+s_movrels_b64 s[100:101], s[2:3]
+// CHECK: [0x02,0x2b,0xe4,0xbe]
 
-s_movrels_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x2b,0xe6,0xbe]
+s_movrels_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x2b,0xe6,0xbe]
 
-s_movrels_b64 vcc, s[0:1]
-// CHECK: [0x00,0x2b,0xea,0xbe]
+s_movrels_b64 vcc, s[2:3]
+// CHECK: [0x02,0x2b,0xea,0xbe]
 
-s_movrels_b64 tba, s[0:1]
-// CHECK: [0x00,0x2b,0xec,0xbe]
+s_movrels_b64 tba, s[2:3]
+// CHECK: [0x02,0x2b,0xec,0xbe]
 
-s_movrels_b64 tma, s[0:1]
-// CHECK: [0x00,0x2b,0xee,0xbe]
+s_movrels_b64 tma, s[2:3]
+// CHECK: [0x02,0x2b,0xee,0xbe]
 
-s_movrels_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x2b,0xfa,0xbe]
+s_movrels_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x2b,0xfa,0xbe]
 
-s_movrels_b64 exec, s[0:1]
-// CHECK: [0x00,0x2b,0xfe,0xbe]
+s_movrels_b64 exec, s[2:3]
+// CHECK: [0x02,0x2b,0xfe,0xbe]
 
-s_movrels_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x2b,0x80,0xbe]
+s_movrels_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x2b,0x8a,0xbe]
 
-s_movrels_b64 s[0:1], s[100:101]
-// CHECK: [0x64,0x2b,0x80,0xbe]
+s_movrels_b64 s[10:11], s[100:101]
+// CHECK: [0x64,0x2b,0x8a,0xbe]
 
-s_movrels_b64 s[0:1], flat_scratch
-// CHECK: [0x66,0x2b,0x80,0xbe]
+s_movrels_b64 s[10:11], flat_scratch
+// CHECK: [0x66,0x2b,0x8a,0xbe]
 
-s_movrels_b64 s[0:1], vcc
-// CHECK: [0x6a,0x2b,0x80,0xbe]
+s_movrels_b64 s[10:11], vcc
+// CHECK: [0x6a,0x2b,0x8a,0xbe]
 
-s_movrels_b64 s[0:1], tba
-// CHECK: [0x6c,0x2b,0x80,0xbe]
+s_movrels_b64 s[10:11], tba
+// CHECK: [0x6c,0x2b,0x8a,0xbe]
 
-s_movrels_b64 s[0:1], tma
-// CHECK: [0x6e,0x2b,0x80,0xbe]
+s_movrels_b64 s[10:11], tma
+// CHECK: [0x6e,0x2b,0x8a,0xbe]
 
-s_movrels_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x2b,0x80,0xbe]
+s_movrels_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x2b,0x8a,0xbe]
 
-s_movreld_b32 s0, s0
-// CHECK: [0x00,0x2c,0x80,0xbe]
+s_movreld_b32 s5, s1
+// CHECK: [0x01,0x2c,0x85,0xbe]
 
-s_movreld_b32 s101, s0
-// CHECK: [0x00,0x2c,0xe5,0xbe]
+s_movreld_b32 s101, s1
+// CHECK: [0x01,0x2c,0xe5,0xbe]
 
-s_movreld_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x2c,0xe6,0xbe]
+s_movreld_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x2c,0xe6,0xbe]
 
-s_movreld_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x2c,0xe7,0xbe]
+s_movreld_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x2c,0xe7,0xbe]
 
-s_movreld_b32 vcc_lo, s0
-// CHECK: [0x00,0x2c,0xea,0xbe]
+s_movreld_b32 vcc_lo, s1
+// CHECK: [0x01,0x2c,0xea,0xbe]
 
-s_movreld_b32 vcc_hi, s0
-// CHECK: [0x00,0x2c,0xeb,0xbe]
+s_movreld_b32 vcc_hi, s1
+// CHECK: [0x01,0x2c,0xeb,0xbe]
 
-s_movreld_b32 tba_lo, s0
-// CHECK: [0x00,0x2c,0xec,0xbe]
+s_movreld_b32 tba_lo, s1
+// CHECK: [0x01,0x2c,0xec,0xbe]
 
-s_movreld_b32 tba_hi, s0
-// CHECK: [0x00,0x2c,0xed,0xbe]
+s_movreld_b32 tba_hi, s1
+// CHECK: [0x01,0x2c,0xed,0xbe]
 
-s_movreld_b32 tma_lo, s0
-// CHECK: [0x00,0x2c,0xee,0xbe]
+s_movreld_b32 tma_lo, s1
+// CHECK: [0x01,0x2c,0xee,0xbe]
 
-s_movreld_b32 tma_hi, s0
-// CHECK: [0x00,0x2c,0xef,0xbe]
+s_movreld_b32 tma_hi, s1
+// CHECK: [0x01,0x2c,0xef,0xbe]
 
-s_movreld_b32 ttmp11, s0
-// CHECK: [0x00,0x2c,0xfb,0xbe]
+s_movreld_b32 ttmp11, s1
+// CHECK: [0x01,0x2c,0xfb,0xbe]
 
-s_movreld_b32 s0, s101
-// CHECK: [0x65,0x2c,0x80,0xbe]
+s_movreld_b32 s5, s101
+// CHECK: [0x65,0x2c,0x85,0xbe]
 
-s_movreld_b32 s0, flat_scratch_lo
-// CHECK: [0x66,0x2c,0x80,0xbe]
+s_movreld_b32 s5, flat_scratch_lo
+// CHECK: [0x66,0x2c,0x85,0xbe]
 
-s_movreld_b32 s0, flat_scratch_hi
-// CHECK: [0x67,0x2c,0x80,0xbe]
+s_movreld_b32 s5, flat_scratch_hi
+// CHECK: [0x67,0x2c,0x85,0xbe]
 
-s_movreld_b32 s0, vcc_lo
-// CHECK: [0x6a,0x2c,0x80,0xbe]
+s_movreld_b32 s5, vcc_lo
+// CHECK: [0x6a,0x2c,0x85,0xbe]
 
-s_movreld_b32 s0, vcc_hi
-// CHECK: [0x6b,0x2c,0x80,0xbe]
+s_movreld_b32 s5, vcc_hi
+// CHECK: [0x6b,0x2c,0x85,0xbe]
 
-s_movreld_b32 s0, tba_lo
-// CHECK: [0x6c,0x2c,0x80,0xbe]
+s_movreld_b32 s5, tba_lo
+// CHECK: [0x6c,0x2c,0x85,0xbe]
 
-s_movreld_b32 s0, tba_hi
-// CHECK: [0x6d,0x2c,0x80,0xbe]
+s_movreld_b32 s5, tba_hi
+// CHECK: [0x6d,0x2c,0x85,0xbe]
 
-s_movreld_b32 s0, tma_lo
-// CHECK: [0x6e,0x2c,0x80,0xbe]
+s_movreld_b32 s5, tma_lo
+// CHECK: [0x6e,0x2c,0x85,0xbe]
 
-s_movreld_b32 s0, tma_hi
-// CHECK: [0x6f,0x2c,0x80,0xbe]
+s_movreld_b32 s5, tma_hi
+// CHECK: [0x6f,0x2c,0x85,0xbe]
 
-s_movreld_b32 s0, ttmp11
-// CHECK: [0x7b,0x2c,0x80,0xbe]
+s_movreld_b32 s5, ttmp11
+// CHECK: [0x7b,0x2c,0x85,0xbe]
 
-s_movreld_b32 s0, m0
-// CHECK: [0x7c,0x2c,0x80,0xbe]
+s_movreld_b32 s5, m0
+// CHECK: [0x7c,0x2c,0x85,0xbe]
 
-s_movreld_b32 s0, 0
-// CHECK: [0x80,0x2c,0x80,0xbe]
+s_movreld_b32 s5, 0
+// CHECK: [0x80,0x2c,0x85,0xbe]
 
-s_movreld_b32 s0, -1
-// CHECK: [0xc1,0x2c,0x80,0xbe]
+s_movreld_b32 s5, -1
+// CHECK: [0xc1,0x2c,0x85,0xbe]
 
-s_movreld_b32 s0, 0.5
-// CHECK: [0xf0,0x2c,0x80,0xbe]
+s_movreld_b32 s5, 0.5
+// CHECK: [0xf0,0x2c,0x85,0xbe]
 
-s_movreld_b32 s0, -4.0
-// CHECK: [0xf7,0x2c,0x80,0xbe]
+s_movreld_b32 s5, -4.0
+// CHECK: [0xf7,0x2c,0x85,0xbe]
 
-s_movreld_b32 s0, 0xaf123456
-// CHECK: [0xff,0x2c,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_movreld_b32 s5, 0xaf123456
+// CHECK: [0xff,0x2c,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_movreld_b32 s0, 0x3f717273
-// CHECK: [0xff,0x2c,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_movreld_b32 s5, 0x3f717273
+// CHECK: [0xff,0x2c,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_movreld_b64 s[0:1], s[0:1]
-// CHECK: [0x00,0x2d,0x80,0xbe]
+s_movreld_b64 s[10:11], s[2:3]
+// CHECK: [0x02,0x2d,0x8a,0xbe]
 
-s_movreld_b64 s[2:3], s[0:1]
-// CHECK: [0x00,0x2d,0x82,0xbe]
+s_movreld_b64 s[12:13], s[2:3]
+// CHECK: [0x02,0x2d,0x8c,0xbe]
 
-s_movreld_b64 s[100:101], s[0:1]
-// CHECK: [0x00,0x2d,0xe4,0xbe]
+s_movreld_b64 s[100:101], s[2:3]
+// CHECK: [0x02,0x2d,0xe4,0xbe]
 
-s_movreld_b64 flat_scratch, s[0:1]
-// CHECK: [0x00,0x2d,0xe6,0xbe]
+s_movreld_b64 flat_scratch, s[2:3]
+// CHECK: [0x02,0x2d,0xe6,0xbe]
 
-s_movreld_b64 vcc, s[0:1]
-// CHECK: [0x00,0x2d,0xea,0xbe]
+s_movreld_b64 vcc, s[2:3]
+// CHECK: [0x02,0x2d,0xea,0xbe]
 
-s_movreld_b64 tba, s[0:1]
-// CHECK: [0x00,0x2d,0xec,0xbe]
+s_movreld_b64 tba, s[2:3]
+// CHECK: [0x02,0x2d,0xec,0xbe]
 
-s_movreld_b64 tma, s[0:1]
-// CHECK: [0x00,0x2d,0xee,0xbe]
+s_movreld_b64 tma, s[2:3]
+// CHECK: [0x02,0x2d,0xee,0xbe]
 
-s_movreld_b64 ttmp[10:11], s[0:1]
-// CHECK: [0x00,0x2d,0xfa,0xbe]
+s_movreld_b64 ttmp[10:11], s[2:3]
+// CHECK: [0x02,0x2d,0xfa,0xbe]
 
-s_movreld_b64 s[0:1], s[2:3]
-// CHECK: [0x02,0x2d,0x80,0xbe]
+s_movreld_b64 s[10:11], s[4:5]
+// CHECK: [0x04,0x2d,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], s[100:101]
-// CHECK: [0x64,0x2d,0x80,0xbe]
+s_movreld_b64 s[10:11], s[100:101]
+// CHECK: [0x64,0x2d,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], flat_scratch
-// CHECK: [0x66,0x2d,0x80,0xbe]
+s_movreld_b64 s[10:11], flat_scratch
+// CHECK: [0x66,0x2d,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], vcc
-// CHECK: [0x6a,0x2d,0x80,0xbe]
+s_movreld_b64 s[10:11], vcc
+// CHECK: [0x6a,0x2d,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], tba
-// CHECK: [0x6c,0x2d,0x80,0xbe]
+s_movreld_b64 s[10:11], tba
+// CHECK: [0x6c,0x2d,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], tma
-// CHECK: [0x6e,0x2d,0x80,0xbe]
+s_movreld_b64 s[10:11], tma
+// CHECK: [0x6e,0x2d,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x2d,0x80,0xbe]
+s_movreld_b64 s[10:11], ttmp[10:11]
+// CHECK: [0x7a,0x2d,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], 0
-// CHECK: [0x80,0x2d,0x80,0xbe]
+s_movreld_b64 s[10:11], 0
+// CHECK: [0x80,0x2d,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], -1
-// CHECK: [0xc1,0x2d,0x80,0xbe]
+s_movreld_b64 s[10:11], -1
+// CHECK: [0xc1,0x2d,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], 0.5
-// CHECK: [0xf0,0x2d,0x80,0xbe]
+s_movreld_b64 s[10:11], 0.5
+// CHECK: [0xf0,0x2d,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], -4.0
-// CHECK: [0xf7,0x2d,0x80,0xbe]
+s_movreld_b64 s[10:11], -4.0
+// CHECK: [0xf7,0x2d,0x8a,0xbe]
 
-s_movreld_b64 s[0:1], 0xaf123456
-// CHECK: [0xff,0x2d,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_movreld_b64 s[10:11], 0xaf123456
+// CHECK: [0xff,0x2d,0x8a,0xbe,0x56,0x34,0x12,0xaf]
 
-s_movreld_b64 s[0:1], 0x3f717273
-// CHECK: [0xff,0x2d,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_movreld_b64 s[10:11], 0x3f717273
+// CHECK: [0xff,0x2d,0x8a,0xbe,0x73,0x72,0x71,0x3f]
 
-s_abs_i32 s0, s0
-// CHECK: [0x00,0x30,0x80,0xbe]
+s_abs_i32 s5, s1
+// CHECK: [0x01,0x30,0x85,0xbe]
 
-s_abs_i32 s101, s0
-// CHECK: [0x00,0x30,0xe5,0xbe]
+s_abs_i32 s101, s1
+// CHECK: [0x01,0x30,0xe5,0xbe]
 
-s_abs_i32 flat_scratch_lo, s0
-// CHECK: [0x00,0x30,0xe6,0xbe]
+s_abs_i32 flat_scratch_lo, s1
+// CHECK: [0x01,0x30,0xe6,0xbe]
 
-s_abs_i32 flat_scratch_hi, s0
-// CHECK: [0x00,0x30,0xe7,0xbe]
+s_abs_i32 flat_scratch_hi, s1
+// CHECK: [0x01,0x30,0xe7,0xbe]
 
-s_abs_i32 vcc_lo, s0
-// CHECK: [0x00,0x30,0xea,0xbe]
+s_abs_i32 vcc_lo, s1
+// CHECK: [0x01,0x30,0xea,0xbe]
 
-s_abs_i32 vcc_hi, s0
-// CHECK: [0x00,0x30,0xeb,0xbe]
+s_abs_i32 vcc_hi, s1
+// CHECK: [0x01,0x30,0xeb,0xbe]
 
-s_abs_i32 tba_lo, s0
-// CHECK: [0x00,0x30,0xec,0xbe]
+s_abs_i32 tba_lo, s1
+// CHECK: [0x01,0x30,0xec,0xbe]
 
-s_abs_i32 tba_hi, s0
-// CHECK: [0x00,0x30,0xed,0xbe]
+s_abs_i32 tba_hi, s1
+// CHECK: [0x01,0x30,0xed,0xbe]
 
-s_abs_i32 tma_lo, s0
-// CHECK: [0x00,0x30,0xee,0xbe]
+s_abs_i32 tma_lo, s1
+// CHECK: [0x01,0x30,0xee,0xbe]
 
-s_abs_i32 tma_hi, s0
-// CHECK: [0x00,0x30,0xef,0xbe]
+s_abs_i32 tma_hi, s1
+// CHECK: [0x01,0x30,0xef,0xbe]
 
-s_abs_i32 ttmp11, s0
-// CHECK: [0x00,0x30,0xfb,0xbe]
+s_abs_i32 ttmp11, s1
+// CHECK: [0x01,0x30,0xfb,0xbe]
 
-s_abs_i32 m0, s0
-// CHECK: [0x00,0x30,0xfc,0xbe]
+s_abs_i32 m0, s1
+// CHECK: [0x01,0x30,0xfc,0xbe]
 
-s_abs_i32 exec_lo, s0
-// CHECK: [0x00,0x30,0xfe,0xbe]
+s_abs_i32 exec_lo, s1
+// CHECK: [0x01,0x30,0xfe,0xbe]
 
-s_abs_i32 exec_hi, s0
-// CHECK: [0x00,0x30,0xff,0xbe]
+s_abs_i32 exec_hi, s1
+// CHECK: [0x01,0x30,0xff,0xbe]
 
-s_abs_i32 s0, s101
-// CHECK: [0x65,0x30,0x80,0xbe]
+s_abs_i32 s5, s101
+// CHECK: [0x65,0x30,0x85,0xbe]
 
-s_abs_i32 s0, flat_scratch_lo
-// CHECK: [0x66,0x30,0x80,0xbe]
+s_abs_i32 s5, flat_scratch_lo
+// CHECK: [0x66,0x30,0x85,0xbe]
 
-s_abs_i32 s0, flat_scratch_hi
-// CHECK: [0x67,0x30,0x80,0xbe]
+s_abs_i32 s5, flat_scratch_hi
+// CHECK: [0x67,0x30,0x85,0xbe]
 
-s_abs_i32 s0, vcc_lo
-// CHECK: [0x6a,0x30,0x80,0xbe]
+s_abs_i32 s5, vcc_lo
+// CHECK: [0x6a,0x30,0x85,0xbe]
 
-s_abs_i32 s0, vcc_hi
-// CHECK: [0x6b,0x30,0x80,0xbe]
+s_abs_i32 s5, vcc_hi
+// CHECK: [0x6b,0x30,0x85,0xbe]
 
-s_abs_i32 s0, tba_lo
-// CHECK: [0x6c,0x30,0x80,0xbe]
+s_abs_i32 s5, tba_lo
+// CHECK: [0x6c,0x30,0x85,0xbe]
 
-s_abs_i32 s0, tba_hi
-// CHECK: [0x6d,0x30,0x80,0xbe]
+s_abs_i32 s5, tba_hi
+// CHECK: [0x6d,0x30,0x85,0xbe]
 
-s_abs_i32 s0, tma_lo
-// CHECK: [0x6e,0x30,0x80,0xbe]
+s_abs_i32 s5, tma_lo
+// CHECK: [0x6e,0x30,0x85,0xbe]
 
-s_abs_i32 s0, tma_hi
-// CHECK: [0x6f,0x30,0x80,0xbe]
+s_abs_i32 s5, tma_hi
+// CHECK: [0x6f,0x30,0x85,0xbe]
 
-s_abs_i32 s0, ttmp11
-// CHECK: [0x7b,0x30,0x80,0xbe]
+s_abs_i32 s5, ttmp11
+// CHECK: [0x7b,0x30,0x85,0xbe]
 
-s_abs_i32 s0, m0
-// CHECK: [0x7c,0x30,0x80,0xbe]
+s_abs_i32 s5, m0
+// CHECK: [0x7c,0x30,0x85,0xbe]
 
-s_abs_i32 s0, exec_lo
-// CHECK: [0x7e,0x30,0x80,0xbe]
+s_abs_i32 s5, exec_lo
+// CHECK: [0x7e,0x30,0x85,0xbe]
 
-s_abs_i32 s0, exec_hi
-// CHECK: [0x7f,0x30,0x80,0xbe]
+s_abs_i32 s5, exec_hi
+// CHECK: [0x7f,0x30,0x85,0xbe]
 
-s_abs_i32 s0, 0
-// CHECK: [0x80,0x30,0x80,0xbe]
+s_abs_i32 s5, 0
+// CHECK: [0x80,0x30,0x85,0xbe]
 
-s_abs_i32 s0, -1
-// CHECK: [0xc1,0x30,0x80,0xbe]
+s_abs_i32 s5, -1
+// CHECK: [0xc1,0x30,0x85,0xbe]
 
-s_abs_i32 s0, 0.5
-// CHECK: [0xf0,0x30,0x80,0xbe]
+s_abs_i32 s5, 0.5
+// CHECK: [0xf0,0x30,0x85,0xbe]
 
-s_abs_i32 s0, -4.0
-// CHECK: [0xf7,0x30,0x80,0xbe]
+s_abs_i32 s5, -4.0
+// CHECK: [0xf7,0x30,0x85,0xbe]
 
-s_abs_i32 s0, 0xaf123456
-// CHECK: [0xff,0x30,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_abs_i32 s5, 0xaf123456
+// CHECK: [0xff,0x30,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_abs_i32 s0, 0x3f717273
-// CHECK: [0xff,0x30,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_abs_i32 s5, 0x3f717273
+// CHECK: [0xff,0x30,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_mov_fed_b32 s0, s0
-// CHECK: [0x00,0x31,0x80,0xbe]
+s_mov_fed_b32 s5, s1
+// CHECK: [0x01,0x31,0x85,0xbe]
 
-s_mov_fed_b32 s101, s0
-// CHECK: [0x00,0x31,0xe5,0xbe]
+s_mov_fed_b32 s101, s1
+// CHECK: [0x01,0x31,0xe5,0xbe]
 
-s_mov_fed_b32 flat_scratch_lo, s0
-// CHECK: [0x00,0x31,0xe6,0xbe]
+s_mov_fed_b32 flat_scratch_lo, s1
+// CHECK: [0x01,0x31,0xe6,0xbe]
 
-s_mov_fed_b32 flat_scratch_hi, s0
-// CHECK: [0x00,0x31,0xe7,0xbe]
+s_mov_fed_b32 flat_scratch_hi, s1
+// CHECK: [0x01,0x31,0xe7,0xbe]
 
-s_mov_fed_b32 vcc_lo, s0
-// CHECK: [0x00,0x31,0xea,0xbe]
+s_mov_fed_b32 vcc_lo, s1
+// CHECK: [0x01,0x31,0xea,0xbe]
 
-s_mov_fed_b32 vcc_hi, s0
-// CHECK: [0x00,0x31,0xeb,0xbe]
+s_mov_fed_b32 vcc_hi, s1
+// CHECK: [0x01,0x31,0xeb,0xbe]
 
-s_mov_fed_b32 tba_lo, s0
-// CHECK: [0x00,0x31,0xec,0xbe]
+s_mov_fed_b32 tba_lo, s1
+// CHECK: [0x01,0x31,0xec,0xbe]
 
-s_mov_fed_b32 tba_hi, s0
-// CHECK: [0x00,0x31,0xed,0xbe]
+s_mov_fed_b32 tba_hi, s1
+// CHECK: [0x01,0x31,0xed,0xbe]
 
-s_mov_fed_b32 tma_lo, s0
-// CHECK: [0x00,0x31,0xee,0xbe]
+s_mov_fed_b32 tma_lo, s1
+// CHECK: [0x01,0x31,0xee,0xbe]
 
-s_mov_fed_b32 tma_hi, s0
-// CHECK: [0x00,0x31,0xef,0xbe]
+s_mov_fed_b32 tma_hi, s1
+// CHECK: [0x01,0x31,0xef,0xbe]
 
-s_mov_fed_b32 ttmp11, s0
-// CHECK: [0x00,0x31,0xfb,0xbe]
+s_mov_fed_b32 ttmp11, s1
+// CHECK: [0x01,0x31,0xfb,0xbe]
 
-s_mov_fed_b32 m0, s0
-// CHECK: [0x00,0x31,0xfc,0xbe]
+s_mov_fed_b32 m0, s1
+// CHECK: [0x01,0x31,0xfc,0xbe]
 
-s_mov_fed_b32 exec_lo, s0
-// CHECK: [0x00,0x31,0xfe,0xbe]
+s_mov_fed_b32 exec_lo, s1
+// CHECK: [0x01,0x31,0xfe,0xbe]
 
-s_mov_fed_b32 exec_hi, s0
-// CHECK: [0x00,0x31,0xff,0xbe]
+s_mov_fed_b32 exec_hi, s1
+// CHECK: [0x01,0x31,0xff,0xbe]
 
-s_mov_fed_b32 s0, s101
-// CHECK: [0x65,0x31,0x80,0xbe]
+s_mov_fed_b32 s5, s101
+// CHECK: [0x65,0x31,0x85,0xbe]
 
-s_mov_fed_b32 s0, flat_scratch_lo
-// CHECK: [0x66,0x31,0x80,0xbe]
+s_mov_fed_b32 s5, flat_scratch_lo
+// CHECK: [0x66,0x31,0x85,0xbe]
 
-s_mov_fed_b32 s0, flat_scratch_hi
-// CHECK: [0x67,0x31,0x80,0xbe]
+s_mov_fed_b32 s5, flat_scratch_hi
+// CHECK: [0x67,0x31,0x85,0xbe]
 
-s_mov_fed_b32 s0, vcc_lo
-// CHECK: [0x6a,0x31,0x80,0xbe]
+s_mov_fed_b32 s5, vcc_lo
+// CHECK: [0x6a,0x31,0x85,0xbe]
 
-s_mov_fed_b32 s0, vcc_hi
-// CHECK: [0x6b,0x31,0x80,0xbe]
+s_mov_fed_b32 s5, vcc_hi
+// CHECK: [0x6b,0x31,0x85,0xbe]
 
-s_mov_fed_b32 s0, tba_lo
-// CHECK: [0x6c,0x31,0x80,0xbe]
+s_mov_fed_b32 s5, tba_lo
+// CHECK: [0x6c,0x31,0x85,0xbe]
 
-s_mov_fed_b32 s0, tba_hi
-// CHECK: [0x6d,0x31,0x80,0xbe]
+s_mov_fed_b32 s5, tba_hi
+// CHECK: [0x6d,0x31,0x85,0xbe]
 
-s_mov_fed_b32 s0, tma_lo
-// CHECK: [0x6e,0x31,0x80,0xbe]
+s_mov_fed_b32 s5, tma_lo
+// CHECK: [0x6e,0x31,0x85,0xbe]
 
-s_mov_fed_b32 s0, tma_hi
-// CHECK: [0x6f,0x31,0x80,0xbe]
+s_mov_fed_b32 s5, tma_hi
+// CHECK: [0x6f,0x31,0x85,0xbe]
 
-s_mov_fed_b32 s0, ttmp11
-// CHECK: [0x7b,0x31,0x80,0xbe]
+s_mov_fed_b32 s5, ttmp11
+// CHECK: [0x7b,0x31,0x85,0xbe]
 
-s_mov_fed_b32 s0, m0
-// CHECK: [0x7c,0x31,0x80,0xbe]
+s_mov_fed_b32 s5, m0
+// CHECK: [0x7c,0x31,0x85,0xbe]
 
-s_mov_fed_b32 s0, exec_lo
-// CHECK: [0x7e,0x31,0x80,0xbe]
+s_mov_fed_b32 s5, exec_lo
+// CHECK: [0x7e,0x31,0x85,0xbe]
 
-s_mov_fed_b32 s0, exec_hi
-// CHECK: [0x7f,0x31,0x80,0xbe]
+s_mov_fed_b32 s5, exec_hi
+// CHECK: [0x7f,0x31,0x85,0xbe]
 
-s_mov_fed_b32 s0, 0
-// CHECK: [0x80,0x31,0x80,0xbe]
+s_mov_fed_b32 s5, 0
+// CHECK: [0x80,0x31,0x85,0xbe]
 
-s_mov_fed_b32 s0, -1
-// CHECK: [0xc1,0x31,0x80,0xbe]
+s_mov_fed_b32 s5, -1
+// CHECK: [0xc1,0x31,0x85,0xbe]
 
-s_mov_fed_b32 s0, 0.5
-// CHECK: [0xf0,0x31,0x80,0xbe]
+s_mov_fed_b32 s5, 0.5
+// CHECK: [0xf0,0x31,0x85,0xbe]
 
-s_mov_fed_b32 s0, -4.0
-// CHECK: [0xf7,0x31,0x80,0xbe]
+s_mov_fed_b32 s5, -4.0
+// CHECK: [0xf7,0x31,0x85,0xbe]
 
-s_mov_fed_b32 s0, 0xaf123456
-// CHECK: [0xff,0x31,0x80,0xbe,0x56,0x34,0x12,0xaf]
+s_mov_fed_b32 s5, 0xaf123456
+// CHECK: [0xff,0x31,0x85,0xbe,0x56,0x34,0x12,0xaf]
 
-s_mov_fed_b32 s0, 0x3f717273
-// CHECK: [0xff,0x31,0x80,0xbe,0x73,0x72,0x71,0x3f]
+s_mov_fed_b32 s5, 0x3f717273
+// CHECK: [0xff,0x31,0x85,0xbe,0x73,0x72,0x71,0x3f]
 
-s_set_gpr_idx_idx s0
-// CHECK: [0x00,0x32,0x80,0xbe]
+s_set_gpr_idx_idx s1
+// CHECK: [0x01,0x32,0x80,0xbe]
 
 s_set_gpr_idx_idx s101
 // CHECK: [0x65,0x32,0x80,0xbe]
@@ -13863,8000 +14024,8000 @@ s_set_gpr_idx_idx 0xaf123456
 s_set_gpr_idx_idx 0x3f717273
 // CHECK: [0xff,0x32,0x80,0xbe,0x73,0x72,0x71,0x3f]
 
-s_add_u32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x80]
+s_add_u32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x80]
 
-s_add_u32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x80]
+s_add_u32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x80]
 
-s_add_u32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x80]
+s_add_u32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x80]
 
-s_add_u32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x80]
+s_add_u32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x80]
 
-s_add_u32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x80]
+s_add_u32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x80]
 
-s_add_u32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x80]
+s_add_u32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x80]
 
-s_add_u32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x80]
+s_add_u32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x80]
 
-s_add_u32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x80]
+s_add_u32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x80]
 
-s_add_u32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x80]
+s_add_u32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x80]
 
-s_add_u32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x80]
+s_add_u32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x80]
 
-s_add_u32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x80]
+s_add_u32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x80]
 
-s_add_u32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x80]
+s_add_u32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x80]
 
-s_add_u32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x80]
+s_add_u32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x80]
 
-s_add_u32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x80]
+s_add_u32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x80]
 
-s_add_u32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x80]
+s_add_u32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x80]
 
-s_add_u32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x80]
+s_add_u32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x80]
 
-s_add_u32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x80]
+s_add_u32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x80]
 
-s_add_u32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x80]
+s_add_u32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x80]
 
-s_add_u32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x80]
+s_add_u32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x80]
 
-s_add_u32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x80]
+s_add_u32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x80]
 
-s_add_u32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x80]
+s_add_u32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x80]
 
-s_add_u32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x80]
+s_add_u32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x80]
 
-s_add_u32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x80]
+s_add_u32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x80]
 
-s_add_u32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x80]
+s_add_u32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x80]
 
-s_add_u32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x80]
+s_add_u32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x80]
 
-s_add_u32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x80]
+s_add_u32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x80]
 
-s_add_u32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x80]
+s_add_u32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x80]
 
-s_add_u32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x80]
+s_add_u32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x80]
 
-s_add_u32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x80]
+s_add_u32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x80]
 
-s_add_u32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x80]
+s_add_u32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x80]
 
-s_add_u32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x80]
+s_add_u32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x80]
 
-s_add_u32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x80,0x56,0x34,0x12,0xaf]
+s_add_u32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x80,0x56,0x34,0x12,0xaf]
 
-s_add_u32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x80,0x73,0x72,0x71,0x3f]
+s_add_u32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x80,0x73,0x72,0x71,0x3f]
 
-s_add_u32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x80]
+s_add_u32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x80]
 
-s_add_u32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x80]
+s_add_u32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x80]
 
-s_add_u32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x80]
+s_add_u32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x80]
 
-s_add_u32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x80]
+s_add_u32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x80]
 
-s_add_u32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x80]
+s_add_u32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x80]
 
-s_add_u32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x80]
+s_add_u32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x80]
 
-s_add_u32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x80]
+s_add_u32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x80]
 
-s_add_u32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x80]
+s_add_u32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x80]
 
-s_add_u32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x80]
+s_add_u32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x80]
 
-s_add_u32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x80]
+s_add_u32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x80]
 
-s_add_u32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x80]
+s_add_u32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x80]
 
-s_add_u32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x80]
+s_add_u32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x80]
 
-s_add_u32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x80]
+s_add_u32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x80]
 
-s_add_u32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x80]
+s_add_u32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x80]
 
-s_add_u32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x80]
+s_add_u32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x80]
 
-s_add_u32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x80]
+s_add_u32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x80]
 
-s_add_u32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x80]
+s_add_u32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x80]
 
-s_add_u32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x80,0x56,0x34,0x12,0xaf]
+s_add_u32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x80,0x56,0x34,0x12,0xaf]
 
-s_add_u32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x80,0x73,0x72,0x71,0x3f]
+s_add_u32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x80,0x73,0x72,0x71,0x3f]
 
-s_sub_u32 s0, s0, s0
-// CHECK: [0x00,0x00,0x80,0x80]
+s_sub_u32 s5, s1, s2
+// CHECK: [0x01,0x02,0x85,0x80]
 
-s_sub_u32 s101, s0, s0
-// CHECK: [0x00,0x00,0xe5,0x80]
+s_sub_u32 s101, s1, s2
+// CHECK: [0x01,0x02,0xe5,0x80]
 
-s_sub_u32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0xe6,0x80]
+s_sub_u32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0xe6,0x80]
 
-s_sub_u32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0xe7,0x80]
+s_sub_u32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0xe7,0x80]
 
-s_sub_u32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0xea,0x80]
+s_sub_u32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0xea,0x80]
 
-s_sub_u32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0xeb,0x80]
+s_sub_u32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0xeb,0x80]
 
-s_sub_u32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0xec,0x80]
+s_sub_u32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0xec,0x80]
 
-s_sub_u32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0xed,0x80]
+s_sub_u32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0xed,0x80]
 
-s_sub_u32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0xee,0x80]
+s_sub_u32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0xee,0x80]
 
-s_sub_u32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0xef,0x80]
+s_sub_u32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0xef,0x80]
 
-s_sub_u32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0xfb,0x80]
+s_sub_u32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0xfb,0x80]
 
-s_sub_u32 m0, s0, s0
-// CHECK: [0x00,0x00,0xfc,0x80]
+s_sub_u32 m0, s1, s2
+// CHECK: [0x01,0x02,0xfc,0x80]
 
-s_sub_u32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0xfe,0x80]
+s_sub_u32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0xfe,0x80]
 
-s_sub_u32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0xff,0x80]
+s_sub_u32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0xff,0x80]
 
-s_sub_u32 s0, s101, s0
-// CHECK: [0x65,0x00,0x80,0x80]
+s_sub_u32 s5, s101, s2
+// CHECK: [0x65,0x02,0x85,0x80]
 
-s_sub_u32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x80,0x80]
+s_sub_u32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x85,0x80]
 
-s_sub_u32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x80,0x80]
+s_sub_u32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x85,0x80]
 
-s_sub_u32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x80,0x80]
+s_sub_u32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x85,0x80]
 
-s_sub_u32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x80,0x80]
+s_sub_u32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x85,0x80]
 
-s_sub_u32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x80,0x80]
+s_sub_u32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x85,0x80]
 
-s_sub_u32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x80,0x80]
+s_sub_u32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x85,0x80]
 
-s_sub_u32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x80,0x80]
+s_sub_u32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x85,0x80]
 
-s_sub_u32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x80,0x80]
+s_sub_u32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x85,0x80]
 
-s_sub_u32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x80,0x80]
+s_sub_u32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x85,0x80]
 
-s_sub_u32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x80,0x80]
+s_sub_u32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x85,0x80]
 
-s_sub_u32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x80,0x80]
+s_sub_u32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x85,0x80]
 
-s_sub_u32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x80,0x80]
+s_sub_u32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x85,0x80]
 
-s_sub_u32 s0, 0, s0
-// CHECK: [0x80,0x00,0x80,0x80]
+s_sub_u32 s5, 0, s2
+// CHECK: [0x80,0x02,0x85,0x80]
 
-s_sub_u32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x80,0x80]
+s_sub_u32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x85,0x80]
 
-s_sub_u32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x80]
+s_sub_u32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x85,0x80]
 
-s_sub_u32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x80]
+s_sub_u32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x85,0x80]
 
-s_sub_u32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x80,0x56,0x34,0x12,0xaf]
+s_sub_u32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x85,0x80,0x56,0x34,0x12,0xaf]
 
-s_sub_u32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x80,0x73,0x72,0x71,0x3f]
+s_sub_u32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x85,0x80,0x73,0x72,0x71,0x3f]
 
-s_sub_u32 s0, s0, s101
-// CHECK: [0x00,0x65,0x80,0x80]
+s_sub_u32 s5, s1, s101
+// CHECK: [0x01,0x65,0x85,0x80]
 
-s_sub_u32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x80,0x80]
+s_sub_u32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x85,0x80]
 
-s_sub_u32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x80,0x80]
+s_sub_u32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x85,0x80]
 
-s_sub_u32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x80]
+s_sub_u32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x85,0x80]
 
-s_sub_u32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x80]
+s_sub_u32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x85,0x80]
 
-s_sub_u32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x80,0x80]
+s_sub_u32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x85,0x80]
 
-s_sub_u32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x80,0x80]
+s_sub_u32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x85,0x80]
 
-s_sub_u32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x80,0x80]
+s_sub_u32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x85,0x80]
 
-s_sub_u32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x80,0x80]
+s_sub_u32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x85,0x80]
 
-s_sub_u32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x80,0x80]
+s_sub_u32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x85,0x80]
 
-s_sub_u32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x80,0x80]
+s_sub_u32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x85,0x80]
 
-s_sub_u32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x80,0x80]
+s_sub_u32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x85,0x80]
 
-s_sub_u32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x80,0x80]
+s_sub_u32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x85,0x80]
 
-s_sub_u32 s0, s0, 0
-// CHECK: [0x00,0x80,0x80,0x80]
+s_sub_u32 s5, s1, 0
+// CHECK: [0x01,0x80,0x85,0x80]
 
-s_sub_u32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x80,0x80]
+s_sub_u32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x85,0x80]
 
-s_sub_u32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x80,0x80]
+s_sub_u32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x85,0x80]
 
-s_sub_u32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x80,0x80]
+s_sub_u32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x85,0x80]
 
-s_sub_u32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x80,0x56,0x34,0x12,0xaf]
+s_sub_u32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x85,0x80,0x56,0x34,0x12,0xaf]
 
-s_sub_u32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x80,0x73,0x72,0x71,0x3f]
+s_sub_u32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x85,0x80,0x73,0x72,0x71,0x3f]
 
-s_add_i32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x81]
+s_add_i32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x81]
 
-s_add_i32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x81]
+s_add_i32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x81]
 
-s_add_i32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x81]
+s_add_i32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x81]
 
-s_add_i32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x81]
+s_add_i32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x81]
 
-s_add_i32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x81]
+s_add_i32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x81]
 
-s_add_i32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x81]
+s_add_i32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x81]
 
-s_add_i32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x81]
+s_add_i32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x81]
 
-s_add_i32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x81]
+s_add_i32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x81]
 
-s_add_i32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x81]
+s_add_i32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x81]
 
-s_add_i32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x81]
+s_add_i32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x81]
 
-s_add_i32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x81]
+s_add_i32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x81]
 
-s_add_i32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x81]
+s_add_i32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x81]
 
-s_add_i32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x81]
+s_add_i32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x81]
 
-s_add_i32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x81]
+s_add_i32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x81]
 
-s_add_i32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x81]
+s_add_i32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x81]
 
-s_add_i32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x81]
+s_add_i32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x81]
 
-s_add_i32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x81]
+s_add_i32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x81]
 
-s_add_i32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x81]
+s_add_i32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x81]
 
-s_add_i32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x81]
+s_add_i32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x81]
 
-s_add_i32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x81]
+s_add_i32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x81]
 
-s_add_i32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x81]
+s_add_i32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x81]
 
-s_add_i32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x81]
+s_add_i32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x81]
 
-s_add_i32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x81]
+s_add_i32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x81]
 
-s_add_i32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x81]
+s_add_i32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x81]
 
-s_add_i32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x81]
+s_add_i32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x81]
 
-s_add_i32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x81]
+s_add_i32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x81]
 
-s_add_i32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x81]
+s_add_i32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x81]
 
-s_add_i32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x81]
+s_add_i32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x81]
 
-s_add_i32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x81]
+s_add_i32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x81]
 
-s_add_i32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x81]
+s_add_i32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x81]
 
-s_add_i32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x81]
+s_add_i32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x81]
 
-s_add_i32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x81,0x56,0x34,0x12,0xaf]
+s_add_i32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x81,0x56,0x34,0x12,0xaf]
 
-s_add_i32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x81,0x73,0x72,0x71,0x3f]
+s_add_i32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x81,0x73,0x72,0x71,0x3f]
 
-s_add_i32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x81]
+s_add_i32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x81]
 
-s_add_i32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x81]
+s_add_i32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x81]
 
-s_add_i32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x81]
+s_add_i32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x81]
 
-s_add_i32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x81]
+s_add_i32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x81]
 
-s_add_i32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x81]
+s_add_i32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x81]
 
-s_add_i32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x81]
+s_add_i32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x81]
 
-s_add_i32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x81]
+s_add_i32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x81]
 
-s_add_i32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x81]
+s_add_i32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x81]
 
-s_add_i32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x81]
+s_add_i32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x81]
 
-s_add_i32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x81]
+s_add_i32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x81]
 
-s_add_i32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x81]
+s_add_i32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x81]
 
-s_add_i32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x81]
+s_add_i32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x81]
 
-s_add_i32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x81]
+s_add_i32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x81]
 
-s_add_i32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x81]
+s_add_i32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x81]
 
-s_add_i32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x81]
+s_add_i32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x81]
 
-s_add_i32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x81]
+s_add_i32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x81]
 
-s_add_i32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x81]
+s_add_i32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x81]
 
-s_add_i32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x81,0x56,0x34,0x12,0xaf]
+s_add_i32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x81,0x56,0x34,0x12,0xaf]
 
-s_add_i32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x81,0x73,0x72,0x71,0x3f]
+s_add_i32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x81,0x73,0x72,0x71,0x3f]
 
-s_sub_i32 s0, s0, s0
-// CHECK: [0x00,0x00,0x80,0x81]
+s_sub_i32 s5, s1, s2
+// CHECK: [0x01,0x02,0x85,0x81]
 
-s_sub_i32 s101, s0, s0
-// CHECK: [0x00,0x00,0xe5,0x81]
+s_sub_i32 s101, s1, s2
+// CHECK: [0x01,0x02,0xe5,0x81]
 
-s_sub_i32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0xe6,0x81]
+s_sub_i32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0xe6,0x81]
 
-s_sub_i32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0xe7,0x81]
+s_sub_i32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0xe7,0x81]
 
-s_sub_i32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0xea,0x81]
+s_sub_i32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0xea,0x81]
 
-s_sub_i32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0xeb,0x81]
+s_sub_i32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0xeb,0x81]
 
-s_sub_i32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0xec,0x81]
+s_sub_i32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0xec,0x81]
 
-s_sub_i32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0xed,0x81]
+s_sub_i32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0xed,0x81]
 
-s_sub_i32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0xee,0x81]
+s_sub_i32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0xee,0x81]
 
-s_sub_i32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0xef,0x81]
+s_sub_i32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0xef,0x81]
 
-s_sub_i32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0xfb,0x81]
+s_sub_i32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0xfb,0x81]
 
-s_sub_i32 m0, s0, s0
-// CHECK: [0x00,0x00,0xfc,0x81]
+s_sub_i32 m0, s1, s2
+// CHECK: [0x01,0x02,0xfc,0x81]
 
-s_sub_i32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0xfe,0x81]
+s_sub_i32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0xfe,0x81]
 
-s_sub_i32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0xff,0x81]
+s_sub_i32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0xff,0x81]
 
-s_sub_i32 s0, s101, s0
-// CHECK: [0x65,0x00,0x80,0x81]
+s_sub_i32 s5, s101, s2
+// CHECK: [0x65,0x02,0x85,0x81]
 
-s_sub_i32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x80,0x81]
+s_sub_i32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x85,0x81]
 
-s_sub_i32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x80,0x81]
+s_sub_i32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x85,0x81]
 
-s_sub_i32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x80,0x81]
+s_sub_i32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x85,0x81]
 
-s_sub_i32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x80,0x81]
+s_sub_i32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x85,0x81]
 
-s_sub_i32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x80,0x81]
+s_sub_i32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x85,0x81]
 
-s_sub_i32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x80,0x81]
+s_sub_i32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x85,0x81]
 
-s_sub_i32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x80,0x81]
+s_sub_i32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x85,0x81]
 
-s_sub_i32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x80,0x81]
+s_sub_i32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x85,0x81]
 
-s_sub_i32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x80,0x81]
+s_sub_i32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x85,0x81]
 
-s_sub_i32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x80,0x81]
+s_sub_i32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x85,0x81]
 
-s_sub_i32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x80,0x81]
+s_sub_i32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x85,0x81]
 
-s_sub_i32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x80,0x81]
+s_sub_i32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x85,0x81]
 
-s_sub_i32 s0, 0, s0
-// CHECK: [0x80,0x00,0x80,0x81]
+s_sub_i32 s5, 0, s2
+// CHECK: [0x80,0x02,0x85,0x81]
 
-s_sub_i32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x80,0x81]
+s_sub_i32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x85,0x81]
 
-s_sub_i32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x81]
+s_sub_i32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x85,0x81]
 
-s_sub_i32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x81]
+s_sub_i32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x85,0x81]
 
-s_sub_i32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x81,0x56,0x34,0x12,0xaf]
+s_sub_i32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x85,0x81,0x56,0x34,0x12,0xaf]
 
-s_sub_i32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x81,0x73,0x72,0x71,0x3f]
+s_sub_i32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x85,0x81,0x73,0x72,0x71,0x3f]
 
-s_sub_i32 s0, s0, s101
-// CHECK: [0x00,0x65,0x80,0x81]
+s_sub_i32 s5, s1, s101
+// CHECK: [0x01,0x65,0x85,0x81]
 
-s_sub_i32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x80,0x81]
+s_sub_i32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x85,0x81]
 
-s_sub_i32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x80,0x81]
+s_sub_i32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x85,0x81]
 
-s_sub_i32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x81]
+s_sub_i32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x85,0x81]
 
-s_sub_i32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x81]
+s_sub_i32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x85,0x81]
 
-s_sub_i32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x80,0x81]
+s_sub_i32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x85,0x81]
 
-s_sub_i32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x80,0x81]
+s_sub_i32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x85,0x81]
 
-s_sub_i32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x80,0x81]
+s_sub_i32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x85,0x81]
 
-s_sub_i32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x80,0x81]
+s_sub_i32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x85,0x81]
 
-s_sub_i32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x80,0x81]
+s_sub_i32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x85,0x81]
 
-s_sub_i32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x80,0x81]
+s_sub_i32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x85,0x81]
 
-s_sub_i32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x80,0x81]
+s_sub_i32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x85,0x81]
 
-s_sub_i32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x80,0x81]
+s_sub_i32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x85,0x81]
 
-s_sub_i32 s0, s0, 0
-// CHECK: [0x00,0x80,0x80,0x81]
+s_sub_i32 s5, s1, 0
+// CHECK: [0x01,0x80,0x85,0x81]
 
-s_sub_i32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x80,0x81]
+s_sub_i32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x85,0x81]
 
-s_sub_i32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x80,0x81]
+s_sub_i32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x85,0x81]
 
-s_sub_i32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x80,0x81]
+s_sub_i32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x85,0x81]
 
-s_sub_i32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x81,0x56,0x34,0x12,0xaf]
+s_sub_i32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x85,0x81,0x56,0x34,0x12,0xaf]
 
-s_sub_i32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x81,0x73,0x72,0x71,0x3f]
+s_sub_i32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x85,0x81,0x73,0x72,0x71,0x3f]
 
-s_addc_u32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x82]
+s_addc_u32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x82]
 
-s_addc_u32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x82]
+s_addc_u32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x82]
 
-s_addc_u32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x82]
+s_addc_u32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x82]
 
-s_addc_u32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x82]
+s_addc_u32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x82]
 
-s_addc_u32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x82]
+s_addc_u32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x82]
 
-s_addc_u32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x82]
+s_addc_u32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x82]
 
-s_addc_u32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x82]
+s_addc_u32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x82]
 
-s_addc_u32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x82]
+s_addc_u32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x82]
 
-s_addc_u32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x82]
+s_addc_u32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x82]
 
-s_addc_u32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x82]
+s_addc_u32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x82]
 
-s_addc_u32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x82]
+s_addc_u32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x82]
 
-s_addc_u32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x82]
+s_addc_u32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x82]
 
-s_addc_u32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x82]
+s_addc_u32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x82]
 
-s_addc_u32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x82]
+s_addc_u32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x82]
 
-s_addc_u32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x82]
+s_addc_u32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x82]
 
-s_addc_u32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x82]
+s_addc_u32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x82]
 
-s_addc_u32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x82]
+s_addc_u32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x82]
 
-s_addc_u32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x82]
+s_addc_u32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x82]
 
-s_addc_u32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x82]
+s_addc_u32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x82]
 
-s_addc_u32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x82]
+s_addc_u32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x82]
 
-s_addc_u32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x82]
+s_addc_u32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x82]
 
-s_addc_u32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x82]
+s_addc_u32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x82]
 
-s_addc_u32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x82]
+s_addc_u32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x82]
 
-s_addc_u32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x82]
+s_addc_u32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x82]
 
-s_addc_u32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x82]
+s_addc_u32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x82]
 
-s_addc_u32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x82]
+s_addc_u32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x82]
 
-s_addc_u32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x82]
+s_addc_u32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x82]
 
-s_addc_u32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x82]
+s_addc_u32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x82]
 
-s_addc_u32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x82]
+s_addc_u32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x82]
 
-s_addc_u32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x82]
+s_addc_u32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x82]
 
-s_addc_u32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x82]
+s_addc_u32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x82]
 
-s_addc_u32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x82,0x56,0x34,0x12,0xaf]
+s_addc_u32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x82,0x56,0x34,0x12,0xaf]
 
-s_addc_u32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x82,0x73,0x72,0x71,0x3f]
+s_addc_u32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x82,0x73,0x72,0x71,0x3f]
 
-s_addc_u32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x82]
+s_addc_u32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x82]
 
-s_addc_u32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x82]
+s_addc_u32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x82]
 
-s_addc_u32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x82]
+s_addc_u32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x82]
 
-s_addc_u32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x82]
+s_addc_u32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x82]
 
-s_addc_u32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x82]
+s_addc_u32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x82]
 
-s_addc_u32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x82]
+s_addc_u32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x82]
 
-s_addc_u32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x82]
+s_addc_u32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x82]
 
-s_addc_u32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x82]
+s_addc_u32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x82]
 
-s_addc_u32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x82]
+s_addc_u32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x82]
 
-s_addc_u32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x82]
+s_addc_u32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x82]
 
-s_addc_u32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x82]
+s_addc_u32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x82]
 
-s_addc_u32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x82]
+s_addc_u32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x82]
 
-s_addc_u32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x82]
+s_addc_u32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x82]
 
-s_addc_u32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x82]
+s_addc_u32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x82]
 
-s_addc_u32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x82]
+s_addc_u32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x82]
 
-s_addc_u32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x82]
+s_addc_u32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x82]
 
-s_addc_u32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x82]
+s_addc_u32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x82]
 
-s_addc_u32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x82,0x56,0x34,0x12,0xaf]
+s_addc_u32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x82,0x56,0x34,0x12,0xaf]
 
-s_addc_u32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x82,0x73,0x72,0x71,0x3f]
+s_addc_u32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x82,0x73,0x72,0x71,0x3f]
 
-s_subb_u32 s0, s0, s0
-// CHECK: [0x00,0x00,0x80,0x82]
+s_subb_u32 s5, s1, s2
+// CHECK: [0x01,0x02,0x85,0x82]
 
-s_subb_u32 s101, s0, s0
-// CHECK: [0x00,0x00,0xe5,0x82]
+s_subb_u32 s101, s1, s2
+// CHECK: [0x01,0x02,0xe5,0x82]
 
-s_subb_u32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0xe6,0x82]
+s_subb_u32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0xe6,0x82]
 
-s_subb_u32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0xe7,0x82]
+s_subb_u32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0xe7,0x82]
 
-s_subb_u32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0xea,0x82]
+s_subb_u32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0xea,0x82]
 
-s_subb_u32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0xeb,0x82]
+s_subb_u32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0xeb,0x82]
 
-s_subb_u32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0xec,0x82]
+s_subb_u32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0xec,0x82]
 
-s_subb_u32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0xed,0x82]
+s_subb_u32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0xed,0x82]
 
-s_subb_u32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0xee,0x82]
+s_subb_u32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0xee,0x82]
 
-s_subb_u32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0xef,0x82]
+s_subb_u32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0xef,0x82]
 
-s_subb_u32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0xfb,0x82]
+s_subb_u32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0xfb,0x82]
 
-s_subb_u32 m0, s0, s0
-// CHECK: [0x00,0x00,0xfc,0x82]
+s_subb_u32 m0, s1, s2
+// CHECK: [0x01,0x02,0xfc,0x82]
 
-s_subb_u32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0xfe,0x82]
+s_subb_u32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0xfe,0x82]
 
-s_subb_u32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0xff,0x82]
+s_subb_u32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0xff,0x82]
 
-s_subb_u32 s0, s101, s0
-// CHECK: [0x65,0x00,0x80,0x82]
+s_subb_u32 s5, s101, s2
+// CHECK: [0x65,0x02,0x85,0x82]
 
-s_subb_u32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x80,0x82]
+s_subb_u32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x85,0x82]
 
-s_subb_u32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x80,0x82]
+s_subb_u32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x85,0x82]
 
-s_subb_u32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x80,0x82]
+s_subb_u32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x85,0x82]
 
-s_subb_u32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x80,0x82]
+s_subb_u32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x85,0x82]
 
-s_subb_u32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x80,0x82]
+s_subb_u32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x85,0x82]
 
-s_subb_u32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x80,0x82]
+s_subb_u32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x85,0x82]
 
-s_subb_u32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x80,0x82]
+s_subb_u32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x85,0x82]
 
-s_subb_u32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x80,0x82]
+s_subb_u32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x85,0x82]
 
-s_subb_u32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x80,0x82]
+s_subb_u32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x85,0x82]
 
-s_subb_u32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x80,0x82]
+s_subb_u32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x85,0x82]
 
-s_subb_u32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x80,0x82]
+s_subb_u32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x85,0x82]
 
-s_subb_u32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x80,0x82]
+s_subb_u32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x85,0x82]
 
-s_subb_u32 s0, 0, s0
-// CHECK: [0x80,0x00,0x80,0x82]
+s_subb_u32 s5, 0, s2
+// CHECK: [0x80,0x02,0x85,0x82]
 
-s_subb_u32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x80,0x82]
+s_subb_u32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x85,0x82]
 
-s_subb_u32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x82]
+s_subb_u32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x85,0x82]
 
-s_subb_u32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x82]
+s_subb_u32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x85,0x82]
 
-s_subb_u32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x82,0x56,0x34,0x12,0xaf]
+s_subb_u32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x85,0x82,0x56,0x34,0x12,0xaf]
 
-s_subb_u32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x82,0x73,0x72,0x71,0x3f]
+s_subb_u32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x85,0x82,0x73,0x72,0x71,0x3f]
 
-s_subb_u32 s0, s0, s101
-// CHECK: [0x00,0x65,0x80,0x82]
+s_subb_u32 s5, s1, s101
+// CHECK: [0x01,0x65,0x85,0x82]
 
-s_subb_u32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x80,0x82]
+s_subb_u32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x85,0x82]
 
-s_subb_u32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x80,0x82]
+s_subb_u32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x85,0x82]
 
-s_subb_u32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x82]
+s_subb_u32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x85,0x82]
 
-s_subb_u32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x82]
+s_subb_u32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x85,0x82]
 
-s_subb_u32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x80,0x82]
+s_subb_u32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x85,0x82]
 
-s_subb_u32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x80,0x82]
+s_subb_u32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x85,0x82]
 
-s_subb_u32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x80,0x82]
+s_subb_u32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x85,0x82]
 
-s_subb_u32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x80,0x82]
+s_subb_u32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x85,0x82]
 
-s_subb_u32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x80,0x82]
+s_subb_u32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x85,0x82]
 
-s_subb_u32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x80,0x82]
+s_subb_u32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x85,0x82]
 
-s_subb_u32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x80,0x82]
+s_subb_u32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x85,0x82]
 
-s_subb_u32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x80,0x82]
+s_subb_u32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x85,0x82]
 
-s_subb_u32 s0, s0, 0
-// CHECK: [0x00,0x80,0x80,0x82]
+s_subb_u32 s5, s1, 0
+// CHECK: [0x01,0x80,0x85,0x82]
 
-s_subb_u32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x80,0x82]
+s_subb_u32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x85,0x82]
 
-s_subb_u32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x80,0x82]
+s_subb_u32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x85,0x82]
 
-s_subb_u32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x80,0x82]
+s_subb_u32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x85,0x82]
 
-s_subb_u32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x82,0x56,0x34,0x12,0xaf]
+s_subb_u32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x85,0x82,0x56,0x34,0x12,0xaf]
 
-s_subb_u32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x82,0x73,0x72,0x71,0x3f]
+s_subb_u32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x85,0x82,0x73,0x72,0x71,0x3f]
 
-s_min_i32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x83]
+s_min_i32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x83]
 
-s_min_i32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x83]
+s_min_i32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x83]
 
-s_min_i32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x83]
+s_min_i32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x83]
 
-s_min_i32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x83]
+s_min_i32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x83]
 
-s_min_i32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x83]
+s_min_i32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x83]
 
-s_min_i32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x83]
+s_min_i32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x83]
 
-s_min_i32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x83]
+s_min_i32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x83]
 
-s_min_i32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x83]
+s_min_i32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x83]
 
-s_min_i32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x83]
+s_min_i32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x83]
 
-s_min_i32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x83]
+s_min_i32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x83]
 
-s_min_i32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x83]
+s_min_i32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x83]
 
-s_min_i32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x83]
+s_min_i32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x83]
 
-s_min_i32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x83]
+s_min_i32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x83]
 
-s_min_i32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x83]
+s_min_i32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x83]
 
-s_min_i32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x83]
+s_min_i32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x83]
 
-s_min_i32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x83]
+s_min_i32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x83]
 
-s_min_i32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x83]
+s_min_i32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x83]
 
-s_min_i32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x83]
+s_min_i32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x83]
 
-s_min_i32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x83]
+s_min_i32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x83]
 
-s_min_i32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x83]
+s_min_i32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x83]
 
-s_min_i32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x83]
+s_min_i32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x83]
 
-s_min_i32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x83]
+s_min_i32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x83]
 
-s_min_i32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x83]
+s_min_i32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x83]
 
-s_min_i32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x83]
+s_min_i32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x83]
 
-s_min_i32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x83]
+s_min_i32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x83]
 
-s_min_i32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x83]
+s_min_i32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x83]
 
-s_min_i32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x83]
+s_min_i32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x83]
 
-s_min_i32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x83]
+s_min_i32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x83]
 
-s_min_i32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x83]
+s_min_i32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x83]
 
-s_min_i32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x83]
+s_min_i32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x83]
 
-s_min_i32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x83]
+s_min_i32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x83]
 
-s_min_i32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x83,0x56,0x34,0x12,0xaf]
+s_min_i32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x83,0x56,0x34,0x12,0xaf]
 
-s_min_i32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x83,0x73,0x72,0x71,0x3f]
+s_min_i32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x83,0x73,0x72,0x71,0x3f]
 
-s_min_i32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x83]
+s_min_i32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x83]
 
-s_min_i32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x83]
+s_min_i32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x83]
 
-s_min_i32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x83]
+s_min_i32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x83]
 
-s_min_i32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x83]
+s_min_i32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x83]
 
-s_min_i32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x83]
+s_min_i32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x83]
 
-s_min_i32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x83]
+s_min_i32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x83]
 
-s_min_i32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x83]
+s_min_i32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x83]
 
-s_min_i32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x83]
+s_min_i32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x83]
 
-s_min_i32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x83]
+s_min_i32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x83]
 
-s_min_i32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x83]
+s_min_i32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x83]
 
-s_min_i32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x83]
+s_min_i32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x83]
 
-s_min_i32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x83]
+s_min_i32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x83]
 
-s_min_i32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x83]
+s_min_i32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x83]
 
-s_min_i32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x83]
+s_min_i32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x83]
 
-s_min_i32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x83]
+s_min_i32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x83]
 
-s_min_i32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x83]
+s_min_i32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x83]
 
-s_min_i32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x83]
+s_min_i32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x83]
 
-s_min_i32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x83,0x56,0x34,0x12,0xaf]
+s_min_i32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x83,0x56,0x34,0x12,0xaf]
 
-s_min_i32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x83,0x73,0x72,0x71,0x3f]
+s_min_i32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x83,0x73,0x72,0x71,0x3f]
 
-s_min_u32 s0, s0, s0
-// CHECK: [0x00,0x00,0x80,0x83]
+s_min_u32 s5, s1, s2
+// CHECK: [0x01,0x02,0x85,0x83]
 
-s_min_u32 s101, s0, s0
-// CHECK: [0x00,0x00,0xe5,0x83]
+s_min_u32 s101, s1, s2
+// CHECK: [0x01,0x02,0xe5,0x83]
 
-s_min_u32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0xe6,0x83]
+s_min_u32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0xe6,0x83]
 
-s_min_u32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0xe7,0x83]
+s_min_u32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0xe7,0x83]
 
-s_min_u32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0xea,0x83]
+s_min_u32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0xea,0x83]
 
-s_min_u32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0xeb,0x83]
+s_min_u32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0xeb,0x83]
 
-s_min_u32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0xec,0x83]
+s_min_u32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0xec,0x83]
 
-s_min_u32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0xed,0x83]
+s_min_u32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0xed,0x83]
 
-s_min_u32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0xee,0x83]
+s_min_u32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0xee,0x83]
 
-s_min_u32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0xef,0x83]
+s_min_u32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0xef,0x83]
 
-s_min_u32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0xfb,0x83]
+s_min_u32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0xfb,0x83]
 
-s_min_u32 m0, s0, s0
-// CHECK: [0x00,0x00,0xfc,0x83]
+s_min_u32 m0, s1, s2
+// CHECK: [0x01,0x02,0xfc,0x83]
 
-s_min_u32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0xfe,0x83]
+s_min_u32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0xfe,0x83]
 
-s_min_u32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0xff,0x83]
+s_min_u32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0xff,0x83]
 
-s_min_u32 s0, s101, s0
-// CHECK: [0x65,0x00,0x80,0x83]
+s_min_u32 s5, s101, s2
+// CHECK: [0x65,0x02,0x85,0x83]
 
-s_min_u32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x80,0x83]
+s_min_u32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x85,0x83]
 
-s_min_u32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x80,0x83]
+s_min_u32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x85,0x83]
 
-s_min_u32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x80,0x83]
+s_min_u32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x85,0x83]
 
-s_min_u32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x80,0x83]
+s_min_u32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x85,0x83]
 
-s_min_u32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x80,0x83]
+s_min_u32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x85,0x83]
 
-s_min_u32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x80,0x83]
+s_min_u32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x85,0x83]
 
-s_min_u32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x80,0x83]
+s_min_u32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x85,0x83]
 
-s_min_u32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x80,0x83]
+s_min_u32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x85,0x83]
 
-s_min_u32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x80,0x83]
+s_min_u32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x85,0x83]
 
-s_min_u32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x80,0x83]
+s_min_u32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x85,0x83]
 
-s_min_u32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x80,0x83]
+s_min_u32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x85,0x83]
 
-s_min_u32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x80,0x83]
+s_min_u32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x85,0x83]
 
-s_min_u32 s0, 0, s0
-// CHECK: [0x80,0x00,0x80,0x83]
+s_min_u32 s5, 0, s2
+// CHECK: [0x80,0x02,0x85,0x83]
 
-s_min_u32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x80,0x83]
+s_min_u32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x85,0x83]
 
-s_min_u32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x83]
+s_min_u32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x85,0x83]
 
-s_min_u32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x83]
+s_min_u32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x85,0x83]
 
-s_min_u32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x83,0x56,0x34,0x12,0xaf]
+s_min_u32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x85,0x83,0x56,0x34,0x12,0xaf]
 
-s_min_u32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x83,0x73,0x72,0x71,0x3f]
+s_min_u32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x85,0x83,0x73,0x72,0x71,0x3f]
 
-s_min_u32 s0, s0, s101
-// CHECK: [0x00,0x65,0x80,0x83]
+s_min_u32 s5, s1, s101
+// CHECK: [0x01,0x65,0x85,0x83]
 
-s_min_u32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x80,0x83]
+s_min_u32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x85,0x83]
 
-s_min_u32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x80,0x83]
+s_min_u32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x85,0x83]
 
-s_min_u32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x83]
+s_min_u32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x85,0x83]
 
-s_min_u32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x83]
+s_min_u32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x85,0x83]
 
-s_min_u32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x80,0x83]
+s_min_u32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x85,0x83]
 
-s_min_u32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x80,0x83]
+s_min_u32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x85,0x83]
 
-s_min_u32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x80,0x83]
+s_min_u32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x85,0x83]
 
-s_min_u32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x80,0x83]
+s_min_u32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x85,0x83]
 
-s_min_u32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x80,0x83]
+s_min_u32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x85,0x83]
 
-s_min_u32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x80,0x83]
+s_min_u32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x85,0x83]
 
-s_min_u32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x80,0x83]
+s_min_u32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x85,0x83]
 
-s_min_u32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x80,0x83]
+s_min_u32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x85,0x83]
 
-s_min_u32 s0, s0, 0
-// CHECK: [0x00,0x80,0x80,0x83]
+s_min_u32 s5, s1, 0
+// CHECK: [0x01,0x80,0x85,0x83]
 
-s_min_u32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x80,0x83]
+s_min_u32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x85,0x83]
 
-s_min_u32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x80,0x83]
+s_min_u32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x85,0x83]
 
-s_min_u32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x80,0x83]
+s_min_u32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x85,0x83]
 
-s_min_u32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x83,0x56,0x34,0x12,0xaf]
+s_min_u32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x85,0x83,0x56,0x34,0x12,0xaf]
 
-s_min_u32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x83,0x73,0x72,0x71,0x3f]
+s_min_u32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x85,0x83,0x73,0x72,0x71,0x3f]
 
-s_max_i32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x84]
+s_max_i32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x84]
 
-s_max_i32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x84]
+s_max_i32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x84]
 
-s_max_i32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x84]
+s_max_i32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x84]
 
-s_max_i32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x84]
+s_max_i32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x84]
 
-s_max_i32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x84]
+s_max_i32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x84]
 
-s_max_i32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x84]
+s_max_i32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x84]
 
-s_max_i32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x84]
+s_max_i32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x84]
 
-s_max_i32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x84]
+s_max_i32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x84]
 
-s_max_i32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x84]
+s_max_i32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x84]
 
-s_max_i32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x84]
+s_max_i32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x84]
 
-s_max_i32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x84]
+s_max_i32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x84]
 
-s_max_i32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x84]
+s_max_i32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x84]
 
-s_max_i32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x84]
+s_max_i32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x84]
 
-s_max_i32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x84]
+s_max_i32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x84]
 
-s_max_i32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x84]
+s_max_i32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x84]
 
-s_max_i32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x84]
+s_max_i32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x84]
 
-s_max_i32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x84]
+s_max_i32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x84]
 
-s_max_i32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x84]
+s_max_i32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x84]
 
-s_max_i32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x84]
+s_max_i32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x84]
 
-s_max_i32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x84]
+s_max_i32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x84]
 
-s_max_i32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x84]
+s_max_i32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x84]
 
-s_max_i32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x84]
+s_max_i32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x84]
 
-s_max_i32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x84]
+s_max_i32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x84]
 
-s_max_i32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x84]
+s_max_i32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x84]
 
-s_max_i32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x84]
+s_max_i32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x84]
 
-s_max_i32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x84]
+s_max_i32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x84]
 
-s_max_i32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x84]
+s_max_i32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x84]
 
-s_max_i32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x84]
+s_max_i32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x84]
 
-s_max_i32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x84]
+s_max_i32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x84]
 
-s_max_i32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x84]
+s_max_i32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x84]
 
-s_max_i32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x84]
+s_max_i32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x84]
 
-s_max_i32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x84,0x56,0x34,0x12,0xaf]
+s_max_i32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x84,0x56,0x34,0x12,0xaf]
 
-s_max_i32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x84,0x73,0x72,0x71,0x3f]
+s_max_i32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x84,0x73,0x72,0x71,0x3f]
 
-s_max_i32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x84]
+s_max_i32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x84]
 
-s_max_i32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x84]
+s_max_i32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x84]
 
-s_max_i32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x84]
+s_max_i32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x84]
 
-s_max_i32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x84]
+s_max_i32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x84]
 
-s_max_i32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x84]
+s_max_i32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x84]
 
-s_max_i32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x84]
+s_max_i32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x84]
 
-s_max_i32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x84]
+s_max_i32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x84]
 
-s_max_i32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x84]
+s_max_i32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x84]
 
-s_max_i32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x84]
+s_max_i32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x84]
 
-s_max_i32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x84]
+s_max_i32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x84]
 
-s_max_i32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x84]
+s_max_i32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x84]
 
-s_max_i32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x84]
+s_max_i32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x84]
 
-s_max_i32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x84]
+s_max_i32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x84]
 
-s_max_i32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x84]
+s_max_i32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x84]
 
-s_max_i32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x84]
+s_max_i32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x84]
 
-s_max_i32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x84]
+s_max_i32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x84]
 
-s_max_i32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x84]
+s_max_i32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x84]
 
-s_max_i32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x84,0x56,0x34,0x12,0xaf]
+s_max_i32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x84,0x56,0x34,0x12,0xaf]
 
-s_max_i32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x84,0x73,0x72,0x71,0x3f]
+s_max_i32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x84,0x73,0x72,0x71,0x3f]
 
-s_max_u32 s0, s0, s0
-// CHECK: [0x00,0x00,0x80,0x84]
+s_max_u32 s5, s1, s2
+// CHECK: [0x01,0x02,0x85,0x84]
 
-s_max_u32 s101, s0, s0
-// CHECK: [0x00,0x00,0xe5,0x84]
+s_max_u32 s101, s1, s2
+// CHECK: [0x01,0x02,0xe5,0x84]
 
-s_max_u32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0xe6,0x84]
+s_max_u32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0xe6,0x84]
 
-s_max_u32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0xe7,0x84]
+s_max_u32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0xe7,0x84]
 
-s_max_u32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0xea,0x84]
+s_max_u32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0xea,0x84]
 
-s_max_u32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0xeb,0x84]
+s_max_u32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0xeb,0x84]
 
-s_max_u32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0xec,0x84]
+s_max_u32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0xec,0x84]
 
-s_max_u32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0xed,0x84]
+s_max_u32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0xed,0x84]
 
-s_max_u32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0xee,0x84]
+s_max_u32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0xee,0x84]
 
-s_max_u32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0xef,0x84]
+s_max_u32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0xef,0x84]
 
-s_max_u32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0xfb,0x84]
+s_max_u32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0xfb,0x84]
 
-s_max_u32 m0, s0, s0
-// CHECK: [0x00,0x00,0xfc,0x84]
+s_max_u32 m0, s1, s2
+// CHECK: [0x01,0x02,0xfc,0x84]
 
-s_max_u32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0xfe,0x84]
+s_max_u32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0xfe,0x84]
 
-s_max_u32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0xff,0x84]
+s_max_u32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0xff,0x84]
 
-s_max_u32 s0, s101, s0
-// CHECK: [0x65,0x00,0x80,0x84]
+s_max_u32 s5, s101, s2
+// CHECK: [0x65,0x02,0x85,0x84]
 
-s_max_u32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x80,0x84]
+s_max_u32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x85,0x84]
 
-s_max_u32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x80,0x84]
+s_max_u32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x85,0x84]
 
-s_max_u32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x80,0x84]
+s_max_u32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x85,0x84]
 
-s_max_u32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x80,0x84]
+s_max_u32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x85,0x84]
 
-s_max_u32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x80,0x84]
+s_max_u32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x85,0x84]
 
-s_max_u32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x80,0x84]
+s_max_u32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x85,0x84]
 
-s_max_u32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x80,0x84]
+s_max_u32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x85,0x84]
 
-s_max_u32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x80,0x84]
+s_max_u32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x85,0x84]
 
-s_max_u32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x80,0x84]
+s_max_u32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x85,0x84]
 
-s_max_u32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x80,0x84]
+s_max_u32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x85,0x84]
 
-s_max_u32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x80,0x84]
+s_max_u32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x85,0x84]
 
-s_max_u32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x80,0x84]
+s_max_u32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x85,0x84]
 
-s_max_u32 s0, 0, s0
-// CHECK: [0x80,0x00,0x80,0x84]
+s_max_u32 s5, 0, s2
+// CHECK: [0x80,0x02,0x85,0x84]
 
-s_max_u32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x80,0x84]
+s_max_u32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x85,0x84]
 
-s_max_u32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x84]
+s_max_u32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x85,0x84]
 
-s_max_u32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x84]
+s_max_u32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x85,0x84]
 
-s_max_u32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x84,0x56,0x34,0x12,0xaf]
+s_max_u32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x85,0x84,0x56,0x34,0x12,0xaf]
 
-s_max_u32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x84,0x73,0x72,0x71,0x3f]
+s_max_u32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x85,0x84,0x73,0x72,0x71,0x3f]
 
-s_max_u32 s0, s0, s101
-// CHECK: [0x00,0x65,0x80,0x84]
+s_max_u32 s5, s1, s101
+// CHECK: [0x01,0x65,0x85,0x84]
 
-s_max_u32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x80,0x84]
+s_max_u32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x85,0x84]
 
-s_max_u32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x80,0x84]
+s_max_u32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x85,0x84]
 
-s_max_u32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x84]
+s_max_u32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x85,0x84]
 
-s_max_u32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x84]
+s_max_u32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x85,0x84]
 
-s_max_u32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x80,0x84]
+s_max_u32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x85,0x84]
 
-s_max_u32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x80,0x84]
+s_max_u32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x85,0x84]
 
-s_max_u32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x80,0x84]
+s_max_u32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x85,0x84]
 
-s_max_u32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x80,0x84]
+s_max_u32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x85,0x84]
 
-s_max_u32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x80,0x84]
+s_max_u32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x85,0x84]
 
-s_max_u32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x80,0x84]
+s_max_u32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x85,0x84]
 
-s_max_u32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x80,0x84]
+s_max_u32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x85,0x84]
 
-s_max_u32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x80,0x84]
+s_max_u32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x85,0x84]
 
-s_max_u32 s0, s0, 0
-// CHECK: [0x00,0x80,0x80,0x84]
+s_max_u32 s5, s1, 0
+// CHECK: [0x01,0x80,0x85,0x84]
 
-s_max_u32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x80,0x84]
+s_max_u32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x85,0x84]
 
-s_max_u32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x80,0x84]
+s_max_u32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x85,0x84]
 
-s_max_u32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x80,0x84]
+s_max_u32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x85,0x84]
 
-s_max_u32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x84,0x56,0x34,0x12,0xaf]
+s_max_u32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x85,0x84,0x56,0x34,0x12,0xaf]
 
-s_max_u32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x84,0x73,0x72,0x71,0x3f]
+s_max_u32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x85,0x84,0x73,0x72,0x71,0x3f]
 
-s_cselect_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x85]
+s_cselect_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x85]
 
-s_cselect_b32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x85]
+s_cselect_b32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x85]
 
-s_cselect_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x85]
+s_cselect_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x85]
 
-s_cselect_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x85]
+s_cselect_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x85]
 
-s_cselect_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x85]
+s_cselect_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x85]
 
-s_cselect_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x85]
+s_cselect_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x85]
 
-s_cselect_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x85]
+s_cselect_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x85]
 
-s_cselect_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x85]
+s_cselect_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x85]
 
-s_cselect_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x85]
+s_cselect_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x85]
 
-s_cselect_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x85]
+s_cselect_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x85]
 
-s_cselect_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x85]
+s_cselect_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x85]
 
-s_cselect_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x85]
+s_cselect_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x85]
 
-s_cselect_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x85]
+s_cselect_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x85]
 
-s_cselect_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x85]
+s_cselect_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x85]
 
-s_cselect_b32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x85]
+s_cselect_b32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x85]
 
-s_cselect_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x85]
+s_cselect_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x85]
 
-s_cselect_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x85]
+s_cselect_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x85]
 
-s_cselect_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x85]
+s_cselect_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x85]
 
-s_cselect_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x85]
+s_cselect_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x85]
 
-s_cselect_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x85]
+s_cselect_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x85]
 
-s_cselect_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x85]
+s_cselect_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x85]
 
-s_cselect_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x85]
+s_cselect_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x85]
 
-s_cselect_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x85]
+s_cselect_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x85]
 
-s_cselect_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x85]
+s_cselect_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x85]
 
-s_cselect_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x85]
+s_cselect_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x85]
 
-s_cselect_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x85]
+s_cselect_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x85]
 
-s_cselect_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x85]
+s_cselect_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x85]
 
-s_cselect_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x85]
+s_cselect_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x85]
 
-s_cselect_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x85]
+s_cselect_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x85]
 
-s_cselect_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x85]
+s_cselect_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x85]
 
-s_cselect_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x85]
+s_cselect_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x85]
 
-s_cselect_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x85,0x56,0x34,0x12,0xaf]
+s_cselect_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x85,0x56,0x34,0x12,0xaf]
 
-s_cselect_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x85,0x73,0x72,0x71,0x3f]
+s_cselect_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x85,0x73,0x72,0x71,0x3f]
 
-s_cselect_b32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x85]
+s_cselect_b32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x85]
 
-s_cselect_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x85]
+s_cselect_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x85]
 
-s_cselect_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x85]
+s_cselect_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x85]
 
-s_cselect_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x85]
+s_cselect_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x85]
 
-s_cselect_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x85]
+s_cselect_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x85]
 
-s_cselect_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x85]
+s_cselect_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x85]
 
-s_cselect_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x85]
+s_cselect_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x85]
 
-s_cselect_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x85]
+s_cselect_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x85]
 
-s_cselect_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x85]
+s_cselect_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x85]
 
-s_cselect_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x85]
+s_cselect_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x85]
 
-s_cselect_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x85]
+s_cselect_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x85]
 
-s_cselect_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x85]
+s_cselect_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x85]
 
-s_cselect_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x85]
+s_cselect_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x85]
 
-s_cselect_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x85]
+s_cselect_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x85]
 
-s_cselect_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x85]
+s_cselect_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x85]
 
-s_cselect_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x85]
+s_cselect_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x85]
 
-s_cselect_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x85]
+s_cselect_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x85]
 
-s_cselect_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x85,0x56,0x34,0x12,0xaf]
+s_cselect_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x85,0x56,0x34,0x12,0xaf]
 
-s_cselect_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x85,0x73,0x72,0x71,0x3f]
+s_cselect_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x85,0x73,0x72,0x71,0x3f]
 
-s_cselect_b64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8a,0x85]
 
-s_cselect_b64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0x85]
+s_cselect_b64 s[12:13], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8c,0x85]
 
-s_cselect_b64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe4,0x85]
+s_cselect_b64 s[100:101], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe4,0x85]
 
-s_cselect_b64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0x85]
+s_cselect_b64 flat_scratch, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe6,0x85]
 
-s_cselect_b64 vcc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0x85]
+s_cselect_b64 vcc, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xea,0x85]
 
-s_cselect_b64 tba, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0x85]
+s_cselect_b64 tba, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xec,0x85]
 
-s_cselect_b64 tma, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0x85]
+s_cselect_b64 tma, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xee,0x85]
 
-s_cselect_b64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0x85]
+s_cselect_b64 ttmp[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfa,0x85]
 
-s_cselect_b64 exec, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0x85]
+s_cselect_b64 exec, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfe,0x85]
 
-s_cselect_b64 s[0:1], s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[100:101], s[0:1]
-// CHECK: [0x64,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], s[100:101], s[4:5]
+// CHECK: [0x64,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], flat_scratch, s[0:1]
-// CHECK: [0x66,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], flat_scratch, s[4:5]
+// CHECK: [0x66,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], tba, s[4:5]
+// CHECK: [0x6c,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], tma, s[4:5]
+// CHECK: [0x6e,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], exec, s[4:5]
+// CHECK: [0x7e,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], 0, s[0:1]
-// CHECK: [0x80,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], 0, s[4:5]
+// CHECK: [0x80,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], -1, s[0:1]
-// CHECK: [0xc1,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], -1, s[4:5]
+// CHECK: [0xc1,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x80,0x85]
+s_cselect_b64 s[10:11], -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x8a,0x85]
 
-s_cselect_b64 s[0:1], 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x85,0x56,0x34,0x12,0xaf]
+s_cselect_b64 s[10:11], 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x85,0x56,0x34,0x12,0xaf]
 
-s_cselect_b64 s[0:1], 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x85,0x73,0x72,0x71,0x3f]
+s_cselect_b64 s[10:11], 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x85,0x73,0x72,0x71,0x3f]
 
-s_cselect_b64 s[0:1], s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], s[100:101]
-// CHECK: [0x00,0x64,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], s[100:101]
+// CHECK: [0x02,0x64,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], flat_scratch
-// CHECK: [0x00,0x66,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], flat_scratch
+// CHECK: [0x02,0x66,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], vcc
+// CHECK: [0x02,0x6a,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], tba
+// CHECK: [0x02,0x6c,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], tma
+// CHECK: [0x02,0x6e,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], exec
+// CHECK: [0x02,0x7e,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x85]
+s_cselect_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x85]
 
-s_cselect_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x85,0x56,0x34,0x12,0xaf]
+s_cselect_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x85,0x56,0x34,0x12,0xaf]
 
-s_cselect_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x85,0x73,0x72,0x71,0x3f]
+s_cselect_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x85,0x73,0x72,0x71,0x3f]
 
-s_and_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x86]
+s_and_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x86]
 
-s_and_b32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x86]
+s_and_b32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x86]
 
-s_and_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x86]
+s_and_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x86]
 
-s_and_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x86]
+s_and_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x86]
 
-s_and_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x86]
+s_and_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x86]
 
-s_and_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x86]
+s_and_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x86]
 
-s_and_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x86]
+s_and_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x86]
 
-s_and_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x86]
+s_and_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x86]
 
-s_and_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x86]
+s_and_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x86]
 
-s_and_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x86]
+s_and_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x86]
 
-s_and_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x86]
+s_and_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x86]
 
-s_and_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x86]
+s_and_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x86]
 
-s_and_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x86]
+s_and_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x86]
 
-s_and_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x86]
+s_and_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x86]
 
-s_and_b32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x86]
+s_and_b32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x86]
 
-s_and_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x86]
+s_and_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x86]
 
-s_and_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x86]
+s_and_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x86]
 
-s_and_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x86]
+s_and_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x86]
 
-s_and_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x86]
+s_and_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x86]
 
-s_and_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x86]
+s_and_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x86]
 
-s_and_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x86]
+s_and_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x86]
 
-s_and_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x86]
+s_and_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x86]
 
-s_and_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x86]
+s_and_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x86]
 
-s_and_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x86]
+s_and_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x86]
 
-s_and_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x86]
+s_and_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x86]
 
-s_and_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x86]
+s_and_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x86]
 
-s_and_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x86]
+s_and_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x86]
 
-s_and_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x86]
+s_and_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x86]
 
-s_and_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x86]
+s_and_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x86]
 
-s_and_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x86]
+s_and_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x86]
 
-s_and_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x86]
+s_and_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x86]
 
-s_and_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x86,0x56,0x34,0x12,0xaf]
+s_and_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x86,0x56,0x34,0x12,0xaf]
 
-s_and_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x86,0x73,0x72,0x71,0x3f]
+s_and_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x86,0x73,0x72,0x71,0x3f]
 
-s_and_b32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x86]
+s_and_b32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x86]
 
-s_and_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x86]
+s_and_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x86]
 
-s_and_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x86]
+s_and_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x86]
 
-s_and_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x86]
+s_and_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x86]
 
-s_and_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x86]
+s_and_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x86]
 
-s_and_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x86]
+s_and_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x86]
 
-s_and_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x86]
+s_and_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x86]
 
-s_and_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x86]
+s_and_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x86]
 
-s_and_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x86]
+s_and_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x86]
 
-s_and_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x86]
+s_and_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x86]
 
-s_and_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x86]
+s_and_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x86]
 
-s_and_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x86]
+s_and_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x86]
 
-s_and_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x86]
+s_and_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x86]
 
-s_and_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x86]
+s_and_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x86]
 
-s_and_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x86]
+s_and_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x86]
 
-s_and_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x86]
+s_and_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x86]
 
-s_and_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x86]
+s_and_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x86]
 
-s_and_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x86,0x56,0x34,0x12,0xaf]
+s_and_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x86,0x56,0x34,0x12,0xaf]
 
-s_and_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x86,0x73,0x72,0x71,0x3f]
+s_and_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x86,0x73,0x72,0x71,0x3f]
 
-s_and_b64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x86]
+s_and_b64 s[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8a,0x86]
 
-s_and_b64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0x86]
+s_and_b64 s[12:13], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8c,0x86]
 
-s_and_b64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe4,0x86]
+s_and_b64 s[100:101], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe4,0x86]
 
-s_and_b64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0x86]
+s_and_b64 flat_scratch, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe6,0x86]
 
-s_and_b64 vcc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0x86]
+s_and_b64 vcc, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xea,0x86]
 
-s_and_b64 tba, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0x86]
+s_and_b64 tba, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xec,0x86]
 
-s_and_b64 tma, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0x86]
+s_and_b64 tma, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xee,0x86]
 
-s_and_b64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0x86]
+s_and_b64 ttmp[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfa,0x86]
 
-s_and_b64 exec, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0x86]
+s_and_b64 exec, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfe,0x86]
 
-s_and_b64 s[0:1], s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x86]
+s_and_b64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x8a,0x86]
 
-s_and_b64 s[0:1], s[100:101], s[0:1]
-// CHECK: [0x64,0x00,0x80,0x86]
+s_and_b64 s[10:11], s[100:101], s[4:5]
+// CHECK: [0x64,0x04,0x8a,0x86]
 
-s_and_b64 s[0:1], flat_scratch, s[0:1]
-// CHECK: [0x66,0x00,0x80,0x86]
+s_and_b64 s[10:11], flat_scratch, s[4:5]
+// CHECK: [0x66,0x04,0x8a,0x86]
 
-s_and_b64 s[0:1], vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x86]
+s_and_b64 s[10:11], vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x8a,0x86]
 
-s_and_b64 s[0:1], tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x86]
+s_and_b64 s[10:11], tba, s[4:5]
+// CHECK: [0x6c,0x04,0x8a,0x86]
 
-s_and_b64 s[0:1], tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x86]
+s_and_b64 s[10:11], tma, s[4:5]
+// CHECK: [0x6e,0x04,0x8a,0x86]
 
-s_and_b64 s[0:1], ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x86]
+s_and_b64 s[10:11], ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x8a,0x86]
 
-s_and_b64 s[0:1], exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x86]
+s_and_b64 s[10:11], exec, s[4:5]
+// CHECK: [0x7e,0x04,0x8a,0x86]
 
-s_and_b64 s[0:1], 0, s[0:1]
-// CHECK: [0x80,0x00,0x80,0x86]
+s_and_b64 s[10:11], 0, s[4:5]
+// CHECK: [0x80,0x04,0x8a,0x86]
 
-s_and_b64 s[0:1], -1, s[0:1]
-// CHECK: [0xc1,0x00,0x80,0x86]
+s_and_b64 s[10:11], -1, s[4:5]
+// CHECK: [0xc1,0x04,0x8a,0x86]
 
-s_and_b64 s[0:1], 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x80,0x86]
+s_and_b64 s[10:11], 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x8a,0x86]
 
-s_and_b64 s[0:1], -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x80,0x86]
+s_and_b64 s[10:11], -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x8a,0x86]
 
-s_and_b64 s[0:1], 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x86,0x56,0x34,0x12,0xaf]
+s_and_b64 s[10:11], 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x86,0x56,0x34,0x12,0xaf]
 
-s_and_b64 s[0:1], 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x86,0x73,0x72,0x71,0x3f]
+s_and_b64 s[10:11], 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x86,0x73,0x72,0x71,0x3f]
 
-s_and_b64 s[0:1], s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x86]
+s_and_b64 s[10:11], s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x8a,0x86]
 
-s_and_b64 s[0:1], s[0:1], s[100:101]
-// CHECK: [0x00,0x64,0x80,0x86]
+s_and_b64 s[10:11], s[2:3], s[100:101]
+// CHECK: [0x02,0x64,0x8a,0x86]
 
-s_and_b64 s[0:1], s[0:1], flat_scratch
-// CHECK: [0x00,0x66,0x80,0x86]
+s_and_b64 s[10:11], s[2:3], flat_scratch
+// CHECK: [0x02,0x66,0x8a,0x86]
 
-s_and_b64 s[0:1], s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x86]
+s_and_b64 s[10:11], s[2:3], vcc
+// CHECK: [0x02,0x6a,0x8a,0x86]
 
-s_and_b64 s[0:1], s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x86]
+s_and_b64 s[10:11], s[2:3], tba
+// CHECK: [0x02,0x6c,0x8a,0x86]
 
-s_and_b64 s[0:1], s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x86]
+s_and_b64 s[10:11], s[2:3], tma
+// CHECK: [0x02,0x6e,0x8a,0x86]
 
-s_and_b64 s[0:1], s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x86]
+s_and_b64 s[10:11], s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x8a,0x86]
 
-s_and_b64 s[0:1], s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x86]
+s_and_b64 s[10:11], s[2:3], exec
+// CHECK: [0x02,0x7e,0x8a,0x86]
 
-s_and_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x86]
+s_and_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x86]
 
-s_and_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x86]
+s_and_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x86]
 
-s_and_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x86]
+s_and_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x86]
 
-s_and_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x86]
+s_and_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x86]
 
-s_and_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x86,0x56,0x34,0x12,0xaf]
+s_and_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x86,0x56,0x34,0x12,0xaf]
 
-s_and_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x86,0x73,0x72,0x71,0x3f]
+s_and_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x86,0x73,0x72,0x71,0x3f]
 
-s_or_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x87]
+s_or_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x87]
 
-s_or_b32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x87]
+s_or_b32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x87]
 
-s_or_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x87]
+s_or_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x87]
 
-s_or_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x87]
+s_or_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x87]
 
-s_or_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x87]
+s_or_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x87]
 
-s_or_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x87]
+s_or_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x87]
 
-s_or_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x87]
+s_or_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x87]
 
-s_or_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x87]
+s_or_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x87]
 
-s_or_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x87]
+s_or_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x87]
 
-s_or_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x87]
+s_or_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x87]
 
-s_or_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x87]
+s_or_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x87]
 
-s_or_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x87]
+s_or_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x87]
 
-s_or_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x87]
+s_or_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x87]
 
-s_or_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x87]
+s_or_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x87]
 
-s_or_b32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x87]
+s_or_b32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x87]
 
-s_or_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x87]
+s_or_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x87]
 
-s_or_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x87]
+s_or_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x87]
 
-s_or_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x87]
+s_or_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x87]
 
-s_or_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x87]
+s_or_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x87]
 
-s_or_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x87]
+s_or_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x87]
 
-s_or_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x87]
+s_or_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x87]
 
-s_or_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x87]
+s_or_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x87]
 
-s_or_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x87]
+s_or_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x87]
 
-s_or_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x87]
+s_or_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x87]
 
-s_or_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x87]
+s_or_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x87]
 
-s_or_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x87]
+s_or_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x87]
 
-s_or_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x87]
+s_or_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x87]
 
-s_or_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x87]
+s_or_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x87]
 
-s_or_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x87]
+s_or_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x87]
 
-s_or_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x87]
+s_or_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x87]
 
-s_or_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x87]
+s_or_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x87]
 
-s_or_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x87,0x56,0x34,0x12,0xaf]
+s_or_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x87,0x56,0x34,0x12,0xaf]
 
-s_or_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x87,0x73,0x72,0x71,0x3f]
+s_or_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x87,0x73,0x72,0x71,0x3f]
 
-s_or_b32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x87]
+s_or_b32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x87]
 
-s_or_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x87]
+s_or_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x87]
 
-s_or_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x87]
+s_or_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x87]
 
-s_or_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x87]
+s_or_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x87]
 
-s_or_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x87]
+s_or_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x87]
 
-s_or_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x87]
+s_or_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x87]
 
-s_or_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x87]
+s_or_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x87]
 
-s_or_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x87]
+s_or_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x87]
 
-s_or_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x87]
+s_or_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x87]
 
-s_or_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x87]
+s_or_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x87]
 
-s_or_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x87]
+s_or_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x87]
 
-s_or_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x87]
+s_or_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x87]
 
-s_or_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x87]
+s_or_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x87]
 
-s_or_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x87]
+s_or_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x87]
 
-s_or_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x87]
+s_or_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x87]
 
-s_or_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x87]
+s_or_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x87]
 
-s_or_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x87]
+s_or_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x87]
 
-s_or_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x87,0x56,0x34,0x12,0xaf]
+s_or_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x87,0x56,0x34,0x12,0xaf]
 
-s_or_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x87,0x73,0x72,0x71,0x3f]
+s_or_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x87,0x73,0x72,0x71,0x3f]
 
-s_or_b64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x87]
+s_or_b64 s[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8a,0x87]
 
-s_or_b64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0x87]
+s_or_b64 s[12:13], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8c,0x87]
 
-s_or_b64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe4,0x87]
+s_or_b64 s[100:101], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe4,0x87]
 
-s_or_b64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0x87]
+s_or_b64 flat_scratch, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe6,0x87]
 
-s_or_b64 vcc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0x87]
+s_or_b64 vcc, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xea,0x87]
 
-s_or_b64 tba, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0x87]
+s_or_b64 tba, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xec,0x87]
 
-s_or_b64 tma, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0x87]
+s_or_b64 tma, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xee,0x87]
 
-s_or_b64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0x87]
+s_or_b64 ttmp[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfa,0x87]
 
-s_or_b64 exec, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0x87]
+s_or_b64 exec, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfe,0x87]
 
-s_or_b64 s[0:1], s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x87]
+s_or_b64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x8a,0x87]
 
-s_or_b64 s[0:1], s[100:101], s[0:1]
-// CHECK: [0x64,0x00,0x80,0x87]
+s_or_b64 s[10:11], s[100:101], s[4:5]
+// CHECK: [0x64,0x04,0x8a,0x87]
 
-s_or_b64 s[0:1], flat_scratch, s[0:1]
-// CHECK: [0x66,0x00,0x80,0x87]
+s_or_b64 s[10:11], flat_scratch, s[4:5]
+// CHECK: [0x66,0x04,0x8a,0x87]
 
-s_or_b64 s[0:1], vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x87]
+s_or_b64 s[10:11], vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x8a,0x87]
 
-s_or_b64 s[0:1], tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x87]
+s_or_b64 s[10:11], tba, s[4:5]
+// CHECK: [0x6c,0x04,0x8a,0x87]
 
-s_or_b64 s[0:1], tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x87]
+s_or_b64 s[10:11], tma, s[4:5]
+// CHECK: [0x6e,0x04,0x8a,0x87]
 
-s_or_b64 s[0:1], ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x87]
+s_or_b64 s[10:11], ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x8a,0x87]
 
-s_or_b64 s[0:1], exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x87]
+s_or_b64 s[10:11], exec, s[4:5]
+// CHECK: [0x7e,0x04,0x8a,0x87]
 
-s_or_b64 s[0:1], 0, s[0:1]
-// CHECK: [0x80,0x00,0x80,0x87]
+s_or_b64 s[10:11], 0, s[4:5]
+// CHECK: [0x80,0x04,0x8a,0x87]
 
-s_or_b64 s[0:1], -1, s[0:1]
-// CHECK: [0xc1,0x00,0x80,0x87]
+s_or_b64 s[10:11], -1, s[4:5]
+// CHECK: [0xc1,0x04,0x8a,0x87]
 
-s_or_b64 s[0:1], 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x80,0x87]
+s_or_b64 s[10:11], 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x8a,0x87]
 
-s_or_b64 s[0:1], -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x80,0x87]
+s_or_b64 s[10:11], -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x8a,0x87]
 
-s_or_b64 s[0:1], 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x87,0x56,0x34,0x12,0xaf]
+s_or_b64 s[10:11], 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x87,0x56,0x34,0x12,0xaf]
 
-s_or_b64 s[0:1], 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x87,0x73,0x72,0x71,0x3f]
+s_or_b64 s[10:11], 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x87,0x73,0x72,0x71,0x3f]
 
-s_or_b64 s[0:1], s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x87]
+s_or_b64 s[10:11], s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x8a,0x87]
 
-s_or_b64 s[0:1], s[0:1], s[100:101]
-// CHECK: [0x00,0x64,0x80,0x87]
+s_or_b64 s[10:11], s[2:3], s[100:101]
+// CHECK: [0x02,0x64,0x8a,0x87]
 
-s_or_b64 s[0:1], s[0:1], flat_scratch
-// CHECK: [0x00,0x66,0x80,0x87]
+s_or_b64 s[10:11], s[2:3], flat_scratch
+// CHECK: [0x02,0x66,0x8a,0x87]
 
-s_or_b64 s[0:1], s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x87]
+s_or_b64 s[10:11], s[2:3], vcc
+// CHECK: [0x02,0x6a,0x8a,0x87]
 
-s_or_b64 s[0:1], s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x87]
+s_or_b64 s[10:11], s[2:3], tba
+// CHECK: [0x02,0x6c,0x8a,0x87]
 
-s_or_b64 s[0:1], s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x87]
+s_or_b64 s[10:11], s[2:3], tma
+// CHECK: [0x02,0x6e,0x8a,0x87]
 
-s_or_b64 s[0:1], s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x87]
+s_or_b64 s[10:11], s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x8a,0x87]
 
-s_or_b64 s[0:1], s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x87]
+s_or_b64 s[10:11], s[2:3], exec
+// CHECK: [0x02,0x7e,0x8a,0x87]
 
-s_or_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x87]
+s_or_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x87]
 
-s_or_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x87]
+s_or_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x87]
 
-s_or_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x87]
+s_or_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x87]
 
-s_or_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x87]
+s_or_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x87]
 
-s_or_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x87,0x56,0x34,0x12,0xaf]
+s_or_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x87,0x56,0x34,0x12,0xaf]
 
-s_or_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x87,0x73,0x72,0x71,0x3f]
+s_or_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x87,0x73,0x72,0x71,0x3f]
 
-s_xor_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x88]
+s_xor_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x88]
 
-s_xor_b32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x88]
+s_xor_b32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x88]
 
-s_xor_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x88]
+s_xor_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x88]
 
-s_xor_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x88]
+s_xor_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x88]
 
-s_xor_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x88]
+s_xor_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x88]
 
-s_xor_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x88]
+s_xor_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x88]
 
-s_xor_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x88]
+s_xor_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x88]
 
-s_xor_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x88]
+s_xor_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x88]
 
-s_xor_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x88]
+s_xor_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x88]
 
-s_xor_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x88]
+s_xor_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x88]
 
-s_xor_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x88]
+s_xor_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x88]
 
-s_xor_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x88]
+s_xor_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x88]
 
-s_xor_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x88]
+s_xor_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x88]
 
-s_xor_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x88]
+s_xor_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x88]
 
-s_xor_b32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x88]
+s_xor_b32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x88]
 
-s_xor_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x88]
+s_xor_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x88]
 
-s_xor_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x88]
+s_xor_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x88]
 
-s_xor_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x88]
+s_xor_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x88]
 
-s_xor_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x88]
+s_xor_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x88]
 
-s_xor_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x88]
+s_xor_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x88]
 
-s_xor_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x88]
+s_xor_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x88]
 
-s_xor_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x88]
+s_xor_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x88]
 
-s_xor_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x88]
+s_xor_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x88]
 
-s_xor_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x88]
+s_xor_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x88]
 
-s_xor_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x88]
+s_xor_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x88]
 
-s_xor_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x88]
+s_xor_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x88]
 
-s_xor_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x88]
+s_xor_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x88]
 
-s_xor_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x88]
+s_xor_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x88]
 
-s_xor_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x88]
+s_xor_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x88]
 
-s_xor_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x88]
+s_xor_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x88]
 
-s_xor_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x88]
+s_xor_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x88]
 
-s_xor_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x88,0x56,0x34,0x12,0xaf]
+s_xor_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x88,0x56,0x34,0x12,0xaf]
 
-s_xor_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x88,0x73,0x72,0x71,0x3f]
+s_xor_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x88,0x73,0x72,0x71,0x3f]
 
-s_xor_b32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x88]
+s_xor_b32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x88]
 
-s_xor_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x88]
+s_xor_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x88]
 
-s_xor_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x88]
+s_xor_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x88]
 
-s_xor_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x88]
+s_xor_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x88]
 
-s_xor_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x88]
+s_xor_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x88]
 
-s_xor_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x88]
+s_xor_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x88]
 
-s_xor_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x88]
+s_xor_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x88]
 
-s_xor_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x88]
+s_xor_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x88]
 
-s_xor_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x88]
+s_xor_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x88]
 
-s_xor_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x88]
+s_xor_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x88]
 
-s_xor_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x88]
+s_xor_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x88]
 
-s_xor_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x88]
+s_xor_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x88]
 
-s_xor_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x88]
+s_xor_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x88]
 
-s_xor_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x88]
+s_xor_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x88]
 
-s_xor_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x88]
+s_xor_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x88]
 
-s_xor_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x88]
+s_xor_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x88]
 
-s_xor_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x88]
+s_xor_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x88]
 
-s_xor_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x88,0x56,0x34,0x12,0xaf]
+s_xor_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x88,0x56,0x34,0x12,0xaf]
 
-s_xor_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x88,0x73,0x72,0x71,0x3f]
+s_xor_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x88,0x73,0x72,0x71,0x3f]
 
-s_xor_b64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x88]
+s_xor_b64 s[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8a,0x88]
 
-s_xor_b64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0x88]
+s_xor_b64 s[12:13], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8c,0x88]
 
-s_xor_b64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe4,0x88]
+s_xor_b64 s[100:101], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe4,0x88]
 
-s_xor_b64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0x88]
+s_xor_b64 flat_scratch, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe6,0x88]
 
-s_xor_b64 vcc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0x88]
+s_xor_b64 vcc, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xea,0x88]
 
-s_xor_b64 tba, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0x88]
+s_xor_b64 tba, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xec,0x88]
 
-s_xor_b64 tma, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0x88]
+s_xor_b64 tma, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xee,0x88]
 
-s_xor_b64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0x88]
+s_xor_b64 ttmp[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfa,0x88]
 
-s_xor_b64 exec, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0x88]
+s_xor_b64 exec, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfe,0x88]
 
-s_xor_b64 s[0:1], s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x88]
+s_xor_b64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x8a,0x88]
 
-s_xor_b64 s[0:1], s[100:101], s[0:1]
-// CHECK: [0x64,0x00,0x80,0x88]
+s_xor_b64 s[10:11], s[100:101], s[4:5]
+// CHECK: [0x64,0x04,0x8a,0x88]
 
-s_xor_b64 s[0:1], flat_scratch, s[0:1]
-// CHECK: [0x66,0x00,0x80,0x88]
+s_xor_b64 s[10:11], flat_scratch, s[4:5]
+// CHECK: [0x66,0x04,0x8a,0x88]
 
-s_xor_b64 s[0:1], vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x88]
+s_xor_b64 s[10:11], vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x8a,0x88]
 
-s_xor_b64 s[0:1], tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x88]
+s_xor_b64 s[10:11], tba, s[4:5]
+// CHECK: [0x6c,0x04,0x8a,0x88]
 
-s_xor_b64 s[0:1], tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x88]
+s_xor_b64 s[10:11], tma, s[4:5]
+// CHECK: [0x6e,0x04,0x8a,0x88]
 
-s_xor_b64 s[0:1], ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x88]
+s_xor_b64 s[10:11], ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x8a,0x88]
 
-s_xor_b64 s[0:1], exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x88]
+s_xor_b64 s[10:11], exec, s[4:5]
+// CHECK: [0x7e,0x04,0x8a,0x88]
 
-s_xor_b64 s[0:1], 0, s[0:1]
-// CHECK: [0x80,0x00,0x80,0x88]
+s_xor_b64 s[10:11], 0, s[4:5]
+// CHECK: [0x80,0x04,0x8a,0x88]
 
-s_xor_b64 s[0:1], -1, s[0:1]
-// CHECK: [0xc1,0x00,0x80,0x88]
+s_xor_b64 s[10:11], -1, s[4:5]
+// CHECK: [0xc1,0x04,0x8a,0x88]
 
-s_xor_b64 s[0:1], 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x80,0x88]
+s_xor_b64 s[10:11], 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x8a,0x88]
 
-s_xor_b64 s[0:1], -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x80,0x88]
+s_xor_b64 s[10:11], -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x8a,0x88]
 
-s_xor_b64 s[0:1], 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x88,0x56,0x34,0x12,0xaf]
+s_xor_b64 s[10:11], 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x88,0x56,0x34,0x12,0xaf]
 
-s_xor_b64 s[0:1], 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x88,0x73,0x72,0x71,0x3f]
+s_xor_b64 s[10:11], 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x88,0x73,0x72,0x71,0x3f]
 
-s_xor_b64 s[0:1], s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x88]
+s_xor_b64 s[10:11], s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x8a,0x88]
 
-s_xor_b64 s[0:1], s[0:1], s[100:101]
-// CHECK: [0x00,0x64,0x80,0x88]
+s_xor_b64 s[10:11], s[2:3], s[100:101]
+// CHECK: [0x02,0x64,0x8a,0x88]
 
-s_xor_b64 s[0:1], s[0:1], flat_scratch
-// CHECK: [0x00,0x66,0x80,0x88]
+s_xor_b64 s[10:11], s[2:3], flat_scratch
+// CHECK: [0x02,0x66,0x8a,0x88]
 
-s_xor_b64 s[0:1], s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x88]
+s_xor_b64 s[10:11], s[2:3], vcc
+// CHECK: [0x02,0x6a,0x8a,0x88]
 
-s_xor_b64 s[0:1], s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x88]
+s_xor_b64 s[10:11], s[2:3], tba
+// CHECK: [0x02,0x6c,0x8a,0x88]
 
-s_xor_b64 s[0:1], s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x88]
+s_xor_b64 s[10:11], s[2:3], tma
+// CHECK: [0x02,0x6e,0x8a,0x88]
 
-s_xor_b64 s[0:1], s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x88]
+s_xor_b64 s[10:11], s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x8a,0x88]
 
-s_xor_b64 s[0:1], s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x88]
+s_xor_b64 s[10:11], s[2:3], exec
+// CHECK: [0x02,0x7e,0x8a,0x88]
 
-s_xor_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x88]
+s_xor_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x88]
 
-s_xor_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x88]
+s_xor_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x88]
 
-s_xor_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x88]
+s_xor_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x88]
 
-s_xor_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x88]
+s_xor_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x88]
 
-s_xor_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x88,0x56,0x34,0x12,0xaf]
+s_xor_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x88,0x56,0x34,0x12,0xaf]
 
-s_xor_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x88,0x73,0x72,0x71,0x3f]
+s_xor_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x88,0x73,0x72,0x71,0x3f]
 
-s_andn2_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x89]
+s_andn2_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x89]
 
-s_andn2_b32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x89]
+s_andn2_b32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x89]
 
-s_andn2_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x89]
+s_andn2_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x89]
 
-s_andn2_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x89]
+s_andn2_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x89]
 
-s_andn2_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x89]
+s_andn2_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x89]
 
-s_andn2_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x89]
+s_andn2_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x89]
 
-s_andn2_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x89]
+s_andn2_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x89]
 
-s_andn2_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x89]
+s_andn2_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x89]
 
-s_andn2_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x89]
+s_andn2_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x89]
 
-s_andn2_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x89]
+s_andn2_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x89]
 
-s_andn2_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x89]
+s_andn2_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x89]
 
-s_andn2_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x89]
+s_andn2_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x89]
 
-s_andn2_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x89]
+s_andn2_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x89]
 
-s_andn2_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x89]
+s_andn2_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x89]
 
-s_andn2_b32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x89]
+s_andn2_b32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x89]
 
-s_andn2_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x89]
+s_andn2_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x89]
 
-s_andn2_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x89]
+s_andn2_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x89]
 
-s_andn2_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x89]
+s_andn2_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x89]
 
-s_andn2_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x89]
+s_andn2_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x89]
 
-s_andn2_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x89]
+s_andn2_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x89]
 
-s_andn2_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x89]
+s_andn2_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x89]
 
-s_andn2_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x89]
+s_andn2_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x89]
 
-s_andn2_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x89]
+s_andn2_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x89]
 
-s_andn2_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x89]
+s_andn2_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x89]
 
-s_andn2_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x89]
+s_andn2_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x89]
 
-s_andn2_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x89]
+s_andn2_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x89]
 
-s_andn2_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x89]
+s_andn2_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x89]
 
-s_andn2_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x89]
+s_andn2_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x89]
 
-s_andn2_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x89]
+s_andn2_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x89]
 
-s_andn2_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x89]
+s_andn2_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x89]
 
-s_andn2_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x89]
+s_andn2_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x89]
 
-s_andn2_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x89,0x56,0x34,0x12,0xaf]
+s_andn2_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x89,0x56,0x34,0x12,0xaf]
 
-s_andn2_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x89,0x73,0x72,0x71,0x3f]
+s_andn2_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x89,0x73,0x72,0x71,0x3f]
 
-s_andn2_b32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x89]
+s_andn2_b32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x89]
 
-s_andn2_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x89]
+s_andn2_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x89]
 
-s_andn2_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x89]
+s_andn2_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x89]
 
-s_andn2_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x89]
+s_andn2_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x89]
 
-s_andn2_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x89]
+s_andn2_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x89]
 
-s_andn2_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x89]
+s_andn2_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x89]
 
-s_andn2_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x89]
+s_andn2_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x89]
 
-s_andn2_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x89]
+s_andn2_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x89]
 
-s_andn2_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x89]
+s_andn2_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x89]
 
-s_andn2_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x89]
+s_andn2_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x89]
 
-s_andn2_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x89]
+s_andn2_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x89]
 
-s_andn2_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x89]
+s_andn2_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x89]
 
-s_andn2_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x89]
+s_andn2_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x89]
 
-s_andn2_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x89]
+s_andn2_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x89]
 
-s_andn2_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x89]
+s_andn2_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x89]
 
-s_andn2_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x89]
+s_andn2_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x89]
 
-s_andn2_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x89]
+s_andn2_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x89]
 
-s_andn2_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x89,0x56,0x34,0x12,0xaf]
+s_andn2_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x89,0x56,0x34,0x12,0xaf]
 
-s_andn2_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x89,0x73,0x72,0x71,0x3f]
+s_andn2_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x89,0x73,0x72,0x71,0x3f]
 
-s_andn2_b64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x89]
+s_andn2_b64 s[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8a,0x89]
 
-s_andn2_b64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0x89]
+s_andn2_b64 s[12:13], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8c,0x89]
 
-s_andn2_b64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe4,0x89]
+s_andn2_b64 s[100:101], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe4,0x89]
 
-s_andn2_b64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0x89]
+s_andn2_b64 flat_scratch, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe6,0x89]
 
-s_andn2_b64 vcc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0x89]
+s_andn2_b64 vcc, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xea,0x89]
 
-s_andn2_b64 tba, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0x89]
+s_andn2_b64 tba, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xec,0x89]
 
-s_andn2_b64 tma, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0x89]
+s_andn2_b64 tma, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xee,0x89]
 
-s_andn2_b64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0x89]
+s_andn2_b64 ttmp[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfa,0x89]
 
-s_andn2_b64 exec, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0x89]
+s_andn2_b64 exec, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfe,0x89]
 
-s_andn2_b64 s[0:1], s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x89]
+s_andn2_b64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x8a,0x89]
 
-s_andn2_b64 s[0:1], s[100:101], s[0:1]
-// CHECK: [0x64,0x00,0x80,0x89]
+s_andn2_b64 s[10:11], s[100:101], s[4:5]
+// CHECK: [0x64,0x04,0x8a,0x89]
 
-s_andn2_b64 s[0:1], flat_scratch, s[0:1]
-// CHECK: [0x66,0x00,0x80,0x89]
+s_andn2_b64 s[10:11], flat_scratch, s[4:5]
+// CHECK: [0x66,0x04,0x8a,0x89]
 
-s_andn2_b64 s[0:1], vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x89]
+s_andn2_b64 s[10:11], vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x8a,0x89]
 
-s_andn2_b64 s[0:1], tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x89]
+s_andn2_b64 s[10:11], tba, s[4:5]
+// CHECK: [0x6c,0x04,0x8a,0x89]
 
-s_andn2_b64 s[0:1], tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x89]
+s_andn2_b64 s[10:11], tma, s[4:5]
+// CHECK: [0x6e,0x04,0x8a,0x89]
 
-s_andn2_b64 s[0:1], ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x89]
+s_andn2_b64 s[10:11], ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x8a,0x89]
 
-s_andn2_b64 s[0:1], exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x89]
+s_andn2_b64 s[10:11], exec, s[4:5]
+// CHECK: [0x7e,0x04,0x8a,0x89]
 
-s_andn2_b64 s[0:1], 0, s[0:1]
-// CHECK: [0x80,0x00,0x80,0x89]
+s_andn2_b64 s[10:11], 0, s[4:5]
+// CHECK: [0x80,0x04,0x8a,0x89]
 
-s_andn2_b64 s[0:1], -1, s[0:1]
-// CHECK: [0xc1,0x00,0x80,0x89]
+s_andn2_b64 s[10:11], -1, s[4:5]
+// CHECK: [0xc1,0x04,0x8a,0x89]
 
-s_andn2_b64 s[0:1], 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x80,0x89]
+s_andn2_b64 s[10:11], 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x8a,0x89]
 
-s_andn2_b64 s[0:1], -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x80,0x89]
+s_andn2_b64 s[10:11], -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x8a,0x89]
 
-s_andn2_b64 s[0:1], 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x89,0x56,0x34,0x12,0xaf]
+s_andn2_b64 s[10:11], 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x89,0x56,0x34,0x12,0xaf]
 
-s_andn2_b64 s[0:1], 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x89,0x73,0x72,0x71,0x3f]
+s_andn2_b64 s[10:11], 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x89,0x73,0x72,0x71,0x3f]
 
-s_andn2_b64 s[0:1], s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x89]
+s_andn2_b64 s[10:11], s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x8a,0x89]
 
-s_andn2_b64 s[0:1], s[0:1], s[100:101]
-// CHECK: [0x00,0x64,0x80,0x89]
+s_andn2_b64 s[10:11], s[2:3], s[100:101]
+// CHECK: [0x02,0x64,0x8a,0x89]
 
-s_andn2_b64 s[0:1], s[0:1], flat_scratch
-// CHECK: [0x00,0x66,0x80,0x89]
+s_andn2_b64 s[10:11], s[2:3], flat_scratch
+// CHECK: [0x02,0x66,0x8a,0x89]
 
-s_andn2_b64 s[0:1], s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x89]
+s_andn2_b64 s[10:11], s[2:3], vcc
+// CHECK: [0x02,0x6a,0x8a,0x89]
 
-s_andn2_b64 s[0:1], s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x89]
+s_andn2_b64 s[10:11], s[2:3], tba
+// CHECK: [0x02,0x6c,0x8a,0x89]
 
-s_andn2_b64 s[0:1], s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x89]
+s_andn2_b64 s[10:11], s[2:3], tma
+// CHECK: [0x02,0x6e,0x8a,0x89]
 
-s_andn2_b64 s[0:1], s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x89]
+s_andn2_b64 s[10:11], s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x8a,0x89]
 
-s_andn2_b64 s[0:1], s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x89]
+s_andn2_b64 s[10:11], s[2:3], exec
+// CHECK: [0x02,0x7e,0x8a,0x89]
 
-s_andn2_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x89]
+s_andn2_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x89]
 
-s_andn2_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x89]
+s_andn2_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x89]
 
-s_andn2_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x89]
+s_andn2_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x89]
 
-s_andn2_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x89]
+s_andn2_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x89]
 
-s_andn2_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x89,0x56,0x34,0x12,0xaf]
+s_andn2_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x89,0x56,0x34,0x12,0xaf]
 
-s_andn2_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x89,0x73,0x72,0x71,0x3f]
+s_andn2_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x89,0x73,0x72,0x71,0x3f]
 
-s_orn2_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x8a]
+s_orn2_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x8a]
 
-s_orn2_b32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x8a]
+s_orn2_b32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x8a]
 
-s_orn2_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x8a]
+s_orn2_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x8a]
 
-s_orn2_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x8a]
+s_orn2_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x8a]
 
-s_orn2_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x8a]
+s_orn2_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x8a]
 
-s_orn2_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x8a]
+s_orn2_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x8a]
 
-s_orn2_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x8a]
+s_orn2_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x8a]
 
-s_orn2_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x8a]
+s_orn2_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x8a]
 
-s_orn2_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x8a]
+s_orn2_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x8a]
 
-s_orn2_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x8a]
+s_orn2_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x8a]
 
-s_orn2_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x8a]
+s_orn2_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x8a]
 
-s_orn2_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x8a]
+s_orn2_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x8a]
 
-s_orn2_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x8a]
+s_orn2_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x8a]
 
-s_orn2_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x8a]
+s_orn2_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x8a]
 
-s_orn2_b32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x8a]
+s_orn2_b32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x8a]
 
-s_orn2_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x8a]
+s_orn2_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x8a]
 
-s_orn2_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x8a]
+s_orn2_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x8a]
 
-s_orn2_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x8a]
+s_orn2_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x8a]
 
-s_orn2_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x8a]
+s_orn2_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x8a]
 
-s_orn2_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x8a]
+s_orn2_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x8a]
 
-s_orn2_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x8a]
+s_orn2_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x8a]
 
-s_orn2_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x8a]
+s_orn2_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x8a]
 
-s_orn2_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x8a]
+s_orn2_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x8a]
 
-s_orn2_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x8a]
+s_orn2_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x8a]
 
-s_orn2_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x8a]
+s_orn2_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x8a]
 
-s_orn2_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x8a]
+s_orn2_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x8a]
 
-s_orn2_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x8a]
+s_orn2_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x8a]
 
-s_orn2_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x8a]
+s_orn2_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x8a]
 
-s_orn2_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x8a]
+s_orn2_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x8a]
 
-s_orn2_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x8a]
+s_orn2_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x8a]
 
-s_orn2_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x8a]
+s_orn2_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x8a]
 
-s_orn2_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x8a,0x56,0x34,0x12,0xaf]
+s_orn2_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x8a,0x56,0x34,0x12,0xaf]
 
-s_orn2_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x8a,0x73,0x72,0x71,0x3f]
+s_orn2_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x8a,0x73,0x72,0x71,0x3f]
 
-s_orn2_b32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x8a]
+s_orn2_b32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x8a]
 
-s_orn2_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x8a]
+s_orn2_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x8a]
 
-s_orn2_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x8a]
+s_orn2_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x8a]
 
-s_orn2_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x8a]
+s_orn2_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x8a]
 
-s_orn2_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x8a]
+s_orn2_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x8a]
 
-s_orn2_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x8a]
+s_orn2_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x8a]
 
-s_orn2_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x8a]
+s_orn2_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x8a]
 
-s_orn2_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x8a]
+s_orn2_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x8a]
 
-s_orn2_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x8a]
+s_orn2_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x8a]
 
-s_orn2_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x8a]
+s_orn2_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x8a]
 
-s_orn2_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x8a]
+s_orn2_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x8a]
 
-s_orn2_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x8a]
+s_orn2_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x8a]
 
-s_orn2_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x8a]
+s_orn2_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x8a]
 
-s_orn2_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x8a]
+s_orn2_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x8a]
 
-s_orn2_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x8a]
+s_orn2_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x8a]
 
-s_orn2_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x8a]
+s_orn2_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x8a]
 
-s_orn2_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x8a]
+s_orn2_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x8a]
 
-s_orn2_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x8a,0x56,0x34,0x12,0xaf]
+s_orn2_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x8a,0x56,0x34,0x12,0xaf]
 
-s_orn2_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x8a,0x73,0x72,0x71,0x3f]
+s_orn2_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x8a,0x73,0x72,0x71,0x3f]
 
-s_orn2_b64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x8a]
+s_orn2_b64 s[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8a,0x8a]
 
-s_orn2_b64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0x8a]
+s_orn2_b64 s[12:13], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8c,0x8a]
 
-s_orn2_b64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe4,0x8a]
+s_orn2_b64 s[100:101], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe4,0x8a]
 
-s_orn2_b64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0x8a]
+s_orn2_b64 flat_scratch, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe6,0x8a]
 
-s_orn2_b64 vcc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0x8a]
+s_orn2_b64 vcc, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xea,0x8a]
 
-s_orn2_b64 tba, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0x8a]
+s_orn2_b64 tba, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xec,0x8a]
 
-s_orn2_b64 tma, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0x8a]
+s_orn2_b64 tma, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xee,0x8a]
 
-s_orn2_b64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0x8a]
+s_orn2_b64 ttmp[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfa,0x8a]
 
-s_orn2_b64 exec, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0x8a]
+s_orn2_b64 exec, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfe,0x8a]
 
-s_orn2_b64 s[0:1], s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x8a]
+s_orn2_b64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], s[100:101], s[0:1]
-// CHECK: [0x64,0x00,0x80,0x8a]
+s_orn2_b64 s[10:11], s[100:101], s[4:5]
+// CHECK: [0x64,0x04,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], flat_scratch, s[0:1]
-// CHECK: [0x66,0x00,0x80,0x8a]
+s_orn2_b64 s[10:11], flat_scratch, s[4:5]
+// CHECK: [0x66,0x04,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x8a]
+s_orn2_b64 s[10:11], vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x8a]
+s_orn2_b64 s[10:11], tba, s[4:5]
+// CHECK: [0x6c,0x04,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x8a]
+s_orn2_b64 s[10:11], tma, s[4:5]
+// CHECK: [0x6e,0x04,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x8a]
+s_orn2_b64 s[10:11], ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x8a]
+s_orn2_b64 s[10:11], exec, s[4:5]
+// CHECK: [0x7e,0x04,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], 0, s[0:1]
-// CHECK: [0x80,0x00,0x80,0x8a]
+s_orn2_b64 s[10:11], 0, s[4:5]
+// CHECK: [0x80,0x04,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], -1, s[0:1]
-// CHECK: [0xc1,0x00,0x80,0x8a]
+s_orn2_b64 s[10:11], -1, s[4:5]
+// CHECK: [0xc1,0x04,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x80,0x8a]
+s_orn2_b64 s[10:11], 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x80,0x8a]
+s_orn2_b64 s[10:11], -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x8a,0x56,0x34,0x12,0xaf]
+s_orn2_b64 s[10:11], 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x8a,0x56,0x34,0x12,0xaf]
 
-s_orn2_b64 s[0:1], 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x8a,0x73,0x72,0x71,0x3f]
+s_orn2_b64 s[10:11], 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x8a,0x73,0x72,0x71,0x3f]
 
-s_orn2_b64 s[0:1], s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x8a]
+s_orn2_b64 s[10:11], s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], s[0:1], s[100:101]
-// CHECK: [0x00,0x64,0x80,0x8a]
+s_orn2_b64 s[10:11], s[2:3], s[100:101]
+// CHECK: [0x02,0x64,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], s[0:1], flat_scratch
-// CHECK: [0x00,0x66,0x80,0x8a]
+s_orn2_b64 s[10:11], s[2:3], flat_scratch
+// CHECK: [0x02,0x66,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x8a]
+s_orn2_b64 s[10:11], s[2:3], vcc
+// CHECK: [0x02,0x6a,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x8a]
+s_orn2_b64 s[10:11], s[2:3], tba
+// CHECK: [0x02,0x6c,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x8a]
+s_orn2_b64 s[10:11], s[2:3], tma
+// CHECK: [0x02,0x6e,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x8a]
+s_orn2_b64 s[10:11], s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x8a]
+s_orn2_b64 s[10:11], s[2:3], exec
+// CHECK: [0x02,0x7e,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x8a]
+s_orn2_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x8a]
+s_orn2_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x8a]
+s_orn2_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x8a]
+s_orn2_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x8a]
 
-s_orn2_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x8a,0x56,0x34,0x12,0xaf]
+s_orn2_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x8a,0x56,0x34,0x12,0xaf]
 
-s_orn2_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x8a,0x73,0x72,0x71,0x3f]
+s_orn2_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x8a,0x73,0x72,0x71,0x3f]
 
-s_nand_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x8b]
+s_nand_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x8b]
 
-s_nand_b32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x8b]
+s_nand_b32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x8b]
 
-s_nand_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x8b]
+s_nand_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x8b]
 
-s_nand_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x8b]
+s_nand_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x8b]
 
-s_nand_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x8b]
+s_nand_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x8b]
 
-s_nand_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x8b]
+s_nand_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x8b]
 
-s_nand_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x8b]
+s_nand_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x8b]
 
-s_nand_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x8b]
+s_nand_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x8b]
 
-s_nand_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x8b]
+s_nand_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x8b]
 
-s_nand_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x8b]
+s_nand_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x8b]
 
-s_nand_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x8b]
+s_nand_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x8b]
 
-s_nand_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x8b]
+s_nand_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x8b]
 
-s_nand_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x8b]
+s_nand_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x8b]
 
-s_nand_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x8b]
+s_nand_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x8b]
 
-s_nand_b32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x8b]
+s_nand_b32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x8b]
 
-s_nand_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x8b]
+s_nand_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x8b]
 
-s_nand_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x8b]
+s_nand_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x8b]
 
-s_nand_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x8b]
+s_nand_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x8b]
 
-s_nand_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x8b]
+s_nand_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x8b]
 
-s_nand_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x8b]
+s_nand_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x8b]
 
-s_nand_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x8b]
+s_nand_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x8b]
 
-s_nand_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x8b]
+s_nand_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x8b]
 
-s_nand_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x8b]
+s_nand_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x8b]
 
-s_nand_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x8b]
+s_nand_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x8b]
 
-s_nand_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x8b]
+s_nand_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x8b]
 
-s_nand_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x8b]
+s_nand_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x8b]
 
-s_nand_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x8b]
+s_nand_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x8b]
 
-s_nand_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x8b]
+s_nand_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x8b]
 
-s_nand_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x8b]
+s_nand_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x8b]
 
-s_nand_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x8b]
+s_nand_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x8b]
 
-s_nand_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x8b]
+s_nand_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x8b]
 
-s_nand_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x8b,0x56,0x34,0x12,0xaf]
+s_nand_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x8b,0x56,0x34,0x12,0xaf]
 
-s_nand_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x8b,0x73,0x72,0x71,0x3f]
+s_nand_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x8b,0x73,0x72,0x71,0x3f]
 
-s_nand_b32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x8b]
+s_nand_b32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x8b]
 
-s_nand_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x8b]
+s_nand_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x8b]
 
-s_nand_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x8b]
+s_nand_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x8b]
 
-s_nand_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x8b]
+s_nand_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x8b]
 
-s_nand_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x8b]
+s_nand_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x8b]
 
-s_nand_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x8b]
+s_nand_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x8b]
 
-s_nand_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x8b]
+s_nand_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x8b]
 
-s_nand_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x8b]
+s_nand_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x8b]
 
-s_nand_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x8b]
+s_nand_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x8b]
 
-s_nand_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x8b]
+s_nand_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x8b]
 
-s_nand_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x8b]
+s_nand_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x8b]
 
-s_nand_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x8b]
+s_nand_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x8b]
 
-s_nand_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x8b]
+s_nand_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x8b]
 
-s_nand_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x8b]
+s_nand_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x8b]
 
-s_nand_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x8b]
+s_nand_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x8b]
 
-s_nand_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x8b]
+s_nand_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x8b]
 
-s_nand_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x8b]
+s_nand_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x8b]
 
-s_nand_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x8b,0x56,0x34,0x12,0xaf]
+s_nand_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x8b,0x56,0x34,0x12,0xaf]
 
-s_nand_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x8b,0x73,0x72,0x71,0x3f]
+s_nand_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x8b,0x73,0x72,0x71,0x3f]
 
-s_nand_b64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x8b]
+s_nand_b64 s[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8a,0x8b]
 
-s_nand_b64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0x8b]
+s_nand_b64 s[12:13], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8c,0x8b]
 
-s_nand_b64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe4,0x8b]
+s_nand_b64 s[100:101], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe4,0x8b]
 
-s_nand_b64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0x8b]
+s_nand_b64 flat_scratch, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe6,0x8b]
 
-s_nand_b64 vcc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0x8b]
+s_nand_b64 vcc, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xea,0x8b]
 
-s_nand_b64 tba, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0x8b]
+s_nand_b64 tba, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xec,0x8b]
 
-s_nand_b64 tma, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0x8b]
+s_nand_b64 tma, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xee,0x8b]
 
-s_nand_b64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0x8b]
+s_nand_b64 ttmp[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfa,0x8b]
 
-s_nand_b64 exec, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0x8b]
+s_nand_b64 exec, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfe,0x8b]
 
-s_nand_b64 s[0:1], s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x8b]
+s_nand_b64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x8a,0x8b]
 
-s_nand_b64 s[0:1], s[100:101], s[0:1]
-// CHECK: [0x64,0x00,0x80,0x8b]
+s_nand_b64 s[10:11], s[100:101], s[4:5]
+// CHECK: [0x64,0x04,0x8a,0x8b]
 
-s_nand_b64 s[0:1], flat_scratch, s[0:1]
-// CHECK: [0x66,0x00,0x80,0x8b]
+s_nand_b64 s[10:11], flat_scratch, s[4:5]
+// CHECK: [0x66,0x04,0x8a,0x8b]
 
-s_nand_b64 s[0:1], vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x8b]
+s_nand_b64 s[10:11], vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x8a,0x8b]
 
-s_nand_b64 s[0:1], tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x8b]
+s_nand_b64 s[10:11], tba, s[4:5]
+// CHECK: [0x6c,0x04,0x8a,0x8b]
 
-s_nand_b64 s[0:1], tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x8b]
+s_nand_b64 s[10:11], tma, s[4:5]
+// CHECK: [0x6e,0x04,0x8a,0x8b]
 
-s_nand_b64 s[0:1], ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x8b]
+s_nand_b64 s[10:11], ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x8a,0x8b]
 
-s_nand_b64 s[0:1], exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x8b]
+s_nand_b64 s[10:11], exec, s[4:5]
+// CHECK: [0x7e,0x04,0x8a,0x8b]
 
-s_nand_b64 s[0:1], 0, s[0:1]
-// CHECK: [0x80,0x00,0x80,0x8b]
+s_nand_b64 s[10:11], 0, s[4:5]
+// CHECK: [0x80,0x04,0x8a,0x8b]
 
-s_nand_b64 s[0:1], -1, s[0:1]
-// CHECK: [0xc1,0x00,0x80,0x8b]
+s_nand_b64 s[10:11], -1, s[4:5]
+// CHECK: [0xc1,0x04,0x8a,0x8b]
 
-s_nand_b64 s[0:1], 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x80,0x8b]
+s_nand_b64 s[10:11], 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x8a,0x8b]
 
-s_nand_b64 s[0:1], -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x80,0x8b]
+s_nand_b64 s[10:11], -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x8a,0x8b]
 
-s_nand_b64 s[0:1], 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x8b,0x56,0x34,0x12,0xaf]
+s_nand_b64 s[10:11], 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x8b,0x56,0x34,0x12,0xaf]
 
-s_nand_b64 s[0:1], 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x8b,0x73,0x72,0x71,0x3f]
+s_nand_b64 s[10:11], 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x8b,0x73,0x72,0x71,0x3f]
 
-s_nand_b64 s[0:1], s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x8b]
+s_nand_b64 s[10:11], s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x8a,0x8b]
 
-s_nand_b64 s[0:1], s[0:1], s[100:101]
-// CHECK: [0x00,0x64,0x80,0x8b]
+s_nand_b64 s[10:11], s[2:3], s[100:101]
+// CHECK: [0x02,0x64,0x8a,0x8b]
 
-s_nand_b64 s[0:1], s[0:1], flat_scratch
-// CHECK: [0x00,0x66,0x80,0x8b]
+s_nand_b64 s[10:11], s[2:3], flat_scratch
+// CHECK: [0x02,0x66,0x8a,0x8b]
 
-s_nand_b64 s[0:1], s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x8b]
+s_nand_b64 s[10:11], s[2:3], vcc
+// CHECK: [0x02,0x6a,0x8a,0x8b]
 
-s_nand_b64 s[0:1], s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x8b]
+s_nand_b64 s[10:11], s[2:3], tba
+// CHECK: [0x02,0x6c,0x8a,0x8b]
 
-s_nand_b64 s[0:1], s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x8b]
+s_nand_b64 s[10:11], s[2:3], tma
+// CHECK: [0x02,0x6e,0x8a,0x8b]
 
-s_nand_b64 s[0:1], s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x8b]
+s_nand_b64 s[10:11], s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x8a,0x8b]
 
-s_nand_b64 s[0:1], s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x8b]
+s_nand_b64 s[10:11], s[2:3], exec
+// CHECK: [0x02,0x7e,0x8a,0x8b]
 
-s_nand_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x8b]
+s_nand_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x8b]
 
-s_nand_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x8b]
+s_nand_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x8b]
 
-s_nand_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x8b]
+s_nand_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x8b]
 
-s_nand_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x8b]
+s_nand_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x8b]
 
-s_nand_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x8b,0x56,0x34,0x12,0xaf]
+s_nand_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x8b,0x56,0x34,0x12,0xaf]
 
-s_nand_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x8b,0x73,0x72,0x71,0x3f]
+s_nand_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x8b,0x73,0x72,0x71,0x3f]
 
-s_nor_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x8c]
+s_nor_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x8c]
 
-s_nor_b32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x8c]
+s_nor_b32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x8c]
 
-s_nor_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x8c]
+s_nor_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x8c]
 
-s_nor_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x8c]
+s_nor_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x8c]
 
-s_nor_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x8c]
+s_nor_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x8c]
 
-s_nor_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x8c]
+s_nor_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x8c]
 
-s_nor_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x8c]
+s_nor_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x8c]
 
-s_nor_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x8c]
+s_nor_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x8c]
 
-s_nor_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x8c]
+s_nor_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x8c]
 
-s_nor_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x8c]
+s_nor_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x8c]
 
-s_nor_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x8c]
+s_nor_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x8c]
 
-s_nor_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x8c]
+s_nor_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x8c]
 
-s_nor_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x8c]
+s_nor_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x8c]
 
-s_nor_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x8c]
+s_nor_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x8c]
 
-s_nor_b32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x8c]
+s_nor_b32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x8c]
 
-s_nor_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x8c]
+s_nor_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x8c]
 
-s_nor_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x8c]
+s_nor_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x8c]
 
-s_nor_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x8c]
+s_nor_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x8c]
 
-s_nor_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x8c]
+s_nor_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x8c]
 
-s_nor_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x8c]
+s_nor_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x8c]
 
-s_nor_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x8c]
+s_nor_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x8c]
 
-s_nor_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x8c]
+s_nor_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x8c]
 
-s_nor_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x8c]
+s_nor_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x8c]
 
-s_nor_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x8c]
+s_nor_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x8c]
 
-s_nor_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x8c]
+s_nor_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x8c]
 
-s_nor_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x8c]
+s_nor_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x8c]
 
-s_nor_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x8c]
+s_nor_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x8c]
 
-s_nor_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x8c]
+s_nor_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x8c]
 
-s_nor_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x8c]
+s_nor_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x8c]
 
-s_nor_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x8c]
+s_nor_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x8c]
 
-s_nor_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x8c]
+s_nor_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x8c]
 
-s_nor_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x8c,0x56,0x34,0x12,0xaf]
+s_nor_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x8c,0x56,0x34,0x12,0xaf]
 
-s_nor_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x8c,0x73,0x72,0x71,0x3f]
+s_nor_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x8c,0x73,0x72,0x71,0x3f]
 
-s_nor_b32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x8c]
+s_nor_b32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x8c]
 
-s_nor_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x8c]
+s_nor_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x8c]
 
-s_nor_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x8c]
+s_nor_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x8c]
 
-s_nor_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x8c]
+s_nor_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x8c]
 
-s_nor_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x8c]
+s_nor_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x8c]
 
-s_nor_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x8c]
+s_nor_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x8c]
 
-s_nor_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x8c]
+s_nor_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x8c]
 
-s_nor_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x8c]
+s_nor_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x8c]
 
-s_nor_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x8c]
+s_nor_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x8c]
 
-s_nor_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x8c]
+s_nor_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x8c]
 
-s_nor_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x8c]
+s_nor_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x8c]
 
-s_nor_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x8c]
+s_nor_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x8c]
 
-s_nor_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x8c]
+s_nor_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x8c]
 
-s_nor_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x8c]
+s_nor_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x8c]
 
-s_nor_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x8c]
+s_nor_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x8c]
 
-s_nor_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x8c]
+s_nor_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x8c]
 
-s_nor_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x8c]
+s_nor_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x8c]
 
-s_nor_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x8c,0x56,0x34,0x12,0xaf]
+s_nor_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x8c,0x56,0x34,0x12,0xaf]
 
-s_nor_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x8c,0x73,0x72,0x71,0x3f]
+s_nor_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x8c,0x73,0x72,0x71,0x3f]
 
-s_nor_b64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x8c]
+s_nor_b64 s[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8a,0x8c]
 
-s_nor_b64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0x8c]
+s_nor_b64 s[12:13], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8c,0x8c]
 
-s_nor_b64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe4,0x8c]
+s_nor_b64 s[100:101], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe4,0x8c]
 
-s_nor_b64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0x8c]
+s_nor_b64 flat_scratch, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe6,0x8c]
 
-s_nor_b64 vcc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0x8c]
+s_nor_b64 vcc, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xea,0x8c]
 
-s_nor_b64 tba, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0x8c]
+s_nor_b64 tba, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xec,0x8c]
 
-s_nor_b64 tma, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0x8c]
+s_nor_b64 tma, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xee,0x8c]
 
-s_nor_b64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0x8c]
+s_nor_b64 ttmp[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfa,0x8c]
 
-s_nor_b64 exec, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0x8c]
+s_nor_b64 exec, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfe,0x8c]
 
-s_nor_b64 s[0:1], s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x8c]
+s_nor_b64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x8a,0x8c]
 
-s_nor_b64 s[0:1], s[100:101], s[0:1]
-// CHECK: [0x64,0x00,0x80,0x8c]
+s_nor_b64 s[10:11], s[100:101], s[4:5]
+// CHECK: [0x64,0x04,0x8a,0x8c]
 
-s_nor_b64 s[0:1], flat_scratch, s[0:1]
-// CHECK: [0x66,0x00,0x80,0x8c]
+s_nor_b64 s[10:11], flat_scratch, s[4:5]
+// CHECK: [0x66,0x04,0x8a,0x8c]
 
-s_nor_b64 s[0:1], vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x8c]
+s_nor_b64 s[10:11], vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x8a,0x8c]
 
-s_nor_b64 s[0:1], tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x8c]
+s_nor_b64 s[10:11], tba, s[4:5]
+// CHECK: [0x6c,0x04,0x8a,0x8c]
 
-s_nor_b64 s[0:1], tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x8c]
+s_nor_b64 s[10:11], tma, s[4:5]
+// CHECK: [0x6e,0x04,0x8a,0x8c]
 
-s_nor_b64 s[0:1], ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x8c]
+s_nor_b64 s[10:11], ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x8a,0x8c]
 
-s_nor_b64 s[0:1], exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x8c]
+s_nor_b64 s[10:11], exec, s[4:5]
+// CHECK: [0x7e,0x04,0x8a,0x8c]
 
-s_nor_b64 s[0:1], 0, s[0:1]
-// CHECK: [0x80,0x00,0x80,0x8c]
+s_nor_b64 s[10:11], 0, s[4:5]
+// CHECK: [0x80,0x04,0x8a,0x8c]
 
-s_nor_b64 s[0:1], -1, s[0:1]
-// CHECK: [0xc1,0x00,0x80,0x8c]
+s_nor_b64 s[10:11], -1, s[4:5]
+// CHECK: [0xc1,0x04,0x8a,0x8c]
 
-s_nor_b64 s[0:1], 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x80,0x8c]
+s_nor_b64 s[10:11], 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x8a,0x8c]
 
-s_nor_b64 s[0:1], -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x80,0x8c]
+s_nor_b64 s[10:11], -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x8a,0x8c]
 
-s_nor_b64 s[0:1], 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x8c,0x56,0x34,0x12,0xaf]
+s_nor_b64 s[10:11], 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x8c,0x56,0x34,0x12,0xaf]
 
-s_nor_b64 s[0:1], 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x8c,0x73,0x72,0x71,0x3f]
+s_nor_b64 s[10:11], 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x8c,0x73,0x72,0x71,0x3f]
 
-s_nor_b64 s[0:1], s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x8c]
+s_nor_b64 s[10:11], s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x8a,0x8c]
 
-s_nor_b64 s[0:1], s[0:1], s[100:101]
-// CHECK: [0x00,0x64,0x80,0x8c]
+s_nor_b64 s[10:11], s[2:3], s[100:101]
+// CHECK: [0x02,0x64,0x8a,0x8c]
 
-s_nor_b64 s[0:1], s[0:1], flat_scratch
-// CHECK: [0x00,0x66,0x80,0x8c]
+s_nor_b64 s[10:11], s[2:3], flat_scratch
+// CHECK: [0x02,0x66,0x8a,0x8c]
 
-s_nor_b64 s[0:1], s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x8c]
+s_nor_b64 s[10:11], s[2:3], vcc
+// CHECK: [0x02,0x6a,0x8a,0x8c]
 
-s_nor_b64 s[0:1], s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x8c]
+s_nor_b64 s[10:11], s[2:3], tba
+// CHECK: [0x02,0x6c,0x8a,0x8c]
 
-s_nor_b64 s[0:1], s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x8c]
+s_nor_b64 s[10:11], s[2:3], tma
+// CHECK: [0x02,0x6e,0x8a,0x8c]
 
-s_nor_b64 s[0:1], s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x8c]
+s_nor_b64 s[10:11], s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x8a,0x8c]
 
-s_nor_b64 s[0:1], s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x8c]
+s_nor_b64 s[10:11], s[2:3], exec
+// CHECK: [0x02,0x7e,0x8a,0x8c]
 
-s_nor_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x8c]
+s_nor_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x8c]
 
-s_nor_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x8c]
+s_nor_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x8c]
 
-s_nor_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x8c]
+s_nor_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x8c]
 
-s_nor_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x8c]
+s_nor_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x8c]
 
-s_nor_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x8c,0x56,0x34,0x12,0xaf]
+s_nor_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x8c,0x56,0x34,0x12,0xaf]
 
-s_nor_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x8c,0x73,0x72,0x71,0x3f]
+s_nor_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x8c,0x73,0x72,0x71,0x3f]
 
-s_xnor_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x8d]
+s_xnor_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x8d]
 
-s_xnor_b32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x8d]
+s_xnor_b32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x8d]
 
-s_xnor_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x8d]
+s_xnor_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x8d]
 
-s_xnor_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x8d]
+s_xnor_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x8d]
 
-s_xnor_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x8d]
+s_xnor_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x8d]
 
-s_xnor_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x8d]
+s_xnor_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x8d]
 
-s_xnor_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x8d]
+s_xnor_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x8d]
 
-s_xnor_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x8d]
+s_xnor_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x8d]
 
-s_xnor_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x8d]
+s_xnor_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x8d]
 
-s_xnor_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x8d]
+s_xnor_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x8d]
 
-s_xnor_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x8d]
+s_xnor_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x8d]
 
-s_xnor_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x8d]
+s_xnor_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x8d]
 
-s_xnor_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x8d]
+s_xnor_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x8d]
 
-s_xnor_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x8d]
+s_xnor_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x8d]
 
-s_xnor_b32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x8d]
+s_xnor_b32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x8d]
 
-s_xnor_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x8d]
+s_xnor_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x8d]
 
-s_xnor_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x8d]
+s_xnor_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x8d]
 
-s_xnor_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x8d]
+s_xnor_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x8d]
 
-s_xnor_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x8d]
+s_xnor_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x8d]
 
-s_xnor_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x8d]
+s_xnor_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x8d]
 
-s_xnor_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x8d]
+s_xnor_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x8d]
 
-s_xnor_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x8d]
+s_xnor_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x8d]
 
-s_xnor_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x8d]
+s_xnor_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x8d]
 
-s_xnor_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x8d]
+s_xnor_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x8d]
 
-s_xnor_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x8d]
+s_xnor_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x8d]
 
-s_xnor_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x8d]
+s_xnor_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x8d]
 
-s_xnor_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x8d]
+s_xnor_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x8d]
 
-s_xnor_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x8d]
+s_xnor_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x8d]
 
-s_xnor_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x8d]
+s_xnor_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x8d]
 
-s_xnor_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x8d]
+s_xnor_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x8d]
 
-s_xnor_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x8d]
+s_xnor_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x8d]
 
-s_xnor_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x8d,0x56,0x34,0x12,0xaf]
+s_xnor_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x8d,0x56,0x34,0x12,0xaf]
 
-s_xnor_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x8d,0x73,0x72,0x71,0x3f]
+s_xnor_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x8d,0x73,0x72,0x71,0x3f]
 
-s_xnor_b32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x8d]
+s_xnor_b32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x8d]
 
-s_xnor_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x8d]
+s_xnor_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x8d]
 
-s_xnor_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x8d]
+s_xnor_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x8d]
 
-s_xnor_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x8d]
+s_xnor_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x8d]
 
-s_xnor_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x8d]
+s_xnor_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x8d]
 
-s_xnor_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x8d]
+s_xnor_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x8d]
 
-s_xnor_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x8d]
+s_xnor_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x8d]
 
-s_xnor_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x8d]
+s_xnor_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x8d]
 
-s_xnor_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x8d]
+s_xnor_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x8d]
 
-s_xnor_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x8d]
+s_xnor_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x8d]
 
-s_xnor_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x8d]
+s_xnor_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x8d]
 
-s_xnor_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x8d]
+s_xnor_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x8d]
 
-s_xnor_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x8d]
+s_xnor_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x8d]
 
-s_xnor_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x8d]
+s_xnor_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x8d]
 
-s_xnor_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x8d]
+s_xnor_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x8d]
 
-s_xnor_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x8d]
+s_xnor_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x8d]
 
-s_xnor_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x8d]
+s_xnor_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x8d]
 
-s_xnor_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x8d,0x56,0x34,0x12,0xaf]
+s_xnor_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x8d,0x56,0x34,0x12,0xaf]
 
-s_xnor_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x8d,0x73,0x72,0x71,0x3f]
+s_xnor_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x8d,0x73,0x72,0x71,0x3f]
 
-s_xnor_b64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x8d]
+s_xnor_b64 s[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8a,0x8d]
 
-s_xnor_b64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0x8d]
+s_xnor_b64 s[12:13], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x8c,0x8d]
 
-s_xnor_b64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe4,0x8d]
+s_xnor_b64 s[100:101], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe4,0x8d]
 
-s_xnor_b64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0x8d]
+s_xnor_b64 flat_scratch, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xe6,0x8d]
 
-s_xnor_b64 vcc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0x8d]
+s_xnor_b64 vcc, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xea,0x8d]
 
-s_xnor_b64 tba, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0x8d]
+s_xnor_b64 tba, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xec,0x8d]
 
-s_xnor_b64 tma, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0x8d]
+s_xnor_b64 tma, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xee,0x8d]
 
-s_xnor_b64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0x8d]
+s_xnor_b64 ttmp[10:11], s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfa,0x8d]
 
-s_xnor_b64 exec, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0x8d]
+s_xnor_b64 exec, s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0xfe,0x8d]
 
-s_xnor_b64 s[0:1], s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x8d]
+s_xnor_b64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], s[100:101], s[0:1]
-// CHECK: [0x64,0x00,0x80,0x8d]
+s_xnor_b64 s[10:11], s[100:101], s[4:5]
+// CHECK: [0x64,0x04,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], flat_scratch, s[0:1]
-// CHECK: [0x66,0x00,0x80,0x8d]
+s_xnor_b64 s[10:11], flat_scratch, s[4:5]
+// CHECK: [0x66,0x04,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x8d]
+s_xnor_b64 s[10:11], vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x8d]
+s_xnor_b64 s[10:11], tba, s[4:5]
+// CHECK: [0x6c,0x04,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x8d]
+s_xnor_b64 s[10:11], tma, s[4:5]
+// CHECK: [0x6e,0x04,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x8d]
+s_xnor_b64 s[10:11], ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x8d]
+s_xnor_b64 s[10:11], exec, s[4:5]
+// CHECK: [0x7e,0x04,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], 0, s[0:1]
-// CHECK: [0x80,0x00,0x80,0x8d]
+s_xnor_b64 s[10:11], 0, s[4:5]
+// CHECK: [0x80,0x04,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], -1, s[0:1]
-// CHECK: [0xc1,0x00,0x80,0x8d]
+s_xnor_b64 s[10:11], -1, s[4:5]
+// CHECK: [0xc1,0x04,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x80,0x8d]
+s_xnor_b64 s[10:11], 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x80,0x8d]
+s_xnor_b64 s[10:11], -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x8d,0x56,0x34,0x12,0xaf]
+s_xnor_b64 s[10:11], 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x8d,0x56,0x34,0x12,0xaf]
 
-s_xnor_b64 s[0:1], 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x80,0x8d,0x73,0x72,0x71,0x3f]
+s_xnor_b64 s[10:11], 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x8a,0x8d,0x73,0x72,0x71,0x3f]
 
-s_xnor_b64 s[0:1], s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x8d]
+s_xnor_b64 s[10:11], s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], s[0:1], s[100:101]
-// CHECK: [0x00,0x64,0x80,0x8d]
+s_xnor_b64 s[10:11], s[2:3], s[100:101]
+// CHECK: [0x02,0x64,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], s[0:1], flat_scratch
-// CHECK: [0x00,0x66,0x80,0x8d]
+s_xnor_b64 s[10:11], s[2:3], flat_scratch
+// CHECK: [0x02,0x66,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x8d]
+s_xnor_b64 s[10:11], s[2:3], vcc
+// CHECK: [0x02,0x6a,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x8d]
+s_xnor_b64 s[10:11], s[2:3], tba
+// CHECK: [0x02,0x6c,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x8d]
+s_xnor_b64 s[10:11], s[2:3], tma
+// CHECK: [0x02,0x6e,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x8d]
+s_xnor_b64 s[10:11], s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x8d]
+s_xnor_b64 s[10:11], s[2:3], exec
+// CHECK: [0x02,0x7e,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x8d]
+s_xnor_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x8d]
+s_xnor_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x8d]
+s_xnor_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x8d]
+s_xnor_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x8d]
 
-s_xnor_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x8d,0x56,0x34,0x12,0xaf]
+s_xnor_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x8d,0x56,0x34,0x12,0xaf]
 
-s_xnor_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x8d,0x73,0x72,0x71,0x3f]
+s_xnor_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x8d,0x73,0x72,0x71,0x3f]
 
-s_lshl_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x8e]
+s_lshl_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x8e]
 
-s_lshl_b32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x8e]
+s_lshl_b32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x8e]
 
-s_lshl_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x8e]
+s_lshl_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x8e]
 
-s_lshl_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x8e]
+s_lshl_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x8e]
 
-s_lshl_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x8e]
+s_lshl_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x8e]
 
-s_lshl_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x8e]
+s_lshl_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x8e]
 
-s_lshl_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x8e]
+s_lshl_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x8e]
 
-s_lshl_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x8e]
+s_lshl_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x8e]
 
-s_lshl_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x8e]
+s_lshl_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x8e]
 
-s_lshl_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x8e]
+s_lshl_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x8e]
 
-s_lshl_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x8e]
+s_lshl_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x8e]
 
-s_lshl_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x8e]
+s_lshl_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x8e]
 
-s_lshl_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x8e]
+s_lshl_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x8e]
 
-s_lshl_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x8e]
+s_lshl_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x8e]
 
-s_lshl_b32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x8e]
+s_lshl_b32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x8e]
 
-s_lshl_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x8e]
+s_lshl_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x8e]
 
-s_lshl_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x8e]
+s_lshl_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x8e]
 
-s_lshl_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x8e]
+s_lshl_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x8e]
 
-s_lshl_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x8e]
+s_lshl_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x8e]
 
-s_lshl_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x8e]
+s_lshl_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x8e]
 
-s_lshl_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x8e]
+s_lshl_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x8e]
 
-s_lshl_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x8e]
+s_lshl_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x8e]
 
-s_lshl_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x8e]
+s_lshl_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x8e]
 
-s_lshl_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x8e]
+s_lshl_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x8e]
 
-s_lshl_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x8e]
+s_lshl_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x8e]
 
-s_lshl_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x8e]
+s_lshl_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x8e]
 
-s_lshl_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x8e]
+s_lshl_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x8e]
 
-s_lshl_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x8e]
+s_lshl_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x8e]
 
-s_lshl_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x8e]
+s_lshl_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x8e]
 
-s_lshl_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x8e]
+s_lshl_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x8e]
 
-s_lshl_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x8e]
+s_lshl_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x8e]
 
-s_lshl_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x8e,0x56,0x34,0x12,0xaf]
+s_lshl_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x8e,0x56,0x34,0x12,0xaf]
 
-s_lshl_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x8e,0x73,0x72,0x71,0x3f]
+s_lshl_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x8e,0x73,0x72,0x71,0x3f]
 
-s_lshl_b32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x8e]
+s_lshl_b32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x8e]
 
-s_lshl_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x8e]
+s_lshl_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x8e]
 
-s_lshl_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x8e]
+s_lshl_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x8e]
 
-s_lshl_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x8e]
+s_lshl_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x8e]
 
-s_lshl_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x8e]
+s_lshl_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x8e]
 
-s_lshl_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x8e]
+s_lshl_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x8e]
 
-s_lshl_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x8e]
+s_lshl_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x8e]
 
-s_lshl_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x8e]
+s_lshl_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x8e]
 
-s_lshl_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x8e]
+s_lshl_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x8e]
 
-s_lshl_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x8e]
+s_lshl_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x8e]
 
-s_lshl_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x8e]
+s_lshl_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x8e]
 
-s_lshl_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x8e]
+s_lshl_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x8e]
 
-s_lshl_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x8e]
+s_lshl_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x8e]
 
-s_lshl_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x8e]
+s_lshl_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x8e]
 
-s_lshl_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x8e]
+s_lshl_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x8e]
 
-s_lshl_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x8e]
+s_lshl_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x8e]
 
-s_lshl_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x8e]
+s_lshl_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x8e]
 
-s_lshl_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x8e,0x56,0x34,0x12,0xaf]
+s_lshl_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x8e,0x56,0x34,0x12,0xaf]
 
-s_lshl_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x8e,0x73,0x72,0x71,0x3f]
+s_lshl_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x8e,0x73,0x72,0x71,0x3f]
 
-s_lshl_b64 s[0:1], s[0:1], s0
-// CHECK: [0x00,0x00,0x80,0x8e]
+s_lshl_b64 s[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0x8a,0x8e]
 
-s_lshl_b64 s[2:3], s[0:1], s0
-// CHECK: [0x00,0x00,0x82,0x8e]
+s_lshl_b64 s[12:13], s[2:3], s2
+// CHECK: [0x02,0x02,0x8c,0x8e]
 
-s_lshl_b64 s[100:101], s[0:1], s0
-// CHECK: [0x00,0x00,0xe4,0x8e]
+s_lshl_b64 s[100:101], s[2:3], s2
+// CHECK: [0x02,0x02,0xe4,0x8e]
 
-s_lshl_b64 flat_scratch, s[0:1], s0
-// CHECK: [0x00,0x00,0xe6,0x8e]
+s_lshl_b64 flat_scratch, s[2:3], s2
+// CHECK: [0x02,0x02,0xe6,0x8e]
 
-s_lshl_b64 vcc, s[0:1], s0
-// CHECK: [0x00,0x00,0xea,0x8e]
+s_lshl_b64 vcc, s[2:3], s2
+// CHECK: [0x02,0x02,0xea,0x8e]
 
-s_lshl_b64 tba, s[0:1], s0
-// CHECK: [0x00,0x00,0xec,0x8e]
+s_lshl_b64 tba, s[2:3], s2
+// CHECK: [0x02,0x02,0xec,0x8e]
 
-s_lshl_b64 tma, s[0:1], s0
-// CHECK: [0x00,0x00,0xee,0x8e]
+s_lshl_b64 tma, s[2:3], s2
+// CHECK: [0x02,0x02,0xee,0x8e]
 
-s_lshl_b64 ttmp[10:11], s[0:1], s0
-// CHECK: [0x00,0x00,0xfa,0x8e]
+s_lshl_b64 ttmp[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0xfa,0x8e]
 
-s_lshl_b64 exec, s[0:1], s0
-// CHECK: [0x00,0x00,0xfe,0x8e]
+s_lshl_b64 exec, s[2:3], s2
+// CHECK: [0x02,0x02,0xfe,0x8e]
 
-s_lshl_b64 s[0:1], s[2:3], s0
-// CHECK: [0x02,0x00,0x80,0x8e]
+s_lshl_b64 s[10:11], s[4:5], s2
+// CHECK: [0x04,0x02,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], s[100:101], s0
-// CHECK: [0x64,0x00,0x80,0x8e]
+s_lshl_b64 s[10:11], s[100:101], s2
+// CHECK: [0x64,0x02,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], flat_scratch, s0
-// CHECK: [0x66,0x00,0x80,0x8e]
+s_lshl_b64 s[10:11], flat_scratch, s2
+// CHECK: [0x66,0x02,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], vcc, s0
-// CHECK: [0x6a,0x00,0x80,0x8e]
+s_lshl_b64 s[10:11], vcc, s2
+// CHECK: [0x6a,0x02,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], tba, s0
-// CHECK: [0x6c,0x00,0x80,0x8e]
+s_lshl_b64 s[10:11], tba, s2
+// CHECK: [0x6c,0x02,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], tma, s0
-// CHECK: [0x6e,0x00,0x80,0x8e]
+s_lshl_b64 s[10:11], tma, s2
+// CHECK: [0x6e,0x02,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], ttmp[10:11], s0
-// CHECK: [0x7a,0x00,0x80,0x8e]
+s_lshl_b64 s[10:11], ttmp[10:11], s2
+// CHECK: [0x7a,0x02,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], exec, s0
-// CHECK: [0x7e,0x00,0x80,0x8e]
+s_lshl_b64 s[10:11], exec, s2
+// CHECK: [0x7e,0x02,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], 0, s0
-// CHECK: [0x80,0x00,0x80,0x8e]
+s_lshl_b64 s[10:11], 0, s2
+// CHECK: [0x80,0x02,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], -1, s0
-// CHECK: [0xc1,0x00,0x80,0x8e]
+s_lshl_b64 s[10:11], -1, s2
+// CHECK: [0xc1,0x02,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x8e]
+s_lshl_b64 s[10:11], 0.5, s2
+// CHECK: [0xf0,0x02,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x8e]
+s_lshl_b64 s[10:11], -4.0, s2
+// CHECK: [0xf7,0x02,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x8e,0x56,0x34,0x12,0xaf]
+s_lshl_b64 s[10:11], 0xaf123456, s2
+// CHECK: [0xff,0x02,0x8a,0x8e,0x56,0x34,0x12,0xaf]
 
-s_lshl_b64 s[0:1], 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x8e,0x73,0x72,0x71,0x3f]
+s_lshl_b64 s[10:11], 0x3f717273, s2
+// CHECK: [0xff,0x02,0x8a,0x8e,0x73,0x72,0x71,0x3f]
 
-s_lshl_b64 s[0:1], s[0:1], s101
-// CHECK: [0x00,0x65,0x80,0x8e]
+s_lshl_b64 s[10:11], s[2:3], s101
+// CHECK: [0x02,0x65,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], s[0:1], flat_scratch_lo
-// CHECK: [0x00,0x66,0x80,0x8e]
+s_lshl_b64 s[10:11], s[2:3], flat_scratch_lo
+// CHECK: [0x02,0x66,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], s[0:1], flat_scratch_hi
-// CHECK: [0x00,0x67,0x80,0x8e]
+s_lshl_b64 s[10:11], s[2:3], flat_scratch_hi
+// CHECK: [0x02,0x67,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], s[0:1], vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x8e]
+s_lshl_b64 s[10:11], s[2:3], vcc_lo
+// CHECK: [0x02,0x6a,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], s[0:1], vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x8e]
+s_lshl_b64 s[10:11], s[2:3], vcc_hi
+// CHECK: [0x02,0x6b,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], s[0:1], tba_lo
-// CHECK: [0x00,0x6c,0x80,0x8e]
+s_lshl_b64 s[10:11], s[2:3], tba_lo
+// CHECK: [0x02,0x6c,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], s[0:1], tba_hi
-// CHECK: [0x00,0x6d,0x80,0x8e]
+s_lshl_b64 s[10:11], s[2:3], tba_hi
+// CHECK: [0x02,0x6d,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], s[0:1], tma_lo
-// CHECK: [0x00,0x6e,0x80,0x8e]
+s_lshl_b64 s[10:11], s[2:3], tma_lo
+// CHECK: [0x02,0x6e,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], s[0:1], tma_hi
-// CHECK: [0x00,0x6f,0x80,0x8e]
+s_lshl_b64 s[10:11], s[2:3], tma_hi
+// CHECK: [0x02,0x6f,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], s[0:1], ttmp11
-// CHECK: [0x00,0x7b,0x80,0x8e]
+s_lshl_b64 s[10:11], s[2:3], ttmp11
+// CHECK: [0x02,0x7b,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], s[0:1], m0
-// CHECK: [0x00,0x7c,0x80,0x8e]
+s_lshl_b64 s[10:11], s[2:3], m0
+// CHECK: [0x02,0x7c,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], s[0:1], exec_lo
-// CHECK: [0x00,0x7e,0x80,0x8e]
+s_lshl_b64 s[10:11], s[2:3], exec_lo
+// CHECK: [0x02,0x7e,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], s[0:1], exec_hi
-// CHECK: [0x00,0x7f,0x80,0x8e]
+s_lshl_b64 s[10:11], s[2:3], exec_hi
+// CHECK: [0x02,0x7f,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x8e]
+s_lshl_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x8e]
+s_lshl_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x8e]
+s_lshl_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x8e]
+s_lshl_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x8e]
 
-s_lshl_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x8e,0x56,0x34,0x12,0xaf]
+s_lshl_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x8e,0x56,0x34,0x12,0xaf]
 
-s_lshl_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x8e,0x73,0x72,0x71,0x3f]
+s_lshl_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x8e,0x73,0x72,0x71,0x3f]
 
-s_lshr_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x8f]
+s_lshr_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x8f]
 
-s_lshr_b32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x8f]
+s_lshr_b32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x8f]
 
-s_lshr_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x8f]
+s_lshr_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x8f]
 
-s_lshr_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x8f]
+s_lshr_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x8f]
 
-s_lshr_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x8f]
+s_lshr_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x8f]
 
-s_lshr_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x8f]
+s_lshr_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x8f]
 
-s_lshr_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x8f]
+s_lshr_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x8f]
 
-s_lshr_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x8f]
+s_lshr_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x8f]
 
-s_lshr_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x8f]
+s_lshr_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x8f]
 
-s_lshr_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x8f]
+s_lshr_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x8f]
 
-s_lshr_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x8f]
+s_lshr_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x8f]
 
-s_lshr_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x8f]
+s_lshr_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x8f]
 
-s_lshr_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x8f]
+s_lshr_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x8f]
 
-s_lshr_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x8f]
+s_lshr_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x8f]
 
-s_lshr_b32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x8f]
+s_lshr_b32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x8f]
 
-s_lshr_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x8f]
+s_lshr_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x8f]
 
-s_lshr_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x8f]
+s_lshr_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x8f]
 
-s_lshr_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x8f]
+s_lshr_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x8f]
 
-s_lshr_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x8f]
+s_lshr_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x8f]
 
-s_lshr_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x8f]
+s_lshr_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x8f]
 
-s_lshr_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x8f]
+s_lshr_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x8f]
 
-s_lshr_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x8f]
+s_lshr_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x8f]
 
-s_lshr_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x8f]
+s_lshr_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x8f]
 
-s_lshr_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x8f]
+s_lshr_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x8f]
 
-s_lshr_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x8f]
+s_lshr_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x8f]
 
-s_lshr_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x8f]
+s_lshr_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x8f]
 
-s_lshr_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x8f]
+s_lshr_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x8f]
 
-s_lshr_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x8f]
+s_lshr_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x8f]
 
-s_lshr_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x8f]
+s_lshr_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x8f]
 
-s_lshr_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x8f]
+s_lshr_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x8f]
 
-s_lshr_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x8f]
+s_lshr_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x8f]
 
-s_lshr_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x8f,0x56,0x34,0x12,0xaf]
+s_lshr_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x8f,0x56,0x34,0x12,0xaf]
 
-s_lshr_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x8f,0x73,0x72,0x71,0x3f]
+s_lshr_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x8f,0x73,0x72,0x71,0x3f]
 
-s_lshr_b32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x8f]
+s_lshr_b32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x8f]
 
-s_lshr_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x8f]
+s_lshr_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x8f]
 
-s_lshr_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x8f]
+s_lshr_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x8f]
 
-s_lshr_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x8f]
+s_lshr_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x8f]
 
-s_lshr_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x8f]
+s_lshr_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x8f]
 
-s_lshr_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x8f]
+s_lshr_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x8f]
 
-s_lshr_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x8f]
+s_lshr_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x8f]
 
-s_lshr_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x8f]
+s_lshr_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x8f]
 
-s_lshr_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x8f]
+s_lshr_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x8f]
 
-s_lshr_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x8f]
+s_lshr_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x8f]
 
-s_lshr_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x8f]
+s_lshr_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x8f]
 
-s_lshr_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x8f]
+s_lshr_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x8f]
 
-s_lshr_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x8f]
+s_lshr_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x8f]
 
-s_lshr_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x8f]
+s_lshr_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x8f]
 
-s_lshr_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x8f]
+s_lshr_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x8f]
 
-s_lshr_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x8f]
+s_lshr_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x8f]
 
-s_lshr_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x8f]
+s_lshr_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x8f]
 
-s_lshr_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x8f,0x56,0x34,0x12,0xaf]
+s_lshr_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x8f,0x56,0x34,0x12,0xaf]
 
-s_lshr_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x8f,0x73,0x72,0x71,0x3f]
+s_lshr_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x8f,0x73,0x72,0x71,0x3f]
 
-s_lshr_b64 s[0:1], s[0:1], s0
-// CHECK: [0x00,0x00,0x80,0x8f]
+s_lshr_b64 s[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0x8a,0x8f]
 
-s_lshr_b64 s[2:3], s[0:1], s0
-// CHECK: [0x00,0x00,0x82,0x8f]
+s_lshr_b64 s[12:13], s[2:3], s2
+// CHECK: [0x02,0x02,0x8c,0x8f]
 
-s_lshr_b64 s[100:101], s[0:1], s0
-// CHECK: [0x00,0x00,0xe4,0x8f]
+s_lshr_b64 s[100:101], s[2:3], s2
+// CHECK: [0x02,0x02,0xe4,0x8f]
 
-s_lshr_b64 flat_scratch, s[0:1], s0
-// CHECK: [0x00,0x00,0xe6,0x8f]
+s_lshr_b64 flat_scratch, s[2:3], s2
+// CHECK: [0x02,0x02,0xe6,0x8f]
 
-s_lshr_b64 vcc, s[0:1], s0
-// CHECK: [0x00,0x00,0xea,0x8f]
+s_lshr_b64 vcc, s[2:3], s2
+// CHECK: [0x02,0x02,0xea,0x8f]
 
-s_lshr_b64 tba, s[0:1], s0
-// CHECK: [0x00,0x00,0xec,0x8f]
+s_lshr_b64 tba, s[2:3], s2
+// CHECK: [0x02,0x02,0xec,0x8f]
 
-s_lshr_b64 tma, s[0:1], s0
-// CHECK: [0x00,0x00,0xee,0x8f]
+s_lshr_b64 tma, s[2:3], s2
+// CHECK: [0x02,0x02,0xee,0x8f]
 
-s_lshr_b64 ttmp[10:11], s[0:1], s0
-// CHECK: [0x00,0x00,0xfa,0x8f]
+s_lshr_b64 ttmp[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0xfa,0x8f]
 
-s_lshr_b64 exec, s[0:1], s0
-// CHECK: [0x00,0x00,0xfe,0x8f]
+s_lshr_b64 exec, s[2:3], s2
+// CHECK: [0x02,0x02,0xfe,0x8f]
 
-s_lshr_b64 s[0:1], s[2:3], s0
-// CHECK: [0x02,0x00,0x80,0x8f]
+s_lshr_b64 s[10:11], s[4:5], s2
+// CHECK: [0x04,0x02,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], s[100:101], s0
-// CHECK: [0x64,0x00,0x80,0x8f]
+s_lshr_b64 s[10:11], s[100:101], s2
+// CHECK: [0x64,0x02,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], flat_scratch, s0
-// CHECK: [0x66,0x00,0x80,0x8f]
+s_lshr_b64 s[10:11], flat_scratch, s2
+// CHECK: [0x66,0x02,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], vcc, s0
-// CHECK: [0x6a,0x00,0x80,0x8f]
+s_lshr_b64 s[10:11], vcc, s2
+// CHECK: [0x6a,0x02,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], tba, s0
-// CHECK: [0x6c,0x00,0x80,0x8f]
+s_lshr_b64 s[10:11], tba, s2
+// CHECK: [0x6c,0x02,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], tma, s0
-// CHECK: [0x6e,0x00,0x80,0x8f]
+s_lshr_b64 s[10:11], tma, s2
+// CHECK: [0x6e,0x02,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], ttmp[10:11], s0
-// CHECK: [0x7a,0x00,0x80,0x8f]
+s_lshr_b64 s[10:11], ttmp[10:11], s2
+// CHECK: [0x7a,0x02,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], exec, s0
-// CHECK: [0x7e,0x00,0x80,0x8f]
+s_lshr_b64 s[10:11], exec, s2
+// CHECK: [0x7e,0x02,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], 0, s0
-// CHECK: [0x80,0x00,0x80,0x8f]
+s_lshr_b64 s[10:11], 0, s2
+// CHECK: [0x80,0x02,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], -1, s0
-// CHECK: [0xc1,0x00,0x80,0x8f]
+s_lshr_b64 s[10:11], -1, s2
+// CHECK: [0xc1,0x02,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x8f]
+s_lshr_b64 s[10:11], 0.5, s2
+// CHECK: [0xf0,0x02,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x8f]
+s_lshr_b64 s[10:11], -4.0, s2
+// CHECK: [0xf7,0x02,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x8f,0x56,0x34,0x12,0xaf]
+s_lshr_b64 s[10:11], 0xaf123456, s2
+// CHECK: [0xff,0x02,0x8a,0x8f,0x56,0x34,0x12,0xaf]
 
-s_lshr_b64 s[0:1], 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x8f,0x73,0x72,0x71,0x3f]
+s_lshr_b64 s[10:11], 0x3f717273, s2
+// CHECK: [0xff,0x02,0x8a,0x8f,0x73,0x72,0x71,0x3f]
 
-s_lshr_b64 s[0:1], s[0:1], s101
-// CHECK: [0x00,0x65,0x80,0x8f]
+s_lshr_b64 s[10:11], s[2:3], s101
+// CHECK: [0x02,0x65,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], s[0:1], flat_scratch_lo
-// CHECK: [0x00,0x66,0x80,0x8f]
+s_lshr_b64 s[10:11], s[2:3], flat_scratch_lo
+// CHECK: [0x02,0x66,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], s[0:1], flat_scratch_hi
-// CHECK: [0x00,0x67,0x80,0x8f]
+s_lshr_b64 s[10:11], s[2:3], flat_scratch_hi
+// CHECK: [0x02,0x67,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], s[0:1], vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x8f]
+s_lshr_b64 s[10:11], s[2:3], vcc_lo
+// CHECK: [0x02,0x6a,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], s[0:1], vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x8f]
+s_lshr_b64 s[10:11], s[2:3], vcc_hi
+// CHECK: [0x02,0x6b,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], s[0:1], tba_lo
-// CHECK: [0x00,0x6c,0x80,0x8f]
+s_lshr_b64 s[10:11], s[2:3], tba_lo
+// CHECK: [0x02,0x6c,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], s[0:1], tba_hi
-// CHECK: [0x00,0x6d,0x80,0x8f]
+s_lshr_b64 s[10:11], s[2:3], tba_hi
+// CHECK: [0x02,0x6d,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], s[0:1], tma_lo
-// CHECK: [0x00,0x6e,0x80,0x8f]
+s_lshr_b64 s[10:11], s[2:3], tma_lo
+// CHECK: [0x02,0x6e,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], s[0:1], tma_hi
-// CHECK: [0x00,0x6f,0x80,0x8f]
+s_lshr_b64 s[10:11], s[2:3], tma_hi
+// CHECK: [0x02,0x6f,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], s[0:1], ttmp11
-// CHECK: [0x00,0x7b,0x80,0x8f]
+s_lshr_b64 s[10:11], s[2:3], ttmp11
+// CHECK: [0x02,0x7b,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], s[0:1], m0
-// CHECK: [0x00,0x7c,0x80,0x8f]
+s_lshr_b64 s[10:11], s[2:3], m0
+// CHECK: [0x02,0x7c,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], s[0:1], exec_lo
-// CHECK: [0x00,0x7e,0x80,0x8f]
+s_lshr_b64 s[10:11], s[2:3], exec_lo
+// CHECK: [0x02,0x7e,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], s[0:1], exec_hi
-// CHECK: [0x00,0x7f,0x80,0x8f]
+s_lshr_b64 s[10:11], s[2:3], exec_hi
+// CHECK: [0x02,0x7f,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x8f]
+s_lshr_b64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x8f]
+s_lshr_b64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x8f]
+s_lshr_b64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x8f]
+s_lshr_b64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x8f]
 
-s_lshr_b64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x8f,0x56,0x34,0x12,0xaf]
+s_lshr_b64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x8f,0x56,0x34,0x12,0xaf]
 
-s_lshr_b64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x8f,0x73,0x72,0x71,0x3f]
+s_lshr_b64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x8f,0x73,0x72,0x71,0x3f]
 
-s_ashr_i32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x90]
+s_ashr_i32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x90]
 
-s_ashr_i32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x90]
+s_ashr_i32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x90]
 
-s_ashr_i32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x90]
+s_ashr_i32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x90]
 
-s_ashr_i32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x90]
+s_ashr_i32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x90]
 
-s_ashr_i32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x90]
+s_ashr_i32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x90]
 
-s_ashr_i32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x90]
+s_ashr_i32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x90]
 
-s_ashr_i32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x90]
+s_ashr_i32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x90]
 
-s_ashr_i32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x90]
+s_ashr_i32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x90]
 
-s_ashr_i32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x90]
+s_ashr_i32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x90]
 
-s_ashr_i32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x90]
+s_ashr_i32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x90]
 
-s_ashr_i32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x90]
+s_ashr_i32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x90]
 
-s_ashr_i32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x90]
+s_ashr_i32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x90]
 
-s_ashr_i32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x90]
+s_ashr_i32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x90]
 
-s_ashr_i32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x90]
+s_ashr_i32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x90]
 
-s_ashr_i32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x90]
+s_ashr_i32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x90]
 
-s_ashr_i32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x90]
+s_ashr_i32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x90]
 
-s_ashr_i32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x90]
+s_ashr_i32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x90]
 
-s_ashr_i32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x90]
+s_ashr_i32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x90]
 
-s_ashr_i32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x90]
+s_ashr_i32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x90]
 
-s_ashr_i32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x90]
+s_ashr_i32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x90]
 
-s_ashr_i32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x90]
+s_ashr_i32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x90]
 
-s_ashr_i32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x90]
+s_ashr_i32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x90]
 
-s_ashr_i32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x90]
+s_ashr_i32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x90]
 
-s_ashr_i32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x90]
+s_ashr_i32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x90]
 
-s_ashr_i32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x90]
+s_ashr_i32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x90]
 
-s_ashr_i32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x90]
+s_ashr_i32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x90]
 
-s_ashr_i32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x90]
+s_ashr_i32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x90]
 
-s_ashr_i32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x90]
+s_ashr_i32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x90]
 
-s_ashr_i32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x90]
+s_ashr_i32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x90]
 
-s_ashr_i32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x90]
+s_ashr_i32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x90]
 
-s_ashr_i32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x90]
+s_ashr_i32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x90]
 
-s_ashr_i32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x90,0x56,0x34,0x12,0xaf]
+s_ashr_i32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x90,0x56,0x34,0x12,0xaf]
 
-s_ashr_i32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x90,0x73,0x72,0x71,0x3f]
+s_ashr_i32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x90,0x73,0x72,0x71,0x3f]
 
-s_ashr_i32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x90]
+s_ashr_i32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x90]
 
-s_ashr_i32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x90]
+s_ashr_i32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x90]
 
-s_ashr_i32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x90]
+s_ashr_i32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x90]
 
-s_ashr_i32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x90]
+s_ashr_i32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x90]
 
-s_ashr_i32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x90]
+s_ashr_i32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x90]
 
-s_ashr_i32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x90]
+s_ashr_i32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x90]
 
-s_ashr_i32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x90]
+s_ashr_i32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x90]
 
-s_ashr_i32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x90]
+s_ashr_i32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x90]
 
-s_ashr_i32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x90]
+s_ashr_i32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x90]
 
-s_ashr_i32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x90]
+s_ashr_i32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x90]
 
-s_ashr_i32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x90]
+s_ashr_i32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x90]
 
-s_ashr_i32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x90]
+s_ashr_i32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x90]
 
-s_ashr_i32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x90]
+s_ashr_i32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x90]
 
-s_ashr_i32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x90]
+s_ashr_i32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x90]
 
-s_ashr_i32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x90]
+s_ashr_i32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x90]
 
-s_ashr_i32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x90]
+s_ashr_i32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x90]
 
-s_ashr_i32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x90]
+s_ashr_i32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x90]
 
-s_ashr_i32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x90,0x56,0x34,0x12,0xaf]
+s_ashr_i32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x90,0x56,0x34,0x12,0xaf]
 
-s_ashr_i32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x90,0x73,0x72,0x71,0x3f]
+s_ashr_i32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x90,0x73,0x72,0x71,0x3f]
 
-s_ashr_i64 s[0:1], s[0:1], s0
-// CHECK: [0x00,0x00,0x80,0x90]
+s_ashr_i64 s[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0x8a,0x90]
 
-s_ashr_i64 s[2:3], s[0:1], s0
-// CHECK: [0x00,0x00,0x82,0x90]
+s_ashr_i64 s[12:13], s[2:3], s2
+// CHECK: [0x02,0x02,0x8c,0x90]
 
-s_ashr_i64 s[100:101], s[0:1], s0
-// CHECK: [0x00,0x00,0xe4,0x90]
+s_ashr_i64 s[100:101], s[2:3], s2
+// CHECK: [0x02,0x02,0xe4,0x90]
 
-s_ashr_i64 flat_scratch, s[0:1], s0
-// CHECK: [0x00,0x00,0xe6,0x90]
+s_ashr_i64 flat_scratch, s[2:3], s2
+// CHECK: [0x02,0x02,0xe6,0x90]
 
-s_ashr_i64 vcc, s[0:1], s0
-// CHECK: [0x00,0x00,0xea,0x90]
+s_ashr_i64 vcc, s[2:3], s2
+// CHECK: [0x02,0x02,0xea,0x90]
 
-s_ashr_i64 tba, s[0:1], s0
-// CHECK: [0x00,0x00,0xec,0x90]
+s_ashr_i64 tba, s[2:3], s2
+// CHECK: [0x02,0x02,0xec,0x90]
 
-s_ashr_i64 tma, s[0:1], s0
-// CHECK: [0x00,0x00,0xee,0x90]
+s_ashr_i64 tma, s[2:3], s2
+// CHECK: [0x02,0x02,0xee,0x90]
 
-s_ashr_i64 ttmp[10:11], s[0:1], s0
-// CHECK: [0x00,0x00,0xfa,0x90]
+s_ashr_i64 ttmp[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0xfa,0x90]
 
-s_ashr_i64 exec, s[0:1], s0
-// CHECK: [0x00,0x00,0xfe,0x90]
+s_ashr_i64 exec, s[2:3], s2
+// CHECK: [0x02,0x02,0xfe,0x90]
 
-s_ashr_i64 s[0:1], s[2:3], s0
-// CHECK: [0x02,0x00,0x80,0x90]
+s_ashr_i64 s[10:11], s[4:5], s2
+// CHECK: [0x04,0x02,0x8a,0x90]
 
-s_ashr_i64 s[0:1], s[100:101], s0
-// CHECK: [0x64,0x00,0x80,0x90]
+s_ashr_i64 s[10:11], s[100:101], s2
+// CHECK: [0x64,0x02,0x8a,0x90]
 
-s_ashr_i64 s[0:1], flat_scratch, s0
-// CHECK: [0x66,0x00,0x80,0x90]
+s_ashr_i64 s[10:11], flat_scratch, s2
+// CHECK: [0x66,0x02,0x8a,0x90]
 
-s_ashr_i64 s[0:1], vcc, s0
-// CHECK: [0x6a,0x00,0x80,0x90]
+s_ashr_i64 s[10:11], vcc, s2
+// CHECK: [0x6a,0x02,0x8a,0x90]
 
-s_ashr_i64 s[0:1], tba, s0
-// CHECK: [0x6c,0x00,0x80,0x90]
+s_ashr_i64 s[10:11], tba, s2
+// CHECK: [0x6c,0x02,0x8a,0x90]
 
-s_ashr_i64 s[0:1], tma, s0
-// CHECK: [0x6e,0x00,0x80,0x90]
+s_ashr_i64 s[10:11], tma, s2
+// CHECK: [0x6e,0x02,0x8a,0x90]
 
-s_ashr_i64 s[0:1], ttmp[10:11], s0
-// CHECK: [0x7a,0x00,0x80,0x90]
+s_ashr_i64 s[10:11], ttmp[10:11], s2
+// CHECK: [0x7a,0x02,0x8a,0x90]
 
-s_ashr_i64 s[0:1], exec, s0
-// CHECK: [0x7e,0x00,0x80,0x90]
+s_ashr_i64 s[10:11], exec, s2
+// CHECK: [0x7e,0x02,0x8a,0x90]
 
-s_ashr_i64 s[0:1], 0, s0
-// CHECK: [0x80,0x00,0x80,0x90]
+s_ashr_i64 s[10:11], 0, s2
+// CHECK: [0x80,0x02,0x8a,0x90]
 
-s_ashr_i64 s[0:1], -1, s0
-// CHECK: [0xc1,0x00,0x80,0x90]
+s_ashr_i64 s[10:11], -1, s2
+// CHECK: [0xc1,0x02,0x8a,0x90]
 
-s_ashr_i64 s[0:1], 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x90]
+s_ashr_i64 s[10:11], 0.5, s2
+// CHECK: [0xf0,0x02,0x8a,0x90]
 
-s_ashr_i64 s[0:1], -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x90]
+s_ashr_i64 s[10:11], -4.0, s2
+// CHECK: [0xf7,0x02,0x8a,0x90]
 
-s_ashr_i64 s[0:1], 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x90,0x56,0x34,0x12,0xaf]
+s_ashr_i64 s[10:11], 0xaf123456, s2
+// CHECK: [0xff,0x02,0x8a,0x90,0x56,0x34,0x12,0xaf]
 
-s_ashr_i64 s[0:1], 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x90,0x73,0x72,0x71,0x3f]
+s_ashr_i64 s[10:11], 0x3f717273, s2
+// CHECK: [0xff,0x02,0x8a,0x90,0x73,0x72,0x71,0x3f]
 
-s_ashr_i64 s[0:1], s[0:1], s101
-// CHECK: [0x00,0x65,0x80,0x90]
+s_ashr_i64 s[10:11], s[2:3], s101
+// CHECK: [0x02,0x65,0x8a,0x90]
 
-s_ashr_i64 s[0:1], s[0:1], flat_scratch_lo
-// CHECK: [0x00,0x66,0x80,0x90]
+s_ashr_i64 s[10:11], s[2:3], flat_scratch_lo
+// CHECK: [0x02,0x66,0x8a,0x90]
 
-s_ashr_i64 s[0:1], s[0:1], flat_scratch_hi
-// CHECK: [0x00,0x67,0x80,0x90]
+s_ashr_i64 s[10:11], s[2:3], flat_scratch_hi
+// CHECK: [0x02,0x67,0x8a,0x90]
 
-s_ashr_i64 s[0:1], s[0:1], vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x90]
+s_ashr_i64 s[10:11], s[2:3], vcc_lo
+// CHECK: [0x02,0x6a,0x8a,0x90]
 
-s_ashr_i64 s[0:1], s[0:1], vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x90]
+s_ashr_i64 s[10:11], s[2:3], vcc_hi
+// CHECK: [0x02,0x6b,0x8a,0x90]
 
-s_ashr_i64 s[0:1], s[0:1], tba_lo
-// CHECK: [0x00,0x6c,0x80,0x90]
+s_ashr_i64 s[10:11], s[2:3], tba_lo
+// CHECK: [0x02,0x6c,0x8a,0x90]
 
-s_ashr_i64 s[0:1], s[0:1], tba_hi
-// CHECK: [0x00,0x6d,0x80,0x90]
+s_ashr_i64 s[10:11], s[2:3], tba_hi
+// CHECK: [0x02,0x6d,0x8a,0x90]
 
-s_ashr_i64 s[0:1], s[0:1], tma_lo
-// CHECK: [0x00,0x6e,0x80,0x90]
+s_ashr_i64 s[10:11], s[2:3], tma_lo
+// CHECK: [0x02,0x6e,0x8a,0x90]
 
-s_ashr_i64 s[0:1], s[0:1], tma_hi
-// CHECK: [0x00,0x6f,0x80,0x90]
+s_ashr_i64 s[10:11], s[2:3], tma_hi
+// CHECK: [0x02,0x6f,0x8a,0x90]
 
-s_ashr_i64 s[0:1], s[0:1], ttmp11
-// CHECK: [0x00,0x7b,0x80,0x90]
+s_ashr_i64 s[10:11], s[2:3], ttmp11
+// CHECK: [0x02,0x7b,0x8a,0x90]
 
-s_ashr_i64 s[0:1], s[0:1], m0
-// CHECK: [0x00,0x7c,0x80,0x90]
+s_ashr_i64 s[10:11], s[2:3], m0
+// CHECK: [0x02,0x7c,0x8a,0x90]
 
-s_ashr_i64 s[0:1], s[0:1], exec_lo
-// CHECK: [0x00,0x7e,0x80,0x90]
+s_ashr_i64 s[10:11], s[2:3], exec_lo
+// CHECK: [0x02,0x7e,0x8a,0x90]
 
-s_ashr_i64 s[0:1], s[0:1], exec_hi
-// CHECK: [0x00,0x7f,0x80,0x90]
+s_ashr_i64 s[10:11], s[2:3], exec_hi
+// CHECK: [0x02,0x7f,0x8a,0x90]
 
-s_ashr_i64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x90]
+s_ashr_i64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x90]
 
-s_ashr_i64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x90]
+s_ashr_i64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x90]
 
-s_ashr_i64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x90]
+s_ashr_i64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x90]
 
-s_ashr_i64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x90]
+s_ashr_i64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x90]
 
-s_ashr_i64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x90,0x56,0x34,0x12,0xaf]
+s_ashr_i64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x90,0x56,0x34,0x12,0xaf]
 
-s_ashr_i64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x90,0x73,0x72,0x71,0x3f]
+s_ashr_i64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x90,0x73,0x72,0x71,0x3f]
 
-s_bfm_b32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x91]
+s_bfm_b32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x91]
 
-s_bfm_b32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x91]
+s_bfm_b32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x91]
 
-s_bfm_b32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x91]
+s_bfm_b32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x91]
 
-s_bfm_b32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x91]
+s_bfm_b32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x91]
 
-s_bfm_b32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x91]
+s_bfm_b32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x91]
 
-s_bfm_b32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x91]
+s_bfm_b32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x91]
 
-s_bfm_b32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x91]
+s_bfm_b32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x91]
 
-s_bfm_b32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x91]
+s_bfm_b32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x91]
 
-s_bfm_b32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x91]
+s_bfm_b32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x91]
 
-s_bfm_b32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x91]
+s_bfm_b32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x91]
 
-s_bfm_b32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x91]
+s_bfm_b32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x91]
 
-s_bfm_b32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x91]
+s_bfm_b32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x91]
 
-s_bfm_b32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x91]
+s_bfm_b32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x91]
 
-s_bfm_b32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x91]
+s_bfm_b32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x91]
 
-s_bfm_b32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x91]
+s_bfm_b32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x91]
 
-s_bfm_b32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x91]
+s_bfm_b32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x91]
 
-s_bfm_b32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x91]
+s_bfm_b32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x91]
 
-s_bfm_b32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x91]
+s_bfm_b32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x91]
 
-s_bfm_b32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x91]
+s_bfm_b32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x91]
 
-s_bfm_b32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x91]
+s_bfm_b32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x91]
 
-s_bfm_b32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x91]
+s_bfm_b32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x91]
 
-s_bfm_b32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x91]
+s_bfm_b32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x91]
 
-s_bfm_b32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x91]
+s_bfm_b32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x91]
 
-s_bfm_b32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x91]
+s_bfm_b32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x91]
 
-s_bfm_b32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x91]
+s_bfm_b32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x91]
 
-s_bfm_b32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x91]
+s_bfm_b32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x91]
 
-s_bfm_b32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x91]
+s_bfm_b32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x91]
 
-s_bfm_b32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x91]
+s_bfm_b32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x91]
 
-s_bfm_b32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x91]
+s_bfm_b32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x91]
 
-s_bfm_b32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x91]
+s_bfm_b32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x91]
 
-s_bfm_b32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x91]
+s_bfm_b32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x91]
 
-s_bfm_b32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x91,0x56,0x34,0x12,0xaf]
+s_bfm_b32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x91,0x56,0x34,0x12,0xaf]
 
-s_bfm_b32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x91,0x73,0x72,0x71,0x3f]
+s_bfm_b32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x91,0x73,0x72,0x71,0x3f]
 
-s_bfm_b32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x91]
+s_bfm_b32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x91]
 
-s_bfm_b32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x91]
+s_bfm_b32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x91]
 
-s_bfm_b32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x91]
+s_bfm_b32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x91]
 
-s_bfm_b32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x91]
+s_bfm_b32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x91]
 
-s_bfm_b32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x91]
+s_bfm_b32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x91]
 
-s_bfm_b32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x91]
+s_bfm_b32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x91]
 
-s_bfm_b32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x91]
+s_bfm_b32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x91]
 
-s_bfm_b32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x91]
+s_bfm_b32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x91]
 
-s_bfm_b32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x91]
+s_bfm_b32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x91]
 
-s_bfm_b32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x91]
+s_bfm_b32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x91]
 
-s_bfm_b32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x91]
+s_bfm_b32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x91]
 
-s_bfm_b32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x91]
+s_bfm_b32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x91]
 
-s_bfm_b32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x91]
+s_bfm_b32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x91]
 
-s_bfm_b32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x91]
+s_bfm_b32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x91]
 
-s_bfm_b32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x91]
+s_bfm_b32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x91]
 
-s_bfm_b32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x91]
+s_bfm_b32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x91]
 
-s_bfm_b32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x91]
+s_bfm_b32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x91]
 
-s_bfm_b32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x91,0x56,0x34,0x12,0xaf]
+s_bfm_b32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x91,0x56,0x34,0x12,0xaf]
 
-s_bfm_b32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x91,0x73,0x72,0x71,0x3f]
+s_bfm_b32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x91,0x73,0x72,0x71,0x3f]
 
-s_bfm_b64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x80,0x91]
+s_bfm_b64 s[10:11], s1, s2
+// CHECK: [0x01,0x02,0x8a,0x91]
 
-s_bfm_b64 s[2:3], s0, s0
-// CHECK: [0x00,0x00,0x82,0x91]
+s_bfm_b64 s[12:13], s1, s2
+// CHECK: [0x01,0x02,0x8c,0x91]
 
-s_bfm_b64 s[100:101], s0, s0
-// CHECK: [0x00,0x00,0xe4,0x91]
+s_bfm_b64 s[100:101], s1, s2
+// CHECK: [0x01,0x02,0xe4,0x91]
 
-s_bfm_b64 flat_scratch, s0, s0
-// CHECK: [0x00,0x00,0xe6,0x91]
+s_bfm_b64 flat_scratch, s1, s2
+// CHECK: [0x01,0x02,0xe6,0x91]
 
-s_bfm_b64 vcc, s0, s0
-// CHECK: [0x00,0x00,0xea,0x91]
+s_bfm_b64 vcc, s1, s2
+// CHECK: [0x01,0x02,0xea,0x91]
 
-s_bfm_b64 tba, s0, s0
-// CHECK: [0x00,0x00,0xec,0x91]
+s_bfm_b64 tba, s1, s2
+// CHECK: [0x01,0x02,0xec,0x91]
 
-s_bfm_b64 tma, s0, s0
-// CHECK: [0x00,0x00,0xee,0x91]
+s_bfm_b64 tma, s1, s2
+// CHECK: [0x01,0x02,0xee,0x91]
 
-s_bfm_b64 ttmp[10:11], s0, s0
-// CHECK: [0x00,0x00,0xfa,0x91]
+s_bfm_b64 ttmp[10:11], s1, s2
+// CHECK: [0x01,0x02,0xfa,0x91]
 
-s_bfm_b64 exec, s0, s0
-// CHECK: [0x00,0x00,0xfe,0x91]
+s_bfm_b64 exec, s1, s2
+// CHECK: [0x01,0x02,0xfe,0x91]
 
-s_bfm_b64 s[0:1], s101, s0
-// CHECK: [0x65,0x00,0x80,0x91]
+s_bfm_b64 s[10:11], s101, s2
+// CHECK: [0x65,0x02,0x8a,0x91]
 
-s_bfm_b64 s[0:1], flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x80,0x91]
+s_bfm_b64 s[10:11], flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x8a,0x91]
 
-s_bfm_b64 s[0:1], flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x80,0x91]
+s_bfm_b64 s[10:11], flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x8a,0x91]
 
-s_bfm_b64 s[0:1], vcc_lo, s0
-// CHECK: [0x6a,0x00,0x80,0x91]
+s_bfm_b64 s[10:11], vcc_lo, s2
+// CHECK: [0x6a,0x02,0x8a,0x91]
 
-s_bfm_b64 s[0:1], vcc_hi, s0
-// CHECK: [0x6b,0x00,0x80,0x91]
+s_bfm_b64 s[10:11], vcc_hi, s2
+// CHECK: [0x6b,0x02,0x8a,0x91]
 
-s_bfm_b64 s[0:1], tba_lo, s0
-// CHECK: [0x6c,0x00,0x80,0x91]
+s_bfm_b64 s[10:11], tba_lo, s2
+// CHECK: [0x6c,0x02,0x8a,0x91]
 
-s_bfm_b64 s[0:1], tba_hi, s0
-// CHECK: [0x6d,0x00,0x80,0x91]
+s_bfm_b64 s[10:11], tba_hi, s2
+// CHECK: [0x6d,0x02,0x8a,0x91]
 
-s_bfm_b64 s[0:1], tma_lo, s0
-// CHECK: [0x6e,0x00,0x80,0x91]
+s_bfm_b64 s[10:11], tma_lo, s2
+// CHECK: [0x6e,0x02,0x8a,0x91]
 
-s_bfm_b64 s[0:1], tma_hi, s0
-// CHECK: [0x6f,0x00,0x80,0x91]
+s_bfm_b64 s[10:11], tma_hi, s2
+// CHECK: [0x6f,0x02,0x8a,0x91]
 
-s_bfm_b64 s[0:1], ttmp11, s0
-// CHECK: [0x7b,0x00,0x80,0x91]
+s_bfm_b64 s[10:11], ttmp11, s2
+// CHECK: [0x7b,0x02,0x8a,0x91]
 
-s_bfm_b64 s[0:1], m0, s0
-// CHECK: [0x7c,0x00,0x80,0x91]
+s_bfm_b64 s[10:11], m0, s2
+// CHECK: [0x7c,0x02,0x8a,0x91]
 
-s_bfm_b64 s[0:1], exec_lo, s0
-// CHECK: [0x7e,0x00,0x80,0x91]
+s_bfm_b64 s[10:11], exec_lo, s2
+// CHECK: [0x7e,0x02,0x8a,0x91]
 
-s_bfm_b64 s[0:1], exec_hi, s0
-// CHECK: [0x7f,0x00,0x80,0x91]
+s_bfm_b64 s[10:11], exec_hi, s2
+// CHECK: [0x7f,0x02,0x8a,0x91]
 
-s_bfm_b64 s[0:1], 0, s0
-// CHECK: [0x80,0x00,0x80,0x91]
+s_bfm_b64 s[10:11], 0, s2
+// CHECK: [0x80,0x02,0x8a,0x91]
 
-s_bfm_b64 s[0:1], -1, s0
-// CHECK: [0xc1,0x00,0x80,0x91]
+s_bfm_b64 s[10:11], -1, s2
+// CHECK: [0xc1,0x02,0x8a,0x91]
 
-s_bfm_b64 s[0:1], 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x91]
+s_bfm_b64 s[10:11], 0.5, s2
+// CHECK: [0xf0,0x02,0x8a,0x91]
 
-s_bfm_b64 s[0:1], -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x91]
+s_bfm_b64 s[10:11], -4.0, s2
+// CHECK: [0xf7,0x02,0x8a,0x91]
 
-s_bfm_b64 s[0:1], 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x91,0x56,0x34,0x12,0xaf]
+s_bfm_b64 s[10:11], 0xaf123456, s2
+// CHECK: [0xff,0x02,0x8a,0x91,0x56,0x34,0x12,0xaf]
 
-s_bfm_b64 s[0:1], 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x91,0x73,0x72,0x71,0x3f]
+s_bfm_b64 s[10:11], 0x3f717273, s2
+// CHECK: [0xff,0x02,0x8a,0x91,0x73,0x72,0x71,0x3f]
 
-s_bfm_b64 s[0:1], s0, s101
-// CHECK: [0x00,0x65,0x80,0x91]
+s_bfm_b64 s[10:11], s1, s101
+// CHECK: [0x01,0x65,0x8a,0x91]
 
-s_bfm_b64 s[0:1], s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x80,0x91]
+s_bfm_b64 s[10:11], s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x8a,0x91]
 
-s_bfm_b64 s[0:1], s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x80,0x91]
+s_bfm_b64 s[10:11], s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x8a,0x91]
 
-s_bfm_b64 s[0:1], s0, vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x91]
+s_bfm_b64 s[10:11], s1, vcc_lo
+// CHECK: [0x01,0x6a,0x8a,0x91]
 
-s_bfm_b64 s[0:1], s0, vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x91]
+s_bfm_b64 s[10:11], s1, vcc_hi
+// CHECK: [0x01,0x6b,0x8a,0x91]
 
-s_bfm_b64 s[0:1], s0, tba_lo
-// CHECK: [0x00,0x6c,0x80,0x91]
+s_bfm_b64 s[10:11], s1, tba_lo
+// CHECK: [0x01,0x6c,0x8a,0x91]
 
-s_bfm_b64 s[0:1], s0, tba_hi
-// CHECK: [0x00,0x6d,0x80,0x91]
+s_bfm_b64 s[10:11], s1, tba_hi
+// CHECK: [0x01,0x6d,0x8a,0x91]
 
-s_bfm_b64 s[0:1], s0, tma_lo
-// CHECK: [0x00,0x6e,0x80,0x91]
+s_bfm_b64 s[10:11], s1, tma_lo
+// CHECK: [0x01,0x6e,0x8a,0x91]
 
-s_bfm_b64 s[0:1], s0, tma_hi
-// CHECK: [0x00,0x6f,0x80,0x91]
+s_bfm_b64 s[10:11], s1, tma_hi
+// CHECK: [0x01,0x6f,0x8a,0x91]
 
-s_bfm_b64 s[0:1], s0, ttmp11
-// CHECK: [0x00,0x7b,0x80,0x91]
+s_bfm_b64 s[10:11], s1, ttmp11
+// CHECK: [0x01,0x7b,0x8a,0x91]
 
-s_bfm_b64 s[0:1], s0, m0
-// CHECK: [0x00,0x7c,0x80,0x91]
+s_bfm_b64 s[10:11], s1, m0
+// CHECK: [0x01,0x7c,0x8a,0x91]
 
-s_bfm_b64 s[0:1], s0, exec_lo
-// CHECK: [0x00,0x7e,0x80,0x91]
+s_bfm_b64 s[10:11], s1, exec_lo
+// CHECK: [0x01,0x7e,0x8a,0x91]
 
-s_bfm_b64 s[0:1], s0, exec_hi
-// CHECK: [0x00,0x7f,0x80,0x91]
+s_bfm_b64 s[10:11], s1, exec_hi
+// CHECK: [0x01,0x7f,0x8a,0x91]
 
-s_bfm_b64 s[0:1], s0, 0
-// CHECK: [0x00,0x80,0x80,0x91]
+s_bfm_b64 s[10:11], s1, 0
+// CHECK: [0x01,0x80,0x8a,0x91]
 
-s_bfm_b64 s[0:1], s0, -1
-// CHECK: [0x00,0xc1,0x80,0x91]
+s_bfm_b64 s[10:11], s1, -1
+// CHECK: [0x01,0xc1,0x8a,0x91]
 
-s_bfm_b64 s[0:1], s0, 0.5
-// CHECK: [0x00,0xf0,0x80,0x91]
+s_bfm_b64 s[10:11], s1, 0.5
+// CHECK: [0x01,0xf0,0x8a,0x91]
 
-s_bfm_b64 s[0:1], s0, -4.0
-// CHECK: [0x00,0xf7,0x80,0x91]
+s_bfm_b64 s[10:11], s1, -4.0
+// CHECK: [0x01,0xf7,0x8a,0x91]
 
-s_bfm_b64 s[0:1], s0, 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x91,0x56,0x34,0x12,0xaf]
+s_bfm_b64 s[10:11], s1, 0xaf123456
+// CHECK: [0x01,0xff,0x8a,0x91,0x56,0x34,0x12,0xaf]
 
-s_bfm_b64 s[0:1], s0, 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x91,0x73,0x72,0x71,0x3f]
+s_bfm_b64 s[10:11], s1, 0x3f717273
+// CHECK: [0x01,0xff,0x8a,0x91,0x73,0x72,0x71,0x3f]
 
-s_mul_i32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x92]
+s_mul_i32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x92]
 
-s_mul_i32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x92]
+s_mul_i32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x92]
 
-s_mul_i32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x92]
+s_mul_i32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x92]
 
-s_mul_i32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x92]
+s_mul_i32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x92]
 
-s_mul_i32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x92]
+s_mul_i32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x92]
 
-s_mul_i32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x92]
+s_mul_i32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x92]
 
-s_mul_i32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x92]
+s_mul_i32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x92]
 
-s_mul_i32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x92]
+s_mul_i32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x92]
 
-s_mul_i32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x92]
+s_mul_i32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x92]
 
-s_mul_i32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x92]
+s_mul_i32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x92]
 
-s_mul_i32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x92]
+s_mul_i32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x92]
 
-s_mul_i32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x92]
+s_mul_i32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x92]
 
-s_mul_i32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x92]
+s_mul_i32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x92]
 
-s_mul_i32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x92]
+s_mul_i32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x92]
 
-s_mul_i32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x92]
+s_mul_i32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x92]
 
-s_mul_i32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x92]
+s_mul_i32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x92]
 
-s_mul_i32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x92]
+s_mul_i32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x92]
 
-s_mul_i32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x92]
+s_mul_i32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x92]
 
-s_mul_i32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x92]
+s_mul_i32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x92]
 
-s_mul_i32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x92]
+s_mul_i32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x92]
 
-s_mul_i32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x92]
+s_mul_i32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x92]
 
-s_mul_i32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x92]
+s_mul_i32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x92]
 
-s_mul_i32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x92]
+s_mul_i32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x92]
 
-s_mul_i32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x92]
+s_mul_i32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x92]
 
-s_mul_i32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x92]
+s_mul_i32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x92]
 
-s_mul_i32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x92]
+s_mul_i32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x92]
 
-s_mul_i32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x92]
+s_mul_i32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x92]
 
-s_mul_i32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x92]
+s_mul_i32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x92]
 
-s_mul_i32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x92]
+s_mul_i32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x92]
 
-s_mul_i32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x92]
+s_mul_i32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x92]
 
-s_mul_i32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x92]
+s_mul_i32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x92]
 
-s_mul_i32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x92,0x56,0x34,0x12,0xaf]
+s_mul_i32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x92,0x56,0x34,0x12,0xaf]
 
-s_mul_i32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x92,0x73,0x72,0x71,0x3f]
+s_mul_i32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x92,0x73,0x72,0x71,0x3f]
 
-s_mul_i32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x92]
+s_mul_i32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x92]
 
-s_mul_i32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x92]
+s_mul_i32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x92]
 
-s_mul_i32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x92]
+s_mul_i32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x92]
 
-s_mul_i32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x92]
+s_mul_i32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x92]
 
-s_mul_i32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x92]
+s_mul_i32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x92]
 
-s_mul_i32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x92]
+s_mul_i32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x92]
 
-s_mul_i32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x92]
+s_mul_i32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x92]
 
-s_mul_i32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x92]
+s_mul_i32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x92]
 
-s_mul_i32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x92]
+s_mul_i32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x92]
 
-s_mul_i32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x92]
+s_mul_i32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x92]
 
-s_mul_i32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x92]
+s_mul_i32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x92]
 
-s_mul_i32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x92]
+s_mul_i32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x92]
 
-s_mul_i32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x92]
+s_mul_i32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x92]
 
-s_mul_i32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x92]
+s_mul_i32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x92]
 
-s_mul_i32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x92]
+s_mul_i32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x92]
 
-s_mul_i32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x92]
+s_mul_i32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x92]
 
-s_mul_i32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x92]
+s_mul_i32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x92]
 
-s_mul_i32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x92,0x56,0x34,0x12,0xaf]
+s_mul_i32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x92,0x56,0x34,0x12,0xaf]
 
-s_mul_i32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x92,0x73,0x72,0x71,0x3f]
+s_mul_i32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x92,0x73,0x72,0x71,0x3f]
 
-s_bfe_u32 s0, s0, s0
-// CHECK: [0x00,0x00,0x80,0x92]
+s_bfe_u32 s5, s1, s2
+// CHECK: [0x01,0x02,0x85,0x92]
 
-s_bfe_u32 s101, s0, s0
-// CHECK: [0x00,0x00,0xe5,0x92]
+s_bfe_u32 s101, s1, s2
+// CHECK: [0x01,0x02,0xe5,0x92]
 
-s_bfe_u32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0xe6,0x92]
+s_bfe_u32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0xe6,0x92]
 
-s_bfe_u32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0xe7,0x92]
+s_bfe_u32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0xe7,0x92]
 
-s_bfe_u32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0xea,0x92]
+s_bfe_u32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0xea,0x92]
 
-s_bfe_u32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0xeb,0x92]
+s_bfe_u32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0xeb,0x92]
 
-s_bfe_u32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0xec,0x92]
+s_bfe_u32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0xec,0x92]
 
-s_bfe_u32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0xed,0x92]
+s_bfe_u32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0xed,0x92]
 
-s_bfe_u32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0xee,0x92]
+s_bfe_u32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0xee,0x92]
 
-s_bfe_u32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0xef,0x92]
+s_bfe_u32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0xef,0x92]
 
-s_bfe_u32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0xfb,0x92]
+s_bfe_u32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0xfb,0x92]
 
-s_bfe_u32 m0, s0, s0
-// CHECK: [0x00,0x00,0xfc,0x92]
+s_bfe_u32 m0, s1, s2
+// CHECK: [0x01,0x02,0xfc,0x92]
 
-s_bfe_u32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0xfe,0x92]
+s_bfe_u32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0xfe,0x92]
 
-s_bfe_u32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0xff,0x92]
+s_bfe_u32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0xff,0x92]
 
-s_bfe_u32 s0, s101, s0
-// CHECK: [0x65,0x00,0x80,0x92]
+s_bfe_u32 s5, s101, s2
+// CHECK: [0x65,0x02,0x85,0x92]
 
-s_bfe_u32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x80,0x92]
+s_bfe_u32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x85,0x92]
 
-s_bfe_u32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x80,0x92]
+s_bfe_u32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x85,0x92]
 
-s_bfe_u32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x80,0x92]
+s_bfe_u32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x85,0x92]
 
-s_bfe_u32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x80,0x92]
+s_bfe_u32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x85,0x92]
 
-s_bfe_u32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x80,0x92]
+s_bfe_u32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x85,0x92]
 
-s_bfe_u32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x80,0x92]
+s_bfe_u32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x85,0x92]
 
-s_bfe_u32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x80,0x92]
+s_bfe_u32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x85,0x92]
 
-s_bfe_u32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x80,0x92]
+s_bfe_u32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x85,0x92]
 
-s_bfe_u32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x80,0x92]
+s_bfe_u32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x85,0x92]
 
-s_bfe_u32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x80,0x92]
+s_bfe_u32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x85,0x92]
 
-s_bfe_u32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x80,0x92]
+s_bfe_u32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x85,0x92]
 
-s_bfe_u32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x80,0x92]
+s_bfe_u32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x85,0x92]
 
-s_bfe_u32 s0, 0, s0
-// CHECK: [0x80,0x00,0x80,0x92]
+s_bfe_u32 s5, 0, s2
+// CHECK: [0x80,0x02,0x85,0x92]
 
-s_bfe_u32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x80,0x92]
+s_bfe_u32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x85,0x92]
 
-s_bfe_u32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x92]
+s_bfe_u32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x85,0x92]
 
-s_bfe_u32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x92]
+s_bfe_u32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x85,0x92]
 
-s_bfe_u32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x92,0x56,0x34,0x12,0xaf]
+s_bfe_u32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x85,0x92,0x56,0x34,0x12,0xaf]
 
-s_bfe_u32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x92,0x73,0x72,0x71,0x3f]
+s_bfe_u32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x85,0x92,0x73,0x72,0x71,0x3f]
 
-s_bfe_u32 s0, s0, s101
-// CHECK: [0x00,0x65,0x80,0x92]
+s_bfe_u32 s5, s1, s101
+// CHECK: [0x01,0x65,0x85,0x92]
 
-s_bfe_u32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x80,0x92]
+s_bfe_u32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x85,0x92]
 
-s_bfe_u32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x80,0x92]
+s_bfe_u32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x85,0x92]
 
-s_bfe_u32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x92]
+s_bfe_u32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x85,0x92]
 
-s_bfe_u32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x92]
+s_bfe_u32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x85,0x92]
 
-s_bfe_u32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x80,0x92]
+s_bfe_u32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x85,0x92]
 
-s_bfe_u32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x80,0x92]
+s_bfe_u32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x85,0x92]
 
-s_bfe_u32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x80,0x92]
+s_bfe_u32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x85,0x92]
 
-s_bfe_u32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x80,0x92]
+s_bfe_u32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x85,0x92]
 
-s_bfe_u32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x80,0x92]
+s_bfe_u32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x85,0x92]
 
-s_bfe_u32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x80,0x92]
+s_bfe_u32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x85,0x92]
 
-s_bfe_u32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x80,0x92]
+s_bfe_u32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x85,0x92]
 
-s_bfe_u32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x80,0x92]
+s_bfe_u32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x85,0x92]
 
-s_bfe_u32 s0, s0, 0
-// CHECK: [0x00,0x80,0x80,0x92]
+s_bfe_u32 s5, s1, 0
+// CHECK: [0x01,0x80,0x85,0x92]
 
-s_bfe_u32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x80,0x92]
+s_bfe_u32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x85,0x92]
 
-s_bfe_u32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x80,0x92]
+s_bfe_u32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x85,0x92]
 
-s_bfe_u32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x80,0x92]
+s_bfe_u32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x85,0x92]
 
-s_bfe_u32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x92,0x56,0x34,0x12,0xaf]
+s_bfe_u32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x85,0x92,0x56,0x34,0x12,0xaf]
 
-s_bfe_u32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x92,0x73,0x72,0x71,0x3f]
+s_bfe_u32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x85,0x92,0x73,0x72,0x71,0x3f]
 
-s_bfe_i32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x93]
+s_bfe_i32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x93]
 
-s_bfe_i32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x93]
+s_bfe_i32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x93]
 
-s_bfe_i32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x93]
+s_bfe_i32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x93]
 
-s_bfe_i32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x93]
+s_bfe_i32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x93]
 
-s_bfe_i32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x93]
+s_bfe_i32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x93]
 
-s_bfe_i32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x93]
+s_bfe_i32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x93]
 
-s_bfe_i32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x93]
+s_bfe_i32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x93]
 
-s_bfe_i32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x93]
+s_bfe_i32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x93]
 
-s_bfe_i32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x93]
+s_bfe_i32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x93]
 
-s_bfe_i32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x93]
+s_bfe_i32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x93]
 
-s_bfe_i32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x93]
+s_bfe_i32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x93]
 
-s_bfe_i32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x93]
+s_bfe_i32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x93]
 
-s_bfe_i32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x93]
+s_bfe_i32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x93]
 
-s_bfe_i32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x93]
+s_bfe_i32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x93]
 
-s_bfe_i32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x93]
+s_bfe_i32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x93]
 
-s_bfe_i32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x93]
+s_bfe_i32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x93]
 
-s_bfe_i32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x93]
+s_bfe_i32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x93]
 
-s_bfe_i32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x93]
+s_bfe_i32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x93]
 
-s_bfe_i32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x93]
+s_bfe_i32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x93]
 
-s_bfe_i32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x93]
+s_bfe_i32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x93]
 
-s_bfe_i32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x93]
+s_bfe_i32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x93]
 
-s_bfe_i32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x93]
+s_bfe_i32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x93]
 
-s_bfe_i32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x93]
+s_bfe_i32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x93]
 
-s_bfe_i32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x93]
+s_bfe_i32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x93]
 
-s_bfe_i32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x93]
+s_bfe_i32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x93]
 
-s_bfe_i32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x93]
+s_bfe_i32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x93]
 
-s_bfe_i32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x93]
+s_bfe_i32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x93]
 
-s_bfe_i32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x93]
+s_bfe_i32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x93]
 
-s_bfe_i32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x93]
+s_bfe_i32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x93]
 
-s_bfe_i32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x93]
+s_bfe_i32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x93]
 
-s_bfe_i32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x93]
+s_bfe_i32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x93]
 
-s_bfe_i32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x93,0x56,0x34,0x12,0xaf]
+s_bfe_i32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x93,0x56,0x34,0x12,0xaf]
 
-s_bfe_i32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x93,0x73,0x72,0x71,0x3f]
+s_bfe_i32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x93,0x73,0x72,0x71,0x3f]
 
-s_bfe_i32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x93]
+s_bfe_i32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x93]
 
-s_bfe_i32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x93]
+s_bfe_i32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x93]
 
-s_bfe_i32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x93]
+s_bfe_i32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x93]
 
-s_bfe_i32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x93]
+s_bfe_i32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x93]
 
-s_bfe_i32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x93]
+s_bfe_i32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x93]
 
-s_bfe_i32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x93]
+s_bfe_i32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x93]
 
-s_bfe_i32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x93]
+s_bfe_i32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x93]
 
-s_bfe_i32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x93]
+s_bfe_i32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x93]
 
-s_bfe_i32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x93]
+s_bfe_i32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x93]
 
-s_bfe_i32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x93]
+s_bfe_i32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x93]
 
-s_bfe_i32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x93]
+s_bfe_i32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x93]
 
-s_bfe_i32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x93]
+s_bfe_i32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x93]
 
-s_bfe_i32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x93]
+s_bfe_i32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x93]
 
-s_bfe_i32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x93]
+s_bfe_i32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x93]
 
-s_bfe_i32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x93]
+s_bfe_i32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x93]
 
-s_bfe_i32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x93]
+s_bfe_i32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x93]
 
-s_bfe_i32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x93]
+s_bfe_i32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x93]
 
-s_bfe_i32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x93,0x56,0x34,0x12,0xaf]
+s_bfe_i32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x93,0x56,0x34,0x12,0xaf]
 
-s_bfe_i32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x93,0x73,0x72,0x71,0x3f]
+s_bfe_i32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x93,0x73,0x72,0x71,0x3f]
 
-s_bfe_u64 s[0:1], s[0:1], s0
-// CHECK: [0x00,0x00,0x80,0x93]
+s_bfe_u64 s[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0x8a,0x93]
 
-s_bfe_u64 s[2:3], s[0:1], s0
-// CHECK: [0x00,0x00,0x82,0x93]
+s_bfe_u64 s[12:13], s[2:3], s2
+// CHECK: [0x02,0x02,0x8c,0x93]
 
-s_bfe_u64 s[100:101], s[0:1], s0
-// CHECK: [0x00,0x00,0xe4,0x93]
+s_bfe_u64 s[100:101], s[2:3], s2
+// CHECK: [0x02,0x02,0xe4,0x93]
 
-s_bfe_u64 flat_scratch, s[0:1], s0
-// CHECK: [0x00,0x00,0xe6,0x93]
+s_bfe_u64 flat_scratch, s[2:3], s2
+// CHECK: [0x02,0x02,0xe6,0x93]
 
-s_bfe_u64 vcc, s[0:1], s0
-// CHECK: [0x00,0x00,0xea,0x93]
+s_bfe_u64 vcc, s[2:3], s2
+// CHECK: [0x02,0x02,0xea,0x93]
 
-s_bfe_u64 tba, s[0:1], s0
-// CHECK: [0x00,0x00,0xec,0x93]
+s_bfe_u64 tba, s[2:3], s2
+// CHECK: [0x02,0x02,0xec,0x93]
 
-s_bfe_u64 tma, s[0:1], s0
-// CHECK: [0x00,0x00,0xee,0x93]
+s_bfe_u64 tma, s[2:3], s2
+// CHECK: [0x02,0x02,0xee,0x93]
 
-s_bfe_u64 ttmp[10:11], s[0:1], s0
-// CHECK: [0x00,0x00,0xfa,0x93]
+s_bfe_u64 ttmp[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0xfa,0x93]
 
-s_bfe_u64 exec, s[0:1], s0
-// CHECK: [0x00,0x00,0xfe,0x93]
+s_bfe_u64 exec, s[2:3], s2
+// CHECK: [0x02,0x02,0xfe,0x93]
 
-s_bfe_u64 s[0:1], s[2:3], s0
-// CHECK: [0x02,0x00,0x80,0x93]
+s_bfe_u64 s[10:11], s[4:5], s2
+// CHECK: [0x04,0x02,0x8a,0x93]
 
-s_bfe_u64 s[0:1], s[100:101], s0
-// CHECK: [0x64,0x00,0x80,0x93]
+s_bfe_u64 s[10:11], s[100:101], s2
+// CHECK: [0x64,0x02,0x8a,0x93]
 
-s_bfe_u64 s[0:1], flat_scratch, s0
-// CHECK: [0x66,0x00,0x80,0x93]
+s_bfe_u64 s[10:11], flat_scratch, s2
+// CHECK: [0x66,0x02,0x8a,0x93]
 
-s_bfe_u64 s[0:1], vcc, s0
-// CHECK: [0x6a,0x00,0x80,0x93]
+s_bfe_u64 s[10:11], vcc, s2
+// CHECK: [0x6a,0x02,0x8a,0x93]
 
-s_bfe_u64 s[0:1], tba, s0
-// CHECK: [0x6c,0x00,0x80,0x93]
+s_bfe_u64 s[10:11], tba, s2
+// CHECK: [0x6c,0x02,0x8a,0x93]
 
-s_bfe_u64 s[0:1], tma, s0
-// CHECK: [0x6e,0x00,0x80,0x93]
+s_bfe_u64 s[10:11], tma, s2
+// CHECK: [0x6e,0x02,0x8a,0x93]
 
-s_bfe_u64 s[0:1], ttmp[10:11], s0
-// CHECK: [0x7a,0x00,0x80,0x93]
+s_bfe_u64 s[10:11], ttmp[10:11], s2
+// CHECK: [0x7a,0x02,0x8a,0x93]
 
-s_bfe_u64 s[0:1], exec, s0
-// CHECK: [0x7e,0x00,0x80,0x93]
+s_bfe_u64 s[10:11], exec, s2
+// CHECK: [0x7e,0x02,0x8a,0x93]
 
-s_bfe_u64 s[0:1], 0, s0
-// CHECK: [0x80,0x00,0x80,0x93]
+s_bfe_u64 s[10:11], 0, s2
+// CHECK: [0x80,0x02,0x8a,0x93]
 
-s_bfe_u64 s[0:1], -1, s0
-// CHECK: [0xc1,0x00,0x80,0x93]
+s_bfe_u64 s[10:11], -1, s2
+// CHECK: [0xc1,0x02,0x8a,0x93]
 
-s_bfe_u64 s[0:1], 0.5, s0
-// CHECK: [0xf0,0x00,0x80,0x93]
+s_bfe_u64 s[10:11], 0.5, s2
+// CHECK: [0xf0,0x02,0x8a,0x93]
 
-s_bfe_u64 s[0:1], -4.0, s0
-// CHECK: [0xf7,0x00,0x80,0x93]
+s_bfe_u64 s[10:11], -4.0, s2
+// CHECK: [0xf7,0x02,0x8a,0x93]
 
-s_bfe_u64 s[0:1], 0xaf123456, s0
-// CHECK: [0xff,0x00,0x80,0x93,0x56,0x34,0x12,0xaf]
+s_bfe_u64 s[10:11], 0xaf123456, s2
+// CHECK: [0xff,0x02,0x8a,0x93,0x56,0x34,0x12,0xaf]
 
-s_bfe_u64 s[0:1], 0x3f717273, s0
-// CHECK: [0xff,0x00,0x80,0x93,0x73,0x72,0x71,0x3f]
+s_bfe_u64 s[10:11], 0x3f717273, s2
+// CHECK: [0xff,0x02,0x8a,0x93,0x73,0x72,0x71,0x3f]
 
-s_bfe_u64 s[0:1], s[0:1], s101
-// CHECK: [0x00,0x65,0x80,0x93]
+s_bfe_u64 s[10:11], s[2:3], s101
+// CHECK: [0x02,0x65,0x8a,0x93]
 
-s_bfe_u64 s[0:1], s[0:1], flat_scratch_lo
-// CHECK: [0x00,0x66,0x80,0x93]
+s_bfe_u64 s[10:11], s[2:3], flat_scratch_lo
+// CHECK: [0x02,0x66,0x8a,0x93]
 
-s_bfe_u64 s[0:1], s[0:1], flat_scratch_hi
-// CHECK: [0x00,0x67,0x80,0x93]
+s_bfe_u64 s[10:11], s[2:3], flat_scratch_hi
+// CHECK: [0x02,0x67,0x8a,0x93]
 
-s_bfe_u64 s[0:1], s[0:1], vcc_lo
-// CHECK: [0x00,0x6a,0x80,0x93]
+s_bfe_u64 s[10:11], s[2:3], vcc_lo
+// CHECK: [0x02,0x6a,0x8a,0x93]
 
-s_bfe_u64 s[0:1], s[0:1], vcc_hi
-// CHECK: [0x00,0x6b,0x80,0x93]
+s_bfe_u64 s[10:11], s[2:3], vcc_hi
+// CHECK: [0x02,0x6b,0x8a,0x93]
 
-s_bfe_u64 s[0:1], s[0:1], tba_lo
-// CHECK: [0x00,0x6c,0x80,0x93]
+s_bfe_u64 s[10:11], s[2:3], tba_lo
+// CHECK: [0x02,0x6c,0x8a,0x93]
 
-s_bfe_u64 s[0:1], s[0:1], tba_hi
-// CHECK: [0x00,0x6d,0x80,0x93]
+s_bfe_u64 s[10:11], s[2:3], tba_hi
+// CHECK: [0x02,0x6d,0x8a,0x93]
 
-s_bfe_u64 s[0:1], s[0:1], tma_lo
-// CHECK: [0x00,0x6e,0x80,0x93]
+s_bfe_u64 s[10:11], s[2:3], tma_lo
+// CHECK: [0x02,0x6e,0x8a,0x93]
 
-s_bfe_u64 s[0:1], s[0:1], tma_hi
-// CHECK: [0x00,0x6f,0x80,0x93]
+s_bfe_u64 s[10:11], s[2:3], tma_hi
+// CHECK: [0x02,0x6f,0x8a,0x93]
 
-s_bfe_u64 s[0:1], s[0:1], ttmp11
-// CHECK: [0x00,0x7b,0x80,0x93]
+s_bfe_u64 s[10:11], s[2:3], ttmp11
+// CHECK: [0x02,0x7b,0x8a,0x93]
 
-s_bfe_u64 s[0:1], s[0:1], m0
-// CHECK: [0x00,0x7c,0x80,0x93]
+s_bfe_u64 s[10:11], s[2:3], m0
+// CHECK: [0x02,0x7c,0x8a,0x93]
 
-s_bfe_u64 s[0:1], s[0:1], exec_lo
-// CHECK: [0x00,0x7e,0x80,0x93]
+s_bfe_u64 s[10:11], s[2:3], exec_lo
+// CHECK: [0x02,0x7e,0x8a,0x93]
 
-s_bfe_u64 s[0:1], s[0:1], exec_hi
-// CHECK: [0x00,0x7f,0x80,0x93]
+s_bfe_u64 s[10:11], s[2:3], exec_hi
+// CHECK: [0x02,0x7f,0x8a,0x93]
 
-s_bfe_u64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x80,0x93]
+s_bfe_u64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x8a,0x93]
 
-s_bfe_u64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x80,0x93]
+s_bfe_u64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x8a,0x93]
 
-s_bfe_u64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x80,0x93]
+s_bfe_u64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x8a,0x93]
 
-s_bfe_u64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x80,0x93]
+s_bfe_u64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x8a,0x93]
 
-s_bfe_u64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x80,0x93,0x56,0x34,0x12,0xaf]
+s_bfe_u64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x8a,0x93,0x56,0x34,0x12,0xaf]
 
-s_bfe_u64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x80,0x93,0x73,0x72,0x71,0x3f]
+s_bfe_u64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x8a,0x93,0x73,0x72,0x71,0x3f]
 
-s_bfe_i64 s[0:1], s[0:1], s0
-// CHECK: [0x00,0x00,0x00,0x94]
+s_bfe_i64 s[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0x0a,0x94]
 
-s_bfe_i64 s[2:3], s[0:1], s0
-// CHECK: [0x00,0x00,0x02,0x94]
+s_bfe_i64 s[12:13], s[2:3], s2
+// CHECK: [0x02,0x02,0x0c,0x94]
 
-s_bfe_i64 s[100:101], s[0:1], s0
-// CHECK: [0x00,0x00,0x64,0x94]
+s_bfe_i64 s[100:101], s[2:3], s2
+// CHECK: [0x02,0x02,0x64,0x94]
 
-s_bfe_i64 flat_scratch, s[0:1], s0
-// CHECK: [0x00,0x00,0x66,0x94]
+s_bfe_i64 flat_scratch, s[2:3], s2
+// CHECK: [0x02,0x02,0x66,0x94]
 
-s_bfe_i64 vcc, s[0:1], s0
-// CHECK: [0x00,0x00,0x6a,0x94]
+s_bfe_i64 vcc, s[2:3], s2
+// CHECK: [0x02,0x02,0x6a,0x94]
 
-s_bfe_i64 tba, s[0:1], s0
-// CHECK: [0x00,0x00,0x6c,0x94]
+s_bfe_i64 tba, s[2:3], s2
+// CHECK: [0x02,0x02,0x6c,0x94]
 
-s_bfe_i64 tma, s[0:1], s0
-// CHECK: [0x00,0x00,0x6e,0x94]
+s_bfe_i64 tma, s[2:3], s2
+// CHECK: [0x02,0x02,0x6e,0x94]
 
-s_bfe_i64 ttmp[10:11], s[0:1], s0
-// CHECK: [0x00,0x00,0x7a,0x94]
+s_bfe_i64 ttmp[10:11], s[2:3], s2
+// CHECK: [0x02,0x02,0x7a,0x94]
 
-s_bfe_i64 exec, s[0:1], s0
-// CHECK: [0x00,0x00,0x7e,0x94]
+s_bfe_i64 exec, s[2:3], s2
+// CHECK: [0x02,0x02,0x7e,0x94]
 
-s_bfe_i64 s[0:1], s[2:3], s0
-// CHECK: [0x02,0x00,0x00,0x94]
+s_bfe_i64 s[10:11], s[4:5], s2
+// CHECK: [0x04,0x02,0x0a,0x94]
 
-s_bfe_i64 s[0:1], s[100:101], s0
-// CHECK: [0x64,0x00,0x00,0x94]
+s_bfe_i64 s[10:11], s[100:101], s2
+// CHECK: [0x64,0x02,0x0a,0x94]
 
-s_bfe_i64 s[0:1], flat_scratch, s0
-// CHECK: [0x66,0x00,0x00,0x94]
+s_bfe_i64 s[10:11], flat_scratch, s2
+// CHECK: [0x66,0x02,0x0a,0x94]
 
-s_bfe_i64 s[0:1], vcc, s0
-// CHECK: [0x6a,0x00,0x00,0x94]
+s_bfe_i64 s[10:11], vcc, s2
+// CHECK: [0x6a,0x02,0x0a,0x94]
 
-s_bfe_i64 s[0:1], tba, s0
-// CHECK: [0x6c,0x00,0x00,0x94]
+s_bfe_i64 s[10:11], tba, s2
+// CHECK: [0x6c,0x02,0x0a,0x94]
 
-s_bfe_i64 s[0:1], tma, s0
-// CHECK: [0x6e,0x00,0x00,0x94]
+s_bfe_i64 s[10:11], tma, s2
+// CHECK: [0x6e,0x02,0x0a,0x94]
 
-s_bfe_i64 s[0:1], ttmp[10:11], s0
-// CHECK: [0x7a,0x00,0x00,0x94]
+s_bfe_i64 s[10:11], ttmp[10:11], s2
+// CHECK: [0x7a,0x02,0x0a,0x94]
 
-s_bfe_i64 s[0:1], exec, s0
-// CHECK: [0x7e,0x00,0x00,0x94]
+s_bfe_i64 s[10:11], exec, s2
+// CHECK: [0x7e,0x02,0x0a,0x94]
 
-s_bfe_i64 s[0:1], 0, s0
-// CHECK: [0x80,0x00,0x00,0x94]
+s_bfe_i64 s[10:11], 0, s2
+// CHECK: [0x80,0x02,0x0a,0x94]
 
-s_bfe_i64 s[0:1], -1, s0
-// CHECK: [0xc1,0x00,0x00,0x94]
+s_bfe_i64 s[10:11], -1, s2
+// CHECK: [0xc1,0x02,0x0a,0x94]
 
-s_bfe_i64 s[0:1], 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x94]
+s_bfe_i64 s[10:11], 0.5, s2
+// CHECK: [0xf0,0x02,0x0a,0x94]
 
-s_bfe_i64 s[0:1], -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x94]
+s_bfe_i64 s[10:11], -4.0, s2
+// CHECK: [0xf7,0x02,0x0a,0x94]
 
-s_bfe_i64 s[0:1], 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x94,0x56,0x34,0x12,0xaf]
+s_bfe_i64 s[10:11], 0xaf123456, s2
+// CHECK: [0xff,0x02,0x0a,0x94,0x56,0x34,0x12,0xaf]
 
-s_bfe_i64 s[0:1], 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x94,0x73,0x72,0x71,0x3f]
+s_bfe_i64 s[10:11], 0x3f717273, s2
+// CHECK: [0xff,0x02,0x0a,0x94,0x73,0x72,0x71,0x3f]
 
-s_bfe_i64 s[0:1], s[0:1], s101
-// CHECK: [0x00,0x65,0x00,0x94]
+s_bfe_i64 s[10:11], s[2:3], s101
+// CHECK: [0x02,0x65,0x0a,0x94]
 
-s_bfe_i64 s[0:1], s[0:1], flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x94]
+s_bfe_i64 s[10:11], s[2:3], flat_scratch_lo
+// CHECK: [0x02,0x66,0x0a,0x94]
 
-s_bfe_i64 s[0:1], s[0:1], flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x94]
+s_bfe_i64 s[10:11], s[2:3], flat_scratch_hi
+// CHECK: [0x02,0x67,0x0a,0x94]
 
-s_bfe_i64 s[0:1], s[0:1], vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x94]
+s_bfe_i64 s[10:11], s[2:3], vcc_lo
+// CHECK: [0x02,0x6a,0x0a,0x94]
 
-s_bfe_i64 s[0:1], s[0:1], vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x94]
+s_bfe_i64 s[10:11], s[2:3], vcc_hi
+// CHECK: [0x02,0x6b,0x0a,0x94]
 
-s_bfe_i64 s[0:1], s[0:1], tba_lo
-// CHECK: [0x00,0x6c,0x00,0x94]
+s_bfe_i64 s[10:11], s[2:3], tba_lo
+// CHECK: [0x02,0x6c,0x0a,0x94]
 
-s_bfe_i64 s[0:1], s[0:1], tba_hi
-// CHECK: [0x00,0x6d,0x00,0x94]
+s_bfe_i64 s[10:11], s[2:3], tba_hi
+// CHECK: [0x02,0x6d,0x0a,0x94]
 
-s_bfe_i64 s[0:1], s[0:1], tma_lo
-// CHECK: [0x00,0x6e,0x00,0x94]
+s_bfe_i64 s[10:11], s[2:3], tma_lo
+// CHECK: [0x02,0x6e,0x0a,0x94]
 
-s_bfe_i64 s[0:1], s[0:1], tma_hi
-// CHECK: [0x00,0x6f,0x00,0x94]
+s_bfe_i64 s[10:11], s[2:3], tma_hi
+// CHECK: [0x02,0x6f,0x0a,0x94]
 
-s_bfe_i64 s[0:1], s[0:1], ttmp11
-// CHECK: [0x00,0x7b,0x00,0x94]
+s_bfe_i64 s[10:11], s[2:3], ttmp11
+// CHECK: [0x02,0x7b,0x0a,0x94]
 
-s_bfe_i64 s[0:1], s[0:1], m0
-// CHECK: [0x00,0x7c,0x00,0x94]
+s_bfe_i64 s[10:11], s[2:3], m0
+// CHECK: [0x02,0x7c,0x0a,0x94]
 
-s_bfe_i64 s[0:1], s[0:1], exec_lo
-// CHECK: [0x00,0x7e,0x00,0x94]
+s_bfe_i64 s[10:11], s[2:3], exec_lo
+// CHECK: [0x02,0x7e,0x0a,0x94]
 
-s_bfe_i64 s[0:1], s[0:1], exec_hi
-// CHECK: [0x00,0x7f,0x00,0x94]
+s_bfe_i64 s[10:11], s[2:3], exec_hi
+// CHECK: [0x02,0x7f,0x0a,0x94]
 
-s_bfe_i64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x80,0x00,0x94]
+s_bfe_i64 s[10:11], s[2:3], 0
+// CHECK: [0x02,0x80,0x0a,0x94]
 
-s_bfe_i64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0xc1,0x00,0x94]
+s_bfe_i64 s[10:11], s[2:3], -1
+// CHECK: [0x02,0xc1,0x0a,0x94]
 
-s_bfe_i64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x00,0x94]
+s_bfe_i64 s[10:11], s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x0a,0x94]
 
-s_bfe_i64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x00,0x94]
+s_bfe_i64 s[10:11], s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x0a,0x94]
 
-s_bfe_i64 s[0:1], s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x94,0x56,0x34,0x12,0xaf]
+s_bfe_i64 s[10:11], s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x0a,0x94,0x56,0x34,0x12,0xaf]
 
-s_bfe_i64 s[0:1], s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x94,0x73,0x72,0x71,0x3f]
+s_bfe_i64 s[10:11], s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x0a,0x94,0x73,0x72,0x71,0x3f]
 
-s_cbranch_g_fork s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0x94]
+s_cbranch_g_fork s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x80,0x94]
 
-s_cbranch_g_fork s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x80,0x94]
+s_cbranch_g_fork s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x80,0x94]
 
-s_cbranch_g_fork s[100:101], s[0:1]
-// CHECK: [0x64,0x00,0x80,0x94]
+s_cbranch_g_fork s[100:101], s[4:5]
+// CHECK: [0x64,0x04,0x80,0x94]
 
-s_cbranch_g_fork flat_scratch, s[0:1]
-// CHECK: [0x66,0x00,0x80,0x94]
+s_cbranch_g_fork flat_scratch, s[4:5]
+// CHECK: [0x66,0x04,0x80,0x94]
 
-s_cbranch_g_fork vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x80,0x94]
+s_cbranch_g_fork vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x80,0x94]
 
-s_cbranch_g_fork tba, s[0:1]
-// CHECK: [0x6c,0x00,0x80,0x94]
+s_cbranch_g_fork tba, s[4:5]
+// CHECK: [0x6c,0x04,0x80,0x94]
 
-s_cbranch_g_fork tma, s[0:1]
-// CHECK: [0x6e,0x00,0x80,0x94]
+s_cbranch_g_fork tma, s[4:5]
+// CHECK: [0x6e,0x04,0x80,0x94]
 
-s_cbranch_g_fork ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x80,0x94]
+s_cbranch_g_fork ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x80,0x94]
 
-s_cbranch_g_fork exec, s[0:1]
-// CHECK: [0x7e,0x00,0x80,0x94]
+s_cbranch_g_fork exec, s[4:5]
+// CHECK: [0x7e,0x04,0x80,0x94]
 
-s_cbranch_g_fork s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x80,0x94]
+s_cbranch_g_fork s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x80,0x94]
 
-s_cbranch_g_fork s[0:1], s[100:101]
-// CHECK: [0x00,0x64,0x80,0x94]
+s_cbranch_g_fork s[2:3], s[100:101]
+// CHECK: [0x02,0x64,0x80,0x94]
 
-s_cbranch_g_fork s[0:1], flat_scratch
-// CHECK: [0x00,0x66,0x80,0x94]
+s_cbranch_g_fork s[2:3], flat_scratch
+// CHECK: [0x02,0x66,0x80,0x94]
 
-s_cbranch_g_fork s[0:1], vcc
-// CHECK: [0x00,0x6a,0x80,0x94]
+s_cbranch_g_fork s[2:3], vcc
+// CHECK: [0x02,0x6a,0x80,0x94]
 
-s_cbranch_g_fork s[0:1], tba
-// CHECK: [0x00,0x6c,0x80,0x94]
+s_cbranch_g_fork s[2:3], tba
+// CHECK: [0x02,0x6c,0x80,0x94]
 
-s_cbranch_g_fork s[0:1], tma
-// CHECK: [0x00,0x6e,0x80,0x94]
+s_cbranch_g_fork s[2:3], tma
+// CHECK: [0x02,0x6e,0x80,0x94]
 
-s_cbranch_g_fork s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x80,0x94]
+s_cbranch_g_fork s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x80,0x94]
 
-s_cbranch_g_fork s[0:1], exec
-// CHECK: [0x00,0x7e,0x80,0x94]
+s_cbranch_g_fork s[2:3], exec
+// CHECK: [0x02,0x7e,0x80,0x94]
 
-s_absdiff_i32 s0, s0, s0
-// CHECK: [0x00,0x00,0x00,0x95]
+s_absdiff_i32 s5, s1, s2
+// CHECK: [0x01,0x02,0x05,0x95]
 
-s_absdiff_i32 s101, s0, s0
-// CHECK: [0x00,0x00,0x65,0x95]
+s_absdiff_i32 s101, s1, s2
+// CHECK: [0x01,0x02,0x65,0x95]
 
-s_absdiff_i32 flat_scratch_lo, s0, s0
-// CHECK: [0x00,0x00,0x66,0x95]
+s_absdiff_i32 flat_scratch_lo, s1, s2
+// CHECK: [0x01,0x02,0x66,0x95]
 
-s_absdiff_i32 flat_scratch_hi, s0, s0
-// CHECK: [0x00,0x00,0x67,0x95]
+s_absdiff_i32 flat_scratch_hi, s1, s2
+// CHECK: [0x01,0x02,0x67,0x95]
 
-s_absdiff_i32 vcc_lo, s0, s0
-// CHECK: [0x00,0x00,0x6a,0x95]
+s_absdiff_i32 vcc_lo, s1, s2
+// CHECK: [0x01,0x02,0x6a,0x95]
 
-s_absdiff_i32 vcc_hi, s0, s0
-// CHECK: [0x00,0x00,0x6b,0x95]
+s_absdiff_i32 vcc_hi, s1, s2
+// CHECK: [0x01,0x02,0x6b,0x95]
 
-s_absdiff_i32 tba_lo, s0, s0
-// CHECK: [0x00,0x00,0x6c,0x95]
+s_absdiff_i32 tba_lo, s1, s2
+// CHECK: [0x01,0x02,0x6c,0x95]
 
-s_absdiff_i32 tba_hi, s0, s0
-// CHECK: [0x00,0x00,0x6d,0x95]
+s_absdiff_i32 tba_hi, s1, s2
+// CHECK: [0x01,0x02,0x6d,0x95]
 
-s_absdiff_i32 tma_lo, s0, s0
-// CHECK: [0x00,0x00,0x6e,0x95]
+s_absdiff_i32 tma_lo, s1, s2
+// CHECK: [0x01,0x02,0x6e,0x95]
 
-s_absdiff_i32 tma_hi, s0, s0
-// CHECK: [0x00,0x00,0x6f,0x95]
+s_absdiff_i32 tma_hi, s1, s2
+// CHECK: [0x01,0x02,0x6f,0x95]
 
-s_absdiff_i32 ttmp11, s0, s0
-// CHECK: [0x00,0x00,0x7b,0x95]
+s_absdiff_i32 ttmp11, s1, s2
+// CHECK: [0x01,0x02,0x7b,0x95]
 
-s_absdiff_i32 m0, s0, s0
-// CHECK: [0x00,0x00,0x7c,0x95]
+s_absdiff_i32 m0, s1, s2
+// CHECK: [0x01,0x02,0x7c,0x95]
 
-s_absdiff_i32 exec_lo, s0, s0
-// CHECK: [0x00,0x00,0x7e,0x95]
+s_absdiff_i32 exec_lo, s1, s2
+// CHECK: [0x01,0x02,0x7e,0x95]
 
-s_absdiff_i32 exec_hi, s0, s0
-// CHECK: [0x00,0x00,0x7f,0x95]
+s_absdiff_i32 exec_hi, s1, s2
+// CHECK: [0x01,0x02,0x7f,0x95]
 
-s_absdiff_i32 s0, s101, s0
-// CHECK: [0x65,0x00,0x00,0x95]
+s_absdiff_i32 s5, s101, s2
+// CHECK: [0x65,0x02,0x05,0x95]
 
-s_absdiff_i32 s0, flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0x95]
+s_absdiff_i32 s5, flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0x95]
 
-s_absdiff_i32 s0, flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0x95]
+s_absdiff_i32 s5, flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0x95]
 
-s_absdiff_i32 s0, vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0x95]
+s_absdiff_i32 s5, vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0x95]
 
-s_absdiff_i32 s0, vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0x95]
+s_absdiff_i32 s5, vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0x95]
 
-s_absdiff_i32 s0, tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0x95]
+s_absdiff_i32 s5, tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0x95]
 
-s_absdiff_i32 s0, tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0x95]
+s_absdiff_i32 s5, tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0x95]
 
-s_absdiff_i32 s0, tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0x95]
+s_absdiff_i32 s5, tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0x95]
 
-s_absdiff_i32 s0, tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0x95]
+s_absdiff_i32 s5, tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0x95]
 
-s_absdiff_i32 s0, ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0x95]
+s_absdiff_i32 s5, ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0x95]
 
-s_absdiff_i32 s0, m0, s0
-// CHECK: [0x7c,0x00,0x00,0x95]
+s_absdiff_i32 s5, m0, s2
+// CHECK: [0x7c,0x02,0x05,0x95]
 
-s_absdiff_i32 s0, exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0x95]
+s_absdiff_i32 s5, exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0x95]
 
-s_absdiff_i32 s0, exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0x95]
+s_absdiff_i32 s5, exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0x95]
 
-s_absdiff_i32 s0, 0, s0
-// CHECK: [0x80,0x00,0x00,0x95]
+s_absdiff_i32 s5, 0, s2
+// CHECK: [0x80,0x02,0x05,0x95]
 
-s_absdiff_i32 s0, -1, s0
-// CHECK: [0xc1,0x00,0x00,0x95]
+s_absdiff_i32 s5, -1, s2
+// CHECK: [0xc1,0x02,0x05,0x95]
 
-s_absdiff_i32 s0, 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0x95]
+s_absdiff_i32 s5, 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0x95]
 
-s_absdiff_i32 s0, -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0x95]
+s_absdiff_i32 s5, -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0x95]
 
-s_absdiff_i32 s0, 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0x95,0x56,0x34,0x12,0xaf]
+s_absdiff_i32 s5, 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0x95,0x56,0x34,0x12,0xaf]
 
-s_absdiff_i32 s0, 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0x95,0x73,0x72,0x71,0x3f]
+s_absdiff_i32 s5, 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0x95,0x73,0x72,0x71,0x3f]
 
-s_absdiff_i32 s0, s0, s101
-// CHECK: [0x00,0x65,0x00,0x95]
+s_absdiff_i32 s5, s1, s101
+// CHECK: [0x01,0x65,0x05,0x95]
 
-s_absdiff_i32 s0, s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0x95]
+s_absdiff_i32 s5, s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0x95]
 
-s_absdiff_i32 s0, s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0x95]
+s_absdiff_i32 s5, s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0x95]
 
-s_absdiff_i32 s0, s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0x95]
+s_absdiff_i32 s5, s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0x95]
 
-s_absdiff_i32 s0, s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0x95]
+s_absdiff_i32 s5, s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0x95]
 
-s_absdiff_i32 s0, s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0x95]
+s_absdiff_i32 s5, s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0x95]
 
-s_absdiff_i32 s0, s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0x95]
+s_absdiff_i32 s5, s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0x95]
 
-s_absdiff_i32 s0, s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0x95]
+s_absdiff_i32 s5, s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0x95]
 
-s_absdiff_i32 s0, s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0x95]
+s_absdiff_i32 s5, s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0x95]
 
-s_absdiff_i32 s0, s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0x95]
+s_absdiff_i32 s5, s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0x95]
 
-s_absdiff_i32 s0, s0, m0
-// CHECK: [0x00,0x7c,0x00,0x95]
+s_absdiff_i32 s5, s1, m0
+// CHECK: [0x01,0x7c,0x05,0x95]
 
-s_absdiff_i32 s0, s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0x95]
+s_absdiff_i32 s5, s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0x95]
 
-s_absdiff_i32 s0, s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0x95]
+s_absdiff_i32 s5, s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0x95]
 
-s_absdiff_i32 s0, s0, 0
-// CHECK: [0x00,0x80,0x00,0x95]
+s_absdiff_i32 s5, s1, 0
+// CHECK: [0x01,0x80,0x05,0x95]
 
-s_absdiff_i32 s0, s0, -1
-// CHECK: [0x00,0xc1,0x00,0x95]
+s_absdiff_i32 s5, s1, -1
+// CHECK: [0x01,0xc1,0x05,0x95]
 
-s_absdiff_i32 s0, s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0x95]
+s_absdiff_i32 s5, s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0x95]
 
-s_absdiff_i32 s0, s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0x95]
+s_absdiff_i32 s5, s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0x95]
 
-s_absdiff_i32 s0, s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0x95,0x56,0x34,0x12,0xaf]
+s_absdiff_i32 s5, s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0x95,0x56,0x34,0x12,0xaf]
 
-s_absdiff_i32 s0, s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0x95,0x73,0x72,0x71,0x3f]
+s_absdiff_i32 s5, s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0x95,0x73,0x72,0x71,0x3f]
 
-s_cmp_eq_i32 s0, s0
-// CHECK: [0x00,0x00,0x00,0xbf]
+s_cmp_eq_i32 s1, s2
+// CHECK: [0x01,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 s101, s0
-// CHECK: [0x65,0x00,0x00,0xbf]
+s_cmp_eq_i32 s101, s2
+// CHECK: [0x65,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x00,0xbf]
+s_cmp_eq_i32 flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x00,0xbf]
+s_cmp_eq_i32 flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x00,0xbf]
+s_cmp_eq_i32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x00,0xbf]
+s_cmp_eq_i32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x00,0xbf]
+s_cmp_eq_i32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x00,0xbf]
+s_cmp_eq_i32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x00,0xbf]
+s_cmp_eq_i32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x00,0xbf]
+s_cmp_eq_i32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x00,0xbf]
+s_cmp_eq_i32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 m0, s0
-// CHECK: [0x7c,0x00,0x00,0xbf]
+s_cmp_eq_i32 m0, s2
+// CHECK: [0x7c,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x00,0xbf]
+s_cmp_eq_i32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x00,0xbf]
+s_cmp_eq_i32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 0, s0
-// CHECK: [0x80,0x00,0x00,0xbf]
+s_cmp_eq_i32 0, s2
+// CHECK: [0x80,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 -1, s0
-// CHECK: [0xc1,0x00,0x00,0xbf]
+s_cmp_eq_i32 -1, s2
+// CHECK: [0xc1,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 0.5, s0
-// CHECK: [0xf0,0x00,0x00,0xbf]
+s_cmp_eq_i32 0.5, s2
+// CHECK: [0xf0,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 -4.0, s0
-// CHECK: [0xf7,0x00,0x00,0xbf]
+s_cmp_eq_i32 -4.0, s2
+// CHECK: [0xf7,0x02,0x00,0xbf]
 
-s_cmp_eq_i32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x00,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_eq_i32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x00,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_eq_i32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x00,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_eq_i32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x00,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_eq_i32 s0, s101
-// CHECK: [0x00,0x65,0x00,0xbf]
+s_cmp_eq_i32 s1, s101
+// CHECK: [0x01,0x65,0x00,0xbf]
 
-s_cmp_eq_i32 s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x00,0xbf]
+s_cmp_eq_i32 s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x00,0xbf]
 
-s_cmp_eq_i32 s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x00,0xbf]
+s_cmp_eq_i32 s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x00,0xbf]
 
-s_cmp_eq_i32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x00,0xbf]
+s_cmp_eq_i32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x00,0xbf]
 
-s_cmp_eq_i32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x00,0xbf]
+s_cmp_eq_i32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x00,0xbf]
 
-s_cmp_eq_i32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x00,0xbf]
+s_cmp_eq_i32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x00,0xbf]
 
-s_cmp_eq_i32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x00,0xbf]
+s_cmp_eq_i32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x00,0xbf]
 
-s_cmp_eq_i32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x00,0xbf]
+s_cmp_eq_i32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x00,0xbf]
 
-s_cmp_eq_i32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x00,0xbf]
+s_cmp_eq_i32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x00,0xbf]
 
-s_cmp_eq_i32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x00,0xbf]
+s_cmp_eq_i32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x00,0xbf]
 
-s_cmp_eq_i32 s0, m0
-// CHECK: [0x00,0x7c,0x00,0xbf]
+s_cmp_eq_i32 s1, m0
+// CHECK: [0x01,0x7c,0x00,0xbf]
 
-s_cmp_eq_i32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x00,0xbf]
+s_cmp_eq_i32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x00,0xbf]
 
-s_cmp_eq_i32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x00,0xbf]
+s_cmp_eq_i32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x00,0xbf]
 
-s_cmp_eq_i32 s0, 0
-// CHECK: [0x00,0x80,0x00,0xbf]
+s_cmp_eq_i32 s1, 0
+// CHECK: [0x01,0x80,0x00,0xbf]
 
-s_cmp_eq_i32 s0, -1
-// CHECK: [0x00,0xc1,0x00,0xbf]
+s_cmp_eq_i32 s1, -1
+// CHECK: [0x01,0xc1,0x00,0xbf]
 
-s_cmp_eq_i32 s0, 0.5
-// CHECK: [0x00,0xf0,0x00,0xbf]
+s_cmp_eq_i32 s1, 0.5
+// CHECK: [0x01,0xf0,0x00,0xbf]
 
-s_cmp_eq_i32 s0, -4.0
-// CHECK: [0x00,0xf7,0x00,0xbf]
+s_cmp_eq_i32 s1, -4.0
+// CHECK: [0x01,0xf7,0x00,0xbf]
 
-s_cmp_eq_i32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x00,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_eq_i32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x00,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_eq_i32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x00,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_eq_i32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x00,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_lg_i32 s0, s0
-// CHECK: [0x00,0x00,0x01,0xbf]
+s_cmp_lg_i32 s1, s2
+// CHECK: [0x01,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 s101, s0
-// CHECK: [0x65,0x00,0x01,0xbf]
+s_cmp_lg_i32 s101, s2
+// CHECK: [0x65,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x01,0xbf]
+s_cmp_lg_i32 flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x01,0xbf]
+s_cmp_lg_i32 flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x01,0xbf]
+s_cmp_lg_i32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x01,0xbf]
+s_cmp_lg_i32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x01,0xbf]
+s_cmp_lg_i32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x01,0xbf]
+s_cmp_lg_i32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x01,0xbf]
+s_cmp_lg_i32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x01,0xbf]
+s_cmp_lg_i32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x01,0xbf]
+s_cmp_lg_i32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 m0, s0
-// CHECK: [0x7c,0x00,0x01,0xbf]
+s_cmp_lg_i32 m0, s2
+// CHECK: [0x7c,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x01,0xbf]
+s_cmp_lg_i32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x01,0xbf]
+s_cmp_lg_i32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 0, s0
-// CHECK: [0x80,0x00,0x01,0xbf]
+s_cmp_lg_i32 0, s2
+// CHECK: [0x80,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 -1, s0
-// CHECK: [0xc1,0x00,0x01,0xbf]
+s_cmp_lg_i32 -1, s2
+// CHECK: [0xc1,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 0.5, s0
-// CHECK: [0xf0,0x00,0x01,0xbf]
+s_cmp_lg_i32 0.5, s2
+// CHECK: [0xf0,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 -4.0, s0
-// CHECK: [0xf7,0x00,0x01,0xbf]
+s_cmp_lg_i32 -4.0, s2
+// CHECK: [0xf7,0x02,0x01,0xbf]
 
-s_cmp_lg_i32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x01,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_lg_i32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x01,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_lg_i32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x01,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_lg_i32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x01,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_lg_i32 s0, s101
-// CHECK: [0x00,0x65,0x01,0xbf]
+s_cmp_lg_i32 s1, s101
+// CHECK: [0x01,0x65,0x01,0xbf]
 
-s_cmp_lg_i32 s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x01,0xbf]
+s_cmp_lg_i32 s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x01,0xbf]
 
-s_cmp_lg_i32 s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x01,0xbf]
+s_cmp_lg_i32 s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x01,0xbf]
 
-s_cmp_lg_i32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x01,0xbf]
+s_cmp_lg_i32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x01,0xbf]
 
-s_cmp_lg_i32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x01,0xbf]
+s_cmp_lg_i32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x01,0xbf]
 
-s_cmp_lg_i32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x01,0xbf]
+s_cmp_lg_i32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x01,0xbf]
 
-s_cmp_lg_i32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x01,0xbf]
+s_cmp_lg_i32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x01,0xbf]
 
-s_cmp_lg_i32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x01,0xbf]
+s_cmp_lg_i32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x01,0xbf]
 
-s_cmp_lg_i32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x01,0xbf]
+s_cmp_lg_i32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x01,0xbf]
 
-s_cmp_lg_i32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x01,0xbf]
+s_cmp_lg_i32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x01,0xbf]
 
-s_cmp_lg_i32 s0, m0
-// CHECK: [0x00,0x7c,0x01,0xbf]
+s_cmp_lg_i32 s1, m0
+// CHECK: [0x01,0x7c,0x01,0xbf]
 
-s_cmp_lg_i32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x01,0xbf]
+s_cmp_lg_i32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x01,0xbf]
 
-s_cmp_lg_i32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x01,0xbf]
+s_cmp_lg_i32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x01,0xbf]
 
-s_cmp_lg_i32 s0, 0
-// CHECK: [0x00,0x80,0x01,0xbf]
+s_cmp_lg_i32 s1, 0
+// CHECK: [0x01,0x80,0x01,0xbf]
 
-s_cmp_lg_i32 s0, -1
-// CHECK: [0x00,0xc1,0x01,0xbf]
+s_cmp_lg_i32 s1, -1
+// CHECK: [0x01,0xc1,0x01,0xbf]
 
-s_cmp_lg_i32 s0, 0.5
-// CHECK: [0x00,0xf0,0x01,0xbf]
+s_cmp_lg_i32 s1, 0.5
+// CHECK: [0x01,0xf0,0x01,0xbf]
 
-s_cmp_lg_i32 s0, -4.0
-// CHECK: [0x00,0xf7,0x01,0xbf]
+s_cmp_lg_i32 s1, -4.0
+// CHECK: [0x01,0xf7,0x01,0xbf]
 
-s_cmp_lg_i32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x01,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_lg_i32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x01,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_lg_i32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x01,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_lg_i32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x01,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_gt_i32 s0, s0
-// CHECK: [0x00,0x00,0x02,0xbf]
+s_cmp_gt_i32 s1, s2
+// CHECK: [0x01,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 s101, s0
-// CHECK: [0x65,0x00,0x02,0xbf]
+s_cmp_gt_i32 s101, s2
+// CHECK: [0x65,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x02,0xbf]
+s_cmp_gt_i32 flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x02,0xbf]
+s_cmp_gt_i32 flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x02,0xbf]
+s_cmp_gt_i32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x02,0xbf]
+s_cmp_gt_i32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x02,0xbf]
+s_cmp_gt_i32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x02,0xbf]
+s_cmp_gt_i32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x02,0xbf]
+s_cmp_gt_i32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x02,0xbf]
+s_cmp_gt_i32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x02,0xbf]
+s_cmp_gt_i32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 m0, s0
-// CHECK: [0x7c,0x00,0x02,0xbf]
+s_cmp_gt_i32 m0, s2
+// CHECK: [0x7c,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x02,0xbf]
+s_cmp_gt_i32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x02,0xbf]
+s_cmp_gt_i32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 0, s0
-// CHECK: [0x80,0x00,0x02,0xbf]
+s_cmp_gt_i32 0, s2
+// CHECK: [0x80,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 -1, s0
-// CHECK: [0xc1,0x00,0x02,0xbf]
+s_cmp_gt_i32 -1, s2
+// CHECK: [0xc1,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 0.5, s0
-// CHECK: [0xf0,0x00,0x02,0xbf]
+s_cmp_gt_i32 0.5, s2
+// CHECK: [0xf0,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 -4.0, s0
-// CHECK: [0xf7,0x00,0x02,0xbf]
+s_cmp_gt_i32 -4.0, s2
+// CHECK: [0xf7,0x02,0x02,0xbf]
 
-s_cmp_gt_i32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x02,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_gt_i32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x02,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_gt_i32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x02,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_gt_i32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x02,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_gt_i32 s0, s101
-// CHECK: [0x00,0x65,0x02,0xbf]
+s_cmp_gt_i32 s1, s101
+// CHECK: [0x01,0x65,0x02,0xbf]
 
-s_cmp_gt_i32 s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x02,0xbf]
+s_cmp_gt_i32 s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x02,0xbf]
 
-s_cmp_gt_i32 s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x02,0xbf]
+s_cmp_gt_i32 s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x02,0xbf]
 
-s_cmp_gt_i32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x02,0xbf]
+s_cmp_gt_i32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x02,0xbf]
 
-s_cmp_gt_i32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x02,0xbf]
+s_cmp_gt_i32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x02,0xbf]
 
-s_cmp_gt_i32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x02,0xbf]
+s_cmp_gt_i32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x02,0xbf]
 
-s_cmp_gt_i32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x02,0xbf]
+s_cmp_gt_i32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x02,0xbf]
 
-s_cmp_gt_i32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x02,0xbf]
+s_cmp_gt_i32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x02,0xbf]
 
-s_cmp_gt_i32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x02,0xbf]
+s_cmp_gt_i32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x02,0xbf]
 
-s_cmp_gt_i32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x02,0xbf]
+s_cmp_gt_i32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x02,0xbf]
 
-s_cmp_gt_i32 s0, m0
-// CHECK: [0x00,0x7c,0x02,0xbf]
+s_cmp_gt_i32 s1, m0
+// CHECK: [0x01,0x7c,0x02,0xbf]
 
-s_cmp_gt_i32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x02,0xbf]
+s_cmp_gt_i32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x02,0xbf]
 
-s_cmp_gt_i32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x02,0xbf]
+s_cmp_gt_i32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x02,0xbf]
 
-s_cmp_gt_i32 s0, 0
-// CHECK: [0x00,0x80,0x02,0xbf]
+s_cmp_gt_i32 s1, 0
+// CHECK: [0x01,0x80,0x02,0xbf]
 
-s_cmp_gt_i32 s0, -1
-// CHECK: [0x00,0xc1,0x02,0xbf]
+s_cmp_gt_i32 s1, -1
+// CHECK: [0x01,0xc1,0x02,0xbf]
 
-s_cmp_gt_i32 s0, 0.5
-// CHECK: [0x00,0xf0,0x02,0xbf]
+s_cmp_gt_i32 s1, 0.5
+// CHECK: [0x01,0xf0,0x02,0xbf]
 
-s_cmp_gt_i32 s0, -4.0
-// CHECK: [0x00,0xf7,0x02,0xbf]
+s_cmp_gt_i32 s1, -4.0
+// CHECK: [0x01,0xf7,0x02,0xbf]
 
-s_cmp_gt_i32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x02,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_gt_i32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x02,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_gt_i32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x02,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_gt_i32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x02,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_ge_i32 s0, s0
-// CHECK: [0x00,0x00,0x03,0xbf]
+s_cmp_ge_i32 s1, s2
+// CHECK: [0x01,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 s101, s0
-// CHECK: [0x65,0x00,0x03,0xbf]
+s_cmp_ge_i32 s101, s2
+// CHECK: [0x65,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x03,0xbf]
+s_cmp_ge_i32 flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x03,0xbf]
+s_cmp_ge_i32 flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x03,0xbf]
+s_cmp_ge_i32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x03,0xbf]
+s_cmp_ge_i32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x03,0xbf]
+s_cmp_ge_i32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x03,0xbf]
+s_cmp_ge_i32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x03,0xbf]
+s_cmp_ge_i32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x03,0xbf]
+s_cmp_ge_i32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x03,0xbf]
+s_cmp_ge_i32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 m0, s0
-// CHECK: [0x7c,0x00,0x03,0xbf]
+s_cmp_ge_i32 m0, s2
+// CHECK: [0x7c,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x03,0xbf]
+s_cmp_ge_i32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x03,0xbf]
+s_cmp_ge_i32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 0, s0
-// CHECK: [0x80,0x00,0x03,0xbf]
+s_cmp_ge_i32 0, s2
+// CHECK: [0x80,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 -1, s0
-// CHECK: [0xc1,0x00,0x03,0xbf]
+s_cmp_ge_i32 -1, s2
+// CHECK: [0xc1,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 0.5, s0
-// CHECK: [0xf0,0x00,0x03,0xbf]
+s_cmp_ge_i32 0.5, s2
+// CHECK: [0xf0,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 -4.0, s0
-// CHECK: [0xf7,0x00,0x03,0xbf]
+s_cmp_ge_i32 -4.0, s2
+// CHECK: [0xf7,0x02,0x03,0xbf]
 
-s_cmp_ge_i32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x03,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_ge_i32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x03,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_ge_i32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x03,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_ge_i32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x03,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_ge_i32 s0, s101
-// CHECK: [0x00,0x65,0x03,0xbf]
+s_cmp_ge_i32 s1, s101
+// CHECK: [0x01,0x65,0x03,0xbf]
 
-s_cmp_ge_i32 s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x03,0xbf]
+s_cmp_ge_i32 s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x03,0xbf]
 
-s_cmp_ge_i32 s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x03,0xbf]
+s_cmp_ge_i32 s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x03,0xbf]
 
-s_cmp_ge_i32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x03,0xbf]
+s_cmp_ge_i32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x03,0xbf]
 
-s_cmp_ge_i32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x03,0xbf]
+s_cmp_ge_i32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x03,0xbf]
 
-s_cmp_ge_i32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x03,0xbf]
+s_cmp_ge_i32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x03,0xbf]
 
-s_cmp_ge_i32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x03,0xbf]
+s_cmp_ge_i32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x03,0xbf]
 
-s_cmp_ge_i32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x03,0xbf]
+s_cmp_ge_i32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x03,0xbf]
 
-s_cmp_ge_i32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x03,0xbf]
+s_cmp_ge_i32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x03,0xbf]
 
-s_cmp_ge_i32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x03,0xbf]
+s_cmp_ge_i32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x03,0xbf]
 
-s_cmp_ge_i32 s0, m0
-// CHECK: [0x00,0x7c,0x03,0xbf]
+s_cmp_ge_i32 s1, m0
+// CHECK: [0x01,0x7c,0x03,0xbf]
 
-s_cmp_ge_i32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x03,0xbf]
+s_cmp_ge_i32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x03,0xbf]
 
-s_cmp_ge_i32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x03,0xbf]
+s_cmp_ge_i32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x03,0xbf]
 
-s_cmp_ge_i32 s0, 0
-// CHECK: [0x00,0x80,0x03,0xbf]
+s_cmp_ge_i32 s1, 0
+// CHECK: [0x01,0x80,0x03,0xbf]
 
-s_cmp_ge_i32 s0, -1
-// CHECK: [0x00,0xc1,0x03,0xbf]
+s_cmp_ge_i32 s1, -1
+// CHECK: [0x01,0xc1,0x03,0xbf]
 
-s_cmp_ge_i32 s0, 0.5
-// CHECK: [0x00,0xf0,0x03,0xbf]
+s_cmp_ge_i32 s1, 0.5
+// CHECK: [0x01,0xf0,0x03,0xbf]
 
-s_cmp_ge_i32 s0, -4.0
-// CHECK: [0x00,0xf7,0x03,0xbf]
+s_cmp_ge_i32 s1, -4.0
+// CHECK: [0x01,0xf7,0x03,0xbf]
 
-s_cmp_ge_i32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x03,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_ge_i32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x03,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_ge_i32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x03,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_ge_i32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x03,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_lt_i32 s0, s0
-// CHECK: [0x00,0x00,0x04,0xbf]
+s_cmp_lt_i32 s1, s2
+// CHECK: [0x01,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 s101, s0
-// CHECK: [0x65,0x00,0x04,0xbf]
+s_cmp_lt_i32 s101, s2
+// CHECK: [0x65,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x04,0xbf]
+s_cmp_lt_i32 flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x04,0xbf]
+s_cmp_lt_i32 flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x04,0xbf]
+s_cmp_lt_i32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x04,0xbf]
+s_cmp_lt_i32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x04,0xbf]
+s_cmp_lt_i32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x04,0xbf]
+s_cmp_lt_i32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x04,0xbf]
+s_cmp_lt_i32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x04,0xbf]
+s_cmp_lt_i32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x04,0xbf]
+s_cmp_lt_i32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 m0, s0
-// CHECK: [0x7c,0x00,0x04,0xbf]
+s_cmp_lt_i32 m0, s2
+// CHECK: [0x7c,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x04,0xbf]
+s_cmp_lt_i32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x04,0xbf]
+s_cmp_lt_i32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 0, s0
-// CHECK: [0x80,0x00,0x04,0xbf]
+s_cmp_lt_i32 0, s2
+// CHECK: [0x80,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 -1, s0
-// CHECK: [0xc1,0x00,0x04,0xbf]
+s_cmp_lt_i32 -1, s2
+// CHECK: [0xc1,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 0.5, s0
-// CHECK: [0xf0,0x00,0x04,0xbf]
+s_cmp_lt_i32 0.5, s2
+// CHECK: [0xf0,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 -4.0, s0
-// CHECK: [0xf7,0x00,0x04,0xbf]
+s_cmp_lt_i32 -4.0, s2
+// CHECK: [0xf7,0x02,0x04,0xbf]
 
-s_cmp_lt_i32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x04,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_lt_i32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x04,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_lt_i32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x04,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_lt_i32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x04,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_lt_i32 s0, s101
-// CHECK: [0x00,0x65,0x04,0xbf]
+s_cmp_lt_i32 s1, s101
+// CHECK: [0x01,0x65,0x04,0xbf]
 
-s_cmp_lt_i32 s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x04,0xbf]
+s_cmp_lt_i32 s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x04,0xbf]
 
-s_cmp_lt_i32 s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x04,0xbf]
+s_cmp_lt_i32 s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x04,0xbf]
 
-s_cmp_lt_i32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x04,0xbf]
+s_cmp_lt_i32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x04,0xbf]
 
-s_cmp_lt_i32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x04,0xbf]
+s_cmp_lt_i32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x04,0xbf]
 
-s_cmp_lt_i32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x04,0xbf]
+s_cmp_lt_i32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x04,0xbf]
 
-s_cmp_lt_i32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x04,0xbf]
+s_cmp_lt_i32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x04,0xbf]
 
-s_cmp_lt_i32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x04,0xbf]
+s_cmp_lt_i32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x04,0xbf]
 
-s_cmp_lt_i32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x04,0xbf]
+s_cmp_lt_i32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x04,0xbf]
 
-s_cmp_lt_i32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x04,0xbf]
+s_cmp_lt_i32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x04,0xbf]
 
-s_cmp_lt_i32 s0, m0
-// CHECK: [0x00,0x7c,0x04,0xbf]
+s_cmp_lt_i32 s1, m0
+// CHECK: [0x01,0x7c,0x04,0xbf]
 
-s_cmp_lt_i32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x04,0xbf]
+s_cmp_lt_i32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x04,0xbf]
 
-s_cmp_lt_i32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x04,0xbf]
+s_cmp_lt_i32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x04,0xbf]
 
-s_cmp_lt_i32 s0, 0
-// CHECK: [0x00,0x80,0x04,0xbf]
+s_cmp_lt_i32 s1, 0
+// CHECK: [0x01,0x80,0x04,0xbf]
 
-s_cmp_lt_i32 s0, -1
-// CHECK: [0x00,0xc1,0x04,0xbf]
+s_cmp_lt_i32 s1, -1
+// CHECK: [0x01,0xc1,0x04,0xbf]
 
-s_cmp_lt_i32 s0, 0.5
-// CHECK: [0x00,0xf0,0x04,0xbf]
+s_cmp_lt_i32 s1, 0.5
+// CHECK: [0x01,0xf0,0x04,0xbf]
 
-s_cmp_lt_i32 s0, -4.0
-// CHECK: [0x00,0xf7,0x04,0xbf]
+s_cmp_lt_i32 s1, -4.0
+// CHECK: [0x01,0xf7,0x04,0xbf]
 
-s_cmp_lt_i32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x04,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_lt_i32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x04,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_lt_i32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x04,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_lt_i32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x04,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_le_i32 s0, s0
-// CHECK: [0x00,0x00,0x05,0xbf]
+s_cmp_le_i32 s1, s2
+// CHECK: [0x01,0x02,0x05,0xbf]
 
-s_cmp_le_i32 s101, s0
-// CHECK: [0x65,0x00,0x05,0xbf]
+s_cmp_le_i32 s101, s2
+// CHECK: [0x65,0x02,0x05,0xbf]
 
-s_cmp_le_i32 flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x05,0xbf]
+s_cmp_le_i32 flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x05,0xbf]
 
-s_cmp_le_i32 flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x05,0xbf]
+s_cmp_le_i32 flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x05,0xbf]
 
-s_cmp_le_i32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x05,0xbf]
+s_cmp_le_i32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x05,0xbf]
 
-s_cmp_le_i32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x05,0xbf]
+s_cmp_le_i32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x05,0xbf]
 
-s_cmp_le_i32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x05,0xbf]
+s_cmp_le_i32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x05,0xbf]
 
-s_cmp_le_i32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x05,0xbf]
+s_cmp_le_i32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x05,0xbf]
 
-s_cmp_le_i32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x05,0xbf]
+s_cmp_le_i32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x05,0xbf]
 
-s_cmp_le_i32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x05,0xbf]
+s_cmp_le_i32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x05,0xbf]
 
-s_cmp_le_i32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x05,0xbf]
+s_cmp_le_i32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x05,0xbf]
 
-s_cmp_le_i32 m0, s0
-// CHECK: [0x7c,0x00,0x05,0xbf]
+s_cmp_le_i32 m0, s2
+// CHECK: [0x7c,0x02,0x05,0xbf]
 
-s_cmp_le_i32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x05,0xbf]
+s_cmp_le_i32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x05,0xbf]
 
-s_cmp_le_i32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x05,0xbf]
+s_cmp_le_i32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x05,0xbf]
 
-s_cmp_le_i32 0, s0
-// CHECK: [0x80,0x00,0x05,0xbf]
+s_cmp_le_i32 0, s2
+// CHECK: [0x80,0x02,0x05,0xbf]
 
-s_cmp_le_i32 -1, s0
-// CHECK: [0xc1,0x00,0x05,0xbf]
+s_cmp_le_i32 -1, s2
+// CHECK: [0xc1,0x02,0x05,0xbf]
 
-s_cmp_le_i32 0.5, s0
-// CHECK: [0xf0,0x00,0x05,0xbf]
+s_cmp_le_i32 0.5, s2
+// CHECK: [0xf0,0x02,0x05,0xbf]
 
-s_cmp_le_i32 -4.0, s0
-// CHECK: [0xf7,0x00,0x05,0xbf]
+s_cmp_le_i32 -4.0, s2
+// CHECK: [0xf7,0x02,0x05,0xbf]
 
-s_cmp_le_i32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x05,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_le_i32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x05,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_le_i32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x05,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_le_i32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x05,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_le_i32 s0, s101
-// CHECK: [0x00,0x65,0x05,0xbf]
+s_cmp_le_i32 s1, s101
+// CHECK: [0x01,0x65,0x05,0xbf]
 
-s_cmp_le_i32 s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x05,0xbf]
+s_cmp_le_i32 s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x05,0xbf]
 
-s_cmp_le_i32 s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x05,0xbf]
+s_cmp_le_i32 s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x05,0xbf]
 
-s_cmp_le_i32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x05,0xbf]
+s_cmp_le_i32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x05,0xbf]
 
-s_cmp_le_i32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x05,0xbf]
+s_cmp_le_i32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x05,0xbf]
 
-s_cmp_le_i32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x05,0xbf]
+s_cmp_le_i32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x05,0xbf]
 
-s_cmp_le_i32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x05,0xbf]
+s_cmp_le_i32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x05,0xbf]
 
-s_cmp_le_i32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x05,0xbf]
+s_cmp_le_i32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x05,0xbf]
 
-s_cmp_le_i32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x05,0xbf]
+s_cmp_le_i32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x05,0xbf]
 
-s_cmp_le_i32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x05,0xbf]
+s_cmp_le_i32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x05,0xbf]
 
-s_cmp_le_i32 s0, m0
-// CHECK: [0x00,0x7c,0x05,0xbf]
+s_cmp_le_i32 s1, m0
+// CHECK: [0x01,0x7c,0x05,0xbf]
 
-s_cmp_le_i32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x05,0xbf]
+s_cmp_le_i32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x05,0xbf]
 
-s_cmp_le_i32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x05,0xbf]
+s_cmp_le_i32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x05,0xbf]
 
-s_cmp_le_i32 s0, 0
-// CHECK: [0x00,0x80,0x05,0xbf]
+s_cmp_le_i32 s1, 0
+// CHECK: [0x01,0x80,0x05,0xbf]
 
-s_cmp_le_i32 s0, -1
-// CHECK: [0x00,0xc1,0x05,0xbf]
+s_cmp_le_i32 s1, -1
+// CHECK: [0x01,0xc1,0x05,0xbf]
 
-s_cmp_le_i32 s0, 0.5
-// CHECK: [0x00,0xf0,0x05,0xbf]
+s_cmp_le_i32 s1, 0.5
+// CHECK: [0x01,0xf0,0x05,0xbf]
 
-s_cmp_le_i32 s0, -4.0
-// CHECK: [0x00,0xf7,0x05,0xbf]
+s_cmp_le_i32 s1, -4.0
+// CHECK: [0x01,0xf7,0x05,0xbf]
 
-s_cmp_le_i32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x05,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_le_i32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x05,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_le_i32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x05,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_le_i32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x05,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_eq_u32 s0, s0
-// CHECK: [0x00,0x00,0x06,0xbf]
+s_cmp_eq_u32 s1, s2
+// CHECK: [0x01,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 s101, s0
-// CHECK: [0x65,0x00,0x06,0xbf]
+s_cmp_eq_u32 s101, s2
+// CHECK: [0x65,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x06,0xbf]
+s_cmp_eq_u32 flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x06,0xbf]
+s_cmp_eq_u32 flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x06,0xbf]
+s_cmp_eq_u32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x06,0xbf]
+s_cmp_eq_u32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x06,0xbf]
+s_cmp_eq_u32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x06,0xbf]
+s_cmp_eq_u32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x06,0xbf]
+s_cmp_eq_u32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x06,0xbf]
+s_cmp_eq_u32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x06,0xbf]
+s_cmp_eq_u32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 m0, s0
-// CHECK: [0x7c,0x00,0x06,0xbf]
+s_cmp_eq_u32 m0, s2
+// CHECK: [0x7c,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x06,0xbf]
+s_cmp_eq_u32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x06,0xbf]
+s_cmp_eq_u32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 0, s0
-// CHECK: [0x80,0x00,0x06,0xbf]
+s_cmp_eq_u32 0, s2
+// CHECK: [0x80,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 -1, s0
-// CHECK: [0xc1,0x00,0x06,0xbf]
+s_cmp_eq_u32 -1, s2
+// CHECK: [0xc1,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 0.5, s0
-// CHECK: [0xf0,0x00,0x06,0xbf]
+s_cmp_eq_u32 0.5, s2
+// CHECK: [0xf0,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 -4.0, s0
-// CHECK: [0xf7,0x00,0x06,0xbf]
+s_cmp_eq_u32 -4.0, s2
+// CHECK: [0xf7,0x02,0x06,0xbf]
 
-s_cmp_eq_u32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x06,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_eq_u32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x06,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_eq_u32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x06,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_eq_u32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x06,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_eq_u32 s0, s101
-// CHECK: [0x00,0x65,0x06,0xbf]
+s_cmp_eq_u32 s1, s101
+// CHECK: [0x01,0x65,0x06,0xbf]
 
-s_cmp_eq_u32 s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x06,0xbf]
+s_cmp_eq_u32 s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x06,0xbf]
 
-s_cmp_eq_u32 s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x06,0xbf]
+s_cmp_eq_u32 s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x06,0xbf]
 
-s_cmp_eq_u32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x06,0xbf]
+s_cmp_eq_u32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x06,0xbf]
 
-s_cmp_eq_u32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x06,0xbf]
+s_cmp_eq_u32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x06,0xbf]
 
-s_cmp_eq_u32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x06,0xbf]
+s_cmp_eq_u32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x06,0xbf]
 
-s_cmp_eq_u32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x06,0xbf]
+s_cmp_eq_u32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x06,0xbf]
 
-s_cmp_eq_u32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x06,0xbf]
+s_cmp_eq_u32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x06,0xbf]
 
-s_cmp_eq_u32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x06,0xbf]
+s_cmp_eq_u32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x06,0xbf]
 
-s_cmp_eq_u32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x06,0xbf]
+s_cmp_eq_u32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x06,0xbf]
 
-s_cmp_eq_u32 s0, m0
-// CHECK: [0x00,0x7c,0x06,0xbf]
+s_cmp_eq_u32 s1, m0
+// CHECK: [0x01,0x7c,0x06,0xbf]
 
-s_cmp_eq_u32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x06,0xbf]
+s_cmp_eq_u32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x06,0xbf]
 
-s_cmp_eq_u32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x06,0xbf]
+s_cmp_eq_u32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x06,0xbf]
 
-s_cmp_eq_u32 s0, 0
-// CHECK: [0x00,0x80,0x06,0xbf]
+s_cmp_eq_u32 s1, 0
+// CHECK: [0x01,0x80,0x06,0xbf]
 
-s_cmp_eq_u32 s0, -1
-// CHECK: [0x00,0xc1,0x06,0xbf]
+s_cmp_eq_u32 s1, -1
+// CHECK: [0x01,0xc1,0x06,0xbf]
 
-s_cmp_eq_u32 s0, 0.5
-// CHECK: [0x00,0xf0,0x06,0xbf]
+s_cmp_eq_u32 s1, 0.5
+// CHECK: [0x01,0xf0,0x06,0xbf]
 
-s_cmp_eq_u32 s0, -4.0
-// CHECK: [0x00,0xf7,0x06,0xbf]
+s_cmp_eq_u32 s1, -4.0
+// CHECK: [0x01,0xf7,0x06,0xbf]
 
-s_cmp_eq_u32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x06,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_eq_u32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x06,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_eq_u32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x06,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_eq_u32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x06,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_lg_u32 s0, s0
-// CHECK: [0x00,0x00,0x07,0xbf]
+s_cmp_lg_u32 s1, s2
+// CHECK: [0x01,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 s101, s0
-// CHECK: [0x65,0x00,0x07,0xbf]
+s_cmp_lg_u32 s101, s2
+// CHECK: [0x65,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x07,0xbf]
+s_cmp_lg_u32 flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x07,0xbf]
+s_cmp_lg_u32 flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x07,0xbf]
+s_cmp_lg_u32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x07,0xbf]
+s_cmp_lg_u32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x07,0xbf]
+s_cmp_lg_u32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x07,0xbf]
+s_cmp_lg_u32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x07,0xbf]
+s_cmp_lg_u32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x07,0xbf]
+s_cmp_lg_u32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x07,0xbf]
+s_cmp_lg_u32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 m0, s0
-// CHECK: [0x7c,0x00,0x07,0xbf]
+s_cmp_lg_u32 m0, s2
+// CHECK: [0x7c,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x07,0xbf]
+s_cmp_lg_u32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x07,0xbf]
+s_cmp_lg_u32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 0, s0
-// CHECK: [0x80,0x00,0x07,0xbf]
+s_cmp_lg_u32 0, s2
+// CHECK: [0x80,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 -1, s0
-// CHECK: [0xc1,0x00,0x07,0xbf]
+s_cmp_lg_u32 -1, s2
+// CHECK: [0xc1,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 0.5, s0
-// CHECK: [0xf0,0x00,0x07,0xbf]
+s_cmp_lg_u32 0.5, s2
+// CHECK: [0xf0,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 -4.0, s0
-// CHECK: [0xf7,0x00,0x07,0xbf]
+s_cmp_lg_u32 -4.0, s2
+// CHECK: [0xf7,0x02,0x07,0xbf]
 
-s_cmp_lg_u32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x07,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_lg_u32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x07,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_lg_u32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x07,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_lg_u32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x07,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_lg_u32 s0, s101
-// CHECK: [0x00,0x65,0x07,0xbf]
+s_cmp_lg_u32 s1, s101
+// CHECK: [0x01,0x65,0x07,0xbf]
 
-s_cmp_lg_u32 s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x07,0xbf]
+s_cmp_lg_u32 s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x07,0xbf]
 
-s_cmp_lg_u32 s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x07,0xbf]
+s_cmp_lg_u32 s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x07,0xbf]
 
-s_cmp_lg_u32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x07,0xbf]
+s_cmp_lg_u32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x07,0xbf]
 
-s_cmp_lg_u32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x07,0xbf]
+s_cmp_lg_u32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x07,0xbf]
 
-s_cmp_lg_u32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x07,0xbf]
+s_cmp_lg_u32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x07,0xbf]
 
-s_cmp_lg_u32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x07,0xbf]
+s_cmp_lg_u32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x07,0xbf]
 
-s_cmp_lg_u32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x07,0xbf]
+s_cmp_lg_u32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x07,0xbf]
 
-s_cmp_lg_u32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x07,0xbf]
+s_cmp_lg_u32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x07,0xbf]
 
-s_cmp_lg_u32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x07,0xbf]
+s_cmp_lg_u32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x07,0xbf]
 
-s_cmp_lg_u32 s0, m0
-// CHECK: [0x00,0x7c,0x07,0xbf]
+s_cmp_lg_u32 s1, m0
+// CHECK: [0x01,0x7c,0x07,0xbf]
 
-s_cmp_lg_u32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x07,0xbf]
+s_cmp_lg_u32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x07,0xbf]
 
-s_cmp_lg_u32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x07,0xbf]
+s_cmp_lg_u32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x07,0xbf]
 
-s_cmp_lg_u32 s0, 0
-// CHECK: [0x00,0x80,0x07,0xbf]
+s_cmp_lg_u32 s1, 0
+// CHECK: [0x01,0x80,0x07,0xbf]
 
-s_cmp_lg_u32 s0, -1
-// CHECK: [0x00,0xc1,0x07,0xbf]
+s_cmp_lg_u32 s1, -1
+// CHECK: [0x01,0xc1,0x07,0xbf]
 
-s_cmp_lg_u32 s0, 0.5
-// CHECK: [0x00,0xf0,0x07,0xbf]
+s_cmp_lg_u32 s1, 0.5
+// CHECK: [0x01,0xf0,0x07,0xbf]
 
-s_cmp_lg_u32 s0, -4.0
-// CHECK: [0x00,0xf7,0x07,0xbf]
+s_cmp_lg_u32 s1, -4.0
+// CHECK: [0x01,0xf7,0x07,0xbf]
 
-s_cmp_lg_u32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x07,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_lg_u32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x07,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_lg_u32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x07,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_lg_u32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x07,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_gt_u32 s0, s0
-// CHECK: [0x00,0x00,0x08,0xbf]
+s_cmp_gt_u32 s1, s2
+// CHECK: [0x01,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 s101, s0
-// CHECK: [0x65,0x00,0x08,0xbf]
+s_cmp_gt_u32 s101, s2
+// CHECK: [0x65,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x08,0xbf]
+s_cmp_gt_u32 flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x08,0xbf]
+s_cmp_gt_u32 flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x08,0xbf]
+s_cmp_gt_u32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x08,0xbf]
+s_cmp_gt_u32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x08,0xbf]
+s_cmp_gt_u32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x08,0xbf]
+s_cmp_gt_u32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x08,0xbf]
+s_cmp_gt_u32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x08,0xbf]
+s_cmp_gt_u32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x08,0xbf]
+s_cmp_gt_u32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 m0, s0
-// CHECK: [0x7c,0x00,0x08,0xbf]
+s_cmp_gt_u32 m0, s2
+// CHECK: [0x7c,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x08,0xbf]
+s_cmp_gt_u32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x08,0xbf]
+s_cmp_gt_u32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 0, s0
-// CHECK: [0x80,0x00,0x08,0xbf]
+s_cmp_gt_u32 0, s2
+// CHECK: [0x80,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 -1, s0
-// CHECK: [0xc1,0x00,0x08,0xbf]
+s_cmp_gt_u32 -1, s2
+// CHECK: [0xc1,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 0.5, s0
-// CHECK: [0xf0,0x00,0x08,0xbf]
+s_cmp_gt_u32 0.5, s2
+// CHECK: [0xf0,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 -4.0, s0
-// CHECK: [0xf7,0x00,0x08,0xbf]
+s_cmp_gt_u32 -4.0, s2
+// CHECK: [0xf7,0x02,0x08,0xbf]
 
-s_cmp_gt_u32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x08,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_gt_u32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x08,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_gt_u32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x08,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_gt_u32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x08,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_gt_u32 s0, s101
-// CHECK: [0x00,0x65,0x08,0xbf]
+s_cmp_gt_u32 s1, s101
+// CHECK: [0x01,0x65,0x08,0xbf]
 
-s_cmp_gt_u32 s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x08,0xbf]
+s_cmp_gt_u32 s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x08,0xbf]
 
-s_cmp_gt_u32 s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x08,0xbf]
+s_cmp_gt_u32 s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x08,0xbf]
 
-s_cmp_gt_u32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x08,0xbf]
+s_cmp_gt_u32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x08,0xbf]
 
-s_cmp_gt_u32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x08,0xbf]
+s_cmp_gt_u32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x08,0xbf]
 
-s_cmp_gt_u32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x08,0xbf]
+s_cmp_gt_u32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x08,0xbf]
 
-s_cmp_gt_u32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x08,0xbf]
+s_cmp_gt_u32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x08,0xbf]
 
-s_cmp_gt_u32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x08,0xbf]
+s_cmp_gt_u32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x08,0xbf]
 
-s_cmp_gt_u32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x08,0xbf]
+s_cmp_gt_u32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x08,0xbf]
 
-s_cmp_gt_u32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x08,0xbf]
+s_cmp_gt_u32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x08,0xbf]
 
-s_cmp_gt_u32 s0, m0
-// CHECK: [0x00,0x7c,0x08,0xbf]
+s_cmp_gt_u32 s1, m0
+// CHECK: [0x01,0x7c,0x08,0xbf]
 
-s_cmp_gt_u32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x08,0xbf]
+s_cmp_gt_u32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x08,0xbf]
 
-s_cmp_gt_u32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x08,0xbf]
+s_cmp_gt_u32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x08,0xbf]
 
-s_cmp_gt_u32 s0, 0
-// CHECK: [0x00,0x80,0x08,0xbf]
+s_cmp_gt_u32 s1, 0
+// CHECK: [0x01,0x80,0x08,0xbf]
 
-s_cmp_gt_u32 s0, -1
-// CHECK: [0x00,0xc1,0x08,0xbf]
+s_cmp_gt_u32 s1, -1
+// CHECK: [0x01,0xc1,0x08,0xbf]
 
-s_cmp_gt_u32 s0, 0.5
-// CHECK: [0x00,0xf0,0x08,0xbf]
+s_cmp_gt_u32 s1, 0.5
+// CHECK: [0x01,0xf0,0x08,0xbf]
 
-s_cmp_gt_u32 s0, -4.0
-// CHECK: [0x00,0xf7,0x08,0xbf]
+s_cmp_gt_u32 s1, -4.0
+// CHECK: [0x01,0xf7,0x08,0xbf]
 
-s_cmp_gt_u32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x08,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_gt_u32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x08,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_gt_u32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x08,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_gt_u32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x08,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_ge_u32 s0, s0
-// CHECK: [0x00,0x00,0x09,0xbf]
+s_cmp_ge_u32 s1, s2
+// CHECK: [0x01,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 s101, s0
-// CHECK: [0x65,0x00,0x09,0xbf]
+s_cmp_ge_u32 s101, s2
+// CHECK: [0x65,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x09,0xbf]
+s_cmp_ge_u32 flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x09,0xbf]
+s_cmp_ge_u32 flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x09,0xbf]
+s_cmp_ge_u32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x09,0xbf]
+s_cmp_ge_u32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x09,0xbf]
+s_cmp_ge_u32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x09,0xbf]
+s_cmp_ge_u32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x09,0xbf]
+s_cmp_ge_u32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x09,0xbf]
+s_cmp_ge_u32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x09,0xbf]
+s_cmp_ge_u32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 m0, s0
-// CHECK: [0x7c,0x00,0x09,0xbf]
+s_cmp_ge_u32 m0, s2
+// CHECK: [0x7c,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x09,0xbf]
+s_cmp_ge_u32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x09,0xbf]
+s_cmp_ge_u32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 0, s0
-// CHECK: [0x80,0x00,0x09,0xbf]
+s_cmp_ge_u32 0, s2
+// CHECK: [0x80,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 -1, s0
-// CHECK: [0xc1,0x00,0x09,0xbf]
+s_cmp_ge_u32 -1, s2
+// CHECK: [0xc1,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 0.5, s0
-// CHECK: [0xf0,0x00,0x09,0xbf]
+s_cmp_ge_u32 0.5, s2
+// CHECK: [0xf0,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 -4.0, s0
-// CHECK: [0xf7,0x00,0x09,0xbf]
+s_cmp_ge_u32 -4.0, s2
+// CHECK: [0xf7,0x02,0x09,0xbf]
 
-s_cmp_ge_u32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x09,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_ge_u32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x09,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_ge_u32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x09,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_ge_u32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x09,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_ge_u32 s0, s101
-// CHECK: [0x00,0x65,0x09,0xbf]
+s_cmp_ge_u32 s1, s101
+// CHECK: [0x01,0x65,0x09,0xbf]
 
-s_cmp_ge_u32 s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x09,0xbf]
+s_cmp_ge_u32 s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x09,0xbf]
 
-s_cmp_ge_u32 s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x09,0xbf]
+s_cmp_ge_u32 s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x09,0xbf]
 
-s_cmp_ge_u32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x09,0xbf]
+s_cmp_ge_u32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x09,0xbf]
 
-s_cmp_ge_u32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x09,0xbf]
+s_cmp_ge_u32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x09,0xbf]
 
-s_cmp_ge_u32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x09,0xbf]
+s_cmp_ge_u32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x09,0xbf]
 
-s_cmp_ge_u32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x09,0xbf]
+s_cmp_ge_u32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x09,0xbf]
 
-s_cmp_ge_u32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x09,0xbf]
+s_cmp_ge_u32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x09,0xbf]
 
-s_cmp_ge_u32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x09,0xbf]
+s_cmp_ge_u32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x09,0xbf]
 
-s_cmp_ge_u32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x09,0xbf]
+s_cmp_ge_u32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x09,0xbf]
 
-s_cmp_ge_u32 s0, m0
-// CHECK: [0x00,0x7c,0x09,0xbf]
+s_cmp_ge_u32 s1, m0
+// CHECK: [0x01,0x7c,0x09,0xbf]
 
-s_cmp_ge_u32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x09,0xbf]
+s_cmp_ge_u32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x09,0xbf]
 
-s_cmp_ge_u32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x09,0xbf]
+s_cmp_ge_u32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x09,0xbf]
 
-s_cmp_ge_u32 s0, 0
-// CHECK: [0x00,0x80,0x09,0xbf]
+s_cmp_ge_u32 s1, 0
+// CHECK: [0x01,0x80,0x09,0xbf]
 
-s_cmp_ge_u32 s0, -1
-// CHECK: [0x00,0xc1,0x09,0xbf]
+s_cmp_ge_u32 s1, -1
+// CHECK: [0x01,0xc1,0x09,0xbf]
 
-s_cmp_ge_u32 s0, 0.5
-// CHECK: [0x00,0xf0,0x09,0xbf]
+s_cmp_ge_u32 s1, 0.5
+// CHECK: [0x01,0xf0,0x09,0xbf]
 
-s_cmp_ge_u32 s0, -4.0
-// CHECK: [0x00,0xf7,0x09,0xbf]
+s_cmp_ge_u32 s1, -4.0
+// CHECK: [0x01,0xf7,0x09,0xbf]
 
-s_cmp_ge_u32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x09,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_ge_u32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x09,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_ge_u32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x09,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_ge_u32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x09,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_lt_u32 s0, s0
-// CHECK: [0x00,0x00,0x0a,0xbf]
+s_cmp_lt_u32 s1, s2
+// CHECK: [0x01,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 s101, s0
-// CHECK: [0x65,0x00,0x0a,0xbf]
+s_cmp_lt_u32 s101, s2
+// CHECK: [0x65,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x0a,0xbf]
+s_cmp_lt_u32 flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x0a,0xbf]
+s_cmp_lt_u32 flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x0a,0xbf]
+s_cmp_lt_u32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x0a,0xbf]
+s_cmp_lt_u32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x0a,0xbf]
+s_cmp_lt_u32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x0a,0xbf]
+s_cmp_lt_u32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x0a,0xbf]
+s_cmp_lt_u32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x0a,0xbf]
+s_cmp_lt_u32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x0a,0xbf]
+s_cmp_lt_u32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 m0, s0
-// CHECK: [0x7c,0x00,0x0a,0xbf]
+s_cmp_lt_u32 m0, s2
+// CHECK: [0x7c,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x0a,0xbf]
+s_cmp_lt_u32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x0a,0xbf]
+s_cmp_lt_u32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 0, s0
-// CHECK: [0x80,0x00,0x0a,0xbf]
+s_cmp_lt_u32 0, s2
+// CHECK: [0x80,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 -1, s0
-// CHECK: [0xc1,0x00,0x0a,0xbf]
+s_cmp_lt_u32 -1, s2
+// CHECK: [0xc1,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 0.5, s0
-// CHECK: [0xf0,0x00,0x0a,0xbf]
+s_cmp_lt_u32 0.5, s2
+// CHECK: [0xf0,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 -4.0, s0
-// CHECK: [0xf7,0x00,0x0a,0xbf]
+s_cmp_lt_u32 -4.0, s2
+// CHECK: [0xf7,0x02,0x0a,0xbf]
 
-s_cmp_lt_u32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x0a,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_lt_u32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x0a,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_lt_u32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x0a,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_lt_u32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x0a,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_lt_u32 s0, s101
-// CHECK: [0x00,0x65,0x0a,0xbf]
+s_cmp_lt_u32 s1, s101
+// CHECK: [0x01,0x65,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x0a,0xbf]
+s_cmp_lt_u32 s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x0a,0xbf]
+s_cmp_lt_u32 s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x0a,0xbf]
+s_cmp_lt_u32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x0a,0xbf]
+s_cmp_lt_u32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x0a,0xbf]
+s_cmp_lt_u32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x0a,0xbf]
+s_cmp_lt_u32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x0a,0xbf]
+s_cmp_lt_u32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x0a,0xbf]
+s_cmp_lt_u32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x0a,0xbf]
+s_cmp_lt_u32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, m0
-// CHECK: [0x00,0x7c,0x0a,0xbf]
+s_cmp_lt_u32 s1, m0
+// CHECK: [0x01,0x7c,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x0a,0xbf]
+s_cmp_lt_u32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x0a,0xbf]
+s_cmp_lt_u32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, 0
-// CHECK: [0x00,0x80,0x0a,0xbf]
+s_cmp_lt_u32 s1, 0
+// CHECK: [0x01,0x80,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, -1
-// CHECK: [0x00,0xc1,0x0a,0xbf]
+s_cmp_lt_u32 s1, -1
+// CHECK: [0x01,0xc1,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, 0.5
-// CHECK: [0x00,0xf0,0x0a,0xbf]
+s_cmp_lt_u32 s1, 0.5
+// CHECK: [0x01,0xf0,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, -4.0
-// CHECK: [0x00,0xf7,0x0a,0xbf]
+s_cmp_lt_u32 s1, -4.0
+// CHECK: [0x01,0xf7,0x0a,0xbf]
 
-s_cmp_lt_u32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x0a,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_lt_u32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x0a,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_lt_u32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x0a,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_lt_u32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x0a,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_le_u32 s0, s0
-// CHECK: [0x00,0x00,0x0b,0xbf]
+s_cmp_le_u32 s1, s2
+// CHECK: [0x01,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 s101, s0
-// CHECK: [0x65,0x00,0x0b,0xbf]
+s_cmp_le_u32 s101, s2
+// CHECK: [0x65,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x0b,0xbf]
+s_cmp_le_u32 flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x0b,0xbf]
+s_cmp_le_u32 flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x0b,0xbf]
+s_cmp_le_u32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x0b,0xbf]
+s_cmp_le_u32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x0b,0xbf]
+s_cmp_le_u32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x0b,0xbf]
+s_cmp_le_u32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x0b,0xbf]
+s_cmp_le_u32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x0b,0xbf]
+s_cmp_le_u32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x0b,0xbf]
+s_cmp_le_u32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 m0, s0
-// CHECK: [0x7c,0x00,0x0b,0xbf]
+s_cmp_le_u32 m0, s2
+// CHECK: [0x7c,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x0b,0xbf]
+s_cmp_le_u32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x0b,0xbf]
+s_cmp_le_u32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 0, s0
-// CHECK: [0x80,0x00,0x0b,0xbf]
+s_cmp_le_u32 0, s2
+// CHECK: [0x80,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 -1, s0
-// CHECK: [0xc1,0x00,0x0b,0xbf]
+s_cmp_le_u32 -1, s2
+// CHECK: [0xc1,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 0.5, s0
-// CHECK: [0xf0,0x00,0x0b,0xbf]
+s_cmp_le_u32 0.5, s2
+// CHECK: [0xf0,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 -4.0, s0
-// CHECK: [0xf7,0x00,0x0b,0xbf]
+s_cmp_le_u32 -4.0, s2
+// CHECK: [0xf7,0x02,0x0b,0xbf]
 
-s_cmp_le_u32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x0b,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_le_u32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x0b,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_le_u32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x0b,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_le_u32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x0b,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_le_u32 s0, s101
-// CHECK: [0x00,0x65,0x0b,0xbf]
+s_cmp_le_u32 s1, s101
+// CHECK: [0x01,0x65,0x0b,0xbf]
 
-s_cmp_le_u32 s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x0b,0xbf]
+s_cmp_le_u32 s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x0b,0xbf]
 
-s_cmp_le_u32 s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x0b,0xbf]
+s_cmp_le_u32 s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x0b,0xbf]
 
-s_cmp_le_u32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x0b,0xbf]
+s_cmp_le_u32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x0b,0xbf]
 
-s_cmp_le_u32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x0b,0xbf]
+s_cmp_le_u32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x0b,0xbf]
 
-s_cmp_le_u32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x0b,0xbf]
+s_cmp_le_u32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x0b,0xbf]
 
-s_cmp_le_u32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x0b,0xbf]
+s_cmp_le_u32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x0b,0xbf]
 
-s_cmp_le_u32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x0b,0xbf]
+s_cmp_le_u32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x0b,0xbf]
 
-s_cmp_le_u32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x0b,0xbf]
+s_cmp_le_u32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x0b,0xbf]
 
-s_cmp_le_u32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x0b,0xbf]
+s_cmp_le_u32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x0b,0xbf]
 
-s_cmp_le_u32 s0, m0
-// CHECK: [0x00,0x7c,0x0b,0xbf]
+s_cmp_le_u32 s1, m0
+// CHECK: [0x01,0x7c,0x0b,0xbf]
 
-s_cmp_le_u32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x0b,0xbf]
+s_cmp_le_u32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x0b,0xbf]
 
-s_cmp_le_u32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x0b,0xbf]
+s_cmp_le_u32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x0b,0xbf]
 
-s_cmp_le_u32 s0, 0
-// CHECK: [0x00,0x80,0x0b,0xbf]
+s_cmp_le_u32 s1, 0
+// CHECK: [0x01,0x80,0x0b,0xbf]
 
-s_cmp_le_u32 s0, -1
-// CHECK: [0x00,0xc1,0x0b,0xbf]
+s_cmp_le_u32 s1, -1
+// CHECK: [0x01,0xc1,0x0b,0xbf]
 
-s_cmp_le_u32 s0, 0.5
-// CHECK: [0x00,0xf0,0x0b,0xbf]
+s_cmp_le_u32 s1, 0.5
+// CHECK: [0x01,0xf0,0x0b,0xbf]
 
-s_cmp_le_u32 s0, -4.0
-// CHECK: [0x00,0xf7,0x0b,0xbf]
+s_cmp_le_u32 s1, -4.0
+// CHECK: [0x01,0xf7,0x0b,0xbf]
 
-s_cmp_le_u32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x0b,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_le_u32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x0b,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_le_u32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x0b,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_le_u32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x0b,0xbf,0x73,0x72,0x71,0x3f]
 
-s_bitcmp0_b32 s0, s0
-// CHECK: [0x00,0x00,0x0c,0xbf]
+s_bitcmp0_b32 s1, s2
+// CHECK: [0x01,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 s101, s0
-// CHECK: [0x65,0x00,0x0c,0xbf]
+s_bitcmp0_b32 s101, s2
+// CHECK: [0x65,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x0c,0xbf]
+s_bitcmp0_b32 flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x0c,0xbf]
+s_bitcmp0_b32 flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x0c,0xbf]
+s_bitcmp0_b32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x0c,0xbf]
+s_bitcmp0_b32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x0c,0xbf]
+s_bitcmp0_b32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x0c,0xbf]
+s_bitcmp0_b32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x0c,0xbf]
+s_bitcmp0_b32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x0c,0xbf]
+s_bitcmp0_b32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x0c,0xbf]
+s_bitcmp0_b32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 m0, s0
-// CHECK: [0x7c,0x00,0x0c,0xbf]
+s_bitcmp0_b32 m0, s2
+// CHECK: [0x7c,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x0c,0xbf]
+s_bitcmp0_b32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x0c,0xbf]
+s_bitcmp0_b32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 0, s0
-// CHECK: [0x80,0x00,0x0c,0xbf]
+s_bitcmp0_b32 0, s2
+// CHECK: [0x80,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 -1, s0
-// CHECK: [0xc1,0x00,0x0c,0xbf]
+s_bitcmp0_b32 -1, s2
+// CHECK: [0xc1,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 0.5, s0
-// CHECK: [0xf0,0x00,0x0c,0xbf]
+s_bitcmp0_b32 0.5, s2
+// CHECK: [0xf0,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 -4.0, s0
-// CHECK: [0xf7,0x00,0x0c,0xbf]
+s_bitcmp0_b32 -4.0, s2
+// CHECK: [0xf7,0x02,0x0c,0xbf]
 
-s_bitcmp0_b32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x0c,0xbf,0x56,0x34,0x12,0xaf]
+s_bitcmp0_b32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x0c,0xbf,0x56,0x34,0x12,0xaf]
 
-s_bitcmp0_b32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x0c,0xbf,0x73,0x72,0x71,0x3f]
+s_bitcmp0_b32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x0c,0xbf,0x73,0x72,0x71,0x3f]
 
-s_bitcmp0_b32 s0, s101
-// CHECK: [0x00,0x65,0x0c,0xbf]
+s_bitcmp0_b32 s1, s101
+// CHECK: [0x01,0x65,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x0c,0xbf]
+s_bitcmp0_b32 s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x0c,0xbf]
+s_bitcmp0_b32 s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x0c,0xbf]
+s_bitcmp0_b32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x0c,0xbf]
+s_bitcmp0_b32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x0c,0xbf]
+s_bitcmp0_b32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x0c,0xbf]
+s_bitcmp0_b32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x0c,0xbf]
+s_bitcmp0_b32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x0c,0xbf]
+s_bitcmp0_b32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x0c,0xbf]
+s_bitcmp0_b32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, m0
-// CHECK: [0x00,0x7c,0x0c,0xbf]
+s_bitcmp0_b32 s1, m0
+// CHECK: [0x01,0x7c,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x0c,0xbf]
+s_bitcmp0_b32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x0c,0xbf]
+s_bitcmp0_b32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, 0
-// CHECK: [0x00,0x80,0x0c,0xbf]
+s_bitcmp0_b32 s1, 0
+// CHECK: [0x01,0x80,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, -1
-// CHECK: [0x00,0xc1,0x0c,0xbf]
+s_bitcmp0_b32 s1, -1
+// CHECK: [0x01,0xc1,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, 0.5
-// CHECK: [0x00,0xf0,0x0c,0xbf]
+s_bitcmp0_b32 s1, 0.5
+// CHECK: [0x01,0xf0,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, -4.0
-// CHECK: [0x00,0xf7,0x0c,0xbf]
+s_bitcmp0_b32 s1, -4.0
+// CHECK: [0x01,0xf7,0x0c,0xbf]
 
-s_bitcmp0_b32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x0c,0xbf,0x56,0x34,0x12,0xaf]
+s_bitcmp0_b32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x0c,0xbf,0x56,0x34,0x12,0xaf]
 
-s_bitcmp0_b32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x0c,0xbf,0x73,0x72,0x71,0x3f]
+s_bitcmp0_b32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x0c,0xbf,0x73,0x72,0x71,0x3f]
 
-s_bitcmp1_b32 s0, s0
-// CHECK: [0x00,0x00,0x0d,0xbf]
+s_bitcmp1_b32 s1, s2
+// CHECK: [0x01,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 s101, s0
-// CHECK: [0x65,0x00,0x0d,0xbf]
+s_bitcmp1_b32 s101, s2
+// CHECK: [0x65,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x0d,0xbf]
+s_bitcmp1_b32 flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x0d,0xbf]
+s_bitcmp1_b32 flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 vcc_lo, s0
-// CHECK: [0x6a,0x00,0x0d,0xbf]
+s_bitcmp1_b32 vcc_lo, s2
+// CHECK: [0x6a,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 vcc_hi, s0
-// CHECK: [0x6b,0x00,0x0d,0xbf]
+s_bitcmp1_b32 vcc_hi, s2
+// CHECK: [0x6b,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 tba_lo, s0
-// CHECK: [0x6c,0x00,0x0d,0xbf]
+s_bitcmp1_b32 tba_lo, s2
+// CHECK: [0x6c,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 tba_hi, s0
-// CHECK: [0x6d,0x00,0x0d,0xbf]
+s_bitcmp1_b32 tba_hi, s2
+// CHECK: [0x6d,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 tma_lo, s0
-// CHECK: [0x6e,0x00,0x0d,0xbf]
+s_bitcmp1_b32 tma_lo, s2
+// CHECK: [0x6e,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 tma_hi, s0
-// CHECK: [0x6f,0x00,0x0d,0xbf]
+s_bitcmp1_b32 tma_hi, s2
+// CHECK: [0x6f,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 ttmp11, s0
-// CHECK: [0x7b,0x00,0x0d,0xbf]
+s_bitcmp1_b32 ttmp11, s2
+// CHECK: [0x7b,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 m0, s0
-// CHECK: [0x7c,0x00,0x0d,0xbf]
+s_bitcmp1_b32 m0, s2
+// CHECK: [0x7c,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 exec_lo, s0
-// CHECK: [0x7e,0x00,0x0d,0xbf]
+s_bitcmp1_b32 exec_lo, s2
+// CHECK: [0x7e,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 exec_hi, s0
-// CHECK: [0x7f,0x00,0x0d,0xbf]
+s_bitcmp1_b32 exec_hi, s2
+// CHECK: [0x7f,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 0, s0
-// CHECK: [0x80,0x00,0x0d,0xbf]
+s_bitcmp1_b32 0, s2
+// CHECK: [0x80,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 -1, s0
-// CHECK: [0xc1,0x00,0x0d,0xbf]
+s_bitcmp1_b32 -1, s2
+// CHECK: [0xc1,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 0.5, s0
-// CHECK: [0xf0,0x00,0x0d,0xbf]
+s_bitcmp1_b32 0.5, s2
+// CHECK: [0xf0,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 -4.0, s0
-// CHECK: [0xf7,0x00,0x0d,0xbf]
+s_bitcmp1_b32 -4.0, s2
+// CHECK: [0xf7,0x02,0x0d,0xbf]
 
-s_bitcmp1_b32 0xaf123456, s0
-// CHECK: [0xff,0x00,0x0d,0xbf,0x56,0x34,0x12,0xaf]
+s_bitcmp1_b32 0xaf123456, s2
+// CHECK: [0xff,0x02,0x0d,0xbf,0x56,0x34,0x12,0xaf]
 
-s_bitcmp1_b32 0x3f717273, s0
-// CHECK: [0xff,0x00,0x0d,0xbf,0x73,0x72,0x71,0x3f]
+s_bitcmp1_b32 0x3f717273, s2
+// CHECK: [0xff,0x02,0x0d,0xbf,0x73,0x72,0x71,0x3f]
 
-s_bitcmp1_b32 s0, s101
-// CHECK: [0x00,0x65,0x0d,0xbf]
+s_bitcmp1_b32 s1, s101
+// CHECK: [0x01,0x65,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x0d,0xbf]
+s_bitcmp1_b32 s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x0d,0xbf]
+s_bitcmp1_b32 s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, vcc_lo
-// CHECK: [0x00,0x6a,0x0d,0xbf]
+s_bitcmp1_b32 s1, vcc_lo
+// CHECK: [0x01,0x6a,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, vcc_hi
-// CHECK: [0x00,0x6b,0x0d,0xbf]
+s_bitcmp1_b32 s1, vcc_hi
+// CHECK: [0x01,0x6b,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, tba_lo
-// CHECK: [0x00,0x6c,0x0d,0xbf]
+s_bitcmp1_b32 s1, tba_lo
+// CHECK: [0x01,0x6c,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, tba_hi
-// CHECK: [0x00,0x6d,0x0d,0xbf]
+s_bitcmp1_b32 s1, tba_hi
+// CHECK: [0x01,0x6d,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, tma_lo
-// CHECK: [0x00,0x6e,0x0d,0xbf]
+s_bitcmp1_b32 s1, tma_lo
+// CHECK: [0x01,0x6e,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, tma_hi
-// CHECK: [0x00,0x6f,0x0d,0xbf]
+s_bitcmp1_b32 s1, tma_hi
+// CHECK: [0x01,0x6f,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, ttmp11
-// CHECK: [0x00,0x7b,0x0d,0xbf]
+s_bitcmp1_b32 s1, ttmp11
+// CHECK: [0x01,0x7b,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, m0
-// CHECK: [0x00,0x7c,0x0d,0xbf]
+s_bitcmp1_b32 s1, m0
+// CHECK: [0x01,0x7c,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, exec_lo
-// CHECK: [0x00,0x7e,0x0d,0xbf]
+s_bitcmp1_b32 s1, exec_lo
+// CHECK: [0x01,0x7e,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, exec_hi
-// CHECK: [0x00,0x7f,0x0d,0xbf]
+s_bitcmp1_b32 s1, exec_hi
+// CHECK: [0x01,0x7f,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, 0
-// CHECK: [0x00,0x80,0x0d,0xbf]
+s_bitcmp1_b32 s1, 0
+// CHECK: [0x01,0x80,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, -1
-// CHECK: [0x00,0xc1,0x0d,0xbf]
+s_bitcmp1_b32 s1, -1
+// CHECK: [0x01,0xc1,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, 0.5
-// CHECK: [0x00,0xf0,0x0d,0xbf]
+s_bitcmp1_b32 s1, 0.5
+// CHECK: [0x01,0xf0,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, -4.0
-// CHECK: [0x00,0xf7,0x0d,0xbf]
+s_bitcmp1_b32 s1, -4.0
+// CHECK: [0x01,0xf7,0x0d,0xbf]
 
-s_bitcmp1_b32 s0, 0xaf123456
-// CHECK: [0x00,0xff,0x0d,0xbf,0x56,0x34,0x12,0xaf]
+s_bitcmp1_b32 s1, 0xaf123456
+// CHECK: [0x01,0xff,0x0d,0xbf,0x56,0x34,0x12,0xaf]
 
-s_bitcmp1_b32 s0, 0x3f717273
-// CHECK: [0x00,0xff,0x0d,0xbf,0x73,0x72,0x71,0x3f]
+s_bitcmp1_b32 s1, 0x3f717273
+// CHECK: [0x01,0xff,0x0d,0xbf,0x73,0x72,0x71,0x3f]
 
-s_bitcmp0_b64 s[0:1], s0
-// CHECK: [0x00,0x00,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], s2
+// CHECK: [0x02,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 s[2:3], s0
-// CHECK: [0x02,0x00,0x0e,0xbf]
+s_bitcmp0_b64 s[4:5], s2
+// CHECK: [0x04,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 s[100:101], s0
-// CHECK: [0x64,0x00,0x0e,0xbf]
+s_bitcmp0_b64 s[100:101], s2
+// CHECK: [0x64,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 flat_scratch, s0
-// CHECK: [0x66,0x00,0x0e,0xbf]
+s_bitcmp0_b64 flat_scratch, s2
+// CHECK: [0x66,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 vcc, s0
-// CHECK: [0x6a,0x00,0x0e,0xbf]
+s_bitcmp0_b64 vcc, s2
+// CHECK: [0x6a,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 tba, s0
-// CHECK: [0x6c,0x00,0x0e,0xbf]
+s_bitcmp0_b64 tba, s2
+// CHECK: [0x6c,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 tma, s0
-// CHECK: [0x6e,0x00,0x0e,0xbf]
+s_bitcmp0_b64 tma, s2
+// CHECK: [0x6e,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 ttmp[10:11], s0
-// CHECK: [0x7a,0x00,0x0e,0xbf]
+s_bitcmp0_b64 ttmp[10:11], s2
+// CHECK: [0x7a,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 exec, s0
-// CHECK: [0x7e,0x00,0x0e,0xbf]
+s_bitcmp0_b64 exec, s2
+// CHECK: [0x7e,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 0, s0
-// CHECK: [0x80,0x00,0x0e,0xbf]
+s_bitcmp0_b64 0, s2
+// CHECK: [0x80,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 -1, s0
-// CHECK: [0xc1,0x00,0x0e,0xbf]
+s_bitcmp0_b64 -1, s2
+// CHECK: [0xc1,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 0.5, s0
-// CHECK: [0xf0,0x00,0x0e,0xbf]
+s_bitcmp0_b64 0.5, s2
+// CHECK: [0xf0,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 -4.0, s0
-// CHECK: [0xf7,0x00,0x0e,0xbf]
+s_bitcmp0_b64 -4.0, s2
+// CHECK: [0xf7,0x02,0x0e,0xbf]
 
-s_bitcmp0_b64 0xaf123456, s0
-// CHECK: [0xff,0x00,0x0e,0xbf,0x56,0x34,0x12,0xaf]
+s_bitcmp0_b64 0xaf123456, s2
+// CHECK: [0xff,0x02,0x0e,0xbf,0x56,0x34,0x12,0xaf]
 
-s_bitcmp0_b64 0x3f717273, s0
-// CHECK: [0xff,0x00,0x0e,0xbf,0x73,0x72,0x71,0x3f]
+s_bitcmp0_b64 0x3f717273, s2
+// CHECK: [0xff,0x02,0x0e,0xbf,0x73,0x72,0x71,0x3f]
 
-s_bitcmp0_b64 s[0:1], s101
-// CHECK: [0x00,0x65,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], s101
+// CHECK: [0x02,0x65,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], flat_scratch_lo
-// CHECK: [0x00,0x66,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], flat_scratch_lo
+// CHECK: [0x02,0x66,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], flat_scratch_hi
-// CHECK: [0x00,0x67,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], flat_scratch_hi
+// CHECK: [0x02,0x67,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], vcc_lo
-// CHECK: [0x00,0x6a,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], vcc_lo
+// CHECK: [0x02,0x6a,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], vcc_hi
-// CHECK: [0x00,0x6b,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], vcc_hi
+// CHECK: [0x02,0x6b,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], tba_lo
-// CHECK: [0x00,0x6c,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], tba_lo
+// CHECK: [0x02,0x6c,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], tba_hi
-// CHECK: [0x00,0x6d,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], tba_hi
+// CHECK: [0x02,0x6d,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], tma_lo
-// CHECK: [0x00,0x6e,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], tma_lo
+// CHECK: [0x02,0x6e,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], tma_hi
-// CHECK: [0x00,0x6f,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], tma_hi
+// CHECK: [0x02,0x6f,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], ttmp11
-// CHECK: [0x00,0x7b,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], ttmp11
+// CHECK: [0x02,0x7b,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], m0
-// CHECK: [0x00,0x7c,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], m0
+// CHECK: [0x02,0x7c,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], exec_lo
-// CHECK: [0x00,0x7e,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], exec_lo
+// CHECK: [0x02,0x7e,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], exec_hi
-// CHECK: [0x00,0x7f,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], exec_hi
+// CHECK: [0x02,0x7f,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], 0
-// CHECK: [0x00,0x80,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], 0
+// CHECK: [0x02,0x80,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], -1
-// CHECK: [0x00,0xc1,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], -1
+// CHECK: [0x02,0xc1,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x0e,0xbf]
+s_bitcmp0_b64 s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x0e,0xbf]
 
-s_bitcmp0_b64 s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x0e,0xbf,0x56,0x34,0x12,0xaf]
+s_bitcmp0_b64 s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x0e,0xbf,0x56,0x34,0x12,0xaf]
 
-s_bitcmp0_b64 s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x0e,0xbf,0x73,0x72,0x71,0x3f]
+s_bitcmp0_b64 s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x0e,0xbf,0x73,0x72,0x71,0x3f]
 
-s_bitcmp1_b64 s[0:1], s0
-// CHECK: [0x00,0x00,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], s2
+// CHECK: [0x02,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 s[2:3], s0
-// CHECK: [0x02,0x00,0x0f,0xbf]
+s_bitcmp1_b64 s[4:5], s2
+// CHECK: [0x04,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 s[100:101], s0
-// CHECK: [0x64,0x00,0x0f,0xbf]
+s_bitcmp1_b64 s[100:101], s2
+// CHECK: [0x64,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 flat_scratch, s0
-// CHECK: [0x66,0x00,0x0f,0xbf]
+s_bitcmp1_b64 flat_scratch, s2
+// CHECK: [0x66,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 vcc, s0
-// CHECK: [0x6a,0x00,0x0f,0xbf]
+s_bitcmp1_b64 vcc, s2
+// CHECK: [0x6a,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 tba, s0
-// CHECK: [0x6c,0x00,0x0f,0xbf]
+s_bitcmp1_b64 tba, s2
+// CHECK: [0x6c,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 tma, s0
-// CHECK: [0x6e,0x00,0x0f,0xbf]
+s_bitcmp1_b64 tma, s2
+// CHECK: [0x6e,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 ttmp[10:11], s0
-// CHECK: [0x7a,0x00,0x0f,0xbf]
+s_bitcmp1_b64 ttmp[10:11], s2
+// CHECK: [0x7a,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 exec, s0
-// CHECK: [0x7e,0x00,0x0f,0xbf]
+s_bitcmp1_b64 exec, s2
+// CHECK: [0x7e,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 0, s0
-// CHECK: [0x80,0x00,0x0f,0xbf]
+s_bitcmp1_b64 0, s2
+// CHECK: [0x80,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 -1, s0
-// CHECK: [0xc1,0x00,0x0f,0xbf]
+s_bitcmp1_b64 -1, s2
+// CHECK: [0xc1,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 0.5, s0
-// CHECK: [0xf0,0x00,0x0f,0xbf]
+s_bitcmp1_b64 0.5, s2
+// CHECK: [0xf0,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 -4.0, s0
-// CHECK: [0xf7,0x00,0x0f,0xbf]
+s_bitcmp1_b64 -4.0, s2
+// CHECK: [0xf7,0x02,0x0f,0xbf]
 
-s_bitcmp1_b64 0xaf123456, s0
-// CHECK: [0xff,0x00,0x0f,0xbf,0x56,0x34,0x12,0xaf]
+s_bitcmp1_b64 0xaf123456, s2
+// CHECK: [0xff,0x02,0x0f,0xbf,0x56,0x34,0x12,0xaf]
 
-s_bitcmp1_b64 0x3f717273, s0
-// CHECK: [0xff,0x00,0x0f,0xbf,0x73,0x72,0x71,0x3f]
+s_bitcmp1_b64 0x3f717273, s2
+// CHECK: [0xff,0x02,0x0f,0xbf,0x73,0x72,0x71,0x3f]
 
-s_bitcmp1_b64 s[0:1], s101
-// CHECK: [0x00,0x65,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], s101
+// CHECK: [0x02,0x65,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], flat_scratch_lo
-// CHECK: [0x00,0x66,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], flat_scratch_lo
+// CHECK: [0x02,0x66,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], flat_scratch_hi
-// CHECK: [0x00,0x67,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], flat_scratch_hi
+// CHECK: [0x02,0x67,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], vcc_lo
-// CHECK: [0x00,0x6a,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], vcc_lo
+// CHECK: [0x02,0x6a,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], vcc_hi
-// CHECK: [0x00,0x6b,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], vcc_hi
+// CHECK: [0x02,0x6b,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], tba_lo
-// CHECK: [0x00,0x6c,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], tba_lo
+// CHECK: [0x02,0x6c,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], tba_hi
-// CHECK: [0x00,0x6d,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], tba_hi
+// CHECK: [0x02,0x6d,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], tma_lo
-// CHECK: [0x00,0x6e,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], tma_lo
+// CHECK: [0x02,0x6e,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], tma_hi
-// CHECK: [0x00,0x6f,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], tma_hi
+// CHECK: [0x02,0x6f,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], ttmp11
-// CHECK: [0x00,0x7b,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], ttmp11
+// CHECK: [0x02,0x7b,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], m0
-// CHECK: [0x00,0x7c,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], m0
+// CHECK: [0x02,0x7c,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], exec_lo
-// CHECK: [0x00,0x7e,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], exec_lo
+// CHECK: [0x02,0x7e,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], exec_hi
-// CHECK: [0x00,0x7f,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], exec_hi
+// CHECK: [0x02,0x7f,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], 0
-// CHECK: [0x00,0x80,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], 0
+// CHECK: [0x02,0x80,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], -1
-// CHECK: [0x00,0xc1,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], -1
+// CHECK: [0x02,0xc1,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x0f,0xbf]
+s_bitcmp1_b64 s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x0f,0xbf]
 
-s_bitcmp1_b64 s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x0f,0xbf,0x56,0x34,0x12,0xaf]
+s_bitcmp1_b64 s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x0f,0xbf,0x56,0x34,0x12,0xaf]
 
-s_bitcmp1_b64 s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x0f,0xbf,0x73,0x72,0x71,0x3f]
+s_bitcmp1_b64 s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x0f,0xbf,0x73,0x72,0x71,0x3f]
 
-s_setvskip s0, s0
-// CHECK: [0x00,0x00,0x10,0xbf]
+s_setvskip s1, s2
+// CHECK: [0x01,0x02,0x10,0xbf]
 
-s_setvskip s101, s0
-// CHECK: [0x65,0x00,0x10,0xbf]
+s_setvskip s101, s2
+// CHECK: [0x65,0x02,0x10,0xbf]
 
-s_setvskip flat_scratch_lo, s0
-// CHECK: [0x66,0x00,0x10,0xbf]
+s_setvskip flat_scratch_lo, s2
+// CHECK: [0x66,0x02,0x10,0xbf]
 
-s_setvskip flat_scratch_hi, s0
-// CHECK: [0x67,0x00,0x10,0xbf]
+s_setvskip flat_scratch_hi, s2
+// CHECK: [0x67,0x02,0x10,0xbf]
 
-s_setvskip vcc_lo, s0
-// CHECK: [0x6a,0x00,0x10,0xbf]
+s_setvskip vcc_lo, s2
+// CHECK: [0x6a,0x02,0x10,0xbf]
 
-s_setvskip vcc_hi, s0
-// CHECK: [0x6b,0x00,0x10,0xbf]
+s_setvskip vcc_hi, s2
+// CHECK: [0x6b,0x02,0x10,0xbf]
 
-s_setvskip tba_lo, s0
-// CHECK: [0x6c,0x00,0x10,0xbf]
+s_setvskip tba_lo, s2
+// CHECK: [0x6c,0x02,0x10,0xbf]
 
-s_setvskip tba_hi, s0
-// CHECK: [0x6d,0x00,0x10,0xbf]
+s_setvskip tba_hi, s2
+// CHECK: [0x6d,0x02,0x10,0xbf]
 
-s_setvskip tma_lo, s0
-// CHECK: [0x6e,0x00,0x10,0xbf]
+s_setvskip tma_lo, s2
+// CHECK: [0x6e,0x02,0x10,0xbf]
 
-s_setvskip tma_hi, s0
-// CHECK: [0x6f,0x00,0x10,0xbf]
+s_setvskip tma_hi, s2
+// CHECK: [0x6f,0x02,0x10,0xbf]
 
-s_setvskip ttmp11, s0
-// CHECK: [0x7b,0x00,0x10,0xbf]
+s_setvskip ttmp11, s2
+// CHECK: [0x7b,0x02,0x10,0xbf]
 
-s_setvskip m0, s0
-// CHECK: [0x7c,0x00,0x10,0xbf]
+s_setvskip m0, s2
+// CHECK: [0x7c,0x02,0x10,0xbf]
 
-s_setvskip exec_lo, s0
-// CHECK: [0x7e,0x00,0x10,0xbf]
+s_setvskip exec_lo, s2
+// CHECK: [0x7e,0x02,0x10,0xbf]
 
-s_setvskip exec_hi, s0
-// CHECK: [0x7f,0x00,0x10,0xbf]
+s_setvskip exec_hi, s2
+// CHECK: [0x7f,0x02,0x10,0xbf]
 
-s_setvskip 0, s0
-// CHECK: [0x80,0x00,0x10,0xbf]
+s_setvskip 0, s2
+// CHECK: [0x80,0x02,0x10,0xbf]
 
-s_setvskip -1, s0
-// CHECK: [0xc1,0x00,0x10,0xbf]
+s_setvskip -1, s2
+// CHECK: [0xc1,0x02,0x10,0xbf]
 
-s_setvskip 0.5, s0
-// CHECK: [0xf0,0x00,0x10,0xbf]
+s_setvskip 0.5, s2
+// CHECK: [0xf0,0x02,0x10,0xbf]
 
-s_setvskip -4.0, s0
-// CHECK: [0xf7,0x00,0x10,0xbf]
+s_setvskip -4.0, s2
+// CHECK: [0xf7,0x02,0x10,0xbf]
 
-s_setvskip 0xaf123456, s0
-// CHECK: [0xff,0x00,0x10,0xbf,0x56,0x34,0x12,0xaf]
+s_setvskip 0xaf123456, s2
+// CHECK: [0xff,0x02,0x10,0xbf,0x56,0x34,0x12,0xaf]
 
-s_setvskip 0x3f717273, s0
-// CHECK: [0xff,0x00,0x10,0xbf,0x73,0x72,0x71,0x3f]
+s_setvskip 0x3f717273, s2
+// CHECK: [0xff,0x02,0x10,0xbf,0x73,0x72,0x71,0x3f]
 
-s_setvskip s0, s101
-// CHECK: [0x00,0x65,0x10,0xbf]
+s_setvskip s1, s101
+// CHECK: [0x01,0x65,0x10,0xbf]
 
-s_setvskip s0, flat_scratch_lo
-// CHECK: [0x00,0x66,0x10,0xbf]
+s_setvskip s1, flat_scratch_lo
+// CHECK: [0x01,0x66,0x10,0xbf]
 
-s_setvskip s0, flat_scratch_hi
-// CHECK: [0x00,0x67,0x10,0xbf]
+s_setvskip s1, flat_scratch_hi
+// CHECK: [0x01,0x67,0x10,0xbf]
 
-s_setvskip s0, vcc_lo
-// CHECK: [0x00,0x6a,0x10,0xbf]
+s_setvskip s1, vcc_lo
+// CHECK: [0x01,0x6a,0x10,0xbf]
 
-s_setvskip s0, vcc_hi
-// CHECK: [0x00,0x6b,0x10,0xbf]
+s_setvskip s1, vcc_hi
+// CHECK: [0x01,0x6b,0x10,0xbf]
 
-s_setvskip s0, tba_lo
-// CHECK: [0x00,0x6c,0x10,0xbf]
+s_setvskip s1, tba_lo
+// CHECK: [0x01,0x6c,0x10,0xbf]
 
-s_setvskip s0, tba_hi
-// CHECK: [0x00,0x6d,0x10,0xbf]
+s_setvskip s1, tba_hi
+// CHECK: [0x01,0x6d,0x10,0xbf]
 
-s_setvskip s0, tma_lo
-// CHECK: [0x00,0x6e,0x10,0xbf]
+s_setvskip s1, tma_lo
+// CHECK: [0x01,0x6e,0x10,0xbf]
 
-s_setvskip s0, tma_hi
-// CHECK: [0x00,0x6f,0x10,0xbf]
+s_setvskip s1, tma_hi
+// CHECK: [0x01,0x6f,0x10,0xbf]
 
-s_setvskip s0, ttmp11
-// CHECK: [0x00,0x7b,0x10,0xbf]
+s_setvskip s1, ttmp11
+// CHECK: [0x01,0x7b,0x10,0xbf]
 
-s_setvskip s0, m0
-// CHECK: [0x00,0x7c,0x10,0xbf]
+s_setvskip s1, m0
+// CHECK: [0x01,0x7c,0x10,0xbf]
 
-s_setvskip s0, exec_lo
-// CHECK: [0x00,0x7e,0x10,0xbf]
+s_setvskip s1, exec_lo
+// CHECK: [0x01,0x7e,0x10,0xbf]
 
-s_setvskip s0, exec_hi
-// CHECK: [0x00,0x7f,0x10,0xbf]
+s_setvskip s1, exec_hi
+// CHECK: [0x01,0x7f,0x10,0xbf]
 
-s_setvskip s0, 0
-// CHECK: [0x00,0x80,0x10,0xbf]
+s_setvskip s1, 0
+// CHECK: [0x01,0x80,0x10,0xbf]
 
-s_setvskip s0, -1
-// CHECK: [0x00,0xc1,0x10,0xbf]
+s_setvskip s1, -1
+// CHECK: [0x01,0xc1,0x10,0xbf]
 
-s_setvskip s0, 0.5
-// CHECK: [0x00,0xf0,0x10,0xbf]
+s_setvskip s1, 0.5
+// CHECK: [0x01,0xf0,0x10,0xbf]
 
-s_setvskip s0, -4.0
-// CHECK: [0x00,0xf7,0x10,0xbf]
+s_setvskip s1, -4.0
+// CHECK: [0x01,0xf7,0x10,0xbf]
 
-s_setvskip s0, 0xaf123456
-// CHECK: [0x00,0xff,0x10,0xbf,0x56,0x34,0x12,0xaf]
+s_setvskip s1, 0xaf123456
+// CHECK: [0x01,0xff,0x10,0xbf,0x56,0x34,0x12,0xaf]
 
-s_setvskip s0, 0x3f717273
-// CHECK: [0x00,0xff,0x10,0xbf,0x73,0x72,0x71,0x3f]
+s_setvskip s1, 0x3f717273
+// CHECK: [0x01,0xff,0x10,0xbf,0x73,0x72,0x71,0x3f]
 
-s_set_gpr_idx_on s0, 0x0
-// CHECK: [0x00,0x00,0x11,0xbf]
+s_set_gpr_idx_on s1, 0x0
+// CHECK: [0x01,0x00,0x11,0xbf]
 
 s_set_gpr_idx_on s101, 0x0
 // CHECK: [0x65,0x00,0x11,0xbf]
@@ -21909,188 +22070,188 @@ s_set_gpr_idx_on 0xaf123456, 0x0
 s_set_gpr_idx_on 0x3f717273, 0x0
 // CHECK: [0xff,0x00,0x11,0xbf,0x73,0x72,0x71,0x3f]
 
-s_set_gpr_idx_on s0, 0x1
-// CHECK: [0x00,0x01,0x11,0xbf]
+s_set_gpr_idx_on s1, 0x1
+// CHECK: [0x01,0x01,0x11,0xbf]
 
-s_set_gpr_idx_on s0, 0xF
-// CHECK: [0x00,0x0f,0x11,0xbf]
+s_set_gpr_idx_on s1, 0xF
+// CHECK: [0x01,0x0f,0x11,0xbf]
 
-s_cmp_eq_u64 s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x12,0xbf]
+s_cmp_eq_u64 s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x12,0xbf]
 
-s_cmp_eq_u64 s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x12,0xbf]
+s_cmp_eq_u64 s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x12,0xbf]
 
-s_cmp_eq_u64 s[100:101], s[0:1]
-// CHECK: [0x64,0x00,0x12,0xbf]
+s_cmp_eq_u64 s[100:101], s[4:5]
+// CHECK: [0x64,0x04,0x12,0xbf]
 
-s_cmp_eq_u64 flat_scratch, s[0:1]
-// CHECK: [0x66,0x00,0x12,0xbf]
+s_cmp_eq_u64 flat_scratch, s[4:5]
+// CHECK: [0x66,0x04,0x12,0xbf]
 
-s_cmp_eq_u64 vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x12,0xbf]
+s_cmp_eq_u64 vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x12,0xbf]
 
-s_cmp_eq_u64 tba, s[0:1]
-// CHECK: [0x6c,0x00,0x12,0xbf]
+s_cmp_eq_u64 tba, s[4:5]
+// CHECK: [0x6c,0x04,0x12,0xbf]
 
-s_cmp_eq_u64 tma, s[0:1]
-// CHECK: [0x6e,0x00,0x12,0xbf]
+s_cmp_eq_u64 tma, s[4:5]
+// CHECK: [0x6e,0x04,0x12,0xbf]
 
-s_cmp_eq_u64 ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x12,0xbf]
+s_cmp_eq_u64 ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x12,0xbf]
 
-s_cmp_eq_u64 exec, s[0:1]
-// CHECK: [0x7e,0x00,0x12,0xbf]
+s_cmp_eq_u64 exec, s[4:5]
+// CHECK: [0x7e,0x04,0x12,0xbf]
 
-s_cmp_eq_u64 0, s[0:1]
-// CHECK: [0x80,0x00,0x12,0xbf]
+s_cmp_eq_u64 0, s[4:5]
+// CHECK: [0x80,0x04,0x12,0xbf]
 
-s_cmp_eq_u64 -1, s[0:1]
-// CHECK: [0xc1,0x00,0x12,0xbf]
+s_cmp_eq_u64 -1, s[4:5]
+// CHECK: [0xc1,0x04,0x12,0xbf]
 
-s_cmp_eq_u64 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x12,0xbf]
+s_cmp_eq_u64 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x12,0xbf]
 
-s_cmp_eq_u64 -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x12,0xbf]
+s_cmp_eq_u64 -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x12,0xbf]
 
-s_cmp_eq_u64 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x12,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_eq_u64 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x12,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_eq_u64 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x12,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_eq_u64 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x12,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_eq_u64 s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x12,0xbf]
+s_cmp_eq_u64 s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x12,0xbf]
 
-s_cmp_eq_u64 s[0:1], s[100:101]
-// CHECK: [0x00,0x64,0x12,0xbf]
+s_cmp_eq_u64 s[2:3], s[100:101]
+// CHECK: [0x02,0x64,0x12,0xbf]
 
-s_cmp_eq_u64 s[0:1], flat_scratch
-// CHECK: [0x00,0x66,0x12,0xbf]
+s_cmp_eq_u64 s[2:3], flat_scratch
+// CHECK: [0x02,0x66,0x12,0xbf]
 
-s_cmp_eq_u64 s[0:1], vcc
-// CHECK: [0x00,0x6a,0x12,0xbf]
+s_cmp_eq_u64 s[2:3], vcc
+// CHECK: [0x02,0x6a,0x12,0xbf]
 
-s_cmp_eq_u64 s[0:1], tba
-// CHECK: [0x00,0x6c,0x12,0xbf]
+s_cmp_eq_u64 s[2:3], tba
+// CHECK: [0x02,0x6c,0x12,0xbf]
 
-s_cmp_eq_u64 s[0:1], tma
-// CHECK: [0x00,0x6e,0x12,0xbf]
+s_cmp_eq_u64 s[2:3], tma
+// CHECK: [0x02,0x6e,0x12,0xbf]
 
-s_cmp_eq_u64 s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x12,0xbf]
+s_cmp_eq_u64 s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x12,0xbf]
 
-s_cmp_eq_u64 s[0:1], exec
-// CHECK: [0x00,0x7e,0x12,0xbf]
+s_cmp_eq_u64 s[2:3], exec
+// CHECK: [0x02,0x7e,0x12,0xbf]
 
-s_cmp_eq_u64 s[0:1], 0
-// CHECK: [0x00,0x80,0x12,0xbf]
+s_cmp_eq_u64 s[2:3], 0
+// CHECK: [0x02,0x80,0x12,0xbf]
 
-s_cmp_eq_u64 s[0:1], -1
-// CHECK: [0x00,0xc1,0x12,0xbf]
+s_cmp_eq_u64 s[2:3], -1
+// CHECK: [0x02,0xc1,0x12,0xbf]
 
-s_cmp_eq_u64 s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x12,0xbf]
+s_cmp_eq_u64 s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x12,0xbf]
 
-s_cmp_eq_u64 s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x12,0xbf]
+s_cmp_eq_u64 s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x12,0xbf]
 
-s_cmp_eq_u64 s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x12,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_eq_u64 s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x12,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_eq_u64 s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x12,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_eq_u64 s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x12,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_lg_u64 s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x13,0xbf]
+s_cmp_lg_u64 s[2:3], s[4:5]
+// CHECK: [0x02,0x04,0x13,0xbf]
 
-s_cmp_lg_u64 s[2:3], s[0:1]
-// CHECK: [0x02,0x00,0x13,0xbf]
+s_cmp_lg_u64 s[4:5], s[4:5]
+// CHECK: [0x04,0x04,0x13,0xbf]
 
-s_cmp_lg_u64 s[100:101], s[0:1]
-// CHECK: [0x64,0x00,0x13,0xbf]
+s_cmp_lg_u64 s[100:101], s[4:5]
+// CHECK: [0x64,0x04,0x13,0xbf]
 
-s_cmp_lg_u64 flat_scratch, s[0:1]
-// CHECK: [0x66,0x00,0x13,0xbf]
+s_cmp_lg_u64 flat_scratch, s[4:5]
+// CHECK: [0x66,0x04,0x13,0xbf]
 
-s_cmp_lg_u64 vcc, s[0:1]
-// CHECK: [0x6a,0x00,0x13,0xbf]
+s_cmp_lg_u64 vcc, s[4:5]
+// CHECK: [0x6a,0x04,0x13,0xbf]
 
-s_cmp_lg_u64 tba, s[0:1]
-// CHECK: [0x6c,0x00,0x13,0xbf]
+s_cmp_lg_u64 tba, s[4:5]
+// CHECK: [0x6c,0x04,0x13,0xbf]
 
-s_cmp_lg_u64 tma, s[0:1]
-// CHECK: [0x6e,0x00,0x13,0xbf]
+s_cmp_lg_u64 tma, s[4:5]
+// CHECK: [0x6e,0x04,0x13,0xbf]
 
-s_cmp_lg_u64 ttmp[10:11], s[0:1]
-// CHECK: [0x7a,0x00,0x13,0xbf]
+s_cmp_lg_u64 ttmp[10:11], s[4:5]
+// CHECK: [0x7a,0x04,0x13,0xbf]
 
-s_cmp_lg_u64 exec, s[0:1]
-// CHECK: [0x7e,0x00,0x13,0xbf]
+s_cmp_lg_u64 exec, s[4:5]
+// CHECK: [0x7e,0x04,0x13,0xbf]
 
-s_cmp_lg_u64 0, s[0:1]
-// CHECK: [0x80,0x00,0x13,0xbf]
+s_cmp_lg_u64 0, s[4:5]
+// CHECK: [0x80,0x04,0x13,0xbf]
 
-s_cmp_lg_u64 -1, s[0:1]
-// CHECK: [0xc1,0x00,0x13,0xbf]
+s_cmp_lg_u64 -1, s[4:5]
+// CHECK: [0xc1,0x04,0x13,0xbf]
 
-s_cmp_lg_u64 0.5, s[0:1]
-// CHECK: [0xf0,0x00,0x13,0xbf]
+s_cmp_lg_u64 0.5, s[4:5]
+// CHECK: [0xf0,0x04,0x13,0xbf]
 
-s_cmp_lg_u64 -4.0, s[0:1]
-// CHECK: [0xf7,0x00,0x13,0xbf]
+s_cmp_lg_u64 -4.0, s[4:5]
+// CHECK: [0xf7,0x04,0x13,0xbf]
 
-s_cmp_lg_u64 0xaf123456, s[0:1]
-// CHECK: [0xff,0x00,0x13,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_lg_u64 0xaf123456, s[4:5]
+// CHECK: [0xff,0x04,0x13,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_lg_u64 0x3f717273, s[0:1]
-// CHECK: [0xff,0x00,0x13,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_lg_u64 0x3f717273, s[4:5]
+// CHECK: [0xff,0x04,0x13,0xbf,0x73,0x72,0x71,0x3f]
 
-s_cmp_lg_u64 s[0:1], s[2:3]
-// CHECK: [0x00,0x02,0x13,0xbf]
+s_cmp_lg_u64 s[2:3], s[6:7]
+// CHECK: [0x02,0x06,0x13,0xbf]
 
-s_cmp_lg_u64 s[0:1], s[100:101]
-// CHECK: [0x00,0x64,0x13,0xbf]
+s_cmp_lg_u64 s[2:3], s[100:101]
+// CHECK: [0x02,0x64,0x13,0xbf]
 
-s_cmp_lg_u64 s[0:1], flat_scratch
-// CHECK: [0x00,0x66,0x13,0xbf]
+s_cmp_lg_u64 s[2:3], flat_scratch
+// CHECK: [0x02,0x66,0x13,0xbf]
 
-s_cmp_lg_u64 s[0:1], vcc
-// CHECK: [0x00,0x6a,0x13,0xbf]
+s_cmp_lg_u64 s[2:3], vcc
+// CHECK: [0x02,0x6a,0x13,0xbf]
 
-s_cmp_lg_u64 s[0:1], tba
-// CHECK: [0x00,0x6c,0x13,0xbf]
+s_cmp_lg_u64 s[2:3], tba
+// CHECK: [0x02,0x6c,0x13,0xbf]
 
-s_cmp_lg_u64 s[0:1], tma
-// CHECK: [0x00,0x6e,0x13,0xbf]
+s_cmp_lg_u64 s[2:3], tma
+// CHECK: [0x02,0x6e,0x13,0xbf]
 
-s_cmp_lg_u64 s[0:1], ttmp[10:11]
-// CHECK: [0x00,0x7a,0x13,0xbf]
+s_cmp_lg_u64 s[2:3], ttmp[10:11]
+// CHECK: [0x02,0x7a,0x13,0xbf]
 
-s_cmp_lg_u64 s[0:1], exec
-// CHECK: [0x00,0x7e,0x13,0xbf]
+s_cmp_lg_u64 s[2:3], exec
+// CHECK: [0x02,0x7e,0x13,0xbf]
 
-s_cmp_lg_u64 s[0:1], 0
-// CHECK: [0x00,0x80,0x13,0xbf]
+s_cmp_lg_u64 s[2:3], 0
+// CHECK: [0x02,0x80,0x13,0xbf]
 
-s_cmp_lg_u64 s[0:1], -1
-// CHECK: [0x00,0xc1,0x13,0xbf]
+s_cmp_lg_u64 s[2:3], -1
+// CHECK: [0x02,0xc1,0x13,0xbf]
 
-s_cmp_lg_u64 s[0:1], 0.5
-// CHECK: [0x00,0xf0,0x13,0xbf]
+s_cmp_lg_u64 s[2:3], 0.5
+// CHECK: [0x02,0xf0,0x13,0xbf]
 
-s_cmp_lg_u64 s[0:1], -4.0
-// CHECK: [0x00,0xf7,0x13,0xbf]
+s_cmp_lg_u64 s[2:3], -4.0
+// CHECK: [0x02,0xf7,0x13,0xbf]
 
-s_cmp_lg_u64 s[0:1], 0xaf123456
-// CHECK: [0x00,0xff,0x13,0xbf,0x56,0x34,0x12,0xaf]
+s_cmp_lg_u64 s[2:3], 0xaf123456
+// CHECK: [0x02,0xff,0x13,0xbf,0x56,0x34,0x12,0xaf]
 
-s_cmp_lg_u64 s[0:1], 0x3f717273
-// CHECK: [0x00,0xff,0x13,0xbf,0x73,0x72,0x71,0x3f]
+s_cmp_lg_u64 s[2:3], 0x3f717273
+// CHECK: [0x02,0xff,0x13,0xbf,0x73,0x72,0x71,0x3f]
 
-s_movk_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x00,0xb0]
+s_movk_i32 s5, 0x3141
+// CHECK: [0x41,0x31,0x05,0xb0]
 
 s_movk_i32 s101, 0x3141
 // CHECK: [0x41,0x31,0x65,0xb0]
@@ -22131,11 +22292,11 @@ s_movk_i32 exec_lo, 0x3141
 s_movk_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0x7f,0xb0]
 
-s_movk_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x00,0xb0]
+s_movk_i32 s5, 0xc1d1
+// CHECK: [0xd1,0xc1,0x05,0xb0]
 
-s_cmovk_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x80,0xb0]
+s_cmovk_i32 s5, 0x3141
+// CHECK: [0x41,0x31,0x85,0xb0]
 
 s_cmovk_i32 s101, 0x3141
 // CHECK: [0x41,0x31,0xe5,0xb0]
@@ -22176,11 +22337,11 @@ s_cmovk_i32 exec_lo, 0x3141
 s_cmovk_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0xff,0xb0]
 
-s_cmovk_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x80,0xb0]
+s_cmovk_i32 s5, 0xc1d1
+// CHECK: [0xd1,0xc1,0x85,0xb0]
 
-s_cmpk_eq_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x00,0xb1]
+s_cmpk_eq_i32 s1, 0x3141
+// CHECK: [0x41,0x31,0x01,0xb1]
 
 s_cmpk_eq_i32 s101, 0x3141
 // CHECK: [0x41,0x31,0x65,0xb1]
@@ -22221,11 +22382,11 @@ s_cmpk_eq_i32 exec_lo, 0x3141
 s_cmpk_eq_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0x7f,0xb1]
 
-s_cmpk_eq_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x00,0xb1]
+s_cmpk_eq_i32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x01,0xb1]
 
-s_cmpk_lg_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x80,0xb1]
+s_cmpk_lg_i32 s1, 0x3141
+// CHECK: [0x41,0x31,0x81,0xb1]
 
 s_cmpk_lg_i32 s101, 0x3141
 // CHECK: [0x41,0x31,0xe5,0xb1]
@@ -22266,11 +22427,11 @@ s_cmpk_lg_i32 exec_lo, 0x3141
 s_cmpk_lg_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0xff,0xb1]
 
-s_cmpk_lg_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x80,0xb1]
+s_cmpk_lg_i32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x81,0xb1]
 
-s_cmpk_gt_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x00,0xb2]
+s_cmpk_gt_i32 s1, 0x3141
+// CHECK: [0x41,0x31,0x01,0xb2]
 
 s_cmpk_gt_i32 s101, 0x3141
 // CHECK: [0x41,0x31,0x65,0xb2]
@@ -22311,11 +22472,11 @@ s_cmpk_gt_i32 exec_lo, 0x3141
 s_cmpk_gt_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0x7f,0xb2]
 
-s_cmpk_gt_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x00,0xb2]
+s_cmpk_gt_i32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x01,0xb2]
 
-s_cmpk_ge_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x80,0xb2]
+s_cmpk_ge_i32 s1, 0x3141
+// CHECK: [0x41,0x31,0x81,0xb2]
 
 s_cmpk_ge_i32 s101, 0x3141
 // CHECK: [0x41,0x31,0xe5,0xb2]
@@ -22356,11 +22517,11 @@ s_cmpk_ge_i32 exec_lo, 0x3141
 s_cmpk_ge_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0xff,0xb2]
 
-s_cmpk_ge_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x80,0xb2]
+s_cmpk_ge_i32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x81,0xb2]
 
-s_cmpk_lt_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x00,0xb3]
+s_cmpk_lt_i32 s1, 0x3141
+// CHECK: [0x41,0x31,0x01,0xb3]
 
 s_cmpk_lt_i32 s101, 0x3141
 // CHECK: [0x41,0x31,0x65,0xb3]
@@ -22401,11 +22562,11 @@ s_cmpk_lt_i32 exec_lo, 0x3141
 s_cmpk_lt_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0x7f,0xb3]
 
-s_cmpk_lt_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x00,0xb3]
+s_cmpk_lt_i32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x01,0xb3]
 
-s_cmpk_le_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x80,0xb3]
+s_cmpk_le_i32 s1, 0x3141
+// CHECK: [0x41,0x31,0x81,0xb3]
 
 s_cmpk_le_i32 s101, 0x3141
 // CHECK: [0x41,0x31,0xe5,0xb3]
@@ -22446,11 +22607,11 @@ s_cmpk_le_i32 exec_lo, 0x3141
 s_cmpk_le_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0xff,0xb3]
 
-s_cmpk_le_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x80,0xb3]
+s_cmpk_le_i32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x81,0xb3]
 
-s_cmpk_eq_u32 s0, 0x3141
-// CHECK: [0x41,0x31,0x00,0xb4]
+s_cmpk_eq_u32 s1, 0x3141
+// CHECK: [0x41,0x31,0x01,0xb4]
 
 s_cmpk_eq_u32 s101, 0x3141
 // CHECK: [0x41,0x31,0x65,0xb4]
@@ -22491,11 +22652,11 @@ s_cmpk_eq_u32 exec_lo, 0x3141
 s_cmpk_eq_u32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0x7f,0xb4]
 
-s_cmpk_eq_u32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x00,0xb4]
+s_cmpk_eq_u32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x01,0xb4]
 
-s_cmpk_lg_u32 s0, 0x3141
-// CHECK: [0x41,0x31,0x80,0xb4]
+s_cmpk_lg_u32 s1, 0x3141
+// CHECK: [0x41,0x31,0x81,0xb4]
 
 s_cmpk_lg_u32 s101, 0x3141
 // CHECK: [0x41,0x31,0xe5,0xb4]
@@ -22536,11 +22697,11 @@ s_cmpk_lg_u32 exec_lo, 0x3141
 s_cmpk_lg_u32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0xff,0xb4]
 
-s_cmpk_lg_u32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x80,0xb4]
+s_cmpk_lg_u32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x81,0xb4]
 
-s_cmpk_gt_u32 s0, 0x3141
-// CHECK: [0x41,0x31,0x00,0xb5]
+s_cmpk_gt_u32 s1, 0x3141
+// CHECK: [0x41,0x31,0x01,0xb5]
 
 s_cmpk_gt_u32 s101, 0x3141
 // CHECK: [0x41,0x31,0x65,0xb5]
@@ -22581,11 +22742,11 @@ s_cmpk_gt_u32 exec_lo, 0x3141
 s_cmpk_gt_u32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0x7f,0xb5]
 
-s_cmpk_gt_u32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x00,0xb5]
+s_cmpk_gt_u32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x01,0xb5]
 
-s_cmpk_ge_u32 s0, 0x3141
-// CHECK: [0x41,0x31,0x80,0xb5]
+s_cmpk_ge_u32 s1, 0x3141
+// CHECK: [0x41,0x31,0x81,0xb5]
 
 s_cmpk_ge_u32 s101, 0x3141
 // CHECK: [0x41,0x31,0xe5,0xb5]
@@ -22626,11 +22787,11 @@ s_cmpk_ge_u32 exec_lo, 0x3141
 s_cmpk_ge_u32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0xff,0xb5]
 
-s_cmpk_ge_u32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x80,0xb5]
+s_cmpk_ge_u32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x81,0xb5]
 
-s_cmpk_lt_u32 s0, 0x3141
-// CHECK: [0x41,0x31,0x00,0xb6]
+s_cmpk_lt_u32 s1, 0x3141
+// CHECK: [0x41,0x31,0x01,0xb6]
 
 s_cmpk_lt_u32 s101, 0x3141
 // CHECK: [0x41,0x31,0x65,0xb6]
@@ -22671,11 +22832,11 @@ s_cmpk_lt_u32 exec_lo, 0x3141
 s_cmpk_lt_u32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0x7f,0xb6]
 
-s_cmpk_lt_u32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x00,0xb6]
+s_cmpk_lt_u32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x01,0xb6]
 
-s_cmpk_le_u32 s0, 0x3141
-// CHECK: [0x41,0x31,0x80,0xb6]
+s_cmpk_le_u32 s1, 0x3141
+// CHECK: [0x41,0x31,0x81,0xb6]
 
 s_cmpk_le_u32 s101, 0x3141
 // CHECK: [0x41,0x31,0xe5,0xb6]
@@ -22716,11 +22877,11 @@ s_cmpk_le_u32 exec_lo, 0x3141
 s_cmpk_le_u32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0xff,0xb6]
 
-s_cmpk_le_u32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x80,0xb6]
+s_cmpk_le_u32 s1, 0xc1d1
+// CHECK: [0xd1,0xc1,0x81,0xb6]
 
-s_addk_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x00,0xb7]
+s_addk_i32 s5, 0x3141
+// CHECK: [0x41,0x31,0x05,0xb7]
 
 s_addk_i32 s101, 0x3141
 // CHECK: [0x41,0x31,0x65,0xb7]
@@ -22761,11 +22922,11 @@ s_addk_i32 exec_lo, 0x3141
 s_addk_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0x7f,0xb7]
 
-s_addk_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x00,0xb7]
+s_addk_i32 s5, 0xc1d1
+// CHECK: [0xd1,0xc1,0x05,0xb7]
 
-s_mulk_i32 s0, 0x3141
-// CHECK: [0x41,0x31,0x80,0xb7]
+s_mulk_i32 s5, 0x3141
+// CHECK: [0x41,0x31,0x85,0xb7]
 
 s_mulk_i32 s101, 0x3141
 // CHECK: [0x41,0x31,0xe5,0xb7]
@@ -22806,15 +22967,15 @@ s_mulk_i32 exec_lo, 0x3141
 s_mulk_i32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0xff,0xb7]
 
-s_mulk_i32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x80,0xb7]
-
-s_cbranch_i_fork s[0:1], 12609
-// CHECK: [0x41,0x31,0x00,0xb8]
+s_mulk_i32 s5, 0xc1d1
+// CHECK: [0xd1,0xc1,0x85,0xb7]
 
 s_cbranch_i_fork s[2:3], 12609
 // CHECK: [0x41,0x31,0x02,0xb8]
 
+s_cbranch_i_fork s[4:5], 12609
+// CHECK: [0x41,0x31,0x04,0xb8]
+
 s_cbranch_i_fork s[100:101], 12609
 // CHECK: [0x41,0x31,0x64,0xb8]
 
@@ -22836,11 +22997,11 @@ s_cbranch_i_fork ttmp[10:11], 12609
 s_cbranch_i_fork exec, 12609
 // CHECK: [0x41,0x31,0x7e,0xb8]
 
-s_cbranch_i_fork s[0:1], 49617
-// CHECK: [0xd1,0xc1,0x00,0xb8]
+s_cbranch_i_fork s[2:3], 49617
+// CHECK: [0xd1,0xc1,0x02,0xb8]
 
-s_getreg_b32 s0, 0x3141
-// CHECK: [0x41,0x31,0x80,0xb8]
+s_getreg_b32 s5, 0x3141
+// CHECK: [0x41,0x31,0x85,0xb8]
 
 s_getreg_b32 s101, 0x3141
 // CHECK: [0x41,0x31,0xe5,0xb8]
@@ -22881,14 +23042,14 @@ s_getreg_b32 exec_lo, 0x3141
 s_getreg_b32 exec_hi, 0x3141
 // CHECK: [0x41,0x31,0xff,0xb8]
 
-s_getreg_b32 s0, 0xc1d1
-// CHECK: [0xd1,0xc1,0x80,0xb8]
+s_getreg_b32 s5, 0xc1d1
+// CHECK: [0xd1,0xc1,0x85,0xb8]
 
-s_setreg_b32 0x3141, s0
-// CHECK: [0x41,0x31,0x00,0xb9]
+s_setreg_b32 0x3141, s1
+// CHECK: [0x41,0x31,0x01,0xb9]
 
-s_setreg_b32 0xc1d1, s0
-// CHECK: [0xd1,0xc1,0x00,0xb9]
+s_setreg_b32 0xc1d1, s1
+// CHECK: [0xd1,0xc1,0x01,0xb9]
 
 s_setreg_b32 0x3141, s101
 // CHECK: [0x41,0x31,0x65,0xb9]
@@ -23064,83 +23225,89 @@ s_set_gpr_idx_mode 0x1
 s_set_gpr_idx_mode 0xF
 // CHECK: [0x0f,0x00,0x9d,0xbf]
 
-v_interp_p1_f32 v255, v0, attr0.x
-// CHECK: [0x00,0x00,0xfc,0xd7]
+v_interp_p1_f32 v5, v1, attr0.x
+// CHECK: [0x01,0x00,0x14,0xd4]
 
-v_interp_p1_f32 v255, v0, attr1.x
-// CHECK: [0x00,0x04,0xfc,0xd7]
+v_interp_p1_f32 v255, v1, attr0.x
+// CHECK: [0x01,0x00,0xfc,0xd7]
 
-v_interp_p1_f32 v255, v0, attr31.x
-// CHECK: [0x00,0x7c,0xfc,0xd7]
+v_interp_p1_f32 v5, v255, attr0.x
+// CHECK: [0xff,0x00,0x14,0xd4]
 
-v_interp_p1_f32 v255, v0, attr32.x
-// CHECK: [0x00,0x80,0xfc,0xd7]
+v_interp_p1_f32 v5, v1, attr1.x
+// CHECK: [0x01,0x04,0x14,0xd4]
 
-v_interp_p1_f32 v255, v0, attr0.y
-// CHECK: [0x00,0x01,0xfc,0xd7]
+v_interp_p1_f32 v5, v1, attr31.x
+// CHECK: [0x01,0x7c,0x14,0xd4]
 
-v_interp_p1_f32 v255, v0, attr0.z
-// CHECK: [0x00,0x02,0xfc,0xd7]
+v_interp_p1_f32 v5, v1, attr32.x
+// CHECK: [0x01,0x80,0x14,0xd4]
 
-v_interp_p1_f32 v255, v0, attr0.w
-// CHECK: [0x00,0x03,0xfc,0xd7]
+v_interp_p1_f32 v5, v1, attr0.y
+// CHECK: [0x01,0x01,0x14,0xd4]
 
-v_interp_p2_f32 v0, v0, attr0.x
-// CHECK: [0x00,0x00,0x01,0xd4]
+v_interp_p1_f32 v5, v1, attr0.z
+// CHECK: [0x01,0x02,0x14,0xd4]
 
-v_interp_p2_f32 v255, v0, attr0.x
-// CHECK: [0x00,0x00,0xfd,0xd7]
+v_interp_p1_f32 v5, v1, attr0.w
+// CHECK: [0x01,0x03,0x14,0xd4]
 
-v_interp_p2_f32 v0, v255, attr0.x
-// CHECK: [0xff,0x00,0x01,0xd4]
+v_interp_p2_f32 v5, v1, attr0.x
+// CHECK: [0x01,0x00,0x15,0xd4]
 
-v_interp_p2_f32 v0, v0, attr1.x
-// CHECK: [0x00,0x04,0x01,0xd4]
+v_interp_p2_f32 v255, v1, attr0.x
+// CHECK: [0x01,0x00,0xfd,0xd7]
 
-v_interp_p2_f32 v0, v0, attr31.x
-// CHECK: [0x00,0x7c,0x01,0xd4]
+v_interp_p2_f32 v5, v255, attr0.x
+// CHECK: [0xff,0x00,0x15,0xd4]
 
-v_interp_p2_f32 v0, v0, attr32.x
-// CHECK: [0x00,0x80,0x01,0xd4]
+v_interp_p2_f32 v5, v1, attr1.x
+// CHECK: [0x01,0x04,0x15,0xd4]
 
-v_interp_p2_f32 v0, v0, attr0.y
-// CHECK: [0x00,0x01,0x01,0xd4]
+v_interp_p2_f32 v5, v1, attr31.x
+// CHECK: [0x01,0x7c,0x15,0xd4]
 
-v_interp_p2_f32 v0, v0, attr0.z
-// CHECK: [0x00,0x02,0x01,0xd4]
+v_interp_p2_f32 v5, v1, attr32.x
+// CHECK: [0x01,0x80,0x15,0xd4]
 
-v_interp_p2_f32 v0, v0, attr0.w
-// CHECK: [0x00,0x03,0x01,0xd4]
+v_interp_p2_f32 v5, v1, attr0.y
+// CHECK: [0x01,0x01,0x15,0xd4]
 
-v_interp_mov_f32 v0, p10, attr0.x
-// CHECK: [0x00,0x00,0x02,0xd4]
+v_interp_p2_f32 v5, v1, attr0.z
+// CHECK: [0x01,0x02,0x15,0xd4]
+
+v_interp_p2_f32 v5, v1, attr0.w
+// CHECK: [0x01,0x03,0x15,0xd4]
+
+v_interp_mov_f32 v5, p10, attr0.x
+// CHECK: [0x00,0x00,0x16,0xd4]
 
 v_interp_mov_f32 v255, p10, attr0.x
 // CHECK: [0x00,0x00,0xfe,0xd7]
 
-v_interp_mov_f32 v0, p20, attr0.x
-// CHECK: [0x01,0x00,0x02,0xd4]
+v_interp_mov_f32 v5, p20, attr0.x
+// CHECK: [0x01,0x00,0x16,0xd4]
 
-v_interp_mov_f32 v0, p0, attr0.x
-// CHECK: [0x02,0x00,0x02,0xd4]
+v_interp_mov_f32 v5, p0, attr0.x
+// CHECK: [0x02,0x00,0x16,0xd4]
 
-v_interp_mov_f32 v0, p10, attr1.x
-// CHECK: [0x00,0x04,0x02,0xd4]
+v_interp_mov_f32 v5, p10, attr1.x
+// CHECK: [0x00,0x04,0x16,0xd4]
 
-v_interp_mov_f32 v0, p10, attr31.x
-// CHECK: [0x00,0x7c,0x02,0xd4]
+v_interp_mov_f32 v5, p10, attr31.x
+// CHECK: [0x00,0x7c,0x16,0xd4]
 
-v_interp_mov_f32 v0, p10, attr32.x
-// CHECK: [0x00,0x80,0x02,0xd4]
+v_interp_mov_f32 v5, p10, attr32.x
+// CHECK: [0x00,0x80,0x16,0xd4]
 
-v_interp_mov_f32 v0, p10, attr0.y
-// CHECK: [0x00,0x01,0x02,0xd4]
+v_interp_mov_f32 v5, p10, attr0.y
+// CHECK: [0x00,0x01,0x16,0xd4]
 
-v_interp_mov_f32 v0, p10, attr0.z
-// CHECK: [0x00,0x02,0x02,0xd4]
+v_interp_mov_f32 v5, p10, attr0.z
+// CHECK: [0x00,0x02,0x16,0xd4]
 
-v_interp_mov_f32 v0, p10, attr0.w
-// CHECK: [0x00,0x03,0x02,0xd4]
+v_interp_mov_f32 v5, p10, attr0.w
+// CHECK: [0x00,0x03,0x16,0xd4]
 
 v_nop
 // CHECK: [0x00,0x00,0x00,0x7e]
@@ -23148,6497 +23315,6506 @@ v_nop
 v_nop_e64
 // CHECK: [0x00,0x00,0x40,0xd1,0x00,0x00,0x00,0x00]
 
-v_mov_b32 v0, s0
-// CHECK: [0x00,0x02,0x00,0x7e]
+v_mov_b32 v5, s1
+// CHECK: [0x01,0x02,0x0a,0x7e]
+
+v_mov_b32 v255, s1
+// CHECK: [0x01,0x02,0xfe,0x7f]
+
+v_mov_b32 v5, s101
+// CHECK: [0x65,0x02,0x0a,0x7e]
 
-v_mov_b32 v255, s0
-// CHECK: [0x00,0x02,0xfe,0x7f]
+v_mov_b32 v5, flat_scratch_lo
+// CHECK: [0x66,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, s101
-// CHECK: [0x65,0x02,0x00,0x7e]
+v_mov_b32 v5, flat_scratch_hi
+// CHECK: [0x67,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, flat_scratch_lo
-// CHECK: [0x66,0x02,0x00,0x7e]
+v_mov_b32 v5, vcc_lo
+// CHECK: [0x6a,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, flat_scratch_hi
-// CHECK: [0x67,0x02,0x00,0x7e]
+v_mov_b32 v5, vcc_hi
+// CHECK: [0x6b,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, vcc_lo
-// CHECK: [0x6a,0x02,0x00,0x7e]
+v_mov_b32 v5, tba_lo
+// CHECK: [0x6c,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, vcc_hi
-// CHECK: [0x6b,0x02,0x00,0x7e]
+v_mov_b32 v5, tba_hi
+// CHECK: [0x6d,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, tba_lo
-// CHECK: [0x6c,0x02,0x00,0x7e]
+v_mov_b32 v5, tma_lo
+// CHECK: [0x6e,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, tba_hi
-// CHECK: [0x6d,0x02,0x00,0x7e]
+v_mov_b32 v5, tma_hi
+// CHECK: [0x6f,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, tma_lo
-// CHECK: [0x6e,0x02,0x00,0x7e]
+v_mov_b32 v5, ttmp11
+// CHECK: [0x7b,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, tma_hi
-// CHECK: [0x6f,0x02,0x00,0x7e]
+v_mov_b32 v5, m0
+// CHECK: [0x7c,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, ttmp11
-// CHECK: [0x7b,0x02,0x00,0x7e]
+v_mov_b32 v5, exec_lo
+// CHECK: [0x7e,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, m0
-// CHECK: [0x7c,0x02,0x00,0x7e]
+v_mov_b32 v5, exec_hi
+// CHECK: [0x7f,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, exec_lo
-// CHECK: [0x7e,0x02,0x00,0x7e]
+v_mov_b32 v5, 0
+// CHECK: [0x80,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, exec_hi
-// CHECK: [0x7f,0x02,0x00,0x7e]
+v_mov_b32 v5, -1
+// CHECK: [0xc1,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, 0
-// CHECK: [0x80,0x02,0x00,0x7e]
+v_mov_b32 v5, 0.5
+// CHECK: [0xf0,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, -1
-// CHECK: [0xc1,0x02,0x00,0x7e]
+v_mov_b32 v5, -4.0
+// CHECK: [0xf7,0x02,0x0a,0x7e]
 
-v_mov_b32 v0, 0.5
-// CHECK: [0xf0,0x02,0x00,0x7e]
+v_mov_b32 v5, 0xaf123456
+// CHECK: [0xff,0x02,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_mov_b32 v0, -4.0
-// CHECK: [0xf7,0x02,0x00,0x7e]
+v_mov_b32 v5, 0x3f717273
+// CHECK: [0xff,0x02,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_mov_b32 v0, 0xaf123456
-// CHECK: [0xff,0x02,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_mov_b32 v5, v1
+// CHECK: [0x01,0x03,0x0a,0x7e]
 
-v_mov_b32 v0, 0x3f717273
-// CHECK: [0xff,0x02,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_mov_b32 v5, v255
+// CHECK: [0xff,0x03,0x0a,0x7e]
 
-v_mov_b32 v0, v0
-// CHECK: [0x00,0x03,0x00,0x7e]
+v_mov_b32_e64 v5, s1
+// CHECK: [0x05,0x00,0x41,0xd1,0x01,0x00,0x00,0x00]
 
-v_mov_b32 v0, v255
-// CHECK: [0xff,0x03,0x00,0x7e]
+v_mov_b32_e64 v255, s1
+// CHECK: [0xff,0x00,0x41,0xd1,0x01,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, s0
-// CHECK: [0x00,0x00,0x41,0xd1,0x00,0x00,0x00,0x00]
+v_mov_b32_e64 v5, s101
+// CHECK: [0x05,0x00,0x41,0xd1,0x65,0x00,0x00,0x00]
 
-v_mov_b32_e64 v255, s0
-// CHECK: [0xff,0x00,0x41,0xd1,0x00,0x00,0x00,0x00]
+v_mov_b32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x41,0xd1,0x66,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, s101
-// CHECK: [0x00,0x00,0x41,0xd1,0x65,0x00,0x00,0x00]
+v_mov_b32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x41,0xd1,0x67,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x41,0xd1,0x66,0x00,0x00,0x00]
+v_mov_b32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x41,0xd1,0x6a,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x41,0xd1,0x67,0x00,0x00,0x00]
+v_mov_b32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x41,0xd1,0x6b,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x41,0xd1,0x6a,0x00,0x00,0x00]
+v_mov_b32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x41,0xd1,0x6c,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x41,0xd1,0x6b,0x00,0x00,0x00]
+v_mov_b32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x41,0xd1,0x6d,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x41,0xd1,0x6c,0x00,0x00,0x00]
+v_mov_b32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x41,0xd1,0x6e,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x41,0xd1,0x6d,0x00,0x00,0x00]
+v_mov_b32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x41,0xd1,0x6f,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x41,0xd1,0x6e,0x00,0x00,0x00]
+v_mov_b32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x41,0xd1,0x7b,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x41,0xd1,0x6f,0x00,0x00,0x00]
+v_mov_b32_e64 v5, m0
+// CHECK: [0x05,0x00,0x41,0xd1,0x7c,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x41,0xd1,0x7b,0x00,0x00,0x00]
+v_mov_b32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x41,0xd1,0x7e,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, m0
-// CHECK: [0x00,0x00,0x41,0xd1,0x7c,0x00,0x00,0x00]
+v_mov_b32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x41,0xd1,0x7f,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x41,0xd1,0x7e,0x00,0x00,0x00]
+v_mov_b32_e64 v5, 0
+// CHECK: [0x05,0x00,0x41,0xd1,0x80,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x41,0xd1,0x7f,0x00,0x00,0x00]
+v_mov_b32_e64 v5, -1
+// CHECK: [0x05,0x00,0x41,0xd1,0xc1,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, 0
-// CHECK: [0x00,0x00,0x41,0xd1,0x80,0x00,0x00,0x00]
+v_mov_b32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x41,0xd1,0xf0,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, -1
-// CHECK: [0x00,0x00,0x41,0xd1,0xc1,0x00,0x00,0x00]
+v_mov_b32_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x41,0xd1,0xf7,0x00,0x00,0x00]
 
-v_mov_b32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x41,0xd1,0xf0,0x00,0x00,0x00]
+v_mov_b32_e64 v5, v1
+// CHECK: [0x05,0x00,0x41,0xd1,0x01,0x01,0x00,0x00]
 
-v_mov_b32_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x41,0xd1,0xf7,0x00,0x00,0x00]
+v_mov_b32_e64 v5, v255
+// CHECK: [0x05,0x00,0x41,0xd1,0xff,0x01,0x00,0x00]
 
-v_mov_b32_e64 v0, v0
-// CHECK: [0x00,0x00,0x41,0xd1,0x00,0x01,0x00,0x00]
+v_readfirstlane_b32 s5, v1
+// CHECK: [0x01,0x05,0x0a,0x7e]
 
-v_mov_b32_e64 v0, v255
-// CHECK: [0x00,0x00,0x41,0xd1,0xff,0x01,0x00,0x00]
+v_readfirstlane_b32 s101, v1
+// CHECK: [0x01,0x05,0xca,0x7e]
 
-v_readfirstlane_b32 s0, v0
-// CHECK: [0x00,0x05,0x00,0x7e]
+v_readfirstlane_b32 flat_scratch_lo, v1
+// CHECK: [0x01,0x05,0xcc,0x7e]
 
-v_readfirstlane_b32 s101, v0
-// CHECK: [0x00,0x05,0xca,0x7e]
+v_readfirstlane_b32 flat_scratch_hi, v1
+// CHECK: [0x01,0x05,0xce,0x7e]
 
-v_readfirstlane_b32 flat_scratch_lo, v0
-// CHECK: [0x00,0x05,0xcc,0x7e]
+v_readfirstlane_b32 tba_lo, v1
+// CHECK: [0x01,0x05,0xd8,0x7e]
 
-v_readfirstlane_b32 flat_scratch_hi, v0
-// CHECK: [0x00,0x05,0xce,0x7e]
+v_readfirstlane_b32 tba_hi, v1
+// CHECK: [0x01,0x05,0xda,0x7e]
 
-v_readfirstlane_b32 tba_lo, v0
-// CHECK: [0x00,0x05,0xd8,0x7e]
+v_readfirstlane_b32 tma_lo, v1
+// CHECK: [0x01,0x05,0xdc,0x7e]
 
-v_readfirstlane_b32 tba_hi, v0
-// CHECK: [0x00,0x05,0xda,0x7e]
+v_readfirstlane_b32 tma_hi, v1
+// CHECK: [0x01,0x05,0xde,0x7e]
 
-v_readfirstlane_b32 tma_lo, v0
-// CHECK: [0x00,0x05,0xdc,0x7e]
+v_readfirstlane_b32 ttmp11, v1
+// CHECK: [0x01,0x05,0xf6,0x7e]
 
-v_readfirstlane_b32 tma_hi, v0
-// CHECK: [0x00,0x05,0xde,0x7e]
+v_readfirstlane_b32 s5, v255
+// CHECK: [0xff,0x05,0x0a,0x7e]
 
-v_readfirstlane_b32 ttmp11, v0
-// CHECK: [0x00,0x05,0xf6,0x7e]
+v_cvt_i32_f64 v5, s[2:3]
+// CHECK: [0x02,0x06,0x0a,0x7e]
 
-v_readfirstlane_b32 s0, v255
-// CHECK: [0xff,0x05,0x00,0x7e]
+v_cvt_i32_f64 v255, s[2:3]
+// CHECK: [0x02,0x06,0xfe,0x7f]
 
-v_cvt_i32_f64 v0, s[0:1]
-// CHECK: [0x00,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, s[4:5]
+// CHECK: [0x04,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v255, s[0:1]
-// CHECK: [0x00,0x06,0xfe,0x7f]
+v_cvt_i32_f64 v5, s[100:101]
+// CHECK: [0x64,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, s[2:3]
-// CHECK: [0x02,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, flat_scratch
+// CHECK: [0x66,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, s[100:101]
-// CHECK: [0x64,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, vcc
+// CHECK: [0x6a,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, flat_scratch
-// CHECK: [0x66,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, tba
+// CHECK: [0x6c,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, vcc
-// CHECK: [0x6a,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, tma
+// CHECK: [0x6e,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, tba
-// CHECK: [0x6c,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, ttmp[10:11]
+// CHECK: [0x7a,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, tma
-// CHECK: [0x6e,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, exec
+// CHECK: [0x7e,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, ttmp[10:11]
-// CHECK: [0x7a,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, 0
+// CHECK: [0x80,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, exec
-// CHECK: [0x7e,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, -1
+// CHECK: [0xc1,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, 0
-// CHECK: [0x80,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, 0.5
+// CHECK: [0xf0,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, -1
-// CHECK: [0xc1,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, -4.0
+// CHECK: [0xf7,0x06,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, 0.5
-// CHECK: [0xf0,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, 0xaf123456
+// CHECK: [0xff,0x06,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_i32_f64 v0, -4.0
-// CHECK: [0xf7,0x06,0x00,0x7e]
+v_cvt_i32_f64 v5, 0x3f717273
+// CHECK: [0xff,0x06,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_i32_f64 v0, 0xaf123456
-// CHECK: [0xff,0x06,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_i32_f64 v5, v[1:2]
+// CHECK: [0x01,0x07,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, 0x3f717273
-// CHECK: [0xff,0x06,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_i32_f64 v5, v[254:255]
+// CHECK: [0xfe,0x07,0x0a,0x7e]
 
-v_cvt_i32_f64 v0, v[0:1]
-// CHECK: [0x00,0x07,0x00,0x7e]
+v_cvt_i32_f64_e64 v5, s[2:3]
+// CHECK: [0x05,0x00,0x43,0xd1,0x02,0x00,0x00,0x00]
 
-v_cvt_i32_f64 v0, v[254:255]
-// CHECK: [0xfe,0x07,0x00,0x7e]
+v_cvt_i32_f64_e64 v255, s[2:3]
+// CHECK: [0xff,0x00,0x43,0xd1,0x02,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, s[0:1]
-// CHECK: [0x00,0x00,0x43,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v5, s[4:5]
+// CHECK: [0x05,0x00,0x43,0xd1,0x04,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v255, s[0:1]
-// CHECK: [0xff,0x00,0x43,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v5, s[100:101]
+// CHECK: [0x05,0x00,0x43,0xd1,0x64,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, s[2:3]
-// CHECK: [0x00,0x00,0x43,0xd1,0x02,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v5, flat_scratch
+// CHECK: [0x05,0x00,0x43,0xd1,0x66,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, s[100:101]
-// CHECK: [0x00,0x00,0x43,0xd1,0x64,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v5, vcc
+// CHECK: [0x05,0x00,0x43,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, flat_scratch
-// CHECK: [0x00,0x00,0x43,0xd1,0x66,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v5, tba
+// CHECK: [0x05,0x00,0x43,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, vcc
-// CHECK: [0x00,0x00,0x43,0xd1,0x6a,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v5, tma
+// CHECK: [0x05,0x00,0x43,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, tba
-// CHECK: [0x00,0x00,0x43,0xd1,0x6c,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v5, ttmp[10:11]
+// CHECK: [0x05,0x00,0x43,0xd1,0x7a,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, tma
-// CHECK: [0x00,0x00,0x43,0xd1,0x6e,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v5, exec
+// CHECK: [0x05,0x00,0x43,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, ttmp[10:11]
-// CHECK: [0x00,0x00,0x43,0xd1,0x7a,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v5, scc
+// CHECK: [0x05,0x00,0x43,0xd1,0xfd,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, exec
-// CHECK: [0x00,0x00,0x43,0xd1,0x7e,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v5, v[1:2]
+// CHECK: [0x05,0x00,0x43,0xd1,0x01,0x01,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, scc
-// CHECK: [0x00,0x00,0x43,0xd1,0xfd,0x00,0x00,0x00]
+v_cvt_i32_f64_e64 v5, v[254:255]
+// CHECK: [0x05,0x00,0x43,0xd1,0xfe,0x01,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x43,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_i32_f64_e64 v5, -s[2:3]
+// CHECK: [0x05,0x00,0x43,0xd1,0x02,0x00,0x00,0x20]
 
-v_cvt_i32_f64_e64 v0, v[254:255]
-// CHECK: [0x00,0x00,0x43,0xd1,0xfe,0x01,0x00,0x00]
+v_cvt_i32_f64_e64 v5, |s[2:3]|
+// CHECK: [0x05,0x01,0x43,0xd1,0x02,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, -s[0:1]
-// CHECK: [0x00,0x00,0x43,0xd1,0x00,0x00,0x00,0x20]
+v_cvt_i32_f64_e64 v5, s[2:3] clamp
+// CHECK: [0x05,0x80,0x43,0xd1,0x02,0x00,0x00,0x00]
 
-v_cvt_i32_f64_e64 v0, |s[0:1]|
-// CHECK: [0x00,0x01,0x43,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f64_i32 v[5:6], s1
+// CHECK: [0x01,0x08,0x0a,0x7e]
 
-v_cvt_i32_f64_e64 v0, s[0:1] clamp
-// CHECK: [0x00,0x80,0x43,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f64_i32 v[254:255], s1
+// CHECK: [0x01,0x08,0xfc,0x7f]
 
-v_cvt_f64_i32 v[0:1], s0
-// CHECK: [0x00,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], s101
+// CHECK: [0x65,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[254:255], s0
-// CHECK: [0x00,0x08,0xfc,0x7f]
+v_cvt_f64_i32 v[5:6], flat_scratch_lo
+// CHECK: [0x66,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], s101
-// CHECK: [0x65,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], flat_scratch_hi
+// CHECK: [0x67,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], flat_scratch_lo
-// CHECK: [0x66,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], vcc_lo
+// CHECK: [0x6a,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], flat_scratch_hi
-// CHECK: [0x67,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], vcc_hi
+// CHECK: [0x6b,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], vcc_lo
-// CHECK: [0x6a,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], tba_lo
+// CHECK: [0x6c,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], vcc_hi
-// CHECK: [0x6b,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], tba_hi
+// CHECK: [0x6d,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], tba_lo
-// CHECK: [0x6c,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], tma_lo
+// CHECK: [0x6e,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], tba_hi
-// CHECK: [0x6d,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], tma_hi
+// CHECK: [0x6f,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], tma_lo
-// CHECK: [0x6e,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], ttmp11
+// CHECK: [0x7b,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], tma_hi
-// CHECK: [0x6f,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], m0
+// CHECK: [0x7c,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], ttmp11
-// CHECK: [0x7b,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], exec_lo
+// CHECK: [0x7e,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], m0
-// CHECK: [0x7c,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], exec_hi
+// CHECK: [0x7f,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], exec_lo
-// CHECK: [0x7e,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], 0
+// CHECK: [0x80,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], exec_hi
-// CHECK: [0x7f,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], -1
+// CHECK: [0xc1,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], 0
-// CHECK: [0x80,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], 0.5
+// CHECK: [0xf0,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], -1
-// CHECK: [0xc1,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], -4.0
+// CHECK: [0xf7,0x08,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], 0.5
-// CHECK: [0xf0,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], 0xaf123456
+// CHECK: [0xff,0x08,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f64_i32 v[0:1], -4.0
-// CHECK: [0xf7,0x08,0x00,0x7e]
+v_cvt_f64_i32 v[5:6], 0x3f717273
+// CHECK: [0xff,0x08,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f64_i32 v[0:1], 0xaf123456
-// CHECK: [0xff,0x08,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f64_i32 v[5:6], v1
+// CHECK: [0x01,0x09,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], 0x3f717273
-// CHECK: [0xff,0x08,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f64_i32 v[5:6], v255
+// CHECK: [0xff,0x09,0x0a,0x7e]
 
-v_cvt_f64_i32 v[0:1], v0
-// CHECK: [0x00,0x09,0x00,0x7e]
+v_cvt_f64_i32_e64 v[5:6], s1
+// CHECK: [0x05,0x00,0x44,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f64_i32 v[0:1], v255
-// CHECK: [0xff,0x09,0x00,0x7e]
+v_cvt_f64_i32_e64 v[254:255], s1
+// CHECK: [0xfe,0x00,0x44,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], s0
-// CHECK: [0x00,0x00,0x44,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], s101
+// CHECK: [0x05,0x00,0x44,0xd1,0x65,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[254:255], s0
-// CHECK: [0xfe,0x00,0x44,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], flat_scratch_lo
+// CHECK: [0x05,0x00,0x44,0xd1,0x66,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], s101
-// CHECK: [0x00,0x00,0x44,0xd1,0x65,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], flat_scratch_hi
+// CHECK: [0x05,0x00,0x44,0xd1,0x67,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], flat_scratch_lo
-// CHECK: [0x00,0x00,0x44,0xd1,0x66,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], vcc_lo
+// CHECK: [0x05,0x00,0x44,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], flat_scratch_hi
-// CHECK: [0x00,0x00,0x44,0xd1,0x67,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], vcc_hi
+// CHECK: [0x05,0x00,0x44,0xd1,0x6b,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], vcc_lo
-// CHECK: [0x00,0x00,0x44,0xd1,0x6a,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], tba_lo
+// CHECK: [0x05,0x00,0x44,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], vcc_hi
-// CHECK: [0x00,0x00,0x44,0xd1,0x6b,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], tba_hi
+// CHECK: [0x05,0x00,0x44,0xd1,0x6d,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], tba_lo
-// CHECK: [0x00,0x00,0x44,0xd1,0x6c,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], tma_lo
+// CHECK: [0x05,0x00,0x44,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], tba_hi
-// CHECK: [0x00,0x00,0x44,0xd1,0x6d,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], tma_hi
+// CHECK: [0x05,0x00,0x44,0xd1,0x6f,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], tma_lo
-// CHECK: [0x00,0x00,0x44,0xd1,0x6e,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], ttmp11
+// CHECK: [0x05,0x00,0x44,0xd1,0x7b,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], tma_hi
-// CHECK: [0x00,0x00,0x44,0xd1,0x6f,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], m0
+// CHECK: [0x05,0x00,0x44,0xd1,0x7c,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], ttmp11
-// CHECK: [0x00,0x00,0x44,0xd1,0x7b,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], exec_lo
+// CHECK: [0x05,0x00,0x44,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], m0
-// CHECK: [0x00,0x00,0x44,0xd1,0x7c,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], exec_hi
+// CHECK: [0x05,0x00,0x44,0xd1,0x7f,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], exec_lo
-// CHECK: [0x00,0x00,0x44,0xd1,0x7e,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], 0
+// CHECK: [0x05,0x00,0x44,0xd1,0x80,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], exec_hi
-// CHECK: [0x00,0x00,0x44,0xd1,0x7f,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], -1
+// CHECK: [0x05,0x00,0x44,0xd1,0xc1,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], 0
-// CHECK: [0x00,0x00,0x44,0xd1,0x80,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], 0.5
+// CHECK: [0x05,0x00,0x44,0xd1,0xf0,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], -1
-// CHECK: [0x00,0x00,0x44,0xd1,0xc1,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], -4.0
+// CHECK: [0x05,0x00,0x44,0xd1,0xf7,0x00,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], 0.5
-// CHECK: [0x00,0x00,0x44,0xd1,0xf0,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], v1
+// CHECK: [0x05,0x00,0x44,0xd1,0x01,0x01,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], -4.0
-// CHECK: [0x00,0x00,0x44,0xd1,0xf7,0x00,0x00,0x00]
+v_cvt_f64_i32_e64 v[5:6], v255
+// CHECK: [0x05,0x00,0x44,0xd1,0xff,0x01,0x00,0x00]
 
-v_cvt_f64_i32_e64 v[0:1], v0
-// CHECK: [0x00,0x00,0x44,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_f32_i32 v5, s1
+// CHECK: [0x01,0x0a,0x0a,0x7e]
 
-v_cvt_f64_i32_e64 v[0:1], v255
-// CHECK: [0x00,0x00,0x44,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_f32_i32 v255, s1
+// CHECK: [0x01,0x0a,0xfe,0x7f]
 
-v_cvt_f32_i32 v0, s0
-// CHECK: [0x00,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, s101
+// CHECK: [0x65,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v255, s0
-// CHECK: [0x00,0x0a,0xfe,0x7f]
+v_cvt_f32_i32 v5, flat_scratch_lo
+// CHECK: [0x66,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, s101
-// CHECK: [0x65,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, flat_scratch_hi
+// CHECK: [0x67,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, flat_scratch_lo
-// CHECK: [0x66,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, vcc_lo
+// CHECK: [0x6a,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, flat_scratch_hi
-// CHECK: [0x67,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, vcc_hi
+// CHECK: [0x6b,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, vcc_lo
-// CHECK: [0x6a,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, tba_lo
+// CHECK: [0x6c,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, vcc_hi
-// CHECK: [0x6b,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, tba_hi
+// CHECK: [0x6d,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, tba_lo
-// CHECK: [0x6c,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, tma_lo
+// CHECK: [0x6e,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, tba_hi
-// CHECK: [0x6d,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, tma_hi
+// CHECK: [0x6f,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, tma_lo
-// CHECK: [0x6e,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, ttmp11
+// CHECK: [0x7b,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, tma_hi
-// CHECK: [0x6f,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, m0
+// CHECK: [0x7c,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, ttmp11
-// CHECK: [0x7b,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, exec_lo
+// CHECK: [0x7e,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, m0
-// CHECK: [0x7c,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, exec_hi
+// CHECK: [0x7f,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, exec_lo
-// CHECK: [0x7e,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, 0
+// CHECK: [0x80,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, exec_hi
-// CHECK: [0x7f,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, -1
+// CHECK: [0xc1,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, 0
-// CHECK: [0x80,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, 0.5
+// CHECK: [0xf0,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, -1
-// CHECK: [0xc1,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, -4.0
+// CHECK: [0xf7,0x0a,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, 0.5
-// CHECK: [0xf0,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, 0xaf123456
+// CHECK: [0xff,0x0a,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f32_i32 v0, -4.0
-// CHECK: [0xf7,0x0a,0x00,0x7e]
+v_cvt_f32_i32 v5, 0x3f717273
+// CHECK: [0xff,0x0a,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f32_i32 v0, 0xaf123456
-// CHECK: [0xff,0x0a,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f32_i32 v5, v1
+// CHECK: [0x01,0x0b,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, 0x3f717273
-// CHECK: [0xff,0x0a,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f32_i32 v5, v255
+// CHECK: [0xff,0x0b,0x0a,0x7e]
 
-v_cvt_f32_i32 v0, v0
-// CHECK: [0x00,0x0b,0x00,0x7e]
+v_cvt_f32_i32_e64 v5, s1
+// CHECK: [0x05,0x00,0x45,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_i32 v0, v255
-// CHECK: [0xff,0x0b,0x00,0x7e]
+v_cvt_f32_i32_e64 v255, s1
+// CHECK: [0xff,0x00,0x45,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, s0
-// CHECK: [0x00,0x00,0x45,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, s101
+// CHECK: [0x05,0x00,0x45,0xd1,0x65,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v255, s0
-// CHECK: [0xff,0x00,0x45,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x45,0xd1,0x66,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, s101
-// CHECK: [0x00,0x00,0x45,0xd1,0x65,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x45,0xd1,0x67,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x45,0xd1,0x66,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x45,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x45,0xd1,0x67,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x45,0xd1,0x6b,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x45,0xd1,0x6a,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x45,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x45,0xd1,0x6b,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x45,0xd1,0x6d,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x45,0xd1,0x6c,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x45,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x45,0xd1,0x6d,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x45,0xd1,0x6f,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x45,0xd1,0x6e,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x45,0xd1,0x7b,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x45,0xd1,0x6f,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, m0
+// CHECK: [0x05,0x00,0x45,0xd1,0x7c,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x45,0xd1,0x7b,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x45,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, m0
-// CHECK: [0x00,0x00,0x45,0xd1,0x7c,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x45,0xd1,0x7f,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x45,0xd1,0x7e,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, 0
+// CHECK: [0x05,0x00,0x45,0xd1,0x80,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x45,0xd1,0x7f,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, -1
+// CHECK: [0x05,0x00,0x45,0xd1,0xc1,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, 0
-// CHECK: [0x00,0x00,0x45,0xd1,0x80,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x45,0xd1,0xf0,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, -1
-// CHECK: [0x00,0x00,0x45,0xd1,0xc1,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x45,0xd1,0xf7,0x00,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x45,0xd1,0xf0,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, v1
+// CHECK: [0x05,0x00,0x45,0xd1,0x01,0x01,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x45,0xd1,0xf7,0x00,0x00,0x00]
+v_cvt_f32_i32_e64 v5, v255
+// CHECK: [0x05,0x00,0x45,0xd1,0xff,0x01,0x00,0x00]
 
-v_cvt_f32_i32_e64 v0, v0
-// CHECK: [0x00,0x00,0x45,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_f32_u32 v5, s1
+// CHECK: [0x01,0x0c,0x0a,0x7e]
 
-v_cvt_f32_i32_e64 v0, v255
-// CHECK: [0x00,0x00,0x45,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_f32_u32 v255, s1
+// CHECK: [0x01,0x0c,0xfe,0x7f]
 
-v_cvt_f32_u32 v0, s0
-// CHECK: [0x00,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, s101
+// CHECK: [0x65,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v255, s0
-// CHECK: [0x00,0x0c,0xfe,0x7f]
+v_cvt_f32_u32 v5, flat_scratch_lo
+// CHECK: [0x66,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, s101
-// CHECK: [0x65,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, flat_scratch_hi
+// CHECK: [0x67,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, flat_scratch_lo
-// CHECK: [0x66,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, vcc_lo
+// CHECK: [0x6a,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, flat_scratch_hi
-// CHECK: [0x67,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, vcc_hi
+// CHECK: [0x6b,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, vcc_lo
-// CHECK: [0x6a,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, tba_lo
+// CHECK: [0x6c,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, vcc_hi
-// CHECK: [0x6b,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, tba_hi
+// CHECK: [0x6d,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, tba_lo
-// CHECK: [0x6c,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, tma_lo
+// CHECK: [0x6e,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, tba_hi
-// CHECK: [0x6d,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, tma_hi
+// CHECK: [0x6f,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, tma_lo
-// CHECK: [0x6e,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, ttmp11
+// CHECK: [0x7b,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, tma_hi
-// CHECK: [0x6f,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, m0
+// CHECK: [0x7c,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, ttmp11
-// CHECK: [0x7b,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, exec_lo
+// CHECK: [0x7e,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, m0
-// CHECK: [0x7c,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, exec_hi
+// CHECK: [0x7f,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, exec_lo
-// CHECK: [0x7e,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, 0
+// CHECK: [0x80,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, exec_hi
-// CHECK: [0x7f,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, -1
+// CHECK: [0xc1,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, 0
-// CHECK: [0x80,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, 0.5
+// CHECK: [0xf0,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, -1
-// CHECK: [0xc1,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, -4.0
+// CHECK: [0xf7,0x0c,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, 0.5
-// CHECK: [0xf0,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, 0xaf123456
+// CHECK: [0xff,0x0c,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f32_u32 v0, -4.0
-// CHECK: [0xf7,0x0c,0x00,0x7e]
+v_cvt_f32_u32 v5, 0x3f717273
+// CHECK: [0xff,0x0c,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f32_u32 v0, 0xaf123456
-// CHECK: [0xff,0x0c,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f32_u32 v5, v1
+// CHECK: [0x01,0x0d,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, 0x3f717273
-// CHECK: [0xff,0x0c,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f32_u32 v5, v255
+// CHECK: [0xff,0x0d,0x0a,0x7e]
 
-v_cvt_f32_u32 v0, v0
-// CHECK: [0x00,0x0d,0x00,0x7e]
+v_cvt_f32_u32_e64 v5, s1
+// CHECK: [0x05,0x00,0x46,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_u32 v0, v255
-// CHECK: [0xff,0x0d,0x00,0x7e]
+v_cvt_f32_u32_e64 v255, s1
+// CHECK: [0xff,0x00,0x46,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, s0
-// CHECK: [0x00,0x00,0x46,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, s101
+// CHECK: [0x05,0x00,0x46,0xd1,0x65,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v255, s0
-// CHECK: [0xff,0x00,0x46,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x46,0xd1,0x66,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, s101
-// CHECK: [0x00,0x00,0x46,0xd1,0x65,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x46,0xd1,0x67,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x46,0xd1,0x66,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x46,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x46,0xd1,0x67,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x46,0xd1,0x6b,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x46,0xd1,0x6a,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x46,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x46,0xd1,0x6b,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x46,0xd1,0x6d,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x46,0xd1,0x6c,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x46,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x46,0xd1,0x6d,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x46,0xd1,0x6f,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x46,0xd1,0x6e,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x46,0xd1,0x7b,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x46,0xd1,0x6f,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, m0
+// CHECK: [0x05,0x00,0x46,0xd1,0x7c,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x46,0xd1,0x7b,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x46,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, m0
-// CHECK: [0x00,0x00,0x46,0xd1,0x7c,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x46,0xd1,0x7f,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x46,0xd1,0x7e,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, 0
+// CHECK: [0x05,0x00,0x46,0xd1,0x80,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x46,0xd1,0x7f,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, -1
+// CHECK: [0x05,0x00,0x46,0xd1,0xc1,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, 0
-// CHECK: [0x00,0x00,0x46,0xd1,0x80,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x46,0xd1,0xf0,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, -1
-// CHECK: [0x00,0x00,0x46,0xd1,0xc1,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x46,0xd1,0xf7,0x00,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x46,0xd1,0xf0,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, v1
+// CHECK: [0x05,0x00,0x46,0xd1,0x01,0x01,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x46,0xd1,0xf7,0x00,0x00,0x00]
+v_cvt_f32_u32_e64 v5, v255
+// CHECK: [0x05,0x00,0x46,0xd1,0xff,0x01,0x00,0x00]
 
-v_cvt_f32_u32_e64 v0, v0
-// CHECK: [0x00,0x00,0x46,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_u32_f32 v5, s1
+// CHECK: [0x01,0x0e,0x0a,0x7e]
 
-v_cvt_f32_u32_e64 v0, v255
-// CHECK: [0x00,0x00,0x46,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_u32_f32 v255, s1
+// CHECK: [0x01,0x0e,0xfe,0x7f]
 
-v_cvt_u32_f32 v0, s0
-// CHECK: [0x00,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, s101
+// CHECK: [0x65,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v255, s0
-// CHECK: [0x00,0x0e,0xfe,0x7f]
+v_cvt_u32_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, s101
-// CHECK: [0x65,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, vcc_lo
+// CHECK: [0x6a,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, vcc_hi
+// CHECK: [0x6b,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, vcc_lo
-// CHECK: [0x6a,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, tba_lo
+// CHECK: [0x6c,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, vcc_hi
-// CHECK: [0x6b,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, tba_hi
+// CHECK: [0x6d,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, tba_lo
-// CHECK: [0x6c,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, tma_lo
+// CHECK: [0x6e,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, tba_hi
-// CHECK: [0x6d,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, tma_hi
+// CHECK: [0x6f,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, tma_lo
-// CHECK: [0x6e,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, ttmp11
+// CHECK: [0x7b,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, tma_hi
-// CHECK: [0x6f,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, m0
+// CHECK: [0x7c,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, ttmp11
-// CHECK: [0x7b,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, exec_lo
+// CHECK: [0x7e,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, m0
-// CHECK: [0x7c,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, exec_hi
+// CHECK: [0x7f,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, exec_lo
-// CHECK: [0x7e,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, 0
+// CHECK: [0x80,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, exec_hi
-// CHECK: [0x7f,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, -1
+// CHECK: [0xc1,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, 0
-// CHECK: [0x80,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, 0.5
+// CHECK: [0xf0,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, -1
-// CHECK: [0xc1,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, -4.0
+// CHECK: [0xf7,0x0e,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, 0.5
-// CHECK: [0xf0,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, 0xaf123456
+// CHECK: [0xff,0x0e,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_u32_f32 v0, -4.0
-// CHECK: [0xf7,0x0e,0x00,0x7e]
+v_cvt_u32_f32 v5, 0x3f717273
+// CHECK: [0xff,0x0e,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_u32_f32 v0, 0xaf123456
-// CHECK: [0xff,0x0e,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_u32_f32 v5, v1
+// CHECK: [0x01,0x0f,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, 0x3f717273
-// CHECK: [0xff,0x0e,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_u32_f32 v5, v255
+// CHECK: [0xff,0x0f,0x0a,0x7e]
 
-v_cvt_u32_f32 v0, v0
-// CHECK: [0x00,0x0f,0x00,0x7e]
+v_cvt_u32_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x47,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_u32_f32 v0, v255
-// CHECK: [0xff,0x0f,0x00,0x7e]
+v_cvt_u32_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x47,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x47,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x47,0xd1,0x65,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x47,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x47,0xd1,0x66,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x47,0xd1,0x65,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x47,0xd1,0x67,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x47,0xd1,0x66,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x47,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x47,0xd1,0x67,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x47,0xd1,0x6b,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x47,0xd1,0x6a,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x47,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x47,0xd1,0x6b,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x47,0xd1,0x6d,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x47,0xd1,0x6c,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x47,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x47,0xd1,0x6d,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x47,0xd1,0x6f,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x47,0xd1,0x6e,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x47,0xd1,0x7b,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x47,0xd1,0x6f,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x47,0xd1,0x7c,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x47,0xd1,0x7b,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x47,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x47,0xd1,0x7c,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x47,0xd1,0x7f,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x47,0xd1,0x7e,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x47,0xd1,0xfd,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x47,0xd1,0x7f,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x47,0xd1,0x01,0x01,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x47,0xd1,0xfd,0x00,0x00,0x00]
+v_cvt_u32_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x47,0xd1,0xff,0x01,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x47,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_u32_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x47,0xd1,0x01,0x00,0x00,0x20]
 
-v_cvt_u32_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x47,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_u32_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x47,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x47,0xd1,0x00,0x00,0x00,0x20]
+v_cvt_u32_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x47,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_u32_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x47,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_i32_f32 v5, s1
+// CHECK: [0x01,0x10,0x0a,0x7e]
 
-v_cvt_u32_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x47,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_i32_f32 v255, s1
+// CHECK: [0x01,0x10,0xfe,0x7f]
 
-v_cvt_i32_f32 v0, s0
-// CHECK: [0x00,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, s101
+// CHECK: [0x65,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v255, s0
-// CHECK: [0x00,0x10,0xfe,0x7f]
+v_cvt_i32_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, s101
-// CHECK: [0x65,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, vcc_lo
+// CHECK: [0x6a,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, vcc_hi
+// CHECK: [0x6b,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, vcc_lo
-// CHECK: [0x6a,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, tba_lo
+// CHECK: [0x6c,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, vcc_hi
-// CHECK: [0x6b,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, tba_hi
+// CHECK: [0x6d,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, tba_lo
-// CHECK: [0x6c,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, tma_lo
+// CHECK: [0x6e,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, tba_hi
-// CHECK: [0x6d,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, tma_hi
+// CHECK: [0x6f,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, tma_lo
-// CHECK: [0x6e,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, ttmp11
+// CHECK: [0x7b,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, tma_hi
-// CHECK: [0x6f,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, m0
+// CHECK: [0x7c,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, ttmp11
-// CHECK: [0x7b,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, exec_lo
+// CHECK: [0x7e,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, m0
-// CHECK: [0x7c,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, exec_hi
+// CHECK: [0x7f,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, exec_lo
-// CHECK: [0x7e,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, 0
+// CHECK: [0x80,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, exec_hi
-// CHECK: [0x7f,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, -1
+// CHECK: [0xc1,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, 0
-// CHECK: [0x80,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, 0.5
+// CHECK: [0xf0,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, -1
-// CHECK: [0xc1,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, -4.0
+// CHECK: [0xf7,0x10,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, 0.5
-// CHECK: [0xf0,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, 0xaf123456
+// CHECK: [0xff,0x10,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_i32_f32 v0, -4.0
-// CHECK: [0xf7,0x10,0x00,0x7e]
+v_cvt_i32_f32 v5, 0x3f717273
+// CHECK: [0xff,0x10,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_i32_f32 v0, 0xaf123456
-// CHECK: [0xff,0x10,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_i32_f32 v5, v1
+// CHECK: [0x01,0x11,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, 0x3f717273
-// CHECK: [0xff,0x10,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_i32_f32 v5, v255
+// CHECK: [0xff,0x11,0x0a,0x7e]
 
-v_cvt_i32_f32 v0, v0
-// CHECK: [0x00,0x11,0x00,0x7e]
+v_cvt_i32_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x48,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_i32_f32 v0, v255
-// CHECK: [0xff,0x11,0x00,0x7e]
+v_cvt_i32_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x48,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x48,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x48,0xd1,0x65,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x48,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x48,0xd1,0x66,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x48,0xd1,0x65,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x48,0xd1,0x67,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x48,0xd1,0x66,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x48,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x48,0xd1,0x67,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x48,0xd1,0x6b,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x48,0xd1,0x6a,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x48,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x48,0xd1,0x6b,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x48,0xd1,0x6d,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x48,0xd1,0x6c,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x48,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x48,0xd1,0x6d,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x48,0xd1,0x6f,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x48,0xd1,0x6e,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x48,0xd1,0x7b,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x48,0xd1,0x6f,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x48,0xd1,0x7c,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x48,0xd1,0x7b,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x48,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x48,0xd1,0x7c,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x48,0xd1,0x7f,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x48,0xd1,0x7e,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x48,0xd1,0xfd,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x48,0xd1,0x7f,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x48,0xd1,0x01,0x01,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x48,0xd1,0xfd,0x00,0x00,0x00]
+v_cvt_i32_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x48,0xd1,0xff,0x01,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x48,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_i32_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x48,0xd1,0x01,0x00,0x00,0x20]
 
-v_cvt_i32_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x48,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_i32_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x48,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x48,0xd1,0x00,0x00,0x00,0x20]
+v_cvt_i32_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x48,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_i32_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x48,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f16_f32 v5, s1
+// CHECK: [0x01,0x14,0x0a,0x7e]
 
-v_cvt_i32_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x48,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f16_f32 v255, s1
+// CHECK: [0x01,0x14,0xfe,0x7f]
 
-v_cvt_f16_f32 v0, s0
-// CHECK: [0x00,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, s101
+// CHECK: [0x65,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v255, s0
-// CHECK: [0x00,0x14,0xfe,0x7f]
+v_cvt_f16_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, s101
-// CHECK: [0x65,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, vcc_lo
+// CHECK: [0x6a,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, vcc_hi
+// CHECK: [0x6b,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, vcc_lo
-// CHECK: [0x6a,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, tba_lo
+// CHECK: [0x6c,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, vcc_hi
-// CHECK: [0x6b,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, tba_hi
+// CHECK: [0x6d,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, tba_lo
-// CHECK: [0x6c,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, tma_lo
+// CHECK: [0x6e,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, tba_hi
-// CHECK: [0x6d,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, tma_hi
+// CHECK: [0x6f,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, tma_lo
-// CHECK: [0x6e,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, ttmp11
+// CHECK: [0x7b,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, tma_hi
-// CHECK: [0x6f,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, m0
+// CHECK: [0x7c,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, ttmp11
-// CHECK: [0x7b,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, exec_lo
+// CHECK: [0x7e,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, m0
-// CHECK: [0x7c,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, exec_hi
+// CHECK: [0x7f,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, exec_lo
-// CHECK: [0x7e,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, 0
+// CHECK: [0x80,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, exec_hi
-// CHECK: [0x7f,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, -1
+// CHECK: [0xc1,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, 0
-// CHECK: [0x80,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, 0.5
+// CHECK: [0xf0,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, -1
-// CHECK: [0xc1,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, -4.0
+// CHECK: [0xf7,0x14,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, 0.5
-// CHECK: [0xf0,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, 0xaf123456
+// CHECK: [0xff,0x14,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f16_f32 v0, -4.0
-// CHECK: [0xf7,0x14,0x00,0x7e]
+v_cvt_f16_f32 v5, 0x3f717273
+// CHECK: [0xff,0x14,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f16_f32 v0, 0xaf123456
-// CHECK: [0xff,0x14,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f16_f32 v5, v1
+// CHECK: [0x01,0x15,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, 0x3f717273
-// CHECK: [0xff,0x14,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f16_f32 v5, v255
+// CHECK: [0xff,0x15,0x0a,0x7e]
 
-v_cvt_f16_f32 v0, v0
-// CHECK: [0x00,0x15,0x00,0x7e]
+v_cvt_f16_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x4a,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f16_f32 v0, v255
-// CHECK: [0xff,0x15,0x00,0x7e]
+v_cvt_f16_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x4a,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x4a,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x4a,0xd1,0x65,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x4a,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x4a,0xd1,0x66,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x4a,0xd1,0x65,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x4a,0xd1,0x67,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x4a,0xd1,0x66,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x4a,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x4a,0xd1,0x67,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x4a,0xd1,0x6b,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x4a,0xd1,0x6a,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x4a,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x4a,0xd1,0x6b,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x4a,0xd1,0x6d,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x4a,0xd1,0x6c,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x4a,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x4a,0xd1,0x6d,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x4a,0xd1,0x6f,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x4a,0xd1,0x6e,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x4a,0xd1,0x7b,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x4a,0xd1,0x6f,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x4a,0xd1,0x7c,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x4a,0xd1,0x7b,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x4a,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x4a,0xd1,0x7c,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x4a,0xd1,0x7f,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x4a,0xd1,0x7e,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x4a,0xd1,0xfd,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x4a,0xd1,0x7f,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x4a,0xd1,0x01,0x01,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x4a,0xd1,0xfd,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x4a,0xd1,0xff,0x01,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x4a,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_f16_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x4a,0xd1,0x01,0x00,0x00,0x20]
 
-v_cvt_f16_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x4a,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_f16_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x4a,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x4a,0xd1,0x00,0x00,0x00,0x20]
+v_cvt_f16_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x4a,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f16_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x4a,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x4a,0xd1,0x01,0x00,0x00,0x08]
 
-v_cvt_f16_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x4a,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f16_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x4a,0xd1,0x01,0x00,0x00,0x10]
 
-v_cvt_f16_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x4a,0xd1,0x00,0x00,0x00,0x08]
+v_cvt_f16_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x4a,0xd1,0x01,0x00,0x00,0x18]
 
-v_cvt_f16_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x4a,0xd1,0x00,0x00,0x00,0x10]
+v_cvt_f32_f16 v5, s1
+// CHECK: [0x01,0x16,0x0a,0x7e]
 
-v_cvt_f16_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x4a,0xd1,0x00,0x00,0x00,0x18]
+v_cvt_f32_f16 v255, s1
+// CHECK: [0x01,0x16,0xfe,0x7f]
 
-v_cvt_f32_f16 v0, s0
-// CHECK: [0x00,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, s101
+// CHECK: [0x65,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v255, s0
-// CHECK: [0x00,0x16,0xfe,0x7f]
+v_cvt_f32_f16 v5, flat_scratch_lo
+// CHECK: [0x66,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, s101
-// CHECK: [0x65,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, flat_scratch_hi
+// CHECK: [0x67,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, flat_scratch_lo
-// CHECK: [0x66,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, vcc_lo
+// CHECK: [0x6a,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, flat_scratch_hi
-// CHECK: [0x67,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, vcc_hi
+// CHECK: [0x6b,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, vcc_lo
-// CHECK: [0x6a,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, tba_lo
+// CHECK: [0x6c,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, vcc_hi
-// CHECK: [0x6b,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, tba_hi
+// CHECK: [0x6d,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, tba_lo
-// CHECK: [0x6c,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, tma_lo
+// CHECK: [0x6e,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, tba_hi
-// CHECK: [0x6d,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, tma_hi
+// CHECK: [0x6f,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, tma_lo
-// CHECK: [0x6e,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, ttmp11
+// CHECK: [0x7b,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, tma_hi
-// CHECK: [0x6f,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, m0
+// CHECK: [0x7c,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, ttmp11
-// CHECK: [0x7b,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, exec_lo
+// CHECK: [0x7e,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, m0
-// CHECK: [0x7c,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, exec_hi
+// CHECK: [0x7f,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, exec_lo
-// CHECK: [0x7e,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, 0
+// CHECK: [0x80,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, exec_hi
-// CHECK: [0x7f,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, -1
+// CHECK: [0xc1,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, 0
-// CHECK: [0x80,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, 0.5
+// CHECK: [0xf0,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, -1
-// CHECK: [0xc1,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, -4.0
+// CHECK: [0xf7,0x16,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, 0.5
-// CHECK: [0xf0,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, 0xfe0b
+// CHECK: [0xff,0x16,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
 
-v_cvt_f32_f16 v0, -4.0
-// CHECK: [0xf7,0x16,0x00,0x7e]
+v_cvt_f32_f16 v5, 0x3456
+// CHECK: [0xff,0x16,0x0a,0x7e,0x56,0x34,0x00,0x00]
 
-v_cvt_f32_f16 v0, 0xfe0b
-// CHECK: [0xff,0x16,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_cvt_f32_f16 v5, v1
+// CHECK: [0x01,0x17,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, 0x3456
-// CHECK: [0xff,0x16,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_cvt_f32_f16 v5, v255
+// CHECK: [0xff,0x17,0x0a,0x7e]
 
-v_cvt_f32_f16 v0, v0
-// CHECK: [0x00,0x17,0x00,0x7e]
+v_cvt_f32_f16_e64 v5, s1
+// CHECK: [0x05,0x00,0x4b,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_f16 v0, v255
-// CHECK: [0xff,0x17,0x00,0x7e]
+v_cvt_f32_f16_e64 v255, s1
+// CHECK: [0xff,0x00,0x4b,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, s0
-// CHECK: [0x00,0x00,0x4b,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, s101
+// CHECK: [0x05,0x00,0x4b,0xd1,0x65,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v255, s0
-// CHECK: [0xff,0x00,0x4b,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x4b,0xd1,0x66,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, s101
-// CHECK: [0x00,0x00,0x4b,0xd1,0x65,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x4b,0xd1,0x67,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x4b,0xd1,0x66,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x4b,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x4b,0xd1,0x67,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x4b,0xd1,0x6b,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x4b,0xd1,0x6a,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x4b,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x4b,0xd1,0x6b,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x4b,0xd1,0x6d,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x4b,0xd1,0x6c,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x4b,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x4b,0xd1,0x6d,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x4b,0xd1,0x6f,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x4b,0xd1,0x6e,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x4b,0xd1,0x7b,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x4b,0xd1,0x6f,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, m0
+// CHECK: [0x05,0x00,0x4b,0xd1,0x7c,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x4b,0xd1,0x7b,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x4b,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, m0
-// CHECK: [0x00,0x00,0x4b,0xd1,0x7c,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x4b,0xd1,0x7f,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x4b,0xd1,0x7e,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, scc
+// CHECK: [0x05,0x00,0x4b,0xd1,0xfd,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x4b,0xd1,0x7f,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, v1
+// CHECK: [0x05,0x00,0x4b,0xd1,0x01,0x01,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, 0
-// CHECK: [0x00,0x00,0x4b,0xd1,0x80,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, v255
+// CHECK: [0x05,0x00,0x4b,0xd1,0xff,0x01,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, -1
-// CHECK: [0x00,0x00,0x4b,0xd1,0xc1,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, -s1
+// CHECK: [0x05,0x00,0x4b,0xd1,0x01,0x00,0x00,0x20]
 
-v_cvt_f32_f16_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x4b,0xd1,0xf0,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x4b,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x4b,0xd1,0xf7,0x00,0x00,0x00]
+v_cvt_f32_f16_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x4b,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_f16_e64 v0, v0
-// CHECK: [0x00,0x00,0x4b,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_f32_f16_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x4b,0xd1,0x01,0x00,0x00,0x08]
 
-v_cvt_f32_f16_e64 v0, v255
-// CHECK: [0x00,0x00,0x4b,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_f32_f16_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x4b,0xd1,0x01,0x00,0x00,0x10]
 
-v_cvt_rpi_i32_f32 v0, s0
-// CHECK: [0x00,0x18,0x00,0x7e]
+v_cvt_f32_f16_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x4b,0xd1,0x01,0x00,0x00,0x18]
 
-v_cvt_rpi_i32_f32 v255, s0
-// CHECK: [0x00,0x18,0xfe,0x7f]
+v_cvt_rpi_i32_f32 v5, s1
+// CHECK: [0x01,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, s101
-// CHECK: [0x65,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v255, s1
+// CHECK: [0x01,0x18,0xfe,0x7f]
 
-v_cvt_rpi_i32_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, s101
+// CHECK: [0x65,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, vcc_lo
-// CHECK: [0x6a,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, vcc_hi
-// CHECK: [0x6b,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, vcc_lo
+// CHECK: [0x6a,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, tba_lo
-// CHECK: [0x6c,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, vcc_hi
+// CHECK: [0x6b,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, tba_hi
-// CHECK: [0x6d,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, tba_lo
+// CHECK: [0x6c,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, tma_lo
-// CHECK: [0x6e,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, tba_hi
+// CHECK: [0x6d,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, tma_hi
-// CHECK: [0x6f,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, tma_lo
+// CHECK: [0x6e,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, ttmp11
-// CHECK: [0x7b,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, tma_hi
+// CHECK: [0x6f,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, m0
-// CHECK: [0x7c,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, ttmp11
+// CHECK: [0x7b,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, exec_lo
-// CHECK: [0x7e,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, m0
+// CHECK: [0x7c,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, exec_hi
-// CHECK: [0x7f,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, exec_lo
+// CHECK: [0x7e,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, 0
-// CHECK: [0x80,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, exec_hi
+// CHECK: [0x7f,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, -1
-// CHECK: [0xc1,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, 0
+// CHECK: [0x80,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, 0.5
-// CHECK: [0xf0,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, -1
+// CHECK: [0xc1,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, -4.0
-// CHECK: [0xf7,0x18,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, 0.5
+// CHECK: [0xf0,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, 0xaf123456
-// CHECK: [0xff,0x18,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_rpi_i32_f32 v5, -4.0
+// CHECK: [0xf7,0x18,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32 v0, 0x3f717273
-// CHECK: [0xff,0x18,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_rpi_i32_f32 v5, 0xaf123456
+// CHECK: [0xff,0x18,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_rpi_i32_f32 v0, v0
-// CHECK: [0x00,0x19,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, 0x3f717273
+// CHECK: [0xff,0x18,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_rpi_i32_f32 v0, v255
-// CHECK: [0xff,0x19,0x00,0x7e]
+v_cvt_rpi_i32_f32 v5, v1
+// CHECK: [0x01,0x19,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x4c,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32 v5, v255
+// CHECK: [0xff,0x19,0x0a,0x7e]
 
-v_cvt_rpi_i32_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x4c,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x4c,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x4c,0xd1,0x65,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x4c,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x4c,0xd1,0x66,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x4c,0xd1,0x65,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x4c,0xd1,0x67,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x4c,0xd1,0x66,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x4c,0xd1,0x6a,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x4c,0xd1,0x67,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x4c,0xd1,0x6b,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x4c,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x4c,0xd1,0x6c,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x4c,0xd1,0x6b,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x4c,0xd1,0x6d,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x4c,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x4c,0xd1,0x6e,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x4c,0xd1,0x6d,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x4c,0xd1,0x6f,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x4c,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x4c,0xd1,0x7b,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x4c,0xd1,0x6f,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x4c,0xd1,0x7c,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x4c,0xd1,0x7b,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x4c,0xd1,0x7e,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x4c,0xd1,0x7c,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x4c,0xd1,0x7f,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x4c,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x4c,0xd1,0xfd,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x4c,0xd1,0x7f,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x4c,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x4c,0xd1,0xfd,0x00,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x4c,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x4c,0xd1,0x01,0x01,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x4c,0xd1,0x00,0x00,0x00,0x20]
+v_cvt_rpi_i32_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x4c,0xd1,0xff,0x01,0x00,0x00]
 
-v_cvt_rpi_i32_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x4c,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x4c,0xd1,0x01,0x00,0x00,0x20]
 
-v_cvt_rpi_i32_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x4c,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_rpi_i32_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x4c,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32 v0, s0
-// CHECK: [0x00,0x1a,0x00,0x7e]
+v_cvt_rpi_i32_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x4c,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32 v255, s0
-// CHECK: [0x00,0x1a,0xfe,0x7f]
+v_cvt_flr_i32_f32 v5, s1
+// CHECK: [0x01,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, s101
-// CHECK: [0x65,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v255, s1
+// CHECK: [0x01,0x1a,0xfe,0x7f]
 
-v_cvt_flr_i32_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, s101
+// CHECK: [0x65,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, vcc_lo
-// CHECK: [0x6a,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, vcc_hi
-// CHECK: [0x6b,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, vcc_lo
+// CHECK: [0x6a,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, tba_lo
-// CHECK: [0x6c,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, vcc_hi
+// CHECK: [0x6b,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, tba_hi
-// CHECK: [0x6d,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, tba_lo
+// CHECK: [0x6c,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, tma_lo
-// CHECK: [0x6e,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, tba_hi
+// CHECK: [0x6d,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, tma_hi
-// CHECK: [0x6f,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, tma_lo
+// CHECK: [0x6e,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, ttmp11
-// CHECK: [0x7b,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, tma_hi
+// CHECK: [0x6f,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, m0
-// CHECK: [0x7c,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, ttmp11
+// CHECK: [0x7b,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, exec_lo
-// CHECK: [0x7e,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, m0
+// CHECK: [0x7c,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, exec_hi
-// CHECK: [0x7f,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, exec_lo
+// CHECK: [0x7e,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, 0
-// CHECK: [0x80,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, exec_hi
+// CHECK: [0x7f,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, -1
-// CHECK: [0xc1,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, 0
+// CHECK: [0x80,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, 0.5
-// CHECK: [0xf0,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, -1
+// CHECK: [0xc1,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, -4.0
-// CHECK: [0xf7,0x1a,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, 0.5
+// CHECK: [0xf0,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, 0xaf123456
-// CHECK: [0xff,0x1a,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_flr_i32_f32 v5, -4.0
+// CHECK: [0xf7,0x1a,0x0a,0x7e]
 
-v_cvt_flr_i32_f32 v0, 0x3f717273
-// CHECK: [0xff,0x1a,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_flr_i32_f32 v5, 0xaf123456
+// CHECK: [0xff,0x1a,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_flr_i32_f32 v0, v0
-// CHECK: [0x00,0x1b,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, 0x3f717273
+// CHECK: [0xff,0x1a,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_flr_i32_f32 v0, v255
-// CHECK: [0xff,0x1b,0x00,0x7e]
+v_cvt_flr_i32_f32 v5, v1
+// CHECK: [0x01,0x1b,0x0a,0x7e]
 
-v_cvt_flr_i32_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x4d,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_flr_i32_f32 v5, v255
+// CHECK: [0xff,0x1b,0x0a,0x7e]
 
-v_cvt_flr_i32_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x4d,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x4d,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x4d,0xd1,0x65,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x4d,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x4d,0xd1,0x66,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x4d,0xd1,0x65,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x4d,0xd1,0x67,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x4d,0xd1,0x66,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x4d,0xd1,0x6a,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x4d,0xd1,0x67,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x4d,0xd1,0x6b,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x4d,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x4d,0xd1,0x6c,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x4d,0xd1,0x6b,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x4d,0xd1,0x6d,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x4d,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x4d,0xd1,0x6e,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x4d,0xd1,0x6d,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x4d,0xd1,0x6f,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x4d,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x4d,0xd1,0x7b,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x4d,0xd1,0x6f,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x4d,0xd1,0x7c,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x4d,0xd1,0x7b,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x4d,0xd1,0x7e,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x4d,0xd1,0x7c,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x4d,0xd1,0x7f,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x4d,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x4d,0xd1,0xfd,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x4d,0xd1,0x7f,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x4d,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x4d,0xd1,0xfd,0x00,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x4d,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x4d,0xd1,0x01,0x01,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x4d,0xd1,0x00,0x00,0x00,0x20]
+v_cvt_flr_i32_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x4d,0xd1,0xff,0x01,0x00,0x00]
 
-v_cvt_flr_i32_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x4d,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x4d,0xd1,0x01,0x00,0x00,0x20]
 
-v_cvt_flr_i32_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x4d,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_flr_i32_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x4d,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4 v0, s0
-// CHECK: [0x00,0x1c,0x00,0x7e]
+v_cvt_flr_i32_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x4d,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4 v255, s0
-// CHECK: [0x00,0x1c,0xfe,0x7f]
+v_cvt_off_f32_i4 v5, s1
+// CHECK: [0x01,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, s101
-// CHECK: [0x65,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v255, s1
+// CHECK: [0x01,0x1c,0xfe,0x7f]
 
-v_cvt_off_f32_i4 v0, flat_scratch_lo
-// CHECK: [0x66,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, s101
+// CHECK: [0x65,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, flat_scratch_hi
-// CHECK: [0x67,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, flat_scratch_lo
+// CHECK: [0x66,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, vcc_lo
-// CHECK: [0x6a,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, flat_scratch_hi
+// CHECK: [0x67,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, vcc_hi
-// CHECK: [0x6b,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, vcc_lo
+// CHECK: [0x6a,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, tba_lo
-// CHECK: [0x6c,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, vcc_hi
+// CHECK: [0x6b,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, tba_hi
-// CHECK: [0x6d,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, tba_lo
+// CHECK: [0x6c,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, tma_lo
-// CHECK: [0x6e,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, tba_hi
+// CHECK: [0x6d,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, tma_hi
-// CHECK: [0x6f,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, tma_lo
+// CHECK: [0x6e,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, ttmp11
-// CHECK: [0x7b,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, tma_hi
+// CHECK: [0x6f,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, m0
-// CHECK: [0x7c,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, ttmp11
+// CHECK: [0x7b,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, exec_lo
-// CHECK: [0x7e,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, m0
+// CHECK: [0x7c,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, exec_hi
-// CHECK: [0x7f,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, exec_lo
+// CHECK: [0x7e,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, 0
-// CHECK: [0x80,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, exec_hi
+// CHECK: [0x7f,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, -1
-// CHECK: [0xc1,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, 0
+// CHECK: [0x80,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, 0.5
-// CHECK: [0xf0,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, -1
+// CHECK: [0xc1,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, -4.0
-// CHECK: [0xf7,0x1c,0x00,0x7e]
+v_cvt_off_f32_i4 v5, 0.5
+// CHECK: [0xf0,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, 0x4f
-// CHECK: [0xff,0x1c,0x00,0x7e,0x4f,0x00,0x00,0x00]
+v_cvt_off_f32_i4 v5, -4.0
+// CHECK: [0xf7,0x1c,0x0a,0x7e]
 
-v_cvt_off_f32_i4 v0, 0x41
-// CHECK: [0xff,0x1c,0x00,0x7e,0x41,0x00,0x00,0x00]
+v_cvt_off_f32_i4 v5, 0x4f
+// CHECK: [0xff,0x1c,0x0a,0x7e,0x4f,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4 v0, v0
-// CHECK: [0x00,0x1d,0x00,0x7e]
+v_cvt_off_f32_i4 v5, 0x41
+// CHECK: [0xff,0x1c,0x0a,0x7e,0x41,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4 v0, v255
-// CHECK: [0xff,0x1d,0x00,0x7e]
+v_cvt_off_f32_i4 v5, v1
+// CHECK: [0x01,0x1d,0x0a,0x7e]
 
-v_cvt_off_f32_i4_e64 v0, s0
-// CHECK: [0x00,0x00,0x4e,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_off_f32_i4 v5, v255
+// CHECK: [0xff,0x1d,0x0a,0x7e]
 
-v_cvt_off_f32_i4_e64 v255, s0
-// CHECK: [0xff,0x00,0x4e,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, s1
+// CHECK: [0x05,0x00,0x4e,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, s101
-// CHECK: [0x00,0x00,0x4e,0xd1,0x65,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v255, s1
+// CHECK: [0xff,0x00,0x4e,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x4e,0xd1,0x66,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, s101
+// CHECK: [0x05,0x00,0x4e,0xd1,0x65,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x4e,0xd1,0x67,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x4e,0xd1,0x66,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x4e,0xd1,0x6a,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x4e,0xd1,0x67,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x4e,0xd1,0x6b,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x4e,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x4e,0xd1,0x6c,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x4e,0xd1,0x6b,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x4e,0xd1,0x6d,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x4e,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x4e,0xd1,0x6e,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x4e,0xd1,0x6d,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x4e,0xd1,0x6f,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x4e,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x4e,0xd1,0x7b,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x4e,0xd1,0x6f,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, m0
-// CHECK: [0x00,0x00,0x4e,0xd1,0x7c,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x4e,0xd1,0x7b,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x4e,0xd1,0x7e,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, m0
+// CHECK: [0x05,0x00,0x4e,0xd1,0x7c,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x4e,0xd1,0x7f,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x4e,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, 0
-// CHECK: [0x00,0x00,0x4e,0xd1,0x80,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x4e,0xd1,0x7f,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, -1
-// CHECK: [0x00,0x00,0x4e,0xd1,0xc1,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, 0
+// CHECK: [0x05,0x00,0x4e,0xd1,0x80,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x4e,0xd1,0xf0,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, -1
+// CHECK: [0x05,0x00,0x4e,0xd1,0xc1,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x4e,0xd1,0xf7,0x00,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x4e,0xd1,0xf0,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, v0
-// CHECK: [0x00,0x00,0x4e,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x4e,0xd1,0xf7,0x00,0x00,0x00]
 
-v_cvt_off_f32_i4_e64 v0, v255
-// CHECK: [0x00,0x00,0x4e,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_off_f32_i4_e64 v5, v1
+// CHECK: [0x05,0x00,0x4e,0xd1,0x01,0x01,0x00,0x00]
 
-v_cvt_f32_f64 v0, s[0:1]
-// CHECK: [0x00,0x1e,0x00,0x7e]
+v_cvt_off_f32_i4_e64 v5, v255
+// CHECK: [0x05,0x00,0x4e,0xd1,0xff,0x01,0x00,0x00]
 
-v_cvt_f32_f64 v255, s[0:1]
-// CHECK: [0x00,0x1e,0xfe,0x7f]
+v_cvt_f32_f64 v5, s[2:3]
+// CHECK: [0x02,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, s[2:3]
-// CHECK: [0x02,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v255, s[2:3]
+// CHECK: [0x02,0x1e,0xfe,0x7f]
 
-v_cvt_f32_f64 v0, s[100:101]
-// CHECK: [0x64,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, s[4:5]
+// CHECK: [0x04,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, flat_scratch
-// CHECK: [0x66,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, s[100:101]
+// CHECK: [0x64,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, vcc
-// CHECK: [0x6a,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, flat_scratch
+// CHECK: [0x66,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, tba
-// CHECK: [0x6c,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, vcc
+// CHECK: [0x6a,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, tma
-// CHECK: [0x6e,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, tba
+// CHECK: [0x6c,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, ttmp[10:11]
-// CHECK: [0x7a,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, tma
+// CHECK: [0x6e,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, exec
-// CHECK: [0x7e,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, ttmp[10:11]
+// CHECK: [0x7a,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, 0
-// CHECK: [0x80,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, exec
+// CHECK: [0x7e,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, -1
-// CHECK: [0xc1,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, 0
+// CHECK: [0x80,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, 0.5
-// CHECK: [0xf0,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, -1
+// CHECK: [0xc1,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, -4.0
-// CHECK: [0xf7,0x1e,0x00,0x7e]
+v_cvt_f32_f64 v5, 0.5
+// CHECK: [0xf0,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, 0xaf123456
-// CHECK: [0xff,0x1e,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f32_f64 v5, -4.0
+// CHECK: [0xf7,0x1e,0x0a,0x7e]
 
-v_cvt_f32_f64 v0, 0x3f717273
-// CHECK: [0xff,0x1e,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f32_f64 v5, 0xaf123456
+// CHECK: [0xff,0x1e,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f32_f64 v0, v[0:1]
-// CHECK: [0x00,0x1f,0x00,0x7e]
+v_cvt_f32_f64 v5, 0x3f717273
+// CHECK: [0xff,0x1e,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f32_f64 v0, v[254:255]
-// CHECK: [0xfe,0x1f,0x00,0x7e]
+v_cvt_f32_f64 v5, v[1:2]
+// CHECK: [0x01,0x1f,0x0a,0x7e]
 
-v_cvt_f32_f64_e64 v0, s[0:1]
-// CHECK: [0x00,0x00,0x4f,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f32_f64 v5, v[254:255]
+// CHECK: [0xfe,0x1f,0x0a,0x7e]
 
-v_cvt_f32_f64_e64 v255, s[0:1]
-// CHECK: [0xff,0x00,0x4f,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, s[2:3]
+// CHECK: [0x05,0x00,0x4f,0xd1,0x02,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, s[2:3]
-// CHECK: [0x00,0x00,0x4f,0xd1,0x02,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v255, s[2:3]
+// CHECK: [0xff,0x00,0x4f,0xd1,0x02,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, s[100:101]
-// CHECK: [0x00,0x00,0x4f,0xd1,0x64,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, s[4:5]
+// CHECK: [0x05,0x00,0x4f,0xd1,0x04,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, flat_scratch
-// CHECK: [0x00,0x00,0x4f,0xd1,0x66,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, s[100:101]
+// CHECK: [0x05,0x00,0x4f,0xd1,0x64,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, vcc
-// CHECK: [0x00,0x00,0x4f,0xd1,0x6a,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, flat_scratch
+// CHECK: [0x05,0x00,0x4f,0xd1,0x66,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, tba
-// CHECK: [0x00,0x00,0x4f,0xd1,0x6c,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, vcc
+// CHECK: [0x05,0x00,0x4f,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, tma
-// CHECK: [0x00,0x00,0x4f,0xd1,0x6e,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, tba
+// CHECK: [0x05,0x00,0x4f,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, ttmp[10:11]
-// CHECK: [0x00,0x00,0x4f,0xd1,0x7a,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, tma
+// CHECK: [0x05,0x00,0x4f,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, exec
-// CHECK: [0x00,0x00,0x4f,0xd1,0x7e,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, ttmp[10:11]
+// CHECK: [0x05,0x00,0x4f,0xd1,0x7a,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, scc
-// CHECK: [0x00,0x00,0x4f,0xd1,0xfd,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, exec
+// CHECK: [0x05,0x00,0x4f,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x4f,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_f32_f64_e64 v5, scc
+// CHECK: [0x05,0x00,0x4f,0xd1,0xfd,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, v[254:255]
-// CHECK: [0x00,0x00,0x4f,0xd1,0xfe,0x01,0x00,0x00]
+v_cvt_f32_f64_e64 v5, v[1:2]
+// CHECK: [0x05,0x00,0x4f,0xd1,0x01,0x01,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, -s[0:1]
-// CHECK: [0x00,0x00,0x4f,0xd1,0x00,0x00,0x00,0x20]
+v_cvt_f32_f64_e64 v5, v[254:255]
+// CHECK: [0x05,0x00,0x4f,0xd1,0xfe,0x01,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, |s[0:1]|
-// CHECK: [0x00,0x01,0x4f,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, -s[2:3]
+// CHECK: [0x05,0x00,0x4f,0xd1,0x02,0x00,0x00,0x20]
 
-v_cvt_f32_f64_e64 v0, s[0:1] clamp
-// CHECK: [0x00,0x80,0x4f,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f32_f64_e64 v5, |s[2:3]|
+// CHECK: [0x05,0x01,0x4f,0xd1,0x02,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, s[0:1] mul:2
-// CHECK: [0x00,0x00,0x4f,0xd1,0x00,0x00,0x00,0x08]
+v_cvt_f32_f64_e64 v5, s[2:3] clamp
+// CHECK: [0x05,0x80,0x4f,0xd1,0x02,0x00,0x00,0x00]
 
-v_cvt_f32_f64_e64 v0, s[0:1] mul:4
-// CHECK: [0x00,0x00,0x4f,0xd1,0x00,0x00,0x00,0x10]
+v_cvt_f32_f64_e64 v5, s[2:3] mul:2
+// CHECK: [0x05,0x00,0x4f,0xd1,0x02,0x00,0x00,0x08]
 
-v_cvt_f32_f64_e64 v0, s[0:1] div:2
-// CHECK: [0x00,0x00,0x4f,0xd1,0x00,0x00,0x00,0x18]
+v_cvt_f32_f64_e64 v5, s[2:3] mul:4
+// CHECK: [0x05,0x00,0x4f,0xd1,0x02,0x00,0x00,0x10]
 
-v_cvt_f64_f32 v[0:1], s0
-// CHECK: [0x00,0x20,0x00,0x7e]
+v_cvt_f32_f64_e64 v5, s[2:3] div:2
+// CHECK: [0x05,0x00,0x4f,0xd1,0x02,0x00,0x00,0x18]
 
-v_cvt_f64_f32 v[254:255], s0
-// CHECK: [0x00,0x20,0xfc,0x7f]
+v_cvt_f64_f32 v[5:6], s1
+// CHECK: [0x01,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], s101
-// CHECK: [0x65,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[254:255], s1
+// CHECK: [0x01,0x20,0xfc,0x7f]
 
-v_cvt_f64_f32 v[0:1], flat_scratch_lo
-// CHECK: [0x66,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], s101
+// CHECK: [0x65,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], flat_scratch_hi
-// CHECK: [0x67,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], flat_scratch_lo
+// CHECK: [0x66,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], vcc_lo
-// CHECK: [0x6a,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], flat_scratch_hi
+// CHECK: [0x67,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], vcc_hi
-// CHECK: [0x6b,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], vcc_lo
+// CHECK: [0x6a,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], tba_lo
-// CHECK: [0x6c,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], vcc_hi
+// CHECK: [0x6b,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], tba_hi
-// CHECK: [0x6d,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], tba_lo
+// CHECK: [0x6c,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], tma_lo
-// CHECK: [0x6e,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], tba_hi
+// CHECK: [0x6d,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], tma_hi
-// CHECK: [0x6f,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], tma_lo
+// CHECK: [0x6e,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], ttmp11
-// CHECK: [0x7b,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], tma_hi
+// CHECK: [0x6f,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], m0
-// CHECK: [0x7c,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], ttmp11
+// CHECK: [0x7b,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], exec_lo
-// CHECK: [0x7e,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], m0
+// CHECK: [0x7c,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], exec_hi
-// CHECK: [0x7f,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], exec_lo
+// CHECK: [0x7e,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], 0
-// CHECK: [0x80,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], exec_hi
+// CHECK: [0x7f,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], -1
-// CHECK: [0xc1,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], 0
+// CHECK: [0x80,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], 0.5
-// CHECK: [0xf0,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], -1
+// CHECK: [0xc1,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], -4.0
-// CHECK: [0xf7,0x20,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], 0.5
+// CHECK: [0xf0,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], 0xaf123456
-// CHECK: [0xff,0x20,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f64_f32 v[5:6], -4.0
+// CHECK: [0xf7,0x20,0x0a,0x7e]
 
-v_cvt_f64_f32 v[0:1], 0x3f717273
-// CHECK: [0xff,0x20,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f64_f32 v[5:6], 0xaf123456
+// CHECK: [0xff,0x20,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f64_f32 v[0:1], v0
-// CHECK: [0x00,0x21,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], 0x3f717273
+// CHECK: [0xff,0x20,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f64_f32 v[0:1], v255
-// CHECK: [0xff,0x21,0x00,0x7e]
+v_cvt_f64_f32 v[5:6], v1
+// CHECK: [0x01,0x21,0x0a,0x7e]
 
-v_cvt_f64_f32_e64 v[0:1], s0
-// CHECK: [0x00,0x00,0x50,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f64_f32 v[5:6], v255
+// CHECK: [0xff,0x21,0x0a,0x7e]
 
-v_cvt_f64_f32_e64 v[254:255], s0
-// CHECK: [0xfe,0x00,0x50,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], s1
+// CHECK: [0x05,0x00,0x50,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], s101
-// CHECK: [0x00,0x00,0x50,0xd1,0x65,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[254:255], s1
+// CHECK: [0xfe,0x00,0x50,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], flat_scratch_lo
-// CHECK: [0x00,0x00,0x50,0xd1,0x66,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], s101
+// CHECK: [0x05,0x00,0x50,0xd1,0x65,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], flat_scratch_hi
-// CHECK: [0x00,0x00,0x50,0xd1,0x67,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], flat_scratch_lo
+// CHECK: [0x05,0x00,0x50,0xd1,0x66,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], vcc_lo
-// CHECK: [0x00,0x00,0x50,0xd1,0x6a,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], flat_scratch_hi
+// CHECK: [0x05,0x00,0x50,0xd1,0x67,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], vcc_hi
-// CHECK: [0x00,0x00,0x50,0xd1,0x6b,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], vcc_lo
+// CHECK: [0x05,0x00,0x50,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], tba_lo
-// CHECK: [0x00,0x00,0x50,0xd1,0x6c,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], vcc_hi
+// CHECK: [0x05,0x00,0x50,0xd1,0x6b,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], tba_hi
-// CHECK: [0x00,0x00,0x50,0xd1,0x6d,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], tba_lo
+// CHECK: [0x05,0x00,0x50,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], tma_lo
-// CHECK: [0x00,0x00,0x50,0xd1,0x6e,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], tba_hi
+// CHECK: [0x05,0x00,0x50,0xd1,0x6d,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], tma_hi
-// CHECK: [0x00,0x00,0x50,0xd1,0x6f,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], tma_lo
+// CHECK: [0x05,0x00,0x50,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], ttmp11
-// CHECK: [0x00,0x00,0x50,0xd1,0x7b,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], tma_hi
+// CHECK: [0x05,0x00,0x50,0xd1,0x6f,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], m0
-// CHECK: [0x00,0x00,0x50,0xd1,0x7c,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], ttmp11
+// CHECK: [0x05,0x00,0x50,0xd1,0x7b,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], exec_lo
-// CHECK: [0x00,0x00,0x50,0xd1,0x7e,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], m0
+// CHECK: [0x05,0x00,0x50,0xd1,0x7c,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], exec_hi
-// CHECK: [0x00,0x00,0x50,0xd1,0x7f,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], exec_lo
+// CHECK: [0x05,0x00,0x50,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x50,0xd1,0xfd,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], exec_hi
+// CHECK: [0x05,0x00,0x50,0xd1,0x7f,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], v0
-// CHECK: [0x00,0x00,0x50,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x50,0xd1,0xfd,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], v255
-// CHECK: [0x00,0x00,0x50,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], v1
+// CHECK: [0x05,0x00,0x50,0xd1,0x01,0x01,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], -s0
-// CHECK: [0x00,0x00,0x50,0xd1,0x00,0x00,0x00,0x20]
+v_cvt_f64_f32_e64 v[5:6], v255
+// CHECK: [0x05,0x00,0x50,0xd1,0xff,0x01,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], |s0|
-// CHECK: [0x00,0x01,0x50,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], -s1
+// CHECK: [0x05,0x00,0x50,0xd1,0x01,0x00,0x00,0x20]
 
-v_cvt_f64_f32_e64 v[0:1], s0 clamp
-// CHECK: [0x00,0x80,0x50,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f64_f32_e64 v[5:6], |s1|
+// CHECK: [0x05,0x01,0x50,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], s0 mul:2
-// CHECK: [0x00,0x00,0x50,0xd1,0x00,0x00,0x00,0x08]
+v_cvt_f64_f32_e64 v[5:6], s1 clamp
+// CHECK: [0x05,0x80,0x50,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f64_f32_e64 v[0:1], s0 mul:4
-// CHECK: [0x00,0x00,0x50,0xd1,0x00,0x00,0x00,0x10]
+v_cvt_f64_f32_e64 v[5:6], s1 mul:2
+// CHECK: [0x05,0x00,0x50,0xd1,0x01,0x00,0x00,0x08]
 
-v_cvt_f64_f32_e64 v[0:1], s0 div:2
-// CHECK: [0x00,0x00,0x50,0xd1,0x00,0x00,0x00,0x18]
+v_cvt_f64_f32_e64 v[5:6], s1 mul:4
+// CHECK: [0x05,0x00,0x50,0xd1,0x01,0x00,0x00,0x10]
 
-v_cvt_f32_ubyte0 v0, s0
-// CHECK: [0x00,0x22,0x00,0x7e]
+v_cvt_f64_f32_e64 v[5:6], s1 div:2
+// CHECK: [0x05,0x00,0x50,0xd1,0x01,0x00,0x00,0x18]
 
-v_cvt_f32_ubyte0 v255, s0
-// CHECK: [0x00,0x22,0xfe,0x7f]
+v_cvt_f32_ubyte0 v5, s1
+// CHECK: [0x01,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, s101
-// CHECK: [0x65,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v255, s1
+// CHECK: [0x01,0x22,0xfe,0x7f]
 
-v_cvt_f32_ubyte0 v0, flat_scratch_lo
-// CHECK: [0x66,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, s101
+// CHECK: [0x65,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, flat_scratch_hi
-// CHECK: [0x67,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, flat_scratch_lo
+// CHECK: [0x66,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, vcc_lo
-// CHECK: [0x6a,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, flat_scratch_hi
+// CHECK: [0x67,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, vcc_hi
-// CHECK: [0x6b,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, vcc_lo
+// CHECK: [0x6a,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, tba_lo
-// CHECK: [0x6c,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, vcc_hi
+// CHECK: [0x6b,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, tba_hi
-// CHECK: [0x6d,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, tba_lo
+// CHECK: [0x6c,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, tma_lo
-// CHECK: [0x6e,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, tba_hi
+// CHECK: [0x6d,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, tma_hi
-// CHECK: [0x6f,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, tma_lo
+// CHECK: [0x6e,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, ttmp11
-// CHECK: [0x7b,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, tma_hi
+// CHECK: [0x6f,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, m0
-// CHECK: [0x7c,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, ttmp11
+// CHECK: [0x7b,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, exec_lo
-// CHECK: [0x7e,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, m0
+// CHECK: [0x7c,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, exec_hi
-// CHECK: [0x7f,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, exec_lo
+// CHECK: [0x7e,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, 0
-// CHECK: [0x80,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, exec_hi
+// CHECK: [0x7f,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, -1
-// CHECK: [0xc1,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, 0
+// CHECK: [0x80,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, 0.5
-// CHECK: [0xf0,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, -1
+// CHECK: [0xc1,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, -4.0
-// CHECK: [0xf7,0x22,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, 0.5
+// CHECK: [0xf0,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, 0xaf123456
-// CHECK: [0xff,0x22,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f32_ubyte0 v5, -4.0
+// CHECK: [0xf7,0x22,0x0a,0x7e]
 
-v_cvt_f32_ubyte0 v0, 0x3f717273
-// CHECK: [0xff,0x22,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f32_ubyte0 v5, 0xaf123456
+// CHECK: [0xff,0x22,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f32_ubyte0 v0, v0
-// CHECK: [0x00,0x23,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, 0x3f717273
+// CHECK: [0xff,0x22,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f32_ubyte0 v0, v255
-// CHECK: [0xff,0x23,0x00,0x7e]
+v_cvt_f32_ubyte0 v5, v1
+// CHECK: [0x01,0x23,0x0a,0x7e]
 
-v_cvt_f32_ubyte0_e64 v0, s0
-// CHECK: [0x00,0x00,0x51,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f32_ubyte0 v5, v255
+// CHECK: [0xff,0x23,0x0a,0x7e]
 
-v_cvt_f32_ubyte0_e64 v255, s0
-// CHECK: [0xff,0x00,0x51,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, s1
+// CHECK: [0x05,0x00,0x51,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, s101
-// CHECK: [0x00,0x00,0x51,0xd1,0x65,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v255, s1
+// CHECK: [0xff,0x00,0x51,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x51,0xd1,0x66,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, s101
+// CHECK: [0x05,0x00,0x51,0xd1,0x65,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x51,0xd1,0x67,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x51,0xd1,0x66,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x51,0xd1,0x6a,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x51,0xd1,0x67,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x51,0xd1,0x6b,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x51,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x51,0xd1,0x6c,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x51,0xd1,0x6b,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x51,0xd1,0x6d,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x51,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x51,0xd1,0x6e,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x51,0xd1,0x6d,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x51,0xd1,0x6f,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x51,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x51,0xd1,0x7b,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x51,0xd1,0x6f,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, m0
-// CHECK: [0x00,0x00,0x51,0xd1,0x7c,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x51,0xd1,0x7b,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x51,0xd1,0x7e,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, m0
+// CHECK: [0x05,0x00,0x51,0xd1,0x7c,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x51,0xd1,0x7f,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x51,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, 0
-// CHECK: [0x00,0x00,0x51,0xd1,0x80,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x51,0xd1,0x7f,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, -1
-// CHECK: [0x00,0x00,0x51,0xd1,0xc1,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, 0
+// CHECK: [0x05,0x00,0x51,0xd1,0x80,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x51,0xd1,0xf0,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, -1
+// CHECK: [0x05,0x00,0x51,0xd1,0xc1,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x51,0xd1,0xf7,0x00,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x51,0xd1,0xf0,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, v0
-// CHECK: [0x00,0x00,0x51,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x51,0xd1,0xf7,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte0_e64 v0, v255
-// CHECK: [0x00,0x00,0x51,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_f32_ubyte0_e64 v5, v1
+// CHECK: [0x05,0x00,0x51,0xd1,0x01,0x01,0x00,0x00]
 
-v_cvt_f32_ubyte1 v0, s0
-// CHECK: [0x00,0x24,0x00,0x7e]
+v_cvt_f32_ubyte0_e64 v5, v255
+// CHECK: [0x05,0x00,0x51,0xd1,0xff,0x01,0x00,0x00]
 
-v_cvt_f32_ubyte1 v255, s0
-// CHECK: [0x00,0x24,0xfe,0x7f]
+v_cvt_f32_ubyte1 v5, s1
+// CHECK: [0x01,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, s101
-// CHECK: [0x65,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v255, s1
+// CHECK: [0x01,0x24,0xfe,0x7f]
 
-v_cvt_f32_ubyte1 v0, flat_scratch_lo
-// CHECK: [0x66,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, s101
+// CHECK: [0x65,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, flat_scratch_hi
-// CHECK: [0x67,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, flat_scratch_lo
+// CHECK: [0x66,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, vcc_lo
-// CHECK: [0x6a,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, flat_scratch_hi
+// CHECK: [0x67,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, vcc_hi
-// CHECK: [0x6b,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, vcc_lo
+// CHECK: [0x6a,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, tba_lo
-// CHECK: [0x6c,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, vcc_hi
+// CHECK: [0x6b,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, tba_hi
-// CHECK: [0x6d,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, tba_lo
+// CHECK: [0x6c,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, tma_lo
-// CHECK: [0x6e,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, tba_hi
+// CHECK: [0x6d,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, tma_hi
-// CHECK: [0x6f,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, tma_lo
+// CHECK: [0x6e,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, ttmp11
-// CHECK: [0x7b,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, tma_hi
+// CHECK: [0x6f,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, m0
-// CHECK: [0x7c,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, ttmp11
+// CHECK: [0x7b,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, exec_lo
-// CHECK: [0x7e,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, m0
+// CHECK: [0x7c,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, exec_hi
-// CHECK: [0x7f,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, exec_lo
+// CHECK: [0x7e,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, 0
-// CHECK: [0x80,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, exec_hi
+// CHECK: [0x7f,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, -1
-// CHECK: [0xc1,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, 0
+// CHECK: [0x80,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, 0.5
-// CHECK: [0xf0,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, -1
+// CHECK: [0xc1,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, -4.0
-// CHECK: [0xf7,0x24,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, 0.5
+// CHECK: [0xf0,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, 0xaf123456
-// CHECK: [0xff,0x24,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f32_ubyte1 v5, -4.0
+// CHECK: [0xf7,0x24,0x0a,0x7e]
 
-v_cvt_f32_ubyte1 v0, 0x3f717273
-// CHECK: [0xff,0x24,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f32_ubyte1 v5, 0xaf123456
+// CHECK: [0xff,0x24,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f32_ubyte1 v0, v0
-// CHECK: [0x00,0x25,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, 0x3f717273
+// CHECK: [0xff,0x24,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f32_ubyte1 v0, v255
-// CHECK: [0xff,0x25,0x00,0x7e]
+v_cvt_f32_ubyte1 v5, v1
+// CHECK: [0x01,0x25,0x0a,0x7e]
 
-v_cvt_f32_ubyte1_e64 v0, s0
-// CHECK: [0x00,0x00,0x52,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f32_ubyte1 v5, v255
+// CHECK: [0xff,0x25,0x0a,0x7e]
 
-v_cvt_f32_ubyte1_e64 v255, s0
-// CHECK: [0xff,0x00,0x52,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, s1
+// CHECK: [0x05,0x00,0x52,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, s101
-// CHECK: [0x00,0x00,0x52,0xd1,0x65,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v255, s1
+// CHECK: [0xff,0x00,0x52,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x52,0xd1,0x66,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, s101
+// CHECK: [0x05,0x00,0x52,0xd1,0x65,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x52,0xd1,0x67,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x52,0xd1,0x66,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x52,0xd1,0x6a,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x52,0xd1,0x67,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x52,0xd1,0x6b,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x52,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x52,0xd1,0x6c,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x52,0xd1,0x6b,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x52,0xd1,0x6d,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x52,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x52,0xd1,0x6e,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x52,0xd1,0x6d,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x52,0xd1,0x6f,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x52,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x52,0xd1,0x7b,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x52,0xd1,0x6f,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, m0
-// CHECK: [0x00,0x00,0x52,0xd1,0x7c,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x52,0xd1,0x7b,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x52,0xd1,0x7e,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, m0
+// CHECK: [0x05,0x00,0x52,0xd1,0x7c,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x52,0xd1,0x7f,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x52,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, 0
-// CHECK: [0x00,0x00,0x52,0xd1,0x80,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x52,0xd1,0x7f,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, -1
-// CHECK: [0x00,0x00,0x52,0xd1,0xc1,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, 0
+// CHECK: [0x05,0x00,0x52,0xd1,0x80,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x52,0xd1,0xf0,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, -1
+// CHECK: [0x05,0x00,0x52,0xd1,0xc1,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x52,0xd1,0xf7,0x00,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x52,0xd1,0xf0,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, v0
-// CHECK: [0x00,0x00,0x52,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x52,0xd1,0xf7,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte1_e64 v0, v255
-// CHECK: [0x00,0x00,0x52,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_f32_ubyte1_e64 v5, v1
+// CHECK: [0x05,0x00,0x52,0xd1,0x01,0x01,0x00,0x00]
 
-v_cvt_f32_ubyte2 v0, s0
-// CHECK: [0x00,0x26,0x00,0x7e]
+v_cvt_f32_ubyte1_e64 v5, v255
+// CHECK: [0x05,0x00,0x52,0xd1,0xff,0x01,0x00,0x00]
 
-v_cvt_f32_ubyte2 v255, s0
-// CHECK: [0x00,0x26,0xfe,0x7f]
+v_cvt_f32_ubyte2 v5, s1
+// CHECK: [0x01,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, s101
-// CHECK: [0x65,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v255, s1
+// CHECK: [0x01,0x26,0xfe,0x7f]
 
-v_cvt_f32_ubyte2 v0, flat_scratch_lo
-// CHECK: [0x66,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, s101
+// CHECK: [0x65,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, flat_scratch_hi
-// CHECK: [0x67,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, flat_scratch_lo
+// CHECK: [0x66,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, vcc_lo
-// CHECK: [0x6a,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, flat_scratch_hi
+// CHECK: [0x67,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, vcc_hi
-// CHECK: [0x6b,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, vcc_lo
+// CHECK: [0x6a,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, tba_lo
-// CHECK: [0x6c,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, vcc_hi
+// CHECK: [0x6b,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, tba_hi
-// CHECK: [0x6d,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, tba_lo
+// CHECK: [0x6c,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, tma_lo
-// CHECK: [0x6e,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, tba_hi
+// CHECK: [0x6d,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, tma_hi
-// CHECK: [0x6f,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, tma_lo
+// CHECK: [0x6e,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, ttmp11
-// CHECK: [0x7b,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, tma_hi
+// CHECK: [0x6f,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, m0
-// CHECK: [0x7c,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, ttmp11
+// CHECK: [0x7b,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, exec_lo
-// CHECK: [0x7e,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, m0
+// CHECK: [0x7c,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, exec_hi
-// CHECK: [0x7f,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, exec_lo
+// CHECK: [0x7e,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, 0
-// CHECK: [0x80,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, exec_hi
+// CHECK: [0x7f,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, -1
-// CHECK: [0xc1,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, 0
+// CHECK: [0x80,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, 0.5
-// CHECK: [0xf0,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, -1
+// CHECK: [0xc1,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, -4.0
-// CHECK: [0xf7,0x26,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, 0.5
+// CHECK: [0xf0,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, 0xaf123456
-// CHECK: [0xff,0x26,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f32_ubyte2 v5, -4.0
+// CHECK: [0xf7,0x26,0x0a,0x7e]
 
-v_cvt_f32_ubyte2 v0, 0x3f717273
-// CHECK: [0xff,0x26,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f32_ubyte2 v5, 0xaf123456
+// CHECK: [0xff,0x26,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f32_ubyte2 v0, v0
-// CHECK: [0x00,0x27,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, 0x3f717273
+// CHECK: [0xff,0x26,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f32_ubyte2 v0, v255
-// CHECK: [0xff,0x27,0x00,0x7e]
+v_cvt_f32_ubyte2 v5, v1
+// CHECK: [0x01,0x27,0x0a,0x7e]
 
-v_cvt_f32_ubyte2_e64 v0, s0
-// CHECK: [0x00,0x00,0x53,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f32_ubyte2 v5, v255
+// CHECK: [0xff,0x27,0x0a,0x7e]
 
-v_cvt_f32_ubyte2_e64 v255, s0
-// CHECK: [0xff,0x00,0x53,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, s1
+// CHECK: [0x05,0x00,0x53,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, s101
-// CHECK: [0x00,0x00,0x53,0xd1,0x65,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v255, s1
+// CHECK: [0xff,0x00,0x53,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x53,0xd1,0x66,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, s101
+// CHECK: [0x05,0x00,0x53,0xd1,0x65,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x53,0xd1,0x67,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x53,0xd1,0x66,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x53,0xd1,0x6a,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x53,0xd1,0x67,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x53,0xd1,0x6b,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x53,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x53,0xd1,0x6c,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x53,0xd1,0x6b,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x53,0xd1,0x6d,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x53,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x53,0xd1,0x6e,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x53,0xd1,0x6d,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x53,0xd1,0x6f,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x53,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x53,0xd1,0x7b,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x53,0xd1,0x6f,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, m0
-// CHECK: [0x00,0x00,0x53,0xd1,0x7c,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x53,0xd1,0x7b,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x53,0xd1,0x7e,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, m0
+// CHECK: [0x05,0x00,0x53,0xd1,0x7c,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x53,0xd1,0x7f,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x53,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, 0
-// CHECK: [0x00,0x00,0x53,0xd1,0x80,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x53,0xd1,0x7f,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, -1
-// CHECK: [0x00,0x00,0x53,0xd1,0xc1,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, 0
+// CHECK: [0x05,0x00,0x53,0xd1,0x80,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x53,0xd1,0xf0,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, -1
+// CHECK: [0x05,0x00,0x53,0xd1,0xc1,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x53,0xd1,0xf7,0x00,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x53,0xd1,0xf0,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, v0
-// CHECK: [0x00,0x00,0x53,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x53,0xd1,0xf7,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte2_e64 v0, v255
-// CHECK: [0x00,0x00,0x53,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_f32_ubyte2_e64 v5, v1
+// CHECK: [0x05,0x00,0x53,0xd1,0x01,0x01,0x00,0x00]
 
-v_cvt_f32_ubyte3 v0, s0
-// CHECK: [0x00,0x28,0x00,0x7e]
+v_cvt_f32_ubyte2_e64 v5, v255
+// CHECK: [0x05,0x00,0x53,0xd1,0xff,0x01,0x00,0x00]
 
-v_cvt_f32_ubyte3 v255, s0
-// CHECK: [0x00,0x28,0xfe,0x7f]
+v_cvt_f32_ubyte3 v5, s1
+// CHECK: [0x01,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, s101
-// CHECK: [0x65,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v255, s1
+// CHECK: [0x01,0x28,0xfe,0x7f]
 
-v_cvt_f32_ubyte3 v0, flat_scratch_lo
-// CHECK: [0x66,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, s101
+// CHECK: [0x65,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, flat_scratch_hi
-// CHECK: [0x67,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, flat_scratch_lo
+// CHECK: [0x66,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, vcc_lo
-// CHECK: [0x6a,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, flat_scratch_hi
+// CHECK: [0x67,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, vcc_hi
-// CHECK: [0x6b,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, vcc_lo
+// CHECK: [0x6a,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, tba_lo
-// CHECK: [0x6c,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, vcc_hi
+// CHECK: [0x6b,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, tba_hi
-// CHECK: [0x6d,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, tba_lo
+// CHECK: [0x6c,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, tma_lo
-// CHECK: [0x6e,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, tba_hi
+// CHECK: [0x6d,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, tma_hi
-// CHECK: [0x6f,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, tma_lo
+// CHECK: [0x6e,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, ttmp11
-// CHECK: [0x7b,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, tma_hi
+// CHECK: [0x6f,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, m0
-// CHECK: [0x7c,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, ttmp11
+// CHECK: [0x7b,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, exec_lo
-// CHECK: [0x7e,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, m0
+// CHECK: [0x7c,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, exec_hi
-// CHECK: [0x7f,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, exec_lo
+// CHECK: [0x7e,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, 0
-// CHECK: [0x80,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, exec_hi
+// CHECK: [0x7f,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, -1
-// CHECK: [0xc1,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, 0
+// CHECK: [0x80,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, 0.5
-// CHECK: [0xf0,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, -1
+// CHECK: [0xc1,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, -4.0
-// CHECK: [0xf7,0x28,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, 0.5
+// CHECK: [0xf0,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, 0xaf123456
-// CHECK: [0xff,0x28,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f32_ubyte3 v5, -4.0
+// CHECK: [0xf7,0x28,0x0a,0x7e]
 
-v_cvt_f32_ubyte3 v0, 0x3f717273
-// CHECK: [0xff,0x28,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f32_ubyte3 v5, 0xaf123456
+// CHECK: [0xff,0x28,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f32_ubyte3 v0, v0
-// CHECK: [0x00,0x29,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, 0x3f717273
+// CHECK: [0xff,0x28,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f32_ubyte3 v0, v255
-// CHECK: [0xff,0x29,0x00,0x7e]
+v_cvt_f32_ubyte3 v5, v1
+// CHECK: [0x01,0x29,0x0a,0x7e]
 
-v_cvt_f32_ubyte3_e64 v0, s0
-// CHECK: [0x00,0x00,0x54,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f32_ubyte3 v5, v255
+// CHECK: [0xff,0x29,0x0a,0x7e]
 
-v_cvt_f32_ubyte3_e64 v255, s0
-// CHECK: [0xff,0x00,0x54,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, s1
+// CHECK: [0x05,0x00,0x54,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, s101
-// CHECK: [0x00,0x00,0x54,0xd1,0x65,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v255, s1
+// CHECK: [0xff,0x00,0x54,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x54,0xd1,0x66,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, s101
+// CHECK: [0x05,0x00,0x54,0xd1,0x65,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x54,0xd1,0x67,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x54,0xd1,0x66,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x54,0xd1,0x6a,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x54,0xd1,0x67,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x54,0xd1,0x6b,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x54,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x54,0xd1,0x6c,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x54,0xd1,0x6b,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x54,0xd1,0x6d,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x54,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x54,0xd1,0x6e,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x54,0xd1,0x6d,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x54,0xd1,0x6f,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x54,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x54,0xd1,0x7b,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x54,0xd1,0x6f,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, m0
-// CHECK: [0x00,0x00,0x54,0xd1,0x7c,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x54,0xd1,0x7b,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x54,0xd1,0x7e,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, m0
+// CHECK: [0x05,0x00,0x54,0xd1,0x7c,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x54,0xd1,0x7f,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x54,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, 0
-// CHECK: [0x00,0x00,0x54,0xd1,0x80,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x54,0xd1,0x7f,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, -1
-// CHECK: [0x00,0x00,0x54,0xd1,0xc1,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, 0
+// CHECK: [0x05,0x00,0x54,0xd1,0x80,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x54,0xd1,0xf0,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, -1
+// CHECK: [0x05,0x00,0x54,0xd1,0xc1,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x54,0xd1,0xf7,0x00,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x54,0xd1,0xf0,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, v0
-// CHECK: [0x00,0x00,0x54,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x54,0xd1,0xf7,0x00,0x00,0x00]
 
-v_cvt_f32_ubyte3_e64 v0, v255
-// CHECK: [0x00,0x00,0x54,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_f32_ubyte3_e64 v5, v1
+// CHECK: [0x05,0x00,0x54,0xd1,0x01,0x01,0x00,0x00]
 
-v_cvt_u32_f64 v0, s[0:1]
-// CHECK: [0x00,0x2a,0x00,0x7e]
+v_cvt_f32_ubyte3_e64 v5, v255
+// CHECK: [0x05,0x00,0x54,0xd1,0xff,0x01,0x00,0x00]
 
-v_cvt_u32_f64 v255, s[0:1]
-// CHECK: [0x00,0x2a,0xfe,0x7f]
+v_cvt_u32_f64 v5, s[2:3]
+// CHECK: [0x02,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, s[2:3]
-// CHECK: [0x02,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v255, s[2:3]
+// CHECK: [0x02,0x2a,0xfe,0x7f]
 
-v_cvt_u32_f64 v0, s[100:101]
-// CHECK: [0x64,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, s[4:5]
+// CHECK: [0x04,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, flat_scratch
-// CHECK: [0x66,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, s[100:101]
+// CHECK: [0x64,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, vcc
-// CHECK: [0x6a,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, flat_scratch
+// CHECK: [0x66,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, tba
-// CHECK: [0x6c,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, vcc
+// CHECK: [0x6a,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, tma
-// CHECK: [0x6e,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, tba
+// CHECK: [0x6c,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, ttmp[10:11]
-// CHECK: [0x7a,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, tma
+// CHECK: [0x6e,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, exec
-// CHECK: [0x7e,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, ttmp[10:11]
+// CHECK: [0x7a,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, 0
-// CHECK: [0x80,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, exec
+// CHECK: [0x7e,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, -1
-// CHECK: [0xc1,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, 0
+// CHECK: [0x80,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, 0.5
-// CHECK: [0xf0,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, -1
+// CHECK: [0xc1,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, -4.0
-// CHECK: [0xf7,0x2a,0x00,0x7e]
+v_cvt_u32_f64 v5, 0.5
+// CHECK: [0xf0,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, 0xaf123456
-// CHECK: [0xff,0x2a,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_u32_f64 v5, -4.0
+// CHECK: [0xf7,0x2a,0x0a,0x7e]
 
-v_cvt_u32_f64 v0, 0x3f717273
-// CHECK: [0xff,0x2a,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_u32_f64 v5, 0xaf123456
+// CHECK: [0xff,0x2a,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_u32_f64 v0, v[0:1]
-// CHECK: [0x00,0x2b,0x00,0x7e]
+v_cvt_u32_f64 v5, 0x3f717273
+// CHECK: [0xff,0x2a,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_u32_f64 v0, v[254:255]
-// CHECK: [0xfe,0x2b,0x00,0x7e]
+v_cvt_u32_f64 v5, v[1:2]
+// CHECK: [0x01,0x2b,0x0a,0x7e]
 
-v_cvt_u32_f64_e64 v0, s[0:1]
-// CHECK: [0x00,0x00,0x55,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_u32_f64 v5, v[254:255]
+// CHECK: [0xfe,0x2b,0x0a,0x7e]
 
-v_cvt_u32_f64_e64 v255, s[0:1]
-// CHECK: [0xff,0x00,0x55,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, s[2:3]
+// CHECK: [0x05,0x00,0x55,0xd1,0x02,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, s[2:3]
-// CHECK: [0x00,0x00,0x55,0xd1,0x02,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v255, s[2:3]
+// CHECK: [0xff,0x00,0x55,0xd1,0x02,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, s[100:101]
-// CHECK: [0x00,0x00,0x55,0xd1,0x64,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, s[4:5]
+// CHECK: [0x05,0x00,0x55,0xd1,0x04,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, flat_scratch
-// CHECK: [0x00,0x00,0x55,0xd1,0x66,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, s[100:101]
+// CHECK: [0x05,0x00,0x55,0xd1,0x64,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, vcc
-// CHECK: [0x00,0x00,0x55,0xd1,0x6a,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, flat_scratch
+// CHECK: [0x05,0x00,0x55,0xd1,0x66,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, tba
-// CHECK: [0x00,0x00,0x55,0xd1,0x6c,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, vcc
+// CHECK: [0x05,0x00,0x55,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, tma
-// CHECK: [0x00,0x00,0x55,0xd1,0x6e,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, tba
+// CHECK: [0x05,0x00,0x55,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, ttmp[10:11]
-// CHECK: [0x00,0x00,0x55,0xd1,0x7a,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, tma
+// CHECK: [0x05,0x00,0x55,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, exec
-// CHECK: [0x00,0x00,0x55,0xd1,0x7e,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, ttmp[10:11]
+// CHECK: [0x05,0x00,0x55,0xd1,0x7a,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, scc
-// CHECK: [0x00,0x00,0x55,0xd1,0xfd,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, exec
+// CHECK: [0x05,0x00,0x55,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x55,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_u32_f64_e64 v5, scc
+// CHECK: [0x05,0x00,0x55,0xd1,0xfd,0x00,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, v[254:255]
-// CHECK: [0x00,0x00,0x55,0xd1,0xfe,0x01,0x00,0x00]
+v_cvt_u32_f64_e64 v5, v[1:2]
+// CHECK: [0x05,0x00,0x55,0xd1,0x01,0x01,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, -s[0:1]
-// CHECK: [0x00,0x00,0x55,0xd1,0x00,0x00,0x00,0x20]
+v_cvt_u32_f64_e64 v5, v[254:255]
+// CHECK: [0x05,0x00,0x55,0xd1,0xfe,0x01,0x00,0x00]
 
-v_cvt_u32_f64_e64 v0, |s[0:1]|
-// CHECK: [0x00,0x01,0x55,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, -s[2:3]
+// CHECK: [0x05,0x00,0x55,0xd1,0x02,0x00,0x00,0x20]
 
-v_cvt_u32_f64_e64 v0, s[0:1] clamp
-// CHECK: [0x00,0x80,0x55,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_u32_f64_e64 v5, |s[2:3]|
+// CHECK: [0x05,0x01,0x55,0xd1,0x02,0x00,0x00,0x00]
 
-v_cvt_f64_u32 v[0:1], s0
-// CHECK: [0x00,0x2c,0x00,0x7e]
+v_cvt_u32_f64_e64 v5, s[2:3] clamp
+// CHECK: [0x05,0x80,0x55,0xd1,0x02,0x00,0x00,0x00]
 
-v_cvt_f64_u32 v[254:255], s0
-// CHECK: [0x00,0x2c,0xfc,0x7f]
+v_cvt_f64_u32 v[5:6], s1
+// CHECK: [0x01,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], s101
-// CHECK: [0x65,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[254:255], s1
+// CHECK: [0x01,0x2c,0xfc,0x7f]
 
-v_cvt_f64_u32 v[0:1], flat_scratch_lo
-// CHECK: [0x66,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], s101
+// CHECK: [0x65,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], flat_scratch_hi
-// CHECK: [0x67,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], flat_scratch_lo
+// CHECK: [0x66,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], vcc_lo
-// CHECK: [0x6a,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], flat_scratch_hi
+// CHECK: [0x67,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], vcc_hi
-// CHECK: [0x6b,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], vcc_lo
+// CHECK: [0x6a,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], tba_lo
-// CHECK: [0x6c,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], vcc_hi
+// CHECK: [0x6b,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], tba_hi
-// CHECK: [0x6d,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], tba_lo
+// CHECK: [0x6c,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], tma_lo
-// CHECK: [0x6e,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], tba_hi
+// CHECK: [0x6d,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], tma_hi
-// CHECK: [0x6f,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], tma_lo
+// CHECK: [0x6e,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], ttmp11
-// CHECK: [0x7b,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], tma_hi
+// CHECK: [0x6f,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], m0
-// CHECK: [0x7c,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], ttmp11
+// CHECK: [0x7b,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], exec_lo
-// CHECK: [0x7e,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], m0
+// CHECK: [0x7c,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], exec_hi
-// CHECK: [0x7f,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], exec_lo
+// CHECK: [0x7e,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], 0
-// CHECK: [0x80,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], exec_hi
+// CHECK: [0x7f,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], -1
-// CHECK: [0xc1,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], 0
+// CHECK: [0x80,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], 0.5
-// CHECK: [0xf0,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], -1
+// CHECK: [0xc1,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], -4.0
-// CHECK: [0xf7,0x2c,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], 0.5
+// CHECK: [0xf0,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], 0xaf123456
-// CHECK: [0xff,0x2c,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cvt_f64_u32 v[5:6], -4.0
+// CHECK: [0xf7,0x2c,0x0a,0x7e]
 
-v_cvt_f64_u32 v[0:1], 0x3f717273
-// CHECK: [0xff,0x2c,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cvt_f64_u32 v[5:6], 0xaf123456
+// CHECK: [0xff,0x2c,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cvt_f64_u32 v[0:1], v0
-// CHECK: [0x00,0x2d,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], 0x3f717273
+// CHECK: [0xff,0x2c,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cvt_f64_u32 v[0:1], v255
-// CHECK: [0xff,0x2d,0x00,0x7e]
+v_cvt_f64_u32 v[5:6], v1
+// CHECK: [0x01,0x2d,0x0a,0x7e]
 
-v_cvt_f64_u32_e64 v[0:1], s0
-// CHECK: [0x00,0x00,0x56,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f64_u32 v[5:6], v255
+// CHECK: [0xff,0x2d,0x0a,0x7e]
 
-v_cvt_f64_u32_e64 v[254:255], s0
-// CHECK: [0xfe,0x00,0x56,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], s1
+// CHECK: [0x05,0x00,0x56,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], s101
-// CHECK: [0x00,0x00,0x56,0xd1,0x65,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[254:255], s1
+// CHECK: [0xfe,0x00,0x56,0xd1,0x01,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], flat_scratch_lo
-// CHECK: [0x00,0x00,0x56,0xd1,0x66,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], s101
+// CHECK: [0x05,0x00,0x56,0xd1,0x65,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], flat_scratch_hi
-// CHECK: [0x00,0x00,0x56,0xd1,0x67,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], flat_scratch_lo
+// CHECK: [0x05,0x00,0x56,0xd1,0x66,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], vcc_lo
-// CHECK: [0x00,0x00,0x56,0xd1,0x6a,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], flat_scratch_hi
+// CHECK: [0x05,0x00,0x56,0xd1,0x67,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], vcc_hi
-// CHECK: [0x00,0x00,0x56,0xd1,0x6b,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], vcc_lo
+// CHECK: [0x05,0x00,0x56,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], tba_lo
-// CHECK: [0x00,0x00,0x56,0xd1,0x6c,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], vcc_hi
+// CHECK: [0x05,0x00,0x56,0xd1,0x6b,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], tba_hi
-// CHECK: [0x00,0x00,0x56,0xd1,0x6d,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], tba_lo
+// CHECK: [0x05,0x00,0x56,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], tma_lo
-// CHECK: [0x00,0x00,0x56,0xd1,0x6e,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], tba_hi
+// CHECK: [0x05,0x00,0x56,0xd1,0x6d,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], tma_hi
-// CHECK: [0x00,0x00,0x56,0xd1,0x6f,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], tma_lo
+// CHECK: [0x05,0x00,0x56,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], ttmp11
-// CHECK: [0x00,0x00,0x56,0xd1,0x7b,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], tma_hi
+// CHECK: [0x05,0x00,0x56,0xd1,0x6f,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], m0
-// CHECK: [0x00,0x00,0x56,0xd1,0x7c,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], ttmp11
+// CHECK: [0x05,0x00,0x56,0xd1,0x7b,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], exec_lo
-// CHECK: [0x00,0x00,0x56,0xd1,0x7e,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], m0
+// CHECK: [0x05,0x00,0x56,0xd1,0x7c,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], exec_hi
-// CHECK: [0x00,0x00,0x56,0xd1,0x7f,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], exec_lo
+// CHECK: [0x05,0x00,0x56,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], 0
-// CHECK: [0x00,0x00,0x56,0xd1,0x80,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], exec_hi
+// CHECK: [0x05,0x00,0x56,0xd1,0x7f,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], -1
-// CHECK: [0x00,0x00,0x56,0xd1,0xc1,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], 0
+// CHECK: [0x05,0x00,0x56,0xd1,0x80,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], 0.5
-// CHECK: [0x00,0x00,0x56,0xd1,0xf0,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], -1
+// CHECK: [0x05,0x00,0x56,0xd1,0xc1,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], -4.0
-// CHECK: [0x00,0x00,0x56,0xd1,0xf7,0x00,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], 0.5
+// CHECK: [0x05,0x00,0x56,0xd1,0xf0,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], v0
-// CHECK: [0x00,0x00,0x56,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], -4.0
+// CHECK: [0x05,0x00,0x56,0xd1,0xf7,0x00,0x00,0x00]
 
-v_cvt_f64_u32_e64 v[0:1], v255
-// CHECK: [0x00,0x00,0x56,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_f64_u32_e64 v[5:6], v1
+// CHECK: [0x05,0x00,0x56,0xd1,0x01,0x01,0x00,0x00]
 
-v_trunc_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x2e,0x00,0x7e]
+v_cvt_f64_u32_e64 v[5:6], v255
+// CHECK: [0x05,0x00,0x56,0xd1,0xff,0x01,0x00,0x00]
 
-v_trunc_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x2e,0xfc,0x7f]
+v_trunc_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x2e,0x00,0x7e]
+v_trunc_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x2e,0xfc,0x7f]
 
-v_trunc_f64 v[0:1], s[100:101]
-// CHECK: [0x64,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], flat_scratch
-// CHECK: [0x66,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], s[100:101]
+// CHECK: [0x64,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], vcc
-// CHECK: [0x6a,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], flat_scratch
+// CHECK: [0x66,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], tba
-// CHECK: [0x6c,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], vcc
+// CHECK: [0x6a,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], tma
-// CHECK: [0x6e,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], tba
+// CHECK: [0x6c,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], tma
+// CHECK: [0x6e,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], exec
-// CHECK: [0x7e,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], 0
-// CHECK: [0x80,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], exec
+// CHECK: [0x7e,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], -1
-// CHECK: [0xc1,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], 0
+// CHECK: [0x80,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], -1
+// CHECK: [0xc1,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x2e,0x00,0x7e]
+v_trunc_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x2e,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_trunc_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x2e,0x0a,0x7e]
 
-v_trunc_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x2e,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_trunc_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x2e,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_trunc_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x2f,0x00,0x7e]
+v_trunc_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x2e,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_trunc_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x2f,0x00,0x7e]
+v_trunc_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x2f,0x0a,0x7e]
 
-v_trunc_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x57,0xd1,0x00,0x00,0x00,0x00]
+v_trunc_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x2f,0x0a,0x7e]
 
-v_trunc_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x57,0xd1,0x00,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x57,0xd1,0x02,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x57,0xd1,0x02,0x00,0x00,0x00]
+v_trunc_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x57,0xd1,0x02,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], s[100:101]
-// CHECK: [0x00,0x00,0x57,0xd1,0x64,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x57,0xd1,0x04,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x57,0xd1,0x66,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], s[100:101]
+// CHECK: [0x05,0x00,0x57,0xd1,0x64,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x57,0xd1,0x6a,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x57,0xd1,0x66,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x57,0xd1,0x6c,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x57,0xd1,0x6a,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x57,0xd1,0x6e,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x57,0xd1,0x6c,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x57,0xd1,0x7a,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x57,0xd1,0x6e,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x57,0xd1,0x7e,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x57,0xd1,0x7a,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x57,0xd1,0xfd,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x57,0xd1,0x7e,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x57,0xd1,0x00,0x01,0x00,0x00]
+v_trunc_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x57,0xd1,0xfd,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x57,0xd1,0xfe,0x01,0x00,0x00]
+v_trunc_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x57,0xd1,0x01,0x01,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x57,0xd1,0x00,0x00,0x00,0x20]
+v_trunc_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x57,0xd1,0xfe,0x01,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], |s[0:1]|
-// CHECK: [0x00,0x01,0x57,0xd1,0x00,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x57,0xd1,0x02,0x00,0x00,0x20]
 
-v_trunc_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x57,0xd1,0x00,0x00,0x00,0x00]
+v_trunc_f64_e64 v[5:6], |s[2:3]|
+// CHECK: [0x05,0x01,0x57,0xd1,0x02,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x57,0xd1,0x00,0x00,0x00,0x08]
+v_trunc_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x80,0x57,0xd1,0x02,0x00,0x00,0x00]
 
-v_trunc_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x57,0xd1,0x00,0x00,0x00,0x10]
+v_trunc_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x57,0xd1,0x02,0x00,0x00,0x08]
 
-v_trunc_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x57,0xd1,0x00,0x00,0x00,0x18]
+v_trunc_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x57,0xd1,0x02,0x00,0x00,0x10]
 
-v_ceil_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x30,0x00,0x7e]
+v_trunc_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x57,0xd1,0x02,0x00,0x00,0x18]
 
-v_ceil_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x30,0xfc,0x7f]
+v_ceil_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x30,0x00,0x7e]
+v_ceil_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x30,0xfc,0x7f]
 
-v_ceil_f64 v[0:1], s[100:101]
-// CHECK: [0x64,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], flat_scratch
-// CHECK: [0x66,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], s[100:101]
+// CHECK: [0x64,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], vcc
-// CHECK: [0x6a,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], flat_scratch
+// CHECK: [0x66,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], tba
-// CHECK: [0x6c,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], vcc
+// CHECK: [0x6a,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], tma
-// CHECK: [0x6e,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], tba
+// CHECK: [0x6c,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], tma
+// CHECK: [0x6e,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], exec
-// CHECK: [0x7e,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], 0
-// CHECK: [0x80,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], exec
+// CHECK: [0x7e,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], -1
-// CHECK: [0xc1,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], 0
+// CHECK: [0x80,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], -1
+// CHECK: [0xc1,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x30,0x00,0x7e]
+v_ceil_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x30,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_ceil_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x30,0x0a,0x7e]
 
-v_ceil_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x30,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_ceil_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x30,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_ceil_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x31,0x00,0x7e]
+v_ceil_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x30,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_ceil_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x31,0x00,0x7e]
+v_ceil_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x31,0x0a,0x7e]
 
-v_ceil_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x58,0xd1,0x00,0x00,0x00,0x00]
+v_ceil_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x31,0x0a,0x7e]
 
-v_ceil_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x58,0xd1,0x00,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x58,0xd1,0x02,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x58,0xd1,0x02,0x00,0x00,0x00]
+v_ceil_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x58,0xd1,0x02,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], s[100:101]
-// CHECK: [0x00,0x00,0x58,0xd1,0x64,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x58,0xd1,0x04,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x58,0xd1,0x66,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], s[100:101]
+// CHECK: [0x05,0x00,0x58,0xd1,0x64,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x58,0xd1,0x6a,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x58,0xd1,0x66,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x58,0xd1,0x6c,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x58,0xd1,0x6a,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x58,0xd1,0x6e,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x58,0xd1,0x6c,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x58,0xd1,0x7a,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x58,0xd1,0x6e,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x58,0xd1,0x7e,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x58,0xd1,0x7a,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x58,0xd1,0xfd,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x58,0xd1,0x7e,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x58,0xd1,0x00,0x01,0x00,0x00]
+v_ceil_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x58,0xd1,0xfd,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x58,0xd1,0xfe,0x01,0x00,0x00]
+v_ceil_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x58,0xd1,0x01,0x01,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x58,0xd1,0x00,0x00,0x00,0x20]
+v_ceil_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x58,0xd1,0xfe,0x01,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], |s[0:1]|
-// CHECK: [0x00,0x01,0x58,0xd1,0x00,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x58,0xd1,0x02,0x00,0x00,0x20]
 
-v_ceil_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x58,0xd1,0x00,0x00,0x00,0x00]
+v_ceil_f64_e64 v[5:6], |s[2:3]|
+// CHECK: [0x05,0x01,0x58,0xd1,0x02,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x58,0xd1,0x00,0x00,0x00,0x08]
+v_ceil_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x80,0x58,0xd1,0x02,0x00,0x00,0x00]
 
-v_ceil_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x58,0xd1,0x00,0x00,0x00,0x10]
+v_ceil_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x58,0xd1,0x02,0x00,0x00,0x08]
 
-v_ceil_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x58,0xd1,0x00,0x00,0x00,0x18]
+v_ceil_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x58,0xd1,0x02,0x00,0x00,0x10]
 
-v_rndne_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x32,0x00,0x7e]
+v_ceil_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x58,0xd1,0x02,0x00,0x00,0x18]
 
-v_rndne_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x32,0xfc,0x7f]
+v_rndne_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x32,0x00,0x7e]
+v_rndne_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x32,0xfc,0x7f]
 
-v_rndne_f64 v[0:1], s[100:101]
-// CHECK: [0x64,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], flat_scratch
-// CHECK: [0x66,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], s[100:101]
+// CHECK: [0x64,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], vcc
-// CHECK: [0x6a,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], flat_scratch
+// CHECK: [0x66,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], tba
-// CHECK: [0x6c,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], vcc
+// CHECK: [0x6a,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], tma
-// CHECK: [0x6e,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], tba
+// CHECK: [0x6c,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], tma
+// CHECK: [0x6e,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], exec
-// CHECK: [0x7e,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], 0
-// CHECK: [0x80,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], exec
+// CHECK: [0x7e,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], -1
-// CHECK: [0xc1,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], 0
+// CHECK: [0x80,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], -1
+// CHECK: [0xc1,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x32,0x00,0x7e]
+v_rndne_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x32,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rndne_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x32,0x0a,0x7e]
 
-v_rndne_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x32,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rndne_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x32,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rndne_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x33,0x00,0x7e]
+v_rndne_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x32,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rndne_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x33,0x00,0x7e]
+v_rndne_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x33,0x0a,0x7e]
 
-v_rndne_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x59,0xd1,0x00,0x00,0x00,0x00]
+v_rndne_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x33,0x0a,0x7e]
 
-v_rndne_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x59,0xd1,0x00,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x59,0xd1,0x02,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x59,0xd1,0x02,0x00,0x00,0x00]
+v_rndne_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x59,0xd1,0x02,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], s[100:101]
-// CHECK: [0x00,0x00,0x59,0xd1,0x64,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x59,0xd1,0x04,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x59,0xd1,0x66,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], s[100:101]
+// CHECK: [0x05,0x00,0x59,0xd1,0x64,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x59,0xd1,0x6a,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x59,0xd1,0x66,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x59,0xd1,0x6c,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x59,0xd1,0x6a,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x59,0xd1,0x6e,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x59,0xd1,0x6c,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x59,0xd1,0x7a,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x59,0xd1,0x6e,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x59,0xd1,0x7e,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x59,0xd1,0x7a,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], 0
-// CHECK: [0x00,0x00,0x59,0xd1,0x80,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x59,0xd1,0x7e,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], 0.5
-// CHECK: [0x00,0x00,0x59,0xd1,0xf0,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], 0
+// CHECK: [0x05,0x00,0x59,0xd1,0x80,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x59,0xd1,0xfd,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], 0.5
+// CHECK: [0x05,0x00,0x59,0xd1,0xf0,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x59,0xd1,0x00,0x01,0x00,0x00]
+v_rndne_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x59,0xd1,0xfd,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x59,0xd1,0xfe,0x01,0x00,0x00]
+v_rndne_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x59,0xd1,0x01,0x01,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x59,0xd1,0x00,0x00,0x00,0x20]
+v_rndne_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x59,0xd1,0xfe,0x01,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x59,0xd1,0x00,0x00,0x00,0x00]
+v_rndne_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x59,0xd1,0x02,0x00,0x00,0x20]
 
-v_rndne_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x59,0xd1,0x00,0x00,0x00,0x08]
+v_rndne_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x80,0x59,0xd1,0x02,0x00,0x00,0x00]
 
-v_rndne_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x59,0xd1,0x00,0x00,0x00,0x10]
+v_rndne_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x59,0xd1,0x02,0x00,0x00,0x08]
 
-v_rndne_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x59,0xd1,0x00,0x00,0x00,0x18]
+v_rndne_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x59,0xd1,0x02,0x00,0x00,0x10]
 
-v_floor_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x34,0x00,0x7e]
+v_rndne_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x59,0xd1,0x02,0x00,0x00,0x18]
 
-v_floor_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x34,0xfc,0x7f]
+v_floor_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x34,0x00,0x7e]
+v_floor_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x34,0xfc,0x7f]
 
-v_floor_f64 v[0:1], s[100:101]
-// CHECK: [0x64,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], flat_scratch
-// CHECK: [0x66,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], s[100:101]
+// CHECK: [0x64,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], vcc
-// CHECK: [0x6a,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], flat_scratch
+// CHECK: [0x66,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], tba
-// CHECK: [0x6c,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], vcc
+// CHECK: [0x6a,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], tma
-// CHECK: [0x6e,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], tba
+// CHECK: [0x6c,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], tma
+// CHECK: [0x6e,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], exec
-// CHECK: [0x7e,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], 0
-// CHECK: [0x80,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], exec
+// CHECK: [0x7e,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], -1
-// CHECK: [0xc1,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], 0
+// CHECK: [0x80,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], -1
+// CHECK: [0xc1,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x34,0x00,0x7e]
+v_floor_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x34,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_floor_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x34,0x0a,0x7e]
 
-v_floor_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x34,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_floor_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x34,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_floor_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x35,0x00,0x7e]
+v_floor_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x34,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_floor_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x35,0x00,0x7e]
+v_floor_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x35,0x0a,0x7e]
 
-v_floor_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x5a,0xd1,0x00,0x00,0x00,0x00]
+v_floor_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x35,0x0a,0x7e]
 
-v_floor_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x5a,0xd1,0x00,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x5a,0xd1,0x02,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x5a,0xd1,0x02,0x00,0x00,0x00]
+v_floor_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x5a,0xd1,0x02,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], s[100:101]
-// CHECK: [0x00,0x00,0x5a,0xd1,0x64,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x5a,0xd1,0x04,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x5a,0xd1,0x66,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], s[100:101]
+// CHECK: [0x05,0x00,0x5a,0xd1,0x64,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x5a,0xd1,0x6a,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x5a,0xd1,0x66,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x5a,0xd1,0x6c,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x5a,0xd1,0x6a,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x5a,0xd1,0x6e,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x5a,0xd1,0x6c,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x5a,0xd1,0x7a,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x5a,0xd1,0x6e,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x5a,0xd1,0x7e,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x5a,0xd1,0x7a,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], 0
-// CHECK: [0x00,0x00,0x5a,0xd1,0x80,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x5a,0xd1,0x7e,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], 0.5
-// CHECK: [0x00,0x00,0x5a,0xd1,0xf0,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], 0
+// CHECK: [0x05,0x00,0x5a,0xd1,0x80,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x5a,0xd1,0xfd,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], 0.5
+// CHECK: [0x05,0x00,0x5a,0xd1,0xf0,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x5a,0xd1,0x00,0x01,0x00,0x00]
+v_floor_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x5a,0xd1,0xfd,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x5a,0xd1,0xfe,0x01,0x00,0x00]
+v_floor_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x5a,0xd1,0x01,0x01,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x5a,0xd1,0x00,0x00,0x00,0x20]
+v_floor_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x5a,0xd1,0xfe,0x01,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x5a,0xd1,0x00,0x00,0x00,0x00]
+v_floor_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x5a,0xd1,0x02,0x00,0x00,0x20]
 
-v_floor_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x5a,0xd1,0x00,0x00,0x00,0x08]
+v_floor_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x80,0x5a,0xd1,0x02,0x00,0x00,0x00]
 
-v_floor_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x5a,0xd1,0x00,0x00,0x00,0x10]
+v_floor_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x5a,0xd1,0x02,0x00,0x00,0x08]
 
-v_floor_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x5a,0xd1,0x00,0x00,0x00,0x18]
+v_floor_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x5a,0xd1,0x02,0x00,0x00,0x10]
 
-v_fract_f32 v0, s0
-// CHECK: [0x00,0x36,0x00,0x7e]
+v_floor_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x5a,0xd1,0x02,0x00,0x00,0x18]
 
-v_fract_f32 v255, s0
-// CHECK: [0x00,0x36,0xfe,0x7f]
+v_fract_f32 v5, s1
+// CHECK: [0x01,0x36,0x0a,0x7e]
 
-v_fract_f32 v0, s101
-// CHECK: [0x65,0x36,0x00,0x7e]
+v_fract_f32 v255, s1
+// CHECK: [0x01,0x36,0xfe,0x7f]
 
-v_fract_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x36,0x00,0x7e]
+v_fract_f32 v5, s101
+// CHECK: [0x65,0x36,0x0a,0x7e]
 
-v_fract_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x36,0x00,0x7e]
+v_fract_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x36,0x0a,0x7e]
 
-v_fract_f32 v0, vcc_lo
-// CHECK: [0x6a,0x36,0x00,0x7e]
+v_fract_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x36,0x0a,0x7e]
 
-v_fract_f32 v0, vcc_hi
-// CHECK: [0x6b,0x36,0x00,0x7e]
+v_fract_f32 v5, vcc_lo
+// CHECK: [0x6a,0x36,0x0a,0x7e]
 
-v_fract_f32 v0, tba_lo
-// CHECK: [0x6c,0x36,0x00,0x7e]
+v_fract_f32 v5, vcc_hi
+// CHECK: [0x6b,0x36,0x0a,0x7e]
 
-v_fract_f32 v0, tba_hi
-// CHECK: [0x6d,0x36,0x00,0x7e]
+v_fract_f32 v5, tba_lo
+// CHECK: [0x6c,0x36,0x0a,0x7e]
 
-v_fract_f32 v0, tma_lo
-// CHECK: [0x6e,0x36,0x00,0x7e]
+v_fract_f32 v5, tba_hi
+// CHECK: [0x6d,0x36,0x0a,0x7e]
 
-v_fract_f32 v0, tma_hi
-// CHECK: [0x6f,0x36,0x00,0x7e]
+v_fract_f32 v5, tma_lo
+// CHECK: [0x6e,0x36,0x0a,0x7e]
 
-v_fract_f32 v0, ttmp11
-// CHECK: [0x7b,0x36,0x00,0x7e]
+v_fract_f32 v5, tma_hi
+// CHECK: [0x6f,0x36,0x0a,0x7e]
 
-v_fract_f32 v0, m0
-// CHECK: [0x7c,0x36,0x00,0x7e]
+v_fract_f32 v5, ttmp11
+// CHECK: [0x7b,0x36,0x0a,0x7e]
 
-v_fract_f32 v0, exec_lo
-// CHECK: [0x7e,0x36,0x00,0x7e]
+v_fract_f32 v5, m0
+// CHECK: [0x7c,0x36,0x0a,0x7e]
 
-v_fract_f32 v0, exec_hi
-// CHECK: [0x7f,0x36,0x00,0x7e]
+v_fract_f32 v5, exec_lo
+// CHECK: [0x7e,0x36,0x0a,0x7e]
 
-v_fract_f32 v0, 0
-// CHECK: [0x80,0x36,0x00,0x7e]
+v_fract_f32 v5, exec_hi
+// CHECK: [0x7f,0x36,0x0a,0x7e]
 
-v_fract_f32 v0, -1
-// CHECK: [0xc1,0x36,0x00,0x7e]
+v_fract_f32 v5, 0
+// CHECK: [0x80,0x36,0x0a,0x7e]
 
-v_fract_f32 v0, 0.5
-// CHECK: [0xf0,0x36,0x00,0x7e]
+v_fract_f32 v5, -1
+// CHECK: [0xc1,0x36,0x0a,0x7e]
 
-v_fract_f32 v0, -4.0
-// CHECK: [0xf7,0x36,0x00,0x7e]
+v_fract_f32 v5, 0.5
+// CHECK: [0xf0,0x36,0x0a,0x7e]
 
-v_fract_f32 v0, 0xaf123456
-// CHECK: [0xff,0x36,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_fract_f32 v5, -4.0
+// CHECK: [0xf7,0x36,0x0a,0x7e]
 
-v_fract_f32 v0, 0x3f717273
-// CHECK: [0xff,0x36,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_fract_f32 v5, 0xaf123456
+// CHECK: [0xff,0x36,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_fract_f32 v0, v0
-// CHECK: [0x00,0x37,0x00,0x7e]
+v_fract_f32 v5, 0x3f717273
+// CHECK: [0xff,0x36,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_fract_f32 v0, v255
-// CHECK: [0xff,0x37,0x00,0x7e]
+v_fract_f32 v5, v1
+// CHECK: [0x01,0x37,0x0a,0x7e]
 
-v_fract_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x5b,0xd1,0x00,0x00,0x00,0x00]
+v_fract_f32 v5, v255
+// CHECK: [0xff,0x37,0x0a,0x7e]
 
-v_fract_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x5b,0xd1,0x00,0x00,0x00,0x00]
+v_fract_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x5b,0xd1,0x01,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x5b,0xd1,0x65,0x00,0x00,0x00]
+v_fract_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x5b,0xd1,0x01,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x5b,0xd1,0x66,0x00,0x00,0x00]
+v_fract_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x5b,0xd1,0x65,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x5b,0xd1,0x67,0x00,0x00,0x00]
+v_fract_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x5b,0xd1,0x66,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x5b,0xd1,0x6a,0x00,0x00,0x00]
+v_fract_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x5b,0xd1,0x67,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x5b,0xd1,0x6b,0x00,0x00,0x00]
+v_fract_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x5b,0xd1,0x6a,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x5b,0xd1,0x6c,0x00,0x00,0x00]
+v_fract_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x5b,0xd1,0x6b,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x5b,0xd1,0x6d,0x00,0x00,0x00]
+v_fract_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x5b,0xd1,0x6c,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x5b,0xd1,0x6e,0x00,0x00,0x00]
+v_fract_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x5b,0xd1,0x6d,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x5b,0xd1,0x6f,0x00,0x00,0x00]
+v_fract_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x5b,0xd1,0x6e,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x5b,0xd1,0x7b,0x00,0x00,0x00]
+v_fract_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x5b,0xd1,0x6f,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x5b,0xd1,0x7c,0x00,0x00,0x00]
+v_fract_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x5b,0xd1,0x7b,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x5b,0xd1,0x7e,0x00,0x00,0x00]
+v_fract_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x5b,0xd1,0x7c,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x5b,0xd1,0x7f,0x00,0x00,0x00]
+v_fract_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x5b,0xd1,0x7e,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, 0
-// CHECK: [0x00,0x00,0x5b,0xd1,0x80,0x00,0x00,0x00]
+v_fract_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x5b,0xd1,0x7f,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x5b,0xd1,0xf0,0x00,0x00,0x00]
+v_fract_f32_e64 v5, 0
+// CHECK: [0x05,0x00,0x5b,0xd1,0x80,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x5b,0xd1,0xfd,0x00,0x00,0x00]
+v_fract_f32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x5b,0xd1,0xf0,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x5b,0xd1,0x00,0x01,0x00,0x00]
+v_fract_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x5b,0xd1,0xfd,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x5b,0xd1,0xff,0x01,0x00,0x00]
+v_fract_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x5b,0xd1,0x01,0x01,0x00,0x00]
 
-v_fract_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x5b,0xd1,0x00,0x00,0x00,0x20]
+v_fract_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x5b,0xd1,0xff,0x01,0x00,0x00]
 
-v_fract_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x5b,0xd1,0x00,0x00,0x00,0x00]
+v_fract_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x5b,0xd1,0x01,0x00,0x00,0x20]
 
-v_fract_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x5b,0xd1,0x00,0x00,0x00,0x08]
+v_fract_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x5b,0xd1,0x01,0x00,0x00,0x00]
 
-v_fract_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x5b,0xd1,0x00,0x00,0x00,0x10]
+v_fract_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x5b,0xd1,0x01,0x00,0x00,0x08]
 
-v_fract_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x5b,0xd1,0x00,0x00,0x00,0x18]
+v_fract_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x5b,0xd1,0x01,0x00,0x00,0x10]
 
-v_trunc_f32 v0, s0
-// CHECK: [0x00,0x38,0x00,0x7e]
+v_fract_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x5b,0xd1,0x01,0x00,0x00,0x18]
 
-v_trunc_f32 v255, s0
-// CHECK: [0x00,0x38,0xfe,0x7f]
+v_trunc_f32 v5, s1
+// CHECK: [0x01,0x38,0x0a,0x7e]
 
-v_trunc_f32 v0, s101
-// CHECK: [0x65,0x38,0x00,0x7e]
+v_trunc_f32 v255, s1
+// CHECK: [0x01,0x38,0xfe,0x7f]
 
-v_trunc_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x38,0x00,0x7e]
+v_trunc_f32 v5, s101
+// CHECK: [0x65,0x38,0x0a,0x7e]
 
-v_trunc_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x38,0x00,0x7e]
+v_trunc_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x38,0x0a,0x7e]
 
-v_trunc_f32 v0, vcc_lo
-// CHECK: [0x6a,0x38,0x00,0x7e]
+v_trunc_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x38,0x0a,0x7e]
 
-v_trunc_f32 v0, vcc_hi
-// CHECK: [0x6b,0x38,0x00,0x7e]
+v_trunc_f32 v5, vcc_lo
+// CHECK: [0x6a,0x38,0x0a,0x7e]
 
-v_trunc_f32 v0, tba_lo
-// CHECK: [0x6c,0x38,0x00,0x7e]
+v_trunc_f32 v5, vcc_hi
+// CHECK: [0x6b,0x38,0x0a,0x7e]
 
-v_trunc_f32 v0, tba_hi
-// CHECK: [0x6d,0x38,0x00,0x7e]
+v_trunc_f32 v5, tba_lo
+// CHECK: [0x6c,0x38,0x0a,0x7e]
 
-v_trunc_f32 v0, tma_lo
-// CHECK: [0x6e,0x38,0x00,0x7e]
+v_trunc_f32 v5, tba_hi
+// CHECK: [0x6d,0x38,0x0a,0x7e]
 
-v_trunc_f32 v0, tma_hi
-// CHECK: [0x6f,0x38,0x00,0x7e]
+v_trunc_f32 v5, tma_lo
+// CHECK: [0x6e,0x38,0x0a,0x7e]
 
-v_trunc_f32 v0, ttmp11
-// CHECK: [0x7b,0x38,0x00,0x7e]
+v_trunc_f32 v5, tma_hi
+// CHECK: [0x6f,0x38,0x0a,0x7e]
 
-v_trunc_f32 v0, m0
-// CHECK: [0x7c,0x38,0x00,0x7e]
+v_trunc_f32 v5, ttmp11
+// CHECK: [0x7b,0x38,0x0a,0x7e]
 
-v_trunc_f32 v0, exec_lo
-// CHECK: [0x7e,0x38,0x00,0x7e]
+v_trunc_f32 v5, m0
+// CHECK: [0x7c,0x38,0x0a,0x7e]
 
-v_trunc_f32 v0, exec_hi
-// CHECK: [0x7f,0x38,0x00,0x7e]
+v_trunc_f32 v5, exec_lo
+// CHECK: [0x7e,0x38,0x0a,0x7e]
 
-v_trunc_f32 v0, 0
-// CHECK: [0x80,0x38,0x00,0x7e]
+v_trunc_f32 v5, exec_hi
+// CHECK: [0x7f,0x38,0x0a,0x7e]
 
-v_trunc_f32 v0, -1
-// CHECK: [0xc1,0x38,0x00,0x7e]
+v_trunc_f32 v5, 0
+// CHECK: [0x80,0x38,0x0a,0x7e]
 
-v_trunc_f32 v0, 0.5
-// CHECK: [0xf0,0x38,0x00,0x7e]
+v_trunc_f32 v5, -1
+// CHECK: [0xc1,0x38,0x0a,0x7e]
 
-v_trunc_f32 v0, -4.0
-// CHECK: [0xf7,0x38,0x00,0x7e]
+v_trunc_f32 v5, 0.5
+// CHECK: [0xf0,0x38,0x0a,0x7e]
 
-v_trunc_f32 v0, 0xaf123456
-// CHECK: [0xff,0x38,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_trunc_f32 v5, -4.0
+// CHECK: [0xf7,0x38,0x0a,0x7e]
 
-v_trunc_f32 v0, 0x3f717273
-// CHECK: [0xff,0x38,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_trunc_f32 v5, 0xaf123456
+// CHECK: [0xff,0x38,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_trunc_f32 v0, v0
-// CHECK: [0x00,0x39,0x00,0x7e]
+v_trunc_f32 v5, 0x3f717273
+// CHECK: [0xff,0x38,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_trunc_f32 v0, v255
-// CHECK: [0xff,0x39,0x00,0x7e]
+v_trunc_f32 v5, v1
+// CHECK: [0x01,0x39,0x0a,0x7e]
 
-v_trunc_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x5c,0xd1,0x00,0x00,0x00,0x00]
+v_trunc_f32 v5, v255
+// CHECK: [0xff,0x39,0x0a,0x7e]
 
-v_trunc_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x5c,0xd1,0x00,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x5c,0xd1,0x01,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x5c,0xd1,0x65,0x00,0x00,0x00]
+v_trunc_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x5c,0xd1,0x01,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x5c,0xd1,0x66,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x5c,0xd1,0x65,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x5c,0xd1,0x67,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x5c,0xd1,0x66,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x5c,0xd1,0x6a,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x5c,0xd1,0x67,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x5c,0xd1,0x6b,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x5c,0xd1,0x6a,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x5c,0xd1,0x6c,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x5c,0xd1,0x6b,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x5c,0xd1,0x6d,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x5c,0xd1,0x6c,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x5c,0xd1,0x6e,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x5c,0xd1,0x6d,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x5c,0xd1,0x6f,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x5c,0xd1,0x6e,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x5c,0xd1,0x7b,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x5c,0xd1,0x6f,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x5c,0xd1,0x7c,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x5c,0xd1,0x7b,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x5c,0xd1,0x7e,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x5c,0xd1,0x7c,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x5c,0xd1,0x7f,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x5c,0xd1,0x7e,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, 0
-// CHECK: [0x00,0x00,0x5c,0xd1,0x80,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x5c,0xd1,0x7f,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x5c,0xd1,0xf0,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, 0
+// CHECK: [0x05,0x00,0x5c,0xd1,0x80,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x5c,0xd1,0xfd,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x5c,0xd1,0xf0,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x5c,0xd1,0x00,0x01,0x00,0x00]
+v_trunc_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x5c,0xd1,0xfd,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x5c,0xd1,0xff,0x01,0x00,0x00]
+v_trunc_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x5c,0xd1,0x01,0x01,0x00,0x00]
 
-v_trunc_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x5c,0xd1,0x00,0x00,0x00,0x20]
+v_trunc_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x5c,0xd1,0xff,0x01,0x00,0x00]
 
-v_trunc_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x5c,0xd1,0x00,0x00,0x00,0x00]
+v_trunc_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x5c,0xd1,0x01,0x00,0x00,0x20]
 
-v_trunc_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x5c,0xd1,0x00,0x00,0x00,0x08]
+v_trunc_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x5c,0xd1,0x01,0x00,0x00,0x00]
 
-v_trunc_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x5c,0xd1,0x00,0x00,0x00,0x10]
+v_trunc_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x5c,0xd1,0x01,0x00,0x00,0x08]
 
-v_trunc_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x5c,0xd1,0x00,0x00,0x00,0x18]
+v_trunc_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x5c,0xd1,0x01,0x00,0x00,0x10]
 
-v_ceil_f32 v0, s0
-// CHECK: [0x00,0x3a,0x00,0x7e]
+v_trunc_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x5c,0xd1,0x01,0x00,0x00,0x18]
 
-v_ceil_f32 v255, s0
-// CHECK: [0x00,0x3a,0xfe,0x7f]
+v_ceil_f32 v5, s1
+// CHECK: [0x01,0x3a,0x0a,0x7e]
 
-v_ceil_f32 v0, s101
-// CHECK: [0x65,0x3a,0x00,0x7e]
+v_ceil_f32 v255, s1
+// CHECK: [0x01,0x3a,0xfe,0x7f]
 
-v_ceil_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x3a,0x00,0x7e]
+v_ceil_f32 v5, s101
+// CHECK: [0x65,0x3a,0x0a,0x7e]
 
-v_ceil_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x3a,0x00,0x7e]
+v_ceil_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x3a,0x0a,0x7e]
 
-v_ceil_f32 v0, vcc_lo
-// CHECK: [0x6a,0x3a,0x00,0x7e]
+v_ceil_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x3a,0x0a,0x7e]
 
-v_ceil_f32 v0, vcc_hi
-// CHECK: [0x6b,0x3a,0x00,0x7e]
+v_ceil_f32 v5, vcc_lo
+// CHECK: [0x6a,0x3a,0x0a,0x7e]
 
-v_ceil_f32 v0, tba_lo
-// CHECK: [0x6c,0x3a,0x00,0x7e]
+v_ceil_f32 v5, vcc_hi
+// CHECK: [0x6b,0x3a,0x0a,0x7e]
 
-v_ceil_f32 v0, tba_hi
-// CHECK: [0x6d,0x3a,0x00,0x7e]
+v_ceil_f32 v5, tba_lo
+// CHECK: [0x6c,0x3a,0x0a,0x7e]
 
-v_ceil_f32 v0, tma_lo
-// CHECK: [0x6e,0x3a,0x00,0x7e]
+v_ceil_f32 v5, tba_hi
+// CHECK: [0x6d,0x3a,0x0a,0x7e]
 
-v_ceil_f32 v0, tma_hi
-// CHECK: [0x6f,0x3a,0x00,0x7e]
+v_ceil_f32 v5, tma_lo
+// CHECK: [0x6e,0x3a,0x0a,0x7e]
 
-v_ceil_f32 v0, ttmp11
-// CHECK: [0x7b,0x3a,0x00,0x7e]
+v_ceil_f32 v5, tma_hi
+// CHECK: [0x6f,0x3a,0x0a,0x7e]
 
-v_ceil_f32 v0, m0
-// CHECK: [0x7c,0x3a,0x00,0x7e]
+v_ceil_f32 v5, ttmp11
+// CHECK: [0x7b,0x3a,0x0a,0x7e]
 
-v_ceil_f32 v0, exec_lo
-// CHECK: [0x7e,0x3a,0x00,0x7e]
+v_ceil_f32 v5, m0
+// CHECK: [0x7c,0x3a,0x0a,0x7e]
 
-v_ceil_f32 v0, exec_hi
-// CHECK: [0x7f,0x3a,0x00,0x7e]
+v_ceil_f32 v5, exec_lo
+// CHECK: [0x7e,0x3a,0x0a,0x7e]
 
-v_ceil_f32 v0, 0
-// CHECK: [0x80,0x3a,0x00,0x7e]
+v_ceil_f32 v5, exec_hi
+// CHECK: [0x7f,0x3a,0x0a,0x7e]
 
-v_ceil_f32 v0, -1
-// CHECK: [0xc1,0x3a,0x00,0x7e]
+v_ceil_f32 v5, 0
+// CHECK: [0x80,0x3a,0x0a,0x7e]
 
-v_ceil_f32 v0, 0.5
-// CHECK: [0xf0,0x3a,0x00,0x7e]
+v_ceil_f32 v5, -1
+// CHECK: [0xc1,0x3a,0x0a,0x7e]
 
-v_ceil_f32 v0, -4.0
-// CHECK: [0xf7,0x3a,0x00,0x7e]
+v_ceil_f32 v5, 0.5
+// CHECK: [0xf0,0x3a,0x0a,0x7e]
 
-v_ceil_f32 v0, 0xaf123456
-// CHECK: [0xff,0x3a,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_ceil_f32 v5, -4.0
+// CHECK: [0xf7,0x3a,0x0a,0x7e]
 
-v_ceil_f32 v0, 0x3f717273
-// CHECK: [0xff,0x3a,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_ceil_f32 v5, 0xaf123456
+// CHECK: [0xff,0x3a,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_ceil_f32 v0, v0
-// CHECK: [0x00,0x3b,0x00,0x7e]
+v_ceil_f32 v5, 0x3f717273
+// CHECK: [0xff,0x3a,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_ceil_f32 v0, v255
-// CHECK: [0xff,0x3b,0x00,0x7e]
+v_ceil_f32 v5, v1
+// CHECK: [0x01,0x3b,0x0a,0x7e]
 
-v_ceil_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x5d,0xd1,0x00,0x00,0x00,0x00]
+v_ceil_f32 v5, v255
+// CHECK: [0xff,0x3b,0x0a,0x7e]
 
-v_ceil_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x5d,0xd1,0x00,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x5d,0xd1,0x01,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x5d,0xd1,0x65,0x00,0x00,0x00]
+v_ceil_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x5d,0xd1,0x01,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x5d,0xd1,0x66,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x5d,0xd1,0x65,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x5d,0xd1,0x67,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x5d,0xd1,0x66,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x5d,0xd1,0x6a,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x5d,0xd1,0x67,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x5d,0xd1,0x6b,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x5d,0xd1,0x6a,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x5d,0xd1,0x6c,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x5d,0xd1,0x6b,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x5d,0xd1,0x6d,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x5d,0xd1,0x6c,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x5d,0xd1,0x6e,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x5d,0xd1,0x6d,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x5d,0xd1,0x6f,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x5d,0xd1,0x6e,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x5d,0xd1,0x7b,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x5d,0xd1,0x6f,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x5d,0xd1,0x7c,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x5d,0xd1,0x7b,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x5d,0xd1,0x7e,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x5d,0xd1,0x7c,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x5d,0xd1,0x7f,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x5d,0xd1,0x7e,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, 0
-// CHECK: [0x00,0x00,0x5d,0xd1,0x80,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x5d,0xd1,0x7f,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x5d,0xd1,0xf0,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, 0
+// CHECK: [0x05,0x00,0x5d,0xd1,0x80,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x5d,0xd1,0xfd,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x5d,0xd1,0xf0,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x5d,0xd1,0x00,0x01,0x00,0x00]
+v_ceil_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x5d,0xd1,0xfd,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x5d,0xd1,0xff,0x01,0x00,0x00]
+v_ceil_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x5d,0xd1,0x01,0x01,0x00,0x00]
 
-v_ceil_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x5d,0xd1,0x00,0x00,0x00,0x20]
+v_ceil_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x5d,0xd1,0xff,0x01,0x00,0x00]
 
-v_ceil_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x5d,0xd1,0x00,0x00,0x00,0x00]
+v_ceil_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x5d,0xd1,0x01,0x00,0x00,0x20]
 
-v_ceil_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x5d,0xd1,0x00,0x00,0x00,0x08]
+v_ceil_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x5d,0xd1,0x01,0x00,0x00,0x00]
 
-v_ceil_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x5d,0xd1,0x00,0x00,0x00,0x10]
+v_ceil_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x5d,0xd1,0x01,0x00,0x00,0x08]
 
-v_ceil_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x5d,0xd1,0x00,0x00,0x00,0x18]
+v_ceil_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x5d,0xd1,0x01,0x00,0x00,0x10]
 
-v_rndne_f32 v0, s0
-// CHECK: [0x00,0x3c,0x00,0x7e]
+v_ceil_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x5d,0xd1,0x01,0x00,0x00,0x18]
 
-v_rndne_f32 v255, s0
-// CHECK: [0x00,0x3c,0xfe,0x7f]
+v_rndne_f32 v5, s1
+// CHECK: [0x01,0x3c,0x0a,0x7e]
 
-v_rndne_f32 v0, s101
-// CHECK: [0x65,0x3c,0x00,0x7e]
+v_rndne_f32 v255, s1
+// CHECK: [0x01,0x3c,0xfe,0x7f]
 
-v_rndne_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x3c,0x00,0x7e]
+v_rndne_f32 v5, s101
+// CHECK: [0x65,0x3c,0x0a,0x7e]
 
-v_rndne_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x3c,0x00,0x7e]
+v_rndne_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x3c,0x0a,0x7e]
 
-v_rndne_f32 v0, vcc_lo
-// CHECK: [0x6a,0x3c,0x00,0x7e]
+v_rndne_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x3c,0x0a,0x7e]
 
-v_rndne_f32 v0, vcc_hi
-// CHECK: [0x6b,0x3c,0x00,0x7e]
+v_rndne_f32 v5, vcc_lo
+// CHECK: [0x6a,0x3c,0x0a,0x7e]
 
-v_rndne_f32 v0, tba_lo
-// CHECK: [0x6c,0x3c,0x00,0x7e]
+v_rndne_f32 v5, vcc_hi
+// CHECK: [0x6b,0x3c,0x0a,0x7e]
 
-v_rndne_f32 v0, tba_hi
-// CHECK: [0x6d,0x3c,0x00,0x7e]
+v_rndne_f32 v5, tba_lo
+// CHECK: [0x6c,0x3c,0x0a,0x7e]
 
-v_rndne_f32 v0, tma_lo
-// CHECK: [0x6e,0x3c,0x00,0x7e]
+v_rndne_f32 v5, tba_hi
+// CHECK: [0x6d,0x3c,0x0a,0x7e]
 
-v_rndne_f32 v0, tma_hi
-// CHECK: [0x6f,0x3c,0x00,0x7e]
+v_rndne_f32 v5, tma_lo
+// CHECK: [0x6e,0x3c,0x0a,0x7e]
 
-v_rndne_f32 v0, ttmp11
-// CHECK: [0x7b,0x3c,0x00,0x7e]
+v_rndne_f32 v5, tma_hi
+// CHECK: [0x6f,0x3c,0x0a,0x7e]
 
-v_rndne_f32 v0, m0
-// CHECK: [0x7c,0x3c,0x00,0x7e]
+v_rndne_f32 v5, ttmp11
+// CHECK: [0x7b,0x3c,0x0a,0x7e]
 
-v_rndne_f32 v0, exec_lo
-// CHECK: [0x7e,0x3c,0x00,0x7e]
+v_rndne_f32 v5, m0
+// CHECK: [0x7c,0x3c,0x0a,0x7e]
 
-v_rndne_f32 v0, exec_hi
-// CHECK: [0x7f,0x3c,0x00,0x7e]
+v_rndne_f32 v5, exec_lo
+// CHECK: [0x7e,0x3c,0x0a,0x7e]
 
-v_rndne_f32 v0, 0
-// CHECK: [0x80,0x3c,0x00,0x7e]
+v_rndne_f32 v5, exec_hi
+// CHECK: [0x7f,0x3c,0x0a,0x7e]
 
-v_rndne_f32 v0, -1
-// CHECK: [0xc1,0x3c,0x00,0x7e]
+v_rndne_f32 v5, 0
+// CHECK: [0x80,0x3c,0x0a,0x7e]
 
-v_rndne_f32 v0, 0.5
-// CHECK: [0xf0,0x3c,0x00,0x7e]
+v_rndne_f32 v5, -1
+// CHECK: [0xc1,0x3c,0x0a,0x7e]
 
-v_rndne_f32 v0, -4.0
-// CHECK: [0xf7,0x3c,0x00,0x7e]
+v_rndne_f32 v5, 0.5
+// CHECK: [0xf0,0x3c,0x0a,0x7e]
 
-v_rndne_f32 v0, 0xaf123456
-// CHECK: [0xff,0x3c,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rndne_f32 v5, -4.0
+// CHECK: [0xf7,0x3c,0x0a,0x7e]
 
-v_rndne_f32 v0, 0x3f717273
-// CHECK: [0xff,0x3c,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rndne_f32 v5, 0xaf123456
+// CHECK: [0xff,0x3c,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rndne_f32 v0, v0
-// CHECK: [0x00,0x3d,0x00,0x7e]
+v_rndne_f32 v5, 0x3f717273
+// CHECK: [0xff,0x3c,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rndne_f32 v0, v255
-// CHECK: [0xff,0x3d,0x00,0x7e]
+v_rndne_f32 v5, v1
+// CHECK: [0x01,0x3d,0x0a,0x7e]
 
-v_rndne_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x5e,0xd1,0x00,0x00,0x00,0x00]
+v_rndne_f32 v5, v255
+// CHECK: [0xff,0x3d,0x0a,0x7e]
 
-v_rndne_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x5e,0xd1,0x00,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x5e,0xd1,0x01,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x5e,0xd1,0x65,0x00,0x00,0x00]
+v_rndne_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x5e,0xd1,0x01,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x5e,0xd1,0x66,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x5e,0xd1,0x65,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x5e,0xd1,0x67,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x5e,0xd1,0x66,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x5e,0xd1,0x6a,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x5e,0xd1,0x67,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x5e,0xd1,0x6b,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x5e,0xd1,0x6a,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x5e,0xd1,0x6c,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x5e,0xd1,0x6b,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x5e,0xd1,0x6d,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x5e,0xd1,0x6c,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x5e,0xd1,0x6e,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x5e,0xd1,0x6d,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x5e,0xd1,0x6f,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x5e,0xd1,0x6e,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x5e,0xd1,0x7b,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x5e,0xd1,0x6f,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x5e,0xd1,0x7c,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x5e,0xd1,0x7b,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x5e,0xd1,0x7e,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x5e,0xd1,0x7c,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x5e,0xd1,0x7f,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x5e,0xd1,0x7e,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, 0
-// CHECK: [0x00,0x00,0x5e,0xd1,0x80,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x5e,0xd1,0x7f,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x5e,0xd1,0xf0,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, 0
+// CHECK: [0x05,0x00,0x5e,0xd1,0x80,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x5e,0xd1,0xfd,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x5e,0xd1,0xf0,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x5e,0xd1,0x00,0x01,0x00,0x00]
+v_rndne_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x5e,0xd1,0xfd,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x5e,0xd1,0xff,0x01,0x00,0x00]
+v_rndne_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x5e,0xd1,0x01,0x01,0x00,0x00]
 
-v_rndne_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x5e,0xd1,0x00,0x00,0x00,0x20]
+v_rndne_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x5e,0xd1,0xff,0x01,0x00,0x00]
 
-v_rndne_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x5e,0xd1,0x00,0x00,0x00,0x00]
+v_rndne_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x5e,0xd1,0x01,0x00,0x00,0x20]
 
-v_rndne_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x5e,0xd1,0x00,0x00,0x00,0x08]
+v_rndne_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x5e,0xd1,0x01,0x00,0x00,0x00]
 
-v_rndne_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x5e,0xd1,0x00,0x00,0x00,0x10]
+v_rndne_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x5e,0xd1,0x01,0x00,0x00,0x08]
 
-v_rndne_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x5e,0xd1,0x00,0x00,0x00,0x18]
+v_rndne_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x5e,0xd1,0x01,0x00,0x00,0x10]
 
-v_floor_f32 v0, s0
-// CHECK: [0x00,0x3e,0x00,0x7e]
+v_rndne_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x5e,0xd1,0x01,0x00,0x00,0x18]
 
-v_floor_f32 v255, s0
-// CHECK: [0x00,0x3e,0xfe,0x7f]
+v_floor_f32 v5, s1
+// CHECK: [0x01,0x3e,0x0a,0x7e]
 
-v_floor_f32 v0, s101
-// CHECK: [0x65,0x3e,0x00,0x7e]
+v_floor_f32 v255, s1
+// CHECK: [0x01,0x3e,0xfe,0x7f]
 
-v_floor_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x3e,0x00,0x7e]
+v_floor_f32 v5, s101
+// CHECK: [0x65,0x3e,0x0a,0x7e]
 
-v_floor_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x3e,0x00,0x7e]
+v_floor_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x3e,0x0a,0x7e]
 
-v_floor_f32 v0, vcc_lo
-// CHECK: [0x6a,0x3e,0x00,0x7e]
+v_floor_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x3e,0x0a,0x7e]
 
-v_floor_f32 v0, vcc_hi
-// CHECK: [0x6b,0x3e,0x00,0x7e]
+v_floor_f32 v5, vcc_lo
+// CHECK: [0x6a,0x3e,0x0a,0x7e]
 
-v_floor_f32 v0, tba_lo
-// CHECK: [0x6c,0x3e,0x00,0x7e]
+v_floor_f32 v5, vcc_hi
+// CHECK: [0x6b,0x3e,0x0a,0x7e]
 
-v_floor_f32 v0, tba_hi
-// CHECK: [0x6d,0x3e,0x00,0x7e]
+v_floor_f32 v5, tba_lo
+// CHECK: [0x6c,0x3e,0x0a,0x7e]
 
-v_floor_f32 v0, tma_lo
-// CHECK: [0x6e,0x3e,0x00,0x7e]
+v_floor_f32 v5, tba_hi
+// CHECK: [0x6d,0x3e,0x0a,0x7e]
 
-v_floor_f32 v0, tma_hi
-// CHECK: [0x6f,0x3e,0x00,0x7e]
+v_floor_f32 v5, tma_lo
+// CHECK: [0x6e,0x3e,0x0a,0x7e]
 
-v_floor_f32 v0, ttmp11
-// CHECK: [0x7b,0x3e,0x00,0x7e]
+v_floor_f32 v5, tma_hi
+// CHECK: [0x6f,0x3e,0x0a,0x7e]
 
-v_floor_f32 v0, m0
-// CHECK: [0x7c,0x3e,0x00,0x7e]
+v_floor_f32 v5, ttmp11
+// CHECK: [0x7b,0x3e,0x0a,0x7e]
 
-v_floor_f32 v0, exec_lo
-// CHECK: [0x7e,0x3e,0x00,0x7e]
+v_floor_f32 v5, m0
+// CHECK: [0x7c,0x3e,0x0a,0x7e]
 
-v_floor_f32 v0, exec_hi
-// CHECK: [0x7f,0x3e,0x00,0x7e]
+v_floor_f32 v5, exec_lo
+// CHECK: [0x7e,0x3e,0x0a,0x7e]
 
-v_floor_f32 v0, 0
-// CHECK: [0x80,0x3e,0x00,0x7e]
+v_floor_f32 v5, exec_hi
+// CHECK: [0x7f,0x3e,0x0a,0x7e]
 
-v_floor_f32 v0, -1
-// CHECK: [0xc1,0x3e,0x00,0x7e]
+v_floor_f32 v5, 0
+// CHECK: [0x80,0x3e,0x0a,0x7e]
 
-v_floor_f32 v0, 0.5
-// CHECK: [0xf0,0x3e,0x00,0x7e]
+v_floor_f32 v5, -1
+// CHECK: [0xc1,0x3e,0x0a,0x7e]
 
-v_floor_f32 v0, -4.0
-// CHECK: [0xf7,0x3e,0x00,0x7e]
+v_floor_f32 v5, 0.5
+// CHECK: [0xf0,0x3e,0x0a,0x7e]
 
-v_floor_f32 v0, 0xaf123456
-// CHECK: [0xff,0x3e,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_floor_f32 v5, -4.0
+// CHECK: [0xf7,0x3e,0x0a,0x7e]
 
-v_floor_f32 v0, 0x3f717273
-// CHECK: [0xff,0x3e,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_floor_f32 v5, 0xaf123456
+// CHECK: [0xff,0x3e,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_floor_f32 v0, v0
-// CHECK: [0x00,0x3f,0x00,0x7e]
+v_floor_f32 v5, 0x3f717273
+// CHECK: [0xff,0x3e,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_floor_f32 v0, v255
-// CHECK: [0xff,0x3f,0x00,0x7e]
+v_floor_f32 v5, v1
+// CHECK: [0x01,0x3f,0x0a,0x7e]
 
-v_floor_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x5f,0xd1,0x00,0x00,0x00,0x00]
+v_floor_f32 v5, v255
+// CHECK: [0xff,0x3f,0x0a,0x7e]
 
-v_floor_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x5f,0xd1,0x00,0x00,0x00,0x00]
+v_floor_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x5f,0xd1,0x01,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x5f,0xd1,0x65,0x00,0x00,0x00]
+v_floor_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x5f,0xd1,0x01,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x5f,0xd1,0x66,0x00,0x00,0x00]
+v_floor_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x5f,0xd1,0x65,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x5f,0xd1,0x67,0x00,0x00,0x00]
+v_floor_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x5f,0xd1,0x66,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x5f,0xd1,0x6a,0x00,0x00,0x00]
+v_floor_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x5f,0xd1,0x67,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x5f,0xd1,0x6b,0x00,0x00,0x00]
+v_floor_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x5f,0xd1,0x6a,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x5f,0xd1,0x6c,0x00,0x00,0x00]
+v_floor_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x5f,0xd1,0x6b,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x5f,0xd1,0x6d,0x00,0x00,0x00]
+v_floor_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x5f,0xd1,0x6c,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x5f,0xd1,0x6e,0x00,0x00,0x00]
+v_floor_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x5f,0xd1,0x6d,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x5f,0xd1,0x6f,0x00,0x00,0x00]
+v_floor_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x5f,0xd1,0x6e,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x5f,0xd1,0x7b,0x00,0x00,0x00]
+v_floor_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x5f,0xd1,0x6f,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x5f,0xd1,0x7c,0x00,0x00,0x00]
+v_floor_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x5f,0xd1,0x7b,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x5f,0xd1,0x7e,0x00,0x00,0x00]
+v_floor_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x5f,0xd1,0x7c,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x5f,0xd1,0x7f,0x00,0x00,0x00]
+v_floor_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x5f,0xd1,0x7e,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x5f,0xd1,0xfd,0x00,0x00,0x00]
+v_floor_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x5f,0xd1,0x7f,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x5f,0xd1,0x00,0x01,0x00,0x00]
+v_floor_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x5f,0xd1,0xfd,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x5f,0xd1,0xff,0x01,0x00,0x00]
+v_floor_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x5f,0xd1,0x01,0x01,0x00,0x00]
 
-v_floor_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x5f,0xd1,0x00,0x00,0x00,0x20]
+v_floor_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x5f,0xd1,0xff,0x01,0x00,0x00]
 
-v_floor_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x5f,0xd1,0x00,0x00,0x00,0x00]
+v_floor_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x5f,0xd1,0x01,0x00,0x00,0x20]
 
-v_floor_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x5f,0xd1,0x00,0x00,0x00,0x00]
+v_floor_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x5f,0xd1,0x01,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x5f,0xd1,0x00,0x00,0x00,0x08]
+v_floor_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x5f,0xd1,0x01,0x00,0x00,0x00]
 
-v_floor_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x5f,0xd1,0x00,0x00,0x00,0x10]
+v_floor_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x5f,0xd1,0x01,0x00,0x00,0x08]
 
-v_floor_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x5f,0xd1,0x00,0x00,0x00,0x18]
+v_floor_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x5f,0xd1,0x01,0x00,0x00,0x10]
 
-v_exp_f32 v0, s0
-// CHECK: [0x00,0x40,0x00,0x7e]
+v_floor_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x5f,0xd1,0x01,0x00,0x00,0x18]
 
-v_exp_f32 v255, s0
-// CHECK: [0x00,0x40,0xfe,0x7f]
+v_exp_f32 v5, s1
+// CHECK: [0x01,0x40,0x0a,0x7e]
 
-v_exp_f32 v0, s101
-// CHECK: [0x65,0x40,0x00,0x7e]
+v_exp_f32 v255, s1
+// CHECK: [0x01,0x40,0xfe,0x7f]
 
-v_exp_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x40,0x00,0x7e]
+v_exp_f32 v5, s101
+// CHECK: [0x65,0x40,0x0a,0x7e]
 
-v_exp_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x40,0x00,0x7e]
+v_exp_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x40,0x0a,0x7e]
 
-v_exp_f32 v0, vcc_lo
-// CHECK: [0x6a,0x40,0x00,0x7e]
+v_exp_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x40,0x0a,0x7e]
 
-v_exp_f32 v0, vcc_hi
-// CHECK: [0x6b,0x40,0x00,0x7e]
+v_exp_f32 v5, vcc_lo
+// CHECK: [0x6a,0x40,0x0a,0x7e]
 
-v_exp_f32 v0, tba_lo
-// CHECK: [0x6c,0x40,0x00,0x7e]
+v_exp_f32 v5, vcc_hi
+// CHECK: [0x6b,0x40,0x0a,0x7e]
 
-v_exp_f32 v0, tba_hi
-// CHECK: [0x6d,0x40,0x00,0x7e]
+v_exp_f32 v5, tba_lo
+// CHECK: [0x6c,0x40,0x0a,0x7e]
 
-v_exp_f32 v0, tma_lo
-// CHECK: [0x6e,0x40,0x00,0x7e]
+v_exp_f32 v5, tba_hi
+// CHECK: [0x6d,0x40,0x0a,0x7e]
 
-v_exp_f32 v0, tma_hi
-// CHECK: [0x6f,0x40,0x00,0x7e]
+v_exp_f32 v5, tma_lo
+// CHECK: [0x6e,0x40,0x0a,0x7e]
 
-v_exp_f32 v0, ttmp11
-// CHECK: [0x7b,0x40,0x00,0x7e]
+v_exp_f32 v5, tma_hi
+// CHECK: [0x6f,0x40,0x0a,0x7e]
 
-v_exp_f32 v0, m0
-// CHECK: [0x7c,0x40,0x00,0x7e]
+v_exp_f32 v5, ttmp11
+// CHECK: [0x7b,0x40,0x0a,0x7e]
 
-v_exp_f32 v0, exec_lo
-// CHECK: [0x7e,0x40,0x00,0x7e]
+v_exp_f32 v5, m0
+// CHECK: [0x7c,0x40,0x0a,0x7e]
 
-v_exp_f32 v0, exec_hi
-// CHECK: [0x7f,0x40,0x00,0x7e]
+v_exp_f32 v5, exec_lo
+// CHECK: [0x7e,0x40,0x0a,0x7e]
 
-v_exp_f32 v0, 0
-// CHECK: [0x80,0x40,0x00,0x7e]
+v_exp_f32 v5, exec_hi
+// CHECK: [0x7f,0x40,0x0a,0x7e]
 
-v_exp_f32 v0, -1
-// CHECK: [0xc1,0x40,0x00,0x7e]
+v_exp_f32 v5, 0
+// CHECK: [0x80,0x40,0x0a,0x7e]
 
-v_exp_f32 v0, 0.5
-// CHECK: [0xf0,0x40,0x00,0x7e]
+v_exp_f32 v5, -1
+// CHECK: [0xc1,0x40,0x0a,0x7e]
 
-v_exp_f32 v0, -4.0
-// CHECK: [0xf7,0x40,0x00,0x7e]
+v_exp_f32 v5, 0.5
+// CHECK: [0xf0,0x40,0x0a,0x7e]
 
-v_exp_f32 v0, 0xaf123456
-// CHECK: [0xff,0x40,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_exp_f32 v5, -4.0
+// CHECK: [0xf7,0x40,0x0a,0x7e]
 
-v_exp_f32 v0, 0x3f717273
-// CHECK: [0xff,0x40,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_exp_f32 v5, 0xaf123456
+// CHECK: [0xff,0x40,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_exp_f32 v0, v0
-// CHECK: [0x00,0x41,0x00,0x7e]
+v_exp_f32 v5, 0x3f717273
+// CHECK: [0xff,0x40,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_exp_f32 v0, v255
-// CHECK: [0xff,0x41,0x00,0x7e]
+v_exp_f32 v5, v1
+// CHECK: [0x01,0x41,0x0a,0x7e]
 
-v_exp_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x60,0xd1,0x00,0x00,0x00,0x00]
+v_exp_f32 v5, v255
+// CHECK: [0xff,0x41,0x0a,0x7e]
 
-v_exp_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x60,0xd1,0x00,0x00,0x00,0x00]
+v_exp_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x60,0xd1,0x01,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x60,0xd1,0x65,0x00,0x00,0x00]
+v_exp_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x60,0xd1,0x01,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x60,0xd1,0x66,0x00,0x00,0x00]
+v_exp_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x60,0xd1,0x65,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x60,0xd1,0x67,0x00,0x00,0x00]
+v_exp_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x60,0xd1,0x66,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x60,0xd1,0x6a,0x00,0x00,0x00]
+v_exp_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x60,0xd1,0x67,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x60,0xd1,0x6b,0x00,0x00,0x00]
+v_exp_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x60,0xd1,0x6a,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x60,0xd1,0x6c,0x00,0x00,0x00]
+v_exp_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x60,0xd1,0x6b,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x60,0xd1,0x6d,0x00,0x00,0x00]
+v_exp_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x60,0xd1,0x6c,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x60,0xd1,0x6e,0x00,0x00,0x00]
+v_exp_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x60,0xd1,0x6d,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x60,0xd1,0x6f,0x00,0x00,0x00]
+v_exp_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x60,0xd1,0x6e,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x60,0xd1,0x7b,0x00,0x00,0x00]
+v_exp_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x60,0xd1,0x6f,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x60,0xd1,0x7c,0x00,0x00,0x00]
+v_exp_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x60,0xd1,0x7b,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x60,0xd1,0x7e,0x00,0x00,0x00]
+v_exp_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x60,0xd1,0x7c,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x60,0xd1,0x7f,0x00,0x00,0x00]
+v_exp_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x60,0xd1,0x7e,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x60,0xd1,0xfd,0x00,0x00,0x00]
+v_exp_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x60,0xd1,0x7f,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x60,0xd1,0x00,0x01,0x00,0x00]
+v_exp_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x60,0xd1,0xfd,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x60,0xd1,0xff,0x01,0x00,0x00]
+v_exp_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x60,0xd1,0x01,0x01,0x00,0x00]
 
-v_exp_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x60,0xd1,0x00,0x00,0x00,0x20]
+v_exp_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x60,0xd1,0xff,0x01,0x00,0x00]
 
-v_exp_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x60,0xd1,0x00,0x00,0x00,0x00]
+v_exp_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x60,0xd1,0x01,0x00,0x00,0x20]
 
-v_exp_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x60,0xd1,0x00,0x00,0x00,0x00]
+v_exp_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x60,0xd1,0x01,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x60,0xd1,0x00,0x00,0x00,0x08]
+v_exp_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x60,0xd1,0x01,0x00,0x00,0x00]
 
-v_exp_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x60,0xd1,0x00,0x00,0x00,0x10]
+v_exp_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x60,0xd1,0x01,0x00,0x00,0x08]
 
-v_exp_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x60,0xd1,0x00,0x00,0x00,0x18]
+v_exp_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x60,0xd1,0x01,0x00,0x00,0x10]
 
-v_log_f32 v0, s0
-// CHECK: [0x00,0x42,0x00,0x7e]
+v_exp_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x60,0xd1,0x01,0x00,0x00,0x18]
 
-v_log_f32 v255, s0
-// CHECK: [0x00,0x42,0xfe,0x7f]
+v_log_f32 v5, s1
+// CHECK: [0x01,0x42,0x0a,0x7e]
 
-v_log_f32 v0, s101
-// CHECK: [0x65,0x42,0x00,0x7e]
+v_log_f32 v255, s1
+// CHECK: [0x01,0x42,0xfe,0x7f]
 
-v_log_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x42,0x00,0x7e]
+v_log_f32 v5, s101
+// CHECK: [0x65,0x42,0x0a,0x7e]
 
-v_log_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x42,0x00,0x7e]
+v_log_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x42,0x0a,0x7e]
 
-v_log_f32 v0, vcc_lo
-// CHECK: [0x6a,0x42,0x00,0x7e]
+v_log_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x42,0x0a,0x7e]
 
-v_log_f32 v0, vcc_hi
-// CHECK: [0x6b,0x42,0x00,0x7e]
+v_log_f32 v5, vcc_lo
+// CHECK: [0x6a,0x42,0x0a,0x7e]
 
-v_log_f32 v0, tba_lo
-// CHECK: [0x6c,0x42,0x00,0x7e]
+v_log_f32 v5, vcc_hi
+// CHECK: [0x6b,0x42,0x0a,0x7e]
 
-v_log_f32 v0, tba_hi
-// CHECK: [0x6d,0x42,0x00,0x7e]
+v_log_f32 v5, tba_lo
+// CHECK: [0x6c,0x42,0x0a,0x7e]
 
-v_log_f32 v0, tma_lo
-// CHECK: [0x6e,0x42,0x00,0x7e]
+v_log_f32 v5, tba_hi
+// CHECK: [0x6d,0x42,0x0a,0x7e]
 
-v_log_f32 v0, tma_hi
-// CHECK: [0x6f,0x42,0x00,0x7e]
+v_log_f32 v5, tma_lo
+// CHECK: [0x6e,0x42,0x0a,0x7e]
 
-v_log_f32 v0, ttmp11
-// CHECK: [0x7b,0x42,0x00,0x7e]
+v_log_f32 v5, tma_hi
+// CHECK: [0x6f,0x42,0x0a,0x7e]
 
-v_log_f32 v0, m0
-// CHECK: [0x7c,0x42,0x00,0x7e]
+v_log_f32 v5, ttmp11
+// CHECK: [0x7b,0x42,0x0a,0x7e]
 
-v_log_f32 v0, exec_lo
-// CHECK: [0x7e,0x42,0x00,0x7e]
+v_log_f32 v5, m0
+// CHECK: [0x7c,0x42,0x0a,0x7e]
 
-v_log_f32 v0, exec_hi
-// CHECK: [0x7f,0x42,0x00,0x7e]
+v_log_f32 v5, exec_lo
+// CHECK: [0x7e,0x42,0x0a,0x7e]
 
-v_log_f32 v0, 0
-// CHECK: [0x80,0x42,0x00,0x7e]
+v_log_f32 v5, exec_hi
+// CHECK: [0x7f,0x42,0x0a,0x7e]
 
-v_log_f32 v0, -1
-// CHECK: [0xc1,0x42,0x00,0x7e]
+v_log_f32 v5, 0
+// CHECK: [0x80,0x42,0x0a,0x7e]
 
-v_log_f32 v0, 0.5
-// CHECK: [0xf0,0x42,0x00,0x7e]
+v_log_f32 v5, -1
+// CHECK: [0xc1,0x42,0x0a,0x7e]
 
-v_log_f32 v0, -4.0
-// CHECK: [0xf7,0x42,0x00,0x7e]
+v_log_f32 v5, 0.5
+// CHECK: [0xf0,0x42,0x0a,0x7e]
 
-v_log_f32 v0, 0xaf123456
-// CHECK: [0xff,0x42,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_log_f32 v5, -4.0
+// CHECK: [0xf7,0x42,0x0a,0x7e]
 
-v_log_f32 v0, 0x3f717273
-// CHECK: [0xff,0x42,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_log_f32 v5, 0xaf123456
+// CHECK: [0xff,0x42,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_log_f32 v0, v0
-// CHECK: [0x00,0x43,0x00,0x7e]
+v_log_f32 v5, 0x3f717273
+// CHECK: [0xff,0x42,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_log_f32 v0, v255
-// CHECK: [0xff,0x43,0x00,0x7e]
+v_log_f32 v5, v1
+// CHECK: [0x01,0x43,0x0a,0x7e]
 
-v_log_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x61,0xd1,0x00,0x00,0x00,0x00]
+v_log_f32 v5, v255
+// CHECK: [0xff,0x43,0x0a,0x7e]
 
-v_log_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x61,0xd1,0x00,0x00,0x00,0x00]
+v_log_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x61,0xd1,0x01,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x61,0xd1,0x65,0x00,0x00,0x00]
+v_log_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x61,0xd1,0x01,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x61,0xd1,0x66,0x00,0x00,0x00]
+v_log_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x61,0xd1,0x65,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x61,0xd1,0x67,0x00,0x00,0x00]
+v_log_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x61,0xd1,0x66,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x61,0xd1,0x6a,0x00,0x00,0x00]
+v_log_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x61,0xd1,0x67,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x61,0xd1,0x6b,0x00,0x00,0x00]
+v_log_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x61,0xd1,0x6a,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x61,0xd1,0x6c,0x00,0x00,0x00]
+v_log_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x61,0xd1,0x6b,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x61,0xd1,0x6d,0x00,0x00,0x00]
+v_log_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x61,0xd1,0x6c,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x61,0xd1,0x6e,0x00,0x00,0x00]
+v_log_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x61,0xd1,0x6d,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x61,0xd1,0x6f,0x00,0x00,0x00]
+v_log_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x61,0xd1,0x6e,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x61,0xd1,0x7b,0x00,0x00,0x00]
+v_log_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x61,0xd1,0x6f,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x61,0xd1,0x7c,0x00,0x00,0x00]
+v_log_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x61,0xd1,0x7b,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x61,0xd1,0x7e,0x00,0x00,0x00]
+v_log_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x61,0xd1,0x7c,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x61,0xd1,0x7f,0x00,0x00,0x00]
+v_log_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x61,0xd1,0x7e,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x61,0xd1,0xfd,0x00,0x00,0x00]
+v_log_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x61,0xd1,0x7f,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x61,0xd1,0x00,0x01,0x00,0x00]
+v_log_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x61,0xd1,0xfd,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x61,0xd1,0xff,0x01,0x00,0x00]
+v_log_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x61,0xd1,0x01,0x01,0x00,0x00]
 
-v_log_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x61,0xd1,0x00,0x00,0x00,0x20]
+v_log_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x61,0xd1,0xff,0x01,0x00,0x00]
 
-v_log_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x61,0xd1,0x00,0x00,0x00,0x00]
+v_log_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x61,0xd1,0x01,0x00,0x00,0x20]
 
-v_log_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x61,0xd1,0x00,0x00,0x00,0x00]
+v_log_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x61,0xd1,0x01,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x61,0xd1,0x00,0x00,0x00,0x08]
+v_log_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x61,0xd1,0x01,0x00,0x00,0x00]
 
-v_log_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x61,0xd1,0x00,0x00,0x00,0x10]
+v_log_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x61,0xd1,0x01,0x00,0x00,0x08]
 
-v_log_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x61,0xd1,0x00,0x00,0x00,0x18]
+v_log_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x61,0xd1,0x01,0x00,0x00,0x10]
 
-v_rcp_f32 v0, s0
-// CHECK: [0x00,0x44,0x00,0x7e]
+v_log_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x61,0xd1,0x01,0x00,0x00,0x18]
 
-v_rcp_f32 v255, s0
-// CHECK: [0x00,0x44,0xfe,0x7f]
+v_rcp_f32 v5, s1
+// CHECK: [0x01,0x44,0x0a,0x7e]
 
-v_rcp_f32 v0, s101
-// CHECK: [0x65,0x44,0x00,0x7e]
+v_rcp_f32 v255, s1
+// CHECK: [0x01,0x44,0xfe,0x7f]
 
-v_rcp_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x44,0x00,0x7e]
+v_rcp_f32 v5, s101
+// CHECK: [0x65,0x44,0x0a,0x7e]
 
-v_rcp_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x44,0x00,0x7e]
+v_rcp_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x44,0x0a,0x7e]
 
-v_rcp_f32 v0, vcc_lo
-// CHECK: [0x6a,0x44,0x00,0x7e]
+v_rcp_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x44,0x0a,0x7e]
 
-v_rcp_f32 v0, vcc_hi
-// CHECK: [0x6b,0x44,0x00,0x7e]
+v_rcp_f32 v5, vcc_lo
+// CHECK: [0x6a,0x44,0x0a,0x7e]
 
-v_rcp_f32 v0, tba_lo
-// CHECK: [0x6c,0x44,0x00,0x7e]
+v_rcp_f32 v5, vcc_hi
+// CHECK: [0x6b,0x44,0x0a,0x7e]
 
-v_rcp_f32 v0, tba_hi
-// CHECK: [0x6d,0x44,0x00,0x7e]
+v_rcp_f32 v5, tba_lo
+// CHECK: [0x6c,0x44,0x0a,0x7e]
 
-v_rcp_f32 v0, tma_lo
-// CHECK: [0x6e,0x44,0x00,0x7e]
+v_rcp_f32 v5, tba_hi
+// CHECK: [0x6d,0x44,0x0a,0x7e]
 
-v_rcp_f32 v0, tma_hi
-// CHECK: [0x6f,0x44,0x00,0x7e]
+v_rcp_f32 v5, tma_lo
+// CHECK: [0x6e,0x44,0x0a,0x7e]
 
-v_rcp_f32 v0, ttmp11
-// CHECK: [0x7b,0x44,0x00,0x7e]
+v_rcp_f32 v5, tma_hi
+// CHECK: [0x6f,0x44,0x0a,0x7e]
 
-v_rcp_f32 v0, m0
-// CHECK: [0x7c,0x44,0x00,0x7e]
+v_rcp_f32 v5, ttmp11
+// CHECK: [0x7b,0x44,0x0a,0x7e]
 
-v_rcp_f32 v0, exec_lo
-// CHECK: [0x7e,0x44,0x00,0x7e]
+v_rcp_f32 v5, m0
+// CHECK: [0x7c,0x44,0x0a,0x7e]
 
-v_rcp_f32 v0, exec_hi
-// CHECK: [0x7f,0x44,0x00,0x7e]
+v_rcp_f32 v5, exec_lo
+// CHECK: [0x7e,0x44,0x0a,0x7e]
 
-v_rcp_f32 v0, 0
-// CHECK: [0x80,0x44,0x00,0x7e]
+v_rcp_f32 v5, exec_hi
+// CHECK: [0x7f,0x44,0x0a,0x7e]
 
-v_rcp_f32 v0, -1
-// CHECK: [0xc1,0x44,0x00,0x7e]
+v_rcp_f32 v5, 0
+// CHECK: [0x80,0x44,0x0a,0x7e]
 
-v_rcp_f32 v0, 0.5
-// CHECK: [0xf0,0x44,0x00,0x7e]
+v_rcp_f32 v5, -1
+// CHECK: [0xc1,0x44,0x0a,0x7e]
 
-v_rcp_f32 v0, -4.0
-// CHECK: [0xf7,0x44,0x00,0x7e]
+v_rcp_f32 v5, 0.5
+// CHECK: [0xf0,0x44,0x0a,0x7e]
 
-v_rcp_f32 v0, 0xaf123456
-// CHECK: [0xff,0x44,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rcp_f32 v5, -4.0
+// CHECK: [0xf7,0x44,0x0a,0x7e]
 
-v_rcp_f32 v0, 0x3f717273
-// CHECK: [0xff,0x44,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rcp_f32 v5, 0xaf123456
+// CHECK: [0xff,0x44,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rcp_f32 v0, v0
-// CHECK: [0x00,0x45,0x00,0x7e]
+v_rcp_f32 v5, 0x3f717273
+// CHECK: [0xff,0x44,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rcp_f32 v0, v255
-// CHECK: [0xff,0x45,0x00,0x7e]
+v_rcp_f32 v5, v1
+// CHECK: [0x01,0x45,0x0a,0x7e]
 
-v_rcp_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x62,0xd1,0x00,0x00,0x00,0x00]
+v_rcp_f32 v5, v255
+// CHECK: [0xff,0x45,0x0a,0x7e]
 
-v_rcp_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x62,0xd1,0x00,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x62,0xd1,0x01,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x62,0xd1,0x65,0x00,0x00,0x00]
+v_rcp_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x62,0xd1,0x01,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x62,0xd1,0x66,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x62,0xd1,0x65,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x62,0xd1,0x67,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x62,0xd1,0x66,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x62,0xd1,0x6a,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x62,0xd1,0x67,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x62,0xd1,0x6b,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x62,0xd1,0x6a,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x62,0xd1,0x6c,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x62,0xd1,0x6b,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x62,0xd1,0x6d,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x62,0xd1,0x6c,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x62,0xd1,0x6e,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x62,0xd1,0x6d,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x62,0xd1,0x6f,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x62,0xd1,0x6e,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x62,0xd1,0x7b,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x62,0xd1,0x6f,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x62,0xd1,0x7c,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x62,0xd1,0x7b,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x62,0xd1,0x7e,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x62,0xd1,0x7c,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x62,0xd1,0x7f,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x62,0xd1,0x7e,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x62,0xd1,0xfd,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x62,0xd1,0x7f,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x62,0xd1,0x00,0x01,0x00,0x00]
+v_rcp_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x62,0xd1,0xfd,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x62,0xd1,0xff,0x01,0x00,0x00]
+v_rcp_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x62,0xd1,0x01,0x01,0x00,0x00]
 
-v_rcp_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x62,0xd1,0x00,0x00,0x00,0x20]
+v_rcp_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x62,0xd1,0xff,0x01,0x00,0x00]
 
-v_rcp_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x62,0xd1,0x00,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x62,0xd1,0x01,0x00,0x00,0x20]
 
-v_rcp_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x62,0xd1,0x00,0x00,0x00,0x00]
+v_rcp_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x62,0xd1,0x01,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x62,0xd1,0x00,0x00,0x00,0x08]
+v_rcp_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x62,0xd1,0x01,0x00,0x00,0x00]
 
-v_rcp_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x62,0xd1,0x00,0x00,0x00,0x10]
+v_rcp_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x62,0xd1,0x01,0x00,0x00,0x08]
 
-v_rcp_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x62,0xd1,0x00,0x00,0x00,0x18]
+v_rcp_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x62,0xd1,0x01,0x00,0x00,0x10]
 
-v_rcp_iflag_f32 v0, s0
-// CHECK: [0x00,0x46,0x00,0x7e]
+v_rcp_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x62,0xd1,0x01,0x00,0x00,0x18]
 
-v_rcp_iflag_f32 v255, s0
-// CHECK: [0x00,0x46,0xfe,0x7f]
+v_rcp_iflag_f32 v5, s1
+// CHECK: [0x01,0x46,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, s101
-// CHECK: [0x65,0x46,0x00,0x7e]
+v_rcp_iflag_f32 v255, s1
+// CHECK: [0x01,0x46,0xfe,0x7f]
 
-v_rcp_iflag_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x46,0x00,0x7e]
+v_rcp_iflag_f32 v5, s101
+// CHECK: [0x65,0x46,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x46,0x00,0x7e]
+v_rcp_iflag_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x46,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, vcc_lo
-// CHECK: [0x6a,0x46,0x00,0x7e]
+v_rcp_iflag_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x46,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, vcc_hi
-// CHECK: [0x6b,0x46,0x00,0x7e]
+v_rcp_iflag_f32 v5, vcc_lo
+// CHECK: [0x6a,0x46,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, tba_lo
-// CHECK: [0x6c,0x46,0x00,0x7e]
+v_rcp_iflag_f32 v5, vcc_hi
+// CHECK: [0x6b,0x46,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, tba_hi
-// CHECK: [0x6d,0x46,0x00,0x7e]
+v_rcp_iflag_f32 v5, tba_lo
+// CHECK: [0x6c,0x46,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, tma_lo
-// CHECK: [0x6e,0x46,0x00,0x7e]
+v_rcp_iflag_f32 v5, tba_hi
+// CHECK: [0x6d,0x46,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, tma_hi
-// CHECK: [0x6f,0x46,0x00,0x7e]
+v_rcp_iflag_f32 v5, tma_lo
+// CHECK: [0x6e,0x46,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, ttmp11
-// CHECK: [0x7b,0x46,0x00,0x7e]
+v_rcp_iflag_f32 v5, tma_hi
+// CHECK: [0x6f,0x46,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, m0
-// CHECK: [0x7c,0x46,0x00,0x7e]
+v_rcp_iflag_f32 v5, ttmp11
+// CHECK: [0x7b,0x46,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, exec_lo
-// CHECK: [0x7e,0x46,0x00,0x7e]
+v_rcp_iflag_f32 v5, m0
+// CHECK: [0x7c,0x46,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, exec_hi
-// CHECK: [0x7f,0x46,0x00,0x7e]
+v_rcp_iflag_f32 v5, exec_lo
+// CHECK: [0x7e,0x46,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, 0
-// CHECK: [0x80,0x46,0x00,0x7e]
+v_rcp_iflag_f32 v5, exec_hi
+// CHECK: [0x7f,0x46,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, -1
-// CHECK: [0xc1,0x46,0x00,0x7e]
+v_rcp_iflag_f32 v5, 0
+// CHECK: [0x80,0x46,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, 0.5
-// CHECK: [0xf0,0x46,0x00,0x7e]
+v_rcp_iflag_f32 v5, -1
+// CHECK: [0xc1,0x46,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, -4.0
-// CHECK: [0xf7,0x46,0x00,0x7e]
+v_rcp_iflag_f32 v5, 0.5
+// CHECK: [0xf0,0x46,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, 0xaf123456
-// CHECK: [0xff,0x46,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rcp_iflag_f32 v5, -4.0
+// CHECK: [0xf7,0x46,0x0a,0x7e]
 
-v_rcp_iflag_f32 v0, 0x3f717273
-// CHECK: [0xff,0x46,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rcp_iflag_f32 v5, 0xaf123456
+// CHECK: [0xff,0x46,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rcp_iflag_f32 v0, v0
-// CHECK: [0x00,0x47,0x00,0x7e]
+v_rcp_iflag_f32 v5, 0x3f717273
+// CHECK: [0xff,0x46,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rcp_iflag_f32 v0, v255
-// CHECK: [0xff,0x47,0x00,0x7e]
+v_rcp_iflag_f32 v5, v1
+// CHECK: [0x01,0x47,0x0a,0x7e]
 
-v_rcp_iflag_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x63,0xd1,0x00,0x00,0x00,0x00]
+v_rcp_iflag_f32 v5, v255
+// CHECK: [0xff,0x47,0x0a,0x7e]
 
-v_rcp_iflag_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x63,0xd1,0x00,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x63,0xd1,0x01,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x63,0xd1,0x65,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x63,0xd1,0x01,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x63,0xd1,0x66,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x63,0xd1,0x65,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x63,0xd1,0x67,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x63,0xd1,0x66,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x63,0xd1,0x6a,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x63,0xd1,0x67,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x63,0xd1,0x6b,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x63,0xd1,0x6a,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x63,0xd1,0x6c,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x63,0xd1,0x6b,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x63,0xd1,0x6d,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x63,0xd1,0x6c,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x63,0xd1,0x6e,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x63,0xd1,0x6d,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x63,0xd1,0x6f,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x63,0xd1,0x6e,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x63,0xd1,0x7b,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x63,0xd1,0x6f,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x63,0xd1,0x7c,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x63,0xd1,0x7b,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x63,0xd1,0x7e,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x63,0xd1,0x7c,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x63,0xd1,0x7f,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x63,0xd1,0x7e,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x63,0xd1,0xfd,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x63,0xd1,0x7f,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x63,0xd1,0x00,0x01,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x63,0xd1,0xfd,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x63,0xd1,0xff,0x01,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x63,0xd1,0x01,0x01,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x63,0xd1,0x00,0x00,0x00,0x20]
+v_rcp_iflag_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x63,0xd1,0xff,0x01,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x63,0xd1,0x00,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x63,0xd1,0x01,0x00,0x00,0x20]
 
-v_rcp_iflag_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x63,0xd1,0x00,0x00,0x00,0x00]
+v_rcp_iflag_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x63,0xd1,0x01,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x63,0xd1,0x00,0x00,0x00,0x08]
+v_rcp_iflag_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x63,0xd1,0x01,0x00,0x00,0x00]
 
-v_rcp_iflag_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x63,0xd1,0x00,0x00,0x00,0x10]
+v_rcp_iflag_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x63,0xd1,0x01,0x00,0x00,0x08]
 
-v_rcp_iflag_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x63,0xd1,0x00,0x00,0x00,0x18]
+v_rcp_iflag_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x63,0xd1,0x01,0x00,0x00,0x10]
 
-v_rsq_f32 v0, s0
-// CHECK: [0x00,0x48,0x00,0x7e]
+v_rcp_iflag_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x63,0xd1,0x01,0x00,0x00,0x18]
 
-v_rsq_f32 v255, s0
-// CHECK: [0x00,0x48,0xfe,0x7f]
+v_rsq_f32 v5, s1
+// CHECK: [0x01,0x48,0x0a,0x7e]
 
-v_rsq_f32 v0, s101
-// CHECK: [0x65,0x48,0x00,0x7e]
+v_rsq_f32 v255, s1
+// CHECK: [0x01,0x48,0xfe,0x7f]
 
-v_rsq_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x48,0x00,0x7e]
+v_rsq_f32 v5, s101
+// CHECK: [0x65,0x48,0x0a,0x7e]
 
-v_rsq_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x48,0x00,0x7e]
+v_rsq_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x48,0x0a,0x7e]
 
-v_rsq_f32 v0, vcc_lo
-// CHECK: [0x6a,0x48,0x00,0x7e]
+v_rsq_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x48,0x0a,0x7e]
 
-v_rsq_f32 v0, vcc_hi
-// CHECK: [0x6b,0x48,0x00,0x7e]
+v_rsq_f32 v5, vcc_lo
+// CHECK: [0x6a,0x48,0x0a,0x7e]
 
-v_rsq_f32 v0, tba_lo
-// CHECK: [0x6c,0x48,0x00,0x7e]
+v_rsq_f32 v5, vcc_hi
+// CHECK: [0x6b,0x48,0x0a,0x7e]
 
-v_rsq_f32 v0, tba_hi
-// CHECK: [0x6d,0x48,0x00,0x7e]
+v_rsq_f32 v5, tba_lo
+// CHECK: [0x6c,0x48,0x0a,0x7e]
 
-v_rsq_f32 v0, tma_lo
-// CHECK: [0x6e,0x48,0x00,0x7e]
+v_rsq_f32 v5, tba_hi
+// CHECK: [0x6d,0x48,0x0a,0x7e]
 
-v_rsq_f32 v0, tma_hi
-// CHECK: [0x6f,0x48,0x00,0x7e]
+v_rsq_f32 v5, tma_lo
+// CHECK: [0x6e,0x48,0x0a,0x7e]
 
-v_rsq_f32 v0, ttmp11
-// CHECK: [0x7b,0x48,0x00,0x7e]
+v_rsq_f32 v5, tma_hi
+// CHECK: [0x6f,0x48,0x0a,0x7e]
 
-v_rsq_f32 v0, m0
-// CHECK: [0x7c,0x48,0x00,0x7e]
+v_rsq_f32 v5, ttmp11
+// CHECK: [0x7b,0x48,0x0a,0x7e]
 
-v_rsq_f32 v0, exec_lo
-// CHECK: [0x7e,0x48,0x00,0x7e]
+v_rsq_f32 v5, m0
+// CHECK: [0x7c,0x48,0x0a,0x7e]
 
-v_rsq_f32 v0, exec_hi
-// CHECK: [0x7f,0x48,0x00,0x7e]
+v_rsq_f32 v5, exec_lo
+// CHECK: [0x7e,0x48,0x0a,0x7e]
 
-v_rsq_f32 v0, 0
-// CHECK: [0x80,0x48,0x00,0x7e]
+v_rsq_f32 v5, exec_hi
+// CHECK: [0x7f,0x48,0x0a,0x7e]
 
-v_rsq_f32 v0, -1
-// CHECK: [0xc1,0x48,0x00,0x7e]
+v_rsq_f32 v5, 0
+// CHECK: [0x80,0x48,0x0a,0x7e]
 
-v_rsq_f32 v0, 0.5
-// CHECK: [0xf0,0x48,0x00,0x7e]
+v_rsq_f32 v5, -1
+// CHECK: [0xc1,0x48,0x0a,0x7e]
 
-v_rsq_f32 v0, -4.0
-// CHECK: [0xf7,0x48,0x00,0x7e]
+v_rsq_f32 v5, 0.5
+// CHECK: [0xf0,0x48,0x0a,0x7e]
 
-v_rsq_f32 v0, 0xaf123456
-// CHECK: [0xff,0x48,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rsq_f32 v5, -4.0
+// CHECK: [0xf7,0x48,0x0a,0x7e]
 
-v_rsq_f32 v0, 0x3f717273
-// CHECK: [0xff,0x48,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rsq_f32 v5, 0xaf123456
+// CHECK: [0xff,0x48,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rsq_f32 v0, v0
-// CHECK: [0x00,0x49,0x00,0x7e]
+v_rsq_f32 v5, 0x3f717273
+// CHECK: [0xff,0x48,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rsq_f32 v0, v255
-// CHECK: [0xff,0x49,0x00,0x7e]
+v_rsq_f32 v5, v1
+// CHECK: [0x01,0x49,0x0a,0x7e]
 
-v_rsq_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x64,0xd1,0x00,0x00,0x00,0x00]
+v_rsq_f32 v5, v255
+// CHECK: [0xff,0x49,0x0a,0x7e]
 
-v_rsq_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x64,0xd1,0x00,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x64,0xd1,0x01,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x64,0xd1,0x65,0x00,0x00,0x00]
+v_rsq_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x64,0xd1,0x01,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x64,0xd1,0x66,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x64,0xd1,0x65,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x64,0xd1,0x67,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x64,0xd1,0x66,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x64,0xd1,0x6a,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x64,0xd1,0x67,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x64,0xd1,0x6b,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x64,0xd1,0x6a,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x64,0xd1,0x6c,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x64,0xd1,0x6b,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x64,0xd1,0x6d,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x64,0xd1,0x6c,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x64,0xd1,0x6e,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x64,0xd1,0x6d,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x64,0xd1,0x6f,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x64,0xd1,0x6e,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x64,0xd1,0x7b,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x64,0xd1,0x6f,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x64,0xd1,0x7c,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x64,0xd1,0x7b,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x64,0xd1,0x7e,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x64,0xd1,0x7c,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x64,0xd1,0x7f,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x64,0xd1,0x7e,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x64,0xd1,0xfd,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x64,0xd1,0x7f,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x64,0xd1,0x00,0x01,0x00,0x00]
+v_rsq_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x64,0xd1,0xfd,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x64,0xd1,0xff,0x01,0x00,0x00]
+v_rsq_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x64,0xd1,0x01,0x01,0x00,0x00]
 
-v_rsq_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x64,0xd1,0x00,0x00,0x00,0x20]
+v_rsq_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x64,0xd1,0xff,0x01,0x00,0x00]
 
-v_rsq_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x64,0xd1,0x00,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x64,0xd1,0x01,0x00,0x00,0x20]
 
-v_rsq_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x64,0xd1,0x00,0x00,0x00,0x00]
+v_rsq_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x64,0xd1,0x01,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x64,0xd1,0x00,0x00,0x00,0x08]
+v_rsq_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x64,0xd1,0x01,0x00,0x00,0x00]
 
-v_rsq_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x64,0xd1,0x00,0x00,0x00,0x10]
+v_rsq_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x64,0xd1,0x01,0x00,0x00,0x08]
 
-v_rsq_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x64,0xd1,0x00,0x00,0x00,0x18]
+v_rsq_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x64,0xd1,0x01,0x00,0x00,0x10]
 
-v_rcp_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x4a,0x00,0x7e]
+v_rsq_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x64,0xd1,0x01,0x00,0x00,0x18]
 
-v_rcp_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x4a,0xfc,0x7f]
+v_rcp_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x4a,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x4a,0x00,0x7e]
+v_rcp_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x4a,0xfc,0x7f]
 
-v_rcp_f64 v[0:1], s[100:101]
-// CHECK: [0x64,0x4a,0x00,0x7e]
+v_rcp_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x4a,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], flat_scratch
-// CHECK: [0x66,0x4a,0x00,0x7e]
+v_rcp_f64 v[5:6], s[100:101]
+// CHECK: [0x64,0x4a,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], vcc
-// CHECK: [0x6a,0x4a,0x00,0x7e]
+v_rcp_f64 v[5:6], flat_scratch
+// CHECK: [0x66,0x4a,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], tba
-// CHECK: [0x6c,0x4a,0x00,0x7e]
+v_rcp_f64 v[5:6], vcc
+// CHECK: [0x6a,0x4a,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], tma
-// CHECK: [0x6e,0x4a,0x00,0x7e]
+v_rcp_f64 v[5:6], tba
+// CHECK: [0x6c,0x4a,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x4a,0x00,0x7e]
+v_rcp_f64 v[5:6], tma
+// CHECK: [0x6e,0x4a,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], exec
-// CHECK: [0x7e,0x4a,0x00,0x7e]
+v_rcp_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x4a,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], 0
-// CHECK: [0x80,0x4a,0x00,0x7e]
+v_rcp_f64 v[5:6], exec
+// CHECK: [0x7e,0x4a,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], -1
-// CHECK: [0xc1,0x4a,0x00,0x7e]
+v_rcp_f64 v[5:6], 0
+// CHECK: [0x80,0x4a,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x4a,0x00,0x7e]
+v_rcp_f64 v[5:6], -1
+// CHECK: [0xc1,0x4a,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x4a,0x00,0x7e]
+v_rcp_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x4a,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x4a,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rcp_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x4a,0x0a,0x7e]
 
-v_rcp_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x4a,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rcp_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x4a,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rcp_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x4b,0x00,0x7e]
+v_rcp_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x4a,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rcp_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x4b,0x00,0x7e]
+v_rcp_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x4b,0x0a,0x7e]
 
-v_rcp_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x65,0xd1,0x00,0x00,0x00,0x00]
+v_rcp_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x4b,0x0a,0x7e]
 
-v_rcp_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x65,0xd1,0x00,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x65,0xd1,0x02,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x65,0xd1,0x02,0x00,0x00,0x00]
+v_rcp_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x65,0xd1,0x02,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], s[100:101]
-// CHECK: [0x00,0x00,0x65,0xd1,0x64,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x65,0xd1,0x04,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x65,0xd1,0x66,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], s[100:101]
+// CHECK: [0x05,0x00,0x65,0xd1,0x64,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x65,0xd1,0x6a,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x65,0xd1,0x66,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x65,0xd1,0x6c,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x65,0xd1,0x6a,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x65,0xd1,0x6e,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x65,0xd1,0x6c,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x65,0xd1,0x7a,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x65,0xd1,0x6e,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x65,0xd1,0x7e,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x65,0xd1,0x7a,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x65,0xd1,0xfd,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x65,0xd1,0x7e,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x65,0xd1,0x00,0x01,0x00,0x00]
+v_rcp_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x65,0xd1,0xfd,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x65,0xd1,0xfe,0x01,0x00,0x00]
+v_rcp_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x65,0xd1,0x01,0x01,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x65,0xd1,0x00,0x00,0x00,0x20]
+v_rcp_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x65,0xd1,0xfe,0x01,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], |s[0:1]|
-// CHECK: [0x00,0x01,0x65,0xd1,0x00,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x65,0xd1,0x02,0x00,0x00,0x20]
 
-v_rcp_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x65,0xd1,0x00,0x00,0x00,0x00]
+v_rcp_f64_e64 v[5:6], |s[2:3]|
+// CHECK: [0x05,0x01,0x65,0xd1,0x02,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x65,0xd1,0x00,0x00,0x00,0x08]
+v_rcp_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x80,0x65,0xd1,0x02,0x00,0x00,0x00]
 
-v_rcp_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x65,0xd1,0x00,0x00,0x00,0x10]
+v_rcp_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x65,0xd1,0x02,0x00,0x00,0x08]
 
-v_rcp_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x65,0xd1,0x00,0x00,0x00,0x18]
+v_rcp_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x65,0xd1,0x02,0x00,0x00,0x10]
 
-v_rsq_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x4c,0x00,0x7e]
+v_rcp_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x65,0xd1,0x02,0x00,0x00,0x18]
 
-v_rsq_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x4c,0xfc,0x7f]
+v_rsq_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x4c,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x4c,0x00,0x7e]
+v_rsq_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x4c,0xfc,0x7f]
 
-v_rsq_f64 v[0:1], s[100:101]
-// CHECK: [0x64,0x4c,0x00,0x7e]
+v_rsq_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x4c,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], flat_scratch
-// CHECK: [0x66,0x4c,0x00,0x7e]
+v_rsq_f64 v[5:6], s[100:101]
+// CHECK: [0x64,0x4c,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], vcc
-// CHECK: [0x6a,0x4c,0x00,0x7e]
+v_rsq_f64 v[5:6], flat_scratch
+// CHECK: [0x66,0x4c,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], tba
-// CHECK: [0x6c,0x4c,0x00,0x7e]
+v_rsq_f64 v[5:6], vcc
+// CHECK: [0x6a,0x4c,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], tma
-// CHECK: [0x6e,0x4c,0x00,0x7e]
+v_rsq_f64 v[5:6], tba
+// CHECK: [0x6c,0x4c,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x4c,0x00,0x7e]
+v_rsq_f64 v[5:6], tma
+// CHECK: [0x6e,0x4c,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], exec
-// CHECK: [0x7e,0x4c,0x00,0x7e]
+v_rsq_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x4c,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], 0
-// CHECK: [0x80,0x4c,0x00,0x7e]
+v_rsq_f64 v[5:6], exec
+// CHECK: [0x7e,0x4c,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], -1
-// CHECK: [0xc1,0x4c,0x00,0x7e]
+v_rsq_f64 v[5:6], 0
+// CHECK: [0x80,0x4c,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x4c,0x00,0x7e]
+v_rsq_f64 v[5:6], -1
+// CHECK: [0xc1,0x4c,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x4c,0x00,0x7e]
+v_rsq_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x4c,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x4c,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_rsq_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x4c,0x0a,0x7e]
 
-v_rsq_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x4c,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_rsq_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x4c,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_rsq_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x4d,0x00,0x7e]
+v_rsq_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x4c,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_rsq_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x4d,0x00,0x7e]
+v_rsq_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x4d,0x0a,0x7e]
 
-v_rsq_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd1,0x00,0x00,0x00,0x00]
+v_rsq_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x4d,0x0a,0x7e]
 
-v_rsq_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x66,0xd1,0x00,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x66,0xd1,0x02,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x66,0xd1,0x02,0x00,0x00,0x00]
+v_rsq_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x66,0xd1,0x02,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], s[100:101]
-// CHECK: [0x00,0x00,0x66,0xd1,0x64,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x66,0xd1,0x04,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x66,0xd1,0x66,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], s[100:101]
+// CHECK: [0x05,0x00,0x66,0xd1,0x64,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x66,0xd1,0x6a,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x66,0xd1,0x66,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x66,0xd1,0x6c,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x66,0xd1,0x6a,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x66,0xd1,0x6e,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x66,0xd1,0x6c,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x66,0xd1,0x7a,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x66,0xd1,0x6e,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x66,0xd1,0x7e,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x66,0xd1,0x7a,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x66,0xd1,0xfd,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x66,0xd1,0x7e,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x66,0xd1,0x00,0x01,0x00,0x00]
+v_rsq_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x66,0xd1,0xfd,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x66,0xd1,0xfe,0x01,0x00,0x00]
+v_rsq_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x66,0xd1,0x01,0x01,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd1,0x00,0x00,0x00,0x20]
+v_rsq_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x66,0xd1,0xfe,0x01,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], |s[0:1]|
-// CHECK: [0x00,0x01,0x66,0xd1,0x00,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x66,0xd1,0x02,0x00,0x00,0x20]
 
-v_rsq_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x66,0xd1,0x00,0x00,0x00,0x00]
+v_rsq_f64_e64 v[5:6], |s[2:3]|
+// CHECK: [0x05,0x01,0x66,0xd1,0x02,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x66,0xd1,0x00,0x00,0x00,0x08]
+v_rsq_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x80,0x66,0xd1,0x02,0x00,0x00,0x00]
 
-v_rsq_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x66,0xd1,0x00,0x00,0x00,0x10]
+v_rsq_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x66,0xd1,0x02,0x00,0x00,0x08]
 
-v_rsq_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x66,0xd1,0x00,0x00,0x00,0x18]
+v_rsq_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x66,0xd1,0x02,0x00,0x00,0x10]
 
-v_sqrt_f32 v0, s0
-// CHECK: [0x00,0x4e,0x00,0x7e]
+v_rsq_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x66,0xd1,0x02,0x00,0x00,0x18]
 
-v_sqrt_f32 v255, s0
-// CHECK: [0x00,0x4e,0xfe,0x7f]
+v_sqrt_f32 v5, s1
+// CHECK: [0x01,0x4e,0x0a,0x7e]
 
-v_sqrt_f32 v0, s101
-// CHECK: [0x65,0x4e,0x00,0x7e]
+v_sqrt_f32 v255, s1
+// CHECK: [0x01,0x4e,0xfe,0x7f]
 
-v_sqrt_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x4e,0x00,0x7e]
+v_sqrt_f32 v5, s101
+// CHECK: [0x65,0x4e,0x0a,0x7e]
 
-v_sqrt_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x4e,0x00,0x7e]
+v_sqrt_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x4e,0x0a,0x7e]
 
-v_sqrt_f32 v0, vcc_lo
-// CHECK: [0x6a,0x4e,0x00,0x7e]
+v_sqrt_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x4e,0x0a,0x7e]
 
-v_sqrt_f32 v0, vcc_hi
-// CHECK: [0x6b,0x4e,0x00,0x7e]
+v_sqrt_f32 v5, vcc_lo
+// CHECK: [0x6a,0x4e,0x0a,0x7e]
 
-v_sqrt_f32 v0, tba_lo
-// CHECK: [0x6c,0x4e,0x00,0x7e]
+v_sqrt_f32 v5, vcc_hi
+// CHECK: [0x6b,0x4e,0x0a,0x7e]
 
-v_sqrt_f32 v0, tba_hi
-// CHECK: [0x6d,0x4e,0x00,0x7e]
+v_sqrt_f32 v5, tba_lo
+// CHECK: [0x6c,0x4e,0x0a,0x7e]
 
-v_sqrt_f32 v0, tma_lo
-// CHECK: [0x6e,0x4e,0x00,0x7e]
+v_sqrt_f32 v5, tba_hi
+// CHECK: [0x6d,0x4e,0x0a,0x7e]
 
-v_sqrt_f32 v0, tma_hi
-// CHECK: [0x6f,0x4e,0x00,0x7e]
+v_sqrt_f32 v5, tma_lo
+// CHECK: [0x6e,0x4e,0x0a,0x7e]
 
-v_sqrt_f32 v0, ttmp11
-// CHECK: [0x7b,0x4e,0x00,0x7e]
+v_sqrt_f32 v5, tma_hi
+// CHECK: [0x6f,0x4e,0x0a,0x7e]
 
-v_sqrt_f32 v0, m0
-// CHECK: [0x7c,0x4e,0x00,0x7e]
+v_sqrt_f32 v5, ttmp11
+// CHECK: [0x7b,0x4e,0x0a,0x7e]
 
-v_sqrt_f32 v0, exec_lo
-// CHECK: [0x7e,0x4e,0x00,0x7e]
+v_sqrt_f32 v5, m0
+// CHECK: [0x7c,0x4e,0x0a,0x7e]
 
-v_sqrt_f32 v0, exec_hi
-// CHECK: [0x7f,0x4e,0x00,0x7e]
+v_sqrt_f32 v5, exec_lo
+// CHECK: [0x7e,0x4e,0x0a,0x7e]
 
-v_sqrt_f32 v0, 0
-// CHECK: [0x80,0x4e,0x00,0x7e]
+v_sqrt_f32 v5, exec_hi
+// CHECK: [0x7f,0x4e,0x0a,0x7e]
 
-v_sqrt_f32 v0, -1
-// CHECK: [0xc1,0x4e,0x00,0x7e]
+v_sqrt_f32 v5, 0
+// CHECK: [0x80,0x4e,0x0a,0x7e]
 
-v_sqrt_f32 v0, 0.5
-// CHECK: [0xf0,0x4e,0x00,0x7e]
+v_sqrt_f32 v5, -1
+// CHECK: [0xc1,0x4e,0x0a,0x7e]
 
-v_sqrt_f32 v0, -4.0
-// CHECK: [0xf7,0x4e,0x00,0x7e]
+v_sqrt_f32 v5, 0.5
+// CHECK: [0xf0,0x4e,0x0a,0x7e]
 
-v_sqrt_f32 v0, 0xaf123456
-// CHECK: [0xff,0x4e,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_sqrt_f32 v5, -4.0
+// CHECK: [0xf7,0x4e,0x0a,0x7e]
 
-v_sqrt_f32 v0, 0x3f717273
-// CHECK: [0xff,0x4e,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_sqrt_f32 v5, 0xaf123456
+// CHECK: [0xff,0x4e,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_sqrt_f32 v0, v0
-// CHECK: [0x00,0x4f,0x00,0x7e]
+v_sqrt_f32 v5, 0x3f717273
+// CHECK: [0xff,0x4e,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_sqrt_f32 v0, v255
-// CHECK: [0xff,0x4f,0x00,0x7e]
+v_sqrt_f32 v5, v1
+// CHECK: [0x01,0x4f,0x0a,0x7e]
 
-v_sqrt_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x67,0xd1,0x00,0x00,0x00,0x00]
+v_sqrt_f32 v5, v255
+// CHECK: [0xff,0x4f,0x0a,0x7e]
 
-v_sqrt_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x67,0xd1,0x00,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x67,0xd1,0x01,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x67,0xd1,0x65,0x00,0x00,0x00]
+v_sqrt_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x67,0xd1,0x01,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x67,0xd1,0x66,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x67,0xd1,0x65,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x67,0xd1,0x67,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x67,0xd1,0x66,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x67,0xd1,0x6a,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x67,0xd1,0x67,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x67,0xd1,0x6b,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x67,0xd1,0x6a,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x67,0xd1,0x6c,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x67,0xd1,0x6b,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x67,0xd1,0x6d,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x67,0xd1,0x6c,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x67,0xd1,0x6e,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x67,0xd1,0x6d,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x67,0xd1,0x6f,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x67,0xd1,0x6e,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x67,0xd1,0x7b,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x67,0xd1,0x6f,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x67,0xd1,0x7c,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x67,0xd1,0x7b,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x67,0xd1,0x7e,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x67,0xd1,0x7c,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x67,0xd1,0x7f,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x67,0xd1,0x7e,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x67,0xd1,0xfd,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x67,0xd1,0x7f,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x67,0xd1,0x00,0x01,0x00,0x00]
+v_sqrt_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x67,0xd1,0xfd,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x67,0xd1,0xff,0x01,0x00,0x00]
+v_sqrt_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x67,0xd1,0x01,0x01,0x00,0x00]
 
-v_sqrt_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x67,0xd1,0x00,0x00,0x00,0x20]
+v_sqrt_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x67,0xd1,0xff,0x01,0x00,0x00]
 
-v_sqrt_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x67,0xd1,0x00,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x67,0xd1,0x01,0x00,0x00,0x20]
 
-v_sqrt_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x67,0xd1,0x00,0x00,0x00,0x00]
+v_sqrt_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x67,0xd1,0x01,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x67,0xd1,0x00,0x00,0x00,0x08]
+v_sqrt_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x67,0xd1,0x01,0x00,0x00,0x00]
 
-v_sqrt_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x67,0xd1,0x00,0x00,0x00,0x10]
+v_sqrt_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x67,0xd1,0x01,0x00,0x00,0x08]
 
-v_sqrt_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x67,0xd1,0x00,0x00,0x00,0x18]
+v_sqrt_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x67,0xd1,0x01,0x00,0x00,0x10]
 
-v_sqrt_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x50,0x00,0x7e]
+v_sqrt_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x67,0xd1,0x01,0x00,0x00,0x18]
 
-v_sqrt_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x50,0xfc,0x7f]
+v_sqrt_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x50,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x50,0x00,0x7e]
+v_sqrt_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x50,0xfc,0x7f]
 
-v_sqrt_f64 v[0:1], s[100:101]
-// CHECK: [0x64,0x50,0x00,0x7e]
+v_sqrt_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x50,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], flat_scratch
-// CHECK: [0x66,0x50,0x00,0x7e]
+v_sqrt_f64 v[5:6], s[100:101]
+// CHECK: [0x64,0x50,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], vcc
-// CHECK: [0x6a,0x50,0x00,0x7e]
+v_sqrt_f64 v[5:6], flat_scratch
+// CHECK: [0x66,0x50,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], tba
-// CHECK: [0x6c,0x50,0x00,0x7e]
+v_sqrt_f64 v[5:6], vcc
+// CHECK: [0x6a,0x50,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], tma
-// CHECK: [0x6e,0x50,0x00,0x7e]
+v_sqrt_f64 v[5:6], tba
+// CHECK: [0x6c,0x50,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x50,0x00,0x7e]
+v_sqrt_f64 v[5:6], tma
+// CHECK: [0x6e,0x50,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], exec
-// CHECK: [0x7e,0x50,0x00,0x7e]
+v_sqrt_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x50,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], 0
-// CHECK: [0x80,0x50,0x00,0x7e]
+v_sqrt_f64 v[5:6], exec
+// CHECK: [0x7e,0x50,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], -1
-// CHECK: [0xc1,0x50,0x00,0x7e]
+v_sqrt_f64 v[5:6], 0
+// CHECK: [0x80,0x50,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x50,0x00,0x7e]
+v_sqrt_f64 v[5:6], -1
+// CHECK: [0xc1,0x50,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x50,0x00,0x7e]
+v_sqrt_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x50,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x50,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_sqrt_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x50,0x0a,0x7e]
 
-v_sqrt_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x50,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_sqrt_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x50,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_sqrt_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x51,0x00,0x7e]
+v_sqrt_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x50,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_sqrt_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x51,0x00,0x7e]
+v_sqrt_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x51,0x0a,0x7e]
 
-v_sqrt_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd1,0x00,0x00,0x00,0x00]
+v_sqrt_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x51,0x0a,0x7e]
 
-v_sqrt_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x68,0xd1,0x00,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x68,0xd1,0x02,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x68,0xd1,0x02,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x68,0xd1,0x02,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], s[100:101]
-// CHECK: [0x00,0x00,0x68,0xd1,0x64,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x68,0xd1,0x04,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x68,0xd1,0x66,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], s[100:101]
+// CHECK: [0x05,0x00,0x68,0xd1,0x64,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x68,0xd1,0x6a,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x68,0xd1,0x66,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x68,0xd1,0x6c,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x68,0xd1,0x6a,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x68,0xd1,0x6e,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x68,0xd1,0x6c,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x68,0xd1,0x7a,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x68,0xd1,0x6e,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x68,0xd1,0x7e,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x68,0xd1,0x7a,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x68,0xd1,0xfd,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x68,0xd1,0x7e,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x68,0xd1,0x00,0x01,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x68,0xd1,0xfd,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x68,0xd1,0xfe,0x01,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x68,0xd1,0x01,0x01,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd1,0x00,0x00,0x00,0x20]
+v_sqrt_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x68,0xd1,0xfe,0x01,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], |s[0:1]|
-// CHECK: [0x00,0x01,0x68,0xd1,0x00,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x68,0xd1,0x02,0x00,0x00,0x20]
 
-v_sqrt_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x68,0xd1,0x00,0x00,0x00,0x00]
+v_sqrt_f64_e64 v[5:6], |s[2:3]|
+// CHECK: [0x05,0x01,0x68,0xd1,0x02,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x68,0xd1,0x00,0x00,0x00,0x08]
+v_sqrt_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x80,0x68,0xd1,0x02,0x00,0x00,0x00]
 
-v_sqrt_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x68,0xd1,0x00,0x00,0x00,0x10]
+v_sqrt_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x68,0xd1,0x02,0x00,0x00,0x08]
 
-v_sqrt_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x68,0xd1,0x00,0x00,0x00,0x18]
+v_sqrt_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x68,0xd1,0x02,0x00,0x00,0x10]
 
-v_sin_f32 v0, s0
-// CHECK: [0x00,0x52,0x00,0x7e]
+v_sqrt_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x68,0xd1,0x02,0x00,0x00,0x18]
 
-v_sin_f32 v255, s0
-// CHECK: [0x00,0x52,0xfe,0x7f]
+v_sin_f32 v5, s1
+// CHECK: [0x01,0x52,0x0a,0x7e]
 
-v_sin_f32 v0, s101
-// CHECK: [0x65,0x52,0x00,0x7e]
+v_sin_f32 v255, s1
+// CHECK: [0x01,0x52,0xfe,0x7f]
 
-v_sin_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x52,0x00,0x7e]
+v_sin_f32 v5, s101
+// CHECK: [0x65,0x52,0x0a,0x7e]
 
-v_sin_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x52,0x00,0x7e]
+v_sin_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x52,0x0a,0x7e]
 
-v_sin_f32 v0, vcc_lo
-// CHECK: [0x6a,0x52,0x00,0x7e]
+v_sin_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x52,0x0a,0x7e]
 
-v_sin_f32 v0, vcc_hi
-// CHECK: [0x6b,0x52,0x00,0x7e]
+v_sin_f32 v5, vcc_lo
+// CHECK: [0x6a,0x52,0x0a,0x7e]
 
-v_sin_f32 v0, tba_lo
-// CHECK: [0x6c,0x52,0x00,0x7e]
+v_sin_f32 v5, vcc_hi
+// CHECK: [0x6b,0x52,0x0a,0x7e]
 
-v_sin_f32 v0, tba_hi
-// CHECK: [0x6d,0x52,0x00,0x7e]
+v_sin_f32 v5, tba_lo
+// CHECK: [0x6c,0x52,0x0a,0x7e]
 
-v_sin_f32 v0, tma_lo
-// CHECK: [0x6e,0x52,0x00,0x7e]
+v_sin_f32 v5, tba_hi
+// CHECK: [0x6d,0x52,0x0a,0x7e]
 
-v_sin_f32 v0, tma_hi
-// CHECK: [0x6f,0x52,0x00,0x7e]
+v_sin_f32 v5, tma_lo
+// CHECK: [0x6e,0x52,0x0a,0x7e]
 
-v_sin_f32 v0, ttmp11
-// CHECK: [0x7b,0x52,0x00,0x7e]
+v_sin_f32 v5, tma_hi
+// CHECK: [0x6f,0x52,0x0a,0x7e]
 
-v_sin_f32 v0, m0
-// CHECK: [0x7c,0x52,0x00,0x7e]
+v_sin_f32 v5, ttmp11
+// CHECK: [0x7b,0x52,0x0a,0x7e]
 
-v_sin_f32 v0, exec_lo
-// CHECK: [0x7e,0x52,0x00,0x7e]
+v_sin_f32 v5, m0
+// CHECK: [0x7c,0x52,0x0a,0x7e]
 
-v_sin_f32 v0, exec_hi
-// CHECK: [0x7f,0x52,0x00,0x7e]
+v_sin_f32 v5, exec_lo
+// CHECK: [0x7e,0x52,0x0a,0x7e]
 
-v_sin_f32 v0, 0
-// CHECK: [0x80,0x52,0x00,0x7e]
+v_sin_f32 v5, exec_hi
+// CHECK: [0x7f,0x52,0x0a,0x7e]
 
-v_sin_f32 v0, -1
-// CHECK: [0xc1,0x52,0x00,0x7e]
+v_sin_f32 v5, 0
+// CHECK: [0x80,0x52,0x0a,0x7e]
 
-v_sin_f32 v0, 0.5
-// CHECK: [0xf0,0x52,0x00,0x7e]
+v_sin_f32 v5, -1
+// CHECK: [0xc1,0x52,0x0a,0x7e]
 
-v_sin_f32 v0, -4.0
-// CHECK: [0xf7,0x52,0x00,0x7e]
+v_sin_f32 v5, 0.5
+// CHECK: [0xf0,0x52,0x0a,0x7e]
 
-v_sin_f32 v0, 0xaf123456
-// CHECK: [0xff,0x52,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_sin_f32 v5, -4.0
+// CHECK: [0xf7,0x52,0x0a,0x7e]
 
-v_sin_f32 v0, 0x3f717273
-// CHECK: [0xff,0x52,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_sin_f32 v5, 0xaf123456
+// CHECK: [0xff,0x52,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_sin_f32 v0, v0
-// CHECK: [0x00,0x53,0x00,0x7e]
+v_sin_f32 v5, 0x3f717273
+// CHECK: [0xff,0x52,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_sin_f32 v0, v255
-// CHECK: [0xff,0x53,0x00,0x7e]
+v_sin_f32 v5, v1
+// CHECK: [0x01,0x53,0x0a,0x7e]
 
-v_sin_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x69,0xd1,0x00,0x00,0x00,0x00]
+v_sin_f32 v5, v255
+// CHECK: [0xff,0x53,0x0a,0x7e]
 
-v_sin_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x69,0xd1,0x00,0x00,0x00,0x00]
+v_sin_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x69,0xd1,0x01,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x69,0xd1,0x65,0x00,0x00,0x00]
+v_sin_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x69,0xd1,0x01,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x69,0xd1,0x66,0x00,0x00,0x00]
+v_sin_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x69,0xd1,0x65,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x69,0xd1,0x67,0x00,0x00,0x00]
+v_sin_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x69,0xd1,0x66,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x69,0xd1,0x6a,0x00,0x00,0x00]
+v_sin_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x69,0xd1,0x67,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x69,0xd1,0x6b,0x00,0x00,0x00]
+v_sin_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x69,0xd1,0x6a,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x69,0xd1,0x6c,0x00,0x00,0x00]
+v_sin_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x69,0xd1,0x6b,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x69,0xd1,0x6d,0x00,0x00,0x00]
+v_sin_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x69,0xd1,0x6c,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x69,0xd1,0x6e,0x00,0x00,0x00]
+v_sin_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x69,0xd1,0x6d,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x69,0xd1,0x6f,0x00,0x00,0x00]
+v_sin_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x69,0xd1,0x6e,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x69,0xd1,0x7b,0x00,0x00,0x00]
+v_sin_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x69,0xd1,0x6f,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x69,0xd1,0x7c,0x00,0x00,0x00]
+v_sin_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x69,0xd1,0x7b,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x69,0xd1,0x7e,0x00,0x00,0x00]
+v_sin_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x69,0xd1,0x7c,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x69,0xd1,0x7f,0x00,0x00,0x00]
+v_sin_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x69,0xd1,0x7e,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x69,0xd1,0xfd,0x00,0x00,0x00]
+v_sin_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x69,0xd1,0x7f,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x69,0xd1,0x00,0x01,0x00,0x00]
+v_sin_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x69,0xd1,0xfd,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x69,0xd1,0xff,0x01,0x00,0x00]
+v_sin_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x69,0xd1,0x01,0x01,0x00,0x00]
 
-v_sin_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x69,0xd1,0x00,0x00,0x00,0x20]
+v_sin_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x69,0xd1,0xff,0x01,0x00,0x00]
 
-v_sin_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x69,0xd1,0x00,0x00,0x00,0x00]
+v_sin_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x69,0xd1,0x01,0x00,0x00,0x20]
 
-v_sin_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x69,0xd1,0x00,0x00,0x00,0x00]
+v_sin_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x69,0xd1,0x01,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x69,0xd1,0x00,0x00,0x00,0x08]
+v_sin_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x69,0xd1,0x01,0x00,0x00,0x00]
 
-v_sin_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x69,0xd1,0x00,0x00,0x00,0x10]
+v_sin_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x69,0xd1,0x01,0x00,0x00,0x08]
 
-v_sin_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x69,0xd1,0x00,0x00,0x00,0x18]
+v_sin_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x69,0xd1,0x01,0x00,0x00,0x10]
 
-v_cos_f32 v0, s0
-// CHECK: [0x00,0x54,0x00,0x7e]
+v_sin_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x69,0xd1,0x01,0x00,0x00,0x18]
 
-v_cos_f32 v255, s0
-// CHECK: [0x00,0x54,0xfe,0x7f]
+v_cos_f32 v5, s1
+// CHECK: [0x01,0x54,0x0a,0x7e]
 
-v_cos_f32 v0, s101
-// CHECK: [0x65,0x54,0x00,0x7e]
+v_cos_f32 v255, s1
+// CHECK: [0x01,0x54,0xfe,0x7f]
 
-v_cos_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x54,0x00,0x7e]
+v_cos_f32 v5, s101
+// CHECK: [0x65,0x54,0x0a,0x7e]
 
-v_cos_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x54,0x00,0x7e]
+v_cos_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x54,0x0a,0x7e]
 
-v_cos_f32 v0, vcc_lo
-// CHECK: [0x6a,0x54,0x00,0x7e]
+v_cos_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x54,0x0a,0x7e]
 
-v_cos_f32 v0, vcc_hi
-// CHECK: [0x6b,0x54,0x00,0x7e]
+v_cos_f32 v5, vcc_lo
+// CHECK: [0x6a,0x54,0x0a,0x7e]
 
-v_cos_f32 v0, tba_lo
-// CHECK: [0x6c,0x54,0x00,0x7e]
+v_cos_f32 v5, vcc_hi
+// CHECK: [0x6b,0x54,0x0a,0x7e]
 
-v_cos_f32 v0, tba_hi
-// CHECK: [0x6d,0x54,0x00,0x7e]
+v_cos_f32 v5, tba_lo
+// CHECK: [0x6c,0x54,0x0a,0x7e]
 
-v_cos_f32 v0, tma_lo
-// CHECK: [0x6e,0x54,0x00,0x7e]
+v_cos_f32 v5, tba_hi
+// CHECK: [0x6d,0x54,0x0a,0x7e]
 
-v_cos_f32 v0, tma_hi
-// CHECK: [0x6f,0x54,0x00,0x7e]
+v_cos_f32 v5, tma_lo
+// CHECK: [0x6e,0x54,0x0a,0x7e]
 
-v_cos_f32 v0, ttmp11
-// CHECK: [0x7b,0x54,0x00,0x7e]
+v_cos_f32 v5, tma_hi
+// CHECK: [0x6f,0x54,0x0a,0x7e]
 
-v_cos_f32 v0, m0
-// CHECK: [0x7c,0x54,0x00,0x7e]
+v_cos_f32 v5, ttmp11
+// CHECK: [0x7b,0x54,0x0a,0x7e]
 
-v_cos_f32 v0, exec_lo
-// CHECK: [0x7e,0x54,0x00,0x7e]
+v_cos_f32 v5, m0
+// CHECK: [0x7c,0x54,0x0a,0x7e]
 
-v_cos_f32 v0, exec_hi
-// CHECK: [0x7f,0x54,0x00,0x7e]
+v_cos_f32 v5, exec_lo
+// CHECK: [0x7e,0x54,0x0a,0x7e]
 
-v_cos_f32 v0, 0
-// CHECK: [0x80,0x54,0x00,0x7e]
+v_cos_f32 v5, exec_hi
+// CHECK: [0x7f,0x54,0x0a,0x7e]
 
-v_cos_f32 v0, -1
-// CHECK: [0xc1,0x54,0x00,0x7e]
+v_cos_f32 v5, 0
+// CHECK: [0x80,0x54,0x0a,0x7e]
 
-v_cos_f32 v0, 0.5
-// CHECK: [0xf0,0x54,0x00,0x7e]
+v_cos_f32 v5, -1
+// CHECK: [0xc1,0x54,0x0a,0x7e]
 
-v_cos_f32 v0, -4.0
-// CHECK: [0xf7,0x54,0x00,0x7e]
+v_cos_f32 v5, 0.5
+// CHECK: [0xf0,0x54,0x0a,0x7e]
 
-v_cos_f32 v0, 0xaf123456
-// CHECK: [0xff,0x54,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_cos_f32 v5, -4.0
+// CHECK: [0xf7,0x54,0x0a,0x7e]
 
-v_cos_f32 v0, 0x3f717273
-// CHECK: [0xff,0x54,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_cos_f32 v5, 0xaf123456
+// CHECK: [0xff,0x54,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_cos_f32 v0, v0
-// CHECK: [0x00,0x55,0x00,0x7e]
+v_cos_f32 v5, 0x3f717273
+// CHECK: [0xff,0x54,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_cos_f32 v0, v255
-// CHECK: [0xff,0x55,0x00,0x7e]
+v_cos_f32 v5, v1
+// CHECK: [0x01,0x55,0x0a,0x7e]
 
-v_cos_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x6a,0xd1,0x00,0x00,0x00,0x00]
+v_cos_f32 v5, v255
+// CHECK: [0xff,0x55,0x0a,0x7e]
 
-v_cos_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x6a,0xd1,0x00,0x00,0x00,0x00]
+v_cos_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x6a,0xd1,0x01,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x6a,0xd1,0x65,0x00,0x00,0x00]
+v_cos_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x6a,0xd1,0x01,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x6a,0xd1,0x66,0x00,0x00,0x00]
+v_cos_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x6a,0xd1,0x65,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x6a,0xd1,0x67,0x00,0x00,0x00]
+v_cos_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x6a,0xd1,0x66,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x6a,0xd1,0x6a,0x00,0x00,0x00]
+v_cos_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x6a,0xd1,0x67,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x6a,0xd1,0x6b,0x00,0x00,0x00]
+v_cos_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x6a,0xd1,0x6a,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x6a,0xd1,0x6c,0x00,0x00,0x00]
+v_cos_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x6a,0xd1,0x6b,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x6a,0xd1,0x6d,0x00,0x00,0x00]
+v_cos_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x6a,0xd1,0x6c,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x6a,0xd1,0x6e,0x00,0x00,0x00]
+v_cos_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x6a,0xd1,0x6d,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x6a,0xd1,0x6f,0x00,0x00,0x00]
+v_cos_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x6a,0xd1,0x6e,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x6a,0xd1,0x7b,0x00,0x00,0x00]
+v_cos_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x6a,0xd1,0x6f,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x6a,0xd1,0x7c,0x00,0x00,0x00]
+v_cos_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x6a,0xd1,0x7b,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x6a,0xd1,0x7e,0x00,0x00,0x00]
+v_cos_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x6a,0xd1,0x7c,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x6a,0xd1,0x7f,0x00,0x00,0x00]
+v_cos_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x6a,0xd1,0x7e,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x6a,0xd1,0xfd,0x00,0x00,0x00]
+v_cos_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x6a,0xd1,0x7f,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x6a,0xd1,0x00,0x01,0x00,0x00]
+v_cos_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x6a,0xd1,0xfd,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x6a,0xd1,0xff,0x01,0x00,0x00]
+v_cos_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x6a,0xd1,0x01,0x01,0x00,0x00]
 
-v_cos_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x6a,0xd1,0x00,0x00,0x00,0x20]
+v_cos_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x6a,0xd1,0xff,0x01,0x00,0x00]
 
-v_cos_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x6a,0xd1,0x00,0x00,0x00,0x00]
+v_cos_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x6a,0xd1,0x01,0x00,0x00,0x20]
 
-v_cos_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x6a,0xd1,0x00,0x00,0x00,0x00]
+v_cos_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x6a,0xd1,0x01,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x6a,0xd1,0x00,0x00,0x00,0x08]
+v_cos_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x6a,0xd1,0x01,0x00,0x00,0x00]
 
-v_cos_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x6a,0xd1,0x00,0x00,0x00,0x10]
+v_cos_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x6a,0xd1,0x01,0x00,0x00,0x08]
 
-v_cos_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x6a,0xd1,0x00,0x00,0x00,0x18]
+v_cos_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x6a,0xd1,0x01,0x00,0x00,0x10]
 
-v_not_b32 v0, s0
-// CHECK: [0x00,0x56,0x00,0x7e]
+v_cos_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x6a,0xd1,0x01,0x00,0x00,0x18]
 
-v_not_b32 v255, s0
-// CHECK: [0x00,0x56,0xfe,0x7f]
+v_not_b32 v5, s1
+// CHECK: [0x01,0x56,0x0a,0x7e]
 
-v_not_b32 v0, s101
-// CHECK: [0x65,0x56,0x00,0x7e]
+v_not_b32 v255, s1
+// CHECK: [0x01,0x56,0xfe,0x7f]
 
-v_not_b32 v0, flat_scratch_lo
-// CHECK: [0x66,0x56,0x00,0x7e]
+v_not_b32 v5, s101
+// CHECK: [0x65,0x56,0x0a,0x7e]
 
-v_not_b32 v0, flat_scratch_hi
-// CHECK: [0x67,0x56,0x00,0x7e]
+v_not_b32 v5, flat_scratch_lo
+// CHECK: [0x66,0x56,0x0a,0x7e]
 
-v_not_b32 v0, vcc_lo
-// CHECK: [0x6a,0x56,0x00,0x7e]
+v_not_b32 v5, flat_scratch_hi
+// CHECK: [0x67,0x56,0x0a,0x7e]
 
-v_not_b32 v0, vcc_hi
-// CHECK: [0x6b,0x56,0x00,0x7e]
+v_not_b32 v5, vcc_lo
+// CHECK: [0x6a,0x56,0x0a,0x7e]
 
-v_not_b32 v0, tba_lo
-// CHECK: [0x6c,0x56,0x00,0x7e]
+v_not_b32 v5, vcc_hi
+// CHECK: [0x6b,0x56,0x0a,0x7e]
 
-v_not_b32 v0, tba_hi
-// CHECK: [0x6d,0x56,0x00,0x7e]
+v_not_b32 v5, tba_lo
+// CHECK: [0x6c,0x56,0x0a,0x7e]
 
-v_not_b32 v0, tma_lo
-// CHECK: [0x6e,0x56,0x00,0x7e]
+v_not_b32 v5, tba_hi
+// CHECK: [0x6d,0x56,0x0a,0x7e]
 
-v_not_b32 v0, tma_hi
-// CHECK: [0x6f,0x56,0x00,0x7e]
+v_not_b32 v5, tma_lo
+// CHECK: [0x6e,0x56,0x0a,0x7e]
 
-v_not_b32 v0, ttmp11
-// CHECK: [0x7b,0x56,0x00,0x7e]
+v_not_b32 v5, tma_hi
+// CHECK: [0x6f,0x56,0x0a,0x7e]
 
-v_not_b32 v0, m0
-// CHECK: [0x7c,0x56,0x00,0x7e]
+v_not_b32 v5, ttmp11
+// CHECK: [0x7b,0x56,0x0a,0x7e]
 
-v_not_b32 v0, exec_lo
-// CHECK: [0x7e,0x56,0x00,0x7e]
+v_not_b32 v5, m0
+// CHECK: [0x7c,0x56,0x0a,0x7e]
 
-v_not_b32 v0, exec_hi
-// CHECK: [0x7f,0x56,0x00,0x7e]
+v_not_b32 v5, exec_lo
+// CHECK: [0x7e,0x56,0x0a,0x7e]
 
-v_not_b32 v0, 0
-// CHECK: [0x80,0x56,0x00,0x7e]
+v_not_b32 v5, exec_hi
+// CHECK: [0x7f,0x56,0x0a,0x7e]
 
-v_not_b32 v0, -1
-// CHECK: [0xc1,0x56,0x00,0x7e]
+v_not_b32 v5, 0
+// CHECK: [0x80,0x56,0x0a,0x7e]
 
-v_not_b32 v0, 0.5
-// CHECK: [0xf0,0x56,0x00,0x7e]
+v_not_b32 v5, -1
+// CHECK: [0xc1,0x56,0x0a,0x7e]
 
-v_not_b32 v0, -4.0
-// CHECK: [0xf7,0x56,0x00,0x7e]
+v_not_b32 v5, 0.5
+// CHECK: [0xf0,0x56,0x0a,0x7e]
 
-v_not_b32 v0, 0xaf123456
-// CHECK: [0xff,0x56,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_not_b32 v5, -4.0
+// CHECK: [0xf7,0x56,0x0a,0x7e]
 
-v_not_b32 v0, 0x3f717273
-// CHECK: [0xff,0x56,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_not_b32 v5, 0xaf123456
+// CHECK: [0xff,0x56,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_not_b32 v0, v0
-// CHECK: [0x00,0x57,0x00,0x7e]
+v_not_b32 v5, 0x3f717273
+// CHECK: [0xff,0x56,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_not_b32 v0, v255
-// CHECK: [0xff,0x57,0x00,0x7e]
+v_not_b32 v5, v1
+// CHECK: [0x01,0x57,0x0a,0x7e]
 
-v_not_b32_e64 v0, s0
-// CHECK: [0x00,0x00,0x6b,0xd1,0x00,0x00,0x00,0x00]
+v_not_b32 v5, v255
+// CHECK: [0xff,0x57,0x0a,0x7e]
 
-v_not_b32_e64 v255, s0
-// CHECK: [0xff,0x00,0x6b,0xd1,0x00,0x00,0x00,0x00]
+v_not_b32_e64 v5, s1
+// CHECK: [0x05,0x00,0x6b,0xd1,0x01,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, s101
-// CHECK: [0x00,0x00,0x6b,0xd1,0x65,0x00,0x00,0x00]
+v_not_b32_e64 v255, s1
+// CHECK: [0xff,0x00,0x6b,0xd1,0x01,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x6b,0xd1,0x66,0x00,0x00,0x00]
+v_not_b32_e64 v5, s101
+// CHECK: [0x05,0x00,0x6b,0xd1,0x65,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x6b,0xd1,0x67,0x00,0x00,0x00]
+v_not_b32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x6b,0xd1,0x66,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x6b,0xd1,0x6a,0x00,0x00,0x00]
+v_not_b32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x6b,0xd1,0x67,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x6b,0xd1,0x6b,0x00,0x00,0x00]
+v_not_b32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x6b,0xd1,0x6a,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x6b,0xd1,0x6c,0x00,0x00,0x00]
+v_not_b32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x6b,0xd1,0x6b,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x6b,0xd1,0x6d,0x00,0x00,0x00]
+v_not_b32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x6b,0xd1,0x6c,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x6b,0xd1,0x6e,0x00,0x00,0x00]
+v_not_b32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x6b,0xd1,0x6d,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x6b,0xd1,0x6f,0x00,0x00,0x00]
+v_not_b32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x6b,0xd1,0x6e,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x6b,0xd1,0x7b,0x00,0x00,0x00]
+v_not_b32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x6b,0xd1,0x6f,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, m0
-// CHECK: [0x00,0x00,0x6b,0xd1,0x7c,0x00,0x00,0x00]
+v_not_b32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x6b,0xd1,0x7b,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x6b,0xd1,0x7e,0x00,0x00,0x00]
+v_not_b32_e64 v5, m0
+// CHECK: [0x05,0x00,0x6b,0xd1,0x7c,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x6b,0xd1,0x7f,0x00,0x00,0x00]
+v_not_b32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x6b,0xd1,0x7e,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, 0
-// CHECK: [0x00,0x00,0x6b,0xd1,0x80,0x00,0x00,0x00]
+v_not_b32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x6b,0xd1,0x7f,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, -1
-// CHECK: [0x00,0x00,0x6b,0xd1,0xc1,0x00,0x00,0x00]
+v_not_b32_e64 v5, 0
+// CHECK: [0x05,0x00,0x6b,0xd1,0x80,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x6b,0xd1,0xf0,0x00,0x00,0x00]
+v_not_b32_e64 v5, -1
+// CHECK: [0x05,0x00,0x6b,0xd1,0xc1,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x6b,0xd1,0xf7,0x00,0x00,0x00]
+v_not_b32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x6b,0xd1,0xf0,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, v0
-// CHECK: [0x00,0x00,0x6b,0xd1,0x00,0x01,0x00,0x00]
+v_not_b32_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x6b,0xd1,0xf7,0x00,0x00,0x00]
 
-v_not_b32_e64 v0, v255
-// CHECK: [0x00,0x00,0x6b,0xd1,0xff,0x01,0x00,0x00]
+v_not_b32_e64 v5, v1
+// CHECK: [0x05,0x00,0x6b,0xd1,0x01,0x01,0x00,0x00]
 
-v_bfrev_b32 v0, s0
-// CHECK: [0x00,0x58,0x00,0x7e]
+v_not_b32_e64 v5, v255
+// CHECK: [0x05,0x00,0x6b,0xd1,0xff,0x01,0x00,0x00]
 
-v_bfrev_b32 v255, s0
-// CHECK: [0x00,0x58,0xfe,0x7f]
+v_bfrev_b32 v5, s1
+// CHECK: [0x01,0x58,0x0a,0x7e]
 
-v_bfrev_b32 v0, s101
-// CHECK: [0x65,0x58,0x00,0x7e]
+v_bfrev_b32 v255, s1
+// CHECK: [0x01,0x58,0xfe,0x7f]
 
-v_bfrev_b32 v0, flat_scratch_lo
-// CHECK: [0x66,0x58,0x00,0x7e]
+v_bfrev_b32 v5, s101
+// CHECK: [0x65,0x58,0x0a,0x7e]
 
-v_bfrev_b32 v0, flat_scratch_hi
-// CHECK: [0x67,0x58,0x00,0x7e]
+v_bfrev_b32 v5, flat_scratch_lo
+// CHECK: [0x66,0x58,0x0a,0x7e]
 
-v_bfrev_b32 v0, vcc_lo
-// CHECK: [0x6a,0x58,0x00,0x7e]
+v_bfrev_b32 v5, flat_scratch_hi
+// CHECK: [0x67,0x58,0x0a,0x7e]
 
-v_bfrev_b32 v0, vcc_hi
-// CHECK: [0x6b,0x58,0x00,0x7e]
+v_bfrev_b32 v5, vcc_lo
+// CHECK: [0x6a,0x58,0x0a,0x7e]
 
-v_bfrev_b32 v0, tba_lo
-// CHECK: [0x6c,0x58,0x00,0x7e]
+v_bfrev_b32 v5, vcc_hi
+// CHECK: [0x6b,0x58,0x0a,0x7e]
 
-v_bfrev_b32 v0, tba_hi
-// CHECK: [0x6d,0x58,0x00,0x7e]
+v_bfrev_b32 v5, tba_lo
+// CHECK: [0x6c,0x58,0x0a,0x7e]
 
-v_bfrev_b32 v0, tma_lo
-// CHECK: [0x6e,0x58,0x00,0x7e]
+v_bfrev_b32 v5, tba_hi
+// CHECK: [0x6d,0x58,0x0a,0x7e]
 
-v_bfrev_b32 v0, tma_hi
-// CHECK: [0x6f,0x58,0x00,0x7e]
+v_bfrev_b32 v5, tma_lo
+// CHECK: [0x6e,0x58,0x0a,0x7e]
 
-v_bfrev_b32 v0, ttmp11
-// CHECK: [0x7b,0x58,0x00,0x7e]
+v_bfrev_b32 v5, tma_hi
+// CHECK: [0x6f,0x58,0x0a,0x7e]
 
-v_bfrev_b32 v0, m0
-// CHECK: [0x7c,0x58,0x00,0x7e]
+v_bfrev_b32 v5, ttmp11
+// CHECK: [0x7b,0x58,0x0a,0x7e]
 
-v_bfrev_b32 v0, exec_lo
-// CHECK: [0x7e,0x58,0x00,0x7e]
+v_bfrev_b32 v5, m0
+// CHECK: [0x7c,0x58,0x0a,0x7e]
 
-v_bfrev_b32 v0, exec_hi
-// CHECK: [0x7f,0x58,0x00,0x7e]
+v_bfrev_b32 v5, exec_lo
+// CHECK: [0x7e,0x58,0x0a,0x7e]
 
-v_bfrev_b32 v0, 0
-// CHECK: [0x80,0x58,0x00,0x7e]
+v_bfrev_b32 v5, exec_hi
+// CHECK: [0x7f,0x58,0x0a,0x7e]
 
-v_bfrev_b32 v0, -1
-// CHECK: [0xc1,0x58,0x00,0x7e]
+v_bfrev_b32 v5, 0
+// CHECK: [0x80,0x58,0x0a,0x7e]
 
-v_bfrev_b32 v0, 0.5
-// CHECK: [0xf0,0x58,0x00,0x7e]
+v_bfrev_b32 v5, -1
+// CHECK: [0xc1,0x58,0x0a,0x7e]
 
-v_bfrev_b32 v0, -4.0
-// CHECK: [0xf7,0x58,0x00,0x7e]
+v_bfrev_b32 v5, 0.5
+// CHECK: [0xf0,0x58,0x0a,0x7e]
 
-v_bfrev_b32 v0, 0xaf123456
-// CHECK: [0xff,0x58,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_bfrev_b32 v5, -4.0
+// CHECK: [0xf7,0x58,0x0a,0x7e]
 
-v_bfrev_b32 v0, 0x3f717273
-// CHECK: [0xff,0x58,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_bfrev_b32 v5, 0xaf123456
+// CHECK: [0xff,0x58,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_bfrev_b32 v0, v0
-// CHECK: [0x00,0x59,0x00,0x7e]
+v_bfrev_b32 v5, 0x3f717273
+// CHECK: [0xff,0x58,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_bfrev_b32 v0, v255
-// CHECK: [0xff,0x59,0x00,0x7e]
+v_bfrev_b32 v5, v1
+// CHECK: [0x01,0x59,0x0a,0x7e]
 
-v_bfrev_b32_e64 v0, s0
-// CHECK: [0x00,0x00,0x6c,0xd1,0x00,0x00,0x00,0x00]
+v_bfrev_b32 v5, v255
+// CHECK: [0xff,0x59,0x0a,0x7e]
 
-v_bfrev_b32_e64 v255, s0
-// CHECK: [0xff,0x00,0x6c,0xd1,0x00,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, s1
+// CHECK: [0x05,0x00,0x6c,0xd1,0x01,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, s101
-// CHECK: [0x00,0x00,0x6c,0xd1,0x65,0x00,0x00,0x00]
+v_bfrev_b32_e64 v255, s1
+// CHECK: [0xff,0x00,0x6c,0xd1,0x01,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x6c,0xd1,0x66,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, s101
+// CHECK: [0x05,0x00,0x6c,0xd1,0x65,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x6c,0xd1,0x67,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x6c,0xd1,0x66,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x6c,0xd1,0x6a,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x6c,0xd1,0x67,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x6c,0xd1,0x6b,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x6c,0xd1,0x6a,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x6c,0xd1,0x6c,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x6c,0xd1,0x6b,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x6c,0xd1,0x6d,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x6c,0xd1,0x6c,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x6c,0xd1,0x6e,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x6c,0xd1,0x6d,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x6c,0xd1,0x6f,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x6c,0xd1,0x6e,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x6c,0xd1,0x7b,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x6c,0xd1,0x6f,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, m0
-// CHECK: [0x00,0x00,0x6c,0xd1,0x7c,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x6c,0xd1,0x7b,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x6c,0xd1,0x7e,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, m0
+// CHECK: [0x05,0x00,0x6c,0xd1,0x7c,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x6c,0xd1,0x7f,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x6c,0xd1,0x7e,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, 0
-// CHECK: [0x00,0x00,0x6c,0xd1,0x80,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x6c,0xd1,0x7f,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, -1
-// CHECK: [0x00,0x00,0x6c,0xd1,0xc1,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, 0
+// CHECK: [0x05,0x00,0x6c,0xd1,0x80,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x6c,0xd1,0xf0,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, -1
+// CHECK: [0x05,0x00,0x6c,0xd1,0xc1,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x6c,0xd1,0xf7,0x00,0x00,0x00]
+v_bfrev_b32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x6c,0xd1,0xf0,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, v0
-// CHECK: [0x00,0x00,0x6c,0xd1,0x00,0x01,0x00,0x00]
+v_bfrev_b32_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x6c,0xd1,0xf7,0x00,0x00,0x00]
 
-v_bfrev_b32_e64 v0, v255
-// CHECK: [0x00,0x00,0x6c,0xd1,0xff,0x01,0x00,0x00]
+v_bfrev_b32_e64 v5, v1
+// CHECK: [0x05,0x00,0x6c,0xd1,0x01,0x01,0x00,0x00]
 
-v_ffbh_u32 v0, s0
-// CHECK: [0x00,0x5a,0x00,0x7e]
+v_bfrev_b32_e64 v5, v255
+// CHECK: [0x05,0x00,0x6c,0xd1,0xff,0x01,0x00,0x00]
 
-v_ffbh_u32 v255, s0
-// CHECK: [0x00,0x5a,0xfe,0x7f]
+v_ffbh_u32 v5, s1
+// CHECK: [0x01,0x5a,0x0a,0x7e]
 
-v_ffbh_u32 v0, s101
-// CHECK: [0x65,0x5a,0x00,0x7e]
+v_ffbh_u32 v255, s1
+// CHECK: [0x01,0x5a,0xfe,0x7f]
 
-v_ffbh_u32 v0, flat_scratch_lo
-// CHECK: [0x66,0x5a,0x00,0x7e]
+v_ffbh_u32 v5, s101
+// CHECK: [0x65,0x5a,0x0a,0x7e]
 
-v_ffbh_u32 v0, flat_scratch_hi
-// CHECK: [0x67,0x5a,0x00,0x7e]
+v_ffbh_u32 v5, flat_scratch_lo
+// CHECK: [0x66,0x5a,0x0a,0x7e]
 
-v_ffbh_u32 v0, vcc_lo
-// CHECK: [0x6a,0x5a,0x00,0x7e]
+v_ffbh_u32 v5, flat_scratch_hi
+// CHECK: [0x67,0x5a,0x0a,0x7e]
 
-v_ffbh_u32 v0, vcc_hi
-// CHECK: [0x6b,0x5a,0x00,0x7e]
+v_ffbh_u32 v5, vcc_lo
+// CHECK: [0x6a,0x5a,0x0a,0x7e]
 
-v_ffbh_u32 v0, tba_lo
-// CHECK: [0x6c,0x5a,0x00,0x7e]
+v_ffbh_u32 v5, vcc_hi
+// CHECK: [0x6b,0x5a,0x0a,0x7e]
 
-v_ffbh_u32 v0, tba_hi
-// CHECK: [0x6d,0x5a,0x00,0x7e]
+v_ffbh_u32 v5, tba_lo
+// CHECK: [0x6c,0x5a,0x0a,0x7e]
 
-v_ffbh_u32 v0, tma_lo
-// CHECK: [0x6e,0x5a,0x00,0x7e]
+v_ffbh_u32 v5, tba_hi
+// CHECK: [0x6d,0x5a,0x0a,0x7e]
 
-v_ffbh_u32 v0, tma_hi
-// CHECK: [0x6f,0x5a,0x00,0x7e]
+v_ffbh_u32 v5, tma_lo
+// CHECK: [0x6e,0x5a,0x0a,0x7e]
 
-v_ffbh_u32 v0, ttmp11
-// CHECK: [0x7b,0x5a,0x00,0x7e]
+v_ffbh_u32 v5, tma_hi
+// CHECK: [0x6f,0x5a,0x0a,0x7e]
 
-v_ffbh_u32 v0, m0
-// CHECK: [0x7c,0x5a,0x00,0x7e]
+v_ffbh_u32 v5, ttmp11
+// CHECK: [0x7b,0x5a,0x0a,0x7e]
 
-v_ffbh_u32 v0, exec_lo
-// CHECK: [0x7e,0x5a,0x00,0x7e]
+v_ffbh_u32 v5, m0
+// CHECK: [0x7c,0x5a,0x0a,0x7e]
 
-v_ffbh_u32 v0, exec_hi
-// CHECK: [0x7f,0x5a,0x00,0x7e]
+v_ffbh_u32 v5, exec_lo
+// CHECK: [0x7e,0x5a,0x0a,0x7e]
 
-v_ffbh_u32 v0, 0
-// CHECK: [0x80,0x5a,0x00,0x7e]
+v_ffbh_u32 v5, exec_hi
+// CHECK: [0x7f,0x5a,0x0a,0x7e]
 
-v_ffbh_u32 v0, -1
-// CHECK: [0xc1,0x5a,0x00,0x7e]
+v_ffbh_u32 v5, 0
+// CHECK: [0x80,0x5a,0x0a,0x7e]
 
-v_ffbh_u32 v0, 0.5
-// CHECK: [0xf0,0x5a,0x00,0x7e]
+v_ffbh_u32 v5, -1
+// CHECK: [0xc1,0x5a,0x0a,0x7e]
 
-v_ffbh_u32 v0, -4.0
-// CHECK: [0xf7,0x5a,0x00,0x7e]
+v_ffbh_u32 v5, 0.5
+// CHECK: [0xf0,0x5a,0x0a,0x7e]
 
-v_ffbh_u32 v0, 0xaf123456
-// CHECK: [0xff,0x5a,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_ffbh_u32 v5, -4.0
+// CHECK: [0xf7,0x5a,0x0a,0x7e]
 
-v_ffbh_u32 v0, 0x3f717273
-// CHECK: [0xff,0x5a,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_ffbh_u32 v5, 0xaf123456
+// CHECK: [0xff,0x5a,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_ffbh_u32 v0, v0
-// CHECK: [0x00,0x5b,0x00,0x7e]
+v_ffbh_u32 v5, 0x3f717273
+// CHECK: [0xff,0x5a,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_ffbh_u32 v0, v255
-// CHECK: [0xff,0x5b,0x00,0x7e]
+v_ffbh_u32 v5, v1
+// CHECK: [0x01,0x5b,0x0a,0x7e]
 
-v_ffbh_u32_e64 v0, s0
-// CHECK: [0x00,0x00,0x6d,0xd1,0x00,0x00,0x00,0x00]
+v_ffbh_u32 v5, v255
+// CHECK: [0xff,0x5b,0x0a,0x7e]
 
-v_ffbh_u32_e64 v255, s0
-// CHECK: [0xff,0x00,0x6d,0xd1,0x00,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, s1
+// CHECK: [0x05,0x00,0x6d,0xd1,0x01,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, s101
-// CHECK: [0x00,0x00,0x6d,0xd1,0x65,0x00,0x00,0x00]
+v_ffbh_u32_e64 v255, s1
+// CHECK: [0xff,0x00,0x6d,0xd1,0x01,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x6d,0xd1,0x66,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, s101
+// CHECK: [0x05,0x00,0x6d,0xd1,0x65,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x6d,0xd1,0x67,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x6d,0xd1,0x66,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x6d,0xd1,0x6a,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x6d,0xd1,0x67,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x6d,0xd1,0x6b,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x6d,0xd1,0x6a,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x6d,0xd1,0x6c,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x6d,0xd1,0x6b,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x6d,0xd1,0x6d,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x6d,0xd1,0x6c,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x6d,0xd1,0x6e,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x6d,0xd1,0x6d,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x6d,0xd1,0x6f,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x6d,0xd1,0x6e,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x6d,0xd1,0x7b,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x6d,0xd1,0x6f,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, m0
-// CHECK: [0x00,0x00,0x6d,0xd1,0x7c,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x6d,0xd1,0x7b,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x6d,0xd1,0x7e,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, m0
+// CHECK: [0x05,0x00,0x6d,0xd1,0x7c,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x6d,0xd1,0x7f,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x6d,0xd1,0x7e,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, 0
-// CHECK: [0x00,0x00,0x6d,0xd1,0x80,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x6d,0xd1,0x7f,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, -1
-// CHECK: [0x00,0x00,0x6d,0xd1,0xc1,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, 0
+// CHECK: [0x05,0x00,0x6d,0xd1,0x80,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x6d,0xd1,0xf0,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, -1
+// CHECK: [0x05,0x00,0x6d,0xd1,0xc1,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x6d,0xd1,0xf7,0x00,0x00,0x00]
+v_ffbh_u32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x6d,0xd1,0xf0,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, v0
-// CHECK: [0x00,0x00,0x6d,0xd1,0x00,0x01,0x00,0x00]
+v_ffbh_u32_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x6d,0xd1,0xf7,0x00,0x00,0x00]
 
-v_ffbh_u32_e64 v0, v255
-// CHECK: [0x00,0x00,0x6d,0xd1,0xff,0x01,0x00,0x00]
+v_ffbh_u32_e64 v5, v1
+// CHECK: [0x05,0x00,0x6d,0xd1,0x01,0x01,0x00,0x00]
 
-v_ffbl_b32 v0, s0
-// CHECK: [0x00,0x5c,0x00,0x7e]
+v_ffbh_u32_e64 v5, v255
+// CHECK: [0x05,0x00,0x6d,0xd1,0xff,0x01,0x00,0x00]
 
-v_ffbl_b32 v255, s0
-// CHECK: [0x00,0x5c,0xfe,0x7f]
+v_ffbl_b32 v5, s1
+// CHECK: [0x01,0x5c,0x0a,0x7e]
 
-v_ffbl_b32 v0, s101
-// CHECK: [0x65,0x5c,0x00,0x7e]
+v_ffbl_b32 v255, s1
+// CHECK: [0x01,0x5c,0xfe,0x7f]
 
-v_ffbl_b32 v0, flat_scratch_lo
-// CHECK: [0x66,0x5c,0x00,0x7e]
+v_ffbl_b32 v5, s101
+// CHECK: [0x65,0x5c,0x0a,0x7e]
 
-v_ffbl_b32 v0, flat_scratch_hi
-// CHECK: [0x67,0x5c,0x00,0x7e]
+v_ffbl_b32 v5, flat_scratch_lo
+// CHECK: [0x66,0x5c,0x0a,0x7e]
 
-v_ffbl_b32 v0, vcc_lo
-// CHECK: [0x6a,0x5c,0x00,0x7e]
+v_ffbl_b32 v5, flat_scratch_hi
+// CHECK: [0x67,0x5c,0x0a,0x7e]
 
-v_ffbl_b32 v0, vcc_hi
-// CHECK: [0x6b,0x5c,0x00,0x7e]
+v_ffbl_b32 v5, vcc_lo
+// CHECK: [0x6a,0x5c,0x0a,0x7e]
 
-v_ffbl_b32 v0, tba_lo
-// CHECK: [0x6c,0x5c,0x00,0x7e]
+v_ffbl_b32 v5, vcc_hi
+// CHECK: [0x6b,0x5c,0x0a,0x7e]
 
-v_ffbl_b32 v0, tba_hi
-// CHECK: [0x6d,0x5c,0x00,0x7e]
+v_ffbl_b32 v5, tba_lo
+// CHECK: [0x6c,0x5c,0x0a,0x7e]
 
-v_ffbl_b32 v0, tma_lo
-// CHECK: [0x6e,0x5c,0x00,0x7e]
+v_ffbl_b32 v5, tba_hi
+// CHECK: [0x6d,0x5c,0x0a,0x7e]
 
-v_ffbl_b32 v0, tma_hi
-// CHECK: [0x6f,0x5c,0x00,0x7e]
+v_ffbl_b32 v5, tma_lo
+// CHECK: [0x6e,0x5c,0x0a,0x7e]
 
-v_ffbl_b32 v0, ttmp11
-// CHECK: [0x7b,0x5c,0x00,0x7e]
+v_ffbl_b32 v5, tma_hi
+// CHECK: [0x6f,0x5c,0x0a,0x7e]
 
-v_ffbl_b32 v0, m0
-// CHECK: [0x7c,0x5c,0x00,0x7e]
+v_ffbl_b32 v5, ttmp11
+// CHECK: [0x7b,0x5c,0x0a,0x7e]
 
-v_ffbl_b32 v0, exec_lo
-// CHECK: [0x7e,0x5c,0x00,0x7e]
+v_ffbl_b32 v5, m0
+// CHECK: [0x7c,0x5c,0x0a,0x7e]
 
-v_ffbl_b32 v0, exec_hi
-// CHECK: [0x7f,0x5c,0x00,0x7e]
+v_ffbl_b32 v5, exec_lo
+// CHECK: [0x7e,0x5c,0x0a,0x7e]
 
-v_ffbl_b32 v0, 0
-// CHECK: [0x80,0x5c,0x00,0x7e]
+v_ffbl_b32 v5, exec_hi
+// CHECK: [0x7f,0x5c,0x0a,0x7e]
 
-v_ffbl_b32 v0, -1
-// CHECK: [0xc1,0x5c,0x00,0x7e]
+v_ffbl_b32 v5, 0
+// CHECK: [0x80,0x5c,0x0a,0x7e]
 
-v_ffbl_b32 v0, 0.5
-// CHECK: [0xf0,0x5c,0x00,0x7e]
+v_ffbl_b32 v5, -1
+// CHECK: [0xc1,0x5c,0x0a,0x7e]
 
-v_ffbl_b32 v0, -4.0
-// CHECK: [0xf7,0x5c,0x00,0x7e]
+v_ffbl_b32 v5, 0.5
+// CHECK: [0xf0,0x5c,0x0a,0x7e]
 
-v_ffbl_b32 v0, 0xaf123456
-// CHECK: [0xff,0x5c,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_ffbl_b32 v5, -4.0
+// CHECK: [0xf7,0x5c,0x0a,0x7e]
 
-v_ffbl_b32 v0, 0x3f717273
-// CHECK: [0xff,0x5c,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_ffbl_b32 v5, 0xaf123456
+// CHECK: [0xff,0x5c,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_ffbl_b32 v0, v0
-// CHECK: [0x00,0x5d,0x00,0x7e]
+v_ffbl_b32 v5, 0x3f717273
+// CHECK: [0xff,0x5c,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_ffbl_b32 v0, v255
-// CHECK: [0xff,0x5d,0x00,0x7e]
+v_ffbl_b32 v5, v1
+// CHECK: [0x01,0x5d,0x0a,0x7e]
 
-v_ffbl_b32_e64 v0, s0
-// CHECK: [0x00,0x00,0x6e,0xd1,0x00,0x00,0x00,0x00]
+v_ffbl_b32 v5, v255
+// CHECK: [0xff,0x5d,0x0a,0x7e]
 
-v_ffbl_b32_e64 v255, s0
-// CHECK: [0xff,0x00,0x6e,0xd1,0x00,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, s1
+// CHECK: [0x05,0x00,0x6e,0xd1,0x01,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, s101
-// CHECK: [0x00,0x00,0x6e,0xd1,0x65,0x00,0x00,0x00]
+v_ffbl_b32_e64 v255, s1
+// CHECK: [0xff,0x00,0x6e,0xd1,0x01,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x6e,0xd1,0x66,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, s101
+// CHECK: [0x05,0x00,0x6e,0xd1,0x65,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x6e,0xd1,0x67,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x6e,0xd1,0x66,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x6e,0xd1,0x6a,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x6e,0xd1,0x67,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x6e,0xd1,0x6b,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x6e,0xd1,0x6a,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x6e,0xd1,0x6c,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x6e,0xd1,0x6b,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x6e,0xd1,0x6d,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x6e,0xd1,0x6c,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x6e,0xd1,0x6e,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x6e,0xd1,0x6d,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x6e,0xd1,0x6f,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x6e,0xd1,0x6e,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x6e,0xd1,0x7b,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x6e,0xd1,0x6f,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, m0
-// CHECK: [0x00,0x00,0x6e,0xd1,0x7c,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x6e,0xd1,0x7b,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x6e,0xd1,0x7e,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, m0
+// CHECK: [0x05,0x00,0x6e,0xd1,0x7c,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x6e,0xd1,0x7f,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x6e,0xd1,0x7e,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, 0
-// CHECK: [0x00,0x00,0x6e,0xd1,0x80,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x6e,0xd1,0x7f,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, -1
-// CHECK: [0x00,0x00,0x6e,0xd1,0xc1,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, 0
+// CHECK: [0x05,0x00,0x6e,0xd1,0x80,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x6e,0xd1,0xf0,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, -1
+// CHECK: [0x05,0x00,0x6e,0xd1,0xc1,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x6e,0xd1,0xf7,0x00,0x00,0x00]
+v_ffbl_b32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x6e,0xd1,0xf0,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, v0
-// CHECK: [0x00,0x00,0x6e,0xd1,0x00,0x01,0x00,0x00]
+v_ffbl_b32_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x6e,0xd1,0xf7,0x00,0x00,0x00]
 
-v_ffbl_b32_e64 v0, v255
-// CHECK: [0x00,0x00,0x6e,0xd1,0xff,0x01,0x00,0x00]
+v_ffbl_b32_e64 v5, v1
+// CHECK: [0x05,0x00,0x6e,0xd1,0x01,0x01,0x00,0x00]
 
-v_ffbh_i32 v0, s0
-// CHECK: [0x00,0x5e,0x00,0x7e]
+v_ffbl_b32_e64 v5, v255
+// CHECK: [0x05,0x00,0x6e,0xd1,0xff,0x01,0x00,0x00]
 
-v_ffbh_i32 v255, s0
-// CHECK: [0x00,0x5e,0xfe,0x7f]
+v_ffbh_i32 v5, s1
+// CHECK: [0x01,0x5e,0x0a,0x7e]
 
-v_ffbh_i32 v0, s101
-// CHECK: [0x65,0x5e,0x00,0x7e]
+v_ffbh_i32 v255, s1
+// CHECK: [0x01,0x5e,0xfe,0x7f]
 
-v_ffbh_i32 v0, flat_scratch_lo
-// CHECK: [0x66,0x5e,0x00,0x7e]
+v_ffbh_i32 v5, s101
+// CHECK: [0x65,0x5e,0x0a,0x7e]
 
-v_ffbh_i32 v0, flat_scratch_hi
-// CHECK: [0x67,0x5e,0x00,0x7e]
+v_ffbh_i32 v5, flat_scratch_lo
+// CHECK: [0x66,0x5e,0x0a,0x7e]
 
-v_ffbh_i32 v0, vcc_lo
-// CHECK: [0x6a,0x5e,0x00,0x7e]
+v_ffbh_i32 v5, flat_scratch_hi
+// CHECK: [0x67,0x5e,0x0a,0x7e]
 
-v_ffbh_i32 v0, vcc_hi
-// CHECK: [0x6b,0x5e,0x00,0x7e]
+v_ffbh_i32 v5, vcc_lo
+// CHECK: [0x6a,0x5e,0x0a,0x7e]
 
-v_ffbh_i32 v0, tba_lo
-// CHECK: [0x6c,0x5e,0x00,0x7e]
+v_ffbh_i32 v5, vcc_hi
+// CHECK: [0x6b,0x5e,0x0a,0x7e]
 
-v_ffbh_i32 v0, tba_hi
-// CHECK: [0x6d,0x5e,0x00,0x7e]
+v_ffbh_i32 v5, tba_lo
+// CHECK: [0x6c,0x5e,0x0a,0x7e]
 
-v_ffbh_i32 v0, tma_lo
-// CHECK: [0x6e,0x5e,0x00,0x7e]
+v_ffbh_i32 v5, tba_hi
+// CHECK: [0x6d,0x5e,0x0a,0x7e]
 
-v_ffbh_i32 v0, tma_hi
-// CHECK: [0x6f,0x5e,0x00,0x7e]
+v_ffbh_i32 v5, tma_lo
+// CHECK: [0x6e,0x5e,0x0a,0x7e]
 
-v_ffbh_i32 v0, ttmp11
-// CHECK: [0x7b,0x5e,0x00,0x7e]
+v_ffbh_i32 v5, tma_hi
+// CHECK: [0x6f,0x5e,0x0a,0x7e]
 
-v_ffbh_i32 v0, m0
-// CHECK: [0x7c,0x5e,0x00,0x7e]
+v_ffbh_i32 v5, ttmp11
+// CHECK: [0x7b,0x5e,0x0a,0x7e]
 
-v_ffbh_i32 v0, exec_lo
-// CHECK: [0x7e,0x5e,0x00,0x7e]
+v_ffbh_i32 v5, m0
+// CHECK: [0x7c,0x5e,0x0a,0x7e]
 
-v_ffbh_i32 v0, exec_hi
-// CHECK: [0x7f,0x5e,0x00,0x7e]
+v_ffbh_i32 v5, exec_lo
+// CHECK: [0x7e,0x5e,0x0a,0x7e]
 
-v_ffbh_i32 v0, 0
-// CHECK: [0x80,0x5e,0x00,0x7e]
+v_ffbh_i32 v5, exec_hi
+// CHECK: [0x7f,0x5e,0x0a,0x7e]
 
-v_ffbh_i32 v0, -1
-// CHECK: [0xc1,0x5e,0x00,0x7e]
+v_ffbh_i32 v5, 0
+// CHECK: [0x80,0x5e,0x0a,0x7e]
 
-v_ffbh_i32 v0, 0.5
-// CHECK: [0xf0,0x5e,0x00,0x7e]
+v_ffbh_i32 v5, -1
+// CHECK: [0xc1,0x5e,0x0a,0x7e]
 
-v_ffbh_i32 v0, -4.0
-// CHECK: [0xf7,0x5e,0x00,0x7e]
+v_ffbh_i32 v5, 0.5
+// CHECK: [0xf0,0x5e,0x0a,0x7e]
 
-v_ffbh_i32 v0, 0xaf123456
-// CHECK: [0xff,0x5e,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_ffbh_i32 v5, -4.0
+// CHECK: [0xf7,0x5e,0x0a,0x7e]
 
-v_ffbh_i32 v0, 0x3f717273
-// CHECK: [0xff,0x5e,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_ffbh_i32 v5, 0xaf123456
+// CHECK: [0xff,0x5e,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_ffbh_i32 v0, v0
-// CHECK: [0x00,0x5f,0x00,0x7e]
+v_ffbh_i32 v5, 0x3f717273
+// CHECK: [0xff,0x5e,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_ffbh_i32 v0, v255
-// CHECK: [0xff,0x5f,0x00,0x7e]
+v_ffbh_i32 v5, v1
+// CHECK: [0x01,0x5f,0x0a,0x7e]
 
-v_ffbh_i32_e64 v0, s0
-// CHECK: [0x00,0x00,0x6f,0xd1,0x00,0x00,0x00,0x00]
+v_ffbh_i32 v5, v255
+// CHECK: [0xff,0x5f,0x0a,0x7e]
 
-v_ffbh_i32_e64 v255, s0
-// CHECK: [0xff,0x00,0x6f,0xd1,0x00,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, s1
+// CHECK: [0x05,0x00,0x6f,0xd1,0x01,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, s101
-// CHECK: [0x00,0x00,0x6f,0xd1,0x65,0x00,0x00,0x00]
+v_ffbh_i32_e64 v255, s1
+// CHECK: [0xff,0x00,0x6f,0xd1,0x01,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x6f,0xd1,0x66,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, s101
+// CHECK: [0x05,0x00,0x6f,0xd1,0x65,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x6f,0xd1,0x67,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x6f,0xd1,0x66,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x6f,0xd1,0x6a,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x6f,0xd1,0x67,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x6f,0xd1,0x6b,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x6f,0xd1,0x6a,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x6f,0xd1,0x6c,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x6f,0xd1,0x6b,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x6f,0xd1,0x6d,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x6f,0xd1,0x6c,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x6f,0xd1,0x6e,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x6f,0xd1,0x6d,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x6f,0xd1,0x6f,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x6f,0xd1,0x6e,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x6f,0xd1,0x7b,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x6f,0xd1,0x6f,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, m0
-// CHECK: [0x00,0x00,0x6f,0xd1,0x7c,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x6f,0xd1,0x7b,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x6f,0xd1,0x7e,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, m0
+// CHECK: [0x05,0x00,0x6f,0xd1,0x7c,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x6f,0xd1,0x7f,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x6f,0xd1,0x7e,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, 0
-// CHECK: [0x00,0x00,0x6f,0xd1,0x80,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x6f,0xd1,0x7f,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, -1
-// CHECK: [0x00,0x00,0x6f,0xd1,0xc1,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, 0
+// CHECK: [0x05,0x00,0x6f,0xd1,0x80,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x6f,0xd1,0xf0,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, -1
+// CHECK: [0x05,0x00,0x6f,0xd1,0xc1,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x6f,0xd1,0xf7,0x00,0x00,0x00]
+v_ffbh_i32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x6f,0xd1,0xf0,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, v0
-// CHECK: [0x00,0x00,0x6f,0xd1,0x00,0x01,0x00,0x00]
+v_ffbh_i32_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x6f,0xd1,0xf7,0x00,0x00,0x00]
 
-v_ffbh_i32_e64 v0, v255
-// CHECK: [0x00,0x00,0x6f,0xd1,0xff,0x01,0x00,0x00]
+v_ffbh_i32_e64 v5, v1
+// CHECK: [0x05,0x00,0x6f,0xd1,0x01,0x01,0x00,0x00]
 
-v_frexp_exp_i32_f64 v0, s[0:1]
-// CHECK: [0x00,0x60,0x00,0x7e]
+v_ffbh_i32_e64 v5, v255
+// CHECK: [0x05,0x00,0x6f,0xd1,0xff,0x01,0x00,0x00]
 
-v_frexp_exp_i32_f64 v255, s[0:1]
-// CHECK: [0x00,0x60,0xfe,0x7f]
+v_frexp_exp_i32_f64 v5, s[2:3]
+// CHECK: [0x02,0x60,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, s[2:3]
-// CHECK: [0x02,0x60,0x00,0x7e]
+v_frexp_exp_i32_f64 v255, s[2:3]
+// CHECK: [0x02,0x60,0xfe,0x7f]
 
-v_frexp_exp_i32_f64 v0, s[100:101]
-// CHECK: [0x64,0x60,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, s[4:5]
+// CHECK: [0x04,0x60,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, flat_scratch
-// CHECK: [0x66,0x60,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, s[100:101]
+// CHECK: [0x64,0x60,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, vcc
-// CHECK: [0x6a,0x60,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, flat_scratch
+// CHECK: [0x66,0x60,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, tba
-// CHECK: [0x6c,0x60,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, vcc
+// CHECK: [0x6a,0x60,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, tma
-// CHECK: [0x6e,0x60,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, tba
+// CHECK: [0x6c,0x60,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, ttmp[10:11]
-// CHECK: [0x7a,0x60,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, tma
+// CHECK: [0x6e,0x60,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, exec
-// CHECK: [0x7e,0x60,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, ttmp[10:11]
+// CHECK: [0x7a,0x60,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, 0
-// CHECK: [0x80,0x60,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, exec
+// CHECK: [0x7e,0x60,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, -1
-// CHECK: [0xc1,0x60,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, 0
+// CHECK: [0x80,0x60,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, 0.5
-// CHECK: [0xf0,0x60,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, -1
+// CHECK: [0xc1,0x60,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, -4.0
-// CHECK: [0xf7,0x60,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, 0.5
+// CHECK: [0xf0,0x60,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, 0xaf123456
-// CHECK: [0xff,0x60,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_frexp_exp_i32_f64 v5, -4.0
+// CHECK: [0xf7,0x60,0x0a,0x7e]
 
-v_frexp_exp_i32_f64 v0, 0x3f717273
-// CHECK: [0xff,0x60,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_frexp_exp_i32_f64 v5, 0xaf123456
+// CHECK: [0xff,0x60,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_frexp_exp_i32_f64 v0, v[0:1]
-// CHECK: [0x00,0x61,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, 0x3f717273
+// CHECK: [0xff,0x60,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_frexp_exp_i32_f64 v0, v[254:255]
-// CHECK: [0xfe,0x61,0x00,0x7e]
+v_frexp_exp_i32_f64 v5, v[1:2]
+// CHECK: [0x01,0x61,0x0a,0x7e]
 
-v_frexp_exp_i32_f64_e64 v0, s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd1,0x00,0x00,0x00,0x00]
+v_frexp_exp_i32_f64 v5, v[254:255]
+// CHECK: [0xfe,0x61,0x0a,0x7e]
 
-v_frexp_exp_i32_f64_e64 v255, s[0:1]
-// CHECK: [0xff,0x00,0x70,0xd1,0x00,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, s[2:3]
+// CHECK: [0x05,0x00,0x70,0xd1,0x02,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, s[2:3]
-// CHECK: [0x00,0x00,0x70,0xd1,0x02,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v255, s[2:3]
+// CHECK: [0xff,0x00,0x70,0xd1,0x02,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, s[100:101]
-// CHECK: [0x00,0x00,0x70,0xd1,0x64,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, s[4:5]
+// CHECK: [0x05,0x00,0x70,0xd1,0x04,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, flat_scratch
-// CHECK: [0x00,0x00,0x70,0xd1,0x66,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, s[100:101]
+// CHECK: [0x05,0x00,0x70,0xd1,0x64,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, vcc
-// CHECK: [0x00,0x00,0x70,0xd1,0x6a,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, flat_scratch
+// CHECK: [0x05,0x00,0x70,0xd1,0x66,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, tba
-// CHECK: [0x00,0x00,0x70,0xd1,0x6c,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, vcc
+// CHECK: [0x05,0x00,0x70,0xd1,0x6a,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, tma
-// CHECK: [0x00,0x00,0x70,0xd1,0x6e,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, tba
+// CHECK: [0x05,0x00,0x70,0xd1,0x6c,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, ttmp[10:11]
-// CHECK: [0x00,0x00,0x70,0xd1,0x7a,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, tma
+// CHECK: [0x05,0x00,0x70,0xd1,0x6e,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, exec
-// CHECK: [0x00,0x00,0x70,0xd1,0x7e,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, ttmp[10:11]
+// CHECK: [0x05,0x00,0x70,0xd1,0x7a,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, scc
-// CHECK: [0x00,0x00,0x70,0xd1,0xfd,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, exec
+// CHECK: [0x05,0x00,0x70,0xd1,0x7e,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, v[0:1]
-// CHECK: [0x00,0x00,0x70,0xd1,0x00,0x01,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, scc
+// CHECK: [0x05,0x00,0x70,0xd1,0xfd,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, v[254:255]
-// CHECK: [0x00,0x00,0x70,0xd1,0xfe,0x01,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, v[1:2]
+// CHECK: [0x05,0x00,0x70,0xd1,0x01,0x01,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, -s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd1,0x00,0x00,0x00,0x20]
+v_frexp_exp_i32_f64_e64 v5, v[254:255]
+// CHECK: [0x05,0x00,0x70,0xd1,0xfe,0x01,0x00,0x00]
 
-v_frexp_exp_i32_f64_e64 v0, |s[0:1]|
-// CHECK: [0x00,0x01,0x70,0xd1,0x00,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, -s[2:3]
+// CHECK: [0x05,0x00,0x70,0xd1,0x02,0x00,0x00,0x20]
 
-v_frexp_exp_i32_f64_e64 v0, s[0:1] clamp
-// CHECK: [0x00,0x80,0x70,0xd1,0x00,0x00,0x00,0x00]
+v_frexp_exp_i32_f64_e64 v5, |s[2:3]|
+// CHECK: [0x05,0x01,0x70,0xd1,0x02,0x00,0x00,0x00]
 
-v_frexp_mant_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x62,0x00,0x7e]
+v_frexp_exp_i32_f64_e64 v5, s[2:3] clamp
+// CHECK: [0x05,0x80,0x70,0xd1,0x02,0x00,0x00,0x00]
 
-v_frexp_mant_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x62,0xfc,0x7f]
+v_frexp_mant_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x62,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x62,0x00,0x7e]
+v_frexp_mant_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x62,0xfc,0x7f]
 
-v_frexp_mant_f64 v[0:1], s[100:101]
-// CHECK: [0x64,0x62,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x62,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], flat_scratch
-// CHECK: [0x66,0x62,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], s[100:101]
+// CHECK: [0x64,0x62,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], vcc
-// CHECK: [0x6a,0x62,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], flat_scratch
+// CHECK: [0x66,0x62,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], tba
-// CHECK: [0x6c,0x62,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], vcc
+// CHECK: [0x6a,0x62,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], tma
-// CHECK: [0x6e,0x62,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], tba
+// CHECK: [0x6c,0x62,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x62,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], tma
+// CHECK: [0x6e,0x62,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], exec
-// CHECK: [0x7e,0x62,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x62,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], 0
-// CHECK: [0x80,0x62,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], exec
+// CHECK: [0x7e,0x62,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], -1
-// CHECK: [0xc1,0x62,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], 0
+// CHECK: [0x80,0x62,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x62,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], -1
+// CHECK: [0xc1,0x62,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x62,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x62,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x62,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_frexp_mant_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x62,0x0a,0x7e]
 
-v_frexp_mant_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x62,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_frexp_mant_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x62,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_frexp_mant_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x63,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x62,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_frexp_mant_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x63,0x00,0x7e]
+v_frexp_mant_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x63,0x0a,0x7e]
 
-v_frexp_mant_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x71,0xd1,0x00,0x00,0x00,0x00]
+v_frexp_mant_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x63,0x0a,0x7e]
 
-v_frexp_mant_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x71,0xd1,0x00,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x71,0xd1,0x02,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x71,0xd1,0x02,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x71,0xd1,0x02,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], s[100:101]
-// CHECK: [0x00,0x00,0x71,0xd1,0x64,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x71,0xd1,0x04,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x71,0xd1,0x66,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], s[100:101]
+// CHECK: [0x05,0x00,0x71,0xd1,0x64,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x71,0xd1,0x6a,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x71,0xd1,0x66,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x71,0xd1,0x6c,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x71,0xd1,0x6a,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x71,0xd1,0x6e,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x71,0xd1,0x6c,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x71,0xd1,0x7a,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x71,0xd1,0x6e,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x71,0xd1,0x7e,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x71,0xd1,0x7a,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x71,0xd1,0xfd,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x71,0xd1,0x7e,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x71,0xd1,0x00,0x01,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x71,0xd1,0xfd,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x71,0xd1,0xfe,0x01,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x71,0xd1,0x01,0x01,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x71,0xd1,0x00,0x00,0x00,0x20]
+v_frexp_mant_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x71,0xd1,0xfe,0x01,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], |s[0:1]|
-// CHECK: [0x00,0x01,0x71,0xd1,0x00,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x71,0xd1,0x02,0x00,0x00,0x20]
 
-v_frexp_mant_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x71,0xd1,0x00,0x00,0x00,0x00]
+v_frexp_mant_f64_e64 v[5:6], |s[2:3]|
+// CHECK: [0x05,0x01,0x71,0xd1,0x02,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x71,0xd1,0x00,0x00,0x00,0x08]
+v_frexp_mant_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x80,0x71,0xd1,0x02,0x00,0x00,0x00]
 
-v_frexp_mant_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x71,0xd1,0x00,0x00,0x00,0x10]
+v_frexp_mant_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x71,0xd1,0x02,0x00,0x00,0x08]
 
-v_frexp_mant_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x71,0xd1,0x00,0x00,0x00,0x18]
+v_frexp_mant_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x71,0xd1,0x02,0x00,0x00,0x10]
 
-v_fract_f64 v[0:1], s[0:1]
-// CHECK: [0x00,0x64,0x00,0x7e]
+v_frexp_mant_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x71,0xd1,0x02,0x00,0x00,0x18]
 
-v_fract_f64 v[254:255], s[0:1]
-// CHECK: [0x00,0x64,0xfc,0x7f]
+v_fract_f64 v[5:6], s[2:3]
+// CHECK: [0x02,0x64,0x0a,0x7e]
 
-v_fract_f64 v[0:1], s[2:3]
-// CHECK: [0x02,0x64,0x00,0x7e]
+v_fract_f64 v[254:255], s[2:3]
+// CHECK: [0x02,0x64,0xfc,0x7f]
 
-v_fract_f64 v[0:1], s[100:101]
-// CHECK: [0x64,0x64,0x00,0x7e]
+v_fract_f64 v[5:6], s[4:5]
+// CHECK: [0x04,0x64,0x0a,0x7e]
 
-v_fract_f64 v[0:1], flat_scratch
-// CHECK: [0x66,0x64,0x00,0x7e]
+v_fract_f64 v[5:6], s[100:101]
+// CHECK: [0x64,0x64,0x0a,0x7e]
 
-v_fract_f64 v[0:1], vcc
-// CHECK: [0x6a,0x64,0x00,0x7e]
+v_fract_f64 v[5:6], flat_scratch
+// CHECK: [0x66,0x64,0x0a,0x7e]
 
-v_fract_f64 v[0:1], tba
-// CHECK: [0x6c,0x64,0x00,0x7e]
+v_fract_f64 v[5:6], vcc
+// CHECK: [0x6a,0x64,0x0a,0x7e]
 
-v_fract_f64 v[0:1], tma
-// CHECK: [0x6e,0x64,0x00,0x7e]
+v_fract_f64 v[5:6], tba
+// CHECK: [0x6c,0x64,0x0a,0x7e]
 
-v_fract_f64 v[0:1], ttmp[10:11]
-// CHECK: [0x7a,0x64,0x00,0x7e]
+v_fract_f64 v[5:6], tma
+// CHECK: [0x6e,0x64,0x0a,0x7e]
 
-v_fract_f64 v[0:1], exec
-// CHECK: [0x7e,0x64,0x00,0x7e]
+v_fract_f64 v[5:6], ttmp[10:11]
+// CHECK: [0x7a,0x64,0x0a,0x7e]
 
-v_fract_f64 v[0:1], 0
-// CHECK: [0x80,0x64,0x00,0x7e]
+v_fract_f64 v[5:6], exec
+// CHECK: [0x7e,0x64,0x0a,0x7e]
 
-v_fract_f64 v[0:1], -1
-// CHECK: [0xc1,0x64,0x00,0x7e]
+v_fract_f64 v[5:6], 0
+// CHECK: [0x80,0x64,0x0a,0x7e]
 
-v_fract_f64 v[0:1], 0.5
-// CHECK: [0xf0,0x64,0x00,0x7e]
+v_fract_f64 v[5:6], -1
+// CHECK: [0xc1,0x64,0x0a,0x7e]
 
-v_fract_f64 v[0:1], -4.0
-// CHECK: [0xf7,0x64,0x00,0x7e]
+v_fract_f64 v[5:6], 0.5
+// CHECK: [0xf0,0x64,0x0a,0x7e]
 
-v_fract_f64 v[0:1], 0xaf123456
-// CHECK: [0xff,0x64,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_fract_f64 v[5:6], -4.0
+// CHECK: [0xf7,0x64,0x0a,0x7e]
 
-v_fract_f64 v[0:1], 0x3f717273
-// CHECK: [0xff,0x64,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_fract_f64 v[5:6], 0xaf123456
+// CHECK: [0xff,0x64,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_fract_f64 v[0:1], v[0:1]
-// CHECK: [0x00,0x65,0x00,0x7e]
+v_fract_f64 v[5:6], 0x3f717273
+// CHECK: [0xff,0x64,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_fract_f64 v[0:1], v[254:255]
-// CHECK: [0xfe,0x65,0x00,0x7e]
+v_fract_f64 v[5:6], v[1:2]
+// CHECK: [0x01,0x65,0x0a,0x7e]
 
-v_fract_f64_e64 v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd1,0x00,0x00,0x00,0x00]
+v_fract_f64 v[5:6], v[254:255]
+// CHECK: [0xfe,0x65,0x0a,0x7e]
 
-v_fract_f64_e64 v[254:255], s[0:1]
-// CHECK: [0xfe,0x00,0x72,0xd1,0x00,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], s[2:3]
+// CHECK: [0x05,0x00,0x72,0xd1,0x02,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], s[2:3]
-// CHECK: [0x00,0x00,0x72,0xd1,0x02,0x00,0x00,0x00]
+v_fract_f64_e64 v[254:255], s[2:3]
+// CHECK: [0xfe,0x00,0x72,0xd1,0x02,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], s[100:101]
-// CHECK: [0x00,0x00,0x72,0xd1,0x64,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], s[4:5]
+// CHECK: [0x05,0x00,0x72,0xd1,0x04,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], flat_scratch
-// CHECK: [0x00,0x00,0x72,0xd1,0x66,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], s[100:101]
+// CHECK: [0x05,0x00,0x72,0xd1,0x64,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], vcc
-// CHECK: [0x00,0x00,0x72,0xd1,0x6a,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], flat_scratch
+// CHECK: [0x05,0x00,0x72,0xd1,0x66,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], tba
-// CHECK: [0x00,0x00,0x72,0xd1,0x6c,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], vcc
+// CHECK: [0x05,0x00,0x72,0xd1,0x6a,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], tma
-// CHECK: [0x00,0x00,0x72,0xd1,0x6e,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], tba
+// CHECK: [0x05,0x00,0x72,0xd1,0x6c,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], ttmp[10:11]
-// CHECK: [0x00,0x00,0x72,0xd1,0x7a,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], tma
+// CHECK: [0x05,0x00,0x72,0xd1,0x6e,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], exec
-// CHECK: [0x00,0x00,0x72,0xd1,0x7e,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], ttmp[10:11]
+// CHECK: [0x05,0x00,0x72,0xd1,0x7a,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], scc
-// CHECK: [0x00,0x00,0x72,0xd1,0xfd,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], exec
+// CHECK: [0x05,0x00,0x72,0xd1,0x7e,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x72,0xd1,0x00,0x01,0x00,0x00]
+v_fract_f64_e64 v[5:6], scc
+// CHECK: [0x05,0x00,0x72,0xd1,0xfd,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x72,0xd1,0xfe,0x01,0x00,0x00]
+v_fract_f64_e64 v[5:6], v[1:2]
+// CHECK: [0x05,0x00,0x72,0xd1,0x01,0x01,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd1,0x00,0x00,0x00,0x20]
+v_fract_f64_e64 v[5:6], v[254:255]
+// CHECK: [0x05,0x00,0x72,0xd1,0xfe,0x01,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], |s[0:1]|
-// CHECK: [0x00,0x01,0x72,0xd1,0x00,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], -s[2:3]
+// CHECK: [0x05,0x00,0x72,0xd1,0x02,0x00,0x00,0x20]
 
-v_fract_f64_e64 v[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x72,0xd1,0x00,0x00,0x00,0x00]
+v_fract_f64_e64 v[5:6], |s[2:3]|
+// CHECK: [0x05,0x01,0x72,0xd1,0x02,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x72,0xd1,0x00,0x00,0x00,0x08]
+v_fract_f64_e64 v[5:6], s[2:3] clamp
+// CHECK: [0x05,0x80,0x72,0xd1,0x02,0x00,0x00,0x00]
 
-v_fract_f64_e64 v[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x72,0xd1,0x00,0x00,0x00,0x10]
+v_fract_f64_e64 v[5:6], s[2:3] mul:2
+// CHECK: [0x05,0x00,0x72,0xd1,0x02,0x00,0x00,0x08]
 
-v_fract_f64_e64 v[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x72,0xd1,0x00,0x00,0x00,0x18]
+v_fract_f64_e64 v[5:6], s[2:3] mul:4
+// CHECK: [0x05,0x00,0x72,0xd1,0x02,0x00,0x00,0x10]
 
-v_frexp_exp_i32_f32 v0, s0
-// CHECK: [0x00,0x66,0x00,0x7e]
+v_fract_f64_e64 v[5:6], s[2:3] div:2
+// CHECK: [0x05,0x00,0x72,0xd1,0x02,0x00,0x00,0x18]
 
-v_frexp_exp_i32_f32 v255, s0
-// CHECK: [0x00,0x66,0xfe,0x7f]
+v_frexp_exp_i32_f32 v5, s1
+// CHECK: [0x01,0x66,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, s101
-// CHECK: [0x65,0x66,0x00,0x7e]
+v_frexp_exp_i32_f32 v255, s1
+// CHECK: [0x01,0x66,0xfe,0x7f]
 
-v_frexp_exp_i32_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x66,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, s101
+// CHECK: [0x65,0x66,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x66,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x66,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, vcc_lo
-// CHECK: [0x6a,0x66,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x66,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, vcc_hi
-// CHECK: [0x6b,0x66,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, vcc_lo
+// CHECK: [0x6a,0x66,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, tba_lo
-// CHECK: [0x6c,0x66,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, vcc_hi
+// CHECK: [0x6b,0x66,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, tba_hi
-// CHECK: [0x6d,0x66,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, tba_lo
+// CHECK: [0x6c,0x66,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, tma_lo
-// CHECK: [0x6e,0x66,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, tba_hi
+// CHECK: [0x6d,0x66,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, tma_hi
-// CHECK: [0x6f,0x66,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, tma_lo
+// CHECK: [0x6e,0x66,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, ttmp11
-// CHECK: [0x7b,0x66,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, tma_hi
+// CHECK: [0x6f,0x66,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, m0
-// CHECK: [0x7c,0x66,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, ttmp11
+// CHECK: [0x7b,0x66,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, exec_lo
-// CHECK: [0x7e,0x66,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, m0
+// CHECK: [0x7c,0x66,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, exec_hi
-// CHECK: [0x7f,0x66,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, exec_lo
+// CHECK: [0x7e,0x66,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, 0
-// CHECK: [0x80,0x66,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, exec_hi
+// CHECK: [0x7f,0x66,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, -1
-// CHECK: [0xc1,0x66,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, 0
+// CHECK: [0x80,0x66,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, 0.5
-// CHECK: [0xf0,0x66,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, -1
+// CHECK: [0xc1,0x66,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, -4.0
-// CHECK: [0xf7,0x66,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, 0.5
+// CHECK: [0xf0,0x66,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, 0xaf123456
-// CHECK: [0xff,0x66,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_frexp_exp_i32_f32 v5, -4.0
+// CHECK: [0xf7,0x66,0x0a,0x7e]
 
-v_frexp_exp_i32_f32 v0, 0x3f717273
-// CHECK: [0xff,0x66,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_frexp_exp_i32_f32 v5, 0xaf123456
+// CHECK: [0xff,0x66,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_frexp_exp_i32_f32 v0, v0
-// CHECK: [0x00,0x67,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, 0x3f717273
+// CHECK: [0xff,0x66,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_frexp_exp_i32_f32 v0, v255
-// CHECK: [0xff,0x67,0x00,0x7e]
+v_frexp_exp_i32_f32 v5, v1
+// CHECK: [0x01,0x67,0x0a,0x7e]
 
-v_frexp_exp_i32_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x73,0xd1,0x00,0x00,0x00,0x00]
+v_frexp_exp_i32_f32 v5, v255
+// CHECK: [0xff,0x67,0x0a,0x7e]
 
-v_frexp_exp_i32_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x73,0xd1,0x00,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x73,0xd1,0x01,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x73,0xd1,0x65,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x73,0xd1,0x01,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x73,0xd1,0x66,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x73,0xd1,0x65,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x73,0xd1,0x67,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x73,0xd1,0x66,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x73,0xd1,0x6a,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x73,0xd1,0x67,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x73,0xd1,0x6b,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x73,0xd1,0x6a,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x73,0xd1,0x6c,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x73,0xd1,0x6b,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x73,0xd1,0x6d,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x73,0xd1,0x6c,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x73,0xd1,0x6e,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x73,0xd1,0x6d,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x73,0xd1,0x6f,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x73,0xd1,0x6e,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x73,0xd1,0x7b,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x73,0xd1,0x6f,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x73,0xd1,0x7c,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x73,0xd1,0x7b,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x73,0xd1,0x7e,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x73,0xd1,0x7c,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x73,0xd1,0x7f,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x73,0xd1,0x7e,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x73,0xd1,0xfd,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x73,0xd1,0x7f,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x73,0xd1,0x00,0x01,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x73,0xd1,0xfd,0x00,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x73,0xd1,0xff,0x01,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x73,0xd1,0x01,0x01,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x73,0xd1,0x00,0x00,0x00,0x20]
+v_frexp_exp_i32_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x73,0xd1,0xff,0x01,0x00,0x00]
 
-v_frexp_exp_i32_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x73,0xd1,0x00,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x73,0xd1,0x01,0x00,0x00,0x20]
 
-v_frexp_exp_i32_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x73,0xd1,0x00,0x00,0x00,0x00]
+v_frexp_exp_i32_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x73,0xd1,0x01,0x00,0x00,0x00]
 
-v_frexp_mant_f32 v0, s0
-// CHECK: [0x00,0x68,0x00,0x7e]
+v_frexp_exp_i32_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x73,0xd1,0x01,0x00,0x00,0x00]
 
-v_frexp_mant_f32 v255, s0
-// CHECK: [0x00,0x68,0xfe,0x7f]
+v_frexp_mant_f32 v5, s1
+// CHECK: [0x01,0x68,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, s101
-// CHECK: [0x65,0x68,0x00,0x7e]
+v_frexp_mant_f32 v255, s1
+// CHECK: [0x01,0x68,0xfe,0x7f]
 
-v_frexp_mant_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x68,0x00,0x7e]
+v_frexp_mant_f32 v5, s101
+// CHECK: [0x65,0x68,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x68,0x00,0x7e]
+v_frexp_mant_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x68,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, vcc_lo
-// CHECK: [0x6a,0x68,0x00,0x7e]
+v_frexp_mant_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x68,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, vcc_hi
-// CHECK: [0x6b,0x68,0x00,0x7e]
+v_frexp_mant_f32 v5, vcc_lo
+// CHECK: [0x6a,0x68,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, tba_lo
-// CHECK: [0x6c,0x68,0x00,0x7e]
+v_frexp_mant_f32 v5, vcc_hi
+// CHECK: [0x6b,0x68,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, tba_hi
-// CHECK: [0x6d,0x68,0x00,0x7e]
+v_frexp_mant_f32 v5, tba_lo
+// CHECK: [0x6c,0x68,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, tma_lo
-// CHECK: [0x6e,0x68,0x00,0x7e]
+v_frexp_mant_f32 v5, tba_hi
+// CHECK: [0x6d,0x68,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, tma_hi
-// CHECK: [0x6f,0x68,0x00,0x7e]
+v_frexp_mant_f32 v5, tma_lo
+// CHECK: [0x6e,0x68,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, ttmp11
-// CHECK: [0x7b,0x68,0x00,0x7e]
+v_frexp_mant_f32 v5, tma_hi
+// CHECK: [0x6f,0x68,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, m0
-// CHECK: [0x7c,0x68,0x00,0x7e]
+v_frexp_mant_f32 v5, ttmp11
+// CHECK: [0x7b,0x68,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, exec_lo
-// CHECK: [0x7e,0x68,0x00,0x7e]
+v_frexp_mant_f32 v5, m0
+// CHECK: [0x7c,0x68,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, exec_hi
-// CHECK: [0x7f,0x68,0x00,0x7e]
+v_frexp_mant_f32 v5, exec_lo
+// CHECK: [0x7e,0x68,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, 0
-// CHECK: [0x80,0x68,0x00,0x7e]
+v_frexp_mant_f32 v5, exec_hi
+// CHECK: [0x7f,0x68,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, -1
-// CHECK: [0xc1,0x68,0x00,0x7e]
+v_frexp_mant_f32 v5, 0
+// CHECK: [0x80,0x68,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, 0.5
-// CHECK: [0xf0,0x68,0x00,0x7e]
+v_frexp_mant_f32 v5, -1
+// CHECK: [0xc1,0x68,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, -4.0
-// CHECK: [0xf7,0x68,0x00,0x7e]
+v_frexp_mant_f32 v5, 0.5
+// CHECK: [0xf0,0x68,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, 0xaf123456
-// CHECK: [0xff,0x68,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_frexp_mant_f32 v5, -4.0
+// CHECK: [0xf7,0x68,0x0a,0x7e]
 
-v_frexp_mant_f32 v0, 0x3f717273
-// CHECK: [0xff,0x68,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_frexp_mant_f32 v5, 0xaf123456
+// CHECK: [0xff,0x68,0x0a,0x7e,0x56,0x34,0x12,0xaf]
 
-v_frexp_mant_f32 v0, v0
-// CHECK: [0x00,0x69,0x00,0x7e]
+v_frexp_mant_f32 v5, 0x3f717273
+// CHECK: [0xff,0x68,0x0a,0x7e,0x73,0x72,0x71,0x3f]
 
-v_frexp_mant_f32 v0, v255
-// CHECK: [0xff,0x69,0x00,0x7e]
+v_frexp_mant_f32 v5, v1
+// CHECK: [0x01,0x69,0x0a,0x7e]
 
-v_frexp_mant_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x74,0xd1,0x00,0x00,0x00,0x00]
+v_frexp_mant_f32 v5, v255
+// CHECK: [0xff,0x69,0x0a,0x7e]
 
-v_frexp_mant_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x74,0xd1,0x00,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x74,0xd1,0x01,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x74,0xd1,0x65,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x74,0xd1,0x01,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x74,0xd1,0x66,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x74,0xd1,0x65,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x74,0xd1,0x67,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x74,0xd1,0x66,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x74,0xd1,0x6a,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x74,0xd1,0x67,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x74,0xd1,0x6b,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x74,0xd1,0x6a,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x74,0xd1,0x6c,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x74,0xd1,0x6b,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x74,0xd1,0x6d,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x74,0xd1,0x6c,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x74,0xd1,0x6e,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x74,0xd1,0x6d,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x74,0xd1,0x6f,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x74,0xd1,0x6e,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x74,0xd1,0x7b,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x74,0xd1,0x6f,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x74,0xd1,0x7c,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x74,0xd1,0x7b,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x74,0xd1,0x7e,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x74,0xd1,0x7c,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x74,0xd1,0x7f,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x74,0xd1,0x7e,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x74,0xd1,0xfd,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x74,0xd1,0x7f,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x74,0xd1,0x00,0x01,0x00,0x00]
+v_frexp_mant_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x74,0xd1,0xfd,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x74,0xd1,0xff,0x01,0x00,0x00]
+v_frexp_mant_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x74,0xd1,0x01,0x01,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x74,0xd1,0x00,0x00,0x00,0x20]
+v_frexp_mant_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x74,0xd1,0xff,0x01,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x74,0xd1,0x00,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x74,0xd1,0x01,0x00,0x00,0x20]
 
-v_frexp_mant_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x74,0xd1,0x00,0x00,0x00,0x00]
+v_frexp_mant_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x74,0xd1,0x01,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x74,0xd1,0x00,0x00,0x00,0x08]
+v_frexp_mant_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x74,0xd1,0x01,0x00,0x00,0x00]
 
-v_frexp_mant_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x74,0xd1,0x00,0x00,0x00,0x10]
+v_frexp_mant_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x74,0xd1,0x01,0x00,0x00,0x08]
 
-v_frexp_mant_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x74,0xd1,0x00,0x00,0x00,0x18]
+v_frexp_mant_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x74,0xd1,0x01,0x00,0x00,0x10]
+
+v_frexp_mant_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x74,0xd1,0x01,0x00,0x00,0x18]
 
 v_clrexcp
 // CHECK: [0x00,0x6a,0x00,0x7e]
@@ -29646,60584 +29822,69026 @@ v_clrexcp
 v_clrexcp_e64
 // CHECK: [0x00,0x00,0x75,0xd1,0x00,0x00,0x00,0x00]
 
-v_movreld_b32 v0, m0
-// CHECK: [0x7c,0x6c,0x00,0x7e]
+v_movreld_b32 v5, m0
+// CHECK: [0x7c,0x6c,0x0a,0x7e]
 
 v_movreld_b32 v255, m0
 // CHECK: [0x7c,0x6c,0xfe,0x7f]
 
-v_movreld_b32 v0, 0
-// CHECK: [0x80,0x6c,0x00,0x7e]
+v_movreld_b32 v5, 0
+// CHECK: [0x80,0x6c,0x0a,0x7e]
 
-v_movreld_b32 v0, -1
-// CHECK: [0xc1,0x6c,0x00,0x7e]
+v_movreld_b32 v5, -1
+// CHECK: [0xc1,0x6c,0x0a,0x7e]
 
-v_movreld_b32 v0, 0.5
-// CHECK: [0xf0,0x6c,0x00,0x7e]
+v_movreld_b32 v5, 0.5
+// CHECK: [0xf0,0x6c,0x0a,0x7e]
 
-v_movreld_b32 v0, -4.0
-// CHECK: [0xf7,0x6c,0x00,0x7e]
+v_movreld_b32 v5, -4.0
+// CHECK: [0xf7,0x6c,0x0a,0x7e]
 
-v_movreld_b32 v0, v0
-// CHECK: [0x00,0x6d,0x00,0x7e]
+v_movreld_b32 v5, v1
+// CHECK: [0x01,0x6d,0x0a,0x7e]
 
-v_movreld_b32 v0, v255
-// CHECK: [0xff,0x6d,0x00,0x7e]
+v_movreld_b32 v5, v255
+// CHECK: [0xff,0x6d,0x0a,0x7e]
 
-v_movreld_b32_e64 v0, m0
-// CHECK: [0x00,0x00,0x76,0xd1,0x7c,0x00,0x00,0x00]
+v_movreld_b32_e64 v5, m0
+// CHECK: [0x05,0x00,0x76,0xd1,0x7c,0x00,0x00,0x00]
 
 v_movreld_b32_e64 v255, m0
 // CHECK: [0xff,0x00,0x76,0xd1,0x7c,0x00,0x00,0x00]
 
-v_movreld_b32_e64 v0, 0
-// CHECK: [0x00,0x00,0x76,0xd1,0x80,0x00,0x00,0x00]
+v_movreld_b32_e64 v5, 0
+// CHECK: [0x05,0x00,0x76,0xd1,0x80,0x00,0x00,0x00]
+
+v_movreld_b32_e64 v5, -1
+// CHECK: [0x05,0x00,0x76,0xd1,0xc1,0x00,0x00,0x00]
+
+v_movreld_b32_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x76,0xd1,0xf0,0x00,0x00,0x00]
+
+v_movreld_b32_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x76,0xd1,0xf7,0x00,0x00,0x00]
+
+v_movreld_b32_e64 v5, v1
+// CHECK: [0x05,0x00,0x76,0xd1,0x01,0x01,0x00,0x00]
+
+v_movreld_b32_e64 v5, v255
+// CHECK: [0x05,0x00,0x76,0xd1,0xff,0x01,0x00,0x00]
+
+v_movrels_b32 v5, v1
+// CHECK: [0x01,0x6f,0x0a,0x7e]
+
+v_movrels_b32 v255, v1
+// CHECK: [0x01,0x6f,0xfe,0x7f]
+
+v_movrels_b32 v5, v255
+// CHECK: [0xff,0x6f,0x0a,0x7e]
+
+v_movrels_b32_e64 v5, v1
+// CHECK: [0x05,0x00,0x77,0xd1,0x01,0x01,0x00,0x00]
+
+v_movrels_b32_e64 v255, v1
+// CHECK: [0xff,0x00,0x77,0xd1,0x01,0x01,0x00,0x00]
+
+v_movrels_b32_e64 v5, v255
+// CHECK: [0x05,0x00,0x77,0xd1,0xff,0x01,0x00,0x00]
+
+v_movrelsd_b32 v5, v1
+// CHECK: [0x01,0x71,0x0a,0x7e]
+
+v_movrelsd_b32 v255, v1
+// CHECK: [0x01,0x71,0xfe,0x7f]
+
+v_movrelsd_b32 v5, v255
+// CHECK: [0xff,0x71,0x0a,0x7e]
+
+v_movrelsd_b32_e64 v5, v1
+// CHECK: [0x05,0x00,0x78,0xd1,0x01,0x01,0x00,0x00]
+
+v_movrelsd_b32_e64 v255, v1
+// CHECK: [0xff,0x00,0x78,0xd1,0x01,0x01,0x00,0x00]
+
+v_movrelsd_b32_e64 v5, v255
+// CHECK: [0x05,0x00,0x78,0xd1,0xff,0x01,0x00,0x00]
+
+v_cvt_f16_u16 v5, s1
+// CHECK: [0x01,0x72,0x0a,0x7e]
+
+v_cvt_f16_u16 v255, s1
+// CHECK: [0x01,0x72,0xfe,0x7f]
+
+v_cvt_f16_u16 v5, s101
+// CHECK: [0x65,0x72,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, flat_scratch_lo
+// CHECK: [0x66,0x72,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, flat_scratch_hi
+// CHECK: [0x67,0x72,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, vcc_lo
+// CHECK: [0x6a,0x72,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, vcc_hi
+// CHECK: [0x6b,0x72,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, tba_lo
+// CHECK: [0x6c,0x72,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, tba_hi
+// CHECK: [0x6d,0x72,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, tma_lo
+// CHECK: [0x6e,0x72,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, tma_hi
+// CHECK: [0x6f,0x72,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, ttmp11
+// CHECK: [0x7b,0x72,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, m0
+// CHECK: [0x7c,0x72,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, exec_lo
+// CHECK: [0x7e,0x72,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, exec_hi
+// CHECK: [0x7f,0x72,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, 0
+// CHECK: [0x80,0x72,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, -1
+// CHECK: [0xc1,0x72,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, 0.5
+// CHECK: [0xf0,0x72,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, -4.0
+// CHECK: [0xf7,0x72,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, 0xfe0b
+// CHECK: [0xff,0x72,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cvt_f16_u16 v5, 0x3456
+// CHECK: [0xff,0x72,0x0a,0x7e,0x56,0x34,0x00,0x00]
+
+v_cvt_f16_u16 v5, v1
+// CHECK: [0x01,0x73,0x0a,0x7e]
+
+v_cvt_f16_u16 v5, v255
+// CHECK: [0xff,0x73,0x0a,0x7e]
+
+v_cvt_f16_u16_e64 v5, s1
+// CHECK: [0x05,0x00,0x79,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_f16_u16_e64 v255, s1
+// CHECK: [0xff,0x00,0x79,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_f16_u16_e64 v5, s101
+// CHECK: [0x05,0x00,0x79,0xd1,0x65,0x00,0x00,0x00]
+
+v_cvt_f16_u16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x79,0xd1,0x66,0x00,0x00,0x00]
+
+v_cvt_f16_u16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x79,0xd1,0x67,0x00,0x00,0x00]
+
+v_cvt_f16_u16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x79,0xd1,0x6a,0x00,0x00,0x00]
+
+v_cvt_f16_u16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x79,0xd1,0x6b,0x00,0x00,0x00]
+
+v_cvt_f16_u16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x79,0xd1,0x6c,0x00,0x00,0x00]
+
+v_cvt_f16_u16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x79,0xd1,0x6d,0x00,0x00,0x00]
+
+v_cvt_f16_u16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x79,0xd1,0x6e,0x00,0x00,0x00]
+
+v_cvt_f16_u16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x79,0xd1,0x6f,0x00,0x00,0x00]
+
+v_cvt_f16_u16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x79,0xd1,0x7b,0x00,0x00,0x00]
+
+v_cvt_f16_u16_e64 v5, m0
+// CHECK: [0x05,0x00,0x79,0xd1,0x7c,0x00,0x00,0x00]
+
+v_cvt_f16_u16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x79,0xd1,0x7e,0x00,0x00,0x00]
+
+v_cvt_f16_u16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x79,0xd1,0x7f,0x00,0x00,0x00]
+
+v_cvt_f16_u16_e64 v5, 0
+// CHECK: [0x05,0x00,0x79,0xd1,0x80,0x00,0x00,0x00]
+
+v_cvt_f16_u16_e64 v5, -1
+// CHECK: [0x05,0x00,0x79,0xd1,0xc1,0x00,0x00,0x00]
+
+v_cvt_f16_u16_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x79,0xd1,0xf0,0x00,0x00,0x00]
+
+v_cvt_f16_u16_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x79,0xd1,0xf7,0x00,0x00,0x00]
+
+v_cvt_f16_u16_e64 v5, v1
+// CHECK: [0x05,0x00,0x79,0xd1,0x01,0x01,0x00,0x00]
+
+v_cvt_f16_u16_e64 v5, v255
+// CHECK: [0x05,0x00,0x79,0xd1,0xff,0x01,0x00,0x00]
+
+v_cvt_f16_i16 v5, s1
+// CHECK: [0x01,0x74,0x0a,0x7e]
+
+v_cvt_f16_i16 v255, s1
+// CHECK: [0x01,0x74,0xfe,0x7f]
+
+v_cvt_f16_i16 v5, s101
+// CHECK: [0x65,0x74,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, flat_scratch_lo
+// CHECK: [0x66,0x74,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, flat_scratch_hi
+// CHECK: [0x67,0x74,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, vcc_lo
+// CHECK: [0x6a,0x74,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, vcc_hi
+// CHECK: [0x6b,0x74,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, tba_lo
+// CHECK: [0x6c,0x74,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, tba_hi
+// CHECK: [0x6d,0x74,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, tma_lo
+// CHECK: [0x6e,0x74,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, tma_hi
+// CHECK: [0x6f,0x74,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, ttmp11
+// CHECK: [0x7b,0x74,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, m0
+// CHECK: [0x7c,0x74,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, exec_lo
+// CHECK: [0x7e,0x74,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, exec_hi
+// CHECK: [0x7f,0x74,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, 0
+// CHECK: [0x80,0x74,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, -1
+// CHECK: [0xc1,0x74,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, 0.5
+// CHECK: [0xf0,0x74,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, -4.0
+// CHECK: [0xf7,0x74,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, 0xfe0b
+// CHECK: [0xff,0x74,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cvt_f16_i16 v5, 0x3456
+// CHECK: [0xff,0x74,0x0a,0x7e,0x56,0x34,0x00,0x00]
+
+v_cvt_f16_i16 v5, v1
+// CHECK: [0x01,0x75,0x0a,0x7e]
+
+v_cvt_f16_i16 v5, v255
+// CHECK: [0xff,0x75,0x0a,0x7e]
+
+v_cvt_f16_i16_e64 v5, s1
+// CHECK: [0x05,0x00,0x7a,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_f16_i16_e64 v255, s1
+// CHECK: [0xff,0x00,0x7a,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_f16_i16_e64 v5, s101
+// CHECK: [0x05,0x00,0x7a,0xd1,0x65,0x00,0x00,0x00]
+
+v_cvt_f16_i16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x7a,0xd1,0x66,0x00,0x00,0x00]
+
+v_cvt_f16_i16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x7a,0xd1,0x67,0x00,0x00,0x00]
+
+v_cvt_f16_i16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x7a,0xd1,0x6a,0x00,0x00,0x00]
+
+v_cvt_f16_i16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x7a,0xd1,0x6b,0x00,0x00,0x00]
+
+v_cvt_f16_i16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x7a,0xd1,0x6c,0x00,0x00,0x00]
+
+v_cvt_f16_i16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x7a,0xd1,0x6d,0x00,0x00,0x00]
+
+v_cvt_f16_i16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x7a,0xd1,0x6e,0x00,0x00,0x00]
+
+v_cvt_f16_i16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x7a,0xd1,0x6f,0x00,0x00,0x00]
+
+v_cvt_f16_i16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x7a,0xd1,0x7b,0x00,0x00,0x00]
+
+v_cvt_f16_i16_e64 v5, m0
+// CHECK: [0x05,0x00,0x7a,0xd1,0x7c,0x00,0x00,0x00]
+
+v_cvt_f16_i16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x7a,0xd1,0x7e,0x00,0x00,0x00]
+
+v_cvt_f16_i16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x7a,0xd1,0x7f,0x00,0x00,0x00]
+
+v_cvt_f16_i16_e64 v5, 0
+// CHECK: [0x05,0x00,0x7a,0xd1,0x80,0x00,0x00,0x00]
+
+v_cvt_f16_i16_e64 v5, -1
+// CHECK: [0x05,0x00,0x7a,0xd1,0xc1,0x00,0x00,0x00]
+
+v_cvt_f16_i16_e64 v5, 0.5
+// CHECK: [0x05,0x00,0x7a,0xd1,0xf0,0x00,0x00,0x00]
+
+v_cvt_f16_i16_e64 v5, -4.0
+// CHECK: [0x05,0x00,0x7a,0xd1,0xf7,0x00,0x00,0x00]
+
+v_cvt_f16_i16_e64 v5, v1
+// CHECK: [0x05,0x00,0x7a,0xd1,0x01,0x01,0x00,0x00]
+
+v_cvt_f16_i16_e64 v5, v255
+// CHECK: [0x05,0x00,0x7a,0xd1,0xff,0x01,0x00,0x00]
+
+v_cvt_u16_f16 v5, s1
+// CHECK: [0x01,0x76,0x0a,0x7e]
+
+v_cvt_u16_f16 v255, s1
+// CHECK: [0x01,0x76,0xfe,0x7f]
+
+v_cvt_u16_f16 v5, s101
+// CHECK: [0x65,0x76,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, flat_scratch_lo
+// CHECK: [0x66,0x76,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, flat_scratch_hi
+// CHECK: [0x67,0x76,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, vcc_lo
+// CHECK: [0x6a,0x76,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, vcc_hi
+// CHECK: [0x6b,0x76,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, tba_lo
+// CHECK: [0x6c,0x76,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, tba_hi
+// CHECK: [0x6d,0x76,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, tma_lo
+// CHECK: [0x6e,0x76,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, tma_hi
+// CHECK: [0x6f,0x76,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, ttmp11
+// CHECK: [0x7b,0x76,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, m0
+// CHECK: [0x7c,0x76,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, exec_lo
+// CHECK: [0x7e,0x76,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, exec_hi
+// CHECK: [0x7f,0x76,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, 0
+// CHECK: [0x80,0x76,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, -1
+// CHECK: [0xc1,0x76,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, 0.5
+// CHECK: [0xf0,0x76,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, -4.0
+// CHECK: [0xf7,0x76,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, 0xfe0b
+// CHECK: [0xff,0x76,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cvt_u16_f16 v5, 0x3456
+// CHECK: [0xff,0x76,0x0a,0x7e,0x56,0x34,0x00,0x00]
+
+v_cvt_u16_f16 v5, v1
+// CHECK: [0x01,0x77,0x0a,0x7e]
+
+v_cvt_u16_f16 v5, v255
+// CHECK: [0xff,0x77,0x0a,0x7e]
+
+v_cvt_u16_f16_e64 v5, s1
+// CHECK: [0x05,0x00,0x7b,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_u16_f16_e64 v255, s1
+// CHECK: [0xff,0x00,0x7b,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_u16_f16_e64 v5, s101
+// CHECK: [0x05,0x00,0x7b,0xd1,0x65,0x00,0x00,0x00]
+
+v_cvt_u16_f16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x7b,0xd1,0x66,0x00,0x00,0x00]
+
+v_cvt_u16_f16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x7b,0xd1,0x67,0x00,0x00,0x00]
+
+v_cvt_u16_f16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x7b,0xd1,0x6a,0x00,0x00,0x00]
+
+v_cvt_u16_f16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x7b,0xd1,0x6b,0x00,0x00,0x00]
+
+v_cvt_u16_f16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x7b,0xd1,0x6c,0x00,0x00,0x00]
+
+v_cvt_u16_f16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x7b,0xd1,0x6d,0x00,0x00,0x00]
+
+v_cvt_u16_f16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x7b,0xd1,0x6e,0x00,0x00,0x00]
+
+v_cvt_u16_f16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x7b,0xd1,0x6f,0x00,0x00,0x00]
+
+v_cvt_u16_f16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x7b,0xd1,0x7b,0x00,0x00,0x00]
+
+v_cvt_u16_f16_e64 v5, m0
+// CHECK: [0x05,0x00,0x7b,0xd1,0x7c,0x00,0x00,0x00]
+
+v_cvt_u16_f16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x7b,0xd1,0x7e,0x00,0x00,0x00]
+
+v_cvt_u16_f16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x7b,0xd1,0x7f,0x00,0x00,0x00]
+
+v_cvt_u16_f16_e64 v5, scc
+// CHECK: [0x05,0x00,0x7b,0xd1,0xfd,0x00,0x00,0x00]
+
+v_cvt_u16_f16_e64 v5, v1
+// CHECK: [0x05,0x00,0x7b,0xd1,0x01,0x01,0x00,0x00]
+
+v_cvt_u16_f16_e64 v5, v255
+// CHECK: [0x05,0x00,0x7b,0xd1,0xff,0x01,0x00,0x00]
+
+v_cvt_u16_f16_e64 v5, -s1
+// CHECK: [0x05,0x00,0x7b,0xd1,0x01,0x00,0x00,0x20]
+
+v_cvt_u16_f16_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x7b,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_u16_f16_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x7b,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_i16_f16 v5, s1
+// CHECK: [0x01,0x78,0x0a,0x7e]
+
+v_cvt_i16_f16 v255, s1
+// CHECK: [0x01,0x78,0xfe,0x7f]
+
+v_cvt_i16_f16 v5, s101
+// CHECK: [0x65,0x78,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, flat_scratch_lo
+// CHECK: [0x66,0x78,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, flat_scratch_hi
+// CHECK: [0x67,0x78,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, vcc_lo
+// CHECK: [0x6a,0x78,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, vcc_hi
+// CHECK: [0x6b,0x78,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, tba_lo
+// CHECK: [0x6c,0x78,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, tba_hi
+// CHECK: [0x6d,0x78,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, tma_lo
+// CHECK: [0x6e,0x78,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, tma_hi
+// CHECK: [0x6f,0x78,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, ttmp11
+// CHECK: [0x7b,0x78,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, m0
+// CHECK: [0x7c,0x78,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, exec_lo
+// CHECK: [0x7e,0x78,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, exec_hi
+// CHECK: [0x7f,0x78,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, 0
+// CHECK: [0x80,0x78,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, -1
+// CHECK: [0xc1,0x78,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, 0.5
+// CHECK: [0xf0,0x78,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, -4.0
+// CHECK: [0xf7,0x78,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, 0xfe0b
+// CHECK: [0xff,0x78,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cvt_i16_f16 v5, 0x3456
+// CHECK: [0xff,0x78,0x0a,0x7e,0x56,0x34,0x00,0x00]
+
+v_cvt_i16_f16 v5, v1
+// CHECK: [0x01,0x79,0x0a,0x7e]
+
+v_cvt_i16_f16 v5, v255
+// CHECK: [0xff,0x79,0x0a,0x7e]
+
+v_cvt_i16_f16_e64 v5, s1
+// CHECK: [0x05,0x00,0x7c,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_i16_f16_e64 v255, s1
+// CHECK: [0xff,0x00,0x7c,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_i16_f16_e64 v5, s101
+// CHECK: [0x05,0x00,0x7c,0xd1,0x65,0x00,0x00,0x00]
+
+v_cvt_i16_f16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x7c,0xd1,0x66,0x00,0x00,0x00]
+
+v_cvt_i16_f16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x7c,0xd1,0x67,0x00,0x00,0x00]
+
+v_cvt_i16_f16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x7c,0xd1,0x6a,0x00,0x00,0x00]
+
+v_cvt_i16_f16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x7c,0xd1,0x6b,0x00,0x00,0x00]
+
+v_cvt_i16_f16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x7c,0xd1,0x6c,0x00,0x00,0x00]
+
+v_cvt_i16_f16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x7c,0xd1,0x6d,0x00,0x00,0x00]
+
+v_cvt_i16_f16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x7c,0xd1,0x6e,0x00,0x00,0x00]
+
+v_cvt_i16_f16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x7c,0xd1,0x6f,0x00,0x00,0x00]
+
+v_cvt_i16_f16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x7c,0xd1,0x7b,0x00,0x00,0x00]
+
+v_cvt_i16_f16_e64 v5, m0
+// CHECK: [0x05,0x00,0x7c,0xd1,0x7c,0x00,0x00,0x00]
+
+v_cvt_i16_f16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x7c,0xd1,0x7e,0x00,0x00,0x00]
+
+v_cvt_i16_f16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x7c,0xd1,0x7f,0x00,0x00,0x00]
+
+v_cvt_i16_f16_e64 v5, scc
+// CHECK: [0x05,0x00,0x7c,0xd1,0xfd,0x00,0x00,0x00]
+
+v_cvt_i16_f16_e64 v5, v1
+// CHECK: [0x05,0x00,0x7c,0xd1,0x01,0x01,0x00,0x00]
+
+v_cvt_i16_f16_e64 v5, v255
+// CHECK: [0x05,0x00,0x7c,0xd1,0xff,0x01,0x00,0x00]
+
+v_cvt_i16_f16_e64 v5, -s1
+// CHECK: [0x05,0x00,0x7c,0xd1,0x01,0x00,0x00,0x20]
+
+v_cvt_i16_f16_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x7c,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_i16_f16_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x7c,0xd1,0x01,0x00,0x00,0x00]
+
+v_rcp_f16 v5, s1
+// CHECK: [0x01,0x7a,0x0a,0x7e]
+
+v_rcp_f16 v255, s1
+// CHECK: [0x01,0x7a,0xfe,0x7f]
+
+v_rcp_f16 v5, s101
+// CHECK: [0x65,0x7a,0x0a,0x7e]
+
+v_rcp_f16 v5, flat_scratch_lo
+// CHECK: [0x66,0x7a,0x0a,0x7e]
+
+v_rcp_f16 v5, flat_scratch_hi
+// CHECK: [0x67,0x7a,0x0a,0x7e]
+
+v_rcp_f16 v5, vcc_lo
+// CHECK: [0x6a,0x7a,0x0a,0x7e]
+
+v_rcp_f16 v5, vcc_hi
+// CHECK: [0x6b,0x7a,0x0a,0x7e]
+
+v_rcp_f16 v5, tba_lo
+// CHECK: [0x6c,0x7a,0x0a,0x7e]
+
+v_rcp_f16 v5, tba_hi
+// CHECK: [0x6d,0x7a,0x0a,0x7e]
+
+v_rcp_f16 v5, tma_lo
+// CHECK: [0x6e,0x7a,0x0a,0x7e]
+
+v_rcp_f16 v5, tma_hi
+// CHECK: [0x6f,0x7a,0x0a,0x7e]
+
+v_rcp_f16 v5, ttmp11
+// CHECK: [0x7b,0x7a,0x0a,0x7e]
+
+v_rcp_f16 v5, m0
+// CHECK: [0x7c,0x7a,0x0a,0x7e]
+
+v_rcp_f16 v5, exec_lo
+// CHECK: [0x7e,0x7a,0x0a,0x7e]
+
+v_rcp_f16 v5, exec_hi
+// CHECK: [0x7f,0x7a,0x0a,0x7e]
+
+v_rcp_f16 v5, 0
+// CHECK: [0x80,0x7a,0x0a,0x7e]
+
+v_rcp_f16 v5, -1
+// CHECK: [0xc1,0x7a,0x0a,0x7e]
+
+v_rcp_f16 v5, 0.5
+// CHECK: [0xf0,0x7a,0x0a,0x7e]
+
+v_rcp_f16 v5, -4.0
+// CHECK: [0xf7,0x7a,0x0a,0x7e]
+
+v_rcp_f16 v5, 0xfe0b
+// CHECK: [0xff,0x7a,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_rcp_f16 v5, 0x3456
+// CHECK: [0xff,0x7a,0x0a,0x7e,0x56,0x34,0x00,0x00]
+
+v_rcp_f16 v5, v1
+// CHECK: [0x01,0x7b,0x0a,0x7e]
+
+v_rcp_f16 v5, v255
+// CHECK: [0xff,0x7b,0x0a,0x7e]
+
+v_rcp_f16_e64 v5, s1
+// CHECK: [0x05,0x00,0x7d,0xd1,0x01,0x00,0x00,0x00]
+
+v_rcp_f16_e64 v255, s1
+// CHECK: [0xff,0x00,0x7d,0xd1,0x01,0x00,0x00,0x00]
+
+v_rcp_f16_e64 v5, s101
+// CHECK: [0x05,0x00,0x7d,0xd1,0x65,0x00,0x00,0x00]
+
+v_rcp_f16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x7d,0xd1,0x66,0x00,0x00,0x00]
+
+v_rcp_f16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x7d,0xd1,0x67,0x00,0x00,0x00]
+
+v_rcp_f16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x7d,0xd1,0x6a,0x00,0x00,0x00]
+
+v_rcp_f16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x7d,0xd1,0x6b,0x00,0x00,0x00]
+
+v_rcp_f16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x7d,0xd1,0x6c,0x00,0x00,0x00]
+
+v_rcp_f16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x7d,0xd1,0x6d,0x00,0x00,0x00]
+
+v_rcp_f16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x7d,0xd1,0x6e,0x00,0x00,0x00]
+
+v_rcp_f16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x7d,0xd1,0x6f,0x00,0x00,0x00]
+
+v_rcp_f16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x7d,0xd1,0x7b,0x00,0x00,0x00]
+
+v_rcp_f16_e64 v5, m0
+// CHECK: [0x05,0x00,0x7d,0xd1,0x7c,0x00,0x00,0x00]
+
+v_rcp_f16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x7d,0xd1,0x7e,0x00,0x00,0x00]
+
+v_rcp_f16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x7d,0xd1,0x7f,0x00,0x00,0x00]
+
+v_rcp_f16_e64 v5, scc
+// CHECK: [0x05,0x00,0x7d,0xd1,0xfd,0x00,0x00,0x00]
+
+v_rcp_f16_e64 v5, v1
+// CHECK: [0x05,0x00,0x7d,0xd1,0x01,0x01,0x00,0x00]
+
+v_rcp_f16_e64 v5, v255
+// CHECK: [0x05,0x00,0x7d,0xd1,0xff,0x01,0x00,0x00]
+
+v_rcp_f16_e64 v5, -s1
+// CHECK: [0x05,0x00,0x7d,0xd1,0x01,0x00,0x00,0x20]
+
+v_rcp_f16_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x7d,0xd1,0x01,0x00,0x00,0x00]
+
+v_rcp_f16_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x7d,0xd1,0x01,0x00,0x00,0x00]
+
+v_sqrt_f16 v5, s1
+// CHECK: [0x01,0x7c,0x0a,0x7e]
+
+v_sqrt_f16 v255, s1
+// CHECK: [0x01,0x7c,0xfe,0x7f]
+
+v_sqrt_f16 v5, s101
+// CHECK: [0x65,0x7c,0x0a,0x7e]
+
+v_sqrt_f16 v5, flat_scratch_lo
+// CHECK: [0x66,0x7c,0x0a,0x7e]
+
+v_sqrt_f16 v5, flat_scratch_hi
+// CHECK: [0x67,0x7c,0x0a,0x7e]
+
+v_sqrt_f16 v5, vcc_lo
+// CHECK: [0x6a,0x7c,0x0a,0x7e]
+
+v_sqrt_f16 v5, vcc_hi
+// CHECK: [0x6b,0x7c,0x0a,0x7e]
+
+v_sqrt_f16 v5, tba_lo
+// CHECK: [0x6c,0x7c,0x0a,0x7e]
+
+v_sqrt_f16 v5, tba_hi
+// CHECK: [0x6d,0x7c,0x0a,0x7e]
+
+v_sqrt_f16 v5, tma_lo
+// CHECK: [0x6e,0x7c,0x0a,0x7e]
+
+v_sqrt_f16 v5, tma_hi
+// CHECK: [0x6f,0x7c,0x0a,0x7e]
+
+v_sqrt_f16 v5, ttmp11
+// CHECK: [0x7b,0x7c,0x0a,0x7e]
+
+v_sqrt_f16 v5, m0
+// CHECK: [0x7c,0x7c,0x0a,0x7e]
+
+v_sqrt_f16 v5, exec_lo
+// CHECK: [0x7e,0x7c,0x0a,0x7e]
+
+v_sqrt_f16 v5, exec_hi
+// CHECK: [0x7f,0x7c,0x0a,0x7e]
+
+v_sqrt_f16 v5, 0
+// CHECK: [0x80,0x7c,0x0a,0x7e]
+
+v_sqrt_f16 v5, -1
+// CHECK: [0xc1,0x7c,0x0a,0x7e]
+
+v_sqrt_f16 v5, 0.5
+// CHECK: [0xf0,0x7c,0x0a,0x7e]
+
+v_sqrt_f16 v5, -4.0
+// CHECK: [0xf7,0x7c,0x0a,0x7e]
+
+v_sqrt_f16 v5, 0xfe0b
+// CHECK: [0xff,0x7c,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_sqrt_f16 v5, 0x3456
+// CHECK: [0xff,0x7c,0x0a,0x7e,0x56,0x34,0x00,0x00]
+
+v_sqrt_f16 v5, v1
+// CHECK: [0x01,0x7d,0x0a,0x7e]
+
+v_sqrt_f16 v5, v255
+// CHECK: [0xff,0x7d,0x0a,0x7e]
+
+v_sqrt_f16_e64 v5, s1
+// CHECK: [0x05,0x00,0x7e,0xd1,0x01,0x00,0x00,0x00]
+
+v_sqrt_f16_e64 v255, s1
+// CHECK: [0xff,0x00,0x7e,0xd1,0x01,0x00,0x00,0x00]
+
+v_sqrt_f16_e64 v5, s101
+// CHECK: [0x05,0x00,0x7e,0xd1,0x65,0x00,0x00,0x00]
+
+v_sqrt_f16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x7e,0xd1,0x66,0x00,0x00,0x00]
+
+v_sqrt_f16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x7e,0xd1,0x67,0x00,0x00,0x00]
+
+v_sqrt_f16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x7e,0xd1,0x6a,0x00,0x00,0x00]
+
+v_sqrt_f16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x7e,0xd1,0x6b,0x00,0x00,0x00]
+
+v_sqrt_f16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x7e,0xd1,0x6c,0x00,0x00,0x00]
+
+v_sqrt_f16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x7e,0xd1,0x6d,0x00,0x00,0x00]
+
+v_sqrt_f16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x7e,0xd1,0x6e,0x00,0x00,0x00]
+
+v_sqrt_f16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x7e,0xd1,0x6f,0x00,0x00,0x00]
+
+v_sqrt_f16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x7e,0xd1,0x7b,0x00,0x00,0x00]
+
+v_sqrt_f16_e64 v5, m0
+// CHECK: [0x05,0x00,0x7e,0xd1,0x7c,0x00,0x00,0x00]
+
+v_sqrt_f16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x7e,0xd1,0x7e,0x00,0x00,0x00]
+
+v_sqrt_f16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x7e,0xd1,0x7f,0x00,0x00,0x00]
+
+v_sqrt_f16_e64 v5, scc
+// CHECK: [0x05,0x00,0x7e,0xd1,0xfd,0x00,0x00,0x00]
+
+v_sqrt_f16_e64 v5, v1
+// CHECK: [0x05,0x00,0x7e,0xd1,0x01,0x01,0x00,0x00]
+
+v_sqrt_f16_e64 v5, v255
+// CHECK: [0x05,0x00,0x7e,0xd1,0xff,0x01,0x00,0x00]
+
+v_sqrt_f16_e64 v5, -s1
+// CHECK: [0x05,0x00,0x7e,0xd1,0x01,0x00,0x00,0x20]
+
+v_sqrt_f16_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x7e,0xd1,0x01,0x00,0x00,0x00]
+
+v_sqrt_f16_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x7e,0xd1,0x01,0x00,0x00,0x00]
+
+v_rsq_f16 v5, s1
+// CHECK: [0x01,0x7e,0x0a,0x7e]
+
+v_rsq_f16 v255, s1
+// CHECK: [0x01,0x7e,0xfe,0x7f]
+
+v_rsq_f16 v5, s101
+// CHECK: [0x65,0x7e,0x0a,0x7e]
+
+v_rsq_f16 v5, flat_scratch_lo
+// CHECK: [0x66,0x7e,0x0a,0x7e]
+
+v_rsq_f16 v5, flat_scratch_hi
+// CHECK: [0x67,0x7e,0x0a,0x7e]
+
+v_rsq_f16 v5, vcc_lo
+// CHECK: [0x6a,0x7e,0x0a,0x7e]
+
+v_rsq_f16 v5, vcc_hi
+// CHECK: [0x6b,0x7e,0x0a,0x7e]
+
+v_rsq_f16 v5, tba_lo
+// CHECK: [0x6c,0x7e,0x0a,0x7e]
+
+v_rsq_f16 v5, tba_hi
+// CHECK: [0x6d,0x7e,0x0a,0x7e]
+
+v_rsq_f16 v5, tma_lo
+// CHECK: [0x6e,0x7e,0x0a,0x7e]
+
+v_rsq_f16 v5, tma_hi
+// CHECK: [0x6f,0x7e,0x0a,0x7e]
+
+v_rsq_f16 v5, ttmp11
+// CHECK: [0x7b,0x7e,0x0a,0x7e]
+
+v_rsq_f16 v5, m0
+// CHECK: [0x7c,0x7e,0x0a,0x7e]
+
+v_rsq_f16 v5, exec_lo
+// CHECK: [0x7e,0x7e,0x0a,0x7e]
+
+v_rsq_f16 v5, exec_hi
+// CHECK: [0x7f,0x7e,0x0a,0x7e]
+
+v_rsq_f16 v5, 0
+// CHECK: [0x80,0x7e,0x0a,0x7e]
+
+v_rsq_f16 v5, -1
+// CHECK: [0xc1,0x7e,0x0a,0x7e]
+
+v_rsq_f16 v5, 0.5
+// CHECK: [0xf0,0x7e,0x0a,0x7e]
+
+v_rsq_f16 v5, -4.0
+// CHECK: [0xf7,0x7e,0x0a,0x7e]
+
+v_rsq_f16 v5, 0xfe0b
+// CHECK: [0xff,0x7e,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_rsq_f16 v5, 0x3456
+// CHECK: [0xff,0x7e,0x0a,0x7e,0x56,0x34,0x00,0x00]
+
+v_rsq_f16 v5, v1
+// CHECK: [0x01,0x7f,0x0a,0x7e]
+
+v_rsq_f16 v5, v255
+// CHECK: [0xff,0x7f,0x0a,0x7e]
+
+v_rsq_f16_e64 v5, s1
+// CHECK: [0x05,0x00,0x7f,0xd1,0x01,0x00,0x00,0x00]
+
+v_rsq_f16_e64 v255, s1
+// CHECK: [0xff,0x00,0x7f,0xd1,0x01,0x00,0x00,0x00]
+
+v_rsq_f16_e64 v5, s101
+// CHECK: [0x05,0x00,0x7f,0xd1,0x65,0x00,0x00,0x00]
+
+v_rsq_f16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x7f,0xd1,0x66,0x00,0x00,0x00]
+
+v_rsq_f16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x7f,0xd1,0x67,0x00,0x00,0x00]
+
+v_rsq_f16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x7f,0xd1,0x6a,0x00,0x00,0x00]
+
+v_rsq_f16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x7f,0xd1,0x6b,0x00,0x00,0x00]
+
+v_rsq_f16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x7f,0xd1,0x6c,0x00,0x00,0x00]
+
+v_rsq_f16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x7f,0xd1,0x6d,0x00,0x00,0x00]
+
+v_rsq_f16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x7f,0xd1,0x6e,0x00,0x00,0x00]
+
+v_rsq_f16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x7f,0xd1,0x6f,0x00,0x00,0x00]
+
+v_rsq_f16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x7f,0xd1,0x7b,0x00,0x00,0x00]
+
+v_rsq_f16_e64 v5, m0
+// CHECK: [0x05,0x00,0x7f,0xd1,0x7c,0x00,0x00,0x00]
+
+v_rsq_f16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x7f,0xd1,0x7e,0x00,0x00,0x00]
+
+v_rsq_f16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x7f,0xd1,0x7f,0x00,0x00,0x00]
+
+v_rsq_f16_e64 v5, scc
+// CHECK: [0x05,0x00,0x7f,0xd1,0xfd,0x00,0x00,0x00]
+
+v_rsq_f16_e64 v5, v1
+// CHECK: [0x05,0x00,0x7f,0xd1,0x01,0x01,0x00,0x00]
+
+v_rsq_f16_e64 v5, v255
+// CHECK: [0x05,0x00,0x7f,0xd1,0xff,0x01,0x00,0x00]
+
+v_rsq_f16_e64 v5, -s1
+// CHECK: [0x05,0x00,0x7f,0xd1,0x01,0x00,0x00,0x20]
+
+v_rsq_f16_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x7f,0xd1,0x01,0x00,0x00,0x00]
+
+v_rsq_f16_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x7f,0xd1,0x01,0x00,0x00,0x00]
+
+v_log_f16 v5, s1
+// CHECK: [0x01,0x80,0x0a,0x7e]
+
+v_log_f16 v255, s1
+// CHECK: [0x01,0x80,0xfe,0x7f]
+
+v_log_f16 v5, s101
+// CHECK: [0x65,0x80,0x0a,0x7e]
+
+v_log_f16 v5, flat_scratch_lo
+// CHECK: [0x66,0x80,0x0a,0x7e]
+
+v_log_f16 v5, flat_scratch_hi
+// CHECK: [0x67,0x80,0x0a,0x7e]
+
+v_log_f16 v5, vcc_lo
+// CHECK: [0x6a,0x80,0x0a,0x7e]
+
+v_log_f16 v5, vcc_hi
+// CHECK: [0x6b,0x80,0x0a,0x7e]
+
+v_log_f16 v5, tba_lo
+// CHECK: [0x6c,0x80,0x0a,0x7e]
+
+v_log_f16 v5, tba_hi
+// CHECK: [0x6d,0x80,0x0a,0x7e]
+
+v_log_f16 v5, tma_lo
+// CHECK: [0x6e,0x80,0x0a,0x7e]
+
+v_log_f16 v5, tma_hi
+// CHECK: [0x6f,0x80,0x0a,0x7e]
+
+v_log_f16 v5, ttmp11
+// CHECK: [0x7b,0x80,0x0a,0x7e]
+
+v_log_f16 v5, m0
+// CHECK: [0x7c,0x80,0x0a,0x7e]
+
+v_log_f16 v5, exec_lo
+// CHECK: [0x7e,0x80,0x0a,0x7e]
+
+v_log_f16 v5, exec_hi
+// CHECK: [0x7f,0x80,0x0a,0x7e]
+
+v_log_f16 v5, 0
+// CHECK: [0x80,0x80,0x0a,0x7e]
+
+v_log_f16 v5, -1
+// CHECK: [0xc1,0x80,0x0a,0x7e]
+
+v_log_f16 v5, 0.5
+// CHECK: [0xf0,0x80,0x0a,0x7e]
+
+v_log_f16 v5, -4.0
+// CHECK: [0xf7,0x80,0x0a,0x7e]
+
+v_log_f16 v5, 0xfe0b
+// CHECK: [0xff,0x80,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_log_f16 v5, 0x3456
+// CHECK: [0xff,0x80,0x0a,0x7e,0x56,0x34,0x00,0x00]
+
+v_log_f16 v5, v1
+// CHECK: [0x01,0x81,0x0a,0x7e]
+
+v_log_f16 v5, v255
+// CHECK: [0xff,0x81,0x0a,0x7e]
+
+v_log_f16_e64 v5, s1
+// CHECK: [0x05,0x00,0x80,0xd1,0x01,0x00,0x00,0x00]
+
+v_log_f16_e64 v255, s1
+// CHECK: [0xff,0x00,0x80,0xd1,0x01,0x00,0x00,0x00]
+
+v_log_f16_e64 v5, s101
+// CHECK: [0x05,0x00,0x80,0xd1,0x65,0x00,0x00,0x00]
+
+v_log_f16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x80,0xd1,0x66,0x00,0x00,0x00]
+
+v_log_f16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x80,0xd1,0x67,0x00,0x00,0x00]
+
+v_log_f16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x80,0xd1,0x6a,0x00,0x00,0x00]
+
+v_log_f16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x80,0xd1,0x6b,0x00,0x00,0x00]
+
+v_log_f16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x80,0xd1,0x6c,0x00,0x00,0x00]
+
+v_log_f16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x80,0xd1,0x6d,0x00,0x00,0x00]
+
+v_log_f16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x80,0xd1,0x6e,0x00,0x00,0x00]
+
+v_log_f16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x80,0xd1,0x6f,0x00,0x00,0x00]
+
+v_log_f16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x80,0xd1,0x7b,0x00,0x00,0x00]
+
+v_log_f16_e64 v5, m0
+// CHECK: [0x05,0x00,0x80,0xd1,0x7c,0x00,0x00,0x00]
+
+v_log_f16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x80,0xd1,0x7e,0x00,0x00,0x00]
+
+v_log_f16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x80,0xd1,0x7f,0x00,0x00,0x00]
+
+v_log_f16_e64 v5, scc
+// CHECK: [0x05,0x00,0x80,0xd1,0xfd,0x00,0x00,0x00]
+
+v_log_f16_e64 v5, v1
+// CHECK: [0x05,0x00,0x80,0xd1,0x01,0x01,0x00,0x00]
+
+v_log_f16_e64 v5, v255
+// CHECK: [0x05,0x00,0x80,0xd1,0xff,0x01,0x00,0x00]
+
+v_log_f16_e64 v5, -s1
+// CHECK: [0x05,0x00,0x80,0xd1,0x01,0x00,0x00,0x20]
+
+v_log_f16_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x80,0xd1,0x01,0x00,0x00,0x00]
+
+v_log_f16_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x80,0xd1,0x01,0x00,0x00,0x00]
+
+v_exp_f16 v5, s1
+// CHECK: [0x01,0x82,0x0a,0x7e]
+
+v_exp_f16 v255, s1
+// CHECK: [0x01,0x82,0xfe,0x7f]
+
+v_exp_f16 v5, s101
+// CHECK: [0x65,0x82,0x0a,0x7e]
+
+v_exp_f16 v5, flat_scratch_lo
+// CHECK: [0x66,0x82,0x0a,0x7e]
+
+v_exp_f16 v5, flat_scratch_hi
+// CHECK: [0x67,0x82,0x0a,0x7e]
+
+v_exp_f16 v5, vcc_lo
+// CHECK: [0x6a,0x82,0x0a,0x7e]
+
+v_exp_f16 v5, vcc_hi
+// CHECK: [0x6b,0x82,0x0a,0x7e]
+
+v_exp_f16 v5, tba_lo
+// CHECK: [0x6c,0x82,0x0a,0x7e]
+
+v_exp_f16 v5, tba_hi
+// CHECK: [0x6d,0x82,0x0a,0x7e]
+
+v_exp_f16 v5, tma_lo
+// CHECK: [0x6e,0x82,0x0a,0x7e]
+
+v_exp_f16 v5, tma_hi
+// CHECK: [0x6f,0x82,0x0a,0x7e]
+
+v_exp_f16 v5, ttmp11
+// CHECK: [0x7b,0x82,0x0a,0x7e]
+
+v_exp_f16 v5, m0
+// CHECK: [0x7c,0x82,0x0a,0x7e]
+
+v_exp_f16 v5, exec_lo
+// CHECK: [0x7e,0x82,0x0a,0x7e]
+
+v_exp_f16 v5, exec_hi
+// CHECK: [0x7f,0x82,0x0a,0x7e]
+
+v_exp_f16 v5, 0
+// CHECK: [0x80,0x82,0x0a,0x7e]
+
+v_exp_f16 v5, -1
+// CHECK: [0xc1,0x82,0x0a,0x7e]
+
+v_exp_f16 v5, 0.5
+// CHECK: [0xf0,0x82,0x0a,0x7e]
+
+v_exp_f16 v5, -4.0
+// CHECK: [0xf7,0x82,0x0a,0x7e]
+
+v_exp_f16 v5, 0xfe0b
+// CHECK: [0xff,0x82,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_exp_f16 v5, 0x3456
+// CHECK: [0xff,0x82,0x0a,0x7e,0x56,0x34,0x00,0x00]
+
+v_exp_f16 v5, v1
+// CHECK: [0x01,0x83,0x0a,0x7e]
+
+v_exp_f16 v5, v255
+// CHECK: [0xff,0x83,0x0a,0x7e]
+
+v_exp_f16_e64 v5, s1
+// CHECK: [0x05,0x00,0x81,0xd1,0x01,0x00,0x00,0x00]
+
+v_exp_f16_e64 v255, s1
+// CHECK: [0xff,0x00,0x81,0xd1,0x01,0x00,0x00,0x00]
+
+v_exp_f16_e64 v5, s101
+// CHECK: [0x05,0x00,0x81,0xd1,0x65,0x00,0x00,0x00]
+
+v_exp_f16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x81,0xd1,0x66,0x00,0x00,0x00]
+
+v_exp_f16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x81,0xd1,0x67,0x00,0x00,0x00]
+
+v_exp_f16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x81,0xd1,0x6a,0x00,0x00,0x00]
+
+v_exp_f16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x81,0xd1,0x6b,0x00,0x00,0x00]
+
+v_exp_f16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x81,0xd1,0x6c,0x00,0x00,0x00]
+
+v_exp_f16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x81,0xd1,0x6d,0x00,0x00,0x00]
+
+v_exp_f16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x81,0xd1,0x6e,0x00,0x00,0x00]
+
+v_exp_f16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x81,0xd1,0x6f,0x00,0x00,0x00]
+
+v_exp_f16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x81,0xd1,0x7b,0x00,0x00,0x00]
+
+v_exp_f16_e64 v5, m0
+// CHECK: [0x05,0x00,0x81,0xd1,0x7c,0x00,0x00,0x00]
+
+v_exp_f16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x81,0xd1,0x7e,0x00,0x00,0x00]
+
+v_exp_f16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x81,0xd1,0x7f,0x00,0x00,0x00]
+
+v_exp_f16_e64 v5, scc
+// CHECK: [0x05,0x00,0x81,0xd1,0xfd,0x00,0x00,0x00]
+
+v_exp_f16_e64 v5, v1
+// CHECK: [0x05,0x00,0x81,0xd1,0x01,0x01,0x00,0x00]
+
+v_exp_f16_e64 v5, v255
+// CHECK: [0x05,0x00,0x81,0xd1,0xff,0x01,0x00,0x00]
+
+v_exp_f16_e64 v5, -s1
+// CHECK: [0x05,0x00,0x81,0xd1,0x01,0x00,0x00,0x20]
+
+v_exp_f16_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x81,0xd1,0x01,0x00,0x00,0x00]
+
+v_exp_f16_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x81,0xd1,0x01,0x00,0x00,0x00]
+
+v_frexp_mant_f16 v5, s1
+// CHECK: [0x01,0x84,0x0a,0x7e]
+
+v_frexp_mant_f16 v255, s1
+// CHECK: [0x01,0x84,0xfe,0x7f]
+
+v_frexp_mant_f16 v5, s101
+// CHECK: [0x65,0x84,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, flat_scratch_lo
+// CHECK: [0x66,0x84,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, flat_scratch_hi
+// CHECK: [0x67,0x84,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, vcc_lo
+// CHECK: [0x6a,0x84,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, vcc_hi
+// CHECK: [0x6b,0x84,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, tba_lo
+// CHECK: [0x6c,0x84,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, tba_hi
+// CHECK: [0x6d,0x84,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, tma_lo
+// CHECK: [0x6e,0x84,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, tma_hi
+// CHECK: [0x6f,0x84,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, ttmp11
+// CHECK: [0x7b,0x84,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, m0
+// CHECK: [0x7c,0x84,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, exec_lo
+// CHECK: [0x7e,0x84,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, exec_hi
+// CHECK: [0x7f,0x84,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, 0
+// CHECK: [0x80,0x84,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, -1
+// CHECK: [0xc1,0x84,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, 0.5
+// CHECK: [0xf0,0x84,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, -4.0
+// CHECK: [0xf7,0x84,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, 0xfe0b
+// CHECK: [0xff,0x84,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_frexp_mant_f16 v5, 0x3456
+// CHECK: [0xff,0x84,0x0a,0x7e,0x56,0x34,0x00,0x00]
+
+v_frexp_mant_f16 v5, v1
+// CHECK: [0x01,0x85,0x0a,0x7e]
+
+v_frexp_mant_f16 v5, v255
+// CHECK: [0xff,0x85,0x0a,0x7e]
+
+v_frexp_mant_f16_e64 v5, s1
+// CHECK: [0x05,0x00,0x82,0xd1,0x01,0x00,0x00,0x00]
+
+v_frexp_mant_f16_e64 v255, s1
+// CHECK: [0xff,0x00,0x82,0xd1,0x01,0x00,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5, s101
+// CHECK: [0x05,0x00,0x82,0xd1,0x65,0x00,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x82,0xd1,0x66,0x00,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x82,0xd1,0x67,0x00,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x82,0xd1,0x6a,0x00,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x82,0xd1,0x6b,0x00,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x82,0xd1,0x6c,0x00,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x82,0xd1,0x6d,0x00,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x82,0xd1,0x6e,0x00,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x82,0xd1,0x6f,0x00,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x82,0xd1,0x7b,0x00,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5, m0
+// CHECK: [0x05,0x00,0x82,0xd1,0x7c,0x00,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x82,0xd1,0x7e,0x00,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x82,0xd1,0x7f,0x00,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5, scc
+// CHECK: [0x05,0x00,0x82,0xd1,0xfd,0x00,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5, v1
+// CHECK: [0x05,0x00,0x82,0xd1,0x01,0x01,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5, v255
+// CHECK: [0x05,0x00,0x82,0xd1,0xff,0x01,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5, -s1
+// CHECK: [0x05,0x00,0x82,0xd1,0x01,0x00,0x00,0x20]
+
+v_frexp_mant_f16_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x82,0xd1,0x01,0x00,0x00,0x00]
+
+v_frexp_mant_f16_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x82,0xd1,0x01,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16 v5, s1
+// CHECK: [0x01,0x86,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v255, s1
+// CHECK: [0x01,0x86,0xfe,0x7f]
+
+v_frexp_exp_i16_f16 v5, s101
+// CHECK: [0x65,0x86,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, flat_scratch_lo
+// CHECK: [0x66,0x86,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, flat_scratch_hi
+// CHECK: [0x67,0x86,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, vcc_lo
+// CHECK: [0x6a,0x86,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, vcc_hi
+// CHECK: [0x6b,0x86,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, tba_lo
+// CHECK: [0x6c,0x86,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, tba_hi
+// CHECK: [0x6d,0x86,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, tma_lo
+// CHECK: [0x6e,0x86,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, tma_hi
+// CHECK: [0x6f,0x86,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, ttmp11
+// CHECK: [0x7b,0x86,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, m0
+// CHECK: [0x7c,0x86,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, exec_lo
+// CHECK: [0x7e,0x86,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, exec_hi
+// CHECK: [0x7f,0x86,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, 0
+// CHECK: [0x80,0x86,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, -1
+// CHECK: [0xc1,0x86,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, 0.5
+// CHECK: [0xf0,0x86,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, -4.0
+// CHECK: [0xf7,0x86,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, 0xfe0b
+// CHECK: [0xff,0x86,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_frexp_exp_i16_f16 v5, 0x3456
+// CHECK: [0xff,0x86,0x0a,0x7e,0x56,0x34,0x00,0x00]
+
+v_frexp_exp_i16_f16 v5, v1
+// CHECK: [0x01,0x87,0x0a,0x7e]
+
+v_frexp_exp_i16_f16 v5, v255
+// CHECK: [0xff,0x87,0x0a,0x7e]
+
+v_frexp_exp_i16_f16_e64 v5, s1
+// CHECK: [0x05,0x00,0x83,0xd1,0x01,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16_e64 v255, s1
+// CHECK: [0xff,0x00,0x83,0xd1,0x01,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16_e64 v5, s101
+// CHECK: [0x05,0x00,0x83,0xd1,0x65,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x83,0xd1,0x66,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x83,0xd1,0x67,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x83,0xd1,0x6a,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x83,0xd1,0x6b,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x83,0xd1,0x6c,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x83,0xd1,0x6d,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x83,0xd1,0x6e,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x83,0xd1,0x6f,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x83,0xd1,0x7b,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16_e64 v5, m0
+// CHECK: [0x05,0x00,0x83,0xd1,0x7c,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x83,0xd1,0x7e,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x83,0xd1,0x7f,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16_e64 v5, scc
+// CHECK: [0x05,0x00,0x83,0xd1,0xfd,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16_e64 v5, v1
+// CHECK: [0x05,0x00,0x83,0xd1,0x01,0x01,0x00,0x00]
+
+v_frexp_exp_i16_f16_e64 v5, v255
+// CHECK: [0x05,0x00,0x83,0xd1,0xff,0x01,0x00,0x00]
+
+v_frexp_exp_i16_f16_e64 v5, -s1
+// CHECK: [0x05,0x00,0x83,0xd1,0x01,0x00,0x00,0x20]
+
+v_frexp_exp_i16_f16_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x83,0xd1,0x01,0x00,0x00,0x00]
+
+v_frexp_exp_i16_f16_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x83,0xd1,0x01,0x00,0x00,0x00]
+
+v_floor_f16 v5, s1
+// CHECK: [0x01,0x88,0x0a,0x7e]
+
+v_floor_f16 v255, s1
+// CHECK: [0x01,0x88,0xfe,0x7f]
+
+v_floor_f16 v5, s101
+// CHECK: [0x65,0x88,0x0a,0x7e]
+
+v_floor_f16 v5, flat_scratch_lo
+// CHECK: [0x66,0x88,0x0a,0x7e]
+
+v_floor_f16 v5, flat_scratch_hi
+// CHECK: [0x67,0x88,0x0a,0x7e]
+
+v_floor_f16 v5, vcc_lo
+// CHECK: [0x6a,0x88,0x0a,0x7e]
+
+v_floor_f16 v5, vcc_hi
+// CHECK: [0x6b,0x88,0x0a,0x7e]
+
+v_floor_f16 v5, tba_lo
+// CHECK: [0x6c,0x88,0x0a,0x7e]
+
+v_floor_f16 v5, tba_hi
+// CHECK: [0x6d,0x88,0x0a,0x7e]
+
+v_floor_f16 v5, tma_lo
+// CHECK: [0x6e,0x88,0x0a,0x7e]
+
+v_floor_f16 v5, tma_hi
+// CHECK: [0x6f,0x88,0x0a,0x7e]
+
+v_floor_f16 v5, ttmp11
+// CHECK: [0x7b,0x88,0x0a,0x7e]
+
+v_floor_f16 v5, m0
+// CHECK: [0x7c,0x88,0x0a,0x7e]
+
+v_floor_f16 v5, exec_lo
+// CHECK: [0x7e,0x88,0x0a,0x7e]
+
+v_floor_f16 v5, exec_hi
+// CHECK: [0x7f,0x88,0x0a,0x7e]
+
+v_floor_f16 v5, 0
+// CHECK: [0x80,0x88,0x0a,0x7e]
+
+v_floor_f16 v5, -1
+// CHECK: [0xc1,0x88,0x0a,0x7e]
+
+v_floor_f16 v5, 0.5
+// CHECK: [0xf0,0x88,0x0a,0x7e]
+
+v_floor_f16 v5, -4.0
+// CHECK: [0xf7,0x88,0x0a,0x7e]
+
+v_floor_f16 v5, 0xfe0b
+// CHECK: [0xff,0x88,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_floor_f16 v5, 0x3456
+// CHECK: [0xff,0x88,0x0a,0x7e,0x56,0x34,0x00,0x00]
+
+v_floor_f16 v5, v1
+// CHECK: [0x01,0x89,0x0a,0x7e]
+
+v_floor_f16 v5, v255
+// CHECK: [0xff,0x89,0x0a,0x7e]
+
+v_floor_f16_e64 v5, s1
+// CHECK: [0x05,0x00,0x84,0xd1,0x01,0x00,0x00,0x00]
+
+v_floor_f16_e64 v255, s1
+// CHECK: [0xff,0x00,0x84,0xd1,0x01,0x00,0x00,0x00]
+
+v_floor_f16_e64 v5, s101
+// CHECK: [0x05,0x00,0x84,0xd1,0x65,0x00,0x00,0x00]
+
+v_floor_f16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x84,0xd1,0x66,0x00,0x00,0x00]
+
+v_floor_f16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x84,0xd1,0x67,0x00,0x00,0x00]
+
+v_floor_f16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x84,0xd1,0x6a,0x00,0x00,0x00]
+
+v_floor_f16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x84,0xd1,0x6b,0x00,0x00,0x00]
+
+v_floor_f16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x84,0xd1,0x6c,0x00,0x00,0x00]
+
+v_floor_f16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x84,0xd1,0x6d,0x00,0x00,0x00]
+
+v_floor_f16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x84,0xd1,0x6e,0x00,0x00,0x00]
+
+v_floor_f16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x84,0xd1,0x6f,0x00,0x00,0x00]
+
+v_floor_f16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x84,0xd1,0x7b,0x00,0x00,0x00]
+
+v_floor_f16_e64 v5, m0
+// CHECK: [0x05,0x00,0x84,0xd1,0x7c,0x00,0x00,0x00]
+
+v_floor_f16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x84,0xd1,0x7e,0x00,0x00,0x00]
+
+v_floor_f16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x84,0xd1,0x7f,0x00,0x00,0x00]
+
+v_floor_f16_e64 v5, scc
+// CHECK: [0x05,0x00,0x84,0xd1,0xfd,0x00,0x00,0x00]
+
+v_floor_f16_e64 v5, v1
+// CHECK: [0x05,0x00,0x84,0xd1,0x01,0x01,0x00,0x00]
+
+v_floor_f16_e64 v5, v255
+// CHECK: [0x05,0x00,0x84,0xd1,0xff,0x01,0x00,0x00]
+
+v_floor_f16_e64 v5, -s1
+// CHECK: [0x05,0x00,0x84,0xd1,0x01,0x00,0x00,0x20]
+
+v_floor_f16_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x84,0xd1,0x01,0x00,0x00,0x00]
+
+v_floor_f16_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x84,0xd1,0x01,0x00,0x00,0x00]
+
+v_ceil_f16 v5, s1
+// CHECK: [0x01,0x8a,0x0a,0x7e]
+
+v_ceil_f16 v255, s1
+// CHECK: [0x01,0x8a,0xfe,0x7f]
+
+v_ceil_f16 v5, s101
+// CHECK: [0x65,0x8a,0x0a,0x7e]
+
+v_ceil_f16 v5, flat_scratch_lo
+// CHECK: [0x66,0x8a,0x0a,0x7e]
+
+v_ceil_f16 v5, flat_scratch_hi
+// CHECK: [0x67,0x8a,0x0a,0x7e]
+
+v_ceil_f16 v5, vcc_lo
+// CHECK: [0x6a,0x8a,0x0a,0x7e]
+
+v_ceil_f16 v5, vcc_hi
+// CHECK: [0x6b,0x8a,0x0a,0x7e]
+
+v_ceil_f16 v5, tba_lo
+// CHECK: [0x6c,0x8a,0x0a,0x7e]
+
+v_ceil_f16 v5, tba_hi
+// CHECK: [0x6d,0x8a,0x0a,0x7e]
+
+v_ceil_f16 v5, tma_lo
+// CHECK: [0x6e,0x8a,0x0a,0x7e]
+
+v_ceil_f16 v5, tma_hi
+// CHECK: [0x6f,0x8a,0x0a,0x7e]
+
+v_ceil_f16 v5, ttmp11
+// CHECK: [0x7b,0x8a,0x0a,0x7e]
+
+v_ceil_f16 v5, m0
+// CHECK: [0x7c,0x8a,0x0a,0x7e]
+
+v_ceil_f16 v5, exec_lo
+// CHECK: [0x7e,0x8a,0x0a,0x7e]
+
+v_ceil_f16 v5, exec_hi
+// CHECK: [0x7f,0x8a,0x0a,0x7e]
+
+v_ceil_f16 v5, 0
+// CHECK: [0x80,0x8a,0x0a,0x7e]
+
+v_ceil_f16 v5, -1
+// CHECK: [0xc1,0x8a,0x0a,0x7e]
+
+v_ceil_f16 v5, 0.5
+// CHECK: [0xf0,0x8a,0x0a,0x7e]
+
+v_ceil_f16 v5, -4.0
+// CHECK: [0xf7,0x8a,0x0a,0x7e]
+
+v_ceil_f16 v5, 0xfe0b
+// CHECK: [0xff,0x8a,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_ceil_f16 v5, 0x3456
+// CHECK: [0xff,0x8a,0x0a,0x7e,0x56,0x34,0x00,0x00]
+
+v_ceil_f16 v5, v1
+// CHECK: [0x01,0x8b,0x0a,0x7e]
+
+v_ceil_f16 v5, v255
+// CHECK: [0xff,0x8b,0x0a,0x7e]
+
+v_ceil_f16_e64 v5, s1
+// CHECK: [0x05,0x00,0x85,0xd1,0x01,0x00,0x00,0x00]
+
+v_ceil_f16_e64 v255, s1
+// CHECK: [0xff,0x00,0x85,0xd1,0x01,0x00,0x00,0x00]
+
+v_ceil_f16_e64 v5, s101
+// CHECK: [0x05,0x00,0x85,0xd1,0x65,0x00,0x00,0x00]
+
+v_ceil_f16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x85,0xd1,0x66,0x00,0x00,0x00]
+
+v_ceil_f16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x85,0xd1,0x67,0x00,0x00,0x00]
+
+v_ceil_f16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x85,0xd1,0x6a,0x00,0x00,0x00]
+
+v_ceil_f16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x85,0xd1,0x6b,0x00,0x00,0x00]
+
+v_ceil_f16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x85,0xd1,0x6c,0x00,0x00,0x00]
+
+v_ceil_f16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x85,0xd1,0x6d,0x00,0x00,0x00]
+
+v_ceil_f16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x85,0xd1,0x6e,0x00,0x00,0x00]
+
+v_ceil_f16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x85,0xd1,0x6f,0x00,0x00,0x00]
+
+v_ceil_f16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x85,0xd1,0x7b,0x00,0x00,0x00]
+
+v_ceil_f16_e64 v5, m0
+// CHECK: [0x05,0x00,0x85,0xd1,0x7c,0x00,0x00,0x00]
+
+v_ceil_f16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x85,0xd1,0x7e,0x00,0x00,0x00]
+
+v_ceil_f16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x85,0xd1,0x7f,0x00,0x00,0x00]
+
+v_ceil_f16_e64 v5, scc
+// CHECK: [0x05,0x00,0x85,0xd1,0xfd,0x00,0x00,0x00]
+
+v_ceil_f16_e64 v5, v1
+// CHECK: [0x05,0x00,0x85,0xd1,0x01,0x01,0x00,0x00]
+
+v_ceil_f16_e64 v5, v255
+// CHECK: [0x05,0x00,0x85,0xd1,0xff,0x01,0x00,0x00]
+
+v_ceil_f16_e64 v5, -s1
+// CHECK: [0x05,0x00,0x85,0xd1,0x01,0x00,0x00,0x20]
+
+v_ceil_f16_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x85,0xd1,0x01,0x00,0x00,0x00]
+
+v_ceil_f16_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x85,0xd1,0x01,0x00,0x00,0x00]
+
+v_trunc_f16 v5, s1
+// CHECK: [0x01,0x8c,0x0a,0x7e]
+
+v_trunc_f16 v255, s1
+// CHECK: [0x01,0x8c,0xfe,0x7f]
+
+v_trunc_f16 v5, s101
+// CHECK: [0x65,0x8c,0x0a,0x7e]
+
+v_trunc_f16 v5, flat_scratch_lo
+// CHECK: [0x66,0x8c,0x0a,0x7e]
+
+v_trunc_f16 v5, flat_scratch_hi
+// CHECK: [0x67,0x8c,0x0a,0x7e]
+
+v_trunc_f16 v5, vcc_lo
+// CHECK: [0x6a,0x8c,0x0a,0x7e]
+
+v_trunc_f16 v5, vcc_hi
+// CHECK: [0x6b,0x8c,0x0a,0x7e]
+
+v_trunc_f16 v5, tba_lo
+// CHECK: [0x6c,0x8c,0x0a,0x7e]
+
+v_trunc_f16 v5, tba_hi
+// CHECK: [0x6d,0x8c,0x0a,0x7e]
+
+v_trunc_f16 v5, tma_lo
+// CHECK: [0x6e,0x8c,0x0a,0x7e]
+
+v_trunc_f16 v5, tma_hi
+// CHECK: [0x6f,0x8c,0x0a,0x7e]
+
+v_trunc_f16 v5, ttmp11
+// CHECK: [0x7b,0x8c,0x0a,0x7e]
+
+v_trunc_f16 v5, m0
+// CHECK: [0x7c,0x8c,0x0a,0x7e]
+
+v_trunc_f16 v5, exec_lo
+// CHECK: [0x7e,0x8c,0x0a,0x7e]
+
+v_trunc_f16 v5, exec_hi
+// CHECK: [0x7f,0x8c,0x0a,0x7e]
+
+v_trunc_f16 v5, 0
+// CHECK: [0x80,0x8c,0x0a,0x7e]
+
+v_trunc_f16 v5, -1
+// CHECK: [0xc1,0x8c,0x0a,0x7e]
+
+v_trunc_f16 v5, 0.5
+// CHECK: [0xf0,0x8c,0x0a,0x7e]
+
+v_trunc_f16 v5, -4.0
+// CHECK: [0xf7,0x8c,0x0a,0x7e]
+
+v_trunc_f16 v5, 0xfe0b
+// CHECK: [0xff,0x8c,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_trunc_f16 v5, 0x3456
+// CHECK: [0xff,0x8c,0x0a,0x7e,0x56,0x34,0x00,0x00]
+
+v_trunc_f16 v5, v1
+// CHECK: [0x01,0x8d,0x0a,0x7e]
+
+v_trunc_f16 v5, v255
+// CHECK: [0xff,0x8d,0x0a,0x7e]
+
+v_trunc_f16_e64 v5, s1
+// CHECK: [0x05,0x00,0x86,0xd1,0x01,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v255, s1
+// CHECK: [0xff,0x00,0x86,0xd1,0x01,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5, s101
+// CHECK: [0x05,0x00,0x86,0xd1,0x65,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x86,0xd1,0x66,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x86,0xd1,0x67,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x86,0xd1,0x6a,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x86,0xd1,0x6b,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x86,0xd1,0x6c,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x86,0xd1,0x6d,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x86,0xd1,0x6e,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x86,0xd1,0x6f,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x86,0xd1,0x7b,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5, m0
+// CHECK: [0x05,0x00,0x86,0xd1,0x7c,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x86,0xd1,0x7e,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x86,0xd1,0x7f,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5, scc
+// CHECK: [0x05,0x00,0x86,0xd1,0xfd,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5, v1
+// CHECK: [0x05,0x00,0x86,0xd1,0x01,0x01,0x00,0x00]
+
+v_trunc_f16_e64 v5, v255
+// CHECK: [0x05,0x00,0x86,0xd1,0xff,0x01,0x00,0x00]
+
+v_trunc_f16_e64 v5, -s1
+// CHECK: [0x05,0x00,0x86,0xd1,0x01,0x00,0x00,0x20]
+
+v_trunc_f16_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x86,0xd1,0x01,0x00,0x00,0x00]
+
+v_trunc_f16_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x86,0xd1,0x01,0x00,0x00,0x00]
+
+v_rndne_f16 v5, s1
+// CHECK: [0x01,0x8e,0x0a,0x7e]
+
+v_rndne_f16 v255, s1
+// CHECK: [0x01,0x8e,0xfe,0x7f]
+
+v_rndne_f16 v5, s101
+// CHECK: [0x65,0x8e,0x0a,0x7e]
+
+v_rndne_f16 v5, flat_scratch_lo
+// CHECK: [0x66,0x8e,0x0a,0x7e]
+
+v_rndne_f16 v5, flat_scratch_hi
+// CHECK: [0x67,0x8e,0x0a,0x7e]
+
+v_rndne_f16 v5, vcc_lo
+// CHECK: [0x6a,0x8e,0x0a,0x7e]
+
+v_rndne_f16 v5, vcc_hi
+// CHECK: [0x6b,0x8e,0x0a,0x7e]
+
+v_rndne_f16 v5, tba_lo
+// CHECK: [0x6c,0x8e,0x0a,0x7e]
+
+v_rndne_f16 v5, tba_hi
+// CHECK: [0x6d,0x8e,0x0a,0x7e]
+
+v_rndne_f16 v5, tma_lo
+// CHECK: [0x6e,0x8e,0x0a,0x7e]
+
+v_rndne_f16 v5, tma_hi
+// CHECK: [0x6f,0x8e,0x0a,0x7e]
+
+v_rndne_f16 v5, ttmp11
+// CHECK: [0x7b,0x8e,0x0a,0x7e]
+
+v_rndne_f16 v5, m0
+// CHECK: [0x7c,0x8e,0x0a,0x7e]
+
+v_rndne_f16 v5, exec_lo
+// CHECK: [0x7e,0x8e,0x0a,0x7e]
+
+v_rndne_f16 v5, exec_hi
+// CHECK: [0x7f,0x8e,0x0a,0x7e]
+
+v_rndne_f16 v5, 0
+// CHECK: [0x80,0x8e,0x0a,0x7e]
+
+v_rndne_f16 v5, -1
+// CHECK: [0xc1,0x8e,0x0a,0x7e]
+
+v_rndne_f16 v5, 0.5
+// CHECK: [0xf0,0x8e,0x0a,0x7e]
+
+v_rndne_f16 v5, -4.0
+// CHECK: [0xf7,0x8e,0x0a,0x7e]
+
+v_rndne_f16 v5, 0xfe0b
+// CHECK: [0xff,0x8e,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_rndne_f16 v5, 0x3456
+// CHECK: [0xff,0x8e,0x0a,0x7e,0x56,0x34,0x00,0x00]
+
+v_rndne_f16 v5, v1
+// CHECK: [0x01,0x8f,0x0a,0x7e]
+
+v_rndne_f16 v5, v255
+// CHECK: [0xff,0x8f,0x0a,0x7e]
+
+v_rndne_f16_e64 v5, s1
+// CHECK: [0x05,0x00,0x87,0xd1,0x01,0x00,0x00,0x00]
+
+v_rndne_f16_e64 v255, s1
+// CHECK: [0xff,0x00,0x87,0xd1,0x01,0x00,0x00,0x00]
+
+v_rndne_f16_e64 v5, s101
+// CHECK: [0x05,0x00,0x87,0xd1,0x65,0x00,0x00,0x00]
+
+v_rndne_f16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x87,0xd1,0x66,0x00,0x00,0x00]
+
+v_rndne_f16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x87,0xd1,0x67,0x00,0x00,0x00]
+
+v_rndne_f16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x87,0xd1,0x6a,0x00,0x00,0x00]
+
+v_rndne_f16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x87,0xd1,0x6b,0x00,0x00,0x00]
+
+v_rndne_f16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x87,0xd1,0x6c,0x00,0x00,0x00]
+
+v_rndne_f16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x87,0xd1,0x6d,0x00,0x00,0x00]
+
+v_rndne_f16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x87,0xd1,0x6e,0x00,0x00,0x00]
+
+v_rndne_f16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x87,0xd1,0x6f,0x00,0x00,0x00]
+
+v_rndne_f16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x87,0xd1,0x7b,0x00,0x00,0x00]
+
+v_rndne_f16_e64 v5, m0
+// CHECK: [0x05,0x00,0x87,0xd1,0x7c,0x00,0x00,0x00]
+
+v_rndne_f16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x87,0xd1,0x7e,0x00,0x00,0x00]
+
+v_rndne_f16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x87,0xd1,0x7f,0x00,0x00,0x00]
+
+v_rndne_f16_e64 v5, scc
+// CHECK: [0x05,0x00,0x87,0xd1,0xfd,0x00,0x00,0x00]
+
+v_rndne_f16_e64 v5, v1
+// CHECK: [0x05,0x00,0x87,0xd1,0x01,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v5, v255
+// CHECK: [0x05,0x00,0x87,0xd1,0xff,0x01,0x00,0x00]
+
+v_rndne_f16_e64 v5, -s1
+// CHECK: [0x05,0x00,0x87,0xd1,0x01,0x00,0x00,0x20]
+
+v_rndne_f16_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x87,0xd1,0x01,0x00,0x00,0x00]
+
+v_rndne_f16_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x87,0xd1,0x01,0x00,0x00,0x00]
+
+v_fract_f16 v5, s1
+// CHECK: [0x01,0x90,0x0a,0x7e]
+
+v_fract_f16 v255, s1
+// CHECK: [0x01,0x90,0xfe,0x7f]
+
+v_fract_f16 v5, s101
+// CHECK: [0x65,0x90,0x0a,0x7e]
+
+v_fract_f16 v5, flat_scratch_lo
+// CHECK: [0x66,0x90,0x0a,0x7e]
+
+v_fract_f16 v5, flat_scratch_hi
+// CHECK: [0x67,0x90,0x0a,0x7e]
+
+v_fract_f16 v5, vcc_lo
+// CHECK: [0x6a,0x90,0x0a,0x7e]
+
+v_fract_f16 v5, vcc_hi
+// CHECK: [0x6b,0x90,0x0a,0x7e]
+
+v_fract_f16 v5, tba_lo
+// CHECK: [0x6c,0x90,0x0a,0x7e]
+
+v_fract_f16 v5, tba_hi
+// CHECK: [0x6d,0x90,0x0a,0x7e]
+
+v_fract_f16 v5, tma_lo
+// CHECK: [0x6e,0x90,0x0a,0x7e]
+
+v_fract_f16 v5, tma_hi
+// CHECK: [0x6f,0x90,0x0a,0x7e]
+
+v_fract_f16 v5, ttmp11
+// CHECK: [0x7b,0x90,0x0a,0x7e]
+
+v_fract_f16 v5, m0
+// CHECK: [0x7c,0x90,0x0a,0x7e]
+
+v_fract_f16 v5, exec_lo
+// CHECK: [0x7e,0x90,0x0a,0x7e]
+
+v_fract_f16 v5, exec_hi
+// CHECK: [0x7f,0x90,0x0a,0x7e]
+
+v_fract_f16 v5, 0
+// CHECK: [0x80,0x90,0x0a,0x7e]
+
+v_fract_f16 v5, -1
+// CHECK: [0xc1,0x90,0x0a,0x7e]
+
+v_fract_f16 v5, 0.5
+// CHECK: [0xf0,0x90,0x0a,0x7e]
+
+v_fract_f16 v5, -4.0
+// CHECK: [0xf7,0x90,0x0a,0x7e]
+
+v_fract_f16 v5, 0xfe0b
+// CHECK: [0xff,0x90,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_fract_f16 v5, 0x3456
+// CHECK: [0xff,0x90,0x0a,0x7e,0x56,0x34,0x00,0x00]
+
+v_fract_f16 v5, v1
+// CHECK: [0x01,0x91,0x0a,0x7e]
+
+v_fract_f16 v5, v255
+// CHECK: [0xff,0x91,0x0a,0x7e]
+
+v_fract_f16_e64 v5, s1
+// CHECK: [0x05,0x00,0x88,0xd1,0x01,0x00,0x00,0x00]
+
+v_fract_f16_e64 v255, s1
+// CHECK: [0xff,0x00,0x88,0xd1,0x01,0x00,0x00,0x00]
+
+v_fract_f16_e64 v5, s101
+// CHECK: [0x05,0x00,0x88,0xd1,0x65,0x00,0x00,0x00]
+
+v_fract_f16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x88,0xd1,0x66,0x00,0x00,0x00]
+
+v_fract_f16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x88,0xd1,0x67,0x00,0x00,0x00]
+
+v_fract_f16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x88,0xd1,0x6a,0x00,0x00,0x00]
+
+v_fract_f16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x88,0xd1,0x6b,0x00,0x00,0x00]
+
+v_fract_f16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x88,0xd1,0x6c,0x00,0x00,0x00]
+
+v_fract_f16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x88,0xd1,0x6d,0x00,0x00,0x00]
+
+v_fract_f16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x88,0xd1,0x6e,0x00,0x00,0x00]
+
+v_fract_f16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x88,0xd1,0x6f,0x00,0x00,0x00]
+
+v_fract_f16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x88,0xd1,0x7b,0x00,0x00,0x00]
+
+v_fract_f16_e64 v5, m0
+// CHECK: [0x05,0x00,0x88,0xd1,0x7c,0x00,0x00,0x00]
+
+v_fract_f16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x88,0xd1,0x7e,0x00,0x00,0x00]
+
+v_fract_f16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x88,0xd1,0x7f,0x00,0x00,0x00]
+
+v_fract_f16_e64 v5, scc
+// CHECK: [0x05,0x00,0x88,0xd1,0xfd,0x00,0x00,0x00]
+
+v_fract_f16_e64 v5, v1
+// CHECK: [0x05,0x00,0x88,0xd1,0x01,0x01,0x00,0x00]
+
+v_fract_f16_e64 v5, v255
+// CHECK: [0x05,0x00,0x88,0xd1,0xff,0x01,0x00,0x00]
+
+v_fract_f16_e64 v5, -s1
+// CHECK: [0x05,0x00,0x88,0xd1,0x01,0x00,0x00,0x20]
+
+v_fract_f16_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x88,0xd1,0x01,0x00,0x00,0x00]
+
+v_fract_f16_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x88,0xd1,0x01,0x00,0x00,0x00]
+
+v_sin_f16 v5, s1
+// CHECK: [0x01,0x92,0x0a,0x7e]
+
+v_sin_f16 v255, s1
+// CHECK: [0x01,0x92,0xfe,0x7f]
+
+v_sin_f16 v5, s101
+// CHECK: [0x65,0x92,0x0a,0x7e]
+
+v_sin_f16 v5, flat_scratch_lo
+// CHECK: [0x66,0x92,0x0a,0x7e]
+
+v_sin_f16 v5, flat_scratch_hi
+// CHECK: [0x67,0x92,0x0a,0x7e]
+
+v_sin_f16 v5, vcc_lo
+// CHECK: [0x6a,0x92,0x0a,0x7e]
+
+v_sin_f16 v5, vcc_hi
+// CHECK: [0x6b,0x92,0x0a,0x7e]
+
+v_sin_f16 v5, tba_lo
+// CHECK: [0x6c,0x92,0x0a,0x7e]
+
+v_sin_f16 v5, tba_hi
+// CHECK: [0x6d,0x92,0x0a,0x7e]
+
+v_sin_f16 v5, tma_lo
+// CHECK: [0x6e,0x92,0x0a,0x7e]
+
+v_sin_f16 v5, tma_hi
+// CHECK: [0x6f,0x92,0x0a,0x7e]
+
+v_sin_f16 v5, ttmp11
+// CHECK: [0x7b,0x92,0x0a,0x7e]
+
+v_sin_f16 v5, m0
+// CHECK: [0x7c,0x92,0x0a,0x7e]
+
+v_sin_f16 v5, exec_lo
+// CHECK: [0x7e,0x92,0x0a,0x7e]
+
+v_sin_f16 v5, exec_hi
+// CHECK: [0x7f,0x92,0x0a,0x7e]
+
+v_sin_f16 v5, 0
+// CHECK: [0x80,0x92,0x0a,0x7e]
+
+v_sin_f16 v5, -1
+// CHECK: [0xc1,0x92,0x0a,0x7e]
+
+v_sin_f16 v5, 0.5
+// CHECK: [0xf0,0x92,0x0a,0x7e]
+
+v_sin_f16 v5, -4.0
+// CHECK: [0xf7,0x92,0x0a,0x7e]
+
+v_sin_f16 v5, 0xfe0b
+// CHECK: [0xff,0x92,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_sin_f16 v5, 0x3456
+// CHECK: [0xff,0x92,0x0a,0x7e,0x56,0x34,0x00,0x00]
+
+v_sin_f16 v5, v1
+// CHECK: [0x01,0x93,0x0a,0x7e]
+
+v_sin_f16 v5, v255
+// CHECK: [0xff,0x93,0x0a,0x7e]
+
+v_sin_f16_e64 v5, s1
+// CHECK: [0x05,0x00,0x89,0xd1,0x01,0x00,0x00,0x00]
+
+v_sin_f16_e64 v255, s1
+// CHECK: [0xff,0x00,0x89,0xd1,0x01,0x00,0x00,0x00]
+
+v_sin_f16_e64 v5, s101
+// CHECK: [0x05,0x00,0x89,0xd1,0x65,0x00,0x00,0x00]
+
+v_sin_f16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x89,0xd1,0x66,0x00,0x00,0x00]
+
+v_sin_f16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x89,0xd1,0x67,0x00,0x00,0x00]
+
+v_sin_f16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x89,0xd1,0x6a,0x00,0x00,0x00]
+
+v_sin_f16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x89,0xd1,0x6b,0x00,0x00,0x00]
+
+v_sin_f16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x89,0xd1,0x6c,0x00,0x00,0x00]
+
+v_sin_f16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x89,0xd1,0x6d,0x00,0x00,0x00]
+
+v_sin_f16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x89,0xd1,0x6e,0x00,0x00,0x00]
+
+v_sin_f16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x89,0xd1,0x6f,0x00,0x00,0x00]
+
+v_sin_f16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x89,0xd1,0x7b,0x00,0x00,0x00]
+
+v_sin_f16_e64 v5, m0
+// CHECK: [0x05,0x00,0x89,0xd1,0x7c,0x00,0x00,0x00]
+
+v_sin_f16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x89,0xd1,0x7e,0x00,0x00,0x00]
+
+v_sin_f16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x89,0xd1,0x7f,0x00,0x00,0x00]
+
+v_sin_f16_e64 v5, scc
+// CHECK: [0x05,0x00,0x89,0xd1,0xfd,0x00,0x00,0x00]
+
+v_sin_f16_e64 v5, v1
+// CHECK: [0x05,0x00,0x89,0xd1,0x01,0x01,0x00,0x00]
+
+v_sin_f16_e64 v5, v255
+// CHECK: [0x05,0x00,0x89,0xd1,0xff,0x01,0x00,0x00]
+
+v_sin_f16_e64 v5, -s1
+// CHECK: [0x05,0x00,0x89,0xd1,0x01,0x00,0x00,0x20]
+
+v_sin_f16_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x89,0xd1,0x01,0x00,0x00,0x00]
+
+v_sin_f16_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x89,0xd1,0x01,0x00,0x00,0x00]
+
+v_cos_f16 v5, s1
+// CHECK: [0x01,0x94,0x0a,0x7e]
+
+v_cos_f16 v255, s1
+// CHECK: [0x01,0x94,0xfe,0x7f]
+
+v_cos_f16 v5, s101
+// CHECK: [0x65,0x94,0x0a,0x7e]
+
+v_cos_f16 v5, flat_scratch_lo
+// CHECK: [0x66,0x94,0x0a,0x7e]
+
+v_cos_f16 v5, flat_scratch_hi
+// CHECK: [0x67,0x94,0x0a,0x7e]
+
+v_cos_f16 v5, vcc_lo
+// CHECK: [0x6a,0x94,0x0a,0x7e]
+
+v_cos_f16 v5, vcc_hi
+// CHECK: [0x6b,0x94,0x0a,0x7e]
+
+v_cos_f16 v5, tba_lo
+// CHECK: [0x6c,0x94,0x0a,0x7e]
+
+v_cos_f16 v5, tba_hi
+// CHECK: [0x6d,0x94,0x0a,0x7e]
+
+v_cos_f16 v5, tma_lo
+// CHECK: [0x6e,0x94,0x0a,0x7e]
+
+v_cos_f16 v5, tma_hi
+// CHECK: [0x6f,0x94,0x0a,0x7e]
+
+v_cos_f16 v5, ttmp11
+// CHECK: [0x7b,0x94,0x0a,0x7e]
+
+v_cos_f16 v5, m0
+// CHECK: [0x7c,0x94,0x0a,0x7e]
+
+v_cos_f16 v5, exec_lo
+// CHECK: [0x7e,0x94,0x0a,0x7e]
+
+v_cos_f16 v5, exec_hi
+// CHECK: [0x7f,0x94,0x0a,0x7e]
+
+v_cos_f16 v5, 0
+// CHECK: [0x80,0x94,0x0a,0x7e]
+
+v_cos_f16 v5, -1
+// CHECK: [0xc1,0x94,0x0a,0x7e]
+
+v_cos_f16 v5, 0.5
+// CHECK: [0xf0,0x94,0x0a,0x7e]
+
+v_cos_f16 v5, -4.0
+// CHECK: [0xf7,0x94,0x0a,0x7e]
+
+v_cos_f16 v5, 0xfe0b
+// CHECK: [0xff,0x94,0x0a,0x7e,0x0b,0xfe,0x00,0x00]
+
+v_cos_f16 v5, 0x3456
+// CHECK: [0xff,0x94,0x0a,0x7e,0x56,0x34,0x00,0x00]
+
+v_cos_f16 v5, v1
+// CHECK: [0x01,0x95,0x0a,0x7e]
+
+v_cos_f16 v5, v255
+// CHECK: [0xff,0x95,0x0a,0x7e]
+
+v_cos_f16_e64 v5, s1
+// CHECK: [0x05,0x00,0x8a,0xd1,0x01,0x00,0x00,0x00]
+
+v_cos_f16_e64 v255, s1
+// CHECK: [0xff,0x00,0x8a,0xd1,0x01,0x00,0x00,0x00]
+
+v_cos_f16_e64 v5, s101
+// CHECK: [0x05,0x00,0x8a,0xd1,0x65,0x00,0x00,0x00]
+
+v_cos_f16_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x8a,0xd1,0x66,0x00,0x00,0x00]
+
+v_cos_f16_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x8a,0xd1,0x67,0x00,0x00,0x00]
+
+v_cos_f16_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x8a,0xd1,0x6a,0x00,0x00,0x00]
+
+v_cos_f16_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x8a,0xd1,0x6b,0x00,0x00,0x00]
+
+v_cos_f16_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x8a,0xd1,0x6c,0x00,0x00,0x00]
+
+v_cos_f16_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x8a,0xd1,0x6d,0x00,0x00,0x00]
+
+v_cos_f16_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x8a,0xd1,0x6e,0x00,0x00,0x00]
+
+v_cos_f16_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x8a,0xd1,0x6f,0x00,0x00,0x00]
+
+v_cos_f16_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x8a,0xd1,0x7b,0x00,0x00,0x00]
+
+v_cos_f16_e64 v5, m0
+// CHECK: [0x05,0x00,0x8a,0xd1,0x7c,0x00,0x00,0x00]
+
+v_cos_f16_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x8a,0xd1,0x7e,0x00,0x00,0x00]
+
+v_cos_f16_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x8a,0xd1,0x7f,0x00,0x00,0x00]
+
+v_cos_f16_e64 v5, scc
+// CHECK: [0x05,0x00,0x8a,0xd1,0xfd,0x00,0x00,0x00]
+
+v_cos_f16_e64 v5, v1
+// CHECK: [0x05,0x00,0x8a,0xd1,0x01,0x01,0x00,0x00]
+
+v_cos_f16_e64 v5, v255
+// CHECK: [0x05,0x00,0x8a,0xd1,0xff,0x01,0x00,0x00]
+
+v_cos_f16_e64 v5, -s1
+// CHECK: [0x05,0x00,0x8a,0xd1,0x01,0x00,0x00,0x20]
+
+v_cos_f16_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x8a,0xd1,0x01,0x00,0x00,0x00]
+
+v_cos_f16_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x8a,0xd1,0x01,0x00,0x00,0x00]
+
+v_exp_legacy_f32 v5, s1
+// CHECK: [0x01,0x96,0x0a,0x7e]
+
+v_exp_legacy_f32 v255, s1
+// CHECK: [0x01,0x96,0xfe,0x7f]
+
+v_exp_legacy_f32 v5, s101
+// CHECK: [0x65,0x96,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x96,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x96,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, vcc_lo
+// CHECK: [0x6a,0x96,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, vcc_hi
+// CHECK: [0x6b,0x96,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, tba_lo
+// CHECK: [0x6c,0x96,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, tba_hi
+// CHECK: [0x6d,0x96,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, tma_lo
+// CHECK: [0x6e,0x96,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, tma_hi
+// CHECK: [0x6f,0x96,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, ttmp11
+// CHECK: [0x7b,0x96,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, m0
+// CHECK: [0x7c,0x96,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, exec_lo
+// CHECK: [0x7e,0x96,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, exec_hi
+// CHECK: [0x7f,0x96,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, 0
+// CHECK: [0x80,0x96,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, -1
+// CHECK: [0xc1,0x96,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, 0.5
+// CHECK: [0xf0,0x96,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, -4.0
+// CHECK: [0xf7,0x96,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, 0xaf123456
+// CHECK: [0xff,0x96,0x0a,0x7e,0x56,0x34,0x12,0xaf]
+
+v_exp_legacy_f32 v5, 0x3f717273
+// CHECK: [0xff,0x96,0x0a,0x7e,0x73,0x72,0x71,0x3f]
+
+v_exp_legacy_f32 v5, v1
+// CHECK: [0x01,0x97,0x0a,0x7e]
+
+v_exp_legacy_f32 v5, v255
+// CHECK: [0xff,0x97,0x0a,0x7e]
+
+v_exp_legacy_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x8b,0xd1,0x01,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x8b,0xd1,0x01,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x8b,0xd1,0x65,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x8b,0xd1,0x66,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x8b,0xd1,0x67,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x8b,0xd1,0x6a,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x8b,0xd1,0x6b,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x8b,0xd1,0x6c,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x8b,0xd1,0x6d,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x8b,0xd1,0x6e,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x8b,0xd1,0x6f,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x8b,0xd1,0x7b,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x8b,0xd1,0x7c,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x8b,0xd1,0x7e,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x8b,0xd1,0x7f,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x8b,0xd1,0xfd,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x8b,0xd1,0x01,0x01,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x8b,0xd1,0xff,0x01,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x8b,0xd1,0x01,0x00,0x00,0x20]
+
+v_exp_legacy_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x8b,0xd1,0x01,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x8b,0xd1,0x01,0x00,0x00,0x00]
+
+v_exp_legacy_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x8b,0xd1,0x01,0x00,0x00,0x08]
+
+v_exp_legacy_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x8b,0xd1,0x01,0x00,0x00,0x10]
+
+v_exp_legacy_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x8b,0xd1,0x01,0x00,0x00,0x18]
+
+v_log_legacy_f32 v5, s1
+// CHECK: [0x01,0x98,0x0a,0x7e]
+
+v_log_legacy_f32 v255, s1
+// CHECK: [0x01,0x98,0xfe,0x7f]
+
+v_log_legacy_f32 v5, s101
+// CHECK: [0x65,0x98,0x0a,0x7e]
+
+v_log_legacy_f32 v5, flat_scratch_lo
+// CHECK: [0x66,0x98,0x0a,0x7e]
+
+v_log_legacy_f32 v5, flat_scratch_hi
+// CHECK: [0x67,0x98,0x0a,0x7e]
+
+v_log_legacy_f32 v5, vcc_lo
+// CHECK: [0x6a,0x98,0x0a,0x7e]
+
+v_log_legacy_f32 v5, vcc_hi
+// CHECK: [0x6b,0x98,0x0a,0x7e]
+
+v_log_legacy_f32 v5, tba_lo
+// CHECK: [0x6c,0x98,0x0a,0x7e]
+
+v_log_legacy_f32 v5, tba_hi
+// CHECK: [0x6d,0x98,0x0a,0x7e]
+
+v_log_legacy_f32 v5, tma_lo
+// CHECK: [0x6e,0x98,0x0a,0x7e]
+
+v_log_legacy_f32 v5, tma_hi
+// CHECK: [0x6f,0x98,0x0a,0x7e]
+
+v_log_legacy_f32 v5, ttmp11
+// CHECK: [0x7b,0x98,0x0a,0x7e]
+
+v_log_legacy_f32 v5, m0
+// CHECK: [0x7c,0x98,0x0a,0x7e]
+
+v_log_legacy_f32 v5, exec_lo
+// CHECK: [0x7e,0x98,0x0a,0x7e]
+
+v_log_legacy_f32 v5, exec_hi
+// CHECK: [0x7f,0x98,0x0a,0x7e]
+
+v_log_legacy_f32 v5, 0
+// CHECK: [0x80,0x98,0x0a,0x7e]
+
+v_log_legacy_f32 v5, -1
+// CHECK: [0xc1,0x98,0x0a,0x7e]
+
+v_log_legacy_f32 v5, 0.5
+// CHECK: [0xf0,0x98,0x0a,0x7e]
+
+v_log_legacy_f32 v5, -4.0
+// CHECK: [0xf7,0x98,0x0a,0x7e]
+
+v_log_legacy_f32 v5, 0xaf123456
+// CHECK: [0xff,0x98,0x0a,0x7e,0x56,0x34,0x12,0xaf]
+
+v_log_legacy_f32 v5, 0x3f717273
+// CHECK: [0xff,0x98,0x0a,0x7e,0x73,0x72,0x71,0x3f]
+
+v_log_legacy_f32 v5, v1
+// CHECK: [0x01,0x99,0x0a,0x7e]
+
+v_log_legacy_f32 v5, v255
+// CHECK: [0xff,0x99,0x0a,0x7e]
+
+v_log_legacy_f32_e64 v5, s1
+// CHECK: [0x05,0x00,0x8c,0xd1,0x01,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v255, s1
+// CHECK: [0xff,0x00,0x8c,0xd1,0x01,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, s101
+// CHECK: [0x05,0x00,0x8c,0xd1,0x65,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, flat_scratch_lo
+// CHECK: [0x05,0x00,0x8c,0xd1,0x66,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, flat_scratch_hi
+// CHECK: [0x05,0x00,0x8c,0xd1,0x67,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, vcc_lo
+// CHECK: [0x05,0x00,0x8c,0xd1,0x6a,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, vcc_hi
+// CHECK: [0x05,0x00,0x8c,0xd1,0x6b,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, tba_lo
+// CHECK: [0x05,0x00,0x8c,0xd1,0x6c,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, tba_hi
+// CHECK: [0x05,0x00,0x8c,0xd1,0x6d,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, tma_lo
+// CHECK: [0x05,0x00,0x8c,0xd1,0x6e,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, tma_hi
+// CHECK: [0x05,0x00,0x8c,0xd1,0x6f,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, ttmp11
+// CHECK: [0x05,0x00,0x8c,0xd1,0x7b,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, m0
+// CHECK: [0x05,0x00,0x8c,0xd1,0x7c,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, exec_lo
+// CHECK: [0x05,0x00,0x8c,0xd1,0x7e,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, exec_hi
+// CHECK: [0x05,0x00,0x8c,0xd1,0x7f,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, scc
+// CHECK: [0x05,0x00,0x8c,0xd1,0xfd,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, v1
+// CHECK: [0x05,0x00,0x8c,0xd1,0x01,0x01,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, v255
+// CHECK: [0x05,0x00,0x8c,0xd1,0xff,0x01,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, -s1
+// CHECK: [0x05,0x00,0x8c,0xd1,0x01,0x00,0x00,0x20]
+
+v_log_legacy_f32_e64 v5, |s1|
+// CHECK: [0x05,0x01,0x8c,0xd1,0x01,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x8c,0xd1,0x01,0x00,0x00,0x00]
+
+v_log_legacy_f32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x8c,0xd1,0x01,0x00,0x00,0x08]
+
+v_log_legacy_f32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x8c,0xd1,0x01,0x00,0x00,0x10]
+
+v_log_legacy_f32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x8c,0xd1,0x01,0x00,0x00,0x18]
+
+v_cndmask_b32 v5, 0, v2, vcc
+// CHECK: [0x80,0x04,0x0a,0x00]
+
+v_cndmask_b32 v255, 0, v2, vcc
+// CHECK: [0x80,0x04,0xfe,0x01]
+
+v_cndmask_b32 v5, -1, v2, vcc
+// CHECK: [0xc1,0x04,0x0a,0x00]
+
+v_cndmask_b32 v5, 0.5, v2, vcc
+// CHECK: [0xf0,0x04,0x0a,0x00]
+
+v_cndmask_b32 v5, -4.0, v2, vcc
+// CHECK: [0xf7,0x04,0x0a,0x00]
+
+v_cndmask_b32 v5, v1, v2, vcc
+// CHECK: [0x01,0x05,0x0a,0x00]
+
+v_cndmask_b32 v5, v255, v2, vcc
+// CHECK: [0xff,0x05,0x0a,0x00]
+
+v_cndmask_b32 v5, 0, v255, vcc
+// CHECK: [0x80,0xfe,0x0b,0x00]
+
+v_cndmask_b32_e64 v5, 0, 0, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd1,0x80,0x00,0x19,0x00]
+
+v_cndmask_b32_e64 v255, 0, 0, s[6:7]
+// CHECK: [0xff,0x00,0x00,0xd1,0x80,0x00,0x19,0x00]
+
+v_cndmask_b32_e64 v5, -1, 0, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd1,0xc1,0x00,0x19,0x00]
+
+v_cndmask_b32_e64 v5, 0.5, 0, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd1,0xf0,0x00,0x19,0x00]
+
+v_cndmask_b32_e64 v5, -4.0, 0, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd1,0xf7,0x00,0x19,0x00]
+
+v_cndmask_b32_e64 v5, v1, 0, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd1,0x01,0x01,0x19,0x00]
+
+v_cndmask_b32_e64 v5, v255, 0, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd1,0xff,0x01,0x19,0x00]
+
+v_cndmask_b32_e64 v5, 0, -1, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd1,0x80,0x82,0x19,0x00]
+
+v_cndmask_b32_e64 v5, 0, 0.5, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd1,0x80,0xe0,0x19,0x00]
+
+v_cndmask_b32_e64 v5, 0, -4.0, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd1,0x80,0xee,0x19,0x00]
+
+v_cndmask_b32_e64 v5, 0, v2, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd1,0x80,0x04,0x1a,0x00]
+
+v_cndmask_b32_e64 v5, 0, v255, s[6:7]
+// CHECK: [0x05,0x00,0x00,0xd1,0x80,0xfe,0x1b,0x00]
+
+v_cndmask_b32_e64 v5, 0, 0, s[8:9]
+// CHECK: [0x05,0x00,0x00,0xd1,0x80,0x00,0x21,0x00]
+
+v_cndmask_b32_e64 v5, 0, 0, s[100:101]
+// CHECK: [0x05,0x00,0x00,0xd1,0x80,0x00,0x91,0x01]
+
+v_cndmask_b32_e64 v5, 0, 0, flat_scratch
+// CHECK: [0x05,0x00,0x00,0xd1,0x80,0x00,0x99,0x01]
+
+v_cndmask_b32_e64 v5, 0, 0, vcc
+// CHECK: [0x05,0x00,0x00,0xd1,0x80,0x00,0xa9,0x01]
+
+v_cndmask_b32_e64 v5, 0, 0, tba
+// CHECK: [0x05,0x00,0x00,0xd1,0x80,0x00,0xb1,0x01]
+
+v_cndmask_b32_e64 v5, 0, 0, tma
+// CHECK: [0x05,0x00,0x00,0xd1,0x80,0x00,0xb9,0x01]
+
+v_cndmask_b32_e64 v5, 0, 0, ttmp[10:11]
+// CHECK: [0x05,0x00,0x00,0xd1,0x80,0x00,0xe9,0x01]
+
+v_add_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x02]
+
+v_add_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x03]
+
+v_add_f32 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x02]
+
+v_add_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x02]
+
+v_add_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x02]
+
+v_add_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x02]
+
+v_add_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x02]
+
+v_add_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x02]
+
+v_add_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x02]
+
+v_add_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x02]
+
+v_add_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x02]
+
+v_add_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x02]
+
+v_add_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x02]
+
+v_add_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x02]
+
+v_add_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x02]
+
+v_add_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x02]
+
+v_add_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x02]
+
+v_add_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x02]
+
+v_add_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x02]
+
+v_add_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x02,0x56,0x34,0x12,0xaf]
+
+v_add_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x02,0x73,0x72,0x71,0x3f]
+
+v_add_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x02]
+
+v_add_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x02]
+
+v_add_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x02]
+
+v_add_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0x05,0x00,0x00]
+
+v_add_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x01,0xd1,0x01,0x05,0x00,0x00]
+
+v_add_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x01,0xd1,0xff,0x05,0x00,0x00]
+
+v_add_f32_e64 v5, v1, s101
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0xcb,0x00,0x00]
+
+v_add_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0xcd,0x00,0x00]
+
+v_add_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0xcf,0x00,0x00]
+
+v_add_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0xd5,0x00,0x00]
+
+v_add_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0xd7,0x00,0x00]
+
+v_add_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0xd9,0x00,0x00]
+
+v_add_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0xdb,0x00,0x00]
+
+v_add_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0xdd,0x00,0x00]
+
+v_add_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0xdf,0x00,0x00]
+
+v_add_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0xf7,0x00,0x00]
+
+v_add_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0xf9,0x00,0x00]
+
+v_add_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0xfd,0x00,0x00]
+
+v_add_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0xff,0x00,0x00]
+
+v_add_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0xfb,0x01,0x00]
+
+v_add_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0x05,0x02,0x00]
+
+v_add_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0xff,0x03,0x00]
+
+v_add_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0x05,0x00,0x20]
+
+v_add_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0x05,0x00,0x40]
+
+v_add_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0x05,0x00,0x60]
+
+v_add_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x01,0xd1,0x01,0x05,0x00,0x00]
+
+v_add_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x01,0xd1,0x01,0x05,0x00,0x00]
+
+v_add_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x01,0xd1,0x01,0x05,0x00,0x00]
+
+v_add_f32_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x01,0xd1,0x01,0x05,0x00,0x00]
+
+v_add_f32_e64 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0x05,0x00,0x08]
+
+v_add_f32_e64 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0x05,0x00,0x10]
+
+v_add_f32_e64 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x01,0xd1,0x01,0x05,0x00,0x18]
+
+v_sub_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x04]
+
+v_sub_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x05]
+
+v_sub_f32 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x04]
+
+v_sub_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x04]
+
+v_sub_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x04]
+
+v_sub_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x04]
+
+v_sub_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x04]
+
+v_sub_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x04]
+
+v_sub_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x04]
+
+v_sub_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x04]
+
+v_sub_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x04]
+
+v_sub_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x04]
+
+v_sub_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x04]
+
+v_sub_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x04]
+
+v_sub_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x04]
+
+v_sub_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x04]
+
+v_sub_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x04]
+
+v_sub_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x04]
+
+v_sub_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x04]
+
+v_sub_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x04,0x56,0x34,0x12,0xaf]
+
+v_sub_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x04,0x73,0x72,0x71,0x3f]
+
+v_sub_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x04]
+
+v_sub_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x04]
+
+v_sub_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x04]
+
+v_sub_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0x05,0x00,0x00]
+
+v_sub_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x02,0xd1,0x01,0x05,0x00,0x00]
+
+v_sub_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x02,0xd1,0xff,0x05,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, s101
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0xcb,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0xcd,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0xcf,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0xd5,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0xd7,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0xd9,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0xdb,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0xdd,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0xdf,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0xf7,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0xf9,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0xfd,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0xff,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0xfb,0x01,0x00]
+
+v_sub_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0x05,0x02,0x00]
+
+v_sub_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0xff,0x03,0x00]
+
+v_sub_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0x05,0x00,0x20]
+
+v_sub_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0x05,0x00,0x40]
+
+v_sub_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0x05,0x00,0x60]
+
+v_sub_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x02,0xd1,0x01,0x05,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x02,0xd1,0x01,0x05,0x00,0x00]
+
+v_sub_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x02,0xd1,0x01,0x05,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x02,0xd1,0x01,0x05,0x00,0x00]
+
+v_sub_f32_e64 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0x05,0x00,0x08]
+
+v_sub_f32_e64 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0x05,0x00,0x10]
+
+v_sub_f32_e64 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x02,0xd1,0x01,0x05,0x00,0x18]
+
+v_subrev_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x06]
+
+v_subrev_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x07]
+
+v_subrev_f32 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x06]
+
+v_subrev_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x06]
+
+v_subrev_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x06]
+
+v_subrev_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x06]
+
+v_subrev_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x06]
+
+v_subrev_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x06]
+
+v_subrev_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x06]
+
+v_subrev_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x06]
+
+v_subrev_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x06]
+
+v_subrev_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x06]
+
+v_subrev_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x06]
+
+v_subrev_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x06]
+
+v_subrev_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x06]
+
+v_subrev_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x06]
+
+v_subrev_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x06]
+
+v_subrev_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x06]
+
+v_subrev_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x06]
+
+v_subrev_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x06,0x56,0x34,0x12,0xaf]
+
+v_subrev_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x06,0x73,0x72,0x71,0x3f]
+
+v_subrev_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x06]
+
+v_subrev_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x06]
+
+v_subrev_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x06]
+
+v_subrev_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0x05,0x00,0x00]
+
+v_subrev_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x03,0xd1,0x01,0x05,0x00,0x00]
+
+v_subrev_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x03,0xd1,0xff,0x05,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, s101
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0xcb,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0xcd,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0xcf,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0xd5,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0xd7,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0xd9,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0xdb,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0xdd,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0xdf,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0xf7,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0xf9,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0xfd,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0xff,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0xfb,0x01,0x00]
+
+v_subrev_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0x05,0x02,0x00]
+
+v_subrev_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0xff,0x03,0x00]
+
+v_subrev_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0x05,0x00,0x20]
+
+v_subrev_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0x05,0x00,0x40]
+
+v_subrev_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0x05,0x00,0x60]
+
+v_subrev_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x03,0xd1,0x01,0x05,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x03,0xd1,0x01,0x05,0x00,0x00]
+
+v_subrev_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x03,0xd1,0x01,0x05,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x03,0xd1,0x01,0x05,0x00,0x00]
+
+v_subrev_f32_e64 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0x05,0x00,0x08]
+
+v_subrev_f32_e64 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0x05,0x00,0x10]
+
+v_subrev_f32_e64 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x03,0xd1,0x01,0x05,0x00,0x18]
+
+v_mul_legacy_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x08]
+
+v_mul_legacy_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x09]
+
+v_mul_legacy_f32 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x08]
+
+v_mul_legacy_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x08]
+
+v_mul_legacy_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x08]
+
+v_mul_legacy_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x08]
+
+v_mul_legacy_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x08]
+
+v_mul_legacy_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x08]
+
+v_mul_legacy_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x08]
+
+v_mul_legacy_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x08]
+
+v_mul_legacy_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x08]
+
+v_mul_legacy_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x08]
+
+v_mul_legacy_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x08]
+
+v_mul_legacy_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x08]
+
+v_mul_legacy_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x08]
+
+v_mul_legacy_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x08]
+
+v_mul_legacy_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x08]
+
+v_mul_legacy_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x08]
+
+v_mul_legacy_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x08]
+
+v_mul_legacy_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x08,0x56,0x34,0x12,0xaf]
+
+v_mul_legacy_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x08,0x73,0x72,0x71,0x3f]
+
+v_mul_legacy_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x08]
+
+v_mul_legacy_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x08]
+
+v_mul_legacy_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x08]
+
+v_mul_legacy_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_legacy_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x04,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x04,0xd1,0xff,0x05,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, s101
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0xcb,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0xcd,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0xcf,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0xd5,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0xd7,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0xd9,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0xdb,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0xdd,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0xdf,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0xf7,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0xf9,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0xfd,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0xff,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0xfb,0x01,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0x05,0x02,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0xff,0x03,0x00]
+
+v_mul_legacy_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0x05,0x00,0x20]
+
+v_mul_legacy_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0x05,0x00,0x40]
+
+v_mul_legacy_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0x05,0x00,0x60]
+
+v_mul_legacy_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x04,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x04,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x04,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x04,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_legacy_f32_e64 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0x05,0x00,0x08]
+
+v_mul_legacy_f32_e64 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0x05,0x00,0x10]
+
+v_mul_legacy_f32_e64 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x04,0xd1,0x01,0x05,0x00,0x18]
+
+v_mul_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x0a]
+
+v_mul_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x0b]
+
+v_mul_f32 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x0a]
+
+v_mul_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x0a]
+
+v_mul_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x0a]
+
+v_mul_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x0a]
+
+v_mul_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x0a]
+
+v_mul_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x0a]
+
+v_mul_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x0a]
+
+v_mul_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x0a]
+
+v_mul_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x0a]
+
+v_mul_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x0a]
+
+v_mul_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x0a]
+
+v_mul_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x0a]
+
+v_mul_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x0a]
+
+v_mul_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x0a]
+
+v_mul_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x0a]
+
+v_mul_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x0a]
+
+v_mul_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x0a]
+
+v_mul_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x0a,0x56,0x34,0x12,0xaf]
+
+v_mul_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x0a,0x73,0x72,0x71,0x3f]
+
+v_mul_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x0a]
+
+v_mul_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x0a]
+
+v_mul_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x0a]
+
+v_mul_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x05,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x05,0xd1,0xff,0x05,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, s101
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0xcb,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0xcd,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0xcf,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0xd5,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0xd7,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0xd9,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0xdb,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0xdd,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0xdf,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0xf7,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0xf9,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0xfd,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0xff,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0xfb,0x01,0x00]
+
+v_mul_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0x05,0x02,0x00]
+
+v_mul_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0xff,0x03,0x00]
+
+v_mul_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0x05,0x00,0x20]
+
+v_mul_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0x05,0x00,0x40]
+
+v_mul_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0x05,0x00,0x60]
+
+v_mul_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x05,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x05,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x05,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x05,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_f32_e64 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0x05,0x00,0x08]
+
+v_mul_f32_e64 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0x05,0x00,0x10]
+
+v_mul_f32_e64 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x05,0xd1,0x01,0x05,0x00,0x18]
+
+v_mul_i32_i24 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x0c]
+
+v_mul_i32_i24 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x0d]
+
+v_mul_i32_i24 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x0c]
+
+v_mul_i32_i24 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x0c]
+
+v_mul_i32_i24 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x0c]
+
+v_mul_i32_i24 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x0c]
+
+v_mul_i32_i24 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x0c]
+
+v_mul_i32_i24 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x0c]
+
+v_mul_i32_i24 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x0c]
+
+v_mul_i32_i24 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x0c]
+
+v_mul_i32_i24 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x0c]
+
+v_mul_i32_i24 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x0c]
+
+v_mul_i32_i24 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x0c]
+
+v_mul_i32_i24 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x0c]
+
+v_mul_i32_i24 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x0c]
+
+v_mul_i32_i24 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x0c]
+
+v_mul_i32_i24 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x0c]
+
+v_mul_i32_i24 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x0c]
+
+v_mul_i32_i24 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x0c]
+
+v_mul_i32_i24 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x0c,0x56,0x34,0x12,0xaf]
+
+v_mul_i32_i24 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x0c,0x73,0x72,0x71,0x3f]
+
+v_mul_i32_i24 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x0c]
+
+v_mul_i32_i24 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x0c]
+
+v_mul_i32_i24 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x0c]
+
+v_mul_i32_i24_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0x04,0x00,0x00]
+
+v_mul_i32_i24_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x06,0xd1,0x80,0x04,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x06,0xd1,0xc1,0x04,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x06,0xd1,0xf0,0x04,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x06,0xd1,0xf7,0x04,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x06,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x06,0xd1,0xff,0x05,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0xca,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0xcc,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0xce,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0xd4,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0xd6,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0xd8,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0xda,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0xdc,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0xde,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0xf6,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0xf8,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0xfc,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0xfe,0x00,0x00]
+
+v_mul_i32_i24_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0x00,0x01,0x00]
+
+v_mul_i32_i24_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0x82,0x01,0x00]
+
+v_mul_i32_i24_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0xe0,0x01,0x00]
+
+v_mul_i32_i24_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0xee,0x01,0x00]
+
+v_mul_i32_i24_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0x04,0x02,0x00]
+
+v_mul_i32_i24_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x06,0xd1,0x80,0xfe,0x03,0x00]
+
+v_mul_hi_i32_i24 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x0f]
+
+v_mul_hi_i32_i24 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x0e,0x56,0x34,0x12,0xaf]
+
+v_mul_hi_i32_i24 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x0e,0x73,0x72,0x71,0x3f]
+
+v_mul_hi_i32_i24 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x0e]
+
+v_mul_hi_i32_i24 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x0e]
+
+v_mul_hi_i32_i24_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0x04,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x07,0xd1,0x80,0x04,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x07,0xd1,0xc1,0x04,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x07,0xd1,0xf0,0x04,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x07,0xd1,0xf7,0x04,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x07,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x07,0xd1,0xff,0x05,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0xca,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0xcc,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0xce,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0xd4,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0xd6,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0xd8,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0xda,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0xdc,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0xde,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0xf6,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0xf8,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0xfc,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0xfe,0x00,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0x00,0x01,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0x82,0x01,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0xe0,0x01,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0xee,0x01,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0x04,0x02,0x00]
+
+v_mul_hi_i32_i24_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x07,0xd1,0x80,0xfe,0x03,0x00]
+
+v_mul_u32_u24 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x10]
+
+v_mul_u32_u24 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x11]
+
+v_mul_u32_u24 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x10]
+
+v_mul_u32_u24 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x10]
+
+v_mul_u32_u24 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x10]
+
+v_mul_u32_u24 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x10]
+
+v_mul_u32_u24 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x10]
+
+v_mul_u32_u24 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x10]
+
+v_mul_u32_u24 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x10]
+
+v_mul_u32_u24 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x10]
+
+v_mul_u32_u24 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x10]
+
+v_mul_u32_u24 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x10]
+
+v_mul_u32_u24 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x10]
+
+v_mul_u32_u24 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x10]
+
+v_mul_u32_u24 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x10]
+
+v_mul_u32_u24 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x10]
+
+v_mul_u32_u24 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x10]
+
+v_mul_u32_u24 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x10]
+
+v_mul_u32_u24 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x10]
+
+v_mul_u32_u24 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x10,0x56,0x34,0x12,0xaf]
+
+v_mul_u32_u24 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x10,0x73,0x72,0x71,0x3f]
+
+v_mul_u32_u24 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x10]
+
+v_mul_u32_u24 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x10]
+
+v_mul_u32_u24 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x10]
+
+v_mul_u32_u24_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0x04,0x00,0x00]
+
+v_mul_u32_u24_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x08,0xd1,0x80,0x04,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x08,0xd1,0xc1,0x04,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x08,0xd1,0xf0,0x04,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x08,0xd1,0xf7,0x04,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x08,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x08,0xd1,0xff,0x05,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0xca,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0xcc,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0xce,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0xd4,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0xd6,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0xd8,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0xda,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0xdc,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0xde,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0xf6,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0xf8,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0xfc,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0xfe,0x00,0x00]
+
+v_mul_u32_u24_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0x00,0x01,0x00]
+
+v_mul_u32_u24_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0x82,0x01,0x00]
+
+v_mul_u32_u24_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0xe0,0x01,0x00]
+
+v_mul_u32_u24_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0xee,0x01,0x00]
+
+v_mul_u32_u24_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0x04,0x02,0x00]
+
+v_mul_u32_u24_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x08,0xd1,0x80,0xfe,0x03,0x00]
+
+v_mul_hi_u32_u24 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x12]
+
+v_mul_hi_u32_u24 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x13]
+
+v_mul_hi_u32_u24 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x12]
+
+v_mul_hi_u32_u24 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x12]
+
+v_mul_hi_u32_u24 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x12]
+
+v_mul_hi_u32_u24 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x12]
+
+v_mul_hi_u32_u24 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x12]
+
+v_mul_hi_u32_u24 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x12]
+
+v_mul_hi_u32_u24 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x12]
+
+v_mul_hi_u32_u24 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x12]
+
+v_mul_hi_u32_u24 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x12]
+
+v_mul_hi_u32_u24 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x12]
+
+v_mul_hi_u32_u24 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x12]
+
+v_mul_hi_u32_u24 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x12]
+
+v_mul_hi_u32_u24 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x12]
+
+v_mul_hi_u32_u24 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x12]
+
+v_mul_hi_u32_u24 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x12]
+
+v_mul_hi_u32_u24 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x12]
+
+v_mul_hi_u32_u24 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x12]
+
+v_mul_hi_u32_u24 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x12,0x56,0x34,0x12,0xaf]
+
+v_mul_hi_u32_u24 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x12,0x73,0x72,0x71,0x3f]
+
+v_mul_hi_u32_u24 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x12]
+
+v_mul_hi_u32_u24 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x12]
+
+v_mul_hi_u32_u24 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x12]
+
+v_mul_hi_u32_u24_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0x04,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x09,0xd1,0x80,0x04,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x09,0xd1,0xc1,0x04,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x09,0xd1,0xf0,0x04,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x09,0xd1,0xf7,0x04,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x09,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x09,0xd1,0xff,0x05,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0xca,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0xcc,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0xce,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0xd4,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0xd6,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0xd8,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0xda,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0xdc,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0xde,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0xf6,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0xf8,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0xfc,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0xfe,0x00,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0x00,0x01,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0x82,0x01,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0xe0,0x01,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0xee,0x01,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0x04,0x02,0x00]
+
+v_mul_hi_u32_u24_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x09,0xd1,0x80,0xfe,0x03,0x00]
+
+v_min_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x14]
+
+v_min_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x15]
+
+v_min_f32 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x14]
+
+v_min_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x14]
+
+v_min_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x14]
+
+v_min_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x14]
+
+v_min_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x14]
+
+v_min_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x14]
+
+v_min_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x14]
+
+v_min_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x14]
+
+v_min_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x14]
+
+v_min_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x14]
+
+v_min_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x14]
+
+v_min_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x14]
+
+v_min_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x14]
+
+v_min_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x14]
+
+v_min_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x14]
+
+v_min_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x14]
+
+v_min_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x14]
+
+v_min_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x14,0x56,0x34,0x12,0xaf]
+
+v_min_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x14,0x73,0x72,0x71,0x3f]
+
+v_min_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x14]
+
+v_min_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x14]
+
+v_min_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x14]
+
+v_min_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0x05,0x00,0x00]
+
+v_min_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x0a,0xd1,0x01,0x05,0x00,0x00]
+
+v_min_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x0a,0xd1,0xff,0x05,0x00,0x00]
+
+v_min_f32_e64 v5, v1, s101
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0xcb,0x00,0x00]
+
+v_min_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0xcd,0x00,0x00]
+
+v_min_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0xcf,0x00,0x00]
+
+v_min_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0xd5,0x00,0x00]
+
+v_min_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0xd7,0x00,0x00]
+
+v_min_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0xd9,0x00,0x00]
+
+v_min_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0xdb,0x00,0x00]
+
+v_min_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0xdd,0x00,0x00]
+
+v_min_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0xdf,0x00,0x00]
+
+v_min_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0xf7,0x00,0x00]
+
+v_min_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0xf9,0x00,0x00]
+
+v_min_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0xfd,0x00,0x00]
+
+v_min_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0xff,0x00,0x00]
+
+v_min_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0xfb,0x01,0x00]
+
+v_min_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0x05,0x02,0x00]
+
+v_min_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0xff,0x03,0x00]
+
+v_min_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0x05,0x00,0x20]
+
+v_min_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0x05,0x00,0x40]
+
+v_min_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0x05,0x00,0x60]
+
+v_min_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x0a,0xd1,0x01,0x05,0x00,0x00]
+
+v_min_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x0a,0xd1,0x01,0x05,0x00,0x00]
+
+v_min_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x0a,0xd1,0x01,0x05,0x00,0x00]
+
+v_min_f32_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x0a,0xd1,0x01,0x05,0x00,0x00]
+
+v_min_f32_e64 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0x05,0x00,0x08]
+
+v_min_f32_e64 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0x05,0x00,0x10]
+
+v_min_f32_e64 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x0a,0xd1,0x01,0x05,0x00,0x18]
+
+v_max_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x16]
+
+v_max_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x17]
+
+v_max_f32 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x16]
+
+v_max_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x16]
+
+v_max_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x16]
+
+v_max_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x16]
+
+v_max_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x16]
+
+v_max_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x16]
+
+v_max_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x16]
+
+v_max_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x16]
+
+v_max_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x16]
+
+v_max_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x16]
+
+v_max_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x16]
+
+v_max_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x16]
+
+v_max_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x16]
+
+v_max_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x16]
+
+v_max_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x16]
+
+v_max_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x16]
+
+v_max_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x16]
+
+v_max_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x16,0x56,0x34,0x12,0xaf]
+
+v_max_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x16,0x73,0x72,0x71,0x3f]
+
+v_max_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x16]
+
+v_max_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x16]
+
+v_max_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x16]
+
+v_max_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0x05,0x00,0x00]
+
+v_max_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x0b,0xd1,0x01,0x05,0x00,0x00]
+
+v_max_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x0b,0xd1,0xff,0x05,0x00,0x00]
+
+v_max_f32_e64 v5, v1, s101
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0xcb,0x00,0x00]
+
+v_max_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0xcd,0x00,0x00]
+
+v_max_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0xcf,0x00,0x00]
+
+v_max_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0xd5,0x00,0x00]
+
+v_max_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0xd7,0x00,0x00]
+
+v_max_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0xd9,0x00,0x00]
+
+v_max_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0xdb,0x00,0x00]
+
+v_max_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0xdd,0x00,0x00]
+
+v_max_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0xdf,0x00,0x00]
+
+v_max_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0xf7,0x00,0x00]
+
+v_max_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0xf9,0x00,0x00]
+
+v_max_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0xfd,0x00,0x00]
+
+v_max_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0xff,0x00,0x00]
+
+v_max_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0xfb,0x01,0x00]
+
+v_max_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0x05,0x02,0x00]
+
+v_max_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0xff,0x03,0x00]
+
+v_max_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0x05,0x00,0x20]
+
+v_max_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0x05,0x00,0x40]
+
+v_max_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0x05,0x00,0x60]
+
+v_max_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x0b,0xd1,0x01,0x05,0x00,0x00]
+
+v_max_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x0b,0xd1,0x01,0x05,0x00,0x00]
+
+v_max_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x0b,0xd1,0x01,0x05,0x00,0x00]
+
+v_max_f32_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x0b,0xd1,0x01,0x05,0x00,0x00]
+
+v_max_f32_e64 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0x05,0x00,0x08]
+
+v_max_f32_e64 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0x05,0x00,0x10]
+
+v_max_f32_e64 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x0b,0xd1,0x01,0x05,0x00,0x18]
+
+v_min_i32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x18]
+
+v_min_i32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x19]
+
+v_min_i32 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x18]
+
+v_min_i32 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x18]
+
+v_min_i32 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x18]
+
+v_min_i32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x18]
+
+v_min_i32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x18]
+
+v_min_i32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x18]
+
+v_min_i32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x18]
+
+v_min_i32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x18]
+
+v_min_i32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x18]
+
+v_min_i32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x18]
+
+v_min_i32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x18]
+
+v_min_i32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x18]
+
+v_min_i32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x18]
+
+v_min_i32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x18]
+
+v_min_i32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x18]
+
+v_min_i32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x18]
+
+v_min_i32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x18]
+
+v_min_i32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x18,0x56,0x34,0x12,0xaf]
+
+v_min_i32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x18,0x73,0x72,0x71,0x3f]
+
+v_min_i32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x18]
+
+v_min_i32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x18]
+
+v_min_i32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x18]
+
+v_min_i32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0x04,0x00,0x00]
+
+v_min_i32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x0c,0xd1,0x80,0x04,0x00,0x00]
+
+v_min_i32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x0c,0xd1,0xc1,0x04,0x00,0x00]
+
+v_min_i32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x0c,0xd1,0xf0,0x04,0x00,0x00]
+
+v_min_i32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x0c,0xd1,0xf7,0x04,0x00,0x00]
+
+v_min_i32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x0c,0xd1,0x01,0x05,0x00,0x00]
+
+v_min_i32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x0c,0xd1,0xff,0x05,0x00,0x00]
+
+v_min_i32_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0xca,0x00,0x00]
+
+v_min_i32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0xcc,0x00,0x00]
+
+v_min_i32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0xce,0x00,0x00]
+
+v_min_i32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0xd4,0x00,0x00]
+
+v_min_i32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0xd6,0x00,0x00]
+
+v_min_i32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0xd8,0x00,0x00]
+
+v_min_i32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0xda,0x00,0x00]
+
+v_min_i32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0xdc,0x00,0x00]
+
+v_min_i32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0xde,0x00,0x00]
+
+v_min_i32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0xf6,0x00,0x00]
+
+v_min_i32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0xf8,0x00,0x00]
+
+v_min_i32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0xfc,0x00,0x00]
+
+v_min_i32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0xfe,0x00,0x00]
+
+v_min_i32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0x00,0x01,0x00]
+
+v_min_i32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0x82,0x01,0x00]
+
+v_min_i32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0xe0,0x01,0x00]
+
+v_min_i32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0xee,0x01,0x00]
+
+v_min_i32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0x04,0x02,0x00]
+
+v_min_i32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x0c,0xd1,0x80,0xfe,0x03,0x00]
+
+v_max_i32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x1a]
+
+v_max_i32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x1b]
+
+v_max_i32 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x1a]
+
+v_max_i32 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x1a]
+
+v_max_i32 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x1a]
+
+v_max_i32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x1a]
+
+v_max_i32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x1a]
+
+v_max_i32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x1a]
+
+v_max_i32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x1a]
+
+v_max_i32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x1a]
+
+v_max_i32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x1a]
+
+v_max_i32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x1a]
+
+v_max_i32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x1a]
+
+v_max_i32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x1a]
+
+v_max_i32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x1a]
+
+v_max_i32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x1a]
+
+v_max_i32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x1a]
+
+v_max_i32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x1a]
+
+v_max_i32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x1a]
+
+v_max_i32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x1a,0x56,0x34,0x12,0xaf]
+
+v_max_i32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x1a,0x73,0x72,0x71,0x3f]
+
+v_max_i32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x1a]
+
+v_max_i32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x1a]
+
+v_max_i32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x1a]
+
+v_max_i32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0x04,0x00,0x00]
+
+v_max_i32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x0d,0xd1,0x80,0x04,0x00,0x00]
+
+v_max_i32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x0d,0xd1,0xc1,0x04,0x00,0x00]
+
+v_max_i32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x0d,0xd1,0xf0,0x04,0x00,0x00]
+
+v_max_i32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x0d,0xd1,0xf7,0x04,0x00,0x00]
+
+v_max_i32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x0d,0xd1,0x01,0x05,0x00,0x00]
+
+v_max_i32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x0d,0xd1,0xff,0x05,0x00,0x00]
+
+v_max_i32_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0xca,0x00,0x00]
+
+v_max_i32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0xcc,0x00,0x00]
+
+v_max_i32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0xce,0x00,0x00]
+
+v_max_i32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0xd4,0x00,0x00]
+
+v_max_i32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0xd6,0x00,0x00]
+
+v_max_i32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0xd8,0x00,0x00]
+
+v_max_i32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0xda,0x00,0x00]
+
+v_max_i32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0xdc,0x00,0x00]
+
+v_max_i32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0xde,0x00,0x00]
+
+v_max_i32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0xf6,0x00,0x00]
+
+v_max_i32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0xf8,0x00,0x00]
+
+v_max_i32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0xfc,0x00,0x00]
+
+v_max_i32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0xfe,0x00,0x00]
+
+v_max_i32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0x00,0x01,0x00]
+
+v_max_i32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0x82,0x01,0x00]
+
+v_max_i32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0xe0,0x01,0x00]
+
+v_max_i32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0xee,0x01,0x00]
+
+v_max_i32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0x04,0x02,0x00]
+
+v_max_i32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x0d,0xd1,0x80,0xfe,0x03,0x00]
+
+v_min_u32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x1c]
+
+v_min_u32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x1d]
+
+v_min_u32 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x1c]
+
+v_min_u32 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x1c]
+
+v_min_u32 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x1c]
+
+v_min_u32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x1c]
+
+v_min_u32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x1c]
+
+v_min_u32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x1c]
+
+v_min_u32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x1c]
+
+v_min_u32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x1c]
+
+v_min_u32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x1c]
+
+v_min_u32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x1c]
+
+v_min_u32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x1c]
+
+v_min_u32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x1c]
+
+v_min_u32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x1c]
+
+v_min_u32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x1c]
+
+v_min_u32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x1c]
+
+v_min_u32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x1c]
+
+v_min_u32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x1c]
+
+v_min_u32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x1c,0x56,0x34,0x12,0xaf]
+
+v_min_u32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x1c,0x73,0x72,0x71,0x3f]
+
+v_min_u32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x1c]
+
+v_min_u32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x1c]
+
+v_min_u32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x1c]
+
+v_min_u32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0x04,0x00,0x00]
+
+v_min_u32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x0e,0xd1,0x80,0x04,0x00,0x00]
+
+v_min_u32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x0e,0xd1,0xc1,0x04,0x00,0x00]
+
+v_min_u32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x0e,0xd1,0xf0,0x04,0x00,0x00]
+
+v_min_u32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x0e,0xd1,0xf7,0x04,0x00,0x00]
+
+v_min_u32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x0e,0xd1,0x01,0x05,0x00,0x00]
+
+v_min_u32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x0e,0xd1,0xff,0x05,0x00,0x00]
+
+v_min_u32_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0xca,0x00,0x00]
+
+v_min_u32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0xcc,0x00,0x00]
+
+v_min_u32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0xce,0x00,0x00]
+
+v_min_u32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0xd4,0x00,0x00]
+
+v_min_u32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0xd6,0x00,0x00]
+
+v_min_u32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0xd8,0x00,0x00]
+
+v_min_u32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0xda,0x00,0x00]
+
+v_min_u32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0xdc,0x00,0x00]
+
+v_min_u32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0xde,0x00,0x00]
+
+v_min_u32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0xf6,0x00,0x00]
+
+v_min_u32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0xf8,0x00,0x00]
+
+v_min_u32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0xfc,0x00,0x00]
+
+v_min_u32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0xfe,0x00,0x00]
+
+v_min_u32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0x00,0x01,0x00]
+
+v_min_u32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0x82,0x01,0x00]
+
+v_min_u32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0xe0,0x01,0x00]
+
+v_min_u32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0xee,0x01,0x00]
+
+v_min_u32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0x04,0x02,0x00]
+
+v_min_u32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x0e,0xd1,0x80,0xfe,0x03,0x00]
+
+v_max_u32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x1e]
+
+v_max_u32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x1f]
+
+v_max_u32 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x1e]
+
+v_max_u32 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x1e]
+
+v_max_u32 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x1e]
+
+v_max_u32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x1e]
+
+v_max_u32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x1e]
+
+v_max_u32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x1e]
+
+v_max_u32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x1e]
+
+v_max_u32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x1e]
+
+v_max_u32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x1e]
+
+v_max_u32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x1e]
+
+v_max_u32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x1e]
+
+v_max_u32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x1e]
+
+v_max_u32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x1e]
+
+v_max_u32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x1e]
+
+v_max_u32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x1e]
+
+v_max_u32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x1e]
+
+v_max_u32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x1e]
+
+v_max_u32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x1e,0x56,0x34,0x12,0xaf]
+
+v_max_u32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x1e,0x73,0x72,0x71,0x3f]
+
+v_max_u32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x1e]
+
+v_max_u32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x1e]
+
+v_max_u32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x1e]
+
+v_max_u32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0x04,0x00,0x00]
+
+v_max_u32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x0f,0xd1,0x80,0x04,0x00,0x00]
+
+v_max_u32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x0f,0xd1,0xc1,0x04,0x00,0x00]
+
+v_max_u32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x0f,0xd1,0xf0,0x04,0x00,0x00]
+
+v_max_u32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x0f,0xd1,0xf7,0x04,0x00,0x00]
+
+v_max_u32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x0f,0xd1,0x01,0x05,0x00,0x00]
+
+v_max_u32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x0f,0xd1,0xff,0x05,0x00,0x00]
+
+v_max_u32_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0xca,0x00,0x00]
+
+v_max_u32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0xcc,0x00,0x00]
+
+v_max_u32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0xce,0x00,0x00]
+
+v_max_u32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0xd4,0x00,0x00]
+
+v_max_u32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0xd6,0x00,0x00]
+
+v_max_u32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0xd8,0x00,0x00]
+
+v_max_u32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0xda,0x00,0x00]
+
+v_max_u32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0xdc,0x00,0x00]
+
+v_max_u32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0xde,0x00,0x00]
+
+v_max_u32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0xf6,0x00,0x00]
+
+v_max_u32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0xf8,0x00,0x00]
+
+v_max_u32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0xfc,0x00,0x00]
+
+v_max_u32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0xfe,0x00,0x00]
+
+v_max_u32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0x00,0x01,0x00]
+
+v_max_u32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0x82,0x01,0x00]
+
+v_max_u32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0xe0,0x01,0x00]
+
+v_max_u32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0xee,0x01,0x00]
+
+v_max_u32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0x04,0x02,0x00]
+
+v_max_u32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x0f,0xd1,0x80,0xfe,0x03,0x00]
+
+v_lshrrev_b32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x20]
+
+v_lshrrev_b32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x21]
+
+v_lshrrev_b32 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x20]
+
+v_lshrrev_b32 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x20]
+
+v_lshrrev_b32 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x20]
+
+v_lshrrev_b32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x20]
+
+v_lshrrev_b32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x20]
+
+v_lshrrev_b32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x20]
+
+v_lshrrev_b32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x20]
+
+v_lshrrev_b32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x20]
+
+v_lshrrev_b32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x20]
+
+v_lshrrev_b32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x20]
+
+v_lshrrev_b32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x20]
+
+v_lshrrev_b32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x20]
+
+v_lshrrev_b32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x20]
+
+v_lshrrev_b32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x20]
+
+v_lshrrev_b32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x20]
+
+v_lshrrev_b32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x20]
+
+v_lshrrev_b32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x20]
+
+v_lshrrev_b32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x20,0x56,0x34,0x12,0xaf]
+
+v_lshrrev_b32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x20,0x73,0x72,0x71,0x3f]
+
+v_lshrrev_b32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x20]
+
+v_lshrrev_b32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x20]
+
+v_lshrrev_b32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x20]
+
+v_lshrrev_b32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0x04,0x00,0x00]
+
+v_lshrrev_b32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x10,0xd1,0x80,0x04,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x10,0xd1,0xc1,0x04,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x10,0xd1,0xf0,0x04,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x10,0xd1,0xf7,0x04,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x10,0xd1,0x01,0x05,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x10,0xd1,0xff,0x05,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0xca,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0xcc,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0xce,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0xd4,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0xd6,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0xd8,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0xda,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0xdc,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0xde,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0xf6,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0xf8,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0xfc,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0xfe,0x00,0x00]
+
+v_lshrrev_b32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0x00,0x01,0x00]
+
+v_lshrrev_b32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0x82,0x01,0x00]
+
+v_lshrrev_b32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0xe0,0x01,0x00]
+
+v_lshrrev_b32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0xee,0x01,0x00]
+
+v_lshrrev_b32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0x04,0x02,0x00]
+
+v_lshrrev_b32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x10,0xd1,0x80,0xfe,0x03,0x00]
+
+v_ashrrev_i32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x22]
+
+v_ashrrev_i32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x23]
+
+v_ashrrev_i32 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x22]
+
+v_ashrrev_i32 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x22]
+
+v_ashrrev_i32 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x22]
+
+v_ashrrev_i32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x22]
+
+v_ashrrev_i32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x22]
+
+v_ashrrev_i32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x22]
+
+v_ashrrev_i32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x22]
+
+v_ashrrev_i32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x22]
+
+v_ashrrev_i32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x22]
+
+v_ashrrev_i32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x22]
+
+v_ashrrev_i32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x22]
+
+v_ashrrev_i32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x22]
+
+v_ashrrev_i32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x22]
+
+v_ashrrev_i32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x22]
+
+v_ashrrev_i32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x22]
+
+v_ashrrev_i32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x22]
+
+v_ashrrev_i32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x22]
+
+v_ashrrev_i32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x22,0x56,0x34,0x12,0xaf]
+
+v_ashrrev_i32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x22,0x73,0x72,0x71,0x3f]
+
+v_ashrrev_i32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x22]
+
+v_ashrrev_i32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x22]
+
+v_ashrrev_i32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x22]
+
+v_ashrrev_i32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0x04,0x00,0x00]
+
+v_ashrrev_i32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x11,0xd1,0x80,0x04,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x11,0xd1,0xc1,0x04,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x11,0xd1,0xf0,0x04,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x11,0xd1,0xf7,0x04,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x11,0xd1,0x01,0x05,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x11,0xd1,0xff,0x05,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0xca,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0xcc,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0xce,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0xd4,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0xd6,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0xd8,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0xda,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0xdc,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0xde,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0xf6,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0xf8,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0xfc,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0xfe,0x00,0x00]
+
+v_ashrrev_i32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0x00,0x01,0x00]
+
+v_ashrrev_i32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0x82,0x01,0x00]
+
+v_ashrrev_i32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0xe0,0x01,0x00]
+
+v_ashrrev_i32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0xee,0x01,0x00]
+
+v_ashrrev_i32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0x04,0x02,0x00]
+
+v_ashrrev_i32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x11,0xd1,0x80,0xfe,0x03,0x00]
+
+v_lshlrev_b32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x24]
+
+v_lshlrev_b32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x25]
+
+v_lshlrev_b32 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x24]
+
+v_lshlrev_b32 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x24]
+
+v_lshlrev_b32 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x24]
+
+v_lshlrev_b32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x24]
+
+v_lshlrev_b32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x24]
+
+v_lshlrev_b32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x24]
+
+v_lshlrev_b32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x24]
+
+v_lshlrev_b32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x24]
+
+v_lshlrev_b32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x24]
+
+v_lshlrev_b32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x24]
+
+v_lshlrev_b32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x24]
+
+v_lshlrev_b32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x24]
+
+v_lshlrev_b32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x24]
+
+v_lshlrev_b32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x24]
+
+v_lshlrev_b32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x24]
+
+v_lshlrev_b32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x24]
+
+v_lshlrev_b32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x24]
+
+v_lshlrev_b32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x24,0x56,0x34,0x12,0xaf]
+
+v_lshlrev_b32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x24,0x73,0x72,0x71,0x3f]
+
+v_lshlrev_b32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x24]
+
+v_lshlrev_b32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x24]
+
+v_lshlrev_b32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x24]
+
+v_lshlrev_b32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0x04,0x00,0x00]
+
+v_lshlrev_b32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x12,0xd1,0x80,0x04,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x12,0xd1,0xc1,0x04,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x12,0xd1,0xf0,0x04,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x12,0xd1,0xf7,0x04,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x12,0xd1,0x01,0x05,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x12,0xd1,0xff,0x05,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0xca,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0xcc,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0xce,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0xd4,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0xd6,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0xd8,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0xda,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0xdc,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0xde,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0xf6,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0xf8,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0xfc,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0xfe,0x00,0x00]
+
+v_lshlrev_b32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0x00,0x01,0x00]
+
+v_lshlrev_b32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0x82,0x01,0x00]
+
+v_lshlrev_b32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0xe0,0x01,0x00]
+
+v_lshlrev_b32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0xee,0x01,0x00]
+
+v_lshlrev_b32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0x04,0x02,0x00]
+
+v_lshlrev_b32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x12,0xd1,0x80,0xfe,0x03,0x00]
+
+v_and_b32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x26]
+
+v_and_b32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x27]
+
+v_and_b32 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x26]
+
+v_and_b32 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x26]
+
+v_and_b32 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x26]
+
+v_and_b32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x26]
+
+v_and_b32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x26]
+
+v_and_b32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x26]
+
+v_and_b32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x26]
+
+v_and_b32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x26]
+
+v_and_b32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x26]
+
+v_and_b32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x26]
+
+v_and_b32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x26]
+
+v_and_b32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x26]
+
+v_and_b32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x26]
+
+v_and_b32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x26]
+
+v_and_b32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x26]
+
+v_and_b32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x26]
+
+v_and_b32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x26]
+
+v_and_b32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x26,0x56,0x34,0x12,0xaf]
+
+v_and_b32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x26,0x73,0x72,0x71,0x3f]
+
+v_and_b32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x26]
+
+v_and_b32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x26]
+
+v_and_b32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x26]
+
+v_and_b32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0x04,0x00,0x00]
+
+v_and_b32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x13,0xd1,0x80,0x04,0x00,0x00]
+
+v_and_b32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x13,0xd1,0xc1,0x04,0x00,0x00]
+
+v_and_b32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x13,0xd1,0xf0,0x04,0x00,0x00]
+
+v_and_b32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x13,0xd1,0xf7,0x04,0x00,0x00]
+
+v_and_b32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x13,0xd1,0x01,0x05,0x00,0x00]
+
+v_and_b32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x13,0xd1,0xff,0x05,0x00,0x00]
+
+v_and_b32_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0xca,0x00,0x00]
+
+v_and_b32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0xcc,0x00,0x00]
+
+v_and_b32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0xce,0x00,0x00]
+
+v_and_b32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0xd4,0x00,0x00]
+
+v_and_b32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0xd6,0x00,0x00]
+
+v_and_b32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0xd8,0x00,0x00]
+
+v_and_b32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0xda,0x00,0x00]
+
+v_and_b32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0xdc,0x00,0x00]
+
+v_and_b32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0xde,0x00,0x00]
+
+v_and_b32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0xf6,0x00,0x00]
+
+v_and_b32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0xf8,0x00,0x00]
+
+v_and_b32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0xfc,0x00,0x00]
+
+v_and_b32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0xfe,0x00,0x00]
+
+v_and_b32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0x00,0x01,0x00]
+
+v_and_b32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0x82,0x01,0x00]
+
+v_and_b32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0xe0,0x01,0x00]
+
+v_and_b32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0xee,0x01,0x00]
+
+v_and_b32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0x04,0x02,0x00]
+
+v_and_b32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x13,0xd1,0x80,0xfe,0x03,0x00]
+
+v_or_b32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x28]
+
+v_or_b32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x29]
+
+v_or_b32 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x28]
+
+v_or_b32 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x28]
+
+v_or_b32 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x28]
+
+v_or_b32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x28]
+
+v_or_b32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x28]
+
+v_or_b32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x28]
+
+v_or_b32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x28]
+
+v_or_b32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x28]
+
+v_or_b32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x28]
+
+v_or_b32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x28]
+
+v_or_b32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x28]
+
+v_or_b32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x28]
+
+v_or_b32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x28]
+
+v_or_b32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x28]
+
+v_or_b32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x28]
+
+v_or_b32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x28]
+
+v_or_b32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x28]
+
+v_or_b32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x28,0x56,0x34,0x12,0xaf]
+
+v_or_b32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x28,0x73,0x72,0x71,0x3f]
+
+v_or_b32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x28]
+
+v_or_b32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x28]
+
+v_or_b32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x28]
+
+v_or_b32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0x04,0x00,0x00]
+
+v_or_b32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x14,0xd1,0x80,0x04,0x00,0x00]
+
+v_or_b32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x14,0xd1,0xc1,0x04,0x00,0x00]
+
+v_or_b32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x14,0xd1,0xf0,0x04,0x00,0x00]
+
+v_or_b32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x14,0xd1,0xf7,0x04,0x00,0x00]
+
+v_or_b32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x14,0xd1,0x01,0x05,0x00,0x00]
+
+v_or_b32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x14,0xd1,0xff,0x05,0x00,0x00]
+
+v_or_b32_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0xca,0x00,0x00]
+
+v_or_b32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0xcc,0x00,0x00]
+
+v_or_b32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0xce,0x00,0x00]
+
+v_or_b32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0xd4,0x00,0x00]
+
+v_or_b32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0xd6,0x00,0x00]
+
+v_or_b32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0xd8,0x00,0x00]
+
+v_or_b32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0xda,0x00,0x00]
+
+v_or_b32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0xdc,0x00,0x00]
+
+v_or_b32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0xde,0x00,0x00]
+
+v_or_b32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0xf6,0x00,0x00]
+
+v_or_b32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0xf8,0x00,0x00]
+
+v_or_b32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0xfc,0x00,0x00]
+
+v_or_b32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0xfe,0x00,0x00]
+
+v_or_b32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0x00,0x01,0x00]
+
+v_or_b32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0x82,0x01,0x00]
+
+v_or_b32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0xe0,0x01,0x00]
+
+v_or_b32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0xee,0x01,0x00]
+
+v_or_b32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0x04,0x02,0x00]
+
+v_or_b32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x14,0xd1,0x80,0xfe,0x03,0x00]
+
+v_xor_b32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x2a]
+
+v_xor_b32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x2b]
+
+v_xor_b32 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x2a]
+
+v_xor_b32 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x2a]
+
+v_xor_b32 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x2a]
+
+v_xor_b32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x2a]
+
+v_xor_b32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x2a]
+
+v_xor_b32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x2a]
+
+v_xor_b32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x2a]
+
+v_xor_b32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x2a]
+
+v_xor_b32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x2a]
+
+v_xor_b32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x2a]
+
+v_xor_b32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x2a]
+
+v_xor_b32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x2a]
+
+v_xor_b32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x2a]
+
+v_xor_b32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x2a]
+
+v_xor_b32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x2a]
+
+v_xor_b32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x2a]
+
+v_xor_b32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x2a]
+
+v_xor_b32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x2a,0x56,0x34,0x12,0xaf]
+
+v_xor_b32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x2a,0x73,0x72,0x71,0x3f]
+
+v_xor_b32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x2a]
+
+v_xor_b32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x2a]
+
+v_xor_b32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x2a]
+
+v_xor_b32_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0x04,0x00,0x00]
+
+v_xor_b32_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x15,0xd1,0x80,0x04,0x00,0x00]
+
+v_xor_b32_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x15,0xd1,0xc1,0x04,0x00,0x00]
+
+v_xor_b32_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x15,0xd1,0xf0,0x04,0x00,0x00]
+
+v_xor_b32_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x15,0xd1,0xf7,0x04,0x00,0x00]
+
+v_xor_b32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x15,0xd1,0x01,0x05,0x00,0x00]
+
+v_xor_b32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x15,0xd1,0xff,0x05,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0xca,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0xcc,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0xce,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0xd4,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0xd6,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0xd8,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0xda,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0xdc,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0xde,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0xf6,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0xf8,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0xfc,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0xfe,0x00,0x00]
+
+v_xor_b32_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0x00,0x01,0x00]
+
+v_xor_b32_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0x82,0x01,0x00]
+
+v_xor_b32_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0xe0,0x01,0x00]
+
+v_xor_b32_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0xee,0x01,0x00]
+
+v_xor_b32_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0x04,0x02,0x00]
+
+v_xor_b32_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x15,0xd1,0x80,0xfe,0x03,0x00]
+
+v_mac_f32 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x2c]
+
+v_mac_f32 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x2d]
+
+v_mac_f32 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x2c]
+
+v_mac_f32 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x2c]
+
+v_mac_f32 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x2c]
+
+v_mac_f32 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x2c]
+
+v_mac_f32 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x2c]
+
+v_mac_f32 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x2c]
+
+v_mac_f32 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x2c]
+
+v_mac_f32 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x2c]
+
+v_mac_f32 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x2c]
+
+v_mac_f32 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x2c]
+
+v_mac_f32 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x2c]
+
+v_mac_f32 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x2c]
+
+v_mac_f32 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x2c]
+
+v_mac_f32 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x2c]
+
+v_mac_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x2c]
+
+v_mac_f32 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x2c]
+
+v_mac_f32 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x2c]
+
+v_mac_f32 v5, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x2c,0x56,0x34,0x12,0xaf]
+
+v_mac_f32 v5, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x2c,0x73,0x72,0x71,0x3f]
+
+v_mac_f32 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x2c]
+
+v_mac_f32 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x2c]
+
+v_mac_f32 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x2c]
+
+v_mac_f32_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0x05,0x00,0x00]
+
+v_mac_f32_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x16,0xd1,0x01,0x05,0x00,0x00]
+
+v_mac_f32_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x16,0xd1,0xff,0x05,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, s101
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0xcb,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0xcd,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0xcf,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0xd5,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0xd7,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0xd9,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0xdb,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0xdd,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0xdf,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0xf7,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0xf9,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0xfd,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0xff,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0xfb,0x01,0x00]
+
+v_mac_f32_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0x05,0x02,0x00]
+
+v_mac_f32_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0xff,0x03,0x00]
+
+v_mac_f32_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0x05,0x00,0x20]
+
+v_mac_f32_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0x05,0x00,0x40]
+
+v_mac_f32_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0x05,0x00,0x60]
+
+v_mac_f32_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x16,0xd1,0x01,0x05,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x16,0xd1,0x01,0x05,0x00,0x00]
+
+v_mac_f32_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x16,0xd1,0x01,0x05,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x16,0xd1,0x01,0x05,0x00,0x00]
+
+v_mac_f32_e64 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0x05,0x00,0x08]
+
+v_mac_f32_e64 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0x05,0x00,0x10]
+
+v_mac_f32_e64 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x16,0xd1,0x01,0x05,0x00,0x18]
+
+v_madmk_f32 v5, 0, 0x11213141, v3
+// CHECK: [0x80,0x06,0x0a,0x2e,0x41,0x31,0x21,0x11]
+
+v_madmk_f32 v255, 0, 0x11213141, v3
+// CHECK: [0x80,0x06,0xfe,0x2f,0x41,0x31,0x21,0x11]
+
+v_madmk_f32 v5, -1, 0x11213141, v3
+// CHECK: [0xc1,0x06,0x0a,0x2e,0x41,0x31,0x21,0x11]
+
+v_madmk_f32 v5, 0.5, 0x11213141, v3
+// CHECK: [0xf0,0x06,0x0a,0x2e,0x41,0x31,0x21,0x11]
+
+v_madmk_f32 v5, -4.0, 0x11213141, v3
+// CHECK: [0xf7,0x06,0x0a,0x2e,0x41,0x31,0x21,0x11]
+
+v_madmk_f32 v5, v1, 0x11213141, v3
+// CHECK: [0x01,0x07,0x0a,0x2e,0x41,0x31,0x21,0x11]
+
+v_madmk_f32 v5, v255, 0x11213141, v3
+// CHECK: [0xff,0x07,0x0a,0x2e,0x41,0x31,0x21,0x11]
+
+v_madmk_f32 v5, 0, 0xa1b1c1d1, v3
+// CHECK: [0x80,0x06,0x0a,0x2e,0xd1,0xc1,0xb1,0xa1]
+
+v_madmk_f32 v5, 0, 0x11213141, v255
+// CHECK: [0x80,0xfe,0x0b,0x2e,0x41,0x31,0x21,0x11]
+
+v_madak_f32 v5, 0, v2, 0x11213141
+// CHECK: [0x80,0x04,0x0a,0x30,0x41,0x31,0x21,0x11]
+
+v_madak_f32 v255, 0, v2, 0x11213141
+// CHECK: [0x80,0x04,0xfe,0x31,0x41,0x31,0x21,0x11]
+
+v_madak_f32 v5, -1, v2, 0x11213141
+// CHECK: [0xc1,0x04,0x0a,0x30,0x41,0x31,0x21,0x11]
+
+v_madak_f32 v5, 0.5, v2, 0x11213141
+// CHECK: [0xf0,0x04,0x0a,0x30,0x41,0x31,0x21,0x11]
+
+v_madak_f32 v5, -4.0, v2, 0x11213141
+// CHECK: [0xf7,0x04,0x0a,0x30,0x41,0x31,0x21,0x11]
+
+v_madak_f32 v5, v1, v2, 0x11213141
+// CHECK: [0x01,0x05,0x0a,0x30,0x41,0x31,0x21,0x11]
+
+v_madak_f32 v5, v255, v2, 0x11213141
+// CHECK: [0xff,0x05,0x0a,0x30,0x41,0x31,0x21,0x11]
+
+v_madak_f32 v5, 0, v255, 0x11213141
+// CHECK: [0x80,0xfe,0x0b,0x30,0x41,0x31,0x21,0x11]
+
+v_madak_f32 v5, 0, v2, 0xa1b1c1d1
+// CHECK: [0x80,0x04,0x0a,0x30,0xd1,0xc1,0xb1,0xa1]
+
+v_add_u32 v5, vcc, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x32]
+
+v_add_u32 v255, vcc, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x33]
+
+v_add_u32 v5, vcc, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x32]
+
+v_add_u32 v5, vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x32]
+
+v_add_u32 v5, vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x32]
+
+v_add_u32 v5, vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x32]
+
+v_add_u32 v5, vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x32]
+
+v_add_u32 v5, vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x32]
+
+v_add_u32 v5, vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x32]
+
+v_add_u32 v5, vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x32]
+
+v_add_u32 v5, vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x32]
+
+v_add_u32 v5, vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x32]
+
+v_add_u32 v5, vcc, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x32]
+
+v_add_u32 v5, vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x32]
+
+v_add_u32 v5, vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x32]
+
+v_add_u32 v5, vcc, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x32]
+
+v_add_u32 v5, vcc, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x32]
+
+v_add_u32 v5, vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x32]
+
+v_add_u32 v5, vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x32]
+
+v_add_u32 v5, vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x32,0x56,0x34,0x12,0xaf]
+
+v_add_u32 v5, vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x32,0x73,0x72,0x71,0x3f]
+
+v_add_u32 v5, vcc, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x32]
+
+v_add_u32 v5, vcc, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x32]
+
+v_add_u32 v5, vcc, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x32]
+
+v_add_u32_e64 v5, s[12:13], 0, s2
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0x04,0x00,0x00]
+
+v_add_u32_e64 v255, s[12:13], 0, s2
+// CHECK: [0xff,0x0c,0x19,0xd1,0x80,0x04,0x00,0x00]
+
+v_add_u32_e64 v5, s[14:15], 0, s2
+// CHECK: [0x05,0x0e,0x19,0xd1,0x80,0x04,0x00,0x00]
+
+v_add_u32_e64 v5, s[100:101], 0, s2
+// CHECK: [0x05,0x64,0x19,0xd1,0x80,0x04,0x00,0x00]
+
+v_add_u32_e64 v5, flat_scratch, 0, s2
+// CHECK: [0x05,0x66,0x19,0xd1,0x80,0x04,0x00,0x00]
+
+v_add_u32_e64 v5, vcc, 0, s2
+// CHECK: [0x05,0x6a,0x19,0xd1,0x80,0x04,0x00,0x00]
+
+v_add_u32_e64 v5, tba, 0, s2
+// CHECK: [0x05,0x6c,0x19,0xd1,0x80,0x04,0x00,0x00]
+
+v_add_u32_e64 v5, tma, 0, s2
+// CHECK: [0x05,0x6e,0x19,0xd1,0x80,0x04,0x00,0x00]
+
+v_add_u32_e64 v5, ttmp[10:11], 0, s2
+// CHECK: [0x05,0x7a,0x19,0xd1,0x80,0x04,0x00,0x00]
+
+v_add_u32_e64 v5, s[12:13], -1, s2
+// CHECK: [0x05,0x0c,0x19,0xd1,0xc1,0x04,0x00,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0.5, s2
+// CHECK: [0x05,0x0c,0x19,0xd1,0xf0,0x04,0x00,0x00]
+
+v_add_u32_e64 v5, s[12:13], -4.0, s2
+// CHECK: [0x05,0x0c,0x19,0xd1,0xf7,0x04,0x00,0x00]
+
+v_add_u32_e64 v5, s[12:13], v1, s2
+// CHECK: [0x05,0x0c,0x19,0xd1,0x01,0x05,0x00,0x00]
+
+v_add_u32_e64 v5, s[12:13], v255, s2
+// CHECK: [0x05,0x0c,0x19,0xd1,0xff,0x05,0x00,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0, s101
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0xca,0x00,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0, flat_scratch_lo
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0xcc,0x00,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0, flat_scratch_hi
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0xce,0x00,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0, vcc_lo
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0xd4,0x00,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0, vcc_hi
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0xd6,0x00,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0, tba_lo
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0xd8,0x00,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0, tba_hi
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0xda,0x00,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0, tma_lo
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0xdc,0x00,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0, tma_hi
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0xde,0x00,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0, ttmp11
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0xf6,0x00,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0, m0
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0xf8,0x00,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0, exec_lo
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0xfc,0x00,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0, exec_hi
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0xfe,0x00,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0, 0
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0x00,0x01,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0, -1
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0x82,0x01,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0, 0.5
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0xe0,0x01,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0, -4.0
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0xee,0x01,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0, v2
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0x04,0x02,0x00]
+
+v_add_u32_e64 v5, s[12:13], 0, v255
+// CHECK: [0x05,0x0c,0x19,0xd1,0x80,0xfe,0x03,0x00]
+
+v_sub_u32 v5, vcc, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x34]
+
+v_sub_u32 v255, vcc, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x35]
+
+v_sub_u32 v5, vcc, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x34]
+
+v_sub_u32 v5, vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x34]
+
+v_sub_u32 v5, vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x34]
+
+v_sub_u32 v5, vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x34]
+
+v_sub_u32 v5, vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x34]
+
+v_sub_u32 v5, vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x34]
+
+v_sub_u32 v5, vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x34]
+
+v_sub_u32 v5, vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x34]
+
+v_sub_u32 v5, vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x34]
+
+v_sub_u32 v5, vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x34]
+
+v_sub_u32 v5, vcc, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x34]
+
+v_sub_u32 v5, vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x34]
+
+v_sub_u32 v5, vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x34]
+
+v_sub_u32 v5, vcc, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x34]
+
+v_sub_u32 v5, vcc, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x34]
+
+v_sub_u32 v5, vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x34]
+
+v_sub_u32 v5, vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x34]
+
+v_sub_u32 v5, vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x34,0x56,0x34,0x12,0xaf]
+
+v_sub_u32 v5, vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x34,0x73,0x72,0x71,0x3f]
+
+v_sub_u32 v5, vcc, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x34]
+
+v_sub_u32 v5, vcc, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x34]
+
+v_sub_u32 v5, vcc, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x34]
+
+v_sub_u32_e64 v5, s[12:13], 0, s2
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0x04,0x00,0x00]
+
+v_sub_u32_e64 v255, s[12:13], 0, s2
+// CHECK: [0xff,0x0c,0x1a,0xd1,0x80,0x04,0x00,0x00]
+
+v_sub_u32_e64 v5, s[14:15], 0, s2
+// CHECK: [0x05,0x0e,0x1a,0xd1,0x80,0x04,0x00,0x00]
+
+v_sub_u32_e64 v5, s[100:101], 0, s2
+// CHECK: [0x05,0x64,0x1a,0xd1,0x80,0x04,0x00,0x00]
+
+v_sub_u32_e64 v5, flat_scratch, 0, s2
+// CHECK: [0x05,0x66,0x1a,0xd1,0x80,0x04,0x00,0x00]
+
+v_sub_u32_e64 v5, vcc, 0, s2
+// CHECK: [0x05,0x6a,0x1a,0xd1,0x80,0x04,0x00,0x00]
+
+v_sub_u32_e64 v5, tba, 0, s2
+// CHECK: [0x05,0x6c,0x1a,0xd1,0x80,0x04,0x00,0x00]
+
+v_sub_u32_e64 v5, tma, 0, s2
+// CHECK: [0x05,0x6e,0x1a,0xd1,0x80,0x04,0x00,0x00]
+
+v_sub_u32_e64 v5, ttmp[10:11], 0, s2
+// CHECK: [0x05,0x7a,0x1a,0xd1,0x80,0x04,0x00,0x00]
+
+v_sub_u32_e64 v5, s[12:13], -1, s2
+// CHECK: [0x05,0x0c,0x1a,0xd1,0xc1,0x04,0x00,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0.5, s2
+// CHECK: [0x05,0x0c,0x1a,0xd1,0xf0,0x04,0x00,0x00]
+
+v_sub_u32_e64 v5, s[12:13], -4.0, s2
+// CHECK: [0x05,0x0c,0x1a,0xd1,0xf7,0x04,0x00,0x00]
+
+v_sub_u32_e64 v5, s[12:13], v1, s2
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x01,0x05,0x00,0x00]
+
+v_sub_u32_e64 v5, s[12:13], v255, s2
+// CHECK: [0x05,0x0c,0x1a,0xd1,0xff,0x05,0x00,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0, s101
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0xca,0x00,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0, flat_scratch_lo
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0xcc,0x00,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0, flat_scratch_hi
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0xce,0x00,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0, vcc_lo
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0xd4,0x00,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0, vcc_hi
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0xd6,0x00,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0, tba_lo
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0xd8,0x00,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0, tba_hi
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0xda,0x00,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0, tma_lo
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0xdc,0x00,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0, tma_hi
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0xde,0x00,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0, ttmp11
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0xf6,0x00,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0, m0
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0xf8,0x00,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0, exec_lo
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0xfc,0x00,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0, exec_hi
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0xfe,0x00,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0, 0
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0x00,0x01,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0, -1
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0x82,0x01,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0, 0.5
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0xe0,0x01,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0, -4.0
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0xee,0x01,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0, v2
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0x04,0x02,0x00]
+
+v_sub_u32_e64 v5, s[12:13], 0, v255
+// CHECK: [0x05,0x0c,0x1a,0xd1,0x80,0xfe,0x03,0x00]
+
+v_subrev_u32 v5, vcc, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x36]
+
+v_subrev_u32 v255, vcc, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x37]
+
+v_subrev_u32 v5, vcc, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x36]
+
+v_subrev_u32 v5, vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x36]
+
+v_subrev_u32 v5, vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x36]
+
+v_subrev_u32 v5, vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x36]
+
+v_subrev_u32 v5, vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x36]
+
+v_subrev_u32 v5, vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x36]
+
+v_subrev_u32 v5, vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x36]
+
+v_subrev_u32 v5, vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x36]
+
+v_subrev_u32 v5, vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x36]
+
+v_subrev_u32 v5, vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x36]
+
+v_subrev_u32 v5, vcc, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x36]
+
+v_subrev_u32 v5, vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x36]
+
+v_subrev_u32 v5, vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x36]
+
+v_subrev_u32 v5, vcc, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x36]
+
+v_subrev_u32 v5, vcc, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x36]
+
+v_subrev_u32 v5, vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x36]
+
+v_subrev_u32 v5, vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x36]
+
+v_subrev_u32 v5, vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x0a,0x36,0x56,0x34,0x12,0xaf]
+
+v_subrev_u32 v5, vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x0a,0x36,0x73,0x72,0x71,0x3f]
+
+v_subrev_u32 v5, vcc, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x36]
+
+v_subrev_u32 v5, vcc, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x36]
+
+v_subrev_u32 v5, vcc, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x36]
+
+v_subrev_u32_e64 v5, s[12:13], 0, s2
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0x04,0x00,0x00]
+
+v_subrev_u32_e64 v255, s[12:13], 0, s2
+// CHECK: [0xff,0x0c,0x1b,0xd1,0x80,0x04,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[14:15], 0, s2
+// CHECK: [0x05,0x0e,0x1b,0xd1,0x80,0x04,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[100:101], 0, s2
+// CHECK: [0x05,0x64,0x1b,0xd1,0x80,0x04,0x00,0x00]
+
+v_subrev_u32_e64 v5, flat_scratch, 0, s2
+// CHECK: [0x05,0x66,0x1b,0xd1,0x80,0x04,0x00,0x00]
+
+v_subrev_u32_e64 v5, vcc, 0, s2
+// CHECK: [0x05,0x6a,0x1b,0xd1,0x80,0x04,0x00,0x00]
+
+v_subrev_u32_e64 v5, tba, 0, s2
+// CHECK: [0x05,0x6c,0x1b,0xd1,0x80,0x04,0x00,0x00]
+
+v_subrev_u32_e64 v5, tma, 0, s2
+// CHECK: [0x05,0x6e,0x1b,0xd1,0x80,0x04,0x00,0x00]
+
+v_subrev_u32_e64 v5, ttmp[10:11], 0, s2
+// CHECK: [0x05,0x7a,0x1b,0xd1,0x80,0x04,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], -1, s2
+// CHECK: [0x05,0x0c,0x1b,0xd1,0xc1,0x04,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0.5, s2
+// CHECK: [0x05,0x0c,0x1b,0xd1,0xf0,0x04,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], -4.0, s2
+// CHECK: [0x05,0x0c,0x1b,0xd1,0xf7,0x04,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], v1, s2
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x01,0x05,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], v255, s2
+// CHECK: [0x05,0x0c,0x1b,0xd1,0xff,0x05,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0, s101
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0xca,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0, flat_scratch_lo
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0xcc,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0, flat_scratch_hi
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0xce,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0, vcc_lo
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0xd4,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0, vcc_hi
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0xd6,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0, tba_lo
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0xd8,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0, tba_hi
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0xda,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0, tma_lo
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0xdc,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0, tma_hi
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0xde,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0, ttmp11
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0xf6,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0, m0
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0xf8,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0, exec_lo
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0xfc,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0, exec_hi
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0xfe,0x00,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0, 0
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0x00,0x01,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0, -1
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0x82,0x01,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0, 0.5
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0xe0,0x01,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0, -4.0
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0xee,0x01,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0, v2
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0x04,0x02,0x00]
+
+v_subrev_u32_e64 v5, s[12:13], 0, v255
+// CHECK: [0x05,0x0c,0x1b,0xd1,0x80,0xfe,0x03,0x00]
+
+v_addc_u32 v5, vcc, 0, v2, vcc
+// CHECK: [0x80,0x04,0x0a,0x38]
+
+v_addc_u32 v255, vcc, 0, v2, vcc
+// CHECK: [0x80,0x04,0xfe,0x39]
+
+v_addc_u32 v5, vcc, -1, v2, vcc
+// CHECK: [0xc1,0x04,0x0a,0x38]
+
+v_addc_u32 v5, vcc, 0.5, v2, vcc
+// CHECK: [0xf0,0x04,0x0a,0x38]
+
+v_addc_u32 v5, vcc, -4.0, v2, vcc
+// CHECK: [0xf7,0x04,0x0a,0x38]
+
+v_addc_u32 v5, vcc, v1, v2, vcc
+// CHECK: [0x01,0x05,0x0a,0x38]
+
+v_addc_u32 v5, vcc, v255, v2, vcc
+// CHECK: [0xff,0x05,0x0a,0x38]
+
+v_addc_u32 v5, vcc, 0, v255, vcc
+// CHECK: [0x80,0xfe,0x0b,0x38]
+
+v_addc_u32_e64 v5, s[12:13], 0, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x1c,0xd1,0x80,0x00,0x19,0x00]
+
+v_addc_u32_e64 v255, s[12:13], 0, 0, s[6:7]
+// CHECK: [0xff,0x0c,0x1c,0xd1,0x80,0x00,0x19,0x00]
+
+v_addc_u32_e64 v5, s[14:15], 0, 0, s[6:7]
+// CHECK: [0x05,0x0e,0x1c,0xd1,0x80,0x00,0x19,0x00]
+
+v_addc_u32_e64 v5, s[100:101], 0, 0, s[6:7]
+// CHECK: [0x05,0x64,0x1c,0xd1,0x80,0x00,0x19,0x00]
+
+v_addc_u32_e64 v5, flat_scratch, 0, 0, s[6:7]
+// CHECK: [0x05,0x66,0x1c,0xd1,0x80,0x00,0x19,0x00]
+
+v_addc_u32_e64 v5, vcc, 0, 0, s[6:7]
+// CHECK: [0x05,0x6a,0x1c,0xd1,0x80,0x00,0x19,0x00]
+
+v_addc_u32_e64 v5, tba, 0, 0, s[6:7]
+// CHECK: [0x05,0x6c,0x1c,0xd1,0x80,0x00,0x19,0x00]
+
+v_addc_u32_e64 v5, tma, 0, 0, s[6:7]
+// CHECK: [0x05,0x6e,0x1c,0xd1,0x80,0x00,0x19,0x00]
+
+v_addc_u32_e64 v5, ttmp[10:11], 0, 0, s[6:7]
+// CHECK: [0x05,0x7a,0x1c,0xd1,0x80,0x00,0x19,0x00]
+
+v_addc_u32_e64 v5, s[12:13], -1, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x1c,0xd1,0xc1,0x00,0x19,0x00]
+
+v_addc_u32_e64 v5, s[12:13], 0.5, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x1c,0xd1,0xf0,0x00,0x19,0x00]
+
+v_addc_u32_e64 v5, s[12:13], -4.0, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x1c,0xd1,0xf7,0x00,0x19,0x00]
+
+v_addc_u32_e64 v5, s[12:13], v1, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x1c,0xd1,0x01,0x01,0x19,0x00]
+
+v_addc_u32_e64 v5, s[12:13], v255, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x1c,0xd1,0xff,0x01,0x19,0x00]
+
+v_addc_u32_e64 v5, s[12:13], 0, -1, s[6:7]
+// CHECK: [0x05,0x0c,0x1c,0xd1,0x80,0x82,0x19,0x00]
+
+v_addc_u32_e64 v5, s[12:13], 0, 0.5, s[6:7]
+// CHECK: [0x05,0x0c,0x1c,0xd1,0x80,0xe0,0x19,0x00]
+
+v_addc_u32_e64 v5, s[12:13], 0, -4.0, s[6:7]
+// CHECK: [0x05,0x0c,0x1c,0xd1,0x80,0xee,0x19,0x00]
+
+v_addc_u32_e64 v5, s[12:13], 0, v2, s[6:7]
+// CHECK: [0x05,0x0c,0x1c,0xd1,0x80,0x04,0x1a,0x00]
+
+v_addc_u32_e64 v5, s[12:13], 0, v255, s[6:7]
+// CHECK: [0x05,0x0c,0x1c,0xd1,0x80,0xfe,0x1b,0x00]
+
+v_addc_u32_e64 v5, s[12:13], 0, 0, s[8:9]
+// CHECK: [0x05,0x0c,0x1c,0xd1,0x80,0x00,0x21,0x00]
+
+v_addc_u32_e64 v5, s[12:13], 0, 0, s[100:101]
+// CHECK: [0x05,0x0c,0x1c,0xd1,0x80,0x00,0x91,0x01]
+
+v_addc_u32_e64 v5, s[12:13], 0, 0, flat_scratch
+// CHECK: [0x05,0x0c,0x1c,0xd1,0x80,0x00,0x99,0x01]
+
+v_addc_u32_e64 v5, s[12:13], 0, 0, vcc
+// CHECK: [0x05,0x0c,0x1c,0xd1,0x80,0x00,0xa9,0x01]
+
+v_addc_u32_e64 v5, s[12:13], 0, 0, tba
+// CHECK: [0x05,0x0c,0x1c,0xd1,0x80,0x00,0xb1,0x01]
+
+v_addc_u32_e64 v5, s[12:13], 0, 0, tma
+// CHECK: [0x05,0x0c,0x1c,0xd1,0x80,0x00,0xb9,0x01]
+
+v_addc_u32_e64 v5, s[12:13], 0, 0, ttmp[10:11]
+// CHECK: [0x05,0x0c,0x1c,0xd1,0x80,0x00,0xe9,0x01]
+
+v_subb_u32 v5, vcc, 0, v2, vcc
+// CHECK: [0x80,0x04,0x0a,0x3a]
+
+v_subb_u32 v255, vcc, 0, v2, vcc
+// CHECK: [0x80,0x04,0xfe,0x3b]
+
+v_subb_u32 v5, vcc, -1, v2, vcc
+// CHECK: [0xc1,0x04,0x0a,0x3a]
+
+v_subb_u32 v5, vcc, 0.5, v2, vcc
+// CHECK: [0xf0,0x04,0x0a,0x3a]
+
+v_subb_u32 v5, vcc, -4.0, v2, vcc
+// CHECK: [0xf7,0x04,0x0a,0x3a]
+
+v_subb_u32 v5, vcc, v1, v2, vcc
+// CHECK: [0x01,0x05,0x0a,0x3a]
+
+v_subb_u32 v5, vcc, v255, v2, vcc
+// CHECK: [0xff,0x05,0x0a,0x3a]
+
+v_subb_u32 v5, vcc, 0, v255, vcc
+// CHECK: [0x80,0xfe,0x0b,0x3a]
+
+v_subb_u32_e64 v5, s[12:13], 0, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x1d,0xd1,0x80,0x00,0x19,0x00]
+
+v_subb_u32_e64 v255, s[12:13], 0, 0, s[6:7]
+// CHECK: [0xff,0x0c,0x1d,0xd1,0x80,0x00,0x19,0x00]
+
+v_subb_u32_e64 v5, s[14:15], 0, 0, s[6:7]
+// CHECK: [0x05,0x0e,0x1d,0xd1,0x80,0x00,0x19,0x00]
+
+v_subb_u32_e64 v5, s[100:101], 0, 0, s[6:7]
+// CHECK: [0x05,0x64,0x1d,0xd1,0x80,0x00,0x19,0x00]
+
+v_subb_u32_e64 v5, flat_scratch, 0, 0, s[6:7]
+// CHECK: [0x05,0x66,0x1d,0xd1,0x80,0x00,0x19,0x00]
+
+v_subb_u32_e64 v5, vcc, 0, 0, s[6:7]
+// CHECK: [0x05,0x6a,0x1d,0xd1,0x80,0x00,0x19,0x00]
+
+v_subb_u32_e64 v5, tba, 0, 0, s[6:7]
+// CHECK: [0x05,0x6c,0x1d,0xd1,0x80,0x00,0x19,0x00]
+
+v_subb_u32_e64 v5, tma, 0, 0, s[6:7]
+// CHECK: [0x05,0x6e,0x1d,0xd1,0x80,0x00,0x19,0x00]
+
+v_subb_u32_e64 v5, ttmp[10:11], 0, 0, s[6:7]
+// CHECK: [0x05,0x7a,0x1d,0xd1,0x80,0x00,0x19,0x00]
+
+v_subb_u32_e64 v5, s[12:13], -1, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x1d,0xd1,0xc1,0x00,0x19,0x00]
+
+v_subb_u32_e64 v5, s[12:13], 0.5, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x1d,0xd1,0xf0,0x00,0x19,0x00]
+
+v_subb_u32_e64 v5, s[12:13], -4.0, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x1d,0xd1,0xf7,0x00,0x19,0x00]
+
+v_subb_u32_e64 v5, s[12:13], v1, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x1d,0xd1,0x01,0x01,0x19,0x00]
+
+v_subb_u32_e64 v5, s[12:13], v255, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x1d,0xd1,0xff,0x01,0x19,0x00]
+
+v_subb_u32_e64 v5, s[12:13], 0, -1, s[6:7]
+// CHECK: [0x05,0x0c,0x1d,0xd1,0x80,0x82,0x19,0x00]
+
+v_subb_u32_e64 v5, s[12:13], 0, 0.5, s[6:7]
+// CHECK: [0x05,0x0c,0x1d,0xd1,0x80,0xe0,0x19,0x00]
+
+v_subb_u32_e64 v5, s[12:13], 0, -4.0, s[6:7]
+// CHECK: [0x05,0x0c,0x1d,0xd1,0x80,0xee,0x19,0x00]
+
+v_subb_u32_e64 v5, s[12:13], 0, v2, s[6:7]
+// CHECK: [0x05,0x0c,0x1d,0xd1,0x80,0x04,0x1a,0x00]
+
+v_subb_u32_e64 v5, s[12:13], 0, v255, s[6:7]
+// CHECK: [0x05,0x0c,0x1d,0xd1,0x80,0xfe,0x1b,0x00]
+
+v_subb_u32_e64 v5, s[12:13], 0, 0, s[8:9]
+// CHECK: [0x05,0x0c,0x1d,0xd1,0x80,0x00,0x21,0x00]
+
+v_subb_u32_e64 v5, s[12:13], 0, 0, s[100:101]
+// CHECK: [0x05,0x0c,0x1d,0xd1,0x80,0x00,0x91,0x01]
+
+v_subb_u32_e64 v5, s[12:13], 0, 0, flat_scratch
+// CHECK: [0x05,0x0c,0x1d,0xd1,0x80,0x00,0x99,0x01]
+
+v_subb_u32_e64 v5, s[12:13], 0, 0, vcc
+// CHECK: [0x05,0x0c,0x1d,0xd1,0x80,0x00,0xa9,0x01]
+
+v_subb_u32_e64 v5, s[12:13], 0, 0, tba
+// CHECK: [0x05,0x0c,0x1d,0xd1,0x80,0x00,0xb1,0x01]
+
+v_subb_u32_e64 v5, s[12:13], 0, 0, tma
+// CHECK: [0x05,0x0c,0x1d,0xd1,0x80,0x00,0xb9,0x01]
+
+v_subb_u32_e64 v5, s[12:13], 0, 0, ttmp[10:11]
+// CHECK: [0x05,0x0c,0x1d,0xd1,0x80,0x00,0xe9,0x01]
+
+v_subbrev_u32 v5, vcc, 0, v2, vcc
+// CHECK: [0x80,0x04,0x0a,0x3c]
+
+v_subbrev_u32 v255, vcc, 0, v2, vcc
+// CHECK: [0x80,0x04,0xfe,0x3d]
+
+v_subbrev_u32 v5, vcc, -1, v2, vcc
+// CHECK: [0xc1,0x04,0x0a,0x3c]
+
+v_subbrev_u32 v5, vcc, 0.5, v2, vcc
+// CHECK: [0xf0,0x04,0x0a,0x3c]
+
+v_subbrev_u32 v5, vcc, -4.0, v2, vcc
+// CHECK: [0xf7,0x04,0x0a,0x3c]
+
+v_subbrev_u32 v5, vcc, v1, v2, vcc
+// CHECK: [0x01,0x05,0x0a,0x3c]
+
+v_subbrev_u32 v5, vcc, v255, v2, vcc
+// CHECK: [0xff,0x05,0x0a,0x3c]
+
+v_subbrev_u32 v5, vcc, 0, v255, vcc
+// CHECK: [0x80,0xfe,0x0b,0x3c]
+
+v_subbrev_u32_e64 v5, s[12:13], 0, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x1e,0xd1,0x80,0x00,0x19,0x00]
+
+v_subbrev_u32_e64 v255, s[12:13], 0, 0, s[6:7]
+// CHECK: [0xff,0x0c,0x1e,0xd1,0x80,0x00,0x19,0x00]
+
+v_subbrev_u32_e64 v5, s[14:15], 0, 0, s[6:7]
+// CHECK: [0x05,0x0e,0x1e,0xd1,0x80,0x00,0x19,0x00]
+
+v_subbrev_u32_e64 v5, s[100:101], 0, 0, s[6:7]
+// CHECK: [0x05,0x64,0x1e,0xd1,0x80,0x00,0x19,0x00]
+
+v_subbrev_u32_e64 v5, flat_scratch, 0, 0, s[6:7]
+// CHECK: [0x05,0x66,0x1e,0xd1,0x80,0x00,0x19,0x00]
+
+v_subbrev_u32_e64 v5, vcc, 0, 0, s[6:7]
+// CHECK: [0x05,0x6a,0x1e,0xd1,0x80,0x00,0x19,0x00]
+
+v_subbrev_u32_e64 v5, tba, 0, 0, s[6:7]
+// CHECK: [0x05,0x6c,0x1e,0xd1,0x80,0x00,0x19,0x00]
+
+v_subbrev_u32_e64 v5, tma, 0, 0, s[6:7]
+// CHECK: [0x05,0x6e,0x1e,0xd1,0x80,0x00,0x19,0x00]
+
+v_subbrev_u32_e64 v5, ttmp[10:11], 0, 0, s[6:7]
+// CHECK: [0x05,0x7a,0x1e,0xd1,0x80,0x00,0x19,0x00]
+
+v_subbrev_u32_e64 v5, s[12:13], -1, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x1e,0xd1,0xc1,0x00,0x19,0x00]
+
+v_subbrev_u32_e64 v5, s[12:13], 0.5, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x1e,0xd1,0xf0,0x00,0x19,0x00]
+
+v_subbrev_u32_e64 v5, s[12:13], -4.0, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x1e,0xd1,0xf7,0x00,0x19,0x00]
+
+v_subbrev_u32_e64 v5, s[12:13], v1, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x1e,0xd1,0x01,0x01,0x19,0x00]
+
+v_subbrev_u32_e64 v5, s[12:13], v255, 0, s[6:7]
+// CHECK: [0x05,0x0c,0x1e,0xd1,0xff,0x01,0x19,0x00]
+
+v_subbrev_u32_e64 v5, s[12:13], 0, -1, s[6:7]
+// CHECK: [0x05,0x0c,0x1e,0xd1,0x80,0x82,0x19,0x00]
+
+v_subbrev_u32_e64 v5, s[12:13], 0, 0.5, s[6:7]
+// CHECK: [0x05,0x0c,0x1e,0xd1,0x80,0xe0,0x19,0x00]
+
+v_subbrev_u32_e64 v5, s[12:13], 0, -4.0, s[6:7]
+// CHECK: [0x05,0x0c,0x1e,0xd1,0x80,0xee,0x19,0x00]
+
+v_subbrev_u32_e64 v5, s[12:13], 0, v2, s[6:7]
+// CHECK: [0x05,0x0c,0x1e,0xd1,0x80,0x04,0x1a,0x00]
+
+v_subbrev_u32_e64 v5, s[12:13], 0, v255, s[6:7]
+// CHECK: [0x05,0x0c,0x1e,0xd1,0x80,0xfe,0x1b,0x00]
+
+v_subbrev_u32_e64 v5, s[12:13], 0, 0, s[8:9]
+// CHECK: [0x05,0x0c,0x1e,0xd1,0x80,0x00,0x21,0x00]
+
+v_subbrev_u32_e64 v5, s[12:13], 0, 0, s[100:101]
+// CHECK: [0x05,0x0c,0x1e,0xd1,0x80,0x00,0x91,0x01]
+
+v_subbrev_u32_e64 v5, s[12:13], 0, 0, flat_scratch
+// CHECK: [0x05,0x0c,0x1e,0xd1,0x80,0x00,0x99,0x01]
+
+v_subbrev_u32_e64 v5, s[12:13], 0, 0, vcc
+// CHECK: [0x05,0x0c,0x1e,0xd1,0x80,0x00,0xa9,0x01]
+
+v_subbrev_u32_e64 v5, s[12:13], 0, 0, tba
+// CHECK: [0x05,0x0c,0x1e,0xd1,0x80,0x00,0xb1,0x01]
+
+v_subbrev_u32_e64 v5, s[12:13], 0, 0, tma
+// CHECK: [0x05,0x0c,0x1e,0xd1,0x80,0x00,0xb9,0x01]
+
+v_subbrev_u32_e64 v5, s[12:13], 0, 0, ttmp[10:11]
+// CHECK: [0x05,0x0c,0x1e,0xd1,0x80,0x00,0xe9,0x01]
+
+v_add_f16 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x3e]
+
+v_add_f16 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x3f]
+
+v_add_f16 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x3e]
+
+v_add_f16 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x3e]
+
+v_add_f16 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x3e]
+
+v_add_f16 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x3e]
+
+v_add_f16 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x3e]
+
+v_add_f16 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x3e]
+
+v_add_f16 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x3e]
+
+v_add_f16 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x3e]
+
+v_add_f16 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x3e]
+
+v_add_f16 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x3e]
+
+v_add_f16 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x3e]
+
+v_add_f16 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x3e]
+
+v_add_f16 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x3e]
+
+v_add_f16 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x3e]
+
+v_add_f16 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x3e]
+
+v_add_f16 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x3e]
+
+v_add_f16 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x3e]
+
+v_add_f16 v5, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x0a,0x3e,0x0b,0xfe,0x00,0x00]
+
+v_add_f16 v5, 0x3456, v2
+// CHECK: [0xff,0x04,0x0a,0x3e,0x56,0x34,0x00,0x00]
+
+v_add_f16 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x3e]
+
+v_add_f16 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x3e]
+
+v_add_f16 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x3e]
+
+v_add_f16_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0x05,0x00,0x00]
+
+v_add_f16_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x1f,0xd1,0x01,0x05,0x00,0x00]
+
+v_add_f16_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x1f,0xd1,0xff,0x05,0x00,0x00]
+
+v_add_f16_e64 v5, v1, s101
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0xcb,0x00,0x00]
+
+v_add_f16_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0xcd,0x00,0x00]
+
+v_add_f16_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0xcf,0x00,0x00]
+
+v_add_f16_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0xd5,0x00,0x00]
+
+v_add_f16_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0xd7,0x00,0x00]
+
+v_add_f16_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0xd9,0x00,0x00]
+
+v_add_f16_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0xdb,0x00,0x00]
+
+v_add_f16_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0xdd,0x00,0x00]
+
+v_add_f16_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0xdf,0x00,0x00]
+
+v_add_f16_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0xf7,0x00,0x00]
+
+v_add_f16_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0xf9,0x00,0x00]
+
+v_add_f16_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0xfd,0x00,0x00]
+
+v_add_f16_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0xff,0x00,0x00]
+
+v_add_f16_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0xfb,0x01,0x00]
+
+v_add_f16_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0x05,0x02,0x00]
+
+v_add_f16_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0xff,0x03,0x00]
+
+v_add_f16_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0x05,0x00,0x20]
+
+v_add_f16_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0x05,0x00,0x40]
+
+v_add_f16_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x1f,0xd1,0x01,0x05,0x00,0x60]
+
+v_add_f16_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x1f,0xd1,0x01,0x05,0x00,0x00]
+
+v_add_f16_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x1f,0xd1,0x01,0x05,0x00,0x00]
+
+v_add_f16_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x1f,0xd1,0x01,0x05,0x00,0x00]
+
+v_add_f16_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x1f,0xd1,0x01,0x05,0x00,0x00]
+
+v_sub_f16 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x40]
+
+v_sub_f16 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x41]
+
+v_sub_f16 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x40]
+
+v_sub_f16 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x40]
+
+v_sub_f16 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x40]
+
+v_sub_f16 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x40]
+
+v_sub_f16 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x40]
+
+v_sub_f16 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x40]
+
+v_sub_f16 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x40]
+
+v_sub_f16 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x40]
+
+v_sub_f16 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x40]
+
+v_sub_f16 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x40]
+
+v_sub_f16 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x40]
+
+v_sub_f16 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x40]
+
+v_sub_f16 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x40]
+
+v_sub_f16 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x40]
+
+v_sub_f16 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x40]
+
+v_sub_f16 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x40]
+
+v_sub_f16 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x40]
+
+v_sub_f16 v5, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x0a,0x40,0x0b,0xfe,0x00,0x00]
+
+v_sub_f16 v5, 0x3456, v2
+// CHECK: [0xff,0x04,0x0a,0x40,0x56,0x34,0x00,0x00]
+
+v_sub_f16 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x40]
+
+v_sub_f16 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x40]
+
+v_sub_f16 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x40]
+
+v_sub_f16_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0x05,0x00,0x00]
+
+v_sub_f16_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x20,0xd1,0x01,0x05,0x00,0x00]
+
+v_sub_f16_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x20,0xd1,0xff,0x05,0x00,0x00]
+
+v_sub_f16_e64 v5, v1, s101
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0xcb,0x00,0x00]
+
+v_sub_f16_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0xcd,0x00,0x00]
+
+v_sub_f16_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0xcf,0x00,0x00]
+
+v_sub_f16_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0xd5,0x00,0x00]
+
+v_sub_f16_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0xd7,0x00,0x00]
+
+v_sub_f16_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0xd9,0x00,0x00]
+
+v_sub_f16_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0xdb,0x00,0x00]
+
+v_sub_f16_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0xdd,0x00,0x00]
+
+v_sub_f16_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0xdf,0x00,0x00]
+
+v_sub_f16_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0xf7,0x00,0x00]
+
+v_sub_f16_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0xf9,0x00,0x00]
+
+v_sub_f16_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0xfd,0x00,0x00]
+
+v_sub_f16_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0xff,0x00,0x00]
+
+v_sub_f16_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0xfb,0x01,0x00]
+
+v_sub_f16_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0x05,0x02,0x00]
+
+v_sub_f16_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0xff,0x03,0x00]
+
+v_sub_f16_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0x05,0x00,0x20]
+
+v_sub_f16_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0x05,0x00,0x40]
+
+v_sub_f16_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x20,0xd1,0x01,0x05,0x00,0x60]
+
+v_sub_f16_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x20,0xd1,0x01,0x05,0x00,0x00]
+
+v_sub_f16_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x20,0xd1,0x01,0x05,0x00,0x00]
+
+v_sub_f16_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x20,0xd1,0x01,0x05,0x00,0x00]
+
+v_sub_f16_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x20,0xd1,0x01,0x05,0x00,0x00]
+
+v_subrev_f16 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x42]
+
+v_subrev_f16 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x43]
+
+v_subrev_f16 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x42]
+
+v_subrev_f16 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x42]
+
+v_subrev_f16 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x42]
+
+v_subrev_f16 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x42]
+
+v_subrev_f16 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x42]
+
+v_subrev_f16 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x42]
+
+v_subrev_f16 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x42]
+
+v_subrev_f16 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x42]
+
+v_subrev_f16 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x42]
+
+v_subrev_f16 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x42]
+
+v_subrev_f16 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x42]
+
+v_subrev_f16 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x42]
+
+v_subrev_f16 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x42]
+
+v_subrev_f16 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x42]
+
+v_subrev_f16 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x42]
+
+v_subrev_f16 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x42]
+
+v_subrev_f16 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x42]
+
+v_subrev_f16 v5, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x0a,0x42,0x0b,0xfe,0x00,0x00]
+
+v_subrev_f16 v5, 0x3456, v2
+// CHECK: [0xff,0x04,0x0a,0x42,0x56,0x34,0x00,0x00]
+
+v_subrev_f16 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x42]
+
+v_subrev_f16 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x42]
+
+v_subrev_f16 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x42]
+
+v_subrev_f16_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0x05,0x00,0x00]
+
+v_subrev_f16_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x21,0xd1,0x01,0x05,0x00,0x00]
+
+v_subrev_f16_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x21,0xd1,0xff,0x05,0x00,0x00]
+
+v_subrev_f16_e64 v5, v1, s101
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0xcb,0x00,0x00]
+
+v_subrev_f16_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0xcd,0x00,0x00]
+
+v_subrev_f16_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0xcf,0x00,0x00]
+
+v_subrev_f16_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0xd5,0x00,0x00]
+
+v_subrev_f16_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0xd7,0x00,0x00]
+
+v_subrev_f16_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0xd9,0x00,0x00]
+
+v_subrev_f16_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0xdb,0x00,0x00]
+
+v_subrev_f16_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0xdd,0x00,0x00]
+
+v_subrev_f16_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0xdf,0x00,0x00]
+
+v_subrev_f16_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0xf7,0x00,0x00]
+
+v_subrev_f16_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0xf9,0x00,0x00]
+
+v_subrev_f16_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0xfd,0x00,0x00]
+
+v_subrev_f16_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0xff,0x00,0x00]
+
+v_subrev_f16_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0xfb,0x01,0x00]
+
+v_subrev_f16_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0x05,0x02,0x00]
+
+v_subrev_f16_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0xff,0x03,0x00]
+
+v_subrev_f16_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0x05,0x00,0x20]
+
+v_subrev_f16_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0x05,0x00,0x40]
+
+v_subrev_f16_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x21,0xd1,0x01,0x05,0x00,0x60]
+
+v_subrev_f16_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x21,0xd1,0x01,0x05,0x00,0x00]
+
+v_subrev_f16_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x21,0xd1,0x01,0x05,0x00,0x00]
+
+v_subrev_f16_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x21,0xd1,0x01,0x05,0x00,0x00]
+
+v_subrev_f16_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x21,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_f16 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x44]
+
+v_mul_f16 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x45]
+
+v_mul_f16 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x44]
+
+v_mul_f16 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x44]
+
+v_mul_f16 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x44]
+
+v_mul_f16 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x44]
+
+v_mul_f16 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x44]
+
+v_mul_f16 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x44]
+
+v_mul_f16 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x44]
+
+v_mul_f16 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x44]
+
+v_mul_f16 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x44]
+
+v_mul_f16 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x44]
+
+v_mul_f16 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x44]
+
+v_mul_f16 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x44]
+
+v_mul_f16 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x44]
+
+v_mul_f16 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x44]
+
+v_mul_f16 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x44]
+
+v_mul_f16 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x44]
+
+v_mul_f16 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x44]
+
+v_mul_f16 v5, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x0a,0x44,0x0b,0xfe,0x00,0x00]
+
+v_mul_f16 v5, 0x3456, v2
+// CHECK: [0xff,0x04,0x0a,0x44,0x56,0x34,0x00,0x00]
+
+v_mul_f16 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x44]
+
+v_mul_f16 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x44]
+
+v_mul_f16 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x44]
+
+v_mul_f16_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_f16_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x22,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_f16_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x22,0xd1,0xff,0x05,0x00,0x00]
+
+v_mul_f16_e64 v5, v1, s101
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0xcb,0x00,0x00]
+
+v_mul_f16_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0xcd,0x00,0x00]
+
+v_mul_f16_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0xcf,0x00,0x00]
+
+v_mul_f16_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0xd5,0x00,0x00]
+
+v_mul_f16_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0xd7,0x00,0x00]
+
+v_mul_f16_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0xd9,0x00,0x00]
+
+v_mul_f16_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0xdb,0x00,0x00]
+
+v_mul_f16_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0xdd,0x00,0x00]
+
+v_mul_f16_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0xdf,0x00,0x00]
+
+v_mul_f16_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0xf7,0x00,0x00]
+
+v_mul_f16_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0xf9,0x00,0x00]
+
+v_mul_f16_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0xfd,0x00,0x00]
+
+v_mul_f16_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0xff,0x00,0x00]
+
+v_mul_f16_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0xfb,0x01,0x00]
+
+v_mul_f16_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0x05,0x02,0x00]
+
+v_mul_f16_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0xff,0x03,0x00]
+
+v_mul_f16_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0x05,0x00,0x20]
+
+v_mul_f16_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0x05,0x00,0x40]
+
+v_mul_f16_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x22,0xd1,0x01,0x05,0x00,0x60]
+
+v_mul_f16_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x22,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_f16_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x22,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_f16_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x22,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_f16_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x22,0xd1,0x01,0x05,0x00,0x00]
+
+v_mac_f16 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x46]
+
+v_mac_f16 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x47]
+
+v_mac_f16 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x46]
+
+v_mac_f16 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x46]
+
+v_mac_f16 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x46]
+
+v_mac_f16 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x46]
+
+v_mac_f16 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x46]
+
+v_mac_f16 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x46]
+
+v_mac_f16 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x46]
+
+v_mac_f16 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x46]
+
+v_mac_f16 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x46]
+
+v_mac_f16 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x46]
+
+v_mac_f16 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x46]
+
+v_mac_f16 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x46]
+
+v_mac_f16 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x46]
+
+v_mac_f16 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x46]
+
+v_mac_f16 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x46]
+
+v_mac_f16 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x46]
+
+v_mac_f16 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x46]
+
+v_mac_f16 v5, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x0a,0x46,0x0b,0xfe,0x00,0x00]
+
+v_mac_f16 v5, 0x3456, v2
+// CHECK: [0xff,0x04,0x0a,0x46,0x56,0x34,0x00,0x00]
+
+v_mac_f16 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x46]
+
+v_mac_f16 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x46]
+
+v_mac_f16 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x46]
+
+v_mac_f16_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0x05,0x00,0x00]
+
+v_mac_f16_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x23,0xd1,0x01,0x05,0x00,0x00]
+
+v_mac_f16_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x23,0xd1,0xff,0x05,0x00,0x00]
+
+v_mac_f16_e64 v5, v1, s101
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0xcb,0x00,0x00]
+
+v_mac_f16_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0xcd,0x00,0x00]
+
+v_mac_f16_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0xcf,0x00,0x00]
+
+v_mac_f16_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0xd5,0x00,0x00]
+
+v_mac_f16_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0xd7,0x00,0x00]
+
+v_mac_f16_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0xd9,0x00,0x00]
+
+v_mac_f16_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0xdb,0x00,0x00]
+
+v_mac_f16_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0xdd,0x00,0x00]
+
+v_mac_f16_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0xdf,0x00,0x00]
+
+v_mac_f16_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0xf7,0x00,0x00]
+
+v_mac_f16_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0xf9,0x00,0x00]
+
+v_mac_f16_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0xfd,0x00,0x00]
+
+v_mac_f16_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0xff,0x00,0x00]
+
+v_mac_f16_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0xfb,0x01,0x00]
+
+v_mac_f16_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0x05,0x02,0x00]
+
+v_mac_f16_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0xff,0x03,0x00]
+
+v_mac_f16_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0x05,0x00,0x20]
+
+v_mac_f16_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0x05,0x00,0x40]
+
+v_mac_f16_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x23,0xd1,0x01,0x05,0x00,0x60]
+
+v_mac_f16_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x23,0xd1,0x01,0x05,0x00,0x00]
+
+v_mac_f16_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x23,0xd1,0x01,0x05,0x00,0x00]
+
+v_mac_f16_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x23,0xd1,0x01,0x05,0x00,0x00]
+
+v_mac_f16_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x23,0xd1,0x01,0x05,0x00,0x00]
+
+v_madmk_f16 v5, 0, 0x1121, v3
+// CHECK: [0x80,0x06,0x0a,0x48,0x21,0x11,0x00,0x00]
+
+v_madmk_f16 v255, 0, 0x1121, v3
+// CHECK: [0x80,0x06,0xfe,0x49,0x21,0x11,0x00,0x00]
+
+v_madmk_f16 v5, -1, 0x1121, v3
+// CHECK: [0xc1,0x06,0x0a,0x48,0x21,0x11,0x00,0x00]
+
+v_madmk_f16 v5, 0.5, 0x1121, v3
+// CHECK: [0xf0,0x06,0x0a,0x48,0x21,0x11,0x00,0x00]
+
+v_madmk_f16 v5, -4.0, 0x1121, v3
+// CHECK: [0xf7,0x06,0x0a,0x48,0x21,0x11,0x00,0x00]
+
+v_madmk_f16 v5, v1, 0x1121, v3
+// CHECK: [0x01,0x07,0x0a,0x48,0x21,0x11,0x00,0x00]
+
+v_madmk_f16 v5, v255, 0x1121, v3
+// CHECK: [0xff,0x07,0x0a,0x48,0x21,0x11,0x00,0x00]
+
+v_madmk_f16 v5, 0, 0xa1b1, v3
+// CHECK: [0x80,0x06,0x0a,0x48,0xb1,0xa1,0x00,0x00]
+
+v_madmk_f16 v5, 0, 0x1121, v255
+// CHECK: [0x80,0xfe,0x0b,0x48,0x21,0x11,0x00,0x00]
+
+v_madak_f16 v5, 0, v2, 0x1121
+// CHECK: [0x80,0x04,0x0a,0x4a,0x21,0x11,0x00,0x00]
+
+v_madak_f16 v255, 0, v2, 0x1121
+// CHECK: [0x80,0x04,0xfe,0x4b,0x21,0x11,0x00,0x00]
+
+v_madak_f16 v5, -1, v2, 0x1121
+// CHECK: [0xc1,0x04,0x0a,0x4a,0x21,0x11,0x00,0x00]
+
+v_madak_f16 v5, 0.5, v2, 0x1121
+// CHECK: [0xf0,0x04,0x0a,0x4a,0x21,0x11,0x00,0x00]
+
+v_madak_f16 v5, -4.0, v2, 0x1121
+// CHECK: [0xf7,0x04,0x0a,0x4a,0x21,0x11,0x00,0x00]
+
+v_madak_f16 v5, v1, v2, 0x1121
+// CHECK: [0x01,0x05,0x0a,0x4a,0x21,0x11,0x00,0x00]
+
+v_madak_f16 v5, v255, v2, 0x1121
+// CHECK: [0xff,0x05,0x0a,0x4a,0x21,0x11,0x00,0x00]
+
+v_madak_f16 v5, 0, v255, 0x1121
+// CHECK: [0x80,0xfe,0x0b,0x4a,0x21,0x11,0x00,0x00]
+
+v_madak_f16 v5, 0, v2, 0xa1b1
+// CHECK: [0x80,0x04,0x0a,0x4a,0xb1,0xa1,0x00,0x00]
+
+v_add_u16 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x4c]
+
+v_add_u16 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x4d]
+
+v_add_u16 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x4c]
+
+v_add_u16 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x4c]
+
+v_add_u16 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x4c]
+
+v_add_u16 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x4c]
+
+v_add_u16 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x4c]
+
+v_add_u16 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x4c]
+
+v_add_u16 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x4c]
+
+v_add_u16 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x4c]
+
+v_add_u16 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x4c]
+
+v_add_u16 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x4c]
+
+v_add_u16 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x4c]
+
+v_add_u16 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x4c]
+
+v_add_u16 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x4c]
+
+v_add_u16 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x4c]
+
+v_add_u16 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x4c]
+
+v_add_u16 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x4c]
+
+v_add_u16 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x4c]
+
+v_add_u16 v5, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x0a,0x4c,0x0b,0xfe,0x00,0x00]
+
+v_add_u16 v5, 0x3456, v2
+// CHECK: [0xff,0x04,0x0a,0x4c,0x56,0x34,0x00,0x00]
+
+v_add_u16 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x4c]
+
+v_add_u16 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x4c]
+
+v_add_u16 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x4c]
+
+v_add_u16_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0x04,0x00,0x00]
+
+v_add_u16_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x26,0xd1,0x80,0x04,0x00,0x00]
+
+v_add_u16_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x26,0xd1,0xc1,0x04,0x00,0x00]
+
+v_add_u16_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x26,0xd1,0xf0,0x04,0x00,0x00]
+
+v_add_u16_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x26,0xd1,0xf7,0x04,0x00,0x00]
+
+v_add_u16_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x26,0xd1,0x01,0x05,0x00,0x00]
+
+v_add_u16_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x26,0xd1,0xff,0x05,0x00,0x00]
+
+v_add_u16_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0xca,0x00,0x00]
+
+v_add_u16_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0xcc,0x00,0x00]
+
+v_add_u16_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0xce,0x00,0x00]
+
+v_add_u16_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0xd4,0x00,0x00]
+
+v_add_u16_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0xd6,0x00,0x00]
+
+v_add_u16_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0xd8,0x00,0x00]
+
+v_add_u16_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0xda,0x00,0x00]
+
+v_add_u16_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0xdc,0x00,0x00]
+
+v_add_u16_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0xde,0x00,0x00]
+
+v_add_u16_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0xf6,0x00,0x00]
+
+v_add_u16_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0xf8,0x00,0x00]
+
+v_add_u16_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0xfc,0x00,0x00]
+
+v_add_u16_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0xfe,0x00,0x00]
+
+v_add_u16_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0x00,0x01,0x00]
+
+v_add_u16_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0x82,0x01,0x00]
+
+v_add_u16_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0xe0,0x01,0x00]
+
+v_add_u16_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0xee,0x01,0x00]
+
+v_add_u16_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0x04,0x02,0x00]
+
+v_add_u16_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x26,0xd1,0x80,0xfe,0x03,0x00]
+
+v_sub_u16 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x4e]
+
+v_sub_u16 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x4f]
+
+v_sub_u16 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x4e]
+
+v_sub_u16 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x4e]
+
+v_sub_u16 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x4e]
+
+v_sub_u16 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x4e]
+
+v_sub_u16 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x4e]
+
+v_sub_u16 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x4e]
+
+v_sub_u16 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x4e]
+
+v_sub_u16 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x4e]
+
+v_sub_u16 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x4e]
+
+v_sub_u16 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x4e]
+
+v_sub_u16 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x4e]
+
+v_sub_u16 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x4e]
+
+v_sub_u16 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x4e]
+
+v_sub_u16 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x4e]
+
+v_sub_u16 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x4e]
+
+v_sub_u16 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x4e]
+
+v_sub_u16 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x4e]
+
+v_sub_u16 v5, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x0a,0x4e,0x0b,0xfe,0x00,0x00]
+
+v_sub_u16 v5, 0x3456, v2
+// CHECK: [0xff,0x04,0x0a,0x4e,0x56,0x34,0x00,0x00]
+
+v_sub_u16 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x4e]
+
+v_sub_u16 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x4e]
+
+v_sub_u16 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x4e]
+
+v_sub_u16_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0x04,0x00,0x00]
+
+v_sub_u16_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x27,0xd1,0x80,0x04,0x00,0x00]
+
+v_sub_u16_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x27,0xd1,0xc1,0x04,0x00,0x00]
+
+v_sub_u16_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x27,0xd1,0xf0,0x04,0x00,0x00]
+
+v_sub_u16_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x27,0xd1,0xf7,0x04,0x00,0x00]
+
+v_sub_u16_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x27,0xd1,0x01,0x05,0x00,0x00]
+
+v_sub_u16_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x27,0xd1,0xff,0x05,0x00,0x00]
+
+v_sub_u16_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0xca,0x00,0x00]
+
+v_sub_u16_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0xcc,0x00,0x00]
+
+v_sub_u16_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0xce,0x00,0x00]
+
+v_sub_u16_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0xd4,0x00,0x00]
+
+v_sub_u16_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0xd6,0x00,0x00]
+
+v_sub_u16_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0xd8,0x00,0x00]
+
+v_sub_u16_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0xda,0x00,0x00]
+
+v_sub_u16_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0xdc,0x00,0x00]
+
+v_sub_u16_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0xde,0x00,0x00]
+
+v_sub_u16_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0xf6,0x00,0x00]
+
+v_sub_u16_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0xf8,0x00,0x00]
+
+v_sub_u16_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0xfc,0x00,0x00]
+
+v_sub_u16_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0xfe,0x00,0x00]
+
+v_sub_u16_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0x00,0x01,0x00]
+
+v_sub_u16_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0x82,0x01,0x00]
+
+v_sub_u16_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0xe0,0x01,0x00]
+
+v_sub_u16_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0xee,0x01,0x00]
+
+v_sub_u16_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0x04,0x02,0x00]
+
+v_sub_u16_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x27,0xd1,0x80,0xfe,0x03,0x00]
+
+v_subrev_u16 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x50]
+
+v_subrev_u16 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x51]
+
+v_subrev_u16 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x50]
+
+v_subrev_u16 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x50]
+
+v_subrev_u16 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x50]
+
+v_subrev_u16 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x50]
+
+v_subrev_u16 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x50]
+
+v_subrev_u16 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x50]
+
+v_subrev_u16 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x50]
+
+v_subrev_u16 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x50]
+
+v_subrev_u16 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x50]
+
+v_subrev_u16 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x50]
+
+v_subrev_u16 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x50]
+
+v_subrev_u16 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x50]
+
+v_subrev_u16 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x50]
+
+v_subrev_u16 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x50]
+
+v_subrev_u16 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x50]
+
+v_subrev_u16 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x50]
+
+v_subrev_u16 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x50]
+
+v_subrev_u16 v5, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x0a,0x50,0x0b,0xfe,0x00,0x00]
+
+v_subrev_u16 v5, 0x3456, v2
+// CHECK: [0xff,0x04,0x0a,0x50,0x56,0x34,0x00,0x00]
+
+v_subrev_u16 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x50]
+
+v_subrev_u16 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x50]
+
+v_subrev_u16 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x50]
+
+v_subrev_u16_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0x04,0x00,0x00]
+
+v_subrev_u16_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x28,0xd1,0x80,0x04,0x00,0x00]
+
+v_subrev_u16_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x28,0xd1,0xc1,0x04,0x00,0x00]
+
+v_subrev_u16_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x28,0xd1,0xf0,0x04,0x00,0x00]
+
+v_subrev_u16_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x28,0xd1,0xf7,0x04,0x00,0x00]
+
+v_subrev_u16_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x28,0xd1,0x01,0x05,0x00,0x00]
+
+v_subrev_u16_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x28,0xd1,0xff,0x05,0x00,0x00]
+
+v_subrev_u16_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0xca,0x00,0x00]
+
+v_subrev_u16_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0xcc,0x00,0x00]
+
+v_subrev_u16_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0xce,0x00,0x00]
+
+v_subrev_u16_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0xd4,0x00,0x00]
+
+v_subrev_u16_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0xd6,0x00,0x00]
+
+v_subrev_u16_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0xd8,0x00,0x00]
+
+v_subrev_u16_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0xda,0x00,0x00]
+
+v_subrev_u16_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0xdc,0x00,0x00]
+
+v_subrev_u16_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0xde,0x00,0x00]
+
+v_subrev_u16_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0xf6,0x00,0x00]
+
+v_subrev_u16_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0xf8,0x00,0x00]
+
+v_subrev_u16_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0xfc,0x00,0x00]
+
+v_subrev_u16_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0xfe,0x00,0x00]
+
+v_subrev_u16_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0x00,0x01,0x00]
+
+v_subrev_u16_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0x82,0x01,0x00]
+
+v_subrev_u16_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0xe0,0x01,0x00]
+
+v_subrev_u16_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0xee,0x01,0x00]
+
+v_subrev_u16_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0x04,0x02,0x00]
+
+v_subrev_u16_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x28,0xd1,0x80,0xfe,0x03,0x00]
+
+v_mul_lo_u16 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x52]
+
+v_mul_lo_u16 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x53]
+
+v_mul_lo_u16 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x52]
+
+v_mul_lo_u16 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x52]
+
+v_mul_lo_u16 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x52]
+
+v_mul_lo_u16 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x52]
+
+v_mul_lo_u16 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x52]
+
+v_mul_lo_u16 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x52]
+
+v_mul_lo_u16 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x52]
+
+v_mul_lo_u16 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x52]
+
+v_mul_lo_u16 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x52]
+
+v_mul_lo_u16 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x52]
+
+v_mul_lo_u16 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x52]
+
+v_mul_lo_u16 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x52]
+
+v_mul_lo_u16 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x52]
+
+v_mul_lo_u16 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x52]
+
+v_mul_lo_u16 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x52]
+
+v_mul_lo_u16 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x52]
+
+v_mul_lo_u16 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x52]
+
+v_mul_lo_u16 v5, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x0a,0x52,0x0b,0xfe,0x00,0x00]
+
+v_mul_lo_u16 v5, 0x3456, v2
+// CHECK: [0xff,0x04,0x0a,0x52,0x56,0x34,0x00,0x00]
+
+v_mul_lo_u16 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x52]
+
+v_mul_lo_u16 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x52]
+
+v_mul_lo_u16 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x52]
+
+v_mul_lo_u16_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0x04,0x00,0x00]
+
+v_mul_lo_u16_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x29,0xd1,0x80,0x04,0x00,0x00]
+
+v_mul_lo_u16_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x29,0xd1,0xc1,0x04,0x00,0x00]
+
+v_mul_lo_u16_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x29,0xd1,0xf0,0x04,0x00,0x00]
+
+v_mul_lo_u16_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x29,0xd1,0xf7,0x04,0x00,0x00]
+
+v_mul_lo_u16_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x29,0xd1,0x01,0x05,0x00,0x00]
+
+v_mul_lo_u16_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x29,0xd1,0xff,0x05,0x00,0x00]
+
+v_mul_lo_u16_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0xca,0x00,0x00]
+
+v_mul_lo_u16_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0xcc,0x00,0x00]
+
+v_mul_lo_u16_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0xce,0x00,0x00]
+
+v_mul_lo_u16_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0xd4,0x00,0x00]
+
+v_mul_lo_u16_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0xd6,0x00,0x00]
+
+v_mul_lo_u16_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0xd8,0x00,0x00]
+
+v_mul_lo_u16_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0xda,0x00,0x00]
+
+v_mul_lo_u16_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0xdc,0x00,0x00]
+
+v_mul_lo_u16_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0xde,0x00,0x00]
+
+v_mul_lo_u16_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0xf6,0x00,0x00]
+
+v_mul_lo_u16_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0xf8,0x00,0x00]
+
+v_mul_lo_u16_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0xfc,0x00,0x00]
+
+v_mul_lo_u16_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0xfe,0x00,0x00]
+
+v_mul_lo_u16_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0x00,0x01,0x00]
+
+v_mul_lo_u16_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0x82,0x01,0x00]
+
+v_mul_lo_u16_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0xe0,0x01,0x00]
+
+v_mul_lo_u16_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0xee,0x01,0x00]
+
+v_mul_lo_u16_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0x04,0x02,0x00]
+
+v_mul_lo_u16_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x29,0xd1,0x80,0xfe,0x03,0x00]
+
+v_lshlrev_b16 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x54]
 
-v_movreld_b32_e64 v0, -1
-// CHECK: [0x00,0x00,0x76,0xd1,0xc1,0x00,0x00,0x00]
+v_lshlrev_b16 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x55]
 
-v_movreld_b32_e64 v0, 0.5
-// CHECK: [0x00,0x00,0x76,0xd1,0xf0,0x00,0x00,0x00]
+v_lshlrev_b16 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x54]
 
-v_movreld_b32_e64 v0, -4.0
-// CHECK: [0x00,0x00,0x76,0xd1,0xf7,0x00,0x00,0x00]
+v_lshlrev_b16 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x54]
 
-v_movreld_b32_e64 v0, v0
-// CHECK: [0x00,0x00,0x76,0xd1,0x00,0x01,0x00,0x00]
+v_lshlrev_b16 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x54]
 
-v_movreld_b32_e64 v0, v255
-// CHECK: [0x00,0x00,0x76,0xd1,0xff,0x01,0x00,0x00]
+v_lshlrev_b16 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x54]
 
-v_movrels_b32 v0, v0
-// CHECK: [0x00,0x6f,0x00,0x7e]
+v_lshlrev_b16 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x54]
 
-v_movrels_b32 v255, v0
-// CHECK: [0x00,0x6f,0xfe,0x7f]
+v_lshlrev_b16 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x54]
 
-v_movrels_b32 v0, v255
-// CHECK: [0xff,0x6f,0x00,0x7e]
+v_lshlrev_b16 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x54]
 
-v_movrels_b32_e64 v0, v0
-// CHECK: [0x00,0x00,0x77,0xd1,0x00,0x01,0x00,0x00]
+v_lshlrev_b16 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x54]
 
-v_movrels_b32_e64 v255, v0
-// CHECK: [0xff,0x00,0x77,0xd1,0x00,0x01,0x00,0x00]
+v_lshlrev_b16 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x54]
 
-v_movrels_b32_e64 v0, v255
-// CHECK: [0x00,0x00,0x77,0xd1,0xff,0x01,0x00,0x00]
+v_lshlrev_b16 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x54]
 
-v_movrelsd_b32 v0, v0
-// CHECK: [0x00,0x71,0x00,0x7e]
+v_lshlrev_b16 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x54]
 
-v_movrelsd_b32 v255, v0
-// CHECK: [0x00,0x71,0xfe,0x7f]
+v_lshlrev_b16 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x54]
 
-v_movrelsd_b32 v0, v255
-// CHECK: [0xff,0x71,0x00,0x7e]
+v_lshlrev_b16 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x54]
 
-v_movrelsd_b32_e64 v0, v0
-// CHECK: [0x00,0x00,0x78,0xd1,0x00,0x01,0x00,0x00]
+v_lshlrev_b16 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x54]
 
-v_movrelsd_b32_e64 v255, v0
-// CHECK: [0xff,0x00,0x78,0xd1,0x00,0x01,0x00,0x00]
+v_lshlrev_b16 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x54]
 
-v_movrelsd_b32_e64 v0, v255
-// CHECK: [0x00,0x00,0x78,0xd1,0xff,0x01,0x00,0x00]
+v_lshlrev_b16 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x54]
 
-v_cvt_f16_u16 v0, s0
-// CHECK: [0x00,0x72,0x00,0x7e]
+v_lshlrev_b16 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x54]
 
-v_cvt_f16_u16 v255, s0
-// CHECK: [0x00,0x72,0xfe,0x7f]
+v_lshlrev_b16 v5, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x0a,0x54,0x0b,0xfe,0x00,0x00]
 
-v_cvt_f16_u16 v0, s101
-// CHECK: [0x65,0x72,0x00,0x7e]
+v_lshlrev_b16 v5, 0x3456, v2
+// CHECK: [0xff,0x04,0x0a,0x54,0x56,0x34,0x00,0x00]
 
-v_cvt_f16_u16 v0, flat_scratch_lo
-// CHECK: [0x66,0x72,0x00,0x7e]
+v_lshlrev_b16 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x54]
 
-v_cvt_f16_u16 v0, flat_scratch_hi
-// CHECK: [0x67,0x72,0x00,0x7e]
+v_lshlrev_b16 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x54]
 
-v_cvt_f16_u16 v0, vcc_lo
-// CHECK: [0x6a,0x72,0x00,0x7e]
+v_lshlrev_b16 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x54]
 
-v_cvt_f16_u16 v0, vcc_hi
-// CHECK: [0x6b,0x72,0x00,0x7e]
+v_lshlrev_b16_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cvt_f16_u16 v0, tba_lo
-// CHECK: [0x6c,0x72,0x00,0x7e]
+v_lshlrev_b16_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x2a,0xd1,0x80,0x04,0x00,0x00]
 
-v_cvt_f16_u16 v0, tba_hi
-// CHECK: [0x6d,0x72,0x00,0x7e]
+v_lshlrev_b16_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x2a,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cvt_f16_u16 v0, tma_lo
-// CHECK: [0x6e,0x72,0x00,0x7e]
+v_lshlrev_b16_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x2a,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cvt_f16_u16 v0, tma_hi
-// CHECK: [0x6f,0x72,0x00,0x7e]
+v_lshlrev_b16_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x2a,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cvt_f16_u16 v0, ttmp11
-// CHECK: [0x7b,0x72,0x00,0x7e]
+v_lshlrev_b16_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x2a,0xd1,0x01,0x05,0x00,0x00]
 
-v_cvt_f16_u16 v0, m0
-// CHECK: [0x7c,0x72,0x00,0x7e]
+v_lshlrev_b16_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x2a,0xd1,0xff,0x05,0x00,0x00]
 
-v_cvt_f16_u16 v0, exec_lo
-// CHECK: [0x7e,0x72,0x00,0x7e]
+v_lshlrev_b16_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0xca,0x00,0x00]
 
-v_cvt_f16_u16 v0, exec_hi
-// CHECK: [0x7f,0x72,0x00,0x7e]
+v_lshlrev_b16_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0xcc,0x00,0x00]
 
-v_cvt_f16_u16 v0, 0
-// CHECK: [0x80,0x72,0x00,0x7e]
+v_lshlrev_b16_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0xce,0x00,0x00]
 
-v_cvt_f16_u16 v0, -1
-// CHECK: [0xc1,0x72,0x00,0x7e]
+v_lshlrev_b16_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cvt_f16_u16 v0, 0.5
-// CHECK: [0xf0,0x72,0x00,0x7e]
+v_lshlrev_b16_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cvt_f16_u16 v0, -4.0
-// CHECK: [0xf7,0x72,0x00,0x7e]
+v_lshlrev_b16_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cvt_f16_u16 v0, 0xfe0b
-// CHECK: [0xff,0x72,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_lshlrev_b16_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0xda,0x00,0x00]
 
-v_cvt_f16_u16 v0, 0x3456
-// CHECK: [0xff,0x72,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_lshlrev_b16_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cvt_f16_u16 v0, v0
-// CHECK: [0x00,0x73,0x00,0x7e]
+v_lshlrev_b16_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0xde,0x00,0x00]
 
-v_cvt_f16_u16 v0, v255
-// CHECK: [0xff,0x73,0x00,0x7e]
+v_lshlrev_b16_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cvt_f16_u16_e64 v0, s0
-// CHECK: [0x00,0x00,0x79,0xd1,0x00,0x00,0x00,0x00]
+v_lshlrev_b16_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cvt_f16_u16_e64 v255, s0
-// CHECK: [0xff,0x00,0x79,0xd1,0x00,0x00,0x00,0x00]
+v_lshlrev_b16_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cvt_f16_u16_e64 v0, s101
-// CHECK: [0x00,0x00,0x79,0xd1,0x65,0x00,0x00,0x00]
+v_lshlrev_b16_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cvt_f16_u16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x79,0xd1,0x66,0x00,0x00,0x00]
+v_lshlrev_b16_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0x00,0x01,0x00]
 
-v_cvt_f16_u16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x79,0xd1,0x67,0x00,0x00,0x00]
+v_lshlrev_b16_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0x82,0x01,0x00]
 
-v_cvt_f16_u16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x79,0xd1,0x6a,0x00,0x00,0x00]
+v_lshlrev_b16_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cvt_f16_u16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x79,0xd1,0x6b,0x00,0x00,0x00]
+v_lshlrev_b16_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0xee,0x01,0x00]
 
-v_cvt_f16_u16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x79,0xd1,0x6c,0x00,0x00,0x00]
+v_lshlrev_b16_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0x04,0x02,0x00]
 
-v_cvt_f16_u16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x79,0xd1,0x6d,0x00,0x00,0x00]
+v_lshlrev_b16_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x2a,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cvt_f16_u16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x79,0xd1,0x6e,0x00,0x00,0x00]
+v_lshrrev_b16 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x56]
 
-v_cvt_f16_u16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x79,0xd1,0x6f,0x00,0x00,0x00]
+v_lshrrev_b16 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x57]
 
-v_cvt_f16_u16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x79,0xd1,0x7b,0x00,0x00,0x00]
+v_lshrrev_b16 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x56]
 
-v_cvt_f16_u16_e64 v0, m0
-// CHECK: [0x00,0x00,0x79,0xd1,0x7c,0x00,0x00,0x00]
+v_lshrrev_b16 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x56]
 
-v_cvt_f16_u16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x79,0xd1,0x7e,0x00,0x00,0x00]
+v_lshrrev_b16 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x56]
 
-v_cvt_f16_u16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x79,0xd1,0x7f,0x00,0x00,0x00]
+v_lshrrev_b16 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x56]
 
-v_cvt_f16_u16_e64 v0, 0
-// CHECK: [0x00,0x00,0x79,0xd1,0x80,0x00,0x00,0x00]
+v_lshrrev_b16 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x56]
 
-v_cvt_f16_u16_e64 v0, -1
-// CHECK: [0x00,0x00,0x79,0xd1,0xc1,0x00,0x00,0x00]
+v_lshrrev_b16 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x56]
 
-v_cvt_f16_u16_e64 v0, v0
-// CHECK: [0x00,0x00,0x79,0xd1,0x00,0x01,0x00,0x00]
+v_lshrrev_b16 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x56]
 
-v_cvt_f16_u16_e64 v0, v255
-// CHECK: [0x00,0x00,0x79,0xd1,0xff,0x01,0x00,0x00]
+v_lshrrev_b16 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x56]
 
-v_cvt_f16_i16 v0, s0
-// CHECK: [0x00,0x74,0x00,0x7e]
+v_lshrrev_b16 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x56]
 
-v_cvt_f16_i16 v255, s0
-// CHECK: [0x00,0x74,0xfe,0x7f]
+v_lshrrev_b16 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x56]
 
-v_cvt_f16_i16 v0, s101
-// CHECK: [0x65,0x74,0x00,0x7e]
+v_lshrrev_b16 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x56]
 
-v_cvt_f16_i16 v0, flat_scratch_lo
-// CHECK: [0x66,0x74,0x00,0x7e]
+v_lshrrev_b16 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x56]
 
-v_cvt_f16_i16 v0, flat_scratch_hi
-// CHECK: [0x67,0x74,0x00,0x7e]
+v_lshrrev_b16 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x56]
 
-v_cvt_f16_i16 v0, vcc_lo
-// CHECK: [0x6a,0x74,0x00,0x7e]
+v_lshrrev_b16 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x56]
 
-v_cvt_f16_i16 v0, vcc_hi
-// CHECK: [0x6b,0x74,0x00,0x7e]
+v_lshrrev_b16 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x56]
 
-v_cvt_f16_i16 v0, tba_lo
-// CHECK: [0x6c,0x74,0x00,0x7e]
+v_lshrrev_b16 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x56]
 
-v_cvt_f16_i16 v0, tba_hi
-// CHECK: [0x6d,0x74,0x00,0x7e]
+v_lshrrev_b16 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x56]
 
-v_cvt_f16_i16 v0, tma_lo
-// CHECK: [0x6e,0x74,0x00,0x7e]
+v_lshrrev_b16 v5, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x0a,0x56,0x0b,0xfe,0x00,0x00]
 
-v_cvt_f16_i16 v0, tma_hi
-// CHECK: [0x6f,0x74,0x00,0x7e]
+v_lshrrev_b16 v5, 0x3456, v2
+// CHECK: [0xff,0x04,0x0a,0x56,0x56,0x34,0x00,0x00]
 
-v_cvt_f16_i16 v0, ttmp11
-// CHECK: [0x7b,0x74,0x00,0x7e]
+v_lshrrev_b16 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x56]
 
-v_cvt_f16_i16 v0, m0
-// CHECK: [0x7c,0x74,0x00,0x7e]
+v_lshrrev_b16 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x56]
 
-v_cvt_f16_i16 v0, exec_lo
-// CHECK: [0x7e,0x74,0x00,0x7e]
+v_lshrrev_b16 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x56]
 
-v_cvt_f16_i16 v0, exec_hi
-// CHECK: [0x7f,0x74,0x00,0x7e]
+v_lshrrev_b16_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0x04,0x00,0x00]
 
-v_cvt_f16_i16 v0, 0
-// CHECK: [0x80,0x74,0x00,0x7e]
+v_lshrrev_b16_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x2b,0xd1,0x80,0x04,0x00,0x00]
 
-v_cvt_f16_i16 v0, -1
-// CHECK: [0xc1,0x74,0x00,0x7e]
+v_lshrrev_b16_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x2b,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cvt_f16_i16 v0, 0.5
-// CHECK: [0xf0,0x74,0x00,0x7e]
+v_lshrrev_b16_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x2b,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cvt_f16_i16 v0, -4.0
-// CHECK: [0xf7,0x74,0x00,0x7e]
+v_lshrrev_b16_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x2b,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cvt_f16_i16 v0, 0xfe0b
-// CHECK: [0xff,0x74,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_lshrrev_b16_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x2b,0xd1,0x01,0x05,0x00,0x00]
 
-v_cvt_f16_i16 v0, 0x3456
-// CHECK: [0xff,0x74,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_lshrrev_b16_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x2b,0xd1,0xff,0x05,0x00,0x00]
 
-v_cvt_f16_i16 v0, v0
-// CHECK: [0x00,0x75,0x00,0x7e]
+v_lshrrev_b16_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0xca,0x00,0x00]
 
-v_cvt_f16_i16 v0, v255
-// CHECK: [0xff,0x75,0x00,0x7e]
+v_lshrrev_b16_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0xcc,0x00,0x00]
 
-v_cvt_f16_i16_e64 v0, s0
-// CHECK: [0x00,0x00,0x7a,0xd1,0x00,0x00,0x00,0x00]
+v_lshrrev_b16_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0xce,0x00,0x00]
 
-v_cvt_f16_i16_e64 v255, s0
-// CHECK: [0xff,0x00,0x7a,0xd1,0x00,0x00,0x00,0x00]
+v_lshrrev_b16_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cvt_f16_i16_e64 v0, s101
-// CHECK: [0x00,0x00,0x7a,0xd1,0x65,0x00,0x00,0x00]
+v_lshrrev_b16_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cvt_f16_i16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x7a,0xd1,0x66,0x00,0x00,0x00]
+v_lshrrev_b16_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cvt_f16_i16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x7a,0xd1,0x67,0x00,0x00,0x00]
+v_lshrrev_b16_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0xda,0x00,0x00]
 
-v_cvt_f16_i16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x7a,0xd1,0x6a,0x00,0x00,0x00]
+v_lshrrev_b16_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cvt_f16_i16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x7a,0xd1,0x6b,0x00,0x00,0x00]
+v_lshrrev_b16_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0xde,0x00,0x00]
 
-v_cvt_f16_i16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x7a,0xd1,0x6c,0x00,0x00,0x00]
+v_lshrrev_b16_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cvt_f16_i16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x7a,0xd1,0x6d,0x00,0x00,0x00]
+v_lshrrev_b16_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cvt_f16_i16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x7a,0xd1,0x6e,0x00,0x00,0x00]
+v_lshrrev_b16_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cvt_f16_i16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x7a,0xd1,0x6f,0x00,0x00,0x00]
+v_lshrrev_b16_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cvt_f16_i16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x7a,0xd1,0x7b,0x00,0x00,0x00]
+v_lshrrev_b16_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0x00,0x01,0x00]
 
-v_cvt_f16_i16_e64 v0, m0
-// CHECK: [0x00,0x00,0x7a,0xd1,0x7c,0x00,0x00,0x00]
+v_lshrrev_b16_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0x82,0x01,0x00]
 
-v_cvt_f16_i16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x7a,0xd1,0x7e,0x00,0x00,0x00]
+v_lshrrev_b16_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cvt_f16_i16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x7a,0xd1,0x7f,0x00,0x00,0x00]
+v_lshrrev_b16_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0xee,0x01,0x00]
 
-v_cvt_f16_i16_e64 v0, 0
-// CHECK: [0x00,0x00,0x7a,0xd1,0x80,0x00,0x00,0x00]
+v_lshrrev_b16_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0x04,0x02,0x00]
 
-v_cvt_f16_i16_e64 v0, -1
-// CHECK: [0x00,0x00,0x7a,0xd1,0xc1,0x00,0x00,0x00]
+v_lshrrev_b16_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x2b,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cvt_f16_i16_e64 v0, v0
-// CHECK: [0x00,0x00,0x7a,0xd1,0x00,0x01,0x00,0x00]
+v_ashrrev_i16 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x58]
 
-v_cvt_f16_i16_e64 v0, v255
-// CHECK: [0x00,0x00,0x7a,0xd1,0xff,0x01,0x00,0x00]
+v_ashrrev_i16 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x59]
 
-v_cvt_u16_f16 v0, s0
-// CHECK: [0x00,0x76,0x00,0x7e]
+v_ashrrev_i16 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x58]
 
-v_cvt_u16_f16 v255, s0
-// CHECK: [0x00,0x76,0xfe,0x7f]
+v_ashrrev_i16 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x58]
 
-v_cvt_u16_f16 v0, s101
-// CHECK: [0x65,0x76,0x00,0x7e]
+v_ashrrev_i16 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x58]
 
-v_cvt_u16_f16 v0, flat_scratch_lo
-// CHECK: [0x66,0x76,0x00,0x7e]
+v_ashrrev_i16 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x58]
 
-v_cvt_u16_f16 v0, flat_scratch_hi
-// CHECK: [0x67,0x76,0x00,0x7e]
+v_ashrrev_i16 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x58]
 
-v_cvt_u16_f16 v0, vcc_lo
-// CHECK: [0x6a,0x76,0x00,0x7e]
+v_ashrrev_i16 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x58]
 
-v_cvt_u16_f16 v0, vcc_hi
-// CHECK: [0x6b,0x76,0x00,0x7e]
+v_ashrrev_i16 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x58]
 
-v_cvt_u16_f16 v0, tba_lo
-// CHECK: [0x6c,0x76,0x00,0x7e]
+v_ashrrev_i16 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x58]
 
-v_cvt_u16_f16 v0, tba_hi
-// CHECK: [0x6d,0x76,0x00,0x7e]
+v_ashrrev_i16 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x58]
 
-v_cvt_u16_f16 v0, tma_lo
-// CHECK: [0x6e,0x76,0x00,0x7e]
+v_ashrrev_i16 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x58]
 
-v_cvt_u16_f16 v0, tma_hi
-// CHECK: [0x6f,0x76,0x00,0x7e]
+v_ashrrev_i16 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x58]
 
-v_cvt_u16_f16 v0, ttmp11
-// CHECK: [0x7b,0x76,0x00,0x7e]
+v_ashrrev_i16 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x58]
 
-v_cvt_u16_f16 v0, m0
-// CHECK: [0x7c,0x76,0x00,0x7e]
+v_ashrrev_i16 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x58]
 
-v_cvt_u16_f16 v0, exec_lo
-// CHECK: [0x7e,0x76,0x00,0x7e]
+v_ashrrev_i16 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x58]
 
-v_cvt_u16_f16 v0, exec_hi
-// CHECK: [0x7f,0x76,0x00,0x7e]
+v_ashrrev_i16 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x58]
 
-v_cvt_u16_f16 v0, 0
-// CHECK: [0x80,0x76,0x00,0x7e]
+v_ashrrev_i16 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x58]
 
-v_cvt_u16_f16 v0, -1
-// CHECK: [0xc1,0x76,0x00,0x7e]
+v_ashrrev_i16 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x58]
 
-v_cvt_u16_f16 v0, 0.5
-// CHECK: [0xf0,0x76,0x00,0x7e]
+v_ashrrev_i16 v5, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x0a,0x58,0x0b,0xfe,0x00,0x00]
 
-v_cvt_u16_f16 v0, -4.0
-// CHECK: [0xf7,0x76,0x00,0x7e]
+v_ashrrev_i16 v5, 0x3456, v2
+// CHECK: [0xff,0x04,0x0a,0x58,0x56,0x34,0x00,0x00]
 
-v_cvt_u16_f16 v0, 0xfe0b
-// CHECK: [0xff,0x76,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_ashrrev_i16 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x58]
 
-v_cvt_u16_f16 v0, 0x3456
-// CHECK: [0xff,0x76,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_ashrrev_i16 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x58]
 
-v_cvt_u16_f16 v0, v0
-// CHECK: [0x00,0x77,0x00,0x7e]
+v_ashrrev_i16 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x58]
 
-v_cvt_u16_f16 v0, v255
-// CHECK: [0xff,0x77,0x00,0x7e]
+v_ashrrev_i16_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cvt_u16_f16_e64 v0, s0
-// CHECK: [0x00,0x00,0x7b,0xd1,0x00,0x00,0x00,0x00]
+v_ashrrev_i16_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x2c,0xd1,0x80,0x04,0x00,0x00]
 
-v_cvt_u16_f16_e64 v255, s0
-// CHECK: [0xff,0x00,0x7b,0xd1,0x00,0x00,0x00,0x00]
+v_ashrrev_i16_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x2c,0xd1,0xc1,0x04,0x00,0x00]
 
-v_cvt_u16_f16_e64 v0, s101
-// CHECK: [0x00,0x00,0x7b,0xd1,0x65,0x00,0x00,0x00]
+v_ashrrev_i16_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x2c,0xd1,0xf0,0x04,0x00,0x00]
 
-v_cvt_u16_f16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x7b,0xd1,0x66,0x00,0x00,0x00]
+v_ashrrev_i16_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x2c,0xd1,0xf7,0x04,0x00,0x00]
 
-v_cvt_u16_f16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x7b,0xd1,0x67,0x00,0x00,0x00]
+v_ashrrev_i16_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x2c,0xd1,0x01,0x05,0x00,0x00]
 
-v_cvt_u16_f16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x7b,0xd1,0x6a,0x00,0x00,0x00]
+v_ashrrev_i16_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x2c,0xd1,0xff,0x05,0x00,0x00]
 
-v_cvt_u16_f16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x7b,0xd1,0x6b,0x00,0x00,0x00]
+v_ashrrev_i16_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0xca,0x00,0x00]
 
-v_cvt_u16_f16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x7b,0xd1,0x6c,0x00,0x00,0x00]
+v_ashrrev_i16_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0xcc,0x00,0x00]
 
-v_cvt_u16_f16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x7b,0xd1,0x6d,0x00,0x00,0x00]
+v_ashrrev_i16_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0xce,0x00,0x00]
 
-v_cvt_u16_f16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x7b,0xd1,0x6e,0x00,0x00,0x00]
+v_ashrrev_i16_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0xd4,0x00,0x00]
 
-v_cvt_u16_f16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x7b,0xd1,0x6f,0x00,0x00,0x00]
+v_ashrrev_i16_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0xd6,0x00,0x00]
 
-v_cvt_u16_f16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x7b,0xd1,0x7b,0x00,0x00,0x00]
+v_ashrrev_i16_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0xd8,0x00,0x00]
 
-v_cvt_u16_f16_e64 v0, m0
-// CHECK: [0x00,0x00,0x7b,0xd1,0x7c,0x00,0x00,0x00]
+v_ashrrev_i16_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0xda,0x00,0x00]
 
-v_cvt_u16_f16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x7b,0xd1,0x7e,0x00,0x00,0x00]
+v_ashrrev_i16_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0xdc,0x00,0x00]
 
-v_cvt_u16_f16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x7b,0xd1,0x7f,0x00,0x00,0x00]
+v_ashrrev_i16_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0xde,0x00,0x00]
 
-v_cvt_u16_f16_e64 v0, scc
-// CHECK: [0x00,0x00,0x7b,0xd1,0xfd,0x00,0x00,0x00]
+v_ashrrev_i16_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0xf6,0x00,0x00]
 
-v_cvt_u16_f16_e64 v0, v0
-// CHECK: [0x00,0x00,0x7b,0xd1,0x00,0x01,0x00,0x00]
+v_ashrrev_i16_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0xf8,0x00,0x00]
 
-v_cvt_u16_f16_e64 v0, v255
-// CHECK: [0x00,0x00,0x7b,0xd1,0xff,0x01,0x00,0x00]
+v_ashrrev_i16_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0xfc,0x00,0x00]
 
-v_cvt_u16_f16_e64 v0, -s0
-// CHECK: [0x00,0x00,0x7b,0xd1,0x00,0x00,0x00,0x20]
+v_ashrrev_i16_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0xfe,0x00,0x00]
 
-v_cvt_u16_f16_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x7b,0xd1,0x00,0x00,0x00,0x00]
+v_ashrrev_i16_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0x00,0x01,0x00]
 
-v_cvt_u16_f16_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x7b,0xd1,0x00,0x00,0x00,0x00]
+v_ashrrev_i16_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0x82,0x01,0x00]
 
-v_cvt_i16_f16 v0, s0
-// CHECK: [0x00,0x78,0x00,0x7e]
+v_ashrrev_i16_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0xe0,0x01,0x00]
 
-v_cvt_i16_f16 v255, s0
-// CHECK: [0x00,0x78,0xfe,0x7f]
+v_ashrrev_i16_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0xee,0x01,0x00]
 
-v_cvt_i16_f16 v0, s101
-// CHECK: [0x65,0x78,0x00,0x7e]
+v_ashrrev_i16_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0x04,0x02,0x00]
 
-v_cvt_i16_f16 v0, flat_scratch_lo
-// CHECK: [0x66,0x78,0x00,0x7e]
+v_ashrrev_i16_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x2c,0xd1,0x80,0xfe,0x03,0x00]
 
-v_cvt_i16_f16 v0, flat_scratch_hi
-// CHECK: [0x67,0x78,0x00,0x7e]
+v_max_f16 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x5a]
 
-v_cvt_i16_f16 v0, vcc_lo
-// CHECK: [0x6a,0x78,0x00,0x7e]
+v_max_f16 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x5b]
 
-v_cvt_i16_f16 v0, vcc_hi
-// CHECK: [0x6b,0x78,0x00,0x7e]
+v_max_f16 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x5a]
 
-v_cvt_i16_f16 v0, tba_lo
-// CHECK: [0x6c,0x78,0x00,0x7e]
+v_max_f16 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x5a]
 
-v_cvt_i16_f16 v0, tba_hi
-// CHECK: [0x6d,0x78,0x00,0x7e]
+v_max_f16 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x5a]
 
-v_cvt_i16_f16 v0, tma_lo
-// CHECK: [0x6e,0x78,0x00,0x7e]
+v_max_f16 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x5a]
 
-v_cvt_i16_f16 v0, tma_hi
-// CHECK: [0x6f,0x78,0x00,0x7e]
+v_max_f16 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x5a]
 
-v_cvt_i16_f16 v0, ttmp11
-// CHECK: [0x7b,0x78,0x00,0x7e]
+v_max_f16 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x5a]
 
-v_cvt_i16_f16 v0, m0
-// CHECK: [0x7c,0x78,0x00,0x7e]
+v_max_f16 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x5a]
 
-v_cvt_i16_f16 v0, exec_lo
-// CHECK: [0x7e,0x78,0x00,0x7e]
+v_max_f16 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x5a]
 
-v_cvt_i16_f16 v0, exec_hi
-// CHECK: [0x7f,0x78,0x00,0x7e]
+v_max_f16 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x5a]
 
-v_cvt_i16_f16 v0, 0
-// CHECK: [0x80,0x78,0x00,0x7e]
+v_max_f16 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x5a]
 
-v_cvt_i16_f16 v0, -1
-// CHECK: [0xc1,0x78,0x00,0x7e]
+v_max_f16 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x5a]
 
-v_cvt_i16_f16 v0, 0.5
-// CHECK: [0xf0,0x78,0x00,0x7e]
+v_max_f16 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x5a]
 
-v_cvt_i16_f16 v0, -4.0
-// CHECK: [0xf7,0x78,0x00,0x7e]
+v_max_f16 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x5a]
 
-v_cvt_i16_f16 v0, 0xfe0b
-// CHECK: [0xff,0x78,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_max_f16 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x5a]
 
-v_cvt_i16_f16 v0, 0x3456
-// CHECK: [0xff,0x78,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_max_f16 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x5a]
 
-v_cvt_i16_f16 v0, v0
-// CHECK: [0x00,0x79,0x00,0x7e]
+v_max_f16 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x5a]
 
-v_cvt_i16_f16 v0, v255
-// CHECK: [0xff,0x79,0x00,0x7e]
+v_max_f16 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x5a]
 
-v_cvt_i16_f16_e64 v0, s0
-// CHECK: [0x00,0x00,0x7c,0xd1,0x00,0x00,0x00,0x00]
+v_max_f16 v5, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x0a,0x5a,0x0b,0xfe,0x00,0x00]
 
-v_cvt_i16_f16_e64 v255, s0
-// CHECK: [0xff,0x00,0x7c,0xd1,0x00,0x00,0x00,0x00]
+v_max_f16 v5, 0x3456, v2
+// CHECK: [0xff,0x04,0x0a,0x5a,0x56,0x34,0x00,0x00]
 
-v_cvt_i16_f16_e64 v0, s101
-// CHECK: [0x00,0x00,0x7c,0xd1,0x65,0x00,0x00,0x00]
+v_max_f16 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x5a]
 
-v_cvt_i16_f16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x7c,0xd1,0x66,0x00,0x00,0x00]
+v_max_f16 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x5a]
 
-v_cvt_i16_f16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x7c,0xd1,0x67,0x00,0x00,0x00]
+v_max_f16 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x5a]
 
-v_cvt_i16_f16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x7c,0xd1,0x6a,0x00,0x00,0x00]
+v_max_f16_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0x05,0x00,0x00]
 
-v_cvt_i16_f16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x7c,0xd1,0x6b,0x00,0x00,0x00]
+v_max_f16_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x2d,0xd1,0x01,0x05,0x00,0x00]
 
-v_cvt_i16_f16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x7c,0xd1,0x6c,0x00,0x00,0x00]
+v_max_f16_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x2d,0xd1,0xff,0x05,0x00,0x00]
 
-v_cvt_i16_f16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x7c,0xd1,0x6d,0x00,0x00,0x00]
+v_max_f16_e64 v5, v1, s101
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0xcb,0x00,0x00]
 
-v_cvt_i16_f16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x7c,0xd1,0x6e,0x00,0x00,0x00]
+v_max_f16_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0xcd,0x00,0x00]
 
-v_cvt_i16_f16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x7c,0xd1,0x6f,0x00,0x00,0x00]
+v_max_f16_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0xcf,0x00,0x00]
 
-v_cvt_i16_f16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x7c,0xd1,0x7b,0x00,0x00,0x00]
+v_max_f16_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0xd5,0x00,0x00]
 
-v_cvt_i16_f16_e64 v0, m0
-// CHECK: [0x00,0x00,0x7c,0xd1,0x7c,0x00,0x00,0x00]
+v_max_f16_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0xd7,0x00,0x00]
 
-v_cvt_i16_f16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x7c,0xd1,0x7e,0x00,0x00,0x00]
+v_max_f16_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0xd9,0x00,0x00]
 
-v_cvt_i16_f16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x7c,0xd1,0x7f,0x00,0x00,0x00]
+v_max_f16_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0xdb,0x00,0x00]
 
-v_cvt_i16_f16_e64 v0, scc
-// CHECK: [0x00,0x00,0x7c,0xd1,0xfd,0x00,0x00,0x00]
+v_max_f16_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0xdd,0x00,0x00]
 
-v_cvt_i16_f16_e64 v0, v0
-// CHECK: [0x00,0x00,0x7c,0xd1,0x00,0x01,0x00,0x00]
+v_max_f16_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0xdf,0x00,0x00]
 
-v_cvt_i16_f16_e64 v0, v255
-// CHECK: [0x00,0x00,0x7c,0xd1,0xff,0x01,0x00,0x00]
+v_max_f16_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0xf7,0x00,0x00]
 
-v_cvt_i16_f16_e64 v0, -s0
-// CHECK: [0x00,0x00,0x7c,0xd1,0x00,0x00,0x00,0x20]
+v_max_f16_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0xf9,0x00,0x00]
 
-v_cvt_i16_f16_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x7c,0xd1,0x00,0x00,0x00,0x00]
+v_max_f16_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0xfd,0x00,0x00]
 
-v_cvt_i16_f16_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x7c,0xd1,0x00,0x00,0x00,0x00]
+v_max_f16_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0xff,0x00,0x00]
 
-v_rcp_f16 v0, s0
-// CHECK: [0x00,0x7a,0x00,0x7e]
+v_max_f16_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0xfb,0x01,0x00]
 
-v_rcp_f16 v255, s0
-// CHECK: [0x00,0x7a,0xfe,0x7f]
+v_max_f16_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0x05,0x02,0x00]
 
-v_rcp_f16 v0, s101
-// CHECK: [0x65,0x7a,0x00,0x7e]
+v_max_f16_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0xff,0x03,0x00]
 
-v_rcp_f16 v0, flat_scratch_lo
-// CHECK: [0x66,0x7a,0x00,0x7e]
+v_max_f16_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0x05,0x00,0x20]
 
-v_rcp_f16 v0, flat_scratch_hi
-// CHECK: [0x67,0x7a,0x00,0x7e]
+v_max_f16_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0x05,0x00,0x40]
 
-v_rcp_f16 v0, vcc_lo
-// CHECK: [0x6a,0x7a,0x00,0x7e]
+v_max_f16_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x2d,0xd1,0x01,0x05,0x00,0x60]
 
-v_rcp_f16 v0, vcc_hi
-// CHECK: [0x6b,0x7a,0x00,0x7e]
+v_max_f16_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x2d,0xd1,0x01,0x05,0x00,0x00]
 
-v_rcp_f16 v0, tba_lo
-// CHECK: [0x6c,0x7a,0x00,0x7e]
+v_max_f16_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x2d,0xd1,0x01,0x05,0x00,0x00]
 
-v_rcp_f16 v0, tba_hi
-// CHECK: [0x6d,0x7a,0x00,0x7e]
+v_max_f16_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x2d,0xd1,0x01,0x05,0x00,0x00]
 
-v_rcp_f16 v0, tma_lo
-// CHECK: [0x6e,0x7a,0x00,0x7e]
+v_max_f16_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x2d,0xd1,0x01,0x05,0x00,0x00]
 
-v_rcp_f16 v0, tma_hi
-// CHECK: [0x6f,0x7a,0x00,0x7e]
+v_min_f16 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x5c]
 
-v_rcp_f16 v0, ttmp11
-// CHECK: [0x7b,0x7a,0x00,0x7e]
+v_min_f16 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x5d]
 
-v_rcp_f16 v0, m0
-// CHECK: [0x7c,0x7a,0x00,0x7e]
+v_min_f16 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x5c]
 
-v_rcp_f16 v0, exec_lo
-// CHECK: [0x7e,0x7a,0x00,0x7e]
+v_min_f16 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x5c]
 
-v_rcp_f16 v0, exec_hi
-// CHECK: [0x7f,0x7a,0x00,0x7e]
+v_min_f16 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x5c]
 
-v_rcp_f16 v0, 0
-// CHECK: [0x80,0x7a,0x00,0x7e]
+v_min_f16 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x5c]
 
-v_rcp_f16 v0, -1
-// CHECK: [0xc1,0x7a,0x00,0x7e]
+v_min_f16 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x5c]
 
-v_rcp_f16 v0, 0.5
-// CHECK: [0xf0,0x7a,0x00,0x7e]
+v_min_f16 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x5c]
 
-v_rcp_f16 v0, -4.0
-// CHECK: [0xf7,0x7a,0x00,0x7e]
+v_min_f16 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x5c]
 
-v_rcp_f16 v0, 0xfe0b
-// CHECK: [0xff,0x7a,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_min_f16 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x5c]
 
-v_rcp_f16 v0, 0x3456
-// CHECK: [0xff,0x7a,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_min_f16 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x5c]
 
-v_rcp_f16 v0, v0
-// CHECK: [0x00,0x7b,0x00,0x7e]
+v_min_f16 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x5c]
 
-v_rcp_f16 v0, v255
-// CHECK: [0xff,0x7b,0x00,0x7e]
+v_min_f16 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x5c]
 
-v_rcp_f16_e64 v0, s0
-// CHECK: [0x00,0x00,0x7d,0xd1,0x00,0x00,0x00,0x00]
+v_min_f16 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x5c]
 
-v_rcp_f16_e64 v255, s0
-// CHECK: [0xff,0x00,0x7d,0xd1,0x00,0x00,0x00,0x00]
+v_min_f16 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x5c]
 
-v_rcp_f16_e64 v0, s101
-// CHECK: [0x00,0x00,0x7d,0xd1,0x65,0x00,0x00,0x00]
+v_min_f16 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x5c]
 
-v_rcp_f16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x7d,0xd1,0x66,0x00,0x00,0x00]
+v_min_f16 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x5c]
 
-v_rcp_f16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x7d,0xd1,0x67,0x00,0x00,0x00]
+v_min_f16 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x5c]
 
-v_rcp_f16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x7d,0xd1,0x6a,0x00,0x00,0x00]
+v_min_f16 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x5c]
 
-v_rcp_f16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x7d,0xd1,0x6b,0x00,0x00,0x00]
+v_min_f16 v5, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x0a,0x5c,0x0b,0xfe,0x00,0x00]
 
-v_rcp_f16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x7d,0xd1,0x6c,0x00,0x00,0x00]
+v_min_f16 v5, 0x3456, v2
+// CHECK: [0xff,0x04,0x0a,0x5c,0x56,0x34,0x00,0x00]
 
-v_rcp_f16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x7d,0xd1,0x6d,0x00,0x00,0x00]
+v_min_f16 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x5c]
 
-v_rcp_f16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x7d,0xd1,0x6e,0x00,0x00,0x00]
+v_min_f16 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x5c]
 
-v_rcp_f16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x7d,0xd1,0x6f,0x00,0x00,0x00]
+v_min_f16 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x5c]
 
-v_rcp_f16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x7d,0xd1,0x7b,0x00,0x00,0x00]
+v_min_f16_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0x05,0x00,0x00]
 
-v_rcp_f16_e64 v0, m0
-// CHECK: [0x00,0x00,0x7d,0xd1,0x7c,0x00,0x00,0x00]
+v_min_f16_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x2e,0xd1,0x01,0x05,0x00,0x00]
 
-v_rcp_f16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x7d,0xd1,0x7e,0x00,0x00,0x00]
+v_min_f16_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x2e,0xd1,0xff,0x05,0x00,0x00]
 
-v_rcp_f16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x7d,0xd1,0x7f,0x00,0x00,0x00]
+v_min_f16_e64 v5, v1, s101
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0xcb,0x00,0x00]
 
-v_rcp_f16_e64 v0, scc
-// CHECK: [0x00,0x00,0x7d,0xd1,0xfd,0x00,0x00,0x00]
+v_min_f16_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0xcd,0x00,0x00]
 
-v_rcp_f16_e64 v0, v0
-// CHECK: [0x00,0x00,0x7d,0xd1,0x00,0x01,0x00,0x00]
+v_min_f16_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0xcf,0x00,0x00]
 
-v_rcp_f16_e64 v0, v255
-// CHECK: [0x00,0x00,0x7d,0xd1,0xff,0x01,0x00,0x00]
+v_min_f16_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0xd5,0x00,0x00]
 
-v_rcp_f16_e64 v0, -s0
-// CHECK: [0x00,0x00,0x7d,0xd1,0x00,0x00,0x00,0x20]
+v_min_f16_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0xd7,0x00,0x00]
 
-v_rcp_f16_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x7d,0xd1,0x00,0x00,0x00,0x00]
+v_min_f16_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0xd9,0x00,0x00]
 
-v_rcp_f16_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x7d,0xd1,0x00,0x00,0x00,0x00]
+v_min_f16_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0xdb,0x00,0x00]
 
-v_sqrt_f16 v0, s0
-// CHECK: [0x00,0x7c,0x00,0x7e]
+v_min_f16_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0xdd,0x00,0x00]
 
-v_sqrt_f16 v255, s0
-// CHECK: [0x00,0x7c,0xfe,0x7f]
+v_min_f16_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0xdf,0x00,0x00]
 
-v_sqrt_f16 v0, s101
-// CHECK: [0x65,0x7c,0x00,0x7e]
+v_min_f16_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0xf7,0x00,0x00]
 
-v_sqrt_f16 v0, flat_scratch_lo
-// CHECK: [0x66,0x7c,0x00,0x7e]
+v_min_f16_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0xf9,0x00,0x00]
 
-v_sqrt_f16 v0, flat_scratch_hi
-// CHECK: [0x67,0x7c,0x00,0x7e]
+v_min_f16_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0xfd,0x00,0x00]
 
-v_sqrt_f16 v0, vcc_lo
-// CHECK: [0x6a,0x7c,0x00,0x7e]
+v_min_f16_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0xff,0x00,0x00]
 
-v_sqrt_f16 v0, vcc_hi
-// CHECK: [0x6b,0x7c,0x00,0x7e]
+v_min_f16_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0xfb,0x01,0x00]
 
-v_sqrt_f16 v0, tba_lo
-// CHECK: [0x6c,0x7c,0x00,0x7e]
+v_min_f16_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0x05,0x02,0x00]
 
-v_sqrt_f16 v0, tba_hi
-// CHECK: [0x6d,0x7c,0x00,0x7e]
+v_min_f16_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0xff,0x03,0x00]
 
-v_sqrt_f16 v0, tma_lo
-// CHECK: [0x6e,0x7c,0x00,0x7e]
+v_min_f16_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0x05,0x00,0x20]
 
-v_sqrt_f16 v0, tma_hi
-// CHECK: [0x6f,0x7c,0x00,0x7e]
+v_min_f16_e64 v5, v1, -s2
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0x05,0x00,0x40]
 
-v_sqrt_f16 v0, ttmp11
-// CHECK: [0x7b,0x7c,0x00,0x7e]
+v_min_f16_e64 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x2e,0xd1,0x01,0x05,0x00,0x60]
 
-v_sqrt_f16 v0, m0
-// CHECK: [0x7c,0x7c,0x00,0x7e]
+v_min_f16_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x2e,0xd1,0x01,0x05,0x00,0x00]
 
-v_sqrt_f16 v0, exec_lo
-// CHECK: [0x7e,0x7c,0x00,0x7e]
+v_min_f16_e64 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x2e,0xd1,0x01,0x05,0x00,0x00]
 
-v_sqrt_f16 v0, exec_hi
-// CHECK: [0x7f,0x7c,0x00,0x7e]
+v_min_f16_e64 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x2e,0xd1,0x01,0x05,0x00,0x00]
 
-v_sqrt_f16 v0, 0
-// CHECK: [0x80,0x7c,0x00,0x7e]
+v_min_f16_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x2e,0xd1,0x01,0x05,0x00,0x00]
 
-v_sqrt_f16 v0, -1
-// CHECK: [0xc1,0x7c,0x00,0x7e]
+v_max_u16 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x5e]
 
-v_sqrt_f16 v0, 0.5
-// CHECK: [0xf0,0x7c,0x00,0x7e]
+v_max_u16 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x5f]
 
-v_sqrt_f16 v0, -4.0
-// CHECK: [0xf7,0x7c,0x00,0x7e]
+v_max_u16 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x5e]
 
-v_sqrt_f16 v0, 0xfe0b
-// CHECK: [0xff,0x7c,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_max_u16 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x5e]
 
-v_sqrt_f16 v0, 0x3456
-// CHECK: [0xff,0x7c,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_max_u16 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x5e]
 
-v_sqrt_f16 v0, v0
-// CHECK: [0x00,0x7d,0x00,0x7e]
+v_max_u16 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x5e]
 
-v_sqrt_f16 v0, v255
-// CHECK: [0xff,0x7d,0x00,0x7e]
+v_max_u16 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x5e]
 
-v_sqrt_f16_e64 v0, s0
-// CHECK: [0x00,0x00,0x7e,0xd1,0x00,0x00,0x00,0x00]
+v_max_u16 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x5e]
 
-v_sqrt_f16_e64 v255, s0
-// CHECK: [0xff,0x00,0x7e,0xd1,0x00,0x00,0x00,0x00]
+v_max_u16 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x5e]
 
-v_sqrt_f16_e64 v0, s101
-// CHECK: [0x00,0x00,0x7e,0xd1,0x65,0x00,0x00,0x00]
+v_max_u16 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x5e]
 
-v_sqrt_f16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x7e,0xd1,0x66,0x00,0x00,0x00]
+v_max_u16 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x5e]
 
-v_sqrt_f16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x7e,0xd1,0x67,0x00,0x00,0x00]
+v_max_u16 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x5e]
 
-v_sqrt_f16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x7e,0xd1,0x6a,0x00,0x00,0x00]
+v_max_u16 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x5e]
 
-v_sqrt_f16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x7e,0xd1,0x6b,0x00,0x00,0x00]
+v_max_u16 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x5e]
 
-v_sqrt_f16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x7e,0xd1,0x6c,0x00,0x00,0x00]
+v_max_u16 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x5e]
 
-v_sqrt_f16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x7e,0xd1,0x6d,0x00,0x00,0x00]
+v_max_u16 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x5e]
 
-v_sqrt_f16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x7e,0xd1,0x6e,0x00,0x00,0x00]
+v_max_u16 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x5e]
 
-v_sqrt_f16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x7e,0xd1,0x6f,0x00,0x00,0x00]
+v_max_u16 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x5e]
 
-v_sqrt_f16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x7e,0xd1,0x7b,0x00,0x00,0x00]
+v_max_u16 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x5e]
 
-v_sqrt_f16_e64 v0, m0
-// CHECK: [0x00,0x00,0x7e,0xd1,0x7c,0x00,0x00,0x00]
+v_max_u16 v5, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x0a,0x5e,0x0b,0xfe,0x00,0x00]
 
-v_sqrt_f16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x7e,0xd1,0x7e,0x00,0x00,0x00]
+v_max_u16 v5, 0x3456, v2
+// CHECK: [0xff,0x04,0x0a,0x5e,0x56,0x34,0x00,0x00]
 
-v_sqrt_f16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x7e,0xd1,0x7f,0x00,0x00,0x00]
+v_max_u16 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x5e]
 
-v_sqrt_f16_e64 v0, scc
-// CHECK: [0x00,0x00,0x7e,0xd1,0xfd,0x00,0x00,0x00]
+v_max_u16 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x5e]
 
-v_sqrt_f16_e64 v0, v0
-// CHECK: [0x00,0x00,0x7e,0xd1,0x00,0x01,0x00,0x00]
+v_max_u16 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x5e]
 
-v_sqrt_f16_e64 v0, v255
-// CHECK: [0x00,0x00,0x7e,0xd1,0xff,0x01,0x00,0x00]
+v_max_u16_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0x04,0x00,0x00]
 
-v_sqrt_f16_e64 v0, -s0
-// CHECK: [0x00,0x00,0x7e,0xd1,0x00,0x00,0x00,0x20]
+v_max_u16_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x2f,0xd1,0x80,0x04,0x00,0x00]
 
-v_sqrt_f16_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x7e,0xd1,0x00,0x00,0x00,0x00]
+v_max_u16_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x2f,0xd1,0xc1,0x04,0x00,0x00]
 
-v_sqrt_f16_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x7e,0xd1,0x00,0x00,0x00,0x00]
+v_max_u16_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x2f,0xd1,0xf0,0x04,0x00,0x00]
 
-v_rsq_f16 v0, s0
-// CHECK: [0x00,0x7e,0x00,0x7e]
+v_max_u16_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x2f,0xd1,0xf7,0x04,0x00,0x00]
 
-v_rsq_f16 v255, s0
-// CHECK: [0x00,0x7e,0xfe,0x7f]
+v_max_u16_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x2f,0xd1,0x01,0x05,0x00,0x00]
 
-v_rsq_f16 v0, s101
-// CHECK: [0x65,0x7e,0x00,0x7e]
+v_max_u16_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x2f,0xd1,0xff,0x05,0x00,0x00]
 
-v_rsq_f16 v0, flat_scratch_lo
-// CHECK: [0x66,0x7e,0x00,0x7e]
+v_max_u16_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0xca,0x00,0x00]
 
-v_rsq_f16 v0, flat_scratch_hi
-// CHECK: [0x67,0x7e,0x00,0x7e]
+v_max_u16_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0xcc,0x00,0x00]
 
-v_rsq_f16 v0, vcc_lo
-// CHECK: [0x6a,0x7e,0x00,0x7e]
+v_max_u16_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0xce,0x00,0x00]
 
-v_rsq_f16 v0, vcc_hi
-// CHECK: [0x6b,0x7e,0x00,0x7e]
+v_max_u16_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0xd4,0x00,0x00]
 
-v_rsq_f16 v0, tba_lo
-// CHECK: [0x6c,0x7e,0x00,0x7e]
+v_max_u16_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0xd6,0x00,0x00]
 
-v_rsq_f16 v0, tba_hi
-// CHECK: [0x6d,0x7e,0x00,0x7e]
+v_max_u16_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0xd8,0x00,0x00]
 
-v_rsq_f16 v0, tma_lo
-// CHECK: [0x6e,0x7e,0x00,0x7e]
+v_max_u16_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0xda,0x00,0x00]
 
-v_rsq_f16 v0, tma_hi
-// CHECK: [0x6f,0x7e,0x00,0x7e]
+v_max_u16_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0xdc,0x00,0x00]
 
-v_rsq_f16 v0, ttmp11
-// CHECK: [0x7b,0x7e,0x00,0x7e]
+v_max_u16_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0xde,0x00,0x00]
 
-v_rsq_f16 v0, m0
-// CHECK: [0x7c,0x7e,0x00,0x7e]
+v_max_u16_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0xf6,0x00,0x00]
 
-v_rsq_f16 v0, exec_lo
-// CHECK: [0x7e,0x7e,0x00,0x7e]
+v_max_u16_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0xf8,0x00,0x00]
 
-v_rsq_f16 v0, exec_hi
-// CHECK: [0x7f,0x7e,0x00,0x7e]
+v_max_u16_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0xfc,0x00,0x00]
 
-v_rsq_f16 v0, 0
-// CHECK: [0x80,0x7e,0x00,0x7e]
+v_max_u16_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0xfe,0x00,0x00]
 
-v_rsq_f16 v0, -1
-// CHECK: [0xc1,0x7e,0x00,0x7e]
+v_max_u16_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0x00,0x01,0x00]
 
-v_rsq_f16 v0, 0.5
-// CHECK: [0xf0,0x7e,0x00,0x7e]
+v_max_u16_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0x82,0x01,0x00]
 
-v_rsq_f16 v0, -4.0
-// CHECK: [0xf7,0x7e,0x00,0x7e]
+v_max_u16_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0xe0,0x01,0x00]
 
-v_rsq_f16 v0, 0xfe0b
-// CHECK: [0xff,0x7e,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_max_u16_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0xee,0x01,0x00]
 
-v_rsq_f16 v0, 0x3456
-// CHECK: [0xff,0x7e,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_max_u16_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0x04,0x02,0x00]
 
-v_rsq_f16 v0, v0
-// CHECK: [0x00,0x7f,0x00,0x7e]
+v_max_u16_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x2f,0xd1,0x80,0xfe,0x03,0x00]
 
-v_rsq_f16 v0, v255
-// CHECK: [0xff,0x7f,0x00,0x7e]
+v_max_i16 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x60]
 
-v_rsq_f16_e64 v0, s0
-// CHECK: [0x00,0x00,0x7f,0xd1,0x00,0x00,0x00,0x00]
+v_max_i16 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x61]
 
-v_rsq_f16_e64 v255, s0
-// CHECK: [0xff,0x00,0x7f,0xd1,0x00,0x00,0x00,0x00]
+v_max_i16 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x60]
 
-v_rsq_f16_e64 v0, s101
-// CHECK: [0x00,0x00,0x7f,0xd1,0x65,0x00,0x00,0x00]
+v_max_i16 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x60]
 
-v_rsq_f16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x7f,0xd1,0x66,0x00,0x00,0x00]
+v_max_i16 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x60]
 
-v_rsq_f16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x7f,0xd1,0x67,0x00,0x00,0x00]
+v_max_i16 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x60]
 
-v_rsq_f16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x7f,0xd1,0x6a,0x00,0x00,0x00]
+v_max_i16 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x60]
 
-v_rsq_f16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x7f,0xd1,0x6b,0x00,0x00,0x00]
+v_max_i16 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x60]
 
-v_rsq_f16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x7f,0xd1,0x6c,0x00,0x00,0x00]
+v_max_i16 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x60]
 
-v_rsq_f16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x7f,0xd1,0x6d,0x00,0x00,0x00]
+v_max_i16 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x60]
 
-v_rsq_f16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x7f,0xd1,0x6e,0x00,0x00,0x00]
+v_max_i16 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x60]
 
-v_rsq_f16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x7f,0xd1,0x6f,0x00,0x00,0x00]
+v_max_i16 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x60]
 
-v_rsq_f16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x7f,0xd1,0x7b,0x00,0x00,0x00]
+v_max_i16 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x60]
 
-v_rsq_f16_e64 v0, m0
-// CHECK: [0x00,0x00,0x7f,0xd1,0x7c,0x00,0x00,0x00]
+v_max_i16 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x60]
 
-v_rsq_f16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x7f,0xd1,0x7e,0x00,0x00,0x00]
+v_max_i16 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x60]
 
-v_rsq_f16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x7f,0xd1,0x7f,0x00,0x00,0x00]
+v_max_i16 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x60]
 
-v_rsq_f16_e64 v0, scc
-// CHECK: [0x00,0x00,0x7f,0xd1,0xfd,0x00,0x00,0x00]
+v_max_i16 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x60]
 
-v_rsq_f16_e64 v0, v0
-// CHECK: [0x00,0x00,0x7f,0xd1,0x00,0x01,0x00,0x00]
+v_max_i16 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x60]
 
-v_rsq_f16_e64 v0, v255
-// CHECK: [0x00,0x00,0x7f,0xd1,0xff,0x01,0x00,0x00]
+v_max_i16 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x60]
 
-v_rsq_f16_e64 v0, -s0
-// CHECK: [0x00,0x00,0x7f,0xd1,0x00,0x00,0x00,0x20]
+v_max_i16 v5, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x0a,0x60,0x0b,0xfe,0x00,0x00]
 
-v_rsq_f16_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x7f,0xd1,0x00,0x00,0x00,0x00]
+v_max_i16 v5, 0x3456, v2
+// CHECK: [0xff,0x04,0x0a,0x60,0x56,0x34,0x00,0x00]
 
-v_rsq_f16_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x7f,0xd1,0x00,0x00,0x00,0x00]
+v_max_i16 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x60]
 
-v_log_f16 v0, s0
-// CHECK: [0x00,0x80,0x00,0x7e]
+v_max_i16 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x60]
 
-v_log_f16 v255, s0
-// CHECK: [0x00,0x80,0xfe,0x7f]
+v_max_i16 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x60]
 
-v_log_f16 v0, s101
-// CHECK: [0x65,0x80,0x00,0x7e]
+v_max_i16_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0x04,0x00,0x00]
 
-v_log_f16 v0, flat_scratch_lo
-// CHECK: [0x66,0x80,0x00,0x7e]
+v_max_i16_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x30,0xd1,0x80,0x04,0x00,0x00]
 
-v_log_f16 v0, flat_scratch_hi
-// CHECK: [0x67,0x80,0x00,0x7e]
+v_max_i16_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x30,0xd1,0xc1,0x04,0x00,0x00]
 
-v_log_f16 v0, vcc_lo
-// CHECK: [0x6a,0x80,0x00,0x7e]
+v_max_i16_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x30,0xd1,0xf0,0x04,0x00,0x00]
 
-v_log_f16 v0, vcc_hi
-// CHECK: [0x6b,0x80,0x00,0x7e]
+v_max_i16_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x30,0xd1,0xf7,0x04,0x00,0x00]
 
-v_log_f16 v0, tba_lo
-// CHECK: [0x6c,0x80,0x00,0x7e]
+v_max_i16_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x30,0xd1,0x01,0x05,0x00,0x00]
 
-v_log_f16 v0, tba_hi
-// CHECK: [0x6d,0x80,0x00,0x7e]
+v_max_i16_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x30,0xd1,0xff,0x05,0x00,0x00]
 
-v_log_f16 v0, tma_lo
-// CHECK: [0x6e,0x80,0x00,0x7e]
+v_max_i16_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0xca,0x00,0x00]
 
-v_log_f16 v0, tma_hi
-// CHECK: [0x6f,0x80,0x00,0x7e]
+v_max_i16_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0xcc,0x00,0x00]
 
-v_log_f16 v0, ttmp11
-// CHECK: [0x7b,0x80,0x00,0x7e]
+v_max_i16_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0xce,0x00,0x00]
 
-v_log_f16 v0, m0
-// CHECK: [0x7c,0x80,0x00,0x7e]
+v_max_i16_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0xd4,0x00,0x00]
 
-v_log_f16 v0, exec_lo
-// CHECK: [0x7e,0x80,0x00,0x7e]
+v_max_i16_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0xd6,0x00,0x00]
 
-v_log_f16 v0, exec_hi
-// CHECK: [0x7f,0x80,0x00,0x7e]
+v_max_i16_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0xd8,0x00,0x00]
 
-v_log_f16 v0, 0
-// CHECK: [0x80,0x80,0x00,0x7e]
+v_max_i16_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0xda,0x00,0x00]
 
-v_log_f16 v0, -1
-// CHECK: [0xc1,0x80,0x00,0x7e]
+v_max_i16_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0xdc,0x00,0x00]
 
-v_log_f16 v0, 0.5
-// CHECK: [0xf0,0x80,0x00,0x7e]
+v_max_i16_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0xde,0x00,0x00]
 
-v_log_f16 v0, -4.0
-// CHECK: [0xf7,0x80,0x00,0x7e]
+v_max_i16_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0xf6,0x00,0x00]
 
-v_log_f16 v0, 0xfe0b
-// CHECK: [0xff,0x80,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_max_i16_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0xf8,0x00,0x00]
 
-v_log_f16 v0, 0x3456
-// CHECK: [0xff,0x80,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_max_i16_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0xfc,0x00,0x00]
 
-v_log_f16 v0, v0
-// CHECK: [0x00,0x81,0x00,0x7e]
+v_max_i16_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0xfe,0x00,0x00]
 
-v_log_f16 v0, v255
-// CHECK: [0xff,0x81,0x00,0x7e]
+v_max_i16_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0x00,0x01,0x00]
 
-v_log_f16_e64 v0, s0
-// CHECK: [0x00,0x00,0x80,0xd1,0x00,0x00,0x00,0x00]
+v_max_i16_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0x82,0x01,0x00]
 
-v_log_f16_e64 v255, s0
-// CHECK: [0xff,0x00,0x80,0xd1,0x00,0x00,0x00,0x00]
+v_max_i16_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0xe0,0x01,0x00]
 
-v_log_f16_e64 v0, s101
-// CHECK: [0x00,0x00,0x80,0xd1,0x65,0x00,0x00,0x00]
+v_max_i16_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0xee,0x01,0x00]
 
-v_log_f16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x80,0xd1,0x66,0x00,0x00,0x00]
+v_max_i16_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0x04,0x02,0x00]
 
-v_log_f16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x80,0xd1,0x67,0x00,0x00,0x00]
+v_max_i16_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x30,0xd1,0x80,0xfe,0x03,0x00]
 
-v_log_f16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x80,0xd1,0x6a,0x00,0x00,0x00]
+v_min_u16 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x62]
 
-v_log_f16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x80,0xd1,0x6b,0x00,0x00,0x00]
+v_min_u16 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x63]
 
-v_log_f16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x80,0xd1,0x6c,0x00,0x00,0x00]
+v_min_u16 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x62]
 
-v_log_f16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x80,0xd1,0x6d,0x00,0x00,0x00]
+v_min_u16 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x62]
 
-v_log_f16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x80,0xd1,0x6e,0x00,0x00,0x00]
+v_min_u16 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x62]
 
-v_log_f16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x80,0xd1,0x6f,0x00,0x00,0x00]
+v_min_u16 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x62]
 
-v_log_f16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x80,0xd1,0x7b,0x00,0x00,0x00]
+v_min_u16 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x62]
 
-v_log_f16_e64 v0, m0
-// CHECK: [0x00,0x00,0x80,0xd1,0x7c,0x00,0x00,0x00]
+v_min_u16 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x62]
 
-v_log_f16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x80,0xd1,0x7e,0x00,0x00,0x00]
+v_min_u16 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x62]
 
-v_log_f16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x80,0xd1,0x7f,0x00,0x00,0x00]
+v_min_u16 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x62]
 
-v_log_f16_e64 v0, scc
-// CHECK: [0x00,0x00,0x80,0xd1,0xfd,0x00,0x00,0x00]
+v_min_u16 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x62]
 
-v_log_f16_e64 v0, v0
-// CHECK: [0x00,0x00,0x80,0xd1,0x00,0x01,0x00,0x00]
+v_min_u16 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x62]
 
-v_log_f16_e64 v0, v255
-// CHECK: [0x00,0x00,0x80,0xd1,0xff,0x01,0x00,0x00]
+v_min_u16 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x62]
 
-v_log_f16_e64 v0, -s0
-// CHECK: [0x00,0x00,0x80,0xd1,0x00,0x00,0x00,0x20]
+v_min_u16 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x62]
 
-v_log_f16_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x80,0xd1,0x00,0x00,0x00,0x00]
+v_min_u16 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x62]
 
-v_log_f16_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x80,0xd1,0x00,0x00,0x00,0x00]
+v_min_u16 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x62]
 
-v_exp_f16 v0, s0
-// CHECK: [0x00,0x82,0x00,0x7e]
+v_min_u16 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x62]
 
-v_exp_f16 v255, s0
-// CHECK: [0x00,0x82,0xfe,0x7f]
+v_min_u16 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x62]
 
-v_exp_f16 v0, s101
-// CHECK: [0x65,0x82,0x00,0x7e]
+v_min_u16 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x62]
 
-v_exp_f16 v0, flat_scratch_lo
-// CHECK: [0x66,0x82,0x00,0x7e]
+v_min_u16 v5, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x0a,0x62,0x0b,0xfe,0x00,0x00]
 
-v_exp_f16 v0, flat_scratch_hi
-// CHECK: [0x67,0x82,0x00,0x7e]
+v_min_u16 v5, 0x3456, v2
+// CHECK: [0xff,0x04,0x0a,0x62,0x56,0x34,0x00,0x00]
 
-v_exp_f16 v0, vcc_lo
-// CHECK: [0x6a,0x82,0x00,0x7e]
+v_min_u16 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x62]
 
-v_exp_f16 v0, vcc_hi
-// CHECK: [0x6b,0x82,0x00,0x7e]
+v_min_u16 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x62]
 
-v_exp_f16 v0, tba_lo
-// CHECK: [0x6c,0x82,0x00,0x7e]
+v_min_u16 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x62]
 
-v_exp_f16 v0, tba_hi
-// CHECK: [0x6d,0x82,0x00,0x7e]
+v_min_u16_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0x04,0x00,0x00]
 
-v_exp_f16 v0, tma_lo
-// CHECK: [0x6e,0x82,0x00,0x7e]
+v_min_u16_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x31,0xd1,0x80,0x04,0x00,0x00]
 
-v_exp_f16 v0, tma_hi
-// CHECK: [0x6f,0x82,0x00,0x7e]
+v_min_u16_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x31,0xd1,0xc1,0x04,0x00,0x00]
 
-v_exp_f16 v0, ttmp11
-// CHECK: [0x7b,0x82,0x00,0x7e]
+v_min_u16_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x31,0xd1,0xf0,0x04,0x00,0x00]
 
-v_exp_f16 v0, m0
-// CHECK: [0x7c,0x82,0x00,0x7e]
+v_min_u16_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x31,0xd1,0xf7,0x04,0x00,0x00]
 
-v_exp_f16 v0, exec_lo
-// CHECK: [0x7e,0x82,0x00,0x7e]
+v_min_u16_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x31,0xd1,0x01,0x05,0x00,0x00]
 
-v_exp_f16 v0, exec_hi
-// CHECK: [0x7f,0x82,0x00,0x7e]
+v_min_u16_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x31,0xd1,0xff,0x05,0x00,0x00]
 
-v_exp_f16 v0, 0
-// CHECK: [0x80,0x82,0x00,0x7e]
+v_min_u16_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0xca,0x00,0x00]
 
-v_exp_f16 v0, -1
-// CHECK: [0xc1,0x82,0x00,0x7e]
+v_min_u16_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0xcc,0x00,0x00]
 
-v_exp_f16 v0, 0.5
-// CHECK: [0xf0,0x82,0x00,0x7e]
+v_min_u16_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0xce,0x00,0x00]
 
-v_exp_f16 v0, -4.0
-// CHECK: [0xf7,0x82,0x00,0x7e]
+v_min_u16_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0xd4,0x00,0x00]
 
-v_exp_f16 v0, 0xfe0b
-// CHECK: [0xff,0x82,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_min_u16_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0xd6,0x00,0x00]
 
-v_exp_f16 v0, 0x3456
-// CHECK: [0xff,0x82,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_min_u16_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0xd8,0x00,0x00]
 
-v_exp_f16 v0, v0
-// CHECK: [0x00,0x83,0x00,0x7e]
+v_min_u16_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0xda,0x00,0x00]
 
-v_exp_f16 v0, v255
-// CHECK: [0xff,0x83,0x00,0x7e]
+v_min_u16_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0xdc,0x00,0x00]
 
-v_exp_f16_e64 v0, s0
-// CHECK: [0x00,0x00,0x81,0xd1,0x00,0x00,0x00,0x00]
+v_min_u16_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0xde,0x00,0x00]
 
-v_exp_f16_e64 v255, s0
-// CHECK: [0xff,0x00,0x81,0xd1,0x00,0x00,0x00,0x00]
+v_min_u16_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0xf6,0x00,0x00]
 
-v_exp_f16_e64 v0, s101
-// CHECK: [0x00,0x00,0x81,0xd1,0x65,0x00,0x00,0x00]
+v_min_u16_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0xf8,0x00,0x00]
 
-v_exp_f16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x81,0xd1,0x66,0x00,0x00,0x00]
+v_min_u16_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0xfc,0x00,0x00]
 
-v_exp_f16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x81,0xd1,0x67,0x00,0x00,0x00]
+v_min_u16_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0xfe,0x00,0x00]
 
-v_exp_f16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x81,0xd1,0x6a,0x00,0x00,0x00]
+v_min_u16_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0x00,0x01,0x00]
 
-v_exp_f16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x81,0xd1,0x6b,0x00,0x00,0x00]
+v_min_u16_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0x82,0x01,0x00]
 
-v_exp_f16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x81,0xd1,0x6c,0x00,0x00,0x00]
+v_min_u16_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0xe0,0x01,0x00]
 
-v_exp_f16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x81,0xd1,0x6d,0x00,0x00,0x00]
+v_min_u16_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0xee,0x01,0x00]
 
-v_exp_f16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x81,0xd1,0x6e,0x00,0x00,0x00]
+v_min_u16_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0x04,0x02,0x00]
 
-v_exp_f16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x81,0xd1,0x6f,0x00,0x00,0x00]
+v_min_u16_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x31,0xd1,0x80,0xfe,0x03,0x00]
 
-v_exp_f16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x81,0xd1,0x7b,0x00,0x00,0x00]
+v_min_i16 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x64]
 
-v_exp_f16_e64 v0, m0
-// CHECK: [0x00,0x00,0x81,0xd1,0x7c,0x00,0x00,0x00]
+v_min_i16 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x65]
 
-v_exp_f16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x81,0xd1,0x7e,0x00,0x00,0x00]
+v_min_i16 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x64]
 
-v_exp_f16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x81,0xd1,0x7f,0x00,0x00,0x00]
+v_min_i16 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x64]
 
-v_exp_f16_e64 v0, scc
-// CHECK: [0x00,0x00,0x81,0xd1,0xfd,0x00,0x00,0x00]
+v_min_i16 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x64]
 
-v_exp_f16_e64 v0, v0
-// CHECK: [0x00,0x00,0x81,0xd1,0x00,0x01,0x00,0x00]
+v_min_i16 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x64]
 
-v_exp_f16_e64 v0, v255
-// CHECK: [0x00,0x00,0x81,0xd1,0xff,0x01,0x00,0x00]
+v_min_i16 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x64]
 
-v_exp_f16_e64 v0, -s0
-// CHECK: [0x00,0x00,0x81,0xd1,0x00,0x00,0x00,0x20]
+v_min_i16 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x64]
 
-v_exp_f16_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x81,0xd1,0x00,0x00,0x00,0x00]
+v_min_i16 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x64]
 
-v_exp_f16_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x81,0xd1,0x00,0x00,0x00,0x00]
+v_min_i16 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x64]
 
-v_frexp_mant_f16 v0, s0
-// CHECK: [0x00,0x84,0x00,0x7e]
+v_min_i16 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x64]
 
-v_frexp_mant_f16 v255, s0
-// CHECK: [0x00,0x84,0xfe,0x7f]
+v_min_i16 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x64]
 
-v_frexp_mant_f16 v0, s101
-// CHECK: [0x65,0x84,0x00,0x7e]
+v_min_i16 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x64]
 
-v_frexp_mant_f16 v0, flat_scratch_lo
-// CHECK: [0x66,0x84,0x00,0x7e]
+v_min_i16 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x64]
 
-v_frexp_mant_f16 v0, flat_scratch_hi
-// CHECK: [0x67,0x84,0x00,0x7e]
+v_min_i16 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x64]
 
-v_frexp_mant_f16 v0, vcc_lo
-// CHECK: [0x6a,0x84,0x00,0x7e]
+v_min_i16 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x64]
 
-v_frexp_mant_f16 v0, vcc_hi
-// CHECK: [0x6b,0x84,0x00,0x7e]
+v_min_i16 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x64]
 
-v_frexp_mant_f16 v0, tba_lo
-// CHECK: [0x6c,0x84,0x00,0x7e]
+v_min_i16 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x64]
 
-v_frexp_mant_f16 v0, tba_hi
-// CHECK: [0x6d,0x84,0x00,0x7e]
+v_min_i16 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x64]
 
-v_frexp_mant_f16 v0, tma_lo
-// CHECK: [0x6e,0x84,0x00,0x7e]
+v_min_i16 v5, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x0a,0x64,0x0b,0xfe,0x00,0x00]
 
-v_frexp_mant_f16 v0, tma_hi
-// CHECK: [0x6f,0x84,0x00,0x7e]
+v_min_i16 v5, 0x3456, v2
+// CHECK: [0xff,0x04,0x0a,0x64,0x56,0x34,0x00,0x00]
 
-v_frexp_mant_f16 v0, ttmp11
-// CHECK: [0x7b,0x84,0x00,0x7e]
+v_min_i16 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x64]
 
-v_frexp_mant_f16 v0, m0
-// CHECK: [0x7c,0x84,0x00,0x7e]
+v_min_i16 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x64]
 
-v_frexp_mant_f16 v0, exec_lo
-// CHECK: [0x7e,0x84,0x00,0x7e]
+v_min_i16 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x64]
 
-v_frexp_mant_f16 v0, exec_hi
-// CHECK: [0x7f,0x84,0x00,0x7e]
+v_min_i16_e64 v5, 0, s2
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0x04,0x00,0x00]
 
-v_frexp_mant_f16 v0, 0
-// CHECK: [0x80,0x84,0x00,0x7e]
+v_min_i16_e64 v255, 0, s2
+// CHECK: [0xff,0x00,0x32,0xd1,0x80,0x04,0x00,0x00]
 
-v_frexp_mant_f16 v0, -1
-// CHECK: [0xc1,0x84,0x00,0x7e]
+v_min_i16_e64 v5, -1, s2
+// CHECK: [0x05,0x00,0x32,0xd1,0xc1,0x04,0x00,0x00]
 
-v_frexp_mant_f16 v0, 0.5
-// CHECK: [0xf0,0x84,0x00,0x7e]
+v_min_i16_e64 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x32,0xd1,0xf0,0x04,0x00,0x00]
 
-v_frexp_mant_f16 v0, -4.0
-// CHECK: [0xf7,0x84,0x00,0x7e]
+v_min_i16_e64 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x32,0xd1,0xf7,0x04,0x00,0x00]
 
-v_frexp_mant_f16 v0, 0xfe0b
-// CHECK: [0xff,0x84,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_min_i16_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x32,0xd1,0x01,0x05,0x00,0x00]
 
-v_frexp_mant_f16 v0, 0x3456
-// CHECK: [0xff,0x84,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_min_i16_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x32,0xd1,0xff,0x05,0x00,0x00]
 
-v_frexp_mant_f16 v0, v0
-// CHECK: [0x00,0x85,0x00,0x7e]
+v_min_i16_e64 v5, 0, s101
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0xca,0x00,0x00]
 
-v_frexp_mant_f16 v0, v255
-// CHECK: [0xff,0x85,0x00,0x7e]
+v_min_i16_e64 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0xcc,0x00,0x00]
 
-v_frexp_mant_f16_e64 v0, s0
-// CHECK: [0x00,0x00,0x82,0xd1,0x00,0x00,0x00,0x00]
+v_min_i16_e64 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0xce,0x00,0x00]
 
-v_frexp_mant_f16_e64 v255, s0
-// CHECK: [0xff,0x00,0x82,0xd1,0x00,0x00,0x00,0x00]
+v_min_i16_e64 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0xd4,0x00,0x00]
 
-v_frexp_mant_f16_e64 v0, s101
-// CHECK: [0x00,0x00,0x82,0xd1,0x65,0x00,0x00,0x00]
+v_min_i16_e64 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0xd6,0x00,0x00]
 
-v_frexp_mant_f16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x82,0xd1,0x66,0x00,0x00,0x00]
+v_min_i16_e64 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0xd8,0x00,0x00]
 
-v_frexp_mant_f16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x82,0xd1,0x67,0x00,0x00,0x00]
+v_min_i16_e64 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0xda,0x00,0x00]
 
-v_frexp_mant_f16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x82,0xd1,0x6a,0x00,0x00,0x00]
+v_min_i16_e64 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0xdc,0x00,0x00]
 
-v_frexp_mant_f16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x82,0xd1,0x6b,0x00,0x00,0x00]
+v_min_i16_e64 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0xde,0x00,0x00]
 
-v_frexp_mant_f16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x82,0xd1,0x6c,0x00,0x00,0x00]
+v_min_i16_e64 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0xf6,0x00,0x00]
 
-v_frexp_mant_f16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x82,0xd1,0x6d,0x00,0x00,0x00]
+v_min_i16_e64 v5, 0, m0
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0xf8,0x00,0x00]
 
-v_frexp_mant_f16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x82,0xd1,0x6e,0x00,0x00,0x00]
+v_min_i16_e64 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0xfc,0x00,0x00]
 
-v_frexp_mant_f16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x82,0xd1,0x6f,0x00,0x00,0x00]
+v_min_i16_e64 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0xfe,0x00,0x00]
 
-v_frexp_mant_f16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x82,0xd1,0x7b,0x00,0x00,0x00]
+v_min_i16_e64 v5, 0, 0
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0x00,0x01,0x00]
 
-v_frexp_mant_f16_e64 v0, m0
-// CHECK: [0x00,0x00,0x82,0xd1,0x7c,0x00,0x00,0x00]
+v_min_i16_e64 v5, 0, -1
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0x82,0x01,0x00]
 
-v_frexp_mant_f16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x82,0xd1,0x7e,0x00,0x00,0x00]
+v_min_i16_e64 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0xe0,0x01,0x00]
 
-v_frexp_mant_f16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x82,0xd1,0x7f,0x00,0x00,0x00]
+v_min_i16_e64 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0xee,0x01,0x00]
 
-v_frexp_mant_f16_e64 v0, scc
-// CHECK: [0x00,0x00,0x82,0xd1,0xfd,0x00,0x00,0x00]
+v_min_i16_e64 v5, 0, v2
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0x04,0x02,0x00]
 
-v_frexp_mant_f16_e64 v0, v0
-// CHECK: [0x00,0x00,0x82,0xd1,0x00,0x01,0x00,0x00]
+v_min_i16_e64 v5, 0, v255
+// CHECK: [0x05,0x00,0x32,0xd1,0x80,0xfe,0x03,0x00]
 
-v_frexp_mant_f16_e64 v0, v255
-// CHECK: [0x00,0x00,0x82,0xd1,0xff,0x01,0x00,0x00]
+v_ldexp_f16 v5, s1, v2
+// CHECK: [0x01,0x04,0x0a,0x66]
 
-v_frexp_mant_f16_e64 v0, -s0
-// CHECK: [0x00,0x00,0x82,0xd1,0x00,0x00,0x00,0x20]
+v_ldexp_f16 v255, s1, v2
+// CHECK: [0x01,0x04,0xfe,0x67]
 
-v_frexp_mant_f16_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x82,0xd1,0x00,0x00,0x00,0x00]
+v_ldexp_f16 v5, s101, v2
+// CHECK: [0x65,0x04,0x0a,0x66]
 
-v_frexp_mant_f16_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x82,0xd1,0x00,0x00,0x00,0x00]
+v_ldexp_f16 v5, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x0a,0x66]
 
-v_frexp_exp_i16_f16 v0, s0
-// CHECK: [0x00,0x86,0x00,0x7e]
+v_ldexp_f16 v5, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x0a,0x66]
 
-v_frexp_exp_i16_f16 v255, s0
-// CHECK: [0x00,0x86,0xfe,0x7f]
+v_ldexp_f16 v5, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x0a,0x66]
 
-v_frexp_exp_i16_f16 v0, s101
-// CHECK: [0x65,0x86,0x00,0x7e]
+v_ldexp_f16 v5, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x0a,0x66]
 
-v_frexp_exp_i16_f16 v0, flat_scratch_lo
-// CHECK: [0x66,0x86,0x00,0x7e]
+v_ldexp_f16 v5, tba_lo, v2
+// CHECK: [0x6c,0x04,0x0a,0x66]
 
-v_frexp_exp_i16_f16 v0, flat_scratch_hi
-// CHECK: [0x67,0x86,0x00,0x7e]
+v_ldexp_f16 v5, tba_hi, v2
+// CHECK: [0x6d,0x04,0x0a,0x66]
 
-v_frexp_exp_i16_f16 v0, vcc_lo
-// CHECK: [0x6a,0x86,0x00,0x7e]
+v_ldexp_f16 v5, tma_lo, v2
+// CHECK: [0x6e,0x04,0x0a,0x66]
 
-v_frexp_exp_i16_f16 v0, vcc_hi
-// CHECK: [0x6b,0x86,0x00,0x7e]
+v_ldexp_f16 v5, tma_hi, v2
+// CHECK: [0x6f,0x04,0x0a,0x66]
 
-v_frexp_exp_i16_f16 v0, tba_lo
-// CHECK: [0x6c,0x86,0x00,0x7e]
+v_ldexp_f16 v5, ttmp11, v2
+// CHECK: [0x7b,0x04,0x0a,0x66]
 
-v_frexp_exp_i16_f16 v0, tba_hi
-// CHECK: [0x6d,0x86,0x00,0x7e]
+v_ldexp_f16 v5, m0, v2
+// CHECK: [0x7c,0x04,0x0a,0x66]
 
-v_frexp_exp_i16_f16 v0, tma_lo
-// CHECK: [0x6e,0x86,0x00,0x7e]
+v_ldexp_f16 v5, exec_lo, v2
+// CHECK: [0x7e,0x04,0x0a,0x66]
 
-v_frexp_exp_i16_f16 v0, tma_hi
-// CHECK: [0x6f,0x86,0x00,0x7e]
+v_ldexp_f16 v5, exec_hi, v2
+// CHECK: [0x7f,0x04,0x0a,0x66]
 
-v_frexp_exp_i16_f16 v0, ttmp11
-// CHECK: [0x7b,0x86,0x00,0x7e]
+v_ldexp_f16 v5, 0, v2
+// CHECK: [0x80,0x04,0x0a,0x66]
 
-v_frexp_exp_i16_f16 v0, m0
-// CHECK: [0x7c,0x86,0x00,0x7e]
+v_ldexp_f16 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x66]
 
-v_frexp_exp_i16_f16 v0, exec_lo
-// CHECK: [0x7e,0x86,0x00,0x7e]
+v_ldexp_f16 v5, 0.5, v2
+// CHECK: [0xf0,0x04,0x0a,0x66]
 
-v_frexp_exp_i16_f16 v0, exec_hi
-// CHECK: [0x7f,0x86,0x00,0x7e]
+v_ldexp_f16 v5, -4.0, v2
+// CHECK: [0xf7,0x04,0x0a,0x66]
 
-v_frexp_exp_i16_f16 v0, 0
-// CHECK: [0x80,0x86,0x00,0x7e]
+v_ldexp_f16 v5, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x0a,0x66,0x0b,0xfe,0x00,0x00]
 
-v_frexp_exp_i16_f16 v0, -1
-// CHECK: [0xc1,0x86,0x00,0x7e]
+v_ldexp_f16 v5, 0x3456, v2
+// CHECK: [0xff,0x04,0x0a,0x66,0x56,0x34,0x00,0x00]
 
-v_frexp_exp_i16_f16 v0, 0.5
-// CHECK: [0xf0,0x86,0x00,0x7e]
+v_ldexp_f16 v5, v1, v2
+// CHECK: [0x01,0x05,0x0a,0x66]
 
-v_frexp_exp_i16_f16 v0, -4.0
-// CHECK: [0xf7,0x86,0x00,0x7e]
+v_ldexp_f16 v5, v255, v2
+// CHECK: [0xff,0x05,0x0a,0x66]
 
-v_frexp_exp_i16_f16 v0, 0xfe0b
-// CHECK: [0xff,0x86,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_ldexp_f16 v5, s1, v255
+// CHECK: [0x01,0xfe,0x0b,0x66]
 
-v_frexp_exp_i16_f16 v0, 0x3456
-// CHECK: [0xff,0x86,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, s2
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0x05,0x00,0x00]
 
-v_frexp_exp_i16_f16 v0, v0
-// CHECK: [0x00,0x87,0x00,0x7e]
+v_ldexp_f16_e64 v255, v1, s2
+// CHECK: [0xff,0x00,0x33,0xd1,0x01,0x05,0x00,0x00]
 
-v_frexp_exp_i16_f16 v0, v255
-// CHECK: [0xff,0x87,0x00,0x7e]
+v_ldexp_f16_e64 v5, v255, s2
+// CHECK: [0x05,0x00,0x33,0xd1,0xff,0x05,0x00,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, s0
-// CHECK: [0x00,0x00,0x83,0xd1,0x00,0x00,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, s101
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xcb,0x00,0x00]
 
-v_frexp_exp_i16_f16_e64 v255, s0
-// CHECK: [0xff,0x00,0x83,0xd1,0x00,0x00,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xcd,0x00,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, s101
-// CHECK: [0x00,0x00,0x83,0xd1,0x65,0x00,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xcf,0x00,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x83,0xd1,0x66,0x00,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xd5,0x00,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x83,0xd1,0x67,0x00,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xd7,0x00,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x83,0xd1,0x6a,0x00,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xd9,0x00,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x83,0xd1,0x6b,0x00,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xdb,0x00,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x83,0xd1,0x6c,0x00,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xdd,0x00,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x83,0xd1,0x6d,0x00,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xdf,0x00,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x83,0xd1,0x6e,0x00,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xf7,0x00,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x83,0xd1,0x6f,0x00,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, m0
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xf9,0x00,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x83,0xd1,0x7b,0x00,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xfd,0x00,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, m0
-// CHECK: [0x00,0x00,0x83,0xd1,0x7c,0x00,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xff,0x00,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x83,0xd1,0x7e,0x00,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, 0
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0x01,0x01,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x83,0xd1,0x7f,0x00,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, -1
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0x83,0x01,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, scc
-// CHECK: [0x00,0x00,0x83,0xd1,0xfd,0x00,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, 0.5
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xe1,0x01,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, v0
-// CHECK: [0x00,0x00,0x83,0xd1,0x00,0x01,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, -4.0
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xef,0x01,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, v255
-// CHECK: [0x00,0x00,0x83,0xd1,0xff,0x01,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, scc
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xfb,0x01,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, -s0
-// CHECK: [0x00,0x00,0x83,0xd1,0x00,0x00,0x00,0x20]
+v_ldexp_f16_e64 v5, v1, v2
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0x05,0x02,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x83,0xd1,0x00,0x00,0x00,0x00]
+v_ldexp_f16_e64 v5, v1, v255
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0xff,0x03,0x00]
 
-v_frexp_exp_i16_f16_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x83,0xd1,0x00,0x00,0x00,0x00]
+v_ldexp_f16_e64 v5, -v1, s2
+// CHECK: [0x05,0x00,0x33,0xd1,0x01,0x05,0x00,0x20]
 
-v_floor_f16 v0, s0
-// CHECK: [0x00,0x88,0x00,0x7e]
+v_ldexp_f16_e64 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x33,0xd1,0x01,0x05,0x00,0x00]
 
-v_floor_f16 v255, s0
-// CHECK: [0x00,0x88,0xfe,0x7f]
+v_ldexp_f16_e64 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x33,0xd1,0x01,0x05,0x00,0x00]
 
-v_floor_f16 v0, s101
-// CHECK: [0x65,0x88,0x00,0x7e]
+v_mad_legacy_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x01,0x04,0x0e,0x04]
 
-v_floor_f16 v0, flat_scratch_lo
-// CHECK: [0x66,0x88,0x00,0x7e]
+v_mad_legacy_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0xc0,0xd1,0x01,0x04,0x0e,0x04]
 
-v_floor_f16 v0, flat_scratch_hi
-// CHECK: [0x67,0x88,0x00,0x7e]
+v_mad_legacy_f32 v5, s101, v2, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x65,0x04,0x0e,0x04]
 
-v_floor_f16 v0, vcc_lo
-// CHECK: [0x6a,0x88,0x00,0x7e]
+v_mad_legacy_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x66,0x04,0x0e,0x04]
 
-v_floor_f16 v0, vcc_hi
-// CHECK: [0x6b,0x88,0x00,0x7e]
+v_mad_legacy_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x67,0x04,0x0e,0x04]
 
-v_floor_f16 v0, tba_lo
-// CHECK: [0x6c,0x88,0x00,0x7e]
+v_mad_legacy_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x6a,0x04,0x0e,0x04]
 
-v_floor_f16 v0, tba_hi
-// CHECK: [0x6d,0x88,0x00,0x7e]
+v_mad_legacy_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x6b,0x04,0x0e,0x04]
 
-v_floor_f16 v0, tma_lo
-// CHECK: [0x6e,0x88,0x00,0x7e]
+v_mad_legacy_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x6c,0x04,0x0e,0x04]
 
-v_floor_f16 v0, tma_hi
-// CHECK: [0x6f,0x88,0x00,0x7e]
+v_mad_legacy_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x6d,0x04,0x0e,0x04]
 
-v_floor_f16 v0, ttmp11
-// CHECK: [0x7b,0x88,0x00,0x7e]
+v_mad_legacy_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x6e,0x04,0x0e,0x04]
 
-v_floor_f16 v0, m0
-// CHECK: [0x7c,0x88,0x00,0x7e]
+v_mad_legacy_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x6f,0x04,0x0e,0x04]
 
-v_floor_f16 v0, exec_lo
-// CHECK: [0x7e,0x88,0x00,0x7e]
+v_mad_legacy_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x7b,0x04,0x0e,0x04]
 
-v_floor_f16 v0, exec_hi
-// CHECK: [0x7f,0x88,0x00,0x7e]
+v_mad_legacy_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x7c,0x04,0x0e,0x04]
 
-v_floor_f16 v0, 0
-// CHECK: [0x80,0x88,0x00,0x7e]
+v_mad_legacy_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x7e,0x04,0x0e,0x04]
 
-v_floor_f16 v0, -1
-// CHECK: [0xc1,0x88,0x00,0x7e]
+v_mad_legacy_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x7f,0x04,0x0e,0x04]
 
-v_floor_f16 v0, 0.5
-// CHECK: [0xf0,0x88,0x00,0x7e]
+v_mad_legacy_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0xfd,0x04,0x0e,0x04]
 
-v_floor_f16 v0, -4.0
-// CHECK: [0xf7,0x88,0x00,0x7e]
+v_mad_legacy_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x01,0x05,0x0e,0x04]
 
-v_floor_f16 v0, 0xfe0b
-// CHECK: [0xff,0x88,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_mad_legacy_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0xff,0x05,0x0e,0x04]
 
-v_floor_f16 v0, 0x3456
-// CHECK: [0xff,0x88,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_mad_legacy_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x01,0xfe,0x0f,0x04]
 
-v_floor_f16 v0, v0
-// CHECK: [0x00,0x89,0x00,0x7e]
+v_mad_legacy_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0xc0,0xd1,0x01,0x04,0xfe,0x07]
 
-v_floor_f16 v0, v255
-// CHECK: [0xff,0x89,0x00,0x7e]
+v_mad_legacy_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x01,0x04,0x0e,0x24]
 
-v_floor_f16_e64 v0, s0
-// CHECK: [0x00,0x00,0x84,0xd1,0x00,0x00,0x00,0x00]
+v_mad_legacy_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x01,0x04,0x0e,0x44]
 
-v_floor_f16_e64 v255, s0
-// CHECK: [0xff,0x00,0x84,0xd1,0x00,0x00,0x00,0x00]
+v_mad_legacy_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x01,0x04,0x0e,0x84]
 
-v_floor_f16_e64 v0, s101
-// CHECK: [0x00,0x00,0x84,0xd1,0x65,0x00,0x00,0x00]
+v_mad_legacy_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0xc0,0xd1,0x01,0x04,0x0e,0xe4]
 
-v_floor_f16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x84,0xd1,0x66,0x00,0x00,0x00]
+v_mad_legacy_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0xc0,0xd1,0x01,0x04,0x0e,0x04]
 
-v_floor_f16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x84,0xd1,0x67,0x00,0x00,0x00]
+v_mad_legacy_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0xc0,0xd1,0x01,0x04,0x0e,0x04]
 
-v_floor_f16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x84,0xd1,0x6a,0x00,0x00,0x00]
+v_mad_legacy_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0xc0,0xd1,0x01,0x04,0x0e,0x04]
 
-v_floor_f16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x84,0xd1,0x6b,0x00,0x00,0x00]
+v_mad_legacy_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xc0,0xd1,0x01,0x04,0x0e,0x04]
 
-v_floor_f16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x84,0xd1,0x6c,0x00,0x00,0x00]
+v_mad_legacy_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x80,0xc0,0xd1,0x01,0x04,0x0e,0x04]
 
-v_floor_f16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x84,0xd1,0x6d,0x00,0x00,0x00]
+v_mad_legacy_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0xc0,0xd1,0x01,0x04,0x0e,0x0c]
 
-v_floor_f16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x84,0xd1,0x6e,0x00,0x00,0x00]
+v_mad_legacy_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0xc0,0xd1,0x01,0x04,0x0e,0x14]
 
-v_floor_f16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x84,0xd1,0x6f,0x00,0x00,0x00]
+v_mad_legacy_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0xc0,0xd1,0x01,0x04,0x0e,0x1c]
 
-v_floor_f16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x84,0xd1,0x7b,0x00,0x00,0x00]
+v_mad_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x01,0x04,0x0e,0x04]
 
-v_floor_f16_e64 v0, m0
-// CHECK: [0x00,0x00,0x84,0xd1,0x7c,0x00,0x00,0x00]
+v_mad_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0xc1,0xd1,0x01,0x04,0x0e,0x04]
 
-v_floor_f16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x84,0xd1,0x7e,0x00,0x00,0x00]
+v_mad_f32 v5, s101, v2, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x65,0x04,0x0e,0x04]
 
-v_floor_f16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x84,0xd1,0x7f,0x00,0x00,0x00]
+v_mad_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x66,0x04,0x0e,0x04]
 
-v_floor_f16_e64 v0, scc
-// CHECK: [0x00,0x00,0x84,0xd1,0xfd,0x00,0x00,0x00]
+v_mad_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x67,0x04,0x0e,0x04]
 
-v_floor_f16_e64 v0, v0
-// CHECK: [0x00,0x00,0x84,0xd1,0x00,0x01,0x00,0x00]
+v_mad_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x6a,0x04,0x0e,0x04]
 
-v_floor_f16_e64 v0, v255
-// CHECK: [0x00,0x00,0x84,0xd1,0xff,0x01,0x00,0x00]
+v_mad_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x6b,0x04,0x0e,0x04]
 
-v_floor_f16_e64 v0, -s0
-// CHECK: [0x00,0x00,0x84,0xd1,0x00,0x00,0x00,0x20]
+v_mad_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x6c,0x04,0x0e,0x04]
 
-v_floor_f16_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x84,0xd1,0x00,0x00,0x00,0x00]
+v_mad_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x6d,0x04,0x0e,0x04]
 
-v_floor_f16_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x84,0xd1,0x00,0x00,0x00,0x00]
+v_mad_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x6e,0x04,0x0e,0x04]
 
-v_ceil_f16 v0, s0
-// CHECK: [0x00,0x8a,0x00,0x7e]
+v_mad_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x6f,0x04,0x0e,0x04]
 
-v_ceil_f16 v255, s0
-// CHECK: [0x00,0x8a,0xfe,0x7f]
+v_mad_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x7b,0x04,0x0e,0x04]
 
-v_ceil_f16 v0, s101
-// CHECK: [0x65,0x8a,0x00,0x7e]
+v_mad_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x7c,0x04,0x0e,0x04]
 
-v_ceil_f16 v0, flat_scratch_lo
-// CHECK: [0x66,0x8a,0x00,0x7e]
+v_mad_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x7e,0x04,0x0e,0x04]
 
-v_ceil_f16 v0, flat_scratch_hi
-// CHECK: [0x67,0x8a,0x00,0x7e]
+v_mad_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x7f,0x04,0x0e,0x04]
 
-v_ceil_f16 v0, vcc_lo
-// CHECK: [0x6a,0x8a,0x00,0x7e]
+v_mad_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0xfd,0x04,0x0e,0x04]
 
-v_ceil_f16 v0, vcc_hi
-// CHECK: [0x6b,0x8a,0x00,0x7e]
+v_mad_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x01,0x05,0x0e,0x04]
 
-v_ceil_f16 v0, tba_lo
-// CHECK: [0x6c,0x8a,0x00,0x7e]
+v_mad_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0xff,0x05,0x0e,0x04]
 
-v_ceil_f16 v0, tba_hi
-// CHECK: [0x6d,0x8a,0x00,0x7e]
+v_mad_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x01,0xfe,0x0f,0x04]
 
-v_ceil_f16 v0, tma_lo
-// CHECK: [0x6e,0x8a,0x00,0x7e]
+v_mad_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0xc1,0xd1,0x01,0x04,0xfe,0x07]
 
-v_ceil_f16 v0, tma_hi
-// CHECK: [0x6f,0x8a,0x00,0x7e]
+v_mad_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x01,0x04,0x0e,0x24]
 
-v_ceil_f16 v0, ttmp11
-// CHECK: [0x7b,0x8a,0x00,0x7e]
+v_mad_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x01,0x04,0x0e,0x44]
 
-v_ceil_f16 v0, m0
-// CHECK: [0x7c,0x8a,0x00,0x7e]
+v_mad_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x01,0x04,0x0e,0x84]
 
-v_ceil_f16 v0, exec_lo
-// CHECK: [0x7e,0x8a,0x00,0x7e]
+v_mad_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0xc1,0xd1,0x01,0x04,0x0e,0xe4]
 
-v_ceil_f16 v0, exec_hi
-// CHECK: [0x7f,0x8a,0x00,0x7e]
+v_mad_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0xc1,0xd1,0x01,0x04,0x0e,0x04]
 
-v_ceil_f16 v0, 0
-// CHECK: [0x80,0x8a,0x00,0x7e]
+v_mad_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0xc1,0xd1,0x01,0x04,0x0e,0x04]
 
-v_ceil_f16 v0, -1
-// CHECK: [0xc1,0x8a,0x00,0x7e]
+v_mad_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0xc1,0xd1,0x01,0x04,0x0e,0x04]
 
-v_ceil_f16 v0, 0.5
-// CHECK: [0xf0,0x8a,0x00,0x7e]
+v_mad_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xc1,0xd1,0x01,0x04,0x0e,0x04]
 
-v_ceil_f16 v0, -4.0
-// CHECK: [0xf7,0x8a,0x00,0x7e]
+v_mad_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x80,0xc1,0xd1,0x01,0x04,0x0e,0x04]
 
-v_ceil_f16 v0, 0xfe0b
-// CHECK: [0xff,0x8a,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_mad_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0xc1,0xd1,0x01,0x04,0x0e,0x0c]
 
-v_ceil_f16 v0, 0x3456
-// CHECK: [0xff,0x8a,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_mad_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0xc1,0xd1,0x01,0x04,0x0e,0x14]
 
-v_ceil_f16 v0, v0
-// CHECK: [0x00,0x8b,0x00,0x7e]
+v_mad_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0xc1,0xd1,0x01,0x04,0x0e,0x1c]
 
-v_ceil_f16 v0, v255
-// CHECK: [0xff,0x8b,0x00,0x7e]
+v_mad_i32_i24 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x01,0x00,0x01,0x02]
 
-v_ceil_f16_e64 v0, s0
-// CHECK: [0x00,0x00,0x85,0xd1,0x00,0x00,0x00,0x00]
+v_mad_i32_i24 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xc2,0xd1,0x01,0x00,0x01,0x02]
 
-v_ceil_f16_e64 v255, s0
-// CHECK: [0xff,0x00,0x85,0xd1,0x00,0x00,0x00,0x00]
+v_mad_i32_i24 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x65,0x00,0x01,0x02]
 
-v_ceil_f16_e64 v0, s101
-// CHECK: [0x00,0x00,0x85,0xd1,0x65,0x00,0x00,0x00]
+v_mad_i32_i24 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x66,0x00,0x01,0x02]
 
-v_ceil_f16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x85,0xd1,0x66,0x00,0x00,0x00]
+v_mad_i32_i24 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x67,0x00,0x01,0x02]
 
-v_ceil_f16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x85,0xd1,0x67,0x00,0x00,0x00]
+v_mad_i32_i24 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x6a,0x00,0x01,0x02]
 
-v_ceil_f16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x85,0xd1,0x6a,0x00,0x00,0x00]
+v_mad_i32_i24 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x6b,0x00,0x01,0x02]
 
-v_ceil_f16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x85,0xd1,0x6b,0x00,0x00,0x00]
+v_mad_i32_i24 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x6c,0x00,0x01,0x02]
 
-v_ceil_f16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x85,0xd1,0x6c,0x00,0x00,0x00]
+v_mad_i32_i24 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x6d,0x00,0x01,0x02]
 
-v_ceil_f16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x85,0xd1,0x6d,0x00,0x00,0x00]
+v_mad_i32_i24 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x6e,0x00,0x01,0x02]
 
-v_ceil_f16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x85,0xd1,0x6e,0x00,0x00,0x00]
+v_mad_i32_i24 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x6f,0x00,0x01,0x02]
 
-v_ceil_f16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x85,0xd1,0x6f,0x00,0x00,0x00]
+v_mad_i32_i24 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x7b,0x00,0x01,0x02]
 
-v_ceil_f16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x85,0xd1,0x7b,0x00,0x00,0x00]
+v_mad_i32_i24 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x7c,0x00,0x01,0x02]
 
-v_ceil_f16_e64 v0, m0
-// CHECK: [0x00,0x00,0x85,0xd1,0x7c,0x00,0x00,0x00]
+v_mad_i32_i24 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x7e,0x00,0x01,0x02]
 
-v_ceil_f16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x85,0xd1,0x7e,0x00,0x00,0x00]
+v_mad_i32_i24 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x7f,0x00,0x01,0x02]
 
-v_ceil_f16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x85,0xd1,0x7f,0x00,0x00,0x00]
+v_mad_i32_i24 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x80,0x00,0x01,0x02]
 
-v_ceil_f16_e64 v0, scc
-// CHECK: [0x00,0x00,0x85,0xd1,0xfd,0x00,0x00,0x00]
+v_mad_i32_i24 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0xc1,0x00,0x01,0x02]
 
-v_ceil_f16_e64 v0, v0
-// CHECK: [0x00,0x00,0x85,0xd1,0x00,0x01,0x00,0x00]
+v_mad_i32_i24 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0xf0,0x00,0x01,0x02]
 
-v_ceil_f16_e64 v0, v255
-// CHECK: [0x00,0x00,0x85,0xd1,0xff,0x01,0x00,0x00]
+v_mad_i32_i24 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0xf7,0x00,0x01,0x02]
 
-v_ceil_f16_e64 v0, -s0
-// CHECK: [0x00,0x00,0x85,0xd1,0x00,0x00,0x00,0x20]
+v_mad_i32_i24 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x01,0x01,0x01,0x02]
 
-v_ceil_f16_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x85,0xd1,0x00,0x00,0x00,0x00]
+v_mad_i32_i24 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0xff,0x01,0x01,0x02]
 
-v_ceil_f16_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x85,0xd1,0x00,0x00,0x00,0x00]
+v_mad_i32_i24 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x01,0x82,0x01,0x02]
 
-v_trunc_f16 v0, s0
-// CHECK: [0x00,0x8c,0x00,0x7e]
+v_mad_i32_i24 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x01,0xe0,0x01,0x02]
 
-v_trunc_f16 v255, s0
-// CHECK: [0x00,0x8c,0xfe,0x7f]
+v_mad_i32_i24 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x01,0xee,0x01,0x02]
 
-v_trunc_f16 v0, s101
-// CHECK: [0x65,0x8c,0x00,0x7e]
+v_mad_i32_i24 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x01,0x04,0x02,0x02]
 
-v_trunc_f16 v0, flat_scratch_lo
-// CHECK: [0x66,0x8c,0x00,0x7e]
+v_mad_i32_i24 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x01,0xfe,0x03,0x02]
 
-v_trunc_f16 v0, flat_scratch_hi
-// CHECK: [0x67,0x8c,0x00,0x7e]
+v_mad_i32_i24 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xc2,0xd1,0x01,0x00,0x05,0x03]
 
-v_trunc_f16 v0, vcc_lo
-// CHECK: [0x6a,0x8c,0x00,0x7e]
+v_mad_i32_i24 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xc2,0xd1,0x01,0x00,0xc1,0x03]
 
-v_trunc_f16 v0, vcc_hi
-// CHECK: [0x6b,0x8c,0x00,0x7e]
+v_mad_i32_i24 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xc2,0xd1,0x01,0x00,0xdd,0x03]
 
-v_trunc_f16 v0, tba_lo
-// CHECK: [0x6c,0x8c,0x00,0x7e]
+v_mad_i32_i24 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xc2,0xd1,0x01,0x00,0x0d,0x04]
 
-v_trunc_f16 v0, tba_hi
-// CHECK: [0x6d,0x8c,0x00,0x7e]
+v_mad_i32_i24 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xc2,0xd1,0x01,0x00,0xfd,0x07]
 
-v_trunc_f16 v0, tma_lo
-// CHECK: [0x6e,0x8c,0x00,0x7e]
+v_mad_u32_u24 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x01,0x00,0x01,0x02]
 
-v_trunc_f16 v0, tma_hi
-// CHECK: [0x6f,0x8c,0x00,0x7e]
+v_mad_u32_u24 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xc3,0xd1,0x01,0x00,0x01,0x02]
 
-v_trunc_f16 v0, ttmp11
-// CHECK: [0x7b,0x8c,0x00,0x7e]
+v_mad_u32_u24 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x65,0x00,0x01,0x02]
 
-v_trunc_f16 v0, m0
-// CHECK: [0x7c,0x8c,0x00,0x7e]
+v_mad_u32_u24 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x66,0x00,0x01,0x02]
 
-v_trunc_f16 v0, exec_lo
-// CHECK: [0x7e,0x8c,0x00,0x7e]
+v_mad_u32_u24 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x67,0x00,0x01,0x02]
 
-v_trunc_f16 v0, exec_hi
-// CHECK: [0x7f,0x8c,0x00,0x7e]
+v_mad_u32_u24 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x6a,0x00,0x01,0x02]
 
-v_trunc_f16 v0, 0
-// CHECK: [0x80,0x8c,0x00,0x7e]
+v_mad_u32_u24 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x6b,0x00,0x01,0x02]
 
-v_trunc_f16 v0, -1
-// CHECK: [0xc1,0x8c,0x00,0x7e]
+v_mad_u32_u24 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x6c,0x00,0x01,0x02]
 
-v_trunc_f16 v0, 0.5
-// CHECK: [0xf0,0x8c,0x00,0x7e]
+v_mad_u32_u24 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x6d,0x00,0x01,0x02]
 
-v_trunc_f16 v0, -4.0
-// CHECK: [0xf7,0x8c,0x00,0x7e]
+v_mad_u32_u24 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x6e,0x00,0x01,0x02]
 
-v_trunc_f16 v0, 0xfe0b
-// CHECK: [0xff,0x8c,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_mad_u32_u24 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x6f,0x00,0x01,0x02]
 
-v_trunc_f16 v0, 0x3456
-// CHECK: [0xff,0x8c,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_mad_u32_u24 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x7b,0x00,0x01,0x02]
 
-v_trunc_f16 v0, v0
-// CHECK: [0x00,0x8d,0x00,0x7e]
+v_mad_u32_u24 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x7c,0x00,0x01,0x02]
 
-v_trunc_f16 v0, v255
-// CHECK: [0xff,0x8d,0x00,0x7e]
+v_mad_u32_u24 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x7e,0x00,0x01,0x02]
 
-v_trunc_f16_e64 v0, s0
-// CHECK: [0x00,0x00,0x86,0xd1,0x00,0x00,0x00,0x00]
+v_mad_u32_u24 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x7f,0x00,0x01,0x02]
 
-v_trunc_f16_e64 v255, s0
-// CHECK: [0xff,0x00,0x86,0xd1,0x00,0x00,0x00,0x00]
+v_mad_u32_u24 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x80,0x00,0x01,0x02]
 
-v_trunc_f16_e64 v0, s101
-// CHECK: [0x00,0x00,0x86,0xd1,0x65,0x00,0x00,0x00]
+v_mad_u32_u24 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0xc1,0x00,0x01,0x02]
 
-v_trunc_f16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x86,0xd1,0x66,0x00,0x00,0x00]
+v_mad_u32_u24 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0xf0,0x00,0x01,0x02]
 
-v_trunc_f16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x86,0xd1,0x67,0x00,0x00,0x00]
+v_mad_u32_u24 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0xf7,0x00,0x01,0x02]
 
-v_trunc_f16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x86,0xd1,0x6a,0x00,0x00,0x00]
+v_mad_u32_u24 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x01,0x01,0x01,0x02]
 
-v_trunc_f16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x86,0xd1,0x6b,0x00,0x00,0x00]
+v_mad_u32_u24 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0xff,0x01,0x01,0x02]
 
-v_trunc_f16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x86,0xd1,0x6c,0x00,0x00,0x00]
+v_mad_u32_u24 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x01,0x82,0x01,0x02]
 
-v_trunc_f16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x86,0xd1,0x6d,0x00,0x00,0x00]
+v_mad_u32_u24 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x01,0xe0,0x01,0x02]
 
-v_trunc_f16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x86,0xd1,0x6e,0x00,0x00,0x00]
+v_mad_u32_u24 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x01,0xee,0x01,0x02]
 
-v_trunc_f16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x86,0xd1,0x6f,0x00,0x00,0x00]
+v_mad_u32_u24 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x01,0x04,0x02,0x02]
 
-v_trunc_f16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x86,0xd1,0x7b,0x00,0x00,0x00]
+v_mad_u32_u24 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x01,0xfe,0x03,0x02]
 
-v_trunc_f16_e64 v0, m0
-// CHECK: [0x00,0x00,0x86,0xd1,0x7c,0x00,0x00,0x00]
+v_mad_u32_u24 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xc3,0xd1,0x01,0x00,0x05,0x03]
 
-v_trunc_f16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x86,0xd1,0x7e,0x00,0x00,0x00]
+v_mad_u32_u24 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xc3,0xd1,0x01,0x00,0xc1,0x03]
 
-v_trunc_f16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x86,0xd1,0x7f,0x00,0x00,0x00]
+v_mad_u32_u24 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xc3,0xd1,0x01,0x00,0xdd,0x03]
 
-v_trunc_f16_e64 v0, scc
-// CHECK: [0x00,0x00,0x86,0xd1,0xfd,0x00,0x00,0x00]
+v_mad_u32_u24 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xc3,0xd1,0x01,0x00,0x0d,0x04]
 
-v_trunc_f16_e64 v0, v0
-// CHECK: [0x00,0x00,0x86,0xd1,0x00,0x01,0x00,0x00]
+v_mad_u32_u24 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xc3,0xd1,0x01,0x00,0xfd,0x07]
 
-v_trunc_f16_e64 v0, v255
-// CHECK: [0x00,0x00,0x86,0xd1,0xff,0x01,0x00,0x00]
+v_cubeid_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x01,0x04,0x0e,0x04]
 
-v_trunc_f16_e64 v0, -s0
-// CHECK: [0x00,0x00,0x86,0xd1,0x00,0x00,0x00,0x20]
+v_cubeid_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0xc4,0xd1,0x01,0x04,0x0e,0x04]
 
-v_trunc_f16_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x86,0xd1,0x00,0x00,0x00,0x00]
+v_cubeid_f32 v5, s101, v2, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x65,0x04,0x0e,0x04]
 
-v_trunc_f16_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x86,0xd1,0x00,0x00,0x00,0x00]
+v_cubeid_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x66,0x04,0x0e,0x04]
 
-v_rndne_f16 v0, s0
-// CHECK: [0x00,0x8e,0x00,0x7e]
+v_cubeid_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x67,0x04,0x0e,0x04]
 
-v_rndne_f16 v255, s0
-// CHECK: [0x00,0x8e,0xfe,0x7f]
+v_cubeid_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x6a,0x04,0x0e,0x04]
 
-v_rndne_f16 v0, s101
-// CHECK: [0x65,0x8e,0x00,0x7e]
+v_cubeid_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x6b,0x04,0x0e,0x04]
 
-v_rndne_f16 v0, flat_scratch_lo
-// CHECK: [0x66,0x8e,0x00,0x7e]
+v_cubeid_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x6c,0x04,0x0e,0x04]
 
-v_rndne_f16 v0, flat_scratch_hi
-// CHECK: [0x67,0x8e,0x00,0x7e]
+v_cubeid_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x6d,0x04,0x0e,0x04]
 
-v_rndne_f16 v0, vcc_lo
-// CHECK: [0x6a,0x8e,0x00,0x7e]
+v_cubeid_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x6e,0x04,0x0e,0x04]
 
-v_rndne_f16 v0, vcc_hi
-// CHECK: [0x6b,0x8e,0x00,0x7e]
+v_cubeid_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x6f,0x04,0x0e,0x04]
 
-v_rndne_f16 v0, tba_lo
-// CHECK: [0x6c,0x8e,0x00,0x7e]
+v_cubeid_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x7b,0x04,0x0e,0x04]
 
-v_rndne_f16 v0, tba_hi
-// CHECK: [0x6d,0x8e,0x00,0x7e]
+v_cubeid_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x7c,0x04,0x0e,0x04]
 
-v_rndne_f16 v0, tma_lo
-// CHECK: [0x6e,0x8e,0x00,0x7e]
+v_cubeid_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x7e,0x04,0x0e,0x04]
 
-v_rndne_f16 v0, tma_hi
-// CHECK: [0x6f,0x8e,0x00,0x7e]
+v_cubeid_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x7f,0x04,0x0e,0x04]
 
-v_rndne_f16 v0, ttmp11
-// CHECK: [0x7b,0x8e,0x00,0x7e]
+v_cubeid_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0xfd,0x04,0x0e,0x04]
 
-v_rndne_f16 v0, m0
-// CHECK: [0x7c,0x8e,0x00,0x7e]
+v_cubeid_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x01,0x05,0x0e,0x04]
 
-v_rndne_f16 v0, exec_lo
-// CHECK: [0x7e,0x8e,0x00,0x7e]
+v_cubeid_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0xff,0x05,0x0e,0x04]
 
-v_rndne_f16 v0, exec_hi
-// CHECK: [0x7f,0x8e,0x00,0x7e]
+v_cubeid_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x01,0xfe,0x0f,0x04]
 
-v_rndne_f16 v0, 0
-// CHECK: [0x80,0x8e,0x00,0x7e]
+v_cubeid_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0xc4,0xd1,0x01,0x04,0xfe,0x07]
 
-v_rndne_f16 v0, -1
-// CHECK: [0xc1,0x8e,0x00,0x7e]
+v_cubeid_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x01,0x04,0x0e,0x24]
 
-v_rndne_f16 v0, 0.5
-// CHECK: [0xf0,0x8e,0x00,0x7e]
+v_cubeid_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x01,0x04,0x0e,0x44]
 
-v_rndne_f16 v0, -4.0
-// CHECK: [0xf7,0x8e,0x00,0x7e]
+v_cubeid_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x01,0x04,0x0e,0x84]
 
-v_rndne_f16 v0, 0xfe0b
-// CHECK: [0xff,0x8e,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_cubeid_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0xc4,0xd1,0x01,0x04,0x0e,0xe4]
 
-v_rndne_f16 v0, 0x3456
-// CHECK: [0xff,0x8e,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_cubeid_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0xc4,0xd1,0x01,0x04,0x0e,0x04]
 
-v_rndne_f16 v0, v0
-// CHECK: [0x00,0x8f,0x00,0x7e]
+v_cubeid_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0xc4,0xd1,0x01,0x04,0x0e,0x04]
 
-v_rndne_f16 v0, v255
-// CHECK: [0xff,0x8f,0x00,0x7e]
+v_cubeid_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0xc4,0xd1,0x01,0x04,0x0e,0x04]
 
-v_rndne_f16_e64 v0, s0
-// CHECK: [0x00,0x00,0x87,0xd1,0x00,0x00,0x00,0x00]
+v_cubeid_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xc4,0xd1,0x01,0x04,0x0e,0x04]
 
-v_rndne_f16_e64 v255, s0
-// CHECK: [0xff,0x00,0x87,0xd1,0x00,0x00,0x00,0x00]
+v_cubeid_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x80,0xc4,0xd1,0x01,0x04,0x0e,0x04]
 
-v_rndne_f16_e64 v0, s101
-// CHECK: [0x00,0x00,0x87,0xd1,0x65,0x00,0x00,0x00]
+v_cubeid_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0xc4,0xd1,0x01,0x04,0x0e,0x0c]
 
-v_rndne_f16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x87,0xd1,0x66,0x00,0x00,0x00]
+v_cubeid_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0xc4,0xd1,0x01,0x04,0x0e,0x14]
 
-v_rndne_f16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x87,0xd1,0x67,0x00,0x00,0x00]
+v_cubeid_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0xc4,0xd1,0x01,0x04,0x0e,0x1c]
 
-v_rndne_f16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x87,0xd1,0x6a,0x00,0x00,0x00]
+v_cubesc_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x01,0x04,0x0e,0x04]
 
-v_rndne_f16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x87,0xd1,0x6b,0x00,0x00,0x00]
+v_cubesc_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0xc5,0xd1,0x01,0x04,0x0e,0x04]
 
-v_rndne_f16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x87,0xd1,0x6c,0x00,0x00,0x00]
+v_cubesc_f32 v5, s101, v2, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x65,0x04,0x0e,0x04]
 
-v_rndne_f16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x87,0xd1,0x6d,0x00,0x00,0x00]
+v_cubesc_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x66,0x04,0x0e,0x04]
 
-v_rndne_f16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x87,0xd1,0x6e,0x00,0x00,0x00]
+v_cubesc_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x67,0x04,0x0e,0x04]
 
-v_rndne_f16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x87,0xd1,0x6f,0x00,0x00,0x00]
+v_cubesc_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x6a,0x04,0x0e,0x04]
 
-v_rndne_f16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x87,0xd1,0x7b,0x00,0x00,0x00]
+v_cubesc_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x6b,0x04,0x0e,0x04]
 
-v_rndne_f16_e64 v0, m0
-// CHECK: [0x00,0x00,0x87,0xd1,0x7c,0x00,0x00,0x00]
+v_cubesc_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x6c,0x04,0x0e,0x04]
 
-v_rndne_f16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x87,0xd1,0x7e,0x00,0x00,0x00]
+v_cubesc_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x6d,0x04,0x0e,0x04]
 
-v_rndne_f16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x87,0xd1,0x7f,0x00,0x00,0x00]
+v_cubesc_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x6e,0x04,0x0e,0x04]
 
-v_rndne_f16_e64 v0, scc
-// CHECK: [0x00,0x00,0x87,0xd1,0xfd,0x00,0x00,0x00]
+v_cubesc_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x6f,0x04,0x0e,0x04]
 
-v_rndne_f16_e64 v0, v0
-// CHECK: [0x00,0x00,0x87,0xd1,0x00,0x01,0x00,0x00]
+v_cubesc_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x7b,0x04,0x0e,0x04]
 
-v_rndne_f16_e64 v0, v255
-// CHECK: [0x00,0x00,0x87,0xd1,0xff,0x01,0x00,0x00]
+v_cubesc_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x7c,0x04,0x0e,0x04]
 
-v_rndne_f16_e64 v0, -s0
-// CHECK: [0x00,0x00,0x87,0xd1,0x00,0x00,0x00,0x20]
+v_cubesc_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x7e,0x04,0x0e,0x04]
 
-v_rndne_f16_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x87,0xd1,0x00,0x00,0x00,0x00]
+v_cubesc_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x7f,0x04,0x0e,0x04]
 
-v_rndne_f16_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x87,0xd1,0x00,0x00,0x00,0x00]
+v_cubesc_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0xfd,0x04,0x0e,0x04]
 
-v_fract_f16 v0, s0
-// CHECK: [0x00,0x90,0x00,0x7e]
+v_cubesc_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x01,0x05,0x0e,0x04]
 
-v_fract_f16 v255, s0
-// CHECK: [0x00,0x90,0xfe,0x7f]
+v_cubesc_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0xff,0x05,0x0e,0x04]
 
-v_fract_f16 v0, s101
-// CHECK: [0x65,0x90,0x00,0x7e]
+v_cubesc_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x01,0xfe,0x0f,0x04]
 
-v_fract_f16 v0, flat_scratch_lo
-// CHECK: [0x66,0x90,0x00,0x7e]
+v_cubesc_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0xc5,0xd1,0x01,0x04,0xfe,0x07]
 
-v_fract_f16 v0, flat_scratch_hi
-// CHECK: [0x67,0x90,0x00,0x7e]
+v_cubesc_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x01,0x04,0x0e,0x24]
 
-v_fract_f16 v0, vcc_lo
-// CHECK: [0x6a,0x90,0x00,0x7e]
+v_cubesc_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x01,0x04,0x0e,0x44]
 
-v_fract_f16 v0, vcc_hi
-// CHECK: [0x6b,0x90,0x00,0x7e]
+v_cubesc_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x01,0x04,0x0e,0x84]
 
-v_fract_f16 v0, tba_lo
-// CHECK: [0x6c,0x90,0x00,0x7e]
+v_cubesc_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0xc5,0xd1,0x01,0x04,0x0e,0xe4]
 
-v_fract_f16 v0, tba_hi
-// CHECK: [0x6d,0x90,0x00,0x7e]
+v_cubesc_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0xc5,0xd1,0x01,0x04,0x0e,0x04]
 
-v_fract_f16 v0, tma_lo
-// CHECK: [0x6e,0x90,0x00,0x7e]
+v_cubesc_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0xc5,0xd1,0x01,0x04,0x0e,0x04]
 
-v_fract_f16 v0, tma_hi
-// CHECK: [0x6f,0x90,0x00,0x7e]
+v_cubesc_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0xc5,0xd1,0x01,0x04,0x0e,0x04]
 
-v_fract_f16 v0, ttmp11
-// CHECK: [0x7b,0x90,0x00,0x7e]
+v_cubesc_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xc5,0xd1,0x01,0x04,0x0e,0x04]
 
-v_fract_f16 v0, m0
-// CHECK: [0x7c,0x90,0x00,0x7e]
+v_cubesc_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x80,0xc5,0xd1,0x01,0x04,0x0e,0x04]
 
-v_fract_f16 v0, exec_lo
-// CHECK: [0x7e,0x90,0x00,0x7e]
+v_cubesc_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0xc5,0xd1,0x01,0x04,0x0e,0x0c]
 
-v_fract_f16 v0, exec_hi
-// CHECK: [0x7f,0x90,0x00,0x7e]
+v_cubesc_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0xc5,0xd1,0x01,0x04,0x0e,0x14]
 
-v_fract_f16 v0, 0
-// CHECK: [0x80,0x90,0x00,0x7e]
+v_cubesc_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0xc5,0xd1,0x01,0x04,0x0e,0x1c]
 
-v_fract_f16 v0, -1
-// CHECK: [0xc1,0x90,0x00,0x7e]
+v_cubetc_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x01,0x04,0x0e,0x04]
 
-v_fract_f16 v0, 0.5
-// CHECK: [0xf0,0x90,0x00,0x7e]
+v_cubetc_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0xc6,0xd1,0x01,0x04,0x0e,0x04]
 
-v_fract_f16 v0, -4.0
-// CHECK: [0xf7,0x90,0x00,0x7e]
+v_cubetc_f32 v5, s101, v2, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x65,0x04,0x0e,0x04]
 
-v_fract_f16 v0, 0xfe0b
-// CHECK: [0xff,0x90,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_cubetc_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x66,0x04,0x0e,0x04]
 
-v_fract_f16 v0, 0x3456
-// CHECK: [0xff,0x90,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_cubetc_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x67,0x04,0x0e,0x04]
 
-v_fract_f16 v0, v0
-// CHECK: [0x00,0x91,0x00,0x7e]
+v_cubetc_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x6a,0x04,0x0e,0x04]
 
-v_fract_f16 v0, v255
-// CHECK: [0xff,0x91,0x00,0x7e]
+v_cubetc_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x6b,0x04,0x0e,0x04]
 
-v_fract_f16_e64 v0, s0
-// CHECK: [0x00,0x00,0x88,0xd1,0x00,0x00,0x00,0x00]
+v_cubetc_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x6c,0x04,0x0e,0x04]
 
-v_fract_f16_e64 v255, s0
-// CHECK: [0xff,0x00,0x88,0xd1,0x00,0x00,0x00,0x00]
+v_cubetc_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x6d,0x04,0x0e,0x04]
 
-v_fract_f16_e64 v0, s101
-// CHECK: [0x00,0x00,0x88,0xd1,0x65,0x00,0x00,0x00]
+v_cubetc_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x6e,0x04,0x0e,0x04]
 
-v_fract_f16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x88,0xd1,0x66,0x00,0x00,0x00]
+v_cubetc_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x6f,0x04,0x0e,0x04]
 
-v_fract_f16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x88,0xd1,0x67,0x00,0x00,0x00]
+v_cubetc_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x7b,0x04,0x0e,0x04]
 
-v_fract_f16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x88,0xd1,0x6a,0x00,0x00,0x00]
+v_cubetc_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x7c,0x04,0x0e,0x04]
 
-v_fract_f16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x88,0xd1,0x6b,0x00,0x00,0x00]
+v_cubetc_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x7e,0x04,0x0e,0x04]
 
-v_fract_f16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x88,0xd1,0x6c,0x00,0x00,0x00]
+v_cubetc_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x7f,0x04,0x0e,0x04]
 
-v_fract_f16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x88,0xd1,0x6d,0x00,0x00,0x00]
+v_cubetc_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0xfd,0x04,0x0e,0x04]
 
-v_fract_f16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x88,0xd1,0x6e,0x00,0x00,0x00]
+v_cubetc_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x01,0x05,0x0e,0x04]
 
-v_fract_f16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x88,0xd1,0x6f,0x00,0x00,0x00]
+v_cubetc_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0xff,0x05,0x0e,0x04]
 
-v_fract_f16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x88,0xd1,0x7b,0x00,0x00,0x00]
+v_cubetc_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x01,0xfe,0x0f,0x04]
 
-v_fract_f16_e64 v0, m0
-// CHECK: [0x00,0x00,0x88,0xd1,0x7c,0x00,0x00,0x00]
+v_cubetc_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0xc6,0xd1,0x01,0x04,0xfe,0x07]
 
-v_fract_f16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x88,0xd1,0x7e,0x00,0x00,0x00]
+v_cubetc_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x01,0x04,0x0e,0x24]
 
-v_fract_f16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x88,0xd1,0x7f,0x00,0x00,0x00]
+v_cubetc_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x01,0x04,0x0e,0x44]
 
-v_fract_f16_e64 v0, scc
-// CHECK: [0x00,0x00,0x88,0xd1,0xfd,0x00,0x00,0x00]
+v_cubetc_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x01,0x04,0x0e,0x84]
 
-v_fract_f16_e64 v0, v0
-// CHECK: [0x00,0x00,0x88,0xd1,0x00,0x01,0x00,0x00]
+v_cubetc_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0xc6,0xd1,0x01,0x04,0x0e,0xe4]
 
-v_fract_f16_e64 v0, v255
-// CHECK: [0x00,0x00,0x88,0xd1,0xff,0x01,0x00,0x00]
+v_cubetc_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0xc6,0xd1,0x01,0x04,0x0e,0x04]
 
-v_fract_f16_e64 v0, -s0
-// CHECK: [0x00,0x00,0x88,0xd1,0x00,0x00,0x00,0x20]
+v_cubetc_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0xc6,0xd1,0x01,0x04,0x0e,0x04]
 
-v_fract_f16_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x88,0xd1,0x00,0x00,0x00,0x00]
+v_cubetc_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0xc6,0xd1,0x01,0x04,0x0e,0x04]
 
-v_fract_f16_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x88,0xd1,0x00,0x00,0x00,0x00]
+v_cubetc_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xc6,0xd1,0x01,0x04,0x0e,0x04]
 
-v_sin_f16 v0, s0
-// CHECK: [0x00,0x92,0x00,0x7e]
+v_cubetc_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x80,0xc6,0xd1,0x01,0x04,0x0e,0x04]
 
-v_sin_f16 v255, s0
-// CHECK: [0x00,0x92,0xfe,0x7f]
+v_cubetc_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0xc6,0xd1,0x01,0x04,0x0e,0x0c]
 
-v_sin_f16 v0, s101
-// CHECK: [0x65,0x92,0x00,0x7e]
+v_cubetc_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0xc6,0xd1,0x01,0x04,0x0e,0x14]
 
-v_sin_f16 v0, flat_scratch_lo
-// CHECK: [0x66,0x92,0x00,0x7e]
+v_cubetc_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0xc6,0xd1,0x01,0x04,0x0e,0x1c]
 
-v_sin_f16 v0, flat_scratch_hi
-// CHECK: [0x67,0x92,0x00,0x7e]
+v_cubema_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x01,0x04,0x0e,0x04]
 
-v_sin_f16 v0, vcc_lo
-// CHECK: [0x6a,0x92,0x00,0x7e]
+v_cubema_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0xc7,0xd1,0x01,0x04,0x0e,0x04]
 
-v_sin_f16 v0, vcc_hi
-// CHECK: [0x6b,0x92,0x00,0x7e]
+v_cubema_f32 v5, s101, v2, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x65,0x04,0x0e,0x04]
 
-v_sin_f16 v0, tba_lo
-// CHECK: [0x6c,0x92,0x00,0x7e]
+v_cubema_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x66,0x04,0x0e,0x04]
 
-v_sin_f16 v0, tba_hi
-// CHECK: [0x6d,0x92,0x00,0x7e]
+v_cubema_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x67,0x04,0x0e,0x04]
 
-v_sin_f16 v0, tma_lo
-// CHECK: [0x6e,0x92,0x00,0x7e]
+v_cubema_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x6a,0x04,0x0e,0x04]
 
-v_sin_f16 v0, tma_hi
-// CHECK: [0x6f,0x92,0x00,0x7e]
+v_cubema_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x6b,0x04,0x0e,0x04]
 
-v_sin_f16 v0, ttmp11
-// CHECK: [0x7b,0x92,0x00,0x7e]
+v_cubema_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x6c,0x04,0x0e,0x04]
 
-v_sin_f16 v0, m0
-// CHECK: [0x7c,0x92,0x00,0x7e]
+v_cubema_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x6d,0x04,0x0e,0x04]
 
-v_sin_f16 v0, exec_lo
-// CHECK: [0x7e,0x92,0x00,0x7e]
+v_cubema_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x6e,0x04,0x0e,0x04]
 
-v_sin_f16 v0, exec_hi
-// CHECK: [0x7f,0x92,0x00,0x7e]
+v_cubema_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x6f,0x04,0x0e,0x04]
 
-v_sin_f16 v0, 0
-// CHECK: [0x80,0x92,0x00,0x7e]
+v_cubema_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x7b,0x04,0x0e,0x04]
 
-v_sin_f16 v0, -1
-// CHECK: [0xc1,0x92,0x00,0x7e]
+v_cubema_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x7c,0x04,0x0e,0x04]
 
-v_sin_f16 v0, 0.5
-// CHECK: [0xf0,0x92,0x00,0x7e]
+v_cubema_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x7e,0x04,0x0e,0x04]
 
-v_sin_f16 v0, -4.0
-// CHECK: [0xf7,0x92,0x00,0x7e]
+v_cubema_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x7f,0x04,0x0e,0x04]
 
-v_sin_f16 v0, 0xfe0b
-// CHECK: [0xff,0x92,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_cubema_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0xfd,0x04,0x0e,0x04]
 
-v_sin_f16 v0, 0x3456
-// CHECK: [0xff,0x92,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_cubema_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x01,0x05,0x0e,0x04]
 
-v_sin_f16 v0, v0
-// CHECK: [0x00,0x93,0x00,0x7e]
+v_cubema_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0xff,0x05,0x0e,0x04]
 
-v_sin_f16 v0, v255
-// CHECK: [0xff,0x93,0x00,0x7e]
+v_cubema_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x01,0xfe,0x0f,0x04]
 
-v_sin_f16_e64 v0, s0
-// CHECK: [0x00,0x00,0x89,0xd1,0x00,0x00,0x00,0x00]
+v_cubema_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0xc7,0xd1,0x01,0x04,0xfe,0x07]
 
-v_sin_f16_e64 v255, s0
-// CHECK: [0xff,0x00,0x89,0xd1,0x00,0x00,0x00,0x00]
+v_cubema_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x01,0x04,0x0e,0x24]
 
-v_sin_f16_e64 v0, s101
-// CHECK: [0x00,0x00,0x89,0xd1,0x65,0x00,0x00,0x00]
+v_cubema_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x01,0x04,0x0e,0x44]
 
-v_sin_f16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x89,0xd1,0x66,0x00,0x00,0x00]
+v_cubema_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x01,0x04,0x0e,0x84]
 
-v_sin_f16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x89,0xd1,0x67,0x00,0x00,0x00]
+v_cubema_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0xc7,0xd1,0x01,0x04,0x0e,0xe4]
 
-v_sin_f16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x89,0xd1,0x6a,0x00,0x00,0x00]
+v_cubema_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0xc7,0xd1,0x01,0x04,0x0e,0x04]
 
-v_sin_f16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x89,0xd1,0x6b,0x00,0x00,0x00]
+v_cubema_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0xc7,0xd1,0x01,0x04,0x0e,0x04]
 
-v_sin_f16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x89,0xd1,0x6c,0x00,0x00,0x00]
+v_cubema_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0xc7,0xd1,0x01,0x04,0x0e,0x04]
 
-v_sin_f16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x89,0xd1,0x6d,0x00,0x00,0x00]
+v_cubema_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xc7,0xd1,0x01,0x04,0x0e,0x04]
 
-v_sin_f16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x89,0xd1,0x6e,0x00,0x00,0x00]
+v_cubema_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x80,0xc7,0xd1,0x01,0x04,0x0e,0x04]
 
-v_sin_f16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x89,0xd1,0x6f,0x00,0x00,0x00]
+v_cubema_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0xc7,0xd1,0x01,0x04,0x0e,0x0c]
 
-v_sin_f16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x89,0xd1,0x7b,0x00,0x00,0x00]
+v_cubema_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0xc7,0xd1,0x01,0x04,0x0e,0x14]
 
-v_sin_f16_e64 v0, m0
-// CHECK: [0x00,0x00,0x89,0xd1,0x7c,0x00,0x00,0x00]
+v_cubema_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0xc7,0xd1,0x01,0x04,0x0e,0x1c]
 
-v_sin_f16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x89,0xd1,0x7e,0x00,0x00,0x00]
+v_bfe_u32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x01,0x00,0x01,0x02]
 
-v_sin_f16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x89,0xd1,0x7f,0x00,0x00,0x00]
+v_bfe_u32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xc8,0xd1,0x01,0x00,0x01,0x02]
 
-v_sin_f16_e64 v0, scc
-// CHECK: [0x00,0x00,0x89,0xd1,0xfd,0x00,0x00,0x00]
+v_bfe_u32 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x65,0x00,0x01,0x02]
 
-v_sin_f16_e64 v0, v0
-// CHECK: [0x00,0x00,0x89,0xd1,0x00,0x01,0x00,0x00]
+v_bfe_u32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x66,0x00,0x01,0x02]
 
-v_sin_f16_e64 v0, v255
-// CHECK: [0x00,0x00,0x89,0xd1,0xff,0x01,0x00,0x00]
+v_bfe_u32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x67,0x00,0x01,0x02]
 
-v_sin_f16_e64 v0, -s0
-// CHECK: [0x00,0x00,0x89,0xd1,0x00,0x00,0x00,0x20]
+v_bfe_u32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x6a,0x00,0x01,0x02]
 
-v_sin_f16_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x89,0xd1,0x00,0x00,0x00,0x00]
+v_bfe_u32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x6b,0x00,0x01,0x02]
 
-v_sin_f16_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x89,0xd1,0x00,0x00,0x00,0x00]
+v_bfe_u32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x6c,0x00,0x01,0x02]
 
-v_cos_f16 v0, s0
-// CHECK: [0x00,0x94,0x00,0x7e]
+v_bfe_u32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x6d,0x00,0x01,0x02]
 
-v_cos_f16 v255, s0
-// CHECK: [0x00,0x94,0xfe,0x7f]
+v_bfe_u32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x6e,0x00,0x01,0x02]
 
-v_cos_f16 v0, s101
-// CHECK: [0x65,0x94,0x00,0x7e]
+v_bfe_u32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x6f,0x00,0x01,0x02]
 
-v_cos_f16 v0, flat_scratch_lo
-// CHECK: [0x66,0x94,0x00,0x7e]
+v_bfe_u32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x7b,0x00,0x01,0x02]
 
-v_cos_f16 v0, flat_scratch_hi
-// CHECK: [0x67,0x94,0x00,0x7e]
+v_bfe_u32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x7c,0x00,0x01,0x02]
 
-v_cos_f16 v0, vcc_lo
-// CHECK: [0x6a,0x94,0x00,0x7e]
+v_bfe_u32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x7e,0x00,0x01,0x02]
 
-v_cos_f16 v0, vcc_hi
-// CHECK: [0x6b,0x94,0x00,0x7e]
+v_bfe_u32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x7f,0x00,0x01,0x02]
 
-v_cos_f16 v0, tba_lo
-// CHECK: [0x6c,0x94,0x00,0x7e]
+v_bfe_u32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x80,0x00,0x01,0x02]
 
-v_cos_f16 v0, tba_hi
-// CHECK: [0x6d,0x94,0x00,0x7e]
+v_bfe_u32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0xc1,0x00,0x01,0x02]
 
-v_cos_f16 v0, tma_lo
-// CHECK: [0x6e,0x94,0x00,0x7e]
+v_bfe_u32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0xf0,0x00,0x01,0x02]
 
-v_cos_f16 v0, tma_hi
-// CHECK: [0x6f,0x94,0x00,0x7e]
+v_bfe_u32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0xf7,0x00,0x01,0x02]
 
-v_cos_f16 v0, ttmp11
-// CHECK: [0x7b,0x94,0x00,0x7e]
+v_bfe_u32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x01,0x01,0x01,0x02]
 
-v_cos_f16 v0, m0
-// CHECK: [0x7c,0x94,0x00,0x7e]
+v_bfe_u32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0xff,0x01,0x01,0x02]
 
-v_cos_f16 v0, exec_lo
-// CHECK: [0x7e,0x94,0x00,0x7e]
+v_bfe_u32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x01,0x82,0x01,0x02]
 
-v_cos_f16 v0, exec_hi
-// CHECK: [0x7f,0x94,0x00,0x7e]
+v_bfe_u32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x01,0xe0,0x01,0x02]
 
-v_cos_f16 v0, 0
-// CHECK: [0x80,0x94,0x00,0x7e]
+v_bfe_u32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x01,0xee,0x01,0x02]
 
-v_cos_f16 v0, -1
-// CHECK: [0xc1,0x94,0x00,0x7e]
+v_bfe_u32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x01,0x04,0x02,0x02]
 
-v_cos_f16 v0, 0.5
-// CHECK: [0xf0,0x94,0x00,0x7e]
+v_bfe_u32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x01,0xfe,0x03,0x02]
 
-v_cos_f16 v0, -4.0
-// CHECK: [0xf7,0x94,0x00,0x7e]
+v_bfe_u32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xc8,0xd1,0x01,0x00,0x05,0x03]
 
-v_cos_f16 v0, 0xfe0b
-// CHECK: [0xff,0x94,0x00,0x7e,0x0b,0xfe,0x00,0x00]
+v_bfe_u32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xc8,0xd1,0x01,0x00,0xc1,0x03]
 
-v_cos_f16 v0, 0x3456
-// CHECK: [0xff,0x94,0x00,0x7e,0x56,0x34,0x00,0x00]
+v_bfe_u32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xc8,0xd1,0x01,0x00,0xdd,0x03]
 
-v_cos_f16 v0, v0
-// CHECK: [0x00,0x95,0x00,0x7e]
+v_bfe_u32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xc8,0xd1,0x01,0x00,0x0d,0x04]
 
-v_cos_f16 v0, v255
-// CHECK: [0xff,0x95,0x00,0x7e]
+v_bfe_u32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xc8,0xd1,0x01,0x00,0xfd,0x07]
 
-v_cos_f16_e64 v0, s0
-// CHECK: [0x00,0x00,0x8a,0xd1,0x00,0x00,0x00,0x00]
+v_bfe_i32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x01,0x00,0x01,0x02]
 
-v_cos_f16_e64 v255, s0
-// CHECK: [0xff,0x00,0x8a,0xd1,0x00,0x00,0x00,0x00]
+v_bfe_i32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xc9,0xd1,0x01,0x00,0x01,0x02]
 
-v_cos_f16_e64 v0, s101
-// CHECK: [0x00,0x00,0x8a,0xd1,0x65,0x00,0x00,0x00]
+v_bfe_i32 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x65,0x00,0x01,0x02]
 
-v_cos_f16_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x8a,0xd1,0x66,0x00,0x00,0x00]
+v_bfe_i32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x66,0x00,0x01,0x02]
 
-v_cos_f16_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x8a,0xd1,0x67,0x00,0x00,0x00]
+v_bfe_i32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x67,0x00,0x01,0x02]
 
-v_cos_f16_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x8a,0xd1,0x6a,0x00,0x00,0x00]
+v_bfe_i32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x6a,0x00,0x01,0x02]
 
-v_cos_f16_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x8a,0xd1,0x6b,0x00,0x00,0x00]
+v_bfe_i32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x6b,0x00,0x01,0x02]
 
-v_cos_f16_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x8a,0xd1,0x6c,0x00,0x00,0x00]
+v_bfe_i32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x6c,0x00,0x01,0x02]
 
-v_cos_f16_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x8a,0xd1,0x6d,0x00,0x00,0x00]
+v_bfe_i32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x6d,0x00,0x01,0x02]
 
-v_cos_f16_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x8a,0xd1,0x6e,0x00,0x00,0x00]
+v_bfe_i32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x6e,0x00,0x01,0x02]
 
-v_cos_f16_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x8a,0xd1,0x6f,0x00,0x00,0x00]
+v_bfe_i32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x6f,0x00,0x01,0x02]
 
-v_cos_f16_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x8a,0xd1,0x7b,0x00,0x00,0x00]
+v_bfe_i32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x7b,0x00,0x01,0x02]
 
-v_cos_f16_e64 v0, m0
-// CHECK: [0x00,0x00,0x8a,0xd1,0x7c,0x00,0x00,0x00]
+v_bfe_i32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x7c,0x00,0x01,0x02]
 
-v_cos_f16_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x8a,0xd1,0x7e,0x00,0x00,0x00]
+v_bfe_i32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x7e,0x00,0x01,0x02]
 
-v_cos_f16_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x8a,0xd1,0x7f,0x00,0x00,0x00]
+v_bfe_i32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x7f,0x00,0x01,0x02]
 
-v_cos_f16_e64 v0, scc
-// CHECK: [0x00,0x00,0x8a,0xd1,0xfd,0x00,0x00,0x00]
+v_bfe_i32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x80,0x00,0x01,0x02]
 
-v_cos_f16_e64 v0, v0
-// CHECK: [0x00,0x00,0x8a,0xd1,0x00,0x01,0x00,0x00]
+v_bfe_i32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0xc1,0x00,0x01,0x02]
 
-v_cos_f16_e64 v0, v255
-// CHECK: [0x00,0x00,0x8a,0xd1,0xff,0x01,0x00,0x00]
+v_bfe_i32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0xf0,0x00,0x01,0x02]
 
-v_cos_f16_e64 v0, -s0
-// CHECK: [0x00,0x00,0x8a,0xd1,0x00,0x00,0x00,0x20]
+v_bfe_i32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0xf7,0x00,0x01,0x02]
 
-v_cos_f16_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x8a,0xd1,0x00,0x00,0x00,0x00]
+v_bfe_i32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x01,0x01,0x01,0x02]
 
-v_cos_f16_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x8a,0xd1,0x00,0x00,0x00,0x00]
+v_bfe_i32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0xff,0x01,0x01,0x02]
 
-v_exp_legacy_f32 v0, s0
-// CHECK: [0x00,0x96,0x00,0x7e]
+v_bfe_i32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x01,0x82,0x01,0x02]
 
-v_exp_legacy_f32 v255, s0
-// CHECK: [0x00,0x96,0xfe,0x7f]
+v_bfe_i32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x01,0xe0,0x01,0x02]
 
-v_exp_legacy_f32 v0, s101
-// CHECK: [0x65,0x96,0x00,0x7e]
+v_bfe_i32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x01,0xee,0x01,0x02]
 
-v_exp_legacy_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x96,0x00,0x7e]
+v_bfe_i32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x01,0x04,0x02,0x02]
 
-v_exp_legacy_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x96,0x00,0x7e]
+v_bfe_i32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x01,0xfe,0x03,0x02]
 
-v_exp_legacy_f32 v0, vcc_lo
-// CHECK: [0x6a,0x96,0x00,0x7e]
+v_bfe_i32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xc9,0xd1,0x01,0x00,0x05,0x03]
 
-v_exp_legacy_f32 v0, vcc_hi
-// CHECK: [0x6b,0x96,0x00,0x7e]
+v_bfe_i32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xc9,0xd1,0x01,0x00,0xc1,0x03]
 
-v_exp_legacy_f32 v0, tba_lo
-// CHECK: [0x6c,0x96,0x00,0x7e]
+v_bfe_i32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xc9,0xd1,0x01,0x00,0xdd,0x03]
 
-v_exp_legacy_f32 v0, tba_hi
-// CHECK: [0x6d,0x96,0x00,0x7e]
+v_bfe_i32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xc9,0xd1,0x01,0x00,0x0d,0x04]
 
-v_exp_legacy_f32 v0, tma_lo
-// CHECK: [0x6e,0x96,0x00,0x7e]
+v_bfe_i32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xc9,0xd1,0x01,0x00,0xfd,0x07]
 
-v_exp_legacy_f32 v0, tma_hi
-// CHECK: [0x6f,0x96,0x00,0x7e]
+v_bfi_b32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x01,0x00,0x01,0x02]
 
-v_exp_legacy_f32 v0, ttmp11
-// CHECK: [0x7b,0x96,0x00,0x7e]
+v_bfi_b32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xca,0xd1,0x01,0x00,0x01,0x02]
 
-v_exp_legacy_f32 v0, m0
-// CHECK: [0x7c,0x96,0x00,0x7e]
+v_bfi_b32 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x65,0x00,0x01,0x02]
 
-v_exp_legacy_f32 v0, exec_lo
-// CHECK: [0x7e,0x96,0x00,0x7e]
+v_bfi_b32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x66,0x00,0x01,0x02]
 
-v_exp_legacy_f32 v0, exec_hi
-// CHECK: [0x7f,0x96,0x00,0x7e]
+v_bfi_b32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x67,0x00,0x01,0x02]
 
-v_exp_legacy_f32 v0, 0
-// CHECK: [0x80,0x96,0x00,0x7e]
+v_bfi_b32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x6a,0x00,0x01,0x02]
 
-v_exp_legacy_f32 v0, -1
-// CHECK: [0xc1,0x96,0x00,0x7e]
+v_bfi_b32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x6b,0x00,0x01,0x02]
 
-v_exp_legacy_f32 v0, 0.5
-// CHECK: [0xf0,0x96,0x00,0x7e]
+v_bfi_b32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x6c,0x00,0x01,0x02]
 
-v_exp_legacy_f32 v0, -4.0
-// CHECK: [0xf7,0x96,0x00,0x7e]
+v_bfi_b32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x6d,0x00,0x01,0x02]
 
-v_exp_legacy_f32 v0, 0xaf123456
-// CHECK: [0xff,0x96,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_bfi_b32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x6e,0x00,0x01,0x02]
 
-v_exp_legacy_f32 v0, 0x3f717273
-// CHECK: [0xff,0x96,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_bfi_b32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x6f,0x00,0x01,0x02]
 
-v_exp_legacy_f32 v0, v0
-// CHECK: [0x00,0x97,0x00,0x7e]
+v_bfi_b32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x7b,0x00,0x01,0x02]
 
-v_exp_legacy_f32 v0, v255
-// CHECK: [0xff,0x97,0x00,0x7e]
+v_bfi_b32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x7c,0x00,0x01,0x02]
 
-v_exp_legacy_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x8b,0xd1,0x00,0x00,0x00,0x00]
+v_bfi_b32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x7e,0x00,0x01,0x02]
 
-v_exp_legacy_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x8b,0xd1,0x00,0x00,0x00,0x00]
+v_bfi_b32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x7f,0x00,0x01,0x02]
 
-v_exp_legacy_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x8b,0xd1,0x65,0x00,0x00,0x00]
+v_bfi_b32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x80,0x00,0x01,0x02]
 
-v_exp_legacy_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x8b,0xd1,0x66,0x00,0x00,0x00]
+v_bfi_b32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0xc1,0x00,0x01,0x02]
 
-v_exp_legacy_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x8b,0xd1,0x67,0x00,0x00,0x00]
+v_bfi_b32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0xf0,0x00,0x01,0x02]
 
-v_exp_legacy_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x8b,0xd1,0x6a,0x00,0x00,0x00]
+v_bfi_b32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0xf7,0x00,0x01,0x02]
 
-v_exp_legacy_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x8b,0xd1,0x6b,0x00,0x00,0x00]
+v_bfi_b32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x01,0x01,0x01,0x02]
 
-v_exp_legacy_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x8b,0xd1,0x6c,0x00,0x00,0x00]
+v_bfi_b32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0xff,0x01,0x01,0x02]
 
-v_exp_legacy_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x8b,0xd1,0x6d,0x00,0x00,0x00]
+v_bfi_b32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x01,0x82,0x01,0x02]
 
-v_exp_legacy_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x8b,0xd1,0x6e,0x00,0x00,0x00]
+v_bfi_b32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x01,0xe0,0x01,0x02]
 
-v_exp_legacy_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x8b,0xd1,0x6f,0x00,0x00,0x00]
+v_bfi_b32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x01,0xee,0x01,0x02]
 
-v_exp_legacy_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x8b,0xd1,0x7b,0x00,0x00,0x00]
+v_bfi_b32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x01,0x04,0x02,0x02]
 
-v_exp_legacy_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x8b,0xd1,0x7c,0x00,0x00,0x00]
+v_bfi_b32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xca,0xd1,0x01,0xfe,0x03,0x02]
 
-v_exp_legacy_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x8b,0xd1,0x7e,0x00,0x00,0x00]
+v_bfi_b32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xca,0xd1,0x01,0x00,0x05,0x03]
 
-v_exp_legacy_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x8b,0xd1,0x7f,0x00,0x00,0x00]
+v_bfi_b32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xca,0xd1,0x01,0x00,0xc1,0x03]
 
-v_exp_legacy_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x8b,0xd1,0xfd,0x00,0x00,0x00]
+v_bfi_b32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xca,0xd1,0x01,0x00,0xdd,0x03]
 
-v_exp_legacy_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x8b,0xd1,0x00,0x01,0x00,0x00]
+v_bfi_b32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xca,0xd1,0x01,0x00,0x0d,0x04]
 
-v_exp_legacy_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x8b,0xd1,0xff,0x01,0x00,0x00]
+v_bfi_b32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xca,0xd1,0x01,0x00,0xfd,0x07]
 
-v_exp_legacy_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x8b,0xd1,0x00,0x00,0x00,0x20]
+v_fma_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x01,0x04,0x0e,0x04]
 
-v_exp_legacy_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x8b,0xd1,0x00,0x00,0x00,0x00]
+v_fma_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0xcb,0xd1,0x01,0x04,0x0e,0x04]
 
-v_exp_legacy_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x8b,0xd1,0x00,0x00,0x00,0x00]
+v_fma_f32 v5, s101, v2, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x65,0x04,0x0e,0x04]
 
-v_exp_legacy_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x8b,0xd1,0x00,0x00,0x00,0x08]
+v_fma_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x66,0x04,0x0e,0x04]
 
-v_exp_legacy_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x8b,0xd1,0x00,0x00,0x00,0x10]
+v_fma_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x67,0x04,0x0e,0x04]
 
-v_exp_legacy_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x8b,0xd1,0x00,0x00,0x00,0x18]
+v_fma_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x6a,0x04,0x0e,0x04]
 
-v_log_legacy_f32 v0, s0
-// CHECK: [0x00,0x98,0x00,0x7e]
+v_fma_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x6b,0x04,0x0e,0x04]
 
-v_log_legacy_f32 v255, s0
-// CHECK: [0x00,0x98,0xfe,0x7f]
+v_fma_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x6c,0x04,0x0e,0x04]
 
-v_log_legacy_f32 v0, s101
-// CHECK: [0x65,0x98,0x00,0x7e]
+v_fma_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x6d,0x04,0x0e,0x04]
 
-v_log_legacy_f32 v0, flat_scratch_lo
-// CHECK: [0x66,0x98,0x00,0x7e]
+v_fma_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x6e,0x04,0x0e,0x04]
 
-v_log_legacy_f32 v0, flat_scratch_hi
-// CHECK: [0x67,0x98,0x00,0x7e]
+v_fma_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x6f,0x04,0x0e,0x04]
 
-v_log_legacy_f32 v0, vcc_lo
-// CHECK: [0x6a,0x98,0x00,0x7e]
+v_fma_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x7b,0x04,0x0e,0x04]
 
-v_log_legacy_f32 v0, vcc_hi
-// CHECK: [0x6b,0x98,0x00,0x7e]
+v_fma_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x7c,0x04,0x0e,0x04]
 
-v_log_legacy_f32 v0, tba_lo
-// CHECK: [0x6c,0x98,0x00,0x7e]
+v_fma_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x7e,0x04,0x0e,0x04]
 
-v_log_legacy_f32 v0, tba_hi
-// CHECK: [0x6d,0x98,0x00,0x7e]
+v_fma_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x7f,0x04,0x0e,0x04]
 
-v_log_legacy_f32 v0, tma_lo
-// CHECK: [0x6e,0x98,0x00,0x7e]
+v_fma_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0xfd,0x04,0x0e,0x04]
 
-v_log_legacy_f32 v0, tma_hi
-// CHECK: [0x6f,0x98,0x00,0x7e]
+v_fma_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x01,0x05,0x0e,0x04]
 
-v_log_legacy_f32 v0, ttmp11
-// CHECK: [0x7b,0x98,0x00,0x7e]
+v_fma_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0xff,0x05,0x0e,0x04]
 
-v_log_legacy_f32 v0, m0
-// CHECK: [0x7c,0x98,0x00,0x7e]
+v_fma_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x01,0xfe,0x0f,0x04]
 
-v_log_legacy_f32 v0, exec_lo
-// CHECK: [0x7e,0x98,0x00,0x7e]
+v_fma_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0xcb,0xd1,0x01,0x04,0xfe,0x07]
 
-v_log_legacy_f32 v0, exec_hi
-// CHECK: [0x7f,0x98,0x00,0x7e]
+v_fma_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x01,0x04,0x0e,0x24]
 
-v_log_legacy_f32 v0, 0
-// CHECK: [0x80,0x98,0x00,0x7e]
+v_fma_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x01,0x04,0x0e,0x44]
 
-v_log_legacy_f32 v0, -1
-// CHECK: [0xc1,0x98,0x00,0x7e]
+v_fma_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x01,0x04,0x0e,0x84]
 
-v_log_legacy_f32 v0, 0.5
-// CHECK: [0xf0,0x98,0x00,0x7e]
+v_fma_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0xcb,0xd1,0x01,0x04,0x0e,0xe4]
 
-v_log_legacy_f32 v0, -4.0
-// CHECK: [0xf7,0x98,0x00,0x7e]
+v_fma_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0xcb,0xd1,0x01,0x04,0x0e,0x04]
 
-v_log_legacy_f32 v0, 0xaf123456
-// CHECK: [0xff,0x98,0x00,0x7e,0x56,0x34,0x12,0xaf]
+v_fma_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0xcb,0xd1,0x01,0x04,0x0e,0x04]
 
-v_log_legacy_f32 v0, 0x3f717273
-// CHECK: [0xff,0x98,0x00,0x7e,0x73,0x72,0x71,0x3f]
+v_fma_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0xcb,0xd1,0x01,0x04,0x0e,0x04]
 
-v_log_legacy_f32 v0, v0
-// CHECK: [0x00,0x99,0x00,0x7e]
+v_fma_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xcb,0xd1,0x01,0x04,0x0e,0x04]
 
-v_log_legacy_f32 v0, v255
-// CHECK: [0xff,0x99,0x00,0x7e]
+v_fma_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x80,0xcb,0xd1,0x01,0x04,0x0e,0x04]
 
-v_log_legacy_f32_e64 v0, s0
-// CHECK: [0x00,0x00,0x8c,0xd1,0x00,0x00,0x00,0x00]
+v_fma_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0xcb,0xd1,0x01,0x04,0x0e,0x0c]
 
-v_log_legacy_f32_e64 v255, s0
-// CHECK: [0xff,0x00,0x8c,0xd1,0x00,0x00,0x00,0x00]
+v_fma_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0xcb,0xd1,0x01,0x04,0x0e,0x14]
 
-v_log_legacy_f32_e64 v0, s101
-// CHECK: [0x00,0x00,0x8c,0xd1,0x65,0x00,0x00,0x00]
+v_fma_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0xcb,0xd1,0x01,0x04,0x0e,0x1c]
 
-v_log_legacy_f32_e64 v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x8c,0xd1,0x66,0x00,0x00,0x00]
+v_fma_f64 v[5:6], s[2:3], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xcc,0xd1,0x02,0x04,0x0e,0x04]
 
-v_log_legacy_f32_e64 v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x8c,0xd1,0x67,0x00,0x00,0x00]
+v_fma_f64 v[254:255], s[2:3], v[2:3], v[3:4]
+// CHECK: [0xfe,0x00,0xcc,0xd1,0x02,0x04,0x0e,0x04]
 
-v_log_legacy_f32_e64 v0, vcc_lo
-// CHECK: [0x00,0x00,0x8c,0xd1,0x6a,0x00,0x00,0x00]
+v_fma_f64 v[5:6], s[4:5], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xcc,0xd1,0x04,0x04,0x0e,0x04]
 
-v_log_legacy_f32_e64 v0, vcc_hi
-// CHECK: [0x00,0x00,0x8c,0xd1,0x6b,0x00,0x00,0x00]
+v_fma_f64 v[5:6], s[100:101], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xcc,0xd1,0x64,0x04,0x0e,0x04]
 
-v_log_legacy_f32_e64 v0, tba_lo
-// CHECK: [0x00,0x00,0x8c,0xd1,0x6c,0x00,0x00,0x00]
+v_fma_f64 v[5:6], flat_scratch, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xcc,0xd1,0x66,0x04,0x0e,0x04]
 
-v_log_legacy_f32_e64 v0, tba_hi
-// CHECK: [0x00,0x00,0x8c,0xd1,0x6d,0x00,0x00,0x00]
+v_fma_f64 v[5:6], vcc, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xcc,0xd1,0x6a,0x04,0x0e,0x04]
 
-v_log_legacy_f32_e64 v0, tma_lo
-// CHECK: [0x00,0x00,0x8c,0xd1,0x6e,0x00,0x00,0x00]
+v_fma_f64 v[5:6], tba, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xcc,0xd1,0x6c,0x04,0x0e,0x04]
 
-v_log_legacy_f32_e64 v0, tma_hi
-// CHECK: [0x00,0x00,0x8c,0xd1,0x6f,0x00,0x00,0x00]
+v_fma_f64 v[5:6], tma, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xcc,0xd1,0x6e,0x04,0x0e,0x04]
 
-v_log_legacy_f32_e64 v0, ttmp11
-// CHECK: [0x00,0x00,0x8c,0xd1,0x7b,0x00,0x00,0x00]
+v_fma_f64 v[5:6], ttmp[10:11], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xcc,0xd1,0x7a,0x04,0x0e,0x04]
 
-v_log_legacy_f32_e64 v0, m0
-// CHECK: [0x00,0x00,0x8c,0xd1,0x7c,0x00,0x00,0x00]
+v_fma_f64 v[5:6], exec, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xcc,0xd1,0x7e,0x04,0x0e,0x04]
 
-v_log_legacy_f32_e64 v0, exec_lo
-// CHECK: [0x00,0x00,0x8c,0xd1,0x7e,0x00,0x00,0x00]
+v_fma_f64 v[5:6], scc, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xcc,0xd1,0xfd,0x04,0x0e,0x04]
 
-v_log_legacy_f32_e64 v0, exec_hi
-// CHECK: [0x00,0x00,0x8c,0xd1,0x7f,0x00,0x00,0x00]
+v_fma_f64 v[5:6], v[1:2], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xcc,0xd1,0x01,0x05,0x0e,0x04]
 
-v_log_legacy_f32_e64 v0, scc
-// CHECK: [0x00,0x00,0x8c,0xd1,0xfd,0x00,0x00,0x00]
+v_fma_f64 v[5:6], v[254:255], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xcc,0xd1,0xfe,0x05,0x0e,0x04]
 
-v_log_legacy_f32_e64 v0, v0
-// CHECK: [0x00,0x00,0x8c,0xd1,0x00,0x01,0x00,0x00]
+v_fma_f64 v[5:6], s[2:3], v[254:255], v[3:4]
+// CHECK: [0x05,0x00,0xcc,0xd1,0x02,0xfc,0x0f,0x04]
 
-v_log_legacy_f32_e64 v0, v255
-// CHECK: [0x00,0x00,0x8c,0xd1,0xff,0x01,0x00,0x00]
+v_fma_f64 v[5:6], s[2:3], v[2:3], v[254:255]
+// CHECK: [0x05,0x00,0xcc,0xd1,0x02,0x04,0xfa,0x07]
 
-v_log_legacy_f32_e64 v0, -s0
-// CHECK: [0x00,0x00,0x8c,0xd1,0x00,0x00,0x00,0x20]
+v_fma_f64 v[5:6], -s[2:3], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xcc,0xd1,0x02,0x04,0x0e,0x24]
 
-v_log_legacy_f32_e64 v0, |s0|
-// CHECK: [0x00,0x01,0x8c,0xd1,0x00,0x00,0x00,0x00]
+v_fma_f64 v[5:6], s[2:3], -v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xcc,0xd1,0x02,0x04,0x0e,0x44]
 
-v_log_legacy_f32_e64 v0, s0 clamp
-// CHECK: [0x00,0x80,0x8c,0xd1,0x00,0x00,0x00,0x00]
+v_fma_f64 v[5:6], s[2:3], v[2:3], -v[3:4]
+// CHECK: [0x05,0x00,0xcc,0xd1,0x02,0x04,0x0e,0x84]
 
-v_log_legacy_f32_e64 v0, s0 mul:2
-// CHECK: [0x00,0x00,0x8c,0xd1,0x00,0x00,0x00,0x08]
+v_fma_f64 v[5:6], -s[2:3], -v[2:3], -v[3:4]
+// CHECK: [0x05,0x00,0xcc,0xd1,0x02,0x04,0x0e,0xe4]
 
-v_log_legacy_f32_e64 v0, s0 mul:4
-// CHECK: [0x00,0x00,0x8c,0xd1,0x00,0x00,0x00,0x10]
+v_fma_f64 v[5:6], |s[2:3]|, v[2:3], v[3:4]
+// CHECK: [0x05,0x01,0xcc,0xd1,0x02,0x04,0x0e,0x04]
 
-v_log_legacy_f32_e64 v0, s0 div:2
-// CHECK: [0x00,0x00,0x8c,0xd1,0x00,0x00,0x00,0x18]
+v_fma_f64 v[5:6], s[2:3], |v[2:3]|, v[3:4]
+// CHECK: [0x05,0x02,0xcc,0xd1,0x02,0x04,0x0e,0x04]
 
-v_cndmask_b32 v0, vcc_lo, v0, vcc
-// CHECK: [0x6a,0x00,0x00,0x00]
+v_fma_f64 v[5:6], s[2:3], v[2:3], |v[3:4]|
+// CHECK: [0x05,0x04,0xcc,0xd1,0x02,0x04,0x0e,0x04]
 
-v_cndmask_b32 v255, vcc_lo, v0, vcc
-// CHECK: [0x6a,0x00,0xfe,0x01]
+v_fma_f64 v[5:6], |s[2:3]|, |v[2:3]|, |v[3:4]|
+// CHECK: [0x05,0x07,0xcc,0xd1,0x02,0x04,0x0e,0x04]
 
-v_cndmask_b32 v0, vcc_hi, v0, vcc
-// CHECK: [0x6b,0x00,0x00,0x00]
+v_fma_f64 v[5:6], s[2:3], v[2:3], v[3:4] clamp
+// CHECK: [0x05,0x80,0xcc,0xd1,0x02,0x04,0x0e,0x04]
 
-v_cndmask_b32 v0, 0, v0, vcc
-// CHECK: [0x80,0x00,0x00,0x00]
+v_fma_f64 v[5:6], s[2:3], v[2:3], v[3:4] mul:2
+// CHECK: [0x05,0x00,0xcc,0xd1,0x02,0x04,0x0e,0x0c]
 
-v_cndmask_b32 v0, -1, v0, vcc
-// CHECK: [0xc1,0x00,0x00,0x00]
+v_fma_f64 v[5:6], s[2:3], v[2:3], v[3:4] mul:4
+// CHECK: [0x05,0x00,0xcc,0xd1,0x02,0x04,0x0e,0x14]
 
-v_cndmask_b32 v0, 0.5, v0, vcc
-// CHECK: [0xf0,0x00,0x00,0x00]
+v_fma_f64 v[5:6], s[2:3], v[2:3], v[3:4] div:2
+// CHECK: [0x05,0x00,0xcc,0xd1,0x02,0x04,0x0e,0x1c]
 
-v_cndmask_b32 v0, -4.0, v0, vcc
-// CHECK: [0xf7,0x00,0x00,0x00]
+v_lerp_u8 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x01,0x00,0x01,0x02]
 
-v_cndmask_b32 v0, v0, v0, vcc
-// CHECK: [0x00,0x01,0x00,0x00]
+v_lerp_u8 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xcd,0xd1,0x01,0x00,0x01,0x02]
 
-v_cndmask_b32 v0, v255, v0, vcc
-// CHECK: [0xff,0x01,0x00,0x00]
+v_lerp_u8 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x65,0x00,0x01,0x02]
 
-v_cndmask_b32 v0, vcc_lo, v255, vcc
-// CHECK: [0x6a,0xfe,0x01,0x00]
+v_lerp_u8 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x66,0x00,0x01,0x02]
 
-v_cndmask_b32_e64 v0, s0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd1,0x00,0x00,0x00,0x00]
+v_lerp_u8 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x67,0x00,0x01,0x02]
 
-v_cndmask_b32_e64 v255, s0, s0, s[0:1]
-// CHECK: [0xff,0x00,0x00,0xd1,0x00,0x00,0x00,0x00]
+v_lerp_u8 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x6a,0x00,0x01,0x02]
 
-v_cndmask_b32_e64 v0, 0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd1,0x80,0x00,0x00,0x00]
+v_lerp_u8 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x6b,0x00,0x01,0x02]
 
-v_cndmask_b32_e64 v0, -1, s0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd1,0xc1,0x00,0x00,0x00]
+v_lerp_u8 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x6c,0x00,0x01,0x02]
 
-v_cndmask_b32_e64 v0, 0.5, s0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd1,0xf0,0x00,0x00,0x00]
+v_lerp_u8 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x6d,0x00,0x01,0x02]
 
-v_cndmask_b32_e64 v0, -4.0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd1,0xf7,0x00,0x00,0x00]
+v_lerp_u8 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x6e,0x00,0x01,0x02]
 
-v_cndmask_b32_e64 v0, v0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd1,0x00,0x01,0x00,0x00]
+v_lerp_u8 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x6f,0x00,0x01,0x02]
 
-v_cndmask_b32_e64 v0, v255, s0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd1,0xff,0x01,0x00,0x00]
+v_lerp_u8 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x7b,0x00,0x01,0x02]
 
-v_cndmask_b32_e64 v0, s0, 0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd1,0x00,0x00,0x01,0x00]
+v_lerp_u8 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x7c,0x00,0x01,0x02]
 
-v_cndmask_b32_e64 v0, s0, -1, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd1,0x00,0x82,0x01,0x00]
+v_lerp_u8 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x7e,0x00,0x01,0x02]
 
-v_cndmask_b32_e64 v0, s0, 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd1,0x00,0xe0,0x01,0x00]
+v_lerp_u8 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x7f,0x00,0x01,0x02]
 
-v_cndmask_b32_e64 v0, s0, -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd1,0x00,0xee,0x01,0x00]
+v_lerp_u8 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x80,0x00,0x01,0x02]
 
-v_cndmask_b32_e64 v0, s0, v0, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd1,0x00,0x00,0x02,0x00]
+v_lerp_u8 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0xc1,0x00,0x01,0x02]
 
-v_cndmask_b32_e64 v0, s0, v255, s[0:1]
-// CHECK: [0x00,0x00,0x00,0xd1,0x00,0xfe,0x03,0x00]
+v_lerp_u8 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0xf0,0x00,0x01,0x02]
 
-v_add_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x02]
+v_lerp_u8 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0xf7,0x00,0x01,0x02]
 
-v_add_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x03]
+v_lerp_u8 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x01,0x01,0x01,0x02]
 
-v_add_f32 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x02]
+v_lerp_u8 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0xff,0x01,0x01,0x02]
 
-v_add_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x02]
+v_lerp_u8 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x01,0x82,0x01,0x02]
 
-v_add_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x02]
+v_lerp_u8 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x01,0xe0,0x01,0x02]
 
-v_add_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x02]
+v_lerp_u8 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x01,0xee,0x01,0x02]
 
-v_add_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x02]
+v_lerp_u8 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x01,0x04,0x02,0x02]
 
-v_add_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x02]
+v_lerp_u8 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x01,0xfe,0x03,0x02]
 
-v_add_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x02]
+v_lerp_u8 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xcd,0xd1,0x01,0x00,0x05,0x03]
 
-v_add_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x02]
+v_lerp_u8 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xcd,0xd1,0x01,0x00,0xc1,0x03]
 
-v_add_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x02]
+v_lerp_u8 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xcd,0xd1,0x01,0x00,0xdd,0x03]
 
-v_add_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x02]
+v_lerp_u8 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xcd,0xd1,0x01,0x00,0x0d,0x04]
 
-v_add_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x02]
+v_lerp_u8 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xcd,0xd1,0x01,0x00,0xfd,0x07]
 
-v_add_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x02]
+v_alignbit_b32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x01,0x00,0x01,0x02]
 
-v_add_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x02]
+v_alignbit_b32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xce,0xd1,0x01,0x00,0x01,0x02]
 
-v_add_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x02]
+v_alignbit_b32 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x65,0x00,0x01,0x02]
 
-v_add_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x02]
+v_alignbit_b32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x66,0x00,0x01,0x02]
 
-v_add_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x02]
+v_alignbit_b32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x67,0x00,0x01,0x02]
 
-v_add_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x02]
+v_alignbit_b32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x6a,0x00,0x01,0x02]
 
-v_add_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x02,0x56,0x34,0x12,0xaf]
+v_alignbit_b32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x6b,0x00,0x01,0x02]
 
-v_add_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x02,0x73,0x72,0x71,0x3f]
+v_alignbit_b32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x6c,0x00,0x01,0x02]
 
-v_add_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x02]
+v_alignbit_b32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x6d,0x00,0x01,0x02]
 
-v_add_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x02]
+v_alignbit_b32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x6e,0x00,0x01,0x02]
 
-v_add_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x02]
+v_alignbit_b32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x6f,0x00,0x01,0x02]
 
-v_add_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x01,0xd1,0x00,0x00,0x00,0x00]
+v_alignbit_b32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x7b,0x00,0x01,0x02]
 
-v_add_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x01,0xd1,0x00,0x00,0x00,0x00]
+v_alignbit_b32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x7c,0x00,0x01,0x02]
 
-v_add_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x01,0xd1,0xfd,0x00,0x00,0x00]
+v_alignbit_b32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x7e,0x00,0x01,0x02]
 
-v_add_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x01,0xd1,0x00,0x01,0x00,0x00]
+v_alignbit_b32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x7f,0x00,0x01,0x02]
 
-v_add_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x01,0xd1,0xff,0x01,0x00,0x00]
+v_alignbit_b32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x80,0x00,0x01,0x02]
 
-v_add_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x01,0xd1,0x00,0xfa,0x01,0x00]
+v_alignbit_b32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0xc1,0x00,0x01,0x02]
 
-v_add_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x01,0xd1,0x00,0x00,0x02,0x00]
+v_alignbit_b32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0xf0,0x00,0x01,0x02]
 
-v_add_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x01,0xd1,0x00,0xfe,0x03,0x00]
+v_alignbit_b32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0xf7,0x00,0x01,0x02]
 
-v_add_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x01,0xd1,0x00,0x00,0x00,0x20]
+v_alignbit_b32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x01,0x01,0x01,0x02]
 
-v_add_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x01,0xd1,0x00,0x00,0x00,0x40]
+v_alignbit_b32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0xff,0x01,0x01,0x02]
 
-v_add_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x01,0xd1,0x00,0x00,0x00,0x60]
+v_alignbit_b32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x01,0x82,0x01,0x02]
 
-v_add_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x01,0xd1,0x00,0x00,0x00,0x00]
+v_alignbit_b32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x01,0xe0,0x01,0x02]
 
-v_add_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x01,0xd1,0x00,0x00,0x00,0x00]
+v_alignbit_b32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x01,0xee,0x01,0x02]
 
-v_add_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x01,0xd1,0x00,0x00,0x00,0x00]
+v_alignbit_b32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x01,0x04,0x02,0x02]
 
-v_add_f32_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x01,0xd1,0x00,0x00,0x00,0x00]
+v_alignbit_b32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xce,0xd1,0x01,0xfe,0x03,0x02]
 
-v_add_f32_e64 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x01,0xd1,0x00,0x00,0x00,0x08]
+v_alignbit_b32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xce,0xd1,0x01,0x00,0x05,0x03]
 
-v_add_f32_e64 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x01,0xd1,0x00,0x00,0x00,0x10]
+v_alignbit_b32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xce,0xd1,0x01,0x00,0xc1,0x03]
 
-v_add_f32_e64 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x01,0xd1,0x00,0x00,0x00,0x18]
+v_alignbit_b32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xce,0xd1,0x01,0x00,0xdd,0x03]
 
-v_sub_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x04]
+v_alignbit_b32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xce,0xd1,0x01,0x00,0x0d,0x04]
 
-v_sub_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x05]
+v_alignbit_b32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xce,0xd1,0x01,0x00,0xfd,0x07]
 
-v_sub_f32 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x04]
+v_alignbyte_b32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x00,0x01,0x02]
 
-v_sub_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x04]
+v_alignbyte_b32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xcf,0xd1,0x01,0x00,0x01,0x02]
 
-v_sub_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x04]
+v_alignbyte_b32 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x65,0x00,0x01,0x02]
 
-v_sub_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x04]
+v_alignbyte_b32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x66,0x00,0x01,0x02]
 
-v_sub_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x04]
+v_alignbyte_b32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x67,0x00,0x01,0x02]
 
-v_sub_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x04]
+v_alignbyte_b32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x6a,0x00,0x01,0x02]
 
-v_sub_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x04]
+v_alignbyte_b32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x6b,0x00,0x01,0x02]
 
-v_sub_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x04]
+v_alignbyte_b32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x6c,0x00,0x01,0x02]
 
-v_sub_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x04]
+v_alignbyte_b32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x6d,0x00,0x01,0x02]
 
-v_sub_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x04]
+v_alignbyte_b32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x6e,0x00,0x01,0x02]
 
-v_sub_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x04]
+v_alignbyte_b32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x6f,0x00,0x01,0x02]
 
-v_sub_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x04]
+v_alignbyte_b32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x7b,0x00,0x01,0x02]
 
-v_sub_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x04]
+v_alignbyte_b32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x7c,0x00,0x01,0x02]
 
-v_sub_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x04]
+v_alignbyte_b32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x7e,0x00,0x01,0x02]
 
-v_sub_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x04]
+v_alignbyte_b32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x7f,0x00,0x01,0x02]
 
-v_sub_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x04]
+v_alignbyte_b32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x80,0x00,0x01,0x02]
 
-v_sub_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x04]
+v_alignbyte_b32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0xc1,0x00,0x01,0x02]
 
-v_sub_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x04,0x56,0x34,0x12,0xaf]
+v_alignbyte_b32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0xf0,0x00,0x01,0x02]
 
-v_sub_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x04,0x73,0x72,0x71,0x3f]
+v_alignbyte_b32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0xf7,0x00,0x01,0x02]
 
-v_sub_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x04]
+v_alignbyte_b32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x01,0x01,0x02]
 
-v_sub_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x04]
+v_alignbyte_b32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0xff,0x01,0x01,0x02]
 
-v_sub_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x04]
+v_alignbyte_b32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x82,0x01,0x02]
 
-v_sub_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x02,0xd1,0x00,0x00,0x00,0x00]
+v_alignbyte_b32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x01,0xe0,0x01,0x02]
 
-v_sub_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x02,0xd1,0x00,0x00,0x00,0x00]
+v_alignbyte_b32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x01,0xee,0x01,0x02]
 
-v_sub_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x02,0xd1,0xfd,0x00,0x00,0x00]
+v_alignbyte_b32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x04,0x02,0x02]
 
-v_sub_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x02,0xd1,0x00,0x01,0x00,0x00]
+v_alignbyte_b32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x01,0xfe,0x03,0x02]
 
-v_sub_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x02,0xd1,0xff,0x01,0x00,0x00]
+v_alignbyte_b32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x00,0x05,0x03]
 
-v_sub_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x02,0xd1,0x00,0xfa,0x01,0x00]
+v_alignbyte_b32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x00,0xc1,0x03]
 
-v_sub_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x02,0xd1,0x00,0x00,0x02,0x00]
+v_alignbyte_b32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x00,0xdd,0x03]
 
-v_sub_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x02,0xd1,0x00,0xfe,0x03,0x00]
+v_alignbyte_b32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x00,0x0d,0x04]
 
-v_sub_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x02,0xd1,0x00,0x00,0x00,0x20]
+v_alignbyte_b32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xcf,0xd1,0x01,0x00,0xfd,0x07]
 
-v_sub_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x02,0xd1,0x00,0x00,0x00,0x40]
+v_min3_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x01,0x04,0x0e,0x04]
 
-v_sub_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x02,0xd1,0x00,0x00,0x00,0x60]
+v_min3_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0xd0,0xd1,0x01,0x04,0x0e,0x04]
 
-v_sub_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x02,0xd1,0x00,0x00,0x00,0x00]
+v_min3_f32 v5, s101, v2, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x65,0x04,0x0e,0x04]
 
-v_sub_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x02,0xd1,0x00,0x00,0x00,0x00]
+v_min3_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x66,0x04,0x0e,0x04]
 
-v_sub_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x02,0xd1,0x00,0x00,0x00,0x00]
+v_min3_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x67,0x04,0x0e,0x04]
 
-v_sub_f32_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x02,0xd1,0x00,0x00,0x00,0x00]
+v_min3_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x6a,0x04,0x0e,0x04]
 
-v_sub_f32_e64 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x02,0xd1,0x00,0x00,0x00,0x08]
+v_min3_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x6b,0x04,0x0e,0x04]
 
-v_sub_f32_e64 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x02,0xd1,0x00,0x00,0x00,0x10]
+v_min3_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x6c,0x04,0x0e,0x04]
 
-v_sub_f32_e64 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x02,0xd1,0x00,0x00,0x00,0x18]
+v_min3_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x6d,0x04,0x0e,0x04]
 
-v_subrev_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x06]
+v_min3_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x6e,0x04,0x0e,0x04]
 
-v_subrev_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x07]
+v_min3_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x6f,0x04,0x0e,0x04]
 
-v_subrev_f32 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x06]
+v_min3_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x7b,0x04,0x0e,0x04]
 
-v_subrev_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x06]
+v_min3_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x7c,0x04,0x0e,0x04]
 
-v_subrev_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x06]
+v_min3_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x7e,0x04,0x0e,0x04]
 
-v_subrev_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x06]
+v_min3_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x7f,0x04,0x0e,0x04]
 
-v_subrev_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x06]
+v_min3_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0xfd,0x04,0x0e,0x04]
 
-v_subrev_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x06]
+v_min3_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x01,0x05,0x0e,0x04]
 
-v_subrev_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x06]
+v_min3_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0xff,0x05,0x0e,0x04]
 
-v_subrev_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x06]
+v_min3_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x01,0xfe,0x0f,0x04]
 
-v_subrev_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x06]
+v_min3_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0xd0,0xd1,0x01,0x04,0xfe,0x07]
 
-v_subrev_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x06]
+v_min3_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x01,0x04,0x0e,0x24]
 
-v_subrev_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x06]
+v_min3_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x01,0x04,0x0e,0x44]
 
-v_subrev_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x06]
+v_min3_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x01,0x04,0x0e,0x84]
 
-v_subrev_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x06]
+v_min3_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0xd0,0xd1,0x01,0x04,0x0e,0xe4]
 
-v_subrev_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x06]
+v_min3_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0xd0,0xd1,0x01,0x04,0x0e,0x04]
 
-v_subrev_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x06]
+v_min3_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0xd0,0xd1,0x01,0x04,0x0e,0x04]
 
-v_subrev_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x06]
+v_min3_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0xd0,0xd1,0x01,0x04,0x0e,0x04]
 
-v_subrev_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x06]
+v_min3_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xd0,0xd1,0x01,0x04,0x0e,0x04]
 
-v_subrev_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x06,0x56,0x34,0x12,0xaf]
+v_min3_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x80,0xd0,0xd1,0x01,0x04,0x0e,0x04]
 
-v_subrev_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x06,0x73,0x72,0x71,0x3f]
+v_min3_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0xd0,0xd1,0x01,0x04,0x0e,0x0c]
 
-v_subrev_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x06]
+v_min3_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0xd0,0xd1,0x01,0x04,0x0e,0x14]
 
-v_subrev_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x06]
+v_min3_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0xd0,0xd1,0x01,0x04,0x0e,0x1c]
 
-v_subrev_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x06]
+v_min3_i32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x01,0x00,0x01,0x02]
 
-v_subrev_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x03,0xd1,0x00,0x00,0x00,0x00]
+v_min3_i32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xd1,0xd1,0x01,0x00,0x01,0x02]
 
-v_subrev_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x03,0xd1,0x00,0x00,0x00,0x00]
+v_min3_i32 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x65,0x00,0x01,0x02]
 
-v_subrev_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x03,0xd1,0xfd,0x00,0x00,0x00]
+v_min3_i32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x66,0x00,0x01,0x02]
 
-v_subrev_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x03,0xd1,0x00,0x01,0x00,0x00]
+v_min3_i32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x67,0x00,0x01,0x02]
 
-v_subrev_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x03,0xd1,0xff,0x01,0x00,0x00]
+v_min3_i32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x6a,0x00,0x01,0x02]
 
-v_subrev_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x03,0xd1,0x00,0xfa,0x01,0x00]
+v_min3_i32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x6b,0x00,0x01,0x02]
 
-v_subrev_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x03,0xd1,0x00,0x00,0x02,0x00]
+v_min3_i32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x6c,0x00,0x01,0x02]
 
-v_subrev_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x03,0xd1,0x00,0xfe,0x03,0x00]
+v_min3_i32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x6d,0x00,0x01,0x02]
 
-v_subrev_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x03,0xd1,0x00,0x00,0x00,0x20]
+v_min3_i32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x6e,0x00,0x01,0x02]
 
-v_subrev_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x03,0xd1,0x00,0x00,0x00,0x40]
+v_min3_i32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x6f,0x00,0x01,0x02]
 
-v_subrev_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x03,0xd1,0x00,0x00,0x00,0x60]
+v_min3_i32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x7b,0x00,0x01,0x02]
 
-v_subrev_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x03,0xd1,0x00,0x00,0x00,0x00]
+v_min3_i32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x7c,0x00,0x01,0x02]
 
-v_subrev_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x03,0xd1,0x00,0x00,0x00,0x00]
+v_min3_i32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x7e,0x00,0x01,0x02]
 
-v_subrev_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x03,0xd1,0x00,0x00,0x00,0x00]
+v_min3_i32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x7f,0x00,0x01,0x02]
 
-v_subrev_f32_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x03,0xd1,0x00,0x00,0x00,0x00]
+v_min3_i32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x80,0x00,0x01,0x02]
 
-v_subrev_f32_e64 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x03,0xd1,0x00,0x00,0x00,0x08]
+v_min3_i32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0xc1,0x00,0x01,0x02]
 
-v_subrev_f32_e64 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x03,0xd1,0x00,0x00,0x00,0x10]
+v_min3_i32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0xf0,0x00,0x01,0x02]
 
-v_subrev_f32_e64 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x03,0xd1,0x00,0x00,0x00,0x18]
+v_min3_i32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0xf7,0x00,0x01,0x02]
 
-v_mul_legacy_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x08]
+v_min3_i32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x01,0x01,0x01,0x02]
 
-v_mul_legacy_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x09]
+v_min3_i32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0xff,0x01,0x01,0x02]
 
-v_mul_legacy_f32 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x08]
+v_min3_i32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x01,0x82,0x01,0x02]
 
-v_mul_legacy_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x08]
+v_min3_i32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x01,0xe0,0x01,0x02]
 
-v_mul_legacy_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x08]
+v_min3_i32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x01,0xee,0x01,0x02]
 
-v_mul_legacy_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x08]
+v_min3_i32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x01,0x04,0x02,0x02]
 
-v_mul_legacy_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x08]
+v_min3_i32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x01,0xfe,0x03,0x02]
 
-v_mul_legacy_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x08]
+v_min3_i32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xd1,0xd1,0x01,0x00,0x05,0x03]
 
-v_mul_legacy_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x08]
+v_min3_i32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xd1,0xd1,0x01,0x00,0xc1,0x03]
 
-v_mul_legacy_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x08]
+v_min3_i32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xd1,0xd1,0x01,0x00,0xdd,0x03]
 
-v_mul_legacy_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x08]
+v_min3_i32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xd1,0xd1,0x01,0x00,0x0d,0x04]
 
-v_mul_legacy_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x08]
+v_min3_i32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xd1,0xd1,0x01,0x00,0xfd,0x07]
 
-v_mul_legacy_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x08]
+v_min3_u32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x01,0x00,0x01,0x02]
 
-v_mul_legacy_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x08]
+v_min3_u32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xd2,0xd1,0x01,0x00,0x01,0x02]
 
-v_mul_legacy_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x08]
+v_min3_u32 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x65,0x00,0x01,0x02]
 
-v_mul_legacy_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x08]
+v_min3_u32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x66,0x00,0x01,0x02]
 
-v_mul_legacy_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x08]
+v_min3_u32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x67,0x00,0x01,0x02]
 
-v_mul_legacy_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x08]
+v_min3_u32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x6a,0x00,0x01,0x02]
 
-v_mul_legacy_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x08]
+v_min3_u32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x6b,0x00,0x01,0x02]
 
-v_mul_legacy_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x08,0x56,0x34,0x12,0xaf]
+v_min3_u32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x6c,0x00,0x01,0x02]
 
-v_mul_legacy_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x08,0x73,0x72,0x71,0x3f]
+v_min3_u32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x6d,0x00,0x01,0x02]
 
-v_mul_legacy_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x08]
+v_min3_u32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x6e,0x00,0x01,0x02]
 
-v_mul_legacy_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x08]
+v_min3_u32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x6f,0x00,0x01,0x02]
 
-v_mul_legacy_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x08]
+v_min3_u32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x7b,0x00,0x01,0x02]
 
-v_mul_legacy_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x04,0xd1,0x00,0x00,0x00,0x00]
+v_min3_u32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x7c,0x00,0x01,0x02]
 
-v_mul_legacy_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x04,0xd1,0x00,0x00,0x00,0x00]
+v_min3_u32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x7e,0x00,0x01,0x02]
 
-v_mul_legacy_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x04,0xd1,0xfd,0x00,0x00,0x00]
+v_min3_u32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x7f,0x00,0x01,0x02]
 
-v_mul_legacy_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x04,0xd1,0x00,0x01,0x00,0x00]
+v_min3_u32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x80,0x00,0x01,0x02]
 
-v_mul_legacy_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x04,0xd1,0xff,0x01,0x00,0x00]
+v_min3_u32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0xc1,0x00,0x01,0x02]
 
-v_mul_legacy_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x04,0xd1,0x00,0xfa,0x01,0x00]
+v_min3_u32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0xf0,0x00,0x01,0x02]
 
-v_mul_legacy_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x04,0xd1,0x00,0x00,0x02,0x00]
+v_min3_u32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0xf7,0x00,0x01,0x02]
 
-v_mul_legacy_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x04,0xd1,0x00,0xfe,0x03,0x00]
+v_min3_u32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x01,0x01,0x01,0x02]
 
-v_mul_legacy_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x04,0xd1,0x00,0x00,0x00,0x20]
+v_min3_u32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0xff,0x01,0x01,0x02]
 
-v_mul_legacy_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x04,0xd1,0x00,0x00,0x00,0x40]
+v_min3_u32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x01,0x82,0x01,0x02]
 
-v_mul_legacy_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x04,0xd1,0x00,0x00,0x00,0x60]
+v_min3_u32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x01,0xe0,0x01,0x02]
 
-v_mul_legacy_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x04,0xd1,0x00,0x00,0x00,0x00]
+v_min3_u32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x01,0xee,0x01,0x02]
 
-v_mul_legacy_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x04,0xd1,0x00,0x00,0x00,0x00]
+v_min3_u32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x01,0x04,0x02,0x02]
 
-v_mul_legacy_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x04,0xd1,0x00,0x00,0x00,0x00]
+v_min3_u32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x01,0xfe,0x03,0x02]
 
-v_mul_legacy_f32_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x04,0xd1,0x00,0x00,0x00,0x00]
+v_min3_u32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xd2,0xd1,0x01,0x00,0x05,0x03]
 
-v_mul_legacy_f32_e64 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x04,0xd1,0x00,0x00,0x00,0x08]
+v_min3_u32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xd2,0xd1,0x01,0x00,0xc1,0x03]
 
-v_mul_legacy_f32_e64 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x04,0xd1,0x00,0x00,0x00,0x10]
+v_min3_u32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xd2,0xd1,0x01,0x00,0xdd,0x03]
 
-v_mul_legacy_f32_e64 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x04,0xd1,0x00,0x00,0x00,0x18]
+v_min3_u32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xd2,0xd1,0x01,0x00,0x0d,0x04]
 
-v_mul_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x0a]
+v_min3_u32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xd2,0xd1,0x01,0x00,0xfd,0x07]
 
-v_mul_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x0b]
+v_max3_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x01,0x04,0x0e,0x04]
 
-v_mul_f32 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x0a]
+v_max3_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0xd3,0xd1,0x01,0x04,0x0e,0x04]
 
-v_mul_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x0a]
+v_max3_f32 v5, s101, v2, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x65,0x04,0x0e,0x04]
 
-v_mul_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x0a]
+v_max3_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x66,0x04,0x0e,0x04]
 
-v_mul_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x0a]
+v_max3_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x67,0x04,0x0e,0x04]
 
-v_mul_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x0a]
+v_max3_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x6a,0x04,0x0e,0x04]
 
-v_mul_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x0a]
+v_max3_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x6b,0x04,0x0e,0x04]
 
-v_mul_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x0a]
+v_max3_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x6c,0x04,0x0e,0x04]
 
-v_mul_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x0a]
+v_max3_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x6d,0x04,0x0e,0x04]
 
-v_mul_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x0a]
+v_max3_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x6e,0x04,0x0e,0x04]
 
-v_mul_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x0a]
+v_max3_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x6f,0x04,0x0e,0x04]
 
-v_mul_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x0a]
+v_max3_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x7b,0x04,0x0e,0x04]
 
-v_mul_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x0a]
+v_max3_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x7c,0x04,0x0e,0x04]
 
-v_mul_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x0a]
+v_max3_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x7e,0x04,0x0e,0x04]
 
-v_mul_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x0a]
+v_max3_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x7f,0x04,0x0e,0x04]
 
-v_mul_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x0a]
+v_max3_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0xfd,0x04,0x0e,0x04]
 
-v_mul_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x0a]
+v_max3_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x01,0x05,0x0e,0x04]
 
-v_mul_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x0a]
+v_max3_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0xff,0x05,0x0e,0x04]
 
-v_mul_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x0a,0x56,0x34,0x12,0xaf]
+v_max3_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x01,0xfe,0x0f,0x04]
 
-v_mul_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x0a,0x73,0x72,0x71,0x3f]
+v_max3_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0xd3,0xd1,0x01,0x04,0xfe,0x07]
 
-v_mul_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x0a]
+v_max3_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x01,0x04,0x0e,0x24]
 
-v_mul_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x0a]
+v_max3_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x01,0x04,0x0e,0x44]
 
-v_mul_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x0a]
+v_max3_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x01,0x04,0x0e,0x84]
 
-v_mul_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x05,0xd1,0x00,0x00,0x00,0x00]
+v_max3_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0xd3,0xd1,0x01,0x04,0x0e,0xe4]
 
-v_mul_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x05,0xd1,0x00,0x00,0x00,0x00]
+v_max3_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0xd3,0xd1,0x01,0x04,0x0e,0x04]
 
-v_mul_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x05,0xd1,0xfd,0x00,0x00,0x00]
+v_max3_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0xd3,0xd1,0x01,0x04,0x0e,0x04]
 
-v_mul_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x05,0xd1,0x00,0x01,0x00,0x00]
+v_max3_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0xd3,0xd1,0x01,0x04,0x0e,0x04]
 
-v_mul_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x05,0xd1,0xff,0x01,0x00,0x00]
+v_max3_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xd3,0xd1,0x01,0x04,0x0e,0x04]
 
-v_mul_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x05,0xd1,0x00,0xfa,0x01,0x00]
+v_max3_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x80,0xd3,0xd1,0x01,0x04,0x0e,0x04]
 
-v_mul_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x05,0xd1,0x00,0x00,0x02,0x00]
+v_max3_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0xd3,0xd1,0x01,0x04,0x0e,0x0c]
 
-v_mul_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x05,0xd1,0x00,0xfe,0x03,0x00]
+v_max3_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0xd3,0xd1,0x01,0x04,0x0e,0x14]
 
-v_mul_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x05,0xd1,0x00,0x00,0x00,0x20]
+v_max3_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0xd3,0xd1,0x01,0x04,0x0e,0x1c]
 
-v_mul_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x05,0xd1,0x00,0x00,0x00,0x40]
+v_max3_i32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x01,0x00,0x01,0x02]
 
-v_mul_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x05,0xd1,0x00,0x00,0x00,0x60]
+v_max3_i32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xd4,0xd1,0x01,0x00,0x01,0x02]
 
-v_mul_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x05,0xd1,0x00,0x00,0x00,0x00]
+v_max3_i32 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x65,0x00,0x01,0x02]
 
-v_mul_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x05,0xd1,0x00,0x00,0x00,0x00]
+v_max3_i32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x66,0x00,0x01,0x02]
 
-v_mul_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x05,0xd1,0x00,0x00,0x00,0x00]
+v_max3_i32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x67,0x00,0x01,0x02]
 
-v_mul_f32_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x05,0xd1,0x00,0x00,0x00,0x00]
+v_max3_i32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x6a,0x00,0x01,0x02]
 
-v_mul_f32_e64 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x05,0xd1,0x00,0x00,0x00,0x08]
+v_max3_i32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x6b,0x00,0x01,0x02]
 
-v_mul_f32_e64 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x05,0xd1,0x00,0x00,0x00,0x10]
+v_max3_i32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x6c,0x00,0x01,0x02]
 
-v_mul_f32_e64 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x05,0xd1,0x00,0x00,0x00,0x18]
+v_max3_i32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x6d,0x00,0x01,0x02]
 
-v_mul_i32_i24 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x0c]
+v_max3_i32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x6e,0x00,0x01,0x02]
 
-v_mul_i32_i24 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x0d]
+v_max3_i32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x6f,0x00,0x01,0x02]
 
-v_mul_i32_i24 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x0c]
+v_max3_i32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x7b,0x00,0x01,0x02]
 
-v_mul_i32_i24 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x0c]
+v_max3_i32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x7c,0x00,0x01,0x02]
 
-v_mul_i32_i24 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x0c]
+v_max3_i32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x7e,0x00,0x01,0x02]
 
-v_mul_i32_i24 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x0c]
+v_max3_i32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x7f,0x00,0x01,0x02]
 
-v_mul_i32_i24 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x0c]
+v_max3_i32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x80,0x00,0x01,0x02]
 
-v_mul_i32_i24 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x0c]
+v_max3_i32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0xc1,0x00,0x01,0x02]
 
-v_mul_i32_i24 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x0c]
+v_max3_i32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0xf0,0x00,0x01,0x02]
 
-v_mul_i32_i24 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x0c]
+v_max3_i32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0xf7,0x00,0x01,0x02]
 
-v_mul_i32_i24 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x0c]
+v_max3_i32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x01,0x01,0x01,0x02]
 
-v_mul_i32_i24 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x0c]
+v_max3_i32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0xff,0x01,0x01,0x02]
 
-v_mul_i32_i24 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x0c]
+v_max3_i32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x01,0x82,0x01,0x02]
 
-v_mul_i32_i24 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x0c]
+v_max3_i32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x01,0xe0,0x01,0x02]
 
-v_mul_i32_i24 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x0c]
+v_max3_i32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x01,0xee,0x01,0x02]
 
-v_mul_i32_i24 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x0c]
+v_max3_i32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x01,0x04,0x02,0x02]
 
-v_mul_i32_i24 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x0c]
+v_max3_i32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x01,0xfe,0x03,0x02]
 
-v_mul_i32_i24 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x0c]
+v_max3_i32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xd4,0xd1,0x01,0x00,0x05,0x03]
 
-v_mul_i32_i24 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x0c]
+v_max3_i32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xd4,0xd1,0x01,0x00,0xc1,0x03]
 
-v_mul_i32_i24 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x0c,0x56,0x34,0x12,0xaf]
+v_max3_i32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xd4,0xd1,0x01,0x00,0xdd,0x03]
 
-v_mul_i32_i24 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x0c,0x73,0x72,0x71,0x3f]
+v_max3_i32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xd4,0xd1,0x01,0x00,0x0d,0x04]
 
-v_mul_i32_i24 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x0c]
+v_max3_i32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xd4,0xd1,0x01,0x00,0xfd,0x07]
 
-v_mul_i32_i24 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x0c]
+v_max3_u32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x01,0x00,0x01,0x02]
 
-v_mul_i32_i24 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x0c]
+v_max3_u32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xd5,0xd1,0x01,0x00,0x01,0x02]
 
-v_mul_i32_i24_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x06,0xd1,0x00,0x00,0x00,0x00]
+v_max3_u32 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x65,0x00,0x01,0x02]
 
-v_mul_i32_i24_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x06,0xd1,0x00,0x00,0x00,0x00]
+v_max3_u32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x66,0x00,0x01,0x02]
 
-v_mul_i32_i24_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x06,0xd1,0x80,0x00,0x00,0x00]
+v_max3_u32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x67,0x00,0x01,0x02]
 
-v_mul_i32_i24_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x06,0xd1,0xc1,0x00,0x00,0x00]
+v_max3_u32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x6a,0x00,0x01,0x02]
 
-v_mul_i32_i24_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x06,0xd1,0xf0,0x00,0x00,0x00]
+v_max3_u32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x6b,0x00,0x01,0x02]
 
-v_mul_i32_i24_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x06,0xd1,0xf7,0x00,0x00,0x00]
+v_max3_u32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x6c,0x00,0x01,0x02]
 
-v_mul_i32_i24_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x06,0xd1,0x00,0x01,0x00,0x00]
+v_max3_u32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x6d,0x00,0x01,0x02]
 
-v_mul_i32_i24_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x06,0xd1,0xff,0x01,0x00,0x00]
+v_max3_u32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x6e,0x00,0x01,0x02]
 
-v_mul_i32_i24_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x06,0xd1,0x00,0x00,0x01,0x00]
+v_max3_u32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x6f,0x00,0x01,0x02]
 
-v_mul_i32_i24_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x06,0xd1,0x00,0x82,0x01,0x00]
+v_max3_u32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x7b,0x00,0x01,0x02]
 
-v_mul_i32_i24_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x06,0xd1,0x00,0xe0,0x01,0x00]
+v_max3_u32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x7c,0x00,0x01,0x02]
 
-v_mul_i32_i24_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x06,0xd1,0x00,0xee,0x01,0x00]
+v_max3_u32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x7e,0x00,0x01,0x02]
 
-v_mul_i32_i24_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x06,0xd1,0x00,0x00,0x02,0x00]
+v_max3_u32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x7f,0x00,0x01,0x02]
 
-v_mul_i32_i24_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x06,0xd1,0x00,0xfe,0x03,0x00]
+v_max3_u32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x80,0x00,0x01,0x02]
 
-v_mul_hi_i32_i24 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x0e]
+v_max3_u32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0xc1,0x00,0x01,0x02]
 
-v_mul_hi_i32_i24 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x0f]
+v_max3_u32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0xf0,0x00,0x01,0x02]
 
-v_mul_hi_i32_i24 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x0e]
+v_max3_u32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0xf7,0x00,0x01,0x02]
 
-v_mul_hi_i32_i24 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x0e]
+v_max3_u32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x01,0x01,0x01,0x02]
 
-v_mul_hi_i32_i24 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x0e]
+v_max3_u32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0xff,0x01,0x01,0x02]
 
-v_mul_hi_i32_i24 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x0e]
+v_max3_u32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x01,0x82,0x01,0x02]
 
-v_mul_hi_i32_i24 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x0e]
+v_max3_u32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x01,0xe0,0x01,0x02]
 
-v_mul_hi_i32_i24 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x0e]
+v_max3_u32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x01,0xee,0x01,0x02]
 
-v_mul_hi_i32_i24 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x0e]
+v_max3_u32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x01,0x04,0x02,0x02]
 
-v_mul_hi_i32_i24 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x0e]
+v_max3_u32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x01,0xfe,0x03,0x02]
 
-v_mul_hi_i32_i24 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x0e]
+v_max3_u32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xd5,0xd1,0x01,0x00,0x05,0x03]
 
-v_mul_hi_i32_i24 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x0e]
+v_max3_u32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xd5,0xd1,0x01,0x00,0xc1,0x03]
 
-v_mul_hi_i32_i24 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x0e]
+v_max3_u32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xd5,0xd1,0x01,0x00,0xdd,0x03]
 
-v_mul_hi_i32_i24 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x0e]
+v_max3_u32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xd5,0xd1,0x01,0x00,0x0d,0x04]
 
-v_mul_hi_i32_i24 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x0e]
+v_max3_u32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xd5,0xd1,0x01,0x00,0xfd,0x07]
 
-v_mul_hi_i32_i24 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x0e]
+v_med3_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x01,0x04,0x0e,0x04]
 
-v_mul_hi_i32_i24 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x0e]
+v_med3_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0xd6,0xd1,0x01,0x04,0x0e,0x04]
 
-v_mul_hi_i32_i24 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x0e]
+v_med3_f32 v5, s101, v2, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x65,0x04,0x0e,0x04]
 
-v_mul_hi_i32_i24 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x0e]
+v_med3_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x66,0x04,0x0e,0x04]
 
-v_mul_hi_i32_i24 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x0e,0x56,0x34,0x12,0xaf]
+v_med3_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x67,0x04,0x0e,0x04]
 
-v_mul_hi_i32_i24 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x0e,0x73,0x72,0x71,0x3f]
+v_med3_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x6a,0x04,0x0e,0x04]
 
-v_mul_hi_i32_i24 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x0e]
+v_med3_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x6b,0x04,0x0e,0x04]
 
-v_mul_hi_i32_i24 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x0e]
+v_med3_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x6c,0x04,0x0e,0x04]
 
-v_mul_hi_i32_i24 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x0e]
+v_med3_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x6d,0x04,0x0e,0x04]
 
-v_mul_hi_i32_i24_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x07,0xd1,0x00,0x00,0x00,0x00]
+v_med3_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x6e,0x04,0x0e,0x04]
 
-v_mul_hi_i32_i24_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x07,0xd1,0x00,0x00,0x00,0x00]
+v_med3_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x6f,0x04,0x0e,0x04]
 
-v_mul_hi_i32_i24_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x07,0xd1,0x80,0x00,0x00,0x00]
+v_med3_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x7b,0x04,0x0e,0x04]
 
-v_mul_hi_i32_i24_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x07,0xd1,0xc1,0x00,0x00,0x00]
+v_med3_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x7c,0x04,0x0e,0x04]
 
-v_mul_hi_i32_i24_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x07,0xd1,0xf0,0x00,0x00,0x00]
+v_med3_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x7e,0x04,0x0e,0x04]
 
-v_mul_hi_i32_i24_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x07,0xd1,0xf7,0x00,0x00,0x00]
+v_med3_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x7f,0x04,0x0e,0x04]
 
-v_mul_hi_i32_i24_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x07,0xd1,0x00,0x01,0x00,0x00]
+v_med3_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0xfd,0x04,0x0e,0x04]
 
-v_mul_hi_i32_i24_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x07,0xd1,0xff,0x01,0x00,0x00]
+v_med3_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x01,0x05,0x0e,0x04]
 
-v_mul_hi_i32_i24_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x07,0xd1,0x00,0x00,0x01,0x00]
+v_med3_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0xff,0x05,0x0e,0x04]
 
-v_mul_hi_i32_i24_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x07,0xd1,0x00,0x82,0x01,0x00]
+v_med3_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x01,0xfe,0x0f,0x04]
 
-v_mul_hi_i32_i24_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x07,0xd1,0x00,0xe0,0x01,0x00]
+v_med3_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0xd6,0xd1,0x01,0x04,0xfe,0x07]
 
-v_mul_hi_i32_i24_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x07,0xd1,0x00,0xee,0x01,0x00]
+v_med3_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x01,0x04,0x0e,0x24]
 
-v_mul_hi_i32_i24_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x07,0xd1,0x00,0x00,0x02,0x00]
+v_med3_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x01,0x04,0x0e,0x44]
 
-v_mul_hi_i32_i24_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x07,0xd1,0x00,0xfe,0x03,0x00]
+v_med3_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x01,0x04,0x0e,0x84]
 
-v_mul_u32_u24 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x10]
+v_med3_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0xd6,0xd1,0x01,0x04,0x0e,0xe4]
 
-v_mul_u32_u24 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x11]
+v_med3_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0xd6,0xd1,0x01,0x04,0x0e,0x04]
 
-v_mul_u32_u24 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x10]
+v_med3_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0xd6,0xd1,0x01,0x04,0x0e,0x04]
 
-v_mul_u32_u24 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x10]
+v_med3_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0xd6,0xd1,0x01,0x04,0x0e,0x04]
 
-v_mul_u32_u24 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x10]
+v_med3_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xd6,0xd1,0x01,0x04,0x0e,0x04]
 
-v_mul_u32_u24 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x10]
+v_med3_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x80,0xd6,0xd1,0x01,0x04,0x0e,0x04]
 
-v_mul_u32_u24 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x10]
+v_med3_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0xd6,0xd1,0x01,0x04,0x0e,0x0c]
 
-v_mul_u32_u24 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x10]
+v_med3_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0xd6,0xd1,0x01,0x04,0x0e,0x14]
 
-v_mul_u32_u24 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x10]
+v_med3_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0xd6,0xd1,0x01,0x04,0x0e,0x1c]
 
-v_mul_u32_u24 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x10]
+v_med3_i32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x01,0x00,0x01,0x02]
 
-v_mul_u32_u24 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x10]
+v_med3_i32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xd7,0xd1,0x01,0x00,0x01,0x02]
 
-v_mul_u32_u24 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x10]
+v_med3_i32 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x65,0x00,0x01,0x02]
 
-v_mul_u32_u24 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x10]
+v_med3_i32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x66,0x00,0x01,0x02]
 
-v_mul_u32_u24 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x10]
+v_med3_i32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x67,0x00,0x01,0x02]
 
-v_mul_u32_u24 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x10]
+v_med3_i32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x6a,0x00,0x01,0x02]
 
-v_mul_u32_u24 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x10]
+v_med3_i32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x6b,0x00,0x01,0x02]
 
-v_mul_u32_u24 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x10]
+v_med3_i32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x6c,0x00,0x01,0x02]
 
-v_mul_u32_u24 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x10]
+v_med3_i32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x6d,0x00,0x01,0x02]
 
-v_mul_u32_u24 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x10]
+v_med3_i32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x6e,0x00,0x01,0x02]
 
-v_mul_u32_u24 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x10,0x56,0x34,0x12,0xaf]
+v_med3_i32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x6f,0x00,0x01,0x02]
 
-v_mul_u32_u24 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x10,0x73,0x72,0x71,0x3f]
+v_med3_i32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x7b,0x00,0x01,0x02]
 
-v_mul_u32_u24 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x10]
+v_med3_i32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x7c,0x00,0x01,0x02]
 
-v_mul_u32_u24 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x10]
+v_med3_i32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x7e,0x00,0x01,0x02]
 
-v_mul_u32_u24 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x10]
+v_med3_i32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x7f,0x00,0x01,0x02]
 
-v_mul_u32_u24_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x08,0xd1,0x00,0x00,0x00,0x00]
+v_med3_i32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x80,0x00,0x01,0x02]
 
-v_mul_u32_u24_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x08,0xd1,0x00,0x00,0x00,0x00]
+v_med3_i32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0xc1,0x00,0x01,0x02]
 
-v_mul_u32_u24_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x08,0xd1,0x80,0x00,0x00,0x00]
+v_med3_i32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0xf0,0x00,0x01,0x02]
 
-v_mul_u32_u24_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x08,0xd1,0xc1,0x00,0x00,0x00]
+v_med3_i32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0xf7,0x00,0x01,0x02]
 
-v_mul_u32_u24_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x08,0xd1,0xf0,0x00,0x00,0x00]
+v_med3_i32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x01,0x01,0x01,0x02]
 
-v_mul_u32_u24_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x08,0xd1,0xf7,0x00,0x00,0x00]
+v_med3_i32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0xff,0x01,0x01,0x02]
 
-v_mul_u32_u24_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x08,0xd1,0x00,0x01,0x00,0x00]
+v_med3_i32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x01,0x82,0x01,0x02]
 
-v_mul_u32_u24_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x08,0xd1,0xff,0x01,0x00,0x00]
+v_med3_i32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x01,0xe0,0x01,0x02]
 
-v_mul_u32_u24_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x08,0xd1,0x00,0x00,0x01,0x00]
+v_med3_i32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x01,0xee,0x01,0x02]
 
-v_mul_u32_u24_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x08,0xd1,0x00,0x82,0x01,0x00]
+v_med3_i32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x01,0x04,0x02,0x02]
 
-v_mul_u32_u24_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x08,0xd1,0x00,0xe0,0x01,0x00]
+v_med3_i32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x01,0xfe,0x03,0x02]
 
-v_mul_u32_u24_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x08,0xd1,0x00,0xee,0x01,0x00]
+v_med3_i32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xd7,0xd1,0x01,0x00,0x05,0x03]
 
-v_mul_u32_u24_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x08,0xd1,0x00,0x00,0x02,0x00]
+v_med3_i32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xd7,0xd1,0x01,0x00,0xc1,0x03]
 
-v_mul_u32_u24_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x08,0xd1,0x00,0xfe,0x03,0x00]
+v_med3_i32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xd7,0xd1,0x01,0x00,0xdd,0x03]
 
-v_mul_hi_u32_u24 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x12]
+v_med3_i32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xd7,0xd1,0x01,0x00,0x0d,0x04]
 
-v_mul_hi_u32_u24 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x13]
+v_med3_i32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xd7,0xd1,0x01,0x00,0xfd,0x07]
 
-v_mul_hi_u32_u24 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x12]
+v_med3_u32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x01,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x12]
+v_med3_u32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xd8,0xd1,0x01,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x12]
+v_med3_u32 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x65,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x12]
+v_med3_u32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x66,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x12]
+v_med3_u32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x67,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x12]
+v_med3_u32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x6a,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x12]
+v_med3_u32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x6b,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x12]
+v_med3_u32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x6c,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x12]
+v_med3_u32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x6d,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x12]
+v_med3_u32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x6e,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x12]
+v_med3_u32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x6f,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x12]
+v_med3_u32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x7b,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x12]
+v_med3_u32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x7c,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x12]
+v_med3_u32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x7e,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x12]
+v_med3_u32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x7f,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x12]
+v_med3_u32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x80,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x12]
+v_med3_u32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0xc1,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x12,0x56,0x34,0x12,0xaf]
+v_med3_u32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0xf0,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x12,0x73,0x72,0x71,0x3f]
+v_med3_u32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0xf7,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x12]
+v_med3_u32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x01,0x01,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x12]
+v_med3_u32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0xff,0x01,0x01,0x02]
 
-v_mul_hi_u32_u24 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x12]
+v_med3_u32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x01,0x82,0x01,0x02]
 
-v_mul_hi_u32_u24_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x09,0xd1,0x00,0x00,0x00,0x00]
+v_med3_u32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x01,0xe0,0x01,0x02]
 
-v_mul_hi_u32_u24_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x09,0xd1,0x00,0x00,0x00,0x00]
+v_med3_u32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x01,0xee,0x01,0x02]
 
-v_mul_hi_u32_u24_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x09,0xd1,0x80,0x00,0x00,0x00]
+v_med3_u32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x01,0x04,0x02,0x02]
 
-v_mul_hi_u32_u24_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x09,0xd1,0xc1,0x00,0x00,0x00]
+v_med3_u32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x01,0xfe,0x03,0x02]
 
-v_mul_hi_u32_u24_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x09,0xd1,0xf0,0x00,0x00,0x00]
+v_med3_u32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xd8,0xd1,0x01,0x00,0x05,0x03]
 
-v_mul_hi_u32_u24_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x09,0xd1,0xf7,0x00,0x00,0x00]
+v_med3_u32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xd8,0xd1,0x01,0x00,0xc1,0x03]
 
-v_mul_hi_u32_u24_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x09,0xd1,0x00,0x01,0x00,0x00]
+v_med3_u32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xd8,0xd1,0x01,0x00,0xdd,0x03]
 
-v_mul_hi_u32_u24_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x09,0xd1,0xff,0x01,0x00,0x00]
+v_med3_u32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xd8,0xd1,0x01,0x00,0x0d,0x04]
 
-v_mul_hi_u32_u24_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x09,0xd1,0x00,0x00,0x01,0x00]
+v_med3_u32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xd8,0xd1,0x01,0x00,0xfd,0x07]
 
-v_mul_hi_u32_u24_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x09,0xd1,0x00,0x82,0x01,0x00]
+v_sad_u8 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x01,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x09,0xd1,0x00,0xe0,0x01,0x00]
+v_sad_u8 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xd9,0xd1,0x01,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x09,0xd1,0x00,0xee,0x01,0x00]
+v_sad_u8 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x65,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x09,0xd1,0x00,0x00,0x02,0x00]
+v_sad_u8 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x66,0x00,0x01,0x02]
 
-v_mul_hi_u32_u24_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x09,0xd1,0x00,0xfe,0x03,0x00]
+v_sad_u8 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x67,0x00,0x01,0x02]
 
-v_min_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x14]
+v_sad_u8 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x6a,0x00,0x01,0x02]
 
-v_min_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x15]
+v_sad_u8 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x6b,0x00,0x01,0x02]
 
-v_min_f32 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x14]
+v_sad_u8 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x6c,0x00,0x01,0x02]
 
-v_min_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x14]
+v_sad_u8 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x6d,0x00,0x01,0x02]
 
-v_min_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x14]
+v_sad_u8 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x6e,0x00,0x01,0x02]
 
-v_min_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x14]
+v_sad_u8 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x6f,0x00,0x01,0x02]
 
-v_min_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x14]
+v_sad_u8 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x7b,0x00,0x01,0x02]
 
-v_min_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x14]
+v_sad_u8 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x7c,0x00,0x01,0x02]
 
-v_min_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x14]
+v_sad_u8 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x7e,0x00,0x01,0x02]
 
-v_min_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x14]
+v_sad_u8 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x7f,0x00,0x01,0x02]
 
-v_min_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x14]
+v_sad_u8 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x80,0x00,0x01,0x02]
 
-v_min_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x14]
+v_sad_u8 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0xc1,0x00,0x01,0x02]
 
-v_min_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x14]
+v_sad_u8 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0xf0,0x00,0x01,0x02]
 
-v_min_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x14]
+v_sad_u8 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0xf7,0x00,0x01,0x02]
 
-v_min_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x14]
+v_sad_u8 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x01,0x01,0x01,0x02]
 
-v_min_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x14]
+v_sad_u8 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0xff,0x01,0x01,0x02]
 
-v_min_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x14]
+v_sad_u8 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x01,0x82,0x01,0x02]
 
-v_min_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x14]
+v_sad_u8 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x01,0xe0,0x01,0x02]
 
-v_min_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x14]
+v_sad_u8 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x01,0xee,0x01,0x02]
 
-v_min_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x14,0x56,0x34,0x12,0xaf]
+v_sad_u8 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x01,0x04,0x02,0x02]
 
-v_min_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x14,0x73,0x72,0x71,0x3f]
+v_sad_u8 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x01,0xfe,0x03,0x02]
 
-v_min_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x14]
+v_sad_u8 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xd9,0xd1,0x01,0x00,0x05,0x03]
 
-v_min_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x14]
+v_sad_u8 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xd9,0xd1,0x01,0x00,0xc1,0x03]
 
-v_min_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x14]
+v_sad_u8 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xd9,0xd1,0x01,0x00,0xdd,0x03]
 
-v_min_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x0a,0xd1,0x00,0x00,0x00,0x00]
+v_sad_u8 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xd9,0xd1,0x01,0x00,0x0d,0x04]
 
-v_min_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x0a,0xd1,0x00,0x00,0x00,0x00]
+v_sad_u8 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xd9,0xd1,0x01,0x00,0xfd,0x07]
 
-v_min_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x0a,0xd1,0xfd,0x00,0x00,0x00]
+v_sad_hi_u8 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x01,0x00,0x01,0x02]
 
-v_min_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x0a,0xd1,0x00,0x01,0x00,0x00]
+v_sad_hi_u8 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xda,0xd1,0x01,0x00,0x01,0x02]
 
-v_min_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x0a,0xd1,0xff,0x01,0x00,0x00]
+v_sad_hi_u8 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x65,0x00,0x01,0x02]
 
-v_min_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x0a,0xd1,0x00,0xfa,0x01,0x00]
+v_sad_hi_u8 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x66,0x00,0x01,0x02]
 
-v_min_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x0a,0xd1,0x00,0x00,0x02,0x00]
+v_sad_hi_u8 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x67,0x00,0x01,0x02]
 
-v_min_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x0a,0xd1,0x00,0xfe,0x03,0x00]
+v_sad_hi_u8 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x6a,0x00,0x01,0x02]
 
-v_min_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x0a,0xd1,0x00,0x00,0x00,0x20]
+v_sad_hi_u8 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x6b,0x00,0x01,0x02]
 
-v_min_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x0a,0xd1,0x00,0x00,0x00,0x40]
+v_sad_hi_u8 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x6c,0x00,0x01,0x02]
 
-v_min_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x0a,0xd1,0x00,0x00,0x00,0x60]
+v_sad_hi_u8 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x6d,0x00,0x01,0x02]
 
-v_min_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x0a,0xd1,0x00,0x00,0x00,0x00]
+v_sad_hi_u8 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x6e,0x00,0x01,0x02]
 
-v_min_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x0a,0xd1,0x00,0x00,0x00,0x00]
+v_sad_hi_u8 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x6f,0x00,0x01,0x02]
 
-v_min_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x0a,0xd1,0x00,0x00,0x00,0x00]
+v_sad_hi_u8 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x7b,0x00,0x01,0x02]
 
-v_min_f32_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x0a,0xd1,0x00,0x00,0x00,0x00]
+v_sad_hi_u8 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x7c,0x00,0x01,0x02]
 
-v_min_f32_e64 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x0a,0xd1,0x00,0x00,0x00,0x08]
+v_sad_hi_u8 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x7e,0x00,0x01,0x02]
 
-v_min_f32_e64 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x0a,0xd1,0x00,0x00,0x00,0x10]
+v_sad_hi_u8 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x7f,0x00,0x01,0x02]
 
-v_min_f32_e64 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x0a,0xd1,0x00,0x00,0x00,0x18]
+v_sad_hi_u8 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x80,0x00,0x01,0x02]
 
-v_max_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x16]
+v_sad_hi_u8 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0xc1,0x00,0x01,0x02]
 
-v_max_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x17]
+v_sad_hi_u8 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0xf0,0x00,0x01,0x02]
 
-v_max_f32 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x16]
+v_sad_hi_u8 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0xf7,0x00,0x01,0x02]
 
-v_max_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x16]
+v_sad_hi_u8 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x01,0x01,0x01,0x02]
 
-v_max_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x16]
+v_sad_hi_u8 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0xff,0x01,0x01,0x02]
 
-v_max_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x16]
+v_sad_hi_u8 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x01,0x82,0x01,0x02]
 
-v_max_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x16]
+v_sad_hi_u8 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x01,0xe0,0x01,0x02]
 
-v_max_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x16]
+v_sad_hi_u8 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x01,0xee,0x01,0x02]
 
-v_max_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x16]
+v_sad_hi_u8 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x01,0x04,0x02,0x02]
 
-v_max_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x16]
+v_sad_hi_u8 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xda,0xd1,0x01,0xfe,0x03,0x02]
 
-v_max_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x16]
+v_sad_hi_u8 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xda,0xd1,0x01,0x00,0x05,0x03]
 
-v_max_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x16]
+v_sad_hi_u8 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xda,0xd1,0x01,0x00,0xc1,0x03]
 
-v_max_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x16]
+v_sad_hi_u8 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xda,0xd1,0x01,0x00,0xdd,0x03]
 
-v_max_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x16]
+v_sad_hi_u8 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xda,0xd1,0x01,0x00,0x0d,0x04]
 
-v_max_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x16]
+v_sad_hi_u8 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xda,0xd1,0x01,0x00,0xfd,0x07]
 
-v_max_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x16]
+v_sad_u16 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x01,0x00,0x01,0x02]
 
-v_max_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x16]
+v_sad_u16 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xdb,0xd1,0x01,0x00,0x01,0x02]
 
-v_max_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x16]
+v_sad_u16 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x65,0x00,0x01,0x02]
 
-v_max_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x16]
+v_sad_u16 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x66,0x00,0x01,0x02]
 
-v_max_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x16,0x56,0x34,0x12,0xaf]
+v_sad_u16 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x67,0x00,0x01,0x02]
 
-v_max_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x16,0x73,0x72,0x71,0x3f]
+v_sad_u16 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x6a,0x00,0x01,0x02]
 
-v_max_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x16]
+v_sad_u16 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x6b,0x00,0x01,0x02]
 
-v_max_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x16]
+v_sad_u16 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x6c,0x00,0x01,0x02]
 
-v_max_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x16]
+v_sad_u16 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x6d,0x00,0x01,0x02]
 
-v_max_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x0b,0xd1,0x00,0x00,0x00,0x00]
+v_sad_u16 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x6e,0x00,0x01,0x02]
 
-v_max_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x0b,0xd1,0x00,0x00,0x00,0x00]
+v_sad_u16 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x6f,0x00,0x01,0x02]
 
-v_max_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x0b,0xd1,0xfd,0x00,0x00,0x00]
+v_sad_u16 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x7b,0x00,0x01,0x02]
 
-v_max_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x0b,0xd1,0x00,0x01,0x00,0x00]
+v_sad_u16 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x7c,0x00,0x01,0x02]
 
-v_max_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x0b,0xd1,0xff,0x01,0x00,0x00]
+v_sad_u16 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x7e,0x00,0x01,0x02]
 
-v_max_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x0b,0xd1,0x00,0xfa,0x01,0x00]
+v_sad_u16 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x7f,0x00,0x01,0x02]
 
-v_max_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x0b,0xd1,0x00,0x00,0x02,0x00]
+v_sad_u16 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x80,0x00,0x01,0x02]
 
-v_max_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x0b,0xd1,0x00,0xfe,0x03,0x00]
+v_sad_u16 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0xc1,0x00,0x01,0x02]
 
-v_max_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x0b,0xd1,0x00,0x00,0x00,0x20]
+v_sad_u16 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0xf0,0x00,0x01,0x02]
 
-v_max_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x0b,0xd1,0x00,0x00,0x00,0x40]
+v_sad_u16 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0xf7,0x00,0x01,0x02]
 
-v_max_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x0b,0xd1,0x00,0x00,0x00,0x60]
+v_sad_u16 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x01,0x01,0x01,0x02]
 
-v_max_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x0b,0xd1,0x00,0x00,0x00,0x00]
+v_sad_u16 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0xff,0x01,0x01,0x02]
 
-v_max_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x0b,0xd1,0x00,0x00,0x00,0x00]
+v_sad_u16 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x01,0x82,0x01,0x02]
 
-v_max_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x0b,0xd1,0x00,0x00,0x00,0x00]
+v_sad_u16 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x01,0xe0,0x01,0x02]
 
-v_max_f32_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x0b,0xd1,0x00,0x00,0x00,0x00]
+v_sad_u16 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x01,0xee,0x01,0x02]
 
-v_max_f32_e64 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x0b,0xd1,0x00,0x00,0x00,0x08]
+v_sad_u16 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x01,0x04,0x02,0x02]
 
-v_max_f32_e64 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x0b,0xd1,0x00,0x00,0x00,0x10]
+v_sad_u16 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x01,0xfe,0x03,0x02]
 
-v_max_f32_e64 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x0b,0xd1,0x00,0x00,0x00,0x18]
+v_sad_u16 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xdb,0xd1,0x01,0x00,0x05,0x03]
 
-v_min_i32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x18]
+v_sad_u16 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xdb,0xd1,0x01,0x00,0xc1,0x03]
 
-v_min_i32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x19]
+v_sad_u16 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xdb,0xd1,0x01,0x00,0xdd,0x03]
 
-v_min_i32 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x18]
+v_sad_u16 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xdb,0xd1,0x01,0x00,0x0d,0x04]
 
-v_min_i32 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x18]
+v_sad_u16 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xdb,0xd1,0x01,0x00,0xfd,0x07]
 
-v_min_i32 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x18]
+v_sad_u32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x01,0x00,0x01,0x02]
 
-v_min_i32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x18]
+v_sad_u32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xdc,0xd1,0x01,0x00,0x01,0x02]
 
-v_min_i32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x18]
+v_sad_u32 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x65,0x00,0x01,0x02]
 
-v_min_i32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x18]
+v_sad_u32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x66,0x00,0x01,0x02]
 
-v_min_i32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x18]
+v_sad_u32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x67,0x00,0x01,0x02]
 
-v_min_i32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x18]
+v_sad_u32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x6a,0x00,0x01,0x02]
 
-v_min_i32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x18]
+v_sad_u32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x6b,0x00,0x01,0x02]
 
-v_min_i32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x18]
+v_sad_u32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x6c,0x00,0x01,0x02]
 
-v_min_i32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x18]
+v_sad_u32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x6d,0x00,0x01,0x02]
 
-v_min_i32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x18]
+v_sad_u32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x6e,0x00,0x01,0x02]
 
-v_min_i32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x18]
+v_sad_u32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x6f,0x00,0x01,0x02]
 
-v_min_i32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x18]
+v_sad_u32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x7b,0x00,0x01,0x02]
 
-v_min_i32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x18]
+v_sad_u32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x7c,0x00,0x01,0x02]
 
-v_min_i32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x18]
+v_sad_u32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x7e,0x00,0x01,0x02]
 
-v_min_i32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x18]
+v_sad_u32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x7f,0x00,0x01,0x02]
 
-v_min_i32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x18,0x56,0x34,0x12,0xaf]
+v_sad_u32 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x80,0x00,0x01,0x02]
 
-v_min_i32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x18,0x73,0x72,0x71,0x3f]
+v_sad_u32 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0xc1,0x00,0x01,0x02]
 
-v_min_i32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x18]
+v_sad_u32 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0xf0,0x00,0x01,0x02]
 
-v_min_i32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x18]
+v_sad_u32 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0xf7,0x00,0x01,0x02]
 
-v_min_i32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x18]
+v_sad_u32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x01,0x01,0x01,0x02]
 
-v_min_i32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x0c,0xd1,0x00,0x00,0x00,0x00]
+v_sad_u32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0xff,0x01,0x01,0x02]
 
-v_min_i32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x0c,0xd1,0x00,0x00,0x00,0x00]
+v_sad_u32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x01,0x82,0x01,0x02]
 
-v_min_i32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x0c,0xd1,0x80,0x00,0x00,0x00]
+v_sad_u32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x01,0xe0,0x01,0x02]
 
-v_min_i32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x0c,0xd1,0xc1,0x00,0x00,0x00]
+v_sad_u32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x01,0xee,0x01,0x02]
 
-v_min_i32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x0c,0xd1,0xf0,0x00,0x00,0x00]
+v_sad_u32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x01,0x04,0x02,0x02]
 
-v_min_i32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x0c,0xd1,0xf7,0x00,0x00,0x00]
+v_sad_u32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x01,0xfe,0x03,0x02]
 
-v_min_i32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x0c,0xd1,0x00,0x01,0x00,0x00]
+v_sad_u32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xdc,0xd1,0x01,0x00,0x05,0x03]
 
-v_min_i32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x0c,0xd1,0xff,0x01,0x00,0x00]
+v_sad_u32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xdc,0xd1,0x01,0x00,0xc1,0x03]
 
-v_min_i32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x0c,0xd1,0x00,0x00,0x01,0x00]
+v_sad_u32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xdc,0xd1,0x01,0x00,0xdd,0x03]
 
-v_min_i32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x0c,0xd1,0x00,0x82,0x01,0x00]
+v_sad_u32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xdc,0xd1,0x01,0x00,0x0d,0x04]
 
-v_min_i32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x0c,0xd1,0x00,0xe0,0x01,0x00]
+v_sad_u32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xdc,0xd1,0x01,0x00,0xfd,0x07]
 
-v_min_i32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x0c,0xd1,0x00,0xee,0x01,0x00]
+v_cvt_pk_u8_f32 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x01,0x00,0x01,0x02]
 
-v_min_i32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x0c,0xd1,0x00,0x00,0x02,0x00]
+v_cvt_pk_u8_f32 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xdd,0xd1,0x01,0x00,0x01,0x02]
 
-v_min_i32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x0c,0xd1,0x00,0xfe,0x03,0x00]
+v_cvt_pk_u8_f32 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x65,0x00,0x01,0x02]
 
-v_max_i32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x66,0x00,0x01,0x02]
 
-v_max_i32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x1b]
+v_cvt_pk_u8_f32 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x67,0x00,0x01,0x02]
 
-v_max_i32 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x6a,0x00,0x01,0x02]
 
-v_max_i32 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x6b,0x00,0x01,0x02]
 
-v_max_i32 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x6c,0x00,0x01,0x02]
 
-v_max_i32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x6d,0x00,0x01,0x02]
 
-v_max_i32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x6e,0x00,0x01,0x02]
 
-v_max_i32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x6f,0x00,0x01,0x02]
 
-v_max_i32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x7b,0x00,0x01,0x02]
 
-v_max_i32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x7c,0x00,0x01,0x02]
 
-v_max_i32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x7e,0x00,0x01,0x02]
 
-v_max_i32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x7f,0x00,0x01,0x02]
 
-v_max_i32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, scc, 0, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0xfd,0x00,0x01,0x02]
 
-v_max_i32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x01,0x01,0x01,0x02]
 
-v_max_i32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0xff,0x01,0x01,0x02]
 
-v_max_i32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x01,0x82,0x01,0x02]
 
-v_max_i32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x01,0xe0,0x01,0x02]
 
-v_max_i32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x01,0xee,0x01,0x02]
 
-v_max_i32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x01,0x04,0x02,0x02]
 
-v_max_i32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x1a,0x56,0x34,0x12,0xaf]
+v_cvt_pk_u8_f32 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x01,0xfe,0x03,0x02]
 
-v_max_i32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x1a,0x73,0x72,0x71,0x3f]
+v_cvt_pk_u8_f32 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xdd,0xd1,0x01,0x00,0x05,0x03]
 
-v_max_i32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xdd,0xd1,0x01,0x00,0xc1,0x03]
 
-v_max_i32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x1a]
+v_cvt_pk_u8_f32 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x01,0x00,0xdd,0x03]
 
-v_max_i32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x1a]
+v_cvt_pk_u8_f32 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xdd,0xd1,0x01,0x00,0x0d,0x04]
 
-v_max_i32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x0d,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_pk_u8_f32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xdd,0xd1,0x01,0x00,0xfd,0x07]
 
-v_max_i32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x0d,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_pk_u8_f32 v5, -s1, 0, 0
+// CHECK: [0x05,0x00,0xdd,0xd1,0x01,0x00,0x01,0x22]
 
-v_max_i32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x0d,0xd1,0x80,0x00,0x00,0x00]
+v_cvt_pk_u8_f32 v5, |s1|, 0, 0
+// CHECK: [0x05,0x01,0xdd,0xd1,0x01,0x00,0x01,0x02]
 
-v_max_i32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x0d,0xd1,0xc1,0x00,0x00,0x00]
+v_cvt_pk_u8_f32 v5, s1, 0, 0 clamp
+// CHECK: [0x05,0x80,0xdd,0xd1,0x01,0x00,0x01,0x02]
 
-v_max_i32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x0d,0xd1,0xf0,0x00,0x00,0x00]
+v_div_fixup_f32 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x01,0x04,0x0e,0x04]
 
-v_max_i32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x0d,0xd1,0xf7,0x00,0x00,0x00]
+v_div_fixup_f32 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0xde,0xd1,0x01,0x04,0x0e,0x04]
 
-v_max_i32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x0d,0xd1,0x00,0x01,0x00,0x00]
+v_div_fixup_f32 v5, s101, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x65,0x04,0x0e,0x04]
 
-v_max_i32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x0d,0xd1,0xff,0x01,0x00,0x00]
+v_div_fixup_f32 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x66,0x04,0x0e,0x04]
 
-v_max_i32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x0d,0xd1,0x00,0x00,0x01,0x00]
+v_div_fixup_f32 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x67,0x04,0x0e,0x04]
 
-v_max_i32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x0d,0xd1,0x00,0x82,0x01,0x00]
+v_div_fixup_f32 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x6a,0x04,0x0e,0x04]
 
-v_max_i32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x0d,0xd1,0x00,0xe0,0x01,0x00]
+v_div_fixup_f32 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x6b,0x04,0x0e,0x04]
 
-v_max_i32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x0d,0xd1,0x00,0xee,0x01,0x00]
+v_div_fixup_f32 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x6c,0x04,0x0e,0x04]
 
-v_max_i32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x0d,0xd1,0x00,0x00,0x02,0x00]
+v_div_fixup_f32 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x6d,0x04,0x0e,0x04]
 
-v_max_i32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x0d,0xd1,0x00,0xfe,0x03,0x00]
+v_div_fixup_f32 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x6e,0x04,0x0e,0x04]
 
-v_min_u32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x1c]
+v_div_fixup_f32 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x6f,0x04,0x0e,0x04]
 
-v_min_u32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x1d]
+v_div_fixup_f32 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x7b,0x04,0x0e,0x04]
 
-v_min_u32 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x1c]
+v_div_fixup_f32 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x7c,0x04,0x0e,0x04]
 
-v_min_u32 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x1c]
+v_div_fixup_f32 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x7e,0x04,0x0e,0x04]
 
-v_min_u32 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x1c]
+v_div_fixup_f32 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x7f,0x04,0x0e,0x04]
 
-v_min_u32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x1c]
+v_div_fixup_f32 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0xfd,0x04,0x0e,0x04]
 
-v_min_u32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x1c]
+v_div_fixup_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x01,0x05,0x0e,0x04]
 
-v_min_u32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x1c]
+v_div_fixup_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0xff,0x05,0x0e,0x04]
 
-v_min_u32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x1c]
+v_div_fixup_f32 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x01,0xfe,0x0f,0x04]
 
-v_min_u32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x1c]
+v_div_fixup_f32 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0xde,0xd1,0x01,0x04,0xfe,0x07]
 
-v_min_u32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x1c]
+v_div_fixup_f32 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x01,0x04,0x0e,0x24]
 
-v_min_u32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x1c]
+v_div_fixup_f32 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x01,0x04,0x0e,0x44]
 
-v_min_u32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x1c]
+v_div_fixup_f32 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x01,0x04,0x0e,0x84]
 
-v_min_u32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x1c]
+v_div_fixup_f32 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0xde,0xd1,0x01,0x04,0x0e,0xe4]
 
-v_min_u32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x1c]
+v_div_fixup_f32 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0xde,0xd1,0x01,0x04,0x0e,0x04]
 
-v_min_u32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x1c]
+v_div_fixup_f32 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0xde,0xd1,0x01,0x04,0x0e,0x04]
 
-v_min_u32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x1c]
+v_div_fixup_f32 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0xde,0xd1,0x01,0x04,0x0e,0x04]
 
-v_min_u32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x1c]
+v_div_fixup_f32 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xde,0xd1,0x01,0x04,0x0e,0x04]
 
-v_min_u32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x1c]
+v_div_fixup_f32 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x80,0xde,0xd1,0x01,0x04,0x0e,0x04]
 
-v_min_u32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x1c,0x56,0x34,0x12,0xaf]
+v_div_fixup_f32 v5, s1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0xde,0xd1,0x01,0x04,0x0e,0x0c]
 
-v_min_u32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x1c,0x73,0x72,0x71,0x3f]
+v_div_fixup_f32 v5, s1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0xde,0xd1,0x01,0x04,0x0e,0x14]
 
-v_min_u32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x1c]
+v_div_fixup_f32 v5, s1, v2, v3 div:2
+// CHECK: [0x05,0x00,0xde,0xd1,0x01,0x04,0x0e,0x1c]
 
-v_min_u32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x1c]
+v_div_fixup_f64 v[5:6], s[2:3], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xdf,0xd1,0x02,0x04,0x0e,0x04]
 
-v_min_u32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x1c]
+v_div_fixup_f64 v[254:255], s[2:3], v[2:3], v[3:4]
+// CHECK: [0xfe,0x00,0xdf,0xd1,0x02,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x0e,0xd1,0x00,0x00,0x00,0x00]
+v_div_fixup_f64 v[5:6], s[4:5], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xdf,0xd1,0x04,0x04,0x0e,0x04]
 
-v_min_u32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x0e,0xd1,0x00,0x00,0x00,0x00]
+v_div_fixup_f64 v[5:6], s[100:101], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xdf,0xd1,0x64,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x0e,0xd1,0x80,0x00,0x00,0x00]
+v_div_fixup_f64 v[5:6], flat_scratch, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xdf,0xd1,0x66,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x0e,0xd1,0xc1,0x00,0x00,0x00]
+v_div_fixup_f64 v[5:6], vcc, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xdf,0xd1,0x6a,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x0e,0xd1,0xf0,0x00,0x00,0x00]
+v_div_fixup_f64 v[5:6], tba, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xdf,0xd1,0x6c,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x0e,0xd1,0xf7,0x00,0x00,0x00]
+v_div_fixup_f64 v[5:6], tma, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xdf,0xd1,0x6e,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x0e,0xd1,0x00,0x01,0x00,0x00]
+v_div_fixup_f64 v[5:6], ttmp[10:11], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xdf,0xd1,0x7a,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x0e,0xd1,0xff,0x01,0x00,0x00]
+v_div_fixup_f64 v[5:6], exec, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xdf,0xd1,0x7e,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x0e,0xd1,0x00,0x00,0x01,0x00]
+v_div_fixup_f64 v[5:6], scc, v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xdf,0xd1,0xfd,0x04,0x0e,0x04]
 
-v_min_u32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x0e,0xd1,0x00,0x82,0x01,0x00]
+v_div_fixup_f64 v[5:6], v[1:2], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xdf,0xd1,0x01,0x05,0x0e,0x04]
 
-v_min_u32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x0e,0xd1,0x00,0xe0,0x01,0x00]
+v_div_fixup_f64 v[5:6], v[254:255], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xdf,0xd1,0xfe,0x05,0x0e,0x04]
 
-v_min_u32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x0e,0xd1,0x00,0xee,0x01,0x00]
+v_div_fixup_f64 v[5:6], s[2:3], v[254:255], v[3:4]
+// CHECK: [0x05,0x00,0xdf,0xd1,0x02,0xfc,0x0f,0x04]
 
-v_min_u32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x0e,0xd1,0x00,0x00,0x02,0x00]
+v_div_fixup_f64 v[5:6], s[2:3], v[2:3], v[254:255]
+// CHECK: [0x05,0x00,0xdf,0xd1,0x02,0x04,0xfa,0x07]
 
-v_min_u32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x0e,0xd1,0x00,0xfe,0x03,0x00]
+v_div_fixup_f64 v[5:6], -s[2:3], v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xdf,0xd1,0x02,0x04,0x0e,0x24]
 
-v_max_u32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x1e]
+v_div_fixup_f64 v[5:6], s[2:3], -v[2:3], v[3:4]
+// CHECK: [0x05,0x00,0xdf,0xd1,0x02,0x04,0x0e,0x44]
 
-v_max_u32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x1f]
+v_div_fixup_f64 v[5:6], s[2:3], v[2:3], -v[3:4]
+// CHECK: [0x05,0x00,0xdf,0xd1,0x02,0x04,0x0e,0x84]
 
-v_max_u32 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x1e]
+v_div_fixup_f64 v[5:6], -s[2:3], -v[2:3], -v[3:4]
+// CHECK: [0x05,0x00,0xdf,0xd1,0x02,0x04,0x0e,0xe4]
 
-v_max_u32 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x1e]
+v_div_fixup_f64 v[5:6], |s[2:3]|, v[2:3], v[3:4]
+// CHECK: [0x05,0x01,0xdf,0xd1,0x02,0x04,0x0e,0x04]
 
-v_max_u32 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x1e]
+v_div_fixup_f64 v[5:6], s[2:3], |v[2:3]|, v[3:4]
+// CHECK: [0x05,0x02,0xdf,0xd1,0x02,0x04,0x0e,0x04]
 
-v_max_u32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x1e]
+v_div_fixup_f64 v[5:6], s[2:3], v[2:3], |v[3:4]|
+// CHECK: [0x05,0x04,0xdf,0xd1,0x02,0x04,0x0e,0x04]
 
-v_max_u32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x1e]
+v_div_fixup_f64 v[5:6], |s[2:3]|, |v[2:3]|, |v[3:4]|
+// CHECK: [0x05,0x07,0xdf,0xd1,0x02,0x04,0x0e,0x04]
 
-v_max_u32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x1e]
+v_div_fixup_f64 v[5:6], s[2:3], v[2:3], v[3:4] clamp
+// CHECK: [0x05,0x80,0xdf,0xd1,0x02,0x04,0x0e,0x04]
 
-v_max_u32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x1e]
+v_div_fixup_f64 v[5:6], s[2:3], v[2:3], v[3:4] mul:2
+// CHECK: [0x05,0x00,0xdf,0xd1,0x02,0x04,0x0e,0x0c]
 
-v_max_u32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x1e]
+v_div_fixup_f64 v[5:6], s[2:3], v[2:3], v[3:4] mul:4
+// CHECK: [0x05,0x00,0xdf,0xd1,0x02,0x04,0x0e,0x14]
 
-v_max_u32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x1e]
+v_div_fixup_f64 v[5:6], s[2:3], v[2:3], v[3:4] div:2
+// CHECK: [0x05,0x00,0xdf,0xd1,0x02,0x04,0x0e,0x1c]
 
-v_max_u32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x1e]
+v_div_scale_f32 v5, vcc, s1, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x01,0x00,0x01,0x02]
 
-v_max_u32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x1e]
+v_div_scale_f32 v255, vcc, s1, 0, 0
+// CHECK: [0xff,0x6a,0xe0,0xd1,0x01,0x00,0x01,0x02]
 
-v_max_u32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x1e]
+v_div_scale_f32 v5, vcc, s101, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x65,0x00,0x01,0x02]
 
-v_max_u32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x1e]
+v_div_scale_f32 v5, vcc, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x66,0x00,0x01,0x02]
 
-v_max_u32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x1e]
+v_div_scale_f32 v5, vcc, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x67,0x00,0x01,0x02]
 
-v_max_u32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x1e]
+v_div_scale_f32 v5, vcc, vcc_lo, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x6a,0x00,0x01,0x02]
 
-v_max_u32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x1e]
+v_div_scale_f32 v5, vcc, vcc_hi, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x6b,0x00,0x01,0x02]
 
-v_max_u32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x1e]
+v_div_scale_f32 v5, vcc, tba_lo, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x6c,0x00,0x01,0x02]
 
-v_max_u32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x1e,0x56,0x34,0x12,0xaf]
+v_div_scale_f32 v5, vcc, tba_hi, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x6d,0x00,0x01,0x02]
 
-v_max_u32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x1e,0x73,0x72,0x71,0x3f]
+v_div_scale_f32 v5, vcc, tma_lo, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x6e,0x00,0x01,0x02]
 
-v_max_u32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x1e]
+v_div_scale_f32 v5, vcc, tma_hi, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x6f,0x00,0x01,0x02]
 
-v_max_u32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x1e]
+v_div_scale_f32 v5, vcc, ttmp11, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x7b,0x00,0x01,0x02]
 
-v_max_u32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x1e]
+v_div_scale_f32 v5, vcc, m0, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x7c,0x00,0x01,0x02]
 
-v_max_u32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x0f,0xd1,0x00,0x00,0x00,0x00]
+v_div_scale_f32 v5, vcc, exec_lo, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x7e,0x00,0x01,0x02]
 
-v_max_u32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x0f,0xd1,0x00,0x00,0x00,0x00]
+v_div_scale_f32 v5, vcc, exec_hi, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x7f,0x00,0x01,0x02]
 
-v_max_u32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x0f,0xd1,0x80,0x00,0x00,0x00]
+v_div_scale_f32 v5, vcc, 0, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x80,0x00,0x01,0x02]
 
-v_max_u32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x0f,0xd1,0xc1,0x00,0x00,0x00]
+v_div_scale_f32 v5, vcc, -1, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0xc1,0x00,0x01,0x02]
 
-v_max_u32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x0f,0xd1,0xf0,0x00,0x00,0x00]
+v_div_scale_f32 v5, vcc, 0.5, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0xf0,0x00,0x01,0x02]
 
-v_max_u32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x0f,0xd1,0xf7,0x00,0x00,0x00]
+v_div_scale_f32 v5, vcc, -4.0, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0xf7,0x00,0x01,0x02]
 
-v_max_u32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x0f,0xd1,0x00,0x01,0x00,0x00]
+v_div_scale_f32 v5, vcc, v1, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x01,0x01,0x01,0x02]
 
-v_max_u32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x0f,0xd1,0xff,0x01,0x00,0x00]
+v_div_scale_f32 v5, vcc, v255, 0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0xff,0x01,0x01,0x02]
 
-v_max_u32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x0f,0xd1,0x00,0x00,0x01,0x00]
+v_div_scale_f32 v5, vcc, s1, -1, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x01,0x82,0x01,0x02]
 
-v_max_u32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x0f,0xd1,0x00,0x82,0x01,0x00]
+v_div_scale_f32 v5, vcc, s1, 0.5, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x01,0xe0,0x01,0x02]
 
-v_max_u32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x0f,0xd1,0x00,0xe0,0x01,0x00]
+v_div_scale_f32 v5, vcc, s1, -4.0, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x01,0xee,0x01,0x02]
 
-v_max_u32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x0f,0xd1,0x00,0xee,0x01,0x00]
+v_div_scale_f32 v5, vcc, s1, v2, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x01,0x04,0x02,0x02]
 
-v_max_u32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x0f,0xd1,0x00,0x00,0x02,0x00]
+v_div_scale_f32 v5, vcc, s1, v255, 0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x01,0xfe,0x03,0x02]
 
-v_max_u32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x0f,0xd1,0x00,0xfe,0x03,0x00]
+v_div_scale_f32 v5, vcc, s1, 0, -1
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x01,0x00,0x05,0x03]
 
-v_lshrrev_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x20]
+v_div_scale_f32 v5, vcc, s1, 0, 0.5
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x01,0x00,0xc1,0x03]
 
-v_lshrrev_b32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x21]
+v_div_scale_f32 v5, vcc, s1, 0, -4.0
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x01,0x00,0xdd,0x03]
 
-v_lshrrev_b32 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x20]
+v_div_scale_f32 v5, vcc, s1, 0, v3
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x01,0x00,0x0d,0x04]
 
-v_lshrrev_b32 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x20]
+v_div_scale_f32 v5, vcc, s1, 0, v255
+// CHECK: [0x05,0x6a,0xe0,0xd1,0x01,0x00,0xfd,0x07]
 
-v_lshrrev_b32 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x20]
+v_div_scale_f64 v[5:6], vcc, s[2:3], 0, 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x02,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x20]
+v_div_scale_f64 v[254:255], vcc, s[2:3], 0, 0
+// CHECK: [0xfe,0x6a,0xe1,0xd1,0x02,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x20]
+v_div_scale_f64 v[5:6], vcc, s[4:5], 0, 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x04,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x20]
+v_div_scale_f64 v[5:6], vcc, s[100:101], 0, 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x64,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x20]
+v_div_scale_f64 v[5:6], vcc, flat_scratch, 0, 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x66,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x20]
+v_div_scale_f64 v[5:6], vcc, vcc, 0, 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x6a,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x20]
+v_div_scale_f64 v[5:6], vcc, tba, 0, 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x6c,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x20]
+v_div_scale_f64 v[5:6], vcc, tma, 0, 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x6e,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x20]
+v_div_scale_f64 v[5:6], vcc, ttmp[10:11], 0, 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x7a,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x20]
+v_div_scale_f64 v[5:6], vcc, exec, 0, 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x7e,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x20]
+v_div_scale_f64 v[5:6], vcc, 0, 0, 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x80,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x20]
+v_div_scale_f64 v[5:6], vcc, -1, 0, 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0xc1,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x20]
+v_div_scale_f64 v[5:6], vcc, 0.5, 0, 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0xf0,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x20]
+v_div_scale_f64 v[5:6], vcc, -4.0, 0, 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0xf7,0x00,0x01,0x02]
 
-v_lshrrev_b32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x20]
+v_div_scale_f64 v[5:6], vcc, v[1:2], 0, 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x01,0x01,0x01,0x02]
 
-v_lshrrev_b32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x20,0x56,0x34,0x12,0xaf]
+v_div_scale_f64 v[5:6], vcc, v[254:255], 0, 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0xfe,0x01,0x01,0x02]
 
-v_lshrrev_b32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x20,0x73,0x72,0x71,0x3f]
+v_div_scale_f64 v[5:6], vcc, s[2:3], -1, 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x02,0x82,0x01,0x02]
 
-v_lshrrev_b32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x20]
+v_div_scale_f64 v[5:6], vcc, s[2:3], 0.5, 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x02,0xe0,0x01,0x02]
 
-v_lshrrev_b32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x20]
+v_div_scale_f64 v[5:6], vcc, s[2:3], -4.0, 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x02,0xee,0x01,0x02]
 
-v_lshrrev_b32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x20]
+v_div_scale_f64 v[5:6], vcc, s[2:3], v[2:3], 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x02,0x04,0x02,0x02]
 
-v_lshrrev_b32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x10,0xd1,0x00,0x00,0x00,0x00]
+v_div_scale_f64 v[5:6], vcc, s[2:3], v[254:255], 0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x02,0xfc,0x03,0x02]
 
-v_lshrrev_b32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x10,0xd1,0x00,0x00,0x00,0x00]
+v_div_scale_f64 v[5:6], vcc, s[2:3], 0, -1
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x02,0x00,0x05,0x03]
 
-v_lshrrev_b32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x10,0xd1,0x80,0x00,0x00,0x00]
+v_div_scale_f64 v[5:6], vcc, s[2:3], 0, 0.5
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x02,0x00,0xc1,0x03]
 
-v_lshrrev_b32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x10,0xd1,0xc1,0x00,0x00,0x00]
+v_div_scale_f64 v[5:6], vcc, s[2:3], 0, -4.0
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x02,0x00,0xdd,0x03]
 
-v_lshrrev_b32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x10,0xd1,0xf0,0x00,0x00,0x00]
+v_div_scale_f64 v[5:6], vcc, s[2:3], 0, v[3:4]
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x02,0x00,0x0d,0x04]
 
-v_lshrrev_b32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x10,0xd1,0xf7,0x00,0x00,0x00]
+v_div_scale_f64 v[5:6], vcc, s[2:3], 0, v[254:255]
+// CHECK: [0x05,0x6a,0xe1,0xd1,0x02,0x00,0xf9,0x07]
 
-v_lshrrev_b32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x10,0xd1,0x00,0x01,0x00,0x00]
+v_div_fmas_f32 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xe2,0xd1,0x01,0x05,0x0e,0x04]
 
-v_lshrrev_b32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x10,0xd1,0xff,0x01,0x00,0x00]
+v_div_fmas_f32 v255, v1, v2, v3
+// CHECK: [0xff,0x00,0xe2,0xd1,0x01,0x05,0x0e,0x04]
 
-v_lshrrev_b32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x10,0xd1,0x00,0x00,0x01,0x00]
+v_div_fmas_f32 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xe2,0xd1,0xff,0x05,0x0e,0x04]
 
-v_lshrrev_b32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x10,0xd1,0x00,0x82,0x01,0x00]
+v_div_fmas_f32 v5, v1, v255, v3
+// CHECK: [0x05,0x00,0xe2,0xd1,0x01,0xff,0x0f,0x04]
 
-v_lshrrev_b32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x10,0xd1,0x00,0xe0,0x01,0x00]
+v_div_fmas_f32 v5, v1, v2, v255
+// CHECK: [0x05,0x00,0xe2,0xd1,0x01,0x05,0xfe,0x07]
 
-v_lshrrev_b32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x10,0xd1,0x00,0xee,0x01,0x00]
+v_div_fmas_f32 v5, -v1, v2, v3
+// CHECK: [0x05,0x00,0xe2,0xd1,0x01,0x05,0x0e,0x24]
 
-v_lshrrev_b32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x10,0xd1,0x00,0x00,0x02,0x00]
+v_div_fmas_f32 v5, v1, -v2, v3
+// CHECK: [0x05,0x00,0xe2,0xd1,0x01,0x05,0x0e,0x44]
 
-v_lshrrev_b32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x10,0xd1,0x00,0xfe,0x03,0x00]
+v_div_fmas_f32 v5, v1, v2, -v3
+// CHECK: [0x05,0x00,0xe2,0xd1,0x01,0x05,0x0e,0x84]
 
-v_ashrrev_i32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x22]
+v_div_fmas_f32 v5, -v1, -v2, -v3
+// CHECK: [0x05,0x00,0xe2,0xd1,0x01,0x05,0x0e,0xe4]
 
-v_ashrrev_i32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x23]
+v_div_fmas_f32 v5, |v1|, v2, v3
+// CHECK: [0x05,0x01,0xe2,0xd1,0x01,0x05,0x0e,0x04]
 
-v_ashrrev_i32 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x22]
+v_div_fmas_f32 v5, v1, |v2|, v3
+// CHECK: [0x05,0x02,0xe2,0xd1,0x01,0x05,0x0e,0x04]
 
-v_ashrrev_i32 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x22]
+v_div_fmas_f32 v5, v1, v2, |v3|
+// CHECK: [0x05,0x04,0xe2,0xd1,0x01,0x05,0x0e,0x04]
 
-v_ashrrev_i32 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x22]
+v_div_fmas_f32 v5, |v1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xe2,0xd1,0x01,0x05,0x0e,0x04]
 
-v_ashrrev_i32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x22]
+v_div_fmas_f32 v5, v1, v2, v3 clamp
+// CHECK: [0x05,0x80,0xe2,0xd1,0x01,0x05,0x0e,0x04]
 
-v_ashrrev_i32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x22]
+v_div_fmas_f32 v5, v1, v2, v3 mul:2
+// CHECK: [0x05,0x00,0xe2,0xd1,0x01,0x05,0x0e,0x0c]
 
-v_ashrrev_i32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x22]
+v_div_fmas_f32 v5, v1, v2, v3 mul:4
+// CHECK: [0x05,0x00,0xe2,0xd1,0x01,0x05,0x0e,0x14]
 
-v_ashrrev_i32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x22]
+v_div_fmas_f32 v5, v1, v2, v3 div:2
+// CHECK: [0x05,0x00,0xe2,0xd1,0x01,0x05,0x0e,0x1c]
 
-v_ashrrev_i32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x22]
+v_div_fmas_f64 v[5:6], vcc, vcc, vcc
+// CHECK: [0x05,0x00,0xe3,0xd1,0x6a,0xd4,0xa8,0x01]
 
-v_ashrrev_i32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x22]
+v_div_fmas_f64 v[254:255], vcc, vcc, vcc
+// CHECK: [0xfe,0x00,0xe3,0xd1,0x6a,0xd4,0xa8,0x01]
 
-v_ashrrev_i32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x22]
+v_div_fmas_f64 v[5:6], v[1:2], vcc, vcc
+// CHECK: [0x05,0x00,0xe3,0xd1,0x01,0xd5,0xa8,0x01]
 
-v_ashrrev_i32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x22]
+v_div_fmas_f64 v[5:6], v[254:255], vcc, vcc
+// CHECK: [0x05,0x00,0xe3,0xd1,0xfe,0xd5,0xa8,0x01]
 
-v_ashrrev_i32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x22]
+v_div_fmas_f64 v[5:6], vcc, v[2:3], vcc
+// CHECK: [0x05,0x00,0xe3,0xd1,0x6a,0x04,0xaa,0x01]
 
-v_ashrrev_i32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x22]
+v_div_fmas_f64 v[5:6], vcc, v[254:255], vcc
+// CHECK: [0x05,0x00,0xe3,0xd1,0x6a,0xfc,0xab,0x01]
 
-v_ashrrev_i32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x22]
+v_div_fmas_f64 v[5:6], vcc, vcc, v[3:4]
+// CHECK: [0x05,0x00,0xe3,0xd1,0x6a,0xd4,0x0c,0x04]
 
-v_ashrrev_i32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x22]
+v_div_fmas_f64 v[5:6], vcc, vcc, v[254:255]
+// CHECK: [0x05,0x00,0xe3,0xd1,0x6a,0xd4,0xf8,0x07]
 
-v_ashrrev_i32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x22]
+v_div_fmas_f64 v[5:6], -vcc, vcc, vcc
+// CHECK: [0x05,0x00,0xe3,0xd1,0x6a,0xd4,0xa8,0x21]
 
-v_ashrrev_i32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x22]
+v_div_fmas_f64 v[5:6], vcc, -vcc, vcc
+// CHECK: [0x05,0x00,0xe3,0xd1,0x6a,0xd4,0xa8,0x41]
 
-v_ashrrev_i32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x22,0x56,0x34,0x12,0xaf]
+v_div_fmas_f64 v[5:6], vcc, vcc, -vcc
+// CHECK: [0x05,0x00,0xe3,0xd1,0x6a,0xd4,0xa8,0x81]
 
-v_ashrrev_i32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x22,0x73,0x72,0x71,0x3f]
+v_div_fmas_f64 v[5:6], -vcc, -vcc, -vcc
+// CHECK: [0x05,0x00,0xe3,0xd1,0x6a,0xd4,0xa8,0xe1]
 
-v_ashrrev_i32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x22]
+v_div_fmas_f64 v[5:6], |vcc|, vcc, vcc
+// CHECK: [0x05,0x01,0xe3,0xd1,0x6a,0xd4,0xa8,0x01]
 
-v_ashrrev_i32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x22]
+v_div_fmas_f64 v[5:6], vcc, |vcc|, vcc
+// CHECK: [0x05,0x02,0xe3,0xd1,0x6a,0xd4,0xa8,0x01]
 
-v_ashrrev_i32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x22]
+v_div_fmas_f64 v[5:6], vcc, vcc, |vcc|
+// CHECK: [0x05,0x04,0xe3,0xd1,0x6a,0xd4,0xa8,0x01]
 
-v_ashrrev_i32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x11,0xd1,0x00,0x00,0x00,0x00]
+v_div_fmas_f64 v[5:6], |vcc|, |vcc|, |vcc|
+// CHECK: [0x05,0x07,0xe3,0xd1,0x6a,0xd4,0xa8,0x01]
 
-v_ashrrev_i32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x11,0xd1,0x00,0x00,0x00,0x00]
+v_div_fmas_f64 v[5:6], vcc, vcc, vcc clamp
+// CHECK: [0x05,0x80,0xe3,0xd1,0x6a,0xd4,0xa8,0x01]
 
-v_ashrrev_i32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x11,0xd1,0x80,0x00,0x00,0x00]
+v_div_fmas_f64 v[5:6], vcc, vcc, vcc mul:2
+// CHECK: [0x05,0x00,0xe3,0xd1,0x6a,0xd4,0xa8,0x09]
 
-v_ashrrev_i32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x11,0xd1,0xc1,0x00,0x00,0x00]
+v_div_fmas_f64 v[5:6], vcc, vcc, vcc mul:4
+// CHECK: [0x05,0x00,0xe3,0xd1,0x6a,0xd4,0xa8,0x11]
 
-v_ashrrev_i32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x11,0xd1,0xf0,0x00,0x00,0x00]
+v_div_fmas_f64 v[5:6], vcc, vcc, vcc div:2
+// CHECK: [0x05,0x00,0xe3,0xd1,0x6a,0xd4,0xa8,0x19]
 
-v_ashrrev_i32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x11,0xd1,0xf7,0x00,0x00,0x00]
+v_msad_u8 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x01,0x00,0x01,0x02]
 
-v_ashrrev_i32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x11,0xd1,0x00,0x01,0x00,0x00]
+v_msad_u8 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xe4,0xd1,0x01,0x00,0x01,0x02]
 
-v_ashrrev_i32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x11,0xd1,0xff,0x01,0x00,0x00]
+v_msad_u8 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x65,0x00,0x01,0x02]
 
-v_ashrrev_i32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x11,0xd1,0x00,0x00,0x01,0x00]
+v_msad_u8 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x66,0x00,0x01,0x02]
 
-v_ashrrev_i32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x11,0xd1,0x00,0x82,0x01,0x00]
+v_msad_u8 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x67,0x00,0x01,0x02]
 
-v_ashrrev_i32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x11,0xd1,0x00,0xe0,0x01,0x00]
+v_msad_u8 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x6a,0x00,0x01,0x02]
 
-v_ashrrev_i32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x11,0xd1,0x00,0xee,0x01,0x00]
+v_msad_u8 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x6b,0x00,0x01,0x02]
 
-v_ashrrev_i32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x11,0xd1,0x00,0x00,0x02,0x00]
+v_msad_u8 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x6c,0x00,0x01,0x02]
 
-v_ashrrev_i32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x11,0xd1,0x00,0xfe,0x03,0x00]
+v_msad_u8 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x6d,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x24]
+v_msad_u8 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x6e,0x00,0x01,0x02]
 
-v_lshlrev_b32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x25]
+v_msad_u8 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x6f,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x24]
+v_msad_u8 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x7b,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x24]
+v_msad_u8 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x7c,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x24]
+v_msad_u8 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x7e,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x24]
+v_msad_u8 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x7f,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x24]
+v_msad_u8 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x80,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x24]
+v_msad_u8 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0xc1,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x24]
+v_msad_u8 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0xf0,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x24]
+v_msad_u8 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0xf7,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x24]
+v_msad_u8 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x01,0x01,0x01,0x02]
 
-v_lshlrev_b32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x24]
+v_msad_u8 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0xff,0x01,0x01,0x02]
 
-v_lshlrev_b32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x24]
+v_msad_u8 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x01,0x82,0x01,0x02]
 
-v_lshlrev_b32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x24]
+v_msad_u8 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x01,0xe0,0x01,0x02]
 
-v_lshlrev_b32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x24]
+v_msad_u8 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x01,0xee,0x01,0x02]
 
-v_lshlrev_b32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x24]
+v_msad_u8 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x01,0x04,0x02,0x02]
 
-v_lshlrev_b32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x24]
+v_msad_u8 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x01,0xfe,0x03,0x02]
 
-v_lshlrev_b32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x24]
+v_msad_u8 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xe4,0xd1,0x01,0x00,0x05,0x03]
 
-v_lshlrev_b32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x24]
+v_msad_u8 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xe4,0xd1,0x01,0x00,0xc1,0x03]
 
-v_lshlrev_b32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x24,0x56,0x34,0x12,0xaf]
+v_msad_u8 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xe4,0xd1,0x01,0x00,0xdd,0x03]
 
-v_lshlrev_b32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x24,0x73,0x72,0x71,0x3f]
+v_msad_u8 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xe4,0xd1,0x01,0x00,0x0d,0x04]
 
-v_lshlrev_b32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x24]
+v_msad_u8 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xe4,0xd1,0x01,0x00,0xfd,0x07]
 
-v_lshlrev_b32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x24]
+v_qsad_pk_u16_u8 v[5:6], s[2:3], 0, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0x02,0x00,0x01,0x02]
 
-v_lshlrev_b32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x24]
+v_qsad_pk_u16_u8 v[254:255], s[2:3], 0, 0
+// CHECK: [0xfe,0x00,0xe5,0xd1,0x02,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x12,0xd1,0x00,0x00,0x00,0x00]
+v_qsad_pk_u16_u8 v[5:6], s[4:5], 0, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0x04,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x12,0xd1,0x00,0x00,0x00,0x00]
+v_qsad_pk_u16_u8 v[5:6], s[100:101], 0, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0x64,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x12,0xd1,0x80,0x00,0x00,0x00]
+v_qsad_pk_u16_u8 v[5:6], flat_scratch, 0, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0x66,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x12,0xd1,0xc1,0x00,0x00,0x00]
+v_qsad_pk_u16_u8 v[5:6], vcc, 0, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0x6a,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x12,0xd1,0xf0,0x00,0x00,0x00]
+v_qsad_pk_u16_u8 v[5:6], tba, 0, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0x6c,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x12,0xd1,0xf7,0x00,0x00,0x00]
+v_qsad_pk_u16_u8 v[5:6], tma, 0, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0x6e,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x12,0xd1,0x00,0x01,0x00,0x00]
+v_qsad_pk_u16_u8 v[5:6], ttmp[10:11], 0, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0x7a,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x12,0xd1,0xff,0x01,0x00,0x00]
+v_qsad_pk_u16_u8 v[5:6], exec, 0, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0x7e,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x12,0xd1,0x00,0x00,0x01,0x00]
+v_qsad_pk_u16_u8 v[5:6], 0, 0, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0x80,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x12,0xd1,0x00,0x82,0x01,0x00]
+v_qsad_pk_u16_u8 v[5:6], -1, 0, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0xc1,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x12,0xd1,0x00,0xe0,0x01,0x00]
+v_qsad_pk_u16_u8 v[5:6], 0.5, 0, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0xf0,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x12,0xd1,0x00,0xee,0x01,0x00]
+v_qsad_pk_u16_u8 v[5:6], -4.0, 0, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0xf7,0x00,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x12,0xd1,0x00,0x00,0x02,0x00]
+v_qsad_pk_u16_u8 v[5:6], v[1:2], 0, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0x01,0x01,0x01,0x02]
 
-v_lshlrev_b32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x12,0xd1,0x00,0xfe,0x03,0x00]
+v_qsad_pk_u16_u8 v[5:6], v[254:255], 0, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0xfe,0x01,0x01,0x02]
 
-v_and_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x26]
+v_qsad_pk_u16_u8 v[5:6], s[2:3], -1, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0x02,0x82,0x01,0x02]
 
-v_and_b32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x27]
+v_qsad_pk_u16_u8 v[5:6], s[2:3], 0.5, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0x02,0xe0,0x01,0x02]
 
-v_and_b32 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x26]
+v_qsad_pk_u16_u8 v[5:6], s[2:3], -4.0, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0x02,0xee,0x01,0x02]
 
-v_and_b32 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x26]
+v_qsad_pk_u16_u8 v[5:6], s[2:3], v2, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0x02,0x04,0x02,0x02]
 
-v_and_b32 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x26]
+v_qsad_pk_u16_u8 v[5:6], s[2:3], v255, 0
+// CHECK: [0x05,0x00,0xe5,0xd1,0x02,0xfe,0x03,0x02]
 
-v_and_b32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x26]
+v_qsad_pk_u16_u8 v[5:6], s[2:3], 0, -1
+// CHECK: [0x05,0x00,0xe5,0xd1,0x02,0x00,0x05,0x03]
 
-v_and_b32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x26]
+v_qsad_pk_u16_u8 v[5:6], s[2:3], 0, 0.5
+// CHECK: [0x05,0x00,0xe5,0xd1,0x02,0x00,0xc1,0x03]
 
-v_and_b32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x26]
+v_qsad_pk_u16_u8 v[5:6], s[2:3], 0, -4.0
+// CHECK: [0x05,0x00,0xe5,0xd1,0x02,0x00,0xdd,0x03]
 
-v_and_b32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x26]
+v_qsad_pk_u16_u8 v[5:6], s[2:3], 0, v[3:4]
+// CHECK: [0x05,0x00,0xe5,0xd1,0x02,0x00,0x0d,0x04]
 
-v_and_b32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x26]
+v_qsad_pk_u16_u8 v[5:6], s[2:3], 0, v[254:255]
+// CHECK: [0x05,0x00,0xe5,0xd1,0x02,0x00,0xf9,0x07]
 
-v_and_b32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x26]
+v_mqsad_pk_u16_u8 v[5:6], s[2:3], 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0x02,0x00,0x01,0x02]
 
-v_and_b32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x26]
+v_mqsad_pk_u16_u8 v[254:255], s[2:3], 0, 0
+// CHECK: [0xfe,0x00,0xe6,0xd1,0x02,0x00,0x01,0x02]
 
-v_and_b32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x26]
+v_mqsad_pk_u16_u8 v[5:6], s[4:5], 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0x04,0x00,0x01,0x02]
 
-v_and_b32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x26]
+v_mqsad_pk_u16_u8 v[5:6], s[100:101], 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0x64,0x00,0x01,0x02]
 
-v_and_b32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x26]
+v_mqsad_pk_u16_u8 v[5:6], flat_scratch, 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0x66,0x00,0x01,0x02]
 
-v_and_b32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x26]
+v_mqsad_pk_u16_u8 v[5:6], vcc, 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0x6a,0x00,0x01,0x02]
 
-v_and_b32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x26]
+v_mqsad_pk_u16_u8 v[5:6], tba, 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0x6c,0x00,0x01,0x02]
 
-v_and_b32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x26]
+v_mqsad_pk_u16_u8 v[5:6], tma, 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0x6e,0x00,0x01,0x02]
 
-v_and_b32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x26]
+v_mqsad_pk_u16_u8 v[5:6], ttmp[10:11], 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0x7a,0x00,0x01,0x02]
 
-v_and_b32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x26,0x56,0x34,0x12,0xaf]
+v_mqsad_pk_u16_u8 v[5:6], exec, 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0x7e,0x00,0x01,0x02]
 
-v_and_b32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x26,0x73,0x72,0x71,0x3f]
+v_mqsad_pk_u16_u8 v[5:6], 0, 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0x80,0x00,0x01,0x02]
 
-v_and_b32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x26]
+v_mqsad_pk_u16_u8 v[5:6], -1, 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0xc1,0x00,0x01,0x02]
 
-v_and_b32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x26]
+v_mqsad_pk_u16_u8 v[5:6], 0.5, 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0xf0,0x00,0x01,0x02]
 
-v_and_b32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x26]
+v_mqsad_pk_u16_u8 v[5:6], -4.0, 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0xf7,0x00,0x01,0x02]
 
-v_and_b32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x13,0xd1,0x00,0x00,0x00,0x00]
+v_mqsad_pk_u16_u8 v[5:6], v[1:2], 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0x01,0x01,0x01,0x02]
 
-v_and_b32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x13,0xd1,0x00,0x00,0x00,0x00]
+v_mqsad_pk_u16_u8 v[5:6], v[254:255], 0, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0xfe,0x01,0x01,0x02]
 
-v_and_b32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x13,0xd1,0x80,0x00,0x00,0x00]
+v_mqsad_pk_u16_u8 v[5:6], s[2:3], -1, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0x02,0x82,0x01,0x02]
 
-v_and_b32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x13,0xd1,0xc1,0x00,0x00,0x00]
+v_mqsad_pk_u16_u8 v[5:6], s[2:3], 0.5, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0x02,0xe0,0x01,0x02]
 
-v_and_b32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x13,0xd1,0xf0,0x00,0x00,0x00]
+v_mqsad_pk_u16_u8 v[5:6], s[2:3], -4.0, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0x02,0xee,0x01,0x02]
 
-v_and_b32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x13,0xd1,0xf7,0x00,0x00,0x00]
+v_mqsad_pk_u16_u8 v[5:6], s[2:3], v2, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0x02,0x04,0x02,0x02]
 
-v_and_b32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x13,0xd1,0x00,0x01,0x00,0x00]
+v_mqsad_pk_u16_u8 v[5:6], s[2:3], v255, 0
+// CHECK: [0x05,0x00,0xe6,0xd1,0x02,0xfe,0x03,0x02]
 
-v_and_b32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x13,0xd1,0xff,0x01,0x00,0x00]
+v_mqsad_pk_u16_u8 v[5:6], s[2:3], 0, -1
+// CHECK: [0x05,0x00,0xe6,0xd1,0x02,0x00,0x05,0x03]
 
-v_and_b32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x13,0xd1,0x00,0x00,0x01,0x00]
+v_mqsad_pk_u16_u8 v[5:6], s[2:3], 0, 0.5
+// CHECK: [0x05,0x00,0xe6,0xd1,0x02,0x00,0xc1,0x03]
 
-v_and_b32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x13,0xd1,0x00,0x82,0x01,0x00]
+v_mqsad_pk_u16_u8 v[5:6], s[2:3], 0, -4.0
+// CHECK: [0x05,0x00,0xe6,0xd1,0x02,0x00,0xdd,0x03]
 
-v_and_b32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x13,0xd1,0x00,0xe0,0x01,0x00]
+v_mqsad_pk_u16_u8 v[5:6], s[2:3], 0, v[3:4]
+// CHECK: [0x05,0x00,0xe6,0xd1,0x02,0x00,0x0d,0x04]
 
-v_and_b32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x13,0xd1,0x00,0xee,0x01,0x00]
+v_mqsad_pk_u16_u8 v[5:6], s[2:3], 0, v[254:255]
+// CHECK: [0x05,0x00,0xe6,0xd1,0x02,0x00,0xf9,0x07]
 
-v_and_b32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x13,0xd1,0x00,0x00,0x02,0x00]
+v_mqsad_u32_u8 v[5:8], 0, s2, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0x04,0x0c,0x04]
 
-v_and_b32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x13,0xd1,0x00,0xfe,0x03,0x00]
+v_mqsad_u32_u8 v[252:255], 0, s2, v[3:6]
+// CHECK: [0xfc,0x00,0xe7,0xd1,0x80,0x04,0x0c,0x04]
 
-v_or_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], -1, s2, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0xc1,0x04,0x0c,0x04]
 
-v_or_b32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x29]
+v_mqsad_u32_u8 v[5:8], 0.5, s2, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0xf0,0x04,0x0c,0x04]
 
-v_or_b32 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], -4.0, s2, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0xf7,0x04,0x0c,0x04]
 
-v_or_b32 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], v[1:2], s2, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x01,0x05,0x0c,0x04]
 
-v_or_b32 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], v[254:255], s2, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0xfe,0x05,0x0c,0x04]
 
-v_or_b32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], 0, s101, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0xca,0x0c,0x04]
 
-v_or_b32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], 0, flat_scratch_lo, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0xcc,0x0c,0x04]
 
-v_or_b32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], 0, flat_scratch_hi, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0xce,0x0c,0x04]
 
-v_or_b32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], 0, vcc_lo, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0xd4,0x0c,0x04]
 
-v_or_b32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], 0, vcc_hi, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0xd6,0x0c,0x04]
 
-v_or_b32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], 0, tba_lo, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0xd8,0x0c,0x04]
 
-v_or_b32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], 0, tba_hi, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0xda,0x0c,0x04]
 
-v_or_b32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], 0, tma_lo, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0xdc,0x0c,0x04]
 
-v_or_b32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], 0, tma_hi, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0xde,0x0c,0x04]
 
-v_or_b32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], 0, ttmp11, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0xf6,0x0c,0x04]
 
-v_or_b32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], 0, m0, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0xf8,0x0c,0x04]
 
-v_or_b32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], 0, exec_lo, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0xfc,0x0c,0x04]
 
-v_or_b32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], 0, exec_hi, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0xfe,0x0c,0x04]
 
-v_or_b32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], 0, 0, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0x00,0x0d,0x04]
 
-v_or_b32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x28,0x56,0x34,0x12,0xaf]
+v_mqsad_u32_u8 v[5:8], 0, -1, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0x82,0x0d,0x04]
 
-v_or_b32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x28,0x73,0x72,0x71,0x3f]
+v_mqsad_u32_u8 v[5:8], 0, 0.5, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0xe0,0x0d,0x04]
 
-v_or_b32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], 0, -4.0, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0xee,0x0d,0x04]
 
-v_or_b32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x28]
+v_mqsad_u32_u8 v[5:8], 0, v2, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0x04,0x0e,0x04]
 
-v_or_b32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x28]
+v_mqsad_u32_u8 v[5:8], 0, v255, v[3:6]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0xfe,0x0f,0x04]
 
-v_or_b32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x14,0xd1,0x00,0x00,0x00,0x00]
+v_mqsad_u32_u8 v[5:8], 0, s2, v[252:255]
+// CHECK: [0x05,0x00,0xe7,0xd1,0x80,0x04,0xf0,0x07]
 
-v_or_b32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x14,0xd1,0x00,0x00,0x00,0x00]
+v_mad_f16 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x01,0x04,0x0e,0x04]
 
-v_or_b32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x14,0xd1,0x80,0x00,0x00,0x00]
+v_mad_f16 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0xea,0xd1,0x01,0x04,0x0e,0x04]
 
-v_or_b32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x14,0xd1,0xc1,0x00,0x00,0x00]
+v_mad_f16 v5, s101, v2, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x65,0x04,0x0e,0x04]
 
-v_or_b32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x14,0xd1,0xf0,0x00,0x00,0x00]
+v_mad_f16 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x66,0x04,0x0e,0x04]
 
-v_or_b32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x14,0xd1,0xf7,0x00,0x00,0x00]
+v_mad_f16 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x67,0x04,0x0e,0x04]
 
-v_or_b32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x14,0xd1,0x00,0x01,0x00,0x00]
+v_mad_f16 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x6a,0x04,0x0e,0x04]
 
-v_or_b32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x14,0xd1,0xff,0x01,0x00,0x00]
+v_mad_f16 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x6b,0x04,0x0e,0x04]
 
-v_or_b32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x14,0xd1,0x00,0x00,0x01,0x00]
+v_mad_f16 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x6c,0x04,0x0e,0x04]
 
-v_or_b32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x14,0xd1,0x00,0x82,0x01,0x00]
+v_mad_f16 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x6d,0x04,0x0e,0x04]
 
-v_or_b32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x14,0xd1,0x00,0xe0,0x01,0x00]
+v_mad_f16 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x6e,0x04,0x0e,0x04]
 
-v_or_b32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x14,0xd1,0x00,0xee,0x01,0x00]
+v_mad_f16 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x6f,0x04,0x0e,0x04]
 
-v_or_b32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x14,0xd1,0x00,0x00,0x02,0x00]
+v_mad_f16 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x7b,0x04,0x0e,0x04]
 
-v_or_b32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x14,0xd1,0x00,0xfe,0x03,0x00]
+v_mad_f16 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x7c,0x04,0x0e,0x04]
 
-v_xor_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x2a]
+v_mad_f16 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x7e,0x04,0x0e,0x04]
 
-v_xor_b32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x2b]
+v_mad_f16 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x7f,0x04,0x0e,0x04]
 
-v_xor_b32 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x2a]
+v_mad_f16 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0xfd,0x04,0x0e,0x04]
 
-v_xor_b32 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x2a]
+v_mad_f16 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x01,0x05,0x0e,0x04]
 
-v_xor_b32 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x2a]
+v_mad_f16 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0xff,0x05,0x0e,0x04]
 
-v_xor_b32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x2a]
+v_mad_f16 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x01,0xfe,0x0f,0x04]
 
-v_xor_b32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x2a]
+v_mad_f16 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0xea,0xd1,0x01,0x04,0xfe,0x07]
 
-v_xor_b32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x2a]
+v_mad_f16 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x01,0x04,0x0e,0x24]
 
-v_xor_b32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x2a]
+v_mad_f16 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x01,0x04,0x0e,0x44]
 
-v_xor_b32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x2a]
+v_mad_f16 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x01,0x04,0x0e,0x84]
 
-v_xor_b32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x2a]
+v_mad_f16 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0xea,0xd1,0x01,0x04,0x0e,0xe4]
 
-v_xor_b32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x2a]
+v_mad_f16 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0xea,0xd1,0x01,0x04,0x0e,0x04]
 
-v_xor_b32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x2a]
+v_mad_f16 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0xea,0xd1,0x01,0x04,0x0e,0x04]
 
-v_xor_b32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x2a]
+v_mad_f16 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0xea,0xd1,0x01,0x04,0x0e,0x04]
 
-v_xor_b32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x2a]
+v_mad_f16 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xea,0xd1,0x01,0x04,0x0e,0x04]
 
-v_xor_b32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x2a]
+v_mad_f16 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x80,0xea,0xd1,0x01,0x04,0x0e,0x04]
 
-v_xor_b32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x2a]
+v_mad_u16 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x01,0x00,0x01,0x02]
 
-v_xor_b32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x2a]
+v_mad_u16 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xeb,0xd1,0x01,0x00,0x01,0x02]
 
-v_xor_b32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x2a]
+v_mad_u16 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x65,0x00,0x01,0x02]
 
-v_xor_b32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x2a,0x56,0x34,0x12,0xaf]
+v_mad_u16 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x66,0x00,0x01,0x02]
 
-v_xor_b32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x2a,0x73,0x72,0x71,0x3f]
+v_mad_u16 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x67,0x00,0x01,0x02]
 
-v_xor_b32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x2a]
+v_mad_u16 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x6a,0x00,0x01,0x02]
 
-v_xor_b32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x2a]
+v_mad_u16 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x6b,0x00,0x01,0x02]
 
-v_xor_b32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x2a]
+v_mad_u16 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x6c,0x00,0x01,0x02]
 
-v_xor_b32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x15,0xd1,0x00,0x00,0x00,0x00]
+v_mad_u16 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x6d,0x00,0x01,0x02]
 
-v_xor_b32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x15,0xd1,0x00,0x00,0x00,0x00]
+v_mad_u16 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x6e,0x00,0x01,0x02]
 
-v_xor_b32_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x15,0xd1,0x80,0x00,0x00,0x00]
+v_mad_u16 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x6f,0x00,0x01,0x02]
 
-v_xor_b32_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x15,0xd1,0xc1,0x00,0x00,0x00]
+v_mad_u16 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x7b,0x00,0x01,0x02]
 
-v_xor_b32_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x15,0xd1,0xf0,0x00,0x00,0x00]
+v_mad_u16 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x7c,0x00,0x01,0x02]
 
-v_xor_b32_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x15,0xd1,0xf7,0x00,0x00,0x00]
+v_mad_u16 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x7e,0x00,0x01,0x02]
 
-v_xor_b32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x15,0xd1,0x00,0x01,0x00,0x00]
+v_mad_u16 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x7f,0x00,0x01,0x02]
 
-v_xor_b32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x15,0xd1,0xff,0x01,0x00,0x00]
+v_mad_u16 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x80,0x00,0x01,0x02]
 
-v_xor_b32_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x15,0xd1,0x00,0x00,0x01,0x00]
+v_mad_u16 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0xc1,0x00,0x01,0x02]
 
-v_xor_b32_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x15,0xd1,0x00,0x82,0x01,0x00]
+v_mad_u16 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0xf0,0x00,0x01,0x02]
 
-v_xor_b32_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x15,0xd1,0x00,0xe0,0x01,0x00]
+v_mad_u16 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0xf7,0x00,0x01,0x02]
 
-v_xor_b32_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x15,0xd1,0x00,0xee,0x01,0x00]
+v_mad_u16 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x01,0x01,0x01,0x02]
 
-v_xor_b32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x15,0xd1,0x00,0x00,0x02,0x00]
+v_mad_u16 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0xff,0x01,0x01,0x02]
 
-v_xor_b32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x15,0xd1,0x00,0xfe,0x03,0x00]
+v_mad_u16 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x01,0x82,0x01,0x02]
 
-v_mac_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x2c]
+v_mad_u16 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x01,0xe0,0x01,0x02]
 
-v_mac_f32 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x2d]
+v_mad_u16 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x01,0xee,0x01,0x02]
 
-v_mac_f32 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x2c]
+v_mad_u16 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x01,0x04,0x02,0x02]
 
-v_mac_f32 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x2c]
+v_mad_u16 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x01,0xfe,0x03,0x02]
 
-v_mac_f32 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x2c]
+v_mad_u16 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xeb,0xd1,0x01,0x00,0x05,0x03]
 
-v_mac_f32 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x2c]
+v_mad_u16 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xeb,0xd1,0x01,0x00,0xc1,0x03]
 
-v_mac_f32 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x2c]
+v_mad_u16 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xeb,0xd1,0x01,0x00,0xdd,0x03]
 
-v_mac_f32 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x2c]
+v_mad_u16 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xeb,0xd1,0x01,0x00,0x0d,0x04]
 
-v_mac_f32 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x2c]
+v_mad_u16 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xeb,0xd1,0x01,0x00,0xfd,0x07]
 
-v_mac_f32 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x2c]
+v_mad_i16 v5, s1, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x01,0x00,0x01,0x02]
 
-v_mac_f32 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x2c]
+v_mad_i16 v255, s1, 0, 0
+// CHECK: [0xff,0x00,0xec,0xd1,0x01,0x00,0x01,0x02]
 
-v_mac_f32 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x2c]
+v_mad_i16 v5, s101, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x65,0x00,0x01,0x02]
 
-v_mac_f32 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x2c]
+v_mad_i16 v5, flat_scratch_lo, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x66,0x00,0x01,0x02]
 
-v_mac_f32 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x2c]
+v_mad_i16 v5, flat_scratch_hi, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x67,0x00,0x01,0x02]
 
-v_mac_f32 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x2c]
+v_mad_i16 v5, vcc_lo, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x6a,0x00,0x01,0x02]
 
-v_mac_f32 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x2c]
+v_mad_i16 v5, vcc_hi, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x6b,0x00,0x01,0x02]
 
-v_mac_f32 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x2c]
+v_mad_i16 v5, tba_lo, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x6c,0x00,0x01,0x02]
 
-v_mac_f32 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x2c]
+v_mad_i16 v5, tba_hi, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x6d,0x00,0x01,0x02]
 
-v_mac_f32 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x2c]
+v_mad_i16 v5, tma_lo, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x6e,0x00,0x01,0x02]
 
-v_mac_f32 v0, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x2c,0x56,0x34,0x12,0xaf]
+v_mad_i16 v5, tma_hi, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x6f,0x00,0x01,0x02]
 
-v_mac_f32 v0, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x2c,0x73,0x72,0x71,0x3f]
+v_mad_i16 v5, ttmp11, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x7b,0x00,0x01,0x02]
 
-v_mac_f32 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x2c]
+v_mad_i16 v5, m0, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x7c,0x00,0x01,0x02]
 
-v_mac_f32 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x2c]
+v_mad_i16 v5, exec_lo, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x7e,0x00,0x01,0x02]
 
-v_mac_f32 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x2c]
+v_mad_i16 v5, exec_hi, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x7f,0x00,0x01,0x02]
 
-v_mac_f32_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x16,0xd1,0x00,0x00,0x00,0x00]
+v_mad_i16 v5, 0, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x80,0x00,0x01,0x02]
 
-v_mac_f32_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x16,0xd1,0x00,0x00,0x00,0x00]
+v_mad_i16 v5, -1, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0xc1,0x00,0x01,0x02]
 
-v_mac_f32_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x16,0xd1,0xfd,0x00,0x00,0x00]
+v_mad_i16 v5, 0.5, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0xf0,0x00,0x01,0x02]
 
-v_mac_f32_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x16,0xd1,0x00,0x01,0x00,0x00]
+v_mad_i16 v5, -4.0, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0xf7,0x00,0x01,0x02]
 
-v_mac_f32_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x16,0xd1,0xff,0x01,0x00,0x00]
+v_mad_i16 v5, v1, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x01,0x01,0x01,0x02]
 
-v_mac_f32_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x16,0xd1,0x00,0xfa,0x01,0x00]
+v_mad_i16 v5, v255, 0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0xff,0x01,0x01,0x02]
 
-v_mac_f32_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x16,0xd1,0x00,0x00,0x02,0x00]
+v_mad_i16 v5, s1, -1, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x01,0x82,0x01,0x02]
 
-v_mac_f32_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x16,0xd1,0x00,0xfe,0x03,0x00]
+v_mad_i16 v5, s1, 0.5, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x01,0xe0,0x01,0x02]
 
-v_mac_f32_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x16,0xd1,0x00,0x00,0x00,0x20]
+v_mad_i16 v5, s1, -4.0, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x01,0xee,0x01,0x02]
 
-v_mac_f32_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x16,0xd1,0x00,0x00,0x00,0x40]
+v_mad_i16 v5, s1, v2, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x01,0x04,0x02,0x02]
 
-v_mac_f32_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x16,0xd1,0x00,0x00,0x00,0x60]
+v_mad_i16 v5, s1, v255, 0
+// CHECK: [0x05,0x00,0xec,0xd1,0x01,0xfe,0x03,0x02]
 
-v_mac_f32_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x16,0xd1,0x00,0x00,0x00,0x00]
+v_mad_i16 v5, s1, 0, -1
+// CHECK: [0x05,0x00,0xec,0xd1,0x01,0x00,0x05,0x03]
 
-v_mac_f32_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x16,0xd1,0x00,0x00,0x00,0x00]
+v_mad_i16 v5, s1, 0, 0.5
+// CHECK: [0x05,0x00,0xec,0xd1,0x01,0x00,0xc1,0x03]
 
-v_mac_f32_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x16,0xd1,0x00,0x00,0x00,0x00]
+v_mad_i16 v5, s1, 0, -4.0
+// CHECK: [0x05,0x00,0xec,0xd1,0x01,0x00,0xdd,0x03]
 
-v_mac_f32_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x16,0xd1,0x00,0x00,0x00,0x00]
+v_mad_i16 v5, s1, 0, v3
+// CHECK: [0x05,0x00,0xec,0xd1,0x01,0x00,0x0d,0x04]
 
-v_mac_f32_e64 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x16,0xd1,0x00,0x00,0x00,0x08]
+v_mad_i16 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xec,0xd1,0x01,0x00,0xfd,0x07]
 
-v_mac_f32_e64 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x16,0xd1,0x00,0x00,0x00,0x10]
+v_fma_f16 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x01,0x04,0x0e,0x04]
 
-v_mac_f32_e64 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x16,0xd1,0x00,0x00,0x00,0x18]
+v_fma_f16 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0xee,0xd1,0x01,0x04,0x0e,0x04]
 
-v_madmk_f32 v0, 0, 0x11213141, v0
-// CHECK: [0x80,0x00,0x00,0x2e,0x41,0x31,0x21,0x11]
+v_fma_f16 v5, s101, v2, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x65,0x04,0x0e,0x04]
 
-v_madmk_f32 v255, 0, 0x11213141, v0
-// CHECK: [0x80,0x00,0xfe,0x2f,0x41,0x31,0x21,0x11]
+v_fma_f16 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x66,0x04,0x0e,0x04]
 
-v_madmk_f32 v0, -1, 0x11213141, v0
-// CHECK: [0xc1,0x00,0x00,0x2e,0x41,0x31,0x21,0x11]
+v_fma_f16 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x67,0x04,0x0e,0x04]
 
-v_madmk_f32 v0, 0.5, 0x11213141, v0
-// CHECK: [0xf0,0x00,0x00,0x2e,0x41,0x31,0x21,0x11]
+v_fma_f16 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x6a,0x04,0x0e,0x04]
 
-v_madmk_f32 v0, -4.0, 0x11213141, v0
-// CHECK: [0xf7,0x00,0x00,0x2e,0x41,0x31,0x21,0x11]
+v_fma_f16 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x6b,0x04,0x0e,0x04]
 
-v_madmk_f32 v0, v0, 0x11213141, v0
-// CHECK: [0x00,0x01,0x00,0x2e,0x41,0x31,0x21,0x11]
+v_fma_f16 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x6c,0x04,0x0e,0x04]
 
-v_madmk_f32 v0, v255, 0x11213141, v0
-// CHECK: [0xff,0x01,0x00,0x2e,0x41,0x31,0x21,0x11]
+v_fma_f16 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x6d,0x04,0x0e,0x04]
 
-v_madmk_f32 v0, 0, 0xa1b1c1d1, v0
-// CHECK: [0x80,0x00,0x00,0x2e,0xd1,0xc1,0xb1,0xa1]
+v_fma_f16 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x6e,0x04,0x0e,0x04]
 
-v_madmk_f32 v0, 0, 0x11213141, v255
-// CHECK: [0x80,0xfe,0x01,0x2e,0x41,0x31,0x21,0x11]
+v_fma_f16 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x6f,0x04,0x0e,0x04]
 
-v_madak_f32 v0, 0, v0, 0x11213141
-// CHECK: [0x80,0x00,0x00,0x30,0x41,0x31,0x21,0x11]
+v_fma_f16 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x7b,0x04,0x0e,0x04]
 
-v_madak_f32 v255, 0, v0, 0x11213141
-// CHECK: [0x80,0x00,0xfe,0x31,0x41,0x31,0x21,0x11]
+v_fma_f16 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x7c,0x04,0x0e,0x04]
 
-v_madak_f32 v0, -1, v0, 0x11213141
-// CHECK: [0xc1,0x00,0x00,0x30,0x41,0x31,0x21,0x11]
+v_fma_f16 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x7e,0x04,0x0e,0x04]
 
-v_madak_f32 v0, 0.5, v0, 0x11213141
-// CHECK: [0xf0,0x00,0x00,0x30,0x41,0x31,0x21,0x11]
+v_fma_f16 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x7f,0x04,0x0e,0x04]
 
-v_madak_f32 v0, -4.0, v0, 0x11213141
-// CHECK: [0xf7,0x00,0x00,0x30,0x41,0x31,0x21,0x11]
+v_fma_f16 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0xfd,0x04,0x0e,0x04]
 
-v_madak_f32 v0, v0, v0, 0x11213141
-// CHECK: [0x00,0x01,0x00,0x30,0x41,0x31,0x21,0x11]
+v_fma_f16 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x01,0x05,0x0e,0x04]
 
-v_madak_f32 v0, v255, v0, 0x11213141
-// CHECK: [0xff,0x01,0x00,0x30,0x41,0x31,0x21,0x11]
+v_fma_f16 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0xff,0x05,0x0e,0x04]
 
-v_madak_f32 v0, 0, v255, 0x11213141
-// CHECK: [0x80,0xfe,0x01,0x30,0x41,0x31,0x21,0x11]
+v_fma_f16 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x01,0xfe,0x0f,0x04]
 
-v_madak_f32 v0, 0, v0, 0xa1b1c1d1
-// CHECK: [0x80,0x00,0x00,0x30,0xd1,0xc1,0xb1,0xa1]
+v_fma_f16 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0xee,0xd1,0x01,0x04,0xfe,0x07]
 
-v_add_u32 v0, vcc, s0, v0
-// CHECK: [0x00,0x00,0x00,0x32]
+v_fma_f16 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x01,0x04,0x0e,0x24]
 
-v_add_u32 v255, vcc, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x33]
+v_fma_f16 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x01,0x04,0x0e,0x44]
 
-v_add_u32 v0, vcc, s101, v0
-// CHECK: [0x65,0x00,0x00,0x32]
+v_fma_f16 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x01,0x04,0x0e,0x84]
 
-v_add_u32 v0, vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x32]
+v_fma_f16 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0xee,0xd1,0x01,0x04,0x0e,0xe4]
 
-v_add_u32 v0, vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x32]
+v_fma_f16 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0xee,0xd1,0x01,0x04,0x0e,0x04]
 
-v_add_u32 v0, vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x32]
+v_fma_f16 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0xee,0xd1,0x01,0x04,0x0e,0x04]
 
-v_add_u32 v0, vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x32]
+v_fma_f16 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0xee,0xd1,0x01,0x04,0x0e,0x04]
 
-v_add_u32 v0, vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x32]
+v_fma_f16 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xee,0xd1,0x01,0x04,0x0e,0x04]
 
-v_add_u32 v0, vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x32]
+v_fma_f16 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x80,0xee,0xd1,0x01,0x04,0x0e,0x04]
 
-v_add_u32 v0, vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x32]
+v_div_fixup_f16 v5, s1, v2, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x01,0x04,0x0e,0x04]
 
-v_add_u32 v0, vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x32]
+v_div_fixup_f16 v255, s1, v2, v3
+// CHECK: [0xff,0x00,0xef,0xd1,0x01,0x04,0x0e,0x04]
 
-v_add_u32 v0, vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x32]
+v_div_fixup_f16 v5, s101, v2, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x65,0x04,0x0e,0x04]
 
-v_add_u32 v0, vcc, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x32]
+v_div_fixup_f16 v5, flat_scratch_lo, v2, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x66,0x04,0x0e,0x04]
 
-v_add_u32 v0, vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x32]
+v_div_fixup_f16 v5, flat_scratch_hi, v2, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x67,0x04,0x0e,0x04]
 
-v_add_u32 v0, vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x32]
+v_div_fixup_f16 v5, vcc_lo, v2, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x6a,0x04,0x0e,0x04]
 
-v_add_u32 v0, vcc, 0, v0
-// CHECK: [0x80,0x00,0x00,0x32]
+v_div_fixup_f16 v5, vcc_hi, v2, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x6b,0x04,0x0e,0x04]
 
-v_add_u32 v0, vcc, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x32]
+v_div_fixup_f16 v5, tba_lo, v2, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x6c,0x04,0x0e,0x04]
 
-v_add_u32 v0, vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x32]
+v_div_fixup_f16 v5, tba_hi, v2, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x6d,0x04,0x0e,0x04]
 
-v_add_u32 v0, vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x32]
+v_div_fixup_f16 v5, tma_lo, v2, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x6e,0x04,0x0e,0x04]
 
-v_add_u32 v0, vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x32,0x56,0x34,0x12,0xaf]
+v_div_fixup_f16 v5, tma_hi, v2, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x6f,0x04,0x0e,0x04]
 
-v_add_u32 v0, vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x32,0x73,0x72,0x71,0x3f]
+v_div_fixup_f16 v5, ttmp11, v2, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x7b,0x04,0x0e,0x04]
 
-v_add_u32 v0, vcc, v0, v0
-// CHECK: [0x00,0x01,0x00,0x32]
+v_div_fixup_f16 v5, m0, v2, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x7c,0x04,0x0e,0x04]
 
-v_add_u32 v0, vcc, v255, v0
-// CHECK: [0xff,0x01,0x00,0x32]
+v_div_fixup_f16 v5, exec_lo, v2, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x7e,0x04,0x0e,0x04]
 
-v_add_u32 v0, vcc, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x32]
+v_div_fixup_f16 v5, exec_hi, v2, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x7f,0x04,0x0e,0x04]
 
-v_add_u32_e64 v0, s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x19,0xd1,0x00,0x00,0x00,0x00]
+v_div_fixup_f16 v5, scc, v2, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0xfd,0x04,0x0e,0x04]
 
-v_add_u32_e64 v255, s[0:1], s0, s0
-// CHECK: [0xff,0x00,0x19,0xd1,0x00,0x00,0x00,0x00]
+v_div_fixup_f16 v5, v1, v2, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x01,0x05,0x0e,0x04]
 
-v_add_u32_e64 v0, s[2:3], s0, s0
-// CHECK: [0x00,0x02,0x19,0xd1,0x00,0x00,0x00,0x00]
+v_div_fixup_f16 v5, v255, v2, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0xff,0x05,0x0e,0x04]
 
-v_add_u32_e64 v0, s[100:101], s0, s0
-// CHECK: [0x00,0x64,0x19,0xd1,0x00,0x00,0x00,0x00]
+v_div_fixup_f16 v5, s1, v255, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x01,0xfe,0x0f,0x04]
 
-v_add_u32_e64 v0, flat_scratch, s0, s0
-// CHECK: [0x00,0x66,0x19,0xd1,0x00,0x00,0x00,0x00]
+v_div_fixup_f16 v5, s1, v2, v255
+// CHECK: [0x05,0x00,0xef,0xd1,0x01,0x04,0xfe,0x07]
 
-v_add_u32_e64 v0, vcc, s0, s0
-// CHECK: [0x00,0x6a,0x19,0xd1,0x00,0x00,0x00,0x00]
+v_div_fixup_f16 v5, -s1, v2, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x01,0x04,0x0e,0x24]
 
-v_add_u32_e64 v0, tba, s0, s0
-// CHECK: [0x00,0x6c,0x19,0xd1,0x00,0x00,0x00,0x00]
+v_div_fixup_f16 v5, s1, -v2, v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x01,0x04,0x0e,0x44]
 
-v_add_u32_e64 v0, tma, s0, s0
-// CHECK: [0x00,0x6e,0x19,0xd1,0x00,0x00,0x00,0x00]
+v_div_fixup_f16 v5, s1, v2, -v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x01,0x04,0x0e,0x84]
 
-v_add_u32_e64 v0, ttmp[10:11], s0, s0
-// CHECK: [0x00,0x7a,0x19,0xd1,0x00,0x00,0x00,0x00]
+v_div_fixup_f16 v5, -s1, -v2, -v3
+// CHECK: [0x05,0x00,0xef,0xd1,0x01,0x04,0x0e,0xe4]
 
-v_add_u32_e64 v0, s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x19,0xd1,0x80,0x00,0x00,0x00]
+v_div_fixup_f16 v5, |s1|, v2, v3
+// CHECK: [0x05,0x01,0xef,0xd1,0x01,0x04,0x0e,0x04]
 
-v_add_u32_e64 v0, s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x19,0xd1,0xc1,0x00,0x00,0x00]
+v_div_fixup_f16 v5, s1, |v2|, v3
+// CHECK: [0x05,0x02,0xef,0xd1,0x01,0x04,0x0e,0x04]
 
-v_add_u32_e64 v0, s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x19,0xd1,0xf0,0x00,0x00,0x00]
+v_div_fixup_f16 v5, s1, v2, |v3|
+// CHECK: [0x05,0x04,0xef,0xd1,0x01,0x04,0x0e,0x04]
 
-v_add_u32_e64 v0, s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x19,0xd1,0xf7,0x00,0x00,0x00]
+v_div_fixup_f16 v5, |s1|, |v2|, |v3|
+// CHECK: [0x05,0x07,0xef,0xd1,0x01,0x04,0x0e,0x04]
 
-v_add_u32_e64 v0, s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x19,0xd1,0x00,0x01,0x00,0x00]
+v_div_fixup_f16 v5, s1, v2, v3 clamp
+// CHECK: [0x05,0x80,0xef,0xd1,0x01,0x04,0x0e,0x04]
 
-v_add_u32_e64 v0, s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x19,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_pkaccum_u8_f32 v5, v1, s2
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0x05,0x00,0x00]
 
-v_add_u32_e64 v0, s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x19,0xd1,0x00,0x00,0x01,0x00]
+v_cvt_pkaccum_u8_f32 v255, v1, s2
+// CHECK: [0xff,0x00,0xf0,0xd1,0x01,0x05,0x00,0x00]
 
-v_add_u32_e64 v0, s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x19,0xd1,0x00,0x82,0x01,0x00]
+v_cvt_pkaccum_u8_f32 v5, v255, s2
+// CHECK: [0x05,0x00,0xf0,0xd1,0xff,0x05,0x00,0x00]
 
-v_add_u32_e64 v0, s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x19,0xd1,0x00,0xe0,0x01,0x00]
+v_cvt_pkaccum_u8_f32 v5, v1, s101
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0xcb,0x00,0x00]
 
-v_add_u32_e64 v0, s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x19,0xd1,0x00,0xee,0x01,0x00]
+v_cvt_pkaccum_u8_f32 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0xcd,0x00,0x00]
 
-v_add_u32_e64 v0, s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x19,0xd1,0x00,0x00,0x02,0x00]
+v_cvt_pkaccum_u8_f32 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0xcf,0x00,0x00]
 
-v_add_u32_e64 v0, s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x19,0xd1,0x00,0xfe,0x03,0x00]
+v_cvt_pkaccum_u8_f32 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0xd5,0x00,0x00]
 
-v_sub_u32 v0, vcc, s0, v0
-// CHECK: [0x00,0x00,0x00,0x34]
+v_cvt_pkaccum_u8_f32 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0xd7,0x00,0x00]
 
-v_sub_u32 v255, vcc, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x35]
+v_cvt_pkaccum_u8_f32 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0xd9,0x00,0x00]
 
-v_sub_u32 v0, vcc, s101, v0
-// CHECK: [0x65,0x00,0x00,0x34]
+v_cvt_pkaccum_u8_f32 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0xdb,0x00,0x00]
 
-v_sub_u32 v0, vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x34]
+v_cvt_pkaccum_u8_f32 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0xdd,0x00,0x00]
 
-v_sub_u32 v0, vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x34]
+v_cvt_pkaccum_u8_f32 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0xdf,0x00,0x00]
 
-v_sub_u32 v0, vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x34]
+v_cvt_pkaccum_u8_f32 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0xf7,0x00,0x00]
 
-v_sub_u32 v0, vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x34]
+v_cvt_pkaccum_u8_f32 v5, v1, m0
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0xf9,0x00,0x00]
 
-v_sub_u32 v0, vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x34]
+v_cvt_pkaccum_u8_f32 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0xfd,0x00,0x00]
 
-v_sub_u32 v0, vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x34]
+v_cvt_pkaccum_u8_f32 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0xff,0x00,0x00]
 
-v_sub_u32 v0, vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x34]
+v_cvt_pkaccum_u8_f32 v5, v1, 0
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0x01,0x01,0x00]
 
-v_sub_u32 v0, vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x34]
+v_cvt_pkaccum_u8_f32 v5, v1, -1
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0x83,0x01,0x00]
 
-v_sub_u32 v0, vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x34]
+v_cvt_pkaccum_u8_f32 v5, v1, 0.5
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0xe1,0x01,0x00]
 
-v_sub_u32 v0, vcc, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x34]
+v_cvt_pkaccum_u8_f32 v5, v1, -4.0
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0xef,0x01,0x00]
 
-v_sub_u32 v0, vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x34]
+v_cvt_pkaccum_u8_f32 v5, v1, scc
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0xfb,0x01,0x00]
 
-v_sub_u32 v0, vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x34]
+v_cvt_pkaccum_u8_f32 v5, v1, v2
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0x05,0x02,0x00]
 
-v_sub_u32 v0, vcc, 0, v0
-// CHECK: [0x80,0x00,0x00,0x34]
+v_cvt_pkaccum_u8_f32 v5, v1, v255
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0xff,0x03,0x00]
 
-v_sub_u32 v0, vcc, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x34]
+v_cvt_pkaccum_u8_f32 v5, -v1, s2
+// CHECK: [0x05,0x00,0xf0,0xd1,0x01,0x05,0x00,0x20]
 
-v_sub_u32 v0, vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x34]
+v_cvt_pkaccum_u8_f32 v5, |v1|, s2
+// CHECK: [0x05,0x01,0xf0,0xd1,0x01,0x05,0x00,0x00]
 
-v_sub_u32 v0, vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x34]
+v_cvt_pkaccum_u8_f32 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0xf0,0xd1,0x01,0x05,0x00,0x00]
 
-v_sub_u32 v0, vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x34,0x56,0x34,0x12,0xaf]
+v_add_f64 v[5:6], s[4:5], s[4:5]
+// CHECK: [0x05,0x00,0x80,0xd2,0x04,0x08,0x00,0x00]
 
-v_sub_u32 v0, vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x34,0x73,0x72,0x71,0x3f]
+v_add_f64 v[254:255], s[4:5], s[4:5]
+// CHECK: [0xfe,0x00,0x80,0xd2,0x04,0x08,0x00,0x00]
 
-v_sub_u32 v0, vcc, v0, v0
-// CHECK: [0x00,0x01,0x00,0x34]
+v_add_f64 v[5:6], v[1:2], s[4:5]
+// CHECK: [0x05,0x00,0x80,0xd2,0x01,0x09,0x00,0x00]
 
-v_sub_u32 v0, vcc, v255, v0
-// CHECK: [0xff,0x01,0x00,0x34]
+v_add_f64 v[5:6], v[254:255], s[4:5]
+// CHECK: [0x05,0x00,0x80,0xd2,0xfe,0x09,0x00,0x00]
 
-v_sub_u32 v0, vcc, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x34]
+v_add_f64 v[5:6], s[4:5], v[2:3]
+// CHECK: [0x05,0x00,0x80,0xd2,0x04,0x04,0x02,0x00]
 
-v_sub_u32_e64 v0, s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x1a,0xd1,0x00,0x00,0x00,0x00]
+v_add_f64 v[5:6], s[4:5], v[254:255]
+// CHECK: [0x05,0x00,0x80,0xd2,0x04,0xfc,0x03,0x00]
 
-v_sub_u32_e64 v255, s[0:1], s0, s0
-// CHECK: [0xff,0x00,0x1a,0xd1,0x00,0x00,0x00,0x00]
+v_add_f64 v[5:6], -s[4:5], s[4:5]
+// CHECK: [0x05,0x00,0x80,0xd2,0x04,0x08,0x00,0x20]
 
-v_sub_u32_e64 v0, s[2:3], s0, s0
-// CHECK: [0x00,0x02,0x1a,0xd1,0x00,0x00,0x00,0x00]
+v_add_f64 v[5:6], s[4:5], -s[4:5]
+// CHECK: [0x05,0x00,0x80,0xd2,0x04,0x08,0x00,0x40]
 
-v_sub_u32_e64 v0, s[100:101], s0, s0
-// CHECK: [0x00,0x64,0x1a,0xd1,0x00,0x00,0x00,0x00]
+v_add_f64 v[5:6], -s[4:5], -s[4:5]
+// CHECK: [0x05,0x00,0x80,0xd2,0x04,0x08,0x00,0x60]
 
-v_sub_u32_e64 v0, flat_scratch, s0, s0
-// CHECK: [0x00,0x66,0x1a,0xd1,0x00,0x00,0x00,0x00]
+v_add_f64 v[5:6], |s[4:5]|, s[4:5]
+// CHECK: [0x05,0x01,0x80,0xd2,0x04,0x08,0x00,0x00]
 
-v_sub_u32_e64 v0, vcc, s0, s0
-// CHECK: [0x00,0x6a,0x1a,0xd1,0x00,0x00,0x00,0x00]
+v_add_f64 v[5:6], s[4:5], |s[4:5]|
+// CHECK: [0x05,0x02,0x80,0xd2,0x04,0x08,0x00,0x00]
 
-v_sub_u32_e64 v0, tba, s0, s0
-// CHECK: [0x00,0x6c,0x1a,0xd1,0x00,0x00,0x00,0x00]
+v_add_f64 v[5:6], |s[4:5]|, |s[4:5]|
+// CHECK: [0x05,0x03,0x80,0xd2,0x04,0x08,0x00,0x00]
 
-v_sub_u32_e64 v0, tma, s0, s0
-// CHECK: [0x00,0x6e,0x1a,0xd1,0x00,0x00,0x00,0x00]
+v_add_f64 v[5:6], s[4:5], s[4:5] clamp
+// CHECK: [0x05,0x80,0x80,0xd2,0x04,0x08,0x00,0x00]
 
-v_sub_u32_e64 v0, ttmp[10:11], s0, s0
-// CHECK: [0x00,0x7a,0x1a,0xd1,0x00,0x00,0x00,0x00]
+v_add_f64 v[5:6], s[4:5], s[4:5] mul:2
+// CHECK: [0x05,0x00,0x80,0xd2,0x04,0x08,0x00,0x08]
 
-v_sub_u32_e64 v0, s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x1a,0xd1,0x80,0x00,0x00,0x00]
+v_add_f64 v[5:6], s[4:5], s[4:5] mul:4
+// CHECK: [0x05,0x00,0x80,0xd2,0x04,0x08,0x00,0x10]
 
-v_sub_u32_e64 v0, s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x1a,0xd1,0xc1,0x00,0x00,0x00]
+v_add_f64 v[5:6], s[4:5], s[4:5] div:2
+// CHECK: [0x05,0x00,0x80,0xd2,0x04,0x08,0x00,0x18]
 
-v_sub_u32_e64 v0, s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x1a,0xd1,0xf0,0x00,0x00,0x00]
+v_mul_f64 v[5:6], s[4:5], s[4:5]
+// CHECK: [0x05,0x00,0x81,0xd2,0x04,0x08,0x00,0x00]
 
-v_sub_u32_e64 v0, s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x1a,0xd1,0xf7,0x00,0x00,0x00]
+v_mul_f64 v[254:255], s[4:5], s[4:5]
+// CHECK: [0xfe,0x00,0x81,0xd2,0x04,0x08,0x00,0x00]
 
-v_sub_u32_e64 v0, s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x1a,0xd1,0x00,0x01,0x00,0x00]
+v_mul_f64 v[5:6], v[1:2], s[4:5]
+// CHECK: [0x05,0x00,0x81,0xd2,0x01,0x09,0x00,0x00]
 
-v_sub_u32_e64 v0, s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x1a,0xd1,0xff,0x01,0x00,0x00]
+v_mul_f64 v[5:6], v[254:255], s[4:5]
+// CHECK: [0x05,0x00,0x81,0xd2,0xfe,0x09,0x00,0x00]
 
-v_sub_u32_e64 v0, s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x1a,0xd1,0x00,0x00,0x01,0x00]
+v_mul_f64 v[5:6], s[4:5], v[2:3]
+// CHECK: [0x05,0x00,0x81,0xd2,0x04,0x04,0x02,0x00]
 
-v_sub_u32_e64 v0, s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x1a,0xd1,0x00,0x82,0x01,0x00]
+v_mul_f64 v[5:6], s[4:5], v[254:255]
+// CHECK: [0x05,0x00,0x81,0xd2,0x04,0xfc,0x03,0x00]
 
-v_sub_u32_e64 v0, s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x1a,0xd1,0x00,0xe0,0x01,0x00]
+v_mul_f64 v[5:6], -s[4:5], s[4:5]
+// CHECK: [0x05,0x00,0x81,0xd2,0x04,0x08,0x00,0x20]
 
-v_sub_u32_e64 v0, s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x1a,0xd1,0x00,0xee,0x01,0x00]
+v_mul_f64 v[5:6], s[4:5], -s[4:5]
+// CHECK: [0x05,0x00,0x81,0xd2,0x04,0x08,0x00,0x40]
 
-v_sub_u32_e64 v0, s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x1a,0xd1,0x00,0x00,0x02,0x00]
+v_mul_f64 v[5:6], -s[4:5], -s[4:5]
+// CHECK: [0x05,0x00,0x81,0xd2,0x04,0x08,0x00,0x60]
 
-v_sub_u32_e64 v0, s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x1a,0xd1,0x00,0xfe,0x03,0x00]
+v_mul_f64 v[5:6], |s[4:5]|, s[4:5]
+// CHECK: [0x05,0x01,0x81,0xd2,0x04,0x08,0x00,0x00]
 
-v_subrev_u32 v0, vcc, s0, v0
-// CHECK: [0x00,0x00,0x00,0x36]
+v_mul_f64 v[5:6], s[4:5], |s[4:5]|
+// CHECK: [0x05,0x02,0x81,0xd2,0x04,0x08,0x00,0x00]
 
-v_subrev_u32 v255, vcc, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x37]
+v_mul_f64 v[5:6], |s[4:5]|, |s[4:5]|
+// CHECK: [0x05,0x03,0x81,0xd2,0x04,0x08,0x00,0x00]
 
-v_subrev_u32 v0, vcc, s101, v0
-// CHECK: [0x65,0x00,0x00,0x36]
+v_mul_f64 v[5:6], s[4:5], s[4:5] clamp
+// CHECK: [0x05,0x80,0x81,0xd2,0x04,0x08,0x00,0x00]
 
-v_subrev_u32 v0, vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x36]
+v_mul_f64 v[5:6], s[4:5], s[4:5] mul:2
+// CHECK: [0x05,0x00,0x81,0xd2,0x04,0x08,0x00,0x08]
 
-v_subrev_u32 v0, vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x36]
+v_mul_f64 v[5:6], s[4:5], s[4:5] mul:4
+// CHECK: [0x05,0x00,0x81,0xd2,0x04,0x08,0x00,0x10]
 
-v_subrev_u32 v0, vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x36]
+v_mul_f64 v[5:6], s[4:5], s[4:5] div:2
+// CHECK: [0x05,0x00,0x81,0xd2,0x04,0x08,0x00,0x18]
 
-v_subrev_u32 v0, vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x36]
+v_min_f64 v[5:6], s[4:5], s[4:5]
+// CHECK: [0x05,0x00,0x82,0xd2,0x04,0x08,0x00,0x00]
 
-v_subrev_u32 v0, vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x36]
+v_min_f64 v[254:255], s[4:5], s[4:5]
+// CHECK: [0xfe,0x00,0x82,0xd2,0x04,0x08,0x00,0x00]
 
-v_subrev_u32 v0, vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x36]
+v_min_f64 v[5:6], v[1:2], s[4:5]
+// CHECK: [0x05,0x00,0x82,0xd2,0x01,0x09,0x00,0x00]
 
-v_subrev_u32 v0, vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x36]
+v_min_f64 v[5:6], v[254:255], s[4:5]
+// CHECK: [0x05,0x00,0x82,0xd2,0xfe,0x09,0x00,0x00]
 
-v_subrev_u32 v0, vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x36]
+v_min_f64 v[5:6], s[4:5], v[2:3]
+// CHECK: [0x05,0x00,0x82,0xd2,0x04,0x04,0x02,0x00]
 
-v_subrev_u32 v0, vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x36]
+v_min_f64 v[5:6], s[4:5], v[254:255]
+// CHECK: [0x05,0x00,0x82,0xd2,0x04,0xfc,0x03,0x00]
 
-v_subrev_u32 v0, vcc, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x36]
+v_min_f64 v[5:6], -s[4:5], s[4:5]
+// CHECK: [0x05,0x00,0x82,0xd2,0x04,0x08,0x00,0x20]
 
-v_subrev_u32 v0, vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x36]
+v_min_f64 v[5:6], s[4:5], -s[4:5]
+// CHECK: [0x05,0x00,0x82,0xd2,0x04,0x08,0x00,0x40]
 
-v_subrev_u32 v0, vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x36]
+v_min_f64 v[5:6], -s[4:5], -s[4:5]
+// CHECK: [0x05,0x00,0x82,0xd2,0x04,0x08,0x00,0x60]
 
-v_subrev_u32 v0, vcc, 0, v0
-// CHECK: [0x80,0x00,0x00,0x36]
+v_min_f64 v[5:6], |s[4:5]|, s[4:5]
+// CHECK: [0x05,0x01,0x82,0xd2,0x04,0x08,0x00,0x00]
 
-v_subrev_u32 v0, vcc, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x36]
+v_min_f64 v[5:6], s[4:5], |s[4:5]|
+// CHECK: [0x05,0x02,0x82,0xd2,0x04,0x08,0x00,0x00]
 
-v_subrev_u32 v0, vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x36]
+v_min_f64 v[5:6], |s[4:5]|, |s[4:5]|
+// CHECK: [0x05,0x03,0x82,0xd2,0x04,0x08,0x00,0x00]
 
-v_subrev_u32 v0, vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x36]
+v_min_f64 v[5:6], s[4:5], s[4:5] clamp
+// CHECK: [0x05,0x80,0x82,0xd2,0x04,0x08,0x00,0x00]
 
-v_subrev_u32 v0, vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x00,0x36,0x56,0x34,0x12,0xaf]
+v_min_f64 v[5:6], s[4:5], s[4:5] mul:2
+// CHECK: [0x05,0x00,0x82,0xd2,0x04,0x08,0x00,0x08]
 
-v_subrev_u32 v0, vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x00,0x36,0x73,0x72,0x71,0x3f]
+v_min_f64 v[5:6], s[4:5], s[4:5] mul:4
+// CHECK: [0x05,0x00,0x82,0xd2,0x04,0x08,0x00,0x10]
 
-v_subrev_u32 v0, vcc, v0, v0
-// CHECK: [0x00,0x01,0x00,0x36]
+v_min_f64 v[5:6], s[4:5], s[4:5] div:2
+// CHECK: [0x05,0x00,0x82,0xd2,0x04,0x08,0x00,0x18]
 
-v_subrev_u32 v0, vcc, v255, v0
-// CHECK: [0xff,0x01,0x00,0x36]
+v_max_f64 v[5:6], s[4:5], s[4:5]
+// CHECK: [0x05,0x00,0x83,0xd2,0x04,0x08,0x00,0x00]
 
-v_subrev_u32 v0, vcc, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x36]
+v_max_f64 v[254:255], s[4:5], s[4:5]
+// CHECK: [0xfe,0x00,0x83,0xd2,0x04,0x08,0x00,0x00]
 
-v_subrev_u32_e64 v0, s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x1b,0xd1,0x00,0x00,0x00,0x00]
+v_max_f64 v[5:6], v[1:2], s[4:5]
+// CHECK: [0x05,0x00,0x83,0xd2,0x01,0x09,0x00,0x00]
 
-v_subrev_u32_e64 v255, s[0:1], s0, s0
-// CHECK: [0xff,0x00,0x1b,0xd1,0x00,0x00,0x00,0x00]
+v_max_f64 v[5:6], v[254:255], s[4:5]
+// CHECK: [0x05,0x00,0x83,0xd2,0xfe,0x09,0x00,0x00]
 
-v_subrev_u32_e64 v0, s[2:3], s0, s0
-// CHECK: [0x00,0x02,0x1b,0xd1,0x00,0x00,0x00,0x00]
+v_max_f64 v[5:6], s[4:5], v[2:3]
+// CHECK: [0x05,0x00,0x83,0xd2,0x04,0x04,0x02,0x00]
 
-v_subrev_u32_e64 v0, s[100:101], s0, s0
-// CHECK: [0x00,0x64,0x1b,0xd1,0x00,0x00,0x00,0x00]
+v_max_f64 v[5:6], s[4:5], v[254:255]
+// CHECK: [0x05,0x00,0x83,0xd2,0x04,0xfc,0x03,0x00]
 
-v_subrev_u32_e64 v0, flat_scratch, s0, s0
-// CHECK: [0x00,0x66,0x1b,0xd1,0x00,0x00,0x00,0x00]
+v_max_f64 v[5:6], -s[4:5], s[4:5]
+// CHECK: [0x05,0x00,0x83,0xd2,0x04,0x08,0x00,0x20]
 
-v_subrev_u32_e64 v0, vcc, s0, s0
-// CHECK: [0x00,0x6a,0x1b,0xd1,0x00,0x00,0x00,0x00]
+v_max_f64 v[5:6], s[4:5], -s[4:5]
+// CHECK: [0x05,0x00,0x83,0xd2,0x04,0x08,0x00,0x40]
 
-v_subrev_u32_e64 v0, tba, s0, s0
-// CHECK: [0x00,0x6c,0x1b,0xd1,0x00,0x00,0x00,0x00]
+v_max_f64 v[5:6], -s[4:5], -s[4:5]
+// CHECK: [0x05,0x00,0x83,0xd2,0x04,0x08,0x00,0x60]
 
-v_subrev_u32_e64 v0, tma, s0, s0
-// CHECK: [0x00,0x6e,0x1b,0xd1,0x00,0x00,0x00,0x00]
+v_max_f64 v[5:6], |s[4:5]|, s[4:5]
+// CHECK: [0x05,0x01,0x83,0xd2,0x04,0x08,0x00,0x00]
 
-v_subrev_u32_e64 v0, ttmp[10:11], s0, s0
-// CHECK: [0x00,0x7a,0x1b,0xd1,0x00,0x00,0x00,0x00]
+v_max_f64 v[5:6], s[4:5], |s[4:5]|
+// CHECK: [0x05,0x02,0x83,0xd2,0x04,0x08,0x00,0x00]
 
-v_subrev_u32_e64 v0, s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x1b,0xd1,0x80,0x00,0x00,0x00]
+v_max_f64 v[5:6], |s[4:5]|, |s[4:5]|
+// CHECK: [0x05,0x03,0x83,0xd2,0x04,0x08,0x00,0x00]
 
-v_subrev_u32_e64 v0, s[0:1], -1, s0
-// CHECK: [0x00,0x00,0x1b,0xd1,0xc1,0x00,0x00,0x00]
+v_max_f64 v[5:6], s[4:5], s[4:5] clamp
+// CHECK: [0x05,0x80,0x83,0xd2,0x04,0x08,0x00,0x00]
 
-v_subrev_u32_e64 v0, s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x1b,0xd1,0xf0,0x00,0x00,0x00]
+v_max_f64 v[5:6], s[4:5], s[4:5] mul:2
+// CHECK: [0x05,0x00,0x83,0xd2,0x04,0x08,0x00,0x08]
 
-v_subrev_u32_e64 v0, s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0x1b,0xd1,0xf7,0x00,0x00,0x00]
+v_max_f64 v[5:6], s[4:5], s[4:5] mul:4
+// CHECK: [0x05,0x00,0x83,0xd2,0x04,0x08,0x00,0x10]
 
-v_subrev_u32_e64 v0, s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x1b,0xd1,0x00,0x01,0x00,0x00]
+v_max_f64 v[5:6], s[4:5], s[4:5] div:2
+// CHECK: [0x05,0x00,0x83,0xd2,0x04,0x08,0x00,0x18]
 
-v_subrev_u32_e64 v0, s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x1b,0xd1,0xff,0x01,0x00,0x00]
+v_ldexp_f64 v[5:6], 0, s2
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0x04,0x00,0x00]
 
-v_subrev_u32_e64 v0, s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x1b,0xd1,0x00,0x00,0x01,0x00]
+v_ldexp_f64 v[254:255], 0, s2
+// CHECK: [0xfe,0x00,0x84,0xd2,0x80,0x04,0x00,0x00]
 
-v_subrev_u32_e64 v0, s[0:1], s0, -1
-// CHECK: [0x00,0x00,0x1b,0xd1,0x00,0x82,0x01,0x00]
+v_ldexp_f64 v[5:6], 0.5, s2
+// CHECK: [0x05,0x00,0x84,0xd2,0xf0,0x04,0x00,0x00]
 
-v_subrev_u32_e64 v0, s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x1b,0xd1,0x00,0xe0,0x01,0x00]
+v_ldexp_f64 v[5:6], v[1:2], s2
+// CHECK: [0x05,0x00,0x84,0xd2,0x01,0x05,0x00,0x00]
 
-v_subrev_u32_e64 v0, s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x1b,0xd1,0x00,0xee,0x01,0x00]
+v_ldexp_f64 v[5:6], v[254:255], s2
+// CHECK: [0x05,0x00,0x84,0xd2,0xfe,0x05,0x00,0x00]
 
-v_subrev_u32_e64 v0, s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x1b,0xd1,0x00,0x00,0x02,0x00]
+v_ldexp_f64 v[5:6], 0, s101
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0xca,0x00,0x00]
 
-v_subrev_u32_e64 v0, s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x1b,0xd1,0x00,0xfe,0x03,0x00]
+v_ldexp_f64 v[5:6], 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0xcc,0x00,0x00]
 
-v_addc_u32 v0, vcc, vcc_lo, v0, vcc
-// CHECK: [0x6a,0x00,0x00,0x38]
+v_ldexp_f64 v[5:6], 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0xce,0x00,0x00]
 
-v_addc_u32 v255, vcc, vcc_lo, v0, vcc
-// CHECK: [0x6a,0x00,0xfe,0x39]
+v_ldexp_f64 v[5:6], 0, vcc_lo
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0xd4,0x00,0x00]
 
-v_addc_u32 v0, vcc, vcc_hi, v0, vcc
-// CHECK: [0x6b,0x00,0x00,0x38]
+v_ldexp_f64 v[5:6], 0, vcc_hi
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0xd6,0x00,0x00]
 
-v_addc_u32 v0, vcc, 0, v0, vcc
-// CHECK: [0x80,0x00,0x00,0x38]
+v_ldexp_f64 v[5:6], 0, tba_lo
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0xd8,0x00,0x00]
 
-v_addc_u32 v0, vcc, -1, v0, vcc
-// CHECK: [0xc1,0x00,0x00,0x38]
+v_ldexp_f64 v[5:6], 0, tba_hi
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0xda,0x00,0x00]
 
-v_addc_u32 v0, vcc, 0.5, v0, vcc
-// CHECK: [0xf0,0x00,0x00,0x38]
+v_ldexp_f64 v[5:6], 0, tma_lo
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0xdc,0x00,0x00]
 
-v_addc_u32 v0, vcc, -4.0, v0, vcc
-// CHECK: [0xf7,0x00,0x00,0x38]
+v_ldexp_f64 v[5:6], 0, tma_hi
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0xde,0x00,0x00]
 
-v_addc_u32 v0, vcc, v0, v0, vcc
-// CHECK: [0x00,0x01,0x00,0x38]
+v_ldexp_f64 v[5:6], 0, ttmp11
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0xf6,0x00,0x00]
 
-v_addc_u32 v0, vcc, v255, v0, vcc
-// CHECK: [0xff,0x01,0x00,0x38]
+v_ldexp_f64 v[5:6], 0, m0
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0xf8,0x00,0x00]
 
-v_addc_u32 v0, vcc, vcc_lo, v255, vcc
-// CHECK: [0x6a,0xfe,0x01,0x38]
+v_ldexp_f64 v[5:6], 0, exec_lo
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0xfc,0x00,0x00]
 
-v_addc_u32_e64 v0, s[0:1], s0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1c,0xd1,0x00,0x00,0x00,0x00]
+v_ldexp_f64 v[5:6], 0, exec_hi
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0xfe,0x00,0x00]
 
-v_addc_u32_e64 v255, s[0:1], s0, s0, s[0:1]
-// CHECK: [0xff,0x00,0x1c,0xd1,0x00,0x00,0x00,0x00]
+v_ldexp_f64 v[5:6], 0, 0
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0x00,0x01,0x00]
 
-v_addc_u32_e64 v0, s[2:3], s0, s0, s[0:1]
-// CHECK: [0x00,0x02,0x1c,0xd1,0x00,0x00,0x00,0x00]
+v_ldexp_f64 v[5:6], 0, -1
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0x82,0x01,0x00]
 
-v_addc_u32_e64 v0, s[100:101], s0, s0, s[0:1]
-// CHECK: [0x00,0x64,0x1c,0xd1,0x00,0x00,0x00,0x00]
+v_ldexp_f64 v[5:6], 0, 0.5
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0xe0,0x01,0x00]
 
-v_addc_u32_e64 v0, flat_scratch, s0, s0, s[0:1]
-// CHECK: [0x00,0x66,0x1c,0xd1,0x00,0x00,0x00,0x00]
+v_ldexp_f64 v[5:6], 0, -4.0
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0xee,0x01,0x00]
 
-v_addc_u32_e64 v0, vcc, s0, s0, s[0:1]
-// CHECK: [0x00,0x6a,0x1c,0xd1,0x00,0x00,0x00,0x00]
+v_ldexp_f64 v[5:6], 0, scc
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0xfa,0x01,0x00]
 
-v_addc_u32_e64 v0, tba, s0, s0, s[0:1]
-// CHECK: [0x00,0x6c,0x1c,0xd1,0x00,0x00,0x00,0x00]
+v_ldexp_f64 v[5:6], 0, v2
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0x04,0x02,0x00]
 
-v_addc_u32_e64 v0, tma, s0, s0, s[0:1]
-// CHECK: [0x00,0x6e,0x1c,0xd1,0x00,0x00,0x00,0x00]
+v_ldexp_f64 v[5:6], 0, v255
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0xfe,0x03,0x00]
 
-v_addc_u32_e64 v0, ttmp[10:11], s0, s0, s[0:1]
-// CHECK: [0x00,0x7a,0x1c,0xd1,0x00,0x00,0x00,0x00]
+v_ldexp_f64 v[5:6], 0, s2 clamp
+// CHECK: [0x05,0x80,0x84,0xd2,0x80,0x04,0x00,0x00]
 
-v_addc_u32_e64 v0, s[0:1], 0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1c,0xd1,0x80,0x00,0x00,0x00]
+v_ldexp_f64 v[5:6], 0, s2 mul:2
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0x04,0x00,0x08]
 
-v_addc_u32_e64 v0, s[0:1], -1, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1c,0xd1,0xc1,0x00,0x00,0x00]
+v_ldexp_f64 v[5:6], 0, s2 mul:4
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0x04,0x00,0x10]
 
-v_addc_u32_e64 v0, s[0:1], 0.5, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1c,0xd1,0xf0,0x00,0x00,0x00]
+v_ldexp_f64 v[5:6], 0, s2 div:2
+// CHECK: [0x05,0x00,0x84,0xd2,0x80,0x04,0x00,0x18]
 
-v_addc_u32_e64 v0, s[0:1], -4.0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1c,0xd1,0xf7,0x00,0x00,0x00]
+v_mul_lo_u32 v5, 0, s2
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0x04,0x00,0x00]
 
-v_addc_u32_e64 v0, s[0:1], v0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1c,0xd1,0x00,0x01,0x00,0x00]
+v_mul_lo_u32 v255, 0, s2
+// CHECK: [0xff,0x00,0x85,0xd2,0x80,0x04,0x00,0x00]
 
-v_addc_u32_e64 v0, s[0:1], v255, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1c,0xd1,0xff,0x01,0x00,0x00]
+v_mul_lo_u32 v5, -1, s2
+// CHECK: [0x05,0x00,0x85,0xd2,0xc1,0x04,0x00,0x00]
 
-v_addc_u32_e64 v0, s[0:1], s0, 0, s[0:1]
-// CHECK: [0x00,0x00,0x1c,0xd1,0x00,0x00,0x01,0x00]
+v_mul_lo_u32 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x85,0xd2,0xf0,0x04,0x00,0x00]
 
-v_addc_u32_e64 v0, s[0:1], s0, -1, s[0:1]
-// CHECK: [0x00,0x00,0x1c,0xd1,0x00,0x82,0x01,0x00]
+v_mul_lo_u32 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x85,0xd2,0xf7,0x04,0x00,0x00]
 
-v_addc_u32_e64 v0, s[0:1], s0, 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x1c,0xd1,0x00,0xe0,0x01,0x00]
+v_mul_lo_u32 v5, v1, s2
+// CHECK: [0x05,0x00,0x85,0xd2,0x01,0x05,0x00,0x00]
 
-v_addc_u32_e64 v0, s[0:1], s0, -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x1c,0xd1,0x00,0xee,0x01,0x00]
+v_mul_lo_u32 v5, v255, s2
+// CHECK: [0x05,0x00,0x85,0xd2,0xff,0x05,0x00,0x00]
 
-v_addc_u32_e64 v0, s[0:1], s0, v0, s[0:1]
-// CHECK: [0x00,0x00,0x1c,0xd1,0x00,0x00,0x02,0x00]
+v_mul_lo_u32 v5, 0, s101
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0xca,0x00,0x00]
 
-v_addc_u32_e64 v0, s[0:1], s0, v255, s[0:1]
-// CHECK: [0x00,0x00,0x1c,0xd1,0x00,0xfe,0x03,0x00]
+v_mul_lo_u32 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0xcc,0x00,0x00]
 
-v_subb_u32 v0, vcc, vcc_lo, v0, vcc
-// CHECK: [0x6a,0x00,0x00,0x3a]
+v_mul_lo_u32 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0xce,0x00,0x00]
 
-v_subb_u32 v255, vcc, vcc_lo, v0, vcc
-// CHECK: [0x6a,0x00,0xfe,0x3b]
+v_mul_lo_u32 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0xd4,0x00,0x00]
 
-v_subb_u32 v0, vcc, vcc_hi, v0, vcc
-// CHECK: [0x6b,0x00,0x00,0x3a]
+v_mul_lo_u32 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0xd6,0x00,0x00]
 
-v_subb_u32 v0, vcc, 0, v0, vcc
-// CHECK: [0x80,0x00,0x00,0x3a]
+v_mul_lo_u32 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0xd8,0x00,0x00]
 
-v_subb_u32 v0, vcc, -1, v0, vcc
-// CHECK: [0xc1,0x00,0x00,0x3a]
+v_mul_lo_u32 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0xda,0x00,0x00]
 
-v_subb_u32 v0, vcc, 0.5, v0, vcc
-// CHECK: [0xf0,0x00,0x00,0x3a]
+v_mul_lo_u32 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0xdc,0x00,0x00]
 
-v_subb_u32 v0, vcc, -4.0, v0, vcc
-// CHECK: [0xf7,0x00,0x00,0x3a]
+v_mul_lo_u32 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0xde,0x00,0x00]
 
-v_subb_u32 v0, vcc, v0, v0, vcc
-// CHECK: [0x00,0x01,0x00,0x3a]
+v_mul_lo_u32 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0xf6,0x00,0x00]
 
-v_subb_u32 v0, vcc, v255, v0, vcc
-// CHECK: [0xff,0x01,0x00,0x3a]
+v_mul_lo_u32 v5, 0, m0
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0xf8,0x00,0x00]
 
-v_subb_u32 v0, vcc, vcc_lo, v255, vcc
-// CHECK: [0x6a,0xfe,0x01,0x3a]
+v_mul_lo_u32 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0xfc,0x00,0x00]
 
-v_subb_u32_e64 v0, s[0:1], s0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1d,0xd1,0x00,0x00,0x00,0x00]
+v_mul_lo_u32 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0xfe,0x00,0x00]
 
-v_subb_u32_e64 v255, s[0:1], s0, s0, s[0:1]
-// CHECK: [0xff,0x00,0x1d,0xd1,0x00,0x00,0x00,0x00]
+v_mul_lo_u32 v5, 0, 0
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0x00,0x01,0x00]
 
-v_subb_u32_e64 v0, s[2:3], s0, s0, s[0:1]
-// CHECK: [0x00,0x02,0x1d,0xd1,0x00,0x00,0x00,0x00]
+v_mul_lo_u32 v5, 0, -1
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0x82,0x01,0x00]
 
-v_subb_u32_e64 v0, s[100:101], s0, s0, s[0:1]
-// CHECK: [0x00,0x64,0x1d,0xd1,0x00,0x00,0x00,0x00]
+v_mul_lo_u32 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0xe0,0x01,0x00]
 
-v_subb_u32_e64 v0, flat_scratch, s0, s0, s[0:1]
-// CHECK: [0x00,0x66,0x1d,0xd1,0x00,0x00,0x00,0x00]
+v_mul_lo_u32 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0xee,0x01,0x00]
 
-v_subb_u32_e64 v0, vcc, s0, s0, s[0:1]
-// CHECK: [0x00,0x6a,0x1d,0xd1,0x00,0x00,0x00,0x00]
+v_mul_lo_u32 v5, 0, v2
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0x04,0x02,0x00]
 
-v_subb_u32_e64 v0, tba, s0, s0, s[0:1]
-// CHECK: [0x00,0x6c,0x1d,0xd1,0x00,0x00,0x00,0x00]
+v_mul_lo_u32 v5, 0, v255
+// CHECK: [0x05,0x00,0x85,0xd2,0x80,0xfe,0x03,0x00]
 
-v_subb_u32_e64 v0, tma, s0, s0, s[0:1]
-// CHECK: [0x00,0x6e,0x1d,0xd1,0x00,0x00,0x00,0x00]
+v_mul_hi_u32 v5, 0, s2
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0x04,0x00,0x00]
 
-v_subb_u32_e64 v0, ttmp[10:11], s0, s0, s[0:1]
-// CHECK: [0x00,0x7a,0x1d,0xd1,0x00,0x00,0x00,0x00]
+v_mul_hi_u32 v255, 0, s2
+// CHECK: [0xff,0x00,0x86,0xd2,0x80,0x04,0x00,0x00]
 
-v_subb_u32_e64 v0, s[0:1], 0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1d,0xd1,0x80,0x00,0x00,0x00]
+v_mul_hi_u32 v5, -1, s2
+// CHECK: [0x05,0x00,0x86,0xd2,0xc1,0x04,0x00,0x00]
 
-v_subb_u32_e64 v0, s[0:1], -1, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1d,0xd1,0xc1,0x00,0x00,0x00]
+v_mul_hi_u32 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x86,0xd2,0xf0,0x04,0x00,0x00]
 
-v_subb_u32_e64 v0, s[0:1], 0.5, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1d,0xd1,0xf0,0x00,0x00,0x00]
+v_mul_hi_u32 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x86,0xd2,0xf7,0x04,0x00,0x00]
 
-v_subb_u32_e64 v0, s[0:1], -4.0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1d,0xd1,0xf7,0x00,0x00,0x00]
+v_mul_hi_u32 v5, v1, s2
+// CHECK: [0x05,0x00,0x86,0xd2,0x01,0x05,0x00,0x00]
 
-v_subb_u32_e64 v0, s[0:1], v0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1d,0xd1,0x00,0x01,0x00,0x00]
+v_mul_hi_u32 v5, v255, s2
+// CHECK: [0x05,0x00,0x86,0xd2,0xff,0x05,0x00,0x00]
 
-v_subb_u32_e64 v0, s[0:1], v255, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1d,0xd1,0xff,0x01,0x00,0x00]
+v_mul_hi_u32 v5, 0, s101
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0xca,0x00,0x00]
 
-v_subb_u32_e64 v0, s[0:1], s0, 0, s[0:1]
-// CHECK: [0x00,0x00,0x1d,0xd1,0x00,0x00,0x01,0x00]
+v_mul_hi_u32 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0xcc,0x00,0x00]
 
-v_subb_u32_e64 v0, s[0:1], s0, -1, s[0:1]
-// CHECK: [0x00,0x00,0x1d,0xd1,0x00,0x82,0x01,0x00]
+v_mul_hi_u32 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0xce,0x00,0x00]
 
-v_subb_u32_e64 v0, s[0:1], s0, 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x1d,0xd1,0x00,0xe0,0x01,0x00]
+v_mul_hi_u32 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0xd4,0x00,0x00]
 
-v_subb_u32_e64 v0, s[0:1], s0, -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x1d,0xd1,0x00,0xee,0x01,0x00]
+v_mul_hi_u32 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0xd6,0x00,0x00]
 
-v_subb_u32_e64 v0, s[0:1], s0, v0, s[0:1]
-// CHECK: [0x00,0x00,0x1d,0xd1,0x00,0x00,0x02,0x00]
+v_mul_hi_u32 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0xd8,0x00,0x00]
 
-v_subb_u32_e64 v0, s[0:1], s0, v255, s[0:1]
-// CHECK: [0x00,0x00,0x1d,0xd1,0x00,0xfe,0x03,0x00]
+v_mul_hi_u32 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0xda,0x00,0x00]
 
-v_subbrev_u32 v0, vcc, vcc_lo, v0, vcc
-// CHECK: [0x6a,0x00,0x00,0x3c]
+v_mul_hi_u32 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0xdc,0x00,0x00]
 
-v_subbrev_u32 v255, vcc, vcc_lo, v0, vcc
-// CHECK: [0x6a,0x00,0xfe,0x3d]
+v_mul_hi_u32 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0xde,0x00,0x00]
 
-v_subbrev_u32 v0, vcc, vcc_hi, v0, vcc
-// CHECK: [0x6b,0x00,0x00,0x3c]
+v_mul_hi_u32 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0xf6,0x00,0x00]
 
-v_subbrev_u32 v0, vcc, 0, v0, vcc
-// CHECK: [0x80,0x00,0x00,0x3c]
+v_mul_hi_u32 v5, 0, m0
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0xf8,0x00,0x00]
 
-v_subbrev_u32 v0, vcc, -1, v0, vcc
-// CHECK: [0xc1,0x00,0x00,0x3c]
+v_mul_hi_u32 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0xfc,0x00,0x00]
 
-v_subbrev_u32 v0, vcc, 0.5, v0, vcc
-// CHECK: [0xf0,0x00,0x00,0x3c]
+v_mul_hi_u32 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0xfe,0x00,0x00]
 
-v_subbrev_u32 v0, vcc, -4.0, v0, vcc
-// CHECK: [0xf7,0x00,0x00,0x3c]
+v_mul_hi_u32 v5, 0, 0
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0x00,0x01,0x00]
 
-v_subbrev_u32 v0, vcc, v0, v0, vcc
-// CHECK: [0x00,0x01,0x00,0x3c]
+v_mul_hi_u32 v5, 0, -1
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0x82,0x01,0x00]
 
-v_subbrev_u32 v0, vcc, v255, v0, vcc
-// CHECK: [0xff,0x01,0x00,0x3c]
+v_mul_hi_u32 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0xe0,0x01,0x00]
 
-v_subbrev_u32 v0, vcc, vcc_lo, v255, vcc
-// CHECK: [0x6a,0xfe,0x01,0x3c]
+v_mul_hi_u32 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0xee,0x01,0x00]
 
-v_subbrev_u32_e64 v0, s[0:1], s0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1e,0xd1,0x00,0x00,0x00,0x00]
+v_mul_hi_u32 v5, 0, v2
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0x04,0x02,0x00]
 
-v_subbrev_u32_e64 v255, s[0:1], s0, s0, s[0:1]
-// CHECK: [0xff,0x00,0x1e,0xd1,0x00,0x00,0x00,0x00]
+v_mul_hi_u32 v5, 0, v255
+// CHECK: [0x05,0x00,0x86,0xd2,0x80,0xfe,0x03,0x00]
 
-v_subbrev_u32_e64 v0, s[2:3], s0, s0, s[0:1]
-// CHECK: [0x00,0x02,0x1e,0xd1,0x00,0x00,0x00,0x00]
+v_mul_hi_i32 v5, 0, s2
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0x04,0x00,0x00]
 
-v_subbrev_u32_e64 v0, s[100:101], s0, s0, s[0:1]
-// CHECK: [0x00,0x64,0x1e,0xd1,0x00,0x00,0x00,0x00]
+v_mul_hi_i32 v255, 0, s2
+// CHECK: [0xff,0x00,0x87,0xd2,0x80,0x04,0x00,0x00]
 
-v_subbrev_u32_e64 v0, flat_scratch, s0, s0, s[0:1]
-// CHECK: [0x00,0x66,0x1e,0xd1,0x00,0x00,0x00,0x00]
+v_mul_hi_i32 v5, -1, s2
+// CHECK: [0x05,0x00,0x87,0xd2,0xc1,0x04,0x00,0x00]
 
-v_subbrev_u32_e64 v0, vcc, s0, s0, s[0:1]
-// CHECK: [0x00,0x6a,0x1e,0xd1,0x00,0x00,0x00,0x00]
+v_mul_hi_i32 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x87,0xd2,0xf0,0x04,0x00,0x00]
 
-v_subbrev_u32_e64 v0, tba, s0, s0, s[0:1]
-// CHECK: [0x00,0x6c,0x1e,0xd1,0x00,0x00,0x00,0x00]
+v_mul_hi_i32 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x87,0xd2,0xf7,0x04,0x00,0x00]
 
-v_subbrev_u32_e64 v0, tma, s0, s0, s[0:1]
-// CHECK: [0x00,0x6e,0x1e,0xd1,0x00,0x00,0x00,0x00]
+v_mul_hi_i32 v5, v1, s2
+// CHECK: [0x05,0x00,0x87,0xd2,0x01,0x05,0x00,0x00]
 
-v_subbrev_u32_e64 v0, ttmp[10:11], s0, s0, s[0:1]
-// CHECK: [0x00,0x7a,0x1e,0xd1,0x00,0x00,0x00,0x00]
+v_mul_hi_i32 v5, v255, s2
+// CHECK: [0x05,0x00,0x87,0xd2,0xff,0x05,0x00,0x00]
 
-v_subbrev_u32_e64 v0, s[0:1], 0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1e,0xd1,0x80,0x00,0x00,0x00]
+v_mul_hi_i32 v5, 0, s101
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0xca,0x00,0x00]
 
-v_subbrev_u32_e64 v0, s[0:1], -1, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1e,0xd1,0xc1,0x00,0x00,0x00]
+v_mul_hi_i32 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0xcc,0x00,0x00]
 
-v_subbrev_u32_e64 v0, s[0:1], 0.5, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1e,0xd1,0xf0,0x00,0x00,0x00]
+v_mul_hi_i32 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0xce,0x00,0x00]
 
-v_subbrev_u32_e64 v0, s[0:1], -4.0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1e,0xd1,0xf7,0x00,0x00,0x00]
+v_mul_hi_i32 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0xd4,0x00,0x00]
 
-v_subbrev_u32_e64 v0, s[0:1], v0, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1e,0xd1,0x00,0x01,0x00,0x00]
+v_mul_hi_i32 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0xd6,0x00,0x00]
 
-v_subbrev_u32_e64 v0, s[0:1], v255, s0, s[0:1]
-// CHECK: [0x00,0x00,0x1e,0xd1,0xff,0x01,0x00,0x00]
+v_mul_hi_i32 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0xd8,0x00,0x00]
 
-v_subbrev_u32_e64 v0, s[0:1], s0, 0, s[0:1]
-// CHECK: [0x00,0x00,0x1e,0xd1,0x00,0x00,0x01,0x00]
+v_mul_hi_i32 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0xda,0x00,0x00]
 
-v_subbrev_u32_e64 v0, s[0:1], s0, -1, s[0:1]
-// CHECK: [0x00,0x00,0x1e,0xd1,0x00,0x82,0x01,0x00]
+v_mul_hi_i32 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0xdc,0x00,0x00]
 
-v_subbrev_u32_e64 v0, s[0:1], s0, 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x1e,0xd1,0x00,0xe0,0x01,0x00]
+v_mul_hi_i32 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0xde,0x00,0x00]
 
-v_subbrev_u32_e64 v0, s[0:1], s0, -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x1e,0xd1,0x00,0xee,0x01,0x00]
+v_mul_hi_i32 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0xf6,0x00,0x00]
 
-v_subbrev_u32_e64 v0, s[0:1], s0, v0, s[0:1]
-// CHECK: [0x00,0x00,0x1e,0xd1,0x00,0x00,0x02,0x00]
+v_mul_hi_i32 v5, 0, m0
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0xf8,0x00,0x00]
 
-v_subbrev_u32_e64 v0, s[0:1], s0, v255, s[0:1]
-// CHECK: [0x00,0x00,0x1e,0xd1,0x00,0xfe,0x03,0x00]
+v_mul_hi_i32 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0xfc,0x00,0x00]
 
-v_add_f16 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x3e]
+v_mul_hi_i32 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0xfe,0x00,0x00]
 
-v_add_f16 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x3f]
+v_mul_hi_i32 v5, 0, 0
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0x00,0x01,0x00]
 
-v_add_f16 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x3e]
+v_mul_hi_i32 v5, 0, -1
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0x82,0x01,0x00]
 
-v_add_f16 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x3e]
+v_mul_hi_i32 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0xe0,0x01,0x00]
 
-v_add_f16 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x3e]
+v_mul_hi_i32 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0xee,0x01,0x00]
 
-v_add_f16 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x3e]
+v_mul_hi_i32 v5, 0, v2
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0x04,0x02,0x00]
 
-v_add_f16 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x3e]
+v_mul_hi_i32 v5, 0, v255
+// CHECK: [0x05,0x00,0x87,0xd2,0x80,0xfe,0x03,0x00]
 
-v_add_f16 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x3e]
+v_ldexp_f32 v5, v1, s2
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0x05,0x00,0x00]
 
-v_add_f16 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x3e]
+v_ldexp_f32 v255, v1, s2
+// CHECK: [0xff,0x00,0x88,0xd2,0x01,0x05,0x00,0x00]
 
-v_add_f16 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x3e]
+v_ldexp_f32 v5, v255, s2
+// CHECK: [0x05,0x00,0x88,0xd2,0xff,0x05,0x00,0x00]
 
-v_add_f16 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x3e]
+v_ldexp_f32 v5, v1, s101
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0xcb,0x00,0x00]
 
-v_add_f16 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x3e]
+v_ldexp_f32 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0xcd,0x00,0x00]
 
-v_add_f16 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x3e]
+v_ldexp_f32 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0xcf,0x00,0x00]
 
-v_add_f16 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x3e]
+v_ldexp_f32 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0xd5,0x00,0x00]
 
-v_add_f16 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x3e]
+v_ldexp_f32 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0xd7,0x00,0x00]
 
-v_add_f16 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x3e]
+v_ldexp_f32 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0xd9,0x00,0x00]
 
-v_add_f16 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x3e]
+v_ldexp_f32 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0xdb,0x00,0x00]
 
-v_add_f16 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x3e]
+v_ldexp_f32 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0xdd,0x00,0x00]
 
-v_add_f16 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x3e]
+v_ldexp_f32 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0xdf,0x00,0x00]
 
-v_add_f16 v0, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x00,0x3e,0x0b,0xfe,0x00,0x00]
+v_ldexp_f32 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0xf7,0x00,0x00]
 
-v_add_f16 v0, 0x3456, v0
-// CHECK: [0xff,0x00,0x00,0x3e,0x56,0x34,0x00,0x00]
+v_ldexp_f32 v5, v1, m0
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0xf9,0x00,0x00]
 
-v_add_f16 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x3e]
+v_ldexp_f32 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0xfd,0x00,0x00]
 
-v_add_f16 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x3e]
+v_ldexp_f32 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0xff,0x00,0x00]
 
-v_add_f16 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x3e]
+v_ldexp_f32 v5, v1, 0
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0x01,0x01,0x00]
 
-v_add_f16_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x1f,0xd1,0x00,0x00,0x00,0x00]
+v_ldexp_f32 v5, v1, -1
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0x83,0x01,0x00]
 
-v_add_f16_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x1f,0xd1,0x00,0x00,0x00,0x00]
+v_ldexp_f32 v5, v1, 0.5
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0xe1,0x01,0x00]
 
-v_add_f16_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x1f,0xd1,0xfd,0x00,0x00,0x00]
+v_ldexp_f32 v5, v1, -4.0
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0xef,0x01,0x00]
 
-v_add_f16_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x1f,0xd1,0x00,0x01,0x00,0x00]
+v_ldexp_f32 v5, v1, scc
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0xfb,0x01,0x00]
 
-v_add_f16_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x1f,0xd1,0xff,0x01,0x00,0x00]
+v_ldexp_f32 v5, v1, v2
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0x05,0x02,0x00]
 
-v_add_f16_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x1f,0xd1,0x00,0xfa,0x01,0x00]
+v_ldexp_f32 v5, v1, v255
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0xff,0x03,0x00]
 
-v_add_f16_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x1f,0xd1,0x00,0x00,0x02,0x00]
+v_ldexp_f32 v5, -v1, s2
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0x05,0x00,0x20]
 
-v_add_f16_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x1f,0xd1,0x00,0xfe,0x03,0x00]
+v_ldexp_f32 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x88,0xd2,0x01,0x05,0x00,0x00]
 
-v_add_f16_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x1f,0xd1,0x00,0x00,0x00,0x20]
+v_ldexp_f32 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x88,0xd2,0x01,0x05,0x00,0x00]
 
-v_add_f16_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x1f,0xd1,0x00,0x00,0x00,0x40]
+v_ldexp_f32 v5, v1, s2 mul:2
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0x05,0x00,0x08]
 
-v_add_f16_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x1f,0xd1,0x00,0x00,0x00,0x60]
+v_ldexp_f32 v5, v1, s2 mul:4
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0x05,0x00,0x10]
 
-v_add_f16_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x1f,0xd1,0x00,0x00,0x00,0x00]
+v_ldexp_f32 v5, v1, s2 div:2
+// CHECK: [0x05,0x00,0x88,0xd2,0x01,0x05,0x00,0x18]
 
-v_add_f16_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x1f,0xd1,0x00,0x00,0x00,0x00]
+v_readlane_b32 s5, v1, s2
+// CHECK: [0x05,0x00,0x89,0xd2,0x01,0x05,0x00,0x00]
 
-v_add_f16_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x1f,0xd1,0x00,0x00,0x00,0x00]
+v_readlane_b32 s101, v1, s2
+// CHECK: [0x65,0x00,0x89,0xd2,0x01,0x05,0x00,0x00]
 
-v_add_f16_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x1f,0xd1,0x00,0x00,0x00,0x00]
+v_readlane_b32 flat_scratch_lo, v1, s2
+// CHECK: [0x66,0x00,0x89,0xd2,0x01,0x05,0x00,0x00]
 
-v_sub_f16 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x40]
+v_readlane_b32 flat_scratch_hi, v1, s2
+// CHECK: [0x67,0x00,0x89,0xd2,0x01,0x05,0x00,0x00]
 
-v_sub_f16 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x41]
+v_readlane_b32 tba_lo, v1, s2
+// CHECK: [0x6c,0x00,0x89,0xd2,0x01,0x05,0x00,0x00]
 
-v_sub_f16 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x40]
+v_readlane_b32 tba_hi, v1, s2
+// CHECK: [0x6d,0x00,0x89,0xd2,0x01,0x05,0x00,0x00]
 
-v_sub_f16 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x40]
+v_readlane_b32 tma_lo, v1, s2
+// CHECK: [0x6e,0x00,0x89,0xd2,0x01,0x05,0x00,0x00]
 
-v_sub_f16 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x40]
+v_readlane_b32 tma_hi, v1, s2
+// CHECK: [0x6f,0x00,0x89,0xd2,0x01,0x05,0x00,0x00]
 
-v_sub_f16 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x40]
+v_readlane_b32 ttmp11, v1, s2
+// CHECK: [0x7b,0x00,0x89,0xd2,0x01,0x05,0x00,0x00]
 
-v_sub_f16 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x40]
+v_readlane_b32 s5, v255, s2
+// CHECK: [0x05,0x00,0x89,0xd2,0xff,0x05,0x00,0x00]
 
-v_sub_f16 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x40]
+v_readlane_b32 s5, v1, s101
+// CHECK: [0x05,0x00,0x89,0xd2,0x01,0xcb,0x00,0x00]
 
-v_sub_f16 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x40]
+v_readlane_b32 s5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x89,0xd2,0x01,0xcd,0x00,0x00]
 
-v_sub_f16 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x40]
+v_readlane_b32 s5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x89,0xd2,0x01,0xcf,0x00,0x00]
 
-v_sub_f16 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x40]
+v_readlane_b32 s5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x89,0xd2,0x01,0xd5,0x00,0x00]
 
-v_sub_f16 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x40]
+v_readlane_b32 s5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x89,0xd2,0x01,0xd7,0x00,0x00]
 
-v_sub_f16 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x40]
+v_readlane_b32 s5, v1, tba_lo
+// CHECK: [0x05,0x00,0x89,0xd2,0x01,0xd9,0x00,0x00]
 
-v_sub_f16 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x40]
+v_readlane_b32 s5, v1, tba_hi
+// CHECK: [0x05,0x00,0x89,0xd2,0x01,0xdb,0x00,0x00]
 
-v_sub_f16 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x40]
+v_readlane_b32 s5, v1, tma_lo
+// CHECK: [0x05,0x00,0x89,0xd2,0x01,0xdd,0x00,0x00]
 
-v_sub_f16 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x40]
+v_readlane_b32 s5, v1, tma_hi
+// CHECK: [0x05,0x00,0x89,0xd2,0x01,0xdf,0x00,0x00]
 
-v_sub_f16 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x40]
+v_readlane_b32 s5, v1, ttmp11
+// CHECK: [0x05,0x00,0x89,0xd2,0x01,0xf7,0x00,0x00]
 
-v_sub_f16 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x40]
+v_readlane_b32 s5, v1, m0
+// CHECK: [0x05,0x00,0x89,0xd2,0x01,0xf9,0x00,0x00]
 
-v_sub_f16 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x40]
+v_readlane_b32 s5, v1, 0
+// CHECK: [0x05,0x00,0x89,0xd2,0x01,0x01,0x01,0x00]
 
-v_sub_f16 v0, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x00,0x40,0x0b,0xfe,0x00,0x00]
+v_writelane_b32 v5, s1, 0
+// CHECK: [0x05,0x00,0x8a,0xd2,0x01,0x00,0x01,0x00]
 
-v_sub_f16 v0, 0x3456, v0
-// CHECK: [0xff,0x00,0x00,0x40,0x56,0x34,0x00,0x00]
+v_writelane_b32 v255, s1, 0
+// CHECK: [0xff,0x00,0x8a,0xd2,0x01,0x00,0x01,0x00]
 
-v_sub_f16 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x40]
+v_writelane_b32 v5, s101, 0
+// CHECK: [0x05,0x00,0x8a,0xd2,0x65,0x00,0x01,0x00]
 
-v_sub_f16 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x40]
+v_writelane_b32 v5, flat_scratch_lo, 0
+// CHECK: [0x05,0x00,0x8a,0xd2,0x66,0x00,0x01,0x00]
 
-v_sub_f16 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x40]
+v_writelane_b32 v5, flat_scratch_hi, 0
+// CHECK: [0x05,0x00,0x8a,0xd2,0x67,0x00,0x01,0x00]
 
-v_sub_f16_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x20,0xd1,0x00,0x00,0x00,0x00]
+v_writelane_b32 v5, vcc_lo, 0
+// CHECK: [0x05,0x00,0x8a,0xd2,0x6a,0x00,0x01,0x00]
 
-v_sub_f16_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x20,0xd1,0x00,0x00,0x00,0x00]
+v_writelane_b32 v5, vcc_hi, 0
+// CHECK: [0x05,0x00,0x8a,0xd2,0x6b,0x00,0x01,0x00]
 
-v_sub_f16_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x20,0xd1,0xfd,0x00,0x00,0x00]
+v_writelane_b32 v5, tba_lo, 0
+// CHECK: [0x05,0x00,0x8a,0xd2,0x6c,0x00,0x01,0x00]
 
-v_sub_f16_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x20,0xd1,0x00,0x01,0x00,0x00]
+v_writelane_b32 v5, tba_hi, 0
+// CHECK: [0x05,0x00,0x8a,0xd2,0x6d,0x00,0x01,0x00]
 
-v_sub_f16_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x20,0xd1,0xff,0x01,0x00,0x00]
+v_writelane_b32 v5, tma_lo, 0
+// CHECK: [0x05,0x00,0x8a,0xd2,0x6e,0x00,0x01,0x00]
 
-v_sub_f16_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x20,0xd1,0x00,0xfa,0x01,0x00]
+v_writelane_b32 v5, tma_hi, 0
+// CHECK: [0x05,0x00,0x8a,0xd2,0x6f,0x00,0x01,0x00]
 
-v_sub_f16_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x20,0xd1,0x00,0x00,0x02,0x00]
+v_writelane_b32 v5, ttmp11, 0
+// CHECK: [0x05,0x00,0x8a,0xd2,0x7b,0x00,0x01,0x00]
 
-v_sub_f16_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x20,0xd1,0x00,0xfe,0x03,0x00]
+v_writelane_b32 v5, m0, 0
+// CHECK: [0x05,0x00,0x8a,0xd2,0x7c,0x00,0x01,0x00]
 
-v_sub_f16_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x20,0xd1,0x00,0x00,0x00,0x20]
+v_writelane_b32 v5, exec_lo, 0
+// CHECK: [0x05,0x00,0x8a,0xd2,0x7e,0x00,0x01,0x00]
 
-v_sub_f16_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x20,0xd1,0x00,0x00,0x00,0x40]
+v_writelane_b32 v5, exec_hi, 0
+// CHECK: [0x05,0x00,0x8a,0xd2,0x7f,0x00,0x01,0x00]
 
-v_sub_f16_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x20,0xd1,0x00,0x00,0x00,0x60]
+v_bcnt_u32_b32 v5, 0, s2
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0x04,0x00,0x00]
 
-v_sub_f16_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x20,0xd1,0x00,0x00,0x00,0x00]
+v_bcnt_u32_b32 v255, 0, s2
+// CHECK: [0xff,0x00,0x8b,0xd2,0x80,0x04,0x00,0x00]
 
-v_sub_f16_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x20,0xd1,0x00,0x00,0x00,0x00]
+v_bcnt_u32_b32 v5, -1, s2
+// CHECK: [0x05,0x00,0x8b,0xd2,0xc1,0x04,0x00,0x00]
 
-v_sub_f16_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x20,0xd1,0x00,0x00,0x00,0x00]
+v_bcnt_u32_b32 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x8b,0xd2,0xf0,0x04,0x00,0x00]
 
-v_sub_f16_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x20,0xd1,0x00,0x00,0x00,0x00]
+v_bcnt_u32_b32 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x8b,0xd2,0xf7,0x04,0x00,0x00]
 
-v_subrev_f16 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x42]
+v_bcnt_u32_b32 v5, v1, s2
+// CHECK: [0x05,0x00,0x8b,0xd2,0x01,0x05,0x00,0x00]
 
-v_subrev_f16 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x43]
+v_bcnt_u32_b32 v5, v255, s2
+// CHECK: [0x05,0x00,0x8b,0xd2,0xff,0x05,0x00,0x00]
 
-v_subrev_f16 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x42]
+v_bcnt_u32_b32 v5, 0, s101
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0xca,0x00,0x00]
 
-v_subrev_f16 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x42]
+v_bcnt_u32_b32 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0xcc,0x00,0x00]
 
-v_subrev_f16 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x42]
+v_bcnt_u32_b32 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0xce,0x00,0x00]
 
-v_subrev_f16 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x42]
+v_bcnt_u32_b32 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0xd4,0x00,0x00]
 
-v_subrev_f16 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x42]
+v_bcnt_u32_b32 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0xd6,0x00,0x00]
 
-v_subrev_f16 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x42]
+v_bcnt_u32_b32 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0xd8,0x00,0x00]
 
-v_subrev_f16 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x42]
+v_bcnt_u32_b32 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0xda,0x00,0x00]
 
-v_subrev_f16 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x42]
+v_bcnt_u32_b32 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0xdc,0x00,0x00]
 
-v_subrev_f16 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x42]
+v_bcnt_u32_b32 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0xde,0x00,0x00]
 
-v_subrev_f16 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x42]
+v_bcnt_u32_b32 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0xf6,0x00,0x00]
 
-v_subrev_f16 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x42]
+v_bcnt_u32_b32 v5, 0, m0
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0xf8,0x00,0x00]
 
-v_subrev_f16 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x42]
+v_bcnt_u32_b32 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0xfc,0x00,0x00]
 
-v_subrev_f16 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x42]
+v_bcnt_u32_b32 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0xfe,0x00,0x00]
 
-v_subrev_f16 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x42]
+v_bcnt_u32_b32 v5, 0, 0
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0x00,0x01,0x00]
 
-v_subrev_f16 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x42]
+v_bcnt_u32_b32 v5, 0, -1
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0x82,0x01,0x00]
 
-v_subrev_f16 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x42]
+v_bcnt_u32_b32 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0xe0,0x01,0x00]
 
-v_subrev_f16 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x42]
+v_bcnt_u32_b32 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0xee,0x01,0x00]
 
-v_subrev_f16 v0, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x00,0x42,0x0b,0xfe,0x00,0x00]
+v_bcnt_u32_b32 v5, 0, v2
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0x04,0x02,0x00]
 
-v_subrev_f16 v0, 0x3456, v0
-// CHECK: [0xff,0x00,0x00,0x42,0x56,0x34,0x00,0x00]
+v_bcnt_u32_b32 v5, 0, v255
+// CHECK: [0x05,0x00,0x8b,0xd2,0x80,0xfe,0x03,0x00]
 
-v_subrev_f16 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x42]
+v_mbcnt_lo_u32_b32 v5, 0, s2
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0x04,0x00,0x00]
 
-v_subrev_f16 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x42]
+v_mbcnt_lo_u32_b32 v255, 0, s2
+// CHECK: [0xff,0x00,0x8c,0xd2,0x80,0x04,0x00,0x00]
 
-v_subrev_f16 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x42]
+v_mbcnt_lo_u32_b32 v5, -1, s2
+// CHECK: [0x05,0x00,0x8c,0xd2,0xc1,0x04,0x00,0x00]
 
-v_subrev_f16_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x21,0xd1,0x00,0x00,0x00,0x00]
+v_mbcnt_lo_u32_b32 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x8c,0xd2,0xf0,0x04,0x00,0x00]
 
-v_subrev_f16_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x21,0xd1,0x00,0x00,0x00,0x00]
+v_mbcnt_lo_u32_b32 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x8c,0xd2,0xf7,0x04,0x00,0x00]
 
-v_subrev_f16_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x21,0xd1,0xfd,0x00,0x00,0x00]
+v_mbcnt_lo_u32_b32 v5, v1, s2
+// CHECK: [0x05,0x00,0x8c,0xd2,0x01,0x05,0x00,0x00]
 
-v_subrev_f16_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x21,0xd1,0x00,0x01,0x00,0x00]
+v_mbcnt_lo_u32_b32 v5, v255, s2
+// CHECK: [0x05,0x00,0x8c,0xd2,0xff,0x05,0x00,0x00]
 
-v_subrev_f16_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x21,0xd1,0xff,0x01,0x00,0x00]
+v_mbcnt_lo_u32_b32 v5, 0, s101
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0xca,0x00,0x00]
 
-v_subrev_f16_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x21,0xd1,0x00,0xfa,0x01,0x00]
+v_mbcnt_lo_u32_b32 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0xcc,0x00,0x00]
 
-v_subrev_f16_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x21,0xd1,0x00,0x00,0x02,0x00]
+v_mbcnt_lo_u32_b32 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0xce,0x00,0x00]
 
-v_subrev_f16_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x21,0xd1,0x00,0xfe,0x03,0x00]
+v_mbcnt_lo_u32_b32 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0xd4,0x00,0x00]
 
-v_subrev_f16_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x21,0xd1,0x00,0x00,0x00,0x20]
+v_mbcnt_lo_u32_b32 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0xd6,0x00,0x00]
 
-v_subrev_f16_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x21,0xd1,0x00,0x00,0x00,0x40]
+v_mbcnt_lo_u32_b32 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0xd8,0x00,0x00]
 
-v_subrev_f16_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x21,0xd1,0x00,0x00,0x00,0x60]
+v_mbcnt_lo_u32_b32 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0xda,0x00,0x00]
 
-v_subrev_f16_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x21,0xd1,0x00,0x00,0x00,0x00]
+v_mbcnt_lo_u32_b32 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0xdc,0x00,0x00]
 
-v_subrev_f16_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x21,0xd1,0x00,0x00,0x00,0x00]
+v_mbcnt_lo_u32_b32 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0xde,0x00,0x00]
 
-v_subrev_f16_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x21,0xd1,0x00,0x00,0x00,0x00]
+v_mbcnt_lo_u32_b32 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0xf6,0x00,0x00]
 
-v_subrev_f16_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x21,0xd1,0x00,0x00,0x00,0x00]
+v_mbcnt_lo_u32_b32 v5, 0, m0
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0xf8,0x00,0x00]
 
-v_mul_f16 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x44]
+v_mbcnt_lo_u32_b32 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0xfc,0x00,0x00]
 
-v_mul_f16 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x45]
+v_mbcnt_lo_u32_b32 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0xfe,0x00,0x00]
 
-v_mul_f16 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x44]
+v_mbcnt_lo_u32_b32 v5, 0, 0
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0x00,0x01,0x00]
 
-v_mul_f16 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x44]
+v_mbcnt_lo_u32_b32 v5, 0, -1
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0x82,0x01,0x00]
 
-v_mul_f16 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x44]
+v_mbcnt_lo_u32_b32 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0xe0,0x01,0x00]
 
-v_mul_f16 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x44]
+v_mbcnt_lo_u32_b32 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0xee,0x01,0x00]
 
-v_mul_f16 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x44]
+v_mbcnt_lo_u32_b32 v5, 0, v2
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0x04,0x02,0x00]
 
-v_mul_f16 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x44]
+v_mbcnt_lo_u32_b32 v5, 0, v255
+// CHECK: [0x05,0x00,0x8c,0xd2,0x80,0xfe,0x03,0x00]
 
-v_mul_f16 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x44]
+v_mbcnt_hi_u32_b32 v5, 0, s2
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0x04,0x00,0x00]
 
-v_mul_f16 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x44]
+v_mbcnt_hi_u32_b32 v255, 0, s2
+// CHECK: [0xff,0x00,0x8d,0xd2,0x80,0x04,0x00,0x00]
 
-v_mul_f16 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x44]
+v_mbcnt_hi_u32_b32 v5, -1, s2
+// CHECK: [0x05,0x00,0x8d,0xd2,0xc1,0x04,0x00,0x00]
 
-v_mul_f16 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x44]
+v_mbcnt_hi_u32_b32 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x8d,0xd2,0xf0,0x04,0x00,0x00]
 
-v_mul_f16 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x44]
+v_mbcnt_hi_u32_b32 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x8d,0xd2,0xf7,0x04,0x00,0x00]
 
-v_mul_f16 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x44]
+v_mbcnt_hi_u32_b32 v5, v1, s2
+// CHECK: [0x05,0x00,0x8d,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_f16 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x44]
+v_mbcnt_hi_u32_b32 v5, v255, s2
+// CHECK: [0x05,0x00,0x8d,0xd2,0xff,0x05,0x00,0x00]
 
-v_mul_f16 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x44]
+v_mbcnt_hi_u32_b32 v5, 0, s101
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0xca,0x00,0x00]
 
-v_mul_f16 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x44]
+v_mbcnt_hi_u32_b32 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0xcc,0x00,0x00]
 
-v_mul_f16 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x44]
+v_mbcnt_hi_u32_b32 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0xce,0x00,0x00]
 
-v_mul_f16 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x44]
+v_mbcnt_hi_u32_b32 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0xd4,0x00,0x00]
 
-v_mul_f16 v0, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x00,0x44,0x0b,0xfe,0x00,0x00]
+v_mbcnt_hi_u32_b32 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0xd6,0x00,0x00]
 
-v_mul_f16 v0, 0x3456, v0
-// CHECK: [0xff,0x00,0x00,0x44,0x56,0x34,0x00,0x00]
+v_mbcnt_hi_u32_b32 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0xd8,0x00,0x00]
 
-v_mul_f16 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x44]
+v_mbcnt_hi_u32_b32 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0xda,0x00,0x00]
 
-v_mul_f16 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x44]
+v_mbcnt_hi_u32_b32 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0xdc,0x00,0x00]
 
-v_mul_f16 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x44]
+v_mbcnt_hi_u32_b32 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0xde,0x00,0x00]
 
-v_mul_f16_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x22,0xd1,0x00,0x00,0x00,0x00]
+v_mbcnt_hi_u32_b32 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0xf6,0x00,0x00]
 
-v_mul_f16_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x22,0xd1,0x00,0x00,0x00,0x00]
+v_mbcnt_hi_u32_b32 v5, 0, m0
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0xf8,0x00,0x00]
 
-v_mul_f16_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x22,0xd1,0xfd,0x00,0x00,0x00]
+v_mbcnt_hi_u32_b32 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0xfc,0x00,0x00]
 
-v_mul_f16_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x22,0xd1,0x00,0x01,0x00,0x00]
+v_mbcnt_hi_u32_b32 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0xfe,0x00,0x00]
 
-v_mul_f16_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x22,0xd1,0xff,0x01,0x00,0x00]
+v_mbcnt_hi_u32_b32 v5, 0, 0
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0x00,0x01,0x00]
 
-v_mul_f16_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x22,0xd1,0x00,0xfa,0x01,0x00]
+v_mbcnt_hi_u32_b32 v5, 0, -1
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0x82,0x01,0x00]
 
-v_mul_f16_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x22,0xd1,0x00,0x00,0x02,0x00]
+v_mbcnt_hi_u32_b32 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0xe0,0x01,0x00]
 
-v_mul_f16_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x22,0xd1,0x00,0xfe,0x03,0x00]
+v_mbcnt_hi_u32_b32 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0xee,0x01,0x00]
 
-v_mul_f16_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x22,0xd1,0x00,0x00,0x00,0x20]
+v_mbcnt_hi_u32_b32 v5, 0, v2
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0x04,0x02,0x00]
 
-v_mul_f16_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x22,0xd1,0x00,0x00,0x00,0x40]
+v_mbcnt_hi_u32_b32 v5, 0, v255
+// CHECK: [0x05,0x00,0x8d,0xd2,0x80,0xfe,0x03,0x00]
 
-v_mul_f16_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x22,0xd1,0x00,0x00,0x00,0x60]
+v_lshlrev_b64 v[5:6], 0, s[4:5]
+// CHECK: [0x05,0x00,0x8f,0xd2,0x80,0x08,0x00,0x00]
 
-v_mul_f16_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x22,0xd1,0x00,0x00,0x00,0x00]
+v_lshlrev_b64 v[254:255], 0, s[4:5]
+// CHECK: [0xfe,0x00,0x8f,0xd2,0x80,0x08,0x00,0x00]
 
-v_mul_f16_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x22,0xd1,0x00,0x00,0x00,0x00]
+v_lshlrev_b64 v[5:6], -1, s[4:5]
+// CHECK: [0x05,0x00,0x8f,0xd2,0xc1,0x08,0x00,0x00]
 
-v_mul_f16_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x22,0xd1,0x00,0x00,0x00,0x00]
+v_lshlrev_b64 v[5:6], 0.5, s[4:5]
+// CHECK: [0x05,0x00,0x8f,0xd2,0xf0,0x08,0x00,0x00]
 
-v_mul_f16_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x22,0xd1,0x00,0x00,0x00,0x00]
+v_lshlrev_b64 v[5:6], -4.0, s[4:5]
+// CHECK: [0x05,0x00,0x8f,0xd2,0xf7,0x08,0x00,0x00]
 
-v_mac_f16 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x46]
+v_lshlrev_b64 v[5:6], v1, s[4:5]
+// CHECK: [0x05,0x00,0x8f,0xd2,0x01,0x09,0x00,0x00]
 
-v_mac_f16 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x47]
+v_lshlrev_b64 v[5:6], v255, s[4:5]
+// CHECK: [0x05,0x00,0x8f,0xd2,0xff,0x09,0x00,0x00]
 
-v_mac_f16 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x46]
+v_lshlrev_b64 v[5:6], 0, s[6:7]
+// CHECK: [0x05,0x00,0x8f,0xd2,0x80,0x0c,0x00,0x00]
 
-v_mac_f16 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x46]
+v_lshlrev_b64 v[5:6], 0, s[100:101]
+// CHECK: [0x05,0x00,0x8f,0xd2,0x80,0xc8,0x00,0x00]
 
-v_mac_f16 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x46]
+v_lshlrev_b64 v[5:6], 0, flat_scratch
+// CHECK: [0x05,0x00,0x8f,0xd2,0x80,0xcc,0x00,0x00]
 
-v_mac_f16 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x46]
+v_lshlrev_b64 v[5:6], 0, vcc
+// CHECK: [0x05,0x00,0x8f,0xd2,0x80,0xd4,0x00,0x00]
 
-v_mac_f16 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x46]
+v_lshlrev_b64 v[5:6], 0, tba
+// CHECK: [0x05,0x00,0x8f,0xd2,0x80,0xd8,0x00,0x00]
 
-v_mac_f16 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x46]
+v_lshlrev_b64 v[5:6], 0, tma
+// CHECK: [0x05,0x00,0x8f,0xd2,0x80,0xdc,0x00,0x00]
 
-v_mac_f16 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x46]
+v_lshlrev_b64 v[5:6], 0, ttmp[10:11]
+// CHECK: [0x05,0x00,0x8f,0xd2,0x80,0xf4,0x00,0x00]
 
-v_mac_f16 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x46]
+v_lshlrev_b64 v[5:6], 0, exec
+// CHECK: [0x05,0x00,0x8f,0xd2,0x80,0xfc,0x00,0x00]
 
-v_mac_f16 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x46]
+v_lshlrev_b64 v[5:6], 0, 0
+// CHECK: [0x05,0x00,0x8f,0xd2,0x80,0x00,0x01,0x00]
 
-v_mac_f16 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x46]
+v_lshlrev_b64 v[5:6], 0, -1
+// CHECK: [0x05,0x00,0x8f,0xd2,0x80,0x82,0x01,0x00]
 
-v_mac_f16 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x46]
+v_lshlrev_b64 v[5:6], 0, 0.5
+// CHECK: [0x05,0x00,0x8f,0xd2,0x80,0xe0,0x01,0x00]
 
-v_mac_f16 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x46]
+v_lshlrev_b64 v[5:6], 0, -4.0
+// CHECK: [0x05,0x00,0x8f,0xd2,0x80,0xee,0x01,0x00]
 
-v_mac_f16 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x46]
+v_lshlrev_b64 v[5:6], 0, v[2:3]
+// CHECK: [0x05,0x00,0x8f,0xd2,0x80,0x04,0x02,0x00]
 
-v_mac_f16 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x46]
+v_lshlrev_b64 v[5:6], 0, v[254:255]
+// CHECK: [0x05,0x00,0x8f,0xd2,0x80,0xfc,0x03,0x00]
 
-v_mac_f16 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x46]
+v_lshrrev_b64 v[5:6], 0, s[4:5]
+// CHECK: [0x05,0x00,0x90,0xd2,0x80,0x08,0x00,0x00]
 
-v_mac_f16 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x46]
+v_lshrrev_b64 v[254:255], 0, s[4:5]
+// CHECK: [0xfe,0x00,0x90,0xd2,0x80,0x08,0x00,0x00]
 
-v_mac_f16 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x46]
+v_lshrrev_b64 v[5:6], -1, s[4:5]
+// CHECK: [0x05,0x00,0x90,0xd2,0xc1,0x08,0x00,0x00]
 
-v_mac_f16 v0, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x00,0x46,0x0b,0xfe,0x00,0x00]
+v_lshrrev_b64 v[5:6], 0.5, s[4:5]
+// CHECK: [0x05,0x00,0x90,0xd2,0xf0,0x08,0x00,0x00]
 
-v_mac_f16 v0, 0x3456, v0
-// CHECK: [0xff,0x00,0x00,0x46,0x56,0x34,0x00,0x00]
+v_lshrrev_b64 v[5:6], -4.0, s[4:5]
+// CHECK: [0x05,0x00,0x90,0xd2,0xf7,0x08,0x00,0x00]
 
-v_mac_f16 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x46]
+v_lshrrev_b64 v[5:6], v1, s[4:5]
+// CHECK: [0x05,0x00,0x90,0xd2,0x01,0x09,0x00,0x00]
 
-v_mac_f16 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x46]
+v_lshrrev_b64 v[5:6], v255, s[4:5]
+// CHECK: [0x05,0x00,0x90,0xd2,0xff,0x09,0x00,0x00]
 
-v_mac_f16 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x46]
+v_lshrrev_b64 v[5:6], 0, s[6:7]
+// CHECK: [0x05,0x00,0x90,0xd2,0x80,0x0c,0x00,0x00]
 
-v_mac_f16_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x23,0xd1,0x00,0x00,0x00,0x00]
+v_lshrrev_b64 v[5:6], 0, s[100:101]
+// CHECK: [0x05,0x00,0x90,0xd2,0x80,0xc8,0x00,0x00]
 
-v_mac_f16_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x23,0xd1,0x00,0x00,0x00,0x00]
+v_lshrrev_b64 v[5:6], 0, flat_scratch
+// CHECK: [0x05,0x00,0x90,0xd2,0x80,0xcc,0x00,0x00]
 
-v_mac_f16_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x23,0xd1,0xfd,0x00,0x00,0x00]
+v_lshrrev_b64 v[5:6], 0, vcc
+// CHECK: [0x05,0x00,0x90,0xd2,0x80,0xd4,0x00,0x00]
 
-v_mac_f16_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x23,0xd1,0x00,0x01,0x00,0x00]
+v_lshrrev_b64 v[5:6], 0, tba
+// CHECK: [0x05,0x00,0x90,0xd2,0x80,0xd8,0x00,0x00]
 
-v_mac_f16_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x23,0xd1,0xff,0x01,0x00,0x00]
+v_lshrrev_b64 v[5:6], 0, tma
+// CHECK: [0x05,0x00,0x90,0xd2,0x80,0xdc,0x00,0x00]
 
-v_mac_f16_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x23,0xd1,0x00,0xfa,0x01,0x00]
+v_lshrrev_b64 v[5:6], 0, ttmp[10:11]
+// CHECK: [0x05,0x00,0x90,0xd2,0x80,0xf4,0x00,0x00]
 
-v_mac_f16_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x23,0xd1,0x00,0x00,0x02,0x00]
+v_lshrrev_b64 v[5:6], 0, exec
+// CHECK: [0x05,0x00,0x90,0xd2,0x80,0xfc,0x00,0x00]
 
-v_mac_f16_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x23,0xd1,0x00,0xfe,0x03,0x00]
+v_lshrrev_b64 v[5:6], 0, 0
+// CHECK: [0x05,0x00,0x90,0xd2,0x80,0x00,0x01,0x00]
 
-v_mac_f16_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x23,0xd1,0x00,0x00,0x00,0x20]
+v_lshrrev_b64 v[5:6], 0, -1
+// CHECK: [0x05,0x00,0x90,0xd2,0x80,0x82,0x01,0x00]
 
-v_mac_f16_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x23,0xd1,0x00,0x00,0x00,0x40]
+v_lshrrev_b64 v[5:6], 0, 0.5
+// CHECK: [0x05,0x00,0x90,0xd2,0x80,0xe0,0x01,0x00]
 
-v_mac_f16_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x23,0xd1,0x00,0x00,0x00,0x60]
+v_lshrrev_b64 v[5:6], 0, -4.0
+// CHECK: [0x05,0x00,0x90,0xd2,0x80,0xee,0x01,0x00]
 
-v_mac_f16_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x23,0xd1,0x00,0x00,0x00,0x00]
+v_lshrrev_b64 v[5:6], 0, v[2:3]
+// CHECK: [0x05,0x00,0x90,0xd2,0x80,0x04,0x02,0x00]
 
-v_mac_f16_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x23,0xd1,0x00,0x00,0x00,0x00]
+v_lshrrev_b64 v[5:6], 0, v[254:255]
+// CHECK: [0x05,0x00,0x90,0xd2,0x80,0xfc,0x03,0x00]
 
-v_mac_f16_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x23,0xd1,0x00,0x00,0x00,0x00]
+v_ashrrev_i64 v[5:6], 0, s[4:5]
+// CHECK: [0x05,0x00,0x91,0xd2,0x80,0x08,0x00,0x00]
 
-v_mac_f16_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x23,0xd1,0x00,0x00,0x00,0x00]
+v_ashrrev_i64 v[254:255], 0, s[4:5]
+// CHECK: [0xfe,0x00,0x91,0xd2,0x80,0x08,0x00,0x00]
 
-v_madmk_f16 v0, 0, 0x1121, v0
-// CHECK: [0x80,0x00,0x00,0x48,0x21,0x11,0x00,0x00]
+v_ashrrev_i64 v[5:6], -1, s[4:5]
+// CHECK: [0x05,0x00,0x91,0xd2,0xc1,0x08,0x00,0x00]
 
-v_madmk_f16 v255, 0, 0x1121, v0
-// CHECK: [0x80,0x00,0xfe,0x49,0x21,0x11,0x00,0x00]
+v_ashrrev_i64 v[5:6], 0.5, s[4:5]
+// CHECK: [0x05,0x00,0x91,0xd2,0xf0,0x08,0x00,0x00]
 
-v_madmk_f16 v0, -1, 0x1121, v0
-// CHECK: [0xc1,0x00,0x00,0x48,0x21,0x11,0x00,0x00]
+v_ashrrev_i64 v[5:6], -4.0, s[4:5]
+// CHECK: [0x05,0x00,0x91,0xd2,0xf7,0x08,0x00,0x00]
 
-v_madmk_f16 v0, 0.5, 0x1121, v0
-// CHECK: [0xf0,0x00,0x00,0x48,0x21,0x11,0x00,0x00]
+v_ashrrev_i64 v[5:6], v1, s[4:5]
+// CHECK: [0x05,0x00,0x91,0xd2,0x01,0x09,0x00,0x00]
 
-v_madmk_f16 v0, -4.0, 0x1121, v0
-// CHECK: [0xf7,0x00,0x00,0x48,0x21,0x11,0x00,0x00]
+v_ashrrev_i64 v[5:6], v255, s[4:5]
+// CHECK: [0x05,0x00,0x91,0xd2,0xff,0x09,0x00,0x00]
 
-v_madmk_f16 v0, v0, 0x1121, v0
-// CHECK: [0x00,0x01,0x00,0x48,0x21,0x11,0x00,0x00]
+v_ashrrev_i64 v[5:6], 0, s[6:7]
+// CHECK: [0x05,0x00,0x91,0xd2,0x80,0x0c,0x00,0x00]
 
-v_madmk_f16 v0, v255, 0x1121, v0
-// CHECK: [0xff,0x01,0x00,0x48,0x21,0x11,0x00,0x00]
+v_ashrrev_i64 v[5:6], 0, s[100:101]
+// CHECK: [0x05,0x00,0x91,0xd2,0x80,0xc8,0x00,0x00]
 
-v_madmk_f16 v0, 0, 0xa1b1, v0
-// CHECK: [0x80,0x00,0x00,0x48,0xb1,0xa1,0x00,0x00]
+v_ashrrev_i64 v[5:6], 0, flat_scratch
+// CHECK: [0x05,0x00,0x91,0xd2,0x80,0xcc,0x00,0x00]
 
-v_madmk_f16 v0, 0, 0x1121, v255
-// CHECK: [0x80,0xfe,0x01,0x48,0x21,0x11,0x00,0x00]
+v_ashrrev_i64 v[5:6], 0, vcc
+// CHECK: [0x05,0x00,0x91,0xd2,0x80,0xd4,0x00,0x00]
 
-v_madak_f16 v0, 0, v0, 0x1121
-// CHECK: [0x80,0x00,0x00,0x4a,0x21,0x11,0x00,0x00]
+v_ashrrev_i64 v[5:6], 0, tba
+// CHECK: [0x05,0x00,0x91,0xd2,0x80,0xd8,0x00,0x00]
 
-v_madak_f16 v255, 0, v0, 0x1121
-// CHECK: [0x80,0x00,0xfe,0x4b,0x21,0x11,0x00,0x00]
+v_ashrrev_i64 v[5:6], 0, tma
+// CHECK: [0x05,0x00,0x91,0xd2,0x80,0xdc,0x00,0x00]
 
-v_madak_f16 v0, -1, v0, 0x1121
-// CHECK: [0xc1,0x00,0x00,0x4a,0x21,0x11,0x00,0x00]
+v_ashrrev_i64 v[5:6], 0, ttmp[10:11]
+// CHECK: [0x05,0x00,0x91,0xd2,0x80,0xf4,0x00,0x00]
 
-v_madak_f16 v0, 0.5, v0, 0x1121
-// CHECK: [0xf0,0x00,0x00,0x4a,0x21,0x11,0x00,0x00]
+v_ashrrev_i64 v[5:6], 0, exec
+// CHECK: [0x05,0x00,0x91,0xd2,0x80,0xfc,0x00,0x00]
 
-v_madak_f16 v0, -4.0, v0, 0x1121
-// CHECK: [0xf7,0x00,0x00,0x4a,0x21,0x11,0x00,0x00]
+v_ashrrev_i64 v[5:6], 0, 0
+// CHECK: [0x05,0x00,0x91,0xd2,0x80,0x00,0x01,0x00]
 
-v_madak_f16 v0, v0, v0, 0x1121
-// CHECK: [0x00,0x01,0x00,0x4a,0x21,0x11,0x00,0x00]
+v_ashrrev_i64 v[5:6], 0, -1
+// CHECK: [0x05,0x00,0x91,0xd2,0x80,0x82,0x01,0x00]
 
-v_madak_f16 v0, v255, v0, 0x1121
-// CHECK: [0xff,0x01,0x00,0x4a,0x21,0x11,0x00,0x00]
+v_ashrrev_i64 v[5:6], 0, 0.5
+// CHECK: [0x05,0x00,0x91,0xd2,0x80,0xe0,0x01,0x00]
 
-v_madak_f16 v0, 0, v255, 0x1121
-// CHECK: [0x80,0xfe,0x01,0x4a,0x21,0x11,0x00,0x00]
+v_ashrrev_i64 v[5:6], 0, -4.0
+// CHECK: [0x05,0x00,0x91,0xd2,0x80,0xee,0x01,0x00]
 
-v_madak_f16 v0, 0, v0, 0xa1b1
-// CHECK: [0x80,0x00,0x00,0x4a,0xb1,0xa1,0x00,0x00]
+v_ashrrev_i64 v[5:6], 0, v[2:3]
+// CHECK: [0x05,0x00,0x91,0xd2,0x80,0x04,0x02,0x00]
 
-v_add_u16 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x4c]
+v_ashrrev_i64 v[5:6], 0, v[254:255]
+// CHECK: [0x05,0x00,0x91,0xd2,0x80,0xfc,0x03,0x00]
 
-v_add_u16 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x4d]
+v_trig_preop_f64 v[5:6], 0, s2
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0x04,0x00,0x00]
 
-v_add_u16 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x4c]
+v_trig_preop_f64 v[254:255], 0, s2
+// CHECK: [0xfe,0x00,0x92,0xd2,0x80,0x04,0x00,0x00]
 
-v_add_u16 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x4c]
+v_trig_preop_f64 v[5:6], 0.5, s2
+// CHECK: [0x05,0x00,0x92,0xd2,0xf0,0x04,0x00,0x00]
 
-v_add_u16 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x4c]
+v_trig_preop_f64 v[5:6], v[1:2], s2
+// CHECK: [0x05,0x00,0x92,0xd2,0x01,0x05,0x00,0x00]
 
-v_add_u16 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x4c]
+v_trig_preop_f64 v[5:6], v[254:255], s2
+// CHECK: [0x05,0x00,0x92,0xd2,0xfe,0x05,0x00,0x00]
 
-v_add_u16 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x4c]
+v_trig_preop_f64 v[5:6], 0, s101
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0xca,0x00,0x00]
 
-v_add_u16 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x4c]
+v_trig_preop_f64 v[5:6], 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0xcc,0x00,0x00]
 
-v_add_u16 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x4c]
+v_trig_preop_f64 v[5:6], 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0xce,0x00,0x00]
 
-v_add_u16 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x4c]
+v_trig_preop_f64 v[5:6], 0, vcc_lo
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0xd4,0x00,0x00]
 
-v_add_u16 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x4c]
+v_trig_preop_f64 v[5:6], 0, vcc_hi
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0xd6,0x00,0x00]
 
-v_add_u16 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x4c]
+v_trig_preop_f64 v[5:6], 0, tba_lo
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0xd8,0x00,0x00]
 
-v_add_u16 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x4c]
+v_trig_preop_f64 v[5:6], 0, tba_hi
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0xda,0x00,0x00]
 
-v_add_u16 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x4c]
+v_trig_preop_f64 v[5:6], 0, tma_lo
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0xdc,0x00,0x00]
 
-v_add_u16 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x4c]
+v_trig_preop_f64 v[5:6], 0, tma_hi
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0xde,0x00,0x00]
 
-v_add_u16 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x4c]
+v_trig_preop_f64 v[5:6], 0, ttmp11
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0xf6,0x00,0x00]
 
-v_add_u16 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x4c]
+v_trig_preop_f64 v[5:6], 0, m0
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0xf8,0x00,0x00]
 
-v_add_u16 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x4c]
+v_trig_preop_f64 v[5:6], 0, exec_lo
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0xfc,0x00,0x00]
 
-v_add_u16 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x4c]
+v_trig_preop_f64 v[5:6], 0, exec_hi
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0xfe,0x00,0x00]
 
-v_add_u16 v0, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x00,0x4c,0x0b,0xfe,0x00,0x00]
+v_trig_preop_f64 v[5:6], 0, 0
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0x00,0x01,0x00]
 
-v_add_u16 v0, 0x3456, v0
-// CHECK: [0xff,0x00,0x00,0x4c,0x56,0x34,0x00,0x00]
+v_trig_preop_f64 v[5:6], 0, -1
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0x82,0x01,0x00]
 
-v_add_u16 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x4c]
+v_trig_preop_f64 v[5:6], 0, 0.5
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0xe0,0x01,0x00]
 
-v_add_u16 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x4c]
+v_trig_preop_f64 v[5:6], 0, -4.0
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0xee,0x01,0x00]
 
-v_add_u16 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x4c]
+v_trig_preop_f64 v[5:6], 0, scc
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0xfa,0x01,0x00]
 
-v_add_u16_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x26,0xd1,0x00,0x00,0x00,0x00]
+v_trig_preop_f64 v[5:6], 0, v2
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0x04,0x02,0x00]
 
-v_add_u16_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x26,0xd1,0x00,0x00,0x00,0x00]
+v_trig_preop_f64 v[5:6], 0, v255
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0xfe,0x03,0x00]
 
-v_add_u16_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x26,0xd1,0x80,0x00,0x00,0x00]
+v_trig_preop_f64 v[5:6], 0, s2 clamp
+// CHECK: [0x05,0x80,0x92,0xd2,0x80,0x04,0x00,0x00]
 
-v_add_u16_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x26,0xd1,0xc1,0x00,0x00,0x00]
+v_trig_preop_f64 v[5:6], 0, s2 mul:2
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0x04,0x00,0x08]
 
-v_add_u16_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x26,0xd1,0xf0,0x00,0x00,0x00]
+v_trig_preop_f64 v[5:6], 0, s2 mul:4
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0x04,0x00,0x10]
 
-v_add_u16_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x26,0xd1,0xf7,0x00,0x00,0x00]
+v_trig_preop_f64 v[5:6], 0, s2 div:2
+// CHECK: [0x05,0x00,0x92,0xd2,0x80,0x04,0x00,0x18]
 
-v_add_u16_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x26,0xd1,0x00,0x01,0x00,0x00]
+v_bfm_b32 v5, 0, s2
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0x04,0x00,0x00]
 
-v_add_u16_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x26,0xd1,0xff,0x01,0x00,0x00]
+v_bfm_b32 v255, 0, s2
+// CHECK: [0xff,0x00,0x93,0xd2,0x80,0x04,0x00,0x00]
 
-v_add_u16_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x26,0xd1,0x00,0x00,0x01,0x00]
+v_bfm_b32 v5, -1, s2
+// CHECK: [0x05,0x00,0x93,0xd2,0xc1,0x04,0x00,0x00]
 
-v_add_u16_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x26,0xd1,0x00,0x82,0x01,0x00]
+v_bfm_b32 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x93,0xd2,0xf0,0x04,0x00,0x00]
 
-v_add_u16_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x26,0xd1,0x00,0xe0,0x01,0x00]
+v_bfm_b32 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x93,0xd2,0xf7,0x04,0x00,0x00]
 
-v_add_u16_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x26,0xd1,0x00,0xee,0x01,0x00]
+v_bfm_b32 v5, v1, s2
+// CHECK: [0x05,0x00,0x93,0xd2,0x01,0x05,0x00,0x00]
 
-v_add_u16_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x26,0xd1,0x00,0x00,0x02,0x00]
+v_bfm_b32 v5, v255, s2
+// CHECK: [0x05,0x00,0x93,0xd2,0xff,0x05,0x00,0x00]
 
-v_add_u16_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x26,0xd1,0x00,0xfe,0x03,0x00]
+v_bfm_b32 v5, 0, s101
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0xca,0x00,0x00]
 
-v_sub_u16 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x4e]
+v_bfm_b32 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0xcc,0x00,0x00]
 
-v_sub_u16 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x4f]
+v_bfm_b32 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0xce,0x00,0x00]
 
-v_sub_u16 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x4e]
+v_bfm_b32 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0xd4,0x00,0x00]
 
-v_sub_u16 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x4e]
+v_bfm_b32 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0xd6,0x00,0x00]
 
-v_sub_u16 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x4e]
+v_bfm_b32 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0xd8,0x00,0x00]
 
-v_sub_u16 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x4e]
+v_bfm_b32 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0xda,0x00,0x00]
 
-v_sub_u16 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x4e]
+v_bfm_b32 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0xdc,0x00,0x00]
 
-v_sub_u16 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x4e]
+v_bfm_b32 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0xde,0x00,0x00]
 
-v_sub_u16 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x4e]
+v_bfm_b32 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0xf6,0x00,0x00]
 
-v_sub_u16 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x4e]
+v_bfm_b32 v5, 0, m0
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0xf8,0x00,0x00]
 
-v_sub_u16 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x4e]
+v_bfm_b32 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0xfc,0x00,0x00]
 
-v_sub_u16 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x4e]
+v_bfm_b32 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0xfe,0x00,0x00]
 
-v_sub_u16 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x4e]
+v_bfm_b32 v5, 0, 0
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0x00,0x01,0x00]
 
-v_sub_u16 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x4e]
+v_bfm_b32 v5, 0, -1
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0x82,0x01,0x00]
 
-v_sub_u16 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x4e]
+v_bfm_b32 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0xe0,0x01,0x00]
 
-v_sub_u16 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x4e]
+v_bfm_b32 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0xee,0x01,0x00]
 
-v_sub_u16 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x4e]
+v_bfm_b32 v5, 0, v2
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0x04,0x02,0x00]
 
-v_sub_u16 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x4e]
+v_bfm_b32 v5, 0, v255
+// CHECK: [0x05,0x00,0x93,0xd2,0x80,0xfe,0x03,0x00]
 
-v_sub_u16 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x4e]
+v_cvt_pknorm_i16_f32 v5, v1, s2
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0x05,0x00,0x00]
 
-v_sub_u16 v0, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x00,0x4e,0x0b,0xfe,0x00,0x00]
+v_cvt_pknorm_i16_f32 v255, v1, s2
+// CHECK: [0xff,0x00,0x94,0xd2,0x01,0x05,0x00,0x00]
 
-v_sub_u16 v0, 0x3456, v0
-// CHECK: [0xff,0x00,0x00,0x4e,0x56,0x34,0x00,0x00]
+v_cvt_pknorm_i16_f32 v5, v255, s2
+// CHECK: [0x05,0x00,0x94,0xd2,0xff,0x05,0x00,0x00]
 
-v_sub_u16 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x4e]
+v_cvt_pknorm_i16_f32 v5, v1, s101
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0xcb,0x00,0x00]
 
-v_sub_u16 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x4e]
+v_cvt_pknorm_i16_f32 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0xcd,0x00,0x00]
 
-v_sub_u16 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x4e]
+v_cvt_pknorm_i16_f32 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0xcf,0x00,0x00]
 
-v_sub_u16_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x27,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_pknorm_i16_f32 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0xd5,0x00,0x00]
 
-v_sub_u16_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x27,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_pknorm_i16_f32 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0xd7,0x00,0x00]
 
-v_sub_u16_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x27,0xd1,0x80,0x00,0x00,0x00]
+v_cvt_pknorm_i16_f32 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0xd9,0x00,0x00]
 
-v_sub_u16_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x27,0xd1,0xc1,0x00,0x00,0x00]
+v_cvt_pknorm_i16_f32 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0xdb,0x00,0x00]
 
-v_sub_u16_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x27,0xd1,0xf0,0x00,0x00,0x00]
+v_cvt_pknorm_i16_f32 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0xdd,0x00,0x00]
 
-v_sub_u16_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x27,0xd1,0xf7,0x00,0x00,0x00]
+v_cvt_pknorm_i16_f32 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0xdf,0x00,0x00]
 
-v_sub_u16_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x27,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_pknorm_i16_f32 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0xf7,0x00,0x00]
 
-v_sub_u16_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x27,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_pknorm_i16_f32 v5, v1, m0
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0xf9,0x00,0x00]
 
-v_sub_u16_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x27,0xd1,0x00,0x00,0x01,0x00]
+v_cvt_pknorm_i16_f32 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0xfd,0x00,0x00]
 
-v_sub_u16_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x27,0xd1,0x00,0x82,0x01,0x00]
+v_cvt_pknorm_i16_f32 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0xff,0x00,0x00]
 
-v_sub_u16_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x27,0xd1,0x00,0xe0,0x01,0x00]
+v_cvt_pknorm_i16_f32 v5, v1, scc
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0xfb,0x01,0x00]
 
-v_sub_u16_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x27,0xd1,0x00,0xee,0x01,0x00]
+v_cvt_pknorm_i16_f32 v5, v1, v2
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0x05,0x02,0x00]
 
-v_sub_u16_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x27,0xd1,0x00,0x00,0x02,0x00]
+v_cvt_pknorm_i16_f32 v5, v1, v255
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0xff,0x03,0x00]
 
-v_sub_u16_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x27,0xd1,0x00,0xfe,0x03,0x00]
+v_cvt_pknorm_i16_f32 v5, -v1, s2
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0x05,0x00,0x20]
 
-v_subrev_u16 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x50]
+v_cvt_pknorm_i16_f32 v5, v1, -s2
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0x05,0x00,0x40]
 
-v_subrev_u16 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x51]
+v_cvt_pknorm_i16_f32 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x94,0xd2,0x01,0x05,0x00,0x60]
 
-v_subrev_u16 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x50]
+v_cvt_pknorm_i16_f32 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x94,0xd2,0x01,0x05,0x00,0x00]
 
-v_subrev_u16 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x50]
+v_cvt_pknorm_i16_f32 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x94,0xd2,0x01,0x05,0x00,0x00]
 
-v_subrev_u16 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x50]
+v_cvt_pknorm_i16_f32 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x94,0xd2,0x01,0x05,0x00,0x00]
 
-v_subrev_u16 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x50]
+v_cvt_pknorm_i16_f32 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x94,0xd2,0x01,0x05,0x00,0x00]
 
-v_subrev_u16 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x50]
+v_cvt_pknorm_u16_f32 v5, v1, s2
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0x05,0x00,0x00]
 
-v_subrev_u16 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x50]
+v_cvt_pknorm_u16_f32 v255, v1, s2
+// CHECK: [0xff,0x00,0x95,0xd2,0x01,0x05,0x00,0x00]
 
-v_subrev_u16 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x50]
+v_cvt_pknorm_u16_f32 v5, v255, s2
+// CHECK: [0x05,0x00,0x95,0xd2,0xff,0x05,0x00,0x00]
 
-v_subrev_u16 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x50]
+v_cvt_pknorm_u16_f32 v5, v1, s101
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0xcb,0x00,0x00]
 
-v_subrev_u16 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x50]
+v_cvt_pknorm_u16_f32 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0xcd,0x00,0x00]
 
-v_subrev_u16 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x50]
+v_cvt_pknorm_u16_f32 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0xcf,0x00,0x00]
 
-v_subrev_u16 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x50]
+v_cvt_pknorm_u16_f32 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0xd5,0x00,0x00]
 
-v_subrev_u16 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x50]
+v_cvt_pknorm_u16_f32 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0xd7,0x00,0x00]
 
-v_subrev_u16 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x50]
+v_cvt_pknorm_u16_f32 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0xd9,0x00,0x00]
 
-v_subrev_u16 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x50]
+v_cvt_pknorm_u16_f32 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0xdb,0x00,0x00]
 
-v_subrev_u16 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x50]
+v_cvt_pknorm_u16_f32 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0xdd,0x00,0x00]
 
-v_subrev_u16 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x50]
+v_cvt_pknorm_u16_f32 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0xdf,0x00,0x00]
 
-v_subrev_u16 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x50]
+v_cvt_pknorm_u16_f32 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0xf7,0x00,0x00]
 
-v_subrev_u16 v0, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x00,0x50,0x0b,0xfe,0x00,0x00]
+v_cvt_pknorm_u16_f32 v5, v1, m0
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0xf9,0x00,0x00]
 
-v_subrev_u16 v0, 0x3456, v0
-// CHECK: [0xff,0x00,0x00,0x50,0x56,0x34,0x00,0x00]
+v_cvt_pknorm_u16_f32 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0xfd,0x00,0x00]
 
-v_subrev_u16 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x50]
+v_cvt_pknorm_u16_f32 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0xff,0x00,0x00]
 
-v_subrev_u16 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x50]
+v_cvt_pknorm_u16_f32 v5, v1, scc
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0xfb,0x01,0x00]
 
-v_subrev_u16 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x50]
+v_cvt_pknorm_u16_f32 v5, v1, v2
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0x05,0x02,0x00]
 
-v_subrev_u16_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x28,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_pknorm_u16_f32 v5, v1, v255
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0xff,0x03,0x00]
 
-v_subrev_u16_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x28,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_pknorm_u16_f32 v5, -v1, s2
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0x05,0x00,0x20]
 
-v_subrev_u16_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x28,0xd1,0x80,0x00,0x00,0x00]
+v_cvt_pknorm_u16_f32 v5, v1, -s2
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0x05,0x00,0x40]
 
-v_subrev_u16_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x28,0xd1,0xc1,0x00,0x00,0x00]
+v_cvt_pknorm_u16_f32 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x95,0xd2,0x01,0x05,0x00,0x60]
 
-v_subrev_u16_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x28,0xd1,0xf0,0x00,0x00,0x00]
+v_cvt_pknorm_u16_f32 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x95,0xd2,0x01,0x05,0x00,0x00]
 
-v_subrev_u16_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x28,0xd1,0xf7,0x00,0x00,0x00]
+v_cvt_pknorm_u16_f32 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x95,0xd2,0x01,0x05,0x00,0x00]
 
-v_subrev_u16_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x28,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_pknorm_u16_f32 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x95,0xd2,0x01,0x05,0x00,0x00]
 
-v_subrev_u16_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x28,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_pknorm_u16_f32 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x95,0xd2,0x01,0x05,0x00,0x00]
 
-v_subrev_u16_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x28,0xd1,0x00,0x00,0x01,0x00]
+v_cvt_pkrtz_f16_f32 v5, v1, s2
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0x05,0x00,0x00]
 
-v_subrev_u16_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x28,0xd1,0x00,0x82,0x01,0x00]
+v_cvt_pkrtz_f16_f32 v255, v1, s2
+// CHECK: [0xff,0x00,0x96,0xd2,0x01,0x05,0x00,0x00]
 
-v_subrev_u16_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x28,0xd1,0x00,0xe0,0x01,0x00]
+v_cvt_pkrtz_f16_f32 v5, v255, s2
+// CHECK: [0x05,0x00,0x96,0xd2,0xff,0x05,0x00,0x00]
 
-v_subrev_u16_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x28,0xd1,0x00,0xee,0x01,0x00]
+v_cvt_pkrtz_f16_f32 v5, v1, s101
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0xcb,0x00,0x00]
 
-v_subrev_u16_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x28,0xd1,0x00,0x00,0x02,0x00]
+v_cvt_pkrtz_f16_f32 v5, v1, flat_scratch_lo
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0xcd,0x00,0x00]
 
-v_subrev_u16_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x28,0xd1,0x00,0xfe,0x03,0x00]
+v_cvt_pkrtz_f16_f32 v5, v1, flat_scratch_hi
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0xcf,0x00,0x00]
 
-v_mul_lo_u16 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x52]
+v_cvt_pkrtz_f16_f32 v5, v1, vcc_lo
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0xd5,0x00,0x00]
 
-v_mul_lo_u16 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x53]
+v_cvt_pkrtz_f16_f32 v5, v1, vcc_hi
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0xd7,0x00,0x00]
 
-v_mul_lo_u16 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x52]
+v_cvt_pkrtz_f16_f32 v5, v1, tba_lo
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0xd9,0x00,0x00]
 
-v_mul_lo_u16 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x52]
+v_cvt_pkrtz_f16_f32 v5, v1, tba_hi
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0xdb,0x00,0x00]
 
-v_mul_lo_u16 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x52]
+v_cvt_pkrtz_f16_f32 v5, v1, tma_lo
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0xdd,0x00,0x00]
 
-v_mul_lo_u16 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x52]
+v_cvt_pkrtz_f16_f32 v5, v1, tma_hi
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0xdf,0x00,0x00]
 
-v_mul_lo_u16 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x52]
+v_cvt_pkrtz_f16_f32 v5, v1, ttmp11
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0xf7,0x00,0x00]
 
-v_mul_lo_u16 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x52]
+v_cvt_pkrtz_f16_f32 v5, v1, m0
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0xf9,0x00,0x00]
 
-v_mul_lo_u16 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x52]
+v_cvt_pkrtz_f16_f32 v5, v1, exec_lo
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0xfd,0x00,0x00]
 
-v_mul_lo_u16 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x52]
+v_cvt_pkrtz_f16_f32 v5, v1, exec_hi
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0xff,0x00,0x00]
 
-v_mul_lo_u16 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x52]
+v_cvt_pkrtz_f16_f32 v5, v1, scc
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0xfb,0x01,0x00]
 
-v_mul_lo_u16 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x52]
+v_cvt_pkrtz_f16_f32 v5, v1, v2
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0x05,0x02,0x00]
 
-v_mul_lo_u16 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x52]
+v_cvt_pkrtz_f16_f32 v5, v1, v255
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0xff,0x03,0x00]
 
-v_mul_lo_u16 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x52]
+v_cvt_pkrtz_f16_f32 v5, -v1, s2
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0x05,0x00,0x20]
 
-v_mul_lo_u16 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x52]
+v_cvt_pkrtz_f16_f32 v5, v1, -s2
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0x05,0x00,0x40]
 
-v_mul_lo_u16 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x52]
+v_cvt_pkrtz_f16_f32 v5, -v1, -s2
+// CHECK: [0x05,0x00,0x96,0xd2,0x01,0x05,0x00,0x60]
 
-v_mul_lo_u16 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x52]
+v_cvt_pkrtz_f16_f32 v5, |v1|, s2
+// CHECK: [0x05,0x01,0x96,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_lo_u16 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x52]
+v_cvt_pkrtz_f16_f32 v5, v1, |s2|
+// CHECK: [0x05,0x02,0x96,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_lo_u16 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x52]
+v_cvt_pkrtz_f16_f32 v5, |v1|, |s2|
+// CHECK: [0x05,0x03,0x96,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_lo_u16 v0, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x00,0x52,0x0b,0xfe,0x00,0x00]
+v_cvt_pkrtz_f16_f32 v5, v1, s2 clamp
+// CHECK: [0x05,0x80,0x96,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_lo_u16 v0, 0x3456, v0
-// CHECK: [0xff,0x00,0x00,0x52,0x56,0x34,0x00,0x00]
+v_cvt_pk_u16_u32 v5, 0, s2
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0x04,0x00,0x00]
 
-v_mul_lo_u16 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x52]
+v_cvt_pk_u16_u32 v255, 0, s2
+// CHECK: [0xff,0x00,0x97,0xd2,0x80,0x04,0x00,0x00]
 
-v_mul_lo_u16 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x52]
+v_cvt_pk_u16_u32 v5, -1, s2
+// CHECK: [0x05,0x00,0x97,0xd2,0xc1,0x04,0x00,0x00]
 
-v_mul_lo_u16 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x52]
+v_cvt_pk_u16_u32 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x97,0xd2,0xf0,0x04,0x00,0x00]
 
-v_mul_lo_u16_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x29,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_pk_u16_u32 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x97,0xd2,0xf7,0x04,0x00,0x00]
 
-v_mul_lo_u16_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x29,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_pk_u16_u32 v5, v1, s2
+// CHECK: [0x05,0x00,0x97,0xd2,0x01,0x05,0x00,0x00]
 
-v_mul_lo_u16_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x29,0xd1,0x80,0x00,0x00,0x00]
+v_cvt_pk_u16_u32 v5, v255, s2
+// CHECK: [0x05,0x00,0x97,0xd2,0xff,0x05,0x00,0x00]
 
-v_mul_lo_u16_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x29,0xd1,0xc1,0x00,0x00,0x00]
+v_cvt_pk_u16_u32 v5, 0, s101
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0xca,0x00,0x00]
 
-v_mul_lo_u16_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x29,0xd1,0xf0,0x00,0x00,0x00]
+v_cvt_pk_u16_u32 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0xcc,0x00,0x00]
 
-v_mul_lo_u16_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x29,0xd1,0xf7,0x00,0x00,0x00]
+v_cvt_pk_u16_u32 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0xce,0x00,0x00]
 
-v_mul_lo_u16_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x29,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_pk_u16_u32 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0xd4,0x00,0x00]
 
-v_mul_lo_u16_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x29,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_pk_u16_u32 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0xd6,0x00,0x00]
 
-v_mul_lo_u16_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x29,0xd1,0x00,0x00,0x01,0x00]
+v_cvt_pk_u16_u32 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0xd8,0x00,0x00]
 
-v_mul_lo_u16_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x29,0xd1,0x00,0x82,0x01,0x00]
+v_cvt_pk_u16_u32 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0xda,0x00,0x00]
 
-v_mul_lo_u16_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x29,0xd1,0x00,0xe0,0x01,0x00]
+v_cvt_pk_u16_u32 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0xdc,0x00,0x00]
 
-v_mul_lo_u16_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x29,0xd1,0x00,0xee,0x01,0x00]
+v_cvt_pk_u16_u32 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0xde,0x00,0x00]
 
-v_mul_lo_u16_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x29,0xd1,0x00,0x00,0x02,0x00]
+v_cvt_pk_u16_u32 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0xf6,0x00,0x00]
 
-v_mul_lo_u16_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x29,0xd1,0x00,0xfe,0x03,0x00]
+v_cvt_pk_u16_u32 v5, 0, m0
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0xf8,0x00,0x00]
 
-v_lshlrev_b16 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x54]
+v_cvt_pk_u16_u32 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0xfc,0x00,0x00]
 
-v_lshlrev_b16 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x55]
+v_cvt_pk_u16_u32 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0xfe,0x00,0x00]
 
-v_lshlrev_b16 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x54]
+v_cvt_pk_u16_u32 v5, 0, 0
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0x00,0x01,0x00]
 
-v_lshlrev_b16 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x54]
+v_cvt_pk_u16_u32 v5, 0, -1
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0x82,0x01,0x00]
 
-v_lshlrev_b16 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x54]
+v_cvt_pk_u16_u32 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0xe0,0x01,0x00]
 
-v_lshlrev_b16 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x54]
+v_cvt_pk_u16_u32 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0xee,0x01,0x00]
 
-v_lshlrev_b16 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x54]
+v_cvt_pk_u16_u32 v5, 0, v2
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0x04,0x02,0x00]
 
-v_lshlrev_b16 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x54]
+v_cvt_pk_u16_u32 v5, 0, v255
+// CHECK: [0x05,0x00,0x97,0xd2,0x80,0xfe,0x03,0x00]
 
-v_lshlrev_b16 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x54]
+v_cvt_pk_i16_i32 v5, 0, s2
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0x04,0x00,0x00]
 
-v_lshlrev_b16 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x54]
+v_cvt_pk_i16_i32 v255, 0, s2
+// CHECK: [0xff,0x00,0x98,0xd2,0x80,0x04,0x00,0x00]
 
-v_lshlrev_b16 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x54]
+v_cvt_pk_i16_i32 v5, -1, s2
+// CHECK: [0x05,0x00,0x98,0xd2,0xc1,0x04,0x00,0x00]
 
-v_lshlrev_b16 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x54]
+v_cvt_pk_i16_i32 v5, 0.5, s2
+// CHECK: [0x05,0x00,0x98,0xd2,0xf0,0x04,0x00,0x00]
 
-v_lshlrev_b16 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x54]
+v_cvt_pk_i16_i32 v5, -4.0, s2
+// CHECK: [0x05,0x00,0x98,0xd2,0xf7,0x04,0x00,0x00]
 
-v_lshlrev_b16 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x54]
+v_cvt_pk_i16_i32 v5, v1, s2
+// CHECK: [0x05,0x00,0x98,0xd2,0x01,0x05,0x00,0x00]
 
-v_lshlrev_b16 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x54]
+v_cvt_pk_i16_i32 v5, v255, s2
+// CHECK: [0x05,0x00,0x98,0xd2,0xff,0x05,0x00,0x00]
 
-v_lshlrev_b16 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x54]
+v_cvt_pk_i16_i32 v5, 0, s101
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0xca,0x00,0x00]
 
-v_lshlrev_b16 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x54]
+v_cvt_pk_i16_i32 v5, 0, flat_scratch_lo
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0xcc,0x00,0x00]
 
-v_lshlrev_b16 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x54]
+v_cvt_pk_i16_i32 v5, 0, flat_scratch_hi
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0xce,0x00,0x00]
 
-v_lshlrev_b16 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x54]
+v_cvt_pk_i16_i32 v5, 0, vcc_lo
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0xd4,0x00,0x00]
 
-v_lshlrev_b16 v0, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x00,0x54,0x0b,0xfe,0x00,0x00]
+v_cvt_pk_i16_i32 v5, 0, vcc_hi
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0xd6,0x00,0x00]
 
-v_lshlrev_b16 v0, 0x3456, v0
-// CHECK: [0xff,0x00,0x00,0x54,0x56,0x34,0x00,0x00]
+v_cvt_pk_i16_i32 v5, 0, tba_lo
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0xd8,0x00,0x00]
 
-v_lshlrev_b16 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x54]
+v_cvt_pk_i16_i32 v5, 0, tba_hi
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0xda,0x00,0x00]
 
-v_lshlrev_b16 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x54]
+v_cvt_pk_i16_i32 v5, 0, tma_lo
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0xdc,0x00,0x00]
 
-v_lshlrev_b16 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x54]
+v_cvt_pk_i16_i32 v5, 0, tma_hi
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0xde,0x00,0x00]
 
-v_lshlrev_b16_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x2a,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_pk_i16_i32 v5, 0, ttmp11
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0xf6,0x00,0x00]
 
-v_lshlrev_b16_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x2a,0xd1,0x00,0x00,0x00,0x00]
+v_cvt_pk_i16_i32 v5, 0, m0
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0xf8,0x00,0x00]
 
-v_lshlrev_b16_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x2a,0xd1,0x80,0x00,0x00,0x00]
+v_cvt_pk_i16_i32 v5, 0, exec_lo
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0xfc,0x00,0x00]
 
-v_lshlrev_b16_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x2a,0xd1,0xc1,0x00,0x00,0x00]
+v_cvt_pk_i16_i32 v5, 0, exec_hi
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0xfe,0x00,0x00]
 
-v_lshlrev_b16_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x2a,0xd1,0xf0,0x00,0x00,0x00]
+v_cvt_pk_i16_i32 v5, 0, 0
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0x00,0x01,0x00]
 
-v_lshlrev_b16_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x2a,0xd1,0xf7,0x00,0x00,0x00]
+v_cvt_pk_i16_i32 v5, 0, -1
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0x82,0x01,0x00]
 
-v_lshlrev_b16_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x2a,0xd1,0x00,0x01,0x00,0x00]
+v_cvt_pk_i16_i32 v5, 0, 0.5
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0xe0,0x01,0x00]
 
-v_lshlrev_b16_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x2a,0xd1,0xff,0x01,0x00,0x00]
+v_cvt_pk_i16_i32 v5, 0, -4.0
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0xee,0x01,0x00]
 
-v_lshlrev_b16_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x2a,0xd1,0x00,0x00,0x01,0x00]
+v_cvt_pk_i16_i32 v5, 0, v2
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0x04,0x02,0x00]
 
-v_lshlrev_b16_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x2a,0xd1,0x00,0x82,0x01,0x00]
+v_cvt_pk_i16_i32 v5, 0, v255
+// CHECK: [0x05,0x00,0x98,0xd2,0x80,0xfe,0x03,0x00]
 
-v_lshlrev_b16_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x2a,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_class_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x20,0x7c]
 
-v_lshlrev_b16_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x2a,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_class_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x20,0x7c]
 
-v_lshlrev_b16_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x2a,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_class_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x20,0x7c]
 
-v_lshlrev_b16_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x2a,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_class_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x20,0x7c]
 
-v_lshrrev_b16 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x56]
+v_cmp_class_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x20,0x7c]
 
-v_lshrrev_b16 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x57]
+v_cmp_class_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x20,0x7c]
 
-v_lshrrev_b16 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x56]
+v_cmp_class_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x20,0x7c]
 
-v_lshrrev_b16 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x56]
+v_cmp_class_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x20,0x7c]
 
-v_lshrrev_b16 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x56]
+v_cmp_class_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x20,0x7c]
 
-v_lshrrev_b16 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x56]
+v_cmp_class_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x20,0x7c]
 
-v_lshrrev_b16 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x56]
+v_cmp_class_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x20,0x7c]
 
-v_lshrrev_b16 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x56]
+v_cmp_class_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x20,0x7c]
 
-v_lshrrev_b16 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x56]
+v_cmp_class_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x20,0x7c]
 
-v_lshrrev_b16 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x56]
+v_cmp_class_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x20,0x7c]
 
-v_lshrrev_b16 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x56]
+v_cmp_class_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x20,0x7c]
 
-v_lshrrev_b16 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x56]
+v_cmp_class_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x20,0x7c]
 
-v_lshrrev_b16 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x56]
+v_cmp_class_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x20,0x7c]
 
-v_lshrrev_b16 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x56]
+v_cmp_class_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x20,0x7c]
 
-v_lshrrev_b16 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x56]
+v_cmp_class_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x20,0x7c,0x56,0x34,0x12,0xaf]
 
-v_lshrrev_b16 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x56]
+v_cmp_class_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x20,0x7c,0x73,0x72,0x71,0x3f]
 
-v_lshrrev_b16 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x56]
+v_cmp_class_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x20,0x7c]
 
-v_lshrrev_b16 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x56]
+v_cmp_class_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x20,0x7c]
 
-v_lshrrev_b16 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x56]
+v_cmp_class_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x21,0x7c]
 
-v_lshrrev_b16 v0, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x00,0x56,0x0b,0xfe,0x00,0x00]
+v_cmpx_class_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x22,0x7c]
 
-v_lshrrev_b16 v0, 0x3456, v0
-// CHECK: [0xff,0x00,0x00,0x56,0x56,0x34,0x00,0x00]
+v_cmpx_class_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x22,0x7c]
 
-v_lshrrev_b16 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x56]
+v_cmpx_class_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x22,0x7c]
 
-v_lshrrev_b16 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x56]
+v_cmpx_class_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x22,0x7c]
 
-v_lshrrev_b16 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x56]
+v_cmpx_class_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x22,0x7c]
 
-v_lshrrev_b16_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x2b,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_class_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x22,0x7c]
 
-v_lshrrev_b16_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x2b,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_class_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x22,0x7c]
 
-v_lshrrev_b16_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x2b,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_class_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x22,0x7c]
 
-v_lshrrev_b16_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x2b,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_class_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x22,0x7c]
 
-v_lshrrev_b16_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x2b,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_class_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x22,0x7c]
 
-v_lshrrev_b16_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x2b,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_class_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x22,0x7c]
 
-v_lshrrev_b16_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x2b,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_class_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x22,0x7c]
 
-v_lshrrev_b16_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x2b,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_class_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x22,0x7c]
 
-v_lshrrev_b16_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x2b,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_class_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x22,0x7c]
 
-v_lshrrev_b16_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x2b,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_class_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x22,0x7c]
 
-v_lshrrev_b16_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x2b,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_class_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x22,0x7c]
 
-v_lshrrev_b16_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x2b,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_class_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x22,0x7c]
 
-v_lshrrev_b16_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x2b,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_class_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x22,0x7c]
 
-v_lshrrev_b16_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x2b,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_class_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x22,0x7c,0x56,0x34,0x12,0xaf]
 
-v_ashrrev_i16 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x58]
+v_cmpx_class_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x22,0x7c,0x73,0x72,0x71,0x3f]
 
-v_ashrrev_i16 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x59]
+v_cmpx_class_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x22,0x7c]
 
-v_ashrrev_i16 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x58]
+v_cmpx_class_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x22,0x7c]
 
-v_ashrrev_i16 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x58]
+v_cmpx_class_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x23,0x7c]
 
-v_ashrrev_i16 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x58]
+v_cmp_class_f64 vcc, s[2:3], v2
+// CHECK: [0x02,0x04,0x24,0x7c]
 
-v_ashrrev_i16 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x58]
+v_cmp_class_f64 vcc, s[4:5], v2
+// CHECK: [0x04,0x04,0x24,0x7c]
 
-v_ashrrev_i16 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x58]
+v_cmp_class_f64 vcc, s[100:101], v2
+// CHECK: [0x64,0x04,0x24,0x7c]
 
-v_ashrrev_i16 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x58]
+v_cmp_class_f64 vcc, flat_scratch, v2
+// CHECK: [0x66,0x04,0x24,0x7c]
 
-v_ashrrev_i16 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x58]
+v_cmp_class_f64 vcc, vcc, v2
+// CHECK: [0x6a,0x04,0x24,0x7c]
 
-v_ashrrev_i16 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x58]
+v_cmp_class_f64 vcc, tba, v2
+// CHECK: [0x6c,0x04,0x24,0x7c]
 
-v_ashrrev_i16 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x58]
+v_cmp_class_f64 vcc, tma, v2
+// CHECK: [0x6e,0x04,0x24,0x7c]
 
-v_ashrrev_i16 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x58]
+v_cmp_class_f64 vcc, ttmp[10:11], v2
+// CHECK: [0x7a,0x04,0x24,0x7c]
 
-v_ashrrev_i16 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x58]
+v_cmp_class_f64 vcc, exec, v2
+// CHECK: [0x7e,0x04,0x24,0x7c]
 
-v_ashrrev_i16 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x58]
+v_cmp_class_f64 vcc, 0, v2
+// CHECK: [0x80,0x04,0x24,0x7c]
 
-v_ashrrev_i16 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x58]
+v_cmp_class_f64 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x24,0x7c]
 
-v_ashrrev_i16 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x58]
+v_cmp_class_f64 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x24,0x7c]
 
-v_ashrrev_i16 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x58]
+v_cmp_class_f64 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x24,0x7c]
 
-v_ashrrev_i16 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x58]
+v_cmp_class_f64 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x24,0x7c,0x56,0x34,0x12,0xaf]
 
-v_ashrrev_i16 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x58]
+v_cmp_class_f64 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x24,0x7c,0x73,0x72,0x71,0x3f]
 
-v_ashrrev_i16 v0, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x00,0x58,0x0b,0xfe,0x00,0x00]
+v_cmp_class_f64 vcc, v[1:2], v2
+// CHECK: [0x01,0x05,0x24,0x7c]
 
-v_ashrrev_i16 v0, 0x3456, v0
-// CHECK: [0xff,0x00,0x00,0x58,0x56,0x34,0x00,0x00]
+v_cmp_class_f64 vcc, v[254:255], v2
+// CHECK: [0xfe,0x05,0x24,0x7c]
 
-v_ashrrev_i16 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x58]
+v_cmp_class_f64 vcc, s[2:3], v255
+// CHECK: [0x02,0xfe,0x25,0x7c]
 
-v_ashrrev_i16 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x58]
+v_cmpx_class_f64 vcc, s[2:3], v2
+// CHECK: [0x02,0x04,0x26,0x7c]
 
-v_ashrrev_i16 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x58]
+v_cmpx_class_f64 vcc, s[4:5], v2
+// CHECK: [0x04,0x04,0x26,0x7c]
 
-v_ashrrev_i16_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x2c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_class_f64 vcc, s[100:101], v2
+// CHECK: [0x64,0x04,0x26,0x7c]
 
-v_ashrrev_i16_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x2c,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_class_f64 vcc, flat_scratch, v2
+// CHECK: [0x66,0x04,0x26,0x7c]
 
-v_ashrrev_i16_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x2c,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_class_f64 vcc, vcc, v2
+// CHECK: [0x6a,0x04,0x26,0x7c]
 
-v_ashrrev_i16_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x2c,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_class_f64 vcc, tba, v2
+// CHECK: [0x6c,0x04,0x26,0x7c]
 
-v_ashrrev_i16_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x2c,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_class_f64 vcc, tma, v2
+// CHECK: [0x6e,0x04,0x26,0x7c]
 
-v_ashrrev_i16_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x2c,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_class_f64 vcc, ttmp[10:11], v2
+// CHECK: [0x7a,0x04,0x26,0x7c]
 
-v_ashrrev_i16_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x2c,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_class_f64 vcc, exec, v2
+// CHECK: [0x7e,0x04,0x26,0x7c]
 
-v_ashrrev_i16_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x2c,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_class_f64 vcc, 0, v2
+// CHECK: [0x80,0x04,0x26,0x7c]
 
-v_ashrrev_i16_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x2c,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_class_f64 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x26,0x7c]
 
-v_ashrrev_i16_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x2c,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_class_f64 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x26,0x7c]
 
-v_ashrrev_i16_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x2c,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_class_f64 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x26,0x7c]
 
-v_ashrrev_i16_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x2c,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_class_f64 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x26,0x7c,0x56,0x34,0x12,0xaf]
 
-v_ashrrev_i16_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x2c,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_class_f64 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x26,0x7c,0x73,0x72,0x71,0x3f]
 
-v_ashrrev_i16_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x2c,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_class_f64 vcc, v[1:2], v2
+// CHECK: [0x01,0x05,0x26,0x7c]
 
-v_max_f16 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x5a]
+v_cmpx_class_f64 vcc, v[254:255], v2
+// CHECK: [0xfe,0x05,0x26,0x7c]
 
-v_max_f16 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x5b]
+v_cmpx_class_f64 vcc, s[2:3], v255
+// CHECK: [0x02,0xfe,0x27,0x7c]
 
-v_max_f16 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x5a]
+v_cmp_class_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x28,0x7c]
 
-v_max_f16 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x5a]
+v_cmp_class_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x28,0x7c]
 
-v_max_f16 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x5a]
+v_cmp_class_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x28,0x7c]
 
-v_max_f16 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x5a]
+v_cmp_class_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x28,0x7c]
 
-v_max_f16 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x5a]
+v_cmp_class_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x28,0x7c]
 
-v_max_f16 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x5a]
+v_cmp_class_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x28,0x7c]
 
-v_max_f16 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x5a]
+v_cmp_class_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x28,0x7c]
 
-v_max_f16 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x5a]
+v_cmp_class_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x28,0x7c]
 
-v_max_f16 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x5a]
+v_cmp_class_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x28,0x7c]
 
-v_max_f16 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x5a]
+v_cmp_class_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x28,0x7c]
 
-v_max_f16 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x5a]
+v_cmp_class_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x28,0x7c]
 
-v_max_f16 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x5a]
+v_cmp_class_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x28,0x7c]
 
-v_max_f16 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x5a]
+v_cmp_class_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x28,0x7c]
 
-v_max_f16 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x5a]
+v_cmp_class_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x28,0x7c]
 
-v_max_f16 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x5a]
+v_cmp_class_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x28,0x7c]
 
-v_max_f16 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x5a]
+v_cmp_class_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x28,0x7c]
 
-v_max_f16 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x5a]
+v_cmp_class_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x28,0x7c]
 
-v_max_f16 v0, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x00,0x5a,0x0b,0xfe,0x00,0x00]
+v_cmp_class_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x28,0x7c]
 
-v_max_f16 v0, 0x3456, v0
-// CHECK: [0xff,0x00,0x00,0x5a,0x56,0x34,0x00,0x00]
+v_cmp_class_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x28,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_max_f16 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x5a]
+v_cmp_class_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x28,0x7c,0x56,0x34,0x00,0x00]
 
-v_max_f16 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x5a]
+v_cmp_class_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x28,0x7c]
 
-v_max_f16 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x5a]
+v_cmp_class_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x28,0x7c]
 
-v_max_f16_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x2d,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_class_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x29,0x7c]
 
-v_max_f16_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x2d,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_class_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x2a,0x7c]
 
-v_max_f16_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x2d,0xd1,0xfd,0x00,0x00,0x00]
+v_cmpx_class_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x2a,0x7c]
 
-v_max_f16_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x2d,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_class_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x2a,0x7c]
 
-v_max_f16_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x2d,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_class_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x2a,0x7c]
 
-v_max_f16_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x2d,0xd1,0x00,0xfa,0x01,0x00]
+v_cmpx_class_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x2a,0x7c]
 
-v_max_f16_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x2d,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_class_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x2a,0x7c]
 
-v_max_f16_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x2d,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_class_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x2a,0x7c]
 
-v_max_f16_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x2d,0xd1,0x00,0x00,0x00,0x20]
+v_cmpx_class_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x2a,0x7c]
 
-v_max_f16_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x2d,0xd1,0x00,0x00,0x00,0x40]
+v_cmpx_class_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x2a,0x7c]
 
-v_max_f16_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x2d,0xd1,0x00,0x00,0x00,0x60]
+v_cmpx_class_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x2a,0x7c]
 
-v_max_f16_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x2d,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_class_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x2a,0x7c]
 
-v_max_f16_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x2d,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_class_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x2a,0x7c]
 
-v_max_f16_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x2d,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_class_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x2a,0x7c]
 
-v_max_f16_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x2d,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_class_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x2a,0x7c]
 
-v_min_f16 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x5c]
+v_cmpx_class_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x2a,0x7c]
 
-v_min_f16 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x5d]
+v_cmpx_class_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x2a,0x7c]
 
-v_min_f16 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x5c]
+v_cmpx_class_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x2a,0x7c]
 
-v_min_f16 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x5c]
+v_cmpx_class_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x2a,0x7c]
 
-v_min_f16 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x5c]
+v_cmpx_class_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x2a,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_min_f16 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x5c]
+v_cmpx_class_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x2a,0x7c,0x56,0x34,0x00,0x00]
 
-v_min_f16 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x5c]
+v_cmpx_class_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x2a,0x7c]
 
-v_min_f16 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x5c]
+v_cmpx_class_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x2a,0x7c]
 
-v_min_f16 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x5c]
+v_cmpx_class_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x2b,0x7c]
 
-v_min_f16 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x5c]
+v_cmp_f_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x40,0x7c]
 
-v_min_f16 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x5c]
+v_cmp_f_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x40,0x7c]
 
-v_min_f16 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x5c]
+v_cmp_f_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x40,0x7c]
 
-v_min_f16 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x5c]
+v_cmp_f_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x40,0x7c]
 
-v_min_f16 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x5c]
+v_cmp_f_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x40,0x7c]
 
-v_min_f16 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x5c]
+v_cmp_f_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x40,0x7c]
 
-v_min_f16 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x5c]
+v_cmp_f_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x40,0x7c]
 
-v_min_f16 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x5c]
+v_cmp_f_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x40,0x7c]
 
-v_min_f16 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x5c]
+v_cmp_f_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x40,0x7c]
 
-v_min_f16 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x5c]
+v_cmp_f_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x40,0x7c]
 
-v_min_f16 v0, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x00,0x5c,0x0b,0xfe,0x00,0x00]
+v_cmp_f_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x40,0x7c]
 
-v_min_f16 v0, 0x3456, v0
-// CHECK: [0xff,0x00,0x00,0x5c,0x56,0x34,0x00,0x00]
+v_cmp_f_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x40,0x7c]
 
-v_min_f16 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x5c]
+v_cmp_f_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x40,0x7c]
 
-v_min_f16 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x5c]
+v_cmp_f_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x40,0x7c]
 
-v_min_f16 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x5c]
+v_cmp_f_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x40,0x7c]
 
-v_min_f16_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x2e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x40,0x7c]
 
-v_min_f16_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x2e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x40,0x7c]
 
-v_min_f16_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x2e,0xd1,0xfd,0x00,0x00,0x00]
+v_cmp_f_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x40,0x7c]
 
-v_min_f16_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x2e,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_f_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x40,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_min_f16_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x2e,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_f_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x40,0x7c,0x56,0x34,0x00,0x00]
 
-v_min_f16_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x2e,0xd1,0x00,0xfa,0x01,0x00]
+v_cmp_f_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x40,0x7c]
 
-v_min_f16_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x2e,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_f_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x40,0x7c]
 
-v_min_f16_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x2e,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_f_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x41,0x7c]
 
-v_min_f16_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x2e,0xd1,0x00,0x00,0x00,0x20]
+v_cmp_f_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_f16_e64 v0, s0, -s0
-// CHECK: [0x00,0x00,0x2e,0xd1,0x00,0x00,0x00,0x40]
+v_cmp_f_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x20,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_f16_e64 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x2e,0xd1,0x00,0x00,0x00,0x60]
+v_cmp_f_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x20,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_f16_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x2e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x20,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_f16_e64 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x2e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x20,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_f16_e64 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x2e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x20,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_f16_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x2e,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_f_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x20,0xd0,0x80,0x04,0x00,0x00]
 
-v_max_u16 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x5e]
+v_cmp_f_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x20,0xd0,0x80,0x04,0x00,0x00]
 
-v_max_u16 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x5f]
+v_cmp_f_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x20,0xd0,0xf0,0x04,0x00,0x00]
 
-v_max_u16 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x5e]
+v_cmp_f_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x20,0xd0,0x01,0x05,0x00,0x00]
 
-v_max_u16 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x5e]
+v_cmp_f_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x20,0xd0,0xff,0x05,0x00,0x00]
 
-v_max_u16 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x5e]
+v_cmp_f_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xca,0x00,0x00]
 
-v_max_u16 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x5e]
+v_cmp_f_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xcc,0x00,0x00]
 
-v_max_u16 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x5e]
+v_cmp_f_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xce,0x00,0x00]
 
-v_max_u16 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x5e]
+v_cmp_f_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xd4,0x00,0x00]
 
-v_max_u16 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x5e]
+v_cmp_f_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xd6,0x00,0x00]
 
-v_max_u16 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x5e]
+v_cmp_f_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xd8,0x00,0x00]
 
-v_max_u16 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x5e]
+v_cmp_f_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xda,0x00,0x00]
 
-v_max_u16 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x5e]
+v_cmp_f_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xdc,0x00,0x00]
 
-v_max_u16 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x5e]
+v_cmp_f_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xde,0x00,0x00]
 
-v_max_u16 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x5e]
+v_cmp_f_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xf6,0x00,0x00]
 
-v_max_u16 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x5e]
+v_cmp_f_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xf8,0x00,0x00]
 
-v_max_u16 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x5e]
+v_cmp_f_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xfc,0x00,0x00]
 
-v_max_u16 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x5e]
+v_cmp_f_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xfe,0x00,0x00]
 
-v_max_u16 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x5e]
+v_cmp_f_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0x00,0x01,0x00]
 
-v_max_u16 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x5e]
+v_cmp_f_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xe0,0x01,0x00]
 
-v_max_u16 v0, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x00,0x5e,0x0b,0xfe,0x00,0x00]
+v_cmp_f_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xfa,0x01,0x00]
 
-v_max_u16 v0, 0x3456, v0
-// CHECK: [0xff,0x00,0x00,0x5e,0x56,0x34,0x00,0x00]
+v_cmp_f_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0x04,0x02,0x00]
 
-v_max_u16 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x5e]
+v_cmp_f_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0xfe,0x03,0x00]
 
-v_max_u16 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x5e]
+v_cmp_f_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x20,0xd0,0x80,0x04,0x00,0x40]
 
-v_max_u16 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x5e]
+v_cmp_f_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x20,0xd0,0x80,0x04,0x00,0x00]
 
-v_max_u16_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x2f,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x42,0x7c]
 
-v_max_u16_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x2f,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x42,0x7c]
 
-v_max_u16_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x2f,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_lt_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x42,0x7c]
 
-v_max_u16_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x2f,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_lt_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x42,0x7c]
 
-v_max_u16_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x2f,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_lt_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x42,0x7c]
 
-v_max_u16_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x2f,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_lt_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x42,0x7c]
 
-v_max_u16_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x2f,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_lt_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x42,0x7c]
 
-v_max_u16_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x2f,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_lt_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x42,0x7c]
 
-v_max_u16_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x2f,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_lt_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x42,0x7c]
 
-v_max_u16_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x2f,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_lt_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x42,0x7c]
 
-v_max_u16_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x2f,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_lt_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x42,0x7c]
 
-v_max_u16_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x2f,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_lt_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x42,0x7c]
 
-v_max_u16_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x2f,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_lt_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x42,0x7c]
 
-v_max_u16_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x2f,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_lt_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x42,0x7c]
 
-v_max_i16 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x60]
+v_cmp_lt_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x42,0x7c]
 
-v_max_i16 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x61]
+v_cmp_lt_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x42,0x7c]
 
-v_max_i16 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x60]
+v_cmp_lt_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x42,0x7c]
 
-v_max_i16 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x60]
+v_cmp_lt_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x42,0x7c]
 
-v_max_i16 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x60]
+v_cmp_lt_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x42,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_max_i16 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x60]
+v_cmp_lt_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x42,0x7c,0x56,0x34,0x00,0x00]
 
-v_max_i16 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x60]
+v_cmp_lt_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x42,0x7c]
 
-v_max_i16 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x60]
+v_cmp_lt_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x42,0x7c]
 
-v_max_i16 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x60]
+v_cmp_lt_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x43,0x7c]
 
-v_max_i16 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x60]
+v_cmp_lt_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0x04,0x00,0x00]
 
-v_max_i16 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x60]
+v_cmp_lt_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x21,0xd0,0x80,0x04,0x00,0x00]
 
-v_max_i16 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x60]
+v_cmp_lt_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x21,0xd0,0x80,0x04,0x00,0x00]
 
-v_max_i16 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x60]
+v_cmp_lt_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x21,0xd0,0x80,0x04,0x00,0x00]
 
-v_max_i16 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x60]
+v_cmp_lt_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x21,0xd0,0x80,0x04,0x00,0x00]
 
-v_max_i16 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x60]
+v_cmp_lt_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x21,0xd0,0x80,0x04,0x00,0x00]
 
-v_max_i16 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x60]
+v_cmp_lt_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x21,0xd0,0x80,0x04,0x00,0x00]
 
-v_max_i16 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x60]
+v_cmp_lt_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x21,0xd0,0x80,0x04,0x00,0x00]
 
-v_max_i16 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x60]
+v_cmp_lt_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x21,0xd0,0xf0,0x04,0x00,0x00]
 
-v_max_i16 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x60]
+v_cmp_lt_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x21,0xd0,0x01,0x05,0x00,0x00]
 
-v_max_i16 v0, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x00,0x60,0x0b,0xfe,0x00,0x00]
+v_cmp_lt_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x21,0xd0,0xff,0x05,0x00,0x00]
 
-v_max_i16 v0, 0x3456, v0
-// CHECK: [0xff,0x00,0x00,0x60,0x56,0x34,0x00,0x00]
+v_cmp_lt_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0xca,0x00,0x00]
 
-v_max_i16 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x60]
+v_cmp_lt_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0xcc,0x00,0x00]
 
-v_max_i16 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x60]
+v_cmp_lt_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0xce,0x00,0x00]
 
-v_max_i16 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x60]
+v_cmp_lt_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0xd4,0x00,0x00]
 
-v_max_i16_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x30,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0xd6,0x00,0x00]
 
-v_max_i16_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x30,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lt_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0xd8,0x00,0x00]
 
-v_max_i16_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x30,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_lt_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0xda,0x00,0x00]
 
-v_max_i16_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x30,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_lt_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0xdc,0x00,0x00]
 
-v_max_i16_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x30,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_lt_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0xde,0x00,0x00]
 
-v_max_i16_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x30,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_lt_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0xf6,0x00,0x00]
 
-v_max_i16_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x30,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_lt_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0xf8,0x00,0x00]
 
-v_max_i16_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x30,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_lt_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0xfc,0x00,0x00]
 
-v_max_i16_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x30,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_lt_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0xfe,0x00,0x00]
 
-v_max_i16_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x30,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_lt_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0x00,0x01,0x00]
 
-v_max_i16_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x30,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_lt_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0xe0,0x01,0x00]
 
-v_max_i16_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x30,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_lt_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0xfa,0x01,0x00]
 
-v_max_i16_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x30,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_lt_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0x04,0x02,0x00]
 
-v_max_i16_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x30,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_lt_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0xfe,0x03,0x00]
 
-v_min_u16 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x62]
+v_cmp_lt_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x21,0xd0,0x80,0x04,0x00,0x40]
 
-v_min_u16 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x63]
+v_cmp_lt_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x21,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_u16 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x62]
+v_cmp_eq_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x44,0x7c]
 
-v_min_u16 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x62]
+v_cmp_eq_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x44,0x7c]
 
-v_min_u16 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x62]
+v_cmp_eq_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x44,0x7c]
 
-v_min_u16 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x62]
+v_cmp_eq_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x44,0x7c]
 
-v_min_u16 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x62]
+v_cmp_eq_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x44,0x7c]
 
-v_min_u16 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x62]
+v_cmp_eq_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x44,0x7c]
 
-v_min_u16 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x62]
+v_cmp_eq_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x44,0x7c]
 
-v_min_u16 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x62]
+v_cmp_eq_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x44,0x7c]
 
-v_min_u16 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x62]
+v_cmp_eq_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x44,0x7c]
 
-v_min_u16 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x62]
+v_cmp_eq_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x44,0x7c]
 
-v_min_u16 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x62]
+v_cmp_eq_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x44,0x7c]
 
-v_min_u16 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x62]
+v_cmp_eq_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x44,0x7c]
 
-v_min_u16 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x62]
+v_cmp_eq_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x44,0x7c]
 
-v_min_u16 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x62]
+v_cmp_eq_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x44,0x7c]
 
-v_min_u16 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x62]
+v_cmp_eq_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x44,0x7c]
 
-v_min_u16 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x62]
+v_cmp_eq_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x44,0x7c]
 
-v_min_u16 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x62]
+v_cmp_eq_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x44,0x7c]
 
-v_min_u16 v0, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x00,0x62,0x0b,0xfe,0x00,0x00]
+v_cmp_eq_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x44,0x7c]
 
-v_min_u16 v0, 0x3456, v0
-// CHECK: [0xff,0x00,0x00,0x62,0x56,0x34,0x00,0x00]
+v_cmp_eq_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x44,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_min_u16 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x62]
+v_cmp_eq_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x44,0x7c,0x56,0x34,0x00,0x00]
 
-v_min_u16 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x62]
+v_cmp_eq_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x44,0x7c]
 
-v_min_u16 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x62]
+v_cmp_eq_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x44,0x7c]
 
-v_min_u16_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x31,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x45,0x7c]
 
-v_min_u16_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x31,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_eq_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_u16_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x31,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_eq_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x22,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_u16_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x31,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_eq_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x22,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_u16_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x31,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_eq_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x22,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_u16_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x31,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_eq_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x22,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_u16_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x31,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_eq_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x22,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_u16_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x31,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_eq_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x22,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_u16_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x31,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_eq_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x22,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_u16_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x31,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_eq_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x22,0xd0,0xf0,0x04,0x00,0x00]
 
-v_min_u16_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x31,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_eq_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x22,0xd0,0x01,0x05,0x00,0x00]
 
-v_min_u16_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x31,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_eq_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x22,0xd0,0xff,0x05,0x00,0x00]
 
-v_min_u16_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x31,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_eq_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xca,0x00,0x00]
 
-v_min_u16_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x31,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_eq_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xcc,0x00,0x00]
 
-v_min_i16 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x64]
+v_cmp_eq_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xce,0x00,0x00]
 
-v_min_i16 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x65]
+v_cmp_eq_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xd4,0x00,0x00]
 
-v_min_i16 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x64]
+v_cmp_eq_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xd6,0x00,0x00]
 
-v_min_i16 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x64]
+v_cmp_eq_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xd8,0x00,0x00]
 
-v_min_i16 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x64]
+v_cmp_eq_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xda,0x00,0x00]
 
-v_min_i16 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x64]
+v_cmp_eq_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xdc,0x00,0x00]
 
-v_min_i16 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x64]
+v_cmp_eq_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xde,0x00,0x00]
 
-v_min_i16 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x64]
+v_cmp_eq_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xf6,0x00,0x00]
 
-v_min_i16 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x64]
+v_cmp_eq_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xf8,0x00,0x00]
 
-v_min_i16 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x64]
+v_cmp_eq_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xfc,0x00,0x00]
 
-v_min_i16 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x64]
+v_cmp_eq_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xfe,0x00,0x00]
 
-v_min_i16 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x64]
+v_cmp_eq_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0x00,0x01,0x00]
 
-v_min_i16 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x64]
+v_cmp_eq_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xe0,0x01,0x00]
 
-v_min_i16 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x64]
+v_cmp_eq_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xfa,0x01,0x00]
 
-v_min_i16 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x64]
+v_cmp_eq_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0x04,0x02,0x00]
 
-v_min_i16 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x64]
+v_cmp_eq_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0xfe,0x03,0x00]
 
-v_min_i16 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x64]
+v_cmp_eq_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x22,0xd0,0x80,0x04,0x00,0x40]
 
-v_min_i16 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x64]
+v_cmp_eq_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x22,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_i16 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x64]
+v_cmp_le_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x46,0x7c]
 
-v_min_i16 v0, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x00,0x64,0x0b,0xfe,0x00,0x00]
+v_cmp_le_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x46,0x7c]
 
-v_min_i16 v0, 0x3456, v0
-// CHECK: [0xff,0x00,0x00,0x64,0x56,0x34,0x00,0x00]
+v_cmp_le_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x46,0x7c]
 
-v_min_i16 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x64]
+v_cmp_le_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x46,0x7c]
 
-v_min_i16 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x64]
+v_cmp_le_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x46,0x7c]
 
-v_min_i16 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x64]
+v_cmp_le_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x46,0x7c]
 
-v_min_i16_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x32,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x46,0x7c]
 
-v_min_i16_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x32,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x46,0x7c]
 
-v_min_i16_e64 v0, 0, s0
-// CHECK: [0x00,0x00,0x32,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_le_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x46,0x7c]
 
-v_min_i16_e64 v0, -1, s0
-// CHECK: [0x00,0x00,0x32,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_le_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x46,0x7c]
 
-v_min_i16_e64 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x32,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_le_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x46,0x7c]
 
-v_min_i16_e64 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x32,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_le_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x46,0x7c]
 
-v_min_i16_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x32,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_le_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x46,0x7c]
 
-v_min_i16_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x32,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_le_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x46,0x7c]
 
-v_min_i16_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x32,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_le_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x46,0x7c]
 
-v_min_i16_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x32,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_le_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x46,0x7c]
 
-v_min_i16_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x32,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_le_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x46,0x7c]
 
-v_min_i16_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x32,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_le_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x46,0x7c]
 
-v_min_i16_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x32,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_le_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x46,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_min_i16_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x32,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_le_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x46,0x7c,0x56,0x34,0x00,0x00]
 
-v_ldexp_f16 v0, s0, v0
-// CHECK: [0x00,0x00,0x00,0x66]
+v_cmp_le_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x46,0x7c]
 
-v_ldexp_f16 v255, s0, v0
-// CHECK: [0x00,0x00,0xfe,0x67]
+v_cmp_le_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x46,0x7c]
 
-v_ldexp_f16 v0, s101, v0
-// CHECK: [0x65,0x00,0x00,0x66]
+v_cmp_le_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x47,0x7c]
 
-v_ldexp_f16 v0, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x00,0x66]
+v_cmp_le_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0x04,0x00,0x00]
 
-v_ldexp_f16 v0, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x00,0x66]
+v_cmp_le_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x23,0xd0,0x80,0x04,0x00,0x00]
 
-v_ldexp_f16 v0, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x00,0x66]
+v_cmp_le_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x23,0xd0,0x80,0x04,0x00,0x00]
 
-v_ldexp_f16 v0, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x00,0x66]
+v_cmp_le_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x23,0xd0,0x80,0x04,0x00,0x00]
 
-v_ldexp_f16 v0, tba_lo, v0
-// CHECK: [0x6c,0x00,0x00,0x66]
+v_cmp_le_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x23,0xd0,0x80,0x04,0x00,0x00]
 
-v_ldexp_f16 v0, tba_hi, v0
-// CHECK: [0x6d,0x00,0x00,0x66]
+v_cmp_le_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x23,0xd0,0x80,0x04,0x00,0x00]
 
-v_ldexp_f16 v0, tma_lo, v0
-// CHECK: [0x6e,0x00,0x00,0x66]
+v_cmp_le_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x23,0xd0,0x80,0x04,0x00,0x00]
 
-v_ldexp_f16 v0, tma_hi, v0
-// CHECK: [0x6f,0x00,0x00,0x66]
+v_cmp_le_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x23,0xd0,0x80,0x04,0x00,0x00]
 
-v_ldexp_f16 v0, ttmp11, v0
-// CHECK: [0x7b,0x00,0x00,0x66]
+v_cmp_le_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x23,0xd0,0xf0,0x04,0x00,0x00]
 
-v_ldexp_f16 v0, m0, v0
-// CHECK: [0x7c,0x00,0x00,0x66]
+v_cmp_le_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x23,0xd0,0x01,0x05,0x00,0x00]
 
-v_ldexp_f16 v0, exec_lo, v0
-// CHECK: [0x7e,0x00,0x00,0x66]
+v_cmp_le_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x23,0xd0,0xff,0x05,0x00,0x00]
 
-v_ldexp_f16 v0, exec_hi, v0
-// CHECK: [0x7f,0x00,0x00,0x66]
+v_cmp_le_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0xca,0x00,0x00]
 
-v_ldexp_f16 v0, 0, v0
-// CHECK: [0x80,0x00,0x00,0x66]
+v_cmp_le_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0xcc,0x00,0x00]
 
-v_ldexp_f16 v0, -1, v0
-// CHECK: [0xc1,0x00,0x00,0x66]
+v_cmp_le_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0xce,0x00,0x00]
 
-v_ldexp_f16 v0, 0.5, v0
-// CHECK: [0xf0,0x00,0x00,0x66]
+v_cmp_le_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0xd4,0x00,0x00]
 
-v_ldexp_f16 v0, -4.0, v0
-// CHECK: [0xf7,0x00,0x00,0x66]
+v_cmp_le_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0xd6,0x00,0x00]
 
-v_ldexp_f16 v0, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x00,0x66,0x0b,0xfe,0x00,0x00]
+v_cmp_le_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0xd8,0x00,0x00]
 
-v_ldexp_f16 v0, 0x3456, v0
-// CHECK: [0xff,0x00,0x00,0x66,0x56,0x34,0x00,0x00]
+v_cmp_le_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0xda,0x00,0x00]
 
-v_ldexp_f16 v0, v0, v0
-// CHECK: [0x00,0x01,0x00,0x66]
+v_cmp_le_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0xdc,0x00,0x00]
 
-v_ldexp_f16 v0, v255, v0
-// CHECK: [0xff,0x01,0x00,0x66]
+v_cmp_le_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0xde,0x00,0x00]
 
-v_ldexp_f16 v0, s0, v255
-// CHECK: [0x00,0xfe,0x01,0x66]
+v_cmp_le_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0xf6,0x00,0x00]
 
-v_ldexp_f16_e64 v0, s0, s0
-// CHECK: [0x00,0x00,0x33,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0xf8,0x00,0x00]
 
-v_ldexp_f16_e64 v255, s0, s0
-// CHECK: [0xff,0x00,0x33,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_le_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0xfc,0x00,0x00]
 
-v_ldexp_f16_e64 v0, scc, s0
-// CHECK: [0x00,0x00,0x33,0xd1,0xfd,0x00,0x00,0x00]
+v_cmp_le_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0xfe,0x00,0x00]
 
-v_ldexp_f16_e64 v0, v0, s0
-// CHECK: [0x00,0x00,0x33,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_le_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0x00,0x01,0x00]
 
-v_ldexp_f16_e64 v0, v255, s0
-// CHECK: [0x00,0x00,0x33,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_le_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0xe0,0x01,0x00]
 
-v_ldexp_f16_e64 v0, s0, 0
-// CHECK: [0x00,0x00,0x33,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_le_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0xfa,0x01,0x00]
 
-v_ldexp_f16_e64 v0, s0, -1
-// CHECK: [0x00,0x00,0x33,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_le_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0x04,0x02,0x00]
 
-v_ldexp_f16_e64 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x33,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_le_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0xfe,0x03,0x00]
 
-v_ldexp_f16_e64 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x33,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_le_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x23,0xd0,0x80,0x04,0x00,0x40]
 
-v_ldexp_f16_e64 v0, s0, scc
-// CHECK: [0x00,0x00,0x33,0xd1,0x00,0xfa,0x01,0x00]
+v_cmp_le_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x23,0xd0,0x80,0x04,0x00,0x00]
 
-v_ldexp_f16_e64 v0, s0, v0
-// CHECK: [0x00,0x00,0x33,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_gt_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x48,0x7c]
 
-v_ldexp_f16_e64 v0, s0, v255
-// CHECK: [0x00,0x00,0x33,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_gt_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x48,0x7c]
 
-v_ldexp_f16_e64 v0, -s0, s0
-// CHECK: [0x00,0x00,0x33,0xd1,0x00,0x00,0x00,0x20]
+v_cmp_gt_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x48,0x7c]
 
-v_ldexp_f16_e64 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x33,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x48,0x7c]
 
-v_ldexp_f16_e64 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x33,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x48,0x7c]
 
-v_mad_legacy_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x48,0x7c]
 
-v_mad_legacy_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xc0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x48,0x7c]
 
-v_mad_legacy_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xc0,0xd1,0xfd,0x00,0x00,0x00]
+v_cmp_gt_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x48,0x7c]
 
-v_mad_legacy_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_gt_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x48,0x7c]
 
-v_mad_legacy_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xc0,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_gt_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x48,0x7c]
 
-v_mad_legacy_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0xfa,0x01,0x00]
+v_cmp_gt_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x48,0x7c]
 
-v_mad_legacy_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_gt_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x48,0x7c]
 
-v_mad_legacy_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_gt_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x48,0x7c]
 
-v_mad_legacy_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0x00,0xf4,0x03]
+v_cmp_gt_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x48,0x7c]
 
-v_mad_legacy_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_gt_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x48,0x7c]
 
-v_mad_legacy_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_gt_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x48,0x7c]
 
-v_mad_legacy_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0x00,0x00,0x20]
+v_cmp_gt_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x48,0x7c]
 
-v_mad_legacy_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0x00,0x00,0x40]
+v_cmp_gt_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x48,0x7c]
 
-v_mad_legacy_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0x00,0x00,0x80]
+v_cmp_gt_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x48,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_mad_legacy_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0x00,0x00,0xe0]
+v_cmp_gt_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x48,0x7c,0x56,0x34,0x00,0x00]
 
-v_mad_legacy_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xc0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x48,0x7c]
 
-v_mad_legacy_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xc0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x48,0x7c]
 
-v_mad_legacy_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xc0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x49,0x7c]
 
-v_mad_legacy_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xc0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_legacy_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x80,0xc0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x24,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_legacy_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0x00,0x00,0x08]
+v_cmp_gt_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x24,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_legacy_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0x00,0x00,0x10]
+v_cmp_gt_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x24,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_legacy_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0xc0,0xd1,0x00,0x00,0x00,0x18]
+v_cmp_gt_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x24,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xc1,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x24,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xc1,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x24,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xc1,0xd1,0xfd,0x00,0x00,0x00]
+v_cmp_gt_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x24,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xc1,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_gt_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x24,0xd0,0xf0,0x04,0x00,0x00]
 
-v_mad_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xc1,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_gt_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x24,0xd0,0x01,0x05,0x00,0x00]
 
-v_mad_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xc1,0xd1,0x00,0xfa,0x01,0x00]
+v_cmp_gt_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x24,0xd0,0xff,0x05,0x00,0x00]
 
-v_mad_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xc1,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_gt_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xca,0x00,0x00]
 
-v_mad_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xc1,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_gt_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xcc,0x00,0x00]
 
-v_mad_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xc1,0xd1,0x00,0x00,0xf4,0x03]
+v_cmp_gt_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xce,0x00,0x00]
 
-v_mad_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xc1,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_gt_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xd4,0x00,0x00]
 
-v_mad_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xc1,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_gt_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xd6,0x00,0x00]
 
-v_mad_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xc1,0xd1,0x00,0x00,0x00,0x20]
+v_cmp_gt_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xd8,0x00,0x00]
 
-v_mad_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xc1,0xd1,0x00,0x00,0x00,0x40]
+v_cmp_gt_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xda,0x00,0x00]
 
-v_mad_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xc1,0xd1,0x00,0x00,0x00,0x80]
+v_cmp_gt_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xdc,0x00,0x00]
 
-v_mad_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xc1,0xd1,0x00,0x00,0x00,0xe0]
+v_cmp_gt_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xde,0x00,0x00]
 
-v_mad_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xc1,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xf6,0x00,0x00]
 
-v_mad_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xc1,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xf8,0x00,0x00]
 
-v_mad_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xc1,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xfc,0x00,0x00]
 
-v_mad_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xc1,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xfe,0x00,0x00]
 
-v_mad_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x80,0xc1,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0x00,0x01,0x00]
 
-v_mad_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0xc1,0xd1,0x00,0x00,0x00,0x08]
+v_cmp_gt_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xe0,0x01,0x00]
 
-v_mad_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0xc1,0xd1,0x00,0x00,0x00,0x10]
+v_cmp_gt_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xfa,0x01,0x00]
 
-v_mad_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0xc1,0xd1,0x00,0x00,0x00,0x18]
+v_cmp_gt_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0x04,0x02,0x00]
 
-v_mad_i32_i24 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0xfe,0x03,0x00]
 
-v_mad_i32_i24 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xc2,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_gt_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x24,0xd0,0x80,0x04,0x00,0x40]
 
-v_mad_i32_i24 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xc2,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_gt_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x24,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_i32_i24 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xc2,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_lg_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x4a,0x7c]
 
-v_mad_i32_i24 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xc2,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_lg_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x4a,0x7c]
 
-v_mad_i32_i24 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xc2,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_lg_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x4a,0x7c]
 
-v_mad_i32_i24 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_lg_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x4a,0x7c]
 
-v_mad_i32_i24 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xc2,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_lg_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x4a,0x7c]
 
-v_mad_i32_i24 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_lg_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x4a,0x7c]
 
-v_mad_i32_i24 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_lg_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x4a,0x7c]
 
-v_mad_i32_i24 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_lg_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x4a,0x7c]
 
-v_mad_i32_i24 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_lg_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x4a,0x7c]
 
-v_mad_i32_i24 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_lg_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x4a,0x7c]
 
-v_mad_i32_i24 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_lg_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x4a,0x7c]
 
-v_mad_i32_i24 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0x00,0x00,0x02]
+v_cmp_lg_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x4a,0x7c]
 
-v_mad_i32_i24 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0x00,0x04,0x03]
+v_cmp_lg_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x4a,0x7c]
 
-v_mad_i32_i24 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0x00,0xc0,0x03]
+v_cmp_lg_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x4a,0x7c]
 
-v_mad_i32_i24 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0x00,0xdc,0x03]
+v_cmp_lg_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x4a,0x7c]
 
-v_mad_i32_i24 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_lg_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x4a,0x7c]
 
-v_mad_i32_i24 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xc2,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_lg_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x4a,0x7c]
 
-v_mad_u32_u24 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xc3,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lg_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x4a,0x7c]
 
-v_mad_u32_u24 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xc3,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lg_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x4a,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_mad_u32_u24 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xc3,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_lg_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x4a,0x7c,0x56,0x34,0x00,0x00]
 
-v_mad_u32_u24 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xc3,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_lg_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x4a,0x7c]
 
-v_mad_u32_u24 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xc3,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_lg_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x4a,0x7c]
 
-v_mad_u32_u24 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xc3,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_lg_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x4b,0x7c]
 
-v_mad_u32_u24 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xc3,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_lg_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_u32_u24 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xc3,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_lg_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x25,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_u32_u24 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xc3,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_lg_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x25,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_u32_u24 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xc3,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_lg_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x25,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_u32_u24 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xc3,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_lg_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x25,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_u32_u24 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xc3,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_lg_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x25,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_u32_u24 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xc3,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_lg_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x25,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_u32_u24 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xc3,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_lg_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x25,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_u32_u24 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xc3,0xd1,0x00,0x00,0x00,0x02]
+v_cmp_lg_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x25,0xd0,0xf0,0x04,0x00,0x00]
 
-v_mad_u32_u24 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xc3,0xd1,0x00,0x00,0x04,0x03]
+v_cmp_lg_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x25,0xd0,0x01,0x05,0x00,0x00]
 
-v_mad_u32_u24 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xc3,0xd1,0x00,0x00,0xc0,0x03]
+v_cmp_lg_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x25,0xd0,0xff,0x05,0x00,0x00]
 
-v_mad_u32_u24 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xc3,0xd1,0x00,0x00,0xdc,0x03]
+v_cmp_lg_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0xca,0x00,0x00]
 
-v_mad_u32_u24 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xc3,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_lg_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0xcc,0x00,0x00]
 
-v_mad_u32_u24 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xc3,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_lg_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0xce,0x00,0x00]
 
-v_cubeid_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lg_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cubeid_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xc4,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lg_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cubeid_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xc4,0xd1,0xfd,0x00,0x00,0x00]
+v_cmp_lg_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cubeid_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_lg_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0xda,0x00,0x00]
 
-v_cubeid_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xc4,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_lg_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cubeid_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0xfa,0x01,0x00]
+v_cmp_lg_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0xde,0x00,0x00]
 
-v_cubeid_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_lg_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cubeid_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_lg_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cubeid_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x00,0xf4,0x03]
+v_cmp_lg_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cubeid_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_lg_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cubeid_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_lg_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0x00,0x01,0x00]
 
-v_cubeid_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x00,0x00,0x20]
+v_cmp_lg_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cubeid_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x00,0x00,0x40]
+v_cmp_lg_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cubeid_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x00,0x00,0x80]
+v_cmp_lg_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0x04,0x02,0x00]
 
-v_cubeid_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x00,0x00,0xe0]
+v_cmp_lg_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cubeid_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xc4,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lg_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x25,0xd0,0x80,0x04,0x00,0x40]
 
-v_cubeid_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xc4,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_lg_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x25,0xd0,0x80,0x04,0x00,0x00]
 
-v_cubeid_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xc4,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x4c,0x7c]
 
-v_cubeid_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xc4,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x4c,0x7c]
 
-v_cubeid_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x80,0xc4,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x4c,0x7c]
 
-v_cubeid_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x00,0x00,0x08]
+v_cmp_ge_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x4c,0x7c]
 
-v_cubeid_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x00,0x00,0x10]
+v_cmp_ge_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x4c,0x7c]
 
-v_cubeid_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x00,0x00,0x18]
+v_cmp_ge_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x4c,0x7c]
 
-v_cubesc_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xc5,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x4c,0x7c]
 
-v_cubesc_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xc5,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x4c,0x7c]
 
-v_cubesc_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xc5,0xd1,0xfd,0x00,0x00,0x00]
+v_cmp_ge_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x4c,0x7c]
 
-v_cubesc_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xc5,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_ge_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x4c,0x7c]
 
-v_cubesc_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xc5,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_ge_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x4c,0x7c]
 
-v_cubesc_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xc5,0xd1,0x00,0xfa,0x01,0x00]
+v_cmp_ge_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x4c,0x7c]
 
-v_cubesc_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xc5,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_ge_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x4c,0x7c]
 
-v_cubesc_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xc5,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_ge_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x4c,0x7c]
 
-v_cubesc_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xc5,0xd1,0x00,0x00,0xf4,0x03]
+v_cmp_ge_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x4c,0x7c]
 
-v_cubesc_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xc5,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_ge_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x4c,0x7c]
 
-v_cubesc_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xc5,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_ge_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x4c,0x7c]
 
-v_cubesc_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xc5,0xd1,0x00,0x00,0x00,0x20]
+v_cmp_ge_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x4c,0x7c]
 
-v_cubesc_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xc5,0xd1,0x00,0x00,0x00,0x40]
+v_cmp_ge_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x4c,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_cubesc_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xc5,0xd1,0x00,0x00,0x00,0x80]
+v_cmp_ge_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x4c,0x7c,0x56,0x34,0x00,0x00]
 
-v_cubesc_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xc5,0xd1,0x00,0x00,0x00,0xe0]
+v_cmp_ge_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x4c,0x7c]
 
-v_cubesc_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xc5,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x4c,0x7c]
 
-v_cubesc_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xc5,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x4d,0x7c]
 
-v_cubesc_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xc5,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0x04,0x00,0x00]
 
-v_cubesc_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xc5,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x26,0xd0,0x80,0x04,0x00,0x00]
 
-v_cubesc_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x80,0xc5,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x26,0xd0,0x80,0x04,0x00,0x00]
 
-v_cubesc_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0xc5,0xd1,0x00,0x00,0x00,0x08]
+v_cmp_ge_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x26,0xd0,0x80,0x04,0x00,0x00]
 
-v_cubesc_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0xc5,0xd1,0x00,0x00,0x00,0x10]
+v_cmp_ge_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x26,0xd0,0x80,0x04,0x00,0x00]
 
-v_cubesc_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0xc5,0xd1,0x00,0x00,0x00,0x18]
+v_cmp_ge_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x26,0xd0,0x80,0x04,0x00,0x00]
 
-v_cubetc_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x26,0xd0,0x80,0x04,0x00,0x00]
 
-v_cubetc_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xc6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x26,0xd0,0x80,0x04,0x00,0x00]
 
-v_cubetc_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xc6,0xd1,0xfd,0x00,0x00,0x00]
+v_cmp_ge_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x26,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cubetc_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_ge_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x26,0xd0,0x01,0x05,0x00,0x00]
 
-v_cubetc_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xc6,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_ge_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x26,0xd0,0xff,0x05,0x00,0x00]
 
-v_cubetc_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0xfa,0x01,0x00]
+v_cmp_ge_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xca,0x00,0x00]
 
-v_cubetc_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_ge_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cubetc_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_ge_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xce,0x00,0x00]
 
-v_cubetc_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0x00,0xf4,0x03]
+v_cmp_ge_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cubetc_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_ge_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cubetc_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_ge_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cubetc_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0x00,0x00,0x20]
+v_cmp_ge_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xda,0x00,0x00]
 
-v_cubetc_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0x00,0x00,0x40]
+v_cmp_ge_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cubetc_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0x00,0x00,0x80]
+v_cmp_ge_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xde,0x00,0x00]
 
-v_cubetc_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0x00,0x00,0xe0]
+v_cmp_ge_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cubetc_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xc6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cubetc_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xc6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cubetc_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xc6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cubetc_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xc6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0x00,0x01,0x00]
 
-v_cubetc_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x80,0xc6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cubetc_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0x00,0x00,0x08]
+v_cmp_ge_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cubetc_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0x00,0x00,0x10]
+v_cmp_ge_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0x04,0x02,0x00]
 
-v_cubetc_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0xc6,0xd1,0x00,0x00,0x00,0x18]
+v_cmp_ge_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cubema_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xc7,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x26,0xd0,0x80,0x04,0x00,0x40]
 
-v_cubema_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xc7,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ge_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x26,0xd0,0x80,0x04,0x00,0x00]
 
-v_cubema_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xc7,0xd1,0xfd,0x00,0x00,0x00]
+v_cmp_o_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x4e,0x7c]
 
-v_cubema_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xc7,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_o_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x4e,0x7c]
 
-v_cubema_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xc7,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_o_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x4e,0x7c]
 
-v_cubema_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xc7,0xd1,0x00,0xfa,0x01,0x00]
+v_cmp_o_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x4e,0x7c]
 
-v_cubema_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xc7,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_o_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x4e,0x7c]
 
-v_cubema_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xc7,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_o_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x4e,0x7c]
 
-v_cubema_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xc7,0xd1,0x00,0x00,0xf4,0x03]
+v_cmp_o_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x4e,0x7c]
 
-v_cubema_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xc7,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_o_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x4e,0x7c]
 
-v_cubema_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xc7,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_o_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x4e,0x7c]
 
-v_cubema_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xc7,0xd1,0x00,0x00,0x00,0x20]
+v_cmp_o_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x4e,0x7c]
 
-v_cubema_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xc7,0xd1,0x00,0x00,0x00,0x40]
+v_cmp_o_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x4e,0x7c]
 
-v_cubema_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xc7,0xd1,0x00,0x00,0x00,0x80]
+v_cmp_o_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x4e,0x7c]
 
-v_cubema_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xc7,0xd1,0x00,0x00,0x00,0xe0]
+v_cmp_o_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x4e,0x7c]
 
-v_cubema_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xc7,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_o_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x4e,0x7c]
 
-v_cubema_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xc7,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_o_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x4e,0x7c]
 
-v_cubema_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xc7,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_o_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x4e,0x7c]
 
-v_cubema_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xc7,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_o_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x4e,0x7c]
 
-v_cubema_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x80,0xc7,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_o_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x4e,0x7c]
 
-v_cubema_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0xc7,0xd1,0x00,0x00,0x00,0x08]
+v_cmp_o_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x4e,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_cubema_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0xc7,0xd1,0x00,0x00,0x00,0x10]
+v_cmp_o_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x4e,0x7c,0x56,0x34,0x00,0x00]
 
-v_cubema_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0xc7,0xd1,0x00,0x00,0x00,0x18]
+v_cmp_o_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x4e,0x7c]
 
-v_bfe_u32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_o_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x4e,0x7c]
 
-v_bfe_u32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xc8,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_o_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x4f,0x7c]
 
-v_bfe_u32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xc8,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_o_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0x04,0x00,0x00]
 
-v_bfe_u32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xc8,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_o_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x27,0xd0,0x80,0x04,0x00,0x00]
 
-v_bfe_u32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xc8,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_o_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x27,0xd0,0x80,0x04,0x00,0x00]
 
-v_bfe_u32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xc8,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_o_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x27,0xd0,0x80,0x04,0x00,0x00]
 
-v_bfe_u32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_o_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x27,0xd0,0x80,0x04,0x00,0x00]
 
-v_bfe_u32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xc8,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_o_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x27,0xd0,0x80,0x04,0x00,0x00]
 
-v_bfe_u32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_o_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x27,0xd0,0x80,0x04,0x00,0x00]
 
-v_bfe_u32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_o_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x27,0xd0,0x80,0x04,0x00,0x00]
 
-v_bfe_u32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_o_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x27,0xd0,0xf0,0x04,0x00,0x00]
 
-v_bfe_u32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_o_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x27,0xd0,0x01,0x05,0x00,0x00]
 
-v_bfe_u32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_o_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x27,0xd0,0xff,0x05,0x00,0x00]
 
-v_bfe_u32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_o_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0xca,0x00,0x00]
 
-v_bfe_u32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0x00,0x00,0x02]
+v_cmp_o_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0xcc,0x00,0x00]
 
-v_bfe_u32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0x00,0x04,0x03]
+v_cmp_o_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0xce,0x00,0x00]
 
-v_bfe_u32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0x00,0xc0,0x03]
+v_cmp_o_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0xd4,0x00,0x00]
 
-v_bfe_u32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0x00,0xdc,0x03]
+v_cmp_o_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0xd6,0x00,0x00]
 
-v_bfe_u32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_o_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0xd8,0x00,0x00]
 
-v_bfe_u32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xc8,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_o_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0xda,0x00,0x00]
 
-v_bfe_i32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xc9,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_o_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0xdc,0x00,0x00]
 
-v_bfe_i32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xc9,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_o_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0xde,0x00,0x00]
 
-v_bfe_i32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xc9,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_o_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0xf6,0x00,0x00]
 
-v_bfe_i32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xc9,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_o_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0xf8,0x00,0x00]
 
-v_bfe_i32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xc9,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_o_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0xfc,0x00,0x00]
 
-v_bfe_i32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xc9,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_o_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0xfe,0x00,0x00]
 
-v_bfe_i32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xc9,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_o_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0x00,0x01,0x00]
 
-v_bfe_i32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xc9,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_o_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0xe0,0x01,0x00]
 
-v_bfe_i32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xc9,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_o_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0xfa,0x01,0x00]
 
-v_bfe_i32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xc9,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_o_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0x04,0x02,0x00]
 
-v_bfe_i32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xc9,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_o_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0xfe,0x03,0x00]
 
-v_bfe_i32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xc9,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_o_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x27,0xd0,0x80,0x04,0x00,0x40]
 
-v_bfe_i32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xc9,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_o_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x27,0xd0,0x80,0x04,0x00,0x00]
 
-v_bfe_i32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xc9,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_u_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x50,0x7c]
 
-v_bfe_i32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xc9,0xd1,0x00,0x00,0x00,0x02]
+v_cmp_u_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x50,0x7c]
 
-v_bfe_i32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xc9,0xd1,0x00,0x00,0x04,0x03]
+v_cmp_u_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x50,0x7c]
 
-v_bfe_i32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xc9,0xd1,0x00,0x00,0xc0,0x03]
+v_cmp_u_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x50,0x7c]
 
-v_bfe_i32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xc9,0xd1,0x00,0x00,0xdc,0x03]
+v_cmp_u_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x50,0x7c]
 
-v_bfe_i32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xc9,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_u_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x50,0x7c]
 
-v_bfe_i32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xc9,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_u_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x50,0x7c]
 
-v_bfi_b32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_u_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x50,0x7c]
 
-v_bfi_b32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xca,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_u_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x50,0x7c]
 
-v_bfi_b32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xca,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_u_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x50,0x7c]
 
-v_bfi_b32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xca,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_u_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x50,0x7c]
 
-v_bfi_b32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xca,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_u_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x50,0x7c]
 
-v_bfi_b32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xca,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_u_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x50,0x7c]
 
-v_bfi_b32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_u_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x50,0x7c]
 
-v_bfi_b32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xca,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_u_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x50,0x7c]
 
-v_bfi_b32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_u_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x50,0x7c]
 
-v_bfi_b32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_u_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x50,0x7c]
 
-v_bfi_b32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_u_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x50,0x7c]
 
-v_bfi_b32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_u_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x50,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_bfi_b32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_u_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x50,0x7c,0x56,0x34,0x00,0x00]
 
-v_bfi_b32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_u_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x50,0x7c]
 
-v_bfi_b32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0x00,0x00,0x02]
+v_cmp_u_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x50,0x7c]
 
-v_bfi_b32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0x00,0x04,0x03]
+v_cmp_u_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x51,0x7c]
 
-v_bfi_b32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0x00,0xc0,0x03]
+v_cmp_u_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0x04,0x00,0x00]
 
-v_bfi_b32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0x00,0xdc,0x03]
+v_cmp_u_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x28,0xd0,0x80,0x04,0x00,0x00]
 
-v_bfi_b32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_u_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x28,0xd0,0x80,0x04,0x00,0x00]
 
-v_bfi_b32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xca,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_u_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x28,0xd0,0x80,0x04,0x00,0x00]
 
-v_fma_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xcb,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_u_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x28,0xd0,0x80,0x04,0x00,0x00]
 
-v_fma_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xcb,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_u_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x28,0xd0,0x80,0x04,0x00,0x00]
 
-v_fma_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xcb,0xd1,0xfd,0x00,0x00,0x00]
+v_cmp_u_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x28,0xd0,0x80,0x04,0x00,0x00]
 
-v_fma_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xcb,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_u_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x28,0xd0,0x80,0x04,0x00,0x00]
 
-v_fma_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xcb,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_u_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x28,0xd0,0xf0,0x04,0x00,0x00]
 
-v_fma_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xcb,0xd1,0x00,0xfa,0x01,0x00]
+v_cmp_u_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x28,0xd0,0x01,0x05,0x00,0x00]
 
-v_fma_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xcb,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_u_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x28,0xd0,0xff,0x05,0x00,0x00]
 
-v_fma_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xcb,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_u_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xca,0x00,0x00]
 
-v_fma_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xcb,0xd1,0x00,0x00,0xf4,0x03]
+v_cmp_u_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xcc,0x00,0x00]
 
-v_fma_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xcb,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_u_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xce,0x00,0x00]
 
-v_fma_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xcb,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_u_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xd4,0x00,0x00]
 
-v_fma_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xcb,0xd1,0x00,0x00,0x00,0x20]
+v_cmp_u_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xd6,0x00,0x00]
 
-v_fma_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xcb,0xd1,0x00,0x00,0x00,0x40]
+v_cmp_u_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xd8,0x00,0x00]
 
-v_fma_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xcb,0xd1,0x00,0x00,0x00,0x80]
+v_cmp_u_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xda,0x00,0x00]
 
-v_fma_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xcb,0xd1,0x00,0x00,0x00,0xe0]
+v_cmp_u_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xdc,0x00,0x00]
 
-v_fma_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xcb,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_u_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xde,0x00,0x00]
 
-v_fma_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xcb,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_u_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xf6,0x00,0x00]
 
-v_fma_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xcb,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_u_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xf8,0x00,0x00]
 
-v_fma_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xcb,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_u_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xfc,0x00,0x00]
 
-v_fma_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x80,0xcb,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_u_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xfe,0x00,0x00]
 
-v_fma_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0xcb,0xd1,0x00,0x00,0x00,0x08]
+v_cmp_u_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0x00,0x01,0x00]
 
-v_fma_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0xcb,0xd1,0x00,0x00,0x00,0x10]
+v_cmp_u_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xe0,0x01,0x00]
 
-v_fma_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0xcb,0xd1,0x00,0x00,0x00,0x18]
+v_cmp_u_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xfa,0x01,0x00]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_u_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0x04,0x02,0x00]
 
-v_fma_f64 v[254:255], s[0:1], s[0:1], s[0:1]
-// CHECK: [0xfe,0x00,0xcc,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_u_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0xfe,0x03,0x00]
 
-v_fma_f64 v[0:1], scc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0xfd,0x00,0x00,0x00]
+v_cmp_u_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x28,0xd0,0x80,0x04,0x00,0x40]
 
-v_fma_f64 v[0:1], v[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_u_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x28,0xd0,0x80,0x04,0x00,0x00]
 
-v_fma_f64 v[0:1], v[254:255], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0xfe,0x01,0x00,0x00]
+v_cmp_nge_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x52,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0xfa,0x01,0x00]
+v_cmp_nge_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x52,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_nge_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x52,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0xfc,0x03,0x00]
+v_cmp_nge_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x52,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0x00,0xf4,0x03]
+v_cmp_nge_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x52,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_nge_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x52,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0x00,0xf8,0x07]
+v_cmp_nge_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x52,0x7c]
 
-v_fma_f64 v[0:1], -s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0x00,0x00,0x20]
+v_cmp_nge_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x52,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0x00,0x00,0x40]
+v_cmp_nge_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x52,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0x00,0x00,0x80]
+v_cmp_nge_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x52,0x7c]
 
-v_fma_f64 v[0:1], -s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0x00,0x00,0xe0]
+v_cmp_nge_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x52,0x7c]
 
-v_fma_f64 v[0:1], |s[0:1]|, s[0:1], s[0:1]
-// CHECK: [0x00,0x01,0xcc,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nge_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x52,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], |s[0:1]|, s[0:1]
-// CHECK: [0x00,0x02,0xcc,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nge_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x52,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], |s[0:1]|
-// CHECK: [0x00,0x04,0xcc,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nge_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x52,0x7c]
 
-v_fma_f64 v[0:1], |s[0:1]|, |s[0:1]|, |s[0:1]|
-// CHECK: [0x00,0x07,0xcc,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nge_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x52,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0xcc,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nge_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x52,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0x00,0x00,0x08]
+v_cmp_nge_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x52,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0x00,0x00,0x10]
+v_cmp_nge_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x52,0x7c]
 
-v_fma_f64 v[0:1], s[0:1], s[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0xcc,0xd1,0x00,0x00,0x00,0x18]
+v_cmp_nge_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x52,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_lerp_u8 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xcd,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nge_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x52,0x7c,0x56,0x34,0x00,0x00]
 
-v_lerp_u8 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xcd,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nge_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x52,0x7c]
 
-v_lerp_u8 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xcd,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_nge_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x52,0x7c]
 
-v_lerp_u8 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xcd,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_nge_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x53,0x7c]
 
-v_lerp_u8 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xcd,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_nge_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0x04,0x00,0x00]
 
-v_lerp_u8 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xcd,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_nge_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x29,0xd0,0x80,0x04,0x00,0x00]
 
-v_lerp_u8 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xcd,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_nge_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x29,0xd0,0x80,0x04,0x00,0x00]
 
-v_lerp_u8 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xcd,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_nge_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x29,0xd0,0x80,0x04,0x00,0x00]
 
-v_lerp_u8 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xcd,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_nge_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x29,0xd0,0x80,0x04,0x00,0x00]
 
-v_lerp_u8 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xcd,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_nge_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x29,0xd0,0x80,0x04,0x00,0x00]
 
-v_lerp_u8 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xcd,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_nge_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x29,0xd0,0x80,0x04,0x00,0x00]
 
-v_lerp_u8 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xcd,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_nge_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x29,0xd0,0x80,0x04,0x00,0x00]
 
-v_lerp_u8 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xcd,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_nge_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x29,0xd0,0xf0,0x04,0x00,0x00]
 
-v_lerp_u8 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xcd,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_nge_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x29,0xd0,0x01,0x05,0x00,0x00]
 
-v_lerp_u8 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xcd,0xd1,0x00,0x00,0x00,0x02]
+v_cmp_nge_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x29,0xd0,0xff,0x05,0x00,0x00]
 
-v_lerp_u8 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xcd,0xd1,0x00,0x00,0x04,0x03]
+v_cmp_nge_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0xca,0x00,0x00]
 
-v_lerp_u8 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xcd,0xd1,0x00,0x00,0xc0,0x03]
+v_cmp_nge_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0xcc,0x00,0x00]
 
-v_lerp_u8 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xcd,0xd1,0x00,0x00,0xdc,0x03]
+v_cmp_nge_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0xce,0x00,0x00]
 
-v_lerp_u8 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xcd,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_nge_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0xd4,0x00,0x00]
 
-v_lerp_u8 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xcd,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_nge_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0xd6,0x00,0x00]
 
-v_alignbit_b32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nge_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0xd8,0x00,0x00]
 
-v_alignbit_b32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xce,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nge_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0xda,0x00,0x00]
 
-v_alignbit_b32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xce,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_nge_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0xdc,0x00,0x00]
 
-v_alignbit_b32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xce,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_nge_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0xde,0x00,0x00]
 
-v_alignbit_b32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xce,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_nge_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0xf6,0x00,0x00]
 
-v_alignbit_b32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xce,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_nge_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0xf8,0x00,0x00]
 
-v_alignbit_b32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_nge_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0xfc,0x00,0x00]
 
-v_alignbit_b32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xce,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_nge_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0xfe,0x00,0x00]
 
-v_alignbit_b32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_nge_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0x00,0x01,0x00]
 
-v_alignbit_b32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_nge_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0xe0,0x01,0x00]
 
-v_alignbit_b32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_nge_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0xfa,0x01,0x00]
 
-v_alignbit_b32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_nge_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0x04,0x02,0x00]
 
-v_alignbit_b32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_nge_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0xfe,0x03,0x00]
 
-v_alignbit_b32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_nge_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x29,0xd0,0x80,0x04,0x00,0x40]
 
-v_alignbit_b32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0x00,0x00,0x02]
+v_cmp_nge_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x29,0xd0,0x80,0x04,0x00,0x00]
 
-v_alignbit_b32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0x00,0x04,0x03]
+v_cmp_nlg_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x54,0x7c]
 
-v_alignbit_b32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0x00,0xc0,0x03]
+v_cmp_nlg_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x54,0x7c]
 
-v_alignbit_b32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0x00,0xdc,0x03]
+v_cmp_nlg_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x54,0x7c]
 
-v_alignbit_b32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_nlg_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x54,0x7c]
 
-v_alignbit_b32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xce,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_nlg_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x54,0x7c]
 
-v_alignbyte_b32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xcf,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x54,0x7c]
 
-v_alignbyte_b32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xcf,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x54,0x7c]
 
-v_alignbyte_b32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xcf,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_nlg_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x54,0x7c]
 
-v_alignbyte_b32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xcf,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_nlg_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x54,0x7c]
 
-v_alignbyte_b32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xcf,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_nlg_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x54,0x7c]
 
-v_alignbyte_b32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xcf,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_nlg_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x54,0x7c]
 
-v_alignbyte_b32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xcf,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_nlg_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x54,0x7c]
 
-v_alignbyte_b32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xcf,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_nlg_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x54,0x7c]
 
-v_alignbyte_b32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xcf,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_nlg_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x54,0x7c]
 
-v_alignbyte_b32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xcf,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_nlg_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x54,0x7c]
 
-v_alignbyte_b32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xcf,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_nlg_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x54,0x7c]
 
-v_alignbyte_b32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xcf,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_nlg_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x54,0x7c]
 
-v_alignbyte_b32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xcf,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_nlg_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x54,0x7c]
 
-v_alignbyte_b32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xcf,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_nlg_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x54,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_alignbyte_b32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xcf,0xd1,0x00,0x00,0x00,0x02]
+v_cmp_nlg_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x54,0x7c,0x56,0x34,0x00,0x00]
 
-v_alignbyte_b32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xcf,0xd1,0x00,0x00,0x04,0x03]
+v_cmp_nlg_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x54,0x7c]
 
-v_alignbyte_b32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xcf,0xd1,0x00,0x00,0xc0,0x03]
+v_cmp_nlg_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x54,0x7c]
 
-v_alignbyte_b32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xcf,0xd1,0x00,0x00,0xdc,0x03]
+v_cmp_nlg_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x55,0x7c]
 
-v_alignbyte_b32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xcf,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_nlg_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0x04,0x00,0x00]
 
-v_alignbyte_b32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xcf,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_nlg_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x2a,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xd0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x2a,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xd0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x2a,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xd0,0xd1,0xfd,0x00,0x00,0x00]
+v_cmp_nlg_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x2a,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xd0,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_nlg_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x2a,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xd0,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_nlg_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x2a,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xd0,0xd1,0x00,0xfa,0x01,0x00]
+v_cmp_nlg_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x2a,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xd0,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_nlg_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x2a,0xd0,0xf0,0x04,0x00,0x00]
 
-v_min3_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xd0,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_nlg_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x01,0x05,0x00,0x00]
 
-v_min3_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xd0,0xd1,0x00,0x00,0xf4,0x03]
+v_cmp_nlg_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x2a,0xd0,0xff,0x05,0x00,0x00]
 
-v_min3_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xd0,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_nlg_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xca,0x00,0x00]
 
-v_min3_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xd0,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_nlg_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xcc,0x00,0x00]
 
-v_min3_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xd0,0xd1,0x00,0x00,0x00,0x20]
+v_cmp_nlg_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xce,0x00,0x00]
 
-v_min3_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xd0,0xd1,0x00,0x00,0x00,0x40]
+v_cmp_nlg_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xd4,0x00,0x00]
 
-v_min3_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xd0,0xd1,0x00,0x00,0x00,0x80]
+v_cmp_nlg_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xd6,0x00,0x00]
 
-v_min3_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xd0,0xd1,0x00,0x00,0x00,0xe0]
+v_cmp_nlg_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xd8,0x00,0x00]
 
-v_min3_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xd0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xda,0x00,0x00]
 
-v_min3_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xd0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xdc,0x00,0x00]
 
-v_min3_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xd0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xde,0x00,0x00]
 
-v_min3_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xd0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xf6,0x00,0x00]
 
-v_min3_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x80,0xd0,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xf8,0x00,0x00]
 
-v_min3_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0xd0,0xd1,0x00,0x00,0x00,0x08]
+v_cmp_nlg_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xfc,0x00,0x00]
 
-v_min3_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0xd0,0xd1,0x00,0x00,0x00,0x10]
+v_cmp_nlg_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xfe,0x00,0x00]
 
-v_min3_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0xd0,0xd1,0x00,0x00,0x00,0x18]
+v_cmp_nlg_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0x00,0x01,0x00]
 
-v_min3_i32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xd1,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xe0,0x01,0x00]
 
-v_min3_i32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xd1,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xfa,0x01,0x00]
 
-v_min3_i32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xd1,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_nlg_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0x04,0x02,0x00]
 
-v_min3_i32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xd1,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_nlg_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0xfe,0x03,0x00]
 
-v_min3_i32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xd1,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_nlg_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x2a,0xd0,0x80,0x04,0x00,0x40]
 
-v_min3_i32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xd1,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_nlg_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x2a,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_i32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xd1,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_ngt_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x56,0x7c]
 
-v_min3_i32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xd1,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_ngt_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x56,0x7c]
 
-v_min3_i32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xd1,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_ngt_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x56,0x7c]
 
-v_min3_i32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xd1,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_ngt_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x56,0x7c]
 
-v_min3_i32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xd1,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_ngt_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x56,0x7c]
 
-v_min3_i32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xd1,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_ngt_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x56,0x7c]
 
-v_min3_i32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xd1,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_ngt_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x56,0x7c]
 
-v_min3_i32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xd1,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_ngt_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x56,0x7c]
 
-v_min3_i32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xd1,0xd1,0x00,0x00,0x00,0x02]
+v_cmp_ngt_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x56,0x7c]
 
-v_min3_i32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xd1,0xd1,0x00,0x00,0x04,0x03]
+v_cmp_ngt_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x56,0x7c]
 
-v_min3_i32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xd1,0xd1,0x00,0x00,0xc0,0x03]
+v_cmp_ngt_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x56,0x7c]
 
-v_min3_i32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xd1,0xd1,0x00,0x00,0xdc,0x03]
+v_cmp_ngt_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x56,0x7c]
 
-v_min3_i32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xd1,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_ngt_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x56,0x7c]
 
-v_min3_i32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xd1,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_ngt_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x56,0x7c]
 
-v_min3_u32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xd2,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x56,0x7c]
 
-v_min3_u32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xd2,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x56,0x7c]
 
-v_min3_u32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xd2,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_ngt_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x56,0x7c]
 
-v_min3_u32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xd2,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_ngt_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x56,0x7c]
 
-v_min3_u32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xd2,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_ngt_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x56,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_min3_u32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xd2,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_ngt_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x56,0x7c,0x56,0x34,0x00,0x00]
 
-v_min3_u32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xd2,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_ngt_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x56,0x7c]
 
-v_min3_u32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xd2,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_ngt_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x56,0x7c]
 
-v_min3_u32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xd2,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_ngt_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x57,0x7c]
 
-v_min3_u32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xd2,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_ngt_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_u32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xd2,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_ngt_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x2b,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_u32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xd2,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_ngt_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x2b,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_u32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xd2,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_ngt_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x2b,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_u32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xd2,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_ngt_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x2b,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_u32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xd2,0xd1,0x00,0x00,0x00,0x02]
+v_cmp_ngt_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x2b,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_u32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xd2,0xd1,0x00,0x00,0x04,0x03]
+v_cmp_ngt_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x2b,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_u32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xd2,0xd1,0x00,0x00,0xc0,0x03]
+v_cmp_ngt_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x2b,0xd0,0x80,0x04,0x00,0x00]
 
-v_min3_u32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xd2,0xd1,0x00,0x00,0xdc,0x03]
+v_cmp_ngt_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x2b,0xd0,0xf0,0x04,0x00,0x00]
 
-v_min3_u32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xd2,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_ngt_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x01,0x05,0x00,0x00]
 
-v_min3_u32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xd2,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_ngt_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x2b,0xd0,0xff,0x05,0x00,0x00]
 
-v_max3_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xd3,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0xca,0x00,0x00]
 
-v_max3_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xd3,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0xcc,0x00,0x00]
 
-v_max3_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xd3,0xd1,0xfd,0x00,0x00,0x00]
+v_cmp_ngt_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0xce,0x00,0x00]
 
-v_max3_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xd3,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_ngt_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0xd4,0x00,0x00]
 
-v_max3_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xd3,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_ngt_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0xd6,0x00,0x00]
 
-v_max3_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xd3,0xd1,0x00,0xfa,0x01,0x00]
+v_cmp_ngt_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0xd8,0x00,0x00]
 
-v_max3_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xd3,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_ngt_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0xda,0x00,0x00]
 
-v_max3_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xd3,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_ngt_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0xdc,0x00,0x00]
 
-v_max3_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xd3,0xd1,0x00,0x00,0xf4,0x03]
+v_cmp_ngt_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0xde,0x00,0x00]
 
-v_max3_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xd3,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_ngt_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0xf6,0x00,0x00]
 
-v_max3_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xd3,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_ngt_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0xf8,0x00,0x00]
 
-v_max3_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xd3,0xd1,0x00,0x00,0x00,0x20]
+v_cmp_ngt_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0xfc,0x00,0x00]
 
-v_max3_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xd3,0xd1,0x00,0x00,0x00,0x40]
+v_cmp_ngt_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0xfe,0x00,0x00]
 
-v_max3_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xd3,0xd1,0x00,0x00,0x00,0x80]
+v_cmp_ngt_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0x00,0x01,0x00]
 
-v_max3_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xd3,0xd1,0x00,0x00,0x00,0xe0]
+v_cmp_ngt_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0xe0,0x01,0x00]
 
-v_max3_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xd3,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0xfa,0x01,0x00]
 
-v_max3_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xd3,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0x04,0x02,0x00]
 
-v_max3_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xd3,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0xfe,0x03,0x00]
 
-v_max3_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xd3,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x2b,0xd0,0x80,0x04,0x00,0x40]
 
-v_max3_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x80,0xd3,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x2b,0xd0,0x80,0x04,0x00,0x00]
 
-v_max3_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0xd3,0xd1,0x00,0x00,0x00,0x08]
+v_cmp_nle_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x58,0x7c]
 
-v_max3_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0xd3,0xd1,0x00,0x00,0x00,0x10]
+v_cmp_nle_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x58,0x7c]
 
-v_max3_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0xd3,0xd1,0x00,0x00,0x00,0x18]
+v_cmp_nle_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x58,0x7c]
 
-v_max3_i32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xd4,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nle_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x58,0x7c]
 
-v_max3_i32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xd4,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nle_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x58,0x7c]
 
-v_max3_i32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xd4,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_nle_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x58,0x7c]
 
-v_max3_i32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xd4,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_nle_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x58,0x7c]
 
-v_max3_i32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xd4,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_nle_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x58,0x7c]
 
-v_max3_i32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xd4,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_nle_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x58,0x7c]
 
-v_max3_i32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xd4,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_nle_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x58,0x7c]
 
-v_max3_i32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xd4,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_nle_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x58,0x7c]
 
-v_max3_i32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xd4,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_nle_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x58,0x7c]
 
-v_max3_i32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xd4,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_nle_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x58,0x7c]
 
-v_max3_i32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xd4,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_nle_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x58,0x7c]
 
-v_max3_i32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xd4,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_nle_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x58,0x7c]
 
-v_max3_i32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xd4,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_nle_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x58,0x7c]
 
-v_max3_i32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xd4,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_nle_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x58,0x7c]
 
-v_max3_i32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xd4,0xd1,0x00,0x00,0x00,0x02]
+v_cmp_nle_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x58,0x7c]
 
-v_max3_i32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xd4,0xd1,0x00,0x00,0x04,0x03]
+v_cmp_nle_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x58,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_max3_i32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xd4,0xd1,0x00,0x00,0xc0,0x03]
+v_cmp_nle_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x58,0x7c,0x56,0x34,0x00,0x00]
 
-v_max3_i32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xd4,0xd1,0x00,0x00,0xdc,0x03]
+v_cmp_nle_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x58,0x7c]
 
-v_max3_i32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xd4,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_nle_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x58,0x7c]
 
-v_max3_i32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xd4,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_nle_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x59,0x7c]
 
-v_max3_u32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xd5,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nle_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0x04,0x00,0x00]
 
-v_max3_u32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xd5,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nle_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x2c,0xd0,0x80,0x04,0x00,0x00]
 
-v_max3_u32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xd5,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_nle_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x2c,0xd0,0x80,0x04,0x00,0x00]
 
-v_max3_u32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xd5,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_nle_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x2c,0xd0,0x80,0x04,0x00,0x00]
 
-v_max3_u32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xd5,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_nle_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x2c,0xd0,0x80,0x04,0x00,0x00]
 
-v_max3_u32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xd5,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_nle_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x2c,0xd0,0x80,0x04,0x00,0x00]
 
-v_max3_u32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xd5,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_nle_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x2c,0xd0,0x80,0x04,0x00,0x00]
 
-v_max3_u32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xd5,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_nle_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x2c,0xd0,0x80,0x04,0x00,0x00]
 
-v_max3_u32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xd5,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_nle_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x2c,0xd0,0xf0,0x04,0x00,0x00]
 
-v_max3_u32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xd5,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_nle_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x01,0x05,0x00,0x00]
 
-v_max3_u32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xd5,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_nle_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x2c,0xd0,0xff,0x05,0x00,0x00]
 
-v_max3_u32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xd5,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_nle_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xca,0x00,0x00]
 
-v_max3_u32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xd5,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_nle_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xcc,0x00,0x00]
 
-v_max3_u32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xd5,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_nle_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xce,0x00,0x00]
 
-v_max3_u32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xd5,0xd1,0x00,0x00,0x00,0x02]
+v_cmp_nle_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xd4,0x00,0x00]
 
-v_max3_u32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xd5,0xd1,0x00,0x00,0x04,0x03]
+v_cmp_nle_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xd6,0x00,0x00]
 
-v_max3_u32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xd5,0xd1,0x00,0x00,0xc0,0x03]
+v_cmp_nle_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xd8,0x00,0x00]
 
-v_max3_u32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xd5,0xd1,0x00,0x00,0xdc,0x03]
+v_cmp_nle_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xda,0x00,0x00]
 
-v_max3_u32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xd5,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_nle_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xdc,0x00,0x00]
 
-v_max3_u32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xd5,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_nle_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xde,0x00,0x00]
 
-v_med3_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xd6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nle_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xf6,0x00,0x00]
 
-v_med3_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xd6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nle_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xf8,0x00,0x00]
 
-v_med3_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xd6,0xd1,0xfd,0x00,0x00,0x00]
+v_cmp_nle_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xfc,0x00,0x00]
 
-v_med3_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xd6,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_nle_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xfe,0x00,0x00]
 
-v_med3_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xd6,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_nle_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0x00,0x01,0x00]
 
-v_med3_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xd6,0xd1,0x00,0xfa,0x01,0x00]
+v_cmp_nle_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xe0,0x01,0x00]
 
-v_med3_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xd6,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_nle_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xfa,0x01,0x00]
 
-v_med3_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xd6,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_nle_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0x04,0x02,0x00]
 
-v_med3_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xd6,0xd1,0x00,0x00,0xf4,0x03]
+v_cmp_nle_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0xfe,0x03,0x00]
 
-v_med3_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xd6,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_nle_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x2c,0xd0,0x80,0x04,0x00,0x40]
 
-v_med3_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xd6,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_nle_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x2c,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xd6,0xd1,0x00,0x00,0x00,0x20]
+v_cmp_neq_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x5a,0x7c]
 
-v_med3_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xd6,0xd1,0x00,0x00,0x00,0x40]
+v_cmp_neq_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x5a,0x7c]
 
-v_med3_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xd6,0xd1,0x00,0x00,0x00,0x80]
+v_cmp_neq_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x5a,0x7c]
 
-v_med3_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xd6,0xd1,0x00,0x00,0x00,0xe0]
+v_cmp_neq_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x5a,0x7c]
 
-v_med3_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xd6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_neq_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x5a,0x7c]
 
-v_med3_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xd6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_neq_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x5a,0x7c]
 
-v_med3_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xd6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_neq_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x5a,0x7c]
 
-v_med3_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xd6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_neq_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x5a,0x7c]
 
-v_med3_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x80,0xd6,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_neq_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x5a,0x7c]
 
-v_med3_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0xd6,0xd1,0x00,0x00,0x00,0x08]
+v_cmp_neq_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x5a,0x7c]
 
-v_med3_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0xd6,0xd1,0x00,0x00,0x00,0x10]
+v_cmp_neq_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x5a,0x7c]
 
-v_med3_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0xd6,0xd1,0x00,0x00,0x00,0x18]
+v_cmp_neq_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x5a,0x7c]
 
-v_med3_i32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xd7,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_neq_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x5a,0x7c]
 
-v_med3_i32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xd7,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_neq_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x5a,0x7c]
 
-v_med3_i32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xd7,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_neq_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x5a,0x7c]
 
-v_med3_i32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xd7,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_neq_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x5a,0x7c]
 
-v_med3_i32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xd7,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_neq_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x5a,0x7c]
 
-v_med3_i32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xd7,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_neq_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x5a,0x7c]
 
-v_med3_i32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xd7,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_neq_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x5a,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_med3_i32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xd7,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_neq_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x5a,0x7c,0x56,0x34,0x00,0x00]
 
-v_med3_i32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xd7,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_neq_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x5a,0x7c]
 
-v_med3_i32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xd7,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_neq_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x5a,0x7c]
 
-v_med3_i32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xd7,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_neq_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x5b,0x7c]
 
-v_med3_i32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xd7,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_neq_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_i32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xd7,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_neq_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x2d,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_i32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xd7,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_neq_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x2d,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_i32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xd7,0xd1,0x00,0x00,0x00,0x02]
+v_cmp_neq_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x2d,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_i32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xd7,0xd1,0x00,0x00,0x04,0x03]
+v_cmp_neq_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x2d,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_i32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xd7,0xd1,0x00,0x00,0xc0,0x03]
+v_cmp_neq_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x2d,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_i32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xd7,0xd1,0x00,0x00,0xdc,0x03]
+v_cmp_neq_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x2d,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_i32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xd7,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_neq_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x2d,0xd0,0x80,0x04,0x00,0x00]
 
-v_med3_i32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xd7,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_neq_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x2d,0xd0,0xf0,0x04,0x00,0x00]
 
-v_med3_u32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xd8,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_neq_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x01,0x05,0x00,0x00]
 
-v_med3_u32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xd8,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_neq_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x2d,0xd0,0xff,0x05,0x00,0x00]
 
-v_med3_u32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xd8,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_neq_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0xca,0x00,0x00]
 
-v_med3_u32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xd8,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_neq_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0xcc,0x00,0x00]
 
-v_med3_u32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xd8,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_neq_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0xce,0x00,0x00]
 
-v_med3_u32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xd8,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_neq_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0xd4,0x00,0x00]
 
-v_med3_u32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xd8,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_neq_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0xd6,0x00,0x00]
 
-v_med3_u32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xd8,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_neq_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0xd8,0x00,0x00]
 
-v_med3_u32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xd8,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_neq_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0xda,0x00,0x00]
 
-v_med3_u32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xd8,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_neq_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0xdc,0x00,0x00]
 
-v_med3_u32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xd8,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_neq_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0xde,0x00,0x00]
 
-v_med3_u32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xd8,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_neq_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0xf6,0x00,0x00]
 
-v_med3_u32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xd8,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_neq_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0xf8,0x00,0x00]
 
-v_med3_u32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xd8,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_neq_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0xfc,0x00,0x00]
 
-v_med3_u32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xd8,0xd1,0x00,0x00,0x00,0x02]
+v_cmp_neq_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0xfe,0x00,0x00]
 
-v_med3_u32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xd8,0xd1,0x00,0x00,0x04,0x03]
+v_cmp_neq_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0x00,0x01,0x00]
 
-v_med3_u32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xd8,0xd1,0x00,0x00,0xc0,0x03]
+v_cmp_neq_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0xe0,0x01,0x00]
 
-v_med3_u32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xd8,0xd1,0x00,0x00,0xdc,0x03]
+v_cmp_neq_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0xfa,0x01,0x00]
 
-v_med3_u32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xd8,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_neq_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0x04,0x02,0x00]
 
-v_med3_u32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xd8,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_neq_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0xfe,0x03,0x00]
 
-v_sad_u8 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xd9,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_neq_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x2d,0xd0,0x80,0x04,0x00,0x40]
 
-v_sad_u8 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xd9,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_neq_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x2d,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_u8 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xd9,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_nlt_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x5c,0x7c]
 
-v_sad_u8 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xd9,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_nlt_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x5c,0x7c]
 
-v_sad_u8 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xd9,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_nlt_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x5c,0x7c]
 
-v_sad_u8 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xd9,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_nlt_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x5c,0x7c]
 
-v_sad_u8 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xd9,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_nlt_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x5c,0x7c]
 
-v_sad_u8 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xd9,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_nlt_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x5c,0x7c]
 
-v_sad_u8 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xd9,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_nlt_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x5c,0x7c]
 
-v_sad_u8 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xd9,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_nlt_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x5c,0x7c]
 
-v_sad_u8 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xd9,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_nlt_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x5c,0x7c]
 
-v_sad_u8 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xd9,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_nlt_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x5c,0x7c]
 
-v_sad_u8 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xd9,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_nlt_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x5c,0x7c]
 
-v_sad_u8 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xd9,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_nlt_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x5c,0x7c]
 
-v_sad_u8 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xd9,0xd1,0x00,0x00,0x00,0x02]
+v_cmp_nlt_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x5c,0x7c]
 
-v_sad_u8 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xd9,0xd1,0x00,0x00,0x04,0x03]
+v_cmp_nlt_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x5c,0x7c]
 
-v_sad_u8 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xd9,0xd1,0x00,0x00,0xc0,0x03]
+v_cmp_nlt_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x5c,0x7c]
 
-v_sad_u8 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xd9,0xd1,0x00,0x00,0xdc,0x03]
+v_cmp_nlt_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x5c,0x7c]
 
-v_sad_u8 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xd9,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_nlt_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x5c,0x7c]
 
-v_sad_u8 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xd9,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_nlt_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x5c,0x7c]
 
-v_sad_hi_u8 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xda,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x5c,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_sad_hi_u8 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xda,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x5c,0x7c,0x56,0x34,0x00,0x00]
 
-v_sad_hi_u8 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xda,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_nlt_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x5c,0x7c]
 
-v_sad_hi_u8 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xda,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_nlt_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x5c,0x7c]
 
-v_sad_hi_u8 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xda,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_nlt_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x5d,0x7c]
 
-v_sad_hi_u8 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xda,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_nlt_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_hi_u8 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xda,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_nlt_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x2e,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_hi_u8 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xda,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_nlt_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x2e,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_hi_u8 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xda,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_nlt_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x2e,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_hi_u8 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xda,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_nlt_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x2e,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_hi_u8 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xda,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_nlt_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x2e,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_hi_u8 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xda,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_nlt_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x2e,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_hi_u8 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xda,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_nlt_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x2e,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_hi_u8 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xda,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_nlt_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x2e,0xd0,0xf0,0x04,0x00,0x00]
 
-v_sad_hi_u8 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xda,0xd1,0x00,0x00,0x00,0x02]
+v_cmp_nlt_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x01,0x05,0x00,0x00]
 
-v_sad_hi_u8 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xda,0xd1,0x00,0x00,0x04,0x03]
+v_cmp_nlt_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x2e,0xd0,0xff,0x05,0x00,0x00]
 
-v_sad_hi_u8 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xda,0xd1,0x00,0x00,0xc0,0x03]
+v_cmp_nlt_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xca,0x00,0x00]
 
-v_sad_hi_u8 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xda,0xd1,0x00,0x00,0xdc,0x03]
+v_cmp_nlt_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xcc,0x00,0x00]
 
-v_sad_hi_u8 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xda,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_nlt_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xce,0x00,0x00]
 
-v_sad_hi_u8 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xda,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_nlt_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xd4,0x00,0x00]
 
-v_sad_u16 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xdb,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xd6,0x00,0x00]
 
-v_sad_u16 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xdb,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xd8,0x00,0x00]
 
-v_sad_u16 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xdb,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_nlt_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xda,0x00,0x00]
 
-v_sad_u16 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xdb,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_nlt_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xdc,0x00,0x00]
 
-v_sad_u16 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xdb,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_nlt_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xde,0x00,0x00]
 
-v_sad_u16 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xdb,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_nlt_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xf6,0x00,0x00]
 
-v_sad_u16 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xdb,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_nlt_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xf8,0x00,0x00]
 
-v_sad_u16 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xdb,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_nlt_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xfc,0x00,0x00]
 
-v_sad_u16 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xdb,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_nlt_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xfe,0x00,0x00]
 
-v_sad_u16 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xdb,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_nlt_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0x00,0x01,0x00]
 
-v_sad_u16 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xdb,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_nlt_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xe0,0x01,0x00]
 
-v_sad_u16 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xdb,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_nlt_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xfa,0x01,0x00]
 
-v_sad_u16 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xdb,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_nlt_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0x04,0x02,0x00]
 
-v_sad_u16 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xdb,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_nlt_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0xfe,0x03,0x00]
 
-v_sad_u16 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xdb,0xd1,0x00,0x00,0x00,0x02]
+v_cmp_nlt_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x2e,0xd0,0x80,0x04,0x00,0x40]
 
-v_sad_u16 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xdb,0xd1,0x00,0x00,0x04,0x03]
+v_cmp_nlt_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x2e,0xd0,0x80,0x04,0x00,0x00]
 
-v_sad_u16 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xdb,0xd1,0x00,0x00,0xc0,0x03]
+v_cmp_tru_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x5e,0x7c]
 
-v_sad_u16 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xdb,0xd1,0x00,0x00,0xdc,0x03]
+v_cmp_tru_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x5e,0x7c]
 
-v_sad_u16 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xdb,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_tru_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x5e,0x7c]
 
-v_sad_u16 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xdb,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_tru_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x5e,0x7c]
 
-v_sad_u32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xdc,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_tru_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x5e,0x7c]
 
-v_sad_u32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xdc,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_tru_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x5e,0x7c]
 
-v_sad_u32 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xdc,0xd1,0x80,0x00,0x00,0x00]
+v_cmp_tru_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x5e,0x7c]
 
-v_sad_u32 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xdc,0xd1,0xc1,0x00,0x00,0x00]
+v_cmp_tru_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x5e,0x7c]
 
-v_sad_u32 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xdc,0xd1,0xf0,0x00,0x00,0x00]
+v_cmp_tru_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x5e,0x7c]
 
-v_sad_u32 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xdc,0xd1,0xf7,0x00,0x00,0x00]
+v_cmp_tru_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x5e,0x7c]
 
-v_sad_u32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xdc,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_tru_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x5e,0x7c]
 
-v_sad_u32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xdc,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_tru_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x5e,0x7c]
 
-v_sad_u32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xdc,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_tru_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x5e,0x7c]
 
-v_sad_u32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xdc,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_tru_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x5e,0x7c]
 
-v_sad_u32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xdc,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_tru_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x5e,0x7c]
 
-v_sad_u32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xdc,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_tru_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x5e,0x7c]
 
-v_sad_u32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xdc,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_tru_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x5e,0x7c]
 
-v_sad_u32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xdc,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_tru_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x5e,0x7c]
 
-v_sad_u32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xdc,0xd1,0x00,0x00,0x00,0x02]
+v_cmp_tru_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x5e,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_sad_u32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xdc,0xd1,0x00,0x00,0x04,0x03]
+v_cmp_tru_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x5e,0x7c,0x56,0x34,0x00,0x00]
 
-v_sad_u32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xdc,0xd1,0x00,0x00,0xc0,0x03]
+v_cmp_tru_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x5e,0x7c]
 
-v_sad_u32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xdc,0xd1,0x00,0x00,0xdc,0x03]
+v_cmp_tru_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x5e,0x7c]
 
-v_sad_u32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xdc,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_tru_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x5f,0x7c]
 
-v_sad_u32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xdc,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_tru_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xdd,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_tru_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x2f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pk_u8_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xdd,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_tru_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x2f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xdd,0xd1,0xfd,0x00,0x00,0x00]
+v_cmp_tru_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x2f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xdd,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_tru_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x2f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xdd,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_tru_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x2f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xdd,0xd1,0x00,0x00,0x01,0x00]
+v_cmp_tru_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x2f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xdd,0xd1,0x00,0x82,0x01,0x00]
+v_cmp_tru_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x2f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xdd,0xd1,0x00,0xe0,0x01,0x00]
+v_cmp_tru_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x2f,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xdd,0xd1,0x00,0xee,0x01,0x00]
+v_cmp_tru_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x01,0x05,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xdd,0xd1,0x00,0xfa,0x01,0x00]
+v_cmp_tru_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x2f,0xd0,0xff,0x05,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xdd,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_tru_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0xca,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xdd,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_tru_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xdd,0xd1,0x00,0x00,0x00,0x02]
+v_cmp_tru_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0xce,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xdd,0xd1,0x00,0x00,0x04,0x03]
+v_cmp_tru_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xdd,0xd1,0x00,0x00,0xc0,0x03]
+v_cmp_tru_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xdd,0xd1,0x00,0x00,0xdc,0x03]
+v_cmp_tru_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xdd,0xd1,0x00,0x00,0xf4,0x03]
+v_cmp_tru_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0xda,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xdd,0xd1,0x00,0x00,0x00,0x04]
+v_cmp_tru_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xdd,0xd1,0x00,0x00,0xfc,0x07]
+v_cmp_tru_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0xde,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xdd,0xd1,0x00,0x00,0x00,0x20]
+v_cmp_tru_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xdd,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_tru_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cvt_pk_u8_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x80,0xdd,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_tru_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0xfc,0x00,0x00]
 
-v_div_fixup_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xde,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_tru_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0xfe,0x00,0x00]
 
-v_div_fixup_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xde,0xd1,0x00,0x00,0x00,0x00]
+v_cmp_tru_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0x00,0x01,0x00]
 
-v_div_fixup_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xde,0xd1,0xfd,0x00,0x00,0x00]
+v_cmp_tru_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0xe0,0x01,0x00]
 
-v_div_fixup_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xde,0xd1,0x00,0x01,0x00,0x00]
+v_cmp_tru_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0xfa,0x01,0x00]
 
-v_div_fixup_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xde,0xd1,0xff,0x01,0x00,0x00]
+v_cmp_tru_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0x04,0x02,0x00]
 
-v_div_fixup_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xde,0xd1,0x00,0xfa,0x01,0x00]
+v_cmp_tru_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0xfe,0x03,0x00]
 
-v_div_fixup_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xde,0xd1,0x00,0x00,0x02,0x00]
+v_cmp_tru_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x2f,0xd0,0x80,0x04,0x00,0x40]
 
-v_div_fixup_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xde,0xd1,0x00,0xfe,0x03,0x00]
+v_cmp_tru_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x2f,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fixup_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xde,0xd1,0x00,0x00,0xf4,0x03]
+v_cmpx_f_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x60,0x7c]
 
-v_div_fixup_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xde,0xd1,0x00,0x00,0x00,0x04]
+v_cmpx_f_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x60,0x7c]
 
-v_div_fixup_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xde,0xd1,0x00,0x00,0xfc,0x07]
+v_cmpx_f_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x60,0x7c]
 
-v_div_fixup_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xde,0xd1,0x00,0x00,0x00,0x20]
+v_cmpx_f_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x60,0x7c]
 
-v_div_fixup_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xde,0xd1,0x00,0x00,0x00,0x40]
+v_cmpx_f_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x60,0x7c]
 
-v_div_fixup_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xde,0xd1,0x00,0x00,0x00,0x80]
+v_cmpx_f_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x60,0x7c]
 
-v_div_fixup_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xde,0xd1,0x00,0x00,0x00,0xe0]
+v_cmpx_f_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x60,0x7c]
 
-v_div_fixup_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xde,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x60,0x7c]
 
-v_div_fixup_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xde,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x60,0x7c]
 
-v_div_fixup_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xde,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x60,0x7c]
 
-v_div_fixup_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xde,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x60,0x7c]
 
-v_div_fixup_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x80,0xde,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x60,0x7c]
 
-v_div_fixup_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0xde,0xd1,0x00,0x00,0x00,0x08]
+v_cmpx_f_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x60,0x7c]
 
-v_div_fixup_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0xde,0xd1,0x00,0x00,0x00,0x10]
+v_cmpx_f_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x60,0x7c]
 
-v_div_fixup_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0xde,0xd1,0x00,0x00,0x00,0x18]
+v_cmpx_f_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x60,0x7c]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xdf,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x60,0x7c]
 
-v_div_fixup_f64 v[254:255], s[0:1], s[0:1], s[0:1]
-// CHECK: [0xfe,0x00,0xdf,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x60,0x7c]
 
-v_div_fixup_f64 v[0:1], scc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xdf,0xd1,0xfd,0x00,0x00,0x00]
+v_cmpx_f_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x60,0x7c]
 
-v_div_fixup_f64 v[0:1], v[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xdf,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_f_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x60,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_div_fixup_f64 v[0:1], v[254:255], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xdf,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_f_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x60,0x7c,0x56,0x34,0x00,0x00]
 
-v_div_fixup_f64 v[0:1], s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xdf,0xd1,0x00,0xfa,0x01,0x00]
+v_cmpx_f_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x60,0x7c]
 
-v_div_fixup_f64 v[0:1], s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xdf,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_f_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x60,0x7c]
 
-v_div_fixup_f64 v[0:1], s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xdf,0xd1,0x00,0xfc,0x03,0x00]
+v_cmpx_f_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x61,0x7c]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xdf,0xd1,0x00,0x00,0xf4,0x03]
+v_cmpx_f_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xdf,0xd1,0x00,0x00,0x00,0x04]
+v_cmpx_f_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x30,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xdf,0xd1,0x00,0x00,0xf8,0x07]
+v_cmpx_f_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x30,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fixup_f64 v[0:1], -s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xdf,0xd1,0x00,0x00,0x00,0x20]
+v_cmpx_f_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x30,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fixup_f64 v[0:1], s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xdf,0xd1,0x00,0x00,0x00,0x40]
+v_cmpx_f_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x30,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xdf,0xd1,0x00,0x00,0x00,0x80]
+v_cmpx_f_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x30,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fixup_f64 v[0:1], -s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xdf,0xd1,0x00,0x00,0x00,0xe0]
+v_cmpx_f_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x30,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fixup_f64 v[0:1], |s[0:1]|, s[0:1], s[0:1]
-// CHECK: [0x00,0x01,0xdf,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x30,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fixup_f64 v[0:1], s[0:1], |s[0:1]|, s[0:1]
-// CHECK: [0x00,0x02,0xdf,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x30,0xd0,0xf0,0x04,0x00,0x00]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], |s[0:1]|
-// CHECK: [0x00,0x04,0xdf,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x30,0xd0,0x01,0x05,0x00,0x00]
 
-v_div_fixup_f64 v[0:1], |s[0:1]|, |s[0:1]|, |s[0:1]|
-// CHECK: [0x00,0x07,0xdf,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x30,0xd0,0xff,0x05,0x00,0x00]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0xdf,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xca,0x00,0x00]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0xdf,0xd1,0x00,0x00,0x00,0x08]
+v_cmpx_f_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xcc,0x00,0x00]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0xdf,0xd1,0x00,0x00,0x00,0x10]
+v_cmpx_f_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xce,0x00,0x00]
 
-v_div_fixup_f64 v[0:1], s[0:1], s[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0xdf,0xd1,0x00,0x00,0x00,0x18]
+v_cmpx_f_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xd4,0x00,0x00]
 
-v_div_scale_f32 v0, vcc, s0, s0, s0
-// CHECK: [0x00,0x6a,0xe0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xd6,0x00,0x00]
 
-v_div_scale_f32 v255, vcc, s0, s0, s0
-// CHECK: [0xff,0x6a,0xe0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xd8,0x00,0x00]
 
-v_div_scale_f32 v0, vcc, 0, s0, s0
-// CHECK: [0x00,0x6a,0xe0,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_f_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xda,0x00,0x00]
 
-v_div_scale_f32 v0, vcc, 0.5, s0, s0
-// CHECK: [0x00,0x6a,0xe0,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_f_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xdc,0x00,0x00]
 
-v_div_scale_f32 v0, vcc, v0, s0, s0
-// CHECK: [0x00,0x6a,0xe0,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_f_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xde,0x00,0x00]
 
-v_div_scale_f32 v0, vcc, v255, s0, s0
-// CHECK: [0x00,0x6a,0xe0,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_f_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xf6,0x00,0x00]
 
-v_div_scale_f32 v0, vcc, s0, 0, s0
-// CHECK: [0x00,0x6a,0xe0,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_f_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xf8,0x00,0x00]
 
-v_div_scale_f32 v0, vcc, s0, 0.5, s0
-// CHECK: [0x00,0x6a,0xe0,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_f_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xfc,0x00,0x00]
 
-v_div_scale_f32 v0, vcc, s0, v0, s0
-// CHECK: [0x00,0x6a,0xe0,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_f_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xfe,0x00,0x00]
 
-v_div_scale_f32 v0, vcc, s0, v255, s0
-// CHECK: [0x00,0x6a,0xe0,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_f_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0x00,0x01,0x00]
 
-v_div_scale_f32 v0, vcc, s0, s0, 0
-// CHECK: [0x00,0x6a,0xe0,0xd1,0x00,0x00,0x00,0x02]
+v_cmpx_f_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xe0,0x01,0x00]
 
-v_div_scale_f32 v0, vcc, s0, s0, 0.5
-// CHECK: [0x00,0x6a,0xe0,0xd1,0x00,0x00,0xc0,0x03]
+v_cmpx_f_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xfa,0x01,0x00]
 
-v_div_scale_f32 v0, vcc, s0, s0, v0
-// CHECK: [0x00,0x6a,0xe0,0xd1,0x00,0x00,0x00,0x04]
+v_cmpx_f_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0x04,0x02,0x00]
 
-v_div_scale_f32 v0, vcc, s0, s0, v255
-// CHECK: [0x00,0x6a,0xe0,0xd1,0x00,0x00,0xfc,0x07]
+v_cmpx_f_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0xfe,0x03,0x00]
 
-v_div_scale_f64 v[0:1], vcc, s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x6a,0xe1,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x30,0xd0,0x80,0x04,0x00,0x40]
 
-v_div_scale_f64 v[254:255], vcc, s[0:1], s[0:1], s[0:1]
-// CHECK: [0xfe,0x6a,0xe1,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_f_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x30,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_scale_f64 v[0:1], vcc, 0, s[0:1], s[0:1]
-// CHECK: [0x00,0x6a,0xe1,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_lt_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x62,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, 0.5, s[0:1], s[0:1]
-// CHECK: [0x00,0x6a,0xe1,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_lt_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x62,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, v[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x6a,0xe1,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_lt_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x62,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, v[254:255], s[0:1], s[0:1]
-// CHECK: [0x00,0x6a,0xe1,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_lt_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x62,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x6a,0xe1,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_lt_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x62,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x6a,0xe1,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_lt_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x62,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x6a,0xe1,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_lt_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x62,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x6a,0xe1,0xd1,0x00,0xfc,0x03,0x00]
+v_cmpx_lt_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x62,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, s[0:1], s[0:1], 0
-// CHECK: [0x00,0x6a,0xe1,0xd1,0x00,0x00,0x00,0x02]
+v_cmpx_lt_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x62,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x6a,0xe1,0xd1,0x00,0x00,0xc0,0x03]
+v_cmpx_lt_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x62,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x6a,0xe1,0xd1,0x00,0x00,0x00,0x04]
+v_cmpx_lt_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x62,0x7c]
 
-v_div_scale_f64 v[0:1], vcc, s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x6a,0xe1,0xd1,0x00,0x00,0xf8,0x07]
+v_cmpx_lt_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x62,0x7c]
 
-v_div_fmas_f32 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x62,0x7c]
 
-v_div_fmas_f32 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xe2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x62,0x7c]
 
-v_div_fmas_f32 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xe2,0xd1,0xfd,0x00,0x00,0x00]
+v_cmpx_lt_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x62,0x7c]
 
-v_div_fmas_f32 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_lt_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x62,0x7c]
 
-v_div_fmas_f32 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xe2,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_lt_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x62,0x7c]
 
-v_div_fmas_f32 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0xfa,0x01,0x00]
+v_cmpx_lt_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x62,0x7c]
 
-v_div_fmas_f32 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_lt_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x62,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_lt_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x62,0x7c,0x56,0x34,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0x00,0xf4,0x03]
+v_cmpx_lt_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x62,0x7c]
 
-v_div_fmas_f32 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0x00,0x00,0x04]
+v_cmpx_lt_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x62,0x7c]
 
-v_div_fmas_f32 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0x00,0xfc,0x07]
+v_cmpx_lt_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x63,0x7c]
 
-v_div_fmas_f32 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0x00,0x00,0x20]
+v_cmpx_lt_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0x00,0x00,0x40]
+v_cmpx_lt_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x31,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0x00,0x00,0x80]
+v_cmpx_lt_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x31,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fmas_f32 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0x00,0x00,0xe0]
+v_cmpx_lt_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x31,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fmas_f32 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xe2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x31,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xe2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x31,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xe2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x31,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fmas_f32 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xe2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x31,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x80,0xe2,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x31,0xd0,0xf0,0x04,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0x00,0x00,0x08]
+v_cmpx_lt_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x31,0xd0,0x01,0x05,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0x00,0x00,0x10]
+v_cmpx_lt_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x31,0xd0,0xff,0x05,0x00,0x00]
 
-v_div_fmas_f32 v0, s0, s0, s0 div:2
-// CHECK: [0x00,0x00,0xe2,0xd1,0x00,0x00,0x00,0x18]
+v_cmpx_lt_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0xca,0x00,0x00]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0xcc,0x00,0x00]
 
-v_div_fmas_f64 v[254:255], s[0:1], s[0:1], s[0:1]
-// CHECK: [0xfe,0x00,0xe3,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0xce,0x00,0x00]
 
-v_div_fmas_f64 v[0:1], scc, s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd1,0xfd,0x00,0x00,0x00]
+v_cmpx_lt_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0xd4,0x00,0x00]
 
-v_div_fmas_f64 v[0:1], v[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_lt_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0xd6,0x00,0x00]
 
-v_div_fmas_f64 v[0:1], v[254:255], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_lt_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0xd8,0x00,0x00]
 
-v_div_fmas_f64 v[0:1], s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd1,0x00,0xfa,0x01,0x00]
+v_cmpx_lt_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0xda,0x00,0x00]
 
-v_div_fmas_f64 v[0:1], s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_lt_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0xdc,0x00,0x00]
 
-v_div_fmas_f64 v[0:1], s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd1,0x00,0xfc,0x03,0x00]
+v_cmpx_lt_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0xde,0x00,0x00]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0xe3,0xd1,0x00,0x00,0xf4,0x03]
+v_cmpx_lt_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0xf6,0x00,0x00]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd1,0x00,0x00,0x00,0x04]
+v_cmpx_lt_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0xf8,0x00,0x00]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe3,0xd1,0x00,0x00,0xf8,0x07]
+v_cmpx_lt_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0xfc,0x00,0x00]
 
-v_div_fmas_f64 v[0:1], -s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd1,0x00,0x00,0x00,0x20]
+v_cmpx_lt_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0xfe,0x00,0x00]
 
-v_div_fmas_f64 v[0:1], s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd1,0x00,0x00,0x00,0x40]
+v_cmpx_lt_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0x00,0x01,0x00]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd1,0x00,0x00,0x00,0x80]
+v_cmpx_lt_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0xe0,0x01,0x00]
 
-v_div_fmas_f64 v[0:1], -s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd1,0x00,0x00,0x00,0xe0]
+v_cmpx_lt_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0xfa,0x01,0x00]
 
-v_div_fmas_f64 v[0:1], |s[0:1]|, s[0:1], s[0:1]
-// CHECK: [0x00,0x01,0xe3,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0x04,0x02,0x00]
 
-v_div_fmas_f64 v[0:1], s[0:1], |s[0:1]|, s[0:1]
-// CHECK: [0x00,0x02,0xe3,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0xfe,0x03,0x00]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], |s[0:1]|
-// CHECK: [0x00,0x04,0xe3,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x31,0xd0,0x80,0x04,0x00,0x40]
 
-v_div_fmas_f64 v[0:1], |s[0:1]|, |s[0:1]|, |s[0:1]|
-// CHECK: [0x00,0x07,0xe3,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x31,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0xe3,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x64,0x7c]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0xe3,0xd1,0x00,0x00,0x00,0x08]
+v_cmpx_eq_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x64,0x7c]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0xe3,0xd1,0x00,0x00,0x00,0x10]
+v_cmpx_eq_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x64,0x7c]
 
-v_div_fmas_f64 v[0:1], s[0:1], s[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0xe3,0xd1,0x00,0x00,0x00,0x18]
+v_cmpx_eq_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x64,0x7c]
 
-v_msad_u8 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x64,0x7c]
 
-v_msad_u8 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xe4,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x64,0x7c]
 
-v_msad_u8 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xe4,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_eq_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x64,0x7c]
 
-v_msad_u8 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xe4,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_eq_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x64,0x7c]
 
-v_msad_u8 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xe4,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_eq_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x64,0x7c]
 
-v_msad_u8 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xe4,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_eq_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x64,0x7c]
 
-v_msad_u8 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_eq_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x64,0x7c]
 
-v_msad_u8 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xe4,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_eq_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x64,0x7c]
 
-v_msad_u8 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_eq_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x64,0x7c]
 
-v_msad_u8 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_eq_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x64,0x7c]
 
-v_msad_u8 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_eq_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x64,0x7c]
 
-v_msad_u8 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_eq_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x64,0x7c]
 
-v_msad_u8 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_eq_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x64,0x7c]
 
-v_msad_u8 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_eq_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x64,0x7c]
 
-v_msad_u8 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0x00,0x00,0x02]
+v_cmpx_eq_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x64,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_msad_u8 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0x00,0x04,0x03]
+v_cmpx_eq_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x64,0x7c,0x56,0x34,0x00,0x00]
 
-v_msad_u8 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0x00,0xc0,0x03]
+v_cmpx_eq_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x64,0x7c]
 
-v_msad_u8 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0x00,0xdc,0x03]
+v_cmpx_eq_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x64,0x7c]
 
-v_msad_u8 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0x00,0x00,0x04]
+v_cmpx_eq_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x65,0x7c]
 
-v_msad_u8 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xe4,0xd1,0x00,0x00,0xfc,0x07]
+v_cmpx_eq_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0x04,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], s0, s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x32,0xd0,0x80,0x04,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[254:255], s[0:1], s0, s[0:1]
-// CHECK: [0xfe,0x00,0xe5,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x32,0xd0,0x80,0x04,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], 0, s0, s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_eq_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x32,0xd0,0x80,0x04,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], -1, s0, s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_eq_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x32,0xd0,0x80,0x04,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], 0.5, s0, s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_eq_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x32,0xd0,0x80,0x04,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], -4.0, s0, s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_eq_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x32,0xd0,0x80,0x04,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], v[0:1], s0, s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_eq_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x32,0xd0,0x80,0x04,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], v[254:255], s0, s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_eq_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x32,0xd0,0xf0,0x04,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_eq_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x32,0xd0,0x01,0x05,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_eq_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x32,0xd0,0xff,0x05,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_eq_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xca,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_eq_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xcc,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], v0, s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_eq_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xce,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], v255, s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_eq_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xd4,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xe5,0xd1,0x00,0x00,0x00,0x02]
+v_cmpx_eq_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xd6,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xe5,0xd1,0x00,0x00,0x04,0x03]
+v_cmpx_eq_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xd8,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xe5,0xd1,0x00,0x00,0xc0,0x03]
+v_cmpx_eq_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xda,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xe5,0xd1,0x00,0x00,0xdc,0x03]
+v_cmpx_eq_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xdc,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], s0, v[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd1,0x00,0x00,0x00,0x04]
+v_cmpx_eq_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xde,0x00,0x00]
 
-v_qsad_pk_u16_u8 v[0:1], s[0:1], s0, v[254:255]
-// CHECK: [0x00,0x00,0xe5,0xd1,0x00,0x00,0xf8,0x07]
+v_cmpx_eq_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xf6,0x00,0x00]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], s0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xf8,0x00,0x00]
 
-v_mqsad_pk_u16_u8 v[254:255], s[0:1], s0, s[0:1]
-// CHECK: [0xfe,0x00,0xe6,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xfc,0x00,0x00]
 
-v_mqsad_pk_u16_u8 v[0:1], 0, s0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_eq_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xfe,0x00,0x00]
 
-v_mqsad_pk_u16_u8 v[0:1], -1, s0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_eq_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0x00,0x01,0x00]
 
-v_mqsad_pk_u16_u8 v[0:1], 0.5, s0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_eq_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xe0,0x01,0x00]
 
-v_mqsad_pk_u16_u8 v[0:1], -4.0, s0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_eq_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xfa,0x01,0x00]
 
-v_mqsad_pk_u16_u8 v[0:1], v[0:1], s0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_eq_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0x04,0x02,0x00]
 
-v_mqsad_pk_u16_u8 v[0:1], v[254:255], s0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0xfe,0x01,0x00,0x00]
+v_cmpx_eq_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0xfe,0x03,0x00]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_eq_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x32,0xd0,0x80,0x04,0x00,0x40]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_eq_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x32,0xd0,0x80,0x04,0x00,0x00]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_le_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x66,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_le_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x66,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], v0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_le_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x66,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], v255, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_le_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x66,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0x00,0x00,0x02]
+v_cmpx_le_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x66,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0x00,0x04,0x03]
+v_cmpx_le_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x66,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0x00,0xc0,0x03]
+v_cmpx_le_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x66,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0x00,0xdc,0x03]
+v_cmpx_le_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x66,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], s0, v[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0x00,0x00,0x04]
+v_cmpx_le_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x66,0x7c]
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], s0, v[254:255]
-// CHECK: [0x00,0x00,0xe6,0xd1,0x00,0x00,0xf8,0x07]
+v_cmpx_le_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x66,0x7c]
 
-v_mqsad_u32_u8 v[0:3], s[0:1], s0, v[0:3]
-// CHECK: [0x00,0x00,0xe7,0xd1,0x00,0x00,0x00,0x04]
+v_cmpx_le_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x66,0x7c]
 
-v_mqsad_u32_u8 v[252:255], s[0:1], s0, v[0:3]
-// CHECK: [0xfc,0x00,0xe7,0xd1,0x00,0x00,0x00,0x04]
+v_cmpx_le_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x66,0x7c]
 
-v_mqsad_u32_u8 v[0:3], 0, s0, v[0:3]
-// CHECK: [0x00,0x00,0xe7,0xd1,0x80,0x00,0x00,0x04]
+v_cmpx_le_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x66,0x7c]
 
-v_mqsad_u32_u8 v[0:3], -1, s0, v[0:3]
-// CHECK: [0x00,0x00,0xe7,0xd1,0xc1,0x00,0x00,0x04]
+v_cmpx_le_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x66,0x7c]
 
-v_mqsad_u32_u8 v[0:3], 0.5, s0, v[0:3]
-// CHECK: [0x00,0x00,0xe7,0xd1,0xf0,0x00,0x00,0x04]
+v_cmpx_le_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x66,0x7c]
 
-v_mqsad_u32_u8 v[0:3], -4.0, s0, v[0:3]
-// CHECK: [0x00,0x00,0xe7,0xd1,0xf7,0x00,0x00,0x04]
+v_cmpx_le_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x66,0x7c]
 
-v_mqsad_u32_u8 v[0:3], v[0:1], s0, v[0:3]
-// CHECK: [0x00,0x00,0xe7,0xd1,0x00,0x01,0x00,0x04]
+v_cmpx_le_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x66,0x7c]
 
-v_mqsad_u32_u8 v[0:3], v[254:255], s0, v[0:3]
-// CHECK: [0x00,0x00,0xe7,0xd1,0xfe,0x01,0x00,0x04]
+v_cmpx_le_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x66,0x7c]
 
-v_mqsad_u32_u8 v[0:3], s[0:1], 0, v[0:3]
-// CHECK: [0x00,0x00,0xe7,0xd1,0x00,0x00,0x01,0x04]
+v_cmpx_le_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x66,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_mqsad_u32_u8 v[0:3], s[0:1], -1, v[0:3]
-// CHECK: [0x00,0x00,0xe7,0xd1,0x00,0x82,0x01,0x04]
+v_cmpx_le_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x66,0x7c,0x56,0x34,0x00,0x00]
 
-v_mqsad_u32_u8 v[0:3], s[0:1], 0.5, v[0:3]
-// CHECK: [0x00,0x00,0xe7,0xd1,0x00,0xe0,0x01,0x04]
+v_cmpx_le_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x66,0x7c]
 
-v_mqsad_u32_u8 v[0:3], s[0:1], -4.0, v[0:3]
-// CHECK: [0x00,0x00,0xe7,0xd1,0x00,0xee,0x01,0x04]
+v_cmpx_le_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x66,0x7c]
 
-v_mqsad_u32_u8 v[0:3], s[0:1], v0, v[0:3]
-// CHECK: [0x00,0x00,0xe7,0xd1,0x00,0x00,0x02,0x04]
+v_cmpx_le_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x67,0x7c]
 
-v_mqsad_u32_u8 v[0:3], s[0:1], v255, v[0:3]
-// CHECK: [0x00,0x00,0xe7,0xd1,0x00,0xfe,0x03,0x04]
+v_cmpx_le_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0x04,0x00,0x00]
 
-v_mqsad_u32_u8 v[0:3], s[0:1], s0, v[252:255]
-// CHECK: [0x00,0x00,0xe7,0xd1,0x00,0x00,0xf0,0x07]
+v_cmpx_le_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x33,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_f16 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x33,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_f16 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xea,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x33,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_f16 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xea,0xd1,0xfd,0x00,0x00,0x00]
+v_cmpx_le_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x33,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_f16 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_le_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x33,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_f16 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xea,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_le_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x33,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_f16 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0xfa,0x01,0x00]
+v_cmpx_le_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x33,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_f16 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_le_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x33,0xd0,0xf0,0x04,0x00,0x00]
 
-v_mad_f16 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_le_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x33,0xd0,0x01,0x05,0x00,0x00]
 
-v_mad_f16 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0x00,0xf4,0x03]
+v_cmpx_le_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x33,0xd0,0xff,0x05,0x00,0x00]
 
-v_mad_f16 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0x00,0x00,0x04]
+v_cmpx_le_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0xca,0x00,0x00]
 
-v_mad_f16 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0x00,0xfc,0x07]
+v_cmpx_le_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0xcc,0x00,0x00]
 
-v_mad_f16 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0x00,0x00,0x20]
+v_cmpx_le_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0xce,0x00,0x00]
 
-v_mad_f16 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0x00,0x00,0x40]
+v_cmpx_le_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0xd4,0x00,0x00]
 
-v_mad_f16 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0x00,0x00,0x80]
+v_cmpx_le_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0xd6,0x00,0x00]
 
-v_mad_f16 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xea,0xd1,0x00,0x00,0x00,0xe0]
+v_cmpx_le_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0xd8,0x00,0x00]
 
-v_mad_f16 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xea,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0xda,0x00,0x00]
 
-v_mad_f16 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xea,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0xdc,0x00,0x00]
 
-v_mad_f16 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xea,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0xde,0x00,0x00]
 
-v_mad_f16 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xea,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0xf6,0x00,0x00]
 
-v_mad_f16 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x80,0xea,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0xf8,0x00,0x00]
 
-v_mad_u16 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xeb,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0xfc,0x00,0x00]
 
-v_mad_u16 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xeb,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_le_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0xfe,0x00,0x00]
 
-v_mad_u16 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xeb,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_le_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0x00,0x01,0x00]
 
-v_mad_u16 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xeb,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_le_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0xe0,0x01,0x00]
 
-v_mad_u16 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xeb,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_le_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0xfa,0x01,0x00]
 
-v_mad_u16 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xeb,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_le_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0x04,0x02,0x00]
 
-v_mad_u16 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xeb,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_le_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0xfe,0x03,0x00]
 
-v_mad_u16 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xeb,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_le_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x33,0xd0,0x80,0x04,0x00,0x40]
 
-v_mad_u16 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xeb,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_le_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x33,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_u16 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xeb,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_gt_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x68,0x7c]
 
-v_mad_u16 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xeb,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_gt_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x68,0x7c]
 
-v_mad_u16 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xeb,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_gt_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x68,0x7c]
 
-v_mad_u16 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xeb,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_gt_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x68,0x7c]
 
-v_mad_u16 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xeb,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_gt_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x68,0x7c]
 
-v_mad_u16 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xeb,0xd1,0x00,0x00,0x00,0x02]
+v_cmpx_gt_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x68,0x7c]
 
-v_mad_u16 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xeb,0xd1,0x00,0x00,0x04,0x03]
+v_cmpx_gt_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x68,0x7c]
 
-v_mad_u16 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xeb,0xd1,0x00,0x00,0xc0,0x03]
+v_cmpx_gt_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x68,0x7c]
 
-v_mad_u16 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xeb,0xd1,0x00,0x00,0xdc,0x03]
+v_cmpx_gt_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x68,0x7c]
 
-v_mad_u16 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xeb,0xd1,0x00,0x00,0x00,0x04]
+v_cmpx_gt_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x68,0x7c]
 
-v_mad_u16 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xeb,0xd1,0x00,0x00,0xfc,0x07]
+v_cmpx_gt_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x68,0x7c]
 
-v_mad_i16 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x68,0x7c]
 
-v_mad_i16 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xec,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x68,0x7c]
 
-v_mad_i16 v0, 0, s0, s0
-// CHECK: [0x00,0x00,0xec,0xd1,0x80,0x00,0x00,0x00]
+v_cmpx_gt_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x68,0x7c]
 
-v_mad_i16 v0, -1, s0, s0
-// CHECK: [0x00,0x00,0xec,0xd1,0xc1,0x00,0x00,0x00]
+v_cmpx_gt_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x68,0x7c]
 
-v_mad_i16 v0, 0.5, s0, s0
-// CHECK: [0x00,0x00,0xec,0xd1,0xf0,0x00,0x00,0x00]
+v_cmpx_gt_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x68,0x7c]
 
-v_mad_i16 v0, -4.0, s0, s0
-// CHECK: [0x00,0x00,0xec,0xd1,0xf7,0x00,0x00,0x00]
+v_cmpx_gt_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x68,0x7c]
 
-v_mad_i16 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_gt_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x68,0x7c]
 
-v_mad_i16 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xec,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_gt_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x68,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_mad_i16 v0, s0, 0, s0
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_gt_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x68,0x7c,0x56,0x34,0x00,0x00]
 
-v_mad_i16 v0, s0, -1, s0
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_gt_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x68,0x7c]
 
-v_mad_i16 v0, s0, 0.5, s0
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_gt_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x68,0x7c]
 
-v_mad_i16 v0, s0, -4.0, s0
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_gt_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x69,0x7c]
 
-v_mad_i16 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_gt_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_i16 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_gt_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x34,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_i16 v0, s0, s0, 0
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0x00,0x00,0x02]
+v_cmpx_gt_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x34,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_i16 v0, s0, s0, -1
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0x00,0x04,0x03]
+v_cmpx_gt_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x34,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_i16 v0, s0, s0, 0.5
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0x00,0xc0,0x03]
+v_cmpx_gt_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x34,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_i16 v0, s0, s0, -4.0
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0x00,0xdc,0x03]
+v_cmpx_gt_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x34,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_i16 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0x00,0x00,0x04]
+v_cmpx_gt_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x34,0xd0,0x80,0x04,0x00,0x00]
 
-v_mad_i16 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xec,0xd1,0x00,0x00,0xfc,0x07]
+v_cmpx_gt_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x34,0xd0,0x80,0x04,0x00,0x00]
 
-v_fma_f16 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x34,0xd0,0xf0,0x04,0x00,0x00]
 
-v_fma_f16 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xee,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x34,0xd0,0x01,0x05,0x00,0x00]
 
-v_fma_f16 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xee,0xd1,0xfd,0x00,0x00,0x00]
+v_cmpx_gt_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x34,0xd0,0xff,0x05,0x00,0x00]
 
-v_fma_f16 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_gt_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xca,0x00,0x00]
 
-v_fma_f16 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xee,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_gt_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xcc,0x00,0x00]
 
-v_fma_f16 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0xfa,0x01,0x00]
+v_cmpx_gt_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xce,0x00,0x00]
 
-v_fma_f16 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_gt_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xd4,0x00,0x00]
 
-v_fma_f16 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_gt_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xd6,0x00,0x00]
 
-v_fma_f16 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0x00,0xf4,0x03]
+v_cmpx_gt_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xd8,0x00,0x00]
 
-v_fma_f16 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0x00,0x00,0x04]
+v_cmpx_gt_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xda,0x00,0x00]
 
-v_fma_f16 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0x00,0xfc,0x07]
+v_cmpx_gt_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xdc,0x00,0x00]
 
-v_fma_f16 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0x00,0x00,0x20]
+v_cmpx_gt_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xde,0x00,0x00]
 
-v_fma_f16 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0x00,0x00,0x40]
+v_cmpx_gt_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xf6,0x00,0x00]
 
-v_fma_f16 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0x00,0x00,0x80]
+v_cmpx_gt_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xf8,0x00,0x00]
 
-v_fma_f16 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xee,0xd1,0x00,0x00,0x00,0xe0]
+v_cmpx_gt_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xfc,0x00,0x00]
 
-v_fma_f16 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xee,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xfe,0x00,0x00]
 
-v_fma_f16 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xee,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0x00,0x01,0x00]
 
-v_fma_f16 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xee,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xe0,0x01,0x00]
 
-v_fma_f16 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xee,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xfa,0x01,0x00]
 
-v_fma_f16 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x80,0xee,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0x04,0x02,0x00]
 
-v_div_fixup_f16 v0, s0, s0, s0
-// CHECK: [0x00,0x00,0xef,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0xfe,0x03,0x00]
 
-v_div_fixup_f16 v255, s0, s0, s0
-// CHECK: [0xff,0x00,0xef,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x34,0xd0,0x80,0x04,0x00,0x40]
 
-v_div_fixup_f16 v0, scc, s0, s0
-// CHECK: [0x00,0x00,0xef,0xd1,0xfd,0x00,0x00,0x00]
+v_cmpx_gt_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x34,0xd0,0x80,0x04,0x00,0x00]
 
-v_div_fixup_f16 v0, v0, s0, s0
-// CHECK: [0x00,0x00,0xef,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_lg_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x6a,0x7c]
 
-v_div_fixup_f16 v0, v255, s0, s0
-// CHECK: [0x00,0x00,0xef,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_lg_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x6a,0x7c]
 
-v_div_fixup_f16 v0, s0, scc, s0
-// CHECK: [0x00,0x00,0xef,0xd1,0x00,0xfa,0x01,0x00]
+v_cmpx_lg_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x6a,0x7c]
 
-v_div_fixup_f16 v0, s0, v0, s0
-// CHECK: [0x00,0x00,0xef,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_lg_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x6a,0x7c]
 
-v_div_fixup_f16 v0, s0, v255, s0
-// CHECK: [0x00,0x00,0xef,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_lg_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x6a,0x7c]
 
-v_div_fixup_f16 v0, s0, s0, scc
-// CHECK: [0x00,0x00,0xef,0xd1,0x00,0x00,0xf4,0x03]
+v_cmpx_lg_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x6a,0x7c]
 
-v_div_fixup_f16 v0, s0, s0, v0
-// CHECK: [0x00,0x00,0xef,0xd1,0x00,0x00,0x00,0x04]
+v_cmpx_lg_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x6a,0x7c]
 
-v_div_fixup_f16 v0, s0, s0, v255
-// CHECK: [0x00,0x00,0xef,0xd1,0x00,0x00,0xfc,0x07]
+v_cmpx_lg_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x6a,0x7c]
 
-v_div_fixup_f16 v0, -s0, s0, s0
-// CHECK: [0x00,0x00,0xef,0xd1,0x00,0x00,0x00,0x20]
+v_cmpx_lg_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x6a,0x7c]
 
-v_div_fixup_f16 v0, s0, -s0, s0
-// CHECK: [0x00,0x00,0xef,0xd1,0x00,0x00,0x00,0x40]
+v_cmpx_lg_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x6a,0x7c]
 
-v_div_fixup_f16 v0, s0, s0, -s0
-// CHECK: [0x00,0x00,0xef,0xd1,0x00,0x00,0x00,0x80]
+v_cmpx_lg_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x6a,0x7c]
 
-v_div_fixup_f16 v0, -s0, -s0, -s0
-// CHECK: [0x00,0x00,0xef,0xd1,0x00,0x00,0x00,0xe0]
+v_cmpx_lg_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x6a,0x7c]
 
-v_div_fixup_f16 v0, |s0|, s0, s0
-// CHECK: [0x00,0x01,0xef,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x6a,0x7c]
 
-v_div_fixup_f16 v0, s0, |s0|, s0
-// CHECK: [0x00,0x02,0xef,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x6a,0x7c]
 
-v_div_fixup_f16 v0, s0, s0, |s0|
-// CHECK: [0x00,0x04,0xef,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x6a,0x7c]
 
-v_div_fixup_f16 v0, |s0|, |s0|, |s0|
-// CHECK: [0x00,0x07,0xef,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x6a,0x7c]
 
-v_div_fixup_f16 v0, s0, s0, s0 clamp
-// CHECK: [0x00,0x80,0xef,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x6a,0x7c]
 
-v_cvt_pkaccum_u8_f32 v0, s0, s0
-// CHECK: [0x00,0x00,0xf0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x6a,0x7c]
 
-v_cvt_pkaccum_u8_f32 v255, s0, s0
-// CHECK: [0xff,0x00,0xf0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x6a,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, scc, s0
-// CHECK: [0x00,0x00,0xf0,0xd1,0xfd,0x00,0x00,0x00]
+v_cmpx_lg_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x6a,0x7c,0x56,0x34,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, v0, s0
-// CHECK: [0x00,0x00,0xf0,0xd1,0x00,0x01,0x00,0x00]
+v_cmpx_lg_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x6a,0x7c]
 
-v_cvt_pkaccum_u8_f32 v0, v255, s0
-// CHECK: [0x00,0x00,0xf0,0xd1,0xff,0x01,0x00,0x00]
+v_cmpx_lg_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x6a,0x7c]
 
-v_cvt_pkaccum_u8_f32 v0, s0, 0
-// CHECK: [0x00,0x00,0xf0,0xd1,0x00,0x00,0x01,0x00]
+v_cmpx_lg_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x6b,0x7c]
 
-v_cvt_pkaccum_u8_f32 v0, s0, -1
-// CHECK: [0x00,0x00,0xf0,0xd1,0x00,0x82,0x01,0x00]
+v_cmpx_lg_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, s0, 0.5
-// CHECK: [0x00,0x00,0xf0,0xd1,0x00,0xe0,0x01,0x00]
+v_cmpx_lg_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x35,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, s0, -4.0
-// CHECK: [0x00,0x00,0xf0,0xd1,0x00,0xee,0x01,0x00]
+v_cmpx_lg_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x35,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, s0, scc
-// CHECK: [0x00,0x00,0xf0,0xd1,0x00,0xfa,0x01,0x00]
+v_cmpx_lg_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x35,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0xf0,0xd1,0x00,0x00,0x02,0x00]
+v_cmpx_lg_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x35,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, s0, v255
-// CHECK: [0x00,0x00,0xf0,0xd1,0x00,0xfe,0x03,0x00]
+v_cmpx_lg_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x35,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, -s0, s0
-// CHECK: [0x00,0x00,0xf0,0xd1,0x00,0x00,0x00,0x20]
+v_cmpx_lg_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x35,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, |s0|, s0
-// CHECK: [0x00,0x01,0xf0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x35,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pkaccum_u8_f32 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0xf0,0xd1,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x35,0xd0,0xf0,0x04,0x00,0x00]
 
-v_add_f64 v[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x35,0xd0,0x01,0x05,0x00,0x00]
 
-v_add_f64 v[254:255], s[0:1], s[0:1]
-// CHECK: [0xfe,0x00,0x80,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x35,0xd0,0xff,0x05,0x00,0x00]
 
-v_add_f64 v[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x80,0xd2,0xfd,0x00,0x00,0x00]
+v_cmpx_lg_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0xca,0x00,0x00]
 
-v_add_f64 v[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_lg_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0xcc,0x00,0x00]
 
-v_add_f64 v[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x80,0xd2,0xfe,0x01,0x00,0x00]
+v_cmpx_lg_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0xce,0x00,0x00]
 
-v_add_f64 v[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0xfa,0x01,0x00]
+v_cmpx_lg_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0xd4,0x00,0x00]
 
-v_add_f64 v[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_lg_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0xd6,0x00,0x00]
 
-v_add_f64 v[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0xfc,0x03,0x00]
+v_cmpx_lg_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0xd8,0x00,0x00]
 
-v_add_f64 v[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0x00,0x20]
+v_cmpx_lg_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0xda,0x00,0x00]
 
-v_add_f64 v[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0x00,0x40]
+v_cmpx_lg_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0xdc,0x00,0x00]
 
-v_add_f64 v[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0x00,0x60]
+v_cmpx_lg_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0xde,0x00,0x00]
 
-v_add_f64 v[0:1], |s[0:1]|, s[0:1]
-// CHECK: [0x00,0x01,0x80,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0xf6,0x00,0x00]
 
-v_add_f64 v[0:1], s[0:1], |s[0:1]|
-// CHECK: [0x00,0x02,0x80,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0xf8,0x00,0x00]
 
-v_add_f64 v[0:1], |s[0:1]|, |s[0:1]|
-// CHECK: [0x00,0x03,0x80,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0xfc,0x00,0x00]
 
-v_add_f64 v[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x80,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0xfe,0x00,0x00]
 
-v_add_f64 v[0:1], s[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0x00,0x08]
+v_cmpx_lg_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0x00,0x01,0x00]
 
-v_add_f64 v[0:1], s[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0x00,0x10]
+v_cmpx_lg_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0xe0,0x01,0x00]
 
-v_add_f64 v[0:1], s[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x00,0x00,0x18]
+v_cmpx_lg_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0xfa,0x01,0x00]
 
-v_mul_f64 v[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x81,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0x04,0x02,0x00]
 
-v_mul_f64 v[254:255], s[0:1], s[0:1]
-// CHECK: [0xfe,0x00,0x81,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0xfe,0x03,0x00]
 
-v_mul_f64 v[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x81,0xd2,0xfd,0x00,0x00,0x00]
+v_cmpx_lg_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x35,0xd0,0x80,0x04,0x00,0x40]
 
-v_mul_f64 v[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x81,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_lg_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x35,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_f64 v[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x81,0xd2,0xfe,0x01,0x00,0x00]
+v_cmpx_ge_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x6c,0x7c]
 
-v_mul_f64 v[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x81,0xd2,0x00,0xfa,0x01,0x00]
+v_cmpx_ge_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x6c,0x7c]
 
-v_mul_f64 v[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x81,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_ge_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x6c,0x7c]
 
-v_mul_f64 v[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x81,0xd2,0x00,0xfc,0x03,0x00]
+v_cmpx_ge_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x6c,0x7c]
 
-v_mul_f64 v[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x81,0xd2,0x00,0x00,0x00,0x20]
+v_cmpx_ge_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x6c,0x7c]
 
-v_mul_f64 v[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x81,0xd2,0x00,0x00,0x00,0x40]
+v_cmpx_ge_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x6c,0x7c]
 
-v_mul_f64 v[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x81,0xd2,0x00,0x00,0x00,0x60]
+v_cmpx_ge_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x6c,0x7c]
 
-v_mul_f64 v[0:1], |s[0:1]|, s[0:1]
-// CHECK: [0x00,0x01,0x81,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x6c,0x7c]
 
-v_mul_f64 v[0:1], s[0:1], |s[0:1]|
-// CHECK: [0x00,0x02,0x81,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x6c,0x7c]
 
-v_mul_f64 v[0:1], |s[0:1]|, |s[0:1]|
-// CHECK: [0x00,0x03,0x81,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x6c,0x7c]
 
-v_mul_f64 v[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x81,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x6c,0x7c]
 
-v_mul_f64 v[0:1], s[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x81,0xd2,0x00,0x00,0x00,0x08]
+v_cmpx_ge_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x6c,0x7c]
 
-v_mul_f64 v[0:1], s[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x81,0xd2,0x00,0x00,0x00,0x10]
+v_cmpx_ge_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x6c,0x7c]
 
-v_mul_f64 v[0:1], s[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x81,0xd2,0x00,0x00,0x00,0x18]
+v_cmpx_ge_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x6c,0x7c]
 
-v_min_f64 v[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x6c,0x7c]
 
-v_min_f64 v[254:255], s[0:1], s[0:1]
-// CHECK: [0xfe,0x00,0x82,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x6c,0x7c]
 
-v_min_f64 v[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x82,0xd2,0xfd,0x00,0x00,0x00]
+v_cmpx_ge_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x6c,0x7c]
 
-v_min_f64 v[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_ge_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x6c,0x7c]
 
-v_min_f64 v[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x82,0xd2,0xfe,0x01,0x00,0x00]
+v_cmpx_ge_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x6c,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_min_f64 v[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0xfa,0x01,0x00]
+v_cmpx_ge_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x6c,0x7c,0x56,0x34,0x00,0x00]
 
-v_min_f64 v[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_ge_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x6c,0x7c]
 
-v_min_f64 v[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0xfc,0x03,0x00]
+v_cmpx_ge_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x6c,0x7c]
 
-v_min_f64 v[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0x00,0x20]
+v_cmpx_ge_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x6d,0x7c]
 
-v_min_f64 v[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0x00,0x40]
+v_cmpx_ge_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_f64 v[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0x00,0x60]
+v_cmpx_ge_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x36,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_f64 v[0:1], |s[0:1]|, s[0:1]
-// CHECK: [0x00,0x01,0x82,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x36,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_f64 v[0:1], s[0:1], |s[0:1]|
-// CHECK: [0x00,0x02,0x82,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x36,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_f64 v[0:1], |s[0:1]|, |s[0:1]|
-// CHECK: [0x00,0x03,0x82,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x36,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_f64 v[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x82,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x36,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_f64 v[0:1], s[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0x00,0x08]
+v_cmpx_ge_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x36,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_f64 v[0:1], s[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0x00,0x10]
+v_cmpx_ge_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x36,0xd0,0x80,0x04,0x00,0x00]
 
-v_min_f64 v[0:1], s[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x82,0xd2,0x00,0x00,0x00,0x18]
+v_cmpx_ge_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x36,0xd0,0xf0,0x04,0x00,0x00]
 
-v_max_f64 v[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x83,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x36,0xd0,0x01,0x05,0x00,0x00]
 
-v_max_f64 v[254:255], s[0:1], s[0:1]
-// CHECK: [0xfe,0x00,0x83,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x36,0xd0,0xff,0x05,0x00,0x00]
 
-v_max_f64 v[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x83,0xd2,0xfd,0x00,0x00,0x00]
+v_cmpx_ge_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xca,0x00,0x00]
 
-v_max_f64 v[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x83,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_ge_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xcc,0x00,0x00]
 
-v_max_f64 v[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x83,0xd2,0xfe,0x01,0x00,0x00]
+v_cmpx_ge_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xce,0x00,0x00]
 
-v_max_f64 v[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x83,0xd2,0x00,0xfa,0x01,0x00]
+v_cmpx_ge_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xd4,0x00,0x00]
 
-v_max_f64 v[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x83,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_ge_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xd6,0x00,0x00]
 
-v_max_f64 v[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x83,0xd2,0x00,0xfc,0x03,0x00]
+v_cmpx_ge_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xd8,0x00,0x00]
 
-v_max_f64 v[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x83,0xd2,0x00,0x00,0x00,0x20]
+v_cmpx_ge_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xda,0x00,0x00]
 
-v_max_f64 v[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x83,0xd2,0x00,0x00,0x00,0x40]
+v_cmpx_ge_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xdc,0x00,0x00]
 
-v_max_f64 v[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x83,0xd2,0x00,0x00,0x00,0x60]
+v_cmpx_ge_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xde,0x00,0x00]
 
-v_max_f64 v[0:1], |s[0:1]|, s[0:1]
-// CHECK: [0x00,0x01,0x83,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xf6,0x00,0x00]
 
-v_max_f64 v[0:1], s[0:1], |s[0:1]|
-// CHECK: [0x00,0x02,0x83,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xf8,0x00,0x00]
 
-v_max_f64 v[0:1], |s[0:1]|, |s[0:1]|
-// CHECK: [0x00,0x03,0x83,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xfc,0x00,0x00]
 
-v_max_f64 v[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x83,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xfe,0x00,0x00]
 
-v_max_f64 v[0:1], s[0:1], s[0:1] mul:2
-// CHECK: [0x00,0x00,0x83,0xd2,0x00,0x00,0x00,0x08]
+v_cmpx_ge_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0x00,0x01,0x00]
 
-v_max_f64 v[0:1], s[0:1], s[0:1] mul:4
-// CHECK: [0x00,0x00,0x83,0xd2,0x00,0x00,0x00,0x10]
+v_cmpx_ge_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xe0,0x01,0x00]
 
-v_max_f64 v[0:1], s[0:1], s[0:1] div:2
-// CHECK: [0x00,0x00,0x83,0xd2,0x00,0x00,0x00,0x18]
+v_cmpx_ge_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xfa,0x01,0x00]
 
-v_ldexp_f64 v[0:1], s[0:1], s0
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0x04,0x02,0x00]
 
-v_ldexp_f64 v[254:255], s[0:1], s0
-// CHECK: [0xfe,0x00,0x84,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0xfe,0x03,0x00]
 
-v_ldexp_f64 v[0:1], scc, s0
-// CHECK: [0x00,0x00,0x84,0xd2,0xfd,0x00,0x00,0x00]
+v_cmpx_ge_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x36,0xd0,0x80,0x04,0x00,0x40]
 
-v_ldexp_f64 v[0:1], v[0:1], s0
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_ge_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x36,0xd0,0x80,0x04,0x00,0x00]
 
-v_ldexp_f64 v[0:1], v[254:255], s0
-// CHECK: [0x00,0x00,0x84,0xd2,0xfe,0x01,0x00,0x00]
+v_cmpx_o_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x6e,0x7c]
 
-v_ldexp_f64 v[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x00,0x01,0x00]
+v_cmpx_o_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x6e,0x7c]
 
-v_ldexp_f64 v[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x82,0x01,0x00]
+v_cmpx_o_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x6e,0x7c]
 
-v_ldexp_f64 v[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0xe0,0x01,0x00]
+v_cmpx_o_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x6e,0x7c]
 
-v_ldexp_f64 v[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0xee,0x01,0x00]
+v_cmpx_o_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x6e,0x7c]
 
-v_ldexp_f64 v[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0xfa,0x01,0x00]
+v_cmpx_o_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x6e,0x7c]
 
-v_ldexp_f64 v[0:1], s[0:1], v0
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_o_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x6e,0x7c]
 
-v_ldexp_f64 v[0:1], s[0:1], v255
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0xfe,0x03,0x00]
+v_cmpx_o_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x6e,0x7c]
 
-v_ldexp_f64 v[0:1], -s[0:1], s0
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x00,0x00,0x20]
+v_cmpx_o_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x6e,0x7c]
 
-v_ldexp_f64 v[0:1], |s[0:1]|, s0
-// CHECK: [0x00,0x01,0x84,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_o_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x6e,0x7c]
 
-v_ldexp_f64 v[0:1], s[0:1], s0 clamp
-// CHECK: [0x00,0x80,0x84,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_o_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x6e,0x7c]
 
-v_ldexp_f64 v[0:1], s[0:1], s0 mul:2
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x00,0x00,0x08]
+v_cmpx_o_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x6e,0x7c]
 
-v_ldexp_f64 v[0:1], s[0:1], s0 mul:4
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x00,0x00,0x10]
+v_cmpx_o_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x6e,0x7c]
 
-v_ldexp_f64 v[0:1], s[0:1], s0 div:2
-// CHECK: [0x00,0x00,0x84,0xd2,0x00,0x00,0x00,0x18]
+v_cmpx_o_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x6e,0x7c]
 
-v_mul_lo_u32 v0, s0, s0
-// CHECK: [0x00,0x00,0x85,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_o_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x6e,0x7c]
 
-v_mul_lo_u32 v255, s0, s0
-// CHECK: [0xff,0x00,0x85,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_o_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x6e,0x7c]
 
-v_mul_lo_u32 v0, 0, s0
-// CHECK: [0x00,0x00,0x85,0xd2,0x80,0x00,0x00,0x00]
+v_cmpx_o_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x6e,0x7c]
 
-v_mul_lo_u32 v0, -1, s0
-// CHECK: [0x00,0x00,0x85,0xd2,0xc1,0x00,0x00,0x00]
+v_cmpx_o_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x6e,0x7c]
 
-v_mul_lo_u32 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x85,0xd2,0xf0,0x00,0x00,0x00]
+v_cmpx_o_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x6e,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_mul_lo_u32 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x85,0xd2,0xf7,0x00,0x00,0x00]
+v_cmpx_o_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x6e,0x7c,0x56,0x34,0x00,0x00]
 
-v_mul_lo_u32 v0, v0, s0
-// CHECK: [0x00,0x00,0x85,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_o_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x6e,0x7c]
 
-v_mul_lo_u32 v0, v255, s0
-// CHECK: [0x00,0x00,0x85,0xd2,0xff,0x01,0x00,0x00]
+v_cmpx_o_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x6e,0x7c]
 
-v_mul_lo_u32 v0, s0, 0
-// CHECK: [0x00,0x00,0x85,0xd2,0x00,0x00,0x01,0x00]
+v_cmpx_o_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x6f,0x7c]
 
-v_mul_lo_u32 v0, s0, -1
-// CHECK: [0x00,0x00,0x85,0xd2,0x00,0x82,0x01,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_lo_u32 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x85,0xd2,0x00,0xe0,0x01,0x00]
+v_cmpx_o_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x37,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_lo_u32 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x85,0xd2,0x00,0xee,0x01,0x00]
+v_cmpx_o_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x37,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_lo_u32 v0, s0, v0
-// CHECK: [0x00,0x00,0x85,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_o_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x37,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_lo_u32 v0, s0, v255
-// CHECK: [0x00,0x00,0x85,0xd2,0x00,0xfe,0x03,0x00]
+v_cmpx_o_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x37,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_hi_u32 v0, s0, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_o_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x37,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_hi_u32 v255, s0, s0
-// CHECK: [0xff,0x00,0x86,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_o_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x37,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_hi_u32 v0, 0, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0x80,0x00,0x00,0x00]
+v_cmpx_o_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x37,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_hi_u32 v0, -1, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0xc1,0x00,0x00,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x37,0xd0,0xf0,0x04,0x00,0x00]
 
-v_mul_hi_u32 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0xf0,0x00,0x00,0x00]
+v_cmpx_o_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x37,0xd0,0x01,0x05,0x00,0x00]
 
-v_mul_hi_u32 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0xf7,0x00,0x00,0x00]
+v_cmpx_o_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x37,0xd0,0xff,0x05,0x00,0x00]
 
-v_mul_hi_u32 v0, v0, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0xca,0x00,0x00]
 
-v_mul_hi_u32 v0, v255, s0
-// CHECK: [0x00,0x00,0x86,0xd2,0xff,0x01,0x00,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0xcc,0x00,0x00]
 
-v_mul_hi_u32 v0, s0, 0
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0x00,0x01,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0xce,0x00,0x00]
 
-v_mul_hi_u32 v0, s0, -1
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0x82,0x01,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0xd4,0x00,0x00]
 
-v_mul_hi_u32 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0xe0,0x01,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0xd6,0x00,0x00]
 
-v_mul_hi_u32 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0xee,0x01,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0xd8,0x00,0x00]
 
-v_mul_hi_u32 v0, s0, v0
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0xda,0x00,0x00]
 
-v_mul_hi_u32 v0, s0, v255
-// CHECK: [0x00,0x00,0x86,0xd2,0x00,0xfe,0x03,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0xdc,0x00,0x00]
 
-v_mul_hi_i32 v0, s0, s0
-// CHECK: [0x00,0x00,0x87,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0xde,0x00,0x00]
 
-v_mul_hi_i32 v255, s0, s0
-// CHECK: [0xff,0x00,0x87,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0xf6,0x00,0x00]
 
-v_mul_hi_i32 v0, 0, s0
-// CHECK: [0x00,0x00,0x87,0xd2,0x80,0x00,0x00,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0xf8,0x00,0x00]
 
-v_mul_hi_i32 v0, -1, s0
-// CHECK: [0x00,0x00,0x87,0xd2,0xc1,0x00,0x00,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0xfc,0x00,0x00]
 
-v_mul_hi_i32 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x87,0xd2,0xf0,0x00,0x00,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0xfe,0x00,0x00]
 
-v_mul_hi_i32 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x87,0xd2,0xf7,0x00,0x00,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0x00,0x01,0x00]
 
-v_mul_hi_i32 v0, v0, s0
-// CHECK: [0x00,0x00,0x87,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0xe0,0x01,0x00]
 
-v_mul_hi_i32 v0, v255, s0
-// CHECK: [0x00,0x00,0x87,0xd2,0xff,0x01,0x00,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0xfa,0x01,0x00]
 
-v_mul_hi_i32 v0, s0, 0
-// CHECK: [0x00,0x00,0x87,0xd2,0x00,0x00,0x01,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0x04,0x02,0x00]
 
-v_mul_hi_i32 v0, s0, -1
-// CHECK: [0x00,0x00,0x87,0xd2,0x00,0x82,0x01,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0xfe,0x03,0x00]
 
-v_mul_hi_i32 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x87,0xd2,0x00,0xe0,0x01,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x37,0xd0,0x80,0x04,0x00,0x40]
 
-v_mul_hi_i32 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x87,0xd2,0x00,0xee,0x01,0x00]
+v_cmpx_o_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x37,0xd0,0x80,0x04,0x00,0x00]
 
-v_mul_hi_i32 v0, s0, v0
-// CHECK: [0x00,0x00,0x87,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_u_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x70,0x7c]
 
-v_mul_hi_i32 v0, s0, v255
-// CHECK: [0x00,0x00,0x87,0xd2,0x00,0xfe,0x03,0x00]
+v_cmpx_u_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x70,0x7c]
 
-v_ldexp_f32 v0, s0, s0
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_u_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x70,0x7c]
 
-v_ldexp_f32 v255, s0, s0
-// CHECK: [0xff,0x00,0x88,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_u_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x70,0x7c]
 
-v_ldexp_f32 v0, scc, s0
-// CHECK: [0x00,0x00,0x88,0xd2,0xfd,0x00,0x00,0x00]
+v_cmpx_u_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x70,0x7c]
 
-v_ldexp_f32 v0, v0, s0
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_u_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x70,0x7c]
 
-v_ldexp_f32 v0, v255, s0
-// CHECK: [0x00,0x00,0x88,0xd2,0xff,0x01,0x00,0x00]
+v_cmpx_u_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x70,0x7c]
 
-v_ldexp_f32 v0, s0, 0
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x00,0x01,0x00]
+v_cmpx_u_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x70,0x7c]
 
-v_ldexp_f32 v0, s0, -1
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x82,0x01,0x00]
+v_cmpx_u_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x70,0x7c]
 
-v_ldexp_f32 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0xe0,0x01,0x00]
+v_cmpx_u_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x70,0x7c]
 
-v_ldexp_f32 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0xee,0x01,0x00]
+v_cmpx_u_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x70,0x7c]
 
-v_ldexp_f32 v0, s0, scc
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0xfa,0x01,0x00]
+v_cmpx_u_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x70,0x7c]
 
-v_ldexp_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_u_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x70,0x7c]
 
-v_ldexp_f32 v0, s0, v255
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0xfe,0x03,0x00]
+v_cmpx_u_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x70,0x7c]
 
-v_ldexp_f32 v0, -s0, s0
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x00,0x00,0x20]
+v_cmpx_u_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x70,0x7c]
 
-v_ldexp_f32 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x88,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_u_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x70,0x7c]
 
-v_ldexp_f32 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x88,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_u_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x70,0x7c]
 
-v_ldexp_f32 v0, s0, s0 mul:2
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x00,0x00,0x08]
+v_cmpx_u_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x70,0x7c]
 
-v_ldexp_f32 v0, s0, s0 mul:4
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x00,0x00,0x10]
+v_cmpx_u_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x70,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_ldexp_f32 v0, s0, s0 div:2
-// CHECK: [0x00,0x00,0x88,0xd2,0x00,0x00,0x00,0x18]
+v_cmpx_u_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x70,0x7c,0x56,0x34,0x00,0x00]
 
-v_readlane_b32 s0, v0, s0
-// CHECK: [0x00,0x00,0x89,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_u_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x70,0x7c]
 
-v_readlane_b32 s101, v0, s0
-// CHECK: [0x65,0x00,0x89,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_u_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x70,0x7c]
 
-v_readlane_b32 flat_scratch_lo, v0, s0
-// CHECK: [0x66,0x00,0x89,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_u_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x71,0x7c]
 
-v_readlane_b32 flat_scratch_hi, v0, s0
-// CHECK: [0x67,0x00,0x89,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0x04,0x00,0x00]
 
-v_readlane_b32 tba_lo, v0, s0
-// CHECK: [0x6c,0x00,0x89,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_u_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x38,0xd0,0x80,0x04,0x00,0x00]
 
-v_readlane_b32 tba_hi, v0, s0
-// CHECK: [0x6d,0x00,0x89,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_u_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x38,0xd0,0x80,0x04,0x00,0x00]
 
-v_readlane_b32 tma_lo, v0, s0
-// CHECK: [0x6e,0x00,0x89,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_u_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x38,0xd0,0x80,0x04,0x00,0x00]
 
-v_readlane_b32 tma_hi, v0, s0
-// CHECK: [0x6f,0x00,0x89,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_u_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x38,0xd0,0x80,0x04,0x00,0x00]
 
-v_readlane_b32 ttmp11, v0, s0
-// CHECK: [0x7b,0x00,0x89,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_u_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x38,0xd0,0x80,0x04,0x00,0x00]
 
-v_readlane_b32 s0, v255, s0
-// CHECK: [0x00,0x00,0x89,0xd2,0xff,0x01,0x00,0x00]
+v_cmpx_u_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x38,0xd0,0x80,0x04,0x00,0x00]
 
-v_readlane_b32 s0, v0, s101
-// CHECK: [0x00,0x00,0x89,0xd2,0x00,0xcb,0x00,0x00]
+v_cmpx_u_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x38,0xd0,0x80,0x04,0x00,0x00]
 
-v_readlane_b32 s0, v0, flat_scratch_lo
-// CHECK: [0x00,0x00,0x89,0xd2,0x00,0xcd,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x38,0xd0,0xf0,0x04,0x00,0x00]
 
-v_readlane_b32 s0, v0, flat_scratch_hi
-// CHECK: [0x00,0x00,0x89,0xd2,0x00,0xcf,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x38,0xd0,0x01,0x05,0x00,0x00]
 
-v_readlane_b32 s0, v0, vcc_lo
-// CHECK: [0x00,0x00,0x89,0xd2,0x00,0xd5,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x38,0xd0,0xff,0x05,0x00,0x00]
 
-v_readlane_b32 s0, v0, vcc_hi
-// CHECK: [0x00,0x00,0x89,0xd2,0x00,0xd7,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xca,0x00,0x00]
 
-v_readlane_b32 s0, v0, tba_lo
-// CHECK: [0x00,0x00,0x89,0xd2,0x00,0xd9,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xcc,0x00,0x00]
 
-v_readlane_b32 s0, v0, tba_hi
-// CHECK: [0x00,0x00,0x89,0xd2,0x00,0xdb,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xce,0x00,0x00]
 
-v_readlane_b32 s0, v0, tma_lo
-// CHECK: [0x00,0x00,0x89,0xd2,0x00,0xdd,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xd4,0x00,0x00]
 
-v_readlane_b32 s0, v0, tma_hi
-// CHECK: [0x00,0x00,0x89,0xd2,0x00,0xdf,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xd6,0x00,0x00]
 
-v_readlane_b32 s0, v0, ttmp11
-// CHECK: [0x00,0x00,0x89,0xd2,0x00,0xf7,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xd8,0x00,0x00]
 
-v_readlane_b32 s0, v0, m0
-// CHECK: [0x00,0x00,0x89,0xd2,0x00,0xf9,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xda,0x00,0x00]
 
-v_readlane_b32 s0, v0, 0
-// CHECK: [0x00,0x00,0x89,0xd2,0x00,0x01,0x01,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xdc,0x00,0x00]
 
-v_writelane_b32 v0, s0, s0
-// CHECK: [0x00,0x00,0x8a,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xde,0x00,0x00]
 
-v_writelane_b32 v255, s0, s0
-// CHECK: [0xff,0x00,0x8a,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xf6,0x00,0x00]
 
-v_writelane_b32 v0, s0, 0
-// CHECK: [0x00,0x00,0x8a,0xd2,0x00,0x00,0x01,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xf8,0x00,0x00]
 
-v_bcnt_u32_b32 v0, s0, s0
-// CHECK: [0x00,0x00,0x8b,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xfc,0x00,0x00]
 
-v_bcnt_u32_b32 v255, s0, s0
-// CHECK: [0xff,0x00,0x8b,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xfe,0x00,0x00]
 
-v_bcnt_u32_b32 v0, 0, s0
-// CHECK: [0x00,0x00,0x8b,0xd2,0x80,0x00,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0x00,0x01,0x00]
 
-v_bcnt_u32_b32 v0, -1, s0
-// CHECK: [0x00,0x00,0x8b,0xd2,0xc1,0x00,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xe0,0x01,0x00]
 
-v_bcnt_u32_b32 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x8b,0xd2,0xf0,0x00,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xfa,0x01,0x00]
 
-v_bcnt_u32_b32 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x8b,0xd2,0xf7,0x00,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0x04,0x02,0x00]
 
-v_bcnt_u32_b32 v0, v0, s0
-// CHECK: [0x00,0x00,0x8b,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0xfe,0x03,0x00]
 
-v_bcnt_u32_b32 v0, v255, s0
-// CHECK: [0x00,0x00,0x8b,0xd2,0xff,0x01,0x00,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x38,0xd0,0x80,0x04,0x00,0x40]
 
-v_bcnt_u32_b32 v0, s0, 0
-// CHECK: [0x00,0x00,0x8b,0xd2,0x00,0x00,0x01,0x00]
+v_cmpx_u_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x38,0xd0,0x80,0x04,0x00,0x00]
 
-v_bcnt_u32_b32 v0, s0, -1
-// CHECK: [0x00,0x00,0x8b,0xd2,0x00,0x82,0x01,0x00]
+v_cmpx_nge_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x72,0x7c]
 
-v_bcnt_u32_b32 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x8b,0xd2,0x00,0xe0,0x01,0x00]
+v_cmpx_nge_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x72,0x7c]
 
-v_bcnt_u32_b32 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x8b,0xd2,0x00,0xee,0x01,0x00]
+v_cmpx_nge_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x72,0x7c]
 
-v_bcnt_u32_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x8b,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_nge_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x72,0x7c]
 
-v_bcnt_u32_b32 v0, s0, v255
-// CHECK: [0x00,0x00,0x8b,0xd2,0x00,0xfe,0x03,0x00]
+v_cmpx_nge_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x72,0x7c]
 
-v_mbcnt_lo_u32_b32 v0, s0, s0
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x72,0x7c]
 
-v_mbcnt_lo_u32_b32 v255, s0, s0
-// CHECK: [0xff,0x00,0x8c,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x72,0x7c]
 
-v_mbcnt_lo_u32_b32 v0, 0, s0
-// CHECK: [0x00,0x00,0x8c,0xd2,0x80,0x00,0x00,0x00]
+v_cmpx_nge_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x72,0x7c]
 
-v_mbcnt_lo_u32_b32 v0, -1, s0
-// CHECK: [0x00,0x00,0x8c,0xd2,0xc1,0x00,0x00,0x00]
+v_cmpx_nge_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x72,0x7c]
 
-v_mbcnt_lo_u32_b32 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x8c,0xd2,0xf0,0x00,0x00,0x00]
+v_cmpx_nge_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x72,0x7c]
 
-v_mbcnt_lo_u32_b32 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x8c,0xd2,0xf7,0x00,0x00,0x00]
+v_cmpx_nge_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x72,0x7c]
 
-v_mbcnt_lo_u32_b32 v0, v0, s0
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_nge_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x72,0x7c]
 
-v_mbcnt_lo_u32_b32 v0, v255, s0
-// CHECK: [0x00,0x00,0x8c,0xd2,0xff,0x01,0x00,0x00]
+v_cmpx_nge_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x72,0x7c]
 
-v_mbcnt_lo_u32_b32 v0, s0, 0
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0x00,0x01,0x00]
+v_cmpx_nge_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x72,0x7c]
 
-v_mbcnt_lo_u32_b32 v0, s0, -1
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0x82,0x01,0x00]
+v_cmpx_nge_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x72,0x7c]
 
-v_mbcnt_lo_u32_b32 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0xe0,0x01,0x00]
+v_cmpx_nge_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x72,0x7c]
 
-v_mbcnt_lo_u32_b32 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0xee,0x01,0x00]
+v_cmpx_nge_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x72,0x7c]
 
-v_mbcnt_lo_u32_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_nge_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x72,0x7c]
 
-v_mbcnt_lo_u32_b32 v0, s0, v255
-// CHECK: [0x00,0x00,0x8c,0xd2,0x00,0xfe,0x03,0x00]
+v_cmpx_nge_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x72,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_mbcnt_hi_u32_b32 v0, s0, s0
-// CHECK: [0x00,0x00,0x8d,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x72,0x7c,0x56,0x34,0x00,0x00]
 
-v_mbcnt_hi_u32_b32 v255, s0, s0
-// CHECK: [0xff,0x00,0x8d,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x72,0x7c]
 
-v_mbcnt_hi_u32_b32 v0, 0, s0
-// CHECK: [0x00,0x00,0x8d,0xd2,0x80,0x00,0x00,0x00]
+v_cmpx_nge_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x72,0x7c]
 
-v_mbcnt_hi_u32_b32 v0, -1, s0
-// CHECK: [0x00,0x00,0x8d,0xd2,0xc1,0x00,0x00,0x00]
+v_cmpx_nge_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x73,0x7c]
 
-v_mbcnt_hi_u32_b32 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x8d,0xd2,0xf0,0x00,0x00,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0x04,0x00,0x00]
 
-v_mbcnt_hi_u32_b32 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x8d,0xd2,0xf7,0x00,0x00,0x00]
+v_cmpx_nge_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x39,0xd0,0x80,0x04,0x00,0x00]
 
-v_mbcnt_hi_u32_b32 v0, v0, s0
-// CHECK: [0x00,0x00,0x8d,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_nge_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x39,0xd0,0x80,0x04,0x00,0x00]
 
-v_mbcnt_hi_u32_b32 v0, v255, s0
-// CHECK: [0x00,0x00,0x8d,0xd2,0xff,0x01,0x00,0x00]
+v_cmpx_nge_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x39,0xd0,0x80,0x04,0x00,0x00]
 
-v_mbcnt_hi_u32_b32 v0, s0, 0
-// CHECK: [0x00,0x00,0x8d,0xd2,0x00,0x00,0x01,0x00]
+v_cmpx_nge_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x39,0xd0,0x80,0x04,0x00,0x00]
 
-v_mbcnt_hi_u32_b32 v0, s0, -1
-// CHECK: [0x00,0x00,0x8d,0xd2,0x00,0x82,0x01,0x00]
+v_cmpx_nge_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x39,0xd0,0x80,0x04,0x00,0x00]
 
-v_mbcnt_hi_u32_b32 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x8d,0xd2,0x00,0xe0,0x01,0x00]
+v_cmpx_nge_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x39,0xd0,0x80,0x04,0x00,0x00]
 
-v_mbcnt_hi_u32_b32 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x8d,0xd2,0x00,0xee,0x01,0x00]
+v_cmpx_nge_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x39,0xd0,0x80,0x04,0x00,0x00]
 
-v_mbcnt_hi_u32_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x8d,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x39,0xd0,0xf0,0x04,0x00,0x00]
 
-v_mbcnt_hi_u32_b32 v0, s0, v255
-// CHECK: [0x00,0x00,0x8d,0xd2,0x00,0xfe,0x03,0x00]
+v_cmpx_nge_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x39,0xd0,0x01,0x05,0x00,0x00]
 
-v_lshlrev_b64 v[0:1], s0, s[0:1]
-// CHECK: [0x00,0x00,0x8f,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x39,0xd0,0xff,0x05,0x00,0x00]
 
-v_lshlrev_b64 v[254:255], s0, s[0:1]
-// CHECK: [0xfe,0x00,0x8f,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0xca,0x00,0x00]
 
-v_lshlrev_b64 v[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x8f,0xd2,0x80,0x00,0x00,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0xcc,0x00,0x00]
 
-v_lshlrev_b64 v[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0x8f,0xd2,0xc1,0x00,0x00,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0xce,0x00,0x00]
 
-v_lshlrev_b64 v[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x8f,0xd2,0xf0,0x00,0x00,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0xd4,0x00,0x00]
 
-v_lshlrev_b64 v[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x8f,0xd2,0xf7,0x00,0x00,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0xd6,0x00,0x00]
 
-v_lshlrev_b64 v[0:1], v0, s[0:1]
-// CHECK: [0x00,0x00,0x8f,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0xd8,0x00,0x00]
 
-v_lshlrev_b64 v[0:1], v255, s[0:1]
-// CHECK: [0x00,0x00,0x8f,0xd2,0xff,0x01,0x00,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0xda,0x00,0x00]
 
-v_lshlrev_b64 v[0:1], s0, 0
-// CHECK: [0x00,0x00,0x8f,0xd2,0x00,0x00,0x01,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0xdc,0x00,0x00]
 
-v_lshlrev_b64 v[0:1], s0, -1
-// CHECK: [0x00,0x00,0x8f,0xd2,0x00,0x82,0x01,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0xde,0x00,0x00]
 
-v_lshlrev_b64 v[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x8f,0xd2,0x00,0xe0,0x01,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0xf6,0x00,0x00]
 
-v_lshlrev_b64 v[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x8f,0xd2,0x00,0xee,0x01,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0xf8,0x00,0x00]
 
-v_lshlrev_b64 v[0:1], s0, v[0:1]
-// CHECK: [0x00,0x00,0x8f,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0xfc,0x00,0x00]
 
-v_lshlrev_b64 v[0:1], s0, v[254:255]
-// CHECK: [0x00,0x00,0x8f,0xd2,0x00,0xfc,0x03,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0xfe,0x00,0x00]
 
-v_lshrrev_b64 v[0:1], s0, s[0:1]
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0x00,0x01,0x00]
 
-v_lshrrev_b64 v[254:255], s0, s[0:1]
-// CHECK: [0xfe,0x00,0x90,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0xe0,0x01,0x00]
 
-v_lshrrev_b64 v[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x90,0xd2,0x80,0x00,0x00,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0xfa,0x01,0x00]
 
-v_lshrrev_b64 v[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0x90,0xd2,0xc1,0x00,0x00,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0x04,0x02,0x00]
 
-v_lshrrev_b64 v[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x90,0xd2,0xf0,0x00,0x00,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0xfe,0x03,0x00]
 
-v_lshrrev_b64 v[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x90,0xd2,0xf7,0x00,0x00,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x39,0xd0,0x80,0x04,0x00,0x40]
 
-v_lshrrev_b64 v[0:1], v0, s[0:1]
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_nge_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x39,0xd0,0x80,0x04,0x00,0x00]
 
-v_lshrrev_b64 v[0:1], v255, s[0:1]
-// CHECK: [0x00,0x00,0x90,0xd2,0xff,0x01,0x00,0x00]
+v_cmpx_nlg_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x74,0x7c]
 
-v_lshrrev_b64 v[0:1], s0, 0
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0x00,0x01,0x00]
+v_cmpx_nlg_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x74,0x7c]
 
-v_lshrrev_b64 v[0:1], s0, -1
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0x82,0x01,0x00]
+v_cmpx_nlg_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x74,0x7c]
 
-v_lshrrev_b64 v[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0xe0,0x01,0x00]
+v_cmpx_nlg_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x74,0x7c]
 
-v_lshrrev_b64 v[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0xee,0x01,0x00]
+v_cmpx_nlg_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x74,0x7c]
 
-v_lshrrev_b64 v[0:1], s0, v[0:1]
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_nlg_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x74,0x7c]
 
-v_lshrrev_b64 v[0:1], s0, v[254:255]
-// CHECK: [0x00,0x00,0x90,0xd2,0x00,0xfc,0x03,0x00]
+v_cmpx_nlg_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x74,0x7c]
 
-v_ashrrev_i64 v[0:1], s0, s[0:1]
-// CHECK: [0x00,0x00,0x91,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x74,0x7c]
 
-v_ashrrev_i64 v[254:255], s0, s[0:1]
-// CHECK: [0xfe,0x00,0x91,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x74,0x7c]
 
-v_ashrrev_i64 v[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x91,0xd2,0x80,0x00,0x00,0x00]
+v_cmpx_nlg_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x74,0x7c]
 
-v_ashrrev_i64 v[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0x91,0xd2,0xc1,0x00,0x00,0x00]
+v_cmpx_nlg_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x74,0x7c]
 
-v_ashrrev_i64 v[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x91,0xd2,0xf0,0x00,0x00,0x00]
+v_cmpx_nlg_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x74,0x7c]
 
-v_ashrrev_i64 v[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0x91,0xd2,0xf7,0x00,0x00,0x00]
+v_cmpx_nlg_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x74,0x7c]
 
-v_ashrrev_i64 v[0:1], v0, s[0:1]
-// CHECK: [0x00,0x00,0x91,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_nlg_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x74,0x7c]
 
-v_ashrrev_i64 v[0:1], v255, s[0:1]
-// CHECK: [0x00,0x00,0x91,0xd2,0xff,0x01,0x00,0x00]
+v_cmpx_nlg_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x74,0x7c]
 
-v_ashrrev_i64 v[0:1], s0, 0
-// CHECK: [0x00,0x00,0x91,0xd2,0x00,0x00,0x01,0x00]
+v_cmpx_nlg_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x74,0x7c]
 
-v_ashrrev_i64 v[0:1], s0, -1
-// CHECK: [0x00,0x00,0x91,0xd2,0x00,0x82,0x01,0x00]
+v_cmpx_nlg_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x74,0x7c]
 
-v_ashrrev_i64 v[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x91,0xd2,0x00,0xe0,0x01,0x00]
+v_cmpx_nlg_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x74,0x7c]
 
-v_ashrrev_i64 v[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0x91,0xd2,0x00,0xee,0x01,0x00]
+v_cmpx_nlg_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x74,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_ashrrev_i64 v[0:1], s0, v[0:1]
-// CHECK: [0x00,0x00,0x91,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_nlg_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x74,0x7c,0x56,0x34,0x00,0x00]
 
-v_ashrrev_i64 v[0:1], s0, v[254:255]
-// CHECK: [0x00,0x00,0x91,0xd2,0x00,0xfc,0x03,0x00]
+v_cmpx_nlg_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x74,0x7c]
 
-v_trig_preop_f64 v[0:1], s[0:1], s0
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x74,0x7c]
 
-v_trig_preop_f64 v[254:255], s[0:1], s0
-// CHECK: [0xfe,0x00,0x92,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x75,0x7c]
 
-v_trig_preop_f64 v[0:1], scc, s0
-// CHECK: [0x00,0x00,0x92,0xd2,0xfd,0x00,0x00,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0x04,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], v[0:1], s0
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_nlg_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x3a,0xd0,0x80,0x04,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], v[254:255], s0
-// CHECK: [0x00,0x00,0x92,0xd2,0xfe,0x01,0x00,0x00]
+v_cmpx_nlg_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x3a,0xd0,0x80,0x04,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x00,0x01,0x00]
+v_cmpx_nlg_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x3a,0xd0,0x80,0x04,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x82,0x01,0x00]
+v_cmpx_nlg_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x3a,0xd0,0x80,0x04,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0xe0,0x01,0x00]
+v_cmpx_nlg_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x3a,0xd0,0x80,0x04,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0xee,0x01,0x00]
+v_cmpx_nlg_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x3a,0xd0,0x80,0x04,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0xfa,0x01,0x00]
+v_cmpx_nlg_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x3a,0xd0,0x80,0x04,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], v0
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x3a,0xd0,0xf0,0x04,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], v255
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0xfe,0x03,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x01,0x05,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], -s[0:1], s0
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x00,0x00,0x20]
+v_cmpx_nlg_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x3a,0xd0,0xff,0x05,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], |s[0:1]|, s0
-// CHECK: [0x00,0x01,0x92,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xca,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], s0 clamp
-// CHECK: [0x00,0x80,0x92,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xcc,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], s0 mul:2
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x00,0x00,0x08]
+v_cmpx_nlg_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xce,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], s0 mul:4
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x00,0x00,0x10]
+v_cmpx_nlg_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xd4,0x00,0x00]
 
-v_trig_preop_f64 v[0:1], s[0:1], s0 div:2
-// CHECK: [0x00,0x00,0x92,0xd2,0x00,0x00,0x00,0x18]
+v_cmpx_nlg_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xd6,0x00,0x00]
 
-v_bfm_b32 v0, s0, s0
-// CHECK: [0x00,0x00,0x93,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xd8,0x00,0x00]
 
-v_bfm_b32 v255, s0, s0
-// CHECK: [0xff,0x00,0x93,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xda,0x00,0x00]
 
-v_bfm_b32 v0, 0, s0
-// CHECK: [0x00,0x00,0x93,0xd2,0x80,0x00,0x00,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xdc,0x00,0x00]
 
-v_bfm_b32 v0, -1, s0
-// CHECK: [0x00,0x00,0x93,0xd2,0xc1,0x00,0x00,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xde,0x00,0x00]
 
-v_bfm_b32 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x93,0xd2,0xf0,0x00,0x00,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xf6,0x00,0x00]
 
-v_bfm_b32 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x93,0xd2,0xf7,0x00,0x00,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xf8,0x00,0x00]
 
-v_bfm_b32 v0, v0, s0
-// CHECK: [0x00,0x00,0x93,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xfc,0x00,0x00]
 
-v_bfm_b32 v0, v255, s0
-// CHECK: [0x00,0x00,0x93,0xd2,0xff,0x01,0x00,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xfe,0x00,0x00]
 
-v_bfm_b32 v0, s0, 0
-// CHECK: [0x00,0x00,0x93,0xd2,0x00,0x00,0x01,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0x00,0x01,0x00]
 
-v_bfm_b32 v0, s0, -1
-// CHECK: [0x00,0x00,0x93,0xd2,0x00,0x82,0x01,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xe0,0x01,0x00]
 
-v_bfm_b32 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x93,0xd2,0x00,0xe0,0x01,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xfa,0x01,0x00]
 
-v_bfm_b32 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x93,0xd2,0x00,0xee,0x01,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0x04,0x02,0x00]
 
-v_bfm_b32 v0, s0, v0
-// CHECK: [0x00,0x00,0x93,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0xfe,0x03,0x00]
 
-v_bfm_b32 v0, s0, v255
-// CHECK: [0x00,0x00,0x93,0xd2,0x00,0xfe,0x03,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x3a,0xd0,0x80,0x04,0x00,0x40]
 
-v_cvt_pknorm_i16_f32 v0, s0, s0
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x3a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pknorm_i16_f32 v255, s0, s0
-// CHECK: [0xff,0x00,0x94,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x76,0x7c]
 
-v_cvt_pknorm_i16_f32 v0, scc, s0
-// CHECK: [0x00,0x00,0x94,0xd2,0xfd,0x00,0x00,0x00]
+v_cmpx_ngt_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x76,0x7c]
 
-v_cvt_pknorm_i16_f32 v0, v0, s0
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_ngt_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x76,0x7c]
 
-v_cvt_pknorm_i16_f32 v0, v255, s0
-// CHECK: [0x00,0x00,0x94,0xd2,0xff,0x01,0x00,0x00]
+v_cmpx_ngt_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x76,0x7c]
 
-v_cvt_pknorm_i16_f32 v0, s0, scc
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0xfa,0x01,0x00]
+v_cmpx_ngt_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x76,0x7c]
 
-v_cvt_pknorm_i16_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_ngt_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x76,0x7c]
 
-v_cvt_pknorm_i16_f32 v0, s0, v255
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0xfe,0x03,0x00]
+v_cmpx_ngt_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x76,0x7c]
 
-v_cvt_pknorm_i16_f32 v0, -s0, s0
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0x00,0x00,0x20]
+v_cmpx_ngt_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x76,0x7c]
 
-v_cvt_pknorm_i16_f32 v0, s0, -s0
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0x00,0x00,0x40]
+v_cmpx_ngt_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x76,0x7c]
 
-v_cvt_pknorm_i16_f32 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x94,0xd2,0x00,0x00,0x00,0x60]
+v_cmpx_ngt_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x76,0x7c]
 
-v_cvt_pknorm_i16_f32 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x94,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x76,0x7c]
 
-v_cvt_pknorm_i16_f32 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x94,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x76,0x7c]
 
-v_cvt_pknorm_i16_f32 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x94,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x76,0x7c]
 
-v_cvt_pknorm_i16_f32 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x94,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x76,0x7c]
 
-v_cvt_pknorm_u16_f32 v0, s0, s0
-// CHECK: [0x00,0x00,0x95,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x76,0x7c]
 
-v_cvt_pknorm_u16_f32 v255, s0, s0
-// CHECK: [0xff,0x00,0x95,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x76,0x7c]
 
-v_cvt_pknorm_u16_f32 v0, scc, s0
-// CHECK: [0x00,0x00,0x95,0xd2,0xfd,0x00,0x00,0x00]
+v_cmpx_ngt_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x76,0x7c]
 
-v_cvt_pknorm_u16_f32 v0, v0, s0
-// CHECK: [0x00,0x00,0x95,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_ngt_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x76,0x7c]
 
-v_cvt_pknorm_u16_f32 v0, v255, s0
-// CHECK: [0x00,0x00,0x95,0xd2,0xff,0x01,0x00,0x00]
+v_cmpx_ngt_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x76,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_cvt_pknorm_u16_f32 v0, s0, scc
-// CHECK: [0x00,0x00,0x95,0xd2,0x00,0xfa,0x01,0x00]
+v_cmpx_ngt_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x76,0x7c,0x56,0x34,0x00,0x00]
 
-v_cvt_pknorm_u16_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x95,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_ngt_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x76,0x7c]
 
-v_cvt_pknorm_u16_f32 v0, s0, v255
-// CHECK: [0x00,0x00,0x95,0xd2,0x00,0xfe,0x03,0x00]
+v_cmpx_ngt_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x76,0x7c]
 
-v_cvt_pknorm_u16_f32 v0, -s0, s0
-// CHECK: [0x00,0x00,0x95,0xd2,0x00,0x00,0x00,0x20]
+v_cmpx_ngt_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x77,0x7c]
 
-v_cvt_pknorm_u16_f32 v0, s0, -s0
-// CHECK: [0x00,0x00,0x95,0xd2,0x00,0x00,0x00,0x40]
+v_cmpx_ngt_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pknorm_u16_f32 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x95,0xd2,0x00,0x00,0x00,0x60]
+v_cmpx_ngt_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x3b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pknorm_u16_f32 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x95,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x3b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pknorm_u16_f32 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x95,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x3b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pknorm_u16_f32 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x95,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x3b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pknorm_u16_f32 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x95,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x3b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, s0, s0
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x3b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v255, s0, s0
-// CHECK: [0xff,0x00,0x96,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x3b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, scc, s0
-// CHECK: [0x00,0x00,0x96,0xd2,0xfd,0x00,0x00,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x3b,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, v0, s0
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x01,0x05,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, v255, s0
-// CHECK: [0x00,0x00,0x96,0xd2,0xff,0x01,0x00,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x3b,0xd0,0xff,0x05,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, s0, scc
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0xfa,0x01,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0xca,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, s0, v0
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, s0, v255
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0xfe,0x03,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0xce,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, -s0, s0
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0x00,0x00,0x20]
+v_cmpx_ngt_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, s0, -s0
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0x00,0x00,0x40]
+v_cmpx_ngt_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, -s0, -s0
-// CHECK: [0x00,0x00,0x96,0xd2,0x00,0x00,0x00,0x60]
+v_cmpx_ngt_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, |s0|, s0
-// CHECK: [0x00,0x01,0x96,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0xda,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, s0, |s0|
-// CHECK: [0x00,0x02,0x96,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, |s0|, |s0|
-// CHECK: [0x00,0x03,0x96,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0xde,0x00,0x00]
 
-v_cvt_pkrtz_f16_f32 v0, s0, s0 clamp
-// CHECK: [0x00,0x80,0x96,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, s0, s0
-// CHECK: [0x00,0x00,0x97,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cvt_pk_u16_u32 v255, s0, s0
-// CHECK: [0xff,0x00,0x97,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, 0, s0
-// CHECK: [0x00,0x00,0x97,0xd2,0x80,0x00,0x00,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, -1, s0
-// CHECK: [0x00,0x00,0x97,0xd2,0xc1,0x00,0x00,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0x00,0x01,0x00]
 
-v_cvt_pk_u16_u32 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x97,0xd2,0xf0,0x00,0x00,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cvt_pk_u16_u32 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x97,0xd2,0xf7,0x00,0x00,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cvt_pk_u16_u32 v0, v0, s0
-// CHECK: [0x00,0x00,0x97,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0x04,0x02,0x00]
 
-v_cvt_pk_u16_u32 v0, v255, s0
-// CHECK: [0x00,0x00,0x97,0xd2,0xff,0x01,0x00,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cvt_pk_u16_u32 v0, s0, 0
-// CHECK: [0x00,0x00,0x97,0xd2,0x00,0x00,0x01,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x3b,0xd0,0x80,0x04,0x00,0x40]
 
-v_cvt_pk_u16_u32 v0, s0, -1
-// CHECK: [0x00,0x00,0x97,0xd2,0x00,0x82,0x01,0x00]
+v_cmpx_ngt_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x3b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cvt_pk_u16_u32 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x97,0xd2,0x00,0xe0,0x01,0x00]
+v_cmpx_nle_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x78,0x7c]
 
-v_cvt_pk_u16_u32 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x97,0xd2,0x00,0xee,0x01,0x00]
+v_cmpx_nle_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x78,0x7c]
 
-v_cvt_pk_u16_u32 v0, s0, v0
-// CHECK: [0x00,0x00,0x97,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_nle_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x78,0x7c]
 
-v_cvt_pk_u16_u32 v0, s0, v255
-// CHECK: [0x00,0x00,0x97,0xd2,0x00,0xfe,0x03,0x00]
+v_cmpx_nle_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x78,0x7c]
 
-v_cvt_pk_i16_i32 v0, s0, s0
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x78,0x7c]
 
-v_cvt_pk_i16_i32 v255, s0, s0
-// CHECK: [0xff,0x00,0x98,0xd2,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x78,0x7c]
 
-v_cvt_pk_i16_i32 v0, 0, s0
-// CHECK: [0x00,0x00,0x98,0xd2,0x80,0x00,0x00,0x00]
+v_cmpx_nle_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x78,0x7c]
 
-v_cvt_pk_i16_i32 v0, -1, s0
-// CHECK: [0x00,0x00,0x98,0xd2,0xc1,0x00,0x00,0x00]
+v_cmpx_nle_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x78,0x7c]
 
-v_cvt_pk_i16_i32 v0, 0.5, s0
-// CHECK: [0x00,0x00,0x98,0xd2,0xf0,0x00,0x00,0x00]
+v_cmpx_nle_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x78,0x7c]
 
-v_cvt_pk_i16_i32 v0, -4.0, s0
-// CHECK: [0x00,0x00,0x98,0xd2,0xf7,0x00,0x00,0x00]
+v_cmpx_nle_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x78,0x7c]
 
-v_cvt_pk_i16_i32 v0, v0, s0
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0x01,0x00,0x00]
+v_cmpx_nle_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x78,0x7c]
 
-v_cvt_pk_i16_i32 v0, v255, s0
-// CHECK: [0x00,0x00,0x98,0xd2,0xff,0x01,0x00,0x00]
+v_cmpx_nle_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x78,0x7c]
 
-v_cvt_pk_i16_i32 v0, s0, 0
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0x00,0x01,0x00]
+v_cmpx_nle_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x78,0x7c]
 
-v_cvt_pk_i16_i32 v0, s0, -1
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0x82,0x01,0x00]
+v_cmpx_nle_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x78,0x7c]
 
-v_cvt_pk_i16_i32 v0, s0, 0.5
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0xe0,0x01,0x00]
+v_cmpx_nle_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x78,0x7c]
 
-v_cvt_pk_i16_i32 v0, s0, -4.0
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0xee,0x01,0x00]
+v_cmpx_nle_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x78,0x7c]
 
-v_cvt_pk_i16_i32 v0, s0, v0
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0x00,0x02,0x00]
+v_cmpx_nle_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x78,0x7c]
 
-v_cvt_pk_i16_i32 v0, s0, v255
-// CHECK: [0x00,0x00,0x98,0xd2,0x00,0xfe,0x03,0x00]
+v_cmpx_nle_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x78,0x7c]
 
-v_cmp_class_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x20,0x7c]
+v_cmpx_nle_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x78,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_cmp_class_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x20,0x7c]
+v_cmpx_nle_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x78,0x7c,0x56,0x34,0x00,0x00]
 
-v_cmp_class_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x20,0x7c]
+v_cmpx_nle_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x78,0x7c]
 
-v_cmp_class_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x20,0x7c]
+v_cmpx_nle_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x78,0x7c]
 
-v_cmp_class_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x20,0x7c]
+v_cmpx_nle_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x79,0x7c]
 
-v_cmp_class_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x20,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_class_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x20,0x7c]
+v_cmpx_nle_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x3c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_class_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x20,0x7c]
+v_cmpx_nle_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x3c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_class_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x20,0x7c]
+v_cmpx_nle_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x3c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_class_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x20,0x7c]
+v_cmpx_nle_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x3c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_class_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x20,0x7c]
+v_cmpx_nle_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x3c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_class_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x20,0x7c]
+v_cmpx_nle_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x3c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_class_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x20,0x7c]
+v_cmpx_nle_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x3c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_class_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x20,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x3c,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_class_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x20,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_class_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x20,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x3c,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_class_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x20,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_class_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x20,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_class_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x20,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_nle_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_class_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x20,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_nle_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_class_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x20,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_class_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x20,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_class_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x21,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_class_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x22,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_class_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x22,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_class_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x22,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_class_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x22,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_class_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x22,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_class_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x22,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_class_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x22,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_class_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x22,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_class_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x22,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_class_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x22,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_class_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x22,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_class_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x22,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x3c,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_class_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x22,0x7c]
+v_cmpx_nle_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x3c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_class_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x22,0x7c]
+v_cmpx_neq_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x7a,0x7c]
 
-v_cmpx_class_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x22,0x7c]
+v_cmpx_neq_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x7a,0x7c]
 
-v_cmpx_class_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x22,0x7c]
+v_cmpx_neq_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x7a,0x7c]
 
-v_cmpx_class_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x22,0x7c]
+v_cmpx_neq_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x7a,0x7c]
 
-v_cmpx_class_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x22,0x7c]
+v_cmpx_neq_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x7a,0x7c]
 
-v_cmpx_class_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x22,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_neq_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x7a,0x7c]
 
-v_cmpx_class_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x22,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_neq_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x7a,0x7c]
 
-v_cmpx_class_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x22,0x7c]
+v_cmpx_neq_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x7a,0x7c]
 
-v_cmpx_class_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x22,0x7c]
+v_cmpx_neq_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x7a,0x7c]
 
-v_cmpx_class_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x23,0x7c]
+v_cmpx_neq_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x7a,0x7c]
 
-v_cmp_class_f64 vcc, s[0:1], v0
-// CHECK: [0x00,0x00,0x24,0x7c]
+v_cmpx_neq_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x7a,0x7c]
 
-v_cmp_class_f64 vcc, s[2:3], v0
-// CHECK: [0x02,0x00,0x24,0x7c]
+v_cmpx_neq_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x7a,0x7c]
 
-v_cmp_class_f64 vcc, s[100:101], v0
-// CHECK: [0x64,0x00,0x24,0x7c]
+v_cmpx_neq_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x7a,0x7c]
 
-v_cmp_class_f64 vcc, flat_scratch, v0
-// CHECK: [0x66,0x00,0x24,0x7c]
+v_cmpx_neq_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x7a,0x7c]
 
-v_cmp_class_f64 vcc, vcc, v0
-// CHECK: [0x6a,0x00,0x24,0x7c]
+v_cmpx_neq_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x7a,0x7c]
 
-v_cmp_class_f64 vcc, tba, v0
-// CHECK: [0x6c,0x00,0x24,0x7c]
+v_cmpx_neq_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x7a,0x7c]
 
-v_cmp_class_f64 vcc, tma, v0
-// CHECK: [0x6e,0x00,0x24,0x7c]
+v_cmpx_neq_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x7a,0x7c]
 
-v_cmp_class_f64 vcc, ttmp[10:11], v0
-// CHECK: [0x7a,0x00,0x24,0x7c]
+v_cmpx_neq_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x7a,0x7c]
 
-v_cmp_class_f64 vcc, exec, v0
-// CHECK: [0x7e,0x00,0x24,0x7c]
+v_cmpx_neq_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x7a,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_cmp_class_f64 vcc, 0, v0
-// CHECK: [0x80,0x00,0x24,0x7c]
+v_cmpx_neq_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x7a,0x7c,0x56,0x34,0x00,0x00]
 
-v_cmp_class_f64 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x24,0x7c]
+v_cmpx_neq_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x7a,0x7c]
 
-v_cmp_class_f64 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x24,0x7c]
+v_cmpx_neq_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x7a,0x7c]
 
-v_cmp_class_f64 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x24,0x7c]
+v_cmpx_neq_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x7b,0x7c]
 
-v_cmp_class_f64 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x24,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_neq_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_class_f64 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x24,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_neq_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x3d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_class_f64 vcc, v[0:1], v0
-// CHECK: [0x00,0x01,0x24,0x7c]
+v_cmpx_neq_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x3d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_class_f64 vcc, v[254:255], v0
-// CHECK: [0xfe,0x01,0x24,0x7c]
+v_cmpx_neq_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x3d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_class_f64 vcc, s[0:1], v255
-// CHECK: [0x00,0xfe,0x25,0x7c]
+v_cmpx_neq_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x3d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_class_f64 vcc, s[0:1], v0
-// CHECK: [0x00,0x00,0x26,0x7c]
+v_cmpx_neq_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x3d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_class_f64 vcc, s[2:3], v0
-// CHECK: [0x02,0x00,0x26,0x7c]
+v_cmpx_neq_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x3d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_class_f64 vcc, s[100:101], v0
-// CHECK: [0x64,0x00,0x26,0x7c]
+v_cmpx_neq_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x3d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_class_f64 vcc, flat_scratch, v0
-// CHECK: [0x66,0x00,0x26,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x3d,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_class_f64 vcc, vcc, v0
-// CHECK: [0x6a,0x00,0x26,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_class_f64 vcc, tba, v0
-// CHECK: [0x6c,0x00,0x26,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x3d,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_class_f64 vcc, tma, v0
-// CHECK: [0x6e,0x00,0x26,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_class_f64 vcc, ttmp[10:11], v0
-// CHECK: [0x7a,0x00,0x26,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_class_f64 vcc, exec, v0
-// CHECK: [0x7e,0x00,0x26,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_class_f64 vcc, 0, v0
-// CHECK: [0x80,0x00,0x26,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_class_f64 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x26,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_class_f64 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x26,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_class_f64 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x26,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_class_f64 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x26,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_neq_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_class_f64 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x26,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_neq_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_class_f64 vcc, v[0:1], v0
-// CHECK: [0x00,0x01,0x26,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_class_f64 vcc, v[254:255], v0
-// CHECK: [0xfe,0x01,0x26,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_class_f64 vcc, s[0:1], v255
-// CHECK: [0x00,0xfe,0x27,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_class_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x28,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_class_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x28,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_class_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x28,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_class_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x28,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_class_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x28,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_class_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x28,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_class_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x28,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x3d,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_class_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x28,0x7c]
+v_cmpx_neq_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x3d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_class_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x28,0x7c]
+v_cmpx_nlt_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x7c,0x7c]
 
-v_cmp_class_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x28,0x7c]
+v_cmpx_nlt_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x7c,0x7c]
 
-v_cmp_class_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x28,0x7c]
+v_cmpx_nlt_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x7c,0x7c]
 
-v_cmp_class_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x28,0x7c]
+v_cmpx_nlt_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x7c,0x7c]
 
-v_cmp_class_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x28,0x7c]
+v_cmpx_nlt_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x7c,0x7c]
 
-v_cmp_class_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x28,0x7c]
+v_cmpx_nlt_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x7c,0x7c]
 
-v_cmp_class_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x28,0x7c]
+v_cmpx_nlt_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x7c,0x7c]
 
-v_cmp_class_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x28,0x7c]
+v_cmpx_nlt_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x7c,0x7c]
 
-v_cmp_class_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x28,0x7c]
+v_cmpx_nlt_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x7c,0x7c]
 
-v_cmp_class_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x28,0x7c]
+v_cmpx_nlt_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x7c,0x7c]
 
-v_cmp_class_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x28,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmpx_nlt_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x7c,0x7c]
 
-v_cmp_class_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x28,0x7c,0x56,0x34,0x00,0x00]
+v_cmpx_nlt_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x7c,0x7c]
 
-v_cmp_class_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x28,0x7c]
+v_cmpx_nlt_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x7c,0x7c]
 
-v_cmp_class_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x28,0x7c]
+v_cmpx_nlt_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x7c,0x7c]
 
-v_cmp_class_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x29,0x7c]
+v_cmpx_nlt_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x7c,0x7c]
 
-v_cmpx_class_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x2a,0x7c]
+v_cmpx_nlt_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x7c,0x7c]
 
-v_cmpx_class_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x2a,0x7c]
+v_cmpx_nlt_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x7c,0x7c]
 
-v_cmpx_class_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x2a,0x7c]
+v_cmpx_nlt_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x7c,0x7c]
 
-v_cmpx_class_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x2a,0x7c]
+v_cmpx_nlt_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x7c,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_cmpx_class_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x2a,0x7c]
+v_cmpx_nlt_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x7c,0x7c,0x56,0x34,0x00,0x00]
 
-v_cmpx_class_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x2a,0x7c]
+v_cmpx_nlt_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x7c,0x7c]
 
-v_cmpx_class_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x2a,0x7c]
+v_cmpx_nlt_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x7c,0x7c]
 
-v_cmpx_class_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x2a,0x7c]
+v_cmpx_nlt_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x7d,0x7c]
 
-v_cmpx_class_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x2a,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_class_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x2a,0x7c]
+v_cmpx_nlt_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x3e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_class_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x2a,0x7c]
+v_cmpx_nlt_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x3e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_class_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x2a,0x7c]
+v_cmpx_nlt_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x3e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_class_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x2a,0x7c]
+v_cmpx_nlt_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x3e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_class_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x2a,0x7c]
+v_cmpx_nlt_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x3e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_class_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x2a,0x7c]
+v_cmpx_nlt_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x3e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_class_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x2a,0x7c]
+v_cmpx_nlt_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x3e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_class_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x2a,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x3e,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_class_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x2a,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_class_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x2a,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmpx_nlt_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x3e,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_class_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x2a,0x7c,0x56,0x34,0x00,0x00]
+v_cmpx_nlt_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_class_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x2a,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_class_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x2a,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_class_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x2b,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_f_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x40,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_f_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x40,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_f_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x40,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_f_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x40,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_f_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x40,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_f_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x40,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_f_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x40,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_f_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x40,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_f_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x40,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_f_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x40,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_f_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x40,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_f_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x40,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_f_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x40,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_f_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x40,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_f_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x40,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x3e,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_f_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x40,0x7c]
+v_cmpx_nlt_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x3e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x40,0x7c]
+v_cmpx_tru_f16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x7e,0x7c]
 
-v_cmp_f_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x40,0x7c]
+v_cmpx_tru_f16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x7e,0x7c]
 
-v_cmp_f_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x40,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmpx_tru_f16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x7e,0x7c]
 
-v_cmp_f_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x40,0x7c,0x56,0x34,0x00,0x00]
+v_cmpx_tru_f16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x7e,0x7c]
 
-v_cmp_f_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x40,0x7c]
+v_cmpx_tru_f16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x7e,0x7c]
 
-v_cmp_f_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x40,0x7c]
+v_cmpx_tru_f16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x7e,0x7c]
 
-v_cmp_f_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x41,0x7c]
+v_cmpx_tru_f16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x7e,0x7c]
 
-v_cmp_f_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x20,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x7e,0x7c]
 
-v_cmp_f_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x20,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x7e,0x7c]
 
-v_cmp_f_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x20,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x7e,0x7c]
 
-v_cmp_f_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x20,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x7e,0x7c]
 
-v_cmp_f_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x20,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x7e,0x7c]
 
-v_cmp_f_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x20,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x7e,0x7c]
 
-v_cmp_f_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x20,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x7e,0x7c]
 
-v_cmp_f_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x20,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x7e,0x7c]
 
-v_cmp_f_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x20,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_tru_f16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x7e,0x7c]
 
-v_cmp_f_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x20,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_tru_f16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x7e,0x7c]
 
-v_cmp_f_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x20,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_tru_f16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x7e,0x7c]
 
-v_cmp_f_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x20,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_tru_f16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x7e,0x7c,0x0b,0xfe,0x00,0x00]
 
-v_cmp_f_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x20,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_tru_f16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x7e,0x7c,0x56,0x34,0x00,0x00]
 
-v_cmp_f_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x20,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_tru_f16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x7e,0x7c]
 
-v_cmp_f_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x20,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_tru_f16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x7e,0x7c]
 
-v_cmp_f_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x20,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_tru_f16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x7f,0x7c]
 
-v_cmp_f_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x20,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_tru_f16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x20,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_tru_f16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x3f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x20,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_tru_f16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x3f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x20,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x3f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x42,0x7c]
+v_cmpx_tru_f16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x3f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x42,0x7c]
+v_cmpx_tru_f16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x3f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x42,0x7c]
+v_cmpx_tru_f16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x3f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x42,0x7c]
+v_cmpx_tru_f16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x3f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x42,0x7c]
+v_cmpx_tru_f16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x3f,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_lt_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x42,0x7c]
+v_cmpx_tru_f16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_lt_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x42,0x7c]
+v_cmpx_tru_f16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x3f,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_lt_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x42,0x7c]
+v_cmpx_tru_f16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_lt_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x42,0x7c]
+v_cmpx_tru_f16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_lt_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x42,0x7c]
+v_cmpx_tru_f16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_lt_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x42,0x7c]
+v_cmpx_tru_f16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_lt_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x42,0x7c]
+v_cmpx_tru_f16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_lt_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x42,0x7c]
+v_cmpx_tru_f16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_lt_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x42,0x7c]
+v_cmpx_tru_f16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_lt_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x42,0x7c]
+v_cmpx_tru_f16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_lt_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x42,0x7c]
+v_cmpx_tru_f16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_lt_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x42,0x7c]
+v_cmpx_tru_f16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_lt_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x42,0x7c]
+v_cmpx_tru_f16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_lt_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x42,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmpx_tru_f16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_lt_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x42,0x7c,0x56,0x34,0x00,0x00]
+v_cmpx_tru_f16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_lt_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x42,0x7c]
+v_cmpx_tru_f16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_lt_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x42,0x7c]
+v_cmpx_tru_f16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_lt_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x43,0x7c]
+v_cmpx_tru_f16_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_lt_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x21,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_lt_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x21,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_lt_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x21,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f16_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x3f,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_lt_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x21,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f16_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x3f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x21,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x80,0x7c]
 
-v_cmp_lt_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x21,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x80,0x7c]
 
-v_cmp_lt_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x21,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x80,0x7c]
 
-v_cmp_lt_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x21,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x80,0x7c]
 
-v_cmp_lt_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x21,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_f_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x80,0x7c]
 
-v_cmp_lt_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x21,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_f_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x80,0x7c]
 
-v_cmp_lt_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x21,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_f_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x80,0x7c]
 
-v_cmp_lt_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x21,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_f_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x80,0x7c]
 
-v_cmp_lt_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x21,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_f_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x80,0x7c]
 
-v_cmp_lt_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x21,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_f_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x80,0x7c]
 
-v_cmp_lt_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x21,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_f_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x80,0x7c]
 
-v_cmp_lt_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x21,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_f_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x80,0x7c]
 
-v_cmp_lt_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x21,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_f_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x80,0x7c]
 
-v_cmp_lt_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x21,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_f_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x80,0x7c]
 
-v_cmp_lt_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x21,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_f_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x80,0x7c]
 
-v_cmp_lt_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x21,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x80,0x7c]
 
-v_cmp_eq_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x44,0x7c]
+v_cmp_f_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x80,0x7c]
 
-v_cmp_eq_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x44,0x7c]
+v_cmp_f_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x80,0x7c]
 
-v_cmp_eq_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x44,0x7c]
+v_cmp_f_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x80,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_eq_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x44,0x7c]
+v_cmp_f_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x80,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_eq_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x44,0x7c]
+v_cmp_f_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x80,0x7c]
 
-v_cmp_eq_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x44,0x7c]
+v_cmp_f_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x80,0x7c]
 
-v_cmp_eq_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x44,0x7c]
+v_cmp_f_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x81,0x7c]
 
-v_cmp_eq_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x44,0x7c]
+v_cmp_f_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x44,0x7c]
+v_cmp_f_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x40,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x44,0x7c]
+v_cmp_f_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x40,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x44,0x7c]
+v_cmp_f_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x40,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x44,0x7c]
+v_cmp_f_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x40,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x44,0x7c]
+v_cmp_f_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x40,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x44,0x7c]
+v_cmp_f_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x40,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x44,0x7c]
+v_cmp_f_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x40,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x44,0x7c]
+v_cmp_f_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x40,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_eq_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x44,0x7c]
+v_cmp_f_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x40,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_eq_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x44,0x7c]
+v_cmp_f_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x40,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_eq_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x44,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_eq_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x44,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_eq_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x44,0x7c]
+v_cmp_f_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_eq_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x44,0x7c]
+v_cmp_f_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_eq_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x45,0x7c]
+v_cmp_f_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_eq_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x22,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_eq_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x22,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_eq_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x22,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_eq_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x22,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_eq_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x22,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_eq_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x22,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_eq_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x22,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_eq_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x22,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_eq_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x22,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_eq_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x22,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_eq_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x22,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_eq_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x22,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_eq_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x22,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_eq_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x22,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x40,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_eq_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x22,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_f_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x40,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x22,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_lt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x82,0x7c]
 
-v_cmp_eq_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x22,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_lt_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x82,0x7c]
 
-v_cmp_eq_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x22,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_lt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x82,0x7c]
 
-v_cmp_eq_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x22,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_lt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x82,0x7c]
 
-v_cmp_eq_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x22,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x82,0x7c]
 
-v_cmp_le_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x46,0x7c]
+v_cmp_lt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x82,0x7c]
 
-v_cmp_le_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x46,0x7c]
+v_cmp_lt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x82,0x7c]
 
-v_cmp_le_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x46,0x7c]
+v_cmp_lt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x82,0x7c]
 
-v_cmp_le_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x46,0x7c]
+v_cmp_lt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x82,0x7c]
 
-v_cmp_le_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x46,0x7c]
+v_cmp_lt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x82,0x7c]
 
-v_cmp_le_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x46,0x7c]
+v_cmp_lt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x82,0x7c]
 
-v_cmp_le_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x46,0x7c]
+v_cmp_lt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x82,0x7c]
 
-v_cmp_le_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x46,0x7c]
+v_cmp_lt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x82,0x7c]
 
-v_cmp_le_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x46,0x7c]
+v_cmp_lt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x82,0x7c]
 
-v_cmp_le_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x46,0x7c]
+v_cmp_lt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x82,0x7c]
 
-v_cmp_le_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x46,0x7c]
+v_cmp_lt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x82,0x7c]
 
-v_cmp_le_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x46,0x7c]
+v_cmp_lt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x82,0x7c]
 
-v_cmp_le_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x46,0x7c]
+v_cmp_lt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x82,0x7c]
 
-v_cmp_le_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x46,0x7c]
+v_cmp_lt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x82,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_le_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x46,0x7c]
+v_cmp_lt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x82,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_le_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x46,0x7c]
+v_cmp_lt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x82,0x7c]
 
-v_cmp_le_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x46,0x7c]
+v_cmp_lt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x82,0x7c]
 
-v_cmp_le_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x46,0x7c]
+v_cmp_lt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x83,0x7c]
 
-v_cmp_le_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x46,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x46,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_lt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x41,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x46,0x7c]
+v_cmp_lt_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x41,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x46,0x7c]
+v_cmp_lt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x41,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x47,0x7c]
+v_cmp_lt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x41,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x23,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x41,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x23,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x41,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x23,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x41,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x23,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x41,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_le_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x23,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x41,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_le_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x23,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x41,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_le_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x23,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_le_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x23,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_le_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x23,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_le_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x23,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_le_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x23,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_le_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x23,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_le_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x23,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_le_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x23,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_le_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x23,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_le_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x23,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_le_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x23,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_lt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_le_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x23,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_lt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_le_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x23,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_lt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_le_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x23,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_gt_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x48,0x7c]
+v_cmp_lt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_gt_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x48,0x7c]
+v_cmp_lt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_gt_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x48,0x7c]
+v_cmp_lt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_gt_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x48,0x7c]
+v_cmp_lt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_gt_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x48,0x7c]
+v_cmp_lt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x41,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_gt_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x48,0x7c]
+v_cmp_lt_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x41,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x48,0x7c]
+v_cmp_eq_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x84,0x7c]
 
-v_cmp_gt_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x48,0x7c]
+v_cmp_eq_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x84,0x7c]
 
-v_cmp_gt_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x48,0x7c]
+v_cmp_eq_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x84,0x7c]
 
-v_cmp_gt_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x48,0x7c]
+v_cmp_eq_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x84,0x7c]
 
-v_cmp_gt_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x48,0x7c]
+v_cmp_eq_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x84,0x7c]
 
-v_cmp_gt_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x48,0x7c]
+v_cmp_eq_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x84,0x7c]
 
-v_cmp_gt_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x48,0x7c]
+v_cmp_eq_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x84,0x7c]
 
-v_cmp_gt_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x48,0x7c]
+v_cmp_eq_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x84,0x7c]
 
-v_cmp_gt_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x48,0x7c]
+v_cmp_eq_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x84,0x7c]
 
-v_cmp_gt_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x48,0x7c]
+v_cmp_eq_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x84,0x7c]
 
-v_cmp_gt_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x48,0x7c]
+v_cmp_eq_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x84,0x7c]
 
-v_cmp_gt_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x48,0x7c]
+v_cmp_eq_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x84,0x7c]
 
-v_cmp_gt_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x48,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_eq_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x84,0x7c]
 
-v_cmp_gt_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x48,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_eq_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x84,0x7c]
 
-v_cmp_gt_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x48,0x7c]
+v_cmp_eq_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x84,0x7c]
 
-v_cmp_gt_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x48,0x7c]
+v_cmp_eq_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x84,0x7c]
 
-v_cmp_gt_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x49,0x7c]
+v_cmp_eq_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x84,0x7c]
 
-v_cmp_gt_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x24,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x84,0x7c]
 
-v_cmp_gt_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x24,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x84,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_gt_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x24,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x84,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_gt_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x24,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x84,0x7c]
 
-v_cmp_gt_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x24,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x84,0x7c]
 
-v_cmp_gt_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x24,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x85,0x7c]
 
-v_cmp_gt_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x24,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x24,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x42,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x24,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_eq_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x42,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x24,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_eq_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x42,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x24,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_eq_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x42,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x24,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_eq_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x42,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x24,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_eq_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x42,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x24,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_eq_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x42,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x24,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_eq_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x42,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_gt_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x24,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_eq_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x42,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_gt_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x24,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_eq_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x42,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_gt_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x24,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_eq_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_gt_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x24,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_eq_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_gt_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x24,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_lg_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x4a,0x7c]
+v_cmp_eq_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_lg_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x4a,0x7c]
+v_cmp_eq_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_lg_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x4a,0x7c]
+v_cmp_eq_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_lg_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x4a,0x7c]
+v_cmp_eq_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_lg_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x4a,0x7c]
+v_cmp_eq_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_lg_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x4a,0x7c]
+v_cmp_eq_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_lg_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x4a,0x7c]
+v_cmp_eq_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_lg_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x4a,0x7c]
+v_cmp_eq_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_lg_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x4a,0x7c]
+v_cmp_eq_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_lg_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x4a,0x7c]
+v_cmp_eq_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_lg_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x4a,0x7c]
+v_cmp_eq_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_lg_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x4a,0x7c]
+v_cmp_eq_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_lg_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x4a,0x7c]
+v_cmp_eq_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_lg_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x4a,0x7c]
+v_cmp_eq_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_lg_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x4a,0x7c]
+v_cmp_eq_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_lg_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x4a,0x7c]
+v_cmp_eq_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x42,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_lg_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x4a,0x7c]
+v_cmp_eq_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x42,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x4a,0x7c]
+v_cmp_le_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x86,0x7c]
 
-v_cmp_lg_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x4a,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_le_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x86,0x7c]
 
-v_cmp_lg_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x4a,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_le_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x86,0x7c]
 
-v_cmp_lg_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x4a,0x7c]
+v_cmp_le_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x86,0x7c]
 
-v_cmp_lg_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x4a,0x7c]
+v_cmp_le_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x86,0x7c]
 
-v_cmp_lg_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x4b,0x7c]
+v_cmp_le_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x86,0x7c]
 
-v_cmp_lg_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x25,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x86,0x7c]
 
-v_cmp_lg_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x25,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x86,0x7c]
 
-v_cmp_lg_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x25,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x86,0x7c]
 
-v_cmp_lg_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x25,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x86,0x7c]
 
-v_cmp_lg_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x25,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x86,0x7c]
 
-v_cmp_lg_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x25,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x86,0x7c]
 
-v_cmp_lg_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x25,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x86,0x7c]
 
-v_cmp_lg_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x25,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x86,0x7c]
 
-v_cmp_lg_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x25,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_le_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x86,0x7c]
 
-v_cmp_lg_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x25,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_le_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x86,0x7c]
 
-v_cmp_lg_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x25,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_le_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x86,0x7c]
 
-v_cmp_lg_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x25,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_le_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x86,0x7c]
 
-v_cmp_lg_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x25,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_le_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x86,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_lg_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x25,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_le_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x86,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_lg_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x25,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_le_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x86,0x7c]
 
-v_cmp_lg_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x25,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_le_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x86,0x7c]
 
-v_cmp_lg_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x25,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_le_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x87,0x7c]
 
-v_cmp_lg_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x25,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_le_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x25,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_le_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x43,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x25,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x43,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x4c,0x7c]
+v_cmp_le_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x43,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x4c,0x7c]
+v_cmp_le_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x43,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x4c,0x7c]
+v_cmp_le_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x43,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x4c,0x7c]
+v_cmp_le_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x43,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x4c,0x7c]
+v_cmp_le_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x43,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x4c,0x7c]
+v_cmp_le_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x43,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_ge_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x4c,0x7c]
+v_cmp_le_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x43,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_ge_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x4c,0x7c]
+v_cmp_le_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x43,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_ge_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x4c,0x7c]
+v_cmp_le_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_ge_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x4c,0x7c]
+v_cmp_le_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_ge_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x4c,0x7c]
+v_cmp_le_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_ge_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x4c,0x7c]
+v_cmp_le_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_ge_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x4c,0x7c]
+v_cmp_le_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_ge_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x4c,0x7c]
+v_cmp_le_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_ge_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x4c,0x7c]
+v_cmp_le_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_ge_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x4c,0x7c]
+v_cmp_le_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_ge_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x4c,0x7c]
+v_cmp_le_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_ge_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x4c,0x7c]
+v_cmp_le_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_ge_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x4c,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_ge_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x4c,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_ge_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x4c,0x7c]
+v_cmp_le_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_ge_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x4c,0x7c]
+v_cmp_le_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_ge_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x4d,0x7c]
+v_cmp_le_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_ge_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x26,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_ge_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x26,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_ge_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x26,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_ge_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x26,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x43,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_ge_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x26,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x43,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x26,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x88,0x7c]
 
-v_cmp_ge_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x26,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x88,0x7c]
 
-v_cmp_ge_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x26,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x88,0x7c]
 
-v_cmp_ge_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x26,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_gt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x88,0x7c]
 
-v_cmp_ge_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x26,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_gt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x88,0x7c]
 
-v_cmp_ge_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x26,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_gt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x88,0x7c]
 
-v_cmp_ge_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x26,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_gt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x88,0x7c]
 
-v_cmp_ge_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x26,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_gt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x88,0x7c]
 
-v_cmp_ge_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x26,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_gt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x88,0x7c]
 
-v_cmp_ge_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x26,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_gt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x88,0x7c]
 
-v_cmp_ge_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x26,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_gt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x88,0x7c]
 
-v_cmp_ge_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x26,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_gt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x88,0x7c]
 
-v_cmp_ge_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x26,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_gt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x88,0x7c]
 
-v_cmp_ge_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x26,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_gt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x88,0x7c]
 
-v_cmp_ge_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x26,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x88,0x7c]
 
-v_cmp_o_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x4e,0x7c]
+v_cmp_gt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x88,0x7c]
 
-v_cmp_o_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x4e,0x7c]
+v_cmp_gt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x88,0x7c]
 
-v_cmp_o_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x4e,0x7c]
+v_cmp_gt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x88,0x7c]
 
-v_cmp_o_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x4e,0x7c]
+v_cmp_gt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x88,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_o_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x4e,0x7c]
+v_cmp_gt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x88,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_o_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x4e,0x7c]
+v_cmp_gt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x88,0x7c]
 
-v_cmp_o_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x4e,0x7c]
+v_cmp_gt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x88,0x7c]
 
-v_cmp_o_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x4e,0x7c]
+v_cmp_gt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x89,0x7c]
 
-v_cmp_o_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x4e,0x7c]
+v_cmp_gt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x4e,0x7c]
+v_cmp_gt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x44,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x4e,0x7c]
+v_cmp_gt_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x44,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x4e,0x7c]
+v_cmp_gt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x44,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x4e,0x7c]
+v_cmp_gt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x44,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x4e,0x7c]
+v_cmp_gt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x44,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x4e,0x7c]
+v_cmp_gt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x44,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x4e,0x7c]
+v_cmp_gt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x44,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x4e,0x7c]
+v_cmp_gt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x44,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_o_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x4e,0x7c]
+v_cmp_gt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x44,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_o_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x4e,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x44,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_o_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x4e,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_o_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x4e,0x7c]
+v_cmp_gt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_o_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x4e,0x7c]
+v_cmp_gt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_o_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x4f,0x7c]
+v_cmp_gt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_o_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x27,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_o_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x27,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_o_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x27,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_o_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x27,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_o_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x27,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_o_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x27,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_o_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x27,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_o_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x27,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_o_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x27,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_o_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x27,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_o_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x27,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_o_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x27,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_o_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x27,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_o_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x27,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_o_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x27,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x44,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_o_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x27,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_gt_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x44,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x27,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_lg_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x8a,0x7c]
 
-v_cmp_o_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x27,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_lg_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x8a,0x7c]
 
-v_cmp_o_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x27,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_lg_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x8a,0x7c]
 
-v_cmp_o_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x27,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x8a,0x7c]
 
-v_cmp_u_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x50,0x7c]
+v_cmp_lg_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x8a,0x7c]
 
-v_cmp_u_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x50,0x7c]
+v_cmp_lg_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x8a,0x7c]
 
-v_cmp_u_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x50,0x7c]
+v_cmp_lg_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x8a,0x7c]
 
-v_cmp_u_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x50,0x7c]
+v_cmp_lg_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x8a,0x7c]
 
-v_cmp_u_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x50,0x7c]
+v_cmp_lg_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x8a,0x7c]
 
-v_cmp_u_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x50,0x7c]
+v_cmp_lg_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x8a,0x7c]
 
-v_cmp_u_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x50,0x7c]
+v_cmp_lg_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x8a,0x7c]
 
-v_cmp_u_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x50,0x7c]
+v_cmp_lg_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x8a,0x7c]
 
-v_cmp_u_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x50,0x7c]
+v_cmp_lg_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x8a,0x7c]
 
-v_cmp_u_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x50,0x7c]
+v_cmp_lg_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x8a,0x7c]
 
-v_cmp_u_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x50,0x7c]
+v_cmp_lg_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x8a,0x7c]
 
-v_cmp_u_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x50,0x7c]
+v_cmp_lg_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x8a,0x7c]
 
-v_cmp_u_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x50,0x7c]
+v_cmp_lg_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x8a,0x7c]
 
-v_cmp_u_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x50,0x7c]
+v_cmp_lg_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x8a,0x7c]
 
-v_cmp_u_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x50,0x7c]
+v_cmp_lg_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x8a,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_u_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x50,0x7c]
+v_cmp_lg_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x8a,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_u_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x50,0x7c]
+v_cmp_lg_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x8a,0x7c]
 
-v_cmp_u_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x50,0x7c]
+v_cmp_lg_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x8a,0x7c]
 
-v_cmp_u_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x50,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_lg_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x8b,0x7c]
 
-v_cmp_u_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x50,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x50,0x7c]
+v_cmp_lg_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x45,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x50,0x7c]
+v_cmp_lg_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x45,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x51,0x7c]
+v_cmp_lg_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x45,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x28,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x45,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x28,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x45,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x28,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x45,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x28,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x45,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x28,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x45,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_u_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x28,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x45,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_u_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x28,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x45,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_u_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x28,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_u_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x28,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_u_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x28,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_u_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x28,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_u_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x28,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_u_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x28,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_u_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x28,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_u_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x28,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_u_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x28,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_u_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x28,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_lg_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_u_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x28,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_lg_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_u_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x28,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_lg_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_u_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x28,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_nge_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x52,0x7c]
+v_cmp_lg_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_nge_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x52,0x7c]
+v_cmp_lg_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_nge_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x52,0x7c]
+v_cmp_lg_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_nge_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x52,0x7c]
+v_cmp_lg_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_nge_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x52,0x7c]
+v_cmp_lg_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_nge_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x52,0x7c]
+v_cmp_lg_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x45,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_nge_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x52,0x7c]
+v_cmp_lg_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x45,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x52,0x7c]
+v_cmp_ge_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x8c,0x7c]
 
-v_cmp_nge_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x52,0x7c]
+v_cmp_ge_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x8c,0x7c]
 
-v_cmp_nge_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x52,0x7c]
+v_cmp_ge_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x8c,0x7c]
 
-v_cmp_nge_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x52,0x7c]
+v_cmp_ge_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x8c,0x7c]
 
-v_cmp_nge_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x52,0x7c]
+v_cmp_ge_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x8c,0x7c]
 
-v_cmp_nge_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x52,0x7c]
+v_cmp_ge_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x8c,0x7c]
 
-v_cmp_nge_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x52,0x7c]
+v_cmp_ge_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x8c,0x7c]
 
-v_cmp_nge_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x52,0x7c]
+v_cmp_ge_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x8c,0x7c]
 
-v_cmp_nge_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x52,0x7c]
+v_cmp_ge_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x8c,0x7c]
 
-v_cmp_nge_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x52,0x7c]
+v_cmp_ge_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x8c,0x7c]
 
-v_cmp_nge_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x52,0x7c]
+v_cmp_ge_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x8c,0x7c]
 
-v_cmp_nge_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x52,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_ge_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x8c,0x7c]
 
-v_cmp_nge_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x52,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_ge_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x8c,0x7c]
 
-v_cmp_nge_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x52,0x7c]
+v_cmp_ge_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x8c,0x7c]
 
-v_cmp_nge_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x52,0x7c]
+v_cmp_ge_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x8c,0x7c]
 
-v_cmp_nge_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x53,0x7c]
+v_cmp_ge_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x8c,0x7c]
 
-v_cmp_nge_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x29,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x8c,0x7c]
 
-v_cmp_nge_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x29,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x8c,0x7c]
 
-v_cmp_nge_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x29,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x8c,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_nge_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x29,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x8c,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_nge_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x29,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x8c,0x7c]
 
-v_cmp_nge_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x29,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x8c,0x7c]
 
-v_cmp_nge_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x29,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x8d,0x7c]
 
-v_cmp_nge_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x29,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x29,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ge_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x46,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x29,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_ge_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x46,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x29,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ge_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x46,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x29,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_ge_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x46,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x29,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ge_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x46,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x29,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_ge_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x46,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x29,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ge_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x46,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x29,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_ge_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x46,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_nge_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x29,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_ge_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x46,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_nge_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x29,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_ge_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x46,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_nge_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x29,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_ge_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_nge_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x29,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_nlg_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x54,0x7c]
+v_cmp_ge_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_nlg_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x54,0x7c]
+v_cmp_ge_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_nlg_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x54,0x7c]
+v_cmp_ge_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_nlg_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x54,0x7c]
+v_cmp_ge_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_nlg_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x54,0x7c]
+v_cmp_ge_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_nlg_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x54,0x7c]
+v_cmp_ge_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_nlg_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x54,0x7c]
+v_cmp_ge_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_nlg_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x54,0x7c]
+v_cmp_ge_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_nlg_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x54,0x7c]
+v_cmp_ge_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_nlg_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x54,0x7c]
+v_cmp_ge_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_nlg_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x54,0x7c]
+v_cmp_ge_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_nlg_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x54,0x7c]
+v_cmp_ge_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_nlg_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x54,0x7c]
+v_cmp_ge_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_nlg_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x54,0x7c]
+v_cmp_ge_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_nlg_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x54,0x7c]
+v_cmp_ge_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_nlg_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x54,0x7c]
+v_cmp_ge_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_nlg_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x54,0x7c]
+v_cmp_ge_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x46,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_nlg_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x54,0x7c]
+v_cmp_ge_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x46,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlg_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x54,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_o_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x8e,0x7c]
 
-v_cmp_nlg_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x54,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_o_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x8e,0x7c]
 
-v_cmp_nlg_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x54,0x7c]
+v_cmp_o_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x8e,0x7c]
 
-v_cmp_nlg_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x54,0x7c]
+v_cmp_o_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x8e,0x7c]
 
-v_cmp_nlg_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x55,0x7c]
+v_cmp_o_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x8e,0x7c]
 
-v_cmp_nlg_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x2a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x8e,0x7c]
 
-v_cmp_nlg_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x2a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x8e,0x7c]
 
-v_cmp_nlg_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x2a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x8e,0x7c]
 
-v_cmp_nlg_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x2a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x8e,0x7c]
 
-v_cmp_nlg_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x2a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x8e,0x7c]
 
-v_cmp_nlg_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x2a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x8e,0x7c]
 
-v_cmp_nlg_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x2a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x8e,0x7c]
 
-v_cmp_nlg_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x2a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x8e,0x7c]
 
-v_cmp_nlg_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x2a,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_o_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x8e,0x7c]
 
-v_cmp_nlg_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x2a,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_o_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x8e,0x7c]
 
-v_cmp_nlg_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x2a,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_o_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x8e,0x7c]
 
-v_cmp_nlg_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x2a,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_o_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x8e,0x7c]
 
-v_cmp_nlg_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x2a,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_o_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x8e,0x7c]
 
-v_cmp_nlg_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x2a,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_o_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x8e,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_nlg_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x2a,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_o_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x8e,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_nlg_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x2a,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_o_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x8e,0x7c]
 
-v_cmp_nlg_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x2a,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_o_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x8e,0x7c]
 
-v_cmp_nlg_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x2a,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_o_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x8f,0x7c]
 
-v_cmp_nlg_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x2a,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_o_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlg_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x2a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x47,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x56,0x7c]
+v_cmp_o_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x47,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x56,0x7c]
+v_cmp_o_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x47,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x56,0x7c]
+v_cmp_o_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x47,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x56,0x7c]
+v_cmp_o_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x47,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x56,0x7c]
+v_cmp_o_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x47,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x56,0x7c]
+v_cmp_o_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x47,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x56,0x7c]
+v_cmp_o_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x47,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x56,0x7c]
+v_cmp_o_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x47,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x56,0x7c]
+v_cmp_o_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x47,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x56,0x7c]
+v_cmp_o_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x56,0x7c]
+v_cmp_o_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x56,0x7c]
+v_cmp_o_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x56,0x7c]
+v_cmp_o_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x56,0x7c]
+v_cmp_o_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x56,0x7c]
+v_cmp_o_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x56,0x7c]
+v_cmp_o_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x56,0x7c]
+v_cmp_o_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x56,0x7c]
+v_cmp_o_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x56,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x56,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x56,0x7c]
+v_cmp_o_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x56,0x7c]
+v_cmp_o_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_ngt_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x57,0x7c]
+v_cmp_o_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_ngt_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x2b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_ngt_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x2b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_ngt_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x2b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_ngt_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x2b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_ngt_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x2b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x47,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_ngt_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x2b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x47,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x2b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x90,0x7c]
 
-v_cmp_ngt_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x2b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x90,0x7c]
 
-v_cmp_ngt_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x2b,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_u_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x90,0x7c]
 
-v_cmp_ngt_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x2b,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_u_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x90,0x7c]
 
-v_cmp_ngt_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x2b,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_u_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x90,0x7c]
 
-v_cmp_ngt_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x2b,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_u_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x90,0x7c]
 
-v_cmp_ngt_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x2b,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_u_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x90,0x7c]
 
-v_cmp_ngt_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x2b,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_u_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x90,0x7c]
 
-v_cmp_ngt_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x2b,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_u_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x90,0x7c]
 
-v_cmp_ngt_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x2b,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_u_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x90,0x7c]
 
-v_cmp_ngt_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x2b,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_u_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x90,0x7c]
 
-v_cmp_ngt_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x2b,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_u_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x90,0x7c]
 
-v_cmp_ngt_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x2b,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_u_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x90,0x7c]
 
-v_cmp_ngt_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x2b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x90,0x7c]
 
-v_cmp_nle_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x58,0x7c]
+v_cmp_u_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x90,0x7c]
 
-v_cmp_nle_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x58,0x7c]
+v_cmp_u_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x90,0x7c]
 
-v_cmp_nle_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x58,0x7c]
+v_cmp_u_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x90,0x7c]
 
-v_cmp_nle_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x58,0x7c]
+v_cmp_u_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x90,0x7c]
 
-v_cmp_nle_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x58,0x7c]
+v_cmp_u_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x90,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_nle_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x58,0x7c]
+v_cmp_u_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x90,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_nle_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x58,0x7c]
+v_cmp_u_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x90,0x7c]
 
-v_cmp_nle_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x58,0x7c]
+v_cmp_u_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x90,0x7c]
 
-v_cmp_nle_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x58,0x7c]
+v_cmp_u_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x91,0x7c]
 
-v_cmp_nle_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x58,0x7c]
+v_cmp_u_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x58,0x7c]
+v_cmp_u_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x48,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x58,0x7c]
+v_cmp_u_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x48,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x58,0x7c]
+v_cmp_u_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x48,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x58,0x7c]
+v_cmp_u_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x48,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x58,0x7c]
+v_cmp_u_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x48,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x58,0x7c]
+v_cmp_u_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x48,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x58,0x7c]
+v_cmp_u_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x48,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x58,0x7c]
+v_cmp_u_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x48,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_nle_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x58,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x48,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_nle_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x58,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x48,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_nle_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x58,0x7c]
+v_cmp_u_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_nle_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x58,0x7c]
+v_cmp_u_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_nle_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x59,0x7c]
+v_cmp_u_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_nle_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x2c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_nle_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x2c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_nle_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x2c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_nle_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x2c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_nle_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x2c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_nle_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x2c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_nle_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x2c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_nle_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x2c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_nle_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x2c,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_nle_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x2c,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_nle_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x2c,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_nle_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x2c,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_nle_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x2c,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_nle_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x2c,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_nle_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x2c,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_nle_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x2c,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_u_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x48,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_nle_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x2c,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_u_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x48,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x2c,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_nge_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x92,0x7c]
 
-v_cmp_nle_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x2c,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_nge_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x92,0x7c]
 
-v_cmp_nle_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x2c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x92,0x7c]
 
-v_cmp_neq_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x5a,0x7c]
+v_cmp_nge_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x92,0x7c]
 
-v_cmp_neq_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x5a,0x7c]
+v_cmp_nge_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x92,0x7c]
 
-v_cmp_neq_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x5a,0x7c]
+v_cmp_nge_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x92,0x7c]
 
-v_cmp_neq_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x5a,0x7c]
+v_cmp_nge_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x92,0x7c]
 
-v_cmp_neq_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x5a,0x7c]
+v_cmp_nge_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x92,0x7c]
 
-v_cmp_neq_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x5a,0x7c]
+v_cmp_nge_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x92,0x7c]
 
-v_cmp_neq_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x5a,0x7c]
+v_cmp_nge_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x92,0x7c]
 
-v_cmp_neq_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x5a,0x7c]
+v_cmp_nge_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x92,0x7c]
 
-v_cmp_neq_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x5a,0x7c]
+v_cmp_nge_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x92,0x7c]
 
-v_cmp_neq_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x5a,0x7c]
+v_cmp_nge_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x92,0x7c]
 
-v_cmp_neq_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x5a,0x7c]
+v_cmp_nge_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x92,0x7c]
 
-v_cmp_neq_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x5a,0x7c]
+v_cmp_nge_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x92,0x7c]
 
-v_cmp_neq_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x5a,0x7c]
+v_cmp_nge_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x92,0x7c]
 
-v_cmp_neq_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x5a,0x7c]
+v_cmp_nge_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x92,0x7c]
 
-v_cmp_neq_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x5a,0x7c]
+v_cmp_nge_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x92,0x7c]
 
-v_cmp_neq_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x5a,0x7c]
+v_cmp_nge_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x92,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_neq_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x5a,0x7c]
+v_cmp_nge_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x92,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_neq_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x5a,0x7c]
+v_cmp_nge_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x92,0x7c]
 
-v_cmp_neq_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x5a,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_nge_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x92,0x7c]
 
-v_cmp_neq_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x5a,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_nge_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x93,0x7c]
 
-v_cmp_neq_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x5a,0x7c]
+v_cmp_nge_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x5a,0x7c]
+v_cmp_nge_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x49,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x5b,0x7c]
+v_cmp_nge_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x49,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x2d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x49,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x2d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x49,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x2d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x49,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x2d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x49,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x2d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x49,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x2d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x49,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_neq_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x2d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x49,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_neq_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x2d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x49,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_neq_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x2d,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_neq_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x2d,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_neq_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x2d,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_neq_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x2d,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_neq_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x2d,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_neq_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x2d,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_neq_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x2d,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_neq_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x2d,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_neq_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x2d,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_nge_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_neq_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x2d,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_nge_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_neq_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x2d,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_nge_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_neq_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x2d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_nlt_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x5c,0x7c]
+v_cmp_nge_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_nlt_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x5c,0x7c]
+v_cmp_nge_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_nlt_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x5c,0x7c]
+v_cmp_nge_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_nlt_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x5c,0x7c]
+v_cmp_nge_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_nlt_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x5c,0x7c]
+v_cmp_nge_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_nlt_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x5c,0x7c]
+v_cmp_nge_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_nlt_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x5c,0x7c]
+v_cmp_nge_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x49,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_nlt_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x5c,0x7c]
+v_cmp_nge_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x49,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlt_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x5c,0x7c]
+v_cmp_nlg_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x94,0x7c]
 
-v_cmp_nlt_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x5c,0x7c]
+v_cmp_nlg_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x94,0x7c]
 
-v_cmp_nlt_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x5c,0x7c]
+v_cmp_nlg_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x94,0x7c]
 
-v_cmp_nlt_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x5c,0x7c]
+v_cmp_nlg_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x94,0x7c]
 
-v_cmp_nlt_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x5c,0x7c]
+v_cmp_nlg_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x94,0x7c]
 
-v_cmp_nlt_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x5c,0x7c]
+v_cmp_nlg_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x94,0x7c]
 
-v_cmp_nlt_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x5c,0x7c]
+v_cmp_nlg_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x94,0x7c]
 
-v_cmp_nlt_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x5c,0x7c]
+v_cmp_nlg_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x94,0x7c]
 
-v_cmp_nlt_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x5c,0x7c]
+v_cmp_nlg_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x94,0x7c]
 
-v_cmp_nlt_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x5c,0x7c]
+v_cmp_nlg_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x94,0x7c]
 
-v_cmp_nlt_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x5c,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_nlg_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x94,0x7c]
 
-v_cmp_nlt_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x5c,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_nlg_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x94,0x7c]
 
-v_cmp_nlt_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x5c,0x7c]
+v_cmp_nlg_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x94,0x7c]
 
-v_cmp_nlt_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x5c,0x7c]
+v_cmp_nlg_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x94,0x7c]
 
-v_cmp_nlt_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x5d,0x7c]
+v_cmp_nlg_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x94,0x7c]
 
-v_cmp_nlt_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x2e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x94,0x7c]
 
-v_cmp_nlt_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x2e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x94,0x7c]
 
-v_cmp_nlt_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x2e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x94,0x7c]
 
-v_cmp_nlt_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x2e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x94,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_nlt_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x2e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x94,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_nlt_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x2e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x94,0x7c]
 
-v_cmp_nlt_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x2e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x94,0x7c]
 
-v_cmp_nlt_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x2e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x95,0x7c]
 
-v_cmp_nlt_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x2e,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_nlg_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlt_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x2e,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_nlg_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x4a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlt_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x2e,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_nlg_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x4a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlt_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x2e,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_nlg_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x4a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlt_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x2e,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_nlg_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x4a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlt_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x2e,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_nlg_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x4a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlt_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x2e,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_nlg_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x4a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlt_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x2e,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_nlg_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x4a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nlt_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x2e,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_nlg_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x4a,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_nlt_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x2e,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_nlg_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_nlt_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x2e,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_nlg_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x4a,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_nlt_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x2e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_tru_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x5e,0x7c]
+v_cmp_nlg_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_tru_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x5e,0x7c]
+v_cmp_nlg_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_tru_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x5e,0x7c]
+v_cmp_nlg_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_tru_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x5e,0x7c]
+v_cmp_nlg_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_tru_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x5e,0x7c]
+v_cmp_nlg_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_tru_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x5e,0x7c]
+v_cmp_nlg_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_tru_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x5e,0x7c]
+v_cmp_nlg_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_tru_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x5e,0x7c]
+v_cmp_nlg_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_tru_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x5e,0x7c]
+v_cmp_nlg_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_tru_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x5e,0x7c]
+v_cmp_nlg_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_tru_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x5e,0x7c]
+v_cmp_nlg_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_tru_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x5e,0x7c]
+v_cmp_nlg_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_tru_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x5e,0x7c]
+v_cmp_nlg_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_tru_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x5e,0x7c]
+v_cmp_nlg_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_tru_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x5e,0x7c]
+v_cmp_nlg_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_tru_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x5e,0x7c]
+v_cmp_nlg_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_tru_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x5e,0x7c]
+v_cmp_nlg_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_tru_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x5e,0x7c]
+v_cmp_nlg_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x4a,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_tru_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x5e,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_nlg_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x4a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_tru_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x5e,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_ngt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x96,0x7c]
 
-v_cmp_tru_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x5e,0x7c]
+v_cmp_ngt_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x96,0x7c]
 
-v_cmp_tru_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x5e,0x7c]
+v_cmp_ngt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x96,0x7c]
 
-v_cmp_tru_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x5f,0x7c]
+v_cmp_ngt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x96,0x7c]
 
-v_cmp_tru_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x2f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x96,0x7c]
 
-v_cmp_tru_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x2f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x96,0x7c]
 
-v_cmp_tru_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x2f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x96,0x7c]
 
-v_cmp_tru_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x2f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x96,0x7c]
 
-v_cmp_tru_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x2f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x96,0x7c]
 
-v_cmp_tru_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x2f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x96,0x7c]
 
-v_cmp_tru_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x2f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x96,0x7c]
 
-v_cmp_tru_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x2f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x96,0x7c]
 
-v_cmp_tru_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x2f,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x96,0x7c]
 
-v_cmp_tru_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x2f,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_ngt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x96,0x7c]
 
-v_cmp_tru_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x2f,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ngt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x96,0x7c]
 
-v_cmp_tru_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x2f,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_ngt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x96,0x7c]
 
-v_cmp_tru_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x2f,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ngt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x96,0x7c]
 
-v_cmp_tru_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x2f,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_ngt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x96,0x7c]
 
-v_cmp_tru_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x2f,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ngt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x96,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_tru_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x2f,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_ngt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x96,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_tru_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x2f,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_ngt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x96,0x7c]
 
-v_cmp_tru_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x2f,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_ngt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x96,0x7c]
 
-v_cmp_tru_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x2f,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_ngt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x97,0x7c]
 
-v_cmp_tru_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x2f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x60,0x7c]
+v_cmp_ngt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x4b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x60,0x7c]
+v_cmp_ngt_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x4b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x60,0x7c]
+v_cmp_ngt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x4b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x60,0x7c]
+v_cmp_ngt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x4b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x60,0x7c]
+v_cmp_ngt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x4b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x60,0x7c]
+v_cmp_ngt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x4b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x60,0x7c]
+v_cmp_ngt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x4b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x60,0x7c]
+v_cmp_ngt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x4b,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_f_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x60,0x7c]
+v_cmp_ngt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_f_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x60,0x7c]
+v_cmp_ngt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x4b,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_f_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x60,0x7c]
+v_cmp_ngt_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_f_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x60,0x7c]
+v_cmp_ngt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_f_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x60,0x7c]
+v_cmp_ngt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_f_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x60,0x7c]
+v_cmp_ngt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_f_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x60,0x7c]
+v_cmp_ngt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_f_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x60,0x7c]
+v_cmp_ngt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_f_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x60,0x7c]
+v_cmp_ngt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_f_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x60,0x7c]
+v_cmp_ngt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_f_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x60,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_f_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x60,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_f_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x60,0x7c]
+v_cmp_ngt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_f_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x60,0x7c]
+v_cmp_ngt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_f_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x61,0x7c]
+v_cmp_ngt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_f_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x30,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_f_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x30,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_f_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x30,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_f_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x30,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_f_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x30,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_f_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x30,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x4b,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_f_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x30,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x4b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x30,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x98,0x7c]
 
-v_cmpx_f_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x30,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_nle_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x98,0x7c]
 
-v_cmpx_f_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x30,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_nle_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x98,0x7c]
 
-v_cmpx_f_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x30,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_nle_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x98,0x7c]
 
-v_cmpx_f_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x30,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_nle_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x98,0x7c]
 
-v_cmpx_f_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x30,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_nle_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x98,0x7c]
 
-v_cmpx_f_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x30,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_nle_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x98,0x7c]
 
-v_cmpx_f_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x30,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_nle_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x98,0x7c]
 
-v_cmpx_f_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x30,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_nle_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x98,0x7c]
 
-v_cmpx_f_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x30,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_nle_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x98,0x7c]
 
-v_cmpx_f_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x30,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_nle_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x98,0x7c]
 
-v_cmpx_f_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x30,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_nle_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x98,0x7c]
 
-v_cmpx_f_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x30,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x98,0x7c]
 
-v_cmpx_lt_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x62,0x7c]
+v_cmp_nle_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x98,0x7c]
 
-v_cmpx_lt_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x62,0x7c]
+v_cmp_nle_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x98,0x7c]
 
-v_cmpx_lt_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x62,0x7c]
+v_cmp_nle_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x98,0x7c]
 
-v_cmpx_lt_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x62,0x7c]
+v_cmp_nle_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x98,0x7c]
 
-v_cmpx_lt_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x62,0x7c]
+v_cmp_nle_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x98,0x7c]
 
-v_cmpx_lt_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x62,0x7c]
+v_cmp_nle_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x98,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_lt_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x62,0x7c]
+v_cmp_nle_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x98,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_lt_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x62,0x7c]
+v_cmp_nle_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x98,0x7c]
 
-v_cmpx_lt_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x62,0x7c]
+v_cmp_nle_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x98,0x7c]
 
-v_cmpx_lt_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x62,0x7c]
+v_cmp_nle_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x99,0x7c]
 
-v_cmpx_lt_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x62,0x7c]
+v_cmp_nle_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x62,0x7c]
+v_cmp_nle_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x4c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x62,0x7c]
+v_cmp_nle_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x4c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x62,0x7c]
+v_cmp_nle_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x4c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x62,0x7c]
+v_cmp_nle_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x4c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x62,0x7c]
+v_cmp_nle_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x4c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x62,0x7c]
+v_cmp_nle_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x4c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x62,0x7c]
+v_cmp_nle_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x4c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x62,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x4c,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_lt_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x62,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_lt_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x62,0x7c]
+v_cmp_nle_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x4c,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_lt_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x62,0x7c]
+v_cmp_nle_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_lt_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x63,0x7c]
+v_cmp_nle_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_lt_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x31,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_lt_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x31,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_lt_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x31,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_lt_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x31,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_lt_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x31,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_lt_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x31,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_lt_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x31,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_lt_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x31,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_lt_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x31,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_lt_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x31,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_lt_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x31,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_lt_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x31,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_lt_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x31,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_lt_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x31,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_lt_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x31,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_lt_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x31,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_nle_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_lt_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x31,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_nle_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x4c,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_lt_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x31,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_nle_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x4c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x31,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_neq_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x9a,0x7c]
 
-v_cmpx_lt_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x31,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x9a,0x7c]
 
-v_cmpx_eq_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x64,0x7c]
+v_cmp_neq_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x9a,0x7c]
 
-v_cmpx_eq_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x64,0x7c]
+v_cmp_neq_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x9a,0x7c]
 
-v_cmpx_eq_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x64,0x7c]
+v_cmp_neq_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x9a,0x7c]
 
-v_cmpx_eq_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x64,0x7c]
+v_cmp_neq_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x9a,0x7c]
 
-v_cmpx_eq_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x64,0x7c]
+v_cmp_neq_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x9a,0x7c]
 
-v_cmpx_eq_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x64,0x7c]
+v_cmp_neq_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x9a,0x7c]
 
-v_cmpx_eq_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x64,0x7c]
+v_cmp_neq_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x9a,0x7c]
 
-v_cmpx_eq_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x64,0x7c]
+v_cmp_neq_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x9a,0x7c]
 
-v_cmpx_eq_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x64,0x7c]
+v_cmp_neq_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x9a,0x7c]
 
-v_cmpx_eq_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x64,0x7c]
+v_cmp_neq_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x9a,0x7c]
 
-v_cmpx_eq_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x64,0x7c]
+v_cmp_neq_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x9a,0x7c]
 
-v_cmpx_eq_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x64,0x7c]
+v_cmp_neq_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x9a,0x7c]
 
-v_cmpx_eq_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x64,0x7c]
+v_cmp_neq_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x9a,0x7c]
 
-v_cmpx_eq_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x64,0x7c]
+v_cmp_neq_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x9a,0x7c]
 
-v_cmpx_eq_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x64,0x7c]
+v_cmp_neq_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x9a,0x7c]
 
-v_cmpx_eq_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x64,0x7c]
+v_cmp_neq_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x9a,0x7c]
 
-v_cmpx_eq_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x64,0x7c]
+v_cmp_neq_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x9a,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_eq_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x64,0x7c]
+v_cmp_neq_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x9a,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_eq_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x64,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_neq_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x9a,0x7c]
 
-v_cmpx_eq_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x64,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_neq_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x9a,0x7c]
 
-v_cmpx_eq_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x64,0x7c]
+v_cmp_neq_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x9b,0x7c]
 
-v_cmpx_eq_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x64,0x7c]
+v_cmp_neq_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x65,0x7c]
+v_cmp_neq_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x4d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x32,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x4d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x32,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x4d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x32,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x4d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x32,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x4d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x32,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x4d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x32,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x4d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x32,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x4d,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_eq_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x32,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_eq_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x32,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_neq_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x4d,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_eq_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x32,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_eq_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x32,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_eq_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x32,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_eq_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x32,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_eq_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x32,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_eq_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x32,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_eq_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x32,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_eq_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x32,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_neq_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_eq_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x32,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_neq_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_eq_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x32,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_neq_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_eq_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x32,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_le_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x66,0x7c]
+v_cmp_neq_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_le_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x66,0x7c]
+v_cmp_neq_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_le_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x66,0x7c]
+v_cmp_neq_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_le_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x66,0x7c]
+v_cmp_neq_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_le_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x66,0x7c]
+v_cmp_neq_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_le_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x66,0x7c]
+v_cmp_neq_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_le_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x66,0x7c]
+v_cmp_neq_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_le_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x66,0x7c]
+v_cmp_neq_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x4d,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_le_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x66,0x7c]
+v_cmp_neq_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x4d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x66,0x7c]
+v_cmp_nlt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x9c,0x7c]
 
-v_cmpx_le_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x66,0x7c]
+v_cmp_nlt_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x9c,0x7c]
 
-v_cmpx_le_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x66,0x7c]
+v_cmp_nlt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x9c,0x7c]
 
-v_cmpx_le_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x66,0x7c]
+v_cmp_nlt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x9c,0x7c]
 
-v_cmpx_le_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x66,0x7c]
+v_cmp_nlt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x9c,0x7c]
 
-v_cmpx_le_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x66,0x7c]
+v_cmp_nlt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x9c,0x7c]
 
-v_cmpx_le_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x66,0x7c]
+v_cmp_nlt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x9c,0x7c]
 
-v_cmpx_le_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x66,0x7c]
+v_cmp_nlt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x9c,0x7c]
 
-v_cmpx_le_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x66,0x7c]
+v_cmp_nlt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x9c,0x7c]
 
-v_cmpx_le_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x66,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_nlt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x9c,0x7c]
 
-v_cmpx_le_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x66,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_nlt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x9c,0x7c]
 
-v_cmpx_le_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x66,0x7c]
+v_cmp_nlt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x9c,0x7c]
 
-v_cmpx_le_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x66,0x7c]
+v_cmp_nlt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x9c,0x7c]
 
-v_cmpx_le_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x67,0x7c]
+v_cmp_nlt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x9c,0x7c]
 
-v_cmpx_le_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x33,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x9c,0x7c]
 
-v_cmpx_le_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x33,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x9c,0x7c]
 
-v_cmpx_le_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x33,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x9c,0x7c]
 
-v_cmpx_le_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x33,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x9c,0x7c]
 
-v_cmpx_le_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x33,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x9c,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_le_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x33,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x9c,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_le_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x33,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x9c,0x7c]
 
-v_cmpx_le_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x33,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x9c,0x7c]
 
-v_cmpx_le_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x33,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_nlt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x9d,0x7c]
 
-v_cmpx_le_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x33,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_nlt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x33,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_nlt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x4e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x33,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_nlt_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x4e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x33,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_nlt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x4e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x33,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_nlt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x4e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x33,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_nlt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x4e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x33,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_nlt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x4e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x33,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_nlt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x4e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x33,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_nlt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x4e,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_le_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x33,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_nlt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_le_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x33,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x4e,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_gt_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x68,0x7c]
+v_cmp_nlt_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_gt_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x68,0x7c]
+v_cmp_nlt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_gt_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x68,0x7c]
+v_cmp_nlt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_gt_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x68,0x7c]
+v_cmp_nlt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_gt_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x68,0x7c]
+v_cmp_nlt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_gt_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x68,0x7c]
+v_cmp_nlt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_gt_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x68,0x7c]
+v_cmp_nlt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_gt_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x68,0x7c]
+v_cmp_nlt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_gt_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x68,0x7c]
+v_cmp_nlt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_gt_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x68,0x7c]
+v_cmp_nlt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_gt_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x68,0x7c]
+v_cmp_nlt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_gt_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x68,0x7c]
+v_cmp_nlt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_gt_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x68,0x7c]
+v_cmp_nlt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_gt_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x68,0x7c]
+v_cmp_nlt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_gt_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x68,0x7c]
+v_cmp_nlt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_gt_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x68,0x7c]
+v_cmp_nlt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_gt_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x68,0x7c]
+v_cmp_nlt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_gt_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x68,0x7c]
+v_cmp_nlt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_gt_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x68,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_nlt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x4e,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_gt_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x68,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_nlt_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x4e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x68,0x7c]
+v_cmp_tru_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x9e,0x7c]
 
-v_cmpx_gt_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x68,0x7c]
+v_cmp_tru_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x9e,0x7c]
 
-v_cmpx_gt_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x69,0x7c]
+v_cmp_tru_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x9e,0x7c]
 
-v_cmpx_gt_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x34,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x9e,0x7c]
 
-v_cmpx_gt_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x34,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x9e,0x7c]
 
-v_cmpx_gt_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x34,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x9e,0x7c]
 
-v_cmpx_gt_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x34,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x9e,0x7c]
 
-v_cmpx_gt_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x34,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x9e,0x7c]
 
-v_cmpx_gt_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x34,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x9e,0x7c]
 
-v_cmpx_gt_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x34,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x9e,0x7c]
 
-v_cmpx_gt_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x34,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x9e,0x7c]
 
-v_cmpx_gt_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x34,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x9e,0x7c]
 
-v_cmpx_gt_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x34,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x9e,0x7c]
 
-v_cmpx_gt_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x34,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_tru_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x9e,0x7c]
 
-v_cmpx_gt_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x34,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_tru_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x9e,0x7c]
 
-v_cmpx_gt_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x34,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_tru_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x9e,0x7c]
 
-v_cmpx_gt_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x34,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_tru_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x9e,0x7c]
 
-v_cmpx_gt_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x34,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_tru_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x9e,0x7c]
 
-v_cmpx_gt_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x34,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_tru_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x9e,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_gt_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x34,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_tru_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x9e,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_gt_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x34,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_tru_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x9e,0x7c]
 
-v_cmpx_gt_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x34,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_tru_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x9e,0x7c]
 
-v_cmpx_gt_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x34,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x9f,0x7c]
 
-v_cmpx_lg_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x6a,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x6a,0x7c]
+v_cmp_tru_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x4f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x6a,0x7c]
+v_cmp_tru_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x4f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x6a,0x7c]
+v_cmp_tru_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x4f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x6a,0x7c]
+v_cmp_tru_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x4f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x6a,0x7c]
+v_cmp_tru_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x4f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x6a,0x7c]
+v_cmp_tru_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x4f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x6a,0x7c]
+v_cmp_tru_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x4f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x6a,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x4f,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x6a,0x7c]
+v_cmp_tru_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x6a,0x7c]
+v_cmp_tru_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x4f,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x6a,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x6a,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x6a,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x6a,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x6a,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x6a,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x6a,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x6a,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmp_tru_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x6a,0x7c,0x56,0x34,0x00,0x00]
+v_cmp_tru_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x6a,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x6a,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_lg_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x6b,0x7c]
+v_cmp_tru_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_lg_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x35,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_lg_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x35,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_lg_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x35,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_lg_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x35,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_lg_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x35,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_lg_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x35,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_lg_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x35,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x4f,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_lg_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x35,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x4f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lg_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x35,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_f_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa0,0x7c]
 
-v_cmpx_lg_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x35,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_f_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xa0,0x7c]
 
-v_cmpx_lg_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x35,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_f_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xa0,0x7c]
 
-v_cmpx_lg_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x35,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_f_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xa0,0x7c]
 
-v_cmpx_lg_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x35,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_f_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa0,0x7c]
 
-v_cmpx_lg_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x35,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_f_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa0,0x7c]
 
-v_cmpx_lg_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x35,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_f_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa0,0x7c]
 
-v_cmpx_lg_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x35,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_f_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa0,0x7c]
 
-v_cmpx_lg_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x35,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_f_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa0,0x7c]
 
-v_cmpx_lg_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x35,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_f_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa0,0x7c]
 
-v_cmpx_lg_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x35,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_f_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa0,0x7c]
 
-v_cmpx_lg_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x35,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa0,0x7c]
 
-v_cmpx_ge_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x6c,0x7c]
+v_cmpx_f_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa0,0x7c]
 
-v_cmpx_ge_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x6c,0x7c]
+v_cmpx_f_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa0,0x7c]
 
-v_cmpx_ge_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x6c,0x7c]
+v_cmpx_f_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa0,0x7c]
 
-v_cmpx_ge_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x6c,0x7c]
+v_cmpx_f_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa0,0x7c]
 
-v_cmpx_ge_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x6c,0x7c]
+v_cmpx_f_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa0,0x7c]
 
-v_cmpx_ge_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x6c,0x7c]
+v_cmpx_f_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa0,0x7c]
 
-v_cmpx_ge_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x6c,0x7c]
+v_cmpx_f_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa0,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ge_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x6c,0x7c]
+v_cmpx_f_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa0,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ge_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x6c,0x7c]
+v_cmpx_f_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa0,0x7c]
 
-v_cmpx_ge_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x6c,0x7c]
+v_cmpx_f_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa0,0x7c]
 
-v_cmpx_ge_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x6c,0x7c]
+v_cmpx_f_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa1,0x7c]
 
-v_cmpx_ge_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x6c,0x7c]
+v_cmpx_f_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x6c,0x7c]
+v_cmpx_f_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x50,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x6c,0x7c]
+v_cmpx_f_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x50,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x6c,0x7c]
+v_cmpx_f_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x50,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x6c,0x7c]
+v_cmpx_f_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x50,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x6c,0x7c]
+v_cmpx_f_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x50,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x6c,0x7c]
+v_cmpx_f_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x50,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x6c,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmpx_f_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x50,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x6c,0x7c,0x56,0x34,0x00,0x00]
+v_cmpx_f_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x50,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_ge_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x6c,0x7c]
+v_cmpx_f_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x50,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_ge_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x6c,0x7c]
+v_cmpx_f_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x50,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_ge_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x6d,0x7c]
+v_cmpx_f_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_ge_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x36,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_ge_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x36,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_ge_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x36,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_ge_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x36,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_ge_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x36,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_ge_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x36,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_ge_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x36,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_ge_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x36,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_ge_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x36,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_f_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_ge_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x36,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_f_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_ge_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x36,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_f_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_ge_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x36,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_f_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_ge_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x36,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_f_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_ge_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x36,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_f_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_ge_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x36,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_f_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_ge_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x36,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_f_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_ge_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x36,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_f_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_ge_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x36,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_f_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x50,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_ge_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x36,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_f_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x50,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x36,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa2,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_o_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x6e,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmpx_lt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa2,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_o_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x6e,0x7c,0x56,0x34,0x00,0x00]
+v_cmpx_lt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa2,0x7c]
 
-v_cmpx_o_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x6e,0x7c]
+v_cmpx_lt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa3,0x7c]
 
-v_cmpx_o_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x6f,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x37,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x51,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x37,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x51,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x37,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x51,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x37,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x51,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x37,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x51,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x37,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x51,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x37,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x51,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x37,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x51,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_o_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x37,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x51,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_o_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x37,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x51,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_o_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x37,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_lt_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_o_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x37,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_lt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_o_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x37,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_lt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_o_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x37,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_lt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_o_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x37,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_lt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_o_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x37,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_lt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_o_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x37,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_lt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_o_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x37,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_lt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_o_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x37,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_lt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_o_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x37,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_u_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x70,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_u_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x70,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_u_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x70,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_u_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x70,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_u_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x70,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_u_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x70,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_u_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x70,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_u_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x70,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_u_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x70,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x51,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_u_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x70,0x7c]
+v_cmpx_lt_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x51,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_u_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x70,0x7c]
+v_cmpx_eq_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa4,0x7c]
 
-v_cmpx_u_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x70,0x7c]
+v_cmpx_eq_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xa4,0x7c]
 
-v_cmpx_u_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x70,0x7c]
+v_cmpx_eq_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xa4,0x7c]
 
-v_cmpx_u_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x70,0x7c]
+v_cmpx_eq_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xa4,0x7c]
 
-v_cmpx_u_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x70,0x7c]
+v_cmpx_eq_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa4,0x7c]
 
-v_cmpx_u_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x70,0x7c]
+v_cmpx_eq_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa4,0x7c]
 
-v_cmpx_u_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x70,0x7c]
+v_cmpx_eq_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa4,0x7c]
 
-v_cmpx_u_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x70,0x7c]
+v_cmpx_eq_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa4,0x7c]
 
-v_cmpx_u_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x70,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmpx_eq_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa4,0x7c]
 
-v_cmpx_u_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x70,0x7c,0x56,0x34,0x00,0x00]
+v_cmpx_eq_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa4,0x7c]
 
-v_cmpx_u_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x70,0x7c]
+v_cmpx_eq_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa4,0x7c]
 
-v_cmpx_u_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x70,0x7c]
+v_cmpx_eq_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa4,0x7c]
 
-v_cmpx_u_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x71,0x7c]
+v_cmpx_eq_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa4,0x7c]
 
-v_cmpx_u_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x38,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa4,0x7c]
 
-v_cmpx_u_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x38,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa4,0x7c]
 
-v_cmpx_u_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x38,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa4,0x7c]
 
-v_cmpx_u_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x38,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa4,0x7c]
 
-v_cmpx_u_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x38,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa4,0x7c]
 
-v_cmpx_u_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x38,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa4,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_u_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x38,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa4,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_u_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x38,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa4,0x7c]
 
-v_cmpx_u_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x38,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_eq_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa4,0x7c]
 
-v_cmpx_u_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x38,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_eq_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa5,0x7c]
 
-v_cmpx_u_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x38,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_u_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x38,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_eq_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x52,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_u_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x38,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_eq_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x52,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_u_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x38,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_eq_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x52,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_u_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x38,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_eq_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x52,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_u_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x38,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_eq_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x52,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_u_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x38,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_eq_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x52,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_u_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x38,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_eq_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x52,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_u_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x38,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_eq_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x52,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_u_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x38,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x52,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_nge_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x72,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x52,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_nge_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x72,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_nge_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x72,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_nge_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x72,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_nge_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x72,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_nge_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x72,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_nge_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x72,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_nge_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x72,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_nge_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x72,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_nge_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x72,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_nge_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x72,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_nge_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x72,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_nge_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x72,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_nge_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x72,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_nge_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x72,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_nge_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x72,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_nge_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x72,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_nge_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x72,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_nge_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x72,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_nge_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x72,0x7c,0x56,0x34,0x00,0x00]
+v_cmpx_eq_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x52,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_nge_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x72,0x7c]
+v_cmpx_eq_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x52,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nge_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x72,0x7c]
+v_cmpx_le_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa6,0x7c]
 
-v_cmpx_nge_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x73,0x7c]
+v_cmpx_le_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xa6,0x7c]
 
-v_cmpx_nge_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x39,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xa6,0x7c]
 
-v_cmpx_nge_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x39,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xa6,0x7c]
 
-v_cmpx_nge_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x39,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa6,0x7c]
 
-v_cmpx_nge_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x39,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa6,0x7c]
 
-v_cmpx_nge_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x39,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa6,0x7c]
 
-v_cmpx_nge_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x39,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa6,0x7c]
 
-v_cmpx_nge_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x39,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa6,0x7c]
 
-v_cmpx_nge_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x39,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa6,0x7c]
 
-v_cmpx_nge_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x39,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_le_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa6,0x7c]
 
-v_cmpx_nge_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x39,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_le_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa6,0x7c]
 
-v_cmpx_nge_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x39,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_le_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa6,0x7c]
 
-v_cmpx_nge_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x39,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_le_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa6,0x7c]
 
-v_cmpx_nge_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x39,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_le_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa6,0x7c]
 
-v_cmpx_nge_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x39,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_le_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa6,0x7c]
 
-v_cmpx_nge_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x39,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_le_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa6,0x7c]
 
-v_cmpx_nge_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x39,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_le_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa6,0x7c]
 
-v_cmpx_nge_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x39,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_le_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa6,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_nge_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x39,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_le_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa6,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_nge_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x39,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_le_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa6,0x7c]
 
-v_cmpx_nge_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x39,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa6,0x7c]
 
-v_cmpx_nlg_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x74,0x7c]
+v_cmpx_le_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa7,0x7c]
 
-v_cmpx_nlg_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x74,0x7c]
+v_cmpx_le_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x74,0x7c]
+v_cmpx_le_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x53,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x74,0x7c]
+v_cmpx_le_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x53,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x74,0x7c]
+v_cmpx_le_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x53,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x74,0x7c]
+v_cmpx_le_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x53,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x74,0x7c]
+v_cmpx_le_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x53,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x74,0x7c]
+v_cmpx_le_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x53,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x74,0x7c]
+v_cmpx_le_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x53,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x74,0x7c]
+v_cmpx_le_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x53,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x74,0x7c]
+v_cmpx_le_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x53,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x74,0x7c]
+v_cmpx_le_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x53,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x74,0x7c]
+v_cmpx_le_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x74,0x7c]
+v_cmpx_le_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x74,0x7c]
+v_cmpx_le_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x74,0x7c]
+v_cmpx_le_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x74,0x7c]
+v_cmpx_le_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x74,0x7c]
+v_cmpx_le_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x74,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x74,0x7c,0x56,0x34,0x00,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x74,0x7c]
+v_cmpx_le_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x74,0x7c]
+v_cmpx_le_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_nlg_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x75,0x7c]
+v_cmpx_le_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_nlg_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x3a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_nlg_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x3a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_nlg_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x3a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_nlg_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x3a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_nlg_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x3a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_nlg_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x3a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_nlg_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x3a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_nlg_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x3a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x53,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_nlg_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x3a,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_le_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x53,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x3a,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_gt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa8,0x7c]
 
-v_cmpx_nlg_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x3a,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_gt_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xa8,0x7c]
 
-v_cmpx_nlg_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x3a,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_gt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xa8,0x7c]
 
-v_cmpx_nlg_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x3a,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_gt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xa8,0x7c]
 
-v_cmpx_nlg_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x3a,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_gt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa8,0x7c]
 
-v_cmpx_nlg_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x3a,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_gt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa8,0x7c]
 
-v_cmpx_nlg_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x3a,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_gt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa8,0x7c]
 
-v_cmpx_nlg_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x3a,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_gt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa8,0x7c]
 
-v_cmpx_nlg_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x3a,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_gt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa8,0x7c]
 
-v_cmpx_nlg_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x3a,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_gt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa8,0x7c]
 
-v_cmpx_nlg_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x3a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa8,0x7c]
 
-v_cmpx_ngt_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x76,0x7c]
+v_cmpx_gt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa8,0x7c]
 
-v_cmpx_ngt_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x76,0x7c]
+v_cmpx_gt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa8,0x7c]
 
-v_cmpx_ngt_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x76,0x7c]
+v_cmpx_gt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa8,0x7c]
 
-v_cmpx_ngt_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x76,0x7c]
+v_cmpx_gt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa8,0x7c]
 
-v_cmpx_ngt_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x76,0x7c]
+v_cmpx_gt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa8,0x7c]
 
-v_cmpx_ngt_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x76,0x7c]
+v_cmpx_gt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa8,0x7c]
 
-v_cmpx_ngt_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x76,0x7c]
+v_cmpx_gt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa8,0x7c]
 
-v_cmpx_ngt_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x76,0x7c]
+v_cmpx_gt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa8,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ngt_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x76,0x7c]
+v_cmpx_gt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa8,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ngt_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x76,0x7c]
+v_cmpx_gt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa8,0x7c]
 
-v_cmpx_ngt_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x76,0x7c]
+v_cmpx_gt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa8,0x7c]
 
-v_cmpx_ngt_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x76,0x7c]
+v_cmpx_gt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa9,0x7c]
 
-v_cmpx_ngt_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x76,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x76,0x7c]
+v_cmpx_gt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x54,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x76,0x7c]
+v_cmpx_gt_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x54,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x76,0x7c]
+v_cmpx_gt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x54,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x76,0x7c]
+v_cmpx_gt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x54,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x76,0x7c]
+v_cmpx_gt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x54,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x76,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmpx_gt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x54,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x76,0x7c,0x56,0x34,0x00,0x00]
+v_cmpx_gt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x54,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x76,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x54,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_ngt_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x76,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x54,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_ngt_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x77,0x7c]
+v_cmpx_gt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x54,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_ngt_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x3b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_ngt_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x3b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_ngt_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x3b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_ngt_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x3b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_ngt_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x3b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_ngt_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x3b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_ngt_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x3b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_ngt_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x3b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_ngt_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x3b,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_gt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_ngt_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x3b,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_gt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_ngt_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x3b,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_gt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_ngt_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x3b,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_gt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_ngt_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x3b,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_gt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_ngt_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x3b,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_gt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_ngt_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x3b,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_gt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_ngt_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x3b,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_gt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_ngt_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x3b,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_gt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_ngt_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x3b,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_gt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_ngt_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x3b,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_gt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x54,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_ngt_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x3b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x54,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nle_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x78,0x7c]
+v_cmpx_lg_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x78,0x7c]
+v_cmpx_lg_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x78,0x7c]
+v_cmpx_lg_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x78,0x7c]
+v_cmpx_lg_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x78,0x7c]
+v_cmpx_lg_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x78,0x7c]
+v_cmpx_lg_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x78,0x7c]
+v_cmpx_lg_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x78,0x7c]
+v_cmpx_lg_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x78,0x7c]
+v_cmpx_lg_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x78,0x7c]
+v_cmpx_lg_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x78,0x7c]
+v_cmpx_lg_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x78,0x7c]
+v_cmpx_lg_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x78,0x7c]
+v_cmpx_lg_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x78,0x7c]
+v_cmpx_lg_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x78,0x7c]
+v_cmpx_lg_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x78,0x7c]
+v_cmpx_lg_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x78,0x7c]
+v_cmpx_lg_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x78,0x7c]
+v_cmpx_lg_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x78,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmpx_lg_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xaa,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_nle_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x78,0x7c,0x56,0x34,0x00,0x00]
+v_cmpx_lg_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xaa,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_nle_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x78,0x7c]
+v_cmpx_lg_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x78,0x7c]
+v_cmpx_lg_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xaa,0x7c]
 
-v_cmpx_nle_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x79,0x7c]
+v_cmpx_lg_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xab,0x7c]
 
-v_cmpx_nle_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x3c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nle_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x3c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x55,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nle_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x3c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x55,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nle_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x3c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x55,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nle_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x3c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x55,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nle_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x3c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x55,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nle_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x3c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x55,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nle_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x3c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x55,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nle_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x3c,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_lg_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x55,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_nle_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x3c,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_lg_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x55,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_nle_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x3c,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_lg_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x55,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_nle_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x3c,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_lg_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_nle_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x3c,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_lg_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_nle_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x3c,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_lg_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_nle_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x3c,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_lg_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_nle_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x3c,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_lg_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_nle_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x3c,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_lg_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_nle_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x3c,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_lg_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_nle_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x3c,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_lg_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_nle_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x3c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_neq_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x7a,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_neq_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x7a,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_neq_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x7a,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_neq_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x7a,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_neq_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x7a,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_neq_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x7a,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_neq_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x7a,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_neq_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x7a,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_neq_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x7a,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_neq_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x7a,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x55,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_neq_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x7a,0x7c]
+v_cmpx_lg_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x55,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x7a,0x7c]
+v_cmpx_ge_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xac,0x7c]
 
-v_cmpx_neq_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x7a,0x7c]
+v_cmpx_ge_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xac,0x7c]
 
-v_cmpx_neq_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x7a,0x7c]
+v_cmpx_ge_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xac,0x7c]
 
-v_cmpx_neq_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x7a,0x7c]
+v_cmpx_ge_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xac,0x7c]
 
-v_cmpx_neq_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x7a,0x7c]
+v_cmpx_ge_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xac,0x7c]
 
-v_cmpx_neq_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x7a,0x7c]
+v_cmpx_ge_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xac,0x7c]
 
-v_cmpx_neq_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x7a,0x7c]
+v_cmpx_ge_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xac,0x7c]
 
-v_cmpx_neq_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x7a,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmpx_ge_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xac,0x7c]
 
-v_cmpx_neq_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x7a,0x7c,0x56,0x34,0x00,0x00]
+v_cmpx_ge_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xac,0x7c]
 
-v_cmpx_neq_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x7a,0x7c]
+v_cmpx_ge_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xac,0x7c]
 
-v_cmpx_neq_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x7a,0x7c]
+v_cmpx_ge_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xac,0x7c]
 
-v_cmpx_neq_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x7b,0x7c]
+v_cmpx_ge_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xac,0x7c]
 
-v_cmpx_neq_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x3d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xac,0x7c]
 
-v_cmpx_neq_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x3d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xac,0x7c]
 
-v_cmpx_neq_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x3d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xac,0x7c]
 
-v_cmpx_neq_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x3d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xac,0x7c]
 
-v_cmpx_neq_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x3d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xac,0x7c]
 
-v_cmpx_neq_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x3d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xac,0x7c]
 
-v_cmpx_neq_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x3d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xac,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_neq_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x3d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xac,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_neq_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x3d,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ge_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xac,0x7c]
 
-v_cmpx_neq_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x3d,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_ge_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xac,0x7c]
 
-v_cmpx_neq_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x3d,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ge_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xad,0x7c]
 
-v_cmpx_neq_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x3d,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_ge_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x3d,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ge_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x56,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x3d,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_ge_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x56,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x3d,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ge_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x56,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x3d,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_ge_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x56,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x3d,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_ge_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x56,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x3d,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_ge_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x56,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x3d,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_ge_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x56,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x3d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x56,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_nlt_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x56,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_nlt_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x56,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_nlt_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_nlt_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_nlt_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_nlt_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_nlt_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_nlt_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_nlt_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_nlt_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_nlt_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_nlt_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_nlt_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_nlt_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_nlt_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_nlt_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_nlt_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_nlt_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_nlt_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x7c,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmpx_ge_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_nlt_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x7c,0x7c,0x56,0x34,0x00,0x00]
+v_cmpx_ge_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_nlt_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x56,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_nlt_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x7c,0x7c]
+v_cmpx_ge_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x56,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlt_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x7d,0x7c]
+v_cmpx_o_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xae,0x7c]
 
-v_cmpx_nlt_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x3e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xae,0x7c]
 
-v_cmpx_nlt_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x3e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xae,0x7c]
 
-v_cmpx_nlt_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x3e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xae,0x7c]
 
-v_cmpx_nlt_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x3e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xae,0x7c]
 
-v_cmpx_nlt_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x3e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xae,0x7c]
 
-v_cmpx_nlt_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x3e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xae,0x7c]
 
-v_cmpx_nlt_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x3e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xae,0x7c]
 
-v_cmpx_nlt_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x3e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xae,0x7c]
 
-v_cmpx_nlt_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x3e,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_o_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xae,0x7c]
 
-v_cmpx_nlt_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x3e,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_o_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xae,0x7c]
 
-v_cmpx_nlt_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x3e,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_o_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xae,0x7c]
 
-v_cmpx_nlt_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x3e,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_o_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xae,0x7c]
 
-v_cmpx_nlt_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x3e,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_o_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xae,0x7c]
 
-v_cmpx_nlt_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x3e,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_o_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xae,0x7c]
 
-v_cmpx_nlt_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x3e,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_o_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xae,0x7c]
 
-v_cmpx_nlt_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x3e,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_o_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xae,0x7c]
 
-v_cmpx_nlt_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x3e,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_o_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xae,0x7c]
 
-v_cmpx_nlt_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x3e,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_o_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xae,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_nlt_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x3e,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_o_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xae,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_nlt_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x3e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xae,0x7c]
 
-v_cmpx_tru_f16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x7e,0x7c]
+v_cmpx_o_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xae,0x7c]
 
-v_cmpx_tru_f16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x7e,0x7c]
+v_cmpx_o_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xaf,0x7c]
 
-v_cmpx_tru_f16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x7e,0x7c]
+v_cmpx_o_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x7e,0x7c]
+v_cmpx_o_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x57,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x7e,0x7c]
+v_cmpx_o_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x57,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x7e,0x7c]
+v_cmpx_o_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x57,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x7e,0x7c]
+v_cmpx_o_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x57,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x7e,0x7c]
+v_cmpx_o_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x57,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x7e,0x7c]
+v_cmpx_o_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x57,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x7e,0x7c]
+v_cmpx_o_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x57,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x7e,0x7c]
+v_cmpx_o_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x57,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x7e,0x7c]
+v_cmpx_o_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x57,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x7e,0x7c]
+v_cmpx_o_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x57,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x7e,0x7c]
+v_cmpx_o_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x7e,0x7c]
+v_cmpx_o_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x7e,0x7c]
+v_cmpx_o_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x7e,0x7c]
+v_cmpx_o_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x7e,0x7c]
+v_cmpx_o_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x7e,0x7c,0x0b,0xfe,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x7e,0x7c,0x56,0x34,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x7e,0x7c]
+v_cmpx_o_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x7e,0x7c]
+v_cmpx_o_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_tru_f16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x7f,0x7c]
+v_cmpx_o_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_tru_f16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x3f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_tru_f16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x3f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_tru_f16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x3f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_tru_f16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x3f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_tru_f16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x3f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_tru_f16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x3f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmpx_tru_f16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x3f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_tru_f16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x3f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_tru_f16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x3f,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x57,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmpx_tru_f16_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x3f,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_o_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x57,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_tru_f16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x3f,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_u_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xb0,0x7c]
 
-v_cmpx_tru_f16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x3f,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_u_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xb0,0x7c]
 
-v_cmpx_tru_f16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x3f,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_u_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xb0,0x7c]
 
-v_cmpx_tru_f16_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x3f,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_u_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xb0,0x7c]
 
-v_cmpx_tru_f16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x3f,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_u_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xb0,0x7c]
 
-v_cmpx_tru_f16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x3f,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_u_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xb0,0x7c]
 
-v_cmpx_tru_f16_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x3f,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_u_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xb0,0x7c]
 
-v_cmpx_tru_f16_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x3f,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_u_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xb0,0x7c]
 
-v_cmpx_tru_f16_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x3f,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_u_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xb0,0x7c]
 
-v_cmpx_tru_f16_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x3f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xb0,0x7c]
 
-v_cmp_f_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x80,0x7c]
+v_cmpx_u_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xb0,0x7c]
 
-v_cmp_f_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x80,0x7c]
+v_cmpx_u_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xb0,0x7c]
 
-v_cmp_f_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x80,0x7c]
+v_cmpx_u_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xb0,0x7c]
 
-v_cmp_f_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x80,0x7c]
+v_cmpx_u_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xb0,0x7c]
 
-v_cmp_f_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x80,0x7c]
+v_cmpx_u_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xb0,0x7c]
 
-v_cmp_f_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x80,0x7c]
+v_cmpx_u_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xb0,0x7c]
 
-v_cmp_f_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x80,0x7c]
+v_cmpx_u_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xb0,0x7c]
 
-v_cmp_f_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x80,0x7c]
+v_cmpx_u_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xb0,0x7c]
 
-v_cmp_f_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x80,0x7c]
+v_cmpx_u_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xb0,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_f_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x80,0x7c]
+v_cmpx_u_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xb0,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_f_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x80,0x7c]
+v_cmpx_u_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xb0,0x7c]
 
-v_cmp_f_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x80,0x7c]
+v_cmpx_u_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xb0,0x7c]
 
-v_cmp_f_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x80,0x7c]
+v_cmpx_u_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xb1,0x7c]
 
-v_cmp_f_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x80,0x7c]
+v_cmpx_u_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x80,0x7c]
+v_cmpx_u_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x58,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x80,0x7c]
+v_cmpx_u_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x58,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x80,0x7c]
+v_cmpx_u_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x58,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x80,0x7c]
+v_cmpx_u_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x58,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x80,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_u_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x58,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x80,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_u_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x58,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x80,0x7c]
+v_cmpx_u_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x58,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x80,0x7c]
+v_cmpx_u_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x58,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_f_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x81,0x7c]
+v_cmpx_u_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x58,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_f_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x58,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_f_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x40,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_f_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x40,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_f_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x40,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_f_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x40,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_f_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x40,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_f_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x40,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_f_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x40,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_f_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x40,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_f_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x40,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_f_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x40,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_f_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_f_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x40,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_f_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_f_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_f_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_f_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_f_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_u_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_f_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_u_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_f_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_u_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x58,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_f_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x40,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_u_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x58,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x40,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x82,0x7c]
+v_cmpx_nge_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x82,0x7c]
+v_cmpx_nge_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x82,0x7c]
+v_cmpx_nge_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x82,0x7c]
+v_cmpx_nge_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x82,0x7c]
+v_cmpx_nge_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x82,0x7c]
+v_cmpx_nge_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x82,0x7c]
+v_cmpx_nge_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x82,0x7c]
+v_cmpx_nge_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x82,0x7c]
+v_cmpx_nge_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x82,0x7c]
+v_cmpx_nge_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x82,0x7c]
+v_cmpx_nge_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x82,0x7c]
+v_cmpx_nge_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x82,0x7c]
+v_cmpx_nge_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x82,0x7c]
+v_cmpx_nge_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x82,0x7c]
+v_cmpx_nge_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x82,0x7c]
+v_cmpx_nge_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x82,0x7c]
+v_cmpx_nge_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x82,0x7c]
+v_cmpx_nge_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xb2,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_lt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x82,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_nge_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xb2,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_lt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x82,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_nge_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x82,0x7c]
+v_cmpx_nge_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xb2,0x7c]
 
-v_cmp_lt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x82,0x7c]
+v_cmpx_nge_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xb3,0x7c]
 
-v_cmp_lt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x83,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x41,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x59,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x41,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x59,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x41,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x59,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x41,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x59,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x41,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x59,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x41,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x59,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x41,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x59,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x41,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x59,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_lt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x41,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_nge_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x59,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_lt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x41,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_nge_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x59,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_lt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x41,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_nge_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_lt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x41,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_nge_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_lt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x41,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_nge_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_lt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x41,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_nge_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_lt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x41,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_nge_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_lt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x41,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_nge_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_lt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x41,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_nge_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_lt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x41,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_nge_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_lt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x41,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_nge_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_lt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x41,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_nge_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_lt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x41,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_nge_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_lt_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x41,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_eq_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x84,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_eq_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x84,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_eq_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x84,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_eq_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x84,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_eq_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x84,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_eq_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x84,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_eq_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x84,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x59,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_eq_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x84,0x7c]
+v_cmpx_nge_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x59,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x84,0x7c]
+v_cmpx_nlg_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xb4,0x7c]
 
-v_cmp_eq_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x84,0x7c]
+v_cmpx_nlg_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xb4,0x7c]
 
-v_cmp_eq_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x84,0x7c]
+v_cmpx_nlg_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xb4,0x7c]
 
-v_cmp_eq_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x84,0x7c]
+v_cmpx_nlg_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xb4,0x7c]
 
-v_cmp_eq_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x84,0x7c]
+v_cmpx_nlg_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xb4,0x7c]
 
-v_cmp_eq_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x84,0x7c]
+v_cmpx_nlg_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xb4,0x7c]
 
-v_cmp_eq_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x84,0x7c]
+v_cmpx_nlg_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xb4,0x7c]
 
-v_cmp_eq_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x84,0x7c]
+v_cmpx_nlg_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xb4,0x7c]
 
-v_cmp_eq_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x84,0x7c]
+v_cmpx_nlg_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xb4,0x7c]
 
-v_cmp_eq_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x84,0x7c]
+v_cmpx_nlg_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xb4,0x7c]
 
-v_cmp_eq_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x84,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_nlg_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xb4,0x7c]
 
-v_cmp_eq_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x84,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_nlg_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xb4,0x7c]
 
-v_cmp_eq_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x84,0x7c]
+v_cmpx_nlg_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xb4,0x7c]
 
-v_cmp_eq_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x84,0x7c]
+v_cmpx_nlg_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xb4,0x7c]
 
-v_cmp_eq_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x85,0x7c]
+v_cmpx_nlg_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xb4,0x7c]
 
-v_cmp_eq_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xb4,0x7c]
 
-v_cmp_eq_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x42,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xb4,0x7c]
 
-v_cmp_eq_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x42,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xb4,0x7c]
 
-v_cmp_eq_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x42,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xb4,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_eq_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x42,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xb4,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_eq_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x42,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xb4,0x7c]
 
-v_cmp_eq_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x42,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xb4,0x7c]
 
-v_cmp_eq_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x42,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xb5,0x7c]
 
-v_cmp_eq_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x42,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_nlg_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x42,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_nlg_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x5a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x42,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_nlg_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x5a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_nlg_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x5a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x42,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_nlg_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x5a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_nlg_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x5a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_nlg_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x5a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_nlg_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x5a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_nlg_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x5a,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_eq_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_nlg_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_eq_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_nlg_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x5a,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_eq_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_nlg_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_eq_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x42,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_nlg_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_eq_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x42,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_le_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x86,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_le_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x86,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_le_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x86,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_le_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x86,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_le_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x86,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_le_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x86,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_le_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x86,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_le_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x86,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_le_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x86,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_le_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x86,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_le_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x86,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_le_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x86,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_le_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x86,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_le_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x86,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_le_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x86,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_le_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x86,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x5a,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_le_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x86,0x7c]
+v_cmpx_nlg_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x5a,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x86,0x7c]
+v_cmpx_ngt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xb6,0x7c]
 
-v_cmp_le_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x86,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_ngt_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xb6,0x7c]
 
-v_cmp_le_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x86,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_ngt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xb6,0x7c]
 
-v_cmp_le_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x86,0x7c]
+v_cmpx_ngt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xb6,0x7c]
 
-v_cmp_le_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x86,0x7c]
+v_cmpx_ngt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xb6,0x7c]
 
-v_cmp_le_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x87,0x7c]
+v_cmpx_ngt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xb6,0x7c]
 
-v_cmp_le_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x43,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xb6,0x7c]
 
-v_cmp_le_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x43,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xb6,0x7c]
 
-v_cmp_le_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x43,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xb6,0x7c]
 
-v_cmp_le_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x43,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xb6,0x7c]
 
-v_cmp_le_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x43,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xb6,0x7c]
 
-v_cmp_le_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x43,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xb6,0x7c]
 
-v_cmp_le_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x43,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xb6,0x7c]
 
-v_cmp_le_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x43,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xb6,0x7c]
 
-v_cmp_le_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x43,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ngt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xb6,0x7c]
 
-v_cmp_le_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x43,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_ngt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xb6,0x7c]
 
-v_cmp_le_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x43,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_ngt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xb6,0x7c]
 
-v_cmp_le_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x43,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ngt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xb6,0x7c]
 
-v_cmp_le_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x43,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_ngt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xb6,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_le_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x43,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ngt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xb6,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_le_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x43,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_ngt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xb6,0x7c]
 
-v_cmp_le_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x43,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_ngt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xb6,0x7c]
 
-v_cmp_le_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x43,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ngt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xb7,0x7c]
 
-v_cmp_le_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x43,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_ngt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x43,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_ngt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x5b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x43,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_ngt_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x5b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x43,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_ngt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x5b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x43,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x5b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x88,0x7c]
+v_cmpx_ngt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x5b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x88,0x7c]
+v_cmpx_ngt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x5b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x88,0x7c]
+v_cmpx_ngt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x5b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x88,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x5b,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_gt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x88,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_gt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x88,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x5b,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_gt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x88,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_gt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x88,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_gt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x88,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_gt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x88,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_gt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x88,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_gt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x88,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_gt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x88,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_gt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x88,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_gt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x88,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_gt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x88,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_gt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x88,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_gt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x88,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_gt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x88,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_ngt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_gt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x88,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_ngt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_gt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x88,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_gt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x88,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_gt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x89,0x7c]
+v_cmpx_ngt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_gt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_gt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x44,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x5b,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_gt_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x44,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x5b,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x44,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xb8,0x7c]
 
-v_cmp_gt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x44,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xb8,0x7c]
 
-v_cmp_gt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x44,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xb8,0x7c]
 
-v_cmp_gt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x44,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xb8,0x7c]
 
-v_cmp_gt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x44,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xb8,0x7c]
 
-v_cmp_gt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x44,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_nle_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xb8,0x7c]
 
-v_cmp_gt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x44,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_nle_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xb8,0x7c]
 
-v_cmp_gt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x44,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_nle_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xb8,0x7c]
 
-v_cmp_gt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_nle_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xb8,0x7c]
 
-v_cmp_gt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x44,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_nle_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xb8,0x7c]
 
-v_cmp_gt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_nle_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xb8,0x7c]
 
-v_cmp_gt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_nle_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xb8,0x7c]
 
-v_cmp_gt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_nle_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xb8,0x7c]
 
-v_cmp_gt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_nle_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xb8,0x7c]
 
-v_cmp_gt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_nle_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xb8,0x7c]
 
-v_cmp_gt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_nle_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xb8,0x7c]
 
-v_cmp_gt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_nle_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xb8,0x7c]
 
-v_cmp_gt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x44,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_nle_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xb8,0x7c]
 
-v_cmp_gt_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x44,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xb8,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_lg_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x8a,0x7c]
+v_cmpx_nle_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xb8,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_lg_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x8a,0x7c]
+v_cmpx_nle_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xb8,0x7c]
 
-v_cmp_lg_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x8a,0x7c]
+v_cmpx_nle_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xb8,0x7c]
 
-v_cmp_lg_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x8a,0x7c]
+v_cmpx_nle_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xb9,0x7c]
 
-v_cmp_lg_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x8a,0x7c]
+v_cmpx_nle_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x8a,0x7c]
+v_cmpx_nle_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x5c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x8a,0x7c]
+v_cmpx_nle_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x5c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x8a,0x7c]
+v_cmpx_nle_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x5c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x8a,0x7c]
+v_cmpx_nle_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x5c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x8a,0x7c]
+v_cmpx_nle_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x5c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x8a,0x7c]
+v_cmpx_nle_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x5c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x8a,0x7c]
+v_cmpx_nle_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x5c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x8a,0x7c]
+v_cmpx_nle_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x5c,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_lg_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x8a,0x7c]
+v_cmpx_nle_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_lg_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x8a,0x7c]
+v_cmpx_nle_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x5c,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_lg_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x8a,0x7c]
+v_cmpx_nle_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_lg_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x8a,0x7c]
+v_cmpx_nle_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_lg_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x8a,0x7c]
+v_cmpx_nle_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_lg_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x8a,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_nle_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_lg_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x8a,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_nle_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_lg_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x8a,0x7c]
+v_cmpx_nle_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_lg_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x8a,0x7c]
+v_cmpx_nle_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_lg_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x8b,0x7c]
+v_cmpx_nle_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_lg_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x45,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_lg_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x45,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_lg_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x45,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_lg_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x45,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_lg_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x45,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_lg_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x45,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_lg_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x45,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_lg_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x45,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_lg_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x45,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_lg_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x45,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_lg_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x45,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x5c,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_lg_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x45,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_nle_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x5c,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x45,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_neq_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xba,0x7c]
 
-v_cmp_lg_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x45,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_neq_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xba,0x7c]
 
-v_cmp_lg_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x45,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_neq_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xba,0x7c]
 
-v_cmp_lg_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x45,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_neq_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xba,0x7c]
 
-v_cmp_lg_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x45,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_neq_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xba,0x7c]
 
-v_cmp_lg_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x45,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_neq_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xba,0x7c]
 
-v_cmp_lg_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x45,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_neq_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xba,0x7c]
 
-v_cmp_lg_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x45,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_neq_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xba,0x7c]
 
-v_cmp_lg_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x45,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_neq_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xba,0x7c]
 
-v_cmp_lg_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x45,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xba,0x7c]
 
-v_cmp_ge_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x8c,0x7c]
+v_cmpx_neq_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xba,0x7c]
 
-v_cmp_ge_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x8c,0x7c]
+v_cmpx_neq_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xba,0x7c]
 
-v_cmp_ge_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x8c,0x7c]
+v_cmpx_neq_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xba,0x7c]
 
-v_cmp_ge_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x8c,0x7c]
+v_cmpx_neq_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xba,0x7c]
 
-v_cmp_ge_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x8c,0x7c]
+v_cmpx_neq_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xba,0x7c]
 
-v_cmp_ge_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x8c,0x7c]
+v_cmpx_neq_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xba,0x7c]
 
-v_cmp_ge_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x8c,0x7c]
+v_cmpx_neq_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xba,0x7c]
 
-v_cmp_ge_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x8c,0x7c]
+v_cmpx_neq_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xba,0x7c]
 
-v_cmp_ge_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x8c,0x7c]
+v_cmpx_neq_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xba,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_ge_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x8c,0x7c]
+v_cmpx_neq_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xba,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_ge_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x8c,0x7c]
+v_cmpx_neq_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xba,0x7c]
 
-v_cmp_ge_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x8c,0x7c]
+v_cmpx_neq_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xba,0x7c]
 
-v_cmp_ge_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x8c,0x7c]
+v_cmpx_neq_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xbb,0x7c]
 
-v_cmp_ge_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x8c,0x7c]
+v_cmpx_neq_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x8c,0x7c]
+v_cmpx_neq_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x5d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x8c,0x7c]
+v_cmpx_neq_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x5d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x8c,0x7c]
+v_cmpx_neq_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x5d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x8c,0x7c]
+v_cmpx_neq_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x5d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x8c,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_neq_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x5d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x8c,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_neq_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x5d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x8c,0x7c]
+v_cmpx_neq_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x5d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x8c,0x7c]
+v_cmpx_neq_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x5d,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_ge_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x8d,0x7c]
+v_cmpx_neq_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_ge_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x5d,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_ge_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x46,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_ge_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x46,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_ge_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x46,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_ge_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x46,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_ge_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x46,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_ge_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x46,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_ge_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x46,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_ge_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x46,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_ge_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x46,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_ge_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x46,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_ge_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_ge_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x46,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_ge_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_ge_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_ge_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_ge_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_ge_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_neq_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_ge_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_neq_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_ge_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_neq_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x5d,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_ge_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x46,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_neq_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x5d,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x46,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xbc,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_o_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x8e,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_nlt_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xbc,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_o_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x8e,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_nlt_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xbc,0x7c]
 
-v_cmp_o_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x8e,0x7c]
+v_cmpx_nlt_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xbd,0x7c]
 
-v_cmp_o_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x8f,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x47,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x5e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x47,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x5e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x47,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x5e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x47,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x5e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x47,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x5e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x47,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x5e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x47,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x5e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x47,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x5e,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_o_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x47,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_nlt_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_o_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x47,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_nlt_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x5e,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_o_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x47,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_nlt_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_o_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x47,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_nlt_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_o_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x47,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_nlt_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_o_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x47,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_nlt_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_o_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x47,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_nlt_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_o_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x47,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_nlt_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_o_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x47,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_nlt_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_o_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x47,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_nlt_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_o_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x47,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_nlt_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_o_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x47,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_nlt_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_o_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x47,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_nlt_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_o_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x47,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_u_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x90,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_u_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x90,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_u_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x90,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_u_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x90,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_u_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x90,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_u_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x90,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_u_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x90,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x5e,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_u_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x90,0x7c]
+v_cmpx_nlt_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x5e,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x90,0x7c]
+v_cmpx_tru_f32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xbe,0x7c]
 
-v_cmp_u_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x90,0x7c]
+v_cmpx_tru_f32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xbe,0x7c]
 
-v_cmp_u_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x90,0x7c]
+v_cmpx_tru_f32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xbe,0x7c]
 
-v_cmp_u_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x90,0x7c]
+v_cmpx_tru_f32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xbe,0x7c]
 
-v_cmp_u_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x90,0x7c]
+v_cmpx_tru_f32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xbe,0x7c]
 
-v_cmp_u_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x90,0x7c]
+v_cmpx_tru_f32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xbe,0x7c]
 
-v_cmp_u_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x90,0x7c]
+v_cmpx_tru_f32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xbe,0x7c]
 
-v_cmp_u_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x90,0x7c]
+v_cmpx_tru_f32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xbe,0x7c]
 
-v_cmp_u_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x90,0x7c]
+v_cmpx_tru_f32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xbe,0x7c]
 
-v_cmp_u_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x90,0x7c]
+v_cmpx_tru_f32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xbe,0x7c]
 
-v_cmp_u_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x90,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_tru_f32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xbe,0x7c]
 
-v_cmp_u_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x90,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_tru_f32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xbe,0x7c]
 
-v_cmp_u_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x90,0x7c]
+v_cmpx_tru_f32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xbe,0x7c]
 
-v_cmp_u_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x90,0x7c]
+v_cmpx_tru_f32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xbe,0x7c]
 
-v_cmp_u_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x91,0x7c]
+v_cmpx_tru_f32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xbe,0x7c]
 
-v_cmp_u_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xbe,0x7c]
 
-v_cmp_u_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x48,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xbe,0x7c]
 
-v_cmp_u_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x48,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xbe,0x7c]
 
-v_cmp_u_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x48,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xbe,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_u_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x48,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xbe,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_u_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x48,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xbe,0x7c]
 
-v_cmp_u_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x48,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xbe,0x7c]
 
-v_cmp_u_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x48,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xbf,0x7c]
 
-v_cmp_u_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x48,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_tru_f32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x48,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_tru_f32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0x5f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x48,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_tru_f32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0x5f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_tru_f32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0x5f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x48,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_tru_f32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0x5f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_tru_f32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0x5f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_tru_f32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0x5f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_tru_f32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0x5f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_tru_f32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0x5f,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_u_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_tru_f32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_u_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_tru_f32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0x5f,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_u_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_tru_f32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_u_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x48,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_tru_f32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_u_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x48,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_nge_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x92,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_nge_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x92,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_nge_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x92,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_nge_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x92,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_nge_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x92,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_nge_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x92,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_nge_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x92,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_nge_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x92,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_nge_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x92,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_nge_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x92,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_nge_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x92,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_nge_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x92,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_nge_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x92,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, scc
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0xfa,0x01,0x00]
 
-v_cmp_nge_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x92,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_nge_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x92,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_nge_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x92,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, -s2
+// CHECK: [0x0a,0x00,0x5f,0xd0,0x80,0x04,0x00,0x40]
 
-v_cmp_nge_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x92,0x7c]
+v_cmpx_tru_f32_e64 s[10:11], 0, s2 clamp
+// CHECK: [0x0a,0x80,0x5f,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x92,0x7c]
+v_cmp_f_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc0,0x7c]
 
-v_cmp_nge_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x92,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_f_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc0,0x7c]
 
-v_cmp_nge_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x92,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_f_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xc0,0x7c]
 
-v_cmp_nge_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x92,0x7c]
+v_cmp_f_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xc0,0x7c]
 
-v_cmp_nge_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x92,0x7c]
+v_cmp_f_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc0,0x7c]
 
-v_cmp_nge_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x93,0x7c]
+v_cmp_f_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc0,0x7c]
 
-v_cmp_nge_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x49,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc0,0x7c]
 
-v_cmp_nge_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x49,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc0,0x7c]
 
-v_cmp_nge_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x49,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc0,0x7c]
 
-v_cmp_nge_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x49,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc0,0x7c]
 
-v_cmp_nge_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x49,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc0,0x7c]
 
-v_cmp_nge_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x49,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc0,0x7c]
 
-v_cmp_nge_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x49,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc0,0x7c]
 
-v_cmp_nge_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x49,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc0,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_nge_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x49,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_f_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc0,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_nge_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x49,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_f_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc0,0x7c]
 
-v_cmp_nge_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x49,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_f_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc0,0x7c]
 
-v_cmp_nge_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x49,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_f_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc1,0x7c]
 
-v_cmp_nge_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x49,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_f_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x49,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_f_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x60,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x49,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_f_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x60,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x49,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_f_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x60,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x49,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_f_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x60,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x49,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_f_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x60,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x49,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_f_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x60,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x49,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_f_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x60,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x49,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_f_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_nge_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x49,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_nlg_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x94,0x7c]
+v_cmp_f_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_nlg_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x94,0x7c]
+v_cmp_f_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_nlg_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x94,0x7c]
+v_cmp_f_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x60,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_nlg_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x94,0x7c]
+v_cmp_f_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x60,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_nlg_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x94,0x7c]
+v_cmp_f_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x60,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_nlg_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x94,0x7c]
+v_cmp_f_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x60,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_nlg_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x94,0x7c]
+v_cmp_f_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_nlg_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x94,0x7c]
+v_cmp_f_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_nlg_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x94,0x7c]
+v_cmp_f_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x60,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_nlg_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x94,0x7c]
+v_cmp_f_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x60,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlg_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x94,0x7c]
+v_cmp_lt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc2,0x7c]
 
-v_cmp_nlg_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x94,0x7c]
+v_cmp_lt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc2,0x7c]
 
-v_cmp_nlg_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x94,0x7c]
+v_cmp_lt_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xc2,0x7c]
 
-v_cmp_nlg_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x94,0x7c]
+v_cmp_lt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xc2,0x7c]
 
-v_cmp_nlg_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x94,0x7c]
+v_cmp_lt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc2,0x7c]
 
-v_cmp_nlg_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x94,0x7c]
+v_cmp_lt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc2,0x7c]
 
-v_cmp_nlg_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x94,0x7c]
+v_cmp_lt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc2,0x7c]
 
-v_cmp_nlg_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x94,0x7c]
+v_cmp_lt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc2,0x7c]
 
-v_cmp_nlg_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x94,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_lt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc2,0x7c]
 
-v_cmp_nlg_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x94,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_lt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc2,0x7c]
 
-v_cmp_nlg_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x94,0x7c]
+v_cmp_lt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc2,0x7c]
 
-v_cmp_nlg_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x94,0x7c]
+v_cmp_lt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc2,0x7c]
 
-v_cmp_nlg_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x95,0x7c]
+v_cmp_lt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc2,0x7c]
 
-v_cmp_nlg_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc2,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_nlg_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x4a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc2,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_nlg_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x4a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc2,0x7c]
 
-v_cmp_nlg_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x4a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc2,0x7c]
 
-v_cmp_nlg_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x4a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc3,0x7c]
 
-v_cmp_nlg_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x4a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x61,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlg_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x4a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x61,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlg_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x4a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x61,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x4a,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_lt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x61,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x4a,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_lt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x61,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x4a,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_lt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x61,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_lt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x61,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x4a,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_lt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x61,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_lt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x61,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_lt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x61,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_lt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x61,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_lt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x61,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_lt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x61,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_lt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x61,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_lt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x61,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x4a,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_lt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x61,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_nlg_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x4a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x61,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_ngt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x96,0x7c]
+v_cmp_lt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x61,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_ngt_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x96,0x7c]
+v_cmp_lt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x61,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_ngt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x96,0x7c]
+v_cmp_lt_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x61,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ngt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x96,0x7c]
+v_cmp_eq_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc4,0x7c]
 
-v_cmp_ngt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x96,0x7c]
+v_cmp_eq_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc4,0x7c]
 
-v_cmp_ngt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x96,0x7c]
+v_cmp_eq_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xc4,0x7c]
 
-v_cmp_ngt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x96,0x7c]
+v_cmp_eq_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xc4,0x7c]
 
-v_cmp_ngt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x96,0x7c]
+v_cmp_eq_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc4,0x7c]
 
-v_cmp_ngt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x96,0x7c]
+v_cmp_eq_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc4,0x7c]
 
-v_cmp_ngt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x96,0x7c]
+v_cmp_eq_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc4,0x7c]
 
-v_cmp_ngt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x96,0x7c]
+v_cmp_eq_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc4,0x7c]
 
-v_cmp_ngt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x96,0x7c]
+v_cmp_eq_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc4,0x7c]
 
-v_cmp_ngt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x96,0x7c]
+v_cmp_eq_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc4,0x7c]
 
-v_cmp_ngt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x96,0x7c]
+v_cmp_eq_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc4,0x7c]
 
-v_cmp_ngt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x96,0x7c]
+v_cmp_eq_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc4,0x7c]
 
-v_cmp_ngt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x96,0x7c]
+v_cmp_eq_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc4,0x7c]
 
-v_cmp_ngt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x96,0x7c]
+v_cmp_eq_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc4,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_ngt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x96,0x7c]
+v_cmp_eq_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc4,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_ngt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x96,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_eq_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc4,0x7c]
 
-v_cmp_ngt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x96,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_eq_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc4,0x7c]
 
-v_cmp_ngt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x96,0x7c]
+v_cmp_eq_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc5,0x7c]
 
-v_cmp_ngt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x96,0x7c]
+v_cmp_eq_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ngt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x97,0x7c]
+v_cmp_eq_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x62,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x4b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x62,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ngt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x4b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x62,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ngt_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x4b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x62,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ngt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x4b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x62,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ngt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x4b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x62,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ngt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x4b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x62,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ngt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x4b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_ngt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x4b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x4b,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_eq_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x4b,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_eq_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x4b,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_eq_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x62,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x4b,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_eq_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x62,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x4b,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_eq_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x62,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x4b,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_eq_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x62,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x4b,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_eq_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_ngt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x4b,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_eq_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_ngt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x4b,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_eq_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x62,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_ngt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x4b,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_eq_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x62,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ngt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x4b,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_le_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc6,0x7c]
 
-v_cmp_ngt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x4b,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_le_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc6,0x7c]
 
-v_cmp_ngt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x4b,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_le_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xc6,0x7c]
 
-v_cmp_ngt_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x4b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xc6,0x7c]
 
-v_cmp_nle_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x98,0x7c]
+v_cmp_le_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc6,0x7c]
 
-v_cmp_nle_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x98,0x7c]
+v_cmp_le_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc6,0x7c]
 
-v_cmp_nle_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x98,0x7c]
+v_cmp_le_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc6,0x7c]
 
-v_cmp_nle_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x98,0x7c]
+v_cmp_le_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc6,0x7c]
 
-v_cmp_nle_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x98,0x7c]
+v_cmp_le_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc6,0x7c]
 
-v_cmp_nle_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x98,0x7c]
+v_cmp_le_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc6,0x7c]
 
-v_cmp_nle_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x98,0x7c]
+v_cmp_le_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc6,0x7c]
 
-v_cmp_nle_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x98,0x7c]
+v_cmp_le_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc6,0x7c]
 
-v_cmp_nle_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x98,0x7c]
+v_cmp_le_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc6,0x7c]
 
-v_cmp_nle_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x98,0x7c]
+v_cmp_le_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc6,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_nle_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x98,0x7c]
+v_cmp_le_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc6,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_nle_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x98,0x7c]
+v_cmp_le_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc6,0x7c]
 
-v_cmp_nle_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x98,0x7c]
+v_cmp_le_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc6,0x7c]
 
-v_cmp_nle_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x98,0x7c]
+v_cmp_le_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc7,0x7c]
 
-v_cmp_nle_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x98,0x7c]
+v_cmp_le_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x63,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nle_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x98,0x7c]
+v_cmp_le_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x63,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nle_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x98,0x7c]
+v_cmp_le_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x63,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nle_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x98,0x7c]
+v_cmp_le_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x63,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nle_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x98,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_le_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x63,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nle_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x98,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_le_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x63,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nle_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x98,0x7c]
+v_cmp_le_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x63,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nle_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x98,0x7c]
+v_cmp_le_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x63,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nle_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x99,0x7c]
+v_cmp_le_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x63,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_nle_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x63,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_nle_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x4c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x63,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_nle_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x4c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x63,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_nle_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x4c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x63,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_nle_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x4c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x63,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_nle_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x4c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x63,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_nle_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x4c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x63,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_nle_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x4c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x63,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_nle_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x4c,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_le_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x63,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_nle_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x4c,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_le_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x63,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_nle_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x4c,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_le_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x63,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nle_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_gt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc8,0x7c]
 
-v_cmp_nle_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x4c,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_gt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc8,0x7c]
 
-v_cmp_nle_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_gt_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xc8,0x7c]
 
-v_cmp_nle_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_gt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xc8,0x7c]
 
-v_cmp_nle_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_gt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc8,0x7c]
 
-v_cmp_nle_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_gt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc8,0x7c]
 
-v_cmp_nle_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_gt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc8,0x7c]
 
-v_cmp_nle_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_gt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc8,0x7c]
 
-v_cmp_nle_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_gt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc8,0x7c]
 
-v_cmp_nle_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x4c,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_gt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc8,0x7c]
 
-v_cmp_nle_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x4c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc8,0x7c]
 
-v_cmp_neq_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x9a,0x7c]
+v_cmp_gt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc8,0x7c]
 
-v_cmp_neq_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x9a,0x7c]
+v_cmp_gt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc8,0x7c]
 
-v_cmp_neq_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x9a,0x7c]
+v_cmp_gt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc8,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_neq_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x9a,0x7c]
+v_cmp_gt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc8,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_neq_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x9a,0x7c]
+v_cmp_gt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc8,0x7c]
 
-v_cmp_neq_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x9a,0x7c]
+v_cmp_gt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc8,0x7c]
 
-v_cmp_neq_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x9a,0x7c]
+v_cmp_gt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc9,0x7c]
 
-v_cmp_neq_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x9a,0x7c]
+v_cmp_gt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_neq_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x9a,0x7c]
+v_cmp_gt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x64,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_neq_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x9a,0x7c]
+v_cmp_gt_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x64,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_neq_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x9a,0x7c]
+v_cmp_gt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x64,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_neq_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x9a,0x7c]
+v_cmp_gt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x64,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_neq_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x9a,0x7c]
+v_cmp_gt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x64,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_neq_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x9a,0x7c]
+v_cmp_gt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x64,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_neq_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x9a,0x7c]
+v_cmp_gt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x64,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_neq_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x9a,0x7c]
+v_cmp_gt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_neq_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x9a,0x7c]
+v_cmp_gt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_neq_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x9a,0x7c]
+v_cmp_gt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_neq_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x9a,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_gt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_neq_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x9a,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_gt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x64,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_neq_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x9a,0x7c]
+v_cmp_gt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x64,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_neq_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x9a,0x7c]
+v_cmp_gt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x64,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_neq_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x9b,0x7c]
+v_cmp_gt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x64,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_neq_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x4d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_neq_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x4d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_neq_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x4d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x64,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_neq_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x4d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x64,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_neq_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x4d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xca,0x7c]
 
-v_cmp_neq_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x4d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xca,0x7c]
 
-v_cmp_neq_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x4d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xca,0x7c]
 
-v_cmp_neq_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x4d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xca,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x4d,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_lg_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xca,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x4d,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_lg_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xca,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x4d,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_lg_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xca,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x4d,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_lg_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xca,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x4d,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_lg_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xca,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x4d,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_lg_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xca,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x4d,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_lg_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xca,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x4d,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_lg_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xca,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x4d,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_lg_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xca,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x4d,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_lg_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xca,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_neq_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x4d,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_lg_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xca,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_neq_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x4d,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_lg_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xca,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x4d,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_lg_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xca,0x7c]
 
-v_cmp_neq_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x4d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lg_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xcb,0x7c]
 
-v_cmp_nlt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x9c,0x7c]
+v_cmp_lg_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x65,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x9c,0x7c]
+v_cmp_lg_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x65,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x9c,0x7c]
+v_cmp_lg_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x65,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x9c,0x7c]
+v_cmp_lg_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x65,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x9c,0x7c]
+v_cmp_lg_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x65,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x9c,0x7c]
+v_cmp_lg_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x65,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x9c,0x7c]
+v_cmp_lg_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x65,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x9c,0x7c]
+v_cmp_lg_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x65,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x9c,0x7c]
+v_cmp_lg_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x65,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x9c,0x7c]
+v_cmp_lg_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x65,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x9c,0x7c]
+v_cmp_lg_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x65,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x9c,0x7c]
+v_cmp_lg_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x65,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x9c,0x7c]
+v_cmp_lg_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x65,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_nlt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x9c,0x7c]
+v_cmp_lg_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x65,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_nlt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x9c,0x7c]
+v_cmp_lg_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x65,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_nlt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x9c,0x7c]
+v_cmp_lg_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x65,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_nlt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x9c,0x7c]
+v_cmp_lg_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x65,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_nlt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x9c,0x7c]
+v_cmp_lg_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x65,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_nlt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x9c,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_lg_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x65,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_nlt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x9c,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_lg_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x65,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x9c,0x7c]
+v_cmp_ge_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xcc,0x7c]
 
-v_cmp_nlt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x9c,0x7c]
+v_cmp_ge_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xcc,0x7c]
 
-v_cmp_nlt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x9d,0x7c]
+v_cmp_ge_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xcc,0x7c]
 
-v_cmp_nlt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xcc,0x7c]
 
-v_cmp_nlt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x4e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xcc,0x7c]
 
-v_cmp_nlt_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x4e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xcc,0x7c]
 
-v_cmp_nlt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x4e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xcc,0x7c]
 
-v_cmp_nlt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x4e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xcc,0x7c]
 
-v_cmp_nlt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x4e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xcc,0x7c]
 
-v_cmp_nlt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x4e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xcc,0x7c]
 
-v_cmp_nlt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x4e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xcc,0x7c]
 
-v_cmp_nlt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x4e,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ge_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xcc,0x7c]
 
-v_cmp_nlt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x4e,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_ge_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xcc,0x7c]
 
-v_cmp_nlt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x4e,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_ge_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xcc,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_nlt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ge_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xcc,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_nlt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x4e,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_ge_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xcc,0x7c]
 
-v_cmp_nlt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ge_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xcc,0x7c]
 
-v_cmp_nlt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_ge_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xcd,0x7c]
 
-v_cmp_nlt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_ge_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ge_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x66,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_ge_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x66,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_ge_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x66,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_ge_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x66,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x4e,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_ge_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x66,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_nlt_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x4e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x66,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x9e,0x7c]
+v_cmp_ge_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x66,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x9e,0x7c]
+v_cmp_ge_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_tru_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x9e,0x7c]
+v_cmp_ge_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_tru_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x9e,0x7c]
+v_cmp_ge_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_tru_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x9e,0x7c]
+v_cmp_ge_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_tru_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x9e,0x7c]
+v_cmp_ge_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x66,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_tru_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x9e,0x7c]
+v_cmp_ge_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x66,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_tru_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x9e,0x7c]
+v_cmp_ge_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x66,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_tru_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x9e,0x7c]
+v_cmp_ge_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x66,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_tru_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x9e,0x7c]
+v_cmp_ge_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_tru_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x9e,0x7c]
+v_cmp_ge_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_tru_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x9e,0x7c]
+v_cmp_ge_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x66,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_tru_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x9e,0x7c]
+v_cmp_ge_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x66,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x9e,0x7c]
+v_cmp_o_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xce,0x7c]
 
-v_cmp_tru_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x9e,0x7c]
+v_cmp_o_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xce,0x7c]
 
-v_cmp_tru_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x9e,0x7c]
+v_cmp_o_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xce,0x7c]
 
-v_cmp_tru_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x9e,0x7c]
+v_cmp_o_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xce,0x7c]
 
-v_cmp_tru_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x9e,0x7c]
+v_cmp_o_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xce,0x7c]
 
-v_cmp_tru_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x9e,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_o_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xce,0x7c]
 
-v_cmp_tru_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x9e,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_o_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xce,0x7c]
 
-v_cmp_tru_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x9e,0x7c]
+v_cmp_o_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xce,0x7c]
 
-v_cmp_tru_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x9e,0x7c]
+v_cmp_o_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xce,0x7c]
 
-v_cmp_tru_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x9f,0x7c]
+v_cmp_o_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xce,0x7c]
 
-v_cmp_tru_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x4f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xce,0x7c]
 
-v_cmp_tru_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x4f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xce,0x7c]
 
-v_cmp_tru_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x4f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xce,0x7c]
 
-v_cmp_tru_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x4f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xce,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_tru_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x4f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xce,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_tru_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x4f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xce,0x7c]
 
-v_cmp_tru_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x4f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xce,0x7c]
 
-v_cmp_tru_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x4f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xcf,0x7c]
 
-v_cmp_tru_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x4f,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_o_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x67,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x4f,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_o_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x67,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x4f,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_o_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x67,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x4f,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_o_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x67,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x4f,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_o_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x67,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x4f,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_o_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x67,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x4f,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_o_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x67,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x4f,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_o_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x67,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x4f,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_o_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x67,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x4f,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_o_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x67,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x4f,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_o_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x67,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x4f,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_o_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x67,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x4f,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_o_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x67,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_tru_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x4f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_o_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x67,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_f_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa0,0x7c]
+v_cmp_o_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x67,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_f_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xa0,0x7c]
+v_cmp_o_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x67,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_f_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xa0,0x7c]
+v_cmp_o_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x67,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_f_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xa0,0x7c]
+v_cmp_o_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x67,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_f_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa0,0x7c]
+v_cmp_o_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x67,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_f_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa0,0x7c]
+v_cmp_o_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x67,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa0,0x7c]
+v_cmp_u_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xd0,0x7c]
 
-v_cmpx_f_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa0,0x7c]
+v_cmp_u_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xd0,0x7c]
 
-v_cmpx_f_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa0,0x7c]
+v_cmp_u_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xd0,0x7c]
 
-v_cmpx_f_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa0,0x7c]
+v_cmp_u_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xd0,0x7c]
 
-v_cmpx_f_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa0,0x7c]
+v_cmp_u_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xd0,0x7c]
 
-v_cmpx_f_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa0,0x7c]
+v_cmp_u_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xd0,0x7c]
 
-v_cmpx_f_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa0,0x7c]
+v_cmp_u_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xd0,0x7c]
 
-v_cmpx_f_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa0,0x7c]
+v_cmp_u_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xd0,0x7c]
 
-v_cmpx_f_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa0,0x7c]
+v_cmp_u_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xd0,0x7c]
 
-v_cmpx_f_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa0,0x7c]
+v_cmp_u_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xd0,0x7c]
 
-v_cmpx_f_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa0,0x7c]
+v_cmp_u_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xd0,0x7c]
 
-v_cmpx_f_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa0,0x7c]
+v_cmp_u_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xd0,0x7c]
 
-v_cmpx_f_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa0,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_u_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xd0,0x7c]
 
-v_cmpx_f_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa0,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_u_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xd0,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_f_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa0,0x7c]
+v_cmp_u_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xd0,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_f_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa0,0x7c]
+v_cmp_u_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xd0,0x7c]
 
-v_cmpx_f_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa1,0x7c]
+v_cmp_u_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xd0,0x7c]
 
-v_cmpx_f_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xd1,0x7c]
 
-v_cmpx_f_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x50,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x50,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x68,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x50,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x68,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x50,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x68,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x50,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x68,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x50,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x68,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x50,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_u_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x68,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x50,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_u_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x68,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x50,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_u_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x50,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_u_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_u_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x50,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_u_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_u_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x68,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_u_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x68,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_u_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x68,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_u_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x68,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_u_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_f_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_u_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_f_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_u_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x68,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_f_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x50,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_u_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x68,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x50,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xd2,0x7c]
 
-v_cmpx_lt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa2,0x7c]
+v_cmp_nge_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xd2,0x7c]
 
-v_cmpx_lt_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xa2,0x7c]
+v_cmp_nge_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xd2,0x7c]
 
-v_cmpx_lt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xa2,0x7c]
+v_cmp_nge_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xd2,0x7c]
 
-v_cmpx_lt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xa2,0x7c]
+v_cmp_nge_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xd2,0x7c]
 
-v_cmpx_lt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa2,0x7c]
+v_cmp_nge_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xd2,0x7c]
 
-v_cmpx_lt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa2,0x7c]
+v_cmp_nge_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xd2,0x7c]
 
-v_cmpx_lt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa2,0x7c]
+v_cmp_nge_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xd2,0x7c]
 
-v_cmpx_lt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa2,0x7c]
+v_cmp_nge_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xd2,0x7c]
 
-v_cmpx_lt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa2,0x7c]
+v_cmp_nge_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xd2,0x7c]
 
-v_cmpx_lt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa2,0x7c]
+v_cmp_nge_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xd2,0x7c]
 
-v_cmpx_lt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa2,0x7c]
+v_cmp_nge_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xd2,0x7c]
 
-v_cmpx_lt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa2,0x7c]
+v_cmp_nge_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xd2,0x7c]
 
-v_cmpx_lt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa2,0x7c]
+v_cmp_nge_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xd2,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_lt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa2,0x7c]
+v_cmp_nge_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xd2,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_lt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa2,0x7c]
+v_cmp_nge_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xd2,0x7c]
 
-v_cmpx_lt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa2,0x7c]
+v_cmp_nge_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xd2,0x7c]
 
-v_cmpx_lt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa2,0x7c]
+v_cmp_nge_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xd3,0x7c]
 
-v_cmpx_lt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa2,0x7c]
+v_cmp_nge_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x69,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa2,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_nge_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x69,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa2,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_nge_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x69,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa2,0x7c]
+v_cmp_nge_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x69,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa2,0x7c]
+v_cmp_nge_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x69,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa3,0x7c]
+v_cmp_nge_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x69,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x51,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x69,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x51,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x69,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x51,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x69,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_lt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x51,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x69,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_lt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x51,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x69,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_lt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x51,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x69,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_lt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x51,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x69,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_lt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x51,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nge_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x69,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_lt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x51,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_nge_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x69,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_lt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x51,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_nge_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x69,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_lt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x51,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_nge_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x69,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_lt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x51,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_nge_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x69,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_lt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x51,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_nge_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x69,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_lt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x51,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_nge_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x69,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x51,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_nlg_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xd4,0x7c]
 
-v_cmpx_lt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x51,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_nlg_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xd4,0x7c]
 
-v_cmpx_lt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x51,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_nlg_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xd4,0x7c]
 
-v_cmpx_lt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x51,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_nlg_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xd4,0x7c]
 
-v_cmpx_lt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x51,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_nlg_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xd4,0x7c]
 
-v_cmpx_lt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x51,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_nlg_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xd4,0x7c]
 
-v_cmpx_lt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x51,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_nlg_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xd4,0x7c]
 
-v_cmpx_lt_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x51,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xd4,0x7c]
 
-v_cmpx_eq_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa4,0x7c]
+v_cmp_nlg_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xd4,0x7c]
 
-v_cmpx_eq_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xa4,0x7c]
+v_cmp_nlg_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xd4,0x7c]
 
-v_cmpx_eq_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xa4,0x7c]
+v_cmp_nlg_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xd4,0x7c]
 
-v_cmpx_eq_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xa4,0x7c]
+v_cmp_nlg_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xd4,0x7c]
 
-v_cmpx_eq_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa4,0x7c]
+v_cmp_nlg_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xd4,0x7c]
 
-v_cmpx_eq_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa4,0x7c]
+v_cmp_nlg_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xd4,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_eq_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa4,0x7c]
+v_cmp_nlg_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xd4,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_eq_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa4,0x7c]
+v_cmp_nlg_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xd4,0x7c]
 
-v_cmpx_eq_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa4,0x7c]
+v_cmp_nlg_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xd4,0x7c]
 
-v_cmpx_eq_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa4,0x7c]
+v_cmp_nlg_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xd5,0x7c]
 
-v_cmpx_eq_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa4,0x7c]
+v_cmp_nlg_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa4,0x7c]
+v_cmp_nlg_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x6a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa4,0x7c]
+v_cmp_nlg_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x6a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa4,0x7c]
+v_cmp_nlg_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x6a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa4,0x7c]
+v_cmp_nlg_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x6a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa4,0x7c]
+v_cmp_nlg_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x6a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa4,0x7c]
+v_cmp_nlg_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x6a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa4,0x7c]
+v_cmp_nlg_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x6a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa4,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_nlg_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa4,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_nlg_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa4,0x7c]
+v_cmp_nlg_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa4,0x7c]
+v_cmp_nlg_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_eq_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa5,0x7c]
+v_cmp_nlg_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_eq_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_eq_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x52,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_eq_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x52,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_eq_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x52,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_eq_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x52,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_eq_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x52,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x6a,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_eq_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x52,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlg_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x6a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x52,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xd6,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x52,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ngt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xd6,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x52,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_ngt_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xd6,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x52,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_ngt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xd6,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ngt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xd6,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x52,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_ngt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xd6,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ngt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xd6,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_ngt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xd6,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_ngt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xd6,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ngt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xd6,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_ngt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xd6,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_ngt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xd6,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_ngt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xd6,0x7c]
 
-v_cmpx_eq_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x52,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_ngt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xd6,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_eq_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x52,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ngt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xd6,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_le_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa6,0x7c]
+v_cmp_ngt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xd6,0x7c]
 
-v_cmpx_le_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xa6,0x7c]
+v_cmp_ngt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xd6,0x7c]
 
-v_cmpx_le_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xa6,0x7c]
+v_cmp_ngt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xd7,0x7c]
 
-v_cmpx_le_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xa6,0x7c]
+v_cmp_ngt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6b,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa6,0x7c]
+v_cmp_ngt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x6b,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa6,0x7c]
+v_cmp_ngt_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x6b,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa6,0x7c]
+v_cmp_ngt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x6b,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa6,0x7c]
+v_cmp_ngt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x6b,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa6,0x7c]
+v_cmp_ngt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x6b,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa6,0x7c]
+v_cmp_ngt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x6b,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa6,0x7c]
+v_cmp_ngt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x6b,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa6,0x7c]
+v_cmp_ngt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x6b,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_le_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa6,0x7c]
+v_cmp_ngt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x6b,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_le_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa6,0x7c]
+v_cmp_ngt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x6b,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_le_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa6,0x7c]
+v_cmp_ngt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x6b,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_le_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa6,0x7c]
+v_cmp_ngt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x6b,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_le_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa6,0x7c]
+v_cmp_ngt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x6b,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_le_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa6,0x7c]
+v_cmp_ngt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x6b,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_le_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa6,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_ngt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x6b,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_le_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa6,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_ngt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6b,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_le_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa6,0x7c]
+v_cmp_ngt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x6b,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_le_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa6,0x7c]
+v_cmp_ngt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x6b,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_le_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa7,0x7c]
+v_cmp_ngt_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x6b,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x53,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xd8,0x7c]
 
-v_cmpx_le_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x53,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xd8,0x7c]
 
-v_cmpx_le_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x53,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xd8,0x7c]
 
-v_cmpx_le_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x53,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xd8,0x7c]
 
-v_cmpx_le_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x53,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xd8,0x7c]
 
-v_cmpx_le_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x53,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xd8,0x7c]
 
-v_cmpx_le_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x53,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xd8,0x7c]
 
-v_cmpx_le_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x53,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xd8,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x53,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_nle_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xd8,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x53,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_nle_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xd8,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x53,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_nle_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xd8,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x53,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_nle_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xd8,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x53,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_nle_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xd8,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x53,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_nle_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xd8,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_le_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x53,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_nle_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xd8,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_le_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x53,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_nle_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xd8,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x53,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_nle_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xd8,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x53,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_nle_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xd9,0x7c]
 
-v_cmpx_le_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x53,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_nle_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x53,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_nle_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x6c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x53,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_nle_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x6c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x53,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nle_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x6c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa8,0x7c]
+v_cmp_nle_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x6c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xa8,0x7c]
+v_cmp_nle_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x6c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xa8,0x7c]
+v_cmp_nle_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x6c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xa8,0x7c]
+v_cmp_nle_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x6c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa8,0x7c]
+v_cmp_nle_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa8,0x7c]
+v_cmp_nle_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa8,0x7c]
+v_cmp_nle_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa8,0x7c]
+v_cmp_nle_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa8,0x7c]
+v_cmp_nle_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_gt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa8,0x7c]
+v_cmp_nle_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_gt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa8,0x7c]
+v_cmp_nle_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_gt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa8,0x7c]
+v_cmp_nle_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_gt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa8,0x7c]
+v_cmp_nle_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_gt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa8,0x7c]
+v_cmp_nle_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_gt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa8,0x7c]
+v_cmp_nle_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x6c,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_gt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa8,0x7c]
+v_cmp_nle_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x6c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa8,0x7c]
+v_cmp_neq_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xda,0x7c]
 
-v_cmpx_gt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa8,0x7c]
+v_cmp_neq_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xda,0x7c]
 
-v_cmpx_gt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa8,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_neq_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xda,0x7c]
 
-v_cmpx_gt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa8,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_neq_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xda,0x7c]
 
-v_cmpx_gt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa8,0x7c]
+v_cmp_neq_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xda,0x7c]
 
-v_cmpx_gt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa8,0x7c]
+v_cmp_neq_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xda,0x7c]
 
-v_cmpx_gt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa9,0x7c]
+v_cmp_neq_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xda,0x7c]
 
-v_cmpx_gt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xda,0x7c]
 
-v_cmpx_gt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x54,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xda,0x7c]
 
-v_cmpx_gt_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x54,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xda,0x7c]
 
-v_cmpx_gt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x54,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xda,0x7c]
 
-v_cmpx_gt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x54,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xda,0x7c]
 
-v_cmpx_gt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x54,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xda,0x7c]
 
-v_cmpx_gt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x54,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xda,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_gt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x54,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xda,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_gt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x54,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xda,0x7c]
 
-v_cmpx_gt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x54,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xda,0x7c]
 
-v_cmpx_gt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x54,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_neq_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xdb,0x7c]
 
-v_cmpx_gt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_neq_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6d,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x54,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_neq_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x6d,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_neq_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x6d,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_neq_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x6d,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_neq_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x6d,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_neq_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x6d,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_neq_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x6d,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_neq_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x6d,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_neq_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x6d,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_gt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x54,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_neq_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x6d,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_gt_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x54,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_neq_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x6d,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_lg_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xaa,0x7c]
+v_cmp_neq_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x6d,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_lg_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xaa,0x7c]
+v_cmp_neq_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x6d,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_lg_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xaa,0x7c]
+v_cmp_neq_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x6d,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_lg_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xaa,0x7c]
+v_cmp_neq_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x6d,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_lg_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xaa,0x7c]
+v_cmp_neq_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x6d,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_lg_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xaa,0x7c]
+v_cmp_neq_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6d,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_lg_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xaa,0x7c]
+v_cmp_neq_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x6d,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_lg_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xaa,0x7c]
+v_cmp_neq_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x6d,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_lg_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xaa,0x7c]
+v_cmp_neq_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x6d,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xaa,0x7c]
+v_cmp_nlt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xdc,0x7c]
 
-v_cmpx_lg_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xaa,0x7c]
+v_cmp_nlt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xdc,0x7c]
 
-v_cmpx_lg_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xaa,0x7c]
+v_cmp_nlt_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xdc,0x7c]
 
-v_cmpx_lg_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xaa,0x7c]
+v_cmp_nlt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xdc,0x7c]
 
-v_cmpx_lg_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xaa,0x7c]
+v_cmp_nlt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xdc,0x7c]
 
-v_cmpx_lg_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xaa,0x7c]
+v_cmp_nlt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xdc,0x7c]
 
-v_cmpx_lg_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xaa,0x7c]
+v_cmp_nlt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xdc,0x7c]
 
-v_cmpx_lg_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xaa,0x7c]
+v_cmp_nlt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xdc,0x7c]
 
-v_cmpx_lg_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xaa,0x7c]
+v_cmp_nlt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xdc,0x7c]
 
-v_cmpx_lg_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xaa,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_nlt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xdc,0x7c]
 
-v_cmpx_lg_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xaa,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_nlt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xdc,0x7c]
 
-v_cmpx_lg_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xaa,0x7c]
+v_cmp_nlt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xdc,0x7c]
 
-v_cmpx_lg_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xaa,0x7c]
+v_cmp_nlt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xdc,0x7c]
 
-v_cmpx_lg_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xab,0x7c]
+v_cmp_nlt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xdc,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_lg_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x55,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xdc,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_lg_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x55,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xdc,0x7c]
 
-v_cmpx_lg_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x55,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xdc,0x7c]
 
-v_cmpx_lg_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x55,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xdd,0x7c]
 
-v_cmpx_lg_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x55,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x55,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x6e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x55,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x6e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x55,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x6e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x55,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_nlt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x6e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x55,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_nlt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x6e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x55,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_nlt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x6e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x55,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_nlt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x6e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x55,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_nlt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x55,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_nlt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x55,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_nlt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x55,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_nlt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x55,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_nlt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x55,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_nlt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x55,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_nlt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x55,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_nlt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_lg_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x55,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_nlt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_lg_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x55,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_nlt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_ge_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xac,0x7c]
+v_cmp_nlt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x6e,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_ge_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xac,0x7c]
+v_cmp_nlt_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x6e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xac,0x7c]
+v_cmp_tru_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xde,0x7c]
 
-v_cmpx_ge_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xac,0x7c]
+v_cmp_tru_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xde,0x7c]
 
-v_cmpx_ge_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xac,0x7c]
+v_cmp_tru_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xde,0x7c]
 
-v_cmpx_ge_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xac,0x7c]
+v_cmp_tru_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xde,0x7c]
 
-v_cmpx_ge_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xac,0x7c]
+v_cmp_tru_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xde,0x7c]
 
-v_cmpx_ge_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xac,0x7c]
+v_cmp_tru_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xde,0x7c]
 
-v_cmpx_ge_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xac,0x7c]
+v_cmp_tru_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xde,0x7c]
 
-v_cmpx_ge_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xac,0x7c]
+v_cmp_tru_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xde,0x7c]
 
-v_cmpx_ge_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xac,0x7c]
+v_cmp_tru_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xde,0x7c]
 
-v_cmpx_ge_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xac,0x7c]
+v_cmp_tru_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xde,0x7c]
 
-v_cmpx_ge_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xac,0x7c]
+v_cmp_tru_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xde,0x7c]
 
-v_cmpx_ge_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xac,0x7c]
+v_cmp_tru_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xde,0x7c]
 
-v_cmpx_ge_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xac,0x7c]
+v_cmp_tru_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xde,0x7c]
 
-v_cmpx_ge_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xac,0x7c]
+v_cmp_tru_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xde,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ge_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xac,0x7c]
+v_cmp_tru_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xde,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ge_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xac,0x7c]
+v_cmp_tru_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xde,0x7c]
 
-v_cmpx_ge_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xac,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_tru_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xde,0x7c]
 
-v_cmpx_ge_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xac,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_tru_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xdf,0x7c]
 
-v_cmpx_ge_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xac,0x7c]
+v_cmp_tru_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6f,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xac,0x7c]
+v_cmp_tru_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x6f,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xad,0x7c]
+v_cmp_tru_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x6f,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x6f,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x56,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x6f,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x56,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x6f,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x56,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x6f,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x56,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x6f,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x56,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x6f,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_ge_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x56,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x6f,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_ge_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x56,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_tru_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x6f,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_ge_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x56,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_tru_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x6f,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_ge_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x56,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_tru_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x6f,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_ge_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x56,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_tru_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x6f,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_ge_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_tru_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x6f,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_ge_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x56,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_tru_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x6f,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_ge_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_tru_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x6f,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_ge_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_tru_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x6f,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_ge_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_tru_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x6f,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_ge_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_tru_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x6f,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_f_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe0,0x7c]
 
-v_cmpx_ge_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_f_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe0,0x7c]
 
-v_cmpx_ge_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_f_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xe0,0x7c]
 
-v_cmpx_ge_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x56,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_f_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xe0,0x7c]
 
-v_cmpx_ge_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x56,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe0,0x7c]
 
-v_cmpx_o_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xae,0x7c]
+v_cmpx_f_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe0,0x7c]
 
-v_cmpx_o_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xae,0x7c]
+v_cmpx_f_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe0,0x7c]
 
-v_cmpx_o_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xae,0x7c]
+v_cmpx_f_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe0,0x7c]
 
-v_cmpx_o_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xae,0x7c]
+v_cmpx_f_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe0,0x7c]
 
-v_cmpx_o_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xae,0x7c]
+v_cmpx_f_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe0,0x7c]
 
-v_cmpx_o_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xae,0x7c]
+v_cmpx_f_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe0,0x7c]
 
-v_cmpx_o_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xae,0x7c]
+v_cmpx_f_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe0,0x7c]
 
-v_cmpx_o_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xae,0x7c]
+v_cmpx_f_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe0,0x7c]
 
-v_cmpx_o_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xae,0x7c]
+v_cmpx_f_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe0,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_o_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xae,0x7c]
+v_cmpx_f_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe0,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_o_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xae,0x7c]
+v_cmpx_f_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe0,0x7c]
 
-v_cmpx_o_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xae,0x7c]
+v_cmpx_f_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe0,0x7c]
 
-v_cmpx_o_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xae,0x7c]
+v_cmpx_f_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe1,0x7c]
 
-v_cmpx_o_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xae,0x7c]
+v_cmpx_f_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x70,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_o_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xae,0x7c]
+v_cmpx_f_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x70,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_o_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xae,0x7c]
+v_cmpx_f_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x70,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_o_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xae,0x7c]
+v_cmpx_f_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x70,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_o_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xae,0x7c]
+v_cmpx_f_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x70,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_o_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xae,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_f_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x70,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_o_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xae,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_f_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x70,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_o_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xae,0x7c]
+v_cmpx_f_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x70,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_o_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xae,0x7c]
+v_cmpx_f_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x70,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_o_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xaf,0x7c]
+v_cmpx_f_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x70,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_o_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x57,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x70,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_o_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x57,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x70,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_o_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x57,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x70,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_o_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x57,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x70,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_o_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x57,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x70,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_o_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x57,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x70,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_o_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x57,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x70,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_o_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x57,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x70,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_o_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x57,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_f_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x70,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_o_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x57,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_f_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x70,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_o_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x57,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_lt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe2,0x7c]
 
-v_cmpx_o_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x57,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_lt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe2,0x7c]
 
-v_cmpx_o_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x57,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_lt_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xe2,0x7c]
 
-v_cmpx_o_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x57,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_lt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xe2,0x7c]
 
-v_cmpx_o_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x57,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_lt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe2,0x7c]
 
-v_cmpx_o_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x57,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_lt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe2,0x7c]
 
-v_cmpx_o_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x57,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_lt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe2,0x7c]
 
-v_cmpx_o_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x57,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_lt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe2,0x7c]
 
-v_cmpx_o_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x57,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_lt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe2,0x7c]
 
-v_cmpx_o_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x57,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_lt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe2,0x7c]
 
-v_cmpx_o_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x57,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_lt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe2,0x7c]
 
-v_cmpx_o_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x57,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe2,0x7c]
 
-v_cmpx_u_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xb0,0x7c]
+v_cmpx_lt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe2,0x7c]
 
-v_cmpx_u_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xb0,0x7c]
+v_cmpx_lt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe2,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_u_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xb0,0x7c]
+v_cmpx_lt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe2,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_u_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xb0,0x7c]
+v_cmpx_lt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe2,0x7c]
 
-v_cmpx_u_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xb0,0x7c]
+v_cmpx_lt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe2,0x7c]
 
-v_cmpx_u_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xb0,0x7c]
+v_cmpx_lt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe3,0x7c]
 
-v_cmpx_u_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xb0,0x7c]
+v_cmpx_lt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x71,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xb0,0x7c]
+v_cmpx_lt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x71,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xb0,0x7c]
+v_cmpx_lt_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x71,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xb0,0x7c]
+v_cmpx_lt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x71,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xb0,0x7c]
+v_cmpx_lt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x71,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xb0,0x7c]
+v_cmpx_lt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x71,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xb0,0x7c]
+v_cmpx_lt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x71,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xb0,0x7c]
+v_cmpx_lt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x71,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xb0,0x7c]
+v_cmpx_lt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x71,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xb0,0x7c]
+v_cmpx_lt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x71,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_u_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xb0,0x7c]
+v_cmpx_lt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x71,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_u_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xb0,0x7c]
+v_cmpx_lt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x71,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_u_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xb0,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_lt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x71,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_u_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xb0,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_lt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x71,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_u_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xb0,0x7c]
+v_cmpx_lt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x71,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_u_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xb0,0x7c]
+v_cmpx_lt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x71,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_u_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xb1,0x7c]
+v_cmpx_lt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x71,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_u_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x71,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_u_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x58,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x71,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_u_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x58,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x71,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_u_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x58,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe4,0x7c]
 
-v_cmpx_u_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x58,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe4,0x7c]
 
-v_cmpx_u_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x58,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xe4,0x7c]
 
-v_cmpx_u_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x58,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xe4,0x7c]
 
-v_cmpx_u_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x58,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe4,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x58,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_eq_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe4,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x58,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_eq_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe4,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x58,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_eq_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe4,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_eq_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe4,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x58,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_eq_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe4,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_eq_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe4,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_eq_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe4,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_eq_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe4,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_eq_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe4,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_u_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_eq_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe4,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_u_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_eq_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe4,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_eq_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe4,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x58,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_eq_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe5,0x7c]
 
-v_cmpx_u_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x58,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x72,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xb2,0x7c]
+v_cmpx_eq_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x72,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xb2,0x7c]
+v_cmpx_eq_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x72,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xb2,0x7c]
+v_cmpx_eq_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x72,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xb2,0x7c]
+v_cmpx_eq_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x72,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xb2,0x7c]
+v_cmpx_eq_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x72,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xb2,0x7c]
+v_cmpx_eq_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x72,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xb2,0x7c]
+v_cmpx_eq_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x72,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xb2,0x7c]
+v_cmpx_eq_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x72,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xb2,0x7c]
+v_cmpx_eq_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x72,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xb2,0x7c]
+v_cmpx_eq_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x72,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xb2,0x7c]
+v_cmpx_eq_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x72,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xb2,0x7c]
+v_cmpx_eq_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x72,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_nge_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xb2,0x7c]
+v_cmpx_eq_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x72,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_nge_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xb2,0x7c]
+v_cmpx_eq_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x72,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_nge_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xb2,0x7c]
+v_cmpx_eq_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x72,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_nge_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xb2,0x7c]
+v_cmpx_eq_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x72,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_nge_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xb2,0x7c]
+v_cmpx_eq_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x72,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_nge_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xb2,0x7c]
+v_cmpx_eq_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x72,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_nge_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xb2,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_eq_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x72,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xb2,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_le_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe6,0x7c]
 
-v_cmpx_nge_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xb2,0x7c]
+v_cmpx_le_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe6,0x7c]
 
-v_cmpx_nge_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xb2,0x7c]
+v_cmpx_le_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xe6,0x7c]
 
-v_cmpx_nge_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xb3,0x7c]
+v_cmpx_le_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xe6,0x7c]
 
-v_cmpx_nge_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x59,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe6,0x7c]
 
-v_cmpx_nge_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x59,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe6,0x7c]
 
-v_cmpx_nge_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x59,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe6,0x7c]
 
-v_cmpx_nge_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x59,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe6,0x7c]
 
-v_cmpx_nge_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x59,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe6,0x7c]
 
-v_cmpx_nge_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x59,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe6,0x7c]
 
-v_cmpx_nge_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x59,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe6,0x7c]
 
-v_cmpx_nge_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x59,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe6,0x7c]
 
-v_cmpx_nge_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x59,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_le_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe6,0x7c]
 
-v_cmpx_nge_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x59,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_le_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe6,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_nge_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x59,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_le_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe6,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_nge_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x59,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_le_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe6,0x7c]
 
-v_cmpx_nge_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x59,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_le_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe6,0x7c]
 
-v_cmpx_nge_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x59,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_le_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe7,0x7c]
 
-v_cmpx_nge_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x59,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_le_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x73,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x59,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_le_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x73,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x59,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_le_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x73,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x59,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_le_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x73,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x59,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_le_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x73,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x59,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_le_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x73,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x59,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_le_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x73,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nge_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x59,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x73,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xb4,0x7c]
+v_cmpx_le_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x73,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xb4,0x7c]
+v_cmpx_le_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x73,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xb4,0x7c]
+v_cmpx_le_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x73,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_nlg_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xb4,0x7c]
+v_cmpx_le_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x73,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_nlg_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xb4,0x7c]
+v_cmpx_le_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x73,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_nlg_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xb4,0x7c]
+v_cmpx_le_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x73,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_nlg_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xb4,0x7c]
+v_cmpx_le_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x73,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_nlg_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xb4,0x7c]
+v_cmpx_le_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x73,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_nlg_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xb4,0x7c]
+v_cmpx_le_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x73,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_nlg_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xb4,0x7c]
+v_cmpx_le_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x73,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_nlg_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xb4,0x7c]
+v_cmpx_le_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x73,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_nlg_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xb4,0x7c]
+v_cmpx_le_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x73,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xb4,0x7c]
+v_cmpx_gt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe8,0x7c]
 
-v_cmpx_nlg_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xb4,0x7c]
+v_cmpx_gt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe8,0x7c]
 
-v_cmpx_nlg_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xb4,0x7c]
+v_cmpx_gt_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xe8,0x7c]
 
-v_cmpx_nlg_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xb4,0x7c]
+v_cmpx_gt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xe8,0x7c]
 
-v_cmpx_nlg_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xb4,0x7c]
+v_cmpx_gt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe8,0x7c]
 
-v_cmpx_nlg_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xb4,0x7c]
+v_cmpx_gt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe8,0x7c]
 
-v_cmpx_nlg_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xb4,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_gt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe8,0x7c]
 
-v_cmpx_nlg_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xb4,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_gt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe8,0x7c]
 
-v_cmpx_nlg_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xb4,0x7c]
+v_cmpx_gt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe8,0x7c]
 
-v_cmpx_nlg_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xb4,0x7c]
+v_cmpx_gt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe8,0x7c]
 
-v_cmpx_nlg_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xb5,0x7c]
+v_cmpx_gt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe8,0x7c]
 
-v_cmpx_nlg_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe8,0x7c]
 
-v_cmpx_nlg_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x5a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe8,0x7c]
 
-v_cmpx_nlg_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x5a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe8,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_nlg_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x5a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe8,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_nlg_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x5a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe8,0x7c]
 
-v_cmpx_nlg_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x5a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe8,0x7c]
 
-v_cmpx_nlg_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x5a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe9,0x7c]
 
-v_cmpx_nlg_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x5a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x74,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x5a,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_gt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x74,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x5a,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_gt_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x74,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x5a,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_gt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x74,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_gt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x74,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x5a,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_gt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x74,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_gt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x74,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_gt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x74,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_gt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x74,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_gt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x74,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_gt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x74,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_gt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x74,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_gt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x74,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x5a,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_gt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x74,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_nlg_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x5a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x74,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_ngt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xb6,0x7c]
+v_cmpx_gt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x74,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_ngt_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xb6,0x7c]
+v_cmpx_gt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x74,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_ngt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xb6,0x7c]
+v_cmpx_gt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x74,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_ngt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xb6,0x7c]
+v_cmpx_gt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x74,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_ngt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xb6,0x7c]
+v_cmpx_gt_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x74,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xb6,0x7c]
+v_cmpx_lg_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xea,0x7c]
 
-v_cmpx_ngt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xb6,0x7c]
+v_cmpx_lg_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xea,0x7c]
 
-v_cmpx_ngt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xb6,0x7c]
+v_cmpx_lg_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xea,0x7c]
 
-v_cmpx_ngt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xb6,0x7c]
+v_cmpx_lg_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xea,0x7c]
 
-v_cmpx_ngt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xb6,0x7c]
+v_cmpx_lg_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xea,0x7c]
 
-v_cmpx_ngt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xb6,0x7c]
+v_cmpx_lg_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xea,0x7c]
 
-v_cmpx_ngt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xb6,0x7c]
+v_cmpx_lg_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xea,0x7c]
 
-v_cmpx_ngt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xb6,0x7c]
+v_cmpx_lg_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xea,0x7c]
 
-v_cmpx_ngt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xb6,0x7c]
+v_cmpx_lg_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xea,0x7c]
 
-v_cmpx_ngt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xb6,0x7c]
+v_cmpx_lg_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xea,0x7c]
 
-v_cmpx_ngt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xb6,0x7c]
+v_cmpx_lg_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xea,0x7c]
 
-v_cmpx_ngt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xb6,0x7c]
+v_cmpx_lg_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xea,0x7c]
 
-v_cmpx_ngt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xb6,0x7c]
+v_cmpx_lg_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xea,0x7c]
 
-v_cmpx_ngt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xb6,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_lg_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xea,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ngt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xb6,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_lg_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xea,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ngt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xb6,0x7c]
+v_cmpx_lg_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xea,0x7c]
 
-v_cmpx_ngt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xb6,0x7c]
+v_cmpx_lg_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xea,0x7c]
 
-v_cmpx_ngt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xb7,0x7c]
+v_cmpx_lg_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xeb,0x7c]
 
-v_cmpx_ngt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x5b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x75,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x5b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x75,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x5b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x75,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x5b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x75,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x5b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x75,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x5b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x75,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x5b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x75,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x5b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x75,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x5b,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x75,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x5b,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x75,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x5b,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_lg_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x75,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x5b,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_lg_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x75,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x5b,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_lg_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x75,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x5b,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_lg_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x75,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x5b,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_lg_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x75,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x5b,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_lg_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x75,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x5b,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_lg_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x75,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_ngt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x5b,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_lg_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x75,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_ngt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x5b,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_lg_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x75,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_ngt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x5b,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_lg_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x75,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ngt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x5b,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_ge_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xec,0x7c]
 
-v_cmpx_ngt_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x5b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xec,0x7c]
 
-v_cmpx_nle_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xb8,0x7c]
+v_cmpx_ge_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xec,0x7c]
 
-v_cmpx_nle_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xb8,0x7c]
+v_cmpx_ge_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xec,0x7c]
 
-v_cmpx_nle_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xb8,0x7c]
+v_cmpx_ge_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xec,0x7c]
 
-v_cmpx_nle_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xb8,0x7c]
+v_cmpx_ge_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xec,0x7c]
 
-v_cmpx_nle_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xb8,0x7c]
+v_cmpx_ge_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xec,0x7c]
 
-v_cmpx_nle_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xb8,0x7c]
+v_cmpx_ge_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xec,0x7c]
 
-v_cmpx_nle_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xb8,0x7c]
+v_cmpx_ge_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xec,0x7c]
 
-v_cmpx_nle_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xb8,0x7c]
+v_cmpx_ge_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xec,0x7c]
 
-v_cmpx_nle_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xb8,0x7c]
+v_cmpx_ge_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xec,0x7c]
 
-v_cmpx_nle_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xb8,0x7c]
+v_cmpx_ge_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xec,0x7c]
 
-v_cmpx_nle_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xb8,0x7c]
+v_cmpx_ge_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xec,0x7c]
 
-v_cmpx_nle_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xb8,0x7c]
+v_cmpx_ge_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xec,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_nle_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xb8,0x7c]
+v_cmpx_ge_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xec,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_nle_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xb8,0x7c]
+v_cmpx_ge_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xec,0x7c]
 
-v_cmpx_nle_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xb8,0x7c]
+v_cmpx_ge_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xec,0x7c]
 
-v_cmpx_nle_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xb8,0x7c]
+v_cmpx_ge_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xed,0x7c]
 
-v_cmpx_nle_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xb8,0x7c]
+v_cmpx_ge_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x76,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nle_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xb8,0x7c]
+v_cmpx_ge_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x76,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nle_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xb8,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_ge_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x76,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nle_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xb8,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_ge_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x76,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nle_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xb8,0x7c]
+v_cmpx_ge_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x76,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nle_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xb8,0x7c]
+v_cmpx_ge_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x76,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nle_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xb9,0x7c]
+v_cmpx_ge_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x76,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nle_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x76,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nle_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x5c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x76,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_nle_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x5c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x76,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_nle_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x5c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x76,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_nle_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x5c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x76,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_nle_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x5c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x76,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_nle_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x5c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x76,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_nle_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x5c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x76,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_nle_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x5c,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x76,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_nle_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x5c,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x76,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_nle_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x5c,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x76,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_nle_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x76,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_nle_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x5c,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_ge_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x76,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nle_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_o_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xee,0x7c]
 
-v_cmpx_nle_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_o_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xee,0x7c]
 
-v_cmpx_nle_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_o_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xee,0x7c]
 
-v_cmpx_nle_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_o_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xee,0x7c]
 
-v_cmpx_nle_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_o_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xee,0x7c]
 
-v_cmpx_nle_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_o_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xee,0x7c]
 
-v_cmpx_nle_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_o_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xee,0x7c]
 
-v_cmpx_nle_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x5c,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_o_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xee,0x7c]
 
-v_cmpx_nle_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x5c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xee,0x7c]
 
-v_cmpx_neq_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xba,0x7c]
+v_cmpx_o_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xee,0x7c]
 
-v_cmpx_neq_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xba,0x7c]
+v_cmpx_o_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xee,0x7c]
 
-v_cmpx_neq_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xba,0x7c]
+v_cmpx_o_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xee,0x7c]
 
-v_cmpx_neq_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xba,0x7c]
+v_cmpx_o_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xee,0x7c]
 
-v_cmpx_neq_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xba,0x7c]
+v_cmpx_o_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xee,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_neq_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xba,0x7c]
+v_cmpx_o_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xee,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_neq_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xba,0x7c]
+v_cmpx_o_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xee,0x7c]
 
-v_cmpx_neq_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xba,0x7c]
+v_cmpx_o_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xee,0x7c]
 
-v_cmpx_neq_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xba,0x7c]
+v_cmpx_o_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xef,0x7c]
 
-v_cmpx_neq_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xba,0x7c]
+v_cmpx_o_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x77,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xba,0x7c]
+v_cmpx_o_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x77,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xba,0x7c]
+v_cmpx_o_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x77,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xba,0x7c]
+v_cmpx_o_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x77,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xba,0x7c]
+v_cmpx_o_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x77,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xba,0x7c]
+v_cmpx_o_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x77,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xba,0x7c]
+v_cmpx_o_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x77,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xba,0x7c]
+v_cmpx_o_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x77,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xba,0x7c]
+v_cmpx_o_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x77,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xba,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_o_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x77,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xba,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_o_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x77,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xba,0x7c]
+v_cmpx_o_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x77,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_neq_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xba,0x7c]
+v_cmpx_o_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x77,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_neq_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xbb,0x7c]
+v_cmpx_o_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x77,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_neq_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x5d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x77,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_neq_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x5d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x77,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_neq_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x5d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x77,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_neq_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x5d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x77,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_neq_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x5d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x77,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_neq_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x5d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_o_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x77,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_neq_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x5d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xf0,0x7c]
 
-v_cmpx_neq_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x5d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xf0,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x5d,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_u_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xf0,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x5d,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_u_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xf0,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x5d,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_u_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xf0,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x5d,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_u_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xf0,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x5d,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_u_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xf0,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x5d,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_u_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xf0,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x5d,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_u_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xf0,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x5d,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_u_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xf0,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x5d,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_u_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xf0,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x5d,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_u_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xf0,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x5d,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_u_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xf0,0x7c]
 
-v_cmpx_neq_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x5d,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_u_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xf0,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_neq_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x5d,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_u_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xf0,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_neq_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x5d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_u_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xf0,0x7c]
 
-v_cmpx_nlt_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xbc,0x7c]
+v_cmpx_u_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xf0,0x7c]
 
-v_cmpx_nlt_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xbc,0x7c]
+v_cmpx_u_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xf1,0x7c]
 
-v_cmpx_nlt_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xbc,0x7c]
+v_cmpx_u_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x78,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xbc,0x7c]
+v_cmpx_u_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x78,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xbc,0x7c]
+v_cmpx_u_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x78,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xbc,0x7c]
+v_cmpx_u_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x78,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xbc,0x7c]
+v_cmpx_u_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x78,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xbc,0x7c]
+v_cmpx_u_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x78,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xbc,0x7c]
+v_cmpx_u_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x78,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xbc,0x7c]
+v_cmpx_u_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x78,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xbc,0x7c]
+v_cmpx_u_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x78,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xbc,0x7c]
+v_cmpx_u_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x78,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xbc,0x7c]
+v_cmpx_u_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x78,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xbc,0x7c]
+v_cmpx_u_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x78,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xbc,0x7c]
+v_cmpx_u_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x78,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_nlt_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xbc,0x7c]
+v_cmpx_u_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x78,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_nlt_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xbc,0x7c]
+v_cmpx_u_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x78,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_nlt_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xbc,0x7c]
+v_cmpx_u_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x78,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_nlt_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xbc,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_u_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x78,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_nlt_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xbc,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_u_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x78,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_nlt_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xbc,0x7c]
+v_cmpx_u_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x78,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_nlt_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xbc,0x7c]
+v_cmpx_u_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x78,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xbd,0x7c]
+v_cmpx_nge_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xf2,0x7c]
 
-v_cmpx_nlt_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xf2,0x7c]
 
-v_cmpx_nlt_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x5e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xf2,0x7c]
 
-v_cmpx_nlt_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x5e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xf2,0x7c]
 
-v_cmpx_nlt_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x5e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xf2,0x7c]
 
-v_cmpx_nlt_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x5e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xf2,0x7c]
 
-v_cmpx_nlt_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x5e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xf2,0x7c]
 
-v_cmpx_nlt_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x5e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xf2,0x7c]
 
-v_cmpx_nlt_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x5e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xf2,0x7c]
 
-v_cmpx_nlt_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x5e,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_nge_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xf2,0x7c]
 
-v_cmpx_nlt_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x5e,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_nge_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xf2,0x7c]
 
-v_cmpx_nlt_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x5e,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_nge_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xf2,0x7c]
 
-v_cmpx_nlt_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_nge_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xf2,0x7c]
 
-v_cmpx_nlt_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x5e,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_nge_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xf2,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_nlt_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_nge_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xf2,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_nlt_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_nge_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xf2,0x7c]
 
-v_cmpx_nlt_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_nge_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xf2,0x7c]
 
-v_cmpx_nlt_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_nge_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xf3,0x7c]
 
-v_cmpx_nlt_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_nge_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x79,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_nge_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x79,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_nge_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x79,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x5e,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_nge_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x79,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_nlt_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x5e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nge_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x79,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xbe,0x7c]
+v_cmpx_nge_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x79,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xbe,0x7c]
+v_cmpx_nge_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x79,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xbe,0x7c]
+v_cmpx_nge_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x79,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xbe,0x7c]
+v_cmpx_nge_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x79,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xbe,0x7c]
+v_cmpx_nge_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x79,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xbe,0x7c]
+v_cmpx_nge_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x79,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xbe,0x7c]
+v_cmpx_nge_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x79,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xbe,0x7c]
+v_cmpx_nge_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x79,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_tru_f32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xbe,0x7c]
+v_cmpx_nge_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x79,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_tru_f32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xbe,0x7c]
+v_cmpx_nge_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x79,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_tru_f32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xbe,0x7c]
+v_cmpx_nge_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x79,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_tru_f32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xbe,0x7c]
+v_cmpx_nge_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x79,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmpx_tru_f32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xbe,0x7c]
+v_cmpx_nge_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x79,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmpx_tru_f32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xbe,0x7c]
+v_cmpx_nge_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x79,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmpx_tru_f32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xbe,0x7c]
+v_cmpx_nge_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x79,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xbe,0x7c]
+v_cmpx_nlg_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xf4,0x7c]
 
-v_cmpx_tru_f32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xbe,0x7c]
+v_cmpx_nlg_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xf4,0x7c]
 
-v_cmpx_tru_f32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xbe,0x7c]
+v_cmpx_nlg_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xf4,0x7c]
 
-v_cmpx_tru_f32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xbe,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_nlg_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xf4,0x7c]
 
-v_cmpx_tru_f32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xbe,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_nlg_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xf4,0x7c]
 
-v_cmpx_tru_f32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xbe,0x7c]
+v_cmpx_nlg_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xf4,0x7c]
 
-v_cmpx_tru_f32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xbe,0x7c]
+v_cmpx_nlg_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xf4,0x7c]
 
-v_cmpx_tru_f32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xbf,0x7c]
+v_cmpx_nlg_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xf4,0x7c]
 
-v_cmpx_tru_f32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0x5f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xf4,0x7c]
 
-v_cmpx_tru_f32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0x5f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xf4,0x7c]
 
-v_cmpx_tru_f32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0x5f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xf4,0x7c]
 
-v_cmpx_tru_f32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0x5f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xf4,0x7c]
 
-v_cmpx_tru_f32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0x5f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xf4,0x7c]
 
-v_cmpx_tru_f32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0x5f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xf4,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmpx_tru_f32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0x5f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xf4,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmpx_tru_f32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0x5f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xf4,0x7c]
 
-v_cmpx_tru_f32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0x5f,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_nlg_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xf4,0x7c]
 
-v_cmpx_tru_f32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0x5f,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_nlg_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xf5,0x7c]
 
-v_cmpx_tru_f32_e64 s[0:1], scc, s0
-// CHECK: [0x00,0x00,0x5f,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_nlg_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0x5f,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_nlg_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x7a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0x5f,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_nlg_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x7a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0x5f,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_nlg_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x7a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0x5f,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_nlg_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x7a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32_e64 s[0:1], s0, scc
-// CHECK: [0x00,0x00,0x5f,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_nlg_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x7a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0x5f,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_nlg_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x7a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0x5f,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_nlg_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x7a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_tru_f32_e64 s[0:1], -s0, s0
-// CHECK: [0x00,0x00,0x5f,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_nlg_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_tru_f32_e64 s[0:1], s0, -s0
-// CHECK: [0x00,0x00,0x5f,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_nlg_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_tru_f32_e64 s[0:1], -s0, -s0
-// CHECK: [0x00,0x00,0x5f,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_nlg_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_tru_f32_e64 s[0:1], s0, s0 clamp
-// CHECK: [0x00,0x80,0x5f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlg_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_f_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc0,0x7c]
+v_cmpx_nlg_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_f_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc0,0x7c]
+v_cmpx_nlg_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_f_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xc0,0x7c]
+v_cmpx_nlg_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_f_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xc0,0x7c]
+v_cmpx_nlg_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_f_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc0,0x7c]
+v_cmpx_nlg_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_f_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc0,0x7c]
+v_cmpx_nlg_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_f_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc0,0x7c]
+v_cmpx_nlg_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x7a,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_f_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc0,0x7c]
+v_cmpx_nlg_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x7a,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc0,0x7c]
+v_cmpx_ngt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xf6,0x7c]
 
-v_cmp_f_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc0,0x7c]
+v_cmpx_ngt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xf6,0x7c]
 
-v_cmp_f_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc0,0x7c]
+v_cmpx_ngt_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xf6,0x7c]
 
-v_cmp_f_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc0,0x7c]
+v_cmpx_ngt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xf6,0x7c]
 
-v_cmp_f_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc0,0x7c]
+v_cmpx_ngt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xf6,0x7c]
 
-v_cmp_f_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc0,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_ngt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xf6,0x7c]
 
-v_cmp_f_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc0,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_ngt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xf6,0x7c]
 
-v_cmp_f_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc0,0x7c]
+v_cmpx_ngt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xf6,0x7c]
 
-v_cmp_f_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc0,0x7c]
+v_cmpx_ngt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xf6,0x7c]
 
-v_cmp_f_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc1,0x7c]
+v_cmpx_ngt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xf6,0x7c]
 
-v_cmp_f_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xf6,0x7c]
 
-v_cmp_f_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x60,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xf6,0x7c]
 
-v_cmp_f_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x60,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xf6,0x7c]
 
-v_cmp_f_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x60,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xf6,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_f_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x60,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xf6,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_f_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x60,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xf6,0x7c]
 
-v_cmp_f_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x60,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xf6,0x7c]
 
-v_cmp_f_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x60,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xf7,0x7c]
 
-v_cmp_f_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ngt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x7b,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_ngt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x7b,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_ngt_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x7b,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ngt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x7b,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_ngt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x7b,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ngt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x7b,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_ngt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x7b,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_ngt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x7b,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ngt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x7b,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_ngt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x7b,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_ngt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x7b,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_ngt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x7b,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_f_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x60,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_ngt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x7b,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_f_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x60,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ngt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x7b,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_lt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc2,0x7c]
+v_cmpx_ngt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x7b,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_lt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc2,0x7c]
+v_cmpx_ngt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x7b,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_lt_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xc2,0x7c]
+v_cmpx_ngt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x7b,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_lt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xc2,0x7c]
+v_cmpx_ngt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x7b,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_lt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc2,0x7c]
+v_cmpx_ngt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x7b,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_lt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc2,0x7c]
+v_cmpx_ngt_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x7b,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc2,0x7c]
+v_cmpx_nle_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xf8,0x7c]
 
-v_cmp_lt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc2,0x7c]
+v_cmpx_nle_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xf8,0x7c]
 
-v_cmp_lt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc2,0x7c]
+v_cmpx_nle_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xf8,0x7c]
 
-v_cmp_lt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc2,0x7c]
+v_cmpx_nle_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xf8,0x7c]
 
-v_cmp_lt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc2,0x7c]
+v_cmpx_nle_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xf8,0x7c]
 
-v_cmp_lt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc2,0x7c]
+v_cmpx_nle_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xf8,0x7c]
 
-v_cmp_lt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc2,0x7c]
+v_cmpx_nle_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xf8,0x7c]
 
-v_cmp_lt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc2,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_nle_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xf8,0x7c]
 
-v_cmp_lt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc2,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_nle_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xf8,0x7c]
 
-v_cmp_lt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc2,0x7c]
+v_cmpx_nle_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xf8,0x7c]
 
-v_cmp_lt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc2,0x7c]
+v_cmpx_nle_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xf8,0x7c]
 
-v_cmp_lt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc3,0x7c]
+v_cmpx_nle_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xf8,0x7c]
 
-v_cmp_lt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x61,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xf8,0x7c]
 
-v_cmp_lt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x61,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xf8,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_lt_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x61,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xf8,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_lt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x61,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xf8,0x7c]
 
-v_cmp_lt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x61,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xf8,0x7c]
 
-v_cmp_lt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x61,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xf9,0x7c]
 
-v_cmp_lt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x61,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x61,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x7c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x61,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_nle_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x7c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x61,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_nle_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x7c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x61,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_nle_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x7c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x61,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_nle_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x7c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x61,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_nle_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x7c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x61,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_nle_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x7c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x61,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_nle_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x61,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_nle_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x61,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_nle_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x61,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_nle_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x61,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_nle_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x61,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_nle_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x61,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_nle_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_lt_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x61,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nle_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_eq_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc4,0x7c]
+v_cmpx_nle_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_eq_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc4,0x7c]
+v_cmpx_nle_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_eq_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xc4,0x7c]
+v_cmpx_nle_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x7c,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_eq_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xc4,0x7c]
+v_cmpx_nle_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x7c,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc4,0x7c]
+v_cmpx_neq_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xfa,0x7c]
 
-v_cmp_eq_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc4,0x7c]
+v_cmpx_neq_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xfa,0x7c]
 
-v_cmp_eq_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc4,0x7c]
+v_cmpx_neq_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xfa,0x7c]
 
-v_cmp_eq_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc4,0x7c]
+v_cmpx_neq_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xfa,0x7c]
 
-v_cmp_eq_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc4,0x7c]
+v_cmpx_neq_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xfa,0x7c]
 
-v_cmp_eq_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc4,0x7c]
+v_cmpx_neq_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xfa,0x7c]
 
-v_cmp_eq_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc4,0x7c]
+v_cmpx_neq_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xfa,0x7c]
 
-v_cmp_eq_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc4,0x7c]
+v_cmpx_neq_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xfa,0x7c]
 
-v_cmp_eq_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc4,0x7c]
+v_cmpx_neq_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xfa,0x7c]
 
-v_cmp_eq_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc4,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_neq_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xfa,0x7c]
 
-v_cmp_eq_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc4,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_neq_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xfa,0x7c]
 
-v_cmp_eq_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc4,0x7c]
+v_cmpx_neq_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xfa,0x7c]
 
-v_cmp_eq_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc4,0x7c]
+v_cmpx_neq_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xfa,0x7c]
 
-v_cmp_eq_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc5,0x7c]
+v_cmpx_neq_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xfa,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_eq_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xfa,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_eq_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x62,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xfa,0x7c]
 
-v_cmp_eq_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x62,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xfa,0x7c]
 
-v_cmp_eq_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x62,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xfb,0x7c]
 
-v_cmp_eq_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x62,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x7d,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x62,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x7d,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x62,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x7d,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x62,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x7d,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_neq_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x7d,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_neq_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x7d,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_neq_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x7d,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_neq_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x7d,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_neq_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x7d,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_neq_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x7d,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_neq_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x7d,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_neq_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x7d,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_neq_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x7d,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_neq_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x7d,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_neq_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x7d,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_neq_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x7d,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_eq_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x62,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_neq_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x7d,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_eq_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x62,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_neq_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x7d,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_le_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc6,0x7c]
+v_cmpx_neq_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x7d,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_le_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc6,0x7c]
+v_cmpx_neq_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x7d,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xc6,0x7c]
+v_cmpx_nlt_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xfc,0x7c]
 
-v_cmp_le_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xc6,0x7c]
+v_cmpx_nlt_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xfc,0x7c]
 
-v_cmp_le_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc6,0x7c]
+v_cmpx_nlt_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xfc,0x7c]
 
-v_cmp_le_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc6,0x7c]
+v_cmpx_nlt_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xfc,0x7c]
 
-v_cmp_le_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc6,0x7c]
+v_cmpx_nlt_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xfc,0x7c]
 
-v_cmp_le_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc6,0x7c]
+v_cmpx_nlt_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xfc,0x7c]
 
-v_cmp_le_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc6,0x7c]
+v_cmpx_nlt_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xfc,0x7c]
 
-v_cmp_le_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc6,0x7c]
+v_cmpx_nlt_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xfc,0x7c]
 
-v_cmp_le_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc6,0x7c]
+v_cmpx_nlt_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xfc,0x7c]
 
-v_cmp_le_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc6,0x7c]
+v_cmpx_nlt_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xfc,0x7c]
 
-v_cmp_le_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc6,0x7c]
+v_cmpx_nlt_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xfc,0x7c]
 
-v_cmp_le_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc6,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_nlt_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xfc,0x7c]
 
-v_cmp_le_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc6,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_nlt_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xfc,0x7c]
 
-v_cmp_le_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc6,0x7c]
+v_cmpx_nlt_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xfc,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_le_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc6,0x7c]
+v_cmpx_nlt_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xfc,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_le_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc7,0x7c]
+v_cmpx_nlt_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xfc,0x7c]
 
-v_cmp_le_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x63,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xfc,0x7c]
 
-v_cmp_le_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x63,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xfd,0x7c]
 
-v_cmp_le_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x63,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x63,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x7e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x63,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x7e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x63,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x7e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x63,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x7e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x63,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x7e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x63,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_nlt_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x7e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x63,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_nlt_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x7e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x63,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_nlt_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x63,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_nlt_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_le_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x63,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_nlt_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_le_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x63,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_nlt_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_le_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x63,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_nlt_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_le_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x63,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_nlt_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_le_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x63,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_nlt_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_le_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x63,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_nlt_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_le_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x63,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_nlt_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_le_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x63,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_nlt_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_le_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x63,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_nlt_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x7e,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_le_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x63,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_nlt_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x7e,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc8,0x7c]
+v_cmpx_tru_f64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xfe,0x7c]
 
-v_cmp_gt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc8,0x7c]
+v_cmpx_tru_f64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xfe,0x7c]
 
-v_cmp_gt_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xc8,0x7c]
+v_cmpx_tru_f64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xfe,0x7c]
 
-v_cmp_gt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xc8,0x7c]
+v_cmpx_tru_f64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xfe,0x7c]
 
-v_cmp_gt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc8,0x7c]
+v_cmpx_tru_f64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xfe,0x7c]
 
-v_cmp_gt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc8,0x7c]
+v_cmpx_tru_f64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xfe,0x7c]
 
-v_cmp_gt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc8,0x7c]
+v_cmpx_tru_f64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xfe,0x7c]
 
-v_cmp_gt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc8,0x7c]
+v_cmpx_tru_f64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xfe,0x7c]
 
-v_cmp_gt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc8,0x7c]
+v_cmpx_tru_f64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xfe,0x7c]
 
-v_cmp_gt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc8,0x7c]
+v_cmpx_tru_f64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xfe,0x7c]
 
-v_cmp_gt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc8,0x7c]
+v_cmpx_tru_f64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xfe,0x7c]
 
-v_cmp_gt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc8,0x7c]
+v_cmpx_tru_f64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xfe,0x7c]
 
-v_cmp_gt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc8,0x7c]
+v_cmpx_tru_f64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xfe,0x7c]
 
-v_cmp_gt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc8,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_tru_f64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xfe,0x7c,0x56,0x34,0x12,0xaf]
 
-v_cmp_gt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc8,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_tru_f64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xfe,0x7c,0x73,0x72,0x71,0x3f]
 
-v_cmp_gt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc8,0x7c]
+v_cmpx_tru_f64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xfe,0x7c]
 
-v_cmp_gt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc8,0x7c]
+v_cmpx_tru_f64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xfe,0x7c]
 
-v_cmp_gt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc9,0x7c]
+v_cmpx_tru_f64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xff,0x7c]
 
-v_cmp_gt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x7f,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x64,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0x7f,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x64,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0x7f,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x64,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0x7f,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x64,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0x7f,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x64,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0x7f,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x64,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0x7f,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x64,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_tru_f64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0x7f,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_tru_f64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0x7f,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_tru_f64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0x7f,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_tru_f64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0x7f,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_tru_f64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0x7f,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_tru_f64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0x7f,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_tru_f64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0x7f,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_tru_f64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0x7f,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_tru_f64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0x7f,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_tru_f64_e64 s[10:11], -s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0x7f,0xd0,0x04,0x08,0x00,0x20]
 
-v_cmp_gt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_tru_f64_e64 s[10:11], s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x7f,0xd0,0x04,0x08,0x00,0x40]
 
-v_cmp_gt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_tru_f64_e64 s[10:11], -s[4:5], -s[4:5]
+// CHECK: [0x0a,0x00,0x7f,0xd0,0x04,0x08,0x00,0x60]
 
-v_cmp_gt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_tru_f64_e64 s[10:11], s[4:5], s[4:5] clamp
+// CHECK: [0x0a,0x80,0x7f,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x64,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_f_i16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x40,0x7d]
 
-v_cmp_gt_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x64,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x40,0x7d]
 
-v_cmp_lg_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xca,0x7c]
+v_cmp_f_i16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x40,0x7d]
 
-v_cmp_lg_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xca,0x7c]
+v_cmp_f_i16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x40,0x7d]
 
-v_cmp_lg_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xca,0x7c]
+v_cmp_f_i16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x40,0x7d]
 
-v_cmp_lg_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xca,0x7c]
+v_cmp_f_i16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x40,0x7d]
 
-v_cmp_lg_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xca,0x7c]
+v_cmp_f_i16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x40,0x7d]
 
-v_cmp_lg_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xca,0x7c]
+v_cmp_f_i16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x40,0x7d]
 
-v_cmp_lg_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xca,0x7c]
+v_cmp_f_i16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x40,0x7d]
 
-v_cmp_lg_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xca,0x7c]
+v_cmp_f_i16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x40,0x7d]
 
-v_cmp_lg_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xca,0x7c]
+v_cmp_f_i16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x40,0x7d]
 
-v_cmp_lg_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xca,0x7c]
+v_cmp_f_i16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x40,0x7d]
 
-v_cmp_lg_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xca,0x7c]
+v_cmp_f_i16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x40,0x7d]
 
-v_cmp_lg_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xca,0x7c]
+v_cmp_f_i16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x40,0x7d]
 
-v_cmp_lg_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xca,0x7c]
+v_cmp_f_i16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x40,0x7d]
 
-v_cmp_lg_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xca,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_f_i16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x40,0x7d]
 
-v_cmp_lg_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xca,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_f_i16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x40,0x7d]
 
-v_cmp_lg_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xca,0x7c]
+v_cmp_f_i16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x40,0x7d]
 
-v_cmp_lg_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xca,0x7c]
+v_cmp_f_i16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x40,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmp_lg_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xcb,0x7c]
+v_cmp_f_i16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x40,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x65,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x40,0x7d]
 
-v_cmp_lg_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x65,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x40,0x7d]
 
-v_cmp_lg_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x65,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x41,0x7d]
 
-v_cmp_lg_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x65,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x65,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x65,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xa0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x65,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xa0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x65,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x65,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_f_i16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x65,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_f_i16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x65,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_f_i16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x65,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_f_i16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xa0,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x65,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_f_i16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa0,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x65,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_f_i16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xa0,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x65,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_f_i16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x65,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_f_i16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa0,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x65,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_f_i16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x65,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_f_i16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x65,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_f_i16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x65,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_f_i16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x65,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_f_i16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_lg_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x65,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_ge_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xcc,0x7c]
+v_cmp_f_i16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_ge_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xcc,0x7c]
+v_cmp_f_i16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_ge_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xcc,0x7c]
+v_cmp_f_i16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_ge_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xcc,0x7c]
+v_cmp_f_i16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_ge_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xcc,0x7c]
+v_cmp_f_i16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_ge_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xcc,0x7c]
+v_cmp_f_i16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_ge_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xcc,0x7c]
+v_cmp_f_i16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_ge_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xcc,0x7c]
+v_cmp_f_i16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_ge_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xcc,0x7c]
+v_cmp_f_i16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_ge_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xcc,0x7c]
+v_cmp_f_i16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_ge_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xcc,0x7c]
+v_cmp_f_i16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_ge_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xcc,0x7c]
+v_cmp_f_i16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_ge_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xcc,0x7c]
+v_cmp_f_i16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa0,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_ge_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xcc,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_lt_i16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x42,0x7d]
 
-v_cmp_ge_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xcc,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_lt_i16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x42,0x7d]
 
-v_cmp_ge_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xcc,0x7c]
+v_cmp_lt_i16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x42,0x7d]
 
-v_cmp_ge_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xcc,0x7c]
+v_cmp_lt_i16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x42,0x7d]
 
-v_cmp_ge_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xcd,0x7c]
+v_cmp_lt_i16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x42,0x7d]
 
-v_cmp_ge_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x42,0x7d]
 
-v_cmp_ge_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x66,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x42,0x7d]
 
-v_cmp_ge_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x66,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x42,0x7d]
 
-v_cmp_ge_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x66,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x42,0x7d]
 
-v_cmp_ge_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x66,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x42,0x7d]
 
-v_cmp_ge_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x66,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x42,0x7d]
 
-v_cmp_ge_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x66,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x42,0x7d]
 
-v_cmp_ge_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x66,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x42,0x7d]
 
-v_cmp_ge_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_lt_i16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x42,0x7d]
 
-v_cmp_ge_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_lt_i16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x42,0x7d]
 
-v_cmp_ge_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_lt_i16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x42,0x7d]
 
-v_cmp_ge_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_lt_i16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x42,0x7d]
 
-v_cmp_ge_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_lt_i16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x42,0x7d]
 
-v_cmp_ge_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_lt_i16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x42,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmp_ge_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_lt_i16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x42,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmp_ge_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_lt_i16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x42,0x7d]
 
-v_cmp_ge_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_lt_i16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x42,0x7d]
 
-v_cmp_ge_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_lt_i16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x43,0x7d]
 
-v_cmp_ge_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_lt_i16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_lt_i16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x66,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_lt_i16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xa1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x66,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xa1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xce,0x7c]
+v_cmp_lt_i16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xce,0x7c]
+v_cmp_lt_i16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xce,0x7c]
+v_cmp_lt_i16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xce,0x7c]
+v_cmp_lt_i16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_o_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xce,0x7c]
+v_cmp_lt_i16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xa1,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_o_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xce,0x7c]
+v_cmp_lt_i16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa1,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_o_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xce,0x7c]
+v_cmp_lt_i16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xa1,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_o_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xce,0x7c]
+v_cmp_lt_i16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_o_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xce,0x7c]
+v_cmp_lt_i16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa1,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_o_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xce,0x7c]
+v_cmp_lt_i16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_o_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xce,0x7c]
+v_cmp_lt_i16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_o_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xce,0x7c]
+v_cmp_lt_i16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_o_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xce,0x7c]
+v_cmp_lt_i16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_o_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xce,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_lt_i16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_o_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xce,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_lt_i16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_o_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xce,0x7c]
+v_cmp_lt_i16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_o_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xce,0x7c]
+v_cmp_lt_i16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_o_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xcf,0x7c]
+v_cmp_lt_i16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_o_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x67,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_o_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x67,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_o_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x67,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_o_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x67,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_o_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x67,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_o_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x67,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_o_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x67,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_o_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x67,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_o_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x67,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_lt_i16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_o_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x67,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_lt_i16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa1,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_o_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x67,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_eq_i16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x44,0x7d]
 
-v_cmp_o_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x67,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_eq_i16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x44,0x7d]
 
-v_cmp_o_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x67,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_eq_i16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x44,0x7d]
 
-v_cmp_o_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x67,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_eq_i16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x44,0x7d]
 
-v_cmp_o_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x67,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_eq_i16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x44,0x7d]
 
-v_cmp_o_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x67,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_eq_i16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x44,0x7d]
 
-v_cmp_o_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x67,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_eq_i16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x44,0x7d]
 
-v_cmp_o_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x67,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_eq_i16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x44,0x7d]
 
-v_cmp_o_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x67,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_eq_i16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x44,0x7d]
 
-v_cmp_o_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x67,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_eq_i16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x44,0x7d]
 
-v_cmp_o_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x67,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_eq_i16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x44,0x7d]
 
-v_cmp_o_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x67,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x44,0x7d]
 
-v_cmp_u_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd0,0x7c]
+v_cmp_eq_i16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x44,0x7d]
 
-v_cmp_u_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xd0,0x7c]
+v_cmp_eq_i16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x44,0x7d]
 
-v_cmp_u_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xd0,0x7c]
+v_cmp_eq_i16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x44,0x7d]
 
-v_cmp_u_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xd0,0x7c]
+v_cmp_eq_i16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x44,0x7d]
 
-v_cmp_u_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xd0,0x7c]
+v_cmp_eq_i16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x44,0x7d]
 
-v_cmp_u_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xd0,0x7c]
+v_cmp_eq_i16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x44,0x7d]
 
-v_cmp_u_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xd0,0x7c]
+v_cmp_eq_i16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x44,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmp_u_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xd0,0x7c]
+v_cmp_eq_i16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x44,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmp_u_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xd0,0x7c]
+v_cmp_eq_i16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x44,0x7d]
 
-v_cmp_u_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xd0,0x7c]
+v_cmp_eq_i16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x44,0x7d]
 
-v_cmp_u_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xd0,0x7c]
+v_cmp_eq_i16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x45,0x7d]
 
-v_cmp_u_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xd0,0x7c]
+v_cmp_eq_i16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xd0,0x7c]
+v_cmp_eq_i16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xd0,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_eq_i16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xa2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xd0,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_eq_i16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xa2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xd0,0x7c]
+v_cmp_eq_i16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xd0,0x7c]
+v_cmp_eq_i16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xd1,0x7c]
+v_cmp_eq_i16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_u_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x68,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xa2,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_u_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x68,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa2,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_u_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x68,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xa2,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_u_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x68,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_u_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x68,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa2,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_u_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x68,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_u_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x68,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_u_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_eq_i16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_u_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_eq_i16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_u_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_eq_i16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_u_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_eq_i16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_u_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_eq_i16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_u_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_eq_i16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_u_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_eq_i16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_u_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_eq_i16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_u_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_eq_i16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_u_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_eq_i16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_u_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_eq_i16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_u_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_eq_i16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_u_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x68,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_eq_i16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_u_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x68,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_nge_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd2,0x7c]
+v_cmp_eq_i16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_nge_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xd2,0x7c]
+v_cmp_eq_i16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_nge_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xd2,0x7c]
+v_cmp_eq_i16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa2,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_nge_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xd2,0x7c]
+v_cmp_le_i16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x46,0x7d]
 
-v_cmp_nge_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xd2,0x7c]
+v_cmp_le_i16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x46,0x7d]
 
-v_cmp_nge_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xd2,0x7c]
+v_cmp_le_i16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x46,0x7d]
 
-v_cmp_nge_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xd2,0x7c]
+v_cmp_le_i16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x46,0x7d]
 
-v_cmp_nge_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xd2,0x7c]
+v_cmp_le_i16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x46,0x7d]
 
-v_cmp_nge_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xd2,0x7c]
+v_cmp_le_i16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x46,0x7d]
 
-v_cmp_nge_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xd2,0x7c]
+v_cmp_le_i16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x46,0x7d]
 
-v_cmp_nge_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xd2,0x7c]
+v_cmp_le_i16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x46,0x7d]
 
-v_cmp_nge_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xd2,0x7c]
+v_cmp_le_i16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x46,0x7d]
 
-v_cmp_nge_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xd2,0x7c]
+v_cmp_le_i16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x46,0x7d]
 
-v_cmp_nge_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xd2,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_le_i16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x46,0x7d]
 
-v_cmp_nge_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xd2,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_le_i16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x46,0x7d]
 
-v_cmp_nge_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xd2,0x7c]
+v_cmp_le_i16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x46,0x7d]
 
-v_cmp_nge_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xd2,0x7c]
+v_cmp_le_i16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x46,0x7d]
 
-v_cmp_nge_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xd3,0x7c]
+v_cmp_le_i16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x46,0x7d]
 
-v_cmp_nge_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x69,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x46,0x7d]
 
-v_cmp_nge_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x69,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x46,0x7d]
 
-v_cmp_nge_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x69,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x46,0x7d]
 
-v_cmp_nge_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x69,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x46,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmp_nge_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x69,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x46,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmp_nge_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x69,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x46,0x7d]
 
-v_cmp_nge_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x69,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x46,0x7d]
 
-v_cmp_nge_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x69,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x47,0x7d]
 
-v_cmp_nge_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x69,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_le_i16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x69,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_le_i16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x69,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_le_i16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xa3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x69,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_le_i16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xa3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x69,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_le_i16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x69,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_le_i16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x69,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_le_i16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x69,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_le_i16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nge_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x69,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_le_i16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xa3,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_nge_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x69,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_le_i16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa3,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_nge_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x69,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_le_i16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xa3,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_nge_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x69,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_le_i16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_nge_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x69,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_le_i16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa3,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_nge_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x69,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd4,0x7c]
+v_cmp_le_i16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xd4,0x7c]
+v_cmp_le_i16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xd4,0x7c]
+v_cmp_le_i16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xd4,0x7c]
+v_cmp_le_i16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xd4,0x7c]
+v_cmp_le_i16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xd4,0x7c]
+v_cmp_le_i16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xd4,0x7c]
+v_cmp_le_i16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xd4,0x7c]
+v_cmp_le_i16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xd4,0x7c]
+v_cmp_le_i16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xd4,0x7c]
+v_cmp_le_i16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xd4,0x7c]
+v_cmp_le_i16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xd4,0x7c]
+v_cmp_le_i16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_nlg_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xd4,0x7c]
+v_cmp_le_i16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_nlg_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xd4,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_le_i16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_nlg_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xd4,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_le_i16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_nlg_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xd4,0x7c]
+v_cmp_le_i16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_nlg_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xd4,0x7c]
+v_cmp_le_i16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_nlg_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xd5,0x7c]
+v_cmp_le_i16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa3,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_nlg_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x48,0x7d]
 
-v_cmp_nlg_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x6a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x48,0x7d]
 
-v_cmp_nlg_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x6a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x48,0x7d]
 
-v_cmp_nlg_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x6a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x48,0x7d]
 
-v_cmp_nlg_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x6a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x48,0x7d]
 
-v_cmp_nlg_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x6a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x48,0x7d]
 
-v_cmp_nlg_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x6a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x48,0x7d]
 
-v_cmp_nlg_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x6a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x48,0x7d]
 
-v_cmp_nlg_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_gt_i16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x48,0x7d]
 
-v_cmp_nlg_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_gt_i16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x48,0x7d]
 
-v_cmp_nlg_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_gt_i16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x48,0x7d]
 
-v_cmp_nlg_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_gt_i16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x48,0x7d]
 
-v_cmp_nlg_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_gt_i16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x48,0x7d]
 
-v_cmp_nlg_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_gt_i16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x48,0x7d]
 
-v_cmp_nlg_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_gt_i16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x48,0x7d]
 
-v_cmp_nlg_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_gt_i16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x48,0x7d]
 
-v_cmp_nlg_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_gt_i16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x48,0x7d]
 
-v_cmp_nlg_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_gt_i16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x48,0x7d]
 
-v_cmp_nlg_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_gt_i16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x48,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmp_nlg_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_gt_i16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x48,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmp_nlg_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x6a,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_gt_i16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x48,0x7d]
 
-v_cmp_nlg_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x6a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x48,0x7d]
 
-v_cmp_ngt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd6,0x7c]
+v_cmp_gt_i16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x49,0x7d]
 
-v_cmp_ngt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xd6,0x7c]
+v_cmp_gt_i16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xd6,0x7c]
+v_cmp_gt_i16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xd6,0x7c]
+v_cmp_gt_i16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xa4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xd6,0x7c]
+v_cmp_gt_i16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xa4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xd6,0x7c]
+v_cmp_gt_i16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xd6,0x7c]
+v_cmp_gt_i16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xd6,0x7c]
+v_cmp_gt_i16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xd6,0x7c]
+v_cmp_gt_i16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xd6,0x7c]
+v_cmp_gt_i16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xa4,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xd6,0x7c]
+v_cmp_gt_i16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa4,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xd6,0x7c]
+v_cmp_gt_i16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xa4,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xd6,0x7c]
+v_cmp_gt_i16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xd6,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_gt_i16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa4,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xd6,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_gt_i16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xd6,0x7c]
+v_cmp_gt_i16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xd6,0x7c]
+v_cmp_gt_i16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_ngt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xd7,0x7c]
+v_cmp_gt_i16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_ngt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_ngt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x6b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_ngt_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x6b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_ngt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x6b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_ngt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x6b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_ngt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x6b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_ngt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x6b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_ngt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x6b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_ngt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x6b,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_gt_i16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_ngt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x6b,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_gt_i16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_ngt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x6b,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_gt_i16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_ngt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6b,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_gt_i16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_ngt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x6b,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_gt_i16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_ngt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x6b,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_gt_i16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_ngt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x6b,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_gt_i16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa4,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_ngt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x6b,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_ne_i16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x4a,0x7d]
 
-v_cmp_ngt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x6b,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ne_i16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x4a,0x7d]
 
-v_cmp_ngt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x6b,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_ne_i16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x4a,0x7d]
 
-v_cmp_ngt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6b,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_ne_i16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x4a,0x7d]
 
-v_cmp_ngt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x6b,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_ne_i16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x4a,0x7d]
 
-v_cmp_ngt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x6b,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_ne_i16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x4a,0x7d]
 
-v_cmp_ngt_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x6b,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x4a,0x7d]
 
-v_cmp_nle_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd8,0x7c]
+v_cmp_ne_i16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x4a,0x7d]
 
-v_cmp_nle_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xd8,0x7c]
+v_cmp_ne_i16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x4a,0x7d]
 
-v_cmp_nle_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xd8,0x7c]
+v_cmp_ne_i16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x4a,0x7d]
 
-v_cmp_nle_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xd8,0x7c]
+v_cmp_ne_i16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x4a,0x7d]
 
-v_cmp_nle_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xd8,0x7c]
+v_cmp_ne_i16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x4a,0x7d]
 
-v_cmp_nle_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xd8,0x7c]
+v_cmp_ne_i16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x4a,0x7d]
 
-v_cmp_nle_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xd8,0x7c]
+v_cmp_ne_i16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x4a,0x7d]
 
-v_cmp_nle_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xd8,0x7c]
+v_cmp_ne_i16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x4a,0x7d]
 
-v_cmp_nle_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xd8,0x7c]
+v_cmp_ne_i16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x4a,0x7d]
 
-v_cmp_nle_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xd8,0x7c]
+v_cmp_ne_i16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x4a,0x7d]
 
-v_cmp_nle_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xd8,0x7c]
+v_cmp_ne_i16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x4a,0x7d]
 
-v_cmp_nle_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xd8,0x7c]
+v_cmp_ne_i16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x4a,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmp_nle_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xd8,0x7c]
+v_cmp_ne_i16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x4a,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmp_nle_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xd8,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_ne_i16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x4a,0x7d]
 
-v_cmp_nle_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xd8,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_ne_i16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x4a,0x7d]
 
-v_cmp_nle_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xd8,0x7c]
+v_cmp_ne_i16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x4b,0x7d]
 
-v_cmp_nle_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xd8,0x7c]
+v_cmp_ne_i16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xd9,0x7c]
+v_cmp_ne_i16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xa5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x6c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xa5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x6c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x6c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x6c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x6c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_nle_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x6c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xa5,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_nle_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x6c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa5,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_nle_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ne_i16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xa5,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_nle_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_ne_i16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_nle_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_ne_i16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa5,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_nle_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ne_i16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_nle_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_ne_i16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_nle_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ne_i16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_nle_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_ne_i16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_nle_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_ne_i16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_nle_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ne_i16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_nle_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_ne_i16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_nle_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_ne_i16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_nle_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_ne_i16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_nle_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x6c,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_ne_i16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_nle_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x6c,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_neq_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xda,0x7c]
+v_cmp_ne_i16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_neq_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xda,0x7c]
+v_cmp_ne_i16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_neq_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xda,0x7c]
+v_cmp_ne_i16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_neq_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xda,0x7c]
+v_cmp_ne_i16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_neq_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xda,0x7c]
+v_cmp_ne_i16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_neq_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xda,0x7c]
+v_cmp_ne_i16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_neq_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xda,0x7c]
+v_cmp_ne_i16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_neq_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xda,0x7c]
+v_cmp_ne_i16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa5,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_neq_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xda,0x7c]
+v_cmp_ge_i16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x4c,0x7d]
 
-v_cmp_neq_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xda,0x7c]
+v_cmp_ge_i16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x4c,0x7d]
 
-v_cmp_neq_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xda,0x7c]
+v_cmp_ge_i16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x4c,0x7d]
 
-v_cmp_neq_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xda,0x7c]
+v_cmp_ge_i16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x4c,0x7d]
 
-v_cmp_neq_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xda,0x7c]
+v_cmp_ge_i16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x4c,0x7d]
 
-v_cmp_neq_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xda,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_ge_i16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x4c,0x7d]
 
-v_cmp_neq_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xda,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_ge_i16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x4c,0x7d]
 
-v_cmp_neq_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xda,0x7c]
+v_cmp_ge_i16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x4c,0x7d]
 
-v_cmp_neq_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xda,0x7c]
+v_cmp_ge_i16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x4c,0x7d]
 
-v_cmp_neq_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xdb,0x7c]
+v_cmp_ge_i16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x4c,0x7d]
 
-v_cmp_neq_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x4c,0x7d]
 
-v_cmp_neq_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x6d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x4c,0x7d]
 
-v_cmp_neq_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x6d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x4c,0x7d]
 
-v_cmp_neq_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x6d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x4c,0x7d]
 
-v_cmp_neq_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x6d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x4c,0x7d]
 
-v_cmp_neq_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x6d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x4c,0x7d]
 
-v_cmp_neq_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x6d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x4c,0x7d]
 
-v_cmp_neq_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x6d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x4c,0x7d]
 
-v_cmp_neq_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x6d,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ge_i16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x4c,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmp_neq_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x6d,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_ge_i16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x4c,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmp_neq_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x6d,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_ge_i16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x4c,0x7d]
 
-v_cmp_neq_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6d,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ge_i16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x4c,0x7d]
 
-v_cmp_neq_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x6d,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_ge_i16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x4d,0x7d]
 
-v_cmp_neq_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x6d,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ge_i16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x6d,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_ge_i16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x6d,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_ge_i16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xa6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x6d,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ge_i16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xa6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x6d,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_ge_i16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6d,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_ge_i16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x6d,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_ge_i16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x6d,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_ge_i16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_neq_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x6d,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xa6,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xdc,0x7c]
+v_cmp_ge_i16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa6,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xdc,0x7c]
+v_cmp_ge_i16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xa6,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xdc,0x7c]
+v_cmp_ge_i16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xdc,0x7c]
+v_cmp_ge_i16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa6,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xdc,0x7c]
+v_cmp_ge_i16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xdc,0x7c]
+v_cmp_ge_i16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xdc,0x7c]
+v_cmp_ge_i16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xdc,0x7c]
+v_cmp_ge_i16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xdc,0x7c]
+v_cmp_ge_i16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xdc,0x7c]
+v_cmp_ge_i16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xdc,0x7c]
+v_cmp_ge_i16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xdc,0x7c]
+v_cmp_ge_i16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xdc,0x7c]
+v_cmp_ge_i16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xdc,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_ge_i16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xdc,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_ge_i16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xdc,0x7c]
+v_cmp_ge_i16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xdc,0x7c]
+v_cmp_ge_i16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_nlt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xdd,0x7c]
+v_cmp_ge_i16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_nlt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_nlt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x6e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_nlt_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x6e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_nlt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x6e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_nlt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x6e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa6,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_nlt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x6e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x4e,0x7d]
 
-v_cmp_nlt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x6e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x4e,0x7d]
 
-v_cmp_nlt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x6e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x4e,0x7d]
 
-v_cmp_nlt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_t_i16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x4e,0x7d]
 
-v_cmp_nlt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_t_i16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x4e,0x7d]
 
-v_cmp_nlt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_t_i16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x4e,0x7d]
 
-v_cmp_nlt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_t_i16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x4e,0x7d]
 
-v_cmp_nlt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_t_i16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x4e,0x7d]
 
-v_cmp_nlt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_t_i16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x4e,0x7d]
 
-v_cmp_nlt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_t_i16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x4e,0x7d]
 
-v_cmp_nlt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_t_i16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x4e,0x7d]
 
-v_cmp_nlt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_t_i16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x4e,0x7d]
 
-v_cmp_nlt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_t_i16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x4e,0x7d]
 
-v_cmp_nlt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_t_i16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x4e,0x7d]
 
-v_cmp_nlt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_t_i16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x4e,0x7d]
 
-v_cmp_nlt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x6e,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_t_i16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x4e,0x7d]
 
-v_cmp_nlt_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x6e,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x4e,0x7d]
 
-v_cmp_tru_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xde,0x7c]
+v_cmp_t_i16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x4e,0x7d]
 
-v_cmp_tru_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xde,0x7c]
+v_cmp_t_i16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x4e,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmp_tru_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xde,0x7c]
+v_cmp_t_i16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x4e,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmp_tru_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xde,0x7c]
+v_cmp_t_i16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x4e,0x7d]
 
-v_cmp_tru_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xde,0x7c]
+v_cmp_t_i16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x4e,0x7d]
 
-v_cmp_tru_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xde,0x7c]
+v_cmp_t_i16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x4f,0x7d]
 
-v_cmp_tru_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xde,0x7c]
+v_cmp_t_i16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_tru_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xde,0x7c]
+v_cmp_t_i16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_tru_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xde,0x7c]
+v_cmp_t_i16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xa7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_tru_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xde,0x7c]
+v_cmp_t_i16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xa7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_tru_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xde,0x7c]
+v_cmp_t_i16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_tru_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xde,0x7c]
+v_cmp_t_i16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_tru_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xde,0x7c]
+v_cmp_t_i16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_tru_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xde,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_t_i16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_tru_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xde,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_t_i16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xa7,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_tru_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xde,0x7c]
+v_cmp_t_i16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa7,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_tru_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xde,0x7c]
+v_cmp_t_i16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xa7,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_tru_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xdf,0x7c]
+v_cmp_t_i16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_tru_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa7,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_tru_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x6f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_tru_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x6f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_tru_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x6f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_tru_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x6f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_tru_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x6f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_tru_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x6f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_tru_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x6f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_tru_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x6f,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_t_i16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_tru_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x6f,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_t_i16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_tru_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x6f,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_t_i16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_tru_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6f,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_t_i16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_tru_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x6f,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_t_i16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_tru_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x6f,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_t_i16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_tru_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x6f,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_t_i16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_tru_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x6f,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_t_i16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_tru_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x6f,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_t_i16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_tru_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x6f,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_t_i16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_tru_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x6f,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_t_i16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_tru_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x6f,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_t_i16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa7,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_tru_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x6f,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_f_u16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x50,0x7d]
 
-v_cmp_tru_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x6f,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x50,0x7d]
 
-v_cmpx_f_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe0,0x7c]
+v_cmp_f_u16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x50,0x7d]
 
-v_cmpx_f_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe0,0x7c]
+v_cmp_f_u16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x50,0x7d]
 
-v_cmpx_f_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xe0,0x7c]
+v_cmp_f_u16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x50,0x7d]
 
-v_cmpx_f_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xe0,0x7c]
+v_cmp_f_u16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x50,0x7d]
 
-v_cmpx_f_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe0,0x7c]
+v_cmp_f_u16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x50,0x7d]
 
-v_cmpx_f_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe0,0x7c]
+v_cmp_f_u16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x50,0x7d]
 
-v_cmpx_f_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe0,0x7c]
+v_cmp_f_u16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x50,0x7d]
 
-v_cmpx_f_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe0,0x7c]
+v_cmp_f_u16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x50,0x7d]
 
-v_cmpx_f_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe0,0x7c]
+v_cmp_f_u16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x50,0x7d]
 
-v_cmpx_f_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe0,0x7c]
+v_cmp_f_u16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x50,0x7d]
 
-v_cmpx_f_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe0,0x7c]
+v_cmp_f_u16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x50,0x7d]
 
-v_cmpx_f_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe0,0x7c]
+v_cmp_f_u16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x50,0x7d]
 
-v_cmpx_f_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe0,0x7c]
+v_cmp_f_u16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x50,0x7d]
 
-v_cmpx_f_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe0,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_f_u16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x50,0x7d]
 
-v_cmpx_f_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe0,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_f_u16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x50,0x7d]
 
-v_cmpx_f_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe0,0x7c]
+v_cmp_f_u16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x50,0x7d]
 
-v_cmpx_f_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe0,0x7c]
+v_cmp_f_u16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x50,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmpx_f_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe1,0x7c]
+v_cmp_f_u16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x50,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmpx_f_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x50,0x7d]
 
-v_cmpx_f_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x70,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x50,0x7d]
 
-v_cmpx_f_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x70,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x51,0x7d]
 
-v_cmpx_f_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x70,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x70,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x70,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xa8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x70,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xa8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x70,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_f_u16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_f_u16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_f_u16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_f_u16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xa8,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_f_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_f_u16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa8,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_f_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_f_u16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xa8,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_f_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_f_u16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_f_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_f_u16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa8,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_f_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_f_u16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_f_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_f_u16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_f_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_f_u16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_f_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_f_u16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_f_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x70,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_f_u16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_f_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x70,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe2,0x7c]
+v_cmp_f_u16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe2,0x7c]
+v_cmp_f_u16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xe2,0x7c]
+v_cmp_f_u16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xe2,0x7c]
+v_cmp_f_u16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe2,0x7c]
+v_cmp_f_u16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe2,0x7c]
+v_cmp_f_u16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe2,0x7c]
+v_cmp_f_u16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_lt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe2,0x7c]
+v_cmp_f_u16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_lt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe2,0x7c]
+v_cmp_f_u16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_lt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe2,0x7c]
+v_cmp_f_u16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_lt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe2,0x7c]
+v_cmp_f_u16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_lt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe2,0x7c]
+v_cmp_f_u16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_lt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe2,0x7c]
+v_cmp_f_u16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa8,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_lt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe2,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_lt_u16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x52,0x7d]
 
-v_cmpx_lt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe2,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_lt_u16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x52,0x7d]
 
-v_cmpx_lt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe2,0x7c]
+v_cmp_lt_u16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x52,0x7d]
 
-v_cmpx_lt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe2,0x7c]
+v_cmp_lt_u16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x52,0x7d]
 
-v_cmpx_lt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe3,0x7c]
+v_cmp_lt_u16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x52,0x7d]
 
-v_cmpx_lt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x71,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x52,0x7d]
 
-v_cmpx_lt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x71,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x52,0x7d]
 
-v_cmpx_lt_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x71,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x52,0x7d]
 
-v_cmpx_lt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x71,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x52,0x7d]
 
-v_cmpx_lt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x71,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x52,0x7d]
 
-v_cmpx_lt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x71,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x52,0x7d]
 
-v_cmpx_lt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x71,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x52,0x7d]
 
-v_cmpx_lt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x71,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x52,0x7d]
 
-v_cmpx_lt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x71,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_lt_u16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x52,0x7d]
 
-v_cmpx_lt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x71,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_lt_u16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x52,0x7d]
 
-v_cmpx_lt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x71,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_lt_u16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x52,0x7d]
 
-v_cmpx_lt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x71,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_lt_u16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x52,0x7d]
 
-v_cmpx_lt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x71,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_lt_u16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x52,0x7d]
 
-v_cmpx_lt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x71,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_lt_u16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x52,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmpx_lt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x71,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_lt_u16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x52,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmpx_lt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x71,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_lt_u16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x52,0x7d]
 
-v_cmpx_lt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x71,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_lt_u16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x52,0x7d]
 
-v_cmpx_lt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x71,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_lt_u16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x53,0x7d]
 
-v_cmpx_lt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x71,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_lt_u16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x71,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_lt_u16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xa9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x71,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_lt_u16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xa9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x71,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xa9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe4,0x7c]
+v_cmp_lt_u16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xa9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe4,0x7c]
+v_cmp_lt_u16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xa9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xe4,0x7c]
+v_cmp_lt_u16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xa9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xe4,0x7c]
+v_cmp_lt_u16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xa9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe4,0x7c]
+v_cmp_lt_u16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xa9,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe4,0x7c]
+v_cmp_lt_u16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xa9,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe4,0x7c]
+v_cmp_lt_u16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xa9,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe4,0x7c]
+v_cmp_lt_u16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe4,0x7c]
+v_cmp_lt_u16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xa9,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe4,0x7c]
+v_cmp_lt_u16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe4,0x7c]
+v_cmp_lt_u16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe4,0x7c]
+v_cmp_lt_u16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe4,0x7c]
+v_cmp_lt_u16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe4,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_lt_u16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe4,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_lt_u16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe4,0x7c]
+v_cmp_lt_u16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe4,0x7c]
+v_cmp_lt_u16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_eq_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe5,0x7c]
+v_cmp_lt_u16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_eq_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_eq_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x72,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_eq_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x72,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_eq_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x72,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_eq_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x72,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_eq_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x72,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_eq_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x72,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_eq_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x72,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_eq_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_lt_u16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_eq_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_lt_u16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xa9,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_eq_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_eq_u16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x54,0x7d]
 
-v_cmpx_eq_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_eq_u16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x54,0x7d]
 
-v_cmpx_eq_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_eq_u16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x54,0x7d]
 
-v_cmpx_eq_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_eq_u16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x54,0x7d]
 
-v_cmpx_eq_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_eq_u16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x54,0x7d]
 
-v_cmpx_eq_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_eq_u16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x54,0x7d]
 
-v_cmpx_eq_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_eq_u16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x54,0x7d]
 
-v_cmpx_eq_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_eq_u16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x54,0x7d]
 
-v_cmpx_eq_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_eq_u16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x54,0x7d]
 
-v_cmpx_eq_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_eq_u16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x54,0x7d]
 
-v_cmpx_eq_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x72,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_eq_u16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x54,0x7d]
 
-v_cmpx_eq_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x72,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x54,0x7d]
 
-v_cmpx_le_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe6,0x7c]
+v_cmp_eq_u16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x54,0x7d]
 
-v_cmpx_le_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe6,0x7c]
+v_cmp_eq_u16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x54,0x7d]
 
-v_cmpx_le_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xe6,0x7c]
+v_cmp_eq_u16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x54,0x7d]
 
-v_cmpx_le_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xe6,0x7c]
+v_cmp_eq_u16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x54,0x7d]
 
-v_cmpx_le_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe6,0x7c]
+v_cmp_eq_u16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x54,0x7d]
 
-v_cmpx_le_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe6,0x7c]
+v_cmp_eq_u16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x54,0x7d]
 
-v_cmpx_le_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe6,0x7c]
+v_cmp_eq_u16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x54,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmpx_le_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe6,0x7c]
+v_cmp_eq_u16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x54,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmpx_le_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe6,0x7c]
+v_cmp_eq_u16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x54,0x7d]
 
-v_cmpx_le_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe6,0x7c]
+v_cmp_eq_u16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x54,0x7d]
 
-v_cmpx_le_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe6,0x7c]
+v_cmp_eq_u16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x55,0x7d]
 
-v_cmpx_le_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe6,0x7c]
+v_cmp_eq_u16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe6,0x7c]
+v_cmp_eq_u16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xaa,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe6,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_eq_u16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xaa,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe6,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_eq_u16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xaa,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe6,0x7c]
+v_cmp_eq_u16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xaa,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe6,0x7c]
+v_cmp_eq_u16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xaa,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe7,0x7c]
+v_cmp_eq_u16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xaa,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x73,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xaa,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x73,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xaa,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x73,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xaa,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_le_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x73,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xaa,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_le_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x73,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_le_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x73,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xaa,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_le_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x73,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_le_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x73,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x73,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_eq_u16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x73,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_eq_u16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x73,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_eq_u16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x73,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_eq_u16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x73,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_eq_u16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x73,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_eq_u16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x73,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_eq_u16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x73,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_eq_u16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x73,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_eq_u16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x73,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_eq_u16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x73,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_eq_u16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x73,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_eq_u16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x73,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_eq_u16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_le_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x73,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_gt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe8,0x7c]
+v_cmp_eq_u16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_gt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe8,0x7c]
+v_cmp_eq_u16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_gt_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xe8,0x7c]
+v_cmp_eq_u16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xaa,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_gt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xe8,0x7c]
+v_cmp_le_u16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x56,0x7d]
 
-v_cmpx_gt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe8,0x7c]
+v_cmp_le_u16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x56,0x7d]
 
-v_cmpx_gt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe8,0x7c]
+v_cmp_le_u16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x56,0x7d]
 
-v_cmpx_gt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe8,0x7c]
+v_cmp_le_u16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x56,0x7d]
 
-v_cmpx_gt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe8,0x7c]
+v_cmp_le_u16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x56,0x7d]
 
-v_cmpx_gt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe8,0x7c]
+v_cmp_le_u16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x56,0x7d]
 
-v_cmpx_gt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe8,0x7c]
+v_cmp_le_u16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x56,0x7d]
 
-v_cmpx_gt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe8,0x7c]
+v_cmp_le_u16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x56,0x7d]
 
-v_cmpx_gt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe8,0x7c]
+v_cmp_le_u16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x56,0x7d]
 
-v_cmpx_gt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe8,0x7c]
+v_cmp_le_u16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x56,0x7d]
 
-v_cmpx_gt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe8,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_le_u16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x56,0x7d]
 
-v_cmpx_gt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe8,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_le_u16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x56,0x7d]
 
-v_cmpx_gt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe8,0x7c]
+v_cmp_le_u16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x56,0x7d]
 
-v_cmpx_gt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe8,0x7c]
+v_cmp_le_u16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x56,0x7d]
 
-v_cmpx_gt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe9,0x7c]
+v_cmp_le_u16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x56,0x7d]
 
-v_cmpx_gt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x56,0x7d]
 
-v_cmpx_gt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x74,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x56,0x7d]
 
-v_cmpx_gt_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x74,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x56,0x7d]
 
-v_cmpx_gt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x74,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x56,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmpx_gt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x74,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x56,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmpx_gt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x74,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x56,0x7d]
 
-v_cmpx_gt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x74,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x56,0x7d]
 
-v_cmpx_gt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x74,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x57,0x7d]
 
-v_cmpx_gt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_le_u16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_le_u16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xab,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_le_u16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xab,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_le_u16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xab,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_le_u16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xab,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_le_u16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xab,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_le_u16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xab,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_le_u16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xab,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_le_u16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xab,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_gt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_le_u16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xab,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_gt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_le_u16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xab,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_gt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_le_u16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xab,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_gt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x74,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_le_u16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xab,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_gt_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x74,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xea,0x7c]
+v_cmp_le_u16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xea,0x7c]
+v_cmp_le_u16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xea,0x7c]
+v_cmp_le_u16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xea,0x7c]
+v_cmp_le_u16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xea,0x7c]
+v_cmp_le_u16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xea,0x7c]
+v_cmp_le_u16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xea,0x7c]
+v_cmp_le_u16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xea,0x7c]
+v_cmp_le_u16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xea,0x7c]
+v_cmp_le_u16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xea,0x7c]
+v_cmp_le_u16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xea,0x7c]
+v_cmp_le_u16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xea,0x7c]
+v_cmp_le_u16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_lg_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xea,0x7c]
+v_cmp_le_u16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_lg_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xea,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_le_u16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_lg_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xea,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_le_u16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_lg_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xea,0x7c]
+v_cmp_le_u16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_lg_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xea,0x7c]
+v_cmp_le_u16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_lg_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xeb,0x7c]
+v_cmp_le_u16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xab,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_lg_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x75,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x58,0x7d]
 
-v_cmpx_lg_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x75,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x58,0x7d]
 
-v_cmpx_lg_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x75,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x58,0x7d]
 
-v_cmpx_lg_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x75,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x58,0x7d]
 
-v_cmpx_lg_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x75,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x58,0x7d]
 
-v_cmpx_lg_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x75,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x58,0x7d]
 
-v_cmpx_lg_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x75,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x58,0x7d]
 
-v_cmpx_lg_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x75,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x58,0x7d]
 
-v_cmpx_lg_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x75,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_gt_u16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x58,0x7d]
 
-v_cmpx_lg_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x75,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_gt_u16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x58,0x7d]
 
-v_cmpx_lg_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x75,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_gt_u16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x58,0x7d]
 
-v_cmpx_lg_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x75,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_gt_u16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x58,0x7d]
 
-v_cmpx_lg_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x75,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_gt_u16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x58,0x7d]
 
-v_cmpx_lg_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x75,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_gt_u16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x58,0x7d]
 
-v_cmpx_lg_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x75,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_gt_u16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x58,0x7d]
 
-v_cmpx_lg_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x75,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_gt_u16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x58,0x7d]
 
-v_cmpx_lg_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x75,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_gt_u16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x58,0x7d]
 
-v_cmpx_lg_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x75,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_gt_u16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x58,0x7d]
 
-v_cmpx_lg_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x75,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_gt_u16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x58,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmpx_lg_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x75,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_gt_u16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x58,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmpx_lg_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x75,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_gt_u16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x58,0x7d]
 
-v_cmpx_lg_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x75,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x58,0x7d]
 
-v_cmpx_ge_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xec,0x7c]
+v_cmp_gt_u16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x59,0x7d]
 
-v_cmpx_ge_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xec,0x7c]
+v_cmp_gt_u16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xec,0x7c]
+v_cmp_gt_u16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xac,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xec,0x7c]
+v_cmp_gt_u16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xac,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xec,0x7c]
+v_cmp_gt_u16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xac,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xec,0x7c]
+v_cmp_gt_u16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xac,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xec,0x7c]
+v_cmp_gt_u16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xac,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xec,0x7c]
+v_cmp_gt_u16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xac,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xec,0x7c]
+v_cmp_gt_u16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xac,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xec,0x7c]
+v_cmp_gt_u16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xac,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_ge_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xec,0x7c]
+v_cmp_gt_u16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xac,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_ge_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xec,0x7c]
+v_cmp_gt_u16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xac,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_ge_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xec,0x7c]
+v_cmp_gt_u16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xac,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_ge_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xec,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_gt_u16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xac,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_ge_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xec,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_gt_u16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_ge_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xec,0x7c]
+v_cmp_gt_u16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_ge_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xec,0x7c]
+v_cmp_gt_u16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_ge_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xed,0x7c]
+v_cmp_gt_u16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x76,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x76,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_ge_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x76,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_ge_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x76,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_ge_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x76,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_ge_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x76,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_ge_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x76,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_gt_u16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_gt_u16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_gt_u16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_gt_u16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_gt_u16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_gt_u16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_gt_u16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xac,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_ge_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_ne_u16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x5a,0x7d]
 
-v_cmpx_ge_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ne_u16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x5a,0x7d]
 
-v_cmpx_ge_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_ne_u16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x5a,0x7d]
 
-v_cmpx_ge_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_ne_u16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x5a,0x7d]
 
-v_cmpx_ge_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_ne_u16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x5a,0x7d]
 
-v_cmpx_ge_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x76,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_ne_u16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x5a,0x7d]
 
-v_cmpx_ge_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x76,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x5a,0x7d]
 
-v_cmpx_o_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xee,0x7c]
+v_cmp_ne_u16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x5a,0x7d]
 
-v_cmpx_o_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xee,0x7c]
+v_cmp_ne_u16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x5a,0x7d]
 
-v_cmpx_o_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xee,0x7c]
+v_cmp_ne_u16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x5a,0x7d]
 
-v_cmpx_o_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xee,0x7c]
+v_cmp_ne_u16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x5a,0x7d]
 
-v_cmpx_o_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xee,0x7c]
+v_cmp_ne_u16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x5a,0x7d]
 
-v_cmpx_o_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xee,0x7c]
+v_cmp_ne_u16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x5a,0x7d]
 
-v_cmpx_o_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xee,0x7c]
+v_cmp_ne_u16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x5a,0x7d]
 
-v_cmpx_o_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xee,0x7c]
+v_cmp_ne_u16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x5a,0x7d]
 
-v_cmpx_o_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xee,0x7c]
+v_cmp_ne_u16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x5a,0x7d]
 
-v_cmpx_o_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xee,0x7c]
+v_cmp_ne_u16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x5a,0x7d]
 
-v_cmpx_o_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xee,0x7c]
+v_cmp_ne_u16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x5a,0x7d]
 
-v_cmpx_o_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xee,0x7c]
+v_cmp_ne_u16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x5a,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmpx_o_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xee,0x7c]
+v_cmp_ne_u16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x5a,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmpx_o_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xee,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_ne_u16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x5a,0x7d]
 
-v_cmpx_o_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xee,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_ne_u16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x5a,0x7d]
 
-v_cmpx_o_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xee,0x7c]
+v_cmp_ne_u16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x5b,0x7d]
 
-v_cmpx_o_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xee,0x7c]
+v_cmp_ne_u16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xef,0x7c]
+v_cmp_ne_u16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xad,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x77,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xad,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x77,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xad,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x77,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xad,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x77,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xad,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x77,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xad,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x77,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xad,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_o_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x77,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xad,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_o_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x77,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xad,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x77,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ne_u16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xad,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x77,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_ne_u16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xad,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x77,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_ne_u16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xad,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x77,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ne_u16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x77,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_ne_u16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x77,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ne_u16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x77,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_ne_u16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x77,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_ne_u16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x77,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ne_u16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x77,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_ne_u16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x77,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_ne_u16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x77,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_ne_u16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x77,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_ne_u16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_o_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x77,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_u_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf0,0x7c]
+v_cmp_ne_u16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_u_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xf0,0x7c]
+v_cmp_ne_u16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_u_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xf0,0x7c]
+v_cmp_ne_u16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_u_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xf0,0x7c]
+v_cmp_ne_u16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_u_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xf0,0x7c]
+v_cmp_ne_u16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_u_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xf0,0x7c]
+v_cmp_ne_u16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_u_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xf0,0x7c]
+v_cmp_ne_u16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_u_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xf0,0x7c]
+v_cmp_ne_u16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xad,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_u_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xf0,0x7c]
+v_cmp_ge_u16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x5c,0x7d]
 
-v_cmpx_u_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xf0,0x7c]
+v_cmp_ge_u16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x5c,0x7d]
 
-v_cmpx_u_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xf0,0x7c]
+v_cmp_ge_u16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x5c,0x7d]
 
-v_cmpx_u_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xf0,0x7c]
+v_cmp_ge_u16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x5c,0x7d]
 
-v_cmpx_u_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xf0,0x7c]
+v_cmp_ge_u16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x5c,0x7d]
 
-v_cmpx_u_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xf0,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_ge_u16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x5c,0x7d]
 
-v_cmpx_u_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xf0,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_ge_u16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x5c,0x7d]
 
-v_cmpx_u_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xf0,0x7c]
+v_cmp_ge_u16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x5c,0x7d]
 
-v_cmpx_u_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xf0,0x7c]
+v_cmp_ge_u16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x5c,0x7d]
 
-v_cmpx_u_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xf1,0x7c]
+v_cmp_ge_u16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x5c,0x7d]
 
-v_cmpx_u_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x5c,0x7d]
 
-v_cmpx_u_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x78,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x5c,0x7d]
 
-v_cmpx_u_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x78,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x5c,0x7d]
 
-v_cmpx_u_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x78,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x5c,0x7d]
 
-v_cmpx_u_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x78,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x5c,0x7d]
 
-v_cmpx_u_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x78,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x5c,0x7d]
 
-v_cmpx_u_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x78,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x5c,0x7d]
 
-v_cmpx_u_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x78,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x5c,0x7d]
 
-v_cmpx_u_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ge_u16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x5c,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmpx_u_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_ge_u16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x5c,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmpx_u_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_ge_u16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x5c,0x7d]
 
-v_cmpx_u_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ge_u16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x5c,0x7d]
 
-v_cmpx_u_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_ge_u16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x5d,0x7d]
 
-v_cmpx_u_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ge_u16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_u_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_ge_u16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xae,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_u_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_ge_u16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xae,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_u_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ge_u16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xae,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_u_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_ge_u16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xae,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_u_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_ge_u16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xae,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_u_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_ge_u16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xae,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_u_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x78,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_ge_u16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xae,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_u_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x78,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xae,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf2,0x7c]
+v_cmp_ge_u16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xae,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xf2,0x7c]
+v_cmp_ge_u16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xae,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xf2,0x7c]
+v_cmp_ge_u16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xae,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xf2,0x7c]
+v_cmp_ge_u16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xae,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xf2,0x7c]
+v_cmp_ge_u16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xf2,0x7c]
+v_cmp_ge_u16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xf2,0x7c]
+v_cmp_ge_u16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xf2,0x7c]
+v_cmp_ge_u16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xf2,0x7c]
+v_cmp_ge_u16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xf2,0x7c]
+v_cmp_ge_u16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xf2,0x7c]
+v_cmp_ge_u16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xf2,0x7c]
+v_cmp_ge_u16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xf2,0x7c]
+v_cmp_ge_u16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xf2,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_ge_u16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xf2,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_ge_u16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xf2,0x7c]
+v_cmp_ge_u16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xf2,0x7c]
+v_cmp_ge_u16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_nge_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xf3,0x7c]
+v_cmp_ge_u16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_nge_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x79,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_nge_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x79,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_nge_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x79,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_nge_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x79,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_nge_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x79,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xae,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_nge_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x79,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x5e,0x7d]
 
-v_cmpx_nge_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x79,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x5e,0x7d]
 
-v_cmpx_nge_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x79,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x5e,0x7d]
 
-v_cmpx_nge_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x79,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_t_u16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x5e,0x7d]
 
-v_cmpx_nge_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x79,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_t_u16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x5e,0x7d]
 
-v_cmpx_nge_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x79,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_t_u16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x5e,0x7d]
 
-v_cmpx_nge_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x79,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_t_u16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x5e,0x7d]
 
-v_cmpx_nge_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x79,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_t_u16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x5e,0x7d]
 
-v_cmpx_nge_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x79,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_t_u16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x5e,0x7d]
 
-v_cmpx_nge_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x79,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_t_u16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x5e,0x7d]
 
-v_cmpx_nge_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x79,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_t_u16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x5e,0x7d]
 
-v_cmpx_nge_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x79,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_t_u16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x5e,0x7d]
 
-v_cmpx_nge_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x79,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_t_u16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x5e,0x7d]
 
-v_cmpx_nge_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x79,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_t_u16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x5e,0x7d]
 
-v_cmpx_nge_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x79,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_t_u16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x5e,0x7d]
 
-v_cmpx_nge_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x79,0xd0,0x00,0x00,0x00,0x60]
+v_cmp_t_u16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x5e,0x7d]
 
-v_cmpx_nge_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x79,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x5e,0x7d]
 
-v_cmpx_nlg_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf4,0x7c]
+v_cmp_t_u16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x5e,0x7d]
 
-v_cmpx_nlg_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xf4,0x7c]
+v_cmp_t_u16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x5e,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmpx_nlg_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xf4,0x7c]
+v_cmp_t_u16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x5e,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmpx_nlg_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xf4,0x7c]
+v_cmp_t_u16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x5e,0x7d]
 
-v_cmpx_nlg_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xf4,0x7c]
+v_cmp_t_u16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x5e,0x7d]
 
-v_cmpx_nlg_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xf4,0x7c]
+v_cmp_t_u16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x5f,0x7d]
 
-v_cmpx_nlg_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xf4,0x7c]
+v_cmp_t_u16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xf4,0x7c]
+v_cmp_t_u16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xaf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xf4,0x7c]
+v_cmp_t_u16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xaf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xf4,0x7c]
+v_cmp_t_u16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xaf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xf4,0x7c]
+v_cmp_t_u16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xaf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xf4,0x7c]
+v_cmp_t_u16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xaf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xf4,0x7c]
+v_cmp_t_u16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xaf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xf4,0x7c,0x56,0x34,0x12,0xaf]
+v_cmp_t_u16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xaf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xf4,0x7c,0x73,0x72,0x71,0x3f]
+v_cmp_t_u16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xaf,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xf4,0x7c]
+v_cmp_t_u16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xaf,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xf4,0x7c]
+v_cmp_t_u16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xaf,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_nlg_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xf5,0x7c]
+v_cmp_t_u16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xaf,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x7a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x7a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x7a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x7a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x7a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x7a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x7a,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_t_u16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_t_u16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0xfd,0x00,0x00,0x00]
+v_cmp_t_u16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_t_u16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_t_u16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_t_u16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_t_u16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0xfa,0x01,0x00]
+v_cmp_t_u16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_t_u16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_t_u16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0x00,0x00,0x20]
+v_cmp_t_u16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0x00,0x00,0x40]
+v_cmp_t_u16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xaf,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_nlg_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7a,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_f_i16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x60,0x7d]
 
-v_cmpx_nlg_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x7a,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x60,0x7d]
 
-v_cmpx_ngt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf6,0x7c]
+v_cmpx_f_i16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x60,0x7d]
 
-v_cmpx_ngt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xf6,0x7c]
+v_cmpx_f_i16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x60,0x7d]
 
-v_cmpx_ngt_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xf6,0x7c]
+v_cmpx_f_i16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x60,0x7d]
 
-v_cmpx_ngt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xf6,0x7c]
+v_cmpx_f_i16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x60,0x7d]
 
-v_cmpx_ngt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xf6,0x7c]
+v_cmpx_f_i16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x60,0x7d]
 
-v_cmpx_ngt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xf6,0x7c]
+v_cmpx_f_i16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x60,0x7d]
 
-v_cmpx_ngt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xf6,0x7c]
+v_cmpx_f_i16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x60,0x7d]
 
-v_cmpx_ngt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xf6,0x7c]
+v_cmpx_f_i16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x60,0x7d]
 
-v_cmpx_ngt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xf6,0x7c]
+v_cmpx_f_i16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x60,0x7d]
 
-v_cmpx_ngt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xf6,0x7c]
+v_cmpx_f_i16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x60,0x7d]
 
-v_cmpx_ngt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xf6,0x7c]
+v_cmpx_f_i16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x60,0x7d]
 
-v_cmpx_ngt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xf6,0x7c]
+v_cmpx_f_i16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x60,0x7d]
 
-v_cmpx_ngt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xf6,0x7c]
+v_cmpx_f_i16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x60,0x7d]
 
-v_cmpx_ngt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xf6,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_f_i16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x60,0x7d]
 
-v_cmpx_ngt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xf6,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_f_i16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x60,0x7d]
 
-v_cmpx_ngt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xf6,0x7c]
+v_cmpx_f_i16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x60,0x7d]
 
-v_cmpx_ngt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xf6,0x7c]
+v_cmpx_f_i16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x60,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmpx_ngt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xf7,0x7c]
+v_cmpx_f_i16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x60,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x60,0x7d]
 
-v_cmpx_ngt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x7b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x60,0x7d]
 
-v_cmpx_ngt_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x7b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x61,0x7d]
 
-v_cmpx_ngt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x7b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x7b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xb0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x7b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xb0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x7b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xb0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x7b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xb0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x7b,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_f_i16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xb0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x7b,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_f_i16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xb0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x7b,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_f_i16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xb0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7b,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_f_i16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xb0,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x7b,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_f_i16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xb0,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x7b,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_f_i16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xb0,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x7b,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_f_i16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x7b,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_f_i16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xb0,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x7b,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_f_i16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x7b,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_f_i16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7b,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_f_i16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7b,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_f_i16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7b,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_f_i16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_ngt_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x7b,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf8,0x7c]
+v_cmpx_f_i16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xf8,0x7c]
+v_cmpx_f_i16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xf8,0x7c]
+v_cmpx_f_i16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xf8,0x7c]
+v_cmpx_f_i16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xf8,0x7c]
+v_cmpx_f_i16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xf8,0x7c]
+v_cmpx_f_i16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xf8,0x7c]
+v_cmpx_f_i16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_nle_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xf8,0x7c]
+v_cmpx_f_i16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_nle_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xf8,0x7c]
+v_cmpx_f_i16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_nle_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xf8,0x7c]
+v_cmpx_f_i16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_nle_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xf8,0x7c]
+v_cmpx_f_i16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_nle_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xf8,0x7c]
+v_cmpx_f_i16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_nle_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xf8,0x7c]
+v_cmpx_f_i16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xb0,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_nle_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xf8,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_lt_i16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x62,0x7d]
 
-v_cmpx_nle_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xf8,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_lt_i16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x62,0x7d]
 
-v_cmpx_nle_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xf8,0x7c]
+v_cmpx_lt_i16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x62,0x7d]
 
-v_cmpx_nle_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xf8,0x7c]
+v_cmpx_lt_i16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x62,0x7d]
 
-v_cmpx_nle_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xf9,0x7c]
+v_cmpx_lt_i16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x62,0x7d]
 
-v_cmpx_nle_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x62,0x7d]
 
-v_cmpx_nle_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x7c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x62,0x7d]
 
-v_cmpx_nle_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x7c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x62,0x7d]
 
-v_cmpx_nle_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x7c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x62,0x7d]
 
-v_cmpx_nle_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x7c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x62,0x7d]
 
-v_cmpx_nle_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x7c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x62,0x7d]
 
-v_cmpx_nle_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x7c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x62,0x7d]
 
-v_cmpx_nle_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x7c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x62,0x7d]
 
-v_cmpx_nle_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_lt_i16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x62,0x7d]
 
-v_cmpx_nle_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_lt_i16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x62,0x7d]
 
-v_cmpx_nle_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_lt_i16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x62,0x7d]
 
-v_cmpx_nle_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_lt_i16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x62,0x7d]
 
-v_cmpx_nle_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_lt_i16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x62,0x7d]
 
-v_cmpx_nle_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_lt_i16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x62,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmpx_nle_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_lt_i16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x62,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmpx_nle_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_lt_i16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x62,0x7d]
 
-v_cmpx_nle_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_lt_i16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x62,0x7d]
 
-v_cmpx_nle_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_lt_i16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x63,0x7d]
 
-v_cmpx_nle_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_lt_i16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nle_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_lt_i16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xb1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nle_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7c,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_lt_i16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xb1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nle_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x7c,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xb1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xfa,0x7c]
+v_cmpx_lt_i16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xb1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xfa,0x7c]
+v_cmpx_lt_i16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xb1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xfa,0x7c]
+v_cmpx_lt_i16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xb1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xfa,0x7c]
+v_cmpx_lt_i16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xb1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xfa,0x7c]
+v_cmpx_lt_i16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xb1,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xfa,0x7c]
+v_cmpx_lt_i16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xb1,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xfa,0x7c]
+v_cmpx_lt_i16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xb1,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xfa,0x7c]
+v_cmpx_lt_i16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xfa,0x7c]
+v_cmpx_lt_i16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xb1,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xfa,0x7c]
+v_cmpx_lt_i16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xfa,0x7c]
+v_cmpx_lt_i16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xfa,0x7c]
+v_cmpx_lt_i16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xfa,0x7c]
+v_cmpx_lt_i16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xfa,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_lt_i16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xfa,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_lt_i16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xfa,0x7c]
+v_cmpx_lt_i16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xfa,0x7c]
+v_cmpx_lt_i16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_neq_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xfb,0x7c]
+v_cmpx_lt_i16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_neq_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_neq_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x7d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_neq_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x7d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_neq_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x7d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_neq_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x7d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_neq_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x7d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_neq_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x7d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_neq_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x7d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_neq_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x7d,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_lt_i16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_neq_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x7d,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_lt_i16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xb1,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_neq_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x7d,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_eq_i16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x64,0x7d]
 
-v_cmpx_neq_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7d,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_eq_i16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x64,0x7d]
 
-v_cmpx_neq_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x7d,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_eq_i16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x64,0x7d]
 
-v_cmpx_neq_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x7d,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_eq_i16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x64,0x7d]
 
-v_cmpx_neq_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x7d,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_eq_i16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x64,0x7d]
 
-v_cmpx_neq_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x7d,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_eq_i16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x64,0x7d]
 
-v_cmpx_neq_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x7d,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_eq_i16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x64,0x7d]
 
-v_cmpx_neq_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x7d,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_eq_i16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x64,0x7d]
 
-v_cmpx_neq_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7d,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_eq_i16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x64,0x7d]
 
-v_cmpx_neq_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7d,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_eq_i16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x64,0x7d]
 
-v_cmpx_neq_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7d,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_eq_i16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x64,0x7d]
 
-v_cmpx_neq_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x7d,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x64,0x7d]
 
-v_cmpx_nlt_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xfc,0x7c]
+v_cmpx_eq_i16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x64,0x7d]
 
-v_cmpx_nlt_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xfc,0x7c]
+v_cmpx_eq_i16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x64,0x7d]
 
-v_cmpx_nlt_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xfc,0x7c]
+v_cmpx_eq_i16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x64,0x7d]
 
-v_cmpx_nlt_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xfc,0x7c]
+v_cmpx_eq_i16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x64,0x7d]
 
-v_cmpx_nlt_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xfc,0x7c]
+v_cmpx_eq_i16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x64,0x7d]
 
-v_cmpx_nlt_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xfc,0x7c]
+v_cmpx_eq_i16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x64,0x7d]
 
-v_cmpx_nlt_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xfc,0x7c]
+v_cmpx_eq_i16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x64,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmpx_nlt_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xfc,0x7c]
+v_cmpx_eq_i16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x64,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmpx_nlt_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xfc,0x7c]
+v_cmpx_eq_i16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x64,0x7d]
 
-v_cmpx_nlt_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xfc,0x7c]
+v_cmpx_eq_i16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x64,0x7d]
 
-v_cmpx_nlt_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xfc,0x7c]
+v_cmpx_eq_i16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x65,0x7d]
 
-v_cmpx_nlt_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xfc,0x7c]
+v_cmpx_eq_i16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xfc,0x7c]
+v_cmpx_eq_i16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xb2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xfc,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_eq_i16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xb2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xfc,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_eq_i16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xb2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xfc,0x7c]
+v_cmpx_eq_i16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xb2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xfc,0x7c]
+v_cmpx_eq_i16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xb2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xfd,0x7c]
+v_cmpx_eq_i16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xb2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xb2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x7e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xb2,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x7e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xb2,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x7e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xb2,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x7e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x7e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xb2,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x7e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x7e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_eq_i16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_eq_i16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_eq_i16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_eq_i16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_eq_i16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_eq_i16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_eq_i16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_eq_i16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_eq_i16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_eq_i16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_eq_i16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_eq_i16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7e,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_eq_i16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_nlt_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x7e,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_tru_f64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xfe,0x7c]
+v_cmpx_eq_i16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_tru_f64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xfe,0x7c]
+v_cmpx_eq_i16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_tru_f64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xfe,0x7c]
+v_cmpx_eq_i16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xb2,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_tru_f64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xfe,0x7c]
+v_cmpx_le_i16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x66,0x7d]
 
-v_cmpx_tru_f64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xfe,0x7c]
+v_cmpx_le_i16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x66,0x7d]
 
-v_cmpx_tru_f64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xfe,0x7c]
+v_cmpx_le_i16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x66,0x7d]
 
-v_cmpx_tru_f64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xfe,0x7c]
+v_cmpx_le_i16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x66,0x7d]
 
-v_cmpx_tru_f64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xfe,0x7c]
+v_cmpx_le_i16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x66,0x7d]
 
-v_cmpx_tru_f64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xfe,0x7c]
+v_cmpx_le_i16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x66,0x7d]
 
-v_cmpx_tru_f64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xfe,0x7c]
+v_cmpx_le_i16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x66,0x7d]
 
-v_cmpx_tru_f64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xfe,0x7c]
+v_cmpx_le_i16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x66,0x7d]
 
-v_cmpx_tru_f64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xfe,0x7c]
+v_cmpx_le_i16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x66,0x7d]
 
-v_cmpx_tru_f64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xfe,0x7c]
+v_cmpx_le_i16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x66,0x7d]
 
-v_cmpx_tru_f64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xfe,0x7c,0x56,0x34,0x12,0xaf]
+v_cmpx_le_i16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x66,0x7d]
 
-v_cmpx_tru_f64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xfe,0x7c,0x73,0x72,0x71,0x3f]
+v_cmpx_le_i16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x66,0x7d]
 
-v_cmpx_tru_f64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xfe,0x7c]
+v_cmpx_le_i16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x66,0x7d]
 
-v_cmpx_tru_f64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xfe,0x7c]
+v_cmpx_le_i16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x66,0x7d]
 
-v_cmpx_tru_f64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xff,0x7c]
+v_cmpx_le_i16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x66,0x7d]
 
-v_cmpx_tru_f64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x66,0x7d]
 
-v_cmpx_tru_f64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0x7f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x66,0x7d]
 
-v_cmpx_tru_f64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0x7f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x66,0x7d]
 
-v_cmpx_tru_f64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0x7f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x66,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmpx_tru_f64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0x7f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x66,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmpx_tru_f64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0x7f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x66,0x7d]
 
-v_cmpx_tru_f64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0x7f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x66,0x7d]
 
-v_cmpx_tru_f64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0x7f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x67,0x7d]
 
-v_cmpx_tru_f64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0x7f,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_le_i16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_tru_f64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0x7f,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_le_i16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xb3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_tru_f64_e64 s[0:1], scc, s[0:1]
-// CHECK: [0x00,0x00,0x7f,0xd0,0xfd,0x00,0x00,0x00]
+v_cmpx_le_i16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xb3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_tru_f64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7f,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_le_i16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xb3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_tru_f64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0x7f,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_le_i16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xb3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_tru_f64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0x7f,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_le_i16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xb3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_tru_f64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0x7f,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_le_i16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xb3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_tru_f64_e64 s[0:1], s[0:1], scc
-// CHECK: [0x00,0x00,0x7f,0xd0,0x00,0xfa,0x01,0x00]
+v_cmpx_le_i16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xb3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_tru_f64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0x7f,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_le_i16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xb3,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_tru_f64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0x7f,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_le_i16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xb3,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_tru_f64_e64 s[0:1], -s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0x7f,0xd0,0x00,0x00,0x00,0x20]
+v_cmpx_le_i16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xb3,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_tru_f64_e64 s[0:1], s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7f,0xd0,0x00,0x00,0x00,0x40]
+v_cmpx_le_i16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_tru_f64_e64 s[0:1], -s[0:1], -s[0:1]
-// CHECK: [0x00,0x00,0x7f,0xd0,0x00,0x00,0x00,0x60]
+v_cmpx_le_i16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xb3,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_tru_f64_e64 s[0:1], s[0:1], s[0:1] clamp
-// CHECK: [0x00,0x80,0x7f,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_f_i16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x40,0x7d]
+v_cmpx_le_i16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_f_i16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x40,0x7d]
+v_cmpx_le_i16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_f_i16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x40,0x7d]
+v_cmpx_le_i16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_f_i16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x40,0x7d]
+v_cmpx_le_i16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_f_i16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x40,0x7d]
+v_cmpx_le_i16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_f_i16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x40,0x7d]
+v_cmpx_le_i16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_f_i16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x40,0x7d]
+v_cmpx_le_i16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_f_i16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x40,0x7d]
+v_cmpx_le_i16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_f_i16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x40,0x7d]
+v_cmpx_le_i16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_f_i16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x40,0x7d]
+v_cmpx_le_i16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_f_i16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x40,0x7d]
+v_cmpx_le_i16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_f_i16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x40,0x7d]
+v_cmpx_le_i16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_f_i16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x40,0x7d]
+v_cmpx_le_i16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_f_i16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x40,0x7d]
+v_cmpx_le_i16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_f_i16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x40,0x7d]
+v_cmpx_le_i16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_f_i16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x40,0x7d]
+v_cmpx_le_i16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_f_i16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x40,0x7d]
+v_cmpx_le_i16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_f_i16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x40,0x7d]
+v_cmpx_le_i16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xb3,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_f_i16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x40,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_gt_i16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x68,0x7d]
 
-v_cmp_f_i16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x40,0x7d,0x56,0x34,0x00,0x00]
+v_cmpx_gt_i16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x68,0x7d]
 
-v_cmp_f_i16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x40,0x7d]
+v_cmpx_gt_i16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x68,0x7d]
 
-v_cmp_f_i16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x40,0x7d]
+v_cmpx_gt_i16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x68,0x7d]
 
-v_cmp_f_i16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x41,0x7d]
+v_cmpx_gt_i16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x68,0x7d]
 
-v_cmp_f_i16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x68,0x7d]
 
-v_cmp_f_i16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x68,0x7d]
 
-v_cmp_f_i16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xa0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x68,0x7d]
 
-v_cmp_f_i16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xa0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x68,0x7d]
 
-v_cmp_f_i16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x68,0x7d]
 
-v_cmp_f_i16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x68,0x7d]
 
-v_cmp_f_i16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x68,0x7d]
 
-v_cmp_f_i16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x68,0x7d]
 
-v_cmp_f_i16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa0,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_gt_i16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x68,0x7d]
 
-v_cmp_f_i16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xa0,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_gt_i16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x68,0x7d]
 
-v_cmp_f_i16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa0,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_gt_i16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x68,0x7d]
 
-v_cmp_f_i16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa0,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_gt_i16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x68,0x7d]
 
-v_cmp_f_i16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa0,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_gt_i16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x68,0x7d]
 
-v_cmp_f_i16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xa0,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_gt_i16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x68,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmp_f_i16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa0,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_gt_i16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x68,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmp_f_i16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa0,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_gt_i16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x68,0x7d]
 
-v_cmp_lt_i16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x42,0x7d]
+v_cmpx_gt_i16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x68,0x7d]
 
-v_cmp_lt_i16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x42,0x7d]
+v_cmpx_gt_i16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x69,0x7d]
 
-v_cmp_lt_i16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x42,0x7d]
+v_cmpx_gt_i16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x42,0x7d]
+v_cmpx_gt_i16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xb4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x42,0x7d]
+v_cmpx_gt_i16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xb4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x42,0x7d]
+v_cmpx_gt_i16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xb4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x42,0x7d]
+v_cmpx_gt_i16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xb4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x42,0x7d]
+v_cmpx_gt_i16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xb4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x42,0x7d]
+v_cmpx_gt_i16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xb4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x42,0x7d]
+v_cmpx_gt_i16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xb4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x42,0x7d]
+v_cmpx_gt_i16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xb4,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_lt_i16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x42,0x7d]
+v_cmpx_gt_i16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xb4,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_lt_i16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x42,0x7d]
+v_cmpx_gt_i16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xb4,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_lt_i16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x42,0x7d]
+v_cmpx_gt_i16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_lt_i16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x42,0x7d]
+v_cmpx_gt_i16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xb4,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_lt_i16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x42,0x7d]
+v_cmpx_gt_i16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_lt_i16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x42,0x7d]
+v_cmpx_gt_i16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_lt_i16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x42,0x7d]
+v_cmpx_gt_i16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_lt_i16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x42,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_gt_i16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_lt_i16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x42,0x7d,0x56,0x34,0x00,0x00]
+v_cmpx_gt_i16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_lt_i16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x42,0x7d]
+v_cmpx_gt_i16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_lt_i16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x42,0x7d]
+v_cmpx_gt_i16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_lt_i16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x43,0x7d]
+v_cmpx_gt_i16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_lt_i16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_lt_i16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_lt_i16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xa1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_lt_i16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xa1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_lt_i16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_lt_i16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_lt_i16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_lt_i16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_lt_i16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa1,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_gt_i16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_lt_i16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xa1,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_gt_i16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_lt_i16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa1,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_gt_i16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xb4,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_lt_i16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa1,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_ne_i16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x6a,0x7d]
 
-v_cmp_lt_i16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa1,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ne_i16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x6a,0x7d]
 
-v_cmp_lt_i16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xa1,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_ne_i16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x6a,0x7d]
 
-v_cmp_lt_i16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa1,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ne_i16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x6a,0x7d]
 
-v_cmp_lt_i16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa1,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_ne_i16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x6a,0x7d]
 
-v_cmp_eq_i16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x44,0x7d]
+v_cmpx_ne_i16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x6a,0x7d]
 
-v_cmp_eq_i16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x44,0x7d]
+v_cmpx_ne_i16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x6a,0x7d]
 
-v_cmp_eq_i16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x44,0x7d]
+v_cmpx_ne_i16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x6a,0x7d]
 
-v_cmp_eq_i16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x44,0x7d]
+v_cmpx_ne_i16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x6a,0x7d]
 
-v_cmp_eq_i16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x44,0x7d]
+v_cmpx_ne_i16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x6a,0x7d]
 
-v_cmp_eq_i16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x44,0x7d]
+v_cmpx_ne_i16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x6a,0x7d]
 
-v_cmp_eq_i16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x44,0x7d]
+v_cmpx_ne_i16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x6a,0x7d]
 
-v_cmp_eq_i16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x44,0x7d]
+v_cmpx_ne_i16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x6a,0x7d]
 
-v_cmp_eq_i16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x44,0x7d]
+v_cmpx_ne_i16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x6a,0x7d]
 
-v_cmp_eq_i16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x44,0x7d]
+v_cmpx_ne_i16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x6a,0x7d]
 
-v_cmp_eq_i16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x44,0x7d]
+v_cmpx_ne_i16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x6a,0x7d]
 
-v_cmp_eq_i16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x44,0x7d]
+v_cmpx_ne_i16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x6a,0x7d]
 
-v_cmp_eq_i16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x44,0x7d]
+v_cmpx_ne_i16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x6a,0x7d]
 
-v_cmp_eq_i16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x44,0x7d]
+v_cmpx_ne_i16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x6a,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmp_eq_i16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x44,0x7d]
+v_cmpx_ne_i16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x6a,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmp_eq_i16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x44,0x7d]
+v_cmpx_ne_i16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x6a,0x7d]
 
-v_cmp_eq_i16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x44,0x7d]
+v_cmpx_ne_i16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x6a,0x7d]
 
-v_cmp_eq_i16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x44,0x7d]
+v_cmpx_ne_i16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x6b,0x7d]
 
-v_cmp_eq_i16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x44,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_ne_i16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_i16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x44,0x7d,0x56,0x34,0x00,0x00]
+v_cmpx_ne_i16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xb5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_i16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x44,0x7d]
+v_cmpx_ne_i16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xb5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_i16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x44,0x7d]
+v_cmpx_ne_i16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xb5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_i16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x45,0x7d]
+v_cmpx_ne_i16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xb5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_i16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xb5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_i16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xb5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_i16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xa2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xb5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_i16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xa2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xb5,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_eq_i16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xb5,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_eq_i16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xb5,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_eq_i16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_eq_i16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xb5,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_eq_i16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa2,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ne_i16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_eq_i16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xa2,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_ne_i16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_eq_i16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa2,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ne_i16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_eq_i16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa2,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_ne_i16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_eq_i16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa2,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ne_i16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_eq_i16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xa2,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_ne_i16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_eq_i16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa2,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ne_i16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_eq_i16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa2,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_ne_i16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_le_i16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x46,0x7d]
+v_cmpx_ne_i16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_le_i16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x46,0x7d]
+v_cmpx_ne_i16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_le_i16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x46,0x7d]
+v_cmpx_ne_i16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_le_i16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x46,0x7d]
+v_cmpx_ne_i16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_le_i16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x46,0x7d]
+v_cmpx_ne_i16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_le_i16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x46,0x7d]
+v_cmpx_ne_i16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_le_i16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x46,0x7d]
+v_cmpx_ne_i16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_le_i16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x46,0x7d]
+v_cmpx_ne_i16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_le_i16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x46,0x7d]
+v_cmpx_ne_i16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_le_i16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x46,0x7d]
+v_cmpx_ne_i16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_le_i16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x46,0x7d]
+v_cmpx_ne_i16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xb5,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_le_i16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x46,0x7d]
+v_cmpx_ge_i16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x6c,0x7d]
 
-v_cmp_le_i16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x46,0x7d]
+v_cmpx_ge_i16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x6c,0x7d]
 
-v_cmp_le_i16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x46,0x7d]
+v_cmpx_ge_i16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x6c,0x7d]
 
-v_cmp_le_i16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x46,0x7d]
+v_cmpx_ge_i16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x6c,0x7d]
 
-v_cmp_le_i16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x46,0x7d]
+v_cmpx_ge_i16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x6c,0x7d]
 
-v_cmp_le_i16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x46,0x7d]
+v_cmpx_ge_i16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x6c,0x7d]
 
-v_cmp_le_i16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x46,0x7d]
+v_cmpx_ge_i16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x6c,0x7d]
 
-v_cmp_le_i16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x46,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_ge_i16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x6c,0x7d]
 
-v_cmp_le_i16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x46,0x7d,0x56,0x34,0x00,0x00]
+v_cmpx_ge_i16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x6c,0x7d]
 
-v_cmp_le_i16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x46,0x7d]
+v_cmpx_ge_i16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x6c,0x7d]
 
-v_cmp_le_i16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x46,0x7d]
+v_cmpx_ge_i16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x6c,0x7d]
 
-v_cmp_le_i16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x47,0x7d]
+v_cmpx_ge_i16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x6c,0x7d]
 
-v_cmp_le_i16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x6c,0x7d]
 
-v_cmp_le_i16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x6c,0x7d]
 
-v_cmp_le_i16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xa3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x6c,0x7d]
 
-v_cmp_le_i16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xa3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x6c,0x7d]
 
-v_cmp_le_i16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x6c,0x7d]
 
-v_cmp_le_i16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x6c,0x7d]
 
-v_cmp_le_i16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x6c,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmp_le_i16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x6c,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmp_le_i16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa3,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ge_i16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x6c,0x7d]
 
-v_cmp_le_i16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xa3,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_ge_i16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x6c,0x7d]
 
-v_cmp_le_i16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa3,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ge_i16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x6d,0x7d]
 
-v_cmp_le_i16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa3,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_ge_i16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_i16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa3,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ge_i16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xb6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_i16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xa3,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_ge_i16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xb6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_i16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa3,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ge_i16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xb6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_i16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa3,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_ge_i16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xb6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_i16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x48,0x7d]
+v_cmpx_ge_i16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xb6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_i16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x48,0x7d]
+v_cmpx_ge_i16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xb6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_i16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x48,0x7d]
+v_cmpx_ge_i16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xb6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_i16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x48,0x7d]
+v_cmpx_ge_i16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xb6,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_gt_i16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x48,0x7d]
+v_cmpx_ge_i16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xb6,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_gt_i16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x48,0x7d]
+v_cmpx_ge_i16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xb6,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_gt_i16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x48,0x7d]
+v_cmpx_ge_i16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_gt_i16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x48,0x7d]
+v_cmpx_ge_i16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xb6,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_gt_i16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x48,0x7d]
+v_cmpx_ge_i16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_gt_i16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x48,0x7d]
+v_cmpx_ge_i16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_gt_i16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x48,0x7d]
+v_cmpx_ge_i16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_gt_i16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x48,0x7d]
+v_cmpx_ge_i16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_gt_i16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x48,0x7d]
+v_cmpx_ge_i16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_gt_i16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x48,0x7d]
+v_cmpx_ge_i16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_gt_i16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x48,0x7d]
+v_cmpx_ge_i16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_gt_i16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x48,0x7d]
+v_cmpx_ge_i16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_gt_i16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x48,0x7d]
+v_cmpx_ge_i16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_gt_i16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x48,0x7d]
+v_cmpx_ge_i16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_gt_i16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x48,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_ge_i16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_gt_i16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x48,0x7d,0x56,0x34,0x00,0x00]
+v_cmpx_ge_i16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_gt_i16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x48,0x7d]
+v_cmpx_ge_i16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_gt_i16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x48,0x7d]
+v_cmpx_ge_i16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_gt_i16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x49,0x7d]
+v_cmpx_ge_i16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_gt_i16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_gt_i16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_gt_i16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xa4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_gt_i16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xa4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xb6,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_gt_i16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x6e,0x7d]
 
-v_cmp_gt_i16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x6e,0x7d]
 
-v_cmp_gt_i16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x6e,0x7d]
 
-v_cmp_gt_i16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x6e,0x7d]
 
-v_cmp_gt_i16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa4,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_t_i16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x6e,0x7d]
 
-v_cmp_gt_i16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xa4,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_t_i16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x6e,0x7d]
 
-v_cmp_gt_i16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa4,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_t_i16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x6e,0x7d]
 
-v_cmp_gt_i16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa4,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_t_i16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x6e,0x7d]
 
-v_cmp_gt_i16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa4,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_t_i16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x6e,0x7d]
 
-v_cmp_gt_i16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xa4,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_t_i16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x6e,0x7d]
 
-v_cmp_gt_i16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa4,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_t_i16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x6e,0x7d]
 
-v_cmp_gt_i16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa4,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_t_i16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x6e,0x7d]
 
-v_cmp_ne_i16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x4a,0x7d]
+v_cmpx_t_i16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x6e,0x7d]
 
-v_cmp_ne_i16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x4a,0x7d]
+v_cmpx_t_i16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x6e,0x7d]
 
-v_cmp_ne_i16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x4a,0x7d]
+v_cmpx_t_i16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x6e,0x7d]
 
-v_cmp_ne_i16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x4a,0x7d]
+v_cmpx_t_i16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x6e,0x7d]
 
-v_cmp_ne_i16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x4a,0x7d]
+v_cmpx_t_i16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x6e,0x7d]
 
-v_cmp_ne_i16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x4a,0x7d]
+v_cmpx_t_i16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x6e,0x7d]
 
-v_cmp_ne_i16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x4a,0x7d]
+v_cmpx_t_i16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x6e,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmp_ne_i16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x4a,0x7d]
+v_cmpx_t_i16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x6e,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmp_ne_i16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x4a,0x7d]
+v_cmpx_t_i16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x6e,0x7d]
 
-v_cmp_ne_i16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x4a,0x7d]
+v_cmpx_t_i16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x6e,0x7d]
 
-v_cmp_ne_i16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x4a,0x7d]
+v_cmpx_t_i16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x6f,0x7d]
 
-v_cmp_ne_i16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x4a,0x7d]
+v_cmpx_t_i16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x4a,0x7d]
+v_cmpx_t_i16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xb7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x4a,0x7d]
+v_cmpx_t_i16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xb7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x4a,0x7d]
+v_cmpx_t_i16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xb7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x4a,0x7d]
+v_cmpx_t_i16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xb7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x4a,0x7d]
+v_cmpx_t_i16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xb7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x4a,0x7d]
+v_cmpx_t_i16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xb7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x4a,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_t_i16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xb7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x4a,0x7d,0x56,0x34,0x00,0x00]
+v_cmpx_t_i16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xb7,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_ne_i16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x4a,0x7d]
+v_cmpx_t_i16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xb7,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_ne_i16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x4a,0x7d]
+v_cmpx_t_i16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xb7,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_ne_i16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x4b,0x7d]
+v_cmpx_t_i16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_ne_i16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xb7,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_ne_i16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_ne_i16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xa5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_ne_i16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xa5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_ne_i16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_ne_i16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_ne_i16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_ne_i16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_ne_i16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa5,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_t_i16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_ne_i16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xa5,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_t_i16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_ne_i16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa5,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_t_i16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_ne_i16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa5,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_t_i16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_ne_i16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa5,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_t_i16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_ne_i16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xa5,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_t_i16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_ne_i16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa5,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_t_i16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_ne_i16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa5,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_t_i16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_ge_i16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x4c,0x7d]
+v_cmpx_t_i16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_ge_i16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x4c,0x7d]
+v_cmpx_t_i16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_ge_i16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x4c,0x7d]
+v_cmpx_t_i16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_ge_i16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x4c,0x7d]
+v_cmpx_t_i16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xb7,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_ge_i16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x4c,0x7d]
+v_cmpx_f_u16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x70,0x7d]
 
-v_cmp_ge_i16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x4c,0x7d]
+v_cmpx_f_u16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x70,0x7d]
 
-v_cmp_ge_i16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x4c,0x7d]
+v_cmpx_f_u16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x70,0x7d]
 
-v_cmp_ge_i16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x4c,0x7d]
+v_cmpx_f_u16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x70,0x7d]
 
-v_cmp_ge_i16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x4c,0x7d]
+v_cmpx_f_u16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x70,0x7d]
 
-v_cmp_ge_i16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x4c,0x7d]
+v_cmpx_f_u16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x70,0x7d]
 
-v_cmp_ge_i16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x4c,0x7d]
+v_cmpx_f_u16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x70,0x7d]
 
-v_cmp_ge_i16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x4c,0x7d]
+v_cmpx_f_u16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x70,0x7d]
 
-v_cmp_ge_i16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x4c,0x7d]
+v_cmpx_f_u16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x70,0x7d]
 
-v_cmp_ge_i16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x4c,0x7d]
+v_cmpx_f_u16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x70,0x7d]
 
-v_cmp_ge_i16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x4c,0x7d]
+v_cmpx_f_u16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x70,0x7d]
 
-v_cmp_ge_i16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x4c,0x7d]
+v_cmpx_f_u16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x70,0x7d]
 
-v_cmp_ge_i16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x4c,0x7d]
+v_cmpx_f_u16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x70,0x7d]
 
-v_cmp_ge_i16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x4c,0x7d]
+v_cmpx_f_u16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x70,0x7d]
 
-v_cmp_ge_i16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x4c,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_f_u16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x70,0x7d]
 
-v_cmp_ge_i16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x4c,0x7d,0x56,0x34,0x00,0x00]
+v_cmpx_f_u16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x70,0x7d]
 
-v_cmp_ge_i16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x4c,0x7d]
+v_cmpx_f_u16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x70,0x7d]
 
-v_cmp_ge_i16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x4c,0x7d]
+v_cmpx_f_u16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x70,0x7d]
 
-v_cmp_ge_i16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x4d,0x7d]
+v_cmpx_f_u16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x70,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmp_ge_i16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x70,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmp_ge_i16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x70,0x7d]
 
-v_cmp_ge_i16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xa6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x70,0x7d]
 
-v_cmp_ge_i16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xa6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x71,0x7d]
 
-v_cmp_ge_i16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_i16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xb8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_i16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xb8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_i16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xb8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_i16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa6,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_f_u16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xb8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_i16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xa6,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_f_u16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xb8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_i16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa6,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_f_u16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xb8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_i16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa6,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_f_u16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xb8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_i16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa6,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_f_u16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xb8,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_ge_i16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xa6,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_f_u16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xb8,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_ge_i16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa6,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_f_u16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xb8,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_ge_i16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa6,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_f_u16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_t_i16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x4e,0x7d]
+v_cmpx_f_u16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xb8,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_t_i16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x4e,0x7d]
+v_cmpx_f_u16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_t_i16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x4e,0x7d]
+v_cmpx_f_u16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_t_i16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x4e,0x7d]
+v_cmpx_f_u16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_t_i16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x4e,0x7d]
+v_cmpx_f_u16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_t_i16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x4e,0x7d]
+v_cmpx_f_u16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_t_i16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x4e,0x7d]
+v_cmpx_f_u16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_t_i16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x4e,0x7d]
+v_cmpx_f_u16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_t_i16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x4e,0x7d]
+v_cmpx_f_u16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_t_i16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x4e,0x7d]
+v_cmpx_f_u16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_t_i16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x4e,0x7d]
+v_cmpx_f_u16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_t_i16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x4e,0x7d]
+v_cmpx_f_u16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_t_i16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x4e,0x7d]
+v_cmpx_f_u16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_t_i16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x4e,0x7d]
+v_cmpx_f_u16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_t_i16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x4e,0x7d]
+v_cmpx_f_u16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_t_i16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x4e,0x7d]
+v_cmpx_f_u16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_t_i16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x4e,0x7d]
+v_cmpx_f_u16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_t_i16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x4e,0x7d]
+v_cmpx_f_u16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_t_i16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x4e,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_f_u16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_t_i16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x4e,0x7d,0x56,0x34,0x00,0x00]
+v_cmpx_f_u16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xb8,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_t_i16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x4e,0x7d]
+v_cmpx_lt_u16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x72,0x7d]
 
-v_cmp_t_i16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x4e,0x7d]
+v_cmpx_lt_u16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x72,0x7d]
 
-v_cmp_t_i16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x4f,0x7d]
+v_cmpx_lt_u16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x72,0x7d]
 
-v_cmp_t_i16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x72,0x7d]
 
-v_cmp_t_i16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x72,0x7d]
 
-v_cmp_t_i16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xa7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x72,0x7d]
 
-v_cmp_t_i16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xa7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x72,0x7d]
 
-v_cmp_t_i16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x72,0x7d]
 
-v_cmp_t_i16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x72,0x7d]
 
-v_cmp_t_i16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x72,0x7d]
 
-v_cmp_t_i16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x72,0x7d]
 
-v_cmp_t_i16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa7,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_lt_u16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x72,0x7d]
 
-v_cmp_t_i16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xa7,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_lt_u16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x72,0x7d]
 
-v_cmp_t_i16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa7,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_lt_u16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x72,0x7d]
 
-v_cmp_t_i16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa7,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_lt_u16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x72,0x7d]
 
-v_cmp_t_i16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa7,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_lt_u16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x72,0x7d]
 
-v_cmp_t_i16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xa7,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_lt_u16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x72,0x7d]
 
-v_cmp_t_i16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa7,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_lt_u16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x72,0x7d]
 
-v_cmp_t_i16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa7,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_lt_u16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x72,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmp_f_u16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x50,0x7d]
+v_cmpx_lt_u16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x72,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmp_f_u16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x50,0x7d]
+v_cmpx_lt_u16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x72,0x7d]
 
-v_cmp_f_u16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x50,0x7d]
+v_cmpx_lt_u16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x72,0x7d]
 
-v_cmp_f_u16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x50,0x7d]
+v_cmpx_lt_u16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x73,0x7d]
 
-v_cmp_f_u16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x50,0x7d]
+v_cmpx_lt_u16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_u16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x50,0x7d]
+v_cmpx_lt_u16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xb9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_u16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x50,0x7d]
+v_cmpx_lt_u16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xb9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_u16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x50,0x7d]
+v_cmpx_lt_u16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xb9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_u16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x50,0x7d]
+v_cmpx_lt_u16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xb9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_u16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x50,0x7d]
+v_cmpx_lt_u16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xb9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_u16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x50,0x7d]
+v_cmpx_lt_u16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xb9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_u16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x50,0x7d]
+v_cmpx_lt_u16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xb9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_u16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x50,0x7d]
+v_cmpx_lt_u16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xb9,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_f_u16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x50,0x7d]
+v_cmpx_lt_u16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xb9,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_f_u16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x50,0x7d]
+v_cmpx_lt_u16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xb9,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_f_u16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x50,0x7d]
+v_cmpx_lt_u16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_f_u16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x50,0x7d]
+v_cmpx_lt_u16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xb9,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_f_u16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x50,0x7d]
+v_cmpx_lt_u16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_f_u16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x50,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_lt_u16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_f_u16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x50,0x7d,0x56,0x34,0x00,0x00]
+v_cmpx_lt_u16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_f_u16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x50,0x7d]
+v_cmpx_lt_u16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_f_u16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x50,0x7d]
+v_cmpx_lt_u16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_f_u16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x51,0x7d]
+v_cmpx_lt_u16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_f_u16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_f_u16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_f_u16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xa8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_f_u16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xa8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_f_u16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_f_u16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_f_u16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_f_u16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_f_u16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa8,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_lt_u16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_f_u16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xa8,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_lt_u16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_f_u16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa8,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_lt_u16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_f_u16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa8,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_lt_u16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_f_u16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa8,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_lt_u16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xb9,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_f_u16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xa8,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_eq_u16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x74,0x7d]
 
-v_cmp_f_u16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa8,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_eq_u16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x74,0x7d]
 
-v_cmp_f_u16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa8,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_eq_u16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x74,0x7d]
 
-v_cmp_lt_u16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x52,0x7d]
+v_cmpx_eq_u16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x74,0x7d]
 
-v_cmp_lt_u16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x52,0x7d]
+v_cmpx_eq_u16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x74,0x7d]
 
-v_cmp_lt_u16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x52,0x7d]
+v_cmpx_eq_u16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x74,0x7d]
 
-v_cmp_lt_u16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x52,0x7d]
+v_cmpx_eq_u16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x74,0x7d]
 
-v_cmp_lt_u16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x52,0x7d]
+v_cmpx_eq_u16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x74,0x7d]
 
-v_cmp_lt_u16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x52,0x7d]
+v_cmpx_eq_u16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x74,0x7d]
 
-v_cmp_lt_u16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x52,0x7d]
+v_cmpx_eq_u16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x74,0x7d]
 
-v_cmp_lt_u16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x52,0x7d]
+v_cmpx_eq_u16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x74,0x7d]
 
-v_cmp_lt_u16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x52,0x7d]
+v_cmpx_eq_u16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x74,0x7d]
 
-v_cmp_lt_u16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x52,0x7d]
+v_cmpx_eq_u16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x74,0x7d]
 
-v_cmp_lt_u16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x52,0x7d]
+v_cmpx_eq_u16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x74,0x7d]
 
-v_cmp_lt_u16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x52,0x7d]
+v_cmpx_eq_u16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x74,0x7d]
 
-v_cmp_lt_u16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x52,0x7d]
+v_cmpx_eq_u16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x74,0x7d]
 
-v_cmp_lt_u16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x52,0x7d]
+v_cmpx_eq_u16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x74,0x7d]
 
-v_cmp_lt_u16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x52,0x7d]
+v_cmpx_eq_u16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x74,0x7d]
 
-v_cmp_lt_u16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x52,0x7d]
+v_cmpx_eq_u16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x74,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmp_lt_u16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x52,0x7d]
+v_cmpx_eq_u16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x74,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmp_lt_u16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x52,0x7d]
+v_cmpx_eq_u16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x74,0x7d]
 
-v_cmp_lt_u16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x52,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_eq_u16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x74,0x7d]
 
-v_cmp_lt_u16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x52,0x7d,0x56,0x34,0x00,0x00]
+v_cmpx_eq_u16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x75,0x7d]
 
-v_cmp_lt_u16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x52,0x7d]
+v_cmpx_eq_u16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x52,0x7d]
+v_cmpx_eq_u16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xba,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x53,0x7d]
+v_cmpx_eq_u16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xba,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xa9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xba,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xa9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xba,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xa9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xba,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xa9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xba,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xa9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xba,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xa9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xba,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_lt_u16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xa9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xba,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_lt_u16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xa9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xba,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_lt_u16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xa9,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_eq_u16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xba,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_lt_u16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xa9,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_eq_u16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xba,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_lt_u16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xa9,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_eq_u16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_lt_u16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xa9,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_eq_u16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_lt_u16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xa9,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_eq_u16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_lt_u16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xa9,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_eq_u16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_lt_u16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xa9,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_eq_u16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_lt_u16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xa9,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_eq_u16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_eq_u16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x54,0x7d]
+v_cmpx_eq_u16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_eq_u16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x54,0x7d]
+v_cmpx_eq_u16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_eq_u16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x54,0x7d]
+v_cmpx_eq_u16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_eq_u16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x54,0x7d]
+v_cmpx_eq_u16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_eq_u16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x54,0x7d]
+v_cmpx_eq_u16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_eq_u16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x54,0x7d]
+v_cmpx_eq_u16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_eq_u16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x54,0x7d]
+v_cmpx_eq_u16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_eq_u16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x54,0x7d]
+v_cmpx_eq_u16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_eq_u16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x54,0x7d]
+v_cmpx_eq_u16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_eq_u16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x54,0x7d]
+v_cmpx_eq_u16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_eq_u16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x54,0x7d]
+v_cmpx_eq_u16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_eq_u16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x54,0x7d]
+v_cmpx_eq_u16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_eq_u16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x54,0x7d]
+v_cmpx_eq_u16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xba,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_eq_u16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x54,0x7d]
+v_cmpx_le_u16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x76,0x7d]
 
-v_cmp_eq_u16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x54,0x7d]
+v_cmpx_le_u16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x76,0x7d]
 
-v_cmp_eq_u16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x54,0x7d]
+v_cmpx_le_u16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x76,0x7d]
 
-v_cmp_eq_u16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x54,0x7d]
+v_cmpx_le_u16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x76,0x7d]
 
-v_cmp_eq_u16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x54,0x7d]
+v_cmpx_le_u16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x76,0x7d]
 
-v_cmp_eq_u16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x54,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_le_u16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x76,0x7d]
 
-v_cmp_eq_u16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x54,0x7d,0x56,0x34,0x00,0x00]
+v_cmpx_le_u16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x76,0x7d]
 
-v_cmp_eq_u16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x54,0x7d]
+v_cmpx_le_u16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x76,0x7d]
 
-v_cmp_eq_u16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x54,0x7d]
+v_cmpx_le_u16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x76,0x7d]
 
-v_cmp_eq_u16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x55,0x7d]
+v_cmpx_le_u16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x76,0x7d]
 
-v_cmp_eq_u16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xaa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x76,0x7d]
 
-v_cmp_eq_u16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xaa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x76,0x7d]
 
-v_cmp_eq_u16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xaa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x76,0x7d]
 
-v_cmp_eq_u16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xaa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x76,0x7d]
 
-v_cmp_eq_u16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xaa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x76,0x7d]
 
-v_cmp_eq_u16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xaa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x76,0x7d]
 
-v_cmp_eq_u16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xaa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x76,0x7d]
 
-v_cmp_eq_u16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xaa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x76,0x7d]
 
-v_cmp_eq_u16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xaa,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_le_u16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x76,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmp_eq_u16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xaa,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_le_u16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x76,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmp_eq_u16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xaa,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_le_u16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x76,0x7d]
 
-v_cmp_eq_u16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xaa,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_le_u16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x76,0x7d]
 
-v_cmp_eq_u16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xaa,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_le_u16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x77,0x7d]
 
-v_cmp_eq_u16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xaa,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_le_u16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_u16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xaa,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_le_u16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xbb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_u16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xaa,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_le_u16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xbb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x56,0x7d]
+v_cmpx_le_u16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xbb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x56,0x7d]
+v_cmpx_le_u16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xbb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x56,0x7d]
+v_cmpx_le_u16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xbb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x56,0x7d]
+v_cmpx_le_u16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xbb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x56,0x7d]
+v_cmpx_le_u16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xbb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x56,0x7d]
+v_cmpx_le_u16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xbb,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_le_u16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x56,0x7d]
+v_cmpx_le_u16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xbb,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_le_u16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x56,0x7d]
+v_cmpx_le_u16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xbb,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_le_u16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x56,0x7d]
+v_cmpx_le_u16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_le_u16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x56,0x7d]
+v_cmpx_le_u16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xbb,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_le_u16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x56,0x7d]
+v_cmpx_le_u16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_le_u16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x56,0x7d]
+v_cmpx_le_u16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_le_u16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x56,0x7d]
+v_cmpx_le_u16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_le_u16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x56,0x7d]
+v_cmpx_le_u16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_le_u16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x56,0x7d]
+v_cmpx_le_u16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_le_u16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x56,0x7d]
+v_cmpx_le_u16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_le_u16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x56,0x7d]
+v_cmpx_le_u16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_le_u16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x56,0x7d]
+v_cmpx_le_u16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_le_u16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x56,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_le_u16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_le_u16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x56,0x7d,0x56,0x34,0x00,0x00]
+v_cmpx_le_u16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_le_u16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x56,0x7d]
+v_cmpx_le_u16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_le_u16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x56,0x7d]
+v_cmpx_le_u16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_le_u16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x57,0x7d]
+v_cmpx_le_u16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_le_u16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xab,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_le_u16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xab,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_le_u16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xab,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_le_u16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xab,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_le_u16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xab,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_le_u16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xab,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xbb,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_le_u16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xab,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x78,0x7d]
 
-v_cmp_le_u16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xab,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x78,0x7d]
 
-v_cmp_le_u16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xab,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_gt_u16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x78,0x7d]
 
-v_cmp_le_u16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xab,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_gt_u16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x78,0x7d]
 
-v_cmp_le_u16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xab,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_gt_u16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x78,0x7d]
 
-v_cmp_le_u16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xab,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_gt_u16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x78,0x7d]
 
-v_cmp_le_u16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xab,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_gt_u16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x78,0x7d]
 
-v_cmp_le_u16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xab,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_gt_u16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x78,0x7d]
 
-v_cmp_le_u16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xab,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_gt_u16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x78,0x7d]
 
-v_cmp_le_u16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xab,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_gt_u16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x78,0x7d]
 
-v_cmp_gt_u16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x58,0x7d]
+v_cmpx_gt_u16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x78,0x7d]
 
-v_cmp_gt_u16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x58,0x7d]
+v_cmpx_gt_u16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x78,0x7d]
 
-v_cmp_gt_u16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x58,0x7d]
+v_cmpx_gt_u16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x78,0x7d]
 
-v_cmp_gt_u16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x58,0x7d]
+v_cmpx_gt_u16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x78,0x7d]
 
-v_cmp_gt_u16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x58,0x7d]
+v_cmpx_gt_u16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x78,0x7d]
 
-v_cmp_gt_u16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x58,0x7d]
+v_cmpx_gt_u16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x78,0x7d]
 
-v_cmp_gt_u16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x58,0x7d]
+v_cmpx_gt_u16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x78,0x7d]
 
-v_cmp_gt_u16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x58,0x7d]
+v_cmpx_gt_u16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x78,0x7d]
 
-v_cmp_gt_u16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x58,0x7d]
+v_cmpx_gt_u16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x78,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmp_gt_u16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x58,0x7d]
+v_cmpx_gt_u16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x78,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmp_gt_u16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x58,0x7d]
+v_cmpx_gt_u16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x78,0x7d]
 
-v_cmp_gt_u16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x58,0x7d]
+v_cmpx_gt_u16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x78,0x7d]
 
-v_cmp_gt_u16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x58,0x7d]
+v_cmpx_gt_u16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x79,0x7d]
 
-v_cmp_gt_u16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x58,0x7d]
+v_cmpx_gt_u16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_u16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x58,0x7d]
+v_cmpx_gt_u16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xbc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_u16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x58,0x7d]
+v_cmpx_gt_u16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xbc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_u16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x58,0x7d]
+v_cmpx_gt_u16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xbc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_u16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x58,0x7d]
+v_cmpx_gt_u16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xbc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_u16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x58,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_gt_u16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xbc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_u16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x58,0x7d,0x56,0x34,0x00,0x00]
+v_cmpx_gt_u16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xbc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_u16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x58,0x7d]
+v_cmpx_gt_u16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xbc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_u16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x58,0x7d]
+v_cmpx_gt_u16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xbc,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_gt_u16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x59,0x7d]
+v_cmpx_gt_u16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xbc,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_gt_u16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xac,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xbc,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_gt_u16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xac,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_gt_u16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xac,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xbc,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_gt_u16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xac,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_gt_u16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xac,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_gt_u16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xac,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_gt_u16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xac,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_gt_u16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xac,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_gt_u16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xac,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_gt_u16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_gt_u16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xac,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_gt_u16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_gt_u16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xac,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_gt_u16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_gt_u16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xac,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_gt_u16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_gt_u16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xac,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_gt_u16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_gt_u16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xac,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_gt_u16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_gt_u16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xac,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_gt_u16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_gt_u16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xac,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_gt_u16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_ne_u16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x5a,0x7d]
+v_cmpx_gt_u16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_ne_u16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x5a,0x7d]
+v_cmpx_gt_u16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_ne_u16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x5a,0x7d]
+v_cmpx_gt_u16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_ne_u16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x5a,0x7d]
+v_cmpx_gt_u16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_ne_u16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x5a,0x7d]
+v_cmpx_gt_u16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_ne_u16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x5a,0x7d]
+v_cmpx_gt_u16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xbc,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_ne_u16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x5a,0x7d]
+v_cmpx_ne_u16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x7a,0x7d]
 
-v_cmp_ne_u16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x5a,0x7d]
+v_cmpx_ne_u16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x7a,0x7d]
 
-v_cmp_ne_u16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x5a,0x7d]
+v_cmpx_ne_u16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x7a,0x7d]
 
-v_cmp_ne_u16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x5a,0x7d]
+v_cmpx_ne_u16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x7a,0x7d]
 
-v_cmp_ne_u16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x5a,0x7d]
+v_cmpx_ne_u16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x7a,0x7d]
 
-v_cmp_ne_u16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x5a,0x7d]
+v_cmpx_ne_u16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x7a,0x7d]
 
-v_cmp_ne_u16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x5a,0x7d]
+v_cmpx_ne_u16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x7a,0x7d]
 
-v_cmp_ne_u16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x5a,0x7d]
+v_cmpx_ne_u16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x7a,0x7d]
 
-v_cmp_ne_u16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x5a,0x7d]
+v_cmpx_ne_u16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x7a,0x7d]
 
-v_cmp_ne_u16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x5a,0x7d]
+v_cmpx_ne_u16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x7a,0x7d]
 
-v_cmp_ne_u16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x5a,0x7d]
+v_cmpx_ne_u16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x7a,0x7d]
 
-v_cmp_ne_u16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x5a,0x7d]
+v_cmpx_ne_u16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x7a,0x7d]
 
-v_cmp_ne_u16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x5a,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_ne_u16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x7a,0x7d]
 
-v_cmp_ne_u16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x5a,0x7d,0x56,0x34,0x00,0x00]
+v_cmpx_ne_u16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x7a,0x7d]
 
-v_cmp_ne_u16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x5a,0x7d]
+v_cmpx_ne_u16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x7a,0x7d]
 
-v_cmp_ne_u16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x5a,0x7d]
+v_cmpx_ne_u16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x7a,0x7d]
 
-v_cmp_ne_u16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x5b,0x7d]
+v_cmpx_ne_u16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x7a,0x7d]
 
-v_cmp_ne_u16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xad,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x7a,0x7d]
 
-v_cmp_ne_u16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xad,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x7a,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmp_ne_u16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xad,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x7a,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmp_ne_u16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xad,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x7a,0x7d]
 
-v_cmp_ne_u16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xad,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x7a,0x7d]
 
-v_cmp_ne_u16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xad,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x7b,0x7d]
 
-v_cmp_ne_u16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xad,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_u16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xad,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xbd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_u16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xad,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ne_u16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xbd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_u16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xad,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_ne_u16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xbd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_u16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xad,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ne_u16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xbd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_u16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xad,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_ne_u16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xbd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_u16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xad,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ne_u16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xbd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_u16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xad,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_ne_u16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xbd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_u16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xad,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ne_u16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xbd,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_ne_u16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xad,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_ne_u16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xbd,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_ge_u16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xbd,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_ge_u16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_ge_u16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xbd,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_ge_u16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_ge_u16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_ge_u16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_ge_u16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_ge_u16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_ge_u16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_ge_u16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_ge_u16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_ge_u16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_ge_u16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_ge_u16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_ge_u16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_ge_u16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_ge_u16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_ge_u16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_ge_u16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x5c,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_ne_u16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_ge_u16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x5c,0x7d,0x56,0x34,0x00,0x00]
+v_cmpx_ne_u16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_ge_u16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_ge_u16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x5c,0x7d]
+v_cmpx_ne_u16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xbd,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_ge_u16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x5d,0x7d]
+v_cmpx_ge_u16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x7c,0x7d]
 
-v_cmp_ge_u16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xae,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x7c,0x7d]
 
-v_cmp_ge_u16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xae,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x7c,0x7d]
 
-v_cmp_ge_u16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xae,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x7c,0x7d]
 
-v_cmp_ge_u16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xae,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x7c,0x7d]
 
-v_cmp_ge_u16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xae,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x7c,0x7d]
 
-v_cmp_ge_u16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xae,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x7c,0x7d]
 
-v_cmp_ge_u16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xae,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x7c,0x7d]
 
-v_cmp_ge_u16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xae,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x7c,0x7d]
 
-v_cmp_ge_u16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xae,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ge_u16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x7c,0x7d]
 
-v_cmp_ge_u16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xae,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_ge_u16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x7c,0x7d]
 
-v_cmp_ge_u16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xae,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ge_u16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x7c,0x7d]
 
-v_cmp_ge_u16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xae,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_ge_u16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x7c,0x7d]
 
-v_cmp_ge_u16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xae,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ge_u16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x7c,0x7d]
 
-v_cmp_ge_u16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xae,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_ge_u16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x7c,0x7d]
 
-v_cmp_ge_u16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xae,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ge_u16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x7c,0x7d]
 
-v_cmp_ge_u16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xae,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_ge_u16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x7c,0x7d]
 
-v_cmp_t_u16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x5e,0x7d]
+v_cmpx_ge_u16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x7c,0x7d]
 
-v_cmp_t_u16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x5e,0x7d]
+v_cmpx_ge_u16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x7c,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmp_t_u16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x5e,0x7d]
+v_cmpx_ge_u16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x7c,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmp_t_u16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x5e,0x7d]
+v_cmpx_ge_u16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x7c,0x7d]
 
-v_cmp_t_u16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x5e,0x7d]
+v_cmpx_ge_u16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x7c,0x7d]
 
-v_cmp_t_u16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x5e,0x7d]
+v_cmpx_ge_u16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x7d,0x7d]
 
-v_cmp_t_u16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x5e,0x7d]
+v_cmpx_ge_u16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x5e,0x7d]
+v_cmpx_ge_u16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xbe,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x5e,0x7d]
+v_cmpx_ge_u16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xbe,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x5e,0x7d]
+v_cmpx_ge_u16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xbe,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x5e,0x7d]
+v_cmpx_ge_u16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xbe,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x5e,0x7d]
+v_cmpx_ge_u16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xbe,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x5e,0x7d]
+v_cmpx_ge_u16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xbe,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x5e,0x7d]
+v_cmpx_ge_u16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xbe,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x5e,0x7d]
+v_cmpx_ge_u16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xbe,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_t_u16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x5e,0x7d]
+v_cmpx_ge_u16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xbe,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_t_u16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x5e,0x7d]
+v_cmpx_ge_u16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xbe,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_t_u16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x5e,0x7d]
+v_cmpx_ge_u16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_t_u16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x5e,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_ge_u16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xbe,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_t_u16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x5e,0x7d,0x56,0x34,0x00,0x00]
+v_cmpx_ge_u16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_t_u16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x5e,0x7d]
+v_cmpx_ge_u16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_t_u16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x5e,0x7d]
+v_cmpx_ge_u16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_t_u16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x5f,0x7d]
+v_cmpx_ge_u16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_t_u16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xaf,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_t_u16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xaf,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_t_u16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xaf,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_t_u16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xaf,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_t_u16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xaf,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_t_u16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xaf,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_t_u16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xaf,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_t_u16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xaf,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_t_u16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xaf,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ge_u16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_t_u16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xaf,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_ge_u16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_t_u16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xaf,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ge_u16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_t_u16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xaf,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_ge_u16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_t_u16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xaf,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ge_u16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_t_u16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xaf,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_ge_u16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_t_u16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xaf,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ge_u16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xbe,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_t_u16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xaf,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_t_u16 vcc, s1, v2
+// CHECK: [0x01,0x04,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x60,0x7d]
+v_cmpx_t_u16 vcc, s101, v2
+// CHECK: [0x65,0x04,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x60,0x7d]
+v_cmpx_t_u16 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x60,0x7d]
+v_cmpx_t_u16 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x60,0x7d]
+v_cmpx_t_u16 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x60,0x7d]
+v_cmpx_t_u16 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x60,0x7d]
+v_cmpx_t_u16 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x60,0x7d]
+v_cmpx_t_u16 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x60,0x7d]
+v_cmpx_t_u16 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x60,0x7d]
+v_cmpx_t_u16 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x60,0x7d]
+v_cmpx_t_u16 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x60,0x7d]
+v_cmpx_t_u16 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x60,0x7d]
+v_cmpx_t_u16 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x60,0x7d]
+v_cmpx_t_u16 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x60,0x7d]
+v_cmpx_t_u16 vcc, 0, v2
+// CHECK: [0x80,0x04,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x60,0x7d]
+v_cmpx_t_u16 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x60,0x7d]
+v_cmpx_t_u16 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x60,0x7d]
+v_cmpx_t_u16 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x60,0x7d]
+v_cmpx_t_u16 vcc, 0xfe0b, v2
+// CHECK: [0xff,0x04,0x7e,0x7d,0x0b,0xfe,0x00,0x00]
 
-v_cmpx_f_i16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x60,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmpx_t_u16 vcc, 0x3456, v2
+// CHECK: [0xff,0x04,0x7e,0x7d,0x56,0x34,0x00,0x00]
 
-v_cmpx_f_i16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x60,0x7d,0x56,0x34,0x00,0x00]
+v_cmpx_t_u16 vcc, v1, v2
+// CHECK: [0x01,0x05,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x60,0x7d]
+v_cmpx_t_u16 vcc, v255, v2
+// CHECK: [0xff,0x05,0x7e,0x7d]
 
-v_cmpx_f_i16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x60,0x7d]
+v_cmpx_t_u16 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x7f,0x7d]
 
-v_cmpx_f_i16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x61,0x7d]
+v_cmpx_t_u16_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xb0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u16_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xbf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xb0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u16_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xbf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xb0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u16_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xbf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xb0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u16_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xbf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xb0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u16_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xbf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xb0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u16_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xbf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xb0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u16_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xbf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xb0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u16_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xbf,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_f_i16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xb0,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_t_u16_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xbf,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_f_i16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xb0,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_t_u16_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xbf,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_f_i16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xb0,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_t_u16_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_f_i16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xb0,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_t_u16_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xbf,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_f_i16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xb0,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_t_u16_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_f_i16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xb0,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_t_u16_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_f_i16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xb0,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_t_u16_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_f_i16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xb0,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_t_u16_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_lt_i16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x62,0x7d]
+v_cmpx_t_u16_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_lt_i16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x62,0x7d]
+v_cmpx_t_u16_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_lt_i16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x62,0x7d]
+v_cmpx_t_u16_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_lt_i16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x62,0x7d]
+v_cmpx_t_u16_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_lt_i16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x62,0x7d]
+v_cmpx_t_u16_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_lt_i16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x62,0x7d]
+v_cmpx_t_u16_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_lt_i16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x62,0x7d]
+v_cmpx_t_u16_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_lt_i16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x62,0x7d]
+v_cmpx_t_u16_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_lt_i16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x62,0x7d]
+v_cmpx_t_u16_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_lt_i16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x62,0x7d]
+v_cmpx_t_u16_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_lt_i16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x62,0x7d]
+v_cmpx_t_u16_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_lt_i16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x62,0x7d]
+v_cmpx_t_u16_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_lt_i16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x62,0x7d]
+v_cmpx_t_u16_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_lt_i16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x62,0x7d]
+v_cmpx_t_u16_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_lt_i16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x62,0x7d]
+v_cmpx_t_u16_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xbf,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_lt_i16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x62,0x7d]
+v_cmp_f_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x80,0x7d]
 
-v_cmpx_lt_i16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x62,0x7d]
+v_cmp_f_i32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x80,0x7d]
 
-v_cmpx_lt_i16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x62,0x7d]
+v_cmp_f_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x80,0x7d]
 
-v_cmpx_lt_i16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x62,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmp_f_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x80,0x7d]
 
-v_cmpx_lt_i16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x62,0x7d,0x56,0x34,0x00,0x00]
+v_cmp_f_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x80,0x7d]
 
-v_cmpx_lt_i16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x62,0x7d]
+v_cmp_f_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x80,0x7d]
 
-v_cmpx_lt_i16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x62,0x7d]
+v_cmp_f_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x80,0x7d]
 
-v_cmpx_lt_i16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x63,0x7d]
+v_cmp_f_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x80,0x7d]
 
-v_cmpx_lt_i16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xb1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x80,0x7d]
 
-v_cmpx_lt_i16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xb1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x80,0x7d]
 
-v_cmpx_lt_i16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xb1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x80,0x7d]
 
-v_cmpx_lt_i16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xb1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x80,0x7d]
 
-v_cmpx_lt_i16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xb1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x80,0x7d]
 
-v_cmpx_lt_i16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xb1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x80,0x7d]
 
-v_cmpx_lt_i16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xb1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x80,0x7d]
 
-v_cmpx_lt_i16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xb1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x80,0x7d]
 
-v_cmpx_lt_i16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xb1,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x80,0x7d]
 
-v_cmpx_lt_i16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xb1,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_f_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x80,0x7d]
 
-v_cmpx_lt_i16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xb1,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_f_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x80,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_lt_i16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xb1,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_f_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x80,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_lt_i16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xb1,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_f_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x80,0x7d]
 
-v_cmpx_lt_i16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xb1,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_f_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x80,0x7d]
 
-v_cmpx_lt_i16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xb1,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_f_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x81,0x7d]
 
-v_cmpx_lt_i16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xb1,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_f_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x64,0x7d]
+v_cmp_f_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xc0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x64,0x7d]
+v_cmp_f_i32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xc0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x64,0x7d]
+v_cmp_f_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xc0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x64,0x7d]
+v_cmp_f_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xc0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x64,0x7d]
+v_cmp_f_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xc0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x64,0x7d]
+v_cmp_f_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xc0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x64,0x7d]
+v_cmp_f_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xc0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x64,0x7d]
+v_cmp_f_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xc0,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x64,0x7d]
+v_cmp_f_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xc0,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x64,0x7d]
+v_cmp_f_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xc0,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x64,0x7d]
+v_cmp_f_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x64,0x7d]
+v_cmp_f_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xc0,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x64,0x7d]
+v_cmp_f_i32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x64,0x7d]
+v_cmp_f_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x64,0x7d]
+v_cmp_f_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x64,0x7d]
+v_cmp_f_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x64,0x7d]
+v_cmp_f_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x64,0x7d]
+v_cmp_f_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x64,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmp_f_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x64,0x7d,0x56,0x34,0x00,0x00]
+v_cmp_f_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x64,0x7d]
+v_cmp_f_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x64,0x7d]
+v_cmp_f_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_eq_i16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x65,0x7d]
+v_cmp_f_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_eq_i16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xb2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_eq_i16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xb2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_eq_i16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xb2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_eq_i16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xb2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_eq_i16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xb2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_eq_i16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xb2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_eq_i16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xb2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_eq_i16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xb2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xc0,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_eq_i16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xb2,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_lt_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x82,0x7d]
 
-v_cmpx_eq_i16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xb2,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_lt_i32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x82,0x7d]
 
-v_cmpx_eq_i16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xb2,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_lt_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x82,0x7d]
 
-v_cmpx_eq_i16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xb2,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_lt_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x82,0x7d]
 
-v_cmpx_eq_i16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xb2,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_lt_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x82,0x7d]
 
-v_cmpx_eq_i16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xb2,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_lt_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x82,0x7d]
 
-v_cmpx_eq_i16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xb2,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_lt_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x82,0x7d]
 
-v_cmpx_eq_i16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xb2,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_lt_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x82,0x7d]
 
-v_cmpx_le_i16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x66,0x7d]
+v_cmp_lt_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x82,0x7d]
 
-v_cmpx_le_i16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x66,0x7d]
+v_cmp_lt_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x82,0x7d]
 
-v_cmpx_le_i16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x66,0x7d]
+v_cmp_lt_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x82,0x7d]
 
-v_cmpx_le_i16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x66,0x7d]
+v_cmp_lt_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x82,0x7d]
 
-v_cmpx_le_i16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x66,0x7d]
+v_cmp_lt_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x82,0x7d]
 
-v_cmpx_le_i16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x66,0x7d]
+v_cmp_lt_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x82,0x7d]
 
-v_cmpx_le_i16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x66,0x7d]
+v_cmp_lt_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x82,0x7d]
 
-v_cmpx_le_i16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x66,0x7d]
+v_cmp_lt_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x82,0x7d]
 
-v_cmpx_le_i16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x66,0x7d]
+v_cmp_lt_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x82,0x7d]
 
-v_cmpx_le_i16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x66,0x7d]
+v_cmp_lt_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x82,0x7d]
 
-v_cmpx_le_i16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x66,0x7d]
+v_cmp_lt_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x82,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_le_i16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x66,0x7d]
+v_cmp_lt_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x82,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_le_i16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x66,0x7d]
+v_cmp_lt_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x82,0x7d]
 
-v_cmpx_le_i16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x66,0x7d]
+v_cmp_lt_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x82,0x7d]
 
-v_cmpx_le_i16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x66,0x7d]
+v_cmp_lt_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x83,0x7d]
 
-v_cmpx_le_i16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x66,0x7d]
+v_cmp_lt_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x66,0x7d]
+v_cmp_lt_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xc1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x66,0x7d]
+v_cmp_lt_i32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xc1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x66,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmp_lt_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xc1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x66,0x7d,0x56,0x34,0x00,0x00]
+v_cmp_lt_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xc1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x66,0x7d]
+v_cmp_lt_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xc1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x66,0x7d]
+v_cmp_lt_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xc1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x67,0x7d]
+v_cmp_lt_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xc1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xb3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xc1,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_le_i16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xb3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xc1,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_le_i16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xb3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xc1,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_le_i16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xb3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_le_i16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xb3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xc1,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_le_i16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xb3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_le_i16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xb3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_le_i16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xb3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_le_i16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xb3,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_le_i16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xb3,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_le_i16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xb3,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_le_i16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xb3,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_le_i16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xb3,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_le_i16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xb3,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_le_i16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xb3,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_le_i16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xb3,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_lt_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_gt_i16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x68,0x7d]
+v_cmp_lt_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_gt_i16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x68,0x7d]
+v_cmp_lt_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_gt_i16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x68,0x7d]
+v_cmp_lt_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_gt_i16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x68,0x7d]
+v_cmp_lt_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_gt_i16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x68,0x7d]
+v_cmp_lt_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_gt_i16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x68,0x7d]
+v_cmp_lt_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_gt_i16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x68,0x7d]
+v_cmp_lt_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_gt_i16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x68,0x7d]
+v_cmp_lt_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xc1,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_gt_i16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x68,0x7d]
+v_cmp_eq_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x84,0x7d]
 
-v_cmpx_gt_i16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x68,0x7d]
+v_cmp_eq_i32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x84,0x7d]
 
-v_cmpx_gt_i16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x68,0x7d]
+v_cmp_eq_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x84,0x7d]
 
-v_cmpx_gt_i16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x68,0x7d]
+v_cmp_eq_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x84,0x7d]
 
-v_cmpx_gt_i16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x68,0x7d]
+v_cmp_eq_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x84,0x7d]
 
-v_cmpx_gt_i16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x68,0x7d]
+v_cmp_eq_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x84,0x7d]
 
-v_cmpx_gt_i16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x68,0x7d]
+v_cmp_eq_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x84,0x7d]
 
-v_cmpx_gt_i16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x68,0x7d]
+v_cmp_eq_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x84,0x7d]
 
-v_cmpx_gt_i16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x68,0x7d]
+v_cmp_eq_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x84,0x7d]
 
-v_cmpx_gt_i16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x68,0x7d]
+v_cmp_eq_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x84,0x7d]
 
-v_cmpx_gt_i16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x68,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmp_eq_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x84,0x7d]
 
-v_cmpx_gt_i16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x68,0x7d,0x56,0x34,0x00,0x00]
+v_cmp_eq_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x84,0x7d]
 
-v_cmpx_gt_i16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x68,0x7d]
+v_cmp_eq_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x84,0x7d]
 
-v_cmpx_gt_i16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x68,0x7d]
+v_cmp_eq_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x84,0x7d]
 
-v_cmpx_gt_i16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x69,0x7d]
+v_cmp_eq_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x84,0x7d]
 
-v_cmpx_gt_i16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xb4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x84,0x7d]
 
-v_cmpx_gt_i16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xb4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x84,0x7d]
 
-v_cmpx_gt_i16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xb4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x84,0x7d]
 
-v_cmpx_gt_i16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xb4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x84,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_gt_i16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xb4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x84,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_gt_i16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xb4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x84,0x7d]
 
-v_cmpx_gt_i16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xb4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x84,0x7d]
 
-v_cmpx_gt_i16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xb4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x85,0x7d]
 
-v_cmpx_gt_i16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xb4,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_eq_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_i16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xb4,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_eq_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xc2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_i16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xb4,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_eq_i32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xc2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_i16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xb4,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_eq_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xc2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_i16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xb4,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_eq_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xc2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_i16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xb4,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_eq_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xc2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_i16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xb4,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_eq_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xc2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_i16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xb4,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_eq_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xc2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_i16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xc2,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_ne_i16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xc2,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_ne_i16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xc2,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_ne_i16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_ne_i16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xc2,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_ne_i16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_ne_i16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_ne_i16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_ne_i16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_ne_i16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_ne_i16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_ne_i16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_ne_i16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_ne_i16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_ne_i16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_ne_i16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_ne_i16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_ne_i16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_ne_i16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x6a,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmp_eq_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_ne_i16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x6a,0x7d,0x56,0x34,0x00,0x00]
+v_cmp_eq_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_ne_i16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_ne_i16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x6a,0x7d]
+v_cmp_eq_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_ne_i16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x6b,0x7d]
+v_cmp_eq_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_ne_i16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xb5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xc2,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_ne_i16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xb5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x86,0x7d]
 
-v_cmpx_ne_i16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xb5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x86,0x7d]
 
-v_cmpx_ne_i16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xb5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x86,0x7d]
 
-v_cmpx_ne_i16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xb5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x86,0x7d]
 
-v_cmpx_ne_i16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xb5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x86,0x7d]
 
-v_cmpx_ne_i16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xb5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x86,0x7d]
 
-v_cmpx_ne_i16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xb5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x86,0x7d]
 
-v_cmpx_ne_i16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xb5,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x86,0x7d]
 
-v_cmpx_ne_i16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xb5,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_le_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x86,0x7d]
 
-v_cmpx_ne_i16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xb5,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_le_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x86,0x7d]
 
-v_cmpx_ne_i16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xb5,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_le_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x86,0x7d]
 
-v_cmpx_ne_i16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xb5,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_le_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x86,0x7d]
 
-v_cmpx_ne_i16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xb5,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_le_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x86,0x7d]
 
-v_cmpx_ne_i16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xb5,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_le_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x86,0x7d]
 
-v_cmpx_ne_i16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xb5,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_le_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x86,0x7d]
 
-v_cmpx_ge_i16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x6c,0x7d]
+v_cmp_le_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x86,0x7d]
 
-v_cmpx_ge_i16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x6c,0x7d]
+v_cmp_le_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x86,0x7d]
 
-v_cmpx_ge_i16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x6c,0x7d]
+v_cmp_le_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x86,0x7d]
 
-v_cmpx_ge_i16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x6c,0x7d]
+v_cmp_le_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x86,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ge_i16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x6c,0x7d]
+v_cmp_le_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x86,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ge_i16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x6c,0x7d]
+v_cmp_le_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x86,0x7d]
 
-v_cmpx_ge_i16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x6c,0x7d]
+v_cmp_le_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x86,0x7d]
 
-v_cmpx_ge_i16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x6c,0x7d]
+v_cmp_le_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x87,0x7d]
 
-v_cmpx_ge_i16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x6c,0x7d]
+v_cmp_le_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_i16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x6c,0x7d]
+v_cmp_le_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xc3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_i16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x6c,0x7d]
+v_cmp_le_i32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xc3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_i16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x6c,0x7d]
+v_cmp_le_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xc3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_i16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x6c,0x7d]
+v_cmp_le_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xc3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_i16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x6c,0x7d]
+v_cmp_le_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xc3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_i16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x6c,0x7d]
+v_cmp_le_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xc3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_i16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x6c,0x7d]
+v_cmp_le_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xc3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_i16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x6c,0x7d]
+v_cmp_le_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xc3,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_ge_i16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x6c,0x7d]
+v_cmp_le_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xc3,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_ge_i16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x6c,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xc3,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_ge_i16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x6c,0x7d,0x56,0x34,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_ge_i16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x6c,0x7d]
+v_cmp_le_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xc3,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_ge_i16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x6c,0x7d]
+v_cmp_le_i32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_ge_i16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x6d,0x7d]
+v_cmp_le_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_ge_i16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xb6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_ge_i16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xb6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_ge_i16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xb6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_ge_i16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xb6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_ge_i16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xb6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_ge_i16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xb6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_ge_i16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xb6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_ge_i16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xb6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_ge_i16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xb6,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_ge_i16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xb6,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_ge_i16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xb6,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_ge_i16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xb6,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_ge_i16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xb6,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_ge_i16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xb6,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_ge_i16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xb6,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_ge_i16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xb6,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_le_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_t_i16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x6e,0x7d]
+v_cmp_le_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xc3,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_t_i16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x6e,0x7d]
+v_cmp_gt_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x88,0x7d]
 
-v_cmpx_t_i16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x6e,0x7d]
+v_cmp_gt_i32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x88,0x7d]
 
-v_cmpx_t_i16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x6e,0x7d]
+v_cmp_gt_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x88,0x7d]
 
-v_cmpx_t_i16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x6e,0x7d]
+v_cmp_gt_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x88,0x7d]
 
-v_cmpx_t_i16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x6e,0x7d]
+v_cmp_gt_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x88,0x7d]
 
-v_cmpx_t_i16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x6e,0x7d]
+v_cmp_gt_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x88,0x7d]
 
-v_cmpx_t_i16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x6e,0x7d]
+v_cmp_gt_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x88,0x7d]
 
-v_cmpx_t_i16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x6e,0x7d]
+v_cmp_gt_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x88,0x7d]
 
-v_cmpx_t_i16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x6e,0x7d]
+v_cmp_gt_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x88,0x7d]
 
-v_cmpx_t_i16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x6e,0x7d]
+v_cmp_gt_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x88,0x7d]
 
-v_cmpx_t_i16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x6e,0x7d]
+v_cmp_gt_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x88,0x7d]
 
-v_cmpx_t_i16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x6e,0x7d]
+v_cmp_gt_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x88,0x7d]
 
-v_cmpx_t_i16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x6e,0x7d]
+v_cmp_gt_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x88,0x7d]
 
-v_cmpx_t_i16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x6e,0x7d]
+v_cmp_gt_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x88,0x7d]
 
-v_cmpx_t_i16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x6e,0x7d]
+v_cmp_gt_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x88,0x7d]
 
-v_cmpx_t_i16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x6e,0x7d]
+v_cmp_gt_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x88,0x7d]
 
-v_cmpx_t_i16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x6e,0x7d]
+v_cmp_gt_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x88,0x7d]
 
-v_cmpx_t_i16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x6e,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmp_gt_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x88,0x7d]
 
-v_cmpx_t_i16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x6e,0x7d,0x56,0x34,0x00,0x00]
+v_cmp_gt_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x88,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_t_i16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x6e,0x7d]
+v_cmp_gt_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x88,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_t_i16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x6e,0x7d]
+v_cmp_gt_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x88,0x7d]
 
-v_cmpx_t_i16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x6f,0x7d]
+v_cmp_gt_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x88,0x7d]
 
-v_cmpx_t_i16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xb7,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x89,0x7d]
 
-v_cmpx_t_i16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xb7,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xb7,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xc4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xb7,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xc4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xb7,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xc4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xb7,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xc4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xb7,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xc4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xb7,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xc4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xb7,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xc4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xb7,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_gt_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xc4,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_t_i16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xb7,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_gt_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xc4,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_t_i16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xb7,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_gt_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xc4,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_t_i16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xb7,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_gt_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_t_i16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xb7,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_gt_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xc4,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_t_i16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xb7,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_gt_i32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_t_i16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xb7,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_gt_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_f_u16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x70,0x7d]
+v_cmp_gt_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_f_u16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x70,0x7d]
+v_cmp_gt_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_f_u16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x70,0x7d]
+v_cmp_gt_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_f_u16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x70,0x7d]
+v_cmp_gt_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_f_u16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x70,0x7d]
+v_cmp_gt_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_f_u16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x70,0x7d]
+v_cmp_gt_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_f_u16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x70,0x7d]
+v_cmp_gt_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_f_u16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x70,0x7d]
+v_cmp_gt_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_f_u16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x70,0x7d]
+v_cmp_gt_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_f_u16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x70,0x7d]
+v_cmp_gt_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_f_u16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x70,0x7d]
+v_cmp_gt_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_f_u16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x70,0x7d]
+v_cmp_gt_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_f_u16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x70,0x7d]
+v_cmp_gt_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_f_u16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x70,0x7d]
+v_cmp_gt_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_f_u16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x70,0x7d]
+v_cmp_gt_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_f_u16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x70,0x7d]
+v_cmp_gt_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_f_u16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x70,0x7d]
+v_cmp_gt_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xc4,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_f_u16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x70,0x7d]
+v_cmp_ne_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x8a,0x7d]
 
-v_cmpx_f_u16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x70,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmp_ne_i32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x8a,0x7d]
 
-v_cmpx_f_u16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x70,0x7d,0x56,0x34,0x00,0x00]
+v_cmp_ne_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x8a,0x7d]
 
-v_cmpx_f_u16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x70,0x7d]
+v_cmp_ne_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x8a,0x7d]
 
-v_cmpx_f_u16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x70,0x7d]
+v_cmp_ne_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x8a,0x7d]
 
-v_cmpx_f_u16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x71,0x7d]
+v_cmp_ne_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x8a,0x7d]
 
-v_cmpx_f_u16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xb8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x8a,0x7d]
 
-v_cmpx_f_u16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xb8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x8a,0x7d]
 
-v_cmpx_f_u16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xb8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x8a,0x7d]
 
-v_cmpx_f_u16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xb8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x8a,0x7d]
 
-v_cmpx_f_u16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xb8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x8a,0x7d]
 
-v_cmpx_f_u16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xb8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x8a,0x7d]
 
-v_cmpx_f_u16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xb8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x8a,0x7d]
 
-v_cmpx_f_u16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xb8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x8a,0x7d]
 
-v_cmpx_f_u16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xb8,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x8a,0x7d]
 
-v_cmpx_f_u16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xb8,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_ne_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x8a,0x7d]
 
-v_cmpx_f_u16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xb8,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ne_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x8a,0x7d]
 
-v_cmpx_f_u16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xb8,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_ne_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x8a,0x7d]
 
-v_cmpx_f_u16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xb8,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ne_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x8a,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_f_u16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xb8,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_ne_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x8a,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_f_u16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xb8,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ne_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x8a,0x7d]
 
-v_cmpx_f_u16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xb8,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_ne_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x8a,0x7d]
 
-v_cmpx_lt_u16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x72,0x7d]
+v_cmp_ne_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x8b,0x7d]
 
-v_cmpx_lt_u16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x72,0x7d]
+v_cmp_ne_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x72,0x7d]
+v_cmp_ne_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xc5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x72,0x7d]
+v_cmp_ne_i32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xc5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x72,0x7d]
+v_cmp_ne_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xc5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x72,0x7d]
+v_cmp_ne_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xc5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x72,0x7d]
+v_cmp_ne_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xc5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x72,0x7d]
+v_cmp_ne_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xc5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x72,0x7d]
+v_cmp_ne_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xc5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x72,0x7d]
+v_cmp_ne_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xc5,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x72,0x7d]
+v_cmp_ne_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xc5,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x72,0x7d]
+v_cmp_ne_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xc5,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x72,0x7d]
+v_cmp_ne_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x72,0x7d]
+v_cmp_ne_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xc5,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x72,0x7d]
+v_cmp_ne_i32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x72,0x7d]
+v_cmp_ne_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x72,0x7d]
+v_cmp_ne_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x72,0x7d]
+v_cmp_ne_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x72,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmp_ne_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x72,0x7d,0x56,0x34,0x00,0x00]
+v_cmp_ne_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x72,0x7d]
+v_cmp_ne_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x72,0x7d]
+v_cmp_ne_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_lt_u16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x73,0x7d]
+v_cmp_ne_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_lt_u16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xb9,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_lt_u16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xb9,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_lt_u16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xb9,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_lt_u16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xb9,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_lt_u16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xb9,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_lt_u16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xb9,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_lt_u16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xb9,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_lt_u16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xb9,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_lt_u16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xb9,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ne_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_lt_u16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xb9,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_ne_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xc5,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_lt_u16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xb9,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ge_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x8c,0x7d]
 
-v_cmpx_lt_u16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xb9,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_ge_i32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x8c,0x7d]
 
-v_cmpx_lt_u16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xb9,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ge_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x8c,0x7d]
 
-v_cmpx_lt_u16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xb9,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_ge_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x8c,0x7d]
 
-v_cmpx_lt_u16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xb9,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ge_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x8c,0x7d]
 
-v_cmpx_lt_u16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xb9,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_ge_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x8c,0x7d]
 
-v_cmpx_eq_u16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x74,0x7d]
+v_cmp_ge_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x8c,0x7d]
 
-v_cmpx_eq_u16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x74,0x7d]
+v_cmp_ge_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x8c,0x7d]
 
-v_cmpx_eq_u16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x74,0x7d]
+v_cmp_ge_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x8c,0x7d]
 
-v_cmpx_eq_u16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x74,0x7d]
+v_cmp_ge_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x8c,0x7d]
 
-v_cmpx_eq_u16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x74,0x7d]
+v_cmp_ge_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x8c,0x7d]
 
-v_cmpx_eq_u16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x74,0x7d]
+v_cmp_ge_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x8c,0x7d]
 
-v_cmpx_eq_u16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x74,0x7d]
+v_cmp_ge_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x8c,0x7d]
 
-v_cmpx_eq_u16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x74,0x7d]
+v_cmp_ge_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x8c,0x7d]
 
-v_cmpx_eq_u16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x74,0x7d]
+v_cmp_ge_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x8c,0x7d]
 
-v_cmpx_eq_u16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x74,0x7d]
+v_cmp_ge_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x8c,0x7d]
 
-v_cmpx_eq_u16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x74,0x7d]
+v_cmp_ge_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x8c,0x7d]
 
-v_cmpx_eq_u16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x74,0x7d]
+v_cmp_ge_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x8c,0x7d]
 
-v_cmpx_eq_u16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x74,0x7d]
+v_cmp_ge_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x8c,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_eq_u16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x74,0x7d]
+v_cmp_ge_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x8c,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_eq_u16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x74,0x7d]
+v_cmp_ge_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x8c,0x7d]
 
-v_cmpx_eq_u16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x74,0x7d]
+v_cmp_ge_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x8c,0x7d]
 
-v_cmpx_eq_u16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x74,0x7d]
+v_cmp_ge_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x8d,0x7d]
 
-v_cmpx_eq_u16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x74,0x7d]
+v_cmp_ge_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_u16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x74,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmp_ge_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xc6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_u16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x74,0x7d,0x56,0x34,0x00,0x00]
+v_cmp_ge_i32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xc6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_u16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x74,0x7d]
+v_cmp_ge_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xc6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_u16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x74,0x7d]
+v_cmp_ge_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xc6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_u16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x75,0x7d]
+v_cmp_ge_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xc6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_u16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xba,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xc6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_u16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xba,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xc6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_u16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xba,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xc6,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_eq_u16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xba,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xc6,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_eq_u16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xba,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xc6,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_eq_u16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xba,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_eq_u16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xba,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xc6,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_eq_u16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xba,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_eq_u16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xba,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_eq_u16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xba,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_eq_u16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xba,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_eq_u16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xba,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_eq_u16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xba,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_eq_u16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xba,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_eq_u16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xba,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_eq_u16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xba,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_ge_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_le_u16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x76,0x7d]
+v_cmp_ge_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_le_u16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x76,0x7d]
+v_cmp_ge_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_le_u16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x76,0x7d]
+v_cmp_ge_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_le_u16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x76,0x7d]
+v_cmp_ge_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_le_u16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x76,0x7d]
+v_cmp_ge_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_le_u16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x76,0x7d]
+v_cmp_ge_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_le_u16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x76,0x7d]
+v_cmp_ge_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_le_u16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x76,0x7d]
+v_cmp_ge_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_le_u16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x76,0x7d]
+v_cmp_ge_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_le_u16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x76,0x7d]
+v_cmp_ge_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xc6,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_le_u16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x76,0x7d]
+v_cmp_t_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x8e,0x7d]
 
-v_cmpx_le_u16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x76,0x7d]
+v_cmp_t_i32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x8e,0x7d]
 
-v_cmpx_le_u16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x76,0x7d]
+v_cmp_t_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x8e,0x7d]
 
-v_cmpx_le_u16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x76,0x7d]
+v_cmp_t_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x8e,0x7d]
 
-v_cmpx_le_u16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x76,0x7d]
+v_cmp_t_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x8e,0x7d]
 
-v_cmpx_le_u16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x76,0x7d]
+v_cmp_t_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x8e,0x7d]
 
-v_cmpx_le_u16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x76,0x7d]
+v_cmp_t_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x8e,0x7d]
 
-v_cmpx_le_u16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x76,0x7d]
+v_cmp_t_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x8e,0x7d]
 
-v_cmpx_le_u16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x76,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmp_t_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x8e,0x7d]
 
-v_cmpx_le_u16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x76,0x7d,0x56,0x34,0x00,0x00]
+v_cmp_t_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x8e,0x7d]
 
-v_cmpx_le_u16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x76,0x7d]
+v_cmp_t_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x8e,0x7d]
 
-v_cmpx_le_u16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x76,0x7d]
+v_cmp_t_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x8e,0x7d]
 
-v_cmpx_le_u16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x77,0x7d]
+v_cmp_t_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x8e,0x7d]
 
-v_cmpx_le_u16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xbb,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x8e,0x7d]
 
-v_cmpx_le_u16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xbb,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x8e,0x7d]
 
-v_cmpx_le_u16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xbb,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x8e,0x7d]
 
-v_cmpx_le_u16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xbb,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x8e,0x7d]
 
-v_cmpx_le_u16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xbb,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x8e,0x7d]
 
-v_cmpx_le_u16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xbb,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x8e,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_le_u16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xbb,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x8e,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_le_u16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xbb,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x8e,0x7d]
 
-v_cmpx_le_u16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xbb,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_t_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x8e,0x7d]
 
-v_cmpx_le_u16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xbb,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_t_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x8f,0x7d]
 
-v_cmpx_le_u16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xbb,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_t_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_u16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xbb,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_t_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xc7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_u16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xbb,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_t_i32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xc7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_u16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xbb,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_t_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xc7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_u16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xbb,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_t_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xc7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_u16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xbb,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_t_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xc7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x78,0x7d]
+v_cmp_t_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xc7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x78,0x7d]
+v_cmp_t_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xc7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x78,0x7d]
+v_cmp_t_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xc7,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x78,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xc7,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x78,0x7d]
+v_cmp_t_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xc7,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x78,0x7d]
+v_cmp_t_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x78,0x7d]
+v_cmp_t_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xc7,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x78,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x78,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x78,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x78,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x78,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x78,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x78,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x78,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x78,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x78,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x78,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x78,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmp_t_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x78,0x7d,0x56,0x34,0x00,0x00]
+v_cmp_t_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_gt_u16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x78,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_gt_u16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x78,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_gt_u16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x79,0x7d]
+v_cmp_t_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_gt_u16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xbc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_gt_u16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xbc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_gt_u16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xbc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xc7,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_gt_u16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xbc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x90,0x7d]
 
-v_cmpx_gt_u16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xbc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x90,0x7d]
 
-v_cmpx_gt_u16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xbc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x90,0x7d]
 
-v_cmpx_gt_u16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xbc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x90,0x7d]
 
-v_cmpx_gt_u16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xbc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x90,0x7d]
 
-v_cmpx_gt_u16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xbc,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_f_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x90,0x7d]
 
-v_cmpx_gt_u16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xbc,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_f_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x90,0x7d]
 
-v_cmpx_gt_u16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xbc,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_f_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x90,0x7d]
 
-v_cmpx_gt_u16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xbc,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_f_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x90,0x7d]
 
-v_cmpx_gt_u16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xbc,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_f_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x90,0x7d]
 
-v_cmpx_gt_u16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xbc,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_f_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x90,0x7d]
 
-v_cmpx_gt_u16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xbc,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_f_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x90,0x7d]
 
-v_cmpx_gt_u16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xbc,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_f_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x90,0x7d]
 
-v_cmpx_ne_u16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x7a,0x7d]
+v_cmp_f_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x90,0x7d]
 
-v_cmpx_ne_u16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x7a,0x7d]
+v_cmp_f_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x90,0x7d]
 
-v_cmpx_ne_u16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x7a,0x7d]
+v_cmp_f_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x90,0x7d]
 
-v_cmpx_ne_u16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x7a,0x7d]
+v_cmp_f_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x90,0x7d]
 
-v_cmpx_ne_u16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x7a,0x7d]
+v_cmp_f_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x90,0x7d]
 
-v_cmpx_ne_u16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x7a,0x7d]
+v_cmp_f_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x90,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ne_u16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x7a,0x7d]
+v_cmp_f_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x90,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ne_u16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x7a,0x7d]
+v_cmp_f_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x90,0x7d]
 
-v_cmpx_ne_u16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x7a,0x7d]
+v_cmp_f_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x90,0x7d]
 
-v_cmpx_ne_u16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x7a,0x7d]
+v_cmp_f_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x91,0x7d]
 
-v_cmpx_ne_u16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x7a,0x7d]
+v_cmp_f_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_u16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x7a,0x7d]
+v_cmp_f_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xc8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_u16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x7a,0x7d]
+v_cmp_f_u32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xc8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_u16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x7a,0x7d]
+v_cmp_f_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xc8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_u16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x7a,0x7d]
+v_cmp_f_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xc8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_u16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x7a,0x7d]
+v_cmp_f_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xc8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_u16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x7a,0x7d]
+v_cmp_f_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xc8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_u16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x7a,0x7d]
+v_cmp_f_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xc8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_u16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x7a,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmp_f_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xc8,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_ne_u16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x7a,0x7d,0x56,0x34,0x00,0x00]
+v_cmp_f_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xc8,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_ne_u16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x7a,0x7d]
+v_cmp_f_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xc8,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_ne_u16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x7a,0x7d]
+v_cmp_f_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_ne_u16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x7b,0x7d]
+v_cmp_f_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xc8,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_ne_u16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xbd,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_ne_u16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xbd,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_ne_u16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xbd,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_ne_u16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xbd,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_ne_u16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xbd,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_ne_u16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xbd,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_ne_u16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xbd,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_ne_u16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xbd,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_ne_u16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xbd,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_ne_u16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xbd,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_ne_u16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xbd,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_ne_u16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xbd,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_ne_u16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xbd,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_ne_u16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xbd,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_ne_u16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xbd,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_ne_u16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xbd,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_f_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_ge_u16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x7c,0x7d]
+v_cmp_f_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_ge_u16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x7c,0x7d]
+v_cmp_f_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_ge_u16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x7c,0x7d]
+v_cmp_f_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xc8,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_ge_u16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x7c,0x7d]
+v_cmp_lt_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x92,0x7d]
 
-v_cmpx_ge_u16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x7c,0x7d]
+v_cmp_lt_u32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x92,0x7d]
 
-v_cmpx_ge_u16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x7c,0x7d]
+v_cmp_lt_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x92,0x7d]
 
-v_cmpx_ge_u16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x7c,0x7d]
+v_cmp_lt_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x92,0x7d]
 
-v_cmpx_ge_u16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x7c,0x7d]
+v_cmp_lt_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x92,0x7d]
 
-v_cmpx_ge_u16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x7c,0x7d]
+v_cmp_lt_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x92,0x7d]
 
-v_cmpx_ge_u16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x7c,0x7d]
+v_cmp_lt_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x92,0x7d]
 
-v_cmpx_ge_u16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x7c,0x7d]
+v_cmp_lt_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x92,0x7d]
 
-v_cmpx_ge_u16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x7c,0x7d]
+v_cmp_lt_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x92,0x7d]
 
-v_cmpx_ge_u16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x7c,0x7d]
+v_cmp_lt_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x92,0x7d]
 
-v_cmpx_ge_u16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x7c,0x7d]
+v_cmp_lt_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x92,0x7d]
 
-v_cmpx_ge_u16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x7c,0x7d]
+v_cmp_lt_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x92,0x7d]
 
-v_cmpx_ge_u16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x7c,0x7d]
+v_cmp_lt_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x92,0x7d]
 
-v_cmpx_ge_u16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x7c,0x7d]
+v_cmp_lt_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x92,0x7d]
 
-v_cmpx_ge_u16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x7c,0x7d]
+v_cmp_lt_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x92,0x7d]
 
-v_cmpx_ge_u16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x7c,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmp_lt_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x92,0x7d]
 
-v_cmpx_ge_u16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x7c,0x7d,0x56,0x34,0x00,0x00]
+v_cmp_lt_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x92,0x7d]
 
-v_cmpx_ge_u16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x7c,0x7d]
+v_cmp_lt_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x92,0x7d]
 
-v_cmpx_ge_u16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x7c,0x7d]
+v_cmp_lt_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x92,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ge_u16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x7d,0x7d]
+v_cmp_lt_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x92,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ge_u16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xbe,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x92,0x7d]
 
-v_cmpx_ge_u16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xbe,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x92,0x7d]
 
-v_cmpx_ge_u16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xbe,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x93,0x7d]
 
-v_cmpx_ge_u16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xbe,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_u16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xbe,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xc9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_u16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xbe,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xc9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_u16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xbe,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xc9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_u16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xbe,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xc9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_u16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xbe,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_lt_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xc9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_u16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xbe,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_lt_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xc9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_u16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xbe,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_lt_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xc9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_u16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xbe,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_lt_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xc9,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_ge_u16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xbe,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_lt_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xc9,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_ge_u16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xbe,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_lt_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xc9,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_ge_u16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xbe,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_lt_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_ge_u16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xbe,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_lt_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xc9,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_t_u16 vcc, s0, v0
-// CHECK: [0x00,0x00,0x7e,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_t_u16 vcc, s101, v0
-// CHECK: [0x65,0x00,0x7e,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_t_u16 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x7e,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_t_u16 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x7e,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_t_u16 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x7e,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_t_u16 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x7e,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_t_u16 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x7e,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_t_u16 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x7e,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_t_u16 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x7e,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_t_u16 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x7e,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_t_u16 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x7e,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_t_u16 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x7e,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_t_u16 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x7e,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_t_u16 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x7e,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_t_u16 vcc, 0, v0
-// CHECK: [0x80,0x00,0x7e,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_t_u16 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x7e,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_t_u16 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x7e,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_t_u16 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x7e,0x7d]
+v_cmp_lt_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_t_u16 vcc, 0xfe0b, v0
-// CHECK: [0xff,0x00,0x7e,0x7d,0x0b,0xfe,0x00,0x00]
+v_cmp_lt_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xc9,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_t_u16 vcc, 0x3456, v0
-// CHECK: [0xff,0x00,0x7e,0x7d,0x56,0x34,0x00,0x00]
+v_cmp_eq_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x94,0x7d]
 
-v_cmpx_t_u16 vcc, v0, v0
-// CHECK: [0x00,0x01,0x7e,0x7d]
+v_cmp_eq_u32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x94,0x7d]
 
-v_cmpx_t_u16 vcc, v255, v0
-// CHECK: [0xff,0x01,0x7e,0x7d]
+v_cmp_eq_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x94,0x7d]
 
-v_cmpx_t_u16 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x7f,0x7d]
+v_cmp_eq_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x94,0x7d]
 
-v_cmpx_t_u16_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xbf,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x94,0x7d]
 
-v_cmpx_t_u16_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xbf,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x94,0x7d]
 
-v_cmpx_t_u16_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xbf,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x94,0x7d]
 
-v_cmpx_t_u16_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xbf,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x94,0x7d]
 
-v_cmpx_t_u16_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xbf,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x94,0x7d]
 
-v_cmpx_t_u16_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xbf,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x94,0x7d]
 
-v_cmpx_t_u16_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xbf,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x94,0x7d]
 
-v_cmpx_t_u16_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xbf,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x94,0x7d]
 
-v_cmpx_t_u16_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xbf,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_eq_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x94,0x7d]
 
-v_cmpx_t_u16_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xbf,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_eq_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x94,0x7d]
 
-v_cmpx_t_u16_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xbf,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_eq_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x94,0x7d]
 
-v_cmpx_t_u16_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xbf,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_eq_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x94,0x7d]
 
-v_cmpx_t_u16_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xbf,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_eq_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x94,0x7d]
 
-v_cmpx_t_u16_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xbf,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_eq_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x94,0x7d]
 
-v_cmpx_t_u16_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xbf,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_eq_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x94,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_t_u16_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xbf,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_eq_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x94,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_f_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x80,0x7d]
+v_cmp_eq_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x94,0x7d]
 
-v_cmp_f_i32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x80,0x7d]
+v_cmp_eq_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x94,0x7d]
 
-v_cmp_f_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x80,0x7d]
+v_cmp_eq_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x95,0x7d]
 
-v_cmp_f_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x80,0x7d]
+v_cmp_eq_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x80,0x7d]
+v_cmp_eq_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xca,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x80,0x7d]
+v_cmp_eq_u32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xca,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x80,0x7d]
+v_cmp_eq_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xca,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x80,0x7d]
+v_cmp_eq_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xca,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x80,0x7d]
+v_cmp_eq_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xca,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x80,0x7d]
+v_cmp_eq_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xca,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x80,0x7d]
+v_cmp_eq_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xca,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_f_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x80,0x7d]
+v_cmp_eq_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xca,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_f_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x80,0x7d]
+v_cmp_eq_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xca,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_f_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x80,0x7d]
+v_cmp_eq_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xca,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_f_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x80,0x7d]
+v_cmp_eq_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xca,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_f_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x80,0x7d]
+v_cmp_eq_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xca,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_f_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x80,0x7d]
+v_cmp_eq_u32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_f_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x80,0x7d]
+v_cmp_eq_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_f_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x80,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_eq_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_f_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x80,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_eq_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_f_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x80,0x7d]
+v_cmp_eq_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_f_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x80,0x7d]
+v_cmp_eq_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_f_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x81,0x7d]
+v_cmp_eq_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_f_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xc0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_f_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xc0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_f_i32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xc0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_f_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xc0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_f_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xc0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_f_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xc0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_f_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xc0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_f_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xc0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_f_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xc0,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_f_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xc0,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_f_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xc0,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_f_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xc0,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_eq_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xca,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_f_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xc0,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_le_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x96,0x7d]
 
-v_cmp_f_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xc0,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_le_u32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x96,0x7d]
 
-v_cmp_f_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xc0,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_le_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x96,0x7d]
 
-v_cmp_f_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xc0,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_le_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x96,0x7d]
 
-v_cmp_f_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xc0,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_le_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x96,0x7d]
 
-v_cmp_f_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xc0,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_le_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x96,0x7d]
 
-v_cmp_f_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xc0,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_le_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x96,0x7d]
 
-v_cmp_f_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xc0,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_le_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x96,0x7d]
 
-v_cmp_lt_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x82,0x7d]
+v_cmp_le_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x96,0x7d]
 
-v_cmp_lt_i32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x82,0x7d]
+v_cmp_le_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x96,0x7d]
 
-v_cmp_lt_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x82,0x7d]
+v_cmp_le_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x96,0x7d]
 
-v_cmp_lt_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x82,0x7d]
+v_cmp_le_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x96,0x7d]
 
-v_cmp_lt_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x82,0x7d]
+v_cmp_le_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x96,0x7d]
 
-v_cmp_lt_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x82,0x7d]
+v_cmp_le_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x96,0x7d]
 
-v_cmp_lt_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x82,0x7d]
+v_cmp_le_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x96,0x7d]
 
-v_cmp_lt_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x82,0x7d]
+v_cmp_le_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x96,0x7d]
 
-v_cmp_lt_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x82,0x7d]
+v_cmp_le_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x96,0x7d]
 
-v_cmp_lt_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x82,0x7d]
+v_cmp_le_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x96,0x7d]
 
-v_cmp_lt_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x82,0x7d]
+v_cmp_le_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x96,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_lt_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x82,0x7d]
+v_cmp_le_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x96,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_lt_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x82,0x7d]
+v_cmp_le_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x96,0x7d]
 
-v_cmp_lt_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x82,0x7d]
+v_cmp_le_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x96,0x7d]
 
-v_cmp_lt_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x82,0x7d]
+v_cmp_le_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x97,0x7d]
 
-v_cmp_lt_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x82,0x7d]
+v_cmp_le_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x82,0x7d]
+v_cmp_le_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xcb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x82,0x7d]
+v_cmp_le_u32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xcb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x82,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_le_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xcb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x82,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_le_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xcb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x82,0x7d]
+v_cmp_le_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xcb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x82,0x7d]
+v_cmp_le_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xcb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x83,0x7d]
+v_cmp_le_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xcb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xc1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xcb,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xc1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xcb,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xc1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xcb,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_lt_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xc1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_lt_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xc1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xcb,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_lt_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xc1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_lt_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xc1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_lt_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xc1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xc1,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xc1,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xc1,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xc1,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xc1,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xc1,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xc1,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xc1,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xc1,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xc1,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xc1,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_lt_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xc1,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_le_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_eq_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x84,0x7d]
+v_cmp_le_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_eq_i32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x84,0x7d]
+v_cmp_le_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_eq_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x84,0x7d]
+v_cmp_le_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_eq_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x84,0x7d]
+v_cmp_le_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xcb,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_eq_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x84,0x7d]
+v_cmp_gt_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x98,0x7d]
 
-v_cmp_eq_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x84,0x7d]
+v_cmp_gt_u32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x98,0x7d]
 
-v_cmp_eq_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x84,0x7d]
+v_cmp_gt_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x98,0x7d]
 
-v_cmp_eq_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x84,0x7d]
+v_cmp_gt_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x98,0x7d]
 
-v_cmp_eq_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x84,0x7d]
+v_cmp_gt_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x98,0x7d]
 
-v_cmp_eq_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x84,0x7d]
+v_cmp_gt_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x98,0x7d]
 
-v_cmp_eq_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x84,0x7d]
+v_cmp_gt_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x98,0x7d]
 
-v_cmp_eq_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x84,0x7d]
+v_cmp_gt_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x98,0x7d]
 
-v_cmp_eq_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x84,0x7d]
+v_cmp_gt_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x98,0x7d]
 
-v_cmp_eq_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x84,0x7d]
+v_cmp_gt_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x98,0x7d]
 
-v_cmp_eq_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x84,0x7d]
+v_cmp_gt_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x98,0x7d]
 
-v_cmp_eq_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x84,0x7d]
+v_cmp_gt_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x98,0x7d]
 
-v_cmp_eq_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x84,0x7d]
+v_cmp_gt_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x98,0x7d]
 
-v_cmp_eq_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x84,0x7d]
+v_cmp_gt_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x98,0x7d]
 
-v_cmp_eq_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x84,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_gt_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x98,0x7d]
 
-v_cmp_eq_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x84,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_gt_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x98,0x7d]
 
-v_cmp_eq_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x84,0x7d]
+v_cmp_gt_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x98,0x7d]
 
-v_cmp_eq_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x84,0x7d]
+v_cmp_gt_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x98,0x7d]
 
-v_cmp_eq_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x85,0x7d]
+v_cmp_gt_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x98,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_eq_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xc2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x98,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_eq_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xc2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x98,0x7d]
 
-v_cmp_eq_i32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xc2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x98,0x7d]
 
-v_cmp_eq_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xc2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x99,0x7d]
 
-v_cmp_eq_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xc2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xc2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xcc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xc2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xcc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xc2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xcc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xc2,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xcc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xc2,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xcc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xc2,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xcc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xc2,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_gt_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xcc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xc2,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_gt_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xcc,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_eq_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xc2,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xcc,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_eq_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xc2,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_gt_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xcc,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_eq_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xc2,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_gt_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_eq_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xc2,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_gt_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xcc,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_eq_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xc2,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_eq_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xc2,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_eq_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xc2,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_gt_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_le_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x86,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_le_i32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x86,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_le_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x86,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_le_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x86,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_le_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x86,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_le_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x86,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_le_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x86,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_le_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x86,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_le_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x86,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_le_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x86,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_le_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x86,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_le_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x86,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_le_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x86,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_le_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x86,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_le_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x86,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_le_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x86,0x7d]
+v_cmp_gt_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xcc,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_le_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x86,0x7d]
+v_cmp_ne_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x9a,0x7d]
 
-v_cmp_le_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x86,0x7d]
+v_cmp_ne_u32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x9a,0x7d]
 
-v_cmp_le_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x86,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_ne_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x9a,0x7d]
 
-v_cmp_le_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x86,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_ne_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x9a,0x7d]
 
-v_cmp_le_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x86,0x7d]
+v_cmp_ne_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x9a,0x7d]
 
-v_cmp_le_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x86,0x7d]
+v_cmp_ne_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x9a,0x7d]
 
-v_cmp_le_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x87,0x7d]
+v_cmp_ne_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x9a,0x7d]
 
-v_cmp_le_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xc3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x9a,0x7d]
 
-v_cmp_le_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xc3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x9a,0x7d]
 
-v_cmp_le_i32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xc3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x9a,0x7d]
 
-v_cmp_le_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xc3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x9a,0x7d]
 
-v_cmp_le_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xc3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x9a,0x7d]
 
-v_cmp_le_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xc3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x9a,0x7d]
 
-v_cmp_le_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xc3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x9a,0x7d]
 
-v_cmp_le_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xc3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x9a,0x7d]
 
-v_cmp_le_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xc3,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ne_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x9a,0x7d]
 
-v_cmp_le_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xc3,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_ne_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x9a,0x7d]
 
-v_cmp_le_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xc3,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_ne_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x9a,0x7d]
 
-v_cmp_le_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xc3,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_ne_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x9a,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_le_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xc3,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ne_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x9a,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_le_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xc3,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_ne_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x9a,0x7d]
 
-v_cmp_le_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xc3,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ne_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x9a,0x7d]
 
-v_cmp_le_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xc3,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_ne_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x9b,0x7d]
 
-v_cmp_le_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xc3,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_ne_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xc3,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_ne_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xcd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xc3,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ne_u32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xcd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xc3,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_ne_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xcd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x88,0x7d]
+v_cmp_ne_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xcd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_i32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x88,0x7d]
+v_cmp_ne_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xcd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x88,0x7d]
+v_cmp_ne_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xcd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x88,0x7d]
+v_cmp_ne_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xcd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x88,0x7d]
+v_cmp_ne_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xcd,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_gt_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x88,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xcd,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_gt_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x88,0x7d]
+v_cmp_ne_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xcd,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_gt_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x88,0x7d]
+v_cmp_ne_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_gt_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x88,0x7d]
+v_cmp_ne_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xcd,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_gt_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x88,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_gt_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x88,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_gt_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x88,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_gt_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x88,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_gt_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x88,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_gt_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x88,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_gt_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x88,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_gt_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x88,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_gt_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x88,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_gt_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x88,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_ne_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_gt_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x88,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_ne_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_gt_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x88,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_gt_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x88,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_gt_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x89,0x7d]
+v_cmp_ne_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_gt_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xc4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_gt_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xc4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_gt_i32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xc4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_gt_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xc4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_gt_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xc4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xcd,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_gt_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xc4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x9c,0x7d]
 
-v_cmp_gt_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xc4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x9c,0x7d]
 
-v_cmp_gt_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xc4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x9c,0x7d]
 
-v_cmp_gt_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xc4,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ge_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x9c,0x7d]
 
-v_cmp_gt_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xc4,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_ge_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x9c,0x7d]
 
-v_cmp_gt_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xc4,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_ge_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x9c,0x7d]
 
-v_cmp_gt_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xc4,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_ge_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x9c,0x7d]
 
-v_cmp_gt_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xc4,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ge_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x9c,0x7d]
 
-v_cmp_gt_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xc4,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_ge_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x9c,0x7d]
 
-v_cmp_gt_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xc4,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ge_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x9c,0x7d]
 
-v_cmp_gt_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xc4,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_ge_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x9c,0x7d]
 
-v_cmp_gt_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xc4,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_ge_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x9c,0x7d]
 
-v_cmp_gt_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xc4,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_ge_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x9c,0x7d]
 
-v_cmp_gt_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xc4,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ge_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x9c,0x7d]
 
-v_cmp_gt_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xc4,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_ge_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x9c,0x7d]
 
-v_cmp_ne_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x8a,0x7d]
+v_cmp_ge_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x9c,0x7d]
 
-v_cmp_ne_i32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x8a,0x7d]
+v_cmp_ge_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x9c,0x7d]
 
-v_cmp_ne_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x8a,0x7d]
+v_cmp_ge_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x9c,0x7d]
 
-v_cmp_ne_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x8a,0x7d]
+v_cmp_ge_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x9c,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_ne_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x8a,0x7d]
+v_cmp_ge_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x9c,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_ne_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x8a,0x7d]
+v_cmp_ge_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x9c,0x7d]
 
-v_cmp_ne_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x8a,0x7d]
+v_cmp_ge_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x9c,0x7d]
 
-v_cmp_ne_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x8a,0x7d]
+v_cmp_ge_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x9d,0x7d]
 
-v_cmp_ne_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x8a,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x8a,0x7d]
+v_cmp_ge_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xce,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x8a,0x7d]
+v_cmp_ge_u32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xce,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x8a,0x7d]
+v_cmp_ge_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xce,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x8a,0x7d]
+v_cmp_ge_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xce,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x8a,0x7d]
+v_cmp_ge_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xce,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x8a,0x7d]
+v_cmp_ge_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xce,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x8a,0x7d]
+v_cmp_ge_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xce,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x8a,0x7d]
+v_cmp_ge_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xce,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_ne_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x8a,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xce,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_ne_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x8a,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_ge_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xce,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_ne_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x8a,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_ge_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xce,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_ne_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x8a,0x7d]
+v_cmp_ge_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xce,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_ne_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x8a,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_ne_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x8b,0x7d]
+v_cmp_ge_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xc5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_ne_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xc5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_ne_i32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xc5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_ne_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xc5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_ne_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xc5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_ne_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xc5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_ne_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xc5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_ne_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xc5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xc5,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ge_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xc5,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_ge_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xc5,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_ge_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xc5,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_ge_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xc5,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ge_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xc5,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_ge_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xc5,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ge_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xc5,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_ge_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xc5,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_ge_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xce,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_ne_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xc5,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_t_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0x9e,0x7d]
 
-v_cmp_ne_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xc5,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_t_u32 vcc, s101, v2
+// CHECK: [0x65,0x04,0x9e,0x7d]
 
-v_cmp_ne_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xc5,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_t_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0x9e,0x7d]
 
-v_cmp_ge_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x8c,0x7d]
+v_cmp_t_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0x9e,0x7d]
 
-v_cmp_ge_i32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x8c,0x7d]
+v_cmp_t_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0x9e,0x7d]
 
-v_cmp_ge_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x8c,0x7d]
+v_cmp_t_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0x9e,0x7d]
 
-v_cmp_ge_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x8c,0x7d]
+v_cmp_t_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0x9e,0x7d]
 
-v_cmp_ge_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x8c,0x7d]
+v_cmp_t_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0x9e,0x7d]
 
-v_cmp_ge_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x8c,0x7d]
+v_cmp_t_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0x9e,0x7d]
 
-v_cmp_ge_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x8c,0x7d]
+v_cmp_t_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0x9e,0x7d]
 
-v_cmp_ge_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x8c,0x7d]
+v_cmp_t_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0x9e,0x7d]
 
-v_cmp_ge_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x8c,0x7d]
+v_cmp_t_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0x9e,0x7d]
 
-v_cmp_ge_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x8c,0x7d]
+v_cmp_t_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0x9e,0x7d]
 
-v_cmp_ge_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x8c,0x7d]
+v_cmp_t_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0x9e,0x7d]
 
-v_cmp_ge_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x8c,0x7d]
+v_cmp_t_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0x9e,0x7d]
 
-v_cmp_ge_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x8c,0x7d]
+v_cmp_t_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0x9e,0x7d]
 
-v_cmp_ge_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x8c,0x7d]
+v_cmp_t_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0x9e,0x7d]
 
-v_cmp_ge_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x8c,0x7d]
+v_cmp_t_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0x9e,0x7d]
 
-v_cmp_ge_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x8c,0x7d]
+v_cmp_t_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0x9e,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_ge_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x8c,0x7d]
+v_cmp_t_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0x9e,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_ge_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x8c,0x7d]
+v_cmp_t_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0x9e,0x7d]
 
-v_cmp_ge_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x8c,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_t_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0x9e,0x7d]
 
-v_cmp_ge_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x8c,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_t_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0x9f,0x7d]
 
-v_cmp_ge_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x8c,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x8c,0x7d]
+v_cmp_t_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xcf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x8d,0x7d]
+v_cmp_t_u32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xcf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xc6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xcf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xc6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xcf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_i32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xc6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xcf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xc6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xcf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xc6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xcf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xc6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xcf,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_ge_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xc6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xcf,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_ge_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xc6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xcf,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_ge_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xc6,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_t_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_ge_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xc6,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_t_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xcf,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_ge_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xc6,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_t_u32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_ge_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xc6,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_t_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_ge_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xc6,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_t_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_ge_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xc6,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_t_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_ge_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xc6,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_t_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_ge_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xc6,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_t_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_ge_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xc6,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_t_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_ge_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xc6,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_t_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_ge_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xc6,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_t_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_ge_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xc6,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_t_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_t_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x8e,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_t_i32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x8e,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_t_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x8e,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_t_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x8e,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_t_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x8e,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_t_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x8e,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_t_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x8e,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_t_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x8e,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_t_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x8e,0x7d]
+v_cmp_t_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xcf,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_t_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x8e,0x7d]
+v_cmpx_f_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa0,0x7d]
 
-v_cmp_t_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x8e,0x7d]
+v_cmpx_f_i32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xa0,0x7d]
 
-v_cmp_t_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x8e,0x7d]
+v_cmpx_f_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xa0,0x7d]
 
-v_cmp_t_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x8e,0x7d]
+v_cmpx_f_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xa0,0x7d]
 
-v_cmp_t_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x8e,0x7d]
+v_cmpx_f_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa0,0x7d]
 
-v_cmp_t_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x8e,0x7d]
+v_cmpx_f_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa0,0x7d]
 
-v_cmp_t_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x8e,0x7d]
+v_cmpx_f_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa0,0x7d]
 
-v_cmp_t_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x8e,0x7d]
+v_cmpx_f_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa0,0x7d]
 
-v_cmp_t_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x8e,0x7d]
+v_cmpx_f_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa0,0x7d]
 
-v_cmp_t_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x8e,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_f_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa0,0x7d]
 
-v_cmp_t_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x8e,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_f_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa0,0x7d]
 
-v_cmp_t_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x8e,0x7d]
+v_cmpx_f_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa0,0x7d]
 
-v_cmp_t_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x8e,0x7d]
+v_cmpx_f_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa0,0x7d]
 
-v_cmp_t_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x8f,0x7d]
+v_cmpx_f_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa0,0x7d]
 
-v_cmp_t_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xc7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa0,0x7d]
 
-v_cmp_t_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xc7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa0,0x7d]
 
-v_cmp_t_i32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xc7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa0,0x7d]
 
-v_cmp_t_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xc7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa0,0x7d]
 
-v_cmp_t_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xc7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa0,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_t_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xc7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa0,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_t_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xc7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa0,0x7d]
 
-v_cmp_t_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xc7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa0,0x7d]
 
-v_cmp_t_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xc7,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_f_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa1,0x7d]
 
-v_cmp_t_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xc7,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_f_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xc7,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_f_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xd0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xc7,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_f_i32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xd0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xc7,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_f_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xd0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xc7,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_f_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xd0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xc7,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_f_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xd0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xc7,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_f_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xd0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xc7,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_f_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xd0,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xc7,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_f_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xd0,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_t_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xc7,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_f_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xd0,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_t_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xc7,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_f_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xd0,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_f_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x90,0x7d]
+v_cmpx_f_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_f_u32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x90,0x7d]
+v_cmpx_f_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xd0,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_f_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x90,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_f_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x90,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_f_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x90,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_f_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x90,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_f_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x90,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_f_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x90,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_f_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x90,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_f_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x90,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_f_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x90,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_f_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x90,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_f_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x90,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_f_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x90,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_f_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x90,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_f_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x90,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_f_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x90,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_f_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x90,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_f_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x90,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_f_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_f_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x90,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_f_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_f_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x90,0x7d]
+v_cmpx_f_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xd0,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_f_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x90,0x7d]
+v_cmpx_lt_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa2,0x7d]
 
-v_cmp_f_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x91,0x7d]
+v_cmpx_lt_i32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xa2,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xc8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xa2,0x7d]
 
-v_cmp_f_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xc8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xa2,0x7d]
 
-v_cmp_f_u32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xc8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa2,0x7d]
 
-v_cmp_f_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xc8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa2,0x7d]
 
-v_cmp_f_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xc8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa2,0x7d]
 
-v_cmp_f_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xc8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa2,0x7d]
 
-v_cmp_f_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xc8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa2,0x7d]
 
-v_cmp_f_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xc8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa2,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xc8,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa2,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xc8,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa2,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xc8,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa2,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xc8,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_lt_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa2,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xc8,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_lt_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa2,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xc8,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_lt_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa2,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xc8,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_lt_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa2,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xc8,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_lt_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa2,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xc8,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_lt_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa2,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_f_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xc8,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_lt_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa2,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_f_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xc8,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_lt_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa2,0x7d]
 
-v_cmp_f_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xc8,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_lt_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa2,0x7d]
 
-v_cmp_lt_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x92,0x7d]
+v_cmpx_lt_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa3,0x7d]
 
-v_cmp_lt_u32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x92,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x92,0x7d]
+v_cmpx_lt_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xd1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x92,0x7d]
+v_cmpx_lt_i32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xd1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x92,0x7d]
+v_cmpx_lt_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xd1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x92,0x7d]
+v_cmpx_lt_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xd1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x92,0x7d]
+v_cmpx_lt_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xd1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x92,0x7d]
+v_cmpx_lt_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xd1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x92,0x7d]
+v_cmpx_lt_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xd1,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x92,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xd1,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x92,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xd1,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x92,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xd1,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_lt_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x92,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_lt_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x92,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xd1,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_lt_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x92,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_lt_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x92,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_lt_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x92,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_lt_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x92,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_lt_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x92,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_lt_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_lt_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x92,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_lt_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_lt_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x92,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_lt_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x92,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_lt_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x93,0x7d]
+v_cmpx_lt_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_lt_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xc9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_lt_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xc9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_lt_u32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xc9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_lt_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xc9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_lt_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xc9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_lt_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xc9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_lt_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xc9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_lt_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xc9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_lt_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xc9,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_lt_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xc9,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_lt_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xd1,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_lt_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xc9,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_eq_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa4,0x7d]
 
-v_cmp_lt_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xc9,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_eq_i32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xa4,0x7d]
 
-v_cmp_lt_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xc9,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_eq_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xa4,0x7d]
 
-v_cmp_lt_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xc9,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_eq_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xa4,0x7d]
 
-v_cmp_lt_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xc9,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_eq_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa4,0x7d]
 
-v_cmp_lt_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xc9,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_eq_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa4,0x7d]
 
-v_cmp_lt_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xc9,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_eq_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa4,0x7d]
 
-v_cmp_lt_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xc9,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_eq_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa4,0x7d]
 
-v_cmp_lt_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xc9,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_eq_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa4,0x7d]
 
-v_cmp_lt_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xc9,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_eq_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa4,0x7d]
 
-v_cmp_eq_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x94,0x7d]
+v_cmpx_eq_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa4,0x7d]
 
-v_cmp_eq_u32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x94,0x7d]
+v_cmpx_eq_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa4,0x7d]
 
-v_cmp_eq_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x94,0x7d]
+v_cmpx_eq_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa4,0x7d]
 
-v_cmp_eq_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x94,0x7d]
+v_cmpx_eq_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa4,0x7d]
 
-v_cmp_eq_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x94,0x7d]
+v_cmpx_eq_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa4,0x7d]
 
-v_cmp_eq_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x94,0x7d]
+v_cmpx_eq_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa4,0x7d]
 
-v_cmp_eq_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x94,0x7d]
+v_cmpx_eq_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa4,0x7d]
 
-v_cmp_eq_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x94,0x7d]
+v_cmpx_eq_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa4,0x7d]
 
-v_cmp_eq_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x94,0x7d]
+v_cmpx_eq_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa4,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_eq_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x94,0x7d]
+v_cmpx_eq_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa4,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_eq_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x94,0x7d]
+v_cmpx_eq_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa4,0x7d]
 
-v_cmp_eq_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x94,0x7d]
+v_cmpx_eq_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa4,0x7d]
 
-v_cmp_eq_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x94,0x7d]
+v_cmpx_eq_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa5,0x7d]
 
-v_cmp_eq_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x94,0x7d]
+v_cmpx_eq_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x94,0x7d]
+v_cmpx_eq_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xd2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x94,0x7d]
+v_cmpx_eq_i32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xd2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x94,0x7d]
+v_cmpx_eq_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xd2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x94,0x7d]
+v_cmpx_eq_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xd2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x94,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_eq_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xd2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x94,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_eq_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xd2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x94,0x7d]
+v_cmpx_eq_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xd2,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_eq_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x94,0x7d]
+v_cmpx_eq_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xd2,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_eq_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x95,0x7d]
+v_cmpx_eq_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xd2,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xca,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xd2,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xca,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xca,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xd2,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_eq_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xca,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_eq_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xca,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_eq_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xca,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_eq_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xca,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_eq_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xca,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xca,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xca,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xca,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xca,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xca,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xca,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xca,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xca,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xca,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xca,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xca,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_eq_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xca,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_eq_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_le_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x96,0x7d]
+v_cmpx_eq_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_le_u32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x96,0x7d]
+v_cmpx_eq_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xd2,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_le_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x96,0x7d]
+v_cmpx_le_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa6,0x7d]
 
-v_cmp_le_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x96,0x7d]
+v_cmpx_le_i32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xa6,0x7d]
 
-v_cmp_le_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x96,0x7d]
+v_cmpx_le_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xa6,0x7d]
 
-v_cmp_le_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x96,0x7d]
+v_cmpx_le_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xa6,0x7d]
 
-v_cmp_le_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x96,0x7d]
+v_cmpx_le_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa6,0x7d]
 
-v_cmp_le_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x96,0x7d]
+v_cmpx_le_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa6,0x7d]
 
-v_cmp_le_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x96,0x7d]
+v_cmpx_le_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa6,0x7d]
 
-v_cmp_le_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x96,0x7d]
+v_cmpx_le_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa6,0x7d]
 
-v_cmp_le_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x96,0x7d]
+v_cmpx_le_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa6,0x7d]
 
-v_cmp_le_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x96,0x7d]
+v_cmpx_le_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa6,0x7d]
 
-v_cmp_le_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x96,0x7d]
+v_cmpx_le_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa6,0x7d]
 
-v_cmp_le_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x96,0x7d]
+v_cmpx_le_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa6,0x7d]
 
-v_cmp_le_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x96,0x7d]
+v_cmpx_le_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa6,0x7d]
 
-v_cmp_le_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x96,0x7d]
+v_cmpx_le_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa6,0x7d]
 
-v_cmp_le_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x96,0x7d]
+v_cmpx_le_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa6,0x7d]
 
-v_cmp_le_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x96,0x7d]
+v_cmpx_le_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa6,0x7d]
 
-v_cmp_le_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x96,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_le_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa6,0x7d]
 
-v_cmp_le_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x96,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_le_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa6,0x7d]
 
-v_cmp_le_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x96,0x7d]
+v_cmpx_le_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa6,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_le_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x96,0x7d]
+v_cmpx_le_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa6,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_le_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x97,0x7d]
+v_cmpx_le_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa6,0x7d]
 
-v_cmp_le_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xcb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa6,0x7d]
 
-v_cmp_le_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xcb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa7,0x7d]
 
-v_cmp_le_u32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xcb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xcb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xd3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xcb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xd3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xcb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xd3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xcb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xd3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xcb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xd3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xcb,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_le_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xd3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xcb,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_le_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xd3,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xcb,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_le_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xd3,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xcb,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_le_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xd3,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xcb,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_le_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xd3,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xcb,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_le_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xcb,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_le_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xd3,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xcb,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_le_i32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xcb,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_le_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xcb,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_le_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xcb,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_le_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_le_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xcb,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_le_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_gt_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x98,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_gt_u32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x98,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_gt_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x98,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_gt_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x98,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_gt_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x98,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_gt_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x98,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_gt_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x98,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_gt_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x98,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_gt_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x98,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_gt_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x98,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_gt_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x98,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_gt_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x98,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_gt_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x98,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_gt_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x98,0x7d]
+v_cmpx_le_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xd3,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_gt_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x98,0x7d]
+v_cmpx_gt_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xa8,0x7d]
 
-v_cmp_gt_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x98,0x7d]
+v_cmpx_gt_i32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xa8,0x7d]
 
-v_cmp_gt_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x98,0x7d]
+v_cmpx_gt_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xa8,0x7d]
 
-v_cmp_gt_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x98,0x7d]
+v_cmpx_gt_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xa8,0x7d]
 
-v_cmp_gt_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x98,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_gt_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xa8,0x7d]
 
-v_cmp_gt_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x98,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_gt_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xa8,0x7d]
 
-v_cmp_gt_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x98,0x7d]
+v_cmpx_gt_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xa8,0x7d]
 
-v_cmp_gt_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x98,0x7d]
+v_cmpx_gt_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xa8,0x7d]
 
-v_cmp_gt_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x99,0x7d]
+v_cmpx_gt_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xa8,0x7d]
 
-v_cmp_gt_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xcc,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xa8,0x7d]
 
-v_cmp_gt_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xcc,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xa8,0x7d]
 
-v_cmp_gt_u32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xcc,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xa8,0x7d]
 
-v_cmp_gt_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xcc,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xa8,0x7d]
 
-v_cmp_gt_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xcc,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xa8,0x7d]
 
-v_cmp_gt_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xcc,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xa8,0x7d]
 
-v_cmp_gt_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xcc,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xa8,0x7d]
 
-v_cmp_gt_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xcc,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xa8,0x7d]
 
-v_cmp_gt_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xcc,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xa8,0x7d]
 
-v_cmp_gt_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xcc,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xa8,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_gt_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xcc,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xa8,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_gt_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xcc,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_gt_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xa8,0x7d]
 
-v_cmp_gt_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xcc,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_gt_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xa8,0x7d]
 
-v_cmp_gt_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xcc,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_gt_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xa9,0x7d]
 
-v_cmp_gt_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xcc,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_gt_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xcc,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_gt_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xd4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xcc,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_gt_i32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xd4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xcc,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_gt_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xd4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xcc,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_gt_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xd4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_gt_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xcc,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_gt_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xd4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x9a,0x7d]
+v_cmpx_gt_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xd4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_u32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x9a,0x7d]
+v_cmpx_gt_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xd4,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ne_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x9a,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xd4,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_ne_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x9a,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xd4,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_ne_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x9a,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xd4,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_ne_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x9a,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_ne_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x9a,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xd4,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_ne_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x9a,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_ne_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x9a,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_ne_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x9a,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_ne_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x9a,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_ne_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x9a,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_ne_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x9a,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_ne_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x9a,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_ne_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x9a,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_ne_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x9a,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_ne_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x9a,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_ne_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x9a,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_ne_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x9a,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_gt_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_ne_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x9a,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_gt_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_ne_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x9a,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_ne_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x9a,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_ne_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x9b,0x7d]
+v_cmpx_gt_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_ne_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xcd,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_ne_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xcd,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_ne_u32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xcd,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xd4,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_ne_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xcd,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xaa,0x7d]
 
-v_cmp_ne_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xcd,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xaa,0x7d]
 
-v_cmp_ne_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xcd,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xaa,0x7d]
 
-v_cmp_ne_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xcd,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xaa,0x7d]
 
-v_cmp_ne_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xcd,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xaa,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xcd,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ne_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xaa,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xcd,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_ne_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xaa,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xcd,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_ne_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xaa,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xcd,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_ne_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xaa,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xcd,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ne_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xaa,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xcd,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_ne_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xaa,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xcd,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ne_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xaa,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xcd,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_ne_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xaa,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xcd,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_ne_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xaa,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xcd,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_ne_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xaa,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xcd,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ne_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xaa,0x7d]
 
-v_cmp_ne_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xcd,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_ne_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xaa,0x7d]
 
-v_cmp_ge_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x9c,0x7d]
+v_cmpx_ne_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xaa,0x7d]
 
-v_cmp_ge_u32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x9c,0x7d]
+v_cmpx_ne_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xaa,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_ge_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x9c,0x7d]
+v_cmpx_ne_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xaa,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_ge_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x9c,0x7d]
+v_cmpx_ne_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xaa,0x7d]
 
-v_cmp_ge_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x9c,0x7d]
+v_cmpx_ne_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xaa,0x7d]
 
-v_cmp_ge_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x9c,0x7d]
+v_cmpx_ne_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xab,0x7d]
 
-v_cmp_ge_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x9c,0x7d]
+v_cmpx_ne_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x9c,0x7d]
+v_cmpx_ne_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xd5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x9c,0x7d]
+v_cmpx_ne_i32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xd5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x9c,0x7d]
+v_cmpx_ne_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xd5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x9c,0x7d]
+v_cmpx_ne_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xd5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x9c,0x7d]
+v_cmpx_ne_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xd5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x9c,0x7d]
+v_cmpx_ne_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xd5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x9c,0x7d]
+v_cmpx_ne_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xd5,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x9c,0x7d]
+v_cmpx_ne_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xd5,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x9c,0x7d]
+v_cmpx_ne_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xd5,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x9c,0x7d]
+v_cmpx_ne_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xd5,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_ge_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x9c,0x7d]
+v_cmpx_ne_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_ge_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x9c,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_ne_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xd5,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_ge_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x9c,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_ne_i32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_ge_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x9c,0x7d]
+v_cmpx_ne_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_ge_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x9c,0x7d]
+v_cmpx_ne_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_ge_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x9d,0x7d]
+v_cmpx_ne_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xce,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_ge_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xce,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_ge_u32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xce,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_ge_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xce,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_ge_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xce,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_ge_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xce,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_ge_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xce,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_ge_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xce,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xce,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xce,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xce,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xce,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xce,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xce,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xce,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ne_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xd5,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmp_ge_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xce,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_ge_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xac,0x7d]
 
-v_cmp_ge_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xce,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_ge_i32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xac,0x7d]
 
-v_cmp_ge_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xce,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_ge_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xac,0x7d]
 
-v_cmp_ge_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xce,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ge_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xac,0x7d]
 
-v_cmp_ge_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xce,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_ge_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xac,0x7d]
 
-v_cmp_t_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0x9e,0x7d]
+v_cmpx_ge_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xac,0x7d]
 
-v_cmp_t_u32 vcc, s101, v0
-// CHECK: [0x65,0x00,0x9e,0x7d]
+v_cmpx_ge_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xac,0x7d]
 
-v_cmp_t_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0x9e,0x7d]
+v_cmpx_ge_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xac,0x7d]
 
-v_cmp_t_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0x9e,0x7d]
+v_cmpx_ge_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xac,0x7d]
 
-v_cmp_t_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0x9e,0x7d]
+v_cmpx_ge_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xac,0x7d]
 
-v_cmp_t_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0x9e,0x7d]
+v_cmpx_ge_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xac,0x7d]
 
-v_cmp_t_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0x9e,0x7d]
+v_cmpx_ge_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xac,0x7d]
 
-v_cmp_t_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0x9e,0x7d]
+v_cmpx_ge_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xac,0x7d]
 
-v_cmp_t_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0x9e,0x7d]
+v_cmpx_ge_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xac,0x7d]
 
-v_cmp_t_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0x9e,0x7d]
+v_cmpx_ge_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xac,0x7d]
 
-v_cmp_t_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0x9e,0x7d]
+v_cmpx_ge_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xac,0x7d]
 
-v_cmp_t_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0x9e,0x7d]
+v_cmpx_ge_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xac,0x7d]
 
-v_cmp_t_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0x9e,0x7d]
+v_cmpx_ge_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xac,0x7d]
 
-v_cmp_t_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0x9e,0x7d]
+v_cmpx_ge_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xac,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_t_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0x9e,0x7d]
+v_cmpx_ge_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xac,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_t_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0x9e,0x7d]
+v_cmpx_ge_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xac,0x7d]
 
-v_cmp_t_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0x9e,0x7d]
+v_cmpx_ge_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xac,0x7d]
 
-v_cmp_t_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0x9e,0x7d]
+v_cmpx_ge_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xad,0x7d]
 
-v_cmp_t_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0x9e,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_ge_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0x9e,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_ge_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xd6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0x9e,0x7d]
+v_cmpx_ge_i32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xd6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0x9e,0x7d]
+v_cmpx_ge_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xd6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0x9f,0x7d]
+v_cmpx_ge_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xd6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xcf,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xd6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xcf,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xd6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xcf,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xd6,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmp_t_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xcf,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xd6,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmp_t_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xcf,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xd6,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmp_t_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xcf,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xd6,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmp_t_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xcf,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmp_t_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xcf,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xd6,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xcf,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xcf,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xcf,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xcf,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xcf,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xcf,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xcf,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xcf,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xcf,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xcf,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xcf,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmp_t_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xcf,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_ge_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_f_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa0,0x7d]
+v_cmpx_ge_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_f_i32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xa0,0x7d]
+v_cmpx_ge_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_f_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xa0,0x7d]
+v_cmpx_ge_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_f_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xa0,0x7d]
+v_cmpx_ge_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_f_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa0,0x7d]
+v_cmpx_ge_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_f_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa0,0x7d]
+v_cmpx_ge_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_f_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa0,0x7d]
+v_cmpx_ge_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xd6,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_f_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa0,0x7d]
+v_cmpx_t_i32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xae,0x7d]
 
-v_cmpx_f_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa0,0x7d]
+v_cmpx_t_i32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xae,0x7d]
 
-v_cmpx_f_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa0,0x7d]
+v_cmpx_t_i32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xae,0x7d]
 
-v_cmpx_f_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa0,0x7d]
+v_cmpx_t_i32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xae,0x7d]
 
-v_cmpx_f_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa0,0x7d]
+v_cmpx_t_i32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xae,0x7d]
 
-v_cmpx_f_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa0,0x7d]
+v_cmpx_t_i32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xae,0x7d]
 
-v_cmpx_f_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa0,0x7d]
+v_cmpx_t_i32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xae,0x7d]
 
-v_cmpx_f_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa0,0x7d]
+v_cmpx_t_i32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xae,0x7d]
 
-v_cmpx_f_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa0,0x7d]
+v_cmpx_t_i32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xae,0x7d]
 
-v_cmpx_f_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa0,0x7d]
+v_cmpx_t_i32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xae,0x7d]
 
-v_cmpx_f_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa0,0x7d]
+v_cmpx_t_i32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xae,0x7d]
 
-v_cmpx_f_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa0,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_t_i32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xae,0x7d]
 
-v_cmpx_f_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa0,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_t_i32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xae,0x7d]
 
-v_cmpx_f_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa0,0x7d]
+v_cmpx_t_i32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xae,0x7d]
 
-v_cmpx_f_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa0,0x7d]
+v_cmpx_t_i32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xae,0x7d]
 
-v_cmpx_f_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa1,0x7d]
+v_cmpx_t_i32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xae,0x7d]
 
-v_cmpx_f_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xd0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xae,0x7d]
 
-v_cmpx_f_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xd0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xae,0x7d]
 
-v_cmpx_f_i32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xd0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xae,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_f_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xd0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xae,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_f_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xd0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xae,0x7d]
 
-v_cmpx_f_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xd0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xae,0x7d]
 
-v_cmpx_f_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xd0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xaf,0x7d]
 
-v_cmpx_f_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xd0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xd0,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_t_i32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xd7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xd0,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_t_i32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xd7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xd0,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_t_i32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xd7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xd0,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_t_i32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xd7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xd0,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_t_i32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xd7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xd0,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_t_i32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xd7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xd0,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_t_i32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xd7,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xd0,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_t_i32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xd7,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xd0,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_t_i32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xd7,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xd0,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_t_i32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xd7,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xd0,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_t_i32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_f_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xd0,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_t_i32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xd7,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_lt_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa2,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_lt_i32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xa2,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_lt_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xa2,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_lt_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xa2,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_lt_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa2,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_lt_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa2,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_lt_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa2,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_lt_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa2,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_lt_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa2,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_lt_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa2,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_lt_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa2,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_lt_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa2,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_lt_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa2,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_lt_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa2,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_lt_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa2,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_lt_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa2,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_lt_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa2,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_lt_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa2,0x7d]
+v_cmpx_t_i32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_lt_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa2,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_t_i32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xd7,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_lt_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa2,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_f_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xb0,0x7d]
 
-v_cmpx_lt_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa2,0x7d]
+v_cmpx_f_u32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xb0,0x7d]
 
-v_cmpx_lt_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa2,0x7d]
+v_cmpx_f_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xb0,0x7d]
 
-v_cmpx_lt_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa3,0x7d]
+v_cmpx_f_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xb0,0x7d]
 
-v_cmpx_lt_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xd1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xb0,0x7d]
 
-v_cmpx_lt_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xd1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xb0,0x7d]
 
-v_cmpx_lt_i32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xd1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xb0,0x7d]
 
-v_cmpx_lt_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xd1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xb0,0x7d]
 
-v_cmpx_lt_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xd1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xb0,0x7d]
 
-v_cmpx_lt_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xd1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xb0,0x7d]
 
-v_cmpx_lt_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xd1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xb0,0x7d]
 
-v_cmpx_lt_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xd1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xb0,0x7d]
 
-v_cmpx_lt_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xd1,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_f_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xb0,0x7d]
 
-v_cmpx_lt_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xd1,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_f_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xb0,0x7d]
 
-v_cmpx_lt_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xd1,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_f_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xb0,0x7d]
 
-v_cmpx_lt_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xd1,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_f_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xb0,0x7d]
 
-v_cmpx_lt_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xd1,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_f_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xb0,0x7d]
 
-v_cmpx_lt_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xd1,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_f_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xb0,0x7d]
 
-v_cmpx_lt_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xd1,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_f_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xb0,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_lt_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xd1,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_f_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xb0,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_lt_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xd1,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_f_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xb0,0x7d]
 
-v_cmpx_lt_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xd1,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_f_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xb0,0x7d]
 
-v_cmpx_lt_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xd1,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_f_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xb1,0x7d]
 
-v_cmpx_lt_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xd1,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa4,0x7d]
+v_cmpx_f_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xd8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xa4,0x7d]
+v_cmpx_f_u32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xd8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xa4,0x7d]
+v_cmpx_f_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xd8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xa4,0x7d]
+v_cmpx_f_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xd8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa4,0x7d]
+v_cmpx_f_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xd8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa4,0x7d]
+v_cmpx_f_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xd8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa4,0x7d]
+v_cmpx_f_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xd8,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa4,0x7d]
+v_cmpx_f_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xd8,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa4,0x7d]
+v_cmpx_f_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xd8,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa4,0x7d]
+v_cmpx_f_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xd8,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa4,0x7d]
+v_cmpx_f_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa4,0x7d]
+v_cmpx_f_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xd8,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa4,0x7d]
+v_cmpx_f_u32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa4,0x7d]
+v_cmpx_f_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa4,0x7d]
+v_cmpx_f_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa4,0x7d]
+v_cmpx_f_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa4,0x7d]
+v_cmpx_f_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa4,0x7d]
+v_cmpx_f_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa4,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_f_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa4,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_f_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa4,0x7d]
+v_cmpx_f_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa4,0x7d]
+v_cmpx_f_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_eq_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa5,0x7d]
+v_cmpx_f_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_eq_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xd2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_eq_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xd2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_eq_i32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xd2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_eq_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xd2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_eq_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xd2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_eq_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xd2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_eq_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xd2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_eq_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xd2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xd8,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_eq_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xd2,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_lt_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xb2,0x7d]
 
-v_cmpx_eq_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xd2,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_lt_u32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xb2,0x7d]
 
-v_cmpx_eq_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xd2,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_lt_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xb2,0x7d]
 
-v_cmpx_eq_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xd2,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_lt_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xb2,0x7d]
 
-v_cmpx_eq_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xd2,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_lt_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xb2,0x7d]
 
-v_cmpx_eq_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xd2,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_lt_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xb2,0x7d]
 
-v_cmpx_eq_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xd2,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_lt_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xb2,0x7d]
 
-v_cmpx_eq_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xd2,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_lt_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xb2,0x7d]
 
-v_cmpx_eq_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xd2,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_lt_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xb2,0x7d]
 
-v_cmpx_eq_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xd2,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_lt_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xb2,0x7d]
 
-v_cmpx_eq_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xd2,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_lt_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xb2,0x7d]
 
-v_cmpx_eq_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xd2,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_lt_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xb2,0x7d]
 
-v_cmpx_le_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa6,0x7d]
+v_cmpx_lt_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xb2,0x7d]
 
-v_cmpx_le_i32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xa6,0x7d]
+v_cmpx_lt_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xb2,0x7d]
 
-v_cmpx_le_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xa6,0x7d]
+v_cmpx_lt_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xb2,0x7d]
 
-v_cmpx_le_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xa6,0x7d]
+v_cmpx_lt_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xb2,0x7d]
 
-v_cmpx_le_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa6,0x7d]
+v_cmpx_lt_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xb2,0x7d]
 
-v_cmpx_le_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa6,0x7d]
+v_cmpx_lt_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xb2,0x7d]
 
-v_cmpx_le_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa6,0x7d]
+v_cmpx_lt_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xb2,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_le_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa6,0x7d]
+v_cmpx_lt_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xb2,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_le_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa6,0x7d]
+v_cmpx_lt_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xb2,0x7d]
 
-v_cmpx_le_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa6,0x7d]
+v_cmpx_lt_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xb2,0x7d]
 
-v_cmpx_le_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa6,0x7d]
+v_cmpx_lt_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xb3,0x7d]
 
-v_cmpx_le_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa6,0x7d]
+v_cmpx_lt_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa6,0x7d]
+v_cmpx_lt_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xd9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa6,0x7d]
+v_cmpx_lt_u32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xd9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa6,0x7d]
+v_cmpx_lt_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xd9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa6,0x7d]
+v_cmpx_lt_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xd9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa6,0x7d]
+v_cmpx_lt_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xd9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa6,0x7d]
+v_cmpx_lt_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xd9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa6,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_lt_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xd9,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa6,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_lt_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xd9,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_le_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa6,0x7d]
+v_cmpx_lt_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xd9,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_le_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa6,0x7d]
+v_cmpx_lt_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xd9,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_le_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa7,0x7d]
+v_cmpx_lt_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_le_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xd3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xd9,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_le_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xd3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_le_i32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xd3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_le_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xd3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_le_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xd3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_le_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xd3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_le_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xd3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_le_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xd3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_le_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xd3,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_le_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xd3,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_le_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xd3,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_le_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xd3,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_le_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xd3,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_le_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xd3,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_le_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xd3,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_le_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xd3,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_le_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xd3,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_le_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xd3,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_le_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xd3,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_le_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xd3,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_lt_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xd9,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_gt_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xa8,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_eq_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xb4,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_gt_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xa8,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_eq_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xb4,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_gt_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xa8,0x7d]
+v_cmpx_eq_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xb4,0x7d]
 
-v_cmpx_gt_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xa9,0x7d]
+v_cmpx_eq_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xb5,0x7d]
 
-v_cmpx_gt_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xd4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xd4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xda,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xd4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xda,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xd4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xda,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xd4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xda,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xd4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xda,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xd4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xda,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xd4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xda,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xd4,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_eq_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xda,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xd4,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_eq_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xda,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xd4,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_eq_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xda,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xd4,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_eq_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xda,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xd4,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_eq_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xda,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xd4,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_eq_u32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xd4,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_eq_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xd4,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_eq_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xd4,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_eq_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xd4,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_eq_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xd4,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_eq_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_gt_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xd4,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_eq_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_ne_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xaa,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_ne_i32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xaa,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_ne_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xaa,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_ne_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xaa,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_ne_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xaa,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_ne_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xaa,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_ne_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xaa,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_ne_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xaa,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_ne_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xaa,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_ne_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xaa,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_ne_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xaa,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_ne_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xaa,0x7d]
+v_cmpx_eq_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xda,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_ne_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xaa,0x7d]
+v_cmpx_le_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xb6,0x7d]
 
-v_cmpx_ne_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xaa,0x7d]
+v_cmpx_le_u32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xb6,0x7d]
 
-v_cmpx_ne_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xaa,0x7d]
+v_cmpx_le_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xb6,0x7d]
 
-v_cmpx_ne_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xaa,0x7d]
+v_cmpx_le_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xb6,0x7d]
 
-v_cmpx_ne_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xaa,0x7d]
+v_cmpx_le_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xb6,0x7d]
 
-v_cmpx_ne_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xaa,0x7d]
+v_cmpx_le_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xb6,0x7d]
 
-v_cmpx_ne_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xaa,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_le_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xb6,0x7d]
 
-v_cmpx_ne_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xaa,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_le_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xb6,0x7d]
 
-v_cmpx_ne_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xaa,0x7d]
+v_cmpx_le_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xb6,0x7d]
 
-v_cmpx_ne_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xaa,0x7d]
+v_cmpx_le_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xb6,0x7d]
 
-v_cmpx_ne_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xab,0x7d]
+v_cmpx_le_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xb6,0x7d]
 
-v_cmpx_ne_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xd5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xb6,0x7d]
 
-v_cmpx_ne_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xd5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xb6,0x7d]
 
-v_cmpx_ne_i32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xd5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xb6,0x7d]
 
-v_cmpx_ne_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xd5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xb6,0x7d]
 
-v_cmpx_ne_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xd5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xb6,0x7d]
 
-v_cmpx_ne_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xd5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xb6,0x7d]
 
-v_cmpx_ne_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xd5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xb6,0x7d]
 
-v_cmpx_ne_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xd5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xb6,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ne_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xd5,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xb6,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ne_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xd5,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xb6,0x7d]
 
-v_cmpx_ne_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xd5,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xb6,0x7d]
 
-v_cmpx_ne_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xd5,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_le_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xb7,0x7d]
 
-v_cmpx_ne_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xd5,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_le_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xd5,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_le_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xdb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xd5,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_le_u32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xdb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xd5,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_le_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xdb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xd5,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_le_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xdb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xd5,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_le_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xdb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xd5,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_le_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xdb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ne_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xd5,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_le_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xdb,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xdb,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xdb,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xdb,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xdb,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_ge_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xac,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_le_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_ge_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xac,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_le_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_ge_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_ge_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xac,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_ge_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xad,0x7d]
+v_cmpx_le_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_ge_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xd6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xdb,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_ge_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xd6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xb8,0x7d]
 
-v_cmpx_ge_i32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xd6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xb8,0x7d]
 
-v_cmpx_ge_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xd6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xb8,0x7d]
 
-v_cmpx_ge_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xd6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xb8,0x7d]
 
-v_cmpx_ge_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xd6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xb8,0x7d]
 
-v_cmpx_ge_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xd6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xb8,0x7d]
 
-v_cmpx_ge_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xd6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xb8,0x7d]
 
-v_cmpx_ge_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xd6,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_gt_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xb8,0x7d]
 
-v_cmpx_ge_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xd6,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_gt_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xb8,0x7d]
 
-v_cmpx_ge_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xd6,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_gt_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xb8,0x7d]
 
-v_cmpx_ge_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xd6,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_gt_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xb8,0x7d]
 
-v_cmpx_ge_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xd6,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_gt_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xb8,0x7d]
 
-v_cmpx_ge_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xd6,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_gt_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xb8,0x7d]
 
-v_cmpx_ge_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xd6,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_gt_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xb8,0x7d]
 
-v_cmpx_ge_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xd6,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_gt_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xb8,0x7d]
 
-v_cmpx_ge_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xd6,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_gt_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xb8,0x7d]
 
-v_cmpx_ge_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xd6,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_gt_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xb8,0x7d]
 
-v_cmpx_ge_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xd6,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_gt_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xb8,0x7d]
 
-v_cmpx_ge_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xd6,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_gt_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xb8,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_t_i32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xae,0x7d]
+v_cmpx_gt_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xb8,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_t_i32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xae,0x7d]
+v_cmpx_gt_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xb8,0x7d]
 
-v_cmpx_t_i32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xae,0x7d]
+v_cmpx_gt_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xb8,0x7d]
 
-v_cmpx_t_i32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xae,0x7d]
+v_cmpx_gt_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xb9,0x7d]
 
-v_cmpx_t_i32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xae,0x7d]
+v_cmpx_gt_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xae,0x7d]
+v_cmpx_gt_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xdc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xae,0x7d]
+v_cmpx_gt_u32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xdc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xae,0x7d]
+v_cmpx_gt_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xdc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xae,0x7d]
+v_cmpx_gt_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xdc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xae,0x7d]
+v_cmpx_gt_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xdc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xae,0x7d]
+v_cmpx_gt_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xdc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xae,0x7d]
+v_cmpx_gt_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xdc,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_t_i32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xae,0x7d]
+v_cmpx_gt_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xdc,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_t_i32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xae,0x7d]
+v_cmpx_gt_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xdc,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_t_i32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xae,0x7d]
+v_cmpx_gt_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xdc,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_t_i32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xae,0x7d]
+v_cmpx_gt_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_t_i32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xae,0x7d]
+v_cmpx_gt_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xdc,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_t_i32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xae,0x7d]
+v_cmpx_gt_u32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_t_i32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xae,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_gt_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_t_i32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xae,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_gt_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_t_i32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xae,0x7d]
+v_cmpx_gt_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_t_i32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xae,0x7d]
+v_cmpx_gt_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_t_i32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xaf,0x7d]
+v_cmpx_gt_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_t_i32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xd7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_t_i32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xd7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_t_i32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xd7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_t_i32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xd7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_t_i32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xd7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_t_i32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xd7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_t_i32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xd7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_t_i32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xd7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_t_i32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xd7,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_t_i32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xd7,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_t_i32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xd7,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_t_i32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xd7,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_t_i32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xd7,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_gt_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xdc,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_t_i32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xd7,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_ne_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xba,0x7d]
 
-v_cmpx_t_i32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xd7,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ne_u32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xba,0x7d]
 
-v_cmpx_t_i32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xd7,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_ne_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xba,0x7d]
 
-v_cmpx_t_i32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xd7,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_ne_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xba,0x7d]
 
-v_cmpx_t_i32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xd7,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_ne_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xba,0x7d]
 
-v_cmpx_t_i32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xd7,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ne_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xba,0x7d]
 
-v_cmpx_t_i32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xd7,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_ne_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xba,0x7d]
 
-v_cmpx_f_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xb0,0x7d]
+v_cmpx_ne_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xba,0x7d]
 
-v_cmpx_f_u32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xb0,0x7d]
+v_cmpx_ne_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xba,0x7d]
 
-v_cmpx_f_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xb0,0x7d]
+v_cmpx_ne_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xba,0x7d]
 
-v_cmpx_f_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xb0,0x7d]
+v_cmpx_ne_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xba,0x7d]
 
-v_cmpx_f_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xb0,0x7d]
+v_cmpx_ne_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xba,0x7d]
 
-v_cmpx_f_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xb0,0x7d]
+v_cmpx_ne_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xba,0x7d]
 
-v_cmpx_f_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xb0,0x7d]
+v_cmpx_ne_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xba,0x7d]
 
-v_cmpx_f_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xb0,0x7d]
+v_cmpx_ne_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xba,0x7d]
 
-v_cmpx_f_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xb0,0x7d]
+v_cmpx_ne_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xba,0x7d]
 
-v_cmpx_f_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xb0,0x7d]
+v_cmpx_ne_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xba,0x7d]
 
-v_cmpx_f_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xb0,0x7d]
+v_cmpx_ne_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xba,0x7d]
 
-v_cmpx_f_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xb0,0x7d]
+v_cmpx_ne_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xba,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_f_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xb0,0x7d]
+v_cmpx_ne_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xba,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_f_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xb0,0x7d]
+v_cmpx_ne_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xba,0x7d]
 
-v_cmpx_f_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xb0,0x7d]
+v_cmpx_ne_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xba,0x7d]
 
-v_cmpx_f_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xb0,0x7d]
+v_cmpx_ne_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xbb,0x7d]
 
-v_cmpx_f_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xb0,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xb0,0x7d]
+v_cmpx_ne_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xdd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xb0,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_ne_u32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xdd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xb0,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_ne_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xdd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xb0,0x7d]
+v_cmpx_ne_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xdd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xb0,0x7d]
+v_cmpx_ne_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xdd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xb1,0x7d]
+v_cmpx_ne_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xdd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xd8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xdd,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xd8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xdd,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xd8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xdd,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_f_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xd8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xdd,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_f_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xd8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_f_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xd8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xdd,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_f_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xd8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_f_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xd8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xd8,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ne_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xd8,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_ne_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xd8,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_ne_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xd8,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_ne_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xd8,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ne_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xd8,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_ne_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xd8,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ne_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xd8,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_ne_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xd8,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_ne_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xd8,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_ne_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xd8,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ne_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_f_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xd8,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_ne_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_lt_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xb2,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_lt_u32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xb2,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_lt_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xb2,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_lt_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xb2,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_lt_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xb2,0x7d]
+v_cmpx_ne_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xdd,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_lt_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xb2,0x7d]
+v_cmpx_ge_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xbc,0x7d]
 
-v_cmpx_lt_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xb2,0x7d]
+v_cmpx_ge_u32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xbc,0x7d]
 
-v_cmpx_lt_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xb2,0x7d]
+v_cmpx_ge_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xbc,0x7d]
 
-v_cmpx_lt_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xb2,0x7d]
+v_cmpx_ge_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xbc,0x7d]
 
-v_cmpx_lt_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xb2,0x7d]
+v_cmpx_ge_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xbc,0x7d]
 
-v_cmpx_lt_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xb2,0x7d]
+v_cmpx_ge_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xbc,0x7d]
 
-v_cmpx_lt_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xb2,0x7d]
+v_cmpx_ge_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xbc,0x7d]
 
-v_cmpx_lt_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xb2,0x7d]
+v_cmpx_ge_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xbc,0x7d]
 
-v_cmpx_lt_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xb2,0x7d]
+v_cmpx_ge_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xbc,0x7d]
 
-v_cmpx_lt_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xb2,0x7d]
+v_cmpx_ge_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xbc,0x7d]
 
-v_cmpx_lt_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xb2,0x7d]
+v_cmpx_ge_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xbc,0x7d]
 
-v_cmpx_lt_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xb2,0x7d]
+v_cmpx_ge_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xbc,0x7d]
 
-v_cmpx_lt_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xb2,0x7d]
+v_cmpx_ge_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xbc,0x7d]
 
-v_cmpx_lt_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xb2,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_ge_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xbc,0x7d]
 
-v_cmpx_lt_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xb2,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_ge_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xbc,0x7d]
 
-v_cmpx_lt_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xb2,0x7d]
+v_cmpx_ge_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xbc,0x7d]
 
-v_cmpx_lt_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xb2,0x7d]
+v_cmpx_ge_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xbc,0x7d]
 
-v_cmpx_lt_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xb3,0x7d]
+v_cmpx_ge_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xbc,0x7d]
 
-v_cmpx_lt_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xd9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xbc,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_lt_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xd9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xbc,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_lt_u32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xd9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xbc,0x7d]
 
-v_cmpx_lt_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xd9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xbc,0x7d]
 
-v_cmpx_lt_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xd9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xbd,0x7d]
 
-v_cmpx_lt_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xd9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xd9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xde,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xd9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xde,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xd9,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ge_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xde,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xd9,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_ge_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xde,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xd9,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_ge_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xde,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xd9,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_ge_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xde,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xd9,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ge_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xde,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_lt_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xd9,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_ge_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xde,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_lt_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xd9,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ge_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xde,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_lt_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xd9,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_ge_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xde,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_lt_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xd9,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_ge_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xde,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_lt_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xd9,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_ge_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xde,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_lt_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xd9,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ge_u32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_lt_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xd9,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_ge_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xb4,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xb4,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xb4,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xb4,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xb4,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xb4,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xb4,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xb4,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xb4,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xb4,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xb4,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_eq_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xb4,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_eq_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xb4,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_eq_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xb4,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_eq_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xb4,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_eq_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xb4,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_eq_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xb4,0x7d]
+v_cmpx_ge_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xde,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_eq_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xb4,0x7d]
+v_cmpx_t_u32 vcc, s1, v2
+// CHECK: [0x01,0x04,0xbe,0x7d]
 
-v_cmpx_eq_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xb4,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_t_u32 vcc, s101, v2
+// CHECK: [0x65,0x04,0xbe,0x7d]
 
-v_cmpx_eq_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xb4,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_t_u32 vcc, flat_scratch_lo, v2
+// CHECK: [0x66,0x04,0xbe,0x7d]
 
-v_cmpx_eq_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xb4,0x7d]
+v_cmpx_t_u32 vcc, flat_scratch_hi, v2
+// CHECK: [0x67,0x04,0xbe,0x7d]
 
-v_cmpx_eq_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xb4,0x7d]
+v_cmpx_t_u32 vcc, vcc_lo, v2
+// CHECK: [0x6a,0x04,0xbe,0x7d]
 
-v_cmpx_eq_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xb5,0x7d]
+v_cmpx_t_u32 vcc, vcc_hi, v2
+// CHECK: [0x6b,0x04,0xbe,0x7d]
 
-v_cmpx_eq_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xda,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32 vcc, tba_lo, v2
+// CHECK: [0x6c,0x04,0xbe,0x7d]
 
-v_cmpx_eq_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xda,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32 vcc, tba_hi, v2
+// CHECK: [0x6d,0x04,0xbe,0x7d]
 
-v_cmpx_eq_u32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xda,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32 vcc, tma_lo, v2
+// CHECK: [0x6e,0x04,0xbe,0x7d]
 
-v_cmpx_eq_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xda,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32 vcc, tma_hi, v2
+// CHECK: [0x6f,0x04,0xbe,0x7d]
 
-v_cmpx_eq_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xda,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32 vcc, ttmp11, v2
+// CHECK: [0x7b,0x04,0xbe,0x7d]
 
-v_cmpx_eq_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xda,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32 vcc, m0, v2
+// CHECK: [0x7c,0x04,0xbe,0x7d]
 
-v_cmpx_eq_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xda,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32 vcc, exec_lo, v2
+// CHECK: [0x7e,0x04,0xbe,0x7d]
 
-v_cmpx_eq_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xda,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32 vcc, exec_hi, v2
+// CHECK: [0x7f,0x04,0xbe,0x7d]
 
-v_cmpx_eq_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xda,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_t_u32 vcc, 0, v2
+// CHECK: [0x80,0x04,0xbe,0x7d]
 
-v_cmpx_eq_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xda,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_t_u32 vcc, -1, v2
+// CHECK: [0xc1,0x04,0xbe,0x7d]
 
-v_cmpx_eq_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xda,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_t_u32 vcc, 0.5, v2
+// CHECK: [0xf0,0x04,0xbe,0x7d]
 
-v_cmpx_eq_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xda,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_t_u32 vcc, -4.0, v2
+// CHECK: [0xf7,0x04,0xbe,0x7d]
 
-v_cmpx_eq_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xda,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_t_u32 vcc, 0xaf123456, v2
+// CHECK: [0xff,0x04,0xbe,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_eq_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xda,0xd0,0xff,0x01,0x00,0x00]
+v_cmpx_t_u32 vcc, 0x3f717273, v2
+// CHECK: [0xff,0x04,0xbe,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_eq_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xda,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_t_u32 vcc, v1, v2
+// CHECK: [0x01,0x05,0xbe,0x7d]
 
-v_cmpx_eq_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xda,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_t_u32 vcc, v255, v2
+// CHECK: [0xff,0x05,0xbe,0x7d]
 
-v_cmpx_eq_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xda,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_t_u32 vcc, s1, v255
+// CHECK: [0x01,0xfe,0xbf,0x7d]
 
-v_cmpx_eq_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xda,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, s2
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xda,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_t_u32_e64 s[12:13], 0, s2
+// CHECK: [0x0c,0x00,0xdf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_eq_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xda,0xd0,0x00,0xfe,0x03,0x00]
+v_cmpx_t_u32_e64 s[100:101], 0, s2
+// CHECK: [0x64,0x00,0xdf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xb6,0x7d]
+v_cmpx_t_u32_e64 flat_scratch, 0, s2
+// CHECK: [0x66,0x00,0xdf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_u32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xb6,0x7d]
+v_cmpx_t_u32_e64 vcc, 0, s2
+// CHECK: [0x6a,0x00,0xdf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xb6,0x7d]
+v_cmpx_t_u32_e64 tba, 0, s2
+// CHECK: [0x6c,0x00,0xdf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xb6,0x7d]
+v_cmpx_t_u32_e64 tma, 0, s2
+// CHECK: [0x6e,0x00,0xdf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xb6,0x7d]
+v_cmpx_t_u32_e64 ttmp[10:11], 0, s2
+// CHECK: [0x7a,0x00,0xdf,0xd0,0x80,0x04,0x00,0x00]
 
-v_cmpx_le_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xb6,0x7d]
+v_cmpx_t_u32_e64 s[10:11], -1, s2
+// CHECK: [0x0a,0x00,0xdf,0xd0,0xc1,0x04,0x00,0x00]
 
-v_cmpx_le_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xb6,0x7d]
+v_cmpx_t_u32_e64 s[10:11], 0.5, s2
+// CHECK: [0x0a,0x00,0xdf,0xd0,0xf0,0x04,0x00,0x00]
 
-v_cmpx_le_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xb6,0x7d]
+v_cmpx_t_u32_e64 s[10:11], -4.0, s2
+// CHECK: [0x0a,0x00,0xdf,0xd0,0xf7,0x04,0x00,0x00]
 
-v_cmpx_le_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xb6,0x7d]
+v_cmpx_t_u32_e64 s[10:11], v1, s2
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x01,0x05,0x00,0x00]
 
-v_cmpx_le_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xb6,0x7d]
+v_cmpx_t_u32_e64 s[10:11], v255, s2
+// CHECK: [0x0a,0x00,0xdf,0xd0,0xff,0x05,0x00,0x00]
 
-v_cmpx_le_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xb6,0x7d]
+v_cmpx_t_u32_e64 s[10:11], 0, s101
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0xca,0x00,0x00]
 
-v_cmpx_le_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xb6,0x7d]
+v_cmpx_t_u32_e64 s[10:11], 0, flat_scratch_lo
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0xcc,0x00,0x00]
 
-v_cmpx_le_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xb6,0x7d]
+v_cmpx_t_u32_e64 s[10:11], 0, flat_scratch_hi
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0xce,0x00,0x00]
 
-v_cmpx_le_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xb6,0x7d]
+v_cmpx_t_u32_e64 s[10:11], 0, vcc_lo
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0xd4,0x00,0x00]
 
-v_cmpx_le_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xb6,0x7d]
+v_cmpx_t_u32_e64 s[10:11], 0, vcc_hi
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0xd6,0x00,0x00]
 
-v_cmpx_le_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xb6,0x7d]
+v_cmpx_t_u32_e64 s[10:11], 0, tba_lo
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0xd8,0x00,0x00]
 
-v_cmpx_le_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xb6,0x7d]
+v_cmpx_t_u32_e64 s[10:11], 0, tba_hi
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0xda,0x00,0x00]
 
-v_cmpx_le_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xb6,0x7d]
+v_cmpx_t_u32_e64 s[10:11], 0, tma_lo
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0xdc,0x00,0x00]
 
-v_cmpx_le_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xb6,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_t_u32_e64 s[10:11], 0, tma_hi
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0xde,0x00,0x00]
 
-v_cmpx_le_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xb6,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_t_u32_e64 s[10:11], 0, ttmp11
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0xf6,0x00,0x00]
 
-v_cmpx_le_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xb6,0x7d]
+v_cmpx_t_u32_e64 s[10:11], 0, m0
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0xf8,0x00,0x00]
 
-v_cmpx_le_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xb6,0x7d]
+v_cmpx_t_u32_e64 s[10:11], 0, exec_lo
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0xfc,0x00,0x00]
 
-v_cmpx_le_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xb7,0x7d]
+v_cmpx_t_u32_e64 s[10:11], 0, exec_hi
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0xfe,0x00,0x00]
 
-v_cmpx_le_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xdb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, 0
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0x00,0x01,0x00]
 
-v_cmpx_le_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xdb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, -1
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0x82,0x01,0x00]
 
-v_cmpx_le_u32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xdb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, 0.5
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0xe0,0x01,0x00]
 
-v_cmpx_le_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xdb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, -4.0
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0xee,0x01,0x00]
 
-v_cmpx_le_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xdb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, v2
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0x04,0x02,0x00]
 
-v_cmpx_le_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xdb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u32_e64 s[10:11], 0, v255
+// CHECK: [0x0a,0x00,0xdf,0xd0,0x80,0xfe,0x03,0x00]
 
-v_cmpx_le_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xdb,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc0,0x7d]
 
-v_cmpx_le_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xdb,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc0,0x7d]
 
-v_cmpx_le_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xdb,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_f_i64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xc0,0x7d]
 
-v_cmpx_le_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xdb,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_f_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xc0,0x7d]
 
-v_cmpx_le_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xdb,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_f_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc0,0x7d]
 
-v_cmpx_le_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xdb,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_f_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc0,0x7d]
 
-v_cmpx_le_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xdb,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_f_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc0,0x7d]
 
-v_cmpx_le_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xdb,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_f_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc0,0x7d]
 
-v_cmpx_le_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xdb,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_f_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc0,0x7d]
 
-v_cmpx_le_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xdb,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_f_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc0,0x7d]
 
-v_cmpx_le_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xdb,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_f_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc0,0x7d]
 
-v_cmpx_le_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xdb,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_f_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc0,0x7d]
 
-v_cmpx_le_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xdb,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_f_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc0,0x7d]
 
-v_cmpx_le_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xdb,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_f_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc0,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_gt_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xb8,0x7d]
+v_cmp_f_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc0,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_gt_u32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xb8,0x7d]
+v_cmp_f_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc0,0x7d]
 
-v_cmpx_gt_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xb8,0x7d]
+v_cmp_f_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc0,0x7d]
 
-v_cmpx_gt_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xb8,0x7d]
+v_cmp_f_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc1,0x7d]
 
-v_cmpx_gt_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xb8,0x7d]
+v_cmp_f_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xb8,0x7d]
+v_cmp_f_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xb8,0x7d]
+v_cmp_f_i64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xe0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xb8,0x7d]
+v_cmp_f_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xb8,0x7d]
+v_cmp_f_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xb8,0x7d]
+v_cmp_f_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xb8,0x7d]
+v_cmp_f_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xb8,0x7d]
+v_cmp_f_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xb8,0x7d]
+v_cmp_f_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_gt_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xb8,0x7d]
+v_cmp_f_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmpx_gt_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xb8,0x7d]
+v_cmp_f_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_gt_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xb8,0x7d]
+v_cmp_f_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmpx_gt_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xb8,0x7d]
+v_cmp_f_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_gt_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xb8,0x7d]
+v_cmp_f_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe0,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_gt_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xb8,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_f_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe0,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_gt_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xb8,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_f_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xe0,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmpx_gt_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xb8,0x7d]
+v_cmp_f_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe0,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_gt_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xb8,0x7d]
+v_cmp_f_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xe0,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmpx_gt_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xb9,0x7d]
+v_cmp_f_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe0,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_gt_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xdc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe0,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_gt_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xdc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc2,0x7d]
 
-v_cmpx_gt_u32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xdc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc2,0x7d]
 
-v_cmpx_gt_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xdc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xc2,0x7d]
 
-v_cmpx_gt_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xdc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xc2,0x7d]
 
-v_cmpx_gt_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xdc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc2,0x7d]
 
-v_cmpx_gt_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xdc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc2,0x7d]
 
-v_cmpx_gt_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xdc,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc2,0x7d]
 
-v_cmpx_gt_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xdc,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc2,0x7d]
 
-v_cmpx_gt_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xdc,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc2,0x7d]
 
-v_cmpx_gt_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xdc,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc2,0x7d]
 
-v_cmpx_gt_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xdc,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_lt_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc2,0x7d]
 
-v_cmpx_gt_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xdc,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_lt_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc2,0x7d]
 
-v_cmpx_gt_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xdc,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_lt_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc2,0x7d]
 
-v_cmpx_gt_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xdc,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_lt_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc2,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_gt_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xdc,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_lt_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc2,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_gt_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xdc,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_lt_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc2,0x7d]
 
-v_cmpx_gt_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xdc,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_lt_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc2,0x7d]
 
-v_cmpx_gt_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xdc,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_lt_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc3,0x7d]
 
-v_cmpx_gt_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xdc,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_lt_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe1,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xba,0x7d]
+v_cmp_lt_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe1,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xba,0x7d]
+v_cmp_lt_i64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xe1,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xba,0x7d]
+v_cmp_lt_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe1,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xba,0x7d]
+v_cmp_lt_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe1,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xba,0x7d]
+v_cmp_lt_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe1,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xba,0x7d]
+v_cmp_lt_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe1,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xba,0x7d]
+v_cmp_lt_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe1,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xba,0x7d]
+v_cmp_lt_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe1,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xba,0x7d]
+v_cmp_lt_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xe1,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xba,0x7d]
+v_cmp_lt_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe1,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xba,0x7d]
+v_cmp_lt_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xe1,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xba,0x7d]
+v_cmp_lt_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe1,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xba,0x7d]
+v_cmp_lt_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe1,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_ne_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xba,0x7d]
+v_cmp_lt_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe1,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_ne_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xba,0x7d]
+v_cmp_lt_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xe1,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmpx_ne_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xba,0x7d]
+v_cmp_lt_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe1,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_ne_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xba,0x7d]
+v_cmp_lt_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xe1,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmpx_ne_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xba,0x7d]
+v_cmp_lt_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe1,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_ne_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xba,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_lt_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe1,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_ne_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xba,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_eq_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc4,0x7d]
 
-v_cmpx_ne_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xba,0x7d]
+v_cmp_eq_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc4,0x7d]
 
-v_cmpx_ne_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xba,0x7d]
+v_cmp_eq_i64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xc4,0x7d]
 
-v_cmpx_ne_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xbb,0x7d]
+v_cmp_eq_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xc4,0x7d]
 
-v_cmpx_ne_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xdd,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc4,0x7d]
 
-v_cmpx_ne_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xdd,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc4,0x7d]
 
-v_cmpx_ne_u32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xdd,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc4,0x7d]
 
-v_cmpx_ne_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xdd,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc4,0x7d]
 
-v_cmpx_ne_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xdd,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc4,0x7d]
 
-v_cmpx_ne_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xdd,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc4,0x7d]
 
-v_cmpx_ne_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xdd,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc4,0x7d]
 
-v_cmpx_ne_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xdd,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc4,0x7d]
 
-v_cmpx_ne_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xdd,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_eq_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc4,0x7d]
 
-v_cmpx_ne_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xdd,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_eq_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc4,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ne_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xdd,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_eq_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc4,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ne_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xdd,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_eq_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc4,0x7d]
 
-v_cmpx_ne_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xdd,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_eq_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc4,0x7d]
 
-v_cmpx_ne_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xdd,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_eq_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc5,0x7d]
 
-v_cmpx_ne_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xdd,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_eq_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xdd,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_eq_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xdd,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_eq_i64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xe2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xdd,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_eq_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xdd,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_eq_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xdd,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_eq_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xbc,0x7d]
+v_cmp_eq_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xbc,0x7d]
+v_cmp_eq_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xbc,0x7d]
+v_cmp_eq_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xbc,0x7d]
+v_cmp_eq_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xbc,0x7d]
+v_cmp_eq_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xbc,0x7d]
+v_cmp_eq_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xbc,0x7d]
+v_cmp_eq_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xbc,0x7d]
+v_cmp_eq_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe2,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_ge_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xbc,0x7d]
+v_cmp_eq_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe2,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_ge_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xbc,0x7d]
+v_cmp_eq_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xe2,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmpx_ge_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xbc,0x7d]
+v_cmp_eq_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe2,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_ge_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xbc,0x7d]
+v_cmp_eq_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xe2,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmpx_ge_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xbc,0x7d]
+v_cmp_eq_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe2,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_ge_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xbc,0x7d]
+v_cmp_eq_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe2,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_ge_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xbc,0x7d]
+v_cmp_le_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc6,0x7d]
 
-v_cmpx_ge_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xbc,0x7d]
+v_cmp_le_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc6,0x7d]
 
-v_cmpx_ge_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xbc,0x7d]
+v_cmp_le_i64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xc6,0x7d]
 
-v_cmpx_ge_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xbc,0x7d]
+v_cmp_le_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xc6,0x7d]
 
-v_cmpx_ge_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xbc,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_le_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc6,0x7d]
 
-v_cmpx_ge_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xbc,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_le_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc6,0x7d]
 
-v_cmpx_ge_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xbc,0x7d]
+v_cmp_le_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc6,0x7d]
 
-v_cmpx_ge_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xbc,0x7d]
+v_cmp_le_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc6,0x7d]
 
-v_cmpx_ge_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xbd,0x7d]
+v_cmp_le_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc6,0x7d]
 
-v_cmpx_ge_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xde,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc6,0x7d]
 
-v_cmpx_ge_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xde,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc6,0x7d]
 
-v_cmpx_ge_u32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xde,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc6,0x7d]
 
-v_cmpx_ge_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xde,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc6,0x7d]
 
-v_cmpx_ge_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xde,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc6,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ge_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xde,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc6,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ge_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xde,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc6,0x7d]
 
-v_cmpx_ge_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xde,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc6,0x7d]
 
-v_cmpx_ge_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xde,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_le_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc7,0x7d]
 
-v_cmpx_ge_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xde,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_le_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe3,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xde,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_le_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe3,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xde,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_le_i64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xe3,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xde,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_le_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe3,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xde,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_le_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe3,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xde,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_le_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe3,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xde,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_le_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe3,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xde,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_le_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe3,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xde,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_le_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe3,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_ge_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xde,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_le_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xe3,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmpx_ge_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xde,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_le_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe3,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_t_u32 vcc, s0, v0
-// CHECK: [0x00,0x00,0xbe,0x7d]
+v_cmp_le_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xe3,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmpx_t_u32 vcc, s101, v0
-// CHECK: [0x65,0x00,0xbe,0x7d]
+v_cmp_le_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe3,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_t_u32 vcc, flat_scratch_lo, v0
-// CHECK: [0x66,0x00,0xbe,0x7d]
+v_cmp_le_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe3,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_t_u32 vcc, flat_scratch_hi, v0
-// CHECK: [0x67,0x00,0xbe,0x7d]
+v_cmp_le_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe3,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_t_u32 vcc, vcc_lo, v0
-// CHECK: [0x6a,0x00,0xbe,0x7d]
+v_cmp_le_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xe3,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmpx_t_u32 vcc, vcc_hi, v0
-// CHECK: [0x6b,0x00,0xbe,0x7d]
+v_cmp_le_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe3,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_t_u32 vcc, tba_lo, v0
-// CHECK: [0x6c,0x00,0xbe,0x7d]
+v_cmp_le_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xe3,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmpx_t_u32 vcc, tba_hi, v0
-// CHECK: [0x6d,0x00,0xbe,0x7d]
+v_cmp_le_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe3,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_t_u32 vcc, tma_lo, v0
-// CHECK: [0x6e,0x00,0xbe,0x7d]
+v_cmp_le_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe3,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_t_u32 vcc, tma_hi, v0
-// CHECK: [0x6f,0x00,0xbe,0x7d]
+v_cmp_gt_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xc8,0x7d]
 
-v_cmpx_t_u32 vcc, ttmp11, v0
-// CHECK: [0x7b,0x00,0xbe,0x7d]
+v_cmp_gt_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xc8,0x7d]
 
-v_cmpx_t_u32 vcc, m0, v0
-// CHECK: [0x7c,0x00,0xbe,0x7d]
+v_cmp_gt_i64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xc8,0x7d]
 
-v_cmpx_t_u32 vcc, exec_lo, v0
-// CHECK: [0x7e,0x00,0xbe,0x7d]
+v_cmp_gt_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xc8,0x7d]
 
-v_cmpx_t_u32 vcc, exec_hi, v0
-// CHECK: [0x7f,0x00,0xbe,0x7d]
+v_cmp_gt_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xc8,0x7d]
 
-v_cmpx_t_u32 vcc, 0, v0
-// CHECK: [0x80,0x00,0xbe,0x7d]
+v_cmp_gt_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xc8,0x7d]
 
-v_cmpx_t_u32 vcc, -1, v0
-// CHECK: [0xc1,0x00,0xbe,0x7d]
+v_cmp_gt_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xc8,0x7d]
 
-v_cmpx_t_u32 vcc, 0.5, v0
-// CHECK: [0xf0,0x00,0xbe,0x7d]
+v_cmp_gt_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xc8,0x7d]
 
-v_cmpx_t_u32 vcc, -4.0, v0
-// CHECK: [0xf7,0x00,0xbe,0x7d]
+v_cmp_gt_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xc8,0x7d]
 
-v_cmpx_t_u32 vcc, 0xaf123456, v0
-// CHECK: [0xff,0x00,0xbe,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_gt_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xc8,0x7d]
 
-v_cmpx_t_u32 vcc, 0x3f717273, v0
-// CHECK: [0xff,0x00,0xbe,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_gt_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xc8,0x7d]
 
-v_cmpx_t_u32 vcc, v0, v0
-// CHECK: [0x00,0x01,0xbe,0x7d]
+v_cmp_gt_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xc8,0x7d]
 
-v_cmpx_t_u32 vcc, v255, v0
-// CHECK: [0xff,0x01,0xbe,0x7d]
+v_cmp_gt_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xc8,0x7d]
 
-v_cmpx_t_u32 vcc, s0, v255
-// CHECK: [0x00,0xfe,0xbf,0x7d]
+v_cmp_gt_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xc8,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_t_u32_e64 s[0:1], s0, s0
-// CHECK: [0x00,0x00,0xdf,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xc8,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_t_u32_e64 s[2:3], s0, s0
-// CHECK: [0x02,0x00,0xdf,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xc8,0x7d]
 
-v_cmpx_t_u32_e64 s[100:101], s0, s0
-// CHECK: [0x64,0x00,0xdf,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xc8,0x7d]
 
-v_cmpx_t_u32_e64 flat_scratch, s0, s0
-// CHECK: [0x66,0x00,0xdf,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xc9,0x7d]
 
-v_cmpx_t_u32_e64 vcc, s0, s0
-// CHECK: [0x6a,0x00,0xdf,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_u32_e64 tba, s0, s0
-// CHECK: [0x6c,0x00,0xdf,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_u32_e64 tma, s0, s0
-// CHECK: [0x6e,0x00,0xdf,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xe4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_u32_e64 ttmp[10:11], s0, s0
-// CHECK: [0x7a,0x00,0xdf,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], 0, s0
-// CHECK: [0x00,0x00,0xdf,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], -1, s0
-// CHECK: [0x00,0x00,0xdf,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], 0.5, s0
-// CHECK: [0x00,0x00,0xdf,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], -4.0, s0
-// CHECK: [0x00,0x00,0xdf,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_gt_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], v0, s0
-// CHECK: [0x00,0x00,0xdf,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_gt_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], v255, s0
-// CHECK: [0x00,0x00,0xdf,0xd0,0xff,0x01,0x00,0x00]
+v_cmp_gt_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], s0, 0
-// CHECK: [0x00,0x00,0xdf,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_gt_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], s0, -1
-// CHECK: [0x00,0x00,0xdf,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_gt_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], s0, 0.5
-// CHECK: [0x00,0x00,0xdf,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_gt_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], s0, -4.0
-// CHECK: [0x00,0x00,0xdf,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_gt_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe4,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], s0, v0
-// CHECK: [0x00,0x00,0xdf,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_gt_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe4,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_t_u32_e64 s[0:1], s0, v255
-// CHECK: [0x00,0x00,0xdf,0xd0,0x00,0xfe,0x03,0x00]
+v_cmp_gt_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xe4,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmp_f_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc0,0x7d]
+v_cmp_gt_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe4,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_f_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc0,0x7d]
+v_cmp_gt_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xe4,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmp_f_i64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xc0,0x7d]
+v_cmp_gt_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe4,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_f_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xc0,0x7d]
+v_cmp_gt_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe4,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_f_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc0,0x7d]
+v_cmp_ne_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xca,0x7d]
 
-v_cmp_f_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc0,0x7d]
+v_cmp_ne_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xca,0x7d]
 
-v_cmp_f_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc0,0x7d]
+v_cmp_ne_i64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xca,0x7d]
 
-v_cmp_f_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc0,0x7d]
+v_cmp_ne_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xca,0x7d]
 
-v_cmp_f_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc0,0x7d]
+v_cmp_ne_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xca,0x7d]
 
-v_cmp_f_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc0,0x7d]
+v_cmp_ne_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xca,0x7d]
 
-v_cmp_f_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc0,0x7d]
+v_cmp_ne_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xca,0x7d]
 
-v_cmp_f_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc0,0x7d]
+v_cmp_ne_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xca,0x7d]
 
-v_cmp_f_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc0,0x7d]
+v_cmp_ne_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xca,0x7d]
 
-v_cmp_f_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc0,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_ne_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xca,0x7d]
 
-v_cmp_f_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc0,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_ne_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xca,0x7d]
 
-v_cmp_f_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc0,0x7d]
+v_cmp_ne_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xca,0x7d]
 
-v_cmp_f_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc0,0x7d]
+v_cmp_ne_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xca,0x7d]
 
-v_cmp_f_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc1,0x7d]
+v_cmp_ne_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xca,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_f_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xca,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_f_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xca,0x7d]
 
-v_cmp_f_i64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xe0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xca,0x7d]
 
-v_cmp_f_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xcb,0x7d]
 
-v_cmp_f_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe5,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe5,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xe5,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe0,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe5,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe5,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe5,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe5,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_ne_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe5,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ne_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe5,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_ne_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xe5,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe0,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ne_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe5,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xe0,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_ne_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xe5,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmp_f_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe0,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_ne_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe5,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_f_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xe0,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_ne_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe5,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_f_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe0,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ne_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe5,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_f_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe0,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_ne_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xe5,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmp_lt_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc2,0x7d]
+v_cmp_ne_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe5,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_lt_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc2,0x7d]
+v_cmp_ne_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xe5,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmp_lt_i64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xc2,0x7d]
+v_cmp_ne_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe5,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_lt_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xc2,0x7d]
+v_cmp_ne_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe5,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_lt_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc2,0x7d]
+v_cmp_ge_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xcc,0x7d]
 
-v_cmp_lt_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc2,0x7d]
+v_cmp_ge_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xcc,0x7d]
 
-v_cmp_lt_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc2,0x7d]
+v_cmp_ge_i64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xcc,0x7d]
 
-v_cmp_lt_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc2,0x7d]
+v_cmp_ge_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xcc,0x7d]
 
-v_cmp_lt_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc2,0x7d]
+v_cmp_ge_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xcc,0x7d]
 
-v_cmp_lt_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc2,0x7d]
+v_cmp_ge_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xcc,0x7d]
 
-v_cmp_lt_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc2,0x7d]
+v_cmp_ge_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xcc,0x7d]
 
-v_cmp_lt_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc2,0x7d]
+v_cmp_ge_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xcc,0x7d]
 
-v_cmp_lt_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc2,0x7d]
+v_cmp_ge_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xcc,0x7d]
 
-v_cmp_lt_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc2,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_ge_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xcc,0x7d]
 
-v_cmp_lt_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc2,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_ge_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xcc,0x7d]
 
-v_cmp_lt_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc2,0x7d]
+v_cmp_ge_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xcc,0x7d]
 
-v_cmp_lt_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc2,0x7d]
+v_cmp_ge_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xcc,0x7d]
 
-v_cmp_lt_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc3,0x7d]
+v_cmp_ge_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xcc,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_lt_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xcc,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_lt_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xcc,0x7d]
 
-v_cmp_lt_i64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xe1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xcc,0x7d]
 
-v_cmp_lt_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xcd,0x7d]
 
-v_cmp_lt_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xe6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe1,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe1,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xe1,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe1,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xe1,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_ge_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe1,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ge_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe1,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_ge_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe1,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ge_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xe1,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_ge_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe1,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_ge_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xe1,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_ge_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe6,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe1,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ge_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe6,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_lt_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe1,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_ge_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xe6,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmp_eq_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc4,0x7d]
+v_cmp_ge_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe6,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_eq_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc4,0x7d]
+v_cmp_ge_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xe6,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmp_eq_i64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xc4,0x7d]
+v_cmp_ge_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe6,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_eq_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xc4,0x7d]
+v_cmp_ge_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe6,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_eq_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc4,0x7d]
+v_cmp_t_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xce,0x7d]
 
-v_cmp_eq_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc4,0x7d]
+v_cmp_t_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xce,0x7d]
 
-v_cmp_eq_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc4,0x7d]
+v_cmp_t_i64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xce,0x7d]
 
-v_cmp_eq_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc4,0x7d]
+v_cmp_t_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xce,0x7d]
 
-v_cmp_eq_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc4,0x7d]
+v_cmp_t_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xce,0x7d]
 
-v_cmp_eq_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc4,0x7d]
+v_cmp_t_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xce,0x7d]
 
-v_cmp_eq_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc4,0x7d]
+v_cmp_t_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xce,0x7d]
 
-v_cmp_eq_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc4,0x7d]
+v_cmp_t_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xce,0x7d]
 
-v_cmp_eq_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc4,0x7d]
+v_cmp_t_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xce,0x7d]
 
-v_cmp_eq_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc4,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_t_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xce,0x7d]
 
-v_cmp_eq_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc4,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_t_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xce,0x7d]
 
-v_cmp_eq_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc4,0x7d]
+v_cmp_t_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xce,0x7d]
 
-v_cmp_eq_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc4,0x7d]
+v_cmp_t_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xce,0x7d]
 
-v_cmp_eq_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc5,0x7d]
+v_cmp_t_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xce,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_eq_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xce,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_eq_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xce,0x7d]
 
-v_cmp_eq_i64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xe2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xce,0x7d]
 
-v_cmp_eq_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xcf,0x7d]
 
-v_cmp_eq_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe7,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe7,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xe7,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe2,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe7,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_t_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe7,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_t_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe7,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_t_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe7,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_t_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe7,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_t_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe7,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_t_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xe7,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe2,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_t_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe7,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xe2,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_t_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xe7,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe2,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_t_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe7,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xe2,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_t_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe7,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe2,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_t_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe7,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_eq_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe2,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_t_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xe7,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmp_le_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc6,0x7d]
+v_cmp_t_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe7,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_le_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc6,0x7d]
+v_cmp_t_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xe7,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmp_le_i64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xc6,0x7d]
+v_cmp_t_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe7,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_le_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xc6,0x7d]
+v_cmp_t_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe7,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_le_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc6,0x7d]
+v_cmp_f_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xd0,0x7d]
 
-v_cmp_le_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc6,0x7d]
+v_cmp_f_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xd0,0x7d]
 
-v_cmp_le_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc6,0x7d]
+v_cmp_f_u64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xd0,0x7d]
 
-v_cmp_le_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc6,0x7d]
+v_cmp_f_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xd0,0x7d]
 
-v_cmp_le_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc6,0x7d]
+v_cmp_f_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xd0,0x7d]
 
-v_cmp_le_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc6,0x7d]
+v_cmp_f_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xd0,0x7d]
 
-v_cmp_le_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc6,0x7d]
+v_cmp_f_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xd0,0x7d]
 
-v_cmp_le_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc6,0x7d]
+v_cmp_f_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xd0,0x7d]
 
-v_cmp_le_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc6,0x7d]
+v_cmp_f_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xd0,0x7d]
 
-v_cmp_le_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc6,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_f_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xd0,0x7d]
 
-v_cmp_le_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc6,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_f_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xd0,0x7d]
 
-v_cmp_le_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc6,0x7d]
+v_cmp_f_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xd0,0x7d]
 
-v_cmp_le_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc6,0x7d]
+v_cmp_f_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xd0,0x7d]
 
-v_cmp_le_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc7,0x7d]
+v_cmp_f_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xd0,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_le_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xd0,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_le_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xd0,0x7d]
 
-v_cmp_le_i64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xe3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xd0,0x7d]
 
-v_cmp_le_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xd1,0x7d]
 
-v_cmp_le_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xe8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe3,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_f_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_f_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_f_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_f_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_f_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_f_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_f_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe3,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_f_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xe3,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_f_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmp_le_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe3,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_f_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_le_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xe3,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_f_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe8,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_le_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe3,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_f_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe8,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_le_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe3,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_f_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xe8,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmp_gt_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xc8,0x7d]
+v_cmp_f_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe8,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_gt_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xc8,0x7d]
+v_cmp_f_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xe8,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmp_gt_i64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xc8,0x7d]
+v_cmp_f_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe8,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_gt_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xc8,0x7d]
+v_cmp_f_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe8,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_gt_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xc8,0x7d]
+v_cmp_lt_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xd2,0x7d]
 
-v_cmp_gt_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xc8,0x7d]
+v_cmp_lt_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xd2,0x7d]
 
-v_cmp_gt_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xc8,0x7d]
+v_cmp_lt_u64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xd2,0x7d]
 
-v_cmp_gt_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xc8,0x7d]
+v_cmp_lt_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xd2,0x7d]
 
-v_cmp_gt_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xc8,0x7d]
+v_cmp_lt_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xd2,0x7d]
 
-v_cmp_gt_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xc8,0x7d]
+v_cmp_lt_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xd2,0x7d]
 
-v_cmp_gt_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xc8,0x7d]
+v_cmp_lt_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xd2,0x7d]
 
-v_cmp_gt_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xc8,0x7d]
+v_cmp_lt_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xd2,0x7d]
 
-v_cmp_gt_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xc8,0x7d]
+v_cmp_lt_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xd2,0x7d]
 
-v_cmp_gt_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xc8,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_lt_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xd2,0x7d]
 
-v_cmp_gt_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xc8,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_lt_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xd2,0x7d]
 
-v_cmp_gt_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xc8,0x7d]
+v_cmp_lt_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xd2,0x7d]
 
-v_cmp_gt_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xc8,0x7d]
+v_cmp_lt_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xd2,0x7d]
 
-v_cmp_gt_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xc9,0x7d]
+v_cmp_lt_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xd2,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_gt_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xd2,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_gt_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xd2,0x7d]
 
-v_cmp_gt_i64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xe4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xd2,0x7d]
 
-v_cmp_gt_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xd3,0x7d]
 
-v_cmp_gt_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xe9,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xe9,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xe9,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe4,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xe9,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xe9,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xe9,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xe9,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_lt_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xe9,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_lt_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xe9,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_lt_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xe9,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe4,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_lt_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xe9,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xe4,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_lt_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xe9,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe4,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_lt_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xe9,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xe4,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_lt_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xe9,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe4,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_lt_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xe9,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_gt_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe4,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_lt_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xe9,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmp_ne_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xca,0x7d]
+v_cmp_lt_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xe9,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_ne_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xca,0x7d]
+v_cmp_lt_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xe9,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmp_ne_i64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xca,0x7d]
+v_cmp_lt_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xe9,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_ne_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xca,0x7d]
+v_cmp_lt_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xe9,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_ne_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xca,0x7d]
+v_cmp_eq_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xd4,0x7d]
 
-v_cmp_ne_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xca,0x7d]
+v_cmp_eq_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xd4,0x7d]
 
-v_cmp_ne_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xca,0x7d]
+v_cmp_eq_u64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xd4,0x7d]
 
-v_cmp_ne_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xca,0x7d]
+v_cmp_eq_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xd4,0x7d]
 
-v_cmp_ne_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xca,0x7d]
+v_cmp_eq_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xd4,0x7d]
 
-v_cmp_ne_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xca,0x7d]
+v_cmp_eq_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xd4,0x7d]
 
-v_cmp_ne_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xca,0x7d]
+v_cmp_eq_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xd4,0x7d]
 
-v_cmp_ne_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xca,0x7d]
+v_cmp_eq_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xd4,0x7d]
 
-v_cmp_ne_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xca,0x7d]
+v_cmp_eq_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xd4,0x7d]
 
-v_cmp_ne_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xca,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_eq_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xd4,0x7d]
 
-v_cmp_ne_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xca,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_eq_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xd4,0x7d]
 
-v_cmp_ne_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xca,0x7d]
+v_cmp_eq_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xd4,0x7d]
 
-v_cmp_ne_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xca,0x7d]
+v_cmp_eq_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xd4,0x7d]
 
-v_cmp_ne_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xcb,0x7d]
+v_cmp_eq_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xd4,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_ne_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xd4,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_ne_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xd4,0x7d]
 
-v_cmp_ne_i64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xe5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xd4,0x7d]
 
-v_cmp_ne_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xd5,0x7d]
 
-v_cmp_ne_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xea,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xea,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe5,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xea,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xea,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xea,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xea,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_eq_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xea,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_eq_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_eq_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe5,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_eq_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xe5,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_eq_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe5,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_eq_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xe5,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_eq_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xea,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe5,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_eq_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xea,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_ne_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe5,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_eq_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xea,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmp_ge_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xcc,0x7d]
+v_cmp_eq_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xea,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_ge_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xcc,0x7d]
+v_cmp_eq_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xea,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmp_ge_i64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xcc,0x7d]
+v_cmp_eq_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xea,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_ge_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xcc,0x7d]
+v_cmp_eq_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xea,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_ge_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xcc,0x7d]
+v_cmp_le_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xd6,0x7d]
 
-v_cmp_ge_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xcc,0x7d]
+v_cmp_le_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xd6,0x7d]
 
-v_cmp_ge_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xcc,0x7d]
+v_cmp_le_u64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xd6,0x7d]
 
-v_cmp_ge_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xcc,0x7d]
+v_cmp_le_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xd6,0x7d]
 
-v_cmp_ge_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xcc,0x7d]
+v_cmp_le_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xd6,0x7d]
 
-v_cmp_ge_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xcc,0x7d]
+v_cmp_le_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xd6,0x7d]
 
-v_cmp_ge_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xcc,0x7d]
+v_cmp_le_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xd6,0x7d]
 
-v_cmp_ge_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xcc,0x7d]
+v_cmp_le_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xd6,0x7d]
 
-v_cmp_ge_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xcc,0x7d]
+v_cmp_le_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xd6,0x7d]
 
-v_cmp_ge_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xcc,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_le_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xd6,0x7d]
 
-v_cmp_ge_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xcc,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_le_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xd6,0x7d]
 
-v_cmp_ge_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xcc,0x7d]
+v_cmp_le_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xd6,0x7d]
 
-v_cmp_ge_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xcc,0x7d]
+v_cmp_le_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xd6,0x7d]
 
-v_cmp_ge_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xcd,0x7d]
+v_cmp_le_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xd6,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_ge_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xd6,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_ge_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xd6,0x7d]
 
-v_cmp_ge_i64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xe6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xd6,0x7d]
 
-v_cmp_ge_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xd7,0x7d]
 
-v_cmp_ge_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xeb,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xeb,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xeb,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe6,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_le_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xeb,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_le_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xeb,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_le_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xeb,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_le_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xeb,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_le_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xeb,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_le_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xeb,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_le_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xeb,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe6,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_le_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xeb,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xe6,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_le_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xeb,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe6,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_le_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xeb,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xe6,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_le_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xeb,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe6,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_le_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xeb,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_ge_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe6,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_le_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xeb,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmp_t_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xce,0x7d]
+v_cmp_le_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xeb,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_t_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xce,0x7d]
+v_cmp_le_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xeb,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmp_t_i64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xce,0x7d]
+v_cmp_le_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xeb,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_t_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xce,0x7d]
+v_cmp_le_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xeb,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_t_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xce,0x7d]
+v_cmp_gt_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xd8,0x7d]
 
-v_cmp_t_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xce,0x7d]
+v_cmp_gt_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xd8,0x7d]
 
-v_cmp_t_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xce,0x7d]
+v_cmp_gt_u64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xd8,0x7d]
 
-v_cmp_t_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xce,0x7d]
+v_cmp_gt_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xd8,0x7d]
 
-v_cmp_t_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xce,0x7d]
+v_cmp_gt_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xd8,0x7d]
 
-v_cmp_t_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xce,0x7d]
+v_cmp_gt_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xd8,0x7d]
 
-v_cmp_t_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xce,0x7d]
+v_cmp_gt_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xd8,0x7d]
 
-v_cmp_t_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xce,0x7d]
+v_cmp_gt_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xd8,0x7d]
 
-v_cmp_t_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xce,0x7d]
+v_cmp_gt_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xd8,0x7d]
 
-v_cmp_t_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xce,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_gt_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xd8,0x7d]
 
-v_cmp_t_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xce,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_gt_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xd8,0x7d]
 
-v_cmp_t_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xce,0x7d]
+v_cmp_gt_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xd8,0x7d]
 
-v_cmp_t_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xce,0x7d]
+v_cmp_gt_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xd8,0x7d]
 
-v_cmp_t_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xcf,0x7d]
+v_cmp_gt_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xd8,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_t_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe7,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xd8,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_t_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe7,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xd8,0x7d]
 
-v_cmp_t_i64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xe7,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xd8,0x7d]
 
-v_cmp_t_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe7,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xd9,0x7d]
 
-v_cmp_t_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe7,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe7,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xec,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe7,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xec,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe7,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xec,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe7,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xec,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xe7,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xec,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe7,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xec,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xe7,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_gt_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xec,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe7,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_gt_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe7,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_gt_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe7,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_gt_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xe7,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_gt_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmp_t_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe7,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_gt_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_t_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xe7,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_gt_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xec,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_t_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe7,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_gt_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xec,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_t_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe7,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_gt_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xec,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmp_f_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd0,0x7d]
+v_cmp_gt_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xec,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_f_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xd0,0x7d]
+v_cmp_gt_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xec,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmp_f_u64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xd0,0x7d]
+v_cmp_gt_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xec,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_f_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xd0,0x7d]
+v_cmp_gt_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xec,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_f_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xd0,0x7d]
+v_cmp_ne_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xda,0x7d]
 
-v_cmp_f_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xd0,0x7d]
+v_cmp_ne_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xda,0x7d]
 
-v_cmp_f_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xd0,0x7d]
+v_cmp_ne_u64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xda,0x7d]
 
-v_cmp_f_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xd0,0x7d]
+v_cmp_ne_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xda,0x7d]
 
-v_cmp_f_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xd0,0x7d]
+v_cmp_ne_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xda,0x7d]
 
-v_cmp_f_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xd0,0x7d]
+v_cmp_ne_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xda,0x7d]
 
-v_cmp_f_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xd0,0x7d]
+v_cmp_ne_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xda,0x7d]
 
-v_cmp_f_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xd0,0x7d]
+v_cmp_ne_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xda,0x7d]
 
-v_cmp_f_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xd0,0x7d]
+v_cmp_ne_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xda,0x7d]
 
-v_cmp_f_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xd0,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_ne_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xda,0x7d]
 
-v_cmp_f_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xd0,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_ne_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xda,0x7d]
 
-v_cmp_f_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xd0,0x7d]
+v_cmp_ne_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xda,0x7d]
 
-v_cmp_f_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xd0,0x7d]
+v_cmp_ne_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xda,0x7d]
 
-v_cmp_f_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xd1,0x7d]
+v_cmp_ne_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xda,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_f_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xda,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_f_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xda,0x7d]
 
-v_cmp_f_u64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xe8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xda,0x7d]
 
-v_cmp_f_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xdb,0x7d]
 
-v_cmp_f_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xed,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xed,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xed,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe8,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xed,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xed,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xed,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xed,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_ne_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xed,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ne_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xed,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_ne_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xed,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe8,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ne_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xed,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xe8,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_ne_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xed,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmp_f_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe8,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_ne_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xed,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_f_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xe8,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_ne_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xed,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_f_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe8,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ne_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xed,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_f_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe8,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_ne_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xed,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmp_lt_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd2,0x7d]
+v_cmp_ne_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xed,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_lt_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xd2,0x7d]
+v_cmp_ne_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xed,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmp_lt_u64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xd2,0x7d]
+v_cmp_ne_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xed,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_lt_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xd2,0x7d]
+v_cmp_ne_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xed,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_lt_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xd2,0x7d]
+v_cmp_ge_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xdc,0x7d]
 
-v_cmp_lt_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xd2,0x7d]
+v_cmp_ge_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xdc,0x7d]
 
-v_cmp_lt_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xd2,0x7d]
+v_cmp_ge_u64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xdc,0x7d]
 
-v_cmp_lt_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xd2,0x7d]
+v_cmp_ge_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xdc,0x7d]
 
-v_cmp_lt_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xd2,0x7d]
+v_cmp_ge_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xdc,0x7d]
 
-v_cmp_lt_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xd2,0x7d]
+v_cmp_ge_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xdc,0x7d]
 
-v_cmp_lt_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xd2,0x7d]
+v_cmp_ge_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xdc,0x7d]
 
-v_cmp_lt_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xd2,0x7d]
+v_cmp_ge_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xdc,0x7d]
 
-v_cmp_lt_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xd2,0x7d]
+v_cmp_ge_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xdc,0x7d]
 
-v_cmp_lt_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xd2,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_ge_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xdc,0x7d]
 
-v_cmp_lt_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xd2,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_ge_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xdc,0x7d]
 
-v_cmp_lt_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xd2,0x7d]
+v_cmp_ge_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xdc,0x7d]
 
-v_cmp_lt_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xd2,0x7d]
+v_cmp_ge_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xdc,0x7d]
 
-v_cmp_lt_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xd3,0x7d]
+v_cmp_ge_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xdc,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_lt_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe9,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xdc,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_lt_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xe9,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xdc,0x7d]
 
-v_cmp_lt_u64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xe9,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xdc,0x7d]
 
-v_cmp_lt_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xe9,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xdd,0x7d]
 
-v_cmp_lt_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xe9,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xe9,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xee,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xe9,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xee,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xe9,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xee,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xe9,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xee,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xe9,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xee,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xe9,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xee,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xe9,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_ge_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xee,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xe9,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_ge_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xe9,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_ge_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xe9,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_ge_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xe9,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_ge_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xe9,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_ge_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xe9,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_ge_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xee,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe9,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_ge_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xee,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_lt_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xe9,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_ge_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xee,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmp_eq_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd4,0x7d]
+v_cmp_ge_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xee,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_eq_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xd4,0x7d]
+v_cmp_ge_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xee,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmp_eq_u64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xd4,0x7d]
+v_cmp_ge_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xee,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_eq_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xd4,0x7d]
+v_cmp_ge_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xee,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_eq_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xd4,0x7d]
+v_cmp_t_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xde,0x7d]
 
-v_cmp_eq_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xd4,0x7d]
+v_cmp_t_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xde,0x7d]
 
-v_cmp_eq_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xd4,0x7d]
+v_cmp_t_u64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xde,0x7d]
 
-v_cmp_eq_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xd4,0x7d]
+v_cmp_t_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xde,0x7d]
 
-v_cmp_eq_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xd4,0x7d]
+v_cmp_t_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xde,0x7d]
 
-v_cmp_eq_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xd4,0x7d]
+v_cmp_t_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xde,0x7d]
 
-v_cmp_eq_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xd4,0x7d]
+v_cmp_t_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xde,0x7d]
 
-v_cmp_eq_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xd4,0x7d]
+v_cmp_t_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xde,0x7d]
 
-v_cmp_eq_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xd4,0x7d]
+v_cmp_t_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xde,0x7d]
 
-v_cmp_eq_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xd4,0x7d,0x56,0x34,0x12,0xaf]
+v_cmp_t_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xde,0x7d]
 
-v_cmp_eq_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xd4,0x7d,0x73,0x72,0x71,0x3f]
+v_cmp_t_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xde,0x7d]
 
-v_cmp_eq_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xd4,0x7d]
+v_cmp_t_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xde,0x7d]
 
-v_cmp_eq_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xd4,0x7d]
+v_cmp_t_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xde,0x7d]
 
-v_cmp_eq_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xd5,0x7d]
+v_cmp_t_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xde,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_eq_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xde,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_eq_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xea,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xde,0x7d]
 
-v_cmp_eq_u64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xea,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xde,0x7d]
 
-v_cmp_eq_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xea,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xdf,0x7d]
 
-v_cmp_eq_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xea,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xef,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xea,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xef,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xea,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xef,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xea,0xd0,0x00,0x00,0x00,0x00]
+v_cmp_t_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xef,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd0,0x80,0x00,0x00,0x00]
+v_cmp_t_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xef,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd0,0xc1,0x00,0x00,0x00]
+v_cmp_t_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xef,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd0,0xf0,0x00,0x00,0x00]
+v_cmp_t_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xef,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd0,0xf7,0x00,0x00,0x00]
+v_cmp_t_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xef,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd0,0x00,0x01,0x00,0x00]
+v_cmp_t_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xef,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xea,0xd0,0xfe,0x01,0x00,0x00]
+v_cmp_t_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xef,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xea,0xd0,0x00,0x00,0x01,0x00]
+v_cmp_t_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xef,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xea,0xd0,0x00,0x82,0x01,0x00]
+v_cmp_t_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xef,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xea,0xd0,0x00,0xe0,0x01,0x00]
+v_cmp_t_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xef,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xea,0xd0,0x00,0xee,0x01,0x00]
+v_cmp_t_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xef,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xea,0xd0,0x00,0x00,0x02,0x00]
+v_cmp_t_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xef,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_eq_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xea,0xd0,0x00,0xfc,0x03,0x00]
+v_cmp_t_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xef,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmp_le_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd6,0x7d]
+v_cmp_t_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xef,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_le_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xd6,0x7d]
+v_cmp_t_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xef,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmp_le_u64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xd6,0x7d]
+v_cmp_t_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xef,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_le_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xd6,0x7d]
+v_cmp_t_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xef,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_le_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xd6,0x7d]
+v_cmpx_f_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe0,0x7d]
 
-v_cmp_le_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xd6,0x7d]
+v_cmpx_f_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe0,0x7d]
 
-v_cmp_le_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xd6,0x7d]
+v_cmpx_f_i64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xe0,0x7d]
 
-v_cmp_le_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xd6,0x7d]
+v_cmpx_f_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xe0,0x7d]
 
-v_cmp_le_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xd6,0x7d]
+v_cmpx_f_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe0,0x7d]
 
-v_cmp_le_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xd6,0x7d]
+v_cmpx_f_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe0,0x7d]
 
-v_cmp_le_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xd6,0x7d]
+v_cmpx_f_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe0,0x7d]
 
-v_cmp_le_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xd6,0x7d]
+v_cmpx_f_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe0,0x7d]
 
-v_cmp_le_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xd6,0x7d]
+v_cmpx_f_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe0,0x7d]
 
-v_cmp_le_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xd6,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_f_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe0,0x7d]
 
-v_cmp_le_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xd6,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_f_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe0,0x7d]
 
-v_cmp_le_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xd6,0x7d]
+v_cmpx_f_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe0,0x7d]
 
-v_cmp_le_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xd6,0x7d]
+v_cmpx_f_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe0,0x7d]
 
-v_cmp_le_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xd7,0x7d]
+v_cmpx_f_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe0,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_le_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xeb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe0,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_le_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xeb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe0,0x7d]
 
-v_cmp_le_u64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xeb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe0,0x7d]
 
-v_cmp_le_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xeb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe1,0x7d]
 
-v_cmp_le_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xeb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xeb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xf0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xeb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xf0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xeb,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xf0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xeb,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xf0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xeb,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xf0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xeb,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xf0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xeb,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_f_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xf0,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xeb,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_f_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xf0,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xeb,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_f_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xf0,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xeb,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_f_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xf0,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xeb,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_f_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xf0,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmp_le_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xeb,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_f_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xf0,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_le_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xeb,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_f_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xf0,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_le_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xeb,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_f_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xf0,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_le_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xeb,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_f_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xf0,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmp_gt_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xd8,0x7d]
+v_cmpx_f_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xf0,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_gt_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xd8,0x7d]
+v_cmpx_f_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xf0,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmp_gt_u64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xd8,0x7d]
+v_cmpx_f_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xf0,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_gt_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xd8,0x7d]
+v_cmpx_f_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xf0,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_gt_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xd8,0x7d]
+v_cmpx_lt_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe2,0x7d]
 
-v_cmp_gt_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xd8,0x7d]
+v_cmpx_lt_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe2,0x7d]
 
-v_cmp_gt_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xd8,0x7d]
+v_cmpx_lt_i64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xe2,0x7d]
 
-v_cmp_gt_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xd8,0x7d]
+v_cmpx_lt_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xe2,0x7d]
 
-v_cmp_gt_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xd8,0x7d]
+v_cmpx_lt_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe2,0x7d]
 
-v_cmp_gt_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xd8,0x7d]
+v_cmpx_lt_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe2,0x7d]
 
-v_cmp_gt_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xd8,0x7d]
+v_cmpx_lt_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe2,0x7d]
 
-v_cmp_gt_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xd8,0x7d]
+v_cmpx_lt_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe2,0x7d]
 
-v_cmp_gt_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xd8,0x7d]
+v_cmpx_lt_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe2,0x7d]
 
-v_cmp_gt_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xd8,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_lt_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe2,0x7d]
 
-v_cmp_gt_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xd8,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_lt_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe2,0x7d]
 
-v_cmp_gt_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xd8,0x7d]
+v_cmpx_lt_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe2,0x7d]
 
-v_cmp_gt_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xd8,0x7d]
+v_cmpx_lt_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe2,0x7d]
 
-v_cmp_gt_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xd9,0x7d]
+v_cmpx_lt_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe2,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_gt_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe2,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_gt_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xec,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe2,0x7d]
 
-v_cmp_gt_u64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xec,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe2,0x7d]
 
-v_cmp_gt_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xec,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe3,0x7d]
 
-v_cmp_gt_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xec,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf1,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xec,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xf1,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xec,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xf1,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xec,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xf1,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xf1,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xf1,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xf1,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_lt_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xf1,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_lt_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xf1,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xec,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_lt_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xf1,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xec,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_lt_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xf1,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xec,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_lt_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xf1,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xec,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_lt_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xf1,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xec,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_lt_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xf1,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xec,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_lt_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xf1,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_gt_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xec,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_lt_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xf1,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmp_ne_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xda,0x7d]
+v_cmpx_lt_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xf1,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_ne_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xda,0x7d]
+v_cmpx_lt_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xf1,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmp_ne_u64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xda,0x7d]
+v_cmpx_lt_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xf1,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_ne_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xda,0x7d]
+v_cmpx_lt_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xf1,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_ne_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xda,0x7d]
+v_cmpx_eq_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe4,0x7d]
 
-v_cmp_ne_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xda,0x7d]
+v_cmpx_eq_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe4,0x7d]
 
-v_cmp_ne_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xda,0x7d]
+v_cmpx_eq_i64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xe4,0x7d]
 
-v_cmp_ne_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xda,0x7d]
+v_cmpx_eq_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xe4,0x7d]
 
-v_cmp_ne_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xda,0x7d]
+v_cmpx_eq_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe4,0x7d]
 
-v_cmp_ne_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xda,0x7d]
+v_cmpx_eq_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe4,0x7d]
 
-v_cmp_ne_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xda,0x7d]
+v_cmpx_eq_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe4,0x7d]
 
-v_cmp_ne_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xda,0x7d]
+v_cmpx_eq_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe4,0x7d]
 
-v_cmp_ne_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xda,0x7d]
+v_cmpx_eq_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe4,0x7d]
 
-v_cmp_ne_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xda,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_eq_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe4,0x7d]
 
-v_cmp_ne_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xda,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_eq_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe4,0x7d]
 
-v_cmp_ne_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xda,0x7d]
+v_cmpx_eq_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe4,0x7d]
 
-v_cmp_ne_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xda,0x7d]
+v_cmpx_eq_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe4,0x7d]
 
-v_cmp_ne_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xdb,0x7d]
+v_cmpx_eq_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe4,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_ne_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xed,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe4,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_ne_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xed,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe4,0x7d]
 
-v_cmp_ne_u64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xed,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe4,0x7d]
 
-v_cmp_ne_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xed,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe5,0x7d]
 
-v_cmp_ne_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xed,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xed,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xf2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xed,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xf2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xed,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xf2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xed,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xf2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xed,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xf2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xed,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xf2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xed,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_eq_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xf2,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xed,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_eq_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xf2,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xed,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_eq_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xf2,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xed,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_eq_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xf2,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xed,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_eq_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xf2,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xed,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_eq_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xf2,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xed,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_eq_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xf2,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xed,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_eq_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xf2,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_ne_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xed,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_eq_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xf2,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmp_ge_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xdc,0x7d]
+v_cmpx_eq_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xf2,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_ge_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xdc,0x7d]
+v_cmpx_eq_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xf2,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmp_ge_u64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xdc,0x7d]
+v_cmpx_eq_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xf2,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_ge_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xdc,0x7d]
+v_cmpx_eq_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xf2,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_ge_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xdc,0x7d]
+v_cmpx_le_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe6,0x7d]
 
-v_cmp_ge_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xdc,0x7d]
+v_cmpx_le_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe6,0x7d]
 
-v_cmp_ge_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xdc,0x7d]
+v_cmpx_le_i64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xe6,0x7d]
 
-v_cmp_ge_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xdc,0x7d]
+v_cmpx_le_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xe6,0x7d]
 
-v_cmp_ge_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xdc,0x7d]
+v_cmpx_le_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe6,0x7d]
 
-v_cmp_ge_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xdc,0x7d]
+v_cmpx_le_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe6,0x7d]
 
-v_cmp_ge_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xdc,0x7d]
+v_cmpx_le_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe6,0x7d]
 
-v_cmp_ge_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xdc,0x7d]
+v_cmpx_le_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe6,0x7d]
 
-v_cmp_ge_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xdc,0x7d]
+v_cmpx_le_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe6,0x7d]
 
-v_cmp_ge_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xdc,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_le_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe6,0x7d]
 
-v_cmp_ge_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xdc,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_le_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe6,0x7d]
 
-v_cmp_ge_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xdc,0x7d]
+v_cmpx_le_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe6,0x7d]
 
-v_cmp_ge_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xdc,0x7d]
+v_cmpx_le_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe6,0x7d]
 
-v_cmp_ge_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xdd,0x7d]
+v_cmpx_le_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe6,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_ge_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe6,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_ge_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xee,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe6,0x7d]
 
-v_cmp_ge_u64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xee,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe6,0x7d]
 
-v_cmp_ge_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xee,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe7,0x7d]
 
-v_cmp_ge_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xee,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf3,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xee,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xf3,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xee,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xf3,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xee,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xf3,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xf3,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xf3,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xf3,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_le_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xf3,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_le_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xf3,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xee,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_le_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xf3,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xee,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_le_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xf3,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xee,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_le_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xf3,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xee,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_le_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xf3,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xee,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_le_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xf3,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xee,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_le_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xf3,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_ge_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xee,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_le_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xf3,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmp_t_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xde,0x7d]
+v_cmpx_le_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xf3,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmp_t_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xde,0x7d]
+v_cmpx_le_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xf3,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmp_t_u64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xde,0x7d]
+v_cmpx_le_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xf3,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmp_t_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xde,0x7d]
+v_cmpx_le_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xf3,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmp_t_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xde,0x7d]
+v_cmpx_gt_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xe8,0x7d]
 
-v_cmp_t_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xde,0x7d]
+v_cmpx_gt_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xe8,0x7d]
 
-v_cmp_t_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xde,0x7d]
+v_cmpx_gt_i64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xe8,0x7d]
 
-v_cmp_t_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xde,0x7d]
+v_cmpx_gt_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xe8,0x7d]
 
-v_cmp_t_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xde,0x7d]
+v_cmpx_gt_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xe8,0x7d]
 
-v_cmp_t_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xde,0x7d]
+v_cmpx_gt_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xe8,0x7d]
 
-v_cmp_t_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xde,0x7d]
+v_cmpx_gt_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xe8,0x7d]
 
-v_cmp_t_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xde,0x7d]
+v_cmpx_gt_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xe8,0x7d]
 
-v_cmp_t_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xde,0x7d]
+v_cmpx_gt_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xe8,0x7d]
 
-v_cmp_t_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xde,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_gt_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xe8,0x7d]
 
-v_cmp_t_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xde,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_gt_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xe8,0x7d]
 
-v_cmp_t_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xde,0x7d]
+v_cmpx_gt_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xe8,0x7d]
 
-v_cmp_t_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xde,0x7d]
+v_cmpx_gt_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xe8,0x7d]
 
-v_cmp_t_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xdf,0x7d]
+v_cmpx_gt_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xe8,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmp_t_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xef,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xe8,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmp_t_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xef,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xe8,0x7d]
 
-v_cmp_t_u64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xef,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xe8,0x7d]
 
-v_cmp_t_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xef,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xe9,0x7d]
 
-v_cmp_t_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xef,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xef,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xf4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xef,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xf4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xef,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xf4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xef,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xf4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xef,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xf4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xef,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xf4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xef,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_gt_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xf4,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xef,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_gt_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xf4,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xef,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_gt_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xf4,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xef,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_gt_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xf4,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xef,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_gt_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xf4,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmp_t_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xef,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_gt_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xf4,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmp_t_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xef,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_gt_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xf4,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmp_t_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xef,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_gt_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xf4,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmp_t_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xef,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_gt_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xf4,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmpx_f_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe0,0x7d]
+v_cmpx_gt_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xf4,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_f_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe0,0x7d]
+v_cmpx_gt_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xf4,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmpx_f_i64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xe0,0x7d]
+v_cmpx_gt_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xf4,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_f_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xe0,0x7d]
+v_cmpx_gt_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xf4,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_f_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe0,0x7d]
+v_cmpx_ne_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xea,0x7d]
 
-v_cmpx_f_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe0,0x7d]
+v_cmpx_ne_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xea,0x7d]
 
-v_cmpx_f_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe0,0x7d]
+v_cmpx_ne_i64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xea,0x7d]
 
-v_cmpx_f_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe0,0x7d]
+v_cmpx_ne_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xea,0x7d]
 
-v_cmpx_f_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe0,0x7d]
+v_cmpx_ne_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xea,0x7d]
 
-v_cmpx_f_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe0,0x7d]
+v_cmpx_ne_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xea,0x7d]
 
-v_cmpx_f_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe0,0x7d]
+v_cmpx_ne_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xea,0x7d]
 
-v_cmpx_f_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe0,0x7d]
+v_cmpx_ne_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xea,0x7d]
 
-v_cmpx_f_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe0,0x7d]
+v_cmpx_ne_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xea,0x7d]
 
-v_cmpx_f_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe0,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_ne_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xea,0x7d]
 
-v_cmpx_f_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe0,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_ne_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xea,0x7d]
 
-v_cmpx_f_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe0,0x7d]
+v_cmpx_ne_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xea,0x7d]
 
-v_cmpx_f_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe0,0x7d]
+v_cmpx_ne_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xea,0x7d]
 
-v_cmpx_f_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe1,0x7d]
+v_cmpx_ne_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xea,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_f_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xea,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_f_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xf0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xea,0x7d]
 
-v_cmpx_f_i64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xf0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xea,0x7d]
 
-v_cmpx_f_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xf0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xeb,0x7d]
 
-v_cmpx_f_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xf0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf5,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xf0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xf5,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xf0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xf5,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xf0,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xf5,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xf0,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xf5,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xf0,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xf5,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xf0,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xf5,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xf0,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_ne_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xf5,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf0,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ne_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xf5,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xf0,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_ne_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xf5,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xf0,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ne_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xf5,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xf0,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_ne_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xf5,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xf0,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_ne_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xf5,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xf0,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_ne_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xf5,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf0,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ne_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xf5,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_f_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xf0,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_ne_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xf5,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmpx_lt_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe2,0x7d]
+v_cmpx_ne_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xf5,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_lt_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe2,0x7d]
+v_cmpx_ne_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xf5,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmpx_lt_i64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xe2,0x7d]
+v_cmpx_ne_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xf5,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_lt_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xe2,0x7d]
+v_cmpx_ne_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xf5,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_lt_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe2,0x7d]
+v_cmpx_ge_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xec,0x7d]
 
-v_cmpx_lt_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe2,0x7d]
+v_cmpx_ge_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xec,0x7d]
 
-v_cmpx_lt_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe2,0x7d]
+v_cmpx_ge_i64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xec,0x7d]
 
-v_cmpx_lt_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe2,0x7d]
+v_cmpx_ge_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xec,0x7d]
 
-v_cmpx_lt_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe2,0x7d]
+v_cmpx_ge_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xec,0x7d]
 
-v_cmpx_lt_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe2,0x7d]
+v_cmpx_ge_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xec,0x7d]
 
-v_cmpx_lt_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe2,0x7d]
+v_cmpx_ge_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xec,0x7d]
 
-v_cmpx_lt_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe2,0x7d]
+v_cmpx_ge_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xec,0x7d]
 
-v_cmpx_lt_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe2,0x7d]
+v_cmpx_ge_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xec,0x7d]
 
-v_cmpx_lt_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe2,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_ge_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xec,0x7d]
 
-v_cmpx_lt_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe2,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_ge_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xec,0x7d]
 
-v_cmpx_lt_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe2,0x7d]
+v_cmpx_ge_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xec,0x7d]
 
-v_cmpx_lt_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe2,0x7d]
+v_cmpx_ge_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xec,0x7d]
 
-v_cmpx_lt_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe3,0x7d]
+v_cmpx_ge_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xec,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_lt_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xec,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_lt_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xf1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xec,0x7d]
 
-v_cmpx_lt_i64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xf1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xec,0x7d]
 
-v_cmpx_lt_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xf1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xed,0x7d]
 
-v_cmpx_lt_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xf1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xf1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xf6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xf1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xf6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xf1,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xf6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xf1,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xf6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xf1,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xf6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xf1,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xf6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xf1,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_ge_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xf6,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf1,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ge_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xf6,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xf1,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_ge_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xf6,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xf1,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ge_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xf6,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xf1,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_ge_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xf6,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xf1,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_ge_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xf6,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xf1,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_ge_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xf6,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf1,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ge_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xf6,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_lt_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xf1,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_ge_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xf6,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmpx_eq_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe4,0x7d]
+v_cmpx_ge_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xf6,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_eq_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe4,0x7d]
+v_cmpx_ge_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xf6,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmpx_eq_i64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xe4,0x7d]
+v_cmpx_ge_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xf6,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_eq_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xe4,0x7d]
+v_cmpx_ge_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xf6,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_eq_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe4,0x7d]
+v_cmpx_t_i64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xee,0x7d]
 
-v_cmpx_eq_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe4,0x7d]
+v_cmpx_t_i64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xee,0x7d]
 
-v_cmpx_eq_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe4,0x7d]
+v_cmpx_t_i64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xee,0x7d]
 
-v_cmpx_eq_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe4,0x7d]
+v_cmpx_t_i64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xee,0x7d]
 
-v_cmpx_eq_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe4,0x7d]
+v_cmpx_t_i64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xee,0x7d]
 
-v_cmpx_eq_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe4,0x7d]
+v_cmpx_t_i64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xee,0x7d]
 
-v_cmpx_eq_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe4,0x7d]
+v_cmpx_t_i64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xee,0x7d]
 
-v_cmpx_eq_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe4,0x7d]
+v_cmpx_t_i64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xee,0x7d]
 
-v_cmpx_eq_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe4,0x7d]
+v_cmpx_t_i64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xee,0x7d]
 
-v_cmpx_eq_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe4,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_t_i64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xee,0x7d]
 
-v_cmpx_eq_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe4,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_t_i64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xee,0x7d]
 
-v_cmpx_eq_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe4,0x7d]
+v_cmpx_t_i64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xee,0x7d]
 
-v_cmpx_eq_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe4,0x7d]
+v_cmpx_t_i64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xee,0x7d]
 
-v_cmpx_eq_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe5,0x7d]
+v_cmpx_t_i64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xee,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_eq_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xee,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_eq_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xf2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xee,0x7d]
 
-v_cmpx_eq_i64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xf2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xee,0x7d]
 
-v_cmpx_eq_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xf2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xef,0x7d]
 
-v_cmpx_eq_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xf2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf7,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xf2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xf7,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xf2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xf7,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xf2,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xf7,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xf2,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xf7,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xf2,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xf7,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xf2,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xf7,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xf2,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_t_i64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xf7,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf2,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_t_i64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xf7,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xf2,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_t_i64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xf7,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xf2,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_t_i64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xf7,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xf2,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_t_i64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xf7,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xf2,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_t_i64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xf7,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xf2,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_t_i64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xf7,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf2,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_t_i64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xf7,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_eq_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xf2,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_t_i64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xf7,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmpx_le_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe6,0x7d]
+v_cmpx_t_i64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xf7,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_le_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe6,0x7d]
+v_cmpx_t_i64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xf7,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmpx_le_i64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xe6,0x7d]
+v_cmpx_t_i64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xf7,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_le_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xe6,0x7d]
+v_cmpx_t_i64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xf7,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_le_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe6,0x7d]
+v_cmpx_f_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xf0,0x7d]
 
-v_cmpx_le_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe6,0x7d]
+v_cmpx_f_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xf0,0x7d]
 
-v_cmpx_le_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe6,0x7d]
+v_cmpx_f_u64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xf0,0x7d]
 
-v_cmpx_le_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe6,0x7d]
+v_cmpx_f_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xf0,0x7d]
 
-v_cmpx_le_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe6,0x7d]
+v_cmpx_f_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xf0,0x7d]
 
-v_cmpx_le_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe6,0x7d]
+v_cmpx_f_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xf0,0x7d]
 
-v_cmpx_le_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe6,0x7d]
+v_cmpx_f_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xf0,0x7d]
 
-v_cmpx_le_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe6,0x7d]
+v_cmpx_f_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xf0,0x7d]
 
-v_cmpx_le_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe6,0x7d]
+v_cmpx_f_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xf0,0x7d]
 
-v_cmpx_le_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe6,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_f_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xf0,0x7d]
 
-v_cmpx_le_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe6,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_f_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xf0,0x7d]
 
-v_cmpx_le_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe6,0x7d]
+v_cmpx_f_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xf0,0x7d]
 
-v_cmpx_le_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe6,0x7d]
+v_cmpx_f_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xf0,0x7d]
 
-v_cmpx_le_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe7,0x7d]
+v_cmpx_f_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xf0,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_le_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xf0,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_le_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xf3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xf0,0x7d]
 
-v_cmpx_le_i64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xf3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xf0,0x7d]
 
-v_cmpx_le_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xf3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xf1,0x7d]
 
-v_cmpx_le_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xf3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xf3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xf8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xf3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xf8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xf3,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xf8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xf3,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xf8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xf3,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xf8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xf3,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xf8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xf3,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_f_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xf8,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf3,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_f_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xf8,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xf3,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_f_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xf8,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xf3,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_f_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xf8,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xf3,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_f_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xf8,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xf3,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_f_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xf8,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xf3,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_f_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xf8,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf3,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_f_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xf8,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_le_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xf3,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_f_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xf8,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmpx_gt_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xe8,0x7d]
+v_cmpx_f_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xf8,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_gt_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xe8,0x7d]
+v_cmpx_f_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xf8,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmpx_gt_i64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xe8,0x7d]
+v_cmpx_f_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xf8,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_gt_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xe8,0x7d]
+v_cmpx_f_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xf8,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_gt_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xe8,0x7d]
+v_cmpx_lt_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xf2,0x7d]
 
-v_cmpx_gt_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xe8,0x7d]
+v_cmpx_lt_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xf2,0x7d]
 
-v_cmpx_gt_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xe8,0x7d]
+v_cmpx_lt_u64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xf2,0x7d]
 
-v_cmpx_gt_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xe8,0x7d]
+v_cmpx_lt_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xf2,0x7d]
 
-v_cmpx_gt_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xe8,0x7d]
+v_cmpx_lt_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xf2,0x7d]
 
-v_cmpx_gt_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xe8,0x7d]
+v_cmpx_lt_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xf2,0x7d]
 
-v_cmpx_gt_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xe8,0x7d]
+v_cmpx_lt_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xf2,0x7d]
 
-v_cmpx_gt_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xe8,0x7d]
+v_cmpx_lt_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xf2,0x7d]
 
-v_cmpx_gt_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xe8,0x7d]
+v_cmpx_lt_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xf2,0x7d]
 
-v_cmpx_gt_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xe8,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_lt_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xf2,0x7d]
 
-v_cmpx_gt_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xe8,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_lt_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xf2,0x7d]
 
-v_cmpx_gt_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xe8,0x7d]
+v_cmpx_lt_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xf2,0x7d]
 
-v_cmpx_gt_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xe8,0x7d]
+v_cmpx_lt_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xf2,0x7d]
 
-v_cmpx_gt_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xe9,0x7d]
+v_cmpx_lt_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xf2,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_gt_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xf2,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_gt_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xf4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xf2,0x7d]
 
-v_cmpx_gt_i64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xf4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xf2,0x7d]
 
-v_cmpx_gt_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xf4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xf3,0x7d]
 
-v_cmpx_gt_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xf4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xf9,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xf4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xf9,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xf4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xf9,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xf4,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xf9,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xf4,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xf9,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xf4,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xf9,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xf4,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xf9,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xf4,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_lt_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xf9,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_gt_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf4,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_lt_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xf9,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_gt_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xf4,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_lt_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xf9,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmpx_gt_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xf4,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_lt_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xf9,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_gt_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xf4,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_lt_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xf9,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmpx_gt_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xf4,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_lt_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xf9,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_gt_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xf4,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_lt_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xf9,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_gt_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf4,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_lt_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xf9,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_gt_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xf4,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_lt_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xf9,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmpx_ne_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xea,0x7d]
+v_cmpx_lt_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xf9,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_ne_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xea,0x7d]
+v_cmpx_lt_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xf9,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmpx_ne_i64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xea,0x7d]
+v_cmpx_lt_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xf9,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_ne_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xea,0x7d]
+v_cmpx_lt_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xf9,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_ne_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xea,0x7d]
+v_cmpx_eq_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xf4,0x7d]
 
-v_cmpx_ne_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xea,0x7d]
+v_cmpx_eq_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xf4,0x7d]
 
-v_cmpx_ne_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xea,0x7d]
+v_cmpx_eq_u64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xf4,0x7d]
 
-v_cmpx_ne_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xea,0x7d]
+v_cmpx_eq_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xf4,0x7d]
 
-v_cmpx_ne_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xea,0x7d]
+v_cmpx_eq_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xf4,0x7d]
 
-v_cmpx_ne_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xea,0x7d]
+v_cmpx_eq_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xf4,0x7d]
 
-v_cmpx_ne_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xea,0x7d]
+v_cmpx_eq_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xf4,0x7d]
 
-v_cmpx_ne_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xea,0x7d]
+v_cmpx_eq_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xf4,0x7d]
 
-v_cmpx_ne_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xea,0x7d]
+v_cmpx_eq_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xf4,0x7d]
 
-v_cmpx_ne_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xea,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_eq_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xf4,0x7d]
 
-v_cmpx_ne_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xea,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_eq_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xf4,0x7d]
 
-v_cmpx_ne_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xea,0x7d]
+v_cmpx_eq_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xf4,0x7d]
 
-v_cmpx_ne_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xea,0x7d]
+v_cmpx_eq_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xf4,0x7d]
 
-v_cmpx_ne_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xeb,0x7d]
+v_cmpx_eq_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xf4,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ne_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xf4,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ne_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xf5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xf4,0x7d]
 
-v_cmpx_ne_i64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xf5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xf4,0x7d]
 
-v_cmpx_ne_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xf5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xf5,0x7d]
 
-v_cmpx_ne_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xf5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xfa,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xf5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xfa,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xf5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xfa,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xf5,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xfa,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xf5,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xfa,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xf5,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xfa,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xf5,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xfa,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xf5,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_eq_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xfa,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ne_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf5,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_eq_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xfa,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_ne_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xf5,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_eq_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xfa,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmpx_ne_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xf5,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_eq_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xfa,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_ne_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xf5,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_eq_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xfa,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmpx_ne_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xf5,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_eq_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xfa,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_ne_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xf5,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_eq_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xfa,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_ne_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf5,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_eq_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xfa,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_ne_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xf5,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_eq_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xfa,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmpx_ge_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xec,0x7d]
+v_cmpx_eq_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xfa,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_ge_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xec,0x7d]
+v_cmpx_eq_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xfa,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmpx_ge_i64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xec,0x7d]
+v_cmpx_eq_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xfa,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_ge_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xec,0x7d]
+v_cmpx_eq_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xfa,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_ge_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xec,0x7d]
+v_cmpx_le_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xf6,0x7d]
 
-v_cmpx_ge_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xec,0x7d]
+v_cmpx_le_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xf6,0x7d]
 
-v_cmpx_ge_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xec,0x7d]
+v_cmpx_le_u64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xf6,0x7d]
 
-v_cmpx_ge_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xec,0x7d]
+v_cmpx_le_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xf6,0x7d]
 
-v_cmpx_ge_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xec,0x7d]
+v_cmpx_le_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xf6,0x7d]
 
-v_cmpx_ge_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xec,0x7d]
+v_cmpx_le_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xf6,0x7d]
 
-v_cmpx_ge_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xec,0x7d]
+v_cmpx_le_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xf6,0x7d]
 
-v_cmpx_ge_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xec,0x7d]
+v_cmpx_le_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xf6,0x7d]
 
-v_cmpx_ge_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xec,0x7d]
+v_cmpx_le_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xf6,0x7d]
 
-v_cmpx_ge_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xec,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_le_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xf6,0x7d]
 
-v_cmpx_ge_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xec,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_le_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xf6,0x7d]
 
-v_cmpx_ge_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xec,0x7d]
+v_cmpx_le_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xf6,0x7d]
 
-v_cmpx_ge_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xec,0x7d]
+v_cmpx_le_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xf6,0x7d]
 
-v_cmpx_ge_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xed,0x7d]
+v_cmpx_le_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xf6,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_ge_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xf6,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_ge_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xf6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xf6,0x7d]
 
-v_cmpx_ge_i64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xf6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xf6,0x7d]
 
-v_cmpx_ge_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xf6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xf7,0x7d]
 
-v_cmpx_ge_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xf6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xfb,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xf6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xfb,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xf6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xfb,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xf6,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xfb,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xf6,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xfb,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xf6,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xfb,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xf6,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xfb,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xf6,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_le_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xfb,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf6,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_le_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xfb,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xf6,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_le_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xfb,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xf6,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_le_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xfb,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xf6,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_le_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xfb,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xf6,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_le_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xfb,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xf6,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_le_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xfb,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf6,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_le_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xfb,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_ge_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xf6,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_le_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xfb,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmpx_t_i64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xee,0x7d]
+v_cmpx_le_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xfb,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_t_i64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xee,0x7d]
+v_cmpx_le_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xfb,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmpx_t_i64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xee,0x7d]
+v_cmpx_le_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xfb,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_t_i64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xee,0x7d]
+v_cmpx_le_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xfb,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_t_i64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xee,0x7d]
+v_cmpx_gt_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xf8,0x7d]
 
-v_cmpx_t_i64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xee,0x7d]
+v_cmpx_gt_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xf8,0x7d]
 
-v_cmpx_t_i64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xee,0x7d]
+v_cmpx_gt_u64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xf8,0x7d]
 
-v_cmpx_t_i64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xee,0x7d]
+v_cmpx_gt_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xf8,0x7d]
 
-v_cmpx_t_i64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xee,0x7d]
+v_cmpx_gt_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xf8,0x7d]
 
-v_cmpx_t_i64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xee,0x7d]
+v_cmpx_gt_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xf8,0x7d]
 
-v_cmpx_t_i64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xee,0x7d]
+v_cmpx_gt_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xf8,0x7d]
 
-v_cmpx_t_i64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xee,0x7d]
+v_cmpx_gt_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xf8,0x7d]
 
-v_cmpx_t_i64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xee,0x7d]
+v_cmpx_gt_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xf8,0x7d]
 
-v_cmpx_t_i64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xee,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_gt_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xf8,0x7d]
 
-v_cmpx_t_i64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xee,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_gt_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xf8,0x7d]
 
-v_cmpx_t_i64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xee,0x7d]
+v_cmpx_gt_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xf8,0x7d]
 
-v_cmpx_t_i64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xee,0x7d]
+v_cmpx_gt_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xf8,0x7d]
 
-v_cmpx_t_i64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xef,0x7d]
+v_cmpx_gt_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xf8,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_t_i64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xf8,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_t_i64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xf7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xf8,0x7d]
 
-v_cmpx_t_i64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xf7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xf8,0x7d]
 
-v_cmpx_t_i64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xf7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xf9,0x7d]
 
-v_cmpx_t_i64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xf7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xfc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_i64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xf7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xfc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_i64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xf7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xfc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_i64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xf7,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xfc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_i64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xf7,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xfc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_i64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xf7,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xfc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_i64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xf7,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xfc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_i64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xf7,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_gt_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xfc,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_t_i64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf7,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_gt_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xfc,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_t_i64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xf7,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_gt_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xfc,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmpx_t_i64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xf7,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_gt_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xfc,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_t_i64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xf7,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_gt_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xfc,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmpx_t_i64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xf7,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_gt_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xfc,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_t_i64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xf7,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_gt_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xfc,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_t_i64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf7,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_gt_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xfc,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_t_i64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xf7,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_gt_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xfc,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmpx_f_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf0,0x7d]
+v_cmpx_gt_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xfc,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_f_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xf0,0x7d]
+v_cmpx_gt_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xfc,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmpx_f_u64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xf0,0x7d]
+v_cmpx_gt_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xfc,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_f_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xf0,0x7d]
+v_cmpx_gt_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xfc,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_f_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xf0,0x7d]
+v_cmpx_ne_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xfa,0x7d]
 
-v_cmpx_f_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xf0,0x7d]
+v_cmpx_ne_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xfa,0x7d]
 
-v_cmpx_f_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xf0,0x7d]
+v_cmpx_ne_u64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xfa,0x7d]
 
-v_cmpx_f_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xf0,0x7d]
+v_cmpx_ne_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xfa,0x7d]
 
-v_cmpx_f_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xf0,0x7d]
+v_cmpx_ne_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xfa,0x7d]
 
-v_cmpx_f_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xf0,0x7d]
+v_cmpx_ne_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xfa,0x7d]
 
-v_cmpx_f_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xf0,0x7d]
+v_cmpx_ne_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xfa,0x7d]
 
-v_cmpx_f_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xf0,0x7d]
+v_cmpx_ne_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xfa,0x7d]
 
-v_cmpx_f_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xf0,0x7d]
+v_cmpx_ne_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xfa,0x7d]
 
-v_cmpx_f_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xf0,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_ne_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xfa,0x7d]
 
-v_cmpx_f_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xf0,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_ne_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xfa,0x7d]
 
-v_cmpx_f_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xf0,0x7d]
+v_cmpx_ne_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xfa,0x7d]
 
-v_cmpx_f_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xf0,0x7d]
+v_cmpx_ne_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xfa,0x7d]
 
-v_cmpx_f_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xf1,0x7d]
+v_cmpx_ne_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xfa,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_f_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xfa,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_f_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xf8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xfa,0x7d]
 
-v_cmpx_f_u64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xf8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xfa,0x7d]
 
-v_cmpx_f_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xf8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xfb,0x7d]
 
-v_cmpx_f_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xf8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xfd,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xf8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xfd,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xf8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xfd,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xf8,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xfd,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xf8,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xfd,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xf8,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xfd,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xf8,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xfd,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xf8,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_ne_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xfd,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf8,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ne_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xfd,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xf8,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_ne_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xfd,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xf8,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ne_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xfd,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xf8,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_ne_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xfd,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xf8,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_ne_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xfd,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xf8,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_ne_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xfd,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf8,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ne_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xfd,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_f_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xf8,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_ne_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xfd,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmpx_lt_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf2,0x7d]
+v_cmpx_ne_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xfd,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_lt_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xf2,0x7d]
+v_cmpx_ne_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xfd,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmpx_lt_u64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xf2,0x7d]
+v_cmpx_ne_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xfd,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_lt_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xf2,0x7d]
+v_cmpx_ne_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xfd,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_lt_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xf2,0x7d]
+v_cmpx_ge_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xfc,0x7d]
 
-v_cmpx_lt_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xf2,0x7d]
+v_cmpx_ge_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xfc,0x7d]
 
-v_cmpx_lt_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xf2,0x7d]
+v_cmpx_ge_u64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xfc,0x7d]
 
-v_cmpx_lt_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xf2,0x7d]
+v_cmpx_ge_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xfc,0x7d]
 
-v_cmpx_lt_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xf2,0x7d]
+v_cmpx_ge_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xfc,0x7d]
 
-v_cmpx_lt_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xf2,0x7d]
+v_cmpx_ge_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xfc,0x7d]
 
-v_cmpx_lt_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xf2,0x7d]
+v_cmpx_ge_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xfc,0x7d]
 
-v_cmpx_lt_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xf2,0x7d]
+v_cmpx_ge_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xfc,0x7d]
 
-v_cmpx_lt_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xf2,0x7d]
+v_cmpx_ge_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xfc,0x7d]
 
-v_cmpx_lt_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xf2,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_ge_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xfc,0x7d]
 
-v_cmpx_lt_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xf2,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_ge_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xfc,0x7d]
 
-v_cmpx_lt_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xf2,0x7d]
+v_cmpx_ge_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xfc,0x7d]
 
-v_cmpx_lt_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xf2,0x7d]
+v_cmpx_ge_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xfc,0x7d]
 
-v_cmpx_lt_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xf3,0x7d]
+v_cmpx_ge_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xfc,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_lt_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xfc,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_lt_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xf9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xfc,0x7d]
 
-v_cmpx_lt_u64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xf9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xfc,0x7d]
 
-v_cmpx_lt_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xf9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xfd,0x7d]
 
-v_cmpx_lt_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xf9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xfe,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xf9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xfe,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xf9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xfe,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xf9,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xfe,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xf9,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xfe,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xf9,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xfe,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xf9,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xfe,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xf9,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_ge_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xfe,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xf9,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_ge_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xfe,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xf9,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_ge_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xfe,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xf9,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_ge_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xfe,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xf9,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_ge_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xfe,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xf9,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_ge_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xfe,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xf9,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_ge_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xfe,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf9,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_ge_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xfe,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_lt_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xf9,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_ge_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xfe,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmpx_eq_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf4,0x7d]
+v_cmpx_ge_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xfe,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_eq_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xf4,0x7d]
+v_cmpx_ge_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xfe,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmpx_eq_u64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xf4,0x7d]
+v_cmpx_ge_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xfe,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_eq_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xf4,0x7d]
+v_cmpx_ge_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xfe,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_eq_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xf4,0x7d]
+v_cmpx_t_u64 vcc, s[2:3], v[2:3]
+// CHECK: [0x02,0x04,0xfe,0x7d]
 
-v_cmpx_eq_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xf4,0x7d]
+v_cmpx_t_u64 vcc, s[4:5], v[2:3]
+// CHECK: [0x04,0x04,0xfe,0x7d]
 
-v_cmpx_eq_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xf4,0x7d]
+v_cmpx_t_u64 vcc, s[100:101], v[2:3]
+// CHECK: [0x64,0x04,0xfe,0x7d]
 
-v_cmpx_eq_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xf4,0x7d]
+v_cmpx_t_u64 vcc, flat_scratch, v[2:3]
+// CHECK: [0x66,0x04,0xfe,0x7d]
 
-v_cmpx_eq_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xf4,0x7d]
+v_cmpx_t_u64 vcc, vcc, v[2:3]
+// CHECK: [0x6a,0x04,0xfe,0x7d]
 
-v_cmpx_eq_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xf4,0x7d]
+v_cmpx_t_u64 vcc, tba, v[2:3]
+// CHECK: [0x6c,0x04,0xfe,0x7d]
 
-v_cmpx_eq_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xf4,0x7d]
+v_cmpx_t_u64 vcc, tma, v[2:3]
+// CHECK: [0x6e,0x04,0xfe,0x7d]
 
-v_cmpx_eq_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xf4,0x7d]
+v_cmpx_t_u64 vcc, ttmp[10:11], v[2:3]
+// CHECK: [0x7a,0x04,0xfe,0x7d]
 
-v_cmpx_eq_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xf4,0x7d]
+v_cmpx_t_u64 vcc, exec, v[2:3]
+// CHECK: [0x7e,0x04,0xfe,0x7d]
 
-v_cmpx_eq_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xf4,0x7d,0x56,0x34,0x12,0xaf]
+v_cmpx_t_u64 vcc, 0, v[2:3]
+// CHECK: [0x80,0x04,0xfe,0x7d]
 
-v_cmpx_eq_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xf4,0x7d,0x73,0x72,0x71,0x3f]
+v_cmpx_t_u64 vcc, -1, v[2:3]
+// CHECK: [0xc1,0x04,0xfe,0x7d]
 
-v_cmpx_eq_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xf4,0x7d]
+v_cmpx_t_u64 vcc, 0.5, v[2:3]
+// CHECK: [0xf0,0x04,0xfe,0x7d]
 
-v_cmpx_eq_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xf4,0x7d]
+v_cmpx_t_u64 vcc, -4.0, v[2:3]
+// CHECK: [0xf7,0x04,0xfe,0x7d]
 
-v_cmpx_eq_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xf5,0x7d]
+v_cmpx_t_u64 vcc, 0xaf123456, v[2:3]
+// CHECK: [0xff,0x04,0xfe,0x7d,0x56,0x34,0x12,0xaf]
 
-v_cmpx_eq_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u64 vcc, 0x3f717273, v[2:3]
+// CHECK: [0xff,0x04,0xfe,0x7d,0x73,0x72,0x71,0x3f]
 
-v_cmpx_eq_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xfa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u64 vcc, v[1:2], v[2:3]
+// CHECK: [0x01,0x05,0xfe,0x7d]
 
-v_cmpx_eq_u64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xfa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u64 vcc, v[254:255], v[2:3]
+// CHECK: [0xfe,0x05,0xfe,0x7d]
 
-v_cmpx_eq_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xfa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u64 vcc, s[2:3], v[254:255]
+// CHECK: [0x02,0xfc,0xff,0x7d]
 
-v_cmpx_eq_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xfa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 s[10:11], s[4:5], s[4:5]
+// CHECK: [0x0a,0x00,0xff,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xfa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 s[12:13], s[4:5], s[4:5]
+// CHECK: [0x0c,0x00,0xff,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xfa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 s[100:101], s[4:5], s[4:5]
+// CHECK: [0x64,0x00,0xff,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xfa,0xd0,0x00,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 flat_scratch, s[4:5], s[4:5]
+// CHECK: [0x66,0x00,0xff,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xfa,0xd0,0x80,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 vcc, s[4:5], s[4:5]
+// CHECK: [0x6a,0x00,0xff,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xfa,0xd0,0xc1,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 tba, s[4:5], s[4:5]
+// CHECK: [0x6c,0x00,0xff,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xfa,0xd0,0xf0,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 tma, s[4:5], s[4:5]
+// CHECK: [0x6e,0x00,0xff,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xfa,0xd0,0xf7,0x00,0x00,0x00]
+v_cmpx_t_u64_e64 ttmp[10:11], s[4:5], s[4:5]
+// CHECK: [0x7a,0x00,0xff,0xd0,0x04,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0xd0,0x00,0x01,0x00,0x00]
+v_cmpx_t_u64_e64 s[10:11], 0, s[4:5]
+// CHECK: [0x0a,0x00,0xff,0xd0,0x80,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xfa,0xd0,0xfe,0x01,0x00,0x00]
+v_cmpx_t_u64_e64 s[10:11], -1, s[4:5]
+// CHECK: [0x0a,0x00,0xff,0xd0,0xc1,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xfa,0xd0,0x00,0x00,0x01,0x00]
+v_cmpx_t_u64_e64 s[10:11], 0.5, s[4:5]
+// CHECK: [0x0a,0x00,0xff,0xd0,0xf0,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xfa,0xd0,0x00,0x82,0x01,0x00]
+v_cmpx_t_u64_e64 s[10:11], -4.0, s[4:5]
+// CHECK: [0x0a,0x00,0xff,0xd0,0xf7,0x08,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xfa,0xd0,0x00,0xe0,0x01,0x00]
+v_cmpx_t_u64_e64 s[10:11], v[1:2], s[4:5]
+// CHECK: [0x0a,0x00,0xff,0xd0,0x01,0x09,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xfa,0xd0,0x00,0xee,0x01,0x00]
+v_cmpx_t_u64_e64 s[10:11], v[254:255], s[4:5]
+// CHECK: [0x0a,0x00,0xff,0xd0,0xfe,0x09,0x00,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xfa,0xd0,0x00,0x00,0x02,0x00]
+v_cmpx_t_u64_e64 s[10:11], s[4:5], 0
+// CHECK: [0x0a,0x00,0xff,0xd0,0x04,0x00,0x01,0x00]
 
-v_cmpx_eq_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xfa,0xd0,0x00,0xfc,0x03,0x00]
+v_cmpx_t_u64_e64 s[10:11], s[4:5], -1
+// CHECK: [0x0a,0x00,0xff,0xd0,0x04,0x82,0x01,0x00]
 
-v_cmpx_le_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf6,0x7d]
+v_cmpx_t_u64_e64 s[10:11], s[4:5], 0.5
+// CHECK: [0x0a,0x00,0xff,0xd0,0x04,0xe0,0x01,0x00]
 
-v_cmpx_le_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xf6,0x7d]
+v_cmpx_t_u64_e64 s[10:11], s[4:5], -4.0
+// CHECK: [0x0a,0x00,0xff,0xd0,0x04,0xee,0x01,0x00]
 
-v_cmpx_le_u64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xf6,0x7d]
+v_cmpx_t_u64_e64 s[10:11], s[4:5], v[2:3]
+// CHECK: [0x0a,0x00,0xff,0xd0,0x04,0x04,0x02,0x00]
 
-v_cmpx_le_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xf6,0x7d]
+v_cmpx_t_u64_e64 s[10:11], s[4:5], v[254:255]
+// CHECK: [0x0a,0x00,0xff,0xd0,0x04,0xfc,0x03,0x00]
 
-v_cmpx_le_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xf6,0x7d]
+v_mov_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x02,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cmpx_le_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xf6,0x7d]
+v_mov_b32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x02,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cmpx_le_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xf6,0x7d]
+v_mov_b32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x02,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cmpx_le_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xf6,0x7d]
+v_mov_b32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x02,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cmpx_le_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xf6,0x7d]
+v_mov_b32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x02,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cmpx_le_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xf6,0x7d]
+v_mov_b32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x02,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cmpx_le_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xf6,0x7d]
+v_mov_b32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x02,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cmpx_le_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xf6,0x7d]
+v_mov_b32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x02,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cmpx_le_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xf6,0x7d]
+v_mov_b32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x02,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cmpx_le_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xf6,0x7d,0x56,0x34,0x12,0xaf]
+v_mov_b32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x02,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cmpx_le_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xf6,0x7d,0x73,0x72,0x71,0x3f]
+v_mov_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x02,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cmpx_le_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xf6,0x7d]
+v_mov_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x02,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xf6,0x7d]
+v_mov_b32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x02,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xf7,0x7d]
+v_mov_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x02,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cmpx_le_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfb,0xd0,0x00,0x00,0x00,0x00]
+v_mov_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x02,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cmpx_le_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xfb,0xd0,0x00,0x00,0x00,0x00]
+v_mov_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x02,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cmpx_le_u64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xfb,0xd0,0x00,0x00,0x00,0x00]
+v_mov_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x02,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cmpx_le_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xfb,0xd0,0x00,0x00,0x00,0x00]
+v_mov_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x02,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cmpx_le_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xfb,0xd0,0x00,0x00,0x00,0x00]
+v_mov_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x02,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cmpx_le_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xfb,0xd0,0x00,0x00,0x00,0x00]
+v_mov_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x02,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cmpx_le_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xfb,0xd0,0x00,0x00,0x00,0x00]
+v_mov_b32_sdwa v5, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x02,0x0a,0x7e,0x01,0x06,0x0e,0x06]
 
-v_cmpx_le_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xfb,0xd0,0x00,0x00,0x00,0x00]
+v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xfb,0xd0,0x80,0x00,0x00,0x00]
+v_mov_b32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x02,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xfb,0xd0,0xc1,0x00,0x00,0x00]
+v_mov_b32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xfb,0xd0,0xf0,0x00,0x00,0x00]
+v_mov_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xfb,0xd0,0xf7,0x00,0x00,0x00]
+v_mov_b32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfb,0xd0,0x00,0x01,0x00,0x00]
+v_mov_b32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xfb,0xd0,0xfe,0x01,0x00,0x00]
+v_mov_b32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xfb,0xd0,0x00,0x00,0x01,0x00]
+v_mov_b32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xfb,0xd0,0x00,0x82,0x01,0x00]
+v_mov_b32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xfb,0xd0,0x00,0xe0,0x01,0x00]
+v_mov_b32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xfb,0xd0,0x00,0xee,0x01,0x00]
+v_mov_b32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xfb,0xd0,0x00,0x00,0x02,0x00]
+v_mov_b32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cmpx_le_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xfb,0xd0,0x00,0xfc,0x03,0x00]
+v_mov_b32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cmpx_gt_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xf8,0x7d]
+v_mov_b32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cmpx_gt_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xf8,0x7d]
+v_mov_b32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cmpx_gt_u64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xf8,0x7d]
+v_mov_b32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cmpx_gt_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xf8,0x7d]
+v_mov_b32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cmpx_gt_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xf8,0x7d]
+v_mov_b32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cmpx_gt_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xf8,0x7d]
+v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cmpx_gt_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xf8,0x7d]
+v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cmpx_gt_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xf8,0x7d]
+v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cmpx_gt_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xf8,0x7d]
+v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cmpx_gt_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xf8,0x7d]
+v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cmpx_gt_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xf8,0x7d]
+v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cmpx_gt_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xf8,0x7d]
+v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cmpx_gt_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xf8,0x7d]
+v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cmpx_gt_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xf8,0x7d,0x56,0x34,0x12,0xaf]
+v_mov_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x02,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cmpx_gt_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xf8,0x7d,0x73,0x72,0x71,0x3f]
+v_cvt_f32_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cmpx_gt_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xf8,0x7d]
+v_cvt_f32_i32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0a,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cmpx_gt_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xf8,0x7d]
+v_cvt_f32_i32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cmpx_gt_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xf9,0x7d]
+v_cvt_f32_i32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cmpx_gt_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfc,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_i32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cmpx_gt_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xfc,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_i32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cmpx_gt_u64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xfc,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_i32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cmpx_gt_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xfc,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_i32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cmpx_gt_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xfc,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_i32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cmpx_gt_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xfc,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_i32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cmpx_gt_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xfc,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_i32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cmpx_gt_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xfc,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cmpx_gt_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xfc,0xd0,0x80,0x00,0x00,0x00]
+v_cvt_f32_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xfc,0xd0,0xc1,0x00,0x00,0x00]
+v_cvt_f32_i32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xfc,0xd0,0xf0,0x00,0x00,0x00]
+v_cvt_f32_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cmpx_gt_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xfc,0xd0,0xf7,0x00,0x00,0x00]
+v_cvt_f32_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cmpx_gt_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfc,0xd0,0x00,0x01,0x00,0x00]
+v_cvt_f32_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cmpx_gt_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xfc,0xd0,0xfe,0x01,0x00,0x00]
+v_cvt_f32_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cmpx_gt_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xfc,0xd0,0x00,0x00,0x01,0x00]
+v_cvt_f32_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cmpx_gt_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xfc,0xd0,0x00,0x82,0x01,0x00]
+v_cvt_f32_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cmpx_gt_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xfc,0xd0,0x00,0xe0,0x01,0x00]
+v_cvt_f32_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cmpx_gt_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xfc,0xd0,0x00,0xee,0x01,0x00]
+v_cvt_f32_i32_sdwa v5, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0a,0x0a,0x7e,0x01,0x06,0x0e,0x06]
 
-v_cmpx_gt_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xfc,0xd0,0x00,0x00,0x02,0x00]
+v_cvt_f32_i32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cmpx_gt_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xfc,0xd0,0x00,0xfc,0x03,0x00]
+v_cvt_f32_i32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cmpx_ne_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xfa,0x7d]
+v_cvt_f32_i32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cmpx_ne_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xfa,0x7d]
+v_cvt_f32_i32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cmpx_ne_u64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xfa,0x7d]
+v_cvt_f32_i32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cmpx_ne_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xfa,0x7d]
+v_cvt_f32_i32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cmpx_ne_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xfa,0x7d]
+v_cvt_f32_i32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cmpx_ne_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xfa,0x7d]
+v_cvt_f32_i32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cmpx_ne_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xfa,0x7d]
+v_cvt_f32_i32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cmpx_ne_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xfa,0x7d]
+v_cvt_f32_i32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cmpx_ne_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xfa,0x7d]
+v_cvt_f32_i32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cmpx_ne_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xfa,0x7d]
+v_cvt_f32_i32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cmpx_ne_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xfa,0x7d]
+v_cvt_f32_i32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cmpx_ne_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xfa,0x7d]
+v_cvt_f32_i32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cmpx_ne_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xfa,0x7d]
+v_cvt_f32_i32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cmpx_ne_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xfa,0x7d,0x56,0x34,0x12,0xaf]
+v_cvt_f32_i32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cmpx_ne_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xfa,0x7d,0x73,0x72,0x71,0x3f]
+v_cvt_f32_i32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cmpx_ne_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xfa,0x7d]
+v_cvt_f32_i32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cmpx_ne_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xfa,0x7d]
+v_cvt_f32_i32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cmpx_ne_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xfb,0x7d]
+v_cvt_f32_i32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cmpx_ne_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfd,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_i32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cmpx_ne_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xfd,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_i32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cmpx_ne_u64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xfd,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_i32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cmpx_ne_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xfd,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_i32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cmpx_ne_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xfd,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_i32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cmpx_ne_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xfd,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_i32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cmpx_ne_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xfd,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_i32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x0a,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cmpx_ne_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xfd,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cmpx_ne_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xfd,0xd0,0x80,0x00,0x00,0x00]
+v_cvt_f32_u32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0c,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cmpx_ne_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xfd,0xd0,0xc1,0x00,0x00,0x00]
+v_cvt_f32_u32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cmpx_ne_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xfd,0xd0,0xf0,0x00,0x00,0x00]
+v_cvt_f32_u32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cmpx_ne_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xfd,0xd0,0xf7,0x00,0x00,0x00]
+v_cvt_f32_u32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cmpx_ne_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfd,0xd0,0x00,0x01,0x00,0x00]
+v_cvt_f32_u32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cmpx_ne_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xfd,0xd0,0xfe,0x01,0x00,0x00]
+v_cvt_f32_u32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cmpx_ne_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xfd,0xd0,0x00,0x00,0x01,0x00]
+v_cvt_f32_u32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cmpx_ne_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xfd,0xd0,0x00,0x82,0x01,0x00]
+v_cvt_f32_u32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cmpx_ne_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xfd,0xd0,0x00,0xe0,0x01,0x00]
+v_cvt_f32_u32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cmpx_ne_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xfd,0xd0,0x00,0xee,0x01,0x00]
+v_cvt_f32_u32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cmpx_ne_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xfd,0xd0,0x00,0x00,0x02,0x00]
+v_cvt_f32_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cmpx_ne_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xfd,0xd0,0x00,0xfc,0x03,0x00]
+v_cvt_f32_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xfc,0x7d]
+v_cvt_f32_u32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xfc,0x7d]
+v_cvt_f32_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cmpx_ge_u64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xfc,0x7d]
+v_cvt_f32_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cmpx_ge_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xfc,0x7d]
+v_cvt_f32_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cmpx_ge_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xfc,0x7d]
+v_cvt_f32_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cmpx_ge_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xfc,0x7d]
+v_cvt_f32_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cmpx_ge_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xfc,0x7d]
+v_cvt_f32_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cmpx_ge_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xfc,0x7d]
+v_cvt_f32_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cmpx_ge_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xfc,0x7d]
+v_cvt_f32_u32_sdwa v5, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0c,0x0a,0x7e,0x01,0x06,0x0e,0x06]
 
-v_cmpx_ge_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xfc,0x7d]
+v_cvt_f32_u32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cmpx_ge_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xfc,0x7d]
+v_cvt_f32_u32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cmpx_ge_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xfc,0x7d]
+v_cvt_f32_u32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cmpx_ge_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xfc,0x7d]
+v_cvt_f32_u32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cmpx_ge_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xfc,0x7d,0x56,0x34,0x12,0xaf]
+v_cvt_f32_u32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cmpx_ge_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xfc,0x7d,0x73,0x72,0x71,0x3f]
+v_cvt_f32_u32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cmpx_ge_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xfc,0x7d]
+v_cvt_f32_u32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cmpx_ge_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xfc,0x7d]
+v_cvt_f32_u32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cmpx_ge_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xfd,0x7d]
+v_cvt_f32_u32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cmpx_ge_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_u32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cmpx_ge_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xfe,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_u32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cmpx_ge_u64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xfe,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_u32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cmpx_ge_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xfe,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_u32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cmpx_ge_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xfe,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_u32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cmpx_ge_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xfe,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_u32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cmpx_ge_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xfe,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_u32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cmpx_ge_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xfe,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_f32_u32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cmpx_ge_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xfe,0xd0,0x80,0x00,0x00,0x00]
+v_cvt_f32_u32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cmpx_ge_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xfe,0xd0,0xc1,0x00,0x00,0x00]
+v_cvt_f32_u32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cmpx_ge_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xfe,0xd0,0xf0,0x00,0x00,0x00]
+v_cvt_f32_u32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cmpx_ge_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xfe,0xd0,0xf7,0x00,0x00,0x00]
+v_cvt_f32_u32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cmpx_ge_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0xd0,0x00,0x01,0x00,0x00]
+v_cvt_f32_u32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cmpx_ge_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xfe,0xd0,0xfe,0x01,0x00,0x00]
+v_cvt_f32_u32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cmpx_ge_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xfe,0xd0,0x00,0x00,0x01,0x00]
+v_cvt_f32_u32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cmpx_ge_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xfe,0xd0,0x00,0x82,0x01,0x00]
+v_cvt_f32_u32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cmpx_ge_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xfe,0xd0,0x00,0xe0,0x01,0x00]
+v_cvt_f32_u32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cmpx_ge_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xfe,0xd0,0x00,0xee,0x01,0x00]
+v_cvt_f32_u32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x0c,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cmpx_ge_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xfe,0xd0,0x00,0x00,0x02,0x00]
+v_cvt_u32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cmpx_ge_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xfe,0xd0,0x00,0xfc,0x03,0x00]
+v_cvt_u32_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0e,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cmpx_t_u64 vcc, s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xfe,0x7d]
+v_cvt_u32_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cmpx_t_u64 vcc, s[2:3], v[0:1]
-// CHECK: [0x02,0x00,0xfe,0x7d]
+v_cvt_u32_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cmpx_t_u64 vcc, s[100:101], v[0:1]
-// CHECK: [0x64,0x00,0xfe,0x7d]
+v_cvt_u32_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cmpx_t_u64 vcc, flat_scratch, v[0:1]
-// CHECK: [0x66,0x00,0xfe,0x7d]
+v_cvt_u32_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cmpx_t_u64 vcc, vcc, v[0:1]
-// CHECK: [0x6a,0x00,0xfe,0x7d]
+v_cvt_u32_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cmpx_t_u64 vcc, tba, v[0:1]
-// CHECK: [0x6c,0x00,0xfe,0x7d]
+v_cvt_u32_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cmpx_t_u64 vcc, tma, v[0:1]
-// CHECK: [0x6e,0x00,0xfe,0x7d]
+v_cvt_u32_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cmpx_t_u64 vcc, ttmp[10:11], v[0:1]
-// CHECK: [0x7a,0x00,0xfe,0x7d]
+v_cvt_u32_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cmpx_t_u64 vcc, exec, v[0:1]
-// CHECK: [0x7e,0x00,0xfe,0x7d]
+v_cvt_u32_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cmpx_t_u64 vcc, 0, v[0:1]
-// CHECK: [0x80,0x00,0xfe,0x7d]
+v_cvt_u32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cmpx_t_u64 vcc, -1, v[0:1]
-// CHECK: [0xc1,0x00,0xfe,0x7d]
+v_cvt_u32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cmpx_t_u64 vcc, 0.5, v[0:1]
-// CHECK: [0xf0,0x00,0xfe,0x7d]
+v_cvt_u32_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cmpx_t_u64 vcc, -4.0, v[0:1]
-// CHECK: [0xf7,0x00,0xfe,0x7d]
+v_cvt_u32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cmpx_t_u64 vcc, 0xaf123456, v[0:1]
-// CHECK: [0xff,0x00,0xfe,0x7d,0x56,0x34,0x12,0xaf]
+v_cvt_u32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cmpx_t_u64 vcc, 0x3f717273, v[0:1]
-// CHECK: [0xff,0x00,0xfe,0x7d,0x73,0x72,0x71,0x3f]
+v_cvt_u32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cmpx_t_u64 vcc, v[0:1], v[0:1]
-// CHECK: [0x00,0x01,0xfe,0x7d]
+v_cvt_u32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cmpx_t_u64 vcc, v[254:255], v[0:1]
-// CHECK: [0xfe,0x01,0xfe,0x7d]
+v_cvt_u32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cmpx_t_u64 vcc, s[0:1], v[254:255]
-// CHECK: [0x00,0xfc,0xff,0x7d]
+v_cvt_u32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cmpx_t_u64_e64 s[0:1], s[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xff,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_u32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cmpx_t_u64_e64 s[2:3], s[0:1], s[0:1]
-// CHECK: [0x02,0x00,0xff,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_u32_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_cmpx_t_u64_e64 s[100:101], s[0:1], s[0:1]
-// CHECK: [0x64,0x00,0xff,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_u32_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x0e,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_cmpx_t_u64_e64 flat_scratch, s[0:1], s[0:1]
-// CHECK: [0x66,0x00,0xff,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_u32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cmpx_t_u64_e64 vcc, s[0:1], s[0:1]
-// CHECK: [0x6a,0x00,0xff,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_u32_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cmpx_t_u64_e64 tba, s[0:1], s[0:1]
-// CHECK: [0x6c,0x00,0xff,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_u32_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cmpx_t_u64_e64 tma, s[0:1], s[0:1]
-// CHECK: [0x6e,0x00,0xff,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_u32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cmpx_t_u64_e64 ttmp[10:11], s[0:1], s[0:1]
-// CHECK: [0x7a,0x00,0xff,0xd0,0x00,0x00,0x00,0x00]
+v_cvt_u32_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], 0, s[0:1]
-// CHECK: [0x00,0x00,0xff,0xd0,0x80,0x00,0x00,0x00]
+v_cvt_u32_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], -1, s[0:1]
-// CHECK: [0x00,0x00,0xff,0xd0,0xc1,0x00,0x00,0x00]
+v_cvt_u32_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], 0.5, s[0:1]
-// CHECK: [0x00,0x00,0xff,0xd0,0xf0,0x00,0x00,0x00]
+v_cvt_u32_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], -4.0, s[0:1]
-// CHECK: [0x00,0x00,0xff,0xd0,0xf7,0x00,0x00,0x00]
+v_cvt_u32_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], v[0:1], s[0:1]
-// CHECK: [0x00,0x00,0xff,0xd0,0x00,0x01,0x00,0x00]
+v_cvt_u32_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], v[254:255], s[0:1]
-// CHECK: [0x00,0x00,0xff,0xd0,0xfe,0x01,0x00,0x00]
+v_cvt_u32_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], s[0:1], 0
-// CHECK: [0x00,0x00,0xff,0xd0,0x00,0x00,0x01,0x00]
+v_cvt_u32_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], s[0:1], -1
-// CHECK: [0x00,0x00,0xff,0xd0,0x00,0x82,0x01,0x00]
+v_cvt_u32_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], s[0:1], 0.5
-// CHECK: [0x00,0x00,0xff,0xd0,0x00,0xe0,0x01,0x00]
+v_cvt_u32_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], s[0:1], -4.0
-// CHECK: [0x00,0x00,0xff,0xd0,0x00,0xee,0x01,0x00]
+v_cvt_u32_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], s[0:1], v[0:1]
-// CHECK: [0x00,0x00,0xff,0xd0,0x00,0x00,0x02,0x00]
+v_cvt_u32_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cmpx_t_u64_e64 s[0:1], s[0:1], v[254:255]
-// CHECK: [0x00,0x00,0xff,0xd0,0x00,0xfc,0x03,0x00]
+v_cvt_u32_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_mov_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x02,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_u32_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_mov_b32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x02,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_cvt_u32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_mov_b32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x02,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_cvt_u32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_mov_b32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x02,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_u32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_mov_b32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x02,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_cvt_u32_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_mov_b32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x02,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_cvt_u32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_mov_b32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x02,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_cvt_u32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_mov_b32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x02,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_cvt_u32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_mov_b32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x02,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_cvt_u32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_mov_b32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x02,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_cvt_u32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_mov_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x02,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_cvt_u32_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_mov_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x02,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_u32_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x0e,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_mov_b32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x02,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_mov_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x02,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_i32_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x10,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_mov_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x02,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_cvt_i32_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x10,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_mov_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x02,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_cvt_i32_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_mov_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x02,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_cvt_i32_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_mov_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x02,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_cvt_i32_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_mov_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x02,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_cvt_i32_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_mov_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x02,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_cvt_i32_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_mov_b32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x02,0x00,0x7e,0x00,0x06,0x0e,0x06]
+v_cvt_i32_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_cvt_i32_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_mov_b32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x02,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_cvt_i32_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_mov_b32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_cvt_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_mov_b32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_cvt_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_mov_b32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_cvt_i32_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_mov_b32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_cvt_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_mov_b32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_cvt_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_mov_b32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_cvt_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_mov_b32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_cvt_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_mov_b32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_cvt_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_mov_b32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_cvt_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_mov_b32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_cvt_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_mov_b32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_cvt_i32_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_mov_b32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_cvt_i32_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x10,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_mov_b32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_cvt_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_mov_b32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_cvt_i32_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_mov_b32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_cvt_i32_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_mov_b32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_cvt_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_cvt_i32_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_cvt_i32_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_i32_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_i32_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_cvt_i32_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_cvt_i32_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_i32_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_i32_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x02,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_cvt_i32_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cvt_f32_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_i32_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cvt_f32_i32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0a,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_cvt_i32_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cvt_f32_i32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0a,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_cvt_i32_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cvt_f32_i32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_cvt_i32_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cvt_f32_i32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_i32_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cvt_f32_i32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_cvt_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cvt_f32_i32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_cvt_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cvt_f32_i32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_cvt_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f32_i32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_cvt_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f32_i32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_cvt_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cvt_f32_i32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_cvt_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cvt_f32_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_cvt_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f32_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f32_i32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cvt_f32_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_i32_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_cvt_f32_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_cvt_i32_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x10,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_cvt_f32_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_cvt_f16_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_cvt_f16_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x14,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_cvt_f16_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x14,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cvt_f32_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_cvt_f16_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cvt_f32_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_cvt_f16_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0a,0x00,0x7e,0x00,0x06,0x0e,0x06]
+v_cvt_f16_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cvt_f32_i32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_cvt_f16_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cvt_f32_i32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_cvt_f16_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cvt_f32_i32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_cvt_f16_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cvt_f32_i32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cvt_f32_i32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_cvt_f16_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cvt_f32_i32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_cvt_f16_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cvt_f32_i32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_cvt_f16_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f32_i32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_cvt_f16_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f32_i32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_cvt_f16_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_i32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_cvt_f16_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cvt_f32_i32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_cvt_f16_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cvt_f32_i32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_cvt_f16_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cvt_f32_i32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_cvt_f16_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cvt_f32_i32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_cvt_f16_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cvt_f32_i32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_cvt_f16_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cvt_f32_i32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_cvt_f16_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_cvt_f32_i32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_cvt_f16_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x14,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_cvt_f32_i32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_cvt_f16_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cvt_f32_i32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_cvt_f16_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cvt_f32_i32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_cvt_f16_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cvt_f32_i32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_f16_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cvt_f32_i32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_f16_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cvt_f32_i32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_cvt_f16_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cvt_f32_i32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_cvt_f16_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cvt_f32_i32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_f16_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cvt_f32_i32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_f16_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cvt_f32_i32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x0a,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_cvt_f16_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f16_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cvt_f32_u32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0c,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_cvt_f16_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cvt_f32_u32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0c,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_cvt_f16_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cvt_f32_u32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_cvt_f16_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cvt_f32_u32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f16_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cvt_f32_u32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_cvt_f16_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cvt_f32_u32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_cvt_f16_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cvt_f32_u32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_cvt_f16_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cvt_f32_u32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_cvt_f16_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cvt_f32_u32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_cvt_f16_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cvt_f32_u32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_cvt_f16_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_cvt_f16_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_f16_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_f16_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f16_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_cvt_f16_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_cvt_f16_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_cvt_f16_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_cvt_f16_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x14,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_cvt_f32_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_cvt_f32_f16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x16,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_u32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0c,0x00,0x7e,0x00,0x06,0x0e,0x06]
+v_cvt_f32_f16_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x16,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cvt_f32_u32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_cvt_f32_f16_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cvt_f32_u32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_cvt_f32_f16_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_u32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_cvt_f32_f16_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cvt_f32_u32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_cvt_f32_f16_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cvt_f32_u32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_cvt_f32_f16_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cvt_f32_u32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_cvt_f32_f16_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cvt_f32_u32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_cvt_f32_f16_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cvt_f32_u32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_cvt_f32_f16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cvt_f32_u32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_cvt_f32_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cvt_f32_u32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_cvt_f32_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f32_u32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_cvt_f32_f16_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f32_u32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_cvt_f32_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_u32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_cvt_f32_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cvt_f32_u32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_cvt_f32_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cvt_f32_u32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_cvt_f32_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cvt_f32_u32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_cvt_f32_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cvt_f32_u32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_cvt_f32_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cvt_f32_u32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_cvt_f32_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cvt_f32_u32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_cvt_f32_f16_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_cvt_f32_u32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_cvt_f32_f16_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x16,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_cvt_f32_u32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_f32_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cvt_f32_u32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_f32_f16_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cvt_f32_u32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_cvt_f32_f16_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cvt_f32_u32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_cvt_f32_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cvt_f32_u32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_f32_f16_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cvt_f32_u32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_f32_f16_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cvt_f32_u32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x0c,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_cvt_f32_f16_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cvt_u32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f32_f16_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cvt_u32_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0e,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_cvt_f32_f16_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cvt_u32_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0e,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_cvt_f32_f16_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cvt_u32_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_cvt_f32_f16_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cvt_u32_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f32_f16_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cvt_u32_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_cvt_f32_f16_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cvt_u32_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_cvt_f32_f16_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cvt_u32_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_cvt_f32_f16_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cvt_u32_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_cvt_f32_f16_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cvt_u32_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_cvt_f32_f16_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cvt_u32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_cvt_f32_f16_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cvt_u32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_cvt_f32_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cvt_u32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_f32_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cvt_u32_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_f32_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_u32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f32_f16_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_u32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_cvt_f32_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cvt_u32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_cvt_f32_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cvt_u32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_cvt_f32_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_u32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_cvt_f32_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_u32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_cvt_f32_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cvt_u32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_cvt_f32_f16_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_cvt_u32_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_cvt_f32_f16_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x16,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_cvt_u32_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x0e,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_cvt_rpi_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_cvt_rpi_i32_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x18,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cvt_u32_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_cvt_rpi_i32_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x18,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cvt_u32_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_cvt_rpi_i32_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_cvt_rpi_i32_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_cvt_rpi_i32_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_cvt_rpi_i32_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_cvt_rpi_i32_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_cvt_rpi_i32_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_cvt_rpi_i32_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_cvt_rpi_i32_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_cvt_rpi_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_cvt_rpi_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_cvt_rpi_i32_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_cvt_rpi_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_cvt_rpi_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_cvt_rpi_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_cvt_rpi_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_cvt_rpi_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_cvt_rpi_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_cvt_rpi_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_rpi_i32_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_rpi_i32_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x18,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_cvt_u32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_cvt_rpi_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cvt_u32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_cvt_rpi_i32_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cvt_u32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_rpi_i32_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cvt_u32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_rpi_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cvt_u32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_cvt_rpi_i32_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cvt_u32_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_cvt_rpi_i32_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cvt_u32_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x0e,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_cvt_rpi_i32_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cvt_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cvt_i32_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x10,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cvt_i32_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x10,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cvt_i32_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cvt_i32_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cvt_i32_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cvt_i32_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cvt_i32_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cvt_i32_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cvt_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cvt_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cvt_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cvt_i32_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cvt_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cvt_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_cvt_rpi_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cvt_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_cvt_rpi_i32_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_cvt_i32_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_cvt_rpi_i32_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x18,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_cvt_i32_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x10,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_cvt_flr_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_cvt_flr_i32_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1a,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cvt_i32_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_cvt_flr_i32_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cvt_i32_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_cvt_flr_i32_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_cvt_flr_i32_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_cvt_flr_i32_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_cvt_flr_i32_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_cvt_flr_i32_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_cvt_flr_i32_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_cvt_flr_i32_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_cvt_flr_i32_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_cvt_flr_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_cvt_flr_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_cvt_flr_i32_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_cvt_flr_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_cvt_flr_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_cvt_flr_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_cvt_flr_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_cvt_flr_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_cvt_flr_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_cvt_flr_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_flr_i32_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_flr_i32_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1a,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_cvt_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_cvt_flr_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cvt_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_cvt_flr_i32_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cvt_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_flr_i32_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cvt_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_flr_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cvt_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_cvt_flr_i32_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cvt_i32_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_cvt_flr_i32_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cvt_i32_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x10,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_cvt_flr_i32_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cvt_f16_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cvt_f16_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x14,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cvt_f16_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x14,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cvt_f16_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cvt_f16_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cvt_f16_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cvt_f16_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cvt_f16_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cvt_f16_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cvt_f16_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cvt_f16_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cvt_f16_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cvt_f16_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f16_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f16_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cvt_f16_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cvt_f16_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f16_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f16_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_cvt_flr_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cvt_f16_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_cvt_flr_i32_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_cvt_f16_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_cvt_flr_i32_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1a,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_cvt_f16_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x14,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_cvt_off_f32_i4_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_cvt_off_f32_i4_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1c,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cvt_f16_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_cvt_off_f32_i4_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cvt_f16_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_cvt_off_f32_i4_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_cvt_off_f32_i4_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_cvt_off_f32_i4_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_cvt_off_f32_i4_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_cvt_off_f32_i4_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_cvt_off_f32_i4_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_cvt_off_f32_i4_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_cvt_off_f32_i4_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_cvt_off_f32_i4_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_cvt_off_f32_i4_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_cvt_off_f32_i4_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_cvt_off_f32_i4_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_cvt_off_f32_i4_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_cvt_off_f32_i4_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_cvt_off_f32_i4_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_cvt_off_f32_i4_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_cvt_off_f32_i4_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_cvt_off_f32_i4_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_off_f32_i4_sdwa v5, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x1c,0x0a,0x7e,0x01,0x06,0x0e,0x06]
 
-v_cvt_f16_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_off_f32_i4_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cvt_f16_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_cvt_off_f32_i4_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cvt_f16_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_cvt_off_f32_i4_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cvt_f16_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_off_f32_i4_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cvt_f16_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_off_f32_i4_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cvt_f16_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_cvt_off_f32_i4_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cvt_f16_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_cvt_off_f32_i4_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cvt_f16_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x14,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_cvt_off_f32_i4_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x16,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_off_f32_i4_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cvt_f32_f16_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x16,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_cvt_off_f32_i4_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cvt_f32_f16_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x16,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_cvt_off_f32_i4_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cvt_f32_f16_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x16,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_cvt_off_f32_i4_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cvt_f32_f16_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x16,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_off_f32_i4_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cvt_f32_f16_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x16,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_cvt_off_f32_i4_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cvt_f32_f16_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x16,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_cvt_off_f32_i4_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cvt_f32_f16_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x16,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_cvt_off_f32_i4_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cvt_f32_f16_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x16,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_cvt_off_f32_i4_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cvt_f32_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x16,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_cvt_off_f32_i4_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cvt_f32_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x16,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_cvt_off_f32_i4_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x16,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_cvt_off_f32_i4_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x16,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_off_f32_i4_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x16,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_off_f32_i4_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x16,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_off_f32_i4_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x16,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_cvt_off_f32_i4_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x16,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_cvt_off_f32_i4_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x16,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_cvt_off_f32_i4_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x16,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_cvt_off_f32_i4_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x1c,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x16,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_cvt_f32_ubyte0_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x16,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_cvt_f32_ubyte0_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x22,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_cvt_f32_ubyte0_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x22,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cvt_f32_f16_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x16,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_cvt_f32_ubyte0_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cvt_f32_f16_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_cvt_f32_ubyte0_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_f16_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_cvt_f32_ubyte0_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cvt_f32_f16_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_cvt_f32_ubyte0_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cvt_f32_f16_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_cvt_f32_ubyte0_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cvt_f32_f16_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_cvt_f32_ubyte0_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cvt_f32_f16_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_cvt_f32_ubyte0_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cvt_f32_f16_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_cvt_f32_ubyte0_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cvt_f32_f16_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_cvt_f32_ubyte0_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cvt_f32_f16_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_cvt_f32_ubyte0_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f32_f16_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_cvt_f32_ubyte0_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f32_f16_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_cvt_f32_ubyte0_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_f16_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_cvt_f32_ubyte0_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cvt_f32_f16_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_cvt_f32_ubyte0_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cvt_f32_f16_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_cvt_f32_ubyte0_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cvt_f32_f16_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_cvt_f32_ubyte0_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cvt_f32_f16_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_cvt_f32_ubyte0_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cvt_f32_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_cvt_f32_ubyte0_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cvt_f32_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_cvt_f32_ubyte0_sdwa v5, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x22,0x0a,0x7e,0x01,0x06,0x0e,0x06]
 
-v_cvt_f32_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_f32_ubyte0_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cvt_f32_f16_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_f32_ubyte0_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x22,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cvt_f32_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_cvt_f32_ubyte0_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cvt_f32_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_cvt_f32_ubyte0_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cvt_f32_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_f32_ubyte0_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cvt_f32_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_f32_ubyte0_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cvt_f32_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x16,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_cvt_f32_ubyte0_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cvt_rpi_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cvt_rpi_i32_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x18,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cvt_rpi_i32_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x18,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cvt_rpi_i32_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cvt_rpi_i32_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cvt_rpi_i32_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cvt_rpi_i32_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cvt_rpi_i32_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cvt_rpi_i32_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cvt_rpi_i32_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cvt_rpi_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cvt_rpi_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cvt_rpi_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cvt_rpi_i32_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_rpi_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_rpi_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cvt_rpi_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cvt_rpi_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_rpi_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_rpi_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_cvt_f32_ubyte0_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x22,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cvt_rpi_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_cvt_f32_ubyte1_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_rpi_i32_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_cvt_f32_ubyte1_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x24,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cvt_rpi_i32_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x18,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_cvt_f32_ubyte1_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x24,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_cvt_f32_ubyte1_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cvt_rpi_i32_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_cvt_f32_ubyte1_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_rpi_i32_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_cvt_f32_ubyte1_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_cvt_f32_ubyte1_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_cvt_f32_ubyte1_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_cvt_f32_ubyte1_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_cvt_f32_ubyte1_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_cvt_f32_ubyte1_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_cvt_f32_ubyte1_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_cvt_f32_ubyte1_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_cvt_f32_ubyte1_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_cvt_f32_ubyte1_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_cvt_f32_ubyte1_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_cvt_f32_ubyte1_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_cvt_f32_ubyte1_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_cvt_f32_ubyte1_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_cvt_f32_ubyte1_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_cvt_f32_ubyte1_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_cvt_f32_ubyte1_sdwa v5, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x24,0x0a,0x7e,0x01,0x06,0x0e,0x06]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_cvt_f32_ubyte1_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_f32_ubyte1_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x24,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_f32_ubyte1_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_cvt_f32_ubyte1_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_cvt_f32_ubyte1_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_f32_ubyte1_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_f32_ubyte1_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cvt_rpi_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_cvt_f32_ubyte1_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cvt_rpi_i32_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_cvt_f32_ubyte1_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cvt_rpi_i32_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x18,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_cvt_f32_ubyte1_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cvt_flr_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f32_ubyte1_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cvt_flr_i32_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1a,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_cvt_f32_ubyte1_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cvt_flr_i32_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1a,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_cvt_f32_ubyte1_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cvt_flr_i32_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_cvt_f32_ubyte1_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cvt_flr_i32_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f32_ubyte1_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cvt_flr_i32_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_cvt_f32_ubyte1_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cvt_flr_i32_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_cvt_f32_ubyte1_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cvt_flr_i32_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_cvt_f32_ubyte1_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cvt_flr_i32_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_cvt_f32_ubyte1_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cvt_flr_i32_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_cvt_f32_ubyte1_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cvt_flr_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_cvt_f32_ubyte1_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_flr_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_cvt_f32_ubyte1_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_flr_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_f32_ubyte1_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cvt_flr_i32_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_f32_ubyte1_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cvt_flr_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f32_ubyte1_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_flr_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_cvt_f32_ubyte1_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_flr_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_cvt_f32_ubyte1_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x24,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cvt_flr_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_cvt_f32_ubyte2_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_flr_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_cvt_f32_ubyte2_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x26,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cvt_flr_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_cvt_f32_ubyte2_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x26,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cvt_flr_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_cvt_f32_ubyte2_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cvt_flr_i32_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_cvt_f32_ubyte2_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_flr_i32_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1a,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_cvt_f32_ubyte2_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cvt_flr_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_cvt_f32_ubyte2_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cvt_flr_i32_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_cvt_f32_ubyte2_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cvt_flr_i32_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_cvt_f32_ubyte2_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cvt_flr_i32_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_cvt_f32_ubyte2_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cvt_flr_i32_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_cvt_f32_ubyte2_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cvt_flr_i32_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_cvt_f32_ubyte2_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cvt_flr_i32_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_cvt_f32_ubyte2_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_flr_i32_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_cvt_f32_ubyte2_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_flr_i32_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_cvt_f32_ubyte2_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_flr_i32_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_cvt_f32_ubyte2_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cvt_flr_i32_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_cvt_f32_ubyte2_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cvt_flr_i32_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_cvt_f32_ubyte2_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cvt_flr_i32_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_cvt_f32_ubyte2_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cvt_flr_i32_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_cvt_f32_ubyte2_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cvt_flr_i32_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_cvt_f32_ubyte2_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cvt_flr_i32_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_cvt_f32_ubyte2_sdwa v5, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x26,0x0a,0x7e,0x01,0x06,0x0e,0x06]
 
-v_cvt_flr_i32_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_cvt_f32_ubyte2_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cvt_flr_i32_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_cvt_f32_ubyte2_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x26,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cvt_flr_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_cvt_f32_ubyte2_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cvt_flr_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_cvt_f32_ubyte2_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cvt_flr_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_f32_ubyte2_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cvt_flr_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_f32_ubyte2_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cvt_flr_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_cvt_f32_ubyte2_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cvt_flr_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_cvt_f32_ubyte2_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cvt_flr_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_f32_ubyte2_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cvt_flr_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_f32_ubyte2_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cvt_flr_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_cvt_f32_ubyte2_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cvt_flr_i32_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_cvt_f32_ubyte2_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cvt_flr_i32_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1a,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_cvt_f32_ubyte2_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cvt_off_f32_i4_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f32_ubyte2_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cvt_off_f32_i4_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1c,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_cvt_f32_ubyte2_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cvt_off_f32_i4_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1c,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_cvt_f32_ubyte2_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cvt_off_f32_i4_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_cvt_f32_ubyte2_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cvt_off_f32_i4_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f32_ubyte2_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cvt_off_f32_i4_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_cvt_f32_ubyte2_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cvt_off_f32_i4_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_cvt_f32_ubyte2_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cvt_off_f32_i4_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_cvt_f32_ubyte2_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_off_f32_i4_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_cvt_f32_ubyte2_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_off_f32_i4_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_cvt_f32_ubyte2_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cvt_off_f32_i4_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_cvt_f32_ubyte2_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cvt_off_f32_i4_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_cvt_f32_ubyte2_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_off_f32_i4_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_f32_ubyte2_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_off_f32_i4_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_f32_ubyte2_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x26,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cvt_off_f32_i4_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f32_ubyte3_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_off_f32_i4_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_cvt_f32_ubyte3_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x28,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cvt_off_f32_i4_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_cvt_f32_ubyte3_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x28,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cvt_off_f32_i4_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_cvt_f32_ubyte3_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cvt_off_f32_i4_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_cvt_f32_ubyte3_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_off_f32_i4_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_cvt_f32_ubyte3_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cvt_off_f32_i4_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_cvt_f32_ubyte3_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cvt_off_f32_i4_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x1c,0x00,0x7e,0x00,0x06,0x0e,0x06]
+v_cvt_f32_ubyte3_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cvt_off_f32_i4_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_cvt_f32_ubyte3_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cvt_off_f32_i4_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_cvt_f32_ubyte3_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cvt_off_f32_i4_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_cvt_f32_ubyte3_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cvt_off_f32_i4_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_cvt_f32_ubyte3_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cvt_off_f32_i4_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_cvt_f32_ubyte3_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_off_f32_i4_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_cvt_f32_ubyte3_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_off_f32_i4_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_cvt_f32_ubyte3_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_off_f32_i4_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_cvt_f32_ubyte3_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cvt_off_f32_i4_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_cvt_f32_ubyte3_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cvt_off_f32_i4_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_cvt_f32_ubyte3_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cvt_off_f32_i4_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_cvt_f32_ubyte3_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cvt_off_f32_i4_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_cvt_f32_ubyte3_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cvt_off_f32_i4_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_cvt_f32_ubyte3_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cvt_off_f32_i4_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_cvt_f32_ubyte3_sdwa v5, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x28,0x0a,0x7e,0x01,0x06,0x0e,0x06]
 
-v_cvt_off_f32_i4_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_cvt_f32_ubyte3_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cvt_off_f32_i4_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_cvt_f32_ubyte3_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x28,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cvt_off_f32_i4_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_cvt_f32_ubyte3_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cvt_off_f32_i4_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_cvt_f32_ubyte3_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cvt_off_f32_i4_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_cvt_f32_ubyte3_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cvt_off_f32_i4_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_cvt_f32_ubyte3_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cvt_off_f32_i4_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_f32_ubyte3_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cvt_off_f32_i4_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_f32_ubyte3_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cvt_off_f32_i4_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_cvt_f32_ubyte3_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cvt_off_f32_i4_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_cvt_f32_ubyte3_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cvt_off_f32_i4_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_f32_ubyte3_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cvt_off_f32_i4_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_f32_ubyte3_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cvt_off_f32_i4_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x1c,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_cvt_f32_ubyte3_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f32_ubyte3_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cvt_f32_ubyte0_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x22,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_cvt_f32_ubyte3_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cvt_f32_ubyte0_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x22,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_cvt_f32_ubyte3_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cvt_f32_ubyte0_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_cvt_f32_ubyte3_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cvt_f32_ubyte0_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f32_ubyte3_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_cvt_f32_ubyte3_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_cvt_f32_ubyte3_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_cvt_f32_ubyte3_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_cvt_f32_ubyte3_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_cvt_f32_ubyte3_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_cvt_f32_ubyte3_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_cvt_f32_ubyte3_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_f32_ubyte3_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_f32_ubyte3_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x28,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_fract_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_fract_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x36,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_fract_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x36,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_fract_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_fract_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_fract_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cvt_f32_ubyte0_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_fract_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cvt_f32_ubyte0_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x22,0x00,0x7e,0x00,0x06,0x0e,0x06]
+v_fract_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cvt_f32_ubyte0_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_fract_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cvt_f32_ubyte0_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x22,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_fract_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cvt_f32_ubyte0_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_fract_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cvt_f32_ubyte0_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_fract_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cvt_f32_ubyte0_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_fract_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f32_ubyte0_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_fract_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f32_ubyte0_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_fract_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte0_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_fract_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cvt_f32_ubyte0_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_fract_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cvt_f32_ubyte0_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_fract_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cvt_f32_ubyte0_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_fract_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cvt_f32_ubyte0_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_fract_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cvt_f32_ubyte0_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_fract_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cvt_f32_ubyte0_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_fract_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_cvt_f32_ubyte0_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_fract_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x36,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_cvt_f32_ubyte0_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_fract_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cvt_f32_ubyte0_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_fract_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cvt_f32_ubyte0_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_fract_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cvt_f32_ubyte0_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_fract_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cvt_f32_ubyte0_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_fract_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cvt_f32_ubyte0_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_fract_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cvt_f32_ubyte0_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_fract_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cvt_f32_ubyte0_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_fract_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cvt_f32_ubyte0_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_fract_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cvt_f32_ubyte0_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_fract_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cvt_f32_ubyte0_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_fract_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cvt_f32_ubyte0_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x22,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_fract_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cvt_f32_ubyte1_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_fract_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cvt_f32_ubyte1_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x24,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_fract_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cvt_f32_ubyte1_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x24,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_fract_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cvt_f32_ubyte1_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_fract_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cvt_f32_ubyte1_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_fract_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cvt_f32_ubyte1_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_fract_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cvt_f32_ubyte1_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_fract_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cvt_f32_ubyte1_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_fract_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cvt_f32_ubyte1_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_fract_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f32_ubyte1_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_fract_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f32_ubyte1_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_fract_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cvt_f32_ubyte1_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_fract_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cvt_f32_ubyte1_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_fract_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f32_ubyte1_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_fract_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f32_ubyte1_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_fract_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cvt_f32_ubyte1_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_fract_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_cvt_f32_ubyte1_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_fract_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x36,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_cvt_f32_ubyte1_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_trunc_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte1_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_trunc_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x38,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte1_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_trunc_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x38,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte1_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_trunc_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cvt_f32_ubyte1_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x24,0x00,0x7e,0x00,0x06,0x0e,0x06]
+v_trunc_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte1_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_trunc_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cvt_f32_ubyte1_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x24,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_trunc_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cvt_f32_ubyte1_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_trunc_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cvt_f32_ubyte1_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_trunc_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cvt_f32_ubyte1_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_trunc_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cvt_f32_ubyte1_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_trunc_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cvt_f32_ubyte1_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_trunc_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cvt_f32_ubyte1_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_trunc_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f32_ubyte1_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_trunc_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f32_ubyte1_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_trunc_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte1_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_trunc_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cvt_f32_ubyte1_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_trunc_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cvt_f32_ubyte1_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_trunc_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cvt_f32_ubyte1_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_trunc_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cvt_f32_ubyte1_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_trunc_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cvt_f32_ubyte1_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_trunc_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cvt_f32_ubyte1_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_trunc_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_cvt_f32_ubyte1_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_trunc_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x38,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_cvt_f32_ubyte1_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_trunc_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cvt_f32_ubyte1_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_trunc_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cvt_f32_ubyte1_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_trunc_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cvt_f32_ubyte1_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_trunc_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cvt_f32_ubyte1_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_trunc_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cvt_f32_ubyte1_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_trunc_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cvt_f32_ubyte1_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_trunc_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cvt_f32_ubyte1_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_trunc_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cvt_f32_ubyte1_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x24,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_trunc_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cvt_f32_ubyte2_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_trunc_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cvt_f32_ubyte2_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x26,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_trunc_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cvt_f32_ubyte2_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x26,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_trunc_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cvt_f32_ubyte2_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_trunc_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cvt_f32_ubyte2_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_trunc_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cvt_f32_ubyte2_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_trunc_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cvt_f32_ubyte2_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_trunc_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cvt_f32_ubyte2_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_trunc_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cvt_f32_ubyte2_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_trunc_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cvt_f32_ubyte2_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_trunc_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cvt_f32_ubyte2_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_trunc_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cvt_f32_ubyte2_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_trunc_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f32_ubyte2_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_trunc_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f32_ubyte2_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_trunc_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cvt_f32_ubyte2_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_trunc_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cvt_f32_ubyte2_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_trunc_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f32_ubyte2_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_trunc_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f32_ubyte2_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_trunc_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cvt_f32_ubyte2_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_trunc_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_cvt_f32_ubyte2_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_trunc_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x38,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_cvt_f32_ubyte2_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_ceil_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte2_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x26,0x00,0x7e,0x00,0x06,0x0e,0x06]
+v_ceil_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3a,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_ceil_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte2_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x26,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_ceil_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_ceil_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_ceil_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_ceil_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_ceil_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_ceil_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_ceil_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_ceil_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_ceil_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_ceil_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_ceil_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_ceil_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_ceil_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_ceil_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_ceil_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_ceil_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_ceil_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_ceil_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_ceil_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_ceil_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3a,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_cvt_f32_ubyte2_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_ceil_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cvt_f32_ubyte2_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_ceil_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cvt_f32_ubyte2_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_ceil_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cvt_f32_ubyte2_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_ceil_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cvt_f32_ubyte2_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_ceil_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cvt_f32_ubyte2_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x26,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_ceil_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cvt_f32_ubyte3_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_ceil_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cvt_f32_ubyte3_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x28,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_ceil_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cvt_f32_ubyte3_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x28,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_ceil_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cvt_f32_ubyte3_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_ceil_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cvt_f32_ubyte3_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_ceil_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cvt_f32_ubyte3_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_ceil_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cvt_f32_ubyte3_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_ceil_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cvt_f32_ubyte3_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_ceil_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cvt_f32_ubyte3_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_ceil_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cvt_f32_ubyte3_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_ceil_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cvt_f32_ubyte3_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_ceil_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cvt_f32_ubyte3_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_ceil_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cvt_f32_ubyte3_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_ceil_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cvt_f32_ubyte3_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_ceil_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cvt_f32_ubyte3_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_ceil_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f32_ubyte3_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_ceil_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f32_ubyte3_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_ceil_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cvt_f32_ubyte3_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_ceil_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cvt_f32_ubyte3_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_ceil_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f32_ubyte3_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_ceil_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f32_ubyte3_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_ceil_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cvt_f32_ubyte3_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x28,0x00,0x7e,0x00,0x06,0x0e,0x06]
+v_ceil_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_cvt_f32_ubyte3_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_ceil_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3a,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_cvt_f32_ubyte3_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x28,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_rndne_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_rndne_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3c,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_rndne_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_rndne_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_rndne_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_rndne_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_rndne_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_rndne_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_rndne_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_rndne_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_rndne_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_rndne_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_rndne_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_rndne_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_rndne_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_rndne_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_rndne_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_rndne_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_rndne_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_rndne_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_rndne_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_rndne_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_rndne_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3c,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_cvt_f32_ubyte3_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_rndne_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cvt_f32_ubyte3_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_rndne_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cvt_f32_ubyte3_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x28,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_rndne_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_fract_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rndne_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_fract_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x36,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_rndne_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_fract_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x36,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_rndne_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_fract_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_rndne_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_fract_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rndne_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_fract_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_rndne_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_fract_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_rndne_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_fract_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_rndne_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_fract_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_rndne_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_fract_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_rndne_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_fract_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_rndne_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_fract_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_rndne_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_fract_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_rndne_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_fract_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_rndne_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_fract_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rndne_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_fract_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_rndne_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_fract_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_rndne_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_fract_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_rndne_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_fract_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_rndne_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_fract_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_rndne_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_fract_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_rndne_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_fract_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_rndne_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_fract_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x36,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_rndne_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_fract_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_rndne_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_fract_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_rndne_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_fract_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_rndne_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3c,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_fract_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_floor_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_fract_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_floor_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3e,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_fract_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_floor_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_fract_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_floor_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_fract_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_floor_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_fract_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_floor_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_fract_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_floor_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_fract_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_floor_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_fract_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_floor_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_fract_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_floor_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_fract_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_floor_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_fract_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_floor_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_fract_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_floor_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_fract_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_floor_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_fract_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_floor_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_fract_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_floor_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_fract_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_floor_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_fract_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_floor_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_fract_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_floor_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_fract_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_floor_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_fract_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_floor_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_fract_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_floor_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_fract_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_floor_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x3e,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_fract_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_floor_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_fract_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_floor_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_fract_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x36,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_floor_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_trunc_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_floor_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_trunc_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x38,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_floor_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_trunc_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x38,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_floor_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_trunc_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_floor_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_trunc_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_floor_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_trunc_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_floor_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_trunc_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_floor_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_trunc_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_floor_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_trunc_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_floor_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_trunc_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_floor_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_trunc_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_floor_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_trunc_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_floor_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_trunc_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_floor_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_trunc_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_floor_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_trunc_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_floor_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_trunc_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_floor_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_trunc_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_floor_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_trunc_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_floor_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_trunc_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_floor_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_trunc_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_floor_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_trunc_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_floor_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_trunc_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_floor_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_trunc_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x38,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_floor_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_trunc_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_floor_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_trunc_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_floor_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_trunc_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_floor_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x3e,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_trunc_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_exp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_trunc_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_exp_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x40,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_trunc_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_exp_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x40,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_trunc_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_exp_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_trunc_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_exp_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_trunc_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_exp_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_trunc_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_exp_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_trunc_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_exp_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_trunc_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_exp_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_trunc_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_exp_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_trunc_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_exp_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_trunc_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_exp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_trunc_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_exp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_trunc_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_exp_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_trunc_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_exp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_trunc_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_exp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_trunc_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_exp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_trunc_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_exp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_trunc_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_exp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_trunc_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_exp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_trunc_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_exp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_trunc_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_exp_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_trunc_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_exp_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x40,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_trunc_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_exp_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_trunc_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_exp_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_trunc_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x38,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_exp_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_ceil_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_exp_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_ceil_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3a,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_exp_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_ceil_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3a,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_exp_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_ceil_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_exp_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_ceil_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_exp_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_ceil_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_exp_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_ceil_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_exp_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_ceil_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_exp_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_ceil_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_exp_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_ceil_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_exp_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_ceil_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_exp_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_ceil_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_exp_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_ceil_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_exp_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_ceil_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_exp_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_ceil_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_exp_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_ceil_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_exp_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_ceil_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_exp_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_ceil_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_exp_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_ceil_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_exp_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_ceil_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_exp_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_ceil_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_exp_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_ceil_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_exp_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_ceil_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3a,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_exp_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_ceil_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_exp_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_ceil_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_exp_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_ceil_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_exp_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x40,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_ceil_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_log_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_ceil_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_log_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x42,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_ceil_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_log_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x42,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_ceil_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_log_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_ceil_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_log_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_ceil_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_log_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_ceil_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_log_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_ceil_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_log_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_ceil_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_log_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_ceil_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_log_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_ceil_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_log_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_ceil_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_log_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_ceil_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_log_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_ceil_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_log_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_ceil_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_log_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_ceil_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_log_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_ceil_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_log_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_ceil_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_log_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_ceil_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_log_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_ceil_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_log_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_ceil_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_log_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_ceil_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_log_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_ceil_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_log_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x42,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_ceil_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_log_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_ceil_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_log_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_ceil_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3a,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_log_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_rndne_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_log_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_rndne_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3c,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_log_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_rndne_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3c,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_log_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_rndne_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_log_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_rndne_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_log_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_rndne_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_log_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_rndne_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_log_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_rndne_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_log_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_rndne_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_log_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_rndne_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_log_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_rndne_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_log_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_rndne_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_log_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_rndne_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_log_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_rndne_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_log_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_rndne_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_log_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_rndne_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_log_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_rndne_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_log_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_rndne_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_log_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_rndne_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_log_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_rndne_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_log_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_rndne_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_log_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_rndne_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_log_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_rndne_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3c,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_log_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_rndne_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_log_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_rndne_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_log_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_rndne_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_log_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x42,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_rndne_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_rcp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rndne_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_rcp_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x44,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_rndne_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_rcp_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x44,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_rndne_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_rcp_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_rndne_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_rcp_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rndne_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_rcp_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_rndne_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_rcp_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_rndne_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_rcp_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_rndne_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_rcp_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_rndne_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_rcp_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_rndne_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_rcp_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_rndne_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_rcp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_rndne_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_rcp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_rndne_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_rcp_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_rndne_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_rcp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rndne_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_rcp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_rndne_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_rcp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_rndne_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_rcp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_rndne_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_rcp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_rndne_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_rcp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_rndne_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_rcp_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_rndne_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_rcp_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_rndne_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_rcp_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x44,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_rndne_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_rcp_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_rndne_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_rcp_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_rndne_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3c,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_rcp_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_floor_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rcp_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_floor_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3e,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_rcp_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_floor_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3e,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_rcp_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_floor_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_rcp_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_floor_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rcp_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_floor_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_rcp_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_floor_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_rcp_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_floor_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_rcp_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_floor_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_rcp_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_floor_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_rcp_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_floor_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_rcp_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_floor_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_rcp_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_floor_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_rcp_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_floor_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_rcp_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_floor_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rcp_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_floor_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_rcp_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_floor_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_rcp_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_floor_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_rcp_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_floor_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_rcp_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_floor_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_rcp_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_floor_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_rcp_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_floor_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_rcp_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_floor_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x3e,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_rcp_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_floor_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_rcp_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_floor_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_rcp_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_floor_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_rcp_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x44,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_floor_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_rcp_iflag_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_floor_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_rcp_iflag_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x46,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_floor_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_rcp_iflag_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x46,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_floor_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_rcp_iflag_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_floor_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_rcp_iflag_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_floor_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_rcp_iflag_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_floor_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_rcp_iflag_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_floor_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_rcp_iflag_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_floor_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_rcp_iflag_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_floor_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_rcp_iflag_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_floor_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_rcp_iflag_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_floor_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_rcp_iflag_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_floor_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_rcp_iflag_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_floor_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_rcp_iflag_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_floor_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_rcp_iflag_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_floor_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_rcp_iflag_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_floor_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_rcp_iflag_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_floor_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_rcp_iflag_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_floor_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_rcp_iflag_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_floor_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_rcp_iflag_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_floor_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_rcp_iflag_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_floor_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_rcp_iflag_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_floor_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_rcp_iflag_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x46,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_floor_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_rcp_iflag_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_floor_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_rcp_iflag_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_floor_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x3e,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_rcp_iflag_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_exp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rcp_iflag_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_exp_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x40,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_rcp_iflag_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_exp_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x40,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_rcp_iflag_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_exp_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_rcp_iflag_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_exp_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rcp_iflag_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_exp_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_rcp_iflag_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_exp_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_rcp_iflag_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_exp_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_rcp_iflag_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_exp_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_rcp_iflag_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_exp_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_rcp_iflag_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_exp_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_rcp_iflag_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_exp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_rcp_iflag_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_exp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_rcp_iflag_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_exp_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_rcp_iflag_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_exp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rcp_iflag_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_exp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_rcp_iflag_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_exp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_rcp_iflag_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_exp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_rcp_iflag_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_exp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_rcp_iflag_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_exp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_rcp_iflag_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_exp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_rcp_iflag_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_exp_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_rcp_iflag_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_exp_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x40,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_rcp_iflag_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_exp_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_rcp_iflag_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_exp_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_rcp_iflag_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_exp_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_rcp_iflag_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x46,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_exp_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_rsq_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_exp_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_rsq_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x48,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_exp_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_rsq_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x48,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_exp_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_rsq_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_exp_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_rsq_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_exp_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_rsq_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_exp_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_rsq_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_exp_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_rsq_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_exp_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_rsq_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_exp_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_rsq_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_exp_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_rsq_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_exp_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_rsq_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_exp_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_rsq_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_exp_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_rsq_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_exp_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_rsq_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_exp_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_rsq_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_exp_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_rsq_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_exp_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_rsq_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_exp_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_rsq_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_exp_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_rsq_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_exp_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_rsq_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_exp_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_rsq_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_exp_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_rsq_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x48,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_exp_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_rsq_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_exp_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_rsq_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_exp_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x40,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_rsq_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_log_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rsq_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_log_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x42,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_rsq_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_log_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x42,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_rsq_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_log_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_rsq_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_log_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rsq_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_log_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_rsq_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_log_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_rsq_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_log_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_rsq_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_log_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_rsq_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_log_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_rsq_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_log_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_rsq_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_log_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_rsq_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_log_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_rsq_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_log_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_rsq_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_log_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rsq_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_log_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_rsq_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_log_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_rsq_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_log_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_rsq_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_log_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_rsq_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_log_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_rsq_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_log_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_rsq_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_log_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_rsq_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_log_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x42,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_rsq_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_log_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_rsq_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_log_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_rsq_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_log_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_rsq_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x48,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_log_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_sqrt_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_log_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_sqrt_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x4e,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_log_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_sqrt_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_log_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_sqrt_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_log_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_sqrt_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_log_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_sqrt_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_log_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_sqrt_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_log_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_sqrt_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_log_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_sqrt_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_log_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_sqrt_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_log_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_sqrt_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_log_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_sqrt_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_log_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_sqrt_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_log_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_sqrt_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_log_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_sqrt_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_log_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_sqrt_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_log_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_sqrt_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_log_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_sqrt_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_log_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_sqrt_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_log_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_sqrt_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_log_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_sqrt_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_log_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_sqrt_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_log_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_sqrt_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x4e,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_log_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_sqrt_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_log_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_sqrt_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_log_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x42,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_sqrt_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_rcp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_sqrt_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_rcp_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x44,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_sqrt_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_rcp_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x44,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_sqrt_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_rcp_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_sqrt_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_rcp_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_sqrt_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_rcp_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_sqrt_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_rcp_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_sqrt_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_rcp_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_sqrt_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_rcp_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_sqrt_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_rcp_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_sqrt_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_rcp_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_sqrt_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_rcp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_sqrt_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_rcp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_sqrt_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_rcp_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_sqrt_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_rcp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_sqrt_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_rcp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_sqrt_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_rcp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_sqrt_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_rcp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_sqrt_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_rcp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_sqrt_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_rcp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_sqrt_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_rcp_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_sqrt_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_rcp_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_sqrt_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_rcp_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x44,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_sqrt_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_rcp_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_sqrt_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_rcp_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_sqrt_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_rcp_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_sqrt_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x4e,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_rcp_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_sin_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rcp_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_sin_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x52,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_rcp_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_sin_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x52,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_rcp_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_sin_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_rcp_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_sin_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rcp_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_sin_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_rcp_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_sin_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_rcp_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_sin_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_rcp_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_sin_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_rcp_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_sin_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_rcp_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_sin_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_rcp_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_sin_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_rcp_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_sin_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_rcp_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_sin_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_rcp_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_sin_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rcp_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_sin_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_rcp_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_sin_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_rcp_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_sin_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_rcp_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_sin_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_rcp_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_sin_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_rcp_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_sin_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_rcp_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_sin_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_rcp_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_sin_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x52,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_rcp_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_sin_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_rcp_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_sin_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_rcp_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x44,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_sin_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_rcp_iflag_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_sin_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_rcp_iflag_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x46,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_sin_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_rcp_iflag_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x46,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_sin_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_rcp_iflag_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_sin_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_rcp_iflag_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_sin_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_rcp_iflag_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_sin_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_rcp_iflag_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_sin_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_rcp_iflag_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_sin_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_rcp_iflag_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_sin_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_rcp_iflag_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_sin_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_rcp_iflag_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_sin_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_rcp_iflag_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_sin_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_rcp_iflag_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_sin_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_rcp_iflag_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_sin_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_rcp_iflag_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_sin_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_rcp_iflag_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_sin_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_rcp_iflag_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_sin_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_rcp_iflag_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_sin_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_rcp_iflag_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_sin_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_rcp_iflag_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_sin_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_rcp_iflag_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_sin_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_rcp_iflag_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_sin_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_rcp_iflag_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x46,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_sin_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_rcp_iflag_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_sin_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_rcp_iflag_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_sin_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_rcp_iflag_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_sin_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x52,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_rcp_iflag_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_cos_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_cos_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x54,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_cos_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x54,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_cos_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_cos_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_cos_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_cos_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_cos_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_cos_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_cos_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_cos_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_cos_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_cos_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_cos_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_cos_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_cos_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_cos_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cos_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cos_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_cos_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_cos_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cos_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cos_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x54,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_rcp_iflag_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_cos_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_rcp_iflag_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_cos_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_rcp_iflag_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x46,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_cos_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_rsq_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cos_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_rsq_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x48,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_cos_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_rsq_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x48,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_cos_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_rsq_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_cos_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_rsq_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cos_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_rsq_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_cos_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_rsq_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_cos_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_rsq_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_cos_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_rsq_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_cos_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_rsq_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_cos_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_rsq_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_cos_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_rsq_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_cos_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_rsq_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cos_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_rsq_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cos_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_rsq_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cos_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_rsq_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_cos_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_rsq_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_cos_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_rsq_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_cos_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_rsq_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_cos_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_rsq_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_cos_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_rsq_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_cos_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_rsq_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_cos_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_rsq_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x48,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_cos_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_rsq_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_cos_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_rsq_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_cos_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_rsq_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_cos_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x54,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_rsq_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_not_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x56,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rsq_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_not_b32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x56,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_rsq_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_not_b32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x56,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_rsq_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_not_b32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x56,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rsq_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_not_b32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x56,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_rsq_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_not_b32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x56,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_rsq_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_not_b32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x56,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_rsq_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_not_b32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x56,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_rsq_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_not_b32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x56,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_rsq_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_not_b32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x56,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_rsq_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_not_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x56,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_rsq_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_not_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x56,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_rsq_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_not_b32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x56,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_rsq_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_not_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x56,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rsq_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_not_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x56,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_rsq_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_not_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x56,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_rsq_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_not_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x56,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_rsq_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_not_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x56,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_rsq_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_not_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x56,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_rsq_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_not_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x56,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_rsq_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_not_b32_sdwa v5, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x56,0x0a,0x7e,0x01,0x06,0x0e,0x06]
 
-v_rsq_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_not_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_rsq_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_not_b32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x56,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_rsq_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_not_b32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_rsq_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_not_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_rsq_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x48,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_not_b32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_sqrt_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_not_b32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_sqrt_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x4e,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_not_b32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_sqrt_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x4e,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_not_b32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_sqrt_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_not_b32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_sqrt_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_not_b32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_sqrt_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_not_b32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_sqrt_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_not_b32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_sqrt_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_not_b32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_sqrt_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_not_b32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_sqrt_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_not_b32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_sqrt_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_not_b32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_sqrt_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_not_b32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_sqrt_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_not_b32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_sqrt_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_not_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_sqrt_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_not_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_sqrt_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_not_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_sqrt_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_not_b32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_sqrt_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_not_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_sqrt_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_not_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_sqrt_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_not_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_sqrt_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_not_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_sqrt_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_not_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x56,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_sqrt_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x4e,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_bfrev_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x58,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_sqrt_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_bfrev_b32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x58,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_sqrt_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_bfrev_b32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x58,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_sqrt_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_bfrev_b32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x58,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_sqrt_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_bfrev_b32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x58,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_sqrt_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_bfrev_b32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x58,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_sqrt_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_bfrev_b32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x58,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_sqrt_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_bfrev_b32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x58,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_sqrt_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_bfrev_b32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x58,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_sqrt_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_bfrev_b32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x58,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_sqrt_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_bfrev_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x58,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_sqrt_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_bfrev_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x58,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_sqrt_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_bfrev_b32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x58,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_sqrt_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_bfrev_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x58,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_sqrt_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_bfrev_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x58,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_sqrt_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_bfrev_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x58,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_sqrt_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_bfrev_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x58,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_sqrt_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_bfrev_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x58,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_sqrt_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_bfrev_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x58,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_sqrt_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_bfrev_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x58,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_sqrt_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_bfrev_b32_sdwa v5, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x58,0x0a,0x7e,0x01,0x06,0x0e,0x06]
 
-v_sqrt_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_bfrev_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_sqrt_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_bfrev_b32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x58,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_sqrt_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_bfrev_b32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_sqrt_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_sqrt_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_bfrev_b32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_sqrt_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_bfrev_b32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_sqrt_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_bfrev_b32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_sqrt_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_bfrev_b32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_sqrt_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x4e,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_bfrev_b32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_sin_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_bfrev_b32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_sin_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x52,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_bfrev_b32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_sin_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x52,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_bfrev_b32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_sin_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_bfrev_b32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_sin_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_bfrev_b32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_sin_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_bfrev_b32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_sin_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_bfrev_b32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_sin_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_bfrev_b32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_sin_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_bfrev_b32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_sin_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_bfrev_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_sin_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_bfrev_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_sin_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_bfrev_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_sin_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_bfrev_b32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_sin_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_bfrev_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_sin_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_bfrev_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_sin_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_bfrev_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_sin_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_bfrev_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_sin_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_bfrev_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x58,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_sin_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_ffbh_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_sin_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_ffbh_u32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5a,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_sin_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_ffbh_u32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_sin_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_ffbh_u32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_sin_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x52,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_ffbh_u32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_sin_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_ffbh_u32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_sin_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_ffbh_u32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_sin_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_ffbh_u32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_sin_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_ffbh_u32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_sin_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_ffbh_u32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_sin_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_ffbh_u32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_sin_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_ffbh_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_sin_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_ffbh_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_sin_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_ffbh_u32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_sin_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_ffbh_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_sin_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_ffbh_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_sin_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_ffbh_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_sin_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_ffbh_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_sin_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_ffbh_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_sin_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_ffbh_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_sin_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_ffbh_u32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_sin_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_ffbh_u32_sdwa v5, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5a,0x0a,0x7e,0x01,0x06,0x0e,0x06]
 
-v_sin_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_ffbh_u32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_sin_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_ffbh_u32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_sin_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_ffbh_u32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_sin_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_ffbh_u32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_sin_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_ffbh_u32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_sin_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_ffbh_u32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_sin_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_ffbh_u32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_sin_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_ffbh_u32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_sin_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_ffbh_u32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_sin_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_ffbh_u32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_sin_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_ffbh_u32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_sin_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x52,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_ffbh_u32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cos_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_ffbh_u32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cos_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x54,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_ffbh_u32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cos_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x54,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_ffbh_u32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cos_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_ffbh_u32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cos_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_ffbh_u32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cos_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_ffbh_u32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cos_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_ffbh_u32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cos_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_ffbh_u32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cos_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_ffbh_u32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cos_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_ffbh_u32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cos_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_ffbh_u32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cos_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_ffbh_u32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cos_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_ffbh_u32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cos_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_ffbh_u32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cos_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_ffbh_u32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x5a,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cos_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_ffbl_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cos_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_ffbl_b32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5c,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cos_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_ffbl_b32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cos_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_ffbl_b32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cos_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_ffbl_b32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cos_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_ffbl_b32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cos_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_ffbl_b32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cos_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x54,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_ffbl_b32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cos_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_ffbl_b32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cos_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_ffbl_b32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cos_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_ffbl_b32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cos_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_ffbl_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cos_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_ffbl_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cos_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_ffbl_b32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cos_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_ffbl_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cos_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_ffbl_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cos_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_ffbl_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cos_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_ffbl_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cos_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_ffbl_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cos_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_ffbl_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cos_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_ffbl_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cos_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_ffbl_b32_sdwa v5, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5c,0x0a,0x7e,0x01,0x06,0x0e,0x06]
 
-v_cos_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_ffbl_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cos_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_ffbl_b32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cos_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_ffbl_b32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cos_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_ffbl_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cos_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_ffbl_b32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cos_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_ffbl_b32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cos_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_ffbl_b32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cos_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_ffbl_b32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cos_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_ffbl_b32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cos_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_ffbl_b32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cos_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_ffbl_b32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cos_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_ffbl_b32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cos_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_ffbl_b32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cos_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_ffbl_b32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cos_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x54,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_ffbl_b32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_not_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x56,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_ffbl_b32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_not_b32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x56,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_ffbl_b32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_not_b32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x56,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_ffbl_b32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_not_b32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x56,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_ffbl_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_not_b32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x56,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_ffbl_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_not_b32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x56,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_ffbl_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_not_b32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x56,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_ffbl_b32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_not_b32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x56,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_ffbl_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_not_b32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x56,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_ffbl_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_not_b32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x56,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_ffbl_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_not_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x56,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_ffbl_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_not_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x56,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_ffbl_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x5c,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_not_b32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x56,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_ffbh_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_not_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x56,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_ffbh_i32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5e,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_not_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x56,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_ffbh_i32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_not_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x56,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_ffbh_i32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_not_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x56,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_ffbh_i32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_not_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x56,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_ffbh_i32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_not_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x56,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_ffbh_i32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_not_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x56,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_ffbh_i32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_not_b32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x56,0x00,0x7e,0x00,0x06,0x0e,0x06]
+v_ffbh_i32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_not_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_ffbh_i32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_not_b32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x56,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_ffbh_i32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_not_b32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_ffbh_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_not_b32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_ffbh_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_not_b32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_ffbh_i32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_not_b32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_ffbh_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_not_b32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_ffbh_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_not_b32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_ffbh_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_not_b32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_ffbh_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_not_b32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_ffbh_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_not_b32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_ffbh_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_not_b32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_ffbh_i32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_not_b32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_ffbh_i32_sdwa v5, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x5e,0x0a,0x7e,0x01,0x06,0x0e,0x06]
 
-v_not_b32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_ffbh_i32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_not_b32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_ffbh_i32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_not_b32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_ffbh_i32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_not_b32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_ffbh_i32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_not_b32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_ffbh_i32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_not_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_ffbh_i32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_not_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_ffbh_i32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_not_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_ffbh_i32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_not_b32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_ffbh_i32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_not_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_ffbh_i32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_not_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_ffbh_i32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_not_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_ffbh_i32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_not_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_ffbh_i32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_not_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x56,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_ffbh_i32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_bfrev_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x58,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_ffbh_i32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_bfrev_b32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x58,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_ffbh_i32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_bfrev_b32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x58,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_ffbh_i32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_bfrev_b32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x58,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_ffbh_i32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_bfrev_b32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x58,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_ffbh_i32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_bfrev_b32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x58,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_ffbh_i32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_bfrev_b32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x58,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_ffbh_i32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_bfrev_b32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x58,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_ffbh_i32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_bfrev_b32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x58,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_ffbh_i32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_bfrev_b32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x58,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_ffbh_i32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_bfrev_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x58,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_ffbh_i32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_bfrev_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x58,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_ffbh_i32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_bfrev_b32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x58,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_ffbh_i32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x5e,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_bfrev_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x58,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_frexp_exp_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_bfrev_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x58,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_frexp_exp_i32_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x66,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_bfrev_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x58,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_frexp_exp_i32_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x66,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_bfrev_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x58,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_frexp_exp_i32_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_bfrev_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x58,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_frexp_exp_i32_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_bfrev_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x58,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_frexp_exp_i32_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_bfrev_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x58,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_frexp_exp_i32_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_bfrev_b32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x58,0x00,0x7e,0x00,0x06,0x0e,0x06]
+v_frexp_exp_i32_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_bfrev_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_frexp_exp_i32_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_bfrev_b32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x58,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_frexp_exp_i32_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_bfrev_b32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_frexp_exp_i32_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_bfrev_b32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_frexp_exp_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_bfrev_b32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_frexp_exp_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_bfrev_b32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_frexp_exp_i32_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_bfrev_b32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_frexp_exp_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_bfrev_b32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_frexp_exp_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_bfrev_b32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_frexp_exp_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_bfrev_b32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_frexp_exp_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_bfrev_b32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_frexp_exp_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_bfrev_b32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_frexp_exp_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_bfrev_b32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_frexp_exp_i32_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_bfrev_b32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_frexp_exp_i32_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_bfrev_b32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_frexp_exp_i32_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x66,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_bfrev_b32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_frexp_exp_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_bfrev_b32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_frexp_exp_i32_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_bfrev_b32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_frexp_exp_i32_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_bfrev_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_frexp_exp_i32_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_bfrev_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_frexp_exp_i32_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_bfrev_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_frexp_exp_i32_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_bfrev_b32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_frexp_exp_i32_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_bfrev_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_frexp_exp_i32_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_bfrev_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_frexp_exp_i32_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_bfrev_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_frexp_exp_i32_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_bfrev_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_frexp_exp_i32_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_bfrev_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x58,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_frexp_exp_i32_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_frexp_exp_i32_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_ffbh_u32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5a,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_frexp_exp_i32_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_ffbh_u32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5a,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_frexp_exp_i32_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_ffbh_u32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_frexp_exp_i32_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_ffbh_u32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_frexp_exp_i32_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_ffbh_u32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_frexp_exp_i32_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_ffbh_u32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_frexp_exp_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_ffbh_u32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_frexp_exp_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_ffbh_u32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_frexp_exp_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_ffbh_u32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_frexp_exp_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_ffbh_u32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_frexp_exp_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_frexp_exp_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_frexp_exp_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_frexp_exp_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_frexp_exp_i32_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_frexp_exp_i32_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_frexp_exp_i32_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x66,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_frexp_mant_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_frexp_mant_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x68,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_frexp_mant_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x68,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_ffbh_u32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_frexp_mant_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_ffbh_u32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5a,0x00,0x7e,0x00,0x06,0x0e,0x06]
+v_frexp_mant_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_ffbh_u32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_frexp_mant_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_ffbh_u32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_frexp_mant_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_ffbh_u32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_frexp_mant_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_ffbh_u32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_frexp_mant_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_ffbh_u32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_frexp_mant_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_ffbh_u32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_frexp_mant_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_ffbh_u32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_frexp_mant_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_ffbh_u32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_frexp_mant_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_ffbh_u32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_frexp_mant_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_ffbh_u32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_frexp_mant_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_ffbh_u32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_frexp_mant_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_ffbh_u32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_frexp_mant_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_ffbh_u32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_frexp_mant_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_ffbh_u32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_frexp_mant_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_ffbh_u32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_frexp_mant_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_ffbh_u32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_frexp_mant_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_ffbh_u32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_frexp_mant_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_ffbh_u32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_frexp_mant_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x68,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_ffbh_u32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_frexp_mant_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_ffbh_u32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_frexp_mant_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_ffbh_u32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_frexp_mant_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_ffbh_u32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_frexp_mant_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_ffbh_u32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_frexp_mant_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_ffbh_u32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_frexp_mant_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_ffbh_u32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_frexp_mant_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_ffbh_u32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_frexp_mant_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_ffbh_u32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x5a,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_frexp_mant_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_ffbl_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_frexp_mant_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_ffbl_b32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5c,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_frexp_mant_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_ffbl_b32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5c,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_frexp_mant_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_ffbl_b32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_frexp_mant_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_ffbl_b32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_frexp_mant_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_ffbl_b32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_frexp_mant_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_ffbl_b32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_frexp_mant_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_ffbl_b32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_frexp_mant_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_ffbl_b32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_frexp_mant_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_ffbl_b32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_frexp_mant_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_ffbl_b32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_frexp_mant_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_ffbl_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_frexp_mant_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_ffbl_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_frexp_mant_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_ffbl_b32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_frexp_mant_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_ffbl_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_frexp_mant_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_ffbl_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_frexp_mant_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_ffbl_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_frexp_mant_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_ffbl_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_frexp_mant_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_ffbl_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_frexp_mant_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_ffbl_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_frexp_mant_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x68,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_ffbl_b32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_cvt_f16_u16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_ffbl_b32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5c,0x00,0x7e,0x00,0x06,0x0e,0x06]
+v_cvt_f16_u16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x72,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_ffbl_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_cvt_f16_u16_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x72,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_ffbl_b32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_cvt_f16_u16_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_ffbl_b32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_cvt_f16_u16_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_ffbl_b32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_cvt_f16_u16_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_ffbl_b32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_cvt_f16_u16_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_ffbl_b32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_cvt_f16_u16_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_ffbl_b32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_cvt_f16_u16_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_ffbl_b32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_cvt_f16_u16_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_ffbl_b32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_cvt_f16_u16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_ffbl_b32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_cvt_f16_u16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_ffbl_b32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_cvt_f16_u16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_ffbl_b32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_cvt_f16_u16_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_ffbl_b32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_cvt_f16_u16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_ffbl_b32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_cvt_f16_u16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_ffbl_b32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_cvt_f16_u16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_ffbl_b32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_cvt_f16_u16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_ffbl_b32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_cvt_f16_u16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_ffbl_b32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_cvt_f16_u16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_ffbl_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_cvt_f16_u16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_ffbl_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_cvt_f16_u16_sdwa v5, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x72,0x0a,0x7e,0x01,0x06,0x0e,0x06]
 
-v_ffbl_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_f16_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_ffbl_b32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_f16_u16_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x72,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_ffbl_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_cvt_f16_u16_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_ffbl_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_cvt_f16_u16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_ffbl_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_f16_u16_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_ffbl_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_f16_u16_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_ffbl_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x5c,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_cvt_f16_u16_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_ffbh_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f16_u16_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_ffbh_i32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5e,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_cvt_f16_u16_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_ffbh_i32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5e,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_cvt_f16_u16_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_ffbh_i32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_cvt_f16_u16_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_ffbh_i32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f16_u16_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_ffbh_i32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_cvt_f16_u16_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_ffbh_i32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_cvt_f16_u16_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_ffbh_i32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_cvt_f16_u16_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_ffbh_i32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_cvt_f16_u16_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_ffbh_i32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_cvt_f16_u16_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_ffbh_i32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_cvt_f16_u16_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_ffbh_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_cvt_f16_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_ffbh_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_f16_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_ffbh_i32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_f16_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_ffbh_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f16_u16_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_ffbh_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_cvt_f16_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_ffbh_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_cvt_f16_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_ffbh_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_cvt_f16_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_ffbh_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_cvt_f16_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_ffbh_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_cvt_f16_u16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x72,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_ffbh_i32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_cvt_f16_i16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_ffbh_i32_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x5e,0x00,0x7e,0x00,0x06,0x0e,0x06]
+v_cvt_f16_i16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x74,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_ffbh_i32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_cvt_f16_i16_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x74,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_ffbh_i32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_cvt_f16_i16_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_ffbh_i32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_cvt_f16_i16_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_ffbh_i32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_cvt_f16_i16_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_ffbh_i32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_cvt_f16_i16_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_ffbh_i32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_cvt_f16_i16_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_ffbh_i32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_cvt_f16_i16_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_ffbh_i32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_cvt_f16_i16_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_ffbh_i32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_cvt_f16_i16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_ffbh_i32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_cvt_f16_i16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_ffbh_i32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_cvt_f16_i16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_ffbh_i32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_cvt_f16_i16_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_ffbh_i32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_cvt_f16_i16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_ffbh_i32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_cvt_f16_i16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_ffbh_i32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_cvt_f16_i16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_ffbh_i32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_cvt_f16_i16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_ffbh_i32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_cvt_f16_i16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_ffbh_i32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_cvt_f16_i16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_ffbh_i32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_cvt_f16_i16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_ffbh_i32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_cvt_f16_i16_sdwa v5, sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x74,0x0a,0x7e,0x01,0x06,0x0e,0x06]
 
-v_ffbh_i32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_f16_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_ffbh_i32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_f16_i16_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x74,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_ffbh_i32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_cvt_f16_i16_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_ffbh_i32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_cvt_f16_i16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_ffbh_i32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_f16_i16_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_ffbh_i32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_f16_i16_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_ffbh_i32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x5e,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_cvt_f16_i16_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_frexp_exp_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f16_i16_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_frexp_exp_i32_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x66,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_cvt_f16_i16_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_frexp_exp_i32_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x66,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_cvt_f16_i16_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_frexp_exp_i32_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_cvt_f16_i16_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_frexp_exp_i32_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f16_i16_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_frexp_exp_i32_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_cvt_f16_i16_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_frexp_exp_i32_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_cvt_f16_i16_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_frexp_exp_i32_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_cvt_f16_i16_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_frexp_exp_i32_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_cvt_f16_i16_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_frexp_exp_i32_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_cvt_f16_i16_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_frexp_exp_i32_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_cvt_f16_i16_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_frexp_exp_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_cvt_f16_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_frexp_exp_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_f16_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_frexp_exp_i32_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_f16_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_frexp_exp_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_f16_i16_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_frexp_exp_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_cvt_f16_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_frexp_exp_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_cvt_f16_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_frexp_exp_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_cvt_f16_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_frexp_exp_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_cvt_f16_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_frexp_exp_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_cvt_f16_i16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x74,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_frexp_exp_i32_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_cvt_u16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_frexp_exp_i32_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_cvt_u16_f16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x76,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_frexp_exp_i32_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x66,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_cvt_u16_f16_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x76,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_cvt_u16_f16_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_frexp_exp_i32_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_cvt_u16_f16_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_cvt_u16_f16_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_cvt_u16_f16_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_cvt_u16_f16_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_cvt_u16_f16_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_cvt_u16_f16_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_cvt_u16_f16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_cvt_u16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_cvt_u16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_cvt_u16_f16_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_cvt_u16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_cvt_u16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_cvt_u16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_cvt_u16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_cvt_u16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_cvt_u16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_cvt_u16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_cvt_u16_f16_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_cvt_u16_f16_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x76,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_frexp_exp_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_u16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_frexp_exp_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_u16_f16_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_frexp_exp_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_cvt_u16_f16_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_frexp_exp_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_cvt_u16_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_frexp_exp_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_u16_f16_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_frexp_exp_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_u16_f16_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_frexp_exp_i32_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_cvt_u16_f16_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_frexp_exp_i32_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_cvt_u16_f16_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_frexp_exp_i32_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x66,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_cvt_u16_f16_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_frexp_mant_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_u16_f16_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_frexp_mant_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x68,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_cvt_u16_f16_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_frexp_mant_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x68,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_cvt_u16_f16_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_frexp_mant_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_cvt_u16_f16_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_frexp_mant_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_u16_f16_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_frexp_mant_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_cvt_u16_f16_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_frexp_mant_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_cvt_u16_f16_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_frexp_mant_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_cvt_u16_f16_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_frexp_mant_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_cvt_u16_f16_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_frexp_mant_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_cvt_u16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_frexp_mant_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_cvt_u16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_frexp_mant_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_cvt_u16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_frexp_mant_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_u16_f16_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_frexp_mant_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_u16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_frexp_mant_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_u16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_frexp_mant_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_cvt_u16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_frexp_mant_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_cvt_u16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_frexp_mant_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_cvt_u16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_frexp_mant_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_cvt_u16_f16_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_frexp_mant_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_cvt_u16_f16_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x76,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_frexp_mant_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_cvt_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_frexp_mant_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_cvt_i16_f16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x78,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_frexp_mant_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x68,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_cvt_i16_f16_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x78,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_frexp_mant_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_cvt_i16_f16_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_frexp_mant_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_cvt_i16_f16_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_frexp_mant_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_cvt_i16_f16_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_frexp_mant_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_cvt_i16_f16_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_frexp_mant_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_cvt_i16_f16_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_frexp_mant_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_cvt_i16_f16_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_frexp_mant_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_cvt_i16_f16_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_frexp_mant_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_cvt_i16_f16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_frexp_mant_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_cvt_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_frexp_mant_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_cvt_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_frexp_mant_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_cvt_i16_f16_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_frexp_mant_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_cvt_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_frexp_mant_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_cvt_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_frexp_mant_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_cvt_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_frexp_mant_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_cvt_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_frexp_mant_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_cvt_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_frexp_mant_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_cvt_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_frexp_mant_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_cvt_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_frexp_mant_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_cvt_i16_f16_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_frexp_mant_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_cvt_i16_f16_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x78,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_frexp_mant_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_frexp_mant_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cvt_i16_f16_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_frexp_mant_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_cvt_i16_f16_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_frexp_mant_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_cvt_i16_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_frexp_mant_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_i16_f16_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_frexp_mant_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cvt_i16_f16_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_frexp_mant_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_cvt_i16_f16_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_frexp_mant_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_cvt_i16_f16_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_frexp_mant_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x68,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_cvt_i16_f16_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_i16_f16_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cvt_f16_u16_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x72,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_cvt_i16_f16_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cvt_f16_u16_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x72,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_cvt_i16_f16_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cvt_f16_u16_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_cvt_i16_f16_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cvt_f16_u16_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_i16_f16_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cvt_f16_u16_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_cvt_i16_f16_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cvt_f16_u16_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_cvt_i16_f16_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cvt_f16_u16_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_cvt_i16_f16_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cvt_f16_u16_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_cvt_i16_f16_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_cvt_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cvt_f16_u16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_cvt_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_cvt_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cvt_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cvt_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_cvt_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_cvt_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_cvt_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_cvt_i16_f16_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_cvt_i16_f16_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x78,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_cvt_f16_u16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_rcp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f16_u16_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x72,0x00,0x7e,0x00,0x06,0x0e,0x06]
+v_rcp_f16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7a,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_rcp_f16_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cvt_f16_u16_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x72,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_rcp_f16_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cvt_f16_u16_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_rcp_f16_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_rcp_f16_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_rcp_f16_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_rcp_f16_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_rcp_f16_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_rcp_f16_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_rcp_f16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_rcp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_rcp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_rcp_f16_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_rcp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_rcp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_rcp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_rcp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_rcp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_rcp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_rcp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_rcp_f16_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_rcp_f16_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7a,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_cvt_f16_u16_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_rcp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cvt_f16_u16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_rcp_f16_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cvt_f16_u16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_rcp_f16_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cvt_f16_u16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_rcp_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cvt_f16_u16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_rcp_f16_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cvt_f16_u16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x72,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_rcp_f16_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cvt_f16_i16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rcp_f16_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cvt_f16_i16_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x74,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_rcp_f16_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cvt_f16_i16_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x74,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_rcp_f16_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cvt_f16_i16_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_rcp_f16_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cvt_f16_i16_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rcp_f16_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cvt_f16_i16_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_rcp_f16_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cvt_f16_i16_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_rcp_f16_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cvt_f16_i16_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_rcp_f16_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cvt_f16_i16_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_rcp_f16_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cvt_f16_i16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_rcp_f16_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cvt_f16_i16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_rcp_f16_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cvt_f16_i16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_rcp_f16_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cvt_f16_i16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_rcp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cvt_f16_i16_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_rcp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cvt_f16_i16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rcp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f16_i16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_rcp_f16_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_f16_i16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_rcp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cvt_f16_i16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_rcp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cvt_f16_i16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_rcp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f16_i16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_rcp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_f16_i16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_rcp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cvt_f16_i16_sdwa v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x74,0x00,0x7e,0x00,0x06,0x0e,0x06]
+v_rcp_f16_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_cvt_f16_i16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_rcp_f16_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7a,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_cvt_f16_i16_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x74,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_sqrt_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f16_i16_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_sqrt_f16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7c,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_sqrt_f16_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_sqrt_f16_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_sqrt_f16_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_sqrt_f16_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_sqrt_f16_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_sqrt_f16_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_sqrt_f16_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_sqrt_f16_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_sqrt_f16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_sqrt_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_sqrt_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_sqrt_f16_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_sqrt_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_sqrt_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_sqrt_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_sqrt_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_sqrt_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_sqrt_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_sqrt_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_sqrt_f16_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_sqrt_f16_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7c,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_cvt_f16_i16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_sqrt_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cvt_f16_i16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_sqrt_f16_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cvt_f16_i16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x74,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_sqrt_f16_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cvt_u16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_sqrt_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cvt_u16_f16_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x76,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_sqrt_f16_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cvt_u16_f16_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x76,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_sqrt_f16_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cvt_u16_f16_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_sqrt_f16_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cvt_u16_f16_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_sqrt_f16_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cvt_u16_f16_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_sqrt_f16_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cvt_u16_f16_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_sqrt_f16_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cvt_u16_f16_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_sqrt_f16_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cvt_u16_f16_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_sqrt_f16_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cvt_u16_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_sqrt_f16_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cvt_u16_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_sqrt_f16_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cvt_u16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_sqrt_f16_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cvt_u16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_sqrt_f16_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cvt_u16_f16_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_sqrt_f16_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cvt_u16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_sqrt_f16_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cvt_u16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_sqrt_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cvt_u16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_sqrt_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cvt_u16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_sqrt_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_u16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_sqrt_f16_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_u16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_sqrt_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cvt_u16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_sqrt_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cvt_u16_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_sqrt_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_u16_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x76,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_sqrt_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_u16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_sqrt_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cvt_u16_f16_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_sqrt_f16_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_cvt_u16_f16_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_sqrt_f16_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7c,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_cvt_u16_f16_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_rsq_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_rsq_f16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7e,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_rsq_f16_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_rsq_f16_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_rsq_f16_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_rsq_f16_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_rsq_f16_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_rsq_f16_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_rsq_f16_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_rsq_f16_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_rsq_f16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_rsq_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_rsq_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_rsq_f16_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_rsq_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_rsq_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_rsq_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_rsq_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_rsq_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_rsq_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_rsq_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_rsq_f16_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_rsq_f16_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x7e,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_cvt_u16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_rsq_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cvt_u16_f16_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_rsq_f16_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cvt_u16_f16_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x76,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_rsq_f16_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_cvt_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rsq_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_cvt_i16_f16_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x78,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_rsq_f16_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_cvt_i16_f16_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x78,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_rsq_f16_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_cvt_i16_f16_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_rsq_f16_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_cvt_i16_f16_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rsq_f16_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_cvt_i16_f16_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_rsq_f16_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_cvt_i16_f16_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_rsq_f16_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_cvt_i16_f16_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_rsq_f16_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_cvt_i16_f16_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_rsq_f16_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_cvt_i16_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_rsq_f16_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_cvt_i16_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_rsq_f16_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_cvt_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_rsq_f16_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_cvt_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_rsq_f16_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_cvt_i16_f16_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_rsq_f16_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_cvt_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rsq_f16_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_cvt_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_rsq_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_cvt_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_rsq_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_cvt_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_rsq_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_rsq_f16_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_cvt_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_rsq_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_cvt_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_rsq_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_cvt_i16_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_rsq_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_i16_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x78,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_rsq_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_cvt_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_rsq_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_cvt_i16_f16_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_rsq_f16_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_cvt_i16_f16_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_rsq_f16_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x7e,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_cvt_i16_f16_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_log_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_log_f16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x80,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_log_f16_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x80,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_log_f16_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_log_f16_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_log_f16_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_log_f16_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_log_f16_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_log_f16_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_log_f16_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_log_f16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_log_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_log_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_log_f16_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_log_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_log_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_log_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_log_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_log_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_log_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_log_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_log_f16_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_log_f16_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x80,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_cvt_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_log_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cvt_i16_f16_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_log_f16_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_cvt_i16_f16_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x78,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_log_f16_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_log_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_rcp_f16_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7a,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_log_f16_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_rcp_f16_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7a,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_log_f16_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_rcp_f16_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_log_f16_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_rcp_f16_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_log_f16_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_rcp_f16_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_log_f16_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_rcp_f16_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_log_f16_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_rcp_f16_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_log_f16_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_rcp_f16_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_log_f16_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_rcp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_log_f16_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_rcp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_log_f16_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_log_f16_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_log_f16_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_rcp_f16_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_log_f16_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_log_f16_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_log_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_log_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_log_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_log_f16_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_log_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_rcp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_log_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_rcp_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_log_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_rcp_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7a,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_log_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_rcp_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_log_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_rcp_f16_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_log_f16_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_rcp_f16_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_log_f16_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x80,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_rcp_f16_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_exp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rcp_f16_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_exp_f16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x82,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_rcp_f16_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_exp_f16_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x82,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_rcp_f16_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_exp_f16_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_rcp_f16_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_exp_f16_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rcp_f16_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_exp_f16_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_rcp_f16_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_exp_f16_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_rcp_f16_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_exp_f16_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_rcp_f16_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_exp_f16_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_rcp_f16_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_exp_f16_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_rcp_f16_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_exp_f16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_rcp_f16_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_exp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_rcp_f16_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_exp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_rcp_f16_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_exp_f16_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_rcp_f16_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_exp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rcp_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_exp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_rcp_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_exp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_rcp_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_exp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_rcp_f16_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_exp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_rcp_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_exp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_rcp_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_exp_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_rcp_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_exp_f16_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_rcp_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_exp_f16_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x82,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_rcp_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_exp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_rcp_f16_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_exp_f16_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_rcp_f16_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7a,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_exp_f16_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_exp_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_sqrt_f16_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7c,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_exp_f16_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_sqrt_f16_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7c,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_exp_f16_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_sqrt_f16_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_exp_f16_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_sqrt_f16_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_exp_f16_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_sqrt_f16_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_exp_f16_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_sqrt_f16_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_exp_f16_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_sqrt_f16_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_exp_f16_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_sqrt_f16_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_exp_f16_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_exp_f16_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_sqrt_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_exp_f16_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_exp_f16_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_exp_f16_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_exp_f16_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_exp_f16_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_exp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_exp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_exp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_exp_f16_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_exp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_sqrt_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_exp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_sqrt_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_exp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_sqrt_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7c,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_exp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_sqrt_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_exp_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_sqrt_f16_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_exp_f16_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_sqrt_f16_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_exp_f16_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x82,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_sqrt_f16_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_frexp_mant_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_sqrt_f16_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_frexp_mant_f16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x84,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_sqrt_f16_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_frexp_mant_f16_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x84,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_sqrt_f16_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_frexp_mant_f16_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_sqrt_f16_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_frexp_mant_f16_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_sqrt_f16_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_frexp_mant_f16_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_sqrt_f16_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_frexp_mant_f16_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_sqrt_f16_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_frexp_mant_f16_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_sqrt_f16_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_frexp_mant_f16_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_sqrt_f16_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_frexp_mant_f16_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_sqrt_f16_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_frexp_mant_f16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_sqrt_f16_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_frexp_mant_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_sqrt_f16_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_frexp_mant_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_sqrt_f16_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_frexp_mant_f16_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_sqrt_f16_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_frexp_mant_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_sqrt_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_frexp_mant_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_sqrt_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_frexp_mant_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_sqrt_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_frexp_mant_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_sqrt_f16_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_frexp_mant_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_sqrt_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_frexp_mant_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_sqrt_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_frexp_mant_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_sqrt_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_frexp_mant_f16_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_sqrt_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_frexp_mant_f16_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x84,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_sqrt_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_sqrt_f16_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_frexp_mant_f16_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_sqrt_f16_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7c,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_frexp_mant_f16_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_rsq_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_frexp_mant_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_rsq_f16_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7e,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_frexp_mant_f16_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_rsq_f16_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7e,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_frexp_mant_f16_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_rsq_f16_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_frexp_mant_f16_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_rsq_f16_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_frexp_mant_f16_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_rsq_f16_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_frexp_mant_f16_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_rsq_f16_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_frexp_mant_f16_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_rsq_f16_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_frexp_mant_f16_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_rsq_f16_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_frexp_mant_f16_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_rsq_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_frexp_mant_f16_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_rsq_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_frexp_mant_f16_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_rsq_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_frexp_mant_f16_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_rsq_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_frexp_mant_f16_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_rsq_f16_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_frexp_mant_f16_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_rsq_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_frexp_mant_f16_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_rsq_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_rsq_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_rsq_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_rsq_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_rsq_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_rsq_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_rsq_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_rsq_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x7e,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_rsq_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_frexp_mant_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_rsq_f16_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_frexp_mant_f16_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_rsq_f16_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_frexp_mant_f16_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x84,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_rsq_f16_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_frexp_exp_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rsq_f16_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_frexp_exp_i16_f16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x86,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_rsq_f16_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_frexp_exp_i16_f16_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x86,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_rsq_f16_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_frexp_exp_i16_f16_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_rsq_f16_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_frexp_exp_i16_f16_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rsq_f16_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_frexp_exp_i16_f16_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_rsq_f16_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_frexp_exp_i16_f16_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_rsq_f16_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_frexp_exp_i16_f16_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_rsq_f16_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_frexp_exp_i16_f16_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_rsq_f16_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_frexp_exp_i16_f16_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_rsq_f16_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_frexp_exp_i16_f16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_rsq_f16_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_frexp_exp_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_rsq_f16_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_frexp_exp_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_rsq_f16_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_frexp_exp_i16_f16_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_rsq_f16_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_frexp_exp_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rsq_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_frexp_exp_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_rsq_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_frexp_exp_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_rsq_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_frexp_exp_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_rsq_f16_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_frexp_exp_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_rsq_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_frexp_exp_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_rsq_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_frexp_exp_i16_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_rsq_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_frexp_exp_i16_f16_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_rsq_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_frexp_exp_i16_f16_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x86,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_rsq_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_frexp_exp_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_rsq_f16_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_frexp_exp_i16_f16_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_rsq_f16_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x7e,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_frexp_exp_i16_f16_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_log_f16_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x80,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_log_f16_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x80,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_log_f16_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_log_f16_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_log_f16_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_log_f16_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_log_f16_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_log_f16_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_log_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_log_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_log_f16_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_log_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_log_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_log_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x80,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_frexp_exp_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_log_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_frexp_exp_i16_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_log_f16_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_frexp_exp_i16_f16_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_log_f16_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_frexp_exp_i16_f16_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x86,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_log_f16_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_floor_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_log_f16_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_floor_f16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x88,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_log_f16_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_floor_f16_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x88,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_log_f16_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_floor_f16_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_log_f16_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_floor_f16_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_log_f16_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_floor_f16_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_log_f16_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_floor_f16_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_log_f16_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_floor_f16_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_log_f16_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_floor_f16_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_log_f16_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_floor_f16_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_log_f16_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_floor_f16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_log_f16_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_floor_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_log_f16_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_floor_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_log_f16_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_floor_f16_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_log_f16_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_floor_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_log_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_floor_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_log_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_floor_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_log_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_floor_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_log_f16_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_floor_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_log_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_floor_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_log_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_floor_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_log_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_floor_f16_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_log_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_floor_f16_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x88,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_log_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_floor_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_log_f16_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_floor_f16_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_log_f16_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x80,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_floor_f16_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_floor_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_exp_f16_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x82,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_floor_f16_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_exp_f16_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x82,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_floor_f16_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_exp_f16_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_floor_f16_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_exp_f16_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_floor_f16_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_exp_f16_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_floor_f16_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_exp_f16_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_floor_f16_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_exp_f16_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_floor_f16_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_exp_f16_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_floor_f16_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_exp_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_floor_f16_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_exp_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_floor_f16_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_floor_f16_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_floor_f16_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_exp_f16_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_floor_f16_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_floor_f16_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_floor_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_floor_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_floor_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_floor_f16_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_floor_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_exp_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_floor_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_exp_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_floor_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_exp_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x82,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_floor_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_exp_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_floor_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_exp_f16_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_floor_f16_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_exp_f16_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_floor_f16_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x88,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_exp_f16_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_ceil_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_exp_f16_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_ceil_f16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8a,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_exp_f16_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_ceil_f16_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_exp_f16_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_ceil_f16_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_exp_f16_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_ceil_f16_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_exp_f16_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_ceil_f16_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_exp_f16_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_ceil_f16_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_exp_f16_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_ceil_f16_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_exp_f16_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_ceil_f16_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_exp_f16_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_ceil_f16_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_exp_f16_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_ceil_f16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_exp_f16_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_ceil_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_exp_f16_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_ceil_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_exp_f16_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_ceil_f16_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_exp_f16_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_ceil_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_exp_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_ceil_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_exp_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_ceil_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_exp_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_ceil_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_exp_f16_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_ceil_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_exp_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_ceil_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_exp_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_ceil_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_exp_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_ceil_f16_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_exp_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_ceil_f16_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8a,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_exp_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_ceil_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_exp_f16_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_ceil_f16_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_exp_f16_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x82,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_ceil_f16_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_ceil_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_frexp_mant_f16_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x84,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_ceil_f16_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_frexp_mant_f16_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x84,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_ceil_f16_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_frexp_mant_f16_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_ceil_f16_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_frexp_mant_f16_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_ceil_f16_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_frexp_mant_f16_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_ceil_f16_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_frexp_mant_f16_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_ceil_f16_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_frexp_mant_f16_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_ceil_f16_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_frexp_mant_f16_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_ceil_f16_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_ceil_f16_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_frexp_mant_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_ceil_f16_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_ceil_f16_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_ceil_f16_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_ceil_f16_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_ceil_f16_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_ceil_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_ceil_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_ceil_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_ceil_f16_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_ceil_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_frexp_mant_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_ceil_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_frexp_mant_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_ceil_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_frexp_mant_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x84,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_ceil_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_frexp_mant_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_ceil_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_frexp_mant_f16_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_ceil_f16_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_frexp_mant_f16_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_ceil_f16_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8a,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_frexp_mant_f16_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_trunc_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_trunc_f16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8c,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_trunc_f16_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_trunc_f16_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_trunc_f16_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_trunc_f16_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_trunc_f16_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_trunc_f16_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_trunc_f16_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_trunc_f16_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_trunc_f16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_trunc_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_trunc_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_trunc_f16_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_trunc_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_trunc_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_trunc_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_trunc_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_trunc_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_trunc_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_trunc_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_trunc_f16_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_trunc_f16_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8c,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_frexp_mant_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_frexp_mant_f16_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_trunc_f16_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_frexp_mant_f16_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x84,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_trunc_f16_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_trunc_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_frexp_exp_i16_f16_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x86,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_trunc_f16_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_frexp_exp_i16_f16_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x86,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_trunc_f16_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_frexp_exp_i16_f16_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_trunc_f16_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_frexp_exp_i16_f16_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_trunc_f16_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_trunc_f16_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_trunc_f16_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_trunc_f16_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_trunc_f16_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_trunc_f16_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_trunc_f16_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_trunc_f16_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_trunc_f16_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_trunc_f16_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_trunc_f16_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_frexp_exp_i16_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_frexp_exp_i16_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_frexp_exp_i16_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x86,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_frexp_exp_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_trunc_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_frexp_exp_i16_f16_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_trunc_f16_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_frexp_exp_i16_f16_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_trunc_f16_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8c,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_frexp_exp_i16_f16_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_rndne_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_rndne_f16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8e,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_rndne_f16_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_rndne_f16_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_rndne_f16_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_rndne_f16_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_rndne_f16_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_rndne_f16_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_rndne_f16_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_rndne_f16_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_rndne_f16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_rndne_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_rndne_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_rndne_f16_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_rndne_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_rndne_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_rndne_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_rndne_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_rndne_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_rndne_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_rndne_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_rndne_f16_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_rndne_f16_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x8e,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_frexp_exp_i16_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_frexp_exp_i16_f16_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_rndne_f16_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_frexp_exp_i16_f16_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x86,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_rndne_f16_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_floor_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rndne_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_floor_f16_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x88,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_rndne_f16_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_floor_f16_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x88,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_rndne_f16_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_floor_f16_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_rndne_f16_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_floor_f16_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rndne_f16_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_floor_f16_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_rndne_f16_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_floor_f16_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_rndne_f16_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_floor_f16_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_rndne_f16_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_floor_f16_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_rndne_f16_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_floor_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_rndne_f16_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_floor_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_rndne_f16_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_floor_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_rndne_f16_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_floor_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_rndne_f16_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_floor_f16_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_rndne_f16_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_floor_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_rndne_f16_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_floor_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_floor_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_floor_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_floor_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_floor_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_floor_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_floor_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_floor_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x88,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_floor_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_rndne_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_floor_f16_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_rndne_f16_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_floor_f16_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_rndne_f16_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x8e,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_floor_f16_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_fract_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_floor_f16_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_fract_f16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x90,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_floor_f16_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_fract_f16_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x90,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_floor_f16_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_fract_f16_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_floor_f16_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_fract_f16_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_floor_f16_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_fract_f16_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_floor_f16_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_fract_f16_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_floor_f16_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_fract_f16_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_floor_f16_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_fract_f16_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_floor_f16_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_fract_f16_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_floor_f16_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_fract_f16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_floor_f16_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_fract_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_floor_f16_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_fract_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_floor_f16_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_fract_f16_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_floor_f16_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_fract_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_floor_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_fract_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_floor_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_fract_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_floor_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_fract_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_floor_f16_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_fract_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_floor_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_fract_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_floor_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_fract_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_floor_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_fract_f16_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_floor_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_fract_f16_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x90,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_floor_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_floor_f16_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_fract_f16_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_floor_f16_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x88,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_fract_f16_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_ceil_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_fract_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_ceil_f16_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8a,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_fract_f16_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_ceil_f16_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8a,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_fract_f16_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_ceil_f16_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_fract_f16_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_ceil_f16_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_fract_f16_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_ceil_f16_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_fract_f16_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_ceil_f16_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_fract_f16_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_ceil_f16_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_fract_f16_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_ceil_f16_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_fract_f16_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_ceil_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_fract_f16_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_ceil_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_fract_f16_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_ceil_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_fract_f16_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_ceil_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_fract_f16_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_ceil_f16_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_fract_f16_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_ceil_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_fract_f16_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_ceil_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_ceil_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_ceil_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_ceil_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_ceil_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_ceil_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_ceil_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_ceil_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8a,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_ceil_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_fract_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_ceil_f16_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_fract_f16_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_ceil_f16_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_fract_f16_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x90,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_ceil_f16_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_sin_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_ceil_f16_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_sin_f16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x92,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_ceil_f16_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_sin_f16_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x92,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_ceil_f16_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_sin_f16_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_ceil_f16_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_sin_f16_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_ceil_f16_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_sin_f16_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_ceil_f16_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_sin_f16_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_ceil_f16_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_sin_f16_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_ceil_f16_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_sin_f16_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_ceil_f16_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_sin_f16_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_ceil_f16_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_sin_f16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_ceil_f16_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_sin_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_ceil_f16_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_sin_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_ceil_f16_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_sin_f16_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_ceil_f16_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_sin_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_ceil_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_sin_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_ceil_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_sin_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_ceil_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_sin_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_ceil_f16_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_sin_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_ceil_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_sin_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_ceil_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_sin_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_ceil_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_sin_f16_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_ceil_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_sin_f16_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x92,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_ceil_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_ceil_f16_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_sin_f16_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_ceil_f16_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8a,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_sin_f16_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_trunc_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_sin_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_trunc_f16_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8c,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_sin_f16_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_trunc_f16_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8c,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_sin_f16_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_trunc_f16_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_sin_f16_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_trunc_f16_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_sin_f16_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_trunc_f16_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_sin_f16_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_trunc_f16_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_sin_f16_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_trunc_f16_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_sin_f16_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_trunc_f16_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_sin_f16_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_trunc_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_sin_f16_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_trunc_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_sin_f16_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_trunc_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_sin_f16_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_trunc_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_sin_f16_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_trunc_f16_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_sin_f16_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_trunc_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_sin_f16_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_trunc_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_trunc_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_trunc_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_trunc_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_trunc_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_trunc_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_trunc_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_trunc_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8c,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_trunc_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_sin_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_trunc_f16_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_sin_f16_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_trunc_f16_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_sin_f16_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x92,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_trunc_f16_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_cos_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_trunc_f16_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_cos_f16_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x94,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_trunc_f16_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_cos_f16_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x94,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_trunc_f16_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_cos_f16_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_trunc_f16_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_cos_f16_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_trunc_f16_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_cos_f16_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_trunc_f16_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_cos_f16_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_trunc_f16_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_cos_f16_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_trunc_f16_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_cos_f16_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_trunc_f16_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_cos_f16_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_trunc_f16_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_cos_f16_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_trunc_f16_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_cos_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_trunc_f16_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_cos_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_trunc_f16_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_cos_f16_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_trunc_f16_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_cos_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_trunc_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_cos_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_trunc_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_cos_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_trunc_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cos_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_trunc_f16_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_cos_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_trunc_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_cos_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_trunc_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_cos_f16_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_trunc_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cos_f16_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_trunc_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_cos_f16_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x94,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_trunc_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_trunc_f16_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_cos_f16_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_trunc_f16_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8c,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_cos_f16_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cos_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_rndne_f16_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8e,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_cos_f16_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_rndne_f16_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8e,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_cos_f16_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_rndne_f16_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_cos_f16_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_rndne_f16_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cos_f16_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_rndne_f16_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_cos_f16_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_rndne_f16_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_cos_f16_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_rndne_f16_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_cos_f16_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_rndne_f16_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_cos_f16_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_rndne_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_cos_f16_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_rndne_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_cos_f16_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_cos_f16_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cos_f16_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_rndne_f16_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_cos_f16_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_cos_f16_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_rndne_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_rndne_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_rndne_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x8e,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_rndne_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_cos_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_rndne_f16_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_cos_f16_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_rndne_f16_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_cos_f16_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x94,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_rndne_f16_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_exp_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rndne_f16_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_exp_legacy_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x96,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_rndne_f16_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_exp_legacy_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x96,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_rndne_f16_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_exp_legacy_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_rndne_f16_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_exp_legacy_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rndne_f16_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_exp_legacy_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_rndne_f16_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_exp_legacy_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_rndne_f16_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_exp_legacy_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_rndne_f16_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_exp_legacy_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_rndne_f16_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_exp_legacy_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_rndne_f16_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_exp_legacy_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_rndne_f16_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_exp_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_rndne_f16_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_exp_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_rndne_f16_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_exp_legacy_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_rndne_f16_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_exp_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_rndne_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_exp_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_rndne_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_exp_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_rndne_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_exp_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_rndne_f16_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_exp_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_rndne_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_exp_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_rndne_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_exp_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_rndne_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_exp_legacy_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_rndne_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_exp_legacy_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x96,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_rndne_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_exp_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_rndne_f16_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_exp_legacy_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_rndne_f16_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x8e,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_exp_legacy_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_fract_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_exp_legacy_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_fract_f16_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x90,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_exp_legacy_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_fract_f16_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x90,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_exp_legacy_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_fract_f16_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_exp_legacy_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_fract_f16_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_exp_legacy_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_fract_f16_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_exp_legacy_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_fract_f16_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_exp_legacy_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_fract_f16_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_exp_legacy_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_fract_f16_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_exp_legacy_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_fract_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_exp_legacy_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_fract_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_exp_legacy_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_fract_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_exp_legacy_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_fract_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_exp_legacy_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_fract_f16_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_exp_legacy_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_fract_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_exp_legacy_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_fract_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_exp_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_fract_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_exp_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_fract_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_exp_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_fract_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_exp_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_fract_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_exp_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_fract_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_exp_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_fract_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_exp_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_fract_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x90,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_exp_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_fract_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_exp_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_fract_f16_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_exp_legacy_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_fract_f16_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_exp_legacy_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x96,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_fract_f16_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_log_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_fract_f16_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_log_legacy_f32_sdwa v255, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x98,0xfe,0x7f,0x01,0x06,0x06,0x06]
 
-v_fract_f16_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_log_legacy_f32_sdwa v5, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x98,0x0a,0x7e,0xff,0x06,0x06,0x06]
 
-v_fract_f16_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_log_legacy_f32_sdwa v5, v1 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x26,0x06,0x06]
 
-v_fract_f16_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_log_legacy_f32_sdwa v5, v1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_fract_f16_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_log_legacy_f32_sdwa v5, v1 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x00,0x06,0x06]
 
-v_fract_f16_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_log_legacy_f32_sdwa v5, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x01,0x06,0x06]
 
-v_fract_f16_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_log_legacy_f32_sdwa v5, v1 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x02,0x06,0x06]
 
-v_fract_f16_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_log_legacy_f32_sdwa v5, v1 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x03,0x06,0x06]
 
-v_fract_f16_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_log_legacy_f32_sdwa v5, v1 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x04,0x06,0x06]
 
-v_fract_f16_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_log_legacy_f32_sdwa v5, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x05,0x06,0x06]
 
-v_fract_f16_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_log_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x0e,0x06,0x06]
 
-v_fract_f16_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_log_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_fract_f16_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_log_legacy_f32_sdwa v5, v1 dst_sel:DWORD src0_sel:DWORD
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x16,0x06,0x06]
 
-v_fract_f16_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_log_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_fract_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_log_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x06,0x00,0x06]
 
-v_fract_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_log_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x06,0x01,0x06]
 
-v_fract_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_log_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x06,0x02,0x06]
 
-v_fract_f16_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_log_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x06,0x03,0x06]
 
-v_fract_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_log_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x06,0x04,0x06]
 
-v_fract_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_log_legacy_f32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x06,0x05,0x06]
 
-v_fract_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_log_legacy_f32_sdwa v5, -v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x06,0x16,0x06]
 
-v_fract_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_log_legacy_f32_sdwa v5, |v1| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x98,0x0a,0x7e,0x01,0x06,0x26,0x06]
 
-v_fract_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_log_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_fract_f16_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_log_legacy_f32_dpp v255, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0xfe,0x7f,0x01,0xe4,0x00,0x00]
 
-v_fract_f16_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x90,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_log_legacy_f32_dpp v5, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0xff,0xe4,0x00,0x00]
 
-v_sin_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_log_legacy_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0x1b,0x00,0x00]
 
-v_sin_f16_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x92,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_log_legacy_f32_dpp v5, v1 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0x40,0x01,0x00]
 
-v_sin_f16_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x92,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_log_legacy_f32_dpp v5, v1 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0x41,0x01,0x00]
 
-v_sin_f16_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_log_legacy_f32_dpp v5, v1 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0x42,0x01,0x00]
 
-v_sin_f16_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_log_legacy_f32_dpp v5, v1 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0x43,0x01,0x00]
 
-v_sin_f16_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_log_legacy_f32_dpp v5, v1 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0x30,0x01,0x00]
 
-v_sin_f16_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_log_legacy_f32_dpp v5, v1 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0x34,0x01,0x00]
 
-v_sin_f16_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_log_legacy_f32_dpp v5, v1 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0x38,0x01,0x00]
 
-v_sin_f16_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_log_legacy_f32_dpp v5, v1 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0x3c,0x01,0x00]
 
-v_sin_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_log_legacy_f32_dpp v5, v1 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0x01,0x01,0x00]
 
-v_sin_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_log_legacy_f32_dpp v5, v1 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0x0f,0x01,0x00]
 
-v_sin_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_log_legacy_f32_dpp v5, v1 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0x11,0x01,0x00]
 
-v_sin_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_log_legacy_f32_dpp v5, v1 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0x1f,0x01,0x00]
 
-v_sin_f16_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_log_legacy_f32_dpp v5, v1 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0x21,0x01,0x00]
 
-v_sin_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_log_legacy_f32_dpp v5, v1 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0x2f,0x01,0x00]
 
-v_sin_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_log_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0xe4,0x00,0x10]
 
-v_sin_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_log_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0xe4,0x00,0x30]
 
-v_sin_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_log_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_sin_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_log_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0xe4,0x00,0xf0]
 
-v_sin_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_log_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0xe4,0x00,0x01]
 
-v_sin_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_log_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0xe4,0x00,0x03]
 
-v_sin_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_log_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_sin_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x92,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_log_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0xe4,0x00,0x0f]
 
-v_sin_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_log_legacy_f32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0xe4,0x08,0x00]
 
-v_sin_f16_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_log_legacy_f32_dpp v5, -v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0xe4,0x10,0x00]
 
-v_sin_f16_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_log_legacy_f32_dpp v5, |v1| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x98,0x0a,0x7e,0x01,0xe4,0x20,0x00]
 
-v_sin_f16_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_add_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x06,0x06]
 
-v_sin_f16_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_add_f32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x03,0x01,0x06,0x06,0x06]
 
-v_sin_f16_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_add_f32_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0xff,0x06,0x06,0x06]
 
-v_sin_f16_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_add_f32_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x02,0x01,0x06,0x06,0x06]
 
-v_sin_f16_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_add_f32_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x26,0x06,0x06]
 
-v_sin_f16_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_add_f32_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x06,0x06]
 
-v_sin_f16_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_add_f32_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x00,0x06,0x06]
 
-v_sin_f16_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_add_f32_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x01,0x06,0x06]
 
-v_sin_f16_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_add_f32_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x02,0x06,0x06]
 
-v_sin_f16_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_add_f32_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x03,0x06,0x06]
 
-v_sin_f16_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_add_f32_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x04,0x06,0x06]
 
-v_sin_f16_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_add_f32_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x05,0x06,0x06]
 
-v_sin_f16_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_add_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x0e,0x06,0x06]
 
-v_sin_f16_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_add_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x16,0x06,0x06]
 
-v_sin_f16_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_add_f32_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x16,0x06,0x06]
 
-v_sin_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_add_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x06,0x06]
 
-v_sin_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_add_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x00,0x06]
 
-v_sin_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_add_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x01,0x06]
 
-v_sin_f16_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_add_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x02,0x06]
 
-v_sin_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_add_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x03,0x06]
 
-v_sin_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_add_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x04,0x06]
 
-v_sin_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_add_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x05,0x06]
 
-v_sin_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_add_f32_sdwa v5, -v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x16,0x06]
 
-v_sin_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_add_f32_sdwa v5, |v1|, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x26,0x06]
 
-v_sin_f16_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_add_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x06,0x06]
 
-v_sin_f16_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x92,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_add_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x06,0x00]
 
-v_cos_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_add_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x06,0x01]
 
-v_cos_f16_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x94,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_add_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x06,0x02]
 
-v_cos_f16_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x94,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_add_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x06,0x03]
 
-v_cos_f16_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_add_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x06,0x04]
 
-v_cos_f16_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_add_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x06,0x05]
 
-v_cos_f16_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_add_f32_sdwa v5, v1, -v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x06,0x16]
 
-v_cos_f16_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_add_f32_sdwa v5, v1, |v2| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x02,0x01,0x06,0x06,0x26]
 
-v_cos_f16_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_add_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0x00]
 
-v_cos_f16_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_add_f32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x03,0x01,0xe4,0x00,0x00]
 
-v_cos_f16_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_add_f32_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0xff,0xe4,0x00,0x00]
 
-v_cos_f16_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_add_f32_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x02,0x01,0xe4,0x00,0x00]
 
-v_cos_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_add_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0x1b,0x00,0x00]
 
-v_cos_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_add_f32_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0x40,0x01,0x00]
 
-v_cos_f16_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_add_f32_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0x41,0x01,0x00]
 
-v_cos_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_add_f32_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0x42,0x01,0x00]
 
-v_cos_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_add_f32_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0x43,0x01,0x00]
 
-v_cos_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_add_f32_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0x30,0x01,0x00]
 
-v_cos_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_add_f32_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0x34,0x01,0x00]
 
-v_cos_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_add_f32_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0x38,0x01,0x00]
 
-v_cos_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_add_f32_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0x3c,0x01,0x00]
 
-v_cos_f16_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_add_f32_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0x01,0x01,0x00]
 
-v_cos_f16_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_add_f32_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0x0f,0x01,0x00]
 
-v_cos_f16_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x94,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_add_f32_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0x11,0x01,0x00]
 
-v_cos_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_add_f32_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0x1f,0x01,0x00]
 
-v_cos_f16_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_add_f32_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0x21,0x01,0x00]
 
-v_cos_f16_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_add_f32_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0x2f,0x01,0x00]
 
-v_cos_f16_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_add_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0x10]
 
-v_cos_f16_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_add_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0x30]
 
-v_cos_f16_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_add_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0xf0]
 
-v_cos_f16_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_add_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0xf0]
 
-v_cos_f16_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_add_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0x01]
 
-v_cos_f16_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_add_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0x03]
 
-v_cos_f16_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_add_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0x0f]
 
-v_cos_f16_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_add_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x00,0x0f]
 
-v_cos_f16_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_add_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x08,0x00]
 
-v_cos_f16_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_add_f32_dpp v5, -v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x10,0x00]
 
-v_cos_f16_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_add_f32_dpp v5, |v1|, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x20,0x00]
 
-v_cos_f16_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_add_f32_dpp v5, v1, -v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x40,0x00]
 
-v_cos_f16_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_add_f32_dpp v5, v1, |v2| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x02,0x01,0xe4,0x80,0x00]
 
-v_cos_f16_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x06,0x06]
 
-v_cos_f16_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_sub_f32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x05,0x01,0x06,0x06,0x06]
 
-v_cos_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_sub_f32_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0xff,0x06,0x06,0x06]
 
-v_cos_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_sub_f32_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x04,0x01,0x06,0x06,0x06]
 
-v_cos_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_sub_f32_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x26,0x06,0x06]
 
-v_cos_f16_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_sub_f32_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x06,0x06]
 
-v_cos_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x00,0x06,0x06]
 
-v_cos_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x01,0x06,0x06]
 
-v_cos_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x02,0x06,0x06]
 
-v_cos_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x03,0x06,0x06]
 
-v_cos_f16_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x04,0x06,0x06]
 
-v_cos_f16_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x05,0x06,0x06]
 
-v_cos_f16_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x94,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x0e,0x06,0x06]
 
-v_exp_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x16,0x06,0x06]
 
-v_exp_legacy_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x96,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x16,0x06,0x06]
 
-v_exp_legacy_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x96,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x06,0x06]
 
-v_exp_legacy_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x00,0x06]
 
-v_exp_legacy_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x01,0x06]
 
-v_exp_legacy_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x02,0x06]
 
-v_exp_legacy_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x03,0x06]
 
-v_exp_legacy_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x04,0x06]
 
-v_exp_legacy_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x05,0x06]
 
-v_exp_legacy_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_sub_f32_sdwa v5, -v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x16,0x06]
 
-v_exp_legacy_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_sub_f32_sdwa v5, |v1|, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x26,0x06]
 
-v_exp_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x06,0x06]
 
-v_exp_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x06,0x00]
 
-v_exp_legacy_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x06,0x01]
 
-v_exp_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x06,0x02]
 
-v_exp_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x06,0x03]
 
-v_exp_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x06,0x04]
 
-v_exp_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_sub_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x06,0x05]
 
-v_exp_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_sub_f32_sdwa v5, v1, -v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x06,0x16]
 
-v_exp_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_sub_f32_sdwa v5, v1, |v2| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x04,0x01,0x06,0x06,0x26]
 
-v_exp_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_sub_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0x00]
 
-v_exp_legacy_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_sub_f32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x05,0x01,0xe4,0x00,0x00]
 
-v_exp_legacy_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x96,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_sub_f32_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0xff,0xe4,0x00,0x00]
 
-v_exp_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_sub_f32_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x04,0x01,0xe4,0x00,0x00]
 
-v_exp_legacy_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_sub_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0x1b,0x00,0x00]
 
-v_exp_legacy_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_sub_f32_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0x40,0x01,0x00]
 
-v_exp_legacy_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_sub_f32_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0x41,0x01,0x00]
 
-v_exp_legacy_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_sub_f32_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0x42,0x01,0x00]
 
-v_exp_legacy_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_sub_f32_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0x43,0x01,0x00]
 
-v_exp_legacy_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_sub_f32_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0x30,0x01,0x00]
 
-v_exp_legacy_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_sub_f32_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0x34,0x01,0x00]
 
-v_exp_legacy_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_sub_f32_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0x38,0x01,0x00]
 
-v_exp_legacy_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_sub_f32_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0x3c,0x01,0x00]
 
-v_exp_legacy_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_sub_f32_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0x01,0x01,0x00]
 
-v_exp_legacy_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_sub_f32_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0x0f,0x01,0x00]
 
-v_exp_legacy_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_sub_f32_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0x11,0x01,0x00]
 
-v_exp_legacy_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_sub_f32_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0x1f,0x01,0x00]
 
-v_exp_legacy_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_sub_f32_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0x21,0x01,0x00]
 
-v_exp_legacy_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_sub_f32_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0x2f,0x01,0x00]
 
-v_exp_legacy_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_sub_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0x10]
 
-v_exp_legacy_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_sub_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0x30]
 
-v_exp_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_sub_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0xf0]
 
-v_exp_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_sub_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0xf0]
 
-v_exp_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_sub_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0x01]
 
-v_exp_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_sub_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0x03]
 
-v_exp_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_sub_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0x0f]
 
-v_exp_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_sub_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x00,0x0f]
 
-v_exp_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_sub_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x08,0x00]
 
-v_exp_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_sub_f32_dpp v5, -v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x10,0x00]
 
-v_exp_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_sub_f32_dpp v5, |v1|, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x20,0x00]
 
-v_exp_legacy_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_sub_f32_dpp v5, v1, -v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x40,0x00]
 
-v_exp_legacy_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x96,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_sub_f32_dpp v5, v1, |v2| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x04,0x01,0xe4,0x80,0x00]
 
-v_log_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x06,0x06]
 
-v_log_legacy_f32_sdwa v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x98,0xfe,0x7f,0x00,0x06,0x06,0x06]
+v_subrev_f32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x07,0x01,0x06,0x06,0x06]
 
-v_log_legacy_f32_sdwa v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x98,0x00,0x7e,0xff,0x06,0x06,0x06]
+v_subrev_f32_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0xff,0x06,0x06,0x06]
 
-v_log_legacy_f32_sdwa v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x26,0x06,0x06]
+v_subrev_f32_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x06,0x01,0x06,0x06,0x06]
 
-v_log_legacy_f32_sdwa v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_subrev_f32_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x26,0x06,0x06]
 
-v_log_legacy_f32_sdwa v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x00,0x06,0x06]
+v_subrev_f32_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x06,0x06]
 
-v_log_legacy_f32_sdwa v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x01,0x06,0x06]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x00,0x06,0x06]
 
-v_log_legacy_f32_sdwa v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x02,0x06,0x06]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x01,0x06,0x06]
 
-v_log_legacy_f32_sdwa v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x03,0x06,0x06]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x02,0x06,0x06]
 
-v_log_legacy_f32_sdwa v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x04,0x06,0x06]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x03,0x06,0x06]
 
-v_log_legacy_f32_sdwa v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x05,0x06,0x06]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x04,0x06,0x06]
 
-v_log_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x0e,0x06,0x06]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x05,0x06,0x06]
 
-v_log_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x0e,0x06,0x06]
 
-v_log_legacy_f32_sdwa v0, v0 dst_sel:DWORD src0_sel:DWORD
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x16,0x06,0x06]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x16,0x06,0x06]
 
-v_log_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x06,0x06,0x06]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x16,0x06,0x06]
 
-v_log_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x06,0x00,0x06]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x06,0x06]
 
-v_log_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x06,0x01,0x06]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x00,0x06]
 
-v_log_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x06,0x02,0x06]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x01,0x06]
 
-v_log_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x06,0x03,0x06]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x02,0x06]
 
-v_log_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x06,0x04,0x06]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x03,0x06]
 
-v_log_legacy_f32_sdwa v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x06,0x05,0x06]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x04,0x06]
 
-v_log_legacy_f32_sdwa v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x06,0x16,0x06]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x05,0x06]
 
-v_log_legacy_f32_sdwa v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x98,0x00,0x7e,0x00,0x06,0x26,0x06]
+v_subrev_f32_sdwa v5, -v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x16,0x06]
 
-v_log_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0xe4,0x00,0x00]
+v_subrev_f32_sdwa v5, |v1|, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x26,0x06]
 
-v_log_legacy_f32_dpp v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0xfe,0x7f,0x00,0xe4,0x00,0x00]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x06,0x06]
 
-v_log_legacy_f32_dpp v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0xff,0xe4,0x00,0x00]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x06,0x00]
 
-v_log_legacy_f32_dpp v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0x1b,0x00,0x00]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x06,0x01]
 
-v_log_legacy_f32_dpp v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0x40,0x01,0x00]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x06,0x02]
 
-v_log_legacy_f32_dpp v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0x41,0x01,0x00]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x06,0x03]
 
-v_log_legacy_f32_dpp v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0x42,0x01,0x00]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x06,0x04]
 
-v_log_legacy_f32_dpp v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0x43,0x01,0x00]
+v_subrev_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x06,0x05]
 
-v_log_legacy_f32_dpp v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0x30,0x01,0x00]
+v_subrev_f32_sdwa v5, v1, -v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x06,0x16]
 
-v_log_legacy_f32_dpp v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0x34,0x01,0x00]
+v_subrev_f32_sdwa v5, v1, |v2| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x06,0x01,0x06,0x06,0x26]
 
-v_log_legacy_f32_dpp v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0x38,0x01,0x00]
+v_subrev_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x00,0x00]
 
-v_log_legacy_f32_dpp v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0x3c,0x01,0x00]
+v_subrev_f32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x07,0x01,0xe4,0x00,0x00]
 
-v_log_legacy_f32_dpp v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0x01,0x01,0x00]
+v_subrev_f32_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0xff,0xe4,0x00,0x00]
 
-v_log_legacy_f32_dpp v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0x0f,0x01,0x00]
+v_subrev_f32_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x06,0x01,0xe4,0x00,0x00]
 
-v_log_legacy_f32_dpp v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0x11,0x01,0x00]
+v_subrev_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0x1b,0x00,0x00]
 
-v_log_legacy_f32_dpp v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0x1f,0x01,0x00]
+v_subrev_f32_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0x40,0x01,0x00]
 
-v_log_legacy_f32_dpp v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0x21,0x01,0x00]
+v_subrev_f32_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0x41,0x01,0x00]
 
-v_log_legacy_f32_dpp v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0x2f,0x01,0x00]
+v_subrev_f32_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0x42,0x01,0x00]
 
-v_log_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0xe4,0x00,0x10]
+v_subrev_f32_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0x43,0x01,0x00]
 
-v_log_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0xe4,0x00,0x30]
+v_subrev_f32_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0x30,0x01,0x00]
 
-v_log_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_subrev_f32_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0x34,0x01,0x00]
 
-v_log_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0xe4,0x00,0xf0]
+v_subrev_f32_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0x38,0x01,0x00]
 
-v_log_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0xe4,0x00,0x01]
+v_subrev_f32_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0x3c,0x01,0x00]
 
-v_log_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0xe4,0x00,0x03]
+v_subrev_f32_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0x01,0x01,0x00]
 
-v_log_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_subrev_f32_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0x0f,0x01,0x00]
 
-v_log_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0xe4,0x00,0x0f]
+v_subrev_f32_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0x11,0x01,0x00]
 
-v_log_legacy_f32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0xe4,0x08,0x00]
+v_subrev_f32_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0x1f,0x01,0x00]
 
-v_log_legacy_f32_dpp v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0xe4,0x10,0x00]
+v_subrev_f32_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0x21,0x01,0x00]
 
-v_log_legacy_f32_dpp v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x98,0x00,0x7e,0x00,0xe4,0x20,0x00]
+v_subrev_f32_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0x2f,0x01,0x00]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x06,0x06]
+v_subrev_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x00,0x10]
 
-v_add_f32_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x03,0x00,0x06,0x06,0x06]
+v_subrev_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x00,0x30]
 
-v_add_f32_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0xff,0x06,0x06,0x06]
+v_subrev_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x00,0xf0]
 
-v_add_f32_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x02,0x00,0x06,0x06,0x06]
+v_subrev_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x00,0xf0]
 
-v_add_f32_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x26,0x06,0x06]
+v_subrev_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x00,0x01]
 
-v_add_f32_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x06,0x06]
+v_subrev_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x00,0x03]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x00,0x06,0x06]
+v_subrev_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x00,0x0f]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x01,0x06,0x06]
+v_subrev_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x00,0x0f]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x02,0x06,0x06]
+v_subrev_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x08,0x00]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x03,0x06,0x06]
+v_subrev_f32_dpp v5, -v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x10,0x00]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x04,0x06,0x06]
+v_subrev_f32_dpp v5, |v1|, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x20,0x00]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x05,0x06,0x06]
+v_subrev_f32_dpp v5, v1, -v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x40,0x00]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x0e,0x06,0x06]
+v_subrev_f32_dpp v5, v1, |v2| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x06,0x01,0xe4,0x80,0x00]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x16,0x06,0x06]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x06,0x06]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x16,0x06,0x06]
+v_mul_legacy_f32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x09,0x01,0x06,0x06,0x06]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x06,0x06]
+v_mul_legacy_f32_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0xff,0x06,0x06,0x06]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x00,0x06]
+v_mul_legacy_f32_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x08,0x01,0x06,0x06,0x06]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x01,0x06]
+v_mul_legacy_f32_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x26,0x06,0x06]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x02,0x06]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x06,0x06]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x03,0x06]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x00,0x06,0x06]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x04,0x06]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x01,0x06,0x06]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x05,0x06]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x02,0x06,0x06]
 
-v_add_f32_sdwa v0, -v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x16,0x06]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x03,0x06,0x06]
 
-v_add_f32_sdwa v0, |v0|, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x26,0x06]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x04,0x06,0x06]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x06,0x06]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x05,0x06,0x06]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x06,0x00]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x0e,0x06,0x06]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x06,0x01]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x16,0x06,0x06]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x06,0x02]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x16,0x06,0x06]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x06,0x03]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x06,0x06]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x06,0x04]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x00,0x06]
 
-v_add_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x06,0x05]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x01,0x06]
 
-v_add_f32_sdwa v0, v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x06,0x16]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x02,0x06]
 
-v_add_f32_sdwa v0, v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x02,0x00,0x06,0x06,0x26]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x03,0x06]
 
-v_add_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0xe4,0x00,0x00]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x04,0x06]
 
-v_add_f32_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x03,0x00,0xe4,0x00,0x00]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x05,0x06]
 
-v_add_f32_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0xff,0xe4,0x00,0x00]
+v_mul_legacy_f32_sdwa v5, -v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x16,0x06]
 
-v_add_f32_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x02,0x00,0xe4,0x00,0x00]
+v_mul_legacy_f32_sdwa v5, |v1|, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x26,0x06]
 
-v_add_f32_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0x1b,0x00,0x00]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x06,0x06]
 
-v_add_f32_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0x40,0x01,0x00]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x06,0x00]
 
-v_add_f32_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0x41,0x01,0x00]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x06,0x01]
 
-v_add_f32_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0x42,0x01,0x00]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x06,0x02]
 
-v_add_f32_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0x43,0x01,0x00]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x06,0x03]
 
-v_add_f32_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0x30,0x01,0x00]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x06,0x04]
 
-v_add_f32_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0x34,0x01,0x00]
+v_mul_legacy_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x06,0x05]
 
-v_add_f32_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0x38,0x01,0x00]
+v_mul_legacy_f32_sdwa v5, v1, -v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x06,0x16]
 
-v_add_f32_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0x3c,0x01,0x00]
+v_mul_legacy_f32_sdwa v5, v1, |v2| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x08,0x01,0x06,0x06,0x26]
 
-v_add_f32_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0x01,0x01,0x00]
+v_mul_legacy_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x00,0x00]
 
-v_add_f32_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0x0f,0x01,0x00]
+v_mul_legacy_f32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x09,0x01,0xe4,0x00,0x00]
 
-v_add_f32_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0x11,0x01,0x00]
+v_mul_legacy_f32_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0xff,0xe4,0x00,0x00]
 
-v_add_f32_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0x1f,0x01,0x00]
+v_mul_legacy_f32_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x08,0x01,0xe4,0x00,0x00]
 
-v_add_f32_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0x21,0x01,0x00]
+v_mul_legacy_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0x1b,0x00,0x00]
 
-v_add_f32_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0x2f,0x01,0x00]
+v_mul_legacy_f32_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0x40,0x01,0x00]
 
-v_add_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0xe4,0x00,0x10]
+v_mul_legacy_f32_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0x41,0x01,0x00]
 
-v_add_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0xe4,0x00,0x30]
+v_mul_legacy_f32_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0x42,0x01,0x00]
 
-v_add_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0xe4,0x00,0xf0]
+v_mul_legacy_f32_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0x43,0x01,0x00]
 
-v_add_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0xe4,0x00,0xf0]
+v_mul_legacy_f32_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0x30,0x01,0x00]
 
-v_add_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0xe4,0x00,0x01]
+v_mul_legacy_f32_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0x34,0x01,0x00]
 
-v_add_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0xe4,0x00,0x03]
+v_mul_legacy_f32_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0x38,0x01,0x00]
 
-v_add_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0xe4,0x00,0x0f]
+v_mul_legacy_f32_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0x3c,0x01,0x00]
 
-v_add_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0xe4,0x00,0x0f]
+v_mul_legacy_f32_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0x01,0x01,0x00]
 
-v_add_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0xe4,0x08,0x00]
+v_mul_legacy_f32_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0x0f,0x01,0x00]
 
-v_add_f32_dpp v0, -v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0xe4,0x10,0x00]
+v_mul_legacy_f32_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0x11,0x01,0x00]
 
-v_add_f32_dpp v0, |v0|, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0xe4,0x20,0x00]
+v_mul_legacy_f32_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0x1f,0x01,0x00]
 
-v_add_f32_dpp v0, v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0xe4,0x40,0x00]
+v_mul_legacy_f32_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0x21,0x01,0x00]
 
-v_add_f32_dpp v0, v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x02,0x00,0xe4,0x80,0x00]
+v_mul_legacy_f32_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0x2f,0x01,0x00]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x06,0x06]
+v_mul_legacy_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x00,0x10]
 
-v_sub_f32_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x05,0x00,0x06,0x06,0x06]
+v_mul_legacy_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x00,0x30]
 
-v_sub_f32_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0xff,0x06,0x06,0x06]
+v_mul_legacy_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x00,0xf0]
 
-v_sub_f32_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x04,0x00,0x06,0x06,0x06]
+v_mul_legacy_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x00,0xf0]
 
-v_sub_f32_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x26,0x06,0x06]
+v_mul_legacy_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x00,0x01]
 
-v_sub_f32_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x06,0x06]
+v_mul_legacy_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x00,0x03]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x00,0x06,0x06]
+v_mul_legacy_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x00,0x0f]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x01,0x06,0x06]
+v_mul_legacy_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x00,0x0f]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x02,0x06,0x06]
+v_mul_legacy_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x08,0x00]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x03,0x06,0x06]
+v_mul_legacy_f32_dpp v5, -v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x10,0x00]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x04,0x06,0x06]
+v_mul_legacy_f32_dpp v5, |v1|, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x20,0x00]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x05,0x06,0x06]
+v_mul_legacy_f32_dpp v5, v1, -v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x40,0x00]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x0e,0x06,0x06]
+v_mul_legacy_f32_dpp v5, v1, |v2| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x08,0x01,0xe4,0x80,0x00]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x16,0x06,0x06]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x06,0x06]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x16,0x06,0x06]
+v_mul_f32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x0b,0x01,0x06,0x06,0x06]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x06,0x06]
+v_mul_f32_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0xff,0x06,0x06,0x06]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x00,0x06]
+v_mul_f32_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x0a,0x01,0x06,0x06,0x06]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x01,0x06]
+v_mul_f32_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x26,0x06,0x06]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x02,0x06]
+v_mul_f32_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x06,0x06]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x03,0x06]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x00,0x06,0x06]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x04,0x06]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x01,0x06,0x06]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x05,0x06]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x02,0x06,0x06]
 
-v_sub_f32_sdwa v0, -v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x16,0x06]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x03,0x06,0x06]
 
-v_sub_f32_sdwa v0, |v0|, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x26,0x06]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x04,0x06,0x06]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x06,0x06]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x05,0x06,0x06]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x06,0x00]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x0e,0x06,0x06]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x06,0x01]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x16,0x06,0x06]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x06,0x02]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x16,0x06,0x06]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x06,0x03]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x06,0x06]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x06,0x04]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x00,0x06]
 
-v_sub_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x06,0x05]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x01,0x06]
 
-v_sub_f32_sdwa v0, v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x06,0x16]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x02,0x06]
 
-v_sub_f32_sdwa v0, v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x04,0x00,0x06,0x06,0x26]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x03,0x06]
 
-v_sub_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0xe4,0x00,0x00]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x04,0x06]
 
-v_sub_f32_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x05,0x00,0xe4,0x00,0x00]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x05,0x06]
 
-v_sub_f32_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0xff,0xe4,0x00,0x00]
+v_mul_f32_sdwa v5, -v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x16,0x06]
 
-v_sub_f32_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x04,0x00,0xe4,0x00,0x00]
+v_mul_f32_sdwa v5, |v1|, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x26,0x06]
 
-v_sub_f32_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0x1b,0x00,0x00]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x06,0x06]
 
-v_sub_f32_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0x40,0x01,0x00]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x06,0x00]
 
-v_sub_f32_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0x41,0x01,0x00]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x06,0x01]
 
-v_sub_f32_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0x42,0x01,0x00]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x06,0x02]
 
-v_sub_f32_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0x43,0x01,0x00]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x06,0x03]
 
-v_sub_f32_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0x30,0x01,0x00]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x06,0x04]
 
-v_sub_f32_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0x34,0x01,0x00]
+v_mul_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x06,0x05]
 
-v_sub_f32_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0x38,0x01,0x00]
+v_mul_f32_sdwa v5, v1, -v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x06,0x16]
 
-v_sub_f32_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0x3c,0x01,0x00]
+v_mul_f32_sdwa v5, v1, |v2| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0a,0x01,0x06,0x06,0x26]
 
-v_sub_f32_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0x01,0x01,0x00]
+v_mul_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x00,0x00]
 
-v_sub_f32_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0x0f,0x01,0x00]
+v_mul_f32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x0b,0x01,0xe4,0x00,0x00]
 
-v_sub_f32_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0x11,0x01,0x00]
+v_mul_f32_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0xff,0xe4,0x00,0x00]
 
-v_sub_f32_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0x1f,0x01,0x00]
+v_mul_f32_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x0a,0x01,0xe4,0x00,0x00]
 
-v_sub_f32_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0x21,0x01,0x00]
+v_mul_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0x1b,0x00,0x00]
 
-v_sub_f32_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0x2f,0x01,0x00]
+v_mul_f32_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0x40,0x01,0x00]
 
-v_sub_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0xe4,0x00,0x10]
+v_mul_f32_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0x41,0x01,0x00]
 
-v_sub_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0xe4,0x00,0x30]
+v_mul_f32_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0x42,0x01,0x00]
 
-v_sub_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0xe4,0x00,0xf0]
+v_mul_f32_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0x43,0x01,0x00]
 
-v_sub_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0xe4,0x00,0xf0]
+v_mul_f32_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0x30,0x01,0x00]
 
-v_sub_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0xe4,0x00,0x01]
+v_mul_f32_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0x34,0x01,0x00]
 
-v_sub_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0xe4,0x00,0x03]
+v_mul_f32_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0x38,0x01,0x00]
 
-v_sub_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0xe4,0x00,0x0f]
+v_mul_f32_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0x3c,0x01,0x00]
 
-v_sub_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0xe4,0x00,0x0f]
+v_mul_f32_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0x01,0x01,0x00]
 
-v_sub_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0xe4,0x08,0x00]
+v_mul_f32_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0x0f,0x01,0x00]
 
-v_sub_f32_dpp v0, -v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0xe4,0x10,0x00]
+v_mul_f32_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0x11,0x01,0x00]
 
-v_sub_f32_dpp v0, |v0|, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0xe4,0x20,0x00]
+v_mul_f32_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0x1f,0x01,0x00]
 
-v_sub_f32_dpp v0, v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0xe4,0x40,0x00]
+v_mul_f32_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0x21,0x01,0x00]
 
-v_sub_f32_dpp v0, v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x04,0x00,0xe4,0x80,0x00]
+v_mul_f32_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0x2f,0x01,0x00]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x06,0x06]
+v_mul_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x00,0x10]
 
-v_subrev_f32_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x07,0x00,0x06,0x06,0x06]
+v_mul_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x00,0x30]
 
-v_subrev_f32_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0xff,0x06,0x06,0x06]
+v_mul_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x00,0xf0]
 
-v_subrev_f32_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x06,0x00,0x06,0x06,0x06]
+v_mul_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x00,0xf0]
 
-v_subrev_f32_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x26,0x06,0x06]
+v_mul_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x00,0x01]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x06,0x06]
+v_mul_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x00,0x03]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x00,0x06,0x06]
+v_mul_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x00,0x0f]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x01,0x06,0x06]
+v_mul_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x00,0x0f]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x02,0x06,0x06]
+v_mul_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x08,0x00]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x03,0x06,0x06]
+v_mul_f32_dpp v5, -v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x10,0x00]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x04,0x06,0x06]
+v_mul_f32_dpp v5, |v1|, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x20,0x00]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x05,0x06,0x06]
+v_mul_f32_dpp v5, v1, -v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x40,0x00]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x0e,0x06,0x06]
+v_mul_f32_dpp v5, v1, |v2| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0a,0x01,0xe4,0x80,0x00]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x16,0x06,0x06]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x06,0x06,0x06]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x16,0x06,0x06]
+v_mul_i32_i24_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x0d,0x01,0x06,0x06,0x06]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x06,0x06]
+v_mul_i32_i24_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0xff,0x06,0x06,0x06]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x00,0x06]
+v_mul_i32_i24_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x0c,0x01,0x06,0x06,0x06]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x01,0x06]
+v_mul_i32_i24_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x26,0x06,0x06]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x02,0x06]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x06,0x06,0x06]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x03,0x06]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x00,0x06,0x06]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x04,0x06]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x01,0x06,0x06]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x05,0x06]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x02,0x06,0x06]
 
-v_subrev_f32_sdwa v0, -v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x16,0x06]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x03,0x06,0x06]
 
-v_subrev_f32_sdwa v0, |v0|, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x26,0x06]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x04,0x06,0x06]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x06,0x06]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x05,0x06,0x06]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x06,0x00]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x0e,0x06,0x06]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x06,0x01]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x16,0x06,0x06]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x06,0x02]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x16,0x06,0x06]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x06,0x03]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x06,0x06,0x06]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x06,0x04]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x06,0x00,0x06]
 
-v_subrev_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x06,0x05]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x06,0x01,0x06]
 
-v_subrev_f32_sdwa v0, v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x06,0x16]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x06,0x02,0x06]
 
-v_subrev_f32_sdwa v0, v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x06,0x00,0x06,0x06,0x26]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x06,0x03,0x06]
 
-v_subrev_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0xe4,0x00,0x00]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x06,0x04,0x06]
 
-v_subrev_f32_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x07,0x00,0xe4,0x00,0x00]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x06,0x05,0x06]
 
-v_subrev_f32_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0xff,0xe4,0x00,0x00]
+v_mul_i32_i24_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x06,0x0e,0x06]
 
-v_subrev_f32_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x06,0x00,0xe4,0x00,0x00]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x06,0x06,0x06]
 
-v_subrev_f32_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0x1b,0x00,0x00]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x06,0x06,0x00]
 
-v_subrev_f32_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0x40,0x01,0x00]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x06,0x06,0x01]
 
-v_subrev_f32_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0x41,0x01,0x00]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x06,0x06,0x02]
 
-v_subrev_f32_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0x42,0x01,0x00]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x06,0x06,0x03]
 
-v_subrev_f32_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0x43,0x01,0x00]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x06,0x06,0x04]
 
-v_subrev_f32_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0x30,0x01,0x00]
+v_mul_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x06,0x06,0x05]
 
-v_subrev_f32_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0x34,0x01,0x00]
+v_mul_i32_i24_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0c,0x01,0x06,0x06,0x0e]
 
-v_subrev_f32_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0x38,0x01,0x00]
+v_mul_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0xe4,0x00,0x00]
 
-v_subrev_f32_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0x3c,0x01,0x00]
+v_mul_i32_i24_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x0d,0x01,0xe4,0x00,0x00]
 
-v_subrev_f32_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0x01,0x01,0x00]
+v_mul_i32_i24_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0xff,0xe4,0x00,0x00]
 
-v_subrev_f32_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0x0f,0x01,0x00]
+v_mul_i32_i24_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x0c,0x01,0xe4,0x00,0x00]
 
-v_subrev_f32_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0x11,0x01,0x00]
+v_mul_i32_i24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0x1b,0x00,0x00]
 
-v_subrev_f32_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0x1f,0x01,0x00]
+v_mul_i32_i24_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0x40,0x01,0x00]
 
-v_subrev_f32_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0x21,0x01,0x00]
+v_mul_i32_i24_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0x41,0x01,0x00]
 
-v_subrev_f32_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0x2f,0x01,0x00]
+v_mul_i32_i24_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0x42,0x01,0x00]
 
-v_subrev_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0xe4,0x00,0x10]
+v_mul_i32_i24_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0x43,0x01,0x00]
 
-v_subrev_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0xe4,0x00,0x30]
+v_mul_i32_i24_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0x30,0x01,0x00]
 
-v_subrev_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0xe4,0x00,0xf0]
+v_mul_i32_i24_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0x34,0x01,0x00]
 
-v_subrev_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0xe4,0x00,0xf0]
+v_mul_i32_i24_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0x38,0x01,0x00]
 
-v_subrev_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0xe4,0x00,0x01]
+v_mul_i32_i24_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0x3c,0x01,0x00]
 
-v_subrev_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0xe4,0x00,0x03]
+v_mul_i32_i24_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0x01,0x01,0x00]
 
-v_subrev_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0xe4,0x00,0x0f]
+v_mul_i32_i24_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0x0f,0x01,0x00]
 
-v_subrev_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0xe4,0x00,0x0f]
+v_mul_i32_i24_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0x11,0x01,0x00]
 
-v_subrev_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0xe4,0x08,0x00]
+v_mul_i32_i24_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0x1f,0x01,0x00]
 
-v_subrev_f32_dpp v0, -v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0xe4,0x10,0x00]
+v_mul_i32_i24_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0x21,0x01,0x00]
 
-v_subrev_f32_dpp v0, |v0|, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0xe4,0x20,0x00]
+v_mul_i32_i24_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0x2f,0x01,0x00]
 
-v_subrev_f32_dpp v0, v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0xe4,0x40,0x00]
+v_mul_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0xe4,0x00,0x10]
 
-v_subrev_f32_dpp v0, v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x06,0x00,0xe4,0x80,0x00]
+v_mul_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0xe4,0x00,0x30]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x06,0x06]
+v_mul_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0xe4,0x00,0xf0]
 
-v_mul_legacy_f32_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x09,0x00,0x06,0x06,0x06]
+v_mul_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0xe4,0x00,0xf0]
 
-v_mul_legacy_f32_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0xff,0x06,0x06,0x06]
+v_mul_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0xe4,0x00,0x01]
 
-v_mul_legacy_f32_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x08,0x00,0x06,0x06,0x06]
+v_mul_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0xe4,0x00,0x03]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x26,0x06,0x06]
+v_mul_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0xe4,0x00,0x0f]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x06,0x06]
+v_mul_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0xe4,0x00,0x0f]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x00,0x06,0x06]
+v_mul_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x0c,0x01,0xe4,0x08,0x00]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x01,0x06,0x06]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x06,0x06,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x02,0x06,0x06]
+v_mul_hi_i32_i24_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x0f,0x01,0x06,0x06,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x03,0x06,0x06]
+v_mul_hi_i32_i24_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0xff,0x06,0x06,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x04,0x06,0x06]
+v_mul_hi_i32_i24_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x0e,0x01,0x06,0x06,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x05,0x06,0x06]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x26,0x06,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x0e,0x06,0x06]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x06,0x06,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x16,0x06,0x06]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x00,0x06,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x16,0x06,0x06]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x01,0x06,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x06,0x06]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x02,0x06,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x00,0x06]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x03,0x06,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x01,0x06]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x04,0x06,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x02,0x06]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x05,0x06,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x03,0x06]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x0e,0x06,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x04,0x06]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x16,0x06,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x05,0x06]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x16,0x06,0x06]
 
-v_mul_legacy_f32_sdwa v0, -v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x16,0x06]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x06,0x06,0x06]
 
-v_mul_legacy_f32_sdwa v0, |v0|, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x26,0x06]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x06,0x00,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x06,0x06]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x06,0x01,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x06,0x00]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x06,0x02,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x06,0x01]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x06,0x03,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x06,0x02]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x06,0x04,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x06,0x03]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x06,0x05,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x06,0x04]
+v_mul_hi_i32_i24_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x06,0x0e,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x06,0x05]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x06,0x06,0x06]
 
-v_mul_legacy_f32_sdwa v0, v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x06,0x16]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x06,0x06,0x00]
 
-v_mul_legacy_f32_sdwa v0, v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x08,0x00,0x06,0x06,0x26]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x06,0x06,0x01]
 
-v_mul_legacy_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0xe4,0x00,0x00]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x06,0x06,0x02]
 
-v_mul_legacy_f32_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x09,0x00,0xe4,0x00,0x00]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x06,0x06,0x03]
 
-v_mul_legacy_f32_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0xff,0xe4,0x00,0x00]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x06,0x06,0x04]
 
-v_mul_legacy_f32_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x08,0x00,0xe4,0x00,0x00]
+v_mul_hi_i32_i24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x06,0x06,0x05]
 
-v_mul_legacy_f32_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0x1b,0x00,0x00]
+v_mul_hi_i32_i24_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x0e,0x01,0x06,0x06,0x0e]
 
-v_mul_legacy_f32_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0x40,0x01,0x00]
+v_mul_hi_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x00,0x00]
 
-v_mul_legacy_f32_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0x41,0x01,0x00]
+v_mul_hi_i32_i24_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x0f,0x01,0xe4,0x00,0x00]
 
-v_mul_legacy_f32_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0x42,0x01,0x00]
+v_mul_hi_i32_i24_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0xff,0xe4,0x00,0x00]
 
-v_mul_legacy_f32_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0x43,0x01,0x00]
+v_mul_hi_i32_i24_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x0e,0x01,0xe4,0x00,0x00]
 
-v_mul_legacy_f32_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0x30,0x01,0x00]
+v_mul_hi_i32_i24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0x1b,0x00,0x00]
 
-v_mul_legacy_f32_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0x34,0x01,0x00]
+v_mul_hi_i32_i24_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0x40,0x01,0x00]
 
-v_mul_legacy_f32_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0x38,0x01,0x00]
+v_mul_hi_i32_i24_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0x41,0x01,0x00]
 
-v_mul_legacy_f32_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0x3c,0x01,0x00]
+v_mul_hi_i32_i24_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0x42,0x01,0x00]
 
-v_mul_legacy_f32_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0x01,0x01,0x00]
+v_mul_hi_i32_i24_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0x43,0x01,0x00]
 
-v_mul_legacy_f32_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0x0f,0x01,0x00]
+v_mul_hi_i32_i24_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0x30,0x01,0x00]
 
-v_mul_legacy_f32_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0x11,0x01,0x00]
+v_mul_hi_i32_i24_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0x34,0x01,0x00]
 
-v_mul_legacy_f32_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0x1f,0x01,0x00]
+v_mul_hi_i32_i24_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0x38,0x01,0x00]
 
-v_mul_legacy_f32_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0x21,0x01,0x00]
+v_mul_hi_i32_i24_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0x3c,0x01,0x00]
 
-v_mul_legacy_f32_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0x2f,0x01,0x00]
+v_mul_hi_i32_i24_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0x01,0x01,0x00]
 
-v_mul_legacy_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0xe4,0x00,0x10]
+v_mul_hi_i32_i24_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0x0f,0x01,0x00]
 
-v_mul_legacy_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0xe4,0x00,0x30]
+v_mul_hi_i32_i24_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0x11,0x01,0x00]
 
-v_mul_legacy_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0xe4,0x00,0xf0]
+v_mul_hi_i32_i24_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0x1f,0x01,0x00]
 
-v_mul_legacy_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0xe4,0x00,0xf0]
+v_mul_hi_i32_i24_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0x21,0x01,0x00]
 
-v_mul_legacy_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0xe4,0x00,0x01]
+v_mul_hi_i32_i24_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0x2f,0x01,0x00]
 
-v_mul_legacy_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0xe4,0x00,0x03]
+v_mul_hi_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x00,0x10]
 
-v_mul_legacy_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0xe4,0x00,0x0f]
+v_mul_hi_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x00,0x30]
 
-v_mul_legacy_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0xe4,0x00,0x0f]
+v_mul_hi_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x00,0xf0]
 
-v_mul_legacy_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0xe4,0x08,0x00]
+v_mul_hi_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x00,0xf0]
 
-v_mul_legacy_f32_dpp v0, -v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0xe4,0x10,0x00]
+v_mul_hi_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x00,0x01]
 
-v_mul_legacy_f32_dpp v0, |v0|, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0xe4,0x20,0x00]
+v_mul_hi_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x00,0x03]
 
-v_mul_legacy_f32_dpp v0, v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0xe4,0x40,0x00]
+v_mul_hi_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x00,0x0f]
 
-v_mul_legacy_f32_dpp v0, v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x08,0x00,0xe4,0x80,0x00]
+v_mul_hi_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x00,0x0f]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x06,0x06]
+v_mul_hi_i32_i24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x0e,0x01,0xe4,0x08,0x00]
 
-v_mul_f32_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x0b,0x00,0x06,0x06,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x06,0x06,0x06]
 
-v_mul_f32_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0xff,0x06,0x06,0x06]
+v_mul_u32_u24_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x11,0x01,0x06,0x06,0x06]
 
-v_mul_f32_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x0a,0x00,0x06,0x06,0x06]
+v_mul_u32_u24_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0xff,0x06,0x06,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x26,0x06,0x06]
+v_mul_u32_u24_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x10,0x01,0x06,0x06,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x06,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x26,0x06,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x00,0x06,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x06,0x06,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x01,0x06,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x00,0x06,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x02,0x06,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x01,0x06,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x03,0x06,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x02,0x06,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x04,0x06,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x03,0x06,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x05,0x06,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x04,0x06,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x0e,0x06,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x05,0x06,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x16,0x06,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x0e,0x06,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x16,0x06,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x16,0x06,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x06,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x16,0x06,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x00,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x06,0x06,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x01,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x06,0x00,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x02,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x06,0x01,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x03,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x06,0x02,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x04,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x06,0x03,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x05,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x06,0x04,0x06]
 
-v_mul_f32_sdwa v0, -v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x16,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x06,0x05,0x06]
 
-v_mul_f32_sdwa v0, |v0|, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x26,0x06]
+v_mul_u32_u24_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x06,0x0e,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x06,0x06]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x06,0x06,0x06]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x06,0x00]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x06,0x06,0x00]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x06,0x01]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x06,0x06,0x01]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x06,0x02]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x06,0x06,0x02]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x06,0x03]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x06,0x06,0x03]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x06,0x04]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x06,0x06,0x04]
 
-v_mul_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x06,0x05]
+v_mul_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x06,0x06,0x05]
 
-v_mul_f32_sdwa v0, v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x06,0x16]
+v_mul_u32_u24_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x10,0x01,0x06,0x06,0x0e]
 
-v_mul_f32_sdwa v0, v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0a,0x00,0x06,0x06,0x26]
+v_mul_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0xe4,0x00,0x00]
 
-v_mul_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0xe4,0x00,0x00]
+v_mul_u32_u24_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x11,0x01,0xe4,0x00,0x00]
 
-v_mul_f32_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x0b,0x00,0xe4,0x00,0x00]
+v_mul_u32_u24_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0xff,0xe4,0x00,0x00]
 
-v_mul_f32_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0xff,0xe4,0x00,0x00]
+v_mul_u32_u24_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x10,0x01,0xe4,0x00,0x00]
 
-v_mul_f32_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x0a,0x00,0xe4,0x00,0x00]
+v_mul_u32_u24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0x1b,0x00,0x00]
 
-v_mul_f32_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0x1b,0x00,0x00]
+v_mul_u32_u24_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0x40,0x01,0x00]
 
-v_mul_f32_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0x40,0x01,0x00]
+v_mul_u32_u24_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0x41,0x01,0x00]
 
-v_mul_f32_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0x41,0x01,0x00]
+v_mul_u32_u24_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0x42,0x01,0x00]
 
-v_mul_f32_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0x42,0x01,0x00]
+v_mul_u32_u24_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0x43,0x01,0x00]
 
-v_mul_f32_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0x43,0x01,0x00]
+v_mul_u32_u24_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0x30,0x01,0x00]
 
-v_mul_f32_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0x30,0x01,0x00]
+v_mul_u32_u24_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0x34,0x01,0x00]
 
-v_mul_f32_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0x34,0x01,0x00]
+v_mul_u32_u24_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0x38,0x01,0x00]
 
-v_mul_f32_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0x38,0x01,0x00]
+v_mul_u32_u24_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0x3c,0x01,0x00]
 
-v_mul_f32_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0x3c,0x01,0x00]
+v_mul_u32_u24_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0x01,0x01,0x00]
 
-v_mul_f32_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0x01,0x01,0x00]
+v_mul_u32_u24_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0x0f,0x01,0x00]
 
-v_mul_f32_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0x0f,0x01,0x00]
+v_mul_u32_u24_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0x11,0x01,0x00]
 
-v_mul_f32_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0x11,0x01,0x00]
+v_mul_u32_u24_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0x1f,0x01,0x00]
 
-v_mul_f32_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0x1f,0x01,0x00]
+v_mul_u32_u24_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0x21,0x01,0x00]
 
-v_mul_f32_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0x21,0x01,0x00]
+v_mul_u32_u24_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0x2f,0x01,0x00]
 
-v_mul_f32_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0x2f,0x01,0x00]
+v_mul_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0xe4,0x00,0x10]
 
-v_mul_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0xe4,0x00,0x10]
+v_mul_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0xe4,0x00,0x30]
 
-v_mul_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0xe4,0x00,0x30]
+v_mul_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0xe4,0x00,0xf0]
 
-v_mul_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0xe4,0x00,0xf0]
+v_mul_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0xe4,0x00,0xf0]
 
-v_mul_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0xe4,0x00,0xf0]
+v_mul_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0xe4,0x00,0x01]
 
-v_mul_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0xe4,0x00,0x01]
+v_mul_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0xe4,0x00,0x03]
 
-v_mul_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0xe4,0x00,0x03]
+v_mul_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0xe4,0x00,0x0f]
 
-v_mul_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0xe4,0x00,0x0f]
+v_mul_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0xe4,0x00,0x0f]
 
-v_mul_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0xe4,0x00,0x0f]
+v_mul_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x10,0x01,0xe4,0x08,0x00]
 
-v_mul_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0xe4,0x08,0x00]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x06,0x06,0x06]
 
-v_mul_f32_dpp v0, -v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0xe4,0x10,0x00]
+v_mul_hi_u32_u24_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x13,0x01,0x06,0x06,0x06]
 
-v_mul_f32_dpp v0, |v0|, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0xe4,0x20,0x00]
+v_mul_hi_u32_u24_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0xff,0x06,0x06,0x06]
 
-v_mul_f32_dpp v0, v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0xe4,0x40,0x00]
+v_mul_hi_u32_u24_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x12,0x01,0x06,0x06,0x06]
 
-v_mul_f32_dpp v0, v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0a,0x00,0xe4,0x80,0x00]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x26,0x06,0x06]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x06,0x06,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x06,0x06,0x06]
 
-v_mul_i32_i24_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x0d,0x00,0x06,0x06,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x00,0x06,0x06]
 
-v_mul_i32_i24_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0xff,0x06,0x06,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x01,0x06,0x06]
 
-v_mul_i32_i24_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x0c,0x00,0x06,0x06,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x02,0x06,0x06]
 
-v_mul_i32_i24_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x26,0x06,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x03,0x06,0x06]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x06,0x06,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x04,0x06,0x06]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x00,0x06,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x05,0x06,0x06]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x01,0x06,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x0e,0x06,0x06]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x02,0x06,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x16,0x06,0x06]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x03,0x06,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x16,0x06,0x06]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x04,0x06,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x06,0x06,0x06]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x05,0x06,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x06,0x00,0x06]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x0e,0x06,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x06,0x01,0x06]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x16,0x06,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x06,0x02,0x06]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x16,0x06,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x06,0x03,0x06]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x06,0x06,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x06,0x04,0x06]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x06,0x00,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x06,0x05,0x06]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x06,0x01,0x06]
+v_mul_hi_u32_u24_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x06,0x0e,0x06]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x06,0x02,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x06,0x06,0x06]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x06,0x03,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x06,0x06,0x00]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x06,0x04,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x06,0x06,0x01]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x06,0x05,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x06,0x06,0x02]
 
-v_mul_i32_i24_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x06,0x0e,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x06,0x06,0x03]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x06,0x06,0x06]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x06,0x06,0x04]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x06,0x06,0x00]
+v_mul_hi_u32_u24_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x06,0x06,0x05]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x06,0x06,0x01]
+v_mul_hi_u32_u24_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x12,0x01,0x06,0x06,0x0e]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x06,0x06,0x02]
+v_mul_hi_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0xe4,0x00,0x00]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x06,0x06,0x03]
+v_mul_hi_u32_u24_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x13,0x01,0xe4,0x00,0x00]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x06,0x06,0x04]
+v_mul_hi_u32_u24_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0xff,0xe4,0x00,0x00]
 
-v_mul_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x06,0x06,0x05]
+v_mul_hi_u32_u24_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x12,0x01,0xe4,0x00,0x00]
 
-v_mul_i32_i24_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0c,0x00,0x06,0x06,0x0e]
+v_mul_hi_u32_u24_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0x1b,0x00,0x00]
 
-v_mul_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0xe4,0x00,0x00]
+v_mul_hi_u32_u24_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0x40,0x01,0x00]
 
-v_mul_i32_i24_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x0d,0x00,0xe4,0x00,0x00]
+v_mul_hi_u32_u24_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0x41,0x01,0x00]
 
-v_mul_i32_i24_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0xff,0xe4,0x00,0x00]
+v_mul_hi_u32_u24_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0x42,0x01,0x00]
 
-v_mul_i32_i24_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x0c,0x00,0xe4,0x00,0x00]
+v_mul_hi_u32_u24_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0x43,0x01,0x00]
 
-v_mul_i32_i24_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0x1b,0x00,0x00]
+v_mul_hi_u32_u24_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0x30,0x01,0x00]
 
-v_mul_i32_i24_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0x40,0x01,0x00]
+v_mul_hi_u32_u24_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0x34,0x01,0x00]
 
-v_mul_i32_i24_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0x41,0x01,0x00]
+v_mul_hi_u32_u24_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0x38,0x01,0x00]
 
-v_mul_i32_i24_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0x42,0x01,0x00]
+v_mul_hi_u32_u24_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0x3c,0x01,0x00]
 
-v_mul_i32_i24_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0x43,0x01,0x00]
+v_mul_hi_u32_u24_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0x01,0x01,0x00]
 
-v_mul_i32_i24_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0x30,0x01,0x00]
+v_mul_hi_u32_u24_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0x0f,0x01,0x00]
 
-v_mul_i32_i24_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0x34,0x01,0x00]
+v_mul_hi_u32_u24_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0x11,0x01,0x00]
 
-v_mul_i32_i24_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0x38,0x01,0x00]
+v_mul_hi_u32_u24_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0x1f,0x01,0x00]
 
-v_mul_i32_i24_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0x3c,0x01,0x00]
+v_mul_hi_u32_u24_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0x21,0x01,0x00]
 
-v_mul_i32_i24_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0x01,0x01,0x00]
+v_mul_hi_u32_u24_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0x2f,0x01,0x00]
 
-v_mul_i32_i24_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0x0f,0x01,0x00]
+v_mul_hi_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0xe4,0x00,0x10]
 
-v_mul_i32_i24_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0x11,0x01,0x00]
+v_mul_hi_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0xe4,0x00,0x30]
 
-v_mul_i32_i24_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0x1f,0x01,0x00]
+v_mul_hi_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0xe4,0x00,0xf0]
 
-v_mul_i32_i24_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0x21,0x01,0x00]
+v_mul_hi_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0xe4,0x00,0xf0]
 
-v_mul_i32_i24_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0x2f,0x01,0x00]
+v_mul_hi_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0xe4,0x00,0x01]
 
-v_mul_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0xe4,0x00,0x10]
+v_mul_hi_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0xe4,0x00,0x03]
 
-v_mul_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0xe4,0x00,0x30]
+v_mul_hi_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0xe4,0x00,0x0f]
 
-v_mul_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0xe4,0x00,0xf0]
+v_mul_hi_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0xe4,0x00,0x0f]
 
-v_mul_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0xe4,0x00,0xf0]
+v_mul_hi_u32_u24_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x12,0x01,0xe4,0x08,0x00]
 
-v_mul_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0xe4,0x00,0x01]
+v_min_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x06,0x06]
 
-v_mul_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0xe4,0x00,0x03]
+v_min_f32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x15,0x01,0x06,0x06,0x06]
 
-v_mul_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0xe4,0x00,0x0f]
+v_min_f32_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0xff,0x06,0x06,0x06]
 
-v_mul_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0xe4,0x00,0x0f]
+v_min_f32_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x14,0x01,0x06,0x06,0x06]
 
-v_mul_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x0c,0x00,0xe4,0x08,0x00]
+v_min_f32_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x26,0x06,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x06,0x06,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x06,0x06]
 
-v_mul_hi_i32_i24_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x0f,0x00,0x06,0x06,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x00,0x06,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0xff,0x06,0x06,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x01,0x06,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x0e,0x00,0x06,0x06,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x02,0x06,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x26,0x06,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x03,0x06,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x06,0x06,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x04,0x06,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x00,0x06,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x05,0x06,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x01,0x06,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x0e,0x06,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x02,0x06,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x16,0x06,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x03,0x06,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x16,0x06,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x04,0x06,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x06,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x05,0x06,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x00,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x0e,0x06,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x01,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x16,0x06,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x02,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x16,0x06,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x03,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x06,0x06,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x04,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x06,0x00,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x05,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x06,0x01,0x06]
+v_min_f32_sdwa v5, -v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x16,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x06,0x02,0x06]
+v_min_f32_sdwa v5, |v1|, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x26,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x06,0x03,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x06,0x06]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x06,0x04,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x06,0x00]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x06,0x05,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x06,0x01]
 
-v_mul_hi_i32_i24_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x06,0x0e,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x06,0x02]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x06,0x06,0x06]
+v_min_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x06,0x03]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x06,0x06,0x00]
+v_min_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x06,0x04]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x06,0x06,0x01]
+v_min_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x06,0x05]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x06,0x06,0x02]
+v_min_f32_sdwa v5, v1, -v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x06,0x16]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x06,0x06,0x03]
+v_min_f32_sdwa v5, v1, |v2| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x14,0x01,0x06,0x06,0x26]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x06,0x06,0x04]
+v_min_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x00,0x00]
 
-v_mul_hi_i32_i24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x06,0x06,0x05]
+v_min_f32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x15,0x01,0xe4,0x00,0x00]
 
-v_mul_hi_i32_i24_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x0e,0x00,0x06,0x06,0x0e]
+v_min_f32_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0xff,0xe4,0x00,0x00]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0xe4,0x00,0x00]
+v_min_f32_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x14,0x01,0xe4,0x00,0x00]
 
-v_mul_hi_i32_i24_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x0f,0x00,0xe4,0x00,0x00]
+v_min_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0x1b,0x00,0x00]
 
-v_mul_hi_i32_i24_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0xff,0xe4,0x00,0x00]
+v_min_f32_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0x40,0x01,0x00]
 
-v_mul_hi_i32_i24_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x0e,0x00,0xe4,0x00,0x00]
+v_min_f32_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0x41,0x01,0x00]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0x1b,0x00,0x00]
+v_min_f32_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0x42,0x01,0x00]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0x40,0x01,0x00]
+v_min_f32_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0x43,0x01,0x00]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0x41,0x01,0x00]
+v_min_f32_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0x30,0x01,0x00]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0x42,0x01,0x00]
+v_min_f32_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0x34,0x01,0x00]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0x43,0x01,0x00]
+v_min_f32_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0x38,0x01,0x00]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0x30,0x01,0x00]
+v_min_f32_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0x3c,0x01,0x00]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0x34,0x01,0x00]
+v_min_f32_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0x01,0x01,0x00]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0x38,0x01,0x00]
+v_min_f32_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0x0f,0x01,0x00]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0x3c,0x01,0x00]
+v_min_f32_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0x11,0x01,0x00]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0x01,0x01,0x00]
+v_min_f32_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0x1f,0x01,0x00]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0x0f,0x01,0x00]
+v_min_f32_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0x21,0x01,0x00]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0x11,0x01,0x00]
+v_min_f32_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0x2f,0x01,0x00]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0x1f,0x01,0x00]
+v_min_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x00,0x10]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0x21,0x01,0x00]
+v_min_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x00,0x30]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0x2f,0x01,0x00]
+v_min_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x00,0xf0]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0xe4,0x00,0x10]
+v_min_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x00,0xf0]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0xe4,0x00,0x30]
+v_min_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x00,0x01]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0xe4,0x00,0xf0]
+v_min_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x00,0x03]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0xe4,0x00,0xf0]
+v_min_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x00,0x0f]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0xe4,0x00,0x01]
+v_min_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x00,0x0f]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0xe4,0x00,0x03]
+v_min_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x08,0x00]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0xe4,0x00,0x0f]
+v_min_f32_dpp v5, -v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x10,0x00]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0xe4,0x00,0x0f]
+v_min_f32_dpp v5, |v1|, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x20,0x00]
 
-v_mul_hi_i32_i24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x0e,0x00,0xe4,0x08,0x00]
+v_min_f32_dpp v5, v1, -v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x40,0x00]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x06,0x06,0x06]
+v_min_f32_dpp v5, v1, |v2| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x14,0x01,0xe4,0x80,0x00]
 
-v_mul_u32_u24_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x11,0x00,0x06,0x06,0x06]
+v_max_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x06,0x06]
 
-v_mul_u32_u24_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0xff,0x06,0x06,0x06]
+v_max_f32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x17,0x01,0x06,0x06,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x10,0x00,0x06,0x06,0x06]
+v_max_f32_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0xff,0x06,0x06,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x26,0x06,0x06]
+v_max_f32_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x16,0x01,0x06,0x06,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x06,0x06,0x06]
+v_max_f32_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x26,0x06,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x00,0x06,0x06]
+v_max_f32_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x06,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x01,0x06,0x06]
+v_max_f32_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x00,0x06,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x02,0x06,0x06]
+v_max_f32_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x01,0x06,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x03,0x06,0x06]
+v_max_f32_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x02,0x06,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x04,0x06,0x06]
+v_max_f32_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x03,0x06,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x05,0x06,0x06]
+v_max_f32_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x04,0x06,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x0e,0x06,0x06]
+v_max_f32_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x05,0x06,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x16,0x06,0x06]
+v_max_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x0e,0x06,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x16,0x06,0x06]
+v_max_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x16,0x06,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x06,0x06,0x06]
+v_max_f32_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x16,0x06,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x06,0x00,0x06]
+v_max_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x06,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x06,0x01,0x06]
+v_max_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x00,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x06,0x02,0x06]
+v_max_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x01,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x06,0x03,0x06]
+v_max_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x02,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x06,0x04,0x06]
+v_max_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x03,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x06,0x05,0x06]
+v_max_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x04,0x06]
 
-v_mul_u32_u24_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x06,0x0e,0x06]
+v_max_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x05,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x06,0x06,0x06]
+v_max_f32_sdwa v5, -v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x16,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x06,0x06,0x00]
+v_max_f32_sdwa v5, |v1|, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x26,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x06,0x06,0x01]
+v_max_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x06,0x06]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x06,0x06,0x02]
+v_max_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x06,0x00]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x06,0x06,0x03]
+v_max_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x06,0x01]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x06,0x06,0x04]
+v_max_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x06,0x02]
 
-v_mul_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x06,0x06,0x05]
+v_max_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x06,0x03]
 
-v_mul_u32_u24_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x10,0x00,0x06,0x06,0x0e]
+v_max_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x06,0x04]
 
-v_mul_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0xe4,0x00,0x00]
+v_max_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x06,0x05]
 
-v_mul_u32_u24_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x11,0x00,0xe4,0x00,0x00]
+v_max_f32_sdwa v5, v1, -v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x06,0x16]
 
-v_mul_u32_u24_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0xff,0xe4,0x00,0x00]
+v_max_f32_sdwa v5, v1, |v2| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x16,0x01,0x06,0x06,0x26]
 
-v_mul_u32_u24_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x10,0x00,0xe4,0x00,0x00]
+v_max_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x00,0x00]
 
-v_mul_u32_u24_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0x1b,0x00,0x00]
+v_max_f32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x17,0x01,0xe4,0x00,0x00]
 
-v_mul_u32_u24_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0x40,0x01,0x00]
+v_max_f32_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0xff,0xe4,0x00,0x00]
 
-v_mul_u32_u24_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0x41,0x01,0x00]
+v_max_f32_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x16,0x01,0xe4,0x00,0x00]
 
-v_mul_u32_u24_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0x42,0x01,0x00]
+v_max_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0x1b,0x00,0x00]
 
-v_mul_u32_u24_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0x43,0x01,0x00]
+v_max_f32_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0x40,0x01,0x00]
 
-v_mul_u32_u24_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0x30,0x01,0x00]
+v_max_f32_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0x41,0x01,0x00]
 
-v_mul_u32_u24_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0x34,0x01,0x00]
+v_max_f32_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0x42,0x01,0x00]
 
-v_mul_u32_u24_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0x38,0x01,0x00]
+v_max_f32_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0x43,0x01,0x00]
 
-v_mul_u32_u24_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0x3c,0x01,0x00]
+v_max_f32_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0x30,0x01,0x00]
 
-v_mul_u32_u24_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0x01,0x01,0x00]
+v_max_f32_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0x34,0x01,0x00]
 
-v_mul_u32_u24_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0x0f,0x01,0x00]
+v_max_f32_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0x38,0x01,0x00]
 
-v_mul_u32_u24_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0x11,0x01,0x00]
+v_max_f32_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0x3c,0x01,0x00]
 
-v_mul_u32_u24_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0x1f,0x01,0x00]
+v_max_f32_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0x01,0x01,0x00]
 
-v_mul_u32_u24_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0x21,0x01,0x00]
+v_max_f32_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0x0f,0x01,0x00]
 
-v_mul_u32_u24_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0x2f,0x01,0x00]
+v_max_f32_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0x11,0x01,0x00]
 
-v_mul_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0xe4,0x00,0x10]
+v_max_f32_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0x1f,0x01,0x00]
 
-v_mul_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0xe4,0x00,0x30]
+v_max_f32_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0x21,0x01,0x00]
 
-v_mul_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0xe4,0x00,0xf0]
+v_max_f32_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0x2f,0x01,0x00]
 
-v_mul_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0xe4,0x00,0xf0]
+v_max_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x00,0x10]
 
-v_mul_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0xe4,0x00,0x01]
+v_max_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x00,0x30]
 
-v_mul_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0xe4,0x00,0x03]
+v_max_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x00,0xf0]
 
-v_mul_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0xe4,0x00,0x0f]
+v_max_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x00,0xf0]
 
-v_mul_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0xe4,0x00,0x0f]
+v_max_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x00,0x01]
 
-v_mul_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x10,0x00,0xe4,0x08,0x00]
+v_max_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x00,0x03]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x06,0x06,0x06]
+v_max_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x00,0x0f]
 
-v_mul_hi_u32_u24_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x13,0x00,0x06,0x06,0x06]
+v_max_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x00,0x0f]
 
-v_mul_hi_u32_u24_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0xff,0x06,0x06,0x06]
+v_max_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x08,0x00]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x12,0x00,0x06,0x06,0x06]
+v_max_f32_dpp v5, -v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x10,0x00]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x26,0x06,0x06]
+v_max_f32_dpp v5, |v1|, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x20,0x00]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x06,0x06,0x06]
+v_max_f32_dpp v5, v1, -v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x40,0x00]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x00,0x06,0x06]
+v_max_f32_dpp v5, v1, |v2| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x16,0x01,0xe4,0x80,0x00]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x01,0x06,0x06]
+v_min_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x06,0x06,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x02,0x06,0x06]
+v_min_i32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x19,0x01,0x06,0x06,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x03,0x06,0x06]
+v_min_i32_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0xff,0x06,0x06,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x04,0x06,0x06]
+v_min_i32_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x18,0x01,0x06,0x06,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x05,0x06,0x06]
+v_min_i32_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x26,0x06,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x0e,0x06,0x06]
+v_min_i32_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x06,0x06,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x16,0x06,0x06]
+v_min_i32_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x00,0x06,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x16,0x06,0x06]
+v_min_i32_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x01,0x06,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x06,0x06,0x06]
+v_min_i32_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x02,0x06,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x06,0x00,0x06]
+v_min_i32_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x03,0x06,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x06,0x01,0x06]
+v_min_i32_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x04,0x06,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x06,0x02,0x06]
+v_min_i32_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x05,0x06,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x06,0x03,0x06]
+v_min_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x0e,0x06,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x06,0x04,0x06]
+v_min_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x16,0x06,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x06,0x05,0x06]
+v_min_i32_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x16,0x06,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x06,0x0e,0x06]
+v_min_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x06,0x06,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x06,0x06,0x06]
+v_min_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x06,0x00,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x06,0x06,0x00]
+v_min_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x06,0x01,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x06,0x06,0x01]
+v_min_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x06,0x02,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x06,0x06,0x02]
+v_min_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x06,0x03,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x06,0x06,0x03]
+v_min_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x06,0x04,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x06,0x06,0x04]
+v_min_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x06,0x05,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x06,0x06,0x05]
+v_min_i32_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x06,0x0e,0x06]
 
-v_mul_hi_u32_u24_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x12,0x00,0x06,0x06,0x0e]
+v_min_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x06,0x06,0x06]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0xe4,0x00,0x00]
+v_min_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x06,0x06,0x00]
 
-v_mul_hi_u32_u24_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x13,0x00,0xe4,0x00,0x00]
+v_min_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x06,0x06,0x01]
 
-v_mul_hi_u32_u24_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0xff,0xe4,0x00,0x00]
+v_min_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x06,0x06,0x02]
 
-v_mul_hi_u32_u24_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x12,0x00,0xe4,0x00,0x00]
+v_min_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x06,0x06,0x03]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0x1b,0x00,0x00]
+v_min_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x06,0x06,0x04]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0x40,0x01,0x00]
+v_min_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x06,0x06,0x05]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0x41,0x01,0x00]
+v_min_i32_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x18,0x01,0x06,0x06,0x0e]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0x42,0x01,0x00]
+v_min_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0xe4,0x00,0x00]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0x43,0x01,0x00]
+v_min_i32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x19,0x01,0xe4,0x00,0x00]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0x30,0x01,0x00]
+v_min_i32_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0xff,0xe4,0x00,0x00]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0x34,0x01,0x00]
+v_min_i32_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x18,0x01,0xe4,0x00,0x00]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0x38,0x01,0x00]
+v_min_i32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0x1b,0x00,0x00]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0x3c,0x01,0x00]
+v_min_i32_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0x40,0x01,0x00]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0x01,0x01,0x00]
+v_min_i32_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0x41,0x01,0x00]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0x0f,0x01,0x00]
+v_min_i32_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0x42,0x01,0x00]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0x11,0x01,0x00]
+v_min_i32_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0x43,0x01,0x00]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0x1f,0x01,0x00]
+v_min_i32_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0x30,0x01,0x00]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0x21,0x01,0x00]
+v_min_i32_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0x34,0x01,0x00]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0x2f,0x01,0x00]
+v_min_i32_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0x38,0x01,0x00]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0xe4,0x00,0x10]
+v_min_i32_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0x3c,0x01,0x00]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0xe4,0x00,0x30]
+v_min_i32_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0x01,0x01,0x00]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0xe4,0x00,0xf0]
+v_min_i32_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0x0f,0x01,0x00]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0xe4,0x00,0xf0]
+v_min_i32_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0x11,0x01,0x00]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0xe4,0x00,0x01]
+v_min_i32_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0x1f,0x01,0x00]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0xe4,0x00,0x03]
+v_min_i32_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0x21,0x01,0x00]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0xe4,0x00,0x0f]
+v_min_i32_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0x2f,0x01,0x00]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0xe4,0x00,0x0f]
+v_min_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0xe4,0x00,0x10]
 
-v_mul_hi_u32_u24_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x12,0x00,0xe4,0x08,0x00]
+v_min_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0xe4,0x00,0x30]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x06,0x06]
+v_min_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0xe4,0x00,0xf0]
 
-v_min_f32_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x15,0x00,0x06,0x06,0x06]
+v_min_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0xe4,0x00,0xf0]
 
-v_min_f32_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0xff,0x06,0x06,0x06]
+v_min_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0xe4,0x00,0x01]
 
-v_min_f32_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x14,0x00,0x06,0x06,0x06]
+v_min_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0xe4,0x00,0x03]
 
-v_min_f32_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x26,0x06,0x06]
+v_min_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0xe4,0x00,0x0f]
 
-v_min_f32_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x06,0x06]
+v_min_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0xe4,0x00,0x0f]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x00,0x06,0x06]
+v_min_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x18,0x01,0xe4,0x08,0x00]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x01,0x06,0x06]
+v_max_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x06,0x06,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x02,0x06,0x06]
+v_max_i32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x1b,0x01,0x06,0x06,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x03,0x06,0x06]
+v_max_i32_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0xff,0x06,0x06,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x04,0x06,0x06]
+v_max_i32_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x1a,0x01,0x06,0x06,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x05,0x06,0x06]
+v_max_i32_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x26,0x06,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x0e,0x06,0x06]
+v_max_i32_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x06,0x06,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x16,0x06,0x06]
+v_max_i32_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x00,0x06,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x16,0x06,0x06]
+v_max_i32_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x01,0x06,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x06,0x06]
+v_max_i32_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x02,0x06,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x00,0x06]
+v_max_i32_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x03,0x06,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x01,0x06]
+v_max_i32_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x04,0x06,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x02,0x06]
+v_max_i32_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x05,0x06,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x03,0x06]
+v_max_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x0e,0x06,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x04,0x06]
+v_max_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x16,0x06,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x05,0x06]
+v_max_i32_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x16,0x06,0x06]
 
-v_min_f32_sdwa v0, -v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x16,0x06]
+v_max_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x06,0x06,0x06]
 
-v_min_f32_sdwa v0, |v0|, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x26,0x06]
+v_max_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x06,0x00,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x06,0x06]
+v_max_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x06,0x01,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x06,0x00]
+v_max_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x06,0x02,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x06,0x01]
+v_max_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x06,0x03,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x06,0x02]
+v_max_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x06,0x04,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x06,0x03]
+v_max_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x06,0x05,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x06,0x04]
+v_max_i32_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x06,0x0e,0x06]
 
-v_min_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x06,0x05]
+v_max_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x06,0x06,0x06]
 
-v_min_f32_sdwa v0, v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x06,0x16]
+v_max_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x06,0x06,0x00]
 
-v_min_f32_sdwa v0, v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x14,0x00,0x06,0x06,0x26]
+v_max_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x06,0x06,0x01]
 
-v_min_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0xe4,0x00,0x00]
+v_max_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x06,0x06,0x02]
 
-v_min_f32_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x15,0x00,0xe4,0x00,0x00]
+v_max_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x06,0x06,0x03]
 
-v_min_f32_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0xff,0xe4,0x00,0x00]
+v_max_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x06,0x06,0x04]
 
-v_min_f32_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x14,0x00,0xe4,0x00,0x00]
+v_max_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x06,0x06,0x05]
 
-v_min_f32_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0x1b,0x00,0x00]
+v_max_i32_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1a,0x01,0x06,0x06,0x0e]
 
-v_min_f32_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0x40,0x01,0x00]
+v_max_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0xe4,0x00,0x00]
 
-v_min_f32_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0x41,0x01,0x00]
+v_max_i32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x1b,0x01,0xe4,0x00,0x00]
 
-v_min_f32_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0x42,0x01,0x00]
+v_max_i32_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0xff,0xe4,0x00,0x00]
 
-v_min_f32_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0x43,0x01,0x00]
+v_max_i32_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x1a,0x01,0xe4,0x00,0x00]
 
-v_min_f32_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0x30,0x01,0x00]
+v_max_i32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0x1b,0x00,0x00]
 
-v_min_f32_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0x34,0x01,0x00]
+v_max_i32_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0x40,0x01,0x00]
 
-v_min_f32_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0x38,0x01,0x00]
+v_max_i32_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0x41,0x01,0x00]
 
-v_min_f32_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0x3c,0x01,0x00]
+v_max_i32_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0x42,0x01,0x00]
 
-v_min_f32_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0x01,0x01,0x00]
+v_max_i32_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0x43,0x01,0x00]
 
-v_min_f32_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0x0f,0x01,0x00]
+v_max_i32_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0x30,0x01,0x00]
 
-v_min_f32_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0x11,0x01,0x00]
+v_max_i32_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0x34,0x01,0x00]
 
-v_min_f32_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0x1f,0x01,0x00]
+v_max_i32_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0x38,0x01,0x00]
 
-v_min_f32_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0x21,0x01,0x00]
+v_max_i32_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0x3c,0x01,0x00]
 
-v_min_f32_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0x2f,0x01,0x00]
+v_max_i32_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0x01,0x01,0x00]
 
-v_min_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0xe4,0x00,0x10]
+v_max_i32_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0x0f,0x01,0x00]
 
-v_min_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0xe4,0x00,0x30]
+v_max_i32_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0x11,0x01,0x00]
 
-v_min_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0xe4,0x00,0xf0]
+v_max_i32_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0x1f,0x01,0x00]
 
-v_min_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0xe4,0x00,0xf0]
+v_max_i32_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0x21,0x01,0x00]
 
-v_min_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0xe4,0x00,0x01]
+v_max_i32_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0x2f,0x01,0x00]
 
-v_min_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0xe4,0x00,0x03]
+v_max_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0xe4,0x00,0x10]
 
-v_min_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0xe4,0x00,0x0f]
+v_max_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0xe4,0x00,0x30]
 
-v_min_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0xe4,0x00,0x0f]
+v_max_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0xe4,0x00,0xf0]
 
-v_min_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0xe4,0x08,0x00]
+v_max_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0xe4,0x00,0xf0]
 
-v_min_f32_dpp v0, -v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0xe4,0x10,0x00]
+v_max_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0xe4,0x00,0x01]
 
-v_min_f32_dpp v0, |v0|, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0xe4,0x20,0x00]
+v_max_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0xe4,0x00,0x03]
 
-v_min_f32_dpp v0, v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0xe4,0x40,0x00]
+v_max_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0xe4,0x00,0x0f]
 
-v_min_f32_dpp v0, v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x14,0x00,0xe4,0x80,0x00]
+v_max_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0xe4,0x00,0x0f]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x06,0x06]
+v_max_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x1a,0x01,0xe4,0x08,0x00]
 
-v_max_f32_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x17,0x00,0x06,0x06,0x06]
+v_min_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x06,0x06,0x06]
 
-v_max_f32_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0xff,0x06,0x06,0x06]
+v_min_u32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x1d,0x01,0x06,0x06,0x06]
 
-v_max_f32_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x16,0x00,0x06,0x06,0x06]
+v_min_u32_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0xff,0x06,0x06,0x06]
 
-v_max_f32_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x26,0x06,0x06]
+v_min_u32_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x1c,0x01,0x06,0x06,0x06]
 
-v_max_f32_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x06,0x06]
+v_min_u32_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x26,0x06,0x06]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x00,0x06,0x06]
+v_min_u32_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x06,0x06,0x06]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x01,0x06,0x06]
+v_min_u32_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x00,0x06,0x06]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x02,0x06,0x06]
+v_min_u32_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x01,0x06,0x06]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x03,0x06,0x06]
+v_min_u32_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x02,0x06,0x06]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x04,0x06,0x06]
+v_min_u32_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x03,0x06,0x06]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x05,0x06,0x06]
+v_min_u32_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x04,0x06,0x06]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x0e,0x06,0x06]
+v_min_u32_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x05,0x06,0x06]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x16,0x06,0x06]
+v_min_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x0e,0x06,0x06]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x16,0x06,0x06]
+v_min_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x16,0x06,0x06]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x06,0x06]
+v_min_u32_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x16,0x06,0x06]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x00,0x06]
+v_min_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x06,0x06,0x06]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x01,0x06]
+v_min_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x06,0x00,0x06]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x02,0x06]
+v_min_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x06,0x01,0x06]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x03,0x06]
+v_min_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x06,0x02,0x06]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x04,0x06]
+v_min_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x06,0x03,0x06]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x05,0x06]
+v_min_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x06,0x04,0x06]
 
-v_max_f32_sdwa v0, -v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x16,0x06]
+v_min_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x06,0x05,0x06]
 
-v_max_f32_sdwa v0, |v0|, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x26,0x06]
+v_min_u32_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x06,0x0e,0x06]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x06,0x06]
+v_min_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x06,0x06,0x06]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x06,0x00]
+v_min_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x06,0x06,0x00]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x06,0x01]
+v_min_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x06,0x06,0x01]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x06,0x02]
+v_min_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x06,0x06,0x02]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x06,0x03]
+v_min_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x06,0x06,0x03]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x06,0x04]
+v_min_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x06,0x06,0x04]
 
-v_max_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x06,0x05]
+v_min_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x06,0x06,0x05]
 
-v_max_f32_sdwa v0, v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x06,0x16]
+v_min_u32_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1c,0x01,0x06,0x06,0x0e]
 
-v_max_f32_sdwa v0, v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x16,0x00,0x06,0x06,0x26]
+v_min_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0xe4,0x00,0x00]
 
-v_max_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0xe4,0x00,0x00]
+v_min_u32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x1d,0x01,0xe4,0x00,0x00]
 
-v_max_f32_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x17,0x00,0xe4,0x00,0x00]
+v_min_u32_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0xff,0xe4,0x00,0x00]
 
-v_max_f32_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0xff,0xe4,0x00,0x00]
+v_min_u32_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x1c,0x01,0xe4,0x00,0x00]
 
-v_max_f32_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x16,0x00,0xe4,0x00,0x00]
+v_min_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0x1b,0x00,0x00]
 
-v_max_f32_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0x1b,0x00,0x00]
+v_min_u32_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0x40,0x01,0x00]
 
-v_max_f32_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0x40,0x01,0x00]
+v_min_u32_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0x41,0x01,0x00]
 
-v_max_f32_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0x41,0x01,0x00]
+v_min_u32_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0x42,0x01,0x00]
 
-v_max_f32_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0x42,0x01,0x00]
+v_min_u32_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0x43,0x01,0x00]
 
-v_max_f32_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0x43,0x01,0x00]
+v_min_u32_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0x30,0x01,0x00]
 
-v_max_f32_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0x30,0x01,0x00]
+v_min_u32_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0x34,0x01,0x00]
 
-v_max_f32_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0x34,0x01,0x00]
+v_min_u32_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0x38,0x01,0x00]
 
-v_max_f32_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0x38,0x01,0x00]
+v_min_u32_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0x3c,0x01,0x00]
 
-v_max_f32_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0x3c,0x01,0x00]
+v_min_u32_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0x01,0x01,0x00]
 
-v_max_f32_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0x01,0x01,0x00]
+v_min_u32_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0x0f,0x01,0x00]
 
-v_max_f32_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0x0f,0x01,0x00]
+v_min_u32_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0x11,0x01,0x00]
 
-v_max_f32_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0x11,0x01,0x00]
+v_min_u32_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0x1f,0x01,0x00]
 
-v_max_f32_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0x1f,0x01,0x00]
+v_min_u32_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0x21,0x01,0x00]
 
-v_max_f32_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0x21,0x01,0x00]
+v_min_u32_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0x2f,0x01,0x00]
 
-v_max_f32_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0x2f,0x01,0x00]
+v_min_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0xe4,0x00,0x10]
 
-v_max_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0xe4,0x00,0x10]
+v_min_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0xe4,0x00,0x30]
 
-v_max_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0xe4,0x00,0x30]
+v_min_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0xe4,0x00,0xf0]
 
-v_max_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0xe4,0x00,0xf0]
+v_min_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0xe4,0x00,0xf0]
 
-v_max_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0xe4,0x00,0xf0]
+v_min_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0xe4,0x00,0x01]
 
-v_max_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0xe4,0x00,0x01]
+v_min_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0xe4,0x00,0x03]
 
-v_max_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0xe4,0x00,0x03]
+v_min_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0xe4,0x00,0x0f]
 
-v_max_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0xe4,0x00,0x0f]
+v_min_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0xe4,0x00,0x0f]
 
-v_max_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0xe4,0x00,0x0f]
+v_min_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x1c,0x01,0xe4,0x08,0x00]
 
-v_max_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0xe4,0x08,0x00]
+v_max_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x06,0x06,0x06]
 
-v_max_f32_dpp v0, -v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0xe4,0x10,0x00]
+v_max_u32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x1f,0x01,0x06,0x06,0x06]
 
-v_max_f32_dpp v0, |v0|, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0xe4,0x20,0x00]
+v_max_u32_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0xff,0x06,0x06,0x06]
 
-v_max_f32_dpp v0, v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0xe4,0x40,0x00]
+v_max_u32_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x1e,0x01,0x06,0x06,0x06]
 
-v_max_f32_dpp v0, v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x16,0x00,0xe4,0x80,0x00]
+v_max_u32_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x26,0x06,0x06]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x06,0x06,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x06,0x06,0x06]
 
-v_min_i32_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x19,0x00,0x06,0x06,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x00,0x06,0x06]
 
-v_min_i32_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0xff,0x06,0x06,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x01,0x06,0x06]
 
-v_min_i32_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x18,0x00,0x06,0x06,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x02,0x06,0x06]
 
-v_min_i32_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x26,0x06,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x03,0x06,0x06]
 
-v_min_i32_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x06,0x06,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x04,0x06,0x06]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x00,0x06,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x05,0x06,0x06]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x01,0x06,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x0e,0x06,0x06]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x02,0x06,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x16,0x06,0x06]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x03,0x06,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x16,0x06,0x06]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x04,0x06,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x06,0x06,0x06]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x05,0x06,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x06,0x00,0x06]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x0e,0x06,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x06,0x01,0x06]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x16,0x06,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x06,0x02,0x06]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x16,0x06,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x06,0x03,0x06]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x06,0x06,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x06,0x04,0x06]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x06,0x00,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x06,0x05,0x06]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x06,0x01,0x06]
+v_max_u32_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x06,0x0e,0x06]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x06,0x02,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x06,0x06,0x06]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x06,0x03,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x06,0x06,0x00]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x06,0x04,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x06,0x06,0x01]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x06,0x05,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x06,0x06,0x02]
 
-v_min_i32_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x06,0x0e,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x06,0x06,0x03]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x06,0x06,0x06]
+v_max_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x06,0x06,0x04]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x06,0x06,0x00]
+v_max_u32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x06,0x06,0x05]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x06,0x06,0x01]
+v_max_u32_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x1e,0x01,0x06,0x06,0x0e]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x06,0x06,0x02]
+v_max_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0xe4,0x00,0x00]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x06,0x06,0x03]
+v_max_u32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x1f,0x01,0xe4,0x00,0x00]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x06,0x06,0x04]
+v_max_u32_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0xff,0xe4,0x00,0x00]
 
-v_min_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x06,0x06,0x05]
+v_max_u32_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x1e,0x01,0xe4,0x00,0x00]
 
-v_min_i32_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x18,0x00,0x06,0x06,0x0e]
+v_max_u32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0x1b,0x00,0x00]
 
-v_min_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0xe4,0x00,0x00]
+v_max_u32_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0x40,0x01,0x00]
 
-v_min_i32_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x19,0x00,0xe4,0x00,0x00]
+v_max_u32_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0x41,0x01,0x00]
 
-v_min_i32_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0xff,0xe4,0x00,0x00]
+v_max_u32_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0x42,0x01,0x00]
 
-v_min_i32_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x18,0x00,0xe4,0x00,0x00]
+v_max_u32_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0x43,0x01,0x00]
 
-v_min_i32_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0x1b,0x00,0x00]
+v_max_u32_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0x30,0x01,0x00]
 
-v_min_i32_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0x40,0x01,0x00]
+v_max_u32_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0x34,0x01,0x00]
 
-v_min_i32_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0x41,0x01,0x00]
+v_max_u32_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0x38,0x01,0x00]
 
-v_min_i32_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0x42,0x01,0x00]
+v_max_u32_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0x3c,0x01,0x00]
 
-v_min_i32_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0x43,0x01,0x00]
+v_max_u32_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0x01,0x01,0x00]
 
-v_min_i32_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0x30,0x01,0x00]
+v_max_u32_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0x0f,0x01,0x00]
 
-v_min_i32_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0x34,0x01,0x00]
+v_max_u32_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0x11,0x01,0x00]
 
-v_min_i32_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0x38,0x01,0x00]
+v_max_u32_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0x1f,0x01,0x00]
 
-v_min_i32_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0x3c,0x01,0x00]
+v_max_u32_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0x21,0x01,0x00]
 
-v_min_i32_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0x01,0x01,0x00]
+v_max_u32_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0x2f,0x01,0x00]
 
-v_min_i32_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0x0f,0x01,0x00]
+v_max_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0xe4,0x00,0x10]
 
-v_min_i32_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0x11,0x01,0x00]
+v_max_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0xe4,0x00,0x30]
 
-v_min_i32_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0x1f,0x01,0x00]
+v_max_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0xe4,0x00,0xf0]
 
-v_min_i32_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0x21,0x01,0x00]
+v_max_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0xe4,0x00,0xf0]
 
-v_min_i32_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0x2f,0x01,0x00]
+v_max_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0xe4,0x00,0x01]
 
-v_min_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0xe4,0x00,0x10]
+v_max_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0xe4,0x00,0x03]
 
-v_min_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0xe4,0x00,0x30]
+v_max_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0xe4,0x00,0x0f]
 
-v_min_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0xe4,0x00,0xf0]
+v_max_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0xe4,0x00,0x0f]
 
-v_min_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0xe4,0x00,0xf0]
+v_max_u32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x1e,0x01,0xe4,0x08,0x00]
 
-v_min_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0xe4,0x00,0x01]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x06,0x06,0x06]
 
-v_min_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0xe4,0x00,0x03]
+v_lshrrev_b32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x21,0x01,0x06,0x06,0x06]
 
-v_min_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0xe4,0x00,0x0f]
+v_lshrrev_b32_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0xff,0x06,0x06,0x06]
 
-v_min_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0xe4,0x00,0x0f]
+v_lshrrev_b32_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x20,0x01,0x06,0x06,0x06]
 
-v_min_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x18,0x00,0xe4,0x08,0x00]
+v_lshrrev_b32_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x26,0x06,0x06]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x06,0x06,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x06,0x06,0x06]
 
-v_max_i32_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x1b,0x00,0x06,0x06,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x00,0x06,0x06]
 
-v_max_i32_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0xff,0x06,0x06,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x01,0x06,0x06]
 
-v_max_i32_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x1a,0x00,0x06,0x06,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x02,0x06,0x06]
 
-v_max_i32_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x26,0x06,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x03,0x06,0x06]
 
-v_max_i32_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x06,0x06,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x04,0x06,0x06]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x00,0x06,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x05,0x06,0x06]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x01,0x06,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x0e,0x06,0x06]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x02,0x06,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x16,0x06,0x06]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x03,0x06,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x16,0x06,0x06]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x04,0x06,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x06,0x06,0x06]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x05,0x06,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x06,0x00,0x06]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x0e,0x06,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x06,0x01,0x06]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x16,0x06,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x06,0x02,0x06]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x16,0x06,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x06,0x03,0x06]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x06,0x06,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x06,0x04,0x06]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x06,0x00,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x06,0x05,0x06]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x06,0x01,0x06]
+v_lshrrev_b32_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x06,0x0e,0x06]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x06,0x02,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x06,0x06,0x06]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x06,0x03,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x06,0x06,0x00]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x06,0x04,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x06,0x06,0x01]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x06,0x05,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x06,0x06,0x02]
 
-v_max_i32_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x06,0x0e,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x06,0x06,0x03]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x06,0x06,0x06]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x06,0x06,0x04]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x06,0x06,0x00]
+v_lshrrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x06,0x06,0x05]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x06,0x06,0x01]
+v_lshrrev_b32_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x20,0x01,0x06,0x06,0x0e]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x06,0x06,0x02]
+v_lshrrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0xe4,0x00,0x00]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x06,0x06,0x03]
+v_lshrrev_b32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x21,0x01,0xe4,0x00,0x00]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x06,0x06,0x04]
+v_lshrrev_b32_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0xff,0xe4,0x00,0x00]
 
-v_max_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x06,0x06,0x05]
+v_lshrrev_b32_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x20,0x01,0xe4,0x00,0x00]
 
-v_max_i32_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1a,0x00,0x06,0x06,0x0e]
+v_lshrrev_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0x1b,0x00,0x00]
 
-v_max_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0xe4,0x00,0x00]
+v_lshrrev_b32_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0x40,0x01,0x00]
 
-v_max_i32_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x1b,0x00,0xe4,0x00,0x00]
+v_lshrrev_b32_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0x41,0x01,0x00]
 
-v_max_i32_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0xff,0xe4,0x00,0x00]
+v_lshrrev_b32_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0x42,0x01,0x00]
 
-v_max_i32_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x1a,0x00,0xe4,0x00,0x00]
+v_lshrrev_b32_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0x43,0x01,0x00]
 
-v_max_i32_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0x1b,0x00,0x00]
+v_lshrrev_b32_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0x30,0x01,0x00]
 
-v_max_i32_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0x40,0x01,0x00]
+v_lshrrev_b32_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0x34,0x01,0x00]
 
-v_max_i32_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0x41,0x01,0x00]
+v_lshrrev_b32_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0x38,0x01,0x00]
 
-v_max_i32_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0x42,0x01,0x00]
+v_lshrrev_b32_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0x3c,0x01,0x00]
 
-v_max_i32_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0x43,0x01,0x00]
+v_lshrrev_b32_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0x01,0x01,0x00]
 
-v_max_i32_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0x30,0x01,0x00]
+v_lshrrev_b32_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0x0f,0x01,0x00]
 
-v_max_i32_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0x34,0x01,0x00]
+v_lshrrev_b32_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0x11,0x01,0x00]
 
-v_max_i32_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0x38,0x01,0x00]
+v_lshrrev_b32_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0x1f,0x01,0x00]
 
-v_max_i32_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0x3c,0x01,0x00]
+v_lshrrev_b32_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0x21,0x01,0x00]
 
-v_max_i32_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0x01,0x01,0x00]
+v_lshrrev_b32_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0x2f,0x01,0x00]
 
-v_max_i32_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0x0f,0x01,0x00]
+v_lshrrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0xe4,0x00,0x10]
 
-v_max_i32_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0x11,0x01,0x00]
+v_lshrrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0xe4,0x00,0x30]
 
-v_max_i32_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0x1f,0x01,0x00]
+v_lshrrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0xe4,0x00,0xf0]
 
-v_max_i32_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0x21,0x01,0x00]
+v_lshrrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0xe4,0x00,0xf0]
 
-v_max_i32_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0x2f,0x01,0x00]
+v_lshrrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0xe4,0x00,0x01]
 
-v_max_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0xe4,0x00,0x10]
+v_lshrrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0xe4,0x00,0x03]
 
-v_max_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0xe4,0x00,0x30]
+v_lshrrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0xe4,0x00,0x0f]
 
-v_max_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0xe4,0x00,0xf0]
+v_lshrrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0xe4,0x00,0x0f]
 
-v_max_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0xe4,0x00,0xf0]
+v_lshrrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x20,0x01,0xe4,0x08,0x00]
 
-v_max_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0xe4,0x00,0x01]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x06,0x06,0x06]
 
-v_max_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0xe4,0x00,0x03]
+v_ashrrev_i32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x23,0x01,0x06,0x06,0x06]
 
-v_max_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0xe4,0x00,0x0f]
+v_ashrrev_i32_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0xff,0x06,0x06,0x06]
 
-v_max_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0xe4,0x00,0x0f]
+v_ashrrev_i32_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x22,0x01,0x06,0x06,0x06]
 
-v_max_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x1a,0x00,0xe4,0x08,0x00]
+v_ashrrev_i32_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x26,0x06,0x06]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x06,0x06,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x06,0x06,0x06]
 
-v_min_u32_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x1d,0x00,0x06,0x06,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x00,0x06,0x06]
 
-v_min_u32_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0xff,0x06,0x06,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x01,0x06,0x06]
 
-v_min_u32_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x1c,0x00,0x06,0x06,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x02,0x06,0x06]
 
-v_min_u32_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x26,0x06,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x03,0x06,0x06]
 
-v_min_u32_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x06,0x06,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x04,0x06,0x06]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x00,0x06,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x05,0x06,0x06]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x01,0x06,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x0e,0x06,0x06]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x02,0x06,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x16,0x06,0x06]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x03,0x06,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x16,0x06,0x06]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x04,0x06,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x06,0x06,0x06]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x05,0x06,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x06,0x00,0x06]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x0e,0x06,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x06,0x01,0x06]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x16,0x06,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x06,0x02,0x06]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x16,0x06,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x06,0x03,0x06]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x06,0x06,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x06,0x04,0x06]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x06,0x00,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x06,0x05,0x06]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x06,0x01,0x06]
+v_ashrrev_i32_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x06,0x0e,0x06]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x06,0x02,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x06,0x06,0x06]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x06,0x03,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x06,0x06,0x00]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x06,0x04,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x06,0x06,0x01]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x06,0x05,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x06,0x06,0x02]
 
-v_min_u32_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x06,0x0e,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x06,0x06,0x03]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x06,0x06,0x06]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x06,0x06,0x04]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x06,0x06,0x00]
+v_ashrrev_i32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x06,0x06,0x05]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x06,0x06,0x01]
+v_ashrrev_i32_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x22,0x01,0x06,0x06,0x0e]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x06,0x06,0x02]
+v_ashrrev_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0xe4,0x00,0x00]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x06,0x06,0x03]
+v_ashrrev_i32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x23,0x01,0xe4,0x00,0x00]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x06,0x06,0x04]
+v_ashrrev_i32_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0xff,0xe4,0x00,0x00]
 
-v_min_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x06,0x06,0x05]
+v_ashrrev_i32_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x22,0x01,0xe4,0x00,0x00]
 
-v_min_u32_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1c,0x00,0x06,0x06,0x0e]
+v_ashrrev_i32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0x1b,0x00,0x00]
 
-v_min_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0xe4,0x00,0x00]
+v_ashrrev_i32_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0x40,0x01,0x00]
 
-v_min_u32_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x1d,0x00,0xe4,0x00,0x00]
+v_ashrrev_i32_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0x41,0x01,0x00]
 
-v_min_u32_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0xff,0xe4,0x00,0x00]
+v_ashrrev_i32_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0x42,0x01,0x00]
 
-v_min_u32_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x1c,0x00,0xe4,0x00,0x00]
+v_ashrrev_i32_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0x43,0x01,0x00]
 
-v_min_u32_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0x1b,0x00,0x00]
+v_ashrrev_i32_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0x30,0x01,0x00]
 
-v_min_u32_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0x40,0x01,0x00]
+v_ashrrev_i32_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0x34,0x01,0x00]
 
-v_min_u32_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0x41,0x01,0x00]
+v_ashrrev_i32_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0x38,0x01,0x00]
 
-v_min_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0x42,0x01,0x00]
+v_ashrrev_i32_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0x3c,0x01,0x00]
 
-v_min_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0x43,0x01,0x00]
+v_ashrrev_i32_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0x01,0x01,0x00]
 
-v_min_u32_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0x30,0x01,0x00]
+v_ashrrev_i32_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0x0f,0x01,0x00]
 
-v_min_u32_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0x34,0x01,0x00]
+v_ashrrev_i32_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0x11,0x01,0x00]
 
-v_min_u32_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0x38,0x01,0x00]
+v_ashrrev_i32_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0x1f,0x01,0x00]
 
-v_min_u32_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0x3c,0x01,0x00]
+v_ashrrev_i32_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0x21,0x01,0x00]
 
-v_min_u32_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0x01,0x01,0x00]
+v_ashrrev_i32_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0x2f,0x01,0x00]
 
-v_min_u32_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0x0f,0x01,0x00]
+v_ashrrev_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0xe4,0x00,0x10]
 
-v_min_u32_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0x11,0x01,0x00]
+v_ashrrev_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0xe4,0x00,0x30]
 
-v_min_u32_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0x1f,0x01,0x00]
+v_ashrrev_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0xe4,0x00,0xf0]
 
-v_min_u32_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0x21,0x01,0x00]
+v_ashrrev_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0xe4,0x00,0xf0]
 
-v_min_u32_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0x2f,0x01,0x00]
+v_ashrrev_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0xe4,0x00,0x01]
 
-v_min_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0xe4,0x00,0x10]
+v_ashrrev_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0xe4,0x00,0x03]
 
-v_min_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0xe4,0x00,0x30]
+v_ashrrev_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0xe4,0x00,0x0f]
 
-v_min_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0xe4,0x00,0xf0]
+v_ashrrev_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0xe4,0x00,0x0f]
 
-v_min_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0xe4,0x00,0xf0]
+v_ashrrev_i32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x22,0x01,0xe4,0x08,0x00]
 
-v_min_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0xe4,0x00,0x01]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x06,0x06,0x06]
 
-v_min_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0xe4,0x00,0x03]
+v_lshlrev_b32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x25,0x01,0x06,0x06,0x06]
 
-v_min_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0xe4,0x00,0x0f]
+v_lshlrev_b32_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0xff,0x06,0x06,0x06]
 
-v_min_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0xe4,0x00,0x0f]
+v_lshlrev_b32_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x24,0x01,0x06,0x06,0x06]
 
-v_min_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x1c,0x00,0xe4,0x08,0x00]
+v_lshlrev_b32_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x26,0x06,0x06]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x06,0x06,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x06,0x06,0x06]
 
-v_max_u32_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x1f,0x00,0x06,0x06,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x00,0x06,0x06]
 
-v_max_u32_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0xff,0x06,0x06,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x01,0x06,0x06]
 
-v_max_u32_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x1e,0x00,0x06,0x06,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x02,0x06,0x06]
 
-v_max_u32_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x26,0x06,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x03,0x06,0x06]
 
-v_max_u32_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x06,0x06,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x04,0x06,0x06]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x00,0x06,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x05,0x06,0x06]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x01,0x06,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x0e,0x06,0x06]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x02,0x06,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x16,0x06,0x06]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x03,0x06,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x16,0x06,0x06]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x04,0x06,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x06,0x06,0x06]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x05,0x06,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x06,0x00,0x06]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x0e,0x06,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x06,0x01,0x06]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x16,0x06,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x06,0x02,0x06]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x16,0x06,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x06,0x03,0x06]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x06,0x06,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x06,0x04,0x06]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x06,0x00,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x06,0x05,0x06]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x06,0x01,0x06]
+v_lshlrev_b32_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x06,0x0e,0x06]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x06,0x02,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x06,0x06,0x06]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x06,0x03,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x06,0x06,0x00]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x06,0x04,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x06,0x06,0x01]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x06,0x05,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x06,0x06,0x02]
 
-v_max_u32_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x06,0x0e,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x06,0x06,0x03]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x06,0x06,0x06]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x06,0x06,0x04]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x06,0x06,0x00]
+v_lshlrev_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x06,0x06,0x05]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x06,0x06,0x01]
+v_lshlrev_b32_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x24,0x01,0x06,0x06,0x0e]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x06,0x06,0x02]
+v_lshlrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0xe4,0x00,0x00]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x06,0x06,0x03]
+v_lshlrev_b32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x25,0x01,0xe4,0x00,0x00]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x06,0x06,0x04]
+v_lshlrev_b32_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0xff,0xe4,0x00,0x00]
 
-v_max_u32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x06,0x06,0x05]
+v_lshlrev_b32_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x24,0x01,0xe4,0x00,0x00]
 
-v_max_u32_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x1e,0x00,0x06,0x06,0x0e]
+v_lshlrev_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0x1b,0x00,0x00]
 
-v_max_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0xe4,0x00,0x00]
+v_lshlrev_b32_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0x40,0x01,0x00]
 
-v_max_u32_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x1f,0x00,0xe4,0x00,0x00]
+v_lshlrev_b32_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0x41,0x01,0x00]
 
-v_max_u32_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0xff,0xe4,0x00,0x00]
+v_lshlrev_b32_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0x42,0x01,0x00]
 
-v_max_u32_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x1e,0x00,0xe4,0x00,0x00]
+v_lshlrev_b32_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0x43,0x01,0x00]
 
-v_max_u32_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0x1b,0x00,0x00]
+v_lshlrev_b32_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0x30,0x01,0x00]
 
-v_max_u32_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0x40,0x01,0x00]
+v_lshlrev_b32_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0x34,0x01,0x00]
 
-v_max_u32_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0x41,0x01,0x00]
+v_lshlrev_b32_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0x38,0x01,0x00]
 
-v_max_u32_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0x42,0x01,0x00]
+v_lshlrev_b32_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0x3c,0x01,0x00]
 
-v_max_u32_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0x43,0x01,0x00]
+v_lshlrev_b32_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0x01,0x01,0x00]
 
-v_max_u32_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0x30,0x01,0x00]
+v_lshlrev_b32_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0x0f,0x01,0x00]
 
-v_max_u32_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0x34,0x01,0x00]
+v_lshlrev_b32_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0x11,0x01,0x00]
 
-v_max_u32_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0x38,0x01,0x00]
+v_lshlrev_b32_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0x1f,0x01,0x00]
 
-v_max_u32_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0x3c,0x01,0x00]
+v_lshlrev_b32_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0x21,0x01,0x00]
 
-v_max_u32_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0x01,0x01,0x00]
+v_lshlrev_b32_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0x2f,0x01,0x00]
 
-v_max_u32_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0x0f,0x01,0x00]
+v_lshlrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0xe4,0x00,0x10]
 
-v_max_u32_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0x11,0x01,0x00]
+v_lshlrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0xe4,0x00,0x30]
 
-v_max_u32_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0x1f,0x01,0x00]
+v_lshlrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0xe4,0x00,0xf0]
 
-v_max_u32_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0x21,0x01,0x00]
+v_lshlrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0xe4,0x00,0xf0]
 
-v_max_u32_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0x2f,0x01,0x00]
+v_lshlrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0xe4,0x00,0x01]
 
-v_max_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0xe4,0x00,0x10]
+v_lshlrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0xe4,0x00,0x03]
 
-v_max_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0xe4,0x00,0x30]
+v_lshlrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0xe4,0x00,0x0f]
 
-v_max_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0xe4,0x00,0xf0]
+v_lshlrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0xe4,0x00,0x0f]
 
-v_max_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0xe4,0x00,0xf0]
+v_lshlrev_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x24,0x01,0xe4,0x08,0x00]
 
-v_max_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0xe4,0x00,0x01]
+v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x06,0x06,0x06]
 
-v_max_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0xe4,0x00,0x03]
+v_and_b32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x27,0x01,0x06,0x06,0x06]
 
-v_max_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0xe4,0x00,0x0f]
+v_and_b32_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0xff,0x06,0x06,0x06]
 
-v_max_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0xe4,0x00,0x0f]
+v_and_b32_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x26,0x01,0x06,0x06,0x06]
 
-v_max_u32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x1e,0x00,0xe4,0x08,0x00]
+v_and_b32_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x06,0x06,0x06]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x06,0x06,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x00,0x06,0x06]
 
-v_lshrrev_b32_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x21,0x00,0x06,0x06,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x01,0x06,0x06]
 
-v_lshrrev_b32_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0xff,0x06,0x06,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x02,0x06,0x06]
 
-v_lshrrev_b32_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x20,0x00,0x06,0x06,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x03,0x06,0x06]
 
-v_lshrrev_b32_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x26,0x06,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x04,0x06,0x06]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x06,0x06,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x05,0x06,0x06]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x00,0x06,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x0e,0x06,0x06]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x01,0x06,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x16,0x06,0x06]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x02,0x06,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x16,0x06,0x06]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x03,0x06,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x06,0x06,0x06]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x04,0x06,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x06,0x00,0x06]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x05,0x06,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x06,0x01,0x06]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x0e,0x06,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x06,0x02,0x06]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x16,0x06,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x06,0x03,0x06]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x16,0x06,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x06,0x04,0x06]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x06,0x06,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x06,0x05,0x06]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x06,0x00,0x06]
+v_and_b32_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x06,0x0e,0x06]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x06,0x01,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x06,0x06,0x06]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x06,0x02,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x06,0x06,0x00]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x06,0x03,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x06,0x06,0x01]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x06,0x04,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x06,0x06,0x02]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x06,0x05,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x06,0x06,0x03]
 
-v_lshrrev_b32_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x06,0x0e,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x06,0x06,0x04]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x06,0x06,0x06]
+v_and_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x06,0x06,0x05]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x06,0x06,0x00]
+v_and_b32_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x26,0x01,0x06,0x06,0x0e]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x06,0x06,0x01]
+v_and_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0xe4,0x00,0x00]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x06,0x06,0x02]
+v_and_b32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x27,0x01,0xe4,0x00,0x00]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x06,0x06,0x03]
+v_and_b32_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0xff,0xe4,0x00,0x00]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x06,0x06,0x04]
+v_and_b32_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x26,0x01,0xe4,0x00,0x00]
 
-v_lshrrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x06,0x06,0x05]
+v_and_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0x1b,0x00,0x00]
 
-v_lshrrev_b32_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x20,0x00,0x06,0x06,0x0e]
+v_and_b32_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0x40,0x01,0x00]
 
-v_lshrrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0xe4,0x00,0x00]
+v_and_b32_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0x41,0x01,0x00]
 
-v_lshrrev_b32_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x21,0x00,0xe4,0x00,0x00]
+v_and_b32_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0x42,0x01,0x00]
 
-v_lshrrev_b32_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0xff,0xe4,0x00,0x00]
+v_and_b32_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0x43,0x01,0x00]
 
-v_lshrrev_b32_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x20,0x00,0xe4,0x00,0x00]
+v_and_b32_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0x30,0x01,0x00]
 
-v_lshrrev_b32_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0x1b,0x00,0x00]
+v_and_b32_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0x34,0x01,0x00]
 
-v_lshrrev_b32_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0x40,0x01,0x00]
+v_and_b32_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0x38,0x01,0x00]
 
-v_lshrrev_b32_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0x41,0x01,0x00]
+v_and_b32_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0x3c,0x01,0x00]
 
-v_lshrrev_b32_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0x42,0x01,0x00]
+v_and_b32_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0x01,0x01,0x00]
 
-v_lshrrev_b32_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0x43,0x01,0x00]
+v_and_b32_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0x0f,0x01,0x00]
 
-v_lshrrev_b32_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0x30,0x01,0x00]
+v_and_b32_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0x11,0x01,0x00]
 
-v_lshrrev_b32_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0x34,0x01,0x00]
+v_and_b32_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0x1f,0x01,0x00]
 
-v_lshrrev_b32_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0x38,0x01,0x00]
+v_and_b32_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0x21,0x01,0x00]
 
-v_lshrrev_b32_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0x3c,0x01,0x00]
+v_and_b32_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0x2f,0x01,0x00]
 
-v_lshrrev_b32_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0x01,0x01,0x00]
+v_and_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0xe4,0x00,0x10]
 
-v_lshrrev_b32_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0x0f,0x01,0x00]
+v_and_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0xe4,0x00,0x30]
 
-v_lshrrev_b32_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0x11,0x01,0x00]
+v_and_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0xe4,0x00,0xf0]
 
-v_lshrrev_b32_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0x1f,0x01,0x00]
+v_and_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0xe4,0x00,0xf0]
 
-v_lshrrev_b32_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0x21,0x01,0x00]
+v_and_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0xe4,0x00,0x01]
 
-v_lshrrev_b32_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0x2f,0x01,0x00]
+v_and_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0xe4,0x00,0x03]
 
-v_lshrrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0xe4,0x00,0x10]
+v_and_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0xe4,0x00,0x0f]
 
-v_lshrrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0xe4,0x00,0x30]
+v_and_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0xe4,0x00,0x0f]
 
-v_lshrrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0xe4,0x00,0xf0]
+v_and_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x26,0x01,0xe4,0x08,0x00]
 
-v_lshrrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0xe4,0x00,0xf0]
+v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x06,0x06,0x06]
 
-v_lshrrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0xe4,0x00,0x01]
+v_or_b32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x29,0x01,0x06,0x06,0x06]
 
-v_lshrrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0xe4,0x00,0x03]
+v_or_b32_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0xff,0x06,0x06,0x06]
 
-v_lshrrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0xe4,0x00,0x0f]
+v_or_b32_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x28,0x01,0x06,0x06,0x06]
 
-v_lshrrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0xe4,0x00,0x0f]
+v_or_b32_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x06,0x06,0x06]
 
-v_lshrrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x20,0x00,0xe4,0x08,0x00]
+v_or_b32_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x00,0x06,0x06]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x06,0x06,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x01,0x06,0x06]
 
-v_ashrrev_i32_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x23,0x00,0x06,0x06,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x02,0x06,0x06]
 
-v_ashrrev_i32_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0xff,0x06,0x06,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x03,0x06,0x06]
 
-v_ashrrev_i32_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x22,0x00,0x06,0x06,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x04,0x06,0x06]
 
-v_ashrrev_i32_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x26,0x06,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x05,0x06,0x06]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x06,0x06,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x0e,0x06,0x06]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x00,0x06,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x16,0x06,0x06]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x01,0x06,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x16,0x06,0x06]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x02,0x06,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x06,0x06,0x06]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x03,0x06,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x06,0x00,0x06]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x04,0x06,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x06,0x01,0x06]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x05,0x06,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x06,0x02,0x06]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x0e,0x06,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x06,0x03,0x06]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x16,0x06,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x06,0x04,0x06]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x16,0x06,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x06,0x05,0x06]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x06,0x06,0x06]
+v_or_b32_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x06,0x0e,0x06]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x06,0x00,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x06,0x06,0x06]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x06,0x01,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x06,0x06,0x00]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x06,0x02,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x06,0x06,0x01]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x06,0x03,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x06,0x06,0x02]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x06,0x04,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x06,0x06,0x03]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x06,0x05,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x06,0x06,0x04]
 
-v_ashrrev_i32_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x06,0x0e,0x06]
+v_or_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x06,0x06,0x05]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x06,0x06,0x06]
+v_or_b32_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x28,0x01,0x06,0x06,0x0e]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x06,0x06,0x00]
+v_or_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0xe4,0x00,0x00]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x06,0x06,0x01]
+v_or_b32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x29,0x01,0xe4,0x00,0x00]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x06,0x06,0x02]
+v_or_b32_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0xff,0xe4,0x00,0x00]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x06,0x06,0x03]
+v_or_b32_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x28,0x01,0xe4,0x00,0x00]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x06,0x06,0x04]
+v_or_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0x1b,0x00,0x00]
 
-v_ashrrev_i32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x06,0x06,0x05]
+v_or_b32_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0x40,0x01,0x00]
 
-v_ashrrev_i32_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x22,0x00,0x06,0x06,0x0e]
+v_or_b32_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0x41,0x01,0x00]
 
-v_ashrrev_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0xe4,0x00,0x00]
+v_or_b32_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0x42,0x01,0x00]
 
-v_ashrrev_i32_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x23,0x00,0xe4,0x00,0x00]
+v_or_b32_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0x43,0x01,0x00]
 
-v_ashrrev_i32_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0xff,0xe4,0x00,0x00]
+v_or_b32_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0x30,0x01,0x00]
 
-v_ashrrev_i32_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x22,0x00,0xe4,0x00,0x00]
+v_or_b32_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0x34,0x01,0x00]
 
-v_ashrrev_i32_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0x1b,0x00,0x00]
+v_or_b32_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0x38,0x01,0x00]
 
-v_ashrrev_i32_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0x40,0x01,0x00]
+v_or_b32_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0x3c,0x01,0x00]
 
-v_ashrrev_i32_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0x41,0x01,0x00]
+v_or_b32_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0x01,0x01,0x00]
 
-v_ashrrev_i32_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0x42,0x01,0x00]
+v_or_b32_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0x0f,0x01,0x00]
 
-v_ashrrev_i32_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0x43,0x01,0x00]
+v_or_b32_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0x11,0x01,0x00]
 
-v_ashrrev_i32_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0x30,0x01,0x00]
+v_or_b32_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0x1f,0x01,0x00]
 
-v_ashrrev_i32_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0x34,0x01,0x00]
+v_or_b32_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0x21,0x01,0x00]
 
-v_ashrrev_i32_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0x38,0x01,0x00]
+v_or_b32_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0x2f,0x01,0x00]
 
-v_ashrrev_i32_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0x3c,0x01,0x00]
+v_or_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0xe4,0x00,0x10]
 
-v_ashrrev_i32_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0x01,0x01,0x00]
+v_or_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0xe4,0x00,0x30]
 
-v_ashrrev_i32_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0x0f,0x01,0x00]
+v_or_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0xe4,0x00,0xf0]
 
-v_ashrrev_i32_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0x11,0x01,0x00]
+v_or_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0xe4,0x00,0xf0]
 
-v_ashrrev_i32_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0x1f,0x01,0x00]
+v_or_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0xe4,0x00,0x01]
 
-v_ashrrev_i32_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0x21,0x01,0x00]
+v_or_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0xe4,0x00,0x03]
 
-v_ashrrev_i32_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0x2f,0x01,0x00]
+v_or_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0xe4,0x00,0x0f]
 
-v_ashrrev_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0xe4,0x00,0x10]
+v_or_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0xe4,0x00,0x0f]
 
-v_ashrrev_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0xe4,0x00,0x30]
+v_or_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x28,0x01,0xe4,0x08,0x00]
 
-v_ashrrev_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0xe4,0x00,0xf0]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x06,0x06,0x06]
 
-v_ashrrev_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0xe4,0x00,0xf0]
+v_xor_b32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x2b,0x01,0x06,0x06,0x06]
 
-v_ashrrev_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0xe4,0x00,0x01]
+v_xor_b32_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0xff,0x06,0x06,0x06]
 
-v_ashrrev_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0xe4,0x00,0x03]
+v_xor_b32_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x2a,0x01,0x06,0x06,0x06]
 
-v_ashrrev_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0xe4,0x00,0x0f]
+v_xor_b32_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x06,0x06,0x06]
 
-v_ashrrev_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0xe4,0x00,0x0f]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x00,0x06,0x06]
 
-v_ashrrev_i32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x22,0x00,0xe4,0x08,0x00]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x01,0x06,0x06]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x06,0x06,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x02,0x06,0x06]
 
-v_lshlrev_b32_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x25,0x00,0x06,0x06,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x03,0x06,0x06]
 
-v_lshlrev_b32_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0xff,0x06,0x06,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x04,0x06,0x06]
 
-v_lshlrev_b32_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x24,0x00,0x06,0x06,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x05,0x06,0x06]
 
-v_lshlrev_b32_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x26,0x06,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x0e,0x06,0x06]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x06,0x06,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x16,0x06,0x06]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x00,0x06,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x16,0x06,0x06]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x01,0x06,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x06,0x06,0x06]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x02,0x06,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x06,0x00,0x06]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x03,0x06,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x06,0x01,0x06]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x04,0x06,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x06,0x02,0x06]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x05,0x06,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x06,0x03,0x06]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x0e,0x06,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x06,0x04,0x06]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x16,0x06,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x06,0x05,0x06]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x16,0x06,0x06]
+v_xor_b32_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x06,0x0e,0x06]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x06,0x06,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x06,0x06,0x06]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x06,0x00,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x06,0x06,0x00]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x06,0x01,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x06,0x06,0x01]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x06,0x02,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x06,0x06,0x02]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x06,0x03,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x06,0x06,0x03]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x06,0x04,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x06,0x06,0x04]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x06,0x05,0x06]
+v_xor_b32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x06,0x06,0x05]
 
-v_lshlrev_b32_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x06,0x0e,0x06]
+v_xor_b32_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2a,0x01,0x06,0x06,0x0e]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x06,0x06,0x06]
+v_xor_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0xe4,0x00,0x00]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x06,0x06,0x00]
+v_xor_b32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x2b,0x01,0xe4,0x00,0x00]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x06,0x06,0x01]
+v_xor_b32_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0xff,0xe4,0x00,0x00]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x06,0x06,0x02]
+v_xor_b32_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x2a,0x01,0xe4,0x00,0x00]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x06,0x06,0x03]
+v_xor_b32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0x1b,0x00,0x00]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x06,0x06,0x04]
+v_xor_b32_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0x40,0x01,0x00]
 
-v_lshlrev_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x06,0x06,0x05]
+v_xor_b32_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0x41,0x01,0x00]
 
-v_lshlrev_b32_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x24,0x00,0x06,0x06,0x0e]
+v_xor_b32_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0x42,0x01,0x00]
 
-v_lshlrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0xe4,0x00,0x00]
+v_xor_b32_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0x43,0x01,0x00]
 
-v_lshlrev_b32_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x25,0x00,0xe4,0x00,0x00]
+v_xor_b32_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0x30,0x01,0x00]
 
-v_lshlrev_b32_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0xff,0xe4,0x00,0x00]
+v_xor_b32_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0x34,0x01,0x00]
 
-v_lshlrev_b32_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x24,0x00,0xe4,0x00,0x00]
+v_xor_b32_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0x38,0x01,0x00]
 
-v_lshlrev_b32_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0x1b,0x00,0x00]
+v_xor_b32_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0x3c,0x01,0x00]
 
-v_lshlrev_b32_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0x40,0x01,0x00]
+v_xor_b32_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0x01,0x01,0x00]
 
-v_lshlrev_b32_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0x41,0x01,0x00]
+v_xor_b32_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0x0f,0x01,0x00]
 
-v_lshlrev_b32_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0x42,0x01,0x00]
+v_xor_b32_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0x11,0x01,0x00]
 
-v_lshlrev_b32_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0x43,0x01,0x00]
+v_xor_b32_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0x1f,0x01,0x00]
 
-v_lshlrev_b32_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0x30,0x01,0x00]
+v_xor_b32_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0x21,0x01,0x00]
 
-v_lshlrev_b32_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0x34,0x01,0x00]
+v_xor_b32_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0x2f,0x01,0x00]
 
-v_lshlrev_b32_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0x38,0x01,0x00]
+v_xor_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0xe4,0x00,0x10]
 
-v_lshlrev_b32_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0x3c,0x01,0x00]
+v_xor_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0xe4,0x00,0x30]
 
-v_lshlrev_b32_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0x01,0x01,0x00]
+v_xor_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0xe4,0x00,0xf0]
 
-v_lshlrev_b32_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0x0f,0x01,0x00]
+v_xor_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0xe4,0x00,0xf0]
 
-v_lshlrev_b32_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0x11,0x01,0x00]
+v_xor_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0xe4,0x00,0x01]
 
-v_lshlrev_b32_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0x1f,0x01,0x00]
+v_xor_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0xe4,0x00,0x03]
 
-v_lshlrev_b32_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0x21,0x01,0x00]
+v_xor_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0xe4,0x00,0x0f]
 
-v_lshlrev_b32_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0x2f,0x01,0x00]
+v_xor_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0xe4,0x00,0x0f]
 
-v_lshlrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0xe4,0x00,0x10]
+v_xor_b32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x2a,0x01,0xe4,0x08,0x00]
 
-v_lshlrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0xe4,0x00,0x30]
+v_mac_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x06,0x06]
 
-v_lshlrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0xe4,0x00,0xf0]
+v_mac_f32_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x2d,0x01,0x06,0x06,0x06]
 
-v_lshlrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0xe4,0x00,0xf0]
+v_mac_f32_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2c,0xff,0x06,0x06,0x06]
 
-v_lshlrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0xe4,0x00,0x01]
+v_mac_f32_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x2c,0x01,0x06,0x06,0x06]
 
-v_lshlrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0xe4,0x00,0x03]
+v_mac_f32_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x26,0x06,0x06]
 
-v_lshlrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0xe4,0x00,0x0f]
+v_mac_f32_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x06,0x06]
 
-v_lshlrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0xe4,0x00,0x0f]
+v_mac_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x0e,0x06,0x06]
 
-v_lshlrev_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x24,0x00,0xe4,0x08,0x00]
+v_mac_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x16,0x06,0x06]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x06,0x06,0x06]
+v_mac_f32_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x16,0x06,0x06]
 
-v_and_b32_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x27,0x00,0x06,0x06,0x06]
+v_mac_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x06,0x06]
 
-v_and_b32_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0xff,0x06,0x06,0x06]
+v_mac_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x00,0x06]
 
-v_and_b32_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x26,0x00,0x06,0x06,0x06]
+v_mac_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x01,0x06]
 
-v_and_b32_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x06,0x06,0x06]
+v_mac_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x02,0x06]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x00,0x06,0x06]
+v_mac_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x03,0x06]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x01,0x06,0x06]
+v_mac_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x04,0x06]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x02,0x06,0x06]
+v_mac_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x05,0x06]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x03,0x06,0x06]
+v_mac_f32_sdwa v5, -v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x16,0x06]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x04,0x06,0x06]
+v_mac_f32_sdwa v5, |v1|, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x26,0x06]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x05,0x06,0x06]
+v_mac_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x06,0x06]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x0e,0x06,0x06]
+v_mac_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x06,0x00]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x16,0x06,0x06]
+v_mac_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x06,0x01]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x16,0x06,0x06]
+v_mac_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x06,0x02]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x06,0x06,0x06]
+v_mac_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x06,0x03]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x06,0x00,0x06]
+v_mac_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x06,0x04]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x06,0x01,0x06]
+v_mac_f32_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x06,0x05]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x06,0x02,0x06]
+v_mac_f32_sdwa v5, v1, -v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x06,0x16]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x06,0x03,0x06]
+v_mac_f32_sdwa v5, v1, |v2| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x2c,0x01,0x06,0x06,0x26]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x06,0x04,0x06]
+v_mac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0xe4,0x00,0x00]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x06,0x05,0x06]
+v_mac_f32_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x2d,0x01,0xe4,0x00,0x00]
 
-v_and_b32_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x06,0x0e,0x06]
+v_mac_f32_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0xff,0xe4,0x00,0x00]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x06,0x06,0x06]
+v_mac_f32_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x2c,0x01,0xe4,0x00,0x00]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x06,0x06,0x00]
+v_mac_f32_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0x1b,0x00,0x00]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x06,0x06,0x01]
+v_mac_f32_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0x40,0x01,0x00]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x06,0x06,0x02]
+v_mac_f32_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0x41,0x01,0x00]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x06,0x06,0x03]
+v_mac_f32_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0x42,0x01,0x00]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x06,0x06,0x04]
+v_mac_f32_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0x43,0x01,0x00]
 
-v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x06,0x06,0x05]
+v_mac_f32_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0x30,0x01,0x00]
 
-v_and_b32_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x26,0x00,0x06,0x06,0x0e]
+v_mac_f32_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0x34,0x01,0x00]
 
-v_and_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0xe4,0x00,0x00]
+v_mac_f32_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0x38,0x01,0x00]
 
-v_and_b32_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x27,0x00,0xe4,0x00,0x00]
+v_mac_f32_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0x3c,0x01,0x00]
 
-v_and_b32_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0xff,0xe4,0x00,0x00]
+v_mac_f32_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0x01,0x01,0x00]
 
-v_and_b32_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x26,0x00,0xe4,0x00,0x00]
+v_mac_f32_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0x0f,0x01,0x00]
 
-v_and_b32_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0x1b,0x00,0x00]
+v_mac_f32_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0x11,0x01,0x00]
 
-v_and_b32_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0x40,0x01,0x00]
+v_mac_f32_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0x1f,0x01,0x00]
 
-v_and_b32_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0x41,0x01,0x00]
+v_mac_f32_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0x21,0x01,0x00]
 
-v_and_b32_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0x42,0x01,0x00]
+v_mac_f32_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0x2f,0x01,0x00]
 
-v_and_b32_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0x43,0x01,0x00]
+v_mac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0xe4,0x00,0x10]
 
-v_and_b32_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0x30,0x01,0x00]
+v_mac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0xe4,0x00,0x30]
 
-v_and_b32_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0x34,0x01,0x00]
+v_mac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0xe4,0x00,0xf0]
 
-v_and_b32_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0x38,0x01,0x00]
+v_mac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0xe4,0x00,0xf0]
 
-v_and_b32_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0x3c,0x01,0x00]
+v_mac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0xe4,0x00,0x01]
 
-v_and_b32_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0x01,0x01,0x00]
+v_mac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0xe4,0x00,0x03]
 
-v_and_b32_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0x0f,0x01,0x00]
+v_mac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0xe4,0x00,0x0f]
 
-v_and_b32_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0x11,0x01,0x00]
+v_mac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0xe4,0x00,0x0f]
 
-v_and_b32_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0x1f,0x01,0x00]
+v_mac_f32_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0xe4,0x08,0x00]
 
-v_and_b32_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0x21,0x01,0x00]
+v_mac_f32_dpp v5, -v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0xe4,0x10,0x00]
 
-v_and_b32_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0x2f,0x01,0x00]
+v_mac_f32_dpp v5, |v1|, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0xe4,0x20,0x00]
 
-v_and_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0xe4,0x00,0x10]
+v_mac_f32_dpp v5, v1, -v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0xe4,0x40,0x00]
 
-v_and_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0xe4,0x00,0x30]
+v_mac_f32_dpp v5, v1, |v2| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x2c,0x01,0xe4,0x80,0x00]
 
-v_and_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0xe4,0x00,0xf0]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x06,0x06,0x06]
 
-v_and_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0xe4,0x00,0xf0]
+v_addc_u32_sdwa v255, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x39,0x01,0x06,0x06,0x06]
 
-v_and_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0xe4,0x00,0x01]
+v_addc_u32_sdwa v5, vcc, v255, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0xff,0x06,0x06,0x06]
 
-v_and_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0xe4,0x00,0x03]
+v_addc_u32_sdwa v5, vcc, v1, v255, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x38,0x01,0x06,0x06,0x06]
 
-v_and_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0xe4,0x00,0x0f]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x26,0x06,0x06]
 
-v_and_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0xe4,0x00,0x0f]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x06,0x06,0x06]
 
-v_and_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x26,0x00,0xe4,0x08,0x00]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x00,0x06,0x06]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x06,0x06,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x01,0x06,0x06]
 
-v_or_b32_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x29,0x00,0x06,0x06,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x02,0x06,0x06]
 
-v_or_b32_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0xff,0x06,0x06,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x03,0x06,0x06]
 
-v_or_b32_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x28,0x00,0x06,0x06,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x04,0x06,0x06]
 
-v_or_b32_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x06,0x06,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x05,0x06,0x06]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x00,0x06,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x0e,0x06,0x06]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x01,0x06,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x16,0x06,0x06]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x02,0x06,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x16,0x06,0x06]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x03,0x06,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x06,0x06,0x06]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x04,0x06,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x06,0x00,0x06]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x05,0x06,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x06,0x01,0x06]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x0e,0x06,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x06,0x02,0x06]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x16,0x06,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x06,0x03,0x06]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x16,0x06,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x06,0x04,0x06]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x06,0x06,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x06,0x05,0x06]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x06,0x00,0x06]
+v_addc_u32_sdwa v5, vcc, sext(v1), v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x06,0x0e,0x06]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x06,0x01,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x06,0x06,0x06]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x06,0x02,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x06,0x06,0x00]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x06,0x03,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x06,0x06,0x01]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x06,0x04,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x06,0x06,0x02]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x06,0x05,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x06,0x06,0x03]
 
-v_or_b32_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x06,0x0e,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x06,0x06,0x04]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x06,0x06,0x06]
+v_addc_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x06,0x06,0x05]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x06,0x06,0x00]
+v_addc_u32_sdwa v5, vcc, v1, sext(v2), vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x38,0x01,0x06,0x06,0x0e]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x06,0x06,0x01]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0xe4,0x00,0x00]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x06,0x06,0x02]
+v_addc_u32_dpp v255, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x39,0x01,0xe4,0x00,0x00]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x06,0x06,0x03]
+v_addc_u32_dpp v5, vcc, v255, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0xff,0xe4,0x00,0x00]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x06,0x06,0x04]
+v_addc_u32_dpp v5, vcc, v1, v255, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x38,0x01,0xe4,0x00,0x00]
 
-v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x06,0x06,0x05]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0x1b,0x00,0x00]
 
-v_or_b32_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x28,0x00,0x06,0x06,0x0e]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0x40,0x01,0x00]
 
-v_or_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0xe4,0x00,0x00]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0x41,0x01,0x00]
 
-v_or_b32_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x29,0x00,0xe4,0x00,0x00]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0x42,0x01,0x00]
 
-v_or_b32_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0xff,0xe4,0x00,0x00]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0x43,0x01,0x00]
 
-v_or_b32_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x28,0x00,0xe4,0x00,0x00]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0x30,0x01,0x00]
 
-v_or_b32_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0x1b,0x00,0x00]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0x34,0x01,0x00]
 
-v_or_b32_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0x40,0x01,0x00]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0x38,0x01,0x00]
 
-v_or_b32_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0x41,0x01,0x00]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0x3c,0x01,0x00]
 
-v_or_b32_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0x42,0x01,0x00]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0x01,0x01,0x00]
 
-v_or_b32_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0x43,0x01,0x00]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0x0f,0x01,0x00]
 
-v_or_b32_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0x30,0x01,0x00]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0x11,0x01,0x00]
 
-v_or_b32_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0x34,0x01,0x00]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0x1f,0x01,0x00]
 
-v_or_b32_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0x38,0x01,0x00]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0x21,0x01,0x00]
 
-v_or_b32_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0x3c,0x01,0x00]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0x2f,0x01,0x00]
 
-v_or_b32_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0x01,0x01,0x00]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0xe4,0x00,0x10]
 
-v_or_b32_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0x0f,0x01,0x00]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0xe4,0x00,0x30]
 
-v_or_b32_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0x11,0x01,0x00]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0xe4,0x00,0xf0]
 
-v_or_b32_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0x1f,0x01,0x00]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0xe4,0x00,0xf0]
 
-v_or_b32_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0x21,0x01,0x00]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0xe4,0x00,0x01]
 
-v_or_b32_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0x2f,0x01,0x00]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0xe4,0x00,0x03]
 
-v_or_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0xe4,0x00,0x10]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0xe4,0x00,0x0f]
 
-v_or_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0xe4,0x00,0x30]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0xe4,0x00,0x0f]
 
-v_or_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0xe4,0x00,0xf0]
+v_addc_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x38,0x01,0xe4,0x08,0x00]
 
-v_or_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0xe4,0x00,0xf0]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x06,0x06,0x06]
 
-v_or_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0xe4,0x00,0x01]
+v_subb_u32_sdwa v255, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x3b,0x01,0x06,0x06,0x06]
 
-v_or_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0xe4,0x00,0x03]
+v_subb_u32_sdwa v5, vcc, v255, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0xff,0x06,0x06,0x06]
 
-v_or_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0xe4,0x00,0x0f]
+v_subb_u32_sdwa v5, vcc, v1, v255, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x3a,0x01,0x06,0x06,0x06]
 
-v_or_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0xe4,0x00,0x0f]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x26,0x06,0x06]
 
-v_or_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x28,0x00,0xe4,0x08,0x00]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x06,0x06,0x06]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x06,0x06,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x00,0x06,0x06]
 
-v_xor_b32_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x2b,0x00,0x06,0x06,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x01,0x06,0x06]
 
-v_xor_b32_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0xff,0x06,0x06,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x02,0x06,0x06]
 
-v_xor_b32_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x2a,0x00,0x06,0x06,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x03,0x06,0x06]
 
-v_xor_b32_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x06,0x06,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x04,0x06,0x06]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x00,0x06,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x05,0x06,0x06]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x01,0x06,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x0e,0x06,0x06]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x02,0x06,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x16,0x06,0x06]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x03,0x06,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x16,0x06,0x06]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x04,0x06,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x06,0x06,0x06]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x05,0x06,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x06,0x00,0x06]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x0e,0x06,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x06,0x01,0x06]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x16,0x06,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x06,0x02,0x06]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x16,0x06,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x06,0x03,0x06]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x06,0x06,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x06,0x04,0x06]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x06,0x00,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x06,0x05,0x06]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x06,0x01,0x06]
+v_subb_u32_sdwa v5, vcc, sext(v1), v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x06,0x0e,0x06]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x06,0x02,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x06,0x06,0x06]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x06,0x03,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x06,0x06,0x00]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x06,0x04,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x06,0x06,0x01]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x06,0x05,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x06,0x06,0x02]
 
-v_xor_b32_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x06,0x0e,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x06,0x06,0x03]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x06,0x06,0x06]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x06,0x06,0x04]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x06,0x06,0x00]
+v_subb_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x06,0x06,0x05]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x06,0x06,0x01]
+v_subb_u32_sdwa v5, vcc, v1, sext(v2), vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3a,0x01,0x06,0x06,0x0e]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x06,0x06,0x02]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0xe4,0x00,0x00]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x06,0x06,0x03]
+v_subb_u32_dpp v255, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x3b,0x01,0xe4,0x00,0x00]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x06,0x06,0x04]
+v_subb_u32_dpp v5, vcc, v255, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0xff,0xe4,0x00,0x00]
 
-v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x06,0x06,0x05]
+v_subb_u32_dpp v5, vcc, v1, v255, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x3a,0x01,0xe4,0x00,0x00]
 
-v_xor_b32_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2a,0x00,0x06,0x06,0x0e]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0x1b,0x00,0x00]
 
-v_xor_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0xe4,0x00,0x00]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0x40,0x01,0x00]
 
-v_xor_b32_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x2b,0x00,0xe4,0x00,0x00]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0x41,0x01,0x00]
 
-v_xor_b32_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0xff,0xe4,0x00,0x00]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0x42,0x01,0x00]
 
-v_xor_b32_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x2a,0x00,0xe4,0x00,0x00]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0x43,0x01,0x00]
 
-v_xor_b32_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0x1b,0x00,0x00]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0x30,0x01,0x00]
 
-v_xor_b32_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0x40,0x01,0x00]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0x34,0x01,0x00]
 
-v_xor_b32_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0x41,0x01,0x00]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0x38,0x01,0x00]
 
-v_xor_b32_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0x42,0x01,0x00]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0x3c,0x01,0x00]
 
-v_xor_b32_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0x43,0x01,0x00]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0x01,0x01,0x00]
 
-v_xor_b32_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0x30,0x01,0x00]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0x0f,0x01,0x00]
 
-v_xor_b32_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0x34,0x01,0x00]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0x11,0x01,0x00]
 
-v_xor_b32_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0x38,0x01,0x00]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0x1f,0x01,0x00]
 
-v_xor_b32_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0x3c,0x01,0x00]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0x21,0x01,0x00]
 
-v_xor_b32_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0x01,0x01,0x00]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0x2f,0x01,0x00]
 
-v_xor_b32_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0x0f,0x01,0x00]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0xe4,0x00,0x10]
 
-v_xor_b32_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0x11,0x01,0x00]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0xe4,0x00,0x30]
 
-v_xor_b32_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0x1f,0x01,0x00]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0xe4,0x00,0xf0]
 
-v_xor_b32_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0x21,0x01,0x00]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0xe4,0x00,0xf0]
 
-v_xor_b32_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0x2f,0x01,0x00]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0xe4,0x00,0x01]
 
-v_xor_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0xe4,0x00,0x10]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0xe4,0x00,0x03]
 
-v_xor_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0xe4,0x00,0x30]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0xe4,0x00,0x0f]
 
-v_xor_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0xe4,0x00,0xf0]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0xe4,0x00,0x0f]
 
-v_xor_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0xe4,0x00,0xf0]
+v_subb_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x3a,0x01,0xe4,0x08,0x00]
 
-v_xor_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0xe4,0x00,0x01]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x06,0x06,0x06]
 
-v_xor_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0xe4,0x00,0x03]
+v_subbrev_u32_sdwa v255, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x3d,0x01,0x06,0x06,0x06]
 
-v_xor_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0xe4,0x00,0x0f]
+v_subbrev_u32_sdwa v5, vcc, v255, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0xff,0x06,0x06,0x06]
 
-v_xor_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0xe4,0x00,0x0f]
+v_subbrev_u32_sdwa v5, vcc, v1, v255, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x3c,0x01,0x06,0x06,0x06]
 
-v_xor_b32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x2a,0x00,0xe4,0x08,0x00]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x26,0x06,0x06]
 
-v_mac_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x06,0x06]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x06,0x06,0x06]
 
-v_mac_f32_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x2d,0x00,0x06,0x06,0x06]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x00,0x06,0x06]
 
-v_mac_f32_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2c,0xff,0x06,0x06,0x06]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x01,0x06,0x06]
 
-v_mac_f32_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x2c,0x00,0x06,0x06,0x06]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x02,0x06,0x06]
 
-v_mac_f32_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x26,0x06,0x06]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x03,0x06,0x06]
 
-v_mac_f32_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x06,0x06]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x04,0x06,0x06]
 
-v_mac_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x0e,0x06,0x06]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x05,0x06,0x06]
 
-v_mac_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x16,0x06,0x06]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x0e,0x06,0x06]
 
-v_mac_f32_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x16,0x06,0x06]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x16,0x06,0x06]
 
-v_mac_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x06,0x06]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x16,0x06,0x06]
 
-v_mac_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x00,0x06]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x06,0x06,0x06]
 
-v_mac_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x01,0x06]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x06,0x00,0x06]
 
-v_mac_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x02,0x06]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x06,0x01,0x06]
 
-v_mac_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x03,0x06]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x06,0x02,0x06]
 
-v_mac_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x04,0x06]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x06,0x03,0x06]
 
-v_mac_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x05,0x06]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x06,0x04,0x06]
 
-v_mac_f32_sdwa v0, -v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x16,0x06]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x06,0x05,0x06]
 
-v_mac_f32_sdwa v0, |v0|, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x26,0x06]
+v_subbrev_u32_sdwa v5, vcc, sext(v1), v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x06,0x0e,0x06]
 
-v_mac_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x06,0x06]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x06,0x06,0x06]
 
-v_mac_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x06,0x00]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x06,0x06,0x00]
 
-v_mac_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x06,0x01]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x06,0x06,0x01]
 
-v_mac_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x06,0x02]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x06,0x06,0x02]
 
-v_mac_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x06,0x03]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x06,0x06,0x03]
 
-v_mac_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x06,0x04]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x06,0x06,0x04]
 
-v_mac_f32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x06,0x05]
+v_subbrev_u32_sdwa v5, vcc, v1, v2, vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x06,0x06,0x05]
 
-v_mac_f32_sdwa v0, v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x06,0x16]
+v_subbrev_u32_sdwa v5, vcc, v1, sext(v2), vcc dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3c,0x01,0x06,0x06,0x0e]
 
-v_mac_f32_sdwa v0, v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x2c,0x00,0x06,0x06,0x26]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0xe4,0x00,0x00]
 
-v_mac_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0xe4,0x00,0x00]
+v_subbrev_u32_dpp v255, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x3d,0x01,0xe4,0x00,0x00]
 
-v_mac_f32_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x2d,0x00,0xe4,0x00,0x00]
+v_subbrev_u32_dpp v5, vcc, v255, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0xff,0xe4,0x00,0x00]
 
-v_mac_f32_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0xff,0xe4,0x00,0x00]
+v_subbrev_u32_dpp v5, vcc, v1, v255, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x3c,0x01,0xe4,0x00,0x00]
 
-v_mac_f32_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x2c,0x00,0xe4,0x00,0x00]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0x1b,0x00,0x00]
 
-v_mac_f32_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0x1b,0x00,0x00]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0x40,0x01,0x00]
 
-v_mac_f32_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0x40,0x01,0x00]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0x41,0x01,0x00]
 
-v_mac_f32_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0x41,0x01,0x00]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0x42,0x01,0x00]
 
-v_mac_f32_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0x42,0x01,0x00]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0x43,0x01,0x00]
 
-v_mac_f32_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0x43,0x01,0x00]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0x30,0x01,0x00]
 
-v_mac_f32_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0x30,0x01,0x00]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0x34,0x01,0x00]
 
-v_mac_f32_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0x34,0x01,0x00]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0x38,0x01,0x00]
 
-v_mac_f32_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0x38,0x01,0x00]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0x3c,0x01,0x00]
 
-v_mac_f32_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0x3c,0x01,0x00]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0x01,0x01,0x00]
 
-v_mac_f32_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0x01,0x01,0x00]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0x0f,0x01,0x00]
 
-v_mac_f32_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0x0f,0x01,0x00]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0x11,0x01,0x00]
 
-v_mac_f32_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0x11,0x01,0x00]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0x1f,0x01,0x00]
 
-v_mac_f32_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0x1f,0x01,0x00]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0x21,0x01,0x00]
 
-v_mac_f32_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0x21,0x01,0x00]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0x2f,0x01,0x00]
 
-v_mac_f32_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0x2f,0x01,0x00]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0xe4,0x00,0x10]
 
-v_mac_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0xe4,0x00,0x10]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0xe4,0x00,0x30]
 
-v_mac_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0xe4,0x00,0x30]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0xe4,0x00,0xf0]
 
-v_mac_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0xe4,0x00,0xf0]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0xe4,0x00,0xf0]
 
-v_mac_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0xe4,0x00,0xf0]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0xe4,0x00,0x01]
 
-v_mac_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0xe4,0x00,0x01]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0xe4,0x00,0x03]
 
-v_mac_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0xe4,0x00,0x03]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0xe4,0x00,0x0f]
 
-v_mac_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0xe4,0x00,0x0f]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0xe4,0x00,0x0f]
 
-v_mac_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0xe4,0x00,0x0f]
+v_subbrev_u32_dpp v5, vcc, v1, v2, vcc quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x3c,0x01,0xe4,0x08,0x00]
 
-v_mac_f32_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0xe4,0x08,0x00]
+v_add_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x06,0x06]
 
-v_mac_f32_dpp v0, -v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0xe4,0x10,0x00]
+v_add_f16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x3f,0x01,0x06,0x06,0x06]
 
-v_mac_f32_dpp v0, |v0|, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0xe4,0x20,0x00]
+v_add_f16_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0xff,0x06,0x06,0x06]
 
-v_mac_f32_dpp v0, v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0xe4,0x40,0x00]
+v_add_f16_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x3e,0x01,0x06,0x06,0x06]
 
-v_mac_f32_dpp v0, v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x2c,0x00,0xe4,0x80,0x00]
+v_add_f16_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x26,0x06,0x06]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x06,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x06,0x06]
 
-v_add_f16_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x3f,0x00,0x06,0x06,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x00,0x06,0x06]
 
-v_add_f16_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0xff,0x06,0x06,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x01,0x06,0x06]
 
-v_add_f16_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x3e,0x00,0x06,0x06,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x02,0x06,0x06]
 
-v_add_f16_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x26,0x06,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x03,0x06,0x06]
 
-v_add_f16_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x06,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x04,0x06,0x06]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x00,0x06,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x05,0x06,0x06]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x01,0x06,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x0e,0x06,0x06]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x02,0x06,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x16,0x06,0x06]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x03,0x06,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x16,0x06,0x06]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x04,0x06,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x06,0x06]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x05,0x06,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x00,0x06]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x0e,0x06,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x01,0x06]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x16,0x06,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x02,0x06]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x16,0x06,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x03,0x06]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x06,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x04,0x06]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x00,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x05,0x06]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x01,0x06]
+v_add_f16_sdwa v5, -v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x16,0x06]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x02,0x06]
+v_add_f16_sdwa v5, |v1|, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x26,0x06]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x03,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x06,0x06]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x04,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x06,0x00]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x05,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x06,0x01]
 
-v_add_f16_sdwa v0, -v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x16,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x06,0x02]
 
-v_add_f16_sdwa v0, |v0|, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x26,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x06,0x03]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x06,0x06]
+v_add_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x06,0x04]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x06,0x00]
+v_add_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x06,0x05]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x06,0x01]
+v_add_f16_sdwa v5, v1, -v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x06,0x16]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x06,0x02]
+v_add_f16_sdwa v5, v1, |v2| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x3e,0x01,0x06,0x06,0x26]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x06,0x03]
+v_add_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0xe4,0x00,0x00]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x06,0x04]
+v_add_f16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x3f,0x01,0xe4,0x00,0x00]
 
-v_add_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x06,0x05]
+v_add_f16_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0xff,0xe4,0x00,0x00]
 
-v_add_f16_sdwa v0, v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x06,0x16]
+v_add_f16_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x3e,0x01,0xe4,0x00,0x00]
 
-v_add_f16_sdwa v0, v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x3e,0x00,0x06,0x06,0x26]
+v_add_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0x1b,0x00,0x00]
 
-v_add_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0xe4,0x00,0x00]
+v_add_f16_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0x40,0x01,0x00]
 
-v_add_f16_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x3f,0x00,0xe4,0x00,0x00]
+v_add_f16_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0x41,0x01,0x00]
 
-v_add_f16_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0xff,0xe4,0x00,0x00]
+v_add_f16_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0x42,0x01,0x00]
 
-v_add_f16_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x3e,0x00,0xe4,0x00,0x00]
+v_add_f16_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0x43,0x01,0x00]
 
-v_add_f16_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0x1b,0x00,0x00]
+v_add_f16_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0x30,0x01,0x00]
 
-v_add_f16_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0x40,0x01,0x00]
+v_add_f16_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0x34,0x01,0x00]
 
-v_add_f16_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0x41,0x01,0x00]
+v_add_f16_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0x38,0x01,0x00]
 
-v_add_f16_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0x42,0x01,0x00]
+v_add_f16_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0x3c,0x01,0x00]
 
-v_add_f16_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0x43,0x01,0x00]
+v_add_f16_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0x01,0x01,0x00]
 
-v_add_f16_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0x30,0x01,0x00]
+v_add_f16_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0x0f,0x01,0x00]
 
-v_add_f16_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0x34,0x01,0x00]
+v_add_f16_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0x11,0x01,0x00]
 
-v_add_f16_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0x38,0x01,0x00]
+v_add_f16_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0x1f,0x01,0x00]
 
-v_add_f16_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0x3c,0x01,0x00]
+v_add_f16_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0x21,0x01,0x00]
 
-v_add_f16_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0x01,0x01,0x00]
+v_add_f16_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0x2f,0x01,0x00]
 
-v_add_f16_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0x0f,0x01,0x00]
+v_add_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0xe4,0x00,0x10]
 
-v_add_f16_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0x11,0x01,0x00]
+v_add_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0xe4,0x00,0x30]
 
-v_add_f16_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0x1f,0x01,0x00]
+v_add_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0xe4,0x00,0xf0]
 
-v_add_f16_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0x21,0x01,0x00]
+v_add_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0xe4,0x00,0xf0]
 
-v_add_f16_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0x2f,0x01,0x00]
+v_add_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0xe4,0x00,0x01]
 
-v_add_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0xe4,0x00,0x10]
+v_add_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0xe4,0x00,0x03]
 
-v_add_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0xe4,0x00,0x30]
+v_add_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0xe4,0x00,0x0f]
 
-v_add_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0xe4,0x00,0xf0]
+v_add_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0xe4,0x00,0x0f]
 
-v_add_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0xe4,0x00,0xf0]
+v_add_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0xe4,0x08,0x00]
 
-v_add_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0xe4,0x00,0x01]
+v_add_f16_dpp v5, -v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0xe4,0x10,0x00]
 
-v_add_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0xe4,0x00,0x03]
+v_add_f16_dpp v5, |v1|, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0xe4,0x20,0x00]
 
-v_add_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0xe4,0x00,0x0f]
+v_add_f16_dpp v5, v1, -v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0xe4,0x40,0x00]
 
-v_add_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0xe4,0x00,0x0f]
+v_add_f16_dpp v5, v1, |v2| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x3e,0x01,0xe4,0x80,0x00]
 
-v_add_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0xe4,0x08,0x00]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x06,0x06]
 
-v_add_f16_dpp v0, -v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0xe4,0x10,0x00]
+v_sub_f16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x41,0x01,0x06,0x06,0x06]
 
-v_add_f16_dpp v0, |v0|, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0xe4,0x20,0x00]
+v_sub_f16_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0xff,0x06,0x06,0x06]
 
-v_add_f16_dpp v0, v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0xe4,0x40,0x00]
+v_sub_f16_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x40,0x01,0x06,0x06,0x06]
 
-v_add_f16_dpp v0, v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x3e,0x00,0xe4,0x80,0x00]
+v_sub_f16_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x26,0x06,0x06]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x06,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x06,0x06]
 
-v_sub_f16_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x41,0x00,0x06,0x06,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x00,0x06,0x06]
 
-v_sub_f16_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0xff,0x06,0x06,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x01,0x06,0x06]
 
-v_sub_f16_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x40,0x00,0x06,0x06,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x02,0x06,0x06]
 
-v_sub_f16_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x26,0x06,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x03,0x06,0x06]
 
-v_sub_f16_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x06,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x04,0x06,0x06]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x00,0x06,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x05,0x06,0x06]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x01,0x06,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x0e,0x06,0x06]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x02,0x06,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x16,0x06,0x06]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x03,0x06,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x16,0x06,0x06]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x04,0x06,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x06,0x06]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x05,0x06,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x00,0x06]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x0e,0x06,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x01,0x06]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x16,0x06,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x02,0x06]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x16,0x06,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x03,0x06]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x06,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x04,0x06]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x00,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x05,0x06]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x01,0x06]
+v_sub_f16_sdwa v5, -v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x16,0x06]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x02,0x06]
+v_sub_f16_sdwa v5, |v1|, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x26,0x06]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x03,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x06,0x06]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x04,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x06,0x00]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x05,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x06,0x01]
 
-v_sub_f16_sdwa v0, -v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x16,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x06,0x02]
 
-v_sub_f16_sdwa v0, |v0|, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x26,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x06,0x03]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x06,0x06]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x06,0x04]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x06,0x00]
+v_sub_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x06,0x05]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x06,0x01]
+v_sub_f16_sdwa v5, v1, -v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x06,0x16]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x06,0x02]
+v_sub_f16_sdwa v5, v1, |v2| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x40,0x01,0x06,0x06,0x26]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x06,0x03]
+v_sub_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0x00]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x06,0x04]
+v_sub_f16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x41,0x01,0xe4,0x00,0x00]
 
-v_sub_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x06,0x05]
+v_sub_f16_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0xff,0xe4,0x00,0x00]
 
-v_sub_f16_sdwa v0, v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x06,0x16]
+v_sub_f16_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x40,0x01,0xe4,0x00,0x00]
 
-v_sub_f16_sdwa v0, v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x40,0x00,0x06,0x06,0x26]
+v_sub_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0x1b,0x00,0x00]
 
-v_sub_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0xe4,0x00,0x00]
+v_sub_f16_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0x40,0x01,0x00]
 
-v_sub_f16_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x41,0x00,0xe4,0x00,0x00]
+v_sub_f16_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0x41,0x01,0x00]
 
-v_sub_f16_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0xff,0xe4,0x00,0x00]
+v_sub_f16_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0x42,0x01,0x00]
 
-v_sub_f16_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x40,0x00,0xe4,0x00,0x00]
+v_sub_f16_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0x43,0x01,0x00]
 
-v_sub_f16_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0x1b,0x00,0x00]
+v_sub_f16_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0x30,0x01,0x00]
 
-v_sub_f16_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0x40,0x01,0x00]
+v_sub_f16_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0x34,0x01,0x00]
 
-v_sub_f16_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0x41,0x01,0x00]
+v_sub_f16_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0x38,0x01,0x00]
 
-v_sub_f16_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0x42,0x01,0x00]
+v_sub_f16_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0x3c,0x01,0x00]
 
-v_sub_f16_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0x43,0x01,0x00]
+v_sub_f16_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0x01,0x01,0x00]
 
-v_sub_f16_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0x30,0x01,0x00]
+v_sub_f16_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0x0f,0x01,0x00]
 
-v_sub_f16_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0x34,0x01,0x00]
+v_sub_f16_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0x11,0x01,0x00]
 
-v_sub_f16_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0x38,0x01,0x00]
+v_sub_f16_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0x1f,0x01,0x00]
 
-v_sub_f16_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0x3c,0x01,0x00]
+v_sub_f16_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0x21,0x01,0x00]
 
-v_sub_f16_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0x01,0x01,0x00]
+v_sub_f16_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0x2f,0x01,0x00]
 
-v_sub_f16_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0x0f,0x01,0x00]
+v_sub_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0x10]
 
-v_sub_f16_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0x11,0x01,0x00]
+v_sub_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0x30]
 
-v_sub_f16_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0x1f,0x01,0x00]
+v_sub_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0xf0]
 
-v_sub_f16_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0x21,0x01,0x00]
+v_sub_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0xf0]
 
-v_sub_f16_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0x2f,0x01,0x00]
+v_sub_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0x01]
 
-v_sub_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0xe4,0x00,0x10]
+v_sub_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0x03]
 
-v_sub_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0xe4,0x00,0x30]
+v_sub_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0x0f]
 
-v_sub_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0xe4,0x00,0xf0]
+v_sub_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x00,0x0f]
 
-v_sub_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0xe4,0x00,0xf0]
+v_sub_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x08,0x00]
 
-v_sub_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0xe4,0x00,0x01]
+v_sub_f16_dpp v5, -v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x10,0x00]
 
-v_sub_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0xe4,0x00,0x03]
+v_sub_f16_dpp v5, |v1|, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x20,0x00]
 
-v_sub_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0xe4,0x00,0x0f]
+v_sub_f16_dpp v5, v1, -v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x40,0x00]
 
-v_sub_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0xe4,0x00,0x0f]
+v_sub_f16_dpp v5, v1, |v2| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x40,0x01,0xe4,0x80,0x00]
 
-v_sub_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0xe4,0x08,0x00]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x06,0x06]
 
-v_sub_f16_dpp v0, -v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0xe4,0x10,0x00]
+v_subrev_f16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x43,0x01,0x06,0x06,0x06]
 
-v_sub_f16_dpp v0, |v0|, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0xe4,0x20,0x00]
+v_subrev_f16_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0xff,0x06,0x06,0x06]
 
-v_sub_f16_dpp v0, v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0xe4,0x40,0x00]
+v_subrev_f16_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x42,0x01,0x06,0x06,0x06]
 
-v_sub_f16_dpp v0, v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x40,0x00,0xe4,0x80,0x00]
+v_subrev_f16_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x26,0x06,0x06]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x06,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x06,0x06]
 
-v_subrev_f16_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x43,0x00,0x06,0x06,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x00,0x06,0x06]
 
-v_subrev_f16_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0xff,0x06,0x06,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x01,0x06,0x06]
 
-v_subrev_f16_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x42,0x00,0x06,0x06,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x02,0x06,0x06]
 
-v_subrev_f16_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x26,0x06,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x03,0x06,0x06]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x06,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x04,0x06,0x06]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x00,0x06,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x05,0x06,0x06]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x01,0x06,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x0e,0x06,0x06]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x02,0x06,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x16,0x06,0x06]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x03,0x06,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x16,0x06,0x06]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x04,0x06,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x06,0x06]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x05,0x06,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x00,0x06]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x0e,0x06,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x01,0x06]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x16,0x06,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x02,0x06]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x16,0x06,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x03,0x06]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x06,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x04,0x06]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x00,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x05,0x06]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x01,0x06]
+v_subrev_f16_sdwa v5, -v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x16,0x06]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x02,0x06]
+v_subrev_f16_sdwa v5, |v1|, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x26,0x06]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x03,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x06,0x06]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x04,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x06,0x00]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x05,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x06,0x01]
 
-v_subrev_f16_sdwa v0, -v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x16,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x06,0x02]
 
-v_subrev_f16_sdwa v0, |v0|, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x26,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x06,0x03]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x06,0x06]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x06,0x04]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x06,0x00]
+v_subrev_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x06,0x05]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x06,0x01]
+v_subrev_f16_sdwa v5, v1, -v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x06,0x16]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x06,0x02]
+v_subrev_f16_sdwa v5, v1, |v2| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x42,0x01,0x06,0x06,0x26]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x06,0x03]
+v_subrev_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0x00]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x06,0x04]
+v_subrev_f16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x43,0x01,0xe4,0x00,0x00]
 
-v_subrev_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x06,0x05]
+v_subrev_f16_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0xff,0xe4,0x00,0x00]
 
-v_subrev_f16_sdwa v0, v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x06,0x16]
+v_subrev_f16_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x42,0x01,0xe4,0x00,0x00]
 
-v_subrev_f16_sdwa v0, v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x42,0x00,0x06,0x06,0x26]
+v_subrev_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0x1b,0x00,0x00]
 
-v_subrev_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0xe4,0x00,0x00]
+v_subrev_f16_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0x40,0x01,0x00]
 
-v_subrev_f16_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x43,0x00,0xe4,0x00,0x00]
+v_subrev_f16_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0x41,0x01,0x00]
 
-v_subrev_f16_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0xff,0xe4,0x00,0x00]
+v_subrev_f16_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0x42,0x01,0x00]
 
-v_subrev_f16_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x42,0x00,0xe4,0x00,0x00]
+v_subrev_f16_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0x43,0x01,0x00]
 
-v_subrev_f16_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0x1b,0x00,0x00]
+v_subrev_f16_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0x30,0x01,0x00]
 
-v_subrev_f16_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0x40,0x01,0x00]
+v_subrev_f16_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0x34,0x01,0x00]
 
-v_subrev_f16_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0x41,0x01,0x00]
+v_subrev_f16_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0x38,0x01,0x00]
 
-v_subrev_f16_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0x42,0x01,0x00]
+v_subrev_f16_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0x3c,0x01,0x00]
 
-v_subrev_f16_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0x43,0x01,0x00]
+v_subrev_f16_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0x01,0x01,0x00]
 
-v_subrev_f16_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0x30,0x01,0x00]
+v_subrev_f16_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0x0f,0x01,0x00]
 
-v_subrev_f16_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0x34,0x01,0x00]
+v_subrev_f16_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0x11,0x01,0x00]
 
-v_subrev_f16_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0x38,0x01,0x00]
+v_subrev_f16_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0x1f,0x01,0x00]
 
-v_subrev_f16_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0x3c,0x01,0x00]
+v_subrev_f16_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0x21,0x01,0x00]
 
-v_subrev_f16_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0x01,0x01,0x00]
+v_subrev_f16_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0x2f,0x01,0x00]
 
-v_subrev_f16_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0x0f,0x01,0x00]
+v_subrev_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0x10]
 
-v_subrev_f16_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0x11,0x01,0x00]
+v_subrev_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0x30]
 
-v_subrev_f16_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0x1f,0x01,0x00]
+v_subrev_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0xf0]
 
-v_subrev_f16_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0x21,0x01,0x00]
+v_subrev_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0xf0]
 
-v_subrev_f16_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0x2f,0x01,0x00]
+v_subrev_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0x01]
 
-v_subrev_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0xe4,0x00,0x10]
+v_subrev_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0x03]
 
-v_subrev_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0xe4,0x00,0x30]
+v_subrev_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0x0f]
 
-v_subrev_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0xe4,0x00,0xf0]
+v_subrev_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x00,0x0f]
 
-v_subrev_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0xe4,0x00,0xf0]
+v_subrev_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x08,0x00]
 
-v_subrev_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0xe4,0x00,0x01]
+v_subrev_f16_dpp v5, -v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x10,0x00]
 
-v_subrev_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0xe4,0x00,0x03]
+v_subrev_f16_dpp v5, |v1|, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x20,0x00]
 
-v_subrev_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0xe4,0x00,0x0f]
+v_subrev_f16_dpp v5, v1, -v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x40,0x00]
 
-v_subrev_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0xe4,0x00,0x0f]
+v_subrev_f16_dpp v5, v1, |v2| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x42,0x01,0xe4,0x80,0x00]
 
-v_subrev_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0xe4,0x08,0x00]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x06,0x06]
 
-v_subrev_f16_dpp v0, -v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0xe4,0x10,0x00]
+v_mul_f16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x45,0x01,0x06,0x06,0x06]
 
-v_subrev_f16_dpp v0, |v0|, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0xe4,0x20,0x00]
+v_mul_f16_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0xff,0x06,0x06,0x06]
 
-v_subrev_f16_dpp v0, v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0xe4,0x40,0x00]
+v_mul_f16_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x44,0x01,0x06,0x06,0x06]
 
-v_subrev_f16_dpp v0, v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x42,0x00,0xe4,0x80,0x00]
+v_mul_f16_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x26,0x06,0x06]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x06,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x06,0x06]
 
-v_mul_f16_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x45,0x00,0x06,0x06,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x00,0x06,0x06]
 
-v_mul_f16_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0xff,0x06,0x06,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x01,0x06,0x06]
 
-v_mul_f16_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x44,0x00,0x06,0x06,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x02,0x06,0x06]
 
-v_mul_f16_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x26,0x06,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x03,0x06,0x06]
 
-v_mul_f16_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x06,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x04,0x06,0x06]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x00,0x06,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x05,0x06,0x06]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x01,0x06,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x0e,0x06,0x06]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x02,0x06,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x16,0x06,0x06]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x03,0x06,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x16,0x06,0x06]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x04,0x06,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x06,0x06]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x05,0x06,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x00,0x06]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x0e,0x06,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x01,0x06]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x16,0x06,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x02,0x06]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x16,0x06,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x03,0x06]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x06,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x04,0x06]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x00,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x05,0x06]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x01,0x06]
+v_mul_f16_sdwa v5, -v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x16,0x06]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x02,0x06]
+v_mul_f16_sdwa v5, |v1|, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x26,0x06]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x03,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x06,0x06]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x04,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x06,0x00]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x05,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x06,0x01]
 
-v_mul_f16_sdwa v0, -v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x16,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x06,0x02]
 
-v_mul_f16_sdwa v0, |v0|, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x26,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x06,0x03]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x06,0x06]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x06,0x04]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x06,0x00]
+v_mul_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x06,0x05]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x06,0x01]
+v_mul_f16_sdwa v5, v1, -v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x06,0x16]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x06,0x02]
+v_mul_f16_sdwa v5, v1, |v2| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x44,0x01,0x06,0x06,0x26]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x06,0x03]
+v_mul_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0x00]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x06,0x04]
+v_mul_f16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x45,0x01,0xe4,0x00,0x00]
 
-v_mul_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x06,0x05]
+v_mul_f16_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0xff,0xe4,0x00,0x00]
 
-v_mul_f16_sdwa v0, v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x06,0x16]
+v_mul_f16_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x44,0x01,0xe4,0x00,0x00]
 
-v_mul_f16_sdwa v0, v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x44,0x00,0x06,0x06,0x26]
+v_mul_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0x1b,0x00,0x00]
 
-v_mul_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0xe4,0x00,0x00]
+v_mul_f16_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0x40,0x01,0x00]
 
-v_mul_f16_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x45,0x00,0xe4,0x00,0x00]
+v_mul_f16_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0x41,0x01,0x00]
 
-v_mul_f16_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0xff,0xe4,0x00,0x00]
+v_mul_f16_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0x42,0x01,0x00]
 
-v_mul_f16_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x44,0x00,0xe4,0x00,0x00]
+v_mul_f16_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0x43,0x01,0x00]
 
-v_mul_f16_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0x1b,0x00,0x00]
+v_mul_f16_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0x30,0x01,0x00]
 
-v_mul_f16_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0x40,0x01,0x00]
+v_mul_f16_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0x34,0x01,0x00]
 
-v_mul_f16_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0x41,0x01,0x00]
+v_mul_f16_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0x38,0x01,0x00]
 
-v_mul_f16_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0x42,0x01,0x00]
+v_mul_f16_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0x3c,0x01,0x00]
 
-v_mul_f16_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0x43,0x01,0x00]
+v_mul_f16_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0x01,0x01,0x00]
 
-v_mul_f16_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0x30,0x01,0x00]
+v_mul_f16_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0x0f,0x01,0x00]
 
-v_mul_f16_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0x34,0x01,0x00]
+v_mul_f16_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0x11,0x01,0x00]
 
-v_mul_f16_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0x38,0x01,0x00]
+v_mul_f16_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0x1f,0x01,0x00]
 
-v_mul_f16_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0x3c,0x01,0x00]
+v_mul_f16_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0x21,0x01,0x00]
 
-v_mul_f16_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0x01,0x01,0x00]
+v_mul_f16_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0x2f,0x01,0x00]
 
-v_mul_f16_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0x0f,0x01,0x00]
+v_mul_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0x10]
 
-v_mul_f16_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0x11,0x01,0x00]
+v_mul_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0x30]
 
-v_mul_f16_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0x1f,0x01,0x00]
+v_mul_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0xf0]
 
-v_mul_f16_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0x21,0x01,0x00]
+v_mul_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0xf0]
 
-v_mul_f16_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0x2f,0x01,0x00]
+v_mul_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0x01]
 
-v_mul_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0xe4,0x00,0x10]
+v_mul_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0x03]
 
-v_mul_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0xe4,0x00,0x30]
+v_mul_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0x0f]
 
-v_mul_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0xe4,0x00,0xf0]
+v_mul_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x00,0x0f]
 
-v_mul_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0xe4,0x00,0xf0]
+v_mul_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x08,0x00]
 
-v_mul_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0xe4,0x00,0x01]
+v_mul_f16_dpp v5, -v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x10,0x00]
 
-v_mul_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0xe4,0x00,0x03]
+v_mul_f16_dpp v5, |v1|, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x20,0x00]
 
-v_mul_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0xe4,0x00,0x0f]
+v_mul_f16_dpp v5, v1, -v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x40,0x00]
 
-v_mul_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0xe4,0x00,0x0f]
+v_mul_f16_dpp v5, v1, |v2| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x44,0x01,0xe4,0x80,0x00]
 
-v_mul_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0xe4,0x08,0x00]
+v_mac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x06,0x06]
 
-v_mul_f16_dpp v0, -v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0xe4,0x10,0x00]
+v_mac_f16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x47,0x01,0x06,0x06,0x06]
 
-v_mul_f16_dpp v0, |v0|, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0xe4,0x20,0x00]
+v_mac_f16_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x46,0xff,0x06,0x06,0x06]
 
-v_mul_f16_dpp v0, v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0xe4,0x40,0x00]
+v_mac_f16_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x46,0x01,0x06,0x06,0x06]
 
-v_mul_f16_dpp v0, v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x44,0x00,0xe4,0x80,0x00]
+v_mac_f16_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x26,0x06,0x06]
 
-v_mac_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x06,0x06]
+v_mac_f16_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x06,0x06]
 
-v_mac_f16_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x47,0x00,0x06,0x06,0x06]
+v_mac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x0e,0x06,0x06]
 
-v_mac_f16_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x46,0xff,0x06,0x06,0x06]
+v_mac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x16,0x06,0x06]
 
-v_mac_f16_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x46,0x00,0x06,0x06,0x06]
+v_mac_f16_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x16,0x06,0x06]
 
-v_mac_f16_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x26,0x06,0x06]
+v_mac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x06,0x06]
 
-v_mac_f16_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x06,0x06]
+v_mac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x00,0x06]
 
-v_mac_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x0e,0x06,0x06]
+v_mac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x01,0x06]
 
-v_mac_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x16,0x06,0x06]
+v_mac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x02,0x06]
 
-v_mac_f16_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x16,0x06,0x06]
+v_mac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x03,0x06]
 
-v_mac_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x06,0x06]
+v_mac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x04,0x06]
 
-v_mac_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x00,0x06]
+v_mac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x05,0x06]
 
-v_mac_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x01,0x06]
+v_mac_f16_sdwa v5, -v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x16,0x06]
 
-v_mac_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x02,0x06]
+v_mac_f16_sdwa v5, |v1|, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x26,0x06]
 
-v_mac_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x03,0x06]
+v_mac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x06,0x06]
 
-v_mac_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x04,0x06]
+v_mac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x06,0x00]
 
-v_mac_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x05,0x06]
+v_mac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x06,0x01]
 
-v_mac_f16_sdwa v0, -v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x16,0x06]
+v_mac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x06,0x02]
 
-v_mac_f16_sdwa v0, |v0|, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x26,0x06]
+v_mac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x06,0x03]
 
-v_mac_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x06,0x06]
+v_mac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x06,0x04]
 
-v_mac_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x06,0x00]
+v_mac_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x06,0x05]
 
-v_mac_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x06,0x01]
+v_mac_f16_sdwa v5, v1, -v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x06,0x16]
 
-v_mac_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x06,0x02]
+v_mac_f16_sdwa v5, v1, |v2| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x46,0x01,0x06,0x06,0x26]
 
-v_mac_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x06,0x03]
+v_mac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0xe4,0x00,0x00]
 
-v_mac_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x06,0x04]
+v_mac_f16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x47,0x01,0xe4,0x00,0x00]
 
-v_mac_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x06,0x05]
+v_mac_f16_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0xff,0xe4,0x00,0x00]
 
-v_mac_f16_sdwa v0, v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x06,0x16]
+v_mac_f16_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x46,0x01,0xe4,0x00,0x00]
 
-v_mac_f16_sdwa v0, v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x46,0x00,0x06,0x06,0x26]
+v_mac_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0x1b,0x00,0x00]
 
-v_mac_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0xe4,0x00,0x00]
+v_mac_f16_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0x40,0x01,0x00]
 
-v_mac_f16_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x47,0x00,0xe4,0x00,0x00]
+v_mac_f16_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0x41,0x01,0x00]
 
-v_mac_f16_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0xff,0xe4,0x00,0x00]
+v_mac_f16_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0x42,0x01,0x00]
 
-v_mac_f16_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x46,0x00,0xe4,0x00,0x00]
+v_mac_f16_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0x43,0x01,0x00]
 
-v_mac_f16_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0x1b,0x00,0x00]
+v_mac_f16_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0x30,0x01,0x00]
 
-v_mac_f16_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0x40,0x01,0x00]
+v_mac_f16_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0x34,0x01,0x00]
 
-v_mac_f16_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0x41,0x01,0x00]
+v_mac_f16_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0x38,0x01,0x00]
 
-v_mac_f16_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0x42,0x01,0x00]
+v_mac_f16_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0x3c,0x01,0x00]
 
-v_mac_f16_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0x43,0x01,0x00]
+v_mac_f16_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0x01,0x01,0x00]
 
-v_mac_f16_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0x30,0x01,0x00]
+v_mac_f16_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0x0f,0x01,0x00]
 
-v_mac_f16_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0x34,0x01,0x00]
+v_mac_f16_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0x11,0x01,0x00]
 
-v_mac_f16_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0x38,0x01,0x00]
+v_mac_f16_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0x1f,0x01,0x00]
 
-v_mac_f16_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0x3c,0x01,0x00]
+v_mac_f16_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0x21,0x01,0x00]
 
-v_mac_f16_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0x01,0x01,0x00]
+v_mac_f16_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0x2f,0x01,0x00]
 
-v_mac_f16_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0x0f,0x01,0x00]
+v_mac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0xe4,0x00,0x10]
 
-v_mac_f16_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0x11,0x01,0x00]
+v_mac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0xe4,0x00,0x30]
 
-v_mac_f16_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0x1f,0x01,0x00]
+v_mac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0xe4,0x00,0xf0]
 
-v_mac_f16_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0x21,0x01,0x00]
+v_mac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0xe4,0x00,0xf0]
 
-v_mac_f16_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0x2f,0x01,0x00]
+v_mac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0xe4,0x00,0x01]
 
-v_mac_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0xe4,0x00,0x10]
+v_mac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0xe4,0x00,0x03]
 
-v_mac_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0xe4,0x00,0x30]
+v_mac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0xe4,0x00,0x0f]
 
-v_mac_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0xe4,0x00,0xf0]
+v_mac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0xe4,0x00,0x0f]
 
-v_mac_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0xe4,0x00,0xf0]
+v_mac_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0xe4,0x08,0x00]
 
-v_mac_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0xe4,0x00,0x01]
+v_mac_f16_dpp v5, -v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0xe4,0x10,0x00]
 
-v_mac_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0xe4,0x00,0x03]
+v_mac_f16_dpp v5, |v1|, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0xe4,0x20,0x00]
 
-v_mac_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0xe4,0x00,0x0f]
+v_mac_f16_dpp v5, v1, -v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0xe4,0x40,0x00]
 
-v_mac_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0xe4,0x00,0x0f]
+v_mac_f16_dpp v5, v1, |v2| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x46,0x01,0xe4,0x80,0x00]
 
-v_mac_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0xe4,0x08,0x00]
+v_add_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x06,0x06,0x06]
 
-v_mac_f16_dpp v0, -v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0xe4,0x10,0x00]
+v_add_u16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x4d,0x01,0x06,0x06,0x06]
 
-v_mac_f16_dpp v0, |v0|, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0xe4,0x20,0x00]
+v_add_u16_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0xff,0x06,0x06,0x06]
 
-v_mac_f16_dpp v0, v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0xe4,0x40,0x00]
+v_add_u16_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x4c,0x01,0x06,0x06,0x06]
 
-v_mac_f16_dpp v0, v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x46,0x00,0xe4,0x80,0x00]
+v_add_u16_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x26,0x06,0x06]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x06,0x06,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x06,0x06,0x06]
 
-v_add_u16_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x4d,0x00,0x06,0x06,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x00,0x06,0x06]
 
-v_add_u16_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0xff,0x06,0x06,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x01,0x06,0x06]
 
-v_add_u16_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x4c,0x00,0x06,0x06,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x02,0x06,0x06]
 
-v_add_u16_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x26,0x06,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x03,0x06,0x06]
 
-v_add_u16_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x06,0x06,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x04,0x06,0x06]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x00,0x06,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x05,0x06,0x06]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x01,0x06,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x0e,0x06,0x06]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x02,0x06,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x16,0x06,0x06]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x03,0x06,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x16,0x06,0x06]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x04,0x06,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x06,0x06,0x06]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x05,0x06,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x06,0x00,0x06]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x0e,0x06,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x06,0x01,0x06]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x16,0x06,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x06,0x02,0x06]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x16,0x06,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x06,0x03,0x06]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x06,0x06,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x06,0x04,0x06]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x06,0x00,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x06,0x05,0x06]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x06,0x01,0x06]
+v_add_u16_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x06,0x0e,0x06]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x06,0x02,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x06,0x06,0x06]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x06,0x03,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x06,0x06,0x00]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x06,0x04,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x06,0x06,0x01]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x06,0x05,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x06,0x06,0x02]
 
-v_add_u16_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x06,0x0e,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x06,0x06,0x03]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x06,0x06,0x06]
+v_add_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x06,0x06,0x04]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x06,0x06,0x00]
+v_add_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x06,0x06,0x05]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x06,0x06,0x01]
+v_add_u16_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4c,0x01,0x06,0x06,0x0e]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x06,0x06,0x02]
+v_add_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0xe4,0x00,0x00]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x06,0x06,0x03]
+v_add_u16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x4d,0x01,0xe4,0x00,0x00]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x06,0x06,0x04]
+v_add_u16_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0xff,0xe4,0x00,0x00]
 
-v_add_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x06,0x06,0x05]
+v_add_u16_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x4c,0x01,0xe4,0x00,0x00]
 
-v_add_u16_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4c,0x00,0x06,0x06,0x0e]
+v_add_u16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0x1b,0x00,0x00]
 
-v_add_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0xe4,0x00,0x00]
+v_add_u16_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0x40,0x01,0x00]
 
-v_add_u16_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x4d,0x00,0xe4,0x00,0x00]
+v_add_u16_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0x41,0x01,0x00]
 
-v_add_u16_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0xff,0xe4,0x00,0x00]
+v_add_u16_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0x42,0x01,0x00]
 
-v_add_u16_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x4c,0x00,0xe4,0x00,0x00]
+v_add_u16_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0x43,0x01,0x00]
 
-v_add_u16_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0x1b,0x00,0x00]
+v_add_u16_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0x30,0x01,0x00]
 
-v_add_u16_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0x40,0x01,0x00]
+v_add_u16_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0x34,0x01,0x00]
 
-v_add_u16_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0x41,0x01,0x00]
+v_add_u16_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0x38,0x01,0x00]
 
-v_add_u16_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0x42,0x01,0x00]
+v_add_u16_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0x3c,0x01,0x00]
 
-v_add_u16_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0x43,0x01,0x00]
+v_add_u16_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0x01,0x01,0x00]
 
-v_add_u16_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0x30,0x01,0x00]
+v_add_u16_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0x0f,0x01,0x00]
 
-v_add_u16_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0x34,0x01,0x00]
+v_add_u16_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0x11,0x01,0x00]
 
-v_add_u16_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0x38,0x01,0x00]
+v_add_u16_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0x1f,0x01,0x00]
 
-v_add_u16_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0x3c,0x01,0x00]
+v_add_u16_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0x21,0x01,0x00]
 
-v_add_u16_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0x01,0x01,0x00]
+v_add_u16_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0x2f,0x01,0x00]
 
-v_add_u16_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0x0f,0x01,0x00]
+v_add_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0xe4,0x00,0x10]
 
-v_add_u16_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0x11,0x01,0x00]
+v_add_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0xe4,0x00,0x30]
 
-v_add_u16_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0x1f,0x01,0x00]
+v_add_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0xe4,0x00,0xf0]
 
-v_add_u16_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0x21,0x01,0x00]
+v_add_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0xe4,0x00,0xf0]
 
-v_add_u16_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0x2f,0x01,0x00]
+v_add_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0xe4,0x00,0x01]
 
-v_add_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0xe4,0x00,0x10]
+v_add_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0xe4,0x00,0x03]
 
-v_add_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0xe4,0x00,0x30]
+v_add_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0xe4,0x00,0x0f]
 
-v_add_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0xe4,0x00,0xf0]
+v_add_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0xe4,0x00,0x0f]
 
-v_add_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0xe4,0x00,0xf0]
+v_add_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x4c,0x01,0xe4,0x08,0x00]
 
-v_add_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0xe4,0x00,0x01]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x06,0x06,0x06]
 
-v_add_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0xe4,0x00,0x03]
+v_sub_u16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x4f,0x01,0x06,0x06,0x06]
 
-v_add_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0xe4,0x00,0x0f]
+v_sub_u16_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0xff,0x06,0x06,0x06]
 
-v_add_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0xe4,0x00,0x0f]
+v_sub_u16_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x4e,0x01,0x06,0x06,0x06]
 
-v_add_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x4c,0x00,0xe4,0x08,0x00]
+v_sub_u16_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x26,0x06,0x06]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x06,0x06,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x06,0x06,0x06]
 
-v_sub_u16_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x4f,0x00,0x06,0x06,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x00,0x06,0x06]
 
-v_sub_u16_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0xff,0x06,0x06,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x01,0x06,0x06]
 
-v_sub_u16_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x4e,0x00,0x06,0x06,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x02,0x06,0x06]
 
-v_sub_u16_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x26,0x06,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x03,0x06,0x06]
 
-v_sub_u16_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x06,0x06,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x04,0x06,0x06]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x00,0x06,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x05,0x06,0x06]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x01,0x06,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x0e,0x06,0x06]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x02,0x06,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x16,0x06,0x06]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x03,0x06,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x16,0x06,0x06]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x04,0x06,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x06,0x06,0x06]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x05,0x06,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x06,0x00,0x06]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x0e,0x06,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x06,0x01,0x06]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x16,0x06,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x06,0x02,0x06]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x16,0x06,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x06,0x03,0x06]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x06,0x06,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x06,0x04,0x06]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x06,0x00,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x06,0x05,0x06]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x06,0x01,0x06]
+v_sub_u16_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x06,0x0e,0x06]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x06,0x02,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x06,0x06,0x06]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x06,0x03,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x06,0x06,0x00]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x06,0x04,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x06,0x06,0x01]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x06,0x05,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x06,0x06,0x02]
 
-v_sub_u16_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x06,0x0e,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x06,0x06,0x03]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x06,0x06,0x06]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x06,0x06,0x04]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x06,0x06,0x00]
+v_sub_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x06,0x06,0x05]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x06,0x06,0x01]
+v_sub_u16_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x4e,0x01,0x06,0x06,0x0e]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x06,0x06,0x02]
+v_sub_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0xe4,0x00,0x00]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x06,0x06,0x03]
+v_sub_u16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x4f,0x01,0xe4,0x00,0x00]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x06,0x06,0x04]
+v_sub_u16_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0xff,0xe4,0x00,0x00]
 
-v_sub_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x06,0x06,0x05]
+v_sub_u16_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x4e,0x01,0xe4,0x00,0x00]
 
-v_sub_u16_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x4e,0x00,0x06,0x06,0x0e]
+v_sub_u16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0x1b,0x00,0x00]
 
-v_sub_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0xe4,0x00,0x00]
+v_sub_u16_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0x40,0x01,0x00]
 
-v_sub_u16_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x4f,0x00,0xe4,0x00,0x00]
+v_sub_u16_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0x41,0x01,0x00]
 
-v_sub_u16_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0xff,0xe4,0x00,0x00]
+v_sub_u16_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0x42,0x01,0x00]
 
-v_sub_u16_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x4e,0x00,0xe4,0x00,0x00]
+v_sub_u16_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0x43,0x01,0x00]
 
-v_sub_u16_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0x1b,0x00,0x00]
+v_sub_u16_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0x30,0x01,0x00]
 
-v_sub_u16_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0x40,0x01,0x00]
+v_sub_u16_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0x34,0x01,0x00]
 
-v_sub_u16_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0x41,0x01,0x00]
+v_sub_u16_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0x38,0x01,0x00]
 
-v_sub_u16_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0x42,0x01,0x00]
+v_sub_u16_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0x3c,0x01,0x00]
 
-v_sub_u16_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0x43,0x01,0x00]
+v_sub_u16_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0x01,0x01,0x00]
 
-v_sub_u16_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0x30,0x01,0x00]
+v_sub_u16_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0x0f,0x01,0x00]
 
-v_sub_u16_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0x34,0x01,0x00]
+v_sub_u16_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0x11,0x01,0x00]
 
-v_sub_u16_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0x38,0x01,0x00]
+v_sub_u16_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0x1f,0x01,0x00]
 
-v_sub_u16_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0x3c,0x01,0x00]
+v_sub_u16_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0x21,0x01,0x00]
 
-v_sub_u16_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0x01,0x01,0x00]
+v_sub_u16_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0x2f,0x01,0x00]
 
-v_sub_u16_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0x0f,0x01,0x00]
+v_sub_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0xe4,0x00,0x10]
 
-v_sub_u16_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0x11,0x01,0x00]
+v_sub_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0xe4,0x00,0x30]
 
-v_sub_u16_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0x1f,0x01,0x00]
+v_sub_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0xe4,0x00,0xf0]
 
-v_sub_u16_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0x21,0x01,0x00]
+v_sub_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0xe4,0x00,0xf0]
 
-v_sub_u16_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0x2f,0x01,0x00]
+v_sub_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0xe4,0x00,0x01]
 
-v_sub_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0xe4,0x00,0x10]
+v_sub_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0xe4,0x00,0x03]
 
-v_sub_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0xe4,0x00,0x30]
+v_sub_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0xe4,0x00,0x0f]
 
-v_sub_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0xe4,0x00,0xf0]
+v_sub_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0xe4,0x00,0x0f]
 
-v_sub_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0xe4,0x00,0xf0]
+v_sub_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x4e,0x01,0xe4,0x08,0x00]
 
-v_sub_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0xe4,0x00,0x01]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x06,0x06,0x06]
 
-v_sub_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0xe4,0x00,0x03]
+v_subrev_u16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x51,0x01,0x06,0x06,0x06]
 
-v_sub_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0xe4,0x00,0x0f]
+v_subrev_u16_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0xff,0x06,0x06,0x06]
 
-v_sub_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0xe4,0x00,0x0f]
+v_subrev_u16_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x50,0x01,0x06,0x06,0x06]
 
-v_sub_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x4e,0x00,0xe4,0x08,0x00]
+v_subrev_u16_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x26,0x06,0x06]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x06,0x06,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x06,0x06,0x06]
 
-v_subrev_u16_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x51,0x00,0x06,0x06,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x00,0x06,0x06]
 
-v_subrev_u16_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0xff,0x06,0x06,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x01,0x06,0x06]
 
-v_subrev_u16_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x50,0x00,0x06,0x06,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x02,0x06,0x06]
 
-v_subrev_u16_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x26,0x06,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x03,0x06,0x06]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x06,0x06,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x04,0x06,0x06]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x00,0x06,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x05,0x06,0x06]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x01,0x06,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x0e,0x06,0x06]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x02,0x06,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x16,0x06,0x06]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x03,0x06,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x16,0x06,0x06]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x04,0x06,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x06,0x06,0x06]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x05,0x06,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x06,0x00,0x06]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x0e,0x06,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x06,0x01,0x06]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x16,0x06,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x06,0x02,0x06]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x16,0x06,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x06,0x03,0x06]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x06,0x06,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x06,0x04,0x06]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x06,0x00,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x06,0x05,0x06]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x06,0x01,0x06]
+v_subrev_u16_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x06,0x0e,0x06]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x06,0x02,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x06,0x06,0x06]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x06,0x03,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x06,0x06,0x00]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x06,0x04,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x06,0x06,0x01]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x06,0x05,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x06,0x06,0x02]
 
-v_subrev_u16_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x06,0x0e,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x06,0x06,0x03]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x06,0x06,0x06]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x06,0x06,0x04]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x06,0x06,0x00]
+v_subrev_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x06,0x06,0x05]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x06,0x06,0x01]
+v_subrev_u16_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x50,0x01,0x06,0x06,0x0e]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x06,0x06,0x02]
+v_subrev_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x00]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x06,0x06,0x03]
+v_subrev_u16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x51,0x01,0xe4,0x00,0x00]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x06,0x06,0x04]
+v_subrev_u16_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0xff,0xe4,0x00,0x00]
 
-v_subrev_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x06,0x06,0x05]
+v_subrev_u16_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x50,0x01,0xe4,0x00,0x00]
 
-v_subrev_u16_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x50,0x00,0x06,0x06,0x0e]
+v_subrev_u16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0x1b,0x00,0x00]
 
-v_subrev_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0xe4,0x00,0x00]
+v_subrev_u16_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0x40,0x01,0x00]
 
-v_subrev_u16_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x51,0x00,0xe4,0x00,0x00]
+v_subrev_u16_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0x41,0x01,0x00]
 
-v_subrev_u16_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0xff,0xe4,0x00,0x00]
+v_subrev_u16_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0x42,0x01,0x00]
 
-v_subrev_u16_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x50,0x00,0xe4,0x00,0x00]
+v_subrev_u16_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0x43,0x01,0x00]
 
-v_subrev_u16_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0x1b,0x00,0x00]
+v_subrev_u16_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0x30,0x01,0x00]
 
-v_subrev_u16_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0x40,0x01,0x00]
+v_subrev_u16_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0x34,0x01,0x00]
 
-v_subrev_u16_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0x41,0x01,0x00]
+v_subrev_u16_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0x38,0x01,0x00]
 
-v_subrev_u16_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0x42,0x01,0x00]
+v_subrev_u16_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0x3c,0x01,0x00]
 
-v_subrev_u16_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0x43,0x01,0x00]
+v_subrev_u16_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0x01,0x01,0x00]
 
-v_subrev_u16_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0x30,0x01,0x00]
+v_subrev_u16_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0x0f,0x01,0x00]
 
-v_subrev_u16_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0x34,0x01,0x00]
+v_subrev_u16_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0x11,0x01,0x00]
 
-v_subrev_u16_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0x38,0x01,0x00]
+v_subrev_u16_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0x1f,0x01,0x00]
 
-v_subrev_u16_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0x3c,0x01,0x00]
+v_subrev_u16_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0x21,0x01,0x00]
 
-v_subrev_u16_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0x01,0x01,0x00]
+v_subrev_u16_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0x2f,0x01,0x00]
 
-v_subrev_u16_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0x0f,0x01,0x00]
+v_subrev_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x10]
 
-v_subrev_u16_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0x11,0x01,0x00]
+v_subrev_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x30]
 
-v_subrev_u16_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0x1f,0x01,0x00]
+v_subrev_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0xf0]
 
-v_subrev_u16_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0x21,0x01,0x00]
+v_subrev_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0xf0]
 
-v_subrev_u16_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0x2f,0x01,0x00]
+v_subrev_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x01]
 
-v_subrev_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0xe4,0x00,0x10]
+v_subrev_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x03]
 
-v_subrev_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0xe4,0x00,0x30]
+v_subrev_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x0f]
 
-v_subrev_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0xe4,0x00,0xf0]
+v_subrev_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0xe4,0x00,0x0f]
 
-v_subrev_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0xe4,0x00,0xf0]
+v_subrev_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x50,0x01,0xe4,0x08,0x00]
 
-v_subrev_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0xe4,0x00,0x01]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x06,0x06,0x06]
 
-v_subrev_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0xe4,0x00,0x03]
+v_mul_lo_u16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x53,0x01,0x06,0x06,0x06]
 
-v_subrev_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0xe4,0x00,0x0f]
+v_mul_lo_u16_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0xff,0x06,0x06,0x06]
 
-v_subrev_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0xe4,0x00,0x0f]
+v_mul_lo_u16_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x52,0x01,0x06,0x06,0x06]
 
-v_subrev_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x50,0x00,0xe4,0x08,0x00]
+v_mul_lo_u16_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x26,0x06,0x06]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x06,0x06,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x06,0x06,0x06]
 
-v_mul_lo_u16_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x53,0x00,0x06,0x06,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x00,0x06,0x06]
 
-v_mul_lo_u16_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0xff,0x06,0x06,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x01,0x06,0x06]
 
-v_mul_lo_u16_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x52,0x00,0x06,0x06,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x02,0x06,0x06]
 
-v_mul_lo_u16_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x26,0x06,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x03,0x06,0x06]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x06,0x06,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x04,0x06,0x06]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x00,0x06,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x05,0x06,0x06]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x01,0x06,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x0e,0x06,0x06]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x02,0x06,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x16,0x06,0x06]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x03,0x06,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x16,0x06,0x06]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x04,0x06,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x06,0x06,0x06]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x05,0x06,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x06,0x00,0x06]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x0e,0x06,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x06,0x01,0x06]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x16,0x06,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x06,0x02,0x06]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x16,0x06,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x06,0x03,0x06]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x06,0x06,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x06,0x04,0x06]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x06,0x00,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x06,0x05,0x06]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x06,0x01,0x06]
+v_mul_lo_u16_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x06,0x0e,0x06]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x06,0x02,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x06,0x06,0x06]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x06,0x03,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x06,0x06,0x00]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x06,0x04,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x06,0x06,0x01]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x06,0x05,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x06,0x06,0x02]
 
-v_mul_lo_u16_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x06,0x0e,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x06,0x06,0x03]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x06,0x06,0x06]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x06,0x06,0x04]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x06,0x06,0x00]
+v_mul_lo_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x06,0x06,0x05]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x06,0x06,0x01]
+v_mul_lo_u16_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x52,0x01,0x06,0x06,0x0e]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x06,0x06,0x02]
+v_mul_lo_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0xe4,0x00,0x00]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x06,0x06,0x03]
+v_mul_lo_u16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x53,0x01,0xe4,0x00,0x00]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x06,0x06,0x04]
+v_mul_lo_u16_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0xff,0xe4,0x00,0x00]
 
-v_mul_lo_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x06,0x06,0x05]
+v_mul_lo_u16_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x52,0x01,0xe4,0x00,0x00]
 
-v_mul_lo_u16_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x52,0x00,0x06,0x06,0x0e]
+v_mul_lo_u16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0x1b,0x00,0x00]
 
-v_mul_lo_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0xe4,0x00,0x00]
+v_mul_lo_u16_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0x40,0x01,0x00]
 
-v_mul_lo_u16_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x53,0x00,0xe4,0x00,0x00]
+v_mul_lo_u16_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0x41,0x01,0x00]
 
-v_mul_lo_u16_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0xff,0xe4,0x00,0x00]
+v_mul_lo_u16_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0x42,0x01,0x00]
 
-v_mul_lo_u16_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x52,0x00,0xe4,0x00,0x00]
+v_mul_lo_u16_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0x43,0x01,0x00]
 
-v_mul_lo_u16_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0x1b,0x00,0x00]
+v_mul_lo_u16_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0x30,0x01,0x00]
 
-v_mul_lo_u16_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0x40,0x01,0x00]
+v_mul_lo_u16_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0x34,0x01,0x00]
 
-v_mul_lo_u16_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0x41,0x01,0x00]
+v_mul_lo_u16_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0x38,0x01,0x00]
 
-v_mul_lo_u16_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0x42,0x01,0x00]
+v_mul_lo_u16_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0x3c,0x01,0x00]
 
-v_mul_lo_u16_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0x43,0x01,0x00]
+v_mul_lo_u16_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0x01,0x01,0x00]
 
-v_mul_lo_u16_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0x30,0x01,0x00]
+v_mul_lo_u16_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0x0f,0x01,0x00]
 
-v_mul_lo_u16_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0x34,0x01,0x00]
+v_mul_lo_u16_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0x11,0x01,0x00]
 
-v_mul_lo_u16_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0x38,0x01,0x00]
+v_mul_lo_u16_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0x1f,0x01,0x00]
 
-v_mul_lo_u16_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0x3c,0x01,0x00]
+v_mul_lo_u16_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0x21,0x01,0x00]
 
-v_mul_lo_u16_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0x01,0x01,0x00]
+v_mul_lo_u16_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0x2f,0x01,0x00]
 
-v_mul_lo_u16_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0x0f,0x01,0x00]
+v_mul_lo_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0xe4,0x00,0x10]
 
-v_mul_lo_u16_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0x11,0x01,0x00]
+v_mul_lo_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0xe4,0x00,0x30]
 
-v_mul_lo_u16_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0x1f,0x01,0x00]
+v_mul_lo_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0xe4,0x00,0xf0]
 
-v_mul_lo_u16_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0x21,0x01,0x00]
+v_mul_lo_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0xe4,0x00,0xf0]
 
-v_mul_lo_u16_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0x2f,0x01,0x00]
+v_mul_lo_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0xe4,0x00,0x01]
 
-v_mul_lo_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0xe4,0x00,0x10]
+v_mul_lo_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0xe4,0x00,0x03]
 
-v_mul_lo_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0xe4,0x00,0x30]
+v_mul_lo_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0xe4,0x00,0x0f]
 
-v_mul_lo_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0xe4,0x00,0xf0]
+v_mul_lo_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0xe4,0x00,0x0f]
 
-v_mul_lo_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0xe4,0x00,0xf0]
+v_mul_lo_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x52,0x01,0xe4,0x08,0x00]
 
-v_mul_lo_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0xe4,0x00,0x01]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x06,0x06,0x06]
 
-v_mul_lo_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0xe4,0x00,0x03]
+v_lshlrev_b16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x55,0x01,0x06,0x06,0x06]
 
-v_mul_lo_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0xe4,0x00,0x0f]
+v_lshlrev_b16_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0xff,0x06,0x06,0x06]
 
-v_mul_lo_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0xe4,0x00,0x0f]
+v_lshlrev_b16_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x54,0x01,0x06,0x06,0x06]
 
-v_mul_lo_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x52,0x00,0xe4,0x08,0x00]
+v_lshlrev_b16_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x26,0x06,0x06]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x06,0x06,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x06,0x06,0x06]
 
-v_lshlrev_b16_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x55,0x00,0x06,0x06,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x00,0x06,0x06]
 
-v_lshlrev_b16_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0xff,0x06,0x06,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x01,0x06,0x06]
 
-v_lshlrev_b16_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x54,0x00,0x06,0x06,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x02,0x06,0x06]
 
-v_lshlrev_b16_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x26,0x06,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x03,0x06,0x06]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x06,0x06,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x04,0x06,0x06]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x00,0x06,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x05,0x06,0x06]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x01,0x06,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x0e,0x06,0x06]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x02,0x06,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x16,0x06,0x06]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x03,0x06,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x16,0x06,0x06]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x04,0x06,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x06,0x06,0x06]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x05,0x06,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x06,0x00,0x06]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x0e,0x06,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x06,0x01,0x06]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x16,0x06,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x06,0x02,0x06]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x16,0x06,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x06,0x03,0x06]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x06,0x06,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x06,0x04,0x06]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x06,0x00,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x06,0x05,0x06]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x06,0x01,0x06]
+v_lshlrev_b16_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x06,0x0e,0x06]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x06,0x02,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x06,0x06,0x06]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x06,0x03,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x06,0x06,0x00]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x06,0x04,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x06,0x06,0x01]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x06,0x05,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x06,0x06,0x02]
 
-v_lshlrev_b16_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x06,0x0e,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x06,0x06,0x03]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x06,0x06,0x06]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x06,0x06,0x04]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x06,0x06,0x00]
+v_lshlrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x06,0x06,0x05]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x06,0x06,0x01]
+v_lshlrev_b16_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x54,0x01,0x06,0x06,0x0e]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x06,0x06,0x02]
+v_lshlrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x00]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x06,0x06,0x03]
+v_lshlrev_b16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x55,0x01,0xe4,0x00,0x00]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x06,0x06,0x04]
+v_lshlrev_b16_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0xff,0xe4,0x00,0x00]
 
-v_lshlrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x06,0x06,0x05]
+v_lshlrev_b16_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x54,0x01,0xe4,0x00,0x00]
 
-v_lshlrev_b16_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x54,0x00,0x06,0x06,0x0e]
+v_lshlrev_b16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0x1b,0x00,0x00]
 
-v_lshlrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0xe4,0x00,0x00]
+v_lshlrev_b16_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0x40,0x01,0x00]
 
-v_lshlrev_b16_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x55,0x00,0xe4,0x00,0x00]
+v_lshlrev_b16_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0x41,0x01,0x00]
 
-v_lshlrev_b16_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0xff,0xe4,0x00,0x00]
+v_lshlrev_b16_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0x42,0x01,0x00]
 
-v_lshlrev_b16_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x54,0x00,0xe4,0x00,0x00]
+v_lshlrev_b16_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0x43,0x01,0x00]
 
-v_lshlrev_b16_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0x1b,0x00,0x00]
+v_lshlrev_b16_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0x30,0x01,0x00]
 
-v_lshlrev_b16_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0x40,0x01,0x00]
+v_lshlrev_b16_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0x34,0x01,0x00]
 
-v_lshlrev_b16_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0x41,0x01,0x00]
+v_lshlrev_b16_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0x38,0x01,0x00]
 
-v_lshlrev_b16_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0x42,0x01,0x00]
+v_lshlrev_b16_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0x3c,0x01,0x00]
 
-v_lshlrev_b16_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0x43,0x01,0x00]
+v_lshlrev_b16_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0x01,0x01,0x00]
 
-v_lshlrev_b16_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0x30,0x01,0x00]
+v_lshlrev_b16_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0x0f,0x01,0x00]
 
-v_lshlrev_b16_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0x34,0x01,0x00]
+v_lshlrev_b16_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0x11,0x01,0x00]
 
-v_lshlrev_b16_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0x38,0x01,0x00]
+v_lshlrev_b16_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0x1f,0x01,0x00]
 
-v_lshlrev_b16_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0x3c,0x01,0x00]
+v_lshlrev_b16_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0x21,0x01,0x00]
 
-v_lshlrev_b16_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0x01,0x01,0x00]
+v_lshlrev_b16_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0x2f,0x01,0x00]
 
-v_lshlrev_b16_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0x0f,0x01,0x00]
+v_lshlrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x10]
 
-v_lshlrev_b16_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0x11,0x01,0x00]
+v_lshlrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x30]
 
-v_lshlrev_b16_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0x1f,0x01,0x00]
+v_lshlrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0xf0]
 
-v_lshlrev_b16_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0x21,0x01,0x00]
+v_lshlrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0xf0]
 
-v_lshlrev_b16_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0x2f,0x01,0x00]
+v_lshlrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x01]
 
-v_lshlrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0xe4,0x00,0x10]
+v_lshlrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x03]
 
-v_lshlrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0xe4,0x00,0x30]
+v_lshlrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x0f]
 
-v_lshlrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0xe4,0x00,0xf0]
+v_lshlrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0xe4,0x00,0x0f]
 
-v_lshlrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0xe4,0x00,0xf0]
+v_lshlrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x54,0x01,0xe4,0x08,0x00]
 
-v_lshlrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0xe4,0x00,0x01]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x06,0x06,0x06]
 
-v_lshlrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0xe4,0x00,0x03]
+v_lshrrev_b16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x57,0x01,0x06,0x06,0x06]
 
-v_lshlrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0xe4,0x00,0x0f]
+v_lshrrev_b16_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0xff,0x06,0x06,0x06]
 
-v_lshlrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0xe4,0x00,0x0f]
+v_lshrrev_b16_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x56,0x01,0x06,0x06,0x06]
 
-v_lshlrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x54,0x00,0xe4,0x08,0x00]
+v_lshrrev_b16_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x26,0x06,0x06]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x06,0x06,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x06,0x06,0x06]
 
-v_lshrrev_b16_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x57,0x00,0x06,0x06,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x00,0x06,0x06]
 
-v_lshrrev_b16_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0xff,0x06,0x06,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x01,0x06,0x06]
 
-v_lshrrev_b16_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x56,0x00,0x06,0x06,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x02,0x06,0x06]
 
-v_lshrrev_b16_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x26,0x06,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x03,0x06,0x06]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x06,0x06,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x04,0x06,0x06]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x00,0x06,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x05,0x06,0x06]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x01,0x06,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x0e,0x06,0x06]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x02,0x06,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x16,0x06,0x06]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x03,0x06,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x16,0x06,0x06]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x04,0x06,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x06,0x06,0x06]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x05,0x06,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x06,0x00,0x06]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x0e,0x06,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x06,0x01,0x06]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x16,0x06,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x06,0x02,0x06]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x16,0x06,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x06,0x03,0x06]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x06,0x06,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x06,0x04,0x06]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x06,0x00,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x06,0x05,0x06]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x06,0x01,0x06]
+v_lshrrev_b16_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x06,0x0e,0x06]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x06,0x02,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x06,0x06,0x06]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x06,0x03,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x06,0x06,0x00]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x06,0x04,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x06,0x06,0x01]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x06,0x05,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x06,0x06,0x02]
 
-v_lshrrev_b16_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x06,0x0e,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x06,0x06,0x03]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x06,0x06,0x06]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x06,0x06,0x04]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x06,0x06,0x00]
+v_lshrrev_b16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x06,0x06,0x05]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x06,0x06,0x01]
+v_lshrrev_b16_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x56,0x01,0x06,0x06,0x0e]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x06,0x06,0x02]
+v_lshrrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0xe4,0x00,0x00]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x06,0x06,0x03]
+v_lshrrev_b16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x57,0x01,0xe4,0x00,0x00]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x06,0x06,0x04]
+v_lshrrev_b16_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0xff,0xe4,0x00,0x00]
 
-v_lshrrev_b16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x06,0x06,0x05]
+v_lshrrev_b16_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x56,0x01,0xe4,0x00,0x00]
 
-v_lshrrev_b16_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x56,0x00,0x06,0x06,0x0e]
+v_lshrrev_b16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0x1b,0x00,0x00]
 
-v_lshrrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0xe4,0x00,0x00]
+v_lshrrev_b16_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0x40,0x01,0x00]
 
-v_lshrrev_b16_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x57,0x00,0xe4,0x00,0x00]
+v_lshrrev_b16_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0x41,0x01,0x00]
 
-v_lshrrev_b16_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0xff,0xe4,0x00,0x00]
+v_lshrrev_b16_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0x42,0x01,0x00]
 
-v_lshrrev_b16_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x56,0x00,0xe4,0x00,0x00]
+v_lshrrev_b16_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0x43,0x01,0x00]
 
-v_lshrrev_b16_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0x1b,0x00,0x00]
+v_lshrrev_b16_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0x30,0x01,0x00]
 
-v_lshrrev_b16_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0x40,0x01,0x00]
+v_lshrrev_b16_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0x34,0x01,0x00]
 
-v_lshrrev_b16_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0x41,0x01,0x00]
+v_lshrrev_b16_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0x38,0x01,0x00]
 
-v_lshrrev_b16_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0x42,0x01,0x00]
+v_lshrrev_b16_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0x3c,0x01,0x00]
 
-v_lshrrev_b16_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0x43,0x01,0x00]
+v_lshrrev_b16_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0x01,0x01,0x00]
 
-v_lshrrev_b16_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0x30,0x01,0x00]
+v_lshrrev_b16_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0x0f,0x01,0x00]
 
-v_lshrrev_b16_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0x34,0x01,0x00]
+v_lshrrev_b16_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0x11,0x01,0x00]
 
-v_lshrrev_b16_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0x38,0x01,0x00]
+v_lshrrev_b16_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0x1f,0x01,0x00]
 
-v_lshrrev_b16_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0x3c,0x01,0x00]
+v_lshrrev_b16_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0x21,0x01,0x00]
 
-v_lshrrev_b16_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0x01,0x01,0x00]
+v_lshrrev_b16_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0x2f,0x01,0x00]
 
-v_lshrrev_b16_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0x0f,0x01,0x00]
+v_lshrrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0xe4,0x00,0x10]
 
-v_lshrrev_b16_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0x11,0x01,0x00]
+v_lshrrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0xe4,0x00,0x30]
 
-v_lshrrev_b16_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0x1f,0x01,0x00]
+v_lshrrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0xe4,0x00,0xf0]
 
-v_lshrrev_b16_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0x21,0x01,0x00]
+v_lshrrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0xe4,0x00,0xf0]
 
-v_lshrrev_b16_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0x2f,0x01,0x00]
+v_lshrrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0xe4,0x00,0x01]
 
-v_lshrrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0xe4,0x00,0x10]
+v_lshrrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0xe4,0x00,0x03]
 
-v_lshrrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0xe4,0x00,0x30]
+v_lshrrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0xe4,0x00,0x0f]
 
-v_lshrrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0xe4,0x00,0xf0]
+v_lshrrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0xe4,0x00,0x0f]
 
-v_lshrrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0xe4,0x00,0xf0]
+v_lshrrev_b16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x56,0x01,0xe4,0x08,0x00]
 
-v_lshrrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0xe4,0x00,0x01]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x06,0x06,0x06]
 
-v_lshrrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0xe4,0x00,0x03]
+v_ashrrev_i16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x59,0x01,0x06,0x06,0x06]
 
-v_lshrrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0xe4,0x00,0x0f]
+v_ashrrev_i16_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0xff,0x06,0x06,0x06]
 
-v_lshrrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0xe4,0x00,0x0f]
+v_ashrrev_i16_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x58,0x01,0x06,0x06,0x06]
 
-v_lshrrev_b16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x56,0x00,0xe4,0x08,0x00]
+v_ashrrev_i16_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x26,0x06,0x06]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x06,0x06,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x06,0x06,0x06]
 
-v_ashrrev_i16_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x59,0x00,0x06,0x06,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x00,0x06,0x06]
 
-v_ashrrev_i16_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0xff,0x06,0x06,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x01,0x06,0x06]
 
-v_ashrrev_i16_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x58,0x00,0x06,0x06,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x02,0x06,0x06]
 
-v_ashrrev_i16_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x26,0x06,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x03,0x06,0x06]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x06,0x06,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x04,0x06,0x06]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x00,0x06,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x05,0x06,0x06]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x01,0x06,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x0e,0x06,0x06]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x02,0x06,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x16,0x06,0x06]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x03,0x06,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x16,0x06,0x06]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x04,0x06,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x06,0x06,0x06]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x05,0x06,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x06,0x00,0x06]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x0e,0x06,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x06,0x01,0x06]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x16,0x06,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x06,0x02,0x06]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x16,0x06,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x06,0x03,0x06]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x06,0x06,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x06,0x04,0x06]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x06,0x00,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x06,0x05,0x06]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x06,0x01,0x06]
+v_ashrrev_i16_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x06,0x0e,0x06]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x06,0x02,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x06,0x06,0x06]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x06,0x03,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x06,0x06,0x00]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x06,0x04,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x06,0x06,0x01]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x06,0x05,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x06,0x06,0x02]
 
-v_ashrrev_i16_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x06,0x0e,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x06,0x06,0x03]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x06,0x06,0x06]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x06,0x06,0x04]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x06,0x06,0x00]
+v_ashrrev_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x06,0x06,0x05]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x06,0x06,0x01]
+v_ashrrev_i16_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x58,0x01,0x06,0x06,0x0e]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x06,0x06,0x02]
+v_ashrrev_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0xe4,0x00,0x00]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x06,0x06,0x03]
+v_ashrrev_i16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x59,0x01,0xe4,0x00,0x00]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x06,0x06,0x04]
+v_ashrrev_i16_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0xff,0xe4,0x00,0x00]
 
-v_ashrrev_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x06,0x06,0x05]
+v_ashrrev_i16_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x58,0x01,0xe4,0x00,0x00]
 
-v_ashrrev_i16_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x58,0x00,0x06,0x06,0x0e]
+v_ashrrev_i16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0x1b,0x00,0x00]
 
-v_ashrrev_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0xe4,0x00,0x00]
+v_ashrrev_i16_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0x40,0x01,0x00]
 
-v_ashrrev_i16_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x59,0x00,0xe4,0x00,0x00]
+v_ashrrev_i16_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0x41,0x01,0x00]
 
-v_ashrrev_i16_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0xff,0xe4,0x00,0x00]
+v_ashrrev_i16_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0x42,0x01,0x00]
 
-v_ashrrev_i16_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x58,0x00,0xe4,0x00,0x00]
+v_ashrrev_i16_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0x43,0x01,0x00]
 
-v_ashrrev_i16_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0x1b,0x00,0x00]
+v_ashrrev_i16_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0x30,0x01,0x00]
 
-v_ashrrev_i16_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0x40,0x01,0x00]
+v_ashrrev_i16_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0x34,0x01,0x00]
 
-v_ashrrev_i16_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0x41,0x01,0x00]
+v_ashrrev_i16_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0x38,0x01,0x00]
 
-v_ashrrev_i16_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0x42,0x01,0x00]
+v_ashrrev_i16_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0x3c,0x01,0x00]
 
-v_ashrrev_i16_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0x43,0x01,0x00]
+v_ashrrev_i16_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0x01,0x01,0x00]
 
-v_ashrrev_i16_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0x30,0x01,0x00]
+v_ashrrev_i16_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0x0f,0x01,0x00]
 
-v_ashrrev_i16_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0x34,0x01,0x00]
+v_ashrrev_i16_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0x11,0x01,0x00]
 
-v_ashrrev_i16_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0x38,0x01,0x00]
+v_ashrrev_i16_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0x1f,0x01,0x00]
 
-v_ashrrev_i16_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0x3c,0x01,0x00]
+v_ashrrev_i16_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0x21,0x01,0x00]
 
-v_ashrrev_i16_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0x01,0x01,0x00]
+v_ashrrev_i16_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0x2f,0x01,0x00]
 
-v_ashrrev_i16_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0x0f,0x01,0x00]
+v_ashrrev_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0xe4,0x00,0x10]
 
-v_ashrrev_i16_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0x11,0x01,0x00]
+v_ashrrev_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0xe4,0x00,0x30]
 
-v_ashrrev_i16_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0x1f,0x01,0x00]
+v_ashrrev_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0xe4,0x00,0xf0]
 
-v_ashrrev_i16_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0x21,0x01,0x00]
+v_ashrrev_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0xe4,0x00,0xf0]
 
-v_ashrrev_i16_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0x2f,0x01,0x00]
+v_ashrrev_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0xe4,0x00,0x01]
 
-v_ashrrev_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0xe4,0x00,0x10]
+v_ashrrev_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0xe4,0x00,0x03]
 
-v_ashrrev_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0xe4,0x00,0x30]
+v_ashrrev_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0xe4,0x00,0x0f]
 
-v_ashrrev_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0xe4,0x00,0xf0]
+v_ashrrev_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0xe4,0x00,0x0f]
 
-v_ashrrev_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0xe4,0x00,0xf0]
+v_ashrrev_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x58,0x01,0xe4,0x08,0x00]
 
-v_ashrrev_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0xe4,0x00,0x01]
+v_max_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x06,0x06]
 
-v_ashrrev_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0xe4,0x00,0x03]
+v_max_f16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x5b,0x01,0x06,0x06,0x06]
 
-v_ashrrev_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0xe4,0x00,0x0f]
+v_max_f16_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0xff,0x06,0x06,0x06]
 
-v_ashrrev_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0xe4,0x00,0x0f]
+v_max_f16_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x5a,0x01,0x06,0x06,0x06]
 
-v_ashrrev_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x58,0x00,0xe4,0x08,0x00]
+v_max_f16_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x26,0x06,0x06]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x06,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x06,0x06]
 
-v_max_f16_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x5b,0x00,0x06,0x06,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x00,0x06,0x06]
 
-v_max_f16_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0xff,0x06,0x06,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x01,0x06,0x06]
 
-v_max_f16_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x5a,0x00,0x06,0x06,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x02,0x06,0x06]
 
-v_max_f16_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x26,0x06,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x03,0x06,0x06]
 
-v_max_f16_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x06,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x04,0x06,0x06]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x00,0x06,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x05,0x06,0x06]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x01,0x06,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x0e,0x06,0x06]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x02,0x06,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x16,0x06,0x06]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x03,0x06,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x16,0x06,0x06]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x04,0x06,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x06,0x06]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x05,0x06,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x00,0x06]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x0e,0x06,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x01,0x06]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x16,0x06,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x02,0x06]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x16,0x06,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x03,0x06]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x06,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x04,0x06]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x00,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x05,0x06]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x01,0x06]
+v_max_f16_sdwa v5, -v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x16,0x06]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x02,0x06]
+v_max_f16_sdwa v5, |v1|, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x26,0x06]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x03,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x06,0x06]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x04,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x06,0x00]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x05,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x06,0x01]
 
-v_max_f16_sdwa v0, -v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x16,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x06,0x02]
 
-v_max_f16_sdwa v0, |v0|, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x26,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x06,0x03]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x06,0x06]
+v_max_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x06,0x04]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x06,0x00]
+v_max_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x06,0x05]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x06,0x01]
+v_max_f16_sdwa v5, v1, -v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x06,0x16]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x06,0x02]
+v_max_f16_sdwa v5, v1, |v2| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5a,0x01,0x06,0x06,0x26]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x06,0x03]
+v_max_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0xe4,0x00,0x00]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x06,0x04]
+v_max_f16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x5b,0x01,0xe4,0x00,0x00]
 
-v_max_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x06,0x05]
+v_max_f16_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0xff,0xe4,0x00,0x00]
 
-v_max_f16_sdwa v0, v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x06,0x16]
+v_max_f16_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x5a,0x01,0xe4,0x00,0x00]
 
-v_max_f16_sdwa v0, v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5a,0x00,0x06,0x06,0x26]
+v_max_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0x1b,0x00,0x00]
 
-v_max_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0xe4,0x00,0x00]
+v_max_f16_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0x40,0x01,0x00]
 
-v_max_f16_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x5b,0x00,0xe4,0x00,0x00]
+v_max_f16_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0x41,0x01,0x00]
 
-v_max_f16_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0xff,0xe4,0x00,0x00]
+v_max_f16_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0x42,0x01,0x00]
 
-v_max_f16_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x5a,0x00,0xe4,0x00,0x00]
+v_max_f16_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0x43,0x01,0x00]
 
-v_max_f16_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0x1b,0x00,0x00]
+v_max_f16_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0x30,0x01,0x00]
 
-v_max_f16_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0x40,0x01,0x00]
+v_max_f16_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0x34,0x01,0x00]
 
-v_max_f16_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0x41,0x01,0x00]
+v_max_f16_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0x38,0x01,0x00]
 
-v_max_f16_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0x42,0x01,0x00]
+v_max_f16_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0x3c,0x01,0x00]
 
-v_max_f16_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0x43,0x01,0x00]
+v_max_f16_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0x01,0x01,0x00]
 
-v_max_f16_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0x30,0x01,0x00]
+v_max_f16_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0x0f,0x01,0x00]
 
-v_max_f16_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0x34,0x01,0x00]
+v_max_f16_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0x11,0x01,0x00]
 
-v_max_f16_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0x38,0x01,0x00]
+v_max_f16_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0x1f,0x01,0x00]
 
-v_max_f16_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0x3c,0x01,0x00]
+v_max_f16_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0x21,0x01,0x00]
 
-v_max_f16_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0x01,0x01,0x00]
+v_max_f16_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0x2f,0x01,0x00]
 
-v_max_f16_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0x0f,0x01,0x00]
+v_max_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0xe4,0x00,0x10]
 
-v_max_f16_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0x11,0x01,0x00]
+v_max_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0xe4,0x00,0x30]
 
-v_max_f16_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0x1f,0x01,0x00]
+v_max_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0xe4,0x00,0xf0]
 
-v_max_f16_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0x21,0x01,0x00]
+v_max_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0xe4,0x00,0xf0]
 
-v_max_f16_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0x2f,0x01,0x00]
+v_max_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0xe4,0x00,0x01]
 
-v_max_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0xe4,0x00,0x10]
+v_max_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0xe4,0x00,0x03]
 
-v_max_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0xe4,0x00,0x30]
+v_max_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0xe4,0x00,0x0f]
 
-v_max_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0xe4,0x00,0xf0]
+v_max_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0xe4,0x00,0x0f]
 
-v_max_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0xe4,0x00,0xf0]
+v_max_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0xe4,0x08,0x00]
 
-v_max_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0xe4,0x00,0x01]
+v_max_f16_dpp v5, -v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0xe4,0x10,0x00]
 
-v_max_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0xe4,0x00,0x03]
+v_max_f16_dpp v5, |v1|, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0xe4,0x20,0x00]
 
-v_max_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0xe4,0x00,0x0f]
+v_max_f16_dpp v5, v1, -v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0xe4,0x40,0x00]
 
-v_max_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0xe4,0x00,0x0f]
+v_max_f16_dpp v5, v1, |v2| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5a,0x01,0xe4,0x80,0x00]
 
-v_max_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0xe4,0x08,0x00]
+v_min_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x06,0x06]
 
-v_max_f16_dpp v0, -v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0xe4,0x10,0x00]
+v_min_f16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x5d,0x01,0x06,0x06,0x06]
 
-v_max_f16_dpp v0, |v0|, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0xe4,0x20,0x00]
+v_min_f16_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0xff,0x06,0x06,0x06]
 
-v_max_f16_dpp v0, v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0xe4,0x40,0x00]
+v_min_f16_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x5c,0x01,0x06,0x06,0x06]
 
-v_max_f16_dpp v0, v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5a,0x00,0xe4,0x80,0x00]
+v_min_f16_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x26,0x06,0x06]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x06,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x06,0x06]
 
-v_min_f16_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x5d,0x00,0x06,0x06,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x00,0x06,0x06]
 
-v_min_f16_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0xff,0x06,0x06,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x01,0x06,0x06]
 
-v_min_f16_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x5c,0x00,0x06,0x06,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x02,0x06,0x06]
 
-v_min_f16_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x26,0x06,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x03,0x06,0x06]
 
-v_min_f16_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x06,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x04,0x06,0x06]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x00,0x06,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x05,0x06,0x06]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x01,0x06,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x0e,0x06,0x06]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x02,0x06,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x16,0x06,0x06]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x03,0x06,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x16,0x06,0x06]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x04,0x06,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x06,0x06]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x05,0x06,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x00,0x06]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x0e,0x06,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x01,0x06]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x16,0x06,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x02,0x06]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x16,0x06,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x03,0x06]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x06,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x04,0x06]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x00,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x05,0x06]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x01,0x06]
+v_min_f16_sdwa v5, -v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x16,0x06]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x02,0x06]
+v_min_f16_sdwa v5, |v1|, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x26,0x06]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x03,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x06,0x06]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x04,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x06,0x00]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x05,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x06,0x01]
 
-v_min_f16_sdwa v0, -v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x16,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x06,0x02]
 
-v_min_f16_sdwa v0, |v0|, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x26,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x06,0x03]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x06,0x06]
+v_min_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x06,0x04]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x06,0x00]
+v_min_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x06,0x05]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x06,0x01]
+v_min_f16_sdwa v5, v1, -v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x06,0x16]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x06,0x02]
+v_min_f16_sdwa v5, v1, |v2| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5c,0x01,0x06,0x06,0x26]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x06,0x03]
+v_min_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0xe4,0x00,0x00]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x06,0x04]
+v_min_f16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x5d,0x01,0xe4,0x00,0x00]
 
-v_min_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x06,0x05]
+v_min_f16_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0xff,0xe4,0x00,0x00]
 
-v_min_f16_sdwa v0, v0, -v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x06,0x16]
+v_min_f16_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x5c,0x01,0xe4,0x00,0x00]
 
-v_min_f16_sdwa v0, v0, |v0| dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5c,0x00,0x06,0x06,0x26]
+v_min_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0x1b,0x00,0x00]
 
-v_min_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0xe4,0x00,0x00]
+v_min_f16_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0x40,0x01,0x00]
 
-v_min_f16_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x5d,0x00,0xe4,0x00,0x00]
+v_min_f16_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0x41,0x01,0x00]
 
-v_min_f16_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0xff,0xe4,0x00,0x00]
+v_min_f16_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0x42,0x01,0x00]
 
-v_min_f16_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x5c,0x00,0xe4,0x00,0x00]
+v_min_f16_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0x43,0x01,0x00]
 
-v_min_f16_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0x1b,0x00,0x00]
+v_min_f16_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0x30,0x01,0x00]
 
-v_min_f16_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0x40,0x01,0x00]
+v_min_f16_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0x34,0x01,0x00]
 
-v_min_f16_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0x41,0x01,0x00]
+v_min_f16_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0x38,0x01,0x00]
 
-v_min_f16_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0x42,0x01,0x00]
+v_min_f16_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0x3c,0x01,0x00]
 
-v_min_f16_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0x43,0x01,0x00]
+v_min_f16_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0x01,0x01,0x00]
 
-v_min_f16_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0x30,0x01,0x00]
+v_min_f16_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0x0f,0x01,0x00]
 
-v_min_f16_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0x34,0x01,0x00]
+v_min_f16_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0x11,0x01,0x00]
 
-v_min_f16_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0x38,0x01,0x00]
+v_min_f16_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0x1f,0x01,0x00]
 
-v_min_f16_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0x3c,0x01,0x00]
+v_min_f16_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0x21,0x01,0x00]
 
-v_min_f16_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0x01,0x01,0x00]
+v_min_f16_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0x2f,0x01,0x00]
 
-v_min_f16_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0x0f,0x01,0x00]
+v_min_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0xe4,0x00,0x10]
 
-v_min_f16_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0x11,0x01,0x00]
+v_min_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0xe4,0x00,0x30]
 
-v_min_f16_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0x1f,0x01,0x00]
+v_min_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0xe4,0x00,0xf0]
 
-v_min_f16_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0x21,0x01,0x00]
+v_min_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0xe4,0x00,0xf0]
 
-v_min_f16_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0x2f,0x01,0x00]
+v_min_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0xe4,0x00,0x01]
 
-v_min_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0xe4,0x00,0x10]
+v_min_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0xe4,0x00,0x03]
 
-v_min_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0xe4,0x00,0x30]
+v_min_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0xe4,0x00,0x0f]
 
-v_min_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0xe4,0x00,0xf0]
+v_min_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0xe4,0x00,0x0f]
 
-v_min_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0xe4,0x00,0xf0]
+v_min_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0xe4,0x08,0x00]
 
-v_min_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0xe4,0x00,0x01]
+v_min_f16_dpp v5, -v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0xe4,0x10,0x00]
 
-v_min_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0xe4,0x00,0x03]
+v_min_f16_dpp v5, |v1|, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0xe4,0x20,0x00]
 
-v_min_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0xe4,0x00,0x0f]
+v_min_f16_dpp v5, v1, -v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0xe4,0x40,0x00]
 
-v_min_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0xe4,0x00,0x0f]
+v_min_f16_dpp v5, v1, |v2| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5c,0x01,0xe4,0x80,0x00]
 
-v_min_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0xe4,0x08,0x00]
+v_max_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x06,0x06,0x06]
 
-v_min_f16_dpp v0, -v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0xe4,0x10,0x00]
+v_max_u16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x5f,0x01,0x06,0x06,0x06]
 
-v_min_f16_dpp v0, |v0|, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0xe4,0x20,0x00]
+v_max_u16_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0xff,0x06,0x06,0x06]
 
-v_min_f16_dpp v0, v0, -v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0xe4,0x40,0x00]
+v_max_u16_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x5e,0x01,0x06,0x06,0x06]
 
-v_min_f16_dpp v0, v0, |v0| quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5c,0x00,0xe4,0x80,0x00]
+v_max_u16_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x26,0x06,0x06]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x06,0x06,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x06,0x06,0x06]
 
-v_max_u16_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x5f,0x00,0x06,0x06,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x00,0x06,0x06]
 
-v_max_u16_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0xff,0x06,0x06,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x01,0x06,0x06]
 
-v_max_u16_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x5e,0x00,0x06,0x06,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x02,0x06,0x06]
 
-v_max_u16_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x26,0x06,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x03,0x06,0x06]
 
-v_max_u16_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x06,0x06,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x04,0x06,0x06]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x00,0x06,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x05,0x06,0x06]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x01,0x06,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x0e,0x06,0x06]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x02,0x06,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x16,0x06,0x06]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x03,0x06,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x16,0x06,0x06]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x04,0x06,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x06,0x06,0x06]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x05,0x06,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x06,0x00,0x06]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x0e,0x06,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x06,0x01,0x06]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x16,0x06,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x06,0x02,0x06]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x16,0x06,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x06,0x03,0x06]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x06,0x06,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x06,0x04,0x06]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x06,0x00,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x06,0x05,0x06]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x06,0x01,0x06]
+v_max_u16_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x06,0x0e,0x06]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x06,0x02,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x06,0x06,0x06]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x06,0x03,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x06,0x06,0x00]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x06,0x04,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x06,0x06,0x01]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x06,0x05,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x06,0x06,0x02]
 
-v_max_u16_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x06,0x0e,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x06,0x06,0x03]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x06,0x06,0x06]
+v_max_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x06,0x06,0x04]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x06,0x06,0x00]
+v_max_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x06,0x06,0x05]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x06,0x06,0x01]
+v_max_u16_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x5e,0x01,0x06,0x06,0x0e]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x06,0x06,0x02]
+v_max_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x00,0x00]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x06,0x06,0x03]
+v_max_u16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x5f,0x01,0xe4,0x00,0x00]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x06,0x06,0x04]
+v_max_u16_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0xff,0xe4,0x00,0x00]
 
-v_max_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x06,0x06,0x05]
+v_max_u16_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x5e,0x01,0xe4,0x00,0x00]
 
-v_max_u16_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x5e,0x00,0x06,0x06,0x0e]
+v_max_u16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0x1b,0x00,0x00]
 
-v_max_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0xe4,0x00,0x00]
+v_max_u16_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0x40,0x01,0x00]
 
-v_max_u16_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x5f,0x00,0xe4,0x00,0x00]
+v_max_u16_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0x41,0x01,0x00]
 
-v_max_u16_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0xff,0xe4,0x00,0x00]
+v_max_u16_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0x42,0x01,0x00]
 
-v_max_u16_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x5e,0x00,0xe4,0x00,0x00]
+v_max_u16_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0x43,0x01,0x00]
 
-v_max_u16_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0x1b,0x00,0x00]
+v_max_u16_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0x30,0x01,0x00]
 
-v_max_u16_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0x40,0x01,0x00]
+v_max_u16_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0x34,0x01,0x00]
 
-v_max_u16_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0x41,0x01,0x00]
+v_max_u16_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0x38,0x01,0x00]
 
-v_max_u16_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0x42,0x01,0x00]
+v_max_u16_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0x3c,0x01,0x00]
 
-v_max_u16_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0x43,0x01,0x00]
+v_max_u16_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0x01,0x01,0x00]
 
-v_max_u16_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0x30,0x01,0x00]
+v_max_u16_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0x0f,0x01,0x00]
 
-v_max_u16_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0x34,0x01,0x00]
+v_max_u16_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0x11,0x01,0x00]
 
-v_max_u16_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0x38,0x01,0x00]
+v_max_u16_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0x1f,0x01,0x00]
 
-v_max_u16_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0x3c,0x01,0x00]
+v_max_u16_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0x21,0x01,0x00]
 
-v_max_u16_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0x01,0x01,0x00]
+v_max_u16_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0x2f,0x01,0x00]
 
-v_max_u16_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0x0f,0x01,0x00]
+v_max_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x00,0x10]
 
-v_max_u16_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0x11,0x01,0x00]
+v_max_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x00,0x30]
 
-v_max_u16_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0x1f,0x01,0x00]
+v_max_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x00,0xf0]
 
-v_max_u16_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0x21,0x01,0x00]
+v_max_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x00,0xf0]
 
-v_max_u16_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0x2f,0x01,0x00]
+v_max_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x00,0x01]
 
-v_max_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0xe4,0x00,0x10]
+v_max_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x00,0x03]
 
-v_max_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0xe4,0x00,0x30]
+v_max_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x00,0x0f]
 
-v_max_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0xe4,0x00,0xf0]
+v_max_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x00,0x0f]
 
-v_max_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0xe4,0x00,0xf0]
+v_max_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x5e,0x01,0xe4,0x08,0x00]
 
-v_max_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0xe4,0x00,0x01]
+v_max_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x06,0x06,0x06]
 
-v_max_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0xe4,0x00,0x03]
+v_max_i16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x61,0x01,0x06,0x06,0x06]
 
-v_max_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0xe4,0x00,0x0f]
+v_max_i16_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0xff,0x06,0x06,0x06]
 
-v_max_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0xe4,0x00,0x0f]
+v_max_i16_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x60,0x01,0x06,0x06,0x06]
 
-v_max_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x5e,0x00,0xe4,0x08,0x00]
+v_max_i16_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x26,0x06,0x06]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x06,0x06,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x06,0x06,0x06]
 
-v_max_i16_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x61,0x00,0x06,0x06,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x00,0x06,0x06]
 
-v_max_i16_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0xff,0x06,0x06,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x01,0x06,0x06]
 
-v_max_i16_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x60,0x00,0x06,0x06,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x02,0x06,0x06]
 
-v_max_i16_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x26,0x06,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x03,0x06,0x06]
 
-v_max_i16_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x06,0x06,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x04,0x06,0x06]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x00,0x06,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x05,0x06,0x06]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x01,0x06,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x0e,0x06,0x06]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x02,0x06,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x16,0x06,0x06]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x03,0x06,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x16,0x06,0x06]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x04,0x06,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x06,0x06,0x06]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x05,0x06,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x06,0x00,0x06]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x0e,0x06,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x06,0x01,0x06]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x16,0x06,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x06,0x02,0x06]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x16,0x06,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x06,0x03,0x06]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x06,0x06,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x06,0x04,0x06]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x06,0x00,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x06,0x05,0x06]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x06,0x01,0x06]
+v_max_i16_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x06,0x0e,0x06]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x06,0x02,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x06,0x06,0x06]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x06,0x03,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x06,0x06,0x00]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x06,0x04,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x06,0x06,0x01]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x06,0x05,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x06,0x06,0x02]
 
-v_max_i16_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x06,0x0e,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x06,0x06,0x03]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x06,0x06,0x06]
+v_max_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x06,0x06,0x04]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x06,0x06,0x00]
+v_max_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x06,0x06,0x05]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x06,0x06,0x01]
+v_max_i16_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x60,0x01,0x06,0x06,0x0e]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x06,0x06,0x02]
+v_max_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0xe4,0x00,0x00]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x06,0x06,0x03]
+v_max_i16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x61,0x01,0xe4,0x00,0x00]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x06,0x06,0x04]
+v_max_i16_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0xff,0xe4,0x00,0x00]
 
-v_max_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x06,0x06,0x05]
+v_max_i16_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x60,0x01,0xe4,0x00,0x00]
 
-v_max_i16_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x60,0x00,0x06,0x06,0x0e]
+v_max_i16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0x1b,0x00,0x00]
 
-v_max_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0xe4,0x00,0x00]
+v_max_i16_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0x40,0x01,0x00]
 
-v_max_i16_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x61,0x00,0xe4,0x00,0x00]
+v_max_i16_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0x41,0x01,0x00]
 
-v_max_i16_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0xff,0xe4,0x00,0x00]
+v_max_i16_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0x42,0x01,0x00]
 
-v_max_i16_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x60,0x00,0xe4,0x00,0x00]
+v_max_i16_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0x43,0x01,0x00]
 
-v_max_i16_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0x1b,0x00,0x00]
+v_max_i16_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0x30,0x01,0x00]
 
-v_max_i16_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0x40,0x01,0x00]
+v_max_i16_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0x34,0x01,0x00]
 
-v_max_i16_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0x41,0x01,0x00]
+v_max_i16_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0x38,0x01,0x00]
 
-v_max_i16_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0x42,0x01,0x00]
+v_max_i16_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0x3c,0x01,0x00]
 
-v_max_i16_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0x43,0x01,0x00]
+v_max_i16_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0x01,0x01,0x00]
 
-v_max_i16_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0x30,0x01,0x00]
+v_max_i16_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0x0f,0x01,0x00]
 
-v_max_i16_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0x34,0x01,0x00]
+v_max_i16_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0x11,0x01,0x00]
 
-v_max_i16_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0x38,0x01,0x00]
+v_max_i16_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0x1f,0x01,0x00]
 
-v_max_i16_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0x3c,0x01,0x00]
+v_max_i16_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0x21,0x01,0x00]
 
-v_max_i16_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0x01,0x01,0x00]
+v_max_i16_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0x2f,0x01,0x00]
 
-v_max_i16_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0x0f,0x01,0x00]
+v_max_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0xe4,0x00,0x10]
 
-v_max_i16_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0x11,0x01,0x00]
+v_max_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0xe4,0x00,0x30]
 
-v_max_i16_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0x1f,0x01,0x00]
+v_max_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0xe4,0x00,0xf0]
 
-v_max_i16_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0x21,0x01,0x00]
+v_max_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0xe4,0x00,0xf0]
 
-v_max_i16_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0x2f,0x01,0x00]
+v_max_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0xe4,0x00,0x01]
 
-v_max_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0xe4,0x00,0x10]
+v_max_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0xe4,0x00,0x03]
 
-v_max_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0xe4,0x00,0x30]
+v_max_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0xe4,0x00,0x0f]
 
-v_max_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0xe4,0x00,0xf0]
+v_max_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0xe4,0x00,0x0f]
 
-v_max_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0xe4,0x00,0xf0]
+v_max_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x60,0x01,0xe4,0x08,0x00]
 
-v_max_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0xe4,0x00,0x01]
+v_min_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x06,0x06,0x06]
 
-v_max_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0xe4,0x00,0x03]
+v_min_u16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x63,0x01,0x06,0x06,0x06]
 
-v_max_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0xe4,0x00,0x0f]
+v_min_u16_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0xff,0x06,0x06,0x06]
 
-v_max_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0xe4,0x00,0x0f]
+v_min_u16_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x62,0x01,0x06,0x06,0x06]
 
-v_max_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x60,0x00,0xe4,0x08,0x00]
+v_min_u16_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x26,0x06,0x06]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x06,0x06,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x06,0x06,0x06]
 
-v_min_u16_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x63,0x00,0x06,0x06,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x00,0x06,0x06]
 
-v_min_u16_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0xff,0x06,0x06,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x01,0x06,0x06]
 
-v_min_u16_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x62,0x00,0x06,0x06,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x02,0x06,0x06]
 
-v_min_u16_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x26,0x06,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x03,0x06,0x06]
 
-v_min_u16_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x06,0x06,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x04,0x06,0x06]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x00,0x06,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x05,0x06,0x06]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x01,0x06,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x0e,0x06,0x06]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x02,0x06,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x16,0x06,0x06]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x03,0x06,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x16,0x06,0x06]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x04,0x06,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x06,0x06,0x06]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x05,0x06,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x06,0x00,0x06]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x0e,0x06,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x06,0x01,0x06]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x16,0x06,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x06,0x02,0x06]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x16,0x06,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x06,0x03,0x06]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x06,0x06,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x06,0x04,0x06]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x06,0x00,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x06,0x05,0x06]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x06,0x01,0x06]
+v_min_u16_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x06,0x0e,0x06]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x06,0x02,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x06,0x06,0x06]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x06,0x03,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x06,0x06,0x00]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x06,0x04,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x06,0x06,0x01]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x06,0x05,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x06,0x06,0x02]
 
-v_min_u16_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x06,0x0e,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x06,0x06,0x03]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x06,0x06,0x06]
+v_min_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x06,0x06,0x04]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x06,0x06,0x00]
+v_min_u16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x06,0x06,0x05]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x06,0x06,0x01]
+v_min_u16_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x62,0x01,0x06,0x06,0x0e]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x06,0x06,0x02]
+v_min_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0xe4,0x00,0x00]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x06,0x06,0x03]
+v_min_u16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x63,0x01,0xe4,0x00,0x00]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x06,0x06,0x04]
+v_min_u16_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0xff,0xe4,0x00,0x00]
 
-v_min_u16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x06,0x06,0x05]
+v_min_u16_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x62,0x01,0xe4,0x00,0x00]
 
-v_min_u16_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x62,0x00,0x06,0x06,0x0e]
+v_min_u16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0x1b,0x00,0x00]
 
-v_min_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0xe4,0x00,0x00]
+v_min_u16_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0x40,0x01,0x00]
 
-v_min_u16_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x63,0x00,0xe4,0x00,0x00]
+v_min_u16_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0x41,0x01,0x00]
 
-v_min_u16_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0xff,0xe4,0x00,0x00]
+v_min_u16_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0x42,0x01,0x00]
 
-v_min_u16_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x62,0x00,0xe4,0x00,0x00]
+v_min_u16_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0x43,0x01,0x00]
 
-v_min_u16_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0x1b,0x00,0x00]
+v_min_u16_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0x30,0x01,0x00]
 
-v_min_u16_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0x40,0x01,0x00]
+v_min_u16_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0x34,0x01,0x00]
 
-v_min_u16_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0x41,0x01,0x00]
+v_min_u16_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0x38,0x01,0x00]
 
-v_min_u16_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0x42,0x01,0x00]
+v_min_u16_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0x3c,0x01,0x00]
 
-v_min_u16_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0x43,0x01,0x00]
+v_min_u16_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0x01,0x01,0x00]
 
-v_min_u16_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0x30,0x01,0x00]
+v_min_u16_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0x0f,0x01,0x00]
 
-v_min_u16_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0x34,0x01,0x00]
+v_min_u16_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0x11,0x01,0x00]
 
-v_min_u16_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0x38,0x01,0x00]
+v_min_u16_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0x1f,0x01,0x00]
 
-v_min_u16_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0x3c,0x01,0x00]
+v_min_u16_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0x21,0x01,0x00]
 
-v_min_u16_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0x01,0x01,0x00]
+v_min_u16_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0x2f,0x01,0x00]
 
-v_min_u16_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0x0f,0x01,0x00]
+v_min_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0xe4,0x00,0x10]
 
-v_min_u16_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0x11,0x01,0x00]
+v_min_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0xe4,0x00,0x30]
 
-v_min_u16_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0x1f,0x01,0x00]
+v_min_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0xe4,0x00,0xf0]
 
-v_min_u16_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0x21,0x01,0x00]
+v_min_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0xe4,0x00,0xf0]
 
-v_min_u16_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0x2f,0x01,0x00]
+v_min_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0xe4,0x00,0x01]
 
-v_min_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0xe4,0x00,0x10]
+v_min_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0xe4,0x00,0x03]
 
-v_min_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0xe4,0x00,0x30]
+v_min_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0xe4,0x00,0x0f]
 
-v_min_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0xe4,0x00,0xf0]
+v_min_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0xe4,0x00,0x0f]
 
-v_min_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0xe4,0x00,0xf0]
+v_min_u16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x62,0x01,0xe4,0x08,0x00]
 
-v_min_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0xe4,0x00,0x01]
+v_min_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x06,0x06,0x06]
 
-v_min_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0xe4,0x00,0x03]
+v_min_i16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x65,0x01,0x06,0x06,0x06]
 
-v_min_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0xe4,0x00,0x0f]
+v_min_i16_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0xff,0x06,0x06,0x06]
 
-v_min_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0xe4,0x00,0x0f]
+v_min_i16_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x64,0x01,0x06,0x06,0x06]
 
-v_min_u16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x62,0x00,0xe4,0x08,0x00]
+v_min_i16_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x26,0x06,0x06]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x06,0x06,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x06,0x06,0x06]
 
-v_min_i16_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x65,0x00,0x06,0x06,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x00,0x06,0x06]
 
-v_min_i16_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0xff,0x06,0x06,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x01,0x06,0x06]
 
-v_min_i16_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x64,0x00,0x06,0x06,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x02,0x06,0x06]
 
-v_min_i16_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x26,0x06,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x03,0x06,0x06]
 
-v_min_i16_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x06,0x06,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x04,0x06,0x06]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x00,0x06,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x05,0x06,0x06]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x01,0x06,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x0e,0x06,0x06]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x02,0x06,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x16,0x06,0x06]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x03,0x06,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x16,0x06,0x06]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x04,0x06,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x06,0x06,0x06]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x05,0x06,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x06,0x00,0x06]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x0e,0x06,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x06,0x01,0x06]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x16,0x06,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x06,0x02,0x06]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x16,0x06,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x06,0x03,0x06]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x06,0x06,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x06,0x04,0x06]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x06,0x00,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x06,0x05,0x06]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x06,0x01,0x06]
+v_min_i16_sdwa v5, sext(v1), v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x06,0x0e,0x06]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x06,0x02,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x06,0x06,0x06]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x06,0x03,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x06,0x06,0x00]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x06,0x04,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x06,0x06,0x01]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x06,0x05,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x06,0x06,0x02]
 
-v_min_i16_sdwa v0, sext(v0), v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x06,0x0e,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x06,0x06,0x03]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x06,0x06,0x06]
+v_min_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x06,0x06,0x04]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x06,0x06,0x00]
+v_min_i16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x06,0x06,0x05]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x06,0x06,0x01]
+v_min_i16_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x64,0x01,0x06,0x06,0x0e]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x06,0x06,0x02]
+v_min_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0xe4,0x00,0x00]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x06,0x06,0x03]
+v_min_i16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x65,0x01,0xe4,0x00,0x00]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x06,0x06,0x04]
+v_min_i16_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0xff,0xe4,0x00,0x00]
 
-v_min_i16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x06,0x06,0x05]
+v_min_i16_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x64,0x01,0xe4,0x00,0x00]
 
-v_min_i16_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x64,0x00,0x06,0x06,0x0e]
+v_min_i16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0x1b,0x00,0x00]
 
-v_min_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0xe4,0x00,0x00]
+v_min_i16_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0x40,0x01,0x00]
 
-v_min_i16_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x65,0x00,0xe4,0x00,0x00]
+v_min_i16_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0x41,0x01,0x00]
 
-v_min_i16_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0xff,0xe4,0x00,0x00]
+v_min_i16_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0x42,0x01,0x00]
 
-v_min_i16_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x64,0x00,0xe4,0x00,0x00]
+v_min_i16_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0x43,0x01,0x00]
 
-v_min_i16_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0x1b,0x00,0x00]
+v_min_i16_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0x30,0x01,0x00]
 
-v_min_i16_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0x40,0x01,0x00]
+v_min_i16_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0x34,0x01,0x00]
 
-v_min_i16_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0x41,0x01,0x00]
+v_min_i16_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0x38,0x01,0x00]
 
-v_min_i16_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0x42,0x01,0x00]
+v_min_i16_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0x3c,0x01,0x00]
 
-v_min_i16_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0x43,0x01,0x00]
+v_min_i16_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0x01,0x01,0x00]
 
-v_min_i16_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0x30,0x01,0x00]
+v_min_i16_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0x0f,0x01,0x00]
 
-v_min_i16_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0x34,0x01,0x00]
+v_min_i16_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0x11,0x01,0x00]
 
-v_min_i16_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0x38,0x01,0x00]
+v_min_i16_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0x1f,0x01,0x00]
 
-v_min_i16_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0x3c,0x01,0x00]
+v_min_i16_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0x21,0x01,0x00]
 
-v_min_i16_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0x01,0x01,0x00]
+v_min_i16_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0x2f,0x01,0x00]
 
-v_min_i16_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0x0f,0x01,0x00]
+v_min_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0xe4,0x00,0x10]
 
-v_min_i16_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0x11,0x01,0x00]
+v_min_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0xe4,0x00,0x30]
 
-v_min_i16_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0x1f,0x01,0x00]
+v_min_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0xe4,0x00,0xf0]
 
-v_min_i16_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0x21,0x01,0x00]
+v_min_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0xe4,0x00,0xf0]
 
-v_min_i16_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0x2f,0x01,0x00]
+v_min_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0xe4,0x00,0x01]
 
-v_min_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0xe4,0x00,0x10]
+v_min_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0xe4,0x00,0x03]
 
-v_min_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0xe4,0x00,0x30]
+v_min_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0xe4,0x00,0x0f]
 
-v_min_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0xe4,0x00,0xf0]
+v_min_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0xe4,0x00,0x0f]
 
-v_min_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0xe4,0x00,0xf0]
+v_min_i16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x64,0x01,0xe4,0x08,0x00]
 
-v_min_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0xe4,0x00,0x01]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x06,0x06,0x06]
 
-v_min_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0xe4,0x00,0x03]
+v_ldexp_f16_sdwa v255, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xfe,0x67,0x01,0x06,0x06,0x06]
 
-v_min_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0xe4,0x00,0x0f]
+v_ldexp_f16_sdwa v5, v255, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0xff,0x06,0x06,0x06]
 
-v_min_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0xe4,0x00,0x0f]
+v_ldexp_f16_sdwa v5, v1, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x0b,0x66,0x01,0x06,0x06,0x06]
 
-v_min_i16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x64,0x00,0xe4,0x08,0x00]
+v_ldexp_f16_sdwa v5, v1, v2 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x26,0x06,0x06]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x06,0x06,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x06,0x06,0x06]
 
-v_ldexp_f16_sdwa v255, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xfe,0x67,0x00,0x06,0x06,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x00,0x06,0x06]
 
-v_ldexp_f16_sdwa v0, v255, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0xff,0x06,0x06,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x01,0x06,0x06]
 
-v_ldexp_f16_sdwa v0, v0, v255 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x01,0x66,0x00,0x06,0x06,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x02,0x06,0x06]
 
-v_ldexp_f16_sdwa v0, v0, v0 clamp dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x26,0x06,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x03,0x06,0x06]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x06,0x06,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x04,0x06,0x06]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:BYTE_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x00,0x06,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x05,0x06,0x06]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x01,0x06,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x0e,0x06,0x06]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:BYTE_2 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x02,0x06,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x16,0x06,0x06]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:BYTE_3 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x03,0x06,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x16,0x06,0x06]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:WORD_0 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x04,0x06,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x06,0x06,0x06]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x05,0x06,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x06,0x00,0x06]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_SEXT src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x0e,0x06,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x06,0x01,0x06]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PRESERVE src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x16,0x06,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x06,0x02,0x06]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:DWORD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x16,0x06,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x06,0x03,0x06]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x06,0x06,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x06,0x04,0x06]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x06,0x00,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x06,0x05,0x06]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x06,0x01,0x06]
+v_ldexp_f16_sdwa v5, -v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x06,0x16,0x06]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x06,0x02,0x06]
+v_ldexp_f16_sdwa v5, |v1|, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x06,0x26,0x06]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x06,0x03,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x06,0x06,0x06]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x06,0x04,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x06,0x06,0x00]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x06,0x05,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x06,0x06,0x01]
 
-v_ldexp_f16_sdwa v0, -v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x06,0x16,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x06,0x06,0x02]
 
-v_ldexp_f16_sdwa v0, |v0|, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x06,0x26,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x06,0x06,0x03]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x06,0x06,0x06]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x06,0x06,0x04]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x06,0x06,0x00]
+v_ldexp_f16_sdwa v5, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x06,0x06,0x05]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x06,0x06,0x01]
+v_ldexp_f16_sdwa v5, v1, sext(v2) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x0a,0x66,0x01,0x06,0x06,0x0e]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x06,0x06,0x02]
+v_ldexp_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x00,0x00]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x06,0x06,0x03]
+v_ldexp_f16_dpp v255, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0xfe,0x67,0x01,0xe4,0x00,0x00]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x06,0x06,0x04]
+v_ldexp_f16_dpp v5, v255, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0xff,0xe4,0x00,0x00]
 
-v_ldexp_f16_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x06,0x06,0x05]
+v_ldexp_f16_dpp v5, v1, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0xfe,0x0b,0x66,0x01,0xe4,0x00,0x00]
 
-v_ldexp_f16_sdwa v0, v0, sext(v0) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x00,0x66,0x00,0x06,0x06,0x0e]
+v_ldexp_f16_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0x1b,0x00,0x00]
 
-v_ldexp_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0xe4,0x00,0x00]
+v_ldexp_f16_dpp v5, v1, v2 row_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0x40,0x01,0x00]
 
-v_ldexp_f16_dpp v255, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0xfe,0x67,0x00,0xe4,0x00,0x00]
+v_ldexp_f16_dpp v5, v1, v2 row_half_mirror row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0x41,0x01,0x00]
 
-v_ldexp_f16_dpp v0, v255, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0xff,0xe4,0x00,0x00]
+v_ldexp_f16_dpp v5, v1, v2 row_bcast:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0x42,0x01,0x00]
 
-v_ldexp_f16_dpp v0, v0, v255 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0xfe,0x01,0x66,0x00,0xe4,0x00,0x00]
+v_ldexp_f16_dpp v5, v1, v2 row_bcast:31 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0x43,0x01,0x00]
 
-v_ldexp_f16_dpp v0, v0, v0 quad_perm:[3,2,1,0] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0x1b,0x00,0x00]
+v_ldexp_f16_dpp v5, v1, v2 wave_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0x30,0x01,0x00]
 
-v_ldexp_f16_dpp v0, v0, v0 row_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0x40,0x01,0x00]
+v_ldexp_f16_dpp v5, v1, v2 wave_rol:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0x34,0x01,0x00]
 
-v_ldexp_f16_dpp v0, v0, v0 row_half_mirror row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0x41,0x01,0x00]
+v_ldexp_f16_dpp v5, v1, v2 wave_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0x38,0x01,0x00]
 
-v_ldexp_f16_dpp v0, v0, v0 row_bcast:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0x42,0x01,0x00]
+v_ldexp_f16_dpp v5, v1, v2 wave_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0x3c,0x01,0x00]
 
-v_ldexp_f16_dpp v0, v0, v0 row_bcast:31 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0x43,0x01,0x00]
+v_ldexp_f16_dpp v5, v1, v2 row_shl:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0x01,0x01,0x00]
 
-v_ldexp_f16_dpp v0, v0, v0 wave_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0x30,0x01,0x00]
+v_ldexp_f16_dpp v5, v1, v2 row_shl:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0x0f,0x01,0x00]
 
-v_ldexp_f16_dpp v0, v0, v0 wave_rol:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0x34,0x01,0x00]
+v_ldexp_f16_dpp v5, v1, v2 row_shr:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0x11,0x01,0x00]
 
-v_ldexp_f16_dpp v0, v0, v0 wave_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0x38,0x01,0x00]
+v_ldexp_f16_dpp v5, v1, v2 row_shr:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0x1f,0x01,0x00]
 
-v_ldexp_f16_dpp v0, v0, v0 wave_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0x3c,0x01,0x00]
+v_ldexp_f16_dpp v5, v1, v2 row_ror:1 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0x21,0x01,0x00]
 
-v_ldexp_f16_dpp v0, v0, v0 row_shl:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0x01,0x01,0x00]
+v_ldexp_f16_dpp v5, v1, v2 row_ror:15 row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0x2f,0x01,0x00]
 
-v_ldexp_f16_dpp v0, v0, v0 row_shl:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0x0f,0x01,0x00]
+v_ldexp_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x00,0x10]
 
-v_ldexp_f16_dpp v0, v0, v0 row_shr:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0x11,0x01,0x00]
+v_ldexp_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x00,0x30]
 
-v_ldexp_f16_dpp v0, v0, v0 row_shr:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0x1f,0x01,0x00]
+v_ldexp_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x00,0xf0]
 
-v_ldexp_f16_dpp v0, v0, v0 row_ror:1 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0x21,0x01,0x00]
+v_ldexp_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x00,0xf0]
 
-v_ldexp_f16_dpp v0, v0, v0 row_ror:15 row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0x2f,0x01,0x00]
+v_ldexp_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x00,0x01]
 
-v_ldexp_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x1 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0xe4,0x00,0x10]
+v_ldexp_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x00,0x03]
 
-v_ldexp_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x3 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0xe4,0x00,0x30]
+v_ldexp_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x00,0x0f]
 
-v_ldexp_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0xe4,0x00,0xf0]
+v_ldexp_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x00,0x0f]
 
-v_ldexp_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0xe4,0x00,0xf0]
+v_ldexp_f16_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x08,0x00]
 
-v_ldexp_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x1
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0xe4,0x00,0x01]
+v_ldexp_f16_dpp v5, -v1, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x10,0x00]
 
-v_ldexp_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x3
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0xe4,0x00,0x03]
+v_ldexp_f16_dpp v5, |v1|, v2 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x04,0x0a,0x66,0x01,0xe4,0x20,0x00]
 
-v_ldexp_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0xf
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0xe4,0x00,0x0f]
+v_cmp_class_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x06,0x06]
 
-v_ldexp_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0xe4,0x00,0x0f]
+v_cmp_class_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x20,0x7c,0xff,0x16,0x06,0x06]
 
-v_ldexp_f16_dpp v0, v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 bound_ctrl:0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0xe4,0x08,0x00]
+v_cmp_class_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x21,0x7c,0x01,0x16,0x06,0x06]
 
-v_ldexp_f16_dpp v0, -v0, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0xe4,0x10,0x00]
+v_cmp_class_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x20,0x7c,0x01,0x36,0x06,0x06]
 
-v_ldexp_f16_dpp v0, |v0|, v0 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
-// CHECK: [0xfa,0x00,0x00,0x66,0x00,0xe4,0x20,0x00]
+v_cmp_class_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_class_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x20,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_class_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_class_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x20,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_class_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_class_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x21,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_class_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_class_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x20,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_class_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_class_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x20,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_class_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_class_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x20,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_class_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_class_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x20,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_class_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_class_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x20,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_class_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_class_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x20,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_class_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_class_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x20,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_class_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_class_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x20,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_class_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_class_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x20,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_class_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_class_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x20,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_class_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_class_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x20,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_class_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_class_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x20,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_class_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_class_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x20,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_class_f32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x20,0x7c,0x01,0x16,0x06,0x0e]
 
-v_cmp_class_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x20,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_class_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_class_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x20,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_class_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x22,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_class_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x20,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_class_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x23,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_class_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x20,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_class_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x22,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_class_f32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x20,0x7c,0x00,0x16,0x06,0x0e]
+v_cmpx_class_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_class_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x22,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_class_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_class_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x22,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_class_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_class_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x23,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_class_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_class_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x22,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_class_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_class_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x22,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_class_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_class_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x22,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_class_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_class_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x22,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_class_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_class_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x22,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_class_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_class_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x22,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_class_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_class_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x22,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_class_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_class_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x22,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_class_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_class_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x22,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_class_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_class_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x22,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_class_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_class_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x22,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_class_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_class_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x22,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_class_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_class_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x22,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_class_f32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x22,0x7c,0x01,0x16,0x06,0x0e]
 
-v_cmpx_class_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x22,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x28,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_class_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x22,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_class_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x28,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_class_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x22,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_class_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x29,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_class_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x22,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_class_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x28,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_class_f32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x22,0x7c,0x00,0x16,0x06,0x0e]
+v_cmp_class_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x28,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_class_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x28,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x28,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_class_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x28,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x28,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_class_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x29,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x28,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_class_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x28,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x28,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_class_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x28,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x28,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_class_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x28,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x28,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_class_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x28,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_class_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x28,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_class_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x28,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_class_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x28,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_class_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x28,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x28,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_class_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x28,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x28,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_class_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x28,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x28,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_class_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x28,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x28,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_class_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x28,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x28,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_class_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x28,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x28,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_class_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x28,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x28,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_class_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x28,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_class_f16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x28,0x7c,0x01,0x16,0x06,0x0e]
 
-v_cmp_class_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x28,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x2a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_class_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x28,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_class_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x2a,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_class_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x28,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_class_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x2b,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_class_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x28,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_class_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x2a,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_class_f16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x28,0x7c,0x00,0x16,0x06,0x0e]
+v_cmpx_class_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x2a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_class_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x2a,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_class_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x2a,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_class_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x2a,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_class_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x2a,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_class_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x2b,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_class_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x2a,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_class_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x2a,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_class_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x2a,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_class_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x2a,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_class_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x2a,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_class_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x2a,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_class_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x2a,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_class_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x2a,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_class_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x2a,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_class_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x2a,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_class_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x2a,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_class_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x2a,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x2a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_class_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x2a,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x2a,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_class_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x2a,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x2a,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_class_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x2a,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x2a,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_class_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x2a,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x2a,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_class_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x2a,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x2a,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_class_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x2a,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_class_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x2a,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_class_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x2a,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_class_f16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x2a,0x7c,0x01,0x16,0x06,0x0e]
 
-v_cmpx_class_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x2a,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_f_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_class_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x2a,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_f_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_class_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x2a,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_f_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x41,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_class_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x2a,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_f_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_class_f16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x2a,0x7c,0x00,0x16,0x06,0x0e]
+v_cmp_f_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_f_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_f_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_f_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_f_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_f_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x41,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_f_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_f_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_f_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_f_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_f_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_f_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_f_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_f_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_f_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_f_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_f_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_f_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_f_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_f_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_f_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_f_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_f_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_f_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_f_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_f_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_f_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_f_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_f_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_f_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_f_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_f_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_f_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_f_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_f_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_f_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_lt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_f_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_lt_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_f_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_lt_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x43,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_f_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_lt_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_f_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_lt_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_lt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_lt_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_lt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_lt_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x43,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_lt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_lt_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_lt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_lt_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_lt_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_lt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_lt_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_lt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_lt_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_lt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_lt_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_lt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_lt_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_lt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_lt_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_lt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_lt_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_lt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_lt_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_lt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_lt_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_lt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_lt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_lt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_lt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_lt_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_lt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_lt_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_lt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_eq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_eq_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_lt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_eq_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x45,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_eq_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_lt_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_eq_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_eq_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_eq_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_eq_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_eq_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x45,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_eq_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_eq_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_eq_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_eq_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_eq_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_eq_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_eq_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_eq_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_eq_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_eq_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_eq_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_eq_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_eq_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_eq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_eq_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_eq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_eq_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_eq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_eq_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_eq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_eq_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_eq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_eq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_eq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_eq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_eq_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_eq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_eq_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_eq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_le_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_eq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_le_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x47,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_le_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_eq_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_le_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_le_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_le_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_le_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x47,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_le_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_le_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_le_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_le_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_le_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_le_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_le_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_le_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_le_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_le_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_le_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_le_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_le_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_le_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_le_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_le_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_le_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_le_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_le_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_le_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_gt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_le_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_gt_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_le_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_gt_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x49,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_le_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_gt_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_le_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_gt_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_gt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_gt_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_gt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_gt_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x49,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_gt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_gt_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_gt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_gt_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_gt_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_gt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_gt_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_gt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_gt_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_gt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_gt_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_gt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_gt_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_gt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_gt_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_gt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_gt_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_gt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_gt_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_gt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_gt_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_gt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_gt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_gt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_gt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_gt_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_gt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_gt_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_gt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_lg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_lg_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_gt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_lg_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x4b,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_lg_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_gt_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_lg_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_lg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_lg_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_lg_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_lg_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_lg_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x4b,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_lg_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_lg_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_lg_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_lg_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_lg_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_lg_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_lg_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_lg_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_lg_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_lg_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_lg_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_lg_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_lg_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_lg_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_lg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_lg_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_lg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_lg_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_lg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_lg_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_lg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_lg_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_lg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_lg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_lg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_lg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_lg_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_lg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_lg_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_lg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_ge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_lg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_ge_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_lg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_ge_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x4d,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_lg_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_ge_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_lg_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_ge_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_ge_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_ge_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_ge_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_ge_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x4d,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_ge_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_ge_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_ge_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_ge_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_ge_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_ge_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_ge_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_ge_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_ge_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_ge_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_ge_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_ge_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_ge_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_ge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_ge_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_ge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_ge_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_ge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_ge_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_ge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_ge_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_ge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_ge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_ge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_ge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_ge_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_ge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_ge_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_ge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_o_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_ge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_o_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x4f,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_o_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_ge_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_o_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_o_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_o_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x4f,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_o_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_o_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_o_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_o_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_o_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_o_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_o_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_o_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_u_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_u_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_u_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x51,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_o_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_u_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_o_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_u_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_u_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_u_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_u_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_u_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_u_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x51,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_u_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_u_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_u_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_u_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_u_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_u_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_u_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_u_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_u_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_u_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_u_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_u_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_u_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_u_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_u_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_u_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_u_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_u_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_u_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_u_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_u_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_u_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_u_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_u_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_u_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_u_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_u_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_u_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_u_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_u_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_nge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_u_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_nge_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_u_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_nge_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x53,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_u_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_nge_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_u_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_nge_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nge_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_nge_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_nge_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_nge_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x53,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nge_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_nge_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_nge_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_nge_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nge_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_nge_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_nge_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_nge_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_nge_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_nge_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_nge_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_nge_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_nge_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nge_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_nge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_nge_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_nge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_nge_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_nge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_nge_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_nge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_nge_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_nge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_nge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_nge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_nge_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_nge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_nge_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_nge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_nlg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_nlg_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_nge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_nlg_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x55,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nge_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_nlg_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_nge_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_nlg_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nlg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nlg_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_nlg_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_nlg_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_nlg_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x55,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nlg_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_nlg_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_nlg_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_nlg_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nlg_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_nlg_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_nlg_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_nlg_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_nlg_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_nlg_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_nlg_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_nlg_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_nlg_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nlg_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_nlg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_nlg_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_nlg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_nlg_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_nlg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_nlg_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_nlg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_nlg_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nlg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_nlg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_nlg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_nlg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_nlg_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_nlg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_nlg_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_nlg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_ngt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nlg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_ngt_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_nlg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_ngt_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x57,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nlg_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_ngt_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_nlg_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_ngt_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_ngt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_ngt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_ngt_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_ngt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_ngt_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x57,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_ngt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_ngt_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_ngt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_ngt_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_ngt_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_ngt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_ngt_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_ngt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_ngt_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_ngt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_ngt_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_ngt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_ngt_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_ngt_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_ngt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_ngt_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_ngt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_ngt_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_ngt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_ngt_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_ngt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_ngt_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_ngt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_ngt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_ngt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_ngt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_ngt_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_ngt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_ngt_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_ngt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_nle_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_ngt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_nle_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_ngt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_nle_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x59,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_ngt_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_nle_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_ngt_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_nle_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nle_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nle_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_nle_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_nle_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_nle_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x59,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nle_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_nle_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_nle_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_nle_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nle_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_nle_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_nle_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_nle_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_nle_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_nle_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_nle_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_nle_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_nle_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nle_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_nle_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_nle_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_nle_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_nle_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_nle_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_nle_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_nle_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_nle_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nle_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_nle_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_nle_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_nle_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_nle_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_nle_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_nle_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_nle_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_neq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nle_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_neq_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_nle_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_neq_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x5b,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nle_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_neq_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_nle_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_neq_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_neq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_neq_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_neq_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_neq_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_neq_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x5b,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_neq_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_neq_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_neq_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_neq_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_neq_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_neq_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_neq_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_neq_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_neq_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_neq_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_neq_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_neq_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_neq_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_neq_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_neq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_neq_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_neq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_neq_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_neq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_neq_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_neq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_neq_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_neq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_neq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_neq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_neq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_neq_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_neq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_neq_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_neq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_nlt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_neq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_nlt_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_neq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_nlt_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x5d,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_neq_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_nlt_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_neq_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_nlt_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nlt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nlt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_nlt_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_nlt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_nlt_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x5d,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nlt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_nlt_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_nlt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_nlt_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nlt_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_nlt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_nlt_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_nlt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_nlt_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_nlt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_nlt_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_nlt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_nlt_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nlt_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_nlt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_nlt_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_nlt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_nlt_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_nlt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_nlt_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_nlt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_nlt_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nlt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_nlt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_nlt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_nlt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_nlt_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_nlt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_nlt_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_nlt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_tru_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nlt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_tru_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_nlt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_tru_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x5f,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nlt_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_tru_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_nlt_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_tru_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_tru_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_tru_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_tru_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_tru_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_tru_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x5f,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_tru_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_tru_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_tru_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_tru_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_tru_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_tru_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_tru_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_tru_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_tru_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_tru_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_tru_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_tru_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_tru_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_tru_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_tru_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_tru_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_tru_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_tru_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_tru_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_tru_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_tru_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_tru_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_tru_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_tru_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_tru_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_tru_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_tru_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_tru_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_tru_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_tru_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_f_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_tru_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_f_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_tru_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_f_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x61,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_tru_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_f_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_tru_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_f_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_f_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_f_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_f_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_f_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x61,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_f_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_f_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_f_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_f_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_f_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_f_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_f_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_f_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_f_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_f_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_f_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_f_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_f_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_f_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_f_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_f_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_f_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_f_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_f_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_f_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_f_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_f_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_f_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_f_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_f_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_f_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_f_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_f_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_f_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_lt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_lt_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_f_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_lt_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x63,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_lt_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_f_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_lt_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_lt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_lt_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_lt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_lt_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x63,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_lt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_lt_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_lt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_lt_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_lt_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_lt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_lt_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_lt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_lt_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_lt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_lt_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_lt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_lt_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_lt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_lt_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_lt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_lt_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_lt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_lt_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_lt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_lt_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_lt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_lt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_lt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_lt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_lt_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_lt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_lt_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_lt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_eq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_eq_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_lt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_eq_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x65,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_eq_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_lt_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_eq_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_eq_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_eq_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_eq_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_eq_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x65,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_eq_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_eq_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_eq_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_eq_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_eq_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_eq_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_eq_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_eq_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_eq_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_eq_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_eq_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_eq_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_eq_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_eq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_eq_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_eq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_eq_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_eq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_eq_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_eq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_eq_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_eq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_eq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_eq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_eq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_eq_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_eq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_eq_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_eq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_le_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_le_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_eq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_le_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x67,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_le_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_eq_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_le_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_le_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_le_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_le_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_le_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x67,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_le_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_le_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_le_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_le_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_le_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_le_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_le_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_le_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_le_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_le_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_le_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_le_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_le_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_le_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_le_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_le_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_le_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_le_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_le_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_le_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_le_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_le_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_le_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_le_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_le_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_le_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_le_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_le_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_le_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_gt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_gt_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_le_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_gt_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x69,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_gt_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_le_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_gt_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_gt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_gt_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_gt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_gt_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x69,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_gt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_gt_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_gt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_gt_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_gt_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_gt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_gt_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_gt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_gt_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_gt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_gt_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_gt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_gt_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_gt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_gt_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_gt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_gt_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_gt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_gt_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_gt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_gt_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_gt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_gt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_gt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_gt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_gt_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_gt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_gt_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_gt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_lg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_lg_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_gt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_lg_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x6b,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_lg_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_gt_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_lg_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_lg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_lg_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_lg_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_lg_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_lg_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x6b,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_lg_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_lg_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_lg_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_lg_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_lg_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_lg_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_lg_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_lg_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_lg_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_lg_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_lg_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_lg_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_lg_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_lg_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_lg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_lg_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_lg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_lg_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_lg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_lg_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_lg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_lg_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_lg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_lg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_lg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_lg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_lg_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_lg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_lg_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_lg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_ge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_lg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_ge_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_lg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_ge_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x6d,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_lg_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_ge_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_lg_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_ge_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_ge_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_ge_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_ge_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_ge_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x6d,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_ge_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_ge_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_ge_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_ge_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_ge_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_ge_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_ge_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_ge_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_ge_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_ge_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_ge_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_ge_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_ge_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_ge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_ge_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_ge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_ge_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_ge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_ge_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_ge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_ge_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_ge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_ge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_ge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_ge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_ge_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_ge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_ge_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_ge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_o_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_ge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_o_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x6f,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_o_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_ge_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_o_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_o_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_o_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_o_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_o_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x6f,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_o_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_o_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_o_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_o_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_o_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_o_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_o_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_o_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_o_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_o_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_o_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_o_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_o_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_o_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_o_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_o_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_o_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_o_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_u_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_u_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_o_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_u_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x71,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_o_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_u_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_o_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_u_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_u_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_u_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_u_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_u_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_u_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x71,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_u_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_u_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_u_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_u_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_u_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_u_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_u_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_u_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_u_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_u_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_u_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_u_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_u_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_u_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_u_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_u_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_u_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_u_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_u_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_u_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_u_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_u_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_u_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_u_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_u_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_u_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_u_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_u_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_u_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_u_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_nge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_u_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_nge_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_u_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_nge_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x73,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_u_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_nge_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_u_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_nge_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nge_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_nge_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_nge_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_nge_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x73,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nge_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_nge_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_nge_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_nge_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nge_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_nge_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_nge_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_nge_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_nge_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_nge_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_nge_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_nge_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_nge_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nge_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_nge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_nge_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_nge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_nge_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_nge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_nge_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_nge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_nge_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_nge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_nge_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_nge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_nge_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_nge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_nge_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_nge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_nlg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_nlg_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_nge_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_nlg_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x75,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nge_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_nlg_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_nge_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_nlg_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nlg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nlg_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_nlg_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_nlg_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_nlg_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x75,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nlg_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_nlg_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_nlg_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_nlg_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nlg_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_nlg_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_nlg_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_nlg_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_nlg_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_nlg_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_nlg_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_nlg_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_nlg_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nlg_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_nlg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_nlg_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_nlg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_nlg_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_nlg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_nlg_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_nlg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_nlg_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nlg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_nlg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_nlg_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_nlg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_nlg_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_nlg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_nlg_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_nlg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_ngt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nlg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_ngt_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_nlg_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_ngt_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x77,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nlg_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_ngt_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_nlg_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_ngt_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_ngt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_ngt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_ngt_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_ngt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_ngt_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x77,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_ngt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_ngt_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_ngt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_ngt_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_ngt_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_ngt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_ngt_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_ngt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_ngt_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_ngt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_ngt_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_ngt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_ngt_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_ngt_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_ngt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_ngt_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_ngt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_ngt_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_ngt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_ngt_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_ngt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_ngt_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_ngt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_ngt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_ngt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_ngt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_ngt_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_ngt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_ngt_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_ngt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_nle_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_ngt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_nle_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_ngt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_nle_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x79,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_ngt_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_nle_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_ngt_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_nle_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nle_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nle_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_nle_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_nle_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_nle_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x79,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nle_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_nle_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_nle_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_nle_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nle_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_nle_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_nle_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_nle_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_nle_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_nle_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_nle_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_nle_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_nle_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nle_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_nle_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_nle_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_nle_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_nle_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_nle_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_nle_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_nle_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_nle_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nle_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_nle_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_nle_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_nle_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_nle_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_nle_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_nle_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_nle_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_neq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nle_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_neq_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_nle_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_neq_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x7b,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nle_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_neq_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_nle_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_neq_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_neq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_neq_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_neq_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_neq_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_neq_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x7b,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_neq_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_neq_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_neq_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_neq_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_neq_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_neq_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_neq_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_neq_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_neq_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_neq_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_neq_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_neq_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_neq_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_neq_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_neq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_neq_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_neq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_neq_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_neq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_neq_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_neq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_neq_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_neq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_neq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_neq_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_neq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_neq_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_neq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_neq_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_neq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_nlt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_neq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_nlt_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_neq_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_nlt_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x7d,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_neq_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_nlt_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_neq_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_nlt_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nlt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nlt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_nlt_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_nlt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_nlt_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x7d,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nlt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_nlt_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_nlt_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_nlt_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nlt_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_nlt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_nlt_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_nlt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_nlt_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_nlt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_nlt_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_nlt_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_nlt_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nlt_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_nlt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_nlt_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_nlt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_nlt_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_nlt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_nlt_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_nlt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_nlt_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nlt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_nlt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_nlt_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_nlt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_nlt_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_nlt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_nlt_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_nlt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_tru_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nlt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_tru_f16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_nlt_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_tru_f16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x7f,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nlt_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_tru_f16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_nlt_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_tru_f16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_tru_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_tru_f16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_tru_f16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_tru_f16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_tru_f16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x7f,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_tru_f16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_tru_f16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_tru_f16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_tru_f16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_tru_f16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_tru_f16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_tru_f16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_tru_f16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_tru_f16_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_tru_f16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_tru_f16_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_tru_f16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_tru_f16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_tru_f16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_tru_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_tru_f16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_tru_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_tru_f16_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_tru_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_tru_f16_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_tru_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_tru_f16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_tru_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_tru_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_tru_f16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_tru_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_tru_f16_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_tru_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_tru_f16_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_tru_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_f_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_tru_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_f_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_tru_f16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_f_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x81,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_tru_f16_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_f_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_tru_f16_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_f_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_f_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_f_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_f_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_f_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_f_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x81,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_f_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_f_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_f_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_f_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_f_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_f_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_f_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_f_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_f_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_f_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_f_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_f_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_f_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_f_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_f_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_f_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_f_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_f_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_f_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_f_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_f_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_f_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_f_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_f_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_f_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_f_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_f_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_f_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_f_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_f_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_lt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_f_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_lt_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_f_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_lt_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x83,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_f_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_lt_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_f_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_lt_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_lt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_lt_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_lt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_lt_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x83,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_lt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_lt_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_lt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_lt_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_lt_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_lt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_lt_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_lt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_lt_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_lt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_lt_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_lt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_lt_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_lt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_lt_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_lt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_lt_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_lt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_lt_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_lt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_lt_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_lt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_lt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_lt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_lt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_lt_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_lt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_lt_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_lt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_eq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_eq_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_lt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_eq_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x85,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_eq_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_lt_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_eq_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_eq_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_eq_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_eq_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_eq_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x85,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_eq_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_eq_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_eq_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_eq_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_eq_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_eq_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_eq_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_eq_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_eq_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_eq_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_eq_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_eq_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_eq_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_eq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_eq_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_eq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_eq_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_eq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_eq_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_eq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_eq_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_eq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_eq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_eq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_eq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_eq_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_eq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_eq_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_eq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_le_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_le_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_eq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_le_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x87,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_le_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_eq_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_le_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_le_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_le_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_le_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_le_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_le_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x87,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_le_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_le_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_le_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_le_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_le_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_le_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_le_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_le_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_le_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_le_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_le_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_le_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_le_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_le_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_le_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_le_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_le_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_le_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_le_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_le_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_le_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_le_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_le_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_le_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_le_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_le_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_le_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_le_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_le_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_le_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_gt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_le_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_gt_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_le_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_gt_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x89,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_le_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_gt_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_le_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_gt_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_gt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_gt_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_gt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_gt_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x89,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_gt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_gt_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_gt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_gt_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_gt_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_gt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_gt_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_gt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_gt_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_gt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_gt_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_gt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_gt_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_gt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_gt_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_gt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_gt_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_gt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_gt_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_gt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_gt_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_gt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_gt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_gt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_gt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_gt_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_gt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_gt_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_gt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_lg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_lg_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_gt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_lg_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x8b,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_lg_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_gt_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_lg_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_lg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_lg_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_lg_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_lg_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_lg_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x8b,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_lg_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_lg_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_lg_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_lg_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_lg_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_lg_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_lg_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_lg_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_lg_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_lg_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_lg_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_lg_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_lg_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_lg_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_lg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_lg_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_lg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_lg_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_lg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_lg_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_lg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_lg_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_lg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_lg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_lg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_lg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_lg_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_lg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_lg_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_lg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_ge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_lg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_ge_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_lg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_ge_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x8d,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_lg_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_ge_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_lg_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_ge_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_ge_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_ge_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_ge_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_ge_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x8d,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_ge_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_ge_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_ge_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_ge_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_ge_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_ge_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_ge_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_ge_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_ge_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_ge_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_ge_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_ge_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_ge_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_ge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_ge_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_ge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_ge_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_ge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_ge_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_ge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_ge_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_ge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_ge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_ge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_ge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_ge_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_ge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_ge_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_ge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_o_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_o_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_ge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_o_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x8f,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_o_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_ge_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_o_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_o_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_o_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_o_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_o_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_o_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x8f,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_o_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_o_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_o_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_o_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_o_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_o_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_o_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_o_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_o_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_o_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_o_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_o_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_o_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_o_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_o_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_o_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_o_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_o_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_o_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_o_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_o_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_o_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_o_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_o_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_o_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_o_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_o_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_o_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_o_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_o_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_u_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_o_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_u_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_o_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_u_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x91,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_o_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_u_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_o_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_u_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_u_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_u_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_u_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_u_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_u_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x91,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_u_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_u_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_u_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_u_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_u_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_u_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_u_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_u_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_u_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_u_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_u_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_u_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_u_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_u_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_u_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_u_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_u_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_u_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_u_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_u_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_u_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_u_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_u_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_u_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_u_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_u_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_u_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_u_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_u_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_u_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_nge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_u_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_nge_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_u_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_nge_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x93,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_u_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_nge_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_u_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_nge_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nge_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_nge_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_nge_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_nge_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x93,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nge_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_nge_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_nge_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_nge_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nge_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_nge_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_nge_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_nge_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_nge_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_nge_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_nge_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_nge_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_nge_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nge_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_nge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_nge_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_nge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_nge_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_nge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_nge_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_nge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_nge_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_nge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_nge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_nge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_nge_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_nge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_nge_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_nge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_nlg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_nlg_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_nge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_nlg_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x95,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nge_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_nlg_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_nge_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_nlg_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nlg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nlg_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_nlg_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_nlg_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_nlg_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x95,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nlg_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_nlg_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_nlg_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_nlg_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nlg_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_nlg_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_nlg_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_nlg_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_nlg_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_nlg_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_nlg_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_nlg_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_nlg_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nlg_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_nlg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_nlg_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_nlg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_nlg_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_nlg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_nlg_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_nlg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_nlg_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nlg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_nlg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_nlg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_nlg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_nlg_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_nlg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_nlg_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_nlg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_ngt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nlg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_ngt_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_nlg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_ngt_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x97,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nlg_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_ngt_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_nlg_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_ngt_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_ngt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_ngt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_ngt_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_ngt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_ngt_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x97,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_ngt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_ngt_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_ngt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_ngt_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_ngt_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_ngt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_ngt_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_ngt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_ngt_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_ngt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_ngt_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_ngt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_ngt_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_ngt_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_ngt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_ngt_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_ngt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_ngt_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_ngt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_ngt_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_ngt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_ngt_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_ngt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_ngt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_ngt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_ngt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_ngt_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_ngt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_ngt_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_ngt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_nle_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_ngt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_nle_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_ngt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_nle_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x99,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_ngt_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_nle_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_ngt_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_nle_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nle_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nle_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_nle_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_nle_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_nle_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x99,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nle_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_nle_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_nle_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_nle_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nle_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_nle_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_nle_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_nle_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_nle_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_nle_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_nle_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_nle_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_nle_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nle_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_nle_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_nle_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_nle_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_nle_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_nle_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_nle_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_nle_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_nle_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nle_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_nle_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_nle_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_nle_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_nle_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_nle_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_nle_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_nle_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_neq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nle_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_neq_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_nle_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_neq_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x9b,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nle_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_neq_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_nle_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_neq_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_neq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_neq_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_neq_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_neq_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_neq_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x9b,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_neq_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_neq_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_neq_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_neq_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_neq_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_neq_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_neq_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_neq_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_neq_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_neq_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_neq_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_neq_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_neq_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_neq_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_neq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_neq_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_neq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_neq_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_neq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_neq_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_neq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_neq_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_neq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_neq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_neq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_neq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_neq_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_neq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_neq_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_neq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_nlt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_neq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_nlt_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_neq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_nlt_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x9d,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_neq_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_nlt_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_neq_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_nlt_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nlt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nlt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_nlt_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_nlt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_nlt_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x9d,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nlt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_nlt_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_nlt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_nlt_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nlt_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_nlt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_nlt_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_nlt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_nlt_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_nlt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_nlt_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_nlt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_nlt_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nlt_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_nlt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_nlt_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_nlt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_nlt_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_nlt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_nlt_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_nlt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_nlt_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_nlt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_nlt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_nlt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_nlt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_nlt_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_nlt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_nlt_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_nlt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_tru_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nlt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_tru_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_nlt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_tru_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x9f,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_nlt_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_tru_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_nlt_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_tru_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_tru_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_tru_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmp_tru_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7c,0xff,0x16,0x06,0x06]
+v_cmp_tru_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmp_tru_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x9f,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_tru_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmp_tru_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x36,0x06,0x06]
+v_cmp_tru_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmp_tru_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_tru_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmp_tru_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x16,0x00,0x06]
+v_cmp_tru_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmp_tru_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x16,0x01,0x06]
+v_cmp_tru_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmp_tru_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x16,0x02,0x06]
+v_cmp_tru_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmp_tru_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x16,0x03,0x06]
+v_cmp_tru_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_tru_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x16,0x04,0x06]
+v_cmp_tru_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmp_tru_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x16,0x05,0x06]
+v_cmp_tru_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmp_tru_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x16,0x16,0x06]
+v_cmp_tru_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmp_tru_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x16,0x26,0x06]
+v_cmp_tru_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmp_tru_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x16,0x06,0x06]
+v_cmp_tru_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmp_tru_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x16,0x06,0x00]
+v_cmp_tru_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmp_tru_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x16,0x06,0x01]
+v_cmp_tru_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmp_tru_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x16,0x06,0x02]
+v_cmp_tru_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmp_tru_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_f_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_tru_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_f_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmp_tru_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_f_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xa1,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmp_tru_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_f_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmp_tru_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_f_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_f_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_f_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_f_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_f_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xa1,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_f_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_f_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_f_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_f_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_f_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_f_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_f_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_f_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_f_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_f_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_f_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_f_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_f_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_f_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_f_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_f_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_f_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_f_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_f_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_f_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_f_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_f_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_f_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_f_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_f_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_f_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_f_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_f_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_f_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_lt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_lt_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_f_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_lt_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xa3,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_lt_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_f_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_lt_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_lt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_lt_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_lt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_lt_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xa3,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_lt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_lt_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_lt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_lt_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_lt_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_lt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_lt_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_lt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_lt_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_lt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_lt_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_lt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_lt_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_lt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_lt_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_lt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_lt_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_lt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_lt_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_lt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_lt_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_lt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_lt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_lt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_lt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_lt_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_lt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_lt_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_lt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_eq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_eq_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_lt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_eq_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xa5,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_eq_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_lt_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_eq_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_eq_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_eq_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_eq_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_eq_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xa5,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_eq_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_eq_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_eq_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_eq_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_eq_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_eq_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_eq_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_eq_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_eq_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_eq_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_eq_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_eq_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_eq_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_eq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_eq_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_eq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_eq_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_eq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_eq_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_eq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_eq_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_eq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_eq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_eq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_eq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_eq_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_eq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_eq_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_eq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_le_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_le_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_eq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_le_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xa7,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_le_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_eq_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_le_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_le_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_le_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_le_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_le_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xa7,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_le_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_le_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_le_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_le_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_le_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_le_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_le_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_le_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_le_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_le_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_le_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_le_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_le_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_le_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_le_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_le_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_le_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_le_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_le_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_le_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_le_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_le_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_le_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_le_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_le_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_le_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_le_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_le_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_le_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_gt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_gt_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_le_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_gt_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xa9,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_gt_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_le_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_gt_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_gt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_gt_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_gt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_gt_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xa9,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_gt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_gt_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_gt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_gt_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_gt_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_gt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_gt_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_gt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_gt_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_gt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_gt_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_gt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_gt_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_gt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_gt_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_gt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_gt_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_gt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_gt_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_gt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_gt_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_gt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_gt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_gt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_gt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_gt_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_gt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_gt_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_gt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_lg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_lg_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_gt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_lg_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xab,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_lg_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_gt_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_lg_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_lg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_lg_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_lg_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_lg_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_lg_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xab,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_lg_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_lg_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_lg_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_lg_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_lg_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_lg_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_lg_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_lg_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_lg_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_lg_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_lg_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_lg_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_lg_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_lg_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_lg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_lg_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_lg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_lg_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_lg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_lg_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_lg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_lg_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_lg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_lg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_lg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_lg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_lg_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_lg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_lg_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_lg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_ge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_lg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_ge_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_lg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_ge_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xad,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_lg_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_ge_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_lg_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_ge_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_ge_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_ge_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_ge_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_ge_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xad,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_ge_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_ge_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_ge_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_ge_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_ge_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_ge_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_ge_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_ge_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_ge_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_ge_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_ge_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_ge_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_ge_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_ge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_ge_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_ge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_ge_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_ge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_ge_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_ge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_ge_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_ge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_ge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_ge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_ge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_ge_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_ge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_ge_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_ge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_o_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_o_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_ge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_o_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xaf,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_o_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_ge_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_o_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_o_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_o_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_o_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_o_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_o_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xaf,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_o_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_o_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_o_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_o_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_o_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_o_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_o_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_o_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_o_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_o_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_o_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_o_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_o_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_o_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_o_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_o_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_o_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_o_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_o_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_o_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_o_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_o_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_o_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_o_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_o_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_o_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_o_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_o_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_o_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_o_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_u_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_o_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_u_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_o_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_u_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xb1,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_o_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_u_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_o_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_u_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_u_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_u_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_u_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_u_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_u_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xb1,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_u_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_u_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_u_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_u_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_u_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_u_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_u_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_u_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_u_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_u_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_u_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_u_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_u_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_u_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_u_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_u_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_u_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_u_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_u_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_u_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_u_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_u_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_u_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_u_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_u_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_u_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_u_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_u_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_u_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_u_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_nge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_u_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_nge_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_u_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_nge_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xb3,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_u_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_nge_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_u_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_nge_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nge_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_nge_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_nge_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_nge_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xb3,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nge_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_nge_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_nge_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_nge_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nge_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_nge_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_nge_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_nge_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_nge_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_nge_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_nge_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_nge_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_nge_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nge_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_nge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_nge_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_nge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_nge_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_nge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_nge_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_nge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_nge_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_nge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_nge_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_nge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_nge_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_nge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_nge_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_nge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_nlg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_nlg_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_nge_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_nlg_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xb5,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nge_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_nlg_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_nge_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_nlg_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nlg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nlg_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_nlg_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_nlg_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_nlg_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xb5,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nlg_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_nlg_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_nlg_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_nlg_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nlg_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_nlg_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_nlg_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_nlg_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_nlg_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_nlg_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_nlg_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_nlg_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_nlg_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nlg_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_nlg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_nlg_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_nlg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_nlg_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_nlg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_nlg_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_nlg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_nlg_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nlg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_nlg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_nlg_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_nlg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_nlg_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_nlg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_nlg_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_nlg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_ngt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nlg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_ngt_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_nlg_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_ngt_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xb7,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nlg_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_ngt_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_nlg_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_ngt_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_ngt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_ngt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_ngt_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_ngt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_ngt_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xb7,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_ngt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_ngt_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_ngt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_ngt_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_ngt_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_ngt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_ngt_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_ngt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_ngt_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_ngt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_ngt_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_ngt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_ngt_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_ngt_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_ngt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_ngt_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_ngt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_ngt_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_ngt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_ngt_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_ngt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_ngt_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_ngt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_ngt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_ngt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_ngt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_ngt_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_ngt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_ngt_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_ngt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_nle_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_ngt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_nle_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_ngt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_nle_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xb9,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_ngt_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_nle_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_ngt_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_nle_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nle_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nle_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_nle_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_nle_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_nle_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xb9,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nle_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_nle_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_nle_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_nle_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nle_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_nle_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_nle_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_nle_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_nle_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_nle_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_nle_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_nle_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_nle_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nle_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_nle_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_nle_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_nle_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_nle_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_nle_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_nle_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_nle_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_nle_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nle_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_nle_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_nle_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_nle_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_nle_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_nle_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_nle_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_nle_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_neq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nle_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_neq_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_nle_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_neq_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xbb,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nle_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_neq_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_nle_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_neq_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_neq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_neq_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_neq_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_neq_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_neq_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xbb,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_neq_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_neq_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_neq_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_neq_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_neq_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_neq_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_neq_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_neq_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_neq_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_neq_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_neq_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_neq_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_neq_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_neq_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_neq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_neq_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_neq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_neq_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_neq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_neq_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_neq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_neq_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_neq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_neq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_neq_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_neq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_neq_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_neq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_neq_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_neq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_nlt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_neq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_nlt_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_neq_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_nlt_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xbd,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_neq_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_nlt_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_neq_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_nlt_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nlt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nlt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_nlt_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_nlt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_nlt_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xbd,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nlt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_nlt_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_nlt_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_nlt_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nlt_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_nlt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_nlt_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_nlt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_nlt_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_nlt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_nlt_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_nlt_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_nlt_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nlt_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_nlt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_nlt_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_nlt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_nlt_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_nlt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_nlt_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_nlt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_nlt_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_nlt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_nlt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_nlt_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_nlt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_nlt_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_nlt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_nlt_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_nlt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x16,0x06,0x03]
+v_cmpx_tru_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nlt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x16,0x06,0x04]
+v_cmpx_tru_f32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7c,0xff,0x16,0x06,0x06]
 
-v_cmpx_nlt_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x16,0x06,0x05]
+v_cmpx_tru_f32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xbf,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_nlt_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x16,0x06,0x16]
+v_cmpx_tru_f32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x36,0x06,0x06]
 
-v_cmpx_nlt_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7c,0x00,0x16,0x06,0x26]
+v_cmpx_tru_f32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_tru_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_tru_f32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x16,0x00,0x06]
 
-v_cmpx_tru_f32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7c,0xff,0x16,0x06,0x06]
+v_cmpx_tru_f32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x16,0x01,0x06]
 
-v_cmpx_tru_f32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xbf,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_tru_f32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x16,0x02,0x06]
 
-v_cmpx_tru_f32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x36,0x06,0x06]
+v_cmpx_tru_f32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x16,0x03,0x06]
 
-v_cmpx_tru_f32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_tru_f32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x16,0x04,0x06]
 
-v_cmpx_tru_f32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x16,0x00,0x06]
+v_cmpx_tru_f32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x16,0x05,0x06]
 
-v_cmpx_tru_f32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x16,0x01,0x06]
+v_cmpx_tru_f32_sdwa vcc, -v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x16,0x16,0x06]
 
-v_cmpx_tru_f32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x16,0x02,0x06]
+v_cmpx_tru_f32_sdwa vcc, |v1|, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x16,0x26,0x06]
 
-v_cmpx_tru_f32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x16,0x03,0x06]
+v_cmpx_tru_f32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x16,0x06,0x06]
 
-v_cmpx_tru_f32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x16,0x04,0x06]
+v_cmpx_tru_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x16,0x06,0x00]
 
-v_cmpx_tru_f32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x16,0x05,0x06]
+v_cmpx_tru_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x16,0x06,0x01]
 
-v_cmpx_tru_f32_sdwa vcc, -v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x16,0x16,0x06]
+v_cmpx_tru_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x16,0x06,0x02]
 
-v_cmpx_tru_f32_sdwa vcc, |v0|, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x16,0x26,0x06]
+v_cmpx_tru_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x16,0x06,0x03]
 
-v_cmpx_tru_f32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x16,0x06,0x06]
+v_cmpx_tru_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x16,0x06,0x04]
 
-v_cmpx_tru_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x16,0x06,0x00]
+v_cmpx_tru_f32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x16,0x06,0x05]
 
-v_cmpx_tru_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x16,0x06,0x01]
+v_cmpx_tru_f32_sdwa vcc, v1, -v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x16,0x06,0x16]
 
-v_cmpx_tru_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x16,0x06,0x02]
+v_cmpx_tru_f32_sdwa vcc, v1, |v2| src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7c,0x01,0x16,0x06,0x26]
 
-v_cmpx_tru_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x16,0x06,0x03]
+v_cmp_f_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_tru_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x16,0x06,0x04]
+v_cmp_f_i16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_tru_f32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x16,0x06,0x05]
+v_cmp_f_i16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x41,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_tru_f32_sdwa vcc, v0, -v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x16,0x06,0x16]
+v_cmp_f_i16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_tru_f32_sdwa vcc, v0, |v0| src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7c,0x00,0x16,0x06,0x26]
+v_cmp_f_i16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_f_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_f_i16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_f_i16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_f_i16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_f_i16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x41,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_f_i16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_f_i16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_f_i16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_f_i16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_f_i16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_f_i16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_f_i16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_f_i16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_f_i16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_f_i16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_f_i16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_f_i16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_f_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x40,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_f_i16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_f_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x40,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_f_i16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_f_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x40,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_f_i16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_f_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x40,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_f_i16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_f_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x40,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_f_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x40,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_f_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x40,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_f_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x40,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_f_i16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x40,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_f_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x40,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_lt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_f_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x40,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_lt_i16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_f_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x40,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_lt_i16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x43,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_f_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x40,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_lt_i16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_f_i16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x40,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_lt_i16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_lt_i16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_lt_i16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_lt_i16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_lt_i16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x43,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_lt_i16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_lt_i16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_lt_i16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_lt_i16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_lt_i16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_lt_i16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_lt_i16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_lt_i16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_lt_i16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_lt_i16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_lt_i16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_i16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_lt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x42,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_lt_i16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_lt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x42,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_lt_i16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_lt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x42,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_lt_i16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_lt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x42,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_lt_i16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_lt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x42,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_lt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x42,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_lt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x42,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_lt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x42,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_lt_i16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x42,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_lt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x42,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_eq_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x42,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_eq_i16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_lt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x42,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_eq_i16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x45,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x42,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_eq_i16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_lt_i16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x42,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_eq_i16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_eq_i16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_eq_i16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_eq_i16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_eq_i16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x45,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_eq_i16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_eq_i16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_eq_i16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_eq_i16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_eq_i16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_eq_i16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_eq_i16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_eq_i16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_eq_i16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_eq_i16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_eq_i16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_i16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_eq_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x44,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_eq_i16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_eq_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x44,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_eq_i16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_eq_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x44,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_eq_i16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_eq_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x44,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_eq_i16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_eq_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x44,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_eq_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x44,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_eq_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x44,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_eq_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x44,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_eq_i16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x44,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_eq_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x44,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_le_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x44,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_le_i16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_eq_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x44,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_le_i16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x47,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x44,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_le_i16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_eq_i16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x44,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_le_i16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_le_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_le_i16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_le_i16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_le_i16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_le_i16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x47,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_le_i16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_le_i16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_le_i16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_le_i16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_le_i16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_le_i16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_le_i16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_le_i16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_le_i16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_le_i16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_le_i16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_le_i16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_le_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x46,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_le_i16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_le_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x46,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_le_i16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_le_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x46,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_le_i16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_le_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x46,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_le_i16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_le_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x46,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_le_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x46,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_le_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x46,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_le_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x46,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_le_i16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x46,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_le_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x46,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_gt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_le_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x46,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_gt_i16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_le_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x46,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_gt_i16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x49,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_le_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x46,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_gt_i16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_le_i16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x46,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_gt_i16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_gt_i16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_gt_i16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_gt_i16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_gt_i16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x49,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_gt_i16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_gt_i16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_gt_i16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_gt_i16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_gt_i16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_gt_i16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_gt_i16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_gt_i16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_gt_i16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_gt_i16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_gt_i16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_i16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_gt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x48,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_gt_i16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_gt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x48,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_gt_i16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_gt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x48,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_gt_i16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_gt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x48,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_gt_i16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_gt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x48,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_gt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x48,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_gt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x48,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_gt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x48,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_gt_i16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x48,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_gt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x48,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_ne_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x48,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_ne_i16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_gt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x48,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_ne_i16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x4b,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x48,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_ne_i16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_gt_i16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x48,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_ne_i16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ne_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ne_i16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_ne_i16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_ne_i16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_ne_i16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x4b,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ne_i16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_ne_i16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_ne_i16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_ne_i16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ne_i16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_ne_i16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_ne_i16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_ne_i16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_ne_i16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_ne_i16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_ne_i16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ne_i16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_ne_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x4a,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_ne_i16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_ne_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x4a,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_ne_i16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_ne_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x4a,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_ne_i16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_ne_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x4a,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_ne_i16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ne_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x4a,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_ne_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x4a,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_ne_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x4a,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_ne_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x4a,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_ne_i16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4a,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_ne_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x4a,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_ge_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ne_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x4a,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_ge_i16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_ne_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x4a,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_ge_i16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x4d,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ne_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x4a,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_ge_i16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_ne_i16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4a,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_ge_i16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ge_i16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_ge_i16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_ge_i16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_ge_i16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x4d,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ge_i16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_ge_i16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_ge_i16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_ge_i16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ge_i16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_ge_i16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_ge_i16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_ge_i16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_ge_i16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_ge_i16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_ge_i16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_i16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_ge_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x4c,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_ge_i16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_ge_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x4c,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_ge_i16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_ge_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x4c,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_ge_i16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_ge_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x4c,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_ge_i16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ge_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x4c,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_ge_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x4c,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_ge_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x4c,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_ge_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x4c,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_ge_i16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4c,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_ge_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x4c,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_t_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x4c,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_t_i16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_ge_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x4c,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_t_i16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x4f,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x4c,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_t_i16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_ge_i16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4c,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_t_i16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_t_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_t_i16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_t_i16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_t_i16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_t_i16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x4f,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_t_i16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_t_i16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_t_i16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_t_i16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_t_i16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_t_i16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_t_i16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_t_i16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_t_i16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_t_i16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_t_i16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_t_i16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_t_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x4e,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_t_i16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_t_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x4e,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_t_i16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_t_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x4e,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_t_i16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_t_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x4e,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_t_i16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_t_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x4e,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_t_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x4e,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_t_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x4e,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_t_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x4e,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_t_i16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x4e,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_t_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x4e,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_f_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_t_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x4e,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_f_u16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_t_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x4e,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_f_u16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x51,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_t_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x4e,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_f_u16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_t_i16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x4e,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_f_u16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_f_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_f_u16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_f_u16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_f_u16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_f_u16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x51,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_f_u16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_f_u16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_f_u16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_f_u16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_f_u16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_f_u16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_f_u16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_f_u16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_f_u16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_f_u16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_f_u16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_f_u16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_f_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x50,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_f_u16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_f_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x50,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_f_u16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_f_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x50,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_f_u16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_f_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x50,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_f_u16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_f_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x50,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_f_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x50,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_f_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x50,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_f_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x50,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_f_u16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x50,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_f_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x50,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_lt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_f_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x50,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_lt_u16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_f_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x50,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_lt_u16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x53,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_f_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x50,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_lt_u16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_f_u16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x50,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_lt_u16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_lt_u16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_lt_u16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_lt_u16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_lt_u16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x53,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_lt_u16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_lt_u16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_lt_u16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_lt_u16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_lt_u16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_lt_u16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_lt_u16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_lt_u16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_lt_u16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_lt_u16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_lt_u16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_u16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_lt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x52,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_lt_u16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_lt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x52,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_lt_u16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_lt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x52,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_lt_u16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_lt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x52,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_lt_u16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_lt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x52,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_lt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x52,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_lt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x52,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_lt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x52,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_lt_u16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x52,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_lt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x52,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_eq_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x52,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_eq_u16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_lt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x52,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_eq_u16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x55,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x52,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_eq_u16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_lt_u16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x52,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_eq_u16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_eq_u16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_eq_u16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_eq_u16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_eq_u16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x55,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_eq_u16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_eq_u16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_eq_u16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_eq_u16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_eq_u16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_eq_u16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_eq_u16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_eq_u16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_eq_u16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_eq_u16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_eq_u16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_u16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_eq_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x54,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_eq_u16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_eq_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x54,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_eq_u16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_eq_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x54,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_eq_u16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_eq_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x54,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_eq_u16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_eq_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x54,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_eq_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x54,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_eq_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x54,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_eq_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x54,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_eq_u16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x54,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_eq_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x54,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_le_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x54,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_le_u16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_eq_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x54,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_le_u16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x57,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x54,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_le_u16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_eq_u16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x54,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_le_u16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_le_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_le_u16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_le_u16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_le_u16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_le_u16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x57,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_le_u16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_le_u16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_le_u16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_le_u16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_le_u16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_le_u16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_le_u16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_le_u16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_le_u16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_le_u16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_le_u16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_le_u16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_le_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x56,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_le_u16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_le_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x56,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_le_u16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_le_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x56,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_le_u16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_le_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x56,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_le_u16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_le_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x56,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_le_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x56,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_le_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x56,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_le_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x56,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_le_u16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x56,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_le_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x56,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_gt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_le_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x56,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_gt_u16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_le_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x56,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_gt_u16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x59,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_le_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x56,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_gt_u16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_le_u16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x56,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_gt_u16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_gt_u16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_gt_u16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_gt_u16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_gt_u16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x59,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_gt_u16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_gt_u16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_gt_u16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_gt_u16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_gt_u16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_gt_u16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_gt_u16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_gt_u16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_gt_u16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_gt_u16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_gt_u16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_u16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_gt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x58,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_gt_u16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_gt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x58,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_gt_u16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_gt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x58,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_gt_u16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_gt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x58,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_gt_u16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_gt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x58,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_gt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x58,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_gt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x58,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_gt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x58,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_gt_u16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x58,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_gt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x58,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_ne_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x58,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_ne_u16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_gt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x58,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_ne_u16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x5b,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x58,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_ne_u16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_gt_u16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x58,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_ne_u16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ne_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ne_u16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_ne_u16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_ne_u16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_ne_u16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x5b,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ne_u16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_ne_u16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_ne_u16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_ne_u16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ne_u16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_ne_u16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_ne_u16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_ne_u16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_ne_u16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_ne_u16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_ne_u16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ne_u16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_ne_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x5a,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_ne_u16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_ne_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x5a,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_ne_u16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_ne_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x5a,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_ne_u16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_ne_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x5a,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_ne_u16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ne_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x5a,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_ne_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x5a,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_ne_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x5a,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_ne_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x5a,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_ne_u16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5a,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_ne_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x5a,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_ge_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ne_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x5a,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_ge_u16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_ne_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x5a,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_ge_u16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x5d,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ne_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x5a,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_ge_u16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_ne_u16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5a,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_ge_u16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ge_u16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_ge_u16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_ge_u16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_ge_u16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x5d,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ge_u16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_ge_u16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_ge_u16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_ge_u16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ge_u16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_ge_u16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_ge_u16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_ge_u16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_ge_u16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_ge_u16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_ge_u16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_u16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_ge_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x5c,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_ge_u16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_ge_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x5c,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_ge_u16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_ge_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x5c,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_ge_u16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_ge_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x5c,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_ge_u16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ge_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x5c,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_ge_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x5c,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_ge_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x5c,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_ge_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x5c,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_ge_u16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5c,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_ge_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x5c,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_t_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x5c,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_t_u16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_ge_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x5c,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_t_u16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x5f,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x5c,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_t_u16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_ge_u16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5c,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_t_u16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_t_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_t_u16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_t_u16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_t_u16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_t_u16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x5f,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_t_u16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_t_u16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_t_u16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_t_u16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_t_u16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_t_u16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_t_u16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_t_u16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_t_u16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_t_u16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_t_u16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_t_u16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_t_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x5e,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_t_u16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_t_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x5e,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_t_u16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_t_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x5e,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_t_u16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_t_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x5e,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_t_u16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_t_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x5e,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_t_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x5e,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_t_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x5e,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_t_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x5e,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_t_u16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x5e,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_t_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x5e,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_f_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_t_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x5e,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_f_i16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_t_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x5e,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_f_i16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x61,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_t_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x5e,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_f_i16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_t_u16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x5e,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_f_i16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_f_i16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_f_i16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_f_i16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_f_i16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x61,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_f_i16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_f_i16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_f_i16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_f_i16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_f_i16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_f_i16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_f_i16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_f_i16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_f_i16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_f_i16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_f_i16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_i16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_f_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x60,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_f_i16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_f_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x60,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_f_i16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_f_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x60,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_f_i16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_f_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x60,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_f_i16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_f_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x60,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_f_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x60,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_f_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x60,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_f_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x60,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_f_i16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x60,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_f_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x60,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_lt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x60,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_lt_i16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_f_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x60,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_lt_i16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x63,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x60,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_lt_i16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_f_i16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x60,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_lt_i16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_lt_i16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_lt_i16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_lt_i16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_lt_i16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x63,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_lt_i16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_lt_i16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_lt_i16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_lt_i16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_lt_i16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_lt_i16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_lt_i16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_lt_i16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_lt_i16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_lt_i16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_lt_i16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_i16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_lt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x62,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_lt_i16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_lt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x62,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_lt_i16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_lt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x62,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_lt_i16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_lt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x62,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_lt_i16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_lt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x62,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_lt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x62,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_lt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x62,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_lt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x62,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_lt_i16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x62,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_lt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x62,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_eq_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x62,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_eq_i16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_lt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x62,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_eq_i16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x65,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x62,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_eq_i16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_lt_i16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x62,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_eq_i16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_eq_i16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_eq_i16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_eq_i16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_eq_i16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x65,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_eq_i16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_eq_i16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_eq_i16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_eq_i16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_eq_i16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_eq_i16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_eq_i16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_eq_i16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_eq_i16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_eq_i16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_eq_i16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_i16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_eq_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x64,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_eq_i16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_eq_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x64,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_eq_i16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_eq_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x64,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_eq_i16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_eq_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x64,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_eq_i16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_eq_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x64,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_eq_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x64,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_eq_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x64,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_eq_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x64,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_eq_i16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x64,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_eq_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x64,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_le_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x64,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_le_i16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_eq_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x64,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_le_i16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x67,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x64,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_le_i16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_eq_i16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x64,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_le_i16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_le_i16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_le_i16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_le_i16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_le_i16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x67,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_le_i16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_le_i16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_le_i16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_le_i16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_le_i16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_le_i16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_le_i16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_le_i16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_le_i16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_le_i16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_le_i16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_i16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_le_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x66,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_le_i16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_le_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x66,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_le_i16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_le_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x66,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_le_i16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_le_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x66,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_le_i16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_le_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x66,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_le_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x66,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_le_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x66,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_le_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x66,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_le_i16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x66,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_le_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x66,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_gt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x66,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_gt_i16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_le_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x66,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_gt_i16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x69,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x66,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_gt_i16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_le_i16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x66,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_gt_i16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_gt_i16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_gt_i16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_gt_i16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_gt_i16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x69,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_gt_i16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_gt_i16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_gt_i16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_gt_i16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_gt_i16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_gt_i16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_gt_i16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_gt_i16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_gt_i16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_gt_i16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_gt_i16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_i16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_gt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x68,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_gt_i16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_gt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x68,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_gt_i16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_gt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x68,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_gt_i16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_gt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x68,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_gt_i16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_gt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x68,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_gt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x68,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_gt_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x68,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_gt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x68,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_gt_i16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x68,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_gt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x68,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_ne_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x68,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_ne_i16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_gt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x68,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_ne_i16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x6b,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x68,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_ne_i16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_gt_i16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x68,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_ne_i16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ne_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ne_i16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_ne_i16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_ne_i16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_ne_i16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x6b,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ne_i16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_ne_i16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_ne_i16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_ne_i16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ne_i16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_ne_i16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_ne_i16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_ne_i16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_ne_i16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_ne_i16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_ne_i16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ne_i16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_ne_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x6a,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_ne_i16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_ne_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x6a,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_ne_i16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_ne_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x6a,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_ne_i16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_ne_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x6a,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_ne_i16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ne_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x6a,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_ne_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x6a,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_ne_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x6a,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_ne_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x6a,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_ne_i16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6a,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_ne_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x6a,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_ge_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ne_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x6a,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_ge_i16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_ne_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x6a,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_ge_i16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x6d,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ne_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x6a,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_ge_i16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_ne_i16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6a,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_ge_i16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ge_i16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_ge_i16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_ge_i16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_ge_i16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x6d,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ge_i16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_ge_i16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_ge_i16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_ge_i16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ge_i16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_ge_i16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_ge_i16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_ge_i16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_ge_i16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_ge_i16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_ge_i16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_i16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_ge_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x6c,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_ge_i16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_ge_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x6c,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_ge_i16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_ge_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x6c,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_ge_i16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_ge_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x6c,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_ge_i16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ge_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x6c,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_ge_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x6c,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_ge_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x6c,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_ge_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x6c,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_ge_i16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6c,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_ge_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x6c,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_t_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x6c,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_t_i16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_ge_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x6c,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_t_i16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x6f,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x6c,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_t_i16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_ge_i16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6c,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_t_i16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_t_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_t_i16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_t_i16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_t_i16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_t_i16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x6f,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_t_i16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_t_i16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_t_i16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_t_i16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_t_i16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_t_i16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_t_i16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_t_i16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_t_i16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_t_i16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_t_i16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_t_i16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_t_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x6e,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_t_i16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_t_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x6e,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_t_i16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_t_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x6e,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_t_i16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_t_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x6e,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_t_i16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_t_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x6e,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_t_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x6e,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_t_i16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x6e,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_t_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x6e,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_t_i16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x6e,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_t_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x6e,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_f_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_t_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x6e,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_f_u16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_t_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x6e,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_f_u16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x71,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_t_i16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x6e,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_f_u16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_t_i16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x6e,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_f_u16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_f_u16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_f_u16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_f_u16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_f_u16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x71,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_f_u16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_f_u16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_f_u16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_f_u16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_f_u16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_f_u16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_f_u16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_f_u16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_f_u16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_f_u16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_f_u16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_u16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_f_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x70,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_f_u16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_f_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x70,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_f_u16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_f_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x70,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_f_u16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_f_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x70,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_f_u16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_f_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x70,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_f_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x70,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_f_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x70,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_f_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x70,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_f_u16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x70,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_f_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x70,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_lt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x70,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_lt_u16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_f_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x70,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_lt_u16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x73,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x70,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_lt_u16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_f_u16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x70,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_lt_u16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_lt_u16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_lt_u16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_lt_u16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_lt_u16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x73,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_lt_u16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_lt_u16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_lt_u16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_lt_u16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_lt_u16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_lt_u16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_lt_u16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_lt_u16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_lt_u16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_lt_u16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_lt_u16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_u16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_lt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x72,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_lt_u16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_lt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x72,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_lt_u16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_lt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x72,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_lt_u16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_lt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x72,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_lt_u16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_lt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x72,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_lt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x72,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_lt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x72,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_lt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x72,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_lt_u16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x72,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_lt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x72,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_eq_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x72,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_eq_u16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_lt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x72,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_eq_u16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x75,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x72,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_eq_u16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_lt_u16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x72,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_eq_u16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_eq_u16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_eq_u16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_eq_u16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_eq_u16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x75,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_eq_u16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_eq_u16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_eq_u16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_eq_u16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_eq_u16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_eq_u16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_eq_u16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_eq_u16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_eq_u16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_eq_u16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_eq_u16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_u16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_eq_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x74,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_eq_u16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_eq_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x74,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_eq_u16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_eq_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x74,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_eq_u16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_eq_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x74,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_eq_u16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_eq_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x74,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_eq_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x74,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_eq_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x74,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_eq_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x74,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_eq_u16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x74,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_eq_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x74,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_le_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x74,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_le_u16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_eq_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x74,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_le_u16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x77,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x74,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_le_u16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_eq_u16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x74,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_le_u16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_le_u16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_le_u16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_le_u16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_le_u16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x77,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_le_u16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_le_u16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_le_u16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_le_u16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_le_u16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_le_u16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_le_u16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_le_u16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_le_u16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_le_u16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_le_u16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_u16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_le_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x76,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_le_u16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_le_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x76,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_le_u16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_le_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x76,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_le_u16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_le_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x76,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_le_u16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_le_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x76,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_le_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x76,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_le_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x76,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_le_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x76,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_le_u16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x76,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_le_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x76,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_gt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x76,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_gt_u16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_le_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x76,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_gt_u16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x79,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x76,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_gt_u16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_le_u16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x76,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_gt_u16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_gt_u16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_gt_u16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_gt_u16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_gt_u16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x79,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_gt_u16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_gt_u16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_gt_u16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_gt_u16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_gt_u16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_gt_u16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_gt_u16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_gt_u16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_gt_u16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_gt_u16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_gt_u16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_u16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_gt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x78,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_gt_u16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_gt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x78,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_gt_u16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_gt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x78,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_gt_u16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_gt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x78,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_gt_u16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_gt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x78,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_gt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x78,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_gt_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x78,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_gt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x78,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_gt_u16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x78,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_gt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x78,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_ne_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x78,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_ne_u16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_gt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x78,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_ne_u16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x7b,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x78,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_ne_u16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_gt_u16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x78,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_ne_u16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ne_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ne_u16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_ne_u16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_ne_u16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_ne_u16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x7b,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ne_u16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_ne_u16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_ne_u16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_ne_u16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ne_u16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_ne_u16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_ne_u16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_ne_u16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_ne_u16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_ne_u16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_ne_u16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ne_u16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_ne_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x7a,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_ne_u16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_ne_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x7a,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_ne_u16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_ne_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x7a,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_ne_u16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_ne_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x7a,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_ne_u16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ne_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x7a,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_ne_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x7a,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_ne_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x7a,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_ne_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x7a,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_ne_u16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7a,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_ne_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x7a,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_ge_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ne_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x7a,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_ge_u16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_ne_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x7a,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_ge_u16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x7d,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ne_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x7a,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_ge_u16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_ne_u16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7a,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_ge_u16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ge_u16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_ge_u16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_ge_u16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_ge_u16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x7d,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ge_u16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_ge_u16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_ge_u16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_ge_u16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ge_u16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_ge_u16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_ge_u16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_ge_u16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_ge_u16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_ge_u16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_ge_u16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_u16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_ge_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x7c,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_ge_u16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_ge_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x7c,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_ge_u16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_ge_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x7c,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_ge_u16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_ge_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x7c,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_ge_u16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ge_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x7c,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_ge_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x7c,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_ge_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x7c,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_ge_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x7c,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_ge_u16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7c,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_ge_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x7c,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_t_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x7c,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_t_u16_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_ge_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x7c,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_t_u16_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x7f,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x7c,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_t_u16_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_ge_u16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7c,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_t_u16_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_t_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_t_u16_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_t_u16_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_t_u16_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_t_u16_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x7f,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_t_u16_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_t_u16_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_t_u16_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_t_u16_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_t_u16_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_t_u16_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_t_u16_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_t_u16_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_t_u16_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_t_u16_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_t_u16_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_t_u16_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_t_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x7e,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_t_u16_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_t_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x7e,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_t_u16_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_t_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x7e,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_t_u16_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_t_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x7e,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_t_u16_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_t_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x7e,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_t_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x7e,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_t_u16_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x7e,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_t_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x7e,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_t_u16_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x7e,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_t_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x7e,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_f_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_t_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x7e,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_f_i32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_t_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x7e,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_f_i32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x81,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_t_u16_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x7e,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_f_i32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_t_u16_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x7e,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_f_i32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_f_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_f_i32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_f_i32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_f_i32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_f_i32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x81,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_f_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_f_i32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_f_i32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_f_i32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_f_i32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_f_i32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_f_i32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_f_i32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_f_i32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_f_i32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_f_i32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_f_i32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_f_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x80,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_f_i32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_f_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x80,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_f_i32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_f_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x80,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_f_i32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_f_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x80,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_f_i32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_f_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x80,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_f_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x80,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_f_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x80,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_f_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x80,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_f_i32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x80,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_f_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x80,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_lt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_f_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x80,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_lt_i32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_f_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x80,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_lt_i32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x83,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_f_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x80,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_lt_i32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_f_i32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x80,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_lt_i32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_lt_i32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_lt_i32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_lt_i32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_lt_i32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x83,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_lt_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_lt_i32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_lt_i32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_lt_i32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_lt_i32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_lt_i32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_lt_i32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_lt_i32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_lt_i32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_lt_i32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_lt_i32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_i32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_lt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x82,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_lt_i32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_lt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x82,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_lt_i32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_lt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x82,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_lt_i32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_lt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x82,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_lt_i32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_lt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x82,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_lt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x82,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_lt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x82,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_lt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x82,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_lt_i32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x82,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_lt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x82,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_eq_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x82,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_eq_i32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_lt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x82,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_eq_i32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x85,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x82,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_eq_i32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_lt_i32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x82,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_eq_i32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_eq_i32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_eq_i32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_eq_i32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_eq_i32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x85,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_eq_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_eq_i32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_eq_i32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_eq_i32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_eq_i32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_eq_i32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_eq_i32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_eq_i32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_eq_i32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_eq_i32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_eq_i32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_i32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_eq_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x84,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_eq_i32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_eq_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x84,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_eq_i32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_eq_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x84,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_eq_i32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_eq_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x84,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_eq_i32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_eq_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x84,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_eq_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x84,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_eq_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x84,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_eq_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x84,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_eq_i32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x84,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_eq_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x84,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_le_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x84,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_le_i32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_eq_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x84,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_le_i32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x87,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x84,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_le_i32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_eq_i32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x84,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_le_i32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_le_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_le_i32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_le_i32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_le_i32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_le_i32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x87,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_le_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_le_i32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_le_i32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_le_i32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_le_i32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_le_i32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_le_i32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_le_i32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_le_i32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_le_i32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_le_i32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_le_i32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_le_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x86,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_le_i32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_le_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x86,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_le_i32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_le_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x86,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_le_i32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_le_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x86,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_le_i32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_le_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x86,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_le_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x86,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_le_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x86,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_le_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x86,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_le_i32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x86,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_le_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x86,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_gt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_le_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x86,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_gt_i32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_le_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x86,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_gt_i32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x89,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_le_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x86,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_gt_i32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_le_i32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x86,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_gt_i32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_gt_i32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_gt_i32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_gt_i32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_gt_i32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x89,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_gt_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_gt_i32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_gt_i32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_gt_i32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_gt_i32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_gt_i32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_gt_i32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_gt_i32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_gt_i32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_gt_i32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_gt_i32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_i32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_gt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x88,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_gt_i32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_gt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x88,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_gt_i32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_gt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x88,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_gt_i32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_gt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x88,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_gt_i32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_gt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x88,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_gt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x88,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_gt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x88,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_gt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x88,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_gt_i32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x88,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_gt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x88,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_ne_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x88,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_ne_i32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_gt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x88,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_ne_i32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x8b,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x88,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_ne_i32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_gt_i32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x88,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_ne_i32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ne_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ne_i32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_ne_i32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_ne_i32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_ne_i32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x8b,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ne_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_ne_i32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_ne_i32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_ne_i32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ne_i32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_ne_i32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_ne_i32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_ne_i32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_ne_i32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_ne_i32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_ne_i32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ne_i32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_ne_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x8a,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_ne_i32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_ne_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x8a,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_ne_i32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_ne_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x8a,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_ne_i32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_ne_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x8a,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_ne_i32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ne_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x8a,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_ne_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x8a,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_ne_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x8a,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_ne_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x8a,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_ne_i32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8a,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_ne_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x8a,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_ge_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ne_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x8a,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_ge_i32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_ne_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x8a,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_ge_i32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x8d,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ne_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x8a,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_ge_i32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_ne_i32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8a,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_ge_i32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ge_i32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_ge_i32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_ge_i32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_ge_i32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x8d,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ge_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_ge_i32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_ge_i32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_ge_i32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ge_i32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_ge_i32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_ge_i32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_ge_i32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_ge_i32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_ge_i32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_ge_i32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_i32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_ge_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x8c,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_ge_i32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_ge_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x8c,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_ge_i32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_ge_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x8c,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_ge_i32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_ge_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x8c,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_ge_i32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ge_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x8c,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_ge_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x8c,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_ge_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x8c,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_ge_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x8c,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_ge_i32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8c,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_ge_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x8c,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_t_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x8c,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_t_i32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_ge_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x8c,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_t_i32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x8f,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x8c,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_t_i32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_ge_i32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8c,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_t_i32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_t_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_t_i32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_t_i32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_t_i32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_t_i32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x8f,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_t_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_t_i32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_t_i32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_t_i32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_t_i32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_t_i32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_t_i32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_t_i32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_t_i32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_t_i32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_t_i32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_t_i32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_t_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x8e,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_t_i32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_t_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x8e,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_t_i32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_t_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x8e,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_t_i32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_t_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x8e,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_t_i32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_t_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x8e,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_t_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x8e,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_t_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x8e,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_t_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x8e,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_t_i32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x8e,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_t_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x8e,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_f_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_t_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x8e,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_f_u32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_t_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x8e,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_f_u32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x91,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_t_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x8e,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_f_u32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_t_i32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x8e,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_f_u32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_f_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_f_u32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_f_u32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_f_u32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_f_u32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x91,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_f_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_f_u32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_f_u32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_f_u32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_f_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_f_u32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_f_u32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_f_u32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_f_u32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_f_u32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_f_u32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_f_u32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_f_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x90,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_f_u32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_f_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x90,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_f_u32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_f_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x90,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_f_u32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_f_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x90,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_f_u32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_f_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x90,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_f_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x90,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_f_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x90,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_f_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x90,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_f_u32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x90,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_f_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x90,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_f_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x90,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_lt_u32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_f_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x90,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_lt_u32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x93,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_f_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x90,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_lt_u32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_f_u32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x90,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_lt_u32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_lt_u32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_lt_u32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x93,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_lt_u32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_lt_u32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_lt_u32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_lt_u32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_lt_u32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_lt_u32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_u32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x92,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_lt_u32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x92,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_lt_u32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x92,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_lt_u32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x92,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_lt_u32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x92,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_lt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x92,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_lt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x92,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_lt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x92,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_lt_u32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x92,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_lt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x92,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_eq_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x92,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_eq_u32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_lt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x92,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_eq_u32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x95,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_lt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x92,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_eq_u32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_lt_u32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x92,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_eq_u32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_eq_u32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_eq_u32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_eq_u32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_eq_u32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x95,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_eq_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_eq_u32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_eq_u32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_eq_u32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_eq_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_eq_u32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_eq_u32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_eq_u32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_eq_u32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_eq_u32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_eq_u32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_u32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_eq_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x94,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_eq_u32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_eq_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x94,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_eq_u32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_eq_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x94,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_eq_u32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_eq_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x94,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_eq_u32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_eq_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x94,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_eq_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x94,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_eq_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x94,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_eq_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x94,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_eq_u32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x94,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_eq_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x94,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_le_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x94,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_le_u32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_eq_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x94,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_le_u32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x97,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_eq_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x94,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_le_u32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_eq_u32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x94,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_le_u32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_le_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_le_u32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_le_u32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_le_u32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_le_u32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x97,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_le_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_le_u32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_le_u32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_le_u32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_le_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_le_u32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_le_u32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_le_u32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_le_u32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_le_u32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_le_u32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_le_u32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_le_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x96,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_le_u32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_le_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x96,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_le_u32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_le_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x96,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_le_u32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_le_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x96,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_le_u32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_le_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x96,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_le_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x96,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_le_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x96,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_le_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x96,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_le_u32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x96,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_le_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x96,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_gt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_le_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x96,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_gt_u32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_le_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x96,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_gt_u32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x99,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_le_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x96,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_gt_u32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_le_u32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x96,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_gt_u32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_gt_u32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_gt_u32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_gt_u32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_gt_u32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x99,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_gt_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_gt_u32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_gt_u32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_gt_u32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_gt_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_gt_u32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_gt_u32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_gt_u32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_gt_u32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_gt_u32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_gt_u32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_u32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_gt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x98,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_gt_u32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_gt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x98,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_gt_u32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_gt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x98,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_gt_u32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_gt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x98,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_gt_u32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_gt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x98,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_gt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x98,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_gt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x98,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_gt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x98,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_gt_u32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x98,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_gt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x98,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_ne_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x98,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_ne_u32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_gt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x98,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_ne_u32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x9b,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_gt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x98,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_ne_u32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_gt_u32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x98,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_ne_u32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ne_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ne_u32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_ne_u32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_ne_u32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_ne_u32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x9b,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ne_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_ne_u32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_ne_u32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_ne_u32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ne_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_ne_u32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_ne_u32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_ne_u32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_ne_u32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_ne_u32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_ne_u32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ne_u32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_ne_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x9a,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_ne_u32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_ne_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x9a,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_ne_u32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_ne_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x9a,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_ne_u32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_ne_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x9a,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_ne_u32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ne_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x9a,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_ne_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x9a,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_ne_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x9a,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_ne_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x9a,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_ne_u32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9a,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_ne_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x9a,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_ge_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ne_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x9a,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_ge_u32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_ne_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x9a,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_ge_u32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x9d,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ne_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x9a,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_ge_u32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_ne_u32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9a,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_ge_u32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ge_u32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_ge_u32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_ge_u32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_ge_u32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x9d,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ge_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_ge_u32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_ge_u32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_ge_u32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ge_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_ge_u32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_ge_u32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_ge_u32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_ge_u32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_ge_u32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_ge_u32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_u32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_ge_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x9c,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_ge_u32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_ge_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x9c,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_ge_u32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_ge_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x9c,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_ge_u32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_ge_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x9c,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_ge_u32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_ge_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x9c,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_ge_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x9c,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_ge_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x9c,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_ge_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x9c,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_ge_u32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9c,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_ge_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x9c,0x7d,0x00,0x16,0x06,0x02]
+v_cmp_t_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x9c,0x7d,0x00,0x16,0x06,0x03]
+v_cmp_t_u32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_ge_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x9c,0x7d,0x00,0x16,0x06,0x04]
+v_cmp_t_u32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0x9f,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_ge_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x9c,0x7d,0x00,0x16,0x06,0x05]
+v_cmp_t_u32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_ge_u32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9c,0x7d,0x00,0x16,0x06,0x0e]
+v_cmp_t_u32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_t_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_t_u32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmp_t_u32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7d,0xff,0x16,0x06,0x06]
+v_cmp_t_u32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmp_t_u32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0x9f,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_t_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmp_t_u32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7d,0x00,0x36,0x06,0x06]
+v_cmp_t_u32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmp_t_u32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_t_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmp_t_u32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7d,0x00,0x16,0x00,0x06]
+v_cmp_t_u32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmp_t_u32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7d,0x00,0x16,0x01,0x06]
+v_cmp_t_u32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmp_t_u32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7d,0x00,0x16,0x02,0x06]
+v_cmp_t_u32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_t_u32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7d,0x00,0x16,0x03,0x06]
+v_cmp_t_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0x9e,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmp_t_u32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7d,0x00,0x16,0x04,0x06]
+v_cmp_t_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0x9e,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmp_t_u32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7d,0x00,0x16,0x05,0x06]
+v_cmp_t_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0x9e,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmp_t_u32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7d,0x00,0x16,0x0e,0x06]
+v_cmp_t_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0x9e,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmp_t_u32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7d,0x00,0x16,0x06,0x06]
+v_cmp_t_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0x9e,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmp_t_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0x9e,0x7d,0x00,0x16,0x06,0x00]
+v_cmp_t_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0x9e,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmp_t_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0x9e,0x7d,0x00,0x16,0x06,0x01]
+v_cmp_t_u32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0x9e,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmp_t_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0x9e,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_f_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_t_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0x9e,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_f_i32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmp_t_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0x9e,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_f_i32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xa1,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmp_t_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0x9e,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_f_i32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmp_t_u32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0x9e,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_f_i32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_f_i32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_f_i32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_f_i32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_f_i32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xa1,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_f_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_f_i32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_f_i32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_f_i32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_f_i32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_f_i32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_f_i32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_f_i32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_f_i32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_f_i32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_f_i32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_i32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_f_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xa0,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_f_i32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_f_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xa0,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_f_i32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_f_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xa0,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_f_i32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_f_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xa0,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_f_i32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_f_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xa0,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_f_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xa0,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_f_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xa0,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_f_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xa0,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_f_i32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa0,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_f_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xa0,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_lt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xa0,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_lt_i32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_f_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xa0,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_lt_i32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xa3,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xa0,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_lt_i32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_f_i32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa0,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_lt_i32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_lt_i32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_lt_i32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_lt_i32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_lt_i32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xa3,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_lt_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_lt_i32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_lt_i32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_lt_i32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_lt_i32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_lt_i32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_lt_i32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_lt_i32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_lt_i32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_lt_i32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_lt_i32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_i32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_lt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xa2,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_lt_i32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_lt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xa2,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_lt_i32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_lt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xa2,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_lt_i32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_lt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xa2,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_lt_i32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_lt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xa2,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_lt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xa2,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_lt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xa2,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_lt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xa2,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_lt_i32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa2,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_lt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xa2,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_eq_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xa2,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_eq_i32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_lt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xa2,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_eq_i32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xa5,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xa2,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_eq_i32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_lt_i32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa2,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_eq_i32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_eq_i32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_eq_i32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_eq_i32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_eq_i32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xa5,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_eq_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_eq_i32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_eq_i32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_eq_i32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_eq_i32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_eq_i32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_eq_i32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_eq_i32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_eq_i32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_eq_i32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_eq_i32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_i32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_eq_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xa4,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_eq_i32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_eq_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xa4,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_eq_i32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_eq_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xa4,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_eq_i32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_eq_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xa4,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_eq_i32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_eq_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xa4,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_eq_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xa4,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_eq_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xa4,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_eq_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xa4,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_eq_i32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa4,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_eq_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xa4,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_le_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xa4,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_le_i32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_eq_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xa4,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_le_i32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xa7,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xa4,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_le_i32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_eq_i32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa4,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_le_i32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_le_i32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_le_i32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_le_i32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_le_i32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xa7,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_le_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_le_i32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_le_i32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_le_i32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_le_i32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_le_i32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_le_i32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_le_i32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_le_i32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_le_i32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_le_i32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_i32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_le_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xa6,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_le_i32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_le_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xa6,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_le_i32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_le_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xa6,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_le_i32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_le_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xa6,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_le_i32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_le_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xa6,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_le_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xa6,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_le_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xa6,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_le_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xa6,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_le_i32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa6,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_le_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xa6,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_gt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xa6,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_gt_i32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_le_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xa6,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_gt_i32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xa9,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xa6,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_gt_i32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_le_i32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa6,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_gt_i32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_gt_i32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_gt_i32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_gt_i32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_gt_i32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xa9,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_gt_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_gt_i32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_gt_i32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_gt_i32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_gt_i32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_gt_i32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_gt_i32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_gt_i32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_gt_i32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_gt_i32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_gt_i32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_i32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_gt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xa8,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_gt_i32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_gt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xa8,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_gt_i32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_gt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xa8,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_gt_i32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_gt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xa8,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_gt_i32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_gt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xa8,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_gt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xa8,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_gt_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xa8,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_gt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xa8,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_gt_i32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xa8,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_gt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xa8,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_ne_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xa8,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_ne_i32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_gt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xa8,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_ne_i32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xab,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xa8,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_ne_i32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_gt_i32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xa8,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_ne_i32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ne_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ne_i32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_ne_i32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_ne_i32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_ne_i32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xab,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ne_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_ne_i32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_ne_i32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_ne_i32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ne_i32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_ne_i32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_ne_i32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_ne_i32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_ne_i32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_ne_i32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_ne_i32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ne_i32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_ne_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xaa,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_ne_i32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_ne_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xaa,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_ne_i32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_ne_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xaa,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_ne_i32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_ne_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xaa,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_ne_i32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ne_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xaa,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_ne_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xaa,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_ne_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xaa,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_ne_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xaa,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_ne_i32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xaa,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_ne_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xaa,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_ge_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ne_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xaa,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_ge_i32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_ne_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xaa,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_ge_i32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xad,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ne_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xaa,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_ge_i32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_ne_i32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xaa,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_ge_i32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ge_i32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_ge_i32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_ge_i32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_ge_i32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xad,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ge_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_ge_i32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_ge_i32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_ge_i32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ge_i32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_ge_i32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_ge_i32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_ge_i32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_ge_i32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_ge_i32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_ge_i32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_i32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_ge_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xac,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_ge_i32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_ge_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xac,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_ge_i32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_ge_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xac,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_ge_i32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_ge_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xac,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_ge_i32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ge_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xac,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_ge_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xac,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_ge_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xac,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_ge_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xac,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_ge_i32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xac,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_ge_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xac,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_t_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xac,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_t_i32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_ge_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xac,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_t_i32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xaf,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xac,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_t_i32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_ge_i32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xac,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_t_i32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_t_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_t_i32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_t_i32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_t_i32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_t_i32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xaf,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_t_i32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_t_i32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_t_i32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_t_i32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_t_i32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_t_i32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_t_i32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_t_i32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_t_i32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_t_i32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_t_i32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_t_i32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_t_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xae,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_t_i32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_t_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xae,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_t_i32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_t_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xae,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_t_i32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_t_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xae,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_t_i32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_t_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xae,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_t_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xae,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_t_i32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xae,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_t_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xae,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_t_i32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xae,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_t_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xae,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_f_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_t_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xae,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_f_u32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_t_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xae,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_f_u32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xb1,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_t_i32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xae,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_f_u32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_t_i32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xae,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_f_u32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_f_u32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_f_u32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_f_u32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_f_u32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xb1,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_f_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_f_u32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_f_u32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_f_u32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_f_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_f_u32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_f_u32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_f_u32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_f_u32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_f_u32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_f_u32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_u32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_f_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xb0,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_f_u32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_f_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xb0,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_f_u32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_f_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xb0,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_f_u32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_f_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xb0,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_f_u32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_f_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xb0,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_f_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xb0,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_f_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xb0,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_f_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xb0,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_f_u32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb0,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_f_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xb0,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_lt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xb0,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_lt_u32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_f_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xb0,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_lt_u32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xb3,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_f_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xb0,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_lt_u32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_f_u32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb0,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_lt_u32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_lt_u32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_lt_u32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_lt_u32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_lt_u32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xb3,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_lt_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_lt_u32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_lt_u32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_lt_u32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_lt_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_lt_u32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_lt_u32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_lt_u32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_lt_u32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_lt_u32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_lt_u32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_u32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_lt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xb2,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_lt_u32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_lt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xb2,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_lt_u32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_lt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xb2,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_lt_u32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_lt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xb2,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_lt_u32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_lt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xb2,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_lt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xb2,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_lt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xb2,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_lt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xb2,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_lt_u32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb2,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_lt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xb2,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_eq_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xb2,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_eq_u32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_lt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xb2,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_eq_u32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xb5,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_lt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xb2,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_eq_u32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_lt_u32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb2,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_eq_u32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_eq_u32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_eq_u32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_eq_u32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_eq_u32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xb5,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_eq_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_eq_u32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_eq_u32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_eq_u32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_eq_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_eq_u32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_eq_u32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_eq_u32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_eq_u32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_eq_u32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_eq_u32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_u32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_eq_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xb4,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_eq_u32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_eq_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xb4,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_eq_u32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_eq_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xb4,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_eq_u32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_eq_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xb4,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_eq_u32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_eq_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xb4,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_eq_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xb4,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_eq_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xb4,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_eq_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xb4,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_eq_u32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb4,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_eq_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xb4,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_le_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xb4,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_le_u32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_eq_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xb4,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_le_u32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xb7,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_eq_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xb4,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_le_u32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_eq_u32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb4,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_le_u32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_le_u32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_le_u32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_le_u32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_le_u32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xb7,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_le_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_le_u32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_le_u32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_le_u32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_le_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_le_u32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_le_u32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_le_u32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_le_u32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_le_u32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_le_u32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_u32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_le_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xb6,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_le_u32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_le_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xb6,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_le_u32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_le_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xb6,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_le_u32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_le_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xb6,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_le_u32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_le_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xb6,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_le_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xb6,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_le_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xb6,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_le_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xb6,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_le_u32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb6,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_le_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xb6,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_gt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xb6,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_gt_u32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_le_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xb6,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_gt_u32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xb9,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_le_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xb6,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_gt_u32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_le_u32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb6,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_gt_u32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_gt_u32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_gt_u32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_gt_u32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_gt_u32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xb9,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_gt_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_gt_u32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_gt_u32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_gt_u32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_gt_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_gt_u32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_gt_u32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_gt_u32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_gt_u32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_gt_u32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_gt_u32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_u32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_gt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xb8,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_gt_u32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_gt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xb8,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_gt_u32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_gt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xb8,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_gt_u32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_gt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xb8,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_gt_u32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_gt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xb8,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_gt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xb8,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_gt_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xb8,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_gt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xb8,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_gt_u32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xb8,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_gt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xb8,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_ne_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xb8,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_ne_u32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_gt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xb8,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_ne_u32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xbb,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_gt_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xb8,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_ne_u32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_gt_u32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xb8,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_ne_u32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ne_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ne_u32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_ne_u32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_ne_u32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_ne_u32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xbb,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ne_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_ne_u32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_ne_u32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_ne_u32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ne_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_ne_u32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_ne_u32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_ne_u32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_ne_u32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_ne_u32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_ne_u32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ne_u32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_ne_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xba,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_ne_u32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_ne_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xba,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_ne_u32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_ne_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xba,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_ne_u32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_ne_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xba,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_ne_u32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ne_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xba,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_ne_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xba,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_ne_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xba,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_ne_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xba,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_ne_u32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xba,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_ne_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xba,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_ge_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ne_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xba,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_ge_u32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_ne_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xba,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_ge_u32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xbd,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ne_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xba,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_ge_u32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_ne_u32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xba,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_ge_u32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ge_u32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_ge_u32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_ge_u32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_ge_u32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xbd,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ge_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_ge_u32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_ge_u32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_ge_u32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ge_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_ge_u32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_ge_u32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_ge_u32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_ge_u32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_ge_u32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_ge_u32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_u32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_ge_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xbc,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_ge_u32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_ge_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xbc,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_ge_u32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_ge_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xbc,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_ge_u32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_ge_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xbc,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_ge_u32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_ge_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xbc,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_ge_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xbc,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_ge_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xbc,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_ge_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xbc,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_ge_u32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbc,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_ge_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xbc,0x7d,0x00,0x16,0x06,0x02]
+v_cmpx_t_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xbc,0x7d,0x00,0x16,0x06,0x03]
+v_cmpx_t_u32_sdwa vcc, v255, v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7d,0xff,0x16,0x06,0x06]
 
-v_cmpx_ge_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xbc,0x7d,0x00,0x16,0x06,0x04]
+v_cmpx_t_u32_sdwa vcc, v1, v255 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0xfe,0xbf,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_ge_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xbc,0x7d,0x00,0x16,0x06,0x05]
+v_cmpx_t_u32_sdwa vcc, v1, v2 clamp src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x36,0x06,0x06]
 
-v_cmpx_ge_u32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbc,0x7d,0x00,0x16,0x06,0x0e]
+v_cmpx_t_u32_sdwa vcc, v1, v2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_t_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_t_u32_sdwa vcc, v1, v2 src0_sel:BYTE_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x16,0x00,0x06]
 
-v_cmpx_t_u32_sdwa vcc, v255, v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7d,0xff,0x16,0x06,0x06]
+v_cmpx_t_u32_sdwa vcc, v1, v2 src0_sel:BYTE_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x16,0x01,0x06]
 
-v_cmpx_t_u32_sdwa vcc, v0, v255 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0xfe,0xbf,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_t_u32_sdwa vcc, v1, v2 src0_sel:BYTE_2 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x16,0x02,0x06]
 
-v_cmpx_t_u32_sdwa vcc, v0, v0 clamp src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7d,0x00,0x36,0x06,0x06]
+v_cmpx_t_u32_sdwa vcc, v1, v2 src0_sel:BYTE_3 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x16,0x03,0x06]
 
-v_cmpx_t_u32_sdwa vcc, v0, v0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_t_u32_sdwa vcc, v1, v2 src0_sel:WORD_0 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x16,0x04,0x06]
 
-v_cmpx_t_u32_sdwa vcc, v0, v0 src0_sel:BYTE_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7d,0x00,0x16,0x00,0x06]
+v_cmpx_t_u32_sdwa vcc, v1, v2 src0_sel:WORD_1 src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x16,0x05,0x06]
 
-v_cmpx_t_u32_sdwa vcc, v0, v0 src0_sel:BYTE_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7d,0x00,0x16,0x01,0x06]
+v_cmpx_t_u32_sdwa vcc, sext(v1), v2 src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x16,0x0e,0x06]
 
-v_cmpx_t_u32_sdwa vcc, v0, v0 src0_sel:BYTE_2 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7d,0x00,0x16,0x02,0x06]
+v_cmpx_t_u32_sdwa vcc, v1, v2 src0_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x16,0x06,0x06]
 
-v_cmpx_t_u32_sdwa vcc, v0, v0 src0_sel:BYTE_3 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7d,0x00,0x16,0x03,0x06]
+v_cmpx_t_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_0
+// CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x16,0x06,0x00]
 
-v_cmpx_t_u32_sdwa vcc, v0, v0 src0_sel:WORD_0 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7d,0x00,0x16,0x04,0x06]
+v_cmpx_t_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_1
+// CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x16,0x06,0x01]
 
-v_cmpx_t_u32_sdwa vcc, v0, v0 src0_sel:WORD_1 src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7d,0x00,0x16,0x05,0x06]
+v_cmpx_t_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_2
+// CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x16,0x06,0x02]
 
-v_cmpx_t_u32_sdwa vcc, sext(v0), v0 src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7d,0x00,0x16,0x0e,0x06]
+v_cmpx_t_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:BYTE_3
+// CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x16,0x06,0x03]
 
-v_cmpx_t_u32_sdwa vcc, v0, v0 src0_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7d,0x00,0x16,0x06,0x06]
+v_cmpx_t_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_0
+// CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x16,0x06,0x04]
 
-v_cmpx_t_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_0
-// CHECK: [0xf9,0x00,0xbe,0x7d,0x00,0x16,0x06,0x00]
+v_cmpx_t_u32_sdwa vcc, v1, v2 src0_sel:DWORD src1_sel:WORD_1
+// CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x16,0x06,0x05]
 
-v_cmpx_t_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_1
-// CHECK: [0xf9,0x00,0xbe,0x7d,0x00,0x16,0x06,0x01]
+v_cmpx_t_u32_sdwa vcc, v1, sext(v2) src0_sel:DWORD src1_sel:DWORD
+// CHECK: [0xf9,0x04,0xbe,0x7d,0x01,0x16,0x06,0x0e]
 
-v_cmpx_t_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_2
-// CHECK: [0xf9,0x00,0xbe,0x7d,0x00,0x16,0x06,0x02]
+s_rfe_restore_b64 s[4:5], s2
+// CHECK: [0x04,0x02,0x80,0x95]
 
-v_cmpx_t_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:BYTE_3
-// CHECK: [0xf9,0x00,0xbe,0x7d,0x00,0x16,0x06,0x03]
+v_mov_fed_b32_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0
+// CHECK: [0xfa,0x12,0x0a,0x7e,0x01,0xe4,0x00,0x00]
 
-v_cmpx_t_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_0
-// CHECK: [0xf9,0x00,0xbe,0x7d,0x00,0x16,0x06,0x04]
+v_mov_fed_b32_e64 v5, s1
+// CHECK: [0x05,0x00,0x49,0xd1,0x01,0x00,0x00,0x00]
 
-v_cmpx_t_u32_sdwa vcc, v0, v0 src0_sel:DWORD src1_sel:WORD_1
-// CHECK: [0xf9,0x00,0xbe,0x7d,0x00,0x16,0x06,0x05]
+v_mov_fed_b32_sdwa v5, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD
+// CHECK: [0xf9,0x12,0x0a,0x7e,0x01,0x06,0x06,0x06]
 
-v_cmpx_t_u32_sdwa vcc, v0, sext(v0) src0_sel:DWORD src1_sel:DWORD
-// CHECK: [0xf9,0x00,0xbe,0x7d,0x00,0x16,0x06,0x0e]
+v_perm_b32 v5, s1, 0, v255
+// CHECK: [0x05,0x00,0xed,0xd1,0x01,0x00,0xfd,0x07]
diff --git a/test/MC/AMDGPU/hsa-exp.s b/test/MC/AMDGPU/hsa-exp.s
index cc5dfe82ff493491fb6630aa72e53d0f994cb016..0323056b7bb2ac25a207b89e90ce2f6b1e36a548 100644
--- a/test/MC/AMDGPU/hsa-exp.s
+++ b/test/MC/AMDGPU/hsa-exp.s
@@ -65,7 +65,7 @@ amd_kernel_code_t_minimal:
 // ASM-LABEL: {{^}}amd_kernel_code_t_minimal:
 // ASM: .amd_kernel_code_t
 // ASM:	amd_code_version_major = 7
-// ASM:	amd_code_version_minor = 0
+// ASM:	amd_code_version_minor = 1
 // ASM:	amd_machine_kind = 1
 // ASM:	amd_machine_version_major = 7
 // ASM:	amd_machine_version_minor = 0
diff --git a/test/MC/AMDGPU/hsa.s b/test/MC/AMDGPU/hsa.s
index 21083f3915db2dff9fec468a2359400d1fb54ec2..5f1297e0f376cd43f81ceefde6962cbc11fd3c56 100644
--- a/test/MC/AMDGPU/hsa.s
+++ b/test/MC/AMDGPU/hsa.s
@@ -37,25 +37,19 @@
 .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
 // ASM: .hsa_code_object_isa 7,0,0,"AMD","AMDGPU"
 
-.amdgpu_runtime_metadata
-    {
-        amd.MDVersion: [ 2, 0 ]
-        amd.Kernels: [
-            { amd.KernelName: amd_kernel_code_t_test_all },
-            { amd.KernelName: amd_kernel_code_t_minimal }
-        ]
-    }
-.end_amdgpu_runtime_metadata
+.amdgpu_code_object_metadata
+  Version: [ 3, 0 ]
+  Kernels:
+    - Name: amd_kernel_code_t_test_all
+    - Name: amd_kernel_code_t_minimal
+.end_amdgpu_code_object_metadata
 
-// ASM: .amdgpu_runtime_metadata
-// ASM:     {
-// ASM:         amd.MDVersion: [ 2, 0 ]
-// ASM:         amd.Kernels: [
-// ASM:             { amd.KernelName: amd_kernel_code_t_test_all },
-// ASM:             { amd.KernelName: amd_kernel_code_t_minimal }
-// ASM:         ]
-// ASM:     }
-// ASM: .end_amdgpu_runtime_metadata
+// ASM: .amdgpu_code_object_metadata
+// ASM:    Version: [ 3, 0 ]
+// ASM:    Kernels:
+// ASM:      - Name: amd_kernel_code_t_test_all
+// ASM:      - Name: amd_kernel_code_t_minimal
+// ASM: .end_amdgpu_code_object_metadata
 
 .amdgpu_hsa_kernel amd_kernel_code_t_test_all
 .amdgpu_hsa_kernel amd_kernel_code_t_minimal
@@ -214,7 +208,7 @@ amd_kernel_code_t_minimal:
 // ASM-LABEL: {{^}}amd_kernel_code_t_minimal:
 // ASM: .amd_kernel_code_t
 // ASM:	amd_code_version_major = 1
-// ASM:	amd_code_version_minor = 0
+// ASM:	amd_code_version_minor = 1
 // ASM:	amd_machine_kind = 1
 // ASM:	amd_machine_version_major = 7
 // ASM:	amd_machine_version_minor = 0
diff --git a/test/MC/AMDGPU/literals.s b/test/MC/AMDGPU/literals.s
index af3c47b7ce5001b04e0b9cb51814aa1b3c88c7b5..c18da5dd8ffec09b7229740132719084942bb915 100644
--- a/test/MC/AMDGPU/literals.s
+++ b/test/MC/AMDGPU/literals.s
@@ -248,12 +248,12 @@ v_trunc_f32_e32 v0, -13
 // VI: v_fract_f64_e32 v[0:1], -13 ; encoding: [0xcd,0x64,0x00,0x7e]
 v_fract_f64_e32 v[0:1], -13
 
-// SICI: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x42,0xd3,0x8d,0x00,0x00,0x20]
-// VI: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x5c,0xd1,0x8d,0x00,0x00,0x20]
+// SICI: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x42,0xd3,0xcd,0x00,0x00,0x00]
+// VI: v_trunc_f32_e64 v0, -13 ; encoding: [0x00,0x00,0x5c,0xd1,0xcd,0x00,0x00,0x00]
 v_trunc_f32_e64 v0, -13
 
-// SICI: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x7c,0xd3,0x8d,0x00,0x00,0x20]
-// VI: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x72,0xd1,0x8d,0x00,0x00,0x20]
+// SICI: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x7c,0xd3,0xcd,0x00,0x00,0x00]
+// VI: v_fract_f64_e64 v[0:1], -13 ; encoding: [0x00,0x00,0x72,0xd1,0xcd,0x00,0x00,0x00]
 v_fract_f64_e64 v[0:1], -13
 
 // SICI: v_trunc_f32_e32 v0, 35 ; encoding: [0xa3,0x42,0x00,0x7e]
diff --git a/test/MC/AMDGPU/literalv216-err.s b/test/MC/AMDGPU/literalv216-err.s
new file mode 100644
index 0000000000000000000000000000000000000000..09739024e9e80a3f296e419f703e72e9bc5469b6
--- /dev/null
+++ b/test/MC/AMDGPU/literalv216-err.s
@@ -0,0 +1,22 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx901 -show-encoding %s 2>&1 | FileCheck -check-prefix=GFX9 %s
+
+v_pk_add_f16 v1, -17, v2
+// GFX9: :19: error: invalid operand for instruction
+
+v_pk_add_f16 v1, 65, v2
+// GFX9: :18: error: invalid operand for instruction
+
+v_pk_add_f16 v1, 64.0, v2
+// GFX9: :18: error: invalid operand for instruction
+
+v_pk_add_f16 v1, -0.15915494, v2
+// GFX9: :19: error: invalid operand for instruction
+
+v_pk_add_f16 v1, -0.0, v2
+// GFX9: :19: error: invalid operand for instruction
+
+v_pk_add_f16 v1, -32768, v2
+// GFX9: :19: error: invalid operand for instruction
+
+v_pk_add_f16 v1, 32767, v2
+// GFX9: :18: error: invalid operand for instruction
diff --git a/test/MC/AMDGPU/literalv216.s b/test/MC/AMDGPU/literalv216.s
new file mode 100644
index 0000000000000000000000000000000000000000..1ea05d55d7546b542052cb09d7d9775c1f65374b
--- /dev/null
+++ b/test/MC/AMDGPU/literalv216.s
@@ -0,0 +1,112 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx901 -show-encoding %s | FileCheck -check-prefix=GFX9 %s
+
+v_pk_add_f16 v1, 0, v2
+// GFX9: v_pk_add_f16 v1, 0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0x80,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0.0, v2
+// GFX9: v_pk_add_f16 v1, 0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0x80,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, v2, 0
+// GFX9: v_pk_add_f16 v1, v2, 0 ; encoding: [0x01,0x00,0x8f,0xd3,0x02,0x01,0x01,0x18]
+
+v_pk_add_f16 v1, v2, 0.0
+// GFX9: v_pk_add_f16 v1, v2, 0 ; encoding: [0x01,0x00,0x8f,0xd3,0x02,0x01,0x01,0x18]
+
+v_pk_add_f16 v1, 1.0, v2
+// GFX9: v_pk_add_f16 v1, 1.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf2,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, -1.0, v2
+// GFX9: v_pk_add_f16 v1, -1.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf3,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, -0.5, v2
+// GFX9: v_pk_add_f16 v1, -0.5, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf1,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0.5, v2
+// GFX9: v_pk_add_f16 v1, 0.5, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf0,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 2.0, v2
+// GFX9: v_pk_add_f16 v1, 2.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf4,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, -2.0, v2
+// GFX9: v_pk_add_f16 v1, -2.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf5,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 4.0, v2
+// GFX9: v_pk_add_f16 v1, 4.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf6,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, -4.0, v2
+// GFX9: v_pk_add_f16 v1, -4.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf7,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0.15915494, v2
+// GFX9: v_pk_add_f16 v1, 0.15915494, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf8,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, -1, v2
+// GFX9: v_pk_add_f16 v1, -1, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xc1,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, -2, v2
+// GFX9: v_pk_add_f16 v1, -2, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xc2,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, -3, v2
+// GFX9: v_pk_add_f16 v1, -3, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xc3,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, -16, v2
+// GFX9: v_pk_add_f16 v1, -16, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xd0,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 1, v2
+// GFX9: v_pk_add_f16 v1, 1, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0x81,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 2, v2
+// GFX9: v_pk_add_f16 v1, 2, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0x82,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 3, v2
+// GFX9: v_pk_add_f16 v1, 3, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0x83,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 4, v2
+// GFX9: v_pk_add_f16 v1, 4, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0x84,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 15, v2
+// GFX9: v_pk_add_f16 v1, 15, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0x8f,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 16, v2
+// GFX9: v_pk_add_f16 v1, 16, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0x90,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 63, v2
+// GFX9: v_pk_add_f16 v1, 63, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xbf,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 64, v2
+// GFX9: v_pk_add_f16 v1, 64, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xc0,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0x0001, v2
+// GFX9: v_pk_add_f16 v1, 1, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0x81,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0xffff, v2
+// GFX9: v_pk_add_f16 v1, -1, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xc1,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0x3c00, v2
+// GFX9: v_pk_add_f16 v1, 1.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf2,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0xbc00, v2
+// GFX9: v_pk_add_f16 v1, -1.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf3,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0x3800, v2
+// GFX9: v_pk_add_f16 v1, 0.5, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf0,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0xb800, v2
+// GFX9: v_pk_add_f16 v1, -0.5, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf1,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0x4000, v2
+// GFX9: v_pk_add_f16 v1, 2.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf4,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0xc000, v2
+// GFX9: v_pk_add_f16 v1, -2.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf5,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0x4400, v2
+// GFX9: v_pk_add_f16 v1, 4.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf6,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0xc400, v2
+// GFX9: v_pk_add_f16 v1, -4.0, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf7,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 0x3118, v2
+// GFX9: v_pk_add_f16 v1, 0.15915494, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xf8,0x04,0x02,0x18]
+
+v_pk_add_f16 v1, 65535, v2
+// GFX9: v_pk_add_f16 v1, -1, v2 ; encoding: [0x01,0x00,0x8f,0xd3,0xc1,0x04,0x02,0x18]
diff --git a/test/MC/AMDGPU/metadata.s b/test/MC/AMDGPU/metadata.s
deleted file mode 100644
index 3c009ff590d3c5caabf4ef192f7ce65081010dff..0000000000000000000000000000000000000000
--- a/test/MC/AMDGPU/metadata.s
+++ /dev/null
@@ -1,35 +0,0 @@
-// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | FileCheck %s --check-prefix=ASM
-
-.amdgpu_runtime_metadata
-    { amd.MDVersion: [ 2, 0 ], amd.PrintfInfo: [ '1:1:4:%d\n', '2:1:8:%g\n' ], amd.Kernels:
-
-    - { amd.KernelName: test_char, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-        - { amd.ArgSize: 1, amd.ArgAlign: 1, amd.ArgKind: 0, amd.ArgValueType: 1, amd.ArgTypeName: char, amd.ArgAccQual: 0 }
-        - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-        - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-        - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-        - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-
-   - { amd.KernelName: test_ushort2, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-       - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 4, amd.ArgTypeName: ushort2, amd.ArgAccQual: 0 }
-       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-       - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-   }
-.end_amdgpu_runtime_metadata
-
-// ASM: { amd.MDVersion: [ 2, 0 ], amd.PrintfInfo: [ '1:1:4:%d\n', '2:1:8:%g\n' ], amd.Kernels:
-// ASM: - { amd.KernelName: test_char, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-// ASM:     - { amd.ArgSize: 1, amd.ArgAlign: 1, amd.ArgKind: 0, amd.ArgValueType: 1, amd.ArgTypeName: char, amd.ArgAccQual: 0 }
-// ASM:     - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-// ASM:     - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-// ASM:     - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-// ASM:     - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-// ASM: - { amd.KernelName: test_ushort2, amd.Language: OpenCL C, amd.LanguageVersion: [ 2, 0 ], amd.Args:
-// ASM:    - { amd.ArgSize: 4, amd.ArgAlign: 4, amd.ArgKind: 0, amd.ArgValueType: 4, amd.ArgTypeName: ushort2, amd.ArgAccQual: 0 }
-// ASM:    - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 7, amd.ArgValueType: 9 }
-// ASM:    - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 8, amd.ArgValueType: 9 }
-// ASM:    - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 9, amd.ArgValueType: 9 }
-// ASM:    - { amd.ArgSize: 8, amd.ArgAlign: 8, amd.ArgKind: 11, amd.ArgValueType: 1, amd.ArgAddrQual: 1 } }
-// ASM: }
diff --git a/test/MC/AMDGPU/regression/bug28168.s b/test/MC/AMDGPU/regression/bug28168.s
index 86f818937efb9938e945d8de06e27b2b2608b548..e836a3f96a9077dafcfafc4d9d17192d9cff4b18 100644
--- a/test/MC/AMDGPU/regression/bug28168.s
+++ b/test/MC/AMDGPU/regression/bug28168.s
@@ -1,10 +1,10 @@
 // RUN: llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=SICI --check-prefix=CI
 // RUN: llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s --check-prefix=GCN --check-prefix=CIVI --check-prefix=VI
 
-v_mqsad_pk_u16_u8 v[0:1], s[0:1], s0, v[254:255]
-// CI: [0x00,0x00,0xe6,0xd2,0x00,0x00,0xf8,0x07]
-// VI: [0x00,0x00,0xe6,0xd1,0x00,0x00,0xf8,0x07]
+v_mqsad_pk_u16_u8 v[0:1], s[0:1], 1, v[254:255]
+// CI: [0x00,0x00,0xe6,0xd2,0x00,0x02,0xf9,0x07]
+// VI: [0x00,0x00,0xe6,0xd1,0x00,0x02,0xf9,0x07]
 
-v_qsad_pk_u16_u8 v[0:1], v[0:1], s0, s[0:1]
-// CI: [0x00,0x00,0xe4,0xd2,0x00,0x01,0x00,0x00]
-// VI: [0x00,0x00,0xe5,0xd1,0x00,0x01,0x00,0x00]
+v_qsad_pk_u16_u8 v[0:1], v[0:1], 1, s[0:1]
+// CI: [0x00,0x00,0xe4,0xd2,0x00,0x03,0x01,0x00]
+// VI: [0x00,0x00,0xe5,0xd1,0x00,0x03,0x01,0x00]
diff --git a/test/MC/AMDGPU/sop1.s b/test/MC/AMDGPU/sop1.s
index f611b022fb4b8f4ad35d8a393043be95c2b39d35..c1fe7d013e6cefca4d611a03ed050fc1aaacbc67 100644
--- a/test/MC/AMDGPU/sop1.s
+++ b/test/MC/AMDGPU/sop1.s
@@ -232,9 +232,17 @@ s_movreld_b64 s[2:3], s[4:5]
 // SICI: s_movreld_b64 s[2:3], s[4:5] ; encoding: [0x04,0x31,0x82,0xbe]
 // VI:   s_movreld_b64 s[2:3], s[4:5] ; encoding: [0x04,0x2d,0x82,0xbe]
 
-s_cbranch_join s[4:5]
-// SICI: s_cbranch_join s[4:5] ; encoding: [0x04,0x32,0x80,0xbe]
-// VI:   s_cbranch_join s[4:5] ; encoding: [0x04,0x2e,0x80,0xbe]
+s_cbranch_join s4
+// SICI: s_cbranch_join s4 ; encoding: [0x04,0x32,0x80,0xbe]
+// VI:   s_cbranch_join s4 ; encoding: [0x04,0x2e,0x80,0xbe]
+
+s_cbranch_join 1
+// NOSICI: error: invalid operand for instruction
+// NOVI: error: invalid operand for instruction
+
+s_cbranch_join 100
+// NOSICI: error: invalid operand for instruction
+// NOVI: error: invalid operand for instruction
 
 s_abs_i32 s1, s2
 // SICI: s_abs_i32 s1, s2 ; encoding: [0x02,0x34,0x81,0xbe]
diff --git a/test/MC/AMDGPU/sopp-gfx9.s b/test/MC/AMDGPU/sopp-gfx9.s
new file mode 100644
index 0000000000000000000000000000000000000000..237bceb287f246dd7cec6aa4ececcdf2c1ab4251
--- /dev/null
+++ b/test/MC/AMDGPU/sopp-gfx9.s
@@ -0,0 +1,71 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -show-encoding %s | FileCheck --check-prefix=GFX9 %s
+
+//===----------------------------------------------------------------------===//
+// s_waitcnt
+//===----------------------------------------------------------------------===//
+
+s_waitcnt 0
+// GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+
+s_waitcnt vmcnt(0) & expcnt(0) & lgkmcnt(0)
+// GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+
+s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+// GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+
+s_waitcnt vmcnt(0), expcnt(0), lgkmcnt(0)
+// GFX9: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; encoding: [0x00,0x00,0x8c,0xbf]
+
+s_waitcnt vmcnt(1)
+// GFX9: s_waitcnt vmcnt(1) ; encoding: [0x71,0x0f,0x8c,0xbf]
+
+s_waitcnt vmcnt(9)
+// GFX9: s_waitcnt vmcnt(9) ; encoding: [0x79,0x0f,0x8c,0xbf]
+
+s_waitcnt expcnt(2)
+// GFX9: s_waitcnt expcnt(2) ; encoding: [0x2f,0xcf,0x8c,0xbf]
+
+s_waitcnt lgkmcnt(3)
+// GFX9: s_waitcnt lgkmcnt(3) ; encoding: [0x7f,0xc3,0x8c,0xbf]
+
+s_waitcnt lgkmcnt(9)
+// GFX9: s_waitcnt lgkmcnt(9) ; encoding: [0x7f,0xc9,0x8c,0xbf]
+
+s_waitcnt vmcnt(0), expcnt(0)
+// GFX9: s_waitcnt vmcnt(0) expcnt(0) ; encoding: [0x00,0x0f,0x8c,0xbf]
+
+s_waitcnt vmcnt(15)
+// GFX9: s_waitcnt vmcnt(15) ; encoding: [0x7f,0x0f,0x8c,0xbf]
+
+s_waitcnt vmcnt(15) expcnt(6)
+// GFX9: s_waitcnt vmcnt(15) expcnt(6) ; encoding: [0x6f,0x0f,0x8c,0xbf]
+
+s_waitcnt vmcnt(15) lgkmcnt(14)
+// GFX9: s_waitcnt vmcnt(15) lgkmcnt(14) ; encoding: [0x7f,0x0e,0x8c,0xbf]
+
+s_waitcnt vmcnt(15) expcnt(6) lgkmcnt(14)
+// GFX9: s_waitcnt vmcnt(15) expcnt(6) lgkmcnt(14) ; encoding: [0x6f,0x0e,0x8c,0xbf]
+
+s_waitcnt vmcnt(31)
+// GFX9: s_waitcnt vmcnt(31) ; encoding: [0x7f,0x4f,0x8c,0xbf]
+
+s_waitcnt vmcnt(31) expcnt(6)
+// GFX9: s_waitcnt vmcnt(31) expcnt(6) ; encoding: [0x6f,0x4f,0x8c,0xbf]
+
+s_waitcnt vmcnt(31) lgkmcnt(14)
+// GFX9: s_waitcnt vmcnt(31) lgkmcnt(14) ; encoding: [0x7f,0x4e,0x8c,0xbf]
+
+s_waitcnt vmcnt(31) expcnt(6) lgkmcnt(14)
+// GFX9: s_waitcnt vmcnt(31) expcnt(6) lgkmcnt(14) ; encoding: [0x6f,0x4e,0x8c,0xbf]
+
+s_waitcnt vmcnt(62)
+// GFX9: s_waitcnt vmcnt(62) ; encoding: [0x7e,0xcf,0x8c,0xbf]
+
+s_waitcnt vmcnt(62) expcnt(6)
+// GFX9: s_waitcnt vmcnt(62) expcnt(6) ; encoding: [0x6e,0xcf,0x8c,0xbf]
+
+s_waitcnt vmcnt(62) lgkmcnt(14)
+// GFX9: s_waitcnt vmcnt(62) lgkmcnt(14) ; encoding: [0x7e,0xce,0x8c,0xbf]
+
+s_waitcnt vmcnt(62) expcnt(6) lgkmcnt(14)
+// GFX9: s_waitcnt vmcnt(62) expcnt(6) lgkmcnt(14) ; encoding: [0x6e,0xce,0x8c,0xbf]
diff --git a/test/MC/AMDGPU/sopp.s b/test/MC/AMDGPU/sopp.s
index b073c8dfc635e95dbfbc0817910bda23f225d578..140e26a9f600c5c86a86d19f5d89410a46b819b1 100644
--- a/test/MC/AMDGPU/sopp.s
+++ b/test/MC/AMDGPU/sopp.s
@@ -43,6 +43,18 @@ s_cbranch_execz 7
 s_cbranch_execnz 8
 // GCN: s_cbranch_execnz 8 ; encoding: [0x08,0x00,0x89,0xbf]
 
+s_cbranch_cdbgsys 9
+// GCN: s_cbranch_cdbgsys 9 ; encoding: [0x09,0x00,0x97,0xbf]
+
+s_cbranch_cdbgsys_and_user 10
+// GCN: s_cbranch_cdbgsys_and_user 10 ; encoding: [0x0a,0x00,0x9a,0xbf]
+
+s_cbranch_cdbgsys_or_user 11
+// GCN: s_cbranch_cdbgsys_or_user 11 ; encoding: [0x0b,0x00,0x99,0xbf]
+
+s_cbranch_cdbguser 12
+// GCN: s_cbranch_cdbguser 12 ; encoding: [0x0c,0x00,0x98,0xbf]
+
 s_barrier
 // GCN: s_barrier ; encoding: [0x00,0x00,0x8a,0xbf]
 
@@ -84,6 +96,9 @@ s_waitcnt vmcnt(0), expcnt(0)
 s_sethalt 9
 // GCN: s_sethalt 9 ; encoding: [0x09,0x00,0x8d,0xbf]
 
+s_setkill 7
+// GCN: s_setkill 7 ; encoding: [0x07,0x00,0x8b,0xbf]
+
 s_sleep 10
 // GCN: s_sleep 10 ; encoding: [0x0a,0x00,0x8e,0xbf]
 
@@ -188,3 +203,11 @@ s_set_gpr_idx_mode 0
 s_set_gpr_idx_mode 15
 // VI: s_set_gpr_idx_mode dst src0 src1 src2 ; encoding: [0x0f,0x00,0x9d,0xbf]
 // NOSICI: error: instruction not supported on this GPU
+
+s_endpgm_saved
+// VI: s_endpgm_saved ; encoding: [0x00,0x00,0x9b,0xbf]
+// NOSICI: error: instruction not supported on this GPU
+
+s_wakeup
+// VI: s_wakeup ; encoding: [0x00,0x00,0x83,0xbf]
+// NOSICI: error: instruction not supported on this GPU
diff --git a/test/MC/AMDGPU/vop-err.s b/test/MC/AMDGPU/vop-err.s
new file mode 100644
index 0000000000000000000000000000000000000000..13388263b20e98499738cdab4858e4fd3ef9529f
--- /dev/null
+++ b/test/MC/AMDGPU/vop-err.s
@@ -0,0 +1,290 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=bonaire -show-encoding %s 2>&1 | FileCheck %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga   -show-encoding %s 2>&1 | FileCheck %s
+
+// GENERIC LIMITATIONS ON VOP FORMATS: CONSTANT BUS RESTRICTIONS
+
+//=====================================================
+// v_movreld_b32: implicitly reads m0 (VOP1/VOP3)
+
+v_movreld_b32 v0, s1
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_movreld_b32 v0, flat_scratch_lo
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_movreld_b32 v0, flat_scratch_hi
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_movreld_b32 v0, vcc_lo
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_movreld_b32 v0, vcc_hi
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_movreld_b32 v0, exec_lo
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_movreld_b32 v0, exec_hi
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_movreld_b32 v0, ttmp0
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_movreld_b32 v0, ttmp1
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_movreld_b32 v0, 123
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_movreld_b32_e64 v0, s1
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_movreld_b32_e64 v0, flat_scratch_lo
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_movreld_b32_e64 v0, flat_scratch_hi
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+//=====================================================
+// v_div_fmas: implicitly reads VCC (VOP3)
+
+v_div_fmas_f32 v0, s1, s1, s1
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_div_fmas_f32 v0, v2, v3, -s1
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_div_fmas_f32 v0, v1, s2, |v3|
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_div_fmas_f32 v0, v1, -v2, -s3
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_div_fmas_f32 v0, v1, flat_scratch_lo, v3
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_div_fmas_f32 v0, v1, v2, flat_scratch_hi
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_div_fmas_f32 v0, v1, v2, m0
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_div_fmas_f32 v0, v1, ttmp2, v2
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_div_fmas_f64 v[0:1], s[2:3], v[4:5], v[6:7]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_div_fmas_f64 v[0:1], v[2:3], s[4:5], v[6:7]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_div_fmas_f64 v[0:1], v[2:3], v[4:5], s[6:7]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_div_fmas_f64 v[0:1], v[2:3], v[4:5], ttmp[2:3]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_div_fmas_f64 v[0:1], v[2:3], v[4:5], flat_scratch
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_div_fmas_f64 v[0:1], v[2:3], v[4:5], exec
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+//=====================================================
+// v_cndmask_b32: implicitly reads VCC (VOP2)
+
+v_cndmask_b32 v0, s1, v2, vcc
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cndmask_b32 v0, flat_scratch_lo, v2, vcc
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cndmask_b32 v0, flat_scratch_hi, v2, vcc
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cndmask_b32 v0, exec_lo, v2, vcc
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cndmask_b32 v0, exec_hi, v2, vcc
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+//=====================================================
+// v_cndmask_b32_e64: VOP3, no implicit reads
+
+v_cndmask_b32_e64 v0, s1, v2, vcc
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cndmask_b32_e64 v0, flat_scratch_lo, v2, vcc
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cndmask_b32_e64 v0, flat_scratch_hi, v2, vcc
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cndmask_b32_e64 v0, s1, v2, flat_scratch
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cndmask_b32_e64 v0, s0, v2, s[0:1]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cndmask_b32_e64 v0, v2, s0, s[0:1]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cndmask_b32_e64 v0, s0, s0, s[0:1]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cndmask_b32_e64 v0, s1, v2, s[0:1]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cndmask_b32_e64 v0, v2, s1, s[0:1]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cndmask_b32_e64 v0, s1, s1, s[0:1]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cndmask_b32_e64 v0, s1, v2, s[2:3]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cndmask_b32_e64 v0, v2, s1, s[2:3]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cndmask_b32_e64 v0, s1, s1, s[2:3]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+//=====================================================
+// v_addc_u32: implicitly reads VCC (VOP2 only!)
+
+v_addc_u32 v0, vcc, s0, v0, vcc
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_addc_u32 v0, vcc, flat_scratch_lo, v0, vcc
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_addc_u32 v0, vcc, flat_scratch_hi, v0, vcc
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_addc_u32 v0, vcc, exec_lo, v0, vcc
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_addc_u32 v0, vcc, exec_hi, v0, vcc
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+//=====================================================
+// v_addc_u32_e64: no implicit read in VOP3
+
+v_addc_u32_e64 v0, s[0:1], s2, v2, vcc
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_addc_u32_e64 v0, s[0:1], v2, s2, vcc
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_addc_u32_e64 v0, s[0:1], s2, s2, vcc
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_addc_u32_e64 v0, s[0:1], s0, v2, s[0:1]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_addc_u32_e64 v0, s[0:1], v2, s0, s[0:1]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_addc_u32_e64 v0, s[0:1], s0, s0, s[0:1]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_addc_u32_e64 v0, s[0:1], s2, v2, s[0:1]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_addc_u32_e64 v0, s[0:1], v2, s2, s[0:1]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_addc_u32_e64 v0, s[0:1], s2, s2, s[0:1]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+//=====================================================
+// VOP1 w/o implicit reads have no negative test cases on constant bus use
+// VOPC has no negative test cases on constant bus use
+
+//=====================================================
+// madak/madmk: a special case for VOP2 w/o implicit reads
+
+v_madak_f32 v0, s0, v0, 0x11213141
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_madak_f32 v0, flat_scratch_lo, v0, 0x11213141
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_madak_f32 v0, flat_scratch_hi, v0, 0x11213141
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_madak_f32 v0, exec_lo, v0, 0x11213141
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_madak_f32 v0, exec_hi, v0, 0x11213141
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_madak_f32 v0, vcc_lo, v0, 0x11213141
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_madak_f32 v0, vcc_hi, v0, 0x11213141
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+//=====================================================
+// VOP3 w/o implicit reads
+
+v_mad_f32 v0, s0, s1, s0
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_mad_f32 v0, s1, s0, s0
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_mad_f32 v0, s0, s0, s1
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_mad_f32 v0, s0, s0, flat_scratch_lo
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+//=====================================================
+// VOP2_e64:
+
+v_add_f32_e64 v0, s0, s1
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_add_f32_e64 v0, s0, flat_scratch_lo
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_add_f32_e64 v0, flat_scratch_hi, s1
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_add_f32_e64 v0, flat_scratch_hi, m0
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_add_f64 v[0:1], s[0:1], s[2:3]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_add_f64 v[0:1], s[0:1], flat_scratch
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_add_f64 v[0:1], vcc, s[2:3]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+//=====================================================
+// VOPC_e64:
+
+v_cmp_eq_f32_e64 s[0:1], s0, s1
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cmp_eq_f32_e64 s[0:1], s0, flat_scratch_lo
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cmp_eq_f32_e64 s[0:1], flat_scratch_hi, s1
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cmp_eq_f32_e64 s[0:1], s0, m0
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cmp_eq_f64_e64 s[0:1], s[0:1], s[2:3]
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cmp_eq_f64_e64 s[0:1], s[0:1], flat_scratch
+// CHECK: error: invalid operand (violates constant bus restrictions)
+
+v_cmp_eq_f64_e64 s[0:1], vcc, s[2:3]
+// CHECK: error: invalid operand (violates constant bus restrictions)
diff --git a/test/MC/AMDGPU/vop1-gfx9-err.s b/test/MC/AMDGPU/vop1-gfx9-err.s
new file mode 100644
index 0000000000000000000000000000000000000000..87251e6243ccef03e40f46f84792ebc1a8a5927a
--- /dev/null
+++ b/test/MC/AMDGPU/vop1-gfx9-err.s
@@ -0,0 +1,25 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx901 -show-encoding %s 2>&1 | FileCheck -check-prefix=GCN %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck -check-prefix=GCN %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=hawaii -show-encoding %s 2>&1 | FileCheck -check-prefix=GCN %s
+
+v_swap_b32 v1, 1
+// GCN: :16: error: invalid operand for instruction
+
+v_swap_b32 v1, s0
+// GCN: :16: error: invalid operand for instruction
+
+// FIXME: Better error for it requiring VOP1 encoding
+v_swap_b32_e64 v1, v2
+// GCN: :1: error: unrecognized instruction mnemonic
+
+v_swap_b32 v1, v2, v1
+// GCN: :20: error: invalid operand for instruction
+
+v_swap_b32 v1, v2, v2
+// GCN: :20: error: invalid operand for instruction
+
+v_swap_b32 v1, v2, v2, v2
+// GCN: :20: error: invalid operand for instruction
+
+v_swap_codegen_pseudo_b32 v1, v2
+// GCN: :1: error: unrecognized instruction mnemonic
diff --git a/test/MC/AMDGPU/vop1-gfx9.s b/test/MC/AMDGPU/vop1-gfx9.s
new file mode 100644
index 0000000000000000000000000000000000000000..8706190aa142059c7a790160df9156d927e2f509
--- /dev/null
+++ b/test/MC/AMDGPU/vop1-gfx9.s
@@ -0,0 +1,13 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx901 -show-encoding %s | FileCheck -check-prefix=GFX9 %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=tahiti -show-encoding %s 2>&1 | FileCheck -check-prefix=NOVI %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=hawaii -show-encoding %s 2>&1 | FileCheck -check-prefix=NOVI %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck -check-prefix=NOVI %s
+
+v_swap_b32 v1, v2
+// GFX9: v_swap_b32 v1, v2 ; encoding: [0x02,0xa3,0x02,0x7e]
+// NOVI: :1: error: instruction not supported on this GPU
+
+// FIXME: Error for it requiring VOP1 encoding
+v_swap_b32_e32 v1, v2
+// GFX9: v_swap_b32 v1, v2 ; encoding: [0x02,0xa3,0x02,0x7e]
+// NOVI: :1: error: instruction not supported on this GPU
diff --git a/test/MC/AMDGPU/vop1.s b/test/MC/AMDGPU/vop1.s
index e55c05e5c539f35b6e2ffeb449580846d9bdf25a..03abd6107f3d8e2c3b43de9c5042f121bd838372 100644
--- a/test/MC/AMDGPU/vop1.s
+++ b/test/MC/AMDGPU/vop1.s
@@ -56,7 +56,7 @@ v_cvt_u32_f32_e32 v1, v2
 v_cvt_i32_f32_e32 v1, v2
 
 // SICI: v_mov_fed_b32_e32 v1, v2 ; encoding: [0x02,0x13,0x02,0x7e]
-// NOVI: error: instruction not supported on this GPU
+// VI:   v_mov_fed_b32_e32 v1, v2 ; encoding: [0x02,0x13,0x02,0x7e]
 v_mov_fed_b32_e32 v1, v2
 
 // GCN: v_cvt_f16_f32_e32 v1, v2 ; encoding: [0x02,0x15,0x02,0x7e]
diff --git a/test/MC/AMDGPU/vop2.s b/test/MC/AMDGPU/vop2.s
index 43b5c5de3eec6f082b1420551b3295408d0f90b6..078b6863800846ed6fe5df64e20072f987a847ea 100644
--- a/test/MC/AMDGPU/vop2.s
+++ b/test/MC/AMDGPU/vop2.s
@@ -116,9 +116,17 @@ v_cndmask_b32_e32 v1, v2, v3, vcc
 // VI:   v_readlane_b32 s1, v2, s3 ; encoding: [0x01,0x00,0x89,0xd2,0x02,0x07,0x00,0x00]
 v_readlane_b32 s1, v2, s3
 
-// SICI: v_writelane_b32 v1, s2, s3 ; encoding: [0x02,0x06,0x02,0x04]
-// VI:   v_writelane_b32 v1, s2, s3 ; encoding: [0x01,0x00,0x8a,0xd2,0x02,0x06,0x00,0x00]
-v_writelane_b32 v1, s2, s3
+// SICI: v_writelane_b32 v1, s2, 4 ; encoding: [0x02,0x08,0x03,0x04]
+// VI:   v_writelane_b32 v1, s2, 4 ; encoding: [0x01,0x00,0x8a,0xd2,0x02,0x08,0x01,0x00]
+v_writelane_b32 v1, s2, 4
+
+// SICI: v_writelane_b32 v2, 1, s4 ; encoding: [0x81,0x08,0x04,0x04]
+// VI:   v_writelane_b32 v2, 1, s4 ; encoding: [0x02,0x00,0x8a,0xd2,0x81,0x08,0x00,0x00]
+v_writelane_b32 v2, 1, s4
+
+// SICI: v_writelane_b32 v255, 0xaf123456, 2 ; encoding: [0xff,0x04,0xff,0x05,0x56,0x34,0x12,0xaf]
+// NOVI: error: instruction not supported on this GPU
+v_writelane_b32 v255, 0xaf123456, 2
 
 // SICI: v_add_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x06]
 // VI:   v_add_f32_e32 v1, v2, v3 ; encoding: [0x02,0x07,0x02,0x02]
diff --git a/test/MC/AMDGPU/vop3-gfx9.s b/test/MC/AMDGPU/vop3-gfx9.s
new file mode 100644
index 0000000000000000000000000000000000000000..22a0cddceab4a2bb8ea7157c67441f97785868f0
--- /dev/null
+++ b/test/MC/AMDGPU/vop3-gfx9.s
@@ -0,0 +1,48 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx901 -show-encoding %s | FileCheck -check-prefix=GFX9 %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=tahiti -show-encoding %s 2>&1 | FileCheck -check-prefix=NOVI %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=hawaii -show-encoding %s 2>&1 | FileCheck -check-prefix=NOVI %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck -check-prefix=NOVI %s
+
+v_lshl_add_u32 v1, v2, v3, v4
+// GFX9: v_lshl_add_u32 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfd,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_add_lshl_u32 v1, v2, v3, v4
+// GFX9: v_add_lshl_u32 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfe,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_add3_u32 v1, v2, v3, v4
+// GFX9: v_add3_u32 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xff,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_lshl_or_b32 v1, v2, v3, v4
+// GFX9: v_lshl_or_b32 v1, v2, v3, v4 ; encoding: [0x01,0x00,0x00,0xd2,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_and_or_b32 v1, v2, v3, v4
+// GFX9: v_and_or_b32 v1, v2, v3, v4 ; encoding: [0x01,0x00,0x01,0xd2,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_or3_b32 v1, v2, v3, v4
+// GFX9: v_or3_b32 v1, v2, v3, v4 ; encoding: [0x01,0x00,0x02,0xd2,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_pack_b32_f16 v1, v2, v3
+// GFX9: v_pack_b32_f16 v1, v2, v3 ; encoding: [0x01,0x00,0xa0,0xd2,0x02,0x07,0x02,0x00]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_xad_u32 v1, v2, v3, v4
+// GFX9: v_xad_u32 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xf3,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_med3_f16 v1, v2, v3, v4
+// GFX9: v_med3_f16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfa,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_med3_i16 v1, v2, v3, v4
+// GFX9: v_med3_i16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfb,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
+
+v_med3_u16 v1, v2, v3, v4
+// GFX9: v_med3_u16 v1, v2, v3, v4 ; encoding: [0x01,0x00,0xfc,0xd1,0x02,0x07,0x12,0x04]
+// NOVI: :1: error: instruction not supported on this GPU
diff --git a/test/MC/AMDGPU/vop3-modifiers-err.s b/test/MC/AMDGPU/vop3-modifiers-err.s
new file mode 100644
index 0000000000000000000000000000000000000000..bd08ee2d10aa635781263633364f42246fdacde4
--- /dev/null
+++ b/test/MC/AMDGPU/vop3-modifiers-err.s
@@ -0,0 +1,15 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s 2>&1 | FileCheck %s
+
+//---------------------------------------------------------------------------//
+// VOP3 Modifiers
+//---------------------------------------------------------------------------//
+
+// 'neg(1)' cannot be encoded as 32-bit literal while preserving e64 semantics
+v_ceil_f64_e32 v[0:1], neg(1)
+// CHECK: error: invalid operand for instruction
+
+v_ceil_f32 v0, --1
+// CHECK: error: invalid syntax, expected 'neg' modifier
+
+v_ceil_f16 v0, abs(neg(1))
+// CHECK: error: not a valid operand
\ No newline at end of file
diff --git a/test/MC/AMDGPU/vop3-modifiers.s b/test/MC/AMDGPU/vop3-modifiers.s
new file mode 100644
index 0000000000000000000000000000000000000000..f18a38caac38a660ac2df854361175d6e103f10e
--- /dev/null
+++ b/test/MC/AMDGPU/vop3-modifiers.s
@@ -0,0 +1,388 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=tonga -show-encoding %s | FileCheck %s
+
+//---------------------------------------------------------------------------//
+// VOP1/VOP3 F16
+//---------------------------------------------------------------------------//
+
+v_ceil_f16 v0, -1
+// CHECK: [0xc1,0x8a,0x00,0x7e]
+
+v_ceil_f16 v0, -2
+// CHECK: [0xc2,0x8a,0x00,0x7e]
+
+v_ceil_f16 v0, -16
+// CHECK: [0xd0,0x8a,0x00,0x7e]
+
+v_ceil_f16 v0, -0.5
+// CHECK: [0xf1,0x8a,0x00,0x7e]
+
+v_ceil_f16 v0, -1.0
+// CHECK: [0xf3,0x8a,0x00,0x7e]
+
+v_ceil_f16 v0, -2.0
+// CHECK: [0xf5,0x8a,0x00,0x7e]
+
+v_ceil_f16 v0, -4.0
+// CHECK: [0xf7,0x8a,0x00,0x7e]
+
+// Arbitrary f16 literal in hex
+v_ceil_f16 v0, 0xabcd
+// CHECK: [0xff,0x8a,0x00,0x7e,0xcd,0xab,0x00,0x00]
+
+// '-' is a part of hex literal (not a 'neg' modifier)
+v_ceil_f16 v0, -0x5433
+// CHECK: [0xff,0x8a,0x00,0x7e,0xcd,0xab,0x00,0x00]
+
+v_ceil_f16 v0, abs(0xabcd)
+// CHECK: [0xff,0x8a,0x00,0x7e,0xcd,0x2b,0x00,0x00]
+
+v_ceil_f16 v0, neg(0xabcd)
+// CHECK: [0xff,0x8a,0x00,0x7e,0xcd,0x2b,0x00,0x00]
+
+v_ceil_f16 v0, neg(abs(0xabcd))
+// CHECK: [0xff,0x8a,0x00,0x7e,0xcd,0xab,0x00,0x00]
+
+v_ceil_f16 v0, -abs(0xabcd)
+// CHECK: [0xff,0x8a,0x00,0x7e,0xcd,0xab,0x00,0x00]
+
+// 1/(2*pi) encoded as inline constant in VOP1
+v_ceil_f16 v0, 0x3118
+// CHECK: [0xf8,0x8a,0x00,0x7e]
+
+// 1/(2*pi) encoded as inline constant in VOP3
+v_ceil_f16_e64 v0, 0x3118
+// CHECK: [0x00,0x00,0x85,0xd1,0xf8,0x00,0x00,0x00]
+
+// neg(-1/(2*pi)) = 1/(2*pi)
+v_ceil_f16 v0, neg(0xb118)
+// CHECK: [0xf8,0x8a,0x00,0x7e]
+
+// -1/(2*pi) cannot be encoded as inline constant in VOP1
+v_ceil_f16 v0, 0xb118
+// CHECK: [0xff,0x8a,0x00,0x7e,0x18,0xb1,0x00,0x00]
+
+// -1/(2*pi) cannot be encoded as inline constant in VOP1
+v_ceil_f16 v0, neg(0x3118)
+// CHECK: [0xff,0x8a,0x00,0x7e,0x18,0xb1,0x00,0x00]
+
+// -1/(2*pi) can be encoded as inline constant w/ modifiers in VOP3
+v_ceil_f16_e64 v0, neg(0x3118)
+// CHECK: [0x00,0x00,0x85,0xd1,0xf8,0x00,0x00,0x20]
+
+v_ceil_f16_e64 v0, abs(0x3118)
+// CHECK: [0x00,0x01,0x85,0xd1,0xf8,0x00,0x00,0x00]
+
+v_ceil_f16_e64 v0, neg(abs(0x3118))
+// CHECK: [0x00,0x01,0x85,0xd1,0xf8,0x00,0x00,0x20]
+
+v_ceil_f16_e64 v0, neg(|v1|)
+// CHECK: [0x00,0x01,0x85,0xd1,0x01,0x01,0x00,0x20]
+
+v_ceil_f16_e64 v0, -|v1|
+// CHECK: [0x00,0x01,0x85,0xd1,0x01,0x01,0x00,0x20]
+
+//---------------------------------------------------------------------------//
+// VOP1/VOP3 F64
+//---------------------------------------------------------------------------//
+
+// Encoded as inline constant 1 with 'neg' modifier
+v_ceil_f64 v[0:1], neg(1)
+// CHECK: [0x00,0x00,0x58,0xd1,0x81,0x00,0x00,0x20]
+
+// Encoded as inline constant -1 with 'neg' modifier
+v_ceil_f64 v[0:1], neg(-1)
+// CHECK: [0x00,0x00,0x58,0xd1,0xc1,0x00,0x00,0x20]
+
+v_ceil_f64_e32 v[0:1], 1.0
+// CHECK: [0xf2,0x30,0x00,0x7e]
+
+// abs(1.0) = 1.0
+v_ceil_f64_e32 v[0:1], abs(1.0)
+// CHECK: [0xf2,0x30,0x00,0x7e]
+
+// neg(1.0) = -1.0
+v_ceil_f64_e32 v[0:1], neg(1.0)
+// CHECK: [0xf3,0x30,0x00,0x7e]
+
+// 1/(2*pi) encoded as inline constant in VOP1
+v_ceil_f64 v[0:1], 0x3fc45f306dc9c882
+// CHECK: [0xf8,0x30,0x00,0x7e]
+
+// 1/(2*pi) encoded as inline constant in VOP3
+v_ceil_f64_e64 v[0:1], 0x3fc45f306dc9c882
+// CHECK: [0x00,0x00,0x58,0xd1,0xf8,0x00,0x00,0x00]
+
+// -1/(2*pi) cannot be encoded as inline constant in VOP1.
+// It cannot be encoded as literal either due to int literal rules.
+// So it is encoded as VOP3
+v_ceil_f64 v[0:1], abs(0x3fc45f306dc9c882)
+// CHECK: [0x00,0x01,0x58,0xd1,0xf8,0x00,0x00,0x00]
+
+v_ceil_f64 v[0:1], neg(abs(0x3fc45f306dc9c882))
+// CHECK: [0x00,0x01,0x58,0xd1,0xf8,0x00,0x00,0x20]
+
+
+//---------------------------------------------------------------------------//
+// VOP2/VOP3 F32
+//---------------------------------------------------------------------------//
+
+v_add_f32 v5, -1, v2
+// CHECK: [0xc1,0x04,0x0a,0x02]
+
+v_add_f32 v5, -16, v2
+// CHECK: [0xd0,0x04,0x0a,0x02]
+
+v_add_f32 v5, 0x3e22f983, v2
+// CHECK: [0xf8,0x04,0x0a,0x02]
+
+// abs(1/(2*pi)) = 1/(2*pi)
+v_add_f32 v5, abs(0x3e22f983), v2
+// CHECK: [0xf8,0x04,0x0a,0x02]
+
+// neg(-1/(2*pi)) = 1/(2*pi)
+v_add_f32 v5, neg(0xbe22f983), v2
+// CHECK: [0xf8,0x04,0x0a,0x02]
+
+// -1/(2*pi) cannot be encoded as inline constant in VOP2
+v_add_f32 v5, neg(0x3e22f983), v2
+// CHECK: [0xff,0x04,0x0a,0x02,0x83,0xf9,0x22,0xbe]
+
+
+v_add_f32_e64 v0, -2, s0
+// CHECK: [0x00,0x00,0x01,0xd1,0xc2,0x00,0x00,0x00]
+
+v_add_f32_e64 v0, -16, s0
+// CHECK: [0x00,0x00,0x01,0xd1,0xd0,0x00,0x00,0x00]
+
+v_add_f32_e64 v0, -0.5, s0
+// CHECK: [0x00,0x00,0x01,0xd1,0xf1,0x00,0x00,0x00]
+
+v_add_f32_e64 v0, -1.0, s0
+// CHECK: [0x00,0x00,0x01,0xd1,0xf3,0x00,0x00,0x00]
+
+v_add_f32_e64 v0, -2.0, s0
+// CHECK: [0x00,0x00,0x01,0xd1,0xf5,0x00,0x00,0x00]
+
+v_add_f32_e64 v0, -4.0, s0
+// CHECK: [0x00,0x00,0x01,0xd1,0xf7,0x00,0x00,0x00]
+
+v_add_f32_e64 v0, 0x3e22f983, s0
+// CHECK: [0x00,0x00,0x01,0xd1,0xf8,0x00,0x00,0x00]
+
+v_add_f32_e64 v0, neg(0x3e22f983), s0
+// CHECK: [0x00,0x00,0x01,0xd1,0xf8,0x00,0x00,0x20]
+
+//---------------------------------------------------------------------------//
+// VOPC/VOP3
+//---------------------------------------------------------------------------//
+
+v_cmp_eq_f16 vcc, -1, v0
+// CHECK: [0xc1,0x00,0x44,0x7c]
+
+v_cmp_eq_f16_e64 s[0:1], s0, -1
+// CHECK: [0x00,0x00,0x22,0xd0,0x00,0x82,0x01,0x00]
+
+v_cmp_eq_f16_e64 s[0:1], s0, 0x3118
+// CHECK: [0x00,0x00,0x22,0xd0,0x00,0xf0,0x01,0x00]
+
+v_cmp_eq_f16_e64 s[0:1], s0, neg(0x3118)
+// CHECK: [0x00,0x00,0x22,0xd0,0x00,0xf0,0x01,0x40]
+
+v_cmp_eq_f32 vcc, -4.0, v0
+// CHECK: [0xf7,0x00,0x84,0x7c]
+
+// 1/(2*pi) can be encoded as inline constant
+v_cmp_eq_f32 vcc, 0x3e22f983, v0
+// CHECK: [0xf8,0x00,0x84,0x7c]
+
+// -1/(2*pi) cannot be encoded as inline constant in VOPC
+v_cmp_eq_f32 vcc, neg(0x3e22f983), v0
+// CHECK: [0xff,0x00,0x84,0x7c,0x83,0xf9,0x22,0xbe]
+
+// abs(1/(2*pi)) = 1/(2*pi)
+v_cmp_eq_f32 vcc, abs(0x3e22f983), v0
+// CHECK: [0xf8,0x00,0x84,0x7c]
+
+// -1/(2*pi) can be encoded as inline constant w/ modifiers in VOP3
+v_cmp_eq_f32_e64 vcc, neg(0x3e22f983), v0
+// CHECK: [0x6a,0x00,0x42,0xd0,0xf8,0x00,0x02,0x20]
+
+v_cmp_eq_f32_e64 vcc, v0, abs(0x3e22f983)
+// CHECK: [0x6a,0x02,0x42,0xd0,0x00,0xf1,0x01,0x00]
+
+v_cmp_eq_f32_e64 vcc, v0, -abs(0x3e22f983)
+// CHECK: [0x6a,0x02,0x42,0xd0,0x00,0xf1,0x01,0x40]
+
+//---------------------------------------------------------------------------//
+// VOP3
+//---------------------------------------------------------------------------//
+
+v_add_f64 v[0:1], s[0:1], -1
+// CHECK: [0x00,0x00,0x80,0xd2,0x00,0x82,0x01,0x00]
+
+v_add_f64 v[0:1], s[0:1], -16
+// CHECK: [0x00,0x00,0x80,0xd2,0x00,0xa0,0x01,0x00]
+
+v_add_f64 v[0:1], s[0:1], -0.5
+// CHECK: [0x00,0x00,0x80,0xd2,0x00,0xe2,0x01,0x00]
+
+v_add_f64 v[0:1], s[0:1], -1.0
+// CHECK: [0x00,0x00,0x80,0xd2,0x00,0xe6,0x01,0x00]
+
+v_add_f64 v[0:1], s[0:1], -2.0
+// CHECK: [0x00,0x00,0x80,0xd2,0x00,0xea,0x01,0x00]
+
+v_add_f64 v[0:1], s[0:1], -4.0
+// CHECK: [0x00,0x00,0x80,0xd2,0x00,0xee,0x01,0x00]
+
+v_add_f64 v[4:5], s[0:1], 0x3fc45f306dc9c882
+// CHECK: [0x04,0x00,0x80,0xd2,0x00,0xf0,0x01,0x00]
+
+v_add_f64 v[4:5], s[0:1], neg(0x3fc45f306dc9c882)
+// CHECK: [0x04,0x00,0x80,0xd2,0x00,0xf0,0x01,0x40]
+
+
+v_cubeid_f32 v0, s0, s0, -1
+// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x00,0x04,0x03]
+
+v_cubeid_f32 v0, s0, s0, -4.0
+// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x00,0xdc,0x03]
+
+v_cubeid_f32 v0, s0, s0, 0x3e22f983
+// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x00,0xe0,0x03]
+
+v_cubeid_f32 v0, s0, s0, neg(0x3e22f983)
+// CHECK: [0x00,0x00,0xc4,0xd1,0x00,0x00,0xe0,0x83]
+
+v_cubeid_f32 v0, s0, s0, abs(0x3e22f983)
+// CHECK: [0x00,0x04,0xc4,0xd1,0x00,0x00,0xe0,0x03]
+
+
+//---------------------------------------------------------------------------//
+// VOP3 Instructions without Input Modifiers but with Output Modifiers
+//---------------------------------------------------------------------------//
+
+v_cvt_f64_i32_e64 v[5:6], s1 clamp
+// CHECK: [0x05,0x80,0x44,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_f64_i32_e64 v[5:6], s1 mul:2
+// CHECK: [0x05,0x00,0x44,0xd1,0x01,0x00,0x00,0x08]
+
+v_cvt_f64_i32_e64 v[5:6], s1 mul:4
+// CHECK: [0x05,0x00,0x44,0xd1,0x01,0x00,0x00,0x10]
+
+v_cvt_f64_i32_e64 v[5:6], s1 div:2
+// CHECK: [0x05,0x00,0x44,0xd1,0x01,0x00,0x00,0x18]
+
+
+v_cvt_f64_u32_e64 v[5:6], s1 clamp
+// CHECK: [0x05,0x80,0x56,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_f64_u32_e64 v[5:6], s1 mul:2
+// CHECK: [0x05,0x00,0x56,0xd1,0x01,0x00,0x00,0x08]
+
+v_cvt_f64_u32_e64 v[5:6], s1 mul:4
+// CHECK: [0x05,0x00,0x56,0xd1,0x01,0x00,0x00,0x10]
+
+v_cvt_f64_u32_e64 v[5:6], s1 div:2
+// CHECK: [0x05,0x00,0x56,0xd1,0x01,0x00,0x00,0x18]
+
+
+v_cvt_f32_i32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x45,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_f32_i32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x45,0xd1,0x01,0x00,0x00,0x08]
+
+v_cvt_f32_i32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x45,0xd1,0x01,0x00,0x00,0x10]
+
+v_cvt_f32_i32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x45,0xd1,0x01,0x00,0x00,0x18]
+
+
+v_cvt_f32_u32_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x46,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_f32_u32_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x46,0xd1,0x01,0x00,0x00,0x08]
+
+v_cvt_f32_u32_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x46,0xd1,0x01,0x00,0x00,0x10]
+
+v_cvt_f32_u32_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x46,0xd1,0x01,0x00,0x00,0x18]
+
+
+v_cvt_off_f32_i4_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x4e,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_off_f32_i4_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x4e,0xd1,0x01,0x00,0x00,0x08]
+
+v_cvt_off_f32_i4_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x4e,0xd1,0x01,0x00,0x00,0x10]
+
+v_cvt_off_f32_i4_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x4e,0xd1,0x01,0x00,0x00,0x18]
+
+
+v_cvt_f32_ubyte0_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x51,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_f32_ubyte0_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x51,0xd1,0x01,0x00,0x00,0x08]
+
+v_cvt_f32_ubyte0_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x51,0xd1,0x01,0x00,0x00,0x10]
+
+v_cvt_f32_ubyte0_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x51,0xd1,0x01,0x00,0x00,0x18]
+
+
+v_cvt_f32_ubyte1_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x52,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_f32_ubyte1_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x52,0xd1,0x01,0x00,0x00,0x08]
+
+v_cvt_f32_ubyte1_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x52,0xd1,0x01,0x00,0x00,0x10]
+
+v_cvt_f32_ubyte1_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x52,0xd1,0x01,0x00,0x00,0x18]
+
+
+v_cvt_f32_ubyte2_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x53,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_f32_ubyte2_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x53,0xd1,0x01,0x00,0x00,0x08]
+
+v_cvt_f32_ubyte2_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x53,0xd1,0x01,0x00,0x00,0x10]
+
+v_cvt_f32_ubyte2_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x53,0xd1,0x01,0x00,0x00,0x18]
+
+
+v_cvt_f32_ubyte3_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x54,0xd1,0x01,0x00,0x00,0x00]
+
+v_cvt_f32_ubyte3_e64 v5, s1 mul:2
+// CHECK: [0x05,0x00,0x54,0xd1,0x01,0x00,0x00,0x08]
+
+v_cvt_f32_ubyte3_e64 v5, s1 mul:4
+// CHECK: [0x05,0x00,0x54,0xd1,0x01,0x00,0x00,0x10]
+
+v_cvt_f32_ubyte3_e64 v5, s1 div:2
+// CHECK: [0x05,0x00,0x54,0xd1,0x01,0x00,0x00,0x18]
+
+
+// NB: output modifiers are not supported for f16
+v_cvt_f16_i16_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x7a,0xd1,0x01,0x00,0x00,0x00]
+
+// NB: output modifiers are not supported for f16
+v_cvt_f16_u16_e64 v5, s1 clamp
+// CHECK: [0x05,0x80,0x79,0xd1,0x01,0x00,0x00,0x00]
diff --git a/test/MC/AMDGPU/vop3.s b/test/MC/AMDGPU/vop3.s
index e1dceefc9af7bbe8b34f08c1f85a65f46e7b83c7..98cc9cc35bc7efe28f4455143bd5db0793772a5a 100644
--- a/test/MC/AMDGPU/vop3.s
+++ b/test/MC/AMDGPU/vop3.s
@@ -265,11 +265,11 @@ v_mac_f16_e64 v0, 0.5, flat_scratch_lo
 
 v_mac_f16_e64 v0, -4.0, flat_scratch_lo
 // NOSICI: error:
-// VI: v_mac_f16_e64 v0, -4.0, flat_scratch_lo ; encoding: [0x00,0x00,0x23,0xd1,0xf6,0xcc,0x00,0x20]
+// VI: v_mac_f16_e64 v0, -4.0, flat_scratch_lo ; encoding: [0x00,0x00,0x23,0xd1,0xf7,0xcc,0x00,0x00]
 
 v_mac_f16_e64 v0, flat_scratch_lo, -4.0
 // NOSICI: error:
-// VI: v_mac_f16_e64 v0, flat_scratch_lo, -4.0 ; encoding: [0x00,0x00,0x23,0xd1,0x66,0xec,0x01,0x40]
+// VI: v_mac_f16_e64 v0, flat_scratch_lo, -4.0 ; encoding: [0x00,0x00,0x23,0xd1,0x66,0xee,0x01,0x00]
 
 ///===---------------------------------------------------------------------===//
 // VOP3 Instructions
@@ -386,6 +386,52 @@ v_mad_f32 v9, 0.5, v5, -v8
 // VI:   v_mad_f32 v9, 0.5, v5, -v8      ; encoding: [0x09,0x00,0xc1,0xd1,0xf0,0x0a,0x22,0x84]
 
 v_mqsad_u32_u8 v[0:3], s[2:3], v4, v[0:3]
-// CI: v_mqsad_u32_u8 v[0:3], s[2:3], v4, v[0:3] ; encoding: [0x00,0x00,0xe8,0xd2,0x02,0x08,0x02,0x04]
+// CI: v_mqsad_u32_u8 v[0:3], s[2:3], v4, v[0:3] ; encoding: [0x00,0x00,0xea,0xd2,0x02,0x08,0x02,0x04]
 // VI: v_mqsad_u32_u8 v[0:3], s[2:3], v4, v[0:3] ; encoding: [0x00,0x00,0xe7,0xd1,0x02,0x08,0x02,0x04]
 // NOSI: error: instruction not supported on this GPU
+
+v_mad_u64_u32 v[5:6], s[12:13], s1, 0, 0
+// CI: v_mad_u64_u32 v[5:6], s[12:13], s1, 0, 0 ; encoding: [0x05,0x0c,0xec,0xd2,0x01,0x00,0x01,0x02]
+// VI: v_mad_u64_u32 v[5:6], s[12:13], s1, 0, 0 ; encoding: [0x05,0x0c,0xe8,0xd1,0x01,0x00,0x01,0x02]
+// NOSI: error: instruction not supported on this GPU
+
+v_mad_i64_i32 v[5:6], s[12:13], s1, 0, v[254:255]
+// CI: v_mad_i64_i32 v[5:6], s[12:13], s1, 0, v[254:255] ; encoding: [0x05,0x0c,0xee,0xd2,0x01,0x00,0xf9,0x07]
+// VI: v_mad_i64_i32 v[5:6], s[12:13], s1, 0, v[254:255] ; encoding: [0x05,0x0c,0xe9,0xd1,0x01,0x00,0xf9,0x07]
+// NOSI: error: instruction not supported on this GPU
+
+v_cmp_class_f16_e64 s[10:11], v1, s2
+// NOSICI: error: instruction not supported on this GPU
+// VI: v_cmp_class_f16_e64 s[10:11], v1, s2 ; encoding: [0x0a,0x00,0x14,0xd0,0x01,0x05,0x00,0x00]
+
+v_cmp_class_f32_e64 s[10:11], -v1, s2
+// SICI: v_cmp_class_f32_e64 s[10:11], -v1, s2 ; encoding: [0x0a,0x00,0x10,0xd1,0x01,0x05,0x00,0x20]
+// VI:   v_cmp_class_f32_e64 s[10:11], -v1, s2 ; encoding: [0x0a,0x00,0x10,0xd0,0x01,0x05,0x00,0x20]
+
+v_cmp_class_f64_e64 s[10:11], -v[254:255], s2
+// SICI: v_cmp_class_f64_e64 s[10:11], -v[254:255], s2 ; encoding: [0x0a,0x00,0x50,0xd1,0xfe,0x05,0x00,0x20]
+// VI:   v_cmp_class_f64_e64 s[10:11], -v[254:255], s2 ; encoding: [0x0a,0x00,0x12,0xd0,0xfe,0x05,0x00,0x20]
+
+v_cmpx_class_f16_e64 s[10:11], v255, s2
+// NOSICI: error: instruction not supported on this GPU
+// VI: v_cmpx_class_f16_e64 s[10:11], v255, s2 ; encoding: [0x0a,0x00,0x15,0xd0,0xff,0x05,0x00,0x00]
+
+v_cmpx_class_f32_e64 s[10:11], 0, s101
+// SICI: v_cmpx_class_f32_e64 s[10:11], 0, s101 ; encoding: [0x0a,0x00,0x30,0xd1,0x80,0xca,0x00,0x00]
+// VI:   v_cmpx_class_f32_e64 s[10:11], 0, s101 ; encoding: [0x0a,0x00,0x11,0xd0,0x80,0xca,0x00,0x00]
+
+v_cmpx_class_f64_e64 s[10:11], -v[1:2], s2
+// SICI: v_cmpx_class_f64_e64 s[10:11], -v[1:2], s2 ; encoding: [0x0a,0x00,0x70,0xd1,0x01,0x05,0x00,0x20]
+// VI:   v_cmpx_class_f64_e64 s[10:11], -v[1:2], s2 ; encoding: [0x0a,0x00,0x13,0xd0,0x01,0x05,0x00,0x20]
+
+//
+// Modifier tests:
+//
+
+v_mul_f64 v[0:1], |0|, |0|
+// SICI: v_mul_f64 v[0:1], |0|, |0|      ; encoding: [0x00,0x03,0xca,0xd2,0x80,0x00,0x01,0x00]
+// VI:   v_mul_f64 v[0:1], |0|, |0|      ; encoding: [0x00,0x03,0x81,0xd2,0x80,0x00,0x01,0x00]
+
+v_cubeid_f32 v0, |-1|, |-1.0|, |1.0|
+// SICI: v_cubeid_f32 v0, |-1|, |-1.0|, |1.0| ; encoding: [0x00,0x07,0x88,0xd2,0xc1,0xe6,0xc9,0x03]
+// VI:   v_cubeid_f32 v0, |-1|, |-1.0|, |1.0| ; encoding: [0x00,0x07,0xc4,0xd1,0xc1,0xe6,0xc9,0x03]
diff --git a/test/MC/AMDGPU/vop3p-err.s b/test/MC/AMDGPU/vop3p-err.s
new file mode 100644
index 0000000000000000000000000000000000000000..f4b1a3da714f898e99bd83c0edcf958856282051
--- /dev/null
+++ b/test/MC/AMDGPU/vop3p-err.s
@@ -0,0 +1,120 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx901 -show-encoding %s 2>&1 | FileCheck -check-prefix=GFX9 %s
+
+// GFX9: 31: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel
+
+// GFX9: 32: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel:
+
+// GFX9: 33: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel:[
+
+// GFX9: 33: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel:[]
+
+// GFX9: 34: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel:[,]
+
+// XXGFX9: 34: error: failed parsing operand.
+// v_pk_add_u16 v1, v2, v3 op_sel:[0]
+
+// GFX9: 35: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel:[0,]
+
+// XXGFX9: 36: error: failed parsing operand.
+// v_pk_add_u16 v1, v2, v3 op_sel:[,0]
+
+// GFX9: 36: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel:[0,2]
+
+// GFX9: 35: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel:[2,0]
+
+// GFX9: 33: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel:[-1,0]
+
+// GFX9: 35: error: failed parsing operand.
+v_pk_add_u16 v1, v2, v3 op_sel:[0,-1]
+
+// GFX9: 40: error: not a valid operand.
+v_pk_add_u16 v1, v2, v3 op_sel:[0,0,0,0]
+
+// XXGFX9: invalid operand for instruction
+v_pk_add_u16 v1, v2, v3 neg_lo:[0,0]
+
+//
+// Regular modifiers on packed instructions
+//
+
+// FIXME: should be invalid operand for instruction
+// GFX9: :18: error: not a valid operand.
+v_pk_add_f16 v1, |v2|, v3
+
+// GFX9: :21: error: not a valid operand.
+v_pk_add_f16 v1, abs(v2), v3
+
+// GFX9: :22: error: not a valid operand.
+v_pk_add_f16 v1, v2, |v3|
+
+// GFX9: :25: error: not a valid operand.
+v_pk_add_f16 v1, v2, abs(v3)
+
+// GFX9: :19: error: invalid operand for instruction
+v_pk_add_f16 v1, -v2, v3
+
+// GFX9: :23: error: invalid operand for instruction
+v_pk_add_f16 v1, v2, -v3
+
+// GFX9: :21: error: not a valid operand.
+v_pk_add_u16 v1, abs(v2), v3
+
+// GFX9: :19: error: invalid operand for instruction
+v_pk_add_u16 v1, -v2, v3
+
+
+//
+// Packed operands on the non-packed VOP3P instructions
+//
+
+// GFX9: invalid operand for instruction
+v_mad_mix_f32 v1, v2, v3, v4 op_sel:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mix_f32 v1, v2, v3, v4 op_sel_hi:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mix_f32 v1, v2, v3, v4 neg_lo:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mix_f32 v1, v2, v3, v4 neg_hi:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mixlo_f16 v1, v2, v3, v4 op_sel:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mixlo_f16 v1, v2, v3, v4 op_sel_hi:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mixlo_f16 v1, v2, v3, v4 neg_lo:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mixlo_f16 v1, v2, v3, v4 neg_hi:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mixhi_f16 v1, v2, v3, v4 op_sel:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mixhi_f16 v1, v2, v3, v4 op_sel_hi:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mixhi_f16 v1, v2, v3, v4 neg_lo:[0,0,0]
+
+// GFX9: invalid operand for instruction
+v_mad_mixhi_f16 v1, v2, v3, v4 neg_hi:[0,0,0]
+
+//
+// Constant bus restrictions
+//
+
+// GFX9: invalid operand (violates constant bus restrictions)
+v_pk_add_f16 v255, s1, s2
diff --git a/test/MC/AMDGPU/vop3p.s b/test/MC/AMDGPU/vop3p.s
new file mode 100644
index 0000000000000000000000000000000000000000..c9eda69e13d2ad7d678a943d763cde6c4eed3b82
--- /dev/null
+++ b/test/MC/AMDGPU/vop3p.s
@@ -0,0 +1,216 @@
+// RUN: llvm-mc -arch=amdgcn -mcpu=gfx901 -show-encoding %s | FileCheck -check-prefix=GFX9 %s
+
+//
+// Test op_sel/op_sel_hi
+//
+
+v_pk_add_u16 v1, v2, v3
+// GFX9: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x00,0x8a,0xd3,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,0]
+// GFX9: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x00,0x8a,0xd3,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,1]
+// GFX9: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x00,0x8a,0xd3,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,0] op_sel_hi:[1,1]
+// GFX9: v_pk_add_u16 v1, v2, v3 ; encoding: [0x01,0x00,0x8a,0xd3,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0] ; encoding: [0x01,0x00,0x8a,0xd3,0x02,0x07,0x02,0x00]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,0] op_sel_hi:[0,0]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,0] ; encoding: [0x01,0x00,0x8a,0xd3,0x02,0x07,0x02,0x00]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,0]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] ; encoding: [0x01,0x08,0x8a,0xd3,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,1]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] ; encoding: [0x01,0x10,0x8a,0xd3,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,1]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel:[1,1] ; encoding: [0x01,0x18,0x8a,0xd3,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,1]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel_hi:[0,1] ; encoding: [0x01,0x00,0x8a,0xd3,0x02,0x07,0x02,0x10]
+
+v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,0]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel_hi:[1,0] ; encoding: [0x01,0x00,0x8a,0xd3,0x02,0x07,0x02,0x08]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,1] op_sel_hi:[1,1]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel:[1,1] ; encoding: [0x01,0x18,0x8a,0xd3,0x02,0x07,0x02,0x18]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[1,0] ; encoding: [0x01,0x08,0x8a,0xd3,0x02,0x07,0x02,0x08]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[0,1] ; encoding: [0x01,0x10,0x8a,0xd3,0x02,0x07,0x02,0x10]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel:[1,0] op_sel_hi:[0,1] ; encoding: [0x01,0x08,0x8a,0xd3,0x02,0x07,0x02,0x10]
+
+v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0]
+// GFX9: v_pk_add_u16 v1, v2, v3 op_sel:[0,1] op_sel_hi:[1,0] ; encoding: [0x01,0x10,0x8a,0xd3,0x02,0x07,0x02,0x08]
+
+//
+// Test src2 op_sel/op_sel_hi
+//
+
+v_pk_fma_f16 v8, v0, s0, v1
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x8e,0xd3,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,0] neg_hi:[0,0,0]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x8e,0xd3,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[1,1,1] neg_lo:[0,0,0] neg_hi:[0,0,0]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x8e,0xd3,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[1,1,1]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 ; encoding: [0x08,0x40,0x8e,0xd3,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,0] op_sel_hi:[0,0,0]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 op_sel_hi:[0,0,0] ; encoding: [0x08,0x00,0x8e,0xd3,0x00,0x01,0x04,0x04]
+
+v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 op_sel:[0,0,1] op_sel_hi:[0,0,1] ; encoding: [0x08,0x60,0x8e,0xd3,0x00,0x01,0x04,0x04]
+
+//
+// Test neg_lo/neg_hi
+//
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] ; encoding: [0x08,0x40,0x8e,0xd3,0x00,0x01,0x04,0xfc]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,1,1]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x8e,0xd3,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] neg_hi:[1,1,1]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,1,1] neg_hi:[1,1,1] ; encoding: [0x08,0x47,0x8e,0xd3,0x00,0x01,0x04,0xfc]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,0,0]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x8e,0xd3,0x00,0x01,0x04,0x3c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,1,0]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x8e,0xd3,0x00,0x01,0x04,0x5c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,1]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x8e,0xd3,0x00,0x01,0x04,0x9c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,0,0]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x8e,0xd3,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,1,0]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x8e,0xd3,0x00,0x01,0x04,0x1c]
+
+v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,0,1]
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x8e,0xd3,0x00,0x01,0x04,0x1c]
+
+
+// Test clamp
+v_pk_fma_f16 v8, v0, s0, v1 clamp
+// GFX9: v_pk_fma_f16 v8, v0, s0, v1 clamp ; encoding: [0x08,0xc0,0x8e,0xd3,0x00,0x01,0x04,0x1c]
+
+v_pk_add_u16 v1, v2, v3 clamp
+// GFX9: v_pk_add_u16 v1, v2, v3 clamp ; encoding: [0x01,0x80,0x8a,0xd3,0x02,0x07,0x02,0x18]
+
+v_pk_min_i16 v0, v1, v2 clamp
+// GFX9: v_pk_min_i16 v0, v1, v2 clamp ; encoding: [0x00,0x80,0x88,0xd3,0x01,0x05,0x02,0x18]
+
+//
+// Instruction tests:
+//
+
+v_pk_mul_lo_u16 v0, v1, v2
+// GFX9: v_pk_mul_lo_u16 v0, v1, v2 ; encoding: [0x00,0x00,0x81,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_add_i16 v0, v1, v2
+// GFX9: v_pk_add_i16 v0, v1, v2 ; encoding: [0x00,0x00,0x82,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_sub_i16 v0, v1, v2
+// GFX9: v_pk_sub_i16 v0, v1, v2 ; encoding: [0x00,0x00,0x83,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_lshlrev_b16 v0, v1, v2
+// GFX9: v_pk_lshlrev_b16 v0, v1, v2 ; encoding: [0x00,0x00,0x84,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_lshrrev_b16 v0, v1, v2
+// GFX9: v_pk_lshrrev_b16 v0, v1, v2 ; encoding: [0x00,0x00,0x85,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_ashrrev_i16 v0, v1, v2
+// GFX9: v_pk_ashrrev_i16 v0, v1, v2 ; encoding: [0x00,0x00,0x86,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_max_i16 v0, v1, v2
+// GFX9: v_pk_max_i16 v0, v1, v2 ; encoding: [0x00,0x00,0x87,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_min_i16 v0, v1, v2
+// GFX9: v_pk_min_i16 v0, v1, v2 ; encoding: [0x00,0x00,0x88,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_add_u16 v0, v1, v2
+// GFX9: v_pk_add_u16 v0, v1, v2 ; encoding: [0x00,0x00,0x8a,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_max_u16 v0, v1, v2
+// GFX9: v_pk_max_u16 v0, v1, v2 ; encoding: [0x00,0x00,0x8c,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_min_u16 v0, v1, v2
+// GFX9: v_pk_min_u16 v0, v1, v2 ; encoding: [0x00,0x00,0x8d,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_fma_f16 v0, v1, v2, v3
+// GFX9: v_pk_fma_f16 v0, v1, v2, v3 ; encoding: [0x00,0x40,0x8e,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_pk_add_f16 v0, v1, v2
+// GFX9: v_pk_add_f16 v0, v1, v2 ; encoding: [0x00,0x00,0x8f,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_mul_f16 v0, v1, v2
+// GFX9: v_pk_mul_f16 v0, v1, v2 ; encoding: [0x00,0x00,0x90,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_min_f16 v0, v1, v2
+// GFX9: v_pk_min_f16 v0, v1, v2 ; encoding: [0x00,0x00,0x91,0xd3,0x01,0x05,0x02,0x18]
+
+v_pk_max_f16 v0, v1, v2
+// GFX9: v_pk_max_f16 v0, v1, v2 ; encoding: [0x00,0x00,0x92,0xd3,0x01,0x05,0x02,0x18]
+
+v_mad_mix_f32 v0, v1, v2, v3
+// GFX9: v_mad_mix_f32 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x04]
+
+v_mad_mixlo_f16 v0, v1, v2, v3
+// GFX9: v_mad_mixlo_f16 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa1,0xd3,0x01,0x05,0x0e,0x04]
+
+v_mad_mixhi_f16 v0, v1, v2, v3
+// GFX9: v_mad_mixhi_f16 v0, v1, v2, v3 ; encoding: [0x00,0x00,0xa2,0xd3,0x01,0x05,0x0e,0x04]
+
+
+//
+// Regular source modifiers on non-packed instructions
+//
+
+v_mad_mix_f32 v0, abs(v1), v2, v3
+// GFX9: v_mad_mix_f32 v0, |v1|, v2, v3 ; encoding: [0x00,0x01,0xa0,0xd3,0x01,0x05,0x0e,0x04]
+
+v_mad_mix_f32 v0, v1, abs(v2), v3
+// GFX9: v_mad_mix_f32 v0, v1, |v2|, v3 ; encoding: [0x00,0x02,0xa0,0xd3,0x01,0x05,0x0e,0x04]
+
+v_mad_mix_f32 v0, v1, v2, abs(v3)
+// GFX9: v_mad_mix_f32 v0, v1, v2, |v3| ; encoding: [0x00,0x04,0xa0,0xd3,0x01,0x05,0x0e,0x04]
+
+v_mad_mix_f32 v0, -v1, v2, v3
+// GFX9: v_mad_mix_f32 v0, -v1, v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x24]
+
+v_mad_mix_f32 v0, v1, -v2, v3
+// GFX9: v_mad_mix_f32 v0, v1, -v2, v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x44]
+
+v_mad_mix_f32 v0, v1, v2, -v3
+// GFX9: v_mad_mix_f32 v0, v1, v2, -v3 ; encoding: [0x00,0x00,0xa0,0xd3,0x01,0x05,0x0e,0x84]
+
+v_mad_mix_f32 v0, -abs(v1), v2, v3
+// GFX9: v_mad_mix_f32 v0, -|v1|, v2, v3 ; encoding: [0x00,0x01,0xa0,0xd3,0x01,0x05,0x0e,0x24]
+
+v_mad_mix_f32 v0, v1, -abs(v2), v3
+// GFX9: v_mad_mix_f32 v0, v1, -|v2|, v3 ; encoding: [0x00,0x02,0xa0,0xd3,0x01,0x05,0x0e,0x44]
+
+v_mad_mix_f32 v0, v1, v2, -abs(v3)
+// GFX9: v_mad_mix_f32 v0, v1, v2, -|v3| ; encoding: [0x00,0x04,0xa0,0xd3,0x01,0x05,0x0e,0x84]
+
+v_mad_mixlo_f16 v0, abs(v1), -v2, abs(v3)
+// GFX9: v_mad_mixlo_f16 v0, |v1|, -v2, |v3| ; encoding: [0x00,0x05,0xa1,0xd3,0x01,0x05,0x0e,0x44]
+
+v_mad_mixhi_f16 v0, -v1, abs(v2), -abs(v3)
+// GFX9: v_mad_mixhi_f16 v0, -v1, |v2|, -|v3| ; encoding: [0x00,0x06,0xa2,0xd3,0x01,0x05,0x0e,0xa4]
diff --git a/test/MC/ARM/Inputs/1.s b/test/MC/ARM/Inputs/1.s
new file mode 100644
index 0000000000000000000000000000000000000000..0afcc633f6415663de235fc7579b641261d3c24e
--- /dev/null
+++ b/test/MC/ARM/Inputs/1.s
@@ -0,0 +1,3 @@
+        .section        .foobar,"ax",%progbits
+         nop
+        .word 32
diff --git a/test/MC/ARM/Inputs/2.s b/test/MC/ARM/Inputs/2.s
new file mode 100644
index 0000000000000000000000000000000000000000..0ecdb294ab86f19972c8a11abe189f90c1cb9c6a
--- /dev/null
+++ b/test/MC/ARM/Inputs/2.s
@@ -0,0 +1,3 @@
+        .section        .foobar,"",%progbits
+         nop
+        .word 32
diff --git a/test/MC/ARM/Inputs/3.s b/test/MC/ARM/Inputs/3.s
new file mode 100644
index 0000000000000000000000000000000000000000..09392623fc1030eb89e0913e231c5b7290bf78c6
--- /dev/null
+++ b/test/MC/ARM/Inputs/3.s
@@ -0,0 +1,3 @@
+        .section        .foobar,"aw",%progbits
+         nop
+        .word 32
diff --git a/test/MC/ARM/Inputs/4.s b/test/MC/ARM/Inputs/4.s
new file mode 100644
index 0000000000000000000000000000000000000000..28d8244bb417f3087c6a85905e36a1bfd017a218
--- /dev/null
+++ b/test/MC/ARM/Inputs/4.s
@@ -0,0 +1,2 @@
+        .section        .foobar,"",%progbits
+        .word 32
diff --git a/test/MC/ARM/Inputs/5.s b/test/MC/ARM/Inputs/5.s
new file mode 100644
index 0000000000000000000000000000000000000000..1faef539b1350ddc301460c62e4e7f7a04d0ed24
--- /dev/null
+++ b/test/MC/ARM/Inputs/5.s
@@ -0,0 +1,2 @@
+        .section        .foobar,"aw",%progbits
+        .word 32
diff --git a/test/MC/ARM/Inputs/6.s b/test/MC/ARM/Inputs/6.s
new file mode 100644
index 0000000000000000000000000000000000000000..0fdb9daaf295c08f42df478b4c35bc4f2a26d4f2
--- /dev/null
+++ b/test/MC/ARM/Inputs/6.s
@@ -0,0 +1,12 @@
+        .section        .foo
+        .word 30
+        .word 31
+        .word 32
+        .word 33
+        nop
+        .word 34
+        .word 35
+        .word 36
+        .word 37
+        .word 38
+        nop
diff --git a/test/MC/ARM/Inputs/7.s b/test/MC/ARM/Inputs/7.s
new file mode 100644
index 0000000000000000000000000000000000000000..b92a61ec971f289f87ef168d64bc03b30da42a44
--- /dev/null
+++ b/test/MC/ARM/Inputs/7.s
@@ -0,0 +1,3 @@
+        .section        .foobar,"aw",%progbits
+        .word 32
+        nop
diff --git a/test/MC/ARM/Inputs/attr.s b/test/MC/ARM/Inputs/attr.s
new file mode 100644
index 0000000000000000000000000000000000000000..412cad76842570629d7781a5025b0e6b914274cf
--- /dev/null
+++ b/test/MC/ARM/Inputs/attr.s
@@ -0,0 +1,5 @@
+	.text
+	.syntax unified
+	.eabi_attribute	67, "2.09"	@ Tag_conformance
+	.cpu	arm7tdmi
+	.eabi_attribute	6, 2	@ Tag_CPU_arch
diff --git a/test/MC/ARM/Inputs/ident.s b/test/MC/ARM/Inputs/ident.s
new file mode 100644
index 0000000000000000000000000000000000000000..19d65fcc7e07e37e01f9be76fbf75e3a2d5c8d93
--- /dev/null
+++ b/test/MC/ARM/Inputs/ident.s
@@ -0,0 +1 @@
+	.ident	"LLVM ARM Compiler"
diff --git a/test/MC/ARM/arm-thumb-trustzone.s b/test/MC/ARM/arm-thumb-trustzone.s
index 4fec4b7e982c633364bc3f4e9d01dc7afcbec410..de38c7f15e0924795e039b088567e646a1320c64 100644
--- a/test/MC/ARM/arm-thumb-trustzone.s
+++ b/test/MC/ARM/arm-thumb-trustzone.s
@@ -16,11 +16,11 @@ _func:
 @ SMC
 @------------------------------------------------------------------------------
         smc #0xf
-        ite eq
+        it eq
         smceq #0
 
 @ NOTZ-NOT: smc 	#15
 @ NOTZ-NOT: smceq	#0
 @ TZ: smc	#15                     @ encoding: [0xff,0xf7,0x00,0x80]
-@ TZ: ite	eq                      @ encoding: [0x0c,0xbf]
+@ TZ: it	eq                      @ encoding: [0x08,0xbf]
 @ TZ: smceq	#0                      @ encoding: [0xf0,0xf7,0x00,0x80]
diff --git a/test/MC/ARM/basic-arm-instructions-v8.1a.s b/test/MC/ARM/basic-arm-instructions-v8.1a.s
index 9b764c18448a11e60f149d502026a97c5186b5cf..6193796ffba355aec53f0ea103a88d3af19ac564 100644
--- a/test/MC/ARM/basic-arm-instructions-v8.1a.s
+++ b/test/MC/ARM/basic-arm-instructions-v8.1a.s
@@ -192,10 +192,10 @@
 //CHECK-ERROR: error: too few operands for instruction
 //CHECK-ERROR:  setpan
 //CHECK-ERROR:  ^
-//CHECK-ERROR: error: invalid operand for instruction
+//CHECK-ERROR: error: immediate operand must be in the range [0,1]
 //CHECK-ERROR:  setpan #-1
 //CHECK-ERROR:         ^
-//CHECK-ERROR: error: invalid operand for instruction
+//CHECK-ERROR: error: immediate operand must be in the range [0,1]
 //CHECK-ERROR:  setpan #2
 //CHECK-ERROR:         ^
 
diff --git a/test/MC/ARM/basic-thumb2-instructions.s b/test/MC/ARM/basic-thumb2-instructions.s
index f0319717b9958d5e41ff9999c86c93a783f69e24..af1b6289755eff58073474e2bd87539657629993 100644
--- a/test/MC/ARM/basic-thumb2-instructions.s
+++ b/test/MC/ARM/basic-thumb2-instructions.s
@@ -268,6 +268,11 @@ _func:
         asrs.w r7, #5
         asr.w r12, #21
 
+        asrs  r1, r2, #1
+        itt eq
+        asrseq r1, r2, #1
+        asreq r1, r2, #1
+
 @ CHECK: asr.w	r2, r3, #12             @ encoding: [0x4f,0xea,0x23,0x32]
 @ CHECK: asrs.w	r8, r3, #32             @ encoding: [0x5f,0xea,0x23,0x08]
 @ CHECK: asrs.w	r2, r3, #1              @ encoding: [0x5f,0xea,0x63,0x02]
@@ -279,6 +284,10 @@ _func:
 @ CHECK: asrs.w	r7, r7, #5              @ encoding: [0x5f,0xea,0x67,0x17]
 @ CHECK: asr.w	r12, r12, #21           @ encoding: [0x4f,0xea,0x6c,0x5c]
 
+@ CHECK: asrs   r1, r2, #1              @ encoding: [0x51,0x10]
+@ CHECK: itt    eq                      @ encoding: [0x04,0xbf]
+@ CHECK: asrseq.w r1, r2, #1            @ encoding: [0x5f,0xea,0x62,0x01]
+@ CHECK: asreq  r1, r2, #1              @ encoding: [0x51,0x10]
 
 @------------------------------------------------------------------------------
 @ ASR (register)
@@ -1314,6 +1323,11 @@ _func:
         lsls.w r7, #5
         lsl.w r12, #21
 
+        lsls r1, r2, #1
+        itt eq
+        lslseq r1, r2, #1
+        lsleq r1, r2, #1
+
 @ CHECK: lsl.w	r2, r3, #12             @ encoding: [0x4f,0xea,0x03,0x32]
 @ CHECK: lsls.w	r8, r3, #31             @ encoding: [0x5f,0xea,0xc3,0x78]
 @ CHECK: lsls.w	r2, r3, #1              @ encoding: [0x5f,0xea,0x43,0x02]
@@ -1325,6 +1339,10 @@ _func:
 @ CHECK: lsls.w	r7, r7, #5              @ encoding: [0x5f,0xea,0x47,0x17]
 @ CHECK: lsl.w	r12, r12, #21           @ encoding: [0x4f,0xea,0x4c,0x5c]
 
+@ CHECK: lsls   r1, r2, #1              @ encoding: [0x51,0x00]
+@ CHECK: itt eq                         @ encoding: [0x04,0xbf]
+@ CHECK: lslseq.w r1, r2, #1            @ encoding: [0x5f,0xea,0x42,0x01]
+@ CHECK: lsleq  r1, r2, #1              @ encoding: [0x51,0x00]
 
 @------------------------------------------------------------------------------
 @ LSL (register)
@@ -1352,6 +1370,11 @@ _func:
         lsrs.w r7, #5
         lsr.w r12, #21
 
+        lsrs  r1, r2, #1
+        itt eq
+        lsrseq r1, r2, #1
+        lsreq r1, r2, #1
+
 @ CHECK: lsr.w	r2, r3, #12             @ encoding: [0x4f,0xea,0x13,0x32]
 @ CHECK: lsrs.w	r8, r3, #32             @ encoding: [0x5f,0xea,0x13,0x08]
 @ CHECK: lsrs.w	r2, r3, #1              @ encoding: [0x5f,0xea,0x53,0x02]
@@ -1363,6 +1386,10 @@ _func:
 @ CHECK: lsrs.w	r7, r7, #5              @ encoding: [0x5f,0xea,0x57,0x17]
 @ CHECK: lsr.w	r12, r12, #21           @ encoding: [0x4f,0xea,0x5c,0x5c]
 
+@ CHECK: lsrs   r1, r2, #1              @ encoding: [0x51,0x08]
+@ CHECK: itt    eq                      @ encoding: [0x04,0xbf]
+@ CHECK: lsrseq.w r1, r2, #1            @ encoding: [0x5f,0xea,0x52,0x01]
+@ CHECK: lsreq  r1, r2, #1              @ encoding: [0x51,0x08]
 
 @------------------------------------------------------------------------------
 @ LSR (register)
@@ -3066,13 +3093,15 @@ _func:
 @ SVC
 @------------------------------------------------------------------------------
         svc #0
-        ite eq
+        it eq
         svceq #255
+        it ne
         swine #33
 
 @ CHECK: svc	#0                      @ encoding: [0x00,0xdf]
-@ CHECK: ite	eq                      @ encoding: [0x0c,0xbf]
+@ CHECK: it	eq                      @ encoding: [0x08,0xbf]
 @ CHECK: svceq	#255                    @ encoding: [0xff,0xdf]
+@ CHECK: it	ne                      @ encoding: [0x18,0xbf]
 @ CHECK: svcne	#33                     @ encoding: [0x21,0xdf]
 
 
diff --git a/test/MC/ARM/branch-disassemble.s b/test/MC/ARM/branch-disassemble.s
new file mode 100644
index 0000000000000000000000000000000000000000..4df40e05e8c947e7647d3f6aa0f0b9f7888621b1
--- /dev/null
+++ b/test/MC/ARM/branch-disassemble.s
@@ -0,0 +1,15 @@
+@ RUN: llvm-mc -mcpu=cortex-a9 -triple armv7-arm-none-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-objdump -mcpu=cortex-a9 -triple armv7-arm-none-eabi -d - \
+@ RUN:   | FileCheck %s -check-prefix CHECK-ARM
+
+@ RUN: llvm-mc -mcpu=cortex-m3 -triple thumbv7m-arm-none-eabi -filetype obj -o - %s \
+@ RUN:   | llvm-objdump -mcpu=cortex-m3 -triple thumbv7m-arm-none-eabi -d - \
+@ RUN:   | FileCheck %s -check-prefix CHECK-THUMB
+
+b.w .Lbranch
+@ CHECK-ARM: b #4 <$a.0+0xC>
+@ CHECK-THUMB: b.w #8 <$t.0+0xC>
+adds r0, r1, #42
+adds r1, r2, #42
+.Lbranch:
+movs r2, r3
diff --git a/test/MC/ARM/coff-relocations.s b/test/MC/ARM/coff-relocations.s
index fa2d407bb8f3a4bfecd31b4596b56f1da6b3a56e..c15b99f17f781e2b98c2824edf4440e56aee612b 100644
--- a/test/MC/ARM/coff-relocations.s
+++ b/test/MC/ARM/coff-relocations.s
@@ -14,21 +14,21 @@
 branch24t:
 	b target
 
-@ CHECK-ENCODING-LABEL: branch24t
+@ CHECK-ENCODING-LABEL: branch24t:
 @ CHECK-ENCODING-NEXT: b.w #0
 
 	.thumb_func
 branch20t:
 	bcc target
 
-@ CHECK-ENCODING-LABEL: branch20t
+@ CHECK-ENCODING-LABEL: branch20t:
 @ CHECK-ENCODING-NEXT: blo.w #0
 
 	.thumb_func
 blx23t:
 	bl target
 
-@ CHECK-ENCODING-LABEL: blx23t
+@ CHECK-ENCODING-LABEL: blx23t:
 @ CHECK-ENCODING-NEXT: bl #0
 
 	.thumb_func
@@ -37,7 +37,7 @@ mov32t:
 	movt r0, :upper16:target
 	blx r0
 
-@ CHECK-ENCODING-LABEL: mov32t
+@ CHECK-ENCODING-LABEL: mov32t:
 @ CHECK-ENCODING-NEXT: movw r0, #0
 @ CHECK-ENCODING-NEXT: movt r0, #0
 @ CHECK-ENCODING-NEXT: blx r0
@@ -50,7 +50,7 @@ addr32:
 .Laddr32:
 	.long target
 
-@ CHECK-ENCODING-LABEL: addr32
+@ CHECK-ENCODING-LABEL: addr32:
 @ CHECK-ENCODING-NEXT: ldr r0, [pc, #4]
 @ CHECK-ENCODING-NEXT: bx r0
 @ CHECK-ENCODING-NEXT: trap
@@ -65,7 +65,7 @@ addr32nb:
 .Laddr32nb:
 	.long target(imgrel)
 
-@ CHECK-ENCODING-LABEL: addr32nb
+@ CHECK-ENCODING-LABEL: addr32nb:
 @ CHECK-ENCODING-NEXT: ldr.w r0, [pc, #4]
 @ CHECK-ENCODING-NEXT: bx r0
 @ CHECK-ENCODING-NEXT: trap
@@ -80,7 +80,7 @@ secrel:
 .Lsecrel:
 	.long target(secrel32)
 
-@ CHECK-ENCODING-LABEL: secrel
+@ CHECK-ENCODING-LABEL: secrel:
 @ CHECK-ENCODING-NEXT: ldr.w r0, [pc, #4]
 @ CHECK-ENCODING-NEXT: bx r0
 @ CHECK-ENCODING-NEXT: trap
diff --git a/test/MC/ARM/data-in-code.ll b/test/MC/ARM/data-in-code.ll
index c2194e9179c8228ea98a178ca38d98c7a39b8c94..e579146acfb3f68a06f969e678cb88a083b0584c 100644
--- a/test/MC/ARM/data-in-code.ll
+++ b/test/MC/ARM/data-in-code.ll
@@ -60,23 +60,6 @@ exit:
 ;; ARM-NEXT:     Other:
 ;; ARM-NEXT:     Section: [[MIXED_SECT]]
 
-;; ARM:        Symbol {
-;; ARM:          Name: $d
-;; ARM-NEXT:     Value: 0x0
-;; ARM-NEXT:     Size: 0
-;; ARM-NEXT:     Binding: Local (0x0)
-;; ARM-NEXT:     Type: None (0x0)
-;; ARM-NEXT:     Other: 0
-;; ARM-NEXT:     Section: .ARM.exidx
-;; ARM-NEXT:   }
-
-;; ARM:        Symbol {
-;; ARM:          Name: $d
-;; ARM-NEXT:     Value: 0
-;; ARM-NEXT:     Size: 0
-;; ARM-NEXT:     Binding: Local
-;; ARM-NEXT:     Type: None
-
 ;; ARM-NOT:     ${{[atd]}}
 
 ;; TMB:        Symbol {
diff --git a/test/MC/ARM/diagnostics.s b/test/MC/ARM/diagnostics.s
index a1dd95f7d7fcc9f22a456439b033bf1f7e403720..49299380d062298148cd6bfa20fb66201a0d9f2e 100644
--- a/test/MC/ARM/diagnostics.s
+++ b/test/MC/ARM/diagnostics.s
@@ -93,17 +93,19 @@
         @ Out of range 16-bit immediate on BKPT
         bkpt #65536
 
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,65535]
+@ CHECK-ERRORS:         bkpt #65536
+@ CHECK-ERRORS:              ^
 
         @ Out of range immediates for v8 HLT instruction.
         hlt #65536
         hlt #-1
-@CHECK-ERRORS: error: invalid operand for instruction
+@CHECK-ERRORS: error: immediate operand must be in the range [0,65535]
 @CHECK-ERRORS:         hlt #65536
-@CHECK-ERRORS:              ^
-@CHECK-ERRORS: error: invalid operand for instruction
+@CHECK-ERRORS:             ^
+@CHECK-ERRORS: error: immediate operand must be in the range [0,65535]
 @CHECK-ERRORS:         hlt #-1
-@CHECK-ERRORS:              ^
+@CHECK-ERRORS:             ^
 
         @ Illegal condition code for v8 HLT instruction.
         hlteq #2
@@ -123,10 +125,14 @@
         cdp2  p7, #2, c1, c1, c1, #8
         cdp2  p7, #1, c1, c1, c1, #8
 
-@ CHECK-ERRORS: error: invalid operand for instruction
-@ CHECK-ERRORS: error: invalid operand for instruction
-@ CHECK-ERRORS: error: invalid operand for instruction
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS-V7: error: immediate operand must be in the range [0,7]
+@ CHECK-ERRORS-V7: error: immediate operand must be in the range [0,7]
+@ CHECK-ERRORS-V7: error: immediate operand must be in the range [0,7]
+@ CHECK-ERRORS-V7: error: immediate operand must be in the range [0,7]
+@ CHECK-ERRORS-V8: error: invalid operand for instruction
+@ CHECK-ERRORS-V8: error: invalid operand for instruction
+@ CHECK-ERRORS-V8: error: invalid operand for instruction
+@ CHECK-ERRORS-V8: error: invalid operand for instruction
 
         @ Out of range immediates for DBG
         dbg #-1
@@ -136,6 +142,7 @@
 @ CHECK-ERRORS: error: immediate operand must be in the range [0,15]
 @  Double-check that we're synced up with the right diagnostics.
 @ CHECK-ERRORS: dbg #16
+@ CHECK-ERRORS:     ^
 
         @ Out of range immediate for MCR/MCR2/MCRR/MCRR2
         mcr  p7, #8, r5, c1, c1, #4
@@ -144,10 +151,10 @@
         mcr2  p7, #1, r5, c1, c1, #8
         mcrr  p7, #16, r5, r4, c1
         mcrr2  p7, #16, r5, r4, c1
-@ CHECK-ERRORS: error: invalid operand for instruction
-@ CHECK-ERRORS: error: invalid operand for instruction
-@ CHECK-ERRORS: error: invalid operand for instruction
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,7]
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,7]
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,7]
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,7]
 @ CHECK-ERRORS: error: immediate operand must be in the range [0,15]
 @ CHECK-ERRORS-V7: error: immediate operand must be in the range [0,15]
 @ CHECK-ERRORS-V8: error: invalid operand for instruction
@@ -161,16 +168,20 @@
         @ Out of range immediate for MOV
         movw r9, 0x10000
 @ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS:        movw r9, 0x10000
+@ CHECK-ERRORS:                 ^
 
         @ Invalid 's' bit usage for MOVW
         movs r6, #0xffff
         movwseq r9, #0xffff
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,255]
 @ CHECK-ERRORS: error: instruction 'movw' can not set flags, but 's' suffix specified
 
         @ Out of range immediate for MOVT
         movt r9, 0x10000
 @ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS:        movt r9, 0x10000
+@ CHECK-ERRORS:                 ^
 
         @ Out of range immediates for MRC/MRC2/MRRC/MRRC2
         mrc  p14, #8, r1, c1, c2, #4
@@ -179,10 +190,10 @@
         mrc2  p14, #0, r1, c1, c2, #9
         mrrc  p7, #16, r5, r4, c1
         mrrc2  p7, #17, r5, r4, c1
-@ CHECK-ERRORS: error: invalid operand for instruction
-@ CHECK-ERRORS: error: invalid operand for instruction
-@ CHECK-ERRORS: error: invalid operand for instruction
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,7]
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,7]
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,7]
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,7]
 @ CHECK-ERRORS: error: immediate operand must be in the range [0,15]
 @ CHECK-ERRORS-V7: error: immediate operand must be in the range [0,15]
 @ CHECK-ERRORS-V8: error: invalid operand for instruction
@@ -242,10 +253,10 @@
         ssat    r8, #1, r10, lsl fred
         ssat    r8, #1, r10, lsl #fred
 
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: immediate operand must be in the range [1,32]
 @ CHECK-ERRORS: 	ssat	r8, #0, r10, lsl #8
 @ CHECK-ERRORS: 	    	    ^
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: immediate operand must be in the range [1,32]
 @ CHECK-ERRORS: 	ssat	r8, #33, r10, lsl #8
 @ CHECK-ERRORS: 	    	    ^
 @ CHECK-ERRORS: error: 'lsr' shift amount must be in range [0,31]
@@ -274,10 +285,10 @@
 	ssat16	r2, #0, r7
 	ssat16	r3, #17, r5
 
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: immediate operand must be in the range [1,16]
 @ CHECK-ERRORS: 	ssat16	r2, #0, r7
 @ CHECK-ERRORS: 	      	    ^
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: immediate operand must be in the range [1,16]
 @ CHECK-ERRORS: 	ssat16	r3, #17, r5
 @ CHECK-ERRORS: 	      	    ^
 
@@ -292,7 +303,7 @@
 
         @ Out of range immediate on SVC
         svc #0x1000000
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,0xffffff]
 @ CHECK-ERRORS:   svc #0x1000000
 @ CHECK-ERRORS:       ^
 
@@ -407,7 +418,7 @@
 
         @ Bad CPS instruction format.
         cps f,#1
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,31]
 @ CHECK-ERRORS:         cps f,#1
 @ CHECK-ERRORS:               ^
 
@@ -491,9 +502,12 @@
 foo2:
         mov r0, foo2
         movw r0, foo2
+        movt r0, foo2
 @ CHECK-ERRORS: error: immediate expression for mov requires :lower16: or :upper16
 @ CHECK-ERRORS:                 ^
 @ CHECK-ERRORS: error: immediate expression for mov requires :lower16: or :upper16
+@ CHECK-ERRORS:                  ^
+@ CHECK-ERRORS: error: immediate expression for mov requires :lower16: or :upper16
 @ CHECK-ERRORS:                  ^
 
         str r0, [r0, #4]!
diff --git a/test/MC/ARM/dwarf-asm-multiple-sections.s b/test/MC/ARM/dwarf-asm-multiple-sections.s
index 49550559e9567b50d1f4e441d8819f379721180f..619f4e4c3bff022d1649cb99c9295964fd89f44e 100644
--- a/test/MC/ARM/dwarf-asm-multiple-sections.s
+++ b/test/MC/ARM/dwarf-asm-multiple-sections.s
@@ -1,11 +1,14 @@
+// RUN: llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -dwarf-version 5 -fdebug-compilation-dir=/tmp
+// RUN: llvm-dwarfdump %t | FileCheck -check-prefix DWARF -check-prefix DWARF45 %s
+// RUN: llvm-objdump -r %t | FileCheck -check-prefix RELOC -check-prefix RELOC5 %s
 // RUN: llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -fdebug-compilation-dir=/tmp
-// RUN: llvm-dwarfdump %t | FileCheck -check-prefix DWARF -check-prefix DWARF4 %s
-// RUN: llvm-objdump -r %t | FileCheck -check-prefix RELOC %s
+// RUN: llvm-dwarfdump %t | FileCheck -check-prefix DWARF -check-prefix DWARF45 %s
+// RUN: llvm-objdump -r %t | FileCheck -check-prefix RELOC -check-prefix RELOC4 %s
 // RUN: llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -dwarf-version 3 -fdebug-compilation-dir=/tmp
 // RUN: llvm-dwarfdump %t | FileCheck -check-prefix DWARF -check-prefix DWARF3 %s
 // RUN: llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -dwarf-version 2 2>&1 | FileCheck -check-prefix VERSION %s
 // RUN: not llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -dwarf-version 1 2>&1 | FileCheck -check-prefix DWARF1 %s
-// RUN: not llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -dwarf-version 5 2>&1 | FileCheck -check-prefix DWARF5 %s
+// RUN: not llvm-mc < %s -triple=armv7-linux-gnueabi -filetype=obj -o %t -g -dwarf-version 6 2>&1 | FileCheck -check-prefix DWARF6 %s
   .section .text, "ax"
 a:
   mov r0, r0
@@ -18,9 +21,9 @@ b:
 // DWARF: Abbrev table for offset: 0x00000000
 // DWARF: [1] DW_TAG_compile_unit DW_CHILDREN_yes
 // DWARF3:        DW_AT_stmt_list DW_FORM_data4
-// DWARF4:        DW_AT_stmt_list DW_FORM_sec_offset
+// DWARF45:       DW_AT_stmt_list DW_FORM_sec_offset
 // DWARF3:        DW_AT_ranges    DW_FORM_data4
-// DWARF4:        DW_AT_ranges    DW_FORM_sec_offset
+// DWARF45:       DW_AT_ranges    DW_FORM_sec_offset
 // DWARF:         DW_AT_name      DW_FORM_string
 // DWARF:         DW_AT_comp_dir  DW_FORM_string
 // DWARF:         DW_AT_producer  DW_FORM_string
@@ -29,8 +32,8 @@ b:
 // DWARF: .debug_info contents:
 // DWARF: 0x{{[0-9a-f]+}}: DW_TAG_compile_unit [1]
 // DWARF-NOT: DW_TAG_
-// DWARF3: DW_AT_ranges [DW_FORM_data4]           (0x00000000
-// DWARF4: DW_AT_ranges [DW_FORM_sec_offset]      (0x00000000
+// DWARF3:  DW_AT_ranges [DW_FORM_data4]           (0x00000000
+// DWARF45: DW_AT_ranges [DW_FORM_sec_offset]      (0x00000000
 
 // DWARF: 0x{{[0-9a-f]+}}:   DW_TAG_label [2] *
 // DWARF-NEXT: DW_AT_name [DW_FORM_string]     ("a")
@@ -46,10 +49,10 @@ b:
 
 
 // DWARF: .debug_line contents:
-// DWARF:      0x0000000000000000     11      0      1   0   0  is_stmt
-// DWARF-NEXT: 0x0000000000000004     11      0      1   0   0  is_stmt end_sequence
-// DWARF-NEXT: 0x0000000000000000     15      0      1   0   0  is_stmt
-// DWARF-NEXT: 0x0000000000000004     15      0      1   0   0  is_stmt end_sequence
+// DWARF:      0x0000000000000000     14      0      1   0   0  is_stmt
+// DWARF-NEXT: 0x0000000000000004     14      0      1   0   0  is_stmt end_sequence
+// DWARF-NEXT: 0x0000000000000000     18      0      1   0   0  is_stmt
+// DWARF-NEXT: 0x0000000000000004     18      0      1   0   0  is_stmt end_sequence
 
 
 // DWARF: .debug_ranges contents:
@@ -61,10 +64,14 @@ b:
 
 
 
+// Offsets are different in DWARF v5 due to different header layout.
 // RELOC: RELOCATION RECORDS FOR [.rel.debug_info]:
-// RELOC-NEXT: 00000006 R_ARM_ABS32 .debug_abbrev
-// RELOC-NEXT: 0000000c R_ARM_ABS32 .debug_line
-// RELOC-NEXT: 00000010 R_ARM_ABS32 .debug_ranges
+// RELOC4-NEXT: 00000006 R_ARM_ABS32 .debug_abbrev
+// RELOC4-NEXT: 0000000c R_ARM_ABS32 .debug_line
+// RELOC4-NEXT: 00000010 R_ARM_ABS32 .debug_ranges
+// RELOC5-NEXT: 00000008 R_ARM_ABS32 .debug_abbrev
+// RELOC5-NEXT: 0000000d R_ARM_ABS32 .debug_line
+// RELOC5-NEXT: 00000011 R_ARM_ABS32 .debug_ranges
 // RELOC-NEXT: R_ARM_ABS32 .text
 // RELOC-NEXT: R_ARM_ABS32 foo
 
@@ -81,4 +88,4 @@ b:
 // VERSION: {{.*}} warning: DWARF2 only supports one section per compilation unit
 
 // DWARF1: Dwarf version 1 is not supported.
-// DWARF5: Dwarf version 5 is not supported.
+// DWARF6: Dwarf version 6 is not supported.
diff --git a/test/MC/ARM/error-location-post-layout.s b/test/MC/ARM/error-location-post-layout.s
index 637f5941976c79b086d2a46644b6295c619b3876..dea929e4352e814cce17bee51abe52ad42e874ad 100644
--- a/test/MC/ARM/error-location-post-layout.s
+++ b/test/MC/ARM/error-location-post-layout.s
@@ -1,7 +1,7 @@
 @ RUN: not llvm-mc -triple armv7a--none-eabi -filetype obj < %s -o /dev/null 2>&1 | FileCheck %s
 
   .set v1, -undef
-@ CHECK: <unknown>:0: error: expression could not be evaluated
+@ CHECK: 3:12: error: expression could not be evaluated
 
   .comm common, 4
   .set v3, common
diff --git a/test/MC/ARM/inline-asm-diags.ll b/test/MC/ARM/inline-asm-diags.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f71338215548dabfdd0edf5ca2d322ffdeda6ccc
--- /dev/null
+++ b/test/MC/ARM/inline-asm-diags.ll
@@ -0,0 +1,9 @@
+; RUN: not llc -mtriple=armv7-linux -filetype=obj < %s 2>&1 -o /dev/null | FileCheck %s
+
+module asm ".word 0x10"
+module asm ".word -bar"
+
+; CHECK: <inline asm>:2:{{[0-9]+}}: error: expected relocatable expression
+
+module asm ".word -foo"
+; CHECK: <inline asm>:3:{{[0-9]+}}: error: expected relocatable expression
diff --git a/test/MC/ARM/inline-asm-srcloc.ll b/test/MC/ARM/inline-asm-srcloc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9fb9c5b4ef9da5710e9d49b4524f5799103f3159
--- /dev/null
+++ b/test/MC/ARM/inline-asm-srcloc.ll
@@ -0,0 +1,37 @@
+; RUN: not llc -filetype=obj 2>&1 -o /dev/null < %s | FileCheck %s
+
+; ModuleID = '/scratch/llvm/master/tools/clang/test/Misc/inline-asm-diags.c'
+source_filename = "/scratch/llvm/master/tools/clang/test/Misc/inline-asm-diags.c"
+target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64"
+target triple = "armv7-arm-none-eabi"
+
+; Function Attrs: noinline nounwind
+define void @foo2() #0 {
+entry:
+  call void asm sideeffect " wibble", ""() #1, !srcloc !3
+; CHECK: note: !srcloc = 107
+  ret void
+}
+
+; Function Attrs: noinline nounwind
+define void @foo() #0 {
+entry:
+  call void asm sideeffect " .word -bar", ""() #1, !srcloc !4
+; CHECK: note: !srcloc = 181
+  call void asm sideeffect " .word -foo", ""() #1, !srcloc !5
+; CHECK: note: !srcloc = 257
+  ret void
+}
+
+attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-a8" "target-features"="+dsp,+neon,+strict-align,+vfp3" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind }
+
+!llvm.module.flags = !{!0, !1}
+!llvm.ident = !{!2}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"min_enum_size", i32 4}
+!2 = !{!"clang version 5.0.0 "}
+!3 = !{i32 107}
+!4 = !{i32 181}
+!5 = !{i32 257}
diff --git a/test/MC/ARM/invalid-special-reg.s b/test/MC/ARM/invalid-special-reg.s
new file mode 100644
index 0000000000000000000000000000000000000000..2e39fe6e250aa4dde0e7596d081f9893ea7b8295
--- /dev/null
+++ b/test/MC/ARM/invalid-special-reg.s
@@ -0,0 +1,11 @@
+@ RUN: not llvm-mc -triple armv7a--none-eabi < %s 2>&1 | FileCheck %s
+@ RUN: not llvm-mc -triple thumbv7a--none-eabi < %s 2>&1 | FileCheck %s
+
+  msr apsr_c, r0
+@ CHECK: invalid operand for instruction
+  msr cpsr_w
+@ CHECK: invalid operand for instruction
+  msr cpsr_cc
+@ CHECK: invalid operand for instruction
+  msr xpsr_c
+@ CHECK: invalid operand for instruction
diff --git a/test/MC/ARM/lsl-zero-errors.s b/test/MC/ARM/lsl-zero-errors.s
new file mode 100644
index 0000000000000000000000000000000000000000..845507c069ad87c6b2b142e8875a8f896c14d325
--- /dev/null
+++ b/test/MC/ARM/lsl-zero-errors.s
@@ -0,0 +1,103 @@
+// RUN: not llvm-mc -triple=thumbv7 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV7 %s
+// RUN: not llvm-mc -triple=thumbv8 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV8 %s
+// RUN: llvm-mc -triple=armv7 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-ARM %s
+
+        // lsl #0 is actually mov, so here we check that it behaves the same as
+        // mov with regard to the permitted registers
+
+        // Using PC is invalid in thumb
+        lsl pc, r0, #0
+        lsl r0, pc, #0
+        lsl pc, pc, #0
+        lsls pc, r0, #0
+        lsls r0, pc, #0
+        lsls pc, pc, #0
+
+// CHECK-NONARM: error: instruction requires: arm-mode
+// CHECK-NONARM-NEXT: lsl pc, r0, #0
+// CHECK-NONARM: error: instruction requires: arm-mode
+// CHECK-NONARM-NEXT: lsl r0, pc, #0
+// CHECK-NONARM: error: instruction requires: arm-mode
+// CHECK-NONARM-NEXT: lsl pc, pc, #0
+// CHECK-NONARM: error: instruction requires: arm-mode
+// CHECK-NONARM-NEXT: lsls pc, r0, #0
+// CHECK-NONARM: error: instruction requires: arm-mode
+// CHECK-NONARM-NEXT: lsls r0, pc, #0
+// CHECK-NONARM: error: instruction requires: arm-mode
+// CHECK-NONARM-NEXT: lsls pc, pc, #0
+
+// CHECK-ARM: mov pc, r0                @ encoding: [0x00,0xf0,0xa0,0xe1]
+// CHECK-ARM: mov r0, pc                @ encoding: [0x0f,0x00,0xa0,0xe1]
+// CHECK-ARM: mov pc, pc                @ encoding: [0x0f,0xf0,0xa0,0xe1]
+// CHECK-ARM: movs pc, r0               @ encoding: [0x00,0xf0,0xb0,0xe1]
+// CHECK-ARM: movs r0, pc               @ encoding: [0x0f,0x00,0xb0,0xe1]
+// CHECK-ARM: movs pc, pc               @ encoding: [0x0f,0xf0,0xb0,0xe1]
+
+        mov pc, r0, lsl #0
+        mov r0, pc, lsl #0
+        mov pc, pc, lsl #0
+        movs pc, r0, lsl #0
+        movs r0, pc, lsl #0
+        movs pc, pc, lsl #0
+
+// FIXME: Really the error we should be giving is "requires: arm-mode"
+// CHECK-NONARM: error: invalid operand for instruction
+// CHECK-NONARM-NEXT: mov pc, r0, lsl #0
+// CHECK-NONARM: error: invalid operand for instruction
+// CHECK-NONARM-NEXT: mov r0, pc, lsl #0
+// CHECK-NONARM: error: invalid operand for instruction
+// CHECK-NONARM-NEXT: mov pc, pc, lsl #0
+// CHECK-NONARM: error: invalid operand for instruction
+// CHECK-NONARM-NEXT: movs pc, r0, lsl #0
+// CHECK-NONARM: error: invalid operand for instruction
+// CHECK-NONARM-NEXT: movs r0, pc, lsl #0
+// CHECK-NONARM: error: invalid operand for instruction
+// CHECK-NONARM-NEXT: movs pc, pc, lsl #0
+
+// CHECK-ARM: mov pc, r0                @ encoding: [0x00,0xf0,0xa0,0xe1]
+// CHECK-ARM: mov r0, pc                @ encoding: [0x0f,0x00,0xa0,0xe1]
+// CHECK-ARM: mov pc, pc                @ encoding: [0x0f,0xf0,0xa0,0xe1]
+// CHECK-ARM: movs pc, r0               @ encoding: [0x00,0xf0,0xb0,0xe1]
+// CHECK-ARM: movs r0, pc               @ encoding: [0x0f,0x00,0xb0,0xe1]
+// CHECK-ARM: movs pc, pc               @ encoding: [0x0f,0xf0,0xb0,0xe1]
+
+        // Using SP is invalid before ARMv8 in thumb unless non-flags-setting
+        // and one of the source and destination is not SP
+        lsl sp, sp, #0
+        lsls sp, sp, #0
+        lsls r0, sp, #0
+        lsls sp, r0, #0
+
+// CHECK-THUMBV7: error: instruction variant requires ARMv8 or later
+// CHECK-THUMBV7-NEXT: lsl sp, sp, #0
+// CHECK-THUMBV7: error: instruction variant requires ARMv8 or later
+// CHECK-THUMBV7-NEXT: lsls sp, sp, #0
+// CHECK-THUMBV7: error: instruction variant requires ARMv8 or later
+// CHECK-THUMBV7-NEXT: lsls r0, sp, #0
+// CHECK-THUMBV7: error: instruction variant requires ARMv8 or later
+// CHECK-THUMBV7-NEXT: lsls sp, r0, #0
+
+// CHECK-ARM: mov sp, sp                @ encoding: [0x0d,0xd0,0xa0,0xe1]
+// CHECK-ARM: movs sp, sp               @ encoding: [0x0d,0xd0,0xb0,0xe1]
+// CHECK-ARM: movs r0, sp               @ encoding: [0x0d,0x00,0xb0,0xe1]
+// CHECK-ARM: movs sp, r0               @ encoding: [0x00,0xd0,0xb0,0xe1]
+
+        mov sp, sp, lsl #0
+        movs sp, sp, lsl #0
+        movs r0, sp, lsl #0
+        movs sp, r0, lsl #0
+
+// FIXME: We should consistently have the "requires ARMv8" error here
+// CHECK-THUMBV7: error: invalid operand for instruction
+// CHECK-THUMBV7-NEXT: mov sp, sp, lsl #0
+// CHECK-THUMBV7: error: invalid operand for instruction
+// CHECK-THUMBV7-NEXT: movs sp, sp, lsl #0
+// CHECK-THUMBV7: error: instruction variant requires ARMv8 or later
+// CHECK-THUMBV7-NEXT: movs r0, sp, lsl #0
+// CHECK-THUMBV7: error: invalid operand for instruction
+// CHECK-THUMBV7-NEXT: movs sp, r0, lsl #0
+
+// CHECK-ARM: mov sp, sp                @ encoding: [0x0d,0xd0,0xa0,0xe1]
+// CHECK-ARM: movs sp, sp               @ encoding: [0x0d,0xd0,0xb0,0xe1]
+// CHECK-ARM: movs r0, sp               @ encoding: [0x0d,0x00,0xb0,0xe1]
+// CHECK-ARM: movs sp, r0               @ encoding: [0x00,0xd0,0xb0,0xe1]
diff --git a/test/MC/ARM/lsl-zero.s b/test/MC/ARM/lsl-zero.s
new file mode 100644
index 0000000000000000000000000000000000000000..5d097115448f8b27f135a708cccd4bf4f75d5b37
--- /dev/null
+++ b/test/MC/ARM/lsl-zero.s
@@ -0,0 +1,57 @@
+// RUN: llvm-mc -triple=thumbv7 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV7 %s
+// RUN: llvm-mc -triple=thumbv8 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-NONARM --check-prefix=CHECK-THUMBV8 %s
+// RUN: llvm-mc -triple=armv7 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-ARM %s
+
+        // lsl #0 is actually mov, so here we check that it behaves the same as
+        // mov with regard to the permitted registers and how it behaves in an
+        // IT block.
+
+        // Non-flags-setting with only one of source and destination SP should
+        // be OK
+        lsl sp, r0, #0
+        lsl r0, sp, #0
+
+// CHECK-NONARM: mov.w sp, r0           @ encoding: [0x4f,0xea,0x00,0x0d]
+// CHECK-NONARM: mov.w r0, sp           @ encoding: [0x4f,0xea,0x0d,0x00]
+
+// CHECK-ARM: mov sp, r0                @ encoding: [0x00,0xd0,0xa0,0xe1]
+// CHECK-ARM: mov r0, sp                @ encoding: [0x0d,0x00,0xa0,0xe1]
+
+        //FIXME: pre-ARMv8 we give an error for these instructions
+        //mov sp, r0, lsl #0
+        //mov r0, sp, lsl #0
+
+        // LSL #0 in IT block should select the 32-bit encoding
+        itt eq
+        lsleq  r0, r1, #0
+        lslseq r0, r1, #0
+        itt gt
+        lslgt  r0, r1, #0
+        lslsgt r0, r1, #0
+
+// CHECK-NONARM: moveq.w r0, r1         @ encoding: [0x4f,0xea,0x01,0x00]
+// CHECK-NONARM: movseq.w r0, r1        @ encoding: [0x5f,0xea,0x01,0x00]
+// CHECK-NONARM: movgt.w r0, r1         @ encoding: [0x4f,0xea,0x01,0x00]
+// CHECK-NONARM: movsgt.w r0, r1        @ encoding: [0x5f,0xea,0x01,0x00]
+
+// CHECK-ARM: moveq r0, r1              @ encoding: [0x01,0x00,0xa0,0x01]
+// CHECK-ARM: movseq r0, r1             @ encoding: [0x01,0x00,0xb0,0x01]
+// CHECK-ARM: movgt r0, r1              @ encoding: [0x01,0x00,0xa0,0xc1]
+// CHECK-ARM: movsgt r0, r1             @ encoding: [0x01,0x00,0xb0,0xc1]
+
+        itt eq
+        moveq  r0, r1, lsl #0
+        movseq r0, r1, lsl #0
+        itt gt
+        movgt  r0, r1, lsl #0
+        movsgt r0, r1, lsl #0
+
+// CHECK-NONARM: moveq.w r0, r1         @ encoding: [0x4f,0xea,0x01,0x00]
+// CHECK-NONARM: movseq.w r0, r1        @ encoding: [0x5f,0xea,0x01,0x00]
+// CHECK-NONARM: movgt.w r0, r1         @ encoding: [0x4f,0xea,0x01,0x00]
+// CHECK-NONARM: movsgt.w r0, r1        @ encoding: [0x5f,0xea,0x01,0x00]
+
+// CHECK-ARM: moveq r0, r1              @ encoding: [0x01,0x00,0xa0,0x01]
+// CHECK-ARM: movseq r0, r1             @ encoding: [0x01,0x00,0xb0,0x01]
+// CHECK-ARM: movgt r0, r1              @ encoding: [0x01,0x00,0xa0,0xc1]
+// CHECK-ARM: movsgt r0, r1             @ encoding: [0x01,0x00,0xb0,0xc1]
diff --git a/test/MC/ARM/mappingsymbols.s b/test/MC/ARM/mappingsymbols.s
new file mode 100644
index 0000000000000000000000000000000000000000..fff8e10478106815060e019e5c653d7cf811e19e
--- /dev/null
+++ b/test/MC/ARM/mappingsymbols.s
@@ -0,0 +1,48 @@
+# Check section containing code and data with executable permission for the section.
+@ RUN: llvm-mc -triple armv7-none-linux -filetype=obj -o %t.o %p/Inputs/1.s
+@ RUN: llvm-readobj -elf-output-style=GNU -symbols %t.o | FileCheck %s
+
+# Check section containing code and data with no permissions for the section.
+@ RUN: llvm-mc -triple armv7-none-linux -filetype=obj -o %t.o %p/Inputs/2.s
+@ RUN: llvm-readobj -elf-output-style=GNU -symbols %t.o | FileCheck %s
+
+# Check section containing code and data with read/write permissions for the section.
+@ RUN: llvm-mc -triple armv7-none-linux -filetype=obj -o %t.o %p/Inputs/3.s
+@ RUN: llvm-readobj -elf-output-style=GNU -symbols %t.o | FileCheck %s
+
+# Check section containing data with no permissions for the section.
+@ RUN: llvm-mc -triple armv7-none-linux -filetype=obj -o %t.o %p/Inputs/4.s
+@ RUN: llvm-readobj -elf-output-style=GNU -symbols %t.o | FileCheck %s -check-prefix=MAPPINGSYMBOLS
+
+# Check section containing only data with read/write permissions for the section.
+@ RUN: llvm-mc -triple armv7-none-linux -filetype=obj -o %t.o %p/Inputs/5.s
+@ RUN: llvm-readobj -elf-output-style=GNU -symbols %t.o | FileCheck %s -check-prefix=MAPPINGSYMBOLS
+
+# Check section containing the ident string with no permissions for the section.
+@ RUN: llvm-mc -triple armv7-none-linux -filetype=obj -o %t.o %p/Inputs/ident.s
+@ RUN: llvm-readobj -elf-output-style=GNU -symbols %t.o | FileCheck %s -check-prefix=MAPPINGSYMBOLS
+
+# Check section containing the attributes with no permissions for the section.
+@ RUN: llvm-mc -triple armv7-none-linux -filetype=obj -o %t.o %p/Inputs/attr.s
+@ RUN: llvm-readobj -elf-output-style=GNU -symbols %t.o | FileCheck %s -check-prefix=MAPPINGSYMBOLS
+
+# Check section containing code and data with no permissions for the section.
+# data comes before code.
+@ RUN: llvm-mc -triple armv7-none-linux -filetype=obj -o %t.o %p/Inputs/6.s
+@ RUN: llvm-readobj -elf-output-style=GNU -symbols %t.o | FileCheck %s -check-prefix=MIX
+
+# Check section containing code and data with no permissions for the section.
+# data comes before code.
+@ RUN: llvm-mc -triple armv7-none-linux -filetype=obj -o %t.o %p/Inputs/7.s
+@ RUN: llvm-readobj -elf-output-style=GNU -symbols %t.o | FileCheck %s
+
+#CHECK: $a
+#CHECK: $d
+
+#MIX: $a
+#MIX: $a
+#MIX: $d
+#MIX: $d
+
+#MAPPINGSYMBOLS-NOT: $a
+#MAPPINGSYMBOLS-NOT: $d
diff --git a/test/MC/ARM/multi-section-mapping.s b/test/MC/ARM/multi-section-mapping.s
index 2b1b0efab53c92317b384098d86cf7487f899841..e4b7146e4b0f703a72577f68c7729a33eab19190 100644
--- a/test/MC/ARM/multi-section-mapping.s
+++ b/test/MC/ARM/multi-section-mapping.s
@@ -29,7 +29,6 @@
 
 @ CHECK: 00000000 .text 00000000 $a
 @ CHECK-NEXT: 00000000 .wibble 00000000 $a
-@ CHECK-NEXT: 00000000 .starts_data 00000000 $d
 @ CHECK-NEXT: 00000000 .starts_thumb 00000000 $t
 @ CHECK-NOT: ${{[adt]}}
 
diff --git a/test/MC/ARM/negative-immediates-fail.s b/test/MC/ARM/negative-immediates-fail.s
new file mode 100644
index 0000000000000000000000000000000000000000..dd45e4316389cf6518cc0ec2f7f49172054f5e3d
--- /dev/null
+++ b/test/MC/ARM/negative-immediates-fail.s
@@ -0,0 +1,13 @@
+# RUN: not llvm-mc -triple armv7 %s 2>&1| FileCheck %s
+
+.arm
+
+ADC r0, r1, #0xFFFFFEEE
+# CHECK: error: invalid operand for instruction
+ADC r0, r1, #0xABFEABFF
+# CHECK: error: invalid operand for instruction
+ADC r0, r1, #0xFFFFFE02
+# CHECK: error: invalid operand for instruction
+
+ADD.W r0, r0, #0xFF01FF01
+# CHECK: error: immediate operand must be in the range [0,7]
diff --git a/test/MC/ARM/negative-immediates-thumb1-fail.s b/test/MC/ARM/negative-immediates-thumb1-fail.s
new file mode 100644
index 0000000000000000000000000000000000000000..0e8525ede90312faac380e9c9e864b6629a102ef
--- /dev/null
+++ b/test/MC/ARM/negative-immediates-thumb1-fail.s
@@ -0,0 +1,15 @@
+# RUN: not llvm-mc -triple thumbv7 -mcpu=cortex-m0 %s 2>&1 | FileCheck %s
+
+.thumb
+
+ADDs r1, r0, #0xFFFFFFF5
+# CHECK: error: instruction requires: arm-mode
+
+ADDs r0, #0xFFFFFEFF
+# CHECK: error: immediate operand must be in the range [0,255]
+
+SUBs r1, r0, #0xFFFFFFF5
+# CHECK: error: instruction requires: arm-mode
+
+SUBs r0, #0xFFFFFEFF
+# CHECK: error: immediate operand must be in the range [0,255]
diff --git a/test/MC/ARM/negative-immediates-thumb1.s b/test/MC/ARM/negative-immediates-thumb1.s
new file mode 100644
index 0000000000000000000000000000000000000000..7b6f57b3aae1b0257513a9e6517d527d5b44ab2a
--- /dev/null
+++ b/test/MC/ARM/negative-immediates-thumb1.s
@@ -0,0 +1,19 @@
+# RUN: llvm-mc -triple thumbv7 -mcpu=cortex-m0 %s -show-encoding | FileCheck %s
+# RUN: not llvm-mc -triple thumbv7 -mcpu=cortex-m0 %s -show-encoding -mattr=+no-neg-immediates 2>&1 | FileCheck %s -check-prefix=CHECK-DISABLED
+
+.thumb
+
+	ADDs r1, r0, #0xFFFFFFF9
+# CHECK: subs r1, r0, #7
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+	ADDs r0, #0xFFFFFF01
+# CHECK: subs r0, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+
+	SUBs r0, #0xFFFFFF01
+# CHECK: adds r0, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+
+	SUBs r1, r0, #0xFFFFFFF9
+# CHECK: adds r1, r0, #7
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
diff --git a/test/MC/ARM/negative-immediates.s b/test/MC/ARM/negative-immediates.s
new file mode 100644
index 0000000000000000000000000000000000000000..aa3998163d88312875cab68c21c92b0acb6f2d5f
--- /dev/null
+++ b/test/MC/ARM/negative-immediates.s
@@ -0,0 +1,128 @@
+# RUN: llvm-mc -triple armv7 %s -show-encoding | FileCheck %s
+# RUN: not llvm-mc -triple armv7 %s -show-encoding -mattr=+no-neg-immediates 2>&1 | FileCheck %s -check-prefix=CHECK-DISABLED
+
+.arm
+
+	ADC r0, r1, #0xFFFFFF00
+# CHECK: sbc r0, r1, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: ADC
+	ADC r0, r1, #0xFFFFFE03
+# CHECK: sbc r0, r1, #508
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: ADC
+	ADD r0, r1, #0xFFFFFF01
+# CHECK: sub r0, r1, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: ADD
+	AND r0, r1, #0xFFFFFF00
+# CHECK: bic r0, r1, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: AND
+	BIC r0, r1, #0xFFFFFF00
+# CHECK: and r0, r1, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: BIC
+	CMP r0, #0xFFFFFF01
+# CHECK: cmn r0, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: CMP
+	CMN r0, #0xFFFFFF01
+# CHECK: cmp r0, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: CMN
+	MOV r0, #0xFFFFFF00
+# CHECK: mvn r0, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: MOV
+	MVN r0, #0xFFFFFF00
+# CHECK: mov r0, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: MVN
+	SBC r0, r1, #0xFFFFFF00
+# CHECK: adc r0, r1, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: SBC
+	SUB r0, r1, #0xFFFFFF01
+# CHECK: add r0, r1, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: SUB
+
+.thumb
+
+	ADC r0, r1, #0xFFFFFF00
+# CHECK: sbc r0, r1, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: ADC
+	ADC r0, r1, #0xFFFF00FF
+# CHECK: sbc r0, r1, #65280
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: ADC
+	ADC r0, r1, #0xFFFEFFFE
+# CHECK: sbc r0, r1, #65537 @ encoding: [0x61,0xf1,0x01,0x10]
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: ADC
+	ADC r0, r1, #0xFEFFFEFF
+# CHECK: sbc r0, r1, #16777472 @ encoding: [0x61,0xf1,0x01,0x20]
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: ADC
+	ADD.W r0, r0, #0xFFFFFF01
+# CHECK: sub.w r0, r0, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: ADD.W
+	ADD.W r0, r0, #0xFF01FF02
+# CHECK: sub.w r0, r0, #16646398 @ encoding: [0xa0,0xf1,0xfe,0x10]
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: ADD.W
+	ADDW r0, r1, #0xFFFFFF01
+# CHECK: subw r0, r1, #255 @ encoding: [0xa1,0xf2,0xff,0x00]
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: ADDW
+	ADD.W r0, r1, #0xFFFFFF01
+# CHECK: sub.w r0, r1, #255 @ encoding: [0xa1,0xf1,0xff,0x00]
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: ADD.W
+	AND r0, r1, #0xFFFFFF00
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: AND
+# CHECK: bic r0, r1, #255
+	AND r0, r1, #0xFEFFFEFF
+# CHECK: bic r0, r1, #16777472 @ encoding: [0x21,0xf0,0x01,0x20]
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: AND
+	BIC r0, r1, #0xFFFFFF00
+# CHECK: and r0, r1, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: BIC
+	BIC r0, r1, #0xFEFFFEFF
+# CHECK: and r0, r1, #16777472 @ encoding: [0x01,0xf0,0x01,0x20]
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: BIC
+	CMP r0, #0xFFFFFF01
+# CHECK: cmn.w r0, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: CMP
+	CMN r0, #0xFFFFFF01
+# CHECK: cmp.w r0, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: CMN
+	MOV r0, #0xFFFFFF00
+# CHECK: mvn r0, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: MOV
+	MVN r0, #0xFFFFFF00
+# CHECK: mov.w r0, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: MVN
+	SBC r0, r1, #0xFFFFFF00
+# CHECK: adc r0, r1, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: SBC
+	SUBW r0, r1, #0xFFFFFF01
+# CHECK: addw r0, r1, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: SUBW
+	SUB.W r0, r1, #0xFFFFFF01
+# CHECK: add.w r0, r1, #255
+# CHECK-DISABLED: error: instruction requires: NegativeImmediates
+# CHECK-DISABLED: SUB.W
diff --git a/test/MC/ARM/quad-relocation.s b/test/MC/ARM/quad-relocation.s
index 83d312b0e34fe084f34785c3dfd7b5c7ab7260ab..34de182924e2821a22cb5d5b1f2ee63bb244d599 100644
--- a/test/MC/ARM/quad-relocation.s
+++ b/test/MC/ARM/quad-relocation.s
@@ -4,6 +4,6 @@
 symbol:
   .quad(symbol)
 
-@ CHECK: error: bad relocation fixup type
+@ CHECK: error: unsupported relocation on symbol
 @ CHECK-NEXT:   .quad(symbol)
 @ CHECK-NEXT:        ^
diff --git a/test/MC/ARM/simple-fp-encoding.s b/test/MC/ARM/simple-fp-encoding.s
index 539dd2c4d9761ad58be8e54ce3a87f93f02ffb7e..74babf9a699a6154f2bbdee6d4b151728a2bc834 100644
--- a/test/MC/ARM/simple-fp-encoding.s
+++ b/test/MC/ARM/simple-fp-encoding.s
@@ -38,6 +38,18 @@
 @ CHECK: vnmul.f64 d16, d17, d16     @ encoding: [0xe0,0x0b,0x61,0xee]
 @ CHECK: vnmul.f32 s0, s1, s0        @ encoding: [0xc0,0x0a,0x20,0xee]
 
+        vcmp.f64       d17, d16
+        vcmp.f32       s1, s0
+
+@ CHECK: vcmp.f64  d17, d16        @ encoding: [0x60,0x1b,0xf4,0xee]
+@ CHECK: vcmp.f32  s1, s0          @ encoding: [0x40,0x0a,0xf4,0xee]
+
+        vcmp.f64       d17, #0.0
+        vcmp.f32       s1, #0.0
+
+@ CHECK: vcmp.f64  d17, #0         @ encoding: [0x40,0x1b,0xf5,0xee]
+@ CHECK: vcmp.f32  s1, #0          @ encoding: [0x40,0x0a,0xf5,0xee]
+
         vcmpe.f64       d17, d16
         vcmpe.f32       s1, s0
 
diff --git a/test/MC/ARM/thumb-diagnostics.s b/test/MC/ARM/thumb-diagnostics.s
index ab7c92cf3b901fb7dbbcf14c3157581829958a85..f0a94aa810557e27135fd1ca98c077029e70a14f 100644
--- a/test/MC/ARM/thumb-diagnostics.s
+++ b/test/MC/ARM/thumb-diagnostics.s
@@ -28,7 +28,7 @@
 
 @ Out of range immediates for ASR instruction.
         asrs r2, r3, #33
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,32]
 @ CHECK-ERRORS:         asrs r2, r3, #33
 @ CHECK-ERRORS:                      ^
 
@@ -51,7 +51,7 @@ error: invalid operand for instruction
 @CHECK-ERRORS-V8: error: instruction requires: arm-mode
 @CHECK-ERRORS-V8:         hlt #64
 @CHECK-ERRORS-V8:              ^
-@CHECK-ERRORS: error: invalid operand for instruction
+@CHECK-ERRORS: error: immediate operand must be in the range [0,65535]
 @CHECK-ERRORS:         hlt #-1
 @CHECK-ERRORS:              ^
 
@@ -153,10 +153,10 @@ error: invalid operand for instruction
 @ Out of range immediates for LSL instruction.
         lsls r4, r5, #-1
         lsls r4, r5, #32
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,31]
 @ CHECK-ERRORS:         lsls r4, r5, #-1
 @ CHECK-ERRORS:                      ^
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,31]
 @ CHECK-ERRORS:         lsls r4, r5, #32
 @ CHECK-ERRORS:                      ^
 
@@ -184,7 +184,7 @@ error: invalid operand for instruction
 @ Out of range immediate for SVC instruction.
         svc #-1
         svc #256
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,0xffffff]
 @ CHECK-ERRORS:         svc #-1
 @ CHECK-ERRORS:             ^
 @ CHECK-ERRORS: error: instruction requires: arm-mode
diff --git a/test/MC/ARM/thumb-mov.s b/test/MC/ARM/thumb-mov.s
new file mode 100644
index 0000000000000000000000000000000000000000..0a644ea00bfaa439bd7671eb706fdf0293218750
--- /dev/null
+++ b/test/MC/ARM/thumb-mov.s
@@ -0,0 +1,100 @@
+// RUN: not llvm-mc -triple=thumbv7 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-V7 %s
+// RUN: not llvm-mc -triple=thumbv8 -show-encoding < %s 2>&1 | FileCheck --check-prefix=CHECK --check-prefix=CHECK-V8 %s
+
+        // Tests to check handling of sp and pc in thumb mov instructions. We
+        // have to be careful about the order of things, as stdout/stderr
+        // buffering means the errors appear before the non-error output, so
+        // we have to put all the error checks at the top.
+
+        // First check instructions that are never valid. These are thumb2
+        // instructions that use pc
+
+        // t2MOVr selected because no thumb1 movs that can access high regs
+        movs pc, r0
+        movs r0, pc
+        movs pc, pc
+// CHECK: error: invalid operand for instruction
+// CHECK-NEXT: movs pc, r0
+// CHECK: error: invalid operand for instruction
+// CHECK-NEXT: movs r0, pc
+// CHECK: error: invalid operand for instruction
+// CHECK-NEXT: movs pc, pc
+
+        // mov.w selects t2MOVr
+        mov.w pc, r0
+        mov.w r0, pc
+        mov.w pc, pc
+// CHECK: error: invalid operand for instruction
+// CHECK-NEXT: mov.w pc, r0
+// CHECK: error: invalid operand for instruction
+// CHECK-NEXT: mov.w r0, pc
+// CHECK: error: invalid operand for instruction
+// CHECK-NEXT: mov.w pc, pc
+
+        // movs.w selects t2MOVr
+        movs.w pc, r0
+        movs.w r0, pc
+        movs.w pc, pc
+// CHECK: error: invalid operand for instruction
+// CHECK-NEXT: movs.w pc, r0
+// CHECK: error: invalid operand for instruction
+// CHECK-NEXT: movs.w r0, pc
+// CHECK: error: invalid operand for instruction
+// CHECK-NEXT: movs.w pc, pc
+
+
+        // Now check instructions that are invalid before ARMv8 due to SP usage
+
+        movs sp, r0
+        movs r0, sp
+        movs sp, sp
+// CHECK-V7: error: instruction variant requires ARMv8 or later
+// CHECK-V7-NEXT: movs sp, r0
+// CHECK-V7: error: instruction variant requires ARMv8 or later
+// CHECK-V7-NEXT: movs r0, sp
+// CHECK-V7: error: instruction variant requires ARMv8 or later
+// CHECK-V7-NEXT: movs sp, sp
+// CHECK-V8: movs.w sp, r0            @ encoding: [0x5f,0xea,0x00,0x0d]
+// CHECK-V8: movs.w r0, sp            @ encoding: [0x5f,0xea,0x0d,0x00]
+// CHECK-V8: movs.w sp, sp            @ encoding: [0x5f,0xea,0x0d,0x0d]
+
+        mov.w sp, sp
+// CHECK-V7: error: instruction variant requires ARMv8 or later
+// CHECK-V7-NEXT: mov.w sp, sp
+// CHECK-V8: mov.w sp, sp             @ encoding: [0x4f,0xea,0x0d,0x0d]
+
+        movs.w sp, r0
+        movs.w r0, sp
+        movs.w sp, sp
+// CHECK-V7: error: instruction variant requires ARMv8 or later
+// CHECK-V7-NEXT: movs.w sp, r0
+// CHECK-V7: error: instruction variant requires ARMv8 or later
+// CHECK-V7-NEXT: movs.w r0, sp
+// CHECK-V7: error: instruction variant requires ARMv8 or later
+// CHECK-V7-NEXT: movs.w sp, sp
+// CHECK-V8: movs.w sp, r0            @ encoding: [0x5f,0xea,0x00,0x0d]
+// CHECK-V8: movs.w r0, sp            @ encoding: [0x5f,0xea,0x0d,0x00]
+// CHECK-V8: movs.w sp, sp            @ encoding: [0x5f,0xea,0x0d,0x0d]
+
+
+        // Now instructions that are always valid
+
+        // mov selects tMOVr, where sp and pc are allowed
+        mov sp, r0
+        mov r0, sp
+        mov sp, sp
+        mov pc, r0
+        mov r0, pc
+        mov pc, pc
+// CHECK: mov sp, r0                  @ encoding: [0x85,0x46]
+// CHECK: mov r0, sp                  @ encoding: [0x68,0x46]
+// CHECK: mov sp, sp                  @ encoding: [0xed,0x46]
+// CHECK: mov pc, r0                  @ encoding: [0x87,0x46]
+// CHECK: mov r0, pc                  @ encoding: [0x78,0x46]
+// CHECK: mov pc, pc                  @ encoding: [0xff,0x46]
+
+        // sp allowed in non-flags-setting t2MOVr
+        mov.w sp, r0
+        mov.w r0, sp
+// CHECK: mov.w sp, r0                @ encoding: [0x4f,0xea,0x00,0x0d]
+// CHECK: mov.w r0, sp                @ encoding: [0x4f,0xea,0x0d,0x00]
diff --git a/test/MC/ARM/thumb-not-mclass.s b/test/MC/ARM/thumb-not-mclass.s
index fec545e64b0682853137dfad4e92e781170d7989..a90dc7eefe31ab225c2d48b1e966ccf98d37272f 100644
--- a/test/MC/ARM/thumb-not-mclass.s
+++ b/test/MC/ARM/thumb-not-mclass.s
@@ -22,5 +22,5 @@
         setend be
         setend le
 
-@ CHECK: error: invalid operand for instruction
-@ CHECK: error: invalid operand for instruction
+@ CHECK: error: immediate operand must be in the range [0,1]
+@ CHECK: error: immediate operand must be in the range [0,1]
diff --git a/test/MC/ARM/thumb2-diagnostics.s b/test/MC/ARM/thumb2-diagnostics.s
index 38cc74dee5658b6924ead00025db5ea27c32b0a4..76b4cf12626b57ef7fc71326a2d2dacb68d7b2a3 100644
--- a/test/MC/ARM/thumb2-diagnostics.s
+++ b/test/MC/ARM/thumb2-diagnostics.s
@@ -39,10 +39,10 @@
         mrc2  p14, #0, r1, c1, c2, #9
         mrrc  p7, #16, r5, r4, c1
         mrrc2  p7, #17, r5, r4, c1
-@ CHECK-ERRORS: error: invalid operand for instruction
-@ CHECK-ERRORS: error: invalid operand for instruction
-@ CHECK-ERRORS: error: invalid operand for instruction
-@ CHECK-ERRORS: error: invalid operand for instruction
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,7]
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,7]
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,7]
+@ CHECK-ERRORS: error: immediate operand must be in the range [0,7]
 @ CHECK-ERRORS: error: immediate operand must be in the range [0,15]
 @ CHECK-ERRORS-V7: error: immediate operand must be in the range [0,15]
 @ CHECK-ERRORS-V8: error: invalid operand for instruction
@@ -79,8 +79,7 @@ foo2:
         mov r0, foo2
         movw r0, foo2
         movt r0, foo2
-@ CHECK-ERRORS: error: immediate expression for mov requires :lower16: or :upper16
-@ CHECK-ERRORS:                 ^
+@ CHECK-ERRORS: error: instruction requires: arm-mode
 @ CHECK-ERRORS: error: immediate expression for mov requires :lower16: or :upper16
 @ CHECK-ERRORS:                  ^
 @ CHECK-ERRORS: error: immediate expression for mov requires :lower16: or :upper16
@@ -117,4 +116,10 @@ foo2:
 @ CHECK-ERRORS: error: invalid operand for instruction
 @ CHECK-ERRORS: error: instruction requires: arm-mode
 @ CHECK-ERRORS: error: immediate value expected for vector index
+@ CHECK-ERRORS: error: instruction requires: arm-mode
+
+        @ SWP(B) is an ARM-only instruction
+        swp  r0, r1, [r2]
+        swpb r3, r4, [r5]
+@ CHECK-ERRORS: error: instruction requires: arm-mode
 @ CHECK-ERRORS: error: instruction requires: arm-mode
diff --git a/test/MC/ARM/thumbv8m.s b/test/MC/ARM/thumbv8m.s
index a0830a227a1575a20ac5252570c9996ca1e3930c..9af32ddd4ea1fb6c4d69eba2f913f69282a9fc4a 100644
--- a/test/MC/ARM/thumbv8m.s
+++ b/test/MC/ARM/thumbv8m.s
@@ -4,7 +4,7 @@
 // RUN: not llvm-mc -triple=thumbv8m.main -show-encoding < %s 2>%t \
 // RUN:   | FileCheck --check-prefix=CHECK-MAINLINE --check-prefix=CHECK %s
 // RUN:     FileCheck --check-prefix=UNDEF-MAINLINE --check-prefix=UNDEF < %t %s
-// RUN: not llvm-mc -triple=thumbv8m.main -mattr=+dsp,+t2xtpk -show-encoding < %s 2>%t \
+// RUN: not llvm-mc -triple=thumbv8m.main -mattr=+dsp -show-encoding < %s 2>%t \
 // RUN:   | FileCheck --check-prefix=CHECK-MAINLINE_DSP --check-prefix=CHECK %s
 // RUN:     FileCheck --check-prefix=UNDEF-MAINLINE_DSP --check-prefix=UNDEF < %t %s
 
@@ -18,7 +18,7 @@ mov.w r0, r0
 // UNDEF: target does not support ARM mode
 .arm
 
-// And only +dsp,+t2xtpk has DSP and t2xtpk instructions
+// And only +dsp has DSP instructions
 // UNDEF-BASELINE: error: instruction requires: arm-mode
 // UNDEF-MAINLINE: error: instruction requires: arm-mode
 // UNDEF-MAINLINE_DSP-NOT: error: instruction requires:
diff --git a/test/MC/ARM/udf-arm-diagnostics.s b/test/MC/ARM/udf-arm-diagnostics.s
index 9ec9bf2124f094fd5d25de6b0256210a718f771f..71a1e387eebbc88d86171ed899e4a9e2f5d7b985 100644
--- a/test/MC/ARM/udf-arm-diagnostics.s
+++ b/test/MC/ARM/udf-arm-diagnostics.s
@@ -13,7 +13,7 @@ undefined:
 
 	udf #65536
 
-@ CHECK: error: invalid operand for instruction
+@ CHECK: error: immediate operand must be in the range [0,65535]
 @ CHECK: 	udf #65536
 @ CHECK: 	    ^
 
diff --git a/test/MC/ARM/udf-thumb-2-diagnostics.s b/test/MC/ARM/udf-thumb-2-diagnostics.s
index f8375601a0319f7f1cc48d827c944586742dc74b..f1916446d65dee07f7aec2d0398115b1065cd8dd 100644
--- a/test/MC/ARM/udf-thumb-2-diagnostics.s
+++ b/test/MC/ARM/udf-thumb-2-diagnostics.s
@@ -19,7 +19,7 @@ undefined:
 
 	udf.w #65536
 
-@ CHECK: error: invalid operand for instruction
+@ CHECK: error: immediate operand must be in the range [0,65535]
 @ CHECK: 	udf.w #65536
 @ CHECK: 	      ^
 
diff --git a/test/MC/ARM/unpred-control-flow-in-it-block.s b/test/MC/ARM/unpred-control-flow-in-it-block.s
new file mode 100644
index 0000000000000000000000000000000000000000..885d158d83dd3f17d5e5951a51e29d4c507a75e3
--- /dev/null
+++ b/test/MC/ARM/unpred-control-flow-in-it-block.s
@@ -0,0 +1,57 @@
+@ RUN: not llvm-mc -triple=thumbv7m--none-eabi < %s 2>&1 | FileCheck %s
+
+@ These instructions all write to the PC, so are UNPREDICTABLE if they are in
+@ an IT block, but not the last instruction in the block.
+
+  itttt eq
+  addeq pc, r0
+@ CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must be outside of IT block or the last instruction in an IT block
+  addeq pc, sp, pc
+@ CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must be outside of IT block or the last instruction in an IT block
+  beq.n #.+0x20
+@ CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must be outside of IT block or the last instruction in an IT block
+  nopeq
+  itttt eq
+  beq.w #.+0x20
+@ CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must be outside of IT block or the last instruction in an IT block
+  bleq sym
+@ CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must be outside of IT block or the last instruction in an IT block
+  blxeq r0
+@ CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must be outside of IT block or the last instruction in an IT block
+  nopeq
+  itttt eq
+  bxeq r0
+@ CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must be outside of IT block or the last instruction in an IT block
+  ldmeq r0, {r8, pc}
+@ CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must be outside of IT block or the last instruction in an IT block
+  ldmdbeq r0, {r8, pc}
+@ CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must be outside of IT block or the last instruction in an IT block
+  nopeq
+  itttt eq
+  ldreq pc, [r0, #4]
+@ CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must be outside of IT block or the last instruction in an IT block
+  ldreq pc, [r0, #-4]
+@ CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must be outside of IT block or the last instruction in an IT block
+  ldreq pc, [pc, #4]
+@ CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must be outside of IT block or the last instruction in an IT block
+  nopeq
+  itttt eq
+  ldreq pc, [r0, r1, LSL #1]
+@ CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must be outside of IT block or the last instruction in an IT block
+  moveq pc, r0
+@ CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must be outside of IT block or the last instruction in an IT block
+  popeq {r0, pc}
+@ CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must be outside of IT block or the last instruction in an IT block
+  nopeq
+  itttt eq
+  popeq {r8, pc}
+@ CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must be outside of IT block or the last instruction in an IT block
+  popeq {pc}
+@ CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must be outside of IT block or the last instruction in an IT block
+  tbbeq [r0, r1]
+@ CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must be outside of IT block or the last instruction in an IT block
+  nopeq
+  itt eq
+  tbheq [r0, r1, LSL #1]
+@ CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction must be outside of IT block or the last instruction in an IT block
+  nopeq
diff --git a/test/MC/AsmParser/macro-duplicate-params-names-err.s b/test/MC/AsmParser/macro-duplicate-params-names-err.s
new file mode 100644
index 0000000000000000000000000000000000000000..618cce02abdfdad1eff28708e621267fe84bf10b
--- /dev/null
+++ b/test/MC/AsmParser/macro-duplicate-params-names-err.s
@@ -0,0 +1,7 @@
+// RUN: not llvm-mc %s 2> %t
+// RUN: FileCheck < %t %s
+
+.macro M a a
+.endm
+
+// CHECK: macro 'M' has multiple parameters named 'a'
diff --git a/test/MC/AsmParser/section_names.s b/test/MC/AsmParser/section_names.s
index 3883e15880a58aaa0fe99109faf5e56a8f8120b1..38a5310099d94fd9dd057574ba9138b6f0b06da8 100644
--- a/test/MC/AsmParser/section_names.s
+++ b/test/MC/AsmParser/section_names.s
@@ -8,6 +8,8 @@
 .byte 1
 .section .init_array
 .byte 1
+.section .init_array.42
+.byte 1
 .section .init_array2
 .byte 1
 .section .init_arrayfoo
@@ -30,6 +32,14 @@
 .byte 1
 .section .notefoo
 .byte 1
+.section .bss
+.space 1
+.section .bss.foo
+.space 1
+.section .tbss
+.space 1
+.section .tbss.foo
+.space 1
 # CHECK:        Name: .nobits
 # CHECK-NEXT:   Type: SHT_PROGBITS
 # CHECK:        Name: .nobits2
@@ -38,6 +48,8 @@
 # CHECK-NEXT:   Type: SHT_PROGBITS
 # CHECK:        Name: .init_array
 # CHECK-NEXT:   Type:  SHT_INIT_ARRAY
+# CHECK:        Name: .init_array.42
+# CHECK-NEXT:   Type:  SHT_INIT_ARRAY
 # CHECK:        Name: .init_array2
 # CHECK-NEXT:   Type: SHT_PROGBITS
 # CHECK:        Name: .init_arrayfoo
@@ -60,3 +72,11 @@
 # CHECK-NEXT:   Type: SHT_NOTE
 # CHECK:        Name: .notefoo
 # CHECK-NEXT:   Type: SHT_NOTE
+# CHECK:        Name: .bss
+# CHECK-NEXT:   Type: SHT_NOBITS
+# CHECK:        Name: .bss.foo
+# CHECK-NEXT:   Type: SHT_NOBITS
+# CHECK:        Name: .tbss
+# CHECK-NEXT:   Type: SHT_NOBITS
+# CHECK:        Name: .tbss.foo
+# CHECK-NEXT:   Type: SHT_NOBITS
diff --git a/test/MC/COFF/section-comdat.s b/test/MC/COFF/section-comdat.s
index e7052d8f5ae3f98fd4285205c21076ad428d6be4..7669ffbadc3aee82fc7ed75cdf463b319f7fa682 100644
--- a/test/MC/COFF/section-comdat.s
+++ b/test/MC/COFF/section-comdat.s
@@ -161,7 +161,7 @@ Symbol8:
 // CHECK:   }
 // CHECK:   Symbol {
 // CHECK:     Name: SecName
-// CHECK:     Section: SecName (9)
+// CHECK:     Section: SecName (11)
 // CHECK:     AuxSectionDef {
 // CHECK:       Selection: Associative
 // CHECK:       AssocSection: assocSec (4)
@@ -169,25 +169,25 @@ Symbol8:
 // CHECK:   }
 // CHECK:   Symbol {
 // CHECK:     Name: SecName
-// CHECK:     Section: SecName (10)
+// CHECK:     Section: SecName (9)
 // CHECK:     AuxSectionDef {
 // CHECK:       Selection: Largest
 // CHECK:     }
 // CHECK:   }
 // CHECK:   Symbol {
 // CHECK:     Name: Symbol6
-// CHECK:     Section: SecName (10)
+// CHECK:     Section: SecName (9)
 // CHECK:   }
 // CHECK:   Symbol {
 // CHECK:     Name: SecName
-// CHECK:     Section: SecName (11)
+// CHECK:     Section: SecName (10)
 // CHECK:     AuxSectionDef {
 // CHECK:       Selection: Newest (0x7)
 // CHECK:     }
 // CHECK:   }
 // CHECK:   Symbol {
 // CHECK:     Name: Symbol7
-// CHECK:     Section: SecName (11)
+// CHECK:     Section: SecName (10)
 // CHECK:   }
 // CHECK:   Symbol {
 // CHECK:     Name: assocSec
@@ -199,7 +199,7 @@ Symbol8:
 // CHECK:   }
 // CHECK:   Symbol {
 // CHECK:     Name: Symbol5
-// CHECK:     Section: SecName (9)
+// CHECK:     Section: SecName (11)
 // CHECK:   }
 // CHECK:   Symbol {
 // CHECK:     Name: Symbol8
diff --git a/test/MC/Disassembler/AMDGPU/aperture-regs.ll b/test/MC/Disassembler/AMDGPU/aperture-regs.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5fec281145b3b8b65c02602b58751bb81629430e
--- /dev/null
+++ b/test/MC/Disassembler/AMDGPU/aperture-regs.ll
@@ -0,0 +1,13 @@
+# RUN: llvm-mc -arch=amdgcn -mcpu=gfx900 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX9 %s
+
+# GFX9: v_mov_b32_e32 v1, src_shared_base ; encoding: [0xeb,0x02,0x02,0x7e]
+0xeb 0x02 0x02 0x7e
+
+# GFX9: v_mov_b32_e32 v1, src_shared_limit ; encoding: [0xec,0x02,0x02,0x7e]
+0xec 0x02 0x02 0x7e
+
+# GFX9: v_mov_b32_e32 v1, src_private_base ; encoding: [0xed,0x02,0x02,0x7e]
+0xed 0x02 0x02 0x7e
+
+# GFX9: v_mov_b32_e32 v1, src_private_limit ; encoding: [0xee,0x02,0x02,0x7e]
+0xee 0x02 0x02 0x7e
diff --git a/test/MC/Disassembler/AMDGPU/ds_vi.txt b/test/MC/Disassembler/AMDGPU/ds_vi.txt
index 84d55cd7e63d70fcd2b0f0e9a0cda6f4acaf5e1a..6d910ea5bb580903e34e1816b0239ed2fc6279a5 100644
--- a/test/MC/Disassembler/AMDGPU/ds_vi.txt
+++ b/test/MC/Disassembler/AMDGPU/ds_vi.txt
@@ -81,20 +81,26 @@
 # VI:   ds_max_f32 v2, v4 ; encoding: [0x00,0x00,0x26,0xd8,0x02,0x04,0x00,0x00]
 0x00 0x00 0x26 0xd8 0x02 0x04 0x00 0x00
 
-# VI:   ds_gws_init v2 gds ; encoding: [0x00,0x00,0x33,0xd8,0x02,0x00,0x00,0x00]
-0x00 0x00 0x33 0xd8 0x02 0x00 0x00 0x00
+# VI:   ds_gws_init v2 gds ; encoding: [0x00,0x00,0x33,0xd9,0x00,0x02,0x00,0x00]
+0x00 0x00 0x33 0xd9 0x00 0x02 0x00 0x00
 
-# VI:   ds_gws_sema_v v2 gds ; encoding: [0x00,0x00,0x35,0xd8,0x02,0x00,0x00,0x00]
-0x00 0x00 0x35 0xd8 0x02 0x00 0x00 0x00
+# VI:   ds_gws_init v3 offset:12345 gds ; encoding: [0x39,0x30,0x33,0xd9,0x00,0x03,0x00,0x00]
+0x39 0x30 0x33 0xd9 0x00 0x03 0x00 0x00
 
-# VI:   ds_gws_sema_br v2 gds ; encoding: [0x00,0x00,0x37,0xd8,0x02,0x00,0x00,0x00]
-0x00 0x00 0x37 0xd8 0x02 0x00 0x00 0x00
+# VI:   ds_gws_sema_v gds ; encoding: [0x00,0x00,0x35,0xd9,0x00,0x00,0x00,0x00]
+0x00 0x00 0x35 0xd9 0x00 0x00 0x00 0x00
 
-# VI:   ds_gws_sema_p v2 gds ; encoding: [0x00,0x00,0x39,0xd8,0x02,0x00,0x00,0x00]
-0x00 0x00 0x39 0xd8 0x02 0x00 0x00 0x00
+# VI:   ds_gws_sema_v offset:257 gds    ; encoding: [0x01,0x01,0x35,0xd9,0x00,0x00,0x00,0x00]
+0x01 0x01 0x35 0xd9 0x00 0x00 0x00 0x00
 
-# VI:   ds_gws_barrier v2 gds ; encoding: [0x00,0x00,0x3b,0xd8,0x02,0x00,0x00,0x00]
-0x00 0x00 0x3b 0xd8 0x02 0x00 0x00 0x00
+# VI:   ds_gws_sema_br v2 gds ; encoding: [0x00,0x00,0x37,0xd9,0x00,0x02,0x00,0x00]
+0x00 0x00 0x37 0xd9 0x00 0x02 0x00 0x00
+
+# VI:   ds_gws_sema_p gds ; encoding: [0x00,0x00,0x39,0xd9,0x00,0x00,0x00,0x00]
+0x00 0x00 0x39 0xd9 0x00 0x00 0x00 0x00
+
+# VI:   ds_gws_barrier v2 gds ; encoding: [0x00,0x00,0x3b,0xd9,0x00,0x02,0x00,0x00]
+0x00 0x00 0x3b 0xd9 0x00 0x02 0x00 0x00
 
 # VI:   ds_write_b8 v2, v4 ; encoding: [0x00,0x00,0x3c,0xd8,0x02,0x04,0x00,0x00]
 0x00 0x00 0x3c 0xd8 0x02 0x04 0x00 0x00
diff --git a/test/MC/Disassembler/AMDGPU/mac.txt b/test/MC/Disassembler/AMDGPU/mac.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7f7f952655a401dc09193c7a0d3cb19a30aae480
--- /dev/null
+++ b/test/MC/Disassembler/AMDGPU/mac.txt
@@ -0,0 +1,19 @@
+# RUN: llvm-mc -arch=amdgcn -mcpu=tonga -disassemble -show-encoding < %s | FileCheck %s -check-prefix=VI
+
+# VI: v_mac_f32_e64 v0, v1, v2 mul:2  ; encoding: [0x00,0x00,0x16,0xd1,0x01,0x05,0x02,0x08]
+0x00 0x00 0x16 0xd1 0x01 0x05 0x02 0x08
+
+# VI: v_mac_f32_e64 v0, v1, v2 clamp  ; encoding: [0x00,0x80,0x16,0xd1,0x01,0x05,0x02,0x00]
+0x00 0x80 0x16 0xd1 0x01 0x05 0x02 0x00
+
+# VI: v_mac_f32_e64 v0, v1, v2 clamp mul:2 ; encoding: [0x00,0x80,0x16,0xd1,0x01,0x05,0x02,0x08]
+0x00 0x80 0x16 0xd1 0x01 0x05 0x02 0x08
+
+# VI: v_mac_f16_e64 v0, v1, v2 mul:2  ; encoding: [0x00,0x00,0x23,0xd1,0x01,0x05,0x02,0x08]
+0x00 0x00 0x23 0xd1 0x01 0x05 0x02 0x08
+
+# VI: v_mac_f16_e64 v0, v1, v2 clamp  ; encoding: [0x00,0x80,0x23,0xd1,0x01,0x05,0x02,0x00]
+0x00 0x80 0x23 0xd1 0x01 0x05 0x02 0x00
+
+# VI: v_mac_f16_e64 v0, v1, v2 clamp mul:2 ; encoding: [0x00,0x80,0x23,0xd1,0x01,0x05,0x02,0x08]
+0x00 0x80 0x23 0xd1 0x01 0x05 0x02 0x08
diff --git a/test/MC/Disassembler/AMDGPU/si-support.txt b/test/MC/Disassembler/AMDGPU/si-support.txt
new file mode 100644
index 0000000000000000000000000000000000000000..f3f5ab946eb351df20b62740933e5eaffaa994f9
--- /dev/null
+++ b/test/MC/Disassembler/AMDGPU/si-support.txt
@@ -0,0 +1,4 @@
+# RUN: not llvm-mc -arch=amdgcn -mcpu=tahiti -disassemble < %s 2>&1 | FileCheck %s
+
+# CHECK: LLVM ERROR: Disassembly not yet supported for subtarget
+0x00 0x00 0x00 0x7e
diff --git a/test/MC/Disassembler/AMDGPU/sop1_vi.txt b/test/MC/Disassembler/AMDGPU/sop1_vi.txt
index 49c030594e5784343eea5a3e67193b8bd7479202..749783d3bf896b3b59dbe98660496c9b017c1592 100644
--- a/test/MC/Disassembler/AMDGPU/sop1_vi.txt
+++ b/test/MC/Disassembler/AMDGPU/sop1_vi.txt
@@ -165,7 +165,7 @@
 # VI:   s_movreld_b64 s[2:3], s[4:5] ; encoding: [0x04,0x2d,0x82,0xbe]
 0x04 0x2d 0x82 0xbe
 
-# VI:   s_cbranch_join s[4:5] ; encoding: [0x04,0x2e,0x80,0xbe]
+# VI:   s_cbranch_join s4 ; encoding: [0x04,0x2e,0x80,0xbe]
 0x04 0x2e 0x80 0xbe
 
 # VI:   s_abs_i32 s1, s2 ; encoding: [0x02,0x30,0x81,0xbe]
diff --git a/test/MC/Disassembler/AMDGPU/vop1_gfx9.txt b/test/MC/Disassembler/AMDGPU/vop1_gfx9.txt
new file mode 100644
index 0000000000000000000000000000000000000000..370ba632ebca5917ae4ba5f7528b5c7a1775e4dd
--- /dev/null
+++ b/test/MC/Disassembler/AMDGPU/vop1_gfx9.txt
@@ -0,0 +1,4 @@
+# RUN: llvm-mc -arch=amdgcn -mcpu=gfx901 -disassemble -show-encoding < %s | FileCheck %s -check-prefix=GFX9
+
+# GFX9: v_swap_b32 v1, v2 ; encoding: [0x02,0xa3,0x02,0x7e]
+0x02 0xa3 0x02 0x7e
diff --git a/test/MC/Disassembler/AMDGPU/vop3_vi.txt b/test/MC/Disassembler/AMDGPU/vop3_vi.txt
index d28a231edf2c93acc30fe2dd98ac192f6b18c4c7..c15fbaa1e3a85f77e358a5c3da363d3722acb223 100644
--- a/test/MC/Disassembler/AMDGPU/vop3_vi.txt
+++ b/test/MC/Disassembler/AMDGPU/vop3_vi.txt
@@ -215,3 +215,9 @@
 
 # VI:   v_mad_f32 v9, 0.5, v5, -v8      ; encoding: [0x09,0x00,0xc1,0xd1,0xf0,0x0a,0x22,0x84]
 0x09 0x00 0xc1 0xd1 0xf0 0x0a 0x22 0x84
+
+# VI:   v_ceil_f32_e64 v0, neg(-1)      ; encoding: [0x00,0x00,0x5d,0xd1,0xc1,0x00,0x00,0x20]
+0x00,0x00,0x5d,0xd1,0xc1,0x00,0x00,0x20
+
+# VI:   v_ceil_f32_e64 v0, neg(-1.0)    ; encoding: [0x00,0x00,0x5d,0xd1,0xf3,0x00,0x00,0x20]
+0x00,0x00,0x5d,0xd1,0xf3,0x00,0x00,0x20
diff --git a/test/MC/Disassembler/Hexagon/alu32_alu.txt b/test/MC/Disassembler/Hexagon/alu32_alu.txt
index 26b320ecde00f187183b55558fb0d03c4f771772..e75a9982abd1f35a584996f88c1142a3baab447a 100644
--- a/test/MC/Disassembler/Hexagon/alu32_alu.txt
+++ b/test/MC/Disassembler/Hexagon/alu32_alu.txt
@@ -3,27 +3,27 @@
 
 # Add
 0xf1 0xc3 0x15 0xb0
-# CHECK: r17 = add(r21, #31)
+# CHECK: r17 = add(r21,#31)
 0x11 0xdf 0x15 0xf3
-# CHECK: r17 = add(r21, r31)
+# CHECK: r17 = add(r21,r31)
 0x11 0xdf 0x55 0xf6
-# CHECK: r17 = add(r21, r31):sat
+# CHECK: r17 = add(r21,r31):sat
 
 # And
 0xf1 0xc3 0x15 0x76
-# CHECK: r17 = and(r21, #31)
+# CHECK: r17 = and(r21,#31)
 0xf1 0xc3 0x95 0x76
-# CHECK: r17 = or(r21, #31)
+# CHECK: r17 = or(r21,#31)
 0x11 0xdf 0x15 0xf1
-# CHECK: r17 = and(r21, r31)
+# CHECK: r17 = and(r21,r31)
 0x11 0xdf 0x35 0xf1
-# CHECK: r17 = or(r21, r31)
+# CHECK: r17 = or(r21,r31)
 0x11 0xdf 0x75 0xf1
-# CHECK: r17 = xor(r21, r31)
+# CHECK: r17 = xor(r21,r31)
 0x11 0xd5 0x9f 0xf1
-# CHECK: r17 = and(r21, ~r31)
+# CHECK: r17 = and(r21,~r31)
 0x11 0xd5 0xbf 0xf1
-# CHECK: r17 = or(r21, ~r31)
+# CHECK: r17 = or(r21,~r31)
 
 # Nop
 0x00 0xc0 0x00 0x7f
@@ -31,11 +31,11 @@
 
 # Subtract
 0xb1 0xc2 0x5f 0x76
-# CHECK: r17 = sub(#21, r31)
+# CHECK: r17 = sub(#21,r31)
 0x11 0xdf 0x35 0xf3
-# CHECK: r17 = sub(r31, r21)
+# CHECK: r17 = sub(r31,r21)
 0x11 0xdf 0xd5 0xf6
-# CHECK: r17 = sub(r31, r21):sat
+# CHECK: r17 = sub(r31,r21):sat
 
 # Sign extend
 0x11 0xc0 0xbf 0x70
@@ -57,27 +57,27 @@
 
 # Vector add halfwords
 0x11 0xdf 0x15 0xf6
-# CHECK: r17 = vaddh(r21, r31)
+# CHECK: r17 = vaddh(r21,r31)
 0x11 0xdf 0x35 0xf6
-# CHECK: r17 = vaddh(r21, r31):sat
+# CHECK: r17 = vaddh(r21,r31):sat
 0x11 0xdf 0x75 0xf6
-# CHECK: r17 = vadduh(r21, r31):sat
+# CHECK: r17 = vadduh(r21,r31):sat
 
 # Vector average halfwords
 0x11 0xdf 0x15 0xf7
-# CHECK: r17 = vavgh(r21, r31)
+# CHECK: r17 = vavgh(r21,r31)
 0x11 0xdf 0x35 0xf7
-# CHECK: r17 = vavgh(r21, r31):rnd
+# CHECK: r17 = vavgh(r21,r31):rnd
 0x11 0xdf 0x75 0xf7
-# CHECK: r17 = vnavgh(r31, r21)
+# CHECK: r17 = vnavgh(r31,r21)
 
 # Vector subtract halfwords
 0x11 0xdf 0x95 0xf6
-# CHECK: r17 = vsubh(r31, r21)
+# CHECK: r17 = vsubh(r31,r21)
 0x11 0xdf 0xb5 0xf6
-# CHECK: r17 = vsubh(r31, r21):sat
+# CHECK: r17 = vsubh(r31,r21):sat
 0x11 0xdf 0xf5 0xf6
-# CHECK: r17 = vsubuh(r31, r21):sat
+# CHECK: r17 = vsubuh(r31,r21):sat
 
 # Zero extend
 0x11 0xc0 0xd5 0x70
diff --git a/test/MC/Disassembler/Hexagon/alu32_perm.txt b/test/MC/Disassembler/Hexagon/alu32_perm.txt
index a2953506c599e896cd029e36ac0f2a9bc9f155b5..c4b1ab97963e470bb9ddb9318e2f592e2ac8cbfd 100644
--- a/test/MC/Disassembler/Hexagon/alu32_perm.txt
+++ b/test/MC/Disassembler/Hexagon/alu32_perm.txt
@@ -3,31 +3,31 @@
 
 # Combine words in to doublewords
 0x11 0xdf 0x95 0xf3
-# CHECK: r17 = combine(r31.h, r21.h)
+# CHECK: r17 = combine(r31.h,r21.h)
 0x11 0xdf 0xb5 0xf3
-# CHECK: r17 = combine(r31.h, r21.l)
+# CHECK: r17 = combine(r31.h,r21.l)
 0x11 0xdf 0xd5 0xf3
-# CHECK: r17 = combine(r31.l, r21.h)
+# CHECK: r17 = combine(r31.l,r21.h)
 0x11 0xdf 0xf5 0xf3
-# CHECK: r17 = combine(r31.l, r21.l)
+# CHECK: r17 = combine(r31.l,r21.l)
 0xb0 0xe2 0x0f 0x7c
-# CHECK: r17:16 = combine(#21, #31)
+# CHECK: r17:16 = combine(#21,#31)
 0xb0 0xe2 0x3f 0x73
-# CHECK: r17:16 = combine(#21, r31)
+# CHECK: r17:16 = combine(#21,r31)
 0xf0 0xe3 0x15 0x73
-# CHECK: r17:16 = combine(r21, #31)
+# CHECK: r17:16 = combine(r21,#31)
 0x10 0xdf 0x15 0xf5
-# CHECK: r17:16 = combine(r21, r31)
+# CHECK: r17:16 = combine(r21,r31)
 
 # Mux
 0xf1 0xc3 0x75 0x73
-# CHECK: r17 = mux(p3, r21, #31)
+# CHECK: r17 = mux(p3,r21,#31)
 0xb1 0xc2 0xff 0x73
-# CHECK: r17 = mux(p3, #21, r31)
+# CHECK: r17 = mux(p3,#21,r31)
 0xb1 0xe2 0x8f 0x7b
-# CHECK: r17 = mux(p3, #21, #31)
+# CHECK: r17 = mux(p3,#21,#31)
 0x71 0xdf 0x15 0xf4
-# CHECK: r17 = mux(p3, r21, r31)
+# CHECK: r17 = mux(p3,r21,r31)
 
 # Shift word by 16
 0x11 0xc0 0x15 0x70
@@ -37,4 +37,4 @@
 
 # Pack high and low halfwords
 0x10 0xdf 0x95 0xf5
-# CHECK: r17:16 = packhl(r21, r31)
+# CHECK: r17:16 = packhl(r21,r31)
diff --git a/test/MC/Disassembler/Hexagon/alu32_pred.txt b/test/MC/Disassembler/Hexagon/alu32_pred.txt
index 084b39d8cbf577d1ee3b45296550f459c6d91d0d..b9e111364e6751c236cf85b54a8bee1451dbf8bd 100644
--- a/test/MC/Disassembler/Hexagon/alu32_pred.txt
+++ b/test/MC/Disassembler/Hexagon/alu32_pred.txt
@@ -3,25 +3,25 @@
 
 # Conditional add
 0xf1 0xc3 0x75 0x74
-# CHECK: if (p3) r17 = add(r21, #31)
+# CHECK: if (p3) r17 = add(r21,#31)
 0x03 0x40 0x45 0x85 0xf1 0xe3 0x75 0x74
 # CHECK: p3 = r5
-# CHECK-NEXT: if (p3.new) r17 = add(r21, #31)
+# CHECK-NEXT: if (p3.new) r17 = add(r21,#31)
 0xf1 0xc3 0xf5 0x74
-# CHECK: if (!p3) r17 = add(r21, #31)
+# CHECK: if (!p3) r17 = add(r21,#31)
 0x03 0x40 0x45 0x85 0xf1 0xe3 0xf5 0x74
 # CHECK: p3 = r5
-# CHECK-NEXT: if (!p3.new) r17 = add(r21, #31)
+# CHECK-NEXT: if (!p3.new) r17 = add(r21,#31)
 0x71 0xdf 0x15 0xfb
-# CHECK: if (p3) r17 = add(r21, r31)
+# CHECK: if (p3) r17 = add(r21,r31)
 0x03 0x40 0x45 0x85 0x71 0xff 0x15 0xfb
 # CHECK: p3 = r5
-# CHECK-NEXT: if (p3.new) r17 = add(r21, r31)
+# CHECK-NEXT: if (p3.new) r17 = add(r21,r31)
 0xf1 0xdf 0x15 0xfb
-# CHECK: if (!p3) r17 = add(r21, r31)
+# CHECK: if (!p3) r17 = add(r21,r31)
 0x03 0x40 0x45 0x85 0xf1 0xff 0x15 0xfb
 # CHECK: p3 = r5
-# CHECK-NEXT: if (!p3.new) r17 = add(r21, r31)
+# CHECK-NEXT: if (!p3.new) r17 = add(r21,r31)
 
 # Conditional shift halfword
 0x11 0xe3 0x15 0x70
@@ -47,59 +47,59 @@
 
 # Conditional combine
 0x70 0xdf 0x15 0xfd
-# CHECK: if (p3) r17:16 = combine(r21, r31)
+# CHECK: if (p3) r17:16 = combine(r21,r31)
 0xf0 0xdf 0x15 0xfd
-# CHECK: if (!p3) r17:16 = combine(r21, r31)
+# CHECK: if (!p3) r17:16 = combine(r21,r31)
 0x03 0x40 0x45 0x85 0x70 0xff 0x15 0xfd
 # CHECK: p3 = r5
-# CHECK-NEXT: if (p3.new) r17:16 = combine(r21, r31)
+# CHECK-NEXT: if (p3.new) r17:16 = combine(r21,r31)
 0x03 0x40 0x45 0x85 0xf0 0xff 0x15 0xfd
 # CHECK: p3 = r5
-# CHECK-NEXT: if (!p3.new) r17:16 = combine(r21, r31)
+# CHECK-NEXT: if (!p3.new) r17:16 = combine(r21,r31)
 
 # Conditional logical operations
 0x71 0xdf 0x15 0xf9
-# CHECK: if (p3) r17 = and(r21, r31)
+# CHECK: if (p3) r17 = and(r21,r31)
 0xf1 0xdf 0x15 0xf9
-# CHECK: if (!p3) r17 = and(r21, r31)
+# CHECK: if (!p3) r17 = and(r21,r31)
 0x03 0x40 0x45 0x85 0x71 0xff 0x15 0xf9
 # CHECK: p3 = r5
-# CHECK-NEXT: if (p3.new) r17 = and(r21, r31)
+# CHECK-NEXT: if (p3.new) r17 = and(r21,r31)
 0x03 0x40 0x45 0x85 0xf1 0xff 0x15 0xf9
 # CHECK: p3 = r5
-# CHECK-NEXT: if (!p3.new) r17 = and(r21, r31)
+# CHECK-NEXT: if (!p3.new) r17 = and(r21,r31)
 0x71 0xdf 0x35 0xf9
-# CHECK: if (p3) r17 = or(r21, r31)
+# CHECK: if (p3) r17 = or(r21,r31)
 0xf1 0xdf 0x35 0xf9
-# CHECK: if (!p3) r17 = or(r21, r31)
+# CHECK: if (!p3) r17 = or(r21,r31)
 0x03 0x40 0x45 0x85 0x71 0xff 0x35 0xf9
 # CHECK: p3 = r5
-# CHECK-NEXT: if (p3.new) r17 = or(r21, r31)
+# CHECK-NEXT: if (p3.new) r17 = or(r21,r31)
 0x03 0x40 0x45 0x85 0xf1 0xff 0x35 0xf9
 # CHECK: p3 = r5
-# CHECK-NEXT: if (!p3.new) r17 = or(r21, r31)
+# CHECK-NEXT: if (!p3.new) r17 = or(r21,r31)
 0x71 0xdf 0x75 0xf9
-# CHECK: if (p3) r17 = xor(r21, r31)
+# CHECK: if (p3) r17 = xor(r21,r31)
 0xf1 0xdf 0x75 0xf9
-# CHECK: if (!p3) r17 = xor(r21, r31)
+# CHECK: if (!p3) r17 = xor(r21,r31)
 0x03 0x40 0x45 0x85 0x71 0xff 0x75 0xf9
 # CHECK: p3 = r5
-# CHECK-NEXT: if (p3.new) r17 = xor(r21, r31)
+# CHECK-NEXT: if (p3.new) r17 = xor(r21,r31)
 0x03 0x40 0x45 0x85 0xf1 0xff 0x75 0xf9
 # CHECK: p3 = r5
-# CHECK-NEXT: if (!p3.new) r17 = xor(r21, r31)
+# CHECK-NEXT: if (!p3.new) r17 = xor(r21,r31)
 
 # Conditional subtract
 0x71 0xdf 0x35 0xfb
-# CHECK: if (p3) r17 = sub(r31, r21)
+# CHECK: if (p3) r17 = sub(r31,r21)
 0xf1 0xdf 0x35 0xfb
-# CHECK: if (!p3) r17 = sub(r31, r21)
+# CHECK: if (!p3) r17 = sub(r31,r21)
 0x03 0x40 0x45 0x85 0x71 0xff 0x35 0xfb
 # CHECK: p3 = r5
-# CHECK-NEXT: if (p3.new) r17 = sub(r31, r21)
+# CHECK-NEXT: if (p3.new) r17 = sub(r31,r21)
 0x03 0x40 0x45 0x85 0xf1 0xff 0x35 0xfb
 # CHECK: p3 = r5
-# CHECK-NEXT: if (!p3.new) r17 = sub(r31, r21)
+# CHECK-NEXT: if (!p3.new) r17 = sub(r31,r21)
 
 # Conditional sign extend
 0x11 0xe3 0xb5 0x70
@@ -159,36 +159,36 @@
 
 # Compare
 0xe3 0xc3 0x15 0x75
-# CHECK: p3 = cmp.eq(r21, #31)
+# CHECK: p3 = cmp.eq(r21,#31)
 0xf3 0xc3 0x15 0x75
-# CHECK: p3 = !cmp.eq(r21, #31)
+# CHECK: p3 = !cmp.eq(r21,#31)
 0xe3 0xc3 0x55 0x75
-# CHECK: p3 = cmp.gt(r21, #31)
+# CHECK: p3 = cmp.gt(r21,#31)
 0xf3 0xc3 0x55 0x75
-# CHECK: p3 = !cmp.gt(r21, #31)
+# CHECK: p3 = !cmp.gt(r21,#31)
 0xe3 0xc3 0x95 0x75
-# CHECK: p3 = cmp.gtu(r21, #31)
+# CHECK: p3 = cmp.gtu(r21,#31)
 0xf3 0xc3 0x95 0x75
-# CHECK: p3 = !cmp.gtu(r21, #31)
+# CHECK: p3 = !cmp.gtu(r21,#31)
 0x03 0xdf 0x15 0xf2
-# CHECK: p3 = cmp.eq(r21, r31)
+# CHECK: p3 = cmp.eq(r21,r31)
 0x13 0xdf 0x15 0xf2
-# CHECK: p3 = !cmp.eq(r21, r31)
+# CHECK: p3 = !cmp.eq(r21,r31)
 0x03 0xdf 0x55 0xf2
-# CHECK: p3 = cmp.gt(r21, r31)
+# CHECK: p3 = cmp.gt(r21,r31)
 0x13 0xdf 0x55 0xf2
-# CHECK: p3 = !cmp.gt(r21, r31)
+# CHECK: p3 = !cmp.gt(r21,r31)
 0x03 0xdf 0x75 0xf2
-# CHECK: p3 = cmp.gtu(r21, r31)
+# CHECK: p3 = cmp.gtu(r21,r31)
 0x13 0xdf 0x75 0xf2
-# CHECK: p3 = !cmp.gtu(r21, r31)
+# CHECK: p3 = !cmp.gtu(r21,r31)
 
 # Compare to general register
 0xf1 0xe3 0x55 0x73
-# CHECK: r17 = cmp.eq(r21, #31)
+# CHECK: r17 = cmp.eq(r21,#31)
 0xf1 0xe3 0x75 0x73
-# CHECK: r17 = !cmp.eq(r21, #31)
+# CHECK: r17 = !cmp.eq(r21,#31)
 0x11 0xdf 0x55 0xf3
-# CHECK: r17 = cmp.eq(r21, r31)
+# CHECK: r17 = cmp.eq(r21,r31)
 0x11 0xdf 0x75 0xf3
-# CHECK: r17 = !cmp.eq(r21, r31)
+# CHECK: r17 = !cmp.eq(r21,r31)
diff --git a/test/MC/Disassembler/Hexagon/cr.txt b/test/MC/Disassembler/Hexagon/cr.txt
index 6cf2b5fda39994b1f0eab734b74f1d82f26f4c28..8e505299d966bdb5c77c90c05f9a9a1ba1132204 100644
--- a/test/MC/Disassembler/Hexagon/cr.txt
+++ b/test/MC/Disassembler/Hexagon/cr.txt
@@ -3,9 +3,9 @@
 
 # Corner detection acceleration
 0x93 0xe1 0x12 0x6b
-# CHECK: p3 = !fastcorner9(p2, p1)
+# CHECK: p3 = !fastcorner9(p2,p1)
 0x91 0xe3 0x02 0x6b
-# CHECK: p1 = fastcorner9(p2, p3)
+# CHECK: p1 = fastcorner9(p2,p3)
 
 # Logical reductions on predicates
 0x01 0xc0 0x82 0x6b
@@ -25,7 +25,7 @@
 
 # Add to PC
 0x91 0xca 0x49 0x6a
-# CHECK: r17 = add(pc, #21)
+# CHECK: r17 = add(pc,#21)
 
 # Pipelined loop instructions
 0x08 0xc4 0xb5 0x60
@@ -43,33 +43,33 @@
 
 # Logical operations on predicates
 0x01 0xc3 0x02 0x6b
-# CHECK: p1 = and(p3, p2)
+# CHECK: p1 = and(p3,p2)
 0xc1 0xc3 0x12 0x6b
-# CHECK: p1 = and(p2, and(p3, p3))
+# CHECK: p1 = and(p2,and(p3,p3))
 0x01 0xc3 0x22 0x6b
-# CHECK: p1 = or(p3, p2)
+# CHECK: p1 = or(p3,p2)
 0xc1 0xc3 0x32 0x6b
-# CHECK: p1 = and(p2, or(p3, p3))
+# CHECK: p1 = and(p2,or(p3,p3))
 0x01 0xc3 0x42 0x6b
-# CHECK: p1 = xor(p2, p3)
+# CHECK: p1 = xor(p2,p3)
 0xc1 0xc3 0x52 0x6b
-# CHECK: p1 = or(p2, and(p3, p3))
+# CHECK: p1 = or(p2,and(p3,p3))
 0x01 0xc2 0x63 0x6b
-# CHECK: p1 = and(p2, !p3)
+# CHECK: p1 = and(p2,!p3)
 0xc1 0xc3 0x72 0x6b
-# CHECK: p1 = or(p2, or(p3, p3))
+# CHECK: p1 = or(p2,or(p3,p3))
 0xc1 0xc3 0x92 0x6b
-# CHECK: p1 = and(p2, and(p3, !p3))
+# CHECK: p1 = and(p2,and(p3,!p3))
 0xc1 0xc3 0xb2 0x6b
-# CHECK: p1 = and(p2, or(p3, !p3))
+# CHECK: p1 = and(p2,or(p3,!p3))
 0x01 0xc0 0xc2 0x6b
 # CHECK: p1 = not(p2)
 0xc1 0xc3 0xd2 0x6b
-# CHECK: p1 = or(p2, and(p3, !p3))
+# CHECK: p1 = or(p2,and(p3,!p3))
 0x01 0xc2 0xe3 0x6b
-# CHECK: p1 = or(p2, !p3)
+# CHECK: p1 = or(p2,!p3)
 0xc1 0xc3 0xf2 0x6b
-# CHECK: p1 = or(p2, or(p3, !p3))
+# CHECK: p1 = or(p2,or(p3,!p3))
 
 # User control register transfer
 0x0d 0xc0 0x35 0x62
diff --git a/test/MC/Disassembler/Hexagon/j.txt b/test/MC/Disassembler/Hexagon/j.txt
index 661670e2a6149065cfb005e7b83494382b39aad2..c3d16386393049e07fdca170f5ea88b92a374ed6 100644
--- a/test/MC/Disassembler/Hexagon/j.txt
+++ b/test/MC/Disassembler/Hexagon/j.txt
@@ -15,145 +15,145 @@
 0x00 0xc1 0x89 0x11
 # CHECK: p0 = cmp.gt(r17,#-1); if (p0.new) jump:nt
 0x00 0xc3 0x89 0x11
-# CHECK: p0 = tstbit(r17, #0); if (p0.new) jump:nt
+# CHECK: p0 = tstbit(r17,#0); if (p0.new) jump:nt
 0x00 0xe0 0x89 0x11
 # CHECK: p0 = cmp.eq(r17,#-1); if (p0.new) jump:t
 0x00 0xe1 0x89 0x11
 # CHECK: p0 = cmp.gt(r17,#-1); if (p0.new) jump:t
 0x00 0xe3 0x89 0x11
-# CHECK: p0 = tstbit(r17, #0); if (p0.new) jump:t
+# CHECK: p0 = tstbit(r17,#0); if (p0.new) jump:t
 0x00 0xc0 0xc9 0x11
 # CHECK: p0 = cmp.eq(r17,#-1); if (!p0.new) jump:nt
 0x00 0xc1 0xc9 0x11
 # CHECK: p0 = cmp.gt(r17,#-1); if (!p0.new) jump:nt
 0x00 0xc3 0xc9 0x11
-# CHECK: p0 = tstbit(r17, #0); if (!p0.new) jump:nt
+# CHECK: p0 = tstbit(r17,#0); if (!p0.new) jump:nt
 0x00 0xe0 0xc9 0x11
 # CHECK: p0 = cmp.eq(r17,#-1); if (!p0.new) jump:t
 0x00 0xe1 0xc9 0x11
 # CHECK: p0 = cmp.gt(r17,#-1); if (!p0.new) jump:t
 0x00 0xe3 0xc9 0x11
-# CHECK: p0 = tstbit(r17, #0); if (!p0.new) jump:t
+# CHECK: p0 = tstbit(r17,#0); if (!p0.new) jump:t
 0x00 0xd5 0x09 0x10
-# CHECK: p0 = cmp.eq(r17, #21); if (p0.new) jump:nt
+# CHECK: p0 = cmp.eq(r17,#21); if (p0.new) jump:nt
 0x00 0xf5 0x09 0x10
-# CHECK: p0 = cmp.eq(r17, #21); if (p0.new) jump:t
+# CHECK: p0 = cmp.eq(r17,#21); if (p0.new) jump:t
 0x00 0xd5 0x49 0x10
-# CHECK: p0 = cmp.eq(r17, #21); if (!p0.new) jump:nt
+# CHECK: p0 = cmp.eq(r17,#21); if (!p0.new) jump:nt
 0x00 0xf5 0x49 0x10
-# CHECK: p0 = cmp.eq(r17, #21); if (!p0.new) jump:t
+# CHECK: p0 = cmp.eq(r17,#21); if (!p0.new) jump:t
 0x00 0xd5 0x89 0x10
-# CHECK: p0 = cmp.gt(r17, #21); if (p0.new) jump:nt
+# CHECK: p0 = cmp.gt(r17,#21); if (p0.new) jump:nt
 0x00 0xf5 0x89 0x10
-# CHECK: p0 = cmp.gt(r17, #21); if (p0.new) jump:t
+# CHECK: p0 = cmp.gt(r17,#21); if (p0.new) jump:t
 0x00 0xd5 0xc9 0x10
-# CHECK: p0 = cmp.gt(r17, #21); if (!p0.new) jump:nt
+# CHECK: p0 = cmp.gt(r17,#21); if (!p0.new) jump:nt
 0x00 0xf5 0xc9 0x10
-# CHECK: p0 = cmp.gt(r17, #21); if (!p0.new) jump:t
+# CHECK: p0 = cmp.gt(r17,#21); if (!p0.new) jump:t
 0x00 0xd5 0x09 0x11
-# CHECK: p0 = cmp.gtu(r17, #21); if (p0.new) jump:nt
+# CHECK: p0 = cmp.gtu(r17,#21); if (p0.new) jump:nt
 0x00 0xf5 0x09 0x11
-# CHECK: p0 = cmp.gtu(r17, #21); if (p0.new) jump:t
+# CHECK: p0 = cmp.gtu(r17,#21); if (p0.new) jump:t
 0x00 0xd5 0x49 0x11
-# CHECK: p0 = cmp.gtu(r17, #21); if (!p0.new) jump:nt
+# CHECK: p0 = cmp.gtu(r17,#21); if (!p0.new) jump:nt
 0x00 0xf5 0x49 0x11
-# CHECK: p0 = cmp.gtu(r17, #21); if (!p0.new) jump:t
+# CHECK: p0 = cmp.gtu(r17,#21); if (!p0.new) jump:t
 0x00 0xc0 0x89 0x13
 # CHECK: p1 = cmp.eq(r17,#-1); if (p1.new) jump:nt
 0x00 0xc1 0x89 0x13
 # CHECK: p1 = cmp.gt(r17,#-1); if (p1.new) jump:nt
 0x00 0xc3 0x89 0x13
-# CHECK: p1 = tstbit(r17, #0); if (p1.new) jump:nt
+# CHECK: p1 = tstbit(r17,#0); if (p1.new) jump:nt
 0x00 0xe0 0x89 0x13
 # CHECK: p1 = cmp.eq(r17,#-1); if (p1.new) jump:t
 0x00 0xe1 0x89 0x13
 # CHECK: p1 = cmp.gt(r17,#-1); if (p1.new) jump:t
 0x00 0xe3 0x89 0x13
-# CHECK: p1 = tstbit(r17, #0); if (p1.new) jump:t
+# CHECK: p1 = tstbit(r17,#0); if (p1.new) jump:t
 0x00 0xc0 0xc9 0x13
 # CHECK: p1 = cmp.eq(r17,#-1); if (!p1.new) jump:nt
 0x00 0xc1 0xc9 0x13
 # CHECK: p1 = cmp.gt(r17,#-1); if (!p1.new) jump:nt
 0x00 0xc3 0xc9 0x13
-# CHECK: p1 = tstbit(r17, #0); if (!p1.new) jump:nt
+# CHECK: p1 = tstbit(r17,#0); if (!p1.new) jump:nt
 0x00 0xe0 0xc9 0x13
 # CHECK: p1 = cmp.eq(r17,#-1); if (!p1.new) jump:t
 0x00 0xe1 0xc9 0x13
 # CHECK: p1 = cmp.gt(r17,#-1); if (!p1.new) jump:t
 0x00 0xe3 0xc9 0x13
-# CHECK: p1 = tstbit(r17, #0); if (!p1.new) jump:t
+# CHECK: p1 = tstbit(r17,#0); if (!p1.new) jump:t
 0x00 0xd5 0x09 0x12
-# CHECK: p1 = cmp.eq(r17, #21); if (p1.new) jump:nt
+# CHECK: p1 = cmp.eq(r17,#21); if (p1.new) jump:nt
 0x00 0xf5 0x09 0x12
-# CHECK: p1 = cmp.eq(r17, #21); if (p1.new) jump:t
+# CHECK: p1 = cmp.eq(r17,#21); if (p1.new) jump:t
 0x00 0xd5 0x49 0x12
-# CHECK: p1 = cmp.eq(r17, #21); if (!p1.new) jump:nt
+# CHECK: p1 = cmp.eq(r17,#21); if (!p1.new) jump:nt
 0x00 0xf5 0x49 0x12
-# CHECK: p1 = cmp.eq(r17, #21); if (!p1.new) jump:t
+# CHECK: p1 = cmp.eq(r17,#21); if (!p1.new) jump:t
 0x00 0xd5 0x89 0x12
-# CHECK: p1 = cmp.gt(r17, #21); if (p1.new) jump:nt
+# CHECK: p1 = cmp.gt(r17,#21); if (p1.new) jump:nt
 0x00 0xf5 0x89 0x12
-# CHECK: p1 = cmp.gt(r17, #21); if (p1.new) jump:t
+# CHECK: p1 = cmp.gt(r17,#21); if (p1.new) jump:t
 0x00 0xd5 0xc9 0x12
-# CHECK: p1 = cmp.gt(r17, #21); if (!p1.new) jump:nt
+# CHECK: p1 = cmp.gt(r17,#21); if (!p1.new) jump:nt
 0x00 0xf5 0xc9 0x12
-# CHECK: p1 = cmp.gt(r17, #21); if (!p1.new) jump:t
+# CHECK: p1 = cmp.gt(r17,#21); if (!p1.new) jump:t
 0x00 0xd5 0x09 0x13
-# CHECK: p1 = cmp.gtu(r17, #21); if (p1.new) jump:nt
+# CHECK: p1 = cmp.gtu(r17,#21); if (p1.new) jump:nt
 0x00 0xf5 0x09 0x13
-# CHECK: p1 = cmp.gtu(r17, #21); if (p1.new) jump:t
+# CHECK: p1 = cmp.gtu(r17,#21); if (p1.new) jump:t
 0x00 0xd5 0x49 0x13
-# CHECK: p1 = cmp.gtu(r17, #21); if (!p1.new) jump:nt
+# CHECK: p1 = cmp.gtu(r17,#21); if (!p1.new) jump:nt
 0x00 0xf5 0x49 0x13
-# CHECK: p1 = cmp.gtu(r17, #21); if (!p1.new) jump:t
+# CHECK: p1 = cmp.gtu(r17,#21); if (!p1.new) jump:t
 0x00 0xcd 0x09 0x14
-# CHECK: p0 = cmp.eq(r17, r21); if (p0.new) jump:nt
+# CHECK: p0 = cmp.eq(r17,r21); if (p0.new) jump:nt
 0x00 0xdd 0x09 0x14
-# CHECK: p1 = cmp.eq(r17, r21); if (p1.new) jump:nt
+# CHECK: p1 = cmp.eq(r17,r21); if (p1.new) jump:nt
 0x00 0xed 0x09 0x14
-# CHECK: p0 = cmp.eq(r17, r21); if (p0.new) jump:t
+# CHECK: p0 = cmp.eq(r17,r21); if (p0.new) jump:t
 0x00 0xfd 0x09 0x14
-# CHECK: p1 = cmp.eq(r17, r21); if (p1.new) jump:t
+# CHECK: p1 = cmp.eq(r17,r21); if (p1.new) jump:t
 0x00 0xcd 0x49 0x14
-# CHECK: p0 = cmp.eq(r17, r21); if (!p0.new) jump:nt
+# CHECK: p0 = cmp.eq(r17,r21); if (!p0.new) jump:nt
 0x00 0xdd 0x49 0x14
-# CHECK: p1 = cmp.eq(r17, r21); if (!p1.new) jump:nt
+# CHECK: p1 = cmp.eq(r17,r21); if (!p1.new) jump:nt
 0x00 0xed 0x49 0x14
-# CHECK: p0 = cmp.eq(r17, r21); if (!p0.new) jump:t
+# CHECK: p0 = cmp.eq(r17,r21); if (!p0.new) jump:t
 0x00 0xfd 0x49 0x14
-# CHECK: p1 = cmp.eq(r17, r21); if (!p1.new) jump:t
+# CHECK: p1 = cmp.eq(r17,r21); if (!p1.new) jump:t
 0x00 0xcd 0x89 0x14
-# CHECK: p0 = cmp.gt(r17, r21); if (p0.new) jump:nt
+# CHECK: p0 = cmp.gt(r17,r21); if (p0.new) jump:nt
 0x00 0xdd 0x89 0x14
-# CHECK: p1 = cmp.gt(r17, r21); if (p1.new) jump:nt
+# CHECK: p1 = cmp.gt(r17,r21); if (p1.new) jump:nt
 0x00 0xed 0x89 0x14
-# CHECK: p0 = cmp.gt(r17, r21); if (p0.new) jump:t
+# CHECK: p0 = cmp.gt(r17,r21); if (p0.new) jump:t
 0x00 0xfd 0x89 0x14
-# CHECK: p1 = cmp.gt(r17, r21); if (p1.new) jump:t
+# CHECK: p1 = cmp.gt(r17,r21); if (p1.new) jump:t
 0x00 0xcd 0xc9 0x14
-# CHECK: p0 = cmp.gt(r17, r21); if (!p0.new) jump:nt
+# CHECK: p0 = cmp.gt(r17,r21); if (!p0.new) jump:nt
 0x00 0xdd 0xc9 0x14
-# CHECK: p1 = cmp.gt(r17, r21); if (!p1.new) jump:nt
+# CHECK: p1 = cmp.gt(r17,r21); if (!p1.new) jump:nt
 0x00 0xed 0xc9 0x14
-# CHECK: p0 = cmp.gt(r17, r21); if (!p0.new) jump:t
+# CHECK: p0 = cmp.gt(r17,r21); if (!p0.new) jump:t
 0x00 0xfd 0xc9 0x14
-# CHECK: p1 = cmp.gt(r17, r21); if (!p1.new) jump:t
+# CHECK: p1 = cmp.gt(r17,r21); if (!p1.new) jump:t
 0x00 0xcd 0x09 0x15
-# CHECK: p0 = cmp.gtu(r17, r21); if (p0.new) jump:nt
+# CHECK: p0 = cmp.gtu(r17,r21); if (p0.new) jump:nt
 0x00 0xdd 0x09 0x15
-# CHECK: p1 = cmp.gtu(r17, r21); if (p1.new) jump:nt
+# CHECK: p1 = cmp.gtu(r17,r21); if (p1.new) jump:nt
 0x00 0xed 0x09 0x15
-# CHECK: p0 = cmp.gtu(r17, r21); if (p0.new) jump:t
+# CHECK: p0 = cmp.gtu(r17,r21); if (p0.new) jump:t
 0x00 0xfd 0x09 0x15
-# CHECK: p1 = cmp.gtu(r17, r21); if (p1.new) jump:t
+# CHECK: p1 = cmp.gtu(r17,r21); if (p1.new) jump:t
 0x00 0xcd 0x49 0x15
-# CHECK: p0 = cmp.gtu(r17, r21); if (!p0.new) jump:nt
+# CHECK: p0 = cmp.gtu(r17,r21); if (!p0.new) jump:nt
 0x00 0xdd 0x49 0x15
-# CHECK: p1 = cmp.gtu(r17, r21); if (!p1.new) jump:nt
+# CHECK: p1 = cmp.gtu(r17,r21); if (!p1.new) jump:nt
 0x00 0xed 0x49 0x15
-# CHECK: p0 = cmp.gtu(r17, r21); if (!p0.new) jump:t
+# CHECK: p0 = cmp.gtu(r17,r21); if (!p0.new) jump:t
 0x00 0xfd 0x49 0x15
-# CHECK: p1 = cmp.gtu(r17, r21); if (!p1.new) jump:t
+# CHECK: p1 = cmp.gtu(r17,r21); if (!p1.new) jump:t
 
 # Jump to address
 0x22 0xc0 0x00 0x58
diff --git a/test/MC/Disassembler/Hexagon/ld.txt b/test/MC/Disassembler/Hexagon/ld.txt
index 56e00fd94f5629b3c56232d4db638740815054bb..66e014fea59f69a9740c4ba82fb50abf844fdecb 100644
--- a/test/MC/Disassembler/Hexagon/ld.txt
+++ b/test/MC/Disassembler/Hexagon/ld.txt
@@ -3,25 +3,25 @@
 
 # Load doubleword
 0x90 0xff 0xd5 0x3a
-# CHECK: r17:16 = memd(r21 + r31<<#3)
+# CHECK: r17:16 = memd(r21+r31<<#3)
 0xb0 0xc2 0xc0 0x49
-# CHECK: r17:16 = memd(#168)
+# CHECK: r17:16 = memd(gp+#168)
 0x02 0x40 0x00 0x00 0x10 0xc5 0xc0 0x49
 # CHECK: r17:16 = memd(##168)
 0xd0 0xc0 0xd5 0x91
-# CHECK: r17:16 = memd(r21 + #48)
+# CHECK: r17:16 = memd(r21+#48)
 0xb0 0xe0 0xd5 0x99
-# CHECK: r17:16 = memd(r21 ++ #40:circ(m1))
+# CHECK: r17:16 = memd(r21++#40:circ(m1))
 0x10 0xe2 0xd5 0x99
-# CHECK: r17:16 = memd(r21 ++ I:circ(m1))
+# CHECK: r17:16 = memd(r21++I:circ(m1))
 0x00 0x40 0x00 0x00 0x70 0xd7 0xd5 0x9b
-# CHECK: r17:16 = memd(r21 = ##31)
+# CHECK: r17:16 = memd(r21=##31)
 0xb0 0xc0 0xd5 0x9b
 # CHECK: r17:16 = memd(r21++#40)
 0x10 0xe0 0xd5 0x9d
 # CHECK: r17:16 = memd(r21++m1)
 0x10 0xe0 0xd5 0x9f
-# CHECK: r17:16 = memd(r21 ++ m1:brev)
+# CHECK: r17:16 = memd(r21++m1:brev)
 
 # Load doubleword conditionally
 0xf0 0xff 0xd5 0x30
@@ -35,15 +35,15 @@
 # CHECK: p3 = r5
 # CHECK-NEXT: if (!p3.new) r17:16 = memd(r21+r31<<#3)
 0x70 0xd8 0xd5 0x41
-# CHECK: if (p3) r17:16 = memd(r21 + #24)
+# CHECK: if (p3) r17:16 = memd(r21+#24)
 0x03 0x40 0x45 0x85 0x70 0xd8 0xd5 0x43
 # CHECK: p3 = r5
-# CHECK-NEXT: if (p3.new) r17:16 = memd(r21 + #24)
+# CHECK-NEXT: if (p3.new) r17:16 = memd(r21+#24)
 0x70 0xd8 0xd5 0x45
-# CHECK: if (!p3) r17:16 = memd(r21 + #24)
+# CHECK: if (!p3) r17:16 = memd(r21+#24)
 0x03 0x40 0x45 0x85 0x70 0xd8 0xd5 0x47
 # CHECK: p3 = r5
-# CHECK-NEXT: if (!p3.new) r17:16 = memd(r21 + #24)
+# CHECK-NEXT: if (!p3.new) r17:16 = memd(r21+#24)
 0xb0 0xe6 0xd5 0x9b
 # CHECK: if (p3) r17:16 = memd(r21++#40)
 0xb0 0xee 0xd5 0x9b
@@ -57,25 +57,25 @@
 
 # Load byte
 0x91 0xff 0x15 0x3a
-# CHECK: r17 = memb(r21 + r31<<#3)
+# CHECK: r17 = memb(r21+r31<<#3)
 0xb1 0xc2 0x00 0x49
-# CHECK: r17 = memb(#21)
+# CHECK: r17 = memb(gp+#21)
 0x00 0x40 0x00 0x00 0xb1 0xc2 0x00 0x49
 # CHECK: r17 = memb(##21)
 0xf1 0xc3 0x15 0x91
-# CHECK: r17 = memb(r21 + #31)
+# CHECK: r17 = memb(r21+#31)
 0xb1 0xe0 0x15 0x99
-# CHECK: r17 = memb(r21 ++ #5:circ(m1))
+# CHECK: r17 = memb(r21++#5:circ(m1))
 0x11 0xe2 0x15 0x99
-# CHECK: r17 = memb(r21 ++ I:circ(m1))
+# CHECK: r17 = memb(r21++I:circ(m1))
 0x00 0x40 0x00 0x00 0x71 0xd7 0x15 0x9b
-# CHECK: r17 = memb(r21 = ##31)
+# CHECK: r17 = memb(r21=##31)
 0xb1 0xc0 0x15 0x9b
 # CHECK: r17 = memb(r21++#5)
 0x11 0xe0 0x15 0x9d
 # CHECK: r17 = memb(r21++m1)
 0x11 0xe0 0x15 0x9f
-# CHECK: r17 = memb(r21 ++ m1:brev)
+# CHECK: r17 = memb(r21++m1:brev)
 
 # Load byte conditionally
 0xf1 0xff 0x15 0x30
@@ -89,15 +89,15 @@
 # CHECK: p3 = r5
 # CHECK-NEXT: if (!p3.new) r17 = memb(r21+r31<<#3)
 0x91 0xdd 0x15 0x41
-# CHECK: if (p3) r17 = memb(r21 + #44)
+# CHECK: if (p3) r17 = memb(r21+#44)
 0x03 0x40 0x45 0x85 0x91 0xdd 0x15 0x43
 # CHECK: p3 = r5
-# CHECK-NEXT: if (p3.new) r17 = memb(r21 + #44)
+# CHECK-NEXT: if (p3.new) r17 = memb(r21+#44)
 0x91 0xdd 0x15 0x45
-# CHECK: if (!p3) r17 = memb(r21 + #44)
+# CHECK: if (!p3) r17 = memb(r21+#44)
 0x03 0x40 0x45 0x85 0x91 0xdd 0x15 0x47
 # CHECK: p3 = r5
-# CHECK-NEXT: if (!p3.new) r17 = memb(r21 + #44)
+# CHECK-NEXT: if (!p3.new) r17 = memb(r21+#44)
 0xb1 0xe6 0x15 0x9b
 # CHECK: if (p3) r17 = memb(r21++#5)
 0xb1 0xee 0x15 0x9b
@@ -111,41 +111,41 @@
 
 # Load byte into shifted vector
 0xf0 0xc3 0x95 0x90
-# CHECK: r17:16 = memb_fifo(r21 + #31)
+# CHECK: r17:16 = memb_fifo(r21+#31)
 0xb0 0xe0 0x95 0x98
-# CHECK: r17:16 = memb_fifo(r21 ++ #5:circ(m1))
+# CHECK: r17:16 = memb_fifo(r21++#5:circ(m1))
 0x10 0xe2 0x95 0x98
-# CHECK: r17:16 = memb_fifo(r21 ++ I:circ(m1))
+# CHECK: r17:16 = memb_fifo(r21++I:circ(m1))
 
 # Load half into shifted vector
 0xf0 0xc3 0x55 0x90
-# CHECK: r17:16 = memh_fifo(r21 + #62)
+# CHECK: r17:16 = memh_fifo(r21+#62)
 0xb0 0xe0 0x55 0x98
-# CHECK: r17:16 = memh_fifo(r21 ++ #10:circ(m1))
+# CHECK: r17:16 = memh_fifo(r21++#10:circ(m1))
 0x10 0xe2 0x55 0x98
-# CHECK: r17:16 = memh_fifo(r21 ++ I:circ(m1))
+# CHECK: r17:16 = memh_fifo(r21++I:circ(m1))
 
 # Load halfword
 0x91 0xff 0x55 0x3a
-# CHECK: r17 = memh(r21 + r31<<#3)
+# CHECK: r17 = memh(r21+r31<<#3)
 0xb1 0xc2 0x40 0x49
-# CHECK: r17 = memh(#42)
+# CHECK: r17 = memh(gp+#42)
 0x00 0x40 0x00 0x00 0x51 0xc5 0x40 0x49
 # CHECK: r17 = memh(##42)
 0xf1 0xc3 0x55 0x91
-# CHECK: r17 = memh(r21 + #62)
+# CHECK: r17 = memh(r21+#62)
 0xb1 0xe0 0x55 0x99
-# CHECK: r17 = memh(r21 ++ #10:circ(m1))
+# CHECK: r17 = memh(r21++#10:circ(m1))
 0x11 0xe2 0x55 0x99
-# CHECK: r17 = memh(r21 ++ I:circ(m1))
+# CHECK: r17 = memh(r21++I:circ(m1))
 0x00 0x40 0x00 0x00 0x71 0xd7 0x55 0x9b
-# CHECK: r17 = memh(r21 = ##31)
+# CHECK: r17 = memh(r21=##31)
 0xb1 0xc0 0x55 0x9b
 # CHECK: r17 = memh(r21++#10)
 0x11 0xe0 0x55 0x9d
 # CHECK: r17 = memh(r21++m1)
 0x11 0xe0 0x55 0x9f
-# CHECK: r17 = memh(r21 ++ m1:brev)
+# CHECK: r17 = memh(r21++m1:brev)
 
 # Load halfword conditionally
 0xf1 0xff 0x55 0x30
@@ -169,37 +169,37 @@
 # CHECK: p3 = r5
 # CHECK-NEXT: if (!p3.new) r17 = memh(r21++#10)
 0xf1 0xdb 0x55 0x41
-# CHECK: if (p3) r17 = memh(r21 + #62)
+# CHECK: if (p3) r17 = memh(r21+#62)
 0xf1 0xdb 0x55 0x45
-# CHECK: if (!p3) r17 = memh(r21 + #62)
+# CHECK: if (!p3) r17 = memh(r21+#62)
 0x03 0x40 0x45 0x85 0xf1 0xdb 0x55 0x43
 # CHECK: p3 = r5
-# CHECK-NEXT: if (p3.new) r17 = memh(r21 + #62)
+# CHECK-NEXT: if (p3.new) r17 = memh(r21+#62)
 0x03 0x40 0x45 0x85 0xf1 0xdb 0x55 0x47
 # CHECK: p3 = r5
-# CHECK-NEXT: if (!p3.new) r17 = memh(r21 + #62)
+# CHECK-NEXT: if (!p3.new) r17 = memh(r21+#62)
 
 # Load unsigned byte
 0x91 0xff 0x35 0x3a
-# CHECK: r17 = memub(r21 + r31<<#3)
+# CHECK: r17 = memub(r21+r31<<#3)
 0xb1 0xc2 0x20 0x49
-# CHECK: r17 = memub(#21)
+# CHECK: r17 = memub(gp+#21)
 0x00 0x40 0x00 0x00 0xb1 0xc2 0x20 0x49
 # CHECK: r17 = memub(##21)
 0xf1 0xc3 0x35 0x91
-# CHECK: r17 = memub(r21 + #31)
+# CHECK: r17 = memub(r21+#31)
 0xb1 0xe0 0x35 0x99
-# CHECK: r17 = memub(r21 ++ #5:circ(m1))
+# CHECK: r17 = memub(r21++#5:circ(m1))
 0x11 0xe2 0x35 0x99
-# CHECK: r17 = memub(r21 ++ I:circ(m1))
+# CHECK: r17 = memub(r21++I:circ(m1))
 0x00 0x40 0x00 0x00 0x71 0xd7 0x35 0x9b
-# CHECK: r17 = memub(r21 = ##31)
+# CHECK: r17 = memub(r21=##31)
 0xb1 0xc0 0x35 0x9b
 # CHECK: r17 = memub(r21++#5)
 0x11 0xe0 0x35 0x9d
 # CHECK: r17 = memub(r21++m1)
 0x11 0xe0 0x35 0x9f
-# CHECK: r17 = memub(r21 ++ m1:brev)
+# CHECK: r17 = memub(r21++m1:brev)
 
 # Load unsigned byte conditionally
 0xf1 0xff 0x35 0x30
@@ -213,15 +213,15 @@
 # CHECK: p3 = r5
 # CHECK-NEXT: if (!p3.new) r17 = memub(r21+r31<<#3)
 0xf1 0xdb 0x35 0x41
-# CHECK: if (p3) r17 = memub(r21 + #31)
+# CHECK: if (p3) r17 = memub(r21+#31)
 0x03 0x40 0x45 0x85 0xf1 0xdb 0x35 0x43
 # CHECK: p3 = r5
-# CHECK-NEXT: if (p3.new) r17 = memub(r21 + #31)
+# CHECK-NEXT: if (p3.new) r17 = memub(r21+#31)
 0xf1 0xdb 0x35 0x45
-# CHECK: if (!p3) r17 = memub(r21 + #31)
+# CHECK: if (!p3) r17 = memub(r21+#31)
 0x03 0x40 0x45 0x85 0xf1 0xdb 0x35 0x47
 # CHECK: p3 = r5
-# CHECK-NEXT: if (!p3.new) r17 = memub(r21 + #31)
+# CHECK-NEXT: if (!p3.new) r17 = memub(r21+#31)
 0xb1 0xe6 0x35 0x9b
 # CHECK: if (p3) r17 = memub(r21++#5)
 0xb1 0xee 0x35 0x9b
@@ -235,25 +235,25 @@
 
 # Load unsigned halfword
 0x91 0xff 0x75 0x3a
-# CHECK: r17 = memuh(r21 + r31<<#3)
+# CHECK: r17 = memuh(r21+r31<<#3)
 0xb1 0xc2 0x60 0x49
-# CHECK: r17 = memuh(#42)
+# CHECK: r17 = memuh(gp+#42)
 0x00 0x40 0x00 0x00 0x51 0xc5 0x60 0x49
 # CHECK: r17 = memuh(##42)
 0xb1 0xc2 0x75 0x91
-# CHECK: r17 = memuh(r21 + #42)
+# CHECK: r17 = memuh(r21+#42)
 0xb1 0xe0 0x75 0x99
-# CHECK: r17 = memuh(r21 ++ #10:circ(m1))
+# CHECK: r17 = memuh(r21++#10:circ(m1))
 0x11 0xe2 0x75 0x99
-# CHECK: r17 = memuh(r21 ++ I:circ(m1))
+# CHECK: r17 = memuh(r21++I:circ(m1))
 0x00 0x40 0x00 0x00 0x71 0xd7 0x75 0x9b
-# CHECK: r17 = memuh(r21 = ##31)
+# CHECK: r17 = memuh(r21=##31)
 0xb1 0xc0 0x75 0x9b
 # CHECK: r17 = memuh(r21++#10)
 0x11 0xe0 0x75 0x9d
 # CHECK: r17 = memuh(r21++m1)
 0x11 0xe0 0x75 0x9f
-# CHECK: r17 = memuh(r21 ++ m1:brev)
+# CHECK: r17 = memuh(r21++m1:brev)
 
 # Load unsigned halfword conditionally
 0xf1 0xff 0x75 0x30
@@ -267,15 +267,15 @@
 # CHECK: p3 = r5
 # CHECK-NEXT: if (!p3.new) r17 = memuh(r21+r31<<#3)
 0xb1 0xda 0x75 0x41
-# CHECK: if (p3) r17 = memuh(r21 + #42)
+# CHECK: if (p3) r17 = memuh(r21+#42)
 0xb1 0xda 0x75 0x45
-# CHECK: if (!p3) r17 = memuh(r21 + #42)
+# CHECK: if (!p3) r17 = memuh(r21+#42)
 0x03 0x40 0x45 0x85 0xb1 0xda 0x75 0x43
 # CHECK: p3 = r5
-# CHECK-NEXT: if (p3.new) r17 = memuh(r21 + #42)
+# CHECK-NEXT: if (p3.new) r17 = memuh(r21+#42)
 0x03 0x40 0x45 0x85 0xb1 0xda 0x75 0x47
 # CHECK: p3 = r5
-# CHECK-NEXT: if (!p3.new) r17 = memuh(r21 + #42)
+# CHECK-NEXT: if (!p3.new) r17 = memuh(r21+#42)
 0xb1 0xe6 0x75 0x9b
 # CHECK: if (p3) r17 = memuh(r21++#10)
 0xb1 0xee 0x75 0x9b
@@ -289,25 +289,25 @@
 
 # Load word
 0x91 0xff 0x95 0x3a
-# CHECK: r17 = memw(r21 + r31<<#3)
+# CHECK: r17 = memw(r21+r31<<#3)
 0xb1 0xc2 0x80 0x49
-# CHECK: r17 = memw(#84)
+# CHECK: r17 = memw(gp+#84)
 0x01 0x40 0x00 0x00 0x91 0xc2 0x80 0x49
 # CHECK: r17 = memw(##84)
 0xb1 0xc2 0x95 0x91
-# CHECK: r17 = memw(r21 + #84)
+# CHECK: r17 = memw(r21+#84)
 0xb1 0xe0 0x95 0x99
-# CHECK: r17 = memw(r21 ++ #20:circ(m1))
+# CHECK: r17 = memw(r21++#20:circ(m1))
 0x11 0xe2 0x95 0x99
-# CHECK: r17 = memw(r21 ++ I:circ(m1))
+# CHECK: r17 = memw(r21++I:circ(m1))
 0x00 0x40 0x00 0x00 0x71 0xd7 0x95 0x9b
-# CHECK: r17 = memw(r21 = ##31)
+# CHECK: r17 = memw(r21=##31)
 0xb1 0xc0 0x95 0x9b
 # CHECK: r17 = memw(r21++#20)
 0x11 0xe0 0x95 0x9d
 # CHECK: r17 = memw(r21++m1)
 0x11 0xe0 0x95 0x9f
-# CHECK: r17 = memw(r21 ++ m1:brev)
+# CHECK: r17 = memw(r21++m1:brev)
 
 # Load word conditionally
 0xf1 0xff 0x95 0x30
@@ -321,15 +321,15 @@
 # CHECK: p3 = r5
 # CHECK-NEXT: if (!p3.new) r17 = memw(r21+r31<<#3)
 0xb1 0xda 0x95 0x41
-# CHECK: if (p3) r17 = memw(r21 + #84)
+# CHECK: if (p3) r17 = memw(r21+#84)
 0xb1 0xda 0x95 0x45
-# CHECK: if (!p3) r17 = memw(r21 + #84)
+# CHECK: if (!p3) r17 = memw(r21+#84)
 0x03 0x40 0x45 0x85 0xb1 0xda 0x95 0x43
 # CHECK: p3 = r5
-# CHECK-NEXT: if (p3.new) r17 = memw(r21 + #84)
+# CHECK-NEXT: if (p3.new) r17 = memw(r21+#84)
 0x03 0x40 0x45 0x85 0xb1 0xda 0x95 0x47
 # CHECK: p3 = r5
-# CHECK-NEXT: if (!p3.new) r17 = memw(r21 + #84)
+# CHECK-NEXT: if (!p3.new) r17 = memw(r21+#84)
 0xb1 0xe6 0x95 0x9b
 # CHECK: if (p3) r17 = memw(r21++#20)
 0xb1 0xee 0x95 0x9b
@@ -367,59 +367,59 @@
 
 # Load and unpack bytes to halfwords
 0xf1 0xc3 0x35 0x90
-# CHECK: r17 = membh(r21 + #62)
+# CHECK: r17 = membh(r21+#62)
 0xf1 0xc3 0x75 0x90
-# CHECK: r17 = memubh(r21 + #62)
+# CHECK: r17 = memubh(r21+#62)
 0xf0 0xc3 0xb5 0x90
-# CHECK: r17:16 = memubh(r21 + #124)
+# CHECK: r17:16 = memubh(r21+#124)
 0xf0 0xc3 0xf5 0x90
-# CHECK: r17:16 = membh(r21 + #124)
+# CHECK: r17:16 = membh(r21+#124)
 0xb1 0xe0 0x35 0x98
-# CHECK: r17 = membh(r21 ++ #10:circ(m1))
+# CHECK: r17 = membh(r21++#10:circ(m1))
 0x11 0xe2 0x35 0x98
-# CHECK: r17 = membh(r21 ++ I:circ(m1))
+# CHECK: r17 = membh(r21++I:circ(m1))
 0xb1 0xe0 0x75 0x98
-# CHECK: r17 = memubh(r21 ++ #10:circ(m1))
+# CHECK: r17 = memubh(r21++#10:circ(m1))
 0x11 0xe2 0x75 0x98
-# CHECK: r17 = memubh(r21 ++ I:circ(m1))
+# CHECK: r17 = memubh(r21++I:circ(m1))
 0xb0 0xe0 0xf5 0x98
-# CHECK: r17:16 = membh(r21 ++ #20:circ(m1))
+# CHECK: r17:16 = membh(r21++#20:circ(m1))
 0x10 0xe2 0xf5 0x98
-# CHECK: r17:16 = membh(r21 ++ I:circ(m1))
+# CHECK: r17:16 = membh(r21++I:circ(m1))
 0xb0 0xe0 0xb5 0x98
-# CHECK: r17:16 = memubh(r21 ++ #20:circ(m1))
+# CHECK: r17:16 = memubh(r21++#20:circ(m1))
 0x10 0xe2 0xb5 0x98
-# CHECK: r17:16 = memubh(r21 ++ I:circ(m1))
+# CHECK: r17:16 = memubh(r21++I:circ(m1))
 0x00 0x40 0x00 0x00 0x71 0xd7 0x35 0x9a
-# CHECK: r17 = membh(r21 = ##31)
+# CHECK: r17 = membh(r21=##31)
 0xb1 0xc0 0x35 0x9a
 # CHECK: r17 = membh(r21++#10)
 0x00 0x40 0x00 0x00 0x71 0xd7 0x75 0x9a
-# CHECK: r17 = memubh(r21 = ##31)
+# CHECK: r17 = memubh(r21=##31)
 0xb1 0xc0 0x75 0x9a
 # CHECK: r17 = memubh(r21++#10)
 0x00 0x40 0x00 0x00 0x70 0xd7 0xb5 0x9a
-# CHECK: r17:16 = memubh(r21 = ##31)
+# CHECK: r17:16 = memubh(r21=##31)
 0xb0 0xc0 0xb5 0x9a
 # CHECK: r17:16 = memubh(r21++#20)
 0x00 0x40 0x00 0x00 0x70 0xd7 0xf5 0x9a
-# CHECK: r17:16 = membh(r21 = ##31)
+# CHECK: r17:16 = membh(r21=##31)
 0xb0 0xc0 0xf5 0x9a
 # CHECK: r17:16 = membh(r21++#20)
 0x00 0x40 0x00 0x00 0xf1 0xf7 0x35 0x9c
-# CHECK: r17 = membh(r21<<#3 + ##31)
+# CHECK: r17 = membh(r21<<#3+##31)
 0x11 0xe0 0x35 0x9c
 # CHECK: r17 = membh(r21++m1)
 0x00 0x40 0x00 0x00 0xf1 0xf7 0x75 0x9c
-# CHECK: r17 = memubh(r21<<#3 + ##31)
+# CHECK: r17 = memubh(r21<<#3+##31)
 0x11 0xe0 0x75 0x9c
 # CHECK: r17 = memubh(r21++m1)
 0x00 0x40 0x00 0x00 0xf0 0xf7 0xf5 0x9c
-# CHECK: r17:16 = membh(r21<<#3 + ##31)
+# CHECK: r17:16 = membh(r21<<#3+##31)
 0x10 0xe0 0xf5 0x9c
 # CHECK: r17:16 = membh(r21++m1)
 0x00 0x40 0x00 0x00 0xf0 0xf7 0xb5 0x9c
-# CHECK: r17:16 = memubh(r21<<#3 + ##31)
+# CHECK: r17:16 = memubh(r21<<#3+##31)
 0x11 0xe0 0x35 0x9c
 # CHECK: r17 = membh(r21++m1)
 0x11 0xe0 0x75 0x9c
@@ -429,10 +429,10 @@
 0x10 0xe0 0xb5 0x9c
 # CHECK: r17:16 = memubh(r21++m1)
 0x11 0xe0 0x35 0x9e
-# CHECK: r17 = membh(r21 ++ m1:brev)
+# CHECK: r17 = membh(r21++m1:brev)
 0x11 0xe0 0x75 0x9e
-# CHECK: r17 = memubh(r21 ++ m1:brev)
+# CHECK: r17 = memubh(r21++m1:brev)
 0x10 0xe0 0xb5 0x9e
-# CHECK: r17:16 = memubh(r21 ++ m1:brev)
+# CHECK: r17:16 = memubh(r21++m1:brev)
 0x10 0xe0 0xf5 0x9e
-# CHECK: r17:16 = membh(r21 ++ m1:brev)
+# CHECK: r17:16 = membh(r21++m1:brev)
diff --git a/test/MC/Disassembler/Hexagon/nv_j.txt b/test/MC/Disassembler/Hexagon/nv_j.txt
index 2135b5a039f636e17739a700018b9985c3567f91..f3b7140f8a7564e6ec5e93e2836acf2ce5439015 100644
--- a/test/MC/Disassembler/Hexagon/nv_j.txt
+++ b/test/MC/Disassembler/Hexagon/nv_j.txt
@@ -4,133 +4,133 @@
 # Jump to address conditioned on new register value
 0x11 0x40 0x71 0x70 0x92 0xd5 0x02 0x20
 # CHECK: r17 = r17
-# CHECK-NEXT: if (cmp.eq(r17.new, r21)) jump:nt
+# CHECK-NEXT: if (cmp.eq(r17.new,r21)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xf5 0x02 0x20
 # CHECK: r17 = r17
-# CHECK-NEXT: if (cmp.eq(r17.new, r21)) jump:t
+# CHECK-NEXT: if (cmp.eq(r17.new,r21)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xd5 0x42 0x20
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.eq(r17.new, r21)) jump:nt
+# CHECK-NEXT: if (!cmp.eq(r17.new,r21)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xf5 0x42 0x20
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.eq(r17.new, r21)) jump:t
+# CHECK-NEXT: if (!cmp.eq(r17.new,r21)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xd5 0x82 0x20
 # CHECK: r17 = r17
-# CHECK-NEXT: if (cmp.gt(r17.new, r21)) jump:nt
+# CHECK-NEXT: if (cmp.gt(r17.new,r21)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xf5 0x82 0x20
 # CHECK: r17 = r17
-# CHECK-NEXT: if (cmp.gt(r17.new, r21)) jump:t
+# CHECK-NEXT: if (cmp.gt(r17.new,r21)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xd5 0xc2 0x20
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.gt(r17.new, r21)) jump:nt
+# CHECK-NEXT: if (!cmp.gt(r17.new,r21)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xf5 0xc2 0x20
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.gt(r17.new, r21)) jump:t
+# CHECK-NEXT: if (!cmp.gt(r17.new,r21)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xd5 0x02 0x21
 # CHECK: r17 = r17
-# CHECK-NEXT: if (cmp.gtu(r17.new, r21)) jump:nt
+# CHECK-NEXT: if (cmp.gtu(r17.new,r21)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xf5 0x02 0x21
 # CHECK: r17 = r17
-# CHECK-NEXT: if (cmp.gtu(r17.new, r21)) jump:t
+# CHECK-NEXT: if (cmp.gtu(r17.new,r21)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xd5 0x42 0x21
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.gtu(r17.new, r21)) jump:nt
+# CHECK-NEXT: if (!cmp.gtu(r17.new,r21)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xf5 0x42 0x21
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.gtu(r17.new, r21)) jump:t
+# CHECK-NEXT: if (!cmp.gtu(r17.new,r21)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xd5 0x82 0x21
 # CHECK: r17 = r17
-# CHECK-NEXT: if (cmp.gt(r21, r17.new)) jump:nt
+# CHECK-NEXT: if (cmp.gt(r21,r17.new)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xf5 0x82 0x21
 # CHECK: r17 = r17
-# CHECK-NEXT: if (cmp.gt(r21, r17.new)) jump:t
+# CHECK-NEXT: if (cmp.gt(r21,r17.new)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xd5 0xc2 0x21
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.gt(r21, r17.new)) jump:nt
+# CHECK-NEXT: if (!cmp.gt(r21,r17.new)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xf5 0xc2 0x21
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.gt(r21, r17.new)) jump:t
+# CHECK-NEXT: if (!cmp.gt(r21,r17.new)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xd5 0x02 0x22
 # CHECK: r17 = r17
-# CHECK-NEXT: if (cmp.gtu(r21, r17.new)) jump:nt
+# CHECK-NEXT: if (cmp.gtu(r21,r17.new)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xf5 0x02 0x22
 # CHECK: r17 = r17
-# CHECK-NEXT: if (cmp.gtu(r21, r17.new)) jump:t
+# CHECK-NEXT: if (cmp.gtu(r21,r17.new)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xd5 0x42 0x22
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.gtu(r21, r17.new)) jump:nt
+# CHECK-NEXT: if (!cmp.gtu(r21,r17.new)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xf5 0x42 0x22
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.gtu(r21, r17.new)) jump:t
+# CHECK-NEXT: if (!cmp.gtu(r21,r17.new)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xd5 0x02 0x24
 # CHECK: r17 = r17
-# CHECK-NEXT: if (cmp.eq(r17.new, #21)) jump:nt
+# CHECK-NEXT: if (cmp.eq(r17.new,#21)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xf5 0x02 0x24
 # CHECK: r17 = r17
-# CHECK-NETX: if (cmp.eq(r17.new, #21)) jump:t
+# CHECK-NEXT: if (cmp.eq(r17.new,#21)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xd5 0x42 0x24
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.eq(r17.new, #21)) jump:nt
+# CHECK-NEXT: if (!cmp.eq(r17.new,#21)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xf5 0x42 0x24
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.eq(r17.new, #21)) jump:t
+# CHECK-NEXT: if (!cmp.eq(r17.new,#21)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xd5 0x82 0x24
 # CHECK: r17 = r17
-# CHECK-NEXT: if (cmp.gt(r17.new, #21)) jump:nt
+# CHECK-NEXT: if (cmp.gt(r17.new,#21)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xf5 0x82 0x24
 # CHECK: r17 = r17
-# CHECK-NEXT: if (cmp.gt(r17.new, #21)) jump:t
+# CHECK-NEXT: if (cmp.gt(r17.new,#21)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xd5 0xc2 0x24
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.gt(r17.new, #21)) jump:nt
+# CHECK-NEXT: if (!cmp.gt(r17.new,#21)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xf5 0xc2 0x24
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.gt(r17.new, #21)) jump:t
+# CHECK-NEXT: if (!cmp.gt(r17.new,#21)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xd5 0x02 0x25
 # CHECK: r17 = r17
-# CHECK-NEXT: if (cmp.gtu(r17.new, #21)) jump:nt
+# CHECK-NEXT: if (cmp.gtu(r17.new,#21)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xf5 0x02 0x25
 # CHECK: r17 = r17
-# CHECK-NEXT: if (cmp.gtu(r17.new, #21)) jump:t
+# CHECK-NEXT: if (cmp.gtu(r17.new,#21)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xd5 0x42 0x25
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.gtu(r17.new, #21)) jump:nt
+# CHECK-NEXT: if (!cmp.gtu(r17.new,#21)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xf5 0x42 0x25
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.gtu(r17.new, #21)) jump:t
+# CHECK-NEXT: if (!cmp.gtu(r17.new,#21)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xc0 0x82 0x25
 # CHECK: r17 = r17
-# CHECK-NEXT: if (tstbit(r17.new, #0)) jump:nt
+# CHECK-NEXT: if (tstbit(r17.new,#0)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xe0 0x82 0x25
 # CHECK: r17 = r17
-# CHECK-NEXT: if (tstbit(r17.new, #0)) jump:t
+# CHECK-NEXT: if (tstbit(r17.new,#0)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xc0 0xc2 0x25
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!tstbit(r17.new, #0)) jump:nt
+# CHECK-NEXT: if (!tstbit(r17.new,#0)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xe0 0xc2 0x25
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!tstbit(r17.new, #0)) jump:t
+# CHECK-NEXT: if (!tstbit(r17.new,#0)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xc0 0x02 0x26
 # CHECK: r17 = r17
-# CHECK-NEXT: if (cmp.eq(r17.new, #-1)) jump:nt
+# CHECK-NEXT: if (cmp.eq(r17.new,#-1)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xe0 0x02 0x26
 # CHECK: r17 = r17
-# CHECK-NEXT: if (cmp.eq(r17.new, #-1)) jump:t
+# CHECK-NEXT: if (cmp.eq(r17.new,#-1)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xc0 0x42 0x26
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.eq(r17.new, #-1)) jump:nt
+# CHECK-NEXT: if (!cmp.eq(r17.new,#-1)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xe0 0x42 0x26
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.eq(r17.new, #-1)) jump:t
+# CHECK-NEXT: if (!cmp.eq(r17.new,#-1)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xc0 0x82 0x26
 # CHECK: r17 = r17
-# CHECK-NEXT: if (cmp.gt(r17.new, #-1)) jump:nt
+# CHECK-NEXT: if (cmp.gt(r17.new,#-1)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xe0 0x82 0x26
 # CHECK: r17 = r17
-# CHECK-NEXT: if (cmp.gt(r17.new, #-1)) jump:t
+# CHECK-NEXT: if (cmp.gt(r17.new,#-1)) jump:t
 0x11 0x40 0x71 0x70 0x92 0xc0 0xc2 0x26
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.gt(r17.new, #-1)) jump:nt
+# CHECK-NEXT: if (!cmp.gt(r17.new,#-1)) jump:nt
 0x11 0x40 0x71 0x70 0x92 0xe0 0xc2 0x26
 # CHECK: r17 = r17
-# CHECK-NEXT: if (!cmp.gt(r17.new, #-1)) jump:t
+# CHECK-NEXT: if (!cmp.gt(r17.new,#-1)) jump:t
diff --git a/test/MC/Disassembler/Hexagon/nv_st.txt b/test/MC/Disassembler/Hexagon/nv_st.txt
index 3a767f33b36a1a06a369197c746b750c77daf55e..7b76cb56dd3ae5665cd6dfed0b53c2ed206fb7c3 100644
--- a/test/MC/Disassembler/Hexagon/nv_st.txt
+++ b/test/MC/Disassembler/Hexagon/nv_st.txt
@@ -4,19 +4,19 @@
 # Store new-value byte
 0x1f 0x40 0x7f 0x70 0x82 0xf5 0xb1 0x3b
 # CHECK: r31 = r31
-# CHECK-NEXT: memb(r17 + r21<<#3) = r31.new
+# CHECK-NEXT: memb(r17+r21<<#3) = r31.new
 0x1f 0x40 0x7f 0x70 0x11 0xc2 0xa0 0x48
 # CHECK: r31 = r31
-# CHECK-NEXT: memb(#17) = r31.new
+# CHECK-NEXT: memb(gp+#17) = r31.new
 0x1f 0x40 0x7f 0x70 0x15 0xc2 0xb1 0xa1
 # CHECK: r31 = r31
 # CHECK-NEXT: memb(r17+#21) = r31.new
 0x1f 0x40 0x7f 0x70 0x02 0xe2 0xb1 0xa9
 # CHECK: r31 = r31
-# CHECK-NEXT: memb(r17 ++ I:circ(m1)) = r31.new
+# CHECK-NEXT: memb(r17++I:circ(m1)) = r31.new
 0x1f 0x40 0x7f 0x70 0x28 0xe2 0xb1 0xa9
 # CHECK: r31 = r31
-# CHECK-NEXT: memb(r17 ++ #5:circ(m1)) = r31.new
+# CHECK-NEXT: memb(r17++#5:circ(m1)) = r31.new
 0x1f 0x40 0x7f 0x70 0x28 0xc2 0xb1 0xab
 # CHECK: r31 = r31
 # CHECK-NEXT: memb(r17++#5) = r31.new
@@ -25,7 +25,7 @@
 # CHECK-NEXT: memb(r17++m1) = r31.new
 0x1f 0x40 0x7f 0x70 0x00 0xe2 0xb1 0xaf
 # CHECK: r31 = r31
-# CHECK-NEXT: memb(r17 ++ m1:brev) = r31.new
+# CHECK-NEXT: memb(r17++m1:brev) = r31.new
 
 # Store new-value byte conditionally
 0x1f 0x40 0x7f 0x70 0xe2 0xf5 0xb1 0x34
@@ -74,19 +74,19 @@
 # Store new-value halfword
 0x1f 0x40 0x7f 0x70 0x8a 0xf5 0xb1 0x3b
 # CHECK: r31 = r31
-# CHECK-NEXT: memh(r17 + r21<<#3) = r31.new
+# CHECK-NEXT: memh(r17+r21<<#3) = r31.new
 0x1f 0x40 0x7f 0x70 0x15 0xca 0xa0 0x48
 # CHECK: r31 = r31
-# CHECK-NEXT: memh(#42) = r31.new
+# CHECK-NEXT: memh(gp+#42) = r31.new
 0x1f 0x40 0x7f 0x70 0x15 0xca 0xb1 0xa1
 # CHECK: r31 = r31
 # CHECK-NEXT: memh(r17+#42) = r31.new
 0x1f 0x40 0x7f 0x70 0x02 0xea 0xb1 0xa9
 # CHECK: r31 = r31
-# CHECK-NEXT: memh(r17 ++ I:circ(m1)) = r31.new
+# CHECK-NEXT: memh(r17++I:circ(m1)) = r31.new
 0x1f 0x40 0x7f 0x70 0x28 0xea 0xb1 0xa9
 # CHECK: r31 = r31
-# CHECK-NEXT: memh(r17 ++ #10:circ(m1)) = r31.new
+# CHECK-NEXT: memh(r17++#10:circ(m1)) = r31.new
 0x1f 0x40 0x7f 0x70 0x28 0xca 0xb1 0xab
 # CHECK: r31 = r31
 # CHECK-NEXT: memh(r17++#10) = r31.new
@@ -95,7 +95,7 @@
 # CHECK-NEXT: memh(r17++m1) = r31.new
 0x1f 0x40 0x7f 0x70 0x00 0xea 0xb1 0xaf
 # CHECK: r31 = r31
-# CHECK-NEXT: memh(r17 ++ m1:brev) = r31.new
+# CHECK-NEXT: memh(r17++m1:brev) = r31.new
 
 # Store new-value halfword conditionally
 0x1f 0x40 0x7f 0x70 0xea 0xf5 0xb1 0x34
@@ -144,19 +144,19 @@
 # Store new-value word
 0x1f 0x40 0x7f 0x70 0x92 0xf5 0xb1 0x3b
 # CHECK: r31 = r31
-# CHECK-NEXT: memw(r17 + r21<<#3) = r31.new
+# CHECK-NEXT: memw(r17+r21<<#3) = r31.new
 0x1f 0x40 0x7f 0x70 0x15 0xd2 0xa0 0x48
 # CHECK: r31 = r31
-# CHECK-NEXT: memw(#84) = r31.new
+# CHECK-NEXT: memw(gp+#84) = r31.new
 0x1f 0x40 0x7f 0x70 0x15 0xd2 0xb1 0xa1
 # CHECK: r31 = r31
 # CHECK-NEXT: memw(r17+#84) = r31.new
 0x1f 0x40 0x7f 0x70 0x02 0xf2 0xb1 0xa9
 # CHECK: r31 = r31
-# CHECK-NEXT: memw(r17 ++ I:circ(m1)) = r31.new
+# CHECK-NEXT: memw(r17++I:circ(m1)) = r31.new
 0x1f 0x40 0x7f 0x70 0x28 0xf2 0xb1 0xa9
 # CHECK: r31 = r31
-# CHECK-NEXT: memw(r17 ++ #20:circ(m1)) = r31.new
+# CHECK-NEXT: memw(r17++#20:circ(m1)) = r31.new
 0x1f 0x40 0x7f 0x70 0x28 0xd2 0xb1 0xab
 # CHECK: r31 = r31
 # CHECK-NEXT: memw(r17++#20) = r31.new
@@ -165,7 +165,7 @@
 # CHECK-NEXT: memw(r17++m1) = r31.new
 0x1f 0x40 0x7f 0x70 0x00 0xf2 0xb1 0xaf
 # CHECK: r31 = r31
-# CHECK-NEXT: memw(r17 ++ m1:brev) = r31.new
+# CHECK-NEXT: memw(r17++m1:brev) = r31.new
 
 # Store new-value word conditionally
 0x1f 0x40 0x7f 0x70 0xf2 0xf5 0xb1 0x34
diff --git a/test/MC/Disassembler/Hexagon/st.txt b/test/MC/Disassembler/Hexagon/st.txt
index 6d9074a05ef77c1c7a5a7b681629bd68f0e4afea..0f936c267f56f9201f0f759633ce7ecc5d044055 100644
--- a/test/MC/Disassembler/Hexagon/st.txt
+++ b/test/MC/Disassembler/Hexagon/st.txt
@@ -3,25 +3,25 @@
 
 # Store doubleword
 0x9e 0xf5 0xd1 0x3b
-# CHECK: memd(r17 + r21<<#3) = r31:30
+# CHECK: memd(r17+r21<<#3) = r31:30
 0x28 0xd4 0xc0 0x48
-# CHECK: memd(#320) = r21:20
+# CHECK: memd(gp+#320) = r21:20
 0x02 0x40 0x00 0x00 0x28 0xd4 0xc0 0x48
 # CHECK: memd(##168) = r21:20
 0x15 0xd4 0xd1 0xa1
 # CHECK: memd(r17+#168) = r21:20
 0x02 0xf4 0xd1 0xa9
-# CHECK: memd(r17 ++ I:circ(m1)) = r21:20
+# CHECK: memd(r17++I:circ(m1)) = r21:20
 0x28 0xf4 0xd1 0xa9
-# CHECK: memd(r17 ++ #40:circ(m1)) = r21:20
+# CHECK: memd(r17++#40:circ(m1)) = r21:20
 0x28 0xd4 0xd1 0xab
 # CHECK: memd(r17++#40) = r21:20
 0x00 0x40 0x00 0x00 0xd5 0xfe 0xd1 0xad
-# CHECK: memd(r17<<#3 + ##21) = r31:30
+# CHECK: memd(r17<<#3+##21) = r31:30
 0x00 0xf4 0xd1 0xad
 # CHECK: memd(r17++m1) = r21:20
 0x00 0xf4 0xd1 0xaf
-# CHECK: memd(r17 ++ m1:brev) = r21:20
+# CHECK: memd(r17++m1:brev) = r21:20
 
 # Store doubleword conditionally
 0xfe 0xf5 0xd1 0x34
@@ -67,27 +67,27 @@
 
 # Store byte
 0x9f 0xf5 0x11 0x3b
-# CHECK: memb(r17 + r21<<#3) = r31
+# CHECK: memb(r17+r21<<#3) = r31
 0x9f 0xca 0x11 0x3c
-# CHECK: memb(r17+#21)=#31
+# CHECK: memb(r17+#21) = #31
 0x15 0xd5 0x00 0x48
-# CHECK: memb(#21) = r21
+# CHECK: memb(gp+#21) = r21
 0x00 0x40 0x00 0x00 0x15 0xd5 0x00 0x48
 # CHECK: memb(##21) = r21
 0x15 0xd5 0x11 0xa1
 # CHECK: memb(r17+#21) = r21
 0x02 0xf5 0x11 0xa9
-# CHECK: memb(r17 ++ I:circ(m1)) = r21
+# CHECK: memb(r17++I:circ(m1)) = r21
 0x28 0xf5 0x11 0xa9
-# CHECK: memb(r17 ++ #5:circ(m1)) = r21
+# CHECK: memb(r17++#5:circ(m1)) = r21
 0x28 0xd5 0x11 0xab
 # CHECK: memb(r17++#5) = r21
 0x00 0x40 0x00 0x00 0xd5 0xff 0x11 0xad
-# CHECK: memb(r17<<#3 + ##21) = r31
+# CHECK: memb(r17<<#3+##21) = r31
 0x00 0xf5 0x11 0xad
 # CHECK: memb(r17++m1) = r21
 0x00 0xf5 0x11 0xaf
-# CHECK: memb(r17 ++ m1:brev) = r21
+# CHECK: memb(r17++m1:brev) = r21
 
 # Store byte conditionally
 0xff 0xf5 0x11 0x34
@@ -101,15 +101,15 @@
 # CHECK: p3 = r5
 # CHECK-NEXT: if (!p3.new) memb(r17+r21<<#3) = r31
 0xff 0xca 0x11 0x38
-# CHECK: if (p3) memb(r17+#21)=#31
+# CHECK: if (p3) memb(r17+#21) = #31
 0xff 0xca 0x91 0x38
-# CHECK: if (!p3) memb(r17+#21)=#31
+# CHECK: if (!p3) memb(r17+#21) = #31
 0x03 0x40 0x45 0x85 0xff 0xca 0x11 0x39
 # CHECK: p3 = r5
-# CHECK-NEXT: if (p3.new) memb(r17+#21)=#31
+# CHECK-NEXT: if (p3.new) memb(r17+#21) = #31
 0x03 0x40 0x45 0x85 0xff 0xca 0x91 0x39
 # CHECK: p3 = r5
-# CHECK-NEXT: if (!p3.new) memb(r17+#21)=#31
+# CHECK-NEXT: if (!p3.new) memb(r17+#21) = #31
 0xab 0xdf 0x11 0x40
 # CHECK: if (p3) memb(r17+#21) = r31
 0xab 0xdf 0x11 0x44
@@ -143,47 +143,47 @@
 
 # Store halfword
 0x9f 0xf5 0x51 0x3b
-# CHECK: memh(r17 + r21<<#3) = r31
+# CHECK: memh(r17+r21<<#3) = r31
 0x9f 0xf5 0x71 0x3b
-# CHECK: memh(r17 + r21<<#3) = r31.h
+# CHECK: memh(r17+r21<<#3) = r31.h
 0x95 0xcf 0x31 0x3c
-# CHECK: memh(r17+#62)=#21
+# CHECK: memh(r17+#62) = #21
 0x00 0x40 0x00 0x00 0x2a 0xd5 0x40 0x48
 # CHECK: memh(##42) = r21
 0x00 0x40 0x00 0x00 0x2a 0xd5 0x60 0x48
 # CHECK: memh(##42) = r21.h
 0x2a 0xd5 0x40 0x48
-# CHECK: memh(#84) = r21
+# CHECK: memh(gp+#84) = r21
 0x2a 0xd5 0x60 0x48
-# CHECK: memh(#84) = r21.h
+# CHECK: memh(gp+#84) = r21.h
 0x15 0xdf 0x51 0xa1
 # CHECK: memh(r17+#42) = r31
 0x15 0xdf 0x71 0xa1
 # CHECK: memh(r17+#42) = r31.h
 0x02 0xf5 0x51 0xa9
-# CHECK: memh(r17 ++ I:circ(m1)) = r21
+# CHECK: memh(r17++I:circ(m1)) = r21
 0x28 0xf5 0x51 0xa9
-# CHECK: memh(r17 ++ #10:circ(m1)) = r21
+# CHECK: memh(r17++#10:circ(m1)) = r21
 0x02 0xf5 0x71 0xa9
-# CHECK: memh(r17 ++ I:circ(m1)) = r21.h
+# CHECK: memh(r17++I:circ(m1)) = r21.h
 0x28 0xf5 0x71 0xa9
-# CHECK: memh(r17 ++ #10:circ(m1)) = r21.h
+# CHECK: memh(r17++#10:circ(m1)) = r21.h
 0x28 0xd5 0x51 0xab
 # CHECK: memh(r17++#10) = r21
 0x00 0x40 0x00 0x00 0xd5 0xff 0x51 0xad
-# CHECK: memh(r17<<#3 + ##21) = r31
+# CHECK: memh(r17<<#3+##21) = r31
 0x28 0xd5 0x71 0xab
 # CHECK: memh(r17++#10) = r21.h
 0x00 0x40 0x00 0x00 0xd5 0xff 0x71 0xad
-# CHECK: memh(r17<<#3 + ##21) = r31.h
+# CHECK: memh(r17<<#3+##21) = r31.h
 0x00 0xf5 0x51 0xad
 # CHECK: memh(r17++m1) = r21
 0x00 0xf5 0x71 0xad
 # CHECK: memh(r17++m1) = r21.h
 0x00 0xf5 0x51 0xaf
-# CHECK: memh(r17 ++ m1:brev) = r21
+# CHECK: memh(r17++m1:brev) = r21
 0x00 0xf5 0x71 0xaf
-# CHECK: memh(r17 ++ m1:brev) = r21.h
+# CHECK: memh(r17++m1:brev) = r21.h
 
 # Store halfword conditionally
 0xff 0xf5 0x51 0x34
@@ -207,15 +207,15 @@
 # CHECK: p3 = r5
 # CHECK-NEXT: if (!p3.new) memh(r17+r21<<#3) = r31.h
 0xf5 0xcf 0x31 0x38
-# CHECK: if (p3) memh(r17+#62)=#21
+# CHECK: if (p3) memh(r17+#62) = #21
 0xf5 0xcf 0xb1 0x38
-# CHECK: if (!p3) memh(r17+#62)=#21
+# CHECK: if (!p3) memh(r17+#62) = #21
 0x03 0x40 0x45 0x85 0xf5 0xcf 0x31 0x39
 # CHECK: p3 = r5
-# CHECK-NEXT: if (p3.new) memh(r17+#62)=#21
+# CHECK-NEXT: if (p3.new) memh(r17+#62) = #21
 0x03 0x40 0x45 0x85 0xf5 0xcf 0xb1 0x39
 # CHECK: p3 = r5
-# CHECK-NEXT: if (!p3.new) memh(r17+#62)=#21
+# CHECK-NEXT: if (!p3.new) memh(r17+#62) = #21
 0xfb 0xd5 0x51 0x40
 # CHECK: if (p3) memh(r17+#62) = r21
 0xfb 0xd5 0x71 0x40
@@ -279,29 +279,29 @@
 
 # Store word
 0x9f 0xf5 0x91 0x3b
-# CHECK: memw(r17 + r21<<#3) = r31
+# CHECK: memw(r17+r21<<#3) = r31
 0x9f 0xca 0x51 0x3c
-# CHECK: memw(r17{{ *}}+{{ *}}#84)=#31
+# CHECK: memw(r17+#84) = #31
 0x15 0xdf 0x80 0x48
-# CHECK: memw(#84) = r31
+# CHECK: memw(gp+#84) = r31
 0x01 0x40 0x00 0x00 0x14 0xd5 0x80 0x48
 # CHECK: memw(##84) = r21
 0x9f 0xca 0x51 0x3c
-# CHECK: memw(r17+#84)=#31
+# CHECK: memw(r17+#84) = #31
 0x15 0xdf 0x91 0xa1
 # CHECK: memw(r17+#84) = r31
 0x02 0xf5 0x91 0xa9
-# CHECK: memw(r17 ++ I:circ(m1)) = r21
+# CHECK: memw(r17++I:circ(m1)) = r21
 0x28 0xf5 0x91 0xa9
-# CHECK: memw(r17 ++ #20:circ(m1)) = r21
+# CHECK: memw(r17++#20:circ(m1)) = r21
 0x28 0xd5 0x91 0xab
 # CHECK: memw(r17++#20) = r21
 0x00 0x40 0x00 0x00 0xd5 0xff 0x91 0xad
-# CHECK: memw(r17<<#3 + ##21) = r31
+# CHECK: memw(r17<<#3+##21) = r31
 0x00 0xf5 0x91 0xad
 # CHECK: memw(r17++m1) = r21
 0x00 0xf5 0x91 0xaf
-# CHECK: memw(r17 ++ m1:brev) = r21
+# CHECK: memw(r17++m1:brev) = r21
 
 # Store word conditionally
 0xff 0xf5 0x91 0x34
@@ -315,15 +315,15 @@
 # CHECK: p3 = r5
 # CHECK-NEXT: if (!p3.new) memw(r17+r21<<#3) = r31
 0xff 0xca 0x51 0x38
-# CHECK: if (p3) memw(r17+#84)=#31
+# CHECK: if (p3) memw(r17+#84) = #31
 0xff 0xca 0xd1 0x38
-# CHECK: if (!p3) memw(r17+#84)=#31
+# CHECK: if (!p3) memw(r17+#84) = #31
 0x03 0x40 0x45 0x85 0xff 0xca 0x51 0x39
 # CHECK: p3 = r5
-# CHECK-NEXT: if (p3.new) memw(r17+#84)=#31
+# CHECK-NEXT: if (p3.new) memw(r17+#84) = #31
 0x03 0x40 0x45 0x85 0xff 0xca 0xd1 0x39
 # CHECK: p3 = r5
-# CHECK-NEXT: if (!p3.new) memw(r17+#84)=#31
+# CHECK-NEXT: if (!p3.new) memw(r17+#84) = #31
 0xab 0xdf 0x91 0x40
 # CHECK: if (p3) memw(r17+#84) = r31
 0xab 0xdf 0x91 0x44 
diff --git a/test/MC/Disassembler/Hexagon/system_user.txt b/test/MC/Disassembler/Hexagon/system_user.txt
index d55a94e939b54666eddd1ba1d2b5dbde20b1ab87..f4d731059e0488b6e90edb754f6e8510bdd943c2 100644
--- a/test/MC/Disassembler/Hexagon/system_user.txt
+++ b/test/MC/Disassembler/Hexagon/system_user.txt
@@ -9,9 +9,9 @@
 
 # Store conditional
 0x03 0xd5 0xb1 0xa0
-# CHECK: memw_locked(r17, p3) = r21
+# CHECK: memw_locked(r17,p3) = r21
 0x03 0xd4 0xf1 0xa0
-# CHECK: memd_locked(r17, p3) = r21:20
+# CHECK: memd_locked(r17,p3) = r21:20
 
 # Memory barrier
 0x00 0xc0 0x00 0xa8
@@ -19,7 +19,7 @@
 
 # Data cache prefetch
 0x15 0xc0 0x11 0x94
-# CHECK: dcfetch(r17 + #168)
+# CHECK: dcfetch(r17+#168)
 
 # Send value to ETM trace
 0x00 0xc0 0x51 0x62
diff --git a/test/MC/Disassembler/Hexagon/xtype_alu.txt b/test/MC/Disassembler/Hexagon/xtype_alu.txt
index 03d0f0518a3dd82eb871baca09562667528d0730..f05dafb3fce8462d414c1e6060682b1cceae89ef 100644
--- a/test/MC/Disassembler/Hexagon/xtype_alu.txt
+++ b/test/MC/Disassembler/Hexagon/xtype_alu.txt
@@ -11,137 +11,137 @@
 
 # Add and accumulate
 0xff 0xd1 0x35 0xdb
-# CHECK: r17 = add(r21, add(r31, #23))
+# CHECK: r17 = add(r21,add(r31,#23))
 0xff 0xd1 0xb5 0xdb
-# CHECK: r17 = add(r21, sub(#23, r31))
+# CHECK: r17 = add(r21,sub(#23,r31))
 0xf1 0xc2 0x15 0xe2
-# CHECK: r17 += add(r21, #23)
+# CHECK: r17 += add(r21,#23)
 0xf1 0xc2 0x95 0xe2
-# CHECK: r17 -= add(r21, #23)
+# CHECK: r17 -= add(r21,#23)
 0x31 0xdf 0x15 0xef
-# CHECK: r17 += add(r21, r31)
+# CHECK: r17 += add(r21,r31)
 0x31 0xdf 0x95 0xef
-# CHECK: r17 -= add(r21, r31)
+# CHECK: r17 -= add(r21,r31)
 
 # Add doublewords
 0xf0 0xde 0x14 0xd3
-# CHECK: r17:16 = add(r21:20, r31:30)
+# CHECK: r17:16 = add(r21:20,r31:30)
 0xb0 0xde 0x74 0xd3
-# CHECK: r17:16 = add(r21:20, r31:30):sat
+# CHECK: r17:16 = add(r21:20,r31:30):sat
 0xd0 0xde 0x74 0xd3
-# CHECK: r17:16 = add(r21:20, r31:30):raw:lo
+# CHECK: r17:16 = add(r21:20,r31:30):raw:lo
 0xf0 0xde 0x74 0xd3
-# CHECK: r17:16 = add(r21:20, r31:30):raw:hi
+# CHECK: r17:16 = add(r21:20,r31:30):raw:hi
 
 # Add halfword
 0x11 0xd5 0x1f 0xd5
-# CHECK: r17 = add(r21.l, r31.l)
+# CHECK: r17 = add(r21.l,r31.l)
 0x51 0xd5 0x1f 0xd5
-# CHECK: r17 = add(r21.l, r31.h)
+# CHECK: r17 = add(r21.l,r31.h)
 0x91 0xd5 0x1f 0xd5
-# CHECK: r17 = add(r21.l, r31.l):sat
+# CHECK: r17 = add(r21.l,r31.l):sat
 0xd1 0xd5 0x1f 0xd5
-# CHECK: r17 = add(r21.l, r31.h):sat
+# CHECK: r17 = add(r21.l,r31.h):sat
 0x11 0xd5 0x5f 0xd5
-# CHECK: r17 = add(r21.l, r31.l):<<16
+# CHECK: r17 = add(r21.l,r31.l):<<16
 0x31 0xd5 0x5f 0xd5
-# CHECK: r17 = add(r21.l, r31.h):<<16
+# CHECK: r17 = add(r21.l,r31.h):<<16
 0x51 0xd5 0x5f 0xd5
-# CHECK: r17 = add(r21.h, r31.l):<<16
+# CHECK: r17 = add(r21.h,r31.l):<<16
 0x71 0xd5 0x5f 0xd5
-# CHECK: r17 = add(r21.h, r31.h):<<16
+# CHECK: r17 = add(r21.h,r31.h):<<16
 0x91 0xd5 0x5f 0xd5
-# CHECK: r17 = add(r21.l, r31.l):sat:<<16
+# CHECK: r17 = add(r21.l,r31.l):sat:<<16
 0xb1 0xd5 0x5f 0xd5
-# CHECK: r17 = add(r21.l, r31.h):sat:<<16
+# CHECK: r17 = add(r21.l,r31.h):sat:<<16
 0xd1 0xd5 0x5f 0xd5
-# CHECK: r17 = add(r21.h, r31.l):sat:<<16
+# CHECK: r17 = add(r21.h,r31.l):sat:<<16
 0xf1 0xd5 0x5f 0xd5
-# CHECK: r17 = add(r21.h, r31.h):sat:<<16
+# CHECK: r17 = add(r21.h,r31.h):sat:<<16
 
 # Add or subtract doublewords with carry
 0x70 0xde 0xd4 0xc2
-# CHECK: r17:16 = add(r21:20, r31:30, p3):carry
+# CHECK: r17:16 = add(r21:20,r31:30,p3):carry
 0x70 0xde 0xf4 0xc2
-# CHECK: r17:16 = sub(r21:20, r31:30, p3):carry
+# CHECK: r17:16 = sub(r21:20,r31:30,p3):carry
 
 # Logical doublewords
 0x90 0xc0 0x94 0x80
 # CHECK: r17:16 = not(r21:20)
 0x10 0xde 0xf4 0xd3
-# CHECK: r17:16 = and(r21:20, r31:30)
+# CHECK: r17:16 = and(r21:20,r31:30)
 0x30 0xd4 0xfe 0xd3
-# CHECK: r17:16 = and(r21:20, ~r31:30)
+# CHECK: r17:16 = and(r21:20,~r31:30)
 0x50 0xde 0xf4 0xd3
-# CHECK: r17:16 = or(r21:20, r31:30)
+# CHECK: r17:16 = or(r21:20,r31:30)
 0x70 0xd4 0xfe 0xd3
-# CHECK: r17:16 = or(r21:20, ~r31:30)
+# CHECK: r17:16 = or(r21:20,~r31:30)
 0x90 0xde 0xf4 0xd3
-# CHECK: r17:16 = xor(r21:20, r31:30)
+# CHECK: r17:16 = xor(r21:20,r31:30)
 
 # Logical-logical doublewords
 0x10 0xde 0x94 0xca
-# CHECK: r17:16 ^= xor(r21:20, r31:30)
+# CHECK: r17:16 ^= xor(r21:20,r31:30)
 
 # Logical-logical words
 0xf1 0xc3 0x15 0xda
-# CHECK: r17 |= and(r21, #31)
+# CHECK: r17 |= and(r21,#31)
 0xf5 0xc3 0x51 0xda
-# CHECK: r17 = or(r21, and(r17, #31))
+# CHECK: r17 = or(r21,and(r17,#31))
 0xf1 0xc3 0x95 0xda
-# CHECK: r17 |= or(r21, #31)
+# CHECK: r17 |= or(r21,#31)
 0x11 0xdf 0x35 0xef
-# CHECK: r17 |= and(r21, ~r31)
+# CHECK: r17 |= and(r21,~r31)
 0x31 0xdf 0x35 0xef
-# CHECK: r17 &= and(r21, ~r31)
+# CHECK: r17 &= and(r21,~r31)
 0x51 0xdf 0x35 0xef
-# CHECK: r17 ^= and(r21, ~r31)
+# CHECK: r17 ^= and(r21,~r31)
 0x11 0xdf 0x55 0xef
-# CHECK: r17 &= and(r21, r31)
+# CHECK: r17 &= and(r21,r31)
 0x31 0xdf 0x55 0xef
-# CHECK: r17 &= or(r21, r31)
+# CHECK: r17 &= or(r21,r31)
 0x51 0xdf 0x55 0xef
-# CHECK: r17 &= xor(r21, r31)
+# CHECK: r17 &= xor(r21,r31)
 0x71 0xdf 0x55 0xef
-# CHECK: r17 |= and(r21, r31)
+# CHECK: r17 |= and(r21,r31)
 0x71 0xdf 0x95 0xef
-# CHECK: r17 ^= xor(r21, r31)
+# CHECK: r17 ^= xor(r21,r31)
 0x11 0xdf 0xd5 0xef
-# CHECK: r17 |= or(r21, r31)
+# CHECK: r17 |= or(r21,r31)
 0x31 0xdf 0xd5 0xef
-# CHECK: r17 |= xor(r21, r31)
+# CHECK: r17 |= xor(r21,r31)
 0x51 0xdf 0xd5 0xef
-# CHECK: r17 ^= and(r21, r31)
+# CHECK: r17 ^= and(r21,r31)
 0x71 0xdf 0xd5 0xef
-# CHECK: r17 ^= or(r21, r31)
+# CHECK: r17 ^= or(r21,r31)
 
 # Maximum words
 0x11 0xdf 0xd5 0xd5
-# CHECK: r17 = max(r21, r31)
+# CHECK: r17 = max(r21,r31)
 0x91 0xdf 0xd5 0xd5
-# CHECK: r17 = maxu(r21, r31)
+# CHECK: r17 = maxu(r21,r31)
 
 # Maximum doublewords
 0x90 0xde 0xd4 0xd3
-# CHECK: r17:16 = max(r21:20, r31:30)
+# CHECK: r17:16 = max(r21:20,r31:30)
 0xb0 0xde 0xd4 0xd3
-# CHECK: r17:16 = maxu(r21:20, r31:30)
+# CHECK: r17:16 = maxu(r21:20,r31:30)
 
 # Minimum words
 0x11 0xd5 0xbf 0xd5
-# CHECK: r17 = min(r21, r31)
+# CHECK: r17 = min(r21,r31)
 0x91 0xd5 0xbf 0xd5
-# CHECK: r17 = minu(r21, r31)
+# CHECK: r17 = minu(r21,r31)
 
 # Minimum doublewords
 0xd0 0xd4 0xbe 0xd3
-# CHECK: r17:16 = min(r21:20, r31:30)
+# CHECK: r17:16 = min(r21:20,r31:30)
 0xf0 0xd4 0xbe 0xd3
-# CHECK: r17:16 = minu(r21:20, r31:30)
+# CHECK: r17:16 = minu(r21:20,r31:30)
 
 # Module wrap
 0xf1 0xdf 0xf5 0xd3
-# CHECK: r17 = modwrap(r21, r31)
+# CHECK: r17 = modwrap(r21,r31)
 
 # Negate
 0xb0 0xc0 0x94 0x80
@@ -153,51 +153,51 @@
 0x31 0xc0 0xd4 0x88
 # CHECK: r17 = round(r21:20):sat
 0x11 0xdf 0xf5 0x8c
-# CHECK: r17 = cround(r21, #31)
+# CHECK: r17 = cround(r21,#31)
 0x91 0xdf 0xf5 0x8c
-# CHECK: r17 = round(r21, #31)
+# CHECK: r17 = round(r21,#31)
 0xd1 0xdf 0xf5 0x8c
-# CHECK: r17 = round(r21, #31):sat
+# CHECK: r17 = round(r21,#31):sat
 0x11 0xdf 0xd5 0xc6
-# CHECK: r17 = cround(r21, r31)
+# CHECK: r17 = cround(r21,r31)
 0x91 0xdf 0xd5 0xc6
-# CHECK: r17 = round(r21, r31)
+# CHECK: r17 = round(r21,r31)
 0xd1 0xdf 0xd5 0xc6
-# CHECK: r17 = round(r21, r31):sat
+# CHECK: r17 = round(r21,r31):sat
 
 # Subtract doublewords
 0xf0 0xd4 0x3e 0xd3
-# CHECK: r17:16 = sub(r21:20, r31:30)
+# CHECK: r17:16 = sub(r21:20,r31:30)
 
 # Subtract and accumulate words
 0x71 0xd5 0x1f 0xef
-# CHECK: r17 += sub(r21, r31)
+# CHECK: r17 += sub(r21,r31)
 
 # Subtract halfword
 0x11 0xd5 0x3f 0xd5
-# CHECK: r17 = sub(r21.l, r31.l)
+# CHECK: r17 = sub(r21.l,r31.l)
 0x51 0xd5 0x3f 0xd5
-# CHECK: r17 = sub(r21.l, r31.h)
+# CHECK: r17 = sub(r21.l,r31.h)
 0x91 0xd5 0x3f 0xd5
-# CHECK: r17 = sub(r21.l, r31.l):sat
+# CHECK: r17 = sub(r21.l,r31.l):sat
 0xd1 0xd5 0x3f 0xd5
-# CHECK: r17 = sub(r21.l, r31.h):sat
+# CHECK: r17 = sub(r21.l,r31.h):sat
 0x11 0xd5 0x7f 0xd5
-# CHECK: r17 = sub(r21.l, r31.l):<<16
+# CHECK: r17 = sub(r21.l,r31.l):<<16
 0x31 0xd5 0x7f 0xd5
-# CHECK: r17 = sub(r21.l, r31.h):<<16
+# CHECK: r17 = sub(r21.l,r31.h):<<16
 0x51 0xd5 0x7f 0xd5
-# CHECK: r17 = sub(r21.h, r31.l):<<16
+# CHECK: r17 = sub(r21.h,r31.l):<<16
 0x71 0xd5 0x7f 0xd5
-# CHECK: r17 = sub(r21.h, r31.h):<<16
+# CHECK: r17 = sub(r21.h,r31.h):<<16
 0x91 0xd5 0x7f 0xd5
-# CHECK: r17 = sub(r21.l, r31.l):sat:<<16
+# CHECK: r17 = sub(r21.l,r31.l):sat:<<16
 0xb1 0xd5 0x7f 0xd5
-# CHECK: r17 = sub(r21.l, r31.h):sat:<<16
+# CHECK: r17 = sub(r21.l,r31.h):sat:<<16
 0xd1 0xd5 0x7f 0xd5
-# CHECK: r17 = sub(r21.h, r31.l):sat:<<16
+# CHECK: r17 = sub(r21.h,r31.l):sat:<<16
 0xf1 0xd5 0x7f 0xd5
-# CHECK: r17 = sub(r21.h, r31.h):sat:<<16
+# CHECK: r17 = sub(r21.h,r31.h):sat:<<16
 
 # Sign extend word to doubleword
 0x10 0xc0 0x55 0x84
@@ -217,179 +217,179 @@
 
 # Vector absolute difference halfwords
 0x10 0xd4 0x7e 0xe8
-# CHECK: r17:16 = vabsdiffh(r21:20, r31:30)
+# CHECK: r17:16 = vabsdiffh(r21:20,r31:30)
 
 # Vector absolute difference words
 0x10 0xd4 0x3e 0xe8
-# CHECK: r17:16 = vabsdiffw(r21:20, r31:30)
+# CHECK: r17:16 = vabsdiffw(r21:20,r31:30)
 
 # Vector add halfwords
 0x50 0xde 0x14 0xd3
-# CHECK: r17:16 = vaddh(r21:20, r31:30)
+# CHECK: r17:16 = vaddh(r21:20,r31:30)
 0x70 0xde 0x14 0xd3
-# CHECK: r17:16 = vaddh(r21:20, r31:30):sat
+# CHECK: r17:16 = vaddh(r21:20,r31:30):sat
 0x90 0xde 0x14 0xd3
-# CHECK: r17:16 = vadduh(r21:20, r31:30):sat
+# CHECK: r17:16 = vadduh(r21:20,r31:30):sat
 
 # Vector add halfwords with saturate and pack to unsigned bytes
 0x31 0xde 0x54 0xc1
-# CHECK: r17 = vaddhub(r21:20, r31:30):sat
+# CHECK: r17 = vaddhub(r21:20,r31:30):sat
 
 # Vector reduce add unsigned bytes
 0x30 0xde 0x54 0xe8
-# CHECK: r17:16 = vraddub(r21:20, r31:30)
+# CHECK: r17:16 = vraddub(r21:20,r31:30)
 0x30 0xde 0x54 0xea
-# CHECK: r17:16 += vraddub(r21:20, r31:30)
+# CHECK: r17:16 += vraddub(r21:20,r31:30)
 
 # Vector reduce add halfwords
 0x31 0xde 0x14 0xe9
-# CHECK: r17 = vradduh(r21:20, r31:30)
+# CHECK: r17 = vradduh(r21:20,r31:30)
 0xf1 0xde 0x34 0xe9
-# CHECK: r17 = vraddh(r21:20, r31:30)
+# CHECK: r17 = vraddh(r21:20,r31:30)
 
 # Vector add bytes
 0x10 0xde 0x14 0xd3
-# CHECK: r17:16 = vaddub(r21:20, r31:30)
+# CHECK: r17:16 = vaddub(r21:20,r31:30)
 0x30 0xde 0x14 0xd3
-# CHECK: r17:16 = vaddub(r21:20, r31:30):sat
+# CHECK: r17:16 = vaddub(r21:20,r31:30):sat
 
 # Vector add words
 0xb0 0xde 0x14 0xd3
-# CHECK: r17:16 = vaddw(r21:20, r31:30)
+# CHECK: r17:16 = vaddw(r21:20,r31:30)
 0xd0 0xde 0x14 0xd3
-# CHECK: r17:16 = vaddw(r21:20, r31:30):sat
+# CHECK: r17:16 = vaddw(r21:20,r31:30):sat
 
 # Vector average halfwords
 0x50 0xde 0x54 0xd3
-# CHECK: r17:16 = vavgh(r21:20, r31:30)
+# CHECK: r17:16 = vavgh(r21:20,r31:30)
 0x70 0xde 0x54 0xd3
-# CHECK: r17:16 = vavgh(r21:20, r31:30):rnd
+# CHECK: r17:16 = vavgh(r21:20,r31:30):rnd
 0x90 0xde 0x54 0xd3
-# CHECK: r17:16 = vavgh(r21:20, r31:30):crnd
+# CHECK: r17:16 = vavgh(r21:20,r31:30):crnd
 0xb0 0xde 0x54 0xd3
-# CHECK: r17:16 = vavguh(r21:20, r31:30)
+# CHECK: r17:16 = vavguh(r21:20,r31:30)
 0xd0 0xde 0x54 0xd3
-# CHECK: r17:16 = vavguh(r21:20, r31:30):rnd
+# CHECK: r17:16 = vavguh(r21:20,r31:30):rnd
 0x10 0xd4 0x9e 0xd3
-# CHECK: r17:16 = vnavgh(r21:20, r31:30)
+# CHECK: r17:16 = vnavgh(r21:20,r31:30)
 0x30 0xd4 0x9e 0xd3
-# CHECK: r17:16 = vnavgh(r21:20, r31:30):rnd:sat
+# CHECK: r17:16 = vnavgh(r21:20,r31:30):rnd:sat
 0x50 0xd4 0x9e 0xd3
-# CHECK: r17:16 = vnavgh(r21:20, r31:30):crnd:sat
+# CHECK: r17:16 = vnavgh(r21:20,r31:30):crnd:sat
 
 # Vector average unsigned bytes
 0x10 0xde 0x54 0xd3
-# CHECK: r17:16 = vavgub(r21:20, r31:30)
+# CHECK: r17:16 = vavgub(r21:20,r31:30)
 0x30 0xde 0x54 0xd3
-# CHECK: r17:16 = vavgub(r21:20, r31:30):rnd
+# CHECK: r17:16 = vavgub(r21:20,r31:30):rnd
 
 # Vector average words
 0x10 0xde 0x74 0xd3
-# CHECK: r17:16 = vavgw(r21:20, r31:30)
+# CHECK: r17:16 = vavgw(r21:20,r31:30)
 0x30 0xde 0x74 0xd3
-# CHECK: r17:16 = vavgw(r21:20, r31:30):rnd
+# CHECK: r17:16 = vavgw(r21:20,r31:30):rnd
 0x50 0xde 0x74 0xd3
-# CHECK: r17:16 = vavgw(r21:20, r31:30):crnd
+# CHECK: r17:16 = vavgw(r21:20,r31:30):crnd
 0x70 0xde 0x74 0xd3
-# CHECK: r17:16 = vavguw(r21:20, r31:30)
+# CHECK: r17:16 = vavguw(r21:20,r31:30)
 0x90 0xde 0x74 0xd3
-# CHECK: r17:16 = vavguw(r21:20, r31:30):rnd
+# CHECK: r17:16 = vavguw(r21:20,r31:30):rnd
 0x70 0xd4 0x9e 0xd3
-# CHECK: r17:16 = vnavgw(r21:20, r31:30)
+# CHECK: r17:16 = vnavgw(r21:20,r31:30)
 0x90 0xd4 0x9e 0xd3
-# CHECK: r17:16 = vnavgw(r21:20, r31:30):rnd:sat
+# CHECK: r17:16 = vnavgw(r21:20,r31:30):rnd:sat
 0xd0 0xd4 0x9e 0xd3
-# CHECK: r17:16 = vnavgw(r21:20, r31:30):crnd:sat
+# CHECK: r17:16 = vnavgw(r21:20,r31:30):crnd:sat
 
 # Vector conditional negate
 0x50 0xdf 0xd4 0xc3
-# CHECK: r17:16 = vcnegh(r21:20, r31)
+# CHECK: r17:16 = vcnegh(r21:20,r31)
 
 0xf0 0xff 0x34 0xcb
-# CHECK: r17:16 += vrcnegh(r21:20, r31)
+# CHECK: r17:16 += vrcnegh(r21:20,r31)
 
 # Vector maximum bytes
 0x10 0xd4 0xde 0xd3
-# CHECK: r17:16 = vmaxub(r21:20, r31:30)
+# CHECK: r17:16 = vmaxub(r21:20,r31:30)
 0xd0 0xd4 0xde 0xd3
-# CHECK: r17:16 = vmaxb(r21:20, r31:30)
+# CHECK: r17:16 = vmaxb(r21:20,r31:30)
 
 # Vector maximum halfwords
 0x30 0xd4 0xde 0xd3
-# CHECK: r17:16 = vmaxh(r21:20, r31:30)
+# CHECK: r17:16 = vmaxh(r21:20,r31:30)
 0x50 0xd4 0xde 0xd3
-# CHECK: r17:16 = vmaxuh(r21:20, r31:30)
+# CHECK: r17:16 = vmaxuh(r21:20,r31:30)
 
 # Vector reduce maximum halfwords
 0x3f 0xd0 0x34 0xcb
-# CHECK: r17:16 = vrmaxh(r21:20, r31)
+# CHECK: r17:16 = vrmaxh(r21:20,r31)
 0x3f 0xf0 0x34 0xcb
-# CHECK: r17:16 = vrmaxuh(r21:20, r31)
+# CHECK: r17:16 = vrmaxuh(r21:20,r31)
 
 # Vector reduce maximum words
 0x5f 0xd0 0x34 0xcb
-# CHECK: r17:16 = vrmaxw(r21:20, r31)
+# CHECK: r17:16 = vrmaxw(r21:20,r31)
 0x5f 0xf0 0x34 0xcb
-# CHECK: r17:16 = vrmaxuw(r21:20, r31)
+# CHECK: r17:16 = vrmaxuw(r21:20,r31)
 
 # Vector maximum words
 0xb0 0xd4 0xbe 0xd3
-# CHECK: r17:16 = vmaxuw(r21:20, r31:30)
+# CHECK: r17:16 = vmaxuw(r21:20,r31:30)
 0x70 0xd4 0xde 0xd3
-# CHECK: r17:16 = vmaxw(r21:20, r31:30)
+# CHECK: r17:16 = vmaxw(r21:20,r31:30)
 
 # Vector minimum bytes
 0x10 0xd4 0xbe 0xd3
-# CHECK: r17:16 = vminub(r21:20, r31:30)
+# CHECK: r17:16 = vminub(r21:20,r31:30)
 0xf0 0xd4 0xde 0xd3
-# CHECK: r17:16 = vminb(r21:20, r31:30)
+# CHECK: r17:16 = vminb(r21:20,r31:30)
 
 # Vector minimum halfwords
 0x30 0xd4 0xbe 0xd3
-# CHECK: r17:16 = vminh(r21:20, r31:30)
+# CHECK: r17:16 = vminh(r21:20,r31:30)
 0x50 0xd4 0xbe 0xd3
-# CHECK: r17:16 = vminuh(r21:20, r31:30)
+# CHECK: r17:16 = vminuh(r21:20,r31:30)
 
 # Vector reduce minimum halfwords
 0xbf 0xd0 0x34 0xcb
-# CHECK: r17:16 = vrminh(r21:20, r31)
+# CHECK: r17:16 = vrminh(r21:20,r31)
 0xbf 0xf0 0x34 0xcb
-# CHECK: r17:16 = vrminuh(r21:20, r31)
+# CHECK: r17:16 = vrminuh(r21:20,r31)
 
 # Vector reduce minimum words
 0xdf 0xd0 0x34 0xcb
-# CHECK: r17:16 = vrminw(r21:20, r31)
+# CHECK: r17:16 = vrminw(r21:20,r31)
 0xdf 0xf0 0x34 0xcb
-# CHECK: r17:16 = vrminuw(r21:20, r31)
+# CHECK: r17:16 = vrminuw(r21:20,r31)
 
 # Vector minimum words
 0x70 0xd4 0xbe 0xd3
-# CHECK: r17:16 = vminw(r21:20, r31:30)
+# CHECK: r17:16 = vminw(r21:20,r31:30)
 0x90 0xd4 0xbe 0xd3
-# CHECK: r17:16 = vminuw(r21:20, r31:30)
+# CHECK: r17:16 = vminuw(r21:20,r31:30)
 
 # Vector sum of absolute differences unsigned bytes
 0x50 0xde 0x54 0xe8
-# CHECK: r17:16 = vrsadub(r21:20, r31:30)
+# CHECK: r17:16 = vrsadub(r21:20,r31:30)
 0x50 0xde 0x54 0xea
-# CHECK: r17:16 += vrsadub(r21:20, r31:30)
+# CHECK: r17:16 += vrsadub(r21:20,r31:30)
 
 # Vector subtract halfwords
 0x50 0xd4 0x3e 0xd3
-# CHECK: r17:16 = vsubh(r21:20, r31:30)
+# CHECK: r17:16 = vsubh(r21:20,r31:30)
 0x70 0xd4 0x3e 0xd3
-# CHECK: r17:16 = vsubh(r21:20, r31:30):sat
+# CHECK: r17:16 = vsubh(r21:20,r31:30):sat
 0x90 0xd4 0x3e 0xd3
-# CHECK: r17:16 = vsubuh(r21:20, r31:30):sat
+# CHECK: r17:16 = vsubuh(r21:20,r31:30):sat
 
 # Vector subtract bytes
 0x10 0xd4 0x3e 0xd3
-# CHECK: r17:16 = vsubub(r21:20, r31:30)
+# CHECK: r17:16 = vsubub(r21:20,r31:30)
 0x30 0xd4 0x3e 0xd3
-# CHECK: r17:16 = vsubub(r21:20, r31:30):sat
+# CHECK: r17:16 = vsubub(r21:20,r31:30):sat
 
 # Vector subtract words
 0xb0 0xd4 0x3e 0xd3
-# CHECK: r17:16 = vsubw(r21:20, r31:30)
+# CHECK: r17:16 = vsubw(r21:20,r31:30)
 0xd0 0xd4 0x3e 0xd3
-# CHECK: r17:16 = vsubw(r21:20, r31:30):sat
+# CHECK: r17:16 = vsubw(r21:20,r31:30):sat
diff --git a/test/MC/Disassembler/Hexagon/xtype_bit.txt b/test/MC/Disassembler/Hexagon/xtype_bit.txt
index 89b6906afa92b9a1b891341f70bfa8f6e80a16b3..490a8bf85029368ec87e39529e435f0a95259c8d 100644
--- a/test/MC/Disassembler/Hexagon/xtype_bit.txt
+++ b/test/MC/Disassembler/Hexagon/xtype_bit.txt
@@ -11,9 +11,9 @@
 0x11 0xc0 0x74 0x88
 # CHECK: r17 = normamt(r21:20)
 0x51 0xd7 0x74 0x88
-# CHECK: r17 = add(clb(r21:20), #23)
+# CHECK: r17 = add(clb(r21:20),#23)
 0x11 0xd7 0x35 0x8c
-# CHECK: r17 = add(clb(r21), #23)
+# CHECK: r17 = add(clb(r21),#23)
 0x91 0xc0 0x15 0x8c
 # CHECK: r17 = clb(r21)
 0xb1 0xc0 0x15 0x8c
@@ -39,31 +39,31 @@
 
 # Extract bitfield
 0xf0 0xdf 0x54 0x81
-# CHECK: r17:16 = extractu(r21:20, #31, #23)
+# CHECK: r17:16 = extractu(r21:20,#31,#23)
 0xf0 0xdf 0x54 0x8a
-# CHECK: r17:16 = extract(r21:20, #31, #23)
+# CHECK: r17:16 = extract(r21:20,#31,#23)
 0xf1 0xdf 0x55 0x8d
-# CHECK: r17 = extractu(r21, #31, #23)
+# CHECK: r17 = extractu(r21,#31,#23)
 0xf1 0xdf 0xd5 0x8d
-# CHECK: r17 = extract(r21, #31, #23)
+# CHECK: r17 = extract(r21,#31,#23)
 0x10 0xde 0x14 0xc1
-# CHECK: r17:16 = extractu(r21:20, r31:30)
+# CHECK: r17:16 = extractu(r21:20,r31:30)
 0x90 0xde 0xd4 0xc1
-# CHECK: r17:16 = extract(r21:20, r31:30)
+# CHECK: r17:16 = extract(r21:20,r31:30)
 0x11 0xde 0x15 0xc9
-# CHECK: r17 = extractu(r21, r31:30)
+# CHECK: r17 = extractu(r21,r31:30)
 0x51 0xde 0x15 0xc9
-# CHECK: r17 = extract(r21, r31:30)
+# CHECK: r17 = extract(r21,r31:30)
 
 # Insert bitfield
 0xf0 0xdf 0x54 0x83
-# CHECK: r17:16 = insert(r21:20, #31, #23)
+# CHECK: r17:16 = insert(r21:20,#31,#23)
 0xf1 0xdf 0x55 0x8f
-# CHECK: r17 = insert(r21, #31, #23)
+# CHECK: r17 = insert(r21,#31,#23)
 0x11 0xde 0x15 0xc8
-# CHECK: r17 = insert(r21, r31:30)
+# CHECK: r17 = insert(r21,r31:30)
 0x10 0xde 0x14 0xca
-# CHECK: r17:16 = insert(r21:20, r31:30)
+# CHECK: r17:16 = insert(r21:20,r31:30)
 
 # Interleave/deinterleave
 0x90 0xc0 0xd4 0x80
@@ -73,13 +73,13 @@
 
 # Linear feedback-shift iteration
 0xd0 0xde 0x94 0xc1
-# CHECK: r17:16 = lfs(r21:20, r31:30)
+# CHECK: r17:16 = lfs(r21:20,r31:30)
 
 # Masked parity
 0x11 0xde 0x14 0xd0
-# CHECK: r17 = parity(r21:20, r31:30)
+# CHECK: r17 = parity(r21:20,r31:30)
 0x11 0xdf 0xf5 0xd5
-# CHECK: r17 = parity(r21, r31)
+# CHECK: r17 = parity(r21,r31)
 
 # Bit reverse
 0xd0 0xc0 0xd4 0x80
@@ -89,30 +89,30 @@
 
 # Set/clear/toggle bit
 0x11 0xdf 0xd5 0x8c
-# CHECK: r17 = setbit(r21, #31)
+# CHECK: r17 = setbit(r21,#31)
 0x31 0xdf 0xd5 0x8c
-# CHECK: r17 = clrbit(r21, #31)
+# CHECK: r17 = clrbit(r21,#31)
 0x51 0xdf 0xd5 0x8c
-# CHECK: r17 = togglebit(r21, #31)
+# CHECK: r17 = togglebit(r21,#31)
 0x11 0xdf 0x95 0xc6
-# CHECK: r17 = setbit(r21, r31)
+# CHECK: r17 = setbit(r21,r31)
 0x51 0xdf 0x95 0xc6
-# CHECK: r17 = clrbit(r21, r31)
+# CHECK: r17 = clrbit(r21,r31)
 0x91 0xdf 0x95 0xc6
-# CHECK: r17 = togglebit(r21, r31)
+# CHECK: r17 = togglebit(r21,r31)
 
 # Split bitfield
 0x90 0xdf 0xd5 0x88
-# CHECK: r17:16 = bitsplit(r21, #31)
+# CHECK: r17:16 = bitsplit(r21,#31)
 0x10 0xdf 0x35 0xd4
-# CHECK: r17:16 = bitsplit(r21, r31)
+# CHECK: r17:16 = bitsplit(r21,r31)
 
 # Table index
 0xf1 0xcd 0x15 0x87
-# CHECK: r17 = tableidxb(r21, #7, #13):raw
+# CHECK: r17 = tableidxb(r21,#7,#13):raw
 0xf1 0xcd 0x55 0x87
-# CHECK: r17 = tableidxh(r21, #7, #13):raw
+# CHECK: r17 = tableidxh(r21,#7,#13):raw
 0xf1 0xcd 0x95 0x87
-# CHECK: r17 = tableidxw(r21, #7, #13):raw
+# CHECK: r17 = tableidxw(r21,#7,#13):raw
 0xf1 0xcd 0xd5 0x87
-# CHECK: r17 = tableidxd(r21, #7, #13):raw
+# CHECK: r17 = tableidxd(r21,#7,#13):raw
diff --git a/test/MC/Disassembler/Hexagon/xtype_complex.txt b/test/MC/Disassembler/Hexagon/xtype_complex.txt
index 2332082d835e02fe1d7defa550531a400f2342e8..2c604f37d2eccd17fc48de49391af108223ce157 100644
--- a/test/MC/Disassembler/Hexagon/xtype_complex.txt
+++ b/test/MC/Disassembler/Hexagon/xtype_complex.txt
@@ -3,89 +3,89 @@
 
 # Complex add/sub halfwords
 0x90 0xde 0x54 0xc1
-# CHECK: r17:16 = vxaddsubh(r21:20, r31:30):sat
+# CHECK: r17:16 = vxaddsubh(r21:20,r31:30):sat
 0xd0 0xde 0x54 0xc1
-# CHECK: r17:16 = vxsubaddh(r21:20, r31:30):sat
+# CHECK: r17:16 = vxsubaddh(r21:20,r31:30):sat
 0x10 0xde 0xd4 0xc1
-# CHECK: r17:16 = vxaddsubh(r21:20, r31:30):rnd:>>1:sat
+# CHECK: r17:16 = vxaddsubh(r21:20,r31:30):rnd:>>1:sat
 0x50 0xde 0xd4 0xc1
-# CHECK: r17:16 = vxsubaddh(r21:20, r31:30):rnd:>>1:sat
+# CHECK: r17:16 = vxsubaddh(r21:20,r31:30):rnd:>>1:sat
 
 # Complex add/sub words
 0x10 0xde 0x54 0xc1
-# CHECK: r17:16 = vxaddsubw(r21:20, r31:30):sat
+# CHECK: r17:16 = vxaddsubw(r21:20,r31:30):sat
 0x50 0xde 0x54 0xc1
-# CHECK: r17:16 = vxsubaddw(r21:20, r31:30):sat
+# CHECK: r17:16 = vxsubaddw(r21:20,r31:30):sat
 
 # Complex multiply
 0xd0 0xdf 0x15 0xe5
-# CHECK: r17:16 = cmpy(r21, r31):sat
+# CHECK: r17:16 = cmpy(r21,r31):sat
 0xd0 0xdf 0x95 0xe5
-# CHECK: r17:16 = cmpy(r21, r31):<<1:sat
+# CHECK: r17:16 = cmpy(r21,r31):<<1:sat
 0xd0 0xdf 0x55 0xe5
-# CHECK: r17:16 = cmpy(r21, r31*):sat
+# CHECK: r17:16 = cmpy(r21,r31*):sat
 0xd0 0xdf 0xd5 0xe5
-# CHECK: r17:16 = cmpy(r21, r31*):<<1:sat
+# CHECK: r17:16 = cmpy(r21,r31*):<<1:sat
 0xd0 0xdf 0x15 0xe7
-# CHECK: r17:16 += cmpy(r21, r31):sat
+# CHECK: r17:16 += cmpy(r21,r31):sat
 0xd0 0xdf 0x95 0xe7
-# CHECK: r17:16 += cmpy(r21, r31):<<1:sat
+# CHECK: r17:16 += cmpy(r21,r31):<<1:sat
 0xf0 0xdf 0x15 0xe7
-# CHECK: r17:16 -= cmpy(r21, r31):sat
+# CHECK: r17:16 -= cmpy(r21,r31):sat
 0xf0 0xdf 0x95 0xe7
-# CHECK: r17:16 -= cmpy(r21, r31):<<1:sat
+# CHECK: r17:16 -= cmpy(r21,r31):<<1:sat
 0xd0 0xdf 0x55 0xe7
-# CHECK: r17:16 += cmpy(r21, r31*):sat
+# CHECK: r17:16 += cmpy(r21,r31*):sat
 0xd0 0xdf 0xd5 0xe7
-# CHECK: r17:16 += cmpy(r21, r31*):<<1:sat
+# CHECK: r17:16 += cmpy(r21,r31*):<<1:sat
 0xf0 0xdf 0x55 0xe7
-# CHECK: r17:16 -= cmpy(r21, r31*):sat
+# CHECK: r17:16 -= cmpy(r21,r31*):sat
 0xf0 0xdf 0xd5 0xe7
-# CHECK: r17:16 -= cmpy(r21, r31*):<<1:sat
+# CHECK: r17:16 -= cmpy(r21,r31*):<<1:sat
 
 # Complex multiply real or imaginary
 0x30 0xdf 0x15 0xe5
-# CHECK: r17:16 = cmpyi(r21, r31)
+# CHECK: r17:16 = cmpyi(r21,r31)
 0x50 0xdf 0x15 0xe5
-# CHECK: r17:16 = cmpyr(r21, r31)
+# CHECK: r17:16 = cmpyr(r21,r31)
 0x30 0xdf 0x15 0xe7
-# CHECK: r17:16 += cmpyi(r21, r31)
+# CHECK: r17:16 += cmpyi(r21,r31)
 0x50 0xdf 0x15 0xe7
-# CHECK: r17:16 += cmpyr(r21, r31)
+# CHECK: r17:16 += cmpyr(r21,r31)
 
 # Complex multiply with round and pack
 0xd1 0xdf 0x35 0xed
-# CHECK: r17 = cmpy(r21, r31):rnd:sat
+# CHECK: r17 = cmpy(r21,r31):rnd:sat
 0xd1 0xdf 0xb5 0xed
-# CHECK: r17 = cmpy(r21, r31):<<1:rnd:sat
+# CHECK: r17 = cmpy(r21,r31):<<1:rnd:sat
 0xd1 0xdf 0x75 0xed
-# CHECK: r17 = cmpy(r21, r31*):rnd:sat
+# CHECK: r17 = cmpy(r21,r31*):rnd:sat
 0xd1 0xdf 0xf5 0xed
-# CHECK: r17 = cmpy(r21, r31*):<<1:rnd:sat
+# CHECK: r17 = cmpy(r21,r31*):<<1:rnd:sat
 
 # Complex multiply 32x16
 0x91 0xdf 0x14 0xc5
-# CHECK: r17 = cmpyiwh(r21:20, r31):<<1:rnd:sat
+# CHECK: r17 = cmpyiwh(r21:20,r31):<<1:rnd:sat
 0xb1 0xdf 0x14 0xc5
-# CHECK: r17 = cmpyiwh(r21:20, r31*):<<1:rnd:sat
+# CHECK: r17 = cmpyiwh(r21:20,r31*):<<1:rnd:sat
 0xd1 0xdf 0x14 0xc5
-# CHECK: r17 = cmpyrwh(r21:20, r31):<<1:rnd:sat
+# CHECK: r17 = cmpyrwh(r21:20,r31):<<1:rnd:sat
 0xf1 0xdf 0x14 0xc5
-# CHECK: r17 = cmpyrwh(r21:20, r31*):<<1:rnd:sat
+# CHECK: r17 = cmpyrwh(r21:20,r31*):<<1:rnd:sat
 
 # Vector complex multiply real or imaginary
 0xd0 0xde 0x34 0xe8
-# CHECK: r17:16 = vcmpyr(r21:20, r31:30):sat
+# CHECK: r17:16 = vcmpyr(r21:20,r31:30):sat
 0xd0 0xde 0xb4 0xe8
-# CHECK: r17:16 = vcmpyr(r21:20, r31:30):<<1:sat
+# CHECK: r17:16 = vcmpyr(r21:20,r31:30):<<1:sat
 0xd0 0xde 0x54 0xe8
-# CHECK: r17:16 = vcmpyi(r21:20, r31:30):sat
+# CHECK: r17:16 = vcmpyi(r21:20,r31:30):sat
 0xd0 0xde 0xd4 0xe8
-# CHECK: r17:16 = vcmpyi(r21:20, r31:30):<<1:sat
+# CHECK: r17:16 = vcmpyi(r21:20,r31:30):<<1:sat
 0x90 0xde 0x34 0xea
-# CHECK: r17:16 += vcmpyr(r21:20, r31:30):sat
+# CHECK: r17:16 += vcmpyr(r21:20,r31:30):sat
 0x90 0xde 0x54 0xea
-# CHECK: r17:16 += vcmpyi(r21:20, r31:30):sat
+# CHECK: r17:16 += vcmpyi(r21:20,r31:30):sat
 
 # Vector complex conjugate
 0xf0 0xc0 0x94 0x80
@@ -93,36 +93,36 @@
 
 # Vector complex rotate
 0x10 0xdf 0xd4 0xc3
-# CHECK: r17:16 = vcrotate(r21:20, r31)
+# CHECK: r17:16 = vcrotate(r21:20,r31)
 
 # Vector reduce complex multiply real or imaginary
 0x10 0xde 0x14 0xe8
-# CHECK: r17:16 = vrcmpyi(r21:20, r31:30)
+# CHECK: r17:16 = vrcmpyi(r21:20,r31:30)
 0x30 0xde 0x14 0xe8
-# CHECK: r17:16 = vrcmpyr(r21:20, r31:30)
+# CHECK: r17:16 = vrcmpyr(r21:20,r31:30)
 0x10 0xde 0x54 0xe8
-# CHECK: r17:16 = vrcmpyi(r21:20, r31:30*)
+# CHECK: r17:16 = vrcmpyi(r21:20,r31:30*)
 0x30 0xde 0x74 0xe8
-# CHECK: r17:16 = vrcmpyr(r21:20, r31:30*)
+# CHECK: r17:16 = vrcmpyr(r21:20,r31:30*)
 
 # Vector reduce complex multiply by scalar
 0x90 0xde 0xb4 0xe8
-# CHECK: r17:16 = vrcmpys(r21:20, r31:30):<<1:sat:raw:hi
+# CHECK: r17:16 = vrcmpys(r21:20,r31:30):<<1:sat:raw:hi
 0x90 0xde 0xf4 0xe8
-# CHECK: r17:16 = vrcmpys(r21:20, r31:30):<<1:sat:raw:lo
+# CHECK: r17:16 = vrcmpys(r21:20,r31:30):<<1:sat:raw:lo
 0x90 0xde 0xb4 0xea
-# CHECK: r17:16 += vrcmpys(r21:20, r31:30):<<1:sat:raw:hi
+# CHECK: r17:16 += vrcmpys(r21:20,r31:30):<<1:sat:raw:hi
 0x90 0xde 0xf4 0xea
-# CHECK: r17:16 += vrcmpys(r21:20, r31:30):<<1:sat:raw:lo
+# CHECK: r17:16 += vrcmpys(r21:20,r31:30):<<1:sat:raw:lo
 
 # Vector reduce complex multiply by scalar with round and pack
 0xd1 0xde 0xb4 0xe9
-# CHECK: r17 = vrcmpys(r21:20, r31:30):<<1:rnd:sat:raw:hi
+# CHECK: r17 = vrcmpys(r21:20,r31:30):<<1:rnd:sat:raw:hi
 0xf1 0xde 0xb4 0xe9
-# CHECK: r17 = vrcmpys(r21:20, r31:30):<<1:rnd:sat:raw:lo
+# CHECK: r17 = vrcmpys(r21:20,r31:30):<<1:rnd:sat:raw:lo
 
 # Vector reduce complex rotate
 0xf0 0xff 0xd4 0xc3
-# CHECK: r17:16 = vrcrotate(r21:20, r31, #3)
+# CHECK: r17:16 = vrcrotate(r21:20,r31,#3)
 0x30 0xff 0xb4 0xcb
-# CHECK: r17:16 += vrcrotate(r21:20, r31, #3)
+# CHECK: r17:16 += vrcrotate(r21:20,r31,#3)
diff --git a/test/MC/Disassembler/Hexagon/xtype_fp.txt b/test/MC/Disassembler/Hexagon/xtype_fp.txt
index 70074208edadadd2129834d601fba99d218ce513..31f2a5330f2b0e85bc6dbde6ab04e37d8d514020 100644
--- a/test/MC/Disassembler/Hexagon/xtype_fp.txt
+++ b/test/MC/Disassembler/Hexagon/xtype_fp.txt
@@ -3,31 +3,31 @@
 
 # Floating point addition
 0x11 0xdf 0x15 0xeb
-# CHECK: r17 = sfadd(r21, r31)
+# CHECK: r17 = sfadd(r21,r31)
 
 # Classify floating-point value
 0x03 0xd5 0xf1 0x85
-# CHECK: p3 = sfclass(r17, #21)
+# CHECK: p3 = sfclass(r17,#21)
 0xb3 0xc2 0x90 0xdc
-# CHECK: p3 = dfclass(r17:16, #21)
+# CHECK: p3 = dfclass(r17:16,#21)
 
 # Compare floating-point value
 0x03 0xd5 0xf1 0xc7
-# CHECK: p3 = sfcmp.ge(r17, r21)
+# CHECK: p3 = sfcmp.ge(r17,r21)
 0x23 0xd5 0xf1 0xc7
-# CHECK: p3 = sfcmp.uo(r17, r21)
+# CHECK: p3 = sfcmp.uo(r17,r21)
 0x63 0xd5 0xf1 0xc7
-# CHECK: p3 = sfcmp.eq(r17, r21)
+# CHECK: p3 = sfcmp.eq(r17,r21)
 0x83 0xd5 0xf1 0xc7
-# CHECK: p3 = sfcmp.gt(r17, r21)
+# CHECK: p3 = sfcmp.gt(r17,r21)
 0x03 0xd4 0xf0 0xd2
-# CHECK: p3 = dfcmp.eq(r17:16, r21:20)
+# CHECK: p3 = dfcmp.eq(r17:16,r21:20)
 0x23 0xd4 0xf0 0xd2
-# CHECK: p3 = dfcmp.gt(r17:16, r21:20)
+# CHECK: p3 = dfcmp.gt(r17:16,r21:20)
 0x43 0xd4 0xf0 0xd2
-# CHECK: p3 = dfcmp.ge(r17:16, r21:20)
+# CHECK: p3 = dfcmp.ge(r17:16,r21:20)
 0x63 0xd4 0xf0 0xd2
-# CHECK: p3 = dfcmp.uo(r17:16, r21:20)
+# CHECK: p3 = dfcmp.uo(r17:16,r21:20)
 
 # Convert floating-point value to other format
 0x10 0xc0 0x95 0x84
@@ -91,29 +91,29 @@
 0x11 0xc0 0xb5 0x8b
 # CHECK: r17 = sffixupr(r21)
 0x11 0xdf 0xd5 0xeb
-# CHECK: r17 = sffixupn(r21, r31)
+# CHECK: r17 = sffixupn(r21,r31)
 0x31 0xdf 0xd5 0xeb
-# CHECK: r17 = sffixupd(r21, r31)
+# CHECK: r17 = sffixupd(r21,r31)
 
 # Floating point fused multiply-add
 0x91 0xdf 0x15 0xef
-# CHECK: r17 += sfmpy(r21, r31)
+# CHECK: r17 += sfmpy(r21,r31)
 0xb1 0xdf 0x15 0xef
-# CHECK: r17 -= sfmpy(r21, r31)
+# CHECK: r17 -= sfmpy(r21,r31)
 
 # Floating point fused multiply-add with scaling
 0xf1 0xdf 0x75 0xef
-# CHECK: r17 += sfmpy(r21, r31, p3):scale
+# CHECK: r17 += sfmpy(r21,r31,p3):scale
 
 # Floating point reciprocal square root approximation
 0x71 0xc0 0xf5 0x8b
-# CHECK: r17, p3 = sfinvsqrta(r21)
+# CHECK: r17,p3 = sfinvsqrta(r21)
 
 # Floating point fused multiply-add for library routines
 0xd1 0xdf 0x15 0xef
-# CHECK: r17 += sfmpy(r21, r31):lib
+# CHECK: r17 += sfmpy(r21,r31):lib
 0xf1 0xdf 0x15 0xef
-# CHECK: r17 -= sfmpy(r21, r31):lib
+# CHECK: r17 -= sfmpy(r21,r31):lib
 
 # Create floating-point constant
 0xb1 0xc2 0x00 0xd6
@@ -127,20 +127,20 @@
 
 # Floating point maximum
 0x11 0xdf 0x95 0xeb
-# CHECK: r17 = sfmax(r21, r31)
+# CHECK: r17 = sfmax(r21,r31)
 
 # Floating point minimum
 0x31 0xdf 0x95 0xeb
-# CHECK: r17 = sfmin(r21, r31)
+# CHECK: r17 = sfmin(r21,r31)
 
 # Floating point multiply
 0x11 0xdf 0x55 0xeb
-# CHECK: r17 = sfmpy(r21, r31)
+# CHECK: r17 = sfmpy(r21,r31)
 
 # Floating point reciprocal approximation
 0xf1 0xdf 0xf5 0xeb
-# CHECK: r17, p3 = sfrecipa(r21, r31)
+# CHECK: r17,p3 = sfrecipa(r21,r31)
 
 # Floating point subtraction
 0x31 0xdf 0x15 0xeb
-# CHECK: r17 = sfsub(r21, r31)
+# CHECK: r17 = sfsub(r21,r31)
diff --git a/test/MC/Disassembler/Hexagon/xtype_mpy.txt b/test/MC/Disassembler/Hexagon/xtype_mpy.txt
index ada32162a81e73343789653c2f75490bdedefe9e..dde6e76b266aac46175322582ac2efb9f9d96ff8 100644
--- a/test/MC/Disassembler/Hexagon/xtype_mpy.txt
+++ b/test/MC/Disassembler/Hexagon/xtype_mpy.txt
@@ -3,398 +3,398 @@
 
 # Multiply and use lower result
 0xb1 0xdf 0x35 0xd7
-# CHECK: r17 = add(#21, mpyi(r21, r31))
+# CHECK: r17 = add(#21,mpyi(r21,r31))
 0xbf 0xd1 0x35 0xd8
-# CHECK: r17 = add(#21, mpyi(r21, #31))
+# CHECK: r17 = add(#21,mpyi(r21,#31))
 0xb5 0xd1 0x3f 0xdf
-# CHECK: r17 = add(r21, mpyi(#84, r31))
+# CHECK: r17 = add(r21,mpyi(#84,r31))
 0xf5 0xf1 0xb5 0xdf
-# CHECK: r17 = add(r21, mpyi(r21, #31))
+# CHECK: r17 = add(r21,mpyi(r21,#31))
 0x15 0xd1 0x1f 0xe3
-# CHECK: r17 = add(r21, mpyi(r17, r31))
+# CHECK: r17 = add(r21,mpyi(r17,r31))
 0xf1 0xc3 0x15 0xe0
-# CHECK: r17 =+ mpyi(r21, #31)
+# CHECK: r17 = +mpyi(r21,#31)
 0xf1 0xc3 0x95 0xe0
-# CHECK: r17 =- mpyi(r21, #31)
+# CHECK: r17 = -mpyi(r21,#31)
 0xf1 0xc3 0x15 0xe1
-# CHECK: r17 += mpyi(r21, #31)
+# CHECK: r17 += mpyi(r21,#31)
 0xf1 0xc3 0x95 0xe1
-# CHECK: r17 -= mpyi(r21, #31)
+# CHECK: r17 -= mpyi(r21,#31)
 0x11 0xdf 0x15 0xed
-# CHECK: r17 = mpyi(r21, r31)
+# CHECK: r17 = mpyi(r21,r31)
 0x11 0xdf 0x15 0xef
-# CHECK: r17 += mpyi(r21, r31)
+# CHECK: r17 += mpyi(r21,r31)
 
 # Vector multiply word by signed half (32x16)
 0xb0 0xde 0x14 0xe8
-# CHECK: r17:16 = vmpyweh(r21:20, r31:30):sat
+# CHECK: r17:16 = vmpyweh(r21:20,r31:30):sat
 0xb0 0xde 0x94 0xe8
-# CHECK: r17:16 = vmpyweh(r21:20, r31:30):<<1:sat
+# CHECK: r17:16 = vmpyweh(r21:20,r31:30):<<1:sat
 0xf0 0xde 0x14 0xe8
-# CHECK: r17:16 = vmpywoh(r21:20, r31:30):sat
+# CHECK: r17:16 = vmpywoh(r21:20,r31:30):sat
 0xf0 0xde 0x94 0xe8
-# CHECK: r17:16 = vmpywoh(r21:20, r31:30):<<1:sat
+# CHECK: r17:16 = vmpywoh(r21:20,r31:30):<<1:sat
 0xb0 0xde 0x34 0xe8
-# CHECK: r17:16 = vmpyweh(r21:20, r31:30):rnd:sat
+# CHECK: r17:16 = vmpyweh(r21:20,r31:30):rnd:sat
 0xb0 0xde 0xb4 0xe8
-# CHECK: r17:16 = vmpyweh(r21:20, r31:30):<<1:rnd:sat
+# CHECK: r17:16 = vmpyweh(r21:20,r31:30):<<1:rnd:sat
 0xf0 0xde 0x34 0xe8
-# CHECK: r17:16 = vmpywoh(r21:20, r31:30):rnd:sat
+# CHECK: r17:16 = vmpywoh(r21:20,r31:30):rnd:sat
 0xf0 0xde 0xb4 0xe8
-# CHECK: r17:16 = vmpywoh(r21:20, r31:30):<<1:rnd:sat
+# CHECK: r17:16 = vmpywoh(r21:20,r31:30):<<1:rnd:sat
 0xb0 0xde 0x14 0xea
-# CHECK: r17:16 += vmpyweh(r21:20, r31:30):sat
+# CHECK: r17:16 += vmpyweh(r21:20,r31:30):sat
 0xb0 0xde 0x94 0xea
-# CHECK: r17:16 += vmpyweh(r21:20, r31:30):<<1:sat
+# CHECK: r17:16 += vmpyweh(r21:20,r31:30):<<1:sat
 0xf0 0xde 0x14 0xea
-# CHECK: r17:16 += vmpywoh(r21:20, r31:30):sat
+# CHECK: r17:16 += vmpywoh(r21:20,r31:30):sat
 0xf0 0xde 0x94 0xea
-# CHECK: r17:16 += vmpywoh(r21:20, r31:30):<<1:sat
+# CHECK: r17:16 += vmpywoh(r21:20,r31:30):<<1:sat
 0xb0 0xde 0x34 0xea
-# CHECK: r17:16 += vmpyweh(r21:20, r31:30):rnd:sat
+# CHECK: r17:16 += vmpyweh(r21:20,r31:30):rnd:sat
 0xb0 0xde 0xb4 0xea
-# CHECK: r17:16 += vmpyweh(r21:20, r31:30):<<1:rnd:sat
+# CHECK: r17:16 += vmpyweh(r21:20,r31:30):<<1:rnd:sat
 0xf0 0xde 0x34 0xea
-# CHECK: r17:16 += vmpywoh(r21:20, r31:30):rnd:sat
+# CHECK: r17:16 += vmpywoh(r21:20,r31:30):rnd:sat
 0xf0 0xde 0xb4 0xea
-# CHECK: r17:16 += vmpywoh(r21:20, r31:30):<<1:rnd:sat
+# CHECK: r17:16 += vmpywoh(r21:20,r31:30):<<1:rnd:sat
 
 # Vector multiply word by unsigned half (32x16)
 0xb0 0xde 0x54 0xe8
-# CHECK: r17:16 = vmpyweuh(r21:20, r31:30):sat
+# CHECK: r17:16 = vmpyweuh(r21:20,r31:30):sat
 0xb0 0xde 0xd4 0xe8
-# CHECK: r17:16 = vmpyweuh(r21:20, r31:30):<<1:sat
+# CHECK: r17:16 = vmpyweuh(r21:20,r31:30):<<1:sat
 0xf0 0xde 0x54 0xe8
-# CHECK: r17:16 = vmpywouh(r21:20, r31:30):sat
+# CHECK: r17:16 = vmpywouh(r21:20,r31:30):sat
 0xf0 0xde 0xd4 0xe8
-# CHECK: r17:16 = vmpywouh(r21:20, r31:30):<<1:sat
+# CHECK: r17:16 = vmpywouh(r21:20,r31:30):<<1:sat
 0xb0 0xde 0x74 0xe8
-# CHECK: r17:16 = vmpyweuh(r21:20, r31:30):rnd:sat
+# CHECK: r17:16 = vmpyweuh(r21:20,r31:30):rnd:sat
 0xb0 0xde 0xf4 0xe8
-# CHECK: r17:16 = vmpyweuh(r21:20, r31:30):<<1:rnd:sat
+# CHECK: r17:16 = vmpyweuh(r21:20,r31:30):<<1:rnd:sat
 0xf0 0xde 0x74 0xe8
-# CHECK: r17:16 = vmpywouh(r21:20, r31:30):rnd:sat
+# CHECK: r17:16 = vmpywouh(r21:20,r31:30):rnd:sat
 0xf0 0xde 0xf4 0xe8
-# CHECK: r17:16 = vmpywouh(r21:20, r31:30):<<1:rnd:sat
+# CHECK: r17:16 = vmpywouh(r21:20,r31:30):<<1:rnd:sat
 0xb0 0xde 0x54 0xea
-# CHECK: r17:16 += vmpyweuh(r21:20, r31:30):sat
+# CHECK: r17:16 += vmpyweuh(r21:20,r31:30):sat
 0xb0 0xde 0xd4 0xea
-# CHECK: r17:16 += vmpyweuh(r21:20, r31:30):<<1:sat
+# CHECK: r17:16 += vmpyweuh(r21:20,r31:30):<<1:sat
 0xf0 0xde 0x54 0xea
-# CHECK: r17:16 += vmpywouh(r21:20, r31:30):sat
+# CHECK: r17:16 += vmpywouh(r21:20,r31:30):sat
 0xf0 0xde 0xd4 0xea
-# CHECK: r17:16 += vmpywouh(r21:20, r31:30):<<1:sat
+# CHECK: r17:16 += vmpywouh(r21:20,r31:30):<<1:sat
 0xb0 0xde 0x74 0xea
-# CHECK: r17:16 += vmpyweuh(r21:20, r31:30):rnd:sat
+# CHECK: r17:16 += vmpyweuh(r21:20,r31:30):rnd:sat
 0xb0 0xde 0xf4 0xea
-# CHECK: r17:16 += vmpyweuh(r21:20, r31:30):<<1:rnd:sat
+# CHECK: r17:16 += vmpyweuh(r21:20,r31:30):<<1:rnd:sat
 0xf0 0xde 0x74 0xea
-# CHECK: r17:16 += vmpywouh(r21:20, r31:30):rnd:sat
+# CHECK: r17:16 += vmpywouh(r21:20,r31:30):rnd:sat
 0xf0 0xde 0xf4 0xea
-# CHECK: r17:16 += vmpywouh(r21:20, r31:30):<<1:rnd:sat
+# CHECK: r17:16 += vmpywouh(r21:20,r31:30):<<1:rnd:sat
 
 # Multiply signed halfwords
 0x10 0xdf 0x95 0xe4
-# CHECK: r17:16 = mpy(r21.l, r31.l):<<1
+# CHECK: r17:16 = mpy(r21.l,r31.l):<<1
 0x30 0xdf 0x95 0xe4
-# CHECK: r17:16 = mpy(r21.l, r31.h):<<1
+# CHECK: r17:16 = mpy(r21.l,r31.h):<<1
 0x50 0xdf 0x95 0xe4
-# CHECK: r17:16 = mpy(r21.h, r31.l):<<1
+# CHECK: r17:16 = mpy(r21.h,r31.l):<<1
 0x70 0xdf 0x95 0xe4
-# CHECK: r17:16 = mpy(r21.h, r31.h):<<1
+# CHECK: r17:16 = mpy(r21.h,r31.h):<<1
 0x10 0xdf 0xb5 0xe4
-# CHECK: r17:16 = mpy(r21.l, r31.l):<<1:rnd
+# CHECK: r17:16 = mpy(r21.l,r31.l):<<1:rnd
 0x30 0xdf 0xb5 0xe4
-# CHECK: r17:16 = mpy(r21.l, r31.h):<<1:rnd
+# CHECK: r17:16 = mpy(r21.l,r31.h):<<1:rnd
 0x50 0xdf 0xb5 0xe4
-# CHECK: r17:16 = mpy(r21.h, r31.l):<<1:rnd
+# CHECK: r17:16 = mpy(r21.h,r31.l):<<1:rnd
 0x70 0xdf 0xb5 0xe4
-# CHECK: r17:16 = mpy(r21.h, r31.h):<<1:rnd
+# CHECK: r17:16 = mpy(r21.h,r31.h):<<1:rnd
 0x10 0xdf 0x95 0xe6
-# CHECK: r17:16 += mpy(r21.l, r31.l):<<1
+# CHECK: r17:16 += mpy(r21.l,r31.l):<<1
 0x30 0xdf 0x95 0xe6
-# CHECK: r17:16 += mpy(r21.l, r31.h):<<1
+# CHECK: r17:16 += mpy(r21.l,r31.h):<<1
 0x50 0xdf 0x95 0xe6
-# CHECK: r17:16 += mpy(r21.h, r31.l):<<1
+# CHECK: r17:16 += mpy(r21.h,r31.l):<<1
 0x70 0xdf 0x95 0xe6
-# CHECK: r17:16 += mpy(r21.h, r31.h):<<1
+# CHECK: r17:16 += mpy(r21.h,r31.h):<<1
 0x10 0xdf 0xb5 0xe6
-# CHECK: r17:16 -= mpy(r21.l, r31.l):<<1
+# CHECK: r17:16 -= mpy(r21.l,r31.l):<<1
 0x30 0xdf 0xb5 0xe6
-# CHECK: r17:16 -= mpy(r21.l, r31.h):<<1
+# CHECK: r17:16 -= mpy(r21.l,r31.h):<<1
 0x50 0xdf 0xb5 0xe6
-# CHECK: r17:16 -= mpy(r21.h, r31.l):<<1
+# CHECK: r17:16 -= mpy(r21.h,r31.l):<<1
 0x70 0xdf 0xb5 0xe6
-# CHECK: r17:16 -= mpy(r21.h, r31.h):<<1
+# CHECK: r17:16 -= mpy(r21.h,r31.h):<<1
 0x11 0xdf 0x95 0xec
-# CHECK: r17 = mpy(r21.l, r31.l):<<1
+# CHECK: r17 = mpy(r21.l,r31.l):<<1
 0x31 0xdf 0x95 0xec
-# CHECK: r17 = mpy(r21.l, r31.h):<<1
+# CHECK: r17 = mpy(r21.l,r31.h):<<1
 0x51 0xdf 0x95 0xec
-# CHECK: r17 = mpy(r21.h, r31.l):<<1
+# CHECK: r17 = mpy(r21.h,r31.l):<<1
 0x71 0xdf 0x95 0xec
-# CHECK: r17 = mpy(r21.h, r31.h):<<1
+# CHECK: r17 = mpy(r21.h,r31.h):<<1
 0x91 0xdf 0x95 0xec
-# CHECK: r17 = mpy(r21.l, r31.l):<<1:sat
+# CHECK: r17 = mpy(r21.l,r31.l):<<1:sat
 0xb1 0xdf 0x95 0xec
-# CHECK: r17 = mpy(r21.l, r31.h):<<1:sat
+# CHECK: r17 = mpy(r21.l,r31.h):<<1:sat
 0xd1 0xdf 0x95 0xec
-# CHECK: r17 = mpy(r21.h, r31.l):<<1:sat
+# CHECK: r17 = mpy(r21.h,r31.l):<<1:sat
 0xf1 0xdf 0x95 0xec
-# CHECK: r17 = mpy(r21.h, r31.h):<<1:sat
+# CHECK: r17 = mpy(r21.h,r31.h):<<1:sat
 0x11 0xdf 0xb5 0xec
-# CHECK: r17 = mpy(r21.l, r31.l):<<1:rnd
+# CHECK: r17 = mpy(r21.l,r31.l):<<1:rnd
 0x31 0xdf 0xb5 0xec
-# CHECK: r17 = mpy(r21.l, r31.h):<<1:rnd
+# CHECK: r17 = mpy(r21.l,r31.h):<<1:rnd
 0x51 0xdf 0xb5 0xec
-# CHECK: r17 = mpy(r21.h, r31.l):<<1:rnd
+# CHECK: r17 = mpy(r21.h,r31.l):<<1:rnd
 0x71 0xdf 0xb5 0xec
-# CHECK: r17 = mpy(r21.h, r31.h):<<1:rnd
+# CHECK: r17 = mpy(r21.h,r31.h):<<1:rnd
 0x91 0xdf 0xb5 0xec
-# CHECK: r17 = mpy(r21.l, r31.l):<<1:rnd:sat
+# CHECK: r17 = mpy(r21.l,r31.l):<<1:rnd:sat
 0xb1 0xdf 0xb5 0xec
-# CHECK: r17 = mpy(r21.l, r31.h):<<1:rnd:sat
+# CHECK: r17 = mpy(r21.l,r31.h):<<1:rnd:sat
 0xd1 0xdf 0xb5 0xec
-# CHECK: r17 = mpy(r21.h, r31.l):<<1:rnd:sat
+# CHECK: r17 = mpy(r21.h,r31.l):<<1:rnd:sat
 0xf1 0xdf 0xb5 0xec
-# CHECK: r17 = mpy(r21.h, r31.h):<<1:rnd:sat
+# CHECK: r17 = mpy(r21.h,r31.h):<<1:rnd:sat
 0x11 0xdf 0x95 0xee
-# CHECK: r17 += mpy(r21.l, r31.l):<<1
+# CHECK: r17 += mpy(r21.l,r31.l):<<1
 0x31 0xdf 0x95 0xee
-# CHECK: r17 += mpy(r21.l, r31.h):<<1
+# CHECK: r17 += mpy(r21.l,r31.h):<<1
 0x51 0xdf 0x95 0xee
-# CHECK: r17 += mpy(r21.h, r31.l):<<1
+# CHECK: r17 += mpy(r21.h,r31.l):<<1
 0x71 0xdf 0x95 0xee
-# CHECK: r17 += mpy(r21.h, r31.h):<<1
+# CHECK: r17 += mpy(r21.h,r31.h):<<1
 0x91 0xdf 0x95 0xee
-# CHECK: r17 += mpy(r21.l, r31.l):<<1:sat
+# CHECK: r17 += mpy(r21.l,r31.l):<<1:sat
 0xb1 0xdf 0x95 0xee
-# CHECK: r17 += mpy(r21.l, r31.h):<<1:sat
+# CHECK: r17 += mpy(r21.l,r31.h):<<1:sat
 0xd1 0xdf 0x95 0xee
-# CHECK: r17 += mpy(r21.h, r31.l):<<1:sat
+# CHECK: r17 += mpy(r21.h,r31.l):<<1:sat
 0xf1 0xdf 0x95 0xee
-# CHECK: r17 += mpy(r21.h, r31.h):<<1:sat
+# CHECK: r17 += mpy(r21.h,r31.h):<<1:sat
 0x11 0xdf 0xb5 0xee
-# CHECK: r17 -= mpy(r21.l, r31.l):<<1
+# CHECK: r17 -= mpy(r21.l,r31.l):<<1
 0x31 0xdf 0xb5 0xee
-# CHECK: r17 -= mpy(r21.l, r31.h):<<1
+# CHECK: r17 -= mpy(r21.l,r31.h):<<1
 0x51 0xdf 0xb5 0xee
-# CHECK: r17 -= mpy(r21.h, r31.l):<<1
+# CHECK: r17 -= mpy(r21.h,r31.l):<<1
 0x71 0xdf 0xb5 0xee
-# CHECK: r17 -= mpy(r21.h, r31.h):<<1
+# CHECK: r17 -= mpy(r21.h,r31.h):<<1
 0x91 0xdf 0xb5 0xee
-# CHECK: r17 -= mpy(r21.l, r31.l):<<1:sat
+# CHECK: r17 -= mpy(r21.l,r31.l):<<1:sat
 0xb1 0xdf 0xb5 0xee
-# CHECK: r17 -= mpy(r21.l, r31.h):<<1:sat
+# CHECK: r17 -= mpy(r21.l,r31.h):<<1:sat
 0xd1 0xdf 0xb5 0xee
-# CHECK: r17 -= mpy(r21.h, r31.l):<<1:sat
+# CHECK: r17 -= mpy(r21.h,r31.l):<<1:sat
 0xf1 0xdf 0xb5 0xee
-# CHECK: r17 -= mpy(r21.h, r31.h):<<1:sat
+# CHECK: r17 -= mpy(r21.h,r31.h):<<1:sat
 
 # Multiply unsigned halfwords
 0x10 0xdf 0xd5 0xe4
-# CHECK: r17:16 = mpyu(r21.l, r31.l):<<1
+# CHECK: r17:16 = mpyu(r21.l,r31.l):<<1
 0x30 0xdf 0xd5 0xe4
-# CHECK: r17:16 = mpyu(r21.l, r31.h):<<1
+# CHECK: r17:16 = mpyu(r21.l,r31.h):<<1
 0x50 0xdf 0xd5 0xe4
-# CHECK: r17:16 = mpyu(r21.h, r31.l):<<1
+# CHECK: r17:16 = mpyu(r21.h,r31.l):<<1
 0x70 0xdf 0xd5 0xe4
-# CHECK: r17:16 = mpyu(r21.h, r31.h):<<1
+# CHECK: r17:16 = mpyu(r21.h,r31.h):<<1
 0x10 0xdf 0xd5 0xe6
-# CHECK: r17:16 += mpyu(r21.l, r31.l):<<1
+# CHECK: r17:16 += mpyu(r21.l,r31.l):<<1
 0x30 0xdf 0xd5 0xe6
-# CHECK: r17:16 += mpyu(r21.l, r31.h):<<1
+# CHECK: r17:16 += mpyu(r21.l,r31.h):<<1
 0x50 0xdf 0xd5 0xe6
-# CHECK: r17:16 += mpyu(r21.h, r31.l):<<1
+# CHECK: r17:16 += mpyu(r21.h,r31.l):<<1
 0x70 0xdf 0xd5 0xe6
-# CHECK: r17:16 += mpyu(r21.h, r31.h):<<1
+# CHECK: r17:16 += mpyu(r21.h,r31.h):<<1
 0x10 0xdf 0xf5 0xe6
-# CHECK: r17:16 -= mpyu(r21.l, r31.l):<<1
+# CHECK: r17:16 -= mpyu(r21.l,r31.l):<<1
 0x30 0xdf 0xf5 0xe6
-# CHECK: r17:16 -= mpyu(r21.l, r31.h):<<1
+# CHECK: r17:16 -= mpyu(r21.l,r31.h):<<1
 0x50 0xdf 0xf5 0xe6
-# CHECK: r17:16 -= mpyu(r21.h, r31.l):<<1
+# CHECK: r17:16 -= mpyu(r21.h,r31.l):<<1
 0x70 0xdf 0xf5 0xe6
-# CHECK: r17:16 -= mpyu(r21.h, r31.h):<<1
+# CHECK: r17:16 -= mpyu(r21.h,r31.h):<<1
 0x11 0xdf 0xd5 0xec
-# CHECK: r17 = mpyu(r21.l, r31.l):<<1
+# CHECK: r17 = mpyu(r21.l,r31.l):<<1
 0x31 0xdf 0xd5 0xec
-# CHECK: r17 = mpyu(r21.l, r31.h):<<1
+# CHECK: r17 = mpyu(r21.l,r31.h):<<1
 0x51 0xdf 0xd5 0xec
-# CHECK: r17 = mpyu(r21.h, r31.l):<<1
+# CHECK: r17 = mpyu(r21.h,r31.l):<<1
 0x71 0xdf 0xd5 0xec
-# CHECK: r17 = mpyu(r21.h, r31.h):<<1
+# CHECK: r17 = mpyu(r21.h,r31.h):<<1
 0x11 0xdf 0xd5 0xee
-# CHECK: r17 += mpyu(r21.l, r31.l):<<1
+# CHECK: r17 += mpyu(r21.l,r31.l):<<1
 0x31 0xdf 0xd5 0xee
-# CHECK: r17 += mpyu(r21.l, r31.h):<<1
+# CHECK: r17 += mpyu(r21.l,r31.h):<<1
 0x51 0xdf 0xd5 0xee
-# CHECK: r17 += mpyu(r21.h, r31.l):<<1
+# CHECK: r17 += mpyu(r21.h,r31.l):<<1
 0x71 0xdf 0xd5 0xee
-# CHECK: r17 += mpyu(r21.h, r31.h):<<1
+# CHECK: r17 += mpyu(r21.h,r31.h):<<1
 0x11 0xdf 0xf5 0xee
-# CHECK: r17 -= mpyu(r21.l, r31.l):<<1
+# CHECK: r17 -= mpyu(r21.l,r31.l):<<1
 0x31 0xdf 0xf5 0xee
-# CHECK: r17 -= mpyu(r21.l, r31.h):<<1
+# CHECK: r17 -= mpyu(r21.l,r31.h):<<1
 0x51 0xdf 0xf5 0xee
-# CHECK: r17 -= mpyu(r21.h, r31.l):<<1
+# CHECK: r17 -= mpyu(r21.h,r31.l):<<1
 0x71 0xdf 0xf5 0xee
-# CHECK: r17 -= mpyu(r21.h, r31.h):<<1
+# CHECK: r17 -= mpyu(r21.h,r31.h):<<1
 
 # Polynomial multiply words
 0xf0 0xdf 0x55 0xe5
-# CHECK: r17:16 = pmpyw(r21, r31)
+# CHECK: r17:16 = pmpyw(r21,r31)
 0xf0 0xdf 0x35 0xe7
-# CHECK: r17:16 ^= pmpyw(r21, r31)
+# CHECK: r17:16 ^= pmpyw(r21,r31)
 
 # Vector reduce multiply word by signed half (32x16)
 0x50 0xde 0x34 0xe8
-# CHECK: r17:16 = vrmpywoh(r21:20, r31:30)
+# CHECK: r17:16 = vrmpywoh(r21:20,r31:30)
 0x50 0xde 0xb4 0xe8
-# CHECK: r17:16 = vrmpywoh(r21:20, r31:30):<<1
+# CHECK: r17:16 = vrmpywoh(r21:20,r31:30):<<1
 0x90 0xde 0x54 0xe8
-# CHECK: r17:16 = vrmpyweh(r21:20, r31:30)
+# CHECK: r17:16 = vrmpyweh(r21:20,r31:30)
 0x90 0xde 0xd4 0xe8
-# CHECK: r17:16 = vrmpyweh(r21:20, r31:30):<<1
+# CHECK: r17:16 = vrmpyweh(r21:20,r31:30):<<1
 0xd0 0xde 0x74 0xea
-# CHECK: r17:16 += vrmpywoh(r21:20, r31:30)
+# CHECK: r17:16 += vrmpywoh(r21:20,r31:30)
 0xd0 0xde 0xf4 0xea
-# CHECK: r17:16 += vrmpywoh(r21:20, r31:30):<<1
+# CHECK: r17:16 += vrmpywoh(r21:20,r31:30):<<1
 0xd0 0xde 0x34 0xea
-# CHECK: r17:16 += vrmpyweh(r21:20, r31:30)
+# CHECK: r17:16 += vrmpyweh(r21:20,r31:30)
 0xd0 0xde 0xb4 0xea
-# CHECK: r17:16 += vrmpyweh(r21:20, r31:30):<<1
+# CHECK: r17:16 += vrmpyweh(r21:20,r31:30):<<1
 
 # Multiply and use upper result
 0x31 0xdf 0x15 0xed
-# CHECK: r17 = mpy(r21, r31)
+# CHECK: r17 = mpy(r21,r31)
 0x31 0xdf 0x35 0xed
-# CHECK: r17 = mpy(r21, r31):rnd
+# CHECK: r17 = mpy(r21,r31):rnd
 0x31 0xdf 0x55 0xed
-# CHECK: r17 = mpyu(r21, r31)
+# CHECK: r17 = mpyu(r21,r31)
 0x31 0xdf 0x75 0xed
-# CHECK: r17 = mpysu(r21, r31)
+# CHECK: r17 = mpysu(r21,r31)
 0x11 0xdf 0xb5 0xed
-# CHECK: r17 = mpy(r21, r31.h):<<1:sat
+# CHECK: r17 = mpy(r21,r31.h):<<1:sat
 0x31 0xdf 0xb5 0xed
-# CHECK: r17 = mpy(r21, r31.l):<<1:sat
+# CHECK: r17 = mpy(r21,r31.l):<<1:sat
 0x91 0xdf 0xb5 0xed
-# CHECK: r17 = mpy(r21, r31.h):<<1:rnd:sat
+# CHECK: r17 = mpy(r21,r31.h):<<1:rnd:sat
 0x11 0xdf 0xf5 0xed
-# CHECK: r17 = mpy(r21, r31):<<1:sat
+# CHECK: r17 = mpy(r21,r31):<<1:sat
 0x91 0xdf 0xf5 0xed
-# CHECK: r17 = mpy(r21, r31.l):<<1:rnd:sat
+# CHECK: r17 = mpy(r21,r31.l):<<1:rnd:sat
 0x51 0xdf 0xb5 0xed
-# CHECK: r17 = mpy(r21, r31):<<1
+# CHECK: r17 = mpy(r21,r31):<<1
 0x11 0xdf 0x75 0xef
-# CHECK: r17 += mpy(r21, r31):<<1:sat
+# CHECK: r17 += mpy(r21,r31):<<1:sat
 0x31 0xdf 0x75 0xef
-# CHECK: r17 -= mpy(r21, r31):<<1:sat
+# CHECK: r17 -= mpy(r21,r31):<<1:sat
 
 # Multiply and use full result
 0x10 0xdf 0x15 0xe5
-# CHECK: r17:16 = mpy(r21, r31)
+# CHECK: r17:16 = mpy(r21,r31)
 0x10 0xdf 0x55 0xe5
-# CHECK: r17:16 = mpyu(r21, r31)
+# CHECK: r17:16 = mpyu(r21,r31)
 0x10 0xdf 0x15 0xe7
-# CHECK: r17:16 += mpy(r21, r31)
+# CHECK: r17:16 += mpy(r21,r31)
 0x10 0xdf 0x35 0xe7
-# CHECK: r17:16 -= mpy(r21, r31)
+# CHECK: r17:16 -= mpy(r21,r31)
 0x10 0xdf 0x55 0xe7
-# CHECK: r17:16 += mpyu(r21, r31)
+# CHECK: r17:16 += mpyu(r21,r31)
 0x10 0xdf 0x75 0xe7
-# CHECK: r17:16 -= mpyu(r21, r31)
+# CHECK: r17:16 -= mpyu(r21,r31)
 
 # Vector dual multiply
 0x90 0xde 0x14 0xe8
-# CHECK: r17:16 = vdmpy(r21:20, r31:30):sat
+# CHECK: r17:16 = vdmpy(r21:20,r31:30):sat
 0x90 0xde 0x94 0xe8
-# CHECK: r17:16 = vdmpy(r21:20, r31:30):<<1:sat
+# CHECK: r17:16 = vdmpy(r21:20,r31:30):<<1:sat
 0x90 0xde 0x14 0xea
-# CHECK: r17:16 += vdmpy(r21:20, r31:30):sat
+# CHECK: r17:16 += vdmpy(r21:20,r31:30):sat
 0x90 0xde 0x94 0xea
-# CHECK: r17:16 += vdmpy(r21:20, r31:30):<<1:sat
+# CHECK: r17:16 += vdmpy(r21:20,r31:30):<<1:sat
 
 # Vector dual multiply with round and pack
 0x11 0xde 0x14 0xe9
-# CHECK: r17 = vdmpy(r21:20, r31:30):rnd:sat
+# CHECK: r17 = vdmpy(r21:20,r31:30):rnd:sat
 0x11 0xde 0x94 0xe9
-# CHECK: r17 = vdmpy(r21:20, r31:30):<<1:rnd:sat
+# CHECK: r17 = vdmpy(r21:20,r31:30):<<1:rnd:sat
 
 # Vector reduce multiply bytes
 0x30 0xde 0x94 0xe8
-# CHECK: r17:16 = vrmpybu(r21:20, r31:30)
+# CHECK: r17:16 = vrmpybu(r21:20,r31:30)
 0x30 0xde 0xd4 0xe8
-# CHECK: r17:16 = vrmpybsu(r21:20, r31:30)
+# CHECK: r17:16 = vrmpybsu(r21:20,r31:30)
 0x30 0xde 0x94 0xea
-# CHECK: r17:16 += vrmpybu(r21:20, r31:30)
+# CHECK: r17:16 += vrmpybu(r21:20,r31:30)
 0x30 0xde 0xd4 0xea
-# CHECK: r17:16 += vrmpybsu(r21:20, r31:30)
+# CHECK: r17:16 += vrmpybsu(r21:20,r31:30)
 
 # Vector dual multiply signed by unsigned bytes
 0x30 0xde 0xb4 0xe8
-# CHECK: r17:16 = vdmpybsu(r21:20, r31:30):sat
+# CHECK: r17:16 = vdmpybsu(r21:20,r31:30):sat
 0x30 0xde 0x34 0xea
-# CHECK: r17:16 += vdmpybsu(r21:20, r31:30):sat
+# CHECK: r17:16 += vdmpybsu(r21:20,r31:30):sat
 
 # Vector multiply even haldwords
 0xd0 0xde 0x14 0xe8
-# CHECK: r17:16 = vmpyeh(r21:20, r31:30):sat
+# CHECK: r17:16 = vmpyeh(r21:20,r31:30):sat
 0xd0 0xde 0x94 0xe8
-# CHECK: r17:16 = vmpyeh(r21:20, r31:30):<<1:sat
+# CHECK: r17:16 = vmpyeh(r21:20,r31:30):<<1:sat
 0x50 0xde 0x34 0xea
-# CHECK: r17:16 += vmpyeh(r21:20, r31:30)
+# CHECK: r17:16 += vmpyeh(r21:20,r31:30)
 0xd0 0xde 0x14 0xea
-# CHECK: r17:16 += vmpyeh(r21:20, r31:30):sat
+# CHECK: r17:16 += vmpyeh(r21:20,r31:30):sat
 0xd0 0xde 0x94 0xea
-# CHECK: r17:16 += vmpyeh(r21:20, r31:30):<<1:sat
+# CHECK: r17:16 += vmpyeh(r21:20,r31:30):<<1:sat
 
 # Vector multiply halfwords
 0xb0 0xdf 0x15 0xe5
-# CHECK: r17:16 = vmpyh(r21, r31):sat
+# CHECK: r17:16 = vmpyh(r21,r31):sat
 0xb0 0xdf 0x95 0xe5
-# CHECK: r17:16 = vmpyh(r21, r31):<<1:sat
+# CHECK: r17:16 = vmpyh(r21,r31):<<1:sat
 0x30 0xdf 0x35 0xe7
-# CHECK: r17:16 += vmpyh(r21, r31)
+# CHECK: r17:16 += vmpyh(r21,r31)
 0xb0 0xdf 0x15 0xe7
-# CHECK: r17:16 += vmpyh(r21, r31):sat
+# CHECK: r17:16 += vmpyh(r21,r31):sat
 0xb0 0xdf 0x95 0xe7
-# CHECK: r17:16 += vmpyh(r21, r31):<<1:sat
+# CHECK: r17:16 += vmpyh(r21,r31):<<1:sat
 
 # Vector multiply halfwords with round and pack
 0xf1 0xdf 0x35 0xed
-# CHECK: r17 = vmpyh(r21, r31):rnd:sat
+# CHECK: r17 = vmpyh(r21,r31):rnd:sat
 0xf1 0xdf 0xb5 0xed
-# CHECK: r17 = vmpyh(r21, r31):<<1:rnd:sat
+# CHECK: r17 = vmpyh(r21,r31):<<1:rnd:sat
 
 # Vector multiply halfwords signed by unsigned
 0xf0 0xdf 0x15 0xe5
-# CHECK: r17:16 = vmpyhsu(r21, r31):sat
+# CHECK: r17:16 = vmpyhsu(r21,r31):sat
 0xf0 0xdf 0x95 0xe5
-# CHECK: r17:16 = vmpyhsu(r21, r31):<<1:sat
+# CHECK: r17:16 = vmpyhsu(r21,r31):<<1:sat
 0xb0 0xdf 0x75 0xe7
-# CHECK: r17:16 += vmpyhsu(r21, r31):sat
+# CHECK: r17:16 += vmpyhsu(r21,r31):sat
 0xb0 0xdf 0xf5 0xe7
-# CHECK: r17:16 += vmpyhsu(r21, r31):<<1:sat
+# CHECK: r17:16 += vmpyhsu(r21,r31):<<1:sat
 
 # Vector reduce multiply halfwords
 0x50 0xde 0x14 0xe8
-# CHECK: r17:16 = vrmpyh(r21:20, r31:30)
+# CHECK: r17:16 = vrmpyh(r21:20,r31:30)
 0x50 0xde 0x14 0xea
-# CHECK: r17:16 += vrmpyh(r21:20, r31:30)
+# CHECK: r17:16 += vrmpyh(r21:20,r31:30)
 
 # Vector multiply bytes
 0x30 0xdf 0x55 0xe5
-# CHECK: r17:16 = vmpybsu(r21, r31)
+# CHECK: r17:16 = vmpybsu(r21,r31)
 0x30 0xdf 0x95 0xe5
-# CHECK: r17:16 = vmpybu(r21, r31)
+# CHECK: r17:16 = vmpybu(r21,r31)
 0x30 0xdf 0x95 0xe7
-# CHECK: r17:16 += vmpybu(r21, r31)
+# CHECK: r17:16 += vmpybu(r21,r31)
 0x30 0xdf 0xd5 0xe7
-# CHECK: r17:16 += vmpybsu(r21, r31)
+# CHECK: r17:16 += vmpybsu(r21,r31)
 
 # Vector polynomial multiply halfwords
 0xf0 0xdf 0xd5 0xe5
-# CHECK: r17:16 = vpmpyh(r21, r31)
+# CHECK: r17:16 = vpmpyh(r21,r31)
 0xf0 0xdf 0xb5 0xe7
-# CHECK: r17:16 ^= vpmpyh(r21, r31)
+# CHECK: r17:16 ^= vpmpyh(r21,r31)
diff --git a/test/MC/Disassembler/Hexagon/xtype_perm.txt b/test/MC/Disassembler/Hexagon/xtype_perm.txt
index 91d2fc5ae698105d8b03edce031acc2b45cb5e95..e8173fb049c1d0f062b30393109637a3167cc4c0 100644
--- a/test/MC/Disassembler/Hexagon/xtype_perm.txt
+++ b/test/MC/Disassembler/Hexagon/xtype_perm.txt
@@ -3,7 +3,7 @@
 
 # CABAC decode bin
 0xd0 0xde 0xd4 0xc1
-# CHECK: r17:16 = decbin(r21:20, r31:30)
+# CHECK: r17:16 = decbin(r21:20,r31:30)
 
 # Saturate
 0x11 0xc0 0xd4 0x88
@@ -23,9 +23,9 @@
 
 # Vector align
 0x70 0xd4 0x1e 0xc2
-# CHECK: r17:16 = valignb(r21:20, r31:30, p3)
+# CHECK: r17:16 = valignb(r21:20,r31:30,p3)
 0x70 0xde 0x94 0xc2
-# CHECK: r17:16 = vspliceb(r21:20, r31:30, p3)
+# CHECK: r17:16 = vspliceb(r21:20,r31:30,p3)
 
 # Vector round and pack
 0x91 0xc0 0x94 0x88
@@ -59,13 +59,13 @@
 
 # Vector shuffle
 0x50 0xde 0x14 0xc1
-# CHECK: r17:16 = shuffeb(r21:20, r31:30)
+# CHECK: r17:16 = shuffeb(r21:20,r31:30)
 0x90 0xd4 0x1e 0xc1
-# CHECK: r17:16 = shuffob(r21:20, r31:30)
+# CHECK: r17:16 = shuffob(r21:20,r31:30)
 0xd0 0xde 0x14 0xc1
-# CHECK: r17:16 = shuffeh(r21:20, r31:30)
+# CHECK: r17:16 = shuffeh(r21:20,r31:30)
 0x10 0xd4 0x9e 0xc1
-# CHECK: r17:16 = shuffoh(r21:20, r31:30)
+# CHECK: r17:16 = shuffoh(r21:20,r31:30)
 
 # Vector splat bytes
 0xf1 0xc0 0x55 0x8c
@@ -77,9 +77,9 @@
 
 # Vector splice
 0x70 0xde 0x94 0xc0
-# CHECK: r17:16 = vspliceb(r21:20, r31:30, #3)
+# CHECK: r17:16 = vspliceb(r21:20,r31:30,#3)
 0x70 0xde 0x94 0xc2
-# CHECK: r17:16 = vspliceb(r21:20, r31:30, p3)
+# CHECK: r17:16 = vspliceb(r21:20,r31:30,p3)
 
 # Vector sign extend
 0x10 0xc0 0x15 0x84
@@ -93,9 +93,9 @@
 0x51 0xc0 0x94 0x88
 # CHECK: r17 = vtrunehb(r21:20)
 0x50 0xde 0x94 0xc1
-# CHECK: r17:16 = vtrunewh(r21:20, r31:30)
+# CHECK: r17:16 = vtrunewh(r21:20,r31:30)
 0x90 0xde 0x94 0xc1
-# CHECK: r17:16 = vtrunowh(r21:20, r31:30)
+# CHECK: r17:16 = vtrunowh(r21:20,r31:30)
 
 # Vector zero extend
 0x50 0xc0 0x15 0x84
diff --git a/test/MC/Disassembler/Hexagon/xtype_pred.txt b/test/MC/Disassembler/Hexagon/xtype_pred.txt
index cec6d1be0f1054b30e842cb09fedb8b7283b9178..816eef58a0993dd53be973234b0683744ed089f8 100644
--- a/test/MC/Disassembler/Hexagon/xtype_pred.txt
+++ b/test/MC/Disassembler/Hexagon/xtype_pred.txt
@@ -3,59 +3,59 @@
 
 # Bounds check
 0x83 0xf4 0x10 0xd2
-# CHECK: p3 = boundscheck(r17:16, r21:20):raw:lo
+# CHECK: p3 = boundscheck(r17:16,r21:20):raw:lo
 0xa3 0xf4 0x10 0xd2
-# CHECK: p3 = boundscheck(r17:16, r21:20):raw:hi
+# CHECK: p3 = boundscheck(r17:16,r21:20):raw:hi
 
 # Compare byte
 0x43 0xd5 0xd1 0xc7
-# CHECK: p3 = cmpb.gt(r17, r21)
+# CHECK: p3 = cmpb.gt(r17,r21)
 0xc3 0xd5 0xd1 0xc7
-# CHECK: p3 = cmpb.eq(r17, r21)
+# CHECK: p3 = cmpb.eq(r17,r21)
 0xe3 0xd5 0xd1 0xc7
-# CHECK: p3 = cmpb.gtu(r17, r21)
+# CHECK: p3 = cmpb.gtu(r17,r21)
 0xa3 0xc2 0x11 0xdd
-# CHECK: p3 = cmpb.eq(r17, #21)
+# CHECK: p3 = cmpb.eq(r17,#21)
 0xa3 0xc2 0x31 0xdd
-# CHECK: p3 = cmpb.gt(r17, #21)
+# CHECK: p3 = cmpb.gt(r17,#21)
 0xa3 0xc2 0x51 0xdd
-# CHECK: p3 = cmpb.gtu(r17, #21)
+# CHECK: p3 = cmpb.gtu(r17,#21)
 
 # Compare half
 0x63 0xd5 0xd1 0xc7
-# CHECK: p3 = cmph.eq(r17, r21)
+# CHECK: p3 = cmph.eq(r17,r21)
 0x83 0xd5 0xd1 0xc7
-# CHECK: p3 = cmph.gt(r17, r21)
+# CHECK: p3 = cmph.gt(r17,r21)
 0xa3 0xd5 0xd1 0xc7
-# CHECK: p3 = cmph.gtu(r17, r21)
+# CHECK: p3 = cmph.gtu(r17,r21)
 0xab 0xc2 0x11 0xdd
-# CHECK: p3 = cmph.eq(r17, #21)
+# CHECK: p3 = cmph.eq(r17,#21)
 0xab 0xc2 0x31 0xdd
-# CHECK: p3 = cmph.gt(r17, #21)
+# CHECK: p3 = cmph.gt(r17,#21)
 0xab 0xc2 0x51 0xdd
-# CHECK: p3 = cmph.gtu(r17, #21)
+# CHECK: p3 = cmph.gtu(r17,#21)
 
 # Compare doublewords
 0x03 0xde 0x94 0xd2
-# CHECK: p3 = cmp.eq(r21:20, r31:30)
+# CHECK: p3 = cmp.eq(r21:20,r31:30)
 0x43 0xde 0x94 0xd2
-# CHECK: p3 = cmp.gt(r21:20, r31:30)
+# CHECK: p3 = cmp.gt(r21:20,r31:30)
 0x83 0xde 0x94 0xd2
-# CHECK: p3 = cmp.gtu(r21:20, r31:30)
+# CHECK: p3 = cmp.gtu(r21:20,r31:30)
 
 # Compare bitmask
 0x03 0xd5 0x91 0x85
-# CHECK: p3 = bitsclr(r17, #21)
+# CHECK: p3 = bitsclr(r17,#21)
 0x03 0xd5 0xb1 0x85
-# CHECK: p3 = !bitsclr(r17, #21)
+# CHECK: p3 = !bitsclr(r17,#21)
 0x03 0xd5 0x51 0xc7
-# CHECK: p3 = bitsset(r17, r21)
+# CHECK: p3 = bitsset(r17,r21)
 0x03 0xd5 0x71 0xc7
-# CHECK: p3 = !bitsset(r17, r21)
+# CHECK: p3 = !bitsset(r17,r21)
 0x03 0xd5 0x91 0xc7
-# CHECK: p3 = bitsclr(r17, r21)
+# CHECK: p3 = bitsclr(r17,r21)
 0x03 0xd5 0xb1 0xc7
-# CHECK: p3 = !bitsclr(r17, r21)
+# CHECK: p3 = !bitsclr(r17,r21)
 
 # mask generate from predicate
 0x10 0xc3 0x00 0x86
@@ -63,7 +63,7 @@
 
 # Check for TLB match
 0x63 0xf5 0x10 0xd2
-# CHECK: p3 = tlbmatch(r17:16, r21)
+# CHECK: p3 = tlbmatch(r17:16,r21)
 
 # Predicate Transfer
 0x03 0xc0 0x45 0x85
@@ -73,64 +73,64 @@
 
 # Test bit
 0x03 0xd5 0x11 0x85
-# CHECK: p3 = tstbit(r17, #21)
+# CHECK: p3 = tstbit(r17,#21)
 0x03 0xd5 0x31 0x85
-# CHECK: p3 = !tstbit(r17, #21)
+# CHECK: p3 = !tstbit(r17,#21)
 0x03 0xd5 0x11 0xc7
-# CHECK: p3 = tstbit(r17, r21)
+# CHECK: p3 = tstbit(r17,r21)
 0x03 0xd5 0x31 0xc7
-# CHECK: p3 = !tstbit(r17, r21)
+# CHECK: p3 = !tstbit(r17,r21)
 
 # Vector compare halfwords
 0x63 0xde 0x14 0xd2
-# CHECK: p3 = vcmph.eq(r21:20, r31:30)
+# CHECK: p3 = vcmph.eq(r21:20,r31:30)
 0x83 0xde 0x14 0xd2
-# CHECK: p3 = vcmph.gt(r21:20, r31:30)
+# CHECK: p3 = vcmph.gt(r21:20,r31:30)
 0xa3 0xde 0x14 0xd2
-# CHECK: p3 = vcmph.gtu(r21:20, r31:30)
+# CHECK: p3 = vcmph.gtu(r21:20,r31:30)
 0xeb 0xc3 0x14 0xdc
-# CHECK: p3 = vcmph.eq(r21:20, #31)
+# CHECK: p3 = vcmph.eq(r21:20,#31)
 0xeb 0xc3 0x34 0xdc
-# CHECK: p3 = vcmph.gt(r21:20, #31)
+# CHECK: p3 = vcmph.gt(r21:20,#31)
 0xeb 0xc3 0x54 0xdc
-# CHECK: p3 = vcmph.gtu(r21:20, #31)
+# CHECK: p3 = vcmph.gtu(r21:20,#31)
 
 # Vector compare bytes for any match
 0x03 0xfe 0x14 0xd2
-# CHECK: p3 = any8(vcmpb.eq(r21:20, r31:30))
+# CHECK: p3 = any8(vcmpb.eq(r21:20,r31:30))
 
 # Vector compare bytes
 0x63 0xde 0x14 0xd2
-# CHECK: p3 = vcmph.eq(r21:20, r31:30)
+# CHECK: p3 = vcmph.eq(r21:20,r31:30)
 0x83 0xde 0x14 0xd2
-# CHECK: p3 = vcmph.gt(r21:20, r31:30)
+# CHECK: p3 = vcmph.gt(r21:20,r31:30)
 0xa3 0xde 0x14 0xd2
-# CHECK: p3 = vcmph.gtu(r21:20, r31:30)
+# CHECK: p3 = vcmph.gtu(r21:20,r31:30)
 0xeb 0xc3 0x14 0xdc
-# CHECK: p3 = vcmph.eq(r21:20, #31)
+# CHECK: p3 = vcmph.eq(r21:20,#31)
 0xeb 0xc3 0x34 0xdc
-# CHECK: p3 = vcmph.gt(r21:20, #31)
+# CHECK: p3 = vcmph.gt(r21:20,#31)
 0xeb 0xc3 0x54 0xdc
-# CHECK: p3 = vcmph.gtu(r21:20, #31)
+# CHECK: p3 = vcmph.gtu(r21:20,#31)
 
 # Vector compare words
 0x03 0xde 0x14 0xd2
-# CHECK: p3 = vcmpw.eq(r21:20, r31:30)
+# CHECK: p3 = vcmpw.eq(r21:20,r31:30)
 0x23 0xde 0x14 0xd2
-# CHECK: p3 = vcmpw.gt(r21:20, r31:30)
+# CHECK: p3 = vcmpw.gt(r21:20,r31:30)
 0x43 0xde 0x14 0xd2
-# CHECK: p3 = vcmpw.gtu(r21:20, r31:30)
+# CHECK: p3 = vcmpw.gtu(r21:20,r31:30)
 0xf3 0xc3 0x14 0xdc
-# CHECK: p3 = vcmpw.eq(r21:20, #31)
+# CHECK: p3 = vcmpw.eq(r21:20,#31)
 0xf3 0xc3 0x34 0xdc
-# CHECK: p3 = vcmpw.gt(r21:20, #31)
+# CHECK: p3 = vcmpw.gt(r21:20,#31)
 0xf3 0xc3 0x54 0xdc
-# CHECK: p3 = vcmpw.gtu(r21:20, #31)
+# CHECK: p3 = vcmpw.gtu(r21:20,#31)
 
 # Viterbi pack even and odd predicate bits
 0x11 0xc2 0x03 0x89
-# CHECK: r17 = vitpack(p3, p2)
+# CHECK: r17 = vitpack(p3,p2)
 
 # Vector mux
 0x70 0xde 0x14 0xd1
-# CHECK: r17:16 = vmux(p3, r21:20, r31:30)
+# CHECK: r17:16 = vmux(p3,r21:20,r31:30)
diff --git a/test/MC/Disassembler/Hexagon/xtype_shift.txt b/test/MC/Disassembler/Hexagon/xtype_shift.txt
index e2d6816c1cac769fd7a9c5cdcb791cb05de18547..d5688c962cfee72aeab96c51b82f2985698f00f7 100644
--- a/test/MC/Disassembler/Hexagon/xtype_shift.txt
+++ b/test/MC/Disassembler/Hexagon/xtype_shift.txt
@@ -3,258 +3,258 @@
 
 # Shift by immediate
 0x10 0xdf 0x14 0x80
-# CHECK: r17:16 = asr(r21:20, #31)
+# CHECK: r17:16 = asr(r21:20,#31)
 0x30 0xdf 0x14 0x80
-# CHECK: r17:16 = lsr(r21:20, #31)
+# CHECK: r17:16 = lsr(r21:20,#31)
 0x50 0xdf 0x14 0x80
-# CHECK: r17:16 = asl(r21:20, #31)
+# CHECK: r17:16 = asl(r21:20,#31)
 0x11 0xdf 0x15 0x8c
-# CHECK: r17 = asr(r21, #31)
+# CHECK: r17 = asr(r21,#31)
 0x31 0xdf 0x15 0x8c
-# CHECK: r17 = lsr(r21, #31)
+# CHECK: r17 = lsr(r21,#31)
 0x51 0xdf 0x15 0x8c
-# CHECK: r17 = asl(r21, #31)
+# CHECK: r17 = asl(r21,#31)
 
 # Shift by immediate and accumulate
 0x10 0xdf 0x14 0x82
-# CHECK: r17:16 -= asr(r21:20, #31)
+# CHECK: r17:16 -= asr(r21:20,#31)
 0x30 0xdf 0x14 0x82
-# CHECK: r17:16 -= lsr(r21:20, #31)
+# CHECK: r17:16 -= lsr(r21:20,#31)
 0x50 0xdf 0x14 0x82
-# CHECK: r17:16 -= asl(r21:20, #31)
+# CHECK: r17:16 -= asl(r21:20,#31)
 0x90 0xdf 0x14 0x82
-# CHECK: r17:16 += asr(r21:20, #31)
+# CHECK: r17:16 += asr(r21:20,#31)
 0xb0 0xdf 0x14 0x82
-# CHECK: r17:16 += lsr(r21:20, #31)
+# CHECK: r17:16 += lsr(r21:20,#31)
 0xd0 0xdf 0x14 0x82
-# CHECK: r17:16 += asl(r21:20, #31)
+# CHECK: r17:16 += asl(r21:20,#31)
 0x11 0xdf 0x15 0x8e
-# CHECK: r17 -= asr(r21, #31)
+# CHECK: r17 -= asr(r21,#31)
 0x31 0xdf 0x15 0x8e
-# CHECK: r17 -= lsr(r21, #31)
+# CHECK: r17 -= lsr(r21,#31)
 0x51 0xdf 0x15 0x8e
-# CHECK: r17 -= asl(r21, #31)
+# CHECK: r17 -= asl(r21,#31)
 0x91 0xdf 0x15 0x8e
-# CHECK: r17 += asr(r21, #31)
+# CHECK: r17 += asr(r21,#31)
 0xb1 0xdf 0x15 0x8e
-# CHECK: r17 += lsr(r21, #31)
+# CHECK: r17 += lsr(r21,#31)
 0xd1 0xdf 0x15 0x8e
-# CHECK: r17 += asl(r21, #31)
+# CHECK: r17 += asl(r21,#31)
 0x4c 0xf7 0x11 0xde
-# CHECK: r17 = add(#21, asl(r17, #23))
+# CHECK: r17 = add(#21,asl(r17,#23))
 0x4e 0xf7 0x11 0xde
-# CHECK: r17 = sub(#21, asl(r17, #23))
+# CHECK: r17 = sub(#21,asl(r17,#23))
 0x5c 0xf7 0x11 0xde
-# CHECK: r17 = add(#21, lsr(r17, #23))
+# CHECK: r17 = add(#21,lsr(r17,#23))
 0x5e 0xf7 0x11 0xde
-# CHECK: r17 = sub(#21, lsr(r17, #23))
+# CHECK: r17 = sub(#21,lsr(r17,#23))
 
 # Shift by immediate and add
 0xf1 0xd5 0x1f 0xc4
-# CHECK: r17 = addasl(r21, r31, #7)
+# CHECK: r17 = addasl(r21,r31,#7)
 
 # Shift by immediate and logical
 0x10 0xdf 0x54 0x82
-# CHECK: r17:16 &= asr(r21:20, #31)
+# CHECK: r17:16 &= asr(r21:20,#31)
 0x30 0xdf 0x54 0x82
-# CHECK: r17:16 &= lsr(r21:20, #31)
+# CHECK: r17:16 &= lsr(r21:20,#31)
 0x50 0xdf 0x54 0x82
-# CHECK: r17:16 &= asl(r21:20, #31)
+# CHECK: r17:16 &= asl(r21:20,#31)
 0x90 0xdf 0x54 0x82
-# CHECK: r17:16 |= asr(r21:20, #31)
+# CHECK: r17:16 |= asr(r21:20,#31)
 0xb0 0xdf 0x54 0x82
-# CHECK: r17:16 |= lsr(r21:20, #31)
+# CHECK: r17:16 |= lsr(r21:20,#31)
 0xd0 0xdf 0x54 0x82
-# CHECK: r17:16 |= asl(r21:20, #31)
+# CHECK: r17:16 |= asl(r21:20,#31)
 0x30 0xdf 0x94 0x82
-# CHECK: r17:16 ^= lsr(r21:20, #31)
+# CHECK: r17:16 ^= lsr(r21:20,#31)
 0x50 0xdf 0x94 0x82
-# CHECK: r17:16 ^= asl(r21:20, #31)
+# CHECK: r17:16 ^= asl(r21:20,#31)
 0x11 0xdf 0x55 0x8e
-# CHECK: r17 &= asr(r21, #31)
+# CHECK: r17 &= asr(r21,#31)
 0x31 0xdf 0x55 0x8e
-# CHECK: r17 &= lsr(r21, #31)
+# CHECK: r17 &= lsr(r21,#31)
 0x51 0xdf 0x55 0x8e
-# CHECK: r17 &= asl(r21, #31)
+# CHECK: r17 &= asl(r21,#31)
 0x91 0xdf 0x55 0x8e
-# CHECK: r17 |= asr(r21, #31)
+# CHECK: r17 |= asr(r21,#31)
 0xb1 0xdf 0x55 0x8e
-# CHECK: r17 |= lsr(r21, #31)
+# CHECK: r17 |= lsr(r21,#31)
 0xd1 0xdf 0x55 0x8e
-# CHECK: r17 |= asl(r21, #31)
+# CHECK: r17 |= asl(r21,#31)
 0x31 0xdf 0x95 0x8e
-# CHECK: r17 ^= lsr(r21, #31)
+# CHECK: r17 ^= lsr(r21,#31)
 0x51 0xdf 0x95 0x8e
-# CHECK: r17 ^= asl(r21, #31)
+# CHECK: r17 ^= asl(r21,#31)
 0x48 0xff 0x11 0xde
-# CHECK: r17 = and(#21, asl(r17, #31))
+# CHECK: r17 = and(#21,asl(r17,#31))
 0x4a 0xff 0x11 0xde
-# CHECK: r17 = or(#21, asl(r17, #31))
+# CHECK: r17 = or(#21,asl(r17,#31))
 0x58 0xff 0x11 0xde
-# CHECK: r17 = and(#21, lsr(r17, #31))
+# CHECK: r17 = and(#21,lsr(r17,#31))
 0x5a 0xff 0x11 0xde
-# CHECK: r17 = or(#21, lsr(r17, #31))
+# CHECK: r17 = or(#21,lsr(r17,#31))
 
 # Shift right by immediate with rounding
 0xf0 0xdf 0xd4 0x80
-# CHECK: r17:16 = asr(r21:20, #31):rnd
+# CHECK: r17:16 = asr(r21:20,#31):rnd
 0x11 0xdf 0x55 0x8c
-# CHECK: r17 = asr(r21, #31):rnd
+# CHECK: r17 = asr(r21,#31):rnd
 
 # Shift left by immediate with saturation
 0x51 0xdf 0x55 0x8c
-# CHECK: r17 = asl(r21, #31):sat
+# CHECK: r17 = asl(r21,#31):sat
 
 # Shift by register
 0x10 0xdf 0x94 0xc3
-# CHECK: r17:16 = asr(r21:20, r31)
+# CHECK: r17:16 = asr(r21:20,r31)
 0x50 0xdf 0x94 0xc3
-# CHECK: r17:16 = lsr(r21:20, r31)
+# CHECK: r17:16 = lsr(r21:20,r31)
 0x90 0xdf 0x94 0xc3
-# CHECK: r17:16 = asl(r21:20, r31)
+# CHECK: r17:16 = asl(r21:20,r31)
 0xd0 0xdf 0x94 0xc3
-# CHECK: r17:16 = lsl(r21:20, r31)
+# CHECK: r17:16 = lsl(r21:20,r31)
 0x11 0xdf 0x55 0xc6
-# CHECK: r17 = asr(r21, r31)
+# CHECK: r17 = asr(r21,r31)
 0x51 0xdf 0x55 0xc6
-# CHECK: r17 = lsr(r21, r31)
+# CHECK: r17 = lsr(r21,r31)
 0x91 0xdf 0x55 0xc6
-# CHECK: r17 = asl(r21, r31)
+# CHECK: r17 = asl(r21,r31)
 0xd1 0xdf 0x55 0xc6
-# CHECK: r17 = lsl(r21, r31)
+# CHECK: r17 = lsl(r21,r31)
 0xf1 0xdf 0x8a 0xc6
-# CHECK: r17 = lsl(#21, r31)
+# CHECK: r17 = lsl(#21,r31)
 
 # Shift by register and accumulate
 0x10 0xdf 0x94 0xcb
-# CHECK: r17:16 -= asr(r21:20, r31)
+# CHECK: r17:16 -= asr(r21:20,r31)
 0x50 0xdf 0x94 0xcb
-# CHECK: r17:16 -= lsr(r21:20, r31)
+# CHECK: r17:16 -= lsr(r21:20,r31)
 0x90 0xdf 0x94 0xcb
-# CHECK: r17:16 -= asl(r21:20, r31)
+# CHECK: r17:16 -= asl(r21:20,r31)
 0xd0 0xdf 0x94 0xcb
-# CHECK: r17:16 -= lsl(r21:20, r31)
+# CHECK: r17:16 -= lsl(r21:20,r31)
 0x10 0xdf 0xd4 0xcb
-# CHECK: r17:16 += asr(r21:20, r31)
+# CHECK: r17:16 += asr(r21:20,r31)
 0x50 0xdf 0xd4 0xcb
-# CHECK: r17:16 += lsr(r21:20, r31)
+# CHECK: r17:16 += lsr(r21:20,r31)
 0x90 0xdf 0xd4 0xcb
-# CHECK: r17:16 += asl(r21:20, r31)
+# CHECK: r17:16 += asl(r21:20,r31)
 0xd0 0xdf 0xd4 0xcb
-# CHECK: r17:16 += lsl(r21:20, r31)
+# CHECK: r17:16 += lsl(r21:20,r31)
 0x11 0xdf 0x95 0xcc
-# CHECK: r17 -= asr(r21, r31)
+# CHECK: r17 -= asr(r21,r31)
 0x51 0xdf 0x95 0xcc
-# CHECK: r17 -= lsr(r21, r31)
+# CHECK: r17 -= lsr(r21,r31)
 0x91 0xdf 0x95 0xcc
-# CHECK: r17 -= asl(r21, r31)
+# CHECK: r17 -= asl(r21,r31)
 0xd1 0xdf 0x95 0xcc
-# CHECK: r17 -= lsl(r21, r31)
+# CHECK: r17 -= lsl(r21,r31)
 0x11 0xdf 0xd5 0xcc
-# CHECK: r17 += asr(r21, r31)
+# CHECK: r17 += asr(r21,r31)
 0x51 0xdf 0xd5 0xcc
-# CHECK: r17 += lsr(r21, r31)
+# CHECK: r17 += lsr(r21,r31)
 0x91 0xdf 0xd5 0xcc
-# CHECK: r17 += asl(r21, r31)
+# CHECK: r17 += asl(r21,r31)
 0xd1 0xdf 0xd5 0xcc
-# CHECK: r17 += lsl(r21, r31)
+# CHECK: r17 += lsl(r21,r31)
 
 # Shift by register and logical
 0x10 0xdf 0x14 0xcb
-# CHECK: r17:16 |= asr(r21:20, r31)
+# CHECK: r17:16 |= asr(r21:20,r31)
 0x50 0xdf 0x14 0xcb
-# CHECK: r17:16 |= lsr(r21:20, r31)
+# CHECK: r17:16 |= lsr(r21:20,r31)
 0x90 0xdf 0x14 0xcb
-# CHECK: r17:16 |= asl(r21:20, r31)
+# CHECK: r17:16 |= asl(r21:20,r31)
 0xd0 0xdf 0x14 0xcb
-# CHECK: r17:16 |= lsl(r21:20, r31)
+# CHECK: r17:16 |= lsl(r21:20,r31)
 0x10 0xdf 0x54 0xcb
-# CHECK: r17:16 &= asr(r21:20, r31)
+# CHECK: r17:16 &= asr(r21:20,r31)
 0x50 0xdf 0x54 0xcb
-# CHECK: r17:16 &= lsr(r21:20, r31)
+# CHECK: r17:16 &= lsr(r21:20,r31)
 0x90 0xdf 0x54 0xcb
-# CHECK: r17:16 &= asl(r21:20, r31)
+# CHECK: r17:16 &= asl(r21:20,r31)
 0xd0 0xdf 0x54 0xcb
-# CHECK: r17:16 &= lsl(r21:20, r31)
+# CHECK: r17:16 &= lsl(r21:20,r31)
 0x10 0xdf 0x74 0xcb
-# CHECK: r17:16 ^= asr(r21:20, r31)
+# CHECK: r17:16 ^= asr(r21:20,r31)
 0x50 0xdf 0x74 0xcb
-# CHECK: r17:16 ^= lsr(r21:20, r31)
+# CHECK: r17:16 ^= lsr(r21:20,r31)
 0x90 0xdf 0x74 0xcb
-# CHECK: r17:16 ^= asl(r21:20, r31)
+# CHECK: r17:16 ^= asl(r21:20,r31)
 0xd0 0xdf 0x74 0xcb
-# CHECK: r17:16 ^= lsl(r21:20, r31)
+# CHECK: r17:16 ^= lsl(r21:20,r31)
 0x11 0xdf 0x15 0xcc
-# CHECK: r17 |= asr(r21, r31)
+# CHECK: r17 |= asr(r21,r31)
 0x51 0xdf 0x15 0xcc
-# CHECK: r17 |= lsr(r21, r31)
+# CHECK: r17 |= lsr(r21,r31)
 0x91 0xdf 0x15 0xcc
-# CHECK: r17 |= asl(r21, r31)
+# CHECK: r17 |= asl(r21,r31)
 0xd1 0xdf 0x15 0xcc
-# CHECK: r17 |= lsl(r21, r31)
+# CHECK: r17 |= lsl(r21,r31)
 0x11 0xdf 0x55 0xcc
-# CHECK: r17 &= asr(r21, r31)
+# CHECK: r17 &= asr(r21,r31)
 0x51 0xdf 0x55 0xcc
-# CHECK: r17 &= lsr(r21, r31)
+# CHECK: r17 &= lsr(r21,r31)
 0x91 0xdf 0x55 0xcc
-# CHECK: r17 &= asl(r21, r31)
+# CHECK: r17 &= asl(r21,r31)
 0xd1 0xdf 0x55 0xcc
-# CHECK: r17 &= lsl(r21, r31)
+# CHECK: r17 &= lsl(r21,r31)
 
 # Shift by register with saturation
 0x11 0xdf 0x15 0xc6
-# CHECK: r17 = asr(r21, r31):sat
+# CHECK: r17 = asr(r21,r31):sat
 0x91 0xdf 0x15 0xc6
-# CHECK: r17 = asl(r21, r31):sat
+# CHECK: r17 = asl(r21,r31):sat
 
 # Vector shift halfwords by immediate
 0x10 0xc5 0x94 0x80
-# CHECK: r17:16 = vasrh(r21:20, #5)
+# CHECK: r17:16 = vasrh(r21:20,#5)
 0x30 0xc5 0x94 0x80
-# CHECK: r17:16 = vlsrh(r21:20, #5)
+# CHECK: r17:16 = vlsrh(r21:20,#5)
 0x50 0xc5 0x94 0x80
-# CHECK: r17:16 = vaslh(r21:20, #5)
+# CHECK: r17:16 = vaslh(r21:20,#5)
 
 # Vector arithmetic shift halfwords with round
 0x10 0xc5 0x34 0x80
-# CHECK: r17:16 = vasrh(r21:20, #5):raw
+# CHECK: r17:16 = vasrh(r21:20,#5):raw
 
 # Vector arithmetic shift halfwords with saturate and pack
 0x91 0xc5 0x74 0x88
-# CHECK: r17 = vasrhub(r21:20, #5):raw
+# CHECK: r17 = vasrhub(r21:20,#5):raw
 0xb1 0xc5 0x74 0x88
-# CHECK: r17 = vasrhub(r21:20, #5):sat
+# CHECK: r17 = vasrhub(r21:20,#5):sat
 
 # Vector shift halfwords by register
 0x10 0xdf 0x54 0xc3
-# CHECK: r17:16 = vasrh(r21:20, r31)
+# CHECK: r17:16 = vasrh(r21:20,r31)
 0x50 0xdf 0x54 0xc3
-# CHECK: r17:16 = vlsrh(r21:20, r31)
+# CHECK: r17:16 = vlsrh(r21:20,r31)
 0x90 0xdf 0x54 0xc3
-# CHECK: r17:16 = vaslh(r21:20, r31)
+# CHECK: r17:16 = vaslh(r21:20,r31)
 0xd0 0xdf 0x54 0xc3
-# CHECK: r17:16 = vlslh(r21:20, r31)
+# CHECK: r17:16 = vlslh(r21:20,r31)
 
 # Vector shift words by immediate
 0x10 0xdf 0x54 0x80
-# CHECK: r17:16 = vasrw(r21:20, #31)
+# CHECK: r17:16 = vasrw(r21:20,#31)
 0x30 0xdf 0x54 0x80
-# CHECK: r17:16 = vlsrw(r21:20, #31)
+# CHECK: r17:16 = vlsrw(r21:20,#31)
 0x50 0xdf 0x54 0x80
-# CHECK: r17:16 = vaslw(r21:20, #31)
+# CHECK: r17:16 = vaslw(r21:20,#31)
 
 # Vector shift words by register
 0x10 0xdf 0x14 0xc3
-# CHECK: r17:16 = vasrw(r21:20, r31)
+# CHECK: r17:16 = vasrw(r21:20,r31)
 0x50 0xdf 0x14 0xc3
-# CHECK: r17:16 = vlsrw(r21:20, r31)
+# CHECK: r17:16 = vlsrw(r21:20,r31)
 0x90 0xdf 0x14 0xc3
-# CHECK: r17:16 = vaslw(r21:20, r31)
+# CHECK: r17:16 = vaslw(r21:20,r31)
 0xd0 0xdf 0x14 0xc3
-# CHECK: r17:16 = vlslw(r21:20, r31)
+# CHECK: r17:16 = vlslw(r21:20,r31)
 
 # Vector shift words with truncate and pack
 0x51 0xdf 0xd4 0x88
-# CHECK: r17 = vasrw(r21:20, #31)
+# CHECK: r17 = vasrw(r21:20,#31)
 0x51 0xdf 0x14 0xc5
-# CHECK: r17 = vasrw(r21:20, r31)
+# CHECK: r17 = vasrw(r21:20,r31)
diff --git a/test/MC/Disassembler/PowerPC/vsx.txt b/test/MC/Disassembler/PowerPC/vsx.txt
index 3f8adc91245255d54410f592d611e99f9e03e20b..0c647737c3713c8f8e123981d211daccd1dfc319 100644
--- a/test/MC/Disassembler/PowerPC/vsx.txt
+++ b/test/MC/Disassembler/PowerPC/vsx.txt
@@ -525,8 +525,8 @@
 # CHECK: xxswapd 7, 63
 0xf0 0xff 0xfa 0x56
 
-# CHECK: mfvsrd 3, 0
-0x7c 0x03 0x00 0x66
+# CHECK: mfvsrd 3, 40
+0x7d 0x03 0x00 0x67
 
 # CHECK: mfvsrwz 5, 0
 0x7c 0x05 0x00 0xe6
diff --git a/test/MC/Disassembler/X86/x86-32.txt b/test/MC/Disassembler/X86/x86-32.txt
index f92d6057877bfcdc2ddf17ba6113fad405c56fa0..9dd49e51d91b835b3b315c76ce29836e1e0c4605 100644
--- a/test/MC/Disassembler/X86/x86-32.txt
+++ b/test/MC/Disassembler/X86/x86-32.txt
@@ -129,6 +129,9 @@
 # CHECK: invlpga
 0x0f 0x01 0xdf
 
+# CHECK: clzero
+0x0f,0x01,0xfc
+
 # CHECK: movl $0, -4(%ebp)
 0xc7 0x45 0xfc 0x00 0x00 0x00 0x00
 
@@ -517,9 +520,6 @@
 # CHECK: clwb (%eax)
 0x66 0x0f 0xae 0x30
 
-# CHECK: pcommit
-0x66 0x0f 0xae 0xf8
-
 # CHECK: vcvtph2ps %xmm0, %xmm0
 0xc4 0xe2 0x79 0x13 0xc0
 
diff --git a/test/MC/Disassembler/X86/x86-64.txt b/test/MC/Disassembler/X86/x86-64.txt
index 13e36df002a4d7691f5450a48f4f276263230956..de62b0ff1d7744c73c10f55f78cdbfbc52649098 100644
--- a/test/MC/Disassembler/X86/x86-64.txt
+++ b/test/MC/Disassembler/X86/x86-64.txt
@@ -185,10 +185,10 @@
 # CHECK: sha1msg2 (%rax), %xmm2
 0x0f 0x38 0xca 0x10
 
-# CHECK: sha256rnds2 (%rax), %xmm2
+# CHECK: sha256rnds2 %xmm0, (%rax), %xmm2
 0x0f 0x38 0xcb 0x10
 
-# CHECK: sha256rnds2 %xmm1, %xmm2
+# CHECK: sha256rnds2 %xmm0, %xmm1, %xmm2
 0x0f 0x38 0xcb 0xd1
 
 # CHECK: sha256msg1 %xmm1, %xmm2
diff --git a/test/MC/ELF/ARM/gnu-type-hash-diagnostics.s b/test/MC/ELF/ARM/gnu-type-hash-diagnostics.s
index eb364755c4d75d64ef6d2b833516bc13e2791c10..7dc656d5a4d76b2bb40b73a4a21458d5f297d1ec 100644
--- a/test/MC/ELF/ARM/gnu-type-hash-diagnostics.s
+++ b/test/MC/ELF/ARM/gnu-type-hash-diagnostics.s
@@ -7,3 +7,7 @@
 // CHECK: .type TYPE #32
 // CHECK:             ^
 
+  // For ARM, the comment character is '@', so we don't list '@<type>' as a
+  // valid option.
+  .section "foo", "a", @progbits
+// CHECK: error: expected '%<type>' or "<type>"
diff --git a/test/MC/ELF/gen-dwarf.s b/test/MC/ELF/gen-dwarf.s
index 4e773c79af281bbb3bde784fc8a1817b10501def..e00580926827d68000e38e8d881e6e6d5e8bd09c 100644
--- a/test/MC/ELF/gen-dwarf.s
+++ b/test/MC/ELF/gen-dwarf.s
@@ -3,7 +3,8 @@
 // RUN: llvm-mc -g -dwarf-version 2 -triple  i686-pc-linux-gnu %s -filetype=asm -o - | FileCheck --check-prefix=ASM --check-prefix=DWARF2 %s
 // RUN: llvm-mc -g -dwarf-version 3 -triple  i686-pc-linux-gnu %s -filetype=asm -o - | FileCheck --check-prefix=ASM --check-prefix=DWARF3 %s
 // RUN: llvm-mc -g -triple  i686-pc-linux-gnu %s -filetype=asm -o - | FileCheck --check-prefix=ASM --check-prefix=DWARF4 %s
-// RUN: not llvm-mc -g -dwarf-version 5  -triple  i686-pc-linux-gnu %s -filetype=asm -o - 2>&1 | FileCheck --check-prefix=DWARF5 %s
+// RUN: llvm-mc -g -dwarf-version 5  -triple  i686-pc-linux-gnu %s -filetype=asm -o - 2>&1 | FileCheck --check-prefix=DWARF5 %s
+// RUN: not llvm-mc -g -dwarf-version 6  -triple  i686-pc-linux-gnu %s -filetype=asm -o - 2>&1 | FileCheck --check-prefix=DWARF6 %s
 
 
 // Test that on ELF:
@@ -34,8 +35,9 @@ foo:
 // ASM: .section .debug_info
 
 // ASM: .section .debug_abbrev
-// ASM-NEXT: .Lsection_abbrev:
 // ASM-NEXT: [[ABBREV_LABEL:.Ltmp[0-9]+]]
+// DWARF5: .section .debug_abbrev
+// DWARF5-NEXT: [[ABBREV_LABEL:.Ltmp[0-9]+]]
 
 // Second instance of the section has the CU
 // ASM: .section .debug_info
@@ -44,6 +46,11 @@ foo:
 // DWARF3: .short 3
 // DWARF4: .short 4
 // ASM-NEXT: .long [[ABBREV_LABEL]]
+// DWARF5: .short 5
+// DWARF5-NEXT: .byte 1
+// DWARF5-NEXT: .byte 4
+// DWARF5-NEXT: .long [[ABBREV_LABEL]]
+
 // First .byte 1 is the abbreviation number for the compile_unit abbrev
 // ASM: .byte 1
 // ASM-NEXT: .long [[LINE_LABEL:.L[a-z0-9]+]]
@@ -52,4 +59,4 @@ foo:
 // ASM-NEXT: [[LINE_LABEL]]
 
 // DWARF1: Dwarf version 1 is not supported.
-// DWARF5: Dwarf version 5 is not supported.
+// DWARF6: Dwarf version 6 is not supported.
diff --git a/test/MC/ELF/section-metadata-err1.s b/test/MC/ELF/section-metadata-err1.s
new file mode 100644
index 0000000000000000000000000000000000000000..682f0e82f30d372a2a7abd80c4a00900f25c967b
--- /dev/null
+++ b/test/MC/ELF/section-metadata-err1.s
@@ -0,0 +1,5 @@
+// RUN: not llvm-mc -triple x86_64-pc-linux-gnu %s -o - 2>&1 | FileCheck %s
+
+// CHECK: error: symbol is not in a section: foo
+
+        .section .shf_metadata,"ao",@progbits,foo
diff --git a/test/MC/ELF/section-metadata-err2.s b/test/MC/ELF/section-metadata-err2.s
new file mode 100644
index 0000000000000000000000000000000000000000..1912f67d0e0f16f91b37aad418e85a3289c86622
--- /dev/null
+++ b/test/MC/ELF/section-metadata-err2.s
@@ -0,0 +1,6 @@
+// RUN: not llvm-mc -triple x86_64-pc-linux-gnu %s -o - 2>&1 | FileCheck %s
+
+// CHECK: error: symbol is not in a section: foo
+
+        .quad foo
+        .section .shf_metadata,"ao",@progbits,foo
diff --git a/test/MC/ELF/section-metadata-err3.s b/test/MC/ELF/section-metadata-err3.s
new file mode 100644
index 0000000000000000000000000000000000000000..388ca377fd42eb312b558d19b79a5b048fd00ce8
--- /dev/null
+++ b/test/MC/ELF/section-metadata-err3.s
@@ -0,0 +1,6 @@
+// RUN: not llvm-mc -triple x86_64-pc-linux-gnu %s -o - 2>&1 | FileCheck %s
+
+// CHECK: error: symbol is not in a section: foo
+
+        foo = 42
+        .section .shf_metadata,"ao",@progbits,foo
diff --git a/test/MC/ELF/section-metadata-err4.s b/test/MC/ELF/section-metadata-err4.s
new file mode 100644
index 0000000000000000000000000000000000000000..d7677d292f70e7a359b3b83036aab352c8b5a6f7
--- /dev/null
+++ b/test/MC/ELF/section-metadata-err4.s
@@ -0,0 +1,5 @@
+// RUN: not llvm-mc -triple x86_64-pc-linux-gnu %s -o - 2>&1 | FileCheck %s
+
+// CHECK: error: expected metadata symbol
+
+        .section .shf_metadata,"ao",@progbits
diff --git a/test/MC/ELF/section-numeric-invalid-type.s b/test/MC/ELF/section-numeric-invalid-type.s
new file mode 100644
index 0000000000000000000000000000000000000000..3ae071bc7c13f98018b81a70bfc098f68ac7568f
--- /dev/null
+++ b/test/MC/ELF/section-numeric-invalid-type.s
@@ -0,0 +1,14 @@
+// RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux-gnu %s -o - \
+// RUN:   | llvm-readobj -s -t | FileCheck --check-prefix=OBJ %s
+
+// RUN: not llvm-mc -filetype=asm -triple=x86_64-pc-linux-gnu %s -o - 2>&1 \
+// RUN:   | FileCheck --check-prefix=ASM %s
+
+  .section .sec,"a",@0x7fffffff
+
+// OBJ:      Section {
+// OBJ:        Name: .sec
+// OBJ-NEXT:   Type: (0x7FFFFFFF)
+// OBJ:      }
+
+// ASM: unsupported type 0x7fffffff for section .sec
diff --git a/test/MC/ELF/section-numeric-type.s b/test/MC/ELF/section-numeric-type.s
new file mode 100644
index 0000000000000000000000000000000000000000..2e51bd4eb187f20fd9a4510dc81be1c26b052871
--- /dev/null
+++ b/test/MC/ELF/section-numeric-type.s
@@ -0,0 +1,20 @@
+// RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux-gnu %s -o - \
+// RUN:   | llvm-readobj -s -t | FileCheck --check-prefix=OBJ %s
+
+// RUN: llvm-mc -filetype=asm -triple=x86_64-pc-linux-gnu %s -o - \
+// RUN:   | FileCheck --check-prefix=ASM %s
+
+  .section .sec1,"a",@0x70000001
+  .section .sec2,"a",@1879048193
+
+// OBJ:      Section {
+// OBJ:        Name: .sec1
+// OBJ-NEXT:   Type: SHT_X86_64_UNWIND (0x70000001)
+// OBJ:      }
+// OBJ:      Section {
+// OBJ:        Name: .sec2
+// OBJ-NEXT:   Type: SHT_X86_64_UNWIND (0x70000001)
+// OBJ:      }
+
+// ASM: .section  .sec1,"a",@unwind
+// ASM: .section  .sec2,"a",@unwind
diff --git a/test/MC/ELF/section-sym-err.s b/test/MC/ELF/section-sym-err.s
new file mode 100644
index 0000000000000000000000000000000000000000..789fee7c422ce056790a32d39db7ed685c1174d9
--- /dev/null
+++ b/test/MC/ELF/section-sym-err.s
@@ -0,0 +1,6 @@
+// RUN: not llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t.o 2>&1 | FileCheck %s
+
+.section foo
+foo:
+
+// CHECK: error: invalid symbol redefinition
diff --git a/test/MC/ELF/section-sym-err2.s b/test/MC/ELF/section-sym-err2.s
new file mode 100644
index 0000000000000000000000000000000000000000..27d8e9a9ac2493191837033440489c2a4d27e05a
--- /dev/null
+++ b/test/MC/ELF/section-sym-err2.s
@@ -0,0 +1,6 @@
+// RUN: not llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o %t.o 2>&1 | FileCheck %s
+
+foo:
+.section foo
+
+// CHECK: error: invalid symbol redefinition
diff --git a/test/MC/ELF/section-sym-redefine.s b/test/MC/ELF/section-sym-redefine.s
deleted file mode 100644
index 1f6dd5723af1e556ab63a29fe7082c7b2f1f25e7..0000000000000000000000000000000000000000
--- a/test/MC/ELF/section-sym-redefine.s
+++ /dev/null
@@ -1,138 +0,0 @@
-// RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-readobj  -t -r --expand-relocs | FileCheck %s
-
-// Local symbol overriding section.
-.section x1,"a",@progbits
-.local  x1
-.comm   x1,4,4
-.long x1  // reloc: .bss + 0
-
-// Section declared after local. Local symbol wins.
-.local  x2
-.comm   x2,4,4
-.section x2,"a",@progbits
-.long x2  // reloc: .bss + 4
-
-// No overriding symbol.
-.section x3,"a",@progbits
-.long x3  // reloc: x3(section) + 0
-
-// Global vs section.
-.section x4,"a",@progbits
-.long 0
-.globl x4
-.section foo, "a", @progbits
-x4:
-.long 0
-.long x4  // reloc: x4(global) + 0
-
-// Global vs implicit section
-.globl .data
-.data:
-.long 42
-.long .data // reloc: .data(global) + 0
-
-// CHECK: Relocations [
-// CHECK:   Section (4) .relax1 {
-// CHECK:     Relocation {
-// CHECK:       Offset: 0x0
-// CHECK:       Type: R_X86_64_32 (10)
-// CHECK:       Symbol: .bss (3)
-// CHECK:       Addend: 0x0
-// CHECK:     }
-// CHECK:   }
-// CHECK:   Section (7) .relax2 {
-// CHECK:     Relocation {
-// CHECK:       Offset: 0x0
-// CHECK:       Type: R_X86_64_32 (10)
-// CHECK:       Symbol: .bss (3)
-// CHECK:       Addend: 0x4
-// CHECK:     }
-// CHECK:   }
-// CHECK:   Section (9) .relax3 {
-// CHECK:     Relocation {
-// CHECK:       Offset: 0x0
-// CHECK:       Type: R_X86_64_32 (10)
-// CHECK:       Symbol: x3 (4)
-// CHECK:       Addend: 0x0
-// CHECK:     }
-// CHECK:   }
-// CHECK:   Section (12) .relafoo {
-// CHECK:     Relocation {
-// CHECK:       Offset: 0x4
-// CHECK:       Type: R_X86_64_32 (10)
-// CHECK:       Symbol: x4 (6)
-// CHECK:       Addend: 0x0
-// CHECK:     }
-// CHECK:     Relocation {
-// CHECK:       Offset: 0xC
-// CHECK:       Type: R_X86_64_32 (10)
-// CHECK:       Symbol: .data (5)
-// CHECK:       Addend: 0x0
-// CHECK:     }
-// CHECK:   }
-// CHECK: ]
-// CHECK: Symbols [
-// CHECK:   Symbol {
-// CHECK:     Name:  (0)
-// CHECK:     Value: 0x0
-// CHECK:     Size: 0
-// CHECK:     Binding: Local (0x0)
-// CHECK:     Type: None (0x0)
-// CHECK:     Other: 0
-// CHECK:     Section: Undefined (0x0)
-// CHECK:   }
-// CHECK:   Symbol {
-// CHECK:     Name: x1 (67)
-// CHECK:     Value: 0x0
-// CHECK:     Size: 4
-// CHECK:     Binding: Local (0x0)
-// CHECK:     Type: Object (0x1)
-// CHECK:     Other: 0
-// CHECK:     Section: .bss (0x5)
-// CHECK:   }
-// CHECK:   Symbol {
-// CHECK:     Name: x2 (59)
-// CHECK:     Value: 0x4
-// CHECK:     Size: 4
-// CHECK:     Binding: Local (0x0)
-// CHECK:     Type: Object (0x1)
-// CHECK:     Other: 0
-// CHECK:     Section: .bss (0x5)
-// CHECK:   }
-// CHECK:   Symbol {
-// CHECK:     Name:  (0)
-// CHECK:     Value: 0x0
-// CHECK:     Size: 0
-// CHECK:     Binding: Local (0x0)
-// CHECK:     Type: Section (0x3)
-// CHECK:     Other: 0
-// CHECK:     Section: .bss (0x5)
-// CHECK:   }
-// CHECK:   Symbol {
-// CHECK:     Name:  (0)
-// CHECK:     Value: 0x0
-// CHECK:     Size: 0
-// CHECK:     Binding: Local (0x0)
-// CHECK:     Type: Section (0x3)
-// CHECK:     Other: 0
-// CHECK:     Section: x3 (0x8)
-// CHECK:   }
-// CHECK:   Symbol {
-// CHECK:     Name: .data (37)
-// CHECK:     Value: 0x8
-// CHECK:     Size: 0
-// CHECK:     Binding: Global (0x1)
-// CHECK:     Type: None (0x0)
-// CHECK:     Other: 0
-// CHECK:     Section: foo (0xB)
-// CHECK:   }
-// CHECK:   Symbol {
-// CHECK:     Name: x4 (43)
-// CHECK:     Value: 0x0
-// CHECK:     Size: 0
-// CHECK:     Binding: Global (0x1)
-// CHECK:     Type: None (0x0)
-// CHECK:     Other: 0
-// CHECK:     Section: foo (0xB)
-// CHECK:   }
-// CHECK: ]
diff --git a/test/MC/ELF/section.s b/test/MC/ELF/section.s
index 0277be5223617073168f48ed3d953d71529f6e4d..03a0f22e580b98f0116d22baf349464024fbdd7e 100644
--- a/test/MC/ELF/section.s
+++ b/test/MC/ELF/section.s
@@ -1,4 +1,5 @@
 // RUN: llvm-mc -filetype=obj -triple x86_64-pc-linux-gnu %s -o - | llvm-readobj -s | FileCheck %s
+// RUN: llvm-mc -filetype=asm -triple x86_64-pc-linux-gnu %s -o - |  FileCheck %s --check-prefix=ASM
 
 // Test that these names are accepted.
 
@@ -143,9 +144,126 @@ bar:
 
 // Test that we handle the strings like gas
 .section bar-"foo"
-.section "foo"
+.section "fooo"
+
 
 // CHECK:        Section {
 // CHECK:          Name: bar-"foo"
 // CHECK:        Section {
-// CHECK:          Name: foo
+// CHECK:          Name: fooo
+
+// Test SHF_LINK_ORDER
+
+.section .shf_metadata_target1, "a"
+        .quad 0
+.section .shf_metadata_target2, "a", @progbits, unique, 1
+.Lshf_metadata_target2_1:
+        .quad 0
+.section .shf_metadata_target2, "a", @progbits, unique, 2
+.Lshf_metadata_target2_2:
+        .quad 0
+
+.section .shf_metadata1,"ao",@progbits,.Lshf_metadata_target2_1
+.section .shf_metadata2,"ao",@progbits,.Lshf_metadata_target2_2
+.section .shf_metadata3,"ao",@progbits,.shf_metadata_target1
+// ASM: .section .shf_metadata1,"ao",@progbits,.Lshf_metadata_target2_1
+// ASM: .section .shf_metadata2,"ao",@progbits,.Lshf_metadata_target2_2
+// ASM: .section .shf_metadata3,"ao",@progbits,.shf_metadata_target1
+
+// CHECK:      Section {
+// CHECK:        Index: 22
+// CHECK-NEXT:   Name: .shf_metadata_target1
+
+// CHECK:      Section {
+// CHECK:        Index: 23
+// CHECK-NEXT:   Name: .shf_metadata_target2
+
+// CHECK:      Section {
+// CHECK:        Index: 24
+// CHECK-NEXT:   Name: .shf_metadata_target2
+
+// CHECK:      Section {
+// CHECK:        Name: .shf_metadata1
+// CHECK-NEXT:   Type: SHT_PROGBITS
+// CHECK-NEXT:   Flags [
+// CHECK-NEXT:     SHF_ALLOC
+// CHECK-NEXT:     SHF_LINK_ORDER
+// CHECK-NEXT:   ]
+// CHECK-NEXT:   Address:
+// CHECK-NEXT:   Offset:
+// CHECK-NEXT:   Size:
+// CHECK-NEXT:   Link:    23
+// CHECK-NEXT:   Info:    0
+
+// CHECK:      Section {
+// CHECK:        Name: .shf_metadata2
+// CHECK-NEXT:   Type: SHT_PROGBITS
+// CHECK-NEXT:   Flags [
+// CHECK-NEXT:     SHF_ALLOC
+// CHECK-NEXT:     SHF_LINK_ORDER
+// CHECK-NEXT:   ]
+// CHECK-NEXT:   Address:
+// CHECK-NEXT:   Offset:
+// CHECK-NEXT:   Size:
+// CHECK-NEXT:   Link:    24
+// CHECK-NEXT:   Info:    0
+
+// CHECK:      Section {
+// CHECK:        Name: .shf_metadata3
+// CHECK-NEXT:   Type: SHT_PROGBITS
+// CHECK-NEXT:   Flags [
+// CHECK-NEXT:     SHF_ALLOC
+// CHECK-NEXT:     SHF_LINK_ORDER
+// CHECK-NEXT:   ]
+// CHECK-NEXT:   Address:
+// CHECK-NEXT:   Offset:
+// CHECK-NEXT:   Size:
+// CHECK-NEXT:   Link:    22
+// CHECK-NEXT:   Info:    0
+
+.section	.text.foo
+// CHECK:        Section {
+// CHECK:          Name: .text.foo
+// CHECK-NEXT:     Type: SHT_PROGBITS
+// CHECK-NEXT:     Flags [
+// CHECK-NEXT:       SHF_ALLOC
+// CHECK-NEXT:       SHF_EXECINSTR
+// CHECK-NEXT:     ]
+
+.section .bss
+// CHECK:        Section {
+// CHECK:          Name: .bss
+// CHECK-NEXT:     Type: SHT_NOBITS
+// CHECK-NEXT:     Flags [
+// CHECK-NEXT:       SHF_ALLOC
+// CHECK-NEXT:       SHF_WRITE
+// CHECK-NEXT:     ]
+
+.section .bss.foo
+// CHECK:        Section {
+// CHECK:          Name: .bss.foo
+// CHECK-NEXT:     Type: SHT_NOBITS
+// CHECK-NEXT:     Flags [
+// CHECK-NEXT:       SHF_ALLOC
+// CHECK-NEXT:       SHF_WRITE
+// CHECK-NEXT:     ]
+
+.section .tbss
+// CHECK:        Section {
+// CHECK:          Name: .tbss
+// CHECK-NEXT:     Type: SHT_NOBITS
+// CHECK-NEXT:     Flags [
+// CHECK-NEXT:       SHF_ALLOC
+// CHECK-NEXT:       SHF_TLS
+// CHECK-NEXT:       SHF_WRITE
+// CHECK-NEXT:     ]
+
+.section .tbss.foo
+// CHECK:        Section {
+// CHECK:          Name: .tbss.foo
+// CHECK-NEXT:     Type: SHT_NOBITS
+// CHECK-NEXT:     Flags [
+// CHECK-NEXT:       SHF_ALLOC
+// CHECK-NEXT:       SHF_TLS
+// CHECK-NEXT:       SHF_WRITE
+// CHECK-NEXT:     ]
diff --git a/test/MC/Hexagon/align.s b/test/MC/Hexagon/align.s
index 01a112392ed43f0a73ba80a11f954eb20423742c..80cebf125cea5431945f05a00a7b32a57130b464 100644
--- a/test/MC/Hexagon/align.s
+++ b/test/MC/Hexagon/align.s
@@ -3,7 +3,7 @@
 # Verify that the .align directive emits the proper insn packets.
 
 { r1 = sub(#1, r1) }
-# CHECK: 76414021 { r1 = sub(#1, r1)
+# CHECK: 76414021 { r1 = sub(#1,r1)
 # CHECK-NEXT: 7f004000   nop
 # CHECK-NEXT: 7f004000   nop
 # CHECK-NEXT: 7f00c000   nop }
@@ -11,8 +11,8 @@
 .align 16
 { r1 = sub(#1, r1)
   r2 = sub(#1, r2) }
-# CHECK: 76414021 { r1 = sub(#1, r1)
-# CHECK-NEXT: 76424022   r2 = sub(#1, r2)
+# CHECK: 76414021 { r1 = sub(#1,r1)
+# CHECK-NEXT: 76424022   r2 = sub(#1,r2)
 # CHECK-NEXT: 7f004000  nop
 # CHECK-NEXT: 7f00c000   nop }
 
@@ -20,7 +20,7 @@
 { r1 = sub(#1, r1)
   r2 = sub(#1, r2)
   r3 = sub(#1, r3) }
-# CHECK: 76434023   r3 = sub(#1, r3)
+# CHECK: 76434023   r3 = sub(#1,r3)
 # CHECK-NEXT: 7f00c000 nop }
 
 .align 16
@@ -33,13 +33,13 @@
 # CHECK: 9200c020 {  r0 = vextract(v0,r0) }
 r0 = vextract(v0, r0)
 .align 128
-# CHECK: 76414021 { r1 = sub(#1, r1)
+# CHECK: 76414021 { r1 = sub(#1,r1)
 # CHECK-NEXT: 7f00c000   nop }
 { r1 = sub(#1, r1) }
 
-#CHECK: { r1 = sub(#1, r1)
-#CHECK:   r2 = sub(#1, r2)
-#CHECK:   r3 = sub(#1, r3) }
+#CHECK: { r1 = sub(#1,r1)
+#CHECK:   r2 = sub(#1,r2)
+#CHECK:   r3 = sub(#1,r3) }
 .falign
 .align 8
 { r1 = sub(#1, r1)
@@ -47,14 +47,14 @@ r0 = vextract(v0, r0)
   r3 = sub(#1, r3)  }
 
 # CHECK: { immext(#0)
-# CHECK:   r0 = sub(##1, r0)
+# CHECK:   r0 = sub(##1,r0)
 # CHECK:   immext(#0)
-# CHECK:   r1 = sub(##1, r1) }
+# CHECK:   r1 = sub(##1,r1) }
 # CHECK: { nop
 # CHECK:   nop
 # CHECK:   nop }
-# CHECK: { r0 = sub(#1, r0) }
+# CHECK: { r0 = sub(#1,r0) }
 { r0 = sub (##1, r0)
   r1 = sub (##1, r1) }
 .align 16
-{ r0 = sub (#1, r0) }
\ No newline at end of file
+{ r0 = sub (#1, r0) }
diff --git a/test/MC/Hexagon/asmMap.s b/test/MC/Hexagon/asmMap.s
index 81bb8f31f02c7f01200261acb3826abfc2de1a04..4a2ca2499cc1729ae49f227832099e451fa4652d 100644
--- a/test/MC/Hexagon/asmMap.s
+++ b/test/MC/Hexagon/asmMap.s
@@ -2,607 +2,607 @@
 
 # Make sure that the assembler mapped instructions are being handled correctly.
 
-#CHECK: 3c56c000 { memw(r22{{ *}}+{{ *}}#0)=#0
+#CHECK: 3c56c000 { memw(r22+#0) = #0
 memw(r22)=#0
 
-#CHECK: 3c23e05f { memh(r3{{ *}}+{{ *}}#0)=#-33
+#CHECK: 3c23e05f { memh(r3+#0) = #-33
 memh(r3)=#-33
 
-#CHECK: 3c07c012 { memb(r7{{ *}}+{{ *}}#0)=#18
+#CHECK: 3c07c012 { memb(r7+#0) = #18
 memb(r7)=#18
 
-#CHECK: 4101c008 { if (p0) r8 = memb(r1{{ *}}+{{ *}}#0)
+#CHECK: 4101c008 { if (p0) r8 = memb(r1+#0)
 if (p0) r8=memb(r1)
 
-#CHECK: 4519d817 { if (!p3) r23 = memb(r25{{ *}}+{{ *}}#0)
+#CHECK: 4519d817 { if (!p3) r23 = memb(r25+#0)
 if (!p3) r23=memb(r25)
 
-#CHECK: 412dc002 { if (p0) r2 = memub(r13{{ *}}+{{ *}}#0)
+#CHECK: 412dc002 { if (p0) r2 = memub(r13+#0)
 if (p0) r2=memub(r13)
 
-#CHECK: 453cc01a { if (!p0) r26 = memub(r28{{ *}}+{{ *}}#0)
+#CHECK: 453cc01a { if (!p0) r26 = memub(r28+#0)
 if (!p0) r26=memub(r28)
 
-#CHECK: 416bc818 { if (p1) r24 = memuh(r11{{ *}}+{{ *}}#0)
+#CHECK: 416bc818 { if (p1) r24 = memuh(r11+#0)
 if (p1) r24=memuh(r11)
 
-#CHECK: 457fc012 { if (!p0) r18 = memuh(r31{{ *}}+{{ *}}#0)
+#CHECK: 457fc012 { if (!p0) r18 = memuh(r31+#0)
 if (!p0) r18=memuh(r31)
 
-#CHECK: 455dc014 { if (!p0) r20 = memh(r29{{ *}}+{{ *}}#0)
+#CHECK: 455dc014 { if (!p0) r20 = memh(r29+#0)
 if (!p0) r20=memh(r29)
 
-#CHECK: 415dc01d { if (p0) r29 = memh(r29{{ *}}+{{ *}}#0)
+#CHECK: 415dc01d { if (p0) r29 = memh(r29+#0)
 if (p0) r29=memh(r29)
 
-#CHECK: 4583c01d { if (!p0) r29 = memw(r3{{ *}}+{{ *}}#0)
+#CHECK: 4583c01d { if (!p0) r29 = memw(r3+#0)
 if (!p0) r29=memw(r3)
 
-#CHECK: 419bd01e { if (p2) r30 = memw(r27{{ *}}+{{ *}}#0)
+#CHECK: 419bd01e { if (p2) r30 = memw(r27+#0)
 if (p2) r30=memw(r27)
 
-#CHECK: 90e2c018 { r25:24 = membh(r2{{ *}}+{{ *}}#0)
+#CHECK: 90e2c018 { r25:24 = membh(r2+#0)
 r25:24=membh(r2)
 
-#CHECK: 902bc006 { r6 = membh(r11{{ *}}+{{ *}}#0)
+#CHECK: 902bc006 { r6 = membh(r11+#0)
 r6=membh(r11)
 
-#CHECK: 90a2c01c { r29:28 = memubh(r2{{ *}}+{{ *}}#0)
+#CHECK: 90a2c01c { r29:28 = memubh(r2+#0)
 r29:28=memubh(r2)
 
-#CHECK: 906ec00d { r13 = memubh(r14{{ *}}+{{ *}}#0)
+#CHECK: 906ec00d { r13 = memubh(r14+#0)
 r13=memubh(r14)
 
-#CHECK: 91dac00c { r13:12 = memd(r26{{ *}}+{{ *}}#0)
+#CHECK: 91dac00c { r13:12 = memd(r26+#0)
 r13:12=memd(r26)
 
-#CHECK: 919bc004 { r4 = memw(r27{{ *}}+{{ *}}#0)
+#CHECK: 919bc004 { r4 = memw(r27+#0)
 r4=memw(r27)
 
-#CHECK: 914cc005 { r5 = memh(r12{{ *}}+{{ *}}#0)
+#CHECK: 914cc005 { r5 = memh(r12+#0)
 r5=memh(r12)
 
-#CHECK: 9176c010 { r16 = memuh(r22{{ *}}+{{ *}}#0)
+#CHECK: 9176c010 { r16 = memuh(r22+#0)
 r16=memuh(r22)
 
-#CHECK: 910bc017 { r23 = memb(r11{{ *}}+{{ *}}#0)
+#CHECK: 910bc017 { r23 = memb(r11+#0)
 r23=memb(r11)
 
-#CHECK: 912bc01b { r27 = memub(r11{{ *}}+{{ *}}#0)
+#CHECK: 912bc01b { r27 = memub(r11+#0)
 r27=memub(r11)
 
-#CHECK: 404ede01 { if (p1) memh(r14{{ *}}+{{ *}}#0) = r30
+#CHECK: 404ede01 { if (p1) memh(r14+#0) = r30
 if (p1) memh(r14)=r30
 
-#CHECK: 4449d900 { if (!p0) memh(r9{{ *}}+{{ *}}#0) = r25
+#CHECK: 4449d900 { if (!p0) memh(r9+#0) = r25
 if (!p0) memh(r9)=r25
 
-#CHECK: 400ecd00 { if (p0) memb(r14{{ *}}+{{ *}}#0) = r13
+#CHECK: 400ecd00 { if (p0) memb(r14+#0) = r13
 if (p0) memb(r14)=r13
 
-#CHECK: 440bcc01 { if (!p1) memb(r11{{ *}}+{{ *}}#0) = r12
+#CHECK: 440bcc01 { if (!p1) memb(r11+#0) = r12
 if (!p1) memb(r11)=r12
 
-#CHECK: 41d0d804 { if (p3) r5:4 = memd(r16{{ *}}+{{ *}}#0)
+#CHECK: 41d0d804 { if (p3) r5:4 = memd(r16+#0)
 if (p3) r5:4=memd(r16)
 
-#CHECK: 45d9c00c { if (!p0) r13:12 = memd(r25{{ *}}+{{ *}}#0)
+#CHECK: 45d9c00c { if (!p0) r13:12 = memd(r25+#0)
 if (!p0) r13:12=memd(r25)
 
-#CHECK: 385ee06d { if (p3) memw(r30{{ *}}+{{ *}}#0)=#-19
+#CHECK: 385ee06d { if (p3) memw(r30+#0) = #-19
 if (p3) memw(r30)=#-19
 
-#CHECK: 38c6c053 { if (!p2) memw(r6{{ *}}+{{ *}}#0)=#19
+#CHECK: 38c6c053 { if (!p2) memw(r6+#0) = #19
 if (!p2) memw(r6)=#19
 
-#CHECK: 381fc034 { if (p1) memb(r31{{ *}}+{{ *}}#0)=#20
+#CHECK: 381fc034 { if (p1) memb(r31+#0) = #20
 if (p1) memb(r31)=#20
 
-#CHECK: 389dc010 { if (!p0) memb(r29{{ *}}+{{ *}}#0)=#16
+#CHECK: 389dc010 { if (!p0) memb(r29+#0) = #16
 if (!p0) memb(r29)=#16
 
-#CHECK: 3833e019 { if (p0) memh(r19{{ *}}+{{ *}}#0)=#-7
+#CHECK: 3833e019 { if (p0) memh(r19+#0) = #-7
 if (p0) memh(r19)=#-7
 
-#CHECK: 38b7c013 { if (!p0) memh(r23{{ *}}+{{ *}}#0)=#19
+#CHECK: 38b7c013 { if (!p0) memh(r23+#0) = #19
 if (!p0) memh(r23)=#19
 
-#CHECK: 4488d401 { if (!p1) memw(r8{{ *}}+{{ *}}#0) = r20
+#CHECK: 4488d401 { if (!p1) memw(r8+#0) = r20
 if (!p1) memw(r8)=r20
 
-#CHECK: 409ddc02 { if (p2) memw(r29{{ *}}+{{ *}}#0) = r28
+#CHECK: 409ddc02 { if (p2) memw(r29+#0) = r28
 if (p2) memw(r29)=r28
 
-#CHECK: 446fc301 { if (!p1) memh(r15{{ *}}+{{ *}}#0) = r3.h
+#CHECK: 446fc301 { if (!p1) memh(r15+#0) = r3.h
 if (!p1) memh(r15)=r3.h
 
-#CHECK: 406dc201 { if (p1) memh(r13{{ *}}+{{ *}}#0) = r2.h
+#CHECK: 406dc201 { if (p1) memh(r13+#0) = r2.h
 if (p1) memh(r13)=r2.h
 
-#CHECK: 40d9c601 { if (p1) memd(r25{{ *}}+{{ *}}#0) = r7:6
+#CHECK: 40d9c601 { if (p1) memd(r25+#0) = r7:6
 if (p1) memd(r25)=r7:6
 
-#CHECK: 44dad803 { if (!p3) memd(r26{{ *}}+{{ *}}#0) = r25:24
+#CHECK: 44dad803 { if (!p3) memd(r26+#0) = r25:24
 if (!p3) memd(r26)=r25:24
 
-#CHECK: 3e21c011 { memh(r1{{ *}}+{{ *}}#0) {{ *}}+={{ *}} r17
+#CHECK: 3e21c011 { memh(r1+#0) += r17
 memh(r1)+=r17
 
-#CHECK: 3e4fc019 { memw(r15{{ *}}+{{ *}}#0) {{ *}}+={{ *}} r25
+#CHECK: 3e4fc019 { memw(r15+#0) += r25
 memw(r15)+=r25
 
-#CHECK: 3e5dc022 { memw(r29{{ *}}+{{ *}}#0) {{ *}}-={{ *}} r2
+#CHECK: 3e5dc022 { memw(r29+#0) -= r2
 memw(r29)-=r2
 
-#CHECK: 3e04c004 { memb(r4{{ *}}+{{ *}}#0) {{ *}}+={{ *}} r4
+#CHECK: 3e04c004 { memb(r4+#0) += r4
 memb(r4)+=r4
 
-#CHECK: 3f53c016 { memw(r19{{ *}}+{{ *}}#0){{ *}}{{ *}}+={{ *}}{{ *}}#22
+#CHECK: 3f53c016 { memw(r19+#0) += #22
 memw(r19)+=#22
 
-#CHECK: 3f24c01e { memh(r4{{ *}}+{{ *}}#0){{ *}}{{ *}}+={{ *}}{{ *}}#30
+#CHECK: 3f24c01e { memh(r4+#0) += #30
 memh(r4)+=#30
 
-#CHECK: 3e27c02d { memh(r7{{ *}}+{{ *}}#0) {{ *}}-={{ *}} r13
+#CHECK: 3e27c02d { memh(r7+#0) -= r13
 memh(r7)-=r13
 
-#CHECK: 3e1ec032 { memb(r30{{ *}}+{{ *}}#0) {{ *}}-={{ *}} r18
+#CHECK: 3e1ec032 { memb(r30+#0) -= r18
 memb(r30)-=r18
 
-#CHECK: 3e49c05b { memw(r9{{ *}}+{{ *}}#0) &= r27
+#CHECK: 3e49c05b { memw(r9+#0) &= r27
 memw(r9)&=r27
 
-#CHECK: 3e2dc040 { memh(r13{{ *}}+{{ *}}#0) &= r0
+#CHECK: 3e2dc040 { memh(r13+#0) &= r0
 memh(r13)&=r0
 
-#CHECK: 3e05c046 { memb(r5{{ *}}+{{ *}}#0) &= r6
+#CHECK: 3e05c046 { memb(r5+#0) &= r6
 memb(r5)&=r6
 
-#CHECK: 3e45c06a { memw(r5{{ *}}+{{ *}}#0) |= r10
+#CHECK: 3e45c06a { memw(r5+#0) |= r10
 memw(r5)|=r10
 
-#CHECK: 3e21c07e { memh(r1{{ *}}+{{ *}}#0) |= r30
+#CHECK: 3e21c07e { memh(r1+#0) |= r30
 memh(r1)|=r30
 
-#CHECK: 3e09c06f { memb(r9{{ *}}+{{ *}}#0) |= r15
+#CHECK: 3e09c06f { memb(r9+#0) |= r15
 memb(r9)|=r15
 
-#CHECK: a157d100 { memh(r23{{ *}}+{{ *}}#0) = r17
+#CHECK: a157d100 { memh(r23+#0) = r17
 memh(r23)=r17
 
-#CHECK: a10fd400 { memb(r15{{ *}}+{{ *}}#0) = r20
+#CHECK: a10fd400 { memb(r15+#0) = r20
 memb(r15)=r20
 
-#CHECK: 9082c014 { r21:20 = memb_fifo(r2{{ *}}+{{ *}}#0)
+#CHECK: 9082c014 { r21:20 = memb_fifo(r2+#0)
 r21:20=memb_fifo(r2)
 
-#CHECK: 9056c01c { r29:28 = memh_fifo(r22{{ *}}+{{ *}}#0)
+#CHECK: 9056c01c { r29:28 = memh_fifo(r22+#0)
 r29:28=memh_fifo(r22)
 
-#CHECK: a1d8ca00 { memd(r24{{ *}}+{{ *}}#0) = r11:10
+#CHECK: a1d8ca00 { memd(r24+#0) = r11:10
 memd(r24)=r11:10
 
-#CHECK: a19ed900 { memw(r30{{ *}}+{{ *}}#0) = r25
+#CHECK: a19ed900 { memw(r30+#0) = r25
 memw(r30)=r25
 
-#CHECK: a169ce00 { memh(r9{{ *}}+{{ *}}#0) = r14.h
+#CHECK: a169ce00 { memh(r9+#0) = r14.h
 memh(r9)=r14.h
 
-#CHECK: 3f07c06b { memb(r7{{ *}}+{{ *}}#0) = setbit(#11)
+#CHECK: 3f07c06b { memb(r7+#0) = setbit(#11)
 memb(r7)=setbit(#11)
 
-#CHECK: 3f34c07b { memh(r20{{ *}}+{{ *}}#0) = setbit(#27)
+#CHECK: 3f34c07b { memh(r20+#0) = setbit(#27)
 memh(r20)=setbit(#27)
 
-#CHECK: 3f1cc032 { memb(r28{{ *}}+{{ *}}#0){{ *}}-={{ *}}#18
+#CHECK: 3f1cc032 { memb(r28+#0) -= #18
 memb(r28)-=#18
 
-#CHECK: 3f29c02a { memh(r9{{ *}}+{{ *}}#0){{ *}}-={{ *}}#10
+#CHECK: 3f29c02a { memh(r9+#0) -= #10
 memh(r9)-=#10
 
-#CHECK: 3f4cc026 { memw(r12{{ *}}+{{ *}}#0){{ *}}-={{ *}}#6
+#CHECK: 3f4cc026 { memw(r12+#0) -= #6
 memw(r12)-=#6
 
-#CHECK: 3f00c00c { memb(r0{{ *}}+{{ *}}#0){{ *}}+={{ *}}#12
+#CHECK: 3f00c00c { memb(r0+#0) += #12
 memb(r0)+=#12
 
-#CHECK: 3f50c07a { memw(r16{{ *}}+{{ *}}#0) = setbit(#26)
+#CHECK: 3f50c07a { memw(r16+#0) = setbit(#26)
 memw(r16)=setbit(#26)
 
-#CHECK: 3f1fc05d { memb(r31{{ *}}+{{ *}}#0) = clrbit(#29)
+#CHECK: 3f1fc05d { memb(r31+#0) = clrbit(#29)
 memb(r31)=clrbit(#29)
 
-#CHECK: 3f20c05e { memh(r0{{ *}}+{{ *}}#0) = clrbit(#30)
+#CHECK: 3f20c05e { memh(r0+#0) = clrbit(#30)
 memh(r0)=clrbit(#30)
 
-#CHECK: 3f42c059 { memw(r2{{ *}}+{{ *}}#0) = clrbit(#25)
+#CHECK: 3f42c059 { memw(r2+#0) = clrbit(#25)
 memw(r2)=clrbit(#25)
 
-#CHECK: 39cfe072 if (!p3.new) memw(r15{{ *}}+{{ *}}#0)=#-14
+#CHECK: 39cfe072 if (!p3.new) memw(r15+#0) = #-14
 {
   p3=cmp.eq(r5,##-1997506977)
   if (!p3.new) memw(r15)=#-14
 }
 
-#CHECK: 3959e06b if (p3.new) memw(r25{{ *}}+{{ *}}#0)=#-21
+#CHECK: 3959e06b if (p3.new) memw(r25+#0) = #-21
 {
   p3=cmp.eq(r0,##1863618461)
   if (p3.new) memw(r25)=#-21
 }
 
-#CHECK: 4312c801 if (p1.new) r1 = memb(r18{{ *}}+{{ *}}#0)
+#CHECK: 4312c801 if (p1.new) r1 = memb(r18+#0)
 {
   if (p1.new) r1=memb(r18)
   p1=cmp.eq(r23,##-1105571618)
 }
 
-#CHECK: 4718d803 if (!p3.new) r3 = memb(r24{{ *}}+{{ *}}#0)
+#CHECK: 4718d803 if (!p3.new) r3 = memb(r24+#0)
 {
   if (!p3.new) r3=memb(r24)
   p3=cmp.eq(r3,##-210870878)
 }
 
-#CHECK: 4326c81b if (p1.new) r27 = memub(r6{{ *}}+{{ *}}#0)
+#CHECK: 4326c81b if (p1.new) r27 = memub(r6+#0)
 {
   if (p1.new) r27=memub(r6)
   p1=cmp.eq(r29,##-188410493)
 }
 
-#CHECK: 473ad00d if (!p2.new) r13 = memub(r26{{ *}}+{{ *}}#0)
+#CHECK: 473ad00d if (!p2.new) r13 = memub(r26+#0)
 {
   p2=cmp.eq(r30,##-1823852150)
   if (!p2.new) r13=memub(r26)
 }
 
-#CHECK: 4785d80e if (!p3.new) r14 = memw(r5{{ *}}+{{ *}}#0)
+#CHECK: 4785d80e if (!p3.new) r14 = memw(r5+#0)
 {
   if (!p3.new) r14=memw(r5)
   p3=cmp.eq(r31,##-228524711)
 }
 
-#CHECK: 438cc81a if (p1.new) r26 = memw(r12{{ *}}+{{ *}}#0)
+#CHECK: 438cc81a if (p1.new) r26 = memw(r12+#0)
 {
   if (p1.new) r26=memw(r12)
   p1=cmp.eq(r11,##-485232313)
 }
 
-#CHECK: 477dc019 if (!p0.new) r25 = memuh(r29{{ *}}+{{ *}}#0)
+#CHECK: 477dc019 if (!p0.new) r25 = memuh(r29+#0)
 {
   p0=cmp.eq(r23,##127565957)
   if (!p0.new) r25=memuh(r29)
 }
 
-#CHECK: 4377c807 if (p1.new) r7 = memuh(r23{{ *}}+{{ *}}#0)
+#CHECK: 4377c807 if (p1.new) r7 = memuh(r23+#0)
 {
   p1=cmp.eq(r30,##-222020054)
   if (p1.new) r7=memuh(r23)
 }
 
-#CHECK: 4754c81c if (!p1.new) r28 = memh(r20{{ *}}+{{ *}}#0)
+#CHECK: 4754c81c if (!p1.new) r28 = memh(r20+#0)
 {
   p1=cmp.eq(r18,##1159699785)
   if (!p1.new) r28=memh(r20)
 }
 
-#CHECK: 435ec01b if (p0.new) r27 = memh(r30{{ *}}+{{ *}}#0)
+#CHECK: 435ec01b if (p0.new) r27 = memh(r30+#0)
 {
   p0=cmp.eq(r7,##-1114567705)
   if (p0.new) r27=memh(r30)
 }
 
-#CHECK: 420dd100 if (p0.new) memb(r13{{ *}}+{{ *}}#0) = r17
+#CHECK: 420dd100 if (p0.new) memb(r13+#0) = r17
 {
   p0=cmp.eq(r21,##-1458796638)
   if (p0.new) memb(r13)=r17
 }
 
-#CHECK: 4601d602 if (!p2.new) memb(r1{{ *}}+{{ *}}#0) = r22
+#CHECK: 4601d602 if (!p2.new) memb(r1+#0) = r22
 {
   p2=cmp.eq(r20,##-824022439)
   if (!p2.new) memb(r1)=r22
 }
 
-#CHECK: 43dcd808 if (p3.new) r9:8 = memd(r28{{ *}}+{{ *}}#0)
+#CHECK: 43dcd808 if (p3.new) r9:8 = memd(r28+#0)
 {
   p3=cmp.eq(r13,##56660744)
   if (p3.new) r9:8=memd(r28)
 }
 
-#CHECK: 47d8c80e if (!p1.new) r15:14 = memd(r24{{ *}}+{{ *}}#0)
+#CHECK: 47d8c80e if (!p1.new) r15:14 = memd(r24+#0)
 {
   if (!p1.new) r15:14=memd(r24)
   p1=cmp.eq(r15,##1536716489)
 }
 
-#CHECK: 3918e045 if (p2.new) memb(r24{{ *}}+{{ *}}#0)=#-27
+#CHECK: 3918e045 if (p2.new) memb(r24+#0) = #-27
 {
   if (p2.new) memb(r24)=#-27
   p2=cmp.eq(r21,##1741091811)
 }
 
-#CHECK: 398fe04d if (!p2.new) memb(r15{{ *}}+{{ *}}#0)=#-19
+#CHECK: 398fe04d if (!p2.new) memb(r15+#0) = #-19
 {
   if (!p2.new) memb(r15)=#-19
   p2=cmp.eq(r15,##779870261)
 }
 
-#CHECK: 3931c04b if (p2.new) memh(r17{{ *}}+{{ *}}#0)=#11
+#CHECK: 3931c04b if (p2.new) memh(r17+#0) = #11
 {
   if (p2.new) memh(r17)=#11
   p2=cmp.eq(r13,##-1171145798)
 }
 
-#CHECK: 39aee056 if (!p2.new) memh(r14{{ *}}+{{ *}}#0)=#-10
+#CHECK: 39aee056 if (!p2.new) memh(r14+#0) = #-10
 {
   p2=cmp.eq(r23,##-633976762)
   if (!p2.new) memh(r14)=#-10
 }
 
-#CHECK: 4692df01 if (!p1.new) memw(r18{{ *}}+{{ *}}#0) = r31
+#CHECK: 4692df01 if (!p1.new) memw(r18+#0) = r31
 {
   if (!p1.new) memw(r18)=r31
   p1=cmp.eq(r11,##-319375732)
 }
 
-#CHECK: 428dc402 if (p2.new) memw(r13{{ *}}+{{ *}}#0) = r4
+#CHECK: 428dc402 if (p2.new) memw(r13+#0) = r4
 {
   if (p2.new) memw(r13)=r4
   p2=cmp.eq(r18,##1895120239)
 }
 
-#CHECK: 4670c300 if (!p0.new) memh(r16{{ *}}+{{ *}}#0) = r3.h
+#CHECK: 4670c300 if (!p0.new) memh(r16+#0) = r3.h
 {
   p0=cmp.eq(r25,##1348715015)
   if (!p0.new) memh(r16)=r3.h
 }
 
-#CHECK: 426ddf02 if (p2.new) memh(r13{{ *}}+{{ *}}#0) = r31.h
+#CHECK: 426ddf02 if (p2.new) memh(r13+#0) = r31.h
 {
   p2=cmp.eq(r25,##1085560657)
   if (p2.new) memh(r13)=r31.h
 }
 
-#CHECK: 464bcb01 if (!p1.new) memh(r11{{ *}}+{{ *}}#0) = r11
+#CHECK: 464bcb01 if (!p1.new) memh(r11+#0) = r11
 {
   p1=cmp.eq(r10,##1491455911)
   if (!p1.new) memh(r11)=r11
 }
 
-#CHECK: 4248d200 if (p0.new) memh(r8{{ *}}+{{ *}}#0) = r18
+#CHECK: 4248d200 if (p0.new) memh(r8+#0) = r18
 {
   p0=cmp.eq(r3,##687581160)
   if (p0.new) memh(r8)=r18
 }
 
-#CHECK: 42deca00 if (p0.new) memd(r30{{ *}}+{{ *}}#0) = r11:10
+#CHECK: 42deca00 if (p0.new) memd(r30+#0) = r11:10
 {
   if (p0.new) memd(r30)=r11:10
   p0=cmp.eq(r28,##562796189)
 }
 
-#CHECK: 46d5cc03 if (!p3.new) memd(r21{{ *}}+{{ *}}#0) = r13:12
+#CHECK: 46d5cc03 if (!p3.new) memd(r21+#0) = r13:12
 {
   if (!p3.new) memd(r21)=r13:12
   p3=cmp.eq(r6,##-969273288)
 }
 
-#CHECK: 42bad201 if (p1.new) memw(r26{{ *}}+{{ *}}#0) = r22.new
+#CHECK: 42bad201 if (p1.new) memw(r26+#0) = r22.new
 {
   if (p1.new) memw(r26)=r22.new
   p1=cmp.eq(r0,##-1110065473)
   r22=add(r28,r9)
 }
 
-#CHECK: 46b9d201 if (!p1.new) memw(r25{{ *}}+{{ *}}#0) = r26.new
+#CHECK: 46b9d201 if (!p1.new) memw(r25+#0) = r26.new
 {
   p1=cmp.eq(r11,##-753121346)
   r26=add(r19,r7)
   if (!p1.new) memw(r25)=r26.new
 }
 
-#CHECK: 40aad200 if (p0) memw(r10{{ *}}+{{ *}}#0) = r6.new
+#CHECK: 40aad200 if (p0) memw(r10+#0) = r6.new
 {
   r6=add(r30,r0)
   if (p0) memw(r10)=r6.new
 }
 
-#CHECK: 44a6d202 if (!p2) memw(r6{{ *}}+{{ *}}#0) = r4.new
+#CHECK: 44a6d202 if (!p2) memw(r6+#0) = r4.new
 {
   if (!p2) memw(r6)=r4.new
   r4=add(r0,r3)
 }
 
-#CHECK: 40b9c200 if (p0) memb(r25{{ *}}+{{ *}}#0) = r29.new
+#CHECK: 40b9c200 if (p0) memb(r25+#0) = r29.new
 {
   if (p0) memb(r25)=r29.new
   r29=add(r27,r30)
 }
 
-#CHECK: 44bec203 if (!p3) memb(r30{{ *}}+{{ *}}#0) = r8.new
+#CHECK: 44bec203 if (!p3) memb(r30+#0) = r8.new
 {
   if (!p3) memb(r30)=r8.new
   r8=add(r24,r4)
 }
 
-#CHECK: 46aecc01 if (!p1.new) memh(r14{{ *}}+{{ *}}#0) = r13.new
+#CHECK: 46aecc01 if (!p1.new) memh(r14+#0) = r13.new
 {
   if (!p1.new) memh(r14)=r13.new
   r13=add(r21,r2)
   p1=cmp.eq(r3,##-1529345886)
 }
 
-#CHECK: 42bcca02 if (p2.new) memh(r28{{ *}}+{{ *}}#0) = r18.new
+#CHECK: 42bcca02 if (p2.new) memh(r28+#0) = r18.new
 {
   p2=cmp.eq(r15,##2048545649)
   if (p2.new) memh(r28)=r18.new
   r18=add(r9,r3)
 }
 
-#CHECK: 46aac200 if (!p0.new) memb(r10{{ *}}+{{ *}}#0) = r30.new
+#CHECK: 46aac200 if (!p0.new) memb(r10+#0) = r30.new
 {
   p0=cmp.eq(r21,##-1160401822)
   r30=add(r9,r22)
   if (!p0.new) memb(r10)=r30.new
 }
 
-#CHECK: 42b8c202 if (p2.new) memb(r24{{ *}}+{{ *}}#0) = r11.new
+#CHECK: 42b8c202 if (p2.new) memb(r24+#0) = r11.new
 {
   if (p2.new) memb(r24)=r11.new
   p2=cmp.eq(r30,##1267977346)
   r11=add(r8,r18)
 }
 
-#CHECK: 44a3ca00 if (!p0) memh(r3{{ *}}+{{ *}}#0) = r28.new
+#CHECK: 44a3ca00 if (!p0) memh(r3+#0) = r28.new
 {
   r28=add(r16,r11)
   if (!p0) memh(r3)=r28.new
 }
 
-#CHECK: 40abca03 if (p3) memh(r11{{ *}}+{{ *}}#0) = r24.new
+#CHECK: 40abca03 if (p3) memh(r11+#0) = r24.new
 {
   if (p3) memh(r11)=r24.new
   r24=add(r18,r19)
 }
 
-#CHECK: a1abd200 memw(r11{{ *}}+{{ *}}#0) = r5.new
+#CHECK: a1abd200 memw(r11+#0) = r5.new
 {
   memw(r11)=r5.new
   r5=add(r0,r10)
 }
 
-#CHECK: a1a2ca00 memh(r2{{ *}}+{{ *}}#0) = r18.new
+#CHECK: a1a2ca00 memh(r2+#0) = r18.new
 {
   r18=add(r27,r18)
   memh(r2)=r18.new
 }
 
-#CHECK: a1bac200 memb(r26{{ *}}+{{ *}}#0) = r15.new
+#CHECK: a1bac200 memb(r26+#0) = r15.new
 {
   r15=add(r22,r17)
   memb(r26)=r15.new
 }
 
-#CHECK: d328ce1c { r29:28{{ *}}={{ *}}vsubub(r15:14, r9:8)
+#CHECK: d328ce1c { r29:28 = vsubub(r15:14,r9:8)
 r29:28=vsubb(r15:14,r9:8)
 
-#CHECK: 8c5ed60c { r12{{ *}}={{ *}}asr(r30, #22):rnd
+#CHECK: 8c5ed60c { r12 = asr(r30,#22):rnd
 r12=asrrnd(r30,#23)
 
-#CHECK: ed1ec109 { r9{{ *}}={{ *}}mpyi(r30, r1)
+#CHECK: ed1ec109 { r9 = mpyi(r30,r1)
 r9=mpyui(r30,r1)
 
-#CHECK: e010d787 { r7{{ *}}={{ *}}+{{ *}}mpyi(r16, #188)
+#CHECK: e010d787 { r7 = +mpyi(r16,#188)
 r7=mpyi(r16,#188)
 
-#CHECK: d206eea2 { p2{{ *}}={{ *}}boundscheck(r7:6, r15:14):raw:hi
+#CHECK: d206eea2 { p2 = boundscheck(r7:6,r15:14):raw:hi
 p2=boundscheck(r7,r15:14)
 
-#CHECK: f27ac102 { p2{{ *}}={{ *}}cmp.gtu(r26, r1)
+#CHECK: f27ac102 { p2 = cmp.gtu(r26,r1)
 p2=cmp.ltu(r1,r26)
 
-#CHECK: f240df00 { p0{{ *}}={{ *}}cmp.gt(r0, r31)
+#CHECK: f240df00 { p0 = cmp.gt(r0,r31)
 p0=cmp.lt(r31,r0)
 
-#CHECK: 7586cc01 { p1{{ *}}={{ *}}cmp.gtu(r6, #96)
+#CHECK: 7586cc01 { p1 = cmp.gtu(r6,#96)
 p1=cmp.geu(r6,#97)
 
-#CHECK: 755dc9a2 { p2{{ *}}={{ *}}cmp.gt(r29, #77)
+#CHECK: 755dc9a2 { p2 = cmp.gt(r29,#77)
 p2=cmp.ge(r29,#78)
 
-#CHECK: d310d60a { r11:10{{ *}}={{ *}}vaddub(r17:16, r23:22)
+#CHECK: d310d60a { r11:10 = vaddub(r17:16,r23:22)
 r11:10=vaddb(r17:16,r23:22)
 
-#CHECK: 8753d1e6 { r6{{ *}}={{ *}}tableidxh(r19, #7, #17):raw
+#CHECK: 8753d1e6 { r6 = tableidxh(r19,#7,#17):raw
 r6=tableidxh(r19,#7,#18)
 
-#CHECK: 8786d277 { r23{{ *}}={{ *}}tableidxw(r6, #3, #18):raw
+#CHECK: 8786d277 { r23 = tableidxw(r6,#3,#18):raw
 r23=tableidxw(r6,#3,#20)
 
-#CHECK: 7c4dfff8 { r25:24{{ *}}={{ *}}combine(#-1, #-101)
+#CHECK: 7c4dfff8 { r25:24 = combine(#-1,#-101)
 r25:24=#-101
 
-#CHECK: 8866c09a { r26{{ *}}={{ *}}vasrhub(r7:6, #0):raw
+#CHECK: 8866c09a { r26 = vasrhub(r7:6,#0):raw
 r26=vasrhub(r7:6,#1):rnd:sat
 
-#CHECK: 7654c016 { r22{{ *}}={{ *}}sub(#0, r20)
+#CHECK: 7654c016 { r22 = sub(#0,r20)
 r22=neg(r20)
 
-#CHECK: 802cc808 { r9:8{{ *}}={{ *}}vasrh(r13:12, #8):raw
+#CHECK: 802cc808 { r9:8 = vasrh(r13:12,#8):raw
 r9:8=vasrh(r13:12,#9):rnd
 
-#CHECK: 7614dfe5 { r5{{ *}}={{ *}}{{zxtb\(r20\)|and\(r20, *#255\)}}
+#CHECK: 7614dfe5 { r5 = {{zxtb\(r20\)|and\(r20,#255\)}}
 r5=zxtb(r20)
 
 #CHECK: 00ab68e2 immext(#179976320)
-#CHECK: 7500c500 p0{{ *}}={{ *}}cmp.eq(r0, ##179976360)
+#CHECK: 7500c500 p0 = cmp.eq(r0,##179976360)
 {
 	if (p0.new) r11=r26
 	p0=cmp.eq(r0,##179976360)
 }
 
-#CHECK: 74f9c00f { if (!p3) r15{{ *}}={{ *}}r25
+#CHECK: 74f9c00f { if (!p3) r15 = add(r25,#0)
 if (!p3) r15=r25
 
-#CHECK: 7425c005 { if (p1) r5{{ *}}={{ *}}r5
+#CHECK: 7425c005 { if (p1) r5 = add(r5,#0)
 if (p1) r5=r5
 
-#CHECK: e9badae2 { r2{{ *}}={{ *}}vrcmpys(r27:26, r27:26):<<1:rnd:sat:raw:lo
+#CHECK: e9badae2 { r2 = vrcmpys(r27:26,r27:26):<<1:rnd:sat:raw:lo
 r2=vrcmpys(r27:26,r26):<<1:rnd:sat
 
-#CHECK: fd13f20e if (p0.new) r15:14{{ *}}={{ *}}{{r19:18|combine\(r19, *r18\)}}
+#CHECK: fd13f20e if (p0.new) r15:14 = {{r19:18|combine\(r19,r18\)}}
 {
   p0=cmp.eq(r26,##1766934387)
   if (p0.new) r15:14=r19:18
 }
 
-#CHECK: fd07c6c2 { if (!p2) r3:2{{ *}}={{ *}}{{r7:6|combine\(r7, *r6\)}}
+#CHECK: fd07c6c2 { if (!p2) r3:2 = {{r7:6|combine\(r7,r6\)}}
 if (!p2) r3:2=r7:6
 
-#CHECK: fd0dcc7e { if (p3) r31:30{{ *}}={{ *}}{{r13:12|combine\(r13, *r12\)}}
+#CHECK: fd0dcc7e { if (p3) r31:30 = {{r13:12|combine\(r13,r12\)}}
 if (p3) r31:30=r13:12
 
-#CHECK: 748ae015 if (!p0.new) r21{{ *}}={{ *}}r10
+#CHECK: 748ae015 if (!p0.new) r21 = add(r10,#0)
 {
   p0=cmp.eq(r23,##805633208)
   if (!p0.new) r21=r10
 }
 
-#CHECK: d36ec6c8 { r9:8{{ *}}={{ *}}add(r15:14, r7:6):raw:lo
+#CHECK: d36ec6c8 { r9:8 = add(r15:14,r7:6):raw:lo
 r9:8=add(r14,r7:6)
 
 #CHECK: 01e65477 immext(#509943232)
-#CHECK: 7516c3a3 p3{{ *}}={{ *}}cmp.eq(r22, ##509943261)
+#CHECK: 7516c3a3 p3 = cmp.eq(r22,##509943261)
 {
-  if (!p3.new) r9:8=r25:24
+  if (!p3.new) r9:8 = r25:24
   p3=cmp.eq(r22,##509943261)
 }
 
-#CHECK: 87e0d5e5 { r5{{ *}}={{ *}}tableidxd(r0, #15, #21):raw
+#CHECK: 87e0d5e5 { r5 = tableidxd(r0,#15,#21):raw
 r5=tableidxd(r0,#15,#24)
 
-#CHECK: 8701db65 { r5{{ *}}={{ *}}tableidxb(r1, #3, #27):raw
+#CHECK: 8701db65 { r5 = tableidxb(r1,#3,#27):raw
 r5=tableidxb(r1,#3,#27)
 
-#CHECK: 767affe3 { r3{{ *}}={{ *}}sub(#-1, r26)
+#CHECK: 767affe3 { r3 = sub(#-1,r26)
 r3=not(r26)
 
-#CHECK: f51ddc06 { r7:6{{ *}}={{ *}}{{r29:28|combine\(r29, *r28\)}}
+#CHECK: f51ddc06 { r7:6 = {{r29:28|combine\(r29,r28\)}}
 r7:6=r29:28
 
-#CHECK: 9406c000 { dcfetch(r6 + #0)
+#CHECK: 9406c000 { dcfetch(r6+#0)
 dcfetch(r6)
 
-#CHECK: 6b20c001 { p1{{ *}}={{ *}}or(p0, p0)
+#CHECK: 6b20c001 { p1 = or(p0,p0)
 p1=p0
 
-#CHECK: eafcdc82 { r3:2 += vrcmpys(r29:28, r29:28):<<1:sat:raw:lo
+#CHECK: eafcdc82 { r3:2 += vrcmpys(r29:28,r29:28):<<1:sat:raw:lo
 r3:2+=vrcmpys(r29:28,r28):<<1:sat
 
-#CHECK: e8ead092 { r19:18{{ *}}={{ *}}vrcmpys(r11:10, r17:16):<<1:sat:raw:lo
+#CHECK: e8ead092 { r19:18 = vrcmpys(r11:10,r17:16):<<1:sat:raw:lo
 r19:18=vrcmpys(r11:10,r16):<<1:sat
 
-#CHECK: 9082c014 { r21:20{{ *}}={{ *}}memb_fifo(r2{{ *}}+{{ *}}#0)
+#CHECK: 9082c014 { r21:20 = memb_fifo(r2+#0)
 r21:20=memb_fifo(r2)
 
-#CHECK: 9056c01c { r29:28{{ *}}={{ *}}memh_fifo(r22{{ *}}+{{ *}}#0)
-r29:28=memh_fifo(r22)
\ No newline at end of file
+#CHECK: 9056c01c { r29:28 = memh_fifo(r22+#0)
+r29:28=memh_fifo(r22)
diff --git a/test/MC/Hexagon/bug20416.s b/test/MC/Hexagon/bug20416.s
new file mode 100644
index 0000000000000000000000000000000000000000..530a4e64778aa85c886513a177a32375936c0d87
--- /dev/null
+++ b/test/MC/Hexagon/bug20416.s
@@ -0,0 +1,13 @@
+# RUN: not llvm-mc -triple=hexagon -mv60 -mhvx -filetype=asm %s 2>%t; FileCheck %s --check-prefix=CHECK-V60-ERROR <%t
+# RUN:     llvm-mc -triple=hexagon -mv62 -mhvx -filetype=asm %s | FileCheck %s
+
+// For this v60+/hvx instruction sequence, make sure it fails with v60
+// but passes with v62.  This is because the instruction uses a different
+// itinerary on v60 than on v62.
+{
+  v0.h=vsat(v5.w,v9.w)
+  v16.h=vsat(v6.w,v26.w)
+}
+# CHECK-V60-ERROR: rror: invalid instruction packet: slot error
+# CHECK: v0.h = vsat(v5.w,v9.w)
+# CHECK: v16.h = vsat(v6.w,v26.w)
diff --git a/test/MC/Hexagon/capitalizedEndloop.s b/test/MC/Hexagon/capitalizedEndloop.s
index d20ff34de6fed1f2f6ab4ad698ce41b15faa081f..c7a25d9fb27b666f3e2cf491b96f0872d1010bba 100644
--- a/test/MC/Hexagon/capitalizedEndloop.s
+++ b/test/MC/Hexagon/capitalizedEndloop.s
@@ -15,7 +15,7 @@
 	{ R0 = mpyi(R0,R0) } : ENDLOOP0 : ENDLOOP1
 	{ R0 = mpyi(R0,R0) }:endloop0:endloop1
 
-# CHECK: r0 = mpyi(r0, r0)
+# CHECK: r0 = mpyi(r0,r0)
 # CHECK: :endloop0
 # CHECK: :endloop0
 # CHECK: :endloop0
diff --git a/test/MC/Hexagon/common-redeclare.s b/test/MC/Hexagon/common-redeclare.s
new file mode 100644
index 0000000000000000000000000000000000000000..52b77992a871d64fc0159c27377e5cef075e09a0
--- /dev/null
+++ b/test/MC/Hexagon/common-redeclare.s
@@ -0,0 +1,6 @@
+# RUN: llvm-mc -arch=hexagon -filetype=obj %s | llvm-objdump -t - | FileCheck %s
+
+# CHECK: 00000062 g       *COM*  00000008 quartet_table_isqrt
+
+.common quartet_table_isqrt, 98, 8
+.common quartet_table_isqrt, 98, 8
diff --git a/test/MC/Hexagon/dcfetch-symbol.s b/test/MC/Hexagon/dcfetch-symbol.s
new file mode 100644
index 0000000000000000000000000000000000000000..8309439a2aaa042407e96142b3479200736fa271
--- /dev/null
+++ b/test/MC/Hexagon/dcfetch-symbol.s
@@ -0,0 +1,8 @@
+# RUN: not llvm-mc -arch=hexagon -filetype=obj %s
+
+#CHECK: 9400c000 { dcfetch(r0 + #0) }
+
+junk:
+{
+  dcfetch(r0 + #junk)
+}
diff --git a/test/MC/Hexagon/decode_acc_type.s b/test/MC/Hexagon/decode_acc_type.s
new file mode 100644
index 0000000000000000000000000000000000000000..84d0abc0e18d2a245571daf00c25dccf8fe327ce
--- /dev/null
+++ b/test/MC/Hexagon/decode_acc_type.s
@@ -0,0 +1,150 @@
+# RUN: llvm-mc -arch=hexagon -filetype=obj %s | llvm-objdump -d - | FileCheck %s
+#
+
+# Currently ignore if there is one or two #'s
+
+		r7 = memw(gp+#192)
+# CHECK:	r7 = memw(gp+#192)
+
+		r3:2 = memd(gp+#64)
+# CHECK: 	r3:2 = memd(gp+#64)
+
+		{ p3 = p1; r8 = #2; if (p3.new) memw(##8) = r8.new }
+# CHECK:	if (p3.new) memw({{..}}8) = r8
+
+		{ p3 = p1; r8 = #2; if (!p3.new) memw(##8) = r8.new }
+# CHECK:	if (!p3.new) memw({{..}}8) = r8.new
+
+		{ r8 = #2; if (p3) memw(##8) = r8.new }
+# CHECK:	if (p3) memw({{..}}8) = r8.new
+
+		{ r8 = #2; if (!p3) memw(##8) = r8.new }
+# CHECK:	if (!p3) memw({{..}}8) = r8.new
+
+		{ p3 = p1; r8 = #2; if (p3.new) memh(##8) = r8.new }
+# CHECK:	if (p3.new) memh({{..}}8) = r8.new
+
+		{ p3 = p1; r8 = #2; if (!p3.new) memh(##8) = r8.new }
+# CHECK:	if (!p3.new) memh({{..}}8) = r8.new
+
+		{ r8 = #2; if (p3) memh(##8) = r8.new }
+# CHECK:	memh({{..}}8) = r8.new
+
+		{ r8 = #2; if (!p3) memh(##8) = r8.new }
+# CHECK:	if (!p3) memh({{..}}8) = r8.new
+
+		{ p3 = p1; r8 = #2; if (p3.new) memb(##8) = r8.new }
+# CHECK:	if (p3.new) memb({{..}}8) = r8.new
+
+		{ p3 = p1; r8 = #2; if (!p3.new) memb(##8) = r8.new }
+# CHECK:	if (!p3.new) memb({{..}}8) = r8.new
+
+		{ r8 = #2; if (p3) memb(##8) = r8.new }
+# CHECK:	if (p3) memb({{..}}8) = r8.new
+
+		{ r8 = #2; if (!p3) memb(##8) = r8.new }
+# CHECK:	if (!p3) memb({{..}}8) = r8.new
+
+		{ if (p3) memw(##8) = r8 }
+# CHECK:	if (p3) memw({{..}}8) = r8
+
+		{ if (!p3) memw(##8) = r8 }
+# CHECK:	if (!p3) memw({{..}}8) = r8
+
+		{ p3 = p1; if (p3.new) memw(##8) = r8 }
+# CHECK:	if (p3.new) memw({{..}}8) = r8
+
+		{ p3 = p1; if (!p3.new) memw(##8) = r8 }
+# CHECK:	if (!p3.new) memw({{..}}8) = r8
+
+
+		if (!p2) r14 = memb(##48)
+# CHECK:	if (!p2) r14 = memb({{..}}48)
+
+		if (p2) r14 = memb(##48)
+# CHECK:	if (p2) r14 = memb({{..}}48)
+
+		{p2 = p0; if (!p2.new) r14 = memb(##48) }
+# CHECK:	if (!p2.new) r14 = memb({{..}}48)
+
+		{p3 = p2; if (p3.new) r14 = memb(##48) }
+# CHECK:	if (p3.new) r14 = memb({{..}}48)
+
+
+		if (!p2) r14 = memh(##48)
+# CHECK:	if (!p2) r14 = memh({{..}}48)
+
+		if (p2) r14 = memh(##48)
+# CHECK:	if (p2) r14 = memh({{..}}48)
+
+		{p2 = p0; if (!p2.new) r14 = memh(##48) }
+# CHECK:	if (!p2.new) r14 = memh({{..}}48)
+
+		{p3 = p2; if (p3.new) r14 = memh(##48) }
+# CHECK:	if (p3.new) r14 = memh({{..}}48)
+
+
+		if (!p2) r14 = memub(##48)
+# CHECK:	if (!p2) r14 = memub({{..}}48)
+
+		if (p2) r14 = memub(##48)
+# CHECK:	if (p2) r14 = memub({{..}}48)
+
+		{p2 = p0; if (!p2.new) r14 = memub(##48) }
+# CHECK:	if (!p2.new) r14 = memub({{..}}48)
+
+		{p3 = p2; if (p3.new) r14 = memub(##48) }
+# CHECK:	if (p3.new) r14 = memub({{..}}48)
+
+
+		if (!p2) r14 = memuh(##48)
+# CHECK:	if (!p2) r14 = memuh({{..}}48)
+
+		if (p2) r14 = memuh(##48)
+# CHECK:	if (p2) r14 = memuh({{..}}48)
+
+		{p2 = p0; if (!p2.new) r14 = memuh(##48) }
+# CHECK:	if (!p2.new) r14 = memuh({{..}}48)
+
+		{p3 = p2; if (p3.new) r14 = memuh(##48) }
+# CHECK:	r14 = memuh({{..}}48)
+
+
+		if (!p2) r14 = memw(##48)
+# CHECK:	if (!p2) r14 = memw({{..}}48)
+
+		if (p2) r14 = memw(##48)
+# CHECK:	if (p2) r14 = memw({{..}}48)
+
+		{p2 = p0; if (!p2.new) r14 = memw(##48) }
+# CHECK:	if (!p2.new) r14 = memw({{..}}48)
+
+		{p3 = p2; if (p3.new) r14 = memw(##48) }
+# CHECK:	if (p3.new) r14 = memw({{..}}48)
+
+		r7 = memh(##32)
+# CHECK: 	r7 = memh(##32)
+		r7 = memuh(##32)
+# CHECK: 	r7 = memuh(##32)
+
+		memd(##32) = r15:14
+# CHECK: 	memd(##32) = r15:14
+
+		{r2 = #9; memw(##32) = r2.new}
+# CHECK:	memw(##32) = r2.new
+
+		{r2 = #9; memb(##32) = r2.new}
+# CHECK:	memb(##32) = r2.new
+
+		memw(##32) = r15
+# CHECK: 	memw(##32) = r15
+
+		memh(##32) = r16
+# CHECK: 	memh(##32) = r16
+
+		memb(##32) = r17
+# CHECK: 	memb(##32) = r17
+
+
+		r3:2 = interleave(r31:30)
+# CHECK:	r3:2 = interleave(r31:30)
diff --git a/test/MC/Hexagon/dis-duplex-p0.s b/test/MC/Hexagon/dis-duplex-p0.s
index dc6a1260145e1868c01efc730ec07a5af7bebad4..4ee518fa2a3170e2f892d66fea9278be14f7fdb3 100644
--- a/test/MC/Hexagon/dis-duplex-p0.s
+++ b/test/MC/Hexagon/dis-duplex-p0.s
@@ -1,7 +1,10 @@
 // RUN: llvm-mc -arch=hexagon -filetype=obj -o - %s | llvm-objdump -d - | FileCheck %s
-// REQUIRES: asserts
-  .text
-// CHECK: { r7 = #-1; r7 = #-1 }
-  .long 0x3a373a27
-// CHECK: { if (!p0.new) r7 = #0; if (p0.new) r7 = #0 }
-  .long 0x3a573a47
+
+{ r7 = #-1
+  r6 = #-1 }
+// CHECK: { r7 = #-1; r6 = #-1 }
+
+{ p0 = r0
+  if (p0.new) r7 = #0
+  if (!p0.new) r7 = #0 }
+// CHECK: if (p0.new) r7 = #0; if (!p0.new) r7 = #0
diff --git a/test/MC/Hexagon/duplex-registers.s b/test/MC/Hexagon/duplex-registers.s
index f0cde7f9628d73caf58c5df4e8c1c025f5e1ecf5..2a02b4534f29d8e1810234ad2505187e74b051dc 100644
--- a/test/MC/Hexagon/duplex-registers.s
+++ b/test/MC/Hexagon/duplex-registers.s
@@ -7,4 +7,4 @@
 }
 
 # CHECK: 289808ba
-# CHECK: r16 = memuh(r17 + #0);{{ *}}r18 = memuh(r19 + #0)
+# CHECK: r16 = memuh(r17+#0);{{ *}}r18 = memuh(r19+#0)
diff --git a/test/MC/Hexagon/elf-flags.s b/test/MC/Hexagon/elf-flags.s
index 94dce8152144906899c40412bea6afcf1be638a0..0d2f007cb3da75d38f1f3e8583d38335d8cfbe7f 100644
--- a/test/MC/Hexagon/elf-flags.s
+++ b/test/MC/Hexagon/elf-flags.s
@@ -2,8 +2,10 @@
 # RUN: llvm-mc -arch=hexagon -mcpu=hexagonv5 --filetype=obj %s -o - | llvm-readobj -file-headers -elf-output-style=GNU | FileCheck --check-prefix=CHECK-V5 %s
 # RUN: llvm-mc -arch=hexagon -mcpu=hexagonv55 --filetype=obj %s -o - | llvm-readobj -file-headers -elf-output-style=GNU | FileCheck --check-prefix=CHECK-V55 %s
 # RUN: llvm-mc -arch=hexagon -mcpu=hexagonv60 --filetype=obj %s -o - | llvm-readobj -file-headers -elf-output-style=GNU | FileCheck --check-prefix=CHECK-V60 %s
+# RUN: llvm-mc -arch=hexagon -mcpu=hexagonv62 --filetype=obj %s -o - | llvm-readobj -file-headers -elf-output-style=GNU | FileCheck --check-prefix=CHECK-V62 %s
 
 # CHECK-V4: Flags: 0x3
 # CHECK-V5: Flags: 0x4
 # CHECK-V55: Flags: 0x5
 # CHECK-V60: Flags: 0x60
+# CHECK-V62: Flags: 0x62
diff --git a/test/MC/Hexagon/equ.s b/test/MC/Hexagon/equ.s
new file mode 100644
index 0000000000000000000000000000000000000000..fbf09edbbc1e0e189f76bfa009a71f87c8ed7b6b
--- /dev/null
+++ b/test/MC/Hexagon/equ.s
@@ -0,0 +1,9 @@
+# RUN: not llvm-mc -arch=hexagon %s 2> %t
+# RUN: FileCheck < %t %s
+
+.equ   a, 0
+.set   a, 1
+.equ   a, 2
+.equiv a, 3
+# CHECK: {{[Ee]}}rror: redefinition of 'a'
+
diff --git a/test/MC/Hexagon/ext-callt-rel.s b/test/MC/Hexagon/ext-callt-rel.s
new file mode 100644
index 0000000000000000000000000000000000000000..344a8fbc11b94e4acdd0243f4f4d048f7127bdaa
--- /dev/null
+++ b/test/MC/Hexagon/ext-callt-rel.s
@@ -0,0 +1,6 @@
+# RUN: llvm-mc -arch=hexagon -filetype=obj %s -o - | llvm-objdump -r - | FileCheck %s
+
+if (p0) call foo
+#CHECK: R_HEX_B32_PCREL_X
+#CHECK: R_HEX_B15_PCREL_X
+
diff --git a/test/MC/Hexagon/extended_relocations.ll b/test/MC/Hexagon/extended_relocations.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a16185c399459addeb0b3b513fb84f14acd10696
--- /dev/null
+++ b/test/MC/Hexagon/extended_relocations.ll
@@ -0,0 +1,23 @@
+; RUN: llc -filetype=obj -march=hexagon %s -o - | llvm-objdump -r - | FileCheck %s
+
+; CHECK: RELOCATION RECORDS FOR [.rela.text]:
+; CHECK: 00000000 R_HEX_B22_PCREL printf
+; CHECK: 00000004 R_HEX_32_6_X .rodata.str1.1
+; CHECK: 00000008 R_HEX_6_X .rodata.str1.1
+
+target triple = "hexagon-unknown--elf"
+
+@.str = private unnamed_addr constant [10 x i8] c"cxfir.log\00", align 1
+
+declare i32 @printf(i8*, ...) #1
+
+; Function Attrs: nounwind
+define i32 @main() #0 {
+entry:
+  %call = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([10 x i8], [10 x i8]* @.str, i32 0, i32 0))
+  ret i32 0
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
diff --git a/test/MC/Hexagon/extender.s b/test/MC/Hexagon/extender.s
new file mode 100644
index 0000000000000000000000000000000000000000..f807dbe0cdd7491f3dc1f9390b541e0a4423e110
--- /dev/null
+++ b/test/MC/Hexagon/extender.s
@@ -0,0 +1,210 @@
+# RUN: llvm-mc -arch=hexagon -filetype=obj %s | llvm-objdump -d - | FileCheck %s
+#
+
+# STrib_abs_V4
+{
+  memb(##1024056) = r0
+}
+
+# CHECK: immext(#1024000)
+# CHECK: memb(##1024056) = r0
+
+# S2_storerbgp
+{
+  memb(GP + #56) = r0
+}
+
+# CHECK: memb(gp+#56) = r0
+
+# STrih_abs_V4
+{
+  memh(##1024056) = r0
+}
+
+# CHECK: immext(#1024000)
+# CHECK: memh(##1024056) = r0
+
+# S2_storerhgp
+{
+  memh(GP + #56) = r0
+}
+
+# CHECK: memh(gp+#56) = r0
+
+# STriw_abs_V4
+{
+  memw(##1024056) = r0
+}
+
+# CHECK: immext(#1024000)
+# CHECK: memw(##1024056) = r0
+
+# S2_storerigp
+{
+  memw(GP + #56) = r0
+}
+
+# CHECK: memw(gp+#56) = r0
+
+# STrib_abs_nv_V4
+{
+  r0 = #1
+  memb(##1024056) = r0.new
+}
+
+# CHECK: r0 = #1
+# CHECK: immext(#1024000)
+# CHECK: memb(##1024056) = r0.new
+
+# S2_storerbnewgp
+{
+  r0 = #1
+  memb(GP + #56) = r0.new
+}
+
+# CHECK: r0 = #1
+# CHECK: memb(gp+#56) = r0.new
+
+# STrih_abs_nv_V4
+{
+  r0 = #1
+  memh(##1024056) = r0.new
+}
+
+# CHECK: r0 = #1
+# CHECK: immext(#1024000)
+# CHECK: memh(##1024056) = r0.new
+
+# S2_storerhnewgp
+{
+  r0 = #1
+  memh(GP + #56) = r0.new
+}
+
+# CHECK: r0 = #1
+# CHECK: memh(gp+#56) = r0.new
+
+# STriw_abs_nv_V4
+{
+  r0 = #1
+  memw(##1024056) = r0.new
+}
+
+# CHECK: r0 = #1
+# CHECK: immext(#1024000)
+# CHECK: memw(##1024056) = r0.new
+
+# S2_storerinewgp
+{
+  r0 = #1
+  memw(GP + #56) = r0.new
+}
+
+# CHECK: r0 = #1
+# CHECK: memw(gp+#56) = r0.new
+
+# STrid_abs_V4
+{
+  memd(##1024056) = r1:0
+}
+
+# CHECK: immext(#1024000)
+# CHECK: memd(##1024056) = r1:0
+
+# S2_storerdgp
+{
+  memd(GP + #56) = r1:0
+}
+
+# CHECK: memd(gp+#56) = r1:0
+
+# LDrib_abs_V4
+{
+  r0 = memb(##1024056)
+}
+
+# CHECK: immext(#1024000)
+# CHECK: r0 = memb(##1024056)
+
+# LDb_GP_V4
+{
+  r0 = memb(GP + #56)
+}
+
+# CHECK: r0 = memb(gp+#56)
+
+# LDriub_abs_V4
+{
+  r0 = memub(##1024056)
+}
+
+# CHECK: immext(#1024000)
+# CHECK: r0 = memub(##1024056)
+
+# LDub_GP_V4
+{
+  r0 = memub(GP + #56)
+}
+
+# CHECK: r0 = memub(gp+#56)
+
+# LDrih_abs_V4
+{
+  r0 = memh(##1024056)
+}
+
+# CHECK: immext(#1024000)
+# CHECK: r0 = memh(##1024056)
+
+# LDh_GP_V4
+{
+  r0 = memh(GP + #56)
+}
+
+# CHECK: r0 = memh(gp+#56)
+
+# LDriuh_abs_V4
+{
+  r0 = memuh(##1024056)
+}
+
+# CHECK: immext(#1024000)
+# CHECK: r0 = memuh(##1024056)
+
+# LDuh_GP_V4
+{
+  r0 = memuh(GP + #56)
+}
+
+# CHECK: r0 = memuh(gp+#56)
+
+# LDriw_abs_V4
+{
+  r0 = memw(##1024056)
+}
+
+# CHECK: immext(#1024000)
+# CHECK: r0 = memw(##1024056)
+
+# LDw_GP_V4
+{
+  r0 = memw(GP + #56)
+}
+
+# CHECK: r0 = memw(gp+#56)
+
+# LDrid_abs_V4
+{
+  r1:0 = memd(##1024056)
+}
+
+# CHECK: immext(#1024000)
+# CHECK: r1:0 = memd(##1024056)
+
+# LDd_GP_V4
+{
+  r1:0 = memd(GP + #56)
+}
+
+# CHECK: r1:0 = memd(gp+#56)
+
diff --git a/test/MC/Hexagon/fixups.s b/test/MC/Hexagon/fixups.s
index 059a18fa882284d738274f928d30d096cc2c6a66..33913362df7bb9ae789605cc5da5c4fa702d5210 100644
--- a/test/MC/Hexagon/fixups.s
+++ b/test/MC/Hexagon/fixups.s
@@ -3,7 +3,7 @@
   .text
 # CHECK-LABEL: 0:
 # CHECK: 2442e106
-# CHECK: if (!cmp.eq(r1.new, #1)) jump:t 0xc
+# CHECK: if (!cmp.eq(r1.new,#1)) jump:t 0xc
   {
     r1 = zxth(r2)
     if (!cmp.eq(r1.new, #1)) jump:t .L1
@@ -15,7 +15,7 @@
 # CHECK: 00004020
 # CHECK: immext(#2048)
 # CHECK: 2442e118
-# CHECK: if (!cmp.eq(r1.new, #1)) jump:t 0x81c
+# CHECK: if (!cmp.eq(r1.new,#1)) jump:t 0x81c
   {
     r1 = zxth(r2)
     if (!cmp.eq(r1.new, #1)) jump:t .L2
diff --git a/test/MC/Hexagon/iconst.s b/test/MC/Hexagon/iconst.s
index 277c4de869233c779c43f75a9db3f88fcccbe602..917cc64ba953bc27abe5110cb700f65fb65388f6 100644
--- a/test/MC/Hexagon/iconst.s
+++ b/test/MC/Hexagon/iconst.s
@@ -1,6 +1,6 @@
 # RUN: llvm-mc -triple=hexagon -filetype=obj %s | llvm-objdump -d -r - | FileCheck %s
 
 a:
-# CHECK: r0 = add(r0, #0)
+# CHECK: r0 = add(r0,#0)
 # CHECK: R_HEX_23_REG
-r0 = iconst(#a)
\ No newline at end of file
+r0 = iconst(#a)
diff --git a/test/MC/Hexagon/inst_cmp_eq.ll b/test/MC/Hexagon/inst_cmp_eq.ll
index 98202368aff35ee0e3c79cfc93c8861abaf7c1d1..5c483451d713dd66957d27e423704f81774bafb0 100644
--- a/test/MC/Hexagon/inst_cmp_eq.ll
+++ b/test/MC/Hexagon/inst_cmp_eq.ll
@@ -7,6 +7,6 @@ define i1 @foo (i32 %a, i32 %b)
   ret i1 %1
 }
 
-; CHECK: p0 = cmp.eq(r0, r1)
+; CHECK: p0 = cmp.eq(r0,r1)
 ; CHECK: r0 = p0
 ; CHECK: jumpr r31
diff --git a/test/MC/Hexagon/inst_cmp_eqi.ll b/test/MC/Hexagon/inst_cmp_eqi.ll
index 612dfdc8f23dac5bb7045f7b885af0ac34a56da1..5d8132b70bb9b2628b4831f0bdcb54f34a5f516e 100644
--- a/test/MC/Hexagon/inst_cmp_eqi.ll
+++ b/test/MC/Hexagon/inst_cmp_eqi.ll
@@ -7,6 +7,6 @@ define i1 @foo (i32 %a)
   ret i1 %1
 }
 
-; CHECK: p0 = cmp.eq(r0, #42)
+; CHECK: p0 = cmp.eq(r0,#42)
 ; CHECK: r0 = p0
 ; CHECK: jumpr r31
diff --git a/test/MC/Hexagon/inst_cmp_gt.ll b/test/MC/Hexagon/inst_cmp_gt.ll
index 3ce1c0addad7ac0fe5c934e890dfe176c482bb86..45a4e33e940f9062af11c9c5efeb8dbdf3e40b58 100644
--- a/test/MC/Hexagon/inst_cmp_gt.ll
+++ b/test/MC/Hexagon/inst_cmp_gt.ll
@@ -7,6 +7,6 @@ define i1 @foo (i32 %a, i32 %b)
   ret i1 %1
 }
 
-; CHECK: p0 = cmp.gt(r0, r1)
+; CHECK: p0 = cmp.gt(r0,r1)
 ; CHECK: r0 = p0
-; CHECK: jumpr r31 }
\ No newline at end of file
+; CHECK: jumpr r31 }
diff --git a/test/MC/Hexagon/inst_cmp_gti.ll b/test/MC/Hexagon/inst_cmp_gti.ll
index f3c13a2fb96e6543cfb2b49ae3b14306244f9f71..67cdc4c909bbb23daa13816297ae9bb0c8bbad93 100644
--- a/test/MC/Hexagon/inst_cmp_gti.ll
+++ b/test/MC/Hexagon/inst_cmp_gti.ll
@@ -7,6 +7,6 @@ define i1 @foo (i32 %a)
   ret i1 %1
 }
 
-; CHECK: p0 = cmp.gt(r0, #42)
+; CHECK: p0 = cmp.gt(r0,#42)
 ; CHECK: r0 = p0
 ; CHECK: jumpr r31
diff --git a/test/MC/Hexagon/inst_cmp_lt.ll b/test/MC/Hexagon/inst_cmp_lt.ll
index 80ba16f41418ce9fb65d003efea392f7cb6a8a9b..b19a4a676aafa9fd885997544029bb985a86a961 100644
--- a/test/MC/Hexagon/inst_cmp_lt.ll
+++ b/test/MC/Hexagon/inst_cmp_lt.ll
@@ -7,6 +7,6 @@ define i1 @foo (i32 %a, i32 %b)
   ret i1 %1
 }
 
-; CHECK: p0 = cmp.gt(r1, r0)
+; CHECK: p0 = cmp.gt(r1,r0)
 ; CHECK: r0 = p0
 ; CHECK: jumpr r31
diff --git a/test/MC/Hexagon/inst_cmp_ugt.ll b/test/MC/Hexagon/inst_cmp_ugt.ll
index 07fa784dc64adf60bdd35aa85388974173fe3b8d..7af40c6ed034d90d74e8917f7467807a28296ed1 100644
--- a/test/MC/Hexagon/inst_cmp_ugt.ll
+++ b/test/MC/Hexagon/inst_cmp_ugt.ll
@@ -7,6 +7,6 @@ define i1 @foo (i32 %a, i32 %b)
   ret i1 %1
 }
 
-; CHECK: p0 = cmp.gtu(r0, r1)
+; CHECK: p0 = cmp.gtu(r0,r1)
 ; CHECK: r0 = p0
 ; CHECK: jumpr r31
diff --git a/test/MC/Hexagon/inst_cmp_ugti.ll b/test/MC/Hexagon/inst_cmp_ugti.ll
index 59db552b39f4da792ee3b5c490ec96f998b4c21b..63d94e4ff87ab87cb964cb7542d3704087a6fd8e 100644
--- a/test/MC/Hexagon/inst_cmp_ugti.ll
+++ b/test/MC/Hexagon/inst_cmp_ugti.ll
@@ -7,6 +7,6 @@ define i1 @foo (i32 %a)
   ret i1 %1
 }
 
-; CHECK: p0 = cmp.gtu(r0, #42)
+; CHECK: p0 = cmp.gtu(r0,#42)
 ; CHECK: r0 = p0
 ; CHECK: jumpr r31
diff --git a/test/MC/Hexagon/inst_cmp_ult.ll b/test/MC/Hexagon/inst_cmp_ult.ll
index c880ac8a229c9fc81fdf4174554935551f6a6555..ecda120a4598319315c1f06b3dce1929d7aabffb 100644
--- a/test/MC/Hexagon/inst_cmp_ult.ll
+++ b/test/MC/Hexagon/inst_cmp_ult.ll
@@ -7,6 +7,6 @@ define i1 @foo (i32 %a, i32 %b)
   ret i1 %1
 }
 
-; CHECK: p0 = cmp.gtu(r1, r0)
+; CHECK: p0 = cmp.gtu(r1,r0)
 ; CHECK: r0 = p0
-; CHECK: jumpr r31
\ No newline at end of file
+; CHECK: jumpr r31
diff --git a/test/MC/Hexagon/inst_select.ll b/test/MC/Hexagon/inst_select.ll
index 9d12c1de73fef347a852a287e67a0900b6bc7327..a730419c854a76c17bf277196f1856112af25e05 100644
--- a/test/MC/Hexagon/inst_select.ll
+++ b/test/MC/Hexagon/inst_select.ll
@@ -7,7 +7,7 @@ define i32 @foo (i1 %a, i32 %b, i32 %c)
   ret i32 %1
 }
 
-; CHECK: 00 40 00 85 85004000
+; CHECK: 00 40 40 85 85404000
 ; CHECK: 00 40 9f 52 529f4000
 ; CHECK: 00 60 01 74 74016000
-; CHECK: 00 e0 82 74 7482e000
\ No newline at end of file
+; CHECK: 00 e0 82 74 7482e000
diff --git a/test/MC/Hexagon/instructions/ld.s b/test/MC/Hexagon/instructions/ld.s
index 2695999aa85fe15f041f7f4e65fbf004f9a508ea..5d18e6a304926ee2239a894de7628eaab81e3065 100644
--- a/test/MC/Hexagon/instructions/ld.s
+++ b/test/MC/Hexagon/instructions/ld.s
@@ -1,6 +1,11 @@
 # RUN: llvm-mc -triple hexagon -filetype=obj -o - %s | llvm-objdump -d - | FileCheck %s
 # Hexagon Programmer's Reference Manual 11.5 LD
 
+# Load doubleword
+# CHECK: 90 ff d5 3a
+r17:16 = memd(r21 + r31<<#3)
+# CHECK: b0 c2 c0 49
+r17:16 = memd(gp+#168)
 # CHECK: 02 40 00 00
 # CHECK-NEXT: 10 c5 c0 49
 r17:16 = memd(##168)
@@ -62,7 +67,7 @@ if (!p3) r17:16 = memd(r21++#40)
 # CHECK: 91 ff 15 3a
 r17 = memb(r21 + r31<<#3)
 # CHECK: b1 c2 00 49
-r17 = memb(#21)
+r17 = memb(gp+#21)
 # CHECK: 00 40 00 00
 # CHECK-NEXT: b1 c2 00 49
 r17 = memb(##21)
@@ -140,7 +145,7 @@ r17:16 = memh_fifo(r21 ++ I:circ(m1))
 # CHECK: 91 ff 55 3a
 r17 = memh(r21 + r31<<#3)
 # CHECK: b1 c2 40 49
-r17 = memh(#42)
+r17 = memh(gp+#42)
 # CHECK: 00 40 00 00
 # CHECK-NEXT: 51 c5 40 49
 r17 = memh(##42)
@@ -202,7 +207,7 @@ if (!p3) r17 = memh(r21 + #62)
 # CHECK: 91 ff 35 3a
 r17 = memub(r21 + r31<<#3)
 # CHECK: b1 c2 20 49
-r17 = memub(#21)
+r17 = memub(gp+#21)
 # CHECK: 00 40 00 00
 # CHECK-NEXT: b1 c2 20 49
 r17 = memub(##21)
@@ -264,7 +269,7 @@ if (!p3) r17 = memub(r21++#5)
 # CHECK: 91 ff 75 3a
 r17 = memuh(r21 + r31<<#3)
 # CHECK: b1 c2 60 49
-r17 = memuh(#42)
+r17 = memuh(gp+#42)
 # CHECK: 00 40 00 00
 # CHECK-NEXT: 51 c5 60 49
 r17 = memuh(##42)
@@ -326,7 +331,7 @@ if (!p3) r17 = memuh(r21++#10)
 # CHECK: 91 ff 95 3a
 r17 = memw(r21 + r31<<#3)
 # CHECK: b1 c2 80 49
-r17 = memw(#84)
+r17 = memw(gp+#84)
 # CHECK: 01 40 00 00
 # CHECK-NEXT: 91 c2 80 49
 r17 = memw(##84)
diff --git a/test/MC/Hexagon/instructions/nv_st.s b/test/MC/Hexagon/instructions/nv_st.s
index 4ff490024a82a1588babd685bbb73bcddd02063f..46ab31ef2f75ad2a59d4cbf99ba6d5c70ebdf696 100644
--- a/test/MC/Hexagon/instructions/nv_st.s
+++ b/test/MC/Hexagon/instructions/nv_st.s
@@ -9,7 +9,7 @@
 # CHECK: 1f 40 7f 70
 # CHECK-NEXT: 11 c2 a0 48
 { r31 = r31
-  memb(#17) = r31.new }
+  memb(gp+#17) = r31.new }
 # CHECK: 1f 40 7f 70
 # CHECK-NEXT: 15 c2 b1 a1
 { r31 = r31
@@ -105,7 +105,7 @@
 # CHECK: 1f 40 7f 70
 # CHECK-NEXT: 15 ca a0 48
 { r31 = r31
-  memh(#42) = r31.new }
+  memh(gp+#42) = r31.new }
 # CHECK: 1f 40 7f 70
 # CHECK-NEXT: 15 ca b1 a1
 { r31 = r31
@@ -201,7 +201,7 @@
 # CHECK: 1f 40 7f 70
 # CHECK-NEXT: 15 d2 a0 48
 { r31 = r31
-  memw(#84) = r31.new }
+  memw(gp+#84) = r31.new }
 # CHECK: 1f 40 7f 70
 # CHECK-NEXT: 15 d2 b1 a1
 { r31 = r31
diff --git a/test/MC/Hexagon/instructions/st.s b/test/MC/Hexagon/instructions/st.s
index 3b5e8ee18100b78c1f987e2972affe57d5113555..6ea6e9f47f7766ba4f45f436289bbde76d5d4acb 100644
--- a/test/MC/Hexagon/instructions/st.s
+++ b/test/MC/Hexagon/instructions/st.s
@@ -5,7 +5,7 @@
 # CHECK: 9e f5 d1 3b
 memd(r17 + r21<<#3) = r31:30
 # CHECK: 28 d4 c0 48
-memd(#320) = r21:20
+memd(gp+#320) = r21:20
 # CHECK: 02 40 00 00
 # CHECK-NEXT: 28 d4 c0 48
 memd(##168) = r21:20
@@ -83,7 +83,7 @@ memb(r17 + r21<<#3) = r31
 # CHECK: 9f ca 11 3c
 memb(r17+#21)=#31
 # CHECK: 15 d5 00 48
-memb(#21) = r21
+memb(gp+#21) = r21
 # CHECK: 00 40 00 00
 # CHECK-NEXT: 15 d5 00 48
 memb(##21) = r21
@@ -183,9 +183,9 @@ memh(##42) = r21
 # CHECK-NEXT: 2a d5 60 48
 memh(##42) = r21.h
 # CHECK: 2a d5 40 48
-memh(#84) = r21
+memh(gp+#84) = r21
 # CHECK: 2a d5 60 48
-memh(#84) = r21.h
+memh(gp+#84) = r21.h
 # CHECK: 15 df 51 a1
 memh(r17+#42) = r31
 # CHECK: 15 df 71 a1
@@ -341,7 +341,7 @@ memw(r17 + r21<<#3) = r31
 # CHECK: 9f ca 51 3c
 memw(r17+#84)=#31
 # CHECK: 15 df 80 48
-memw(#84) = r31
+memw(gp+#84) = r31
 # CHECK: 01 40 00 00
 # CHECK-NEXT: 14 d5 80 48
 memw(##84) = r21
diff --git a/test/MC/Hexagon/instructions/system_user.s b/test/MC/Hexagon/instructions/system_user.s
index f0ead9645dd5895bd4a564a4525ec073ab913495..02c81fa099287b93486bd47a4642433671968dc2 100644
--- a/test/MC/Hexagon/instructions/system_user.s
+++ b/test/MC/Hexagon/instructions/system_user.s
@@ -57,6 +57,3 @@ syncht
 
 # CHECK: 18 df 00 54
 trap0(#254)
-
-# CHECK: 14 df 80 54
-trap1(#253)
diff --git a/test/MC/Hexagon/jumpdoublepound.s b/test/MC/Hexagon/jumpdoublepound.s
index 6b829360a906fdaeb25c858473f547a31829c2c7..8d0eef7fb60a16d845f4d3a5c72794f1c124dced 100644
--- a/test/MC/Hexagon/jumpdoublepound.s
+++ b/test/MC/Hexagon/jumpdoublepound.s
@@ -7,7 +7,7 @@ mylabel:
 # CHECK: if (p0) jump
 if (p0) jump ##mylabel
 
-# CHECK: if (cmp.gtu(r5.new, r4)) jump:t
+# CHECK: if (cmp.gtu(r5.new,r4)) jump:t
 { r5 = r4
   if (cmp.gtu(r5.new, r4)) jump:t ##mylabel }
 
diff --git a/test/MC/Hexagon/labels.s b/test/MC/Hexagon/labels.s
index d52ae004b07dd2a7470134218e8184f9b399f724..f2b62d1412ba6c424397a5a5440490efc5305c8b 100644
--- a/test/MC/Hexagon/labels.s
+++ b/test/MC/Hexagon/labels.s
@@ -10,17 +10,17 @@ r1:
 # CHECK: nop
 r3:nop
 
-# CHECK: r5:4 = combine(r5, r4)
+# CHECK: r5:4 = combine(r5,r4)
 r5:4 = r5:4
 
 # CHECK: r0 = r1
-# CHECK: p0 = tstbit(r0, #10)
+# CHECK: p0 = tstbit(r0,#10)
 # CHECK: if (!p0) jump
 1:r0=r1; p0=tstbit(r0, #10); if !p0 jump 1b;
 
 # CHECK: nop
-# CHECK: r1 = add(r1, #4)
-# CHECK: r5 = memw(r1 + #0)
+# CHECK: r1 = add(r1,#4)
+# CHECK: r5 = memw(r1+#0)
 # CHECK: endloop0
 b: { r5 = memw(r1)
-     r1 = add(r1, #4) } : endloop0
\ No newline at end of file
+     r1 = add(r1, #4) } : endloop0
diff --git a/test/MC/Hexagon/load-GPRel.s b/test/MC/Hexagon/load-GPRel.s
new file mode 100644
index 0000000000000000000000000000000000000000..88f33cd6d7eb113b52c5c265911b75988d8e25e8
--- /dev/null
+++ b/test/MC/Hexagon/load-GPRel.s
@@ -0,0 +1,33 @@
+#RUN: llvm-mc -arch=hexagon -filetype=obj %s | llvm-objdump -d - | FileCheck %s
+
+# Check encoding bits for GP-relative loads.
+
+#CHECK: 4fc6ff8c { r13:12 = memd(gp+#421856) }
+r13:12 = memd(gp+#421856)
+#CHECK: 4fc6ff8c { r13:12 = memd(gp+#421856) }
+r13:12 = memd(#421856)
+
+#CHECK: 4d1ac4d2 { r18 = memb(gp+#46118) }
+r18 = memb(gp+#46118)
+#CHECK: 4d1ac4d2 { r18 = memb(gp+#46118) }
+r18 = memb(#46118)
+
+#CHECK: 4d81f772 { r18 = memw(gp+#134892) }
+r18 = memw(gp+#134892)
+#CHECK: 4d81f772 { r18 = memw(gp+#134892) }
+r18 = memw(#134892)
+
+#CHECK: 497de287 { r7 = memuh(gp+#30248) }
+r7 = memuh(gp+#30248)
+#CHECK: 497de287 { r7 = memuh(gp+#30248) }
+r7 = memuh(#30248)
+
+#CHECK: 4b43e87a { r26 = memh(gp+#36486) }
+r26 = memh(gp+#36486)
+#CHECK: 4b43e87a { r26 = memh(gp+#36486) }
+r26 = memh(#36486)
+
+#CHECK: 4f37d07f { r31 = memub(gp+#61059) }
+r31 = memub(gp+#61059)
+#CHECK: 4f37d07f { r31 = memub(gp+#61059) }
+r31 = memub(#61059)
diff --git a/test/MC/Hexagon/missing_label.s b/test/MC/Hexagon/missing_label.s
new file mode 100644
index 0000000000000000000000000000000000000000..80f69472029c80a4e9feaa73913c40e1208262fa
--- /dev/null
+++ b/test/MC/Hexagon/missing_label.s
@@ -0,0 +1,8 @@
+# RUN: llvm-mc -arch=hexagon -filetype=obj %s | llvm-objdump -d - | FileCheck %s
+#
+
+.I1:
+nop
+
+# CHECK: .I1:
+# CHECK:        nop
diff --git a/test/MC/Hexagon/non-relocatable.s b/test/MC/Hexagon/non-relocatable.s
new file mode 100644
index 0000000000000000000000000000000000000000..72a17901c62267757f0e2b6795b5f2ad4a86bee0
--- /dev/null
+++ b/test/MC/Hexagon/non-relocatable.s
@@ -0,0 +1,10 @@
+# RUN: not llvm-mc -arch=hexagon -filetype=obj %s 2>%t; FileCheck %s <%t
+
+# Don't allow a symbolic operand for an insn that cannot take a
+# relocation.
+
+r7:6 = rol(r5:4,#r2)
+
+# This should produce an error
+#CHECK: error:
+
diff --git a/test/MC/Hexagon/not-over.s b/test/MC/Hexagon/not-over.s
new file mode 100644
index 0000000000000000000000000000000000000000..c31ce5312305090474f0d8dc249bbe9b7062d5af
--- /dev/null
+++ b/test/MC/Hexagon/not-over.s
@@ -0,0 +1,55 @@
+# RUN: llvm-mc -arch=hexagon -filetype=asm %s 2>%t; FileCheck %s <%t
+#
+
+# Check that proper packets are not wrongly flagged as invalid.
+
+1-3-4-f:
+	{
+	       r3 = memub(r2++#1)
+	       if (cmp.eq(r3.new,#0)) jump:nt .
+	       jumpr lr
+	       r4 = #4
+	}
+# CHECK-NOT: rror: invalid instruction packet
+
+1-3-f-f:
+        {
+                r3 = memub(r2++#1)
+                if (cmp.eq(r3.new,#0)) jump:nt .
+                r5 = #5
+                r4 = #4
+        }
+# CHECK-NOT: rror: invalid instruction packet
+
+# Special case of a fat packet that will slim when a compound is formed.
+3-3-8-c:
+   { LOOP0(3-3-8-c, R7)
+     P0 = CMP.GT(R7, #0)
+     IF (!P0.NEW) JUMP:NT .
+     R21:20 = MEMD(R0+#16)
+     R23:22 = MEMD(R0+#24)
+   }
+# CHECK-NOT: rror: invalid instruction packet
+
+1-f-f-f:
+        {
+                r3 = #3
+                if (cmp.eq(r3.new,#0)) jump:nt .
+                r5 = #5
+                r4 = #4
+        }
+# CHECK-NOT: rror: invalid instruction packet
+
+4:
+        jumpr lr
+# CHECK-NOT: rror: invalid instruction packet
+
+f-f-f-f:
+        {
+                r3 = #3
+                r2 = #2
+                r5 = #5
+                r4 = #4
+        }
+# CHECK-NOT: rror: invalid instruction packet
+
diff --git a/test/MC/Hexagon/not_found.s b/test/MC/Hexagon/not_found.s
new file mode 100644
index 0000000000000000000000000000000000000000..2403042792dd11f940fc4fad16b487e74657573f
--- /dev/null
+++ b/test/MC/Hexagon/not_found.s
@@ -0,0 +1,4 @@
+# RUN: not llvm-mc -arch=hexagon -filetype=asm junk123.s 2>%t ; FileCheck %s < %t
+#
+
+# CHECK: junk123.s: {{[N|n]}}o such file or directory
diff --git a/test/MC/Hexagon/offset.s b/test/MC/Hexagon/offset.s
new file mode 100644
index 0000000000000000000000000000000000000000..b079634814d0633b88bed41748aa6839fa5ed30d
--- /dev/null
+++ b/test/MC/Hexagon/offset.s
@@ -0,0 +1,7 @@
+# RUN: llvm-mc -arch=hexagon -filetype=obj %s | llvm-objdump -t - | FileCheck %s
+#
+
+sym_a:
+.set sym_d, sym_a + 8
+# CHECK: 00000000         .text 00000000 sym_a
+# CHECK: 00000008         .text 00000000 sym_d
diff --git a/test/MC/Hexagon/operand-range.s b/test/MC/Hexagon/operand-range.s
new file mode 100644
index 0000000000000000000000000000000000000000..c38aab7060ddf16014399e28337c3c54ec77f7c5
--- /dev/null
+++ b/test/MC/Hexagon/operand-range.s
@@ -0,0 +1,7 @@
+# RUN: not llvm-mc -arch=hexagon -filetype=asm %s 2>&1 | FileCheck %s
+
+# Expect errors here, insn needs to be extended
+R1 = mpyi(R2, #-256)
+# CHECK: error:
+R3 = mpyi(R4, #256)
+# CHECK: error:
diff --git a/test/MC/Hexagon/parse-pound-hi.s b/test/MC/Hexagon/parse-pound-hi.s
new file mode 100644
index 0000000000000000000000000000000000000000..5c6786481c72fd728a3e699eb4d734e6221fb618
--- /dev/null
+++ b/test/MC/Hexagon/parse-pound-hi.s
@@ -0,0 +1,60 @@
+# RUN: llvm-mc -arch=hexagon -filetype=obj %s | llvm-objdump -d - | FileCheck %s
+
+        memw(gp+#hi_htc_version) = r3
+#CHECK: 4880c300 { memw(gp+#0) = r3 }
+        memw(gp+#HI) = r3
+#CHECK: 4880c300 { memw(gp+#0) = r3 }
+        r3 = memw(gp+#HI)
+#CHECK: 4980c003 { r3 = memw(gp+#0) }
+        memw(gp+#HI_x) = r3
+#CHECK: 4880c300 { memw(gp+#0) = r3 }
+        r3 = memw(gp+#HI_x)
+#CHECK: 4980c003 { r3 = memw(gp+#0) }
+        memw(gp+#hi) = r3
+#CHECK: 4880c300 { memw(gp+#0) = r3 }
+        r3 = memw(gp+#hi)
+#CHECK: 4980c003 { r3 = memw(gp+#0) }
+        memw(gp+#hi_x) = r3
+#CHECK: 4880c300 { memw(gp+#0) = r3 }
+        r3 = memw(gp+#hi_x)
+#CHECK: 4980c003 { r3 = memw(gp+#0) }
+        memw(gp+#lo) = r3
+#CHECK: 4880c300 { memw(gp+#0) = r3 }
+        r3 = memw(gp+#lo)
+#CHECK: 4980c003 { r3 = memw(gp+#0) }
+        memw(gp+#lo_x) = r3
+#CHECK: 4880c300 { memw(gp+#0) = r3 }
+        r3 = memw(gp+#lo_x)
+#CHECK: 4980c003 { r3 = memw(gp+#0) }
+        memw(gp+#LO) = r3
+#CHECK: 4880c300 { memw(gp+#0) = r3 }
+        r3 = memw(gp+#lo)
+#CHECK: 4980c003 { r3 = memw(gp+#0) }
+        memw(gp+#LO_x) = r3
+#CHECK: 4880c300 { memw(gp+#0) = r3 }
+        r3 = memw(gp+#LO_x)
+#CHECK: 4980c003 { r3 = memw(gp+#0) }
+        r16.h = #HI(0x405000)
+#CHECK: 7230c040 { r16.h = #64 }
+        r16.h = #HI (0x405000)
+#CHECK: 7230c040 { r16.h = #64 }
+        r16.h = #hi(0x405000)
+#CHECK: 7230c040 { r16.h = #64 }
+        r16.h = #hi (0x405000)
+#CHECK: 7230c040 { r16.h = #64 }
+        r16.l = #LO(0x405020)
+#CHECK: 7170d020 { r16.l = #20512 }
+        r16.l = #LO (0x405020)
+#CHECK: 7170d020 { r16.l = #20512 }
+        r16.l = #lo(0x405020)
+#CHECK: 7170d020 { r16.l = #20512 }
+        r16.l = #lo (0x405020)
+#CHECK: 7170d020 { r16.l = #20512 }
+
+{
+  r19.h = #HI(-559030611)
+  memw(r17+#0) = r19.new
+}
+# CHECK: 72f35ead { r19.h = #57005
+# CHECK: a1b1d200   memw(r17+#0) = r19.new }
+
diff --git a/test/MC/Hexagon/reg_altnames.s b/test/MC/Hexagon/reg_altnames.s
new file mode 100644
index 0000000000000000000000000000000000000000..9c7f7e9b0bfaa07d891e967202e4734050e65ba4
--- /dev/null
+++ b/test/MC/Hexagon/reg_altnames.s
@@ -0,0 +1,10 @@
+# RUN: llvm-mc -triple hexagon -filetype=obj %s | llvm-objdump -d - | FileCheck %s
+
+# CHECK: 11 df 75 f1
+r17 = xor(r21, lr)
+
+# CHECK: 1d df 35 f3
+sp = sub(lr, r21)
+
+# CHECK: 15 c0 3e 71
+fp.l = #21
diff --git a/test/MC/Hexagon/register-alt-names.s b/test/MC/Hexagon/register-alt-names.s
index 97bfd32c51d9fc11694423521e06733a020b69d0..3e514661887e257768d75058f60bf6ac1f8d5aa1 100644
--- a/test/MC/Hexagon/register-alt-names.s
+++ b/test/MC/Hexagon/register-alt-names.s
@@ -9,6 +9,6 @@ r1 = fp
 # CHECK: r2 = r29
 r2 = sp
 
-# CHECK: r1:0 = combine(r31, r30)
+# CHECK: r1:0 = combine(r31,r30)
 r1:0 = lr:fp
 
diff --git a/test/MC/Hexagon/relaxed_newvalue.s b/test/MC/Hexagon/relaxed_newvalue.s
index 65fbd312e0ac16e05eedf93d01a16859c73842d6..4e8c6cc2cbc53db48d324f18d8b331bf25c1f202 100644
--- a/test/MC/Hexagon/relaxed_newvalue.s
+++ b/test/MC/Hexagon/relaxed_newvalue.s
@@ -1,9 +1,9 @@
 # RUN: llvm-mc -triple=hexagon -filetype=obj %s | llvm-objdump -d - | FileCheck %s
 # Make sure relaxation doesn't hinder newvalue calculation
 
-#CHECK: r18 = add(r2, #-6)
+#CHECK: r18 = add(r2,#-6)
 #CHECK-NEXT: immext(#0)
-#CHECK-NEXT: if (!cmp.gt(r18.new, #1)) jump:t
+#CHECK-NEXT: if (!cmp.gt(r18.new,#1)) jump:t
 {
   r18 = add(r2, #-6)
   if (!cmp.gt(r18.new, #1)) jump:t .unknown
diff --git a/test/MC/Hexagon/relocations.s b/test/MC/Hexagon/relocations.s
index 8b90bc7c0cdf22aac2b34989fd629ad8e59f6c00..4acc8084ae6afb4a41bbb1d8f42f1d75d4b07673 100644
--- a/test/MC/Hexagon/relocations.s
+++ b/test/MC/Hexagon/relocations.s
@@ -12,6 +12,14 @@ r_hex_b15_pcrel:
 r_hex_b7_pcrel:
 { loop1 (#undefined, #0) }
 
+# CHECK: R_HEX_LO16
+r_hex_lo16:
+{ r0.l = #lo(undefined) }
+
+# CHECK: R_HEX_HI16
+r_hex_hi16:
+{ r0.h = #hi(undefined) }
+
 # CHECK: R_HEX_32
 r_hex_32:
 .word undefined
@@ -30,19 +38,19 @@ r_hex_8:
 
 # CHECK: R_HEX_GPREL16_0
 r_hex_gprel16_0:
-{ r0 = memb (#undefined@gotrel) }
+{ r0 = memb (gp+#undefined) }
 
 # CHECK: R_HEX_GPREL16_1
 r_hex_gprel16_1:
-{ r0 = memh (#undefined@gotrel) }
+{ r0 = memh (gp+#undefined) }
 
 # CHECK: R_HEX_GPREL16_2
 r_hex_gprel16_2:
-{ r0 = memw (#undefined@gotrel) }
+{ r0 = memw (gp+#undefined) }
 
 # CHECK: R_HEX_GPREL16_3
 r_hex_gprel16_3:
-{ r1:0 = memd (#undefined@gotrel) }
+{ r1:0 = memd (gp+#undefined) }
 
 # CHECK: R_HEX_B13_PCREL
 r_hex_b13_pcrel:
@@ -68,10 +76,6 @@ r_hex_b22_pcrel_x:
 r_hex_b15_pcrel_x:
 { if (p0) jump ##undefined }
 
-# CHECK: R_HEX_B9_PCREL_X
-r_hex_b9_pcrel_x:
-{ r0 = #0 ; jump ##undefined }
-
 # CHECK: R_HEX_B7_PCREL_X
 r_hex_b7_pcrel_x:
 { loop1 (##undefined, #0) }
diff --git a/test/MC/Hexagon/store-GPRel.s b/test/MC/Hexagon/store-GPRel.s
new file mode 100644
index 0000000000000000000000000000000000000000..090a6d0059b0cf14a634cd7689de7ce03ac57809
--- /dev/null
+++ b/test/MC/Hexagon/store-GPRel.s
@@ -0,0 +1,46 @@
+#RUN: llvm-mc -arch=hexagon -filetype=obj %s | llvm-objdump -d -r - | FileCheck %s
+
+# Check encoding bits for gp-rel stores.
+
+#CHECK: 4ab3f229 memw(gp+#105636) = r12.new
+{ r12 = add(r0,r19)
+  memw(gp+#105636) = r12.new }
+
+#CHECK: 4ab3f229 memw(gp+#105636) = r12.new
+{ r12 = add(r0,r19)
+  memw(#105636) = r12.new }
+
+#CHECK: 4ebdca35 memh(gp+#128106) = r6.new
+{ r6 = add(r18,r13)
+  memh(gp+#128106) = r6.new }
+
+#CHECK: 4ebdca35 memh(gp+#128106) = r6.new
+{ r6 = add(r18,r13)
+  memh(#128106) = r6.new }
+
+#CHECK: 4eb3e2fc memb(gp+#59388) = r17.new
+{ r17 = add(r26,r18)
+  memb(gp+#59388) = r17.new }
+#CHECK: 4eb3e2fc memb(gp+#59388) = r17.new
+{ r17 = add(r26,r18)
+  memb(#59388) = r17.new }
+
+#CHECK: 4ad2ea01 { memd(gp+#206856) = r11:10
+{ memd(gp+#206856) = r11:10 }
+#CHECK: 4ad2ea01 { memd(gp+#206856) = r11:10
+{ memd(#206856) = r11:10 }
+
+#CHECK: 4c9dfa1e { memw(gp+#191608) = r26
+{ memw(gp+#191608) = r26 }
+#CHECK: 4c9dfa1e { memw(gp+#191608) = r26
+{ memw(#191608) = r26 }
+
+#CHECK: 4855cfdc { memh(gp+#21944) = r15
+{ memh(gp+#21944) = r15 }
+#CHECK: 4855cfdc { memh(gp+#21944) = r15
+{ memh(#21944) = r15 }
+
+#CHECK: 4a00cea2 { memb(gp+#16546) = r14
+{ memb(gp+#16546) = r14 }
+#CHECK: 4a00cea2 { memb(gp+#16546) = r14
+{ memb(#16546) = r14 }
diff --git a/test/MC/Hexagon/two-extenders.s b/test/MC/Hexagon/two-extenders.s
new file mode 100644
index 0000000000000000000000000000000000000000..314579270135753e41cac421a5f0b6ce72174f8c
--- /dev/null
+++ b/test/MC/Hexagon/two-extenders.s
@@ -0,0 +1,135 @@
+# RUN: llvm-mc -arch=hexagon -filetype=obj %s | llvm-objdump -d - | FileCheck %s
+#
+
+# Check that in packets with two extended instructions the assembler extends both
+#
+
+//['D_DUMMY,C4_or_or,L4_ploadrbtnew_abs,S2_storerfgp']
+{
+        if (p3) r23 = memb(##2164335510)
+        memh(##1696682668) = r28.h
+}
+# CHECK: { immext(#2164335488)
+# CHECK:   if (p3) r23 = memb(##2164335510)
+# CHECK:   immext(#1696682624)
+# CHECK:   memh(##1696682668) = r28.h }
+
+//['D_DUMMY,C4_or_or,L4_ploadrbtnew_abs,S2_storerfgp']
+{
+        if (p3.new) r23 = memb(##2164335510)
+        p3 = or(p2,or(p3, p0))
+}
+# CHECK: { p3 = or(p2,or(p3,p0))
+# CHECK:   immext(#2164335488)
+# CHECK:   if (p3.new) r23 = memb(##2164335510) }
+
+
+# -------------------------- Non-extended cases:
+# -------------------------- Use GP and non GP notation
+
+R2 = memb(gp+#0x1000)
+# CHECK: { r2 = memb(gp+#4096) }
+
+R3 = memh(gp+#0x1000)
+# CHECK: { r3 = memh(gp+#4096) }
+
+r4 = memub(gp+#0x1000)
+# CHECK: { r4 = memub(gp+#4096) }
+
+r5 = memuh(gp+#0x1000)
+# CHECK: { r5 = memuh(gp+#4096) }
+
+r6 = memw(gp+#0x1000)
+# CHECK: { r6 = memw(gp+#4096) }
+
+R1:0 = memd(gp+#0x1000)
+# CHECK: { r1:0 = memd(gp+#4096) }
+
+{R25 = #1; memb(gp+#0x1000) = R25.new}
+# CHECK: { r25 = #1
+# CHECK-NEXT: memb(gp+#4096) = r25.new }
+
+{R26 = #1; memh(gp+#0x1000) = R26.new}
+# CHECK: { r26 = #1
+# CHECK-NEXT: memh(gp+#4096) = r26.new }
+
+{R27 = #1; memw(gp+#0x1000) = R27.new}
+# CHECK: { r27 = #1
+# CHECK-NEXT: memw(gp+#4096) = r27.new }
+
+memd(gp+#0x1000) = R1:0
+# CHECK: { memd(gp+#4096) = r1:0 }
+
+memb(gp+#0x1000) = R2
+# CHECK: { memb(gp+#4096) = r2 }
+
+memh(gp+#0x1000) = r3.h
+# CHECK: { memh(gp+#4096) = r3.h }
+
+memh(gp+#0x1000) = R4
+# CHECK: { memh(gp+#4096) = r4 }
+
+memw(gp+#0x1000) = R5
+# CHECK: { memw(gp+#4096) = r5 }
+
+# -------------------------- Extended cases:
+# -------------------------- Use GP and non GP notation
+
+R11:10 = memd(##0x1000)
+# CHECK: { immext(#4096)
+# CHECK-NEXT: r11:10 = memd(##4096) }
+
+R11 = memb(##0x1000)
+# CHECK: { immext(#4096)
+# CHECK-NEXT: r11 = memb(##4096) }
+
+R12 = memh(##0x1000)
+# CHECK: { immext(#4096)
+# CHECK-NEXT: r12 = memh(##4096) }
+
+r13 = memub(##0x1000)
+# CHECK: { immext(#4096)
+# CHECK-NEXT: r13 = memub(##4096) }
+
+r14 = memuh(##0x1000)
+# CHECK: { immext(#4096)
+# CHECK-NEXT: r14 = memuh(##4096) }
+
+r15 = memw(##0x1000)
+# CHECK: { immext(#4096)
+# CHECK-NEXT: r15 = memw(##4096) }
+
+{R22 = #1; memb(##0x1000) = R22.new}
+# CHECK: { r22 = #1
+# CHECK-NEXT: immext(#4096)
+# CHECK-NEXT: memb(##4096) = r22.new }
+
+{R23 = #1; memh(##0x1000) = R23.new}
+# CHECK: { r23 = #1
+# CHECK-NEXT: immext(#4096)
+# CHECK-NEXT: memh(##4096) = r23.new }
+
+{R24 = #1; memw(##0x1000) = R24.new}
+# CHECK: { r24 = #1
+# CHECK-NEXT: immext(#4096)
+# CHECK-NEXT: memw(##4096) = r24.new }
+
+memd(##0x1000) = R17:16
+# CHECK: { immext(#4096)
+# CHECK-NEXT: memd(##4096) = r17:16 }
+
+memb(##0x1000) = R18
+# CHECK: { immext(#4096)
+# CHECK-NEXT: memb(##4096) = r18 }
+
+memh(##0x1000) = r19.h
+# CHECK: { immext(#4096)
+# CHECK-NEXT: memh(##4096) = r19.h }
+
+memh(##0x1000) = R20
+# CHECK: { immext(#4096)
+# CHECK-NEXT: memh(##4096) = r20 }
+
+memw(##0x1000) = R21
+# CHECK: { immext(#4096)
+# CHECK-NEXT: memw(##4096) = r21 }
diff --git a/test/MC/Hexagon/v60-misc.s b/test/MC/Hexagon/v60-misc.s
index e16034948dc35fbe3f3415ab713cd36b86381058..b278447ab100cc387e55964345a095286232619a 100644
--- a/test/MC/Hexagon/v60-misc.s
+++ b/test/MC/Hexagon/v60-misc.s
@@ -14,10 +14,10 @@ if (p2) jumpr r0
 # CHECK: 5361c300 { if (!p3) jumpr:nt
 if (!p3) jumpr r1
 
-# CHECK: 1c2eceee { v14 = vxor(v14,{{ *}}v14) }
+# CHECK: 1c2eceee { v14 = vxor(v14,v14) }
 v14 = #0
 
-# CHECK: 1c80c0a0 { v1:0.w = vsub(v1:0.w,v1:0.w) }
+# CHECK: 1c9edea0 { v1:0.w = vsub(v31:30.w,v31:30.w) }
 v1:0 = #0
 
 # CHECK: 1f42c3e0 { v1:0 = vcombine(v3,v2) }
@@ -53,7 +53,7 @@ q0 = vcmp.eq(v8.uw, v9.uw)
 # CHECK: 1c8aea09 { q1 &= vcmp.eq(v10.w,v10.w) }
 q1 &= vcmp.eq(v10.uw, v10.uw)
 
-# CHECK: 1c8ceb46 { q2 |= vcmp.eq(v11.h,v12.h) }
+# CHECK: 1c8ceb4a { q2 |= vcmp.eq(v11.w,v12.w) }
 q2 |= vcmp.eq(v11.uw, v12.uw)
 
 # CHECK: 1c8eed8b { q3 ^= vcmp.eq(v13.w,v14.w) }
diff --git a/test/MC/Hexagon/v60-vmem.s b/test/MC/Hexagon/v60-vmem.s
index fe202251ec4b81811263a170fb8ff5053e52d961..0580a1e62448fac9f7dc2cd372b648382ecea588 100644
--- a/test/MC/Hexagon/v60-vmem.s
+++ b/test/MC/Hexagon/v60-vmem.s
@@ -327,25 +327,25 @@
   vmem(r6+#-6):nt=v16.new
 }
 
-#CHECK: 28b1cd42 if(p1) vmem(r17+#5) = v17.new }
+#CHECK: 28b1cd42 if (p1) vmem(r17+#5) = v17.new }
 {
   v17 = v25
   if(p1)vmem(r17+#5)=v17.new
 }
 
-#CHECK: 28bbeb6a if(!p1) vmem(r27+#-5) = v17.new }
+#CHECK: 28bbeb6a if (!p1) vmem(r27+#-5) = v17.new }
 {
   v17 = v15
   if(!p1)vmem(r27+#-5)=v17.new
 }
 
-#CHECK: 28e4d252 if(p2) vmem(r4+#2):nt = v24.new }
+#CHECK: 28e4d252 if (p2) vmem(r4+#2):nt = v24.new }
 {
   v24 = v10
   if(p2)vmem(r4+#2):nt=v24.new
 }
 
-#CHECK: 28f8d17a if(!p2) vmem(r24+#1):nt = v4.new }
+#CHECK: 28f8d17a if (!p2) vmem(r24+#1):nt = v4.new }
 {
   v4 = v8
   if(!p2)vmem(r24+#1):nt=v4.new
@@ -363,25 +363,25 @@
   vmem(r1++#1):nt=v7.new
 }
 
-#CHECK: 29a6d042 if(p2) vmem(r6++#0) = v11.new }
+#CHECK: 29a6d042 if (p2) vmem(r6++#0) = v11.new }
 {
   v11 = v13
   if(p2)vmem(r6++#0)=v11.new
 }
 
-#CHECK: 29a2cb6a if(!p1) vmem(r2++#3) = v25.new }
+#CHECK: 29a2cb6a if (!p1) vmem(r2++#3) = v25.new }
 {
   v25 = v17
   if(!p1)vmem(r2++#3)=v25.new
 }
 
-#CHECK: 29f5c952 if(p1) vmem(r21++#1):nt = v14.new }
+#CHECK: 29f5c952 if (p1) vmem(r21++#1):nt = v14.new }
 {
   v14 = v13
   if(p1)vmem(r21++#1):nt=v14.new
 }
 
-#CHECK: 29f7cd7a if(!p1) vmem(r23++#-3):nt = v1.new }
+#CHECK: 29f7cd7a if (!p1) vmem(r23++#-3):nt = v1.new }
 {
   v1 = v0
   if(!p1)vmem(r23++#-3):nt=v1.new
@@ -399,25 +399,25 @@
   vmem(r15++m0):nt=v19.new
 }
 
-#CHECK: 2bb7f042 if(p2) vmem(r23++m1) = v6.new }
+#CHECK: 2bb7f042 if (p2) vmem(r23++m1) = v6.new }
 {
   v6 = v30
   if(p2)vmem(r23++m1)=v6.new
 }
 
-#CHECK: 2ba2f06a if(!p2) vmem(r2++m1) = v12.new }
+#CHECK: 2ba2f06a if (!p2) vmem(r2++m1) = v12.new }
 {
   v12 = v9
   if(!p2)vmem(r2++m1)=v12.new
 }
 
-#CHECK: 2be7e852 if(p1) vmem(r7++m1):nt = v3.new }
+#CHECK: 2be7e852 if (p1) vmem(r7++m1):nt = v3.new }
 {
   v3 = v13
   if(p1)vmem(r7++m1):nt=v3.new
 }
 
-#CHECK: 2bfdd07a if(!p2) vmem(r29++m0):nt = v29.new }
+#CHECK: 2bfdd07a if (!p2) vmem(r29++m0):nt = v29.new }
 {
   v29 = v9
   if(!p2)vmem(r29++m0):nt=v29.new
diff --git a/test/MC/Hexagon/v62_all.s b/test/MC/Hexagon/v62_all.s
new file mode 100644
index 0000000000000000000000000000000000000000..6effdc0caba9b06216fb6d756f28f9a1c5e4df48
--- /dev/null
+++ b/test/MC/Hexagon/v62_all.s
@@ -0,0 +1,552 @@
+# RUN: llvm-mc -arch=hexagon -mcpu=hexagonv62 -filetype=obj %s | llvm-objdump -arch=hexagon -mcpu=hexagonv62 -d - | FileCheck %s
+
+//   V6_lvsplatb
+//   Vd32.b=vsplat(Rt32)
+     V0.b=vsplat(R0)
+# CHECK: 19c0c040 { v0.b = vsplat(r0) }
+
+//   V6_lvsplath
+//   Vd32.h=vsplat(Rt32)
+     V0.h=vsplat(R0)
+# CHECK: 19c0c020 { v0.h = vsplat(r0) }
+
+//   V6_pred_scalar2v2
+//   Qd4=vsetq2(Rt32)
+     Q0=vsetq2(R0)
+# CHECK: 19a0c04c { q0 = vsetq2(r0) }
+
+//   V6_shuffeqh
+//   Qd4.b=vshuffe(Qs4.h,Qt4.h)
+     Q0.b=vshuffe(Q0.h,Q0.h)
+# CHECK: 1e03c018 { q0.b = vshuffe(q0.h,q0.h) }
+
+//   V6_shuffeqw
+//   Qd4.h=vshuffe(Qs4.w,Qt4.w)
+     Q0.h=vshuffe(Q0.w,Q0.w)
+# CHECK: 1e03c01c { q0.h = vshuffe(q0.w,q0.w) }
+
+//   V6_vaddbsat
+//   Vd32.b=vadd(Vu32.b,Vv32.b):sat
+     V0.b=vadd(V0.b,V0.b):sat
+# CHECK: 1f00c000 { v0.b = vadd(v0.b,v0.b):sat }
+
+//   V6_vaddbsat_dv
+//   Vdd32.b=vadd(Vuu32.b,Vvv32.b):sat
+     V1:0.b=vadd(V1:0.b,V1:0.b):sat
+# CHECK: 1ea0c000 { v1:0.b = vadd(v1:0.b,v1:0.b):sat }
+
+//   V6_vaddcarry
+//   Vd32.w=vadd(Vu32.w,Vv32.w,Qx4):carry
+     V0.w=vadd(V0.w,V0.w,Q0):carry
+# CHECK: 1ca0e000 { v0.w = vadd(v0.w,v0.w,q0):carry }
+
+//   V6_vaddclbh
+//   $Vd.h=vadd(vclb($Vu.h),$Vv.h)
+     V0.h=vadd(vclb(V0.h),V0.h)
+# CHECK: 1f00e000 { v0.h = vadd(vclb(v0.h),v0.h) }
+
+//   V6_vaddclbw
+//   $Vd.w=vadd(vclb($Vu.w),$Vv.w)
+     V0.w=vadd(vclb(V0.w),V0.w)
+# CHECK: 1f00e020 { v0.w = vadd(vclb(v0.w),v0.w) }
+
+//   V6_vaddhw_acc
+//   Vxx32.w+=vadd(Vu32.h,Vv32.h)
+     V1:0.w+=vadd(V0.h,V0.h)
+# CHECK: 1c20e040 { v1:0.w += vadd(v0.h,v0.h) }
+
+//   V6_vaddubh_acc
+//   Vxx32.h+=vadd(Vu32.ub,Vv32.ub)
+     V1:0.h+=vadd(V0.ub,V0.ub)
+# CHECK: 1c40e0a0 { v1:0.h += vadd(v0.ub,v0.ub) }
+
+//   V6_vaddububb_sat
+//   Vd32.ub=vadd(Vu32.ub,Vv32.b):sat
+     V0.ub=vadd(V0.ub,V0.b):sat
+# CHECK: 1ea0c080 { v0.ub = vadd(v0.ub,v0.b):sat }
+
+//   V6_vadduhw_acc
+//   Vxx32.w+=vadd(Vu32.uh,Vv32.uh)
+     V1:0.w+=vadd(V0.uh,V0.uh)
+# CHECK: 1c40e080 { v1:0.w += vadd(v0.uh,v0.uh) }
+
+//   V6_vadduwsat
+//   Vd32.uw=vadd(Vu32.uw,Vv32.uw):sat
+     V0.uw=vadd(V0.uw,V0.uw):sat
+# CHECK: 1f60c020 { v0.uw = vadd(v0.uw,v0.uw):sat }
+
+//   V6_vadduwsat_dv
+//   Vdd32.uw=vadd(Vuu32.uw,Vvv32.uw):sat
+     V1:0.uw=vadd(V1:0.uw,V1:0.uw):sat
+# CHECK: 1ea0c040 { v1:0.uw = vadd(v1:0.uw,v1:0.uw):sat }
+
+//   V6_vandnqrt
+//   Vd32=vand(!Qu4,Rt32)
+     V0=vand(!Q0,R0)
+# CHECK: 19a0c4a0 { v0 = vand(!q0,r0) }
+
+//   V6_vandnqrt_acc
+//   Vx32|=vand(!Qu4,Rt32)
+     V0|=vand(!Q0,R0)
+# CHECK: 1960e460 { v0 |= vand(!q0,r0) }
+
+//   V6_vandvnqv
+//   Vd32=vand(!Qv4,Vu32)
+     V0=vand(!Q0,V0)
+# CHECK: 1e03e020 { v0 = vand(!q0,v0) }
+
+//   V6_vandvqv
+//   Vd32=vand(Qv4,Vu32)
+     V0=vand(Q0,V0)
+# CHECK: 1e03e000 { v0 = vand(q0,v0) }
+
+//   V6_vasrhbsat
+//   Vd32.b=vasr(Vu32.h,Vv32.h,Rt8):sat
+     V0.b=vasr(V0.h,V0.h,R0):sat
+# CHECK: 1800c000 { v0.b = vasr(v0.h,v0.h,r0):sat }
+
+//   V6_vasruwuhrndsat
+//   Vd32.uh=vasr(Vu32.uw,Vv32.uw,Rt8):rnd:sat
+     V0.uh=vasr(V0.uw,V0.uw,R0):rnd:sat
+# CHECK: 1800c020 { v0.uh = vasr(v0.uw,v0.uw,r0):rnd:sat }
+
+//   V6_vasrwuhrndsat
+//   Vd32.uh=vasr(Vu32.w,Vv32.w,Rt8):rnd:sat
+     V0.uh=vasr(V0.w,V0.w,R0):rnd:sat
+# CHECK: 1800c040 { v0.uh = vasr(v0.w,v0.w,r0):rnd:sat }
+
+//   V6_vL32b_cur_npred_ai
+//   if (!Pv4) Vd32.cur=vmem(Rt32+#s4)
+     {
+     v1=v0
+     if (!P0) V0.cur=vmem(R0+#04)
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 2880c4a0   if (!p0) v0.cur = vmem(r0+#4) }
+
+//   V6_vL32b_cur_npred_pi
+//   if (!Pv4) Vd32.cur=vmem(Rx32++#s3)
+     {
+     v1=v0
+     if (!P0) V0.cur=vmem(R0++#03)
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 2980c3a0   if (!p0) v0.cur = vmem(r0++#3) }
+
+//   V6_vL32b_cur_npred_ppu
+//   if (!Pv4) Vd32.cur=vmem(Rx32++Mu2)
+     {
+     v1=v0
+     if (!P0) V0.cur=vmem(R0++M0)
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 2b80c0a0   if (!p0) v0.cur = vmem(r0++m0) }
+
+//   V6_vL32b_cur_pred_ai
+//   if (Pv4) Vd32.cur=vmem(Rt32+#s4)
+     {
+     v1=v0
+     if (P0) V0.cur=vmem(R0+#04)
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 2880c480   if (p0) v0.cur = vmem(r0+#4) }
+
+//   V6_vL32b_cur_pred_pi
+//   if (Pv4) Vd32.cur=vmem(Rx32++#s3)
+     {
+     v1=v0
+     if (P0) V0.cur=vmem(R0++#03)
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 2980c380   if (p0) v0.cur = vmem(r0++#3) }
+
+//   V6_vL32b_cur_pred_ppu
+//   if (Pv4) Vd32.cur=vmem(Rx32++Mu2)
+     {
+     v1=v0
+     if (P0) V0.cur=vmem(R0++M0)
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 2b80c080   if (p0) v0.cur = vmem(r0++m0) }
+
+//   V6_vL32b_npred_ai
+//   if (!Pv4) Vd32=vmem(Rt32+#s4)
+     if (!P0) V0=vmem(R0+#04)
+# CHECK: 2880c460 { if (!p0) v0 = vmem(r0+#4) }
+
+//   V6_vL32b_npred_pi
+//   if (!Pv4) Vd32=vmem(Rx32++#s3)
+     if (!P0) V0=vmem(R0++#03)
+# CHECK: 2980c360 { if (!p0) v0 = vmem(r0++#3) }
+
+//   V6_vL32b_npred_ppu
+//   if (!Pv4) Vd32=vmem(Rx32++Mu2)
+     if (!P0) V0=vmem(R0++M0)
+# CHECK: 2b80c060 { if (!p0) v0 = vmem(r0++m0) }
+
+//   V6_vL32b_nt_cur_npred_ai
+//   if (!Pv4) Vd32.cur=vmem(Rt32+#s4):nt
+     {
+     v1=v0
+     if (!P0) V0.cur=vmem(R0+#04):nt
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 28c0c4a0   if (!p0) v0.cur = vmem(r0+#4):nt }
+
+//   V6_vL32b_nt_cur_npred_pi
+//   if (!Pv4) Vd32.cur=vmem(Rx32++#s3):nt
+     {
+     v1=v0
+     if (!P0) V0.cur=vmem(R0++#03):nt
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 29c0c3a0   if (!p0) v0.cur = vmem(r0++#3):nt }
+
+//   V6_vL32b_nt_cur_npred_ppu
+//   if (!Pv4) Vd32.cur=vmem(Rx32++Mu2):nt
+     {
+     v1=v0
+     if (!P0) V0.cur=vmem(R0++M0):nt
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 2bc0c0a0   if (!p0) v0.cur = vmem(r0++m0):nt }
+
+//   V6_vL32b_nt_cur_pred_ai
+//   if (Pv4) Vd32.cur=vmem(Rt32+#s4):nt
+     {
+     v1=v0
+     if (P0) V0.cur=vmem(R0+#04):nt
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 28c0c480   if (p0) v0.cur = vmem(r0+#4):nt }
+
+//   V6_vL32b_nt_cur_pred_pi
+//   if (Pv4) Vd32.cur=vmem(Rx32++#s3):nt
+     {
+     v1=v0
+     if (P0) V0.cur=vmem(R0++#03):nt
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 29c0c380   if (p0) v0.cur = vmem(r0++#3):nt }
+
+//   V6_vL32b_nt_cur_pred_ppu
+//   if (Pv4) Vd32.cur=vmem(Rx32++Mu2):nt
+     {
+     v1=v0
+     if (P0) V0.cur=vmem(R0++M0):nt
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 2bc0c080   if (p0) v0.cur = vmem(r0++m0):nt }
+
+//   V6_vL32b_nt_npred_ai
+//   if (!Pv4) Vd32=vmem(Rt32+#s4):nt
+     if (!P0) V0=vmem(R0+#04):nt
+# CHECK: 28c0c460 { if (!p0) v0 = vmem(r0+#4):nt }
+
+//   V6_vL32b_nt_npred_pi
+//   if (!Pv4) Vd32=vmem(Rx32++#s3):nt
+     if (!P0) V0=vmem(R0++#03):nt
+# CHECK: 29c0c360 { if (!p0) v0 = vmem(r0++#3):nt }
+
+//   V6_vL32b_nt_npred_ppu
+//   if (!Pv4) Vd32=vmem(Rx32++Mu2):nt
+     if (!P0) V0=vmem(R0++M0):nt
+# CHECK: 2bc0c060 { if (!p0) v0 = vmem(r0++m0):nt }
+
+//   V6_vL32b_nt_pred_ai
+//   if (Pv4) Vd32=vmem(Rt32+#s4):nt
+     if (P0) V0=vmem(R0+#04):nt
+# CHECK: 28c0c440 { if (p0) v0 = vmem(r0+#4):nt }
+
+//   V6_vL32b_nt_pred_pi
+//   if (Pv4) Vd32=vmem(Rx32++#s3):nt
+     if (P0) V0=vmem(R0++#03):nt
+# CHECK: 29c0c340 { if (p0) v0 = vmem(r0++#3):nt }
+
+//   V6_vL32b_nt_pred_ppu
+//   if (Pv4) Vd32=vmem(Rx32++Mu2):nt
+     if (P0) V0=vmem(R0++M0):nt
+# CHECK: 2bc0c040 { if (p0) v0 = vmem(r0++m0):nt }
+
+//   V6_vL32b_nt_tmp_npred_ai
+//   if (!Pv4) Vd32.tmp=vmem(Rt32+#s4):nt
+     {
+     v1=v0
+     if (!P0) V0.tmp=vmem(R0+#04):nt
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 28c0c4e0   if (!p0) v0.tmp = vmem(r0+#4):nt }
+
+//   V6_vL32b_nt_tmp_npred_pi
+//   if (!Pv4) Vd32.tmp=vmem(Rx32++#s3):nt
+     {
+     v1=v0
+     if (!P0) V0.tmp=vmem(R0++#03):nt
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 29c0c3e0   if (!p0) v0.tmp = vmem(r0++#3):nt }
+
+//   V6_vL32b_nt_tmp_npred_ppu
+//   if (!Pv4) Vd32.tmp=vmem(Rx32++Mu2):nt
+     {
+     v1=v0
+     if (!P0) V0.tmp=vmem(R0++M0):nt
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 2bc0c0e0   if (!p0) v0.tmp = vmem(r0++m0):nt }
+
+//   V6_vL32b_nt_tmp_pred_ai
+//   if (Pv4) Vd32.tmp=vmem(Rt32+#s4):nt
+     {
+     v1=v0
+     if (P0) V0.tmp=vmem(R0+#04):nt
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 28c0c4c0   if (p0) v0.tmp = vmem(r0+#4):nt }
+
+//   V6_vL32b_nt_tmp_pred_pi
+//   if (Pv4) Vd32.tmp=vmem(Rx32++#s3):nt
+     {
+     v1=v0
+     if (P0) V0.tmp=vmem(R0++#03):nt
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 29c0c3c0   if (p0) v0.tmp = vmem(r0++#3):nt }
+
+//   V6_vL32b_nt_tmp_pred_ppu
+//   if (Pv4) Vd32.tmp=vmem(Rx32++Mu2):nt
+     {
+     v1=v0
+     if (P0) V0.tmp=vmem(R0++M0):nt
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 2bc0c0c0   if (p0) v0.tmp = vmem(r0++m0):nt }
+
+//   V6_vL32b_pred_ai
+//   if (Pv4) Vd32=vmem(Rt32+#s4)
+     if (P0) V0=vmem(R0+#04)
+# CHECK: 2880c440 { if (p0) v0 = vmem(r0+#4) }
+
+//   V6_vL32b_pred_pi
+//   if (Pv4) Vd32=vmem(Rx32++#s3)
+     if (P0) V0=vmem(R0++#03)
+# CHECK: 2980c340 { if (p0) v0 = vmem(r0++#3) }
+
+//   V6_vL32b_pred_ppu
+//   if (Pv4) Vd32=vmem(Rx32++Mu2)
+     if (P0) V0=vmem(R0++M0)
+# CHECK: 2b80c040 { if (p0) v0 = vmem(r0++m0) }
+
+//   V6_vL32b_tmp_npred_ai
+//   if (!Pv4) Vd32.tmp=vmem(Rt32+#s4)
+     {
+     v1=v0
+     if (!P0) V0.tmp=vmem(R0+#04)
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 2880c4e0   if (!p0) v0.tmp = vmem(r0+#4) }
+
+//   V6_vL32b_tmp_npred_pi
+//   if (!Pv4) Vd32.tmp=vmem(Rx32++#s3)
+     {
+     v1=v0
+     if (!P0) V0.tmp=vmem(R0++#03)
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 2980c3e0   if (!p0) v0.tmp = vmem(r0++#3) }
+
+//   V6_vL32b_tmp_npred_ppu
+//   if (!Pv4) Vd32.tmp=vmem(Rx32++Mu2)
+     {
+     v1=v0
+     if (!P0) V0.tmp=vmem(R0++M0)
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 2b80c0e0   if (!p0) v0.tmp = vmem(r0++m0) }
+
+//   V6_vL32b_tmp_pred_ai
+//   if (Pv4) Vd32.tmp=vmem(Rt32+#s4)
+     {
+     v1=v0
+     if (P0) V0.tmp=vmem(R0+#04)
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 2880c4c0   if (p0) v0.tmp = vmem(r0+#4) }
+
+//   V6_vL32b_tmp_pred_pi
+//   if (Pv4) Vd32.tmp=vmem(Rx32++#s3)
+     {
+     v1=v0
+     if (P0) V0.tmp=vmem(R0++#03)
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 2980c3c0   if (p0) v0.tmp = vmem(r0++#3) }
+
+//   V6_vL32b_tmp_pred_ppu
+//   if (Pv4) Vd32.tmp=vmem(Rx32++Mu2)
+     {
+     v1=v0
+     if (P0) V0.tmp=vmem(R0++M0)
+     }
+# CHECK: 1e0360e1 { v1 = v0
+# CHECK: 2b80c0c0   if (p0) v0.tmp = vmem(r0++m0) }
+
+//   V6_vlsrb
+//   Vd32.ub=vlsr(Vu32.ub,Rt32)
+     V0.ub=vlsr(V0.ub,R0)
+# CHECK: 1980c060 { v0.ub = vlsr(v0.ub,r0) }
+
+//   V6_vlutvvbi
+//   Vd32.b=vlut32(Vu32.b,Vv32.b,#u3)
+     V0.b=vlut32(V0.b,V0.b,#03)
+# CHECK: 1e20c060 { v0.b = vlut32(v0.b,v0.b,#3) }
+
+//   V6_vlutvvb_nm
+//   Vd32.b=vlut32(Vu32.b,Vv32.b,Rt8):nomatch
+     V0.b=vlut32(V0.b,V0.b,R0):nomatch
+# CHECK: 1800c060 { v0.b = vlut32(v0.b,v0.b,r0):nomatch }
+
+//   V6_vlutvvb_oracci
+//   Vx32.b|=vlut32(Vu32.b,Vv32.b,#u3)
+     V0.b|=vlut32(V0.b,V0.b,#03)
+# CHECK: 1cc0e060 { v0.b |= vlut32(v0.b,v0.b,#3) }
+
+//   V6_vlutvwhi
+//   Vdd32.h=vlut16(Vu32.b,Vv32.h,#u3)
+     V1:0.h=vlut16(V0.b,V0.h,#03)
+# CHECK: 1e60c060 { v1:0.h = vlut16(v0.b,v0.h,#3) }
+
+//   V6_vlutvwh_nm
+//   Vdd32.h=vlut16(Vu32.b,Vv32.h,Rt8):nomatch
+     V1:0.h=vlut16(V0.b,V0.h,R0):nomatch
+# CHECK: 1800c080 { v1:0.h = vlut16(v0.b,v0.h,r0):nomatch }
+
+//   V6_vlutvwh_oracci
+//   Vxx32.h|=vlut16(Vu32.b,Vv32.h,#u3)
+     V1:0.h|=vlut16(V0.b,V0.h,#03)
+# CHECK: 1ce0e060 { v1:0.h |= vlut16(v0.b,v0.h,#3) }
+
+//   V6_vmaxb
+//   Vd32.b=vmax(Vu32.b,Vv32.b)
+     V0.b=vmax(V0.b,V0.b)
+# CHECK: 1f20c0a0 { v0.b = vmax(v0.b,v0.b) }
+
+//   V6_vminb
+//   Vd32.b=vmin(Vu32.b,Vv32.b)
+     V0.b=vmin(V0.b,V0.b)
+# CHECK: 1f20c080 { v0.b = vmin(v0.b,v0.b) }
+
+//   V6_vmpauhb
+//   Vdd32.w=vmpa(Vuu32.uh,Rt32.b)
+     V1:0.w=vmpa(V1:0.uh,R0.b)
+# CHECK: 1980c0a0 { v1:0.w = vmpa(v1:0.uh,r0.b) }
+
+//   V6_vmpauhb_acc
+//   Vxx32.w+=vmpa(Vuu32.uh,Rt32.b)
+     V1:0.w+=vmpa(V1:0.uh,R0.b)
+# CHECK: 1980e040 { v1:0.w += vmpa(v1:0.uh,r0.b) }
+
+//   V6_vmpyewuh_64
+//   Vdd32=vmpye(Vu32.w,Vv32.uh)
+     V1:0=vmpye(V0.w,V0.uh)
+# CHECK: 1ea0c0c0 { v1:0 = vmpye(v0.w,v0.uh) }
+
+//   V6_vmpyiwub
+//   Vd32.w=vmpyi(Vu32.w,Rt32.ub)
+     V0.w=vmpyi(V0.w,R0.ub)
+# CHECK: 1980c0c0 { v0.w = vmpyi(v0.w,r0.ub) }
+
+//   V6_vmpyiwub_acc
+//   Vx32.w+=vmpyi(Vu32.w,Rt32.ub)
+     V0.w+=vmpyi(V0.w,R0.ub)
+# CHECK: 1980e020 { v0.w += vmpyi(v0.w,r0.ub) }
+
+//   V6_vmpyowh_64_acc
+//   Vxx32+=vmpyo(Vu32.w,Vv32.h)
+     V1:0+=vmpyo(V0.w,V0.h)
+# CHECK: 1c20e060 { v1:0 += vmpyo(v0.w,v0.h) }
+
+//   V6_vrounduhub
+//   Vd32.ub=vround(Vu32.uh,Vv32.uh):sat
+     V0.ub=vround(V0.uh,V0.uh):sat
+# CHECK: 1fe0c060 { v0.ub = vround(v0.uh,v0.uh):sat }
+
+//   V6_vrounduwuh
+//   Vd32.uh=vround(Vu32.uw,Vv32.uw):sat
+     V0.uh=vround(V0.uw,V0.uw):sat
+# CHECK: 1fe0c080 { v0.uh = vround(v0.uw,v0.uw):sat }
+
+//   V6_vsatuwuh
+//   Vd32.uh=vsat(Vu32.uw,Vv32.uw)
+     V0.uh=vsat(V0.uw,V0.uw)
+# CHECK: 1f20c0c0 { v0.uh = vsat(v0.uw,v0.uw) }
+
+//   V6_vsubbsat
+//   Vd32.b=vsub(Vu32.b,Vv32.b):sat
+     V0.b=vsub(V0.b,V0.b):sat
+# CHECK: 1f20c040 { v0.b = vsub(v0.b,v0.b):sat }
+
+//   V6_vsubbsat_dv
+//   Vdd32.b=vsub(Vuu32.b,Vvv32.b):sat
+     V1:0.b=vsub(V1:0.b,V1:0.b):sat
+# CHECK: 1ea0c020 { v1:0.b = vsub(v1:0.b,v1:0.b):sat }
+
+//   V6_vsubcarry
+//   Vd32.w=vsub(Vu32.w,Vv32.w,Qx4):carry
+     V0.w=vsub(V0.w,V0.w,Q0):carry
+# CHECK: 1ca0e080 { v0.w = vsub(v0.w,v0.w,q0):carry }
+
+//   V6_vsubububb_sat
+//   Vd32.ub=vsub(Vu32.ub,Vv32.b):sat
+     V0.ub=vsub(V0.ub,V0.b):sat
+# CHECK: 1ea0c0a0 { v0.ub = vsub(v0.ub,v0.b):sat }
+
+//   V6_vsubuwsat
+//   Vd32.uw=vsub(Vu32.uw,Vv32.uw):sat
+     V0.uw=vsub(V0.uw,V0.uw):sat
+# CHECK: 1fc0c080 { v0.uw = vsub(v0.uw,v0.uw):sat }
+
+//   V6_vsubuwsat_dv
+//   Vdd32.uw=vsub(Vuu32.uw,Vvv32.uw):sat
+     V1:0.uw=vsub(V1:0.uw,V1:0.uw):sat
+# CHECK: 1ea0c060 { v1:0.uw = vsub(v1:0.uw,v1:0.uw):sat }
+
+//   V6_vwhist128
+//   vwhist128
+     vwhist128
+# CHECK: 1e00e480 { vwhist128 }
+
+//   V6_vwhist128m
+//   vwhist128(#u1)
+     vwhist128(#01)
+# CHECK: 1e00e780 { vwhist128(#1) }
+
+//   V6_vwhist128q
+//   vwhist128(Qv4)
+     vwhist128(Q0)
+# CHECK: 1e02e480 { vwhist128(q0) }
+
+//   V6_vwhist128qm
+//   vwhist128(Qv4,#u1)
+     vwhist128(Q0,#01)
+# CHECK: 1e02e780 { vwhist128(q0,#1) }
+
+//   V6_vwhist256
+//   vwhist256
+     vwhist256
+# CHECK: 1e00e280 { vwhist256 }
+
+//   V6_vwhist256q
+//   vwhist256(Qv4)
+     vwhist256(Q0)
+# CHECK: 1e02e280 { vwhist256(q0) }
+
+//   V6_vwhist256q_sat
+//   vwhist256(Qv4):sat
+     vwhist256(Q0):sat
+# CHECK: 1e02e380 { vwhist256(q0):sat }
+
+//   V6_vwhist256_sat
+//   vwhist256:sat
+     vwhist256:sat
+# CHECK: 1e00e380 { vwhist256:sat }
diff --git a/test/MC/Hexagon/v62_jumps.s b/test/MC/Hexagon/v62_jumps.s
new file mode 100644
index 0000000000000000000000000000000000000000..0197ecdd23215b89830483524e65b5ce2a855d8a
--- /dev/null
+++ b/test/MC/Hexagon/v62_jumps.s
@@ -0,0 +1,13 @@
+# RUN: llvm-mc -arch=hexagon -mcpu=hexagonv62 -filetype=obj %s | llvm-objdump -arch=hexagon -mcpu=hexagonv62 -d - | FileCheck %s
+
+# Verify that a compound instruction is split into single instructions if needed
+{
+  p0=cmp.eq(R1:0,R3:2)
+  if (!p0.new) jump:nt ltmp
+  r0=r1 ; jump ltmp
+}
+
+# CHECK: 5c204800 { if (!p0.new) jump:nt
+# CHECK: d2804200   p0 = cmp.eq(r1:0,r3:2)
+# CHECK: 58004000   jump
+# CHECK: 7061c000   r0 = r1 }
diff --git a/test/MC/Hexagon/v62a.s b/test/MC/Hexagon/v62a.s
new file mode 100644
index 0000000000000000000000000000000000000000..4cc6302f6fab80ad2ccc8431ac1d868eba1f0c54
--- /dev/null
+++ b/test/MC/Hexagon/v62a.s
@@ -0,0 +1,19 @@
+# RUN: llvm-mc -arch=hexagon -mcpu=hexagonv62 -filetype=obj -o - %s | llvm-objdump -arch=hexagon -arch=hexagon -mcpu=hexagonv62 -d - | FileCheck %s
+
+  r31:30=vabsdiffb(r29:28, r27:26)
+# CHECK: e8fadc1e { r31:30 = vabsdiffb(r29:28,r27:26)
+
+  r25:24=vabsdiffub(r23:22, r21:20)
+# CHECK: e8b4d618 { r25:24 = vabsdiffub(r23:22,r21:20)
+
+  r19:18,p3=vminub(r17:16, r15:14)
+# CHECK: eaeed072 { r19:18,p3 = vminub(r17:16,r15:14)
+
+  r13:12=vtrunehb(r11:10, r9:8)
+# CHECK: c18ac86c { r13:12 = vtrunehb(r11:10,r9:8)
+
+  r7:6=vtrunohb(r5:4, r3:2)
+# CHECK: c184c2a6 { r7:6 = vtrunohb(r5:4,r3:2)
+
+  r1:0=vsplatb(r31)
+# CHECK: 845fc080 { r1:0 = vsplatb(r31)
diff --git a/test/MC/Hexagon/v62a_regs.s b/test/MC/Hexagon/v62a_regs.s
new file mode 100644
index 0000000000000000000000000000000000000000..2d31b837afd4b103f82585a0b449f8796f14efee
--- /dev/null
+++ b/test/MC/Hexagon/v62a_regs.s
@@ -0,0 +1,44 @@
+# RUN: llvm-mc -arch=hexagon -mcpu=hexagonv62 -filetype=obj %s | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-V62
+# RUN: not llvm-mc -arch=hexagon -mcpu=hexagonv60 -filetype=asm %s 2>%t; FileCheck -check-prefix=CHECK-NOV62 %s < %t
+#
+
+# Ensure that the registers added in v62 are accepted for v62 and rejected for v60
+
+r0=framelimit
+r0=framekey
+r1:0=c17:16
+
+# CHECK-V62:  6a10c000 { r0 = framelimit }
+# CHECK-V62:  6a11c000 { r0 = framekey }
+# CHECK-V62:  6810c000 { r1:0 = c17:16 }
+# CHECK-NOV62: rror: invalid operand for instruction
+# CHECK-NOV62: rror: invalid operand for instruction
+# CHECK-NOV62: rror: invalid operand for instruction
+
+r0=pktcountlo
+r0=pktcounthi
+r1:0=c19:18
+r1:0=pktcount
+
+# CHECK-V62:  6a12c000 { r0 = pktcountlo }
+# CHECK-V62:  6a13c000 { r0 = pktcounthi }
+# CHECK-V62:  6812c000 { r1:0 = c19:18 }
+# CHECK-V62:  6812c000 { r1:0 = c19:18 }
+# CHECK-NOV62: rror: invalid operand for instruction
+# CHECK-NOV62: rror: invalid operand for instruction
+# CHECK-NOV62: rror: invalid operand for instruction
+# CHECK-NOV62: rror: invalid operand for instruction
+
+r0=utimerlo
+r0=utimerhi
+r1:0=c31:30
+r1:0=UTIMER
+
+# CHECK-V62:  6a1ec000 { r0 = utimerlo }
+# CHECK-V62:  6a1fc000 { r0 = utimerhi }
+# CHECK-V62:  681ec000 { r1:0 = c31:30 }
+# CHECK-V62:  681ec000 { r1:0 = c31:30 }
+# CHECK-NOV62: rror: invalid operand for instruction
+# CHECK-NOV62: rror: invalid operand for instruction
+# CHECK-NOV62: rror: invalid operand for instruction
+# CHECK-NOV62: rror: invalid operand for instruction
diff --git a/test/MC/MachO/ARM/no-tls-assert.ll b/test/MC/MachO/ARM/no-tls-assert.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3466d4a5a5fffe9692e36a3d93091b86906cd595
--- /dev/null
+++ b/test/MC/MachO/ARM/no-tls-assert.ll
@@ -0,0 +1,28 @@
+; RUN: llc -filetype=obj -o - %s | llvm-objdump -section-headers - | FileCheck %s
+; This should not trigger the "Creating regular section after DWARF" assert.
+; CHECK: __text
+; CHECK: __thread_ptr  00000004
+target triple = "thumbv7-apple-ios9.0.0"
+
+@b = external thread_local global i32
+define i32* @func(i32 %a) !dbg !9 {
+  ret i32* @b
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5, !6, !7}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug)
+!1 = !DIFile(filename: "r.ii", directory: "/")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"wchar_size", i32 4}
+!6 = !{i32 1, !"min_enum_size", i32 4}
+!7 = !{i32 1, !"PIC Level", i32 2}
+!9 = distinct !DISubprogram(name: "func", scope: !1, file: !1, line: 4, type: !10, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!10 = !DISubroutineType(types: !11)
+!11 = !{null, !12}
+!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!13 = !DILocalVariable(name: "a", arg: 1, scope: !9, file: !1, line: 4, type: !12)
+!14 = !DIExpression()
diff --git a/test/MC/Mips/bopt-directive.s b/test/MC/Mips/bopt-directive.s
new file mode 100644
index 0000000000000000000000000000000000000000..63e2a05281a83cdefd8f9fdfc5c35bcb270e83bd
--- /dev/null
+++ b/test/MC/Mips/bopt-directive.s
@@ -0,0 +1,16 @@
+# RUN: llvm-mc -arch=mips -mcpu=mips32 %s 2>&1 | FileCheck %s
+
+# We don't support the bopt option in the integrated assembler. Given its
+# single-pass nature, it would be quite difficult to implement currently.
+
+# Ensure we parse the bopt & nobopt directives and warn in the bopt case.
+
+# CHECK: warning: 'bopt' feature is unsupported
+# CHECK: nop
+.text
+f:
+.set bopt
+g:
+.set nobopt
+nop
+
diff --git a/test/MC/Mips/branch-pseudos-bad.s b/test/MC/Mips/branch-pseudos-bad.s
index 3a0193b2e94bfcd20c2b602d6fc3eda801c37791..f2fa74fdcee0a19bbf192437cc98273c558518e5 100644
--- a/test/MC/Mips/branch-pseudos-bad.s
+++ b/test/MC/Mips/branch-pseudos-bad.s
@@ -20,6 +20,10 @@ local_label:
   bgtu $7, $8, local_label
 # CHECK: :[[@LINE-1]]:3: error: pseudo-instruction requires $at, which is not available
 
+  beql $7, 256, local_label
+# CHECK: :[[@LINE-1]]:3: error: pseudo-instruction requires $at, which is not available
+  bnel $7, 256, local_label
+# CHECK: :[[@LINE-1]]:3: error: pseudo-instruction requires $at, which is not available
   bltl $7, $8, local_label
 # CHECK: :[[@LINE-1]]:3: error: pseudo-instruction requires $at, which is not available
   bltul $7, $8, local_label
diff --git a/test/MC/Mips/elf-debug-section.s b/test/MC/Mips/elf-debug-section.s
new file mode 100644
index 0000000000000000000000000000000000000000..6cc901bcb59edd78aa77b47e072993cd978d5e86
--- /dev/null
+++ b/test/MC/Mips/elf-debug-section.s
@@ -0,0 +1,6 @@
+# RUN: llvm-mc -filetype=obj -triple=mips-linux-gnu -g %s -o - \
+# RUN:   | llvm-readobj -s | FileCheck %s
+
+# CHECK:      Section {
+# CHECK:        Name: .debug_line
+# CHECK-NEXT:   Type: SHT_MIPS_DWARF (0x7000001E)
diff --git a/test/MC/Mips/end-directive.s b/test/MC/Mips/end-directive.s
new file mode 100644
index 0000000000000000000000000000000000000000..b2959409e661edd34b1cb69cc174f2080cfc8c4d
--- /dev/null
+++ b/test/MC/Mips/end-directive.s
@@ -0,0 +1,22 @@
+# RUN: llvm-mc -arch=mips -mcpu=mips32 -filetype=obj %s -o - | \
+# RUN:   llvm-readobj -symbols | FileCheck %s
+
+# Check that the assembler doesn't choke on .align between a symbol and the
+# .end directive.
+
+	.text
+	.globl	a
+	.p2align	2
+	.type	a,@function
+	.ent	a
+a:
+	addu	$2, $5, $4
+	.align 4
+	jr	$ra
+	.end	a
+$func_end0:
+	.size	a, ($func_end0)-a
+
+# CHECK: Name: a
+# CHECK-NEXT: Value: 0x0
+# CHECK-NEXT: Size: 24
diff --git a/test/MC/Mips/expansion-j-sym-pic.s b/test/MC/Mips/expansion-j-sym-pic.s
index b22d60ca4f2fa989e59a5e1feaab127d4af068f1..3c0f5ea7c4d61b228b2aab9aa3a487e1c32205d1 100644
--- a/test/MC/Mips/expansion-j-sym-pic.s
+++ b/test/MC/Mips/expansion-j-sym-pic.s
@@ -87,11 +87,11 @@ local_label:
 # MICRO:  b      .text            # encoding: [0x94,0x00,A,A]
 # MICRO:                          #   fixup A - offset: 0, value: .text, kind: fixup_MICROMIPS_PC16_S1
 
-# ELF-O32:      10 00 ff ff     b       0
-# ELF-O32-NEXT:         00000018:  R_MIPS_PC16  .text
+# ELF-O32:      10 00 ff f9 	b	-24 <local_label>
+# ELF-O32-NEXT: 00 00 00 00 	nop
 
-# ELF-NXX:      10 00 00 00     b       4
-# ELF-NXX-NEXT:                 R_MIPS_PC16/R_MIPS_NONE/R_MIPS_NONE  .text
+# ELF-NXX:      10 00 ff f9 	b	-24 <local_label>
+# ELF-NXX-NEXT: 00 00 00 00 	nop
 
   j 1f
   nop
diff --git a/test/MC/Mips/expansion-jal-sym-pic.s b/test/MC/Mips/expansion-jal-sym-pic.s
index d188101d66e2aeb67375816e477f6c628f4783fa..116d1eb15b34a0a9d95c0e6b3b4c7416e6c01bb4 100644
--- a/test/MC/Mips/expansion-jal-sym-pic.s
+++ b/test/MC/Mips/expansion-jal-sym-pic.s
@@ -151,37 +151,35 @@ local_label:
   jal .text
   nop
 
-# FIXME: The .text section MCSymbol isn't created when printing assembly. However,
-# it is created when generating an ELF object file.
 # Expanding "jal .text":
-# O32-FIXME: lw    $25, %call16(.text)($gp)        # encoding: [0x8f,0x99,A,A]
-# O32-FIXME:                                       #   fixup A - offset: 0, value: %got(.text), kind: fixup_Mips_GOT_CALL
+# O32: lw	$25, %got(.text)($gp)   # encoding: [0x8f,0x99,A,A]
+# O32-NEXT:                                       #   fixup A - offset: 0, value: %got(.text), kind: fixup_Mips_GOT
 
 # ELF-O32:      8f 99 00 00 lw $25, 0($gp)
-# ELF-O32-NEXT:                 R_MIPS_CALL16 .text
+# ELF-O32-NEXT:                 R_MIPS_GOT16 .text
 
-# N32-FIXME: lw  $25, %call16(.text)($gp)          # encoding: [0x8f,0x99,A,A]
-# N32-FIXME:                                       #   fixup A - offset: 0, value: %call16(.text), kind: fixup_Mips_GOT_DISP
+# N32: lw	$25, %got_disp(.text)($gp) # encoding: [0x8f,0x99,A,A]
+# N32-NEXT:                                       #   fixup A - offset: 0, value: %got_disp(.text), kind: fixup_Mips_GOT_DISP
 
 # ELF-N32:      8f 99 00 00 lw $25, 0($gp)
-# ELF-N32-NEXT:                 R_MIPS_CALL16/R_MIPS_NONE/R_MIPS_NONE .text
+# ELF-N32-NEXT:                 R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE	.text
 
-# N64-FIXME: ld  $25, %call16(.text)($gp)          # encoding: [0xdf,0x99,A,A]
-# N64-FIXME:                                       #   fixup A - offset: 0, value: %call16(.text), kind: fixup_Mips_GOT_DISP
+# N64: ld	$25, %got_disp(.text)($gp) # encoding: [0xdf,0x99,A,A]
+# N64-NEXT:                                       #   fixup A - offset: 0, value: %got_disp(.text), kind: fixup_Mips_GOT_DISP
 
 # ELF-N64:      df 99 00 00 ld $25, 0($gp)
-# ELF-N64-NEXT:                 R_MIPS_CALL16/R_MIPS_NONE/R_MIPS_NONE .text
+# ELF-N64-NEXT:                 R_MIPS_GOT_DISP/R_MIPS_NONE/R_MIPS_NONE	.text
 
-# O32-MM-FIXME: lw    $25, %got(.text)($gp)      # encoding: [0xff,0x3c,A,A]
-# O32-MM-FIXME:                                  #   fixup A - offset: 0, value: %got(.text), kind: fixup_MICROMIPS_GOT16
-# O32-MM-FIXME: addiu $25, $25, %lo(.text)       # encoding: [0x33,0x39,A,A]
-# O32-MM-FIXME:                                  #   fixup A - offset: 0, value: %lo(.text), kind: fixup_MICROMIPS_LO16
+# O32-MM: lw    $25, %got(.text)($gp)      # encoding: [0xff,0x3c,A,A]
+# O32-MM-NEXT:                                  #   fixup A - offset: 0, value: %got(.text), kind: fixup_MICROMIPS_GOT16
+# O32-MM-NEXT: addiu $25, $25, %lo(.text)       # encoding: [0x33,0x39,A,A]
+# O32-MM-NEXT:                                  #   fixup A - offset: 0, value: %lo(.text), kind: fixup_MICROMIPS_LO16
 
-# N32-MM-FIXME: lw    $25, %got_disp(.text)($gp) # encoding: [0xff,0x3c,A,A]
-# N32-MM-FIXME:                                  #   fixup A - offset: 0, value: %got_disp(.text), kind: fixup_MICROMIPS_GOT_DISP
+# N32-MM: lw    $25, %got_disp(.text)($gp) # encoding: [0xff,0x3c,A,A]
+# N32-MM-NEXT:                                  #   fixup A - offset: 0, value: %got_disp(.text), kind: fixup_MICROMIPS_GOT_DISP
 
-# N64-MM-FIXME: ld    $25, %got_disp(.text)($gp) # encoding: [0xdf,0x99,A,A]
-# N64-MM-FIXME:                                  #   fixup A - offset: 0, value: %got_disp(.text), kind: fixup_MICROMIPS_GOT_DISP
+# N64-MM: ld    $25, %got_disp(.text)($gp) # encoding: [0xdf,0x99,A,A]
+# N64-MM-NEXT:                                  #   fixup A - offset: 0, value: %got_disp(.text), kind: fixup_MICROMIPS_GOT_DISP
 
 # MIPS: jalr $25      # encoding: [0x03,0x20,0xf8,0x09]
 # MM:   jalr $ra, $25 # encoding: [0x03,0xf9,0x0f,0x3c]
@@ -199,7 +197,7 @@ local_label:
 
 # ELF-O32:      8f 99 00 00 lw $25, 0($gp)
 # ELF-O32-NEXT:                 R_MIPS_GOT16 .text
-# ELF-O32-NEXT: 27 39 00 54 addiu $25, $25, 84
+# ELF-O32-NEXT: 27 39 00 58 	addiu	$25, $25, 88
 # ELF-O32-NEXT:                 R_MIPS_LO16 .text
 
 # N32: lw  $25, %got_disp($tmp0)($gp) # encoding: [0x8f,0x99,A,A]
@@ -241,7 +239,7 @@ local_label:
 
 # ELF-O32:      8f 99 00 00 lw $25, 0($gp)
 # ELF-O32-NEXT:                 R_MIPS_GOT16 .text
-# ELF-O32-NEXT: 27 39 00 60 addiu $25, $25, 96
+# ELF-O32-NEXT: 27 39 00 64 	addiu	$25, $25, 100
 # ELF-O32-NEXT:                 R_MIPS_LO16 .text
 
 # N32-FIXME: lw  $25, %got_disp(forward_local)($gp)            # encoding: [0x8f,0x99,A,A]
diff --git a/test/MC/Mips/instalias-imm-expanding.s b/test/MC/Mips/instalias-imm-expanding.s
index 9759dabdc0875268644facfe93da2bab6d1007bb..b26863169f0045906bd41cae54973da93686dee4 100644
--- a/test/MC/Mips/instalias-imm-expanding.s
+++ b/test/MC/Mips/instalias-imm-expanding.s
@@ -23,6 +23,10 @@ text_label:
 # CHECK: add    $4, $4, $1              # encoding: [0x20,0x20,0x81,0x00]
   add $4, 0xFFFFFFFF
 # CHECK: addi   $4, $4, -1              # encoding: [0xff,0xff,0x84,0x20]
+  add $5, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK: lui    $1, 255                 # encoding: [0xff,0x00,0x01,0x3c]
+# CHECK: ori    $1, $1, 65295           # encoding: [0x0f,0xff,0x21,0x34]
+# CHECK: add    $5, $5, $1              # encoding: [0x20,0x28,0xa1,0x00]
 
   add $4, $5, -0x80000000
 # CHECK: lui    $4, 32768               # encoding: [0x00,0x80,0x04,0x3c]
@@ -43,6 +47,10 @@ text_label:
 # CHECK: add    $4, $4, $5              # encoding: [0x20,0x20,0x85,0x00]
   add $4, $5, 0xFFFFFFFF
 # CHECK: addi   $4, $5, -1              # encoding: [0xff,0xff,0xa4,0x20]
+  add $4, $5, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK: lui    $4, 255                 # encoding: [0xff,0x00,0x04,0x3c]
+# CHECK: ori    $4, $4, 65295           # encoding: [0x0f,0xff,0x84,0x34]
+# CHECK: add    $4, $4, $5              # encoding: [0x20,0x20,0x85,0x00]
 
   addu $4, -0x80000000
 # CHECK: lui    $1, 32768               # encoding: [0x00,0x80,0x01,0x3c]
@@ -63,6 +71,10 @@ text_label:
 # CHECK: addu   $4, $4, $1              # encoding: [0x21,0x20,0x81,0x00]
   addu $4, 0xFFFFFFFF
 # CHECK: addiu  $4, $4, -1              # encoding: [0xff,0xff,0x84,0x24]
+  addu $5, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK: lui    $1, 255                 # encoding: [0xff,0x00,0x01,0x3c]
+# CHECK: ori    $1, $1, 65295           # encoding: [0x0f,0xff,0x21,0x34]
+# CHECK: addu    $5, $5, $1              # encoding: [0x21,0x28,0xa1,0x00]
 
   addu $4, $5, -0x80000000
 # CHECK: lui    $4, 32768               # encoding: [0x00,0x80,0x04,0x3c]
@@ -83,6 +95,10 @@ text_label:
 # CHECK: addu   $4, $4, $5              # encoding: [0x21,0x20,0x85,0x00]
   addu $4, $5, 0xFFFFFFFF
 # CHECK: addiu  $4, $5, -1              # encoding: [0xff,0xff,0xa4,0x24]
+  addu $4, $5, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK: lui    $4, 255                 # encoding: [0xff,0x00,0x04,0x3c]
+# CHECK: ori    $4, $4, 65295           # encoding: [0x0f,0xff,0x84,0x34]
+# CHECK: addu    $4, $4, $5              # encoding: [0x21,0x20,0x85,0x00]
 
   and $4, -0x80000000
 # CHECK: lui    $1, 32768               # encoding: [0x00,0x80,0x01,0x3c]
@@ -103,6 +119,10 @@ text_label:
   and $4, 0xFFFFFFFF
 # CHECK: addiu  $1, $zero, -1           # encoding: [0xff,0xff,0x01,0x24]
 # CHECK: and    $4, $4, $1              # encoding: [0x24,0x20,0x81,0x00]
+  and $5, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK: lui    $1, 255                 # encoding: [0xff,0x00,0x01,0x3c]
+# CHECK: ori    $1, $1, 65295           # encoding: [0x0f,0xff,0x21,0x34]
+# CHECK: and    $5, $5, $1              # encoding: [0x24,0x28,0xa1,0x00]
 
   and $4, $5, -0x80000000
 # CHECK: lui    $4, 32768               # encoding: [0x00,0x80,0x04,0x3c]
@@ -123,6 +143,10 @@ text_label:
 # CHECK: and    $4, $4, $5              # encoding: [0x24,0x20,0x85,0x00]
   and $4, $5, 0xFFFFFFFF
 # CHECK: addiu  $4, $zero, -1           # encoding: [0xff,0xff,0x04,0x24]
+# CHECK: and    $4, $4, $5              # encoding: [0x24,0x20,0x85,0x00]
+  and $4, $5, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK: lui    $4, 255                 # encoding: [0xff,0x00,0x04,0x3c]
+# CHECK: ori    $4, $4, 65295           # encoding: [0x0f,0xff,0x84,0x34]
 # CHECK: and    $4, $4, $5              # encoding: [0x24,0x20,0x85,0x00]
 
   nor $4, $5, 0
@@ -144,6 +168,34 @@ text_label:
 # CHECK: lui    $4, 1                   # encoding: [0x01,0x00,0x04,0x3c]
 # CHECK: ori    $4, $4, 42405           # encoding: [0xa5,0xa5,0x84,0x34]
 # CHECK: nor    $4, $4, $5              # encoding: [0x27,0x20,0x85,0x00]
+  nor $4, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK: lui    $1, 255                 # encoding: [0xff,0x00,0x01,0x3c]
+# CHECK: ori    $1, $1, 65295           # encoding: [0x0f,0xff,0x21,0x34]
+# CHECK: nor    $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+
+  nor $4, 0
+# CHECK: addiu  $1, $zero, 0            # encoding: [0x00,0x00,0x01,0x24]
+# CHECK: nor    $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+  nor $4, 1
+# CHECK: addiu  $1, $zero, 1            # encoding: [0x01,0x00,0x01,0x24]
+# CHECK: nor    $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+  nor $4, 0x8000
+# CHECK: ori    $1, $zero, 32768        # encoding: [0x00,0x80,0x01,0x34]
+# CHECK: nor    $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+  nor $4, -0x8000
+# CHECK: addiu  $1, $zero, -32768       # encoding: [0x00,0x80,0x01,0x24]
+# CHECK: nor    $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+  nor $4, 0x10000
+# CHECK: lui    $1, 1                   # encoding: [0x01,0x00,0x01,0x3c]
+# CHECK: nor    $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+  nor $4, 0x1a5a5
+# CHECK: lui    $1, 1                   # encoding: [0x01,0x00,0x01,0x3c]
+# CHECK: ori    $1, $1, 42405           # encoding: [0xa5,0xa5,0x21,0x34]
+# CHECK: nor    $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+  nor $4, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK: lui    $1, 255                 # encoding: [0xff,0x00,0x01,0x3c]
+# CHECK: ori    $1, $1, 65295           # encoding: [0x0f,0xff,0x21,0x34]
+# CHECK: nor    $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
 
   or $4, -0x80000000
 # CHECK: lui    $1, 32768               # encoding: [0x00,0x80,0x01,0x3c]
@@ -165,6 +217,10 @@ text_label:
   or $4, 0xFFFFFFFF
 # CHECK: addiu  $1, $zero, -1           # encoding: [0xff,0xff,0x01,0x24]
 # CHECK: or     $4, $4, $1              # encoding: [0x25,0x20,0x81,0x00]
+  or $5, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK: lui    $1, 255                 # encoding: [0xff,0x00,0x01,0x3c]
+# CHECK: ori    $1, $1, 65295           # encoding: [0x0f,0xff,0x21,0x34]
+# CHECK: or     $5, $5, $1              # encoding: [0x25,0x28,0xa1,0x00]
 
   or $4, $5, -0x80000000
 # CHECK: lui    $4, 32768               # encoding: [0x00,0x80,0x04,0x3c]
@@ -185,6 +241,10 @@ text_label:
 # CHECK: or     $4, $4, $5              # encoding: [0x25,0x20,0x85,0x00]
   or $4, $5, 0xFFFFFFFF
 # CHECK: addiu  $4, $zero, -1           # encoding: [0xff,0xff,0x04,0x24]
+# CHECK: or     $4, $4, $5              # encoding: [0x25,0x20,0x85,0x00]
+  or $4, $5, ~(0xF0000000|0x0F000000|0x000000F0)
+# CHECK: lui    $4, 255                 # encoding: [0xff,0x00,0x04,0x3c]
+# CHECK: ori    $4, $4, 65295           # encoding: [0x0f,0xff,0x84,0x34]
 # CHECK: or     $4, $4, $5              # encoding: [0x25,0x20,0x85,0x00]
 
   slt $4, $5, -0x80000000
@@ -205,6 +265,10 @@ text_label:
 # CHECK: slt    $4, $4, $5              # encoding: [0x2a,0x20,0x85,0x00]
   slt $4, $5, 0xFFFFFFFF
 # CHECK: slti   $4, $5, -1              # encoding: [0xff,0xff,0xa4,0x28]
+  slt $4, $5, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK: lui    $4, 255                 # encoding: [0xff,0x00,0x04,0x3c]
+# CHECK: ori    $4, $4, 65295           # encoding: [0x0f,0xff,0x84,0x34]
+# CHECK: slt    $4, $4, $5              # encoding: [0x2a,0x20,0x85,0x00]
 
   sltu $4, $5, -0x80000000
 # CHECK: lui    $4, 32768               # encoding: [0x00,0x80,0x04,0x3c]
@@ -224,6 +288,10 @@ text_label:
 # CHECK: sltu   $4, $4, $5              # encoding: [0x2b,0x20,0x85,0x00]
   sltu $4, $5, 0xFFFFFFFF
 # CHECK: sltiu  $4, $5, -1              # encoding: [0xff,0xff,0xa4,0x2c]
+  sltu $4, $5, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK: lui    $4, 255                 # encoding: [0xff,0x00,0x04,0x3c]
+# CHECK: ori    $4, $4, 65295           # encoding: [0x0f,0xff,0x84,0x34]
+# CHECK: sltu   $4, $4, $5              # encoding: [0x2b,0x20,0x85,0x00]
 
   xor $4, -0x80000000
 # CHECK: lui    $1, 32768               # encoding: [0x00,0x80,0x01,0x3c]
@@ -243,6 +311,10 @@ text_label:
 # CHECK: xor    $4, $4, $1              # encoding: [0x26,0x20,0x81,0x00]
   xor $4, 0xFFFFFFFF
 # CHECK: addiu  $1, $zero, -1           # encoding: [0xff,0xff,0x01,0x24]
+# CHECK: xor    $4, $4, $1              # encoding: [0x26,0x20,0x81,0x00]
+  xor $4, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK: lui    $1, 255                 # encoding: [0xff,0x00,0x01,0x3c]
+# CHECK: ori    $1, $1, 65295           # encoding: [0x0f,0xff,0x21,0x34]
 # CHECK: xor    $4, $4, $1              # encoding: [0x26,0x20,0x81,0x00]
 
   xor $4, $5, -0x80000000
@@ -254,7 +326,7 @@ text_label:
 # CHECK: xor    $4, $4, $5              # encoding: [0x26,0x20,0x85,0x00]
   xor $4, $5, -0x8000
 # CHECK: addiu  $4, $zero, -32768       # encoding: [0x00,0x80,0x04,0x24]
-# CHECK: xor $4, $4, $5                 # encoding: [0x26,0x20,0x85,0x00]
+# CHECK: xor    $4, $4, $5              # encoding: [0x26,0x20,0x85,0x00]
   xor $4, $5, 0
 # CHECK: xori   $4, $5, 0               # encoding: [0x00,0x00,0xa4,0x38]
   xor $4, $5, 0xFFFF
@@ -264,4 +336,8 @@ text_label:
 # CHECK: xor    $4, $4, $5              # encoding: [0x26,0x20,0x85,0x00]
   xor $4, $5, 0xFFFFFFFF
 # CHECK: addiu  $4, $zero, -1           # encoding: [0xff,0xff,0x04,0x24]
+# CHECK: xor    $4, $4, $5              # encoding: [0x26,0x20,0x85,0x00]
+  xor $4, $5, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK: lui    $4, 255                 # encoding: [0xff,0x00,0x04,0x3c]
+# CHECK: ori    $4, $4, 65295           # encoding: [0x0f,0xff,0x84,0x34]
 # CHECK: xor    $4, $4, $5              # encoding: [0x26,0x20,0x85,0x00]
diff --git a/test/MC/Mips/macro-bcc-imm.s b/test/MC/Mips/macro-bcc-imm.s
index fbc4662d6833a23ad8b01a235741a880833fe360..ebc4cd2ce18987420b33ad911e1bc642a8d3371c 100644
--- a/test/MC/Mips/macro-bcc-imm.s
+++ b/test/MC/Mips/macro-bcc-imm.s
@@ -2,7 +2,45 @@
 # RUN:     FileCheck %s --check-prefix=ALL
 
     .text
-foo:                      # ALL-LABEL: foo:
+foo:
+    beql $a2, 0x1ffff, foo # ALL: lui $1, 1
+                           # ALL: ori $1, $1, 65535
+                           # ALL: beql  $6, $1, foo
+                           # ALL:  #   fixup A - offset: 0, value: foo-4, kind: fixup_Mips_PC16
+                           # ALL: nop
+    beql $a2, -4096, foo   # ALL: addiu $1, $zero, -4096
+                           # ALL: beql  $6, $1, foo
+                           # ALL: #   fixup A - offset: 0, value: foo-4, kind: fixup_Mips_PC16
+    beql $a2, -0x10000, foo # ALL: lui $1, 65535
+                            # ALL: beql  $6, $1, foo
+                            # ALL: # fixup A - offset: 0, value: foo-4, kind: fixup_Mips_PC16
+    beql $a2, 16, foo     # ALL: addiu   $1, $zero, 16
+                          # ALL: beql    $6, $1, foo
+                          # ALL:  # fixup A - offset: 0, value: foo-4, kind: fixup_Mips_PC16
+                          # ALL: nop
+    bnel $a2, 0x1ffff, foo # ALL: lui $1, 1
+                           # ALL: ori $1, $1, 65535
+                           # ALL: bnel  $6, $1, foo
+                           # ALL:  #   fixup A - offset: 0, value: foo-4, kind: fixup_Mips_PC16
+                           # ALL: nop
+    bnel $a2, -4096, foo   # ALL: addiu $1, $zero, -4096
+                           # ALL: bnel  $6, $1, foo
+                           # ALL: #   fixup A - offset: 0, value: foo-4, kind: fixup_Mips_PC16
+    bnel $a2, -0x10000, foo # ALL: lui $1, 65535
+                            # ALL: bnel  $6, $1, foo
+                            # ALL: # fixup A - offset: 0, value: foo-4, kind: fixup_Mips_PC16
+    bnel $a2, 16, foo     # ALL: addiu   $1, $zero, 16
+                          # ALL: bnel    $6, $1, foo
+                          # ALL: #   fixup A - offset: 0, value: foo-4, kind: fixup_Mips_PC16
+                          # ALL: nop
+    beql $a2, 32767, foo  # ALL: addiu   $1, $zero, 32767
+                          # ALL: beql    $6, $1, foo
+                          # ALL: #   fixup A - offset: 0, value: foo-4, kind: fixup_Mips_PC16
+                          # ALL: nop
+    bnel $a2, 32768, foo  # ALL: ori     $1, $zero, 32768
+                          # ALL: bnel    $6, $1, foo
+                          # ALL: #   fixup A - offset: 0, value: foo-4, kind: fixup_Mips_PC16
+                          # ALL: nop
     blt $a2, 16, foo      # ALL: addiu $1, $zero, 16
                           # ALL: slt   $1, $6, $1
                           # ALL: bnez  $1, foo
diff --git a/test/MC/Mips/macro-ddiv.s b/test/MC/Mips/macro-ddiv.s
index d36e6998d603799b4e7acf9d556513405f85b0a4..44650d7fdba3bcb07f9de8af700c37505bf569a3 100644
--- a/test/MC/Mips/macro-ddiv.s
+++ b/test/MC/Mips/macro-ddiv.s
@@ -1,126 +1,354 @@
-# RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips64r2 | \
+# RUN: llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r2 | \
 # RUN:   FileCheck %s --check-prefix=CHECK-NOTRAP
-# RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips64r2 \
+# RUN: llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r2 \
 # RUN:  -mattr=+use-tcc-in-div | FileCheck %s --check-prefix=CHECK-TRAP
 
-  ddiv $25, $11
-# CHECK-NOTRAP: bne $11, $zero, 8         # encoding: [0x15,0x60,0x00,0x02]
+  ddiv $25,$11
+# CHECK-NOTRAP: bne $11, $zero, .Ltmp0    # encoding: [0x15,0x60,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: .Ltmp0-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: ddiv $zero, $25, $11      # encoding: [0x03,0x2b,0x00,0x1e]
 # CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-NOTRAP: .Ltmp0:
 # CHECK-NOTRAP: addiu $1, $zero, -1       # encoding: [0x24,0x01,0xff,0xff]
-# CHECK-NOTRAP: bne $11, $1, 20           # encoding: [0x15,0x61,0x00,0x05]
+# CHECK-NOTRAP: bne $11, $1, .Ltmp1       # encoding: [0x15,0x61,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: .Ltmp1-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: addiu $1, $zero, 1        # encoding: [0x24,0x01,0x00,0x01]
 # CHECK-NOTRAP: dsll32 $1, $1, 31         # encoding: [0x00,0x01,0x0f,0xfc]
-# CHECK-NOTRAP: bne $25, $1, 8            # encoding: [0x17,0x21,0x00,0x02]
+# CHECK-NOTRAP: bne $25, $1, .Ltmp1       # encoding: [0x17,0x21,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: .Ltmp1-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: sll $zero, $zero, 0       # encoding: [0x00,0x00,0x00,0x00]
 # CHECK-NOTRAP: break 6                   # encoding: [0x00,0x06,0x00,0x0d]
+# CHECK-NOTRAP: .Ltmp1:
 # CHECK-NOTRAP: mflo $25                  # encoding: [0x00,0x00,0xc8,0x12]
 
+# CHECK-TRAP: teq $11, $zero, 7           # encoding: [0x01,0x60,0x01,0xf4]
+# CHECK-TRAP: ddiv $zero, $25, $11        # encoding: [0x03,0x2b,0x00,0x1e]
+# CHECK-TRAP: addiu $1, $zero, -1         # encoding: [0x24,0x01,0xff,0xff]
+# CHECK-TRAP: bne $11, $1, .Ltmp0         # encoding: [0x15,0x61,A,A]
+# CHECK-TRAP:                             # fixup A - offset: 0, value: .Ltmp0-4, kind: fixup_Mips_PC16
+# CHECK-TRAP: addiu $1, $zero, 1          # encoding: [0x24,0x01,0x00,0x01]
+# CHECK-TRAP: dsll32 $1, $1, 31           # encoding: [0x00,0x01,0x0f,0xfc]
+# CHECK-TRAP: teq $25, $1, 6              # encoding: [0x03,0x21,0x01,0xb4]
+# CHECK-TRAP: .Ltmp0:
+# CHECK-TRAP: mflo $25                    # encoding: [0x00,0x00,0xc8,0x12]
+
   ddiv $24,$12
-# CHECK-NOTRAP: bne $12, $zero, 8         # encoding: [0x15,0x80,0x00,0x02]
+# CHECK-NOTRAP: bne $12, $zero, .Ltmp2    # encoding: [0x15,0x80,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: .Ltmp2-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: ddiv $zero, $24, $12      # encoding: [0x03,0x0c,0x00,0x1e]
 # CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-NOTRAP: .Ltmp2:
 # CHECK-NOTRAP: addiu $1, $zero, -1       # encoding: [0x24,0x01,0xff,0xff]
-# CHECK-NOTRAP: bne $12, $1, 20           # encoding: [0x15,0x81,0x00,0x05]
+# CHECK-NOTRAP: bne $12, $1, .Ltmp3       # encoding: [0x15,0x81,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: .Ltmp3-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: addiu $1, $zero, 1        # encoding: [0x24,0x01,0x00,0x01]
 # CHECK-NOTRAP: dsll32 $1, $1, 31         # encoding: [0x00,0x01,0x0f,0xfc]
-# CHECK-NOTRAP: bne $24, $1, 8            # encoding: [0x17,0x01,0x00,0x02]
+# CHECK-NOTRAP: bne $24, $1, .Ltmp3       # encoding: [0x17,0x01,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: .Ltmp3-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: sll $zero, $zero, 0       # encoding: [0x00,0x00,0x00,0x00]
 # CHECK-NOTRAP: break 6                   # encoding: [0x00,0x06,0x00,0x0d]
+# CHECK-NOTRAP: .Ltmp3:
 # CHECK-NOTRAP: mflo $24                  # encoding: [0x00,0x00,0xc0,0x12]
 
-  ddiv $25,$0
-# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
-
-  ddiv $0,$9
-# CHECK-NOTRAP: bne $9, $zero, 8          # encoding: [0x15,0x20,0x00,0x02]
-# CHECK-NOTRAP: ddiv $zero, $zero, $9     # encoding: [0x00,0x09,0x00,0x1e]
-# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
-# CHECK-NOTRAP: addiu $1, $zero, -1       # encoding: [0x24,0x01,0xff,0xff]
-# CHECK-NOTRAP: bne $9, $1, 20            # encoding: [0x15,0x21,0x00,0x05]
-# CHECK-NOTRAP: addiu $1, $zero, 1        # encoding: [0x24,0x01,0x00,0x01]
-# CHECK-NOTRAP: dsll32 $1, $1, 31         # encoding: [0x00,0x01,0x0f,0xfc]
-# CHECK-NOTRAP: bne $zero, $1, 8          # encoding: [0x14,0x01,0x00,0x02]
-# CHECK-NOTRAP: sll $zero, $zero, 0       # encoding: [0x00,0x00,0x00,0x00]
-# CHECK-NOTRAP: break 6                   # encoding: [0x00,0x06,0x00,0x0d]
-# CHECK-NOTRAP: mflo $zero                # encoding: [0x00,0x00,0x00,0x12]
-
-  ddiv $0,$0
-# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
-
-  ddiv  $4,$5,$6
-# CHECK-NOTRAP: bne $6, $zero, 8          # encoding: [0x14,0xc0,0x00,0x02]
-# CHECK-NOTRAP: ddiv $zero, $5, $6        # encoding: [0x00,0xa6,0x00,0x1e]
-# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
-# CHECK-NOTRAP: addiu $1, $zero, -1       # encoding: [0x24,0x01,0xff,0xff]
-# CHECK-NOTRAP: bne $6, $1, 20            # encoding: [0x14,0xc1,0x00,0x05]
-# CHECK-NOTRAP: addiu $1, $zero, 1        # encoding: [0x24,0x01,0x00,0x01]
-# CHECK-NOTRAP: dsll32 $1, $1, 31         # encoding: [0x00,0x01,0x0f,0xfc]
-# CHECK-NOTRAP: bne $5, $1, 8             # encoding: [0x14,0xa1,0x00,0x02]
-# CHECK-NOTRAP: sll $zero, $zero, 0       # encoding: [0x00,0x00,0x00,0x00]
-# CHECK-NOTRAP: break 6                   # encoding: [0x00,0x06,0x00,0x0d]
-# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
-
-  ddiv  $4,$5,$0
-# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
-
-  ddiv  $4,$0,$0
-# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
-
-  ddiv $0, $4, $5
-# CHECK-NOTRAP: ddiv $zero, $4, $5        # encoding: [0x00,0x85,0x00,0x1e]
-
-  ddiv $25,$11
-# CHECK-TRAP: teq $11, $zero, 7           # encoding: [0x01,0x60,0x01,0xf4]
-# CHECK-TRAP: ddiv $zero, $25, $11        # encoding: [0x03,0x2b,0x00,0x1e]
-# CHECK-TRAP: addiu $1, $zero, -1         # encoding: [0x24,0x01,0xff,0xff]
-# CHECK-TRAP: bne $11, $1, 12             # encoding: [0x15,0x61,0x00,0x03]
-# CHECK-TRAP: addiu $1, $zero, 1          # encoding: [0x24,0x01,0x00,0x01]
-# CHECK-TRAP: dsll32 $1, $1, 31           # encoding: [0x00,0x01,0x0f,0xfc]
-# CHECK-TRAP: teq $25, $1, 6              # encoding: [0x03,0x21,0x01,0xb4]
-# CHECK-TRAP: mflo $25                    # encoding: [0x00,0x00,0xc8,0x12]
-
-  ddiv $24,$12
 # CHECK-TRAP: teq $12, $zero, 7           # encoding: [0x01,0x80,0x01,0xf4]
 # CHECK-TRAP: ddiv $zero, $24, $12        # encoding: [0x03,0x0c,0x00,0x1e]
 # CHECK-TRAP: addiu $1, $zero, -1         # encoding: [0x24,0x01,0xff,0xff]
-# CHECK-TRAP: bne $12, $1, 12             # encoding: [0x15,0x81,0x00,0x03]
+# CHECK-TRAP: bne $12, $1, .Ltmp1         # encoding: [0x15,0x81,A,A]
+# CHECK-TRAP:                             # fixup A - offset: 0, value: .Ltmp1-4, kind: fixup_Mips_PC16
 # CHECK-TRAP: addiu $1, $zero, 1          # encoding: [0x24,0x01,0x00,0x01]
 # CHECK-TRAP: dsll32 $1, $1, 31           # encoding: [0x00,0x01,0x0f,0xfc]
 # CHECK-TRAP: teq $24, $1, 6              # encoding: [0x03,0x01,0x01,0xb4]
+# CHECK-TRAP: .Ltmp1:
 # CHECK-TRAP: mflo $24                    # encoding: [0x00,0x00,0xc0,0x12]
 
   ddiv $25,$0
+# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
 # CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
 
   ddiv $0,$9
+# CHECK-NOTRAP: ddiv $zero, $zero, $9     # encoding: [0x00,0x09,0x00,0x1e]
+
 # CHECK-TRAP: teq $9, $zero, 7            # encoding: [0x01,0x20,0x01,0xf4]
 # CHECK-TRAP: ddiv $zero, $zero, $9       # encoding: [0x00,0x09,0x00,0x1e]
 # CHECK-TRAP: addiu $1, $zero, -1         # encoding: [0x24,0x01,0xff,0xff]
-# CHECK-TRAP: bne $9, $1, 12              # encoding: [0x15,0x21,0x00,0x03]
+# CHECK-TRAP: bne $9, $1, .Ltmp2          # encoding: [0x15,0x21,A,A]
+# CHECK-TRAP:                             # fixup A - offset: 0, value: .Ltmp2-4, kind: fixup_Mips_PC16
 # CHECK-TRAP: addiu $1, $zero, 1          # encoding: [0x24,0x01,0x00,0x01]
 # CHECK-TRAP: dsll32 $1, $1, 31           # encoding: [0x00,0x01,0x0f,0xfc]
 # CHECK-TRAP: teq $zero, $1, 6            # encoding: [0x00,0x01,0x01,0xb4]
+# CHECK-TRAP: .Ltmp2:
 # CHECK-TRAP: mflo $zero                  # encoding: [0x00,0x00,0x00,0x12]
 
   ddiv $0,$0
+# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
 # CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
 
-  ddiv  $4,$5,$6
+  ddiv $4,0
+# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
+
+  ddiv $0,0
+# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
+
+  ddiv $4,1
+# CHECK-NOTRAP: move $4, $4               # encoding: [0x00,0x80,0x20,0x25]
+# CHECK-TRAP: move $4, $4                 # encoding: [0x00,0x80,0x20,0x25]
+
+  ddiv $4,-1
+# CHECK-NOTRAP: dneg $4, $4               # encoding: [0x00,0x04,0x20,0x2e]
+# CHECK-TRAP: dneg $4, $4                 # encoding: [0x00,0x04,0x20,0x2e]
+
+  ddiv $4,2
+# CHECK-NOTRAP: addiu $1, $zero, 2        # encoding: [0x24,0x01,0x00,0x02]
+# CHECK-NOTRAP: ddiv $zero, $4, $1        # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: addiu $1, $zero, 2          # encoding: [0x24,0x01,0x00,0x02]
+# CHECK-TRAP: ddiv $zero, $4, $1          # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddiv $4,0x8000
+# CHECK-NOTRAP: ori $1, $zero, 32768      # encoding: [0x34,0x01,0x80,0x00]
+# CHECK-NOTRAP: ddiv $zero, $4, $1        # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: ori $1, $zero, 32768        # encoding: [0x34,0x01,0x80,0x00]
+# CHECK-TRAP: ddiv $zero, $4, $1          # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddiv $4,-0x8000
+# CHECK-NOTRAP: addiu $1, $zero, -32768   # encoding: [0x24,0x01,0x80,0x00]
+# CHECK-NOTRAP: ddiv $zero, $4, $1        # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: addiu $1, $zero, -32768     # encoding: [0x24,0x01,0x80,0x00]
+# CHECK-TRAP: ddiv $zero, $4, $1          # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddiv $4,0x10000
+# CHECK-NOTRAP: lui $1, 1                 # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-NOTRAP: ddiv $zero, $4, $1        # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 1                   # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-TRAP: ddiv $zero, $4, $1          # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddiv $4,0x1a5a5
+# CHECK-NOTRAP: lui $1, 1                 # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-NOTRAP: ori $1, $1, 42405         # encoding: [0x34,0x21,0xa5,0xa5]
+# CHECK-NOTRAP: ddiv $zero, $4, $1        # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 1                   # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-TRAP: ori $1, $1, 42405           # encoding: [0x34,0x21,0xa5,0xa5]
+# CHECK-TRAP: ddiv $zero, $4, $1          # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddiv $4,0xfffffff
+# CHECK-NOTRAP: lui $1, 4095              # encoding: [0x3c,0x01,0x0f,0xff]
+# CHECK-NOTRAP: ori $1, $1, 65535         # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-NOTRAP: ddiv $zero, $4, $1        # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 4095                # encoding: [0x3c,0x01,0x0f,0xff]
+# CHECK-TRAP: ori $1, $1, 65535           # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-TRAP: ddiv $zero, $4, $1          # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddiv $4,0x10000000
+# CHECK-NOTRAP: lui $1, 4096              # encoding: [0x3c,0x01,0x10,0x00]
+# CHECK-NOTRAP: ddiv $zero, $4, $1        # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 4096                # encoding: [0x3c,0x01,0x10,0x00]
+# CHECK-TRAP: ddiv $zero, $4, $1          # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddiv $4,0xfffffffe
+# CHECK-NOTRAP: ori $1, $zero, 65535      # encoding: [0x34,0x01,0xff,0xff]
+# CHECK-NOTRAP: dsll $1, $1, 16           # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-NOTRAP: ori $1, $1, 65534         # encoding: [0x34,0x21,0xff,0xfe]
+# CHECK-NOTRAP: ddiv $zero, $4, $1        # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: ori $1, $zero, 65535        # encoding: [0x34,0x01,0xff,0xff]
+# CHECK-TRAP: dsll $1, $1, 16             # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-TRAP: ori $1, $1, 65534           # encoding: [0x34,0x21,0xff,0xfe]
+# CHECK-TRAP: ddiv $zero, $4, $1          # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+
+  ddiv $4,0xffffffff
+# CHECK-NOTRAP: lui $1, 65535             # encoding: [0x3c,0x01,0xff,0xff]
+# CHECK-NOTRAP: dsrl32 $1, $1, 0          # encoding: [0x00,0x01,0x08,0x3e]
+# CHECK-NOTRAP: ddiv $zero, $4, $1        # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 65535               # encoding: [0x3c,0x01,0xff,0xff]
+# CHECK-TRAP: dsrl32 $1, $1, 0            # encoding: [0x00,0x01,0x08,0x3e]
+# CHECK-TRAP: ddiv $zero, $4, $1          # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddiv $4,0xfffffffff
+# CHECK-NOTRAP: addiu  $1, $zero, 15      # encoding: [0x24,0x01,0x00,0x0f]
+# CHECK-NOTRAP: dsll $1, $1, 16           # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-NOTRAP: ori $1, $1, 65535         # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-NOTRAP: dsll $1, $1, 16           # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-NOTRAP: ori $1, $1, 65535         # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-NOTRAP: ddiv $zero, $4, $1        # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: addiu  $1, $zero, 15        # encoding: [0x24,0x01,0x00,0x0f]
+# CHECK-TRAP: dsll $1, $1, 16             # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-TRAP: ori $1, $1, 65535           # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-TRAP: dsll $1, $1, 16             # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-TRAP: ori $1, $1, 65535           # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-TRAP: ddiv $zero, $4, $1          # encoding: [0x00,0x81,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddiv $4,$5,$6
+# CHECK-NOTRAP: bne $6, $zero, .Ltmp6     # encoding: [0x14,0xc0,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: .Ltmp6-4, kind: fixup_Mips_PC16
+# CHECK-NOTRAP: ddiv $zero, $5, $6        # encoding: [0x00,0xa6,0x00,0x1e]
+# CHECK-NOTRAP: break  7                  # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-NOTRAP: .Ltmp6:
+# CHECK-NOTRAP: addiu $1, $zero, -1       # encoding: [0x24,0x01,0xff,0xff]
+# CHECK-NOTRAP: bne $6, $1, .Ltmp7        # encoding: [0x14,0xc1,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: .Ltmp7-4, kind: fixup_Mips_PC16
+# CHECK-NOTRAP: addiu $1, $zero, 1        # encoding: [0x24,0x01,0x00,0x01]
+# CHECK-NOTRAP: dsll32 $1, $1, 31         # encoding: [0x00,0x01,0x0f,0xfc]
+# CHECK-NOTRAP: bne $5, $1, .Ltmp7        # encoding: [0x14,0xa1,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: .Ltmp7-4, kind: fixup_Mips_PC16
+# CHECK-NOTRAP: sll $zero, $zero, 0       # encoding: [0x00,0x00,0x00,0x00]
+# CHECK-NOTRAP: break  6                  # encoding: [0x00,0x06,0x00,0x0d]
+# CHECK-NOTRAP: .Ltmp7:
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+
 # CHECK-TRAP: teq $6, $zero, 7            # encoding: [0x00,0xc0,0x01,0xf4]
 # CHECK-TRAP: ddiv $zero, $5, $6          # encoding: [0x00,0xa6,0x00,0x1e]
 # CHECK-TRAP: addiu $1, $zero, -1         # encoding: [0x24,0x01,0xff,0xff]
-# CHECK-TRAP: bne $6, $1, 12              # encoding: [0x14,0xc1,0x00,0x03]
+# CHECK-TRAP: bne $6, $1, .Ltmp3          # encoding: [0x14,0xc1,A,A]
+# CHECK-TRAP:                             # fixup A - offset: 0, value: .Ltmp3-4, kind: fixup_Mips_PC16
 # CHECK-TRAP: addiu $1, $zero, 1          # encoding: [0x24,0x01,0x00,0x01]
 # CHECK-TRAP: dsll32 $1, $1, 31           # encoding: [0x00,0x01,0x0f,0xfc]
 # CHECK-TRAP: teq $5, $1, 6               # encoding: [0x00,0xa1,0x01,0xb4]
+# CHECK-TRAP: .Ltmp3:
 # CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
 
-  ddiv  $4,$5,$0
+  ddiv $4,$5,$0
+# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
 # CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
 
-  ddiv  $4,$0,$0
+  ddiv $4,$0,$0
+# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
 # CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
 
-  ddiv $0, $4, $5
+  ddiv $0,$4,$5
+# CHECK-NOTRAP: ddiv $zero, $4, $5        # encoding: [0x00,0x85,0x00,0x1e]
 # CHECK-TRAP: ddiv $zero, $4, $5          # encoding: [0x00,0x85,0x00,0x1e]
+
+  ddiv $4,$0,0
+# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
+
+  ddiv $4,$5,0
+# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
+
+  ddiv $4,$5,1
+# CHECK-NOTRAP: move $4, $5               # encoding: [0x00,0xa0,0x20,0x25]
+# CHECK-TRAP: move     $4, $5             # encoding: [0x00,0xa0,0x20,0x25]
+
+  ddiv $4,$5,-1
+# CHECK-NOTRAP: dneg $4, $5               # encoding: [0x00,0x05,0x20,0x2e]
+# CHECK-TRAP: dneg    $4, $5              # encoding: [0x00,0x05,0x20,0x2e]
+
+  ddiv $4,$5,2
+# CHECK-NOTRAP: addiu $1, $zero, 2        # encoding: [0x24,0x01,0x00,0x02]
+# CHECK-NOTRAP: ddiv $zero, $5, $1        # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: addiu $1, $zero, 2          # encoding: [0x24,0x01,0x00,0x02]
+# CHECK-TRAP: ddiv $zero, $5, $1          # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddiv $4,$5,0x8000
+# CHECK-NOTRAP: ori $1, $zero, 32768      # encoding: [0x34,0x01,0x80,0x00]
+# CHECK-NOTRAP: ddiv $zero, $5, $1        # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: ori $1, $zero, 32768        # encoding: [0x34,0x01,0x80,0x00]
+# CHECK-TRAP: ddiv $zero, $5, $1          # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddiv $4,$5,-0x8000
+# CHECK-NOTRAP: addiu $1, $zero, -32768   # encoding: [0x24,0x01,0x80,0x00]
+# CHECK-NOTRAP: ddiv $zero, $5, $1        # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: addiu $1, $zero, -32768     # encoding: [0x24,0x01,0x80,0x00]
+# CHECK-TRAP: ddiv $zero, $5, $1          # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddiv $4,$5,0x10000
+# CHECK-NOTRAP: lui $1, 1                 # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-NOTRAP: ddiv $zero, $5, $1        # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 1                   # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-TRAP: ddiv $zero, $5, $1          # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddiv $4,$5,0x1a5a5
+# CHECK-NOTRAP: lui $1, 1                 # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-NOTRAP: ori $1, $1, 42405         # encoding: [0x34,0x21,0xa5,0xa5]
+# CHECK-NOTRAP: ddiv $zero, $5, $1        # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 1                   # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-TRAP: ori $1, $1, 42405           # encoding: [0x34,0x21,0xa5,0xa5]
+# CHECK-TRAP: ddiv $zero, $5, $1          # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddiv $4,$5,0xfffffff
+# CHECK-NOTRAP: lui $1, 4095              # encoding: [0x3c,0x01,0x0f,0xff]
+# CHECK-NOTRAP: ori $1, $1, 65535         # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-NOTRAP: ddiv $zero, $5, $1        # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 4095                # encoding: [0x3c,0x01,0x0f,0xff]
+# CHECK-TRAP: ori $1, $1, 65535           # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-TRAP: ddiv $zero, $5, $1          # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddiv $4,$5,0x10000000
+# CHECK-NOTRAP: lui $1, 4096              # encoding: [0x3c,0x01,0x10,0x00]
+# CHECK-NOTRAP: ddiv $zero, $5, $1        # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 4096                # encoding: [0x3c,0x01,0x10,0x00]
+# CHECK-TRAP: ddiv $zero, $5, $1          # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddiv $4,$5,0xfffffffe
+# CHECK-NOTRAP: ori $1, $zero, 65535      # encoding: [0x34,0x01,0xff,0xff]
+# CHECK-NOTRAP: dsll $1, $1, 16           # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-NOTRAP: ori $1, $1, 65534         # encoding: [0x34,0x21,0xff,0xfe]
+# CHECK-NOTRAP: ddiv $zero, $5, $1        # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: ori $1, $zero, 65535        # encoding: [0x34,0x01,0xff,0xff]
+# CHECK-TRAP: dsll $1, $1, 16             # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-TRAP: ori $1, $1, 65534           # encoding: [0x34,0x21,0xff,0xfe]
+# CHECK-TRAP: ddiv $zero, $5, $1          # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddiv $4,$5,0xffffffff
+# CHECK-NOTRAP: lui $1, 65535             # encoding: [0x3c,0x01,0xff,0xff]
+# CHECK-NOTRAP: dsrl32 $1, $1, 0          # encoding: [0x00,0x01,0x08,0x3e]
+# CHECK-NOTRAP: ddiv $zero, $5, $1        # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 65535               # encoding: [0x3c,0x01,0xff,0xff]
+# CHECK-TRAP: dsrl32 $1, $1, 0            # encoding: [0x00,0x01,0x08,0x3e]
+# CHECK-TRAP: ddiv $zero, $5, $1          # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddiv $4,$5,0xfffffffff
+# CHECK-NOTRAP: addiu $1, $zero, 15       # encoding: [0x24,0x01,0x00,0x0f]
+# CHECK-NOTRAP: dsll $1, $1, 16           # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-NOTRAP: ori $1, $1, 65535         # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-NOTRAP: dsll $1, $1, 16           # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-NOTRAP: ori $1, $1, 65535         # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-NOTRAP: ddiv $zero, $5, $1        # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: addiu $1, $zero, 15         # encoding: [0x24,0x01,0x00,0x0f]
+# CHECK-TRAP: dsll $1, $1, 16             # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-TRAP: ori $1, $1, 65535           # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-TRAP: dsll $1, $1, 16             # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-TRAP: ori $1, $1, 65535           # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-TRAP: ddiv $zero, $5, $1          # encoding: [0x00,0xa1,0x00,0x1e]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
diff --git a/test/MC/Mips/macro-ddivu.s b/test/MC/Mips/macro-ddivu.s
index ff7e8c46d0be80cde6d942d29e7ed51080a41960..88998ac76be756ceb603de78b2db7df32e9f9dd3 100644
--- a/test/MC/Mips/macro-ddivu.s
+++ b/test/MC/Mips/macro-ddivu.s
@@ -1,98 +1,301 @@
-# RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips64r2 | \
+# RUN: llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r2 | \
 # RUN:   FileCheck %s --check-prefix=CHECK-NOTRAP
-# RUN: llvm-mc %s -triple=mips-unknown-linux -show-encoding -mcpu=mips64r2 \
+# RUN: llvm-mc %s -triple=mips64-unknown-linux -show-encoding -mcpu=mips64r2 \
 # RUN:  -mattr=+use-tcc-in-div | FileCheck %s --check-prefix=CHECK-TRAP
 
   ddivu $25,$11
-# CHECK-NOTRAP: bne $11, $zero, 8         # encoding: [0x15,0x60,0x00,0x02]
+# CHECK-NOTRAP: bne $11, $zero, .Ltmp0    # encoding: [0x15,0x60,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: .Ltmp0-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: ddivu $zero, $25, $11     # encoding: [0x03,0x2b,0x00,0x1f]
 # CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-NOTRAP: .Ltmp0:
 # CHECK-NOTRAP: mflo $25                  # encoding: [0x00,0x00,0xc8,0x12]
+# CHECK-TRAP: teq $11, $zero, 7           # encoding: [0x01,0x60,0x01,0xf4]
+# CHECK-TRAP: ddivu $zero, $25, $11       # encoding: [0x03,0x2b,0x00,0x1f]
+# CHECK-TRAP: mflo $25                    # encoding: [0x00,0x00,0xc8,0x12]
 
   ddivu $24,$12
-# CHECK-NOTRAP: bne $12, $zero, 8         # encoding: [0x15,0x80,0x00,0x02]
+# CHECK-NOTRAP: bne $12, $zero, .Ltmp1    # encoding: [0x15,0x80,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: .Ltmp1-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: ddivu $zero, $24, $12     # encoding: [0x03,0x0c,0x00,0x1f]
 # CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-NOTRAP: .Ltmp1:
 # CHECK-NOTRAP: mflo $24                  # encoding: [0x00,0x00,0xc0,0x12]
+# CHECK-TRAP: teq $12, $zero, 7           # encoding: [0x01,0x80,0x01,0xf4]
+# CHECK-TRAP: ddivu $zero, $24, $12       # encoding: [0x03,0x0c,0x00,0x1f]
+# CHECK-TRAP: mflo $24                    # encoding: [0x00,0x00,0xc0,0x12]
 
   ddivu $25,$0
-# CHECK-NOTRAP: bne $zero, $zero, 8       # encoding: [0x14,0x00,0x00,0x02]
-# CHECK-NOTRAP: ddivu $zero, $25, $zero   # encoding: [0x03,0x20,0x00,0x1f]
 # CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
-# CHECK-NOTRAP: mflo $25                  # encoding: [0x00,0x00,0xc8,0x12]
+# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
 
   ddivu $0,$9
-# CHECK-NOTRAP: bne $9, $zero, 8          # encoding: [0x15,0x20,0x00,0x02]
+# CHECK-NOTRAP: bne $9, $zero, .Ltmp2     # encoding: [0x15,0x20,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: .Ltmp2-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: ddivu $zero, $zero, $9    # encoding: [0x00,0x09,0x00,0x1f]
 # CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-NOTRAP: .Ltmp2:
 # CHECK-NOTRAP: mflo $zero                # encoding: [0x00,0x00,0x00,0x12]
+# CHECK-TRAP: teq $9, $zero, 7            # encoding: [0x01,0x20,0x01,0xf4]
+# CHECK-TRAP: ddivu $zero, $zero, $9      # encoding: [0x00,0x09,0x00,0x1f]
+# CHECK-TRAP: mflo $zero                  # encoding: [0x00,0x00,0x00,0x12]
 
   ddivu $0,$0
-# CHECK-NOTRAP: bne $zero, $zero, 8       # encoding: [0x14,0x00,0x00,0x02]
-# CHECK-NOTRAP: ddivu $zero, $zero, $zero # encoding: [0x00,0x00,0x00,0x1f]
 # CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
-# CHECK-NOTRAP: mflo $zero                # encoding: [0x00,0x00,0x00,0x12]
+# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
 
-  ddivu  $4,$5,$6
-# CHECK-NOTRAP: bne $6, $zero, 8          # encoding: [0x14,0xc0,0x00,0x02]
-# CHECK-NOTRAP: ddivu $zero, $5, $6       # encoding: [0x00,0xa6,0x00,0x1f]
+  ddivu $4,0
 # CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
-# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
 
-  ddivu $4,$5,$0
-# CHECK-NOTRAP: bne $zero, $zero, 8       # encoding: [0x14,0x00,0x00,0x02]
-# CHECK-NOTRAP: ddivu $zero, $5, $zero    # encoding: [0x00,0xa0,0x00,0x1f]
+  ddivu $0,0
 # CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
+
+  ddivu $4,1
+# CHECK-NOTRAP: move $4, $4               # encoding: [0x00,0x80,0x20,0x25]
+# CHECK-TRAP: move $4, $4                 # encoding: [0x00,0x80,0x20,0x25]
+
+  ddivu $4,-1
+# CHECK-NOTRAP: addiu $1, $zero, -1       # encoding: [0x24,0x01,0xff,0xff]
+# CHECK-NOTRAP: ddivu $zero, $4, $1       # encoding: [0x00,0x81,0x00,0x1f]
 # CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: addiu $1, $zero, -1         # encoding: [0x24,0x01,0xff,0xff]
+# CHECK-TRAP: ddivu $zero, $4, $1         # encoding: [0x00,0x81,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
 
-  ddivu $4,$0,$0
-# CHECK-NOTRAP: bne $zero, $zero, 8       # encoding: [0x14,0x00,0x00,0x02]
-# CHECK-NOTRAP: ddivu $zero, $zero, $zero # encoding: [0x00,0x00,0x00,0x1f]
-# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+  ddivu $4,0x8000
+# CHECK-NOTRAP: ori $1, $zero, 32768      # encoding: [0x34,0x01,0x80,0x00]
+# CHECK-NOTRAP: ddivu $zero, $4, $1       # encoding: [0x00,0x81,0x00,0x1f]
 # CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: ori $1, $zero, 32768        # encoding: [0x34,0x01,0x80,0x00]
+# CHECK-TRAP: ddivu $zero, $4, $1         # encoding: [0x00,0x81,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
 
-  ddivu $0, $4, $5
-# CHECK-NOTRAP: ddivu $zero, $4, $5       # encoding: [0x00,0x85,0x00,0x1f]
+  ddivu $4,-0x8000
+# CHECK-NOTRAP: addiu $1, $zero, -32768   # encoding: [0x24,0x01,0x80,0x00]
+# CHECK-NOTRAP: ddivu $zero, $4, $1       # encoding: [0x00,0x81,0x00,0x1f]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: addiu $1, $zero, -32768     # encoding: [0x24,0x01,0x80,0x00]
+# CHECK-TRAP: ddivu $zero, $4, $1         # encoding: [0x00,0x81,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
 
-  ddivu $25, $11
-# CHECK-TRAP: teq $11, $zero, 7           # encoding: [0x01,0x60,0x01,0xf4]
-# CHECK-TRAP: ddivu $zero, $25, $11       # encoding: [0x03,0x2b,0x00,0x1f]
-# CHECK-TRAP: mflo $25                    # encoding: [0x00,0x00,0xc8,0x12]
+  ddivu $4,0x10000
+# CHECK-NOTRAP: lui $1, 1                 # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-NOTRAP: ddivu $zero, $4, $1       # encoding: [0x00,0x81,0x00,0x1f]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 1                   # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-TRAP: ddivu $zero, $4, $1         # encoding: [0x00,0x81,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
 
-  ddivu $24,$12
-# CHECK-TRAP: teq $12, $zero, 7           # encoding: [0x01,0x80,0x01,0xf4]
-# CHECK-TRAP: ddivu $zero, $24, $12       # encoding: [0x03,0x0c,0x00,0x1f]
-# CHECK-TRAP: mflo $24                    # encoding: [0x00,0x00,0xc0,0x12]
+  ddivu $4,0x1a5a5
+# CHECK-NOTRAP: lui $1, 1                 # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-NOTRAP: ori $1, $1, 42405         # encoding: [0x34,0x21,0xa5,0xa5]
+# CHECK-NOTRAP: ddivu $zero, $4, $1       # encoding: [0x00,0x81,0x00,0x1f]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 1                   # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-TRAP: ori $1, $1, 42405           # encoding: [0x34,0x21,0xa5,0xa5]
+# CHECK-TRAP: ddivu $zero, $4, $1         # encoding: [0x00,0x81,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
 
-  ddivu $25,$0
-# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
-# CHECK-TRAP: ddivu $zero, $25, $zero     # encoding: [0x03,0x20,0x00,0x1f]
-# CHECK-TRAP: mflo $25                    # encoding: [0x00,0x00,0xc8,0x12]
+  ddivu $4,0xfffffff
+# CHECK-NOTRAP: lui $1, 4095              # encoding: [0x3c,0x01,0x0f,0xff]
+# CHECK-NOTRAP: ori $1, $1, 65535         # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-NOTRAP: ddivu $zero, $4, $1       # encoding: [0x00,0x81,0x00,0x1f]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 4095                # encoding: [0x3c,0x01,0x0f,0xff]
+# CHECK-TRAP: ori $1, $1, 65535           # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-TRAP: ddivu $zero, $4, $1         # encoding: [0x00,0x81,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
 
-  ddivu $0,$9
-# CHECK-TRAP: teq $9, $zero, 7            # encoding: [0x01,0x20,0x01,0xf4]
-# CHECK-TRAP: ddivu $zero, $zero, $9      # encoding: [0x00,0x09,0x00,0x1f]
-# CHECK-TRAP: mflo $zero                  # encoding: [0x00,0x00,0x00,0x12]
+  ddivu $4,0x10000000
+# CHECK-NOTRAP: lui $1, 4096              # encoding: [0x3c,0x01,0x10,0x00]
+# CHECK-NOTRAP: ddivu $zero, $4, $1       # encoding: [0x00,0x81,0x00,0x1f]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 4096                # encoding: [0x3c,0x01,0x10,0x00]
+# CHECK-TRAP: ddivu $zero, $4, $1         # encoding: [0x00,0x81,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
 
-  ddivu $0,$0
-# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
-# CHECK-TRAP: ddivu $zero, $zero, $zero   # encoding: [0x00,0x00,0x00,0x1f]
-# CHECK-TRAP: mflo $zero                  # encoding: [0x00,0x00,0x00,0x12]
+  ddivu $4,0xfffffffe
+# CHECK-NOTRAP: ori $1, $zero, 65535      # encoding: [0x34,0x01,0xff,0xff]
+# CHECK-NOTRAP: dsll $1, $1, 16           # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-NOTRAP: ori $1, $1, 65534         # encoding: [0x34,0x21,0xff,0xfe]
+# CHECK-NOTRAP: ddivu $zero, $4, $1       # encoding: [0x00,0x81,0x00,0x1f]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: ori $1, $zero, 65535        # encoding: [0x34,0x01,0xff,0xff]
+# CHECK-TRAP: dsll $1, $1, 16             # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-TRAP: ori $1, $1, 65534           # encoding: [0x34,0x21,0xff,0xfe]
+# CHECK-TRAP: ddivu $zero, $4, $1         # encoding: [0x00,0x81,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
 
-  ddivu  $4,$5,$6
+  ddivu $4,0xffffffff
+# CHECK-NOTRAP: lui $1, 65535             # encoding: [0x3c,0x01,0xff,0xff]
+# CHECK-NOTRAP: dsrl32 $1, $1, 0          # encoding: [0x00,0x01,0x08,0x3e]
+# CHECK-NOTRAP: ddivu $zero, $4, $1       # encoding: [0x00,0x81,0x00,0x1f]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 65535               # encoding: [0x3c,0x01,0xff,0xff]
+# CHECK-TRAP: dsrl32 $1, $1, 0            # encoding: [0x00,0x01,0x08,0x3e]
+# CHECK-TRAP: ddivu $zero, $4, $1         # encoding: [0x00,0x81,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddivu $4,0xfffffffff
+# CHECK-NOTRAP: addiu $1, $zero, 15       # encoding: [0x24,0x01,0x00,0x0f]
+# CHECK-NOTRAP: dsll $1, $1, 16           # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-NOTRAP: ori $1, $1, 65535         # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-NOTRAP: dsll $1, $1, 16           # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-NOTRAP: ori $1, $1, 65535         # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-NOTRAP: ddivu $zero, $4, $1       # encoding: [0x00,0x81,0x00,0x1f]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: addiu $1, $zero, 15         # encoding: [0x24,0x01,0x00,0x0f]
+# CHECK-TRAP: dsll $1, $1, 16             # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-TRAP: ori $1, $1, 65535           # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-TRAP: dsll $1, $1, 16             # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-TRAP: ori $1, $1, 65535           # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-TRAP: ddivu $zero, $4, $1         # encoding: [0x00,0x81,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddivu $4,$5,$6
+# CHECK-NOTRAP: bne $6, $zero, .Ltmp3     # encoding: [0x14,0xc0,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: .Ltmp3-4, kind: fixup_Mips_PC16
+# CHECK-NOTRAP: ddivu $zero, $5, $6       # encoding: [0x00,0xa6,0x00,0x1f]
+# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-NOTRAP: .Ltmp3:
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
 # CHECK-TRAP: teq $6, $zero, 7            # encoding: [0x00,0xc0,0x01,0xf4]
 # CHECK-TRAP: ddivu $zero, $5, $6         # encoding: [0x00,0xa6,0x00,0x1f]
 # CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
 
   ddivu $4,$5,$0
+# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
 # CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
-# CHECK-TRAP: ddivu $zero, $5, $zero      # encoding: [0x00,0xa0,0x00,0x1f]
-# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
 
   ddivu $4,$0,$0
+# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
 # CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
-# CHECK-TRAP: ddivu $zero, $zero, $zero   # encoding: [0x00,0x00,0x00,0x1f]
-# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
 
   ddivu $0, $4, $5
+# CHECK-NOTRAP: ddivu $zero, $4, $5       # encoding: [0x00,0x85,0x00,0x1f]
 # CHECK-TRAP: ddivu $zero, $4, $5         # encoding: [0x00,0x85,0x00,0x1f]
+
+  ddivu $4,$5,0
+# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
+
+  ddivu $4,$0,0
+# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
+
+  ddivu $0,$0,0
+# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
+
+  ddivu $4,$5,1
+# CHECK-NOTRAP: move $4, $5               # encoding: [0x00,0xa0,0x20,0x25]
+# CHECK-TRAP: move $4, $5                 # encoding: [0x00,0xa0,0x20,0x25]
+
+  ddivu $4,$5,-1
+# CHECK-NOTRAP: addiu $1, $zero, -1       # encoding: [0x24,0x01,0xff,0xff]
+# CHECK-NOTRAP: ddivu $zero, $5, $1       # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: addiu $1, $zero, -1         # encoding: [0x24,0x01,0xff,0xff]
+# CHECK-TRAP: ddivu $zero, $5, $1         # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddivu $4,$5,2
+# CHECK-NOTRAP: addiu $1, $zero, 2        # encoding: [0x24,0x01,0x00,0x02]
+# CHECK-NOTRAP: ddivu $zero, $5, $1       # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: addiu $1, $zero, 2          # encoding: [0x24,0x01,0x00,0x02]
+# CHECK-TRAP: ddivu $zero, $5, $1         # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddivu $4,$5,0x8000
+# CHECK-NOTRAP: ori $1, $zero, 32768      # encoding: [0x34,0x01,0x80,0x00]
+# CHECK-NOTRAP: ddivu $zero, $5, $1       # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: ori $1, $zero, 32768        # encoding: [0x34,0x01,0x80,0x00]
+# CHECK-TRAP: ddivu $zero, $5, $1         # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddivu $4,$5,-0x8000
+# CHECK-NOTRAP: addiu $1, $zero, -32768   # encoding: [0x24,0x01,0x80,0x00]
+# CHECK-NOTRAP: ddivu $zero, $5, $1       # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: addiu $1, $zero, -32768     # encoding: [0x24,0x01,0x80,0x00]
+# CHECK-TRAP: ddivu $zero, $5, $1         # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddivu $4,$5,0x10000
+# CHECK-NOTRAP: lui $1, 1                 # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-NOTRAP: ddivu $zero, $5, $1       # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 1                   # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-TRAP: ddivu $zero, $5, $1         # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddivu $4,$5,0x1a5a5
+# CHECK-NOTRAP: lui $1, 1                 # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-NOTRAP: ori $1, $1, 42405         # encoding: [0x34,0x21,0xa5,0xa5]
+# CHECK-NOTRAP: ddivu $zero, $5, $1       # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 1                   # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-TRAP: ori $1, $1, 42405           # encoding: [0x34,0x21,0xa5,0xa5]
+# CHECK-TRAP: ddivu $zero, $5, $1         # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddivu $4,$5,0xfffffff
+# CHECK-NOTRAP: lui $1, 4095              # encoding: [0x3c,0x01,0x0f,0xff]
+# CHECK-NOTRAP: ori $1, $1, 65535         # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-NOTRAP: ddivu $zero, $5, $1       # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 4095                # encoding: [0x3c,0x01,0x0f,0xff]
+# CHECK-TRAP: ori $1, $1, 65535           # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-TRAP: ddivu $zero, $5, $1         # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddivu $4,$5,0x10000000
+# CHECK-NOTRAP: lui $1, 4096              # encoding: [0x3c,0x01,0x10,0x00]
+# CHECK-NOTRAP: ddivu $zero, $5, $1       # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 4096                # encoding: [0x3c,0x01,0x10,0x00]
+# CHECK-TRAP: ddivu $zero, $5, $1         # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddivu $4,$5,0xfffffffe
+
+# CHECK-NOTRAP: ori $1, $zero, 65535      # encoding: [0x34,0x01,0xff,0xff]
+# CHECK-NOTRAP: dsll $1, $1, 16           # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-NOTRAP: ori $1, $1, 65534         # encoding: [0x34,0x21,0xff,0xfe]
+# CHECK-NOTRAP: ddivu $zero, $5, $1       # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: ori $1, $zero, 65535        # encoding: [0x34,0x01,0xff,0xff]
+# CHECK-TRAP: dsll $1, $1, 16             # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-TRAP: ori $1, $1, 65534           # encoding: [0x34,0x21,0xff,0xfe]
+# CHECK-TRAP: ddivu $zero, $5, $1         # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddivu $4,$5,0xffffffff
+# CHECK-NOTRAP: lui $1, 65535             # encoding: [0x3c,0x01,0xff,0xff]
+# CHECK-NOTRAP: dsrl32 $1, $1, 0          # encoding: [0x00,0x01,0x08,0x3e]
+# CHECK-NOTRAP: ddivu $zero, $5, $1       # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 65535               # encoding: [0x3c,0x01,0xff,0xff]
+# CHECK-TRAP: dsrl32 $1, $1, 0            # encoding: [0x00,0x01,0x08,0x3e]
+# CHECK-TRAP: ddivu $zero, $5, $1         # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
+
+  ddivu $4,$5,0xfffffffff
+# CHECK-NOTRAP: addiu $1, $zero, 15       # encoding: [0x24,0x01,0x00,0x0f]
+# CHECK-NOTRAP: dsll $1, $1, 16           # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-NOTRAP: ori $1, $1, 65535         # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-NOTRAP: dsll $1, $1, 16           # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-NOTRAP: ori $1, $1, 65535         # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-NOTRAP: ddivu $zero, $5, $1       # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: addiu $1, $zero, 15         # encoding: [0x24,0x01,0x00,0x0f]
+# CHECK-TRAP: dsll $1, $1, 16             # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-TRAP: ori $1, $1, 65535           # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-TRAP: dsll $1, $1, 16             # encoding: [0x00,0x01,0x0c,0x38]
+# CHECK-TRAP: ori $1, $1, 65535           # encoding: [0x34,0x21,0xff,0xff]
+# CHECK-TRAP: ddivu $zero, $5, $1         # encoding: [0x00,0xa1,0x00,0x1f]
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
diff --git a/test/MC/Mips/macro-div-bad.s b/test/MC/Mips/macro-div-bad.s
index 20ad39087a144f80c365f4271127dc1ea72b79f7..4d93a1a9a69d39fe99e33744e51cb6404eb696ea 100644
--- a/test/MC/Mips/macro-div-bad.s
+++ b/test/MC/Mips/macro-div-bad.s
@@ -8,7 +8,7 @@
 # RUN: FileCheck %s --check-prefix=NOT-R6
 
   .text
-  div $25, $11
+  div $25, 11
   # R6: :[[@LINE-1]]:3: error: instruction requires a CPU feature not currently enabled
 
   div $25, $0
diff --git a/test/MC/Mips/macro-div.s b/test/MC/Mips/macro-div.s
index 3ac763e17d7c1a7189aa7d27d08ff2cc4f6884fc..8ce30d745bcf590e9aa1f997551f3d2e85b0129d 100644
--- a/test/MC/Mips/macro-div.s
+++ b/test/MC/Mips/macro-div.s
@@ -4,100 +4,219 @@
 # RUN:  -mattr=+use-tcc-in-div | FileCheck %s --check-prefix=CHECK-TRAP
 
   div $25,$11
-# CHECK-NOTRAP: bnez $11, 8               # encoding: [0x15,0x60,0x00,0x02]
+# CHECK-NOTRAP: bnez $11, $tmp0           # encoding: [0x15,0x60,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: ($tmp0)-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: div $zero, $25, $11       # encoding: [0x03,0x2b,0x00,0x1a]
-# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-NOTRAP: break  7                  # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-NOTRAP: $tmp0:
 # CHECK-NOTRAP: addiu $1, $zero, -1       # encoding: [0x24,0x01,0xff,0xff]
-# CHECK-NOTRAP: bne $11, $1, 16           # encoding: [0x15,0x61,0x00,0x04]
+# CHECK-NOTRAP: bne $11, $1, $tmp1        # encoding: [0x15,0x61,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: ($tmp1)-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: lui $1, 32768             # encoding: [0x3c,0x01,0x80,0x00]
-# CHECK-NOTRAP: bne $25, $1, 8            # encoding: [0x17,0x21,0x00,0x02]
+# CHECK-NOTRAP: bne $25, $1, $tmp1        # encoding: [0x17,0x21,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: ($tmp1)-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: nop                       # encoding: [0x00,0x00,0x00,0x00]
 # CHECK-NOTRAP: break 6                   # encoding: [0x00,0x06,0x00,0x0d]
+# CHECK-NOTRAP: $tmp1:
 # CHECK-NOTRAP: mflo $25                  # encoding: [0x00,0x00,0xc8,0x12]
+# CHECK-TRAP: teq $11, $zero, 7           # encoding: [0x01,0x60,0x01,0xf4]
+# CHECK-TRAP: div $zero, $25, $11         # encoding: [0x03,0x2b,0x00,0x1a]
+# CHECK-TRAP: addiu $1, $zero, -1         # encoding: [0x24,0x01,0xff,0xff]
+# CHECK-TRAP: bne $11, $1, $tmp0          # encoding: [0x15,0x61,A,A]
+# CHECK-TRAP:                             # fixup A - offset: 0, value: ($tmp0)-4, kind: fixup_Mips_PC16
+# CHECK-TRAP: lui $1, 32768               # encoding: [0x3c,0x01,0x80,0x00]
+# CHECK-TRAP: teq $25, $1, 6              # encoding: [0x03,0x21,0x01,0xb4]
+# CHECK-TRAP: $tmp0:
+# CHECK-TRAP: mflo $25                    # encoding: [0x00,0x00,0xc8,0x12]
 
   div $24,$12
-# CHECK-NOTRAP: bnez $12, 8               # encoding: [0x15,0x80,0x00,0x02]
+# CHECK-NOTRAP: bnez $12, $tmp2           # encoding: [0x15,0x80,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: ($tmp2)-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: div $zero, $24, $12       # encoding: [0x03,0x0c,0x00,0x1a]
-# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-NOTRAP: break  7                  # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-NOTRAP: $tmp2:
 # CHECK-NOTRAP: addiu $1, $zero, -1       # encoding: [0x24,0x01,0xff,0xff]
-# CHECK-NOTRAP: bne $12, $1, 16           # encoding: [0x15,0x81,0x00,0x04]
+# CHECK-NOTRAP: bne $12, $1, $tmp3        # encoding: [0x15,0x81,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: ($tmp3)-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: lui $1, 32768             # encoding: [0x3c,0x01,0x80,0x00]
-# CHECK-NOTRAP: bne $24, $1, 8            # encoding: [0x17,0x01,0x00,0x02]
+# CHECK-NOTRAP: bne $24, $1, $tmp3        # encoding: [0x17,0x01,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: ($tmp3)-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: nop                       # encoding: [0x00,0x00,0x00,0x00]
-# CHECK-NOTRAP: break 6                   # encoding: [0x00,0x06,0x00,0x0d]
+# CHECK-NOTRAP: break  6                  # encoding: [0x00,0x06,0x00,0x0d]
+# CHECK-NOTRAP: $tmp3:
 # CHECK-NOTRAP: mflo $24                  # encoding: [0x00,0x00,0xc0,0x12]
+# CHECK-TRAP: teq $12, $zero, 7           # encoding: [0x01,0x80,0x01,0xf4]
+# CHECK-TRAP: div $zero, $24, $12         # encoding: [0x03,0x0c,0x00,0x1a]
+# CHECK-TRAP: addiu $1, $zero, -1         # encoding: [0x24,0x01,0xff,0xff]
+# CHECK-TRAP: bne $12, $1, $tmp1          # encoding: [0x15,0x81,A,A]
+# CHECK-TRAP:                             # fixup A - offset: 0, value: ($tmp1)-4, kind: fixup_Mips_PC16
+# CHECK-TRAP: lui $1, 32768               # encoding: [0x3c,0x01,0x80,0x00]
+# CHECK-TRAP: teq $24, $1, 6              # encoding: [0x03,0x01,0x01,0xb4]
+# CHECK-TRAP: $tmp1:
+# CHECK-TRAP: mflo $24                    # encoding: [0x00,0x00,0xc0,0x12]
 
   div $25,$0
 # CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
 
   div $0,$9
 # CHECK-NOTRAP: div $zero, $zero, $9      # encoding: [0x00,0x09,0x00,0x1a]
+# CHECK-TRAP: div $zero, $zero, $9        # encoding: [0x00,0x09,0x00,0x1a]
 
   div $0,$0
 # CHECK-NOTRAP: div $zero, $zero, $zero   # encoding: [0x00,0x00,0x00,0x1a]
+# CHECK-TRAP: div $zero, $zero, $zero     # encoding: [0x00,0x00,0x00,0x1a]
 
-  div  $4,$5,$6
-# CHECK-NOTRAP: bnez $6, 8                # encoding: [0x14,0xc0,0x00,0x02]
-# CHECK-NOTRAP: div $zero, $5, $6         # encoding: [0x00,0xa6,0x00,0x1a]
+  div $4,0
 # CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
+
+  div $0,0
+# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
+
+  div $4,1
+# CHECK-NOTRAP: move $4, $4               # encoding: [0x00,0x80,0x20,0x25]
+# CHECK-TRAP: move $4, $4                 # encoding: [0x00,0x80,0x20,0x25]
+
+  div $4,-1
+# CHECK-NOTRAP: neg  $4, $4               # encoding: [0x00,0x04,0x20,0x22]
+# CHECK-TRAP: neg  $4, $4                 # encoding: [0x00,0x04,0x20,0x22]
+
+  div $4,2
+# CHECK-NOTRAP: addiu $1, $zero, 2        # encoding: [0x24,0x01,0x00,0x02]
+# CHECK-NOTRAP: div $zero, $4, $1         # encoding: [0x00,0x81,0x00,0x1a]
+# CHECK-NOTRAP: mflo  $4                  # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: addiu $1, $zero, 2          # encoding: [0x24,0x01,0x00,0x02]
+# CHECK-TRAP: div $zero, $4, $1           # encoding: [0x00,0x81,0x00,0x1a]
+# CHECK-TRAP: mflo  $4                    # encoding: [0x00,0x00,0x20,0x12]
+
+  div $4,0x8000
+# CHECK-NOTRAP: ori $1, $zero, 32768      # encoding: [0x34,0x01,0x80,0x00]
+# CHECK-NOTRAP: div $zero, $4, $1         # encoding: [0x00,0x81,0x00,0x1a]
+# CHECK-NOTRAP: mflo  $4                  # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: ori $1, $zero, 32768        # encoding: [0x34,0x01,0x80,0x00]
+# CHECK-TRAP: div $zero, $4, $1           # encoding: [0x00,0x81,0x00,0x1a]
+# CHECK-TRAP: mflo  $4                    # encoding: [0x00,0x00,0x20,0x12]
+
+  div $4,-0x8000
+# CHECK-NOTRAP: addiu $1, $zero, -32768   # encoding: [0x24,0x01,0x80,0x00]
+# CHECK-NOTRAP: div $zero, $4, $1         # encoding: [0x00,0x81,0x00,0x1a]
+# CHECK-NOTRAP: mflo  $4                  # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: addiu $1, $zero, -32768     # encoding: [0x24,0x01,0x80,0x00]
+# CHECK-TRAP: div $zero, $4, $1           # encoding: [0x00,0x81,0x00,0x1a]
+# CHECK-TRAP: mflo  $4                    # encoding: [0x00,0x00,0x20,0x12]
+
+  div $4,0x10000
+# CHECK-NOTRAP: lui $1, 1                 # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-NOTRAP: div $zero, $4, $1         # encoding: [0x00,0x81,0x00,0x1a]
+# CHECK-NOTRAP: mflo  $4                  # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 1                   # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-TRAP: div $zero, $4, $1           # encoding: [0x00,0x81,0x00,0x1a]
+# CHECK-TRAP: mflo  $4                    # encoding: [0x00,0x00,0x20,0x12]
+
+  div $4,0x1a5a5
+# CHECK-NOTRAP: lui $1, 1                 # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-NOTRAP: ori $1, $1, 42405         # encoding: [0x34,0x21,0xa5,0xa5]
+# CHECK-NOTRAP: div $zero, $4, $1         # encoding: [0x00,0x81,0x00,0x1a]
+# CHECK-NOTRAP: mflo  $4                  # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 1                   # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-TRAP: ori $1, $1, 42405           # encoding: [0x34,0x21,0xa5,0xa5]
+# CHECK-TRAP: div $zero, $4, $1           # encoding: [0x00,0x81,0x00,0x1a]
+# CHECK-TRAP: mflo  $4                    # encoding: [0x00,0x00,0x20,0x12]
+
+  div $4,$5,$6
+# CHECK-NOTRAP: bnez $6, $tmp4            # encoding: [0x14,0xc0,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: ($tmp4)-4, kind: fixup_Mips_PC16
+# CHECK-NOTRAP: div $zero, $5, $6         # encoding: [0x00,0xa6,0x00,0x1a]
+# CHECK-NOTRAP: break  7                  # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-NOTRAP: $tmp4:
 # CHECK-NOTRAP: addiu $1, $zero, -1       # encoding: [0x24,0x01,0xff,0xff]
-# CHECK-NOTRAP: bne $6, $1, 16            # encoding: [0x14,0xc1,0x00,0x04]
+# CHECK-NOTRAP: bne $6, $1, $tmp5         # encoding: [0x14,0xc1,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: ($tmp5)-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: lui $1, 32768             # encoding: [0x3c,0x01,0x80,0x00]
-# CHECK-NOTRAP: bne $5, $1, 8             # encoding: [0x14,0xa1,0x00,0x02]
+# CHECK-NOTRAP: bne $5, $1, $tmp5         # encoding: [0x14,0xa1,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: ($tmp5)-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: nop                       # encoding: [0x00,0x00,0x00,0x00]
 # CHECK-NOTRAP: break 6                   # encoding: [0x00,0x06,0x00,0x0d]
+# CHECK-NOTRAP: $tmp5:
 # CHECK-NOTRAP: mflo $4                   # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: teq $6, $zero, 7            # encoding: [0x00,0xc0,0x01,0xf4]
+# CHECK-TRAP: div $zero, $5, $6           # encoding: [0x00,0xa6,0x00,0x1a]
+# CHECK-TRAP: addiu $1, $zero, -1         # encoding: [0x24,0x01,0xff,0xff]
+# CHECK-TRAP: bne $6, $1, $tmp2           # encoding: [0x14,0xc1,A,A]
+# CHECK-TRAP:                             # fixup A - offset: 0, value: ($tmp2)-4, kind: fixup_Mips_PC16
+# CHECK-TRAP: lui $1, 32768               # encoding: [0x3c,0x01,0x80,0x00]
+# CHECK-TRAP: teq $5, $1, 6               # encoding: [0x00,0xa1,0x01,0xb4]
+# CHECK-TRAP: $tmp2:
+# CHECK-TRAP: mflo $4                     # encoding: [0x00,0x00,0x20,0x12]
 
-  div  $4,$5,$0
+  div $4,$5,$0
 # CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
 
-  div  $4,$0,$0
+  div $4,$0,$0
 # CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
 
-  div $0, $4, $5
+  div $0,$4,$5
 # CHECK-NOTRAP: div $zero, $4, $5         # encoding: [0x00,0x85,0x00,0x1a]
+# CHECK-TRAP: div $zero, $4, $5           # encoding: [0x00,0x85,0x00,0x1a]
 
-  div $25, $11
-# CHECK-TRAP: teq $11, $zero, 7           # encoding: [0x01,0x60,0x01,0xf4]
-# CHECK-TRAP: div $zero, $25, $11         # encoding: [0x03,0x2b,0x00,0x1a]
-# CHECK-TRAP: addiu $1, $zero, -1         # encoding: [0x24,0x01,0xff,0xff]
-# CHECK-TRAP: bne $11, $1, 8              # encoding: [0x15,0x61,0x00,0x02]
-# CHECK-TRAP: lui $1, 32768               # encoding: [0x3c,0x01,0x80,0x00]
-# CHECK-TRAP: teq $25, $1, 6              # encoding: [0x03,0x21,0x01,0xb4]
-# CHECK-TRAP: mflo $25                    # encoding: [0x00,0x00,0xc8,0x12]
-
-  div $24,$12
-# CHECK-TRAP: teq $12, $zero, 7           # encoding: [0x01,0x80,0x01,0xf4]
-# CHECK-TRAP: div $zero, $24, $12         # encoding: [0x03,0x0c,0x00,0x1a]
-# CHECK-TRAP: addiu $1, $zero, -1         # encoding: [0x24,0x01,0xff,0xff]
-# CHECK-TRAP: bne $12, $1, 8              # encoding: [0x15,0x81,0x00,0x02]
-# CHECK-TRAP: lui $1, 32768               # encoding: [0x3c,0x01,0x80,0x00]
-# CHECK-TRAP: teq $24, $1, 6              # encoding: [0x03,0x01,0x01,0xb4]
-# CHECK-TRAP: mflo $24                    # encoding: [0x00,0x00,0xc0,0x12]
+  div $4,$5,0
+# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
 
-  div $25,$0
+  div $4,$0,0
+# CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
 # CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
 
-  div $0,$9
-# CHECK-TRAP: div $zero, $zero, $9        # encoding: [0x00,0x09,0x00,0x1a]
+  div $4,$5,1
+# CHECK-NOTRAP: move $4, $5               # encoding: [0x00,0xa0,0x20,0x25]
+# CHECK-TRAP: move $4, $5                 # encoding: [0x00,0xa0,0x20,0x25]
 
-  div $0,$0
-# CHECK-TRAP: div $zero, $zero, $zero     # encoding: [0x00,0x00,0x00,0x1a]
+  div $4,$5,-1
+# CHECK-NOTRAP: neg  $4, $5               # encoding: [0x00,0x05,0x20,0x22]
+# CHECK-TRAP: neg  $4, $5                 # encoding: [0x00,0x05,0x20,0x22]
 
-  div  $4,$5,$6
-# CHECK-TRAP: teq $6, $zero, 7            # encoding: [0x00,0xc0,0x01,0xf4]
-# CHECK-TRAP: div $zero, $5, $6           # encoding: [0x00,0xa6,0x00,0x1a]
-# CHECK-TRAP: addiu $1, $zero, -1         # encoding: [0x24,0x01,0xff,0xff]
-# CHECK-TRAP: bne $6, $1, 8               # encoding: [0x14,0xc1,0x00,0x02]
-# CHECK-TRAP: lui $1, 32768               # encoding: [0x3c,0x01,0x80,0x00]
-# CHECK-TRAP: teq $5, $1, 6               # encoding: [0x00,0xa1,0x01,0xb4]
+  div $4,$5,2
+# CHECK-NOTRAP: addiu $1, $zero, 2        # encoding: [0x24,0x01,0x00,0x02]
+# CHECK-NOTRAP: div $zero, $5, $1         # encoding: [0x00,0xa1,0x00,0x1a]
+# CHECK-NOTRAP: mflo  $4                  # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: addiu $1, $zero, 2          # encoding: [0x24,0x01,0x00,0x02]
+# CHECK-TRAP: div $zero, $5, $1           # encoding: [0x00,0xa1,0x00,0x1a]
 # CHECK-TRAP: mflo  $4                    # encoding: [0x00,0x00,0x20,0x12]
 
-  div  $4,$5,$0
-# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
+  div $4,$5,0x8000
+# CHECK-NOTRAP: ori $1, $zero, 32768      # encoding: [0x34,0x01,0x80,0x00]
+# CHECK-NOTRAP: div $zero, $5, $1         # encoding: [0x00,0xa1,0x00,0x1a]
+# CHECK-NOTRAP: mflo  $4                  # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: ori $1, $zero, 32768        # encoding: [0x34,0x01,0x80,0x00]
+# CHECK-TRAP: div $zero, $5, $1           # encoding: [0x00,0xa1,0x00,0x1a]
+# CHECK-TRAP: mflo  $4                    # encoding: [0x00,0x00,0x20,0x12]
 
-  div  $4,$0,$0
-# CHECK-TRAP: teq $zero, $zero, 7         # encoding: [0x00,0x00,0x01,0xf4]
+  div $4,$5,-0x8000
+# CHECK-NOTRAP: addiu $1, $zero, -32768   # encoding: [0x24,0x01,0x80,0x00]
+# CHECK-NOTRAP: div $zero, $5, $1         # encoding: [0x00,0xa1,0x00,0x1a]
+# CHECK-NOTRAP: mflo  $4                  # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: addiu $1, $zero, -32768     # encoding: [0x24,0x01,0x80,0x00]
+# CHECK-TRAP: div $zero, $5, $1           # encoding: [0x00,0xa1,0x00,0x1a]
+# CHECK-TRAP: mflo  $4                    # encoding: [0x00,0x00,0x20,0x12]
+
+  div $4,$5,0x10000
+# CHECK-NOTRAP: lui $1, 1                 # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-NOTRAP: div $zero, $5, $1         # encoding: [0x00,0xa1,0x00,0x1a]
+# CHECK-NOTRAP: mflo  $4                  # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 1                   # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-TRAP: div $zero, $5, $1           # encoding: [0x00,0xa1,0x00,0x1a]
+# CHECK-TRAP: mflo  $4                    # encoding: [0x00,0x00,0x20,0x12]
 
-  div $0, $4, $5
-# CHECK-TRAP: div $zero, $4, $5           # encoding: [0x00,0x85,0x00,0x1a]
\ No newline at end of file
+  div $4,$5,0x1a5a5
+# CHECK-NOTRAP: lui $1, 1                 # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-NOTRAP: ori $1, $1, 42405         # encoding: [0x34,0x21,0xa5,0xa5]
+# CHECK-NOTRAP: div $zero, $5, $1         # encoding: [0x00,0xa1,0x00,0x1a]
+# CHECK-NOTRAP: mflo  $4                  # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP: lui $1, 1                   # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-TRAP: ori $1, $1, 42405           # encoding: [0x34,0x21,0xa5,0xa5]
+# CHECK-TRAP: div $zero, $5, $1           # encoding: [0x00,0xa1,0x00,0x1a]
+# CHECK-TRAP: mflo  $4                    # encoding: [0x00,0x00,0x20,0x12]
diff --git a/test/MC/Mips/macro-divu-bad.s b/test/MC/Mips/macro-divu-bad.s
index 6eeaa614ff86b7d666c3597678462524f125edfc..b5b492ec682807cb7b7f5ecc9448a7c9224ac169 100644
--- a/test/MC/Mips/macro-divu-bad.s
+++ b/test/MC/Mips/macro-divu-bad.s
@@ -8,7 +8,7 @@
 # RUN: FileCheck %s --check-prefix=NOT-R6
 
   .text
-  divu $25, $11
+  divu $25, 11
   # R6: :[[@LINE-1]]:3: error: instruction requires a CPU feature not currently enabled
 
   divu $25, $0
diff --git a/test/MC/Mips/macro-divu.s b/test/MC/Mips/macro-divu.s
index d8137d5ba7336f1d119b083b5f2269fe34acf125..a3e8ae067c7479f63a113a7e2a2a60cc0c055154 100644
--- a/test/MC/Mips/macro-divu.s
+++ b/test/MC/Mips/macro-divu.s
@@ -4,22 +4,23 @@
 # RUN:  -mattr=+use-tcc-in-div | FileCheck %s --check-prefix=CHECK-TRAP
 
   divu $25,$11
-# CHECK-NOTRAP: bnez $11, 8               # encoding: [0x15,0x60,0x00,0x02]
+# CHECK-NOTRAP: bnez $11, $tmp0           # encoding: [0x15,0x60,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: ($tmp0)-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: divu $zero, $25, $11      # encoding: [0x03,0x2b,0x00,0x1b]
 # CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-NOTRAP: $tmp0:
 # CHECK-NOTRAP: mflo $25                  # encoding: [0x00,0x00,0xc8,0x12]
 
   divu $24,$12
-# CHECK-NOTRAP: bnez $12, 8               # encoding: [0x15,0x80,0x00,0x02]
+# CHECK-NOTRAP: bnez $12, $tmp1           # encoding: [0x15,0x80,A,A]
+# CHECK-NOTRAP:                           # fixup A - offset: 0, value: ($tmp1)-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: divu $zero, $24, $12      # encoding: [0x03,0x0c,0x00,0x1b]
 # CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-NOTRAP: $tmp1:
 # CHECK-NOTRAP: mflo $24                  # encoding: [0x00,0x00,0xc0,0x12]
 
   divu $25,$0
-# CHECK-NOTRAP: bnez $zero, 8             # encoding: [0x14,0x00,0x00,0x02]
-# CHECK-NOTRAP: divu $zero, $25, $zero    # encoding: [0x03,0x20,0x00,0x1b]
 # CHECK-NOTRAP: break 7                   # encoding: [0x00,0x07,0x00,0x0d]
-# CHECK-NOTRAP: mflo $25                  # encoding: [0x00,0x00,0xc8,0x12]
 
   divu $0,$9
 # CHECK-NOTRAP: divu $zero, $zero, $9     # encoding: [0x00,0x09,0x00,0x1b]
@@ -28,22 +29,18 @@
 # CHECK-NOTRAP: divu $zero, $zero, $zero  # encoding: [0x00,0x00,0x00,0x1b]
 
    divu $4,$5,$6
-# CHECK-NOTRAP: bnez $6, 8                 # encoding: [0x14,0xc0,0x00,0x02]
+# CHECK-NOTRAP: bnez $6, $tmp2             # encoding: [0x14,0xc0,A,A]
+# CHECK-NOTRAP:                            # fixup A - offset: 0, value: ($tmp2)-4, kind: fixup_Mips_PC16
 # CHECK-NOTRAP: divu $zero, $5, $6         # encoding: [0x00,0xa6,0x00,0x1b]
 # CHECK-NOTRAP: break 7                    # encoding: [0x00,0x07,0x00,0x0d]
+# CHECK-NOTRAP: $tmp2:
 # CHECK-NOTRAP: mflo $4                    # encoding: [0x00,0x00,0x20,0x12]
 
    divu $4,$5,$0
-# CHECK-NOTRAP: bnez $zero, 8              # encoding: [0x14,0x00,0x00,0x02]
-# CHECK-NOTRAP: divu $zero, $5, $zero      # encoding: [0x00,0xa0,0x00,0x1b]
 # CHECK-NOTRAP: break 7                    # encoding: [0x00,0x07,0x00,0x0d]
-# CHECK-NOTRAP: mflo $4                    # encoding: [0x00,0x00,0x20,0x12]
 
    divu $4,$0,$0
-# CHECK-NOTRAP: bnez $zero, 8              # encoding: [0x14,0x00,0x00,0x02]
-# CHECK-NOTRAP: divu $zero, $zero, $zero   # encoding: [0x00,0x00,0x00,0x1b]
 # CHECK-NOTRAP: break 7                    # encoding: [0x00,0x07,0x00,0x0d]
-# CHECK-NOTRAP: mflo $4                    # encoding: [0x00,0x00,0x20,0x12]
 
   divu $0, $4, $5
 # CHECK-NOTRAP: divu $zero, $4, $5         # encoding: [0x00,0x85,0x00,0x1b]
@@ -60,8 +57,6 @@
 
   divu $25,$0
 # CHECK-TRAP: teq $zero, $zero, 7          # encoding: [0x00,0x00,0x01,0xf4]
-# CHECK-TRAP: divu $zero, $25, $zero       # encoding: [0x03,0x20,0x00,0x1b]
-# CHECK-TRAP: mflo $25                     # encoding: [0x00,0x00,0xc8,0x12]
 
   divu $0,$9
 # CHECK-TRAP: divu $zero, $zero, $9        # encoding: [0x00,0x09,0x00,0x1b]
@@ -76,8 +71,6 @@
 
   divu $4,$5,$0
 # CHECK-TRAP: teq $zero, $zero, 7          # encoding: [0x00,0x00,0x01,0xf4]
-# CHECK-TRAP: divu $zero, $5, $zero        # encoding: [0x00,0xa0,0x00,0x1b]
-# CHECK-TRAP: mflo $4                      # encoding: [0x00,0x00,0x20,0x12]
 
   divu $4,$0,$0
 # CHECK-TRAP: teq $zero, $zero, 7          # encoding: [0x00,0x00,0x01,0xf4]
@@ -85,4 +78,4 @@
 # CHECK-TRAP: mflo $4                      # encoding: [0x00,0x00,0x20,0x12]
 
   divu $0, $4, $5
-# CHECK-TRAP: divu $zero, $4, $5           # encoding: [0x00,0x85,0x00,0x1b]
\ No newline at end of file
+# CHECK-TRAP: divu $zero, $4, $5           # encoding: [0x00,0x85,0x00,0x1b]
diff --git a/test/MC/Mips/macro-dla.s b/test/MC/Mips/macro-dla.s
index e3b558e9e5141c27524d47a0eecfa33b43afbafe..321af00c8be312d1ad2305862fe635591bd3deaf 100644
--- a/test/MC/Mips/macro-dla.s
+++ b/test/MC/Mips/macro-dla.s
@@ -702,6 +702,54 @@ dla $5, extern_sym+8($5) # CHECK: lui $1, %highest(extern_sym+8)       # encodin
                          # CHECK: daddiu $1, $1, %lo(extern_sym+8)     # encoding: [0x64,0x21,A,A]
                          # CHECK:                                      # fixup A - offset: 0, value: %lo(extern_sym+8), kind: fixup_Mips_LO16
                          # CHECK: daddu   $5, $1, $5                   # encoding: [0x00,0x25,0x28,0x2d]
+.set noat
+dla $5, extern_sym       # CHECK: lui $5, %highest(extern_sym)         # encoding: [0x3c,0x05,A,A]
+                         # CHECK:                                      # fixup A - offset: 0, value: %highest(extern_sym), kind: fixup_Mips_HIGHEST
+                         # CHECK: daddiu  $5, $5, %higher(extern_sym)  # encoding: [0x64,0xa5,A,A]
+                         # CHECK:                                      # fixup A - offset: 0, value: %higher(extern_sym), kind: fixup_Mips_HIGHER
+                         # CHECK: dsll  $5, $5, 16                     # encoding: [0x00,0x05,0x2c,0x38]
+                         # CHECK: daddiu  $5, $5, %hi(extern_sym)      # encoding: [0x64,0xa5,A,A]
+                         # CHECK:                                      # fixup A - offset: 0, value: %hi(extern_sym), kind: fixup_Mips_HI16
+                         # CHECK: dsll  $5, $5, 16                     # encoding: [0x00,0x05,0x2c,0x38]
+                         # CHECK: daddiu  $5, $5, %lo(extern_sym)      # encoding: [0x64,0xa5,A,A]
+                         # CHECK:                                      # fixup A - offset: 0, value: %lo(extern_sym), kind: fixup_Mips_LO16
+
+dla $5, extern_sym+8     # CHECK: lui $5, %highest(extern_sym+8)        # encoding: [0x3c,0x05,A,A]
+                         # CHECK:                                       # fixup A - offset: 0, value: %highest(extern_sym+8), kind: fixup_Mips_HIGHEST
+                         # CHECK: daddiu  $5, $5, %higher(extern_sym+8) # encoding: [0x64,0xa5,A,A]
+                         # CHECK:                                       # fixup A - offset: 0, value: %higher(extern_sym+8), kind: fixup_Mips_HIGHER
+                         # CHECK: dsll  $5, $5, 16                      # encoding: [0x00,0x05,0x2c,0x38]
+                         # CHECK: daddiu  $5, $5, %hi(extern_sym+8)     # encoding: [0x64,0xa5,A,A]
+                         # CHECK:                                       # fixup A - offset: 0, value: %hi(extern_sym+8), kind: fixup_Mips_HI16
+                         # CHECK: dsll  $5, $5, 16                      # encoding: [0x00,0x05,0x2c,0x38]
+                         # CHECK: daddiu  $5, $5, %lo(extern_sym+8)     # encoding: [0x64,0xa5,A,A]
+                         # CHECK:                                       # fixup A - offset: 0, value: %lo(extern_sym+8), kind: fixup_Mips_LO16
+
+dla $5, extern_sym($6)   # CHECK: lui $5, %highest(extern_sym)          # encoding: [0x3c,0x05,A,A]
+                         # CHECK:                                       # fixup A - offset: 0, value: %highest(extern_sym), kind: fixup_Mips_HIGHEST
+                         # CHECK: daddiu  $5, $5, %higher(extern_sym)   # encoding: [0x64,0xa5,A,A]
+                         # CHECK:                                       # fixup A - offset: 0, value: %higher(extern_sym), kind: fixup_Mips_HIGHER
+                         # CHECK: dsll  $5, $5, 16                      # encoding: [0x00,0x05,0x2c,0x38]
+                         # CHECK: daddiu  $5, $5, %hi(extern_sym)       # encoding: [0x64,0xa5,A,A]
+                         # CHECK:                                       # fixup A - offset: 0, value: %hi(extern_sym), kind: fixup_Mips_HI16
+                         # CHECK: dsll  $5, $5, 16                      # encoding: [0x00,0x05,0x2c,0x38]
+                         # CHECK: daddiu  $5, $5, %lo(extern_sym)       # encoding: [0x64,0xa5,A,A]
+                         # CHECK:                                       # fixup A - offset: 0, value: %lo(extern_sym), kind: fixup_Mips_LO16
+                         # CHECK: daddu $5, $5, $6                      # encoding: [0x00,0xa6,0x28,0x2d]
+
+dla $4, extern_sym+8($6) # CHECK: lui $4, %highest(extern_sym+8)        # encoding: [0x3c,0x04,A,A]
+                         # CHECK:                                       # fixup A - offset: 0, value: %highest(extern_sym+8), kind: fixup_Mips_HIGHEST
+                         # CHECK: daddiu  $4, $4, %higher(extern_sym+8) # encoding: [0x64,0x84,A,A]
+                         # CHECK:                                       # fixup A - offset: 0, value: %higher(extern_sym+8), kind: fixup_Mips_HIGHER
+                         # CHECK: dsll  $4, $4, 16                      # encoding: [0x00,0x04,0x24,0x38]
+                         # CHECK: daddiu  $4, $4, %hi(extern_sym+8)     # encoding: [0x64,0x84,A,A]
+                         # CHECK:                                       # fixup A - offset: 0, value: %hi(extern_sym+8), kind: fixup_Mips_HI16
+                         # CHECK: dsll  $4, $4, 16                      # encoding: [0x00,0x04,0x24,0x38]
+                         # CHECK: daddiu  $4, $4, %lo(extern_sym+8)     # encoding: [0x64,0x84,A,A]
+                         # CHECK:                                       # fixup A - offset: 0, value: %lo(extern_sym+8), kind: fixup_Mips_LO16
+                         # CHECK: daddu $4, $4, $6                      # encoding: [0x00,0x86,0x20,0x2d]
+
+.set at
 
 .option pic2
 #dla $5, symbol
diff --git a/test/MC/Mips/mips64-instalias-imm-expanding.s b/test/MC/Mips/mips64-instalias-imm-expanding.s
new file mode 100644
index 0000000000000000000000000000000000000000..80764ebd9746c5a26b8dcef4648aecdc359fbe64
--- /dev/null
+++ b/test/MC/Mips/mips64-instalias-imm-expanding.s
@@ -0,0 +1,741 @@
+# RUN: llvm-mc -triple mips64el-unknown-linux -show-encoding -print-imm-hex %s | FileCheck %s
+
+  .text
+text_label:
+# CHECK: text_label:
+  add $4, -0x80000000
+# CHECK-NEXT: lui    $1, 0x8000              # encoding: [0x00,0x80,0x01,0x3c]
+# CHECK-NEXT: add    $4, $4, $1              # encoding: [0x20,0x20,0x81,0x00]
+  add $4, -0x8001
+# CHECK-NEXT: lui    $1, 0xffff              # encoding: [0xff,0xff,0x01,0x3c]
+# CHECK-NEXT: ori    $1, $1, 0x7fff          # encoding: [0xff,0x7f,0x21,0x34]
+# CHECK-NEXT: add    $4, $4, $1              # encoding: [0x20,0x20,0x81,0x00]
+  add $4, -0x8000
+# CHECK-NEXT: addi   $4, $4, -0x8000         # encoding: [0x00,0x80,0x84,0x20]
+  add $4, 0
+# CHECK-NEXT: addi   $4, $4, 0x0             # encoding: [0x00,0x00,0x84,0x20]
+  add $4, 0xFFFF
+# CHECK-NEXT: ori    $1, $zero, 0xffff       # encoding: [0xff,0xff,0x01,0x34]
+# CHECK-NEXT: add    $4, $4, $1              # encoding: [0x20,0x20,0x81,0x00]
+  add $4, 0x10000
+# CHECK-NEXT: lui    $1, 0x1                 # encoding: [0x01,0x00,0x01,0x3c]
+# CHECK-NEXT: add    $4, $4, $1              # encoding: [0x20,0x20,0x81,0x00]
+  add $4, 0xFFFFFFFF # This should be sign-extended because it's a 32-bit add
+# CHECK-NEXT: addi   $4, $4, -0x1            # encoding: [0xff,0xff,0x84,0x20]
+  add $4, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK-NEXT: lui    $1, 0xff                # encoding: [0xff,0x00,0x01,0x3c]
+# CHECK-NEXT: ori    $1, $1, 0xff0f          # encoding: [0x0f,0xff,0x21,0x34]
+# CHECK-NEXT: add $4, $4, $1                 # encoding: [0x20,0x20,0x81,0x00]
+
+  add $4, $5, -0x80000000
+# CHECK:      lui    $4, 0x8000              # encoding: [0x00,0x80,0x04,0x3c]
+# CHECK-NEXT: add    $4, $4, $5              # encoding: [0x20,0x20,0x85,0x00]
+  add $4, $5, -0x8001
+# CHECK-NEXT: lui    $4, 0xffff              # encoding: [0xff,0xff,0x04,0x3c]
+# CHECK-NEXT: ori    $4, $4, 0x7fff          # encoding: [0xff,0x7f,0x84,0x34]
+# CHECK-NEXT: add    $4, $4, $5              # encoding: [0x20,0x20,0x85,0x00]
+  add $4, $5, -0x8000
+# CHECK-NEXT: addi   $4, $5, -0x8000         # encoding: [0x00,0x80,0xa4,0x20]
+  add $4, $5, 0
+# CHECK-NEXT: addi   $4, $5, 0x0             # encoding: [0x00,0x00,0xa4,0x20]
+  add $4, $5, 0xFFFF
+# CHECK-NEXT: ori    $4, $zero, 0xffff       # encoding: [0xff,0xff,0x04,0x34]
+# CHECK-NEXT: add    $4, $4, $5              # encoding: [0x20,0x20,0x85,0x00]
+  add $4, $5, 0x10000
+# CHECK-NEXT: lui    $4, 0x1                 # encoding: [0x01,0x00,0x04,0x3c]
+# CHECK-NEXT: add    $4, $4, $5              # encoding: [0x20,0x20,0x85,0x00]
+  add $4, $5, 0xFFFFFFFF # This should be sign-extended because it's a 32-bit addi
+# CHECK-NEXT: addi   $4, $5, -0x1            # encoding: [0xff,0xff,0xa4,0x20]
+  add $4, $5, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK-NEXT: lui     $4, 0xff                # encoding: [0xff,0x00,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0xff0f          # encoding: [0x0f,0xff,0x84,0x34]
+# CHECK-NEXT: add     $4, $4, $5              # encoding: [0x20,0x20,0x85,0x00]
+
+
+  addu $4, -0x80000000
+# CHECK:      lui    $1, 0x8000              # encoding: [0x00,0x80,0x01,0x3c]
+# CHECK-NEXT: addu   $4, $4, $1              # encoding: [0x21,0x20,0x81,0x00]
+  addu $4, -0x8001
+# CHECK-NEXT: lui    $1, 0xffff              # encoding: [0xff,0xff,0x01,0x3c]
+# CHECK-NEXT: ori    $1, $1, 0x7fff          # encoding: [0xff,0x7f,0x21,0x34]
+# CHECK-NEXT: addu   $4, $4, $1              # encoding: [0x21,0x20,0x81,0x00]
+  addu $4, -0x8000
+# CHECK-NEXT: addiu  $4, $4, -0x8000         # encoding: [0x00,0x80,0x84,0x24]
+  addu $4, 0
+# CHECK-NEXT: addiu  $4, $4, 0x0             # encoding: [0x00,0x00,0x84,0x24]
+  addu $4, 0xFFFF
+# CHECK-NEXT: ori    $1, $zero, 0xffff       # encoding: [0xff,0xff,0x01,0x34]
+# CHECK-NEXT: addu   $4, $4, $1              # encoding: [0x21,0x20,0x81,0x00]
+  addu $4, 0x10000
+# CHECK-NEXT: lui    $1, 0x1                 # encoding: [0x01,0x00,0x01,0x3c]
+# CHECK-NEXT: addu   $4, $4, $1              # encoding: [0x21,0x20,0x81,0x00]
+  addu $4, 0xFFFFFFFF # This should be sign-extended because it's a 32-bit add
+# CHECK-NEXT: addiu  $4, $4, -0x1            # encoding: [0xff,0xff,0x84,0x24]
+  addu $4, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK-NEXT: lui    $1, 0xff                # encoding: [0xff,0x00,0x01,0x3c]
+# CHECK-NEXT: ori    $1, $1, 0xff0f          # encoding: [0x0f,0xff,0x21,0x34]
+# CHECK-NEXT: addu   $4, $4, $1              # encoding: [0x21,0x20,0x81,0x00]
+
+  addu $4, $5, -0x80000000
+# CHECK:      lui    $4, 0x8000              # encoding: [0x00,0x80,0x04,0x3c]
+# CHECK-NEXT: addu   $4, $4, $5              # encoding: [0x21,0x20,0x85,0x00]
+  addu $4, $5, -0x8001
+# CHECK-NEXT: lui    $4, 0xffff              # encoding: [0xff,0xff,0x04,0x3c]
+# CHECK-NEXT: ori    $4, $4, 0x7fff          # encoding: [0xff,0x7f,0x84,0x34]
+# CHECK-NEXT: addu   $4, $4, $5              # encoding: [0x21,0x20,0x85,0x00]
+  addu $4, $5, -0x8000
+# CHECK-NEXT: addiu  $4, $5, -0x8000         # encoding: [0x00,0x80,0xa4,0x24]
+  addu $4, $5, 0
+# CHECK-NEXT: addiu  $4, $5, 0x0             # encoding: [0x00,0x00,0xa4,0x24]
+  addu $4, $5, 0xFFFF
+# CHECK-NEXT: ori    $4, $zero, 0xffff       # encoding: [0xff,0xff,0x04,0x34]
+# CHECK-NEXT: addu   $4, $4, $5              # encoding: [0x21,0x20,0x85,0x00]
+  addu $4, $5, 0x10000
+# CHECK-NEXT: lui    $4, 0x1                 # encoding: [0x01,0x00,0x04,0x3c]
+# CHECK-NEXT: addu   $4, $4, $5              # encoding: [0x21,0x20,0x85,0x00]
+  addu $4, $5, 0xFFFFFFFF # This should be sign-extended because it's a 32-bit add
+# CHECK-NEXT: addiu  $4, $5, -0x1              # encoding: [0xff,0xff,0xa4,0x24]
+  addu $4, $5, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK-NEXT: lui     $4, 0xff                # encoding: [0xff,0x00,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0xff0f          # encoding: [0x0f,0xff,0x84,0x34]
+# CHECK-NEXT: addu    $4, $4, $5              # encoding: [0x21,0x20,0x85,0x00]
+
+
+  and $4, -0x80000000
+# CHECK:      lui    $1, 0x8000              # encoding: [0x00,0x80,0x01,0x3c]
+# CHECK-NEXT: and    $4, $4, $1              # encoding: [0x24,0x20,0x81,0x00]
+  and $4, -0x8001
+# CHECK-NEXT: lui    $1, 0xffff              # encoding: [0xff,0xff,0x01,0x3c]
+# CHECK-NEXT: ori    $1, $1, 0x7fff          # encoding: [0xff,0x7f,0x21,0x34]
+# CHECK-NEXT: and    $4, $4, $1              # encoding: [0x24,0x20,0x81,0x00]
+  and $4, -0x8000
+# CHECK-NEXT: addiu   $1, $zero, -0x8000     # encoding: [0x00,0x80,0x01,0x24]
+# CHECK-NEXT: and     $4, $4, $1             # encoding: [0x24,0x20,0x81,0x00]
+  and $4, 0
+# CHECK-NEXT: andi   $4, $4, 0x0             # encoding: [0x00,0x00,0x84,0x30]
+  and $4, 0xFFFF
+# CHECK-NEXT: andi   $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x30]
+  and $4, 0x10000
+# CHECK-NEXT: lui    $1, 0x1                 # encoding: [0x01,0x00,0x01,0x3c]
+# CHECK-NEXT: and    $4, $4, $1              # encoding: [0x24,0x20,0x81,0x00]
+  and $4, 0xFFFFFFFF
+# CHECK-NEXT: lui     $1, 0xffff              # encoding: [0xff,0xff,0x01,0x3c]
+# CHECK-NEXT: dsrl32  $1, $1, 0x0             # encoding: [0x3e,0x08,0x01,0x00]
+# CHECK-NEXT: and     $4, $4, $1              # encoding: [0x24,0x20,0x81,0x00]
+  and $4, 0xF0000000
+# CHECK-NEXT: ori     $1, $zero, 0xf000       # encoding: [0x00,0xf0,0x01,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: and     $4, $4, $1              # encoding: [0x24,0x20,0x81,0x00]
+  and $4, 0x7FFFFFFF
+# CHECK-NEXT: lui     $1, 0x7fff              # encoding: [0xff,0x7f,0x01,0x3c]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: and     $4, $4, $1              # encoding: [0x24,0x20,0x81,0x00]
+  and $4, 0x7FFFFFFFFFFFFFFF
+# CHECK-NEXT: lui     $1, 0x7fff              # encoding: [0xff,0x7f,0x01,0x3c]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: and     $4, $4, $1              # encoding: [0x24,0x20,0x81,0x00]
+  and $4, 0xFFFFFFFFFFFFFFFF
+# CHECK-NEXT: addiu   $1, $zero, -0x1         # encoding: [0xff,0xff,0x01,0x24]
+# CHECK-NEXT: and     $4, $4, $1              # encoding: [0x24,0x20,0x81,0x00]
+  and $4, 0xF000000000000000
+# CHECK-NEXT: ori     $1, $zero, 0xf000       # encoding: [0x00,0xf0,0x01,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x30            # encoding: [0x3c,0x0c,0x01,0x00]
+# CHECK-NEXT: and     $4, $4, $1              # encoding: [0x24,0x20,0x81,0x00]
+  and $4, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK-NEXT: addiu   $1, $zero, -0x1         # encoding: [0xff,0xff,0x01,0x24]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xff            # encoding: [0xff,0x00,0x21,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xff0f          # encoding: [0x0f,0xff,0x21,0x34]
+# CHECK-NEXT: and     $4, $4, $1              # encoding: [0x24,0x20,0x81,0x00]
+
+  and $4, $5, -0x80000000
+# CHECK:      lui    $4, 0x8000              # encoding: [0x00,0x80,0x04,0x3c]
+# CHECK-NEXT: and    $4, $4, $5              # encoding: [0x24,0x20,0x85,0x00]
+  and $4, $5, -0x8001
+# CHECK-NEXT: lui     $4, 0xffff              # encoding: [0xff,0xff,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0x7fff          # encoding: [0xff,0x7f,0x84,0x34]
+# CHECK-NEXT: and     $4, $4, $5              # encoding: [0x24,0x20,0x85,0x00]
+  and $4, $5, -0x8000
+# CHECK-NEXT: addiu   $4, $zero, -0x8000      # encoding: [0x00,0x80,0x04,0x24]
+# CHECK-NEXT: and     $4, $4, $5              # encoding: [0x24,0x20,0x85,0x00]
+  and $4, $5, 0
+# CHECK-NEXT: andi    $4, $5, 0x0             # encoding: [0x00,0x00,0xa4,0x30]
+  and $4, $5, 0xFFFF
+# CHECK-NEXT: andi    $4, $5, 0xffff          # encoding: [0xff,0xff,0xa4,0x30]
+  and $4, $5, 0x10000
+# CHECK-NEXT: lui     $4, 0x1                 # encoding: [0x01,0x00,0x04,0x3c]
+# CHECK-NEXT: and     $4, $4, $5              # encoding: [0x24,0x20,0x85,0x00]
+  and $4, $5, 0xFFFFFFFF
+# CHECK-NEXT: lui     $4, 0xffff              # encoding: [0xff,0xff,0x04,0x3c]
+# CHECK-NEXT: dsrl32  $4, $4, 0x0             # encoding: [0x3e,0x20,0x04,0x00]
+# CHECK-NEXT: and     $4, $4, $5              # encoding: [0x24,0x20,0x85,0x00]
+  and $4, $5, 0xF0000000
+# CHECK-NEXT: ori     $4, $zero, 0xf000       # encoding: [0x00,0xf0,0x04,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: and     $4, $4, $5              # encoding: [0x24,0x20,0x85,0x00]
+  and $4, $5, 0x7FFFFFFF
+# CHECK-NEXT: lui     $4, 0x7fff              # encoding: [0xff,0x7f,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: and     $4, $4, $5              # encoding: [0x24,0x20,0x85,0x00]
+  and $4, $5, 0x7FFFFFFFFFFFFFFF
+# CHECK-NEXT: lui     $4, 0x7fff              # encoding: [0xff,0x7f,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: and     $4, $4, $5              # encoding: [0x24,0x20,0x85,0x00]
+  and $4, $5, 0xFFFFFFFFFFFFFFFF
+# CHECK-NEXT: addiu   $4, $zero, -0x1         # encoding: [0xff,0xff,0x04,0x24]
+# CHECK-NEXT: and     $4, $4, $5              # encoding: [0x24,0x20,0x85,0x00]
+  and $4, $5, 0xF000000000000000
+# CHECK-NEXT: ori     $4, $zero, 0xf000       # encoding: [0x00,0xf0,0x04,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x30            # encoding: [0x3c,0x24,0x04,0x00]
+# CHECK-NEXT: and     $4, $4, $5              # encoding: [0x24,0x20,0x85,0x00]
+  and $4, $5, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK-NEXT: addiu   $4, $zero, -0x1         # encoding: [0xff,0xff,0x04,0x24]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xff            # encoding: [0xff,0x00,0x84,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xff0f          # encoding: [0x0f,0xff,0x84,0x34]
+# CHECK-NEXT: and     $4, $4, $5              # encoding: [0x24,0x20,0x85,0x00]
+
+  or $4, -0x80000000
+# CHECK:      lui     $1, 0x8000              # encoding: [0x00,0x80,0x01,0x3c]
+# CHECK-NEXT: or      $4, $4, $1              # encoding: [0x25,0x20,0x81,0x00]
+  or $4, -0x8001
+# CHECK-NEXT: lui     $1, 0xffff              # encoding: [0xff,0xff,0x01,0x3c]
+# CHECK-NEXT: ori     $1, $1, 0x7fff          # encoding: [0xff,0x7f,0x21,0x34]
+# CHECK-NEXT: or      $4, $4, $1              # encoding: [0x25,0x20,0x81,0x00]
+  or $4, -0x8000
+# CHECK-NEXT: addiu   $1, $zero, -0x8000      # encoding: [0x00,0x80,0x01,0x24]
+# CHECK-NEXT: or      $4, $4, $1              # encoding: [0x25,0x20,0x81,0x00]
+  or $4, 0
+# CHECK-NEXT: ori     $4, $4, 0x0             # encoding: [0x00,0x00,0x84,0x34]
+  or $4, 0xFFFF
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+  or $4, 0x10000
+# CHECK-NEXT: lui     $1, 0x1                 # encoding: [0x01,0x00,0x01,0x3c]
+# CHECK-NEXT: or      $4, $4, $1              # encoding: [0x25,0x20,0x81,0x00]
+  or $4, 0xFFFFFFFF
+# CHECK-NEXT: lui     $1, 0xffff              # encoding: [0xff,0xff,0x01,0x3c]
+# CHECK-NEXT: dsrl32  $1, $1, 0x0             # encoding: [0x3e,0x08,0x01,0x00]
+# CHECK-NEXT: or      $4, $4, $1              # encoding: [0x25,0x20,0x81,0x00]
+  or $4, 0xF0000000
+# CHECK-NEXT: ori     $1, $zero, 0xf000       # encoding: [0x00,0xf0,0x01,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: or      $4, $4, $1              # encoding: [0x25,0x20,0x81,0x00]
+  or $4, 0x7FFFFFFF
+# CHECK-NEXT: lui     $1, 0x7fff              # encoding: [0xff,0x7f,0x01,0x3c]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: or      $4, $4, $1              # encoding: [0x25,0x20,0x81,0x00]
+  or $4, 0x7FFFFFFFFFFFFFFF
+# CHECK-NEXT: lui     $1, 0x7fff              # encoding: [0xff,0x7f,0x01,0x3c]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: or      $4, $4, $1              # encoding: [0x25,0x20,0x81,0x00]
+  or $4, 0xFFFFFFFFFFFFFFFF
+# CHECK-NEXT: addiu   $1, $zero, -0x1         # encoding: [0xff,0xff,0x01,0x24]
+# CHECK-NEXT: or      $4, $4, $1              # encoding: [0x25,0x20,0x81,0x00]
+  or $4, 0xF000000000000000
+# CHECK-NEXT: ori     $1, $zero, 0xf000       # encoding: [0x00,0xf0,0x01,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x30            # encoding: [0x3c,0x0c,0x01,0x00]
+# CHECK-NEXT: or      $4, $4, $1              # encoding: [0x25,0x20,0x81,0x00]
+  or $4, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK-NEXT: addiu   $1, $zero, -0x1         # encoding: [0xff,0xff,0x01,0x24]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xff            # encoding: [0xff,0x00,0x21,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xff0f          # encoding: [0x0f,0xff,0x21,0x34]
+# CHECK-NEXT: or      $4, $4, $1              # encoding: [0x25,0x20,0x81,0x00]
+
+  or $4, $5, -0x80000000
+# CHECK:      lui     $4, 0x8000              # encoding: [0x00,0x80,0x04,0x3c]
+# CHECK-NEXT: or      $4, $4, $5              # encoding: [0x25,0x20,0x85,0x00]
+  or $4, $5, -0x8001
+# CHECK-NEXT: lui     $4, 0xffff              # encoding: [0xff,0xff,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0x7fff          # encoding: [0xff,0x7f,0x84,0x34]
+# CHECK-NEXT: or      $4, $4, $5              # encoding: [0x25,0x20,0x85,0x00]
+  or $4, $5, -0x8000
+# CHECK-NEXT: addiu   $4, $zero, -0x8000      # encoding: [0x00,0x80,0x04,0x24]
+# CHECK-NEXT: or      $4, $4, $5              # encoding: [0x25,0x20,0x85,0x00]
+  or $4, $5, 0
+# CHECK-NEXT: ori     $4, $5, 0x0             # encoding: [0x00,0x00,0xa4,0x34]
+  or $4, $5, 0xFFFF
+# CHECK-NEXT: ori     $4, $5, 0xffff          # encoding: [0xff,0xff,0xa4,0x34]
+  or $4, $5, 0x10000
+# CHECK-NEXT: lui     $4, 0x1                 # encoding: [0x01,0x00,0x04,0x3c]
+# CHECK-NEXT: or      $4, $4, $5              # encoding: [0x25,0x20,0x85,0x00]
+  or $4, $5, 0xFFFFFFFF
+# CHECK-NEXT: lui     $4, 0xffff              # encoding: [0xff,0xff,0x04,0x3c]
+# CHECK-NEXT: dsrl32  $4, $4, 0x0             # encoding: [0x3e,0x20,0x04,0x00]
+# CHECK-NEXT: or      $4, $4, $5              # encoding: [0x25,0x20,0x85,0x00]
+  or $4, $5, 0xF0000000
+# CHECK-NEXT: ori     $4, $zero, 0xf000       # encoding: [0x00,0xf0,0x04,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: or      $4, $4, $5              # encoding: [0x25,0x20,0x85,0x00]
+  or $4, $5, 0x7FFFFFFF
+# CHECK-NEXT: lui     $4, 0x7fff              # encoding: [0xff,0x7f,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: or      $4, $4, $5              # encoding: [0x25,0x20,0x85,0x00]
+  or $4, $5, 0x7FFFFFFFFFFFFFFF
+# CHECK-NEXT: lui     $4, 0x7fff              # encoding: [0xff,0x7f,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: or      $4, $4, $5              # encoding: [0x25,0x20,0x85,0x00]
+  or $4, $5, 0xFFFFFFFFFFFFFFFF
+# CHECK-NEXT: addiu   $4, $zero, -0x1         # encoding: [0xff,0xff,0x04,0x24]
+# CHECK-NEXT: or      $4, $4, $5              # encoding: [0x25,0x20,0x85,0x00]
+  or $4, $5, 0xF000000000000000
+# CHECK-NEXT: ori     $4, $zero, 0xf000       # encoding: [0x00,0xf0,0x04,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x30            # encoding: [0x3c,0x24,0x04,0x00]
+# CHECK-NEXT: or      $4, $4, $5              # encoding: [0x25,0x20,0x85,0x00]
+  or $4, $5, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK-NEXT: addiu   $4, $zero, -0x1         # encoding: [0xff,0xff,0x04,0x24]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xff            # encoding: [0xff,0x00,0x84,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xff0f          # encoding: [0x0f,0xff,0x84,0x34]
+# CHECK-NEXT: or      $4, $4, $5              # encoding: [0x25,0x20,0x85,0x00]
+
+  xor $4, -0x80000000
+# CHECK:      lui     $1, 0x8000              # encoding: [0x00,0x80,0x01,0x3c]
+# CHECK-NEXT: xor     $4, $4, $1              # encoding: [0x26,0x20,0x81,0x00]
+  xor $4, -0x8001
+# CHECK-NEXT: lui     $1, 0xffff              # encoding: [0xff,0xff,0x01,0x3c]
+# CHECK-NEXT: ori     $1, $1, 0x7fff          # encoding: [0xff,0x7f,0x21,0x34]
+# CHECK-NEXT: xor     $4, $4, $1              # encoding: [0x26,0x20,0x81,0x00]
+  xor $4, -0x8000
+# CHECK-NEXT: addiu   $1, $zero, -0x8000      # encoding: [0x00,0x80,0x01,0x24]
+# CHECK-NEXT: xor     $4, $4, $1              # encoding: [0x26,0x20,0x81,0x00]
+  xor $4, 0
+# CHECK-NEXT: xori    $4, $4, 0x0             # encoding: [0x00,0x00,0x84,0x38]
+  xor $4, 0xFFFF
+# CHECK-NEXT: xori    $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x38]
+  xor $4, 0x10000
+# CHECK-NEXT: lui     $1, 0x1                 # encoding: [0x01,0x00,0x01,0x3c]
+# CHECK-NEXT: xor     $4, $4, $1              # encoding: [0x26,0x20,0x81,0x00]
+  xor $4, 0xFFFFFFFF
+# CHECK-NEXT: lui     $1, 0xffff              # encoding: [0xff,0xff,0x01,0x3c]
+# CHECK-NEXT: dsrl32  $1, $1, 0x0             # encoding: [0x3e,0x08,0x01,0x00]
+# CHECK-NEXT: xor     $4, $4, $1              # encoding: [0x26,0x20,0x81,0x00]
+  xor $4, 0xF0000000
+# CHECK-NEXT: ori     $1, $zero, 0xf000       # encoding: [0x00,0xf0,0x01,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: xor     $4, $4, $1              # encoding: [0x26,0x20,0x81,0x00]
+  xor $4, 0x7FFFFFFF
+# CHECK-NEXT: lui     $1, 0x7fff              # encoding: [0xff,0x7f,0x01,0x3c]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: xor     $4, $4, $1              # encoding: [0x26,0x20,0x81,0x00]
+  xor $4, 0x7FFFFFFFFFFFFFFF
+# CHECK-NEXT: lui     $1, 0x7fff              # encoding: [0xff,0x7f,0x01,0x3c]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: xor     $4, $4, $1              # encoding: [0x26,0x20,0x81,0x00]
+  xor $4, 0xFFFFFFFFFFFFFFFF
+# CHECK-NEXT: addiu   $1, $zero, -0x1         # encoding: [0xff,0xff,0x01,0x24]
+# CHECK-NEXT: xor     $4, $4, $1              # encoding: [0x26,0x20,0x81,0x00]
+  xor $4, 0xF000000000000000
+# CHECK-NEXT: ori     $1, $zero, 0xf000       # encoding: [0x00,0xf0,0x01,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x30            # encoding: [0x3c,0x0c,0x01,0x00]
+# CHECK-NEXT: xor     $4, $4, $1              # encoding: [0x26,0x20,0x81,0x00]
+  xor $4, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK-NEXT: addiu   $1, $zero, -0x1         # encoding: [0xff,0xff,0x01,0x24]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xff            # encoding: [0xff,0x00,0x21,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xff0f          # encoding: [0x0f,0xff,0x21,0x34]
+# CHECK-NEXT: xor     $4, $4, $1              # encoding: [0x26,0x20,0x81,0x00]
+
+  xor $4, $5, -0x80000000
+# CHECK:      lui     $4, 0x8000              # encoding: [0x00,0x80,0x04,0x3c]
+# CHECK-NEXT: xor     $4, $4, $5              # encoding: [0x26,0x20,0x85,0x00]
+  xor $4, $5, -0x8001
+# CHECK-NEXT: lui     $4, 0xffff              # encoding: [0xff,0xff,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0x7fff          # encoding: [0xff,0x7f,0x84,0x34]
+# CHECK-NEXT: xor     $4, $4, $5              # encoding: [0x26,0x20,0x85,0x00]
+  xor $4, $5, -0x8000
+# CHECK-NEXT: addiu   $4, $zero, -0x8000      # encoding: [0x00,0x80,0x04,0x24]
+# CHECK-NEXT: xor     $4, $4, $5              # encoding: [0x26,0x20,0x85,0x00]
+  xor $4, $5, 0
+# CHECK-NEXT: xori    $4, $5, 0x0             # encoding: [0x00,0x00,0xa4,0x38]
+  xor $4, $5, 0xFFFF
+# CHECK-NEXT: xori    $4, $5, 0xffff          # encoding: [0xff,0xff,0xa4,0x38]
+  xor $4, $5, 0x10000
+# CHECK-NEXT: lui     $4, 0x1                 # encoding: [0x01,0x00,0x04,0x3c]
+# CHECK-NEXT: xor     $4, $4, $5              # encoding: [0x26,0x20,0x85,0x00]
+  xor $4, $5, 0xFFFFFFFF
+# CHECK-NEXT: lui     $4, 0xffff              # encoding: [0xff,0xff,0x04,0x3c]
+# CHECK-NEXT: dsrl32  $4, $4, 0x0             # encoding: [0x3e,0x20,0x04,0x00]
+# CHECK-NEXT: xor     $4, $4, $5              # encoding: [0x26,0x20,0x85,0x00]
+  xor $4, $5, 0xF0000000
+# CHECK-NEXT: ori     $4, $zero, 0xf000       # encoding: [0x00,0xf0,0x04,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: xor     $4, $4, $5              # encoding: [0x26,0x20,0x85,0x00]
+  xor $4, $5, 0x7FFFFFFF
+# CHECK-NEXT: lui     $4, 0x7fff              # encoding: [0xff,0x7f,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: xor     $4, $4, $5              # encoding: [0x26,0x20,0x85,0x00]
+  xor $4, $5, 0x7FFFFFFFFFFFFFFF
+# FIXME: this is awfully inefficient...
+# CHECK-NEXT: lui     $4, 0x7fff              # encoding: [0xff,0x7f,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: xor     $4, $4, $5              # encoding: [0x26,0x20,0x85,0x00]
+  xor $4, $5, 0xFFFFFFFFFFFFFFFF
+# CHECK-NEXT: addiu   $4, $zero, -0x1         # encoding: [0xff,0xff,0x04,0x24]
+# CHECK-NEXT: xor     $4, $4, $5              # encoding: [0x26,0x20,0x85,0x00]
+  xor $4, $5, 0xF000000000000000
+# CHECK-NEXT: ori     $4, $zero, 0xf000       # encoding: [0x00,0xf0,0x04,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x30            # encoding: [0x3c,0x24,0x04,0x00]
+# CHECK-NEXT: xor     $4, $4, $5              # encoding: [0x26,0x20,0x85,0x00]
+  xor $4, $5, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK-NEXT: addiu   $4, $zero, -0x1         # encoding: [0xff,0xff,0x04,0x24]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xff            # encoding: [0xff,0x00,0x84,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xff0f          # encoding: [0x0f,0xff,0x84,0x34]
+# CHECK-NEXT: xor     $4, $4, $5              # encoding: [0x26,0x20,0x85,0x00]
+
+  nor $4, 0
+# CHECK:      addiu   $1, $zero, 0x0          # encoding: [0x00,0x00,0x01,0x24]
+# CHECK-NEXT: nor     $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+  nor $4, 1
+# CHECK-NEXT: addiu   $1, $zero, 0x1          # encoding: [0x01,0x00,0x01,0x24]
+# CHECK-NEXT: nor     $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+  nor $4, 0x8000
+# CHECK-NEXT: ori     $1, $zero, 0x8000       # encoding: [0x00,0x80,0x01,0x34]
+# CHECK-NEXT: nor     $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+  nor $4, -0x8000
+# CHECK-NEXT: addiu   $1, $zero, -0x8000      # encoding: [0x00,0x80,0x01,0x24]
+# CHECK-NEXT: nor     $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+  nor $4, 0x10000
+# CHECK-NEXT: lui     $1, 0x1                 # encoding: [0x01,0x00,0x01,0x3c]
+# CHECK-NEXT: nor     $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+  nor $4, 0x1a5a5
+# CHECK-NEXT: lui     $1, 0x1                 # encoding: [0x01,0x00,0x01,0x3c]
+# CHECK-NEXT: ori     $1, $1, 0xa5a5          # encoding: [0xa5,0xa5,0x21,0x34]
+# CHECK-NEXT: nor     $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+  nor $4, 0xFFFFFFFF
+# CHECK-NEXT: lui     $1, 0xffff              # encoding: [0xff,0xff,0x01,0x3c]
+# CHECK-NEXT: dsrl32  $1, $1, 0x0             # encoding: [0x3e,0x08,0x01,0x00]
+# CHECK-NEXT: nor     $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+  nor $4, 0xF0000000
+# CHECK-NEXT: ori     $1, $zero, 0xf000       # encoding: [0x00,0xf0,0x01,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: nor     $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+  nor $4, 0x7FFFFFFF
+# CHECK-NEXT: lui     $1, 0x7fff              # encoding: [0xff,0x7f,0x01,0x3c]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: nor     $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+  nor $4, 0x7FFFFFFFFFFFFFFF
+# CHECK-NEXT: lui     $1, 0x7fff              # encoding: [0xff,0x7f,0x01,0x3c]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: nor     $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+  nor $4, 0xFFFFFFFFFFFFFFFF
+# CHECK-NEXT: addiu   $1, $zero, -0x1         # encoding: [0xff,0xff,0x01,0x24]
+# CHECK-NEXT: nor     $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+  nor $4, 0xF000000000000000
+# CHECK-NEXT: ori     $1, $zero, 0xf000       # encoding: [0x00,0xf0,0x01,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x30            # encoding: [0x3c,0x0c,0x01,0x00]
+# CHECK-NEXT: nor     $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+  nor $4, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK-NEXT: addiu   $1, $zero, -0x1         # encoding: [0xff,0xff,0x01,0x24]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xff            # encoding: [0xff,0x00,0x21,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xff0f          # encoding: [0x0f,0xff,0x21,0x34]
+# CHECK-NEXT: nor     $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+  nor $4, 0xff00ff00
+# CHECK-NEXT: ori     $1, $zero, 0xff00       # encoding: [0x00,0xff,0x01,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xff00          # encoding: [0x00,0xff,0x21,0x34]
+# CHECK-NEXT: nor     $4, $4, $1              # encoding: [0x27,0x20,0x81,0x00]
+
+  nor $4, $5, 0
+# CHECK:      addiu   $4, $zero, 0x0          # encoding: [0x00,0x00,0x04,0x24]
+# CHECK-NEXT: nor     $4, $4, $5              # encoding: [0x27,0x20,0x85,0x00]
+  nor $4, $5, 1
+# CHECK-NEXT: addiu   $4, $zero, 0x1          # encoding: [0x01,0x00,0x04,0x24]
+# CHECK-NEXT: nor     $4, $4, $5              # encoding: [0x27,0x20,0x85,0x00]
+  nor $4, $5, 0x8000
+# CHECK-NEXT: ori     $4, $zero, 0x8000       # encoding: [0x00,0x80,0x04,0x34]
+# CHECK-NEXT: nor     $4, $4, $5              # encoding: [0x27,0x20,0x85,0x00]
+  nor $4, $5, -0x8000
+# CHECK-NEXT: addiu   $4, $zero, -0x8000      # encoding: [0x00,0x80,0x04,0x24]
+# CHECK-NEXT: nor     $4, $4, $5              # encoding: [0x27,0x20,0x85,0x00]
+  nor $4, $5, 0x10000
+# CHECK-NEXT: lui     $4, 0x1                 # encoding: [0x01,0x00,0x04,0x3c]
+# CHECK-NEXT: nor     $4, $4, $5              # encoding: [0x27,0x20,0x85,0x00]
+  nor $4, $5, 0x1a5a5
+# CHECK-NEXT: lui     $4, 0x1                 # encoding: [0x01,0x00,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0xa5a5          # encoding: [0xa5,0xa5,0x84,0x34]
+# CHECK-NEXT: nor     $4, $4, $5              # encoding: [0x27,0x20,0x85,0x00]
+  nor $4, $5, 0xFFFFFFFF
+# CHECK-NEXT: lui     $4, 0xffff              # encoding: [0xff,0xff,0x04,0x3c]
+# CHECK-NEXT: dsrl32  $4, $4, 0x0             # encoding: [0x3e,0x20,0x04,0x00]
+# CHECK-NEXT: nor     $4, $4, $5              # encoding: [0x27,0x20,0x85,0x00]
+  nor $4, $5, 0xF0000000
+# CHECK-NEXT: ori     $4, $zero, 0xf000       # encoding: [0x00,0xf0,0x04,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: nor     $4, $4, $5              # encoding: [0x27,0x20,0x85,0x00]
+  nor $4, $5, 0x7FFFFFFF
+# CHECK-NEXT: lui     $4, 0x7fff              # encoding: [0xff,0x7f,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: nor     $4, $4, $5              # encoding: [0x27,0x20,0x85,0x00]
+  nor $4, $5, 0x7FFFFFFFFFFFFFFF
+# FIXME: this is awfully inefficient...
+# CHECK-NEXT: lui     $4, 0x7fff              # encoding: [0xff,0x7f,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: nor     $4, $4, $5              # encoding: [0x27,0x20,0x85,0x00]
+  nor $4, $5, 0xFFFFFFFFFFFFFFFF
+# CHECK-NEXT: addiu   $4, $zero, -0x1         # encoding: [0xff,0xff,0x04,0x24]
+# CHECK-NEXT: nor     $4, $4, $5              # encoding: [0x27,0x20,0x85,0x00]
+  nor $4, $5, 0xF000000000000000
+# CHECK-NEXT: ori     $4, $zero, 0xf000       # encoding: [0x00,0xf0,0x04,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x30            # encoding: [0x3c,0x24,0x04,0x00]
+# CHECK-NEXT: nor     $4, $4, $5              # encoding: [0x27,0x20,0x85,0x00]
+  nor $4, $5, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK-NEXT: addiu   $4, $zero, -0x1         # encoding: [0xff,0xff,0x04,0x24]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xff            # encoding: [0xff,0x00,0x84,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xff0f          # encoding: [0x0f,0xff,0x84,0x34]
+# CHECK-NEXT: nor     $4, $4, $5              # encoding: [0x27,0x20,0x85,0x00]
+  nor $4, $5, 0xff00ff00
+# CHECK-NEXT: ori     $4, $zero, 0xff00       # encoding: [0x00,0xff,0x04,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xff00          # encoding: [0x00,0xff,0x84,0x34]
+# CHECK-NEXT: nor     $4, $4, $5              # encoding: [0x27,0x20,0x85,0x00]
+
+
+  slt $4, -0x80000000
+# CHECK:      lui     $1, 0x8000              # encoding: [0x00,0x80,0x01,0x3c]
+# CHECK-NEXT: slt     $4, $4, $1              # encoding: [0x2a,0x20,0x81,0x00]
+  slt $4, -0x8001
+# CHECK-NEXT: lui     $1, 0xffff              # encoding: [0xff,0xff,0x01,0x3c]
+# CHECK-NEXT: ori     $1, $1, 0x7fff          # encoding: [0xff,0x7f,0x21,0x34]
+# CHECK-NEXT: slt     $4, $4, $1              # encoding: [0x2a,0x20,0x81,0x00]
+  slt $4, -0x8000
+# CHECK-NEXT: slti    $4, $4, -0x8000         # encoding: [0x00,0x80,0x84,0x28]
+  slt $4, 0
+# CHECK-NEXT: slti    $4, $4, 0x0             # encoding: [0x00,0x00,0x84,0x28]
+  slt $4, 0xFFFF
+# CHECK-NEXT: ori     $1, $zero, 0xffff       # encoding: [0xff,0xff,0x01,0x34]
+# CHECK-NEXT: slt     $4, $4, $1              # encoding: [0x2a,0x20,0x81,0x00]
+  slt $4, 0x10000
+# CHECK-NEXT: lui     $1, 0x1                 # encoding: [0x01,0x00,0x01,0x3c]
+# CHECK-NEXT: slt     $4, $4, $1              # encoding: [0x2a,0x20,0x81,0x00]
+  slt $4, 0xFFFFFFFF
+# CHECK-NEXT: lui     $1, 0xffff              # encoding: [0xff,0xff,0x01,0x3c]
+# CHECK-NEXT: dsrl32  $1, $1, 0x0             # encoding: [0x3e,0x08,0x01,0x00]
+# CHECK-NEXT: slt     $4, $4, $1              # encoding: [0x2a,0x20,0x81,0x00]
+  slt $4, 0xF0000000
+# CHECK-NEXT: ori     $1, $zero, 0xf000       # encoding: [0x00,0xf0,0x01,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: slt     $4, $4, $1              # encoding: [0x2a,0x20,0x81,0x00]
+  slt $4, 0x7FFFFFFF
+# CHECK-NEXT: lui     $1, 0x7fff              # encoding: [0xff,0x7f,0x01,0x3c]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: slt     $4, $4, $1              # encoding: [0x2a,0x20,0x81,0x00]
+  slt $4, 0x7FFFFFFFFFFFFFFF
+# CHECK-NEXT: lui     $1, 0x7fff              # encoding: [0xff,0x7f,0x01,0x3c]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: slt     $4, $4, $1              # encoding: [0x2a,0x20,0x81,0x00]
+  slt $4, 0xFFFFFFFFFFFFFFFF
+# CHECK-NEXT: slti    $4, $4, -0x1            # encoding: [0xff,0xff,0x84,0x28]
+  slt $4, 0xF000000000000000
+# CHECK-NEXT: ori     $1, $zero, 0xf000       # encoding: [0x00,0xf0,0x01,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x30            # encoding: [0x3c,0x0c,0x01,0x00]
+# CHECK-NEXT: slt     $4, $4, $1              # encoding: [0x2a,0x20,0x81,0x00]
+  slt $4, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK-NEXT: addiu   $1, $zero, -0x1         # encoding: [0xff,0xff,0x01,0x24]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xff            # encoding: [0xff,0x00,0x21,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xff0f          # encoding: [0x0f,0xff,0x21,0x34]
+# CHECK-NEXT: slt     $4, $4, $1              # encoding: [0x2a,0x20,0x81,0x00]
+
+  slt $4, $5, -0x80000000
+# CHECK:      lui     $4, 0x8000              # encoding: [0x00,0x80,0x04,0x3c]
+# CHECK-NEXT: slt     $4, $4, $5              # encoding: [0x2a,0x20,0x85,0x00]
+  slt $4, $5, -0x8001
+# CHECK-NEXT: lui     $4, 0xffff              # encoding: [0xff,0xff,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0x7fff          # encoding: [0xff,0x7f,0x84,0x34]
+# CHECK-NEXT: slt     $4, $4, $5              # encoding: [0x2a,0x20,0x85,0x00]
+  slt $4, $5, -0x8000
+# CHECK-NEXT: slti    $4, $5, -0x8000         # encoding: [0x00,0x80,0xa4,0x28]
+  slt $4, $5, 0
+# CHECK-NEXT: slti    $4, $5, 0x0             # encoding: [0x00,0x00,0xa4,0x28]
+  slt $4, $5, 0xFFFF
+# CHECK-NEXT: ori     $4, $zero, 0xffff       # encoding: [0xff,0xff,0x04,0x34]
+# CHECK-NEXT: slt     $4, $4, $5              # encoding: [0x2a,0x20,0x85,0x00]
+  slt $4, $5, 0x10000
+# CHECK-NEXT: lui     $4, 0x1                 # encoding: [0x01,0x00,0x04,0x3c]
+# CHECK-NEXT: slt     $4, $4, $5              # encoding: [0x2a,0x20,0x85,0x00]
+  slt $4, $5, 0xFFFFFFFF
+# CHECK-NEXT: lui     $4, 0xffff              # encoding: [0xff,0xff,0x04,0x3c]
+# CHECK-NEXT: dsrl32  $4, $4, 0x0             # encoding: [0x3e,0x20,0x04,0x00]
+# CHECK-NEXT: slt     $4, $4, $5              # encoding: [0x2a,0x20,0x85,0x00]
+  slt $4, $5, 0xF0000000
+# CHECK-NEXT: ori     $4, $zero, 0xf000       # encoding: [0x00,0xf0,0x04,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: slt     $4, $4, $5              # encoding: [0x2a,0x20,0x85,0x00]
+  slt $4, $5, 0x7FFFFFFF
+# CHECK-NEXT: lui     $4, 0x7fff              # encoding: [0xff,0x7f,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: slt     $4, $4, $5              # encoding: [0x2a,0x20,0x85,0x00]
+  slt $4, $5, 0x7FFFFFFFFFFFFFFF
+# FIXME: this is awfully inefficient...
+# CHECK-NEXT: lui     $4, 0x7fff              # encoding: [0xff,0x7f,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: slt     $4, $4, $5              # encoding: [0x2a,0x20,0x85,0x00]
+  slt $4, $5, 0xFFFFFFFFFFFFFFFF
+# CHECK-NEXT: slti    $4, $5, -0x1            # encoding: [0xff,0xff,0xa4,0x28]
+  slt $4, $5, 0xF000000000000000
+# CHECK-NEXT: ori     $4, $zero, 0xf000       # encoding: [0x00,0xf0,0x04,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x30            # encoding: [0x3c,0x24,0x04,0x00]
+# CHECK-NEXT: slt $4, $4, $5                  # encoding: [0x2a,0x20,0x85,0x00]
+  slt $4, $5, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK-NEXT: addiu   $4, $zero, -0x1         # encoding: [0xff,0xff,0x04,0x24]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xff            # encoding: [0xff,0x00,0x84,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xff0f          # encoding: [0x0f,0xff,0x84,0x34]
+# CHECK-NEXT: slt     $4, $4, $5              # encoding: [0x2a,0x20,0x85,0x00]
+
+  sltu $4, -0x80000000
+# CHECK:      lui     $1, 0x8000              # encoding: [0x00,0x80,0x01,0x3c]
+# CHECK-NEXT: sltu    $4, $4, $1              # encoding: [0x2b,0x20,0x81,0x00]
+  sltu $4, -0x8001
+# CHECK-NEXT: lui     $1, 0xffff              # encoding: [0xff,0xff,0x01,0x3c]
+# CHECK-NEXT: ori     $1, $1, 0x7fff          # encoding: [0xff,0x7f,0x21,0x34]
+# CHECK-NEXT: sltu    $4, $4, $1              # encoding: [0x2b,0x20,0x81,0x00]
+  sltu $4, -0x8000
+# CHECK-NEXT:   sltiu   $4, $4, -0x8000       # encoding: [0x00,0x80,0x84,0x2c]
+  sltu $4, 0
+# CHECK-NEXT: sltiu $4, $4, 0x0               # encoding: [0x00,0x00,0x84,0x2c]
+  sltu $4, 0xFFFF
+# CHECK-NEXT: ori     $1, $zero, 0xffff       # encoding: [0xff,0xff,0x01,0x34]
+# CHECK-NEXT: sltu    $4, $4, $1              # encoding: [0x2b,0x20,0x81,0x00]
+  sltu $4, 0x10000
+# CHECK-NEXT: lui     $1, 0x1                 # encoding: [0x01,0x00,0x01,0x3c]
+# CHECK-NEXT: sltu    $4, $4, $1              # encoding: [0x2b,0x20,0x81,0x00]
+  sltu $4, 0xFFFFFFFF
+# CHECK-NEXT: lui     $1, 0xffff              # encoding: [0xff,0xff,0x01,0x3c]
+# CHECK-NEXT: dsrl32  $1, $1, 0x0             # encoding: [0x3e,0x08,0x01,0x00]
+# CHECK-NEXT: sltu    $4, $4, $1              # encoding: [0x2b,0x20,0x81,0x00]
+  sltu $4, 0xF0000000
+# CHECK-NEXT: ori     $1, $zero, 0xf000       # encoding: [0x00,0xf0,0x01,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: sltu    $4, $4, $1              # encoding: [0x2b,0x20,0x81,0x00]
+  sltu $4, 0x7FFFFFFF
+# CHECK-NEXT: lui     $1, 0x7fff              # encoding: [0xff,0x7f,0x01,0x3c]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: sltu    $4, $4, $1              # encoding: [0x2b,0x20,0x81,0x00]
+  sltu $4, 0x7FFFFFFFFFFFFFFF
+# CHECK-NEXT: lui     $1, 0x7fff              # encoding: [0xff,0x7f,0x01,0x3c]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xffff          # encoding: [0xff,0xff,0x21,0x34]
+# CHECK-NEXT: sltu    $4, $4, $1              # encoding: [0x2b,0x20,0x81,0x00]
+  sltu $4, 0xFFFFFFFFFFFFFFFF
+# CHECK-NEXT: sltiu   $4, $4, -0x1            # encoding: [0xff,0xff,0x84,0x2c]
+  sltu $4, 0xF000000000000000
+# CHECK-NEXT: ori     $1, $zero, 0xf000       # encoding: [0x00,0xf0,0x01,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x30            # encoding: [0x3c,0x0c,0x01,0x00]
+# CHECK-NEXT: sltu    $4, $4, $1              # encoding: [0x2b,0x20,0x81,0x00]
+  sltu $4, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK-NEXT: addiu   $1, $zero, -0x1         # encoding: [0xff,0xff,0x01,0x24]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xff            # encoding: [0xff,0x00,0x21,0x34]
+# CHECK-NEXT: dsll    $1, $1, 0x10            # encoding: [0x38,0x0c,0x01,0x00]
+# CHECK-NEXT: ori     $1, $1, 0xff0f          # encoding: [0x0f,0xff,0x21,0x34]
+# CHECK-NEXT: sltu    $4, $4, $1              # encoding: [0x2b,0x20,0x81,0x00]
+
+  sltu $4, $5, -0x80000000
+# CHECK:      lui     $4, 0x8000              # encoding: [0x00,0x80,0x04,0x3c]
+# CHECK-NEXT: sltu    $4, $4, $5              # encoding: [0x2b,0x20,0x85,0x00]
+  sltu $4, $5, -0x8001
+# CHECK-NEXT: lui     $4, 0xffff              # encoding: [0xff,0xff,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0x7fff          # encoding: [0xff,0x7f,0x84,0x34]
+# CHECK-NEXT: sltu    $4, $4, $5              # encoding: [0x2b,0x20,0x85,0x00]
+  sltu $4, $5, -0x8000
+# CHECK-NEXT: sltiu   $4, $5, -0x8000         # encoding: [0x00,0x80,0xa4,0x2c]
+  sltu $4, $5, 0
+# CHECK-NEXT: sltiu   $4, $5, 0x0             # encoding: [0x00,0x00,0xa4,0x2c]
+  sltu $4, $5, 0xFFFF
+# CHECK-NEXT: ori     $4, $zero, 0xffff       # encoding: [0xff,0xff,0x04,0x34]
+# CHECK-NEXT: sltu    $4, $4, $5              # encoding: [0x2b,0x20,0x85,0x00]
+  sltu $4, $5, 0x10000
+# CHECK-NEXT: lui     $4, 0x1                 # encoding: [0x01,0x00,0x04,0x3c]
+# CHECK-NEXT: sltu    $4, $4, $5              # encoding: [0x2b,0x20,0x85,0x00]
+  sltu $4, $5, 0xFFFFFFFF
+# CHECK-NEXT: lui     $4, 0xffff              # encoding: [0xff,0xff,0x04,0x3c]
+# CHECK-NEXT: dsrl32  $4, $4, 0x0             # encoding: [0x3e,0x20,0x04,0x00]
+# CHECK-NEXT: sltu    $4, $4, $5              # encoding: [0x2b,0x20,0x85,0x00]
+  sltu $4, $5, 0xF0000000
+# CHECK-NEXT: ori     $4, $zero, 0xf000       # encoding: [0x00,0xf0,0x04,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: sltu    $4, $4, $5              # encoding: [0x2b,0x20,0x85,0x00]
+  sltu $4, $5, 0x7FFFFFFF
+# CHECK-NEXT: lui     $4, 0x7fff              # encoding: [0xff,0x7f,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: sltu    $4, $4, $5              # encoding: [0x2b,0x20,0x85,0x00]
+  sltu $4, $5, 0x7FFFFFFFFFFFFFFF
+# FIXME: this is awfully inefficient...
+# CHECK-NEXT: lui     $4, 0x7fff              # encoding: [0xff,0x7f,0x04,0x3c]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xffff          # encoding: [0xff,0xff,0x84,0x34]
+# CHECK-NEXT: sltu    $4, $4, $5              # encoding: [0x2b,0x20,0x85,0x00]
+  sltu $4, $5, 0xFFFFFFFFFFFFFFFF
+# CHECK-NEXT: sltiu   $4, $5, -0x1            # encoding: [0xff,0xff,0xa4,0x2c]
+  sltu $4, $5, 0xF000000000000000
+# CHECK-NEXT: ori     $4, $zero, 0xf000       # encoding: [0x00,0xf0,0x04,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x30            # encoding: [0x3c,0x24,0x04,0x00]
+# CHECK-NEXT: sltu    $4, $4, $5              # encoding: [0x2b,0x20,0x85,0x00]
+  sltu $4, $5, ~(0xf0000000|0x0f000000|0x000000f0)
+# CHECK-NEXT: addiu   $4, $zero, -0x1         # encoding: [0xff,0xff,0x04,0x24]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xff            # encoding: [0xff,0x00,0x84,0x34]
+# CHECK-NEXT: dsll    $4, $4, 0x10            # encoding: [0x38,0x24,0x04,0x00]
+# CHECK-NEXT: ori     $4, $4, 0xff0f          # encoding: [0x0f,0xff,0x84,0x34]
+# CHECK-NEXT: sltu    $4, $4, $5              # encoding: [0x2b,0x20,0x85,0x00]
diff --git a/test/MC/Mips/mips64extins.s b/test/MC/Mips/mips64extins.s
index 3f1973bf52deacd2282ccb052995fde2eebb0e9d..5bd18ff62d5e5be20be07c27e36567b245d84b41 100644
--- a/test/MC/Mips/mips64extins.s
+++ b/test/MC/Mips/mips64extins.s
@@ -5,5 +5,5 @@
         dextu $2, $4, 34, 6  # CHECK: dextu ${{[0-9]+}}, ${{[0-9]+}}, 34, 6
         dextm $2, $4, 5, 34  # CHECK: dextm ${{[0-9]+}}, ${{[0-9]+}}, 5, 34
         dins $4, $5, 8, 10   # CHECK: dins ${{[0-9]+}}, ${{[0-9]+}}, 8, 10
-        dinsm $4, $5, 10, 1  # CHECK: dinsm ${{[0-9]+}}, ${{[0-9]+}}, 10, 1
+        dinsm $4, $5, 30, 6  # CHECK: dinsm ${{[0-9]+}}, ${{[0-9]+}}, 30, 6
         dinsu $4, $5, 40, 13 # CHECK: dinsu ${{[0-9]+}}, ${{[0-9]+}}, 40, 13
diff --git a/test/MC/Mips/mul-macro-variants.s b/test/MC/Mips/mul-macro-variants.s
new file mode 100644
index 0000000000000000000000000000000000000000..a15c5e595228754ed23be36f8103d8e8d9fdb111
--- /dev/null
+++ b/test/MC/Mips/mul-macro-variants.s
@@ -0,0 +1,154 @@
+# RUN: llvm-mc  %s -triple mips-unknown-linux -show-encoding -mcpu=mips64r2 | FileCheck %s
+# RUN: llvm-mc  %s -triple mips-unknown-linux -show-encoding -mcpu=mips64r3 | FileCheck %s
+# RUN: llvm-mc  %s -triple mips-unknown-linux -show-encoding -mcpu=mips64r5 | FileCheck %s
+
+# RUN: llvm-mc  %s -triple mips-unknown-linux -show-encoding -mattr=use-tcc-in-div -mcpu=mips64 | FileCheck %s --check-prefix=CHECK-TRAP
+# RUN: llvm-mc  %s -triple mips-unknown-linux -show-encoding -mattr=use-tcc-in-div -mcpu=mips64r2 | FileCheck %s --check-prefix=CHECK-TRAP
+# RUN: llvm-mc  %s -triple mips-unknown-linux -show-encoding -mattr=use-tcc-in-div -mcpu=mips64r3 | FileCheck %s --check-prefix=CHECK-TRAP
+# RUN: llvm-mc  %s -triple mips-unknown-linux -show-encoding -mattr=use-tcc-in-div -mcpu=mips64r5 | FileCheck %s --check-prefix=CHECK-TRAP
+
+.text
+text_label:
+
+  mul  $4, $5
+# CHECK:        mul     $4, $4, $5              # encoding: [0x70,0x85,0x20,0x02]
+# CHECK-TRAP:   mul     $4, $4, $5              # encoding: [0x70,0x85,0x20,0x02]
+  mul   $4, $5, $6
+# CHECK:        mul     $4, $5, $6              # encoding: [0x70,0xa6,0x20,0x02]
+# CHECK-TRAP:   mul     $4, $5, $6              # encoding: [0x70,0xa6,0x20,0x02]
+  mul  $4, $5, 0
+# CHECK:        addiu   $1, $zero, 0            # encoding: [0x24,0x01,0x00,0x00]
+# CHECK:        mult    $5, $1                  # encoding: [0x00,0xa1,0x00,0x18]
+# CHECK:        mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP:   addiu   $1, $zero, 0            # encoding: [0x24,0x01,0x00,0x00]
+# CHECK-TRAP:   mult    $5, $1                  # encoding: [0x00,0xa1,0x00,0x18]
+# CHECK-TRAP:   mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+  mul   $4, $5, 1
+# CHECK:        addiu   $1, $zero, 1            # encoding: [0x24,0x01,0x00,0x01]
+# CHECK:        mult    $5, $1                  # encoding: [0x00,0xa1,0x00,0x18]
+# CHECK:        mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP:   addiu   $1, $zero, 1            # encoding: [0x24,0x01,0x00,0x01]
+# CHECK-TRAP:   mult    $5, $1                  # encoding: [0x00,0xa1,0x00,0x18]
+# CHECK-TRAP:   mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+  mul  $4, $5, 0x8000
+# CHECK:        ori     $1, $zero, 32768        # encoding: [0x34,0x01,0x80,0x00]
+# CHECK:        mult    $5, $1                  # encoding: [0x00,0xa1,0x00,0x18]
+# CHECK:        mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP:   ori     $1, $zero, 32768        # encoding: [0x34,0x01,0x80,0x00]
+# CHECK-TRAP:   mult    $5, $1                  # encoding: [0x00,0xa1,0x00,0x18]
+# CHECK-TRAP:   mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+  mul  $4, $5, -0x8000
+# CHECK:        addiu   $1, $zero, -32768       # encoding: [0x24,0x01,0x80,0x00]
+# CHECK:        mult    $5, $1                  # encoding: [0x00,0xa1,0x00,0x18]
+# CHECK:        mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP:   addiu   $1, $zero, -32768       # encoding: [0x24,0x01,0x80,0x00]
+# CHECK-TRAP:   mult    $5, $1                  # encoding: [0x00,0xa1,0x00,0x18]
+# CHECK-TRAP:   mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+  mul  $4, $5, 0x10000
+# CHECK:        lui     $1, 1                   # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK:        mult    $5, $1                  # encoding: [0x00,0xa1,0x00,0x18]
+# CHECK:        mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP:   lui     $1, 1                   # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-TRAP:   mult    $5, $1                  # encoding: [0x00,0xa1,0x00,0x18]
+# CHECK-TRAP:   mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+  mul  $4, $5, 0x1a5a5
+# CHECK:        lui     $1, 1                   # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK:        ori     $1, $1, 42405           # encoding: [0x34,0x21,0xa5,0xa5]
+# CHECK:        mult    $5, $1                  # encoding: [0x00,0xa1,0x00,0x18]
+# CHECK:        mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP:   lui     $1, 1                   # encoding: [0x3c,0x01,0x00,0x01]
+# CHECK-TRAP:   ori     $1, $1, 42405           # encoding: [0x34,0x21,0xa5,0xa5]
+# CHECK-TRAP:   mult    $5, $1                  # encoding: [0x00,0xa1,0x00,0x18]
+# CHECK-TRAP:   mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+  mulo  $4, $5
+# CHECK:        mult    $4, $5                  # encoding: [0x00,0x85,0x00,0x18]
+# CHECK:        mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK:        sra     $4, $4, 31              # encoding: [0x00,0x04,0x27,0xc3]
+# CHECK:        mfhi    $1                      # encoding: [0x00,0x00,0x08,0x10]
+# CHECK:        beq     $4, $1, $tmp0           # encoding: [0x10,0x81,A,A]
+# CHECK:        nop                             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:        break   6                       # encoding: [0x00,0x06,0x00,0x0d]
+# CHECK:        mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP:   mult    $4, $5                  # encoding: [0x00,0x85,0x00,0x18]
+# CHECK-TRAP:   mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP:   sra     $4, $4, 31              # encoding: [0x00,0x04,0x27,0xc3]
+# CHECK-TRAP:   mfhi    $1                      # encoding: [0x00,0x00,0x08,0x10]
+# CHECK-TRAP:   tne     $4, $1, 6               # encoding: [0x00,0x81,0x01,0xb6]
+# CHECK-TRAP:   mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+
+  mulo  $4, $5, $6
+# CHECK:        mult    $5, $6                  # encoding: [0x00,0xa6,0x00,0x18]
+# CHECK:        mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK:        sra     $4, $4, 31              # encoding: [0x00,0x04,0x27,0xc3]
+# CHECK:        mfhi    $1                      # encoding: [0x00,0x00,0x08,0x10]
+# CHECK:        beq     $4, $1, $tmp1           # encoding: [0x10,0x81,A,A]
+# CHECK:        nop                             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:        break   6                       # encoding: [0x00,0x06,0x00,0x0d]
+# CHECK:        mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP:   mult    $5, $6                  # encoding: [0x00,0xa6,0x00,0x18]
+# CHECK-TRAP:   mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP:   sra     $4, $4, 31              # encoding: [0x00,0x04,0x27,0xc3]
+# CHECK-TRAP:   mfhi    $1                      # encoding: [0x00,0x00,0x08,0x10]
+# CHECK-TRAP:   tne     $4, $1, 6               # encoding: [0x00,0x81,0x01,0xb6]
+# CHECK-TRAP:   mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+ mulou  $4,$5
+# CHECK:        multu   $4, $5                  # encoding: [0x00,0x85,0x00,0x19]
+# CHECK:        mfhi    $1                      # encoding: [0x00,0x00,0x08,0x10]
+# CHECK:        mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK:        beqz    $1, $tmp2               # encoding: [0x10,0x20,A,A]
+# CHECK:        nop                             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:        break   6                       # encoding: [0x00,0x06,0x00,0x0d]
+# CHECK-TRAP:   multu   $4, $5                  # encoding: [0x00,0x85,0x00,0x19]
+# CHECK-TRAP:   mfhi    $1                      # encoding: [0x00,0x00,0x08,0x10]
+# CHECK-TRAP:   mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP:   tne     $1, $zero, 6            # encoding: [0x00,0x20,0x01,0xb6]
+ mulou $4, $5, $6
+# CHECK:        multu   $5, $6                  # encoding: [0x00,0xa6,0x00,0x19]
+# CHECK:        mfhi    $1                      # encoding: [0x00,0x00,0x08,0x10]
+# CHECK:        mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK:        beqz    $1, $tmp3               # encoding: [0x10,0x20,A,A]
+# CHECK:        nop                             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:        break   6                       # encoding: [0x00,0x06,0x00,0x0d]
+# CHECK-TRAP:   multu   $5, $6                  # encoding: [0x00,0xa6,0x00,0x19]
+# CHECK-TRAP:   mfhi    $1                      # encoding: [0x00,0x00,0x08,0x10]
+# CHECK-TRAP:   mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP:   tne     $1, $zero, 6            # encoding: [0x00,0x20,0x01,0xb6]
+
+ dmul $4, $5, $6
+# CHECK:        dmultu  $5, $6                  # encoding: [0x00,0xa6,0x00,0x1d]
+# CHECK:        mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP:   dmultu  $5, $6                  # encoding: [0x00,0xa6,0x00,0x1d]
+# CHECK-TRAP:   mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+ dmul $4, $5, 1
+# CHECK:        addiu   $1, $zero, 1            # encoding: [0x24,0x01,0x00,0x01]
+# CHECK:        dmult   $5, $1                  # encoding: [0x00,0xa1,0x00,0x1c]
+# CHECK:        mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP:   addiu   $1, $zero, 1            # encoding: [0x24,0x01,0x00,0x01]
+# CHECK-TRAP:   dmult   $5, $1                  # encoding: [0x00,0xa1,0x00,0x1c]
+# CHECK-TRAP:   mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+ dmulo $4, $5, $6
+# CHECK:        dmult   $5, $6                  # encoding: [0x00,0xa6,0x00,0x1c]
+# CHECK:        mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK:        dsra32  $4, $4, 31              # encoding: [0x00,0x04,0x27,0xff]
+# CHECK:        mfhi    $1                      # encoding: [0x00,0x00,0x08,0x10]
+# CHECK:        beq     $4, $1, $tmp4           # encoding: [0x10,0x81,A,A]
+# CHECK:        nop                             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:        break   6                       # encoding: [0x00,0x06,0x00,0x0d]
+# CHECK:        mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP:   dmult   $5, $6                  # encoding: [0x00,0xa6,0x00,0x1c]
+# CHECK-TRAP:   mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP:   dsra32  $4, $4, 31              # encoding: [0x00,0x04,0x27,0xff]
+# CHECK-TRAP:   mfhi    $1                      # encoding: [0x00,0x00,0x08,0x10]
+# CHECK-TRAP:   tne     $4, $1, 6               # encoding: [0x00,0x81,0x01,0xb6]
+# CHECK-TRAP:   mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+ dmulou  $4,$5,$6
+# CHECK:        dmultu  $5, $6                  # encoding: [0x00,0xa6,0x00,0x1d]
+# CHECK:        mfhi    $1                      # encoding: [0x00,0x00,0x08,0x10]
+# CHECK:        mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK:        beqz    $1, $tmp5               # encoding: [0x10,0x20,A,A]
+# CHECK:        nop                             # encoding: [0x00,0x00,0x00,0x00]
+# CHECK:        break   6                       # encoding: [0x00,0x06,0x00,0x0d]
+# CHECK-TRAP:   dmultu  $5, $6                  # encoding: [0x00,0xa6,0x00,0x1d]
+# CHECK-TRAP:   mfhi    $1                      # encoding: [0x00,0x00,0x08,0x10]
+# CHECK-TRAP:   mflo    $4                      # encoding: [0x00,0x00,0x20,0x12]
+# CHECK-TRAP:   tne     $1, $zero, 6            # encoding: [0x00,0x20,0x01,0xb6]
diff --git a/test/MC/Mips/set-nomacro.s b/test/MC/Mips/set-nomacro.s
index 1b7a49fbaffba3a58c5c9c8201bfc5df6d1f14b3..f0e2f8883863764532473fd6a7b7059edd872431 100644
--- a/test/MC/Mips/set-nomacro.s
+++ b/test/MC/Mips/set-nomacro.s
@@ -181,6 +181,15 @@
   bgtu $0, $0, local_label
 # CHECK-NOT: [[@LINE-1]]:3: warning: macro instruction expanded into multiple instructions
 
+  bnel $2, 0, local_label
+# CHECK-NOT: [[@LINE-1]]:3: warning: macro instruction expanded into multiple instructions
+  bnel $2, 1, local_label
+# CHECK: [[@LINE-1]]:3: warning: macro instruction expanded into multiple instructions
+  beql $2, 0, local_label
+# CHECK-NOT: [[@LINE-1]]:3: warning: macro instruction expanded into multiple instructions
+  beql $2, 1, local_label
+# CHECK: [[@LINE-1]]:3: warning: macro instruction expanded into multiple instructions
+
   ulh $5, 0
 # CHECK: [[@LINE-1]]:3: warning: macro instruction expanded into multiple instructions
   ulhu $5, 0
diff --git a/test/MC/Mips/sext_64_32.ll b/test/MC/Mips/sext_64_32.ll
index 5679829e8eab900517aa3fd7601a20584c7e13ed..f6c468187d7bd77e8052b3a1bacc41c6106b6031 100644
--- a/test/MC/Mips/sext_64_32.ll
+++ b/test/MC/Mips/sext_64_32.ll
@@ -11,7 +11,8 @@ entry:
   ret i64 %conv
 }
 
-; CHECK: dsll32 ${{[a-z0-9]+}}, ${{[a-z0-9]+}}, 0
+; CHECK-LABEL: foo_2:
+; CHECK: dext ${{[a-z0-9]+}}, ${{[a-z0-9]+}}, 0, 32
 
 define i64 @foo_2(i32 %ival_2) nounwind readnone {
 entry:
diff --git a/test/MC/PowerPC/ppc64-encoding-vmx.s b/test/MC/PowerPC/ppc64-encoding-vmx.s
index 16c48a71e428fa0492b934f2192fc5d78259ce89..62851e4082d88c3b2362720f6dc26ca29c5d10f5 100644
--- a/test/MC/PowerPC/ppc64-encoding-vmx.s
+++ b/test/MC/PowerPC/ppc64-encoding-vmx.s
@@ -550,9 +550,15 @@
 # CHECK-BE: vnor 2, 3, 4                    # encoding: [0x10,0x43,0x25,0x04]
 # CHECK-LE: vnor 2, 3, 4                    # encoding: [0x04,0x25,0x43,0x10]
             vnor 2, 3, 4
+# CHECK-BE: vnot 2, 3                       # encoding: [0x10,0x43,0x1d,0x04]
+# CHECK-LE: vnot 2, 3                       # encoding: [0x04,0x1d,0x43,0x10]
+            vnot 2, 3
 # CHECK-BE: vor 2, 3, 4                     # encoding: [0x10,0x43,0x24,0x84]
 # CHECK-LE: vor 2, 3, 4                     # encoding: [0x84,0x24,0x43,0x10]
             vor 2, 3, 4
+# CHECK-BE: vmr 2, 3                        # encoding: [0x10,0x43,0x1c,0x84]
+# CHECK-LE: vmr 2, 3                        # encoding: [0x84,0x1c,0x43,0x10]
+            vmr 2, 3
 # CHECK-BE: vxor 2, 3, 4                    # encoding: [0x10,0x43,0x24,0xc4]
 # CHECK-LE: vxor 2, 3, 4                    # encoding: [0xc4,0x24,0x43,0x10]
             vxor 2, 3, 4
diff --git a/test/MC/PowerPC/vsx.s b/test/MC/PowerPC/vsx.s
index 7dae97b0060ad1aaf4d68d1336b2ac9c201848ec..fc92af6967cd9ef8f09b378b9cc407e97810ceb5 100644
--- a/test/MC/PowerPC/vsx.s
+++ b/test/MC/PowerPC/vsx.s
@@ -532,9 +532,12 @@
             xxswapd 7, 63
 
 # Move to/from VSR
-# CHECK-BE: mfvsrd 3, 0                        # encoding: [0x7c,0x03,0x00,0x66]
-# CHECK-LE: mfvsrd 3, 0                        # encoding: [0x66,0x00,0x03,0x7c]
-            mfvsrd 3, 0
+# CHECK-BE: mfvsrd 3, 40                       # encoding: [0x7d,0x03,0x00,0x67]
+# CHECK-LE: mfvsrd 3, 40                       # encoding: [0x67,0x00,0x03,0x7d]
+            mfvsrd 3, 40
+# CHECK-BE: mfvsrd 3, 40                       # encoding: [0x7d,0x03,0x00,0x67]
+# CHECK-LE: mfvsrd 3, 40                       # encoding: [0x67,0x00,0x03,0x7d]
+            mfvrd 3, 8
 # CHECK-BE: mfvsrwz 5, 0                       # encoding: [0x7c,0x05,0x00,0xe6]
 # CHECK-LE: mfvsrwz 5, 0                       # encoding: [0xe6,0x00,0x05,0x7c]
             mfvsrwz 5, 0
diff --git a/test/MC/WebAssembly/file-headers.ll b/test/MC/WebAssembly/file-headers.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1e5dd4b160a80cd265ab21adf3729b9d8803a32e
--- /dev/null
+++ b/test/MC/WebAssembly/file-headers.ll
@@ -0,0 +1,9 @@
+; RUN: llc -filetype=obj %s -o - | llvm-readobj -file-headers | FileCheck %s
+
+target datalayout = "e-m:e-p:32:32-i64:64-n32:64-S128"
+target triple = "wasm32-unknown-unknown-wasm"
+
+; CHECK: Format: WASM{{$}}
+; CHECK: Arch: wasm32{{$}}
+; CHECK: AddressSize: 32bit{{$}}
+; CHECK: Version: 0x1{{$}}
diff --git a/test/MC/WebAssembly/lit.local.cfg b/test/MC/WebAssembly/lit.local.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..0dd8c920ff1eb54b421a7387e8415f8718d91236
--- /dev/null
+++ b/test/MC/WebAssembly/lit.local.cfg
@@ -0,0 +1,2 @@
+if 'WebAssembly' not in config.root.targets:
+    config.unsupported = True
diff --git a/test/MC/X86/abs8.s b/test/MC/X86/abs8.s
new file mode 100644
index 0000000000000000000000000000000000000000..1172fb08d4e582ed00dfca364dc302e62b07b7b1
--- /dev/null
+++ b/test/MC/X86/abs8.s
@@ -0,0 +1,8 @@
+// RUN: llvm-mc -filetype=obj %s -o - -triple i686-pc-linux | llvm-objdump -d -r - | FileCheck --check-prefix=32 %s
+// RUN: llvm-mc -filetype=obj %s -o - -triple x86_64-pc-linux | llvm-objdump -d -r - | FileCheck --check-prefix=64 %s
+
+// 32: 0: 83 ff 00  cmpl $0, %edi
+// 32:   00000002:  R_386_8 foo
+// 64: 0: 83 ff 00  cmpl $0, %edi
+// 64:  0000000000000002:  R_X86_64_8 foo+0
+cmp $foo@ABS8, %edi
diff --git a/test/MC/X86/intel-syntax-bitwise-ops.s b/test/MC/X86/intel-syntax-bitwise-ops.s
index 1f09996fe914cb2fc090d798204f6163b4a8ac6f..6d4df609c061ac6db1ac9403676abed0cea369d9 100644
--- a/test/MC/X86/intel-syntax-bitwise-ops.s
+++ b/test/MC/X86/intel-syntax-bitwise-ops.s
@@ -6,19 +6,53 @@
     and ecx, 1+2
 // CHECK: andl	$3, %ecx
     and ecx, 1|2
-// CHECK: andl	$3, %ecx
+// CHECK: andl $3, %ecx
+    and ecx, 1 or 2
+// CHECK: andl $3, %ecx
+    and ecx, 1 OR 2
+// CHECK: andl $3, %ecx
     and ecx, 1*3
 // CHECK: andl	$1, %ecx
     and ecx, 1&3
-// CHECK: andl	$0, %ecx
+// CHECK: andl $1, %ecx
+    and ecx, 1 and 3
+// CHECK: andl $1, %ecx
+    and ecx, 1 AND 3
+// CHECK: andl $0, %ecx
     and ecx, (1&2)
-// CHECK: andl	$3, %ecx
+// CHECK: andl $0, %ecx
+    and ecx, (1 and 2)
+// CHECK: andl $0, %ecx
+    and ecx, (1 AND 2)
+// CHECK: andl $3, %ecx
     and ecx, ((1)|2)
-// CHECK: andl	$1, %ecx
+// CHECK: andl $3, %ecx
+    and ecx, ((1) or 2)
+// CHECK: andl $3, %ecx
+    and ecx, ((1) OR 2)
+// CHECK: andl $1, %ecx
     and ecx, 1&2+3
-// CHECK: addl	$4938, %eax
+// CHECK: andl $1, %ecx
+    and ecx, 1 and 2+3
+// CHECK: andl $1, %ecx
+    and ecx, 1 AND 2+3
+// CHECK: addl $4938, %eax
     add eax, 9876 >> 1
-// CHECK: addl	$19752, %eax
+// CHECK: addl $4938, %eax
+    add eax, 9876 shr 1
+// CHECK: addl $4938, %eax
+    add eax, 9876 SHR 1
+// CHECK: addl $19752, %eax
     add eax, 9876 << 1
-// CHECK: addl	$5, %eax
+// CHECK: addl $19752, %eax
+    add eax, 9876 shl 1
+// CHECK: addl $19752, %eax
+    add eax, 9876 SHL 1
+// CHECK: addl $5, %eax
     add eax, 6 ^ 3
+// CHECK: addl $5, %eax
+    add eax, 6 xor 3
+// CHECK: addl $5, %eax
+    add eax, 6 XOR 3
+// CHECK: addl $5, %eax
+    add eax, 6 XOR 3 shl 1 SHR 1
diff --git a/test/MC/X86/line-table-sections.s b/test/MC/X86/line-table-sections.s
new file mode 100644
index 0000000000000000000000000000000000000000..93b911d9576f2c81e9aab12ece09f8dc268dfae4
--- /dev/null
+++ b/test/MC/X86/line-table-sections.s
@@ -0,0 +1,15 @@
+// RUN: llvm-mc -triple x86_64-unknown-unknown -filetype=obj %s | llvm-objdump -r - | FileCheck %s
+// CHECK-NOT: RELOCATION RECORDS
+
+// Ensure that a .loc directive at the end of a section doesn't bleed into the
+// following section. Previously this would produce a relocation for
+// .other_section in the line table, but it should actually produce no line
+// table entries at all.
+	.text
+	.file	1 "fail.cpp"
+	.loc	1 7 3 prologue_end      # fail.cpp:7:3
+	# addss   %xmm0, %xmm1
+
+	.section	.other_section,"",@progbits
+	.long	46                      # Length of Unit
+
diff --git a/test/MC/X86/x86-32-coverage.s b/test/MC/X86/x86-32-coverage.s
index 261b7276a8964fe85ae8a503e543de4a5a1c06ce..c4f649ff4f4b4a2857b59f6dc5c706e78a2bb687 100644
--- a/test/MC/X86/x86-32-coverage.s
+++ b/test/MC/X86/x86-32-coverage.s
@@ -10526,9 +10526,9 @@
 // CHECK: 	invlpga %ecx, %eax
         	invlpga %ecx, %eax
 
-// CHECK:   blendvps	(%eax), %xmm1   # encoding: [0x66,0x0f,0x38,0x14,0x08]
+// CHECK:   blendvps	%xmm0, (%eax), %xmm1   # encoding: [0x66,0x0f,0x38,0x14,0x08]
             blendvps (%eax), %xmm1
-// CHECK:   blendvps	%xmm2, %xmm1    # encoding: [0x66,0x0f,0x38,0x14,0xca]
+// CHECK:   blendvps	%xmm0, %xmm2, %xmm1    # encoding: [0x66,0x0f,0x38,0x14,0xca]
             blendvps %xmm2, %xmm1
 
 // These instructions can take an unsigned 8-bit mask as well as a signed 8-bit
@@ -10563,29 +10563,29 @@
           insertps $-64, %xmm2, %xmm1
 
 // PR13253 handle implicit optional third argument that must always be xmm0
-// CHECK: pblendvb %xmm2, %xmm1
+// CHECK: pblendvb %xmm0, %xmm2, %xmm1
 pblendvb %xmm2, %xmm1
-// CHECK: pblendvb %xmm2, %xmm1
+// CHECK: pblendvb %xmm0, %xmm2, %xmm1
 pblendvb %xmm0, %xmm2, %xmm1
-// CHECK: pblendvb (%eax), %xmm1
+// CHECK: pblendvb %xmm0, (%eax), %xmm1
 pblendvb (%eax), %xmm1
-// CHECK: pblendvb (%eax), %xmm1
+// CHECK: pblendvb %xmm0, (%eax), %xmm1
 pblendvb %xmm0, (%eax), %xmm1
-// CHECK: blendvpd %xmm2, %xmm1
+// CHECK: blendvpd %xmm0, %xmm2, %xmm1
 blendvpd %xmm2, %xmm1
-// CHECK: blendvpd %xmm2, %xmm1
+// CHECK: blendvpd %xmm0, %xmm2, %xmm1
 blendvpd %xmm0, %xmm2, %xmm1
-// CHECK: blendvpd (%eax), %xmm1
+// CHECK: blendvpd %xmm0, (%eax), %xmm1
 blendvpd (%eax), %xmm1
-// CHECK: blendvpd (%eax), %xmm1
+// CHECK: blendvpd %xmm0, (%eax), %xmm1
 blendvpd %xmm0, (%eax), %xmm1
-// CHECK: blendvps %xmm2, %xmm1
+// CHECK: blendvps %xmm0, %xmm2, %xmm1
 blendvps %xmm2, %xmm1
-// CHECK: blendvps %xmm2, %xmm1
+// CHECK: blendvps %xmm0, %xmm2, %xmm1
 blendvps %xmm0, %xmm2, %xmm1
-// CHECK: blendvps (%eax), %xmm1
+// CHECK: blendvps %xmm0, (%eax), %xmm1
 blendvps (%eax), %xmm1
-// CHECK: blendvps (%eax), %xmm1
+// CHECK: blendvps %xmm0, (%eax), %xmm1
 blendvps %xmm0, (%eax), %xmm1
 
 
@@ -10654,10 +10654,6 @@ btcq $4, (%eax)
 // CHECK:  encoding: [0x66,0x0f,0xae,0x35,0x78,0x56,0x34,0x12]
         	clwb	0x12345678
 
-// CHECK: pcommit
-// CHECK:  encoding: [0x66,0x0f,0xae,0xf8]
-        	pcommit
-
 // CHECK: xsave	3735928559(%ebx,%ecx,8)
 // CHECK:  encoding: [0x0f,0xae,0xa4,0xcb,0xef,0xbe,0xad,0xde]
         	xsave	0xdeadbeef(%ebx,%ecx,8)
diff --git a/test/MC/X86/x86-32.s b/test/MC/X86/x86-32.s
index c05cf41d91ed08858908f40beb6902ec429f7041..f3633dcffef4343bae446a43e1f58d0d67886be8 100644
--- a/test/MC/X86/x86-32.s
+++ b/test/MC/X86/x86-32.s
@@ -444,6 +444,14 @@ cmovnae	%bx,%bx
 // CHECK:  encoding: [0x0f,0x21,0xf8]
         movl %dr7,%eax
 
+// CHECK:       clzero
+// CHECK:  encoding: [0x0f,0x01,0xfc]
+                clzero
+
+// CHECK:       clzero
+// CHECK:  encoding: [0x0f,0x01,0xfc]
+                clzero %eax
+
 // radr://8017522
 // CHECK: wait
 // CHECK:  encoding: [0x9b]
diff --git a/test/MC/X86/x86-64.s b/test/MC/X86/x86-64.s
index 89dc599e04f68c33e834c8be9d70486992a6382a..a605dbbbd7467a731ccaadb8c3423d9c11f793ba 100644
--- a/test/MC/X86/x86-64.s
+++ b/test/MC/X86/x86-64.s
@@ -1502,6 +1502,14 @@ vmovq %xmm0, %rax
 // CHECK:  encoding: [0x0f,0x01,0xfb]
         	mwaitx %rax, %rcx, %rbx
 
+// CHECK:       clzero
+// CHECK:  encoding: [0x0f,0x01,0xfc]
+                clzero
+
+// CHECK:       clzero
+// CHECK:  encoding: [0x0f,0x01,0xfc]
+                clzero %rax
+
 // CHECK: 	movl %r15d, (%r15,%r15)
 // CHECK:  encoding: [0x47,0x89,0x3c,0x3f]
 movl %r15d, (%r15,%r15)
diff --git a/test/MC/X86/x86_64-encoding.s b/test/MC/X86/x86_64-encoding.s
index 62af1bdb235732b7f1eeeef3c6455e8c28b7c2cb..c502ed4664338b5796f88454c99713e76265cbf3 100644
--- a/test/MC/X86/x86_64-encoding.s
+++ b/test/MC/X86/x86_64-encoding.s
@@ -148,19 +148,19 @@ sha1msg2 %xmm1, %xmm2
 // CHECK:   encoding: [0x0f,0x38,0xca,0x10]
 sha1msg2 (%rax), %xmm2
 
-// CHECK: sha256rnds2 (%rax), %xmm2
+// CHECK: sha256rnds2 %xmm0, (%rax), %xmm2
 // CHECK:   encoding: [0x0f,0x38,0xcb,0x10]
 sha256rnds2 (%rax), %xmm2
 
-// CHECK: sha256rnds2 %xmm1, %xmm2
+// CHECK: sha256rnds2 %xmm0, %xmm1, %xmm2
 // CHECK:   encoding: [0x0f,0x38,0xcb,0xd1]
 sha256rnds2 %xmm1, %xmm2
 
-// CHECK: sha256rnds2 (%rax), %xmm2
+// CHECK: sha256rnds2 %xmm0, (%rax), %xmm2
 // CHECK:   encoding: [0x0f,0x38,0xcb,0x10]
 sha256rnds2 %xmm0, (%rax), %xmm2
 
-// CHECK: sha256rnds2 %xmm1, %xmm2
+// CHECK: sha256rnds2 %xmm0, %xmm1, %xmm2
 // CHECK:   encoding: [0x0f,0x38,0xcb,0xd1]
 sha256rnds2 %xmm0, %xmm1, %xmm2
 
diff --git a/test/Object/AMDGPU/elf-definitios.yaml b/test/Object/AMDGPU/elf-definitions.yaml
similarity index 100%
rename from test/Object/AMDGPU/elf-definitios.yaml
rename to test/Object/AMDGPU/elf-definitions.yaml
diff --git a/test/Object/ARM/nm-mapping-symbol.s b/test/Object/ARM/nm-mapping-symbol.s
index 485c1cc39d7227cc8f0c0f9ed1237e68995a895e..9b7b5b583ea07fbbb3f0c75d55b45337518ca504 100644
--- a/test/Object/ARM/nm-mapping-symbol.s
+++ b/test/Object/ARM/nm-mapping-symbol.s
@@ -9,3 +9,4 @@
 
         .section        .foobar,"",%progbits
         .asciz  "foo"
+        nop
diff --git a/test/Object/Inputs/solaris-nosymbols.yaml b/test/Object/Inputs/solaris-nosymbols.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..85dabedcf9a59153edc3d388f3586d5a37d4be58
--- /dev/null
+++ b/test/Object/Inputs/solaris-nosymbols.yaml
@@ -0,0 +1,7 @@
+--- !ELF
+FileHeader:      
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_REL
+  Machine:         EM_X86_64
+...
diff --git a/test/Object/X86/nm-ir.ll b/test/Object/X86/nm-ir.ll
index 1742a8f938e273a6511b7e7654c950d0239d4d4e..29f7a5c7018c8a87ec73d5644433b35fde2e4957 100644
--- a/test/Object/X86/nm-ir.ll
+++ b/test/Object/X86/nm-ir.ll
@@ -12,6 +12,7 @@
 ; CHECK-NEXT: C g3
 ; CHECK-NOT: g4
 ; CHECK-NEXT: T global_asm_sym
+; CHECK-NEXT: D ifunc_f1
 ; CHECK-NEXT: t local_asm_sym
 ; CHECK-NEXT: U undef_asm_sy
 
@@ -36,6 +37,8 @@ define void @f1() {
   ret void
 }
 
+@ifunc_f1 = ifunc void (), void ()* @f1
+
 define internal void @f2() {
   ret void
 }
diff --git a/test/Object/archive-extract.test b/test/Object/archive-extract.test
index 50372d530d88c212c005543520b0035fcac6414d..664529d6c8070d5b91b284a5e0f3ff94c97b4643 100644
--- a/test/Object/archive-extract.test
+++ b/test/Object/archive-extract.test
@@ -37,11 +37,16 @@
 ; RUN: rm -f very_long_bytecode_file_name.bc
 ; RUN: llvm-ar xo %p/Inputs/GNU.a very_long_bytecode_file_name.bc
 ; RUN: rm -f %t.a
-; RUN: llvm-ar rcU %t.a very_long_bytecode_file_name.bc
-; RUN: env TZ=GMT llvm-ar tv %t.a | FileCheck %s
+; RUN: llvm-ar -format gnu rcU %t.a very_long_bytecode_file_name.bc
+; RUN: env TZ=GMT llvm-ar tv %t.a | FileCheck %s -check-prefix CHECK-GNU
 
-CHECK: 1465 2004-11-19 03:01:31.000000000 very_long_bytecode_file_name.bc
+CHECK-GNU: 1465 2004-11-19 03:01:31.000000000 very_long_bytecode_file_name.bc
 
+; RUN: rm -f %t.a
+; RUN: llvm-ar -format darwin rcU %t.a very_long_bytecode_file_name.bc
+; RUN: env TZ=GMT llvm-ar tv %t.a | FileCheck %s -check-prefix CHECK-DARWIN
+
+CHECK-DARWIN: 1472 2004-11-19 03:01:31.000000000 very_long_bytecode_file_name.bc
 
 RUN: not llvm-ar x %p/Inputs/GNU.a foo.o 2>&1 | FileCheck --check-prefix=NOTFOUND %s
 NOTFOUND: foo.o was not found
@@ -52,5 +57,5 @@ THINEXTRACT: extracting from a thin archive is not supported
 RUN: llvm-ar p %p/Inputs/thin.a evenlen | FileCheck %s --check-prefix=EVENLEN
 EVENLEN: evenlen
 
-RUN: not llvm-ar p %p/Inputs/thin-path.a t/test2.o | FileCheck %s --check-prefix=MISSING
+RUN: not llvm-ar p %p/Inputs/thin-path.a t/test2.o 2>&1 | FileCheck %s --check-prefix=MISSING
 MISSING: {{N|n}}o such file or directory.
diff --git a/test/Object/archive-format.test b/test/Object/archive-format.test
index b9562a36d67b5c9af9d0ae3da824d00f1d46fd1e..219fc7f894a7f6c0ad5c336ce45afe1b0d14ffe8 100644
--- a/test/Object/archive-format.test
+++ b/test/Object/archive-format.test
@@ -37,6 +37,19 @@ BSD-NEXT: 0123456789abcde{{.....}}bar.
 BSD-SAME: #1/16           0           0     0     644     20        `
 BSD-NEXT: 0123456789abcdefzed.
 
+RUN: rm -f %t.a
+RUN: llvm-ar --format=darwin rc %t.a 0123456789abcde 0123456789abcdef
+RUN: cat %t.a | FileCheck -strict-whitespace --check-prefix=DARWIN %s
+
+DARWIN:      !<arch>
+DARWIN-NEXT: #1/20           0           0     0     644     28        `
+Each [[:space:]] matches a newline.  We explicitly match 3 newlines, as the
+fourth newline is implicitly consumed by FileCheck and cannot be matched.
+DARWIN-NEXT: 0123456789abcde{{.....}}bar.{{[[:space:]][[:space:]][[:space:]]}}
+DARWIN-NEXT: #1/20           0           0     0     644     28        `
+DARWIN-NEXT: 0123456789abcdef{{....}}zed.
+
+
 RUN: rm -f test.a
 RUN: llvm-ar --format=gnu rcT test.a 0123456789abcde 0123456789abcdef
 RUN: cat test.a | FileCheck -strict-whitespace --check-prefix=THIN %s
@@ -65,3 +78,15 @@ THIN-PATH-NEXT: /65             0           0     0     644     4         `
 
 RUN: not llvm-ar --format=bsd rcT bad.a 0123456789abcde 0123456789abcdef 2>&1 | FileCheck --check-prefix=BSD-THIN %s
 BSD-THIN: Only the gnu format has a thin mode.
+
+If an archive has an object with no symbols, the linker and some other
+tools on some versions of Solaris will abort operations if there is no
+symbol table.  Create such an object, put it into an archive, and check to
+see that there is an empty symbol table.
+RUN: mkdir -p %t
+RUN: yaml2obj %S/Inputs/solaris-nosymbols.yaml > %t/foo.o
+RUN: llvm-ar rs %t/foo.a %t/foo.o
+RUN: cat -v %t/foo.a | FileCheck -strict-whitespace --check-prefix=SOLARIS %s
+SOLARIS:      !<arch>
+SOLARIS-NEXT: /               0           0     0     0       8         `
+SOLARIS-NEXT: ^@^@^@^@^@^@^@^@foo.o/
diff --git a/test/Object/archive-pad.test b/test/Object/archive-pad.test
new file mode 100644
index 0000000000000000000000000000000000000000..343f51ef60afb38ded3640129f499819b1c0b7f4
--- /dev/null
+++ b/test/Object/archive-pad.test
@@ -0,0 +1,19 @@
+Test that only the darwin format needs to modify archive members to
+avoid a ld64 bug.
+
+RUN: echo foo > %t.o
+
+RUN: rm -f %t.a
+RUN: llvm-ar -format=bsd rc %t.a %t.o
+RUN: llvm-ar p %t.a > %t.bsd.o
+RUN: cmp %t.bsd.o %t.o
+
+RUN: rm -f %t.a
+RUN: llvm-ar -format=gnu rc %t.a %t.o
+RUN: llvm-ar p %t.a > %t.gnu.o
+RUN: cmp %t.gnu.o %t.o
+
+RUN: rm -f %t.a
+RUN: llvm-ar -format=darwin rc %t.a %t.o
+RUN: llvm-ar p %t.a > %t.darwin.o
+RUN: not cmp %t.darwin.o %t.o
diff --git a/test/Object/macho-invalid.test b/test/Object/macho-invalid.test
index 93feaa11eda44c2e33a25a9a6140ffb0da615e8c..e956680a2ce5497bfb4f90edf58a8657c1aeab56 100644
--- a/test/Object/macho-invalid.test
+++ b/test/Object/macho-invalid.test
@@ -117,7 +117,7 @@ RUN: not llvm-objdump -macho -private-headers %p/Inputs/macho-invalid-segment-fi
 INVALID-SEGMENT-FILESIZE: macho-invalid-segment-filesize': truncated or malformed object (load command 0 fileoff field plus filesize field in LC_SEGMENT extends past the end of the file)
 
 RUN: not llvm-objdump -macho -private-headers %p/Inputs/macho-invalid-segment-vmsize 2>&1 | FileCheck -check-prefix INVALID-SEGMENT-VMSIZE %s
-INVALID-SEGMENT-VMSIZE: macho-invalid-segment-vmsize': truncated or malformed object (load command 0 fileoff field in LC_SEGMENT greater than vmsize field)
+INVALID-SEGMENT-VMSIZE: macho-invalid-segment-vmsize': truncated or malformed object (load command 0 filesize field in LC_SEGMENT greater than vmsize field)
 
 RUN: not llvm-objdump -macho -private-headers %p/Inputs/macho-invalid-section-offset 2>&1 | FileCheck -check-prefix INVALID-SECTION-FILEOFF %s
 INVALID-SECTION-FILEOFF: macho-invalid-section-offset': truncated or malformed object (offset field of section 0 in LC_SEGMENT command 0 extends past the end of the file)
diff --git a/test/Object/nm-shared-object.test b/test/Object/nm-shared-object.test
index 32ae6a861529b567e4685e764fa8a2ee1754562e..975cf760ba9f99d65a547512f2f9d46d47fe5f9d 100644
--- a/test/Object/nm-shared-object.test
+++ b/test/Object/nm-shared-object.test
@@ -29,3 +29,5 @@ RUN: not llvm-nm -D %p/Inputs/trivial-object-test.coff-i386 2>&1 \
 RUN:         | FileCheck %s -check-prefix ERROR
 
 ERROR: File format has no dynamic symbol table.
+
+RUN: llvm-nm -D %p/Inputs/trivial-object-test.elf-i386 | count 0
diff --git a/test/ObjectYAML/MachO/DWARF-debug_aranges.yaml b/test/ObjectYAML/MachO/DWARF-debug_aranges.yaml
index 2822c94d77514ae7ba0d7d4cc0e577f52df00490..0b0421d6a09210e62f425afd7f3a4e11f59b937a 100644
--- a/test/ObjectYAML/MachO/DWARF-debug_aranges.yaml
+++ b/test/ObjectYAML/MachO/DWARF-debug_aranges.yaml
@@ -313,7 +313,8 @@ LinkEditData:
     - _main
 DWARF:           
   debug_aranges:   
-    - Length:          44
+    - Length:          
+        TotalLength:     44
       Version:         2
       CuOffset:        0
       AddrSize:        8
@@ -325,7 +326,8 @@ DWARF:
 
 #CHECK: DWARF:           
 #CHECK:   debug_aranges:   
-#CHECK:     - Length:          44
+#CHECK:     - Length:          
+#CHECK:         TotalLength:     44
 #CHECK:       Version:         2
 #CHECK:       CuOffset:        0
 #CHECK:       AddrSize:        8
diff --git a/test/ObjectYAML/MachO/DWARF-debug_info.yaml b/test/ObjectYAML/MachO/DWARF-debug_info.yaml
index b1b6b8ad19e8ce91ba441062909c108299e34ac1..0ede72bd1f419875cc638606e39ea966422bae6a 100644
--- a/test/ObjectYAML/MachO/DWARF-debug_info.yaml
+++ b/test/ObjectYAML/MachO/DWARF-debug_info.yaml
@@ -375,7 +375,8 @@ DWARF:
         - Attribute:       DW_AT_type
           Form:            DW_FORM_ref4
   debug_aranges:   
-    - Length:          44
+    - Length:          
+        TotalLength:     44
       Version:         2
       CuOffset:        0
       AddrSize:        8
@@ -384,7 +385,8 @@ DWARF:
         - Address:         0x0000000100000F50
           Length:          52
   debug_info:      
-    - Length:          117
+    - Length:          
+        TotalLength:     117
       Version:         4
       AbbrOffset:      0
       AddrSize:        8
@@ -452,7 +454,8 @@ DWARF:
         - AbbrCode:        0x00000000
           Values:          
   debug_line:      
-    - TotalLength:     65
+    - Length:          
+        TotalLength:     65
       Version:         2
       PrologueLength:  36
       MinInstLength:   1
@@ -508,7 +511,8 @@ DWARF:
 
 #CHECK: DWARF:           
 #CHECK:   debug_info:      
-#CHECK:     - Length:          117
+#CHECK:     - Length:          
+#CHECK:         TotalLength:     117
 #CHECK:       Version:         4
 #CHECK:       AbbrOffset:      0
 #CHECK:       AddrSize:        8
diff --git a/test/ObjectYAML/MachO/DWARF-debug_line.yaml b/test/ObjectYAML/MachO/DWARF-debug_line.yaml
index c1e015839f975feceb6db900c5047053b180138c..6d87ea68cdcf0f787b3ae6f1dcbdf43c965c72ab 100644
--- a/test/ObjectYAML/MachO/DWARF-debug_line.yaml
+++ b/test/ObjectYAML/MachO/DWARF-debug_line.yaml
@@ -394,7 +394,8 @@ DWARF:
         - Attribute:       DW_AT_type
           Form:            DW_FORM_ref4
   debug_aranges:   
-    - Length:          44
+    - Length:          
+        TotalLength:     44
       Version:         2
       CuOffset:        0
       AddrSize:        8
@@ -403,7 +404,8 @@ DWARF:
         - Address:         0x0000000100000F50
           Length:          52
   debug_pubnames:  
-    Length:          23
+    Length:          
+      TotalLength:     23
     Version:         2
     UnitOffset:      0
     UnitSize:        121
@@ -411,7 +413,8 @@ DWARF:
       - DieOffset:       0x0000002A
         Name:            main
   debug_pubtypes:  
-    Length:          31
+    Length:          
+      TotalLength:     31
     Version:         2
     UnitOffset:      0
     UnitSize:        121
@@ -421,7 +424,8 @@ DWARF:
       - DieOffset:       0x00000071
         Name:            char
   debug_info:      
-    - Length:          117
+    - Length:          
+        TotalLength:     117
       Version:         4
       AbbrOffset:      0
       AddrSize:        8
@@ -489,7 +493,8 @@ DWARF:
         - AbbrCode:        0x00000000
           Values:          
   debug_line:      
-    - TotalLength:     65
+    - Length:          
+        TotalLength:     65
       Version:         2
       PrologueLength:  36
       MinInstLength:   1
@@ -497,19 +502,7 @@ DWARF:
       LineBase:        251
       LineRange:       14
       OpcodeBase:      13
-      StandardOpcodeLengths: 
-        - 0
-        - 1
-        - 1
-        - 1
-        - 1
-        - 0
-        - 0
-        - 0
-        - 1
-        - 0
-        - 0
-        - 1
+      StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ]
       IncludeDirs:     
       Files:           
         - Name:            hello_world.c
@@ -542,7 +535,8 @@ DWARF:
 ...
 
 #CHECK:   debug_line:      
-#CHECK:     - TotalLength:     65
+#CHECK:     - Length:          
+#CHECK:         TotalLength:     65
 #CHECK:       Version:         2
 #CHECK:       PrologueLength:  36
 #CHECK:       MinInstLength:   1
@@ -550,19 +544,7 @@ DWARF:
 #CHECK:       LineBase:        251
 #CHECK:       LineRange:       14
 #CHECK:       OpcodeBase:      13
-#CHECK:       StandardOpcodeLengths: 
-#CHECK:         - 0
-#CHECK:         - 1
-#CHECK:         - 1
-#CHECK:         - 1
-#CHECK:         - 1
-#CHECK:         - 0
-#CHECK:         - 0
-#CHECK:         - 0
-#CHECK:         - 1
-#CHECK:         - 0
-#CHECK:         - 0
-#CHECK:         - 1
+#CHECK:       StandardOpcodeLengths: [ 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1 ]
 #CHECK:       IncludeDirs:     
 #CHECK:       Files:           
 #CHECK:         - Name:            hello_world.c
diff --git a/test/ObjectYAML/MachO/DWARF-pubsections.yaml b/test/ObjectYAML/MachO/DWARF-pubsections.yaml
index 8535ed0b5c452862adcfccb5ee5125279f133df7..a3c05ca5b35813381c60288a680a2967b103499e 100644
--- a/test/ObjectYAML/MachO/DWARF-pubsections.yaml
+++ b/test/ObjectYAML/MachO/DWARF-pubsections.yaml
@@ -314,7 +314,8 @@ DWARF:
     - int
     - char
   debug_pubnames:  
-    Length:          23
+    Length:
+      TotalLength:       23
     Version:         2
     UnitOffset:      0
     UnitSize:        121
@@ -322,7 +323,8 @@ DWARF:
       - DieOffset:       0x0000002A
         Name:            main
   debug_pubtypes:  
-    Length:          31
+    Length:
+      TotalLength:       31
     Version:         2
     UnitOffset:      0
     UnitSize:        121
@@ -335,7 +337,8 @@ DWARF:
 
 #CHECK: DWARF:           
 #CHECK:   debug_pubnames:  
-#CHECK:     Length:          23
+#CHECK:     Length:
+#CHECK:       TotalLength:       23
 #CHECK:     Version:         2
 #CHECK:     UnitOffset:      0
 #CHECK:     UnitSize:        121
@@ -343,7 +346,8 @@ DWARF:
 #CHECK:       - DieOffset:       0x0000002A
 #CHECK:         Name:            main
 #CHECK:   debug_pubtypes:  
-#CHECK:     Length:          31
+#CHECK:     Length:
+#CHECK:       TotalLength:       31
 #CHECK:     Version:         2
 #CHECK:     UnitOffset:      0
 #CHECK:     UnitSize:        121
diff --git a/test/ObjectYAML/MachO/DWARF2-AddrSize8-FormValues.yaml b/test/ObjectYAML/MachO/DWARF2-AddrSize8-FormValues.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e91935bcdc144c168152d2687f3d52512052c500
--- /dev/null
+++ b/test/ObjectYAML/MachO/DWARF2-AddrSize8-FormValues.yaml
@@ -0,0 +1,507 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+
+--- !mach-o
+FileHeader:      
+  magic:           0xFEEDFACF
+  cputype:         0x01000007
+  cpusubtype:      0x00000003
+  filetype:        0x0000000A
+  ncmds:           5
+  sizeofcmds:      1800
+  flags:           0x00000000
+  reserved:        0x00000000
+LoadCommands:    
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __PAGEZERO
+    vmaddr:          0
+    vmsize:          4294967296
+    fileoff:         0
+    filesize:        0
+    maxprot:         0
+    initprot:        0
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         472
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          4096
+    fileoff:         0
+    filesize:        0
+    maxprot:         7
+    initprot:        5
+    nsects:          5
+    flags:           0
+    Sections:        
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x0000000100000F50
+        size:            52
+        offset:          0x00000000
+        align:           4
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __stubs
+        segname:         __TEXT
+        addr:            0x0000000100000F84
+        size:            6
+        offset:          0x00000000
+        align:           1
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x80000408
+        reserved1:       0x00000000
+        reserved2:       0x00000006
+        reserved3:       0x00000000
+      - sectname:        __stub_helper
+        segname:         __TEXT
+        addr:            0x0000000100000F8C
+        size:            26
+        offset:          0x00000000
+        align:           2
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __cstring
+        segname:         __TEXT
+        addr:            0x0000000100000FA6
+        size:            14
+        offset:          0x00000000
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000002
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __unwind_info
+        segname:         __TEXT
+        addr:            0x0000000100000FB4
+        size:            72
+        offset:          0x00000000
+        align:           2
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         232
+    segname:         __DATA
+    vmaddr:          4294971392
+    vmsize:          4096
+    fileoff:         0
+    filesize:        0
+    maxprot:         7
+    initprot:        3
+    nsects:          2
+    flags:           0
+    Sections:        
+      - sectname:        __nl_symbol_ptr
+        segname:         __DATA
+        addr:            0x0000000100001000
+        size:            16
+        offset:          0x00000000
+        align:           3
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000006
+        reserved1:       0x00000001
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __la_symbol_ptr
+        segname:         __DATA
+        addr:            0x0000000100001010
+        size:            8
+        offset:          0x00000000
+        align:           3
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000007
+        reserved1:       0x00000003
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4294975488
+    vmsize:          4096
+    fileoff:         4096
+    filesize:        60
+    maxprot:         7
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         952
+    segname:         __DWARF
+    vmaddr:          4294979584
+    vmsize:          4096
+    fileoff:         8192
+    filesize:        764
+    maxprot:         7
+    initprot:        3
+    nsects:          11
+    flags:           0
+    Sections:        
+      - sectname:        __debug_line
+        segname:         __DWARF
+        addr:            0x0000000100003000
+        size:            69
+        offset:          0x00002000
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_pubnames
+        segname:         __DWARF
+        addr:            0x0000000100003045
+        size:            27
+        offset:          0x00002045
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_pubtypes
+        segname:         __DWARF
+        addr:            0x0000000100003060
+        size:            35
+        offset:          0x00002060
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_aranges
+        segname:         __DWARF
+        addr:            0x0000000100003083
+        size:            48
+        offset:          0x00002083
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_info
+        segname:         __DWARF
+        addr:            0x00000001000030B3
+        size:            180
+        offset:          0x000020B3
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_abbrev
+        segname:         __DWARF
+        addr:            0x000000010000312C
+        size:            84
+        offset:          0x00002167
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_str
+        segname:         __DWARF
+        addr:            0x0000000100003178
+        size:            83
+        offset:          0x000021BB
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_names
+        segname:         __DWARF
+        addr:            0x0000000100003206
+        size:            36
+        offset:          0x0000221E
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_namespac
+        segname:         __DWARF
+        addr:            0x0000000100003242
+        size:            36
+        offset:          0x00002242
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_types
+        segname:         __DWARF
+        addr:            0x0000000100003266
+        size:            114
+        offset:          0x00002266
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_objc
+        segname:         __DWARF
+        addr:            0x00000001000032D8
+        size:            36
+        offset:          0x000022D8
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+DWARF:
+  debug_str:       
+    - World
+  debug_abbrev:    
+    - Code:            0x00000001
+      Tag:             DW_TAG_compile_unit
+      Children:        DW_CHILDREN_no
+      Attributes:      
+        - Attribute:       0x2000
+          Form:            DW_FORM_addr
+        - Attribute:       0x2001
+          Form:            DW_FORM_block
+        - Attribute:       DW_AT_MIPS_loop_begin
+          Form:            DW_FORM_block1
+        - Attribute:       DW_AT_MIPS_tail_loop_begin
+          Form:            DW_FORM_block2
+        - Attribute:       DW_AT_MIPS_epilog_begin
+          Form:            DW_FORM_block4
+        - Attribute:       DW_AT_MIPS_loop_unroll_factor
+          Form:            DW_FORM_data1
+        - Attribute:       DW_AT_MIPS_software_pipeline_depth
+          Form:            DW_FORM_data2
+        - Attribute:       DW_AT_MIPS_linkage_name
+          Form:            DW_FORM_data4
+        - Attribute:       DW_AT_MIPS_stride
+          Form:            DW_FORM_data8
+        - Attribute:       DW_AT_MIPS_abstract_name
+          Form:            DW_FORM_string
+        - Attribute:       DW_AT_MIPS_clone_origin
+          Form:            DW_FORM_strp
+        - Attribute:       DW_AT_MIPS_has_inlines
+          Form:            DW_FORM_ref_addr
+        - Attribute:       DW_AT_MIPS_stride_byte
+          Form:            DW_FORM_ref1
+        - Attribute:       DW_AT_MIPS_stride_elem
+          Form:            DW_FORM_ref2
+        - Attribute:       DW_AT_MIPS_ptr_dopetype
+          Form:            DW_FORM_ref4
+        - Attribute:       DW_AT_MIPS_allocatable_dopetype
+          Form:            DW_FORM_ref8
+        - Attribute:       DW_AT_MIPS_assumed_shape_dopetype
+          Form:            DW_FORM_ref_sig8
+        - Attribute:       DW_AT_MIPS_assumed_size
+          Form:            DW_FORM_ref_udata
+        - Attribute:       0x2012
+          Form:            DW_FORM_flag
+        - Attribute:       0x2013
+          Form:            DW_FORM_flag
+        - Attribute:       0x2014
+          Form:            DW_FORM_flag_present
+        - Attribute:       0x2015
+          Form:            DW_FORM_sdata
+        - Attribute:       0x2017
+          Form:            DW_FORM_udata
+        - Attribute:       0x2018
+          Form:            DW_FORM_GNU_ref_alt
+        - Attribute:       0x2019
+          Form:            DW_FORM_sec_offset
+        - Attribute:       0x201A
+          Form:            DW_FORM_addr
+  debug_info:      
+    - Length:          
+        TotalLength:     168
+      Version:         2
+      AbbrOffset:      0
+      AddrSize:        8
+      Entries:         
+        - AbbrCode:        0x00000001
+          Values:          
+            - Value:           0x0123456789ABCDEF
+            - Value:           0x000000000000000A
+              BlockData:       
+                - 0x01
+                - 0x02
+                - 0x03
+                - 0x04
+                - 0x05
+                - 0x06
+                - 0x07
+                - 0x08
+                - 0x09
+                - 0x00
+            - Value:           0x000000000000000A
+              BlockData:       
+                - 0x01
+                - 0x02
+                - 0x03
+                - 0x04
+                - 0x05
+                - 0x06
+                - 0x07
+                - 0x08
+                - 0x09
+                - 0x00
+            - Value:           0x000000000000000A
+              BlockData:       
+                - 0x01
+                - 0x02
+                - 0x03
+                - 0x04
+                - 0x05
+                - 0x06
+                - 0x07
+                - 0x08
+                - 0x09
+                - 0x00
+            - Value:           0x000000000000000A
+              BlockData:       
+                - 0x01
+                - 0x02
+                - 0x03
+                - 0x04
+                - 0x05
+                - 0x06
+                - 0x07
+                - 0x08
+                - 0x09
+                - 0x00
+            - Value:           0x0000000000000001
+            - Value:           0x0000000000002345
+            - Value:           0x000000006789ABCD
+            - Value:           0x0011223344556677
+            - Value:           0x0000000000000000
+              CStr:            Hello
+            - Value:           0x0000000000000000
+            - Value:           0x0000000012345678
+            - Value:           0x0000000000000001
+            - Value:           0x0000000000002345
+            - Value:           0x000000006789ABCD
+            - Value:           0x0011223344556677
+            - Value:           0xAABBCCDDEEFF0011
+            - Value:           0xFFFFFFFFFFFFFFFE
+            - Value:           0x0000000000000001
+            - Value:           0x0000000000000000
+            - Value:           0x0000000000000001
+            - Value:           0x8000000000000000
+            - Value:           0xFFFFFFFFFFFFFFFE
+            - Value:           0x0000000000000001
+            - Value:           0x0000000000000002
+            - Value:           0x0123456789ABCDEF
+...
+
+#CHECK:   debug_info:      
+#CHECK:     - Length:          
+#CHECK:         TotalLength:     168
+#CHECK:       Version:         2
+#CHECK:       AbbrOffset:      0
+#CHECK:       AddrSize:        8
+#CHECK:       Entries:         
+#CHECK:         - AbbrCode:        0x00000001
+#CHECK:           Values:          
+#CHECK:             - Value:           0x0123456789ABCDEF
+#CHECK:             - Value:           0x000000000000000A
+#CHECK:               BlockData:       
+#CHECK:                 - 0x01
+#CHECK:                 - 0x02
+#CHECK:                 - 0x03
+#CHECK:                 - 0x04
+#CHECK:                 - 0x05
+#CHECK:                 - 0x06
+#CHECK:                 - 0x07
+#CHECK:                 - 0x08
+#CHECK:                 - 0x09
+#CHECK:                 - 0x00
+#CHECK:             - Value:           0x000000000000000A
+#CHECK:               BlockData:       
+#CHECK:                 - 0x01
+#CHECK:                 - 0x02
+#CHECK:                 - 0x03
+#CHECK:                 - 0x04
+#CHECK:                 - 0x05
+#CHECK:                 - 0x06
+#CHECK:                 - 0x07
+#CHECK:                 - 0x08
+#CHECK:                 - 0x09
+#CHECK:                 - 0x00
+#CHECK:             - Value:           0x000000000000000A
+#CHECK:               BlockData:       
+#CHECK:                 - 0x01
+#CHECK:                 - 0x02
+#CHECK:                 - 0x03
+#CHECK:                 - 0x04
+#CHECK:                 - 0x05
+#CHECK:                 - 0x06
+#CHECK:                 - 0x07
+#CHECK:                 - 0x08
+#CHECK:                 - 0x09
+#CHECK:                 - 0x00
+#CHECK:             - Value:           0x000000000000000A
+#CHECK:               BlockData:       
+#CHECK:                 - 0x01
+#CHECK:                 - 0x02
+#CHECK:                 - 0x03
+#CHECK:                 - 0x04
+#CHECK:                 - 0x05
+#CHECK:                 - 0x06
+#CHECK:                 - 0x07
+#CHECK:                 - 0x08
+#CHECK:                 - 0x09
+#CHECK:                 - 0x00
+#CHECK:             - Value:           0x0000000000000001
+#CHECK:             - Value:           0x0000000000002345
+#CHECK:             - Value:           0x000000006789ABCD
+#CHECK:             - Value:           0x0011223344556677
+#CHECK:               CStr:            Hello
+#CHECK:             - Value:           0x0000000000000000
+#CHECK:             - Value:           0x0000000012345678
+#CHECK:             - Value:           0x0000000000000001
+#CHECK:             - Value:           0x0000000000002345
+#CHECK:             - Value:           0x000000006789ABCD
+#CHECK:             - Value:           0x0011223344556677
+#CHECK:             - Value:           0xAABBCCDDEEFF0011
+#CHECK:             - Value:           0xFFFFFFFFFFFFFFFE
+#CHECK:             - Value:           0x0000000000000001
+#CHECK:             - Value:           0x0000000000000000
+#CHECK:             - Value:           0x0000000000000001
+#CHECK:             - Value:           0xFFFFFFFFFFFFFFFE
+#CHECK:             - Value:           0x0000000000000001
+#CHECK:             - Value:           0x0000000000000002
+#CHECK:             - Value:           0x0123456789ABCDEF
diff --git a/test/ObjectYAML/MachO/DWARF5-abbrevValues.yaml b/test/ObjectYAML/MachO/DWARF5-abbrevValues.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..047ee749d674b7a2bd1c95314cc2f28834e7a7c4
--- /dev/null
+++ b/test/ObjectYAML/MachO/DWARF5-abbrevValues.yaml
@@ -0,0 +1,307 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+
+--- !mach-o
+FileHeader:      
+  magic:           0xFEEDFACF
+  cputype:         0x01000007
+  cpusubtype:      0x00000003
+  filetype:        0x0000000A
+  ncmds:           5
+  sizeofcmds:      1800
+  flags:           0x00000000
+  reserved:        0x00000000
+LoadCommands:    
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __PAGEZERO
+    vmaddr:          0
+    vmsize:          4294967296
+    fileoff:         0
+    filesize:        0
+    maxprot:         0
+    initprot:        0
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         472
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          4096
+    fileoff:         0
+    filesize:        0
+    maxprot:         7
+    initprot:        5
+    nsects:          5
+    flags:           0
+    Sections:        
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x0000000100000F50
+        size:            52
+        offset:          0x00000000
+        align:           4
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __stubs
+        segname:         __TEXT
+        addr:            0x0000000100000F84
+        size:            6
+        offset:          0x00000000
+        align:           1
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x80000408
+        reserved1:       0x00000000
+        reserved2:       0x00000006
+        reserved3:       0x00000000
+      - sectname:        __stub_helper
+        segname:         __TEXT
+        addr:            0x0000000100000F8C
+        size:            26
+        offset:          0x00000000
+        align:           2
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __cstring
+        segname:         __TEXT
+        addr:            0x0000000100000FA6
+        size:            14
+        offset:          0x00000000
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000002
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __unwind_info
+        segname:         __TEXT
+        addr:            0x0000000100000FB4
+        size:            72
+        offset:          0x00000000
+        align:           2
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         232
+    segname:         __DATA
+    vmaddr:          4294971392
+    vmsize:          4096
+    fileoff:         0
+    filesize:        0
+    maxprot:         7
+    initprot:        3
+    nsects:          2
+    flags:           0
+    Sections:        
+      - sectname:        __nl_symbol_ptr
+        segname:         __DATA
+        addr:            0x0000000100001000
+        size:            16
+        offset:          0x00000000
+        align:           3
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000006
+        reserved1:       0x00000001
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __la_symbol_ptr
+        segname:         __DATA
+        addr:            0x0000000100001010
+        size:            8
+        offset:          0x00000000
+        align:           3
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000007
+        reserved1:       0x00000003
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4294975488
+    vmsize:          4096
+    fileoff:         4096
+    filesize:        60
+    maxprot:         7
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         952
+    segname:         __DWARF
+    vmaddr:          4294979584
+    vmsize:          4096
+    fileoff:         8192
+    filesize:        764
+    maxprot:         7
+    initprot:        3
+    nsects:          11
+    flags:           0
+    Sections:        
+      - sectname:        __debug_line
+        segname:         __DWARF
+        addr:            0x0000000100003000
+        size:            69
+        offset:          0x00002000
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_pubnames
+        segname:         __DWARF
+        addr:            0x0000000100003045
+        size:            27
+        offset:          0x00002045
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_pubtypes
+        segname:         __DWARF
+        addr:            0x0000000100003060
+        size:            35
+        offset:          0x00002060
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_aranges
+        segname:         __DWARF
+        addr:            0x0000000100003083
+        size:            48
+        offset:          0x00002083
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_info
+        segname:         __DWARF
+        addr:            0x00000001000030B3
+        size:            121
+        offset:          0x000020B3
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_abbrev
+        segname:         __DWARF
+        addr:            0x000000010000312C
+        size:            76
+        offset:          0x0000212C
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_str
+        segname:         __DWARF
+        addr:            0x0000000100003178
+        size:            142
+        offset:          0x00002178
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_names
+        segname:         __DWARF
+        addr:            0x0000000100003206
+        size:            60
+        offset:          0x00002206
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_namespac
+        segname:         __DWARF
+        addr:            0x0000000100003242
+        size:            36
+        offset:          0x00002242
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_types
+        segname:         __DWARF
+        addr:            0x0000000100003266
+        size:            114
+        offset:          0x00002266
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_objc
+        segname:         __DWARF
+        addr:            0x00000001000032D8
+        size:            36
+        offset:          0x000022D8
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+DWARF:           
+  debug_abbrev:     
+    - Code:            0x00000001
+      Tag:             DW_TAG_compile_unit
+      Children:        DW_CHILDREN_yes
+      Attributes:      
+        - Attribute:       0x2001
+          Form:            DW_FORM_implicit_const
+          Value:           0x12345678
+...
+
+#CHECK: DWARF:           
+#CHECK:   debug_abbrev:
+#CHECK:     - Code:            0x00000001
+#CHECK:       Tag:             DW_TAG_compile_unit
+#CHECK:       Children:        DW_CHILDREN_yes
+#CHECK:       Attributes:
+#CHECK:         - Attribute:       0x2001
+#CHECK:           Form:            DW_FORM_implicit_const
+#CHECK:           Value:           0x0000000012345678
diff --git a/test/ObjectYAML/MachO/DWARF5-debug_info.yaml b/test/ObjectYAML/MachO/DWARF5-debug_info.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d0e81834b59319ca8de56dc07c6b56e6867d5c68
--- /dev/null
+++ b/test/ObjectYAML/MachO/DWARF5-debug_info.yaml
@@ -0,0 +1,582 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+
+--- !mach-o
+FileHeader:      
+  magic:           0xFEEDFACF
+  cputype:         0x01000007
+  cpusubtype:      0x00000003
+  filetype:        0x0000000A
+  ncmds:           5
+  sizeofcmds:      1800
+  flags:           0x00000000
+  reserved:        0x00000000
+LoadCommands:    
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __PAGEZERO
+    vmaddr:          0
+    vmsize:          4294967296
+    fileoff:         0
+    filesize:        0
+    maxprot:         0
+    initprot:        0
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         472
+    segname:         __TEXT
+    vmaddr:          4294967296
+    vmsize:          4096
+    fileoff:         0
+    filesize:        0
+    maxprot:         7
+    initprot:        5
+    nsects:          5
+    flags:           0
+    Sections:        
+      - sectname:        __text
+        segname:         __TEXT
+        addr:            0x0000000100000F50
+        size:            52
+        offset:          0x00000000
+        align:           4
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __stubs
+        segname:         __TEXT
+        addr:            0x0000000100000F84
+        size:            6
+        offset:          0x00000000
+        align:           1
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x80000408
+        reserved1:       0x00000000
+        reserved2:       0x00000006
+        reserved3:       0x00000000
+      - sectname:        __stub_helper
+        segname:         __TEXT
+        addr:            0x0000000100000F8C
+        size:            26
+        offset:          0x00000000
+        align:           2
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x80000400
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __cstring
+        segname:         __TEXT
+        addr:            0x0000000100000FA6
+        size:            14
+        offset:          0x00000000
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000002
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __unwind_info
+        segname:         __TEXT
+        addr:            0x0000000100000FB4
+        size:            72
+        offset:          0x00000000
+        align:           2
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         232
+    segname:         __DATA
+    vmaddr:          4294971392
+    vmsize:          4096
+    fileoff:         0
+    filesize:        0
+    maxprot:         7
+    initprot:        3
+    nsects:          2
+    flags:           0
+    Sections:        
+      - sectname:        __nl_symbol_ptr
+        segname:         __DATA
+        addr:            0x0000000100001000
+        size:            16
+        offset:          0x00000000
+        align:           3
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000006
+        reserved1:       0x00000001
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __la_symbol_ptr
+        segname:         __DATA
+        addr:            0x0000000100001010
+        size:            8
+        offset:          0x00000000
+        align:           3
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000007
+        reserved1:       0x00000003
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         72
+    segname:         __LINKEDIT
+    vmaddr:          4294975488
+    vmsize:          4096
+    fileoff:         4096
+    filesize:        60
+    maxprot:         7
+    initprot:        1
+    nsects:          0
+    flags:           0
+  - cmd:             LC_SEGMENT_64
+    cmdsize:         952
+    segname:         __DWARF
+    vmaddr:          4294979584
+    vmsize:          4096
+    fileoff:         8192
+    filesize:        765
+    maxprot:         7
+    initprot:        3
+    nsects:          11
+    flags:           0
+    Sections:        
+      - sectname:        __debug_line
+        segname:         __DWARF
+        addr:            0x0000000100003000
+        size:            69
+        offset:          0x00002000
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_pubnames
+        segname:         __DWARF
+        addr:            0x0000000100003045
+        size:            27
+        offset:          0x00002045
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_pubtypes
+        segname:         __DWARF
+        addr:            0x0000000100003060
+        size:            35
+        offset:          0x00002060
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_aranges
+        segname:         __DWARF
+        addr:            0x0000000100003083
+        size:            48
+        offset:          0x00002083
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_info
+        segname:         __DWARF
+        addr:            0x00000001000030B3
+        size:            122
+        offset:          0x000020B3
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_abbrev
+        segname:         __DWARF
+        addr:            0x000000010000312C
+        size:            76
+        offset:          0x0000212D
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __debug_str
+        segname:         __DWARF
+        addr:            0x0000000100003178
+        size:            142
+        offset:          0x00002179
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_names
+        segname:         __DWARF
+        addr:            0x0000000100003206
+        size:            60
+        offset:          0x00002207
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_namespac
+        segname:         __DWARF
+        addr:            0x0000000100003242
+        size:            36
+        offset:          0x00002243
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_types
+        segname:         __DWARF
+        addr:            0x0000000100003266
+        size:            114
+        offset:          0x00002267
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+      - sectname:        __apple_objc
+        segname:         __DWARF
+        addr:            0x00000001000032D8
+        size:            36
+        offset:          0x000022D9
+        align:           0
+        reloff:          0x00000000
+        nreloc:          0
+        flags:           0x00000000
+        reserved1:       0x00000000
+        reserved2:       0x00000000
+        reserved3:       0x00000000
+LinkEditData:    
+  NameList:        
+    - n_strx:          2
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          16
+      n_value:         4294967296
+    - n_strx:          22
+      n_type:          0x0F
+      n_sect:          1
+      n_desc:          0
+      n_value:         4294971216
+  StringTable:     
+    - ''
+    - ''
+    - __mh_execute_header
+    - _main
+DWARF:           
+  debug_abbrev:    
+    - Code:            0x00000001
+      Tag:             DW_TAG_compile_unit
+      Children:        DW_CHILDREN_yes
+      Attributes:      
+        - Attribute:       DW_AT_producer
+          Form:            DW_FORM_strp
+        - Attribute:       DW_AT_language
+          Form:            DW_FORM_data2
+        - Attribute:       DW_AT_name
+          Form:            DW_FORM_strp
+        - Attribute:       DW_AT_stmt_list
+          Form:            DW_FORM_sec_offset
+        - Attribute:       DW_AT_comp_dir
+          Form:            DW_FORM_strp
+        - Attribute:       DW_AT_low_pc
+          Form:            DW_FORM_addr
+        - Attribute:       DW_AT_high_pc
+          Form:            DW_FORM_data4
+    - Code:            0x00000002
+      Tag:             DW_TAG_subprogram
+      Children:        DW_CHILDREN_yes
+      Attributes:      
+        - Attribute:       DW_AT_low_pc
+          Form:            DW_FORM_addr
+        - Attribute:       DW_AT_high_pc
+          Form:            DW_FORM_data4
+        - Attribute:       DW_AT_frame_base
+          Form:            DW_FORM_exprloc
+        - Attribute:       DW_AT_name
+          Form:            DW_FORM_strp
+        - Attribute:       DW_AT_decl_file
+          Form:            DW_FORM_data1
+        - Attribute:       DW_AT_decl_line
+          Form:            DW_FORM_data1
+        - Attribute:       DW_AT_prototyped
+          Form:            DW_FORM_flag_present
+        - Attribute:       DW_AT_type
+          Form:            DW_FORM_ref4
+        - Attribute:       DW_AT_external
+          Form:            DW_FORM_flag_present
+    - Code:            0x00000003
+      Tag:             DW_TAG_formal_parameter
+      Children:        DW_CHILDREN_no
+      Attributes:      
+        - Attribute:       DW_AT_location
+          Form:            DW_FORM_exprloc
+        - Attribute:       DW_AT_name
+          Form:            DW_FORM_strp
+        - Attribute:       DW_AT_decl_file
+          Form:            DW_FORM_data1
+        - Attribute:       DW_AT_decl_line
+          Form:            DW_FORM_data1
+        - Attribute:       DW_AT_type
+          Form:            DW_FORM_ref4
+    - Code:            0x00000004
+      Tag:             DW_TAG_base_type
+      Children:        DW_CHILDREN_no
+      Attributes:      
+        - Attribute:       DW_AT_name
+          Form:            DW_FORM_strp
+        - Attribute:       DW_AT_encoding
+          Form:            DW_FORM_data1
+        - Attribute:       DW_AT_byte_size
+          Form:            DW_FORM_data1
+    - Code:            0x00000005
+      Tag:             DW_TAG_pointer_type
+      Children:        DW_CHILDREN_no
+      Attributes:      
+        - Attribute:       DW_AT_type
+          Form:            DW_FORM_ref4
+  debug_aranges:   
+    - Length:          
+        TotalLength:     44
+      Version:         2
+      CuOffset:        0
+      AddrSize:        8
+      SegSize:         0
+      Descriptors:     
+        - Address:         0x0000000100000F50
+          Length:          52
+  debug_info:      
+    - Length:          
+        TotalLength:     118
+      Version:         5
+      UnitType:        DW_UT_compile
+      AbbrOffset:      0
+      AddrSize:        8
+      Entries:         
+        - AbbrCode:        0x00000001
+          Values:          
+            - Value:           0x0000000000000001
+            - Value:           0x000000000000000C
+            - Value:           0x0000000000000038
+            - Value:           0x0000000000000000
+            - Value:           0x0000000000000046
+            - Value:           0x0000000100000F50
+            - Value:           0x0000000000000034
+        - AbbrCode:        0x00000002
+          Values:          
+            - Value:           0x0000000100000F50
+            - Value:           0x0000000000000034
+            - Value:           0x0000000000000001
+              BlockData:       
+                - 0x56
+            - Value:           0x0000000000000076
+            - Value:           0x0000000000000001
+            - Value:           0x0000000000000003
+            - Value:           0x0000000000000001
+            - Value:           0x0000000000000060
+            - Value:           0x0000000000000001
+        - AbbrCode:        0x00000003
+          Values:          
+            - Value:           0x0000000000000002
+              BlockData:       
+                - 0x91
+                - 0x78
+            - Value:           0x000000000000007B
+            - Value:           0x0000000000000001
+            - Value:           0x0000000000000003
+            - Value:           0x0000000000000060
+        - AbbrCode:        0x00000003
+          Values:          
+            - Value:           0x0000000000000002
+              BlockData:       
+                - 0x91
+                - 0x70
+            - Value:           0x0000000000000080
+            - Value:           0x0000000000000001
+            - Value:           0x0000000000000003
+            - Value:           0x0000000000000067
+        - AbbrCode:        0x00000000
+          Values:          
+        - AbbrCode:        0x00000004
+          Values:          
+            - Value:           0x0000000000000085
+            - Value:           0x0000000000000005
+            - Value:           0x0000000000000004
+        - AbbrCode:        0x00000005
+          Values:          
+            - Value:           0x000000000000006C
+        - AbbrCode:        0x00000005
+          Values:          
+            - Value:           0x0000000000000071
+        - AbbrCode:        0x00000004
+          Values:          
+            - Value:           0x0000000000000089
+            - Value:           0x0000000000000006
+            - Value:           0x0000000000000001
+        - AbbrCode:        0x00000000
+          Values:          
+  debug_line:      
+    - Length:          
+        TotalLength:     65
+      Version:         2
+      PrologueLength:  36
+      MinInstLength:   1
+      DefaultIsStmt:   1
+      LineBase:        251
+      LineRange:       14
+      OpcodeBase:      13
+      StandardOpcodeLengths: 
+        - 0
+        - 1
+        - 1
+        - 1
+        - 1
+        - 0
+        - 0
+        - 0
+        - 1
+        - 0
+        - 0
+        - 1
+      IncludeDirs:     
+      Files:           
+        - Name:            hello_world.c
+          DirIdx:          0
+          ModTime:         0
+          Length:          0
+      Opcodes:         
+        - Opcode:          DW_LNS_extended_op
+          ExtLen:          9
+          SubOpcode:       DW_LNE_set_address
+          Data:            4294971216
+        - Opcode:          0x14
+          Data:            4294971216
+        - Opcode:          DW_LNS_set_column
+          Data:            3
+        - Opcode:          DW_LNS_set_prologue_end
+          Data:            3
+        - Opcode:          DW_LNS_const_add_pc
+          Data:            3
+        - Opcode:          0xBB
+          Data:            3
+        - Opcode:          0xBB
+          Data:            3
+        - Opcode:          DW_LNS_advance_pc
+          Data:            11
+        - Opcode:          DW_LNS_extended_op
+          ExtLen:          1
+          SubOpcode:       DW_LNE_end_sequence
+          Data:            11
+...
+
+
+#CHECK: DWARF:           
+#CHECK:   debug_info:      
+#CHECK:     - Length:          
+#CHECK:         TotalLength:     118
+#CHECK:       Version:         5
+#CHECK:       UnitType:        DW_UT_compile
+#CHECK:       AbbrOffset:      0
+#CHECK:       AddrSize:        8
+#CHECK:       Entries:         
+#CHECK:         - AbbrCode:        0x00000001
+#CHECK:           Values:          
+#CHECK:             - Value:           0x0000000000000001
+#CHECK:             - Value:           0x000000000000000C
+#CHECK:             - Value:           0x0000000000000038
+#CHECK:             - Value:           0x0000000000000000
+#CHECK:             - Value:           0x0000000000000046
+#CHECK:             - Value:           0x0000000100000F50
+#CHECK:             - Value:           0x0000000000000034
+#CHECK:         - AbbrCode:        0x00000002
+#CHECK:           Values:          
+#CHECK:             - Value:           0x0000000100000F50
+#CHECK:             - Value:           0x0000000000000034
+#CHECK:             - Value:           0x0000000000000001
+#CHECK:               BlockData:       
+#CHECK:                 - 0x56
+#CHECK:             - Value:           0x0000000000000076
+#CHECK:             - Value:           0x0000000000000001
+#CHECK:             - Value:           0x0000000000000003
+#CHECK:             - Value:           0x0000000000000001
+#CHECK:             - Value:           0x0000000000000060
+#CHECK:             - Value:           0x0000000000000001
+#CHECK:         - AbbrCode:        0x00000003
+#CHECK:           Values:          
+#CHECK:             - Value:           0x0000000000000002
+#CHECK:               BlockData:       
+#CHECK:                 - 0x91
+#CHECK:                 - 0x78
+#CHECK:             - Value:           0x000000000000007B
+#CHECK:             - Value:           0x0000000000000001
+#CHECK:             - Value:           0x0000000000000003
+#CHECK:             - Value:           0x0000000000000060
+#CHECK:         - AbbrCode:        0x00000003
+#CHECK:           Values:          
+#CHECK:             - Value:           0x0000000000000002
+#CHECK:               BlockData:       
+#CHECK:                 - 0x91
+#CHECK:                 - 0x70
+#CHECK:             - Value:           0x0000000000000080
+#CHECK:             - Value:           0x0000000000000001
+#CHECK:             - Value:           0x0000000000000003
+#CHECK:             - Value:           0x0000000000000067
+#CHECK:         - AbbrCode:        0x00000000
+#CHECK:           Values:          
+#CHECK:         - AbbrCode:        0x00000004
+#CHECK:           Values:          
+#CHECK:             - Value:           0x0000000000000085
+#CHECK:             - Value:           0x0000000000000005
+#CHECK:             - Value:           0x0000000000000004
+#CHECK:         - AbbrCode:        0x00000005
+#CHECK:           Values:          
+#CHECK:             - Value:           0x000000000000006C
+#CHECK:         - AbbrCode:        0x00000005
+#CHECK:           Values:          
+#CHECK:             - Value:           0x0000000000000071
+#CHECK:         - AbbrCode:        0x00000004
+#CHECK:           Values:          
+#CHECK:             - Value:           0x0000000000000089
+#CHECK:             - Value:           0x0000000000000006
+#CHECK:             - Value:           0x0000000000000001
+#CHECK:         - AbbrCode:        0x00000000
+#CHECK:           Values:          
diff --git a/test/ObjectYAML/wasm/code_section.yaml b/test/ObjectYAML/wasm/code_section.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..b75bf7e1cfd55bbd3b719714a7e56676fbee8893
--- /dev/null
+++ b/test/ObjectYAML/wasm/code_section.yaml
@@ -0,0 +1,72 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+--- !WASM
+FileHeader:
+  Version:         0x00000001
+Sections:
+  - Type:            TYPE
+    Signatures:
+      - ReturnType:      F32
+        ParamTypes:
+          - I32
+      - ReturnType:      NORESULT
+        ParamTypes:
+          - I32
+          - I64
+  - Type:            FUNCTION
+    FunctionTypes:
+      - 0
+      - 1
+  - Type:            CODE
+    Relocations:
+      - Type:            R_WEBASSEMBLY_TABLE_INDEX_SLEB
+        Index:           0
+        Offset:          0x00000006
+        Addend:          0x00000000
+      - Type:            R_WEBASSEMBLY_FUNCTION_INDEX_LEB
+        Index:           1
+        Offset:          0x00000025
+        Addend:          0x00000000
+    Functions:
+      - Locals:
+         - Type:            I32
+           Count:           3
+        Body:            418080808000210020002101200111808080800000210220020F0B
+      - Locals:
+         - Type:            I32
+           Count:           1
+        Body:            108180808000210020000F0B
+...
+# CHECK: --- !WASM
+# CHECK: FileHeader:
+# CHECK:   Version:           0x00000001
+# CHECK: Sections:
+# CHECK:  - Type:            TYPE
+# CHECK:    Signatures:
+# CHECK:      - Index:           0
+# CHECK:        ReturnType:      F32
+# CHECK:        ParamTypes:
+# CHECK:          - I32
+# CHECK:      - Index:           1
+# CHECK:        ReturnType:      NORESULT
+# CHECK:        ParamTypes:
+# CHECK:          - I32
+# CHECK:          - I64
+# CHECK:  - Type:            CODE
+# CHECK:    Relocations:
+# CHECK:      - Type:            R_WEBASSEMBLY_TABLE_INDEX_SLEB
+# CHECK:        Index:           0
+# CHECK:        Offset:          0x00000006
+# CHECK:        Addend:          0x00000000
+# CHECK:      - Type:            R_WEBASSEMBLY_FUNCTION_INDEX_LEB
+# CHECK:        Index:           1
+# CHECK:        Offset:          0x00000025
+# CHECK:        Addend:          0x00000000
+# CHECK:    Functions:
+# CHECK:      - Locals:
+# CHECK:         - Type:            I32
+# CHECK:           Count:           3
+# CHECK:        Body:            418080808000210020002101200111808080800000210220020F0B
+# CHECK:      - Locals:
+# CHECK:         - Type:            I32
+# CHECK:           Count:           1
+# CHECK:        Body:            108180808000210020000F0B
diff --git a/test/ObjectYAML/wasm/custom_section.yaml b/test/ObjectYAML/wasm/custom_section.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c7b87cb4239d1fe3c1899ecc57950d5b804a6fd5
--- /dev/null
+++ b/test/ObjectYAML/wasm/custom_section.yaml
@@ -0,0 +1,17 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+--- !WASM
+FileHeader:
+  Version:         0x00000001
+Sections:
+  - Type:            CUSTOM
+    Name:            foo
+    Payload:         03666F6F0401020304
+...
+# CHECK: --- !WASM
+# CHECK: FileHeader:
+# CHECK:   Version:           0x00000001
+# CHECK: Sections:
+# CHECK:  - Type:            CUSTOM
+# CHECK:    Name:            foo
+# CHECK:    Payload:         03666F6F0401020304
+# CHECK: ...
diff --git a/test/ObjectYAML/wasm/data_section.yaml b/test/ObjectYAML/wasm/data_section.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..542d0efe42f3d4598a75e73ee772ce5ab2b161c8
--- /dev/null
+++ b/test/ObjectYAML/wasm/data_section.yaml
@@ -0,0 +1,28 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+--- !WASM
+FileHeader:
+  Version:         0x00000001
+Sections:
+  - Type:            MEMORY
+    Memories:
+      - Initial:         0x00000003
+  - Type:            DATA
+    Segments:
+      - Index:           0
+        Offset:
+          Opcode:          I32_CONST
+          Value:           4
+        Content:         '10001000'
+...
+# CHECK: --- !WASM
+# CHECK: FileHeader:
+# CHECK:   Version:           0x00000001
+# CHECK: Sections:
+# CHECK:   - Type:            DATA
+# CHECK:     Segments:
+# CHECK:       - Index:           0
+# CHECK:         Offset:
+# CHECK:           Opcode:          I32_CONST
+# CHECK:           Value:           4
+# CHECK:         Content:         '10001000'
+# CHECK: ...
diff --git a/test/ObjectYAML/wasm/elem_section.yaml b/test/ObjectYAML/wasm/elem_section.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..498c9aa93ea2af2fe5ff9f51c0ded38ab57a4ac3
--- /dev/null
+++ b/test/ObjectYAML/wasm/elem_section.yaml
@@ -0,0 +1,40 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+--- !WASM
+FileHeader:
+  Version:         0x00000001
+Sections:
+  - Type:            TABLE
+    Tables:         
+      - ElemType:          ANYFUNC
+        Limits:
+          Flags:           0x00000001
+          Initial:         0x00000010
+          Maximum:         0x00000011
+  - Type:            ELEM
+    Segments:
+      - Offset:
+          Opcode:        I32_CONST
+          Value:         3
+        Functions:
+          - 1
+      - Offset:
+          Opcode:        I32_CONST
+          Value:         5
+        Functions:
+          - 4
+...
+# CHECK: --- !WASM
+# CHECK: FileHeader:
+# CHECK:   Version:           0x00000001
+# CHECK: Sections:
+# CHECK:   - Type:            ELEM
+# CHECK:     Segments:
+# CHECK:       - Offset:
+# CHECK:           Opcode:           I32_CONST
+# CHECK:           Value:            3
+# CHECK:         Functions: [ 1 ]
+# CHECK:       - Offset:
+# CHECK:           Opcode:           I32_CONST
+# CHECK:           Value:            5
+# CHECK:         Functions: [ 4 ]
+# CHECK: ...
diff --git a/test/ObjectYAML/wasm/export_section.yaml b/test/ObjectYAML/wasm/export_section.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1d1a16fb83353b8ac017be50b3955349d5dc76d1
--- /dev/null
+++ b/test/ObjectYAML/wasm/export_section.yaml
@@ -0,0 +1,27 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+--- !WASM
+FileHeader:
+  Version:         0x00000001
+Sections:
+  - Type:            EXPORT
+    Exports:         
+      - Name:            foo
+        Kind:            FUNCTION
+        Index:           0
+      - Name:            bar
+        Kind:            FUNCTION
+        Index:           1
+...
+# CHECK: --- !WASM
+# CHECK: FileHeader:
+# CHECK:   Version:           0x00000001
+# CHECK: Sections:
+# CHECK:   - Type:            EXPORT
+# CHECK:     Exports:         
+# CHECK:       - Name:            foo
+# CHECK:         Kind:            FUNCTION
+# CHECK:         Index:           0
+# CHECK:       - Name:            bar
+# CHECK:         Kind:            FUNCTION
+# CHECK:         Index:           1
+# CHECK: ...
diff --git a/test/ObjectYAML/wasm/function_section.yaml b/test/ObjectYAML/wasm/function_section.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..39e6b75d5cdcb93e5dacb08af22c7b0aaefa9982
--- /dev/null
+++ b/test/ObjectYAML/wasm/function_section.yaml
@@ -0,0 +1,17 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+--- !WASM
+FileHeader:
+  Version:         0x00000001
+Sections:
+  - Type:            FUNCTION
+    FunctionTypes:   
+      - 1
+      - 0
+...
+# CHECK: --- !WASM
+# CHECK: FileHeader:
+# CHECK:   Version:           0x00000001
+# CHECK: Sections:
+# CHECK:   - Type:            FUNCTION
+# CHECK:     FunctionTypes: [ 1, 0 ]
+# CHECK: ...
diff --git a/test/ObjectYAML/wasm/global_section.yaml b/test/ObjectYAML/wasm/global_section.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..3f17c6d88ba48db98c1e7cd3ba3d3bff308fea81
--- /dev/null
+++ b/test/ObjectYAML/wasm/global_section.yaml
@@ -0,0 +1,25 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+--- !WASM
+FileHeader:
+  Version:         0x00000001
+Sections:
+  - Type:            GLOBAL
+    Globals:         
+      - Type:        I32
+        Mutable:     false
+        InitExpr:
+          Opcode:          I64_CONST
+          Value:           -5
+...
+# CHECK: --- !WASM
+# CHECK: FileHeader:
+# CHECK:   Version:           0x00000001
+# CHECK: Sections:
+# CHECK:  - Type:            GLOBAL
+# CHECK:    Globals:         
+# CHECK:      - Type:        I32
+# CHECK:        Mutable:     false
+# CHECK:        InitExpr:
+# CHECK:          Opcode:          I64_CONST
+# CHECK:          Value:           -5
+# CHECK: ...
diff --git a/test/ObjectYAML/wasm/header.yaml b/test/ObjectYAML/wasm/header.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..c4be4c8a4aaf7bd0bafcc2206ffa391d0584c1d6
--- /dev/null
+++ b/test/ObjectYAML/wasm/header.yaml
@@ -0,0 +1,9 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+--- !WASM
+FileHeader:
+  Version:         0x00000001
+...
+# CHECK: --- !WASM
+# CHECK: FileHeader:
+# CHECK:   Version:           0x00000001
+# CHECK: ...
diff --git a/test/ObjectYAML/wasm/header_invalid_version.yaml b/test/ObjectYAML/wasm/header_invalid_version.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..a4e1246b984f45458baf9c3cb631eb457018bf20
--- /dev/null
+++ b/test/ObjectYAML/wasm/header_invalid_version.yaml
@@ -0,0 +1,8 @@
+# RUN: yaml2obj %s | not obj2yaml 2>&1 | FileCheck %s
+
+--- !WASM
+FileHeader:
+  Version:         0x00000002
+...
+
+# CHECK: Error: 'Invalid data was encountered while parsing the file'
diff --git a/test/ObjectYAML/wasm/import_section.yaml b/test/ObjectYAML/wasm/import_section.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..52f466a00b66409338311b03aea887faf020e8dc
--- /dev/null
+++ b/test/ObjectYAML/wasm/import_section.yaml
@@ -0,0 +1,41 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+--- !WASM
+FileHeader:
+  Version:         0x00000001
+Sections:
+  - Type:            TYPE
+    Signatures:
+      - ReturnType:      I32
+        ParamTypes:
+          - I32
+  - Type:            IMPORT
+    Imports:         
+      - Module:          foo
+        Field:           bar
+        Kind:            FUNCTION
+        SigIndex:        0
+      - Module:          fiz
+        Field:           baz
+        Kind:            GLOBAL
+        GlobalType:      I32
+        GlobalMutable:   false
+  - Type:            FUNCTION
+    FunctionTypes:
+      - 0
+...
+# CHECK: --- !WASM
+# CHECK: FileHeader:
+# CHECK:   Version:           0x00000001
+# CHECK: Sections:
+# CHECK:   - Type:            IMPORT
+# CHECK:     Imports:         
+# CHECK:       - Module:          foo
+# CHECK:         Field:           bar
+# CHECK:         Kind:            FUNCTION
+# CHECK:         SigIndex:        0
+# CHECK:       - Module:          fiz
+# CHECK:         Field:           baz
+# CHECK:         Kind:            GLOBAL
+# CHECK:         GlobalType:      I32
+# CHECK:         GlobalMutable:   false
+# CHECK: ...
diff --git a/test/ObjectYAML/wasm/memory_section.yaml b/test/ObjectYAML/wasm/memory_section.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..83aae05871db0bcd8b12e0846dd5ab835c60fcfe
--- /dev/null
+++ b/test/ObjectYAML/wasm/memory_section.yaml
@@ -0,0 +1,23 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+--- !WASM
+FileHeader:
+  Version:         0x00000001
+Sections:
+  - Type:            MEMORY
+    Memories:         
+      - Flags:           0x00000001
+        Initial:         0x00000002
+        Maximum:         0x000000FF
+      - Initial:         0x00000003
+...
+# CHECK: --- !WASM
+# CHECK: FileHeader:
+# CHECK:   Version:           0x00000001
+# CHECK: Sections:
+# CHECK:   - Type:            MEMORY
+# CHECK:     Memories:         
+# CHECK:       - Flags:            0x00000001
+# CHECK:         Initial:          0x00000002
+# CHECK:         Maximum:          0x000000FF
+# CHECK:       - Initial:          0x00000003
+# CHECK: ...
diff --git a/test/ObjectYAML/wasm/start_section.yaml b/test/ObjectYAML/wasm/start_section.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..41301a6200377f0bc3d37b02e909eb81c9e7e47b
--- /dev/null
+++ b/test/ObjectYAML/wasm/start_section.yaml
@@ -0,0 +1,15 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+--- !WASM
+FileHeader:
+  Version:         0x00000001
+Sections:
+  - Type:            START
+    StartFunction:   1
+...
+# CHECK: --- !WASM
+# CHECK: FileHeader:
+# CHECK:   Version:           0x00000001
+# CHECK: Sections:
+# CHECK:  - Type:            START
+# CHECK:    StartFunction:   1
+# CHECK: ...
diff --git a/test/ObjectYAML/wasm/table_section.yaml b/test/ObjectYAML/wasm/table_section.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d8b1df25e043b2ee0e045712498780d2db430915
--- /dev/null
+++ b/test/ObjectYAML/wasm/table_section.yaml
@@ -0,0 +1,25 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+--- !WASM
+FileHeader:
+  Version:         0x00000001
+Sections:
+  - Type:            TABLE
+    Tables:         
+      - ElemType:        ANYFUNC
+        Limits:
+          Flags:           0x00000001
+          Initial:         0x00000010
+          Maximum:         0x00000011
+...
+# CHECK: --- !WASM
+# CHECK: FileHeader:
+# CHECK:   Version:           0x00000001
+# CHECK: Sections:
+# CHECK:  - Type:            TABLE
+# CHECK:    Tables:         
+# CHECK:      - ElemType:        ANYFUNC
+# CHECK:        Limits:
+# CHECK:          Flags:           0x00000001
+# CHECK:          Initial:         0x00000010
+# CHECK:          Maximum:         0x00000011
+# CHECK: ...
diff --git a/test/ObjectYAML/wasm/type_section.yaml b/test/ObjectYAML/wasm/type_section.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..d6602595a60e9093ceaf0f20d3418adfecd2cbec
--- /dev/null
+++ b/test/ObjectYAML/wasm/type_section.yaml
@@ -0,0 +1,33 @@
+# RUN: yaml2obj %s | obj2yaml | FileCheck %s
+--- !WASM
+FileHeader:
+  Version:         0x00000001
+Sections:
+  - Type:            TYPE
+    Signatures:
+      - ReturnType:      I32
+        ParamTypes:
+          - F32
+          - F32
+      - ReturnType:      I64
+        ParamTypes:
+          - F64
+          - F64
+...
+# CHECK: --- !WASM
+# CHECK: FileHeader:
+# CHECK:   Version:           0x00000001
+# CHECK: Sections:
+# CHECK:  - Type:            TYPE
+# CHECK:    Signatures:
+# CHECK:      - Index:           0
+# CHECK:        ReturnType:      I32
+# CHECK:        ParamTypes:
+# CHECK:          - F32
+# CHECK:          - F32
+# CHECK:      - Index:           1
+# CHECK:        ReturnType:      I64
+# CHECK:        ParamTypes:
+# CHECK:          - F64
+# CHECK:          - F64
+# CHECK: ...
diff --git a/test/Other/Inputs/glob-input b/test/Other/Inputs/glob-input
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/test/Other/cgscc-devirt-iteration.ll b/test/Other/cgscc-devirt-iteration.ll
index df5ea2985b943cdea958848185980c6b8df53f6f..111dac5bccaf6400d8e045e80512b2ac7ee2d480 100644
--- a/test/Other/cgscc-devirt-iteration.ll
+++ b/test/Other/cgscc-devirt-iteration.ll
@@ -7,6 +7,9 @@
 ; RUN: opt -aa-pipeline=basic-aa -passes='cgscc(function-attrs,function(gvn,instcombine))' -S < %s | FileCheck %s --check-prefix=CHECK --check-prefix=BEFORE
 ; RUN: opt -aa-pipeline=basic-aa -passes='cgscc(devirt<1>(function-attrs,function(gvn,instcombine)))' -S < %s | FileCheck %s --check-prefix=CHECK --check-prefix=AFTER --check-prefix=AFTER1
 ; RUN: opt -aa-pipeline=basic-aa -passes='cgscc(devirt<2>(function-attrs,function(gvn,instcombine)))' -S < %s | FileCheck %s --check-prefix=CHECK --check-prefix=AFTER --check-prefix=AFTER2
+;
+; We also verify that the real O2 pipeline catches these cases.
+; RUN: opt -aa-pipeline=basic-aa -passes='default<O2>' -S < %s | FileCheck %s --check-prefix=CHECK --check-prefix=AFTER --check-prefix=AFTER2
 
 declare void @readnone() readnone
 ; CHECK: Function Attrs: readnone
@@ -93,8 +96,7 @@ entry:
 }
 
 declare i8* @memcpy(i8*, i8*, i64)
-; CHECK-NOT: Function Attrs
-; CHECK: declare i8* @memcpy(i8*, i8*, i64)
+; CHECK: declare i8* @memcpy(
 
 ; The @test3 function checks that when we refine an indirect call to an
 ; intrinsic we still revisit the SCC pass. This also covers cases where the
@@ -112,3 +114,15 @@ define void @test3(i8* %src, i8* %dest, i64 %size) {
 ; CHECK: call void @llvm.memcpy
   ret void
 }
+
+; A boring function that just keeps our declarations around.
+define void @keep(i8** %sink) {
+; CHECK-NOT: Function Attrs
+; CHECK: define void @keep(
+entry:
+  store volatile i8* bitcast (void ()* @readnone to i8*), i8** %sink
+  store volatile i8* bitcast (void ()* @unknown to i8*), i8** %sink
+  store volatile i8* bitcast (i8* (i8*, i8*, i64)* @memcpy to i8*), i8** %sink
+  call void @unknown()
+  ret void
+}
diff --git a/test/Other/constant-fold-gep.ll b/test/Other/constant-fold-gep.ll
index 77c43a200c03f23ba64a5fe92a2703279cc081ba..8028b4fff98703d56d18d0db9b0e02c902427474 100644
--- a/test/Other/constant-fold-gep.ll
+++ b/test/Other/constant-fold-gep.ll
@@ -8,7 +8,7 @@
 
 ; "TO" - Optimizations and targetdata. This tests target-dependent
 ; folding in the optimizers.
-; RUN: opt -S -o - -instcombine -globalopt -default-data-layout="e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64" < %s | FileCheck --check-prefix=TO %s
+; RUN: opt -S -o - -instcombine -globalopt -data-layout="e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64" < %s | FileCheck --check-prefix=TO %s
 
 ; "SCEV" - ScalarEvolution with default target layout
 ; RUN: opt -analyze -scalar-evolution < %s | FileCheck --check-prefix=SCEV %s
diff --git a/test/Other/debugcounter-newgvn.ll b/test/Other/debugcounter-newgvn.ll
new file mode 100644
index 0000000000000000000000000000000000000000..cfe043c8455ac6a51b8b8f66dee27c2acb3799f9
--- /dev/null
+++ b/test/Other/debugcounter-newgvn.ll
@@ -0,0 +1,22 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: asserts
+; RUN: opt -S -debug-counter=newgvn-vn-skip=1,newgvn-vn-count=2 -newgvn  < %s 2>&1 | FileCheck %s
+;; Test that, with debug counters on, we don't value number the first instruction, only the second and third,
+;; which means we do not discover the return is constant.
+define i32 @vntest() {
+; CHECK-LABEL: @vntest(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[A:%.*]] = add i32 1, 3
+; CHECK-NEXT:    [[D:%.*]] = add i32 8, 8
+; CHECK-NEXT:    ret i32 [[D]]
+;
+bb:
+  %a = add i32 1, 3
+  %b = add i32 %a, %a
+  %c = add i32 %a, %a
+  %d = add i32 %b, %c
+  ret i32 %d
+}
+
+
+
diff --git a/test/Other/debugcounter-predicateinfo.ll b/test/Other/debugcounter-predicateinfo.ll
new file mode 100644
index 0000000000000000000000000000000000000000..eb2ec09802fedce589f282c188a3a57169b6c5be
--- /dev/null
+++ b/test/Other/debugcounter-predicateinfo.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; REQUIRES: asserts
+; RUN: opt -debug-counter=predicateinfo-rename-skip=1,predicateinfo-rename-count=1 -print-predicateinfo -analyze  < %s 2>&1 | FileCheck %s
+;; Test that, with debug counters on, we don't rename the first info, only the second
+define fastcc void @barney() {
+; CHECK-LABEL: @barney(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br label [[BB22:%.*]]
+; CHECK:       bb22:
+; CHECK-NEXT:    [[TMP23:%.*]] = icmp eq i32 undef, 2
+; CHECK:         [[TMP23_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP23]])
+; CHECK-NEXT:    br i1 [[TMP23]], label [[BB29:%.*]], label [[BB35:%.*]]
+; CHECK:       bb29:
+; CHECK:         [[TMP23_0_1:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP23_0]])
+; CHECK-NEXT:    br i1 [[TMP23]], label [[BB33:%.*]], label [[BB35]]
+; CHECK:       bb33:
+; CHECK-NEXT:    br i1 [[TMP23_0_1]], label [[BB35]], label [[BB35]]
+; CHECK:       bb35:
+; CHECK-NEXT:    unreachable
+;
+bb:
+  br label %bb22
+bb22:                                             ; preds = %bb21
+  %tmp23 = icmp eq i32 undef, 2
+  br i1 %tmp23, label %bb29, label %bb35
+
+
+bb29:                                             ; preds = %bb28
+;; We will not rename this one (we will still generate a copy of a copy for the next one)
+  br i1 %tmp23, label %bb33, label %bb35
+
+
+bb33:                                             ; preds = %bb31
+;; We will rename this one
+  br i1 %tmp23, label %bb35, label %bb35
+
+bb35:                                             ; preds = %bb33, %bb29, %bb22
+  unreachable
+}
diff --git a/test/Other/lit-globbing.ll b/test/Other/lit-globbing.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5a668a90a40b304f6ed12d45cc77b61d546b3285
--- /dev/null
+++ b/test/Other/lit-globbing.ll
@@ -0,0 +1,28 @@
+RUN: echo TA > %T/TA.txt
+RUN: echo TB > %T/TB.txt
+RUN: echo TAB > %T/TAB.txt
+
+RUN: echo %T/TA* | FileCheck -check-prefix=STAR %s
+RUN: echo %T/'TA'* | FileCheck -check-prefix=STAR %s
+RUN: echo %T/T'A'* | FileCheck -check-prefix=STAR %s
+
+RUN: echo %T/T?.txt | FileCheck -check-prefix=QUESTION %s
+RUN: echo %T/'T'?.txt | FileCheck -check-prefix=QUESTION %s
+
+RUN: echo %T/T??.txt | FileCheck -check-prefix=QUESTION2 %s
+RUN: echo %T/'T'??.txt | FileCheck -check-prefix=QUESTION2 %s
+
+RUN: echo 'T*' 'T?.txt' 'T??.txt' | FileCheck -check-prefix=QUOTEDARGS %s
+
+STAR-NOT: TB.txt
+STAR: {{(TA.txt.*TAB.txt|TAB.txt.*TA.txt)}}
+
+QUESTION-NOT: TAB.txt
+QUESTION: {{(TA.txt.*TB.txt|TB.txt.*TA.txt)}}
+
+QUESTION2-NOT: TA.txt
+QUESTION2-NOT: TB.txt
+QUESTION2: TAB.txt
+
+QUOTEDARGS-NOT: .txt
+QUOTEDARGS: T* T?.txt T??.txt
diff --git a/test/Other/loop-pm-invalidation.ll b/test/Other/loop-pm-invalidation.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d2a0e23a7200b537d06ef45e5342534d714f508c
--- /dev/null
+++ b/test/Other/loop-pm-invalidation.ll
@@ -0,0 +1,277 @@
+; Test that the loop PM infrastructure is invalidated appropriately.
+;
+; Check that we always nuke the LPM stuff when the loops themselves are
+; invalidated.
+; RUN: opt -disable-output -disable-verify -debug-pass-manager %s 2>&1 \
+; RUN:     -passes='loop(no-op-loop),invalidate<loops>,loop(no-op-loop)' \
+; RUN:     | FileCheck %s --check-prefix=CHECK-LOOP-INV
+;
+; If we ended up building the standard analyses, their invalidation should nuke
+; stuff as well.
+; RUN: opt -disable-output -disable-verify -debug-pass-manager %s 2>&1 \
+; RUN:     -passes='loop(no-op-loop),invalidate<scalar-evolution>,loop(no-op-loop)' \
+; RUN:     | FileCheck %s --check-prefix=CHECK-SCEV-INV
+;
+; Also provide a test that can delete loops after populating analyses for them.
+; RUN: opt -disable-output -disable-verify -debug-pass-manager %s 2>&1 \
+; RUN:     -passes='loop(no-op-loop,loop-deletion),invalidate<scalar-evolution>,loop(no-op-loop)' \
+; RUN:     | FileCheck %s --check-prefix=CHECK-SCEV-INV-AFTER-DELETE
+
+define void @no_loops() {
+; CHECK-LOOP-INV-LABEL: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on no_loops
+; CHECK-LOOP-INV-NEXT: Running analysis: LoopAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: DominatorTreeAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: AssumptionAnalysis
+; CHECK-LOOP-INV-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}LoopAnalysis
+; CHECK-LOOP-INV-NEXT: Invalidating all non-preserved analyses
+; CHECK-LOOP-INV-NEXT: Invalidating analysis: LoopAnalysis
+; CHECK-LOOP-INV-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on no_loops
+; CHECK-LOOP-INV-NEXT: Running analysis: LoopAnalysis
+; CHECK-LOOP-INV-NEXT: Finished {{.*}}Function pass manager run.
+;
+; CHECK-SCEV-INV-LABEL: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on no_loops
+; CHECK-SCEV-INV-NEXT: Running analysis: LoopAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: DominatorTreeAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: AssumptionAnalysis
+; CHECK-SCEV-INV-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ScalarEvolutionAnalysis
+; CHECK-SCEV-INV-NEXT: Invalidating all non-preserved analyses
+; CHECK-SCEV-INV-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on no_loops
+; CHECK-SCEV-INV-NEXT: Finished {{.*}}Function pass manager run.
+
+entry:
+  ret void
+}
+
+define void @one_loop(i1* %ptr) {
+; CHECK-LOOP-INV-LABEL: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on one_loop
+; CHECK-LOOP-INV-NEXT: Running analysis: LoopAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: DominatorTreeAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: AssumptionAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: AAManager
+; CHECK-LOOP-INV-NEXT: Running analysis: TargetLibraryAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: ScalarEvolutionAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: TargetIRAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-LOOP-INV-NEXT: Starting {{.*}}Loop pass manager run.
+; CHECK-LOOP-INV-NEXT: Running pass: NoOpLoopPass
+; CHECK-LOOP-INV-NEXT: Finished {{.*}}Loop pass manager run.
+; CHECK-LOOP-INV-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}LoopAnalysis
+; CHECK-LOOP-INV-NEXT: Invalidating all non-preserved analyses
+; CHECK-LOOP-INV-NEXT: Clearing all analysis results for: l0.header
+; CHECK-LOOP-INV-NEXT: Invalidating analysis: LoopAnalysis
+; CHECK-LOOP-INV-NEXT: Invalidating analysis: ScalarEvolutionAnalysis
+; CHECK-LOOP-INV-NEXT: Invalidating analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-LOOP-INV-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on one_loop
+; CHECK-LOOP-INV-NEXT: Running analysis: LoopAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: ScalarEvolutionAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-LOOP-INV-NEXT: Starting {{.*}}Loop pass manager run.
+; CHECK-LOOP-INV-NEXT: Running pass: NoOpLoopPass
+; CHECK-LOOP-INV-NEXT: Finished {{.*}}Loop pass manager run.
+; CHECK-LOOP-INV-NEXT: Finished {{.*}}Function pass manager run.
+;
+; CHECK-SCEV-INV-LABEL: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on one_loop
+; CHECK-SCEV-INV-NEXT: Running analysis: LoopAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: DominatorTreeAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: AssumptionAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: AAManager
+; CHECK-SCEV-INV-NEXT: Running analysis: TargetLibraryAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: ScalarEvolutionAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: TargetIRAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-SCEV-INV-NEXT: Starting {{.*}}Loop pass manager run.
+; CHECK-SCEV-INV-NEXT: Running pass: NoOpLoopPass
+; CHECK-SCEV-INV-NEXT: Finished {{.*}}Loop pass manager run.
+; CHECK-SCEV-INV-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ScalarEvolutionAnalysis
+; CHECK-SCEV-INV-NEXT: Invalidating all non-preserved analyses
+; CHECK-SCEV-INV-NEXT: Clearing all analysis results for: l0.header
+; CHECK-SCEV-INV-NEXT: Invalidating analysis: ScalarEvolutionAnalysis
+; CHECK-SCEV-INV-NEXT: Invalidating analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-SCEV-INV-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on one_loop
+; CHECK-SCEV-INV-NEXT: Running analysis: ScalarEvolutionAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-SCEV-INV-NEXT: Starting {{.*}}Loop pass manager run.
+; CHECK-SCEV-INV-NEXT: Running pass: NoOpLoopPass
+; CHECK-SCEV-INV-NEXT: Finished {{.*}}Loop pass manager run.
+; CHECK-SCEV-INV-NEXT: Finished {{.*}}Function pass manager run.
+
+entry:
+  br label %l0.header
+
+l0.header:
+  %flag0 = load volatile i1, i1* %ptr
+  br i1 %flag0, label %l0.header, label %exit
+
+exit:
+  ret void
+}
+
+define void @nested_loops(i1* %ptr) {
+; CHECK-LOOP-INV-LABEL: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on nested_loops
+; CHECK-LOOP-INV-NEXT: Running analysis: LoopAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: DominatorTreeAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: AssumptionAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: AAManager
+; CHECK-LOOP-INV-NEXT: Running analysis: TargetLibraryAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: ScalarEvolutionAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: TargetIRAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-LOOP-INV-NEXT: Starting {{.*}}Loop pass manager run.
+; CHECK-LOOP-INV-NEXT: Running pass: NoOpLoopPass
+; CHECK-LOOP-INV-NEXT: Finished {{.*}}Loop pass manager run.
+; CHECK-LOOP-INV-NEXT: Starting {{.*}}Loop pass manager run.
+; CHECK-LOOP-INV-NEXT: Running pass: NoOpLoopPass
+; CHECK-LOOP-INV: Finished {{.*}}Loop pass manager run.
+; CHECK-LOOP-INV-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}LoopAnalysis
+; CHECK-LOOP-INV-NEXT: Invalidating all non-preserved analyses
+; CHECK-LOOP-INV-NEXT: Clearing all analysis results for: l.0.header
+; CHECK-LOOP-INV-NEXT: Clearing all analysis results for: l.0.0.header
+; CHECK-LOOP-INV-NEXT: Invalidating analysis: LoopAnalysis
+; CHECK-LOOP-INV-NEXT: Invalidating analysis: ScalarEvolutionAnalysis
+; CHECK-LOOP-INV-NEXT: Invalidating analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-LOOP-INV-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on nested_loops
+; CHECK-LOOP-INV-NEXT: Running analysis: LoopAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: ScalarEvolutionAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-LOOP-INV-NEXT: Starting {{.*}}Loop pass manager run.
+; CHECK-LOOP-INV-NEXT: Running pass: NoOpLoopPass
+; CHECK-LOOP-INV-NEXT: Finished {{.*}}Loop pass manager run.
+; CHECK-LOOP-INV-NEXT: Starting {{.*}}Loop pass manager run.
+; CHECK-LOOP-INV-NEXT: Running pass: NoOpLoopPass
+; CHECK-LOOP-INV: Finished {{.*}}Loop pass manager run.
+; CHECK-LOOP-INV-NEXT: Finished {{.*}}Function pass manager run.
+;
+; CHECK-SCEV-INV-LABEL: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on nested_loops
+; CHECK-SCEV-INV-NEXT: Running analysis: LoopAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: DominatorTreeAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: AssumptionAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: AAManager
+; CHECK-SCEV-INV-NEXT: Running analysis: TargetLibraryAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: ScalarEvolutionAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: TargetIRAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-SCEV-INV-NEXT: Starting {{.*}}Loop pass manager run.
+; CHECK-SCEV-INV-NEXT: Running pass: NoOpLoopPass
+; CHECK-SCEV-INV-NEXT: Finished {{.*}}Loop pass manager run.
+; CHECK-SCEV-INV-NEXT: Starting {{.*}}Loop pass manager run.
+; CHECK-SCEV-INV-NEXT: Running pass: NoOpLoopPass
+; CHECK-SCEV-INV: Finished {{.*}}Loop pass manager run.
+; CHECK-SCEV-INV-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ScalarEvolutionAnalysis
+; CHECK-SCEV-INV-NEXT: Invalidating all non-preserved analyses
+; CHECK-SCEV-INV-NEXT: Clearing all analysis results for: l.0.header
+; CHECK-SCEV-INV-NEXT: Clearing all analysis results for: l.0.0.header
+; CHECK-SCEV-INV-NEXT: Invalidating analysis: ScalarEvolutionAnalysis
+; CHECK-SCEV-INV-NEXT: Invalidating analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-SCEV-INV-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on nested_loops
+; CHECK-SCEV-INV-NEXT: Running analysis: ScalarEvolutionAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-SCEV-INV-NEXT: Starting {{.*}}Loop pass manager run.
+; CHECK-SCEV-INV-NEXT: Running pass: NoOpLoopPass
+; CHECK-SCEV-INV-NEXT: Finished {{.*}}Loop pass manager run.
+; CHECK-SCEV-INV-NEXT: Starting {{.*}}Loop pass manager run.
+; CHECK-SCEV-INV-NEXT: Running pass: NoOpLoopPass
+; CHECK-SCEV-INV: Finished {{.*}}Loop pass manager run.
+; CHECK-SCEV-INV-NEXT: Finished {{.*}}Function pass manager run.
+
+entry:
+  br label %l.0.header
+
+l.0.header:
+  br label %l.0.0.header
+
+l.0.0.header:
+  %flag.0.0 = load volatile i1, i1* %ptr
+  br i1 %flag.0.0, label %l.0.0.header, label %l.0.latch
+
+l.0.latch:
+  %flag.0 = load volatile i1, i1* %ptr
+  br i1 %flag.0, label %l.0.header, label %exit
+
+exit:
+  ret void
+}
+
+define void @dead_loop() {
+; CHECK-LOOP-INV-LABEL: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on dead_loop
+; CHECK-LOOP-INV-NEXT: Running analysis: LoopAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: DominatorTreeAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: AssumptionAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: AAManager
+; CHECK-LOOP-INV-NEXT: Running analysis: TargetLibraryAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: ScalarEvolutionAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: TargetIRAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-LOOP-INV-NEXT: Starting {{.*}}Loop pass manager run.
+; CHECK-LOOP-INV-NEXT: Running pass: NoOpLoopPass
+; CHECK-LOOP-INV-NEXT: Finished {{.*}}Loop pass manager run.
+; CHECK-LOOP-INV-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}LoopAnalysis
+; CHECK-LOOP-INV-NEXT: Invalidating all non-preserved analyses
+; CHECK-LOOP-INV-NEXT: Clearing all analysis results for: l0.header
+; CHECK-LOOP-INV-NEXT: Invalidating analysis: LoopAnalysis
+; CHECK-LOOP-INV-NEXT: Invalidating analysis: ScalarEvolutionAnalysis
+; CHECK-LOOP-INV-NEXT: Invalidating analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-LOOP-INV-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on dead_loop
+; CHECK-LOOP-INV-NEXT: Running analysis: LoopAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: ScalarEvolutionAnalysis
+; CHECK-LOOP-INV-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-LOOP-INV-NEXT: Starting {{.*}}Loop pass manager run.
+; CHECK-LOOP-INV-NEXT: Running pass: NoOpLoopPass
+; CHECK-LOOP-INV-NEXT: Finished {{.*}}Loop pass manager run.
+; CHECK-LOOP-INV-NEXT: Finished {{.*}}Function pass manager run.
+;
+; CHECK-SCEV-INV-LABEL: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on dead_loop
+; CHECK-SCEV-INV-NEXT: Running analysis: LoopAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: DominatorTreeAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: AssumptionAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: AAManager
+; CHECK-SCEV-INV-NEXT: Running analysis: TargetLibraryAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: ScalarEvolutionAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: TargetIRAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-SCEV-INV-NEXT: Starting {{.*}}Loop pass manager run.
+; CHECK-SCEV-INV-NEXT: Running pass: NoOpLoopPass
+; CHECK-SCEV-INV-NEXT: Finished {{.*}}Loop pass manager run.
+; CHECK-SCEV-INV-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ScalarEvolutionAnalysis
+; CHECK-SCEV-INV-NEXT: Invalidating all non-preserved analyses
+; CHECK-SCEV-INV-NEXT: Clearing all analysis results for: l0.header
+; CHECK-SCEV-INV-NEXT: Invalidating analysis: ScalarEvolutionAnalysis
+; CHECK-SCEV-INV-NEXT: Invalidating analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-SCEV-INV-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on dead_loop
+; CHECK-SCEV-INV-NEXT: Running analysis: ScalarEvolutionAnalysis
+; CHECK-SCEV-INV-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-SCEV-INV-NEXT: Starting {{.*}}Loop pass manager run.
+; CHECK-SCEV-INV-NEXT: Running pass: NoOpLoopPass
+; CHECK-SCEV-INV-NEXT: Finished {{.*}}Loop pass manager run.
+; CHECK-SCEV-INV-NEXT: Finished {{.*}}Function pass manager run.
+;
+; CHECK-SCEV-INV-AFTER-DELETE-LABEL: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on dead_loop
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running analysis: LoopAnalysis
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running analysis: DominatorTreeAnalysis
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running analysis: AssumptionAnalysis
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running analysis: AAManager
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running analysis: TargetLibraryAnalysis
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running analysis: ScalarEvolutionAnalysis
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running analysis: TargetIRAnalysis
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Starting {{.*}}Loop pass manager run.
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running pass: NoOpLoopPass
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running pass: LoopDeletionPass
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Clearing all analysis results for:
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Finished {{.*}}Loop pass manager run.
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Invalidating all non-preserved analyses
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running pass: InvalidateAnalysisPass<{{.*}}ScalarEvolutionAnalysis
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Invalidating all non-preserved analyses
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Invalidating analysis: ScalarEvolutionAnalysis
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Invalidating analysis: InnerAnalysisManagerProxy<{{.*}}Loop
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}> on dead_loop
+; CHECK-SCEV-INV-AFTER-DELETE-NEXT: Finished {{.*}}Function pass manager run.
+
+entry:
+  br label %l0.header
+
+l0.header:
+  br i1 false, label %l0.header, label %exit
+
+exit:
+  ret void
+}
diff --git a/test/Other/new-pass-manager.ll b/test/Other/new-pass-manager.ll
index b195f495d90af6c56542d2a6305cfc8faf7a51af..bf8e596d118b83d5c479d611a902556bb34aafcf 100644
--- a/test/Other/new-pass-manager.ll
+++ b/test/Other/new-pass-manager.ll
@@ -454,11 +454,11 @@
 ; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: DominatorTreeAnalysis
 ; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: AssumptionAnalysis
 ; CHECK-REPEAT-LOOP-PASS-NEXT: Invalidating all non-preserved analyses
-; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}>
 ; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: AAManager
 ; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: TargetLibraryAnalysis
 ; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: ScalarEvolutionAnalysis
 ; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: TargetIRAnalysis
+; CHECK-REPEAT-LOOP-PASS-NEXT: Running analysis: InnerAnalysisManagerProxy<{{.*}}>
 ; CHECK-REPEAT-LOOP-PASS-NEXT: Starting Loop pass manager run
 ; CHECK-REPEAT-LOOP-PASS-NEXT: Running pass: RepeatedPass
 ; CHECK-REPEAT-LOOP-PASS-NEXT: Starting Loop pass manager run
diff --git a/test/Other/new-pm-defaults.ll b/test/Other/new-pm-defaults.ll
index 56ee847fd2998d8c624e2d04ae51a1aa2d5a3ac7..7657f184b28cdce1e928b3aadc38fd65dc14f3ee 100644
--- a/test/Other/new-pm-defaults.ll
+++ b/test/Other/new-pm-defaults.ll
@@ -57,15 +57,19 @@
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Finished llvm::Function pass manager run.
+; CHECK-O-NEXT: Running pass: PGOIndirectCallPromotion
+; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA
+; CHECK-O-NEXT: Running analysis: GlobalsAA
+; CHECK-O-NEXT: Running analysis: CallGraphAnalysis
 ; CHECK-O-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}LazyCallGraph{{.*}}>
 ; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy
 ; CHECK-O-NEXT: Running analysis: LazyCallGraphAnalysis
 ; CHECK-O-NEXT: Starting CGSCC pass manager run.
 ; CHECK-O-NEXT: Running pass: InlinerPass
 ; CHECK-O-NEXT: Running analysis: OuterAnalysisManagerProxy<{{.*}}LazyCallGraph{{.*}}>
-; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy
-; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
 ; CHECK-O-NEXT: Running pass: PostOrderFunctionAttrsPass
+; CHECK-O-NEXT: Running analysis: FunctionAnalysisManagerCGSCCProxy
+; CHECK-O3-NEXT: Running pass: ArgumentPromotionPass
 ; CHECK-O-NEXT: Running pass: CGSCCToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
 ; CHECK-O-NEXT: Starting llvm::Function pass manager run.
 ; CHECK-O-NEXT: Running pass: SROA
@@ -82,10 +86,12 @@
 ; CHECK-O-NEXT: Running pass: TailCallElimPass
 ; CHECK-O-NEXT: Running pass: SimplifyCFGPass
 ; CHECK-O-NEXT: Running pass: ReassociatePass
+; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis
+; CHECK-O-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
 ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopStandardAnalysisResults{{.*}}>
 ; CHECK-O-NEXT: Running analysis: LoopAnalysis
-; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy
 ; CHECK-O-NEXT: Running analysis: ScalarEvolutionAnalysis
+; CHECK-O-NEXT: Running analysis: InnerAnalysisManagerProxy
 ; CHECK-O-NEXT: Starting Loop pass manager run.
 ; CHECK-O-NEXT: Running pass: LoopRotatePass
 ; CHECK-O-NEXT: Running pass: LICM
@@ -125,7 +131,7 @@
 ; CHECK-O-NEXT: Finished CGSCC pass manager run.
 ; CHECK-O-NEXT: Running pass: EliminateAvailableExternallyPass
 ; CHECK-O-NEXT: Running pass: ReversePostOrderFunctionAttrsPass
-; CHECK-O-NEXT: Running analysis: CallGraphAnalysis
+; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}GlobalsAA
 ; CHECK-O-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
 ; CHECK-O-NEXT: Starting llvm::Function pass manager run.
 ; CHECK-O-NEXT: Running pass: Float2IntPass
@@ -142,6 +148,7 @@
 ; CHECK-O-NEXT: Running pass: InstCombinePass
 ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LoopUnrollPass
 ; CHECK-O-NEXT: Running pass: InstCombinePass
+; CHECK-O-NEXT: Running pass: RequireAnalysisPass<{{.*}}OptimizationRemarkEmitterAnalysis
 ; CHECK-O-NEXT: Running pass: FunctionToLoopPassAdaptor<{{.*}}LICMPass
 ; CHECK-O-NEXT: Running pass: AlignmentFromAssumptionsPass
 ; CHECK-O-NEXT: Running pass: LoopSinkPass
diff --git a/test/Other/new-pm-lto-defaults.ll b/test/Other/new-pm-lto-defaults.ll
index e86bcdddcdaeaf65a15f2f9a2d9fc09dd66f64a9..dfd2983532729879f34c9d5c19d25df6a6f6e5b6 100644
--- a/test/Other/new-pm-lto-defaults.ll
+++ b/test/Other/new-pm-lto-defaults.ll
@@ -47,7 +47,6 @@
 ; CHECK-O2-NEXT: Running pass: DeadArgumentEliminationPass
 ; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}InstCombinePass>
 ; CHECK-O2-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}InlinerPass>
-; CHECK-O2-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
 ; CHECK-O2-NEXT: Running pass: GlobalOptPass
 ; CHECK-O2-NEXT: Running pass: GlobalDCEPass
 ; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
@@ -60,6 +59,7 @@
 ; CHECK-O2-NEXT: Running pass: ModuleToPostOrderCGSCCPassAdaptor<{{.*}}PostOrderFunctionAttrsPass>
 ; CHECK-O2-NEXT: Running pass: ModuleToFunctionPassAdaptor<{{.*}}PassManager{{.*}}>
 ; CHECK-O2-NEXT: Running analysis: MemoryDependenceAnalysis
+; CHECK-O2-NEXT: Running analysis: OptimizationRemarkEmitterAnalysis
 ; CHECK-O2-NEXT: Running analysis: TargetIRAnalysis
 ; CHECK-O2-NEXT: Running analysis: DemandedBitsAnalysis
 ; CHECK-O2-NEXT: Running pass: CrossDSOCFIPass
diff --git a/test/Other/optimization-remarks-invalidation.ll b/test/Other/optimization-remarks-invalidation.ll
index 83a9ee042cf69985a89ffa7509d5eb10fe59ae17..4a9fbac15c8a9689453bba3e0f47a68764168bd2 100644
--- a/test/Other/optimization-remarks-invalidation.ll
+++ b/test/Other/optimization-remarks-invalidation.ll
@@ -53,7 +53,7 @@ Loop:
   %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]
   %addr = getelementptr i32, i32* %array, i32 %j
   %a = load i32, i32* %addr
-; CHECK: remark: /tmp/kk.c:2:20: hosting load
+; CHECK: remark: /tmp/kk.c:2:20: hoisting load
   %b = load i32, i32* %p, !dbg !8
   %a2 = add i32 %a, %b
   store i32 %a2, i32* %addr
diff --git a/test/Other/writing-to-stdout.ll b/test/Other/writing-to-stdout.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e3dee782ce6991a061fcc7121f6af0da10162b19
--- /dev/null
+++ b/test/Other/writing-to-stdout.ll
@@ -0,0 +1,16 @@
+; REQUIRES: default_triple
+
+; Often LLVM tools use "-" to indicate that output should be written to stdout
+; instead of a file. This behaviour is implemented by the raw_fd_ostream class.
+; This test verifies that when doing so multiple times we don't try to access a
+; closed STDOUT_FILENO. The exact options used in this test are unimportant, as
+; long as they write to stdout using raw_fd_ostream.
+; RUN: llc %s -o=- -pass-remarks-output=- -filetype=asm | FileCheck %s
+; foobar should appear as a function somewhere in the assembly file.
+; CHECK: foobar
+; !Analysis appears at the start of pass-remarks-output.
+; CHECK: !Analysis
+
+define void @foobar() {
+  ret void
+}
diff --git a/test/TableGen/GlobalISelEmitter.td b/test/TableGen/GlobalISelEmitter.td
new file mode 100644
index 0000000000000000000000000000000000000000..25be435df2de4ac913d557c857789ca4fd02c8d7
--- /dev/null
+++ b/test/TableGen/GlobalISelEmitter.td
@@ -0,0 +1,407 @@
+// RUN: llvm-tblgen -gen-global-isel -I %p/../../include %s | FileCheck %s
+
+include "llvm/Target/Target.td"
+
+//===- Define the necessary boilerplate for our test target. --------------===//
+
+def MyTargetISA : InstrInfo;
+def MyTarget : Target { let InstructionSet = MyTargetISA; }
+
+def R0 : Register<"r0"> { let Namespace = "MyTarget"; }
+def GPR32 : RegisterClass<"MyTarget", [i32], 32, (add R0)>;
+
+class I<dag OOps, dag IOps, list<dag> Pat>
+  : Instruction {
+  let Namespace = "MyTarget";
+  let OutOperandList = OOps;
+  let InOperandList = IOps;
+  let Pattern = Pat;
+}
+
+def complex : Operand<i32>, ComplexPattern<i32, 2, "SelectComplexPattern", []> {
+  let MIOperandInfo = (ops i32imm, i32imm);
+}
+def gi_complex :
+    GIComplexOperandMatcher<s32, (ops i32imm, i32imm), "selectComplexPattern">,
+    GIComplexPatternEquiv<complex>;
+
+def m1 : OperandWithDefaultOps <i32, (ops (i32 -1))>;
+def Z : OperandWithDefaultOps <i32, (ops R0)>;
+def m1Z : OperandWithDefaultOps <i32, (ops (i32 -1), R0)>;
+
+//===- Test the function definition boilerplate. --------------------------===//
+
+// CHECK: bool MyTargetInstructionSelector::selectImpl(MachineInstr &I) const {
+// CHECK: MachineFunction &MF = *I.getParent()->getParent();
+// CHECK: const MachineRegisterInfo &MRI = MF.getRegInfo();
+
+//===- Test a pattern with multiple ComplexPattern operands. --------------===//
+//
+
+// CHECK-LABEL: if ([&]() {
+// CHECK-NEXT:    MachineInstr &MI0 = I;
+// CHECK-NEXT:    if (MI0.getNumOperands() < 4)
+// CHECK-NEXT:      return false;
+// CHECK-NEXT:    if ((MI0.getOpcode() == TargetOpcode::G_SELECT) &&
+// CHECK-NEXT:        ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:         ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:         ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* src2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:        (selectComplexPattern(MI0.getOperand(2), TempOp0, TempOp1)))) &&
+// CHECK-NEXT:        ((/* src3 */ (MRI.getType(MI0.getOperand(3).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:        (selectComplexPattern(MI0.getOperand(3), TempOp2, TempOp3))))) {
+// CHECK-NEXT:          // (select:i32 GPR32:i32:$src1, complex:i32:$src2, complex:i32:$src3) => (INSN2:i32 GPR32:i32:$src1, complex:i32:$src3, complex:i32:$src2)
+// CHECK-NEXT:          MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::INSN2));
+// CHECK-NEXT:          MIB.add(MI0.getOperand(0)/*dst*/);
+// CHECK-NEXT:          MIB.add(MI0.getOperand(1)/*src1*/);
+// CHECK-NEXT:          MIB.add(TempOp2);
+// CHECK-NEXT:          MIB.add(TempOp3);
+// CHECK-NEXT:          MIB.add(TempOp0);
+// CHECK-NEXT:          MIB.add(TempOp1);
+// CHECK-NEXT:          for (const auto *FromMI : {&MI0, })
+// CHECK-NEXT:            for (const auto &MMO : FromMI->memoperands())
+// CHECK-NEXT:              MIB.addMemOperand(MMO);
+// CHECK-NEXT:          I.eraseFromParent();
+// CHECK-NEXT:          MachineInstr &NewI = *MIB;
+// CHECK-NEXT:          constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
+// CHECK-NEXT:          return true;
+// CHECK-NEXT:        }
+
+def : GINodeEquiv<G_SELECT, select>;
+def INSN2 : I<(outs GPR32:$dst), (ins GPR32:$src1, complex:$src2, complex:$src3), []>;
+def : Pat<(select GPR32:$src1, complex:$src2, complex:$src3),
+          (INSN2 GPR32:$src1, complex:$src3, complex:$src2)>;
+
+//===- Test a simple pattern with regclass operands. ----------------------===//
+
+// CHECK-LABEL: if ([&]() {
+// CHECK-NEXT:    MachineInstr &MI0 = I;
+// CHECK-NEXT:    if (MI0.getNumOperands() < 3)
+// CHECK-NEXT:      return false;
+// CHECK-NEXT:    if ((MI0.getOpcode() == TargetOpcode::G_ADD) &&
+// CHECK-NEXT:        ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:        ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:         ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* src2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:         ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(2).getReg(), MRI, TRI)))))) {
+
+// CHECK-NEXT:      // (add:i32 GPR32:i32:$src1, GPR32:i32:$src2) => (ADD:i32 GPR32:i32:$src1, GPR32:i32:$src2)
+// CHECK-NEXT:      I.setDesc(TII.get(MyTarget::ADD));
+// CHECK-NEXT:      MachineInstr &NewI = I;
+// CHECK-NEXT:      constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
+// CHECK-NEXT:      return true;
+// CHECK-NEXT:    }
+// CHECK-NEXT:    return false;
+// CHECK-NEXT:  }()) { return true; }
+
+
+def ADD : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2),
+            [(set GPR32:$dst, (add GPR32:$src1, GPR32:$src2))]>;
+
+//===- Test a nested instruction match. -----------------------------------===//
+
+// CHECK-LABEL: if ([&]() {
+// CHECK-NEXT:    MachineInstr &MI0 = I;
+// CHECK-NEXT:    if (MI0.getNumOperands() < 3)
+// CHECK-NEXT:      return false;
+// CHECK-NEXT:    if (!MI0.getOperand(1).isReg())
+// CHECK-NEXT:      return false;
+// CHECK-NEXT:    MachineInstr &MI1 = *MRI.getVRegDef(MI0.getOperand(1).getReg());
+// CHECK-NEXT:    if (MI1.getNumOperands() < 3)
+// CHECK-NEXT:      return false;
+// CHECK-NEXT:    if ((MI0.getOpcode() == TargetOpcode::G_MUL) &&
+// CHECK-NEXT:        ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:        ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* Operand 1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:        (((MI1.getOpcode() == TargetOpcode::G_ADD) &&
+// CHECK-NEXT:        ((/* Operand 0 */ (MRI.getType(MI1.getOperand(0).getReg()) == (LLT::scalar(32))))) &&
+// CHECK-NEXT:        ((/* src1 */ (MRI.getType(MI1.getOperand(1).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:        ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI1.getOperand(1).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* src2 */ (MRI.getType(MI1.getOperand(2).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:        ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI1.getOperand(2).getReg(), MRI, TRI))))))
+// CHECK-NEXT:        ))) &&
+// CHECK-NEXT:        ((/* src3 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:        ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(2).getReg(), MRI, TRI)))))) {
+// CHECK-NEXT:        if (!isObviouslySafeToFold(MI1)) return false;
+// CHECK-NEXT:        // (mul:i32 (add:i32 GPR32:i32:$src1, GPR32:i32:$src2), GPR32:i32:$src3)  =>  (MULADD:i32 GPR32:i32:$src1, GPR32:i32:$src2, GPR32:i32:$src3)
+// CHECK-NEXT:     MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::MULADD));
+// CHECK-NEXT:     MIB.add(MI0.getOperand(0)/*dst*/);
+// CHECK-NEXT:     MIB.add(MI1.getOperand(1)/*src1*/);
+// CHECK-NEXT:     MIB.add(MI1.getOperand(2)/*src2*/);
+// CHECK-NEXT:     MIB.add(MI0.getOperand(2)/*src3*/);
+// CHECK-NEXT:     for (const auto *FromMI : {&MI0, &MI1, })
+// CHECK-NEXT:       for (const auto &MMO : FromMI->memoperands())
+// CHECK-NEXT:         MIB.addMemOperand(MMO);
+// CHECK-NEXT:     I.eraseFromParent();
+// CHECK-NEXT:     MachineInstr &NewI = *MIB;
+// CHECK-NEXT:     constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
+// CHECK-NEXT:     return true;
+// CHECK-NEXT:   }
+
+// We also get a second rule by commutativity.
+// CHECK-LABEL: if ([&]() {
+// CHECK-NEXT:    MachineInstr &MI0 = I;
+// CHECK-NEXT:    if (MI0.getNumOperands() < 3)
+// CHECK-NEXT:      return false;
+// CHECK-NEXT:    if (!MI0.getOperand(2).isReg())
+// CHECK-NEXT:      return false;
+// CHECK-NEXT:    MachineInstr &MI1 = *MRI.getVRegDef(MI0.getOperand(2).getReg());
+// CHECK-NEXT:    if (MI1.getNumOperands() < 3)
+// CHECK-NEXT:      return false;
+// CHECK-NEXT:    if ((MI0.getOpcode() == TargetOpcode::G_MUL) &&
+// CHECK-NEXT:        ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:        ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* src3 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:        ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* Operand 2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:        (((MI1.getOpcode() == TargetOpcode::G_ADD) &&
+// CHECK-NEXT:        ((/* Operand 0 */ (MRI.getType(MI1.getOperand(0).getReg()) == (LLT::scalar(32))))) &&
+// CHECK-NEXT:        ((/* src1 */ (MRI.getType(MI1.getOperand(1).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:        ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI1.getOperand(1).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* src2 */ (MRI.getType(MI1.getOperand(2).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:        ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI1.getOperand(2).getReg(), MRI, TRI))))))
+// CHECK-NEXT:        )))) {
+// CHECK-NEXT:        if (!isObviouslySafeToFold(MI1)) return false;
+// CHECK-NEXT:        // (mul:i32 GPR32:i32:$src3, (add:i32 GPR32:i32:$src1, GPR32:i32:$src2))  =>  (MULADD:i32 GPR32:i32:$src1, GPR32:i32:$src2, GPR32:i32:$src3)
+// CHECK-NEXT:     MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::MULADD));
+// CHECK-NEXT:     MIB.add(MI0.getOperand(0)/*dst*/);
+// CHECK-NEXT:     MIB.add(MI1.getOperand(1)/*src1*/);
+// CHECK-NEXT:     MIB.add(MI1.getOperand(2)/*src2*/);
+// CHECK-NEXT:     MIB.add(MI0.getOperand(1)/*src3*/);
+// CHECK-NEXT:     for (const auto *FromMI : {&MI0, &MI1, })
+// CHECK-NEXT:       for (const auto &MMO : FromMI->memoperands())
+// CHECK-NEXT:         MIB.addMemOperand(MMO);
+// CHECK-NEXT:     I.eraseFromParent();
+// CHECK-NEXT:     MachineInstr &NewI = *MIB;
+// CHECK-NEXT:     constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
+// CHECK-NEXT:     return true;
+// CHECK-NEXT:   }
+
+def MULADD : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2, GPR32:$src3),
+               [(set GPR32:$dst,
+                     (mul (add GPR32:$src1, GPR32:$src2), GPR32:$src3))]>;
+
+//===- Test another simple pattern with regclass operands. ----------------===//
+
+// CHECK-LABEL: if ([&]() {
+// CHECK-NEXT:    MachineInstr &MI0 = I;
+// CHECK-NEXT:    if (MI0.getNumOperands() < 3)
+// CHECK-NEXT:      return false;
+// CHECK-NEXT:    if ((MI0.getOpcode() == TargetOpcode::G_MUL) &&
+// CHECK-NEXT:        ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:         ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:         ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* src2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:         ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(2).getReg(), MRI, TRI)))))) {
+// CHECK-NEXT:      // (mul:i32 GPR32:i32:$src1, GPR32:i32:$src2) => (MUL:i32 GPR32:i32:$src2, GPR32:i32:$src1)
+// CHECK-NEXT:      MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::MUL));
+// CHECK-NEXT:      MIB.add(MI0.getOperand(0)/*dst*/);
+// CHECK-NEXT:      MIB.add(MI0.getOperand(2)/*src2*/);
+// CHECK-NEXT:      MIB.add(MI0.getOperand(1)/*src1*/);
+// CHECK-NEXT:     for (const auto *FromMI : {&MI0, })
+// CHECK-NEXT:       for (const auto &MMO : FromMI->memoperands())
+// CHECK-NEXT:         MIB.addMemOperand(MMO);
+// CHECK-NEXT:      I.eraseFromParent();
+// CHECK-NEXT:      MachineInstr &NewI = *MIB;
+// CHECK-NEXT:      constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
+// CHECK-NEXT:      return true;
+// CHECK-NEXT:    }
+// CHECK-NEXT:    return false;
+// CHECK-NEXT:  }()) { return true; }
+
+def MUL : I<(outs GPR32:$dst), (ins GPR32:$src2, GPR32:$src1),
+             [(set GPR32:$dst, (mul GPR32:$src1, GPR32:$src2))]>;
+
+//===- Test a pattern with ComplexPattern operands. -----------------------===//
+//
+
+// CHECK-LABEL: if ([&]() {
+// CHECK-NEXT:    MachineInstr &MI0 = I;
+// CHECK-NEXT:    if (MI0.getNumOperands() < 3)
+// CHECK-NEXT:      return false;
+// CHECK-NEXT:    if ((MI0.getOpcode() == TargetOpcode::G_SUB) &&
+// CHECK-NEXT:        ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:         ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:         ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* src2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:        (selectComplexPattern(MI0.getOperand(2), TempOp0, TempOp1))))) {
+// CHECK-NEXT:          // (sub:i32 GPR32:i32:$src1, complex:i32:$src2) => (INSN1:i32 GPR32:i32:$src1, complex:i32:$src2)
+// CHECK-NEXT:          MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::INSN1));
+// CHECK-NEXT:          MIB.add(MI0.getOperand(0)/*dst*/);
+// CHECK-NEXT:          MIB.add(MI0.getOperand(1)/*src1*/);
+// CHECK-NEXT:          MIB.add(TempOp0);
+// CHECK-NEXT:          MIB.add(TempOp1);
+// CHECK-NEXT:          for (const auto *FromMI : {&MI0, })
+// CHECK-NEXT:            for (const auto &MMO : FromMI->memoperands())
+// CHECK-NEXT:              MIB.addMemOperand(MMO);
+// CHECK-NEXT:          I.eraseFromParent();
+// CHECK-NEXT:          MachineInstr &NewI = *MIB;
+// CHECK-NEXT:          constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
+// CHECK-NEXT:          return true;
+// CHECK-NEXT:        }
+
+def INSN1 : I<(outs GPR32:$dst), (ins GPR32:$src1, complex:$src2), []>;
+def : Pat<(sub GPR32:$src1, complex:$src2), (INSN1 GPR32:$src1, complex:$src2)>;
+
+//===- Test a simple pattern with a default operand. ----------------------===//
+//
+
+// CHECK-LABEL: if ([&]() {
+// CHECK-NEXT:    MachineInstr &MI0 = I;
+// CHECK-NEXT:    if (MI0.getNumOperands() < 3)
+// CHECK-NEXT:      return false;
+// CHECK-NEXT:    if ((MI0.getOpcode() == TargetOpcode::G_XOR) &&
+// CHECK-NEXT:        ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:         ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:         ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* Operand 2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:        (isOperandImmEqual(MI0.getOperand(2), -2, MRI))))) {
+// CHECK-NEXT:      // (xor:i32 GPR32:i32:$src1, -2:i32) => (XORI:i32 GPR32:i32:$src1)
+// CHECK-NEXT:      MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::XORI));
+// CHECK-NEXT:      MIB.add(MI0.getOperand(0)/*dst*/);
+// CHECK-NEXT:      MIB.addImm(-1);
+// CHECK-NEXT:      MIB.add(MI0.getOperand(1)/*src1*/);
+// CHECK-NEXT:      for (const auto *FromMI : {&MI0, })
+// CHECK-NEXT:        for (const auto &MMO : FromMI->memoperands())
+// CHECK-NEXT:          MIB.addMemOperand(MMO);
+// CHECK-NEXT:      I.eraseFromParent();
+// CHECK-NEXT:      MachineInstr &NewI = *MIB;
+// CHECK-NEXT:      constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
+// CHECK-NEXT:      return true;
+// CHECK-NEXT:    }
+// CHECK-NEXT:    return false;
+// CHECK-NEXT:  }()) { return true; }
+
+// The -2 is just to distinguish it from the 'not' case below.
+def XORI : I<(outs GPR32:$dst), (ins m1:$src2, GPR32:$src1),
+             [(set GPR32:$dst, (xor GPR32:$src1, -2))]>;
+
+//===- Test a simple pattern with a default register operand. -------------===//
+//
+
+// CHECK-LABEL: if ([&]() {
+// CHECK-NEXT:    MachineInstr &MI0 = I;
+// CHECK-NEXT:    if (MI0.getNumOperands() < 3)
+// CHECK-NEXT:      return false;
+// CHECK-NEXT:    if ((MI0.getOpcode() == TargetOpcode::G_XOR) &&
+// CHECK-NEXT:        ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:         ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:         ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* Operand 2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:        (isOperandImmEqual(MI0.getOperand(2), -3, MRI))))) {
+// CHECK-NEXT:      // (xor:i32 GPR32:i32:$src1, -3:i32) => (XOR:i32 GPR32:i32:$src1)
+// CHECK-NEXT:      MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::XOR));
+// CHECK-NEXT:      MIB.add(MI0.getOperand(0)/*dst*/);
+// CHECK-NEXT:      MIB.addReg(MyTarget::R0);
+// CHECK-NEXT:      MIB.add(MI0.getOperand(1)/*src1*/);
+// CHECK-NEXT:      for (const auto *FromMI : {&MI0, })
+// CHECK-NEXT:        for (const auto &MMO : FromMI->memoperands())
+// CHECK-NEXT:          MIB.addMemOperand(MMO);
+// CHECK-NEXT:      I.eraseFromParent();
+// CHECK-NEXT:      MachineInstr &NewI = *MIB;
+// CHECK-NEXT:      constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
+// CHECK-NEXT:      return true;
+// CHECK-NEXT:    }
+// CHECK-NEXT:    return false;
+// CHECK-NEXT:  }()) { return true; }
+
+// The -3 is just to distinguish it from the 'not' case below and the other default op case above.
+def XOR : I<(outs GPR32:$dst), (ins Z:$src2, GPR32:$src1),
+            [(set GPR32:$dst, (xor GPR32:$src1, -3))]>;
+
+//===- Test a simple pattern with multiple default operands. --------------===//
+//
+
+// CHECK-LABEL: if ([&]() {
+// CHECK-NEXT:    MachineInstr &MI0 = I;
+// CHECK-NEXT:    if (MI0.getNumOperands() < 3)
+// CHECK-NEXT:      return false;
+// CHECK-NEXT:    if ((MI0.getOpcode() == TargetOpcode::G_XOR) &&
+// CHECK-NEXT:        ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:         ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* src1 */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:         ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* Operand 2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:        (isOperandImmEqual(MI0.getOperand(2), -4, MRI))))) {
+// CHECK-NEXT:      // (xor:i32 GPR32:i32:$src1, -4:i32) => (XORlike:i32 GPR32:i32:$src1)
+// CHECK-NEXT:      MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::XORlike));
+// CHECK-NEXT:      MIB.add(MI0.getOperand(0)/*dst*/);
+// CHECK-NEXT:      MIB.addImm(-1);
+// CHECK-NEXT:      MIB.addReg(MyTarget::R0);
+// CHECK-NEXT:      MIB.add(MI0.getOperand(1)/*src1*/);
+// CHECK-NEXT:      for (const auto *FromMI : {&MI0, })
+// CHECK-NEXT:        for (const auto &MMO : FromMI->memoperands())
+// CHECK-NEXT:          MIB.addMemOperand(MMO);
+// CHECK-NEXT:      I.eraseFromParent();
+// CHECK-NEXT:      MachineInstr &NewI = *MIB;
+// CHECK-NEXT:      constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
+// CHECK-NEXT:      return true;
+// CHECK-NEXT:    }
+// CHECK-NEXT:    return false;
+// CHECK-NEXT:  }()) { return true; }
+
+// The -4 is just to distinguish it from the other 'not' cases.
+def XORlike : I<(outs GPR32:$dst), (ins m1Z:$src2, GPR32:$src1),
+                [(set GPR32:$dst, (xor GPR32:$src1, -4))]>;
+
+//===- Test a simple pattern with constant immediate operands. ------------===//
+//
+// This must precede the 3-register variants because constant immediates have
+// priority over register banks.
+
+// CHECK-LABEL: if ([&]() {
+// CHECK-NEXT:    MachineInstr &MI0 = I;
+// CHECK-NEXT:    if (MI0.getNumOperands() < 3)
+// CHECK-NEXT:      return false;
+// CHECK-NEXT:    if ((MI0.getOpcode() == TargetOpcode::G_XOR) &&
+// CHECK-NEXT:        ((/* dst */ (MRI.getType(MI0.getOperand(0).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:         ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(0).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* Wm */ (MRI.getType(MI0.getOperand(1).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:         ((&RBI.getRegBankFromRegClass(MyTarget::GPR32RegClass) == RBI.getRegBank(MI0.getOperand(1).getReg(), MRI, TRI))))) &&
+// CHECK-NEXT:        ((/* Operand 2 */ (MRI.getType(MI0.getOperand(2).getReg()) == (LLT::scalar(32))) &&
+// CHECK-NEXT:        (isOperandImmEqual(MI0.getOperand(2), -1, MRI))))) {
+// CHECK-NEXT:      // (xor:i32 GPR32:i32:$Wm, -1:i32) => (ORN:i32 R0:i32, GPR32:i32:$Wm)
+// CHECK-NEXT:      MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(MyTarget::ORN));
+// CHECK-NEXT:      MIB.add(MI0.getOperand(0)/*dst*/);
+// CHECK-NEXT:      MIB.addReg(MyTarget::R0);
+// CHECK-NEXT:      MIB.add(MI0.getOperand(1)/*Wm*/);
+// CHECK-NEXT:      for (const auto *FromMI : {&MI0, })
+// CHECK-NEXT:        for (const auto &MMO : FromMI->memoperands())
+// CHECK-NEXT:          MIB.addMemOperand(MMO);
+// CHECK-NEXT:      I.eraseFromParent();
+// CHECK-NEXT:      MachineInstr &NewI = *MIB;
+// CHECK-NEXT:      constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
+// CHECK-NEXT:      return true;
+// CHECK-NEXT:    }
+// CHECK-NEXT:    return false;
+// CHECK-NEXT:  }()) { return true; }
+
+def ORN : I<(outs GPR32:$dst), (ins GPR32:$src1, GPR32:$src2), []>;
+def : Pat<(not GPR32:$Wm), (ORN R0, GPR32:$Wm)>;
+
+//===- Test a pattern with an MBB operand. --------------------------------===//
+
+// CHECK-LABEL: if ([&]() {
+// CHECK-NEXT:      MachineInstr &MI0 = I;
+// CHECK-NEXT:      if (MI0.getNumOperands() < 1)
+// CHECK-NEXT:        return false;
+// CHECK-NEXT:      if ((MI0.getOpcode() == TargetOpcode::G_BR) &&
+// CHECK-NEXT:          ((/* target */ (MI0.getOperand(0).isMBB())))) {
+                    
+// CHECK-NEXT:       // (br (bb:Other):$target) => (BR (bb:Other):$target)
+// CHECK-NEXT:       I.setDesc(TII.get(MyTarget::BR));
+// CHECK-NEXT:       MachineInstr &NewI = I;
+// CHECK-NEXT:       constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);
+// CHECK-NEXT:       return true;
+// CHECK-NEXT:    }
+// CHECK-NEXT:    return false;
+// CHECK-NEXT:  }()) { return true; }
+
+def BR : I<(outs), (ins unknown:$target),
+            [(br bb:$target)]>;
diff --git a/test/ThinLTO/X86/Inputs/cache-import-lists1.ll b/test/ThinLTO/X86/Inputs/cache-import-lists1.ll
new file mode 100644
index 0000000000000000000000000000000000000000..58bfb39f9ee18c87ea764cf8eff803c59d7d78a6
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/cache-import-lists1.ll
@@ -0,0 +1,11 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @f1() {
+  call void @linkonce_odr()
+  ret void
+}
+
+define linkonce_odr void @linkonce_odr() {
+  ret void
+}
diff --git a/test/ThinLTO/X86/Inputs/cache-import-lists2.ll b/test/ThinLTO/X86/Inputs/cache-import-lists2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..899bbaea13d6f5fbd753ab5e90d3ce12f6669b01
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/cache-import-lists2.ll
@@ -0,0 +1,11 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @f2() {
+  call void @linkonce_odr()
+  ret void
+}
+
+define linkonce_odr void @linkonce_odr() {
+  ret void
+}
diff --git a/test/ThinLTO/X86/Inputs/cache-typeid-resolutions-import.ll b/test/ThinLTO/X86/Inputs/cache-typeid-resolutions-import.ll
new file mode 100644
index 0000000000000000000000000000000000000000..95ecd1824351eace2890e9ba52b034383fba3519
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/cache-typeid-resolutions-import.ll
@@ -0,0 +1,15 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i1 @importf1(i8* %p) {
+  %x = call i1 @f1(i8* %p)
+  ret i1 %x
+}
+
+define i1 @importf2(i8* %p) {
+  %x = call i1 @f2(i8* %p)
+  ret i1 %x
+}
+
+declare i1 @f1(i8* %p)
+declare i1 @f2(i8* %p)
diff --git a/test/ThinLTO/X86/Inputs/cache-typeid-resolutions1.ll b/test/ThinLTO/X86/Inputs/cache-typeid-resolutions1.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e53673bcd05e169589cb113a08e71ae7a8dda70e
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/cache-typeid-resolutions1.ll
@@ -0,0 +1,6 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@vt1 = constant i32 0, !type !0
+
+!0 = !{i32 0, !"typeid1"}
diff --git a/test/ThinLTO/X86/Inputs/cache-typeid-resolutions2.ll b/test/ThinLTO/X86/Inputs/cache-typeid-resolutions2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..283badad3bbf08ee4b8cb12b71b71a997d80006a
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/cache-typeid-resolutions2.ll
@@ -0,0 +1,10 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@vt2 = constant i1 (i8*)* @vf2, !type !0
+
+define internal i1 @vf2(i8* %this) {
+  ret i1 0
+}
+
+!0 = !{i32 0, !"typeid2"}
diff --git a/test/ThinLTO/X86/Inputs/cache-typeid-resolutions3.ll b/test/ThinLTO/X86/Inputs/cache-typeid-resolutions3.ll
new file mode 100644
index 0000000000000000000000000000000000000000..830622e9cd762613d49d60e40616032589a9b7d5
--- /dev/null
+++ b/test/ThinLTO/X86/Inputs/cache-typeid-resolutions3.ll
@@ -0,0 +1,15 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@vt2a = constant i1 (i8*)* @vf2a, !type !0
+@vt2b = constant i1 (i8*)* @vf2b, !type !0
+
+define internal i1 @vf2a(i8* %this) {
+  ret i1 0
+}
+
+define internal i1 @vf2b(i8* %this) {
+  ret i1 1
+}
+
+!0 = !{i32 0, !"typeid2"}
diff --git a/test/ThinLTO/X86/cache-config.ll b/test/ThinLTO/X86/cache-config.ll
index a947969f669073de7d6ceb87c9485a8ca1081bd7..01e44b8b16a3ba4c96bbfd1411a944f3e98b3ed9 100644
--- a/test/ThinLTO/X86/cache-config.ll
+++ b/test/ThinLTO/X86/cache-config.ll
@@ -1,21 +1,21 @@
-; RUN: rm -rf %t.cache && mkdir %t.cache
+; RUN: rm -rf %t.cache
 ; RUN: opt -module-hash -module-summary %s -o %t.bc
 
-; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx
-; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -mcpu=yonah
-; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -relax-elf-relocations
-; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -function-sections
-; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -data-sections
-; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -debugger-tune=sce
-; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -mattr=+sse2
-; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -relocation-model=static
-; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -code-model=large
-; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -cg-opt-level=0
-; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -O1
-; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -opt-pipeline=loweratomic
-; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -aa-pipeline=basic-aa
-; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -override-triple=x86_64-unknown-linux-gnu
-; RUN: llvm-lto2 -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -default-triple=x86_64-unknown-linux-gnu
+; RUN: llvm-lto2 run -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx
+; RUN: llvm-lto2 run -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -mcpu=yonah
+; RUN: llvm-lto2 run -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -relax-elf-relocations
+; RUN: llvm-lto2 run -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -function-sections
+; RUN: llvm-lto2 run -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -data-sections
+; RUN: llvm-lto2 run -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -debugger-tune=sce
+; RUN: llvm-lto2 run -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -mattr=+sse2
+; RUN: llvm-lto2 run -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -relocation-model=static
+; RUN: llvm-lto2 run -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -code-model=large
+; RUN: llvm-lto2 run -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -cg-opt-level=0
+; RUN: llvm-lto2 run -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -O1
+; RUN: llvm-lto2 run -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -opt-pipeline=loweratomic
+; RUN: llvm-lto2 run -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -aa-pipeline=basic-aa
+; RUN: llvm-lto2 run -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -override-triple=x86_64-unknown-linux-gnu
+; RUN: llvm-lto2 run -o %t.o %t.bc -cache-dir %t.cache -r=%t.bc,globalfunc,plx -default-triple=x86_64-unknown-linux-gnu
 ; RUN: ls %t.cache | count 15
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/test/ThinLTO/X86/cache-import-lists.ll b/test/ThinLTO/X86/cache-import-lists.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ba7b437e8cad89d9522a6e06187be1c036c068b3
--- /dev/null
+++ b/test/ThinLTO/X86/cache-import-lists.ll
@@ -0,0 +1,24 @@
+; RUN: opt -module-hash -module-summary %s -o %t.bc
+; RUN: opt -module-hash -module-summary %S/Inputs/cache-import-lists1.ll -o %t1.bc
+; RUN: opt -module-hash -module-summary %S/Inputs/cache-import-lists2.ll -o %t2.bc
+
+; Tests that the hash for t is sensitive to the set of imported functions
+; for each module, which in this case depends on the link order (the function
+; linkonce_odr will be imported from either t1 or t2, whichever comes first).
+
+; RUN: rm -rf %t.cache
+; RUN: llvm-lto2 run -cache-dir %t.cache -o %t.o %t.bc %t1.bc %t2.bc -r=%t.bc,main,plx -r=%t.bc,f1,lx -r=%t.bc,f2,lx -r=%t1.bc,f1,plx -r=%t1.bc,linkonce_odr,plx -r=%t2.bc,f2,plx -r=%t2.bc,linkonce_odr,lx
+; RUN: llvm-lto2 run -cache-dir %t.cache -o %t.o %t.bc %t2.bc %t1.bc -r=%t.bc,main,plx -r=%t.bc,f1,lx -r=%t.bc,f2,lx -r=%t2.bc,f2,plx -r=%t2.bc,linkonce_odr,plx -r=%t1.bc,f1,plx -r=%t1.bc,linkonce_odr,lx
+; RUN: ls %t.cache | count 6
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @main() {
+  call void @f1()
+  call void @f2()
+  ret void
+}
+
+declare void @f1()
+declare void @f2()
diff --git a/test/ThinLTO/X86/cache-typeid-resolutions.ll b/test/ThinLTO/X86/cache-typeid-resolutions.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1609e90b0ae65b61c77a1fdd3fbf3bd7906b4699
--- /dev/null
+++ b/test/ThinLTO/X86/cache-typeid-resolutions.ll
@@ -0,0 +1,47 @@
+; RUN: opt -module-hash -module-summary %s -o %t.bc
+; RUN: opt -module-hash -module-summary %S/Inputs/cache-typeid-resolutions-import.ll -o %t-import.bc
+
+; RUN: llvm-as -o %t1.bc %S/Inputs/cache-typeid-resolutions1.ll
+; RUN: llvm-as -o %t2.bc %S/Inputs/cache-typeid-resolutions2.ll
+; RUN: llvm-as -o %t3.bc %S/Inputs/cache-typeid-resolutions3.ll
+
+; Two resolutions for typeid1: Unsat, Single
+; where both t and t-import are sensitive to typeid1's resolution
+; so 4 distinct objects in total.
+; RUN: rm -rf %t.cache
+; RUN: llvm-lto2 run -o %t.o %t.bc %t-import.bc -cache-dir %t.cache -r=%t.bc,f1,plx -r=%t.bc,f2,plx -r=%t-import.bc,importf1,plx -r=%t-import.bc,f1,lx -r=%t-import.bc,importf2,plx -r=%t-import.bc,f2,lx
+; RUN: llvm-lto2 run -o %t.o %t.bc %t-import.bc %t1.bc -cache-dir %t.cache -r=%t.bc,f1,plx -r=%t.bc,f2,plx -r=%t-import.bc,importf1,plx -r=%t-import.bc,f1,lx -r=%t-import.bc,importf2,plx -r=%t-import.bc,f2,lx -r=%t1.bc,vt1,plx
+; RUN: ls %t.cache | count 4
+
+; Three resolutions for typeid2: Indir, SingleImpl, UniqueRetVal
+; where both t and t-import are sensitive to typeid2's resolution
+; so 6 distinct objects in total.
+; RUN: rm -rf %t.cache
+; RUN: llvm-lto2 run -o %t.o %t.bc %t-import.bc -cache-dir %t.cache -r=%t.bc,f1,plx -r=%t.bc,f2,plx -r=%t-import.bc,importf1,plx -r=%t-import.bc,f1,lx -r=%t-import.bc,importf2,plx -r=%t-import.bc,f2,lx
+; RUN: llvm-lto2 run -o %t.o %t.bc %t-import.bc %t2.bc -cache-dir %t.cache -r=%t.bc,f1,plx -r=%t.bc,f2,plx -r=%t2.bc,vt2,plx -r=%t-import.bc,importf1,plx -r=%t-import.bc,f1,lx -r=%t-import.bc,importf2,plx -r=%t-import.bc,f2,lx
+; RUN: llvm-lto2 run -o %t.o %t.bc %t-import.bc %t3.bc -cache-dir %t.cache -r=%t.bc,f1,plx -r=%t.bc,f2,plx -r=%t3.bc,vt2a,plx -r=%t3.bc,vt2b,plx -r=%t-import.bc,importf1,plx -r=%t-import.bc,f1,lx -r=%t-import.bc,importf2,plx -r=%t-import.bc,f2,lx
+; RUN: ls %t.cache | count 6
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define i1 @f1(i8* %p) {
+  %x = call i1 @llvm.type.test(i8* %p, metadata !"typeid1")
+  ret i1 %x
+}
+
+define i1 @f2(i8* %obj) {
+  %vtableptr = bitcast i8* %obj to [3 x i8*]**
+  %vtable = load [3 x i8*]*, [3 x i8*]** %vtableptr
+  %vtablei8 = bitcast [3 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid2")
+  call void @llvm.assume(i1 %p)
+  %fptrptr = getelementptr [3 x i8*], [3 x i8*]* %vtable, i32 0, i32 0
+  %fptr = load i8*, i8** %fptrptr
+  %fptr_casted = bitcast i8* %fptr to i1 (i8*)*
+  %result = call i1 %fptr_casted(i8* %obj)
+  ret i1 %result
+}
+
+declare i1 @llvm.type.test(i8*, metadata)
+declare void @llvm.assume(i1)
diff --git a/test/ThinLTO/X86/cache.ll b/test/ThinLTO/X86/cache.ll
index b796b00fc5d511bcf4d48f373466a3ffe6037b4d..ea5c2f98d876c835746687a87e18f60d2866e1d3 100644
--- a/test/ThinLTO/X86/cache.ll
+++ b/test/ThinLTO/X86/cache.ll
@@ -10,8 +10,8 @@
 ; RUN: ls %t.cache | count 1
 
 ; Verify that enabling caching is ignoring module without hash with llvm-lto2
-; RUN: rm -Rf %t.cache && mkdir %t.cache
-; RUN: llvm-lto2 -o %t.o %t2.bc  %t.bc -cache-dir %t.cache \
+; RUN: rm -Rf %t.cache
+; RUN: llvm-lto2 run -o %t.o %t2.bc  %t.bc -cache-dir %t.cache \
 ; RUN:  -r=%t2.bc,_main,plx \
 ; RUN:  -r=%t2.bc,_globalfunc,lx \
 ; RUN:  -r=%t.bc,_globalfunc,plx
@@ -23,19 +23,25 @@
 ; RUN: opt -module-hash -module-summary %s -o %t.bc
 ; RUN: opt -module-hash -module-summary %p/Inputs/cache.ll -o %t2.bc
 
-; Verify that enabling caching is working
+; Verify that enabling caching is working, and that the pruner only removes
+; files matching the pattern "llvmcache-*".
 ; RUN: rm -Rf %t.cache && mkdir %t.cache
+; RUN: touch -t 197001011200 %t.cache/llvmcache-foo %t.cache/foo
 ; RUN: llvm-lto -thinlto-action=run -exported-symbol=globalfunc %t2.bc  %t.bc -thinlto-cache-dir %t.cache
+; RUN: ls %t.cache | count 4
 ; RUN: ls %t.cache/llvmcache.timestamp
-; RUN: ls %t.cache | count 3
+; RUN: ls %t.cache/foo
+; RUN: not ls %t.cache/llvmcache-foo
+; RUN: ls %t.cache/llvmcache-* | count 2
 
 ; Verify that enabling caching is working with llvm-lto2
-; RUN: rm -Rf %t.cache && mkdir %t.cache
-; RUN: llvm-lto2 -o %t.o %t2.bc  %t.bc -cache-dir %t.cache \
+; RUN: rm -Rf %t.cache
+; RUN: llvm-lto2 run -o %t.o %t2.bc  %t.bc -cache-dir %t.cache \
 ; RUN:  -r=%t2.bc,_main,plx \
 ; RUN:  -r=%t2.bc,_globalfunc,lx \
 ; RUN:  -r=%t.bc,_globalfunc,plx
 ; RUN: ls %t.cache | count 2
+; RUN: ls %t.cache/llvmcache-* | count 2
 
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.11.0"
diff --git a/test/ThinLTO/X86/crash_debuginfo.ll b/test/ThinLTO/X86/crash_debuginfo.ll
index 31b55fb4f9e48054ed33a40830fa8dd677e28f3e..8638c24d0820a0048a35d1066ad5b12c14287f29 100644
--- a/test/ThinLTO/X86/crash_debuginfo.ll
+++ b/test/ThinLTO/X86/crash_debuginfo.ll
@@ -41,6 +41,5 @@ declare void @bar(i32)
 !14 = !DILocalVariable(name: "caster", scope: !9, file: !1, line: 728, type: !15)
 !15 = distinct !DICompositeType(tag: DW_TAG_union_type, scope: !9, file: !1, line: 728, size: 64, align: 64, elements: !6, identifier: "someclass")
 !16 = distinct !DILocation(line: 87, column: 9, scope: !17)
-!17 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !10, line: 73, type: !11, isLocal: false, isDefinition: true, scopeLine: 74, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !18, variables: !6)
+!17 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !10, file: !1, line: 73, type: !11, isLocal: false, isDefinition: true, scopeLine: 74, flags: DIFlagPrototyped, isOptimized: true, unit: !0, declaration: !18, variables: !6)
 !18 = !DISubprogram(name: "foo", linkageName: "foo", scope: !10, file: !1, line: 83, type: !11, isLocal: false, isDefinition: false, scopeLine: 83, flags: DIFlagPrototyped, isOptimized: true)
-
diff --git a/test/ThinLTO/X86/deadstrip.ll b/test/ThinLTO/X86/deadstrip.ll
index 6f1cbfe596932602df034cf37556c3ed3c8e610e..0c85322eb565ff8af594de9b11d330dc0d1eecd4 100644
--- a/test/ThinLTO/X86/deadstrip.ll
+++ b/test/ThinLTO/X86/deadstrip.ll
@@ -8,7 +8,7 @@
 ; RUN: llvm-lto -exported-symbol=_main -thinlto-action=run %t1.bc %t2.bc
 ; RUN: llvm-nm %t1.bc.thinlto.o | FileCheck %s --check-prefix=CHECK-NM
 
-; RUN: llvm-lto2 %t1.bc %t2.bc -o %t.out -save-temps \
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.out -save-temps \
 ; RUN:   -r %t1.bc,_main,plx \
 ; RUN:   -r %t1.bc,_bar,pl \
 ; RUN:   -r %t1.bc,_dead_func,pl \
@@ -51,7 +51,7 @@
 ; In that case there are uses of @dead_func in the regular LTO partition
 ; and it shouldn't be internalized.
 ; RUN: opt %p/Inputs/deadstrip.ll -o %t3.bc
-; RUN: llvm-lto2 %t1.bc %t3.bc -o %t4.out -save-temps \
+; RUN: llvm-lto2 run %t1.bc %t3.bc -o %t4.out -save-temps \
 ; RUN:   -r %t1.bc,_main,plx \
 ; RUN:   -r %t1.bc,_bar,pl \
 ; RUN:   -r %t1.bc,_dead_func,pl \
diff --git a/test/ThinLTO/X86/debuginfo-compositetype-import.ll b/test/ThinLTO/X86/debuginfo-compositetype-import.ll
index 0b3a7a45224e9c59efc2befa8d11b1344f09236c..ae2f5f26d226f790816222fea0cf03b2807f70d3 100644
--- a/test/ThinLTO/X86/debuginfo-compositetype-import.ll
+++ b/test/ThinLTO/X86/debuginfo-compositetype-import.ll
@@ -7,7 +7,7 @@
 
 ; By default, composite types are imported as type declarations
 ; RUN: llvm-lto -thinlto-action=import %t2.bc -thinlto-index=%t.index.bc -o - | llvm-dis -o - | FileCheck %s
-; RUN: llvm-lto2 %t1.bc %t2.bc -o %t.out -save-temps \
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.out -save-temps \
 ; RUN:   -r %t2.bc,main,plx \
 ; RUN:   -r %t2.bc,foo,l \
 ; RUN:   -r %t1.bc,foo,pl
@@ -20,7 +20,7 @@
 
 ; Ensure that full type definitions of composite types are imported if requested
 ; RUN: llvm-lto -import-full-type-definitions -thinlto-action=import %t2.bc -thinlto-index=%t.index.bc -o - | llvm-dis -o - | FileCheck %s --check-prefix=FULL
-; RUN: llvm-lto2 %t1.bc %t2.bc -o %t.out -save-temps \
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.out -save-temps \
 ; RUN:   -import-full-type-definitions \
 ; RUN:   -r %t2.bc,main,plx \
 ; RUN:   -r %t2.bc,foo,l \
diff --git a/test/ThinLTO/X86/diagnostic-handler-remarks.ll b/test/ThinLTO/X86/diagnostic-handler-remarks.ll
index 7467a082c5a5631a807a13ce9fb6012c5a052dd9..3880b6f1138037df4bbeebbc3a6ad07b9e5dae9b 100644
--- a/test/ThinLTO/X86/diagnostic-handler-remarks.ll
+++ b/test/ThinLTO/X86/diagnostic-handler-remarks.ll
@@ -2,6 +2,7 @@
 ; RUN: opt -module-summary %p/Inputs/diagnostic-handler-remarks.ll -o %t2.bc
 
 ; Optimization records are collected regardless of the diagnostic handler
+; RUN: rm -f %t.yaml.thin.0.yaml %t.yaml.thin.1.yaml
 ; RUN: llvm-lto -thinlto-action=run \
 ; RUN:          -lto-pass-remarks-output=%t.yaml \
 ; RUN:          -exported-symbol _func2 \
diff --git a/test/ThinLTO/X86/dicompositetype-unique.ll b/test/ThinLTO/X86/dicompositetype-unique.ll
index 3550e6c6a74a3d22d88c7e88b3ab768f0dcd3dd2..7a35f877e63de8ce0d9a05db66413f34bf9aa7ac 100644
--- a/test/ThinLTO/X86/dicompositetype-unique.ll
+++ b/test/ThinLTO/X86/dicompositetype-unique.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -module-summary -o %t1.bc %s
 ; RUN: opt -module-summary -o %t2.bc %S/Inputs/dicompositetype-unique.ll
 
-; RUN: llvm-lto2 %t1.bc %t2.bc -o %t --save-temps \
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t --save-temps \
 ; RUN:    -r %t1.bc,_foo,lx \
 ; RUN:    -r %t1.bc,_main,plx \
 ; RUN:    -r %t2.bc,_foo,plx
diff --git a/test/ThinLTO/X86/distributed_import.ll b/test/ThinLTO/X86/distributed_import.ll
index 0a3f9c07f257d7e2972055e799340323bf3c1381..82cc57c48303fb0dafac80198db236089cc78bee 100644
--- a/test/ThinLTO/X86/distributed_import.ll
+++ b/test/ThinLTO/X86/distributed_import.ll
@@ -1,15 +1,50 @@
-; RUN: opt -module-summary %s -o %t1.bc
-; RUN: opt -module-summary %p/Inputs/distributed_import.ll -o %t2.bc
+; Test distributed build thin link output from llvm-lto2
 
-; RUN: llvm-lto2 %t1.bc %t2.bc -o %t.o -save-temps \
+; Generate bitcode files with summary, as well as minimized bitcode without
+; the debug metadata for the thin link.
+; RUN: opt -thinlto-bc %s -thin-link-bitcode-file=%t1.thinlink.bc -o %t1.bc
+; RUN: opt -thinlto-bc %p/Inputs/distributed_import.ll -thin-link-bitcode-file=%t2.thinlink.bc -o %t2.bc
+
+; First perform the thin link on the normal bitcode file.
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.o -save-temps \
+; RUN:     -thinlto-distributed-indexes \
+; RUN:     -r=%t1.bc,g, \
+; RUN:     -r=%t1.bc,f,px \
+; RUN:     -r=%t2.bc,g,px
+; RUN: opt -function-import -summary-file %t1.bc.thinlto.bc %t1.bc -o %t1.out
+; RUN: opt -function-import -summary-file %t2.bc.thinlto.bc %t2.bc -o %t2.out
+; RUN: llvm-dis -o - %t2.out | FileCheck %s
+
+; Save the generated index files.
+; RUN: cp %t1.bc.thinlto.bc %t1.bc.thinlto.bc.orig
+; RUN: cp %t2.bc.thinlto.bc %t2.bc.thinlto.bc.orig
+
+; Copy the minimized bitcode to the regular bitcode path so the module
+; paths in the index are the same (save the regular bitcode for use again
+; further down).
+; RUN: cp %t1.bc %t1.bc.sv
+; RUN: cp %t1.thinlink.bc %t1.bc
+; RUN: cp %t2.bc %t2.bc.sv
+; RUN: cp %t2.thinlink.bc %t2.bc
+
+; Next perform the thin link on the minimized bitcode files, and compare the
+; resulting index files to the copies saved above to ensure they are identical.
+; RUN: rm -f %t1.bc.thinlto.bc %t2.bc.thinlto.bc
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.o -save-temps \
 ; RUN:     -thinlto-distributed-indexes \
 ; RUN:     -r=%t1.bc,g, \
 ; RUN:     -r=%t1.bc,f,px \
 ; RUN:     -r=%t2.bc,g,px
-; RUN:  opt -function-import -summary-file %t1.bc.thinlto.bc %t1.bc -o %t1.out
+; RUN: diff %t1.bc.thinlto.bc.orig %t1.bc.thinlto.bc
+; RUN: diff %t2.bc.thinlto.bc.orig %t2.bc.thinlto.bc
+
+; Make sure importing occurs as expected
+; RUN: cp %t1.bc.sv %t1.bc
+; RUN: cp %t2.bc.sv %t2.bc
 ; RUN: opt -function-import -summary-file %t2.bc.thinlto.bc %t2.bc -o %t2.out
 ; RUN: llvm-dis -o - %t2.out | FileCheck %s
-; CHECK: @G.llvm.0
+
+; CHECK: @G.llvm.
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -20,3 +55,8 @@ entry:
   call i32 (...) @g()
   ret void
 }
+
+!llvm.dbg.cu = !{}
+
+!1 = !{i32 2, !"Debug Info Version", i32 3}
+!llvm.module.flags = !{!1}
diff --git a/test/ThinLTO/X86/emit_imports.ll b/test/ThinLTO/X86/emit_imports.ll
index 64ea02d857e6a46c1d95b8cf765ecc82f51bf693..fc025f416ae1ad1dca7571346d52c542d81d0638 100644
--- a/test/ThinLTO/X86/emit_imports.ll
+++ b/test/ThinLTO/X86/emit_imports.ll
@@ -22,7 +22,7 @@
 ; RUN: rm -f %t1.thinlto.bc %t1.bc.imports
 ; RUN: rm -f %t2.thinlto.bc %t2.bc.imports
 ; RUN: rm -f %t3.bc.thinlto.bc %t3.bc.imports
-; RUN: llvm-lto2 %t1.bc %t2.bc %t3.bc -o %t.o -save-temps \
+; RUN: llvm-lto2 run %t1.bc %t2.bc %t3.bc -o %t.o -save-temps \
 ; RUN:     -thinlto-distributed-indexes \
 ; RUN:     -r=%t1.bc,g, \
 ; RUN:     -r=%t1.bc,f,px \
diff --git a/test/ThinLTO/X86/empty_module_with_cache.ll b/test/ThinLTO/X86/empty_module_with_cache.ll
index 3e16c395a893f99eba3d2ee088ffc4b23c9b69f8..76fe3e91d20236215dd26e754cda3bea65bdecd8 100644
--- a/test/ThinLTO/X86/empty_module_with_cache.ll
+++ b/test/ThinLTO/X86/empty_module_with_cache.ll
@@ -8,8 +8,8 @@
 ; RUN: ls %t.cache | count 3
 
 ; Verify that enabling caching is working with llvm-lto2
-; RUN: rm -Rf %t.cache && mkdir %t.cache
-; RUN: llvm-lto2 -o %t.o %t2.bc  %t.bc -cache-dir %t.cache \
+; RUN: rm -Rf %t.cache
+; RUN: llvm-lto2 run -o %t.o %t2.bc  %t.bc -cache-dir %t.cache \
 ; RUN:  -r=%t2.bc,_main,plx
 ; RUN: ls %t.cache | count 2
 
@@ -25,8 +25,8 @@
 ; RUN: ls %t.cache | count 1
 
 ; Verify that caching is disabled for module without hash, with llvm-lto2
-; RUN: rm -Rf %t.cache && mkdir %t.cache
-; RUN: llvm-lto2 -o %t.o %t2.bc  %t.bc -cache-dir %t.cache \
+; RUN: rm -Rf %t.cache
+; RUN: llvm-lto2 run -o %t.o %t2.bc  %t.bc -cache-dir %t.cache \
 ; RUN:  -r=%t2.bc,_main,plx
 ; RUN: ls %t.cache | count 0
 
diff --git a/test/ThinLTO/X86/error-newpm.ll b/test/ThinLTO/X86/error-newpm.ll
index 9b0132276028a596e3c02d1889ebc00e18a044b6..9c2fd2c70d6ddbd18f7d90dd7bb0cf4d62a4c6fb 100644
--- a/test/ThinLTO/X86/error-newpm.ll
+++ b/test/ThinLTO/X86/error-newpm.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -module-summary %s -o %t1.bc
-; RUN: not llvm-lto2 %t1.bc -o %t.o \
+; RUN: not llvm-lto2 run %t1.bc -o %t.o \
 ; RUN:     -r=%t1.bc,_tinkywinky,pxl \
 ; RUN:     -lto-use-new-pm 2>&1 | FileCheck %s
 
diff --git a/test/ThinLTO/X86/funcimport2.ll b/test/ThinLTO/X86/funcimport2.ll
index c83370be97069b508d582ba856a27be800a574de..7338f9a9d98aa4d76de6322f97e770dc94dfb51b 100644
--- a/test/ThinLTO/X86/funcimport2.ll
+++ b/test/ThinLTO/X86/funcimport2.ll
@@ -2,7 +2,7 @@
 ; RUN: opt -module-summary %s -o %t1.bc
 ; RUN: opt -module-summary %p/Inputs/funcimport2.ll -o %t2.bc
 
-; RUN: llvm-lto2 %t1.bc %t2.bc -o %t.o -save-temps \
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.o -save-temps \
 ; RUN:     -r=%t1.bc,_foo,plx \
 ; RUN:     -r=%t2.bc,_main,plx \
 ; RUN:     -r=%t2.bc,_foo,l
@@ -11,7 +11,7 @@
 
 ; We shouldn't do any importing at -O0
 ; rm -f %t.o.1.3.import.bc
-; RUN: llvm-lto2 %t1.bc %t2.bc -o %t.o -save-temps \
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.o -save-temps \
 ; RUN:     -O0 \
 ; RUN:     -r=%t1.bc,_foo,plx \
 ; RUN:     -r=%t2.bc,_main,plx \
diff --git a/test/ThinLTO/X86/internalize.ll b/test/ThinLTO/X86/internalize.ll
index 14ff6791561d3c90257b3c1729640613fcca4469..867e3e5a00abf930db096fb643b98783d2d4f0d5 100644
--- a/test/ThinLTO/X86/internalize.ll
+++ b/test/ThinLTO/X86/internalize.ll
@@ -3,7 +3,7 @@
 ; RUN: llvm-lto -thinlto-action=internalize -thinlto-index %t.index.bc %t1.bc -o - | llvm-dis -o - | FileCheck %s --check-prefix=REGULAR
 ; RUN: llvm-lto -thinlto-action=internalize -thinlto-index %t.index.bc %t1.bc -o -  --exported-symbol=foo | llvm-dis -o - | FileCheck %s --check-prefix=INTERNALIZE
 
-; RUN: llvm-lto2 %t1.bc -o %t.o -save-temps \
+; RUN: llvm-lto2 run %t1.bc -o %t.o -save-temps \
 ; RUN:     -r=%t1.bc,_foo,pxl \
 ; RUN:     -r=%t1.bc,_bar,pl \
 ; RUN:     -r=%t1.bc,_linkonce_func,pl
diff --git a/test/ThinLTO/X86/lazyload_metadata.ll b/test/ThinLTO/X86/lazyload_metadata.ll
index bddabcdf9e72297aac4bac1b6ace12b6331f4e72..f5b6b96ebf025547b442d6159dc80c570d6b891d 100644
--- a/test/ThinLTO/X86/lazyload_metadata.ll
+++ b/test/ThinLTO/X86/lazyload_metadata.ll
@@ -10,13 +10,13 @@
 ; RUN: llvm-lto -thinlto-action=import %t2.bc -thinlto-index=%t3.bc \
 ; RUN:          -o /dev/null -stats \
 ; RUN:  2>&1 | FileCheck %s -check-prefix=LAZY
-; LAZY: 49 bitcode-reader  - Number of Metadata records loaded
+; LAZY: 51 bitcode-reader  - Number of Metadata records loaded
 ; LAZY: 2 bitcode-reader  - Number of MDStrings loaded
 
 ; RUN: llvm-lto -thinlto-action=import %t2.bc -thinlto-index=%t3.bc \
 ; RUN:          -o /dev/null -disable-ondemand-mds-loading -stats \
 ; RUN:  2>&1 | FileCheck %s -check-prefix=NOTLAZY
-; NOTLAZY: 58 bitcode-reader  - Number of Metadata records loaded
+; NOTLAZY: 60 bitcode-reader  - Number of Metadata records loaded
 ; NOTLAZY: 7 bitcode-reader  - Number of MDStrings loaded
 
 
@@ -55,4 +55,4 @@ declare i1 @llvm.type.test(i8* %ptr, metadata %bitset) nounwind readnone
 !6 = !{!9}
 !7 = !{!"7"}
 !8 = !{!"8"}
-!9 = !{!6}
\ No newline at end of file
+!9 = !{!6}
diff --git a/test/ThinLTO/X86/linkonce_aliasee_ref_import.ll b/test/ThinLTO/X86/linkonce_aliasee_ref_import.ll
index 9b8cc7f7228accb11cb2297bacb69630a89bb224..9086d9824b7b65fa5d6750b28dd8b7fb49f911b5 100644
--- a/test/ThinLTO/X86/linkonce_aliasee_ref_import.ll
+++ b/test/ThinLTO/X86/linkonce_aliasee_ref_import.ll
@@ -7,7 +7,7 @@
 ; RUN: llvm-nm -o - < %t2.bc.thinlto.o | FileCheck %s --check-prefix=NM2
 
 ; Import with instr limit to ensure only foo imported.
-; RUN: llvm-lto2 %t1.bc %t2.bc -o %t.o -save-temps \
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.o -save-temps \
 ; RUN:    -r=%t1.bc,foo,pxl \
 ; RUN:    -r=%t1.bc,baz,pxl \
 ; RUN:    -r=%t1.bc,baz.clone,pxl \
diff --git a/test/ThinLTO/X86/module_asm2.ll b/test/ThinLTO/X86/module_asm2.ll
index 02404062163deb5eac5237b2439eb62b287a8329..b46f40196535c6598bfb50e51ef41ac90c6a9107 100644
--- a/test/ThinLTO/X86/module_asm2.ll
+++ b/test/ThinLTO/X86/module_asm2.ll
@@ -8,7 +8,7 @@
 ; RUN:  llvm-nm %t1.bc.thinlto.o | FileCheck  %s --check-prefix=NM0
 ; RUN:  llvm-nm %t2.bc.thinlto.o | FileCheck  %s --check-prefix=NM1
 
-; RUN: llvm-lto2 %t1.bc %t2.bc -o %t.o -save-temps \
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.o -save-temps \
 ; RUN:     -r=%t1.bc,foo,plx \
 ; RUN:     -r=%t1.bc,globalfunc,plx \
 ; RUN:     -r=%t1.bc,globalfunc,plx \
diff --git a/test/ThinLTO/X86/module_asm_glob.ll b/test/ThinLTO/X86/module_asm_glob.ll
index bcc44c58c9f01194934b5cd6b57384305a140a72..e27007524ce419188843b01dc7d1b549e94ad500 100644
--- a/test/ThinLTO/X86/module_asm_glob.ll
+++ b/test/ThinLTO/X86/module_asm_glob.ll
@@ -5,7 +5,7 @@
 ; RUN: llvm-nm %t1.bc.thinlto.o | FileCheck  %s --check-prefix=NM0
 ; RUN: llvm-nm %t2.bc.thinlto.o | FileCheck  %s --check-prefix=NM1
 
-; RUN: llvm-lto2 %t1.bc %t2.bc -o %t.o -save-temps \
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.o -save-temps \
 ; RUN:     -r=%t1.bc,foo,lx \
 ; RUN:     -r=%t1.bc,foo,plx \
 ; RUN:     -r=%t1.bc,_simplefunction,pl \
diff --git a/test/ThinLTO/X86/reference_non_importable.ll b/test/ThinLTO/X86/reference_non_importable.ll
index a001666d28a2deb75e5c293fe1f2113b60516d2c..5cf225e95de035060c8e607e61c6fcd40dc7f1e4 100644
--- a/test/ThinLTO/X86/reference_non_importable.ll
+++ b/test/ThinLTO/X86/reference_non_importable.ll
@@ -1,7 +1,7 @@
 ; RUN: opt -module-summary %s -o %t1.bc
 ; RUN: opt -module-summary %p/Inputs/reference_non_importable.ll -o %t2.bc
 
-; RUN: llvm-lto2 %t1.bc %t2.bc -o %t.o -save-temps \
+; RUN: llvm-lto2 run %t1.bc %t2.bc -o %t.o -save-temps \
 ; RUN:     -r=%t1.bc,_foo,pxl \
 ; RUN:     -r=%t1.bc,_b,pxl \
 ; RUN:     -r=%t2.bc,_main,pxl \
diff --git a/test/ThinLTO/X86/tli-nobuiltin.ll b/test/ThinLTO/X86/tli-nobuiltin.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9a480cba11568ee79b3ea8479007cdd35c1c101f
--- /dev/null
+++ b/test/ThinLTO/X86/tli-nobuiltin.ll
@@ -0,0 +1,46 @@
+; Test -lto-freestanding option for libLTO.
+; RUN: llvm-as < %s > %t.bc
+
+; Regular run: expects fprintf to be turned into fwrite
+; RUN: llvm-lto %t.bc -exported-symbol=_foo -o %t.o
+; RUN: llvm-nm %t.o | FileCheck %s --check-prefix=LTO
+; LTO: fwrite
+
+; Freestanding run: expects fprintf to NOT be turned into fwrite
+; RUN: llvm-lto %t.bc -lto-freestanding -exported-symbol=_foo -o %t.o
+; RUN: llvm-nm %t.o | FileCheck %s --check-prefix=LTO-FREESTANDING
+; LTO-FREESTANDING: fprintf
+
+; Same with ThinLTO now.
+; RUN: opt -module-hash -module-summary %s -o %t.bc
+
+; Regular run: expects fprintf to be turned into fwrite
+; RUN: llvm-lto -exported-symbol=_foo -thinlto-action=run %t.bc
+; RUN: llvm-nm %t.bc.thinlto.o | FileCheck %s --check-prefix=ThinLTO
+; ThinLTO: fwrite
+
+; Freestanding run: expects fprintf to NOT be turned into fwrite
+; RUN: llvm-lto -lto-freestanding -exported-symbol=_foo -thinlto-action=run %t.bc
+; RUN: llvm-nm %t.bc.thinlto.o | FileCheck %s --check-prefix=ThinLTO-FREESTANDING
+; ThinLTO-FREESTANDING: fprintf
+
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.11.0"
+
+declare i32 @fprintf(%FILE*, i8*, ...)
+
+%FILE = type { }
+
+@hello_world = constant [13 x i8] c"hello world\0A\00"
+@percent_s = constant [3 x i8] c"%s\00"
+
+; Check that fprintf(fp, "%s", str) is turned into an fwrite call only when builtins are enabled
+
+define void @foo(%FILE* %fp) {
+  %fmt = getelementptr [3 x i8], [3 x i8]* @percent_s, i32 0, i32 0
+  %str = getelementptr [13 x i8], [13 x i8]* @hello_world, i32 0, i32 0
+  call i32 (%FILE*, i8*, ...) @fprintf(%FILE* %fp, i8* %fmt, i8* %str)
+  ret void
+}
+
diff --git a/test/Transforms/ADCE/delete-profiling-calls-to-constant.ll b/test/Transforms/ADCE/delete-profiling-calls-to-constant.ll
index a61e8f8caccbe6e0efacec0bb5b780c15bf34c37..804b3dd67f2ab1d15414a33437965c87380f204b 100644
--- a/test/Transforms/ADCE/delete-profiling-calls-to-constant.ll
+++ b/test/Transforms/ADCE/delete-profiling-calls-to-constant.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -adce | FileCheck %s
-; RUN: opt < %s -passes=adce | FileCheck %s
+; RUN: opt < %s -adce -S | FileCheck %s
+; RUN: opt < %s -passes=adce -S | FileCheck %s
 
 ; Verify that a call to instrument a constant is deleted.
 
@@ -7,7 +7,7 @@
 @__profd_foo = private global { i64, i64, i64*, i8*, i8*, i32, [1 x i16] } { i64 6699318081062747564, i64 0, i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc_foo, i32 0, i32 0), i8* bitcast (i32 ()* @foo to i8*), i8* null, i32 1, [1 x i16] [i16 1] }, section "__llvm_prf_data", align 8
 
 define i32 @foo() {
-; CHECK-NOT: __llvm_profile_instrument_target
+; CHECK-NOT: call void @__llvm_profile_instrument_target
 entry:
   tail call void @__llvm_profile_instrument_target(i64 ptrtoint (i32 (i32)* @bar to i64), i8* bitcast ({ i64, i64, i64*, i8*, i8*, i32, [1 x i16] }* @__profd_foo to i8*), i32 0)
   %call = tail call i32 @bar(i32 21)
diff --git a/test/Transforms/AddDiscriminators/basic.ll b/test/Transforms/AddDiscriminators/basic.ll
index 801eda2b066521d7478d9ee371cca028dc34f2d9..a781c0d409bcb7826f600097cb7c4d0815b09dba 100644
--- a/test/Transforms/AddDiscriminators/basic.ll
+++ b/test/Transforms/AddDiscriminators/basic.ll
@@ -58,5 +58,5 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 ; CHECK: ![[FOO:[0-9]+]] = distinct !DISubprogram(name: "foo"
 ; CHECK: ![[BLOCK:[0-9]+]] = distinct !DILexicalBlock(scope: ![[FOO]],{{.*}} line: 3)
 ; CHECK: ![[THEN]] = !DILocation(line: 3, scope: ![[BLOCKFILE:[0-9]+]])
-; CHECK: ![[BLOCKFILE]] = !DILexicalBlockFile(scope: ![[BLOCK]],{{.*}} discriminator: 1)
+; CHECK: ![[BLOCKFILE]] = !DILexicalBlockFile(scope: ![[BLOCK]],{{.*}} discriminator: 2)
 ; CHECK: ![[END]] = !DILocation(line: 4, scope: ![[FOO]])
diff --git a/test/Transforms/AddDiscriminators/call-nested.ll b/test/Transforms/AddDiscriminators/call-nested.ll
index 481d6f260047c89d8a7bdeda0883835279073451..4d5145abafe1fb1a244cc86b7b0cd0f08fe5a15e 100644
--- a/test/Transforms/AddDiscriminators/call-nested.ll
+++ b/test/Transforms/AddDiscriminators/call-nested.ll
@@ -47,4 +47,4 @@ attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-
 !14 = !DILocation(line: 4, column: 3, scope: !4)
 
 ; CHECK: ![[CALL2]] = !DILocation(line: 4, column: 10, scope: ![[CALL2BLOCK:[0-9]+]])
-; CHECK: ![[CALL2BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 1)
+; CHECK: ![[CALL2BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 2)
diff --git a/test/Transforms/AddDiscriminators/call.ll b/test/Transforms/AddDiscriminators/call.ll
index 847a6ad4dc3ab350bc314432066f21ee19aa1c56..49aca5a488f5eb73a5fe9974fb51d7a3fb3e0b35 100644
--- a/test/Transforms/AddDiscriminators/call.ll
+++ b/test/Transforms/AddDiscriminators/call.ll
@@ -5,7 +5,7 @@
 ; #1 void bar();
 ; #2
 ; #3 void foo() {
-; #4  bar();bar()/*discriminator 1*/;bar()/*discriminator 2*/;
+; #4  bar();bar()/*discriminator 2*/;bar()/*discriminator 4*/;
 ; #5 }
 
 ; Function Attrs: uwtable
@@ -14,8 +14,8 @@ define void @_Z3foov() #0 !dbg !4 {
 ; CHECK:  call void @_Z3barv(), !dbg ![[CALL0:[0-9]+]]
   %a = alloca [100 x i8], align 16
   %b = bitcast [100 x i8]* %a to i8*
-  call void @llvm.lifetime.start(i64 100, i8* %b), !dbg !11
-  call void @llvm.lifetime.end(i64 100, i8* %b), !dbg !11
+  call void @llvm.lifetime.start.p0i8(i64 100, i8* %b), !dbg !11
+  call void @llvm.lifetime.end.p0i8(i64 100, i8* %b), !dbg !11
   call void @_Z3barv(), !dbg !11
 ; CHECK:  call void @_Z3barv(), !dbg ![[CALL1:[0-9]+]]
   call void @_Z3barv(), !dbg !12
@@ -24,8 +24,8 @@ define void @_Z3foov() #0 !dbg !4 {
 }
 
 declare void @_Z3barv() #1
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind argmemonly
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind argmemonly
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind argmemonly
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind argmemonly
 
 attributes #0 = { uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" }
@@ -49,6 +49,6 @@ attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-
 !13 = !DILocation(line: 5, column: 1, scope: !4)
 
 ; CHECK: ![[CALL1]] = !DILocation(line: 4, column: 9, scope: ![[CALL1BLOCK:[0-9]+]])
-; CHECK: ![[CALL1BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 1)
+; CHECK: ![[CALL1BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 2)
 ; CHECK: ![[CALL2]] = !DILocation(line: 4, column: 15, scope: ![[CALL2BLOCK:[0-9]+]])
-; CHECK: ![[CALL2BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 2)
+; CHECK: ![[CALL2BLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 4)
diff --git a/test/Transforms/AddDiscriminators/diamond.ll b/test/Transforms/AddDiscriminators/diamond.ll
index b3afe728547246e4a339a6fec921956645b2d7ce..307e95f41e189a1c83ba682781ff6405bd9a6dc3 100644
--- a/test/Transforms/AddDiscriminators/diamond.ll
+++ b/test/Transforms/AddDiscriminators/diamond.ll
@@ -10,7 +10,7 @@
 ; #6 }
 
 ; bar(5):     discriminator 0
-; bar(3):     discriminator 1
+; bar(3):     discriminator 2
 
 ; Function Attrs: uwtable
 define void @_Z3fooi(i32 %i) #0 !dbg !4 {
@@ -69,4 +69,4 @@ attributes #2 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-
 !20 = !DILocation(line: 6, column: 1, scope: !4)
 
 ; CHECK: ![[ELSE]] = !DILocation(line: 5, column: 18, scope: ![[ELSEBLOCK:[0-9]+]])
-; CHECK: ![[ELSEBLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 1)
+; CHECK: ![[ELSEBLOCK]] = !DILexicalBlockFile({{.*}} discriminator: 2)
diff --git a/test/Transforms/AddDiscriminators/first-only.ll b/test/Transforms/AddDiscriminators/first-only.ll
index 1bd8dae5d05cdf78ea4c6c66a89f904fe1d0bf5d..dd2117a5b1878f007cf1f7e898133b9f30ff122a 100644
--- a/test/Transforms/AddDiscriminators/first-only.ll
+++ b/test/Transforms/AddDiscriminators/first-only.ll
@@ -69,7 +69,7 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 !12 = !DILocation(line: 3, scope: !13)
 
 !13 = distinct !DILexicalBlock(line: 3, column: 0, file: !1, scope: !11)
-; CHECK: !DILexicalBlockFile(scope: ![[BLOCK2:[0-9]+]],{{.*}} discriminator: 1)
+; CHECK: !DILexicalBlockFile(scope: ![[BLOCK2:[0-9]+]],{{.*}} discriminator: 2)
 
 !14 = !DILocation(line: 4, scope: !13)
 ; CHECK: ![[BLOCK2]] = distinct !DILexicalBlock(scope: ![[BLOCK1]],{{.*}} line: 3)
diff --git a/test/Transforms/AddDiscriminators/inlined.ll b/test/Transforms/AddDiscriminators/inlined.ll
index 2e8ea97348d0ef30fb01030f358a5d5e31355d70..226e903ee21221b4cf0779d39c27ffd40576a0be 100644
--- a/test/Transforms/AddDiscriminators/inlined.ll
+++ b/test/Transforms/AddDiscriminators/inlined.ll
@@ -62,8 +62,8 @@ attributes #3 = { nounwind readnone }
 !12 = distinct !DISubprogram(name: "g", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, variables: !2)
 !13 = distinct !DILocation(line: 1, column: 17, scope: !14)
 ; CHECK: ![[BF:.*]] = !DILexicalBlockFile(scope: ![[LB1:[0-9]+]],
-; CHECK-SAME:                             discriminator: 1)
-!14 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 1)
+; CHECK-SAME:                             discriminator: 2)
+!14 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 2)
 ; CHECK: ![[LB1]] = distinct !DILexicalBlock(scope: ![[LB2:[0-9]+]],
 ; CHECK-SAME:                                line: 1, column: 16)
 !15 = distinct !DILexicalBlock(scope: !16, file: !1, line: 1, column: 16)
diff --git a/test/Transforms/AddDiscriminators/memcpy-discriminator.ll b/test/Transforms/AddDiscriminators/memcpy-discriminator.ll
new file mode 100644
index 0000000000000000000000000000000000000000..00642d29502e0e9807f3dd7fb5cde510f342b320
--- /dev/null
+++ b/test/Transforms/AddDiscriminators/memcpy-discriminator.ll
@@ -0,0 +1,104 @@
+; RUN: opt < %s -add-discriminators -sroa -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Test case obtained from the following C code:
+
+; struct A {
+;  int field1;
+;  short field2;
+; };
+;
+; struct B {
+;   struct A field1;
+;   int field2;
+; };
+;
+;
+; extern struct B g_b;
+; extern int bar(struct B b, int c);
+;
+; int foo(int cond) {
+;   int result = cond ? bar(g_b, 33) : 42;
+;   return result;
+; }
+
+; In this test, global variable g_b is passed by copy to function bar. That
+; copy is located on the stack (see alloca %g_b.coerce), and it is initialized
+; by a memcpy call.
+;
+; SROA would split alloca %g_b.coerce into two (smaller disjoint) slices:
+; slice [0,8) and slice [8, 12). Users of the original alloca are rewritten
+; as users of the new alloca slices.
+; In particular, the memcpy is rewritten by SROA as two load/store pairs.
+;
+; Later on, mem2reg successfully promotes the new alloca slices to registers,
+; and loads %3 and %5 are made redundant by the loads obtained from the memcpy
+; intrinsic expansion.
+;
+; If pass AddDiscriminators doesn't assign a discriminator to the intrinsic
+; memcpy call, then the loads obtained from the memcpy expansion would not have
+; a correct discriminator.
+;
+; This test checks that the two new loads inserted by SROA in %cond.true
+; correctly reference a debug location with a non-zero discriminator. This test
+; also checks that the same discriminator is used by all instructions from
+; basic block %cond.true.
+
+%struct.B = type { %struct.A, i32 }
+%struct.A = type { i32, i16 }
+
+@g_b = external global %struct.B, align 4
+
+define i32 @foo(i32 %cond) #0 !dbg !5 {
+entry:
+  %g_b.coerce = alloca { i64, i32 }, align 4
+  %tobool = icmp ne i32 %cond, 0, !dbg !7
+  br i1 %tobool, label %cond.true, label %cond.end, !dbg !7
+
+cond.true:
+; CHECK-LABEL: cond.true:
+; CHECK:       load i64, {{.*}}, !dbg ![[LOC:[0-9]+]]
+; CHECK-NEXT:  load i32, {{.*}}, !dbg ![[LOC]]
+; CHECK-NEXT:  %call = call i32 @bar({{.*}}), !dbg ![[LOC]]
+; CHECK-NEXT:  br label %cond.end, !dbg ![[BR_LOC:[0-9]+]]
+
+; CHECK-DAG: ![[LOC]] = !DILocation(line: 16, column: 23, scope: ![[SCOPE:[0-9]+]])
+; CHECK-DAG: ![[SCOPE]] = !DILexicalBlockFile({{.*}}, discriminator: 2)
+; CHECK-DAG: ![[BR_LOC]] = !DILocation(line: 16, column: 16, scope: ![[SCOPE]])
+
+  %0 = bitcast { i64, i32 }* %g_b.coerce to i8*, !dbg !8
+  %1 = bitcast %struct.B* @g_b to i8*, !dbg !8
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 12, i32 4, i1 false), !dbg !8
+  %2 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %g_b.coerce, i32 0, i32 0, !dbg !8
+  %3 = load i64, i64* %2, align 4, !dbg !8
+  %4 = getelementptr inbounds { i64, i32 }, { i64, i32 }* %g_b.coerce, i32 0, i32 1, !dbg !8
+  %5 = load i32, i32* %4, align 4, !dbg !8
+  %call = call i32 @bar(i64 %3, i32 %5, i32 33), !dbg !8
+  br label %cond.end, !dbg !7
+
+cond.end:                                         ; preds = %entry, %cond.true
+  %cond1 = phi i32 [ %call, %cond.true ], [ 42, %entry ], !dbg !7
+  ret i32 %cond1, !dbg !9
+}
+
+declare i32 @bar(i64, i32, i32)
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1
+
+attributes #0 = { noinline nounwind uwtable }
+attributes #1 = { argmemonly nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "test.c", directory: ".")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 15, type: !6, isLocal: false, isDefinition: true, scopeLine: 15, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!6 = !DISubroutineType(types: !2)
+!7 = !DILocation(line: 16, column: 16, scope: !5)
+!8 = !DILocation(line: 16, column: 23, scope: !5)
+!9 = !DILocation(line: 17, column: 3, scope: !5)
diff --git a/test/Transforms/AddDiscriminators/multiple.ll b/test/Transforms/AddDiscriminators/multiple.ll
index 387689caddff084982bc83d04ad0e16c9b35fbac..b4c353cf00f1e4e5154c983c428100f6567d19a3 100644
--- a/test/Transforms/AddDiscriminators/multiple.ll
+++ b/test/Transforms/AddDiscriminators/multiple.ll
@@ -67,6 +67,6 @@ attributes #0 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointe
 !12 = !DILocation(line: 4, scope: !4)
 
 ; CHECK: ![[THEN]] = !DILocation(line: 3, scope: ![[THENBLOCK:[0-9]+]])
-; CHECK: ![[THENBLOCK]] = !DILexicalBlockFile(scope: ![[SCOPE:[0-9]+]],{{.*}} discriminator: 1)
+; CHECK: ![[THENBLOCK]] = !DILexicalBlockFile(scope: ![[SCOPE:[0-9]+]],{{.*}} discriminator: 2)
 ; CHECK: ![[ELSE]] = !DILocation(line: 3, scope: ![[ELSEBLOCK:[0-9]+]])
-; CHECK: ![[ELSEBLOCK]] = !DILexicalBlockFile(scope: ![[SCOPE]],{{.*}} discriminator: 2)
+; CHECK: ![[ELSEBLOCK]] = !DILexicalBlockFile(scope: ![[SCOPE]],{{.*}} discriminator: 4)
diff --git a/test/Transforms/AddDiscriminators/oneline.ll b/test/Transforms/AddDiscriminators/oneline.ll
index aa52ae42ee472e9a12ed4938cfd997a75679a0df..724574a24ddf07cb2a90eccf13d43663ef7d5118 100644
--- a/test/Transforms/AddDiscriminators/oneline.ll
+++ b/test/Transforms/AddDiscriminators/oneline.ll
@@ -7,9 +7,9 @@
 ; #3 }
 
 ; i == 3:     discriminator 0
-; i == 5:     discriminator 1
-; return 100: discriminator 2
-; return 99:  discriminator 3
+; i == 5:     discriminator 2
+; return 100: discriminator 4
+; return 99:  discriminator 6
 
 define i32 @_Z3fooi(i32 %i) #0 !dbg !4 {
   %1 = alloca i32, align 4
@@ -91,11 +91,11 @@ attributes #1 = { nounwind readnone }
 ; CHECK: ![[F:.*]] = distinct !DISubprogram(name: "foo",
 ; CHECK: ![[IF:.*]] = distinct !DILexicalBlock(scope: ![[F]],{{.*}}line: 2, column: 7)
 ; CHECK: ![[THEN1]] = !DILocation(line: 2, column: 17, scope: ![[THENBLOCK:[0-9]+]])
-; CHECK: ![[THENBLOCK]] = !DILexicalBlockFile(scope: ![[IF]],{{.*}} discriminator: 1)
+; CHECK: ![[THENBLOCK]] = !DILexicalBlockFile(scope: ![[IF]],{{.*}} discriminator: 2)
 ; CHECK: ![[THEN2]] = !DILocation(line: 2, column: 19, scope: ![[THENBLOCK]])
 ; CHECK: ![[THEN3]] = !DILocation(line: 2, column: 7, scope: ![[BRBLOCK:[0-9]+]])
-; CHECK: ![[BRBLOCK]] = !DILexicalBlockFile(scope: ![[F]],{{.*}} discriminator: 1)
+; CHECK: ![[BRBLOCK]] = !DILexicalBlockFile(scope: ![[F]],{{.*}} discriminator: 2)
 ; CHECK: ![[ELSE]] = !DILocation(line: 2, column: 25, scope: ![[ELSEBLOCK:[0-9]+]])
-; CHECK: ![[ELSEBLOCK]] = !DILexicalBlockFile(scope: ![[IF]],{{.*}} discriminator: 2)
+; CHECK: ![[ELSEBLOCK]] = !DILexicalBlockFile(scope: ![[IF]],{{.*}} discriminator: 4)
 ; CHECK: ![[COMBINE]] = !DILocation(line: 2, column: 42, scope: ![[COMBINEBLOCK:[0-9]+]])
-; CHECK: ![[COMBINEBLOCK]] = !DILexicalBlockFile(scope: ![[IF]],{{.*}} discriminator: 3)
+; CHECK: ![[COMBINEBLOCK]] = !DILexicalBlockFile(scope: ![[IF]],{{.*}} discriminator: 6)
diff --git a/test/Transforms/ArgumentPromotion/2008-07-02-array-indexing.ll b/test/Transforms/ArgumentPromotion/2008-07-02-array-indexing.ll
index 267a6c04597416b2513a4652cd7f09ecefc29875..fac84d092df3931c7ba3bf3d0ae9a57d3a87cc2a 100644
--- a/test/Transforms/ArgumentPromotion/2008-07-02-array-indexing.ll
+++ b/test/Transforms/ArgumentPromotion/2008-07-02-array-indexing.ll
@@ -1,25 +1,30 @@
-; RUN: opt < %s -argpromotion -S > %t
-; RUN: cat %t | grep "define.*@callee(.*i32\*"
+; RUN: opt < %s -argpromotion -S | FileCheck %s
 ; PR2498
 
 ; This test tries to convince argpromotion about promoting the load from %A + 2,
 ; because there is a load of %A in the entry block
 define internal i32 @callee(i1 %C, i32* %A) {
+; CHECK-LABEL: define internal i32 @callee(
+; CHECK: i1 %C, i32* %A)
 entry:
-        ; Unconditonally load the element at %A
-        %A.0 = load i32, i32* %A
-        br i1 %C, label %T, label %F
+  ; Unconditionally load the element at %A
+  %A.0 = load i32, i32* %A
+  br i1 %C, label %T, label %F
+
 T:
-        ret i32 %A.0
+  ret i32 %A.0
+
 F:
-        ; Load the element at offset two from %A. This should not be promoted!
-        %A.2 = getelementptr i32, i32* %A, i32 2
-        %R = load i32, i32* %A.2
-        ret i32 %R
+  ; Load the element at offset two from %A. This should not be promoted!
+  %A.2 = getelementptr i32, i32* %A, i32 2
+  %R = load i32, i32* %A.2
+  ret i32 %R
 }
 
 define i32 @foo() {
+; CHECK-LABEL: define i32 @foo
         %X = call i32 @callee(i1 false, i32* null)             ; <i32> [#uses=1]
+; CHECK: call i32 @callee(i1 false, i32* null)
         ret i32 %X
 }
 
diff --git a/test/Transforms/ArgumentPromotion/aggregate-promote.ll b/test/Transforms/ArgumentPromotion/aggregate-promote.ll
index 3f521bace7f3f93278d54363a6588724afb2610b..b0bab7784edb1e033700b8c498bc4c19ab5bf9e0 100644
--- a/test/Transforms/ArgumentPromotion/aggregate-promote.ll
+++ b/test/Transforms/ArgumentPromotion/aggregate-promote.ll
@@ -1,24 +1,31 @@
-; RUN: opt < %s -argpromotion -instcombine -S | not grep load
-target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
-%QuadTy = type { i32, i32, i32, i32 }
-@G = constant %QuadTy {
-    i32 0, 
-    i32 0, 
-    i32 17, 
-    i32 25 }            ; <%QuadTy*> [#uses=1]
+%T = type { i32, i32, i32, i32 }
+@G = constant %T { i32 0, i32 0, i32 17, i32 25 }
 
-define internal i32 @test(%QuadTy* %P) {
-        %A = getelementptr %QuadTy, %QuadTy* %P, i64 0, i32 3            ; <i32*> [#uses=1]
-        %B = getelementptr %QuadTy, %QuadTy* %P, i64 0, i32 2            ; <i32*> [#uses=1]
-        %a = load i32, i32* %A               ; <i32> [#uses=1]
-        %b = load i32, i32* %B               ; <i32> [#uses=1]
-        %V = add i32 %a, %b             ; <i32> [#uses=1]
-        ret i32 %V
+define internal i32 @test(%T* %p) {
+; CHECK-LABEL: define internal i32 @test(
+; CHECK: i32 %{{.*}}, i32 %{{.*}})
+entry:
+  %a.gep = getelementptr %T, %T* %p, i64 0, i32 3
+  %b.gep = getelementptr %T, %T* %p, i64 0, i32 2
+  %a = load i32, i32* %a.gep
+  %b = load i32, i32* %b.gep
+; CHECK-NOT: load
+  %v = add i32 %a, %b
+  ret i32 %v
+; CHECK: ret i32
 }
 
 define i32 @caller() {
-        %V = call i32 @test( %QuadTy* @G )              ; <i32> [#uses=1]
-        ret i32 %V
+; CHECK-LABEL: define i32 @caller(
+entry:
+  %v = call i32 @test(%T* @G)
+; CHECK: %[[B_GEP:.*]] = getelementptr %T, %T* @G, i64 0, i32 2
+; CHECK: %[[B:.*]] = load i32, i32* %[[B_GEP]]
+; CHECK: %[[A_GEP:.*]] = getelementptr %T, %T* @G, i64 0, i32 3
+; CHECK: %[[A:.*]] = load i32, i32* %[[A_GEP]]
+; CHECK: call i32 @test(i32 %[[B]], i32 %[[A]])
+  ret i32 %v
 }
-
diff --git a/test/Transforms/ArgumentPromotion/attrs.ll b/test/Transforms/ArgumentPromotion/attrs.ll
index 46128f93c2409ab63ce33a6d4e981b10a54feaaa..29cef50fe802923010db4702821d996d00449119 100644
--- a/test/Transforms/ArgumentPromotion/attrs.ll
+++ b/test/Transforms/ArgumentPromotion/attrs.ll
@@ -1,25 +1,52 @@
-; RUN: opt < %s -argpromotion -S | grep zeroext
+; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
-	%struct.ss = type { i32, i64 }
+%struct.ss = type { i32, i64 }
 
-define internal void @f(%struct.ss* byval  %b, i32* byval %X, i32 %i) nounwind  {
+; Don't drop 'byval' on %X here.
+define internal void @f(%struct.ss* byval %b, i32* byval %X, i32 %i) nounwind {
+; CHECK-LABEL: define internal void @f(
+; CHECK: i32 %[[B0:.*]], i64 %[[B1:.*]], i32* byval %X, i32 %i)
 entry:
-	%tmp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0
-	%tmp1 = load i32, i32* %tmp, align 4
-	%tmp2 = add i32 %tmp1, 1	
-	store i32 %tmp2, i32* %tmp, align 4
+; CHECK: %[[B:.*]] = alloca %struct.ss
+; CHECK: %[[B_GEP0:.*]] = getelementptr %struct.ss, %struct.ss* %[[B]], i32 0, i32 0
+; CHECK: store i32 %[[B0]], i32* %[[B_GEP0]]
+; CHECK: %[[B_GEP1:.*]] = getelementptr %struct.ss, %struct.ss* %[[B]], i32 0, i32 1
+; CHECK: store i64 %[[B1]], i64* %[[B_GEP1]]
 
-	store i32 0, i32* %X
-	ret void
+  %tmp = getelementptr %struct.ss, %struct.ss* %b, i32 0, i32 0
+; CHECK: %[[TMP:.*]] = getelementptr %struct.ss, %struct.ss* %[[B]], i32 0, i32 0
+  %tmp1 = load i32, i32* %tmp, align 4
+; CHECK: %[[TMP1:.*]] = load i32, i32* %[[TMP]]
+  %tmp2 = add i32 %tmp1, 1
+; CHECK: %[[TMP2:.*]] = add i32 %[[TMP1]], 1
+  store i32 %tmp2, i32* %tmp, align 4
+; CHECK: store i32 %[[TMP2]], i32* %[[TMP]]
+
+  store i32 0, i32* %X
+; CHECK: store i32 0, i32* %X
+  ret void
 }
 
+; Also make sure we don't drop the call zeroext attribute.
 define i32 @test(i32* %X) {
+; CHECK-LABEL: define i32 @test(
 entry:
-	%S = alloca %struct.ss		; <%struct.ss*> [#uses=4]
-	%tmp1 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 0		; <i32*> [#uses=1]
-	store i32 1, i32* %tmp1, align 8
-	%tmp4 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 1		; <i64*> [#uses=1]
-	store i64 2, i64* %tmp4, align 4
-	call void @f( %struct.ss* byval %S, i32* byval %X, i32 zeroext 0) 
-	ret i32 0
+  %S = alloca %struct.ss
+; CHECK: %[[S:.*]] = alloca %struct.ss
+  %tmp1 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 0
+  store i32 1, i32* %tmp1, align 8
+; CHECK: store i32 1
+  %tmp4 = getelementptr %struct.ss, %struct.ss* %S, i32 0, i32 1
+  store i64 2, i64* %tmp4, align 4
+; CHECK: store i64 2
+
+  call void @f( %struct.ss* byval %S, i32* byval %X, i32 zeroext 0)
+; CHECK: %[[S_GEP0:.*]] = getelementptr %struct.ss, %struct.ss* %[[S]], i32 0, i32 0
+; CHECK: %[[S0:.*]] = load i32, i32* %[[S_GEP0]]
+; CHECK: %[[S_GEP1:.*]] = getelementptr %struct.ss, %struct.ss* %[[S]], i32 0, i32 1
+; CHECK: %[[S1:.*]] = load i64, i64* %[[S_GEP1]]
+; CHECK: call void @f(i32 %[[S0]], i64 %[[S1]], i32* byval %X, i32 zeroext 0)
+
+  ret i32 0
 }
diff --git a/test/Transforms/ArgumentPromotion/byval-2.ll b/test/Transforms/ArgumentPromotion/byval-2.ll
index 6c0288f5f989c441690b346a1cc5413d0e88b7c7..3e1fee8badd9947931bb39df57ee40ef8102917a 100644
--- a/test/Transforms/ArgumentPromotion/byval-2.ll
+++ b/test/Transforms/ArgumentPromotion/byval-2.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
 ; Arg promotion eliminates the struct argument.
 ; FIXME: Should it eliminate the i32* argument?
diff --git a/test/Transforms/ArgumentPromotion/byval.ll b/test/Transforms/ArgumentPromotion/byval.ll
index b091b09a3597a41f9ede355767745946b61b2ffe..58475fc89607ba4adc6b7b29528cf07c55caa012 100644
--- a/test/Transforms/ArgumentPromotion/byval.ll
+++ b/test/Transforms/ArgumentPromotion/byval.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
 
diff --git a/test/Transforms/ArgumentPromotion/callgraph-update.ll b/test/Transforms/ArgumentPromotion/callgraph-update.ll
deleted file mode 100644
index 989043d7ea58604964c9605bc35857d7c0cb5f5e..0000000000000000000000000000000000000000
--- a/test/Transforms/ArgumentPromotion/callgraph-update.ll
+++ /dev/null
@@ -1,23 +0,0 @@
-; RUN: opt < %s -argpromotion -simplifycfg -constmerge | llvm-dis
-target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
-target triple = "i386-apple-darwin10.0"
-
-%struct.VEC2 = type { double, double, double }
-%struct.VERTEX = type { %struct.VEC2, %struct.VERTEX*, %struct.VERTEX* }
-%struct.edge_rec = type { %struct.VERTEX*, %struct.edge_rec*, i32, i8* }
-
-declare %struct.edge_rec* @alloc_edge() nounwind ssp
-
-define i64 @build_delaunay(%struct.VERTEX* %tree, %struct.VERTEX* %extra) nounwind ssp {
-entry:
-  br i1 undef, label %bb11, label %bb12
-
-bb11:                                             ; preds = %bb10
-  %a = call %struct.edge_rec* @alloc_edge() nounwind ; <%struct.edge_rec*> [#uses=0]
-  ret i64 123
-
-bb12:                                             ; preds = %bb10
-  %b = call %struct.edge_rec* @alloc_edge() nounwind ; <%struct.edge_rec*> [#uses=1]
-  %c = ptrtoint %struct.edge_rec* %b to i64
-  ret i64 %c
-}
diff --git a/test/Transforms/ArgumentPromotion/chained.ll b/test/Transforms/ArgumentPromotion/chained.ll
index 6ba2e8d486940b42b34323fcc5b57cf8ecedb3b8..028c6c426e5238d7b3af8b92218199b23058de7d 100644
--- a/test/Transforms/ArgumentPromotion/chained.ll
+++ b/test/Transforms/ArgumentPromotion/chained.ll
@@ -1,17 +1,27 @@
-; RUN: opt < %s -argpromotion -instcombine -S | not grep load
-target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
+; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
-@G1 = constant i32 0            ; <i32*> [#uses=1]
-@G2 = constant i32* @G1         ; <i32**> [#uses=1]
+@G1 = constant i32 0
+@G2 = constant i32* @G1
 
-define internal i32 @test(i32** %X) {
-        %Y = load i32*, i32** %X              ; <i32*> [#uses=1]
-        %X.upgrd.1 = load i32, i32* %Y               ; <i32> [#uses=1]
-        ret i32 %X.upgrd.1
+define internal i32 @test(i32** %x) {
+; CHECK-LABEL: define internal i32 @test(
+; CHECK: i32 %{{.*}})
+entry:
+  %y = load i32*, i32** %x
+  %z = load i32, i32* %y
+; CHECK-NOT: load
+  ret i32 %z
+; CHECK: ret i32
 }
 
-define i32 @caller(i32** %P) {
-        %X = call i32 @test( i32** @G2 )                ; <i32> [#uses=1]
-        ret i32 %X
+define i32 @caller() {
+; CHECK-LABEL: define i32 @caller()
+entry:
+  %x = call i32 @test(i32** @G2)
+; CHECK: %[[Y:.*]] = load i32*, i32** @G2
+; CHECK: %[[Z:.*]] = load i32, i32* %[[Y]]
+; CHECK: call i32 @test(i32 %[[Z]])
+  ret i32 %x
 }
 
diff --git a/test/Transforms/ArgumentPromotion/control-flow.ll b/test/Transforms/ArgumentPromotion/control-flow.ll
index cdff36eb83c0e8d616dcfb82ba0f1dc728396024..c3fe0c00e877259ac4bcaa5b21a28c5749ef06a5 100644
--- a/test/Transforms/ArgumentPromotion/control-flow.ll
+++ b/test/Transforms/ArgumentPromotion/control-flow.ll
@@ -1,19 +1,27 @@
-; RUN: opt < %s -argpromotion -S | \
-; RUN:    not grep "load i32* null"
+; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
+; Don't promote around control flow.
 define internal i32 @callee(i1 %C, i32* %P) {
-        br i1 %C, label %T, label %F
+; CHECK-LABEL: define internal i32 @callee(
+; CHECK: i1 %C, i32* %P)
+entry:
+  br i1 %C, label %T, label %F
 
-T:              ; preds = %0
-        ret i32 17
+T:
+  ret i32 17
 
-F:              ; preds = %0
-        %X = load i32, i32* %P               ; <i32> [#uses=1]
-        ret i32 %X
+F:
+  %X = load i32, i32* %P
+  ret i32 %X
 }
 
 define i32 @foo() {
-        %X = call i32 @callee( i1 true, i32* null )             ; <i32> [#uses=1]
-        ret i32 %X
+; CHECK-LABEL: define i32 @foo(
+entry:
+; CHECK-NOT: load i32, i32* null
+  %X = call i32 @callee(i1 true, i32* null)
+; CHECK: call i32 @callee(i1 true, i32* null)
+  ret i32 %X
 }
 
diff --git a/test/Transforms/ArgumentPromotion/control-flow2.ll b/test/Transforms/ArgumentPromotion/control-flow2.ll
index 7413f46a860f9d1f106071bdaab0b3a1f41bd0c6..b75a32ddb3313e6f2bdfbb54cdb80db7d9fa7e73 100644
--- a/test/Transforms/ArgumentPromotion/control-flow2.ll
+++ b/test/Transforms/ArgumentPromotion/control-flow2.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
 ; CHECK: load i32, i32* %A
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
diff --git a/test/Transforms/ArgumentPromotion/crash.ll b/test/Transforms/ArgumentPromotion/crash.ll
index d3f412da14d9c76f4dd73ccf08d357e529b6a66b..d46a48101e78d439b58d2f6a9cb46b9088e25a65 100644
--- a/test/Transforms/ArgumentPromotion/crash.ll
+++ b/test/Transforms/ArgumentPromotion/crash.ll
@@ -1,61 +1,73 @@
-; RUN: opt -inline -argpromotion < %s
-; rdar://7879828
+; RUN: opt -S < %s -inline -argpromotion | FileCheck %s
+; RUN: opt -S < %s -passes=inline,argpromotion | FileCheck %s
 
-define void @foo() personality i32 (...)* @__gxx_personality_v0 {
-  invoke void @foo2()
-          to label %if.end432 unwind label %for.end520 
+%S = type { %S* }
 
-if.end432:  
+; Inlining should nuke the invoke (and any inlined calls) here even with
+; argument promotion running along with it.
+define void @zot() personality i32 (...)* @wibble {
+; CHECK-LABEL: define void @zot() personality i32 (...)* @wibble
+; CHECK-NOT: call
+; CHECK-NOT: invoke
+bb:
+  invoke void @hoge()
+          to label %bb1 unwind label %bb2
+
+bb1:
   unreachable
 
-for.end520: 
-  %exn = landingpad {i8*, i32}
-           cleanup
+bb2:
+  %tmp = landingpad { i8*, i32 }
+          cleanup
   unreachable
 }
 
-define internal  void @foo2() ssp {
-  %call7 = call fastcc i8* @foo3(i1 (i8*)* @foo4)
-  %call58 = call fastcc i8* @foo3(i1 (i8*)* @foo5)
+define internal void @hoge() {
+bb:
+  %tmp = call fastcc i8* @spam(i1 (i8*)* @eggs)
+  %tmp1 = call fastcc i8* @spam(i1 (i8*)* @barney)
   unreachable
 }
 
-define internal fastcc i8* @foo3(i1 (i8*)* %Pred) {
-entry:
+define internal fastcc i8* @spam(i1 (i8*)* %arg) {
+bb:
   unreachable
 }
 
-define internal i1 @foo4(i8* %O) nounwind {
-entry:
-  %call = call zeroext i1 @foo5(i8* %O) ; <i1> [#uses=0]
+define internal i1 @eggs(i8* %arg) {
+bb:
+  %tmp = call zeroext i1 @barney(i8* %arg)
   unreachable
 }
 
-define internal i1 @foo5(i8* %O) nounwind {
-entry:
+define internal i1 @barney(i8* %arg) {
+bb:
   ret i1 undef
 }
 
+define i32 @test_inf_promote_caller(i32 %arg) {
+; CHECK-LABEL: define i32 @test_inf_promote_caller(
+bb:
+  %tmp = alloca %S
+  %tmp1 = alloca %S
+  %tmp2 = call i32 @test_inf_promote_callee(%S* %tmp, %S* %tmp1)
+; CHECK: call i32 @test_inf_promote_callee(%S* %{{.*}}, %S* %{{.*}})
 
-; PR8932 - infinite promotion.
-%0 = type { %0* }
-
-define i32 @test2(i32 %a) {
-init:
-  %0 = alloca %0
-  %1 = alloca %0
-  %2 = call i32 @"clay_assign(Chain, Chain)"(%0* %0, %0* %1)
   ret i32 0
 }
 
-define internal i32 @"clay_assign(Chain, Chain)"(%0* %c, %0* %d) {
-init:
-  %0 = getelementptr %0, %0* %d, i32 0, i32 0
-  %1 = load %0*, %0** %0
-  %2 = getelementptr %0, %0* %c, i32 0, i32 0
-  %3 = load %0*, %0** %2
-  %4 = call i32 @"clay_assign(Chain, Chain)"(%0* %3, %0* %1)
+define internal i32 @test_inf_promote_callee(%S* %arg, %S* %arg1) {
+; CHECK-LABEL: define internal i32 @test_inf_promote_callee(
+; CHECK: %S* %{{.*}}, %S* %{{.*}})
+bb:
+  %tmp = getelementptr %S, %S* %arg1, i32 0, i32 0
+  %tmp2 = load %S*, %S** %tmp
+  %tmp3 = getelementptr %S, %S* %arg, i32 0, i32 0
+  %tmp4 = load %S*, %S** %tmp3
+  %tmp5 = call i32 @test_inf_promote_callee(%S* %tmp4, %S* %tmp2)
+; CHECK: call i32 @test_inf_promote_callee(%S* %{{.*}}, %S* %{{.*}})
+
   ret i32 0
 }
 
-declare i32 @__gxx_personality_v0(...)
+declare i32 @wibble(...)
diff --git a/test/Transforms/ArgumentPromotion/dbg.ll b/test/Transforms/ArgumentPromotion/dbg.ll
index 3d353db105fd64d297cbb71a1364d3d679e171e7..61b7c1843e481c35c47d3dd47f9206d960809304 100644
--- a/test/Transforms/ArgumentPromotion/dbg.ll
+++ b/test/Transforms/ArgumentPromotion/dbg.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
 declare void @sink(i32)
 
@@ -23,6 +24,6 @@ define void @caller(i32** %Y) {
 
 !0 = !{i32 2, !"Debug Info Version", i32 3}
 !1 = !DILocation(line: 8, scope: !2)
-!2 = distinct !DISubprogram(name: "test", line: 3, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !3, scopeLine: 3, scope: null)
+!2 = distinct !DISubprogram(name: "test", file: !5, line: 3, isLocal: true, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !3, scopeLine: 3, scope: null)
 !3 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5.0 ", isOptimized: false, emissionKind: LineTablesOnly, file: !5)
 !5 = !DIFile(filename: "test.c", directory: "")
diff --git a/test/Transforms/ArgumentPromotion/fp80.ll b/test/Transforms/ArgumentPromotion/fp80.ll
index 84ef603de82c12c54cc8662668162018c2fadb19..bd780fa21aebf5d61babdf60f2873557ef11943b 100644
--- a/test/Transforms/ArgumentPromotion/fp80.ll
+++ b/test/Transforms/ArgumentPromotion/fp80.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/ArgumentPromotion/inalloca.ll b/test/Transforms/ArgumentPromotion/inalloca.ll
index 5bf57c8ff46505aff9c4dbfef3b20f6298905712..7ea3b4e42777f679d16c06e14dc20f18465e36d2 100644
--- a/test/Transforms/ArgumentPromotion/inalloca.ll
+++ b/test/Transforms/ArgumentPromotion/inalloca.ll
@@ -1,4 +1,5 @@
 ; RUN: opt %s -argpromotion -sroa -S | FileCheck %s
+; RUN: opt %s -passes='argpromotion,function(sroa)' -S | FileCheck %s
 
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
 
diff --git a/test/Transforms/ArgumentPromotion/pr27568.ll b/test/Transforms/ArgumentPromotion/pr27568.ll
index 648317aee0daad4d5062181d0df39cfc625c8a12..1496780748da71bd59711eff27085736a9490fb3 100644
--- a/test/Transforms/ArgumentPromotion/pr27568.ll
+++ b/test/Transforms/ArgumentPromotion/pr27568.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -argpromotion < %s | FileCheck %s
+; RUN: opt -S -passes=argpromotion < %s | FileCheck %s
 target triple = "x86_64-pc-windows-msvc"
 
 define internal void @callee(i8*) {
diff --git a/test/Transforms/ArgumentPromotion/reserve-tbaa.ll b/test/Transforms/ArgumentPromotion/reserve-tbaa.ll
index 3c8ed79eeb29106bfd500723ebca329d56314225..3a3aa44b2a98db8683d8d0bf2561e6ad97b02d7f 100644
--- a/test/Transforms/ArgumentPromotion/reserve-tbaa.ll
+++ b/test/Transforms/ArgumentPromotion/reserve-tbaa.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -argpromotion -S
+; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
 ; PR17906
 ; When we promote two arguments in a single function with different types,
diff --git a/test/Transforms/ArgumentPromotion/sret.ll b/test/Transforms/ArgumentPromotion/sret.ll
index 8e5521f48d1008289124dd0786a4adba6a339177..55fc036f177503b6f7ed08a25c5ea0ac5d0bac82 100644
--- a/test/Transforms/ArgumentPromotion/sret.ll
+++ b/test/Transforms/ArgumentPromotion/sret.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
 target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-pc-windows-msvc"
diff --git a/test/Transforms/ArgumentPromotion/tail.ll b/test/Transforms/ArgumentPromotion/tail.ll
index 2ea387cd26450fe7d1f92532540a4a6aa61d192d..93de60afe91553763661df18d601cd697d094561 100644
--- a/test/Transforms/ArgumentPromotion/tail.ll
+++ b/test/Transforms/ArgumentPromotion/tail.ll
@@ -1,4 +1,5 @@
 ; RUN: opt %s -argpromotion -S -o - | FileCheck %s
+; RUN: opt %s -passes=argpromotion -S -o - | FileCheck %s
 ; PR14710
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/test/Transforms/ArgumentPromotion/variadic.ll b/test/Transforms/ArgumentPromotion/variadic.ll
index 0e03882d3b202bc5528f19c3c95447e4acc6b9a1..034f853883fd774a7045117aed29ee891467c21f 100644
--- a/test/Transforms/ArgumentPromotion/variadic.ll
+++ b/test/Transforms/ArgumentPromotion/variadic.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -argpromotion -S | FileCheck %s
+; RUN: opt < %s -passes=argpromotion -S | FileCheck %s
 
 ; Unused arguments from variadic functions cannot be eliminated as that changes
 ; their classiciation according to the SysV amd64 ABI. Clang and other frontends
diff --git a/test/Transforms/AtomicExpand/SPARC/libcalls.ll b/test/Transforms/AtomicExpand/SPARC/libcalls.ll
index afab7a39b2781a47ececbfb11daf6a95d57dda82..fc6aade8708a3b5bda9e1b86d75e50d0e04317c9 100644
--- a/test/Transforms/AtomicExpand/SPARC/libcalls.ll
+++ b/test/Transforms/AtomicExpand/SPARC/libcalls.ll
@@ -43,11 +43,11 @@ define i16 @test_exchange_i16(i16* %arg, i16 %val) {
 ; CHECK:  %1 = bitcast i16* %arg to i8*
 ; CHECK:  %2 = alloca i16, align 2
 ; CHECK:  %3 = bitcast i16* %2 to i8*
-; CHECK:  call void @llvm.lifetime.start(i64 2, i8* %3)
+; CHECK:  call void @llvm.lifetime.start.p0i8(i64 2, i8* %3)
 ; CHECK:  store i16 %old, i16* %2, align 2
 ; CHECK:  %4 = call zeroext i1 @__atomic_compare_exchange_2(i8* %1, i8* %3, i16 %new, i32 5, i32 0)
 ; CHECK:  %5 = load i16, i16* %2, align 2
-; CHECK:  call void @llvm.lifetime.end(i64 2, i8* %3)
+; CHECK:  call void @llvm.lifetime.end.p0i8(i64 2, i8* %3)
 ; CHECK:  %6 = insertvalue { i16, i1 } undef, i16 %5, 0
 ; CHECK:  %7 = insertvalue { i16, i1 } %6, i1 %4, 1
 ; CHECK:  %ret = extractvalue { i16, i1 } %7, 0
@@ -76,10 +76,10 @@ define i16 @test_add_i16(i16* %arg, i16 %val) {
 ; CHECK:  %1 = bitcast i128* %arg to i8*
 ; CHECK:  %2 = alloca i128, align 8
 ; CHECK:  %3 = bitcast i128* %2 to i8*
-; CHECK:  call void @llvm.lifetime.start(i64 16, i8* %3)
+; CHECK:  call void @llvm.lifetime.start.p0i8(i64 16, i8* %3)
 ; CHECK:  call void @__atomic_load(i32 16, i8* %1, i8* %3, i32 5)
 ; CHECK:  %4 = load i128, i128* %2, align 8
-; CHECK:  call void @llvm.lifetime.end(i64 16, i8* %3)
+; CHECK:  call void @llvm.lifetime.end.p0i8(i64 16, i8* %3)
 ; CHECK:  ret i128 %4
 define i128 @test_load_i128(i128* %arg) {
   %ret = load atomic i128, i128* %arg seq_cst, align 16
@@ -90,10 +90,10 @@ define i128 @test_load_i128(i128* %arg) {
 ; CHECK:  %1 = bitcast i128* %arg to i8*
 ; CHECK:  %2 = alloca i128, align 8
 ; CHECK:  %3 = bitcast i128* %2 to i8*
-; CHECK:  call void @llvm.lifetime.start(i64 16, i8* %3)
+; CHECK:  call void @llvm.lifetime.start.p0i8(i64 16, i8* %3)
 ; CHECK:  store i128 %val, i128* %2, align 8
 ; CHECK:  call void @__atomic_store(i32 16, i8* %1, i8* %3, i32 5)
-; CHECK:  call void @llvm.lifetime.end(i64 16, i8* %3)
+; CHECK:  call void @llvm.lifetime.end.p0i8(i64 16, i8* %3)
 ; CHECK:  ret void
 define void @test_store_i128(i128* %arg, i128 %val) {
   store atomic i128 %val, i128* %arg seq_cst, align 16
@@ -104,15 +104,15 @@ define void @test_store_i128(i128* %arg, i128 %val) {
 ; CHECK:  %1 = bitcast i128* %arg to i8*
 ; CHECK:  %2 = alloca i128, align 8
 ; CHECK:  %3 = bitcast i128* %2 to i8*
-; CHECK:  call void @llvm.lifetime.start(i64 16, i8* %3)
+; CHECK:  call void @llvm.lifetime.start.p0i8(i64 16, i8* %3)
 ; CHECK:  store i128 %val, i128* %2, align 8
 ; CHECK:  %4 = alloca i128, align 8
 ; CHECK:  %5 = bitcast i128* %4 to i8*
-; CHECK:  call void @llvm.lifetime.start(i64 16, i8* %5)
+; CHECK:  call void @llvm.lifetime.start.p0i8(i64 16, i8* %5)
 ; CHECK:  call void @__atomic_exchange(i32 16, i8* %1, i8* %3, i8* %5, i32 5)
-; CHECK:  call void @llvm.lifetime.end(i64 16, i8* %3)
+; CHECK:  call void @llvm.lifetime.end.p0i8(i64 16, i8* %3)
 ; CHECK:  %6 = load i128, i128* %4, align 8
-; CHECK:  call void @llvm.lifetime.end(i64 16, i8* %5)
+; CHECK:  call void @llvm.lifetime.end.p0i8(i64 16, i8* %5)
 ; CHECK:  ret i128 %6
 define i128 @test_exchange_i128(i128* %arg, i128 %val) {
   %ret = atomicrmw xchg i128* %arg, i128 %val seq_cst
@@ -123,16 +123,16 @@ define i128 @test_exchange_i128(i128* %arg, i128 %val) {
 ; CHECK:  %1 = bitcast i128* %arg to i8*
 ; CHECK:  %2 = alloca i128, align 8
 ; CHECK:  %3 = bitcast i128* %2 to i8*
-; CHECK:  call void @llvm.lifetime.start(i64 16, i8* %3)
+; CHECK:  call void @llvm.lifetime.start.p0i8(i64 16, i8* %3)
 ; CHECK:  store i128 %old, i128* %2, align 8
 ; CHECK:  %4 = alloca i128, align 8
 ; CHECK:  %5 = bitcast i128* %4 to i8*
-; CHECK:  call void @llvm.lifetime.start(i64 16, i8* %5)
+; CHECK:  call void @llvm.lifetime.start.p0i8(i64 16, i8* %5)
 ; CHECK:  store i128 %new, i128* %4, align 8
 ; CHECK:  %6 = call zeroext i1 @__atomic_compare_exchange(i32 16, i8* %1, i8* %3, i8* %5, i32 5, i32 0)
-; CHECK:  call void @llvm.lifetime.end(i64 16, i8* %5)
+; CHECK:  call void @llvm.lifetime.end.p0i8(i64 16, i8* %5)
 ; CHECK:  %7 = load i128, i128* %2, align 8
-; CHECK:  call void @llvm.lifetime.end(i64 16, i8* %3)
+; CHECK:  call void @llvm.lifetime.end.p0i8(i64 16, i8* %3)
 ; CHECK:  %8 = insertvalue { i128, i1 } undef, i128 %7, 0
 ; CHECK:  %9 = insertvalue { i128, i1 } %8, i1 %6, 1
 ; CHECK:  %ret = extractvalue { i128, i1 } %9, 0
@@ -157,15 +157,15 @@ define i128 @test_cmpxchg_i128(i128* %arg, i128 %old, i128 %new) {
 ; CHECK:  %new = add i128 %loaded, %val
 ; CHECK:  %4 = bitcast i128* %arg to i8*
 ; CHECK:  %5 = bitcast i128* %1 to i8*
-; CHECK:  call void @llvm.lifetime.start(i64 16, i8* %5)
+; CHECK:  call void @llvm.lifetime.start.p0i8(i64 16, i8* %5)
 ; CHECK:  store i128 %loaded, i128* %1, align 8
 ; CHECK:  %6 = bitcast i128* %2 to i8*
-; CHECK:  call void @llvm.lifetime.start(i64 16, i8* %6)
+; CHECK:  call void @llvm.lifetime.start.p0i8(i64 16, i8* %6)
 ; CHECK:  store i128 %new, i128* %2, align 8
 ; CHECK:  %7 = call zeroext i1 @__atomic_compare_exchange(i32 16, i8* %4, i8* %5, i8* %6, i32 5, i32 5)
-; CHECK:  call void @llvm.lifetime.end(i64 16, i8* %6)
+; CHECK:  call void @llvm.lifetime.end.p0i8(i64 16, i8* %6)
 ; CHECK:  %8 = load i128, i128* %1, align 8
-; CHECK:  call void @llvm.lifetime.end(i64 16, i8* %5)
+; CHECK:  call void @llvm.lifetime.end.p0i8(i64 16, i8* %5)
 ; CHECK:  %9 = insertvalue { i128, i1 } undef, i128 %8, 0
 ; CHECK:  %10 = insertvalue { i128, i1 } %9, i1 %7, 1
 ; CHECK:  %success = extractvalue { i128, i1 } %10, 1
@@ -204,12 +204,12 @@ define void @test_store_double(double* %arg, double %val) {
 ; CHECK:   %1 = bitcast i16** %arg to i8*
 ; CHECK:   %2 = alloca i16*, align 4
 ; CHECK:   %3 = bitcast i16** %2 to i8*
-; CHECK:   call void @llvm.lifetime.start(i64 4, i8* %3)
+; CHECK:   call void @llvm.lifetime.start.p0i8(i64 4, i8* %3)
 ; CHECK:   store i16* %old, i16** %2, align 4
 ; CHECK:   %4 = ptrtoint i16* %new to i32
 ; CHECK:   %5 = call zeroext i1 @__atomic_compare_exchange_4(i8* %1, i8* %3, i32 %4, i32 5, i32 2)
 ; CHECK:   %6 = load i16*, i16** %2, align 4
-; CHECK:   call void @llvm.lifetime.end(i64 4, i8* %3)
+; CHECK:   call void @llvm.lifetime.end.p0i8(i64 4, i8* %3)
 ; CHECK:   %7 = insertvalue { i16*, i1 } undef, i16* %6, 0
 ; CHECK:   %8 = insertvalue { i16*, i1 } %7, i1 %5, 1
 ; CHECK:   %ret = extractvalue { i16*, i1 } %8, 0
@@ -227,10 +227,10 @@ define i16* @test_cmpxchg_ptr(i16** %arg, i16* %old, i16* %new) {
 ; CHECK:   %1 = bitcast fp128* %arg to i8*
 ; CHECK:  %2 = alloca fp128, align 8
 ; CHECK:  %3 = bitcast fp128* %2 to i8*
-; CHECK:  call void @llvm.lifetime.start(i64 16, i8* %3)
+; CHECK:  call void @llvm.lifetime.start.p0i8(i64 16, i8* %3)
 ; CHECK:  store fp128 %val, fp128* %2, align 8
 ; CHECK:  call void @__atomic_store(i32 16, i8* %1, i8* %3, i32 5)
-; CHECK:  call void @llvm.lifetime.end(i64 16, i8* %3)
+; CHECK:  call void @llvm.lifetime.end.p0i8(i64 16, i8* %3)
 ; CHECK:  ret void
 define void @test_store_fp128(fp128* %arg, fp128 %val) {
   store atomic fp128 %val, fp128* %arg seq_cst, align 16
diff --git a/test/Transforms/BBVectorize/X86/wr-aliases.ll b/test/Transforms/BBVectorize/X86/wr-aliases.ll
index a6ea27fc3ecb86880abf887c46880450dc6d04bf..e34414988f32a0084fc03bfbe4ab5d0485a199fa 100644
--- a/test/Transforms/BBVectorize/X86/wr-aliases.ll
+++ b/test/Transforms/BBVectorize/X86/wr-aliases.ll
@@ -14,7 +14,7 @@ declare fastcc void @_ZL12printQBezier7QBezier(%class.QBezier.15* byval nocaptur
 declare void @llvm.lifetime.start(i64, i8* nocapture) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
 
 define void @main_arrayctor.cont([10 x %class.QBezier.15]* %beziers, %class.QBezier.15* %agg.tmp.i, %class.QBezier.15* %agg.tmp55.i, %class.QBezier.15* %agg.tmp56.i) {
 newFuncRoot:
@@ -134,9 +134,9 @@ arrayctor.cont:                                   ; preds = %newFuncRoot
   call fastcc void @_ZL12printQBezier7QBezier(%class.QBezier.15* byval align 8 %agg.tmp55.i)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %v2, i8* %v3, i64 64, i32 8, i1 false)
   call fastcc void @_ZL12printQBezier7QBezier(%class.QBezier.15* byval align 8 %agg.tmp56.i)
-  call void @llvm.lifetime.end(i64 64, i8* %v0)
-  call void @llvm.lifetime.end(i64 64, i8* %v1)
-  call void @llvm.lifetime.end(i64 64, i8* %v2)
+  call void @llvm.lifetime.end.p0i8(i64 64, i8* %v0)
+  call void @llvm.lifetime.end.p0i8(i64 64, i8* %v1)
+  call void @llvm.lifetime.end.p0i8(i64 64, i8* %v2)
   br label %arrayctor.cont.ret.exitStub
 }
 
diff --git a/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll b/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll
index 6cec253bbf9b88e49f03ec38fcb1c388b53524ac..2bcb3a9d1e3d582b579cc485288700d2e192db48 100644
--- a/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll
+++ b/test/Transforms/CodeGenPrepare/AMDGPU/no-sink-addrspacecast.ll
@@ -5,7 +5,7 @@
 ; ASC-NOT: ptrtoint
 ; ASC-NOT: inttoptr
 
-define void @test_sink_ptrtoint_asc(float addrspace(1)* nocapture %arg, float addrspace(1)* nocapture readonly %arg1, float addrspace(3)* %arg2) #0 {
+define amdgpu_kernel void @test_sink_ptrtoint_asc(float addrspace(1)* nocapture %arg, float addrspace(1)* nocapture readonly %arg1, float addrspace(3)* %arg2) #0 {
 bb:
   %tmp = getelementptr inbounds float, float addrspace(3)* %arg2, i32 16
   %tmp2 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
diff --git a/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll b/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll
new file mode 100644
index 0000000000000000000000000000000000000000..dfa81b54cc3dd03e7fcd81d4b46c23494b1ffdc6
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/NVPTX/bypass-slow-div-special-cases.ll
@@ -0,0 +1,216 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -codegenprepare < %s | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+target triple = "nvptx64-nvidia-cuda"
+
+; No bypassing should be done in apparently unsuitable cases.
+define void @Test_no_bypassing(i32 %a, i64 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_no_bypassing(
+; CHECK-NEXT:    [[A_1:%.*]] = zext i32 [[A:%.*]] to i64
+; CHECK-NEXT:    [[A_2:%.*]] = sub i64 -1, [[A_1]]
+; CHECK-NEXT:    [[RES:%.*]] = srem i64 [[A_2]], [[B:%.*]]
+; CHECK-NEXT:    store i64 [[RES]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %a.1 = zext i32 %a to i64
+  ; %a.2 is always negative so the division cannot be bypassed.
+  %a.2 = sub i64 -1, %a.1
+  %res = srem i64 %a.2, %b
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+; No OR instruction is needed if one of the operands (divisor) is known
+; to fit into 32 bits.
+define void @Test_check_one_operand(i64 %a, i32 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_check_one_operand(
+; CHECK-NEXT:    [[B_1:%.*]] = zext i32 [[B:%.*]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[A:%.*]], -4294967296
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP8:%.*]]
+; CHECK:         [[TMP4:%.*]] = trunc i64 [[B_1]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[A]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    br label [[TMP10:%.*]]
+; CHECK:         [[TMP9:%.*]] = sdiv i64 [[A]], [[B_1]]
+; CHECK-NEXT:    br label [[TMP10]]
+; CHECK:         [[TMP11:%.*]] = phi i64 [ [[TMP7]], [[TMP3]] ], [ [[TMP9]], [[TMP8]] ]
+; CHECK-NEXT:    store i64 [[TMP11]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %b.1 = zext i32 %b to i64
+  %res = sdiv i64 %a, %b.1
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+; If both operands are known to fit into 32 bits, then replace the division
+; in-place without CFG modification.
+define void @Test_check_none(i64 %a, i32 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_check_none(
+; CHECK-NEXT:    [[A_1:%.*]] = and i64 [[A:%.*]], 4294967295
+; CHECK-NEXT:    [[B_1:%.*]] = zext i32 [[B:%.*]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[A_1]] to i32
+; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 [[B_1]] to i32
+; CHECK-NEXT:    [[TMP3:%.*]] = udiv i32 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[TMP4]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %a.1 = and i64 %a, 4294967295
+  %b.1 = zext i32 %b to i64
+  %res = udiv i64 %a.1, %b.1
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+; In case of unsigned long division with a short dividend,
+; the long division is not needed any more.
+define void @Test_special_case(i32 %a, i64 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_special_case(
+; CHECK-NEXT:    [[A_1:%.*]] = zext i32 [[A:%.*]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp uge i64 [[A_1]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP9:%.*]]
+; CHECK:         [[TMP3:%.*]] = trunc i64 [[B]] to i32
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[A_1]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = udiv i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = urem i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP5]] to i64
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    br label [[TMP9]]
+; CHECK:         [[TMP10:%.*]] = phi i64 [ [[TMP7]], [[TMP2]] ], [ 0, [[TMP0:%.*]] ]
+; CHECK-NEXT:    [[TMP11:%.*]] = phi i64 [ [[TMP8]], [[TMP2]] ], [ [[A_1]], [[TMP0]] ]
+; CHECK-NEXT:    [[RES:%.*]] = add i64 [[TMP10]], [[TMP11]]
+; CHECK-NEXT:    store i64 [[RES]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %a.1 = zext i32 %a to i64
+  %div = udiv i64 %a.1, %b
+  %rem = urem i64 %a.1, %b
+  %res = add i64 %div, %rem
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+
+; Do not bypass a division if one of the operands looks like a hash value.
+define void @Test_dont_bypass_xor(i64 %a, i64 %b, i64 %l, i64* %retptr) {
+; CHECK-LABEL: @Test_dont_bypass_xor(
+; CHECK-NEXT:    [[C:%.*]] = xor i64 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[RES:%.*]] = udiv i64 [[C]], [[L:%.*]]
+; CHECK-NEXT:    store i64 [[RES]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %c = xor i64 %a, %b
+  %res = udiv i64 %c, %l
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+define void @Test_dont_bypass_phi_xor(i64 %a, i64 %b, i64 %l, i64* %retptr) {
+; CHECK-LABEL: @Test_dont_bypass_phi_xor(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[B:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[MERGE:%.*]], label [[XORPATH:%.*]]
+; CHECK:       xorpath:
+; CHECK-NEXT:    [[C:%.*]] = xor i64 [[A:%.*]], [[B]]
+; CHECK-NEXT:    br label [[MERGE]]
+; CHECK:       merge:
+; CHECK-NEXT:    [[E:%.*]] = phi i64 [ undef, [[ENTRY:%.*]] ], [ [[C]], [[XORPATH]] ]
+; CHECK-NEXT:    [[RES:%.*]] = sdiv i64 [[E]], [[L:%.*]]
+; CHECK-NEXT:    store i64 [[RES]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp = icmp eq i64 %b, 0
+  br i1 %cmp, label %merge, label %xorpath
+
+xorpath:
+  %c = xor i64 %a, %b
+  br label %merge
+
+merge:
+  %e = phi i64 [ undef, %entry ], [ %c, %xorpath ]
+  %res = sdiv i64 %e, %l
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+define void @Test_dont_bypass_mul_long_const(i64 %a, i64 %l, i64* %retptr) {
+; CHECK-LABEL: @Test_dont_bypass_mul_long_const(
+; CHECK-NEXT:    [[C:%.*]] = mul i64 [[A:%.*]], 5229553307
+; CHECK-NEXT:    [[RES:%.*]] = urem i64 [[C]], [[L:%.*]]
+; CHECK-NEXT:    store i64 [[RES]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %c = mul i64 %a, 5229553307 ; the constant doesn't fit 32 bits
+  %res = urem i64 %c, %l
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+define void @Test_bypass_phi_mul_const(i64 %a, i64 %b, i64* %retptr) {
+; CHECK-LABEL: @Test_bypass_phi_mul_const(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[A_MUL:%.*]] = mul nsw i64 [[A:%.*]], 34806414968801
+; CHECK-NEXT:    [[P:%.*]] = icmp sgt i64 [[A]], [[B:%.*]]
+; CHECK-NEXT:    br i1 [[P]], label [[BRANCH:%.*]], label [[MERGE:%.*]]
+; CHECK:       branch:
+; CHECK-NEXT:    br label [[MERGE]]
+; CHECK:       merge:
+; CHECK-NEXT:    [[LHS:%.*]] = phi i64 [ 42, [[BRANCH]] ], [ [[A_MUL]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = or i64 [[LHS]], [[B]]
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 [[TMP0]], -4294967296
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 0
+; CHECK-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP8:%.*]]
+; CHECK:         [[TMP4:%.*]] = trunc i64 [[B]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[LHS]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = udiv i32 [[TMP5]], [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP6]] to i64
+; CHECK-NEXT:    br label [[TMP10:%.*]]
+; CHECK:         [[TMP9:%.*]] = sdiv i64 [[LHS]], [[B]]
+; CHECK-NEXT:    br label [[TMP10]]
+; CHECK:         [[TMP11:%.*]] = phi i64 [ [[TMP7]], [[TMP3]] ], [ [[TMP9]], [[TMP8]] ]
+; CHECK-NEXT:    store i64 [[TMP11]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+entry:
+  %a.mul = mul nsw i64 %a, 34806414968801
+  %p = icmp sgt i64 %a, %b
+  br i1 %p, label %branch, label %merge
+
+branch:
+  br label %merge
+
+merge:
+  %lhs = phi i64 [ 42, %branch ], [ %a.mul, %entry ]
+  %res = sdiv i64 %lhs, %b
+  store i64 %res, i64* %retptr
+  ret void
+}
+
+define void @Test_bypass_mul_short_const(i64 %a, i64 %l, i64* %retptr) {
+; CHECK-LABEL: @Test_bypass_mul_short_const(
+; CHECK-NEXT:    [[C:%.*]] = mul i64 [[A:%.*]], -42
+; CHECK-NEXT:    [[TMP1:%.*]] = or i64 [[C]], [[L:%.*]]
+; CHECK-NEXT:    [[TMP2:%.*]] = and i64 [[TMP1]], -4294967296
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i64 [[TMP2]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP9:%.*]]
+; CHECK:         [[TMP5:%.*]] = trunc i64 [[L]] to i32
+; CHECK-NEXT:    [[TMP6:%.*]] = trunc i64 [[C]] to i32
+; CHECK-NEXT:    [[TMP7:%.*]] = urem i32 [[TMP6]], [[TMP5]]
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT:    br label [[TMP11:%.*]]
+; CHECK:         [[TMP10:%.*]] = urem i64 [[C]], [[L]]
+; CHECK-NEXT:    br label [[TMP11]]
+; CHECK:         [[TMP12:%.*]] = phi i64 [ [[TMP8]], [[TMP4]] ], [ [[TMP10]], [[TMP9]] ]
+; CHECK-NEXT:    store i64 [[TMP12]], i64* [[RETPTR:%.*]]
+; CHECK-NEXT:    ret void
+;
+  %c = mul i64 %a, -42
+  %res = urem i64 %c, %l
+  store i64 %res, i64* %retptr
+  ret void
+}
diff --git a/test/Transforms/CodeGenPrepare/X86/computedgoto.ll b/test/Transforms/CodeGenPrepare/X86/computedgoto.ll
new file mode 100644
index 0000000000000000000000000000000000000000..00a4df9b2c59ab57543ebbec264ce3ede8c2aaf7
--- /dev/null
+++ b/test/Transforms/CodeGenPrepare/X86/computedgoto.ll
@@ -0,0 +1,294 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -codegenprepare -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare void @use(i32) local_unnamed_addr
+declare void @useptr([2 x i8*]*) local_unnamed_addr
+
+; CHECK: @simple.targets = constant [2 x i8*] [i8* blockaddress(@simple, %bb0), i8* blockaddress(@simple, %bb1)], align 16
+@simple.targets = constant [2 x i8*] [i8* blockaddress(@simple, %bb0), i8* blockaddress(@simple, %bb1)], align 16
+
+; CHECK: @multi.targets = constant [2 x i8*] [i8* blockaddress(@multi, %bb0), i8* blockaddress(@multi, %bb1)], align 16
+@multi.targets = constant [2 x i8*] [i8* blockaddress(@multi, %bb0), i8* blockaddress(@multi, %bb1)], align 16
+
+; CHECK: @loop.targets = constant [2 x i8*] [i8* blockaddress(@loop, %bb0), i8* blockaddress(@loop, %bb1)], align 16
+@loop.targets = constant [2 x i8*] [i8* blockaddress(@loop, %bb0), i8* blockaddress(@loop, %bb1)], align 16
+
+; CHECK: @nophi.targets = constant [2 x i8*] [i8* blockaddress(@nophi, %bb0), i8* blockaddress(@nophi, %bb1)], align 16
+@nophi.targets = constant [2 x i8*] [i8* blockaddress(@nophi, %bb0), i8* blockaddress(@nophi, %bb1)], align 16
+
+; CHECK: @noncritical.targets = constant [2 x i8*] [i8* blockaddress(@noncritical, %bb0), i8* blockaddress(@noncritical, %bb1)], align 16
+@noncritical.targets = constant [2 x i8*] [i8* blockaddress(@noncritical, %bb0), i8* blockaddress(@noncritical, %bb1)], align 16
+
+; Check that we break the critical edge when a jump table has only one use.
+define void @simple(i32* nocapture readonly %p) {
+; CHECK-LABEL: @simple(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT:    [[INITVAL:%.*]] = load i32, i32* [[P]], align 4
+; CHECK-NEXT:    [[INITOP:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    switch i32 [[INITOP]], label [[EXIT:%.*]] [
+; CHECK-NEXT:    i32 0, label [[BB0_CLONE:%.*]]
+; CHECK-NEXT:    i32 1, label [[BB1_CLONE:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       bb0:
+; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
+; CHECK:       .split:
+; CHECK-NEXT:    [[MERGE:%.*]] = phi i32* [ [[PTR:%.*]], [[BB0:%.*]] ], [ [[INCDEC_PTR]], [[BB0_CLONE]] ]
+; CHECK-NEXT:    [[MERGE2:%.*]] = phi i32 [ 0, [[BB0]] ], [ [[INITVAL]], [[BB0_CLONE]] ]
+; CHECK-NEXT:    tail call void @use(i32 [[MERGE2]])
+; CHECK-NEXT:    br label [[INDIRECTGOTO:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[DOTSPLIT3:%.*]]
+; CHECK:       .split3:
+; CHECK-NEXT:    [[MERGE5:%.*]] = phi i32* [ [[PTR]], [[BB1:%.*]] ], [ [[INCDEC_PTR]], [[BB1_CLONE]] ]
+; CHECK-NEXT:    [[MERGE7:%.*]] = phi i32 [ 1, [[BB1]] ], [ [[INITVAL]], [[BB1_CLONE]] ]
+; CHECK-NEXT:    tail call void @use(i32 [[MERGE7]])
+; CHECK-NEXT:    br label [[INDIRECTGOTO]]
+; CHECK:       indirectgoto:
+; CHECK-NEXT:    [[P_ADDR_SINK:%.*]] = phi i32* [ [[MERGE5]], [[DOTSPLIT3]] ], [ [[MERGE]], [[DOTSPLIT]] ]
+; CHECK-NEXT:    [[PTR]] = getelementptr inbounds i32, i32* [[P_ADDR_SINK]], i64 1
+; CHECK-NEXT:    [[NEWP:%.*]] = load i32, i32* [[P_ADDR_SINK]], align 4
+; CHECK-NEXT:    [[IDX:%.*]] = sext i32 [[NEWP]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @simple.targets, i64 0, i64 [[IDX]]
+; CHECK-NEXT:    [[NEWOP:%.*]] = load i8*, i8** [[ARRAYIDX]], align 8
+; CHECK-NEXT:    indirectbr i8* [[NEWOP]], [label [[BB0]], label %bb1]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+; CHECK:       bb0.clone:
+; CHECK-NEXT:    br label [[DOTSPLIT]]
+; CHECK:       bb1.clone:
+; CHECK-NEXT:    br label [[DOTSPLIT3]]
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %p, i64 1
+  %initval = load i32, i32* %p, align 4
+  %initop = load i32, i32* %incdec.ptr, align 4
+  switch i32 %initop, label %exit [
+  i32 0, label %bb0
+  i32 1, label %bb1
+  ]
+
+bb0:
+  %p.addr.0 = phi i32* [ %incdec.ptr, %entry ], [ %ptr, %indirectgoto ]
+  %opcode.0 = phi i32 [ %initval, %entry ], [ 0, %indirectgoto ]
+  tail call void @use(i32 %opcode.0)
+  br label %indirectgoto
+
+bb1:
+  %p.addr.1 = phi i32* [ %incdec.ptr, %entry ], [ %ptr, %indirectgoto ]
+  %opcode.1 = phi i32 [ %initval, %entry ], [ 1, %indirectgoto ]
+  tail call void @use(i32 %opcode.1)
+  br label %indirectgoto
+
+indirectgoto:
+  %p.addr.sink = phi i32* [ %p.addr.1, %bb1 ], [ %p.addr.0, %bb0 ]
+  %ptr = getelementptr inbounds i32, i32* %p.addr.sink, i64 1
+  %newp = load i32, i32* %p.addr.sink, align 4
+  %idx = sext i32 %newp to i64
+  %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* @simple.targets, i64 0, i64 %idx
+  %newop = load i8*, i8** %arrayidx, align 8
+  indirectbr i8* %newop, [label %bb0, label %bb1]
+
+exit:
+  ret void
+}
+
+; Don't try to break critical edges when several indirectbr point to a single block
+define void @multi(i32* nocapture readonly %p) {
+; CHECK-LABEL: @multi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT:    [[INITVAL:%.*]] = load i32, i32* [[P]], align 4
+; CHECK-NEXT:    [[INITOP:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    switch i32 [[INITOP]], label [[EXIT:%.*]] [
+; CHECK-NEXT:    i32 0, label [[BB0:%.*]]
+; CHECK-NEXT:    i32 1, label [[BB1:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       bb0:
+; CHECK-NEXT:    [[P_ADDR_0:%.*]] = phi i32* [ [[INCDEC_PTR]], [[ENTRY:%.*]] ], [ [[NEXT0:%.*]], [[BB0]] ], [ [[NEXT1:%.*]], [[BB1]] ]
+; CHECK-NEXT:    [[OPCODE_0:%.*]] = phi i32 [ [[INITVAL]], [[ENTRY]] ], [ 0, [[BB0]] ], [ 1, [[BB1]] ]
+; CHECK-NEXT:    tail call void @use(i32 [[OPCODE_0]])
+; CHECK-NEXT:    [[NEXT0]] = getelementptr inbounds i32, i32* [[P_ADDR_0]], i64 1
+; CHECK-NEXT:    [[NEWP0:%.*]] = load i32, i32* [[P_ADDR_0]], align 4
+; CHECK-NEXT:    [[IDX0:%.*]] = sext i32 [[NEWP0]] to i64
+; CHECK-NEXT:    [[ARRAYIDX0:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @multi.targets, i64 0, i64 [[IDX0]]
+; CHECK-NEXT:    [[NEWOP0:%.*]] = load i8*, i8** [[ARRAYIDX0]], align 8
+; CHECK-NEXT:    indirectbr i8* [[NEWOP0]], [label [[BB0]], label %bb1]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[P_ADDR_1:%.*]] = phi i32* [ [[INCDEC_PTR]], [[ENTRY]] ], [ [[NEXT0]], [[BB0]] ], [ [[NEXT1]], [[BB1]] ]
+; CHECK-NEXT:    [[OPCODE_1:%.*]] = phi i32 [ [[INITVAL]], [[ENTRY]] ], [ 0, [[BB0]] ], [ 1, [[BB1]] ]
+; CHECK-NEXT:    tail call void @use(i32 [[OPCODE_1]])
+; CHECK-NEXT:    [[NEXT1]] = getelementptr inbounds i32, i32* [[P_ADDR_1]], i64 1
+; CHECK-NEXT:    [[NEWP1:%.*]] = load i32, i32* [[P_ADDR_1]], align 4
+; CHECK-NEXT:    [[IDX1:%.*]] = sext i32 [[NEWP1]] to i64
+; CHECK-NEXT:    [[ARRAYIDX1:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @multi.targets, i64 0, i64 [[IDX1]]
+; CHECK-NEXT:    [[NEWOP1:%.*]] = load i8*, i8** [[ARRAYIDX1]], align 8
+; CHECK-NEXT:    indirectbr i8* [[NEWOP1]], [label [[BB0]], label %bb1]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %p, i64 1
+  %initval = load i32, i32* %p, align 4
+  %initop = load i32, i32* %incdec.ptr, align 4
+  switch i32 %initop, label %exit [
+  i32 0, label %bb0
+  i32 1, label %bb1
+  ]
+
+bb0:
+  %p.addr.0 = phi i32* [ %incdec.ptr, %entry ], [ %next0, %bb0 ], [ %next1, %bb1 ]
+  %opcode.0 = phi i32 [ %initval, %entry ], [ 0, %bb0 ], [ 1, %bb1 ]
+  tail call void @use(i32 %opcode.0)
+  %next0 = getelementptr inbounds i32, i32* %p.addr.0, i64 1
+  %newp0 = load i32, i32* %p.addr.0, align 4
+  %idx0 = sext i32 %newp0 to i64
+  %arrayidx0 = getelementptr inbounds [2 x i8*], [2 x i8*]* @multi.targets, i64 0, i64 %idx0
+  %newop0 = load i8*, i8** %arrayidx0, align 8
+  indirectbr i8* %newop0, [label %bb0, label %bb1]
+
+bb1:
+  %p.addr.1 = phi i32* [ %incdec.ptr, %entry ], [ %next0, %bb0 ], [ %next1, %bb1 ]
+  %opcode.1 = phi i32 [ %initval, %entry ], [ 0, %bb0 ], [ 1, %bb1 ]
+  tail call void @use(i32 %opcode.1)
+  %next1 = getelementptr inbounds i32, i32* %p.addr.1, i64 1
+  %newp1 = load i32, i32* %p.addr.1, align 4
+  %idx1 = sext i32 %newp1 to i64
+  %arrayidx1 = getelementptr inbounds [2 x i8*], [2 x i8*]* @multi.targets, i64 0, i64 %idx1
+  %newop1 = load i8*, i8** %arrayidx1, align 8
+  indirectbr i8* %newop1, [label %bb0, label %bb1]
+
+exit:
+  ret void
+}
+
+; Make sure we do the right thing for cases where the indirectbr branches to
+; the block it terminates.
+define void @loop(i64* nocapture readonly %p) {
+; CHECK-LABEL: @loop(
+; CHECK-NEXT:  bb0.clone:
+; CHECK-NEXT:    br label [[DOTSPLIT:%.*]]
+; CHECK:       bb0:
+; CHECK-NEXT:    br label [[DOTSPLIT]]
+; CHECK:       .split:
+; CHECK-NEXT:    [[MERGE:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[BB0:%.*]] ], [ 0, [[BB0_CLONE:%.*]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i64, i64* [[P:%.*]], i64 [[MERGE]]
+; CHECK-NEXT:    store i64 [[MERGE]], i64* [[TMP0]], align 4
+; CHECK-NEXT:    [[I_NEXT]] = add nuw nsw i64 [[MERGE]], 1
+; CHECK-NEXT:    [[IDX:%.*]] = srem i64 [[MERGE]], 2
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @loop.targets, i64 0, i64 [[IDX]]
+; CHECK-NEXT:    [[TARGET:%.*]] = load i8*, i8** [[ARRAYIDX]], align 8
+; CHECK-NEXT:    indirectbr i8* [[TARGET]], [label [[BB0]], label %bb1]
+; CHECK:       bb1:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %bb0
+
+bb0:
+  %i = phi i64 [ %i.next, %bb0 ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i64, i64* %p, i64 %i
+  store i64 %i, i64* %tmp0, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %idx = srem i64 %i, 2
+  %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* @loop.targets, i64 0, i64 %idx
+  %target = load i8*, i8** %arrayidx, align 8
+  indirectbr i8* %target, [label %bb0, label %bb1]
+
+bb1:
+  ret void
+}
+
+; Don't do anything for cases that contain no phis.
+define void @nophi(i32* %p) {
+; CHECK-LABEL: @nophi(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT:    [[INITOP:%.*]] = load i32, i32* [[INCDEC_PTR]], align 4
+; CHECK-NEXT:    switch i32 [[INITOP]], label [[EXIT:%.*]] [
+; CHECK-NEXT:    i32 0, label [[BB0:%.*]]
+; CHECK-NEXT:    i32 1, label [[BB1:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       bb0:
+; CHECK-NEXT:    tail call void @use(i32 0)
+; CHECK-NEXT:    br label [[INDIRECTGOTO:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    tail call void @use(i32 1)
+; CHECK-NEXT:    br label [[INDIRECTGOTO]]
+; CHECK:       indirectgoto:
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to i8*
+; CHECK-NEXT:    [[SUNKADDR:%.*]] = getelementptr i8, i8* [[TMP0]], i64 4
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[SUNKADDR]] to i32*
+; CHECK-NEXT:    [[NEWP:%.*]] = load i32, i32* [[TMP1]], align 4
+; CHECK-NEXT:    [[IDX:%.*]] = sext i32 [[NEWP]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds [2 x i8*], [2 x i8*]* @nophi.targets, i64 0, i64 [[IDX]]
+; CHECK-NEXT:    [[NEWOP:%.*]] = load i8*, i8** [[ARRAYIDX]], align 8
+; CHECK-NEXT:    indirectbr i8* [[NEWOP]], [label [[BB0]], label %bb1]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %incdec.ptr = getelementptr inbounds i32, i32* %p, i64 1
+  %initop = load i32, i32* %incdec.ptr, align 4
+  switch i32 %initop, label %exit [
+  i32 0, label %bb0
+  i32 1, label %bb1
+  ]
+
+bb0:
+  tail call void @use(i32 0)  br label %indirectgoto
+
+bb1:
+  tail call void @use(i32 1)
+  br label %indirectgoto
+
+indirectgoto:
+  %newp = load i32, i32* %incdec.ptr, align 4
+  %idx = sext i32 %newp to i64
+  %arrayidx = getelementptr inbounds [2 x i8*], [2 x i8*]* @nophi.targets, i64 0, i64 %idx
+  %newop = load i8*, i8** %arrayidx, align 8
+  indirectbr i8* %newop, [label %bb0, label %bb1]
+
+exit:
+  ret void
+}
+
+; Don't do anything if the edge isn't critical.
+define i32 @noncritical(i32 %k, i8* %p)
+; CHECK-LABEL: @noncritical(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[D:%.*]] = add i32 [[K:%.*]], 1
+; CHECK-NEXT:    indirectbr i8* [[P:%.*]], [label [[BB0:%.*]], label %bb1]
+; CHECK:       bb0:
+; CHECK-NEXT:    [[R0:%.*]] = sub i32 [[K]], [[D]]
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[R1:%.*]] = sub i32 [[D]], [[K]]
+; CHECK-NEXT:    br label [[EXIT]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[V:%.*]] = phi i32 [ [[R0]], [[BB0]] ], [ [[R1]], [[BB1:%.*]] ]
+; CHECK-NEXT:    ret i32 0
+;
+{
+entry:
+  %d = add i32 %k, 1
+  indirectbr i8* %p, [label %bb0, label %bb1]
+
+bb0:
+  %v00 = phi i32 [%k, %entry]
+  %v01 = phi i32 [%d, %entry]
+  %r0 = sub i32 %v00, %v01
+  br label %exit
+
+bb1:
+  %v10 = phi i32 [%d, %entry]
+  %v11 = phi i32 [%k, %entry]
+  %r1 = sub i32 %v10, %v11
+  br label %exit
+
+exit:
+  %v = phi i32 [%r0, %bb0], [%r1, %bb1]
+  ret i32 0
+}
diff --git a/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll b/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
index 5c0b5f3839d074024d16ac55e476a58e1042b141..9d6e668167fbbef72288b927d8e1373cecb44724 100644
--- a/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
+++ b/test/Transforms/CodeGenPrepare/X86/sink-addrmode.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-unknown-linux-gnu"
 ; Can we sink single addressing mode computation to use?
 define void @test1(i1 %cond, i64* %base) {
 ; CHECK-LABEL: @test1
-; CHECK: add i64 {{.+}}, 40
+; CHECK: getelementptr i8, {{.+}} 40
 entry:
   %addr = getelementptr inbounds i64, i64* %base, i64 5
   %casted = bitcast i64* %addr to i32*
@@ -33,7 +33,7 @@ entry:
 
 if.then:
 ; CHECK-LABEL: if.then:
-; CHECK: add i64 {{.+}}, 40
+; CHECK: getelementptr i8, {{.+}} 40
   %v1 = load i32, i32* %casted, align 4
   call void @foo(i32 %v1)
   %cmp = icmp eq i32 %v1, 0
@@ -41,7 +41,7 @@ if.then:
 
 next:
 ; CHECK-LABEL: next:
-; CHECK: add i64 {{.+}}, 40
+; CHECK: getelementptr i8, {{.+}} 40
   %v2 = load i32, i32* %casted, align 4
   call void @foo(i32 %v2)
   br label %fallthrough
@@ -61,10 +61,10 @@ entry:
 
 if.then:
 ; CHECK-LABEL: if.then:
-; CHECK: add i64 {{.+}}, 40
+; CHECK: getelementptr i8, {{.+}} 40
   %v1 = load i32, i32* %casted, align 4
   call void @foo(i32 %v1)
-; CHECK-NOT: add i64 {{.+}}, 40
+; CHECK-NOT: getelementptr i8, {{.+}} 40
   %v2 = load i32, i32* %casted, align 4
   call void @foo(i32 %v2)
   br label %fallthrough
@@ -84,7 +84,7 @@ entry:
 
 if.then:
 ; CHECK-LABEL: if.then:
-; CHECK: add i64 {{.+}}, 40
+; CHECK: getelementptr i8, {{.+}} 40
   %v1 = load i32, i32* %casted, align 4
   call void @foo(i32 %v1)
   %cmp = icmp eq i32 %v1, 0
@@ -95,7 +95,7 @@ fallthrough:
 
 rare.1:
 ; CHECK-LABEL: rare.1:
-; CHECK: add i64 {{.+}}, 40
+; CHECK: getelementptr i8, {{.+}} 40
   call void @slowpath(i32 %v1, i32* %casted) cold
   br label %fallthrough
 }
@@ -111,7 +111,7 @@ entry:
 
 if.then:
 ; CHECK-LABEL: if.then:
-; CHECK-NOT: add i64 {{.+}}, 40
+; CHECK-NOT: getelementptr i8, {{.+}} 40
   %v1 = load i32, i32* %casted, align 4
   call void @foo(i32 %v1)
   %cmp = icmp eq i32 %v1, 0
@@ -136,7 +136,7 @@ entry:
 
 if.then:
 ; CHECK-LABEL: if.then:
-; CHECK-NOT: add i64 {{.+}}, 40
+; CHECK-NOT: getelementptr i8, {{.+}} 40
   %v1 = load i32, i32* %casted, align 4
   call void @foo(i32 %v1)
   %cmp = icmp eq i32 %v1, 0
@@ -162,7 +162,7 @@ entry:
 
 if.then:
 ; CHECK-LABEL: if.then:
-; CHECK: add i64 {{.+}}, 40
+; CHECK: getelementptr i8, {{.+}} 40
   %v1 = load i32, i32* %casted, align 4
   call void @foo(i32 %v1)
   %cmp = icmp eq i32 %v1, 0
@@ -170,7 +170,7 @@ if.then:
 
 next:
 ; CHECK-LABEL: next:
-; CHECK: add i64 {{.+}}, 40
+; CHECK: getelementptr i8, {{.+}} 40
   %v2 = load i32, i32* %casted, align 4
   call void @foo(i32 %v2)
   %cmp2 = icmp eq i32 %v2, 0
@@ -181,13 +181,13 @@ fallthrough:
 
 rare.1:
 ; CHECK-LABEL: rare.1:
-; CHECK: add i64 {{.+}}, 40
+; CHECK: getelementptr i8, {{.+}} 40
   call void @slowpath(i32 %v1, i32* %casted) cold
   br label %next
 
 rare.2:
 ; CHECK-LABEL: rare.2:
-; CHECK: add i64 {{.+}}, 40
+; CHECK: getelementptr i8, {{.+}} 40
   call void @slowpath(i32 %v2, i32* %casted) cold
   br label %fallthrough
 }
diff --git a/test/Transforms/CodeGenPrepare/X86/sink-addrspacecast.ll b/test/Transforms/CodeGenPrepare/X86/sink-addrspacecast.ll
index c9f49b5d4f86a58634908fa67c2317911fc74520..31f0ca239e3a3e11dfbd3b8896f3bb54f5a67b51 100644
--- a/test/Transforms/CodeGenPrepare/X86/sink-addrspacecast.ll
+++ b/test/Transforms/CodeGenPrepare/X86/sink-addrspacecast.ll
@@ -1,11 +1,12 @@
-; RUN: opt -S -codegenprepare < %s | FileCheck %s
+; RUN: opt -S -codegenprepare < %s | FileCheck %s -check-prefix=CHECK -check-prefix=GEP
 
 target datalayout =
 "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
 target triple = "x86_64-unknown-linux-gnu"
 
 ; CHECK-LABEL: @load_cast_gep
-; CHECK: add i64 %sunkaddr, 40
+; GEP: [[CAST:%[0-9]+]] = addrspacecast i64* %base to i8 addrspace(1)*
+; GEP: getelementptr i8, i8 addrspace(1)* [[CAST]], i64 40
 define void @load_cast_gep(i1 %cond, i64* %base) {
 entry:
   %addr = getelementptr inbounds i64, i64* %base, i64 5
@@ -21,7 +22,8 @@ fallthrough:
 }
 
 ; CHECK-LABEL: @store_gep_cast
-; CHECK: add i64 %sunkaddr, 20
+; GEP: [[CAST:%[0-9]+]] = addrspacecast i64* %base to i8 addrspace(1)*
+; GEP: getelementptr i8, i8 addrspace(1)* [[CAST]], i64 20
 define void @store_gep_cast(i1 %cond, i64* %base) {
 entry:
   %casted = addrspacecast i64* %base to i32 addrspace(1)*
diff --git a/test/Transforms/CodeGenPrepare/basic.ll b/test/Transforms/CodeGenPrepare/basic.ll
index 495d910b5cd62008a6e41f6791e55cf2400e462c..2e58de7d093469a58c3119183c9ce6766f3a6f97 100644
--- a/test/Transforms/CodeGenPrepare/basic.ll
+++ b/test/Transforms/CodeGenPrepare/basic.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-apple-darwin10.0.0"
 ; rdar://8785296
 define i32 @test1(i8* %ptr) nounwind ssp noredzone align 2 {
 entry:
-  %0 = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false)
+  %0 = tail call i64 @llvm.objectsize.i64(i8* %ptr, i1 false, i1 false)
   %1 = icmp ugt i64 %0, 3
   br i1 %1, label %T, label %trap
 
@@ -25,6 +25,44 @@ T:
   ret i32 4
 }
 
-declare i64 @llvm.objectsize.i64(i8*, i1) nounwind readonly
+; CHECK-LABEL: @test_objectsize_null_flag(
+define i64 @test_objectsize_null_flag(i8* %ptr) {
+entry:
+  ; CHECK: ret i64 -1
+  %0 = tail call i64 @llvm.objectsize.i64(i8* null, i1 false, i1 true)
+  ret i64 %0
+}
+
+; CHECK-LABEL: @test_objectsize_null_flag_min(
+define i64 @test_objectsize_null_flag_min(i8* %ptr) {
+entry:
+  ; CHECK: ret i64 0
+  %0 = tail call i64 @llvm.objectsize.i64(i8* null, i1 true, i1 true)
+  ret i64 %0
+}
+
+; Test foldable null pointers because we evaluate them with non-exact modes in
+; CodeGenPrepare.
+; CHECK-LABEL: @test_objectsize_null_flag_noas0(
+define i64 @test_objectsize_null_flag_noas0() {
+entry:
+  ; CHECK: ret i64 0
+  %0 = tail call i64 @llvm.objectsize.i64.p1i8(i8 addrspace(1)* null, i1 false,
+                                               i1 true)
+  ret i64 %0
+}
+
+; CHECK-LABEL: @test_objectsize_null_flag_min_noas0(
+define i64 @test_objectsize_null_flag_min_noas0() {
+entry:
+  ; CHECK: ret i64 0
+  %0 = tail call i64 @llvm.objectsize.i64.p1i8(i8 addrspace(1)* null, i1 true,
+                                               i1 true)
+  ret i64 %0
+}
+
+
+declare i64 @llvm.objectsize.i64(i8*, i1, i1) nounwind readonly
+declare i64 @llvm.objectsize.i64.p1i8(i8 addrspace(1)*, i1, i1) nounwind readonly
 
 declare void @llvm.trap() nounwind
diff --git a/test/Transforms/CodeGenPrepare/builtin-condition.ll b/test/Transforms/CodeGenPrepare/builtin-condition.ll
index 0d41e9e1eddbbab75962605445a5937b9881685e..e42529a7b9a11981cfe4f52d55e64f65ff682ee4 100644
--- a/test/Transforms/CodeGenPrepare/builtin-condition.ll
+++ b/test/Transforms/CodeGenPrepare/builtin-condition.ll
@@ -74,39 +74,39 @@ entry:
   %chararray = alloca [30 x i8], align 16
   %chararray2 = alloca [10 x i8], align 1
   %0 = getelementptr inbounds [30 x i8], [30 x i8]* %chararray, i64 0, i64 0
-  call void @llvm.lifetime.start(i64 30, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 30, i8* %0)
   %1 = getelementptr inbounds [10 x i8], [10 x i8]* %chararray2, i64 0, i64 0
-  call void @llvm.lifetime.start(i64 10, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 10, i8* %1)
   %tobool = icmp eq i32 %flag, 0
   %cptr.0 = select i1 %tobool, i8* %0, i8* %1
   %2 = call i64 @llvm.objectsize.i64.p0i8(i8* %cptr.0, i1 true)
-  call void @llvm.lifetime.end(i64 10, i8* %1)
-  call void @llvm.lifetime.end(i64 30, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 10, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 30, i8* %0)
   ret i64 %2
 ; CHECK-LABEL: foo1
 ; CHECK:  ret i64 10
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
 
 declare i64 @llvm.objectsize.i64.p0i8(i8*, i1)
 
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 
 define i64 @foo2(i32 %n) {
 entry:
   %Small = alloca [10 x i8], align 1
   %Large = alloca [20 x i8], align 16
   %0 = getelementptr inbounds [10 x i8], [10 x i8]* %Small, i64 0, i64 0
-  call void @llvm.lifetime.start(i64 10, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 10, i8* %0)
   %1 = getelementptr inbounds [20 x i8], [20 x i8]* %Large, i64 0, i64 0
-  call void @llvm.lifetime.start(i64 20, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 20, i8* %1)
   %tobool = icmp ne i32 %n, 0
   %add.ptr = getelementptr inbounds [20 x i8], [20 x i8]* %Large, i64 0, i64 19
   %cond = select i1 %tobool, i8* %0, i8* %add.ptr
   %2 = call i64 @llvm.objectsize.i64.p0i8(i8* %cond, i1 false)
-  call void @llvm.lifetime.end(i64 20, i8* %1)
-  call void @llvm.lifetime.end(i64 10, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 20, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 10, i8* %0)
   ret i64 %2
 ; CHECK-LABEL: foo2
 ; CHECK:  ret i64 10
diff --git a/test/Transforms/CodeGenPrepare/section.ll b/test/Transforms/CodeGenPrepare/section.ll
index 795c45c220dbf9c29f0895595ee4b5fb7ab7818e..2c96612e1bafafab82dabba7deea0d6d5e01b4e9 100644
--- a/test/Transforms/CodeGenPrepare/section.ll
+++ b/test/Transforms/CodeGenPrepare/section.ll
@@ -5,12 +5,32 @@ target triple = "x86_64-pc-linux-gnu"
 ; This tests that hot/cold functions get correct section prefix assigned
 
 ; CHECK: hot_func{{.*}}!section_prefix ![[HOT_ID:[0-9]+]]
+; The entry is hot
 define void @hot_func() !prof !15 {
   ret void
 }
 
+; CHECK: hot_call_func{{.*}}!section_prefix ![[HOT_ID]]
+; The sum of the 2 callsites is hot
+define void @hot_call_func() !prof !16 {
+  call void @hot_func(), !prof !17
+  call void @hot_func(), !prof !17
+  ret void
+}
+
+; CHECK-NOT: normal_func{{.*}}!section_prefix
+; The sum of all callsites is neither hot nor cold
+define void @normal_func() !prof !16 {
+  call void @hot_func(), !prof !17
+  call void @hot_func(), !prof !18
+  call void @hot_func(), !prof !18
+  ret void
+}
+
 ; CHECK: cold_func{{.*}}!section_prefix ![[COLD_ID:[0-9]+]]
+; The entry and the callsite are both cold
 define void @cold_func() !prof !16 {
+  call void @hot_func(), !prof !18
   ret void
 }
 
@@ -33,3 +53,5 @@ define void @cold_func() !prof !16 {
 !14 = !{i32 999999, i64 1, i32 2}
 !15 = !{!"function_entry_count", i64 1000}
 !16 = !{!"function_entry_count", i64 1}
+!17 = !{!"branch_weights", i32 80}
+!18 = !{!"branch_weights", i32 1}
diff --git a/test/Transforms/ConstProp/loads.ll b/test/Transforms/ConstProp/loads.ll
index 89387ad06ba8ca3ff8eb97e447339cd6ecd84cdd..dce2068a8d5563c0534650c6376d9f2449201e29 100644
--- a/test/Transforms/ConstProp/loads.ll
+++ b/test/Transforms/ConstProp/loads.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -default-data-layout="e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=LE
-; RUN: opt < %s -default-data-layout="E-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=BE
+; RUN: opt < %s -data-layout="e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=LE
+; RUN: opt < %s -data-layout="E-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64" -instcombine -S | FileCheck %s --check-prefix=BE
 
 ; {{ 0xDEADBEEF, 0xBA }, 0xCAFEBABE}
 @g1 = constant {{i32,i8},i32} {{i32,i8} { i32 -559038737, i8 186 }, i32 -889275714 }
diff --git a/test/Transforms/ConstantHoisting/X86/ehpad.ll b/test/Transforms/ConstantHoisting/X86/ehpad.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3178e87f7548bcf49d8f2491039322761a7debb4
--- /dev/null
+++ b/test/Transforms/ConstantHoisting/X86/ehpad.ll
@@ -0,0 +1,62 @@
+; RUN: opt -S -consthoist < %s | FileCheck %s
+
+; FIXME: The catchpad doesn't even use the constant, so a better fix would be to
+; insert the bitcast in the catchpad block.
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc"
+
+; CHECK-LABEL: define i32 @main
+; CHECK: %tobool = icmp eq i32 %argc, 0
+; CHECK-NEXT: bitcast i64 9209618997431186100 to i64
+; CHECK-NEXT: br i1 %tobool
+
+; Function Attrs: norecurse
+define i32 @main(i32 %argc, i8** nocapture readnone %argv) local_unnamed_addr #0 personality i8* bitcast (i32 (...)* @__CxxFrameHandler3 to i8*) {
+  %call = tail call i64 @fn(i64 0)
+  %call1 = tail call i64 @fn(i64 1)
+  %tobool = icmp eq i32 %argc, 0
+  br i1 %tobool, label %2, label %1
+
+; <label>:1:                                      ; preds = %0
+  %call2 = invoke i64 @fn(i64 %call)
+          to label %6 unwind label %catch.dispatch
+
+; <label>:2:                                      ; preds = %0
+  %call3 = invoke i64 @fn(i64 %call1)
+          to label %6 unwind label %catch.dispatch
+
+catch.dispatch:                                   ; preds = %2, %1
+  %z.0 = phi i64 [ %call, %1 ], [ %call1, %2 ]
+  %3 = catchswitch within none [label %4] unwind to caller
+
+; <label>:4:                                      ; preds = %catch.dispatch
+  %5 = catchpad within %3 [i8* null, i32 64, i8* null]
+  br i1 %tobool, label %then, label %else
+
+then:
+  %call4 = tail call i64 @fn(i64 %z.0) [ "funclet"(token %5) ]
+  %add = add i64 %call4, 9209618997431186100
+  br label %endif
+
+else:
+  %call5 = tail call i64 @fn(i64 0) [ "funclet"(token %5) ]
+  %add6 = add i64 %call5, 9209618997431186100
+  br label %endif
+
+endif:
+  %v = phi i64 [ %add, %then ], [ %add6, %else ]
+  %call7 = tail call i64 @fn(i64 %v) [ "funclet"(token %5) ]
+  %call8 = tail call i64 @fn(i64 %call7) [ "funclet"(token %5) ]
+  catchret from %5 to label %6
+
+; <label>:6:                                      ; preds = %1, %2, %4
+  ret i32 0
+}
+
+declare i64 @fn(i64) local_unnamed_addr #1
+
+declare i32 @__CxxFrameHandler3(...)
+
+attributes #0 = { norecurse "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "stack-protector-buffer-size"="8" "target-features"="+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/ConstantMerge/dont-merge.ll b/test/Transforms/ConstantMerge/dont-merge.ll
index e5337dff27dfbcffcd2f7a36979c2557df44ffca..21e390785df561597eeb82e99f6d32aa6cb424f2 100644
--- a/test/Transforms/ConstantMerge/dont-merge.ll
+++ b/test/Transforms/ConstantMerge/dont-merge.ll
@@ -42,3 +42,41 @@ define void @test3() {
   call void asm sideeffect "T3A, T3B",""() ; invisible use of T3A and T3B
   ret void
 }
+
+; Don't merge constants with !type annotations.
+
+@T4A1 = internal constant i32 2, !type !0
+@T4A2 = internal unnamed_addr constant i32 2, !type !1
+
+@T4B1 = internal constant i32 3, !type !0
+@T4B2 = internal unnamed_addr constant i32 3, !type !0
+
+@T4C1 = internal constant i32 4, !type !0
+@T4C2 = unnamed_addr constant i32 4
+
+@T4D1 = unnamed_addr constant i32 5, !type !0
+@T4D2 = internal constant i32 5
+
+!0 = !{i64 0, !"typeinfo name for A"}
+!1 = !{i64 0, !"typeinfo name for B"}
+
+; CHECK: @T4A1
+; CHECK: @T4A2
+; CHECK: @T4B1
+; CHECK: @T4B2
+; CHECK: @T4C1
+; CHECK: @T4C2
+; CHECK: @T4D1
+; CHECK: @T4D2
+
+define void @test4(i32** %P1, i32** %P2, i32** %P3, i32** %P4, i32** %P5, i32** %P6, i32** %P7, i32** %P8) {
+        store i32* @T4A1, i32** %P1
+        store i32* @T4A2, i32** %P2
+        store i32* @T4B1, i32** %P3
+        store i32* @T4B2, i32** %P4
+        store i32* @T4C1, i32** %P5
+        store i32* @T4C2, i32** %P6
+        store i32* @T4D1, i32** %P7
+        store i32* @T4D2, i32** %P8
+        ret void
+}
diff --git a/test/Transforms/ConstantMerge/merge-dbg.ll b/test/Transforms/ConstantMerge/merge-dbg.ll
new file mode 100644
index 0000000000000000000000000000000000000000..bc33248514e0b9840e247d5d11a53216deb7a5e5
--- /dev/null
+++ b/test/Transforms/ConstantMerge/merge-dbg.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -constmerge -S | FileCheck %s
+
+; CHECK: = constant i32 1, !dbg [[A:![0-9]+]], !dbg [[B:![0-9]+]]
+@a = internal constant i32 1, !dbg !0
+@b = unnamed_addr constant i32 1, !dbg !9
+
+define void @test1(i32** %P1, i32** %P2) {
+  store i32* @a, i32** %P1
+  store i32* @b, i32** %P2
+  ret void
+}
+
+!llvm.dbg.cu = !{!2}
+!llvm.module.flags = !{!7, !8}
+
+; CHECK: [[A]] = !DIGlobalVariableExpression(var: [[VA:![0-9]+]])
+; CHECK: [[VA]] = distinct !DIGlobalVariable(name: "y"
+; CHECK: [[B]] = !DIGlobalVariableExpression(var: [[VB:![0-9]+]])
+; CHECK: [[VB]] = distinct !DIGlobalVariable(name: "x"
+
+!0 = !DIGlobalVariableExpression(var: !1)
+!1 = distinct !DIGlobalVariable(name: "x", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true)
+!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, producer: "clang version 5.0.0 (trunk 297227) (llvm/trunk 297234)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5)
+!3 = !DIFile(filename: "1.cc", directory: "/build")
+!4 = !{}
+!5 = !{!0}
+!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 2, !"Debug Info Version", i32 3}
+
+!9 = !DIGlobalVariableExpression(var: !10)
+!10 = distinct !DIGlobalVariable(name: "y", scope: !2, file: !3, line: 1, type: !6, isLocal: false, isDefinition: true)
diff --git a/test/Transforms/Coroutines/ArgAddr.ll b/test/Transforms/Coroutines/ArgAddr.ll
index 4bedb510cd9ee551e45786ce1479d43566d8eadb..5d0fbd781be96d38bb6f3f1e2782082ea5cd6f38 100644
--- a/test/Transforms/Coroutines/ArgAddr.ll
+++ b/test/Transforms/Coroutines/ArgAddr.ll
@@ -32,7 +32,7 @@ coro_Cleanup:
   br label %coro_Suspend
 
 coro_Suspend:
-  call void @llvm.coro.end(i8* null, i1 false)
+  call i1 @llvm.coro.end(i8* null, i1 false)
   ret i8* %1
 }
 
@@ -61,7 +61,7 @@ declare i32 @llvm.coro.size.i32()
 declare i8* @llvm.coro.begin(token, i8*)
 declare i8 @llvm.coro.suspend(token, i1)
 declare i8* @llvm.coro.free(token, i8*)
-declare void @llvm.coro.end(i8*, i1)
+declare i1 @llvm.coro.end(i8*, i1)
 
 declare void @llvm.coro.resume(i8*)
 declare void @llvm.coro.destroy(i8*)
diff --git a/test/Transforms/Coroutines/coro-frame.ll b/test/Transforms/Coroutines/coro-frame.ll
index a6b749e7a91c161d947bb805ff9cb9b1a433ba12..001012fcd0c98fe3302fb20e039210a1c16885e3 100644
--- a/test/Transforms/Coroutines/coro-frame.ll
+++ b/test/Transforms/Coroutines/coro-frame.ll
@@ -22,7 +22,7 @@ cleanup:
   call void @free(i8* %mem)
   br label %suspend
 suspend:
-  call void @llvm.coro.end(i8* %hdl, i1 0)
+  call i1 @llvm.coro.end(i8* %hdl, i1 0)
   ret i8* %hdl
 pad:
   %tok = cleanuppad within none []
@@ -54,7 +54,7 @@ declare void @llvm.coro.destroy(i8*)
 declare token @llvm.coro.id(i32, i8*, i8*, i8*)
 declare i1 @llvm.coro.alloc(token)
 declare i8* @llvm.coro.begin(token, i8*)
-declare void @llvm.coro.end(i8*, i1)
+declare i1 @llvm.coro.end(i8*, i1)
 
 declare noalias i8* @malloc(i32)
 declare double @print(double)
diff --git a/test/Transforms/Coroutines/coro-spill-after-phi.ll b/test/Transforms/Coroutines/coro-spill-after-phi.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3c7e050c09e951524fd9549ba61571fd3cdbe850
--- /dev/null
+++ b/test/Transforms/Coroutines/coro-spill-after-phi.ll
@@ -0,0 +1,60 @@
+; Verifies that we insert spills of PHI instructions _after_ all PHI nodes
+; RUN: opt < %s -coro-split -S | FileCheck %s
+
+define i8* @f(i1 %n) "coroutine.presplit"="1" {
+entry:
+  %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+  %size = call i32 @llvm.coro.size.i32()
+  %alloc = call i8* @malloc(i32 %size)
+  %hdl = call i8* @llvm.coro.begin(token %id, i8* %alloc)
+  br i1 %n, label %begin, label %alt
+alt:
+  br label %begin
+
+begin:
+  %phi1 = phi i32 [ 0, %entry ], [ 2, %alt ]
+  %phi2 = phi i32 [ 1, %entry ], [ 3, %alt ]
+
+  %sp1 = call i8 @llvm.coro.suspend(token none, i1 false)
+  switch i8 %sp1, label %suspend [i8 0, label %resume
+                                  i8 1, label %cleanup]
+resume:
+  call i32 @print(i32 %phi1)
+  call i32 @print(i32 %phi2)
+  br label %cleanup
+
+cleanup:
+  %mem = call i8* @llvm.coro.free(token %id, i8* %hdl)
+  call void @free(i8* %mem)
+  br label %suspend
+suspend:
+  call i1 @llvm.coro.end(i8* %hdl, i1 0)
+  ret i8* %hdl
+}
+
+; Verifies that both phis are stored correctly in the coroutine frame
+; CHECK: %f.Frame = type { void (%f.Frame*)*, void (%f.Frame*)*, i1, i1, i32, i32 }
+; CHECK-LABEL: @f(
+; CHECK: store void (%f.Frame*)* @f.destroy, void (%f.Frame*)** %destroy.addr
+; CHECK: %phi1 = select i1 %n, i32 0, i32 2
+; CHECK: %phi2 = select i1 %n, i32 1, i32 3
+; CHECK: %phi2.spill.addr = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 5
+; CHECK: store i32 %phi2, i32* %phi2.spill.addr
+; CHECK: %phi1.spill.addr = getelementptr inbounds %f.Frame, %f.Frame* %FramePtr, i32 0, i32 4
+; CHECK: store i32 %phi1, i32* %phi1.spill.addr
+; CHECK: ret i8* %hdl
+
+declare i8* @llvm.coro.free(token, i8*)
+declare i32 @llvm.coro.size.i32()
+declare i8  @llvm.coro.suspend(token, i1)
+declare void @llvm.coro.resume(i8*)
+declare void @llvm.coro.destroy(i8*)
+
+declare token @llvm.coro.id(i32, i8*, i8*, i8*)
+declare i1 @llvm.coro.alloc(token)
+declare i8* @llvm.coro.begin(token, i8*)
+declare i1 @llvm.coro.end(i8*, i1)
+
+declare noalias i8* @malloc(i32)
+declare i32 @print(i32)
+declare void @free(i8*)
diff --git a/test/Transforms/Coroutines/coro-split-00.ll b/test/Transforms/Coroutines/coro-split-00.ll
index 12aec27b2fe62f65e6dc5afe2291802166f84413..0461b7dddb6c52a97f8f0179a3718a5b9c6ca55f 100644
--- a/test/Transforms/Coroutines/coro-split-00.ll
+++ b/test/Transforms/Coroutines/coro-split-00.ll
@@ -28,7 +28,7 @@ cleanup:
   call void @free(i8* %mem)
   br label %suspend
 suspend:
-  call void @llvm.coro.end(i8* %hdl, i1 0)  
+  call i1 @llvm.coro.end(i8* %hdl, i1 0)  
   ret i8* %hdl
 }
 
@@ -72,7 +72,7 @@ declare void @llvm.coro.destroy(i8*)
 declare token @llvm.coro.id(i32, i8*, i8*, i8*)
 declare i1 @llvm.coro.alloc(token)
 declare i8* @llvm.coro.begin(token, i8*)
-declare void @llvm.coro.end(i8*, i1) 
+declare i1 @llvm.coro.end(i8*, i1) 
 
 declare noalias i8* @malloc(i32)
 declare void @print(i32)
diff --git a/test/Transforms/Coroutines/coro-split-01.ll b/test/Transforms/Coroutines/coro-split-01.ll
index 2b5801f7ddd115efd703ae6d14585dc5b2872437..cff2e9ca6f0a48dadc2f0dd9c07456dda06c1fd1 100644
--- a/test/Transforms/Coroutines/coro-split-01.ll
+++ b/test/Transforms/Coroutines/coro-split-01.ll
@@ -26,7 +26,7 @@ cleanup:
   call void @free(i8* %mem)
   br label %suspend
 suspend:
-  call void @llvm.coro.end(i8* %hdl, i1 0)  
+  call i1 @llvm.coro.end(i8* %hdl, i1 0)  
   ret i8* %hdl
 }
 define i32 @main() {
@@ -49,7 +49,7 @@ declare void @llvm.coro.destroy(i8*)
 declare token @llvm.coro.id(i32, i8*, i8*, i8*)
 declare i1 @llvm.coro.alloc(token)
 declare i8* @llvm.coro.begin(token, i8*)
-declare void @llvm.coro.end(i8*, i1) 
+declare i1 @llvm.coro.end(i8*, i1) 
 
 declare noalias i8* @malloc(i32)
 declare void @print(i32)
diff --git a/test/Transforms/Coroutines/coro-split-02.ll b/test/Transforms/Coroutines/coro-split-02.ll
index 2326f77f1987106749dd616e3209b5300157e570..953c25088652bc1a18337df19945654709440e02 100644
--- a/test/Transforms/Coroutines/coro-split-02.ll
+++ b/test/Transforms/Coroutines/coro-split-02.ll
@@ -28,7 +28,7 @@ await.ready:
   call void @print(i32 %val)
   br label %exit  
 exit:
-  call void @llvm.coro.end(i8* null, i1 false)
+  call i1 @llvm.coro.end(i8* null, i1 false)
   ret void
 }
 
@@ -50,5 +50,5 @@ declare i8* @llvm.coro.frame() #5
 declare i8 @llvm.coro.suspend(token, i1) #3
 declare void @"\01??3@YAXPEAX@Z"(i8*) local_unnamed_addr #10
 declare i8* @llvm.coro.free(token, i8* nocapture readonly) #2
-declare void @llvm.coro.end(i8*, i1) #3
+declare i1 @llvm.coro.end(i8*, i1) #3
 
diff --git a/test/Transforms/Coroutines/coro-split-dbg.ll b/test/Transforms/Coroutines/coro-split-dbg.ll
index 293622c40ebd402ec0c34a9632bbd6198e7749b8..80f706879e5520122008fb1cb32692edd80c2cc8 100644
--- a/test/Transforms/Coroutines/coro-split-dbg.ll
+++ b/test/Transforms/Coroutines/coro-split-dbg.ll
@@ -38,12 +38,12 @@ coro_Cleanup:                                     ; preds = %for.cond
   br label %coro_Suspend, !dbg !36
 
 coro_Suspend:                                     ; preds = %for.cond, %if.then, %coro_Cleanup
-  tail call void @llvm.coro.end(i8* null, i1 false) #9, !dbg !38
+  tail call i1 @llvm.coro.end(i8* null, i1 false) #9, !dbg !38
   ret i8* %2, !dbg !39
 }
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #4
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #4
 
 ; Function Attrs: argmemonly nounwind readonly
 declare token @llvm.coro.id(i32, i8* readnone, i8* nocapture readonly, i8*) #5
@@ -54,10 +54,10 @@ declare i64 @llvm.coro.size.i64() #1
 declare i8* @llvm.coro.begin(token, i8* writeonly) #7
 declare token @llvm.coro.save(i8*) #7
 declare i8 @llvm.coro.suspend(token, i1) #7
-declare void @llvm.lifetime.end(i64, i8* nocapture) #4
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #4
 declare i8* @llvm.coro.free(token, i8* nocapture readonly) #5
 declare void @free(i8* nocapture) local_unnamed_addr #6
-declare void @llvm.coro.end(i8*, i1) #7
+declare i1 @llvm.coro.end(i8*, i1) #7
 declare i8* @llvm.coro.subfn.addr(i8* nocapture readonly, i8) #5
 
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
diff --git a/test/Transforms/Coroutines/coro-split-eh.ll b/test/Transforms/Coroutines/coro-split-eh.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7fc97e261e8107df14c6bbdfaa652f8106ba29e0
--- /dev/null
+++ b/test/Transforms/Coroutines/coro-split-eh.ll
@@ -0,0 +1,145 @@
+; Tests that coro-split removes cleanup code after coro.end in resume functions
+; and retains it in the start function.
+; RUN: opt < %s -coro-split -S | FileCheck %s
+
+define i8* @f(i1 %val) "coroutine.presplit"="1" personality i32 3 {
+entry:
+  %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+  %hdl = call i8* @llvm.coro.begin(token %id, i8* null)
+  call void @print(i32 0)
+  br i1 %val, label %resume, label %susp
+
+susp:  
+  %0 = call i8 @llvm.coro.suspend(token none, i1 false)
+  switch i8 %0, label %suspend [i8 0, label %resume 
+                                i8 1, label %suspend]
+resume:
+  invoke void @print(i32 1) to label %suspend unwind label %lpad
+
+suspend:
+  call i1 @llvm.coro.end(i8* %hdl, i1 0)  
+  call void @print(i32 0) ; should not be present in f.resume
+  ret i8* %hdl
+
+lpad:
+  %lpval = landingpad { i8*, i32 }
+     cleanup
+
+  call void @print(i32 2)
+  %need.resume = call i1 @llvm.coro.end(i8* null, i1 true)
+  br i1 %need.resume, label %eh.resume, label %cleanup.cont
+
+cleanup.cont:
+  call void @print(i32 3) ; should not be present in f.resume
+  br label %eh.resume
+
+eh.resume:
+  resume { i8*, i32 } %lpval
+}
+
+; Verify that the start function contains both print calls: the one before and the one after coro.end
+; CHECK-LABEL: define i8* @f(
+; CHECK: invoke void @print(i32 1)
+; CHECK:   to label %AfterCoroEnd unwind label %lpad
+
+; CHECK: AfterCoroEnd:
+; CHECK:   call void @print(i32 0)
+; CHECK:   ret i8* %hdl
+
+; CHECK:         lpad:
+; CHECK-NEXT:      %lpval = landingpad { i8*, i32 }
+; CHECK-NEXT:         cleanup
+; CHECK-NEXT:      call void @print(i32 2)
+; CHECK-NEXT:      call void @print(i32 3)
+; CHECK-NEXT:      resume { i8*, i32 } %lpval
+
+define i8* @f2(i1 %val) "coroutine.presplit"="1" personality i32 4 {
+entry:
+  %id = call token @llvm.coro.id(i32 0, i8* null, i8* null, i8* null)
+  %hdl = call i8* @llvm.coro.begin(token %id, i8* null)
+  call void @print(i32 0)
+  br i1 %val, label %resume, label %susp
+
+susp:  
+  %0 = call i8 @llvm.coro.suspend(token none, i1 false)
+  switch i8 %0, label %suspend [i8 0, label %resume 
+                                i8 1, label %suspend]
+resume:
+  invoke void @print(i32 1) to label %suspend unwind label %lpad
+
+suspend:
+  call i1 @llvm.coro.end(i8* %hdl, i1 0)  
+  call void @print(i32 0) ; should not be present in f2.resume
+  ret i8* %hdl
+
+lpad:
+  %tok = cleanuppad within none []
+  call void @print(i32 2)
+  %unused = call i1 @llvm.coro.end(i8* null, i1 true) [ "funclet"(token %tok) ]
+  cleanupret from %tok unwind label %cleanup.cont
+
+cleanup.cont:
+  %tok2 = cleanuppad within none []
+  call void @print(i32 3) ; should not be present in f2.resume
+  cleanupret from %tok2 unwind to caller 
+}
+
+; Verify that the start function contains both print calls: the one before and the one after coro.end
+; CHECK-LABEL: define i8* @f2(
+; CHECK: invoke void @print(i32 1)
+; CHECK:   to label %AfterCoroEnd unwind label %lpad
+
+; CHECK: AfterCoroEnd:
+; CHECK:   call void @print(i32 0)
+; CHECK:   ret i8* %hdl
+
+; CHECK:      lpad:
+; CHECK-NEXT:   %tok = cleanuppad within none []
+; CHECK-NEXT:   call void @print(i32 2)
+; CHECK-NEXT:   call void @print(i32 3)
+; CHECK-NEXT:   cleanupret from %tok unwind to caller
+
+; VERIFY Resume Parts
+
+; Verify that the resume function contains neither of the print calls appearing after coro.end
+; CHECK-LABEL: define internal fastcc void @f.resume
+; CHECK: invoke void @print(i32 1)
+; CHECK:   to label %CoroEnd unwind label %lpad
+
+; CHECK:      CoroEnd:
+; CHECK-NEXT:   ret void
+
+; CHECK:         lpad:
+; CHECK-NEXT:      %lpval = landingpad { i8*, i32 }
+; CHECK-NEXT:         cleanup
+; CHECK-NEXT:      call void @print(i32 2)
+; CHECK-NEXT:      resume { i8*, i32 } %lpval
+
+; Verify that the resume function contains neither of the print calls appearing after coro.end
+; CHECK-LABEL: define internal fastcc void @f2.resume
+; CHECK: invoke void @print(i32 1)
+; CHECK:   to label %CoroEnd unwind label %lpad
+
+; CHECK:      CoroEnd:
+; CHECK-NEXT:   ret void
+
+; CHECK:      lpad:
+; CHECK-NEXT:   %tok = cleanuppad within none []
+; CHECK-NEXT:   call void @print(i32 2)
+; CHECK-NEXT:   cleanupret from %tok unwind to caller
+
+declare i8* @llvm.coro.free(token, i8*)
+declare i32 @llvm.coro.size.i32()
+declare i8  @llvm.coro.suspend(token, i1)
+declare void @llvm.coro.resume(i8*)
+declare void @llvm.coro.destroy(i8*)
+
+declare token @llvm.coro.id(i32, i8*, i8*, i8*)
+declare i8* @llvm.coro.alloc(token)
+declare i8* @llvm.coro.begin(token, i8*)
+declare i1 @llvm.coro.end(i8*, i1) 
+
+declare noalias i8* @malloc(i32)
+declare void @print(i32)
+declare void @free(i8*)
+
diff --git a/test/Transforms/Coroutines/ex0.ll b/test/Transforms/Coroutines/ex0.ll
index d4a9f941d838c7764e40e303101f89438f71ec01..59bebc5466490ca1744f288fcf991bcb503f3cad 100644
--- a/test/Transforms/Coroutines/ex0.ll
+++ b/test/Transforms/Coroutines/ex0.ll
@@ -24,7 +24,7 @@ cleanup:
   call void @free(i8* %mem)
   br label %suspend
 suspend:
-  call void @llvm.coro.end(i8* %hdl, i1 0)  
+  call i1 @llvm.coro.end(i8* %hdl, i1 0)  
   ret i8* %hdl
 }
 
@@ -52,7 +52,7 @@ declare void @llvm.coro.resume(i8*)
 declare void @llvm.coro.destroy(i8*)
   
 declare i8* @llvm.coro.begin(token, i8*)
-declare void @llvm.coro.end(i8*, i1) 
+declare i1 @llvm.coro.end(i8*, i1) 
 
 declare noalias i8* @malloc(i32)
 declare void @print(i32)
diff --git a/test/Transforms/Coroutines/ex1.ll b/test/Transforms/Coroutines/ex1.ll
index 86ac75b13404bb26590f40edc4f05be392012661..c2a5586fde584abf34ea0a6a48dab32de26dec8d 100644
--- a/test/Transforms/Coroutines/ex1.ll
+++ b/test/Transforms/Coroutines/ex1.ll
@@ -20,7 +20,7 @@ cleanup:
   call void @free(i8* %mem)
   br label %suspend
 suspend:
-  call void @llvm.coro.end(i8* %hdl, i1 false)
+  call i1 @llvm.coro.end(i8* %hdl, i1 false)
   ret i8* %hdl
 }
 
@@ -48,7 +48,7 @@ declare i32 @llvm.coro.size.i32()
 declare i8* @llvm.coro.begin(token, i8*)
 declare i8 @llvm.coro.suspend(token, i1)
 declare i8* @llvm.coro.free(token, i8*)
-declare void @llvm.coro.end(i8*, i1)
+declare i1 @llvm.coro.end(i8*, i1)
 
 declare void @llvm.coro.resume(i8*)
 declare void @llvm.coro.destroy(i8*)
diff --git a/test/Transforms/Coroutines/ex2.ll b/test/Transforms/Coroutines/ex2.ll
index 8681e4cecc80de495be4b23026dbec42fafcfa7e..6987d2a4c9fd756cee530bbfb9de226d07c4f0f0 100644
--- a/test/Transforms/Coroutines/ex2.ll
+++ b/test/Transforms/Coroutines/ex2.ll
@@ -29,7 +29,7 @@ dyn.free:
   call void @CustomFree(i8* %mem)
   br label %suspend
 suspend:
-  call void @llvm.coro.end(i8* %hdl, i1 false)
+  call i1 @llvm.coro.end(i8* %hdl, i1 false)
   ret i8* %hdl
 }
 
@@ -57,7 +57,7 @@ declare i32 @llvm.coro.size.i32()
 declare i8* @llvm.coro.begin(token, i8*)
 declare i8 @llvm.coro.suspend(token, i1)
 declare i8* @llvm.coro.free(token, i8*)
-declare void @llvm.coro.end(i8*, i1)
+declare i1 @llvm.coro.end(i8*, i1)
 
 declare void @llvm.coro.resume(i8*)
 declare void @llvm.coro.destroy(i8*)
diff --git a/test/Transforms/Coroutines/ex3.ll b/test/Transforms/Coroutines/ex3.ll
index 13289c8e974aceb20d3876975f86afeab0706d00..8ff4d718230f5248b1470154fa1a838c9270d12b 100644
--- a/test/Transforms/Coroutines/ex3.ll
+++ b/test/Transforms/Coroutines/ex3.ll
@@ -26,7 +26,7 @@ cleanup:
   call void @free(i8* %mem)
   br label %suspend
 suspend:
-  call void @llvm.coro.end(i8* %hdl, i1 false)
+  call i1 @llvm.coro.end(i8* %hdl, i1 false)
   ret i8* %hdl
 }
 
@@ -54,7 +54,7 @@ declare i32 @llvm.coro.size.i32()
 declare i8* @llvm.coro.begin(token, i8*)
 declare i8 @llvm.coro.suspend(token, i1)
 declare i8* @llvm.coro.free(token, i8*)
-declare void @llvm.coro.end(i8*, i1)
+declare i1 @llvm.coro.end(i8*, i1)
 
 declare void @llvm.coro.resume(i8*)
 declare void @llvm.coro.destroy(i8*)
diff --git a/test/Transforms/Coroutines/ex4.ll b/test/Transforms/Coroutines/ex4.ll
index ce896ad7ee413a26c6908c49ebb21ccfa975bb28..4992052acd2edbba32ceade734f7cd1a49cc43aa 100644
--- a/test/Transforms/Coroutines/ex4.ll
+++ b/test/Transforms/Coroutines/ex4.ll
@@ -28,7 +28,7 @@ cleanup:
   call void @free(i8* %mem)
   br label %suspend
 suspend:
-  call void @llvm.coro.end(i8* %hdl, i1 false)
+  call i1 @llvm.coro.end(i8* %hdl, i1 false)
   ret i8* %hdl
 }
 
@@ -65,7 +65,7 @@ declare i32 @llvm.coro.size.i32()
 declare i8* @llvm.coro.begin(token, i8*)
 declare i8 @llvm.coro.suspend(token, i1)
 declare i8* @llvm.coro.free(token, i8*)
-declare void @llvm.coro.end(i8*, i1)
+declare i1 @llvm.coro.end(i8*, i1)
 
 declare void @llvm.coro.resume(i8*)
 declare void @llvm.coro.destroy(i8*)
diff --git a/test/Transforms/Coroutines/ex5.ll b/test/Transforms/Coroutines/ex5.ll
index c9772825f25074cfb32c3fd614b07dda7f0d6aaf..34767584c81163c2914f7940bc8e80717ed1d50c 100644
--- a/test/Transforms/Coroutines/ex5.ll
+++ b/test/Transforms/Coroutines/ex5.ll
@@ -31,7 +31,7 @@ cleanup:
   call void @free(i8* %mem)
   br label %suspend
 suspend:
-  call void @llvm.coro.end(i8* %hdl, i1 false)
+  call i1 @llvm.coro.end(i8* %hdl, i1 false)
   ret i8* %hdl
 }
 
@@ -46,7 +46,7 @@ declare i8* @llvm.coro.begin(token, i8*)
 declare token @llvm.coro.save(i8*)
 declare i8 @llvm.coro.suspend(token, i1)
 declare i8* @llvm.coro.free(token, i8*)
-declare void @llvm.coro.end(i8*, i1)
+declare i1 @llvm.coro.end(i8*, i1)
 
 ; CHECK-LABEL: @main
 define i32 @main() {
diff --git a/test/Transforms/Coroutines/no-suspend.ll b/test/Transforms/Coroutines/no-suspend.ll
index d219495de6cc0783039fd0da14a5d3a45d65eb6d..804b38cc1abeaaaa3b884445da1b0bcbb5b9ab3e 100644
--- a/test/Transforms/Coroutines/no-suspend.ll
+++ b/test/Transforms/Coroutines/no-suspend.ll
@@ -32,7 +32,7 @@ dyn.free:
   call void @free(i8* %mem)
   br label %suspend
 suspend:
-  call void @llvm.coro.end(i8* %hdl, i1 false)
+  call i1 @llvm.coro.end(i8* %hdl, i1 false)
   ret void
 }
 
@@ -77,7 +77,7 @@ cleanup:
   call void @free(i8* %mem)
   br label %suspend
 suspend:
-  call void @llvm.coro.end(i8* %hdl, i1 false)
+  call i1 @llvm.coro.end(i8* %hdl, i1 false)
   ret void
 }
 
@@ -122,7 +122,7 @@ cleanup:
   call void @free(i8* %mem)
   br label %suspend
 suspend:
-  call void @llvm.coro.end(i8* %hdl, i1 false)
+  call i1 @llvm.coro.end(i8* %hdl, i1 false)
   ret void
 }
 
@@ -167,7 +167,7 @@ cleanup:
   call void @free(i8* %mem)
   br label %suspend
 suspend:
-  call void @llvm.coro.end(i8* %hdl, i1 false)
+  call i1 @llvm.coro.end(i8* %hdl, i1 false)
   ret void
 }
 
@@ -183,7 +183,7 @@ declare i8* @llvm.coro.begin(token, i8*)
 declare token @llvm.coro.save(i8* %hdl)
 declare i8 @llvm.coro.suspend(token, i1)
 declare i8* @llvm.coro.free(token, i8*)
-declare void @llvm.coro.end(i8*, i1)
+declare i1 @llvm.coro.end(i8*, i1)
 
 declare void @llvm.coro.resume(i8*)
 declare void @llvm.coro.destroy(i8*)
diff --git a/test/Transforms/Coroutines/phi-coro-end.ll b/test/Transforms/Coroutines/phi-coro-end.ll
index e2529412e72c6b6ba01a9b9b65fd5e444d03a276..f99990cf33bc200cf68759bfc70604bda4d47dfa 100644
--- a/test/Transforms/Coroutines/phi-coro-end.ll
+++ b/test/Transforms/Coroutines/phi-coro-end.ll
@@ -17,7 +17,7 @@ cleanup:
 
 suspend:
   %r = phi i32 [%n, %entry], [1, %cleanup]
-  call void @llvm.coro.end(i8* %hdl, i1 false)  
+  call i1 @llvm.coro.end(i8* %hdl, i1 false)  
   call void @print(i32 %r)
   ret i8* %hdl
 }
@@ -41,7 +41,7 @@ declare void @llvm.coro.destroy(i8*)
   
 declare token @llvm.coro.id(i32, i8*, i8*, i8*)
 declare i8* @llvm.coro.begin(token, i8*)
-declare void @llvm.coro.end(i8*, i1) 
+declare i1 @llvm.coro.end(i8*, i1) 
 
 declare noalias i8* @malloc(i32)
 declare void @print(i32)
diff --git a/test/Transforms/Coroutines/restart-trigger.ll b/test/Transforms/Coroutines/restart-trigger.ll
index 2240f8fa63230d31f699b2245596c27267151550..f7f203f2fb5cf1c2e4a565a6b274354af9ca8433 100644
--- a/test/Transforms/Coroutines/restart-trigger.ll
+++ b/test/Transforms/Coroutines/restart-trigger.ll
@@ -25,7 +25,7 @@ cleanup:
   call void @free(i8* %mem)
   br label %suspend
 suspend:
-  call void @llvm.coro.end(i8* %hdl, i1 0)
+  call i1 @llvm.coro.end(i8* %hdl, i1 0)
   ret void  
 }
 
@@ -36,7 +36,7 @@ declare i32 @llvm.coro.size.i32()
 declare i8  @llvm.coro.suspend(token, i1)
 declare void @llvm.coro.resume(i8*)
 declare void @llvm.coro.destroy(i8*)
-declare void @llvm.coro.end(i8*, i1) 
+declare i1 @llvm.coro.end(i8*, i1) 
 
 declare noalias i8* @malloc(i32)
 declare void @print(i32)
diff --git a/test/Transforms/CorrelatedValuePropagation/add.ll b/test/Transforms/CorrelatedValuePropagation/add.ll
index 4b436ff9a4014d5d08e098de481ef056009dd326..0ba521c894e2f800b8bd4e3e0e5e917b173d07f5 100644
--- a/test/Transforms/CorrelatedValuePropagation/add.ll
+++ b/test/Transforms/CorrelatedValuePropagation/add.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -correlated-propagation -S | FileCheck %s
+; RUN: opt < %s -correlated-propagation -cvp-dont-process-adds=false -S | FileCheck %s
 
 ; CHECK-LABEL: @test0(
 define void @test0(i32 %a) {
diff --git a/test/Transforms/CorrelatedValuePropagation/alloca.ll b/test/Transforms/CorrelatedValuePropagation/alloca.ll
index 0a6ba675a477f7566cfa50696c62a7dca68740bc..37b27b29445c3807dfa3153e8384067603481da0 100644
--- a/test/Transforms/CorrelatedValuePropagation/alloca.ll
+++ b/test/Transforms/CorrelatedValuePropagation/alloca.ll
@@ -13,14 +13,14 @@ target triple = "x86_64-unknown-linux-gnu"
 @.str = private unnamed_addr constant [8 x i8] c"a = %l\0A\00", align 1
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
 
 declare void @hoo(i64*)
 
 declare i32 @printf(i8* nocapture readonly, ...)
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 
 define void @goo(i32 %N, i64* %b) {
 entry:
@@ -35,12 +35,12 @@ for.cond:                                         ; preds = %for.body, %entry
   br i1 %cmp, label %for.body, label %for.end
 
 for.body:                                         ; preds = %for.cond
-  call void @llvm.lifetime.start(i64 8, i8* %tmp)
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* %tmp)
   call void @hoo(i64* %a.i)
   call void @hoo(i64* %c)
   %tmp1 = load volatile i64, i64* %a.i, align 8
   %call.i = call i32 (i8*, ...) @printf(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i64 0, i64 0), i64 %tmp1)
-  call void @llvm.lifetime.end(i64 8, i8* %tmp)
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* %tmp)
   %inc = add nsw i32 %i.0, 1
   br label %for.cond
 
diff --git a/test/Transforms/CorrelatedValuePropagation/basic.ll b/test/Transforms/CorrelatedValuePropagation/basic.ll
index 9836c7f80778bf01a0baf57149bdc93c426f509c..14b9a1999cc3ceab7308ed8441a6f2b09546829d 100644
--- a/test/Transforms/CorrelatedValuePropagation/basic.ll
+++ b/test/Transforms/CorrelatedValuePropagation/basic.ll
@@ -115,9 +115,9 @@ negative:
     i32 1, label %out
 ; CHECK-NOT: i32 1
     i32 -1, label %next
-; CHECK: i32 -1, label %next
+; CHECK-DAG: i32 -1, label %next
     i32 -2, label %next
-; CHECK: i32 -2, label %next
+; CHECK-DAG: i32 -2, label %next
     i32 2, label %out
 ; CHECK-NOT: i32 2
     i32 3, label %out
diff --git a/test/Transforms/DeadArgElim/call_profile.ll b/test/Transforms/DeadArgElim/call_profile.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6acb6f000dbe00f149c871824205013709b4982e
--- /dev/null
+++ b/test/Transforms/DeadArgElim/call_profile.ll
@@ -0,0 +1,22 @@
+; RUN: opt -deadargelim -S < %s | FileCheck %s
+
+; Checks that !prof metadata is correct in deadargelim.
+
+define void @caller() #0 {
+; CHECK: call void @test_vararg(), !prof ![[PROF:[0-9]]]
+; CHECK: call void @test(), !prof ![[PROF]]
+  call void (i32, ...) @test_vararg(i32 1), !prof !0
+  call void @test(i32 1), !prof !0
+  ret void
+}
+
+define internal void @test_vararg(i32, ...) #1 {
+  ret void
+}
+
+define internal void @test(i32 %a) #1 {
+  ret void
+}
+
+; CHECK:![[PROF]] = !{!"branch_weights", i32 30}
+!0 = !{!"branch_weights", i32 30}
diff --git a/test/Transforms/DeadStoreElimination/dominate.ll b/test/Transforms/DeadStoreElimination/dominate.ll
index 638992bae729d2d4b67e2a822b1e87dfebc4a24a..24dd65e07bbc2d7a36b68393b7146893ed72cd32 100644
--- a/test/Transforms/DeadStoreElimination/dominate.ll
+++ b/test/Transforms/DeadStoreElimination/dominate.ll
@@ -9,12 +9,12 @@ bb1:
   br label %bb3
 
 bb2:
-  call void @llvm.lifetime.end(i64 -1, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %0)
   br label %bb3
 
 bb3:
   call void @bar()
-  call void @llvm.lifetime.end(i64 -1, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %0)
   br label %bb4
 
 bb4:
@@ -22,4 +22,4 @@ bb4:
 
 }
 
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
diff --git a/test/Transforms/DeadStoreElimination/lifetime.ll b/test/Transforms/DeadStoreElimination/lifetime.ll
index 305c916dc02b9c66884432a5500fb2007b67451c..97f199b5e0f6aebc49a1605aedcdba5403506a7f 100644
--- a/test/Transforms/DeadStoreElimination/lifetime.ll
+++ b/test/Transforms/DeadStoreElimination/lifetime.ll
@@ -2,8 +2,8 @@
 
 target datalayout = "E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128"
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
 declare void @llvm.memset.p0i8.i8(i8* nocapture, i8, i8, i32, i1) nounwind
 
 define void @test1() {
@@ -11,7 +11,7 @@ define void @test1() {
   %A = alloca i8
 
   store i8 0, i8* %A  ;; Written to by memset
-  call void @llvm.lifetime.end(i64 1, i8* %A)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %A)
 ; CHECK: lifetime.end
 
   call void @llvm.memset.p0i8.i8(i8* %A, i8 0, i8 -1, i32 0, i1 false)
@@ -25,11 +25,11 @@ define void @test2(i32* %P) {
 ; CHECK: test2
   %Q = getelementptr i32, i32* %P, i32 1
   %R = bitcast i32* %Q to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %R)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %R)
 ; CHECK: lifetime.start
   store i32 0, i32* %Q  ;; This store is dead.
 ; CHECK-NOT: store
-  call void @llvm.lifetime.end(i64 4, i8* %R)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %R)
 ; CHECK: lifetime.end
   ret void
 }
diff --git a/test/Transforms/DeadStoreElimination/operand-bundles.ll b/test/Transforms/DeadStoreElimination/operand-bundles.ll
index d71b9673ed1d7e65f97f793d4b337f40dbb96d5f..784b2e8e55f79b9a64cfd805d2a8329bac755a1b 100644
--- a/test/Transforms/DeadStoreElimination/operand-bundles.ll
+++ b/test/Transforms/DeadStoreElimination/operand-bundles.ll
@@ -41,3 +41,15 @@ define void @test3() {
   store i64 0, i64* %s
   ret void
 }
+
+declare noalias i8* @calloc(i64, i64)
+
+define void @test4() {
+; CHECK-LABEL: @test4
+  %local_obj = call i8* @calloc(i64 1, i64 4)
+  call void @foo() ["deopt" (i8* %local_obj)]
+  store i8 0, i8* %local_obj, align 4
+  ; CHECK-NOT: store i8 0, i8* %local_obj, align 4
+  call void @bar(i8* nocapture %local_obj)
+  ret void
+}
diff --git a/test/Transforms/FunctionAttrs/nonnull.ll b/test/Transforms/FunctionAttrs/nonnull.ll
index 1fb64b7434abac58eabf21f257648af4fac496de..4a1ff14b2041aa6d61b527ff9970dec7203cc97a 100644
--- a/test/Transforms/FunctionAttrs/nonnull.ll
+++ b/test/Transforms/FunctionAttrs/nonnull.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -functionattrs %s | FileCheck %s
+; RUN: opt -S -functionattrs -enable-nonnull-arg-prop %s | FileCheck %s
 declare nonnull i8* @ret_nonnull()
 
 ; Return a pointer trivially nonnull (call return attribute)
@@ -71,4 +71,148 @@ exit:
   ret i8* %phi
 }
 
+; Test propagation of nonnull callsite args back to caller.
+
+declare void @use1(i8* %x)
+declare void @use2(i8* %x, i8* %y);
+declare void @use3(i8* %x, i8* %y, i8* %z);
+
+declare void @use1nonnull(i8* nonnull %x);
+declare void @use2nonnull(i8* nonnull %x, i8* nonnull %y);
+declare void @use3nonnull(i8* nonnull %x, i8* nonnull %y, i8* nonnull %z);
+
+declare i8 @use1safecall(i8* %x) readonly nounwind ; readonly+nounwind guarantees that execution continues to successor
+
+; Can't extend non-null to parent for any argument because the 2nd call is not guaranteed to execute.
+
+define void @parent1(i8* %a, i8* %b, i8* %c) {
+; CHECK-LABEL: @parent1(i8* %a, i8* %b, i8* %c)
+; CHECK-NEXT:    call void @use3(i8* %c, i8* %a, i8* %b)
+; CHECK-NEXT:    call void @use3nonnull(i8* %b, i8* %c, i8* %a)
+; CHECK-NEXT:    ret void
+;
+  call void @use3(i8* %c, i8* %a, i8* %b)
+  call void @use3nonnull(i8* %b, i8* %c, i8* %a)
+  ret void
+}
+
+; Extend non-null to parent for all arguments.
+
+define void @parent2(i8* %a, i8* %b, i8* %c) {
+; CHECK-LABEL: @parent2(i8* nonnull %a, i8* nonnull %b, i8* nonnull %c)
+; CHECK-NEXT:    call void @use3nonnull(i8* %b, i8* %c, i8* %a)
+; CHECK-NEXT:    call void @use3(i8* %c, i8* %a, i8* %b)
+; CHECK-NEXT:    ret void
+;
+  call void @use3nonnull(i8* %b, i8* %c, i8* %a)
+  call void @use3(i8* %c, i8* %a, i8* %b)
+  ret void
+}
+
+; Extend non-null to parent for 1st argument.
+
+define void @parent3(i8* %a, i8* %b, i8* %c) {
+; CHECK-LABEL: @parent3(i8* nonnull %a, i8* %b, i8* %c)
+; CHECK-NEXT:    call void @use1nonnull(i8* %a)
+; CHECK-NEXT:    call void @use3(i8* %c, i8* %b, i8* %a)
+; CHECK-NEXT:    ret void
+;
+  call void @use1nonnull(i8* %a)
+  call void @use3(i8* %c, i8* %b, i8* %a)
+  ret void
+}
+
+; Extend non-null to parent for last 2 arguments.
+
+define void @parent4(i8* %a, i8* %b, i8* %c) {
+; CHECK-LABEL: @parent4(i8* %a, i8* nonnull %b, i8* nonnull %c)
+; CHECK-NEXT:    call void @use2nonnull(i8* %c, i8* %b)
+; CHECK-NEXT:    call void @use2(i8* %a, i8* %c)
+; CHECK-NEXT:    call void @use1(i8* %b)
+; CHECK-NEXT:    ret void
+;
+  call void @use2nonnull(i8* %c, i8* %b)
+  call void @use2(i8* %a, i8* %c)
+  call void @use1(i8* %b)
+  ret void
+}
+
+; The callsite must execute in order for the attribute to transfer to the parent.
+; It appears benign to extend non-null to the parent in this case, but we can't do that
+; because it would propagate incorrect information to its callers.
+
+define void @parent5(i8* %a, i1 %a_is_notnull) {
+; CHECK-LABEL: @parent5(i8* %a, i1 %a_is_notnull)
+; CHECK-NEXT:    br i1 %a_is_notnull, label %t, label %f
+; CHECK:       t:
+; CHECK-NEXT:    call void @use1nonnull(i8* %a)
+; CHECK-NEXT:    ret void
+; CHECK:       f:
+; CHECK-NEXT:    ret void
+;
+  br i1 %a_is_notnull, label %t, label %f
+t:
+  call void @use1nonnull(i8* %a)
+  ret void
+f:
+  ret void
+}
+
+; The callsite must execute in order for the attribute to transfer to the parent.
+; The volatile load might trap, so there's no guarantee that we'll ever get to the call.
+
+define i8 @parent6(i8* %a, i8* %b) {
+; CHECK-LABEL: @parent6(i8* %a, i8* %b)
+; CHECK-NEXT:    [[C:%.*]] = load volatile i8, i8* %b
+; CHECK-NEXT:    call void @use1nonnull(i8* %a)
+; CHECK-NEXT:    ret i8 [[C]]
+;
+  %c = load volatile i8, i8* %b
+  call void @use1nonnull(i8* %a)
+  ret i8 %c
+}
+
+; The nonnull callsite is guaranteed to execute, so the argument must be nonnull throughout the parent.
+
+define i8 @parent7(i8* %a) {
+; CHECK-LABEL: @parent7(i8* nonnull %a)
+; CHECK-NEXT:    [[RET:%.*]] = call i8 @use1safecall(i8* %a)
+; CHECK-NEXT:    call void @use1nonnull(i8* %a)
+; CHECK-NEXT:    ret i8 [[RET]]
+;
+  %ret = call i8 @use1safecall(i8* %a)
+  call void @use1nonnull(i8* %a)
+  ret i8 %ret
+}
+
+; Make sure that an invoke works similarly to a call.
+
+declare i32 @esfp(...)
+
+define i1 @parent8(i8* %a, i8* %bogus1, i8* %b) personality i8* bitcast (i32 (...)* @esfp to i8*){
+; CHECK-LABEL: @parent8(i8* nonnull %a, i8* nocapture readnone %bogus1, i8* nonnull %b)
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    invoke void @use2nonnull(i8* %a, i8* %b)
+; CHECK-NEXT:    to label %cont unwind label %exc
+; CHECK:       cont:
+; CHECK-NEXT:    [[NULL_CHECK:%.*]] = icmp eq i8* %b, null
+; CHECK-NEXT:    ret i1 [[NULL_CHECK]]
+; CHECK:       exc:
+; CHECK-NEXT:    [[LP:%.*]] = landingpad { i8*, i32 }
+; CHECK-NEXT:    filter [0 x i8*] zeroinitializer
+; CHECK-NEXT:    unreachable
+;
+entry:
+  invoke void @use2nonnull(i8* %a, i8* %b)
+  to label %cont unwind label %exc
+
+cont:
+  %null_check = icmp eq i8* %b, null
+  ret i1 %null_check
+
+exc:
+  %lp = landingpad { i8*, i32 }
+  filter [0 x i8*] zeroinitializer
+  unreachable
+}
 
diff --git a/test/Transforms/FunctionImport/funcimport.ll b/test/Transforms/FunctionImport/funcimport.ll
index 97c18488af646c689b6a1ee7cc2df287727eac7d..cc732a3bd98d7ccbf7fd934942de10edd76bae29 100644
--- a/test/Transforms/FunctionImport/funcimport.ll
+++ b/test/Transforms/FunctionImport/funcimport.ll
@@ -4,20 +4,16 @@
 ; RUN: llvm-lto -thinlto -print-summary-global-ids -o %t3 %t.bc %t2.bc 2>&1 | FileCheck %s --check-prefix=GUID
 
 ; Do the import now
-; RUN: opt -disable-force-link-odr -function-import -stats -print-imports -enable-import-metadata -summary-file %t3.thinlto.bc %t.bc -S 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=INSTLIMDEF
+; RUN: opt -function-import -stats -print-imports -enable-import-metadata -summary-file %t3.thinlto.bc %t.bc -S 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=INSTLIMDEF
 ; Try again with new pass manager
-; RUN: opt -disable-force-link-odr -passes='function-import' -stats -print-imports -enable-import-metadata -summary-file %t3.thinlto.bc %t.bc -S 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=INSTLIMDEF
+; RUN: opt -passes='function-import' -stats -print-imports -enable-import-metadata -summary-file %t3.thinlto.bc %t.bc -S 2>&1 | FileCheck %s --check-prefix=CHECK --check-prefix=INSTLIMDEF
 ; "-stats" requires +Asserts.
 ; REQUIRES: asserts
 
 ; Test import with smaller instruction limit
-; RUN: opt -disable-force-link-odr -function-import -enable-import-metadata  -summary-file %t3.thinlto.bc %t.bc -import-instr-limit=5 -S | FileCheck %s --check-prefix=CHECK --check-prefix=INSTLIM5
+; RUN: opt -function-import -enable-import-metadata  -summary-file %t3.thinlto.bc %t.bc -import-instr-limit=5 -S | FileCheck %s --check-prefix=CHECK --check-prefix=INSTLIM5
 ; INSTLIM5-NOT: @staticfunc.llvm.
 
-; Test import with smaller instruction limit and without the -disable-force-link-odr
-; RUN: opt -function-import -summary-file %t3.thinlto.bc %t.bc -import-instr-limit=5 -S | FileCheck %s --check-prefix=INSTLIM5ODR
-; INSTLIM5ODR: define linkonce_odr void @linkonceodr() {
-
 
 define i32 @main() #0 {
 entry:
@@ -44,10 +40,12 @@ declare void @weakalias(...) #1
 ; CHECK-DAG: declare void @analias
 declare void @analias(...) #1
 
-; FIXME: Add this checking back when follow on fix to add alias summary
-; records is committed.
 ; Aliases import the aliasee function
 declare void @linkoncealias(...) #1
+; INSTLIMDEF-DAG: Import linkoncealias
+; INSTLIMDEF-DAG: Import linkoncefunc
+; CHECK-DAG: define linkonce_odr void @linkoncefunc()
+; CHECK-DAG: @linkoncealias = alias void (...), bitcast (void ()* @linkoncefunc to void (...)*
 
 ; INSTLIMDEF-DAG: Import referencestatics
 ; INSTLIMDEF-DAG: define available_externally i32 @referencestatics(i32 %i) !thinlto_src_module !0 {
diff --git a/test/Transforms/FunctionImport/unnamed-globals.ll b/test/Transforms/FunctionImport/unnamed-globals.ll
new file mode 100644
index 0000000000000000000000000000000000000000..167fad28f439e2541bcbcd22523f8e932364735d
--- /dev/null
+++ b/test/Transforms/FunctionImport/unnamed-globals.ll
@@ -0,0 +1,10 @@
+; Make sure we don't crash when referencing an unnamed global.
+; RUN: opt %s -module-summary-analysis -S
+
+@0 = external global [1 x { i64 }]
+
+define internal void @tinkywinky() {
+  call void @patatino(i64 ptrtoint ([1 x { i64 }]* @0 to i64), i64 4)
+  ret void
+}
+declare void @patatino(i64, i64)
diff --git a/test/Transforms/GVN/PRE/rle-addrspace-cast.ll b/test/Transforms/GVN/PRE/rle-addrspace-cast.ll
index 07fd7c11d1b5059f1db594f3e9700e21575fe751..d8de5b360ba186f94583b82a8142fc45cc0455f8 100644
--- a/test/Transforms/GVN/PRE/rle-addrspace-cast.ll
+++ b/test/Transforms/GVN/PRE/rle-addrspace-cast.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -default-data-layout="e-p:32:32:32-p1:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -basicaa -gvn -S -die | FileCheck %s
+; RUN: opt < %s -data-layout="e-p:32:32:32-p1:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -basicaa -gvn -S -die | FileCheck %s
 
 define i8 @coerce_offset0_addrspacecast(i32 %V, i32* %P) {
   store i32 %V, i32* %P
diff --git a/test/Transforms/GVN/PRE/rle.ll b/test/Transforms/GVN/PRE/rle.ll
index c1946faab20e3c70045b42f6d188735ca991d6dd..1d2cba2f1f6460e105493bd37c36520d54973749 100644
--- a/test/Transforms/GVN/PRE/rle.ll
+++ b/test/Transforms/GVN/PRE/rle.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -default-data-layout="e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -basicaa -gvn -S -die | FileCheck %s
-; RUN: opt < %s -default-data-layout="E-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-n32"      -basicaa -gvn -S -die | FileCheck %s
+; RUN: opt < %s -data-layout="e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -basicaa -gvn -S -die | FileCheck %s
+; RUN: opt < %s -data-layout="E-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-n32"      -basicaa -gvn -S -die | FileCheck %s
 
 ;; Trivial RLE test.
 define i32 @test0(i32 %V, i32* %P) {
diff --git a/test/Transforms/GVN/cond_br2.ll b/test/Transforms/GVN/cond_br2.ll
index baa282ec200cf323418e2d9a9404e32f10840620..a3749510cb4ac9bc7f25a6d469e22796032a5027 100644
--- a/test/Transforms/GVN/cond_br2.ll
+++ b/test/Transforms/GVN/cond_br2.ll
@@ -18,7 +18,7 @@ define void @_Z4testv() #0 personality i8* bitcast (i32 (...)* @__gxx_personalit
 entry:
   %sv = alloca %"class.llvm::SmallVector", align 16
   %0 = bitcast %"class.llvm::SmallVector"* %sv to i8*
-  call void @llvm.lifetime.start(i64 64, i8* %0) #1
+  call void @llvm.lifetime.start.p0i8(i64 64, i8* %0) #1
   %BeginX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 0
   %FirstEl.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 3
   %1 = bitcast %"union.llvm::SmallVectorBase::U"* %FirstEl.i.i.i.i.i.i to i8*
@@ -94,7 +94,7 @@ if.then.i.i.i20:                                  ; preds = %invoke.cont3
   br label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21
 
 _ZN4llvm11SmallVectorIiLj8EED1Ev.exit21:          ; preds = %invoke.cont3, %if.then.i.i.i20
-  call void @llvm.lifetime.end(i64 64, i8* %0) #1
+  call void @llvm.lifetime.end.p0i8(i64 64, i8* %0) #1
   ret void
 
 lpad:                                             ; preds = %if.end.i14, %if.end.i, %invoke.cont2
@@ -113,14 +113,14 @@ eh.resume:                                        ; preds = %if.then.i.i.i, %lpa
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 
 declare i32 @__gxx_personality_v0(...)
 
 declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(%"class.llvm::SmallVector"*) #2
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(%"class.llvm::SmallVectorBase"*, i64, i64) #2
 
diff --git a/test/Transforms/GVN/debugloc.ll b/test/Transforms/GVN/debugloc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d8c1632d144210e93eeb785ae54346bbe17cfd80
--- /dev/null
+++ b/test/Transforms/GVN/debugloc.ll
@@ -0,0 +1,77 @@
+; RUN: opt < %s -gvn -S | FileCheck %s
+; CHECK: {{^}}for.body:
+; CHECK-NEXT: [[VREG1:%[^ ]+]] = phi{{.*}}[[VREG2:%[^ ]+]],{{.*}}%.sink,
+; CHECK-NOT: !dbg
+; CHECK-SAME: {{$}}
+; CHECK: {{^}}for.inc:
+; CHECK-NEXT: [[VREG2]] = phi{{.*}}%inc,{{.*}}[[VREG1]]
+
+target triple = "x86_64-unknown-linux-gnu"
+
+@g = external local_unnamed_addr global i32, align 4
+
+; Function Attrs: nounwind uwtable
+define void @foo(i32 %x, i32 %y, i32 %z) local_unnamed_addr #0 !dbg !4 {
+entry:
+  %not.tobool = icmp eq i32 %x, 0, !dbg !8
+  %.sink = zext i1 %not.tobool to i32, !dbg !8
+  store i32 %.sink, i32* @g, align 4, !tbaa !9
+  %cmp8 = icmp sgt i32 %y, 0, !dbg !13
+  br i1 %cmp8, label %for.body.preheader, label %for.end, !dbg !17
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body, !dbg !19
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %i.09 = phi i32 [ %inc4, %for.inc ], [ 0, %for.body.preheader ]
+  %cmp1 = icmp sgt i32 %i.09, %z, !dbg !19
+  br i1 %cmp1, label %if.then2, label %for.inc, !dbg !21
+
+if.then2:                                         ; preds = %for.body
+  %0 = load i32, i32* @g, align 4, !dbg !22, !tbaa !9
+  %inc = add nsw i32 %0, 1, !dbg !22
+  store i32 %inc, i32* @g, align 4, !dbg !22, !tbaa !9
+  br label %for.inc, !dbg !23
+
+for.inc:                                          ; preds = %for.body, %if.then2
+  %inc4 = add nuw nsw i32 %i.09, 1, !dbg !24
+  %exitcond = icmp ne i32 %inc4, %y, !dbg !13
+  br i1 %exitcond, label %for.body, label %for.end.loopexit, !dbg !17
+
+for.end.loopexit:                                 ; preds = %for.inc
+  br label %for.end, !dbg !26
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void, !dbg !26
+}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1)
+!1 = !DIFile(filename: "foo.c", directory: "b/")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!5 = !DISubroutineType(types: !6)
+!6 = !{null, !7, !7, !7}
+!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!8 = !DILocation(line: 4, column: 7, scope: !4)
+!9 = !{!10, !10, i64 0}
+!10 = !{!"int", !11, i64 0}
+!11 = !{!"omnipotent char", !12, i64 0}
+!12 = !{!"Simple C/C++ TBAA"}
+!13 = !DILocation(line: 10, column: 13, scope: !14)
+!14 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 1)
+!15 = distinct !DILexicalBlock(scope: !16, file: !1, line: 10, column: 3)
+!16 = distinct !DILexicalBlock(scope: !4, file: !1, line: 10, column: 3)
+!17 = !DILocation(line: 10, column: 3, scope: !18)
+!18 = !DILexicalBlockFile(scope: !16, file: !1, discriminator: 1)
+!19 = !DILocation(line: 11, column: 11, scope: !20)
+!20 = distinct !DILexicalBlock(scope: !15, file: !1, line: 11, column: 9)
+!21 = !DILocation(line: 11, column: 9, scope: !15)
+!22 = !DILocation(line: 12, column: 8, scope: !20)
+!23 = !DILocation(line: 12, column: 7, scope: !20)
+!24 = !DILocation(line: 10, column: 20, scope: !25)
+!25 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 2)
+!26 = !DILocation(line: 13, column: 1, scope: !4)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
diff --git a/test/Transforms/GVN/invariant.group.ll b/test/Transforms/GVN/invariant.group.ll
index 6f1f357cad65645f5b6fa612319ca32916fac217..570519bec520c13b82f98f70b3b6cfb74dc20b92 100644
--- a/test/Transforms/GVN/invariant.group.ll
+++ b/test/Transforms/GVN/invariant.group.ll
@@ -382,12 +382,12 @@ define void @testNotGlobal() {
 
    %b0 = bitcast i8* %a to i1*
    call void @fooBit(i1* %b0, i1 1)
-; CHECK: %trunc = trunc i8 %b to i1
+; CHECK: %1 = trunc i8 %b to i1
    %2 = load i1, i1* %b0, !invariant.group !0
-; CHECK-NEXT: call void @fooBit(i1* %b0, i1 %trunc)
+; CHECK-NEXT: call void @fooBit(i1* %b0, i1 %1)
    call void @fooBit(i1* %b0, i1 %2)
    %3 = load i1, i1* %b0, !invariant.group !0
-; CHECK-NEXT: call void @fooBit(i1* %b0, i1 %trunc)
+; CHECK-NEXT: call void @fooBit(i1* %b0, i1 %1)
    call void @fooBit(i1* %b0, i1 %3)
    ret void
 }
diff --git a/test/Transforms/GVN/lifetime-simple.ll b/test/Transforms/GVN/lifetime-simple.ll
index d03b62c8158a55ab1b68bc5c17122eedb61f742f..8da3e4cbd30f07055cdfc135be9f5c0ecacb88ea 100644
--- a/test/Transforms/GVN/lifetime-simple.ll
+++ b/test/Transforms/GVN/lifetime-simple.ll
@@ -8,13 +8,13 @@ define i8 @test(i8* %P) nounwind {
 ; CHECK-NOT: load
 ; CHECK: lifetime.end
 entry:
-  call void @llvm.lifetime.start(i64 32, i8* %P)
+  call void @llvm.lifetime.start.p0i8(i64 32, i8* %P)
   %0 = load i8, i8* %P
   store i8 1, i8* %P
-  call void @llvm.lifetime.end(i64 32, i8* %P)
+  call void @llvm.lifetime.end.p0i8(i64 32, i8* %P)
   %1 = load i8, i8* %P
   ret i8 %1
 }
 
-declare void @llvm.lifetime.start(i64 %S, i8* nocapture %P) readonly
-declare void @llvm.lifetime.end(i64 %S, i8* nocapture %P)
+declare void @llvm.lifetime.start.p0i8(i64 %S, i8* nocapture %P) readonly
+declare void @llvm.lifetime.end.p0i8(i64 %S, i8* nocapture %P)
diff --git a/test/Transforms/GVNHoist/hoist-pr31891.ll b/test/Transforms/GVNHoist/hoist-pr31891.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3f6a22fc54a6a5bd7ddc5ba2b323a4c058b658fa
--- /dev/null
+++ b/test/Transforms/GVNHoist/hoist-pr31891.ll
@@ -0,0 +1,83 @@
+; RUN: opt -S -gvn-hoist < %s | FileCheck %s
+
+; Hoisted inlinable calls need to have accurate scope information, but we're
+; allowed to erase the line information.
+
+source_filename = "t.c"
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.0.24215"
+
+; Function Attrs: noinline nounwind readnone uwtable
+define float @fabsf(float %f) #0 !dbg !7 {
+entry:
+  %conv = fpext float %f to double, !dbg !9
+  %call = call double @fabs(double %conv) #1, !dbg !10
+  %conv1 = fptrunc double %call to float, !dbg !11
+  ret float %conv1, !dbg !12
+}
+
+; Function Attrs: nounwind readnone
+declare double @fabs(double) #1
+
+; Function Attrs: noinline nounwind uwtable
+define void @hoistit(i32 %cond, float %f) #2 !dbg !13 {
+entry:
+  %tobool = icmp ne i32 %cond, 0, !dbg !14
+  br i1 %tobool, label %if.then, label %if.else, !dbg !14
+
+if.then:                                          ; preds = %entry
+  %call = call float @fabsf(float %f) #1, !dbg !15
+  call void @useit1(float %call), !dbg !16
+  br label %if.end, !dbg !18
+
+if.else:                                          ; preds = %entry
+  %call1 = call float @fabsf(float %f) #1, !dbg !19
+  call void @useit2(float %call1), !dbg !20
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  ret void, !dbg !21
+}
+
+; CHECK-LABEL: define void @hoistit
+; CHECK-SAME: 		!dbg ![[sp_hoistit:[0-9]+]]
+; CHECK: call float @fabsf(float %f) {{.*}} !dbg ![[dbgloc:[0-9]+]]
+; CHECK: br i1 %tobool, label %if.then, label %if.else
+
+; CHECK: ![[sp_hoistit]] = distinct !DISubprogram(name: "hoistit", {{.*}})
+; CHECK: ![[dbgloc]] = !DILocation({{.*}}, scope: ![[sp_hoistit]])
+
+declare void @useit1(float)
+
+declare void @useit2(float)
+
+attributes #0 = { noinline nounwind readnone uwtable }
+attributes #1 = { nounwind readnone }
+attributes #2 = { noinline nounwind uwtable }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 ", isOptimized: false, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "t.c", directory: "C:\5Csrc\5Cllvm\5Cbuild")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"PIC Level", i32 2}
+!6 = !{!"clang version 5.0.0 "}
+!7 = distinct !DISubprogram(name: "fabsf", scope: !1, file: !1, line: 4, type: !8, isLocal: false, isDefinition: true, scopeLine: 4, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 5, column: 22, scope: !7)
+!10 = !DILocation(line: 5, column: 17, scope: !7)
+!11 = !DILocation(line: 5, column: 10, scope: !7)
+!12 = !DILocation(line: 5, column: 3, scope: !7)
+!13 = distinct !DISubprogram(name: "hoistit", scope: !1, file: !1, line: 7, type: !8, isLocal: false, isDefinition: true, scopeLine: 7, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!14 = !DILocation(line: 8, column: 7, scope: !13)
+!15 = !DILocation(line: 9, column: 12, scope: !13)
+!16 = !DILocation(line: 9, column: 5, scope: !17)
+!17 = !DILexicalBlockFile(scope: !13, file: !1, discriminator: 1)
+!18 = !DILocation(line: 10, column: 3, scope: !13)
+!19 = !DILocation(line: 11, column: 12, scope: !13)
+!20 = !DILocation(line: 11, column: 5, scope: !17)
+!21 = !DILocation(line: 13, column: 1, scope: !13)
diff --git a/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll b/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll
new file mode 100644
index 0000000000000000000000000000000000000000..654d5b6a5585f091f821428d8e3c3177af9d23ff
--- /dev/null
+++ b/test/Transforms/GVNHoist/hoist-unsafe-pr31729.ll
@@ -0,0 +1,81 @@
+; RUN: opt -gvn-hoist -S < %s | FileCheck %s
+
+; Check that urem is not hoisted.
+; CHECK-LABEL: @main
+; CHECK: urem
+; CHECK: urem
+; CHECK: urem
+
+@g_x_s = global i32 -470211272, align 4
+@g_z_s = global i32 2007237709, align 4
+@g_x_u = global i32 282475249, align 4
+@g_z_u = global i32 984943658, align 4
+@g_m = global i32 16807, align 4
+@res = common global i32 0, align 4
+
+; Helper called three times by @main below; it always returns the constant 1.
+define i64 @func() #0 {
+entry:
+  ret i64 1
+}
+
+; Three guarded udiv/urem sequences; lor.lhs.false and lor.lhs.false19 compute the same expression, and the test expects each urem to stay in its own guarded block.
+define i32 @main() {
+entry:
+  %0 = load volatile i32, i32* @g_x_s, align 4
+  %1 = load volatile i32, i32* @g_z_s, align 4
+  %2 = load volatile i32, i32* @g_x_u, align 4 ; %2 is not referenced again in this function
+  %3 = load volatile i32, i32* @g_z_u, align 4
+  %4 = load volatile i32, i32* @g_m, align 4
+  %call = call i64 @func() #4 ; NOTE(review): attribute group #4 is not defined in this file (only #0 is) - confirm this is intended
+  %conv = sext i32 %1 to i64
+  %cmp = icmp ne i64 %call, %conv
+  br i1 %cmp, label %if.end, label %lor.lhs.false ; the division below is only reached when %call equals %conv
+
+lor.lhs.false:
+  %div = udiv i32 %4, %1
+  %rem = urem i32 %0, %div
+  %cmp2 = icmp eq i32 %rem, 0
+  br i1 %cmp2, label %if.end, label %if.then
+
+if.then:
+  br label %cleanup
+
+if.end:
+  %call4 = call i64 @func() #4
+  %conv5 = zext i32 %3 to i64
+  %cmp6 = icmp ne i64 %call4, %conv5
+  br i1 %cmp6, label %if.end14, label %lor.lhs.false8
+
+lor.lhs.false8:
+  %div9 = udiv i32 %4, %3 ; divisor is %3 here, unlike the other two sequences which divide by %1
+  %rem10 = urem i32 %0, %div9
+  %cmp11 = icmp eq i32 %rem10, 0
+  br i1 %cmp11, label %if.end14, label %if.then13
+
+if.then13:
+  br label %cleanup
+
+if.end14:
+  %call15 = call i64 @func() #4
+  %cmp17 = icmp ne i64 %call15, %conv ; reuses %conv computed in the entry block
+  br i1 %cmp17, label %if.end25, label %lor.lhs.false19
+
+lor.lhs.false19:
+  %div20 = udiv i32 %4, %1 ; same operands as %div in lor.lhs.false
+  %rem21 = urem i32 %0, %div20 ; same expression as %rem in lor.lhs.false
+  %cmp22 = icmp eq i32 %rem21, 0
+  br i1 %cmp22, label %if.end25, label %if.then24
+
+if.then24:
+  br label %cleanup
+
+if.end25:
+  br label %cleanup
+
+cleanup:
+  %retval.0 = phi i32 [ 0, %if.end25 ], [ 1, %if.then24 ], [ 1, %if.then13 ], [ 1, %if.then ] ; 1 from any if.then* block, 0 otherwise
+  ret i32 %retval.0
+}
+
+attributes #0 = { minsize noinline nounwind optsize uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+fxsr,+mmx,+sse,+sse2,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/Transforms/GVNHoist/hoist-very-busy.ll b/test/Transforms/GVNHoist/hoist-very-busy.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f421eff9921af090b37567f94766010e695f0d4d
--- /dev/null
+++ b/test/Transforms/GVNHoist/hoist-very-busy.ll
@@ -0,0 +1,55 @@
+; RUN: opt -S -gvn-hoist < %s | FileCheck %s
+
+%struct.__jmp_buf_tag = type { [8 x i64], i32 }
+
+; Check that hoisting only happens when the expression is very busy.
+; CHECK: store
+; CHECK: store
+
+@test_exit_buf = global %struct.__jmp_buf_tag zeroinitializer
+@G = global i32 0
+
+define void @test_command(i32 %c1) { ; the store to @G is executed on the sw0 and sw1 paths only
+entry:
+  switch i32 %c1, label %exit [
+    i32 0, label %sw0
+    i32 1, label %sw1
+  ] ; the default edge goes straight to %exit and bypasses both stores
+
+sw0:
+  store i32 1, i32* @G
+  br label %exit
+
+sw1:
+  store i32 1, i32* @G ; identical to the store in sw0; the test expects both stores to remain
+  br label %exit
+
+exit:
+  call void @longjmp(%struct.__jmp_buf_tag* @test_exit_buf, i32 1) #0 ; #0 marks the callee noreturn nounwind
+  unreachable
+}
+
+declare void @longjmp(%struct.__jmp_buf_tag*, i32) #0
+
+attributes #0 = { noreturn nounwind }
+
+; Check that the store is hoisted.
+; CHECK-LABEL: define void @fun(
+; CHECK: store
+; CHECK-NOT: store
+
+define void @fun() { ; both successors of if.then perform the same store to @G
+entry:
+  br label %if.then
+
+if.then:                                          ; preds = %entry
+  br i1 undef, label %sw0, label %sw1
+
+sw0:
+  store i32 1, i32* @G
+  unreachable
+
+sw1:
+  store i32 1, i32* @G ; identical to the store in sw0; the test expects a single store in the output
+  ret void
+}
diff --git a/test/Transforms/GVNHoist/pr29034.ll b/test/Transforms/GVNHoist/pr29034.ll
index 5e725ad38c8656f5a7b4db3abca3d94b90651c25..c0fcc3e741a8113367d89cd468bf86b63cfd9170 100644
--- a/test/Transforms/GVNHoist/pr29034.ll
+++ b/test/Transforms/GVNHoist/pr29034.ll
@@ -38,7 +38,7 @@ define void @music_task(i8* nocapture readnone %p) local_unnamed_addr {
 entry:
   %mapi = alloca %struct._MUSIC_OP_API_*, align 8
   %0 = bitcast %struct._MUSIC_OP_API_** %mapi to i8*
-  call void @llvm.lifetime.start(i64 8, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* %0)
   store %struct._MUSIC_OP_API_* null, %struct._MUSIC_OP_API_** %mapi, align 8, !tbaa !1
   %call = call i32 @music_decoder_init(%struct._MUSIC_OP_API_** nonnull %mapi)
   br label %while.cond
@@ -103,7 +103,7 @@ while.cond2.backedge:                             ; preds = %sw.default, %sw.bb1
   br label %while.cond2
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
 declare i32 @music_decoder_init(%struct._MUSIC_OP_API_**)
 declare i32 @music_play_api(%struct._MUSIC_OP_API_*, i32, i32, i32, i8*)
 declare i32 @printf(i8* nocapture readonly, ...)
diff --git a/test/Transforms/GlobalOpt/2009-03-05-dbg.ll b/test/Transforms/GlobalOpt/2009-03-05-dbg.ll
index c785e13403cc6ab940b88f1f15b77add55795fe7..da82b01560b3667974313bd6f590b54fabff1832 100644
--- a/test/Transforms/GlobalOpt/2009-03-05-dbg.ll
+++ b/test/Transforms/GlobalOpt/2009-03-05-dbg.ll
@@ -77,4 +77,4 @@ attributes #1 = { nounwind readnone }
 !6 = !{i32 2, !"Dwarf Version", i32 2}
 !7 = !{i32 2, !"Debug Info Version", i32 3}
 !8 = !DILocalVariable(name: "i", arg: 1, scope: !9, file: !3, line: 4, type: !5)
-!9 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !2, line: 4, type: !10, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !2)!10 = !DISubroutineType(types: !11)!11 = !{!5, !5}!12 = !DIExpression()!13 = !DILocation(line: 5, scope: !14)!14 = distinct !DILexicalBlock(scope: !9, file: !3)!15 = !DILocation(line: 6, scope: !14)!16 = !DILocation(line: 7, scope: !14)!17 = !DILocation(line: 9, scope: !14)!18 = !DILocation(line: 11, scope: !14)!19 = !DILocation(line: 14, scope: !20)!20 = distinct !DILexicalBlock(scope: !21, file: !3)!21 = distinct !DISubprogram(name: "bar", linkageName: "bar", scope: !2, line: 13, type: !22, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !2)!22 = !DISubroutineType(types: !23)!23 = !{!5}!24 = !DILocation(line: 15, scope: !20)!25 = !DILocation(line: 16, scope: !20)
\ No newline at end of file
+!9 = distinct !DISubprogram(name: "foo", linkageName: "foo", scope: !2, file: !3, line: 4, type: !10, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !2)!10 = !DISubroutineType(types: !11)!11 = !{!5, !5}!12 = !DIExpression()!13 = !DILocation(line: 5, scope: !14)!14 = distinct !DILexicalBlock(scope: !9, file: !3)!15 = !DILocation(line: 6, scope: !14)!16 = !DILocation(line: 7, scope: !14)!17 = !DILocation(line: 9, scope: !14)!18 = !DILocation(line: 11, scope: !14)!19 = !DILocation(line: 14, scope: !20)!20 = distinct !DILexicalBlock(scope: !21, file: !3)!21 = distinct !DISubprogram(name: "bar", linkageName: "bar", scope: !2, file: !3, line: 13, type: !22, isLocal: false, isDefinition: true, virtualIndex: 6, isOptimized: false, unit: !2)!22 = !DISubroutineType(types: !23)!23 = !{!5}!24 = !DILocation(line: 15, scope: !20)!25 = !DILocation(line: 16, scope: !20)
diff --git a/test/Transforms/GlobalSplit/basic.ll b/test/Transforms/GlobalSplit/basic.ll
index a0aaeffb6c3f3f00f29d81249919800268d84d2a..6834a8d18be9784c9395190dbd48a0294b335abc 100644
--- a/test/Transforms/GlobalSplit/basic.ll
+++ b/test/Transforms/GlobalSplit/basic.ll
@@ -12,13 +12,13 @@ target triple = "x86_64-unknown-linux-gnu"
 ]
 
 ; CHECK-NOT: @global =
-; CHECK: @global.0 = private constant [2 x i8* ()*] [i8* ()* @f1, i8* ()* @f2], !type [[T1:![0-9]+$]]
-; CHECK: @global.1 = private constant [1 x i8* ()*] [i8* ()* @f3], !type [[T2:![0-9]+$]]
+; CHECK: @global.0 = private constant [2 x i8* ()*] [i8* ()* @f1, i8* ()* @f2], !type [[T1:![0-9]+]], !type [[T2:![0-9]+]], !type [[T3:![0-9]+$]]
+; CHECK: @global.1 = private constant [1 x i8* ()*] [i8* ()* @f3], !type [[T4:![0-9]+]], !type [[T5:![0-9]+$]]
 ; CHECK-NOT: @global =
 @global = internal constant { [2 x i8* ()*], [1 x i8* ()*] } {
   [2 x i8* ()*] [i8* ()* @f1, i8* ()* @f2],
   [1 x i8* ()*] [i8* ()* @f3]
-}, !type !0, !type !1
+}, !type !0, !type !1, !type !2, !type !3, !type !4
 
 ; CHECK: define i8* @f1()
 define i8* @f1() {
@@ -51,7 +51,13 @@ define void @foo() {
 
 declare i1 @llvm.type.test(i8*, metadata) nounwind readnone
 
-; CHECK: [[T1]] = !{i32 8, !"foo"}
-; CHECK: [[T2]] = !{i32 0, !"bar"}
-!0 = !{i32 8, !"foo"}
-!1 = !{i32 16, !"bar"}
+; CHECK: [[T1]] = !{i32 0, !"foo"}
+; CHECK: [[T2]] = !{i32 15, !"bar"}
+; CHECK: [[T3]] = !{i32 16, !"a"}
+; CHECK: [[T4]] = !{i32 1, !"b"}
+; CHECK: [[T5]] = !{i32 8, !"c"}
+!0 = !{i32 0, !"foo"}
+!1 = !{i32 15, !"bar"}
+!2 = !{i32 16, !"a"}
+!3 = !{i32 17, !"b"}
+!4 = !{i32 24, !"c"}
diff --git a/test/Transforms/IPConstantProp/naked-return.ll b/test/Transforms/IPConstantProp/naked-return.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3a2dedafcd3758b002c3068eb59a1c3864c8c10d
--- /dev/null
+++ b/test/Transforms/IPConstantProp/naked-return.ll
@@ -0,0 +1,29 @@
+; RUN: opt -ipsccp -S %s | FileCheck %s
+; RUN: opt -ipconstprop -S %s | FileCheck %s
+
+target datalayout = "e-m:x-p:32:32-i64:64-f80:32-n8:16:32-a:0:32-S32"
+target triple = "i686-pc-windows-msvc19.0.24215"
+
+define i32 @dipsy(i32, i32) local_unnamed_addr #0 { ; #0 is the "naked" attribute group defined at the end of the file
+BasicBlock0:
+  call void asm "\0D\0Apushl %ebp\0D\0Amovl 8(%esp),%eax\0D\0Amovl 12(%esp), %ebp\0D\0Acalll *%eax\0D\0Apopl %ebp\0D\0Aretl\0D\0A", ""()
+  ret i32 0 ; IR-level return value is the constant 0; the inline asm above ends with its own retl
+}
+
+define void @tinkywinky(i32, i32, i32) local_unnamed_addr #0 { ; #0 is the "naked" attribute group defined at the end of the file
+BasicBlock1:
+  call void asm "\0D\0A    movl 12(%esp), %ebp\0D\0A    movl 4(%esp), %eax\0D\0A    movl 8(%esp), %esp\0D\0A    jmpl *%eax\0D\0A", ""()
+  ret void ; the inline asm above ends with an indirect jump (jmpl *%eax)
+}
+
+define void @patatino(i32, i32, i32) local_unnamed_addr #1 {
+bb:
+  %3 = tail call i32 @dipsy(i32 %0, i32 %1) #0 ; @dipsy is naked and its IR body is "ret i32 0"
+; Check that the 0 from @dipsy's IR-level return is not propagated: %3 must still be passed to the call below.
+; CHECK: @tinkywinky(i32 %3, i32 %2, i32 %1) #0
+  tail call void @tinkywinky(i32 %3, i32 %2, i32 %1) #0
+  ret void
+}
+
+attributes #0 = { naked }
+attributes #1 = { "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
diff --git a/test/Transforms/IRCE/bad-loop-structure.ll b/test/Transforms/IRCE/bad-loop-structure.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9c2e4251423d42298809e91bfbe03e45e1e6a571
--- /dev/null
+++ b/test/Transforms/IRCE/bad-loop-structure.ll
@@ -0,0 +1,45 @@
+; RUN: opt -S -irce -irce-print-changed-loops=true < %s | FileCheck %s
+
+; CHECK-NOT: irce
+
+define void @bad_loop_structure_increasing(i64 %iv.start) { ; loop whose induction variable starts at the caller-supplied %iv.start and steps by +1
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %iv.start, %entry ], [ %indvars.iv.next, %for.inc ] ; initial value is the unconstrained argument
+  %cmp = icmp ult i64 %indvars.iv, 100 ; unsigned bound test against 100 inside the loop body
+  br i1 %cmp, label %switch.lookup, label %for.inc
+
+switch.lookup:
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %cmp55 = icmp slt i64 %indvars.iv.next, 11 ; latch: keep looping while the incremented value is signed-less-than 11
+  br i1 %cmp55, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+define void @bad_loop_structure_decreasing(i64 %iv.start) { ; same shape as the increasing variant, but the induction variable steps by -1
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %iv.start, %entry ], [ %indvars.iv.next, %for.inc ] ; initial value is the unconstrained argument
+  %cmp = icmp ult i64 %indvars.iv, 100 ; unsigned bound test against 100 inside the loop body
+  br i1 %cmp, label %switch.lookup, label %for.inc
+
+switch.lookup:
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, -1 ; note: the -1 step carries both nuw and nsw flags
+  %cmp55 = icmp sgt i64 %indvars.iv.next, 11 ; latch: keep looping while the decremented value is signed-greater-than 11
+  br i1 %cmp55, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/IndVarSimplify/2011-11-01-lftrptr.ll b/test/Transforms/IndVarSimplify/2011-11-01-lftrptr.ll
index 402ae8cc05d0ab9cf24ea4cb3301cecaaee65613..b9d571d9b64f51463f8531eedff8eb381897d86f 100644
--- a/test/Transforms/IndVarSimplify/2011-11-01-lftrptr.ll
+++ b/test/Transforms/IndVarSimplify/2011-11-01-lftrptr.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -indvars -S "-default-data-layout=e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" | FileCheck %s
-; RUN: opt < %s -indvars -S "-default-data-layout=e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" | FileCheck %s
+; RUN: opt < %s -indvars -S "-data-layout=e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" | FileCheck %s
+; RUN: opt < %s -indvars -S "-data-layout=e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32" | FileCheck %s
 ;
 ; PR11279: Assertion !IVLimit->getType()->isPointerTy()
 ;
diff --git a/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll b/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll
index aa4fb8e68eb334fe00e3919055aba3e91e3048e4..36c7bd9c5ec35a89afc31352981dda17e374f7c3 100644
--- a/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll
+++ b/test/Transforms/IndVarSimplify/AMDGPU/no-widen-to-i64.ll
@@ -14,7 +14,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
 ; CHECK-LABEL: @indvar_32_bit(
 ; CHECK-NOT: sext i32
 ; CHECK: phi i32
-define void @indvar_32_bit(i32 %n, i32* nocapture %output) {
+define amdgpu_kernel void @indvar_32_bit(i32 %n, i32* nocapture %output) {
 entry:
   %cmp5 = icmp sgt i32 %n, 0
   br i1 %cmp5, label %for.body.preheader, label %for.end
@@ -46,7 +46,7 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ; CHECK-NOT: ashr i64
 ; CHECK-NOT: mul nsw i64
 ; CHECK-NOT: add nsw i64
-define void @no_promote_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
+define amdgpu_kernel void @no_promote_i32(i32 addrspace(1)* %out, i32 %a, i32 %b) {
 entry:
   br label %for.body
 
@@ -72,7 +72,7 @@ for.end:
 ; be legalized anyway.
 
 ; CHECK-LABEL: @indvar_48_bit(
-define void @indvar_48_bit(i48 %n, i48* nocapture %output) {
+define amdgpu_kernel void @indvar_48_bit(i48 %n, i48* nocapture %output) {
 entry:
   %cmp5 = icmp sgt i48 %n, 0
   br i1 %cmp5, label %for.body.preheader, label %for.end
diff --git a/test/Transforms/IndVarSimplify/exit_value_test2.ll b/test/Transforms/IndVarSimplify/exit_value_test2.ll
index 24e3e95a8918275982aa06a59c4dc93593d3484f..ee641667506c9c1d5e837a70b3b2e9d430f08a13 100644
--- a/test/Transforms/IndVarSimplify/exit_value_test2.ll
+++ b/test/Transforms/IndVarSimplify/exit_value_test2.ll
@@ -8,14 +8,14 @@
 ; CHECK-NOT: udiv
 
 declare void @_Z3mixRjj(i32* dereferenceable(4), i32)
-declare void @llvm.lifetime.start(i64, i8* nocapture)
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 
 define i32 @_Z3fooPKcjj(i8* nocapture readonly %s, i32 %len, i32 %c) {
 entry:
   %a = alloca i32, align 4
   %tmp = bitcast i32* %a to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %tmp)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %tmp)
   store i32 -1640531527, i32* %a, align 4
   %cmp8 = icmp ugt i32 %len, 11
   br i1 %cmp8, label %while.body.lr.ph, label %while.end
@@ -47,6 +47,6 @@ while.end:                                        ; preds = %while.cond.while.en
   %keylen.0.lcssa = phi i32 [ %sub.lcssa, %while.cond.while.end_crit_edge ], [ %len, %entry ]
   call void @_Z3mixRjj(i32* dereferenceable(4) %a, i32 %keylen.0.lcssa)
   %tmp4 = load i32, i32* %a, align 4
-  call void @llvm.lifetime.end(i64 4, i8* %tmp)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %tmp)
   ret i32 %tmp4
 }
diff --git a/test/Transforms/IndVarSimplify/pr32045.ll b/test/Transforms/IndVarSimplify/pr32045.ll
new file mode 100644
index 0000000000000000000000000000000000000000..31efac3f833c1e6f255f98b7e87c424a040bc708
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/pr32045.ll
@@ -0,0 +1,39 @@
+; RUN: opt -S -indvars < %s | FileCheck %s
+
+; This is not an IndVarSimplify bug, but the original symptom
+; manifested as one.
+
+define i32 @foo(i32 %a, i32 %b, i32 %c, i32* %sink) {
+; CHECK-LABEL: @foo(
+; CHECK:       for.end:
+; CHECK-NEXT:    [[SHR:%.*]] = ashr i32 %neg3, -1
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 0, [[SHR]]
+; CHECK-NEXT:    [[SHR1:%.*]] = ashr i32 [[SUB]], [[B:%.*]]
+; CHECK-NEXT:    [[NEG:%.*]] = xor i32 [[SHR1]], -1
+; CHECK-NEXT:    store i32 [[NEG]], i32* %sink
+; The inner loop leaves %p at -1, which is the shift amount expected in for.end above.
+entry:
+  %tobool2 = icmp eq i32 %a, 0
+  br i1 %tobool2, label %exit, label %preheader
+
+preheader:
+  %neg3 = phi i32 [ %c, %entry ], [ %neg, %for.end ] ; %c on first entry, afterwards the %neg stored by the previous outer iteration
+  br label %for
+
+for:
+  %p = phi i32 [ %dec, %for ], [ 1, %preheader ] ; takes the values 1, 0, -1
+  %cmp = icmp sgt i32 %p, -1
+  %dec = add nsw i32 %p, -1
+  br i1 %cmp, label %for, label %for.end ; exits once %p is no longer greater than -1
+
+for.end:
+  %shr = ashr i32 %neg3, %p ; uses the exit value of %p from the loop above
+  %sub = sub nsw i32 0, %shr
+  %shr1 = ashr i32 %sub, %b
+  %neg = xor i32 %shr1, -1
+  store i32 %neg, i32* %sink
+  br i1 false, label %exit, label %preheader ; constant-false condition: this branch always returns to %preheader
+
+exit:
+  ret i32 0
+}
diff --git a/test/Transforms/IndVarSimplify/replace-sdiv-by-udiv.ll b/test/Transforms/IndVarSimplify/replace-sdiv-by-udiv.ll
new file mode 100644
index 0000000000000000000000000000000000000000..af25b20bec37ba5e67e34fd11af01160d4503333
--- /dev/null
+++ b/test/Transforms/IndVarSimplify/replace-sdiv-by-udiv.ll
@@ -0,0 +1,130 @@
+; RUN: opt < %s -indvars -S | FileCheck %s
+
+define void @test0(i32* %a) {
+; CHECK-LABEL: @test0(
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ] ; induction variable: starts at 0 and is incremented by 1 below
+  %div = sdiv i32 %i.01, 2 ; dividend is the induction variable, divisor is the constant 2; a udiv is expected in the output
+; CHECK-NOT: sdiv
+; CHECK:     udiv
+  %idxprom = sext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  store i32 %i.01, i32* %arrayidx, align 4
+  %inc = add nsw i32 %i.01, 1
+  %cmp = icmp slt i32 %inc, 64 ; loop continues while the incremented value is below 64
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+define void @test1(i32* %a) {
+; CHECK-LABEL: @test1(
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ] ; induction variable: starts at 0 and is incremented by 1 below
+  %div = sdiv exact i32 %i.01, 2 ; same as @test0 but with the exact flag, which is expected to be kept on the udiv
+; CHECK-NOT: sdiv
+; CHECK:     udiv exact
+  %idxprom = sext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  store i32 %i.01, i32* %arrayidx, align 4
+  %inc = add nsw i32 %i.01, 1
+  %cmp = icmp slt i32 %inc, 64 ; loop continues while the incremented value is below 64
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+define void @test2(i32* %a, i32 %d) {
+; CHECK-LABEL: @test2(
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ] ; induction variable: starts at 0 and is incremented by 1 below
+  %mul = mul nsw i32 %i.01, 64
+  %div = sdiv i32 %mul, %d ; divisor is the function argument %d; no udiv is expected in the output
+; CHECK-NOT: udiv
+  %idxprom = sext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  store i32 %i.01, i32* %arrayidx, align 4
+  %inc = add nsw i32 %i.01, 1
+  %cmp = icmp slt i32 %inc, 64 ; loop continues while the incremented value is below 64
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+define void @test3(i32* %a) {
+; CHECK-LABEL: @test3(
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ] ; induction variable: starts at 0 and is incremented by 1 below
+  %div = sdiv i32 2048, %i.01 ; constant dividend, induction variable as divisor; NOTE(review): the divisor is 0 on the first iteration
+; CHECK:     udiv
+; CHECK-NOT: sdiv
+  %idxprom = sext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  store i32 %i.01, i32* %arrayidx, align 4
+  %inc = add nsw i32 %i.01, 1
+  %cmp = icmp slt i32 %inc, 64 ; loop continues while the incremented value is below 64
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+define void @test4(i32* %a) {
+; CHECK-LABEL: @test4(
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ] ; induction variable: starts at 0 and is incremented by 1 below
+  %mul = mul nsw i32 %i.01, 64 ; dividend is the induction variable scaled by 64
+  %div = sdiv i32 %mul, 8 ; constant divisor 8; a udiv is expected in the output
+; CHECK:     udiv
+; CHECK-NOT: sdiv
+  %idxprom = sext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  store i32 %i.01, i32* %arrayidx, align 4
+  %inc = add nsw i32 %i.01, 1
+  %cmp = icmp slt i32 %inc, 64 ; loop continues while the incremented value is below 64
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+}
+
+define void @test5(i32* %a) {
+; CHECK-LABEL: @test5(
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %entry, %for.body
+  %i.01 = phi i32 [ 0, %entry ], [ %inc, %for.body ] ; induction variable: starts at 0 and is incremented by 1 below
+  %mul = mul nsw i32 %i.01, 64 ; dividend is the induction variable scaled by 64
+  %div = sdiv i32 %mul, 6 ; same as @test4 but with the constant divisor 6; a udiv is expected in the output
+; CHECK:     udiv
+; CHECK-NOT: sdiv
+  %idxprom = sext i32 %div to i64
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %idxprom
+  store i32 %i.01, i32* %arrayidx, align 4
+  %inc = add nsw i32 %i.01, 1
+  %cmp = icmp slt i32 %inc, 64 ; loop continues while the incremented value is below 64
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+}
+
diff --git a/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll b/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
index 67b4ccda1a1b06a31a63de02e41f055feca34d4e..b566c147e9b883f2f87d68ced337376a83d6a8fa 100644
--- a/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
+++ b/test/Transforms/InferAddressSpaces/AMDGPU/basic.ll
@@ -45,7 +45,7 @@ define float @load_private_from_flat(float addrspace(4)* %generic_scalar) #0 {
 ; CHECK-LABEL: @store_global_from_flat(
 ; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)*
 ; CHECK-NEXT: store float 0.000000e+00, float addrspace(1)* %tmp0
-define void @store_global_from_flat(float addrspace(4)* %generic_scalar) #0 {
+define amdgpu_kernel void @store_global_from_flat(float addrspace(4)* %generic_scalar) #0 {
   %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(1)*
   store float 0.0, float addrspace(1)* %tmp0
   ret void
@@ -54,7 +54,7 @@ define void @store_global_from_flat(float addrspace(4)* %generic_scalar) #0 {
 ; CHECK-LABEL: @store_group_from_flat(
 ; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)*
 ; CHECK-NEXT: store float 0.000000e+00, float addrspace(3)* %tmp0
-define void @store_group_from_flat(float addrspace(4)* %generic_scalar) #0 {
+define amdgpu_kernel void @store_group_from_flat(float addrspace(4)* %generic_scalar) #0 {
   %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float addrspace(3)*
   store float 0.0, float addrspace(3)* %tmp0
   ret void
@@ -63,7 +63,7 @@ define void @store_group_from_flat(float addrspace(4)* %generic_scalar) #0 {
 ; CHECK-LABEL: @store_private_from_flat(
 ; CHECK-NEXT: %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float*
 ; CHECK-NEXT: store float 0.000000e+00, float* %tmp0
-define void @store_private_from_flat(float addrspace(4)* %generic_scalar) #0 {
+define amdgpu_kernel void @store_private_from_flat(float addrspace(4)* %generic_scalar) #0 {
   %tmp0 = addrspacecast float addrspace(4)* %generic_scalar to float*
   store float 0.0, float* %tmp0
   ret void
@@ -74,7 +74,7 @@ define void @store_private_from_flat(float addrspace(4)* %generic_scalar) #0 {
 ; CHECK-NEXT: %val = load i32, i32 addrspace(1)* %input, align 4
 ; CHECK-NEXT: store i32 %val, i32 addrspace(1)* %output, align 4
 ; CHECK-NEXT: ret void
-define void @load_store_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
+define amdgpu_kernel void @load_store_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)*
   %val = load i32, i32 addrspace(4)* %tmp0, align 4
@@ -87,7 +87,7 @@ define void @load_store_global(i32 addrspace(1)* nocapture %input, i32 addrspace
 ; CHECK-NEXT: %val = load i32, i32 addrspace(3)* %input, align 4
 ; CHECK-NEXT: store i32 %val, i32 addrspace(3)* %output, align 4
 ; CHECK-NEXT: ret void
-define void @load_store_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
+define amdgpu_kernel void @load_store_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)*
   %val = load i32, i32 addrspace(4)* %tmp0, align 4
@@ -100,7 +100,7 @@ define void @load_store_group(i32 addrspace(3)* nocapture %input, i32 addrspace(
 ; CHECK-NEXT: %val = load i32, i32* %input, align 4
 ; CHECK-NEXT: store i32 %val, i32* %output, align 4
 ; CHECK-NEXT: ret void
-define void @load_store_private(i32* nocapture %input, i32* nocapture %output) #0 {
+define amdgpu_kernel void @load_store_private(i32* nocapture %input, i32* nocapture %output) #0 {
   %tmp0 = addrspacecast i32* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32* %output to i32 addrspace(4)*
   %val = load i32, i32 addrspace(4)* %tmp0, align 4
@@ -113,7 +113,7 @@ define void @load_store_private(i32* nocapture %input, i32* nocapture %output) #
 ; CHECK-NEXT: %val = load i32, i32 addrspace(4)* %input, align 4
 ; CHECK-NEXT: store i32 %val, i32 addrspace(4)* %output, align 4
 ; CHECK-NEXT: ret void
-define void @load_store_flat(i32 addrspace(4)* nocapture %input, i32 addrspace(4)* nocapture %output) #0 {
+define amdgpu_kernel void @load_store_flat(i32 addrspace(4)* nocapture %input, i32 addrspace(4)* nocapture %output) #0 {
   %val = load i32, i32 addrspace(4)* %input, align 4
   store i32 %val, i32 addrspace(4)* %output, align 4
   ret void
@@ -122,7 +122,7 @@ define void @load_store_flat(i32 addrspace(4)* nocapture %input, i32 addrspace(4
 ; CHECK-LABEL: @store_addrspacecast_ptr_value(
 ; CHECK: %cast = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
 ; CHECK-NEXT: store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %output, align 4
-define void @store_addrspacecast_ptr_value(i32 addrspace(1)* nocapture %input, i32 addrspace(4)* addrspace(1)* nocapture %output) #0 {
+define amdgpu_kernel void @store_addrspacecast_ptr_value(i32 addrspace(1)* nocapture %input, i32 addrspace(4)* addrspace(1)* nocapture %output) #0 {
   %cast = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
   store i32 addrspace(4)* %cast, i32 addrspace(4)* addrspace(1)* %output, align 4
   ret void
diff --git a/test/Transforms/InferAddressSpaces/AMDGPU/icmp.ll b/test/Transforms/InferAddressSpaces/AMDGPU/icmp.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b185ede26579734f53cf08a0f3da779fd8e60f71
--- /dev/null
+++ b/test/Transforms/InferAddressSpaces/AMDGPU/icmp.ll
@@ -0,0 +1,160 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces %s | FileCheck %s
+
+; CHECK-LABEL: @icmp_flat_cmp_self(
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, %group.ptr.0
+define i1 @icmp_flat_cmp_self(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)* ; addrspace(3) argument cast to addrspace(4)
+  %cmp = icmp eq i32 addrspace(4)* %cast0, %cast0 ; the same cast value is used for both operands
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_flat_flat_from_group(
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, %group.ptr.1
+define i1 @icmp_flat_flat_from_group(i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %cast0, %cast1 ; both operands are casts from addrspace(3); an addrspace(3) compare is expected
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_mismatch_flat_from_group_private(
+; CHECK: %1 = addrspacecast i32* %private.ptr.0 to i32 addrspace(4)*
+; CHECK: %2 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)*
+; CHECK: %cmp = icmp eq i32 addrspace(4)* %1, %2
+define i1 @icmp_mismatch_flat_from_group_private(i32* %private.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 {
+  %cast0 = addrspacecast i32* %private.ptr.0 to i32 addrspace(4)* ; source pointer has no addrspace qualifier
+  %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)* ; source pointer is addrspace(3)
+  %cmp = icmp eq i32 addrspace(4)* %cast0, %cast1 ; source spaces differ, so the compare is expected to stay in addrspace(4)
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_flat_group_flat(
+; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+; CHECK: %cmp = icmp eq i32 addrspace(4)* %1, %flat.ptr.1
+define i1 @icmp_flat_group_flat(i32 addrspace(3)* %group.ptr.0, i32 addrspace(4)* %flat.ptr.1) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %cast0, %flat.ptr.1 ; right operand is an uncast addrspace(4) argument; the compare is expected to stay in addrspace(4)
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_flat_flat_group(
+; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)*
+; CHECK: %cmp = icmp eq i32 addrspace(4)* %flat.ptr.0, %1
+define i1 @icmp_flat_flat_group(i32 addrspace(4)* %flat.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 {
+  %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %flat.ptr.0, %cast1 ; left operand is an uncast addrspace(4) argument; the compare is expected to stay in addrspace(4)
+  ret i1 %cmp
+}
+
+; Keeping as cmp addrspace(3)* is better
+; CHECK-LABEL: @icmp_flat_to_group_cmp(
+; CHECK: %cast0 = addrspacecast i32 addrspace(4)* %flat.ptr.0 to i32 addrspace(3)*
+; CHECK: %cast1 = addrspacecast i32 addrspace(4)* %flat.ptr.1 to i32 addrspace(3)*
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %cast0, %cast1
+define i1 @icmp_flat_to_group_cmp(i32 addrspace(4)* %flat.ptr.0, i32 addrspace(4)* %flat.ptr.1) #0 {
+  %cast0 = addrspacecast i32 addrspace(4)* %flat.ptr.0 to i32 addrspace(3)* ; cast direction is addrspace(4) to addrspace(3) here
+  %cast1 = addrspacecast i32 addrspace(4)* %flat.ptr.1 to i32 addrspace(3)*
+  %cmp = icmp eq i32 addrspace(3)* %cast0, %cast1 ; already an addrspace(3) compare; the expected output is unchanged
+  ret i1 %cmp
+}
+
+; FIXME: Should be able to ask the target how to constant fold the
+; constant cast, and whether this is OK to change if 0 is a valid pointer.
+
+; CHECK-LABEL: @icmp_group_flat_cmp_null(
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*)
+define i1 @icmp_group_flat_cmp_null(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %cast0, null ; right operand is the addrspace(4) null constant; it is expected to be wrapped in a constant addrspacecast
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_group_flat_cmp_constant_inttoptr(
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, addrspacecast (i32 addrspace(4)* inttoptr (i64 400 to i32 addrspace(4)*) to i32 addrspace(3)*)
+define i1 @icmp_group_flat_cmp_constant_inttoptr(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %cast0, inttoptr (i64 400 to i32 addrspace(4)*) ; right operand is a constant inttoptr expression in addrspace(4)
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_mismatch_flat_group_private_cmp_null(
+; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+; CHECK: %cmp = icmp eq i32 addrspace(4)* %1, addrspacecast (i32* null to i32 addrspace(4)*)
+define i1 @icmp_mismatch_flat_group_private_cmp_null(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %cast0, addrspacecast (i32* null to i32 addrspace(4)*) ; right operand is a constant cast of an unqualified i32* null; the compare is expected to stay in addrspace(4)
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_mismatch_flat_group_private_cmp_undef(
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, undef
+define i1 @icmp_mismatch_flat_group_private_cmp_undef(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %cast0, addrspacecast (i32* undef to i32 addrspace(4)*) ; right operand is a constant cast of undef; a plain undef in addrspace(3) is expected in the output
+  ret i1 %cmp
+}
+
+@lds0 = internal addrspace(3) global i32 0, align 4
+@global0 = internal addrspace(1) global i32 0, align 4
+
+; CHECK-LABEL: @icmp_mismatch_flat_group_global_cmp_gv(
+; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+; CHECK: %cmp = icmp eq i32 addrspace(4)* %1, addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)
+define i1 @icmp_mismatch_flat_group_global_cmp_gv(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %cast0, addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*) ; right operand is a constant cast of the addrspace(1) global @global0; the compare is expected to stay in addrspace(4)
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_mismatch_group_global_cmp_gv_gv(
+; CHECK: %cmp = icmp eq i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)
+define i1 @icmp_mismatch_group_global_cmp_gv_gv(i32 addrspace(3)* %group.ptr.0) #0 { ; the argument %group.ptr.0 is not used in the body
+  %cmp = icmp eq i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*) ; both operands are constant casts of globals from different address spaces
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_group_flat_cmp_undef(
+; CHECK: %cmp = icmp eq i32 addrspace(3)* %group.ptr.0, undef
+define i1 @icmp_group_flat_cmp_undef(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* %cast0, undef ; right operand is undef; an addrspace(3) compare against undef is expected
+  ret i1 %cmp
+}
+
+; Test non-canonical orders
+; CHECK-LABEL: @icmp_mismatch_flat_group_private_cmp_null_swap(
+; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+; CHECK: %cmp = icmp eq i32 addrspace(4)* addrspacecast (i32* null to i32 addrspace(4)*), %1
+define i1 @icmp_mismatch_flat_group_private_cmp_null_swap(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* addrspacecast (i32* null to i32 addrspace(4)*), %cast0 ; operand order swapped: the constant cast is on the left
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_group_flat_cmp_undef_swap(
+; CHECK: %cmp = icmp eq i32 addrspace(3)* undef, %group.ptr.0
+define i1 @icmp_group_flat_cmp_undef_swap(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* undef, %cast0 ; operand order swapped: undef is on the left
+  ret i1 %cmp
+}
+
+; CHECK-LABEL: @icmp_mismatch_flat_group_private_cmp_undef_swap(
+; CHECK: %cmp = icmp eq i32 addrspace(3)* undef, %group.ptr.0
+define i1 @icmp_mismatch_flat_group_private_cmp_undef_swap(i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cmp = icmp eq i32 addrspace(4)* addrspacecast (i32* undef to i32 addrspace(4)*), %cast0 ; operand order swapped: the constant cast of undef is on the left
+  ret i1 %cmp
+}
+
+; TODO: Should be handled
+; CHECK-LABEL: @icmp_flat_flat_from_group_vector(
+; CHECK: %cmp = icmp eq <2 x i32 addrspace(4)*> %cast0, %cast1
+define <2 x i1> @icmp_flat_flat_from_group_vector(<2 x i32 addrspace(3)*> %group.ptr.0, <2 x i32 addrspace(3)*> %group.ptr.1) #0 {
+  %cast0 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.0 to <2 x i32 addrspace(4)*> ; vector-of-pointers variant of the scalar casts used above
+  %cast1 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.1 to <2 x i32 addrspace(4)*>
+  %cmp = icmp eq <2 x i32 addrspace(4)*> %cast0, %cast1 ; the expected output currently keeps this addrspace(4) vector compare
+  ret <2 x i1> %cmp
+}
+
+attributes #0 = { nounwind }
diff --git a/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll b/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll
index 5ed3dc0b95ea93f0abe4b75f6a4626bcd56de313..52067cd37bb9d6f1fd8275aacfdae0f6e33bdb7d 100644
--- a/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll
+++ b/test/Transforms/InferAddressSpaces/AMDGPU/infer-address-space.ll
@@ -28,7 +28,7 @@
 ; CHECK: store float %v, float addrspace(3)* %tmp7, align 4
 ; CHECK: call void @llvm.amdgcn.s.barrier()
 ; CHECK: ret void
-define void @load_store_lds_f32(i32 %i, float %v) #0 {
+define amdgpu_kernel void @load_store_lds_f32(i32 %i, float %v) #0 {
 bb:
   %tmp = load float, float addrspace(4)* addrspacecast (float addrspace(3)* @scalar to float addrspace(4)*), align 4
   call void @use(float %tmp)
@@ -83,7 +83,7 @@ bb:
 
 ; CHECK-LABEL: @nested_const_expr(
 ; CHECK: store i32 1, i32 addrspace(3)* bitcast (float addrspace(3)* getelementptr inbounds ([10 x float], [10 x float] addrspace(3)* @array, i64 0, i64 1) to i32 addrspace(3)*), align 4
-define void @nested_const_expr() #0 {
+define amdgpu_kernel void @nested_const_expr() #0 {
   store i32 1, i32 addrspace(4)* bitcast (float addrspace(4)* getelementptr ([10 x float], [10 x float] addrspace(4)* addrspacecast ([10 x float] addrspace(3)* @array to [10 x float] addrspace(4)*), i64 0, i64 1) to i32 addrspace(4)*), align 4
   ret void
 }
@@ -93,7 +93,7 @@ define void @nested_const_expr() #0 {
 ; CHECK-NEXT: %v = load float, float addrspace(1)* %addr
 ; CHECK-NEXT: store float %v, float addrspace(1)* %addr
 ; CHECK-NEXT: ret void
-define void @rauw(float addrspace(1)* %input) #0 {
+define amdgpu_kernel void @rauw(float addrspace(1)* %input) #0 {
 bb:
   %generic_input = addrspacecast float addrspace(1)* %input to float addrspace(4)*
   %addr = getelementptr float, float addrspace(4)* %generic_input, i64 10
@@ -106,8 +106,7 @@ bb:
 ; CHECK-LABEL: @loop(
 
 ; CHECK: %p = bitcast [10 x float] addrspace(3)* @array to float addrspace(3)*
-; CHECK: %0 = addrspacecast float addrspace(3)* %p to float addrspace(4)*
-; CHECK: %end = getelementptr float, float addrspace(4)* %0, i64 10
+; CHECK: %end = getelementptr float, float addrspace(3)* %p, i64 10
 ; CHECK: br label %loop
 
 ; CHECK: loop:                                             ; preds = %loop, %entry
@@ -115,10 +114,10 @@ bb:
 ; CHECK: %v = load float, float addrspace(3)* %i
 ; CHECK: call void @use(float %v)
 ; CHECK: %i2 = getelementptr float, float addrspace(3)* %i, i64 1
-; CHECK: %1 = addrspacecast float addrspace(3)* %i2 to float addrspace(4)*
-; CHECK: %exit_cond = icmp eq float addrspace(4)* %1, %end
+; CHECK: %exit_cond = icmp eq float addrspace(3)* %i2, %end
+
 ; CHECK: br i1 %exit_cond, label %exit, label %loop
-define void @loop() #0 {
+define amdgpu_kernel void @loop() #0 {
 entry:
   %p = addrspacecast [10 x float] addrspace(3)* @array to float addrspace(4)*
   %end = getelementptr float, float addrspace(4)* %p, i64 10
@@ -151,7 +150,7 @@ exit:                                             ; preds = %loop
 ; CHECK: %0 = addrspacecast float addrspace(3)* %i2 to float addrspace(4)*
 ; CHECK: %exit_cond = icmp eq float addrspace(4)* %0, %end
 ; CHECK: br i1 %exit_cond, label %exit, label %loop
-define void @loop_with_generic_bound() #0 {
+define amdgpu_kernel void @loop_with_generic_bound() #0 {
 entry:
   %p = addrspacecast [10 x float] addrspace(3)* @array to float addrspace(4)*
   %end = load float addrspace(4)*, float addrspace(4)* addrspace(1)* @generic_end
diff --git a/test/Transforms/InferAddressSpaces/AMDGPU/intrinsics.ll b/test/Transforms/InferAddressSpaces/AMDGPU/intrinsics.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ca6138d3fb01ea2db91e75abc93a2a7ce837daf0
--- /dev/null
+++ b/test/Transforms/InferAddressSpaces/AMDGPU/intrinsics.ll
@@ -0,0 +1,146 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces %s | FileCheck %s
+
+; CHECK-LABEL: @objectsize_group_to_flat_i32(
+; CHECK: %val = call i32 @llvm.objectsize.i32.p3i8(i8 addrspace(3)* %group.ptr, i1 true, i1 false)
+define i32 @objectsize_group_to_flat_i32(i8 addrspace(3)* %group.ptr) #0 {
+  %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
+  %val = call i32 @llvm.objectsize.i32.p4i8(i8 addrspace(4)* %cast, i1 true, i1 false)
+  ret i32 %val
+}
+
+; CHECK-LABEL: @objectsize_global_to_flat_i64(
+; CHECK: %val = call i64 @llvm.objectsize.i64.p1i8(i8 addrspace(1)* %global.ptr, i1 true, i1 false)
+define i64 @objectsize_global_to_flat_i64(i8 addrspace(1)* %global.ptr) #0 {
+  %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
+  %val = call i64 @llvm.objectsize.i64.p4i8(i8 addrspace(4)* %cast, i1 true, i1 false)
+  ret i64 %val
+}
+
+; CHECK-LABEL: @atomicinc_global_to_flat_i32(
+; CHECK: call i32 @llvm.amdgcn.atomic.inc.i32.p1i32(i32 addrspace(1)* %global.ptr, i32 %y, i32 0, i32 0, i1 false)
+define i32 @atomicinc_global_to_flat_i32(i32 addrspace(1)* %global.ptr, i32 %y) #0 {
+  %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)*
+  %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %cast, i32 %y, i32 0, i32 0, i1 false)
+  ret i32 %ret
+}
+
+; CHECK-LABEL: @atomicinc_group_to_flat_i32(
+; CHECK: %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p3i32(i32 addrspace(3)* %group.ptr, i32 %y, i32 0, i32 0, i1 false)
+define i32 @atomicinc_group_to_flat_i32(i32 addrspace(3)* %group.ptr, i32 %y) #0 {
+  %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)*
+  %ret = call i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* %cast, i32 %y, i32 0, i32 0, i1 false)
+  ret i32 %ret
+}
+
+; CHECK-LABEL: @atomicinc_global_to_flat_i64(
+; CHECK: call i64 @llvm.amdgcn.atomic.inc.i64.p1i64(i64 addrspace(1)* %global.ptr, i64 %y, i32 0, i32 0, i1 false)
+define i64 @atomicinc_global_to_flat_i64(i64 addrspace(1)* %global.ptr, i64 %y) #0 {
+  %cast = addrspacecast i64 addrspace(1)* %global.ptr to i64 addrspace(4)*
+  %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 false)
+  ret i64 %ret
+}
+
+; CHECK-LABEL: @atomicinc_group_to_flat_i64(
+; CHECK: call i64 @llvm.amdgcn.atomic.inc.i64.p3i64(i64 addrspace(3)* %group.ptr, i64 %y, i32 0, i32 0, i1 false)
+define i64 @atomicinc_group_to_flat_i64(i64 addrspace(3)* %group.ptr, i64 %y) #0 {
+  %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)*
+  %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 false)
+  ret i64 %ret
+}
+
+; CHECK-LABEL: @atomicdec_global_to_flat_i32(
+; CHECK: call i32 @llvm.amdgcn.atomic.dec.i32.p1i32(i32 addrspace(1)* %global.ptr, i32 %val, i32 0, i32 0, i1 false)
+define i32 @atomicdec_global_to_flat_i32(i32 addrspace(1)* %global.ptr, i32 %val) #0 {
+  %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)*
+  %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %cast, i32 %val, i32 0, i32 0, i1 false)
+  ret i32 %ret
+}
+
+; CHECK-LABEL: @atomicdec_group_to_flat_i32(
+; CHECK: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p3i32(i32 addrspace(3)* %group.ptr, i32 %val, i32 0, i32 0, i1 false)
+define i32 @atomicdec_group_to_flat_i32(i32 addrspace(3)* %group.ptr, i32 %val) #0 {
+  %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)*
+  %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %cast, i32 %val, i32 0, i32 0, i1 false)
+  ret i32 %ret
+}
+
+; CHECK-LABEL: @atomicdec_global_to_flat_i64(
+; CHECK: call i64 @llvm.amdgcn.atomic.dec.i64.p1i64(i64 addrspace(1)* %global.ptr, i64 %y, i32 0, i32 0, i1 false)
+define i64 @atomicdec_global_to_flat_i64(i64 addrspace(1)* %global.ptr, i64 %y) #0 {
+  %cast = addrspacecast i64 addrspace(1)* %global.ptr to i64 addrspace(4)*
+  %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 false)
+  ret i64 %ret
+}
+
+; CHECK-LABEL: @atomicdec_group_to_flat_i64(
+; CHECK: call i64 @llvm.amdgcn.atomic.dec.i64.p3i64(i64 addrspace(3)* %group.ptr, i64 %y, i32 0, i32 0, i1 false)
+define i64 @atomicdec_group_to_flat_i64(i64 addrspace(3)* %group.ptr, i64 %y) #0 {
+  %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)*
+  %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 false)
+  ret i64 %ret
+}
+
+; CHECK-LABEL: @volatile_atomicinc_group_to_flat_i64(
+; CHECK-NEXT: %1 = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)*
+; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %1, i64 %y, i32 0, i32 0, i1 true)
+define i64 @volatile_atomicinc_group_to_flat_i64(i64 addrspace(3)* %group.ptr, i64 %y) #0 {
+  %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)*
+  %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 true)
+  ret i64 %ret
+}
+
+; CHECK-LABEL: @volatile_atomicdec_global_to_flat_i32(
+; CHECK-NEXT: %1 = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)*
+; CHECK-NEXT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %1, i32 %val, i32 0, i32 0, i1 true)
+define i32 @volatile_atomicdec_global_to_flat_i32(i32 addrspace(1)* %global.ptr, i32 %val) #0 {
+  %cast = addrspacecast i32 addrspace(1)* %global.ptr to i32 addrspace(4)*
+  %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %cast, i32 %val, i32 0, i32 0, i1 true)
+  ret i32 %ret
+}
+
+; CHECK-LABEL: @volatile_atomicdec_group_to_flat_i32(
+; CHECK-NEXT: %1 = addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)*
+; CHECK-NEXT: %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %1, i32 %val, i32 0, i32 0, i1 true)
+define i32 @volatile_atomicdec_group_to_flat_i32(i32 addrspace(3)* %group.ptr, i32 %val) #0 {
+  %cast = addrspacecast i32 addrspace(3)* %group.ptr to i32 addrspace(4)*
+  %ret = call i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* %cast, i32 %val, i32 0, i32 0, i1 true)
+  ret i32 %ret
+}
+
+; CHECK-LABEL: @volatile_atomicdec_global_to_flat_i64(
+; CHECK-NEXT: %1 = addrspacecast i64 addrspace(1)* %global.ptr to i64 addrspace(4)*
+; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %1, i64 %y, i32 0, i32 0, i1 true)
+define i64 @volatile_atomicdec_global_to_flat_i64(i64 addrspace(1)* %global.ptr, i64 %y) #0 {
+  %cast = addrspacecast i64 addrspace(1)* %global.ptr to i64 addrspace(4)*
+  %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 true)
+  ret i64 %ret
+}
+
+; CHECK-LABEL: @volatile_atomicdec_group_to_flat_i64(
+; CHECK-NEXT: %1 = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)*
+; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %1, i64 %y, i32 0, i32 0, i1 true)
+define i64 @volatile_atomicdec_group_to_flat_i64(i64 addrspace(3)* %group.ptr, i64 %y) #0 {
+  %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)*
+  %ret = call i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 true)
+  ret i64 %ret
+}
+
+; CHECK-LABEL: @invalid_variable_volatile_atomicinc_group_to_flat_i64(
+; CHECK-NEXT: %1 = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)*
+; CHECK-NEXT: %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %1, i64 %y, i32 0, i32 0, i1 %volatile.var)
+define i64 @invalid_variable_volatile_atomicinc_group_to_flat_i64(i64 addrspace(3)* %group.ptr, i64 %y, i1 %volatile.var) #0 {
+  %cast = addrspacecast i64 addrspace(3)* %group.ptr to i64 addrspace(4)*
+  %ret = call i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* %cast, i64 %y, i32 0, i32 0, i1 %volatile.var)
+  ret i64 %ret
+}
+
+declare i32 @llvm.objectsize.i32.p4i8(i8 addrspace(4)*, i1, i1) #1
+declare i64 @llvm.objectsize.i64.p4i8(i8 addrspace(4)*, i1, i1) #1
+declare i32 @llvm.amdgcn.atomic.inc.i32.p4i32(i32 addrspace(4)* nocapture, i32, i32, i32, i1) #2
+declare i64 @llvm.amdgcn.atomic.inc.i64.p4i64(i64 addrspace(4)* nocapture, i64, i32, i32, i1) #2
+declare i32 @llvm.amdgcn.atomic.dec.i32.p4i32(i32 addrspace(4)* nocapture, i32, i32, i32, i1) #2
+declare i64 @llvm.amdgcn.atomic.dec.i64.p4i64(i64 addrspace(4)* nocapture, i64, i32, i32, i1) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind argmemonly }
diff --git a/test/Transforms/InferAddressSpaces/lit.local.cfg b/test/Transforms/InferAddressSpaces/AMDGPU/lit.local.cfg
similarity index 100%
rename from test/Transforms/InferAddressSpaces/lit.local.cfg
rename to test/Transforms/InferAddressSpaces/AMDGPU/lit.local.cfg
diff --git a/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll b/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll
new file mode 100644
index 0000000000000000000000000000000000000000..557a80f1a5d1a79b274cc6111a8c631a265a2f5d
--- /dev/null
+++ b/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll
@@ -0,0 +1,134 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces %s | FileCheck %s
+
+; CHECK-LABEL: @memset_group_to_flat(
+; CHECK: call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define amdgpu_kernel void @memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 {
+  %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memset_global_to_flat(
+; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %global.ptr, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define amdgpu_kernel void @memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 {
+  %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memset_group_to_flat_no_md(
+; CHECK: call void @llvm.memset.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 4, i64 %size, i32 4, i1 false){{$}}
+define amdgpu_kernel void @memset_group_to_flat_no_md(i8 addrspace(3)* %group.ptr, i64 %size) #0 {
+  %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 %size, i32 4, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @memset_global_to_flat_no_md(
+; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* %global.ptr, i8 4, i64 %size, i32 4, i1 false){{$}}
+define amdgpu_kernel void @memset_global_to_flat_no_md(i8 addrspace(1)* %global.ptr, i64 %size) #0 {
+  %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 %size, i32 4, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group(
+; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_with_group(
+; CHECK: call void @llvm.memcpy.p3i8.p4i8.i64(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(4)* %src.ptr, i64 %size) #0 {
+  %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %src.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_src_with_group(
+; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_src_with_group(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_group_src_global(
+; CHECK: call void @llvm.memcpy.p3i8.p1i8.i64(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_group_src_global(i8 addrspace(3)* %dest.group.ptr, i8 addrspace(1)* %src.global.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(1)* %src.global.ptr to i8 addrspace(4)*
+  %cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast.dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_group_to_flat_replace_dest_global(
+; CHECK: call void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define amdgpu_kernel void @memcpy_group_to_flat_replace_dest_global(i8 addrspace(1)* %dest.global.ptr, i8 addrspace(3)* %src.group.ptr, i32 %size) #0 {
+  %cast.dest = addrspacecast i8 addrspace(1)* %dest.global.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p3i8.i32(i8 addrspace(4)* %cast.dest, i8 addrspace(3)* %src.group.ptr, i32 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(
+; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa.struct !7
+define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_tbaa_struct(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa.struct !7
+  ret void
+}
+
+; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group_no_md(
+; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}}
+define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(
+; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest0, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}}
+; CHECK: call void @llvm.memcpy.p4i8.p3i8.i64(i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false){{$}}
+define amdgpu_kernel void @multiple_memcpy_flat_to_flat_replace_src_with_group_no_md(i8 addrspace(4)* %dest0, i8 addrspace(4)* %dest1, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest0, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false)
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %dest1, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false)
+  ret void
+}
+
+; Check for iterator problems if the pointer has 2 uses in the same call
+; CHECK-LABEL: @memcpy_group_flat_to_flat_self(
+; CHECK: call void @llvm.memcpy.p3i8.p3i8.i64(i8 addrspace(3)* %group.ptr, i8 addrspace(3)* %group.ptr, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define amdgpu_kernel void @memcpy_group_flat_to_flat_self(i8 addrspace(3)* %group.ptr) #0 {
+  %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
+  call void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* %cast, i8 addrspace(4)* %cast, i64 32, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+; CHECK-LABEL: @memmove_flat_to_flat_replace_src_with_group(
+; CHECK: call void @llvm.memmove.p4i8.p3i8.i64(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(i8 addrspace(4)* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
+  %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8 addrspace(4)*
+  call void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* %dest, i8 addrspace(4)* %cast.src, i64 %size, i32 4, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
+  ret void
+}
+
+declare void @llvm.memset.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8, i64, i32, i1) #1
+declare void @llvm.memcpy.p4i8.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8 addrspace(4)* nocapture readonly, i64, i32, i1) #1
+declare void @llvm.memcpy.p4i8.p3i8.i32(i8 addrspace(4)* nocapture writeonly, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1
+declare void @llvm.memmove.p4i8.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8 addrspace(4)* nocapture readonly, i64, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
+
+!0 = !{!1, !1, i64 0}
+!1 = !{!"A", !2}
+!2 = !{!"tbaa root"}
+!3 = !{!"B", !2}
+!4 = !{!5}
+!5 = distinct !{!5, !6, !"some scope"}
+!6 = distinct !{!6, !"some domain"}
+!7 = !{i64 0, i64 8, null}
\ No newline at end of file
diff --git a/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll b/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll
index 17997052f07763b4101f93c3012ce087aa8613cc..3231b6ccf1ccae9b39ceaff3d93d3ad39f489a9a 100644
--- a/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll
+++ b/test/Transforms/InferAddressSpaces/AMDGPU/old-pass-regressions.ll
@@ -9,7 +9,7 @@
 
 ; CHECK-LABEL: @generic_address_bitcast_const(
 ; CHECK: %vecload1 = load <2 x double>, <2 x double> addrspace(1)* bitcast (double addrspace(1)* getelementptr inbounds ([100 x double], [100 x double] addrspace(1)* @data, i64 0, i64 4) to <2 x double> addrspace(1)*), align 8
-define void @generic_address_bitcast_const(i64 %arg0, i32 addrspace(1)* nocapture %results) #0 {
+define amdgpu_kernel void @generic_address_bitcast_const(i64 %arg0, i32 addrspace(1)* nocapture %results) #0 {
 entry:
   %tmp1 = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = zext i32 %tmp1 to i64
@@ -39,7 +39,7 @@ declare i32 @_Z9get_fencePU3AS4v(i8 addrspace(4)*)
 ; CHECK: %tmp1 = bitcast %opencl.pipe_t addrspace(3)* %in_pipe to i32 addrspace(3)*
 ; CHECK: %add.ptr = getelementptr inbounds i32, i32 addrspace(3)* %tmp1, i32 2
 ; CHECK: %tmp2 = load i32, i32 addrspace(3)* %add.ptr, align 4
-define void @generic_address_pipe_bug9673(%opencl.pipe_t addrspace(3)* nocapture %in_pipe, i32 addrspace(1)* nocapture %dst) #0 {
+define amdgpu_kernel void @generic_address_pipe_bug9673(%opencl.pipe_t addrspace(3)* nocapture %in_pipe, i32 addrspace(1)* nocapture %dst) #0 {
 entry:
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp1 = bitcast %opencl.pipe_t addrspace(3)* %in_pipe to i32 addrspace(3)*
@@ -55,7 +55,7 @@ entry:
 ; CHECK: br i1
 ; CHECK: load float, float addrspace(4)*
 ; CHECK: br label
-define void @generic_address_bug9749(i32 addrspace(1)* nocapture %results) #0 {
+define amdgpu_kernel void @generic_address_bug9749(i32 addrspace(1)* nocapture %results) #0 {
 entry:
   %ptr = alloca float addrspace(4)*, align 8
   %tmp = call i32 @llvm.amdgcn.workitem.id.x()
@@ -85,7 +85,7 @@ helperFunction.exit:                              ; preds = %if.end.i, %entry
 ; CHECK-LABEL: @generic_address_opt_phi_bug9776_simple_phi_kernel(
 ; CHECK: phi i32 addrspace(3)*
 ; CHECK: store i32 %i.03, i32 addrspace(3)* %
-define void @generic_address_opt_phi_bug9776_simple_phi_kernel(i32 addrspace(3)* nocapture %in, i32 %numElems) #0 {
+define amdgpu_kernel void @generic_address_opt_phi_bug9776_simple_phi_kernel(i32 addrspace(3)* nocapture %in, i32 %numElems) #0 {
 entry:
   %cmp1 = icmp eq i32 %numElems, 0
   br i1 %cmp1, label %for.end, label %for.body.lr.ph
@@ -110,7 +110,7 @@ for.end:                                          ; preds = %for.body, %entry
 ; CHECK-LABEL: @generic_address_bug9899(
 ; CHECK: %vecload = load <2 x i32>, <2 x i32> addrspace(3)*
 ; CHECK: store <2 x i32> %tmp16, <2 x i32> addrspace(3)*
-define void @generic_address_bug9899(i64 %arg0, i32 addrspace(3)* nocapture %sourceA, i32 addrspace(3)* nocapture %destValues) #0 {
+define amdgpu_kernel void @generic_address_bug9899(i64 %arg0, i32 addrspace(3)* nocapture %sourceA, i32 addrspace(3)* nocapture %destValues) #0 {
 entry:
   %tmp1 = call i32 @llvm.amdgcn.workitem.id.x()
   %tmp2 = zext i32 %tmp1 to i64
diff --git a/test/Transforms/InferAddressSpaces/AMDGPU/select.ll b/test/Transforms/InferAddressSpaces/AMDGPU/select.ll
new file mode 100644
index 0000000000000000000000000000000000000000..08edc20ecf9b75ec7dfe3e2447e3e6f9d10b302f
--- /dev/null
+++ b/test/Transforms/InferAddressSpaces/AMDGPU/select.ll
@@ -0,0 +1,264 @@
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -infer-address-spaces %s | FileCheck %s
+
+; Instcombine pulls the addrspacecast out of the select; make sure
+; this pass doesn't do something insane on non-canonical IR.
+
+; CHECK-LABEL: @return_select_group_flat(
+; CHECK-NEXT: %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+; CHECK-NEXT: %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)*
+; CHECK-NEXT: %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1
+; CHECK-NEXT: ret i32 addrspace(4)* %select
+define i32 addrspace(4)* @return_select_group_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)*
+  %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1
+  ret i32 addrspace(4)* %select
+}
+
+; CHECK-LABEL: @store_select_group_flat(
+; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1
+; CHECK: store i32 -1, i32 addrspace(3)* %select
+define amdgpu_kernel void @store_select_group_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)*
+  %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1
+  store i32 -1, i32 addrspace(4)* %select
+  ret void
+}
+
+; Make sure metadata is preserved
+; CHECK-LABEL: @load_select_group_flat_md(
+; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1, !prof !0
+; CHECK: %load = load i32, i32 addrspace(3)* %select
+define i32 @load_select_group_flat_md(i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* %group.ptr.1) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cast1 = addrspacecast i32 addrspace(3)* %group.ptr.1 to i32 addrspace(4)*
+  %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1, !prof !0
+  %load = load i32, i32 addrspace(4)* %select
+  ret i32 %load
+}
+
+; CHECK-LABEL: @store_select_mismatch_group_private_flat(
+; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+; CHECK: %2 = addrspacecast i32* %private.ptr.1 to i32 addrspace(4)*
+; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* %2
+; CHECK: store i32 -1, i32 addrspace(4)* %select
+define amdgpu_kernel void @store_select_mismatch_group_private_flat(i1 %c, i32 addrspace(3)* %group.ptr.0, i32* %private.ptr.1) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %cast1 = addrspacecast i32* %private.ptr.1 to i32 addrspace(4)*
+  %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* %cast1
+  store i32 -1, i32 addrspace(4)* %select
+  ret void
+}
+
+@lds0 = internal addrspace(3) global i32 123, align 4
+@lds1 = internal addrspace(3) global i32 456, align 4
+
+; CHECK-LABEL: @constexpr_select_group_flat(
+; CHECK: %tmp = load i32, i32 addrspace(3)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(3)* @lds0, i32 addrspace(3)* @lds1)
+define i32 @constexpr_select_group_flat() #0 {
+bb:
+  %tmp = load i32, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds1 to i32 addrspace(4)*))
+  ret i32 %tmp
+}
+
+; CHECK-LABEL: @constexpr_select_group_global_flat_mismatch(
+; CHECK: %tmp = load i32, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*))
+define i32 @constexpr_select_group_global_flat_mismatch() #0 {
+bb:
+  %tmp = load i32, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*))
+  ret i32 %tmp
+}
+
+; CHECK-LABEL: @store_select_group_flat_null(
+; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*)
+; CHECK: store i32 -1, i32 addrspace(3)* %select
+define amdgpu_kernel void @store_select_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* null
+  store i32 -1, i32 addrspace(4)* %select
+  ret void
+}
+
+; CHECK-LABEL: @store_select_group_flat_null_swap(
+; CHECK: %select = select i1 %c, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*), i32 addrspace(3)* %group.ptr.0
+; CHECK: store i32 -1, i32 addrspace(3)* %select
+define amdgpu_kernel void @store_select_group_flat_null_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %select = select i1 %c, i32 addrspace(4)* null, i32 addrspace(4)* %cast0
+  store i32 -1, i32 addrspace(4)* %select
+  ret void
+}
+
+; CHECK-LABEL: @store_select_group_flat_undef(
+; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* undef
+; CHECK: store i32 -1, i32 addrspace(3)* %select
+define amdgpu_kernel void @store_select_group_flat_undef(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* undef
+  store i32 -1, i32 addrspace(4)* %select
+  ret void
+}
+
+; CHECK-LABEL: @store_select_group_flat_undef_swap(
+; CHECK: %select = select i1 %c, i32 addrspace(3)* undef, i32 addrspace(3)* %group.ptr.0
+; CHECK: store i32 -1, i32 addrspace(3)* %select
+define amdgpu_kernel void @store_select_group_flat_undef_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %select = select i1 %c, i32 addrspace(4)* undef, i32 addrspace(4)* %cast0
+  store i32 -1, i32 addrspace(4)* %select
+  ret void
+}
+
+; CHECK-LABEL: @store_select_gep_group_flat_null(
+; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* null to i32 addrspace(3)*)
+; CHECK: %gep = getelementptr i32, i32 addrspace(3)* %select, i64 16
+; CHECK: store i32 -1, i32 addrspace(3)* %gep
+define amdgpu_kernel void @store_select_gep_group_flat_null(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* null
+  %gep = getelementptr i32, i32 addrspace(4)* %select, i64 16
+  store i32 -1, i32 addrspace(4)* %gep
+  ret void
+}
+
+@global0 = internal addrspace(1) global i32 123, align 4
+
+; CHECK-LABEL: @store_select_group_flat_constexpr(
+; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* @lds1
+; CHECK: store i32 7, i32 addrspace(3)* %select
+define amdgpu_kernel void @store_select_group_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds1 to i32 addrspace(4)*)
+  store i32 7, i32 addrspace(4)* %select
+  ret void
+}
+
+; CHECK-LABEL: @store_select_group_flat_inttoptr_flat(
+; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* addrspacecast (i32 addrspace(4)* inttoptr (i64 12345 to i32 addrspace(4)*) to i32 addrspace(3)*)
+; CHECK: store i32 7, i32 addrspace(3)* %select
+define amdgpu_kernel void @store_select_group_flat_inttoptr_flat(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* inttoptr (i64 12345 to i32 addrspace(4)*)
+  store i32 7, i32 addrspace(4)* %select
+  ret void
+}
+
+; CHECK-LABEL: @store_select_group_flat_inttoptr_group(
+; CHECK: %select = select i1 %c, i32 addrspace(3)* %group.ptr.0, i32 addrspace(3)* inttoptr (i32 400 to i32 addrspace(3)*)
+; CHECK-NEXT: store i32 7, i32 addrspace(3)* %select
+define amdgpu_kernel void @store_select_group_flat_inttoptr_group(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i32 400 to i32 addrspace(3)*) to i32 addrspace(4)*)
+  store i32 7, i32 addrspace(4)* %select
+  ret void
+}
+
+; CHECK-LABEL: @store_select_group_global_mismatch_flat_constexpr(
+; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)
+; CHECK: store i32 7, i32 addrspace(4)* %select
+define amdgpu_kernel void @store_select_group_global_mismatch_flat_constexpr(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)
+  store i32 7, i32 addrspace(4)* %select
+  ret void
+}
+
+; CHECK-LABEL: @store_select_group_global_mismatch_flat_constexpr_swap(
+; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+; CHECK: %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*), i32 addrspace(4)* %1
+; CHECK: store i32 7, i32 addrspace(4)* %select
+define amdgpu_kernel void @store_select_group_global_mismatch_flat_constexpr_swap(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*), i32 addrspace(4)* %cast0
+  store i32 7, i32 addrspace(4)* %select
+  ret void
+}
+
+; CHECK-LABEL: @store_select_group_global_mismatch_null_null(
+; CHECK: %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)
+; CHECK: store i32 7, i32 addrspace(4)* %select
+define amdgpu_kernel void @store_select_group_global_mismatch_null_null(i1 %c) #0 {
+  %select = select i1 %c, i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)
+  store i32 7, i32 addrspace(4)* %select
+  ret void
+}
+
+; CHECK-LABEL: @store_select_group_global_mismatch_null_null_constexpr(
+; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
+define amdgpu_kernel void @store_select_group_global_mismatch_null_null_constexpr() #0 {
+  store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
+  ret void
+}
+
+; CHECK-LABEL: @store_select_group_global_mismatch_gv_null_constexpr(
+; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
+define amdgpu_kernel void @store_select_group_global_mismatch_gv_null_constexpr() #0 {
+  store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* @lds0 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
+  ret void
+}
+
+; CHECK-LABEL: @store_select_group_global_mismatch_null_gv_constexpr(
+; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)), align 4
+define amdgpu_kernel void @store_select_group_global_mismatch_null_gv_constexpr() #0 {
+  store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* @global0 to i32 addrspace(4)*)), align 4
+  ret void
+}
+
+; CHECK-LABEL: @store_select_group_global_mismatch_inttoptr_null_constexpr(
+; CHECK: store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i64 123 to i32 addrspace(3)*) to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
+define amdgpu_kernel void @store_select_group_global_mismatch_inttoptr_null_constexpr() #0 {
+  store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* inttoptr (i64 123 to i32 addrspace(3)*) to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
+  ret void
+}
+
+; CHECK-LABEL: @store_select_group_global_mismatch_inttoptr_flat_null_constexpr(
+; CHECK: store i32 7, i32 addrspace(1)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(1)* addrspacecast (i32 addrspace(4)* inttoptr (i64 123 to i32 addrspace(4)*) to i32 addrspace(1)*), i32 addrspace(1)* null), align 4
+define amdgpu_kernel void @store_select_group_global_mismatch_inttoptr_flat_null_constexpr() #0 {
+  store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* inttoptr (i64 123 to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* null to i32 addrspace(4)*)), align 4
+  ret void
+}
+
+; CHECK-LABEL: @store_select_group_global_mismatch_undef_undef_constexpr(
+; CHECK: store i32 7, i32 addrspace(3)* null
+define amdgpu_kernel void @store_select_group_global_mismatch_undef_undef_constexpr() #0 {
+  store i32 7, i32 addrspace(4)* select (i1 icmp eq (i32 ptrtoint (i32 addrspace(3)* @lds1 to i32), i32 4), i32 addrspace(4)* addrspacecast (i32 addrspace(3)* null to i32 addrspace(4)*), i32 addrspace(4)* addrspacecast (i32 addrspace(1)* undef to i32 addrspace(4)*)), align 4
+  ret void
+}
+
+@lds2 = external addrspace(3) global [1024 x i32], align 4
+
+; CHECK-LABEL: @store_select_group_constexpr_ptrtoint(
+; CHECK: %1 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+; CHECK: %select = select i1 %c, i32 addrspace(4)* %1, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* inttoptr (i32 add (i32 ptrtoint ([1024 x i32] addrspace(3)* @lds2 to i32), i32 124) to i32 addrspace(1)*) to i32 addrspace(4)*)
+; CHECK: store i32 7, i32 addrspace(4)* %select
+define amdgpu_kernel void @store_select_group_constexpr_ptrtoint(i1 %c, i32 addrspace(3)* %group.ptr.0) #0 {
+  %cast0 = addrspacecast i32 addrspace(3)* %group.ptr.0 to i32 addrspace(4)*
+  %select = select i1 %c, i32 addrspace(4)* %cast0, i32 addrspace(4)* addrspacecast (i32 addrspace(1)* inttoptr (i32 add (i32 ptrtoint ([1024 x i32] addrspace(3)* @lds2 to i32), i32 124) to i32 addrspace(1)*) to i32 addrspace(4)*)
+  store i32 7, i32 addrspace(4)* %select
+  ret void
+}
+
+; CHECK-LABEL: @store_select_group_flat_vector(
+; CHECK: %cast0 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.0 to <2 x i32 addrspace(4)*>
+; CHECK: %cast1 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.1 to <2 x i32 addrspace(4)*>
+; CHECK: %select = select i1 %c, <2 x i32 addrspace(4)*> %cast0, <2 x i32 addrspace(4)*> %cast1
+; CHECK: %extract0 = extractelement <2 x i32 addrspace(4)*> %select, i32 0
+; CHECK: %extract1 = extractelement <2 x i32 addrspace(4)*> %select, i32 1
+; CHECK: store i32 -1, i32 addrspace(4)* %extract0
+; CHECK: store i32 -2, i32 addrspace(4)* %extract1
+define amdgpu_kernel void @store_select_group_flat_vector(i1 %c, <2 x i32 addrspace(3)*> %group.ptr.0, <2 x i32 addrspace(3)*> %group.ptr.1) #0 {
+  %cast0 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.0 to <2 x i32 addrspace(4)*>
+  %cast1 = addrspacecast <2 x i32 addrspace(3)*> %group.ptr.1 to <2 x i32 addrspace(4)*>
+  %select = select i1 %c, <2 x i32 addrspace(4)*> %cast0, <2 x i32 addrspace(4)*> %cast1
+  %extract0 = extractelement <2 x i32 addrspace(4)*> %select, i32 0
+  %extract1 = extractelement <2 x i32 addrspace(4)*> %select, i32 1
+  store i32 -1, i32 addrspace(4)* %extract0
+  store i32 -2, i32 addrspace(4)* %extract1
+  ret void
+}
+
+attributes #0 = { nounwind }
+
+!0 = !{!"branch_weights", i32 2, i32 10}
diff --git a/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll b/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll
index f32d65b66aeef468e390af0e7e438cd02ecc0292..79bf92610a8dc06fb9bc74f5443d765e8dda2c46 100644
--- a/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll
+++ b/test/Transforms/InferAddressSpaces/AMDGPU/volatile.ll
@@ -5,7 +5,7 @@
 ; CHECK-LABEL: @volatile_load_flat_from_global(
 ; CHECK: load volatile i32, i32 addrspace(4)*
 ; CHECK: store i32 %val, i32 addrspace(1)*
-define void @volatile_load_flat_from_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_load_flat_from_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)*
   %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4
@@ -16,7 +16,7 @@ define void @volatile_load_flat_from_global(i32 addrspace(1)* nocapture %input,
 ; CHECK-LABEL: @volatile_load_flat_from_constant(
 ; CHECK: load volatile i32, i32 addrspace(4)*
 ; CHECK: store i32 %val, i32 addrspace(1)*
-define void @volatile_load_flat_from_constant(i32 addrspace(2)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_load_flat_from_constant(i32 addrspace(2)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(2)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)*
   %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4
@@ -27,7 +27,7 @@ define void @volatile_load_flat_from_constant(i32 addrspace(2)* nocapture %input
 ; CHECK-LABEL: @volatile_load_flat_from_group(
 ; CHECK: load volatile i32, i32 addrspace(4)*
 ; CHECK: store i32 %val, i32 addrspace(3)*
-define void @volatile_load_flat_from_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_load_flat_from_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)*
   %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4
@@ -38,7 +38,7 @@ define void @volatile_load_flat_from_group(i32 addrspace(3)* nocapture %input, i
 ; CHECK-LABEL: @volatile_load_flat_from_private(
 ; CHECK: load volatile i32, i32 addrspace(4)*
 ; CHECK: store i32 %val, i32*
-define void @volatile_load_flat_from_private(i32* nocapture %input, i32* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_load_flat_from_private(i32* nocapture %input, i32* nocapture %output) #0 {
   %tmp0 = addrspacecast i32* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32* %output to i32 addrspace(4)*
   %val = load volatile i32, i32 addrspace(4)* %tmp0, align 4
@@ -49,7 +49,7 @@ define void @volatile_load_flat_from_private(i32* nocapture %input, i32* nocaptu
 ; CHECK-LABEL: @volatile_store_flat_to_global(
 ; CHECK: load i32, i32 addrspace(1)*
 ; CHECK: store volatile i32 %val, i32 addrspace(4)*
-define void @volatile_store_flat_to_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_store_flat_to_global(i32 addrspace(1)* nocapture %input, i32 addrspace(1)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(1)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(1)* %output to i32 addrspace(4)*
   %val = load i32, i32 addrspace(4)* %tmp0, align 4
@@ -60,7 +60,7 @@ define void @volatile_store_flat_to_global(i32 addrspace(1)* nocapture %input, i
 ; CHECK-LABEL: @volatile_store_flat_to_group(
 ; CHECK: load i32, i32 addrspace(3)*
 ; CHECK: store volatile i32 %val, i32 addrspace(4)*
-define void @volatile_store_flat_to_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_store_flat_to_group(i32 addrspace(3)* nocapture %input, i32 addrspace(3)* nocapture %output) #0 {
   %tmp0 = addrspacecast i32 addrspace(3)* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32 addrspace(3)* %output to i32 addrspace(4)*
   %val = load i32, i32 addrspace(4)* %tmp0, align 4
@@ -71,7 +71,7 @@ define void @volatile_store_flat_to_group(i32 addrspace(3)* nocapture %input, i3
 ; CHECK-LABEL: @volatile_store_flat_to_private(
 ; CHECK: load i32, i32*
 ; CHECK: store volatile i32 %val, i32 addrspace(4)*
-define void @volatile_store_flat_to_private(i32* nocapture %input, i32* nocapture %output) #0 {
+define amdgpu_kernel void @volatile_store_flat_to_private(i32* nocapture %input, i32* nocapture %output) #0 {
   %tmp0 = addrspacecast i32* %input to i32 addrspace(4)*
   %tmp1 = addrspacecast i32* %output to i32 addrspace(4)*
   %val = load i32, i32 addrspace(4)* %tmp0, align 4
@@ -115,4 +115,26 @@ define { i32, i1 } @volatile_cmpxchg_group_to_flat(i32 addrspace(3)* %group.ptr,
   ret { i32, i1 } %ret
 }
 
-attributes #0 = { nounwind }
\ No newline at end of file
+; FIXME: Shouldn't be losing names
+; CHECK-LABEL: @volatile_memset_group_to_flat(
+; CHECK: addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
+; CHECK: call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %1, i8 4, i64 32, i32 4, i1 true)
+define amdgpu_kernel void @volatile_memset_group_to_flat(i8 addrspace(3)* %group.ptr, i32 %y) #0 {
+  %cast = addrspacecast i8 addrspace(3)* %group.ptr to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 true)
+  ret void
+}
+
+; CHECK-LABEL: @volatile_memset_global_to_flat(
+; CHECK: addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
+; CHECK: call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %1, i8 4, i64 32, i32 4, i1 true)
+define amdgpu_kernel void @volatile_memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 {
+  %cast = addrspacecast i8 addrspace(1)* %global.ptr to i8 addrspace(4)*
+  call void @llvm.memset.p4i8.i64(i8 addrspace(4)* %cast, i8 4, i64 32, i32 4, i1 true)
+  ret void
+}
+
+declare void @llvm.memset.p4i8.i64(i8 addrspace(4)* nocapture writeonly, i8, i64, i32, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { argmemonly nounwind }
diff --git a/test/Transforms/InferAddressSpaces/NVPTX/bug31948.ll b/test/Transforms/InferAddressSpaces/NVPTX/bug31948.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b2d8ddb195653ebe611421cffc9311e31a3836d7
--- /dev/null
+++ b/test/Transforms/InferAddressSpaces/NVPTX/bug31948.ll
@@ -0,0 +1,24 @@
+; RUN: opt -S -mtriple=nvptx64-nvidia-cuda -infer-address-spaces %s | FileCheck %s
+
+target datalayout = "e-i64:64-v16:16-v32:32-n16:32:64"
+
+%struct.bar = type { float, float* }
+
+@var1 = local_unnamed_addr addrspace(3) externally_initialized global %struct.bar undef, align 8
+
+; CHECK-LABEL: @bug31948(
+; CHECK: %tmp = load float*, float* addrspace(3)* getelementptr inbounds (%struct.bar, %struct.bar addrspace(3)* @var1, i64 0, i32 1), align 8
+; CHECK: %tmp1 = load float, float* %tmp, align 4
+; CHECK: store float %conv1, float* %tmp, align 4
+; CHECK: store i32 32, i32 addrspace(3)* addrspacecast (i32* bitcast (float** getelementptr (%struct.bar, %struct.bar* addrspacecast (%struct.bar addrspace(3)* @var1 to %struct.bar*), i64 0, i32 1) to i32*) to i32 addrspace(3)*), align 4
+define void @bug31948(float %a, float* nocapture readnone %x, float* nocapture readnone %y) local_unnamed_addr #0 {
+entry:
+  %tmp = load float*, float** getelementptr (%struct.bar, %struct.bar* addrspacecast (%struct.bar addrspace(3)* @var1 to %struct.bar*), i64 0, i32 1), align 8
+  %tmp1 = load float, float* %tmp, align 4
+  %conv1 = fadd float %tmp1, 1.000000e+00
+  store float %conv1, float* %tmp, align 4
+  store i32 32, i32* bitcast (float** getelementptr (%struct.bar, %struct.bar* addrspacecast (%struct.bar addrspace(3)* @var1 to %struct.bar*), i64 0, i32 1) to i32*), align 4
+  ret void
+}
+
+attributes #0 = { norecurse nounwind }
diff --git a/test/Transforms/InferAddressSpaces/NVPTX/lit.local.cfg b/test/Transforms/InferAddressSpaces/NVPTX/lit.local.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..2cb98eb371b21bc47c99d369adaffefd84d4a625
--- /dev/null
+++ b/test/Transforms/InferAddressSpaces/NVPTX/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'NVPTX' in config.root.targets:
+    config.unsupported = True
diff --git a/test/Transforms/Inline/alloca-bonus.ll b/test/Transforms/Inline/alloca-bonus.ll
index 542dcee0fcb2a8b195c66af162edeee8588dd74a..c5c2ce11cc5b06fb76887f4cb28be3dc52146fb0 100644
--- a/test/Transforms/Inline/alloca-bonus.ll
+++ b/test/Transforms/Inline/alloca-bonus.ll
@@ -3,7 +3,7 @@
 
 target datalayout = "p:32:32"
 
-declare void @llvm.lifetime.start(i64 %size, i8* nocapture %ptr)
+declare void @llvm.lifetime.start.p0i8(i64 %size, i8* nocapture %ptr)
 
 @glbl = external global i32
 
@@ -22,7 +22,7 @@ define void @inner1(i32 *%ptr) {
   %D = getelementptr inbounds i32, i32* %ptr, i32 1
   %E = bitcast i32* %ptr to i8*
   %F = select i1 false, i32* %ptr, i32* @glbl
-  call void @llvm.lifetime.start(i64 0, i8* %E)
+  call void @llvm.lifetime.start.p0i8(i64 0, i8* %E)
   call void @extern()
   ret void
 }
@@ -43,7 +43,7 @@ define void @inner2(i32 *%ptr) {
   %D = getelementptr inbounds i32, i32* %ptr, i32 %A
   %E = bitcast i32* %ptr to i8*
   %F = select i1 false, i32* %ptr, i32* @glbl
-  call void @llvm.lifetime.start(i64 0, i8* %E)
+  call void @llvm.lifetime.start.p0i8(i64 0, i8* %E)
   call void @extern()
   ret void
 }
@@ -152,7 +152,7 @@ if.then:
   %D = getelementptr inbounds i32, i32* %ptr, i32 %A
   %E = bitcast i32* %ptr to i8*
   %F = select i1 false, i32* %ptr, i32* @glbl
-  call void @llvm.lifetime.start(i64 0, i8* %E)
+  call void @llvm.lifetime.start.p0i8(i64 0, i8* %E)
   ret void
 
 exit:
diff --git a/test/Transforms/Inline/arg-attr-propagation.ll b/test/Transforms/Inline/arg-attr-propagation.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3d18e8047e5bf815ecc64593a673c34932bb36f9
--- /dev/null
+++ b/test/Transforms/Inline/arg-attr-propagation.ll
@@ -0,0 +1,50 @@
+; RUN: opt -inline -S < %s | FileCheck %s
+
+; The callee guarantees that the pointer argument is nonnull and dereferenceable.
+; That information should transfer to the caller.
+
+define i32 @callee(i32* dereferenceable(32) %t1) {
+; CHECK-LABEL: @callee(i32* dereferenceable(32) %t1)
+; CHECK-NEXT:    [[T2:%.*]] = load i32, i32* %t1
+; CHECK-NEXT:    ret i32 [[T2]]
+;
+  %t2 = load i32, i32* %t1
+  ret i32 %t2
+}
+
+; FIXME: All dereferenceability information is lost.
+; The caller argument could be known nonnull and dereferenceable(32).
+
+define i32 @caller1(i32* %t1) {
+; CHECK-LABEL: @caller1(i32* %t1)
+; CHECK-NEXT:    [[T2_I:%.*]] = load i32, i32* %t1
+; CHECK-NEXT:    ret i32 [[T2_I]]
+;
+  %t2 = tail call i32 @callee(i32* dereferenceable(32) %t1)
+  ret i32 %t2
+}
+
+; The caller argument is nonnull, but that can be explicit.
+; The dereferenceable amount could be increased.
+
+define i32 @caller2(i32* dereferenceable(31) %t1) {
+; CHECK-LABEL: @caller2(i32* dereferenceable(31) %t1)
+; CHECK-NEXT:    [[T2_I:%.*]] = load i32, i32* %t1
+; CHECK-NEXT:    ret i32 [[T2_I]]
+;
+  %t2 = tail call i32 @callee(i32* dereferenceable(32) %t1)
+  ret i32 %t2
+}
+
+; The caller argument is nonnull, but that can be explicit.
+; Make sure that we don't propagate a smaller dereferenceable amount.
+
+define i32 @caller3(i32* dereferenceable(33) %t1) {
+; CHECK-LABEL: @caller3(i32* dereferenceable(33) %t1)
+; CHECK-NEXT:    [[T2_I:%.*]] = load i32, i32* %t1
+; CHECK-NEXT:    ret i32 [[T2_I]]
+;
+  %t2 = tail call i32 @callee(i32* dereferenceable(32) %t1)
+  ret i32 %t2
+}
+
diff --git a/test/Transforms/Inline/bfi-update.ll b/test/Transforms/Inline/bfi-update.ll
new file mode 100644
index 0000000000000000000000000000000000000000..94584e2e6ce55c66eb48f653ea25068aaa1b3152
--- /dev/null
+++ b/test/Transforms/Inline/bfi-update.ll
@@ -0,0 +1,93 @@
+; RUN: opt < %s -passes='require<profile-summary>,cgscc(inline)' -S -inline-threshold=50 -inline-cold-callsite-threshold=0 -hot-callsite-threshold=50 | FileCheck %s
+; This tests incremental updates to caller's BFI as a callee gets inlined.
+; In bottom-up inlining, first c->e inlining is considered and fails because
+; e's size exceeds the threshold of 50. Then a->c inlining is considered and it
+; succeeds. a's BFI is updated incrementally. As c's blocks get pruned, the 
+; block with label cond_false is removed and since the remaining code is
+; straight-line a single block gets cloned into a. This block should get the
+; maximum block frequency among the original blocks in c. If it gets the
+; frequency of the block with label cond_true in @c, its frequency will be
+; 1/10th of function a's entry block frequency, resulting in a callsite count of
+; 2 (since a's entry count is 20) which means that a->e callsite will be
+; considered cold and not inlined. 
+
+@data = external global i32
+; CHECK-LABEL: define i32 @a(
+define i32 @a(i32 %a1) !prof !21 {
+; CHECK-NOT: call i32 @c
+; CHECK-NOT: call i32 @e
+; CHECK: ret
+entry:
+  %cond = icmp sle i32 %a1, 1
+  %a2 = call i32 @c(i32 1)
+  br label %exit
+exit:
+  ret i32 %a2
+}
+
+declare void @ext();
+
+; CHECK: @c(i32 %c1) !prof [[COUNT1:![0-9]+]]
+define i32 @c(i32 %c1) !prof !23 {
+  call void @ext()
+  %cond = icmp sle i32 %c1, 1
+  br i1 %cond, label %cond_true, label %cond_false, !prof !25
+
+cond_false:
+  br label %exit
+
+cond_true:
+  %c11 = call i32 @e(i32 %c1)
+  br label %exit
+exit:
+  %c12 = phi i32 [ 0, %cond_false], [ %c11, %cond_true ]
+  ret i32 %c12
+}
+
+
+; CHECK: @e(i32 %c1) !prof [[COUNT2:![0-9]+]]
+define i32 @e(i32 %c1) !prof !24 {
+  call void @ext()
+  call void @ext()
+  %cond = icmp sle i32 %c1, 1
+  br i1 %cond, label %cond_true, label %cond_false
+
+cond_false:
+  call void @ext()
+  %c2 = load i32, i32* @data, align 4
+  %c3 = add i32 %c1, %c2
+  %c4 = mul i32 %c3, %c2
+  %c5 = add i32 %c4, %c2
+  %c6 = mul i32 %c5, %c2
+  %c7 = add i32 %c6, %c2
+  %c8 = mul i32 %c7, %c2
+  %c9 = add i32 %c8, %c2
+  %c10 = mul i32 %c9, %c2
+  ret i32 %c10
+
+cond_true:
+  ret i32 0
+}
+
+; CHECK: [[COUNT1]] = !{!"function_entry_count", i64 480}
+; CHECK: [[COUNT2]] = !{!"function_entry_count", i64 80}
+!21 = !{!"function_entry_count", i64 20}
+!23 = !{!"function_entry_count", i64 500}
+!24 = !{!"function_entry_count", i64 100}
+!25 = !{!"branch_weights", i32 1, i32 9}
+
+!llvm.module.flags = !{!1}
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 10000}
+!5 = !{!"MaxCount", i64 1000}
+!6 = !{!"MaxInternalCount", i64 1}
+!7 = !{!"MaxFunctionCount", i64 1000}
+!8 = !{!"NumCounts", i64 3}
+!9 = !{!"NumFunctions", i64 3}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 1000, i32 1}
+!13 = !{i32 999000, i64 1000, i32 1}
+!14 = !{i32 999999, i64 5, i32 2}
diff --git a/test/Transforms/Inline/cgscc-incremental-invalidate.ll b/test/Transforms/Inline/cgscc-incremental-invalidate.ll
index 7c6d0fc04ce32df0a91582ed55ddfb89d4e23c38..82d321ccf225c2e1b2993b37e32198273d3bfc54 100644
--- a/test/Transforms/Inline/cgscc-incremental-invalidate.ll
+++ b/test/Transforms/Inline/cgscc-incremental-invalidate.ll
@@ -7,19 +7,19 @@
 ; may stop testing anything.
 ;
 ; CHECK-LABEL: Starting llvm::Module pass manager run.
-; CHECK: Running pass: InlinerPass on (test1_h, test1_g, test1_f)
-; CHECK: Running analysis: FunctionAnalysisManagerCGSCCProxy on (test1_h, test1_g, test1_f)
+; CHECK: Running pass: InlinerPass on (test1_f, test1_g, test1_h)
+; CHECK: Running analysis: FunctionAnalysisManagerCGSCCProxy on (test1_f, test1_g, test1_h)
 ; CHECK: Running analysis: DominatorTreeAnalysis on test1_f
 ; CHECK: Running analysis: DominatorTreeAnalysis on test1_g
-; CHECK: Invalidating all non-preserved analyses for: (test1_h, test1_g, test1_f)
-; CHECK: Invalidating all non-preserved analyses for: test1_h
-; CHECK-NOT: Invalidating anaylsis:
-; CHECK: Invalidating all non-preserved analyses for: test1_g
-; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_g
+; CHECK: Invalidating all non-preserved analyses for: (test1_f, test1_g, test1_h)
 ; CHECK: Invalidating all non-preserved analyses for: test1_f
 ; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_f
+; CHECK: Invalidating all non-preserved analyses for: test1_g
+; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_g
+; CHECK: Invalidating all non-preserved analyses for: test1_h
+; CHECK-NOT: Invalidating anaylsis:
 ; CHECK: Running analysis: DominatorTreeAnalysis on test1_h
-; CHECK: Invalidating all non-preserved analyses for: (test1_h, test1_g)
+; CHECK: Invalidating all non-preserved analyses for: (test1_g, test1_h)
 ; CHECK: Invalidating all non-preserved analyses for: test1_h
 ; CHECK: Invalidating analysis: DominatorTreeAnalysis on test1_h
 
@@ -51,14 +51,14 @@ return:
 ; reducing an SCC in the inliner cannot accidentially leave stale function
 ; analysis results due to failing to invalidate them for all the functions.
 
-; We visit this function first in the inliner, and while we inline callee
-; perturbing the CFG, we don't inline anything else and the SCC structure
-; remains in tact.
-define void @test1_f() {
-; CHECK-LABEL: define void @test1_f()
+; The inliner visits this last function. It can't actually break any cycles
+; here, but because we visit this function we compute fresh analyses for it.
+; These analyses are then invalidated when we inline callee disrupting the
+; CFG, and it is important that they be freed.
+define void @test1_h() {
+; CHECK-LABEL: define void @test1_h()
 entry:
-  ; We force this edge to survive inlining.
-  call void @test1_g() noinline
+  call void @test1_g()
 ; CHECK: call void @test1_g()
 
   ; Pull interesting CFG into this function.
@@ -69,7 +69,7 @@ entry:
 ; CHECK: ret void
 }
 
-; Next we visit this function and here we inline the edge to 'test1_f'
+; We visit this function second and here we inline the edge to 'test1_f'
 ; separating it into its own SCC. The current SCC is now just 'test1_g' and
 ; 'test1_h'.
 define void @test1_g() {
@@ -92,14 +92,14 @@ entry:
 ; CHECK: ret void
 }
 
-; Finally the inliner visits this last function. It can't actually break any
-; cycles here, but because we visit this function we compute fresh analyses for
-; it. These analyses are then invalidated when we inline callee disrupting the
-; CFG, and it is important that they be freed.
-define void @test1_h() {
-; CHECK-LABEL: define void @test1_h()
+; We visit this function first in the inliner, and while we inline callee
+; perturbing the CFG, we don't inline anything else and the SCC structure
+; remains intact.
+define void @test1_f() {
+; CHECK-LABEL: define void @test1_f()
 entry:
-  call void @test1_g()
+  ; We force this edge to survive inlining.
+  call void @test1_g() noinline
 ; CHECK: call void @test1_g()
 
   ; Pull interesting CFG into this function.
diff --git a/test/Transforms/Inline/cgscc-invalidate.ll b/test/Transforms/Inline/cgscc-invalidate.ll
index 60315cda771da71c87ae06435ca2553f9be23ad7..69d84f65e2517f01ee5958cb788b0cd3704f5a62 100644
--- a/test/Transforms/Inline/cgscc-invalidate.ll
+++ b/test/Transforms/Inline/cgscc-invalidate.ll
@@ -65,15 +65,15 @@ entry:
 ; The 'test3_' prefixed functions test the scenario of not inlining preserving
 ; dominators after splitting an SCC into two smaller SCCs.
 
-; The first function gets visited first and we end up inlining everything we
-; can into this routine. That splits test3_g into a separate SCC that is enqued
-; for later processing.
-define void @test3_f() {
-; CHECK-LABEL: define void @test3_f()
+; This function ends up split into a separate SCC, which can cause its analyses
+; to become stale if the splitting doesn't properly invalidate things. Also, as
+; a consequence of being split out, test3_f is too large to inline by the time
+; we get here.
+define void @test3_g() {
+; CHECK-LABEL: define void @test3_g()
 entry:
-  ; Create the first edge in the SCC cycle.
-  call void @test3_g()
-; CHECK-NOT: @test3_g()
+  ; Create the second edge in the SCC cycle.
+  call void @test3_f()
 ; CHECK: call void @test3_f()
 
   ; Pull interesting CFG into this function.
@@ -84,15 +84,15 @@ entry:
 ; CHECK: ret void
 }
 
-; This function ends up split into a separate SCC, which can cause its analyses
-; to become stale if the splitting doesn't properly invalidate things. Also, as
-; a consequence of being split out, test3_f is too large to inline by the time
-; we get here.
-define void @test3_g() {
-; CHECK-LABEL: define void @test3_g()
+; The second function gets visited first and we end up inlining everything we
+; can into this routine. That splits test3_g into a separate SCC that is enqueued
+; for later processing.
+define void @test3_f() {
+; CHECK-LABEL: define void @test3_f()
 entry:
-  ; Create the second edge in the SCC cycle.
-  call void @test3_f()
+  ; Create the first edge in the SCC cycle.
+  call void @test3_g()
+; CHECK-NOT: @test3_g()
 ; CHECK: call void @test3_f()
 
   ; Pull interesting CFG into this function.
diff --git a/test/Transforms/Inline/crash-lifetime-marker.ll b/test/Transforms/Inline/crash-lifetime-marker.ll
index e7a594cdb5e438b38ba8b4da0efcb90c446a26e7..7196616521e95fef49af914f4d62e15ef1d76dda 100644
--- a/test/Transforms/Inline/crash-lifetime-marker.ll
+++ b/test/Transforms/Inline/crash-lifetime-marker.ll
@@ -15,9 +15,9 @@ define i32 @callee1(i32 %count) {
 
 ; CHECK-LABEL: define i32 @caller1(
 ; CHECK: [[ALLOCA:%[a-z0-9\.]+]] = alloca i8
-; CHECK-NOT: call void @llvm.lifetime.start(
+; CHECK-NOT: call void @llvm.lifetime.start.p0i8(
 ; CHECK: call i32 @callee2(i8* [[ALLOCA]])
-; CHECK-NOT: call void @llvm.lifetime.end(
+; CHECK-NOT: call void @llvm.lifetime.end.p0i8(
 
 define i32 @caller1(i32 %count) {
   %call0 = call i32 @callee1(i32 0)
diff --git a/test/Transforms/Inline/inline_stats.ll b/test/Transforms/Inline/inline_stats.ll
index cf0d43e9215b724cc648ba8b583cc3ee918954ba..bc005b6afd51e355ec00a2902ad02e1fc97582ff 100644
--- a/test/Transforms/Inline/inline_stats.ll
+++ b/test/Transforms/Inline/inline_stats.ll
@@ -36,9 +36,12 @@ define void @internal3() {
     ret void
 }
 
+declare void @external_decl()
+
 define void @external1() alwaysinline !thinlto_src_module !0 {
     call fastcc void @internal2()
     call fastcc void @external2();
+    call void @external_decl();
     ret void
 }
 
diff --git a/test/Transforms/Inline/internal-scc-members.ll b/test/Transforms/Inline/internal-scc-members.ll
new file mode 100644
index 0000000000000000000000000000000000000000..258ce00744c55ae0918021fa253a11dacbeb8bec
--- /dev/null
+++ b/test/Transforms/Inline/internal-scc-members.ll
@@ -0,0 +1,31 @@
+; Test that the inliner can handle deleting functions within an SCC while still
+; processing the calls in that SCC.
+;
+; RUN: opt < %s -S -inline | FileCheck %s
+; RUN: opt < %s -S -passes=inline | FileCheck %s
+
+; CHECK-LABEL: define internal void @test1_scc0()
+; CHECK-NOT: call
+; CHECK: call void @test1_scc0()
+; CHECK-NOT: call
+; CHECK: ret
+define internal void @test1_scc0() {
+entry:
+  call void @test1_scc1()
+  ret void
+}
+
+; CHECK-NOT: @test1_scc1
+define internal void @test1_scc1() {
+entry:
+  call void @test1_scc0()
+  ret void
+}
+
+; CHECK-LABEL: define void @test1()
+; CHECK: call void @test1_scc0()
+define void @test1() {
+entry:
+  call void @test1_scc0() noinline
+  ret void
+}
diff --git a/test/Transforms/Inline/last-call-bonus.ll b/test/Transforms/Inline/last-call-bonus.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0088d316848ff71608b6a495295ec6c4c11bad84
--- /dev/null
+++ b/test/Transforms/Inline/last-call-bonus.ll
@@ -0,0 +1,52 @@
+; The goal of this test is checking if LastCallToStaticBonus is applied
+; correctly while deciding inline deferral. For the test code below, when
+; inliner evaluates the callsite of bar->baz, it checks if inlining of bar->baz
+; prevents inlining of foo->bar, even when foo->bar inlining is more beneficial
+; than bar->baz inlining. As LastCallToStaticBonus has a massive value, and
+; both baz and bar have only one caller, the cost of foo->bar inlining and
+; bar->baz inlining should be non-trivial for inliner to compute that bar->baz
+; inlining can actually prevent foo->bar inlining. To make the cost of these
+; callsites big enough, loop unrolling pass with very high threshold is used to
+; preprocess the test.
+
+; RUN: opt < %s -loop-unroll -inline -unroll-threshold=15000 -inline-threshold=250 -S | FileCheck %s
+; CHECK-LABEL: define internal i32 @bar()
+
+define internal i32 @baz() {
+entry:
+  br label %bb1
+
+bb1:
+  %ind = phi i32 [ 0, %entry ], [ %inc, %bb1 ]
+  call void @extern()
+  %inc = add nsw i32 %ind, 1
+  %cmp = icmp sgt i32 %inc, 510
+  br i1 %cmp, label %ret, label %bb1
+
+ret:
+  ret i32 0
+}
+
+define internal i32 @bar() {
+entry:
+  br label %bb1
+
+bb1:
+  %ind = phi i32 [ 0, %entry ], [ %inc, %bb1 ]
+  call void @extern()
+  %inc = add nsw i32 %ind, 1
+  %cmp = icmp sgt i32 %inc, 510
+  br i1 %cmp, label %ret, label %bb1
+
+ret:
+  call i32 @baz()
+  ret i32 0
+}
+
+define i32 @foo() {
+entry:
+  call i32 @bar()
+  ret i32 0
+}
+
+declare void @extern()
diff --git a/test/Transforms/Inline/lifetime-no-datalayout.ll b/test/Transforms/Inline/lifetime-no-datalayout.ll
index 0212e69d624a1adbcf109e2f41021f78bc7d78d5..5d1872c6a244ceb9dea1f5cd5439c5b3363c7d89 100644
--- a/test/Transforms/Inline/lifetime-no-datalayout.ll
+++ b/test/Transforms/Inline/lifetime-no-datalayout.ll
@@ -13,9 +13,9 @@ define void @helper() {
 define void @test() {
 ; CHECK-LABEL: @test(
 ; CHECK-NOT: lifetime
-; CHECK: llvm.lifetime.start(i64 1
+; CHECK: llvm.lifetime.start.p0i8(i64 1
 ; CHECK-NOT: lifetime
-; CHECK: llvm.lifetime.end(i64 1
+; CHECK: llvm.lifetime.end.p0i8(i64 1
   call void @helper()
 ; CHECK-NOT: lifetime
 ; CHECK: ret void
diff --git a/test/Transforms/Inline/lifetime.ll b/test/Transforms/Inline/lifetime.ll
index 4f415e58f1bf4df765a742f863fc854fddbc628d..c47091395fce430a0f2e699423fe98c2f4e44717 100644
--- a/test/Transforms/Inline/lifetime.ll
+++ b/test/Transforms/Inline/lifetime.ll
@@ -2,25 +2,25 @@
 ; RUN: opt -passes='cgscc(inline)' -S < %s | FileCheck %s
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
-declare void @llvm.lifetime.start(i64, i8*)
-declare void @llvm.lifetime.end(i64, i8*)
+declare void @llvm.lifetime.start.p0i8(i64, i8*)
+declare void @llvm.lifetime.end.p0i8(i64, i8*)
 
 define void @helper_both_markers() {
   %a = alloca i8
   ; Size in llvm.lifetime.start / llvm.lifetime.end differs from
   ; allocation size. We should use the former.
-  call void @llvm.lifetime.start(i64 2, i8* %a)
-  call void @llvm.lifetime.end(i64 2, i8* %a)
+  call void @llvm.lifetime.start.p0i8(i64 2, i8* %a)
+  call void @llvm.lifetime.end.p0i8(i64 2, i8* %a)
   ret void
 }
 
 define void @test_both_markers() {
 ; CHECK-LABEL: @test_both_markers(
-; CHECK: llvm.lifetime.start(i64 2
-; CHECK-NEXT: llvm.lifetime.end(i64 2
+; CHECK: llvm.lifetime.start.p0i8(i64 2
+; CHECK-NEXT: llvm.lifetime.end.p0i8(i64 2
   call void @helper_both_markers()
-; CHECK-NEXT: llvm.lifetime.start(i64 2
-; CHECK-NEXT: llvm.lifetime.end(i64 2
+; CHECK-NEXT: llvm.lifetime.start.p0i8(i64 2
+; CHECK-NEXT: llvm.lifetime.end.p0i8(i64 2
   call void @helper_both_markers()
 ; CHECK-NEXT: ret void
   ret void
@@ -41,14 +41,14 @@ define void @helper_no_markers() {
 define void @test_no_marker() {
 ; CHECK-LABEL: @test_no_marker(
 ; CHECK-NOT: lifetime
-; CHECK: llvm.lifetime.start(i64 1
+; CHECK: llvm.lifetime.start.p0i8(i64 1
 ; CHECK-NOT: lifetime
-; CHECK: llvm.lifetime.end(i64 1
+; CHECK: llvm.lifetime.end.p0i8(i64 1
   call void @helper_no_markers()
 ; CHECK-NOT: lifetime
-; CHECK: llvm.lifetime.start(i64 1
+; CHECK: llvm.lifetime.start.p0i8(i64 1
 ; CHECK-NOT: lifetime
-; CHECK: llvm.lifetime.end(i64 1
+; CHECK: llvm.lifetime.end.p0i8(i64 1
   call void @helper_no_markers()
 ; CHECK-NOT: lifetime
 ; CHECK: ret void
@@ -58,23 +58,23 @@ define void @test_no_marker() {
 define void @helper_two_casts() {
   %a = alloca i32
   %b = bitcast i32* %a to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %b)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %b)
   %c = bitcast i32* %a to i8*
-  call void @llvm.lifetime.end(i64 4, i8* %c)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %c)
   ret void
 }
 
 define void @test_two_casts() {
 ; CHECK-LABEL: @test_two_casts(
 ; CHECK-NOT: lifetime
-; CHECK: llvm.lifetime.start(i64 4
+; CHECK: llvm.lifetime.start.p0i8(i64 4
 ; CHECK-NOT: lifetime
-; CHECK: llvm.lifetime.end(i64 4
+; CHECK: llvm.lifetime.end.p0i8(i64 4
   call void @helper_two_casts()
 ; CHECK-NOT: lifetime
-; CHECK: llvm.lifetime.start(i64 4
+; CHECK: llvm.lifetime.start.p0i8(i64 4
 ; CHECK-NOT: lifetime
-; CHECK: llvm.lifetime.end(i64 4
+; CHECK: llvm.lifetime.end.p0i8(i64 4
   call void @helper_two_casts()
 ; CHECK-NOT: lifetime
 ; CHECK: ret void
@@ -91,9 +91,9 @@ define void @helper_arrays_alloca() {
 define void @test_arrays_alloca() {
 ; CHECK-LABEL: @test_arrays_alloca(
 ; CHECK-NOT: lifetime
-; CHECK: llvm.lifetime.start(i64 40,
+; CHECK: llvm.lifetime.start.p0i8(i64 40,
 ; CHECK-NOT: lifetime
-; CHECK: llvm.lifetime.end(i64 40,
+; CHECK: llvm.lifetime.end.p0i8(i64 40,
   call void @helper_arrays_alloca()
 ; CHECK-NOT: lifetime
 ; CHECK: ret void
diff --git a/test/Transforms/Inline/monster_scc.ll b/test/Transforms/Inline/monster_scc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0f8f1f21c8b56a5dc94f18d2a7082ab8e6618d15
--- /dev/null
+++ b/test/Transforms/Inline/monster_scc.ll
@@ -0,0 +1,460 @@
+; This test creates a monster SCC with a very pernicious call graph. It builds
+; a cycle of cross-connected pairs of functions with interesting inlining
+; decisions throughout, but ultimately trivial code complexity.
+;
+; Typically, a greedy approach to inlining works well for bottom-up inliners
+; such as LLVM's. However, there is no way to be bottom-up over an SCC: it's
+; a cycle! Greedily inlining as much as possible into each function of this
+; *SCC* will have the disastrous effect of inlining all N-1 functions into the
+; first one visited, N-2 functions into the second one visited, N-3 into the
+; third, and so on. This is because until inlining occurs, each function in
+; isolation appears to be an excellent inline candidate.
+;
+; Note that the exact number of calls in each function doesn't really matter.
+; It is mostly a function of cost thresholds and visit order. Because this is an
+; SCC there is no "right" or "wrong" answer here as long as no function blows up
+; to be *huge*. The specific concerning pattern is if one or more functions get
+; more than 16 calls in them.
+;
+; This test is extracted from the following C++ program compiled with Clang.
+; The IR is simplified with SROA, instcombine, and simplify-cfg. Then C++
+; linkage stuff, attributes, target specific things, metadata and comments were
+; removed. The order of the functions is also made more predictable than Clang's
+; output order.
+;
+;   void g(int);
+;
+;   template <bool K, int N> void f(bool *B, bool *E) {
+;     if (K)
+;       g(N);
+;     if (B == E)
+;       return;
+;     if (*B)
+;       f<true, N + 1>(B + 1, E);
+;     else
+;       f<false, N + 1>(B + 1, E);
+;   }
+;   template <> void f<false, MAX>(bool *B, bool *E) { return f<false, 0>(B, E); }
+;   template <> void f<true, MAX>(bool *B, bool *E) { return f<true, 0>(B, E); }
+;
+;   void test(bool *B, bool *E) { f<false, 0>(B, E); }
+;
+; RUN: opt -S < %s -inline -inline-threshold=150 | FileCheck %s --check-prefixes=CHECK,OLD
+; RUN: opt -S < %s -passes=inline -inline-threshold=150 | FileCheck %s --check-prefixes=CHECK,NEW
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+declare void @_Z1gi(i32)
+
+; CHECK-LABEL: define void @_Z1fILb0ELi0EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1gi(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb1ELi2EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb0ELi2EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb0ELi1EEvPbS0_(
+; OLD-NOT: call
+; NEW-NOT: call
+; NEW: call void @_Z1gi(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi2EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi2EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi2EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi2EEvPbS0_(
+; NEW-NOT: call
+define void @_Z1fILb0ELi0EEvPbS0_(i8* %B, i8* %E) {
+entry:
+  %cmp = icmp eq i8* %B, %E
+  br i1 %cmp, label %if.end3, label %if.end
+
+if.end:
+  %0 = load i8, i8* %B, align 1
+  %tobool = icmp eq i8 %0, 0
+  %add.ptr2 = getelementptr inbounds i8, i8* %B, i64 1
+  br i1 %tobool, label %if.else, label %if.then1
+
+if.then1:
+  call void @_Z1fILb1ELi1EEvPbS0_(i8* %add.ptr2, i8* %E)
+  br label %if.end3
+
+if.else:
+  call void @_Z1fILb0ELi1EEvPbS0_(i8* %add.ptr2, i8* %E)
+  br label %if.end3
+
+if.end3:
+  ret void
+}
+
+; CHECK-LABEL: define void @_Z1fILb1ELi0EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1gi(
+; OLD-NOT: call
+; OLD: call void @_Z1gi(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb1ELi2EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb0ELi2EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb0ELi1EEvPbS0_(
+; OLD-NOT: call
+; NEW-NOT: call
+; NEW: call void @_Z1gi(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi1EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi2EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi2EEvPbS0_(
+; NEW-NOT: call
+define void @_Z1fILb1ELi0EEvPbS0_(i8* %B, i8* %E) {
+entry:
+  call void @_Z1gi(i32 0)
+  %cmp = icmp eq i8* %B, %E
+  br i1 %cmp, label %if.end3, label %if.end
+
+if.end:
+  %0 = load i8, i8* %B, align 1
+  %tobool = icmp eq i8 %0, 0
+  %add.ptr2 = getelementptr inbounds i8, i8* %B, i64 1
+  br i1 %tobool, label %if.else, label %if.then1
+
+if.then1:
+  call void @_Z1fILb1ELi1EEvPbS0_(i8* %add.ptr2, i8* %E)
+  br label %if.end3
+
+if.else:
+  call void @_Z1fILb0ELi1EEvPbS0_(i8* %add.ptr2, i8* %E)
+  br label %if.end3
+
+if.end3:
+  ret void
+}
+
+; CHECK-LABEL: define void @_Z1fILb0ELi1EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1gi(
+; OLD-NOT: call
+; OLD: call void @_Z1gi(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb1ELi0EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb0ELi0EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb1ELi0EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb0ELi0EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb0ELi2EEvPbS0_(
+; OLD-NOT: call
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi2EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1gi(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi0EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi3EEvPbS0_(
+; NEW-NOT: call
+define void @_Z1fILb0ELi1EEvPbS0_(i8* %B, i8* %E) {
+entry:
+  %cmp = icmp eq i8* %B, %E
+  br i1 %cmp, label %if.end3, label %if.end
+
+if.end:
+  %0 = load i8, i8* %B, align 1
+  %tobool = icmp eq i8 %0, 0
+  %add.ptr2 = getelementptr inbounds i8, i8* %B, i64 1
+  br i1 %tobool, label %if.else, label %if.then1
+
+if.then1:
+  call void @_Z1fILb1ELi2EEvPbS0_(i8* %add.ptr2, i8* %E)
+  br label %if.end3
+
+if.else:
+  call void @_Z1fILb0ELi2EEvPbS0_(i8* %add.ptr2, i8* %E)
+  br label %if.end3
+
+if.end3:
+  ret void
+}
+
+; CHECK-LABEL: define void @_Z1fILb1ELi1EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1gi(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb1ELi2EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb0ELi2EEvPbS0_(
+; OLD-NOT: call
+; NEW-NOT: call
+; NEW: call void @_Z1gi(
+; NEW-NOT: call
+; NEW: call void @_Z1gi(
+; NEW-NOT: call
+; NEW: call void @_Z1gi(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi0EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi3EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1gi(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi0EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi3EEvPbS0_(
+; NEW-NOT: call
+define void @_Z1fILb1ELi1EEvPbS0_(i8* %B, i8* %E) {
+entry:
+  call void @_Z1gi(i32 1)
+  %cmp = icmp eq i8* %B, %E
+; CHECK-NOT: call
+  br i1 %cmp, label %if.end3, label %if.end
+
+if.end:
+  %0 = load i8, i8* %B, align 1
+  %tobool = icmp eq i8 %0, 0
+  %add.ptr2 = getelementptr inbounds i8, i8* %B, i64 1
+  br i1 %tobool, label %if.else, label %if.then1
+
+if.then1:
+  call void @_Z1fILb1ELi2EEvPbS0_(i8* %add.ptr2, i8* %E)
+  br label %if.end3
+
+if.else:
+  call void @_Z1fILb0ELi2EEvPbS0_(i8* %add.ptr2, i8* %E)
+  br label %if.end3
+
+if.end3:
+  ret void
+}
+
+; CHECK-LABEL: define void @_Z1fILb0ELi2EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1gi(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb1ELi0EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb0ELi0EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb1ELi0EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb0ELi0EEvPbS0_(
+; OLD-NOT: call
+; NEW-NOT: call
+; NEW: call void @_Z1gi(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi0EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi4EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
+; NEW-NOT: call
+define void @_Z1fILb0ELi2EEvPbS0_(i8* %B, i8* %E) {
+entry:
+  %cmp = icmp eq i8* %B, %E
+  br i1 %cmp, label %if.end3, label %if.end
+
+if.end:
+  %0 = load i8, i8* %B, align 1
+  %tobool = icmp eq i8 %0, 0
+  %add.ptr2 = getelementptr inbounds i8, i8* %B, i64 1
+  br i1 %tobool, label %if.else, label %if.then1
+
+if.then1:
+  call void @_Z1fILb1ELi3EEvPbS0_(i8* %add.ptr2, i8* %E)
+  br label %if.end3
+
+if.else:
+  call void @_Z1fILb0ELi3EEvPbS0_(i8* %add.ptr2, i8* %E)
+  br label %if.end3
+
+if.end3:
+  ret void
+}
+
+; CHECK-LABEL: define void @_Z1fILb1ELi2EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1gi(
+; OLD-NOT: call
+; OLD: call void @_Z1gi(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb1ELi0EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb0ELi0EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb1ELi0EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb0ELi0EEvPbS0_(
+; OLD-NOT: call
+; NEW-NOT: call
+; NEW: call void @_Z1gi(
+; NEW-NOT: call
+; NEW: call void @_Z1gi(
+; NEW-NOT: call
+; NEW: call void @_Z1gi(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi1EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi1EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1gi(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi1EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi1EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
+; NEW-NOT: call
+define void @_Z1fILb1ELi2EEvPbS0_(i8* %B, i8* %E) {
+entry:
+  call void @_Z1gi(i32 2)
+  %cmp = icmp eq i8* %B, %E
+  br i1 %cmp, label %if.end3, label %if.end
+
+if.end:
+  %0 = load i8, i8* %B, align 1
+  %tobool = icmp eq i8 %0, 0
+  %add.ptr2 = getelementptr inbounds i8, i8* %B, i64 1
+  br i1 %tobool, label %if.else, label %if.then1
+
+if.then1:
+  call void @_Z1fILb1ELi3EEvPbS0_(i8* %add.ptr2, i8* %E)
+  br label %if.end3
+
+if.else:
+  call void @_Z1fILb0ELi3EEvPbS0_(i8* %add.ptr2, i8* %E)
+  br label %if.end3
+
+if.end3:
+  ret void
+}
+
+; CHECK-LABEL: define void @_Z1fILb0ELi3EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb1ELi0EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb0ELi0EEvPbS0_(
+; OLD-NOT: call
+; NEW-NOT: call
+; NEW: call void @_Z1gi(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi1EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi1EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
+; NEW-NOT: call
+define void @_Z1fILb0ELi3EEvPbS0_(i8* %B, i8* %E) {
+entry:
+  %cmp = icmp eq i8* %B, %E
+  br i1 %cmp, label %if.end3, label %if.end
+
+if.end:
+  %0 = load i8, i8* %B, align 1
+  %tobool = icmp eq i8 %0, 0
+  %add.ptr2 = getelementptr inbounds i8, i8* %B, i64 1
+  br i1 %tobool, label %if.else, label %if.then1
+
+if.then1:
+  call void @_Z1fILb1ELi4EEvPbS0_(i8* %add.ptr2, i8* %E)
+  br label %if.end3
+
+if.else:
+  call void @_Z1fILb0ELi4EEvPbS0_(i8* %add.ptr2, i8* %E)
+  br label %if.end3
+
+if.end3:
+  ret void
+}
+
+; CHECK-LABEL: define void @_Z1fILb1ELi3EEvPbS0_(
+; CHECK-NOT: call
+; CHECK: call void @_Z1gi(
+; CHECK-NOT: call
+; CHECK: call void @_Z1fILb1ELi0EEvPbS0_(
+; CHECK-NOT: call
+; CHECK: call void @_Z1fILb0ELi0EEvPbS0_(
+; CHECK-NOT: call
+define void @_Z1fILb1ELi3EEvPbS0_(i8* %B, i8* %E) {
+entry:
+  call void @_Z1gi(i32 3)
+  %cmp = icmp eq i8* %B, %E
+  br i1 %cmp, label %if.end3, label %if.end
+
+if.end:
+  %0 = load i8, i8* %B, align 1
+  %tobool = icmp eq i8 %0, 0
+  %add.ptr2 = getelementptr inbounds i8, i8* %B, i64 1
+  br i1 %tobool, label %if.else, label %if.then1
+
+if.then1:
+  call void @_Z1fILb1ELi4EEvPbS0_(i8* %add.ptr2, i8* %E)
+  br label %if.end3
+
+if.else:
+  call void @_Z1fILb0ELi4EEvPbS0_(i8* %add.ptr2, i8* %E)
+  br label %if.end3
+
+if.end3:
+  ret void
+}
+
+; CHECK-LABEL: define void @_Z1fILb0ELi4EEvPbS0_(
+; CHECK-NOT: call
+; CHECK: call void @_Z1fILb0ELi0EEvPbS0_(
+; CHECK-NOT: call
+define void @_Z1fILb0ELi4EEvPbS0_(i8* %B, i8* %E) {
+entry:
+  call void @_Z1fILb0ELi0EEvPbS0_(i8* %B, i8* %E)
+  ret void
+}
+
+; CHECK-LABEL: define void @_Z1fILb1ELi4EEvPbS0_(
+; OLD-NOT: call
+; OLD: call void @_Z1fILb1ELi0EEvPbS0_(
+; OLD-NOT: call
+; NEW-NOT: call
+; NEW: call void @_Z1gi(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi1EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi2EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1gi(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb1ELi0EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi0EEvPbS0_(
+; NEW-NOT: call
+; NEW: call void @_Z1fILb0ELi3EEvPbS0_(
+; NEW-NOT: call
+define void @_Z1fILb1ELi4EEvPbS0_(i8* %B, i8* %E) {
+entry:
+  call void @_Z1fILb1ELi0EEvPbS0_(i8* %B, i8* %E)
+  ret void
+}
+
+; CHECK-LABEL: define void @_Z4testPbS_(
+; CHECK: call
+; CHECK-NOT: call
+define void @_Z4testPbS_(i8* %B, i8* %E) {
+entry:
+  call void @_Z1fILb0ELi0EEvPbS0_(i8* %B, i8* %E)
+  ret void
+}
+
diff --git a/test/Transforms/Inline/prof-update.ll b/test/Transforms/Inline/prof-update.ll
new file mode 100644
index 0000000000000000000000000000000000000000..38fcc7e45996490391abedbc4718ad6145d16f01
--- /dev/null
+++ b/test/Transforms/Inline/prof-update.ll
@@ -0,0 +1,39 @@
+; RUN: opt < %s -inline -S | FileCheck %s
+; Checks if inliner updates branch_weights annotation for call instructions.
+
+declare void @ext();
+declare void @ext1();
+
+; CHECK: define void @callee(i32 %n) !prof ![[ENTRY_COUNT:[0-9]*]]
+define void  @callee(i32 %n) !prof !1 {
+  %cond = icmp sle i32 %n, 10
+  br i1 %cond, label %cond_true, label %cond_false
+cond_true:
+; ext1 is optimized away, thus not updated.
+; CHECK: call void @ext1(), !prof ![[COUNT_CALLEE1:[0-9]*]]
+  call void @ext1(), !prof !2
+  ret void
+cond_false:
+; ext is cloned and updated.
+; CHECK: call void @ext(), !prof ![[COUNT_CALLEE:[0-9]*]]
+  call void @ext(), !prof !2
+  ret void
+}
+
+; CHECK: define void @caller()
+define void @caller() {
+; CHECK: call void @ext(), !prof ![[COUNT_CALLER:[0-9]*]]
+  call void @callee(i32 15), !prof !3
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!0 = !{i32 1, !"MaxFunctionCount", i32 2000}
+!1 = !{!"function_entry_count", i64 1000}
+!2 = !{!"branch_weights", i64 2000}
+!3 = !{!"branch_weights", i64 400}
+attributes #0 = { alwaysinline }
+; CHECK: ![[ENTRY_COUNT]] = !{!"function_entry_count", i64 600}
+; CHECK: ![[COUNT_CALLEE1]] = !{!"branch_weights", i64 2000}
+; CHECK: ![[COUNT_CALLEE]] = !{!"branch_weights", i32 1200}
+; CHECK: ![[COUNT_CALLER]] = !{!"branch_weights", i32 800}
diff --git a/test/Transforms/InstCombine/2008-01-29-AddICmp.ll b/test/Transforms/InstCombine/2008-01-29-AddICmp.ll
deleted file mode 100644
index a33eb9c1ddd42c7c56ab7a01d512989cac79666a..0000000000000000000000000000000000000000
--- a/test/Transforms/InstCombine/2008-01-29-AddICmp.ll
+++ /dev/null
@@ -1,85 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
-
-; PR1949
-
-define i1 @test1(i32 %a) {
-; CHECK-LABEL: @test1(
-; CHECK-NEXT:    [[C:%.*]] = icmp ugt i32 %a, -5
-; CHECK-NEXT:    ret i1 [[C]]
-;
-  %b = add i32 %a, 4
-  %c = icmp ult i32 %b, 4
-  ret i1 %c
-}
-
-define <2 x i1> @test1vec(<2 x i32> %a) {
-; CHECK-LABEL: @test1vec(
-; CHECK-NEXT:    [[C:%.*]] = icmp ugt <2 x i32> %a, <i32 -5, i32 -5>
-; CHECK-NEXT:    ret <2 x i1> [[C]]
-;
-  %b = add <2 x i32> %a, <i32 4, i32 4>
-  %c = icmp ult <2 x i32> %b, <i32 4, i32 4>
-  ret <2 x i1> %c
-}
-
-define i1 @test2(i32 %a) {
-; CHECK-LABEL: @test2(
-; CHECK-NEXT:    [[C:%.*]] = icmp ult i32 %a, 4
-; CHECK-NEXT:    ret i1 [[C]]
-;
-  %b = sub i32 %a, 4
-  %c = icmp ugt i32 %b, -5
-  ret i1 %c
-}
-
-define <2 x i1> @test2vec(<2 x i32> %a) {
-; CHECK-LABEL: @test2vec(
-; CHECK-NEXT:    [[C:%.*]] = icmp ult <2 x i32> %a, <i32 4, i32 4>
-; CHECK-NEXT:    ret <2 x i1> [[C]]
-;
-  %b = sub <2 x i32> %a, <i32 4, i32 4>
-  %c = icmp ugt <2 x i32> %b, <i32 -5, i32 -5>
-  ret <2 x i1> %c
-}
-
-define i1 @test3(i32 %a) {
-; CHECK-LABEL: @test3(
-; CHECK-NEXT:    [[C:%.*]] = icmp sgt i32 %a, 2147483643
-; CHECK-NEXT:    ret i1 [[C]]
-;
-  %b = add i32 %a, 4
-  %c = icmp slt i32 %b, 2147483652
-  ret i1 %c
-}
-
-define <2 x i1> @test3vec(<2 x i32> %a) {
-; CHECK-LABEL: @test3vec(
-; CHECK-NEXT:    [[C:%.*]] = icmp sgt <2 x i32> %a, <i32 2147483643, i32 2147483643>
-; CHECK-NEXT:    ret <2 x i1> [[C]]
-;
-  %b = add <2 x i32> %a, <i32 4, i32 4>
-  %c = icmp slt <2 x i32> %b, <i32 2147483652, i32 2147483652>
-  ret <2 x i1> %c
-}
-
-define i1 @test4(i32 %a) {
-; CHECK-LABEL: @test4(
-; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 %a, -4
-; CHECK-NEXT:    ret i1 [[C]]
-;
-  %b = add i32 %a, 2147483652
-  %c = icmp sge i32 %b, 4
-  ret i1 %c
-}
-
-define <2 x i1> @test4vec(<2 x i32> %a) {
-; CHECK-LABEL: @test4vec(
-; CHECK-NEXT:    [[C:%.*]] = icmp slt <2 x i32> %a, <i32 -4, i32 -4>
-; CHECK-NEXT:    ret <2 x i1> [[C]]
-;
-  %b = add <2 x i32> %a, <i32 2147483652, i32 2147483652>
-  %c = icmp sge <2 x i32> %b, <i32 4, i32 4>
-  ret <2 x i1> %c
-}
-
diff --git a/test/Transforms/InstCombine/2008-05-22-NegValVector.ll b/test/Transforms/InstCombine/2008-05-22-NegValVector.ll
index bf92faf2fec58d67127c16e3f21a036790240f27..58259be8bc9236383cebb27f589e242bb4d7dbe5 100644
--- a/test/Transforms/InstCombine/2008-05-22-NegValVector.ll
+++ b/test/Transforms/InstCombine/2008-05-22-NegValVector.ll
@@ -6,3 +6,9 @@ define <3 x i8> @f(<3 x i8> %a) {
   ret <3 x i8> %B
 }
 
+define <3 x i4> @g(<3 x i4> %a) {
+  %A = sub <3 x i4> zeroinitializer, %a
+  %B = mul <3 x i4> %A, <i4 5, i4 5, i4 5>
+  ret <3 x i4> %B
+}
+
diff --git a/test/Transforms/InstCombine/2008-11-20-DivMulRem.ll b/test/Transforms/InstCombine/2008-11-20-DivMulRem.ll
deleted file mode 100644
index 0c0e55a0b2d9a80ff643ad1321d85f1f87b87c33..0000000000000000000000000000000000000000
--- a/test/Transforms/InstCombine/2008-11-20-DivMulRem.ll
+++ /dev/null
@@ -1,67 +0,0 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
-; PR3103
-
-define i8 @test1(i8 %x, i8 %y) {
-; CHECK-LABEL: @test1(
-  %A = udiv i8 %x, %y
-; CHECK-NEXT: urem
-  %B = mul i8 %A, %y
-  %C = sub i8 %x, %B
-  ret i8 %C
-; CHECK-NEXT: ret
-}
-
-define i8 @test2(i8 %x, i8 %y) {
-; CHECK-LABEL: @test2(
-  %A = sdiv i8 %x, %y
-; CHECK-NEXT: srem
-  %B = mul i8 %A, %y
-  %C = sub i8 %x, %B
-  ret i8 %C
-; CHECK-NEXT: ret
-}
-
-define i8 @test3(i8 %x, i8 %y) {
-; CHECK-LABEL: @test3(
-  %A = udiv i8 %x, %y
-; CHECK-NEXT: urem
-  %B = mul i8 %A, %y
-  %C = sub i8 %B, %x
-; CHECK-NEXT: sub
-  ret i8 %C
-; CHECK-NEXT: ret
-}
-
-define i8 @test4(i8 %x) {
-; CHECK-LABEL: @test4(
-  %A = udiv i8 %x, 3
-; CHECK-NEXT: urem
-  %B = mul i8 %A, -3
-; CHECK-NEXT: sub
-  %C = sub i8 %x, %B
-; CHECK-NEXT: add
-  ret i8 %C
-; CHECK-NEXT: ret
-}
-
-define i32 @test5(i32 %x, i32 %y) {
-; CHECK-LABEL: @test5(
-; (((X / Y) * Y) / Y) -> X / Y
-  %div = sdiv i32 %x, %y
-; CHECK-NEXT: sdiv
-  %mul = mul i32 %div, %y
-  %r = sdiv i32 %mul, %y
-  ret i32 %r
-; CHECK-NEXT: ret
-}
-
-define i32 @test6(i32 %x, i32 %y) {
-; CHECK-LABEL: @test6(
-; (((X / Y) * Y) / Y) -> X / Y
-  %div = udiv i32 %x, %y
-; CHECK-NEXT: udiv
-  %mul = mul i32 %div, %y
-  %r = udiv i32 %mul, %y
-  ret i32 %r
-; CHECK-NEXT: ret
-}
diff --git a/test/Transforms/InstCombine/2012-07-25-LoadPart.ll b/test/Transforms/InstCombine/2012-07-25-LoadPart.ll
index 14fcf52fe9a78566c089424ab55a067ccab64294..71255ebbf81ff6246a448d7670306d04fdbebf15 100644
--- a/test/Transforms/InstCombine/2012-07-25-LoadPart.ll
+++ b/test/Transforms/InstCombine/2012-07-25-LoadPart.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -default-data-layout="e-p:32:32:32" -instcombine -S | FileCheck %s --check-prefix=LE
-; RUN: opt < %s -default-data-layout="E-p:32:32:32" -instcombine -S | FileCheck %s --check-prefix=BE
+; RUN: opt < %s -data-layout="e-p:32:32:32" -instcombine -S | FileCheck %s --check-prefix=LE
+; RUN: opt < %s -data-layout="E-p:32:32:32" -instcombine -S | FileCheck %s --check-prefix=BE
 ; PR13442
 
 @test = constant [4 x i32] [i32 1, i32 2, i32 3, i32 4]
diff --git a/test/Transforms/InstCombine/add-sitofp.ll b/test/Transforms/InstCombine/add-sitofp.ll
index 3b5485e005284ee0684ee8aad03e767cbf4544a1..2abfa436f6d33a9fbee0b4e1afecf88a506003f8 100644
--- a/test/Transforms/InstCombine/add-sitofp.ll
+++ b/test/Transforms/InstCombine/add-sitofp.ll
@@ -1,6 +1,14 @@
-; RUN: opt < %s -instcombine -S | grep "add nuw nsw i32"
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
 
-define double @x(i32 %a, i32 %b) nounwind {
+define double @x(i32 %a, i32 %b) {
+; CHECK-LABEL: @x(
+; CHECK-NEXT:    [[M:%.*]] = lshr i32 [[A:%.*]], 24
+; CHECK-NEXT:    [[N:%.*]] = and i32 [[M]], [[B:%.*]]
+; CHECK-NEXT:    [[ADDCONV:%.*]] = add nuw nsw i32 [[N]], 1
+; CHECK-NEXT:    [[P:%.*]] = sitofp i32 [[ADDCONV]] to double
+; CHECK-NEXT:    ret double [[P]]
+;
   %m = lshr i32 %a, 24
   %n = and i32 %m, %b
   %o = sitofp i32 %n to double
diff --git a/test/Transforms/InstCombine/add.ll b/test/Transforms/InstCombine/add.ll
index 39a746ab310b138f7eccc2b4e047e246855d5517..648305d134cd385943bf88e337e67cfb511b83fc 100644
--- a/test/Transforms/InstCombine/add.ll
+++ b/test/Transforms/InstCombine/add.ll
@@ -1,6 +1,32 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
+; TODO: This should be canonicalized to either a select or xor+zext.
+
+define i32 @select_0_or_1_from_bool(i1 %x) {
+; CHECK-LABEL: @select_0_or_1_from_bool(
+; CHECK-NEXT:    [[EXT:%.*]] = sext i1 %x to i32
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[EXT]], 1
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+  %ext = sext i1 %x to i32
+  %add = add i32 %ext, 1
+  ret i32 %add
+}
+
+; TODO: This should be canonicalized to either a select or xor+zext.
+
+define <2 x i32> @select_0_or_1_from_bool_vec(<2 x i1> %x) {
+; CHECK-LABEL: @select_0_or_1_from_bool_vec(
+; CHECK-NEXT:    [[EXT:%.*]] = sext <2 x i1> %x to <2 x i32>
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw <2 x i32> [[EXT]], <i32 1, i32 1>
+; CHECK-NEXT:    ret <2 x i32> [[ADD]]
+;
+  %ext = sext <2 x i1> %x to <2 x i32>
+  %add = add <2 x i32> %ext, <i32 1, i32 1>
+  ret <2 x i32> %add
+}
+
 define i32 @test1(i32 %A) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:    ret i32 %A
@@ -100,7 +126,7 @@ define i32 @test9(i32 %A) {
 define i1 @test10(i8 %A, i8 %b) {
 ; CHECK-LABEL: @test10(
 ; CHECK-NEXT:    [[B:%.*]] = sub i8 0, %b
-; CHECK-NEXT:    [[C:%.*]] = icmp ne i8 %A, [[B]]
+; CHECK-NEXT:    [[C:%.*]] = icmp ne i8 [[B]], %A
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %B = add i8 %A, %b
@@ -112,7 +138,7 @@ define i1 @test10(i8 %A, i8 %b) {
 define <2 x i1> @test10vec(<2 x i8> %a, <2 x i8> %b) {
 ; CHECK-LABEL: @test10vec(
 ; CHECK-NEXT:    [[C:%.*]] = sub <2 x i8> zeroinitializer, %b
-; CHECK-NEXT:    [[D:%.*]] = icmp ne <2 x i8> %a, [[C]]
+; CHECK-NEXT:    [[D:%.*]] = icmp ne <2 x i8> [[C]], %a
 ; CHECK-NEXT:    ret <2 x i1> [[D]]
 ;
   %c = add <2 x i8> %a, %b
@@ -244,14 +270,59 @@ define i32 @test19(i1 %C) {
   ret i32 %V
 }
 
+define <2 x i32> @test19vec(i1 %C) {
+; CHECK-LABEL: @test19vec(
+; CHECK-NEXT:    [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 1123, i32 1123>, <2 x i32> <i32 133, i32 133>
+; CHECK-NEXT:    ret <2 x i32> [[V]]
+;
+  %A = select i1 %C, <2 x i32> <i32 1000, i32 1000>, <2 x i32> <i32 10, i32 10>
+  %V = add <2 x i32> %A, <i32 123, i32 123>
+  ret <2 x i32> %V
+}
+
+; This is an InstSimplify fold, but test it here to make sure that
+; InstCombine does not prevent the fold.
+; With NSW, add of sign bit -> or of sign bit.
+
 define i32 @test20(i32 %x) {
 ; CHECK-LABEL: @test20(
 ; CHECK-NEXT:    ret i32 %x
 ;
-  %tmp.2 = xor i32 %x, -2147483648
-  ;; Add of sign bit -> xor of sign bit.
-  %tmp.4 = add i32 %tmp.2, -2147483648
-  ret i32 %tmp.4
+  %y = xor i32 %x, -2147483648
+  %z = add nsw i32 %y, -2147483648
+  ret i32 %z
+}
+
+define i32 @xor_sign_bit(i32 %x) {
+; CHECK-LABEL: @xor_sign_bit(
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 %x, -2147483606
+; CHECK-NEXT:    ret i32 [[ADD]]
+;
+  %xor = xor i32 %x, 2147483648
+  %add = add i32 %xor, 42
+  ret i32 %add
+}
+
+; No-wrap info allows converting the add to 'or'.
+
+define i8 @add_nsw_signbit(i8 %x) {
+; CHECK-LABEL: @add_nsw_signbit(
+; CHECK-NEXT:    [[Y:%.*]] = or i8 %x, -128
+; CHECK-NEXT:    ret i8 [[Y]]
+;
+  %y = add nsw i8 %x, -128
+  ret i8 %y
+}
+
+; No-wrap info allows converting the add to 'or'.
+
+define i8 @add_nuw_signbit(i8 %x) {
+; CHECK-LABEL: @add_nuw_signbit(
+; CHECK-NEXT:    [[Y:%.*]] = or i8 %x, -128
+; CHECK-NEXT:    ret i8 [[Y]]
+;
+  %y = add nuw i8 %x, 128
+  ret i8 %y
 }
 
 define i1 @test21(i32 %x) {
@@ -519,3 +590,99 @@ define i64 @test41(i32 %a) {
   %sub = add i64 %zext, -1
   ret i64 %sub
 }
+
+define i32 @test42(i1 %C) {
+; CHECK-LABEL: @test42(
+; CHECK-NEXT:    [[V:%.*]] = select i1 [[C:%.*]], i32 1123, i32 133
+; CHECK-NEXT:    ret i32 [[V]]
+;
+  %A = select i1 %C, i32 1000, i32 10
+  %V = add i32 123, %A
+  ret i32 %V
+}
+
+define <2 x i32> @test42vec(i1 %C) {
+; CHECK-LABEL: @test42vec(
+; CHECK-NEXT:    [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 1123, i32 1123>, <2 x i32> <i32 133, i32 133>
+; CHECK-NEXT:    ret <2 x i32> [[V]]
+;
+  %A = select i1 %C, <2 x i32> <i32 1000, i32 1000>, <2 x i32> <i32 10, i32 10>
+  %V = add <2 x i32> <i32 123, i32 123>, %A
+  ret <2 x i32> %V
+}
+
+define <2 x i32> @test42vec2(i1 %C) {
+; CHECK-LABEL: @test42vec2(
+; CHECK-NEXT:    [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 1123, i32 2833>, <2 x i32> <i32 133, i32 363>
+; CHECK-NEXT:    ret <2 x i32> [[V]]
+;
+  %A = select i1 %C, <2 x i32> <i32 1000, i32 2500>, <2 x i32> <i32 10, i32 30>
+  %V = add <2 x i32> <i32 123, i32 333>, %A
+  ret <2 x i32> %V
+}
+
+define i32 @test55(i1 %which) {
+; CHECK-LABEL: @test55(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]]
+; CHECK:       delay:
+; CHECK-NEXT:    br label [[FINAL]]
+; CHECK:       final:
+; CHECK-NEXT:    [[A:%.*]] = phi i32 [ 1123, [[ENTRY:%.*]] ], [ 133, [[DELAY]] ]
+; CHECK-NEXT:    ret i32 [[A]]
+;
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+  br label %final
+
+final:
+  %A = phi i32 [ 1000, %entry ], [ 10, %delay ]
+  %value = add i32 123, %A
+  ret i32 %value
+}
+
+define <2 x i32> @test43vec(i1 %which) {
+; CHECK-LABEL: @test43vec(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]]
+; CHECK:       delay:
+; CHECK-NEXT:    br label [[FINAL]]
+; CHECK:       final:
+; CHECK-NEXT:    [[A:%.*]] = phi <2 x i32> [ <i32 1123, i32 1123>, [[ENTRY:%.*]] ], [ <i32 133, i32 133>, [[DELAY]] ]
+; CHECK-NEXT:    ret <2 x i32> [[A]]
+;
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+  br label %final
+
+final:
+  %A = phi <2 x i32> [ <i32 1000, i32 1000>, %entry ], [ <i32 10, i32 10>, %delay ]
+  %value = add <2 x i32> <i32 123, i32 123>, %A
+  ret <2 x i32> %value
+}
+
+define <2 x i32> @test43vec2(i1 %which) {
+; CHECK-LABEL: @test43vec2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]]
+; CHECK:       delay:
+; CHECK-NEXT:    br label [[FINAL]]
+; CHECK:       final:
+; CHECK-NEXT:    [[A:%.*]] = phi <2 x i32> [ <i32 1123, i32 2833>, [[ENTRY:%.*]] ], [ <i32 133, i32 363>, [[DELAY]] ]
+; CHECK-NEXT:    ret <2 x i32> [[A]]
+;
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+  br label %final
+
+final:
+  %A = phi <2 x i32> [ <i32 1000, i32 2500>, %entry ], [ <i32 10, i32 30>, %delay ]
+  %value = add <2 x i32> <i32 123, i32 333>, %A
+  ret <2 x i32> %value
+}
diff --git a/test/Transforms/InstCombine/alloca.ll b/test/Transforms/InstCombine/alloca.ll
index 2ee0372e5e0afc2a3d547a1e221c20ba5a5f5912..f81f700e6cf42ec8315a5d2f207398d414726b07 100644
--- a/test/Transforms/InstCombine/alloca.ll
+++ b/test/Transforms/InstCombine/alloca.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -instcombine -S -default-data-layout="E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" | FileCheck %s -check-prefix=CHECK -check-prefix=ALL
-; RUN: opt < %s -instcombine -S -default-data-layout="E-p:32:32:32-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" | FileCheck %s -check-prefix=P32 -check-prefix=ALL
+; RUN: opt < %s -instcombine -S -data-layout="E-p:64:64:64-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" | FileCheck %s -check-prefix=CHECK -check-prefix=ALL
+; RUN: opt < %s -instcombine -S -data-layout="E-p:32:32:32-a0:0:8-f32:32:32-f64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-v64:64:64-v128:128:128" | FileCheck %s -check-prefix=P32 -check-prefix=ALL
 ; RUN: opt < %s -instcombine -S | FileCheck %s -check-prefix=NODL -check-prefix=ALL
 
 
diff --git a/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll b/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll
new file mode 100644
index 0000000000000000000000000000000000000000..888f51bf939dd10d86e36311fb3e96f268fcb9a8
--- /dev/null
+++ b/test/Transforms/InstCombine/amdgcn-demanded-vector-elts.ll
@@ -0,0 +1,322 @@
+; RUN: opt -S -instcombine %s | FileCheck %s
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.buffer.load
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @buffer_load_f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @buffer_load_f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  ret float %data
+}
+
+; CHECK-LABEL: @buffer_load_v1f32(
+; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret <1 x float> %data
+define amdgpu_ps <1 x float> @buffer_load_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  ret <1 x float> %data
+}
+
+; CHECK-LABEL: @buffer_load_v2f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  ret <2 x float> %data
+}
+
+; CHECK-LABEL: @buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret <4 x float> %data
+define amdgpu_ps <4 x float> @buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  ret <4 x float> %data
+}
+
+; CHECK-LABEL: @extract_elt0_buffer_load_v2f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt0 = extractelement <2 x float> %data, i32 0 ; only lane 0 is used, so the checks expect a scalar f32 load
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_buffer_load_v2f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %elt1 = extractelement <2 x float> %data, i32 1
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt1_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt1 = extractelement <2 x float> %data, i32 1
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt0_buffer_load_v4f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 1
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt1 = extractelement <4 x float> %data, i32 1
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt2_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %elt2 = extractelement <4 x float> %data, i32 2
+; CHECK-NEXT: ret float %elt2
+define amdgpu_ps float @extract_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt2 = extractelement <4 x float> %data, i32 2 ; lane 2 only: the checks expect the v4f32 load to be kept
+  ret float %elt2
+}
+
+; CHECK-LABEL: @extract_elt3_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %elt3 = extractelement <4 x float> %data, i32 3
+; CHECK-NEXT: ret float %elt3
+define amdgpu_ps float @extract_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt3 = extractelement <4 x float> %data, i32 3 ; lane 3 only: the checks expect the v4f32 load to be kept
+  ret float %elt3
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret <2 x float>
+define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: ret <2 x float> %shuf
+define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 1, i32 2>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt2_elt3_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT: ret <2 x float> %shuf
+define amdgpu_ps <2 x float> @extract_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 2, i32 3>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_elt2_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+; CHECK-NEXT: ret <3 x float> %shuf
+define amdgpu_ps <3 x float> @extract_elt0_elt1_elt2_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 1, i32 2>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt1_elt2_elt3_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+; CHECK-NEXT: ret <3 x float> %shuf
+define amdgpu_ps <3 x float> @extract_elt1_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 1, i32 2, i32 3>
+  ret <3 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt2_elt3_buffer_load_v4f32(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+; CHECK-NEXT: ret <3 x float> %shuf
+define amdgpu_ps <3 x float> @extract_elt0_elt2_elt3_buffer_load_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <3 x i32> <i32 0, i32 2, i32 3>
+  ret <3 x float> %shuf
+}
+
+; FIXME: Not handled even though only 2 elts used
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v4f32_2(
+; CHECK-NEXT: %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %elt0 = extractelement <4 x float> %data, i32 0
+; CHECK-NEXT: %elt1 = extractelement <4 x float> %data, i32 1
+; CHECK-NEXT: %ins0 = insertvalue { float, float } undef, float %elt0, 0
+; CHECK-NEXT: %ins1 = insertvalue { float, float } %ins0, float %elt1, 1
+; CHECK-NEXT: ret { float, float } %ins1
+define amdgpu_ps { float, float } @extract_elt0_elt1_buffer_load_v4f32_2(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt0 = extractelement <4 x float> %data, i32 0
+  %elt1 = extractelement <4 x float> %data, i32 1
+  %ins0 = insertvalue { float, float } undef, float %elt0, 0
+  %ins1 = insertvalue { float, float } %ins0, float %elt1, 1
+  ret { float, float } %ins1
+}
+
+; CHECK-LABEL: @extract_elt0_buffer_load_v3f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt0 = extractelement <3 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt1_buffer_load_v3f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %elt1 = extractelement <3 x float> %data, i32 1
+; CHECK-NEXT: ret float %elt1
+define amdgpu_ps float @extract_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt1 = extractelement <3 x float> %data, i32 1
+  ret float %elt1
+}
+
+; CHECK-LABEL: @extract_elt2_buffer_load_v3f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %elt2 = extractelement <3 x float> %data, i32 2
+; CHECK-NEXT: ret float %elt2
+define amdgpu_ps float @extract_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %elt2 = extractelement <3 x float> %data, i32 2 ; lane 2 only: the checks expect the v3f32 load to be kept
+  ret float %elt2
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_v3f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret <2 x float>
+define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt1_elt2_buffer_load_v3f32(
+; CHECK-NEXT: %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+; CHECK-NEXT: ret <2 x float> %shuf
+define amdgpu_ps <2 x float> @extract_elt1_elt2_buffer_load_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 1, i32 2>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @preserve_metadata_extract_elt0_buffer_load_v2f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @preserve_metadata_extract_elt0_buffer_load_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false), !fpmath !0
+  %elt0 = extractelement <2 x float> %data, i32 0
+  ret float %elt0
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.buffer.load.format
+; --------------------------------------------------------------------
+
+; CHECK-LABEL: @buffer_load_format_v1f32(
+; CHECK-NEXT: %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true)
+; CHECK-NEXT: ret <1 x float> %data
+define amdgpu_ps <1 x float> @buffer_load_format_v1f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 true)
+  ret <1 x float> %data
+}
+
+; CHECK-LABEL: @extract_elt0_buffer_load_format_v2f32(
+; CHECK-NEXT: %data = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false)
+; CHECK-NEXT: ret float %data
+define amdgpu_ps float @extract_elt0_buffer_load_format_v2f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 true, i1 false)
+  %elt0 = extractelement <2 x float> %data, i32 0
+  ret float %elt0
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v3f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v3f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <3 x float> %data, <3 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; CHECK-LABEL: @extract_elt0_elt1_buffer_load_format_v4f32(
+; CHECK-NEXT: %data = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+; CHECK-NEXT: ret <2 x float> %data
+define amdgpu_ps <2 x float> @extract_elt0_elt1_buffer_load_format_v4f32(<4 x i32> inreg %rsrc, i32 %idx, i32 %ofs) #0 {
+  %data = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> %rsrc, i32 %idx, i32 %ofs, i1 false, i1 false)
+  %shuf = shufflevector <4 x float> %data, <4 x float> undef, <2 x i32> <i32 0, i32 1>
+  ret <2 x float> %shuf
+}
+
+; The initial insertion point is at the extractelement
+; CHECK-LABEL: @extract01_bitcast_buffer_load_format_v4f32(
+; CHECK-NEXT: %tmp = call <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false)
+; CHECK-NEXT: %1 = shufflevector <2 x float> %tmp, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; CHECK-NEXT: %tmp1 = bitcast <4 x float> %1 to <2 x double>
+; CHECK-NEXT: %tmp2 = extractelement <2 x double> %tmp1, i32 0
+; CHECK-NEXT: ret double %tmp2
+define double @extract01_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
+  %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false)
+  %tmp1 = bitcast <4 x float> %tmp to <2 x double> ; double lane 0 overlays float lanes 0 and 1
+  %tmp2 = extractelement <2 x double> %tmp1, i32 0
+  ret double %tmp2
+}
+
+; CHECK-LABEL: @extract0_bitcast_buffer_load_format_v4f32(
+; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false)
+; CHECK-NEXT: %tmp2 = bitcast float %tmp to i32
+; CHECK-NEXT: ret i32 %tmp2
+define i32 @extract0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
+  %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false)
+  %tmp1 = bitcast <4 x float> %tmp to <4 x i32> ; same lane width, so i32 lane 0 maps to float lane 0
+  %tmp2 = extractelement <4 x i32> %tmp1, i32 0
+  ret i32 %tmp2
+}
+
+; CHECK-LABEL: @extract_lo16_0_bitcast_buffer_load_format_v4f32(
+; CHECK-NEXT: %tmp = call float @llvm.amdgcn.buffer.load.format.f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false)
+; CHECK-NEXT: %1 = insertelement <4 x float> undef, float %tmp, i64 0
+; CHECK-NEXT: %tmp1 = bitcast <4 x float> %1 to <8 x i16>
+; CHECK-NEXT: %tmp2 = extractelement <8 x i16> %tmp1, i32 0
+; CHECK-NEXT: ret i16 %tmp2
+define i16 @extract_lo16_0_bitcast_buffer_load_format_v4f32(i32 %arg) #0 {
+  %tmp = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %arg, i32 16, i1 false, i1 false)
+  %tmp1 = bitcast <4 x float> %tmp to <8 x i16> ; i16 lane 0 lies within float lane 0
+  %tmp2 = extractelement <8 x i16> %tmp1, i32 0
+  ret i16 %tmp2
+}
+
+declare float @llvm.amdgcn.buffer.load.f32(<4 x i32>, i32, i32, i1, i1) #1
+declare <1 x float> @llvm.amdgcn.buffer.load.v1f32(<4 x i32>, i32, i32, i1, i1) #1
+declare <2 x float> @llvm.amdgcn.buffer.load.v2f32(<4 x i32>, i32, i32, i1, i1) #1
+declare <3 x float> @llvm.amdgcn.buffer.load.v3f32(<4 x i32>, i32, i32, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.buffer.load.v4f32(<4 x i32>, i32, i32, i1, i1) #1
+
+declare float @llvm.amdgcn.buffer.load.format.f32(<4 x i32>, i32, i32, i1, i1) #1
+declare <1 x float> @llvm.amdgcn.buffer.load.format.v1f32(<4 x i32>, i32, i32, i1, i1) #1
+declare <2 x float> @llvm.amdgcn.buffer.load.format.v2f32(<4 x i32>, i32, i32, i1, i1) #1
+declare <3 x float> @llvm.amdgcn.buffer.load.format.v3f32(<4 x i32>, i32, i32, i1, i1) #1
+declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #1
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readonly }
+
+!0 = !{float 2.500000e+00}
diff --git a/test/Transforms/InstCombine/amdgcn-intrinsics.ll b/test/Transforms/InstCombine/amdgcn-intrinsics.ll
index a228968f25bced0f43adc4d81c569e3e8ef2e8f6..deae5502bcdb8b65b4fec489cae70ace1ce4e249 100644
--- a/test/Transforms/InstCombine/amdgcn-intrinsics.ll
+++ b/test/Transforms/InstCombine/amdgcn-intrinsics.ll
@@ -7,6 +7,12 @@
 declare float @llvm.amdgcn.rcp.f32(float) nounwind readnone
 declare double @llvm.amdgcn.rcp.f64(double) nounwind readnone
 
+; CHECK-LABEL: @test_constant_fold_rcp_f32_undef
+; CHECK-NEXT: ret float undef
+define float @test_constant_fold_rcp_f32_undef() nounwind {
+  %val = call float @llvm.amdgcn.rcp.f32(float undef) nounwind readnone
+  ret float %val
+}
 
 ; CHECK-LABEL: @test_constant_fold_rcp_f32_1
 ; CHECK-NEXT: ret float 1.000000e+00
@@ -50,6 +56,18 @@ define double @test_constant_fold_rcp_f64_43() nounwind {
   ret double %val
 }
 
+; --------------------------------------------------------------------
+; llvm.amdgcn.rsq
+; --------------------------------------------------------------------
+
+declare float @llvm.amdgcn.rsq.f32(float) nounwind readnone
+
+; CHECK-LABEL: @test_constant_fold_rsq_f32_undef
+; CHECK-NEXT: ret float undef
+define float @test_constant_fold_rsq_f32_undef() nounwind {
+  %val = call float @llvm.amdgcn.rsq.f32(float undef) nounwind readnone
+  ret float %val
+}
 
 ; --------------------------------------------------------------------
 ; llvm.amdgcn.frexp.mant
@@ -633,3 +651,888 @@ define float @cos_fabs_fneg_f32(float %x) {
   %cos = call float @llvm.amdgcn.cos.f32(float %x.fabs.fneg)
   ret float %cos
 }
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.cvt.pkrtz
+; --------------------------------------------------------------------
+
+declare <2 x half> @llvm.amdgcn.cvt.pkrtz(float, float) nounwind readnone
+
+; CHECK-LABEL: @vars_lhs_cvt_pkrtz(
+; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
+define <2 x half> @vars_lhs_cvt_pkrtz(float %x, float %y) {
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y)
+  ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @constant_lhs_cvt_pkrtz(
+; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.000000e+00, float %y)
+define <2 x half> @constant_lhs_cvt_pkrtz(float %y) {
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float %y)
+  ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @constant_rhs_cvt_pkrtz(
+; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float 0.000000e+00)
+define <2 x half> @constant_rhs_cvt_pkrtz(float %x) {
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float 0.0)
+  ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @undef_lhs_cvt_pkrtz(
+; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %y)
+define <2 x half> @undef_lhs_cvt_pkrtz(float %y) {
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float %y)
+  ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @undef_rhs_cvt_pkrtz(
+; CHECK: %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float undef)
+define <2 x half> @undef_rhs_cvt_pkrtz(float %x) {
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float undef)
+  ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @undef_cvt_pkrtz(
+; CHECK: ret <2 x half> undef
+define <2 x half> @undef_cvt_pkrtz() {
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float undef, float undef)
+  ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @constant_splat0_cvt_pkrtz(
+; CHECK: ret <2 x half> zeroinitializer
+define <2 x half> @constant_splat0_cvt_pkrtz() {
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 0.0, float 0.0)
+  ret <2 x half> %cvt
+}
+
+; CHECK-LABEL: @constant_cvt_pkrtz(
+; CHECK: ret <2 x half> <half 0xH4000, half 0xH4400>
+define <2 x half> @constant_cvt_pkrtz() {
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 2.0, float 4.0)
+  ret <2 x half> %cvt
+}
+
+; Test constant values where rtz changes the result: 65535.0 is expected to truncate to 0xH7BFF (the largest finite half) rather than round up.
+; CHECK-LABEL: @constant_rtz_pkrtz(
+; CHECK: ret <2 x half> <half 0xH7BFF, half 0xH7BFF>
+define <2 x half> @constant_rtz_pkrtz() {
+  %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float 65535.0, float 65535.0)
+  ret <2 x half> %cvt
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.ubfe
+; --------------------------------------------------------------------
+
+declare i32 @llvm.amdgcn.ubfe.i32(i32, i32, i32) nounwind readnone
+declare i64 @llvm.amdgcn.ubfe.i64(i64, i32, i32) nounwind readnone
+
+; CHECK-LABEL: @ubfe_var_i32(
+; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 %width)
+define i32 @ubfe_var_i32(i32 %src, i32 %offset, i32 %width) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 %width)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_clear_high_bits_constant_offset_i32(
+; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 5, i32 %width)
+define i32 @ubfe_clear_high_bits_constant_offset_i32(i32 %src, i32 %width) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 133, i32 %width) ; constant offset 133 is expected to be reduced to 5 (133 & 31); src and width stay untouched
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_clear_high_bits_constant_width_i32(
+; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 5)
+define i32 @ubfe_clear_high_bits_constant_width_i32(i32 %src, i32 %offset) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 133) ; constant width 133 is expected to be reduced to 5 (133 & 31); src and offset stay untouched
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_width_0(
+; CHECK-NEXT: ret i32 0
+define i32 @ubfe_width_0(i32 %src, i32 %offset) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 0)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_width_31(
+; CHECK: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 31)
+define i32 @ubfe_width_31(i32 %src, i32 %offset) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 31)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_width_32(
+; CHECK-NEXT: ret i32 0
+define i32 @ubfe_width_32(i32 %src, i32 %offset) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 32)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_width_33(
+; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 1)
+define i32 @ubfe_width_33(i32 %src, i32 %offset) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 33)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_33(
+; CHECK-NEXT: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 1, i32 %width)
+define i32 @ubfe_offset_33(i32 %src, i32 %width) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 33, i32 %width)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_0(
+; CHECK-NEXT: %1 = sub i32 32, %width
+; CHECK-NEXT: %2 = shl i32 %src, %1
+; CHECK-NEXT: %bfe = lshr i32 %2, %1
+; CHECK-NEXT: ret i32 %bfe
+define i32 @ubfe_offset_0(i32 %src, i32 %width) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 %width)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_32(
+; CHECK-NEXT: %1 = sub i32 32, %width
+; CHECK-NEXT: %2 = shl i32 %src, %1
+; CHECK-NEXT: %bfe = lshr i32 %2, %1
+; CHECK-NEXT: ret i32 %bfe
+define i32 @ubfe_offset_32(i32 %src, i32 %width) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 32, i32 %width)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_31(
+; CHECK-NEXT: %1 = sub i32 32, %width
+; CHECK-NEXT: %2 = shl i32 %src, %1
+; CHECK-NEXT: %bfe = lshr i32 %2, %1
+; CHECK-NEXT: ret i32 %bfe
+define i32 @ubfe_offset_31(i32 %src, i32 %width) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 32, i32 %width) ; NOTE(review): passes offset 32, not 31 as the name suggests, so this duplicates @ubfe_offset_32 -- confirm intent
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_0_width_0(
+; CHECK-NEXT: ret i32 0
+define i32 @ubfe_offset_0_width_0(i32 %src) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 0)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_0_width_3(
+; CHECK-NEXT: and i32 %src, 7
+; CHECK-NEXT: ret
+define i32 @ubfe_offset_0_width_3(i32 %src) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 0, i32 3)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_3_width_1(
+; CHECK-NEXT: %1 = lshr i32 %src, 3
+; CHECK-NEXT: and i32 %1, 1
+; CHECK-NEXT: ret i32
+define i32 @ubfe_offset_3_width_1(i32 %src) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 3, i32 1)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_3_width_4(
+; CHECK-NEXT: %1 = lshr i32 %src, 3
+; CHECK-NEXT: and i32 %1, 15
+; CHECK-NEXT: ret i32
+define i32 @ubfe_offset_3_width_4(i32 %src) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 3, i32 4)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_0_0_0(
+; CHECK-NEXT: ret i32 0
+define i32 @ubfe_0_0_0() {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 0, i32 0, i32 0)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_neg1_5_7(
+; CHECK-NEXT: ret i32 127
+define i32 @ubfe_neg1_5_7() {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 -1, i32 5, i32 7)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_undef_src_i32(
+; CHECK-NEXT: ret i32 undef
+define i32 @ubfe_undef_src_i32(i32 %offset, i32 %width) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 undef, i32 %offset, i32 %width)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_undef_offset_i32(
+; CHECK: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 undef, i32 %width)
+define i32 @ubfe_undef_offset_i32(i32 %src, i32 %width) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 undef, i32 %width)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_undef_width_i32(
+; CHECK: %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 undef)
+define i32 @ubfe_undef_width_i32(i32 %src, i32 %offset) {
+  %bfe = call i32 @llvm.amdgcn.ubfe.i32(i32 %src, i32 %offset, i32 undef)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_33_width_4_i64(
+; CHECK-NEXT: %1 = lshr i64 %src, 33
+; CHECK-NEXT: %bfe = and i64 %1, 15
+define i64 @ubfe_offset_33_width_4_i64(i64 %src) {
+  %bfe = call i64 @llvm.amdgcn.ubfe.i64(i64 %src, i32 33, i32 4)
+  ret i64 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_0_i64(
+; CHECK-NEXT: %1 = sub i32 64, %width
+; CHECK-NEXT: %2 = zext i32 %1 to i64
+; CHECK-NEXT: %3 = shl i64 %src, %2
+; CHECK-NEXT: %bfe = lshr i64 %3, %2
+; CHECK-NEXT: ret i64 %bfe
+define i64 @ubfe_offset_0_i64(i64 %src, i32 %width) {
+  %bfe = call i64 @llvm.amdgcn.ubfe.i64(i64 %src, i32 0, i32 %width)
+  ret i64 %bfe
+}
+
+; CHECK-LABEL: @ubfe_offset_32_width_32_i64(
+; CHECK-NEXT: %bfe = lshr i64 %src, 32
+; CHECK-NEXT: ret i64 %bfe
+define i64 @ubfe_offset_32_width_32_i64(i64 %src) {
+  %bfe = call i64 @llvm.amdgcn.ubfe.i64(i64 %src, i32 32, i32 32)
+  ret i64 %bfe
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.sbfe
+; --------------------------------------------------------------------
+
+declare i32 @llvm.amdgcn.sbfe.i32(i32, i32, i32) nounwind readnone
+declare i64 @llvm.amdgcn.sbfe.i64(i64, i32, i32) nounwind readnone
+
+; CHECK-LABEL: @sbfe_offset_31(
+; CHECK-NEXT: %1 = sub i32 32, %width
+; CHECK-NEXT: %2 = shl i32 %src, %1
+; CHECK-NEXT: %bfe = ashr i32 %2, %1
+; CHECK-NEXT: ret i32 %bfe
+define i32 @sbfe_offset_31(i32 %src, i32 %width) {
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 32, i32 %width) ; NOTE(review): passes offset 32, not 31 as the name suggests -- confirm intent
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @sbfe_neg1_5_7(
+; CHECK-NEXT: ret i32 -1
+define i32 @sbfe_neg1_5_7() {
+  %bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 -1, i32 5, i32 7)
+  ret i32 %bfe
+}
+
+; CHECK-LABEL: @sbfe_offset_32_width_32_i64(
+; CHECK-NEXT: %bfe = ashr i64 %src, 32
+; CHECK-NEXT: ret i64 %bfe
+define i64 @sbfe_offset_32_width_32_i64(i64 %src) {
+  %bfe = call i64 @llvm.amdgcn.sbfe.i64(i64 %src, i32 32, i32 32)
+  ret i64 %bfe
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.exp
+; --------------------------------------------------------------------
+
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) nounwind inaccessiblememonly
+
+; Make sure there is no crash on invalid variable params
+; CHECK-LABEL: @exp_invalid_inputs(
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 %en, float 1.000000e+00, float 2.000000e+00, float 5.000000e-01, float 4.000000e+00, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 %tgt, i32 15, float 1.000000e+00, float 2.000000e+00, float 5.000000e-01, float 4.000000e+00, i1 true, i1 false)
+define void @exp_invalid_inputs(i32 %tgt, i32 %en) {
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 %en, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 %tgt, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @exp_disabled_inputs_to_undef(
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float 1.000000e+00, float undef, float undef, float undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float undef, float 2.000000e+00, float undef, float undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float undef, float undef, float 5.000000e-01, float undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float undef, float undef, float undef, float 4.000000e+00, i1 true, i1 false)
+
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float %x, float undef, float undef, float undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float undef, float %y, float undef, float undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float undef, float undef, float %z, float undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float undef, float undef, float undef, float %w, i1 true, i1 false)
+
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float undef, float undef, float undef, float undef, i1 true, i1 false)
+
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float 1.000000e+00, float 2.000000e+00, float undef, float undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 5, float 1.000000e+00, float undef, float 5.000000e-01, float undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.000000e+00, float undef, float undef, float 4.000000e+00, i1 false, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.000000e+00, float 2.000000e+00, float 5.000000e-01, float 4.000000e+00, i1 false, i1 false)
+define void @exp_disabled_inputs_to_undef(float %x, float %y, float %z, float %w) {
+  ; enable src0..src3 constants
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+
+  ; enable src0..src3 variables
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 1, float %x, float %y, float %z, float %w, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 2, float %x, float %y, float %z, float %w, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 4, float %x, float %y, float %z, float %w, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 8, float %x, float %y, float %z, float %w, i1 true, i1 false)
+
+  ; enable none
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 0, float %x, float %y, float %z, float %w, i1 true, i1 false)
+
+  ; enable different source combinations
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 3, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 5, float 1.0, float 2.0, float 0.5, float 4.0, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 9, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
+  call void @llvm.amdgcn.exp.f32(i32 0, i32 15, float 1.0, float 2.0, float 0.5, float 4.0, i1 false, i1 false)
+
+  ret void
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.exp.compr
+; --------------------------------------------------------------------
+
+declare void @llvm.amdgcn.exp.compr.v2f16(i32, i32, <2 x half>, <2 x half>, i1, i1) nounwind inaccessiblememonly
+
+; CHECK-LABEL: @exp_compr_invalid_inputs(
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 %en, <2 x half> <half 0xH3C00, half 0xH4000>, <2 x half> <half 0xH3800, half 0xH4400>, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 %tgt, i32 5, <2 x half> <half 0xH3C00, half 0xH4000>, <2 x half> <half 0xH3800, half 0xH4400>, i1 true, i1 false)
+define void @exp_compr_invalid_inputs(i32 %tgt, i32 %en) {
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 %en, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 %tgt, i32 5, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @exp_compr_disabled_inputs_to_undef(
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> undef, <2 x half> undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 1, <2 x half> <half 0xH3C00, half 0xH4000>, <2 x half> undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> <half 0xH3C00, half 0xH4000>, <2 x half> undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> <half 0xH3C00, half 0xH4000>, <2 x half> undef, i1 true, i1 false)
+
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> undef, <2 x half> undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 1, <2 x half> %xy, <2 x half> undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> %xy, <2 x half> undef, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> %xy, <2 x half> undef, i1 true, i1 false)
+
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 12, <2 x half> undef, <2 x half> %zw, i1 true, i1 false)
+; CHECK: call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false)
+define void @exp_compr_disabled_inputs_to_undef(<2 x half> %xy, <2 x half> %zw) {
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 1, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> <half 1.0, half 2.0>, <2 x half> <half 0.5, half 4.0>, i1 true, i1 false)
+
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 0, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 1, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 2, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 3, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false)
+
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 12, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false)
+  call void @llvm.amdgcn.exp.compr.v2f16(i32 0, i32 15, <2 x half> %xy, <2 x half> %zw, i1 true, i1 false)
+  ret void
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.fmed3
+; --------------------------------------------------------------------
+
+declare float @llvm.amdgcn.fmed3.f32(float, float, float) nounwind readnone
+
+; CHECK-LABEL: @fmed3_f32(
+; CHECK: %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float %z)
+define float @fmed3_f32(float %x, float %y, float %z) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float %z)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_canonicalize_x_c0_c1_f32(
+; CHECK: call float @llvm.amdgcn.fmed3.f32(float %x, float 0.000000e+00, float 1.000000e+00)
+define float @fmed3_canonicalize_x_c0_c1_f32(float %x) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 0.0, float 1.0)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_canonicalize_c0_x_c1_f32(
+; CHECK: call float @llvm.amdgcn.fmed3.f32(float %x, float 0.000000e+00, float 1.000000e+00)
+define float @fmed3_canonicalize_c0_x_c1_f32(float %x) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.0, float %x, float 1.0)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_canonicalize_c0_c1_x_f32(
+; CHECK: call float @llvm.amdgcn.fmed3.f32(float %x, float 0.000000e+00, float 1.000000e+00)
+define float @fmed3_canonicalize_c0_c1_x_f32(float %x) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.0, float 1.0, float %x)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_canonicalize_x_y_c_f32(
+; CHECK: call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float 1.000000e+00)
+define float @fmed3_canonicalize_x_y_c_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float 1.0)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_canonicalize_x_c_y_f32(
+; CHECK: %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float 1.000000e+00)
+define float @fmed3_canonicalize_x_c_y_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 1.0, float %y)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_canonicalize_c_x_y_f32(
+; CHECK: call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float 1.000000e+00)
+define float @fmed3_canonicalize_c_x_y_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 1.0, float %x, float %y)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_undef_x_y_f32(
+; CHECK: call float @llvm.minnum.f32(float %x, float %y)
+define float @fmed3_undef_x_y_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float undef, float %x, float %y)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_fmf_undef_x_y_f32(
+; CHECK: call nnan float @llvm.minnum.f32(float %x, float %y)
+define float @fmed3_fmf_undef_x_y_f32(float %x, float %y) {
+  %med3 = call nnan float @llvm.amdgcn.fmed3.f32(float undef, float %x, float %y)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_x_undef_y_f32(
+; CHECK: call float @llvm.minnum.f32(float %x, float %y)
+define float @fmed3_x_undef_y_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float undef, float %y)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_x_y_undef_f32(
+; CHECK: call float @llvm.minnum.f32(float %x, float %y)
+define float @fmed3_x_y_undef_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float undef)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_qnan0_x_y_f32(
+; CHECK: call float @llvm.minnum.f32(float %x, float %y)
+define float @fmed3_qnan0_x_y_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8000000000000, float %x, float %y)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_x_qnan0_y_f32(
+; CHECK: call float @llvm.minnum.f32(float %x, float %y)
+define float @fmed3_x_qnan0_y_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 0x7FF8000000000000, float %y)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_x_y_qnan0_f32(
+; CHECK: call float @llvm.minnum.f32(float %x, float %y)
+define float @fmed3_x_y_qnan0_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float %y, float 0x7FF8000000000000)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_qnan1_x_y_f32(
+; CHECK: call float @llvm.minnum.f32(float %x, float %y)
+define float @fmed3_qnan1_x_y_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8000100000000, float %x, float %y)
+  ret float %med3
+}
+
+; This can return any of the qnans.
+; CHECK-LABEL: @fmed3_qnan0_qnan1_qnan2_f32(
+; CHECK: ret float 0x7FF8002000000000
+define float @fmed3_qnan0_qnan1_qnan2_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8000100000000, float 0x7FF8002000000000, float 0x7FF8030000000000)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_constant_src0_0_f32(
+; CHECK: ret float 5.000000e-01
+define float @fmed3_constant_src0_0_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.5, float -1.0, float 4.0)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_constant_src0_1_f32(
+; CHECK: ret float 5.000000e-01
+define float @fmed3_constant_src0_1_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 0.5, float 4.0, float -1.0)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_constant_src1_0_f32(
+; CHECK: ret float 5.000000e-01
+define float @fmed3_constant_src1_0_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float -1.0, float 0.5, float 4.0)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_constant_src1_1_f32(
+; CHECK: ret float 5.000000e-01
+define float @fmed3_constant_src1_1_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 4.0, float 0.5, float -1.0)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_constant_src2_0_f32(
+; CHECK: ret float 5.000000e-01
+define float @fmed3_constant_src2_0_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float -1.0, float 4.0, float 0.5)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_constant_src2_1_f32(
+; CHECK: ret float 5.000000e-01
+define float @fmed3_constant_src2_1_f32(float %x, float %y) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 4.0, float -1.0, float 0.5)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_x_qnan0_qnan1_f32(
+; CHECK: ret float %x
+define float @fmed3_x_qnan0_qnan1_f32(float %x) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float %x, float 0x7FF8001000000000, float 0x7FF8002000000000)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_qnan0_x_qnan1_f32(
+; CHECK: ret float %x
+define float @fmed3_qnan0_x_qnan1_f32(float %x) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8001000000000, float %x, float 0x7FF8002000000000)
+  ret float %med3
+}
+
+; CHECK-LABEL: @fmed3_qnan0_qnan1_x_f32(
+; CHECK: ret float %x
+define float @fmed3_qnan0_qnan1_x_f32(float %x) {
+  %med3 = call float @llvm.amdgcn.fmed3.f32(float 0x7FF8001000000000, float 0x7FF8002000000000, float %x)
+  ret float %med3
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.icmp
+; --------------------------------------------------------------------
+
+declare i64 @llvm.amdgcn.icmp.i32(i32, i32, i32) nounwind readnone convergent
+declare i64 @llvm.amdgcn.icmp.i64(i64, i64, i32) nounwind readnone convergent
+
+; Make sure there's no crash for invalid input
+; CHECK-LABEL: @invalid_nonconstant_icmp_code(
+; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 %c)
+define i64 @invalid_nonconstant_icmp_code(i32 %a, i32 %b, i32 %c) {
+  %result = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 %c)
+  ret i64 %result
+}
+
+; CHECK-LABEL: @invalid_icmp_code(
+; CHECK: %under = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 31)
+; CHECK: %over = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 42)
+define i64 @invalid_icmp_code(i32 %a, i32 %b) {
+  %under = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 31)
+  %over = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 42)
+  %or = or i64 %under, %over
+  ret i64 %or
+}
+
+; CHECK-LABEL: @icmp_constant_inputs_false(
+; CHECK: ret i64 0
+define i64 @icmp_constant_inputs_false() {
+  %result = call i64 @llvm.amdgcn.icmp.i32(i32 9, i32 8, i32 32)
+  ret i64 %result
+}
+
+; CHECK-LABEL: @icmp_constant_inputs_true(
+; CHECK: ret i64 -1
+define i64 @icmp_constant_inputs_true() {
+  %result = call i64 @llvm.amdgcn.icmp.i32(i32 9, i32 8, i32 34)
+  ret i64 %result
+}
+
+; CHECK-LABEL: @icmp_constant_to_rhs_slt(
+; CHECK: %result = call i64 @llvm.amdgcn.icmp.i32(i32 %x, i32 9, i32 38)
+define i64 @icmp_constant_to_rhs_slt(i32 %x) {
+  %result = call i64 @llvm.amdgcn.icmp.i32(i32 9, i32 %x, i32 40) ; expected: the constant moves to the second operand and code 40 becomes 38 (presumably slt -> sgt per CmpInst::Predicate -- confirm)
+  ret i64 %result
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_eq_i32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+define i64 @fold_icmp_ne_0_zext_icmp_eq_i32(i32 %a, i32 %b) {
+  %cmp = icmp eq i32 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ne_i32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 33)
+define i64 @fold_icmp_ne_0_zext_icmp_ne_i32(i32 %a, i32 %b) {
+  %cmp = icmp ne i32 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_sle_i32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 41)
+define i64 @fold_icmp_ne_0_zext_icmp_sle_i32(i32 %a, i32 %b) {
+  %cmp = icmp sle i32 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ugt_i64(
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 34)
+define i64 @fold_icmp_ne_0_zext_icmp_ugt_i64(i64 %a, i64 %b) {
+  %cmp = icmp ugt i64 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_icmp_ult_swap_i64(
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i64(i64 %a, i64 %b, i32 34)
+define i64 @fold_icmp_ne_0_zext_icmp_ult_swap_i64(i64 %a, i64 %b) {
+  %cmp = icmp ugt i64 %a, %b ; NOTE(review): the function name says "ult" but the compare is ugt -- confirm intent
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 0, i32 %zext.cmp, i32 33) ; constant 0 is the first operand here (the "swap" case); sibling tests pass it second
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_oeq_f32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 1)
+define i64 @fold_icmp_ne_0_zext_fcmp_oeq_f32(float %a, float %b) {
+  %cmp = fcmp oeq float %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_une_f32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 14)
+define i64 @fold_icmp_ne_0_zext_fcmp_une_f32(float %a, float %b) {
+  %cmp = fcmp une float %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_ne_0_zext_fcmp_olt_f64(
+; CHECK-NEXT: call i64 @llvm.amdgcn.fcmp.f64(double %a, double %b, i32 4)
+define i64 @fold_icmp_ne_0_zext_fcmp_olt_f64(double %a, double %b) {
+  %cmp = fcmp olt double %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_sext_icmp_ne_0_i32(
+; CHECK: %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+define i64 @fold_icmp_sext_icmp_ne_0_i32(i32 %a, i32 %b) {
+  %cmp = icmp eq i32 %a, %b
+  %sext.cmp = sext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_eq_0_zext_icmp_eq_i32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 33)
+define i64 @fold_icmp_eq_0_zext_icmp_eq_i32(i32 %a, i32 %b) {
+  %cmp = icmp eq i32 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_eq_0_zext_icmp_slt_i32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 39)
+define i64 @fold_icmp_eq_0_zext_icmp_slt_i32(i32 %a, i32 %b) {
+  %cmp = icmp slt i32 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_eq_0_zext_fcmp_oeq_f32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 14)
+define i64 @fold_icmp_eq_0_zext_fcmp_oeq_f32(float %a, float %b) {
+  %cmp = fcmp oeq float %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_eq_0_zext_fcmp_ule_f32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 2)
+define i64 @fold_icmp_eq_0_zext_fcmp_ule_f32(float %a, float %b) {
+  %cmp = fcmp ule float %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_eq_0_zext_fcmp_ogt_f32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 13)
+define i64 @fold_icmp_eq_0_zext_fcmp_ogt_f32(float %a, float %b) {
+  %cmp = fcmp ogt float %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_zext_icmp_eq_1_i32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+define i64 @fold_icmp_zext_icmp_eq_1_i32(i32 %a, i32 %b) {
+  %cmp = icmp eq i32 %a, %b
+  %zext.cmp = zext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 1, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_zext_argi1_eq_1_i32(
+; CHECK: %zext.cond = zext i1 %cond to i32
+; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cond, i32 0, i32 33)
+define i64 @fold_icmp_zext_argi1_eq_1_i32(i1 %cond) {
+  %zext.cond = zext i1 %cond to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cond, i32 1, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_zext_argi1_eq_neg1_i32(
+; CHECK: %zext.cond = zext i1 %cond to i32
+; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cond, i32 -1, i32 32)
+define i64 @fold_icmp_zext_argi1_eq_neg1_i32(i1 %cond) {
+  %zext.cond = zext i1 %cond to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cond, i32 -1, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_sext_argi1_eq_1_i32(
+; CHECK: %sext.cond = sext i1 %cond to i32
+; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cond, i32 1, i32 32)
+define i64 @fold_icmp_sext_argi1_eq_1_i32(i1 %cond) {
+  %sext.cond = sext i1 %cond to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cond, i32 1, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_sext_argi1_eq_neg1_i32(
+; CHECK: %sext.cond = sext i1 %cond to i32
+; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cond, i32 0, i32 33)
+define i64 @fold_icmp_sext_argi1_eq_neg1_i32(i1 %cond) {
+  %sext.cond = sext i1 %cond to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cond, i32 -1, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_sext_argi1_eq_neg1_i64(
+; CHECK: %sext.cond = sext i1 %cond to i64
+; CHECK: call i64 @llvm.amdgcn.icmp.i64(i64 %sext.cond, i64 0, i32 33)
+define i64 @fold_icmp_sext_argi1_eq_neg1_i64(i1 %cond) {
+  %sext.cond = sext i1 %cond to i64
+  %mask = call i64 @llvm.amdgcn.icmp.i64(i64 %sext.cond, i64 -1, i32 32)
+  ret i64 %mask
+}
+
+; TODO: Should be able to fold to false
+; CHECK-LABEL: @fold_icmp_sext_icmp_eq_1_i32(
+; CHECK: %cmp = icmp eq i32 %a, %b
+; CHECK: %sext.cmp = sext i1 %cmp to i32
+; CHECK: %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cmp, i32 1, i32 32)
+define i64 @fold_icmp_sext_icmp_eq_1_i32(i32 %a, i32 %b) {
+  %cmp = icmp eq i32 %a, %b
+  %sext.cmp = sext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cmp, i32 1, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_sext_icmp_eq_neg1_i32(
+; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 32)
+define i64 @fold_icmp_sext_icmp_eq_neg1_i32(i32 %a, i32 %b) {
+  %cmp = icmp eq i32 %a, %b
+  %sext.cmp = sext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cmp, i32 -1, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_icmp_sext_icmp_sge_neg1_i32(
+; CHECK: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 39)
+define i64 @fold_icmp_sext_icmp_sge_neg1_i32(i32 %a, i32 %b) {
+  %cmp = icmp sge i32 %a, %b
+  %sext.cmp = sext i1 %cmp to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %sext.cmp, i32 -1, i32 32)
+  ret i64 %mask
+}
+
+; CHECK-LABEL: @fold_not_icmp_ne_0_zext_icmp_sle_i32(
+; CHECK-NEXT: call i64 @llvm.amdgcn.icmp.i32(i32 %a, i32 %b, i32 38)
+define i64 @fold_not_icmp_ne_0_zext_icmp_sle_i32(i32 %a, i32 %b) {
+  %cmp = icmp sle i32 %a, %b
+  %not = xor i1 %cmp, true
+  %zext.cmp = zext i1 %not to i32
+  %mask = call i64 @llvm.amdgcn.icmp.i32(i32 %zext.cmp, i32 0, i32 33)
+  ret i64 %mask
+}
+
+; --------------------------------------------------------------------
+; llvm.amdgcn.fcmp
+; --------------------------------------------------------------------
+
+declare i64 @llvm.amdgcn.fcmp.f32(float, float, i32) nounwind readnone convergent
+
+; Make sure there's no crash for invalid input
+; CHECK-LABEL: @invalid_nonconstant_fcmp_code(
+; CHECK: call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 %c)
+define i64 @invalid_nonconstant_fcmp_code(float %a, float %b, i32 %c) {
+  %result = call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 %c)
+  ret i64 %result
+}
+
+; CHECK-LABEL: @invalid_fcmp_code(
+; CHECK: %under = call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 -1)
+; CHECK: %over = call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 16)
+define i64 @invalid_fcmp_code(float %a, float %b) {
+  %under = call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 -1)
+  %over = call i64 @llvm.amdgcn.fcmp.f32(float %a, float %b, i32 16)
+  %or = or i64 %under, %over
+  ret i64 %or
+}
+
+; CHECK-LABEL: @fcmp_constant_inputs_false(
+; CHECK: ret i64 0
+define i64 @fcmp_constant_inputs_false() {
+  %result = call i64 @llvm.amdgcn.fcmp.f32(float 2.0, float 4.0, i32 1)
+  ret i64 %result
+}
+
+; CHECK-LABEL: @fcmp_constant_inputs_true(
+; CHECK: ret i64 -1
+define i64 @fcmp_constant_inputs_true() {
+  %result = call i64 @llvm.amdgcn.fcmp.f32(float 2.0, float 4.0, i32 4)
+  ret i64 %result
+}
+
+; CHECK-LABEL: @fcmp_constant_to_rhs_olt(
+; CHECK: %result = call i64 @llvm.amdgcn.fcmp.f32(float %x, float 4.000000e+00, i32 2)
+define i64 @fcmp_constant_to_rhs_olt(float %x) {
+  %result = call i64 @llvm.amdgcn.fcmp.f32(float 4.0, float %x, i32 4)
+  ret i64 %result
+}
diff --git a/test/Transforms/InstCombine/and.ll b/test/Transforms/InstCombine/and.ll
index 8a30de1c07d70ae15422a88edab7efb30bcdc8af..cc2e0bb7b705d064e27f635cc35e2ff70701ac9a 100644
--- a/test/Transforms/InstCombine/and.ll
+++ b/test/Transforms/InstCombine/and.ll
@@ -176,7 +176,7 @@ define i8 @test16(i8 %A) {
 define i8 @test17(i8 %X, i8 %Y) {
 ; CHECK-LABEL: @test17(
 ; CHECK-NEXT:    [[Y_NOT:%.*]] = xor i8 %Y, -1
-; CHECK-NEXT:    [[D:%.*]] = or i8 %X, [[Y_NOT]]
+; CHECK-NEXT:    [[D:%.*]] = or i8 [[Y_NOT]], %X
 ; CHECK-NEXT:    ret i8 [[D]]
 ;
   %B = xor i8 %X, -1
@@ -382,6 +382,18 @@ define i32 @test31(i1 %X) {
   ret i32 %A
 }
 
+; Demanded bits analysis allows us to eliminate the add: the 'and' only keeps the low 3 bits, and adding 8 cannot change them.
+
+define <2 x i32> @and_demanded_bits_splat_vec(<2 x i32> %x) {
+; CHECK-LABEL: @and_demanded_bits_splat_vec(
+; CHECK-NEXT:    [[Z:%.*]] = and <2 x i32> %x, <i32 7, i32 7>
+; CHECK-NEXT:    ret <2 x i32> [[Z]]
+;
+  %y = add <2 x i32> %x, <i32 8, i32 8>
+  %z = and <2 x i32> %y, <i32 7, i32 7>
+  ret <2 x i32> %z
+}
+
 define i32 @test32(i32 %In) {
 ; CHECK-LABEL: @test32(
 ; CHECK-NEXT:    ret i32 0
@@ -405,6 +417,42 @@ define i32 @test33(i32 %b) {
   ret i32 %tmp.13
 }
 
+define i32 @test33b(i32 %b) {
+; CHECK-LABEL: @test33b(
+; CHECK-NEXT:    [[TMP_13:%.*]] = xor i32 [[B:%.*]], 1
+; CHECK-NEXT:    ret i32 [[TMP_13]]
+;
+  %tmp.4.mask = and i32 %b, 1
+  %tmp.10 = xor i32 %tmp.4.mask, 1
+  %tmp.12 = and i32 %b, -2
+  %tmp.13 = or i32 %tmp.10, %tmp.12
+  ret i32 %tmp.13
+}
+
+define <2 x i32> @test33vec(<2 x i32> %b) {
+; CHECK-LABEL: @test33vec(
+; CHECK-NEXT:    [[TMP_13:%.*]] = xor <2 x i32> [[B:%.*]], <i32 1, i32 1>
+; CHECK-NEXT:    ret <2 x i32> [[TMP_13]]
+;
+  %tmp.4.mask = and <2 x i32> %b, <i32 1, i32 1>
+  %tmp.10 = xor <2 x i32> %tmp.4.mask, <i32 1, i32 1>
+  %tmp.12 = and <2 x i32> %b, <i32 -2, i32 -2>
+  %tmp.13 = or <2 x i32> %tmp.12, %tmp.10
+  ret <2 x i32> %tmp.13
+}
+
+define <2 x i32> @test33vecb(<2 x i32> %b) {
+; CHECK-LABEL: @test33vecb(
+; CHECK-NEXT:    [[TMP_13:%.*]] = xor <2 x i32> [[B:%.*]], <i32 1, i32 1>
+; CHECK-NEXT:    ret <2 x i32> [[TMP_13]]
+;
+  %tmp.4.mask = and <2 x i32> %b, <i32 1, i32 1>
+  %tmp.10 = xor <2 x i32> %tmp.4.mask, <i32 1, i32 1>
+  %tmp.12 = and <2 x i32> %b, <i32 -2, i32 -2>
+  %tmp.13 = or <2 x i32> %tmp.10, %tmp.12
+  ret <2 x i32> %tmp.13
+}
+
 define i32 @test34(i32 %A, i32 %B) {
 ; CHECK-LABEL: @test34(
 ; CHECK-NEXT:    ret i32 %B
@@ -482,3 +530,99 @@ define i64 @test39(i32 %X) {
   %res = and i64 %zsub, 240
   ret i64 %res
 }
+
+define i32 @test40(i1 %C) {
+; CHECK-LABEL: @test40(
+; CHECK-NEXT:    [[A:%.*]] = select i1 [[C:%.*]], i32 104, i32 10
+; CHECK-NEXT:    ret i32 [[A]]
+;
+  %A = select i1 %C, i32 1000, i32 10
+  %V = and i32 %A, 123
+  ret i32 %V
+}
+
+define <2 x i32> @test40vec(i1 %C) {
+; CHECK-LABEL: @test40vec(
+; CHECK-NEXT:    [[A:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 104, i32 104>, <2 x i32> <i32 10, i32 10>
+; CHECK-NEXT:    ret <2 x i32> [[A]]
+;
+  %A = select i1 %C, <2 x i32> <i32 1000, i32 1000>, <2 x i32> <i32 10, i32 10>
+  %V = and <2 x i32> %A, <i32 123, i32 123>
+  ret <2 x i32> %V
+}
+
+define <2 x i32> @test40vec2(i1 %C) {
+; CHECK-LABEL: @test40vec2(
+; CHECK-NEXT:    [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 104, i32 324>, <2 x i32> <i32 10, i32 12>
+; CHECK-NEXT:    ret <2 x i32> [[V]]
+;
+  %A = select i1 %C, <2 x i32> <i32 1000, i32 2500>, <2 x i32> <i32 10, i32 30>
+  %V = and <2 x i32> %A, <i32 123, i32 333>
+  ret <2 x i32> %V
+}
+
+define i32 @test41(i1 %which) {
+; CHECK-LABEL: @test41(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]]
+; CHECK:       delay:
+; CHECK-NEXT:    br label [[FINAL]]
+; CHECK:       final:
+; CHECK-NEXT:    [[A:%.*]] = phi i32 [ 104, [[ENTRY:%.*]] ], [ 10, [[DELAY]] ]
+; CHECK-NEXT:    ret i32 [[A]]
+;
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+  br label %final
+
+final:
+  %A = phi i32 [ 1000, %entry ], [ 10, %delay ]
+  %value = and i32 %A, 123
+  ret i32 %value
+}
+
+define <2 x i32> @test41vec(i1 %which) {
+; CHECK-LABEL: @test41vec(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]]
+; CHECK:       delay:
+; CHECK-NEXT:    br label [[FINAL]]
+; CHECK:       final:
+; CHECK-NEXT:    [[A:%.*]] = phi <2 x i32> [ <i32 104, i32 104>, [[ENTRY:%.*]] ], [ <i32 10, i32 10>, [[DELAY]] ]
+; CHECK-NEXT:    ret <2 x i32> [[A]]
+;
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+  br label %final
+
+final:
+  %A = phi <2 x i32> [ <i32 1000, i32 1000>, %entry ], [ <i32 10, i32 10>, %delay ]
+  %value = and <2 x i32> %A, <i32 123, i32 123>
+  ret <2 x i32> %value
+}
+
+define <2 x i32> @test41vec2(i1 %which) {
+; CHECK-LABEL: @test41vec2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]]
+; CHECK:       delay:
+; CHECK-NEXT:    br label [[FINAL]]
+; CHECK:       final:
+; CHECK-NEXT:    [[A:%.*]] = phi <2 x i32> [ <i32 104, i32 324>, [[ENTRY:%.*]] ], [ <i32 10, i32 12>, [[DELAY]] ]
+; CHECK-NEXT:    ret <2 x i32> [[A]]
+;
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+  br label %final
+
+final:
+  %A = phi <2 x i32> [ <i32 1000, i32 2500>, %entry ], [ <i32 10, i32 30>, %delay ]
+  %value = and <2 x i32> %A, <i32 123, i32 333>
+  ret <2 x i32> %value
+}
diff --git a/test/Transforms/InstCombine/and2.ll b/test/Transforms/InstCombine/and2.ll
index 3d043b0864cd34bcff245b411e686e73a1a62b12..0b4882fa823ead886c8f8c5b1d78d17932cbf6ef 100644
--- a/test/Transforms/InstCombine/and2.ll
+++ b/test/Transforms/InstCombine/and2.ll
@@ -122,3 +122,63 @@ define i64 @test10(i64 %x) {
   ret i64 %add
 }
 
+; The add in this test is unnecessary because the low 8 bits of its LHS (%x) are 0 and the 'and' only consumes bit 7, which is one of those bits. It doesn't matter what happens to the upper bits.
+define i32 @test11(i32 %a, i32 %b) {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:    [[X:%.*]] = shl i32 [[A:%.*]], 8
+; CHECK-NEXT:    [[Z:%.*]] = and i32 [[B:%.*]], 128
+; CHECK-NEXT:    [[W:%.*]] = mul i32 [[Z]], [[X]]
+; CHECK-NEXT:    ret i32 [[W]]
+;
+  %x = shl i32 %a, 8
+  %y = add i32 %x, %b
+  %z = and i32 %y, 128
+  %w = mul i32 %z, %x ; to keep the shift from being removed
+  ret i32 %w
+}
+
+; The add in this test is unnecessary because the low 8 bits of its RHS (%x) are 0 and the 'and' only consumes bit 7, which is one of those bits. It doesn't matter what happens to the upper bits.
+define i32 @test12(i32 %a, i32 %b) {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT:    [[X:%.*]] = shl i32 [[A:%.*]], 8
+; CHECK-NEXT:    [[Z:%.*]] = and i32 [[B:%.*]], 128
+; CHECK-NEXT:    [[W:%.*]] = mul i32 [[Z]], [[X]]
+; CHECK-NEXT:    ret i32 [[W]]
+;
+  %x = shl i32 %a, 8
+  %y = add i32 %b, %x
+  %z = and i32 %y, 128
+  %w = mul i32 %z, %x ; to keep the shift from being removed
+  ret i32 %w
+}
+
+; The sub in this test is unnecessary because the low 8 bits of its RHS (%x) are 0 and the 'and' only consumes bit 7, which is one of those bits. It doesn't matter what happens to the upper bits.
+define i32 @test13(i32 %a, i32 %b) {
+; CHECK-LABEL: @test13(
+; CHECK-NEXT:    [[X:%.*]] = shl i32 [[A:%.*]], 8
+; CHECK-NEXT:    [[Z:%.*]] = and i32 [[B:%.*]], 128
+; CHECK-NEXT:    [[W:%.*]] = mul i32 [[Z]], [[X]]
+; CHECK-NEXT:    ret i32 [[W]]
+;
+  %x = shl i32 %a, 8
+  %y = sub i32 %b, %x
+  %z = and i32 %y, 128
+  %w = mul i32 %z, %x ; to keep the shift from being removed
+  ret i32 %w
+}
+
+; The sub in this test cannot be removed because we need to keep the negation of %b. TODO: But we should be able to replace the LHS of it with a 0.
+define i32 @test14(i32 %a, i32 %b) {
+; CHECK-LABEL: @test14(
+; CHECK-NEXT:    [[X:%.*]] = shl i32 [[A:%.*]], 8
+; CHECK-NEXT:    [[Y:%.*]] = sub i32 [[X]], [[B:%.*]]
+; CHECK-NEXT:    [[Z:%.*]] = and i32 [[Y]], 128
+; CHECK-NEXT:    [[W:%.*]] = mul i32 [[Z]], [[X]]
+; CHECK-NEXT:    ret i32 [[W]]
+;
+  %x = shl i32 %a, 8
+  %y = sub i32 %x, %b
+  %z = and i32 %y, 128
+  %w = mul i32 %z, %x ; to keep the shift from being removed
+  ret i32 %w
+}
diff --git a/test/Transforms/InstCombine/apint-shift.ll b/test/Transforms/InstCombine/apint-shift.ll
index fc1934b576f5cfa8f1e866c93c7b0be85df5d8ca..f339de35d77cd36ee9a2b24da8eef00b227aa90f 100644
--- a/test/Transforms/InstCombine/apint-shift.ll
+++ b/test/Transforms/InstCombine/apint-shift.ll
@@ -63,6 +63,8 @@ define i55 @test6(i55 %A) {
   ret i55 %C
 }
 
+; (X * C2) << C1 --> X * (C2 << C1)
+
 define i55 @test6a(i55 %A) {
 ; CHECK-LABEL: @test6a(
 ; CHECK-NEXT:    [[C:%.*]] = mul i55 %A, 6
@@ -73,6 +75,18 @@ define i55 @test6a(i55 %A) {
   ret i55 %C
 }
 
+; (X * C2) << C1 --> X * (C2 << C1)
+
+define <2 x i55> @test6a_vec(<2 x i55> %A) {
+; CHECK-LABEL: @test6a_vec(
+; CHECK-NEXT:    [[C:%.*]] = mul <2 x i55> %A, <i55 6, i55 48>
+; CHECK-NEXT:    ret <2 x i55> [[C]]
+;
+  %B = mul <2 x i55> %A, <i55 3, i55 12>
+  %C = shl <2 x i55> %B, <i55 1, i55 2>
+  ret <2 x i55> %C
+}
+
 define i29 @test7(i8 %X) {
 ; CHECK-LABEL: @test7(
 ; CHECK-NEXT:    ret i29 -1
@@ -126,6 +140,32 @@ define <2 x i19> @lshr_lshr_splat_vec(<2 x i19> %X) {
   ret <2 x i19> %sh2
 }
 
+define i9 @multiuse_lshr_lshr(i9 %x) {
+; CHECK-LABEL: @multiuse_lshr_lshr(
+; CHECK-NEXT:    [[SH1:%.*]] = lshr i9 %x, 2
+; CHECK-NEXT:    [[SH2:%.*]] = lshr i9 %x, 5
+; CHECK-NEXT:    [[MUL:%.*]] = mul i9 [[SH1]], [[SH2]]
+; CHECK-NEXT:    ret i9 [[MUL]]
+;
+  %sh1 = lshr i9 %x, 2
+  %sh2 = lshr i9 %sh1, 3
+  %mul = mul i9 %sh1, %sh2
+  ret i9 %mul
+}
+
+define <2 x i9> @multiuse_lshr_lshr_splat(<2 x i9> %x) {
+; CHECK-LABEL: @multiuse_lshr_lshr_splat(
+; CHECK-NEXT:    [[SH1:%.*]] = lshr <2 x i9> %x, <i9 2, i9 2>
+; CHECK-NEXT:    [[SH2:%.*]] = lshr <2 x i9> %x, <i9 5, i9 5>
+; CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i9> [[SH1]], [[SH2]]
+; CHECK-NEXT:    ret <2 x i9> [[MUL]]
+;
+  %sh1 = lshr <2 x i9> %x, <i9 2, i9 2>
+  %sh2 = lshr <2 x i9> %sh1, <i9 3, i9 3>
+  %mul = mul <2 x i9> %sh1, %sh2
+  ret <2 x i9> %mul
+}
+
 ; Two left shifts in the same direction:
 ; shl (shl X, C1), C2 -->  shl X, C1 + C2
 
@@ -139,6 +179,32 @@ define <2 x i19> @shl_shl_splat_vec(<2 x i19> %X) {
   ret <2 x i19> %sh2
 }
 
+define i42 @multiuse_shl_shl(i42 %x) {
+; CHECK-LABEL: @multiuse_shl_shl(
+; CHECK-NEXT:    [[SH1:%.*]] = shl i42 %x, 8
+; CHECK-NEXT:    [[SH2:%.*]] = shl i42 %x, 17
+; CHECK-NEXT:    [[MUL:%.*]] = mul i42 [[SH1]], [[SH2]]
+; CHECK-NEXT:    ret i42 [[MUL]]
+;
+  %sh1 = shl i42 %x, 8
+  %sh2 = shl i42 %sh1, 9
+  %mul = mul i42 %sh1, %sh2
+  ret i42 %mul
+}
+
+define <2 x i42> @multiuse_shl_shl_splat(<2 x i42> %x) {
+; CHECK-LABEL: @multiuse_shl_shl_splat(
+; CHECK-NEXT:    [[SH1:%.*]] = shl <2 x i42> %x, <i42 8, i42 8>
+; CHECK-NEXT:    [[SH2:%.*]] = shl <2 x i42> %x, <i42 17, i42 17>
+; CHECK-NEXT:    [[MUL:%.*]] = mul <2 x i42> [[SH1]], [[SH2]]
+; CHECK-NEXT:    ret <2 x i42> [[MUL]]
+;
+  %sh1 = shl <2 x i42> %x, <i42 8, i42 8>
+  %sh2 = shl <2 x i42> %sh1, <i42 9, i42 9>
+  %mul = mul <2 x i42> %sh1, %sh2
+  ret <2 x i42> %mul
+}
+
 ; Equal shift amounts in opposite directions become bitwise 'and':
 ; lshr (shl X, C), C --> and X, C'
 
diff --git a/test/Transforms/InstCombine/apint-sub.ll b/test/Transforms/InstCombine/apint-sub.ll
index eb314ce3d1b2550f7474b8c02ceaf2ec34be596c..1a4e62ff0d73500d361f5ffeea0565326b4a23ef 100644
--- a/test/Transforms/InstCombine/apint-sub.ll
+++ b/test/Transforms/InstCombine/apint-sub.ll
@@ -50,7 +50,7 @@ define i19 @test5(i19 %A, i19 %Bok, i19 %Cok) {
 define i57 @test6(i57 %A, i57 %B) {
 ; CHECK-LABEL: @test6(
 ; CHECK-NEXT:    [[B_NOT:%.*]] = xor i57 %B, -1
-; CHECK-NEXT:    [[D:%.*]] = and i57 %A, [[B_NOT]]
+; CHECK-NEXT:    [[D:%.*]] = and i57 [[B_NOT]], %A
 ; CHECK-NEXT:    ret i57 [[D]]
 ;
   %C = and i57 %A, %B
diff --git a/test/Transforms/InstCombine/bitcast-bigendian.ll b/test/Transforms/InstCombine/bitcast-bigendian.ll
index 1a91d11d8aeed54d1a801b0db2b522013ca4bd7f..a6b2ac85c9c25bfe779338d91a801ebac83eaa1a 100644
--- a/test/Transforms/InstCombine/bitcast-bigendian.ll
+++ b/test/Transforms/InstCombine/bitcast-bigendian.ll
@@ -81,9 +81,8 @@ define <2 x float> @test5(float %A, float %B) {
 
 define <2 x float> @test6(float %A){
 ; CHECK-LABEL: @test6(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float %A, i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float 4.200000e+01, i32 1
-; CHECK-NEXT:    ret <2 x float> [[TMP2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> <float undef, float 4.200000e+01>, float %A, i32 0
+; CHECK-NEXT:    ret <2 x float> [[TMP1]]
 ;
   %tmp23 = bitcast float %A to i32
   %tmp24 = zext i32 %tmp23 to i64
diff --git a/test/Transforms/InstCombine/bitreverse-fold.ll b/test/Transforms/InstCombine/bitreverse-fold.ll
index ecdfbc8cb5f995e91f66f8a26543e474bd49977c..b798ad33b3f087548ec5c75ef00731fa77428b89 100644
--- a/test/Transforms/InstCombine/bitreverse-fold.ll
+++ b/test/Transforms/InstCombine/bitreverse-fold.ll
@@ -37,6 +37,13 @@ define i32 @reverse_neg1_i32() {
   ret i32 %x
 }
 
+; CHECK-LABEL: @reverse_undef_i32(
+; CHECK-NEXT: ret i32 undef
+define i32 @reverse_undef_i32() {
+  %x = call i32 @llvm.bitreverse.i32(i32 undef)
+  ret i32 %x
+}
+
 ; CHECK-LABEL: @reverse_false_i1(
 ; CHECK-NEXT: ret i1 false
 define i1 @reverse_false_i1() {
@@ -51,6 +58,13 @@ define i1 @reverse_true_i1() {
   ret i1 %x
 }
 
+; CHECK-LABEL: @reverse_undef_i1(
+; CHECK-NEXT: ret i1 undef
+define i1 @reverse_undef_i1() {
+  %x = call i1 @llvm.bitreverse.i1(i1 undef)
+  ret i1 %x
+}
+
 ; CHECK-LABEL: @reverse_false_v2i1(
 ; CHECK-NEXT: ret <2 x i1> zeroinitializer
 define <2 x i1> @reverse_false_v2i1() {
diff --git a/test/Transforms/InstCombine/bitreverse-known-bits.ll b/test/Transforms/InstCombine/bitreverse-known-bits.ll
index b73df77b9258b46b60e6a727f87a5ec2712dff1e..cd1523a3b06baca7849a1a3b155699e908b1fdc1 100644
--- a/test/Transforms/InstCombine/bitreverse-known-bits.ll
+++ b/test/Transforms/InstCombine/bitreverse-known-bits.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -S -instcombine | FileCheck %s
 
+declare i8 @llvm.bitreverse.i8(i8)
 declare i32 @llvm.bitreverse.i32(i32)
 
 ; CHECK-LABEL: @test1
@@ -32,3 +33,19 @@ define i1 @test3(i32 %arg) {
   %res = icmp eq i32 %and, 0
   ret i1 %res
 }
+
+; CHECK-LABEL: @add_bitreverse
+; Make sure instcombine takes !range metadata on a bitreverse call into account.
+define i8 @add_bitreverse(i8 %a) {
+  %b = and i8 %a, 252
+  ; Masking with 252 clears the low two bits of %a, so known bits alone only
+  ; show the reversed value is in [0, 64). The !range metadata says [0, 16) and must win, so
+  ;    add %reverse, 1111 0000
+  ; should become
+  ;    or  %reverse, 1111 0000
+  %reverse = call i8 @llvm.bitreverse.i8(i8 %b), !range !1
+  %c = add i8 %reverse, -16
+; CHECK: or i8 %reverse, -16
+  ret i8 %c
+}
+!1 = !{i8 0, i8 16}
diff --git a/test/Transforms/InstCombine/bswap-fold.ll b/test/Transforms/InstCombine/bswap-fold.ll
index edf9572f1e1128c64d1b984a9117e72590feacd7..91678a91962a81049d7197e631c548c881ebd444 100644
--- a/test/Transforms/InstCombine/bswap-fold.ll
+++ b/test/Transforms/InstCombine/bswap-fold.ll
@@ -1,68 +1,75 @@
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
-define i1 @test1(i16 %tmp2) {
-; CHECK-LABEL: @test1
-; CHECK-NEXT:  %tmp = icmp eq i16 %tmp2, 256
-; CHECK-NEXT:  ret i1 %tmp
-        %tmp10 = call i16 @llvm.bswap.i16( i16 %tmp2 )
-        %tmp = icmp eq i16 %tmp10, 1
-        ret i1 %tmp
+define i1 @test1(i16 %t) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i16 %t, 256
+; CHECK-NEXT:    ret i1 [[TMP2]]
+;
+  %tmp1 = call i16 @llvm.bswap.i16( i16 %t )
+  %tmp2 = icmp eq i16 %tmp1, 1
+  ret i1 %tmp2
 }
 
 define i1 @test2(i32 %tmp) {
-; CHECK-LABEL: @test2
-; CHECK-NEXT:  %tmp.upgrd.1 = icmp eq i32 %tmp, 16777216
-; CHECK-NEXT:  ret i1 %tmp.upgrd.1
-        %tmp34 = tail call i32 @llvm.bswap.i32( i32 %tmp )
-        %tmp.upgrd.1 = icmp eq i32 %tmp34, 1
-        ret i1 %tmp.upgrd.1
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[TMP_UPGRD_1:%.*]] = icmp eq i32 %tmp, 16777216
+; CHECK-NEXT:    ret i1 [[TMP_UPGRD_1]]
+;
+  %tmp34 = tail call i32 @llvm.bswap.i32( i32 %tmp )
+  %tmp.upgrd.1 = icmp eq i32 %tmp34, 1
+  ret i1 %tmp.upgrd.1
 }
 
 define i1 @test3(i64 %tmp) {
-; CHECK-LABEL: @test3
-; CHECK-NEXT:  %tmp.upgrd.2 = icmp eq i64 %tmp, 72057594037927936
-; CHECK-NEXT:  ret i1 %tmp.upgrd.2
-        %tmp34 = tail call i64 @llvm.bswap.i64( i64 %tmp )
-        %tmp.upgrd.2 = icmp eq i64 %tmp34, 1
-        ret i1 %tmp.upgrd.2
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[TMP_UPGRD_2:%.*]] = icmp eq i64 %tmp, 72057594037927936
+; CHECK-NEXT:    ret i1 [[TMP_UPGRD_2]]
+;
+  %tmp34 = tail call i64 @llvm.bswap.i64( i64 %tmp )
+  %tmp.upgrd.2 = icmp eq i64 %tmp34, 1
+  ret i1 %tmp.upgrd.2
 }
 
 ; rdar://5992453
 ; A & 255
 define i32 @test4(i32 %a) nounwind  {
-; CHECK-LABEL: @test4
-; CHECK-NEXT:  %tmp2 = and i32 %a, 255
-; CHECK-NEXT:  ret i32 %tmp2
-	%tmp2 = tail call i32 @llvm.bswap.i32( i32 %a )
-	%tmp4 = lshr i32 %tmp2, 24
-	ret i32 %tmp4
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 %a, 255
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %tmp2 = tail call i32 @llvm.bswap.i32( i32 %a )
+  %tmp4 = lshr i32 %tmp2, 24
+  ret i32 %tmp4
 }
 
 ; A
 define i32 @test5(i32 %a) nounwind {
-; CHECK-LABEL: @test5
-; CHECK-NEXT:  ret i32 %a
-	%tmp2 = tail call i32 @llvm.bswap.i32( i32 %a )
-	%tmp4 = tail call i32 @llvm.bswap.i32( i32 %tmp2 )
-	ret i32 %tmp4
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    ret i32 %a
+;
+  %tmp2 = tail call i32 @llvm.bswap.i32( i32 %a )
+  %tmp4 = tail call i32 @llvm.bswap.i32( i32 %tmp2 )
+  ret i32 %tmp4
 }
 
 ; a >> 24
 define i32 @test6(i32 %a) nounwind {
-; CHECK-LABEL: @test6
-; CHECK-NEXT:  %tmp2 = lshr i32 %a, 24
-; CHECK-NEXT:  ret i32 %tmp2
-	%tmp2 = tail call i32 @llvm.bswap.i32( i32 %a )
-	%tmp4 = and i32 %tmp2, 255
-	ret i32 %tmp4
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    [[TMP2:%.*]] = lshr i32 %a, 24
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
+  %tmp2 = tail call i32 @llvm.bswap.i32( i32 %a )
+  %tmp4 = and i32 %tmp2, 255
+  ret i32 %tmp4
 }
 
 ; PR5284
 define i16 @test7(i32 %A) {
-; CHECK-LABEL: @test7
-; CHECK-NEXT:  %1 = lshr i32 %A, 16
-; CHECK-NEXT:  %D = trunc i32 %1 to i16
-; CHECK-NEXT:  ret i16 %D
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i32 %A, 16
+; CHECK-NEXT:    [[D:%.*]] = trunc i32 [[TMP1]] to i16
+; CHECK-NEXT:    ret i16 [[D]]
+;
   %B = tail call i32 @llvm.bswap.i32(i32 %A) nounwind
   %C = trunc i32 %B to i16
   %D = tail call i16 @llvm.bswap.i16(i16 %C) nounwind
@@ -70,11 +77,12 @@ define i16 @test7(i32 %A) {
 }
 
 define i16 @test8(i64 %A) {
-; CHECK-LABEL: @test8
-; CHECK-NEXT:  %1 = lshr i64 %A, 48
-; CHECK-NEXT:  %D = trunc i64 %1 to i16
-; CHECK-NEXT:  ret i16 %D
-  %B = tail call i64 @llvm.bswap.i64(i64 %A) nounwind 
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 %A, 48
+; CHECK-NEXT:    [[D:%.*]] = trunc i64 [[TMP1]] to i16
+; CHECK-NEXT:    ret i16 [[D]]
+;
+  %B = tail call i64 @llvm.bswap.i64(i64 %A) nounwind
   %C = trunc i64 %B to i16
   %D = tail call i16 @llvm.bswap.i16(i16 %C) nounwind
   ret i16 %D
@@ -82,8 +90,9 @@ define i16 @test8(i64 %A) {
 
 ; Misc: Fold bswap(undef) to undef.
 define i64 @foo() {
-; CHECK-LABEL: @foo
-; CHECK-NEXT: ret i64 undef
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:    ret i64 undef
+;
   %a = call i64 @llvm.bswap.i64(i64 undef)
   ret i64 %a
 }
@@ -92,20 +101,22 @@ define i64 @foo() {
 ; Fold: OP( BSWAP(x), BSWAP(y) ) -> BSWAP( OP(x, y) )
 ; Fold: OP( BSWAP(x), CONSTANT ) -> BSWAP( OP(x, BSWAP(CONSTANT) ) )
 define i16 @bs_and16i(i16 %a, i16 %b) #0 {
-; CHECK-LABEL: @bs_and16i
-; CHECK-NEXT:  %1 = and i16 %a, 4391
-; CHECK-NEXT:  %2 = call i16 @llvm.bswap.i16(i16 %1)
-; CHECK-NEXT:  ret i16 %2
+; CHECK-LABEL: @bs_and16i(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i16 %a, 4391
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
   %1 = tail call i16 @llvm.bswap.i16(i16 %a)
   %2 = and i16 %1, 10001
   ret i16 %2
 }
 
 define i16 @bs_and16(i16 %a, i16 %b) #0 {
-; CHECK-LABEL: @bs_and16
-; CHECK-NEXT:  %1 = and i16 %a, %b
-; CHECK-NEXT:  %2 = call i16 @llvm.bswap.i16(i16 %1)
-; CHECK-NEXT:  ret i16 %2
+; CHECK-LABEL: @bs_and16(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i16 %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
   %tmp1 = tail call i16 @llvm.bswap.i16(i16 %a)
   %tmp2 = tail call i16 @llvm.bswap.i16(i16 %b)
   %tmp3 = and i16 %tmp1, %tmp2
@@ -113,10 +124,11 @@ define i16 @bs_and16(i16 %a, i16 %b) #0 {
 }
 
 define i16 @bs_or16(i16 %a, i16 %b) #0 {
-; CHECK-LABEL: @bs_or16
-; CHECK-NEXT:  %1 = or i16 %a, %b
-; CHECK-NEXT:  %2 = call i16 @llvm.bswap.i16(i16 %1)
-; CHECK-NEXT:  ret i16 %2
+; CHECK-LABEL: @bs_or16(
+; CHECK-NEXT:    [[TMP1:%.*]] = or i16 %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
   %tmp1 = tail call i16 @llvm.bswap.i16(i16 %a)
   %tmp2 = tail call i16 @llvm.bswap.i16(i16 %b)
   %tmp3 = or i16 %tmp1, %tmp2
@@ -124,10 +136,11 @@ define i16 @bs_or16(i16 %a, i16 %b) #0 {
 }
 
 define i16 @bs_xor16(i16 %a, i16 %b) #0 {
-; CHECK-LABEL: @bs_xor16
-; CHECK-NEXT:  %1 = xor i16 %a, %b
-; CHECK-NEXT:  %2 = call i16 @llvm.bswap.i16(i16 %1)
-; CHECK-NEXT:  ret i16 %2
+; CHECK-LABEL: @bs_xor16(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i16 %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = call i16 @llvm.bswap.i16(i16 [[TMP1]])
+; CHECK-NEXT:    ret i16 [[TMP2]]
+;
   %tmp1 = tail call i16 @llvm.bswap.i16(i16 %a)
   %tmp2 = tail call i16 @llvm.bswap.i16(i16 %b)
   %tmp3 = xor i16 %tmp1, %tmp2
@@ -135,20 +148,22 @@ define i16 @bs_xor16(i16 %a, i16 %b) #0 {
 }
 
 define i32 @bs_and32i(i32 %a, i32 %b) #0 {
-; CHECK-LABEL: @bs_and32i
-; CHECK-NEXT:  %1 = and i32 %a, -1585053440
-; CHECK-NEXT:  %2 = call i32 @llvm.bswap.i32(i32 %1)
-; CHECK-NEXT:  ret i32 %2
+; CHECK-LABEL: @bs_and32i(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 %a, -1585053440
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
   %tmp1 = tail call i32 @llvm.bswap.i32(i32 %a)
   %tmp2 = and i32 %tmp1, 100001
   ret i32 %tmp2
 }
 
 define i32 @bs_and32(i32 %a, i32 %b) #0 {
-; CHECK-LABEL: @bs_and32
-; CHECK-NEXT:  %1 = and i32 %a, %b
-; CHECK-NEXT:  %2 = call i32 @llvm.bswap.i32(i32 %1)
-; CHECK-NEXT:  ret i32 %2
+; CHECK-LABEL: @bs_and32(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i32 %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
   %tmp1 = tail call i32 @llvm.bswap.i32(i32 %a)
   %tmp2 = tail call i32 @llvm.bswap.i32(i32 %b)
   %tmp3 = and i32 %tmp1, %tmp2
@@ -156,10 +171,11 @@ define i32 @bs_and32(i32 %a, i32 %b) #0 {
 }
 
 define i32 @bs_or32(i32 %a, i32 %b) #0 {
-; CHECK-LABEL: @bs_or32
-; CHECK-NEXT:  %1 = or i32 %a, %b
-; CHECK-NEXT:  %2 = call i32 @llvm.bswap.i32(i32 %1)
-; CHECK-NEXT:  ret i32 %2
+; CHECK-LABEL: @bs_or32(
+; CHECK-NEXT:    [[TMP1:%.*]] = or i32 %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
   %tmp1 = tail call i32 @llvm.bswap.i32(i32 %a)
   %tmp2 = tail call i32 @llvm.bswap.i32(i32 %b)
   %tmp3 = or i32 %tmp1, %tmp2
@@ -167,10 +183,11 @@ define i32 @bs_or32(i32 %a, i32 %b) #0 {
 }
 
 define i32 @bs_xor32(i32 %a, i32 %b) #0 {
-; CHECK-LABEL: @bs_xor32
-; CHECK-NEXT:  %1 = xor i32 %a, %b
-; CHECK-NEXT:  %2 = call i32 @llvm.bswap.i32(i32 %1)
-; CHECK-NEXT:  ret i32 %2
+; CHECK-LABEL: @bs_xor32(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = call i32 @llvm.bswap.i32(i32 [[TMP1]])
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
   %tmp1 = tail call i32 @llvm.bswap.i32(i32 %a)
   %tmp2 = tail call i32 @llvm.bswap.i32(i32 %b)
   %tmp3 = xor i32 %tmp1, %tmp2
@@ -178,20 +195,22 @@ define i32 @bs_xor32(i32 %a, i32 %b) #0 {
 }
 
 define i64 @bs_and64i(i64 %a, i64 %b) #0 {
-; CHECK-LABEL: @bs_and64i
-; CHECK-NEXT:  %1 = and i64 %a, 129085117527228416
-; CHECK-NEXT:  %2 = call i64 @llvm.bswap.i64(i64 %1)
-; CHECK-NEXT:  ret i64 %2
+; CHECK-LABEL: @bs_and64i(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 %a, 129085117527228416
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
   %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a)
   %tmp2 = and i64 %tmp1, 1000000001
   ret i64 %tmp2
 }
 
 define i64 @bs_and64(i64 %a, i64 %b) #0 {
-; CHECK-LABEL: @bs_and64
-; CHECK-NEXT:  %1 = and i64 %a, %b
-; CHECK-NEXT:  %2 = call i64 @llvm.bswap.i64(i64 %1)
-; CHECK-NEXT:  ret i64 %2
+; CHECK-LABEL: @bs_and64(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i64 %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
   %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a)
   %tmp2 = tail call i64 @llvm.bswap.i64(i64 %b)
   %tmp3 = and i64 %tmp1, %tmp2
@@ -199,10 +218,11 @@ define i64 @bs_and64(i64 %a, i64 %b) #0 {
 }
 
 define i64 @bs_or64(i64 %a, i64 %b) #0 {
-; CHECK-LABEL: @bs_or64
-; CHECK-NEXT:  %1 = or i64 %a, %b
-; CHECK-NEXT:  %2 = call i64 @llvm.bswap.i64(i64 %1)
-; CHECK-NEXT:  ret i64 %2
+; CHECK-LABEL: @bs_or64(
+; CHECK-NEXT:    [[TMP1:%.*]] = or i64 %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
   %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a)
   %tmp2 = tail call i64 @llvm.bswap.i64(i64 %b)
   %tmp3 = or i64 %tmp1, %tmp2
@@ -210,10 +230,11 @@ define i64 @bs_or64(i64 %a, i64 %b) #0 {
 }
 
 define i64 @bs_xor64(i64 %a, i64 %b) #0 {
-; CHECK-LABEL: @bs_xor64
-; CHECK-NEXT:  %1 = xor i64 %a, %b
-; CHECK-NEXT:  %2 = call i64 @llvm.bswap.i64(i64 %1)
-; CHECK-NEXT:  ret i64 %2
+; CHECK-LABEL: @bs_xor64(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i64 %a, %b
+; CHECK-NEXT:    [[TMP2:%.*]] = call i64 @llvm.bswap.i64(i64 [[TMP1]])
+; CHECK-NEXT:    ret i64 [[TMP2]]
+;
   %tmp1 = tail call i64 @llvm.bswap.i64(i64 %a)
   %tmp2 = tail call i64 @llvm.bswap.i64(i64 %b)
   %tmp3 = xor i64 %tmp1, %tmp2
diff --git a/test/Transforms/InstCombine/builtin-object-size-offset.ll b/test/Transforms/InstCombine/builtin-object-size-offset.ll
index 7ab24a9acd94984b0035fcb6d23039b8737c64e7..248cf644df892ca83492f026fa42f5d1f8cbd0f5 100644
--- a/test/Transforms/InstCombine/builtin-object-size-offset.ll
+++ b/test/Transforms/InstCombine/builtin-object-size-offset.ll
@@ -26,25 +26,25 @@ entry:
   %Big = alloca [20 x i8], align 16
   %Small = alloca [10 x i8], align 1
   %0 = getelementptr inbounds [20 x i8], [20 x i8]* %Big, i64 0, i64 0
-  call void @llvm.lifetime.start(i64 20, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 20, i8* %0)
   %1 = getelementptr inbounds [10 x i8], [10 x i8]* %Small, i64 0, i64 0
-  call void @llvm.lifetime.start(i64 10, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 10, i8* %1)
   %tobool = icmp ne i32 %N, 0
   %add.ptr = getelementptr inbounds [20 x i8], [20 x i8]* %Big, i64 0, i64 10
   %cond = select i1 %tobool, i8* %add.ptr, i8* %1
   %2 = call i64 @llvm.objectsize.i64.p0i8(i8* %cond, i1 false)
   %conv = trunc i64 %2 to i32
-  call void @llvm.lifetime.end(i64 10, i8* %1)
-  call void @llvm.lifetime.end(i64 20, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 10, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 20, i8* %0)
   ret i32 %conv
 ; CHECK: ret i32 10 
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
 
 declare i64 @llvm.objectsize.i64.p0i8(i8*, i1)
 
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 
 define void @foo() {
 entry:
diff --git a/test/Transforms/InstCombine/builtin-object-size-ptr.ll b/test/Transforms/InstCombine/builtin-object-size-ptr.ll
index b38513999dc1a6cd9d2381b5b9ce4e6119593387..ada3fc1670265981ce6b6ea7b3afde79edd19a25 100644
--- a/test/Transforms/InstCombine/builtin-object-size-ptr.ll
+++ b/test/Transforms/InstCombine/builtin-object-size-ptr.ll
@@ -16,19 +16,19 @@ define i32 @foo() #0 {
 entry:
   %var = alloca %struct.V, align 4
   %0 = bitcast %struct.V* %var to i8*
-  call void @llvm.lifetime.start(i64 28, i8* %0) #3
+  call void @llvm.lifetime.start.p0i8(i64 28, i8* %0) #3
   %buf1 = getelementptr inbounds %struct.V, %struct.V* %var, i32 0, i32 0
   %arrayidx = getelementptr inbounds [10 x i8], [10 x i8]* %buf1, i64 0, i64 1
   %1 = call i64 @llvm.objectsize.i64.p0i8(i8* %arrayidx, i1 false)
   %conv = trunc i64 %1 to i32
-  call void @llvm.lifetime.end(i64 28, i8* %0) #3
+  call void @llvm.lifetime.end.p0i8(i64 28, i8* %0) #3
   ret i32 %conv
 ; CHECK: ret i32 27
 ; CHECK-NOT: ret i32 -1
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 
 declare i64 @llvm.objectsize.i64.p0i8(i8*, i1) #2
 
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
diff --git a/test/Transforms/InstCombine/call-guard.ll b/test/Transforms/InstCombine/call-guard.ll
index 18da465e606c593f9cd5b14b80a8a0a1f8fb8f90..9664467f914b42046dd5cc79fdd3393240aae5b9 100644
--- a/test/Transforms/InstCombine/call-guard.ll
+++ b/test/Transforms/InstCombine/call-guard.ll
@@ -2,8 +2,8 @@
 
 declare void @llvm.experimental.guard(i1, ...)
 
-define void @test_guard_adjacent(i1 %A) {
-; CHECK-LABEL: @test_guard_adjacent(
+define void @test_guard_adjacent_same_cond(i1 %A) {
+; CHECK-LABEL: @test_guard_adjacent_same_cond(
 ; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 %A) [ "deopt"() ]
 ; CHECK-NEXT:    ret void
   call void(i1, ...) @llvm.experimental.guard( i1 %A )[ "deopt"() ]
@@ -19,12 +19,14 @@ define void @test_guard_adjacent(i1 %A) {
   ret void
 }
 
-define void @test_guard_adjacent_neg(i1 %A, i1 %B) {
-; CHECK-LABEL: @test_guard_adjacent_neg(
-; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 %A) [ "deopt"() ]
-; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 %B) [ "deopt"() ]
+define void @test_guard_adjacent_diff_cond(i1 %A, i1 %B, i1 %C) {
+; CHECK-LABEL: @test_guard_adjacent_diff_cond(
+; CHECK-NEXT:    %1 = and i1 %A, %B
+; CHECK-NEXT:    %2 = and i1 %1, %C
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 %2, i32 123) [ "deopt"() ]
 ; CHECK-NEXT:    ret void
-  call void(i1, ...) @llvm.experimental.guard( i1 %A )[ "deopt"() ]
-  call void(i1, ...) @llvm.experimental.guard( i1 %B )[ "deopt"() ]
+  call void(i1, ...) @llvm.experimental.guard( i1 %A, i32 123 )[ "deopt"() ]
+  call void(i1, ...) @llvm.experimental.guard( i1 %B, i32 456 )[ "deopt"() ]
+  call void(i1, ...) @llvm.experimental.guard( i1 %C, i32 789 )[ "deopt"() ]
   ret void
 }
diff --git a/test/Transforms/InstCombine/call_nonnull_arg.ll b/test/Transforms/InstCombine/call_nonnull_arg.ll
index c502aa05731e5be9337ed13abd940d1191781c7e..8127f4734fcd687b81a458d720d261c20c7641f9 100644
--- a/test/Transforms/InstCombine/call_nonnull_arg.ll
+++ b/test/Transforms/InstCombine/call_nonnull_arg.ll
@@ -31,7 +31,7 @@ dead:
   unreachable
 }
 
-; FIXME: The nonnull attribute in the 'bar' declaration could be 
+; The nonnull attribute in the 'bar' declaration is 
 ; propagated to the parameters of the 'baz' callsite. 
 
 declare void @bar(i8*, i8* nonnull)
@@ -40,7 +40,7 @@ declare void @baz(i8*, i8*)
 define void @deduce_nonnull_from_another_call(i8* %a, i8* %b) {
 ; CHECK-LABEL: @deduce_nonnull_from_another_call(
 ; CHECK-NEXT:    call void @bar(i8* %a, i8* %b)
-; CHECK-NEXT:    call void @baz(i8* %b, i8* %b)
+; CHECK-NEXT:    call void @baz(i8* nonnull %b, i8* nonnull %b)
 ; CHECK-NEXT:    ret void
 ;
   call void @bar(i8* %a, i8* %b)
diff --git a/test/Transforms/InstCombine/cast-call-combine-prof.ll b/test/Transforms/InstCombine/cast-call-combine-prof.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e48c14c3526469f57e79bb9917e8bc59d9f12b75
--- /dev/null
+++ b/test/Transforms/InstCombine/cast-call-combine-prof.ll
@@ -0,0 +1,38 @@
+; RUN: opt -instcombine -inline -S -inline-threshold=0 -hot-callsite-threshold=100 < %s | FileCheck %s
+; Checks that the VP profile is used for hotness checks in inlining after
+; instcombine has converted the call to a direct call.
+
+declare void @bar(i16 *)
+
+define void @foo(i16* %a) {
+  call void @bar(i16* %a)
+  call void @bar(i16* %a)
+  ret void
+}
+
+; CHECK-LABEL: @test()
+; CHECK-NEXT: call void @bar
+; CHECK-NEXT: call void @bar
+define void @test() {
+  call void bitcast (void (i16*)* @foo to void (i8*)*) (i8* null), !prof !0
+  ret void
+}
+
+!0 = !{!"VP", i32 0, i64 2000, i64 -3913987384944532146, i64 2000}
+
+!llvm.module.flags = !{!1}
+
+!1 = !{i32 1, !"ProfileSummary", !2}
+!2 = !{!3, !4, !5, !6, !7, !8, !9, !10}
+!3 = !{!"ProfileFormat", !"InstrProf"}
+!4 = !{!"TotalCount", i64 10000}
+!5 = !{!"MaxCount", i64 1000}
+!6 = !{!"MaxInternalCount", i64 1}
+!7 = !{!"MaxFunctionCount", i64 1000}
+!8 = !{!"NumCounts", i64 3}
+!9 = !{!"NumFunctions", i64 3}
+!10 = !{!"DetailedSummary", !11}
+!11 = !{!12, !13, !14}
+!12 = !{i32 10000, i64 1000, i32 1}
+!13 = !{i32 999000, i64 1000, i32 1}
+!14 = !{i32 999999, i64 1, i32 2}
diff --git a/test/Transforms/InstCombine/compare-alloca.ll b/test/Transforms/InstCombine/compare-alloca.ll
index ca24da191779c0de04972c2308a20366fea6485b..414a07825f2f15f522ece031fece02afd6ec7e6f 100644
--- a/test/Transforms/InstCombine/compare-alloca.ll
+++ b/test/Transforms/InstCombine/compare-alloca.ll
@@ -72,15 +72,15 @@ define i1 @alloca_argument_compare_escaped_through_store(i64* %arg, i64** %ptr)
   ; CHECK: ret i1 %cmp
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture)
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 define i1 @alloca_argument_compare_benign_instrs(i8* %arg) {
   %alloc = alloca i8
-  call void @llvm.lifetime.start(i64 1, i8* %alloc)
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %alloc)
   %cmp = icmp eq i8* %arg, %alloc
   %x = load i8, i8* %arg
   store i8 %x, i8* %alloc
-  call void @llvm.lifetime.end(i64 1, i8* %alloc)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %alloc)
   ret i1 %cmp
   ; CHECK-LABEL: alloca_argument_compare_benign_instrs
   ; CHECK: ret i1 false
diff --git a/test/Transforms/InstCombine/compare-unescaped.ll b/test/Transforms/InstCombine/compare-unescaped.ll
index 0e512aa28911c28a20ee3afcfd943ef1be5264c3..d15fc2fd4495c706a4511e556852a22616b0e666 100644
--- a/test/Transforms/InstCombine/compare-unescaped.ll
+++ b/test/Transforms/InstCombine/compare-unescaped.ll
@@ -144,7 +144,7 @@ chk2:
   ret i8* %n
 ; CHECK-LABEL: compare_ret_escape
 ; CHECK: %cmp = icmp eq i8* %n, %c
-; CHECK: %cmp2 = icmp eq i32* %bc, %lgp
+; CHECK: %cmp2 = icmp eq i32* %lgp, %bc
 }
 
 ; The malloc call for %m cannot be elided since it is used in the call to function f.
diff --git a/test/Transforms/InstCombine/consecutive-fences.ll b/test/Transforms/InstCombine/consecutive-fences.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6f1c412773861c028333dcb85186e3356fa90a0d
--- /dev/null
+++ b/test/Transforms/InstCombine/consecutive-fences.ll
@@ -0,0 +1,47 @@
+; RUN: opt -instcombine -S %s | FileCheck %s
+
+; Make sure we collapse the fences in this case
+
+; CHECK-LABEL: define void @tinkywinky
+; CHECK-NEXT:   fence seq_cst
+; CHECK-NEXT:   fence singlethread acquire
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }
+
+define void @tinkywinky() {
+  fence seq_cst
+  fence seq_cst
+  fence seq_cst
+  fence singlethread acquire
+  fence singlethread acquire
+  fence singlethread acquire
+  ret void
+}
+
+; CHECK-LABEL: define void @dipsy
+; CHECK-NEXT:   fence seq_cst
+; CHECK-NEXT:   fence singlethread seq_cst
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }
+
+define void @dipsy() {
+  fence seq_cst
+  fence singlethread seq_cst
+  ret void
+}
+
+; CHECK-LABEL: define void @patatino
+; CHECK-NEXT:   fence acquire
+; CHECK-NEXT:   fence seq_cst
+; CHECK-NEXT:   fence acquire
+; CHECK-NEXT:   fence seq_cst
+; CHECK-NEXT:   ret void
+; CHECK-NEXT: }
+
+define void @patatino() {
+  fence acquire
+  fence seq_cst
+  fence acquire
+  fence seq_cst
+  ret void
+}
diff --git a/test/Transforms/InstCombine/deadcode.ll b/test/Transforms/InstCombine/deadcode.ll
index 8fe673d8c9c07a7ed791baf18d0dc9ead80312cf..c5fa58babdbc0e21c23512565a9a288c00ec62a0 100644
--- a/test/Transforms/InstCombine/deadcode.ll
+++ b/test/Transforms/InstCombine/deadcode.ll
@@ -22,12 +22,12 @@ define i32* @test2(i32 %width) {
 
 declare i8* @llvm.stacksave()
 
-declare void @llvm.lifetime.start(i64, i8*)
-declare void @llvm.lifetime.end(i64, i8*)
+declare void @llvm.lifetime.start.p0i8(i64, i8*)
+declare void @llvm.lifetime.end.p0i8(i64, i8*)
 
 define void @test3() {
-  call void @llvm.lifetime.start(i64 -1, i8* undef)
-  call void @llvm.lifetime.end(i64 -1, i8* undef)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* undef)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* undef)
   ret void
 }
 
diff --git a/test/Transforms/InstCombine/debuginfo-dce.ll b/test/Transforms/InstCombine/debuginfo-dce.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e23aef7334d591dcc0535bf9fbd6cd679d662e49
--- /dev/null
+++ b/test/Transforms/InstCombine/debuginfo-dce.ll
@@ -0,0 +1,106 @@
+; RUN: opt -instcombine %s -S -o - | FileCheck %s
+; Verify that the eliminated instructions (bitcast, gep, load) are salvaged into
+; a DIExpression.
+;
+; Originally created from the following C source and then heavily isolated/reduced.
+;
+; struct entry {
+;   struct entry *next;
+; };
+; void scan(struct entry *queue, struct entry *end)
+; {
+;   struct entry *entry;
+;   for (entry = (struct entry *)((char *)(queue->next) - 8);
+;        &entry->next == end;
+;        entry = (struct entry *)((char *)(entry->next) - 8)) {
+;   }
+; }
+
+; ModuleID = '<stdin>'
+source_filename = "test.c"
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12.0"
+
+%struct.entry = type { %struct.entry* }
+
+; Function Attrs: nounwind ssp uwtable
+define void @salvage_load(%struct.entry** %queue) local_unnamed_addr #0 !dbg !14 {
+entry:
+  %im_not_dead = alloca %struct.entry*
+  %0 = load %struct.entry*, %struct.entry** %queue, align 8, !dbg !19
+  %1 = load %struct.entry*, %struct.entry** %queue, align 8, !dbg !19
+  call void @llvm.dbg.value(metadata %struct.entry* %1, i64 0, metadata !18, metadata !20), !dbg !19
+; CHECK: define void @salvage_load
+; CHECK-NEXT: entry:
+; CHECK-NEXT: call void @llvm.dbg.value(metadata %struct.entry** %queue, i64 0,
+; CHECK-SAME:                           metadata ![[LOAD_EXPR:[0-9]+]])
+  store %struct.entry* %1, %struct.entry** %im_not_dead, align 8
+  ret void, !dbg !21
+}
+
+; Function Attrs: nounwind ssp uwtable
+define void @salvage_bitcast(%struct.entry* %queue) local_unnamed_addr #0 !dbg !14 {
+entry:
+  %im_not_dead = alloca i8*
+  %0 = bitcast %struct.entry* %queue to i8*, !dbg !19
+  %1 = bitcast %struct.entry* %queue to i8*, !dbg !19
+  call void @llvm.dbg.value(metadata i8* %1, i64 0, metadata !18, metadata !20), !dbg !19
+; CHECK: define void @salvage_bitcast
+; CHECK-NEXT: entry:
+; CHECK-NEXT: call void @llvm.dbg.value(metadata %struct.entry* %queue, i64 0,
+; CHECK-SAME:                           metadata ![[BITCAST_EXPR:[0-9]+]])
+  store i8* %1, i8** %im_not_dead, align 8
+  ret void, !dbg !21
+}
+
+; Function Attrs: nounwind ssp uwtable
+define void @salvage_gep(%struct.entry* %queue, %struct.entry* %end) local_unnamed_addr #0 !dbg !14 {
+entry:
+  %im_not_dead = alloca %struct.entry**
+  %0 = getelementptr inbounds %struct.entry, %struct.entry* %queue, i32 -1, i32 0, !dbg !19
+  %1 = getelementptr inbounds %struct.entry, %struct.entry* %queue, i32 -1, i32 0, !dbg !19
+  call void @llvm.dbg.value(metadata %struct.entry** %1, i64 0, metadata !18, metadata !20), !dbg !19
+; CHECK: define void @salvage_gep
+; CHECK-NEXT: entry:
+; CHECK-NEXT: call void @llvm.dbg.value(metadata %struct.entry* %queue, i64 0,
+; CHECK-SAME:                           metadata ![[GEP_EXPR:[0-9]+]])
+  store %struct.entry** %1, %struct.entry*** %im_not_dead, align 8
+  ret void, !dbg !21
+}
+
+; CHECK: ![[LOAD_EXPR]] = !DIExpression(DW_OP_deref, DW_OP_plus, 0)
+; CHECK: ![[BITCAST_EXPR]] = !DIExpression(DW_OP_plus, 0)
+; CHECK: ![[GEP_EXPR]] = !DIExpression(DW_OP_minus, 8, DW_OP_plus, 0)
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind ssp uwtable }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!10, !11, !12}
+!llvm.ident = !{!13}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (trunk 297628) (llvm/trunk 297643)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3)
+!1 = !DIFile(filename: "test.c", directory: "/")
+!2 = !{}
+!3 = !{!4, !8}
+!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !5, size: 64)
+!5 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "entry", file: !1, line: 1, size: 64, elements: !6)
+!6 = !{!7}
+!7 = !DIDerivedType(tag: DW_TAG_member, name: "next", scope: !5, file: !1, line: 2, baseType: !4, size: 64)
+!8 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !9, size: 64)
+!9 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char)
+!10 = !{i32 2, !"Dwarf Version", i32 4}
+!11 = !{i32 2, !"Debug Info Version", i32 3}
+!12 = !{i32 1, !"PIC Level", i32 2}
+!13 = !{!"clang version 5.0.0 (trunk 297628) (llvm/trunk 297643)"}
+!14 = distinct !DISubprogram(name: "scan", scope: !1, file: !1, line: 4, type: !15, isLocal: false, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !17)
+!15 = !DISubroutineType(types: !16)
+!16 = !{null, !4, !4}
+!17 = !{!18}
+!18 = !DILocalVariable(name: "entry", scope: !14, file: !1, line: 6, type: !4)
+!19 = !DILocation(line: 6, column: 17, scope: !14)
+!20 = !DIExpression(DW_OP_plus, 0)
+!21 = !DILocation(line: 11, column: 1, scope: !14)
diff --git a/test/Transforms/InstCombine/double-float-shrink-2.ll b/test/Transforms/InstCombine/double-float-shrink-2.ll
index 435bd0b0fc9b641575bc860c14dbdd95eb46ab84..4813614f26cbd67d2f4ae5484053b877e9c2fd10 100644
--- a/test/Transforms/InstCombine/double-float-shrink-2.ll
+++ b/test/Transforms/InstCombine/double-float-shrink-2.ll
@@ -1,32 +1,9 @@
-; RUN: opt < %s -instcombine -S -mtriple "i386-pc-linux" | FileCheck -check-prefix=DO-SIMPLIFY %s
-; RUN: opt < %s -instcombine -S -mtriple "i386-pc-win32" | FileCheck -check-prefix=DONT-SIMPLIFY %s
-; RUN: opt < %s -instcombine -S -mtriple "x86_64-pc-win32" | FileCheck -check-prefix=C89-SIMPLIFY %s
-; RUN: opt < %s -instcombine -S -mtriple "i386-pc-mingw32" | FileCheck -check-prefix=DO-SIMPLIFY %s
-; RUN: opt < %s -instcombine -S -mtriple "x86_64-pc-mingw32" | FileCheck -check-prefix=DO-SIMPLIFY %s
-; RUN: opt < %s -instcombine -S -mtriple "sparc-sun-solaris" | FileCheck -check-prefix=DO-SIMPLIFY %s
-
-; DO-SIMPLIFY: call float @llvm.floor.f32(
-; DO-SIMPLIFY: call float @llvm.ceil.f32(
-; DO-SIMPLIFY: call float @llvm.round.f32(
-; DO-SIMPLIFY: call float @llvm.nearbyint.f32(
-; DO-SIMPLIFY: call float @llvm.trunc.f32(
-; DO-SIMPLIFY: call float @llvm.fabs.f32(
-; DO-SIMPLIFY: call fast float @llvm.fabs.f32(
-
-; C89-SIMPLIFY: call float @llvm.floor.f32(
-; C89-SIMPLIFY: call float @llvm.ceil.f32(
-; C89-SIMPLIFY: call double @round(
-; C89-SIMPLIFY: call double @nearbyint(
-
-; DONT-SIMPLIFY: call float @llvm.floor.f32(
-; DONT-SIMPLIFY: call float @llvm.ceil.f32(
-; DONT-SIMPLIFY: call double @round(
-; DONT-SIMPLIFY: call double @nearbyint(
-; DONT-SIMPLIFY: call double @trunc(
-
-; This is replaced with the intrinsic, which does the right thing on
-; all platforms.
-; DONT-SIMPLIFY: call float @llvm.fabs.f32(
+; RUN: opt < %s -instcombine -S -mtriple "i386-pc-linux" | FileCheck -check-prefix=DO-SIMPLIFY -check-prefix=ALL %s
+; RUN: opt < %s -instcombine -S -mtriple "i386-pc-win32" | FileCheck -check-prefix=DONT-SIMPLIFY -check-prefix=ALL %s
+; RUN: opt < %s -instcombine -S -mtriple "x86_64-pc-win32" | FileCheck -check-prefix=C89-SIMPLIFY -check-prefix=ALL %s
+; RUN: opt < %s -instcombine -S -mtriple "i386-pc-mingw32" | FileCheck -check-prefix=DO-SIMPLIFY -check-prefix=ALL %s
+; RUN: opt < %s -instcombine -S -mtriple "x86_64-pc-mingw32" | FileCheck -check-prefix=DO-SIMPLIFY -check-prefix=ALL %s
+; RUN: opt < %s -instcombine -S -mtriple "sparc-sun-solaris" | FileCheck -check-prefix=DO-SIMPLIFY -check-prefix=ALL %s
 
 declare double @floor(double)
 declare double @ceil(double)
@@ -34,9 +11,19 @@ declare double @round(double)
 declare double @nearbyint(double)
 declare double @trunc(double)
 declare double @fabs(double)
+
+declare double @llvm.floor.f64(double)
+declare double @llvm.ceil.f64(double)
+declare double @llvm.round.f64(double)
+declare double @llvm.nearbyint.f64(double)
+declare double @llvm.trunc.f64(double)
 declare double @llvm.fabs.f64(double)
 
-define float @test_floor(float %C) {
+; ALL-LABEL: @test_shrink_libcall_floor(
+; DO-SIMPLIFY: call float @llvm.floor.f32(
+; C89-SIMPLIFY: call float @llvm.floor.f32(
+; DONT-SIMPLIFY: call float @llvm.floor.f32(
+define float @test_shrink_libcall_floor(float %C) {
   %D = fpext float %C to double
   ; --> floorf
   %E = call double @floor(double %D)
@@ -44,7 +31,11 @@ define float @test_floor(float %C) {
   ret float %F
 }
 
-define float @test_ceil(float %C) {
+; ALL-LABEL: @test_shrink_libcall_ceil(
+; DO-SIMPLIFY: call float @llvm.ceil.f32(
+; C89-SIMPLIFY: call float @llvm.ceil.f32(
+; DONT-SIMPLIFY: call float @llvm.ceil.f32(
+define float @test_shrink_libcall_ceil(float %C) {
   %D = fpext float %C to double
   ; --> ceilf
   %E = call double @ceil(double %D)
@@ -52,7 +43,11 @@ define float @test_ceil(float %C) {
   ret float %F
 }
 
-define float @test_round(float %C) {
+; ALL-LABEL: @test_shrink_libcall_round(
+; DO-SIMPLIFY: call float @llvm.round.f32(
+; C89-SIMPLIFY: call double @round(
+; DONT-SIMPLIFY: call double @round(
+define float @test_shrink_libcall_round(float %C) {
   %D = fpext float %C to double
   ; --> roundf
   %E = call double @round(double %D)
@@ -60,7 +55,11 @@ define float @test_round(float %C) {
   ret float %F
 }
 
-define float @test_nearbyint(float %C) {
+; ALL-LABEL: @test_shrink_libcall_nearbyint(
+; DO-SIMPLIFY: call float @llvm.nearbyint.f32(
+; C89-SIMPLIFY: call double @nearbyint(
+; DONT-SIMPLIFY: call double @nearbyint(
+define float @test_shrink_libcall_nearbyint(float %C) {
   %D = fpext float %C to double
   ; --> nearbyintf
   %E = call double @nearbyint(double %D)
@@ -68,7 +67,10 @@ define float @test_nearbyint(float %C) {
   ret float %F
 }
 
-define float @test_trunc(float %C) {
+; ALL-LABEL: @test_shrink_libcall_trunc(
+; DO-SIMPLIFY: call float @llvm.trunc.f32(
+; DONT-SIMPLIFY: call double @trunc(
+define float @test_shrink_libcall_trunc(float %C) {
   %D = fpext float %C to double
   ; --> truncf
   %E = call double @trunc(double %D)
@@ -76,7 +78,13 @@ define float @test_trunc(float %C) {
   ret float %F
 }
 
-define float @test_fabs(float %C) {
+; ALL-LABEL: @test_shrink_libcall_fabs(
+; DO-SIMPLIFY: call float @llvm.fabs.f32(
+
+; This is replaced with the intrinsic, which does the right thing on
+; all platforms.
+; DONT-SIMPLIFY: call float @llvm.fabs.f32(
+define float @test_shrink_libcall_fabs(float %C) {
   %D = fpext float %C to double
   ; --> fabsf
   %E = call double @fabs(double %D)
@@ -85,10 +93,371 @@ define float @test_fabs(float %C) {
 }
 
 ; Make sure fast math flags are preserved
-define float @test_fabs_fast(float %C) {
+; ALL-LABEL: @test_shrink_libcall_fabs_fast(
+; DO-SIMPLIFY: call fast float @llvm.fabs.f32(
+define float @test_shrink_libcall_fabs_fast(float %C) {
   %D = fpext float %C to double
   ; --> fabsf
   %E = call fast double @fabs(double %D)
   %F = fptrunc double %E to float
   ret float %F
 }
+
+; ALL-LABEL: @test_shrink_intrin_floor(
+; ALL: call float @llvm.floor.f32(
+define float @test_shrink_intrin_floor(float %C) {
+  %D = fpext float %C to double
+  ; --> floorf
+  %E = call double @llvm.floor.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_shrink_intrin_ceil(
+; ALL: call float @llvm.ceil.f32(
+define float @test_shrink_intrin_ceil(float %C) {
+  %D = fpext float %C to double
+  ; --> ceilf
+  %E = call double @llvm.ceil.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_shrink_intrin_round(
+; ALL: call float @llvm.round.f32(
+define float @test_shrink_intrin_round(float %C) {
+  %D = fpext float %C to double
+  ; --> roundf
+  %E = call double @llvm.round.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_shrink_intrin_nearbyint(
+; ALL: call float @llvm.nearbyint.f32(
+define float @test_shrink_intrin_nearbyint(float %C) {
+  %D = fpext float %C to double
+  ; --> nearbyintf
+  %E = call double @llvm.nearbyint.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_shrink_intrin_trunc(
+; ALL: call float @llvm.trunc.f32(
+define float @test_shrink_intrin_trunc(float %C) {
+  %D = fpext float %C to double
+  %E = call double @llvm.trunc.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_shrink_intrin_fabs(
+; ALL: call float @llvm.fabs.f32(
+define float @test_shrink_intrin_fabs(float %C) {
+  %D = fpext float %C to double
+  %E = call double @llvm.fabs.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; Make sure fast math flags are preserved
+; ALL-LABEL: @test_shrink_intrin_fabs_fast(
+; ALL: call fast float @llvm.fabs.f32(
+define float @test_shrink_intrin_fabs_fast(float %C) {
+  %D = fpext float %C to double
+  %E = call fast double @llvm.fabs.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_no_shrink_intrin_floor(
+; ALL: call double @llvm.floor.f64(
+define float @test_no_shrink_intrin_floor(double %D) {
+  %E = call double @llvm.floor.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_no_shrink_intrin_ceil(
+; ALL: call double @llvm.ceil.f64(
+define float @test_no_shrink_intrin_ceil(double %D) {
+  %E = call double @llvm.ceil.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_no_shrink_intrin_round(
+; ALL: call double @llvm.round.f64(
+define float @test_no_shrink_intrin_round(double %D) {
+  %E = call double @llvm.round.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_no_shrink_intrin_nearbyint(
+; ALL: call double @llvm.nearbyint.f64(
+define float @test_no_shrink_intrin_nearbyint(double %D) {
+  %E = call double @llvm.nearbyint.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_no_shrink_intrin_trunc(
+; ALL: call double @llvm.trunc.f64(
+define float @test_no_shrink_intrin_trunc(double %D) {
+  %E = call double @llvm.trunc.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_shrink_intrin_fabs_double_src(
+; ALL: call float @llvm.fabs.f32(
+define float @test_shrink_intrin_fabs_double_src(double %D) {
+  %E = call double @llvm.fabs.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; Make sure fast math flags are preserved
+; ALL-LABEL: @test_shrink_intrin_fabs_fast_double_src(
+; ALL: call fast float @llvm.fabs.f32(
+define float @test_shrink_intrin_fabs_fast_double_src(double %D) {
+  %E = call fast double @llvm.fabs.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_shrink_float_convertible_constant_intrin_floor(
+; ALL: ret float 2.000000e+00
+define float @test_shrink_float_convertible_constant_intrin_floor() {
+  %E = call double @llvm.floor.f64(double 2.1)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_shrink_float_convertible_constant_intrin_ceil(
+; ALL: ret float 3.000000e+00
+define float @test_shrink_float_convertible_constant_intrin_ceil() {
+  %E = call double @llvm.ceil.f64(double 2.1)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_shrink_float_convertible_constant_intrin_round(
+; ALL: ret float 2.000000e+00
+define float @test_shrink_float_convertible_constant_intrin_round() {
+  %E = call double @llvm.round.f64(double 2.1)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_shrink_float_convertible_constant_intrin_nearbyint(
+; ALL: ret float 2.000000e+00
+define float @test_shrink_float_convertible_constant_intrin_nearbyint() {
+  %E = call double @llvm.nearbyint.f64(double 2.1)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_shrink_float_convertible_constant_intrin_trunc(
+; ALL: ret float 2.000000e+00
+define float @test_shrink_float_convertible_constant_intrin_trunc() {
+  %E = call double @llvm.trunc.f64(double 2.1)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_shrink_float_convertible_constant_intrin_fabs(
+; ALL: ret float 0x4000CCCCC0000000
+define float @test_shrink_float_convertible_constant_intrin_fabs() {
+  %E = call double @llvm.fabs.f64(double 2.1)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; Make sure fast math flags are preserved
+; ALL-LABEL: @test_shrink_float_convertible_constant_intrin_fabs_fast(
+; ALL: ret float 0x4000CCCCC0000000
+define float @test_shrink_float_convertible_constant_intrin_fabs_fast() {
+  %E = call fast double @llvm.fabs.f64(double 2.1)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_no_shrink_mismatched_type_intrin_floor(
+; ALL-NEXT: %E = call double @llvm.floor.f64(double %D)
+; ALL-NEXT: %F = fptrunc double %E to half
+; ALL-NEXT: ret half %F
+define half @test_no_shrink_mismatched_type_intrin_floor(double %D) {
+  %E = call double @llvm.floor.f64(double %D)
+  %F = fptrunc double %E to half
+  ret half %F
+}
+
+; ALL-LABEL: @test_no_shrink_mismatched_type_intrin_ceil(
+; ALL-NEXT: %E = call double @llvm.ceil.f64(double %D)
+; ALL-NEXT: %F = fptrunc double %E to half
+; ALL-NEXT: ret half %F
+define half @test_no_shrink_mismatched_type_intrin_ceil(double %D) {
+  %E = call double @llvm.ceil.f64(double %D)
+  %F = fptrunc double %E to half
+  ret half %F
+}
+
+; ALL-LABEL: @test_no_shrink_mismatched_type_intrin_round(
+; ALL-NEXT: %E = call double @llvm.round.f64(double %D)
+; ALL-NEXT: %F = fptrunc double %E to half
+; ALL-NEXT: ret half %F
+define half @test_no_shrink_mismatched_type_intrin_round(double %D) {
+  %E = call double @llvm.round.f64(double %D)
+  %F = fptrunc double %E to half
+  ret half %F
+}
+
+; ALL-LABEL: @test_no_shrink_mismatched_type_intrin_nearbyint(
+; ALL-NEXT: %E = call double @llvm.nearbyint.f64(double %D)
+; ALL-NEXT: %F = fptrunc double %E to half
+; ALL-NEXT: ret half %F
+define half @test_no_shrink_mismatched_type_intrin_nearbyint(double %D) {
+  %E = call double @llvm.nearbyint.f64(double %D)
+  %F = fptrunc double %E to half
+  ret half %F
+}
+
+; ALL-LABEL: @test_no_shrink_mismatched_type_intrin_trunc(
+; ALL-NEXT: %E = call double @llvm.trunc.f64(double %D)
+; ALL-NEXT: %F = fptrunc double %E to half
+; ALL-NEXT: ret half %F
+define half @test_no_shrink_mismatched_type_intrin_trunc(double %D) {
+  %E = call double @llvm.trunc.f64(double %D)
+  %F = fptrunc double %E to half
+  ret half %F
+}
+
+; ALL-LABEL: @test_shrink_mismatched_type_intrin_fabs_double_src(
+; ALL-NEXT: %1 = fptrunc double %D to half
+; ALL-NEXT: %F = call half @llvm.fabs.f16(half %1)
+; ALL-NEXT: ret half %F
+define half @test_shrink_mismatched_type_intrin_fabs_double_src(double %D) {
+  %E = call double @llvm.fabs.f64(double %D)
+  %F = fptrunc double %E to half
+  ret half %F
+}
+
+; Make sure fast math flags are preserved
+; ALL-LABEL: @test_mismatched_type_intrin_fabs_fast_double_src(
+; ALL-NEXT: %1 = fptrunc double %D to half
+; ALL-NEXT: %F = call fast half @llvm.fabs.f16(half %1)
+; ALL-NEXT: ret half %F
+define half @test_mismatched_type_intrin_fabs_fast_double_src(double %D) {
+  %E = call fast double @llvm.fabs.f64(double %D)
+  %F = fptrunc double %E to half
+  ret half %F
+}
+
+; ALL-LABEL: @test_shrink_intrin_floor_fp16_src(
+; ALL-NEXT: %E = call half @llvm.floor.f16(half %C)
+; ALL-NEXT: %1 = fpext half %E to double
+; ALL-NEXT: %F = fptrunc double %1 to float
+define float @test_shrink_intrin_floor_fp16_src(half %C) {
+  %D = fpext half %C to double
+  %E = call double @llvm.floor.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_shrink_intrin_ceil_fp16_src(
+; ALL-NEXT: %E = call half @llvm.ceil.f16(half %C)
+; ALL-NEXT: %1 = fpext half %E to double
+; ALL-NEXT: %F = fptrunc double %1 to float
+; ALL-NEXT: ret float %F
+define float @test_shrink_intrin_ceil_fp16_src(half %C) {
+  %D = fpext half %C to double
+  %E = call double @llvm.ceil.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_shrink_intrin_round_fp16_src(
+; ALL-NEXT: %E = call half @llvm.round.f16(half %C)
+; ALL-NEXT: %1 = fpext half %E to double
+; ALL-NEXT: %F = fptrunc double %1 to float
+; ALL-NEXT: ret float %F
+define float @test_shrink_intrin_round_fp16_src(half %C) {
+  %D = fpext half %C to double
+  %E = call double @llvm.round.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_shrink_intrin_nearbyint_fp16_src(
+; ALL-NEXT: %E = call half @llvm.nearbyint.f16(half %C)
+; ALL-NEXT: %1 = fpext half %E to double
+; ALL-NEXT: %F = fptrunc double %1 to float
+; ALL-NEXT: ret float %F
+define float @test_shrink_intrin_nearbyint_fp16_src(half %C) {
+  %D = fpext half %C to double
+  %E = call double @llvm.nearbyint.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_shrink_intrin_trunc_fp16_src(
+; ALL-NEXT: %E = call half @llvm.trunc.f16(half %C)
+; ALL-NEXT: %1 = fpext half %E to double
+; ALL-NEXT: %F = fptrunc double %1 to float
+; ALL-NEXT: ret float %F
+define float @test_shrink_intrin_trunc_fp16_src(half %C) {
+  %D = fpext half %C to double
+  %E = call double @llvm.trunc.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_shrink_intrin_fabs_fp16_src(
+; ALL-NEXT: %E = call half @llvm.fabs.f16(half %C)
+; ALL-NEXT: %1 = fpext half %E to double
+; ALL-NEXT: %F = fptrunc double %1 to float
+; ALL-NEXT: ret float %F
+define float @test_shrink_intrin_fabs_fp16_src(half %C) {
+  %D = fpext half %C to double
+  %E = call double @llvm.fabs.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; Make sure fast math flags are preserved
+; ALL-LABEL: @test_shrink_intrin_fabs_fast_fp16_src(
+; ALL-NEXT: %E = call fast half @llvm.fabs.f16(half %C)
+; ALL-NEXT: %1 = fpext half %E to double
+; ALL-NEXT: %F = fptrunc double %1 to float
+; ALL-NEXT: ret float %F
+define float @test_shrink_intrin_fabs_fast_fp16_src(half %C) {
+  %D = fpext half %C to double
+  %E = call fast double @llvm.fabs.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_no_shrink_intrin_floor_multi_use_fpext(
+; ALL: %D = fpext half %C to double
+; ALL: call double @llvm.floor.f64
+define float @test_no_shrink_intrin_floor_multi_use_fpext(half %C) {
+  %D = fpext half %C to double
+  store volatile double %D, double* undef
+  %E = call double @llvm.floor.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
+
+; ALL-LABEL: @test_no_shrink_intrin_fabs_multi_use_fpext(
+; ALL: %D = fpext half %C to double
+; ALL: call double @llvm.fabs.f64
+define float @test_no_shrink_intrin_fabs_multi_use_fpext(half %C) {
+  %D = fpext half %C to double
+  store volatile double %D, double* undef
+  %E = call double @llvm.fabs.f64(double %D)
+  %F = fptrunc double %E to float
+  ret float %F
+}
diff --git a/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll b/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll
new file mode 100644
index 0000000000000000000000000000000000000000..107440f10a5a22623fb206ca17d6c91815c5a839
--- /dev/null
+++ b/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll
@@ -0,0 +1,92 @@
+; RUN: opt -instcombine -unfold-element-atomic-memcpy-max-elements=8 -S < %s | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Test basic unfolding
+define void @test1(i8* %Src, i8* %Dst) {
+; CHECK-LABEL: test1
+; CHECK-NOT: llvm.memcpy.element.atomic
+
+; CHECK-DAG: %memcpy_unfold.src_casted = bitcast i8* %Src to i32*
+; CHECK-DAG: %memcpy_unfold.dst_casted = bitcast i8* %Dst to i32*
+
+; CHECK-DAG: [[VAL1:%[^\s]+]] =  load atomic i32, i32* %memcpy_unfold.src_casted unordered, align 4
+; CHECK-DAG: store atomic i32 [[VAL1]], i32* %memcpy_unfold.dst_casted unordered, align 8
+
+; CHECK-DAG: [[VAL2:%[^\s]+]] =  load atomic i32, i32* %{{[^\s]+}} unordered, align 4
+; CHECK-DAG: store atomic i32 [[VAL2]], i32* %{{[^\s]+}} unordered, align 4
+
+; CHECK-DAG: [[VAL3:%[^\s]+]] =  load atomic i32, i32* %{{[^\s]+}} unordered, align 4
+; CHECK-DAG: store atomic i32 [[VAL3]], i32* %{{[^\s]+}} unordered, align 4
+
+; CHECK-DAG: [[VAL4:%[^\s]+]] =  load atomic i32, i32* %{{[^\s]+}} unordered, align 4
+; CHECK-DAG: store atomic i32 [[VAL4]], i32* %{{[^\s]+}} unordered, align 4
+entry:
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 8 %Src, i64 4, i32 4)
+  ret void
+}
+
+; Test that we don't unfold too much
+define void @test2(i8* %Src, i8* %Dst) {
+; CHECK-LABEL: test2
+
+; CHECK-NOT: load
+; CHECK-NOT: store
+; CHECK: llvm.memcpy.element.atomic
+entry:
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 4 %Dst, i8* align 4 %Src, i64 1000, i32 4)
+  ret void
+}
+
+; Test that we will not unfold into non native integers
+define void @test3(i8* %Src, i8* %Dst) {
+; CHECK-LABEL: test3
+
+; CHECK-NOT: load
+; CHECK-NOT: store
+; CHECK: llvm.memcpy.element.atomic
+entry:
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 4, i32 64)
+  ret void
+}
+
+; Test that we will eliminate redundant bitcasts
+define void @test4(i64* %Src, i64* %Dst) {
+; CHECK-LABEL: test4
+; CHECK-NOT: llvm.memcpy.element.atomic
+
+; CHECK-NOT: bitcast
+
+; CHECK-DAG: [[VAL1:%[^\s]+]] =  load atomic i64, i64* %Src unordered, align 16
+; CHECK-DAG: store atomic i64 [[VAL1]], i64* %Dst unordered, align 16
+
+; CHECK-DAG: [[SRC_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Src, i64 1
+; CHECK-DAG: [[DST_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 1
+; CHECK-DAG: [[VAL2:%[^\s]+]] =  load atomic i64, i64* [[SRC_ADDR2]] unordered, align 8
+; CHECK-DAG: store atomic i64 [[VAL2]], i64* [[DST_ADDR2]] unordered, align 8
+
+; CHECK-DAG: [[SRC_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Src, i64 2
+; CHECK-DAG: [[DST_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 2
+; CHECK-DAG: [[VAL3:%[^ ]+]] =  load atomic i64, i64* [[SRC_ADDR3]] unordered, align 8
+; CHECK-DAG: store atomic i64 [[VAL3]], i64* [[DST_ADDR3]] unordered, align 8
+
+; CHECK-DAG: [[SRC_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Src, i64 3
+; CHECK-DAG: [[DST_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 3
+; CHECK-DAG: [[VAL4:%[^ ]+]] =  load atomic i64, i64* [[SRC_ADDR4]] unordered, align 8
+; CHECK-DAG: store atomic i64 [[VAL4]], i64* [[DST_ADDR4]] unordered, align 8
+entry:
+  %Src.casted = bitcast i64* %Src to i8*
+  %Dst.casted = bitcast i64* %Dst to i8*
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 16 %Dst.casted, i8* align 16 %Src.casted, i64 4, i32 8)
+  ret void
+}
+
+define void @test5(i8* %Src, i8* %Dst) {
+; CHECK-LABEL: test5
+
+; CHECK-NOT: llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 0, i32 64)
+entry:
+  call void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* align 64 %Dst, i8* align 64 %Src, i64 0, i32 64)
+  ret void
+}
+
+declare void @llvm.memcpy.element.atomic.p0i8.p0i8(i8* nocapture, i8* nocapture, i64, i32)
diff --git a/test/Transforms/InstCombine/fast-math.ll b/test/Transforms/InstCombine/fast-math.ll
index ad8a9247e4e1d83c81df68f8f3a70376b896ac5e..6ddf3a58529f47da2150bdea7ecf8bfe47bfd3c5 100644
--- a/test/Transforms/InstCombine/fast-math.ll
+++ b/test/Transforms/InstCombine/fast-math.ll
@@ -831,3 +831,26 @@ define fp128 @min4(fp128 %a, fp128 %b) {
 ; CHECK-NEXT:  select {{.*}} fp128 %a, fp128 %b 
 ; CHECK-NEXT:  ret
 }
+
+define float @test55(i1 %which, float %a) {
+; CHECK-LABEL: @test55(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]]
+; CHECK:       delay:
+; CHECK-NEXT:    [[PHITMP:%.*]] = fadd fast float [[A:%.*]], 1.000000e+00
+; CHECK-NEXT:    br label [[FINAL]]
+; CHECK:       final:
+; CHECK-NEXT:    [[PHI:%.*]] = phi float [ 3.000000e+00, [[ENTRY:%.*]] ], [ [[PHITMP]], [[DELAY]] ]
+; CHECK-NEXT:    ret float [[PHI]]
+;
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+  br label %final
+
+final:
+  %A = phi float [ 2.0, %entry ], [ %a, %delay ]
+  %value = fadd fast float %A, 1.0
+  ret float %value
+}
diff --git a/test/Transforms/InstCombine/fcmp.ll b/test/Transforms/InstCombine/fcmp.ll
index 7fd46f2281832ce13876c4fe7b6eabd4eea6d33e..40f7bf9b64fa8eda52ce43347f8aa04626b5b369 100644
--- a/test/Transforms/InstCombine/fcmp.ll
+++ b/test/Transforms/InstCombine/fcmp.ll
@@ -3,238 +3,291 @@
 declare double @llvm.fabs.f64(double) nounwind readnone
 
 define i1 @test1(float %x, float %y) nounwind {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float %x, %y
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
   %ext1 = fpext float %x to double
   %ext2 = fpext float %y to double
   %cmp = fcmp ogt double %ext1, %ext2
   ret i1 %cmp
-; CHECK-LABEL: @test1(
-; CHECK-NEXT: fcmp ogt float %x, %y
 }
 
 define i1 @test2(float %a) nounwind {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float %a, 1.000000e+00
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
   %ext = fpext float %a to double
   %cmp = fcmp ogt double %ext, 1.000000e+00
   ret i1 %cmp
-; CHECK-LABEL: @test2(
-; CHECK-NEXT: fcmp ogt float %a, 1.0
 }
 
 define i1 @test3(float %a) nounwind {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[EXT:%.*]] = fpext float %a to double
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt double [[EXT]], 0x3FF0000000000001
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
   %ext = fpext float %a to double
   %cmp = fcmp ogt double %ext, 0x3FF0000000000001 ; more precision than float.
   ret i1 %cmp
-; CHECK-LABEL: @test3(
-; CHECK-NEXT: fpext float %a to double
 }
 
 define i1 @test4(float %a) nounwind {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[EXT:%.*]] = fpext float %a to double
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt double [[EXT]], 0x36A0000000000000
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
   %ext = fpext float %a to double
   %cmp = fcmp ogt double %ext, 0x36A0000000000000 ; denormal in float.
   ret i1 %cmp
-; CHECK-LABEL: @test4(
-; CHECK-NEXT: fpext float %a to double
 }
 
 define i1 @test5(float %a) nounwind {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt float %a, -1.000000e+00
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
   %neg = fsub float -0.000000e+00, %a
   %cmp = fcmp ogt float %neg, 1.000000e+00
   ret i1 %cmp
-; CHECK-LABEL: @test5(
-; CHECK-NEXT: fcmp olt float %a, -1.0
 }
 
 define i1 @test6(float %x, float %y) nounwind {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float %x, %y
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
   %neg1 = fsub float -0.000000e+00, %x
   %neg2 = fsub float -0.000000e+00, %y
   %cmp = fcmp olt float %neg1, %neg2
   ret i1 %cmp
-; CHECK-LABEL: @test6(
-; CHECK-NEXT: fcmp ogt float %x, %y
 }
 
 define i1 @test7(float %x) nounwind readnone ssp noredzone {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float %x, 0.000000e+00
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
   %ext = fpext float %x to ppc_fp128
   %cmp = fcmp ogt ppc_fp128 %ext, 0xM00000000000000000000000000000000
   ret i1 %cmp
-; CHECK-LABEL: @test7(
-; CHECK-NEXT: fcmp ogt float %x, 0.000000e+00
 }
 
 define float @test8(float %x) nounwind readnone optsize ssp {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp olt float %x, 0.000000e+00
+; CHECK-NEXT:    [[CONV2:%.*]] = uitofp i1 [[CMP]] to float
+; CHECK-NEXT:    ret float [[CONV2]]
+;
   %conv = fpext float %x to double
   %cmp = fcmp olt double %conv, 0.000000e+00
   %conv1 = zext i1 %cmp to i32
   %conv2 = sitofp i32 %conv1 to float
   ret float %conv2
 ; Float comparison to zero shouldn't cast to double.
-; CHECK-LABEL: @test8(
-; CHECK-NEXT: fcmp olt float %x, 0.000000e+00
 }
 
 declare double @fabs(double) nounwind readnone
 
 define i32 @test9(double %a) nounwind {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:    ret i32 0
+;
   %call = tail call double @fabs(double %a) nounwind
   %cmp = fcmp olt double %call, 0.000000e+00
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: @test9(
-; CHECK-NOT: fabs
-; CHECK: ret i32 0
 }
 
 define i32 @test9_intrinsic(double %a) nounwind {
+; CHECK-LABEL: @test9_intrinsic(
+; CHECK-NEXT:    ret i32 0
+;
   %call = tail call double @llvm.fabs.f64(double %a) nounwind
   %cmp = fcmp olt double %call, 0.000000e+00
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: @test9_intrinsic(
-; CHECK-NOT: fabs
-; CHECK: ret i32 0
 }
 
 define i32 @test10(double %a) nounwind {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double %a, 0.000000e+00
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
   %call = tail call double @fabs(double %a) nounwind
   %cmp = fcmp ole double %call, 0.000000e+00
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: @test10(
-; CHECK-NOT: fabs
-; CHECK: fcmp oeq double %a, 0.000000e+00
 }
 
 define i32 @test10_intrinsic(double %a) nounwind {
+; CHECK-LABEL: @test10_intrinsic(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double %a, 0.000000e+00
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
   %call = tail call double @llvm.fabs.f64(double %a) nounwind
   %cmp = fcmp ole double %call, 0.000000e+00
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: @test10_intrinsic(
-; CHECK-NOT: fabs
-; CHECK: fcmp oeq double %a, 0.000000e+00
 }
 
 define i32 @test11(double %a) nounwind {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double %a, 0.000000e+00
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
   %call = tail call double @fabs(double %a) nounwind
   %cmp = fcmp ogt double %call, 0.000000e+00
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: @test11(
-; CHECK-NOT: fabs
-; CHECK: fcmp one double %a, 0.000000e+00
 }
 
 define i32 @test11_intrinsic(double %a) nounwind {
+; CHECK-LABEL: @test11_intrinsic(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double %a, 0.000000e+00
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
   %call = tail call double @llvm.fabs.f64(double %a) nounwind
   %cmp = fcmp ogt double %call, 0.000000e+00
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: @test11_intrinsic(
-; CHECK-NOT: fabs
-; CHECK: fcmp one double %a, 0.000000e+00
 }
 
 define i32 @test12(double %a) nounwind {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ord double %a, 0.000000e+00
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
   %call = tail call double @fabs(double %a) nounwind
   %cmp = fcmp oge double %call, 0.000000e+00
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: @test12(
-; CHECK-NOT: fabs
-; CHECK: fcmp ord double %a, 0.000000e+00
 }
 
 define i32 @test12_intrinsic(double %a) nounwind {
+; CHECK-LABEL: @test12_intrinsic(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ord double %a, 0.000000e+00
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
   %call = tail call double @llvm.fabs.f64(double %a) nounwind
   %cmp = fcmp oge double %call, 0.000000e+00
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: @test12_intrinsic(
-; CHECK-NOT: fabs
-; CHECK: fcmp ord double %a, 0.000000e+00
 }
 
 define i32 @test13(double %a) nounwind {
+; CHECK-LABEL: @test13(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double %a, 0.000000e+00
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
   %call = tail call double @fabs(double %a) nounwind
   %cmp = fcmp une double %call, 0.000000e+00
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: @test13(
-; CHECK-NOT: fabs
-; CHECK: fcmp une double %a, 0.000000e+00
 }
 
 define i32 @test13_intrinsic(double %a) nounwind {
+; CHECK-LABEL: @test13_intrinsic(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double %a, 0.000000e+00
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
   %call = tail call double @llvm.fabs.f64(double %a) nounwind
   %cmp = fcmp une double %call, 0.000000e+00
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: @test13_intrinsic(
-; CHECK-NOT: fabs
-; CHECK: fcmp une double %a, 0.000000e+00
 }
 
 define i32 @test14(double %a) nounwind {
+; CHECK-LABEL: @test14(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double %a, 0.000000e+00
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
   %call = tail call double @fabs(double %a) nounwind
   %cmp = fcmp oeq double %call, 0.000000e+00
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: @test14(
-; CHECK-NOT: fabs
-; CHECK: fcmp oeq double %a, 0.000000e+00
 }
 
 define i32 @test14_intrinsic(double %a) nounwind {
+; CHECK-LABEL: @test14_intrinsic(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double %a, 0.000000e+00
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
   %call = tail call double @llvm.fabs.f64(double %a) nounwind
   %cmp = fcmp oeq double %call, 0.000000e+00
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: @test14_intrinsic(
-; CHECK-NOT: fabs
-; CHECK: fcmp oeq double %a, 0.000000e+00
 }
 
 define i32 @test15(double %a) nounwind {
+; CHECK-LABEL: @test15(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double %a, 0.000000e+00
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
   %call = tail call double @fabs(double %a) nounwind
   %cmp = fcmp one double %call, 0.000000e+00
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: @test15(
-; CHECK-NOT: fabs
-; CHECK: fcmp one double %a, 0.000000e+00
 }
 
 define i32 @test15_intrinsic(double %a) nounwind {
+; CHECK-LABEL: @test15_intrinsic(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp one double %a, 0.000000e+00
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
   %call = tail call double @llvm.fabs.f64(double %a) nounwind
   %cmp = fcmp one double %call, 0.000000e+00
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: @test15_intrinsic(
-; CHECK-NOT: fabs
-; CHECK: fcmp one double %a, 0.000000e+00
 }
 
 define i32 @test16(double %a) nounwind {
+; CHECK-LABEL: @test16(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ueq double %a, 0.000000e+00
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
   %call = tail call double @fabs(double %a) nounwind
   %cmp = fcmp ueq double %call, 0.000000e+00
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: @test16(
-; CHECK-NOT: fabs
-; CHECK: fcmp ueq double %a, 0.000000e+00
 }
 
 define i32 @test16_intrinsic(double %a) nounwind {
+; CHECK-LABEL: @test16_intrinsic(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ueq double %a, 0.000000e+00
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
   %call = tail call double @llvm.fabs.f64(double %a) nounwind
   %cmp = fcmp ueq double %call, 0.000000e+00
   %conv = zext i1 %cmp to i32
   ret i32 %conv
-; CHECK-LABEL: @test16_intrinsic(
-; CHECK-NOT: fabs
-; CHECK: fcmp ueq double %a, 0.000000e+00
 }
 
 ; Don't crash.
 define i32 @test17(double %a, double (double)* %p) nounwind {
+; CHECK-LABEL: @test17(
+; CHECK-NEXT:    [[CALL:%.*]] = tail call double %p(double %a) #1
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ueq double [[CALL]], 0.000000e+00
+; CHECK-NEXT:    [[CONV:%.*]] = zext i1 [[CMP]] to i32
+; CHECK-NEXT:    ret i32 [[CONV]]
+;
   %call = tail call double %p(double %a) nounwind
   %cmp = fcmp ueq double %call, 0.000000e+00
   %conv = zext i1 %cmp to i32
@@ -243,16 +296,18 @@ define i32 @test17(double %a, double (double)* %p) nounwind {
 
 ; Can fold fcmp with undef on one side by choosing NaN for the undef
 define i32 @test18_undef_unordered(float %a) nounwind {
-; CHECK-LABEL: @test18_undef_unordered
-; CHECK: ret i32 1
+; CHECK-LABEL: @test18_undef_unordered(
+; CHECK-NEXT:    ret i32 1
+;
   %cmp = fcmp ueq float %a, undef
   %conv = zext i1 %cmp to i32
   ret i32 %conv
 }
 ; Can fold fcmp with undef on one side by choosing NaN for the undef
 define i32 @test18_undef_ordered(float %a) nounwind {
-; CHECK-LABEL: @test18_undef_ordered
-; CHECK: ret i32 0
+; CHECK-LABEL: @test18_undef_ordered(
+; CHECK-NEXT:    ret i32 0
+;
   %cmp = fcmp oeq float %a, undef
   %conv = zext i1 %cmp to i32
   ret i32 %conv
@@ -264,14 +319,18 @@ define i32 @test18_undef_ordered(float %a) nounwind {
 ; because whatever you choose for the first undef
 ; you can choose NaN for the other undef
 define i1 @test19_undef_unordered() nounwind {
-; CHECK-LABEL: @test19_undef
-; CHECK: ret i1 true
+; CHECK-LABEL: @test19_undef_unordered(
+; CHECK-NEXT:    ret i1 true
+;
   %cmp = fcmp ueq float undef, undef
   ret i1 %cmp
 }
+
 define i1 @test19_undef_ordered() nounwind {
-; CHECK-LABEL: @test19_undef
-; CHECK: ret i1 false
+; CHECK-LABEL: @test19_undef_ordered(
+; CHECK-NEXT:    ret i1 false
+;
   %cmp = fcmp oeq float undef, undef
   ret i1 %cmp
 }
+
diff --git a/test/Transforms/InstCombine/float-shrink-compare.ll b/test/Transforms/InstCombine/float-shrink-compare.ll
index a98f4cd1cb422019d98c7c9ab3e539879e5564c2..e0925952bf44d533463c9e7b040a244dd74dc5f3 100644
--- a/test/Transforms/InstCombine/float-shrink-compare.ll
+++ b/test/Transforms/InstCombine/float-shrink-compare.ll
@@ -119,7 +119,7 @@ define i32 @test5(float %x, float %y) nounwind uwtable {
   %cmp.ext = zext i1 %cmp to i32
   ret i32 %cmp.ext
 ; CHECK-LABEL: @test5(
-; CHECK-NEXT: %rint = call float @llvm.nearbyint.f32(float %x)
+; CHECK-NEXT: %rint = call float @llvm.rint.f32(float %x)
 ; CHECK-NEXT: fcmp oeq float %rint, %y
 }
 
@@ -276,7 +276,7 @@ define i32 @test12(float %x, float %y) nounwind uwtable {
   %cmp.ext = zext i1 %cmp to i32
   ret i32 %cmp.ext
 ; CHECK-LABEL: @test12(
-; CHECK-NEXT: %rint = call float @llvm.nearbyint.f32(float %x)
+; CHECK-NEXT: %rint = call float @llvm.rint.f32(float %x)
 ; CHECK-NEXT: fcmp oeq float %rint, %y
 }
 
diff --git a/test/Transforms/InstCombine/fma.ll b/test/Transforms/InstCombine/fma.ll
index e41f1e7edd4600cf1911f60e394cd9c497884d12..3808e07d89a0e33f1184112855bb57c13848911a 100644
--- a/test/Transforms/InstCombine/fma.ll
+++ b/test/Transforms/InstCombine/fma.ll
@@ -78,7 +78,8 @@ define float @fmuladd_fneg_x_fneg_y(float %x, float %y, float %z) {
 }
 
 ; CHECK-LABEL: @fmuladd_fneg_x_fneg_y_fast(
-; CHECK: %fmuladd = call fast float @llvm.fmuladd.f32(float %x, float %y, float %z)
+; CHECK-NEXT: %1 = fmul fast float %x, %y
+; CHECK-NEXT: %fmuladd = fadd fast float %1, %z
 define float @fmuladd_fneg_x_fneg_y_fast(float %x, float %y, float %z) {
   %x.fneg = fsub float -0.0, %x
   %y.fneg = fsub float -0.0, %y
@@ -122,7 +123,8 @@ define float @fmuladd_fabs_x_fabs_x(float %x, float %z) {
 }
 
 ; CHECK-LABEL: @fmuladd_fabs_x_fabs_x_fast(
-; CHECK: %fmuladd = call fast float @llvm.fmuladd.f32(float %x, float %x, float %z)
+; CHECK-NEXT: %1 = fmul fast float %x, %x
+; CHECK-NEXT: %fmuladd = fadd fast float %1, %z
 define float @fmuladd_fabs_x_fabs_x_fast(float %x, float %z) {
   %x.fabs = call float @llvm.fabs.f32(float %x)
   %fmuladd = call fast float @llvm.fmuladd.f32(float %x.fabs, float %x.fabs, float %z)
@@ -144,7 +146,8 @@ define float @fma_k_y_z_fast(float %y, float %z) {
 }
 
 ; CHECK-LABEL: @fmuladd_k_y_z_fast(
-; CHECK: %fmuladd = call fast float @llvm.fmuladd.f32(float %y, float 4.000000e+00, float %z)
+; CHECK-NEXT: %1 = fmul fast float %y, 4.000000e+00
+; CHECK-NEXT: %fmuladd = fadd fast float %1, %z
 define float @fmuladd_k_y_z_fast(float %y, float %z) {
   %fmuladd = call fast float @llvm.fmuladd.f32(float 4.0, float %y, float %z)
   ret float %fmuladd
diff --git a/test/Transforms/InstCombine/getelementptr.ll b/test/Transforms/InstCombine/getelementptr.ll
index 6e2ac92b93c57b93483b47d3d09b65aa4f31c9dc..de8190da01c22d961abdd24709d65e537bdb7519 100644
--- a/test/Transforms/InstCombine/getelementptr.ll
+++ b/test/Transforms/InstCombine/getelementptr.ll
@@ -931,4 +931,15 @@ define i32 addrspace(1)* @ascast_0_0_gep([128 x i32]* %p) nounwind {
   ret i32 addrspace(1)* %x
 }
 
+define <2 x i32*> @PR32414(i32** %ptr) {
+; CHECK-LABEL: @PR32414(
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32** %ptr to i32*
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* [[TMP0]], <2 x i64> <i64 0, i64 1>
+; CHECK-NEXT:    ret <2 x i32*> [[TMP1]]
+;
+  %tmp0 = bitcast i32** %ptr to i32*
+  %tmp1 = getelementptr inbounds i32, i32* %tmp0, <2 x i64> <i64 0, i64 1>
+  ret <2 x i32*> %tmp1
+}
+
 ; CHECK: attributes [[NUW]] = { nounwind }
diff --git a/test/Transforms/InstCombine/icmp-add.ll b/test/Transforms/InstCombine/icmp-add.ll
new file mode 100644
index 0000000000000000000000000000000000000000..efeb9d5bb45baef0ab8fedb9a42f242de88a69fa
--- /dev/null
+++ b/test/Transforms/InstCombine/icmp-add.ll
@@ -0,0 +1,247 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+; PR1949
+
+define i1 @test1(i32 %a) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    [[C:%.*]] = icmp ugt i32 %a, -5
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = add i32 %a, 4
+  %c = icmp ult i32 %b, 4
+  ret i1 %c
+}
+
+define <2 x i1> @test1vec(<2 x i32> %a) {
+; CHECK-LABEL: @test1vec(
+; CHECK-NEXT:    [[C:%.*]] = icmp ugt <2 x i32> %a, <i32 -5, i32 -5>
+; CHECK-NEXT:    ret <2 x i1> [[C]]
+;
+  %b = add <2 x i32> %a, <i32 4, i32 4>
+  %c = icmp ult <2 x i32> %b, <i32 4, i32 4>
+  ret <2 x i1> %c
+}
+
+define i1 @test2(i32 %a) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    [[C:%.*]] = icmp ult i32 %a, 4
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = sub i32 %a, 4
+  %c = icmp ugt i32 %b, -5
+  ret i1 %c
+}
+
+define <2 x i1> @test2vec(<2 x i32> %a) {
+; CHECK-LABEL: @test2vec(
+; CHECK-NEXT:    [[C:%.*]] = icmp ult <2 x i32> %a, <i32 4, i32 4>
+; CHECK-NEXT:    ret <2 x i1> [[C]]
+;
+  %b = sub <2 x i32> %a, <i32 4, i32 4>
+  %c = icmp ugt <2 x i32> %b, <i32 -5, i32 -5>
+  ret <2 x i1> %c
+}
+
+define i1 @test3(i32 %a) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt i32 %a, 2147483643
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = add i32 %a, 4
+  %c = icmp slt i32 %b, 2147483652
+  ret i1 %c
+}
+
+define <2 x i1> @test3vec(<2 x i32> %a) {
+; CHECK-LABEL: @test3vec(
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt <2 x i32> %a, <i32 2147483643, i32 2147483643>
+; CHECK-NEXT:    ret <2 x i1> [[C]]
+;
+  %b = add <2 x i32> %a, <i32 4, i32 4>
+  %c = icmp slt <2 x i32> %b, <i32 2147483652, i32 2147483652>
+  ret <2 x i1> %c
+}
+
+define i1 @test4(i32 %a) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i32 %a, -4
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = add i32 %a, 2147483652
+  %c = icmp sge i32 %b, 4
+  ret i1 %c
+}
+
+define <2 x i1> @test4vec(<2 x i32> %a) {
+; CHECK-LABEL: @test4vec(
+; CHECK-NEXT:    [[C:%.*]] = icmp slt <2 x i32> %a, <i32 -4, i32 -4>
+; CHECK-NEXT:    ret <2 x i1> [[C]]
+;
+  %b = add <2 x i32> %a, <i32 2147483652, i32 2147483652>
+  %c = icmp sge <2 x i32> %b, <i32 4, i32 4>
+  ret <2 x i1> %c
+}
+
+; icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2), when C - C2 does not overflow.
+; This becomes equality because it's at the limit.
+
+define i1 @nsw_slt1(i8 %a) {
+; CHECK-LABEL: @nsw_slt1(
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 %a, -128
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = add nsw i8 %a, 100
+  %c = icmp slt i8 %b, -27
+  ret i1 %c
+}
+
+; icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2), when C - C2 does not overflow.
+; This becomes an equality comparison ('ne') because it's at the limit.
+
+define i1 @nsw_slt2(i8 %a) {
+; CHECK-LABEL: @nsw_slt2(
+; CHECK-NEXT:    [[C:%.*]] = icmp ne i8 %a, 127
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = add nsw i8 %a, -100
+  %c = icmp slt i8 %b, 27
+  ret i1 %c
+}
+
+; icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2), when C - C2 does not overflow.
+; Less than the limit, so the predicate doesn't change.
+
+define i1 @nsw_slt3(i8 %a) {
+; CHECK-LABEL: @nsw_slt3(
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i8 %a, -126
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = add nsw i8 %a, 100
+  %c = icmp slt i8 %b, -26
+  ret i1 %c
+}
+
+; icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2), when C - C2 does not overflow.
+; Less than the limit, so the predicate doesn't change.
+
+define i1 @nsw_slt4(i8 %a) {
+; CHECK-LABEL: @nsw_slt4(
+; CHECK-NEXT:    [[C:%.*]] = icmp slt i8 %a, 126
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = add nsw i8 %a, -100
+  %c = icmp slt i8 %b, 26
+  ret i1 %c
+}
+
+; icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2), when C - C2 does not overflow.
+; Try sgt to make sure that works too.
+
+define i1 @nsw_sgt1(i8 %a) {
+; CHECK-LABEL: @nsw_sgt1(
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i8 %a, 127
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %b = add nsw i8 %a, -100
+  %c = icmp sgt i8 %b, 26
+  ret i1 %c
+}
+
+; icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2), when C - C2 does not overflow.
+; Try a vector type to make sure that works too.
+; The constant is not at the limit here, so (unlike nsw_sgt1) the predicate doesn't change.
+
+define <2 x i1> @nsw_sgt2_splat_vec(<2 x i8> %a) {
+; CHECK-LABEL: @nsw_sgt2_splat_vec(
+; CHECK-NEXT:    [[C:%.*]] = icmp sgt <2 x i8> %a, <i8 -126, i8 -126>
+; CHECK-NEXT:    ret <2 x i1> [[C]]
+;
+  %b = add nsw <2 x i8> %a, <i8 100, i8 100>
+  %c = icmp sgt <2 x i8> %b, <i8 -26, i8 -26>
+  ret <2 x i1> %c
+}
+
+; icmp Pred (add nsw X, C2), C --> icmp Pred X, (C - C2), when C - C2 does not overflow.
+; Comparison with 0 doesn't need special-casing.
+
+define i1 @slt_zero_add_nsw(i32 %a) {
+; CHECK-LABEL: @slt_zero_add_nsw(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 %a, -1
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
+  %add = add nsw i32 %a, 1
+  %cmp = icmp slt i32 %add, 0
+  ret i1 %cmp
+}
+
+; The same fold should work with vectors.
+
+define <2 x i1> @slt_zero_add_nsw_splat_vec(<2 x i8> %a) {
+; CHECK-LABEL: @slt_zero_add_nsw_splat_vec(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <2 x i8> %a, <i8 -1, i8 -1>
+; CHECK-NEXT:    ret <2 x i1> [[CMP]]
+;
+  %add = add nsw <2 x i8> %a, <i8 1, i8 1>
+  %cmp = icmp slt <2 x i8> %add, zeroinitializer
+  ret <2 x i1> %cmp
+}
+
+; Test the edges - instcombine should not interfere with simplification to constants.
+; Constant subtraction does not overflow, but this is false.
+
+define i1 @nsw_slt3_ov_no(i8 %a) {
+; CHECK-LABEL: @nsw_slt3_ov_no(
+; CHECK-NEXT:    ret i1 false
+;
+  %b = add nsw i8 %a, 100
+  %c = icmp slt i8 %b, -28
+  ret i1 %c
+}
+
+; Test the edges - instcombine should not interfere with simplification to constants.
+; Constant subtraction overflows. This is false.
+
+define i1 @nsw_slt4_ov(i8 %a) {
+; CHECK-LABEL: @nsw_slt4_ov(
+; CHECK-NEXT:    ret i1 false
+;
+  %b = add nsw i8 %a, 100
+  %c = icmp slt i8 %b, -29
+  ret i1 %c
+}
+
+; Test the edges - instcombine should not interfere with simplification to constants.
+; Constant subtraction overflows. This is true.
+
+define i1 @nsw_slt5_ov(i8 %a) {
+; CHECK-LABEL: @nsw_slt5_ov(
+; CHECK-NEXT:    ret i1 true
+;
+  %b = add nsw i8 %a, -100
+  %c = icmp slt i8 %b, 28
+  ret i1 %c
+}
+
+; InstCombine should not thwart this opportunity to simplify completely.
+
+define i1 @slt_zero_add_nsw_signbit(i8 %x) {
+; CHECK-LABEL: @slt_zero_add_nsw_signbit(
+; CHECK-NEXT:    ret i1 true
+;
+  %y = add nsw i8 %x, -128
+  %z = icmp slt i8 %y, 0
+  ret i1 %z
+}
+
+; InstCombine should not thwart this opportunity to simplify completely.
+
+define i1 @slt_zero_add_nuw_signbit(i8 %x) {
+; CHECK-LABEL: @slt_zero_add_nuw_signbit(
+; CHECK-NEXT:    ret i1 true
+;
+  %y = add nuw i8 %x, 128
+  %z = icmp slt i8 %y, 0
+  ret i1 %z
+}
+
diff --git a/test/Transforms/InstCombine/icmp.ll b/test/Transforms/InstCombine/icmp.ll
index e3f00f6cb875ca736b6c47fe44354f5c3a2482bd..b1dc395b3d95da6c11e694418e8308a05e4b8308 100644
--- a/test/Transforms/InstCombine/icmp.ll
+++ b/test/Transforms/InstCombine/icmp.ll
@@ -918,7 +918,7 @@ define i1 @test60_as1(i8 addrspace(1)* %foo, i64 %i, i64 %j) {
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 %i to i16
 ; CHECK-NEXT:    [[TMP2:%.*]] = trunc i64 %j to i16
 ; CHECK-NEXT:    [[GEP1_IDX:%.*]] = shl nuw i16 [[TMP1]], 2
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp sgt i16 [[TMP2]], [[GEP1_IDX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp slt i16 [[GEP1_IDX]], [[TMP2]]
 ; CHECK-NEXT:    ret i1 [[TMP3]]
 ;
   %bit = bitcast i8 addrspace(1)* %foo to i32 addrspace(1)*
@@ -949,7 +949,7 @@ define i1 @test60_addrspacecast_smaller(i8* %foo, i16 %i, i64 %j) {
 ; CHECK-LABEL: @test60_addrspacecast_smaller(
 ; CHECK-NEXT:    [[GEP1_IDX:%.*]] = shl nuw i16 %i, 2
 ; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 %j to i16
-; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt i16 [[TMP1]], [[GEP1_IDX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp slt i16 [[GEP1_IDX]], [[TMP1]]
 ; CHECK-NEXT:    ret i1 [[TMP2]]
 ;
   %bit = addrspacecast i8* %foo to i32 addrspace(1)*
@@ -981,7 +981,7 @@ define i1 @test61(i8* %foo, i64 %i, i64 %j) {
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, i32* [[BIT]], i64 %i
 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, i8* %foo, i64 %j
 ; CHECK-NEXT:    [[CAST1:%.*]] = bitcast i32* [[GEP1]] to i8*
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8* [[CAST1]], [[GEP2]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i8* [[GEP2]], [[CAST1]]
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %bit = bitcast i8* %foo to i32*
@@ -999,7 +999,7 @@ define i1 @test61_as1(i8 addrspace(1)* %foo, i16 %i, i16 %j) {
 ; CHECK-NEXT:    [[GEP1:%.*]] = getelementptr i32, i32 addrspace(1)* [[BIT]], i16 %i
 ; CHECK-NEXT:    [[GEP2:%.*]] = getelementptr i8, i8 addrspace(1)* %foo, i16 %j
 ; CHECK-NEXT:    [[CAST1:%.*]] = bitcast i32 addrspace(1)* [[GEP1]] to i8 addrspace(1)*
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i8 addrspace(1)* [[CAST1]], [[GEP2]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i8 addrspace(1)* [[GEP2]], [[CAST1]]
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %bit = bitcast i8 addrspace(1)* %foo to i32 addrspace(1)*
@@ -1876,6 +1876,55 @@ define <2 x i1> @icmp_and_X_-16_ne-16_vec(<2 x i32> %X) {
   ret <2 x i1> %cmp
 }
 
+; PR32524: https://bugs.llvm.org/show_bug.cgi?id=32524
+; X | C == C --> X <=u C (when C+1 is PowerOf2).
+
+define i1 @or1_eq1(i32 %x) {
+; CHECK-LABEL: @or1_eq1(
+; CHECK-NEXT:    [[T1:%.*]] = icmp ult i32 %x, 2
+; CHECK-NEXT:    ret i1 [[T1]]
+;
+  %t0 = or i32 %x, 1
+  %t1 = icmp eq i32 %t0, 1
+  ret i1 %t1
+}
+
+; X | C == C --> X <=u C (when C+1 is PowerOf2).
+
+define <2 x i1> @or3_eq3_vec(<2 x i8> %x) {
+; CHECK-LABEL: @or3_eq3_vec(
+; CHECK-NEXT:    [[T1:%.*]] = icmp ult <2 x i8> %x, <i8 4, i8 4>
+; CHECK-NEXT:    ret <2 x i1> [[T1]]
+;
+  %t0 = or <2 x i8> %x, <i8 3, i8 3>
+  %t1 = icmp eq <2 x i8> %t0, <i8 3, i8 3>
+  ret <2 x i1> %t1
+}
+
+; X | C != C --> X >u C (when C+1 is PowerOf2).
+
+define i1 @or7_ne7(i32 %x) {
+; CHECK-LABEL: @or7_ne7(
+; CHECK-NEXT:    [[T1:%.*]] = icmp ugt i32 %x, 7
+; CHECK-NEXT:    ret i1 [[T1]]
+;
+  %t0 = or i32 %x, 7
+  %t1 = icmp ne i32 %t0, 7
+  ret i1 %t1
+}
+
+; X | C != C --> X >u C (when C+1 is PowerOf2).
+
+define <2 x i1> @or63_ne63_vec(<2 x i8> %x) {
+; CHECK-LABEL: @or63_ne63_vec(
+; CHECK-NEXT:    [[T1:%.*]] = icmp ugt <2 x i8> %x, <i8 63, i8 63>
+; CHECK-NEXT:    ret <2 x i1> [[T1]]
+;
+  %t0 = or <2 x i8> %x, <i8 63, i8 63>
+  %t1 = icmp ne <2 x i8> %t0, <i8 63, i8 63>
+  ret <2 x i1> %t1
+}
+
 define i1 @shrink_constant(i32 %X) {
 ; CHECK-LABEL: @shrink_constant(
 ; CHECK-NEXT:    [[XOR:%.*]] = xor i32 %X, -12
@@ -2231,16 +2280,6 @@ define i1 @icmp_sge_zero_add_nsw(i32 %a) {
   ret i1 %cmp
 }
 
-define i1 @icmp_slt_zero_add_nsw(i32 %a) {
-; CHECK-LABEL: @icmp_slt_zero_add_nsw(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 %a, -1
-; CHECK-NEXT:    ret i1 [[CMP]]
-;
-  %add = add nsw i32 %a, 1
-  %cmp = icmp slt i32 %add, 0
-  ret i1 %cmp
-}
-
 define i1 @icmp_sle_zero_add_nsw(i32 %a) {
 ; CHECK-LABEL: @icmp_sle_zero_add_nsw(
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 %a, 0
@@ -2807,7 +2846,7 @@ define i1 @cmp_sle_rhs_inc(float %x, i32 %y) {
 ; CHECK-LABEL: @cmp_sle_rhs_inc(
 ; CHECK-NEXT:    [[CONV:%.*]] = fptosi float %x to i32
 ; CHECK-NEXT:    [[INC:%.*]] = add
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sle i32 [[CONV]], [[INC]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[INC]], [[CONV]]
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %conv = fptosi float %x to i32
@@ -2820,7 +2859,7 @@ define i1 @cmp_ule_rhs_inc(float %x, i32 %y) {
 ; CHECK-LABEL: @cmp_ule_rhs_inc(
 ; CHECK-NEXT:    [[CONV:%.*]] = fptosi float %x to i32
 ; CHECK-NEXT:    [[INC:%.*]] = add
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ule i32 [[CONV]], [[INC]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp uge i32 [[INC]], [[CONV]]
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %conv = fptosi float %x to i32
@@ -2833,7 +2872,7 @@ define i1 @cmp_slt_rhs_dec(float %x, i32 %y) {
 ; CHECK-LABEL: @cmp_slt_rhs_dec(
 ; CHECK-NEXT:    [[CONV:%.*]] = fptosi float %x to i32
 ; CHECK-NEXT:    [[DEC:%.*]] = {{add|sub}}
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[CONV]], [[DEC]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[DEC]], [[CONV]]
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %conv = fptosi float %x to i32
@@ -2846,7 +2885,7 @@ define i1 @cmp_ult_rhs_dec(float %x, i32 %y) {
 ; CHECK-LABEL: @cmp_ult_rhs_dec(
 ; CHECK-NEXT:    [[CONV:%.*]] = fptosi float %x to i32
 ; CHECK-NEXT:    [[DEC:%.*]] = {{add|sub}}
-; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 [[CONV]], [[DEC]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[DEC]], [[CONV]]
 ; CHECK-NEXT:    ret i1 [[CMP]]
 ;
   %conv = fptosi float %x to i32
diff --git a/test/Transforms/InstCombine/indexed-gep-compares.ll b/test/Transforms/InstCombine/indexed-gep-compares.ll
index 64dff271297630fc8bc47c50298ff253d42107b7..71afed438d10f6101c52c3a08accff394d16a55e 100644
--- a/test/Transforms/InstCombine/indexed-gep-compares.ll
+++ b/test/Transforms/InstCombine/indexed-gep-compares.ll
@@ -188,3 +188,20 @@ bb10:
 
 
 declare i32 @__gxx_personality_v0(...)
+
+define i1 @test8(i64* %in, i64 %offset) {
+entry:
+
+ %ld = load i64, i64* %in, align 8
+ %casti8 = inttoptr i64 %ld to i8*
+ %gepi8 = getelementptr inbounds i8, i8* %casti8, i64 %offset
+ %cast = bitcast i8* %gepi8 to i32**
+ %ptrcast = inttoptr i64 %ld to i32**
+ %gepi32 = getelementptr inbounds i32*, i32** %ptrcast, i64 1
+ %cmp = icmp eq i32** %gepi32, %cast
+ ret i1 %cmp
+
+
+; CHECK-LABEL: @test8(
+; CHECK-NOT: icmp eq i32 %{{[0-9A-Za-z.]+}}, 1
+}
diff --git a/test/Transforms/InstCombine/insert-extract-shuffle.ll b/test/Transforms/InstCombine/insert-extract-shuffle.ll
index 4507deb7f023a78873df235e5e46633aaa153990..29f774c5f62b5b191632481a796d5d9267769d8a 100644
--- a/test/Transforms/InstCombine/insert-extract-shuffle.ll
+++ b/test/Transforms/InstCombine/insert-extract-shuffle.ll
@@ -86,11 +86,8 @@ define <8 x float> @widen_extract4(<8 x float> %ins, <2 x float> %ext) {
 
 define <8 x i16> @pr26015(<4 x i16> %t0) {
 ; CHECK-LABEL: @pr26015(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> %t0, <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x i16> %t0, i32 2
-; CHECK-NEXT:    [[T2:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 undef, i16 undef>, i16 [[TMP2]], i32 3
-; CHECK-NEXT:    [[T3:%.*]] = insertelement <8 x i16> [[T2]], i16 0, i32 6
-; CHECK-NEXT:    [[T5:%.*]] = shufflevector <8 x i16> [[T3]], <8 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> %t0, <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[T5:%.*]] = shufflevector <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 0, i16 undef>, <8 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 10, i32 4, i32 5, i32 6, i32 11>
 ; CHECK-NEXT:    ret <8 x i16> [[T5]]
 ;
   %t1 = extractelement <4 x i16> %t0, i32 2
@@ -110,8 +107,7 @@ define <8 x i16> @pr25999(<4 x i16> %t0, i1 %b) {
 ; CHECK-NEXT:    br i1 %b, label %if, label %end
 ; CHECK:       if:
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i16> %t0, <4 x i16> undef, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[T2:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 undef, i16 undef>, i16 [[T1]], i32 3
-; CHECK-NEXT:    [[T3:%.*]] = insertelement <8 x i16> [[T2]], i16 0, i32 6
+; CHECK-NEXT:    [[T3:%.*]] = insertelement <8 x i16> <i16 0, i16 0, i16 0, i16 undef, i16 0, i16 0, i16 0, i16 undef>, i16 [[T1]], i32 3
 ; CHECK-NEXT:    [[T5:%.*]] = shufflevector <8 x i16> [[T3]], <8 x i16> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>
 ; CHECK-NEXT:    ret <8 x i16> [[T5]]
 ; CHECK:       end:
diff --git a/test/Transforms/InstCombine/lifetime-asan.ll b/test/Transforms/InstCombine/lifetime-asan.ll
index f52c0202b7738ac890f174314b5257f38cf30be8..7fdc1fcbc3b304add02b5abaaa70ef4b2c23e16a 100644
--- a/test/Transforms/InstCombine/lifetime-asan.ll
+++ b/test/Transforms/InstCombine/lifetime-asan.ll
@@ -1,7 +1,7 @@
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
-declare void @llvm.lifetime.start(i64, i8* nocapture)
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 declare void @foo(i8* nocapture)
 
 define void @asan() sanitize_address {
@@ -9,8 +9,8 @@ entry:
   ; CHECK-LABEL: @asan(
   %text = alloca i8, align 1
 
-  call void @llvm.lifetime.start(i64 1, i8* %text)
-  call void @llvm.lifetime.end(i64 1, i8* %text)
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %text)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %text)
   ; CHECK: call void @llvm.lifetime.start
   ; CHECK-NEXT: call void @llvm.lifetime.end
 
@@ -25,8 +25,8 @@ entry:
   ; CHECK-LABEL: @no_asan(
   %text = alloca i8, align 1
 
-  call void @llvm.lifetime.start(i64 1, i8* %text)
-  call void @llvm.lifetime.end(i64 1, i8* %text)
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %text)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %text)
   ; CHECK-NO: call void @llvm.lifetime
 
   call void @foo(i8* %text) ; Keep alloca alive
diff --git a/test/Transforms/InstCombine/lifetime.ll b/test/Transforms/InstCombine/lifetime.ll
index c296d29b99b97f96f99684cb4b2e94bb35322f48..71c676233b088afa920d93e52be7a95e366d7482 100644
--- a/test/Transforms/InstCombine/lifetime.ll
+++ b/test/Transforms/InstCombine/lifetime.ll
@@ -1,8 +1,8 @@
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
 declare void @llvm.dbg.declare(metadata, metadata, metadata)
-declare void @llvm.lifetime.start(i64, i8* nocapture)
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 declare void @foo(i8* nocapture, i8* nocapture)
 
 define void @bar(i1 %flag) !dbg !4 {
@@ -17,11 +17,11 @@ entry:
 ; CHECK: bb3:
 ; CHECK-NEXT: call void @llvm.dbg.declare
 ; CHECK-NEXT: br label %fin
-; CHECK: call void @llvm.lifetime.start(i64 1, i8* %[[T]])
-; CHECK-NEXT: call void @llvm.lifetime.start(i64 1, i8* %[[B]])
+; CHECK: call void @llvm.lifetime.start.p0i8(i64 1, i8* %[[T]])
+; CHECK-NEXT: call void @llvm.lifetime.start.p0i8(i64 1, i8* %[[B]])
 ; CHECK-NEXT: call void @foo(i8* %[[B]], i8* %[[T]])
-; CHECK-NEXT: call void @llvm.lifetime.end(i64 1, i8* %[[B]])
-; CHECK-NEXT: call void @llvm.lifetime.end(i64 1, i8* %[[T]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 1, i8* %[[B]])
+; CHECK-NEXT: call void @llvm.lifetime.end.p0i8(i64 1, i8* %[[T]])
   %text = alloca [1 x i8], align 1
   %buff = alloca [1 x i8], align 1
   %0 = getelementptr inbounds [1 x i8], [1 x i8]* %text, i64 0, i64 0
@@ -29,31 +29,31 @@ entry:
   br i1 %flag, label %if, label %else
 
 if:
-  call void @llvm.lifetime.start(i64 1, i8* %0)
-  call void @llvm.lifetime.start(i64 1, i8* %1)
-  call void @llvm.lifetime.end(i64 1, i8* %1)
-  call void @llvm.lifetime.end(i64 1, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %0)
   br label %bb2
 
 bb2:
-  call void @llvm.lifetime.start(i64 1, i8* %0)
-  call void @llvm.lifetime.start(i64 1, i8* %1)
-  call void @llvm.lifetime.end(i64 1, i8* %0)
-  call void @llvm.lifetime.end(i64 1, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %1)
   br label %bb3
 
 bb3:
-  call void @llvm.lifetime.start(i64 1, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %0)
   call void @llvm.dbg.declare(metadata [1 x i8]* %text, metadata !14, metadata !25), !dbg !26
-  call void @llvm.lifetime.end(i64 1, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %0)
   br label %fin
 
 else:
-  call void @llvm.lifetime.start(i64 1, i8* %0)
-  call void @llvm.lifetime.start(i64 1, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* %1)
   call void @foo(i8* %1, i8* %0)
-  call void @llvm.lifetime.end(i64 1, i8* %1)
-  call void @llvm.lifetime.end(i64 1, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* %0)
   br  label %fin
 
 fin:
diff --git a/test/Transforms/InstCombine/load-cmp.ll b/test/Transforms/InstCombine/load-cmp.ll
index 75952e01c19c8afcc89d7c13997a3b546955a767..5746b7aa28d543ac9029550d7ac6dffb94f8e0f4 100644
--- a/test/Transforms/InstCombine/load-cmp.ll
+++ b/test/Transforms/InstCombine/load-cmp.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -instcombine -S -default-data-layout="p:32:32:32-p1:16:16:16-n8:16:32:64" < %s | FileCheck %s
+; RUN: opt -instcombine -S -data-layout="p:32:32:32-p1:16:16:16-n8:16:32:64" < %s | FileCheck %s
 
 @G16 = internal constant [10 x i16] [i16 35, i16 82, i16 69, i16 81, i16 85,
                                      i16 73, i16 82, i16 69, i16 68, i16 0]
diff --git a/test/Transforms/InstCombine/load.ll b/test/Transforms/InstCombine/load.ll
index cad2899ea35d675b2ea03874577c8ed09fa5ca18..49ed897fd2ead1af662d352e5ae7cb7b4d0798fa 100644
--- a/test/Transforms/InstCombine/load.ll
+++ b/test/Transforms/InstCombine/load.ll
@@ -219,3 +219,22 @@ entry:
   store %swift.error* %err.res, %swift.error** %err, align 8
   ret void
 }
+
+; Make sure we preserve the type of the store to a swifterror pointer.
+; CHECK-LABEL: @test19(
+; CHECK: [[A:%.*]] = alloca
+; CHECK: call
+; CHECK: [[BC:%.*]] = bitcast i8** [[A]] to
+; CHECK: [[ERRVAL:%.*]] =  load {{.*}}[[BC]]
+; CHECK: store {{.*}}[[ERRVAL]]
+; CHECK: ret
+declare void @initi8(i8**)
+define void @test19(%swift.error** swifterror %err) {
+entry:
+  %tmp = alloca i8*, align 8
+  call void @initi8(i8** %tmp)
+  %swifterror = bitcast i8** %tmp to %swift.error**
+  %err.res = load %swift.error*, %swift.error** %swifterror, align 8
+  store %swift.error* %err.res, %swift.error** %err, align 8
+  ret void
+}
diff --git a/test/Transforms/InstCombine/malloc-free-delete.ll b/test/Transforms/InstCombine/malloc-free-delete.ll
index 8fcb8214360d42cd3cdd0813db6571b52a4cdc82..7a5c7457e3649fe201d59a825199e0094af6ab40 100644
--- a/test/Transforms/InstCombine/malloc-free-delete.ll
+++ b/test/Transforms/InstCombine/malloc-free-delete.ll
@@ -24,8 +24,8 @@ define i1 @foo() {
   ret i1 %z
 }
 
-declare void @llvm.lifetime.start(i64, i8*)
-declare void @llvm.lifetime.end(i64, i8*)
+declare void @llvm.lifetime.start.p0i8(i64, i8*)
+declare void @llvm.lifetime.end.p0i8(i64, i8*)
 declare i64 @llvm.objectsize.i64(i8*, i1)
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
 declare void @llvm.memmove.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
@@ -35,8 +35,8 @@ define void @test3(i8* %src) {
 ; CHECK-LABEL: @test3(
 ; CHECK-NEXT: ret void
   %a = call noalias i8* @malloc(i32 10)
-  call void @llvm.lifetime.start(i64 10, i8* %a)
-  call void @llvm.lifetime.end(i64 10, i8* %a)
+  call void @llvm.lifetime.start.p0i8(i64 10, i8* %a)
+  call void @llvm.lifetime.end.p0i8(i64 10, i8* %a)
   %size = call i64 @llvm.objectsize.i64(i8* %a, i1 true)
   store i8 42, i8* %a
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %src, i32 32, i32 1, i1 false)
diff --git a/test/Transforms/InstCombine/max-of-nots.ll b/test/Transforms/InstCombine/max-of-nots.ll
index 96fac522897076a35ec316d8b57a86886c185fbb..519f1c6a90b0440929d62eb122ae1b092dcd3dbf 100644
--- a/test/Transforms/InstCombine/max-of-nots.ll
+++ b/test/Transforms/InstCombine/max-of-nots.ll
@@ -90,6 +90,28 @@ define i32 @max_of_nots(i32 %x, i32 %y) {
   ret i32 %smax96
 }
 
+; negative test case (i.e. cannot simplify): ABS(MIN(NOT x,y))
+define i32 @abs_of_min_of_not(i32 %x, i32 %y) {
+; CHECK-LABEL: @abs_of_min_of_not(
+; CHECK-NEXT: xor
+; CHECK-NEXT: add
+; CHECK-NEXT: icmp sge
+; CHECK-NEXT: select
+; CHECK-NEXT: icmp sgt
+; CHECK-NEXT: sub
+; CHECK-NEXT: select
+; CHECK-NEXT: ret
+
+  %xord = xor i32 %x, -1
+  %yadd = add i32 %y, 2
+  %cond.i = icmp sge i32 %yadd, %xord
+  %min = select i1 %cond.i, i32 %xord, i32 %yadd
+  %cmp2 = icmp sgt i32 %min, -1
+  %sub = sub i32 0, %min
+  %abs = select i1 %cmp2, i32 %min, i32 %sub
+  ret i32  %abs
+}
+
 define <2 x i32> @max_of_nots_vec(<2 x i32> %x, <2 x i32> %y) {
 ; CHECK-LABEL: @max_of_nots_vec(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp sgt <2 x i32> %y, zeroinitializer
diff --git a/test/Transforms/InstCombine/memcmp-1.ll b/test/Transforms/InstCombine/memcmp-1.ll
index f9ff479e3addf95f387a5659591d522d815d2b0b..96516f44e081583bd1bc78bf192f6f899fa0f455 100644
--- a/test/Transforms/InstCombine/memcmp-1.ll
+++ b/test/Transforms/InstCombine/memcmp-1.ll
@@ -14,67 +14,76 @@ declare i32 @memcmp(i8*, i8*, i32)
 
 define i32 @test_simplify1(i8* %mem, i32 %size) {
 ; CHECK-LABEL: @test_simplify1(
+; CHECK-NEXT:    ret i32 0
+;
   %ret = call i32 @memcmp(i8* %mem, i8* %mem, i32 %size)
   ret i32 %ret
-; CHECK: ret i32 0
 }
 
 ; Check memcmp(mem1, mem2, 0) -> 0.
 
 define i32 @test_simplify2(i8* %mem1, i8* %mem2) {
 ; CHECK-LABEL: @test_simplify2(
+; CHECK-NEXT:    ret i32 0
+;
   %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 0)
   ret i32 %ret
-; CHECK: ret i32 0
 }
 
 ;; Check memcmp(mem1, mem2, 1) -> *(unsigned char*)mem1 - *(unsigned char*)mem2.
 
 define i32 @test_simplify3(i8* %mem1, i8* %mem2) {
 ; CHECK-LABEL: @test_simplify3(
+; CHECK-NEXT:    [[LHSC:%.*]] = load i8, i8* %mem1, align 1
+; CHECK-NEXT:    [[LHSV:%.*]] = zext i8 [[LHSC]] to i32
+; CHECK-NEXT:    [[RHSC:%.*]] = load i8, i8* %mem2, align 1
+; CHECK-NEXT:    [[RHSV:%.*]] = zext i8 [[RHSC]] to i32
+; CHECK-NEXT:    [[CHARDIFF:%.*]] = sub nsw i32 [[LHSV]], [[RHSV]]
+; CHECK-NEXT:    ret i32 [[CHARDIFF]]
+;
   %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 1)
-; CHECK: [[LOAD1:%[a-z]+]] = load i8, i8* %mem1, align 1
-; CHECK: [[ZEXT1:%[a-z]+]] = zext i8 [[LOAD1]] to i32
-; CHECK: [[LOAD2:%[a-z]+]] = load i8, i8* %mem2, align 1
-; CHECK: [[ZEXT2:%[a-z]+]] = zext i8 [[LOAD2]] to i32
-; CHECK: [[RET:%[a-z]+]] = sub nsw i32 [[ZEXT1]], [[ZEXT2]]
   ret i32 %ret
-; CHECK: ret i32 [[RET]]
 }
 
 ; Check memcmp(mem1, mem2, size) -> cnst, where all arguments are constants.
 
 define i32 @test_simplify4() {
 ; CHECK-LABEL: @test_simplify4(
+; CHECK-NEXT:    ret i32 0
+;
   %mem1 = getelementptr [4 x i8], [4 x i8]* @hel, i32 0, i32 0
   %mem2 = getelementptr [8 x i8], [8 x i8]* @hello_u, i32 0, i32 0
   %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 3)
   ret i32 %ret
-; CHECK: ret i32 0
 }
 
 define i32 @test_simplify5() {
 ; CHECK-LABEL: @test_simplify5(
+; CHECK-NEXT:    ret i32 1
+;
   %mem1 = getelementptr [4 x i8], [4 x i8]* @hel, i32 0, i32 0
   %mem2 = getelementptr [4 x i8], [4 x i8]* @foo, i32 0, i32 0
   %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 3)
   ret i32 %ret
-; CHECK: ret i32 1
 }
 
 define i32 @test_simplify6() {
 ; CHECK-LABEL: @test_simplify6(
+; CHECK-NEXT:    ret i32 -1
+;
   %mem1 = getelementptr [4 x i8], [4 x i8]* @foo, i32 0, i32 0
   %mem2 = getelementptr [4 x i8], [4 x i8]* @hel, i32 0, i32 0
   %ret = call i32 @memcmp(i8* %mem1, i8* %mem2, i32 3)
   ret i32 %ret
-; CHECK: ret i32 -1
 }
 
 ; Check memcmp(mem1, mem2, 8)==0 -> *(int64_t*)mem1 == *(int64_t*)mem2
 
 define i1 @test_simplify7(i64 %x, i64 %y) {
 ; CHECK-LABEL: @test_simplify7(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 %x, %y
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
   %x.addr = alloca i64, align 8
   %y.addr = alloca i64, align 8
   store i64 %x, i64* %x.addr, align 8
@@ -84,14 +93,15 @@ define i1 @test_simplify7(i64 %x, i64 %y) {
   %call = call i32 @memcmp(i8* %xptr, i8* %yptr, i32 8)
   %cmp = icmp eq i32 %call, 0
   ret i1 %cmp
-; CHECK: %cmp = icmp eq i64 %x, %y
-; CHECK: ret i1 %cmp
 }
 
 ; Check memcmp(mem1, mem2, 4)==0 -> *(int32_t*)mem1 == *(int32_t*)mem2
 
 define i1 @test_simplify8(i32 %x, i32 %y) {
 ; CHECK-LABEL: @test_simplify8(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 %x, %y
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
   %x.addr = alloca i32, align 4
   %y.addr = alloca i32, align 4
   store i32 %x, i32* %x.addr, align 4
@@ -101,14 +111,15 @@ define i1 @test_simplify8(i32 %x, i32 %y) {
   %call = call i32 @memcmp(i8* %xptr, i8* %yptr, i32 4)
   %cmp = icmp eq i32 %call, 0
   ret i1 %cmp
-; CHECK: %cmp = icmp eq i32 %x, %y
-; CHECK: ret i1 %cmp
 }
 
 ; Check memcmp(mem1, mem2, 2)==0 -> *(int16_t*)mem1 == *(int16_t*)mem2
 
 define i1 @test_simplify9(i16 %x, i16 %y) {
 ; CHECK-LABEL: @test_simplify9(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i16 %x, %y
+; CHECK-NEXT:    ret i1 [[CMP]]
+;
   %x.addr = alloca i16, align 2
   %y.addr = alloca i16, align 2
   store i16 %x, i16* %x.addr, align 2
@@ -118,6 +129,4 @@ define i1 @test_simplify9(i16 %x, i16 %y) {
   %call = call i32 @memcmp(i8* %xptr, i8* %yptr, i32 2)
   %cmp = icmp eq i32 %call, 0
   ret i1 %cmp
-; CHECK: %cmp = icmp eq i16 %x, %y
-; CHECK: ret i1 %cmp
 }
diff --git a/test/Transforms/InstCombine/memcpy-addrspace.ll b/test/Transforms/InstCombine/memcpy-addrspace.ll
new file mode 100644
index 0000000000000000000000000000000000000000..17bc1d08f98675cc62504b25a7edda339f72ff3f
--- /dev/null
+++ b/test/Transforms/InstCombine/memcpy-addrspace.ll
@@ -0,0 +1,85 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+@test.data = private unnamed_addr addrspace(2) constant [8 x i32] [i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7], align 4
+
+; CHECK-LABEL: test_load
+; CHECK: %[[GEP:.*]] = getelementptr [8 x i32], [8 x i32] addrspace(2)* @test.data, i64 0, i64 %x
+; CHECK: %{{.*}} = load i32, i32 addrspace(2)* %[[GEP]]
+; CHECK-NOT: alloca
+; CHECK-NOT: call void @llvm.memcpy.p0i8.p2i8.i64
+; CHECK-NOT: addrspacecast
+; CHECK-NOT: load i32, i32*
+define void @test_load(i32 addrspace(1)* %out, i64 %x) {
+entry:
+  %data = alloca [8 x i32], align 4
+  %0 = bitcast [8 x i32]* %data to i8*
+  call void @llvm.memcpy.p0i8.p2i8.i64(i8* %0, i8 addrspace(2)* bitcast ([8 x i32] addrspace(2)* @test.data to i8 addrspace(2)*), i64 32, i32 4, i1 false)
+  %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* %data, i64 0, i64 %x
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %x
+  store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+; CHECK-LABEL: test_load_bitcast_chain
+; CHECK: %[[GEP:.*]] = getelementptr [8 x i32], [8 x i32] addrspace(2)* @test.data, i64 0, i64 %x
+; CHECK: %{{.*}} = load i32, i32 addrspace(2)* %[[GEP]]
+; CHECK-NOT: alloca
+; CHECK-NOT: call void @llvm.memcpy.p0i8.p2i8.i64
+; CHECK-NOT: addrspacecast
+; CHECK-NOT: load i32, i32*
+define void @test_load_bitcast_chain(i32 addrspace(1)* %out, i64 %x) {
+entry:
+  %data = alloca [8 x i32], align 4
+  %0 = bitcast [8 x i32]* %data to i8*
+  call void @llvm.memcpy.p0i8.p2i8.i64(i8* %0, i8 addrspace(2)* bitcast ([8 x i32] addrspace(2)* @test.data to i8 addrspace(2)*), i64 32, i32 4, i1 false)
+  %1 = bitcast i8* %0 to i32*
+  %arrayidx = getelementptr inbounds i32, i32* %1, i64 %x
+  %2 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %x
+  store i32 %2, i32 addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+; CHECK-LABEL: test_call
+; CHECK: alloca
+; CHECK: call void @llvm.memcpy.p0i8.p2i8.i64
+; CHECK-NOT: addrspacecast
+; CHECK: call i32 @foo(i32* %{{.*}})
+define void @test_call(i32 addrspace(1)* %out, i64 %x) {
+entry:
+  %data = alloca [8 x i32], align 4
+  %0 = bitcast [8 x i32]* %data to i8*
+  call void @llvm.memcpy.p0i8.p2i8.i64(i8* %0, i8 addrspace(2)* bitcast ([8 x i32] addrspace(2)* @test.data to i8 addrspace(2)*), i64 32, i32 4, i1 false)
+  %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* %data, i64 0, i64 %x
+  %1 = call i32 @foo(i32* %arrayidx)
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %x
+  store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
+  ret void
+}
+
+; CHECK-LABEL: test_load_and_call
+; CHECK: alloca
+; CHECK: call void @llvm.memcpy.p0i8.p2i8.i64
+; CHECK: load i32, i32* %{{.*}}
+; CHECK: call i32 @foo(i32* %{{.*}})
+; CHECK-NOT: addrspacecast
+; CHECK-NOT: load i32, i32 addrspace(2)*
+define void @test_load_and_call(i32 addrspace(1)* %out, i64 %x, i64 %y) {
+entry:
+  %data = alloca [8 x i32], align 4
+  %0 = bitcast [8 x i32]* %data to i8*
+  call void @llvm.memcpy.p0i8.p2i8.i64(i8* %0, i8 addrspace(2)* bitcast ([8 x i32] addrspace(2)* @test.data to i8 addrspace(2)*), i64 32, i32 4, i1 false)
+  %arrayidx = getelementptr inbounds [8 x i32], [8 x i32]* %data, i64 0, i64 %x
+  %1 = load i32, i32* %arrayidx, align 4
+  %arrayidx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %x
+  store i32 %1, i32 addrspace(1)* %arrayidx1, align 4
+  %2 = call i32 @foo(i32* %arrayidx)
+  %arrayidx2 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 %y
+  store i32 %2, i32 addrspace(1)* %arrayidx2, align 4
+  ret void
+}
+
+
+declare void @llvm.memcpy.p0i8.p2i8.i64(i8* nocapture writeonly, i8 addrspace(2)* nocapture readonly, i64, i32, i1)
+declare i32 @foo(i32* %x)
diff --git a/test/Transforms/InstCombine/memcpy-from-global.ll b/test/Transforms/InstCombine/memcpy-from-global.ll
index da38087d7397e7b7583556a8ed65ebeabc32fec9..7c9384d89ba340a0281cfbf1fc5d7aae02143910 100644
--- a/test/Transforms/InstCombine/memcpy-from-global.ll
+++ b/test/Transforms/InstCombine/memcpy-from-global.ll
@@ -126,11 +126,11 @@ define void @test4() {
   ret void
 }
 
-declare void @llvm.lifetime.start(i64, i8*)
+declare void @llvm.lifetime.start.p0i8(i64, i8*)
 define void @test5() {
   %A = alloca %T
   %a = bitcast %T* %A to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %a)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %a)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %a, i8* bitcast (%T* @G to i8*), i64 124, i32 4, i1 false)
   call void @baz(i8* byval %a)
 ; CHECK-LABEL: @test5(
diff --git a/test/Transforms/InstCombine/memcpy-to-load.ll b/test/Transforms/InstCombine/memcpy-to-load.ll
index bcc9e188b965f48c5e466bcc7d431771c296aedf..fe5f0ac657f159e949649ad79a210fe22235b341 100644
--- a/test/Transforms/InstCombine/memcpy-to-load.ll
+++ b/test/Transforms/InstCombine/memcpy-to-load.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -instcombine -S | grep "load double"
+; RUN: opt < %s -instcombine -S | FileCheck %s
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i686-apple-darwin8"
 
@@ -10,4 +10,8 @@ entry:
   ret void
 }
 
+; Make sure that the memcpy has been replaced with a load/store of i64
+; CHECK: [[TMP:%[0-9]+]] = load i64
+; CHECK: store i64 [[TMP]]
+
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
diff --git a/test/Transforms/InstCombine/memset_chk-1.ll b/test/Transforms/InstCombine/memset_chk-1.ll
index 9d08e96cb49be529a7deef89c9773e5be299facc..79028502b641b6a8512c09c3d0bfd98ba7aad98a 100644
--- a/test/Transforms/InstCombine/memset_chk-1.ll
+++ b/test/Transforms/InstCombine/memset_chk-1.ll
@@ -69,7 +69,7 @@ define i32 @test_rauw(i8* %a, i8* %b, i8** %c) {
 entry:
   %call49 = call i64 @strlen(i8* %a)
   %add180 = add i64 %call49, 1
-  %yo107 = call i64 @llvm.objectsize.i64.p0i8(i8* %b, i1 false)
+  %yo107 = call i64 @llvm.objectsize.i64.p0i8(i8* %b, i1 false, i1 false)
   %call50 = call i8* @__memmove_chk(i8* %b, i8* %a, i64 %add180, i64 %yo107)
 ; CHECK: %strlen = call i64 @strlen(i8* %b)
 ; CHECK-NEXT: %strchr2 = getelementptr i8, i8* %b, i64 %strlen
@@ -87,7 +87,7 @@ entry:
 declare i8* @__memmove_chk(i8*, i8*, i64, i64)
 declare i8* @strrchr(i8*, i32)
 declare i64 @strlen(i8* nocapture)
-declare i64 @llvm.objectsize.i64.p0i8(i8*, i1)
+declare i64 @llvm.objectsize.i64.p0i8(i8*, i1, i1)
 
 declare i8* @__memset_chk(i8*, i32, i64, i64)
 
@@ -100,7 +100,7 @@ entry:
   br i1 %cmp, label %cleanup, label %if.end
 if.end:
   %bc = bitcast i8* %call to float*
-  %call2 = tail call i64 @llvm.objectsize.i64.p0i8(i8* nonnull %call, i1 false)
+  %call2 = tail call i64 @llvm.objectsize.i64.p0i8(i8* nonnull %call, i1 false, i1 false)
   %call3 = tail call i8* @__memset_chk(i8* nonnull %call, i32 0, i64 %size, i64 %call2) #1
   br label %cleanup
 cleanup:
@@ -114,7 +114,7 @@ cleanup:
 ; CHECK-NEXT:    br i1 %cmp, label %cleanup, label %if.end
 ; CHECK:       if.end:
 ; CHECK-NEXT:    %bc = bitcast i8* %call to float*
-; CHECK-NEXT:    %call2 = tail call i64 @llvm.objectsize.i64.p0i8(i8* nonnull %call, i1 false)
+; CHECK-NEXT:    %call2 = tail call i64 @llvm.objectsize.i64.p0i8(i8* nonnull %call, i1 false, i1 false)
 ; CHECK-NEXT:    %call3 = tail call i8* @__memset_chk(i8* nonnull %call, i32 0, i64 %size, i64 %call2)
 ; CHECK-NEXT:    br label %cleanup
 ; CHECK:       cleanup:
diff --git a/test/Transforms/InstCombine/minmax-fold.ll b/test/Transforms/InstCombine/minmax-fold.ll
index a9a824ed2fe79adddcbd375e7be895bba04b541b..19a7341fdc28f3e6d74aa656e356f64bf00dae95 100644
--- a/test/Transforms/InstCombine/minmax-fold.ll
+++ b/test/Transforms/InstCombine/minmax-fold.ll
@@ -415,9 +415,9 @@ define i32 @clamp_unsigned2(i32 %x) {
 define double @PR31751_umin1(i32 %x) {
 ; CHECK-LABEL: @PR31751_umin1(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i32 %x, 2147483647
-; CHECK-NEXT:    [[CONV1:%.*]] = select i1 [[TMP1]], i32 %x, i32 2147483647
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp i32 [[CONV1]] to double
-; CHECK-NEXT:    ret double [[TMP2]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[TMP1]], i32 %x, i32 2147483647
+; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[SEL]] to double
+; CHECK-NEXT:    ret double [[CONV]]
 ;
   %cmp = icmp slt i32 %x, 0
   %sel = select i1 %cmp, i32 2147483647, i32 %x
@@ -456,9 +456,9 @@ define double @PR31751_umin3(i32 %x) {
 define double @PR31751_umax1(i32 %x) {
 ; CHECK-LABEL: @PR31751_umax1(
 ; CHECK-NEXT:    [[TMP1:%.*]] = icmp ugt i32 %x, -2147483648
-; CHECK-NEXT:    [[CONV1:%.*]] = select i1 [[TMP1]], i32 %x, i32 -2147483648
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp i32 [[CONV1]] to double
-; CHECK-NEXT:    ret double [[TMP2]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[TMP1]], i32 %x, i32 -2147483648
+; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[SEL]] to double
+; CHECK-NEXT:    ret double [[CONV]]
 ;
   %cmp = icmp sgt i32 %x, -1
   %sel = select i1 %cmp, i32 2147483648, i32 %x
@@ -492,3 +492,76 @@ define double @PR31751_umax3(i32 %x) {
   ret double %conv
 }
 
+; The icmp/select form a canonical smax, so don't hide that by folding the final bitcast into the select.
+
+define float @bitcast_scalar_smax(float %x, float %y) {
+; CHECK-LABEL: @bitcast_scalar_smax(
+; CHECK-NEXT:    [[BCX:%.*]] = bitcast float %x to i32
+; CHECK-NEXT:    [[BCY:%.*]] = bitcast float %y to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[BCX]], [[BCY]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 [[BCX]], i32 [[BCY]]
+; CHECK-NEXT:    [[BCS:%.*]] = bitcast i32 [[SEL]] to float
+; CHECK-NEXT:    ret float [[BCS]]
+;
+  %bcx = bitcast float %x to i32
+  %bcy = bitcast float %y to i32
+  %cmp = icmp sgt i32 %bcx, %bcy
+  %sel = select i1 %cmp, i32 %bcx, i32 %bcy
+  %bcs = bitcast i32 %sel to float
+  ret float %bcs
+}
+
+; FIXME: Create a canonical umax by bitcasting the select.
+
+define float @bitcast_scalar_umax(float %x, float %y) {
+; CHECK-LABEL: @bitcast_scalar_umax(
+; CHECK-NEXT:    [[BCX:%.*]] = bitcast float %x to i32
+; CHECK-NEXT:    [[BCY:%.*]] = bitcast float %y to i32
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 [[BCX]], [[BCY]]
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], float %x, float %y
+; CHECK-NEXT:    ret float [[SEL]]
+;
+  %bcx = bitcast float %x to i32
+  %bcy = bitcast float %y to i32
+  %cmp = icmp ugt i32 %bcx, %bcy
+  %sel = select i1 %cmp, float %x, float %y
+  ret float %sel
+}
+
+; PR32306 - https://bugs.llvm.org/show_bug.cgi?id=32306
+; The icmp/select form a canonical smin, so don't hide that by folding the final bitcast into the select.
+
+define <8 x float> @bitcast_vector_smin(<8 x float> %x, <8 x float> %y) {
+; CHECK-LABEL: @bitcast_vector_smin(
+; CHECK-NEXT:    [[BCX:%.*]] = bitcast <8 x float> %x to <8 x i32>
+; CHECK-NEXT:    [[BCY:%.*]] = bitcast <8 x float> %y to <8 x i32>
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <8 x i32> [[BCX]], [[BCY]]
+; CHECK-NEXT:    [[SEL:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[BCX]], <8 x i32> [[BCY]]
+; CHECK-NEXT:    [[BCS:%.*]] = bitcast <8 x i32> [[SEL]] to <8 x float>
+; CHECK-NEXT:    ret <8 x float> [[BCS]]
+;
+  %bcx = bitcast <8 x float> %x to <8 x i32>
+  %bcy = bitcast <8 x float> %y to <8 x i32>
+  %cmp = icmp slt <8 x i32> %bcx, %bcy
+  %sel = select <8 x i1> %cmp, <8 x i32> %bcx, <8 x i32> %bcy
+  %bcs = bitcast <8 x i32> %sel to <8 x float>
+  ret <8 x float> %bcs
+}
+
+; FIXME: Create a canonical umin by bitcasting the select.
+
+define <8 x float> @bitcast_vector_umin(<8 x float> %x, <8 x float> %y) {
+; CHECK-LABEL: @bitcast_vector_umin(
+; CHECK-NEXT:    [[BCX:%.*]] = bitcast <8 x float> %x to <8 x i32>
+; CHECK-NEXT:    [[BCY:%.*]] = bitcast <8 x float> %y to <8 x i32>
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt <8 x i32> [[BCX]], [[BCY]]
+; CHECK-NEXT:    [[SEL:%.*]] = select <8 x i1> [[CMP]], <8 x float> %x, <8 x float> %y
+; CHECK-NEXT:    ret <8 x float> [[SEL]]
+;
+  %bcx = bitcast <8 x float> %x to <8 x i32>
+  %bcy = bitcast <8 x float> %y to <8 x i32>
+  %cmp = icmp slt <8 x i32> %bcx, %bcy
+  %sel = select <8 x i1> %cmp, <8 x float> %x, <8 x float> %y
+  ret <8 x float> %sel
+}
+
diff --git a/test/Transforms/InstCombine/narrow-switch.ll b/test/Transforms/InstCombine/narrow-switch.ll
index ccc17f87560ed283a230f28a432f466dd661f8c5..474bd820c8f8e8e7dc539e615413657a701cebdc 100644
--- a/test/Transforms/InstCombine/narrow-switch.ll
+++ b/test/Transforms/InstCombine/narrow-switch.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; Vary legal integer types in data layout.
-; RUN: opt < %s -instcombine -S -default-data-layout=n32    | FileCheck %s --check-prefix=ALL --check-prefix=CHECK32
-; RUN: opt < %s -instcombine -S -default-data-layout=n32:64 | FileCheck %s --check-prefix=ALL --check-prefix=CHECK64
+; RUN: opt < %s -instcombine -S -data-layout=n32    | FileCheck %s --check-prefix=ALL --check-prefix=CHECK32
+; RUN: opt < %s -instcombine -S -data-layout=n32:64 | FileCheck %s --check-prefix=ALL --check-prefix=CHECK64
 
 ; In all cases, the data-layout is irrelevant. We should shrink as much as possible in InstCombine
 ; and allow the backend to expand as much as needed to ensure optimal codegen for any target.
diff --git a/test/Transforms/InstCombine/narrow.ll b/test/Transforms/InstCombine/narrow.ll
index 2ea312f83e990d6da7de72f7cc74e9d4ee385c95..1df400aac9738716d25f8fdf31fcfca51c85aeee 100644
--- a/test/Transforms/InstCombine/narrow.ll
+++ b/test/Transforms/InstCombine/narrow.ll
@@ -212,8 +212,7 @@ endif:
 
 ; FIXME:
 ; Narrowing should work with an 'xor' and is not limited to bool types.
-; FIXME:
-; We should either canonicalize based on complexity or enhance the pattern matching to catch this commuted variant.
+; Test that commuting the xor operands does not inhibit optimization.
 
 define i32 @shrinkLogicAndPhi2(i8 %x, i1 %cond) {
 ; CHECK-LABEL: @shrinkLogicAndPhi2(
@@ -224,7 +223,7 @@ define i32 @shrinkLogicAndPhi2(i8 %x, i1 %cond) {
 ; CHECK:       endif:
 ; CHECK-NEXT:    [[PHI:%.*]] = phi i32 [ 21, [[ENTRY:%.*]] ], [ 33, [[IF]] ]
 ; CHECK-NEXT:    [[ZEXT:%.*]] = zext i8 [[X:%.*]] to i32
-; CHECK-NEXT:    [[LOGIC:%.*]] = xor i32 [[ZEXT]], [[PHI]]
+; CHECK-NEXT:    [[LOGIC:%.*]] = xor i32 [[PHI]], [[ZEXT]]
 ; CHECK-NEXT:    ret i32 [[LOGIC]]
 ;
 entry:
diff --git a/test/Transforms/InstCombine/not-fcmp.ll b/test/Transforms/InstCombine/not-fcmp.ll
deleted file mode 100644
index 9718e0b905fca6a424e7fdc06516ffb763eb5743..0000000000000000000000000000000000000000
--- a/test/Transforms/InstCombine/not-fcmp.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; RUN: opt < %s -instcombine -S | FileCheck %s
-; PR1570
-
-define i1 @f(float %X, float %Y) {
-entry:
-        %tmp3 = fcmp olt float %X, %Y           ; <i1> [#uses=1]
-        %toBoolnot5 = xor i1 %tmp3, true                ; <i1> [#uses=1]
-        ret i1 %toBoolnot5
-; CHECK-LABEL: @f(
-; CHECK-NEXT: entry:
-; CHECK-NEXT: %toBoolnot5 = fcmp uge float %X, %Y
-; CHECK-NEXT: ret i1 %toBoolnot5
-}
diff --git a/test/Transforms/InstCombine/not.ll b/test/Transforms/InstCombine/not.ll
index edb402a125ac10e8f03a9cc847b1f6f9c52d7892..d0c242f65558c647a67f75bd3e9d66a51d75beaa 100644
--- a/test/Transforms/InstCombine/not.ll
+++ b/test/Transforms/InstCombine/not.ll
@@ -1,61 +1,95 @@
-; This test makes sure that these instructions are properly eliminated.
-;
-
 ; RUN: opt < %s -instcombine -S | FileCheck %s
-; CHECK-NOT: xor
 
 define i32 @test1(i32 %A) {
-        %B = xor i32 %A, -1
-        %C = xor i32 %B, -1
-        ret i32 %C
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    ret i32 %A
+;
+  %B = xor i32 %A, -1
+  %C = xor i32 %B, -1
+  ret i32 %C
 }
 
-define i1 @test2(i32 %A, i32 %B) {
-        ; Can change into setge
-        %cond = icmp sle i32 %A, %B
-        %Ret = xor i1 %cond, true
-        ret i1 %Ret
+define i1 @invert_icmp(i32 %A, i32 %B) {
+; CHECK-LABEL: @invert_icmp(
+; CHECK-NEXT:    [[NOT:%.*]] = icmp sgt i32 %A, %B
+; CHECK-NEXT:    ret i1 [[NOT]]
+;
+  %cmp = icmp sle i32 %A, %B
+  %not = xor i1 %cmp, true
+  ret i1 %not
+}
+
+; PR1570
+
+define i1 @invert_fcmp(float %X, float %Y) {
+; CHECK-LABEL: @invert_fcmp(
+; CHECK-NEXT:    [[NOT:%.*]] = fcmp uge float %X, %Y
+; CHECK-NEXT:    ret i1 [[NOT]]
+;
+  %cmp = fcmp olt float %X, %Y
+  %not = xor i1 %cmp, true
+  ret i1 %not
 }
 
 ; Test that De Morgan's law can be instcombined.
 define i32 @test3(i32 %A, i32 %B) {
-        %a = xor i32 %A, -1
-        %b = xor i32 %B, -1
-        %c = and i32 %a, %b
-        %d = xor i32 %c, -1
-        ret i32 %d
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[C_DEMORGAN:%.*]] = or i32 %A, %B
+; CHECK-NEXT:    ret i32 [[C_DEMORGAN]]
+;
+  %a = xor i32 %A, -1
+  %b = xor i32 %B, -1
+  %c = and i32 %a, %b
+  %d = xor i32 %c, -1
+  ret i32 %d
 }
 
 ; Test that De Morgan's law can work with constants.
 define i32 @test4(i32 %A, i32 %B) {
-        %a = xor i32 %A, -1
-        %c = and i32 %a, 5
-        %d = xor i32 %c, -1
-        ret i32 %d
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[D1:%.*]] = or i32 %A, -6
+; CHECK-NEXT:    ret i32 [[D1]]
+;
+  %a = xor i32 %A, -1
+  %c = and i32 %a, 5
+  %d = xor i32 %c, -1
+  ret i32 %d
 }
 
 ; Test the mirror of De Morgan's law.
 define i32 @test5(i32 %A, i32 %B) {
-        %a = xor i32 %A, -1
-        %b = xor i32 %B, -1
-        %c = or i32 %a, %b
-        %d = xor i32 %c, -1
-        ret i32 %d
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    [[C_DEMORGAN:%.*]] = and i32 %A, %B
+; CHECK-NEXT:    ret i32 [[C_DEMORGAN]]
+;
+  %a = xor i32 %A, -1
+  %b = xor i32 %B, -1
+  %c = or i32 %a, %b
+  %d = xor i32 %c, -1
+  ret i32 %d
 }
 
 ; PR2298
 define zeroext i8 @test6(i32 %a, i32 %b) {
-entry:
-	%tmp1not = xor i32 %a, -1
-	%tmp2not = xor i32 %b, -1
-	%tmp3 = icmp slt i32 %tmp1not, %tmp2not
-	%retval67 = zext i1 %tmp3 to i8
-	ret i8 %retval67
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp slt i32 %b, %a
+; CHECK-NEXT:    [[RETVAL67:%.*]] = zext i1 [[TMP3]] to i8
+; CHECK-NEXT:    ret i8 [[RETVAL67]]
+;
+  %tmp1not = xor i32 %a, -1
+  %tmp2not = xor i32 %b, -1
+  %tmp3 = icmp slt i32 %tmp1not, %tmp2not
+  %retval67 = zext i1 %tmp3 to i8
+  ret i8 %retval67
 }
 
 define <2 x i1> @test7(<2 x i32> %A, <2 x i32> %B) {
-        %cond = icmp sle <2 x i32> %A, %B
-        %Ret = xor <2 x i1> %cond, <i1 true, i1 true>
-        ret <2 x i1> %Ret
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    [[RET:%.*]] = icmp sgt <2 x i32> %A, %B
+; CHECK-NEXT:    ret <2 x i1> [[RET]]
+;
+  %cond = icmp sle <2 x i32> %A, %B
+  %Ret = xor <2 x i1> %cond, <i1 true, i1 true>
+  ret <2 x i1> %Ret
 }
 
diff --git a/test/Transforms/InstCombine/objsize.ll b/test/Transforms/InstCombine/objsize.ll
index 2af391f907cc300fe1dbc98750b995bce18cb204..5c0a36f5feaa0b9e10ea1f6054dc75257597f8de 100644
--- a/test/Transforms/InstCombine/objsize.ll
+++ b/test/Transforms/InstCombine/objsize.ll
@@ -8,7 +8,7 @@ target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f3
 define i32 @foo() nounwind {
 ; CHECK-LABEL: @foo(
 ; CHECK-NEXT: ret i32 60
-  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false)
+  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false, i1 false)
   ret i32 %1
 }
 
@@ -16,7 +16,7 @@ define i8* @bar() nounwind {
 ; CHECK-LABEL: @bar(
 entry:
   %retval = alloca i8*
-  %0 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false)
+  %0 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false, i1 false)
   %cmp = icmp ne i32 %0, -1
 ; CHECK: br i1 true
   br i1 %cmp, label %cond.true, label %cond.false
@@ -33,7 +33,7 @@ cond.false:
 define i32 @f() nounwind {
 ; CHECK-LABEL: @f(
 ; CHECK-NEXT: ret i32 0
-  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr ([60 x i8], [60 x i8]* @a, i32 1, i32 0), i1 false)
+  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr ([60 x i8], [60 x i8]* @a, i32 1, i32 0), i1 false, i1 false)
   ret i32 %1
 }
 
@@ -42,7 +42,7 @@ define i32 @f() nounwind {
 define i1 @baz() nounwind {
 ; CHECK-LABEL: @baz(
 ; CHECK-NEXT: objectsize
-  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([0 x i8], [0 x i8]* @window, i32 0, i32 0), i1 false)
+  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([0 x i8], [0 x i8]* @window, i32 0, i32 0), i1 false, i1 false)
   %2 = icmp eq i32 %1, -1
   ret i1 %2
 }
@@ -51,7 +51,7 @@ define void @test1(i8* %q, i32 %x) nounwind noinline {
 ; CHECK-LABEL: @test1(
 ; CHECK: objectsize.i32.p0i8
 entry:
-  %0 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([0 x i8], [0 x i8]* @window, i32 0, i32 10), i1 false) ; <i64> [#uses=1]
+  %0 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([0 x i8], [0 x i8]* @window, i32 0, i32 10), i1 false, i1 false) ; <i64> [#uses=1]
   %1 = icmp eq i32 %0, -1                         ; <i1> [#uses=1]
   br i1 %1, label %"47", label %"46"
 
@@ -67,7 +67,7 @@ entry:
 define i32 @test2() nounwind {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT: ret i32 34
-  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr (i8, i8* bitcast ([9 x i32]* @.str5 to i8*), i32 2), i1 false)
+  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr (i8, i8* bitcast ([9 x i32]* @.str5 to i8*), i32 2), i1 false, i1 false)
   ret i32 %1
 }
 
@@ -76,7 +76,9 @@ define i32 @test2() nounwind {
 
 declare i8* @__memcpy_chk(i8*, i8*, i32, i32) nounwind
 
-declare i32 @llvm.objectsize.i32.p0i8(i8*, i1) nounwind readonly
+declare i32 @llvm.objectsize.i32.p0i8(i8*, i1, i1) nounwind readonly
+
+declare i32 @llvm.objectsize.i32.p1i8(i8 addrspace(1)*, i1, i1) nounwind readonly
 
 declare i8* @__inline_memcpy_chk(i8*, i8*, i32) nounwind inlinehint
 
@@ -88,7 +90,7 @@ entry:
 bb11:
   %0 = getelementptr inbounds float, float* getelementptr inbounds ([480 x float], [480 x float]* @array, i32 0, i32 128), i32 -127 ; <float*> [#uses=1]
   %1 = bitcast float* %0 to i8*                   ; <i8*> [#uses=1]
-  %2 = call i32 @llvm.objectsize.i32.p0i8(i8* %1, i1 false) ; <i32> [#uses=1]
+  %2 = call i32 @llvm.objectsize.i32.p0i8(i8* %1, i1 false, i1 false) ; <i32> [#uses=1]
   %3 = call i8* @__memcpy_chk(i8* undef, i8* undef, i32 512, i32 %2) nounwind ; <i8*> [#uses=0]
 ; CHECK: unreachable
   unreachable
@@ -110,7 +112,7 @@ define i32 @test4(i8** %esc) nounwind ssp {
 entry:
   %0 = alloca %struct.data, align 8
   %1 = bitcast %struct.data* %0 to i8*
-  %2 = call i32 @llvm.objectsize.i32.p0i8(i8* %1, i1 false) nounwind
+  %2 = call i32 @llvm.objectsize.i32.p0i8(i8* %1, i1 false, i1 false) nounwind
 ; CHECK-NOT: @llvm.objectsize
 ; CHECK: @llvm.memset.p0i8.i32(i8* %1, i8 0, i32 1824, i32 8, i1 false)
   %3 = call i8* @__memset_chk(i8* %1, i32 0, i32 1824, i32 %2) nounwind
@@ -125,7 +127,7 @@ define i8* @test5(i32 %n) nounwind ssp {
 ; CHECK-LABEL: @test5(
 entry:
   %0 = tail call noalias i8* @malloc(i32 20) nounwind
-  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %0, i1 false)
+  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %0, i1 false, i1 false)
   %2 = load i8*, i8** @s, align 8
 ; CHECK-NOT: @llvm.objectsize
 ; CHECK: @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 10, i32 1, i1 false)
@@ -137,7 +139,7 @@ define void @test6(i32 %n) nounwind ssp {
 ; CHECK-LABEL: @test6(
 entry:
   %0 = tail call noalias i8* @malloc(i32 20) nounwind
-  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %0, i1 false)
+  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %0, i1 false, i1 false)
   %2 = load i8*, i8** @s, align 8
 ; CHECK-NOT: @llvm.objectsize
 ; CHECK: @__memcpy_chk(i8* %0, i8* %1, i32 30, i32 20)
@@ -154,7 +156,7 @@ define i32 @test7(i8** %esc) {
   %alloc = call noalias i8* @malloc(i32 48) nounwind
   store i8* %alloc, i8** %esc
   %gep = getelementptr inbounds i8, i8* %alloc, i32 16
-  %objsize = call i32 @llvm.objectsize.i32.p0i8(i8* %gep, i1 false) nounwind readonly
+  %objsize = call i32 @llvm.objectsize.i32.p0i8(i8* %gep, i1 false, i1 false) nounwind readonly
 ; CHECK: ret i32 32
   ret i32 %objsize
 }
@@ -166,7 +168,7 @@ define i32 @test8(i8** %esc) {
   %alloc = call noalias i8* @calloc(i32 5, i32 7) nounwind
   store i8* %alloc, i8** %esc
   %gep = getelementptr inbounds i8, i8* %alloc, i32 5
-  %objsize = call i32 @llvm.objectsize.i32.p0i8(i8* %gep, i1 false) nounwind readonly
+  %objsize = call i32 @llvm.objectsize.i32.p0i8(i8* %gep, i1 false, i1 false) nounwind readonly
 ; CHECK: ret i32 30
   ret i32 %objsize
 }
@@ -178,7 +180,7 @@ declare noalias i8* @strndup(i8* nocapture, i32) nounwind
 define i32 @test9(i8** %esc) {
   %call = tail call i8* @strdup(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i64 0, i64 0)) nounwind
   store i8* %call, i8** %esc, align 8
-  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true)
+  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true, i1 false)
 ; CHECK: ret i32 8
   ret i32 %1
 }
@@ -187,7 +189,7 @@ define i32 @test9(i8** %esc) {
 define i32 @test10(i8** %esc) {
   %call = tail call i8* @strndup(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i64 0, i64 0), i32 3) nounwind
   store i8* %call, i8** %esc, align 8
-  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true)
+  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true, i1 false)
 ; CHECK: ret i32 4
   ret i32 %1
 }
@@ -196,7 +198,7 @@ define i32 @test10(i8** %esc) {
 define i32 @test11(i8** %esc) {
   %call = tail call i8* @strndup(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i64 0, i64 0), i32 7) nounwind
   store i8* %call, i8** %esc, align 8
-  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true)
+  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true, i1 false)
 ; CHECK: ret i32 8
   ret i32 %1
 }
@@ -205,7 +207,7 @@ define i32 @test11(i8** %esc) {
 define i32 @test12(i8** %esc) {
   %call = tail call i8* @strndup(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i64 0, i64 0), i32 8) nounwind
   store i8* %call, i8** %esc, align 8
-  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true)
+  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true, i1 false)
 ; CHECK: ret i32 8
   ret i32 %1
 }
@@ -214,7 +216,7 @@ define i32 @test12(i8** %esc) {
 define i32 @test13(i8** %esc) {
   %call = tail call i8* @strndup(i8* getelementptr inbounds ([8 x i8], [8 x i8]* @.str, i64 0, i64 0), i32 57) nounwind
   store i8* %call, i8** %esc, align 8
-  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true)
+  %1 = tail call i32 @llvm.objectsize.i32.p0i8(i8* %call, i1 true, i1 false)
 ; CHECK: ret i32 8
   ret i32 %1
 }
@@ -225,7 +227,7 @@ define i32 @test13(i8** %esc) {
 ; CHECK-NEXT: ret i32 60
 define i32 @test18() {
   %bc = bitcast [60 x i8]* @globalalias to i8*
-  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* %bc, i1 false)
+  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* %bc, i1 false, i1 false)
   ret i32 %1
 }
 
@@ -235,7 +237,67 @@ define i32 @test18() {
 ; CHECK: llvm.objectsize
 define i32 @test19() {
   %bc = bitcast [60 x i8]* @globalalias2 to i8*
-  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* %bc, i1 false)
+  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* %bc, i1 false, i1 false)
+  ret i32 %1
+}
+
+; CHECK-LABEL: @test20(
+; CHECK: ret i32 0
+define i32 @test20() {
+  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* null, i1 false, i1 false)
+  ret i32 %1
+}
+
+; CHECK-LABEL: @test21(
+; CHECK: ret i32 0
+define i32 @test21() {
+  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* null, i1 true, i1 false)
+  ret i32 %1
+}
+
+; CHECK-LABEL: @test22(
+; CHECK: llvm.objectsize
+define i32 @test22() {
+  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* null, i1 false, i1 true)
+  ret i32 %1
+}
+
+; CHECK-LABEL: @test23(
+; CHECK: llvm.objectsize
+define i32 @test23() {
+  %1 = call i32 @llvm.objectsize.i32.p0i8(i8* null, i1 true, i1 true)
   ret i32 %1
 }
 
+; 1 is an arbitrary non-zero address space.
+; CHECK-LABEL: @test24(
+; CHECK: ret i32 0
+define i32 @test24() {
+  %1 = call i32 @llvm.objectsize.i32.p1i8(i8 addrspace(1)* null, i1 false,
+                                          i1 false)
+  ret i32 %1
+}
+
+; CHECK-LABEL: @test25(
+; CHECK: ret i32 0
+define i32 @test25() {
+  %1 = call i32 @llvm.objectsize.i32.p1i8(i8 addrspace(1)* null, i1 true,
+                                          i1 false)
+  ret i32 %1
+}
+
+; CHECK-LABEL: @test26(
+; CHECK: ret i32 0
+define i32 @test26() {
+  %1 = call i32 @llvm.objectsize.i32.p1i8(i8 addrspace(1)* null, i1 false,
+                                          i1 true)
+  ret i32 %1
+}
+
+; CHECK-LABEL: @test27(
+; CHECK: ret i32 0
+define i32 @test27() {
+  %1 = call i32 @llvm.objectsize.i32.p1i8(i8 addrspace(1)* null, i1 true,
+                                          i1 true)
+  ret i32 %1
+}
diff --git a/test/Transforms/InstCombine/or.ll b/test/Transforms/InstCombine/or.ll
index 2c9088428bdec55b159b9ab2ee3aeef0e3bf3699..2ac6f5b110472ea763337d7e880a794f6cc283dd 100644
--- a/test/Transforms/InstCombine/or.ll
+++ b/test/Transforms/InstCombine/or.ll
@@ -215,11 +215,25 @@ define i1 @test19(i32 %A) {
 ;
   %B = icmp eq i32 %A, 50
   %C = icmp eq i32 %A, 51
-  ;; (A&-2) == 50
   %D = or i1 %B, %C
   ret i1 %D
 }
 
+; PR32524: https://bugs.llvm.org/show_bug.cgi?id=32524
+
+define i1 @or_icmps_eq_diff1(i32 %x) {
+; CHECK-LABEL: @or_icmps_eq_diff1(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 %x, -1
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 %x, 0
+; CHECK-NEXT:    [[LOGIC:%.*]] = or i1 [[CMP1]], [[CMP2]]
+; CHECK-NEXT:    ret i1 [[LOGIC]]
+;
+  %cmp1 = icmp eq i32 %x, -1
+  %cmp2 = icmp eq i32 %x, 0
+  %logic = or i1 %cmp1, %cmp2
+  ret i1 %logic
+}
+
 define i32 @test20(i32 %x) {
 ; CHECK-LABEL: @test20(
 ; CHECK-NEXT:    ret i32 %x
@@ -490,7 +504,7 @@ define i32 @orsext_to_sel_multi_use(i32 %x, i1 %y) {
 ; CHECK-LABEL: @orsext_to_sel_multi_use(
 ; CHECK-NEXT:    [[SEXT:%.*]] = sext i1 %y to i32
 ; CHECK-NEXT:    [[OR:%.*]] = or i32 [[SEXT]], %x
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[SEXT]], [[OR]]
+; CHECK-NEXT:    [[ADD:%.*]] = add i32 [[OR]], [[SEXT]]
 ; CHECK-NEXT:    ret i32 [[ADD]]
 ;
   %sext = sext i1 %y to i32
@@ -521,7 +535,7 @@ define <2 x i132> @orsext_to_sel_vec_swap(<2 x i132> %x, <2 x i1> %y) {
 
 define i32 @test39(i32 %a, i32 %b) {
 ; CHECK-LABEL: @test39(
-; CHECK-NEXT:    [[OR:%.*]] = or i32 %a, %b
+; CHECK-NEXT:    [[OR:%.*]] = or i32 %b, %a
 ; CHECK-NEXT:    ret i32 [[OR]]
 ;
   %xor = xor i32 %a, -1
@@ -542,6 +556,42 @@ define i32 @test40(i32 %a, i32 %b) {
   ret i32 %or
 }
 
+define i32 @test40b(i32 %a, i32 %b) {
+; CHECK-LABEL: @test40b(
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 %a, -1
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[XOR]], %b
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %and = and i32 %b, %a
+  %xor = xor i32 %a, -1
+  %or = or i32 %and, %xor
+  ret i32 %or
+}
+
+define i32 @test40c(i32 %a, i32 %b) {
+; CHECK-LABEL: @test40c(
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 %a, -1
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[XOR]], %b
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %and = and i32 %b, %a
+  %xor = xor i32 %a, -1
+  %or = or i32 %xor, %and
+  ret i32 %or
+}
+
+define i32 @test40d(i32 %a, i32 %b) {
+; CHECK-LABEL: @test40d(
+; CHECK-NEXT:    [[XOR:%.*]] = xor i32 %a, -1
+; CHECK-NEXT:    [[OR:%.*]] = or i32 [[XOR]], %b
+; CHECK-NEXT:    ret i32 [[OR]]
+;
+  %and = and i32 %a, %b
+  %xor = xor i32 %a, -1
+  %or = or i32 %xor, %and
+  ret i32 %or
+}
+
 define i32 @test41(i32 %a, i32 %b) {
 ; CHECK-LABEL: @test41(
 ; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 %a, -1
@@ -701,3 +751,138 @@ define i1 @test48(i64 %x, i1 %b) {
   %3 = or i1 %1, %.b
   ret i1 %3
 }
+
+define i32 @test49(i1 %C) {
+; CHECK-LABEL: @test49(
+; CHECK-NEXT:    [[V:%.*]] = select i1 [[C:%.*]], i32 1019, i32 123
+; CHECK-NEXT:    ret i32 [[V]]
+;
+  %A = select i1 %C, i32 1000, i32 10
+  %V = or i32 %A, 123
+  ret i32 %V
+}
+
+define <2 x i32> @test49vec(i1 %C) {
+; CHECK-LABEL: @test49vec(
+; CHECK-NEXT:    [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 1019, i32 1019>, <2 x i32> <i32 123, i32 123>
+; CHECK-NEXT:    ret <2 x i32> [[V]]
+;
+  %A = select i1 %C, <2 x i32> <i32 1000, i32 1000>, <2 x i32> <i32 10, i32 10>
+  %V = or <2 x i32> %A, <i32 123, i32 123>
+  ret <2 x i32> %V
+}
+
+define <2 x i32> @test49vec2(i1 %C) {
+; CHECK-LABEL: @test49vec2(
+; CHECK-NEXT:    [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 1019, i32 2509>, <2 x i32> <i32 123, i32 351>
+; CHECK-NEXT:    ret <2 x i32> [[V]]
+;
+  %A = select i1 %C, <2 x i32> <i32 1000, i32 2500>, <2 x i32> <i32 10, i32 30>
+  %V = or <2 x i32> %A, <i32 123, i32 333>
+  ret <2 x i32> %V
+}
+
+define i32 @test50(i1 %which) {
+; CHECK-LABEL: @test50(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]]
+; CHECK:       delay:
+; CHECK-NEXT:    br label [[FINAL]]
+; CHECK:       final:
+; CHECK-NEXT:    [[A:%.*]] = phi i32 [ 1019, [[ENTRY:%.*]] ], [ 123, [[DELAY]] ]
+; CHECK-NEXT:    ret i32 [[A]]
+;
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+  br label %final
+
+final:
+  %A = phi i32 [ 1000, %entry ], [ 10, %delay ]
+  %value = or i32 %A, 123
+  ret i32 %value
+}
+
+define <2 x i32> @test50vec(i1 %which) {
+; CHECK-LABEL: @test50vec(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]]
+; CHECK:       delay:
+; CHECK-NEXT:    br label [[FINAL]]
+; CHECK:       final:
+; CHECK-NEXT:    [[A:%.*]] = phi <2 x i32> [ <i32 1019, i32 1019>, [[ENTRY:%.*]] ], [ <i32 123, i32 123>, [[DELAY]] ]
+; CHECK-NEXT:    ret <2 x i32> [[A]]
+;
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+  br label %final
+
+final:
+  %A = phi <2 x i32> [ <i32 1000, i32 1000>, %entry ], [ <i32 10, i32 10>, %delay ]
+  %value = or <2 x i32> %A, <i32 123, i32 123>
+  ret <2 x i32> %value
+}
+
+define <2 x i32> @test50vec2(i1 %which) {
+; CHECK-LABEL: @test50vec2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]]
+; CHECK:       delay:
+; CHECK-NEXT:    br label [[FINAL]]
+; CHECK:       final:
+; CHECK-NEXT:    [[A:%.*]] = phi <2 x i32> [ <i32 1019, i32 2509>, [[ENTRY:%.*]] ], [ <i32 123, i32 351>, [[DELAY]] ]
+; CHECK-NEXT:    ret <2 x i32> [[A]]
+;
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+  br label %final
+
+final:
+  %A = phi <2 x i32> [ <i32 1000, i32 2500>, %entry ], [ <i32 10, i32 30>, %delay ]
+  %value = or <2 x i32> %A, <i32 123, i32 333>
+  ret <2 x i32> %value
+}
+
+define i8 @test51(i8 %a, i8 %b, i8 %c) {
+; CHECK-LABEL: @test51(
+; CHECK-NEXT:    [[W:%.*]] = mul i8 [[B:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[X:%.*]] = or i8 [[W]], [[A:%.*]]
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %w = mul i8 %b, %c
+  %z = xor i8 %a, -1
+  %y = and i8 %w, %z
+  %x = or i8 %y, %a
+  ret i8 %x
+}
+
+define i8 @test52(i8 %a, i8 %b, i8 %c) {
+; CHECK-LABEL: @test52(
+; CHECK-NEXT:    [[W:%.*]] = mul i8 [[B:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[X:%.*]] = or i8 [[W]], [[A:%.*]]
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %w = mul i8 %b, %c
+  %z = xor i8 %w, -1
+  %y = and i8 %z, %a
+  %x = or i8 %w, %y
+  ret i8 %x
+}
+
+define i8 @test53(i8 %a, i8 %b, i8 %c) {
+; CHECK-LABEL: @test53(
+; CHECK-NEXT:    [[W:%.*]] = mul i8 [[B:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[X:%.*]] = or i8 [[W]], [[A:%.*]]
+; CHECK-NEXT:    ret i8 [[X]]
+;
+  %w = mul i8 %b, %c
+  %z = xor i8 %w, -1
+  %y = and i8 %z, %a
+  %x = or i8 %y, %w ; (~w & a) | w: 'or' operands commuted relative to test52
+  ret i8 %x
+}
diff --git a/test/Transforms/InstCombine/phi-select-constant.ll b/test/Transforms/InstCombine/phi-select-constant.ll
new file mode 100644
index 0000000000000000000000000000000000000000..272594d7f4f9cca0226723590b495f7bc0b0f9dd
--- /dev/null
+++ b/test/Transforms/InstCombine/phi-select-constant.ll
@@ -0,0 +1,57 @@
+; RUN: opt < %s -S -instcombine | FileCheck %s
+@A = extern_weak global i32, align 4
+@B = extern_weak global i32, align 4
+
+define i32 @foo(i1 %which) {
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+  br label %final
+
+; CHECK-LABEL: @foo
+; CHECK-LABEL: final:
+; CHECK: phi i32 [ 1, %entry ], [ select (i1 icmp eq (i32* @A, i32* @B), i32 2, i32 1), %delay ]
+final:
+  %use2 = phi i1 [ false, %entry ], [ icmp eq (i32* @A, i32* @B), %delay ]
+  %value = select i1 %use2, i32 2, i32 1
+  ret i32 %value
+}
+
+
+; test folding of select into phi for vectors.
+define <4 x i64> @vec1(i1 %which) {
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+ br label %final
+
+final:
+; CHECK-LABEL: @vec1
+; CHECK-LABEL: final:
+; CHECK: %phinode = phi <4 x i64> [ zeroinitializer, %entry ], [ <i64 0, i64 0, i64 126, i64 127>, %delay ]
+; CHECK-NOT: select
+; CHECK: ret <4 x i64> %phinode
+ %phinode =  phi <4 x i1> [ <i1 true, i1 true, i1 true, i1 true>, %entry ], [ <i1 true, i1 true, i1 false, i1 false>, %delay ]
+ %sel = select <4 x i1> %phinode, <4 x i64> zeroinitializer, <4 x i64> <i64 124, i64 125, i64 126, i64 127>
+ ret <4 x i64> %sel
+}
+
+define <4 x i64> @vec2(i1 %which) {
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+ br label %final
+
+final:
+; CHECK-LABEL: @vec2
+; CHECK-LABEL: final:
+; CHECK: %phinode = phi <4 x i64> [ <i64 124, i64 125, i64 126, i64 127>, %entry ], [ <i64 0, i64 125, i64 0, i64 127>, %delay ]
+; CHECK-NOT: select
+; CHECK: ret <4 x i64> %phinode
+ %phinode =  phi <4 x i1> [ <i1 false, i1 false, i1 false, i1 false>, %entry ], [ <i1 true, i1 false, i1 true, i1 false>, %delay ]
+ %sel = select <4 x i1> %phinode, <4 x i64> zeroinitializer, <4 x i64> <i64 124, i64 125, i64 126, i64 127>
+ ret <4 x i64> %sel
+}
diff --git a/test/Transforms/InstCombine/phi-select-constexpr.ll b/test/Transforms/InstCombine/phi-select-constexpr.ll
deleted file mode 100644
index 054e0691d47a0d7c77d4807e8ff92c64cfe75984..0000000000000000000000000000000000000000
--- a/test/Transforms/InstCombine/phi-select-constexpr.ll
+++ /dev/null
@@ -1,19 +0,0 @@
-; RUN: opt < %s -S -instcombine | FileCheck %s
-@A = extern_weak global i32, align 4
-@B = extern_weak global i32, align 4
-
-define i32 @foo(i1 %which) {
-entry:
-  br i1 %which, label %final, label %delay
-
-delay:
-  br label %final
-
-; CHECK-LABEL: final:
-; CHECK: phi i32 [ 1, %entry ], [ select (i1 icmp eq (i32* @A, i32* @B), i32 2, i32 1), %delay ]
-final:
-  %use2 = phi i1 [ false, %entry ], [ icmp eq (i32* @A, i32* @B), %delay ]
-  %value = select i1 %use2, i32 2, i32 1
-  ret i32 %value
-}
-
diff --git a/test/Transforms/InstCombine/pr19420.ll b/test/Transforms/InstCombine/pr19420.ll
index 23fa0a4097458d86c2caa5215de0eb5d83b8d86a..015f35eaaa53da8cd0d75b9576483c0d77106e88 100644
--- a/test/Transforms/InstCombine/pr19420.ll
+++ b/test/Transforms/InstCombine/pr19420.ll
@@ -1,36 +1,44 @@
 ; RUN: opt -S -instcombine < %s | FileCheck %s
 
-; CHECK-LABEL: @test_FoldShiftByConstant_CreateSHL
-; CHECK: mul <4 x i32> %in, <i32 0, i32 -32, i32 0, i32 -32>
-; CHECK-NEXT: ret
 define <4 x i32> @test_FoldShiftByConstant_CreateSHL(<4 x i32> %in) {
+; CHECK-LABEL: @test_FoldShiftByConstant_CreateSHL(
+; CHECK-NEXT:    [[VSHL_N:%.*]] = mul <4 x i32> %in, <i32 0, i32 -32, i32 0, i32 -32>
+; CHECK-NEXT:    ret <4 x i32> [[VSHL_N]]
+;
   %mul.i = mul <4 x i32> %in, <i32 0, i32 -1, i32 0, i32 -1>
   %vshl_n = shl <4 x i32> %mul.i, <i32 5, i32 5, i32 5, i32 5>
   ret <4 x i32> %vshl_n
 }
 
-; CHECK-LABEL: @test_FoldShiftByConstant_CreateSHL2
-; CHECK: mul <8 x i16> %in, <i16 0, i16 -32, i16 0, i16 -32, i16 0, i16 -32, i16 0, i16 -32>
-; CHECK-NEXT: ret
 define <8 x i16> @test_FoldShiftByConstant_CreateSHL2(<8 x i16> %in) {
+; CHECK-LABEL: @test_FoldShiftByConstant_CreateSHL2(
+; CHECK-NEXT:    [[VSHL_N:%.*]] = mul <8 x i16> %in, <i16 0, i16 -32, i16 0, i16 -32, i16 0, i16 -32, i16 0, i16 -32>
+; CHECK-NEXT:    ret <8 x i16> [[VSHL_N]]
+;
   %mul.i = mul <8 x i16> %in, <i16 0, i16 -1, i16 0, i16 -1, i16 0, i16 -1, i16 0, i16 -1>
   %vshl_n = shl <8 x i16> %mul.i, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
   ret <8 x i16> %vshl_n
 }
 
-; CHECK-LABEL: @test_FoldShiftByConstant_CreateAnd
-; CHECK: mul <16 x i8> %in0, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
-; CHECK-NEXT: and <16 x i8> %vsra_n2, <i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32>
-; CHECK-NEXT: ret
 define <16 x i8> @test_FoldShiftByConstant_CreateAnd(<16 x i8> %in0) {
+; CHECK-LABEL: @test_FoldShiftByConstant_CreateAnd(
+; CHECK-NEXT:    [[VSRA_N2:%.*]] = mul <16 x i8> %in0, <i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33, i8 33>
+; CHECK-NEXT:    [[VSHL_N:%.*]] = and <16 x i8> [[VSRA_N2]], <i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32, i8 -32>
+; CHECK-NEXT:    ret <16 x i8> [[VSHL_N]]
+;
   %vsra_n = ashr <16 x i8> %in0, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
   %tmp = add <16 x i8> %in0, %vsra_n
   %vshl_n = shl <16 x i8> %tmp, <i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5, i8 5>
   ret <16 x i8> %vshl_n
 }
 
-
 define i32 @bar(i32 %x, i32 %y) {
+; CHECK-LABEL: @bar(
+; CHECK-NEXT:    [[B1:%.*]] = shl i32 %y, 4
+; CHECK-NEXT:    [[A2:%.*]] = add i32 [[B1]], %x
+; CHECK-NEXT:    [[C:%.*]] = and i32 [[A2]], -16
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %a = lshr i32 %x, 4
   %b = add i32 %a, %y
   %c = shl i32 %b, 4
@@ -38,16 +46,25 @@ define i32 @bar(i32 %x, i32 %y) {
 }
 
 define <2 x i32> @bar_v2i32(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @bar_v2i32(
+; CHECK-NEXT:    [[B1:%.*]] = shl <2 x i32> %y, <i32 5, i32 5>
+; CHECK-NEXT:    [[A2:%.*]] = add <2 x i32> [[B1]], %x
+; CHECK-NEXT:    [[C:%.*]] = and <2 x i32> [[A2]], <i32 -32, i32 -32>
+; CHECK-NEXT:    ret <2 x i32> [[C]]
+;
   %a = lshr <2 x i32> %x, <i32 5, i32 5>
   %b = add <2 x i32> %a, %y
   %c = shl <2 x i32> %b, <i32 5, i32 5>
   ret <2 x i32> %c
 }
 
-
-
-
 define i32 @foo(i32 %x, i32 %y) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:    [[C1:%.*]] = shl i32 %y, 4
+; CHECK-NEXT:    [[X_MASK:%.*]] = and i32 %x, 128
+; CHECK-NEXT:    [[D:%.*]] = add i32 [[X_MASK]], [[C1]]
+; CHECK-NEXT:    ret i32 [[D]]
+;
   %a = lshr i32 %x, 4
   %b = and i32 %a, 8
   %c = add i32 %b, %y
@@ -56,6 +73,13 @@ define i32 @foo(i32 %x, i32 %y) {
 }
 
 define <2 x i32> @foo_v2i32(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @foo_v2i32(
+; CHECK-NEXT:    [[A:%.*]] = lshr <2 x i32> %x, <i32 4, i32 4>
+; CHECK-NEXT:    [[B:%.*]] = and <2 x i32> [[A]], <i32 8, i32 8>
+; CHECK-NEXT:    [[C:%.*]] = add <2 x i32> [[B]], %y
+; CHECK-NEXT:    [[D:%.*]] = shl <2 x i32> [[C]], <i32 4, i32 4>
+; CHECK-NEXT:    ret <2 x i32> [[D]]
+;
   %a = lshr <2 x i32> %x, <i32 4, i32 4>
   %b = and <2 x i32> %a, <i32 8, i32 8>
   %c = add <2 x i32> %b, %y
@@ -63,5 +87,3 @@ define <2 x i32> @foo_v2i32(<2 x i32> %x, <2 x i32> %y) {
   ret <2 x i32> %d
 }
 
-
-
diff --git a/test/Transforms/InstCombine/pr31990_wrong_memcpy.ll b/test/Transforms/InstCombine/pr31990_wrong_memcpy.ll
new file mode 100644
index 0000000000000000000000000000000000000000..62ecd0311ffd1aeec31a0b6efa8c50ea2d4d2dcb
--- /dev/null
+++ b/test/Transforms/InstCombine/pr31990_wrong_memcpy.ll
@@ -0,0 +1,26 @@
+; RUN: opt -S -instcombine %s -o - | FileCheck %s
+
+; Regression test of PR31990. A memcpy of one byte, copying 0xff, was
+; replaced with a single store of an i4 0xf.
+
+@g = constant i8 -1
+
+define void @foo() {
+entry:
+  %0 = alloca i8
+  %1 = bitcast i8* %0 to i4*
+  call void @bar(i4* %1)
+  %2 = bitcast i4* %1 to i8*
+  call void @llvm.memcpy.p0i8.p0i8.i32(i8* %2, i8* @g, i32 1, i32 1, i1 false)
+  call void @gaz(i8* %2)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture writeonly,
+                                        i8* nocapture readonly, i32, i32, i1)
+declare void @bar(i4*)
+declare void @gaz(i8*)
+
+; The memcpy should be simplified to a single store of an i8, not i4
+; CHECK: store i8 -1
+; CHECK-NOT: store i4 -1
diff --git a/test/Transforms/InstCombine/prefetch-load.ll b/test/Transforms/InstCombine/prefetch-load.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f98b7ae00bf1d712e126bba9f7494125871713b5
--- /dev/null
+++ b/test/Transforms/InstCombine/prefetch-load.ll
@@ -0,0 +1,34 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+%struct.C = type { %struct.C*, i32 }
+
+; Check that we instcombine the load across the prefetch.
+
+; CHECK-LABEL: define signext i32 @foo
+define signext i32 @foo(%struct.C* %c) local_unnamed_addr #0 {
+; CHECK: store i32 %dec, i32* %length_
+; CHECK-NOT: load
+; CHECK: llvm.prefetch
+; CHECK-NEXT: ret
+entry:
+  %next_ = getelementptr inbounds %struct.C, %struct.C* %c, i32 0, i32 0
+  %0 = load %struct.C*, %struct.C** %next_, align 8
+  %next_1 = getelementptr inbounds %struct.C, %struct.C* %0, i32 0, i32 0
+  %1 = load %struct.C*, %struct.C** %next_1, align 8
+  store %struct.C* %1, %struct.C** %next_, align 8
+  %length_ = getelementptr inbounds %struct.C, %struct.C* %c, i32 0, i32 1
+  %2 = load i32, i32* %length_, align 8
+  %dec = add nsw i32 %2, -1
+  store i32 %dec, i32* %length_, align 8
+  %3 = bitcast %struct.C* %1 to i8*
+  call void @llvm.prefetch(i8* %3, i32 0, i32 0, i32 1)
+  %4 = load i32, i32* %length_, align 8
+  ret i32 %4
+}
+
+; Function Attrs: inaccessiblemem_or_argmemonly nounwind
+declare void @llvm.prefetch(i8* nocapture readonly, i32, i32, i32) 
+
+attributes #0 = { noinline nounwind }
+; We've explicitly removed the function attrs from llvm.prefetch so we get the defaults.
+; attributes #1 = { inaccessiblemem_or_argmemonly nounwind }
diff --git a/test/Transforms/InstCombine/rem.ll b/test/Transforms/InstCombine/rem.ll
index 89a741c907074e371c2b43fd917828a8431da80c..7a7a134db9c5d42e716482491dab3bd49ca4536d 100644
--- a/test/Transforms/InstCombine/rem.ll
+++ b/test/Transforms/InstCombine/rem.ll
@@ -1,28 +1,169 @@
-; This test makes sure that rem instructions are properly eliminated.
-;
 ; RUN: opt < %s -instcombine -S | FileCheck %s
-; END.
+
+define i64 @rem_signed(i64 %x1, i64 %y2) {
+; CHECK-LABEL: @rem_signed(
+; CHECK-NEXT:    [[R:%.*]] = srem i64 %x1, %y2
+; CHECK-NEXT:    ret i64 [[R]]
+;
+  %r = sdiv i64 %x1, %y2
+  %r7 = mul i64 %r, %y2
+  %r8 = sub i64 %x1, %r7
+  ret i64 %r8
+}
+
+define <4 x i32> @rem_signed_vec(<4 x i32> %t, <4 x i32> %u) {
+; CHECK-LABEL: @rem_signed_vec(
+; CHECK-NEXT:    [[K:%.*]] = srem <4 x i32> %t, %u
+; CHECK-NEXT:    ret <4 x i32> [[K]]
+;
+  %k = sdiv <4 x i32> %t, %u
+  %l = mul <4 x i32> %k, %u
+  %m = sub <4 x i32> %t, %l
+  ret <4 x i32> %m
+}
+
+define i64 @rem_unsigned(i64 %x1, i64 %y2) {
+; CHECK-LABEL: @rem_unsigned(
+; CHECK-NEXT:    [[R:%.*]] = urem i64 %x1, %y2
+; CHECK-NEXT:    ret i64 [[R]]
+;
+  %r = udiv i64 %x1, %y2
+  %r7 = mul i64 %r, %y2
+  %r8 = sub i64 %x1, %r7
+  ret i64 %r8
+}
+
+; PR28672 - https://llvm.org/bugs/show_bug.cgi?id=28672
+
+define i8 @big_divisor(i8 %x) {
+; CHECK-LABEL: @big_divisor(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i8 %x, -127
+; CHECK-NEXT:    [[TMP2:%.*]] = add i8 %x, 127
+; CHECK-NEXT:    [[REM:%.*]] = select i1 [[TMP1]], i8 %x, i8 [[TMP2]]
+; CHECK-NEXT:    ret i8 [[REM]]
+;
+  %rem = urem i8 %x, 129
+  ret i8 %rem
+}
+
+define i5 @biggest_divisor(i5 %x) {
+; CHECK-LABEL: @biggest_divisor(
+; CHECK-NEXT:    [[NOT_:%.*]] = icmp eq i5 %x, -1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i1 [[NOT_]] to i5
+; CHECK-NEXT:    [[REM:%.*]] = add i5 [[TMP1]], %x
+; CHECK-NEXT:    ret i5 [[REM]]
+;
+  %rem = urem i5 %x, -1
+  ret i5 %rem
+}
+
+define <2 x i4> @big_divisor_vec(<2 x i4> %x) {
+; CHECK-LABEL: @big_divisor_vec(
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i4> [[X:%.*]], <i4 -3, i4 -3>
+; CHECK-NEXT:    [[TMP2:%.*]] = add <2 x i4> [[X]], <i4 3, i4 3>
+; CHECK-NEXT:    [[REM:%.*]] = select <2 x i1> [[TMP1]], <2 x i4> [[X]], <2 x i4> [[TMP2]]
+; CHECK-NEXT:    ret <2 x i4> [[REM]]
+;
+  %rem = urem <2 x i4> %x, <i4 13, i4 13>
+  ret <2 x i4> %rem
+}
+
+define i8 @urem1(i8 %x, i8 %y) {
+; CHECK-LABEL: @urem1(
+; CHECK-NEXT:    [[A:%.*]] = urem i8 %x, %y
+; CHECK-NEXT:    ret i8 [[A]]
+;
+  %A = udiv i8 %x, %y
+  %B = mul i8 %A, %y
+  %C = sub i8 %x, %B
+  ret i8 %C
+}
+
+define i8 @srem1(i8 %x, i8 %y) {
+; CHECK-LABEL: @srem1(
+; CHECK-NEXT:    [[A:%.*]] = srem i8 %x, %y
+; CHECK-NEXT:    ret i8 [[A]]
+;
+  %A = sdiv i8 %x, %y
+  %B = mul i8 %A, %y
+  %C = sub i8 %x, %B
+  ret i8 %C
+}
+
+define i8 @urem2(i8 %x, i8 %y) {
+; CHECK-LABEL: @urem2(
+; CHECK-NEXT:    [[A:%.*]] = urem i8 %x, %y
+; CHECK-NEXT:    [[C:%.*]] = sub i8 0, [[A]]
+; CHECK-NEXT:    ret i8 [[C]]
+;
+  %A = udiv i8 %x, %y
+  %B = mul i8 %A, %y
+  %C = sub i8 %B, %x
+  ret i8 %C
+}
+
+define i8 @urem3(i8 %x) {
+; CHECK-LABEL: @urem3(
+; CHECK-NEXT:    [[A:%.*]] = urem i8 %x, 3
+; CHECK-NEXT:    [[B1:%.*]] = sub i8 %x, [[A]]
+; CHECK-NEXT:    [[C:%.*]] = add i8 [[B1]], %x
+; CHECK-NEXT:    ret i8 [[C]]
+;
+  %A = udiv i8 %x, 3
+  %B = mul i8 %A, -3
+  %C = sub i8 %x, %B
+  ret i8 %C
+}
+
+; (((X / Y) * Y) / Y) -> X / Y
+
+define i32 @sdiv_mul_sdiv(i32 %x, i32 %y) {
+; CHECK-LABEL: @sdiv_mul_sdiv(
+; CHECK-NEXT:    [[R:%.*]] = sdiv i32 %x, %y
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %div = sdiv i32 %x, %y
+  %mul = mul i32 %div, %y
+  %r = sdiv i32 %mul, %y
+  ret i32 %r
+}
+
+; (((X / Y) * Y) / Y) -> X / Y
+
+define i32 @udiv_mul_udiv(i32 %x, i32 %y) {
+; CHECK-LABEL: @udiv_mul_udiv(
+; CHECK-NEXT:    [[R:%.*]] = udiv i32 %x, %y
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %div = udiv i32 %x, %y
+  %mul = mul i32 %div, %y
+  %r = udiv i32 %mul, %y
+  ret i32 %r
+}
 
 define i32 @test1(i32 %A) {
 ; CHECK-LABEL: @test1(
-; CHECK-NEXT: ret i32 0
-	%B = srem i32 %A, 1	; ISA constant 0
-	ret i32 %B
+; CHECK-NEXT:    ret i32 0
+;
+  %B = srem i32 %A, 1	; ISA constant 0
+  ret i32 %B
 }
 
 define i32 @test2(i32 %A) {	; 0 % X = 0, we don't need to preserve traps
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT: ret i32 0
-	%B = srem i32 0, %A
-	ret i32 %B
+; CHECK-NEXT:    ret i32 0
+;
+  %B = srem i32 0, %A
+  ret i32 %B
 }
 
 define i32 @test3(i32 %A) {
 ; CHECK-LABEL: @test3(
-; CHECK-NEXT: [[AND:%.*]] = and i32 %A, 7
-; CHECK-NEXT: ret i32 [[AND]]
-	%B = urem i32 %A, 8
-	ret i32 %B
+; CHECK-NEXT:    [[B:%.*]] = and i32 %A, 7
+; CHECK-NEXT:    ret i32 [[B]]
+;
+  %B = urem i32 %A, 8
+  ret i32 %B
 }
 
 define <2 x i32> @vec_power_of_2_constant_splat_divisor(<2 x i32> %A) {
@@ -45,12 +186,13 @@ define <2 x i19> @weird_vec_power_of_2_constant_splat_divisor(<2 x i19> %A) {
 
 define i1 @test3a(i32 %A) {
 ; CHECK-LABEL: @test3a(
-; CHECK-NEXT: [[AND:%.*]] = and i32 %A, 7
-; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0
-; CHECK-NEXT: ret i1 [[CMP]]
-	%B = srem i32 %A, -8
-	%C = icmp ne i32 %B, 0
-	ret i1 %C
+; CHECK-NEXT:    [[B1:%.*]] = and i32 %A, 7
+; CHECK-NEXT:    [[C:%.*]] = icmp ne i32 [[B1]], 0
+; CHECK-NEXT:    ret i1 [[C]]
+;
+  %B = srem i32 %A, -8
+  %C = icmp ne i32 %B, 0
+  ret i1 %C
 }
 
 define <2 x i1> @test3a_vec(<2 x i32> %A) {
@@ -66,201 +208,221 @@ define <2 x i1> @test3a_vec(<2 x i32> %A) {
 
 define i32 @test4(i32 %X, i1 %C) {
 ; CHECK-LABEL: @test4(
-; CHECK-NEXT: [[SEL:%.*]] = select i1 %C, i32 0, i32 7
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[SEL]], %X
-	%V = select i1 %C, i32 1, i32 8
-	%R = urem i32 %X, %V
-	ret i32 %R
+; CHECK-NEXT:    [[TMP1:%.*]] = select i1 %C, i32 0, i32 7
+; CHECK-NEXT:    [[R:%.*]] = and i32 [[TMP1]], %X
+; CHECK-NEXT:    ret i32 [[R]]
+;
+  %V = select i1 %C, i32 1, i32 8
+  %R = urem i32 %X, %V
+  ret i32 %R
 }
 
 define i32 @test5(i32 %X, i8 %B) {
 ; CHECK-LABEL: @test5(
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i8 %B to i32
-; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 32, [[ZEXT]]
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SHL]], -1
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[ADD]], %X
-; CHECK-NEXT: ret i32 [[AND]]
-	%shift.upgrd.1 = zext i8 %B to i32
-	%Amt = shl i32 32, %shift.upgrd.1
-	%V = urem i32 %X, %Amt
-	ret i32 %V
+; CHECK-NEXT:    [[SHIFT_UPGRD_1:%.*]] = zext i8 %B to i32
+; CHECK-NEXT:    [[AMT:%.*]] = shl nuw i32 32, [[SHIFT_UPGRD_1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[AMT]], -1
+; CHECK-NEXT:    [[V:%.*]] = and i32 [[TMP1]], %X
+; CHECK-NEXT:    ret i32 [[V]]
+;
+  %shift.upgrd.1 = zext i8 %B to i32
+  %Amt = shl i32 32, %shift.upgrd.1
+  %V = urem i32 %X, %Amt
+  ret i32 %V
 }
 
 define i32 @test6(i32 %A) {
 ; CHECK-LABEL: @test6(
-; CHECK-NEXT: ret i32 undef
-	%B = srem i32 %A, 0	;; undef
-	ret i32 %B
+; CHECK-NEXT:    ret i32 undef
+;
+  %B = srem i32 %A, 0	;; undef
+  ret i32 %B
 }
 
 define i32 @test7(i32 %A) {
 ; CHECK-LABEL: @test7(
-; CHECK-NEXT: ret i32 0
-	%B = mul i32 %A, 8
-	%C = srem i32 %B, 4
-	ret i32 %C
+; CHECK-NEXT:    ret i32 0
+;
+  %B = mul i32 %A, 8
+  %C = srem i32 %B, 4
+  ret i32 %C
 }
 
 define i32 @test8(i32 %A) {
 ; CHECK-LABEL: @test8(
-; CHECK-NEXT: ret i32 0
-	%B = shl i32 %A, 4
-	%C = srem i32 %B, 8
-	ret i32 %C
+; CHECK-NEXT:    ret i32 0
+;
+  %B = shl i32 %A, 4
+  %C = srem i32 %B, 8
+  ret i32 %C
 }
 
 define i32 @test9(i32 %A) {
 ; CHECK-LABEL: @test9(
-; CHECK-NEXT: ret i32 0
-	%B = mul i32 %A, 64
-	%C = urem i32 %B, 32
-	ret i32 %C
+; CHECK-NEXT:    ret i32 0
+;
+  %B = mul i32 %A, 64
+  %C = urem i32 %B, 32
+  ret i32 %C
 }
 
 define i32 @test10(i8 %c) {
 ; CHECK-LABEL: @test10(
-; CHECK-NEXT: ret i32 0
-	%tmp.1 = zext i8 %c to i32
-	%tmp.2 = mul i32 %tmp.1, 4
-	%tmp.3 = sext i32 %tmp.2 to i64
-	%tmp.5 = urem i64 %tmp.3, 4
-	%tmp.6 = trunc i64 %tmp.5 to i32
-	ret i32 %tmp.6
+; CHECK-NEXT:    ret i32 0
+;
+  %tmp.1 = zext i8 %c to i32
+  %tmp.2 = mul i32 %tmp.1, 4
+  %tmp.3 = sext i32 %tmp.2 to i64
+  %tmp.5 = urem i64 %tmp.3, 4
+  %tmp.6 = trunc i64 %tmp.5 to i32
+  ret i32 %tmp.6
 }
 
 define i32 @test11(i32 %i) {
 ; CHECK-LABEL: @test11(
-; CHECK-NEXT: ret i32 0
-	%tmp.1 = and i32 %i, -2
-	%tmp.3 = mul i32 %tmp.1, 2
-	%tmp.5 = urem i32 %tmp.3, 4
-	ret i32 %tmp.5
+; CHECK-NEXT:    ret i32 0
+;
+  %tmp.1 = and i32 %i, -2
+  %tmp.3 = mul i32 %tmp.1, 2
+  %tmp.5 = urem i32 %tmp.3, 4
+  ret i32 %tmp.5
 }
 
 define i32 @test12(i32 %i) {
 ; CHECK-LABEL: @test12(
-; CHECK-NEXT: ret i32 0
-	%tmp.1 = and i32 %i, -4
-	%tmp.5 = srem i32 %tmp.1, 2
-	ret i32 %tmp.5
+; CHECK-NEXT:    ret i32 0
+;
+  %tmp.1 = and i32 %i, -4
+  %tmp.5 = srem i32 %tmp.1, 2
+  ret i32 %tmp.5
 }
 
 define i32 @test13(i32 %i) {
 ; CHECK-LABEL: @test13(
-; CHECK-NEXT: ret i32 0
-	%x = srem i32 %i, %i
-	ret i32 %x
+; CHECK-NEXT:    ret i32 0
+;
+  %x = srem i32 %i, %i
+  ret i32 %x
 }
 
 define i64 @test14(i64 %x, i32 %y) {
 ; CHECK-LABEL: @test14(
-; CHECK-NEXT: [[SHL:%.*]] = shl i32 1, %y
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SHL]] to i64
-; CHECK-NEXT: [[ADD:%.*]] = add nsw i64 [[ZEXT]], -1
-; CHECK-NEXT: [[AND:%.*]] = and i64 [[ADD]], %x
-; CHECK-NEXT: ret i64 [[AND]]
-	%shl = shl i32 1, %y
-	%zext = zext i32 %shl to i64
-	%urem = urem i64 %x, %zext
-	ret i64 %urem
+; CHECK-NEXT:    [[SHL:%.*]] = shl i32 1, %y
+; CHECK-NEXT:    [[ZEXT:%.*]] = zext i32 [[SHL]] to i64
+; CHECK-NEXT:    [[TMP1:%.*]] = add nsw i64 [[ZEXT]], -1
+; CHECK-NEXT:    [[UREM:%.*]] = and i64 [[TMP1]], %x
+; CHECK-NEXT:    ret i64 [[UREM]]
+;
+  %shl = shl i32 1, %y
+  %zext = zext i32 %shl to i64
+  %urem = urem i64 %x, %zext
+  ret i64 %urem
 }
 
 define i64 @test15(i32 %x, i32 %y) {
 ; CHECK-LABEL: @test15(
-; CHECK-NEXT: [[SHL:%.*]] = shl nuw i32 1, %y
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[SHL]], -1
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[ADD]], %x
-; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[AND]] to i64
-; CHECK-NEXT: ret i64 [[ZEXT]]
-	%shl = shl i32 1, %y
-	%zext0 = zext i32 %shl to i64
-	%zext1 = zext i32 %x to i64
-	%urem = urem i64 %zext1, %zext0
-	ret i64 %urem
+; CHECK-NEXT:    [[SHL:%.*]] = shl nuw i32 1, %y
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[SHL]], -1
+; CHECK-NEXT:    [[TMP2:%.*]] = and i32 [[TMP1]], %x
+; CHECK-NEXT:    [[UREM:%.*]] = zext i32 [[TMP2]] to i64
+; CHECK-NEXT:    ret i64 [[UREM]]
+;
+  %shl = shl i32 1, %y
+  %zext0 = zext i32 %shl to i64
+  %zext1 = zext i32 %x to i64
+  %urem = urem i64 %zext1, %zext0
+  ret i64 %urem
 }
 
 define i32 @test16(i32 %x, i32 %y) {
 ; CHECK-LABEL: @test16(
-; CHECK-NEXT: [[SHR:%.*]] = lshr i32 %y, 11
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[SHR]], 4
-; CHECK-NEXT: [[OR:%.*]] = or i32 [[AND]], 3
-; CHECK-NEXT: [[REM:%.*]] = and i32 [[OR]], %x
-; CHECK-NEXT: ret i32 [[REM]]
-	%shr = lshr i32 %y, 11
-	%and = and i32 %shr, 4
-	%add = add i32 %and, 4
-	%rem = urem i32 %x, %add
-	ret i32 %rem
+; CHECK-NEXT:    [[SHR:%.*]] = lshr i32 %y, 11
+; CHECK-NEXT:    [[AND:%.*]] = and i32 [[SHR]], 4
+; CHECK-NEXT:    [[TMP1:%.*]] = or i32 [[AND]], 3
+; CHECK-NEXT:    [[REM:%.*]] = and i32 [[TMP1]], %x
+; CHECK-NEXT:    ret i32 [[REM]]
+;
+  %shr = lshr i32 %y, 11
+  %and = and i32 %shr, 4
+  %add = add i32 %and, 4
+  %rem = urem i32 %x, %add
+  ret i32 %rem
 }
 
 define i32 @test17(i32 %X) {
 ; CHECK-LABEL: @test17(
-; CHECK-NEXT: icmp ne i32 %X, 1
-; CHECK-NEXT: zext i1
-; CHECK-NEXT: ret
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i32 %X, 1
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i1 [[TMP1]] to i32
+; CHECK-NEXT:    ret i32 [[TMP2]]
+;
   %A = urem i32 1, %X
   ret i32 %A
 }
 
 define i32 @test18(i16 %x, i32 %y) {
-; CHECK: @test18
-; CHECK-NEXT: [[SHL:%.*]] = shl i16 %x, 3
-; CHECK-NEXT: [[AND:%.*]] = and i16 [[SHL]], 32
-; CHECK-NEXT: [[XOR:%.*]] = xor i16 [[AND]], 63
-; CHECK-NEXT: [[EXT:%.*]] = zext i16 [[XOR]] to i32
-; CHECK-NEXT: [[REM:%.*]] = and i32 [[EXT]], %y
-; CHECK-NEXT: ret i32 [[REM]]
-	%1 = and i16 %x, 4
-	%2 = icmp ne i16 %1, 0
-	%3 = select i1 %2, i32 32, i32 64
-	%4 = urem i32 %y, %3
-	ret i32 %4
+; CHECK-LABEL: @test18(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i16 %x, 3
+; CHECK-NEXT:    [[TMP2:%.*]] = and i16 [[TMP1]], 32
+; CHECK-NEXT:    [[TMP3:%.*]] = xor i16 [[TMP2]], 63
+; CHECK-NEXT:    [[TMP4:%.*]] = zext i16 [[TMP3]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = and i32 [[TMP4]], %y
+; CHECK-NEXT:    ret i32 [[TMP5]]
+;
+  %1 = and i16 %x, 4
+  %2 = icmp ne i16 %1, 0
+  %3 = select i1 %2, i32 32, i32 64
+  %4 = urem i32 %y, %3
+  ret i32 %4
 }
 
 define i32 @test19(i32 %x, i32 %y) {
-; CHECK: @test19
-; CHECK-NEXT: [[SHL1:%.*]] = shl i32 1, %x
-; CHECK-NEXT: [[SHL2:%.*]] = shl i32 1, %y
-; CHECK-NEXT: [[AND:%.*]] = and i32 [[SHL1]], [[SHL2]]
-; CHECK-NEXT: [[ADD:%.*]] = add i32 [[AND]], [[SHL1]]
-; CHECK-NEXT: [[SUB:%.*]] = add i32 [[ADD]], -1
-; CHECK-NEXT: [[REM:%.*]] = and i32 [[SUB]], %y
-; CHECK-NEXT: ret i32 [[REM]]
-	%A = shl i32 1, %x
-	%B = shl i32 1, %y
-	%C = and i32 %A, %B
-	%D = add i32 %C, %A
-	%E = urem i32 %y, %D
-	ret i32 %E
+; CHECK-LABEL: @test19(
+; CHECK-NEXT:    [[A:%.*]] = shl i32 1, %x
+; CHECK-NEXT:    [[B:%.*]] = shl i32 1, %y
+; CHECK-NEXT:    [[C:%.*]] = and i32 [[A]], [[B]]
+; CHECK-NEXT:    [[D:%.*]] = add i32 [[C]], [[A]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i32 [[D]], -1
+; CHECK-NEXT:    [[E:%.*]] = and i32 [[TMP1]], %y
+; CHECK-NEXT:    ret i32 [[E]]
+;
+  %A = shl i32 1, %x
+  %B = shl i32 1, %y
+  %C = and i32 %A, %B
+  %D = add i32 %C, %A
+  %E = urem i32 %y, %D
+  ret i32 %E
 }
 
 define <2 x i64> @test20(<2 x i64> %X, <2 x i1> %C) {
 ; CHECK-LABEL: @test20(
-; CHECK-NEXT: select <2 x i1> %C, <2 x i64> <i64 1, i64 2>, <2 x i64> zeroinitializer
-; CHECK-NEXT: ret <2 x i64>
-	%V = select <2 x i1> %C, <2 x i64> <i64 1, i64 2>, <2 x i64> <i64 8, i64 9>
-	%R = urem <2 x i64> %V, <i64 2, i64 3>
-	ret <2 x i64> %R
+; CHECK-NEXT:    [[R:%.*]] = select <2 x i1> %C, <2 x i64> <i64 1, i64 2>, <2 x i64> zeroinitializer
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %V = select <2 x i1> %C, <2 x i64> <i64 1, i64 2>, <2 x i64> <i64 8, i64 9>
+  %R = urem <2 x i64> %V, <i64 2, i64 3>
+  ret <2 x i64> %R
 }
 
-define i32 @test21(i1 %c0, i32* %val) {
+define i32 @test21(i1 %c0, i32* %p) {
 ; CHECK-LABEL: @test21(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 %c0, label %if.then, label %if.end
+; CHECK:       if.then:
+; CHECK-NEXT:    [[V:%.*]] = load volatile i32, i32* %p, align 4
+; CHECK-NEXT:    [[PHITMP:%.*]] = srem i32 [[V]], 5
+; CHECK-NEXT:    br label %if.end
+; CHECK:       if.end:
+; CHECK-NEXT:    [[LHS:%.*]] = phi i32 [ [[PHITMP]], %if.then ], [ 0, %entry ]
+; CHECK-NEXT:    ret i32 [[LHS]]
+;
 entry:
   br i1 %c0, label %if.then, label %if.end
 
 if.then:
-; CHECK: if.then:
-; CHECK-NEXT:  %v = load volatile i32, i32* %val, align 4
-; CHECK-NEXT:  %phitmp = srem i32 %v, 5
-
-  %v = load volatile i32, i32* %val
+  %v = load volatile i32, i32* %p
   br label %if.end
 
 if.end:
-; CHECK: if.end:
-; CHECK-NEXT:  %lhs = phi i32 [ %phitmp, %if.then ], [ 0, %entry ]
-; CHECK-NEXT:  ret i32 %lhs
-
   %lhs = phi i32 [ %v, %if.then ], [ 5, %entry ]
   %rem = srem i32 %lhs, 5
   ret i32 %rem
@@ -269,28 +431,34 @@ if.end:
 @a = common global [5 x i16] zeroinitializer, align 2
 @b = common global i16 0, align 2
 
-define i32 @pr27968_0(i1 %c0, i32* %val) {
+define i32 @pr27968_0(i1 %c0, i32* %p) {
 ; CHECK-LABEL: @pr27968_0(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 %c0, label %if.then, label %if.end
+; CHECK:       if.then:
+; CHECK-NEXT:    [[V:%.*]] = load volatile i32, i32* %p, align 4
+; CHECK-NEXT:    br label %if.end
+; CHECK:       if.end:
+; CHECK-NEXT:    [[LHS:%.*]] = phi i32 [ [[V]], %if.then ], [ 5, %entry ]
+; CHECK-NEXT:    br i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b), label %rem.is.safe, label %rem.is.unsafe
+; CHECK:       rem.is.safe:
+; CHECK-NEXT:    [[REM:%.*]] = srem i32 [[LHS]], zext (i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b) to i32)
+; CHECK-NEXT:    ret i32 [[REM]]
+; CHECK:       rem.is.unsafe:
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   br i1 %c0, label %if.then, label %if.end
 
 if.then:
-  %v = load volatile i32, i32* %val
+  %v = load volatile i32, i32* %p
   br label %if.end
 
-; CHECK: if.then:
-; CHECK-NOT: srem
-; CHECK:  br label %if.end
-
 if.end:
   %lhs = phi i32 [ %v, %if.then ], [ 5, %entry ]
   br i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b), label %rem.is.safe, label %rem.is.unsafe
 
 rem.is.safe:
-; CHECK: rem.is.safe:
-; CHECK-NEXT:  %rem = srem i32 %lhs, zext (i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b) to i32)
-; CHECK-NEXT:  ret i32 %rem
-
   %rem = srem i32 %lhs, zext (i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b) to i32)
   ret i32 %rem
 
@@ -298,19 +466,29 @@ rem.is.unsafe:
   ret i32 0
 }
 
-define i32 @pr27968_1(i1 %c0, i1 %always_false, i32* %val) {
+define i32 @pr27968_1(i1 %c0, i1 %always_false, i32* %p) {
 ; CHECK-LABEL: @pr27968_1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 %c0, label %if.then, label %if.end
+; CHECK:       if.then:
+; CHECK-NEXT:    [[V:%.*]] = load volatile i32, i32* %p, align 4
+; CHECK-NEXT:    br label %if.end
+; CHECK:       if.end:
+; CHECK-NEXT:    [[LHS:%.*]] = phi i32 [ [[V]], %if.then ], [ 5, %entry ]
+; CHECK-NEXT:    br i1 %always_false, label %rem.is.safe, label %rem.is.unsafe
+; CHECK:       rem.is.safe:
+; CHECK-NEXT:    [[REM:%.*]] = srem i32 [[LHS]], -2147483648
+; CHECK-NEXT:    ret i32 [[REM]]
+; CHECK:       rem.is.unsafe:
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   br i1 %c0, label %if.then, label %if.end
 
 if.then:
-  %v = load volatile i32, i32* %val
+  %v = load volatile i32, i32* %p
   br label %if.end
 
-; CHECK: if.then:
-; CHECK-NOT: srem
-; CHECK:  br label %if.end
-
 if.end:
   %lhs = phi i32 [ %v, %if.then ], [ 5, %entry ]
   br i1 %always_false, label %rem.is.safe, label %rem.is.unsafe
@@ -319,36 +497,38 @@ rem.is.safe:
   %rem = srem i32 %lhs, -2147483648
   ret i32 %rem
 
-; CHECK: rem.is.safe:
-; CHECK-NEXT:  %rem = srem i32 %lhs, -2147483648
-; CHECK-NEXT:  ret i32 %rem
-
 rem.is.unsafe:
   ret i32 0
 }
 
-define i32 @pr27968_2(i1 %c0, i32* %val) {
+define i32 @pr27968_2(i1 %c0, i32* %p) {
 ; CHECK-LABEL: @pr27968_2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 %c0, label %if.then, label %if.end
+; CHECK:       if.then:
+; CHECK-NEXT:    [[V:%.*]] = load volatile i32, i32* %p, align 4
+; CHECK-NEXT:    br label %if.end
+; CHECK:       if.end:
+; CHECK-NEXT:    [[LHS:%.*]] = phi i32 [ [[V]], %if.then ], [ 5, %entry ]
+; CHECK-NEXT:    br i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b), label %rem.is.safe, label %rem.is.unsafe
+; CHECK:       rem.is.safe:
+; CHECK-NEXT:    [[REM:%.*]] = urem i32 [[LHS]], zext (i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b) to i32)
+; CHECK-NEXT:    ret i32 [[REM]]
+; CHECK:       rem.is.unsafe:
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   br i1 %c0, label %if.then, label %if.end
 
 if.then:
-  %v = load volatile i32, i32* %val
+  %v = load volatile i32, i32* %p
   br label %if.end
 
-; CHECK: if.then:
-; CHECK-NOT: urem
-; CHECK:  br label %if.end
-
 if.end:
   %lhs = phi i32 [ %v, %if.then ], [ 5, %entry ]
   br i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b), label %rem.is.safe, label %rem.is.unsafe
 
 rem.is.safe:
-; CHECK: rem.is.safe:
-; CHECK-NEXT:  %rem = urem i32 %lhs, zext (i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b) to i32)
-; CHECK-NEXT:  ret i32 %rem
-
   %rem = urem i32 %lhs, zext (i1 icmp eq (i16* getelementptr inbounds ([5 x i16], [5 x i16]* @a, i64 0, i64 4), i16* @b) to i32)
   ret i32 %rem
 
@@ -356,20 +536,29 @@ rem.is.unsafe:
   ret i32 0
 }
 
-define i32 @pr27968_3(i1 %c0, i1 %always_false, i32* %val) {
+define i32 @pr27968_3(i1 %c0, i1 %always_false, i32* %p) {
 ; CHECK-LABEL: @pr27968_3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 %c0, label %if.then, label %if.end
+; CHECK:       if.then:
+; CHECK-NEXT:    [[V:%.*]] = load volatile i32, i32* %p, align 4
+; CHECK-NEXT:    [[PHITMP:%.*]] = and i32 [[V]], 2147483647
+; CHECK-NEXT:    br label %if.end
+; CHECK:       if.end:
+; CHECK-NEXT:    [[LHS:%.*]] = phi i32 [ [[PHITMP]], %if.then ], [ 5, %entry ]
+; CHECK-NEXT:    br i1 %always_false, label %rem.is.safe, label %rem.is.unsafe
+; CHECK:       rem.is.safe:
+; CHECK-NEXT:    ret i32 [[LHS]]
+; CHECK:       rem.is.unsafe:
+; CHECK-NEXT:    ret i32 0
+;
 entry:
   br i1 %c0, label %if.then, label %if.end
 
 if.then:
-  %v = load volatile i32, i32* %val
+  %v = load volatile i32, i32* %p
   br label %if.end
 
-; CHECK: if.then:
-; CHECK-NEXT:  %v = load volatile i32, i32* %val, align 4
-; CHECK-NEXT:  %phitmp = and i32 %v, 2147483647
-; CHECK-NEXT:  br label %if.end
-
 if.end:
   %lhs = phi i32 [ %v, %if.then ], [ 5, %entry ]
   br i1 %always_false, label %rem.is.safe, label %rem.is.unsafe
@@ -381,3 +570,4 @@ rem.is.safe:
 rem.is.unsafe:
   ret i32 0
 }
+
diff --git a/test/Transforms/InstCombine/select-cmp-br.ll b/test/Transforms/InstCombine/select-cmp-br.ll
index 1dc7e153f5fb0da41e55036696f27899c5e09013..59384ab7b1f02779544ab08ff8272bea5e9feab6 100644
--- a/test/Transforms/InstCombine/select-cmp-br.ll
+++ b/test/Transforms/InstCombine/select-cmp-br.ll
@@ -1,155 +1,263 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; Replace a 'select' with 'or' in 'select - cmp [eq|ne] - br' sequence
 ; RUN: opt -instcombine -S < %s | FileCheck %s
 
-%C = type <{ %struct.S }>
 %struct.S = type { i64*, i32, i32 }
+%C = type <{ %struct.S }>
 
-declare void @bar(%struct.S *) #1
+declare void @bar(%struct.S*)
 declare void @foobar()
 
-define void @test1(%C*) {
+define void @test1(%C* %arg) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[C:%.*]], %C* [[ARG:%.*]], i64 0, i32 0, i32 0
+; CHECK-NEXT:    [[M:%.*]] = load i64*, i64** [[TMP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[C]], %C* [[ARG]], i64 1, i32 0, i32 0
+; CHECK-NEXT:    [[N:%.*]] = load i64*, i64** [[TMP1]], align 8
+; CHECK-NEXT:    [[NOT_TMP5:%.*]] = icmp ne i64* [[M]], [[N]]
+; CHECK-NEXT:    [[TMP71:%.*]] = icmp eq %C* [[ARG]], null
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP71]], [[NOT_TMP5]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[BB10:%.*]], label [[BB8:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    ret void
+; CHECK:       bb8:
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[C]], %C* [[ARG]], i64 0, i32 0
+; CHECK-NEXT:    tail call void @bar(%struct.S* [[TMP9]])
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb10:
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[M]], i64 9
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[TMP2]] to i64 (%C*)**
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64 (%C*)*, i64 (%C*)** [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call i64 [[TMP4]](%C* [[ARG]])
+; CHECK-NEXT:    br label [[BB]]
+;
 entry:
-  %1 = getelementptr inbounds %C, %C* %0, i64 0, i32 0, i32 0
-  %m = load i64*, i64** %1, align 8
-  %2 = getelementptr inbounds %C, %C* %0, i64 1, i32 0, i32 0
-  %n = load i64*, i64** %2, align 8
-  %3 = getelementptr inbounds i64, i64* %m, i64 9
-  %4 = bitcast i64* %3 to i64 (%C*)**
-  %5 = load i64 (%C*)*, i64 (%C*)** %4, align 8
-  %6 = icmp eq i64* %m, %n
-  %7 = select i1 %6, %C* %0, %C* null
-  %8 = icmp eq %C* %7, null
-  br i1 %8, label %12, label %10
-
-; <label>:9                                       ; preds = %10, %12
+  %tmp = getelementptr inbounds %C, %C* %arg, i64 0, i32 0, i32 0
+  %m = load i64*, i64** %tmp, align 8
+  %tmp1 = getelementptr inbounds %C, %C* %arg, i64 1, i32 0, i32 0
+  %n = load i64*, i64** %tmp1, align 8
+  %tmp2 = getelementptr inbounds i64, i64* %m, i64 9
+  %tmp3 = bitcast i64* %tmp2 to i64 (%C*)**
+  %tmp4 = load i64 (%C*)*, i64 (%C*)** %tmp3, align 8
+  %tmp5 = icmp eq i64* %m, %n
+  %tmp6 = select i1 %tmp5, %C* %arg, %C* null
+  %tmp7 = icmp eq %C* %tmp6, null
+  br i1 %tmp7, label %bb10, label %bb8
+
+bb:                                               ; preds = %bb10, %bb8
   ret void
 
-; <label>:10                                      ; preds = %entry
-  %11 = getelementptr inbounds %C, %C* %7, i64 0, i32 0
-  tail call void @bar(%struct.S* %11)
-  br label %9
+bb8:                                              ; preds = %entry
+  %tmp9 = getelementptr inbounds %C, %C* %tmp6, i64 0, i32 0
+  tail call void @bar(%struct.S* %tmp9)
+  br label %bb
 
-; <label>:12                                      ; preds = %entry
-  %13 = tail call i64 %5(%C* %0)
-  br label %9
-; CHECK-LABEL: @test1(
-; CHECK-NOT: select
-; CHECK: or
-; CHECK-NOT: select
+bb10:                                             ; preds = %entry
+  %tmp11 = tail call i64 %tmp4(%C* %arg)
+  br label %bb
 }
 
-define void @test2(%C*) {
+define void @test2(%C* %arg) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[C:%.*]], %C* [[ARG:%.*]], i64 0, i32 0, i32 0
+; CHECK-NEXT:    [[M:%.*]] = load i64*, i64** [[TMP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[C]], %C* [[ARG]], i64 1, i32 0, i32 0
+; CHECK-NEXT:    [[N:%.*]] = load i64*, i64** [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64* [[M]], [[N]]
+; CHECK-NEXT:    [[TMP71:%.*]] = icmp eq %C* [[ARG]], null
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP71]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[BB10:%.*]], label [[BB8:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    ret void
+; CHECK:       bb8:
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[C]], %C* [[ARG]], i64 0, i32 0
+; CHECK-NEXT:    tail call void @bar(%struct.S* [[TMP9]])
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb10:
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[M]], i64 9
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[TMP2]] to i64 (%C*)**
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64 (%C*)*, i64 (%C*)** [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call i64 [[TMP4]](%C* [[ARG]])
+; CHECK-NEXT:    br label [[BB]]
+;
 entry:
-  %1 = getelementptr inbounds %C, %C* %0, i64 0, i32 0, i32 0
-  %m = load i64*, i64** %1, align 8
-  %2 = getelementptr inbounds %C, %C* %0, i64 1, i32 0, i32 0
-  %n = load i64*, i64** %2, align 8
-  %3 = getelementptr inbounds i64, i64* %m, i64 9
-  %4 = bitcast i64* %3 to i64 (%C*)**
-  %5 = load i64 (%C*)*, i64 (%C*)** %4, align 8
-  %6 = icmp eq i64* %m, %n
-  %7 = select i1 %6, %C* null, %C* %0
-  %8 = icmp eq %C* %7, null
-  br i1 %8, label %12, label %10
-
-; <label>:9                                       ; preds = %10, %12
+  %tmp = getelementptr inbounds %C, %C* %arg, i64 0, i32 0, i32 0
+  %m = load i64*, i64** %tmp, align 8
+  %tmp1 = getelementptr inbounds %C, %C* %arg, i64 1, i32 0, i32 0
+  %n = load i64*, i64** %tmp1, align 8
+  %tmp2 = getelementptr inbounds i64, i64* %m, i64 9
+  %tmp3 = bitcast i64* %tmp2 to i64 (%C*)**
+  %tmp4 = load i64 (%C*)*, i64 (%C*)** %tmp3, align 8
+  %tmp5 = icmp eq i64* %m, %n
+  %tmp6 = select i1 %tmp5, %C* null, %C* %arg
+  %tmp7 = icmp eq %C* %tmp6, null
+  br i1 %tmp7, label %bb10, label %bb8
+
+bb:                                               ; preds = %bb10, %bb8
   ret void
 
-; <label>:10                                      ; preds = %entry
-  %11 = getelementptr inbounds %C, %C* %7, i64 0, i32 0
-  tail call void @bar(%struct.S* %11)
-  br label %9
+bb8:                                              ; preds = %entry
+  %tmp9 = getelementptr inbounds %C, %C* %tmp6, i64 0, i32 0
+  tail call void @bar(%struct.S* %tmp9)
+  br label %bb
 
-; <label>:12                                      ; preds = %entry
-  %13 = tail call i64 %5(%C* %0)
-  br label %9
-; CHECK-LABEL: @test2(
-; CHECK-NOT: select
-; CHECK: or
-; CHECK-NOT: select
+bb10:                                             ; preds = %entry
+  %tmp11 = tail call i64 %tmp4(%C* %arg)
+  br label %bb
 }
 
-define void @test3(%C*) {
+define void @test3(%C* %arg) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[C:%.*]], %C* [[ARG:%.*]], i64 0, i32 0, i32 0
+; CHECK-NEXT:    [[M:%.*]] = load i64*, i64** [[TMP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[C]], %C* [[ARG]], i64 1, i32 0, i32 0
+; CHECK-NEXT:    [[N:%.*]] = load i64*, i64** [[TMP1]], align 8
+; CHECK-NEXT:    [[NOT_TMP5:%.*]] = icmp ne i64* [[M]], [[N]]
+; CHECK-NEXT:    [[TMP71:%.*]] = icmp eq %C* [[ARG]], null
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP71]], [[NOT_TMP5]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[BB10:%.*]], label [[BB8:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    ret void
+; CHECK:       bb8:
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[C]], %C* [[ARG]], i64 0, i32 0
+; CHECK-NEXT:    tail call void @bar(%struct.S* [[TMP9]])
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb10:
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[M]], i64 9
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[TMP2]] to i64 (%C*)**
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64 (%C*)*, i64 (%C*)** [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call i64 [[TMP4]](%C* [[ARG]])
+; CHECK-NEXT:    br label [[BB]]
+;
 entry:
-  %1 = getelementptr inbounds %C, %C* %0, i64 0, i32 0, i32 0
-  %m = load i64*, i64** %1, align 8
-  %2 = getelementptr inbounds %C, %C* %0, i64 1, i32 0, i32 0
-  %n = load i64*, i64** %2, align 8
-  %3 = getelementptr inbounds i64, i64* %m, i64 9
-  %4 = bitcast i64* %3 to i64 (%C*)**
-  %5 = load i64 (%C*)*, i64 (%C*)** %4, align 8
-  %6 = icmp eq i64* %m, %n
-  %7 = select i1 %6, %C* %0, %C* null
-  %8 = icmp ne %C* %7, null
-  br i1 %8, label %10, label %12
-
-; <label>:9                                       ; preds = %10, %12
+  %tmp = getelementptr inbounds %C, %C* %arg, i64 0, i32 0, i32 0
+  %m = load i64*, i64** %tmp, align 8
+  %tmp1 = getelementptr inbounds %C, %C* %arg, i64 1, i32 0, i32 0
+  %n = load i64*, i64** %tmp1, align 8
+  %tmp2 = getelementptr inbounds i64, i64* %m, i64 9
+  %tmp3 = bitcast i64* %tmp2 to i64 (%C*)**
+  %tmp4 = load i64 (%C*)*, i64 (%C*)** %tmp3, align 8
+  %tmp5 = icmp eq i64* %m, %n
+  %tmp6 = select i1 %tmp5, %C* %arg, %C* null
+  %tmp7 = icmp ne %C* %tmp6, null
+  br i1 %tmp7, label %bb8, label %bb10
+
+bb:                                               ; preds = %bb10, %bb8
   ret void
 
-; <label>:10                                      ; preds = %entry
-  %11 = getelementptr inbounds %C, %C* %7, i64 0, i32 0
-  tail call void @bar(%struct.S* %11)
-  br label %9
+bb8:                                              ; preds = %entry
+  %tmp9 = getelementptr inbounds %C, %C* %tmp6, i64 0, i32 0
+  tail call void @bar(%struct.S* %tmp9)
+  br label %bb
 
-; <label>:12                                      ; preds = %entry
-  %13 = tail call i64 %5(%C* %0)
-  br label %9
-; CHECK-LABEL: @test3(
-; CHECK-NOT: select
-; CHECK: or
-; CHECK-NOT: select
+bb10:                                             ; preds = %entry
+  %tmp11 = tail call i64 %tmp4(%C* %arg)
+  br label %bb
 }
 
-define void @test4(%C*) {
+define void @test4(%C* %arg) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = getelementptr inbounds [[C:%.*]], %C* [[ARG:%.*]], i64 0, i32 0, i32 0
+; CHECK-NEXT:    [[M:%.*]] = load i64*, i64** [[TMP]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds [[C]], %C* [[ARG]], i64 1, i32 0, i32 0
+; CHECK-NEXT:    [[N:%.*]] = load i64*, i64** [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i64* [[M]], [[N]]
+; CHECK-NEXT:    [[TMP71:%.*]] = icmp eq %C* [[ARG]], null
+; CHECK-NEXT:    [[TMP7:%.*]] = or i1 [[TMP5]], [[TMP71]]
+; CHECK-NEXT:    br i1 [[TMP7]], label [[BB10:%.*]], label [[BB8:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    ret void
+; CHECK:       bb8:
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds [[C]], %C* [[ARG]], i64 0, i32 0
+; CHECK-NEXT:    tail call void @bar(%struct.S* [[TMP9]])
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb10:
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i64, i64* [[M]], i64 9
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i64* [[TMP2]] to i64 (%C*)**
+; CHECK-NEXT:    [[TMP4:%.*]] = load i64 (%C*)*, i64 (%C*)** [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call i64 [[TMP4]](%C* [[ARG]])
+; CHECK-NEXT:    br label [[BB]]
+;
 entry:
-  %1 = getelementptr inbounds %C, %C* %0, i64 0, i32 0, i32 0
-  %m = load i64*, i64** %1, align 8
-  %2 = getelementptr inbounds %C, %C* %0, i64 1, i32 0, i32 0
-  %n = load i64*, i64** %2, align 8
-  %3 = getelementptr inbounds i64, i64* %m, i64 9
-  %4 = bitcast i64* %3 to i64 (%C*)**
-  %5 = load i64 (%C*)*, i64 (%C*)** %4, align 8
-  %6 = icmp eq i64* %m, %n
-  %7 = select i1 %6, %C* null, %C* %0
-  %8 = icmp ne %C* %7, null
-  br i1 %8, label %10, label %12
-
-; <label>:9                                       ; preds = %10, %12
+  %tmp = getelementptr inbounds %C, %C* %arg, i64 0, i32 0, i32 0
+  %m = load i64*, i64** %tmp, align 8
+  %tmp1 = getelementptr inbounds %C, %C* %arg, i64 1, i32 0, i32 0
+  %n = load i64*, i64** %tmp1, align 8
+  %tmp2 = getelementptr inbounds i64, i64* %m, i64 9
+  %tmp3 = bitcast i64* %tmp2 to i64 (%C*)**
+  %tmp4 = load i64 (%C*)*, i64 (%C*)** %tmp3, align 8
+  %tmp5 = icmp eq i64* %m, %n
+  %tmp6 = select i1 %tmp5, %C* null, %C* %arg
+  %tmp7 = icmp ne %C* %tmp6, null
+  br i1 %tmp7, label %bb8, label %bb10
+
+bb:                                               ; preds = %bb10, %bb8
   ret void
 
-; <label>:10                                      ; preds = %entry
-  %11 = getelementptr inbounds %C, %C* %7, i64 0, i32 0
-  tail call void @bar(%struct.S* %11)
-  br label %9
+bb8:                                              ; preds = %entry
+  %tmp9 = getelementptr inbounds %C, %C* %tmp6, i64 0, i32 0
+  tail call void @bar(%struct.S* %tmp9)
+  br label %bb
 
-; <label>:12                                      ; preds = %entry
-  %13 = tail call i64 %5(%C* %0)
-  br label %9
-; CHECK-LABEL: @test4(
-; CHECK-NOT: select
-; CHECK: or
-; CHECK-NOT: select
+bb10:                                             ; preds = %entry
+  %tmp11 = tail call i64 %tmp4(%C* %arg)
+  br label %bb
 }
 
-define void @test5(%C*, i1) {
+define void @test5(%C* %arg, i1 %arg1) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP21:%.*]] = icmp eq %C* [[ARG:%.*]], null
+; CHECK-NEXT:    [[TMP2:%.*]] = or i1 [[TMP21]], [[ARG1:%.*]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[BB5:%.*]], label [[BB3:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    ret void
+; CHECK:       bb3:
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds [[C:%.*]], %C* [[ARG]], i64 0, i32 0
+; CHECK-NEXT:    tail call void @bar(%struct.S* [[TMP4]])
+; CHECK-NEXT:    br label [[BB:%.*]]
+; CHECK:       bb5:
+; CHECK-NEXT:    tail call void @foobar()
+; CHECK-NEXT:    br label [[BB]]
+;
 entry:
-  %2 = select i1 %1, %C* null, %C* %0
-  %3 = icmp ne %C* %2, null
-  br i1 %3, label %5, label %7
+  %tmp = select i1 %arg1, %C* null, %C* %arg
+  %tmp2 = icmp ne %C* %tmp, null
+  br i1 %tmp2, label %bb3, label %bb5
 
-; <label>:4                                       ; preds = %10, %12
+bb:                                               ; preds = %bb5, %bb3
   ret void
 
-; <label>:5                                      ; preds = %entry
-  %6 = getelementptr inbounds %C, %C* %2, i64 0, i32 0
-  tail call void @bar(%struct.S* %6)
-  br label %4
+bb3:                                              ; preds = %entry
+  %tmp4 = getelementptr inbounds %C, %C* %tmp, i64 0, i32 0
+  tail call void @bar(%struct.S* %tmp4)
+  br label %bb
 
-; <label>:7                                      ; preds = %entry
+bb5:                                              ; preds = %entry
   tail call void @foobar()
-  br label %4
-; CHECK-LABEL: @test5(
-; CHECK-NOT: select
-; CHECK: or
-; CHECK-NOT: select
+  br label %bb
+}
+
+; Negative test. Must not trigger the select-cmp-br combine because the result
+; of the select is used in both flows following the br (the special case where
+; the conditional branch has the same target for both flows).
+define i32 @test6(i32 %arg, i1 %arg1) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label [[BB:%.*]], label [[BB]]
+; CHECK:       bb:
+; CHECK-NEXT:    [[TMP:%.*]] = select i1 [[ARG1:%.*]], i32 [[ARG:%.*]], i32 0
+; CHECK-NEXT:    ret i32 [[TMP]]
+;
+entry:
+  %tmp = select i1 %arg1, i32 %arg, i32 0
+  %tmp2 = icmp eq i32 %tmp, 0
+  br i1 %tmp2, label %bb, label %bb
+
+bb:                                               ; preds = %entry, %entry
+  ret i32 %tmp
 }
diff --git a/test/Transforms/InstCombine/select.ll b/test/Transforms/InstCombine/select.ll
index fac50b2f039c79bafdde7077cbeaf5ed114e2d42..a1ca6999f865fffa7d4745c344e500ad009307eb 100644
--- a/test/Transforms/InstCombine/select.ll
+++ b/test/Transforms/InstCombine/select.ll
@@ -190,7 +190,7 @@ define <2 x i1> @test62vec(<2 x i1> %A, <2 x i1> %B) {
 define i1 @test63(i1 %A, i1 %B) {
 ; CHECK-LABEL: @test63(
 ; CHECK-NEXT:    [[NOT:%.*]] = xor i1 %A, true
-; CHECK-NEXT:    [[C:%.*]] = or i1 %B, [[NOT]]
+; CHECK-NEXT:    [[C:%.*]] = or i1 [[NOT]], %B
 ; CHECK-NEXT:    ret i1 [[C]]
 ;
   %not = xor i1 %A, true
@@ -201,7 +201,7 @@ define i1 @test63(i1 %A, i1 %B) {
 define <2 x i1> @test63vec(<2 x i1> %A, <2 x i1> %B) {
 ; CHECK-LABEL: @test63vec(
 ; CHECK-NEXT:    [[NOT:%.*]] = xor <2 x i1> %A, <i1 true, i1 true>
-; CHECK-NEXT:    [[C:%.*]] = or <2 x i1> %B, [[NOT]]
+; CHECK-NEXT:    [[C:%.*]] = or <2 x i1> [[NOT]], %B
 ; CHECK-NEXT:    ret <2 x i1> [[C]]
 ;
   %not = xor <2 x i1> %A, <i1 true, i1 true>
@@ -1264,11 +1264,10 @@ define i32 @PR23757(i32 %x) {
 define i32 @PR27137(i32 %a) {
 ; CHECK-LABEL: @PR27137(
 ; CHECK-NEXT:    [[NOT_A:%.*]] = xor i32 %a, -1
-; CHECK-NEXT:    [[C0:%.*]] = icmp slt i32 %a, 0
+; CHECK-NEXT:    [[C0:%.*]] = icmp sgt i32 [[NOT_A]], -1
 ; CHECK-NEXT:    [[S0:%.*]] = select i1 [[C0]], i32 [[NOT_A]], i32 -1
 ; CHECK-NEXT:    ret i32 [[S0]]
 ;
-
   %not_a = xor i32 %a, -1
   %c0 = icmp slt i32 %a, 0
   %s0 = select i1 %c0, i32 %not_a, i32 -1
diff --git a/test/Transforms/InstCombine/select_meta.ll b/test/Transforms/InstCombine/select_meta.ll
index 82a85e5836dca5a7fe647f8036874e450770f073..7d5771a0a81c789b7640f2602347961d45b00853 100644
--- a/test/Transforms/InstCombine/select_meta.ll
+++ b/test/Transforms/InstCombine/select_meta.ll
@@ -193,12 +193,11 @@ define i32 @test74(i32 %x) {
   ret i32 %retval
 }
 
-; FIXME:
 ; The compare should change, but the metadata remains the same because the select operands are not swapped.
 define i32 @smin1(i32 %x) {
 ; CHECK-LABEL: @smin1(
 ; CHECK-NEXT:    [[NOT_X:%.*]] = xor i32 %x, -1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 %x, 0
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[NOT_X]], -1
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 [[NOT_X]], i32 -1, !prof ![[MD1]]
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ;
@@ -208,13 +207,12 @@ define i32 @smin1(i32 %x) {
   ret i32 %sel
 }
 
-; FIXME:
 ; The compare should change, and the metadata is swapped because the select operands are swapped.
 define i32 @smin2(i32 %x) {
 ; CHECK-LABEL: @smin2(
 ; CHECK-NEXT:    [[NOT_X:%.*]] = xor i32 %x, -1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 %x, 0
-; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 -1, i32 [[NOT_X]], !prof ![[MD1]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 [[NOT_X]], -1
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 [[NOT_X]], i32 -1, !prof ![[MD3]]
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ;
   %not_x = xor i32 %x, -1
@@ -223,12 +221,11 @@ define i32 @smin2(i32 %x) {
   ret i32 %sel
 }
 
-; FIXME:
 ; The compare should change, but the metadata remains the same because the select operands are not swapped.
 define i32 @smax1(i32 %x) {
 ; CHECK-LABEL: @smax1(
 ; CHECK-NEXT:    [[NOT_X:%.*]] = xor i32 %x, -1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 %x, 0
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[NOT_X]], -1
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 [[NOT_X]], i32 -1, !prof ![[MD1]]
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ;
@@ -238,13 +235,12 @@ define i32 @smax1(i32 %x) {
   ret i32 %sel
 }
 
-; FIXME:
 ; The compare should change, and the metadata is swapped because the select operands are swapped.
 define i32 @smax2(i32 %x) {
 ; CHECK-LABEL: @smax2(
 ; CHECK-NEXT:    [[NOT_X:%.*]] = xor i32 %x, -1
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 %x, 0
-; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 -1, i32 [[NOT_X]], !prof ![[MD1]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[NOT_X]], -1
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 [[NOT_X]], i32 -1, !prof ![[MD3]]
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ;
   %not_x = xor i32 %x, -1
@@ -253,11 +249,10 @@ define i32 @smax2(i32 %x) {
   ret i32 %sel
 }
 
-; FIXME:
 ; The compare should change, but the metadata remains the same because the select operands are not swapped.
 define i32 @umin1(i32 %x) {
 ; CHECK-LABEL: @umin1(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 %x, -1
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 %x, -2147483648
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 %x, i32 -2147483648, !prof ![[MD1]]
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ;
@@ -266,12 +261,11 @@ define i32 @umin1(i32 %x) {
   ret i32 %sel
 }
 
-; FIXME:
 ; The compare should change, and the metadata is swapped because the select operands are swapped.
 define i32 @umin2(i32 %x) {
 ; CHECK-LABEL: @umin2(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 %x, 0
-; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 2147483647, i32 %x, !prof ![[MD1]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ult i32 %x, 2147483647
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 %x, i32 2147483647, !prof ![[MD3]]
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ;
   %cmp = icmp slt i32 %x, 0
@@ -279,11 +273,10 @@ define i32 @umin2(i32 %x) {
   ret i32 %sel
 }
 
-; FIXME:
 ; The compare should change, but the metadata remains the same because the select operands are not swapped.
 define i32 @umax1(i32 %x) {
 ; CHECK-LABEL: @umax1(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i32 %x, 0
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 %x, 2147483647
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 %x, i32 2147483647, !prof ![[MD1]]
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ;
@@ -292,12 +285,11 @@ define i32 @umax1(i32 %x) {
   ret i32 %sel
 }
 
-; FIXME:
 ; The compare should change, and the metadata is swapped because the select operands are swapped.
 define i32 @umax2(i32 %x) {
 ; CHECK-LABEL: @umax2(
-; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 %x, -1
-; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 -2147483648, i32 %x, !prof ![[MD1]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i32 %x, -2147483648
+; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[CMP]], i32 %x, i32 -2147483648, !prof ![[MD3]]
 ; CHECK-NEXT:    ret i32 [[SEL]]
 ;
   %cmp = icmp sgt i32 %x, -1
diff --git a/test/Transforms/InstCombine/shufflevec-bitcast.ll b/test/Transforms/InstCombine/shufflevec-bitcast.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0f0365a07fb40b69e4594daf92e3da2e906eda8b
--- /dev/null
+++ b/test/Transforms/InstCombine/shufflevec-bitcast.ll
@@ -0,0 +1,16 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define void @test(<16 x i8> %w, i32* %o1, float* %o2) {
+
+; CHECK:       %v.bc = bitcast <16 x i8> %w to <4 x i32>
+; CHECK-NEXT:  %v.extract = extractelement <4 x i32> %v.bc, i32 3
+; CHECK-NEXT:  %v.bc{{[0-9]*}} = bitcast <16 x i8> %w to <4 x float>
+; CHECK-NEXT:  %v.extract{{[0-9]*}} = extractelement <4 x float> %v.bc{{[0-9]*}}, i32 3
+
+  %v = shufflevector <16 x i8> %w, <16 x i8> undef, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+  %f = bitcast <4 x i8> %v to float
+  %i = bitcast <4 x i8> %v to i32
+  store i32 %i, i32* %o1, align 4
+  store float %f, float* %o2, align 4
+  ret void
+}
diff --git a/test/Transforms/InstCombine/sitofp.ll b/test/Transforms/InstCombine/sitofp.ll
index 8209778388364f8fb8b8359749ead15ce8306024..149154723b9528139e1ca30d1ecfd8773379cdad 100644
--- a/test/Transforms/InstCombine/sitofp.ll
+++ b/test/Transforms/InstCombine/sitofp.ll
@@ -1,41 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
-; CHECK-LABEL: test1
-; CHECK: ret i1 true
 define i1 @test1(i8 %A) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    ret i1 true
+;
   %B = sitofp i8 %A to double
   %C = fcmp ult double %B, 128.0
   ret i1 %C
 }
 
-; CHECK-LABEL: test2
-; CHECK: ret i1 true
 define i1 @test2(i8 %A) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    ret i1 true
+;
   %B = sitofp i8 %A to double
   %C = fcmp ugt double %B, -128.1
   ret i1 %C
 }
 
-; CHECK-LABEL: test3
-; CHECK: ret i1 true
 define i1 @test3(i8 %A) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    ret i1 true
+;
   %B = sitofp i8 %A to double
   %C = fcmp ule double %B, 127.0
   ret i1 %C
 }
 
-; CHECK-LABEL: test4
-; CHECK: icmp ne i8 %A, 127
-; CHECK-NEXT: ret i1
 define i1 @test4(i8 %A) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    [[C:%.*]] = icmp ne i8 [[A:%.*]], 127
+; CHECK-NEXT:    ret i1 [[C]]
+;
   %B = sitofp i8 %A to double
   %C = fcmp ult double %B, 127.0
   ret i1 %C
 }
 
-; CHECK-LABEL: test5
-; CHECK: ret i32
 define i32 @test5(i32 %A) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    ret i32 [[A:%.*]]
+;
   %B = sitofp i32 %A to double
   %C = fptosi double %B to i32
   %D = uitofp i32 %C to double
@@ -43,10 +49,11 @@ define i32 @test5(i32 %A) {
   ret i32 %E
 }
 
-; CHECK-LABEL: test6
-; CHECK: and i32 %A, 39
-; CHECK-NEXT: ret i32
 define i32 @test6(i32 %A) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    [[ADDCONV:%.*]] = and i32 [[A:%.*]], 39
+; CHECK-NEXT:    ret i32 [[ADDCONV]]
+;
   %B = and i32 %A, 7
   %C = and i32 %A, 32
   %D = sitofp i32 %B to double
@@ -56,35 +63,39 @@ define i32 @test6(i32 %A) {
   ret i32 %G
 }
 
-; CHECK-LABEL: test7
-; CHECK: ret i32
-define i32 @test7(i32 %A) nounwind {
+define i32 @test7(i32 %A) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    ret i32 [[A:%.*]]
+;
   %B = sitofp i32 %A to double
   %C = fptoui double %B to i32
   ret i32 %C
 }
 
-; CHECK-LABEL: test8
-; CHECK: ret i32
-define i32 @test8(i32 %A) nounwind {
+define i32 @test8(i32 %A) {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:    ret i32 [[A:%.*]]
+;
   %B = uitofp i32 %A to double
   %C = fptosi double %B to i32
   ret i32 %C
 }
 
-; CHECK-LABEL: test9
-; CHECK: zext i8
-; CHECK-NEXT: ret i32
-define i32 @test9(i8 %A) nounwind {
+define i32 @test9(i8 %A) {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:    [[C:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %B = sitofp i8 %A to float
   %C = fptoui float %B to i32
   ret i32 %C
 }
 
-; CHECK-LABEL: test10
-; CHECK: sext i8
-; CHECK-NEXT: ret i32
-define i32 @test10(i8 %A) nounwind {
+define i32 @test10(i8 %A) {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:    [[C:%.*]] = sext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %B = sitofp i8 %A to float
   %C = fptosi float %B to i32
   ret i32 %C
@@ -92,10 +103,12 @@ define i32 @test10(i8 %A) nounwind {
 
 ; If the input value is outside of the range of the output cast, it's
 ; undefined behavior, so we can assume it fits.
-; CHECK-LABEL: test11
-; CHECK: trunc
-; CHECK-NEXT: ret i8
-define i8 @test11(i32 %A) nounwind {
+
+define i8 @test11(i32 %A) {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:    [[C:%.*]] = trunc i32 [[A:%.*]] to i8
+; CHECK-NEXT:    ret i8 [[C]]
+;
   %B = sitofp i32 %A to float
   %C = fptosi float %B to i8
   ret i8 %C
@@ -103,82 +116,103 @@ define i8 @test11(i32 %A) nounwind {
 
 ; If the input value is negative, it'll be outside the range of the
 ; output cast, and thus undefined behavior.
-; CHECK-LABEL: test12
-; CHECK: zext i8
-; CHECK-NEXT: ret i32
-define i32 @test12(i8 %A) nounwind {
+
+define i32 @test12(i8 %A) {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT:    [[C:%.*]] = zext i8 [[A:%.*]] to i32
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %B = sitofp i8 %A to float
   %C = fptoui float %B to i32
   ret i32 %C
 }
 
 ; This can't fold because the 25-bit input doesn't fit in the mantissa.
-; CHECK-LABEL: test13
-; CHECK: uitofp
-; CHECK-NEXT: fptoui
-define i32 @test13(i25 %A) nounwind {
+
+define i32 @test13(i25 %A) {
+; CHECK-LABEL: @test13(
+; CHECK-NEXT:    [[B:%.*]] = uitofp i25 [[A:%.*]] to float
+; CHECK-NEXT:    [[C:%.*]] = fptoui float [[B]] to i32
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %B = uitofp i25 %A to float
   %C = fptoui float %B to i32
   ret i32 %C
 }
 
 ; But this one can.
-; CHECK-LABEL: test14
-; CHECK: zext i24
-; CHECK-NEXT: ret i32
-define i32 @test14(i24 %A) nounwind {
+
+define i32 @test14(i24 %A) {
+; CHECK-LABEL: @test14(
+; CHECK-NEXT:    [[C:%.*]] = zext i24 [[A:%.*]] to i32
+; CHECK-NEXT:    ret i32 [[C]]
+;
   %B = uitofp i24 %A to float
   %C = fptoui float %B to i32
   ret i32 %C
 }
 
 ; And this one can too.
-; CHECK-LABEL: test15
-; CHECK: trunc i32
-; CHECK-NEXT: ret i24
-define i24 @test15(i32 %A) nounwind {
+
+define i24 @test15(i32 %A) {
+; CHECK-LABEL: @test15(
+; CHECK-NEXT:    [[C:%.*]] = trunc i32 [[A:%.*]] to i24
+; CHECK-NEXT:    ret i24 [[C]]
+;
   %B = uitofp i32 %A to float
   %C = fptoui float %B to i24
   ret i24 %C
 }
 
-; This can fold because the 25-bit input is signed and we disard the sign bit.
-; CHECK-LABEL: test16
-; CHECK: zext
-define i32 @test16(i25 %A) nounwind {
- %B = sitofp i25 %A to float
- %C = fptoui float %B to i32
- ret i32 %C
+; This can fold because the 25-bit input is signed and we discard the sign bit.
+
+define i32 @test16(i25 %A) {
+; CHECK-LABEL: @test16(
+; CHECK-NEXT:    [[C:%.*]] = zext i25 [[A:%.*]] to i32
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %B = sitofp i25 %A to float
+  %C = fptoui float %B to i32
+  ret i32 %C
 }
 
 ; This can't fold because the 26-bit input won't fit the mantissa
-; even after disarding the signed bit.
-; CHECK-LABEL: test17
-; CHECK: sitofp
-; CHECK-NEXT: fptoui
-define i32 @test17(i26 %A) nounwind {
- %B = sitofp i26 %A to float
- %C = fptoui float %B to i32
- ret i32 %C
+; even after discarding the sign bit.
+
+define i32 @test17(i26 %A) {
+; CHECK-LABEL: @test17(
+; CHECK-NEXT:    [[B:%.*]] = sitofp i26 [[A:%.*]] to float
+; CHECK-NEXT:    [[C:%.*]] = fptoui float [[B]] to i32
+; CHECK-NEXT:    ret i32 [[C]]
+;
+  %B = sitofp i26 %A to float
+  %C = fptoui float %B to i32
+  ret i32 %C
 }
 
-; This can fold because the 54-bit output is signed and we disard the sign bit.
-; CHECK-LABEL: test18
-; CHECK: trunc
-define i54 @test18(i64 %A) nounwind {
- %B = sitofp i64 %A to double
- %C = fptosi double %B to i54
- ret i54 %C
+; This can fold because the 54-bit output is signed and we discard the sign bit.
+
+define i54 @test18(i64 %A) {
+; CHECK-LABEL: @test18(
+; CHECK-NEXT:    [[C:%.*]] = trunc i64 [[A:%.*]] to i54
+; CHECK-NEXT:    ret i54 [[C]]
+;
+  %B = sitofp i64 %A to double
+  %C = fptosi double %B to i54
+  ret i54 %C
 }
 
 ; This can't fold because the 55-bit output won't fit the mantissa
-; even after disarding the sign bit.
-; CHECK-LABEL: test19
-; CHECK: sitofp
-; CHECK-NEXT: fptosi
-define i55 @test19(i64 %A) nounwind {
- %B = sitofp i64 %A to double
- %C = fptosi double %B to i55
- ret i55 %C
+; even after discarding the sign bit.
+
+define i55 @test19(i64 %A) {
+; CHECK-LABEL: @test19(
+; CHECK-NEXT:    [[B:%.*]] = sitofp i64 [[A:%.*]] to double
+; CHECK-NEXT:    [[C:%.*]] = fptosi double [[B]] to i55
+; CHECK-NEXT:    ret i55 [[C]]
+;
+  %B = sitofp i64 %A to double
+  %C = fptosi double %B to i55
+  ret i55 %C
 }
 
diff --git a/test/Transforms/InstCombine/srem.ll b/test/Transforms/InstCombine/srem.ll
deleted file mode 100644
index beefe4fb8d3fa712a0e895a83c9382a53bef0131..0000000000000000000000000000000000000000
--- a/test/Transforms/InstCombine/srem.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; RUN: opt < %s -instcombine -S | grep srem
-
-define i64 @foo(i64 %x1, i64 %y2) {
-	%r = sdiv i64 %x1, %y2
-	%r7 = mul i64 %r, %y2
-	%r8 = sub i64 %x1, %r7
-	ret i64 %r8
-}
diff --git a/test/Transforms/InstCombine/stpcpy_chk-1.ll b/test/Transforms/InstCombine/stpcpy_chk-1.ll
index 2fcc34b052278abf0fb1c88a4fdc416bd035d860..45e6879c8d26f671386d29d69fc0ab1b5c350d98 100644
--- a/test/Transforms/InstCombine/stpcpy_chk-1.ll
+++ b/test/Transforms/InstCombine/stpcpy_chk-1.ll
@@ -64,10 +64,10 @@ define i8* @test_simplify5() {
   %dst = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 0
   %src = getelementptr inbounds [12 x i8], [12 x i8]* @.str, i32 0, i32 0
 
-; CHECK-NEXT: %len = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false)
+; CHECK-NEXT: %len = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false, i1 false)
 ; CHECK-NEXT: %1 = call i8* @__memcpy_chk(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str, i32 0, i32 0), i32 12, i32 %len)
 ; CHECK-NEXT: ret i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 11)
-  %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false)
+  %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false, i1 false)
   %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %src, i32 %len)
   ret i8* %ret
 }
@@ -81,7 +81,7 @@ define i8* @test_simplify6() {
 ; CHECK-NEXT: %strlen = call i32 @strlen(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0))
 ; CHECK-NEXT: %1 = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 %strlen
 ; CHECK-NEXT: ret i8* %1
-  %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false)
+  %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false, i1 false)
   %ret = call i8* @__stpcpy_chk(i8* %dst, i8* %dst, i32 %len)
   ret i8* %ret
 }
@@ -100,4 +100,4 @@ define i8* @test_no_simplify1() {
 }
 
 declare i8* @__stpcpy_chk(i8*, i8*, i32) nounwind
-declare i32 @llvm.objectsize.i32.p0i8(i8*, i1) nounwind readonly
+declare i32 @llvm.objectsize.i32.p0i8(i8*, i1, i1) nounwind readonly
diff --git a/test/Transforms/InstCombine/strcpy_chk-1.ll b/test/Transforms/InstCombine/strcpy_chk-1.ll
index 7a21a49c993ce7ebe47b8feac50f5b9d307c941e..824776c6ca18bbf5b698d235550aa5d7dfb6b18e 100644
--- a/test/Transforms/InstCombine/strcpy_chk-1.ll
+++ b/test/Transforms/InstCombine/strcpy_chk-1.ll
@@ -64,10 +64,10 @@ define i8* @test_simplify5() {
   %dst = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 0
   %src = getelementptr inbounds [12 x i8], [12 x i8]* @.str, i32 0, i32 0
 
-; CHECK-NEXT: %len = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false)
+; CHECK-NEXT: %len = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false, i1 false)
 ; CHECK-NEXT: %1 = call i8* @__memcpy_chk(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([12 x i8], [12 x i8]* @.str, i32 0, i32 0), i32 12, i32 %len)
 ; CHECK-NEXT: ret i8* %1
-  %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false)
+  %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false, i1 false)
   %ret = call i8* @__strcpy_chk(i8* %dst, i8* %src, i32 %len)
   ret i8* %ret
 }
@@ -78,10 +78,10 @@ define i8* @test_simplify6() {
 ; CHECK-LABEL: @test_simplify6(
   %dst = getelementptr inbounds [60 x i8], [60 x i8]* @a, i32 0, i32 0
 
-; CHECK-NEXT: %len = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false)
+; CHECK-NEXT: %len = call i32 @llvm.objectsize.i32.p0i8(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i1 false, i1 false)
 ; CHECK-NEXT: %ret = call i8* @__strcpy_chk(i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i8* getelementptr inbounds ([60 x i8], [60 x i8]* @a, i32 0, i32 0), i32 %len)
 ; CHECK-NEXT: ret i8* %ret
-  %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false)
+  %len = call i32 @llvm.objectsize.i32.p0i8(i8* %dst, i1 false, i1 false)
   %ret = call i8* @__strcpy_chk(i8* %dst, i8* %dst, i32 %len)
   ret i8* %ret
 }
@@ -100,4 +100,4 @@ define i8* @test_no_simplify1() {
 }
 
 declare i8* @__strcpy_chk(i8*, i8*, i32) nounwind
-declare i32 @llvm.objectsize.i32.p0i8(i8*, i1) nounwind readonly
+declare i32 @llvm.objectsize.i32.p0i8(i8*, i1, i1) nounwind readonly
diff --git a/test/Transforms/InstCombine/sub-xor.ll b/test/Transforms/InstCombine/sub-xor.ll
index 9a0814c2c92f8c3528f9a46be18172f9e618efaa..812305d8e4896870c34744558e2a4f977fe9fadf 100644
--- a/test/Transforms/InstCombine/sub-xor.ll
+++ b/test/Transforms/InstCombine/sub-xor.ll
@@ -48,13 +48,3 @@ define i32 @test3(i32 %x) {
   ret i32 %add
 }
 
-define i32 @test4(i32 %x) {
-; CHECK-LABEL: @test4(
-; CHECK-NEXT:    [[ADD:%.*]] = add i32 %x, -2147483606
-; CHECK-NEXT:    ret i32 [[ADD]]
-;
-  %sub = xor i32 %x, 2147483648
-  %add = add i32 %sub, 42
-  ret i32 %add
-}
-
diff --git a/test/Transforms/InstCombine/sub.ll b/test/Transforms/InstCombine/sub.ll
index 32541f1f893eb51e7170e13a6bce0306d59bd28e..4c7047636e03fd252c84985090a1ecc40560c47b 100644
--- a/test/Transforms/InstCombine/sub.ll
+++ b/test/Transforms/InstCombine/sub.ll
@@ -15,7 +15,7 @@ define i32 @test1(i32 %A) {
 
 define i32 @test2(i32 %A) {
 ; CHECK-LABEL: @test2(
-; CHECK-NEXT:    ret i32 %A
+; CHECK-NEXT:    ret i32 [[A:%.*]]
 ;
   %B = sub i32 %A, 0
   ret i32 %B
@@ -23,7 +23,7 @@ define i32 @test2(i32 %A) {
 
 define i32 @test3(i32 %A) {
 ; CHECK-LABEL: @test3(
-; CHECK-NEXT:    ret i32 %A
+; CHECK-NEXT:    ret i32 [[A:%.*]]
 ;
   %B = sub i32 0, %A
   %C = sub i32 0, %B
@@ -32,7 +32,7 @@ define i32 @test3(i32 %A) {
 
 define i32 @test4(i32 %A, i32 %x) {
 ; CHECK-LABEL: @test4(
-; CHECK-NEXT:    [[C:%.*]] = add i32 %x, %A
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[X:%.*]], [[A:%.*]]
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
   %B = sub i32 0, %A
@@ -42,8 +42,8 @@ define i32 @test4(i32 %A, i32 %x) {
 
 define i32 @test5(i32 %A, i32 %B, i32 %C) {
 ; CHECK-LABEL: @test5(
-; CHECK-NEXT:    [[D1:%.*]] = sub i32 %C, %B
-; CHECK-NEXT:    [[E:%.*]] = add i32 [[D1]], %A
+; CHECK-NEXT:    [[D1:%.*]] = sub i32 [[C:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[E:%.*]] = add i32 [[D1]], [[A:%.*]]
 ; CHECK-NEXT:    ret i32 [[E]]
 ;
   %D = sub i32 %B, %C
@@ -53,8 +53,8 @@ define i32 @test5(i32 %A, i32 %B, i32 %C) {
 
 define i32 @test6(i32 %A, i32 %B) {
 ; CHECK-LABEL: @test6(
-; CHECK-NEXT:    [[B_NOT:%.*]] = xor i32 %B, -1
-; CHECK-NEXT:    [[D:%.*]] = and i32 %A, [[B_NOT]]
+; CHECK-NEXT:    [[B_NOT:%.*]] = xor i32 [[B:%.*]], -1
+; CHECK-NEXT:    [[D:%.*]] = and i32 [[B_NOT]], [[A:%.*]]
 ; CHECK-NEXT:    ret i32 [[D]]
 ;
   %C = and i32 %A, %B
@@ -62,9 +62,20 @@ define i32 @test6(i32 %A, i32 %B) {
   ret i32 %D
 }
 
+define i32 @test6commuted(i32 %A, i32 %B) {
+; CHECK-LABEL: @test6commuted(
+; CHECK-NEXT:    [[B_NOT:%.*]] = xor i32 [[B:%.*]], -1
+; CHECK-NEXT:    [[D:%.*]] = and i32 [[B_NOT]], [[A:%.*]]
+; CHECK-NEXT:    ret i32 [[D]]
+;
+  %C = and i32 %B, %A
+  %D = sub i32 %A, %C
+  ret i32 %D
+}
+
 define i32 @test7(i32 %A) {
 ; CHECK-LABEL: @test7(
-; CHECK-NEXT:    [[B:%.*]] = xor i32 %A, -1
+; CHECK-NEXT:    [[B:%.*]] = xor i32 [[A:%.*]], -1
 ; CHECK-NEXT:    ret i32 [[B]]
 ;
   %B = sub i32 -1, %A
@@ -73,7 +84,7 @@ define i32 @test7(i32 %A) {
 
 define i32 @test8(i32 %A) {
 ; CHECK-LABEL: @test8(
-; CHECK-NEXT:    [[C:%.*]] = shl i32 %A, 3
+; CHECK-NEXT:    [[C:%.*]] = shl i32 [[A:%.*]], 3
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
   %B = mul i32 9, %A
@@ -83,7 +94,7 @@ define i32 @test8(i32 %A) {
 
 define i32 @test9(i32 %A) {
 ; CHECK-LABEL: @test9(
-; CHECK-NEXT:    [[C:%.*]] = mul i32 %A, -2
+; CHECK-NEXT:    [[C:%.*]] = mul i32 [[A:%.*]], -2
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
   %B = mul i32 3, %A
@@ -93,7 +104,7 @@ define i32 @test9(i32 %A) {
 
 define i32 @test10(i32 %A, i32 %B) {
 ; CHECK-LABEL: @test10(
-; CHECK-NEXT:    [[E:%.*]] = mul i32 %A, %B
+; CHECK-NEXT:    [[E:%.*]] = mul i32 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret i32 [[E]]
 ;
   %C = sub i32 0, %A
@@ -104,7 +115,7 @@ define i32 @test10(i32 %A, i32 %B) {
 
 define i32 @test10a(i32 %A) {
 ; CHECK-LABEL: @test10a(
-; CHECK-NEXT:    [[E:%.*]] = mul i32 %A, -7
+; CHECK-NEXT:    [[E:%.*]] = mul i32 [[A:%.*]], -7
 ; CHECK-NEXT:    ret i32 [[E]]
 ;
   %C = sub i32 0, %A
@@ -114,7 +125,7 @@ define i32 @test10a(i32 %A) {
 
 define i1 @test11(i8 %A, i8 %B) {
 ; CHECK-LABEL: @test11(
-; CHECK-NEXT:    [[D:%.*]] = icmp ne i8 %A, %B
+; CHECK-NEXT:    [[D:%.*]] = icmp ne i8 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret i1 [[D]]
 ;
   %C = sub i8 %A, %B
@@ -124,7 +135,7 @@ define i1 @test11(i8 %A, i8 %B) {
 
 define <2 x i1> @test11vec(<2 x i8> %A, <2 x i8> %B) {
 ; CHECK-LABEL: @test11vec(
-; CHECK-NEXT:    [[D:%.*]] = icmp ne <2 x i8> %A, %B
+; CHECK-NEXT:    [[D:%.*]] = icmp ne <2 x i8> [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    ret <2 x i1> [[D]]
 ;
   %C = sub <2 x i8> %A, %B
@@ -134,7 +145,7 @@ define <2 x i1> @test11vec(<2 x i8> %A, <2 x i8> %B) {
 
 define i32 @test12(i32 %A) {
 ; CHECK-LABEL: @test12(
-; CHECK-NEXT:    [[C:%.*]] = lshr i32 %A, 31
+; CHECK-NEXT:    [[C:%.*]] = lshr i32 [[A:%.*]], 31
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
   %B = ashr i32 %A, 31
@@ -144,7 +155,7 @@ define i32 @test12(i32 %A) {
 
 define i32 @test13(i32 %A) {
 ; CHECK-LABEL: @test13(
-; CHECK-NEXT:    [[C:%.*]] = ashr i32 %A, 31
+; CHECK-NEXT:    [[C:%.*]] = ashr i32 [[A:%.*]], 31
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
   %B = lshr i32 %A, 31
@@ -154,7 +165,7 @@ define i32 @test13(i32 %A) {
 
 define <2 x i32> @test12vec(<2 x i32> %A) {
 ; CHECK-LABEL: @test12vec(
-; CHECK-NEXT:    [[C:%.*]] = lshr <2 x i32> %A, <i32 31, i32 31>
+; CHECK-NEXT:    [[C:%.*]] = lshr <2 x i32> [[A:%.*]], <i32 31, i32 31>
 ; CHECK-NEXT:    ret <2 x i32> [[C]]
 ;
   %B = ashr <2 x i32> %A, <i32 31, i32 31>
@@ -164,7 +175,7 @@ define <2 x i32> @test12vec(<2 x i32> %A) {
 
 define <2 x i32> @test13vec(<2 x i32> %A) {
 ; CHECK-LABEL: @test13vec(
-; CHECK-NEXT:    [[C:%.*]] = ashr <2 x i32> %A, <i32 31, i32 31>
+; CHECK-NEXT:    [[C:%.*]] = ashr <2 x i32> [[A:%.*]], <i32 31, i32 31>
 ; CHECK-NEXT:    ret <2 x i32> [[C]]
 ;
   %B = lshr <2 x i32> %A, <i32 31, i32 31>
@@ -174,8 +185,8 @@ define <2 x i32> @test13vec(<2 x i32> %A) {
 
 define i32 @test15(i32 %A, i32 %B) {
 ; CHECK-LABEL: @test15(
-; CHECK-NEXT:    [[C:%.*]] = sub i32 0, %A
-; CHECK-NEXT:    [[D:%.*]] = srem i32 %B, [[C]]
+; CHECK-NEXT:    [[C:%.*]] = sub i32 0, [[A:%.*]]
+; CHECK-NEXT:    [[D:%.*]] = srem i32 [[B:%.*]], [[C]]
 ; CHECK-NEXT:    ret i32 [[D]]
 ;
   %C = sub i32 0, %A
@@ -185,7 +196,7 @@ define i32 @test15(i32 %A, i32 %B) {
 
 define i32 @test16(i32 %A) {
 ; CHECK-LABEL: @test16(
-; CHECK-NEXT:    [[Y:%.*]] = sdiv i32 %A, -1123
+; CHECK-NEXT:    [[Y:%.*]] = sdiv i32 [[A:%.*]], -1123
 ; CHECK-NEXT:    ret i32 [[Y]]
 ;
   %X = sdiv i32 %A, 1123
@@ -197,7 +208,7 @@ define i32 @test16(i32 %A) {
 ; PR3142
 define i32 @test17(i32 %A) {
 ; CHECK-LABEL: @test17(
-; CHECK-NEXT:    [[B:%.*]] = sub i32 0, %A
+; CHECK-NEXT:    [[B:%.*]] = sub i32 0, [[A:%.*]]
 ; CHECK-NEXT:    [[C:%.*]] = sdiv i32 [[B]], 1234
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
@@ -218,7 +229,7 @@ define i64 @test18(i64 %Y) {
 
 define i32 @test19(i32 %X, i32 %Y) {
 ; CHECK-LABEL: @test19(
-; CHECK-NEXT:    ret i32 %X
+; CHECK-NEXT:    ret i32 [[X:%.*]]
 ;
   %Z = sub i32 %X, %Y
   %Q = add i32 %Z, %Y
@@ -227,7 +238,7 @@ define i32 @test19(i32 %X, i32 %Y) {
 
 define i1 @test20(i32 %g, i32 %h) {
 ; CHECK-LABEL: @test20(
-; CHECK-NEXT:    [[TMP_4:%.*]] = icmp ne i32 %h, 0
+; CHECK-NEXT:    [[TMP_4:%.*]] = icmp ne i32 [[H:%.*]], 0
 ; CHECK-NEXT:    ret i1 [[TMP_4]]
 ;
   %tmp.2 = sub i32 %g, %h
@@ -237,7 +248,7 @@ define i1 @test20(i32 %g, i32 %h) {
 
 define i1 @test21(i32 %g, i32 %h) {
 ; CHECK-LABEL: @test21(
-; CHECK-NEXT:    [[TMP_4:%.*]] = icmp ne i32 %h, 0
+; CHECK-NEXT:    [[TMP_4:%.*]] = icmp ne i32 [[H:%.*]], 0
 ; CHECK-NEXT:    ret i1 [[TMP_4]]
 ;
   %tmp.2 = sub i32 %g, %h
@@ -248,7 +259,7 @@ define i1 @test21(i32 %g, i32 %h) {
 ; PR2298
 define zeroext i1 @test22(i32 %a, i32 %b)  nounwind  {
 ; CHECK-LABEL: @test22(
-; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 %b, %a
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[B:%.*]], [[A:%.*]]
 ; CHECK-NEXT:    ret i1 [[TMP5]]
 ;
   %tmp2 = sub i32 0, %a
@@ -260,7 +271,7 @@ define zeroext i1 @test22(i32 %a, i32 %b)  nounwind  {
 ; rdar://7362831
 define i32 @test23(i8* %P, i64 %A){
 ; CHECK-LABEL: @test23(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 %A to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[A:%.*]] to i32
 ; CHECK-NEXT:    ret i32 [[TMP1]]
 ;
   %B = getelementptr inbounds i8, i8* %P, i64 %A
@@ -274,7 +285,7 @@ define i32 @test23(i8* %P, i64 %A){
 
 define i8 @test23_as1(i8 addrspace(1)* %P, i16 %A) {
 ; CHECK-LABEL: @test23_as1(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i16 %A to i8
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i16 [[A:%.*]] to i8
 ; CHECK-NEXT:    ret i8 [[TMP1]]
 ;
   %B = getelementptr inbounds i8, i8 addrspace(1)* %P, i16 %A
@@ -288,7 +299,7 @@ define i8 @test23_as1(i8 addrspace(1)* %P, i16 %A) {
 
 define i64 @test24(i8* %P, i64 %A){
 ; CHECK-LABEL: @test24(
-; CHECK-NEXT:    ret i64 %A
+; CHECK-NEXT:    ret i64 [[A:%.*]]
 ;
   %B = getelementptr inbounds i8, i8* %P, i64 %A
   %C = ptrtoint i8* %B to i64
@@ -299,7 +310,7 @@ define i64 @test24(i8* %P, i64 %A){
 
 define i16 @test24_as1(i8 addrspace(1)* %P, i16 %A) {
 ; CHECK-LABEL: @test24_as1(
-; CHECK-NEXT:    ret i16 %A
+; CHECK-NEXT:    ret i16 [[A:%.*]]
 ;
   %B = getelementptr inbounds i8, i8 addrspace(1)* %P, i16 %A
   %C = ptrtoint i8 addrspace(1)* %B to i16
@@ -310,7 +321,7 @@ define i16 @test24_as1(i8 addrspace(1)* %P, i16 %A) {
 
 define i64 @test24a(i8* %P, i64 %A){
 ; CHECK-LABEL: @test24a(
-; CHECK-NEXT:    [[DIFF_NEG:%.*]] = sub i64 0, %A
+; CHECK-NEXT:    [[DIFF_NEG:%.*]] = sub i64 0, [[A:%.*]]
 ; CHECK-NEXT:    ret i64 [[DIFF_NEG]]
 ;
   %B = getelementptr inbounds i8, i8* %P, i64 %A
@@ -322,7 +333,7 @@ define i64 @test24a(i8* %P, i64 %A){
 
 define i16 @test24a_as1(i8 addrspace(1)* %P, i16 %A) {
 ; CHECK-LABEL: @test24a_as1(
-; CHECK-NEXT:    [[DIFF_NEG:%.*]] = sub i16 0, %A
+; CHECK-NEXT:    [[DIFF_NEG:%.*]] = sub i16 0, [[A:%.*]]
 ; CHECK-NEXT:    ret i16 [[DIFF_NEG]]
 ;
   %B = getelementptr inbounds i8, i8 addrspace(1)* %P, i16 %A
@@ -337,7 +348,7 @@ define i16 @test24a_as1(i8 addrspace(1)* %P, i16 %A) {
 
 define i64 @test24b(i8* %P, i64 %A){
 ; CHECK-LABEL: @test24b(
-; CHECK-NEXT:    [[B_IDX:%.*]] = shl nuw i64 %A, 1
+; CHECK-NEXT:    [[B_IDX:%.*]] = shl nuw i64 [[A:%.*]], 1
 ; CHECK-NEXT:    ret i64 [[B_IDX]]
 ;
   %B = getelementptr inbounds [42 x i16], [42 x i16]* @Arr, i64 0, i64 %A
@@ -349,7 +360,7 @@ define i64 @test24b(i8* %P, i64 %A){
 
 define i64 @test25(i8* %P, i64 %A){
 ; CHECK-LABEL: @test25(
-; CHECK-NEXT:    [[B_IDX:%.*]] = shl nuw i64 %A, 1
+; CHECK-NEXT:    [[B_IDX:%.*]] = shl nuw i64 [[A:%.*]], 1
 ; CHECK-NEXT:    [[TMP1:%.*]] = add i64 [[B_IDX]], -84
 ; CHECK-NEXT:    ret i64 [[TMP1]]
 ;
@@ -363,7 +374,7 @@ define i64 @test25(i8* %P, i64 %A){
 
 define i16 @test25_as1(i8 addrspace(1)* %P, i64 %A) {
 ; CHECK-LABEL: @test25_as1(
-; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 %A to i16
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i64 [[A:%.*]] to i16
 ; CHECK-NEXT:    [[B_IDX:%.*]] = shl nuw i16 [[TMP1]], 1
 ; CHECK-NEXT:    [[TMP2:%.*]] = add i16 [[B_IDX]], -84
 ; CHECK-NEXT:    ret i16 [[TMP2]]
@@ -376,7 +387,7 @@ define i16 @test25_as1(i8 addrspace(1)* %P, i64 %A) {
 
 define i32 @test26(i32 %x) {
 ; CHECK-LABEL: @test26(
-; CHECK-NEXT:    [[NEG:%.*]] = shl i32 -3, %x
+; CHECK-NEXT:    [[NEG:%.*]] = shl i32 -3, [[X:%.*]]
 ; CHECK-NEXT:    ret i32 [[NEG]]
 ;
   %shl = shl i32 3, %x
@@ -386,8 +397,8 @@ define i32 @test26(i32 %x) {
 
 define i32 @test27(i32 %x, i32 %y) {
 ; CHECK-LABEL: @test27(
-; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 %y, 3
-; CHECK-NEXT:    [[SUB:%.*]] = add i32 [[TMP1]], %x
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[Y:%.*]], 3
+; CHECK-NEXT:    [[SUB:%.*]] = add i32 [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret i32 [[SUB]]
 ;
   %mul = mul i32 %y, -8
@@ -395,10 +406,87 @@ define i32 @test27(i32 %x, i32 %y) {
   ret i32 %sub
 }
 
+define <2 x i32> @test27vec(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @test27vec(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <2 x i32> [[Y:%.*]], <i32 8, i32 6>
+; CHECK-NEXT:    [[SUB:%.*]] = add <2 x i32> [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[SUB]]
+; Non-splat vector constant: x - (y * <-8, -6>) is expected to become (y * <8, 6>) + x.
+  %mul = mul <2 x i32> %y, <i32 -8, i32 -6>
+  %sub = sub <2 x i32> %x, %mul
+  ret <2 x i32> %sub
+}
+
+define <2 x i32> @test27vecsplat(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @test27vecsplat(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i32> [[Y:%.*]], <i32 3, i32 3>
+; CHECK-NEXT:    [[SUB:%.*]] = add <2 x i32> [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[SUB]]
+; Splat constant: x - (y * <-8, -8>) is expected to become (y << <3, 3>) + x.
+  %mul = mul <2 x i32> %y, <i32 -8, i32 -8>
+  %sub = sub <2 x i32> %x, %mul
+  ret <2 x i32> %sub
+}
+
+define <2 x i32> @test27vecmixed(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @test27vecmixed(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <2 x i32> [[Y:%.*]], <i32 8, i32 -8>
+; CHECK-NEXT:    [[SUB:%.*]] = add <2 x i32> [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[SUB]]
+; Mixed-sign constant: x - (y * <-8, 8>) is expected to become (y * <8, -8>) + x.
+  %mul = mul <2 x i32> %y, <i32 -8, i32 8>
+  %sub = sub <2 x i32> %x, %mul
+  ret <2 x i32> %sub
+}
+
+define i32 @test27commuted(i32 %x, i32 %y) {
+; CHECK-LABEL: @test27commuted(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl i32 [[Y:%.*]], 3
+; CHECK-NEXT:    [[SUB:%.*]] = add i32 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[SUB]]
+; Constant written as the first mul operand: x - (-8 * y) is expected to become (y << 3) + x.
+  %mul = mul i32 -8, %y
+  %sub = sub i32 %x, %mul
+  ret i32 %sub
+}
+
+define <2 x i32> @test27commutedvec(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @test27commutedvec(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <2 x i32> [[Y:%.*]], <i32 8, i32 6>
+; CHECK-NEXT:    [[SUB:%.*]] = add <2 x i32> [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[SUB]]
+; Constant-first, non-splat: x - (<-8, -6> * y) is expected to become (y * <8, 6>) + x.
+  %mul = mul <2 x i32> <i32 -8, i32 -6>, %y
+  %sub = sub <2 x i32> %x, %mul
+  ret <2 x i32> %sub
+}
+
+define <2 x i32> @test27commutedvecsplat(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @test27commutedvecsplat(
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <2 x i32> [[Y:%.*]], <i32 3, i32 3>
+; CHECK-NEXT:    [[SUB:%.*]] = add <2 x i32> [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[SUB]]
+; Constant-first splat: x - (<-8, -8> * y) is expected to become (y << <3, 3>) + x.
+  %mul = mul <2 x i32> <i32 -8, i32 -8>, %y
+  %sub = sub <2 x i32> %x, %mul
+  ret <2 x i32> %sub
+}
+
+define <2 x i32> @test27commutedvecmixed(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @test27commutedvecmixed(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul <2 x i32> [[Y:%.*]], <i32 8, i32 -8>
+; CHECK-NEXT:    [[SUB:%.*]] = add <2 x i32> [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret <2 x i32> [[SUB]]
+; Constant-first, mixed-sign: x - (<-8, 8> * y) is expected to become (y * <8, -8>) + x.
+  %mul = mul <2 x i32> <i32 -8, i32 8>, %y
+  %sub = sub <2 x i32> %x, %mul
+  ret <2 x i32> %sub
+}
+
 define i32 @test28(i32 %x, i32 %y, i32 %z) {
 ; CHECK-LABEL: @test28(
-; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 %z, %y
-; CHECK-NEXT:    [[SUB:%.*]] = add i32 [[TMP1]], %x
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[Z:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[SUB:%.*]] = add i32 [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret i32 [[SUB]]
 ;
   %neg = sub i32 0, %z
@@ -407,9 +495,21 @@ define i32 @test28(i32 %x, i32 %y, i32 %z) {
   ret i32 %sub
 }
 
+define i32 @test28commuted(i32 %x, i32 %y, i32 %z) {
+; CHECK-LABEL: @test28commuted(
+; CHECK-NEXT:    [[TMP1:%.*]] = mul i32 [[Y:%.*]], [[Z:%.*]]
+; CHECK-NEXT:    [[SUB:%.*]] = add i32 [[TMP1]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[SUB]]
+; Negated value as the second mul operand: x - (y * (0 - z)) is expected to become (y * z) + x.
+  %neg = sub i32 0, %z
+  %mul = mul i32 %y, %neg
+  %sub = sub i32 %x, %mul
+  ret i32 %sub
+}
+
 define i64 @test29(i8* %foo, i64 %i, i64 %j) {
 ; CHECK-LABEL: @test29(
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 %i, %j
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 [[I:%.*]], [[J:%.*]]
 ; CHECK-NEXT:    ret i64 [[TMP1]]
 ;
   %gep1 = getelementptr inbounds i8, i8* %foo, i64 %i
@@ -422,8 +522,8 @@ define i64 @test29(i8* %foo, i64 %i, i64 %j) {
 
 define i64 @test30(i8* %foo, i64 %i, i64 %j) {
 ; CHECK-LABEL: @test30(
-; CHECK-NEXT:    [[GEP1_IDX:%.*]] = shl nuw i64 %i, 2
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 [[GEP1_IDX]], %j
+; CHECK-NEXT:    [[GEP1_IDX:%.*]] = shl nuw i64 [[I:%.*]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i64 [[GEP1_IDX]], [[J:%.*]]
 ; CHECK-NEXT:    ret i64 [[TMP1]]
 ;
   %bit = bitcast i8* %foo to i32*
@@ -437,8 +537,8 @@ define i64 @test30(i8* %foo, i64 %i, i64 %j) {
 
 define i16 @test30_as1(i8 addrspace(1)* %foo, i16 %i, i16 %j) {
 ; CHECK-LABEL: @test30_as1(
-; CHECK-NEXT:    [[GEP1_IDX:%.*]] = shl nuw i16 %i, 2
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i16 [[GEP1_IDX]], %j
+; CHECK-NEXT:    [[GEP1_IDX:%.*]] = shl nuw i16 [[I:%.*]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i16 [[GEP1_IDX]], [[J:%.*]]
 ; CHECK-NEXT:    ret i16 [[TMP1]]
 ;
   %bit = bitcast i8 addrspace(1)* %foo to i32 addrspace(1)*
@@ -452,7 +552,7 @@ define i16 @test30_as1(i8 addrspace(1)* %foo, i16 %i, i16 %j) {
 
 define <2 x i64> @test31(<2 x i64> %A) {
 ; CHECK-LABEL: @test31(
-; CHECK-NEXT:    [[SUB:%.*]] = add <2 x i64> %A, <i64 3, i64 4>
+; CHECK-NEXT:    [[SUB:%.*]] = add <2 x i64> [[A:%.*]], <i64 3, i64 4>
 ; CHECK-NEXT:    ret <2 x i64> [[SUB]]
 ;
   %xor = xor <2 x i64> %A, <i64 -1, i64 -1>
@@ -462,7 +562,7 @@ define <2 x i64> @test31(<2 x i64> %A) {
 
 define <2 x i64> @test32(<2 x i64> %A) {
 ; CHECK-LABEL: @test32(
-; CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> <i64 3, i64 4>, %A
+; CHECK-NEXT:    [[SUB:%.*]] = sub <2 x i64> <i64 3, i64 4>, [[A:%.*]]
 ; CHECK-NEXT:    ret <2 x i64> [[SUB]]
 ;
   %add = add <2 x i64> %A, <i64 -1, i64 -1>
@@ -472,7 +572,7 @@ define <2 x i64> @test32(<2 x i64> %A) {
 
 define <2 x i64> @test33(<2 x i1> %A) {
 ; CHECK-LABEL: @test33(
-; CHECK-NEXT:    [[SUB:%.*]] = sext <2 x i1> %A to <2 x i64>
+; CHECK-NEXT:    [[SUB:%.*]] = sext <2 x i1> [[A:%.*]] to <2 x i64>
 ; CHECK-NEXT:    ret <2 x i64> [[SUB]]
 ;
   %ext = zext <2 x i1> %A to <2 x i64>
@@ -482,7 +582,7 @@ define <2 x i64> @test33(<2 x i1> %A) {
 
 define <2 x i64> @test34(<2 x i1> %A) {
 ; CHECK-LABEL: @test34(
-; CHECK-NEXT:    [[SUB:%.*]] = zext <2 x i1> %A to <2 x i64>
+; CHECK-NEXT:    [[SUB:%.*]] = zext <2 x i1> [[A:%.*]] to <2 x i64>
 ; CHECK-NEXT:    ret <2 x i64> [[SUB]]
 ;
   %ext = sext <2 x i1> %A to <2 x i64>
@@ -492,7 +592,7 @@ define <2 x i64> @test34(<2 x i1> %A) {
 
 define <2 x i64> @test35(<2 x i64> %A) {
 ; CHECK-LABEL: @test35(
-; CHECK-NEXT:    [[SUB:%.*]] = mul <2 x i64> %A, <i64 -2, i64 -3>
+; CHECK-NEXT:    [[SUB:%.*]] = mul <2 x i64> [[A:%.*]], <i64 -2, i64 -3>
 ; CHECK-NEXT:    ret <2 x i64> [[SUB]]
 ;
   %mul = mul <2 x i64> %A, <i64 3, i64 4>
@@ -502,7 +602,7 @@ define <2 x i64> @test35(<2 x i64> %A) {
 
 define <2 x i64> @test36(<2 x i64> %A) {
 ; CHECK-LABEL: @test36(
-; CHECK-NEXT:    [[SUB:%.*]] = mul <2 x i64> %A, <i64 7, i64 15>
+; CHECK-NEXT:    [[SUB:%.*]] = mul <2 x i64> [[A:%.*]], <i64 7, i64 15>
 ; CHECK-NEXT:    ret <2 x i64> [[SUB]]
 ;
   %shl = shl <2 x i64> %A, <i64 3, i64 4>
@@ -512,7 +612,7 @@ define <2 x i64> @test36(<2 x i64> %A) {
 
 define <2 x i32> @test37(<2 x i32> %A) {
 ; CHECK-LABEL: @test37(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <2 x i32> %A, <i32 -2147483648, i32 -2147483648>
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq <2 x i32> [[A:%.*]], <i32 -2147483648, i32 -2147483648>
 ; CHECK-NEXT:    [[SUB:%.*]] = sext <2 x i1> [[TMP1]] to <2 x i32>
 ; CHECK-NEXT:    ret <2 x i32> [[SUB]]
 ;
@@ -523,7 +623,7 @@ define <2 x i32> @test37(<2 x i32> %A) {
 
 define i32 @test38(i32 %A) {
 ; CHECK-LABEL: @test38(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 %A, -2147483648
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[A:%.*]], -2147483648
 ; CHECK-NEXT:    [[SUB:%.*]] = sext i1 [[TMP1]] to i32
 ; CHECK-NEXT:    ret i32 [[SUB]]
 ;
@@ -534,7 +634,7 @@ define i32 @test38(i32 %A) {
 
 define i32 @test39(i32 %A, i32 %x) {
 ; CHECK-LABEL: @test39(
-; CHECK-NEXT:    [[C:%.*]] = add i32 %x, %A
+; CHECK-NEXT:    [[C:%.*]] = add i32 [[X:%.*]], [[A:%.*]]
 ; CHECK-NEXT:    ret i32 [[C]]
 ;
   %B = sub i32 0, %A
@@ -544,8 +644,8 @@ define i32 @test39(i32 %A, i32 %x) {
 
 define i16 @test40(i16 %a, i16 %b) {
 ; CHECK-LABEL: @test40(
-; CHECK-NEXT:    [[ASHR:%.*]] = ashr i16 %a, 1
-; CHECK-NEXT:    [[ASHR1:%.*]] = ashr i16 %b, 1
+; CHECK-NEXT:    [[ASHR:%.*]] = ashr i16 [[A:%.*]], 1
+; CHECK-NEXT:    [[ASHR1:%.*]] = ashr i16 [[B:%.*]], 1
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i16 [[ASHR]], [[ASHR1]]
 ; CHECK-NEXT:    ret i16 [[SUB]]
 ;
@@ -557,8 +657,8 @@ define i16 @test40(i16 %a, i16 %b) {
 
 define i32 @test41(i16 %a, i16 %b) {
 ; CHECK-LABEL: @test41(
-; CHECK-NEXT:    [[CONV:%.*]] = sext i16 %a to i32
-; CHECK-NEXT:    [[CONV1:%.*]] = sext i16 %b to i32
+; CHECK-NEXT:    [[CONV:%.*]] = sext i16 [[A:%.*]] to i32
+; CHECK-NEXT:    [[CONV1:%.*]] = sext i16 [[B:%.*]] to i32
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[CONV]], [[CONV1]]
 ; CHECK-NEXT:    ret i32 [[SUB]]
 ;
@@ -570,8 +670,8 @@ define i32 @test41(i16 %a, i16 %b) {
 
 define i4 @test42(i4 %x, i4 %y) {
 ; CHECK-LABEL: @test42(
-; CHECK-NEXT:    [[A:%.*]] = and i4 %y, 7
-; CHECK-NEXT:    [[B:%.*]] = and i4 %x, 7
+; CHECK-NEXT:    [[A:%.*]] = and i4 [[Y:%.*]], 7
+; CHECK-NEXT:    [[B:%.*]] = and i4 [[X:%.*]], 7
 ; CHECK-NEXT:    [[C:%.*]] = sub nsw i4 [[A]], [[B]]
 ; CHECK-NEXT:    ret i4 [[C]]
 ;
@@ -583,8 +683,8 @@ define i4 @test42(i4 %x, i4 %y) {
 
 define i4 @test43(i4 %x, i4 %y) {
 ; CHECK-LABEL: @test43(
-; CHECK-NEXT:    [[A:%.*]] = or i4 %x, -8
-; CHECK-NEXT:    [[B:%.*]] = and i4 %y, 7
+; CHECK-NEXT:    [[A:%.*]] = or i4 [[X:%.*]], -8
+; CHECK-NEXT:    [[B:%.*]] = and i4 [[Y:%.*]], 7
 ; CHECK-NEXT:    [[C:%.*]] = sub nuw i4 [[A]], [[B]]
 ; CHECK-NEXT:    ret i4 [[C]]
 ;
@@ -596,7 +696,7 @@ define i4 @test43(i4 %x, i4 %y) {
 
 define i32 @test44(i32 %x) {
 ; CHECK-LABEL: @test44(
-; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 %x, -32768
+; CHECK-NEXT:    [[SUB:%.*]] = add nsw i32 [[X:%.*]], -32768
 ; CHECK-NEXT:    ret i32 [[SUB]]
 ;
   %sub = sub nsw i32 %x, 32768
@@ -605,7 +705,7 @@ define i32 @test44(i32 %x) {
 
 define i32 @test45(i32 %x, i32 %y) {
 ; CHECK-LABEL: @test45(
-; CHECK-NEXT:    [[SUB:%.*]] = and i32 %x, %y
+; CHECK-NEXT:    [[SUB:%.*]] = and i32 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i32 [[SUB]]
 ;
   %or = or i32 %x, %y
@@ -614,10 +714,21 @@ define i32 @test45(i32 %x, i32 %y) {
   ret i32 %sub
 }
 
+define i32 @test45commuted(i32 %x, i32 %y) {
+; CHECK-LABEL: @test45commuted(
+; CHECK-NEXT:    [[SUB:%.*]] = and i32 [[Y:%.*]], [[X:%.*]]
+; CHECK-NEXT:    ret i32 [[SUB]]
+; xor operands swapped relative to the or: (x | y) - (y ^ x) is expected to become y & x.
+  %or = or i32 %x, %y
+  %xor = xor i32 %y, %x
+  %sub = sub i32 %or, %xor
+  ret i32 %sub
+}
+
 define i32 @test46(i32 %x, i32 %y) {
 ; CHECK-LABEL: @test46(
-; CHECK-NEXT:    [[X_NOT:%.*]] = xor i32 %x, -1
-; CHECK-NEXT:    [[SUB:%.*]] = and i32 %y, [[X_NOT]]
+; CHECK-NEXT:    [[X_NOT:%.*]] = xor i32 [[X:%.*]], -1
+; CHECK-NEXT:    [[SUB:%.*]] = and i32 [[X_NOT]], [[Y:%.*]]
 ; CHECK-NEXT:    ret i32 [[SUB]]
 ;
   %or = or i32 %x, %y
@@ -625,10 +736,21 @@ define i32 @test46(i32 %x, i32 %y) {
   ret i32 %sub
 }
 
+define i32 @test46commuted(i32 %x, i32 %y) {
+; CHECK-LABEL: @test46commuted(
+; CHECK-NEXT:    [[X_NOT:%.*]] = xor i32 [[X:%.*]], -1
+; CHECK-NEXT:    [[SUB:%.*]] = and i32 [[X_NOT]], [[Y:%.*]]
+; CHECK-NEXT:    ret i32 [[SUB]]
+; Subtrahend is the second or operand: (y | x) - x is expected to become (x ^ -1) & y.
+  %or = or i32 %y, %x
+  %sub = sub i32 %or, %x
+  ret i32 %sub
+}
+
 define i32 @test47(i1 %A, i32 %B, i32 %C, i32 %D) {
 ; CHECK-LABEL: @test47(
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 %D, %C
-; CHECK-NEXT:    [[SUB:%.*]] = select i1 %A, i32 [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 [[D:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[SUB:%.*]] = select i1 [[A:%.*]], i32 [[TMP1]], i32 0
 ; CHECK-NEXT:    ret i32 [[SUB]]
 ;
   %sel0 = select i1 %A, i32 %D, i32 %B
@@ -639,8 +761,8 @@ define i32 @test47(i1 %A, i32 %B, i32 %C, i32 %D) {
 
 define i32 @test48(i1 %A, i32 %B, i32 %C, i32 %D) {
 ; CHECK-LABEL: @test48(
-; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 %D, %C
-; CHECK-NEXT:    [[SUB:%.*]] = select i1 %A, i32 0, i32 [[TMP1]]
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 [[D:%.*]], [[C:%.*]]
+; CHECK-NEXT:    [[SUB:%.*]] = select i1 [[A:%.*]], i32 0, i32 [[TMP1]]
 ; CHECK-NEXT:    ret i32 [[SUB]]
 ;
   %sel0 = select i1 %A, i32 %B, i32 %D
@@ -653,8 +775,8 @@ define i32 @test48(i1 %A, i32 %B, i32 %C, i32 %D) {
 
 define i8 @bool_sext_sub(i8 %x, i1 %y) {
 ; CHECK-LABEL: @bool_sext_sub(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i1 %y to i8
-; CHECK-NEXT:    [[SUB:%.*]] = add i8 [[TMP1]], %x
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i1 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[SUB:%.*]] = add i8 [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret i8 [[SUB]]
 ;
   %sext = sext i1 %y to i8
@@ -666,8 +788,8 @@ define i8 @bool_sext_sub(i8 %x, i1 %y) {
 
 define <2 x i8> @bool_sext_sub_vec(<2 x i8> %x, <2 x i1> %y) {
 ; CHECK-LABEL: @bool_sext_sub_vec(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext <2 x i1> %y to <2 x i8>
-; CHECK-NEXT:    [[SUB:%.*]] = add <2 x i8> [[TMP1]], %x
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <2 x i1> [[Y:%.*]] to <2 x i8>
+; CHECK-NEXT:    [[SUB:%.*]] = add <2 x i8> [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret <2 x i8> [[SUB]]
 ;
   %sext = sext <2 x i1> %y to <2 x i8>
@@ -679,8 +801,8 @@ define <2 x i8> @bool_sext_sub_vec(<2 x i8> %x, <2 x i1> %y) {
 
 define <2 x i8> @bool_sext_sub_vec_nsw(<2 x i8> %x, <2 x i1> %y) {
 ; CHECK-LABEL: @bool_sext_sub_vec_nsw(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext <2 x i1> %y to <2 x i8>
-; CHECK-NEXT:    [[SUB:%.*]] = add nsw <2 x i8> [[TMP1]], %x
+; CHECK-NEXT:    [[TMP1:%.*]] = zext <2 x i1> [[Y:%.*]] to <2 x i8>
+; CHECK-NEXT:    [[SUB:%.*]] = add nsw <2 x i8> [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret <2 x i8> [[SUB]]
 ;
   %sext = sext <2 x i1> %y to <2 x i8>
@@ -692,8 +814,8 @@ define <2 x i8> @bool_sext_sub_vec_nsw(<2 x i8> %x, <2 x i1> %y) {
 
 define i8 @bool_sext_sub_nuw(i8 %x, i1 %y) {
 ; CHECK-LABEL: @bool_sext_sub_nuw(
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i1 %y to i8
-; CHECK-NEXT:    [[SUB:%.*]] = add i8 [[TMP1]], %x
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i1 [[Y:%.*]] to i8
+; CHECK-NEXT:    [[SUB:%.*]] = add i8 [[TMP1]], [[X:%.*]]
 ; CHECK-NEXT:    ret i8 [[SUB]]
 ;
   %sext = sext i1 %y to i8
@@ -701,3 +823,171 @@ define i8 @bool_sext_sub_nuw(i8 %x, i1 %y) {
   ret i8 %sub
 }
 
+define i32 @test49(i32 %X) {
+; CHECK-LABEL: @test49(
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 1, [[X:%.*]]
+; CHECK-NEXT:    [[RES:%.*]] = and i32 [[SUB]], 64
+; CHECK-NEXT:    ret i32 [[RES]]
+; Result is masked with 64: the expected output shrinks the minuend 129 to its low 7 bits (1).
+  %sub = sub i32 129, %X
+  %res = and i32 %sub, 64
+  ret i32 %res
+}
+
+define i32 @test50(i32 %X) {
+; CHECK-LABEL: @test50(
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 1, [[X:%.*]]
+; CHECK-NEXT:    [[RES:%.*]] = and i32 [[SUB]], 127
+; CHECK-NEXT:    ret i32 [[RES]]
+; Result is masked with 127: the expected output shrinks the minuend 129 to its low 7 bits (1).
+  %sub = sub i32 129, %X
+  %res = and i32 %sub, 127
+  ret i32 %res
+}
+
+define i32 @test51(i32 %X) {
+; CHECK-LABEL: @test51(
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 126, [[X:%.*]]
+; CHECK-NEXT:    [[RES:%.*]] = and i32 [[SUB]], 64
+; CHECK-NEXT:    ret i32 [[RES]]
+; Result is masked with 64: the expected output shrinks the minuend 254 to its low 7 bits (126).
+  %sub = sub i32 254, %X
+  %res = and i32 %sub, 64
+  ret i32 %res
+}
+
+define i32 @test52(i32 %X) {
+; CHECK-LABEL: @test52(
+; CHECK-NEXT:    [[SUB:%.*]] = sub i32 126, [[X:%.*]]
+; CHECK-NEXT:    [[RES:%.*]] = and i32 [[SUB]], 127
+; CHECK-NEXT:    ret i32 [[RES]]
+; Result is masked with 127: the expected output shrinks the minuend 254 to its low 7 bits (126).
+  %sub = sub i32 254, %X
+  %res = and i32 %sub, 127
+  ret i32 %res
+}
+
+define <2 x i1> @test53(<2 x i1> %A, <2 x i1> %B) {
+; CHECK-LABEL: @test53(
+; CHECK-NEXT:    [[SUB:%.*]] = xor <2 x i1> [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    ret <2 x i1> [[SUB]]
+  %sub = sub <2 x i1> %A, %B ; subtraction on i1 vectors is expected to be rewritten as xor
+  ret <2 x i1> %sub
+}
+
+define i32 @test54(i1 %C) {
+; CHECK-LABEL: @test54(
+; CHECK-NEXT:    [[V:%.*]] = select i1 [[C:%.*]], i32 -877, i32 113
+; CHECK-NEXT:    ret i32 [[V]]
+; 123 - select(C, 1000, 10) is expected to fold into select(C, 123 - 1000, 123 - 10).
+  %A = select i1 %C, i32 1000, i32 10
+  %V = sub i32 123, %A
+  ret i32 %V
+}
+
+define <2 x i32> @test54vec(i1 %C) {
+; CHECK-LABEL: @test54vec(
+; CHECK-NEXT:    [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 -877, i32 -877>, <2 x i32> <i32 113, i32 113>
+; CHECK-NEXT:    ret <2 x i32> [[V]]
+; Splat-vector form: the sub is expected to fold into both select arms (123 - 1000, 123 - 10).
+  %A = select i1 %C, <2 x i32> <i32 1000, i32 1000>, <2 x i32> <i32 10, i32 10>
+  %V = sub <2 x i32> <i32 123, i32 123>, %A
+  ret <2 x i32> %V
+}
+
+define <2 x i32> @test54vec2(i1 %C) {
+; CHECK-LABEL: @test54vec2(
+; CHECK-NEXT:    [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 -877, i32 -2167>, <2 x i32> <i32 113, i32 303>
+; CHECK-NEXT:    ret <2 x i32> [[V]]
+; Non-splat form: the sub is expected to fold per element (333 - 2500 = -2167, 333 - 30 = 303).
+  %A = select i1 %C, <2 x i32> <i32 1000, i32 2500>, <2 x i32> <i32 10, i32 30>
+  %V = sub <2 x i32> <i32 123, i32 333>, %A
+  ret <2 x i32> %V
+}
+
+define i32 @test55(i1 %which) {
+; CHECK-LABEL: @test55(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]]
+; CHECK:       delay:
+; CHECK-NEXT:    br label [[FINAL]]
+; CHECK:       final:
+; CHECK-NEXT:    [[A:%.*]] = phi i32 [ 1000, [[ENTRY:%.*]] ], [ 10, [[DELAY]] ]
+; CHECK-NEXT:    [[VALUE:%.*]] = sub nsw i32 123, [[A]]
+; CHECK-NEXT:    ret i32 [[VALUE]]
+; The phi of constants is expected to stay in place; the only expected change is nsw on the sub.
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+  br label %final
+
+final:
+  %A = phi i32 [ 1000, %entry ], [ 10, %delay ]
+  %value = sub i32 123, %A
+  ret i32 %value
+}
+
+define <2 x i32> @test55vec(i1 %which) {
+; CHECK-LABEL: @test55vec(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]]
+; CHECK:       delay:
+; CHECK-NEXT:    br label [[FINAL]]
+; CHECK:       final:
+; CHECK-NEXT:    [[A:%.*]] = phi <2 x i32> [ <i32 1000, i32 1000>, [[ENTRY:%.*]] ], [ <i32 10, i32 10>, [[DELAY]] ]
+; CHECK-NEXT:    [[VALUE:%.*]] = sub nsw <2 x i32> <i32 123, i32 123>, [[A]]
+; CHECK-NEXT:    ret <2 x i32> [[VALUE]]
+; Splat-vector phi: the phi is expected to stay in place; the only expected change is nsw on the sub.
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+  br label %final
+
+final:
+  %A = phi <2 x i32> [ <i32 1000, i32 1000>, %entry ], [ <i32 10, i32 10>, %delay ]
+  %value = sub <2 x i32> <i32 123, i32 123>, %A
+  ret <2 x i32> %value
+}
+
+define <2 x i32> @test55vec2(i1 %which) {
+; CHECK-LABEL: @test55vec2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]]
+; CHECK:       delay:
+; CHECK-NEXT:    br label [[FINAL]]
+; CHECK:       final:
+; CHECK-NEXT:    [[A:%.*]] = phi <2 x i32> [ <i32 1000, i32 2500>, [[ENTRY:%.*]] ], [ <i32 10, i32 30>, [[DELAY]] ]
+; CHECK-NEXT:    [[VALUE:%.*]] = sub nsw <2 x i32> <i32 123, i32 333>, [[A]]
+; CHECK-NEXT:    ret <2 x i32> [[VALUE]]
+; Non-splat vector phi: the phi is expected to stay in place; the only expected change is nsw on the sub.
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+  br label %final
+
+final:
+  %A = phi <2 x i32> [ <i32 1000, i32 2500>, %entry ], [ <i32 10, i32 30>, %delay ]
+  %value = sub <2 x i32> <i32 123, i32 333>, %A
+  ret <2 x i32> %value
+}
+
+define i32 @test56(i32 %A, i32 %B) {
+; CHECK-LABEL: @test56(
+; CHECK-NEXT:    [[Y:%.*]] = sub i32 0, [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[Y]]
+  %X = add i32 %A, %B
+  %Y = sub i32 %A, %X ; A - (A + B) is expected to simplify to 0 - B
+  ret i32 %Y
+}
+
+define i32 @test57(i32 %A, i32 %B) {
+; CHECK-LABEL: @test57(
+; CHECK-NEXT:    [[Y:%.*]] = sub i32 0, [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[Y]]
+  %X = add i32 %B, %A
+  %Y = sub i32 %A, %X ; commuted add: A - (B + A) is expected to simplify to 0 - B
+  ret i32 %Y
+}
diff --git a/test/Transforms/InstCombine/trunc.ll b/test/Transforms/InstCombine/trunc.ll
index eaa45bbb286c8d2012fa11c6db5422efec2e4b0c..5597b578f017966805153556077e9f251d2497d3 100644
--- a/test/Transforms/InstCombine/trunc.ll
+++ b/test/Transforms/InstCombine/trunc.ll
@@ -119,8 +119,8 @@ define i64 @test8(i32 %A, i32 %B) {
 
 define i8 @test9(i32 %X) {
 ; CHECK-LABEL: @test9(
-; CHECK-NEXT:    [[X_TR:%.*]] = trunc i32 %X to i8
-; CHECK-NEXT:    [[Z:%.*]] = and i8 [[X_TR]], 42
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 %X to i8
+; CHECK-NEXT:    [[Z:%.*]] = and i8 [[TMP1]], 42
 ; CHECK-NEXT:    ret i8 [[Z]]
 ;
   %Y = and i32 %X, 42
@@ -464,3 +464,72 @@ define <8 x i16> @trunc_shl_v8i16_v8i32_4(<8 x i32> %a) {
   ret <8 x i16> %conv
 }
 
+; Although the mask is the same value, we don't create a shuffle for types that the backend may not be able to handle:
+; trunc (shuffle X, C, Mask) --> shuffle (trunc X), C', Mask
+
+define <4 x i8> @wide_shuf(<4 x i32> %x) {
+; CHECK-LABEL: @wide_shuf(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> %x, <4 x i32> <i32 undef, i32 3634, i32 90, i32 undef>, <4 x i32> <i32 1, i32 5, i32 6, i32 2>
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc <4 x i32> [[SHUF]] to <4 x i8>
+; CHECK-NEXT:    ret <4 x i8> [[TRUNC]]
+; The shuffle is expected to stay at <4 x i32>; only the constant lanes the mask never selects (35, -1) become undef.
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> <i32 35, i32 3634, i32 90, i32 -1>, <4 x i32> <i32 1, i32 5, i32 6, i32 2>
+  %trunc = trunc <4 x i32> %shuf to <4 x i8>
+  ret <4 x i8> %trunc
+}
+
+; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask
+
+define <4 x i8> @wide_splat1(<4 x i32> %x) {
+; CHECK-LABEL: @wide_splat1(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <4 x i32> %x to <4 x i8>
+; CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <4 x i8> [[TMP1]], <4 x i8> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT:    ret <4 x i8> [[TRUNC]]
+; The trunc is expected to move ahead of the splat shuffle, so the shuffle runs on <4 x i8>.
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 2, i32 2, i32 2, i32 2>
+  %trunc = trunc <4 x i32> %shuf to <4 x i8>
+  ret <4 x i8> %trunc
+}
+
+; Test weird types.
+; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask
+
+define <3 x i31> @wide_splat2(<3 x i33> %x) {
+; CHECK-LABEL: @wide_splat2(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc <3 x i33> %x to <3 x i31>
+; CHECK-NEXT:    [[TRUNC:%.*]] = shufflevector <3 x i31> [[TMP1]], <3 x i31> undef, <3 x i32> <i32 1, i32 1, i32 1>
+; CHECK-NEXT:    ret <3 x i31> [[TRUNC]]
+; Same splat fold with odd widths: the trunc is expected to move ahead of the shuffle (<3 x i33> to <3 x i31>).
+  %shuf = shufflevector <3 x i33> %x, <3 x i33> undef, <3 x i32> <i32 1, i32 1, i32 1>
+  %trunc = trunc <3 x i33> %shuf to <3 x i31>
+  ret <3 x i31> %trunc
+}
+
+; FIXME:
+; trunc (shuffle X, undef, SplatMask) --> shuffle (trunc X), undef, SplatMask
+; A mask with undef elements should still be considered a splat mask.
+
+define <3 x i31> @wide_splat3(<3 x i33> %x) {
+; CHECK-LABEL: @wide_splat3(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x i33> %x, <3 x i33> undef, <3 x i32> <i32 undef, i32 1, i32 1>
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc <3 x i33> [[SHUF]] to <3 x i31>
+; CHECK-NEXT:    ret <3 x i31> [[TRUNC]]
+; Expected output is unchanged: with an undef mask element the trunc stays after the shuffle.
+  %shuf = shufflevector <3 x i33> %x, <3 x i33> undef, <3 x i32> <i32 undef, i32 1, i32 1>
+  %trunc = trunc <3 x i33> %shuf to <3 x i31>
+  ret <3 x i31> %trunc
+}
+
+; TODO: The shuffle extends the length of the input vector. Should we shrink this?
+
+define <8 x i8> @wide_lengthening_splat(<4 x i16> %v) {
+; CHECK-LABEL: @wide_lengthening_splat(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TR:%.*]] = trunc <8 x i16> [[SHUF]] to <8 x i8>
+; CHECK-NEXT:    ret <8 x i8> [[TR]]
+; The trunc is expected to stay after the shuffle; only the unused second shuffle operand becomes undef.
+  %shuf = shufflevector <4 x i16> %v, <4 x i16> %v, <8 x i32> zeroinitializer
+  %tr = trunc <8 x i16> %shuf to <8 x i8>
+  ret <8 x i8> %tr
+}
+
diff --git a/test/Transforms/InstCombine/urem.ll b/test/Transforms/InstCombine/urem.ll
deleted file mode 100644
index 0549d759eac48f6cf690fa8d3e1da86c52d53523..0000000000000000000000000000000000000000
--- a/test/Transforms/InstCombine/urem.ll
+++ /dev/null
@@ -1,50 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
-
-define i64 @rem_unsigned(i64 %x1, i64 %y2) {
-; CHECK-LABEL: @rem_unsigned(
-; CHECK-NEXT:    [[R:%.*]] = urem i64 %x1, %y2
-; CHECK-NEXT:    ret i64 [[R]]
-;
-  %r = udiv i64 %x1, %y2
-  %r7 = mul i64 %r, %y2
-  %r8 = sub i64 %x1, %r7
-  ret i64 %r8
-}
-
-; PR28672 - https://llvm.org/bugs/show_bug.cgi?id=28672
-
-define i8 @big_divisor(i8 %x) {
-; CHECK-LABEL: @big_divisor(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult i8 %x, -127
-; CHECK-NEXT:    [[TMP2:%.*]] = add i8 %x, 127
-; CHECK-NEXT:    [[REM:%.*]] = select i1 [[TMP1]], i8 %x, i8 [[TMP2]]
-; CHECK-NEXT:    ret i8 [[REM]]
-;
-  %rem = urem i8 %x, 129
-  ret i8 %rem
-}
-
-define i5 @biggest_divisor(i5 %x) {
-; CHECK-LABEL: @biggest_divisor(
-; CHECK-NEXT:    [[NOT_:%.*]] = icmp eq i5 %x, -1
-; CHECK-NEXT:    [[TMP1:%.*]] = zext i1 [[NOT_]] to i5
-; CHECK-NEXT:    [[REM:%.*]] = add i5 [[TMP1]], %x
-; CHECK-NEXT:    ret i5 [[REM]]
-;
-  %rem = urem i5 %x, -1
-  ret i5 %rem
-}
-
-; TODO: Should vector subtract of constant be canonicalized to add?
-define <2 x i4> @big_divisor_vec(<2 x i4> %x) {
-; CHECK-LABEL: @big_divisor_vec(
-; CHECK-NEXT:    [[TMP1:%.*]] = icmp ult <2 x i4> %x, <i4 -3, i4 -3>
-; CHECK-NEXT:    [[TMP2:%.*]] = sub <2 x i4> %x, <i4 -3, i4 -3>
-; CHECK-NEXT:    [[REM:%.*]] = select <2 x i1> [[TMP1]], <2 x i4> %x, <2 x i4> [[TMP2]]
-; CHECK-NEXT:    ret <2 x i4> [[REM]]
-;
-  %rem = urem <2 x i4> %x, <i4 13, i4 13>
-  ret <2 x i4> %rem
-}
-
diff --git a/test/Transforms/InstCombine/vararg.ll b/test/Transforms/InstCombine/vararg.ll
index 263a7425a0759e125ec578a1acfa4087e60f2a75..111cb4de7bc321bd6d664119f39ead80ca8f18b0 100644
--- a/test/Transforms/InstCombine/vararg.ll
+++ b/test/Transforms/InstCombine/vararg.ll
@@ -2,8 +2,8 @@
 
 %struct.__va_list = type { i8*, i8*, i8*, i32, i32 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture)
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 declare void @llvm.va_start(i8*)
 declare void @llvm.va_end(i8*)
 declare void @llvm.va_copy(i8*, i8*)
@@ -17,14 +17,14 @@ entry:
   %va1 = alloca %struct.__va_list, align 8
   %0 = bitcast %struct.__va_list* %va0 to i8*
   %1 = bitcast %struct.__va_list* %va1 to i8*
-  call void @llvm.lifetime.start(i64 32, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 32, i8* %0)
   call void @llvm.va_start(i8* %0)
-  call void @llvm.lifetime.start(i64 32, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 32, i8* %1)
   call void @llvm.va_copy(i8* %1, i8* %0)
   call void @llvm.va_end(i8* %1)
-  call void @llvm.lifetime.end(i64 32, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 32, i8* %1)
   call void @llvm.va_end(i8* %0)
-  call void @llvm.lifetime.end(i64 32, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 32, i8* %0)
   ret i32 0
 }
 
diff --git a/test/Transforms/InstCombine/vec_demanded_elts.ll b/test/Transforms/InstCombine/vec_demanded_elts.ll
index 7c46adaf616e69399ad23f1bb490a7cff11e1e6f..5f27634da19cc8da57aaa77ce69d5505a8ab5849 100644
--- a/test/Transforms/InstCombine/vec_demanded_elts.ll
+++ b/test/Transforms/InstCombine/vec_demanded_elts.ll
@@ -67,7 +67,7 @@ define i64 @test3(float %f, double %d) {
 ; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP5]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[TMP12]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[TMP11]], [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[TMP14]], [[TMP11]]
 ; CHECK-NEXT:    ret i64 [[TMP15]]
 ;
   %v00 = insertelement <4 x float> undef, float %f, i32 0
@@ -182,10 +182,9 @@ define <4 x float> @dead_shuffle_elt(<4 x float> %x, <2 x float> %y) nounwind {
 
 define <2 x float> @test_fptrunc(double %f) {
 ; CHECK-LABEL: @test_fptrunc(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double %f, i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double 0.000000e+00, i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = fptrunc <2 x double> [[TMP2]] to <2 x float>
-; CHECK-NEXT:    ret <2 x float> [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> <double undef, double 0.000000e+00>, double %f, i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = fptrunc <2 x double> [[TMP1]] to <2 x float>
+; CHECK-NEXT:    ret <2 x float> [[TMP2]]
 ;
   %tmp9 = insertelement <4 x double> undef, double %f, i32 0
   %tmp10 = insertelement <4 x double> %tmp9, double 0.000000e+00, i32 1
@@ -198,10 +197,9 @@ define <2 x float> @test_fptrunc(double %f) {
 
 define <2 x double> @test_fpext(float %f) {
 ; CHECK-LABEL: @test_fpext(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> undef, float %f, i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> [[TMP1]], float 0.000000e+00, i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = fpext <2 x float> [[TMP2]] to <2 x double>
-; CHECK-NEXT:    ret <2 x double> [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x float> <float undef, float 0.000000e+00>, float %f, i32 0
+; CHECK-NEXT:    [[TMP2:%.*]] = fpext <2 x float> [[TMP1]] to <2 x double>
+; CHECK-NEXT:    ret <2 x double> [[TMP2]]
 ;
   %tmp9 = insertelement <4 x float> undef, float %f, i32 0
   %tmp10 = insertelement <4 x float> %tmp9, float 0.000000e+00, i32 1
@@ -223,8 +221,7 @@ define <4 x double> @test_shuffle(<4 x double> %f) {
 
 define <4 x float> @test_select(float %f, float %g) {
 ; CHECK-LABEL: @test_select(
-; CHECK-NEXT:    [[A0:%.*]] = insertelement <4 x float> undef, float %f, i32 0
-; CHECK-NEXT:    [[A3:%.*]] = insertelement <4 x float> [[A0]], float 3.000000e+00, i32 3
+; CHECK-NEXT:    [[A3:%.*]] = insertelement <4 x float> <float undef, float undef, float undef, float 3.000000e+00>, float %f, i32 0
 ; CHECK-NEXT:    [[RET:%.*]] = shufflevector <4 x float> [[A3]], <4 x float> <float undef, float 4.000000e+00, float 5.000000e+00, float undef>, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
 ; CHECK-NEXT:    ret <4 x float> [[RET]]
 ;
diff --git a/test/Transforms/InstCombine/vec_sext.ll b/test/Transforms/InstCombine/vec_sext.ll
index 10947c1781e0322757493d6c058b16c93017c651..79a32d64b06385a23ccaa17079a1a3df7fbba33a 100644
--- a/test/Transforms/InstCombine/vec_sext.ll
+++ b/test/Transforms/InstCombine/vec_sext.ll
@@ -6,7 +6,7 @@ define <4 x i32> @psignd_3(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-NEXT:    [[SUB:%.*]] = sub nsw <4 x i32> zeroinitializer, %a
 ; CHECK-NEXT:    [[B_LOBIT:%.*]] = ashr <4 x i32> %b, <i32 31, i32 31, i32 31, i32 31>
 ; CHECK-NEXT:    [[T1:%.*]] = xor <4 x i32> [[B_LOBIT]], <i32 -1, i32 -1, i32 -1, i32 -1>
-; CHECK-NEXT:    [[T2:%.*]] = and <4 x i32> %a, [[T1]]
+; CHECK-NEXT:    [[T2:%.*]] = and <4 x i32> [[T1]], %a
 ; CHECK-NEXT:    [[T3:%.*]] = and <4 x i32> [[B_LOBIT]], [[SUB]]
 ; CHECK-NEXT:    [[COND:%.*]] = or <4 x i32> [[T2]], [[T3]]
 ; CHECK-NEXT:    ret <4 x i32> [[COND]]
diff --git a/test/Transforms/InstCombine/vector-casts.ll b/test/Transforms/InstCombine/vector-casts.ll
index 0848a6e87422e0ea3e196cfe9321c3a2d028e4a6..643ab6c5348faa6df00dffb0d0c9591d246d8c8d 100644
--- a/test/Transforms/InstCombine/vector-casts.ll
+++ b/test/Transforms/InstCombine/vector-casts.ll
@@ -216,3 +216,91 @@ define <8 x i32> @pr24458(<8 x float> %n) {
   ret <8 x i32> %wrong
 }
 
+; Hoist a trunc to a scalar if we're inserting into an undef vector.
+; trunc (inselt undef, X, Index) --> inselt undef, (trunc X), Index
+
+define <3 x i16> @trunc_inselt_undef(i32 %x) {
+; CHECK-LABEL: @trunc_inselt_undef(
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 %x to i16
+; CHECK-NEXT:    [[TRUNC:%.*]] = insertelement <3 x i16> undef, i16 [[TMP1]], i32 1
+; CHECK-NEXT:    ret <3 x i16> [[TRUNC]]
+;
+  %vec = insertelement <3 x i32> undef, i32 %x, i32 1
+  %trunc = trunc <3 x i32> %vec to <3 x i16>
+  ret <3 x i16> %trunc
+}
+
+; Hoist an fptrunc to a scalar if we're inserting into an undef vector.
+; fptrunc (inselt undef, X, Index) --> inselt undef, (fptrunc X), Index
+
+define <2 x float> @fptrunc_inselt_undef(double %x, i32 %index) {
+; CHECK-LABEL: @fptrunc_inselt_undef(
+; CHECK-NEXT:    [[TMP1:%.*]] = fptrunc double %x to float
+; CHECK-NEXT:    [[TRUNC:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 %index
+; CHECK-NEXT:    ret <2 x float> [[TRUNC]]
+;
+  %vec = insertelement <2 x double> <double undef, double undef>, double %x, i32 %index
+  %trunc = fptrunc <2 x double> %vec to <2 x float>
+  ret <2 x float> %trunc
+}
+
+; TODO: Strengthen the backend, so we can have this canonicalization.
+; Insert a scalar int into a constant vector and truncate:
+; trunc (inselt C, X, Index) --> inselt C, (trunc X), Index
+
+define <3 x i16> @trunc_inselt1(i32 %x) {
+; CHECK-LABEL: @trunc_inselt1(
+; CHECK-NEXT:    [[VEC:%.*]] = insertelement <3 x i32> <i32 3, i32 undef, i32 65536>, i32 %x, i32 1
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc <3 x i32> [[VEC]] to <3 x i16>
+; CHECK-NEXT:    ret <3 x i16> [[TRUNC]]
+;
+  %vec = insertelement <3 x i32> <i32 3, i32 -2, i32 65536>, i32 %x, i32 1
+  %trunc = trunc <3 x i32> %vec to <3 x i16>
+  ret <3 x i16> %trunc
+}
+
+; TODO: Strengthen the backend, so we can have this canonicalization.
+; Insert a scalar FP into a constant vector and FP truncate:
+; fptrunc (inselt C, X, Index) --> inselt C, (fptrunc X), Index
+
+define <2 x float> @fptrunc_inselt1(double %x, i32 %index) {
+; CHECK-LABEL: @fptrunc_inselt1(
+; CHECK-NEXT:    [[VEC:%.*]] = insertelement <2 x double> <double undef, double 3.000000e+00>, double %x, i32 %index
+; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc <2 x double> [[VEC]] to <2 x float>
+; CHECK-NEXT:    ret <2 x float> [[TRUNC]]
+;
+  %vec = insertelement <2 x double> <double undef, double 3.0>, double %x, i32 %index
+  %trunc = fptrunc <2 x double> %vec to <2 x float>
+  ret <2 x float> %trunc
+}
+
+; TODO: Strengthen the backend, so we can have this canonicalization.
+; Insert a scalar int constant into a vector and truncate:
+; trunc (inselt X, C, Index) --> inselt (trunc X), C', Index
+
+define <8 x i16> @trunc_inselt2(<8 x i32> %x, i32 %index) {
+; CHECK-LABEL: @trunc_inselt2(
+; CHECK-NEXT:    [[VEC:%.*]] = insertelement <8 x i32> %x, i32 1048576, i32 %index
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc <8 x i32> [[VEC]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[TRUNC]]
+;
+  %vec = insertelement <8 x i32> %x, i32 1048576, i32 %index
+  %trunc = trunc <8 x i32> %vec to <8 x i16>
+  ret <8 x i16> %trunc
+}
+
+; TODO: Strengthen the backend, so we can have this canonicalization.
+; Insert a scalar FP constant into a vector and FP truncate:
+; fptrunc (inselt X, C, Index) --> inselt (fptrunc X), C', Index
+
+define <3 x float> @fptrunc_inselt2(<3 x double> %x) {
+; CHECK-LABEL: @fptrunc_inselt2(
+; CHECK-NEXT:    [[VEC:%.*]] = insertelement <3 x double> %x, double 4.000000e+00, i32 2
+; CHECK-NEXT:    [[TRUNC:%.*]] = fptrunc <3 x double> [[VEC]] to <3 x float>
+; CHECK-NEXT:    ret <3 x float> [[TRUNC]]
+;
+  %vec = insertelement <3 x double> %x, double 4.0, i32 2
+  %trunc = fptrunc <3 x double> %vec to <3 x float>
+  ret <3 x float> %trunc
+}
+
diff --git a/test/Transforms/InstCombine/vector-srem.ll b/test/Transforms/InstCombine/vector-srem.ll
deleted file mode 100644
index 44b38596e684aad1b2f84767d7f8c06f243af368..0000000000000000000000000000000000000000
--- a/test/Transforms/InstCombine/vector-srem.ll
+++ /dev/null
@@ -1,13 +0,0 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -instcombine -S | FileCheck %s
-
-define <4 x i32> @foo(<4 x i32> %t, <4 x i32> %u) {
-; CHECK-LABEL: @foo(
-; CHECK-NEXT:    [[K:%.*]] = srem <4 x i32> %t, %u
-; CHECK-NEXT:    ret <4 x i32> [[K]]
-;
-  %k = sdiv <4 x i32> %t, %u
-  %l = mul <4 x i32> %k, %u
-  %m = sub <4 x i32> %t, %l
-  ret <4 x i32> %m
-}
diff --git a/test/Transforms/InstCombine/vector-urem.ll b/test/Transforms/InstCombine/vector-urem.ll
index 6cecc16069d36b8f67c60704b5ed26da9c8696c7..34eebeef3bb1044e8fa8655f13c9feab1726f22c 100644
--- a/test/Transforms/InstCombine/vector-urem.ll
+++ b/test/Transforms/InstCombine/vector-urem.ll
@@ -19,11 +19,3 @@ define <4 x i32> @test_v4i32_const_pow2(<4 x i32> %a0) {
   ret <4 x i32> %1
 }
 
-define <4 x i32> @test_v4i32_const_pow2_or_zero(<4 x i32> %a0) {
-; CHECK-LABEL: @test_v4i32_const_pow2_or_zero(
-; CHECK-NEXT:    [[TMP1:%.*]] = urem <4 x i32> %a0, <i32 1, i32 2, i32 0, i32 8>
-; CHECK-NEXT:    ret <4 x i32> [[TMP1]]
-;
-  %1 = urem <4 x i32> %a0, <i32 1, i32 2, i32 0, i32 8>
-  ret <4 x i32> %1
-}
diff --git a/test/Transforms/InstCombine/vector_insertelt_shuffle.ll b/test/Transforms/InstCombine/vector_insertelt_shuffle.ll
index b3e614653cfa2235489bfda019e8dcae3860a52f..c358509d690e9b19b47bbc6ec7d3f2383688b9bc 100644
--- a/test/Transforms/InstCombine/vector_insertelt_shuffle.ll
+++ b/test/Transforms/InstCombine/vector_insertelt_shuffle.ll
@@ -1,94 +1,95 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instcombine -S | FileCheck %s
 
-define<4 x float> @foo(<4 x float> %x) {
+; insertelements should fold to shuffle
+define <4 x float> @foo(<4 x float> %x) {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:    [[INS2:%.*]] = shufflevector <4 x float> %x, <4 x float> <float undef, float 1.000000e+00, float 2.000000e+00, float undef>, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    ret <4 x float> [[INS2]]
+;
   %ins1 = insertelement<4 x float> %x, float 1.0, i32 1
   %ins2 = insertelement<4 x float> %ins1, float 2.0, i32 2
-  ret<4 x float> %ins2
+  ret <4 x float> %ins2
 }
 
-; insertelements should fold to shuffle
-; CHECK-LABEL: @foo
-; CHECK-NEXT: shufflevector <4 x float> %{{.+}}, <4 x float> <float undef, float 1.000000e+00, float 2.000000e+00, float undef>, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; CHECK-NEXT: ret <4 x float> %
+; Insert of a constant is canonicalized ahead of insert of a variable.
 
-define<4 x float> @bar(<4 x float> %x, float %a) {
+define <4 x float> @bar(<4 x float> %x, float %a) {
+; CHECK-LABEL: @bar(
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> %x, float 2.000000e+00, i32 2
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x float> [[TMP1]], float %a, i32 1
+; CHECK-NEXT:    ret <4 x float> [[INS2]]
+;
   %ins1 = insertelement<4 x float> %x, float %a, i32 1
   %ins2 = insertelement<4 x float> %ins1, float 2.0, i32 2
-  ret<4 x float> %ins2
+  ret <4 x float> %ins2
 }
 
-; CHECK-LABEL: @bar
-; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float %{{.+}}, i32 1
-; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 2.000000e+00, i32 2
-; CHECK-NEXT: ret <4 x float> %
-
-define<4 x float> @baz(<4 x float> %x, i32 %a) {
+define <4 x float> @baz(<4 x float> %x, i32 %a) {
+; CHECK-LABEL: @baz(
+; CHECK-NEXT:    [[INS1:%.*]] = insertelement <4 x float> %x, float 1.000000e+00, i32 1
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x float> [[INS1]], float 2.000000e+00, i32 %a
+; CHECK-NEXT:    ret <4 x float> [[INS2]]
+;
   %ins1 = insertelement<4 x float> %x, float 1.0, i32 1
   %ins2 = insertelement<4 x float> %ins1, float 2.0, i32 %a
-  ret<4 x float> %ins2
+  ret <4 x float> %ins2
 }
 
-; CHECK-LABEL: @baz
-; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 1.000000e+00, i32 1
-; CHECK-NEXT: insertelement <4 x float> %ins1, float 2.000000e+00, i32 %
-; CHECK-NEXT: ret <4 x float> %
-
-define<4 x float> @bazz(<4 x float> %x, i32 %a) {
+; insertelements should fold to shuffle
+define <4 x float> @bazz(<4 x float> %x, i32 %a) {
+; CHECK-LABEL: @bazz(
+; CHECK-NEXT:    [[INS1:%.*]] = insertelement <4 x float> %x, float 1.000000e+00, i32 3
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x float> [[INS1]], float 5.000000e+00, i32 %a
+; CHECK-NEXT:    [[INS5:%.*]] = shufflevector <4 x float> [[INS2]], <4 x float> <float undef, float 1.000000e+00, float 2.000000e+00, float undef>, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
+; CHECK-NEXT:    [[INS6:%.*]] = insertelement <4 x float> [[INS5]], float 7.000000e+00, i32 %a
+; CHECK-NEXT:    ret <4 x float> [[INS6]]
+;
   %ins1 = insertelement<4 x float> %x, float 1.0, i32 3
   %ins2 = insertelement<4 x float> %ins1, float 5.0, i32 %a
   %ins3 = insertelement<4 x float> %ins2, float 3.0, i32 2
   %ins4 = insertelement<4 x float> %ins3, float 1.0, i32 1
   %ins5 = insertelement<4 x float> %ins4, float 2.0, i32 2
   %ins6 = insertelement<4 x float> %ins5, float 7.0, i32 %a
-  ret<4 x float> %ins6
+  ret <4 x float> %ins6
 }
 
-; insertelements should fold to shuffle
-; CHECK-LABEL: @bazz
-; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 1.000000e+00, i32 3
-; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 5.000000e+00, i32 %
-; CHECK-NEXT: shufflevector <4 x float> %{{.+}}, <4 x float> <float undef, float 1.000000e+00, float 2.000000e+00, float undef>, <4 x i32> <i32 0, i32 5, i32 6, i32 3>
-; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 7.000000e+00, i32 %
-; CHECK-NEXT: ret <4 x float> %
-
-define<4 x float> @bazzz(<4 x float> %x) {
+define <4 x float> @bazzz(<4 x float> %x) {
+; CHECK-LABEL: @bazzz(
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x float> %x, float 2.000000e+00, i32 2
+; CHECK-NEXT:    ret <4 x float> [[INS2]]
+;
   %ins1 = insertelement<4 x float> %x, float 1.0, i32 5
   %ins2 = insertelement<4 x float> %ins1, float 2.0, i32 2
-  ret<4 x float> %ins2
+  ret <4 x float> %ins2
 }
 
-; CHECK-LABEL: @bazzz
-; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 2.000000e+00, i32 2
-; CHECK-NEXT: ret <4 x float> %
-
-define<4 x float> @bazzzz(<4 x float> %x) {
+define <4 x float> @bazzzz(<4 x float> %x) {
+; CHECK-LABEL: @bazzzz(
+; CHECK-NEXT:    [[INS1:%.*]] = insertelement <4 x float> %x, float 1.000000e+00, i32 undef
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x float> %x, float 2.000000e+00, i32 2
+; CHECK-NEXT:    ret <4 x float> [[INS2]]
+;
   %ins1 = insertelement<4 x float> %x, float 1.0, i32 undef
   %ins2 = insertelement<4 x float> %ins1, float 2.0, i32 2
-  ret<4 x float> %ins2
+  ret <4 x float> %ins2
 }
 
-; CHECK-LABEL: @bazzzz
-; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 1.000000e+00, i32 undef
-; CHECK-NEXT: insertelement <4 x float> %{{.+}}, float 2.000000e+00, i32 2
-; CHECK-NEXT: ret <4 x float> %
-
-define<4 x float> @bazzzzz() {
+define <4 x float> @bazzzzz() {
+; CHECK-LABEL: @bazzzzz(
+; CHECK-NEXT:    ret <4 x float> <float 1.000000e+00, float 5.000000e+00, float 1.000000e+01, float 4.000000e+00>
+;
   %ins1 = insertelement <4 x float> insertelement (<4 x float> <float 1.0, float 2.0, float 3.0, float undef>, float 4.0, i32 3), float 5.0, i32 1
   %ins2 = insertelement<4 x float> %ins1, float 10.0, i32 2
-  ret<4 x float> %ins2
+  ret <4 x float> %ins2
 }
 
-; insertelements should fold to shuffle
-; CHECK-LABEL: @bazzzzz
-; CHECK-NEXT: ret <4 x float> <float 1.000000e+00, float 5.000000e+00, float 1.000000e+01, float 4.000000e+00>
-
-define<4 x float> @bazzzzzz(<4 x float> %x, i32 %a) {
+define <4 x float> @bazzzzzz(<4 x float> %x, i32 %a) {
+; CHECK-LABEL: @bazzzzzz(
+; CHECK-NEXT:    ret <4 x float> <float undef, float 5.000000e+00, float undef, float 4.000000e+00>
+;
   %ins1 = insertelement <4 x float> insertelement (<4 x float> shufflevector (<4 x float> undef, <4 x float> <float 1.0, float 2.0, float 3.0, float 4.0> , <4 x i32> <i32 0, i32 5, i32 undef, i32 6> ), float 4.0, i32 3), float 5.0, i32 1
-  ret<4 x float> %ins1
+  ret <4 x float> %ins1
 }
 
-; insertelements should fold to shuffle
-; CHECK-LABEL: @bazzzzz
-; CHECK-NEXT: ret <4 x float> <float undef, float 5.000000e+00, float undef, float 4.000000e+00>
 
diff --git a/test/Transforms/InstCombine/x86-avx512.ll b/test/Transforms/InstCombine/x86-avx512.ll
index d2a2580d8c24926ca62a21b59874c007b4f4b04b..4c3bb5898a4005bc2fd6b17aa24cc858304e9497 100644
--- a/test/Transforms/InstCombine/x86-avx512.ll
+++ b/test/Transforms/InstCombine/x86-avx512.ll
@@ -781,7 +781,7 @@ define i64 @test(float %f, double %d) {
 ; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP5]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[TMP12]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[TMP11]], [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[TMP14]], [[TMP11]]
 ; CHECK-NEXT:    ret i64 [[TMP15]]
 ;
   %v00 = insertelement <4 x float> undef, float %f, i32 0
@@ -861,7 +861,7 @@ define i64 @test2(float %f, double %d) {
 ; CHECK-NEXT:    [[TMP12:%.*]] = add i64 [[TMP1]], [[TMP3]]
 ; CHECK-NEXT:    [[TMP13:%.*]] = add i64 [[TMP5]], [[TMP7]]
 ; CHECK-NEXT:    [[TMP14:%.*]] = add i64 [[TMP12]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[TMP11]], [[TMP14]]
+; CHECK-NEXT:    [[TMP15:%.*]] = add i64 [[TMP14]], [[TMP11]]
 ; CHECK-NEXT:    ret i64 [[TMP15]]
 ;
   %v00 = insertelement <4 x float> undef, float %f, i32 0
diff --git a/test/Transforms/InstCombine/x86-pack.ll b/test/Transforms/InstCombine/x86-pack.ll
index 68d5521d47a9e3e7d065686a2087e8a0acefaeab..f3c41a8aa47638ed3ecc55599ec26b0905cb28e3 100644
--- a/test/Transforms/InstCombine/x86-pack.ll
+++ b/test/Transforms/InstCombine/x86-pack.ll
@@ -69,6 +69,38 @@ define <32 x i8> @undef_packuswb_256() {
   ret <32 x i8> %1
 }
 
+define <32 x i16> @undef_packssdw_512() {
+; CHECK-LABEL: @undef_packssdw_512(
+; CHECK-NEXT:    ret <32 x i16> undef
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> undef, <16 x i32> undef)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @undef_packusdw_512() {
+; CHECK-LABEL: @undef_packusdw_512(
+; CHECK-NEXT:    ret <32 x i16> undef
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> undef, <16 x i32> undef)
+  ret <32 x i16> %1
+}
+
+define <64 x i8> @undef_packsswb_512() {
+; CHECK-LABEL: @undef_packsswb_512(
+; CHECK-NEXT:    ret <64 x i8> undef
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> undef, <32 x i16> undef)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @undef_packuswb_512() {
+; CHECK-LABEL: @undef_packuswb_512(
+; CHECK-NEXT:    ret <64 x i8> undef
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> undef, <32 x i16> undef)
+  ret <64 x i8> %1
+}
+
 ;
 ; Constant Folding
 ;
@@ -137,13 +169,45 @@ define <32 x i8> @fold_packuswb_256() {
   ret <32 x i8> %1
 }
 
+define <32 x i16> @fold_packssdw_512() {
+; CHECK-LABEL: @fold_packssdw_512(
+; CHECK-NEXT:    ret <32 x i16> <i16 0, i16 512, i16 32767, i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 -127, i16 -32768, i16 -32767, i16 32767, i16 undef, i16 undef, i16 undef, i16 undef, i16 0, i16 512, i16 32767, i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 -127, i16 -32768, i16 -32767, i16 32767, i16 undef, i16 undef, i16 undef, i16 undef>
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> <i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767, i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>, <16 x i32> undef)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @fold_packusdw_512() {
+; CHECK-LABEL: @fold_packusdw_512(
+; CHECK-NEXT:    ret <32 x i16> <i16 0, i16 0, i16 0, i16 -1, i16 0, i16 512, i16 -1, i16 0, i16 127, i16 -32768, i16 32767, i16 0, i16 0, i16 0, i16 0, i16 32767, i16 0, i16 0, i16 0, i16 -1, i16 0, i16 512, i16 -1, i16 0, i16 127, i16 -32768, i16 32767, i16 0, i16 0, i16 0, i16 0, i16 32767>
+;
+  %1 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> <i32 0, i32 -512, i32 -65535, i32 65536, i32 127, i32 32768, i32 32767, i32 -32767, i32 0, i32 -512, i32 -65535, i32 65536, i32 127, i32 32768, i32 32767, i32 -32767>, <16 x i32> <i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767, i32 0, i32 512, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>)
+  ret <32 x i16> %1
+}
+
+define <64 x i8> @fold_packsswb_512() {
+; CHECK-LABEL: @fold_packsswb_512(
+; CHECK-NEXT:    ret <64 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0>
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> undef, <32 x i16> zeroinitializer)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @fold_packuswb_512() {
+; CHECK-LABEL: @fold_packuswb_512(
+; CHECK-NEXT:    ret <64 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64>
+;
+  %1 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> zeroinitializer, <32 x i16> <i16 0, i16 -127, i16 -128, i16 -32768, i16 65536, i16 255, i16 512, i16 512, i16 -1, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64, i16 0, i16 -127, i16 -128, i16 -32768, i16 65536, i16 255, i16 512, i16 512, i16 -1, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64>)
+  ret <64 x i8> %1
+}
+
 ;
 ; Demanded Elts
 ;
 
 define <8 x i16> @elts_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @elts_packssdw_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> undef)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[A0:%.*]], <4 x i32> undef)
 ; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 ;
@@ -156,7 +220,7 @@ define <8 x i16> @elts_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
 
 define <8 x i16> @elts_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @elts_packusdw_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> [[A0:%.*]], <4 x i32> [[A1:%.*]])
 ; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
 ;
   %1 = insertelement <4 x i32> %a0, i32 0, i32 0
@@ -190,7 +254,7 @@ define <16 x i8> @elts_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
 
 define <16 x i16> @elts_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @elts_packssdw_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> undef)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[A0:%.*]], <8 x i32> undef)
 ; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
 ;
   %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
@@ -202,7 +266,7 @@ define <16 x i16> @elts_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) {
 
 define <16 x i16> @elts_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @elts_packusdw_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> [[A1:%.*]], <8 x i32> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
@@ -236,6 +300,56 @@ define <32 x i8> @elts_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) {
   ret <32 x i8> %4
 }
 
+define <32 x i16> @elts_packssdw_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @elts_packssdw_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[A0:%.*]], <16 x i32> undef)
+; CHECK-NEXT:    ret <32 x i16> [[TMP1]]
+;
+  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 1, i32 0, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 9, i32 8, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 undef, i32 2, i32 1, i32 undef, i32 undef, i32 6, i32 5, i32 undef, i32 undef, i32 10, i32 9, i32 undef, i32 undef, i32 14, i32 13, i32 undef>
+  %3 = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> %1, <16 x i32> %2)
+  %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 undef, i32 undef, i32 2, i32 3, i32 4, i32 undef, i32 undef, i32 7, i32 8, i32 undef, i32 undef, i32 11, i32 12, i32 undef, i32 undef, i32 15, i32 undef, i32 undef, i32 18, i32 19, i32 20, i32 undef, i32 undef, i32 23, i32 24, i32 undef, i32 undef, i32 27, i32 28, i32 undef, i32 undef, i32 31>
+  ret <32 x i16> %4
+}
+
+define <32 x i16> @elts_packusdw_512(<16 x i32> %a0, <16 x i32> %a1) {
+; CHECK-LABEL: @elts_packusdw_512(
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <16 x i32> [[A1:%.*]], <16 x i32> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+; CHECK-NEXT:    [[TMP2:%.*]] = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> undef, <16 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP2]], <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <32 x i16> [[TMP3]]
+;
+  %1 = shufflevector <16 x i32> %a0, <16 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %2 = shufflevector <16 x i32> %a1, <16 x i32> undef, <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8>
+  %3 = call <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32> %1, <16 x i32> %2)
+  %4 = shufflevector <32 x i16> %3, <32 x i16> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 20, i32 21, i32 22, i32 23, i32 undef, i32 undef, i32 undef, i32 undef, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <32 x i16> %4
+}
+
+define <64 x i8> @elts_packsswb_512(<32 x i16> %a0, <32 x i16> %a1) {
+; CHECK-LABEL: @elts_packsswb_512(
+; CHECK-NEXT:    ret <64 x i8> zeroinitializer
+;
+  %1 = insertelement <32 x i16> %a0, i16 0, i32 0
+  %2 = insertelement <32 x i16> %a1, i16 0, i32 8
+  %3 = insertelement <32 x i16> %1, i16 0, i32 16
+  %4 = insertelement <32 x i16> %2, i16 0, i32 24
+  %5 = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> %3, <32 x i16> %4)
+  %6 = shufflevector <64 x i8> %5, <64 x i8> undef, <64 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56, i32 56>
+  ret <64 x i8> %6
+}
+
+define <64 x i8> @elts_packuswb_512(<32 x i16> %a0, <32 x i16> %a1) {
+; CHECK-LABEL: @elts_packuswb_512(
+; CHECK-NEXT:    ret <64 x i8> undef
+;
+  %1 = insertelement <32 x i16> undef, i16 0, i32 1
+  %2 = insertelement <32 x i16> undef, i16 0, i32 0
+  %3 = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> %1, <32 x i16> %2)
+  %4 = shufflevector <64 x i8> %3, <64 x i8> undef, <64 x i32> zeroinitializer
+  ret <64 x i8> %4
+}
+
 declare <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32>, <4 x i32>) nounwind readnone
 declare <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind readnone
 declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind readnone
@@ -245,3 +359,8 @@ declare <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32>, <8 x i32>) nounwind readno
 declare <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32>, <8 x i32>) nounwind readnone
 declare <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16>, <16 x i16>) nounwind readnone
 declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) nounwind readnone
+
+declare <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32>, <16 x i32>) nounwind readnone
+declare <32 x i16> @llvm.x86.avx512.packusdw.512(<16 x i32>, <16 x i32>) nounwind readnone
+declare <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16>, <32 x i16>) nounwind readnone
+declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>) nounwind readnone
diff --git a/test/Transforms/InstCombine/xor.ll b/test/Transforms/InstCombine/xor.ll
index cd137776bbfd1086609b0f30a5bd19e7003bd941..570155b162325bba33662efe615c1a3a34e0b9d6 100644
--- a/test/Transforms/InstCombine/xor.ll
+++ b/test/Transforms/InstCombine/xor.ll
@@ -321,7 +321,7 @@ define i32 @test25(i32 %g, i32 %h) {
 
 define i32 @test26(i32 %a, i32 %b) {
 ; CHECK-LABEL: @test26(
-; CHECK-NEXT:    [[T4:%.*]] = and i32 %a, %b
+; CHECK-NEXT:    [[T4:%.*]] = and i32 %b, %a
 ; CHECK-NEXT:    ret i32 [[T4]]
 ;
   %b2 = xor i32 %b, -1
@@ -352,3 +352,187 @@ define i32 @test28(i32 %indvar) {
   %t214 = xor i32 %t7, -2147483648
   ret i32 %t214
 }
+
+define i32 @test29(i1 %C) {
+; CHECK-LABEL: @test29(
+; CHECK-NEXT:    [[V:%.*]] = select i1 [[C:%.*]], i32 915, i32 113
+; CHECK-NEXT:    ret i32 [[V]]
+;
+  %A = select i1 %C, i32 1000, i32 10
+  %V = xor i32 %A, 123
+  ret i32 %V
+}
+
+define <2 x i32> @test29vec(i1 %C) {
+; CHECK-LABEL: @test29vec(
+; CHECK-NEXT:    [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 915, i32 915>, <2 x i32> <i32 113, i32 113>
+; CHECK-NEXT:    ret <2 x i32> [[V]]
+;
+  %A = select i1 %C, <2 x i32> <i32 1000, i32 1000>, <2 x i32> <i32 10, i32 10>
+  %V = xor <2 x i32> %A, <i32 123, i32 123>
+  ret <2 x i32> %V
+}
+
+define <2 x i32> @test29vec2(i1 %C) {
+; CHECK-LABEL: @test29vec2(
+; CHECK-NEXT:    [[V:%.*]] = select i1 [[C:%.*]], <2 x i32> <i32 915, i32 2185>, <2 x i32> <i32 113, i32 339>
+; CHECK-NEXT:    ret <2 x i32> [[V]]
+;
+  %A = select i1 %C, <2 x i32> <i32 1000, i32 2500>, <2 x i32> <i32 10, i32 30>
+  %V = xor <2 x i32> %A, <i32 123, i32 333>
+  ret <2 x i32> %V
+}
+
+define i32 @test30(i1 %which) {
+; CHECK-LABEL: @test30(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]]
+; CHECK:       delay:
+; CHECK-NEXT:    br label [[FINAL]]
+; CHECK:       final:
+; CHECK-NEXT:    [[A:%.*]] = phi i32 [ 915, [[ENTRY:%.*]] ], [ 113, [[DELAY]] ]
+; CHECK-NEXT:    ret i32 [[A]]
+;
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+  br label %final
+
+final:
+  %A = phi i32 [ 1000, %entry ], [ 10, %delay ]
+  %value = xor i32 %A, 123
+  ret i32 %value
+}
+
+define <2 x i32> @test30vec(i1 %which) {
+; CHECK-LABEL: @test30vec(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]]
+; CHECK:       delay:
+; CHECK-NEXT:    br label [[FINAL]]
+; CHECK:       final:
+; CHECK-NEXT:    [[A:%.*]] = phi <2 x i32> [ <i32 915, i32 915>, [[ENTRY:%.*]] ], [ <i32 113, i32 113>, [[DELAY]] ]
+; CHECK-NEXT:    ret <2 x i32> [[A]]
+;
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+  br label %final
+
+final:
+  %A = phi <2 x i32> [ <i32 1000, i32 1000>, %entry ], [ <i32 10, i32 10>, %delay ]
+  %value = xor <2 x i32> %A, <i32 123, i32 123>
+  ret <2 x i32> %value
+}
+
+define <2 x i32> @test30vec2(i1 %which) {
+; CHECK-LABEL: @test30vec2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 [[WHICH:%.*]], label [[FINAL:%.*]], label [[DELAY:%.*]]
+; CHECK:       delay:
+; CHECK-NEXT:    br label [[FINAL]]
+; CHECK:       final:
+; CHECK-NEXT:    [[A:%.*]] = phi <2 x i32> [ <i32 915, i32 2185>, [[ENTRY:%.*]] ], [ <i32 113, i32 339>, [[DELAY]] ]
+; CHECK-NEXT:    ret <2 x i32> [[A]]
+;
+entry:
+  br i1 %which, label %final, label %delay
+
+delay:
+  br label %final
+
+final:
+  %A = phi <2 x i32> [ <i32 1000, i32 2500>, %entry ], [ <i32 10, i32 30>, %delay ]
+  %value = xor <2 x i32> %A, <i32 123, i32 333>
+  ret <2 x i32> %value
+}
+
+define i32 @test31(i32 %A, i32 %B) {
+; CHECK-LABEL: @test31(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[B:%.*]], -1
+; CHECK-NEXT:    [[XOR:%.*]] = and i32 [[TMP1]], [[A:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
+;
+  %and = or i32 %A, %B
+  %xor = xor i32 %B, %and
+  ret i32 %xor
+}
+
+define i32 @test32(i32 %A, i32 %B) {
+; CHECK-LABEL: @test32(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[B:%.*]], -1
+; CHECK-NEXT:    [[XOR:%.*]] = and i32 [[TMP1]], [[A:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
+;
+  %and = or i32 %B, %A
+  %xor = xor i32 %B, %and
+  ret i32 %xor
+}
+
+define i32 @test33(i32 %A, i32 %B) {
+; CHECK-LABEL: @test33(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[B:%.*]], -1
+; CHECK-NEXT:    [[XOR:%.*]] = and i32 [[TMP1]], [[A:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
+;
+  %and = or i32 %A, %B
+  %xor = xor i32 %and, %B
+  ret i32 %xor
+}
+
+define i32 @test34(i32 %A, i32 %B) {
+; CHECK-LABEL: @test34(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[B:%.*]], -1
+; CHECK-NEXT:    [[XOR:%.*]] = and i32 [[TMP1]], [[A:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
+;
+  %and = or i32 %B, %A
+  %xor = xor i32 %and, %B
+  ret i32 %xor
+}
+
+define i32 @test35(i32 %A, i32 %B) {
+; CHECK-LABEL: @test35(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[A:%.*]], -1
+; CHECK-NEXT:    [[XOR:%.*]] = and i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
+;
+  %and = and i32 %A, %B
+  %xor = xor i32 %B, %and
+  ret i32 %xor
+}
+
+define i32 @test36(i32 %A, i32 %B) {
+; CHECK-LABEL: @test36(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[A:%.*]], -1
+; CHECK-NEXT:    [[XOR:%.*]] = and i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
+;
+  %and = and i32 %B, %A
+  %xor = xor i32 %B, %and
+  ret i32 %xor
+}
+
+define i32 @test37(i32 %A, i32 %B) {
+; CHECK-LABEL: @test37(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[A:%.*]], -1
+; CHECK-NEXT:    [[XOR:%.*]] = and i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
+;
+  %and = and i32 %A, %B
+  %xor = xor i32 %and, %B
+  ret i32 %xor
+}
+
+define i32 @test38(i32 %A, i32 %B) {
+; CHECK-LABEL: @test38(
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[A:%.*]], -1
+; CHECK-NEXT:    [[XOR:%.*]] = and i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR]]
+;
+  %and = and i32 %B, %A
+  %xor = xor i32 %and, %B
+  ret i32 %xor
+}
diff --git a/test/Transforms/InstCombine/xor2.ll b/test/Transforms/InstCombine/xor2.ll
index f3591ed9c8a9b5e86dded7ff0ab06ab66982ff1c..79e62723f143f136d037bd38e096a192afcda9be 100644
--- a/test/Transforms/InstCombine/xor2.ll
+++ b/test/Transforms/InstCombine/xor2.ll
@@ -110,7 +110,7 @@ define i32 @test6(i32 %x) {
 define i32 @test7(i32 %a, i32 %b) {
 ; CHECK-LABEL: @test7(
 ; CHECK-NEXT:    [[B_NOT:%.*]] = xor i32 %b, -1
-; CHECK-NEXT:    [[XOR:%.*]] = or i32 %a, [[B_NOT]]
+; CHECK-NEXT:    [[XOR:%.*]] = or i32 [[B_NOT]], %a
 ; CHECK-NEXT:    ret i32 [[XOR]]
 ;
   %or = or i32 %a, %b
@@ -123,7 +123,7 @@ define i32 @test7(i32 %a, i32 %b) {
 define i32 @test8(i32 %a, i32 %b) {
 ; CHECK-LABEL: @test8(
 ; CHECK-NEXT:    [[B_NOT:%.*]] = xor i32 %b, -1
-; CHECK-NEXT:    [[XOR:%.*]] = or i32 %a, [[B_NOT]]
+; CHECK-NEXT:    [[XOR:%.*]] = or i32 [[B_NOT]], %a
 ; CHECK-NEXT:    ret i32 [[XOR]]
 ;
   %neg = xor i32 %a, -1
@@ -144,6 +144,18 @@ define i32 @test9(i32 %b, i32 %c) {
   ret i32 %xor2
 }
 
+; (A & B) ^ (B ^ A) -> (A | B)
+define i32 @test9b(i32 %b, i32 %c) {
+; CHECK-LABEL: @test9b(
+; CHECK-NEXT:    [[XOR2:%.*]] = or i32 [[B:%.*]], [[C:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR2]]
+;
+  %and = and i32 %b, %c
+  %xor = xor i32 %c, %b
+  %xor2 = xor i32 %and, %xor
+  ret i32 %xor2
+}
+
 ; (A ^ B) ^ (A & B) -> (A | B)
 define i32 @test10(i32 %b, i32 %c) {
 ; CHECK-LABEL: @test10(
@@ -156,6 +168,18 @@ define i32 @test10(i32 %b, i32 %c) {
   ret i32 %xor2
 }
 
+; (A ^ B) ^ (B & A) -> (A | B)
+define i32 @test10b(i32 %b, i32 %c) {
+; CHECK-LABEL: @test10b(
+; CHECK-NEXT:    [[XOR2:%.*]] = or i32 [[B:%.*]], [[C:%.*]]
+; CHECK-NEXT:    ret i32 [[XOR2]]
+;
+  %xor = xor i32 %b, %c
+  %and = and i32 %c, %b
+  %xor2 = xor i32 %xor, %and
+  ret i32 %xor2
+}
+
 define i32 @test11(i32 %A, i32 %B) {
 ; CHECK-LABEL: @test11(
 ; CHECK-NEXT:    ret i32 0
diff --git a/test/Transforms/InstCombine/zext-or-icmp.ll b/test/Transforms/InstCombine/zext-or-icmp.ll
index 610e9a754f0d5faabf6fa6da37fcd99a030bd2c2..afbe36da3e37b79b3aace1e397800c27a40fd979 100644
--- a/test/Transforms/InstCombine/zext-or-icmp.ll
+++ b/test/Transforms/InstCombine/zext-or-icmp.ll
@@ -19,3 +19,33 @@ define i8 @zext_or_icmp_icmp(i8 %a, i8 %b) {
 ; CHECK-NEXT:    ret i8 %zext
 }
 
+; Here, widening the or from i1 to i32 and removing one of the icmps would
+; widen an undef value (created by the out-of-range shift), increasing the
+; range of valid values for the return, so we can't do it.
+define i32 @dont_widen_undef() {
+entry:
+  br label %block2
+
+block1:
+  br label %block2
+
+block2:
+  %m.011 = phi i32 [ 33, %entry ], [ 0, %block1 ]
+  %cmp.i = icmp ugt i32 %m.011, 1
+  %m.1.op = lshr i32 1, %m.011
+  %sext.mask = and i32 %m.1.op, 65535
+  %cmp115 = icmp ne i32 %sext.mask, 0
+  %cmp1 = or i1 %cmp.i, %cmp115
+  %conv2 = zext i1 %cmp1 to i32
+  ret i32 %conv2
+
+; CHECK-LABEL: dont_widen_undef(
+; CHECK:         %m.011 = phi i32 [ 33, %entry ], [ 0, %block1 ]
+; CHECK-NEXT:    %cmp.i = icmp ugt i32 %m.011, 1
+; CHECK-NEXT:    %m.1.op = lshr i32 1, %m.011
+; CHECK-NEXT:    %sext.mask = and i32 %m.1.op, 65535
+; CHECK-NEXT:    %cmp115 = icmp ne i32 %sext.mask, 0
+; CHECK-NEXT:    %cmp1 = or i1 %cmp.i, %cmp115
+; CHECK-NEXT:    %conv2 = zext i1 %cmp1 to i32
+; CHECK-NEXT:    ret i32 %conv2
+}
diff --git a/test/Transforms/InstCombine/zext-phi.ll b/test/Transforms/InstCombine/zext-phi.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5e352415c747c62aa2aec9f4c139f150c3dd741f
--- /dev/null
+++ b/test/Transforms/InstCombine/zext-phi.ll
@@ -0,0 +1,32 @@
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-n8:16:32:64"
+
+; Although i1 is not in the datalayout, we should treat it
+; as a legal type because it is a fundamental type in IR.
+; This means we should shrink the phi (sink the zexts).
+
+define i64 @sink_i1_casts(i1 %cond1, i1 %cond2) {
+; CHECK-LABEL: @sink_i1_casts(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 %cond1, label %if, label %end
+; CHECK:       if:
+; CHECK-NEXT:    br label %end
+; CHECK:       end:
+; CHECK-NEXT:    [[PHI_IN:%.*]] = phi i1 [ %cond1, %entry ], [ %cond2, %if ]
+; CHECK-NEXT:    [[PHI:%.*]] = zext i1 [[PHI_IN]] to i64
+; CHECK-NEXT:    ret i64 [[PHI]]
+;
+entry:
+  %z1 = zext i1 %cond1 to i64
+  br i1 %cond1, label %if, label %end
+
+if:
+  %z2 = zext i1 %cond2 to i64
+  br label %end
+
+end:
+  %phi = phi i64 [ %z1, %entry ], [ %z2, %if ]
+  ret i64 %phi
+}
+
diff --git a/test/Transforms/InstCombine/zext.ll b/test/Transforms/InstCombine/zext.ll
index 740509809d1c4a2fb0e1fa0ce778d796cc6ed454..887d839cb8c79d5426fcce2a0c10aab7873522ef 100644
--- a/test/Transforms/InstCombine/zext.ll
+++ b/test/Transforms/InstCombine/zext.ll
@@ -35,7 +35,7 @@ define <2 x i64> @test3(<2 x i64> %A) {
 
 define <2 x i64> @test4(<2 x i64> %A) {
 ; CHECK-LABEL: @test4(
-; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i64> %A, <i64 4294967295, i64 4294967295>
+; CHECK-NEXT:    [[TMP1:%.*]] = xor <2 x i64> %A, <i64 63, i64 63>
 ; CHECK-NEXT:    [[XOR:%.*]] = and <2 x i64> [[TMP1]], <i64 23, i64 42>
 ; CHECK-NEXT:    ret <2 x i64> [[XOR]]
 ;
diff --git a/test/Transforms/InstSimplify/AndOrXor.ll b/test/Transforms/InstSimplify/AndOrXor.ll
index c6959d72961d163f8708d2a3541d31d4247071f3..33fd978277d4ce477c78087429944c2c93ec804e 100644
--- a/test/Transforms/InstSimplify/AndOrXor.ll
+++ b/test/Transforms/InstSimplify/AndOrXor.ll
@@ -1,6 +1,28 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instsimplify -S | FileCheck %s
 
+; add nsw (xor X, signbit), signbit --> X
+
+define <2 x i32> @add_nsw_signbit(<2 x i32> %x) {
+; CHECK-LABEL: @add_nsw_signbit(
+; CHECK-NEXT:    ret <2 x i32> %x
+;
+  %y = xor <2 x i32> %x, <i32 -2147483648, i32 -2147483648>
+  %z = add nsw <2 x i32> %y, <i32 -2147483648, i32 -2147483648>
+  ret <2 x i32> %z
+}
+
+; add nuw (xor X, signbit), signbit --> X
+
+define <2 x i5> @add_nuw_signbit(<2 x i5> %x) {
+; CHECK-LABEL: @add_nuw_signbit(
+; CHECK-NEXT:    ret <2 x i5> %x
+;
+  %y = xor <2 x i5> %x, <i5 -16, i5 -16>
+  %z = add nuw <2 x i5> %y, <i5 -16, i5 -16>
+  ret <2 x i5> %z
+}
+
 define i64 @pow2(i32 %x) {
 ; CHECK-LABEL: @pow2(
 ; CHECK-NEXT:    [[NEGX:%.*]] = sub i32 0, %x
diff --git a/test/Transforms/InstSimplify/addsub.ll b/test/Transforms/InstSimplify/addsub.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2f19a4d205e77190765d90f08a115e434635e028
--- /dev/null
+++ b/test/Transforms/InstSimplify/addsub.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+define i1 @test1(i1 %a) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    ret i1 true
+;
+  %b = xor i1 %a, true
+  %res = sub i1 %a, %b
+  ret i1 %res
+}
+
+define <2 x i1> @test2(<2 x i1> %a) {
+; CHECK-LABEL: @test2(
+; CHECK-NEXT:    ret <2 x i1> <i1 true, i1 true>
+;
+  %b = xor <2 x i1> %a, <i1 true, i1 true>
+  %res = sub <2 x i1> %a, %b
+  ret <2 x i1> %res
+}
+
+define i1 @test5(i1 %a) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    ret i1 false
+;
+  %res = add i1 %a, %a
+  ret i1 %res
+}
+
+define <2 x i1> @test6(<2 x i1> %a) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %res = add <2 x i1> %a, %a
+  ret <2 x i1> %res
+}
+
+define i1 @test7(i1 %a) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    ret i1 [[A:%.*]]
+;
+  %c = xor i1 %a, true
+  %res = add i1 %c, true
+  ret i1 %res
+}
+
+; TODO: simplify this to %a
+define i1 @test8(i1 %a) {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:    [[C:%.*]] = add i1 [[A:%.*]], true
+; CHECK-NEXT:    [[RES:%.*]] = xor i1 [[C]], true
+; CHECK-NEXT:    ret i1 [[RES]]
+;
+  %c = add i1 %a, true
+  %res = xor i1 %c, true
+  ret i1 %res
+}
+
+define i1 @test9(i1 %a) {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:    ret i1 [[A:%.*]]
+;
+  %c = xor i1 %a, true
+  %res = sub i1 %c, true
+  ret i1 %res
+}
+
+; TODO: simplify this to %a
+define i1 @test10(i1 %a) {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:    [[C:%.*]] = sub i1 [[A:%.*]], true
+; CHECK-NEXT:    [[RES:%.*]] = xor i1 [[C]], true
+; CHECK-NEXT:    ret i1 [[RES]]
+;
+  %c = sub i1 %a, true
+  %res = xor i1 %c, true
+  ret i1 %res
+}
diff --git a/test/Transforms/InstSimplify/assume.ll b/test/Transforms/InstSimplify/assume.ll
index 2487a9c8bb1548d534894dfedf642561e294faea..66f2120f2928bc5500a9290c4452c24e25a44d54 100644
--- a/test/Transforms/InstSimplify/assume.ll
+++ b/test/Transforms/InstSimplify/assume.ll
@@ -1,5 +1,10 @@
 ; NOTE: Assertions have been autogenerated by update_test_checks.py
-; RUN: opt -instsimplify -S < %s | FileCheck %s
+; RUN: opt -instsimplify -S < %s 2>&1 -pass-remarks-analysis=.* | FileCheck %s
+
+; Verify that remarks are emitted for the 2nd and 3rd tests.
+
+; CHECK: remark: /tmp/s.c:1:13: Detected conflicting code assumptions.
+; CHECK: remark: /tmp/s.c:4:10: Detected conflicting code assumptions.
 
 define void @test1() {
 ; CHECK-LABEL: @test1(
@@ -10,5 +15,58 @@ define void @test1() {
 
 }
 
+; The alloca guarantees that the low bits of %a are zero because of alignment.
+; The assume says the opposite. The assume is processed last, so that's the
+; return value. There's no way to win (we can't undo transforms that happened
+; based on half-truths), so just don't crash.
+
+define i64 @PR31809() !dbg !7 {
+; CHECK-LABEL: @PR31809(
+; CHECK-NEXT:    ret i64 3
+;
+  %a = alloca i32
+  %t1 = ptrtoint i32* %a to i64, !dbg !9
+  %cond = icmp eq i64 %t1, 3
+  call void @llvm.assume(i1 %cond)
+  ret i64 %t1
+}
+
+; Similar to above: there's no way to know which assumption is truthful,
+; so just don't crash. The second icmp+assume gets processed later, so that
+; determines the return value.
+
+define i8 @conflicting_assumptions(i8 %x) !dbg !10 {
+; CHECK-LABEL: @conflicting_assumptions(
+; CHECK-NEXT:    call void @llvm.assume(i1 false)
+; CHECK-NEXT:    [[COND2:%.*]] = icmp eq i8 %x, 4
+; CHECK-NEXT:    call void @llvm.assume(i1 [[COND2]])
+; CHECK-NEXT:    ret i8 5
+;
+  %add = add i8 %x, 1, !dbg !11
+  %cond1 = icmp eq i8 %x, 3
+  call void @llvm.assume(i1 %cond1)
+  %cond2 = icmp eq i8 %x, 4
+  call void @llvm.assume(i1 %cond2)
+  ret i8 %add
+}
+
 declare void @llvm.assume(i1) nounwind
 
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 4.0.0 (trunk 282540) (llvm/trunk 282542)", isOptimized: true, runtimeVersion: 0, emissionKind: LineTablesOnly, enums: !2)
+!1 = !DIFile(filename: "/tmp/s.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"PIC Level", i32 2}
+!6 = !{!"clang version 4.0.0 (trunk 282540) (llvm/trunk 282542)"}
+!7 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, isOptimized: true, unit: !0, variables: !2)
+!8 = !DISubroutineType(types: !2)
+!9 = !DILocation(line: 1, column: 13, scope: !7)
+!10 = distinct !DISubprogram(name: "bar", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, isOptimized: true, unit: !0, variables: !2)
+!11 = !DILocation(line: 4, column: 10, scope: !10)
+!12 = !DILocation(line: 4, column: 3, scope: !10)
+
diff --git a/test/Transforms/InstSimplify/div.ll b/test/Transforms/InstSimplify/div.ll
index b8ce34aaa37e6ab2081b8bb28ede0d9cece0a0aa..f096719359dcde37c53aea74692e46734dd867ff 100644
--- a/test/Transforms/InstSimplify/div.ll
+++ b/test/Transforms/InstSimplify/div.ll
@@ -1,10 +1,64 @@
 ; RUN: opt < %s -instsimplify -S | FileCheck %s
 
+; Division-by-zero is undef. UB in any vector lane means the whole op is undef.
+
+define <2 x i8> @sdiv_zero_elt_vec_constfold(<2 x i8> %x) {
+; CHECK-LABEL: @sdiv_zero_elt_vec_constfold(
+; CHECK-NEXT:    ret <2 x i8> undef
+;
+  %div = sdiv <2 x i8> <i8 1, i8 2>, <i8 0, i8 -42>
+  ret <2 x i8> %div
+}
+
+define <2 x i8> @udiv_zero_elt_vec_constfold(<2 x i8> %x) {
+; CHECK-LABEL: @udiv_zero_elt_vec_constfold(
+; CHECK-NEXT:    ret <2 x i8> undef
+;
+  %div = udiv <2 x i8> <i8 1, i8 2>, <i8 42, i8 0>
+  ret <2 x i8> %div
+}
+
+define <2 x i8> @sdiv_zero_elt_vec(<2 x i8> %x) {
+; CHECK-LABEL: @sdiv_zero_elt_vec(
+; CHECK-NEXT:    ret <2 x i8> undef
+;
+  %div = sdiv <2 x i8> %x, <i8 -42, i8 0>
+  ret <2 x i8> %div
+}
+
+define <2 x i8> @udiv_zero_elt_vec(<2 x i8> %x) {
+; CHECK-LABEL: @udiv_zero_elt_vec(
+; CHECK-NEXT:    ret <2 x i8> undef
+;
+  %div = udiv <2 x i8> %x, <i8 0, i8 42>
+  ret <2 x i8> %div
+}
+
+; Division-by-zero is undef. UB in any vector lane means the whole op is undef.
+; Thus, we can simplify this: if any element of 'y' is 0, we can do anything.
+; Therefore, assume that all elements of 'y' must be 1.
+
+define <2 x i1> @sdiv_bool_vec(<2 x i1> %x, <2 x i1> %y) {
+; CHECK-LABEL: @sdiv_bool_vec(
+; CHECK-NEXT:    ret <2 x i1> %x
+;
+  %div = sdiv <2 x i1> %x, %y
+  ret <2 x i1> %div
+}
+
+define <2 x i1> @udiv_bool_vec(<2 x i1> %x, <2 x i1> %y) {
+; CHECK-LABEL: @udiv_bool_vec(
+; CHECK-NEXT:    ret <2 x i1> %x
+;
+  %div = udiv <2 x i1> %x, %y
+  ret <2 x i1> %div
+}
+
 declare i32 @external()
 
 define i32 @div1() {
 ; CHECK-LABEL: @div1(
-; CHECK:         [[CALL:%.*]] = call i32 @external(), !range !0
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @external(), !range !0
 ; CHECK-NEXT:    ret i32 0
 ;
   %call = call i32 @external(), !range !0
diff --git a/test/Transforms/InstSimplify/fdiv.ll b/test/Transforms/InstSimplify/fdiv.ll
index bb7f443f4238709507094a95ba08fb5bb1740801..6643afd81471111a065e59633507a87e4c734fc7 100644
--- a/test/Transforms/InstSimplify/fdiv.ll
+++ b/test/Transforms/InstSimplify/fdiv.ll
@@ -1,9 +1,25 @@
-; NOTE: Assertions have been autogenerated by update_test_checks.py
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -instsimplify -S | FileCheck %s
 
+define float @fdiv_constant_fold() {
+; CHECK-LABEL: @fdiv_constant_fold(
+; CHECK-NEXT:    ret float 1.500000e+00
+;
+  %f = fdiv float 3.0, 2.0
+  ret float %f
+}
+
+define float @frem_constant_fold() {
+; CHECK-LABEL: @frem_constant_fold(
+; CHECK-NEXT:    ret float 1.000000e+00
+;
+  %f = frem float 3.0, 2.0
+  ret float %f
+}
+
 define double @fdiv_of_undef(double %X) {
 ; CHECK-LABEL: @fdiv_of_undef(
-; CHECK:         ret double undef
+; CHECK-NEXT:    ret double undef
 ;
 ; undef / X -> undef
   %r = fdiv double undef, %X
@@ -12,7 +28,7 @@ define double @fdiv_of_undef(double %X) {
 
 define double @fdiv_by_undef(double %X) {
 ; CHECK-LABEL: @fdiv_by_undef(
-; CHECK:         ret double undef
+; CHECK-NEXT:    ret double undef
 ;
 ; X / undef -> undef
   %r = fdiv double %X, undef
diff --git a/test/Transforms/InstSimplify/mul.ll b/test/Transforms/InstSimplify/mul.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0bf8f699a6860c187719758186259d0b9948304f
--- /dev/null
+++ b/test/Transforms/InstSimplify/mul.ll
@@ -0,0 +1,11 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+define <2 x i1> @test1(<2 x i1> %a) {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %b = and <2 x i1> %a, <i1 true, i1 false>
+  %res = mul <2 x i1> %b, <i1 false, i1 true>
+  ret <2 x i1> %res
+}
diff --git a/test/Transforms/InstSimplify/rem.ll b/test/Transforms/InstSimplify/rem.ll
index c73d34346ded5804c65c835780dc79da3b6fc2de..b7f18f36b4b98631dd1ea1f067bd22ab2fdd319c 100644
--- a/test/Transforms/InstSimplify/rem.ll
+++ b/test/Transforms/InstSimplify/rem.ll
@@ -1,9 +1,63 @@
 ; NOTE: Assertions have been autogenerated by update_test_checks.py
 ; RUN: opt < %s -instsimplify -S | FileCheck %s
 
+; Division-by-zero is undef. UB in any vector lane means the whole op is undef.
+
+define <2 x i8> @srem_zero_elt_vec_constfold(<2 x i8> %x) {
+; CHECK-LABEL: @srem_zero_elt_vec_constfold(
+; CHECK-NEXT:    ret <2 x i8> undef
+;
+  %rem = srem <2 x i8> <i8 1, i8 2>, <i8 0, i8 -42>
+  ret <2 x i8> %rem
+}
+
+define <2 x i8> @urem_zero_elt_vec_constfold(<2 x i8> %x) {
+; CHECK-LABEL: @urem_zero_elt_vec_constfold(
+; CHECK-NEXT:    ret <2 x i8> undef
+;
+  %rem = urem <2 x i8> <i8 1, i8 2>, <i8 42, i8 0>
+  ret <2 x i8> %rem
+}
+
+define <2 x i8> @srem_zero_elt_vec(<2 x i8> %x) {
+; CHECK-LABEL: @srem_zero_elt_vec(
+; CHECK-NEXT:    ret <2 x i8> undef
+;
+  %rem = srem <2 x i8> %x, <i8 -42, i8 0>
+  ret <2 x i8> %rem
+}
+
+define <2 x i8> @urem_zero_elt_vec(<2 x i8> %x) {
+; CHECK-LABEL: @urem_zero_elt_vec(
+; CHECK-NEXT:    ret <2 x i8> undef
+;
+  %rem = urem <2 x i8> %x, <i8 0, i8 42>
+  ret <2 x i8> %rem
+}
+
+; Division-by-zero is undef. UB in any vector lane means the whole op is undef.
+; Thus, we can simplify this: if any element of 'y' is 0, we can do anything.
+; Therefore, assume that all elements of 'y' must be 1.
+
+define <2 x i1> @srem_bool_vec(<2 x i1> %x, <2 x i1> %y) {
+; CHECK-LABEL: @srem_bool_vec(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %rem = srem <2 x i1> %x, %y
+  ret <2 x i1> %rem
+}
+
+define <2 x i1> @urem_bool_vec(<2 x i1> %x, <2 x i1> %y) {
+; CHECK-LABEL: @urem_bool_vec(
+; CHECK-NEXT:    ret <2 x i1> zeroinitializer
+;
+  %rem = urem <2 x i1> %x, %y
+  ret <2 x i1> %rem
+}
+
 define i32 @select1(i32 %x, i1 %b) {
 ; CHECK-LABEL: @select1(
-; CHECK:         ret i32 0
+; CHECK-NEXT:    ret i32 0
 ;
   %rhs = select i1 %b, i32 %x, i32 1
   %rem = srem i32 %x, %rhs
@@ -12,7 +66,7 @@ define i32 @select1(i32 %x, i1 %b) {
 
 define i32 @select2(i32 %x, i1 %b) {
 ; CHECK-LABEL: @select2(
-; CHECK:         ret i32 0
+; CHECK-NEXT:    ret i32 0
 ;
   %rhs = select i1 %b, i32 %x, i32 1
   %rem = urem i32 %x, %rhs
@@ -21,40 +75,40 @@ define i32 @select2(i32 %x, i1 %b) {
 
 define i32 @rem1(i32 %x, i32 %n) {
 ; CHECK-LABEL: @rem1(
-; CHECK:         [[MOD:%.*]] = srem i32 %x, %n
+; CHECK-NEXT:    [[MOD:%.*]] = srem i32 %x, %n
 ; CHECK-NEXT:    ret i32 [[MOD]]
 ;
- %mod = srem i32 %x, %n
- %mod1 = srem i32 %mod, %n
- ret i32 %mod1
+  %mod = srem i32 %x, %n
+  %mod1 = srem i32 %mod, %n
+  ret i32 %mod1
 }
 
 define i32 @rem2(i32 %x, i32 %n) {
 ; CHECK-LABEL: @rem2(
-; CHECK:         [[MOD:%.*]] = urem i32 %x, %n
+; CHECK-NEXT:    [[MOD:%.*]] = urem i32 %x, %n
 ; CHECK-NEXT:    ret i32 [[MOD]]
 ;
- %mod = urem i32 %x, %n
- %mod1 = urem i32 %mod, %n
- ret i32 %mod1
+  %mod = urem i32 %x, %n
+  %mod1 = urem i32 %mod, %n
+  ret i32 %mod1
 }
 
 define i32 @rem3(i32 %x, i32 %n) {
 ; CHECK-LABEL: @rem3(
-; CHECK:         [[MOD:%.*]] = srem i32 %x, %n
+; CHECK-NEXT:    [[MOD:%.*]] = srem i32 %x, %n
 ; CHECK-NEXT:    [[MOD1:%.*]] = urem i32 [[MOD]], %n
 ; CHECK-NEXT:    ret i32 [[MOD1]]
 ;
- %mod = srem i32 %x, %n
- %mod1 = urem i32 %mod, %n
- ret i32 %mod1
+  %mod = srem i32 %x, %n
+  %mod1 = urem i32 %mod, %n
+  ret i32 %mod1
 }
 
 declare i32 @external()
 
 define i32 @rem4() {
 ; CHECK-LABEL: @rem4(
-; CHECK:         [[CALL:%.*]] = call i32 @external(), !range !0
+; CHECK-NEXT:    [[CALL:%.*]] = call i32 @external(), !range !0
 ; CHECK-NEXT:    ret i32 [[CALL]]
 ;
   %call = call i32 @external(), !range !0
diff --git a/test/Transforms/InstSimplify/shufflevector.ll b/test/Transforms/InstSimplify/shufflevector.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c6d180da293f8ee02857697d0577da6dd9cafc9c
--- /dev/null
+++ b/test/Transforms/InstSimplify/shufflevector.ll
@@ -0,0 +1,212 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instsimplify -S | FileCheck %s
+
+define <4 x i32> @const_folding(<4 x i32> %x) {
+; CHECK-LABEL: @const_folding(
+; CHECK-NEXT:    ret <4 x i32> zeroinitializer
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> zeroinitializer, <4 x i32> <i32 5, i32 4, i32 5, i32 4>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @const_folding1(<4 x i32> %x) {
+; CHECK-LABEL: @const_folding1(
+; CHECK-NEXT:    ret <4 x i32> <i32 5, i32 5, i32 5, i32 5>
+;
+  %shuf = shufflevector <4 x i32> <i32 5, i32 4, i32 5, i32 4>, <4 x i32> %x, <4 x i32> zeroinitializer
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @const_folding_negative(<3 x i32> %x) {
+; CHECK-LABEL: @const_folding_negative(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <3 x i32> [[X:%.*]], <3 x i32> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 5, i32 4>
+; CHECK-NEXT:    ret <4 x i32> [[SHUF]]
+;
+  %shuf = shufflevector <3 x i32> %x, <3 x i32> zeroinitializer, <4 x i32> <i32 2, i32 4, i32 5, i32 4>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @splat_operand(<4 x i32> %x) {
+; CHECK-LABEL: @splat_operand(
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x i32> [[SPLAT]]
+;
+  %splat = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
+  %shuf = shufflevector <4 x i32> %splat, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @splat_operand1(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @splat_operand1(
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x i32> [[SPLAT]]
+;
+  %splat = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> zeroinitializer
+  %shuf = shufflevector <4 x i32> %splat, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @splat_operand2(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @splat_operand2(
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x i32> [[SPLAT]]
+;
+  %splat = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
+  %shuf = shufflevector <4 x i32> %splat, <4 x i32> %y, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @splat_operand3(<4 x i32> %x) {
+; CHECK-LABEL: @splat_operand3(
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    ret <4 x i32> [[SPLAT]]
+;
+  %splat = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
+  %shuf = shufflevector <4 x i32> zeroinitializer, <4 x i32> %splat, <4 x i32> <i32 7, i32 6, i32 5, i32 5>
+  ret <4 x i32> %shuf
+}
+
+define <8 x i32> @splat_operand_negative(<4 x i32> %x) {
+; CHECK-LABEL: @splat_operand_negative(
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[SPLAT]], <4 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <8 x i32> [[SHUF]]
+;
+  %splat = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
+  %shuf = shufflevector <4 x i32> %splat, <4 x i32> undef, <8 x i32> <i32 0, i32 3, i32 2, i32 1, i32 undef, i32 undef, i32 undef, i32 undef>
+  ret <8 x i32> %shuf
+}
+
+define <4 x i32> @splat_operand_negative2(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @splat_operand_negative2(
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[SPLAT]], <4 x i32> [[Y:%.*]], <4 x i32> <i32 0, i32 3, i32 4, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[SHUF]]
+;
+  %splat = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
+  %shuf = shufflevector <4 x i32> %splat, <4 x i32> %y, <4 x i32> <i32 0, i32 3, i32 4, i32 1>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @splat_operand_negative3(<4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @splat_operand_negative3(
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[Y:%.*]], <4 x i32> [[SPLAT]], <4 x i32> <i32 0, i32 3, i32 4, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[SHUF]]
+;
+  %splat = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
+  %shuf = shufflevector <4 x i32> %y, <4 x i32> %splat, <4 x i32> <i32 0, i32 3, i32 4, i32 1>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @splat_operand_negative4(<4 x i32> %x) {
+; CHECK-LABEL: @splat_operand_negative4(
+; CHECK-NEXT:    [[SPLAT:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef>
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[SPLAT]], <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+; CHECK-NEXT:    ret <4 x i32> [[SHUF]]
+;
+  %splat = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 2, i32 undef, i32 2, i32 undef>
+  %shuf = shufflevector <4 x i32> %splat, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @undef_mask(<4 x i32> %x) {
+; CHECK-LABEL: @undef_mask(
+; CHECK-NEXT:    ret <4 x i32> undef
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> undef
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @identity_mask_0(<4 x i32> %x) {
+; CHECK-LABEL: @identity_mask_0(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    ret <4 x i32> [[SHUF]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @identity_mask_1(<4 x i32> %x) {
+; CHECK-LABEL: @identity_mask_1(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> undef, <4 x i32> [[X:%.*]], <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <4 x i32> [[SHUF]]
+;
+  %shuf = shufflevector <4 x i32> undef, <4 x i32> %x, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @pseudo_identity_mask(<4 x i32> %x) {
+; CHECK-LABEL: @pseudo_identity_mask(
+; CHECK-NEXT:    [[SHUF:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> [[X]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x i32> [[SHUF]]
+;
+  %shuf = shufflevector <4 x i32> %x, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @const_operand(<4 x i32> %x) {
+; CHECK-LABEL: @const_operand(
+; CHECK-NEXT:    ret <4 x i32> <i32 42, i32 45, i32 44, i32 43>
+;
+  %shuf = shufflevector <4 x i32> <i32 42, i32 43, i32 44, i32 45>, <4 x i32> %x, <4 x i32> <i32 0, i32 3, i32 2, i32 1>
+  ret <4 x i32> %shuf
+}
+
+define <4 x i32> @merge(<4 x i32> %x) {
+; CHECK-LABEL: @merge(
+; CHECK-NEXT:    [[LOWER:%.*]] = shufflevector <4 x i32> [[X:%.*]], <4 x i32> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT:    [[UPPER:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[MERGED:%.*]] = shufflevector <2 x i32> [[UPPER]], <2 x i32> [[LOWER]], <4 x i32> <i32 3, i32 2, i32 0, i32 1>
+; CHECK-NEXT:    ret <4 x i32> [[MERGED]]
+; %merged = <lower[1], lower[0], upper[0], upper[1]> = <x0, x1, x2, x3>; all three shuffles are expected to remain.
+  %lower = shufflevector <4 x i32> %x, <4 x i32> undef, <2 x i32> <i32 1, i32 0>
+  %upper = shufflevector <4 x i32> %x, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
+  %merged = shufflevector <2 x i32> %upper, <2 x i32> %lower, <4 x i32> <i32 3, i32 2, i32 0, i32 1>
+  ret <4 x i32> %merged
+}
+
+define <8 x double> @extract_and_concat(<8 x double> %x) {
+; CHECK-LABEL: @extract_and_concat(
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <8 x double> [[X:%.*]], <8 x double> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <8 x double> [[X]], <8 x double> undef, <2 x i32> <i32 2, i32 3>
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <8 x double> [[X]], <8 x double> undef, <2 x i32> <i32 4, i32 5>
+; CHECK-NEXT:    [[S4:%.*]] = shufflevector <8 x double> [[X]], <8 x double> undef, <2 x i32> <i32 6, i32 7>
+; CHECK-NEXT:    [[S5:%.*]] = shufflevector <2 x double> [[S1]], <2 x double> [[S2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[S6:%.*]] = shufflevector <2 x double> [[S3]], <2 x double> [[S4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[S7:%.*]] = shufflevector <4 x double> [[S5]], <4 x double> [[S6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT:    ret <8 x double> [[S7]]
+; Extracts four 2-element slices of %x and concatenates them back in order; all seven shuffles are expected to remain.
+  %s1 = shufflevector <8 x double> %x, <8 x double> undef, <2 x i32> <i32 0, i32 1>
+  %s2 = shufflevector <8 x double> %x, <8 x double> undef, <2 x i32> <i32 2, i32 3>
+  %s3 = shufflevector <8 x double> %x, <8 x double> undef, <2 x i32> <i32 4, i32 5>
+  %s4 = shufflevector <8 x double> %x, <8 x double> undef, <2 x i32> <i32 6, i32 7>
+  %s5 = shufflevector <2 x double> %s1, <2 x double> %s2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s6 = shufflevector <2 x double> %s3, <2 x double> %s4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s7 = shufflevector <4 x double> %s5, <4 x double> %s6, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+  ret <8 x double> %s7
+}
+
+; This case has intermediate lane crossings.
+
+define <8 x i64> @PR30630(<8 x i64> %x) {
+; CHECK-LABEL: @PR30630(
+; CHECK-NEXT:    [[S1:%.*]] = shufflevector <8 x i64> [[X:%.*]], <8 x i64> undef, <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT:    [[S2:%.*]] = shufflevector <8 x i64> [[X]], <8 x i64> undef, <2 x i32> <i32 1, i32 5>
+; CHECK-NEXT:    [[S3:%.*]] = shufflevector <8 x i64> [[X]], <8 x i64> undef, <2 x i32> <i32 2, i32 6>
+; CHECK-NEXT:    [[S4:%.*]] = shufflevector <8 x i64> [[X]], <8 x i64> undef, <2 x i32> <i32 3, i32 7>
+; CHECK-NEXT:    [[S5:%.*]] = shufflevector <2 x i64> [[S1]], <2 x i64> [[S2]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[S6:%.*]] = shufflevector <2 x i64> [[S3]], <2 x i64> [[S4]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[S7:%.*]] = shufflevector <4 x i64> [[S5]], <4 x i64> [[S6]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT:    ret <8 x i64> [[S7]]
+; The mask of %s7 undoes the stride-4 pairing done by %s1..%s4, so %s7 holds %x in original order; the whole chain is expected to remain.
+  %s1 = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 0, i32 4>
+  %s2 = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 1, i32 5>
+  %s3 = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 2, i32 6>
+  %s4 = shufflevector <8 x i64> %x, <8 x i64> undef, <2 x i32> <i32 3, i32 7>
+  %s5 = shufflevector <2 x i64> %s1, <2 x i64> %s2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s6 = shufflevector <2 x i64> %s3, <2 x i64> %s4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %s7 = shufflevector <4 x i64> %s5, <4 x i64> %s6, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 1, i32 3, i32 5, i32 7>
+  ret <8 x i64> %s7
+}
+
diff --git a/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses.ll b/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses.ll
index 702dfdbb81a0d7d8c7321a4675bbeaebf02251c6..a038fd1a411b8bc613a2bd298dfa512179a14824 100644
--- a/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses.ll
+++ b/test/Transforms/InterleavedAccess/AArch64/interleaved-accesses.ll
@@ -547,3 +547,230 @@ define void @store_general_mask_factor3_negativestart(<12 x i32>* %ptr, <32 x i3
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
   ret void
 }
+
+@g = external global <4 x float>
+
+; Mask <0, 3, 7, undef> does not form a valid interleaved store: no st2 call may appear with NEON, and the shuffle + store must survive in both runs.
+; NEON-LABEL: define void @no_interleave
+; NEON-NOT: call void @llvm.aarch64.neon.st2
+; NEON: shufflevector
+; NEON: store
+; NEON: ret void
+; NO_NEON-LABEL: define void @no_interleave
+; NO_NEON: shufflevector
+; NO_NEON: store
+; NO_NEON: ret void
+define void @no_interleave(<4 x float> %a0) {
+  %v0 = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 0, i32 3, i32 7, i32 undef>
+  store <4 x float> %v0, <4 x float>* @g, align 16
+  ret void
+}
+
+define void @load_factor2_wide2(<16 x i32>* %ptr) {
+; NEON-LABEL:    @load_factor2_wide2(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT:       [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; NEON-NEXT:       [[LDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP6]])
+; NEON-NEXT:       [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 1
+; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 0
+; NEON-NEXT:       [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_factor2_wide2(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+; With NEON the <16 x i32> factor-2 load becomes two <4 x i32> ld2 calls, 8 i32 apart, concatenated per field; without NEON no intrinsic appears.
+  %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
+  %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret void
+}
+
+define void @load_factor2_wide3(<24 x i32>* %ptr) {
+; NEON-LABEL:    @load_factor2_wide3(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT:       [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; NEON-NEXT:       [[LDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP6]])
+; NEON-NEXT:       [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 1
+; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN1]], 0
+; NEON-NEXT:       [[TMP9:%.*]] = getelementptr i32, i32* [[TMP5]], i32 8
+; NEON-NEXT:       [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; NEON-NEXT:       [[LDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0v4i32(<4 x i32>* [[TMP10]])
+; NEON-NEXT:       [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN2]], 1
+; NEON-NEXT:       [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN2]], 0
+; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; NEON-NEXT:       [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; NEON-NEXT:       [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; NEON-NEXT:       [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP17]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_factor2_wide3(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+; With NEON the <24 x i32> factor-2 load becomes three <4 x i32> ld2 calls; the third part is widened with undef lanes before the 12-element concatenation. Without NEON no intrinsic appears.
+  %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
+  %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22>
+  %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23>
+  ret void
+}
+
+define void @load_factor3_wide(<24 x i32>* %ptr) {
+; NEON-LABEL: @load_factor3_wide(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
+; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT:       [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
+; NEON-NEXT:       [[TMP7:%.*]] = bitcast i32* [[TMP6]] to <4 x i32>*
+; NEON-NEXT:       [[LDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld3.v4i32.p0v4i32(<4 x i32>* [[TMP7]])
+; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 2
+; NEON-NEXT:       [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 1
+; NEON-NEXT:       [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 0
+; NEON-NEXT:       [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_factor3_wide(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+; With NEON the <24 x i32> factor-3 load becomes two <4 x i32> ld3 calls, 12 i32 apart, concatenated per field; without NEON no intrinsic appears.
+  %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
+  %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+  %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+  %v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  ret void
+}
+
+define void @load_factor4_wide(<32 x i32>* %ptr) {
+; NEON-LABEL: @load_factor4_wide(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT:       [[LDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP2]])
+; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 3
+; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 2
+; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 1
+; NEON-NEXT:       [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN]], 0
+; NEON-NEXT:       [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
+; NEON-NEXT:       [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; NEON-NEXT:       [[LDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld4.v4i32.p0v4i32(<4 x i32>* [[TMP8]])
+; NEON-NEXT:       [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 3
+; NEON-NEXT:       [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 2
+; NEON-NEXT:       [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 1
+; NEON-NEXT:       [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[LDN1]], 0
+; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_factor4_wide(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+; With NEON the <32 x i32> factor-4 load becomes two <4 x i32> ld4 calls, 16 i32 apart, concatenated per field; without NEON no intrinsic appears.
+  %interleaved.vec = load <32 x i32>, <32 x i32>* %ptr, align 4
+  %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+  %v2 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+  %v3 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  ret void
+}
+
+define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) {
+; NEON-LABEL:    @store_factor2_wide(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; NEON-NEXT:       [[TMP4:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32>* [[TMP4]])
+; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP6:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; NEON-NEXT:       [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; NEON-NEXT:       [[TMP8:%.*]] = bitcast i32* [[TMP7]] to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st2.v4i32.p0v4i32(<4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <4 x i32>* [[TMP8]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_factor2_wide(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+  %interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15> ; factor-2 interleave of %v0/%v1: two st2 calls expected with NEON, no intrinsic without it
+  store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) {
+; NEON-LABEL:    @store_factor3_wide(
+; NEON:            [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; NEON-NEXT:       [[TMP5:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32>* [[TMP5]])
+; NEON-NEXT:       [[TMP6:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP7:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; NEON-NEXT:       [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; NEON-NEXT:       [[TMP9:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
+; NEON-NEXT:       [[TMP10:%.*]] = bitcast i32* [[TMP9]] to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st3.v4i32.p0v4i32(<4 x i32> [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32>* [[TMP10]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_factor3_wide(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x i32> %v2, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23> ; factor-3 interleave of %v0/%v1/%v2: two st3 calls expected with NEON, no intrinsic without it
+  store <24 x i32> %interleaved.vec, <24 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) {
+; NEON-LABEL:    @store_factor4_wide(
+; NEON:            [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP1]] to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32>* [[TMP6]])
+; NEON-NEXT:       [[TMP7:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; NEON-NEXT:       [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; NEON-NEXT:       [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; NEON-NEXT:       [[TMP11:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
+; NEON-NEXT:       [[TMP12:%.*]] = bitcast i32* [[TMP11]] to <4 x i32>*
+; NEON-NEXT:       call void @llvm.aarch64.neon.st4.v4i32.p0v4i32(<4 x i32> [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32>* [[TMP12]])
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_factor4_wide(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+; With NEON the <32 x i32> factor-4 store becomes two <4 x i32> st4 calls, 16 i32 apart; without NEON no intrinsic appears.
+  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x i32> %interleaved.vec, <32 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @load_factor2_fp128(<4 x fp128>* %ptr) {
+; NEON-LABEL:    @load_factor2_fp128(
+; NEON-NOT:        @llvm.aarch64.neon
+; NEON:            ret void
+; NO_NEON-LABEL: @load_factor2_fp128(
+; NO_NEON-NOT:     @llvm.aarch64.neon
+; NO_NEON:         ret void
+; A factor-2 load of fp128 elements must not be turned into an @llvm.aarch64.neon intrinsic in either configuration.
+  %interleaved.vec = load <4 x fp128>, <4 x fp128>* %ptr, align 16
+  %v0 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 0, i32 2>
+  %v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 1, i32 3>
+  ret void
+}
diff --git a/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll b/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
index caaaa21c5a130b9fc27257fd478a5cf846aa0dea..5938f9d7321d6a9b288ee2670692982c5ce838b5 100644
--- a/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
+++ b/test/Transforms/InterleavedAccess/ARM/interleaved-accesses.ll
@@ -1,5 +1,5 @@
-; RUN: opt < %s -mattr=+neon -interleaved-access -S | FileCheck %s -check-prefix=NEON
-; RUN: opt < %s -interleaved-access -S | FileCheck %s -check-prefix=NO_NEON
+; RUN: opt < %s -mattr=+neon -interleaved-access -S | FileCheck %s -check-prefixes=NEON,ALL
+; RUN: opt < %s -interleaved-access -S | FileCheck %s -check-prefixes=NO_NEON,ALL
 
 target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-n32-S64"
 target triple = "arm---eabi"
@@ -387,13 +387,31 @@ define void @store_address_space(<4 x i32> addrspace(1)* %ptr, <2 x i32> %v0, <2
   ret void
 }
 
+define void @load_f16_factor2(<8 x half>* %ptr) {
+; ALL-LABEL: @load_f16_factor2(
+; ALL-NOT:     @llvm.arm.neon
+; ALL:         ret void
+; A factor-2 load of half elements must not produce an @llvm.arm.neon call in either run.
+  %interleaved.vec = load <8 x half>, <8 x half>* %ptr, align 4
+  %v0 = shufflevector <8 x half> %interleaved.vec, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %v1 = shufflevector <8 x half> %interleaved.vec, <8 x half> undef, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+  ret void
+}
+
+define void @store_f16_factor2(<8 x half>* %ptr, <4 x half> %v0, <4 x half> %v1) {
+; ALL-LABEL: @store_f16_factor2(
+; ALL-NOT:     @llvm.arm.neon
+; ALL:         ret void
+; A factor-2 store of half elements must not produce an @llvm.arm.neon call in either run.
+  %interleaved.vec = shufflevector <4 x half> %v0, <4 x half> %v1, <8 x i32> <i32 0, i32 4, i32 1, i32 5, i32 2, i32 6, i32 3, i32 7>
+  store <8 x half> %interleaved.vec, <8 x half>* %ptr, align 4
+  ret void
+}
+
 define void @load_illegal_factor2(<3 x float>* %ptr) nounwind {
-; NEON-LABEL:    @load_illegal_factor2(
-; NEON-NOT:        @llvm.arm.neon
-; NEON:            ret void
-; NO_NEON-LABEL: @load_illegal_factor2(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; ALL-LABEL:    @load_illegal_factor2(
+; ALL-NOT:        @llvm.arm.neon
+; ALL:            ret void
 ;
   %interleaved.vec = load <3 x float>, <3 x float>* %ptr, align 16
   %v0 = shufflevector <3 x float> %interleaved.vec, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
@@ -401,12 +419,9 @@ define void @load_illegal_factor2(<3 x float>* %ptr) nounwind {
 }
 
 define void @store_illegal_factor2(<3 x float>* %ptr, <3 x float> %v0) nounwind {
-; NEON-LABEL:    @store_illegal_factor2(
-; NEON-NOT:        @llvm.arm.neon
-; NEON:            ret void
-; NO_NEON-LABEL: @store_illegal_factor2(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; ALL-LABEL: @store_illegal_factor2(
+; ALL-NOT:     @llvm.arm.neon
+; ALL:         ret void
 ;
   %interleaved.vec = shufflevector <3 x float> %v0, <3 x float> undef, <3 x i32> <i32 0, i32 2, i32 undef>
   store <3 x float> %interleaved.vec, <3 x float>* %ptr, align 16
@@ -538,12 +553,9 @@ define void @store_general_mask_factor3_undefmultimid(<12 x i32>* %ptr, <32 x i3
 }
 
 define void @store_general_mask_factor3_undef_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor3_undef_fail(
-; NEON-NOT:        @llvm.arm.neon
-; NEON:            ret void
-; NO_NEON-LABEL: @store_general_mask_factor3_undef_fail(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; ALL-LABEL: @store_general_mask_factor3_undef_fail(
+; ALL-NOT:     @llvm.arm.neon
+; ALL:         ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 4, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 8, i32 35, i32 19>
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
@@ -568,12 +580,9 @@ define void @store_general_mask_factor3_undeflane(<12 x i32>* %ptr, <32 x i32> %
 }
 
 define void @store_general_mask_factor3_endstart_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor3_endstart_fail(
-; NEON-NOT:        @llvm.arm.neon
-; NEON:            ret void
-; NO_NEON-LABEL: @store_general_mask_factor3_endstart_fail(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; ALL-LABEL:    @store_general_mask_factor3_endstart_fail(
+; ALL-NOT:        @llvm.arm.neon
+; ALL:            ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 undef, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 2, i32 35, i32 19>
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
@@ -598,12 +607,9 @@ define void @store_general_mask_factor3_endstart_pass(<12 x i32>* %ptr, <32 x i3
 }
 
 define void @store_general_mask_factor3_midstart_fail(<12 x i32>* %ptr, <32 x i32> %v0, <32 x i32> %v1) {
-; NEON-LABEL:    @store_general_mask_factor3_midstart_fail(
-; NEON-NOT:        @llvm.arm.neon
-; NEON:            ret void
-; NO_NEON-LABEL: @store_general_mask_factor3_midstart_fail(
-; NO_NEON-NOT:     @llvm.arm.neon
-; NO_NEON:         ret void
+; ALL-LABEL:    @store_general_mask_factor3_midstart_fail(
+; ALL-NOT:        @llvm.arm.neon
+; ALL:            ret void
 ;
   %interleaved.vec = shufflevector <32 x i32> %v0, <32 x i32> %v1, <12 x i32> <i32 undef, i32 32, i32 16, i32 0, i32 33, i32 17, i32 undef, i32 34, i32 18, i32 undef, i32 35, i32 19>
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
@@ -626,3 +632,225 @@ define void @store_general_mask_factor3_midstart_pass(<12 x i32>* %ptr, <32 x i3
   store <12 x i32> %interleaved.vec, <12 x i32>* %ptr, align 4
   ret void
 }
+
+@g = external global <4 x float>
+
+; The following does not give a valid interleaved store
+; ALL-LABEL: define void @no_interleave
+; ALL-NOT: call void @llvm.arm.neon.vst2
+; ALL: shufflevector
+; ALL: store
+; ALL: ret void
+define void @no_interleave(<4 x float> %a0) {
+  %v0 = shufflevector <4 x float> %a0, <4 x float> %a0, <4 x i32> <i32 0, i32 7, i32 1, i32 undef>
+  store <4 x float> %v0, <4 x float>* @g, align 16
+  ret void
+}
+
+define void @load_factor2_wide2(<16 x i32>* %ptr) {
+; NEON-LABEL:    @load_factor2_wide2(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; NEON-NEXT:       [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
+; NEON-NEXT:       [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4)
+; NEON-NEXT:       [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; NEON-NEXT:       [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_factor2_wide2(
+; NO_NEON-NOT:     @llvm.arm.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = load <16 x i32>, <16 x i32>* %ptr, align 4
+  %v0 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+  %v1 = shufflevector <16 x i32> %interleaved.vec, <16 x i32> undef, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+  ret void
+}
+
+define void @load_factor2_wide3(<24 x i32>* %ptr) {
+; NEON-LABEL:    @load_factor2_wide3(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <24 x i32>* [[PTR:%.*]] to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 1
+; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN]], 0
+; NEON-NEXT:       [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
+; NEON-NEXT:       [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP6]], i32 4)
+; NEON-NEXT:       [[TMP7:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; NEON-NEXT:       [[TMP9:%.*]] = getelementptr i32, i32* [[TMP5]], i32 8
+; NEON-NEXT:       [[TMP10:%.*]] = bitcast i32* [[TMP9]] to i8*
+; NEON-NEXT:       [[VLDN2:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32.p0i8(i8* [[TMP10]], i32 4)
+; NEON-NEXT:       [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 1
+; NEON-NEXT:       [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[VLDN2]], 0
+; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; NEON-NEXT:       [[TMP15:%.*]] = shufflevector <8 x i32> [[TMP13]], <8 x i32> [[TMP14]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; NEON-NEXT:       [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
+; NEON-NEXT:       [[TMP18:%.*]] = shufflevector <8 x i32> [[TMP16]], <8 x i32> [[TMP17]], <12 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11>
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_factor2_wide3(
+; NO_NEON-NOT:     @llvm.arm.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
+  %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22>
+  %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <12 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23>
+  ret void
+}
+
+define void @load_factor3_wide(<24 x i32>* %ptr) {
+; NEON-LABEL:    @load_factor3_wide(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
+; NEON-NEXT:       [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
+; NEON-NEXT:       [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8*
+; NEON-NEXT:       [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32.p0i8(i8* [[TMP7]], i32 4)
+; NEON-NEXT:       [[TMP8:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
+; NEON-NEXT:       [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; NEON-NEXT:       [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; NEON-NEXT:       [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP8]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_factor3_wide(
+; NO_NEON-NOT:     @llvm.arm.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4
+  %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 0, i32 3, i32 6, i32 9, i32 12, i32 15, i32 18, i32 21>
+  %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 1, i32 4, i32 7, i32 10, i32 13, i32 16, i32 19, i32 22>
+  %v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> undef, <8 x i32> <i32 2, i32 5, i32 8, i32 11, i32 14, i32 17, i32 20, i32 23>
+  ret void
+}
+
+define void @load_factor4_wide(<32 x i32>* %ptr) {
+; NEON-LABEL:    @load_factor4_wide(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; NEON-NEXT:       [[VLDN:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP2]], i32 4)
+; NEON-NEXT:       [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 3
+; NEON-NEXT:       [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 2
+; NEON-NEXT:       [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 1
+; NEON-NEXT:       [[TMP6:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN]], 0
+; NEON-NEXT:       [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
+; NEON-NEXT:       [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+; NEON-NEXT:       [[VLDN1:%.*]] = call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32.p0i8(i8* [[TMP8]], i32 4)
+; NEON-NEXT:       [[TMP9:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 3
+; NEON-NEXT:       [[TMP10:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 2
+; NEON-NEXT:       [[TMP11:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 1
+; NEON-NEXT:       [[TMP12:%.*]] = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } [[VLDN1]], 0
+; NEON-NEXT:       [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP9]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP10]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP11]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP16:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> [[TMP12]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @load_factor4_wide(
+; NO_NEON-NOT:     @llvm.arm.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = load <32 x i32>, <32 x i32>* %ptr, align 4
+  %v0 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28>
+  %v1 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 1, i32 5, i32 9, i32 13, i32 17, i32 21, i32 25, i32 29>
+  %v2 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 2, i32 6, i32 10, i32 14, i32 18, i32 22, i32 26, i32 30>
+  %v3 = shufflevector <32 x i32> %interleaved.vec, <32 x i32> undef, <8 x i32> <i32 3, i32 7, i32 11, i32 15, i32 19, i32 23, i32 27, i32 31>
+  ret void
+}
+
+define void @store_factor2_wide(<16 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1) {
+; NEON-LABEL:    @store_factor2_wide(
+; NEON-NEXT:       [[TMP1:%.*]] = bitcast <16 x i32>* %ptr to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; NEON-NEXT:       call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], i32 4)
+; NEON-NEXT:       [[TMP5:%.*]] = getelementptr i32, i32* [[TMP1]], i32 8
+; NEON-NEXT:       [[TMP6:%.*]] = bitcast i32* [[TMP5]] to i8*
+; NEON-NEXT:       [[TMP7:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP8:%.*]] = shufflevector <8 x i32> %v0, <8 x i32> %v1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; NEON-NEXT:       call void @llvm.arm.neon.vst2.p0i8.v4i32(i8* [[TMP6]], <4 x i32> [[TMP7]], <4 x i32> [[TMP8]], i32 4)
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_factor2_wide(
+; NO_NEON-NOT:     @llvm.arm.neon
+; NO_NEON:         ret void
+;
+  %interleaved.vec = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+  store <16 x i32> %interleaved.vec, <16 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_factor3_wide(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) {
+; NEON-LABEL:    @store_factor3_wide(
+; NEON:            [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], i32 4)
+; NEON-NEXT:       [[TMP6:%.*]] = getelementptr i32, i32* [[TMP1]], i32 12
+; NEON-NEXT:       [[TMP7:%.*]] = bitcast i32* [[TMP6]] to i8*
+; NEON-NEXT:       [[TMP8:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; NEON-NEXT:       [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; NEON-NEXT:       call void @llvm.arm.neon.vst3.p0i8.v4i32(i8* [[TMP7]], <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], i32 4)
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_factor3_wide(
+; NO_NEON-NOT:     @llvm.arm.neon
+; NO_NEON:         ret void
+;
+  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x i32> %v2, <8 x i32> undef, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+  %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> <i32 0, i32 8, i32 16, i32 1, i32 9, i32 17, i32 2, i32 10, i32 18, i32 3, i32 11, i32 19, i32 4, i32 12, i32 20, i32 5, i32 13, i32 21, i32 6, i32 14, i32 22, i32 7, i32 15, i32 23>
+  store <24 x i32> %interleaved.vec, <24 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @store_factor4_wide(<32 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2, <8 x i32> %v3) {
+; NEON-LABEL:    @store_factor4_wide(
+; NEON:            [[TMP1:%.*]] = bitcast <32 x i32>* %ptr to i32*
+; NEON-NEXT:       [[TMP2:%.*]] = bitcast i32* [[TMP1]] to i8*
+; NEON-NEXT:       [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; NEON-NEXT:       [[TMP4:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
+; NEON-NEXT:       [[TMP5:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 16, i32 17, i32 18, i32 19>
+; NEON-NEXT:       [[TMP6:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 24, i32 25, i32 26, i32 27>
+; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP2]], <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], i32 4)
+; NEON-NEXT:       [[TMP7:%.*]] = getelementptr i32, i32* [[TMP1]], i32 16
+; NEON-NEXT:       [[TMP8:%.*]] = bitcast i32* [[TMP7]] to i8*
+; NEON-NEXT:       [[TMP9:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
+; NEON-NEXT:       [[TMP10:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 12, i32 13, i32 14, i32 15>
+; NEON-NEXT:       [[TMP11:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 20, i32 21, i32 22, i32 23>
+; NEON-NEXT:       [[TMP12:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <4 x i32> <i32 28, i32 29, i32 30, i32 31>
+; NEON-NEXT:       call void @llvm.arm.neon.vst4.p0i8.v4i32(i8* [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> [[TMP12]], i32 4)
+; NEON-NEXT:       ret void
+; NO_NEON-LABEL: @store_factor4_wide(
+; NO_NEON-NOT:     @llvm.arm.neon
+; NO_NEON:         ret void
+;
+  %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %s1 = shufflevector <8 x i32> %v2, <8 x i32> %v3, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+  %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <32 x i32> <i32 0, i32 8, i32 16, i32 24, i32 1, i32 9, i32 17, i32 25, i32 2, i32 10, i32 18, i32 26, i32 3, i32 11, i32 19, i32 27, i32 4, i32 12, i32 20, i32 28, i32 5, i32 13, i32 21, i32 29, i32 6, i32 14, i32 22, i32 30, i32 7, i32 15, i32 23, i32 31>
+  store <32 x i32> %interleaved.vec, <32 x i32>* %ptr, align 4
+  ret void
+}
+
+define void @load_factor2_fp128(<4 x fp128>* %ptr) {
+; ALL-LABEL: @load_factor2_fp128(
+; ALL-NOT:     @llvm.arm.neon
+; ALL:         ret void
+;
+  %interleaved.vec = load <4 x fp128>, <4 x fp128>* %ptr, align 16
+  %v0 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 0, i32 2>
+  %v1 = shufflevector <4 x fp128> %interleaved.vec, <4 x fp128> undef, <2 x i32> <i32 1, i32 3>
+  ret void
+}
diff --git a/test/Transforms/JumpThreading/guards.ll b/test/Transforms/JumpThreading/guards.ll
new file mode 100644
index 0000000000000000000000000000000000000000..eac2b5dcd85f9c9b83252ac76463db6c01fa8515
--- /dev/null
+++ b/test/Transforms/JumpThreading/guards.ll
@@ -0,0 +1,183 @@
+; RUN: opt < %s -jump-threading -dce -S | FileCheck %s
+
+declare void @llvm.experimental.guard(i1, ...)
+
+declare i32 @f1()
+declare i32 @f2()
+
+define i32 @branch_implies_guard(i32 %a) {
+; CHECK-LABEL: @branch_implies_guard(
+  %cond = icmp slt i32 %a, 10
+  br i1 %cond, label %T1, label %F1
+
+T1:
+; CHECK:       T1.split
+; CHECK:         %v1 = call i32 @f1()
+; CHECK-NEXT:    %retVal
+; CHECK-NEXT:    br label %Merge
+  %v1 = call i32 @f1()
+  br label %Merge
+
+F1:
+; CHECK:       F1.split
+; CHECK:         %v2 = call i32 @f2()
+; CHECK-NEXT:    %retVal
+; CHECK-NEXT:    %condGuard
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 %condGuard
+; CHECK-NEXT:    br label %Merge
+  %v2 = call i32 @f2()
+  br label %Merge
+
+Merge:
+; CHECK:       Merge
+; CHECK-NOT:     call void (i1, ...) @llvm.experimental.guard(
+  %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
+  %retVal = add i32 %retPhi, 10
+  %condGuard = icmp slt i32 %a, 20
+  call void(i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
+  ret i32 %retVal
+}
+
+define i32 @not_branch_implies_guard(i32 %a) {
+; CHECK-LABEL: @not_branch_implies_guard(
+  %cond = icmp slt i32 %a, 20
+  br i1 %cond, label %T1, label %F1
+
+T1:
+; CHECK:       T1.split:
+; CHECK-NEXT:    %v1 = call i32 @f1()
+; CHECK-NEXT:    %retVal
+; CHECK-NEXT:    %condGuard
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 %condGuard
+; CHECK-NEXT:    br label %Merge
+  %v1 = call i32 @f1()
+  br label %Merge
+
+F1:
+; CHECK:       F1.split:
+; CHECK-NEXT:   %v2 = call i32 @f2()
+; CHECK-NEXT:   %retVal
+; CHECK-NEXT:   br label %Merge
+  %v2 = call i32 @f2()
+  br label %Merge
+
+Merge:
+; CHECK:       Merge
+; CHECK-NOT:     call void (i1, ...) @llvm.experimental.guard(
+  %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
+  %retVal = add i32 %retPhi, 10
+  %condGuard = icmp sgt i32 %a, 10
+  call void(i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
+  ret i32 %retVal
+}
+
+define i32 @branch_overlaps_guard(i32 %a) {
+; CHECK-LABEL: @branch_overlaps_guard(
+  %cond = icmp slt i32 %a, 20
+  br i1 %cond, label %T1, label %F1
+
+T1:
+; CHECK:        T1:
+; CHECK-NEXT:      %v1 = call i32 @f1()
+; CHECK-NEXT:      br label %Merge
+  %v1 = call i32 @f1()
+  br label %Merge
+
+F1:
+; CHECK:        F1:
+; CHECK-NEXT:     %v2 = call i32 @f2()
+; CHECK-NEXT:     br label %Merge
+  %v2 = call i32 @f2()
+  br label %Merge
+
+Merge:
+; CHECK:       Merge
+; CHECK:         %condGuard = icmp slt i32 %a, 10
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
+  %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
+  %retVal = add i32 %retPhi, 10
+  %condGuard = icmp slt i32 %a, 10
+  call void(i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
+  ret i32 %retVal
+}
+
+define i32 @branch_doesnt_overlap_guard(i32 %a) {
+; CHECK-LABEL: @branch_doesnt_overlap_guard(
+  %cond = icmp slt i32 %a, 10
+  br i1 %cond, label %T1, label %F1
+
+T1:
+; CHECK:        T1:
+; CHECK-NEXT:      %v1 = call i32 @f1()
+; CHECK-NEXT:      br label %Merge
+  %v1 = call i32 @f1()
+  br label %Merge
+
+F1:
+; CHECK:        F1:
+; CHECK-NEXT:     %v2 = call i32 @f2()
+; CHECK-NEXT:     br label %Merge
+  %v2 = call i32 @f2()
+  br label %Merge
+
+Merge:
+; CHECK:       Merge
+; CHECK:         %condGuard = icmp sgt i32 %a, 20
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
+  %retPhi = phi i32 [ %v1, %T1 ], [ %v2, %F1 ]
+  %retVal = add i32 %retPhi, 10
+  %condGuard = icmp sgt i32 %a, 20
+  call void(i1, ...) @llvm.experimental.guard(i1 %condGuard) [ "deopt"() ]
+  ret i32 %retVal
+}
+
+define i32 @not_a_diamond1(i32 %a, i1 %cond1) {
+; CHECK-LABEL: @not_a_diamond1(
+  br i1 %cond1, label %Pred, label %Exit
+
+Pred:
+; CHECK:       Pred:
+; CHECK-NEXT:    switch i32 %a, label %Exit
+  switch i32 %a, label %Exit [
+    i32 10, label %Merge
+    i32 20, label %Merge
+  ]
+
+Merge:
+; CHECK:       Merge:
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 %cond1) [ "deopt"() ]
+; CHECK-NEXT:    br label %Exit
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond1) [ "deopt"() ]
+  br label %Exit
+
+Exit:
+; CHECK:       Exit:
+; CHECK-NEXT:    ret i32 %a
+  ret i32 %a
+}
+
+define void @not_a_diamond2(i32 %a, i1 %cond1) {
+; CHECK-LABEL: @not_a_diamond2(
+  br label %Parent
+
+Merge:
+  call void(i1, ...) @llvm.experimental.guard(i1 %cond1)[ "deopt"() ]
+  ret void
+
+Pred:
+; CHECK-NEXT:  Pred:
+; CHECK-NEXT:    switch i32 %a, label %Exit
+  switch i32 %a, label %Exit [
+    i32 10, label %Merge
+    i32 20, label %Merge
+  ]
+
+Parent:
+  br label %Pred
+
+Exit:
+; CHECK:       Merge:
+; CHECK-NEXT:    call void (i1, ...) @llvm.experimental.guard(i1 %cond1) [ "deopt"() ]
+; CHECK-NEXT:    ret void
+  ret void
+}
diff --git a/test/Transforms/JumpThreading/thread-loads.ll b/test/Transforms/JumpThreading/thread-loads.ll
index f54672d1956695eee0ae7d5bed34e1da113a9859..3606e796cdd5f16f73e7ffb618590257370d7611 100644
--- a/test/Transforms/JumpThreading/thread-loads.ll
+++ b/test/Transforms/JumpThreading/thread-loads.ll
@@ -1,5 +1,5 @@
 ; RUN: opt < %s -jump-threading -S | FileCheck %s
-; RUN: opt < %s -passes=jump-threading -S | FileCheck %s
+; RUN: opt < %s -aa-pipeline=basic-aa -passes=jump-threading -S | FileCheck %s
 
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin7"
@@ -302,6 +302,229 @@ ret2:
   ret void
 }
 
+define i32 @fn_noalias(i1 %c2,i64* noalias %P, i64* noalias %P2) {
+; CHECK-LABEL: @fn_noalias
+; CHECK-LABEL: cond1:
+; CHECK: %[[LD1:.*]] = load i64, i64* %P
+; CHECK: br i1 %c, label %[[THREAD:.*]], label %end
+; CHECK-LABEL: cond2:
+; CHECK: %[[LD2:.*]] = load i64, i64* %P
+; CHECK-LABEL: cond3:
+; CHECK: %[[PHI:.*]] = phi i64 [ %[[LD1]], %[[THREAD]] ], [ %[[LD2]], %cond2 ]
+; CHECK: call void @fn3(i64 %[[PHI]])
+entry:
+  br i1 %c2, label %cond2, label %cond1
+
+cond1:
+  %l1 = load i64, i64* %P
+  store i64 42, i64* %P2
+  %c = icmp eq i64 %l1, 0
+  br i1 %c, label %cond2, label %end
+
+cond2:
+  %l2 = load i64, i64* %P
+  call void @fn2(i64 %l2)
+  %c3 = icmp eq i64 %l2,  0
+  br i1 %c3, label %cond3, label %end
+
+cond3:
+  call void @fn3(i64 %l2)
+  br label %end
+
+end:
+  ret i32 0
+}
+
+; This tests if we can thread from %sw.bb.i to %do.body.preheader.i67 through
+; %sw.bb21.i. To make this happen, %l2 should be detected as a partially
+; redundant load with %l3 across the store to %phase in %sw.bb21.i.
+
+%struct.NEXT_MOVE = type { i32, i32, i32* }
+@hash_move = unnamed_addr global [65 x i32] zeroinitializer, align 4
+@current_move = internal global [65 x i32] zeroinitializer, align 4
+@last = internal unnamed_addr global [65 x i32*] zeroinitializer, align 8
+@next_status = internal unnamed_addr global [65 x %struct.NEXT_MOVE] zeroinitializer, align 8
+define fastcc i32 @Search(i64 %idxprom.i, i64 %idxprom.i89, i32 %c) {
+; CHECK-LABEL: @Search
+; CHECK-LABEL: sw.bb.i:
+; CHECK: %[[LD1:.*]] = load i32, i32* %arrayidx185, align 4
+; CHECK: %[[C1:.*]] = icmp eq i32 %[[LD1]], 0
+; CHECK: br i1 %[[C1]], label %sw.bb21.i.thread, label %if.then.i64
+; CHECK-LABEL: sw.bb21.i.thread:
+; CHECK: br label %[[THREAD_TO:.*]]
+; CHECK-LABEL: sw.bb21.i:
+; CHECK: %[[LD2:.*]] = load i32, i32* %arrayidx185, align 4
+; CHECK: %[[C2:.*]] = icmp eq i32 %[[LD2]], 0
+; CHECK:br i1 %[[C2]], label %[[THREAD_TO]], label %cleanup
+entry:
+  %arrayidx185 = getelementptr inbounds [65 x i32], [65 x i32]* @hash_move, i64 0, i64 %idxprom.i
+  %arrayidx307 = getelementptr inbounds [65 x i32], [65 x i32]* @current_move, i64 0, i64 %idxprom.i
+  %arrayidx89 = getelementptr inbounds [65 x i32*], [65 x i32*]* @last, i64 0, i64 %idxprom.i
+  %phase = getelementptr inbounds [65 x %struct.NEXT_MOVE], [65 x %struct.NEXT_MOVE]* @next_status, i64 0, i64 %idxprom.i, i32 0
+  br label %cond.true282
+
+cond.true282:
+  switch i32 %c, label %sw.default.i [
+    i32 1, label %sw.bb.i
+    i32 0, label %sw.bb21.i
+  ]
+
+sw.default.i:
+  br label %cleanup
+
+sw.bb.i:
+  %call.i62 = call fastcc i32* @GenerateCheckEvasions()
+  store i32* %call.i62, i32** %arrayidx89, align 8
+  %l2 = load i32, i32* %arrayidx185, align 4
+  %tobool.i63 = icmp eq i32 %l2, 0
+  br i1 %tobool.i63, label %sw.bb21.i, label %if.then.i64
+
+if.then.i64:                                      ; preds = %sw.bb.i
+  store i32 7, i32* %phase, align 8
+  store i32 %l2, i32* %arrayidx307, align 4
+  %call16.i = call fastcc i32 @ValidMove(i32 %l2)
+  %tobool17.i = icmp eq i32 %call16.i, 0
+  br i1 %tobool17.i, label %if.else.i65, label %cleanup
+
+if.else.i65:
+  call void @f65()
+  br label %sw.bb21.i
+
+sw.bb21.i:
+  store i32 10, i32* %phase, align 8
+  %l3= load i32, i32* %arrayidx185, align 4
+  %tobool27.i = icmp eq i32 %l3, 0
+  br i1 %tobool27.i, label %do.body.preheader.i67, label %cleanup
+
+do.body.preheader.i67:
+  call void @f67()
+  ret  i32 67
+
+cleanup:
+  call void @Cleanup()
+  ret  i32 0
+}
+
+declare fastcc i32* @GenerateCheckEvasions()
+declare fastcc i32 @ValidMove(i32 %move)
+declare void @f67()
+declare void @Cleanup()
+declare void @f65()
+
+define i32 @fn_SinglePred(i1 %c2,i64* %P) {
+; CHECK-LABEL: @fn_SinglePred
+; CHECK-LABEL: entry:
+; CHECK: %[[L1:.*]] = load i64, i64* %P
+; CHECK: br i1 %c, label %cond3, label %cond1
+; CHECK-LABEL: cond2:
+; CHECK-NOT: load
+; CHECK: %[[PHI:.*]] = phi i64 [ %[[L1]], %cond1 ]
+; CHECK: call void @fn2(i64 %[[PHI]])
+; CHECK: br label %end
+; CHECK-LABEL: cond3:
+; CHECK: call void @fn2(i64 %l1)
+; CHECK: call void @fn3(i64 %l1)
+
+entry:
+  %l1 = load i64, i64* %P
+  %c = icmp eq i64 %l1, 0
+  br i1 %c, label %cond2, label %cond1
+
+cond1:
+  br i1 %c2, label %cond2, label %end
+
+cond2:
+  %l2 = load i64, i64* %P
+  call void @fn2(i64 %l2)
+  %c3 = icmp eq i64 %l2,  0
+  br i1 %c3, label %cond3, label %end
+
+cond3:
+  call void @fn3(i64 %l2)
+  br label %end
+
+end:
+  ret i32 0
+}
+
+define i32 @fn_SinglePredMultihop(i1 %c1, i1 %c2,i64* %P) {
+; CHECK-LABEL: @fn_SinglePredMultihop
+; CHECK-LABEL: entry:
+; CHECK: %[[L1:.*]] = load i64, i64* %P
+; CHECK: br i1 %c0, label %cond3, label %cond0
+; CHECK-LABEL: cond2:
+; CHECK-NOT: load
+; CHECK: %[[PHI:.*]] = phi i64 [ %[[L1]], %cond1 ]
+; CHECK: call void @fn2(i64 %[[PHI]])
+; CHECK: br label %end
+; CHECK-LABEL: cond3:
+; CHECK: call void @fn2(i64 %l1)
+; CHECK: call void @fn3(i64 %l1)
+
+entry:
+  %l1 = load i64, i64* %P
+  %c0 = icmp eq i64 %l1, 0
+  br i1 %c0, label %cond2, label %cond0
+
+cond0:
+  br i1 %c1, label %cond1, label %end
+
+cond1:
+  br i1 %c2, label %cond2, label %end
+
+cond2:
+  %l2 = load i64, i64* %P
+  call void @fn2(i64 %l2)
+  %c3 = icmp eq i64 %l2,  0
+  br i1 %c3, label %cond3, label %end
+
+cond3:
+  call void @fn3(i64 %l2)
+  br label %end
+
+end:
+  ret i32 0
+}
+
+declare void @fn2(i64)
+declare void @fn3(i64)
+
+
+; Make sure we phi-translate and make the partially redundant load in
+; merge fully redundant and then we can jump-thread the block with the
+; store.
+;
+; CHECK-LABEL: define i32 @phi_translate_partial_redundant_loads(i32, i32*, i32*
+; CHECK: merge.thread:
+; CHECK: store
+; CHECK: br label %left_x
+;
+; CHECK: left_x:
+; CHECK-NEXT: ret i32 20
+define i32 @phi_translate_partial_redundant_loads(i32, i32*, i32*)  {
+  %cmp0 = icmp ne i32 %0, 0
+  br i1 %cmp0, label %left, label %right
+
+left:
+  store i32 1, i32* %1, align 4
+  br label %merge
+
+right:
+  br label %merge
+
+merge:
+  %phiptr = phi i32* [ %1, %left ], [ %2, %right ]
+  %newload = load i32, i32* %phiptr, align 4
+  %cmp1 = icmp slt i32 %newload, 5
+  br i1 %cmp1, label %left_x, label %right_x
+
+left_x:
+  ret i32 20
+
+right_x:
+  ret i32 10
+}
+
 !0 = !{!3, !3, i64 0}
 !1 = !{!"omnipotent char", !2}
 !2 = !{!"Simple C/C++ TBAA"}
diff --git a/test/Transforms/LICM/atomics.ll b/test/Transforms/LICM/atomics.ll
index 919d1bdd114c5b3f22dde2dd745f9c4be2f11c48..15c461aeca27509f151e138042e2c91600f31dab 100644
--- a/test/Transforms/LICM/atomics.ll
+++ b/test/Transforms/LICM/atomics.ll
@@ -60,8 +60,7 @@ end:
 ; CHECK-NEXT: br label %loop
 }
 
-; Don't try to "sink" unordered stores yet; it is legal, but the machinery
-; isn't there.
+; We can sink an unordered store
 define i32 @test4(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp {
 entry:
   br label %loop
@@ -75,6 +74,149 @@ loop:
 end:
   ret i32 %vala
 ; CHECK-LABEL: define i32 @test4(
+; CHECK-LABEL: loop:
+; CHECK: load atomic i32, i32* %y monotonic
+; CHECK-NOT: store
+; CHECK-LABEL: end:
+; CHECK-NEXT:   %[[LCSSAPHI:.*]] = phi i32 [ %vala
+; CHECK:   store atomic i32 %[[LCSSAPHI]], i32* %x unordered, align 4
+}
+
+; We currently don't handle ordered atomics.
+define i32 @test5(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp {
+entry:
+  br label %loop
+
+loop:
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store atomic i32 %vala, i32* %x release, align 4
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test5(
 ; CHECK: load atomic i32, i32* %y monotonic
 ; CHECK-NEXT: store atomic
 }
+
+; We currently don't touch volatiles
+define i32 @test6(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp {
+entry:
+  br label %loop
+
+loop:
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store volatile i32 %vala, i32* %x, align 4
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test6(
+; CHECK: load atomic i32, i32* %y monotonic
+; CHECK-NEXT: store volatile
+}
+
+; We currently don't touch volatiles
+define i32 @test6b(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp {
+entry:
+  br label %loop
+
+loop:
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store atomic volatile i32 %vala, i32* %x unordered, align 4
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test6b(
+; CHECK: load atomic i32, i32* %y monotonic
+; CHECK-NEXT: store atomic volatile
+}
+
+; Mixing unordered atomics and normal loads/stores is
+; currently unimplemented
+define i32 @test7(i32* nocapture noalias %x, i32* nocapture %y) nounwind uwtable ssp {
+entry:
+  br label %loop
+
+loop:
+  store i32 5, i32* %x
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store atomic i32 %vala, i32* %x unordered, align 4
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test7(
+; CHECK: store i32 5, i32* %x
+; CHECK-NEXT: load atomic i32, i32* %y
+; CHECK-NEXT: store atomic i32
+}
+
+; Three provably noalias locations - we can sink normal and unordered, but
+;  not monotonic
+define i32 @test7b(i32* nocapture noalias %x, i32* nocapture %y, i32* noalias nocapture %z) nounwind uwtable ssp {
+entry:
+  br label %loop
+
+loop:
+  store i32 5, i32* %x
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store atomic i32 %vala, i32* %z unordered, align 4
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test7b(
+; CHECK: load atomic i32, i32* %y monotonic
+
+; CHECK-LABEL: end:
+; CHECK: store i32 5, i32* %x
+; CHECK: store atomic i32 %{{.+}}, i32* %z unordered, align 4
+}
+
+
+define i32 @test8(i32* nocapture noalias %x, i32* nocapture %y) {
+entry:
+  br label %loop
+
+loop:
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store atomic i32 %vala, i32* %x unordered, align 4
+  fence release
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test8(
+; CHECK-LABEL: loop:
+; CHECK: load atomic i32, i32* %y monotonic
+; CHECK-NEXT: store atomic
+; CHECK-NEXT: fence
+}
+
+; Exact semantics of monotonic accesses are a bit vague in the C++ spec,
+; for the moment, be conservative and don't touch them.
+define i32 @test9(i32* nocapture noalias %x, i32* nocapture %y) {
+entry:
+  br label %loop
+
+loop:
+  %vala = load atomic i32, i32* %y monotonic, align 4
+  store atomic i32 %vala, i32* %x monotonic, align 4
+  %exitcond = icmp ne i32 %vala, 0
+  br i1 %exitcond, label %end, label %loop
+
+end:
+  ret i32 %vala
+; CHECK-LABEL: define i32 @test9(
+; CHECK-LABEL: loop:
+; CHECK: load atomic i32, i32* %y monotonic
+; CHECK-NEXT:   store atomic i32 %vala, i32* %x monotonic, align 4
+}
diff --git a/test/Transforms/LICM/hoist-fast-fdiv.ll b/test/Transforms/LICM/hoist-fast-fdiv.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f61564fd726cd52c694c5e7f47937f855b22ee45
--- /dev/null
+++ b/test/Transforms/LICM/hoist-fast-fdiv.ll
@@ -0,0 +1,34 @@
+; RUN: opt -licm -S < %s | FileCheck %s
+
+; Function Attrs: noinline norecurse nounwind readnone ssp uwtable
+define zeroext i1 @f(double %v) #0 {
+entry:
+; CHECK-LABEL: @f(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: fdiv fast double 1.000000e+00, %v
+  br label %loop
+
+loop:                                       ; preds = %entry, %loop
+  %v3 = phi i32 [ 0, %entry ], [ %v11, %loop ]
+  %v4 = phi i32 [ 0, %entry ], [ %v12, %loop ]
+  %v5 = uitofp i32 %v4 to double
+
+; CHECK-LABEL: loop:
+; CHECK: fmul fast double
+; CHECK-NOT: fdiv
+  %v6 = fdiv fast double %v5, %v
+  %v7 = fptoui double %v6 to i64
+  %v8 = and i64 %v7, 1
+  %v9 = xor i64 %v8, 1
+  %v10 = trunc i64 %v9 to i32
+  %v11 = add i32 %v10, %v3
+  %v12 = add nuw i32 %v4, 1
+  %v13 = icmp eq i32 %v12, -1
+  br i1 %v13, label %end, label %loop
+
+end:                                      ; preds = %loop
+  %v15 = phi i32 [ %v11, %loop ]
+  %v16 = icmp ne i32 %v15, 0
+  ret i1 %v16
+}
+
diff --git a/test/Transforms/LICM/hoisting.ll b/test/Transforms/LICM/hoisting.ll
index 9b29f5d600fec75bfb4177237bc4a51f1c44d8b9..cbd17689e9396f99aa846346163161b446485341 100644
--- a/test/Transforms/LICM/hoisting.ll
+++ b/test/Transforms/LICM/hoisting.ll
@@ -149,3 +149,174 @@ latch:
 return:
   ret i32 %sum
 }
+
+declare {}* @llvm.invariant.start.p0i8(i64, i8* nocapture) nounwind readonly
+declare void @llvm.invariant.end.p0i8({}*, i64, i8* nocapture) nounwind
+declare void @escaping.invariant.start({}*) nounwind
+; invariant.start dominates the load, and in this scope, the
+; load is invariant. So, we can hoist the `addrld` load out of the loop.
+define i32 @test_fence(i8* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence
+; CHECK-LABEL: entry
+; CHECK: invariant.start
+; CHECK: %addrld = load atomic i32, i32* %addr.i unordered, align 8
+; CHECK: br label %loop
+entry: 
+  %gep = getelementptr inbounds i8, i8* %addr, i64 8
+  %addr.i = bitcast i8* %gep to i32 *
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
+
+
+
+; Same as test above, but the load is no longer invariant (presence of
+; invariant.end). We cannot hoist the addrld out of loop.
+define i32 @test_fence1(i8* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence1
+; CHECK-LABEL: entry
+; CHECK: invariant.start
+; CHECK-NEXT: invariant.end
+; CHECK-NEXT: br label %loop
+entry:
+  %gep = getelementptr inbounds i8, i8* %addr, i64 8
+  %addr.i = bitcast i8* %gep to i32 *
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  call void @llvm.invariant.end.p0i8({}* %invst, i64 4, i8* %gep)
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
+
+; Same as test above, but instead of invariant.end, we have the result of
+; invariant.start escaping through a call. We cannot hoist the load.
+define i32 @test_fence2(i8* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence2
+; CHECK-LABEL: entry
+; CHECK-NOT: load
+; CHECK: br label %loop
+entry:
+  %gep = getelementptr inbounds i8, i8* %addr, i64 8
+  %addr.i = bitcast i8* %gep to i32 *
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  call void @escaping.invariant.start({}* %invst)
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
+
+; FIXME: invariant.start dominates the load, and in this scope, the
+; load is invariant. So, we can hoist the `addrld` load out of the loop.
+; Consider the load operand addr.i being bitcast before it is passed to
+; invariant.start.
+define i32 @test_fence3(i32* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence3
+; CHECK-LABEL: entry
+; CHECK: invariant.start
+; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8
+; CHECK: br label %loop
+entry: 
+  %addr.i = getelementptr inbounds i32, i32* %addr, i64 8
+  %gep = bitcast i32* %addr.i to i8 *
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
+
+; We should not hoist the addrld out of the loop.
+define i32 @test_fence4(i32* %addr, i32 %n, i8* %volatile) {
+; CHECK-LABEL: @test_fence4
+; CHECK-LABEL: entry
+; CHECK-NOT: %addrld = load atomic i32, i32* %addr.i unordered, align 8
+; CHECK: br label %loop
+entry: 
+  %addr.i = getelementptr inbounds i32, i32* %addr, i64 8
+  %gep = bitcast i32* %addr.i to i8 *
+  br label %loop
+
+loop:
+  %indvar = phi i32 [ %indvar.next, %loop ], [ 0, %entry ]
+  %sum = phi i32 [ %sum.next, %loop ], [ 0, %entry ]
+  store atomic i32 5, i32 * %addr.i unordered, align 8
+  fence release
+  %invst = call {}* @llvm.invariant.start.p0i8(i64 4, i8* %gep)
+  %volload = load atomic i8, i8* %volatile unordered, align 8
+  fence acquire
+  %volchk = icmp eq i8 %volload, 0
+  %addrld = load atomic i32, i32* %addr.i unordered, align 8
+  %sel = select i1 %volchk, i32 0, i32 %addrld
+  %sum.next = add i32 %sel, %sum
+  %indvar.next = add i32 %indvar, 1
+  %cond = icmp slt i32 %indvar.next, %n
+  br i1 %cond, label %loop, label %loopexit
+
+loopexit:
+  ret i32 %sum
+}
diff --git a/test/Transforms/LICM/opt-remarks.ll b/test/Transforms/LICM/opt-remarks.ll
index f0ef386c9f9a2a88829f69a6dab15279b686d766..b44fc57131a59dc0729076a4fedf5282debcda1b 100644
--- a/test/Transforms/LICM/opt-remarks.ll
+++ b/test/Transforms/LICM/opt-remarks.ll
@@ -10,7 +10,7 @@ Loop:
   %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]
   %addr = getelementptr i32, i32* %array, i32 %j
   %a = load i32, i32* %addr
-; CHECK: remark: /tmp/kk.c:2:20: hosting load
+; CHECK: remark: /tmp/kk.c:2:20: hoisting load
   %b = load i32, i32* %p, !dbg !8
   %a2 = add i32 %a, %b
   store i32 %a2, i32* %addr
diff --git a/test/Transforms/LICM/pr32129.ll b/test/Transforms/LICM/pr32129.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2618afe463224092d0e74f4711eed6518ace0d2d
--- /dev/null
+++ b/test/Transforms/LICM/pr32129.ll
@@ -0,0 +1,18 @@
+; RUN: opt -S -licm -loop-unswitch -licm < %s | FileCheck %s
+
+declare void @llvm.experimental.guard(i1, ...)
+
+define void @test() {
+; CHECK-LABEL: @test(
+; CHECK-NOT: guard
+entry:
+  br label %header
+
+header:
+  br label %loop
+
+loop:
+  %0 = icmp ult i32 0, 400
+  call void (i1, ...) @llvm.experimental.guard(i1 %0, i32 9) [ "deopt"() ]
+  br i1 undef, label %header, label %loop
+}
diff --git a/test/Transforms/LICM/scalar-promote.ll b/test/Transforms/LICM/scalar-promote.ll
index c88701154b8f1cbca551729bf0b986d319c48468..89888546494fb8c179a96239d46a0647d248c706 100644
--- a/test/Transforms/LICM/scalar-promote.ll
+++ b/test/Transforms/LICM/scalar-promote.ll
@@ -378,6 +378,33 @@ exit:
   ret i32 %ret
 }
 
+define void @test10(i32 %i) {
+Entry:
+  br label %Loop
+; CHECK-LABEL: @test10(
+; CHECK: Entry:
+; CHECK-NEXT:   load atomic i32, i32* @X unordered, align 4
+; CHECK-NEXT:   br label %Loop
+
+; Expect the unordered atomic load of @X hoisted into Entry and the store sunk to Out.
+Loop:   ; preds = %Loop, %Entry
+  %j = phi i32 [ 0, %Entry ], [ %Next, %Loop ]    ; <i32> [#uses=1]
+  %x = load atomic i32, i32* @X unordered, align 4
+  %x2 = add i32 %x, 1
+  store atomic i32 %x2, i32* @X unordered, align 4
+  %Next = add i32 %j, 1
+  %cond = icmp eq i32 %Next, 0
+  br i1 %cond, label %Out, label %Loop
+
+Out:
+  ret void
+; CHECK: Out:
+; CHECK-NEXT:   %[[LCSSAPHI:.*]] = phi i32 [ %x2
+; CHECK-NEXT:   store atomic i32 %[[LCSSAPHI]], i32* @X unordered, align 4
+; CHECK-NEXT:   ret void
+
+}
+
 !0 = !{!4, !4, i64 0}
 !1 = !{!"omnipotent char", !2}
 !2 = !{!"Simple C/C++ TBAA"}
diff --git a/test/Transforms/LoadCombine/deadcode.ll b/test/Transforms/LoadCombine/deadcode.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ed72824ffb443299e4fc255bdfa888099938f1c7
--- /dev/null
+++ b/test/Transforms/LoadCombine/deadcode.ll
@@ -0,0 +1,39 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -load-combine -S < %s | FileCheck %s
+
+; It has been detected that dead loops like the one in this test case can be
+; created by -jump-threading (it was detected by a csmith generated program).
+;
+; According to -verify this is valid input (even if it could be discussed if
+; the dead loop really satisfies SSA form).
+;
+; The problem found was that the -load-combine pass ends up in an infinite loop
+; when analysing the 'bb1' basic block.
+define void @test1() {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:    ret void
+; CHECK:       bb1:
+; CHECK-NEXT:    [[_TMP4:%.*]] = load i16, i16* [[_TMP10:%.*]], align 1
+; CHECK-NEXT:    [[_TMP10]] = getelementptr i16, i16* [[_TMP10]], i16 1
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[_TMP7:%.*]] = load i16, i16* [[_TMP12:%.*]], align 1
+; CHECK-NEXT:    [[_TMP12]] = getelementptr i16, i16* [[_TMP12]], i16 1
+; CHECK-NEXT:    br label [[BB2:%.*]]
+;
+  ret void
+
+bb1:
+  %_tmp4 = load i16, i16* %_tmp10, align 1
+  %_tmp10 = getelementptr i16, i16* %_tmp10, i16 1
+  br label %bb1
+
+; A second basic block. Running the test with -debug-pass=Executions shows
+; that we only run the Dominator Tree Construction one time for each function,
+; also when having multiple basic blocks in the function.
+bb2:
+  %_tmp7 = load i16, i16* %_tmp12, align 1
+  %_tmp12 = getelementptr i16, i16* %_tmp12, i16 1
+  br label %bb2
+
+}
diff --git a/test/Transforms/LoadCombine/load-combine-aa.ll b/test/Transforms/LoadCombine/load-combine-aa.ll
index fc639c0bc05db5b70437d02465d500d763bf89b0..5a577516fb47d534eb54a3e760437eca2223749b 100644
--- a/test/Transforms/LoadCombine/load-combine-aa.ll
+++ b/test/Transforms/LoadCombine/load-combine-aa.ll
@@ -1,4 +1,4 @@
-; RUN: opt -basicaa -load-combine -instcombine -S < %s | FileCheck %s
+; RUN: opt -basicaa -load-combine -S < %s | FileCheck %s
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
@@ -22,6 +22,7 @@ define i64 @test1(i32* nocapture readonly noalias %a, i32* nocapture readonly no
 define i64 @test2(i32* nocapture readonly %a, i32* nocapture readonly %b) {
 ; CHECK-LABEL: @test2
 
+; CHECK-NOT: load i64
 ; CHECK: load i32, i32*
 ; CHECK: load i32, i32*
 ; CHECK: ret i64
@@ -37,3 +38,26 @@ define i64 @test2(i32* nocapture readonly %a, i32* nocapture readonly %b) {
   ret i64 %add
 }
 
+%rec11 = type { i16, i16, i16 }
+@str = global %rec11 { i16 1, i16 2, i16 3 }
+
+; PR31517 - Check that loads which span an aliasing store are not combined.
+define i16 @test3() {
+; CHECK-LABEL: @test3
+
+; CHECK-NOT: load i32
+; CHECK: load i16, i16*
+; CHECK: store i16
+; CHECK: load i16, i16*
+; CHECK: ret i16
+
+  %_tmp9 = getelementptr %rec11, %rec11* @str, i16 0, i32 1
+  %_tmp10 = load i16, i16* %_tmp9
+  %_tmp12 = getelementptr %rec11, %rec11* @str, i16 0, i32 0
+  store i16 %_tmp10, i16* %_tmp12
+  %_tmp13 = getelementptr %rec11, %rec11* @str, i16 0, i32 0
+  %_tmp14 = load i16, i16* %_tmp13
+  %_tmp15 = icmp eq i16 %_tmp14, 3
+  %_tmp16 = select i1 %_tmp15, i16 1, i16 0
+  ret i16 %_tmp16
+}
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
index e6904ee50bcaeed8dabfb36f6c3730f6c59a57dc..4b2dab47a20f0c6996ec55da10fe918edfe1fa1a 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/aa-metadata.ll
@@ -15,7 +15,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
 ; NOSCOPE: load float
 ; NOSCOPE: store float
 ; NOSCOPE: store float
-define void @vectorize_alias_scope(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
+define amdgpu_kernel void @vectorize_alias_scope(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
 entry:
   %a.idx.1 = getelementptr inbounds float, float addrspace(1)* %a, i64 1
   store float 0.0, float addrspace(1)* %a, align 4, !noalias !0
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
index 4369dafa425893268b159098c1eacba439507c77..368dc6ab361ea104fdfed4c5891420c6c97e4925 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/adjust-alloca-alignment.ll
@@ -10,7 +10,7 @@ target triple = "amdgcn--"
 
 ; ALIGNED: load i8, i8* %ptr0, align 1{{$}}
 ; ALIGNED: load i8, i8* %ptr1, align 1{{$}}
-define void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i8], align 1
   %ptr0 = getelementptr inbounds [128 x i8], [128 x i8]* %alloca, i32 0, i32 %offset
   %val0 = load i8, i8* %ptr0, align 1
@@ -27,7 +27,7 @@ define void @load_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %o
 
 ; ALIGNED: load i16, i16* %ptr0, align 1{{$}}
 ; ALIGNED: load i16, i16* %ptr1, align 1{{$}}
-define void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i16], align 1
   %ptr0 = getelementptr inbounds [128 x i16], [128 x i16]* %alloca, i32 0, i32 %offset
   %val0 = load i16, i16* %ptr0, align 1
@@ -47,7 +47,7 @@ define void @load_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32
 
 ; ALIGNED: load i32, i32* %ptr0, align 1
 ; ALIGNED: load i32, i32* %ptr1, align 1
-define void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i32], align 1
   %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
   %val0 = load i32, i32* %ptr0, align 1
@@ -64,8 +64,11 @@ define void @load_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32
 ; ALL: alloca [128 x i32], align 16
 
 ; UNALIGNED: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 1{{$}}
-; ALIGNED: load <2 x i32>, <2 x i32>* %{{[0-9]+}}, align 4{{$}}
-define void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
+
+; FIXME: Should change alignment
+; ALIGNED: load i32
+; ALIGNED: load i32
+define amdgpu_kernel void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i32], align 16
   %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
   %val0 = load i32, i32* %ptr0, align 1
@@ -82,7 +85,7 @@ define void @load_alloca16_unknown_offset_align1_i32(i32 addrspace(1)* noalias %
 
 ; ALIGNED: store i8 9, i8* %ptr0, align 1{{$}}
 ; ALIGNED: store i8 10, i8* %ptr1, align 1{{$}}
-define void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i8], align 1
   %ptr0 = getelementptr inbounds [128 x i8], [128 x i8]* %alloca, i32 0, i32 %offset
   store i8 9, i8* %ptr0, align 1
@@ -97,7 +100,7 @@ define void @store_unknown_offset_align1_i8(i8 addrspace(1)* noalias %out, i32 %
 
 ; ALIGNED: store i16 9, i16* %ptr0, align 1{{$}}
 ; ALIGNED: store i16 10, i16* %ptr1, align 1{{$}}
-define void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i16], align 1
   %ptr0 = getelementptr inbounds [128 x i16], [128 x i16]* %alloca, i32 0, i32 %offset
   store i16 9, i16* %ptr0, align 1
@@ -116,7 +119,7 @@ define void @store_unknown_offset_align1_i16(i16 addrspace(1)* noalias %out, i32
 
 ; ALIGNED: store i32 9, i32* %ptr0, align 1
 ; ALIGNED: store i32 10, i32* %ptr1, align 1
-define void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
+define amdgpu_kernel void @store_unknown_offset_align1_i32(i32 addrspace(1)* noalias %out, i32 %offset) #0 {
   %alloca = alloca [128 x i32], align 1
   %ptr0 = getelementptr inbounds [128 x i32], [128 x i32]* %alloca, i32 0, i32 %offset
   store i32 9, i32* %ptr0, align 1
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
index 25abb98c6ebdbf9f13238ca380bb27bfa2e444e4..8a75b8743fa589d336df7691c86e6214e2015b88 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/extended-index.ll
@@ -8,7 +8,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 ; CHECK: sext i32 %id.x to i64
 ; CHECK: load <2 x float>
 ; CHECK: store <2 x float> zeroinitializer
-define void @basic_merge_sext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
+define amdgpu_kernel void @basic_merge_sext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
 entry:
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %sext.id.x = sext i32 %id.x to i64
@@ -32,7 +32,7 @@ entry:
 ; CHECK: zext i32 %id.x to i64
 ; CHECK: load <2 x float>
 ; CHECK: store <2 x float>
-define void @basic_merge_zext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
+define amdgpu_kernel void @basic_merge_zext_index(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c) #0 {
 entry:
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %zext.id.x = zext i32 %id.x to i64
@@ -54,7 +54,7 @@ entry:
 ; CHECK-LABEL: @merge_op_zext_index(
 ; CHECK: load <2 x float>
 ; CHECK: store <2 x float>
-define void @merge_op_zext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
+define amdgpu_kernel void @merge_op_zext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
 entry:
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %shl = shl i32 %id.x, 2
@@ -81,7 +81,7 @@ entry:
 ; CHECK-LABEL: @merge_op_sext_index(
 ; CHECK: load <2 x float>
 ; CHECK: store <2 x float>
-define void @merge_op_sext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
+define amdgpu_kernel void @merge_op_sext_index(float addrspace(1)* nocapture noalias %a, float addrspace(1)* nocapture noalias %b, float addrspace(1)* nocapture readonly noalias %c) #0 {
 entry:
   %id.x = call i32 @llvm.amdgcn.workitem.id.x()
   %shl = shl i32 %id.x, 2
@@ -112,7 +112,7 @@ entry:
 ; CHECK: loop:
 ; CHECK: load <2 x i32>
 ; CHECK: store <2 x i32>
-define void @zext_trunc_phi_1(i32 addrspace(1)* nocapture noalias %a, i32 addrspace(1)* nocapture noalias %b, i32 addrspace(1)* nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 {
+define amdgpu_kernel void @zext_trunc_phi_1(i32 addrspace(1)* nocapture noalias %a, i32 addrspace(1)* nocapture noalias %b, i32 addrspace(1)* nocapture readonly noalias %c, i32 %n, i64 %arst, i64 %aoeu) #0 {
 entry:
   %cmp0 = icmp eq i32 %n, 0
   br i1 %cmp0, label %exit, label %loop
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
index 2b2f9cbcf5083be9bb4837017acc2044a2d9e971..6182c09abcfe2388cee7a8cd88fa477e02f0df5a 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/insertion-point.ll
@@ -11,7 +11,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
 ; CHECK: load <2 x float>
 ; CHECK: %w = add i32 %y, 9
 ; CHECK: %foo = add i32 %z, %w
-define void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @insert_load_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
 entry:
   %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
   %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx
@@ -38,7 +38,7 @@ entry:
 ; CHECK: %w = add i32 %y, 9
 ; CHECK: store <2 x float>
 ; CHECK: %foo = add i32 %z, %w
-define void @insert_store_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
+define amdgpu_kernel void @insert_store_point(float addrspace(1)* nocapture %a, float addrspace(1)* nocapture %b, float addrspace(1)* nocapture readonly %c, i64 %idx, i32 %x, i32 %y) #0 {
 entry:
   %a.idx.x = getelementptr inbounds float, float addrspace(1)* %a, i64 %idx
   %c.idx.x = getelementptr inbounds float, float addrspace(1)* %c, i64 %idx
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll
index 4d6240a9aa9d1b49d27c90bcc9c24d0de3e4177c..3f6d7ee7dcacaa4e68398da65b672d0d85d7a671 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/interleaved-mayalias-store.ll
@@ -8,7 +8,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
 ; CHECK: store double 0.000000e+00, double addrspace(1)* %a,
 ; CHECK: load double
 ; CHECK: store double 0.000000e+00, double addrspace(1)* %a.idx.1
-define void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 {
+define amdgpu_kernel void @interleave(double addrspace(1)* nocapture %a, double addrspace(1)* nocapture %b, double addrspace(1)* nocapture readonly %c) #0 {
 entry:
   %a.idx.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
   %c.idx.1 = getelementptr inbounds double, double addrspace(1)* %c, i64 1
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
index fd0aaa615db08194b7ba4c639a7e54a8e6e2432e..0fcdc7b9083a5eea09298d788bed52dfc343eb92 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores-private.ll
@@ -1,8 +1,9 @@
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ALIGNED,ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ALIGNED,ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT8-UNALIGNED -check-prefix=ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16 -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ALIGNED,ALL %s
-; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefix=ELT16-UNALIGNED -check-prefix=ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,-unaligned-scratch-access  -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-ALIGNED,ALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,-unaligned-scratch-access  -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-ALIGNED,ALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,-unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-ALIGNED,ALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-4,+unaligned-scratch-access  -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT4,ELT4-UNALIGNED,UNALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-8,+unaligned-scratch-access  -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT8,ELT8-UNALIGNED,UNALIGNED,ALL %s
+; RUN: opt -mtriple=amdgcn-amd-amdhsa -mattr=+max-private-element-size-16,+unaligned-scratch-access -load-store-vectorizer -S -o - %s | FileCheck -check-prefixes=ELT16,ELT16-UNALIGNED,UNALIGNED,ALL %s
 
 target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
 
@@ -16,7 +17,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
 ; ELT8-UNALIGNED: store <2 x i32>
 
 ; ELT16-UNALIGNED: store <4 x i32>
-define void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 {
   %out.gep.1 = getelementptr i32, i32* %out, i32 1
   %out.gep.2 = getelementptr i32, i32* %out, i32 2
   %out.gep.3 = getelementptr i32, i32* %out, i32 3
@@ -28,9 +29,63 @@ define void @merge_private_store_4_vector_elts_loads_v4i32(i32* %out) #0 {
   ret void
 }
 
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align1(
+; ALIGNED: store i32 9, i32* %out, align 1
+; ALIGNED: store i32 1, i32* %out.gep.1, align 1
+; ALIGNED: store i32 23, i32* %out.gep.2, align 1
+; ALIGNED: store i32 19, i32* %out.gep.3, align 1
+
+; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32>* %1, align 1
+
+; ELT8-UNALIGNED: store <2 x i32> <i32 9, i32 1>, <2 x i32>* %1, align 1
+; ELT8-UNALIGNED: store <2 x i32> <i32 23, i32 19>, <2 x i32>* %2, align 1
+
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align1(i32* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32* %out, i32 3
+
+  store i32 9, i32* %out, align 1
+  store i32 1, i32* %out.gep.1, align 1
+  store i32 23, i32* %out.gep.2, align 1
+  store i32 19, i32* %out.gep.3, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i32_align2(
+; ALIGNED: store i32 9, i32* %out, align 2
+; ALIGNED: store i32 1, i32* %out.gep.1, align 2
+; ALIGNED: store i32 23, i32* %out.gep.2, align 2
+; ALIGNED: store i32 19, i32* %out.gep.3, align 2
+
+; ELT16-UNALIGNED: store <4 x i32> <i32 9, i32 1, i32 23, i32 19>, <4 x i32>* %1, align 2
+
+; ELT8-UNALIGNED: store <2 x i32>
+; ELT8-UNALIGNED: store <2 x i32>
+
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i32_align2(i32* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32* %out, i32 2
+  %out.gep.3 = getelementptr i32, i32* %out, i32 3
+
+  store i32 9, i32* %out, align 2
+  store i32 1, i32* %out.gep.1, align 2
+  store i32 23, i32* %out.gep.2, align 2
+  store i32 19, i32* %out.gep.3, align 2
+  ret void
+}
+
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8(
 ; ALL: store <4 x i8>
-define void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 {
   %out.gep.1 = getelementptr i8, i8* %out, i32 1
   %out.gep.2 = getelementptr i8, i8* %out, i32 2
   %out.gep.3 = getelementptr i8, i8* %out, i32 3
@@ -42,9 +97,28 @@ define void @merge_private_store_4_vector_elts_loads_v4i8(i8* %out) #0 {
   ret void
 }
 
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v4i8_align1(
+; ALIGNED: store i8
+; ALIGNED: store i8
+; ALIGNED: store i8
+; ALIGNED: store i8
+
+; UNALIGNED: store <4 x i8> <i8 9, i8 1, i8 23, i8 19>, <4 x i8>* %1, align 1
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v4i8_align1(i8* %out) #0 {
+  %out.gep.1 = getelementptr i8, i8* %out, i32 1
+  %out.gep.2 = getelementptr i8, i8* %out, i32 2
+  %out.gep.3 = getelementptr i8, i8* %out, i32 3
+
+  store i8 9, i8* %out, align 1
+  store i8 1, i8* %out.gep.1, align 1
+  store i8 23, i8* %out.gep.2, align 1
+  store i8 19, i8* %out.gep.3, align 1
+  ret void
+}
+
 ; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16(
 ; ALL: store <2 x i16>
-define void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 {
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 {
   %out.gep.1 = getelementptr i16, i16* %out, i32 1
 
   store i16 9, i16* %out, align 4
@@ -52,4 +126,106 @@ define void @merge_private_store_4_vector_elts_loads_v2i16(i16* %out) #0 {
   ret void
 }
 
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align2(
+; ALIGNED: store i16
+; ALIGNED: store i16
+
+; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 2
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align2(i16* %out) #0 {
+  %out.gep.1 = getelementptr i16, i16* %out, i32 1
+
+  store i16 9, i16* %out, align 2
+  store i16 12, i16* %out.gep.1, align 2
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align1(
+; ALIGNED: store i16
+; ALIGNED: store i16
+
+; UNALIGNED: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 1
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align1(i16* %out) #0 {
+  %out.gep.1 = getelementptr i16, i16* %out, i32 1
+
+  store i16 9, i16* %out, align 1
+  store i16 12, i16* %out.gep.1, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_4_vector_elts_loads_v2i16_align8(
+; ALL: store <2 x i16> <i16 9, i16 12>, <2 x i16>* %1, align 8
+define amdgpu_kernel void @merge_private_store_4_vector_elts_loads_v2i16_align8(i16* %out) #0 {
+  %out.gep.1 = getelementptr i16, i16* %out, i32 1
+
+  store i16 9, i16* %out, align 8
+  store i16 12, i16* %out.gep.1, align 2
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32(
+; ELT4: store i32
+; ELT4: store i32
+; ELT4: store i32
+
+; ELT8-ALIGNED: store i32
+; ELT8-ALIGNED: store i32
+; ELT8-ALIGNED: store i32
+
+; ELT8-UNALIGNED: store <2 x i32>
+; ELT8-UNALIGNED: store i32
+
+; ELT16-ALIGNED: store i32
+; ELT16-ALIGNED: store i32
+; ELT16-ALIGNED: store i32
+
+; ELT16-UNALIGNED: store <3 x i32>
+define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32(i32* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32* %out, i32 2
+
+  store i32 9, i32* %out
+  store i32 1, i32* %out.gep.1
+  store i32 23, i32* %out.gep.2
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i32_align1(
+; ALIGNED: store i32
+; ALIGNED: store i32
+; ALIGNED: store i32
+
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+; ELT4-UNALIGNED: store i32
+
+; ELT8-UNALIGNED: store <2 x i32>
+; ELT8-UNALIGNED: store i32
+
+; ELT16-UNALIGNED: store <3 x i32>
+define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i32_align1(i32* %out) #0 {
+  %out.gep.1 = getelementptr i32, i32* %out, i32 1
+  %out.gep.2 = getelementptr i32, i32* %out, i32 2
+
+  store i32 9, i32* %out, align 1
+  store i32 1, i32* %out.gep.1, align 1
+  store i32 23, i32* %out.gep.2, align 1
+  ret void
+}
+
+; ALL-LABEL: @merge_private_store_3_vector_elts_loads_v4i8_align1(
+; ALIGNED: store i8
+; ALIGNED: store i8
+; ALIGNED: store i8
+
+; UNALIGNED: store <3 x i8>
+define amdgpu_kernel void @merge_private_store_3_vector_elts_loads_v4i8_align1(i8* %out) #0 {
+  %out.gep.1 = getelementptr i8, i8* %out, i8 1
+  %out.gep.2 = getelementptr i8, i8* %out, i8 2
+
+  store i8 9, i8* %out, align 1
+  store i8 1, i8* %out.gep.1, align 1
+  store i8 23, i8* %out.gep.2, align 1
+  ret void
+}
+
 attributes #0 = { nounwind }
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
index d32387fa2c06e1565810a64bb4bc0e30d2fb6ca3..dbb7068eeae0d74c9c25337c93a258e2858d8df4 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-stores.ll
@@ -10,7 +10,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i8(
 ; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(1)* %{{[0-9]+}}, align 2
-define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
 
   store i8 123, i8 addrspace(1)* %out.gep.1
@@ -20,7 +20,7 @@ define void @merge_global_store_2_constants_i8(i8 addrspace(1)* %out) #0 {
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i8_natural_align
 ; CHECK: store <2 x i8>
-define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
 
   store i8 123, i8 addrspace(1)* %out.gep.1
@@ -30,7 +30,7 @@ define void @merge_global_store_2_constants_i8_natural_align(i8 addrspace(1)* %o
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i16
 ; CHECK: store <2 x i16> <i16 456, i16 123>, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
 
   store i16 123, i16 addrspace(1)* %out.gep.1
@@ -40,7 +40,7 @@ define void @merge_global_store_2_constants_i16(i16 addrspace(1)* %out) #0 {
 
 ; CHECK-LABEL: @merge_global_store_2_constants_0_i16
 ; CHECK: store <2 x i16> zeroinitializer, <2 x i16> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
 
   store i16 0, i16 addrspace(1)* %out.gep.1
@@ -50,7 +50,7 @@ define void @merge_global_store_2_constants_0_i16(i16 addrspace(1)* %out) #0 {
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i16_natural_align
 ; CHECK: store <2 x i16>
-define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i16, i16 addrspace(1)* %out, i32 1
 
   store i16 123, i16 addrspace(1)* %out.gep.1
@@ -60,7 +60,7 @@ define void @merge_global_store_2_constants_i16_natural_align(i16 addrspace(1)*
 
 ; CHECK-LABEL: @merge_global_store_2_constants_half_natural_align
 ; CHECK: store <2 x half>
-define void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_half_natural_align(half addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr half, half addrspace(1)* %out, i32 1
 
   store half 2.0, half addrspace(1)* %out.gep.1
@@ -70,7 +70,7 @@ define void @merge_global_store_2_constants_half_natural_align(half addrspace(1)
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i32
 ; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 
   store i32 123, i32 addrspace(1)* %out.gep.1
@@ -80,7 +80,7 @@ define void @merge_global_store_2_constants_i32(i32 addrspace(1)* %out) #0 {
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i32_f32
 ; CHECK: store <2 x i32> <i32 456, i32 1065353216>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.1.bc = bitcast i32 addrspace(1)* %out.gep.1 to float addrspace(1)*
   store float 1.0, float addrspace(1)* %out.gep.1.bc
@@ -90,7 +90,7 @@ define void @merge_global_store_2_constants_i32_f32(i32 addrspace(1)* %out) #0 {
 
 ; CHECK-LABEL: @merge_global_store_2_constants_f32_i32
 ; CHECK  store <2 x float> <float 4.000000e+00, float 0x370EC00000000000>, <2 x float> addrspace(1)* %{{[0-9]+$}}
-define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.1.bc = bitcast float addrspace(1)* %out.gep.1 to i32 addrspace(1)*
   store i32 123, i32 addrspace(1)* %out.gep.1.bc
@@ -100,7 +100,7 @@ define void @merge_global_store_2_constants_f32_i32(float addrspace(1)* %out) #0
 
 ; CHECK-LABEL: @merge_global_store_4_constants_i32
 ; CHECK: store <4 x i32> <i32 1234, i32 123, i32 456, i32 333>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -114,7 +114,7 @@ define void @merge_global_store_4_constants_i32(i32 addrspace(1)* %out) #0 {
 
 ; CHECK-LABEL: @merge_global_store_4_constants_f32_order
 ; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}
-define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -129,7 +129,7 @@ define void @merge_global_store_4_constants_f32_order(float addrspace(1)* %out)
 ; First store is out of order.
 ; CHECK-LABEL: @merge_global_store_4_constants_f32
 ; CHECK: store <4 x float> <float 8.000000e+00, float 1.000000e+00, float 2.000000e+00, float 4.000000e+00>, <4 x float> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -143,7 +143,7 @@ define void @merge_global_store_4_constants_f32(float addrspace(1)* %out) #0 {
 
 ; CHECK-LABEL: @merge_global_store_4_constants_mixed_i32_f32
 ; CHECK: store <4 x i32> <i32 1090519040, i32 11, i32 1073741824, i32 17>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -160,7 +160,7 @@ define void @merge_global_store_4_constants_mixed_i32_f32(float addrspace(1)* %o
 
 ; CHECK-LABEL: @merge_global_store_3_constants_i32
 ; CHECK: store <3 x i32> <i32 1234, i32 123, i32 456>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
 
@@ -172,7 +172,7 @@ define void @merge_global_store_3_constants_i32(i32 addrspace(1)* %out) #0 {
 
 ; CHECK-LABEL: @merge_global_store_2_constants_i64
 ; CHECK: store <2 x i64> <i64 456, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
-define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
 
   store i64 123, i64 addrspace(1)* %out.gep.1
@@ -183,7 +183,7 @@ define void @merge_global_store_2_constants_i64(i64 addrspace(1)* %out) #0 {
 ; CHECK-LABEL: @merge_global_store_4_constants_i64
 ; CHECK: store <2 x i64> <i64 456, i64 333>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
 ; CHECK: store <2 x i64> <i64 1234, i64 123>, <2 x i64> addrspace(1)* %{{[0-9]+}}, align 8
-define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i64, i64 addrspace(1)* %out, i64 1
   %out.gep.2 = getelementptr i64, i64 addrspace(1)* %out, i64 2
   %out.gep.3 = getelementptr i64, i64 addrspace(1)* %out, i64 3
@@ -202,7 +202,7 @@ define void @merge_global_store_4_constants_i64(i64 addrspace(1)* %out) #0 {
 ; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT0]], i32 0
 ; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT1]], i32 1
 ; CHECK: store <2 x i32> [[INSERT1]]
-define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 
@@ -220,7 +220,7 @@ define void @merge_global_store_2_adjacent_loads_i32(i32 addrspace(1)* %out, i32
 ; CHECK: insertelement
 ; CHECK: insertelement
 ; CHECK: store <2 x i32>
-define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 2
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 3
 
@@ -241,7 +241,7 @@ define void @merge_global_store_2_adjacent_loads_i32_nonzero_base(i32 addrspace(
 ; CHECK: [[INSERT0:%[^ ]+]] = insertelement <2 x i32> undef, i32 [[ELT1]], i32 0
 ; CHECK: [[INSERT1:%[^ ]+]] = insertelement <2 x i32> [[INSERT0]], i32 [[ELT0]], i32 1
 ; CHECK: store <2 x i32> [[INSERT1]]
-define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
 
@@ -256,7 +256,7 @@ define void @merge_global_store_2_adjacent_loads_shuffle_i32(i32 addrspace(1)* %
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32
 ; CHECK: load <4 x i32>
 ; CHECK: store <4 x i32>
-define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -279,7 +279,7 @@ define void @merge_global_store_4_adjacent_loads_i32(i32 addrspace(1)* %out, i32
 ; CHECK-LABEL: @merge_global_store_3_adjacent_loads_i32
 ; CHECK: load <3 x i32>
 ; CHECK: store <3 x i32>
-define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 1
@@ -298,7 +298,7 @@ define void @merge_global_store_3_adjacent_loads_i32(i32 addrspace(1)* %out, i32
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_f32
 ; CHECK: load <4 x float>
 ; CHECK: store <4 x float>
-define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, float addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr float, float addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr float, float addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr float, float addrspace(1)* %out, i32 3
@@ -321,7 +321,7 @@ define void @merge_global_store_4_adjacent_loads_f32(float addrspace(1)* %out, f
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i32_nonzero_base
 ; CHECK: load <4 x i32>
 ; CHECK: store <4 x i32>
-define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %in.gep.0 = getelementptr i32, i32 addrspace(1)* %in, i32 11
   %in.gep.1 = getelementptr i32, i32 addrspace(1)* %in, i32 12
   %in.gep.2 = getelementptr i32, i32 addrspace(1)* %in, i32 13
@@ -346,7 +346,7 @@ define void @merge_global_store_4_adjacent_loads_i32_nonzero_base(i32 addrspace(
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_inverse_i32
 ; CHECK: load <4 x i32>
 ; CHECK: store <4 x i32>
-define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -373,7 +373,7 @@ define void @merge_global_store_4_adjacent_loads_inverse_i32(i32 addrspace(1)* %
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_shuffle_i32
 ; CHECK: load <4 x i32>
 ; CHECK: store <4 x i32>
-define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %out, i32 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -408,7 +408,7 @@ define void @merge_global_store_4_adjacent_loads_shuffle_i32(i32 addrspace(1)* %
 ; CHECK: insertelement <4 x i8>
 ; CHECK: insertelement <4 x i8>
 ; CHECK: store <4 x i8>
-define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
@@ -431,7 +431,7 @@ define void @merge_global_store_4_adjacent_loads_i8(i8 addrspace(1)* %out, i8 ad
 ; CHECK-LABEL: @merge_global_store_4_adjacent_loads_i8_natural_align
 ; CHECK: load <4 x i8>
 ; CHECK: store <4 x i8>
-define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1)* %out, i8 addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i8 1
   %out.gep.2 = getelementptr i8, i8 addrspace(1)* %out, i8 2
   %out.gep.3 = getelementptr i8, i8 addrspace(1)* %out, i8 3
@@ -454,7 +454,7 @@ define void @merge_global_store_4_adjacent_loads_i8_natural_align(i8 addrspace(1
 ; CHECK-LABEL: @merge_global_store_4_vector_elts_loads_v4i32
 ; CHECK: load <4 x i32>
 ; CHECK: store <4 x i32>
-define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
+define amdgpu_kernel void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %in) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
@@ -474,7 +474,7 @@ define void @merge_global_store_4_vector_elts_loads_v4i32(i32 addrspace(1)* %out
 
 ; CHECK-LABEL: @merge_local_store_2_constants_i8
 ; CHECK: store <2 x i8> <i8 -56, i8 123>, <2 x i8> addrspace(3)* %{{[0-9]+}}, align 2
-define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i8, i8 addrspace(3)* %out, i32 1
 
   store i8 123, i8 addrspace(3)* %out.gep.1
@@ -484,7 +484,7 @@ define void @merge_local_store_2_constants_i8(i8 addrspace(3)* %out) #0 {
 
 ; CHECK-LABEL: @merge_local_store_2_constants_i32
 ; CHECK: store <2 x i32> <i32 456, i32 123>, <2 x i32> addrspace(3)* %{{[0-9]+}}, align 4
-define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 
   store i32 123, i32 addrspace(3)* %out.gep.1
@@ -495,7 +495,7 @@ define void @merge_local_store_2_constants_i32(i32 addrspace(3)* %out) #0 {
 ; CHECK-LABEL: @merge_local_store_2_constants_i32_align_2
 ; CHECK: store i32
 ; CHECK: store i32
-define void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
 
   store i32 123, i32 addrspace(3)* %out.gep.1, align 2
@@ -506,7 +506,7 @@ define void @merge_local_store_2_constants_i32_align_2(i32 addrspace(3)* %out) #
 ; CHECK-LABEL: @merge_local_store_4_constants_i32
 ; CHECK: store <2 x i32> <i32 456, i32 333>, <2 x i32> addrspace(3)*
 ; CHECK: store <2 x i32> <i32 1234, i32 123>, <2 x i32> addrspace(3)*
-define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
+define amdgpu_kernel void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(3)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(3)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(3)* %out, i32 3
@@ -521,7 +521,7 @@ define void @merge_local_store_4_constants_i32(i32 addrspace(3)* %out) #0 {
 ; CHECK-LABEL: @merge_global_store_5_constants_i32
 ; CHECK: store <4 x i32> <i32 9, i32 12, i32 16, i32 -12>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 ; CHECK: store i32
-define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
   store i32 9, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 12, i32 addrspace(1)* %idx1, align 4
@@ -537,7 +537,7 @@ define void @merge_global_store_5_constants_i32(i32 addrspace(1)* %out) {
 ; CHECK-LABEL: @merge_global_store_6_constants_i32
 ; CHECK: store <4 x i32> <i32 13, i32 15, i32 62, i32 63>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 ; CHECK: store <2 x i32> <i32 11, i32 123>, <2 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
   store i32 13, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 15, i32 addrspace(1)* %idx1, align 4
@@ -555,7 +555,7 @@ define void @merge_global_store_6_constants_i32(i32 addrspace(1)* %out) {
 ; CHECK-LABEL: @merge_global_store_7_constants_i32
 ; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 ; CHECK: store <3 x i32> <i32 98, i32 91, i32 212>, <3 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
   store i32 34, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 999, i32 addrspace(1)* %idx1, align 4
@@ -575,7 +575,7 @@ define void @merge_global_store_7_constants_i32(i32 addrspace(1)* %out) {
 ; CHECK-LABEL: @merge_global_store_8_constants_i32
 ; CHECK: store <4 x i32> <i32 34, i32 999, i32 65, i32 33>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
 ; CHECK: store <4 x i32> <i32 98, i32 91, i32 212, i32 999>, <4 x i32> addrspace(1)* %{{[0-9]+}}, align 4
-define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
   store i32 34, i32 addrspace(1)* %out, align 4
   %idx1 = getelementptr inbounds i32, i32 addrspace(1)* %out, i64 1
   store i32 999, i32 addrspace(1)* %idx1, align 4
@@ -597,7 +597,7 @@ define void @merge_global_store_8_constants_i32(i32 addrspace(1)* %out) {
 ; CHECK-LABEL: @copy_v3i32_align4
 ; CHECK: %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
 ; CHECK: store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
-define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x i32>, <3 x i32> addrspace(1)* %in, align 4
   store <3 x i32> %vec, <3 x i32> addrspace(1)* %out
   ret void
@@ -606,7 +606,7 @@ define void @copy_v3i32_align4(<3 x i32> addrspace(1)* noalias %out, <3 x i32> a
 ; CHECK-LABEL: @copy_v3i64_align4
 ; CHECK: %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
 ; CHECK: store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
-define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x i64>, <3 x i64> addrspace(1)* %in, align 4
   store <3 x i64> %vec, <3 x i64> addrspace(1)* %out
   ret void
@@ -615,7 +615,7 @@ define void @copy_v3i64_align4(<3 x i64> addrspace(1)* noalias %out, <3 x i64> a
 ; CHECK-LABEL: @copy_v3f32_align4
 ; CHECK: %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
 ; CHECK: store <3 x float>
-define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x float> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x float>, <3 x float> addrspace(1)* %in, align 4
   %fadd = fadd <3 x float> %vec, <float 1.0, float 2.0, float 4.0>
   store <3 x float> %fadd, <3 x float> addrspace(1)* %out
@@ -625,7 +625,7 @@ define void @copy_v3f32_align4(<3 x float> addrspace(1)* noalias %out, <3 x floa
 ; CHECK-LABEL: @copy_v3f64_align4
 ; CHECK: %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
 ; CHECK: store <3 x double> %fadd, <3 x double> addrspace(1)* %out
-define void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
+define amdgpu_kernel void @copy_v3f64_align4(<3 x double> addrspace(1)* noalias %out, <3 x double> addrspace(1)* noalias %in) #0 {
   %vec = load <3 x double>, <3 x double> addrspace(1)* %in, align 4
   %fadd = fadd <3 x double> %vec, <double 1.0, double 2.0, double 4.0>
   store <3 x double> %fadd, <3 x double> addrspace(1)* %out
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
index 8885d61014fc6788e2995c090e7bc67d33167714..226147df66a6190ce5154b03ab42495406487a09 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/merge-vectors.ll
@@ -5,7 +5,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
 ; CHECK-LABEL: @merge_v2i32_v2i32(
 ; CHECK: load <4 x i32>
 ; CHECK: store <4 x i32> zeroinitializer
-define void @merge_v2i32_v2i32(<2 x i32> addrspace(1)* nocapture %a, <2 x i32> addrspace(1)* nocapture readonly %b) #0 {
+define amdgpu_kernel void @merge_v2i32_v2i32(<2 x i32> addrspace(1)* nocapture %a, <2 x i32> addrspace(1)* nocapture readonly %b) #0 {
 entry:
   %a.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %a, i64 1
   %b.1 = getelementptr inbounds <2 x i32>, <2 x i32> addrspace(1)* %b, i64 1
@@ -22,7 +22,7 @@ entry:
 ; CHECK-LABEL: @merge_v1i32_v1i32(
 ; CHECK: load <2 x i32>
 ; CHECK: store <2 x i32> zeroinitializer
-define void @merge_v1i32_v1i32(<1 x i32> addrspace(1)* nocapture %a, <1 x i32> addrspace(1)* nocapture readonly %b) #0 {
+define amdgpu_kernel void @merge_v1i32_v1i32(<1 x i32> addrspace(1)* nocapture %a, <1 x i32> addrspace(1)* nocapture readonly %b) #0 {
 entry:
   %a.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %a, i64 1
   %b.1 = getelementptr inbounds <1 x i32>, <1 x i32> addrspace(1)* %b, i64 1
@@ -41,7 +41,7 @@ entry:
 ; CHECK: load <3 x i32>
 ; CHECK: store <3 x i32> zeroinitializer
 ; CHECK: store <3 x i32> zeroinitializer
-define void @no_merge_v3i32_v3i32(<3 x i32> addrspace(1)* nocapture %a, <3 x i32> addrspace(1)* nocapture readonly %b) #0 {
+define amdgpu_kernel void @no_merge_v3i32_v3i32(<3 x i32> addrspace(1)* nocapture %a, <3 x i32> addrspace(1)* nocapture readonly %b) #0 {
 entry:
   %a.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %a, i64 1
   %b.1 = getelementptr inbounds <3 x i32>, <3 x i32> addrspace(1)* %b, i64 1
@@ -58,7 +58,7 @@ entry:
 ; CHECK-LABEL: @merge_v2i16_v2i16(
 ; CHECK: load <4 x i16>
 ; CHECK: store <4 x i16> zeroinitializer
-define void @merge_v2i16_v2i16(<2 x i16> addrspace(1)* nocapture %a, <2 x i16> addrspace(1)* nocapture readonly %b) #0 {
+define amdgpu_kernel void @merge_v2i16_v2i16(<2 x i16> addrspace(1)* nocapture %a, <2 x i16> addrspace(1)* nocapture readonly %b) #0 {
 entry:
   %a.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %a, i64 1
   %b.1 = getelementptr inbounds <2 x i16>, <2 x i16> addrspace(1)* %b, i64 1
@@ -76,7 +76,7 @@ entry:
 ; CHECK-LABEL: @merge_load_i32_v2i16(
 ; CHECK: load i32,
 ; CHECK: load <2 x i16>
-define void @merge_load_i32_v2i16(i32 addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_i32_v2i16(i32 addrspace(1)* nocapture %a) #0 {
 entry:
   %a.1 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 1
   %a.1.cast = bitcast i32 addrspace(1)* %a.1 to <2 x i16> addrspace(1)*
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
index ba792f7835335ee66d4301d3fdc0e0a92128b260..f353106607d6864632048ce32c298bb245facfc9 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/missing-alignment.ll
@@ -7,7 +7,7 @@
 
 ; CHECK-LABEL: @load_keep_base_alignment_missing_align(
 ; CHECK: load <2 x float>, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4
-define void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) {
+define amdgpu_kernel void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) {
   %ptr0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 11
   %val0 = load float, float addrspace(3)* %ptr0
 
@@ -21,7 +21,7 @@ define void @load_keep_base_alignment_missing_align(float addrspace(1)* %out) {
 
 ; CHECK-LABEL: @store_keep_base_alignment_missing_align(
 ; CHECK: store <2 x float> zeroinitializer, <2 x float> addrspace(3)* %{{[0-9]+}}, align 4
-define void @store_keep_base_alignment_missing_align() {
+define amdgpu_kernel void @store_keep_base_alignment_missing_align() {
   %arrayidx0 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 1
   %arrayidx1 = getelementptr inbounds [512 x float], [512 x float] addrspace(3)* @lds, i32 0, i32 2
   store float 0.0, float addrspace(3)* %arrayidx0
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
index 88eca363902f67a4f0b147519e9376d7ac8b7fb0..8a78f3d7e9bc106df878de42bbf06c1a6e4ec4c1 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/multiple_tails.ll
@@ -11,7 +11,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:
 ; CHECK: store i32 0
 ; CHECK: store i32 0
 
-define void @no_crash(i32 %arg) {
+define amdgpu_kernel void @no_crash(i32 %arg) {
   %tmp2 = add i32 %arg, 14
   %tmp3 = getelementptr [16384 x i32], [16384 x i32] addrspace(3)* @0, i32 0, i32 %tmp2
   %tmp4 = add i32 %arg, 15
@@ -37,7 +37,7 @@ define void @no_crash(i32 %arg) {
 ; CHECK: load i32
 ; CHECK: load i32
 
-define void @interleave_get_longest(i32 %arg) {
+define amdgpu_kernel void @interleave_get_longest(i32 %arg) {
   %a1 = add i32 %arg, 1
   %a2 = add i32 %arg, 2
   %a3 = add i32 %arg, 3
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll
index 4a429533df02ad16bb8ea277cc6c15c0d5ce9e4d..818189565b4ccd53d89978d17cb486bff8f49a41 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/no-implicit-float.ll
@@ -5,7 +5,7 @@
 ; CHECK: store i32
 ; CHECK: store i32
 ; CHECK: store i32
-define void @no_implicit_float(i32 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @no_implicit_float(i32 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
   %out.gep.2 = getelementptr i32, i32 addrspace(1)* %out, i32 2
   %out.gep.3 = getelementptr i32, i32 addrspace(1)* %out, i32 3
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll
index 141e20a1f83c2112e4fdce2b5a72d5b3fa4a4816..28d29f8e8139414e9e6bfbe3c911595f1486d00b 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/optnone.ll
@@ -3,7 +3,7 @@
 ; CHECK-LABEL: @optnone(
 ; CHECK: store i32
 ; CHECK: store i32
-define void @optnone(i32 addrspace(1)* %out) noinline optnone {
+define amdgpu_kernel void @optnone(i32 addrspace(1)* %out) noinline optnone {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 
   store i32 123, i32 addrspace(1)* %out.gep.1
@@ -13,7 +13,7 @@ define void @optnone(i32 addrspace(1)* %out) noinline optnone {
 
 ; CHECK-LABEL: @do_opt(
 ; CHECK: store <2 x i32>
-define void @do_opt(i32 addrspace(1)* %out) {
+define amdgpu_kernel void @do_opt(i32 addrspace(1)* %out) {
   %out.gep.1 = getelementptr i32, i32 addrspace(1)* %out, i32 1
 
   store i32 123, i32 addrspace(1)* %out.gep.1
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
index 202e988ea5f16f96bff536f721df12b59100f413..65200b95d5e62439ca04435cb8485b1dfa80b846 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/pointer-elements.ll
@@ -9,7 +9,7 @@ declare i32 @llvm.amdgcn.workitem.id.x() #1
 ; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)*
 ; CHECK: inttoptr i64 %{{[^ ]+}} to i8 addrspace(1)*
 ; CHECK: store <2 x i64> zeroinitializer
-define void @merge_v2p1i8(i8 addrspace(1)* addrspace(1)* nocapture %a, i8 addrspace(1)* addrspace(1)* nocapture readonly %b) #0 {
+define amdgpu_kernel void @merge_v2p1i8(i8 addrspace(1)* addrspace(1)* nocapture %a, i8 addrspace(1)* addrspace(1)* nocapture readonly %b) #0 {
 entry:
   %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1
   %b.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %b, i64 1
@@ -28,7 +28,7 @@ entry:
 ; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)*
 ; CHECK: inttoptr i32 %{{[^ ]+}} to i8 addrspace(3)*
 ; CHECK: store <2 x i32> zeroinitializer
-define void @merge_v2p3i8(i8 addrspace(3)* addrspace(3)* nocapture %a, i8 addrspace(3)* addrspace(3)* nocapture readonly %b) #0 {
+define amdgpu_kernel void @merge_v2p3i8(i8 addrspace(3)* addrspace(3)* nocapture %a, i8 addrspace(3)* addrspace(3)* nocapture readonly %b) #0 {
 entry:
   %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i64 1
   %b.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %b, i64 1
@@ -46,7 +46,7 @@ entry:
 ; CHECK: load <2 x i64>
 ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
 ; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)*
-define void @merge_load_i64_ptr64(i64 addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_i64_ptr64(i64 addrspace(1)* nocapture %a) #0 {
 entry:
   %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
   %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)*
@@ -61,7 +61,7 @@ entry:
 ; CHECK: load <2 x i64>
 ; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 0
 ; CHECK: inttoptr i64 [[ELT0]] to i8 addrspace(1)*
-define void @merge_load_ptr64_i64(i64 addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_ptr64_i64(i64 addrspace(1)* nocapture %a) #0 {
 entry:
   %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
   %a.1 =  getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
@@ -76,7 +76,7 @@ entry:
 ; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr0 to i64
 ; CHECK: insertelement <2 x i64> undef, i64 [[ELT0]], i32 0
 ; CHECK: store <2 x i64>
-define void @merge_store_ptr64_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, i64 %val1) #0 {
+define amdgpu_kernel void @merge_store_ptr64_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, i64 %val1) #0 {
 entry:
   %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
   %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
@@ -92,7 +92,7 @@ entry:
 ; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64
 ; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1]], i32 1
 ; CHECK: store <2 x i64>
-define void @merge_store_i64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(1)* %ptr1) #0 {
+define amdgpu_kernel void @merge_store_i64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(1)* %ptr1) #0 {
 entry:
   %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1
   %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to i64 addrspace(1)*
@@ -107,7 +107,7 @@ entry:
 ; CHECK: load <2 x i32>
 ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 1
 ; CHECK: inttoptr i32 [[ELT1]] to i8 addrspace(3)*
-define void @merge_load_i32_ptr32(i32 addrspace(3)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_i32_ptr32(i32 addrspace(3)* nocapture %a) #0 {
 entry:
   %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1
   %a.1.cast = bitcast i32 addrspace(3)* %a.1 to i8 addrspace(3)* addrspace(3)*
@@ -122,7 +122,7 @@ entry:
 ; CHECK: load <2 x i32>
 ; CHECK: [[ELT0:%[^ ]+]] = extractelement <2 x i32> %{{[^ ]+}}, i32 0
 ; CHECK: inttoptr i32 [[ELT0]] to i8 addrspace(3)*
-define void @merge_load_ptr32_i32(i32 addrspace(3)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_ptr32_i32(i32 addrspace(3)* nocapture %a) #0 {
 entry:
   %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)*
   %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1
@@ -137,7 +137,7 @@ entry:
 ; CHECK: [[ELT0:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr0 to i32
 ; CHECK: insertelement <2 x i32> undef, i32 [[ELT0]], i32 0
 ; CHECK: store <2 x i32>
-define void @merge_store_ptr32_i32(i32 addrspace(3)* nocapture %a, i8 addrspace(3)* %ptr0, i32 %val1) #0 {
+define amdgpu_kernel void @merge_store_ptr32_i32(i32 addrspace(3)* nocapture %a, i8 addrspace(3)* %ptr0, i32 %val1) #0 {
 entry:
   %a.cast = bitcast i32 addrspace(3)* %a to i8 addrspace(3)* addrspace(3)*
   %a.1 = getelementptr inbounds i32, i32 addrspace(3)* %a, i32 1
@@ -152,7 +152,7 @@ entry:
 ; CHECK: [[ELT1:%[^ ]+]] = ptrtoint i8 addrspace(3)* %ptr1 to i32
 ; CHECK: insertelement <2 x i32> %{{[^ ]+}}, i32 [[ELT1]], i32 1
 ; CHECK: store <2 x i32>
-define void @merge_store_i32_ptr32(i8 addrspace(3)* addrspace(3)* nocapture %a, i32 %val0, i8 addrspace(3)* %ptr1) #0 {
+define amdgpu_kernel void @merge_store_i32_ptr32(i8 addrspace(3)* addrspace(3)* nocapture %a, i32 %val0, i8 addrspace(3)* %ptr1) #0 {
 entry:
   %a.1 = getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(3)* %a, i32 1
   %a.cast = bitcast i8 addrspace(3)* addrspace(3)* %a to i32 addrspace(3)*
@@ -166,7 +166,7 @@ entry:
 ; CHECK-LABEL: @no_merge_store_ptr32_i64(
 ; CHECK: store i8 addrspace(3)*
 ; CHECK: store i64
-define void @no_merge_store_ptr32_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(3)* %ptr0, i64 %val1) #0 {
+define amdgpu_kernel void @no_merge_store_ptr32_i64(i64 addrspace(1)* nocapture %a, i8 addrspace(3)* %ptr0, i64 %val1) #0 {
 entry:
   %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)*
   %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
@@ -181,7 +181,7 @@ entry:
 ; CHECK-LABEL: @no_merge_store_i64_ptr32(
 ; CHECK: store i64
 ; CHECK: store i8 addrspace(3)*
-define void @no_merge_store_i64_ptr32(i8 addrspace(3)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(3)* %ptr1) #0 {
+define amdgpu_kernel void @no_merge_store_i64_ptr32(i8 addrspace(3)* addrspace(1)* nocapture %a, i64 %val0, i8 addrspace(3)* %ptr1) #0 {
 entry:
   %a.1 =  getelementptr inbounds i8 addrspace(3)*, i8 addrspace(3)* addrspace(1)* %a, i64 1
   %a.cast = bitcast i8 addrspace(3)* addrspace(1)* %a to i64 addrspace(1)*
@@ -195,7 +195,7 @@ entry:
 ; CHECK-LABEL: @no_merge_load_i64_ptr32(
 ; CHECK: load i64,
 ; CHECK: load i8 addrspace(3)*,
-define void @no_merge_load_i64_ptr32(i64 addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @no_merge_load_i64_ptr32(i64 addrspace(1)* nocapture %a) #0 {
 entry:
   %a.1 = getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
   %a.1.cast = bitcast i64 addrspace(1)* %a.1 to i8 addrspace(3)* addrspace(1)*
@@ -209,7 +209,7 @@ entry:
 ; CHECK-LABEL: @no_merge_load_ptr32_i64(
 ; CHECK: load i8 addrspace(3)*,
 ; CHECK: load i64,
-define void @no_merge_load_ptr32_i64(i64 addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @no_merge_load_ptr32_i64(i64 addrspace(1)* nocapture %a) #0 {
 entry:
   %a.cast = bitcast i64 addrspace(1)* %a to i8 addrspace(3)* addrspace(1)*
   %a.1 =  getelementptr inbounds i64, i64 addrspace(1)* %a, i64 1
@@ -226,7 +226,7 @@ entry:
 ; CHECK: load <2 x i8 addrspace(1)*>
 ; CHECK: store <2 x i8 addrspace(1)*>
 ; CHECK: store <2 x i8 addrspace(1)*>
-define void @merge_v2p1i8_v2p1i8(<2 x i8 addrspace(1)*> addrspace(1)* nocapture noalias %a, <2 x i8 addrspace(1)*> addrspace(1)* nocapture readonly noalias %b) #0 {
+define amdgpu_kernel void @merge_v2p1i8_v2p1i8(<2 x i8 addrspace(1)*> addrspace(1)* nocapture noalias %a, <2 x i8 addrspace(1)*> addrspace(1)* nocapture readonly noalias %b) #0 {
 entry:
   %a.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %a, i64 1
   %b.1 = getelementptr inbounds <2 x i8 addrspace(1)*>, <2 x i8 addrspace(1)*> addrspace(1)* %b, i64 1
@@ -245,7 +245,7 @@ entry:
 ; CHECK: [[ELT0_INT:%[^ ]+]] = inttoptr i64 [[ELT0]] to i8 addrspace(1)*
 ; CHECK: [[ELT1_INT:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
 ; CHECK: bitcast i64 [[ELT1_INT]] to double
-define void @merge_load_ptr64_f64(double addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_ptr64_f64(double addrspace(1)* nocapture %a) #0 {
 entry:
   %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
   %a.1 =  getelementptr inbounds double, double addrspace(1)* %a, i64 1
@@ -262,7 +262,7 @@ entry:
 ; CHECK: bitcast i64 [[ELT0]] to double
 ; CHECK: [[ELT1:%[^ ]+]] = extractelement <2 x i64> %{{[^ ]+}}, i32 1
 ; CHECK: inttoptr i64 [[ELT1]] to i8 addrspace(1)*
-define void @merge_load_f64_ptr64(double addrspace(1)* nocapture %a) #0 {
+define amdgpu_kernel void @merge_load_f64_ptr64(double addrspace(1)* nocapture %a) #0 {
 entry:
   %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
   %a.1.cast = bitcast double addrspace(1)* %a.1 to i8 addrspace(1)* addrspace(1)*
@@ -279,7 +279,7 @@ entry:
 ; CHECK: [[ELT1_INT:%[^ ]+]] = bitcast double %val1 to i64
 ; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1
 ; CHECK: store <2 x i64>
-define void @merge_store_ptr64_f64(double addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, double %val1) #0 {
+define amdgpu_kernel void @merge_store_ptr64_f64(double addrspace(1)* nocapture %a, i8 addrspace(1)* %ptr0, double %val1) #0 {
 entry:
   %a.cast = bitcast double addrspace(1)* %a to i8 addrspace(1)* addrspace(1)*
   %a.1 = getelementptr inbounds double, double addrspace(1)* %a, i64 1
@@ -296,7 +296,7 @@ entry:
 ; CHECK: [[ELT1_INT:%[^ ]+]] = ptrtoint i8 addrspace(1)* %ptr1 to i64
 ; CHECK: insertelement <2 x i64> %{{[^ ]+}}, i64 [[ELT1_INT]], i32 1
 ; CHECK: store <2 x i64>
-define void @merge_store_f64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, double %val0, i8 addrspace(1)* %ptr1) #0 {
+define amdgpu_kernel void @merge_store_f64_ptr64(i8 addrspace(1)* addrspace(1)* nocapture %a, double %val0, i8 addrspace(1)* %ptr1) #0 {
 entry:
   %a.1 = getelementptr inbounds i8 addrspace(1)*, i8 addrspace(1)* addrspace(1)* %a, i64 1
   %a.cast = bitcast i8 addrspace(1)* addrspace(1)* %a to double addrspace(1)*
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll
index d70c449e14d75eb1615d1047ad0050f11faf1e5c..63e688e63fbb5edd0a07448257f0f4fbf1763cb4 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/store_with_aliasing_load.ll
@@ -9,7 +9,7 @@
 ; CHECK: store <4 x float>
 
 ; Function Attrs: nounwind
-define void @store_vectorize_with_alias(i8 addrspace(1)* %a, i8 addrspace(1)* %b) #0 {
+define amdgpu_kernel void @store_vectorize_with_alias(i8 addrspace(1)* %a, i8 addrspace(1)* %b) #0 {
 bb:
   %tmp = bitcast i8 addrspace(1)* %b to float addrspace(1)*
   %tmp1 = load float, float addrspace(1)* %tmp, align 4
diff --git a/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll b/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll
index 18f62be27c82d2366f826e9a0deb74a8296426c8..412d2013f6b6174b0d8eac949e36616b3280a7d4 100644
--- a/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll
+++ b/test/Transforms/LoadStoreVectorizer/AMDGPU/weird-type-accesses.ll
@@ -16,7 +16,7 @@ declare void @use_v2i9(<2 x i9>)
 ; CHECK-LABEL: @merge_store_2_constants_i1(
 ; CHECK: store i1
 ; CHECK: store i1
-define void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1
   store i1 true, i1 addrspace(1)* %out.gep.1
   store i1 false, i1 addrspace(1)* %out
@@ -26,7 +26,7 @@ define void @merge_store_2_constants_i1(i1 addrspace(1)* %out) #0 {
 ; CHECK-LABEL: @merge_store_2_constants_i2(
 ; CHECK: store i2 1
 ; CHECK: store i2 -1
-define void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1
   store i2 1, i2 addrspace(1)* %out.gep.1
   store i2 -1, i2 addrspace(1)* %out
@@ -36,7 +36,7 @@ define void @merge_store_2_constants_i2(i2 addrspace(1)* %out) #0 {
 ; CHECK-LABEL: @merge_different_store_sizes_i1_i8(
 ; CHECK: store i1 true
 ; CHECK: store i8 123
-define void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
   %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)*
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
   store i1 true, i1 addrspace(1)* %out.i1
@@ -47,7 +47,7 @@ define void @merge_different_store_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
 ; CHECK-LABEL: @merge_different_store_sizes_i8_i1(
 ; CHECK: store i8 123
 ; CHECK: store i1 true
-define void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
   %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)*
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1
   store i8 123, i8 addrspace(1)* %out.gep.1
@@ -58,7 +58,7 @@ define void @merge_different_store_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
 ; CHECK-LABEL: @merge_store_2_constant_structs(
 ; CHECK: store %struct.foo
 ; CHECK: store %struct.foo
-define void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1
   store %struct.foo { i32 12, i8 3 }, %struct.foo addrspace(1)* %out.gep.1
   store %struct.foo { i32 92, i8 9 }, %struct.foo addrspace(1)* %out
@@ -69,7 +69,7 @@ define void @merge_store_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
 ; CHECK-LABEL: @merge_store_2_constants_v2i2(
 ; CHECK: store <2 x i2>
 ; CHECK: store <2 x i2>
-define void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1
   store <2 x i2> <i2 1, i2 -1>, <2 x i2> addrspace(1)* %out.gep.1
   store <2 x i2> <i2 -1, i2 1>, <2 x i2> addrspace(1)* %out
@@ -81,7 +81,7 @@ define void @merge_store_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
 ; CHECK-LABEL: @merge_store_2_constants_v4i2(
 ; CHECK: store <4 x i2>
 ; CHECK: store <4 x i2>
-define void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1
   store <4 x i2> <i2 1, i2 -1, i2 1, i2 -1>, <4 x i2> addrspace(1)* %out.gep.1
   store <4 x i2> <i2 -1, i2 1, i2 -1, i2 1>, <4 x i2> addrspace(1)* %out
@@ -91,7 +91,7 @@ define void @merge_store_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
 ; CHECK-LABEL: @merge_load_2_constants_i1(
 ; CHECK: load i1
 ; CHECK: load i1
-define void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i1, i1 addrspace(1)* %out, i32 1
   %x = load i1, i1 addrspace(1)* %out.gep.1
   %y = load i1, i1 addrspace(1)* %out
@@ -103,7 +103,7 @@ define void @merge_load_2_constants_i1(i1 addrspace(1)* %out) #0 {
 ; CHECK-LABEL: @merge_load_2_constants_i2(
 ; CHECK: load i2
 ; CHECK: load i2
-define void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i2, i2 addrspace(1)* %out, i32 1
   %x = load i2, i2 addrspace(1)* %out.gep.1
   %y = load i2, i2 addrspace(1)* %out
@@ -115,7 +115,7 @@ define void @merge_load_2_constants_i2(i2 addrspace(1)* %out) #0 {
 ; CHECK-LABEL: @merge_different_load_sizes_i1_i8(
 ; CHECK: load i1
 ; CHECK: load i8
-define void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
   %out.i1 = bitcast i8 addrspace(1)* %out to i1 addrspace(1)*
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out, i32 1
   %x = load i1, i1 addrspace(1)* %out.i1
@@ -128,7 +128,7 @@ define void @merge_different_load_sizes_i1_i8(i8 addrspace(1)* %out) #0 {
 ; CHECK-LABEL: @merge_different_load_sizes_i8_i1(
 ; CHECK: load i8
 ; CHECK: load i1
-define void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
   %out.i8 = bitcast i1 addrspace(1)* %out to i8 addrspace(1)*
   %out.gep.1 = getelementptr i8, i8 addrspace(1)* %out.i8, i32 1
   %x = load i8, i8 addrspace(1)* %out.gep.1
@@ -141,7 +141,7 @@ define void @merge_different_load_sizes_i8_i1(i1 addrspace(1)* %out) #0 {
 ; CHECK-LABEL: @merge_load_2_constant_structs(
 ; CHECK: load %struct.foo
 ; CHECK: load %struct.foo
-define void @merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr %struct.foo, %struct.foo addrspace(1)* %out, i32 1
   %x = load %struct.foo, %struct.foo addrspace(1)* %out.gep.1
   %y = load %struct.foo, %struct.foo addrspace(1)* %out
@@ -153,7 +153,7 @@ define void @merge_load_2_constant_structs(%struct.foo addrspace(1)* %out) #0 {
 ; CHECK-LABEL: @merge_load_2_constants_v2i2(
 ; CHECK: load <2 x i2>
 ; CHECK: load <2 x i2>
-define void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr <2 x i2>, <2 x i2> addrspace(1)* %out, i32 1
   %x = load <2 x i2>, <2 x i2> addrspace(1)* %out.gep.1
   %y = load <2 x i2>, <2 x i2> addrspace(1)* %out
@@ -165,7 +165,7 @@ define void @merge_load_2_constants_v2i2(<2 x i2> addrspace(1)* %out) #0 {
 ; CHECK-LABEL: @merge_load_2_constants_v4i2(
 ; CHECK: load <4 x i2>
 ; CHECK: load <4 x i2>
-define void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr <4 x i2>, <4 x i2> addrspace(1)* %out, i32 1
   %x = load <4 x i2>, <4 x i2> addrspace(1)* %out.gep.1
   %y = load <4 x i2>, <4 x i2> addrspace(1)* %out
@@ -177,7 +177,7 @@ define void @merge_load_2_constants_v4i2(<4 x i2> addrspace(1)* %out) #0 {
 ; CHECK-LABEL: @merge_store_2_constants_i9(
 ; CHECK: store i9 3
 ; CHECK: store i9 -5
-define void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr i9, i9 addrspace(1)* %out, i32 1
   store i9 3, i9 addrspace(1)* %out.gep.1
   store i9 -5, i9 addrspace(1)* %out
@@ -187,7 +187,7 @@ define void @merge_store_2_constants_i9(i9 addrspace(1)* %out) #0 {
 ; CHECK-LABEL: @merge_load_2_constants_v2i9(
 ; CHECK: load <2 x i9>
 ; CHECK: load <2 x i9>
-define void @merge_load_2_constants_v2i9(<2 x i9> addrspace(1)* %out) #0 {
+define amdgpu_kernel void @merge_load_2_constants_v2i9(<2 x i9> addrspace(1)* %out) #0 {
   %out.gep.1 = getelementptr <2 x i9>, <2 x i9> addrspace(1)* %out, i32 1
   %x = load <2 x i9>, <2 x i9> addrspace(1)* %out.gep.1
   %y = load <2 x i9>, <2 x i9> addrspace(1)* %out
diff --git a/test/Transforms/LoadStoreVectorizer/X86/load-width.ll b/test/Transforms/LoadStoreVectorizer/X86/load-width.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a61b25119a14cdb82ec4fdead9673955b5de1950
--- /dev/null
+++ b/test/Transforms/LoadStoreVectorizer/X86/load-width.ll
@@ -0,0 +1,38 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu haswell -S -o - %s | FileCheck --check-prefix=CHECK-HSW %s
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -load-store-vectorizer -mcpu knl -S -o - %s | FileCheck --check-prefix=CHECK-KNL %s
+
+define <8 x double> @loadwidth_insert_extract(double* %ptr) {
+    %a = bitcast double* %ptr to <2 x double> *
+    %b = getelementptr <2 x double>, <2 x double>* %a, i32 1
+    %c = getelementptr <2 x double>, <2 x double>* %a, i32 2
+    %d = getelementptr <2 x double>, <2 x double>* %a, i32 3
+; CHECK-HSW: load <4 x double>
+; CHECK-HSW: load <4 x double>
+; CHECK-HSW-NOT: load
+; CHECK-KNL: load <8 x double>
+; CHECK-KNL-NOT: load
+    %la = load <2 x double>, <2 x double> *%a
+    %lb = load <2 x double>, <2 x double> *%b
+    %lc = load <2 x double>, <2 x double> *%c
+    %ld = load <2 x double>, <2 x double> *%d
+    ; Scalarize everything - Explicitly not a shufflevector to test this code
+    ; path in the LSV
+    %v1 = extractelement <2 x double> %la, i32 0
+    %v2 = extractelement <2 x double> %la, i32 1
+    %v3 = extractelement <2 x double> %lb, i32 0
+    %v4 = extractelement <2 x double> %lb, i32 1
+    %v5 = extractelement <2 x double> %lc, i32 0
+    %v6 = extractelement <2 x double> %lc, i32 1
+    %v7 = extractelement <2 x double> %ld, i32 0
+    %v8 = extractelement <2 x double> %ld, i32 1
+    ; Make a vector again
+    %i1 = insertelement <8 x double> undef, double %v1, i32 0
+    %i2 = insertelement <8 x double> %i1, double %v2, i32 1
+    %i3 = insertelement <8 x double> %i2, double %v3, i32 2
+    %i4 = insertelement <8 x double> %i3, double %v4, i32 3
+    %i5 = insertelement <8 x double> %i4, double %v5, i32 4
+    %i6 = insertelement <8 x double> %i5, double %v6, i32 5
+    %i7 = insertelement <8 x double> %i6, double %v7, i32 6
+    %i8 = insertelement <8 x double> %i7, double %v8, i32 7
+    ret <8 x double> %i8
+}
diff --git a/test/Transforms/LoopPredication/basic.ll b/test/Transforms/LoopPredication/basic.ll
index a347e6192bd91473db21d575332f76ad8c775ce6..6ce07819cb0398c455da5e0bdcbc19a3c86e8d03 100644
--- a/test/Transforms/LoopPredication/basic.ll
+++ b/test/Transforms/LoopPredication/basic.ll
@@ -493,4 +493,79 @@ loop:
 exit:
   %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ]
   ret i32 %result
-}
\ No newline at end of file
+}
+
+define i32 @unsigned_loop_0_to_n_hoist_length(i32* %array, i16 %length.i16, i32 %n) {
+; CHECK-LABEL: @unsigned_loop_0_to_n_hoist_length
+entry:
+  %tmp5 = icmp eq i32 %n, 0
+  br i1 %tmp5, label %exit, label %loop.preheader
+
+loop.preheader:
+; CHECK: loop.preheader:
+; CHECK: [[max_index:[^ ]+]] = add i32 %n, -1
+; CHECK-NEXT: [[length:[^ ]+]] = zext i16 %length.i16 to i32
+; CHECK-NEXT: [[wide_cond:[^ ]+]] = icmp ult i32 [[max_index]], [[length]]
+; CHECK-NEXT: br label %loop
+  br label %loop
+
+loop:
+; CHECK: loop:
+; CHECK: call void (i1, ...) @llvm.experimental.guard(i1 [[wide_cond]], i32 9) [ "deopt"() ]
+  %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ]
+  %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ]
+  %length = zext i16 %length.i16 to i32
+  %within.bounds = icmp ult i32 %i, %length
+  call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
+
+  %i.i64 = zext i32 %i to i64
+  %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64
+  %array.i = load i32, i32* %array.i.ptr, align 4
+  %loop.acc.next = add i32 %loop.acc, %array.i
+
+  %i.next = add nuw i32 %i, 1
+  %continue = icmp ult i32 %i.next, %n
+  br i1 %continue, label %loop, label %exit
+
+exit:
+  %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ]
+  ret i32 %result
+}
+
+define i32 @unsigned_loop_0_to_n_cant_hoist_length(i32* %array, i32 %length, i32 %divider, i32 %n) {
+; CHECK-LABEL: @unsigned_loop_0_to_n_cant_hoist_length
+entry:
+  %tmp5 = icmp eq i32 %n, 0
+  br i1 %tmp5, label %exit, label %loop.preheader
+
+loop.preheader:
+; CHECK: loop.preheader:
+; CHECK-NEXT: br label %loop
+  br label %loop
+
+loop:
+; CHECK: loop:
+; CHECK-NEXT: %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ]
+; CHECK-NEXT: %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ]
+; CHECK-NEXT: %length.udiv = udiv i32 %length, %divider
+; CHECK-NEXT: %within.bounds = icmp ult i32 %i, %length.udiv
+; CHECK-NEXT: call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
+  %loop.acc = phi i32 [ %loop.acc.next, %loop ], [ 0, %loop.preheader ]
+  %i = phi i32 [ %i.next, %loop ], [ 0, %loop.preheader ]
+  %length.udiv = udiv i32 %length, %divider
+  %within.bounds = icmp ult i32 %i, %length.udiv
+  call void (i1, ...) @llvm.experimental.guard(i1 %within.bounds, i32 9) [ "deopt"() ]
+
+  %i.i64 = zext i32 %i to i64
+  %array.i.ptr = getelementptr inbounds i32, i32* %array, i64 %i.i64
+  %array.i = load i32, i32* %array.i.ptr, align 4
+  %loop.acc.next = add i32 %loop.acc, %array.i
+
+  %i.next = add nuw i32 %i, 1
+  %continue = icmp ult i32 %i.next, %n
+  br i1 %continue, label %loop, label %exit
+
+exit:
+  %result = phi i32 [ 0, %entry ], [ %loop.acc.next, %loop ]
+  ret i32 %result
+}
diff --git a/test/Transforms/LoopRotate/phi-dbgvalue.ll b/test/Transforms/LoopRotate/phi-dbgvalue.ll
new file mode 100644
index 0000000000000000000000000000000000000000..aa8ca2f627bd305ec121dedb70f7f61cd3ff6d43
--- /dev/null
+++ b/test/Transforms/LoopRotate/phi-dbgvalue.ll
@@ -0,0 +1,79 @@
+; RUN: opt -S -loop-rotate < %s | FileCheck %s
+
+;CHECK-LABEL: func
+;CHECK-LABEL: entry
+;CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 %a
+;CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 1, i64 0, metadata !13, metadata !11), !dbg !15
+;CHECK-LABEL: for.body:
+;CHECK-NEXT: [[I:%.*]] = phi i32 [ 1, %entry ], [ %inc, %for.body ]
+;CHECK-NEXT: tail call void @llvm.dbg.value(metadata i32 [[I]], i64 0, metadata !13, metadata !11), !dbg !15
+
+; Function Attrs: noinline nounwind
+define void @func(i32 %a) local_unnamed_addr #0 !dbg !6 {
+entry:
+  tail call void @llvm.dbg.value(metadata i32 %a, i64 0, metadata !10, metadata !11), !dbg !12
+  tail call void @llvm.dbg.value(metadata i32 1, i64 0, metadata !13, metadata !11), !dbg !15
+  br label %for.cond, !dbg !16
+
+for.cond:                                         ; preds = %for.body, %entry
+  %i.0 = phi i32 [ 1, %entry ], [ %inc, %for.body ]
+  tail call void @llvm.dbg.value(metadata i32 %i.0, i64 0, metadata !13, metadata !11), !dbg !15
+  %cmp = icmp slt i32 %i.0, 10, !dbg !17
+  br i1 %cmp, label %for.body, label %for.end, !dbg !20
+
+for.body:                                         ; preds = %for.cond
+  %add = add nsw i32 %i.0, %a, !dbg !22
+  %call = tail call i32 @func2(i32 %i.0, i32 %add) #3, !dbg !24
+  %inc = add nsw i32 %i.0, 1, !dbg !25
+  tail call void @llvm.dbg.value(metadata i32 %inc, i64 0, metadata !13, metadata !11), !dbg !15
+  br label %for.cond, !dbg !27, !llvm.loop !28
+
+for.end:                                          ; preds = %for.cond
+  ret void, !dbg !31
+}
+
+declare i32 @func2(i32, i32) local_unnamed_addr
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #2
+
+attributes #0 = { noinline nounwind }
+attributes #2 = { nounwind readnone }
+attributes #3 = { nounwind }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+!llvm.ident = !{!5}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (http://llvm.org/git/clang.git 0f3ed908c1f13f83da4b240f7595eb8d05e0a754) (http://llvm.org/git/llvm.git 8e270f5a6b8ceb0f3ac3ef1ffb83c5e29b44ae68)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "debug-phi.c", directory: "/work/projects/src/tests/debug")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{!"clang version 5.0.0 (http://llvm.org/git/clang.git 0f3ed908c1f13f83da4b240f7595eb8d05e0a754) (http://llvm.org/git/llvm.git 8e270f5a6b8ceb0f3ac3ef1ffb83c5e29b44ae68)"}
+!6 = distinct !DISubprogram(name: "func", scope: !1, file: !1, line: 2, type: !7, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!7 = !DISubroutineType(types: !8)
+!8 = !{null, !9}
+!9 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!10 = !DILocalVariable(name: "a", arg: 1, scope: !6, file: !1, line: 2, type: !9)
+!11 = !DIExpression()
+!12 = !DILocation(line: 2, column: 15, scope: !6)
+!13 = !DILocalVariable(name: "i", scope: !14, file: !1, line: 3, type: !9)
+!14 = distinct !DILexicalBlock(scope: !6, file: !1, line: 3, column: 3)
+!15 = !DILocation(line: 3, column: 11, scope: !14)
+!16 = !DILocation(line: 3, column: 7, scope: !14)
+!17 = !DILocation(line: 3, column: 20, scope: !18)
+!18 = !DILexicalBlockFile(scope: !19, file: !1, discriminator: 1)
+!19 = distinct !DILexicalBlock(scope: !14, file: !1, line: 3, column: 3)
+!20 = !DILocation(line: 3, column: 3, scope: !21)
+!21 = !DILexicalBlockFile(scope: !14, file: !1, discriminator: 1)
+!22 = !DILocation(line: 4, column: 15, scope: !23)
+!23 = distinct !DILexicalBlock(scope: !19, file: !1, line: 3, column: 31)
+!24 = !DILocation(line: 4, column: 5, scope: !23)
+!25 = !DILocation(line: 3, column: 27, scope: !26)
+!26 = !DILexicalBlockFile(scope: !19, file: !1, discriminator: 2)
+!27 = !DILocation(line: 3, column: 3, scope: !26)
+!28 = distinct !{!28, !29, !30}
+!29 = !DILocation(line: 3, column: 3, scope: !14)
+!30 = !DILocation(line: 5, column: 3, scope: !14)
+!31 = !DILocation(line: 6, column: 1, scope: !6)
diff --git a/test/Transforms/LoopSimplify/dbg-loc.ll b/test/Transforms/LoopSimplify/dbg-loc.ll
index 702a1ad16af6ccdc5382d79c8572887a78551cce..98bfefd12238bfba8892a9d7cc0ac0969b1e71fb 100644
--- a/test/Transforms/LoopSimplify/dbg-loc.ll
+++ b/test/Transforms/LoopSimplify/dbg-loc.ll
@@ -23,6 +23,7 @@ entry:
 
 for.body:                                         ; preds = %entry, %length.exit
   %begin.sink5 = phi %"Length"* [ %incdec.ptr, %length.exit ], [ %begin, %entry ]
+  tail call void @llvm.dbg.value(metadata %"Length"* %begin.sink5, i64 0, metadata !15, metadata !16), !dbg !17
   %m_type.i.i.i = getelementptr inbounds %"Length", %"Length"* %begin.sink5, i64 0, i32 2, !dbg !9
   %0 = load i8, i8* %m_type.i.i.i, align 1, !dbg !9
   %cmp.i.i = icmp eq i8 %0, 9, !dbg !7
@@ -68,6 +69,9 @@ eh.resume:                                        ; preds = %catch
   resume { i8*, i32 } undef, !dbg !13
 }
 
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
 ; CHECK-DAG: [[PREHEADER_LOC]] = !DILocation(line: 73, column: 27, scope: !{{[0-9]+}})
 ; CHECK-DAG: [[LOOPEXIT_LOC]] = !DILocation(line: 75, column: 9, scope: !{{[0-9]+}})
 ; CHECK-DAG: [[LPAD_PREHEADER_LOC]] = !DILocation(line: 85, column: 1, scope: !{{[0-9]+}})
@@ -93,3 +97,6 @@ eh.resume:                                        ; preds = %catch
                              file: !5,
                              isOptimized: true, flags: "-O2",
                              splitDebugFilename: "abc.debug", emissionKind: 2)
+!15 = !DILocalVariable(name: "begin", arg: 1, scope: !6, file: !5, line: 71)
+!16 = !DIExpression()
+!17 = !DILocation(line: 71, column: 32, scope: !6)
diff --git a/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll b/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
new file mode 100644
index 0000000000000000000000000000000000000000..054c61d187958b6a4370bb0e5b6dba646a801637
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/AMDGPU/atomics.ll
@@ -0,0 +1,87 @@
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=bonaire -loop-reduce < %s | FileCheck -check-prefix=OPT %s
+
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:64:64-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+
+; Make sure the pointer / address space of atomicrmw and cmpxchg is considered.
+
+; OPT-LABEL: @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(
+
+; OPT-NOT: getelementptr
+
+; OPT: .lr.ph:
+; OPT: %lsr.iv2 = phi i32 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
+; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
+; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383
+; OPT: %tmp4 = atomicrmw add i32 addrspace(3)* %scevgep4, i32 undef seq_cst
+; OPT: %tmp7 = atomicrmw add i32 addrspace(3)* %lsr.iv1, i32 undef seq_cst
+; OPT: %0 = atomicrmw add i32 addrspace(3)* %lsr.iv1, i32 %tmp8 seq_cst
+; OPT: br i1 %exitcond
+define amdgpu_kernel void @test_local_atomicrmw_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+  %tmp = icmp sgt i32 %n, 0
+  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader:                                 ; preds = %bb
+  br label %.lr.ph
+
+._crit_edge.loopexit:                             ; preds = %.lr.ph
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
+  ret void
+
+.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
+  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+  %tmp1 = add nuw nsw i32 %indvars.iv, 16383
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
+  %tmp4 = atomicrmw add i32 addrspace(3)* %tmp3, i32 undef seq_cst
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
+  %tmp7 = atomicrmw add i32 addrspace(3)* %tmp6, i32 undef seq_cst
+  %tmp8 = add nsw i32 %tmp7, %tmp4
+  atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, %n
+  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+; OPT-LABEL: test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(
+; OPT-NOT: getelementptr
+
+; OPT: .lr.ph:
+; OPT: %lsr.iv2 = phi i32 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
+; OPT: %lsr.iv1 = phi i32 addrspace(3)* [ %scevgep, %.lr.ph ], [ %arg0, %.lr.ph.preheader ]
+; OPT: %lsr.iv = phi i32 [ %lsr.iv.next, %.lr.ph ], [ %n, %.lr.ph.preheader ]
+; OPT: %scevgep4 = getelementptr i32, i32 addrspace(3)* %lsr.iv2, i32 16383
+; OPT: %tmp4 = cmpxchg i32 addrspace(3)* %scevgep4, i32 undef, i32 undef seq_cst monotonic
+define amdgpu_kernel void @test_local_cmpxchg_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(3)* noalias nocapture %arg0, i32 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+bb:
+  %tmp = icmp sgt i32 %n, 0
+  br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
+
+.lr.ph.preheader:                                 ; preds = %bb
+  br label %.lr.ph
+
+._crit_edge.loopexit:                             ; preds = %.lr.ph
+  br label %._crit_edge
+
+._crit_edge:                                      ; preds = %._crit_edge.loopexit, %bb
+  ret void
+
+.lr.ph:                                           ; preds = %.lr.ph, %.lr.ph.preheader
+  %indvars.iv = phi i32 [ %indvars.iv.next, %.lr.ph ], [ 0, %.lr.ph.preheader ]
+  %tmp1 = add nuw nsw i32 %indvars.iv, 16383
+  %tmp3 = getelementptr inbounds i32, i32 addrspace(3)* %arg1, i32 %tmp1
+  %tmp4 = cmpxchg i32 addrspace(3)* %tmp3, i32 undef, i32 undef seq_cst monotonic
+  %tmp4.0 = extractvalue { i32, i1 } %tmp4, 0
+  %tmp6 = getelementptr inbounds i32, i32 addrspace(3)* %arg0, i32 %indvars.iv
+  %tmp7 = cmpxchg i32 addrspace(3)* %tmp6, i32 undef, i32 undef seq_cst monotonic
+  %tmp7.0 = extractvalue { i32, i1 } %tmp7, 0
+  %tmp8 = add nsw i32 %tmp7.0, %tmp4.0
+  atomicrmw add i32 addrspace(3)* %tmp6, i32 %tmp8 seq_cst
+  %indvars.iv.next = add nuw nsw i32 %indvars.iv, 1
+  %exitcond = icmp eq i32 %indvars.iv.next, %n
+  br i1 %exitcond, label %._crit_edge.loopexit, label %.lr.ph
+}
+
+attributes #0 = { nounwind }
\ No newline at end of file
diff --git a/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll b/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
index bf61112a3c3e614fedf2588682729d6893107e87..c5ea1b915d9165d3f19dc80f90c963959355d262 100644
--- a/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
+++ b/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-addressing-mode-loops.ll
@@ -10,7 +10,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
 ; OPT: %lsr.iv2 = phi i8 addrspace(1)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
 ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(1)* %lsr.iv2, i64 4095
 ; OPT: load i8, i8 addrspace(1)* %scevgep4, align 1
-define void @test_global_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
   %tmp = icmp sgt i32 %n, 0
   br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
@@ -48,7 +48,7 @@ bb:
 ; OPT: {{^}}.lr.ph:
 ; OPT: %lsr.iv3 = phi i8 addrspace(1)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
 ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(1)* %lsr.iv3, i64 1
-define void @test_global_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_global_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(1)* noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
   %tmp = icmp sgt i32 %n, 0
   br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
@@ -83,7 +83,7 @@ bb:
 ; OPT: %lsr.iv2 = phi i8 addrspace(3)* [ %scevgep3, %.lr.ph ], [ %arg1, %.lr.ph.preheader ]
 ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(3)* %lsr.iv2, i32 65535
 ; OPT: %tmp4 = load i8, i8 addrspace(3)* %scevgep4, align 1
-define void @test_local_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
   %tmp = icmp sgt i32 %n, 0
   br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
@@ -122,7 +122,7 @@ bb:
 ; OPT: {{^}}.lr.ph:
 ; OPT: %lsr.iv3 = phi i8 addrspace(3)* [ %scevgep4, %.lr.ph ], [ %scevgep2, %.lr.ph.preheader ]
 ; OPT: %scevgep4 = getelementptr i8, i8 addrspace(3)* %lsr.iv3, i32 1
-define void @test_local_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
+define amdgpu_kernel void @test_local_addressing_loop_uniform_index_max_offset_p1_i32(i32 addrspace(1)* noalias nocapture %arg0, i8 addrspace(3)* noalias nocapture readonly %arg1, i32 %n) #0 {
 bb:
   %tmp = icmp sgt i32 %n, 0
   br i1 %tmp, label %.lr.ph.preheader, label %._crit_edge
diff --git a/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll b/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll
new file mode 100644
index 0000000000000000000000000000000000000000..02c3c05e7945ef63902ee8668da323a33f83624b
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/AMDGPU/different-addrspace-crash.ll
@@ -0,0 +1,31 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target triple = "amdgcn--"
+
+; We need to compile this for a target where we have different address spaces,
+; and where pointers in those address spaces have different sizes.
+; E.g. for amdgcn-- pointers in address space 0 are 32 bits and pointers in
+; address space 1 are 64 bits.
+
+; We shouldn't crash. Check that we get a loop with the two stores.
+;CHECK-LABEL: foo:
+;CHECK: [[LOOP_LABEL:BB[0-9]+_[0-9]+]]:
+;CHECK: buffer_store_dword
+;CHECK: buffer_store_dword
+;CHECK: s_branch [[LOOP_LABEL]]
+
+define amdgpu_kernel void @foo() {
+entry:
+  br label %loop
+
+loop:
+  %idx0 = phi i32 [ %next_idx0, %loop ], [ 0, %entry ]
+  %0 = getelementptr inbounds i32, i32* null, i32 %idx0
+  %1 = getelementptr inbounds i32, i32 addrspace(1)* null, i32 %idx0
+  store i32 1, i32* %0
+  store i32 7, i32 addrspace(1)* %1
+  %next_idx0 = add nuw nsw i32 %idx0, 1
+  br label %loop
+}
+
diff --git a/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll b/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
index 8c83df5843d2d3cde058b153ec9df36ee6065c7f..67b1926bdf27205e856cba5da5c5e68f5bc595c7 100644
--- a/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
+++ b/test/Transforms/LoopStrengthReduce/AMDGPU/lsr-postinc-pos-addrspace.ll
@@ -16,7 +16,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
 ; CHECK: bb:
 ; CHECK: inttoptr i32 %lsr.iv.next2 to i8 addrspace(3)*
 ; CHECK: %c1 = icmp ne i8 addrspace(3)*
-define void @local_cmp_user(i32 %arg0) nounwind {
+define amdgpu_kernel void @local_cmp_user(i32 %arg0) nounwind {
 entry:
   br label %bb11
 
@@ -47,7 +47,7 @@ bb13:
 ; CHECK: bb:
 ; CHECK: inttoptr i64 %lsr.iv.next2 to i8 addrspace(1)*
 ; CHECK: icmp ne i8 addrspace(1)* %t
-define void @global_cmp_user(i64 %arg0) nounwind {
+define amdgpu_kernel void @global_cmp_user(i64 %arg0) nounwind {
 entry:
   br label %bb11
 
@@ -78,7 +78,7 @@ bb13:
 ; CHECK: bb:
 ; CHECK: %idxprom = sext i32 %lsr.iv1 to i64
 ; CHECK: getelementptr i8, i8 addrspace(1)* %t, i64 %idxprom
-define void @global_gep_user(i32 %arg0) nounwind {
+define amdgpu_kernel void @global_gep_user(i32 %arg0) nounwind {
 entry:
   br label %bb11
 
@@ -108,7 +108,7 @@ bb13:
 
 ; CHECK: bb
 ; CHECK: %p = getelementptr i8, i8 addrspace(1)* %t, i64 %ii.ext
-define void @global_sext_scale_user(i32 %arg0) nounwind {
+define amdgpu_kernel void @global_sext_scale_user(i32 %arg0) nounwind {
 entry:
   br label %bb11
 
diff --git a/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll b/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll
index b3b696d42c59cb25dbd00972621bfcafe48adb2e..9eba0c3051dc1bef31b8f8816a8fee22327579ef 100644
--- a/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll
+++ b/test/Transforms/LoopStrengthReduce/AMDGPU/preserve-addrspace-assert.ll
@@ -14,7 +14,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
 
 ; CHECK: %scevgep = getelementptr i32, i32 addrspace(3)* %tmp1, i32 4
 ; CHECK:%tmp14 = load i32, i32 addrspace(3)* %scevgep
-define void @lsr_crash_preserve_addrspace_unknown_type() #0 {
+define amdgpu_kernel void @lsr_crash_preserve_addrspace_unknown_type() #0 {
 bb:
   br label %bb1
 
diff --git a/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll b/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
index 788842101080478309218a495e3c11c505846767..a9d1e8758766222ea6f0d94465c4eeecbc227da4 100644
--- a/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
+++ b/test/Transforms/LoopStrengthReduce/ARM/ivchain-ARM.ll
@@ -1,5 +1,4 @@
 ; RUN: llc -O3 -mtriple=thumb-eabi -mcpu=cortex-a9 %s -o - | FileCheck %s -check-prefix=A9
-; RUN: llc -O3 -mtriple=thumb-eabi -mcpu=cortex-a9 -addr-sink-using-gep=1 %s -o - | FileCheck %s -check-prefix=A9
 
 ; @simple is the most basic chain of address induction variables. Chaining
 ; saves at least one register and avoids complex addressing and setup
diff --git a/test/Transforms/LoopStrengthReduce/X86/canonical.ll b/test/Transforms/LoopStrengthReduce/X86/canonical.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2dafbb408aad4199fb8e89197a6a8529908f39ff
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/X86/canonical.ll
@@ -0,0 +1,65 @@
+; RUN: opt -mtriple=x86_64-unknown-linux-gnu -loop-reduce -S < %s | FileCheck %s
+; Check that LSR formula canonicalization puts loop-invariant regs before the
+; induction variable of the current loop, so that exprs involving loop-invariant
+; regs can be promoted outside of the current loop.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+define void @foo(i32 %size, i32 %nsteps, i8* nocapture %maxarray, i8* nocapture readnone %buffer, i32 %init) local_unnamed_addr #0 {
+entry:
+  %cmp25 = icmp sgt i32 %nsteps, 0
+  br i1 %cmp25, label %for.cond1.preheader.lr.ph, label %for.end12
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %cmp223 = icmp sgt i32 %size, 1
+  %t0 = sext i32 %init to i64
+  %wide.trip.count = zext i32 %size to i64
+  %wide.trip.count31 = zext i32 %nsteps to i64
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.inc10, %for.cond1.preheader.lr.ph
+  %indvars.iv28 = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next29, %for.inc10 ]
+  br i1 %cmp223, label %for.body3.lr.ph, label %for.inc10
+
+for.body3.lr.ph:                                  ; preds = %for.cond1.preheader
+  %t1 = add nsw i64 %indvars.iv28, %t0
+  %t2 = trunc i64 %indvars.iv28 to i8
+  br label %for.body3
+
+; Make sure loop invariant items are grouped together so that load address can
+; be represented in one getelementptr.
+; CHECK-LABEL: for.body3:
+; CHECK-NEXT: [[LSR:%[^,]+]] = phi i64 [ 1, %for.body3.lr.ph ], [ {{.*}}, %for.body3 ]
+; CHECK-NOT: = phi i64
+; CHECK-NEXT: [[LOADADDR:%[^,]+]] = getelementptr i8, i8* {{.*}}, i64 [[LSR]]
+; CHECK-NEXT: = load i8, i8* [[LOADADDR]], align 1
+; CHECK: br i1 %exitcond, label %for.inc10.loopexit, label %for.body3
+
+for.body3:                                        ; preds = %for.body3, %for.body3.lr.ph
+  %indvars.iv = phi i64 [ 1, %for.body3.lr.ph ], [ %indvars.iv.next, %for.body3 ]
+  %t5 = trunc i64 %indvars.iv to i8
+  %t3 = add nsw i64 %t1, %indvars.iv
+  %arrayidx = getelementptr inbounds i8, i8* %maxarray, i64 %t3
+  %t4 = load i8, i8* %arrayidx, align 1
+  %add5 = add i8 %t4, %t5
+  %add6 = add i8 %add5, %t2
+  %arrayidx9 = getelementptr inbounds i8, i8* %maxarray, i64 %indvars.iv
+  store i8 %add6, i8* %arrayidx9, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.inc10.loopexit, label %for.body3
+
+for.inc10.loopexit:                               ; preds = %for.body3
+  br label %for.inc10
+
+for.inc10:                                        ; preds = %for.inc10.loopexit, %for.cond1.preheader
+  %indvars.iv.next29 = add nuw nsw i64 %indvars.iv28, 1
+  %exitcond32 = icmp eq i64 %indvars.iv.next29, %wide.trip.count31
+  br i1 %exitcond32, label %for.end12.loopexit, label %for.cond1.preheader
+
+for.end12.loopexit:                               ; preds = %for.inc10
+  br label %for.end12
+
+for.end12:                                        ; preds = %for.end12.loopexit, %entry
+  ret void
+}
diff --git a/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll b/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
index ab7d4f1baa816f472aa03ccf4a355fadfc4e97c9..fb63b66137f374378a663bf3ccfecf56136488eb 100644
--- a/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ b/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -1,7 +1,5 @@
 ; RUN: llc < %s -O3 -march=x86-64 -mcpu=core2 | FileCheck %s -check-prefix=X64
 ; RUN: llc < %s -O3 -march=x86 -mcpu=core2 | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -O3 -march=x86-64 -mcpu=core2 -addr-sink-using-gep=1 | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -O3 -march=x86 -mcpu=core2 -addr-sink-using-gep=1 | FileCheck %s -check-prefix=X32
 
 ; @simple is the most basic chain of address induction variables. Chaining
 ; saves at least one register and avoids complex addressing and setup
diff --git a/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll b/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4888536bdf81998b50957d6d708f6304d927afad
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/X86/lsr-insns-1.ll
@@ -0,0 +1,52 @@
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS
+; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -asm-verbose=0 | FileCheck %s
+
+; OPT test checks that LSR optimizes the compare for a static counter into a compare with 0.
+
+; BOTH: for.body:
+; INSN: icmp eq i64 %lsr.iv.next, 0
+; REGS: icmp eq i64 %indvars.iv.next, 1024
+
+; LLC test checks that LSR optimizes the compare for a static counter.
+; That means that instead of creating the following:
+;   movl %ecx, (%rdx,%rax,4)
+;   incq %rax
+;   cmpq $1024, %rax
+; LSR should optimize out cmp:
+;   movl %ecx, 4096(%rdx,%rax)
+;   addq $4, %rax
+; or
+;   movl %ecx, 4096(%rdx,%rax,4)
+;   incq %rax
+
+; CHECK:      LBB0_1:
+; CHECK-NEXT:   movl 4096(%{{.+}},[[REG:%[0-9a-z]+]]
+; CHECK-NEXT:   addl 4096(%{{.+}},[[REG]]
+; CHECK-NEXT:   movl %{{.+}}, 4096(%{{.+}},[[REG]]
+; CHECK-NOT:    cmp
+; CHECK:        jne
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* nocapture %q) {
+entry:
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
+  %tmp = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
+  %tmp1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %tmp1, %tmp
+  %arrayidx4 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1024
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
diff --git a/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll b/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3273cb4e6b5bc9396ae367c5c638a60b2cb6d0f0
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/X86/lsr-insns-2.ll
@@ -0,0 +1,58 @@
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -lsr-insns-cost -S | FileCheck %s -check-prefix=BOTH -check-prefix=INSN
+; RUN: opt < %s -loop-reduce -mtriple=x86_64 -S | FileCheck %s -check-prefix=BOTH -check-prefix=REGS
+; RUN: llc < %s -O2 -march=x86-64 -lsr-insns-cost -asm-verbose=0 | FileCheck %s
+
+; OPT checks that LSR prefers fewer instructions to fewer registers.
+; For x86, LSR should prefer a complicated address to new LSR induction
+; variables.
+
+; BOTH: for.body:
+; INSN:   getelementptr i32, i32* %x, i64 %indvars.iv
+; INSN:   getelementptr i32, i32* %y, i64 %indvars.iv
+; INSN:   getelementptr i32, i32* %q, i64 %indvars.iv
+; REGS    %lsr.iv4 = phi
+; REGS    %lsr.iv2 = phi
+; REGS    %lsr.iv1 = phi
+; REGS:   getelementptr i32, i32* %lsr.iv1, i64 1
+; REGS:   getelementptr i32, i32* %lsr.iv2, i64 1
+; REGS:   getelementptr i32, i32* %lsr.iv4, i64 1
+
+; LLC checks that LSR prefers fewer instructions to fewer registers.
+; LSR should prefer a complicated address to additional add instructions.
+
+; CHECK:      LBB0_2:
+; CHECK-NEXT:   movl (%r{{.+}},
+; CHECK-NEXT:   addl (%r{{.+}},
+; CHECK-NEXT:   movl %e{{.+}}, (%r{{.+}},
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Function Attrs: norecurse nounwind uwtable
+define void @foo(i32* nocapture readonly %x, i32* nocapture readonly %y, i32* nocapture %q, i32 %n) {
+entry:
+  %cmp10 = icmp sgt i32 %n, 0
+  br i1 %cmp10, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %for.body.preheader ]
+  %arrayidx = getelementptr inbounds i32, i32* %x, i64 %indvars.iv
+  %tmp = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %y, i64 %indvars.iv
+  %tmp1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %tmp1, %tmp
+  %arrayidx4 = getelementptr inbounds i32, i32* %q, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx4, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/test/Transforms/LoopStrengthReduce/X86/nested-loop.ll b/test/Transforms/LoopStrengthReduce/X86/nested-loop.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b563eb3ad994056ceda381bce2abfbe60e3147a9
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/X86/nested-loop.ll
@@ -0,0 +1,65 @@
+; RUN: opt -loop-reduce -S < %s | FileCheck %s
+; Check that when an outer-loop induction variable is used inside an inner-loop
+; induction value expr, LSR can still choose a single induction variable for
+; the inner loop and share it among multiple induction value exprs.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo(i32 %size, i32 %nsteps, i32 %hsize, i32* %lined, i8* %maxarray) {
+entry:
+  %cmp215 = icmp sgt i32 %size, 1
+  %t0 = zext i32 %size to i64
+  %t1 = sext i32 %nsteps to i64
+  %sub2 = sub i64 %t0, 2
+  br label %for.body
+
+for.body:                                         ; preds = %for.inc, %entry
+  %indvars.iv2 = phi i64 [ %indvars.iv.next3, %for.inc ], [ 0, %entry ]
+  %t2 = mul nsw i64 %indvars.iv2, %t0
+  br i1 %cmp215, label %for.body2.preheader, label %for.inc
+
+for.body2.preheader:                              ; preds = %for.body
+  br label %for.body2
+
+; Check that LSR only generates one induction variable for for.body2 and that
+; the induction variable is shared by multiple array accesses.
+; CHECK: for.body2:
+; CHECK-NEXT: [[LSR:%[^,]+]] = phi i64 [ %lsr.iv.next, %for.body2 ], [ 0, %for.body2.preheader ]
+; CHECK-NOT:  = phi i64 [ {{.*}}, %for.body2 ], [ {{.*}}, %for.body2.preheader ]
+; CHECK:      [[SCEVGEP1:%[^,]+]] = getelementptr i8, i8* %maxarray, i64 [[LSR]]
+; CHECK:      [[SCEVGEP2:%[^,]+]] = getelementptr i8, i8* [[SCEVGEP1]], i64 1
+; CHECK:      {{.*}} = load i8, i8* [[SCEVGEP2]], align 1
+; CHECK:      [[SCEVGEP3:%[^,]+]] = getelementptr i8, i8* {{.*}}, i64 [[LSR]]
+; CHECK:      {{.*}} = load i8, i8* [[SCEVGEP3]], align 1
+; CHECK:      [[SCEVGEP4:%[^,]+]] = getelementptr i8, i8* {{.*}}, i64 [[LSR]]
+; CHECK:      store i8 {{.*}}, i8* [[SCEVGEP4]], align 1
+; CHECK:      br i1 %exitcond, label %for.body2, label %for.inc.loopexit
+
+for.body2:                                        ; preds = %for.body2.preheader, %for.body2
+  %indvars.iv = phi i64 [ 1, %for.body2.preheader ], [ %indvars.iv.next, %for.body2 ]
+  %arrayidx1 = getelementptr inbounds i8, i8* %maxarray, i64 %indvars.iv
+  %v1 = load i8, i8* %arrayidx1, align 1
+  %idx2 = add nsw i64 %indvars.iv, %sub2
+  %arrayidx2 = getelementptr inbounds i8, i8* %maxarray, i64 %idx2
+  %v2 = load i8, i8* %arrayidx2, align 1
+  %tmpv = xor i8 %v1, %v2
+  %t4 = add nsw i64 %t2, %indvars.iv
+  %add.ptr = getelementptr inbounds i8, i8* %maxarray, i64 %t4
+  store i8 %tmpv, i8* %add.ptr, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %wide.trip.count = zext i32 %size to i64
+  %exitcond = icmp ne i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond, label %for.body2, label %for.inc.loopexit
+
+for.inc.loopexit:                                 ; preds = %for.body2
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.inc.loopexit, %for.body
+  %indvars.iv.next3 = add nuw nsw i64 %indvars.iv2, 1
+  %cmp = icmp slt i64 %indvars.iv.next3, %t1
+  br i1 %cmp, label %for.body, label %for.end.loopexit
+
+for.end.loopexit:                                 ; preds = %for.inc
+  ret void
+}
diff --git a/test/Transforms/LoopStrengthReduce/X86/sibling-loops.ll b/test/Transforms/LoopStrengthReduce/X86/sibling-loops.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a69d6adc0f0388a32e5117c1012e158401770acd
--- /dev/null
+++ b/test/Transforms/LoopStrengthReduce/X86/sibling-loops.ll
@@ -0,0 +1,97 @@
+; RUN: opt -loop-reduce -S < %s | FileCheck %s
+; It is very bad to allow an LSR formula to contain a SCEVAddRecExpr Reg
+; from a sibling of the current loop. When one loop is LSR optimized, it can
+; insert lsr.iv for other sibling loops, which sometimes leads to many extra
+; lsr.iv being inserted for those loops.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@cond = common local_unnamed_addr global i64 0, align 8
+
+; Check there is no extra lsr.iv generated in foo.
+; CHECK-LABEL: @foo(
+; CHECK-NOT: lsr.iv{{[0-9]+}} =
+;
+define void @foo(i64 %N) local_unnamed_addr {
+entry:
+  br label %do.body
+
+do.body:                                          ; preds = %do.body, %entry
+  %i.0 = phi i64 [ 0, %entry ], [ %inc, %do.body ]
+  tail call void @goo(i64 %i.0, i64 %i.0)
+  %inc = add nuw nsw i64 %i.0, 1
+  %t0 = load i64, i64* @cond, align 8
+  %tobool = icmp eq i64 %t0, 0
+  br i1 %tobool, label %do.body2.preheader, label %do.body
+
+do.body2.preheader:                               ; preds = %do.body
+  br label %do.body2
+
+do.body2:                                         ; preds = %do.body2.preheader, %do.body2
+  %i.1 = phi i64 [ %inc3, %do.body2 ], [ 0, %do.body2.preheader ]
+  %j.1 = phi i64 [ %inc4, %do.body2 ], [ %inc, %do.body2.preheader ]
+  tail call void @goo(i64 %i.1, i64 %j.1)
+  %inc3 = add nuw nsw i64 %i.1, 1
+  %inc4 = add nsw i64 %j.1, 1
+  %t1 = load i64, i64* @cond, align 8
+  %tobool6 = icmp eq i64 %t1, 0
+  br i1 %tobool6, label %do.body8.preheader, label %do.body2
+
+do.body8.preheader:                               ; preds = %do.body2
+  br label %do.body8
+
+do.body8:                                         ; preds = %do.body8.preheader, %do.body8
+  %i.2 = phi i64 [ %inc9, %do.body8 ], [ 0, %do.body8.preheader ]
+  %j.2 = phi i64 [ %inc10, %do.body8 ], [ %inc4, %do.body8.preheader ]
+  tail call void @goo(i64 %i.2, i64 %j.2)
+  %inc9 = add nuw nsw i64 %i.2, 1
+  %inc10 = add nsw i64 %j.2, 1
+  %t2 = load i64, i64* @cond, align 8
+  %tobool12 = icmp eq i64 %t2, 0
+  br i1 %tobool12, label %do.body14.preheader, label %do.body8
+
+do.body14.preheader:                              ; preds = %do.body8
+  br label %do.body14
+
+do.body14:                                        ; preds = %do.body14.preheader, %do.body14
+  %i.3 = phi i64 [ %inc15, %do.body14 ], [ 0, %do.body14.preheader ]
+  %j.3 = phi i64 [ %inc16, %do.body14 ], [ %inc10, %do.body14.preheader ]
+  tail call void @goo(i64 %i.3, i64 %j.3)
+  %inc15 = add nuw nsw i64 %i.3, 1
+  %inc16 = add nsw i64 %j.3, 1
+  %t3 = load i64, i64* @cond, align 8
+  %tobool18 = icmp eq i64 %t3, 0
+  br i1 %tobool18, label %do.body20.preheader, label %do.body14
+
+do.body20.preheader:                              ; preds = %do.body14
+  br label %do.body20
+
+do.body20:                                        ; preds = %do.body20.preheader, %do.body20
+  %i.4 = phi i64 [ %inc21, %do.body20 ], [ 0, %do.body20.preheader ]
+  %j.4 = phi i64 [ %inc22, %do.body20 ], [ %inc16, %do.body20.preheader ]
+  tail call void @goo(i64 %i.4, i64 %j.4)
+  %inc21 = add nuw nsw i64 %i.4, 1
+  %inc22 = add nsw i64 %j.4, 1
+  %t4 = load i64, i64* @cond, align 8
+  %tobool24 = icmp eq i64 %t4, 0
+  br i1 %tobool24, label %do.body26.preheader, label %do.body20
+
+do.body26.preheader:                              ; preds = %do.body20
+  br label %do.body26
+
+do.body26:                                        ; preds = %do.body26.preheader, %do.body26
+  %i.5 = phi i64 [ %inc27, %do.body26 ], [ 0, %do.body26.preheader ]
+  %j.5 = phi i64 [ %inc28, %do.body26 ], [ %inc22, %do.body26.preheader ]
+  tail call void @goo(i64 %i.5, i64 %j.5)
+  %inc27 = add nuw nsw i64 %i.5, 1
+  %inc28 = add nsw i64 %j.5, 1
+  %t5 = load i64, i64* @cond, align 8
+  %tobool30 = icmp eq i64 %t5, 0
+  br i1 %tobool30, label %do.end31, label %do.body26
+
+do.end31:                                         ; preds = %do.body26
+  ret void
+}
+
+declare void @goo(i64, i64) local_unnamed_addr
+
diff --git a/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll b/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll
index e732ddc2bc84de161e68c9885754316bd54e1c71..ca8cc32469d8b96344e4804d231859e3a6c9bd00 100644
--- a/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll
+++ b/test/Transforms/LoopUnroll/AMDGPU/unroll-barrier.ll
@@ -6,7 +6,7 @@
 ; CHECK: call void @llvm.amdgcn.s.barrier()
 ; CHECK: call void @llvm.amdgcn.s.barrier()
 ; CHECK-NOT: br
-define void @test_unroll_convergent_barrier(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(1)* noalias nocapture %in) #0 {
+define amdgpu_kernel void @test_unroll_convergent_barrier(i32 addrspace(1)* noalias nocapture %out, i32 addrspace(1)* noalias nocapture %in) #0 {
 entry:
   br label %for.body
 
diff --git a/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll b/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e986c3dc2a2832a377cf081214d7af0c1e27175d
--- /dev/null
+++ b/test/Transforms/LoopUnroll/AMDGPU/unroll-for-private.ll
@@ -0,0 +1,154 @@
+; RUN: opt -mtriple=amdgcn-unknown-amdhsa -loop-unroll -S -amdgpu-unroll-threshold-private=20000 %s | FileCheck %s
+
+; Check that we fully unroll the loop so that the alloca can be eliminated
+; CHECK-LABEL: @non_invariant_ind
+; CHECK:       for.body:
+; CHECK-NOT:   br
+; CHECK:       store i32 %tmp15, i32 addrspace(1)* %arrayidx7, align 4
+; CHECK:       ret void
+
+define amdgpu_kernel void @non_invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
+entry:
+  %arr = alloca [64 x i32], align 4
+  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  %arrayidx5 = getelementptr inbounds [64 x i32], [64 x i32]* %arr, i32 0, i32 %x
+  %tmp15 = load i32, i32* %arrayidx5, align 4
+  %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %tmp1
+  store i32 %tmp15, i32 addrspace(1)* %arrayidx7, align 4
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.015 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %idxprom = sext i32 %i.015 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom
+  %tmp16 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %add = add nsw i32 %i.015, %tmp1
+  %rem = srem i32 %add, 64
+  %arrayidx3 = getelementptr inbounds [64 x i32], [64 x i32]* %arr, i32 0, i32 %rem
+  store i32 %tmp16, i32* %arrayidx3, align 4
+  %inc = add nuw nsw i32 %i.015, 1
+  %exitcond = icmp eq i32 %inc, 100
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Check that we unroll inner loop but not outer
+; CHECK-LABEL: @invariant_ind
+; CHECK:       %[[exitcond:[^ ]+]] = icmp eq i32 %{{.*}}, 32
+; CHECK:       br i1 %[[exitcond]]
+; CHECK-NOT:   icmp eq i32 %{{.*}}, 100
+
+define amdgpu_kernel void @invariant_ind(i32 addrspace(1)* nocapture %a, i32 %x) {
+entry:
+  %arr = alloca [64 x i32], align 4
+  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  br label %for.cond2.preheader
+
+for.cond2.preheader:                              ; preds = %for.cond.cleanup5, %entry
+  %i.026 = phi i32 [ 0, %entry ], [ %inc10, %for.cond.cleanup5 ]
+  %idxprom = sext i32 %i.026 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom
+  %tmp15 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  br label %for.body6
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup5
+  %arrayidx13 = getelementptr inbounds [64 x i32], [64 x i32]* %arr, i32 0, i32 %x
+  %tmp16 = load i32, i32* %arrayidx13, align 4
+  %arrayidx15 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %tmp1
+  store i32 %tmp16, i32 addrspace(1)* %arrayidx15, align 4
+  ret void
+
+for.cond.cleanup5:                                ; preds = %for.body6
+  %inc10 = add nuw nsw i32 %i.026, 1
+  %exitcond27 = icmp eq i32 %inc10, 32
+  br i1 %exitcond27, label %for.cond.cleanup, label %for.cond2.preheader
+
+for.body6:                                        ; preds = %for.body6, %for.cond2.preheader
+  %j.025 = phi i32 [ 0, %for.cond2.preheader ], [ %inc, %for.body6 ]
+  %add = add nsw i32 %j.025, %tmp1
+  %rem = srem i32 %add, 64
+  %arrayidx8 = getelementptr inbounds [64 x i32], [64 x i32]* %arr, i32 0, i32 %rem
+  store i32 %tmp15, i32* %arrayidx8, align 4
+  %inc = add nuw nsw i32 %j.025, 1
+  %exitcond = icmp eq i32 %inc, 100
+  br i1 %exitcond, label %for.cond.cleanup5, label %for.body6
+}
+
+; Check we do not enforce unroll if alloca is too big
+; CHECK-LABEL: @too_big
+; CHECK:       for.body:
+; CHECK:       icmp eq i32 %{{.*}}, 100
+; CHECK:       br
+
+define amdgpu_kernel void @too_big(i32 addrspace(1)* nocapture %a, i32 %x) {
+entry:
+  %arr = alloca [256 x i32], align 4
+  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  %arrayidx5 = getelementptr inbounds [256 x i32], [256 x i32]* %arr, i32 0, i32 %x
+  %tmp15 = load i32, i32* %arrayidx5, align 4
+  %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %tmp1
+  store i32 %tmp15, i32 addrspace(1)* %arrayidx7, align 4
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.015 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %idxprom = sext i32 %i.015 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom
+  %tmp16 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %add = add nsw i32 %i.015, %tmp1
+  %rem = srem i32 %add, 64
+  %arrayidx3 = getelementptr inbounds [256 x i32], [256 x i32]* %arr, i32 0, i32 %rem
+  store i32 %tmp16, i32* %arrayidx3, align 4
+  %inc = add nuw nsw i32 %i.015, 1
+  %exitcond = icmp eq i32 %inc, 100
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+; Check we do not enforce unroll if alloca is dynamic
+; CHECK-LABEL: @dynamic_size_alloca(
+; CHECK: alloca i32, i32 %n
+; CHECK:       for.body:
+; CHECK:       icmp eq i32 %{{.*}}, 100
+; CHECK:       br
+
+define amdgpu_kernel void @dynamic_size_alloca(i32 addrspace(1)* nocapture %a, i32 %n, i32 %x) {
+entry:
+  %arr = alloca i32, i32 %n, align 4
+  %tmp1 = tail call i32 @llvm.amdgcn.workitem.id.x() #1
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body
+  %arrayidx5 = getelementptr inbounds i32, i32* %arr, i32 %x
+  %tmp15 = load i32, i32* %arrayidx5, align 4
+  %arrayidx7 = getelementptr inbounds i32, i32 addrspace(1)* %a, i32 %tmp1
+  store i32 %tmp15, i32 addrspace(1)* %arrayidx7, align 4
+  ret void
+
+for.body:                                         ; preds = %for.body, %entry
+  %i.015 = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %idxprom = sext i32 %i.015 to i64
+  %arrayidx = getelementptr inbounds i32, i32 addrspace(1)* %a, i64 %idxprom
+  %tmp16 = load i32, i32 addrspace(1)* %arrayidx, align 4
+  %add = add nsw i32 %i.015, %tmp1
+  %rem = srem i32 %add, 64
+  %arrayidx3 = getelementptr inbounds i32, i32* %arr, i32 %rem
+  store i32 %tmp16, i32* %arrayidx3, align 4
+  %inc = add nuw nsw i32 %i.015, 1
+  %exitcond = icmp eq i32 %inc, 100
+  br i1 %exitcond, label %for.cond.cleanup, label %for.body
+}
+
+declare i8 addrspace(2)* @llvm.amdgcn.dispatch.ptr() #1
+
+declare i32 @llvm.amdgcn.workitem.id.x() #1
+
+declare i32 @llvm.amdgcn.workgroup.id.x() #1
+
+declare i8 addrspace(2)* @llvm.amdgcn.implicitarg.ptr() #1
+
+attributes #1 = { nounwind readnone }
diff --git a/test/Transforms/LoopUnroll/epilog_const_phi.ll b/test/Transforms/LoopUnroll/epilog_const_phi.ll
new file mode 100644
index 0000000000000000000000000000000000000000..22e52576094275b03a92c68f9fc2372ddb6deb08
--- /dev/null
+++ b/test/Transforms/LoopUnroll/epilog_const_phi.ll
@@ -0,0 +1,65 @@
+; RUN: opt -S -loop-unroll -unroll-runtime < %s | FileCheck %s
+
+; Epilogue unrolling allows the PHI to keep its constant incoming value.
+; For this test it means that the XOR can be deleted after unrolling.
+; Check that we generate an epilogue remainder loop here.
+
+; CHECK-LABEL: const_phi_val
+; CHECK:  for.body.epil
+
+; Function Attrs: norecurse nounwind uwtable
+define void @const_phi_val(i32 %i0, i32* nocapture %a) {
+entry:
+  %cmp6 = icmp slt i32 %i0, 1000
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %tmp = sext i32 %i0 to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ %tmp, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %s.08 = phi i32 [ 0, %for.body.preheader ], [ %xor, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  store i32 %s.08, i32* %arrayidx, align 4
+  %xor = xor i32 %s.08, 1
+  %indvars.iv.next = add nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
+
+; When there is no phi with const coming from preheader,
+; there is no need to do epilogue unrolling.
+
+; CHECK-LABEL: var_phi_val
+; CHECK:  for.body.prol
+
+; Function Attrs: norecurse nounwind uwtable
+define void @var_phi_val(i32 %i0, i32* nocapture %a) {
+entry:
+  %cmp6 = icmp slt i32 %i0, 1000
+  br i1 %cmp6, label %for.body.preheader, label %for.end
+
+for.body.preheader:                               ; preds = %entry
+  %tmp = sext i32 %i0 to i64
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %for.body.preheader
+  %indvars.iv = phi i64 [ %tmp, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %indvars.iv.next = add nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 1000
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:                                 ; preds = %for.body
+  br label %for.end
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void
+}
diff --git a/test/Transforms/LoopUnroll/peel-loop-irreducible.ll b/test/Transforms/LoopUnroll/peel-loop-irreducible.ll
new file mode 100644
index 0000000000000000000000000000000000000000..32a7a0732e10b2476f50c387681117e0cbef017e
--- /dev/null
+++ b/test/Transforms/LoopUnroll/peel-loop-irreducible.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -S -loop-unroll -unroll-force-peel-count=1 | FileCheck %s
+
+; Check we don't peel loops where the latch is not the exiting block.
+; CHECK-LABEL: @invariant_backedge_irreducible
+; CHECK: entry:
+; CHECK: br label %header
+; CHECK-NOT: peel
+; CHECK: header:
+; CHECK: br i1 {{.*}} label %latch, label %exiting
+; CHECK: latch:
+; CHECK: br i1 {{.*}} label %header, label %exiting
+; CHECK: exiting:
+; CHECK: br i1 {{.*}} label %latch, label %exit
+
+define i32 @invariant_backedge_irreducible(i32 %a, i32 %b) {
+entry:
+  br label %header
+
+header:
+  %i = phi i32 [ 0, %entry ], [ %inc, %latch ]
+  %cmp.phi = phi i1 [ false, %entry ], [ %cmp, %latch ]
+  br i1 %cmp.phi, label %latch, label %exiting
+
+latch:
+  %inc = add i32 %i, 1
+  %cmp = icmp slt i32 %i, 1000
+  br i1 %cmp, label %header, label %exiting
+
+exiting:
+  %cmp.exiting = phi i1 [ %cmp.phi, %header ], [ %cmp, %latch ]
+  br i1 %cmp.exiting, label %latch, label %exit
+
+exit:
+  ret i32 0
+}
+
diff --git a/test/Transforms/LoopUnroll/peel-loop-not-forced.ll b/test/Transforms/LoopUnroll/peel-loop-not-forced.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3dcac87f8242fb8249e6833eddbed022ff55c904
--- /dev/null
+++ b/test/Transforms/LoopUnroll/peel-loop-not-forced.ll
@@ -0,0 +1,53 @@
+; RUN: opt < %s -S -loop-unroll -unroll-threshold=4 | FileCheck %s
+
+define i32 @invariant_backedge_1(i32 %a, i32 %b) {
+; CHECK-LABEL: @invariant_backedge_1
+; CHECK-NOT:     %plus = phi
+; CHECK:       loop.peel:
+; CHECK:       loop:
+; CHECK:         %i = phi
+; CHECK:         %sum = phi
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %inc, %loop ]
+  %sum = phi i32 [ 0, %entry ], [ %incsum, %loop ]
+  %plus = phi i32 [ %a, %entry ], [ %b, %loop ]
+
+  %incsum = add i32 %sum, %plus
+  %inc = add i32 %i, 1
+  %cmp = icmp slt i32 %i, 1000
+
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i32 %sum
+}
+
+; Peeling should fail due to method size.
+define i32 @invariant_backedge_2(i32 %a, i32 %b) {
+; CHECK-LABEL: @invariant_backedge_2
+; CHECK-NOT:   loop.peel:
+; CHECK:       loop:
+; CHECK:         %i = phi
+; CHECK:         %sum = phi
+; CHECK:         %plus = phi
+entry:
+  br label %loop
+
+loop:
+  %i = phi i32 [ 0, %entry ], [ %inc, %loop ]
+  %sum = phi i32 [ 0, %entry ], [ %incsum2, %loop ]
+  %plus = phi i32 [ %a, %entry ], [ %b, %loop ]
+
+  %incsum = add i32 %sum, %plus
+  %incsum2 = add i32 %incsum, %plus
+  %inc = add i32 %i, 1
+  %cmp = icmp slt i32 %i, 1000
+
+  br i1 %cmp, label %loop, label %exit
+
+exit:
+  ret i32 %sum
+}
diff --git a/test/Transforms/LoopUnroll/peel-loop-pgo.ll b/test/Transforms/LoopUnroll/peel-loop-pgo.ll
index a87d5643e7e9dcdf1fd7e618d6dea3f7cb0dd2f6..20c3878d03a747aa7529c6b4981e3e5e97910c29 100644
--- a/test/Transforms/LoopUnroll/peel-loop-pgo.ll
+++ b/test/Transforms/LoopUnroll/peel-loop-pgo.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -S -debug-only=loop-unroll -loop-unroll -unroll-allow-peeling 2>&1 | FileCheck %s
+; RUN: opt < %s -S -debug-only=loop-unroll -loop-unroll 2>&1 | FileCheck %s
 ; REQUIRES: asserts
 
 ; Make sure we use the profile information correctly to peel-off 3 iterations
diff --git a/test/Transforms/LoopUnroll/peel-loop2.ll b/test/Transforms/LoopUnroll/peel-loop2.ll
new file mode 100644
index 0000000000000000000000000000000000000000..99e90797e19966630de6c7c6d0416f25e1736771
--- /dev/null
+++ b/test/Transforms/LoopUnroll/peel-loop2.ll
@@ -0,0 +1,61 @@
+; RUN: opt -S -loop-unroll -unroll-force-peel-count=1 -verify-dom-info <%s
+
+; Check if loop composed of several BBs is peeled correctly.
+
+declare void @funcb()
+@Comma = external global i8
+define void @funca(i8* readnone %b, i8* readnone %e) {
+entry:
+  %cmp2 = icmp eq i8* %b, %e
+  br i1 %cmp2, label %for.end, label %for.body.preheader
+
+for.body.preheader:
+  br label %for.body
+
+for.body:
+  %b.addr.03 = phi i8* [ %incdec.ptr, %for.inc ], [ %b, %for.body.preheader ]
+  %0 = load i8, i8* @Comma
+  %tobool = icmp eq i8 %0, 0
+  br i1 %tobool, label %for.inc, label %if.then
+
+if.then:
+  tail call void @funcb()
+  store i8 1, i8* @Comma
+  br label %for.inc
+
+for.inc:
+  %incdec.ptr = getelementptr inbounds i8, i8* %b.addr.03, i64 1
+  %cmp = icmp eq i8* %incdec.ptr, %e
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: @funca
+
+; Peeled iteration
+; CHECK: %[[REG1:[0-9]+]] = load i8, i8* @Comma
+; CHECK: %[[REG2:.*]] = icmp eq i8 %[[REG1]], 0
+; CHECK: br i1 %[[REG2]], label %{{.*}}, label %[[IFTHEN:.*]]
+; CHECK: [[IFTHEN]]:
+; CHECK: call void @funcb()
+; CHECK: store i8 1, i8* @Comma
+; CHECK: br label %[[FORINC]]
+; CHECK: [[FORINC]]:
+; CHECK: %[[REG3:.*]] = getelementptr inbounds i8, i8* %b, i64 1
+; CHECK: %[[REG4:.*]] = icmp eq i8* %[[REG3]], %e
+; CHECK: br i1 %[[REG4]]
+
+; main body
+; CHECK: %[[REG1b:.*]] = load i8, i8* @Comma
+; CHECK: %[[REG2b:.*]] = icmp eq i8 %[[REG1b]], 0
+; CHECK: br i1 %[[REG2b]], label %{{.*}}, label %[[IFTHENb:.*]]
+; CHECK: [[IFTHENb]]:
+; CHECK: call void @funcb()
+; CHECK: store i8 1, i8* @Comma
+; CHECK: br label %[[FORINCb]]
+; CHECK: [[FORINCb]]:
+; CHECK: %[[REG3b:.*]] = getelementptr inbounds i8, i8* %b, i64 1
+; CHECK: %[[REG4b:.*]] = icmp eq i8* %[[REG3b]], %e
+; CHECK: br i1 %[[REG4b]]
diff --git a/test/Transforms/LoopUnroll/revisit.ll b/test/Transforms/LoopUnroll/revisit.ll
index 88c9f7ba21a17fd054a2ee8db45be8ffca1a91a6..fddf6cd1c4e8efb77b30d854bcb9d3d95e0c488c 100644
--- a/test/Transforms/LoopUnroll/revisit.ll
+++ b/test/Transforms/LoopUnroll/revisit.ll
@@ -138,11 +138,11 @@ l0.0.latch:
 ; CHECK-CHILDREN: LoopUnrollPass on Loop at depth 2 containing: %l0.0<header>
 ; CHECK-CHILDREN-NOT: LoopUnrollPass
 ;
-; Revisit the children of the outer loop that are part of the prologue.
+; Revisit the children of the outer loop that are part of the epilogue.
 ; 
-; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0.0.prol<header>
+; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0.0.epil<header>
 ; CHECK-NOT: LoopUnrollPass
-; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0.1.prol<header>
+; CHECK: LoopUnrollPass on Loop at depth 2 containing: %l0.0.1.epil<header>
 ; CHECK-NOT: LoopUnrollPass
 l0.latch:
   br label %l0
diff --git a/test/Transforms/LoopUnroll/runtime-loop3.ll b/test/Transforms/LoopUnroll/runtime-loop3.ll
index ef39a29fa89e2be96491a67602a9116ca7ca5b55..253993ee42d43c7b6f5cf706f2832d3217a006a2 100644
--- a/test/Transforms/LoopUnroll/runtime-loop3.ll
+++ b/test/Transforms/LoopUnroll/runtime-loop3.ll
@@ -1,6 +1,6 @@
 ; REQUIRES: asserts
-; RUN: opt < %s -disable-output -stats -loop-unroll -unroll-runtime -unroll-threshold=400 -info-output-file - | FileCheck %s --check-prefix=STATS
-; RUN: opt < %s -disable-output -stats -passes='require<opt-remark-emit>,loop(unroll)' -unroll-runtime -unroll-threshold=400 -info-output-file - | FileCheck %s --check-prefix=STATS
+; RUN: opt < %s -disable-output -stats -loop-unroll -unroll-runtime -unroll-partial-threshold=200 -unroll-threshold=400 -info-output-file - | FileCheck %s --check-prefix=STATS
+; RUN: opt < %s -disable-output -stats -passes='require<opt-remark-emit>,loop(unroll)' -unroll-runtime -unroll-partial-threshold=200 -unroll-threshold=400 -info-output-file - | FileCheck %s --check-prefix=STATS
 
 ; Test that nested loops can be unrolled.  We need to increase threshold to do it
 
diff --git a/test/Transforms/LoopUnroll/runtime-loop5.ll b/test/Transforms/LoopUnroll/runtime-loop5.ll
index 6340058411feb6daa871eb9dc0dc987115431285..86a26baca657f2a7aeac2cf8a589e226357e0867 100644
--- a/test/Transforms/LoopUnroll/runtime-loop5.ll
+++ b/test/Transforms/LoopUnroll/runtime-loop5.ll
@@ -14,9 +14,6 @@ entry:
   %cmp1 = icmp eq i3 %n, 0
   br i1 %cmp1, label %for.end, label %for.body
 
-; UNROLL-16-NOT: for.body.prol:
-; UNROLL-4: for.body.prol:
-
 for.body:                                         ; preds = %for.body, %entry
 ; UNROLL-16-LABEL: for.body:
 ; UNROLL-4-LABEL: for.body:
@@ -42,6 +39,10 @@ for.body:                                         ; preds = %for.body, %entry
 
 ; UNROLL-16-LABEL: for.end
 ; UNROLL-4-LABEL: for.end
+
+; UNROLL-16-NOT: for.body.epil:
+; UNROLL-4: for.body.epil:
+
 for.end:                                          ; preds = %for.body, %entry
   %sum.0.lcssa = phi i3 [ 0, %entry ], [ %add, %for.body ]
   ret i3 %sum.0.lcssa
diff --git a/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll b/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll
index f7add40b9d157e507ad3bd20605a72af47600651..6778a52b3af87bae907d756a05e6fd9f537266bd 100644
--- a/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll
+++ b/test/Transforms/LoopUnroll/unroll-heuristics-pgo.ll
@@ -3,12 +3,12 @@
 @known_constant = internal unnamed_addr constant [9 x i32] [i32 0, i32 -1, i32 0, i32 -1, i32 5, i32 -1, i32 0, i32 -1, i32 0], align 16
 
 ; CHECK-LABEL: @bar_prof
-; CHECK: loop.prol:
 ; CHECK: loop:
 ; CHECK: %mul = mul
 ; CHECK: %mul.1 = mul
 ; CHECK: %mul.2 = mul
 ; CHECK: %mul.3 = mul
+; CHECK: loop.epil:
 define i32 @bar_prof(i32* noalias nocapture readonly %src, i64 %c) !prof !1 {
 entry:
   br label %loop
@@ -32,7 +32,7 @@ loop.end:
 }
 
 ; CHECK-LABEL: @bar_prof_flat
-; CHECK-NOT: loop.prol
+; CHECK-NOT: loop.epil
 define i32 @bar_prof_flat(i32* noalias nocapture readonly %src, i64 %c) !prof !1 {
 entry:
   br label %loop
diff --git a/test/Transforms/LoopUnroll/unroll-pragmas.ll b/test/Transforms/LoopUnroll/unroll-pragmas.ll
index 2843e627b3c1913f8e6fe0a424335de80b97fb4c..88f32c92d694f353ef5fdbbaacb5574425603918 100644
--- a/test/Transforms/LoopUnroll/unroll-pragmas.ll
+++ b/test/Transforms/LoopUnroll/unroll-pragmas.ll
@@ -171,10 +171,6 @@ for.end:                                          ; preds = %for.body, %entry
 ; should be duplicated (original and 4x unrolled).
 ;
 ; CHECK-LABEL: @runtime_loop_with_count4(
-; CHECK: for.body.prol:
-; CHECK: store
-; CHECK-NOT: store
-; CHECK: br i1
 ; CHECK: for.body
 ; CHECK: store
 ; CHECK: store
@@ -182,6 +178,10 @@ for.end:                                          ; preds = %for.body, %entry
 ; CHECK: store
 ; CHECK-NOT: store
 ; CHECK: br i1
+; CHECK: for.body.epil:
+; CHECK: store
+; CHECK-NOT: store
+; CHECK: br i1
 define void @runtime_loop_with_count4(i32* nocapture %a, i32 %b) {
 entry:
   %cmp3 = icmp sgt i32 %b, 0
@@ -287,10 +287,6 @@ for.end:                                          ; preds = %for.body
 ; (original and 8x).
 ;
 ; CHECK-LABEL: @runtime_loop_with_enable(
-; CHECK: for.body.prol:
-; CHECK: store
-; CHECK-NOT: store
-; CHECK: br i1
 ; CHECK: for.body:
 ; CHECK: store i32
 ; CHECK: store i32
@@ -302,6 +298,10 @@ for.end:                                          ; preds = %for.body
 ; CHECK: store i32
 ; CHECK-NOT: store i32
 ; CHECK: br i1
+; CHECK: for.body.epil:
+; CHECK: store
+; CHECK-NOT: store
+; CHECK: br i1
 define void @runtime_loop_with_enable(i32* nocapture %a, i32 %b) {
 entry:
   %cmp3 = icmp sgt i32 %b, 0
@@ -328,16 +328,16 @@ for.end:                                          ; preds = %for.body, %entry
 ; should be duplicated (original and 3x unrolled).
 ;
 ; CHECK-LABEL: @runtime_loop_with_count3(
-; CHECK: for.body.prol:
-; CHECK: store
-; CHECK-NOT: store
-; CHECK: br i1
 ; CHECK: for.body
 ; CHECK: store
 ; CHECK: store
 ; CHECK: store
 ; CHECK-NOT: store
 ; CHECK: br i1
+; CHECK: for.body.epil:
+; CHECK: store
+; CHECK-NOT: store
+; CHECK: br i1
 define void @runtime_loop_with_count3(i32* nocapture %a, i32 %b) {
 entry:
   %cmp3 = icmp sgt i32 %b, 0
diff --git a/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll b/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1f106bd894a85e929d60b60e14d8f013718858d7
--- /dev/null
+++ b/test/Transforms/LoopUnswitch/AMDGPU/divergent-unswitch.ll
@@ -0,0 +1,85 @@
+; RUN: opt -mtriple=amdgcn-- -O3 -S %s | FileCheck %s
+
+; Check that loop unswitch happened and condition hoisted out of the loop.
+; Condition is uniform so all targets should perform unswitching.
+
+; CHECK-LABEL: {{^}}define amdgpu_kernel void @uniform_unswitch
+; CHECK: entry:
+; CHECK-NEXT: [[LOOP_COND:%[a-z0-9]+]] = icmp
+; CHECK-NEXT: [[IF_COND:%[a-z0-9]+]] = icmp eq i32 %x, 123456
+; CHECK-NEXT: and i1 [[LOOP_COND]], [[IF_COND]]
+; CHECK-NEXT: br i1
+
+define amdgpu_kernel void @uniform_unswitch(i32 * nocapture %out, i32 %n, i32 %x) {
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %cmp1 = icmp eq i32 %x, 123456
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.inc, %for.body.lr.ph
+  %i.07 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+  br i1 %cmp1, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 * %out, i32 %i.07
+  store i32 %i.07, i32 * %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw nsw i32 %i.07, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+; Check that loop unswitch does not happen if condition is divergent.
+
+; CHECK-LABEL: {{^}}define amdgpu_kernel void @divergent_unswitch
+; CHECK: entry:
+; CHECK: icmp
+; CHECK: [[IF_COND:%[a-z0-9]+]] = icmp {{.*}} 567890
+; CHECK: br label
+; CHECK: br i1 [[IF_COND]]
+
+define amdgpu_kernel void @divergent_unswitch(i32 * nocapture %out, i32 %n) {
+entry:
+  %cmp9 = icmp sgt i32 %n, 0
+  br i1 %cmp9, label %for.body.lr.ph, label %for.cond.cleanup
+
+for.body.lr.ph:                                   ; preds = %entry
+  %call = tail call i32 @llvm.amdgcn.workitem.id.x() #0
+  %cmp2 = icmp eq i32 %call, 567890
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.inc
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.inc, %for.body.lr.ph
+  %i.010 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.inc ]
+  br i1 %cmp2, label %if.then, label %for.inc
+
+if.then:                                          ; preds = %for.body
+  %arrayidx = getelementptr inbounds i32, i32 * %out, i32 %i.010
+  store i32 %i.010, i32 * %arrayidx, align 4
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.body, %if.then
+  %inc = add nuw nsw i32 %i.010, 1
+  %exitcond = icmp eq i32 %inc, %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone }
diff --git a/test/Transforms/LoopUnswitch/AMDGPU/lit.local.cfg b/test/Transforms/LoopUnswitch/AMDGPU/lit.local.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..2a665f06be72e5515ca6e27018facb35daa201be
--- /dev/null
+++ b/test/Transforms/LoopUnswitch/AMDGPU/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AMDGPU' in config.root.targets:
+    config.unsupported = True
diff --git a/test/Transforms/LoopUnswitch/basictest.ll b/test/Transforms/LoopUnswitch/basictest.ll
index a02a463764ddae3f3bed87daef23e42ef24ce1eb..3add848395aeaf3b696d2583d273914b3a706a6c 100644
--- a/test/Transforms/LoopUnswitch/basictest.ll
+++ b/test/Transforms/LoopUnswitch/basictest.ll
@@ -101,6 +101,217 @@ loop_exit:
 ; CHECK: }
 }
 
+; Make sure we unswitch %a == 0 out of the loop.
+;
+; CHECK: define void @and_i2_as_switch_input(i2
+; CHECK: entry:
+; This is an indication that the loop has been unswitched.
+; CHECK: icmp eq i2 %a, 0
+; CHECK: br
+; There should be no more unswitching after the 1st unswitch.
+; CHECK-NOT: icmp eq
+; CHECK: ret
+define void @and_i2_as_switch_input(i2 %a) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i2 [ 0, %entry ], [ %inc, %for.inc ]
+  %and = and i2 %a, %i
+  %and1 = and i2 %and, %i
+  switch i2 %and1, label %sw.default [
+    i2 0, label %sw.bb
+    i2 1, label %sw.bb1
+  ]
+
+sw.bb:
+  br label %sw.epilog
+
+sw.bb1:
+  br label %sw.epilog
+
+sw.default:
+  br label %sw.epilog
+
+sw.epilog:
+  br label %for.inc
+
+for.inc:
+  %inc = add nsw i2 %i, 1
+  %cmp = icmp slt i2 %inc, 3 
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Make sure we unswitch %a == -1 (all ones) out of the loop.
+;
+; CHECK: define void @or_i2_as_switch_input(i2
+; CHECK: entry:
+; This is an indication that the loop has been unswitched.
+; CHECK: icmp eq i2 %a, -1
+; CHECK: br
+; There should be no more unswitching after the 1st unswitch.
+; CHECK-NOT: icmp eq
+; CHECK: ret
+define void @or_i2_as_switch_input(i2 %a) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i2 [ 0, %entry ], [ %inc, %for.inc ]
+  %or = or i2 %a, %i
+  %or1 = or i2 %or, %i
+  switch i2 %or1, label %sw.default [
+    i2 2, label %sw.bb
+    i2 3, label %sw.bb1
+  ]
+
+sw.bb:
+  br label %sw.epilog
+
+sw.bb1:
+  br label %sw.epilog
+
+sw.default:
+  br label %sw.epilog
+
+sw.epilog:
+  br label %for.inc
+
+for.inc:
+  %inc = add nsw i2 %i, 1
+  %cmp = icmp slt i2 %inc, 3 
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Make sure we unswitch %a == -1 (all ones) out of the loop, even though we
+; do not have it as a case value. Unswitching it out allows us to simplify
+; the or operator chain.
+;
+; CHECK: define void @or_i2_as_switch_input_unswitch_default(i2
+; CHECK: entry:
+; This is an indication that the loop has been unswitched.
+; CHECK: icmp eq i2 %a, -1
+; CHECK: br
+; There should be no more unswitching after the 1st unswitch.
+; CHECK-NOT: icmp eq
+; CHECK: ret
+define void @or_i2_as_switch_input_unswitch_default(i2 %a) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i2 [ 0, %entry ], [ %inc, %for.inc ]
+  %or = or i2 %a, %i
+  %or1 = or i2 %or, %i
+  switch i2 %or1, label %sw.default [
+    i2 1, label %sw.bb
+    i2 2, label %sw.bb1
+  ]
+
+sw.bb:
+  br label %sw.epilog
+
+sw.bb1:
+  br label %sw.epilog
+
+sw.default:
+  br label %sw.epilog
+
+sw.epilog:
+  br label %for.inc
+
+for.inc:
+  %inc = add nsw i2 %i, 1
+  %cmp = icmp slt i2 %inc, 3 
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Make sure we don't unswitch, as we can not find an input value %a
+; that will effectively unswitch 0 or 3 out of the loop.
+;
+; CHECK: define void @and_or_i2_as_switch_input(i2
+; CHECK: entry:
+; This is an indication that the loop has NOT been unswitched.
+; CHECK-NOT: icmp
+; CHECK: br
+define void @and_or_i2_as_switch_input(i2 %a) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i2 [ 0, %entry ], [ %inc, %for.inc ]
+  %and = and i2 %a, %i 
+  %or = or i2 %and, %i
+  switch i2 %or, label %sw.default [
+    i2 0, label %sw.bb
+    i2 3, label %sw.bb1
+  ]
+
+sw.bb:
+  br label %sw.epilog
+
+sw.bb1:
+  br label %sw.epilog
+
+sw.default:
+  br label %sw.epilog
+
+sw.epilog:
+  br label %for.inc
+
+for.inc:
+  %inc = add nsw i2 %i, 1
+  %cmp = icmp slt i2 %inc, 3 
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; Make sure we don't unswitch, as we can not find an input value %a
+; that will effectively unswitch true/false out of the loop.
+;
+; CHECK: define void @and_or_i1_as_branch_input(i1
+; CHECK: entry:
+; This is an indication that the loop has NOT been unswitched.
+; CHECK-NOT: icmp
+; CHECK: br
+define void @and_or_i1_as_branch_input(i1 %a) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i1 [ 0, %entry ], [ %inc, %for.inc ]
+  %and = and i1 %a, %i 
+  %or = or i1 %and, %i
+  br i1 %or, label %sw.bb, label %sw.bb1
+
+sw.bb:
+  br label %sw.epilog
+
+sw.bb1:
+  br label %sw.epilog
+
+sw.epilog:
+  br label %for.inc
+
+for.inc:
+  %inc = add nsw i1 %i, 1
+  %cmp = icmp slt i1 %inc, 1 
+  br i1 %cmp, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
 
 declare void @incf() noreturn
 declare void @decf() noreturn
diff --git a/test/Transforms/LoopUnswitch/cold-loop.ll b/test/Transforms/LoopUnswitch/cold-loop.ll
deleted file mode 100644
index 1fbc08038bbdff7d600213939b69b093348cb123..0000000000000000000000000000000000000000
--- a/test/Transforms/LoopUnswitch/cold-loop.ll
+++ /dev/null
@@ -1,52 +0,0 @@
-; RUN: opt < %s -loop-unswitch -loop-unswitch-with-block-frequency -S 2>&1 | FileCheck %s
-
-;; trivial condition should be unswithed regardless of coldness.
-define i32 @test1(i1 %cond1, i1 %cond2) !prof !1 {
-  br i1 %cond1, label %loop_begin, label %loop_exit, !prof !0
-
-loop_begin:
-; CHECK: br i1 true, label %continue, label %loop_exit.loopexit
-  br i1 %cond2, label %continue, label %loop_exit  ; trivial condition
-
-continue:
-  call void @some_func1() noreturn nounwind
-  br label %loop_begin
-
-loop_exit:
-  ret i32 0
-}
-
-;; cold non-trivial condition should not be unswitched.
-define i32 @test2(i32* %var, i1 %cond1, i1 %cond2) !prof !1 {
-  br i1 %cond1, label %loop_begin, label %loop_exit, !prof !0
-
-loop_begin:
-  store i32 1, i32* %var
-; CHECK: br i1 %cond2, label %continue1, label %continue2
-  br i1 %cond2, label %continue1, label %continue2  ; non-trivial condition
-
-continue1:
-  call void @some_func1() noreturn nounwind
-  br label %joint
-
-continue2:
-  call void @some_func2() noreturn nounwind
-  br label %joint
-
-joint:
-;; unswitching will duplicate these calls.
-  call void @some_func3() noreturn nounwind
-  call void @some_func4() noreturn nounwind
-  br label %loop_begin
-
-loop_exit:
-  ret i32 0
-}
-
-declare void @some_func1() noreturn
-declare void @some_func2() noreturn
-declare void @some_func3() noreturn
-declare void @some_func4() noreturn
-
-!0 = !{!"branch_weights", i32 1, i32 100000000}
-!1 = !{!"function_entry_count", i64 100}
diff --git a/test/Transforms/LoopUnswitch/copy-metadata.ll b/test/Transforms/LoopUnswitch/copy-metadata.ll
index 2a634c25a23d8a8d667f05bb6b0bbdf1c7b5b21d..3302bce9a6e58515ad9ad109aa56647e119f0b82 100644
--- a/test/Transforms/LoopUnswitch/copy-metadata.ll
+++ b/test/Transforms/LoopUnswitch/copy-metadata.ll
@@ -3,11 +3,11 @@
 ; This test checks if unswitched condition preserve make.implicit metadata.
 
 define i32 @test(i1 %cond) {
-; CHECK: br i1 %cond, label %..split_crit_edge, label %.loop_exit.split_crit_edge, !make.implicit !0
+; CHECK-LABEL: @test(
+; CHECK:  br i1 %cond, label %..split_crit_edge, label %.loop_exit.split_crit_edge, !make.implicit !0
   br label %loop_begin
 
 loop_begin:
-; CHECK: br i1 true, label %continue, label %loop_exit, !make.implicit !0
   br i1 %cond, label %continue, label %loop_exit, !make.implicit !0
 
 continue:
diff --git a/test/Transforms/LoopUnswitch/crash.ll b/test/Transforms/LoopUnswitch/crash.ll
index 101fb7a2c2ce3fc168a20da492c5e42a352d171c..b273a123c39c19ce9cfc48347453c60bba475e26 100644
--- a/test/Transforms/LoopUnswitch/crash.ll
+++ b/test/Transforms/LoopUnswitch/crash.ll
@@ -30,7 +30,7 @@ return:		; preds = %return.loopexit, %list_Length.exit9
 	ret void
 }
 
-define void @test2(i32 %x1, i32 %y1, i32 %z1, i32 %r1) nounwind {
+define void @test2() nounwind {
 entry:
   br label %bb.nph
 
diff --git a/test/Transforms/LoopUnswitch/simplify-with-nonvalness.ll b/test/Transforms/LoopUnswitch/simplify-with-nonvalness.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d033b083a1b822ae8072eaab65e2ee8b1f6de701
--- /dev/null
+++ b/test/Transforms/LoopUnswitch/simplify-with-nonvalness.ll
@@ -0,0 +1,58 @@
+; RUN: opt < %s -loop-unswitch -verify-loop-info -S 2>&1 | FileCheck %s
+
+; The switch has 1 case and 1 default case. After we unswitch, we know that
+; %a is definitely not 0 in one of the unswitched loops; make sure we take advantage
+; of that and simplify the branches in the loop.
+;
+; CHECK: define void @simplify_with_nonvalness(
+
+; This is the loop in which we know %a is definitely 0.
+; CHECK: sw.bb.us:
+; CHECK: br i1 true, label %if.then.us, label %if.end.us
+
+; This is the loop in which we do not know what %a is but we know %a is definitely NOT 0.
+; Make sure we use that information to simplify.
+; The icmp eq i32 %a, 0 in one of the unswitched loops is simplified to false.
+; CHECK: sw.bb.split:
+; CHECK: br i1 false, label %if.then, label %if.end
+
+define void @simplify_with_nonvalness(i32 %a) #0 {
+entry:
+  br label %for.cond
+
+for.cond:
+  %i.0 = phi i32 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp slt i32 %i.0, 1024
+  br i1 %cmp, label %for.body, label %for.end
+
+for.body:
+  switch i32 %a, label %sw.default [
+    i32 0, label %sw.bb
+  ]
+
+sw.bb:
+  %cmp1 = icmp eq i32 %a, 0
+  br i1 %cmp1, label %if.then, label %if.end
+
+if.then:
+  call void (...) @bar()
+  br label %if.end
+
+if.end:
+  br label %sw.epilog
+
+sw.default:
+  br label %sw.epilog
+
+sw.epilog:
+  br label %for.inc
+
+for.inc:
+  %inc = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end:
+  ret void
+}
+
+declare void @bar(...) 
diff --git a/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll b/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
index 21b59f87d042752f536deb6756ee0ad00f9dbe45..37a6d4e7998442c4fb46952c04dfd2f32167a4d3 100644
--- a/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
+++ b/test/Transforms/LoopVectorize/AArch64/aarch64-predication.ll
@@ -1,40 +1,55 @@
-; RUN: opt < %s -loop-vectorize -simplifycfg -S | FileCheck %s
-; RUN: opt < %s -force-vector-width=2 -loop-vectorize -simplifycfg -S | FileCheck %s
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -disable-output -debug-only=loop-vectorize 2>&1 | FileCheck %s --check-prefix=COST
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -instcombine -simplifycfg -S | FileCheck %s
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnu"
 
-; CHECK-LABEL: predicated_udiv_scalarized_operand
-;
 ; This test checks that we correctly compute the scalarized operands for a
 ; user-specified vectorization factor when interleaving is disabled. We use the
-; "optsize" attribute to disable all interleaving calculations.
+; "optsize" attribute to disable all interleaving calculations.  A cost of 4
+; for %tmp4 indicates that we would scalarize its operand (%tmp3), giving
+; %tmp4 a lower scalarization overhead.
 ;
-; CHECK: vector.body:
-; CHECK:   %wide.load = load <2 x i64>, <2 x i64>* {{.*}}, align 4
-; CHECK:   br i1 {{.*}}, label %[[IF0:.+]], label %[[CONT0:.+]]
-; CHECK: [[IF0]]:
-; CHECK:   %[[T00:.+]] = extractelement <2 x i64> %wide.load, i32 0
-; CHECK:   %[[T01:.+]] = extractelement <2 x i64> %wide.load, i32 0
-; CHECK:   %[[T02:.+]] = add nsw i64 %[[T01]], %x
-; CHECK:   %[[T03:.+]] = udiv i64 %[[T00]], %[[T02]]
-; CHECK:   %[[T04:.+]] = insertelement <2 x i64> undef, i64 %[[T03]], i32 0
-; CHECK:   br label %[[CONT0]]
-; CHECK: [[CONT0]]:
-; CHECK:   %[[T05:.+]] = phi <2 x i64> [ undef, %vector.body ], [ %[[T04]], %[[IF0]] ]
-; CHECK:   br i1 {{.*}}, label %[[IF1:.+]], label %[[CONT1:.+]]
-; CHECK: [[IF1]]:
-; CHECK:   %[[T06:.+]] = extractelement <2 x i64> %wide.load, i32 1
-; CHECK:   %[[T07:.+]] = extractelement <2 x i64> %wide.load, i32 1
-; CHECK:   %[[T08:.+]] = add nsw i64 %[[T07]], %x
-; CHECK:   %[[T09:.+]] = udiv i64 %[[T06]], %[[T08]]
-; CHECK:   %[[T10:.+]] = insertelement <2 x i64> %[[T05]], i64 %[[T09]], i32 1
-; CHECK:   br label %[[CONT1]]
-; CHECK: [[CONT1]]:
-; CHECK:   phi <2 x i64> [ %[[T05]], %[[CONT0]] ], [ %[[T10]], %[[IF1]] ]
-; CHECK:   br i1 {{.*}}, label %middle.block, label %vector.body
-
-define i64 @predicated_udiv_scalarized_operand(i64* %a, i1 %c, i64 %x) optsize {
+; COST-LABEL:  predicated_udiv_scalarized_operand
+; COST:        LV: Found an estimated cost of 4 for VF 2 For instruction: %tmp4 = udiv i64 %tmp2, %tmp3
+;
+; CHECK-LABEL: @predicated_udiv_scalarized_operand(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %entry ], [ [[INDEX_NEXT:%.*]], %[[PRED_UDIV_CONTINUE2:.*]] ]
+; CHECK-NEXT:    [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, %entry ], [ [[TMP17:%.*]], %[[PRED_UDIV_CONTINUE2]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i64, i64* %a, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i64* [[TMP0]] to <2 x i64>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i64>, <2 x i64>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp sgt <2 x i64> [[WIDE_LOAD]], zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i1> [[TMP2]], i32 0
+; CHECK-NEXT:    br i1 [[TMP3]], label %[[PRED_UDIV_IF:.*]], label %[[PRED_UDIV_CONTINUE:.*]]
+; CHECK:       [[PRED_UDIV_IF]]:
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = add nsw i64 [[TMP5]], %x
+; CHECK-NEXT:    [[TMP7:%.*]] = udiv i64 [[TMP4]], [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <2 x i64> undef, i64 [[TMP7]], i32 0
+; CHECK-NEXT:    br label %[[PRED_UDIV_CONTINUE]]
+; CHECK:       [[PRED_UDIV_CONTINUE]]:
+; CHECK-NEXT:    [[TMP9:%.*]] = phi <2 x i64> [ undef, %vector.body ], [ [[TMP8]], %[[PRED_UDIV_IF]] ]
+; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x i1> [[TMP2]], i32 1
+; CHECK-NEXT:    br i1 [[TMP10]], label %[[PRED_UDIV_IF1:.*]], label %[[PRED_UDIV_CONTINUE2]]
+; CHECK:       [[PRED_UDIV_IF1]]:
+; CHECK-NEXT:    [[TMP11:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <2 x i64> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT:    [[TMP13:%.*]] = add nsw i64 [[TMP12]], %x
+; CHECK-NEXT:    [[TMP14:%.*]] = udiv i64 [[TMP11]], [[TMP13]]
+; CHECK-NEXT:    [[TMP15:%.*]] = insertelement <2 x i64> [[TMP9]], i64 [[TMP14]], i32 1
+; CHECK-NEXT:    br label %[[PRED_UDIV_CONTINUE2]]
+; CHECK:       [[PRED_UDIV_CONTINUE2]]:
+; CHECK-NEXT:    [[TMP16:%.*]] = phi <2 x i64> [ [[TMP9]], %[[PRED_UDIV_CONTINUE]] ], [ [[TMP15]], %[[PRED_UDIV_IF1]] ]
+; CHECK-NEXT:    [[PREDPHI:%.*]] = select <2 x i1> [[TMP2]], <2 x i64> [[TMP16]], <2 x i64> [[WIDE_LOAD]]
+; CHECK-NEXT:    [[TMP17]] = add <2 x i64> [[VEC_PHI]], [[PREDPHI]]
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define i64 @predicated_udiv_scalarized_operand(i64* %a, i64 %x) optsize {
 entry:
   br label %for.body
 
@@ -43,7 +58,8 @@ for.body:
   %r = phi i64 [ 0, %entry ], [ %tmp6, %for.inc ]
   %tmp0 = getelementptr inbounds i64, i64* %a, i64 %i
   %tmp2 = load i64, i64* %tmp0, align 4
-  br i1 %c, label %if.then, label %for.inc
+  %cond0 = icmp sgt i64 %tmp2, 0
+  br i1 %cond0, label %if.then, label %for.inc
 
 if.then:
   %tmp3 = add nsw i64 %tmp2, %x
@@ -54,8 +70,8 @@ for.inc:
   %tmp5 = phi i64 [ %tmp2, %for.body ], [ %tmp4, %if.then]
   %tmp6 = add i64 %r, %tmp5
   %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp slt i64 %i.next, 100
-  br i1 %cond, label %for.body, label %for.end
+  %cond1 = icmp slt i64 %i.next, 100
+  br i1 %cond1, label %for.body, label %for.end
 
 for.end:
   %tmp7 = phi i64 [ %tmp6, %for.inc ]
diff --git a/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll b/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
deleted file mode 100644
index fc68adb59df366bea16001f8e4c03ce9e7bc9cd9..0000000000000000000000000000000000000000
--- a/test/Transforms/LoopVectorize/AArch64/first-order-recurrence.ll
+++ /dev/null
@@ -1,341 +0,0 @@
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -dce -instcombine -S | FileCheck %s --check-prefix=UNROLL
-; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC
-
-target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
-
-; CHECK-LABEL: @recurrence_1
-;
-; void recurrence_1(int *a, int *b, int n) {
-;   for(int i = 0; i < n; i++)
-;     b[i] =  a[i] + a[i - 1]
-; }
-;
-; CHECK:  vector.ph:
-; CHECK:    %vector.recur.init = insertelement <4 x i32> undef, i32 %pre_load, i32 3
-;
-; CHECK:  vector.body:
-; CHECK:    %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
-; CHECK:    [[L1]] = load <4 x i32>
-; CHECK:    {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; CHECK:  middle.block:
-; CHECK:    %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
-;
-; CHECK:  scalar.ph:
-; CHECK:    %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %min.iters.checked ], [ %pre_load, %for.preheader ]
-;
-; CHECK:  scalar.body:
-; CHECK:    %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
-;
-; UNROLL: vector.body:
-; UNROLL:   %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
-; UNROLL:   [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32>
-; UNROLL:   [[L2]] = load <4 x i32>
-; UNROLL:   {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL:   {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; UNROLL: middle.block:
-; UNROLL:   %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3
-;
-define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n) {
-entry:
-  br label %for.preheader
-
-for.preheader:
-  %arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 0
-  %pre_load = load i32, i32* %arrayidx.phi.trans.insert
-  br label %scalar.body
-
-scalar.body:
-  %0 = phi i32 [ %pre_load, %for.preheader ], [ %1, %scalar.body ]
-  %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ]
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %arrayidx32 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next
-  %1 = load i32, i32* %arrayidx32
-  %arrayidx34 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
-  %add35 = add i32 %1, %0
-  store i32 %add35, i32* %arrayidx34
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %n
-  br i1 %exitcond, label %for.exit, label %scalar.body
-
-for.exit:
-  ret void
-}
-
-; CHECK-LABEL: @recurrence_2
-;
-; int recurrence_2(int *a, int n) {
-;   int minmax;
-;   for (int i = 0; i < n; ++i)
-;     minmax = min(minmax, max(a[i] - a[i-1], 0));
-;   return minmax;
-; }
-;
-; CHECK:  vector.ph:
-; CHECK:    %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3
-;
-; CHECK:  vector.body:
-; CHECK:    %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
-; CHECK:    [[L1]] = load <4 x i32>
-; CHECK:    {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; CHECK:  middle.block:
-; CHECK:    %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
-;
-; CHECK:  scalar.ph:
-; CHECK:    %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %min.iters.checked ], [ %.pre, %for.preheader ]
-;
-; CHECK:  scalar.body:
-; CHECK:    %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
-;
-; UNROLL: vector.body:
-; UNROLL:   %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
-; UNROLL:   [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32>
-; UNROLL:   [[L2]] = load <4 x i32>
-; UNROLL:   {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL:   {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; UNROLL: middle.block:
-; UNROLL:   %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3
-;
-define i32 @recurrence_2(i32* nocapture readonly %a, i32 %n) {
-entry:
-  %cmp27 = icmp sgt i32 %n, 0
-  br i1 %cmp27, label %for.preheader, label %for.cond.cleanup
-
-for.preheader:
-  %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 -1
-  %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4
-  br label %scalar.body
-
-for.cond.cleanup.loopexit:
-  %minmax.0.cond.lcssa = phi i32 [ %minmax.0.cond, %scalar.body ]
-  br label %for.cond.cleanup
-
-for.cond.cleanup:
-  %minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ]
-  ret i32 %minmax.0.lcssa
-
-scalar.body:
-  %0 = phi i32 [ %.pre, %for.preheader ], [ %1, %scalar.body ]
-  %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ]
-  %minmax.028 = phi i32 [ undef, %for.preheader ], [ %minmax.0.cond, %scalar.body ]
-  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
-  %1 = load i32, i32* %arrayidx, align 4
-  %sub3 = sub nsw i32 %1, %0
-  %cmp4 = icmp sgt i32 %sub3, 0
-  %cond = select i1 %cmp4, i32 %sub3, i32 0
-  %cmp5 = icmp slt i32 %minmax.028, %cond
-  %minmax.0.cond = select i1 %cmp5, i32 %minmax.028, i32 %cond
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
-  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %n
-  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %scalar.body
-}
-
-; CHECK-LABEL: @recurrence_3
-;
-; void recurrence_3(short *a, double *b, int n, float f, short p) {
-;   b[0] = (double)a[0] - f * (double)p;
-;   for (int i = 1; i < n; i++)
-;     b[i] = (double)a[i] - f * (double)a[i - 1];
-; }
-;
-;
-; CHECK:  vector.ph:
-; CHECK:    %vector.recur.init = insertelement <4 x i16> undef, i16 %0, i32 3
-;
-; CHECK:  vector.body:
-; CHECK:    %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
-; CHECK:    [[L1]] = load <4 x i16>
-; CHECK:    {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; CHECK:  middle.block:
-; CHECK:    %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3
-;
-; CHECK:  scalar.ph:
-; CHECK:    %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %min.iters.checked ], [ %0, %for.preheader ]
-;
-; CHECK:  scalar.body:
-; CHECK:    %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
-;
-; UNROLL: vector.body:
-; UNROLL:   %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
-; UNROLL:   [[L1:%[a-zA-Z0-9.]+]] = load <4 x i16>
-; UNROLL:   [[L2]] = load <4 x i16>
-; UNROLL:   {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL:   {{.*}} = shufflevector <4 x i16> [[L1]], <4 x i16> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; UNROLL: middle.block:
-; UNROLL:   %vector.recur.extract = extractelement <4 x i16> [[L2]], i32 3
-;
-define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32 %n, float %f, i16 %p) {
-entry:
-  %0 = load i16, i16* %a, align 2
-  %conv = sitofp i16 %0 to double
-  %conv1 = fpext float %f to double
-  %conv2 = sitofp i16 %p to double
-  %mul = fmul fast double %conv2, %conv1
-  %sub = fsub fast double %conv, %mul
-  store double %sub, double* %b, align 8
-  %cmp25 = icmp sgt i32 %n, 1
-  br i1 %cmp25, label %for.preheader, label %for.end
-
-for.preheader:
-  br label %scalar.body
-
-scalar.body:
-  %1 = phi i16 [ %0, %for.preheader ], [ %2, %scalar.body ]
-  %advars.iv = phi i64 [ %advars.iv.next, %scalar.body ], [ 1, %for.preheader ]
-  %arrayidx5 = getelementptr inbounds i16, i16* %a, i64 %advars.iv
-  %2 = load i16, i16* %arrayidx5, align 2
-  %conv6 = sitofp i16 %2 to double
-  %conv11 = sitofp i16 %1 to double
-  %mul12 = fmul fast double %conv11, %conv1
-  %sub13 = fsub fast double %conv6, %mul12
-  %arrayidx15 = getelementptr inbounds double, double* %b, i64 %advars.iv
-  store double %sub13, double* %arrayidx15, align 8
-  %advars.iv.next = add nuw nsw i64 %advars.iv, 1
-  %lftr.wideiv = trunc i64 %advars.iv.next to i32
-  %exitcond = icmp eq i32 %lftr.wideiv, %n
-  br i1 %exitcond, label %for.end.loopexit, label %scalar.body
-
-for.end.loopexit:
-  br label %for.end
-
-for.end:
-  ret void
-}
-
-; CHECK-LABEL: @PR26734
-;
-; void PR26734(short *a, int *b, int *c, int d, short *e) {
-;   for (; d != 21; d++) {
-;     *b &= *c;
-;     *e = *a - 6;
-;     *c = *e;
-;   }
-; }
-;
-; CHECK-NOT: vector.ph:
-;
-define void @PR26734(i16* %a, i32* %b, i32* %c, i32 %d, i16* %e) {
-entry:
-  %cmp4 = icmp eq i32 %d, 21
-  br i1 %cmp4, label %entry.for.end_crit_edge, label %for.body.lr.ph
-
-entry.for.end_crit_edge:
-  %.pre = load i32, i32* %b, align 4
-  br label %for.end
-
-for.body.lr.ph:
-  %0 = load i16, i16* %a, align 2
-  %sub = add i16 %0, -6
-  %conv2 = sext i16 %sub to i32
-  %c.promoted = load i32, i32* %c, align 4
-  %b.promoted = load i32, i32* %b, align 4
-  br label %for.body
-
-for.body:
-  %inc7 = phi i32 [ %d, %for.body.lr.ph ], [ %inc, %for.body ]
-  %and6 = phi i32 [ %b.promoted, %for.body.lr.ph ], [ %and, %for.body ]
-  %conv25 = phi i32 [ %c.promoted, %for.body.lr.ph ], [ %conv2, %for.body ]
-  %and = and i32 %and6, %conv25
-  %inc = add nsw i32 %inc7, 1
-  %cmp = icmp eq i32 %inc, 21
-  br i1 %cmp, label %for.cond.for.end_crit_edge, label %for.body
-
-for.cond.for.end_crit_edge:
-  %and.lcssa = phi i32 [ %and, %for.body ]
-  store i32 %conv2, i32* %c, align 4
-  store i32 %and.lcssa, i32* %b, align 4
-  store i16 %sub, i16* %e, align 2
-  br label %for.end
-
-for.end:
-  ret void
-}
-
-; CHECK-LABEL: @PR27246
-;
-; int PR27246() {
-;   unsigned int e, n;
-;   for (int i = 1; i < 49; ++i) {
-;     for (int k = i; k > 1; --k)
-;       e = k;
-;     n = e;
-;   }
-;   return n;
-; }
-;
-; CHECK-NOT: vector.ph:
-;
-define i32 @PR27246() {
-entry:
-  br label %for.cond1.preheader
-
-for.cond1.preheader:
-  %i.016 = phi i32 [ 1, %entry ], [ %inc, %for.cond.cleanup3 ]
-  %e.015 = phi i32 [ undef, %entry ], [ %e.1.lcssa, %for.cond.cleanup3 ]
-  br label %for.cond1
-
-for.cond.cleanup:
-  %e.1.lcssa.lcssa = phi i32 [ %e.1.lcssa, %for.cond.cleanup3 ]
-  ret i32 %e.1.lcssa.lcssa
-
-for.cond1:
-  %e.1 = phi i32 [ %k.0, %for.cond1 ], [ %e.015, %for.cond1.preheader ]
-  %k.0 = phi i32 [ %dec, %for.cond1 ], [ %i.016, %for.cond1.preheader ]
-  %cmp2 = icmp sgt i32 %k.0, 1
-  %dec = add nsw i32 %k.0, -1
-  br i1 %cmp2, label %for.cond1, label %for.cond.cleanup3
-
-for.cond.cleanup3:
-  %e.1.lcssa = phi i32 [ %e.1, %for.cond1 ]
-  %inc = add nuw nsw i32 %i.016, 1
-  %exitcond = icmp eq i32 %inc, 49
-  br i1 %exitcond, label %for.cond.cleanup, label %for.cond1.preheader
-}
-
-; CHECK-LABEL: @PR29559
-;
-; UNROLL-NO-IC: vector.ph:
-; UNROLL-NO-IC:   br label %vector.body
-;
-; UNROLL-NO-IC: vector.body:
-; UNROLL-NO-IC:   %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; UNROLL-NO-IC:   %vector.recur = phi <4 x float*> [ undef, %vector.ph ], [ %[[I4:.+]], %vector.body ]
-; UNROLL-NO-IC:   %[[G1:.+]] = getelementptr inbounds [3 x float], [3 x float]* undef, i64 0, i64 0
-; UNROLL-NO-IC:   %[[I1:.+]] = insertelement <4 x float*> undef, float* %[[G1]], i32 0
-; UNROLL-NO-IC:   %[[I2:.+]] = insertelement <4 x float*> %[[I1]], float* %[[G1]], i32 1
-; UNROLL-NO-IC:   %[[I3:.+]] = insertelement <4 x float*> %[[I2]], float* %[[G1]], i32 2
-; UNROLL-NO-IC:   %[[I4]] = insertelement <4 x float*> %[[I3]], float* %[[G1]], i32 3
-; UNROLL-NO-IC:   {{.*}} = shufflevector <4 x float*> %vector.recur, <4 x float*> %[[I4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-; UNROLL-NO-IC:   {{.*}} = shufflevector <4 x float*> %[[I4]], <4 x float*> %[[I4]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
-;
-; UNROLL-NO-IC: middle.block:
-; UNROLL-NO-IC:   %vector.recur.extract = extractelement <4 x float*> %[[I4]], i32 3
-;
-; UNROLL-NO-IC: scalar.ph:
-; UNROLL-NO-IC:   %scalar.recur.init = phi float* [ %vector.recur.extract, %middle.block ], [ undef, %min.iters.checked ], [ undef, %entry ]
-;
-; UNROLL-NO-IC: scalar.body:
-; UNROLL-NO-IC:   %scalar.recur = phi float* [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
-;
-define void @PR29559() {
-entry:
-  br label %scalar.body
-
-scalar.body:
-  %i = phi i64 [ 0, %entry ], [ %i.next, %scalar.body ]
-  %tmp2 = phi float* [ undef, %entry ], [ %tmp3, %scalar.body ]
-  %tmp3 = getelementptr inbounds [3 x float], [3 x float]* undef, i64 0, i64 0
-  %i.next = add nuw nsw i64 %i, 1
-  %cond = icmp eq i64 %i.next, undef
-  br i1 %cond, label %for.end, label %scalar.body
-
-for.end:
-  ret void
-}
diff --git a/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll b/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e8ef425623564529fd163a3ce427beab21d18aa8
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/induction-trunc.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -force-vector-width=1 -force-vector-interleave=2 -loop-vectorize -S | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: @non_primary_iv_trunc_free(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
+; CHECK-NEXT:    [[INDUCTION:%.*]] = add i64 [[OFFSET_IDX]], 0
+; CHECK-NEXT:    [[INDUCTION1:%.*]] = add i64 [[OFFSET_IDX]], 5
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc i64 [[INDUCTION]] to i32
+; CHECK-NEXT:    [[TMP5:%.*]] = trunc i64 [[INDUCTION1]] to i32
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @non_primary_iv_trunc_free(i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = trunc i64 %i to i32
+  %i.next = add nuw nsw i64 %i, 5
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll b/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0ebb7a92edaea24f0e4a9f613f6a2c30b92c1fe7
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
@@ -0,0 +1,38 @@
+; REQUIRES: asserts
+; RUN: opt < %s -force-vector-width=2 -force-vector-interleave=1 -loop-vectorize -S --debug-only=loop-vectorize 2>&1 | FileCheck %s
+
+; This test shows extremely high interleaving cost that, probably, should be fixed.
+; Due to the high cost, interleaving is not beneficial and the cost model chooses to scalarize
+; the load instructions.
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+%pair = type { i8, i8 }
+
+; CHECK-LABEL: test
+; CHECK: Found an estimated cost of 20 for VF 2 For instruction:   {{.*}} load i8
+; CHECK: Found an estimated cost of 0 for VF 2 For instruction:   {{.*}} load i8
+; CHECK: vector.body
+; CHECK: load i8
+; CHECK: load i8
+; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @test(%pair* %p, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr %pair, %pair* %p, i64 %i, i32 0
+  %tmp1 = load i8, i8* %tmp0, align 1
+  %tmp2 = getelementptr %pair, %pair* %p, i64 %i, i32 1
+  %tmp3 = load i8, i8* %tmp2, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, %n
+  br i1 %cond, label %for.end, label %for.body
+
+for.end:
+  ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
index df1f9c6194085c7890e7f3ef1174fe28146b127d..54ee8fc6e73fd3e9059c042990820d8506f50d61 100644
--- a/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
+++ b/test/Transforms/LoopVectorize/AArch64/interleaved_cost.ll
@@ -1,81 +1,189 @@
-; RUN: opt -S -debug-only=loop-vectorize -loop-vectorize -instcombine < %s 2>&1 | FileCheck %s
+; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
+; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
+; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
+; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "aarch64--linux-gnueabi"
 
-@AB = common global [1024 x i8] zeroinitializer, align 4
-@CD = common global [1024 x i8] zeroinitializer, align 4
+%i8.2 = type {i8, i8}
+define void @i8_factor_2(%i8.2* %data, i64 %n) {
+entry:
+  br label %for.body
+
+; VF_8-LABEL:  Checking a loop in "i8_factor_2"
+; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_16-LABEL: Checking a loop in "i8_factor_2"
+; VF_16:         Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_16-NEXT:    Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1
+  %tmp2 = load i8, i8* %tmp0, align 1
+  %tmp3 = load i8, i8* %tmp1, align 1
+  store i8 0, i8* %tmp0, align 1
+  store i8 0, i8* %tmp1, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%i16.2 = type {i16, i16}
+define void @i16_factor_2(%i16.2* %data, i64 %n) {
+entry:
+  br label %for.body
+
+; VF_4-LABEL: Checking a loop in "i16_factor_2"
+; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_8-LABEL:  Checking a loop in "i16_factor_2"
+; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_16-LABEL: Checking a loop in "i16_factor_2"
+; VF_16:         Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1
+  %tmp2 = load i16, i16* %tmp0, align 2
+  %tmp3 = load i16, i16* %tmp1, align 2
+  store i16 0, i16* %tmp0, align 2
+  store i16 0, i16* %tmp1, align 2
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
 
-define void @test_byte_interleaved_cost(i8 %C, i8 %D) {
+for.end:
+  ret void
+}
+
+%i32.2 = type {i32, i32}
+define void @i32_factor_2(%i32.2* %data, i64 %n) {
 entry:
   br label %for.body
 
-; 8xi8 and 16xi8 are valid i8 vector types, so the cost of the interleaved
-; access group is 2.
-
-; CHECK: LV: Checking a loop in "test_byte_interleaved_cost"
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %tmp = load i8, i8* %arrayidx0, align 4
-; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction:   %tmp = load i8, i8* %arrayidx0, align 4
-
-for.body:                                         ; preds = %for.body, %entry
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %indvars.iv
-  %tmp = load i8, i8* %arrayidx0, align 4
-  %tmp1 = or i64 %indvars.iv, 1
-  %arrayidx1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %tmp1
-  %tmp2 = load i8, i8* %arrayidx1, align 4
-  %add = add nsw i8 %tmp, %C
-  %mul = mul nsw i8 %tmp2, %D
-  %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %indvars.iv
-  store i8 %add, i8* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %tmp1
-  store i8 %mul, i8* %arrayidx3, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
-  %cmp = icmp slt i64 %indvars.iv.next, 1024
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body
+; VF_2-LABEL:  Checking a loop in "i32_factor_2"
+; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_4-LABEL:  Checking a loop in "i32_factor_2"
+; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_8-LABEL:  Checking a loop in "i32_factor_2"
+; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_16-LABEL: Checking a loop in "i32_factor_2"
+; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1
+  %tmp2 = load i32, i32* %tmp0, align 4
+  %tmp3 = load i32, i32* %tmp1, align 4
+  store i32 0, i32* %tmp0, align 4
+  store i32 0, i32* %tmp1, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
   ret void
 }
 
-%ig.factor.8 = type { double*, double, double, double, double, double, double, double }
-define double @wide_interleaved_group(%ig.factor.8* %s, double %a, double %b, i32 %n) {
+%i64.2 = type {i64, i64}
+define void @i64_factor_2(%i64.2* %data, i64 %n) {
 entry:
   br label %for.body
 
-; Check the default cost of a strided load with a factor that is greater than
-; the maximum allowed. In this test, the interleave factor would be 8, which is
-; not supported.
+; VF_2-LABEL:  Checking a loop in "i64_factor_2"
+; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_4-LABEL:  Checking a loop in "i64_factor_2"
+; VF_4:          Found an estimated cost of 4 for VF 4 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_4-NEXT:     Found an estimated cost of 4 for VF 4 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_8-LABEL:  Checking a loop in "i64_factor_2"
+; VF_8:          Found an estimated cost of 8 for VF 8 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_8-NEXT:     Found an estimated cost of 8 for VF 8 For instruction: store i64 0, i64* %tmp1, align 8
+; VF_16-LABEL: Checking a loop in "i64_factor_2"
+; VF_16:         Found an estimated cost of 16 for VF 16 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_16-NEXT:    Found an estimated cost of 16 for VF 16 For instruction: store i64 0, i64* %tmp1, align 8
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i64.2, %i64.2* %data, i64 %i, i32 1
+  %tmp2 = load i64, i64* %tmp0, align 8
+  %tmp3 = load i64, i64* %tmp1, align 8
+  store i64 0, i64* %tmp0, align 8
+  store i64 0, i64* %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
 
-; CHECK: LV: Checking a loop in "wide_interleaved_group"
-; CHECK: LV: Found an estimated cost of 6 for VF 2 For instruction:   %1 = load double, double* %0, align 8
-; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   %5 = load double, double* %4, align 8
-; CHECK: LV: Found an estimated cost of 10 for VF 2 For instruction:   store double %9, double* %10, align 8
+%i64.8 = type {i64, i64, i64, i64, i64, i64, i64, i64}
+define void @i64_factor_8(%i64.8* %data, i64 %n) {
+entry:
+  br label %for.body
 
+; The interleave factor in this test is 8, which is greater than the maximum
+; allowed factor for AArch64 (4). Thus, we will fall back to the basic TTI
+; implementation for determining the cost of the interleaved load group. The
+; stores do not form a legal interleaved group because the group would contain
+; gaps.
+;
+; VF_2-LABEL: Checking a loop in "i64_factor_8"
+; VF_2:         Found an estimated cost of 6 for VF 2 For instruction: %tmp2 = load i64, i64* %tmp0, align 8
+; VF_2-NEXT:    Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i64, i64* %tmp1, align 8
+; VF_2-NEXT:    Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp0, align 8
+; VF_2-NEXT:    Found an estimated cost of 7 for VF 2 For instruction: store i64 0, i64* %tmp1, align 8
 for.body:
   %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
-  %r = phi double [ 0.000000e+00, %entry ], [ %12, %for.body ]
-  %0 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 2
-  %1 = load double, double* %0, align 8
-  %2 = fcmp fast olt double %1, %a
-  %3 = select i1 %2, double 0.000000e+00, double %1
-  %4 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 6
-  %5 = load double, double* %4, align 8
-  %6 = fcmp fast olt double %5, %a
-  %7 = select i1 %6, double 0.000000e+00, double %5
-  %8 = fmul fast double %7, %b
-  %9 = fadd fast double %8, %3
-  %10 = getelementptr inbounds %ig.factor.8, %ig.factor.8* %s, i64 %i, i32 3
-  store double %9, double* %10, align 8
-  %11 = fmul fast double %9, %9
-  %12 = fadd fast double %11, %r
+  %tmp0 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 2
+  %tmp1 = getelementptr inbounds %i64.8, %i64.8* %data, i64 %i, i32 6
+  %tmp2 = load i64, i64* %tmp0, align 8
+  %tmp3 = load i64, i64* %tmp1, align 8
+  store i64 0, i64* %tmp0, align 8
+  store i64 0, i64* %tmp1, align 8
   %i.next = add nuw nsw i64 %i, 1
-  %13 = trunc i64 %i.next to i32
-  %cond = icmp eq i32 %13, %n
-  br i1 %cond, label %for.exit, label %for.body
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
 
-for.exit:
-  %r.lcssa = phi double [ %12, %for.body ]
-  ret double %r.lcssa
+for.end:
+  ret void
 }
diff --git a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
index c7ced757581aa399264f0afe85be0c782a4ae8d1..e090ddf1d1aaea298712aabe2556ad824f4549a5 100644
--- a/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
+++ b/test/Transforms/LoopVectorize/AArch64/loop-vectorization-factors.ll
@@ -235,10 +235,13 @@ for.body:                                         ; preds = %entry, %for.body
 }
 
 ; CHECK-LABEL: @add_phifail2(
-; CHECK: load <16 x i8>, <16 x i8>*
-; CHECK: add nuw nsw <16 x i32>
-; CHECK: store <16 x i8>
+; CHECK-NOT: load <16 x i8>, <16 x i8>*
+; CHECK-NOT: add nuw nsw <16 x i32>
+; CHECK-NOT: store <16 x i8>
 ; Function Attrs: nounwind
+; FIXME: Currently, if we vectorize this loop, we will generate incorrect code
+; if VF evenly divides %len. Vectorized loop code gen returns a_phi = p[len - 1],
+; whereas it should be the previous value, a_phi = p[len - 2].
 define i8 @add_phifail2(i8* noalias nocapture readonly %p, i8* noalias nocapture %q, i32 %len) #0 {
 entry:
   br label %for.body
diff --git a/test/Transforms/LoopVectorize/AArch64/pr31900.ll b/test/Transforms/LoopVectorize/AArch64/pr31900.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5ea38a4a246dc9cfc7d96abd61d3eecff6b3d9b0
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/pr31900.ll
@@ -0,0 +1,37 @@
+; RUN: opt -S -mtriple=aarch64-apple-ios -loop-vectorize -enable-interleaved-mem-accesses -force-vector-width=2 < %s | FileCheck %s
+
+; Reproducer for address space fault in the LoopVectorizer (pr31900). Added
+; different sized address space pointers (p:16:16-p4:32:16) to the aarch64
+; datalayout to reproduce the fault.
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128-p:16:16-p4:32:16"
+
+; Check that all the loads are scalarized
+; CHECK: load i16, i16*
+; CHECK: load i16, i16*
+; CHECK: load i16, i16 addrspace(4)*
+; CHECK: load i16, i16 addrspace(4)*
+
+%rec1445 = type { i16, i16, i16, i16, i16 }
+
+define void @foo() {  ; walks two %rec1445 pointers (default address space and addrspace(4)) in lock step
+bb1:
+  br label %bb4
+
+bb4:
+  %tmp1 = phi i16 [ undef, %bb1 ], [ %_tmp1013, %bb4 ]  ; loop counter; initial value is undef
+  %tmp2 = phi %rec1445* [ undef, %bb1 ], [ %_tmp1015, %bb4 ]  ; default address space pointer (p:16:16 in the datalayout)
+  %tmp3 = phi %rec1445 addrspace(4)* [ undef, %bb1 ], [ %_tmp1017, %bb4 ]  ; addrspace(4) pointer (p4:32:16 in the datalayout)
+  %0 = getelementptr %rec1445, %rec1445* %tmp2, i16 0, i32 1  ; field 1 of the current record
+  %_tmp987 = load i16, i16* %0, align 1
+  %1 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 0, i32 1  ; field 1 of the current addrspace(4) record
+  %_tmp993 = load i16, i16 addrspace(4)* %1, align 1
+  %_tmp1013 = add i16 %tmp1, 1
+  %_tmp1015 = getelementptr %rec1445, %rec1445* %tmp2, i16 1  ; advance to the next record
+  %_tmp1017 = getelementptr %rec1445, %rec1445 addrspace(4)* %tmp3, i32 1  ; advance to the next addrspace(4) record
+  %_tmp1019 = icmp ult i16 %_tmp1013, 24  ; keep looping while the incremented counter is below 24
+  br i1 %_tmp1019, label %bb4, label %bb16
+
+bb16:
+  unreachable
+}
diff --git a/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll b/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1ae7dadeffd7f71d96d4d36b1f22fbfc8d4dbd38
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AArch64/smallest-and-widest-types.ll
@@ -0,0 +1,33 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+; CHECK-LABEL: Checking a loop in "interleaved_access"
+; CHECK:         The Smallest and Widest types: 64 / 64 bits
+;
+define void @interleaved_access(i8** %A, i64 %N) {  ; stores null to A[i] through A[i+3] per iteration, stepping %i by 4
+for.ph:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next.3, %for.body ], [ 0, %for.ph ]
+  %tmp0 = getelementptr inbounds i8*, i8** %A, i64 %i
+  store i8* null, i8** %tmp0, align 8  ; A[i] = null
+  %i.next.0 = add nuw nsw i64 %i, 1
+  %tmp1 = getelementptr inbounds i8*, i8** %A, i64 %i.next.0
+  store i8* null, i8** %tmp1, align 8  ; A[i + 1] = null
+  %i.next.1 = add nsw i64 %i, 2
+  %tmp2 = getelementptr inbounds i8*, i8** %A, i64 %i.next.1
+  store i8* null, i8** %tmp2, align 8  ; A[i + 2] = null
+  %i.next.2 = add nsw i64 %i, 3
+  %tmp3 = getelementptr inbounds i8*, i8** %A, i64 %i.next.2
+  store i8* null, i8** %tmp3, align 8  ; A[i + 3] = null
+  %i.next.3 = add nsw i64 %i, 4  ; induction variable advances by 4 per iteration
+  %cond = icmp slt i64 %i.next.3, %N
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg b/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..2a665f06be72e5515ca6e27018facb35daa201be
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AMDGPU/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AMDGPU' in config.root.targets:
+    config.unsupported = True
diff --git a/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll b/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f303ed5377e202ec8f62939a6c00bdd2613b26d1
--- /dev/null
+++ b/test/Transforms/LoopVectorize/AMDGPU/unroll-in-loop-vectorizer.ll
@@ -0,0 +1,28 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji -loop-vectorize < %s | FileCheck %s
+
+
+; For AMDGPU, loop unrolling in the loop vectorizer is disabled when VF==1.
+;
+; CHECK-LABEL: @small_loop(
+; CHECK: store i32
+; CHECK-NOT: store i32
+; CHECK: ret
+define amdgpu_kernel void @small_loop(i32* nocapture %inArray, i32 %size) nounwind {  ; adds 6 to each of the first %size elements of %inArray
+entry:
+  %0 = icmp sgt i32 %size, 0  ; the loop is skipped entirely when %size <= 0
+  br i1 %0, label %loop, label %exit
+
+loop:                                          ; preds = %entry, %loop
+  %iv = phi i32 [ %iv1, %loop ], [ 0, %entry ]
+  %1 = getelementptr inbounds i32, i32* %inArray, i32 %iv
+  %2 = load i32, i32* %1, align 4
+  %3 = add nsw i32 %2, 6
+  store i32 %3, i32* %1, align 4  ; inArray[iv] = inArray[iv] + 6
+  %iv1 = add i32 %iv, 1
+;  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %cond = icmp eq i32 %iv1, %size
+  br i1 %cond, label %exit, label %loop
+
+exit:                                         ; preds = %loop, %entry
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll b/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
index de3626b57d831c9e5c163dafea491189376a6285..29adec049f674995526300708ce5c69d67f85b61 100644
--- a/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
+++ b/test/Transforms/LoopVectorize/ARM/interleaved_cost.ll
@@ -1,39 +1,147 @@
-; RUN: opt -S -debug-only=loop-vectorize -loop-vectorize -instcombine  < %s 2>&1 | FileCheck %s
+; RUN: opt -loop-vectorize -force-vector-width=2 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_2
+; RUN: opt -loop-vectorize -force-vector-width=4 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_4
+; RUN: opt -loop-vectorize -force-vector-width=8 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_8
+; RUN: opt -loop-vectorize -force-vector-width=16 -debug-only=loop-vectorize -disable-output < %s 2>&1 | FileCheck %s --check-prefix=VF_16
 ; REQUIRES: asserts
 
 target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
 target triple = "armv8--linux-gnueabihf"
 
-@AB = common global [1024 x i8] zeroinitializer, align 4
-@CD = common global [1024 x i8] zeroinitializer, align 4
+%i8.2 = type {i8, i8}
+define void @i8_factor_2(%i8.2* %data, i64 %n) {
+entry:
+  br label %for.body
+
+; VF_8-LABEL:  Checking a loop in "i8_factor_2"
+; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i8 0, i8* %tmp1, align 1
+; VF_16-LABEL: Checking a loop in "i8_factor_2"
+; VF_16:         Found an estimated cost of 2 for VF 16 For instruction: %tmp2 = load i8, i8* %tmp0, align 1
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i8, i8* %tmp1, align 1
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i8 0, i8* %tmp0, align 1
+; VF_16-NEXT:    Found an estimated cost of 2 for VF 16 For instruction: store i8 0, i8* %tmp1, align 1
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i8.2, %i8.2* %data, i64 %i, i32 1
+  %tmp2 = load i8, i8* %tmp0, align 1
+  %tmp3 = load i8, i8* %tmp1, align 1
+  store i8 0, i8* %tmp0, align 1
+  store i8 0, i8* %tmp1, align 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
 
-define void @test_byte_interleaved_cost(i8 %C, i8 %D) {
+%i16.2 = type {i16, i16}
+define void @i16_factor_2(%i16.2* %data, i64 %n) {
 entry:
   br label %for.body
 
-; 8xi8 and 16xi8 are valid i8 vector types, so the cost of the interleaved
-; access group is 2.
-
-; CHECK: LV: Found an estimated cost of 2 for VF 8 For instruction:   %tmp = load i8, i8* %arrayidx0, align 4
-; CHECK: LV: Found an estimated cost of 2 for VF 16 For instruction:   %tmp = load i8, i8* %arrayidx0, align 4
-
-for.body:                                         ; preds = %for.body, %entry
-  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  %arrayidx0 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %indvars.iv
-  %tmp = load i8, i8* %arrayidx0, align 4
-  %tmp1 = or i64 %indvars.iv, 1
-  %arrayidx1 = getelementptr inbounds [1024 x i8], [1024 x i8]* @AB, i64 0, i64 %tmp1
-  %tmp2 = load i8, i8* %arrayidx1, align 4
-  %add = add nsw i8 %tmp, %C
-  %mul = mul nsw i8 %tmp2, %D
-  %arrayidx2 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %indvars.iv
-  store i8 %add, i8* %arrayidx2, align 4
-  %arrayidx3 = getelementptr inbounds [1024 x i8], [1024 x i8]* @CD, i64 0, i64 %tmp1
-  store i8 %mul, i8* %arrayidx3, align 4
-  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 2
-  %cmp = icmp slt i64 %indvars.iv.next, 1024
-  br i1 %cmp, label %for.body, label %for.end
-
-for.end:                                          ; preds = %for.body
+; VF_4-LABEL:  Checking a loop in "i16_factor_2"
+; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_8-LABEL:  Checking a loop in "i16_factor_2"
+; VF_8:          Found an estimated cost of 2 for VF 8 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_8-NEXT:     Found an estimated cost of 2 for VF 8 For instruction: store i16 0, i16* %tmp1, align 2
+; VF_16-LABEL: Checking a loop in "i16_factor_2"
+; VF_16:         Found an estimated cost of 4 for VF 16 For instruction: %tmp2 = load i16, i16* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i16, i16* %tmp1, align 2
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i16 0, i16* %tmp0, align 2
+; VF_16-NEXT:    Found an estimated cost of 4 for VF 16 For instruction: store i16 0, i16* %tmp1, align 2
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i16.2, %i16.2* %data, i64 %i, i32 1
+  %tmp2 = load i16, i16* %tmp0, align 2
+  %tmp3 = load i16, i16* %tmp1, align 2
+  store i16 0, i16* %tmp0, align 2
+  store i16 0, i16* %tmp1, align 2
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%i32.2 = type {i32, i32}
+define void @i32_factor_2(%i32.2* %data, i64 %n) {
+entry:
+  br label %for.body
+
+; VF_2-LABEL:  Checking a loop in "i32_factor_2"
+; VF_2:          Found an estimated cost of 2 for VF 2 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_2-NEXT:     Found an estimated cost of 0 for VF 2 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_2-NEXT:     Found an estimated cost of 2 for VF 2 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_4-LABEL:  Checking a loop in "i32_factor_2"
+; VF_4:          Found an estimated cost of 2 for VF 4 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_4-NEXT:     Found an estimated cost of 0 for VF 4 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_4-NEXT:     Found an estimated cost of 2 for VF 4 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_8-LABEL:  Checking a loop in "i32_factor_2"
+; VF_8:          Found an estimated cost of 4 for VF 8 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_8-NEXT:     Found an estimated cost of 0 for VF 8 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_8-NEXT:     Found an estimated cost of 4 for VF 8 For instruction: store i32 0, i32* %tmp1, align 4
+; VF_16-LABEL: Checking a loop in "i32_factor_2"
+; VF_16:         Found an estimated cost of 8 for VF 16 For instruction: %tmp2 = load i32, i32* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: %tmp3 = load i32, i32* %tmp1, align 4
+; VF_16-NEXT:    Found an estimated cost of 0 for VF 16 For instruction: store i32 0, i32* %tmp0, align 4
+; VF_16-NEXT:    Found an estimated cost of 8 for VF 16 For instruction: store i32 0, i32* %tmp1, align 4
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %i32.2, %i32.2* %data, i64 %i, i32 1
+  %tmp2 = load i32, i32* %tmp0, align 4
+  %tmp3 = load i32, i32* %tmp1, align 4
+  store i32 0, i32* %tmp0, align 4
+  store i32 0, i32* %tmp1, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+%half.2 = type {half, half}
+define void @half_factor_2(%half.2* %data, i64 %n) {
+entry:
+  br label %for.body
+
+; VF_4-LABEL: Checking a loop in "half_factor_2"
+; VF_4:         Found an estimated cost of 40 for VF 4 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: %tmp3 = load half, half* %tmp1, align 2
+; VF_4-NEXT:    Found an estimated cost of 0 for VF 4 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_4-NEXT:    Found an estimated cost of 32 for VF 4 For instruction: store half 0xH0000, half* %tmp1, align 2
+; VF_8-LABEL: Checking a loop in "half_factor_2"
+; VF_8:         Found an estimated cost of 80 for VF 8 For instruction: %tmp2 = load half, half* %tmp0, align 2
+; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: %tmp3 = load half, half* %tmp1, align 2
+; VF_8-NEXT:    Found an estimated cost of 0 for VF 8 For instruction: store half 0xH0000, half* %tmp0, align 2
+; VF_8-NEXT:    Found an estimated cost of 64 for VF 8 For instruction: store half 0xH0000, half* %tmp1, align 2
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 0
+  %tmp1 = getelementptr inbounds %half.2, %half.2* %data, i64 %i, i32 1
+  %tmp2 = load half, half* %tmp0, align 2
+  %tmp3 = load half, half* %tmp1, align 2
+  store half 0., half* %tmp0, align 2
+  store half 0., half* %tmp1, align 2
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
   ret void
 }
diff --git a/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll b/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d2e5945203321e808a457905cbd530acdcc0edf0
--- /dev/null
+++ b/test/Transforms/LoopVectorize/SystemZ/branch-for-predicated-block.ll
@@ -0,0 +1,38 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN:   -force-vector-width=2 -debug-only=loop-vectorize \
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s
+
+; Check costs for branches inside a vectorized loop around predicated
+; blocks. Each such branch will be guarded with an extractelement from the
+; vector compare plus a test under mask instruction. This cost is modelled on
+; the extractelement of i1.
+
+define void @fun(i32* %arr, i64 %trip.count) {  ; negates every strictly positive element of %arr in place
+entry:
+  br label %for.body
+
+for.body:
+  %indvars.iv = phi i64 [ %indvars.iv.next, %for.inc ], [ 0, %entry ]
+  %arrayidx = getelementptr inbounds i32, i32* %arr, i64 %indvars.iv
+  %l = load i32, i32* %arrayidx, align 4
+  %cmp55 = icmp sgt i32 %l, 0  ; only elements greater than zero take the if.then path
+  br i1 %cmp55, label %if.then, label %for.inc
+
+if.then:  ; conditionally executed block containing the store
+  %sub = sub nsw i32 0, %l
+  store i32 %sub, i32* %arrayidx, align 4  ; arr[i] = -arr[i]
+  br label %for.inc
+
+for.inc:
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, %trip.count
+  br i1 %exitcond, label %for.end.loopexit, label %for.body
+
+for.end.loopexit:
+  ret void
+
+; CHECK: LV: Found an estimated cost of 5 for VF 2 For instruction:   br i1 %cmp55, label %if.then, label %for.inc
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br label %for.inc
+; CHECK: LV: Found an estimated cost of 0 for VF 2 For instruction:   br i1 %exitcond, label %for.end.loopexit, label %for.body
+}
diff --git a/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll b/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e7096c29b9949e0d5ac7f9ff535fe13a2eccf598
--- /dev/null
+++ b/test/Transforms/LoopVectorize/SystemZ/load-store-scalarization-cost.ll
@@ -0,0 +1,33 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN:   -force-vector-width=4 -debug-only=loop-vectorize \
+; RUN:   -disable-output -enable-interleaved-mem-accesses=false < %s 2>&1 | \
+; RUN:   FileCheck %s
+;
+; Check that a scalarized load/store does not get a cost for inserts/
+; extracts, since z13 supports element load/store.
+
+define void @fun(i32* %data, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  %tmp2 = add i32 %tmp1, 1
+  store i32 %tmp2, i32* %tmp0, align 4
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction:   %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction:   store i32 %tmp2, i32* %tmp0, align 4
+
+; CHECK: LV: Scalarizing:  %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Scalarizing:  store i32 %tmp2, i32* %tmp0, align 4
+}
+
diff --git a/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll b/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5c15ee4f2d9f17148216c8e29237b5d4e12a42b3
--- /dev/null
+++ b/test/Transforms/LoopVectorize/SystemZ/mem-interleaving-costs.ll
@@ -0,0 +1,70 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z13 -loop-vectorize \
+; RUN:   -force-vector-width=4 -debug-only=loop-vectorize \
+; RUN:   -disable-output < %s 2>&1 | FileCheck %s
+;
+; Check that the loop vectorizer performs memory interleaving with accurate
+; cost estimations.
+
+
+; Simple case where just the load is interleaved, because the store group
+; would have gaps.
+define void @fun0(i32* %data, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  %tmp2 = add i32 %tmp1, 1
+  store i32 %tmp2, i32* %tmp0, align 4
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+
+; CHECK: LV: Creating an interleave group with:  %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Found an estimated cost of 3 for VF 4 For instruction:   %tmp1 = load i32, i32* %tmp0, align 4
+;        (vl; vl; vperm)
+}
+
+; Interleaving of both load and stores.
+define void @fun1(i32* %data, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds i32, i32* %data, i64 %i
+  %tmp1 = load i32, i32* %tmp0, align 4
+  %i_1  = add i64 %i, 1
+  %tmp2 = getelementptr inbounds i32, i32* %data, i64 %i_1
+  %tmp3 = load i32, i32* %tmp2, align 4
+  store i32 %tmp1, i32* %tmp2, align 4
+  store i32 %tmp3, i32* %tmp0, align 4
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+
+; CHECK: LV: Creating an interleave group with:  store i32 %tmp3, i32* %tmp0, align 4
+; CHECK: LV: Inserted:  store i32 %tmp1, i32* %tmp2, align 4
+; CHECK:     into the interleave group with  store i32 %tmp3, i32* %tmp0, align 4
+; CHECK: LV: Creating an interleave group with:  %tmp3 = load i32, i32* %tmp2, align 4
+; CHECK: LV: Inserted:  %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK:     into the interleave group with  %tmp3 = load i32, i32* %tmp2, align 4
+
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction:   %tmp1 = load i32, i32* %tmp0, align 4
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   %tmp3 = load i32, i32* %tmp2, align 4
+;            (vl; vl; vperm, vpkg)
+
+; CHECK: LV: Found an estimated cost of 0 for VF 4 For instruction:   store i32 %tmp1, i32* %tmp2, align 4
+; CHECK: LV: Found an estimated cost of 4 for VF 4 For instruction:   store i32 %tmp3, i32* %tmp0, align 4
+;            (vmrlf; vmrhf; vst; vst)
+}
+
diff --git a/test/Transforms/LoopVectorize/X86/avx512.ll b/test/Transforms/LoopVectorize/X86/avx512.ll
index fb01454c253b0a4d2be7195044a553f0febcb0ce..1eb1cd3f5d7a6e08f0cde38e76cdb733ddf248cf 100644
--- a/test/Transforms/LoopVectorize/X86/avx512.ll
+++ b/test/Transforms/LoopVectorize/X86/avx512.ll
@@ -7,7 +7,7 @@ target triple = "x86_64-apple-macosx10.9.0"
 ; loop.
 
 ; CHECK-LABEL: f:
-; CHECK: vmovups %zmm{{.}},
+; CHECK: vmovdqu32 %zmm{{.}},
 ; CHECK-NOT: %ymm
 
 define void @f(i32* %a, i32 %n) {
diff --git a/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll b/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
index 32bfcd2275acad1738a8b1df01370a3242c95ae3..82f2e064a5816c2cd080fb1766201d7168cecfcc 100644
--- a/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
+++ b/test/Transforms/LoopVectorize/X86/consecutive-ptr-uniforms.ll
@@ -13,22 +13,33 @@ target triple = "x86_64-unknown-linux-gnu"
 ; scatter operation. %tmp3 (and the induction variable) should not be marked
 ; uniform-after-vectorization.
 ;
-; CHECK:     LV: Found uniform instruction: %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i
-; CHECK-NOT: LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i
-; CHECK-NOT: LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
-; CHECK-NOT: LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5
-; CHECK:     vector.body:
-; CHECK:       %vec.ind = phi <16 x i64>
-; CHECK:       %[[T0:.+]] = extractelement <16 x i64> %vec.ind, i32 0
-; CHECK:       %[[T1:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %[[T0]]
-; CHECK:       %[[T2:.+]] = bitcast float* %[[T1]] to <80 x float>*
-; CHECK:       load <80 x float>, <80 x float>* %[[T2]], align 4
-; CHECK:       %[[T3:.+]] = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %[[T0]]
-; CHECK:       %[[T4:.+]] = bitcast float* %[[T3]] to <80 x float>*
-; CHECK:       load <80 x float>, <80 x float>* %[[T4]], align 4
-; CHECK:       %VectorGep = getelementptr inbounds %data, %data* %d, i64 0, i32 0, <16 x i64> %vec.ind
-; CHECK:       call void @llvm.masked.scatter.v16f32({{.*}}, <16 x float*> %VectorGep, {{.*}})
-; CHECK:       br i1 {{.*}}, label %middle.block, label %vector.body
+; CHECK:       LV: Found uniform instruction: %tmp0 = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 %i
+; CHECK-NOT:   LV: Found uniform instruction: %tmp3 = getelementptr inbounds %data, %data* %d, i64 0, i32 0, i64 %i
+; CHECK-NOT:   LV: Found uniform instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+; CHECK-NOT:   LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 5
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x float> undef, float %x, i32 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x float> [[BROADCAST_SPLATINSERT]], <16 x float> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:    br label %vector.body
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 0, i64 5, i64 10, i64 15, i64 20, i64 25, i64 30, i64 35, i64 40, i64 45, i64 50, i64 55, i64 60, i64 65, i64 70, i64 75>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 5
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 3, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[TMP0]] to <80 x float>*
+; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <80 x float>, <80 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <80 x float> [[WIDE_VEC]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
+; CHECK-NEXT:    [[TMP2:%.*]] = fmul <16 x float> [[BROADCAST_SPLAT]], [[STRIDED_VEC]]
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds %data, %data* %d, i64 0, i32 0, <16 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[BC:%.*]] = bitcast <16 x float*> [[TMP3]] to <16 x <80 x float>*>
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <16 x <80 x float>*> [[BC]], i32 0
+; CHECK-NEXT:    [[WIDE_VEC1:%.*]] = load <80 x float>, <80 x float>* [[TMP4]], align 4
+; CHECK-NEXT:    [[STRIDED_VEC2:%.*]] = shufflevector <80 x float> [[WIDE_VEC1]], <80 x float> undef, <16 x i32> <i32 0, i32 5, i32 10, i32 15, i32 20, i32 25, i32 30, i32 35, i32 40, i32 45, i32 50, i32 55, i32 60, i32 65, i32 70, i32 75>
+; CHECK-NEXT:    [[TMP5:%.*]] = fadd <16 x float> [[STRIDED_VEC2]], [[TMP2]]
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16f32(<16 x float> [[TMP5]], <16 x float*> [[TMP3]], i32 4, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80, i64 80>
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
 
 %data = type { [32000 x float], [3 x i32], [4 x i8], [32000 x float] }
 
diff --git a/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll b/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
new file mode 100644
index 0000000000000000000000000000000000000000..76b6cae5c3b4df1f77406159a8ca766dea499bf9
--- /dev/null
+++ b/test/Transforms/LoopVectorize/X86/gather-vs-interleave.ll
@@ -0,0 +1,41 @@
+; RUN: opt -loop-vectorize -S -mcpu=skylake-avx512  < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; This test checks that the "gather" operation is chosen because its cost is
+; lower than that of the interleaving pattern.
+;
+;unsigned long A[SIZE];
+;unsigned long B[SIZE];
+;
+;void foo() {
+;  for (int i=0; i<N; i+=8) {
+;    B[i] = A[i] + 5;
+;  }
+;}
+
+@A = global [10240 x i64] zeroinitializer, align 16
+@B = global [10240 x i64] zeroinitializer, align 16
+
+
+; CHECK-LABEL: strided_load_i64
+; CHECK: masked.gather
+define void @strided_load_i64() {
+  br label %1
+
+; <label>:1:                                      ; preds = %0, %1
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %1 ]
+  %2 = getelementptr inbounds [10240 x i64], [10240 x i64]* @A, i64 0, i64 %indvars.iv
+  %3 = load i64, i64* %2, align 16
+  %4 = add i64 %3, 5
+  %5 = getelementptr inbounds [10240 x i64], [10240 x i64]* @B, i64 0, i64 %indvars.iv
+  store i64 %4, i64* %5, align 16
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 8
+  %6 = icmp slt i64 %indvars.iv.next, 1024
+  br i1 %6, label %1, label %7
+
+; <label>:7:                                      ; preds = %1
+  ret void
+}
+
diff --git a/test/Transforms/LoopVectorize/X86/int128_no_gather.ll b/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
index fbea275cb40f008444b2ef79dbdd5705a4a145cb..4d7c0b6f64b8732bbeae8e72d54f4762cad5c445 100644
--- a/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
+++ b/test/Transforms/LoopVectorize/X86/int128_no_gather.ll
@@ -71,6 +71,6 @@ declare i32 @printf(i8*, ...) #1
 ; Function Attrs: nounwind
 declare i32 @puts(i8* nocapture readonly) #2
 
-attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pcommit,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
-attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pcommit,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #0 = { noinline nounwind uwtable "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="skylake-avx512" "target-features"="+adx,+aes,+avx,+avx2,+avx512bw,+avx512cd,+avx512dq,+avx512f,+avx512vl,+bmi,+bmi2,+clflushopt,+clwb,+cx16,+f16c,+fma,+fsgsbase,+fxsr,+lzcnt,+mmx,+movbe,+mpx,+pclmul,+pku,+popcnt,+rdrnd,+rdseed,+rtm,+sgx,+sse,+sse2,+sse3,+sse4.1,+sse4.2,+ssse3,+x87,+xsave,+xsavec,+xsaveopt,+xsaves" "unsafe-fp-math"="false" "use-soft-float"="false" }
 attributes #2 = { nounwind }
diff --git a/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/test/Transforms/LoopVectorize/X86/metadata-enable.ll
index 74c0c16086fedb84a5d54c3d378460615605b1a4..e1793bcc3218498f37448bc5ecb9e12d654d68a1 100644
--- a/test/Transforms/LoopVectorize/X86/metadata-enable.ll
+++ b/test/Transforms/LoopVectorize/X86/metadata-enable.ll
@@ -1,13 +1,14 @@
 ; RUN: opt < %s -mcpu=corei7 -O1 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1
 ; RUN: opt < %s -mcpu=corei7 -O2 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O2
-; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3
+; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-threshold=150 -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3
+; RUN: opt < %s -mcpu=corei7 -O3 -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DEFAULT
 ; RUN: opt < %s -mcpu=corei7 -Os -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Os
 ; RUN: opt < %s -mcpu=corei7 -Oz -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=Oz
 ; RUN: opt < %s -mcpu=corei7 -O1 -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC
 ; RUN: opt < %s -mcpu=corei7 -Oz -vectorize-loops -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC
 ; RUN: opt < %s -mcpu=corei7 -O1 -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O1VEC2
 ; RUN: opt < %s -mcpu=corei7 -Oz -loop-vectorize -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=OzVEC2
-; RUN: opt < %s -mcpu=corei7 -O3 -disable-loop-vectorization -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DIS
+; RUN: opt < %s -mcpu=corei7 -O3 -unroll-threshold=150 -disable-loop-vectorization -S -unroll-allow-partial=0 | FileCheck %s --check-prefix=O3DIS
 
 ; This file tests the llvm.loop.vectorize.enable metadata forcing
 ; vectorization even when optimization levels are too low, or when
@@ -25,6 +26,9 @@ target triple = "x86_64-unknown-linux-gnu"
 ; O3-LABEL: @enabled(
 ; O3: store <4 x i32>
 ; O3: ret i32
+; O3DEFAULT-LABEL: @enabled(
+; O3DEFAULT: store <4 x i32>
+; O3DEFAULT: ret i32
 ; Pragma always wins!
 ; O3DIS-LABEL: @enabled(
 ; O3DIS: store <4 x i32>
@@ -77,6 +81,9 @@ for.end:                                          ; preds = %for.body
 ; O3-LABEL: @nopragma(
 ; O3: store <4 x i32>
 ; O3: ret i32
+; O3DEFAULT-LABEL: @nopragma(
+; O3DEFAULT: store <4 x i32>
+; O3DEFAULT: ret i32
 ; O3DIS-LABEL: @nopragma(
 ; O3DIS-NOT: store <4 x i32>
 ; O3DIS: ret i32
@@ -128,6 +135,9 @@ for.end:                                          ; preds = %for.body
 ; O3-LABEL: @disabled(
 ; O3-NOT: store <4 x i32>
 ; O3: ret i32
+; O3DEFAULT-LABEL: @disabled(
+; O3DEFAULT: store <4 x i32>
+; O3DEFAULT: ret i32
 ; O3DIS-LABEL: @disabled(
 ; O3DIS-NOT: store <4 x i32>
 ; O3DIS: ret i32
diff --git a/test/Transforms/LoopVectorize/X86/scatter_crash.ll b/test/Transforms/LoopVectorize/X86/scatter_crash.ll
index ec67e632efbdcafbfcf98c94762e152d8f553878..bda4b2454ee2d1418fadbfcf898132528fc6819f 100755
--- a/test/Transforms/LoopVectorize/X86/scatter_crash.ll
+++ b/test/Transforms/LoopVectorize/X86/scatter_crash.ll
@@ -16,97 +16,23 @@ target triple = "x86_64-apple-macosx10.11.0"
 define void @_Z3fn1v() #0 {
 ; CHECK-LABEL: @_Z3fn1v(
 ; CHECK:       vector.body:
-; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX:%.*]].next, %vector.body ]
-; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ 
-; CHECK-NEXT:    [[VEC_IND3:%.*]] = phi <16 x i64> [ 
-; CHECK-NEXT:    [[SHL:%.*]] = shl i64 %index, 1
-; CHECK-NEXT:    %offset.idx = add i64 [[SHL]], 8
-; CHECK-NEXT:    [[IND00:%.*]] = add i64 %offset.idx, 0
-; CHECK-NEXT:    [[IND02:%.*]] = add i64 %offset.idx, 2
-; CHECK-NEXT:    [[IND04:%.*]] = add i64 %offset.idx, 4
-; CHECK-NEXT:    [[IND06:%.*]] = add i64 %offset.idx, 6
-; CHECK-NEXT:    [[IND08:%.*]] = add i64 %offset.idx, 8
-; CHECK-NEXT:    [[IND10:%.*]] = add i64 %offset.idx, 10
-; CHECK-NEXT:    [[IND12:%.*]] = add i64 %offset.idx, 12
-; CHECK-NEXT:    [[IND14:%.*]] = add i64 %offset.idx, 14
-; CHECK-NEXT:    [[IND16:%.*]] = add i64 %offset.idx, 16
-; CHECK-NEXT:    [[IND18:%.*]] = add i64 %offset.idx, 18
-; CHECK-NEXT:    [[IND20:%.*]] = add i64 %offset.idx, 20
-; CHECK-NEXT:    [[IND22:%.*]] = add i64 %offset.idx, 22
-; CHECK-NEXT:    [[IND24:%.*]] = add i64 %offset.idx, 24
-; CHECK-NEXT:    [[IND26:%.*]] = add i64 %offset.idx, 26
-; CHECK-NEXT:    [[IND28:%.*]] = add i64 %offset.idx, 28
-; CHECK-NEXT:    [[IND30:%.*]] = add i64 %offset.idx, 30
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <16 x i64> [ <i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30, i64 32, i64 34, i64 36, i64 38>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND3:%.*]] = phi <16 x i64> [ <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14, i64 16, i64 18, i64 20, i64 22, i64 24, i64 26, i64 28, i64 30>, %vector.ph ], [ [[VEC_IND_NEXT4:%.*]], %vector.body ]
 ; CHECK-NEXT:    [[TMP10:%.*]] = sub nsw <16 x i64> <i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8, i64 8>, [[VEC_IND]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND00]]
-; CHECK-NEXT:    [[TMP15:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND02]]
-; CHECK-NEXT:    [[TMP18:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND04]]
-; CHECK-NEXT:    [[TMP21:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND06]]
-; CHECK-NEXT:    [[TMP24:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND08]]
-; CHECK-NEXT:    [[TMP27:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND10]]
-; CHECK-NEXT:    [[TMP30:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND12]]
-; CHECK-NEXT:    [[TMP33:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND14]]
-; CHECK-NEXT:    [[TMP36:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND16]]
-; CHECK-NEXT:    [[TMP39:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND18]]
-; CHECK-NEXT:    [[TMP42:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND20]]
-; CHECK-NEXT:    [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND22]]
-; CHECK-NEXT:    [[TMP48:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND24]]
-; CHECK-NEXT:    [[TMP51:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND26]]
-; CHECK-NEXT:    [[TMP54:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND28]]
-; CHECK-NEXT:    [[TMP57:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, i64 [[IND30]]
-; CHECK-NEXT:    [[TMP13:%.*]] = insertelement <16 x [10 x i32]*> undef, [10 x i32]* [[TMP12]], i32 0
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <16 x [10 x i32]*> [[TMP13]], [10 x i32]* [[TMP15]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <16 x [10 x i32]*> [[TMP16]], [10 x i32]* [[TMP18]], i32 2
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <16 x [10 x i32]*> [[TMP19]], [10 x i32]* [[TMP21]], i32 3
-; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <16 x [10 x i32]*> [[TMP22]], [10 x i32]* [[TMP24]], i32 4
-; CHECK-NEXT:    [[TMP28:%.*]] = insertelement <16 x [10 x i32]*> [[TMP25]], [10 x i32]* [[TMP27]], i32 5
-; CHECK-NEXT:    [[TMP31:%.*]] = insertelement <16 x [10 x i32]*> [[TMP28]], [10 x i32]* [[TMP30]], i32 6
-; CHECK-NEXT:    [[TMP34:%.*]] = insertelement <16 x [10 x i32]*> [[TMP31]], [10 x i32]* [[TMP33]], i32 7
-; CHECK-NEXT:    [[TMP37:%.*]] = insertelement <16 x [10 x i32]*> [[TMP34]], [10 x i32]* [[TMP36]], i32 8
-; CHECK-NEXT:    [[TMP40:%.*]] = insertelement <16 x [10 x i32]*> [[TMP37]], [10 x i32]* [[TMP39]], i32 9
-; CHECK-NEXT:    [[TMP43:%.*]] = insertelement <16 x [10 x i32]*> [[TMP40]], [10 x i32]* [[TMP42]], i32 10
-; CHECK-NEXT:    [[TMP46:%.*]] = insertelement <16 x [10 x i32]*> [[TMP43]], [10 x i32]* [[TMP45]], i32 11
-; CHECK-NEXT:    [[TMP49:%.*]] = insertelement <16 x [10 x i32]*> [[TMP46]], [10 x i32]* [[TMP48]], i32 12
-; CHECK-NEXT:    [[TMP52:%.*]] = insertelement <16 x [10 x i32]*> [[TMP49]], [10 x i32]* [[TMP51]], i32 13
-; CHECK-NEXT:    [[TMP55:%.*]] = insertelement <16 x [10 x i32]*> [[TMP52]], [10 x i32]* [[TMP54]], i32 14
-; CHECK-NEXT:    [[TMP58:%.*]] = insertelement <16 x [10 x i32]*> [[TMP55]], [10 x i32]* [[TMP57]], i32 15
-; CHECK-NEXT:    [[TMP59:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]]
-; CHECK-NEXT:    [[TMP61:%.*]] = extractelement <16 x i64> [[TMP59]], i32 0
-; CHECK-NEXT:    [[TMP62:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP12]], i64 [[TMP61]], i64 0
-; CHECK-NEXT:    [[TMP65:%.*]] = extractelement <16 x i64> [[TMP59]], i32 1
-; CHECK-NEXT:    [[TMP66:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP15]], i64 [[TMP65]], i64 0
-; CHECK-NEXT:    [[TMP69:%.*]] = extractelement <16 x i64> [[TMP59]], i32 2
-; CHECK-NEXT:    [[TMP70:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP18]], i64 [[TMP69]], i64 0
-; CHECK-NEXT:    [[TMP73:%.*]] = extractelement <16 x i64> [[TMP59]], i32 3
-; CHECK-NEXT:    [[TMP74:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP21]], i64 [[TMP73]], i64 0
-; CHECK-NEXT:    [[TMP77:%.*]] = extractelement <16 x i64> [[TMP59]], i32 4
-; CHECK-NEXT:    [[TMP78:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP24]], i64 [[TMP77]], i64 0
-; CHECK-NEXT:    [[TMP81:%.*]] = extractelement <16 x i64> [[TMP59]], i32 5
-; CHECK-NEXT:    [[TMP82:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP27]], i64 [[TMP81]], i64 0
-; CHECK-NEXT:    [[TMP85:%.*]] = extractelement <16 x i64> [[TMP59]], i32 6
-; CHECK-NEXT:    [[TMP86:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP30]], i64 [[TMP85]], i64 0
-; CHECK-NEXT:    [[TMP89:%.*]] = extractelement <16 x i64> [[TMP59]], i32 7
-; CHECK-NEXT:    [[TMP90:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP33]], i64 [[TMP89]], i64 0
-; CHECK-NEXT:    [[TMP93:%.*]] = extractelement <16 x i64> [[TMP59]], i32 8
-; CHECK-NEXT:    [[TMP94:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP36]], i64 [[TMP93]], i64 0
-; CHECK-NEXT:    [[TMP97:%.*]] = extractelement <16 x i64> [[TMP59]], i32 9
-; CHECK-NEXT:    [[TMP98:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP39]], i64 [[TMP97]], i64 0
-; CHECK-NEXT:    [[TMP101:%.*]] = extractelement <16 x i64> [[TMP59]], i32 10
-; CHECK-NEXT:    [[TMP102:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP42]], i64 [[TMP101]], i64 0
-; CHECK-NEXT:    [[TMP105:%.*]] = extractelement <16 x i64> [[TMP59]], i32 11
-; CHECK-NEXT:    [[TMP106:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP45]], i64 [[TMP105]], i64 0
-; CHECK-NEXT:    [[TMP109:%.*]] = extractelement <16 x i64> [[TMP59]], i32 12
-; CHECK-NEXT:    [[TMP110:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP48]], i64 [[TMP109]], i64 0
-; CHECK-NEXT:    [[TMP113:%.*]] = extractelement <16 x i64> [[TMP59]], i32 13
-; CHECK-NEXT:    [[TMP114:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP51]], i64 [[TMP113]], i64 0
-; CHECK-NEXT:    [[TMP117:%.*]] = extractelement <16 x i64> [[TMP59]], i32 14
-; CHECK-NEXT:    [[TMP118:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP54]], i64 [[TMP117]], i64 0
-; CHECK-NEXT:    [[TMP121:%.*]] = extractelement <16 x i64> [[TMP59]], i32 15
-; CHECK-NEXT:    [[TMP122:%.*]] = getelementptr inbounds [10 x i32], [10 x i32]* [[TMP57]], i64 [[TMP121]], i64 0
-; CHECK-NEXT:    [[VECTORGEP:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP58]], <16 x i64> [[TMP59]], i64 0
-; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[VECTORGEP]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
-; CHECK:         [[STEP_ADD:%.*]] = add <16 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
-; CHECK:         [[STEP_ADD4:%.*]] = add <16 x i64> [[VEC_IND3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[TMP11:%.*]] = getelementptr inbounds [10 x [10 x i32]], [10 x [10 x i32]]* @d, i64 0, <16 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP12:%.*]] = add nsw <16 x i64> [[TMP10]], [[VEC_IND3]]
+; CHECK-NEXT:    [[TMP13:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP12]], i64 0
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP13]], i32 16, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[TMP14:%.*]] = or <16 x i64> [[VEC_IND3]], <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+; CHECK-NEXT:    [[TMP15:%.*]] = add nsw <16 x i64> [[TMP10]], [[TMP14]]
+; CHECK-NEXT:    [[TMP16:%.*]] = getelementptr inbounds [10 x i32], <16 x [10 x i32]*> [[TMP11]], <16 x i64> [[TMP15]], i64 0
+; CHECK-NEXT:    call void @llvm.masked.scatter.v16i32(<16 x i32> <i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8>, <16 x i32*> [[TMP16]], i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>)
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 16
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <16 x i64> [[VEC_IND]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK-NEXT:    [[VEC_IND_NEXT4]] = add <16 x i64> [[VEC_IND3]], <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
 entry:
   %0 = load i32, i32* @c, align 4
   %cmp34 = icmp sgt i32 %0, 8
diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
index f28e6be235299beef80188162f77a8de63b53e1a..b2933c4b56f204a535e423e9607ff5ac46586f50 100644
--- a/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
+++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks-missed.ll
@@ -1,4 +1,6 @@
 ; RUN: opt < %s -loop-vectorize -S -pass-remarks-missed='loop-vectorize' -pass-remarks-analysis='loop-vectorize' 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -o /dev/null -pass-remarks-output=%t.yaml
+; RUN: cat %t.yaml | FileCheck -check-prefix=YAML %s
 
 ; C/C++ code for tests
 ; void test(int *A, int Length) {
@@ -42,6 +44,61 @@
 ; CHECK-NOT: x i32>
 ; CHECK: ret
 
+; YAML:       --- !Analysis
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            CantComputeNumberOfIterations
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 4, Column: 5 }
+; YAML-NEXT: Function:        _Z4testPii
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: '
+; YAML-NEXT:   - String:          could not determine number of loop iterations
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Missed
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            MissedDetails
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 4, Column: 5 }
+; YAML-NEXT: Function:        _Z4testPii
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          loop not vectorized
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            AllDisabled
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 13, Column: 5 }
+; YAML-NEXT: Function:        _Z13test_disabledPii
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: vectorization and interleaving are explicitly disabled, or vectorize width and interleave count are both set to 1'
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Analysis
+; YAML-NEXT: Pass:            ''
+; YAML-NEXT: Name:            CantIdentifyArrayBounds
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 19, Column: 5 }
+; YAML-NEXT: Function:        _Z17test_array_boundsPiS_i
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: '
+; YAML-NEXT:   - String:          cannot identify array bounds
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Missed
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            MissedDetails
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 19, Column: 5 }
+; YAML-NEXT: Function:        _Z17test_array_boundsPiS_i
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          loop not vectorized
+; YAML-NEXT:   - String:          ' (Force='
+; YAML-NEXT:   - Force:           'true'
+; YAML-NEXT:   - String:          ')'
+; YAML-NEXT: ...
+; YAML-NEXT: --- !Failure
+; YAML-NEXT: Pass:            loop-vectorize
+; YAML-NEXT: Name:            FailedRequestedVectorization
+; YAML-NEXT: DebugLoc:        { File: source.cpp, Line: 19, Column: 5 }
+; YAML-NEXT: Function:        _Z17test_array_boundsPiS_i
+; YAML-NEXT: Args:
+; YAML-NEXT:   - String:          'loop not vectorized: '
+; YAML-NEXT:   - String:          failed explicitly specified loop vectorization
+; YAML-NEXT: ...
+
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
 ; Function Attrs: nounwind optsize ssp uwtable
diff --git a/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll b/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll
index fc9f97328fb7c48850ac52c49eee053098c32929..91466e65078fbd8b09726fe6fc28918986fb2218 100644
--- a/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll
+++ b/test/Transforms/LoopVectorize/X86/vectorization-remarks-profitable.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -loop-vectorize -pass-remarks-analysis='loop-vectorize' -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s
+; RUN: opt < %s -loop-vectorize -pass-remarks-missed='loop-vectorize' -mtriple=x86_64-unknown-linux -S 2>&1 | FileCheck %s
 
 ; Verify analysis remarks are generated when interleaving is not beneficial.
 ; CHECK: remark: vectorization-remarks-profitable.c:5:17: the cost-model indicates that vectorization is not beneficial
diff --git a/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll b/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
index 88b2aa36b08c40722e86644351752906b6240ab0..125829090c3f2dd6ee4d5d6c5482d32fd2bc359d 100644
--- a/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
+++ b/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll
@@ -438,3 +438,53 @@ for.end:
   %tmp5 = phi i32 [ %tmp2, %for.body ]
   ret i32 %tmp5
 }
+
+; INTER-LABEL: bitcast_pointer_operand
+;
+; Check that a pointer operand having a user other than a memory access is
+; recognized as uniform after vectorization. In this test case, %tmp1 is a
+; bitcast that is used by a load and a getelementptr instruction (%tmp2). Once
+; %tmp2 is marked uniform, %tmp1 should be marked uniform as well.
+;
+; INTER:       LV: Found uniform instruction: %cond = icmp slt i64 %i.next, %n
+; INTER-NEXT:  LV: Found uniform instruction: %tmp2 = getelementptr inbounds i8, i8* %tmp1, i64 3
+; INTER-NEXT:  LV: Found uniform instruction: %tmp6 = getelementptr inbounds i8, i8* %B, i64 %i
+; INTER-NEXT:  LV: Found uniform instruction: %tmp1 = bitcast i64* %tmp0 to i8*
+; INTER-NEXT:  LV: Found uniform instruction: %tmp0 = getelementptr inbounds i64, i64* %A, i64 %i
+; INTER-NEXT:  LV: Found uniform instruction: %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+; INTER-NEXT:  LV: Found uniform instruction: %i.next = add nuw nsw i64 %i, 1
+; INTER:       vector.body:
+; INTER-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; INTER-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i64, i64* %A, i64 [[INDEX]]
+; INTER-NEXT:    [[TMP5:%.*]] = bitcast i64* [[TMP4]] to <32 x i8>*
+; INTER-NEXT:    [[WIDE_VEC:%.*]] = load <32 x i8>, <32 x i8>* [[TMP5]], align 1
+; INTER-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
+; INTER-NEXT:    [[STRIDED_VEC5:%.*]] = shufflevector <32 x i8> [[WIDE_VEC]], <32 x i8> undef, <4 x i32> <i32 3, i32 11, i32 19, i32 27>
+; INTER-NEXT:    [[TMP6:%.*]] = xor <4 x i8> [[STRIDED_VEC5]], [[STRIDED_VEC]]
+; INTER-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i8, i8* %B, i64 [[INDEX]]
+; INTER-NEXT:    [[TMP8:%.*]] = bitcast i8* [[TMP7]] to <4 x i8>*
+; INTER-NEXT:    store <4 x i8> [[TMP6]], <4 x i8>* [[TMP8]], align 1
+; INTER-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; INTER:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @bitcast_pointer_operand(i64* %A, i8* %B, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %for.body ]
+  %tmp0 = getelementptr inbounds i64, i64* %A, i64 %i
+  %tmp1 = bitcast i64* %tmp0 to i8*
+  %tmp2 = getelementptr inbounds i8, i8* %tmp1, i64 3
+  %tmp3 = load i8, i8* %tmp2, align 1
+  %tmp4 = load i8, i8* %tmp1, align 1
+  %tmp5 = xor i8 %tmp3, %tmp4
+  %tmp6 = getelementptr inbounds i8, i8* %B, i64 %i
+  store i8 %tmp5, i8* %tmp6
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/discriminator.ll b/test/Transforms/LoopVectorize/discriminator.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b7d34582dbd8a8ad7051f2503ccef7266cba698b
--- /dev/null
+++ b/test/Transforms/LoopVectorize/discriminator.ll
@@ -0,0 +1,70 @@
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 < %s | FileCheck --check-prefix=LOOPVEC_4_1 %s
+; RUN: opt -S -loop-vectorize -force-vector-width=2 -force-vector-interleave=3 < %s | FileCheck --check-prefix=LOOPVEC_2_3 %s
+; RUN: opt -S -loop-unroll  -unroll-count=5 < %s | FileCheck --check-prefix=LOOPUNROLL_5 %s
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=4 -loop-unroll -unroll-count=2 < %s | FileCheck --check-prefix=LOOPVEC_UNROLL %s
+
+; Test if vectorization/unroll factor is recorded in discriminator.
+;
+; Original source code:
+;  1 int *a;
+;  2 int *b;
+;  3 
+;  4 void foo() {
+;  5   for (int i = 0; i < 4096; i++)
+;  6     a[i] += b[i];
+;  7 }
+
+@a = local_unnamed_addr global i32* null, align 8
+@b = local_unnamed_addr global i32* null, align 8
+
+define void @_Z3foov() local_unnamed_addr #0 !dbg !6 {
+  %1 = load i32*, i32** @b, align 8, !dbg !8, !tbaa !9
+  %2 = load i32*, i32** @a, align 8, !dbg !13, !tbaa !9
+  br label %3, !dbg !14
+
+; <label>:3:                                      ; preds = %3, %0
+  %indvars.iv = phi i64 [ 0, %0 ], [ %indvars.iv.next, %3 ]
+  %4 = getelementptr inbounds i32, i32* %1, i64 %indvars.iv, !dbg !8
+  %5 = load i32, i32* %4, align 4, !dbg !8, !tbaa !15
+  %6 = getelementptr inbounds i32, i32* %2, i64 %indvars.iv, !dbg !13
+  %7 = load i32, i32* %6, align 4, !dbg !17, !tbaa !15
+  %8 = add nsw i32 %7, %5, !dbg !17
+  store i32 %8, i32* %6, align 4, !dbg !17, !tbaa !15
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1, !dbg !18
+  %exitcond = icmp eq i64 %indvars.iv.next, 4096, !dbg !19
+  br i1 %exitcond, label %9, label %3, !dbg !14, !llvm.loop !20
+
+; <label>:9:                                      ; preds = %3
+  ret void, !dbg !21
+}
+
+;LOOPVEC_4_1: discriminator: 17
+;LOOPVEC_2_3: discriminator: 25
+;LOOPUNROLL_5: discriminator: 21
+; When unrolling after loop vectorize, both vec_body and remainder loop
+; are unrolled.
+;LOOPVEC_UNROLL: discriminator: 385
+;LOOPVEC_UNROLL: discriminator: 9
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, debugInfoForProfiling: true)
+!1 = !DIFile(filename: "a.cc", directory: "/")
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 4, unit: !0)
+!8 = !DILocation(line: 6, column: 13, scope: !6)
+!9 = !{!10, !10, i64 0}
+!10 = !{!"any pointer", !11, i64 0}
+!11 = !{!"omnipotent char", !12, i64 0}
+!12 = !{!"Simple C++ TBAA"}
+!13 = !DILocation(line: 6, column: 5, scope: !6)
+!14 = !DILocation(line: 5, column: 3, scope: !6)
+!15 = !{!16, !16, i64 0}
+!16 = !{!"int", !11, i64 0}
+!17 = !DILocation(line: 6, column: 10, scope: !6)
+!18 = !DILocation(line: 5, column: 30, scope: !6)
+!19 = !DILocation(line: 5, column: 21, scope: !6)
+!20 = distinct !{!20, !14}
+!21 = !DILocation(line: 7, column: 1, scope: !6)
diff --git a/test/Transforms/LoopVectorize/first-order-recurrence.ll b/test/Transforms/LoopVectorize/first-order-recurrence.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9155820216b24764bcea871f002f9b7ba834cb30
--- /dev/null
+++ b/test/Transforms/LoopVectorize/first-order-recurrence.ll
@@ -0,0 +1,373 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -dce -instcombine -S | FileCheck %s
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -dce -instcombine -S | FileCheck %s --check-prefix=UNROLL
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=2 -S | FileCheck %s --check-prefix=UNROLL-NO-IC
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; void recurrence_1(int *a, int *b, int n) {
+;   for(int i = 0; i < n; i++)
+;     b[i] =  a[i] + a[i - 1];
+; }
+;
+; CHECK-LABEL: @recurrence_1(
+; CHECK:       vector.ph:
+; CHECK:         %vector.recur.init = insertelement <4 x i32> undef, i32 %pre_load, i32 3
+; CHECK:       vector.body:
+; CHECK:         %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK:         [[L1]] = load <4 x i32>
+; CHECK:         {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK:       middle.block:
+; CHECK:         %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
+; CHECK:       scalar.ph:
+; CHECK:         %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %pre_load, %vector.memcheck ], [ %pre_load, %min.iters.checked ], [ %pre_load, %for.preheader ]
+; CHECK:       scalar.body:
+; CHECK:         %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
+;
+; UNROLL-LABEL: @recurrence_1(
+; UNROLL:       vector.body:
+; UNROLL:         %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
+; UNROLL:         [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32>
+; UNROLL:         [[L2]] = load <4 x i32>
+; UNROLL:         {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL:         {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL:       middle.block:
+; UNROLL:         %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3
+;
+define void @recurrence_1(i32* nocapture readonly %a, i32* nocapture %b, i32 %n) {
+entry:
+  br label %for.preheader
+
+for.preheader:
+  %arrayidx.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 0
+  %pre_load = load i32, i32* %arrayidx.phi.trans.insert
+  br label %scalar.body
+
+scalar.body:
+  %0 = phi i32 [ %pre_load, %for.preheader ], [ %1, %scalar.body ]
+  %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ]
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %arrayidx32 = getelementptr inbounds i32, i32* %a, i64 %indvars.iv.next
+  %1 = load i32, i32* %arrayidx32
+  %arrayidx34 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  %add35 = add i32 %1, %0
+  store i32 %add35, i32* %arrayidx34
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.exit, label %scalar.body
+
+for.exit:
+  ret void
+}
+
+; int recurrence_2(int *a, int n) {
+;   int minmax;
+;   for (int i = 0; i < n; ++i)
+;     minmax = min(minmax, max(a[i] - a[i-1], 0));
+;   return minmax;
+; }
+;
+; CHECK-LABEL: @recurrence_2(
+; CHECK:       vector.ph:
+; CHECK:         %vector.recur.init = insertelement <4 x i32> undef, i32 %.pre, i32 3
+; CHECK:       vector.body:
+; CHECK:         %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK:         [[L1]] = load <4 x i32>
+; CHECK:         {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK:       middle.block:
+; CHECK:         %vector.recur.extract = extractelement <4 x i32> [[L1]], i32 3
+; CHECK:       scalar.ph:
+; CHECK:         %scalar.recur.init = phi i32 [ %vector.recur.extract, %middle.block ], [ %.pre, %min.iters.checked ], [ %.pre, %for.preheader ]
+; CHECK:       scalar.body:
+; CHECK:         %scalar.recur = phi i32 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
+;
+; UNROLL-LABEL: @recurrence_2(
+; UNROLL:       vector.body:
+; UNROLL:         %vector.recur = phi <4 x i32> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
+; UNROLL:         [[L1:%[a-zA-Z0-9.]+]] = load <4 x i32>
+; UNROLL:         [[L2]] = load <4 x i32>
+; UNROLL:         {{.*}} = shufflevector <4 x i32> %vector.recur, <4 x i32> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL:         {{.*}} = shufflevector <4 x i32> [[L1]], <4 x i32> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL:       middle.block:
+; UNROLL:         %vector.recur.extract = extractelement <4 x i32> [[L2]], i32 3
+;
+define i32 @recurrence_2(i32* nocapture readonly %a, i32 %n) {
+entry:
+  %cmp27 = icmp sgt i32 %n, 0
+  br i1 %cmp27, label %for.preheader, label %for.cond.cleanup
+
+for.preheader:
+  %arrayidx2.phi.trans.insert = getelementptr inbounds i32, i32* %a, i64 -1
+  %.pre = load i32, i32* %arrayidx2.phi.trans.insert, align 4
+  br label %scalar.body
+
+for.cond.cleanup.loopexit:
+  %minmax.0.cond.lcssa = phi i32 [ %minmax.0.cond, %scalar.body ]
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  %minmax.0.lcssa = phi i32 [ undef, %entry ], [ %minmax.0.cond.lcssa, %for.cond.cleanup.loopexit ]
+  ret i32 %minmax.0.lcssa
+
+scalar.body:
+  %0 = phi i32 [ %.pre, %for.preheader ], [ %1, %scalar.body ]
+  %indvars.iv = phi i64 [ 0, %for.preheader ], [ %indvars.iv.next, %scalar.body ]
+  %minmax.028 = phi i32 [ undef, %for.preheader ], [ %minmax.0.cond, %scalar.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %1 = load i32, i32* %arrayidx, align 4
+  %sub3 = sub nsw i32 %1, %0
+  %cmp4 = icmp sgt i32 %sub3, 0
+  %cond = select i1 %cmp4, i32 %sub3, i32 0
+  %cmp5 = icmp slt i32 %minmax.028, %cond
+  %minmax.0.cond = select i1 %cmp5, i32 %minmax.028, i32 %cond
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %scalar.body
+}
+
+; void recurrence_3(short *a, double *b, int n, float f, short p) {
+;   b[0] = (double)a[0] - f * (double)p;
+;   for (int i = 1; i < n; i++)
+;     b[i] = (double)a[i] - f * (double)a[i - 1];
+; }
+;
+; CHECK-LABEL: @recurrence_3(
+; CHECK:       vector.ph:
+; CHECK:         %vector.recur.init = insertelement <4 x i16> undef, i16 %0, i32 3
+; CHECK:       vector.body:
+; CHECK:         %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L1:%[a-zA-Z0-9.]+]], %vector.body ]
+; CHECK:         [[L1]] = load <4 x i16>
+; CHECK:         {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; CHECK:       middle.block:
+; CHECK:         %vector.recur.extract = extractelement <4 x i16> [[L1]], i32 3
+; CHECK:       scalar.ph:
+; CHECK:         %scalar.recur.init = phi i16 [ %vector.recur.extract, %middle.block ], [ %0, %vector.memcheck ], [ %0, %min.iters.checked ], [ %0, %for.preheader ]
+; CHECK:       scalar.body:
+; CHECK:         %scalar.recur = phi i16 [ %scalar.recur.init, %scalar.ph ], [ {{.*}}, %scalar.body ]
+;
+; UNROLL-LABEL: @recurrence_3(
+; UNROLL:       vector.body:
+; UNROLL:         %vector.recur = phi <4 x i16> [ %vector.recur.init, %vector.ph ], [ [[L2:%[a-zA-Z0-9.]+]], %vector.body ]
+; UNROLL:         [[L1:%[a-zA-Z0-9.]+]] = load <4 x i16>
+; UNROLL:         [[L2]] = load <4 x i16>
+; UNROLL:         {{.*}} = shufflevector <4 x i16> %vector.recur, <4 x i16> [[L1]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL:         {{.*}} = shufflevector <4 x i16> [[L1]], <4 x i16> [[L2]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL:       middle.block:
+; UNROLL:         %vector.recur.extract = extractelement <4 x i16> [[L2]], i32 3
+;
+define void @recurrence_3(i16* nocapture readonly %a, double* nocapture %b, i32 %n, float %f, i16 %p) {
+entry:
+  %0 = load i16, i16* %a, align 2
+  %conv = sitofp i16 %0 to double
+  %conv1 = fpext float %f to double
+  %conv2 = sitofp i16 %p to double
+  %mul = fmul fast double %conv2, %conv1
+  %sub = fsub fast double %conv, %mul
+  store double %sub, double* %b, align 8
+  %cmp25 = icmp sgt i32 %n, 1
+  br i1 %cmp25, label %for.preheader, label %for.end
+
+for.preheader:
+  br label %scalar.body
+
+scalar.body:
+  %1 = phi i16 [ %0, %for.preheader ], [ %2, %scalar.body ]
+  %advars.iv = phi i64 [ %advars.iv.next, %scalar.body ], [ 1, %for.preheader ]
+  %arrayidx5 = getelementptr inbounds i16, i16* %a, i64 %advars.iv
+  %2 = load i16, i16* %arrayidx5, align 2
+  %conv6 = sitofp i16 %2 to double
+  %conv11 = sitofp i16 %1 to double
+  %mul12 = fmul fast double %conv11, %conv1
+  %sub13 = fsub fast double %conv6, %mul12
+  %arrayidx15 = getelementptr inbounds double, double* %b, i64 %advars.iv
+  store double %sub13, double* %arrayidx15, align 8
+  %advars.iv.next = add nuw nsw i64 %advars.iv, 1
+  %lftr.wideiv = trunc i64 %advars.iv.next to i32
+  %exitcond = icmp eq i32 %lftr.wideiv, %n
+  br i1 %exitcond, label %for.end.loopexit, label %scalar.body
+
+for.end.loopexit:
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+; void PR26734(short *a, int *b, int *c, int d, short *e) {
+;   for (; d != 21; d++) {
+;     *b &= *c;
+;     *e = *a - 6;
+;     *c = *e;
+;   }
+; }
+;
+; CHECK-LABEL: @PR26734(
+; CHECK-NOT:   vector.ph:
+; CHECK:       }
+;
+define void @PR26734(i16* %a, i32* %b, i32* %c, i32 %d, i16* %e) {
+entry:
+  %cmp4 = icmp eq i32 %d, 21
+  br i1 %cmp4, label %entry.for.end_crit_edge, label %for.body.lr.ph
+
+entry.for.end_crit_edge:
+  %.pre = load i32, i32* %b, align 4
+  br label %for.end
+
+for.body.lr.ph:
+  %0 = load i16, i16* %a, align 2
+  %sub = add i16 %0, -6
+  %conv2 = sext i16 %sub to i32
+  %c.promoted = load i32, i32* %c, align 4
+  %b.promoted = load i32, i32* %b, align 4
+  br label %for.body
+
+for.body:
+  %inc7 = phi i32 [ %d, %for.body.lr.ph ], [ %inc, %for.body ]
+  %and6 = phi i32 [ %b.promoted, %for.body.lr.ph ], [ %and, %for.body ]
+  %conv25 = phi i32 [ %c.promoted, %for.body.lr.ph ], [ %conv2, %for.body ]
+  %and = and i32 %and6, %conv25
+  %inc = add nsw i32 %inc7, 1
+  %cmp = icmp eq i32 %inc, 21
+  br i1 %cmp, label %for.cond.for.end_crit_edge, label %for.body
+
+for.cond.for.end_crit_edge:
+  %and.lcssa = phi i32 [ %and, %for.body ]
+  store i32 %conv2, i32* %c, align 4
+  store i32 %and.lcssa, i32* %b, align 4
+  store i16 %sub, i16* %e, align 2
+  br label %for.end
+
+for.end:
+  ret void
+}
+
+; int PR27246() {
+;   unsigned int e, n;
+;   for (int i = 1; i < 49; ++i) {
+;     for (int k = i; k > 1; --k)
+;       e = k;
+;     n = e;
+;   }
+;   return n;
+; }
+;
+; CHECK-LABEL: @PR27246(
+; CHECK-NOT:   vector.ph:
+; CHECK:       }
+;
+define i32 @PR27246() {
+entry:
+  br label %for.cond1.preheader
+
+for.cond1.preheader:
+  %i.016 = phi i32 [ 1, %entry ], [ %inc, %for.cond.cleanup3 ]
+  %e.015 = phi i32 [ undef, %entry ], [ %e.1.lcssa, %for.cond.cleanup3 ]
+  br label %for.cond1
+
+for.cond.cleanup:
+  %e.1.lcssa.lcssa = phi i32 [ %e.1.lcssa, %for.cond.cleanup3 ]
+  ret i32 %e.1.lcssa.lcssa
+
+for.cond1:
+  %e.1 = phi i32 [ %k.0, %for.cond1 ], [ %e.015, %for.cond1.preheader ]
+  %k.0 = phi i32 [ %dec, %for.cond1 ], [ %i.016, %for.cond1.preheader ]
+  %cmp2 = icmp sgt i32 %k.0, 1
+  %dec = add nsw i32 %k.0, -1
+  br i1 %cmp2, label %for.cond1, label %for.cond.cleanup3
+
+for.cond.cleanup3:
+  %e.1.lcssa = phi i32 [ %e.1, %for.cond1 ]
+  %inc = add nuw nsw i32 %i.016, 1
+  %exitcond = icmp eq i32 %inc, 49
+  br i1 %exitcond, label %for.cond.cleanup, label %for.cond1.preheader
+}
+
+; UNROLL-NO-IC-LABEL: @PR30183(
+; UNROLL-NO-IC:       vector.ph:
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR_INIT:%.*]] = insertelement <4 x i32> undef, i32 [[PRE_LOAD:%.*]], i32 3
+; UNROLL-NO-IC-NEXT:    br label %vector.body
+; UNROLL-NO-IC:       vector.body:
+; UNROLL-NO-IC-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; UNROLL-NO-IC-NEXT:    [[VECTOR_RECUR:%.*]] = phi <4 x i32> [ [[VECTOR_RECUR_INIT]], %vector.ph ], [ [[TMP42:%.*]], %vector.body ]
+; UNROLL-NO-IC:         [[TMP27:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT:    [[TMP28:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT:    [[TMP29:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT:    [[TMP30:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT:    [[TMP31:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT:    [[TMP32:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT:    [[TMP33:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT:    [[TMP34:%.*]] = load i32, i32* {{.*}}
+; UNROLL-NO-IC-NEXT:    [[TMP35:%.*]] = insertelement <4 x i32> undef, i32 [[TMP27]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP36:%.*]] = insertelement <4 x i32> [[TMP35]], i32 [[TMP28]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP37:%.*]] = insertelement <4 x i32> [[TMP36]], i32 [[TMP29]], i32 2
+; UNROLL-NO-IC-NEXT:    [[TMP38:%.*]] = insertelement <4 x i32> [[TMP37]], i32 [[TMP30]], i32 3
+; UNROLL-NO-IC-NEXT:    [[TMP39:%.*]] = insertelement <4 x i32> undef, i32 [[TMP31]], i32 0
+; UNROLL-NO-IC-NEXT:    [[TMP40:%.*]] = insertelement <4 x i32> [[TMP39]], i32 [[TMP32]], i32 1
+; UNROLL-NO-IC-NEXT:    [[TMP41:%.*]] = insertelement <4 x i32> [[TMP40]], i32 [[TMP33]], i32 2
+; UNROLL-NO-IC-NEXT:    [[TMP42]] = insertelement <4 x i32> [[TMP41]], i32 [[TMP34]], i32 3
+; UNROLL-NO-IC-NEXT:    [[TMP43:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[TMP38]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT:    [[TMP44:%.*]] = shufflevector <4 x i32> [[TMP38]], <4 x i32> [[TMP42]], <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; UNROLL-NO-IC:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @PR30183(i32 %pre_load, i32* %a, i32* %b, i64 %n) {
+entry:
+  br label %scalar.body
+
+scalar.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %scalar.body ]
+  %tmp0 = phi i32 [ %pre_load, %entry ], [ %tmp2, %scalar.body ]
+  %i.next = add nuw nsw i64 %i, 2
+  %tmp1 = getelementptr inbounds i32, i32* %a, i64 %i.next
+  %tmp2 = load i32, i32* %tmp1
+  %cond = icmp eq i64 %i.next,%n
+  br i1 %cond, label %for.end, label %scalar.body
+
+for.end:
+  ret void
+}
+
+; UNROLL-NO-IC-LABEL: @constant_folded_previous_value(
+; UNROLL-NO-IC:       vector.body:
+; UNROLL-NO-IC:         [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ <i64 undef, i64 undef, i64 undef, i64 0>, %vector.ph ], [ <i64 1, i64 1, i64 1, i64 1>, %vector.body ]
+; UNROLL-NO-IC-NEXT:    [[TMP0:%.*]] = shufflevector <4 x i64> [[VECTOR_RECUR]], <4 x i64> <i64 1, i64 1, i64 1, i64 1>, <4 x i32> <i32 3, i32 4, i32 5, i32 6>
+; UNROLL-NO-IC:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @constant_folded_previous_value() {
+entry:
+  br label %scalar.body
+
+scalar.body:
+  %i = phi i64 [ 0, %entry ], [ %i.next, %scalar.body ]
+  %tmp2 = phi i64 [ 0, %entry ], [ %tmp3, %scalar.body ]
+  %tmp3 = add i64 0, 1
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp eq i64 %i.next, undef
+  br i1 %cond, label %for.end, label %scalar.body
+
+for.end:
+  ret void
+}
+
+; FIXME: we can vectorize this first-order recurrence by generating two
+; extracts - one for the phi `val.phi` and the other for the phi update `addx`.
+; val.phi at end of loop is 94 + x.
+; CHECK-LABEL: extract_second_last_iteration
+; CHECK-NOT: vector.body
+define i32 @extract_second_last_iteration(i32* %cval, i32 %x)  {
+entry:
+  br label %for.body
+
+for.body:
+  %inc.phi = phi i32 [ 0, %entry ], [ %inc, %for.body ]
+  %val.phi = phi i32 [ 0, %entry ], [ %addx, %for.body ]
+  %inc = add i32 %inc.phi, 1
+  %bc = zext i32 %inc.phi to i64
+  %addx = add i32 %inc.phi, %x
+  %cmp = icmp eq i32 %inc.phi, 95
+  br i1 %cmp, label %for.end, label %for.body
+
+for.end:
+  ret i32 %val.phi
+}
diff --git a/test/Transforms/LoopVectorize/float-induction.ll b/test/Transforms/LoopVectorize/float-induction.ll
index 79bddf471c2632f5bb8752683d6937367b707c80..8eec6e262c1a1fe6156c1bda0bfa30770df0faf8 100644
--- a/test/Transforms/LoopVectorize/float-induction.ll
+++ b/test/Transforms/LoopVectorize/float-induction.ll
@@ -1,43 +1,7 @@
 ; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL1 %s
 ; RUN: opt < %s  -loop-vectorize -force-vector-interleave=2 -force-vector-width=4 -dce -instcombine -S | FileCheck --check-prefix VEC4_INTERL2 %s
 ; RUN: opt < %s  -loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -dce -instcombine -S | FileCheck --check-prefix VEC1_INTERL2 %s
-
-; VEC4_INTERL1-LABEL: @fp_iv_loop1(
-; VEC4_INTERL1:       %[[FP_INC:.*]] = load float, float* @fp_inc
-; VEC4_INTERL1: vector.body:
-; VEC4_INTERL1:       %[[FP_INDEX:.*]] = sitofp i64 {{.*}} to float
-; VEC4_INTERL1:       %[[VEC_INCR:.*]] = fmul fast float {{.*}}, %[[FP_INDEX]]
-; VEC4_INTERL1:       %[[FP_OFFSET_IDX:.*]] = fsub fast float %init, %[[VEC_INCR]]
-; VEC4_INTERL1:       %[[BRCT_INSERT:.*]] = insertelement <4 x float> undef, float %[[FP_OFFSET_IDX]], i32 0
-; VEC4_INTERL1-NEXT:  %[[BRCT_SPLAT:.*]] = shufflevector <4 x float> %[[BRCT_INSERT]], <4 x float> undef, <4 x i32> zeroinitializer
-; VEC4_INTERL1:       %[[BRCT_INSERT:.*]] = insertelement {{.*}} %[[FP_INC]]
-; VEC4_INTERL1-NEXT:  %[[FP_INC_BCST:.*]] = shufflevector <4 x float> %[[BRCT_INSERT]], {{.*}} zeroinitializer
-; VEC4_INTERL1:       %[[VSTEP:.*]] = fmul fast <4 x float> %[[FP_INC_BCST]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; VEC4_INTERL1-NEXT:  %[[VEC_INDUCTION:.*]] = fsub fast <4 x float> %[[BRCT_SPLAT]], %[[VSTEP]]
-; VEC4_INTERL1:       store <4 x float> %[[VEC_INDUCTION]]
-
-; VEC4_INTERL2-LABEL: @fp_iv_loop1(
-; VEC4_INTERL2:       %[[FP_INC:.*]] = load float, float* @fp_inc
-; VEC4_INTERL2: vector.body:
-; VEC4_INTERL2:       %[[INDEX:.*]] = sitofp i64 {{.*}} to float
-; VEC4_INTERL2:       %[[VEC_INCR:.*]] = fmul fast float %{{.*}}, %[[INDEX]]
-; VEC4_INTERL2:       fsub fast float %init, %[[VEC_INCR]]
-; VEC4_INTERL2:       %[[VSTEP1:.*]] = fmul fast <4 x float> %{{.*}}, <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
-; VEC4_INTERL2-NEXT:  %[[VEC_INDUCTION1:.*]] = fsub fast <4 x float> {{.*}}, %[[VSTEP1]]
-; VEC4_INTERL2:       %[[VSTEP2:.*]] = fmul fast <4 x float> %{{.*}}, <float 4.000000e+00, float 5.000000e+00, float 6.000000e+00, float 7.000000e+00>
-; VEC4_INTERL2-NEXT:  %[[VEC_INDUCTION2:.*]] = fsub fast <4 x float> {{.*}}, %[[VSTEP2]]
-; VEC4_INTERL2:       store <4 x float> %[[VEC_INDUCTION1]]
-; VEC4_INTERL2:       store <4 x float> %[[VEC_INDUCTION2]]
-
-; VEC1_INTERL2-LABEL: @fp_iv_loop1(
-; VEC1_INTERL2:       %[[FP_INC:.*]] = load float, float* @fp_inc
-; VEC1_INTERL2: vector.body:
-; VEC1_INTERL2:         %[[INDEX:.*]] = sitofp i64 {{.*}} to float
-; VEC1_INTERL2:         %[[STEP:.*]] = fmul fast float %{{.*}}, %[[INDEX]]
-; VEC1_INTERL2:         %[[FP_OFFSET_IDX:.*]] = fsub fast float %init, %[[STEP]]
-; VEC1_INTERL2:         %[[SCALAR_INDUCTION2:.*]] = fsub fast float %[[FP_OFFSET_IDX]], %[[FP_INC]]
-; VEC1_INTERL2:         store float %[[FP_OFFSET_IDX]]
-; VEC1_INTERL2:         store float %[[SCALAR_INDUCTION2]]
+; RUN: opt < %s  -loop-vectorize -force-vector-interleave=1 -force-vector-width=2 -dce -simplifycfg -instcombine -S | FileCheck --check-prefix VEC2_INTERL1_PRED_STORE %s
 
 @fp_inc = common global float 0.000000e+00, align 4
 
@@ -49,6 +13,71 @@
 ;  }
 ;}
 
+; VEC4_INTERL1-LABEL: @fp_iv_loop1(
+; VEC4_INTERL1:       vector.ph:
+; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT2]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    [[TMP5:%.*]] = fmul fast <4 x float> [[DOTSPLAT3]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; VEC4_INTERL1-NEXT:    [[INDUCTION4:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP5]]
+; VEC4_INTERL1-NEXT:    [[TMP6:%.*]] = fmul fast float %fpinc, 4.000000e+00
+; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP6]], i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    br label %vector.body
+; VEC4_INTERL1:       vector.body:
+; VEC4_INTERL1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION4]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC4_INTERL1-NEXT:    [[TMP9:%.*]] = bitcast float* [[TMP8]] to <4 x float>*
+; VEC4_INTERL1-NEXT:    store <4 x float> [[VEC_IND]], <4 x float>* [[TMP9]], align 4
+; VEC4_INTERL1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; VEC4_INTERL1-NEXT:    [[VEC_IND_NEXT]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT6]]
+; VEC4_INTERL1:         br i1 {{.*}}, label %middle.block, label %vector.body
+
+; VEC4_INTERL2-LABEL: @fp_iv_loop1(
+; VEC4_INTERL2:       vector.ph:
+; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0
+; VEC4_INTERL2-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT3:%.*]] = insertelement <4 x float> undef, float %fpinc, i32 0
+; VEC4_INTERL2-NEXT:    [[DOTSPLAT4:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT3]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL2-NEXT:    [[TMP5:%.*]] = fmul fast <4 x float> [[DOTSPLAT4]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; VEC4_INTERL2-NEXT:    [[INDUCTION5:%.*]] = fsub fast <4 x float> [[DOTSPLAT]], [[TMP5]]
+; VEC4_INTERL2-NEXT:    [[TMP6:%.*]] = fmul fast float %fpinc, 4.000000e+00
+; VEC4_INTERL2-NEXT:    [[DOTSPLATINSERT6:%.*]] = insertelement <4 x float> undef, float [[TMP6]], i32 0
+; VEC4_INTERL2-NEXT:    [[DOTSPLAT7:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT6]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL2-NEXT:    br label %vector.body
+; VEC4_INTERL2:       vector.body:
+; VEC4_INTERL2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL2-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION5]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL2-NEXT:    [[STEP_ADD:%.*]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT7]]
+; VEC4_INTERL2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC4_INTERL2-NEXT:    [[TMP10:%.*]] = bitcast float* [[TMP9]] to <4 x float>*
+; VEC4_INTERL2-NEXT:    store <4 x float> [[VEC_IND]], <4 x float>* [[TMP10]], align 4
+; VEC4_INTERL2-NEXT:    [[TMP11:%.*]] = getelementptr float, float* [[TMP9]], i64 4
+; VEC4_INTERL2-NEXT:    [[TMP12:%.*]] = bitcast float* [[TMP11]] to <4 x float>*
+; VEC4_INTERL2-NEXT:    store <4 x float> [[STEP_ADD]], <4 x float>* [[TMP12]], align 4
+; VEC4_INTERL2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 8
+; VEC4_INTERL2-NEXT:    [[VEC_IND_NEXT]] = fsub fast <4 x float> [[STEP_ADD]], [[DOTSPLAT7]]
+; VEC4_INTERL2:         br i1 {{.*}}, label %middle.block, label %vector.body
+
+; VEC1_INTERL2-LABEL: @fp_iv_loop1(
+; VEC1_INTERL2:       vector.ph:
+; VEC1_INTERL2-NEXT:    br label %vector.body
+; VEC1_INTERL2:       vector.body:
+; VEC1_INTERL2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; VEC1_INTERL2-NEXT:    [[INDUCTION2:%.*]] = or i64 [[INDEX]], 1
+; VEC1_INTERL2-NEXT:    [[TMP6:%.*]] = sitofp i64 [[INDEX]] to float
+; VEC1_INTERL2-NEXT:    [[TMP7:%.*]] = fmul fast float %fpinc, [[TMP6]]
+; VEC1_INTERL2-NEXT:    [[FP_OFFSET_IDX:%.*]] = fsub fast float %init, [[TMP7]]
+; VEC1_INTERL2-NEXT:    [[TMP8:%.*]] = fsub fast float [[FP_OFFSET_IDX]], %fpinc
+; VEC1_INTERL2-NEXT:    [[TMP9:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC1_INTERL2-NEXT:    [[TMP10:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDUCTION2]]
+; VEC1_INTERL2-NEXT:    store float [[FP_OFFSET_IDX]], float* [[TMP9]], align 4
+; VEC1_INTERL2-NEXT:    store float [[TMP8]], float* [[TMP10]], align 4
+; VEC1_INTERL2-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; VEC1_INTERL2:         br i1 {{.*}}, label %middle.block, label %vector.body
+
 define void @fp_iv_loop1(float %init, float* noalias nocapture %A, i32 %N) #1 {
 entry:
   %cmp4 = icmp sgt i32 %N, 0
@@ -85,15 +114,20 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ;}
 
 ; VEC4_INTERL1-LABEL: @fp_iv_loop2(
-; VEC4_INTERL1: vector.body
-; VEC4_INTERL1:  %[[index:.*]] = phi i64 [ 0, %vector.ph ]
-; VEC4_INTERL1: sitofp i64 %[[index]] to float
-; VEC4_INTERL1: %[[VAR1:.*]] = fmul fast float {{.*}}, 5.000000e-01
-; VEC4_INTERL1: %[[VAR2:.*]] = fadd fast float %[[VAR1]]
-; VEC4_INTERL1:  insertelement <4 x float> undef, float %[[VAR2]], i32 0
-; VEC4_INTERL1:  shufflevector <4 x float> {{.*}}, <4 x float> undef, <4 x i32> zeroinitializer
-; VEC4_INTERL1:  fadd fast <4 x float> {{.*}}, <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 1.500000e+00>
-; VEC4_INTERL1:  store <4 x float> 
+; VEC4_INTERL1:       vector.ph:
+; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    [[INDUCTION2:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 1.500000e+00>
+; VEC4_INTERL1-NEXT:    br label %vector.body
+; VEC4_INTERL1:       vector.body:
+; VEC4_INTERL1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION2]], %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC4_INTERL1-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>*
+; VEC4_INTERL1-NEXT:    store <4 x float> [[VEC_IND]], <4 x float>* [[TMP8]], align 4
+; VEC4_INTERL1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; VEC4_INTERL1-NEXT:    [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+; VEC4_INTERL1:         br i1 {{.*}}, label %middle.block, label %vector.body
 
 define void @fp_iv_loop2(float %init, float* noalias nocapture %A, i32 %N) #0 {
 entry:
@@ -133,14 +167,43 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ;    C[i] = y;
 ;  }
 ;}
+
 ; VEC4_INTERL1-LABEL: @fp_iv_loop3(
-; VEC4_INTERL1: vector.body
-; VEC4_INTERL1:  %[[index:.*]] = phi i64 [ 0, %vector.ph ]
-; VEC4_INTERL1: sitofp i64 %[[index]] to float
-; VEC4_INTERL1: %[[VAR1:.*]] = fmul fast float {{.*}}, -5.000000e-01
-; VEC4_INTERL1:  fadd fast float %[[VAR1]]
-; VEC4_INTERL1:  fadd fast <4 x float> {{.*}}, <float -5.000000e-01, float -1.000000e+00, float -1.500000e+00, float -2.000000e+00>
-; VEC4_INTERL1:  store <4 x float>
+; VEC4_INTERL1:       for.body.lr.ph:
+; VEC4_INTERL1:         [[TMP0:%.*]] = load float, float* @fp_inc, align 4
+; VEC4_INTERL1:       vector.ph:
+; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x float> undef, float %init, i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT5]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    [[TMP7:%.*]] = fmul fast <4 x float> [[DOTSPLAT6]], <float 0.000000e+00, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>
+; VEC4_INTERL1-NEXT:    [[INDUCTION7:%.*]] = fadd fast <4 x float> [[DOTSPLAT]], [[TMP7]]
+; VEC4_INTERL1-NEXT:    [[TMP8:%.*]] = fmul fast float [[TMP0]], 4.000000e+00
+; VEC4_INTERL1-NEXT:    [[DOTSPLATINSERT8:%.*]] = insertelement <4 x float> undef, float [[TMP8]], i32 0
+; VEC4_INTERL1-NEXT:    [[DOTSPLAT9:%.*]] = shufflevector <4 x float> [[DOTSPLATINSERT8]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    [[BROADCAST_SPLATINSERT12:%.*]] = insertelement <4 x float> undef, float [[TMP0]], i32 0
+; VEC4_INTERL1-NEXT:    [[BROADCAST_SPLAT13:%.*]] = shufflevector <4 x float> [[BROADCAST_SPLATINSERT12]], <4 x float> undef, <4 x i32> zeroinitializer
+; VEC4_INTERL1-NEXT:    br label [[VECTOR_BODY:%.*]]
+; VEC4_INTERL1:       vector.body:
+; VEC4_INTERL1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ <float 0x3FB99999A0000000, float 0xBFD99999A0000000, float 0xBFECCCCCC0000000, float 0xBFF6666660000000>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT:    [[VEC_IND10:%.*]] = phi <4 x float> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC4_INTERL1-NEXT:    [[TMP13:%.*]] = bitcast float* [[TMP12]] to <4 x float>*
+; VEC4_INTERL1-NEXT:    store <4 x float> [[VEC_IND10]], <4 x float>* [[TMP13]], align 4
+; VEC4_INTERL1-NEXT:    [[TMP14:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST_SPLAT13]]
+; VEC4_INTERL1-NEXT:    [[TMP15:%.*]] = fadd fast <4 x float> [[VEC_IND]], <float -5.000000e-01, float -5.000000e-01, float -5.000000e-01, float -5.000000e-01>
+; VEC4_INTERL1-NEXT:    [[TMP16:%.*]] = fadd fast <4 x float> [[TMP15]], [[TMP14]]
+; VEC4_INTERL1-NEXT:    [[TMP17:%.*]] = getelementptr inbounds float, float* %B, i64 [[INDEX]]
+; VEC4_INTERL1-NEXT:    [[TMP18:%.*]] = bitcast float* [[TMP17]] to <4 x float>*
+; VEC4_INTERL1-NEXT:    store <4 x float> [[TMP16]], <4 x float>* [[TMP18]], align 4
+; VEC4_INTERL1-NEXT:    [[TMP19:%.*]] = getelementptr inbounds float, float* %C, i64 [[INDEX]]
+; VEC4_INTERL1-NEXT:    [[TMP20:%.*]] = bitcast float* [[TMP19]] to <4 x float>*
+; VEC4_INTERL1-NEXT:    store <4 x float> [[TMP15]], <4 x float>* [[TMP20]], align 4
+; VEC4_INTERL1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; VEC4_INTERL1-NEXT:    [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00>
+; VEC4_INTERL1-NEXT:    [[VEC_IND_NEXT11]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT9]]
+; VEC4_INTERL1:         br i1 {{.*}}, label %middle.block, label %vector.body
 
 define void @fp_iv_loop3(float %init, float* noalias nocapture %A, float* noalias nocapture %B, float* noalias nocapture %C, i32 %N) #1 {
 entry:
@@ -186,10 +249,17 @@ for.end:
 ;}
 
 ; VEC4_INTERL1-LABEL: @fp_iv_loop4(
-; VEC4_INTERL1: vector.body
-; VEC4_INTERL1-NOT: fmul fast <4 x float>
-; VEC4_INTERL1:  %[[induction:.*]] = fadd fast <4 x float> %{{.*}}, <float 0.000000e+00, float 5.000000e-01, float 1.000000e+00, float 1.500000e+00>
-; VEC4_INTERL1: store <4 x float> %[[induction]]
+; VEC4_INTERL1:       vector.ph:
+; VEC4_INTERL1-NEXT:    br label %vector.body
+; VEC4_INTERL1:       vector.body:
+; VEC4_INTERL1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT:    [[VEC_IND:%.*]] = phi <4 x float> [ <float 1.000000e+00, float 1.500000e+00, float 2.000000e+00, float 2.500000e+00>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; VEC4_INTERL1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC4_INTERL1-NEXT:    [[TMP8:%.*]] = bitcast float* [[TMP7]] to <4 x float>*
+; VEC4_INTERL1-NEXT:    store <4 x float> [[VEC_IND]], <4 x float>* [[TMP8]], align 4
+; VEC4_INTERL1-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; VEC4_INTERL1-NEXT:    [[VEC_IND_NEXT]] = fadd fast <4 x float> [[VEC_IND]], <float 2.000000e+00, float 2.000000e+00, float 2.000000e+00, float 2.000000e+00>
+; VEC4_INTERL1:         br i1 {{.*}}, label %middle.block, label %vector.body
 
 define void @fp_iv_loop4(float* noalias nocapture %A, i32 %N) {
 entry:
@@ -216,3 +286,55 @@ for.end.loopexit:                                 ; preds = %for.body
 for.end:                                          ; preds = %for.end.loopexit, %entry
   ret void
 }
+
+; VEC2_INTERL1_PRED_STORE-LABEL: @non_primary_iv_float_scalar(
+; VEC2_INTERL1_PRED_STORE:       vector.body:
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], %[[PRED_STORE_CONTINUE7:.*]] ], [ 0, %min.iters.checked ]
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP1:%.*]] = sitofp i64 [[INDEX]] to float
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP2:%.*]] = getelementptr inbounds float, float* %A, i64 [[INDEX]]
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP3:%.*]] = bitcast float* [[TMP2]] to <2 x float>*
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x float>, <2 x float>* [[TMP3]], align 4
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP4:%.*]] = fcmp fast oeq <2 x float> [[WIDE_LOAD]], zeroinitializer
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP5:%.*]] = extractelement <2 x i1> [[TMP4]], i32 0
+; VEC2_INTERL1_PRED_STORE-NEXT:    br i1 [[TMP5]], label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
+; VEC2_INTERL1_PRED_STORE:       [[PRED_STORE_IF]]:
+; VEC2_INTERL1_PRED_STORE-NEXT:    store float [[TMP1]], float* [[TMP2]], align 4
+; VEC2_INTERL1_PRED_STORE-NEXT:    br label %[[PRED_STORE_CONTINUE]]
+; VEC2_INTERL1_PRED_STORE:       [[PRED_STORE_CONTINUE]]:
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP8:%.*]] = extractelement <2 x i1> [[TMP4]], i32 1
+; VEC2_INTERL1_PRED_STORE-NEXT:    br i1 [[TMP8]], label %[[PRED_STORE_IF6:.*]], label %[[PRED_STORE_CONTINUE7]]
+; VEC2_INTERL1_PRED_STORE:       [[PRED_STORE_IF6]]:
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP9:%.*]] = fadd fast float [[TMP1]], 1.000000e+00
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP10:%.*]] = or i64 [[INDEX]], 1
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[TMP11:%.*]] = getelementptr inbounds float, float* %A, i64 [[TMP10]]
+; VEC2_INTERL1_PRED_STORE-NEXT:    store float [[TMP9]], float* [[TMP11]], align 4
+; VEC2_INTERL1_PRED_STORE-NEXT:    br label %[[PRED_STORE_CONTINUE7]]
+; VEC2_INTERL1_PRED_STORE:       [[PRED_STORE_CONTINUE7]]:
+; VEC2_INTERL1_PRED_STORE-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; VEC2_INTERL1_PRED_STORE:         br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @non_primary_iv_float_scalar(float* %A, i64 %N) {  ; stores the float counter %j into A[i] wherever A[i] == 0.0
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.inc ], [ 0, %entry ]  ; integer counter: starts at 0, advanced by 1 in %for.inc
+  %j = phi float [ %j.next, %for.inc ], [ 0.0, %entry ]  ; float counter: starts at 0.0, advanced by 1.0 in %for.inc
+  %tmp0 = getelementptr inbounds float, float* %A, i64 %i
+  %tmp1 = load float, float* %tmp0, align 4
+  %tmp2 = fcmp fast oeq float %tmp1, 0.0
+  br i1 %tmp2, label %if.pred, label %for.inc  ; the store below only executes when the loaded element equals 0.0
+
+if.pred:
+  store float %j, float* %tmp0, align 4  ; conditional store of the float counter back to the element just loaded
+  br label %for.inc
+
+for.inc:
+  %i.next = add nuw nsw i64 %i, 1
+  %j.next = fadd fast float %j, 1.0
+  %cond = icmp slt i64 %i.next, %N
+  br i1 %cond, label %for.body, label %for.end  ; bottom-tested: the body runs once before %N is first compared
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/if-conversion.ll b/test/Transforms/LoopVectorize/if-conversion.ll
index acf7b12540d39a8b19893a74fa773c6036107e9a..d3a16e2075d1abecf301b39e155d598eca627f66 100644
--- a/test/Transforms/LoopVectorize/if-conversion.ll
+++ b/test/Transforms/LoopVectorize/if-conversion.ll
@@ -18,9 +18,9 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 
 ;CHECK-LABEL: @function0(
 ;CHECK: load <4 x i32>
+;CHECK: icmp sle <4 x i32>
 ;CHECK: mul <4 x i32>
 ;CHECK: add <4 x i32>
-;CHECK: icmp sle <4 x i32>
 ;CHECK: select <4 x i1>
 ;CHECK: ret i32
 define i32 @function0(i32* nocapture %a, i32* nocapture %b, i32 %start, i32 %end) nounwind uwtable ssp {
@@ -71,8 +71,8 @@ for.end:
 
 ;CHECK-LABEL: @reduction_func(
 ;CHECK: load <4 x i32>
-;CHECK: add <4 x i32>
 ;CHECK: icmp slt <4 x i32>
+;CHECK: add <4 x i32>
 ;CHECK: select <4 x i1>
 ;CHECK: ret i32
 define i32 @reduction_func(i32* nocapture %A, i32 %n) nounwind uwtable readonly ssp {
diff --git a/test/Transforms/LoopVectorize/if-pred-stores.ll b/test/Transforms/LoopVectorize/if-pred-stores.ll
index c4368148caf9d849b395f1385759549a72a2f042..a1837b352eef646876dd571345e5f8a3029a3d00 100644
--- a/test/Transforms/LoopVectorize/if-pred-stores.ll
+++ b/test/Transforms/LoopVectorize/if-pred-stores.ll
@@ -11,6 +11,7 @@ entry:
 
 ; VEC-LABEL: test
 ; VEC:   %[[v0:.+]] = add i64 %index, 0
+; VEC:   %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]]
 ; VEC:   %[[v8:.+]] = icmp sgt <2 x i32> %{{.*}}, <i32 100, i32 100>
 ; VEC:   %[[v10:.+]] = and <2 x i1> %[[v8]], <i1 true, i1 true>
 ; VEC:   %[[o1:.+]] = or <2 x i1> zeroinitializer, %[[v10]]
@@ -21,7 +22,6 @@ entry:
 ; VEC: [[cond]]:
 ; VEC:   %[[v13:.+]] = extractelement <2 x i32> %wide.load, i32 0
 ; VEC:   %[[v9a:.+]] = add nsw i32 %[[v13]], 20
-; VEC:   %[[v2:.+]] = getelementptr inbounds i32, i32* %f, i64 %[[v0]]
 ; VEC:   store i32 %[[v9a]], i32* %[[v2]], align 4
 ; VEC:   br label %[[else:.+]]
 ;
diff --git a/test/Transforms/LoopVectorize/induction-step.ll b/test/Transforms/LoopVectorize/induction-step.ll
index f56456e82dfa0a2070bfd8a3c83bb065fadce526..33e8ed067160d125a391f22bd8b45e41986c518a 100644
--- a/test/Transforms/LoopVectorize/induction-step.ll
+++ b/test/Transforms/LoopVectorize/induction-step.ll
@@ -12,11 +12,30 @@
 ;}
 
 ; CHECK-LABEL: @induction_with_global(
-; CHECK: %[[INT_INC:.*]] = load i32, i32* @int_inc, align 4
-; CHECK: vector.body:
-; CHECK:  %[[VAR1:.*]] = insertelement <8 x i32> undef, i32 %[[INT_INC]], i32 0
-; CHECK:  %[[VAR2:.*]] = shufflevector <8 x i32> %[[VAR1]], <8 x i32> undef, <8 x i32> zeroinitializer
-; CHECK:  mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, %[[VAR2]]
+; CHECK:       for.body.lr.ph:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @int_inc, align 4
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %init, i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> undef, i32 [[TMP0]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP6:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT3]]
+; CHECK-NEXT:    [[INDUCTION4:%.*]] = add <8 x i32> [[DOTSPLAT]], [[TMP6]]
+; CHECK-NEXT:    [[TMP7:%.*]] = mul i32 [[TMP0]], 8
+; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP7]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label %vector.body
+; CHECK:       vector.body:
+; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NEXT:    %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK:         [[TMP8:%.*]] = add i64 %index, 0
+; CHECK-NEXT:    [[TMP9:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr i32, i32* [[TMP9]], i32 0
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <8 x i32>*
+; CHECK-NEXT:    store <8 x i32> %vec.ind, <8 x i32>* [[TMP11]], align 4
+; CHECK:         %index.next = add i64 %index, 8
+; CHECK-NEXT:    %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]]
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
@@ -66,13 +85,28 @@ for.end:                                          ; preds = %for.end.loopexit, %
 ;}
 
 ; CHECK-LABEL: @induction_with_loop_inv(
-; CHECK: for.cond1.preheader:                            
-; CHECK: %[[INDVAR0:.*]] = phi i32 [ 0,
-; CHECK: %[[INDVAR1:.*]] = phi i32 [ 0,
-; CHECK: vector.body:
-; CHECK:  %[[VAR1:.*]] = insertelement <8 x i32> undef, i32 %[[INDVAR1]], i32 0
-; CHECK:  %[[VAR2:.*]] = shufflevector <8 x i32> %[[VAR1]], <8 x i32> undef, <8 x i32> zeroinitializer
-; CHECK:  mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, %[[VAR2]]
+; CHECK:       vector.ph:
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <8 x i32> undef, i32 %x.011, i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[DOTSPLATINSERT2:%.*]] = insertelement <8 x i32> undef, i32 %j.012, i32 0
+; CHECK-NEXT:    [[DOTSPLAT3:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT2]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT3]]
+; CHECK-NEXT:    [[INDUCTION4:%.*]] = add <8 x i32> [[DOTSPLAT]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 %j.012, 8
+; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP5]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label %vector.body
+; CHECK:       vector.body:
+; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NEXT:    %vec.ind = phi <8 x i32> [ [[INDUCTION4]], %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK:         [[TMP6:%.*]] = add i64 %index, 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, i32* [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
+; CHECK-NEXT:    store <8 x i32> %vec.ind, <8 x i32>* [[TMP9]], align 4
+; CHECK:         %index.next = add i64 %index, 8
+; CHECK-NEXT:    %vec.ind.next = add <8 x i32> %vec.ind, [[DOTSPLAT6]]
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
 
 define i32 @induction_with_loop_inv(i32 %init, i32* noalias nocapture %A, i32 %N, i32 %M) {
 entry:
@@ -122,3 +156,46 @@ for.end6:                                         ; preds = %for.end6.loopexit,
   %x.0.lcssa = phi i32 [ %init, %entry ], [ %x.1.lcssa.lcssa, %for.end6.loopexit ]
   ret i32 %x.0.lcssa
 }
+
+
+; CHECK-LABEL: @non_primary_iv_loop_inv_trunc(
+; CHECK:       vector.ph:
+; CHECK:         [[TMP3:%.*]] = trunc i64 %step to i32
+; CHECK-NEXT:    [[DOTSPLATINSERT5:%.*]] = insertelement <8 x i32> undef, i32 [[TMP3]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT6:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT5]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP4:%.*]] = mul <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>, [[DOTSPLAT6]]
+; CHECK-NEXT:    [[INDUCTION7:%.*]] = add <8 x i32> zeroinitializer, [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = mul i32 [[TMP3]], 8
+; CHECK-NEXT:    [[DOTSPLATINSERT8:%.*]] = insertelement <8 x i32> undef, i32 [[TMP5]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT9:%.*]] = shufflevector <8 x i32> [[DOTSPLATINSERT8]], <8 x i32> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:    br label %vector.body
+; CHECK:       vector.body:
+; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:         [[VEC_IND10:%.*]] = phi <8 x i32> [ [[INDUCTION7]], %vector.ph ], [ [[VEC_IND_NEXT11:%.*]], %vector.body ]
+; CHECK:         [[TMP6:%.*]] = add i64 %index, 0
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i32, i32* [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
+; CHECK-NEXT:    store <8 x i32> [[VEC_IND10]], <8 x i32>* [[TMP9]], align 4
+; CHECK-NEXT:    %index.next = add i64 %index, 8
+; CHECK:         [[VEC_IND_NEXT11]] = add <8 x i32> [[VEC_IND10]], [[DOTSPLAT9]]
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+
+define void @non_primary_iv_loop_inv_trunc(i32* %a, i64 %n, i64 %step) {  ; writes the low 32 bits of a %step-strided i64 counter to a[i]
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]  ; counter that indexes %a and feeds the exit compare
+  %j = phi i64 [ %j.next, %for.body ], [ 0, %entry ]  ; second counter: starts at 0, advanced by the argument %step
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = trunc i64 %j to i32  ; only the truncated value of %j is stored
+  store i32 %tmp1, i32* %tmp0, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %j.next = add nuw nsw i64 %j, %step  ; step is a function argument, never redefined inside the loop
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/induction.ll b/test/Transforms/LoopVectorize/induction.ll
index 6213b4a7c2e9d10f3d29af10554a1301c2b957a3..0d7d9fe0c1b8aae03e01a75e1ac428eae8786dcf 100644
--- a/test/Transforms/LoopVectorize/induction.ll
+++ b/test/Transforms/LoopVectorize/induction.ll
@@ -7,11 +7,19 @@
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
 ; Make sure that we can handle multiple integer induction variables.
+;
 ; CHECK-LABEL: @multi_int_induction(
-; CHECK: vector.body:
-; CHECK:  %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
-; CHECK:  %[[VAR:.*]] = trunc i64 %index to i32
-; CHECK:  %offset.idx = add i32 190, %[[VAR]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK-NEXT:    %vec.ind = phi <2 x i32> [ <i32 190, i32 191>, %vector.ph ], [ %vec.ind.next, %vector.body ]
+; CHECK:         [[TMP3:%.*]] = add i64 %index, 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* %A, i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> %vec.ind, <2 x i32>* [[TMP6]], align 4
+; CHECK:         %index.next = add i64 %index, 2
+; CHECK-NEXT:    %vec.ind.next = add <2 x i32> %vec.ind, <i32 2, i32 2>
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
 define void @multi_int_induction(i32* %A, i32 %N) {
 for.body.lr.ph:
   br label %for.body
@@ -765,3 +773,79 @@ for.body:
 exit:
   ret void
 }
+
+; CHECK-LABEL: @non_primary_iv_trunc(
+; CHECK:       vector.body:
+; CHECK-NEXT:    %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+; CHECK:         [[VEC_IND:%.*]] = phi <2 x i32> [ <i32 0, i32 2>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK:         [[TMP3:%.*]] = add i64 %index, 0
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* %a, i64 [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, i32* [[TMP4]], i32 0
+; CHECK-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <2 x i32>*
+; CHECK-NEXT:    store <2 x i32> [[VEC_IND]], <2 x i32>* [[TMP6]], align 4
+; CHECK-NEXT:    %index.next = add i64 %index, 2
+; CHECK:         [[VEC_IND_NEXT]] = add <2 x i32> [[VEC_IND]], <i32 4, i32 4>
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+define void @non_primary_iv_trunc(i32* %a, i64 %n) {  ; writes the low 32 bits of a stride-2 i64 counter to a[i]
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]  ; counter that indexes %a and feeds the exit compare
+  %j = phi i64 [ %j.next, %for.body ], [ 0, %entry ]  ; second counter: starts at 0, advanced by the constant 2
+  %tmp0 = getelementptr inbounds i32, i32* %a, i64 %i
+  %tmp1 = trunc i64 %j to i32  ; only the truncated value of %j is stored
+  store i32 %tmp1, i32* %tmp0, align 4
+  %i.next = add nuw nsw i64 %i, 1
+  %j.next = add nuw nsw i64 %j, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; PR32419. Ensure we transform truncated non-primary induction variables. In
+; the test case below we replace %tmp1 with a new induction variable. Because
+; the truncated value is non-primary, we must compute an offset from the
+; primary induction variable.
+;
+; CHECK-LABEL: @PR32419(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %[[PRED_UREM_CONTINUE4:.*]] ]
+; CHECK:         [[OFFSET_IDX:%.*]] = add i32 -20, [[INDEX]]
+; CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[OFFSET_IDX]] to i16
+; CHECK:         [[TMP8:%.*]] = add i16 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP9:%.*]] = urem i16 %b, [[TMP8]]
+; CHECK:         [[TMP15:%.*]] = add i16 [[TMP1]], 1
+; CHECK-NEXT:    [[TMP16:%.*]] = urem i16 %b, [[TMP15]]
+; CHECK:       [[PRED_UREM_CONTINUE4]]:
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define i32 @PR32419(i32 %a, i16 %b) {  ; ORs %a with sext(%b urem trunc(i)) for i = -20 .. -1 and returns the result
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i32 [ -20, %entry ], [ %i.next, %for.inc ]  ; counter: starts at -20, advanced by 1 in %for.inc
+  %tmp0 = phi i32 [ %a, %entry ], [ %tmp6, %for.inc ]  ; running OR accumulator, seeded with %a
+  %tmp1 = trunc i32 %i to i16  ; truncated copy of the counter, used as the urem divisor
+  %tmp2 = icmp eq i16 %tmp1, 0
+  br i1 %tmp2, label %for.inc, label %for.cond  ; bypass the urem block when the divisor would be zero
+
+for.cond:
+  %tmp3 = urem i16 %b, %tmp1
+  br label %for.inc
+
+for.inc:
+  %tmp4 = phi i16 [ %tmp3, %for.cond ], [ 0, %for.body ]  ; remainder, or 0 when the urem block was bypassed
+  %tmp5 = sext i16 %tmp4 to i32
+  %tmp6 = or i32 %tmp0, %tmp5
+  %i.next = add nsw i32 %i, 1
+  %cond = icmp eq i32 %i.next, 0
+  br i1 %cond, label %for.end, label %for.body  ; leave the loop once the incremented counter reaches 0
+
+for.end:
+  %tmp7 = phi i32 [ %tmp6, %for.inc ]  ; single-entry phi carrying the final accumulator out of the loop
+  ret i32 %tmp7
+}
diff --git a/test/Transforms/LoopVectorize/lifetime.ll b/test/Transforms/LoopVectorize/lifetime.ll
index 6e525ca1d822d6e5596ad3d1f13507b48b8d1e9a..860fe2d983cdf1ea29feb7ef0ad264fafc87c616 100644
--- a/test/Transforms/LoopVectorize/lifetime.ll
+++ b/test/Transforms/LoopVectorize/lifetime.ll
@@ -13,23 +13,23 @@ define void @test(i32 *%d) {
 entry:
   %arr = alloca [1024 x i32], align 16
   %0 = bitcast [1024 x i32]* %arr to i8*
-  call void @llvm.lifetime.start(i64 4096, i8* %0) #1
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1
   br label %for.body
 
 for.body:
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
-  call void @llvm.lifetime.end(i64 4096, i8* %0) #1
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1
   %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
   %1 = load i32, i32* %arrayidx, align 8
   store i32 100, i32* %arrayidx, align 8
-  call void @llvm.lifetime.start(i64 4096, i8* %0) #1
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp ne i32 %lftr.wideiv, 128
   br i1 %exitcond, label %for.body, label %for.end
 
 for.end:
-  call void @llvm.lifetime.end(i64 4096, i8* %0) #1
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1
   ret void
 }
 
@@ -42,24 +42,24 @@ define void @testbitcast(i32 *%d) {
 entry:
   %arr = alloca [1024 x i32], align 16
   %0 = bitcast [1024 x i32]* %arr to i8*
-  call void @llvm.lifetime.start(i64 4096, i8* %0) #1
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %0) #1
   br label %for.body
 
 for.body:
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %1 = bitcast [1024 x i32]* %arr to i8*
-  call void @llvm.lifetime.end(i64 4096, i8* %1) #1
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %1) #1
   %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
   %2 = load i32, i32* %arrayidx, align 8
   store i32 100, i32* %arrayidx, align 8
-  call void @llvm.lifetime.start(i64 4096, i8* %1) #1
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %1) #1
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp ne i32 %lftr.wideiv, 128
   br i1 %exitcond, label %for.body, label %for.end
 
 for.end:
-  call void @llvm.lifetime.end(i64 4096, i8* %0) #1
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %0) #1
   ret void
 }
 
@@ -77,11 +77,11 @@ for.body:
   %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
   %0 = getelementptr [1024 x i32], [1024 x i32]* %arr, i32 0, i64 %indvars.iv
   %1 = bitcast [1024 x i32]* %arr to i8*
-  call void @llvm.lifetime.end(i64 4096, i8* %1) #1
+  call void @llvm.lifetime.end.p0i8(i64 4096, i8* %1) #1
   %arrayidx = getelementptr inbounds i32, i32* %d, i64 %indvars.iv
   %2 = load i32, i32* %arrayidx, align 8
   store i32 100, i32* %arrayidx, align 8
-  call void @llvm.lifetime.start(i64 4096, i8* %1) #1
+  call void @llvm.lifetime.start.p0i8(i64 4096, i8* %1) #1
   %indvars.iv.next = add i64 %indvars.iv, 1
   %lftr.wideiv = trunc i64 %indvars.iv.next to i32
   %exitcond = icmp ne i32 %lftr.wideiv, 128
@@ -91,6 +91,6 @@ for.end:
   ret void
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
diff --git a/test/Transforms/LoopVectorize/loop-scalars.ll b/test/Transforms/LoopVectorize/loop-scalars.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4dcd5993c128b9ccdd1c2d505dc81699ada8d9ff
--- /dev/null
+++ b/test/Transforms/LoopVectorize/loop-scalars.ll
@@ -0,0 +1,143 @@
+; REQUIRES: asserts
+; RUN: opt < %s -loop-vectorize -force-vector-width=2 -force-vector-interleave=1 -instcombine -debug-only=loop-vectorize -disable-output -print-after=instcombine 2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: vector_gep
+; CHECK-NOT:   LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <2 x i64> [ <i64 0, i64 1>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, <2 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <2 x i32*>*
+; CHECK-NEXT:    store <2 x i32*> [[TMP1]], <2 x i32*>* [[TMP3]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <2 x i64> [[VEC_IND]], <i64 2, i64 2>
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @vector_gep(i32** %a, i32 *%b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
+  store i32* %tmp0, i32** %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: scalar_store
+; CHECK:       LV: Found scalar instruction: %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
+; CHECK-NEXT:  LV: Found scalar instruction: %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+; CHECK-NEXT:  LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT:  LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* %b, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i32, i32* %b, i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[TMP4]]
+; CHECK-NEXT:    store i32* [[TMP5]], i32** [[TMP7]], align 8
+; CHECK-NEXT:    store i32* [[TMP6]], i32** [[TMP8]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @scalar_store(i32** %a, i32 *%b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
+  store i32* %tmp0, i32** %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: expansion
+; CHECK:       LV: Found scalar instruction: %tmp3 = getelementptr inbounds i32*, i32** %tmp2, i64 %i
+; CHECK-NEXT:  LV: Found scalar instruction: %tmp1 = bitcast i64* %tmp0 to i32*
+; CHECK-NEXT:  LV: Found scalar instruction: %tmp2 = getelementptr inbounds i32*, i32** %a, i64 0
+; CHECK-NEXT:  LV: Found scalar instruction: %tmp0 = getelementptr inbounds i64, i64* %b, i64 %i
+; CHECK-NEXT:  LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT:  LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 2
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
+; CHECK-NEXT:    [[TMP4:%.*]] = or i64 [[OFFSET_IDX]], 2
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i64, i64* %b, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds i64, i64* %b, i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[OFFSET_IDX]]
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[TMP4]]
+; CHECK-NEXT:    [[TMP9:%.*]] = bitcast i32** [[TMP7]] to i64**
+; CHECK-NEXT:    store i64* [[TMP5]], i64** [[TMP9]], align 8
+; CHECK-NEXT:    [[TMP10:%.*]] = bitcast i32** [[TMP8]] to i64**
+; CHECK-NEXT:    store i64* [[TMP6]], i64** [[TMP10]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @expansion(i32** %a, i64 *%b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i64, i64* %b, i64 %i
+  %tmp1 = bitcast i64* %tmp0 to i32*
+  %tmp2 = getelementptr inbounds i32*, i32** %a, i64 0
+  %tmp3 = getelementptr inbounds i32*, i32** %tmp2, i64 %i
+  store i32* %tmp1, i32** %tmp3, align 8
+  %i.next = add nuw nsw i64 %i, 2
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: no_gep_or_bitcast
+; CHECK-NOT:   LV: Found scalar instruction: %tmp1 = load i32*, i32** %tmp0, align 8
+; CHECK:       LV: Found scalar instruction: %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+; CHECK-NEXT:  LV: Found scalar instruction: %i.next = add nuw nsw i64 %i, 1
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i32** [[TMP1]] to <2 x i32*>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <2 x i32*>, <2 x i32*>* [[TMP2]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 0
+; CHECK-NEXT:    store i32 0, i32* [[TMP3]], align 8
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32*> [[WIDE_LOAD]], i32 1
+; CHECK-NEXT:    store i32 0, i32* [[TMP4]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 2
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @no_gep_or_bitcast(i32** noalias %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i32*, i32** %a, i64 %i
+  %tmp1 = load i32*, i32** %tmp0, align 8
+  store i32 0, i32* %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll b/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll
index a310b10a5c817d099d918347e129e8a75ca7337f..5c87dc435c7c4e87c7df0ef427cfad3dd9f6ba65 100644
--- a/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll
+++ b/test/Transforms/LoopVectorize/multiple-strides-vectorization.ll
@@ -13,9 +13,9 @@
 ;       int v3[Z][Z];
 ; } s;
 ;
-; void slow_function (s* const obj) {
+; void slow_function (s* const obj, int z) {
 ;    for (int j=0; j<Z; j++) {
-;        for (int k=0; k<Z; k++) {
+;        for (int k=0; k<z; k++) {
 ;            int x = obj->v1[k] + obj->v2[j];
 ;            obj->v3[j][k] += x;
 ;        }
@@ -31,7 +31,7 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 
 %struct.s = type { [32 x i32], [32 x i32], [32 x [32 x i32]] }
 
-define void @Test(%struct.s* nocapture %obj) #0 {
+define void @Test(%struct.s* nocapture %obj, i64 %z) #0 {
   br label %.outer.preheader
 
 
@@ -59,6 +59,6 @@ define void @Test(%struct.s* nocapture %obj) #0 {
   %8 = add nsw i32 %5, %7
   store i32 %8, i32* %6  
   %j.next = add nuw nsw i64 %j, 1
-  %exitcond.inner = icmp eq i64 %j.next, 32
+  %exitcond.inner = icmp eq i64 %j.next, %z
   br i1 %exitcond.inner, label %.outer, label %.inner
 }
diff --git a/test/Transforms/LoopVectorize/pr31098.ll b/test/Transforms/LoopVectorize/pr31098.ll
new file mode 100644
index 0000000000000000000000000000000000000000..368a948557c38f9c238c7e2f17c0a7eb645370c7
--- /dev/null
+++ b/test/Transforms/LoopVectorize/pr31098.ll
@@ -0,0 +1,100 @@
+; REQUIRES: asserts
+; RUN: opt -S -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -enable-interleaved-mem-accesses=true -debug-only=loop-accesses < %s  2>&1 | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; Check that the compile-time-unknown dependence-distance is resolved
+; statically. Due to the non-unit stride of the accesses in this testcase
+; we are currently not able to create runtime dependence checks, and therefore
+; if we don't resolve the dependence statically we cannot vectorize the loop.
+;
+; Specifically in this example, during dependence analysis we get 6 unknown 
+; dependence distances between the 8 real/imaginary accesses below: 
+;    dist = 8*D, 4+8*D, -4+8*D, -8*D, 4-8*D, -4-8*D.
+; At compile time we can prove for all of the above that |dist|>loopBound*step
+; (where the step is 8 bytes, and the loopBound is D-1), and thereby conclude
+; that there are no dependencies (without runtime tests):
+; |8*D|>8*D-8, |4+8*D|>8*D-8, |-4+8*D|>8*D-8, etc.
+
+; #include <stdlib.h>
+; class Complex {
+; private:
+;   float real_;
+;   float imaginary_;
+;
+; public:
+;   Complex() : real_(0), imaginary_(0) { }
+;   Complex(float real, float imaginary) : real_(real), imaginary_(imaginary) { }
+;   Complex(const Complex &rhs) : real_(rhs.real()), imaginary_(rhs.imaginary()) { }
+; 
+;   inline float real() const { return real_; }
+;   inline float imaginary() const { return imaginary_; }
+; 
+;   Complex operator+(const Complex& rhs) const
+;   {
+;    return Complex(real_ + rhs.real_, imaginary_ + rhs.imaginary_);
+;   }
+;
+;   Complex operator-(const Complex& rhs) const
+;  {
+;     return Complex(real_ - rhs.real_, imaginary_ - rhs.imaginary_);
+;   }
+; };
+;
+; void Test(Complex *out, size_t size)
+; {
+;     size_t D = size / 2;
+;     for (size_t offset = 0; offset < D; ++offset)
+;     {
+;         Complex t0 = out[offset];
+;         Complex t1 = out[offset + D];
+;         out[offset] = t1 + t0;
+;         out[offset + D] = t0 - t1;
+;     }
+; }
+
+; CHECK-LABEL: Test
+; CHECK: LAA: No unsafe dependent memory operations in loop.  We don't need runtime memory checks.
+; CHECK: vector.body:
+; CHECK: <4 x i32>
+
+%class.Complex = type { float, float }
+
+define void @Test(%class.Complex* nocapture %out, i64 %size) local_unnamed_addr {
+entry:
+  %div = lshr i64 %size, 1
+  %cmp47 = icmp eq i64 %div, 0
+  br i1 %cmp47, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:
+  br label %for.body
+
+for.cond.cleanup.loopexit:
+  br label %for.cond.cleanup
+
+for.cond.cleanup:
+  ret void
+
+for.body:
+  %offset.048 = phi i64 [ %inc, %for.body ], [ 0, %for.body.preheader ]
+  %0 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %offset.048, i32 0
+  %1 = load float, float* %0, align 4
+  %imaginary_.i.i = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %offset.048, i32 1
+  %2 = load float, float* %imaginary_.i.i, align 4
+  %add = add nuw i64 %offset.048, %div
+  %3 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add, i32 0
+  %4 = load float, float* %3, align 4
+  %imaginary_.i.i28 = getelementptr inbounds %class.Complex, %class.Complex* %out, i64 %add, i32 1
+  %5 = load float, float* %imaginary_.i.i28, align 4
+  %add.i = fadd fast float %4, %1
+  %add4.i = fadd fast float %5, %2
+  store float %add.i, float* %0, align 4
+  store float %add4.i, float* %imaginary_.i.i, align 4
+  %sub.i = fsub fast float %1, %4
+  %sub4.i = fsub fast float %2, %5
+  store float %sub.i, float* %3, align 4
+  store float %sub4.i, float* %imaginary_.i.i28, align 4
+  %inc = add nuw nsw i64 %offset.048, 1
+  %exitcond = icmp eq i64 %inc, %div
+  br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body
+}
diff --git a/test/Transforms/LoopVectorize/reverse_iter.ll b/test/Transforms/LoopVectorize/reverse_iter.ll
index a6e2abda36d9755390d38d053f4298dc7020b6d8..bd057698280beafc4823e98a9ac82bb2564c8bb6 100644
--- a/test/Transforms/LoopVectorize/reverse_iter.ll
+++ b/test/Transforms/LoopVectorize/reverse_iter.ll
@@ -2,7 +2,8 @@
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 
-; Make sure that the reverse iterators are calculated using 64bit arithmetic, not 32.
+; PR15882: This test ensures that we do not produce wrapping arithmetic when
+; creating constant reverse step vectors.
 ;
 ; int foo(int n, int *A) {
 ;   int sum;
@@ -13,7 +14,7 @@ target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f3
 ;
 
 ;CHECK-LABEL: @foo(
-;CHECK:  <i64 0, i64 -1, i64 -2, i64 -3>
+;CHECK:  <i32 0, i32 -1, i32 -2, i32 -3>
 ;CHECK: ret
 define i32 @foo(i32 %n, i32* nocapture %A) {
   %1 = icmp sgt i32 %n, 0
diff --git a/test/Transforms/LoopVectorize/unroll-novec-memcheck-metadata.ll b/test/Transforms/LoopVectorize/unroll-novec-memcheck-metadata.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d3112b82d1d535145dca93f7aaf96c9d383558e5
--- /dev/null
+++ b/test/Transforms/LoopVectorize/unroll-novec-memcheck-metadata.ll
@@ -0,0 +1,36 @@
+; RUN: opt < %s -loop-vectorize -force-vector-interleave=2 -force-vector-width=1 -S | FileCheck --enable-var-scope %s
+
+; Make sure we attach memcheck metadata to scalarized memory operations even if
+; we're only unrolling.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+; CHECK-LABEL: vector.memcheck:
+; CHECK-LABEL: vector.body:
+; CHECK: load i32, {{.*}} !alias.scope ![[$MD1:[0-9]+]]
+; CHECK-LABEL: middle.block:
+; CHECK-DAG: ![[$MD1]] = !{![[MD2:[0-9]+]]}
+; CHECK-DAG: ![[MD2]] = distinct !{![[MD2]], ![[MD3:[0-9]+]]}
+; CHECK-DAG: ![[MD3]] = distinct !{![[MD3]], !"LVerDomain"}
+
+; Function Attrs: norecurse nounwind uwtable
+define void @test(i32* nocapture readonly %a, i32* nocapture %b) local_unnamed_addr #0 {
+entry:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %entry
+  %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %indvars.iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %add = add nsw i32 %0, 77
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %indvars.iv
+  store i32 %add, i32* %arrayidx2, align 4
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond = icmp eq i64 %indvars.iv.next, 10000
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
+attributes #0 = { norecurse nounwind uwtable }
diff --git a/test/Transforms/LoopVectorize/vector-geps.ll b/test/Transforms/LoopVectorize/vector-geps.ll
new file mode 100644
index 0000000000000000000000000000000000000000..bd79499d5d34eac0c800397117a5f72a507ca978
--- /dev/null
+++ b/test/Transforms/LoopVectorize/vector-geps.ll
@@ -0,0 +1,61 @@
+; RUN: opt < %s -loop-vectorize -force-vector-width=4 -force-vector-interleave=1 -instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: @vector_gep_stored(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[VEC_IND:%.*]] = phi <4 x i64> [ <i64 0, i64 1, i64 2, i64 3>, %vector.ph ], [ [[VEC_IND_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, <4 x i64> [[VEC_IND]]
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <4 x i32*>*
+; CHECK-NEXT:    store <4 x i32*> [[TMP1]], <4 x i32*>* [[TMP3]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK-NEXT:    [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], <i64 4, i64 4, i64 4, i64 4>
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @vector_gep_stored(i32** %a, i32 *%b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i32, i32* %b, i64 %i
+  %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
+  store i32* %tmp0, i32** %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: @uniform_vector_gep_stored(
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, %vector.ph ], [ [[INDEX_NEXT:%.*]], %vector.body ]
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds i32, i32* %b, i64 1
+; CHECK-NEXT:    [[DOTSPLATINSERT:%.*]] = insertelement <4 x i32*> undef, i32* [[TMP1]], i32 0
+; CHECK-NEXT:    [[DOTSPLAT:%.*]] = shufflevector <4 x i32*> [[DOTSPLATINSERT]], <4 x i32*> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i32*, i32** %a, i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i32** [[TMP2]] to <4 x i32*>*
+; CHECK-NEXT:    store <4 x i32*> [[DOTSPLAT]], <4 x i32*>* [[TMP3]], align 8
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], 4
+; CHECK:         br i1 {{.*}}, label %middle.block, label %vector.body
+;
+define void @uniform_vector_gep_stored(i32** %a, i32 *%b, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %i = phi i64 [ %i.next, %for.body ], [ 0, %entry ]
+  %tmp0 = getelementptr inbounds i32, i32* %b, i64 1
+  %tmp1 = getelementptr inbounds i32*, i32** %a, i64 %i
+  store i32* %tmp0, i32** %tmp1, align 8
+  %i.next = add nuw nsw i64 %i, 1
+  %cond = icmp slt i64 %i.next, %n
+  br i1 %cond, label %for.body, label %for.end
+
+for.end:
+  ret void
+}
diff --git a/test/Transforms/LoopVersioning/loop-invariant-bound.ll b/test/Transforms/LoopVersioning/loop-invariant-bound.ll
index 3411adbf245ef50c148515de85e5a75cb231f87c..01c5a55bd5b26cb1cbfd8b44c432f4a3caf777db 100644
--- a/test/Transforms/LoopVersioning/loop-invariant-bound.ll
+++ b/test/Transforms/LoopVersioning/loop-invariant-bound.ll
@@ -8,12 +8,13 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 %Partials.215 = type { [2 x %Dual.213] }
 
 ; Function Attrs: sspreq
-define void @"julia_axpy!_65480"(%Dual.212*) {
+define void @"julia_axpy!_65480"(%Dual.212*, %Dual.212* %other) {
 top:
   br label %if24
 
 ; CHECK-NOT: %bc = bitcast i64* %v2.sroa.0.0..sroa_cast
-; CHECK: %bound0
+; CHECK: %bound0 = icmp ult i8* %[[x:[a-z0-9]+]], %[[y:[a-z0-9]+]]
+; CHECK-NOT: %bound1 = icmp ult i8* %[[y]], %[[x]]
 
 if24:                                             ; preds = %if24, %top
   %"#temp#1.sroa.3.02" = phi i64 [ undef, %top ], [ %2, %if24 ]
@@ -24,7 +25,7 @@ if24:                                             ; preds = %if24, %top
   %v2.sroa.0.0..sroa_cast = bitcast %Dual.212* %0 to i64*
   %v2.sroa.0.0.copyload = load i64, i64* %v2.sroa.0.0..sroa_cast, align 1
   %3 = add i64 %"#temp#1.sroa.0.01", -1
-  %4 = getelementptr inbounds %Dual.212, %Dual.212* undef, i64 %3, i32 1, i32 0, i64 0, i32 1, i32 0, i64 0
+  %4 = getelementptr inbounds %Dual.212, %Dual.212* %other, i64 0, i32 1, i32 0, i64 0, i32 1, i32 0, i64 0
   %5 = bitcast double* %4 to i64*
   store i64 undef, i64* %5, align 8
   %notlhs27 = icmp eq i64 %2, undef
diff --git a/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll b/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll
index ff6c25087aa56899f12985619570c519883b2d09..791c2e3210c8e791c8c32508188dc9d0c087a0b5 100644
--- a/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll
+++ b/test/Transforms/LoopVersioningLICM/loopversioningLICM1.ll
@@ -16,7 +16,7 @@
 ; CHECK-NEXT: %add8 = add nsw i32 %[[induction]], %add
 ; CHECK-NEXT: %inc = add nuw i32 %j.113, 1
 ; CHECK-NEXT: %cmp2 = icmp ult i32 %inc, %itr
-; CHECK-NEXT: br i1 %cmp2, label %for.body3, label %for.inc11.loopexit.loopexit6, !llvm.loop !5
+; CHECK-NEXT: br i1 %cmp2, label %for.body3, label %for.inc11.loopexit.loopexit7, !llvm.loop !5
 define i32 @foo(i32* nocapture %var1, i32* nocapture readnone %var2, i32* nocapture %var3, i32 %itr) #0 {
 entry:
   %cmp14 = icmp eq i32 %itr, 0
diff --git a/test/Transforms/LoopVersioningLICM/loopversioningLICM2.ll b/test/Transforms/LoopVersioningLICM/loopversioningLICM2.ll
index 928a6527badca17d105a9649bd930efe2bcefc0a..53add63380228b9880f84c680dd1ec82d9e97754 100644
--- a/test/Transforms/LoopVersioningLICM/loopversioningLICM2.ll
+++ b/test/Transforms/LoopVersioningLICM/loopversioningLICM2.ll
@@ -7,7 +7,7 @@
 ; CHECK: Loop: Loop at depth 2 containing: %for.body3.us<header><latch><exiting>
 ; CHECK-NEXT:     Loop Versioning found to be beneficial
 ;
-; CHECK: for.cond1.for.inc17_crit_edge.us.loopexit5:       ; preds = %for.body3.us
+; CHECK: for.cond1.for.inc17_crit_edge.us.loopexit6:       ; preds = %for.body3.us
 ; CHECK-NEXT: %add14.us.lcssa = phi float [ %add14.us, %for.body3.us ]
 ; CHECK-NEXT: store float %add14.us.lcssa, float* %arrayidx.us, align 4, !alias.scope !0, !noalias !0
 ; CHECK-NEXT: br label %for.cond1.for.inc17_crit_edge.us
diff --git a/test/Transforms/LowerTypeTests/export-allones.ll b/test/Transforms/LowerTypeTests/export-allones.ll
index fb09a60a5afab710fbfc773d20ac91ce195a1351..a642ec87355f56b35176aaf8995f14878f44d90b 100644
--- a/test/Transforms/LowerTypeTests/export-allones.ll
+++ b/test/Transforms/LowerTypeTests/export-allones.ll
@@ -153,7 +153,9 @@
 ; SUMMARY-NEXT:     TTRes:
 ; SUMMARY-NEXT:       Kind:            AllOnes
 ; SUMMARY-NEXT:       SizeM1BitWidth:  7
+; SUMMARY-NEXT:     WPDRes:
 ; SUMMARY-NEXT:   typeid2:
 ; SUMMARY-NEXT:     TTRes:
 ; SUMMARY-NEXT:       Kind:            AllOnes
 ; SUMMARY-NEXT:       SizeM1BitWidth:  32
+; SUMMARY-NEXT:     WPDRes:
diff --git a/test/Transforms/LowerTypeTests/export-bytearray.ll b/test/Transforms/LowerTypeTests/export-bytearray.ll
index 814d164cb19501ea6303b3f2facc2ea68f95dd67..7565b85df30f07ad830b2042f09b5f7e5485e566 100644
--- a/test/Transforms/LowerTypeTests/export-bytearray.ll
+++ b/test/Transforms/LowerTypeTests/export-bytearray.ll
@@ -32,7 +32,9 @@
 ; SUMMARY-NEXT:     TTRes:
 ; SUMMARY-NEXT:       Kind:            ByteArray
 ; SUMMARY-NEXT:       SizeM1BitWidth:  7
+; SUMMARY-NEXT:     WPDRes:
 ; SUMMARY-NEXT:   typeid2:
 ; SUMMARY-NEXT:     TTRes:
 ; SUMMARY-NEXT:       Kind:            ByteArray
 ; SUMMARY-NEXT:       SizeM1BitWidth:  32
+; SUMMARY-NEXT:     WPDRes:
diff --git a/test/Transforms/LowerTypeTests/export-inline.ll b/test/Transforms/LowerTypeTests/export-inline.ll
index 62b3187e9d6448edbafc9c034e1f61246e5ccabf..1da5866e88cc1a4b969d819fc48d0c6c9ec105df 100644
--- a/test/Transforms/LowerTypeTests/export-inline.ll
+++ b/test/Transforms/LowerTypeTests/export-inline.ll
@@ -27,7 +27,9 @@
 ; SUMMARY-NEXT:     TTRes:
 ; SUMMARY-NEXT:       Kind:            Inline
 ; SUMMARY-NEXT:       SizeM1BitWidth:  5
+; SUMMARY-NEXT:     WPDRes:
 ; SUMMARY-NEXT:   typeid2:
 ; SUMMARY-NEXT:     TTRes:
 ; SUMMARY-NEXT:       Kind:            Inline
 ; SUMMARY-NEXT:       SizeM1BitWidth:  6
+; SUMMARY-NEXT:     WPDRes:
diff --git a/test/Transforms/LowerTypeTests/import-unsat.ll b/test/Transforms/LowerTypeTests/import-unsat.ll
index 7410bc4b4d885aab81ad9f858c83a90e594c29f8..76b24400198689a07a1ea5b8032e9f2c75af2ca4 100644
--- a/test/Transforms/LowerTypeTests/import-unsat.ll
+++ b/test/Transforms/LowerTypeTests/import-unsat.ll
@@ -4,8 +4,7 @@
 
 ; SUMMARY:      GlobalValueMap:
 ; SUMMARY-NEXT:   42:
-; SUMMARY-NEXT:     - TypeTests:
-; SUMMARY-NEXT:         - 123
+; SUMMARY-NEXT:     - TypeTests: [ 123 ]
 ; SUMMARY-NEXT: TypeIdMap:
 ; SUMMARY-NEXT:   typeid1:
 ; SUMMARY-NEXT:     TTRes:
diff --git a/test/Transforms/Mem2Reg/ignore-lifetime.ll b/test/Transforms/Mem2Reg/ignore-lifetime.ll
index 12adaffc7714fd7b675f71eb676e8d697867627b..b996a659237a7eb95d84cf0004ebda09b00d623c 100644
--- a/test/Transforms/Mem2Reg/ignore-lifetime.ll
+++ b/test/Transforms/Mem2Reg/ignore-lifetime.ll
@@ -1,16 +1,16 @@
 ; RUN: opt -mem2reg -S -o - < %s | FileCheck %s
 
-declare void @llvm.lifetime.start(i64 %size, i8* nocapture %ptr)
-declare void @llvm.lifetime.end(i64 %size, i8* nocapture %ptr)
+declare void @llvm.lifetime.start.p0i8(i64 %size, i8* nocapture %ptr)
+declare void @llvm.lifetime.end.p0i8(i64 %size, i8* nocapture %ptr)
 
 define void @test1() {
 ; CHECK: test1
 ; CHECK-NOT: alloca
   %A = alloca i32
   %B = bitcast i32* %A to i8*
-  call void @llvm.lifetime.start(i64 2, i8* %B)
+  call void @llvm.lifetime.start.p0i8(i64 2, i8* %B)
   store i32 1, i32* %A
-  call void @llvm.lifetime.end(i64 2, i8* %B)
+  call void @llvm.lifetime.end.p0i8(i64 2, i8* %B)
   ret void
 }
 
@@ -19,8 +19,8 @@ define void @test2() {
 ; CHECK-NOT: alloca
   %A = alloca {i8, i16}
   %B = getelementptr {i8, i16}, {i8, i16}* %A, i32 0, i32 0
-  call void @llvm.lifetime.start(i64 2, i8* %B)
+  call void @llvm.lifetime.start.p0i8(i64 2, i8* %B)
   store {i8, i16} zeroinitializer, {i8, i16}* %A
-  call void @llvm.lifetime.end(i64 2, i8* %B)
+  call void @llvm.lifetime.end.p0i8(i64 2, i8* %B)
   ret void
 }
diff --git a/test/Transforms/Mem2Reg/preserve-nonnull-load-metadata.ll b/test/Transforms/Mem2Reg/preserve-nonnull-load-metadata.ll
new file mode 100644
index 0000000000000000000000000000000000000000..33a5b124c555f503e36eb558d2a829c1209d657a
--- /dev/null
+++ b/test/Transforms/Mem2Reg/preserve-nonnull-load-metadata.ll
@@ -0,0 +1,89 @@
+; RUN: opt < %s -mem2reg -S | FileCheck %s
+
+; This tests that mem2reg preserves the !nonnull metadata on loads
+; from allocas that get optimized out.
+
+; Check the case where the alloca in question has a single store.
+define float* @single_store(float** %arg) {
+; CHECK-LABEL: define float* @single_store
+; CHECK: %arg.load = load float*, float** %arg, align 8
+; CHECK: [[ASSUME:%(.*)]] = icmp ne float* %arg.load, null
+; CHECK: call void @llvm.assume(i1 {{.*}}[[ASSUME]])
+; CHECK: ret float* %arg.load
+entry:
+  %buf = alloca float*
+  %arg.load = load float*, float** %arg, align 8
+  store float* %arg.load, float** %buf, align 8
+  %buf.load = load float*, float **%buf, !nonnull !0
+  ret float* %buf.load
+}
+
+; Check the case where the alloca in question has more than one
+; store but still within one basic block.
+define float* @single_block(float** %arg) {
+; CHECK-LABEL: define float* @single_block
+; CHECK: %arg.load = load float*, float** %arg, align 8
+; CHECK: [[ASSUME:%(.*)]] = icmp ne float* %arg.load, null
+; CHECK: call void @llvm.assume(i1 {{.*}}[[ASSUME]])
+; CHECK: ret float* %arg.load
+entry:
+  %buf = alloca float*
+  %arg.load = load float*, float** %arg, align 8
+  store float* null, float** %buf, align 8
+  store float* %arg.load, float** %buf, align 8
+  %buf.load = load float*, float **%buf, !nonnull !0
+  ret float* %buf.load
+}
+
+; Check the case where the alloca in question has more than one
+; store and also reads and writes in multiple blocks.
+define float* @multi_block(float** %arg) {
+; CHECK-LABEL: define float* @multi_block
+; CHECK-LABEL: entry:
+; CHECK: %arg.load = load float*, float** %arg, align 8
+; CHECK: br label %next
+; CHECK-LABEL: next:
+; CHECK: [[ASSUME:%(.*)]] = icmp ne float* %arg.load, null
+; CHECK: call void @llvm.assume(i1 {{.*}}[[ASSUME]])
+; CHECK: ret float* %arg.load
+entry:
+  %buf = alloca float*
+  %arg.load = load float*, float** %arg, align 8
+  store float* null, float** %buf, align 8
+  br label %next
+next:
+  store float* %arg.load, float** %buf, align 8
+  %buf.load = load float*, float** %buf, !nonnull !0
+  ret float* %buf.load
+}
+
+; Check that we don't add an assume if it's not
+; necessary i.e. the value is already implied to be nonnull
+define float* @no_assume(float** %arg) {
+; CHECK-LABEL: define float* @no_assume
+; CHECK-LABEL: entry:
+; CHECK: %arg.load = load float*, float** %arg, align 8
+; CHECK: %cn = icmp ne float* %arg.load, null
+; CHECK: br i1 %cn, label %next, label %fin
+; CHECK-LABEL: next:
+; CHECK-NOT: call void @llvm.assume
+; CHECK: ret float* %arg.load
+; CHECK-LABEL: fin:
+; CHECK: ret float* null
+entry:
+  %buf = alloca float*
+  %arg.load = load float*, float** %arg, align 8
+  %cn = icmp ne float* %arg.load, null
+  br i1 %cn, label %next, label %fin
+next:
+; At this point the above nonnull check ensures that
+; the value %arg.load is nonnull in this block and thus
+; we need not add the assume.
+  store float* %arg.load, float** %buf, align 8
+  %buf.load = load float*, float** %buf, !nonnull !0
+  ret float* %buf.load
+fin:
+  ret float* null
+}
+
+!0 = !{}
diff --git a/test/Transforms/MemCpyOpt/lifetime.ll b/test/Transforms/MemCpyOpt/lifetime.ll
index 6a7e44692daab48e26c1458b33531557950f3068..77b495f2b583994c6ac9feb16d8efc99a9fdfba1 100644
--- a/test/Transforms/MemCpyOpt/lifetime.ll
+++ b/test/Transforms/MemCpyOpt/lifetime.ll
@@ -4,8 +4,8 @@
 ; @llvm.lifetime.start and @llvm.memcpy.
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 define void @_ZN4CordC2EOS_(i8* nocapture dereferenceable(16) %arg1) {
 bb:
@@ -14,11 +14,11 @@ bb:
 ; CHECK: ret void
   %tmp = alloca [8 x i8], align 8
   %tmp5 = bitcast [8 x i8]* %tmp to i8*
-  call void @llvm.lifetime.start(i64 16, i8* %tmp5)
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %tmp5)
   %tmp10 = getelementptr inbounds i8, i8* %tmp5, i64 7
   store i8 0, i8* %tmp10, align 1
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %arg1, i8* %tmp5, i64 16, i32 8, i1 false)
-  call void @llvm.lifetime.end(i64 16, i8* %tmp5)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* %tmp5)
   ret void
 }
 
diff --git a/test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll b/test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll
index e3e57f09d88f60ffb3405f67e44f76b5af92b592..e21dc87cb6a0f48a0f1b10457aaf9898247787c9 100644
--- a/test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll
+++ b/test/Transforms/MemCpyOpt/memcpy-to-memset-with-lifetimes.ll
@@ -7,11 +7,11 @@ define void @foo([8 x i64]* noalias nocapture sret dereferenceable(64) %sret) {
 entry-block:
   %a = alloca [8 x i64], align 8
   %a.cast = bitcast [8 x i64]* %a to i8*
-  call void @llvm.lifetime.start(i64 64, i8* %a.cast)
+  call void @llvm.lifetime.start.p0i8(i64 64, i8* %a.cast)
   call void @llvm.memset.p0i8.i64(i8* %a.cast, i8 0, i64 64, i32 8, i1 false)
   %sret.cast = bitcast [8 x i64]* %sret to i8*
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %sret.cast, i8* %a.cast, i64 64, i32 8, i1 false)
-  call void @llvm.lifetime.end(i64 64, i8* %a.cast)
+  call void @llvm.lifetime.end.p0i8(i64 64, i8* %a.cast)
   ret void
 
 ; CHECK-LABEL: @foo(
@@ -25,14 +25,14 @@ define void @bar([8 x i64]* noalias nocapture sret dereferenceable(64) %sret, [8
 entry-block:
   %a = alloca [8 x i64], align 8
   %a.cast = bitcast [8 x i64]* %a to i8*
-  call void @llvm.lifetime.start(i64 64, i8* %a.cast)
+  call void @llvm.lifetime.start.p0i8(i64 64, i8* %a.cast)
   call void @llvm.memset.p0i8.i64(i8* %a.cast, i8 0, i64 64, i32 8, i1 false)
   %sret.cast = bitcast [8 x i64]* %sret to i8*
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %sret.cast, i8* %a.cast, i64 64, i32 8, i1 false)
   call void @llvm.memset.p0i8.i64(i8* %a.cast, i8 42, i64 32, i32 8, i1 false)
   %out.cast = bitcast [8 x i64]* %out to i8*
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out.cast, i8* %a.cast, i64 64, i32 8, i1 false)
-  call void @llvm.lifetime.end(i64 64, i8* %a.cast)
+  call void @llvm.lifetime.end.p0i8(i64 64, i8* %a.cast)
   ret void
 
 ; CHECK-LABEL: @bar(
@@ -48,8 +48,8 @@ entry-block:
 ; CHECK: ret void
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) nounwind
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
diff --git a/test/Transforms/MemCpyOpt/memcpy-undef.ll b/test/Transforms/MemCpyOpt/memcpy-undef.ll
index c75d020c0786f4755601d960523ba227de3174dd..06a41829a4ee9cbf7bea496951ca7d0d9a0819cb 100644
--- a/test/Transforms/MemCpyOpt/memcpy-undef.ll
+++ b/test/Transforms/MemCpyOpt/memcpy-undef.ll
@@ -22,7 +22,7 @@ define i32 @test1(%struct.foo* nocapture %foobie) nounwind noinline ssp uwtable
 }
 
 define void @test2(i8* sret noalias nocapture %out, i8* %in) nounwind noinline ssp uwtable {
-  call void @llvm.lifetime.start(i64 8, i8* %in)
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* %in)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 8, i32 1, i1 false)
   ret void
 
@@ -32,7 +32,7 @@ define void @test2(i8* sret noalias nocapture %out, i8* %in) nounwind noinline s
 }
 
 define void @test3(i8* sret noalias nocapture %out, i8* %in) nounwind noinline ssp uwtable {
-  call void @llvm.lifetime.start(i64 4, i8* %in)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %in)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %out, i8* %in, i64 8, i32 1, i1 false)
   ret void
 
@@ -43,4 +43,4 @@ define void @test3(i8* sret noalias nocapture %out, i8* %in) nounwind noinline s
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
diff --git a/test/Transforms/MemCpyOpt/memcpy.ll b/test/Transforms/MemCpyOpt/memcpy.ll
index 6181543cfc63a7c2b6b121638ebe86e5942adda4..e4d50f7157deb3ffd43061ed0cee5a2a83076a44 100644
--- a/test/Transforms/MemCpyOpt/memcpy.ll
+++ b/test/Transforms/MemCpyOpt/memcpy.ll
@@ -76,8 +76,21 @@ define void @test4(i8 *%P) {
 ; CHECK-NEXT: call void @test4a(
 }
 
+; Make sure we don't remove the memcpy if the source address space doesn't match the byval argument
+define void @test4_addrspace(i8 addrspace(1)* %P) {
+  %A = alloca %1
+  %a = bitcast %1* %A to i8*
+  call void @llvm.memcpy.p0i8.p1i8.i64(i8* %a, i8 addrspace(1)* %P, i64 8, i32 4, i1 false)
+  call void @test4a(i8* align 1 byval %a)
+  ret void
+; CHECK-LABEL: @test4_addrspace(
+; CHECK: call void @llvm.memcpy.p0i8.p1i8.i64(
+; CHECK-NEXT: call void @test4a(
+}
+
 declare void @test4a(i8* align 1 byval)
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind
 declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind
 
 %struct.S = type { i128, [4 x i8]}
@@ -202,6 +215,21 @@ define void @test10(%opaque* noalias nocapture sret %x, i32 %y) {
   ret void
 }
 
+; Don't create new addrspacecasts when we don't know they're safe for the target.
+define void @test11([20 x i32] addrspace(1)* nocapture dereferenceable(80) %P) {
+  %A = alloca [20 x i32], align 4
+  %a = bitcast [20 x i32]* %A to i8*
+  %b = bitcast [20 x i32] addrspace(1)* %P to i8 addrspace(1)*
+  call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 80, i32 4, i1 false)
+  call void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* %b, i8* %a, i64 80, i32 4, i1 false)
+  ret void
+; CHECK-LABEL: @test11(
+; CHECK-NOT: addrspacecast
+}
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+declare void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocapture, i64, i32, i1) nounwind
+
 declare void @f1(%struct.big* nocapture sret)
 declare void @f2(%struct.big*)
 
diff --git a/test/Transforms/MemCpyOpt/pr29105.ll b/test/Transforms/MemCpyOpt/pr29105.ll
index 0d3778372266c1c7442b86da180fccc36c12f925..03b176c4d2451b525ae11d69cd9ad6bfb49201cc 100644
--- a/test/Transforms/MemCpyOpt/pr29105.ll
+++ b/test/Transforms/MemCpyOpt/pr29105.ll
@@ -11,25 +11,25 @@ entry-block:
   %0 = bitcast [2048 x i64]* %tmp0 to i8*
   %tmp2 = alloca %Foo, align 8
   %x.sroa.0.0..sroa_cast6 = bitcast [2048 x i64]* %x.sroa.0 to i8*
-  call void @llvm.lifetime.start(i64 16384, i8* %x.sroa.0.0..sroa_cast6)
-  call void @llvm.lifetime.start(i64 16384, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 16384, i8* %x.sroa.0.0..sroa_cast6)
+  call void @llvm.lifetime.start.p0i8(i64 16384, i8* %0)
   call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 16384, i32 8, i1 false)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %x.sroa.0.0..sroa_cast6, i8* %0, i64 16384, i32 8, i1 false)
-  call void @llvm.lifetime.end(i64 16384, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 16384, i8* %0)
   %1 = bitcast %Foo* %tmp2 to i8*
-  call void @llvm.lifetime.start(i64 16384, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 16384, i8* %1)
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %1, i8* %x.sroa.0.0..sroa_cast6, i64 16384, i32 8, i1 false)
   call void @bar(%Foo* noalias nocapture nonnull dereferenceable(16384) %tmp2)
-  call void @llvm.lifetime.end(i64 16384, i8* %1)
-  call void @llvm.lifetime.end(i64 16384, i8* %x.sroa.0.0..sroa_cast6)
+  call void @llvm.lifetime.end.p0i8(i64 16384, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 16384, i8* %x.sroa.0.0..sroa_cast6)
   ret void
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 
 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) #1
 
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 declare void @bar(%Foo* noalias nocapture readonly dereferenceable(16384)) unnamed_addr #0
 
diff --git a/test/Transforms/MetaRenamer/metarenamer.ll b/test/Transforms/MetaRenamer/metarenamer.ll
index 213fbe3bbff737eb4b313a5e965e7960b191dda2..7b527ae54cb1f046a5c8e22f91551fcce141c929 100644
--- a/test/Transforms/MetaRenamer/metarenamer.ll
+++ b/test/Transforms/MetaRenamer/metarenamer.ll
@@ -96,3 +96,18 @@ define i32 @varargs_func_6_xxx(i32 %arg_1_xxx, i32 %arg_2_xxx, ...) nounwind uwt
   store i32 %arg_2_xxx, i32* %2, align 4
   ret i32 6
 }
+
+declare noalias i8* @malloc(i32)
+declare void @free(i8* nocapture)
+
+define void @dont_rename_lib_funcs() {
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP:%.*]] = call i8* @malloc(i32 23)
+; CHECK-NEXT:    call void @free(i8* [[TMP]])
+; CHECK-NEXT:    ret void
+;
+  %x = call i8* @malloc(i32 23)
+  call void @free(i8* %x)
+  ret void
+}
diff --git a/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll b/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll
index 402de50c72cfcfe591acb23420501ba51204ebba..27a798bf7dd12b29acc58467f03309b021c94dbc 100644
--- a/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll
+++ b/test/Transforms/NewGVN/2007-07-26-PhiErasure.ll
@@ -1,4 +1,4 @@
-; XFAIL: *
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -newgvn -S | FileCheck %s
 
 	%struct..0anon = type { i32 }
@@ -9,37 +9,34 @@
 @n_spills = external global i32		; <i32*> [#uses=2]
 
 define i32 @reload(%struct.rtx_def* %first, i32 %global, %struct.FILE* %dumpfile) {
+; CHECK-LABEL: @reload(
+; CHECK-NEXT:  cond_next2835.1:
+; CHECK-NEXT:    br label [[BB2928:%.*]]
+; CHECK:       bb2928:
+; CHECK-NEXT:    br i1 false, label [[COND_NEXT2943:%.*]], label [[COND_TRUE2935:%.*]]
+; CHECK:       cond_true2935:
+; CHECK-NEXT:    br label [[COND_NEXT2943]]
+; CHECK:       cond_next2943:
+; CHECK-NEXT:    br i1 false, label [[BB2982_PREHEADER:%.*]], label [[BB2928]]
+; CHECK:       bb2982.preheader:
+; CHECK-NEXT:    store i8 undef, i8* null
+; CHECK-NEXT:    ret i32 undef
+;
 cond_next2835.1:		; preds = %cond_next2861
-	%tmp2922 = load i32, i32* @n_spills, align 4		; <i32> [#uses=0]
-	br label %bb2928
+  %tmp2922 = load i32, i32* @n_spills, align 4		; <i32> [#uses=0]
+  br label %bb2928
 
 bb2928:		; preds = %cond_next2835.1, %cond_next2943
-	br i1 false, label %cond_next2943, label %cond_true2935
+  br i1 false, label %cond_next2943, label %cond_true2935
 
 cond_true2935:		; preds = %bb2928
-	br label %cond_next2943
+  br label %cond_next2943
 
 cond_next2943:		; preds = %cond_true2935, %bb2928
-	br i1 false, label %bb2982.preheader, label %bb2928
+  br i1 false, label %bb2982.preheader, label %bb2928
 
 bb2982.preheader:		; preds = %cond_next2943
-	%tmp298316 = load i32, i32* @n_spills, align 4		; <i32> [#uses=0]
-	ret i32 %tmp298316
+  %tmp298316 = load i32, i32* @n_spills, align 4		; <i32> [#uses=0]
+  ret i32 %tmp298316
 
 }
-
-; CHECK: define i32 @reload(%struct.rtx_def* %first, i32 %global, %struct.FILE* %dumpfile) {
-; CHECK-NEXT: cond_next2835.1:
-; CHECK-NEXT:   br label %bb2928
-; CHECK: bb2928:
-; CHECK-NEXT:   br i1 false, label %bb2928.cond_next2943_crit_edge, label %cond_true2935
-; CHECK: bb2928.cond_next2943_crit_edge:
-; CHECK-NEXT:   br label %cond_next2943
-; CHECK: cond_true2935:
-; CHECK-NEXT:   br label %cond_next2943
-; CHECK: cond_next2943:
-; CHECK-NEXT:   br i1 false, label %bb2982.preheader, label %bb2928
-; CHECK: bb2982.preheader:
-; CHECK-NEXT:   %tmp298316 = load i32, i32* @n_spills, align 4
-; CHECK-NEXT:   ret i32 %tmp298316
-; CHECK-NEXT: }
diff --git a/test/Transforms/NewGVN/2011-07-07-MatchIntrinsicExtract.ll b/test/Transforms/NewGVN/2011-07-07-MatchIntrinsicExtract.ll
index 4b47b06f1657580b50fa91ac8606cecdd870df69..86c80d1d5f21e580dd416bac5168812adf4460a5 100644
--- a/test/Transforms/NewGVN/2011-07-07-MatchIntrinsicExtract.ll
+++ b/test/Transforms/NewGVN/2011-07-07-MatchIntrinsicExtract.ll
@@ -1,4 +1,3 @@
-; XFAIL: *
 ; RUN: opt < %s -newgvn -S | FileCheck %s
 ;
 
@@ -9,7 +8,8 @@ entry:
   %uadd = tail call %0 @llvm.uadd.with.overflow.i64(i64 %a, i64 %b)
   %uadd.0 = extractvalue %0 %uadd, 0
   %add1 = add i64 %a, %b
-  ret i64 %add1
+  %add2 =  add i64 %add1, %uadd.0
+  ret i64 %add2
 }
 
 ; CHECK-LABEL: @test1(
@@ -21,7 +21,8 @@ entry:
   %usub = tail call %0 @llvm.usub.with.overflow.i64(i64 %a, i64 %b)
   %usub.0 = extractvalue %0 %usub, 0
   %sub1 = sub i64 %a, %b
-  ret i64 %sub1
+  %add2 =  add i64 %sub1, %usub.0
+  ret i64 %add2
 }
 
 ; CHECK-LABEL: @test2(
@@ -33,7 +34,8 @@ entry:
   %umul = tail call %0 @llvm.umul.with.overflow.i64(i64 %a, i64 %b)
   %umul.0 = extractvalue %0 %umul, 0
   %mul1 = mul i64 %a, %b
-  ret i64 %mul1
+  %add2 =  add i64 %mul1, %umul.0
+  ret i64 %add2
 }
 
 ; CHECK-LABEL: @test3(
@@ -45,7 +47,8 @@ entry:
   %sadd = tail call %0 @llvm.sadd.with.overflow.i64(i64 %a, i64 %b)
   %sadd.0 = extractvalue %0 %sadd, 0
   %add1 = add i64 %a, %b
-  ret i64 %add1
+  %add2 =  add i64 %add1, %sadd.0
+  ret i64 %add2
 }
 
 ; CHECK-LABEL: @test4(
@@ -57,7 +60,8 @@ entry:
   %ssub = tail call %0 @llvm.ssub.with.overflow.i64(i64 %a, i64 %b)
   %ssub.0 = extractvalue %0 %ssub, 0
   %sub1 = sub i64 %a, %b
-  ret i64 %sub1
+  %add2 =  add i64 %sub1, %ssub.0
+  ret i64 %add2
 }
 
 ; CHECK-LABEL: @test5(
@@ -69,7 +73,8 @@ entry:
   %smul = tail call %0 @llvm.smul.with.overflow.i64(i64 %a, i64 %b)
   %smul.0 = extractvalue %0 %smul, 0
   %mul1 = mul i64 %a, %b
-  ret i64 %mul1
+  %add2 =  add i64 %mul1, %smul.0
+  ret i64 %add2
 }
 
 ; CHECK-LABEL: @test6(
diff --git a/test/Transforms/NewGVN/basic-cyclic-opt.ll b/test/Transforms/NewGVN/basic-cyclic-opt.ll
index 4901a305e5aa6af75df4da26ea44ea82ff6cc787..7830d7ea78a5e96dd41f42e350e9845460d8da74 100644
--- a/test/Transforms/NewGVN/basic-cyclic-opt.ll
+++ b/test/Transforms/NewGVN/basic-cyclic-opt.ll
@@ -228,8 +228,8 @@ bb23:                                             ; preds = %bb4
 }
 
 ;; This is an irreducible test case that will cause a memoryphi node loop
-;; in the two block.
-;; it's equivalent to something like
+;; in the two blocks.
+;; It's equivalent to something like
 ;; *a = 0
 ;; if (<....>) goto loopmiddle
 ;; loopstart:
@@ -245,8 +245,8 @@ bb23:                                             ; preds = %bb4
 ;; Both loads should equal 0, but it requires being
 ;; completely optimistic about MemoryPhis, otherwise
 ;; we will not be able to see through the cycle.
-define i8 @quux(i8* noalias %arg, i8* noalias %arg2) {
-; CHECK-LABEL: @quux(
+define i8 @irreducible_memoryphi(i8* noalias %arg, i8* noalias %arg2) {
+; CHECK-LABEL: @irreducible_memoryphi(
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    store i8 0, i8* [[ARG:%.*]]
 ; CHECK-NEXT:    br i1 undef, label [[BB2:%.*]], label [[BB1:%.*]]
@@ -274,6 +274,40 @@ bb3:                                              ; preds = %bb2
   %tmp3 = add i8 %tmp, %tmp2
   ret i8 %tmp3
 }
+;; This is an irreducible test case that will cause a phi node loop
+;; in the two blocks.
+;;
+;; It should return 0, but it requires being
+;; completely optimistic about phis, otherwise
+;; we will not be able to see through the cycle.
+define i32 @irreducible_phi(i32 %arg) {
+; CHECK-LABEL: @irreducible_phi(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br i1 undef, label [[BB2:%.*]], label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB2]]
+; CHECK:       bb2:
+; CHECK-NEXT:    br i1 undef, label [[BB1]], label [[BB3:%.*]]
+; CHECK:       bb3:
+; CHECK-NEXT:    ret i32 0
+;
+bb:
+  %tmp = add i32 0, %arg
+  br i1 undef, label %bb2, label %bb1
+
+bb1:                                              ; preds = %bb2, %bb
+  %phi1 = phi i32 [%tmp, %bb], [%phi2, %bb2]
+  br label %bb2
+
+bb2:                                              ; preds = %bb1, %bb
+  %phi2 = phi i32 [%tmp, %bb], [%phi1, %bb1]
+  br i1 undef, label %bb1, label %bb3
+
+bb3:                                              ; preds = %bb2
+  ; This should be zero
+  %tmp3 = sub i32 %tmp, %phi2
+  ret i32 %tmp3
+}
 attributes #0 = { nounwind ssp uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
 
 !llvm.ident = !{!0, !0, !0}
diff --git a/test/Transforms/NewGVN/bitcast-of-call.ll b/test/Transforms/NewGVN/bitcast-of-call.ll
index 7b25038275b51a56236d261061dd10f4cfaf739a..2b817fbcd01c151578fc75ea4b6b1b538835428f 100644
--- a/test/Transforms/NewGVN/bitcast-of-call.ll
+++ b/test/Transforms/NewGVN/bitcast-of-call.ll
@@ -1,14 +1,20 @@
-; XFAIL: *
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -newgvn -S | FileCheck %s
 ; PR2213
 
 define i32* @f(i8* %x) {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP:%.*]] = call i8* @m(i32 12)
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i8* [[TMP]] to i32*
+; CHECK-NEXT:    ret i32* [[TMP1]]
+;
 entry:
-        %tmp = call i8* @m( i32 12 )            ; <i8*> [#uses=2]
-        %tmp1 = bitcast i8* %tmp to i32*                ; <i32*> [#uses=0]
-        %tmp2 = bitcast i8* %tmp to i32*                ; <i32*> [#uses=0]
-; CHECK-NOT: %tmp2
-        ret i32* %tmp2
+  %tmp = call i8* @m( i32 12 )            ; <i8*> [#uses=2]
+  %tmp1 = bitcast i8* %tmp to i32*                ; <i32*> [#uses=0]
+  %tmp3 = bitcast i32* %tmp1 to i8*
+  %tmp2 = bitcast i8* %tmp3 to i32*                ; <i32*> [#uses=0]
+  ret i32* %tmp2
 }
 
 declare i8* @m(i32)
diff --git a/test/Transforms/NewGVN/calloc-load-removal.ll b/test/Transforms/NewGVN/calloc-load-removal.ll
index e6870442064bcf1aaf61303858ea17a327228f4f..cdeb971a23e2ffea3272240b4bcfd2d4fc499148 100644
--- a/test/Transforms/NewGVN/calloc-load-removal.ll
+++ b/test/Transforms/NewGVN/calloc-load-removal.ll
@@ -1,4 +1,3 @@
-; XFAIL: *
 ; RUN: opt -S -basicaa -newgvn < %s | FileCheck %s
 ; RUN: opt -S -basicaa -newgvn -disable-simplify-libcalls < %s | FileCheck %s -check-prefix=CHECK_NO_LIBCALLS
 ; Check that loads from calloc are recognized as being zero.
diff --git a/test/Transforms/NewGVN/calls-nonlocal.ll b/test/Transforms/NewGVN/calls-nonlocal.ll
index 292060db812ef4b1d366f4f035f5d63ec99d3544..6e918050d5910dbf5c0ab5923454ef6359f35a90 100644
--- a/test/Transforms/NewGVN/calls-nonlocal.ll
+++ b/test/Transforms/NewGVN/calls-nonlocal.ll
@@ -1,4 +1,6 @@
 ; XFAIL: *
+;; NewGVN zaps the strlens, but currently takes two iterations to evaluate the conditions, because
+;; we prune PredicateInfo, and the icmps only become equivalent after the strlens are zapped.
 ; Two occurrences of strlen should be zapped.
 ; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/Transforms/NewGVN/cond_br2.ll b/test/Transforms/NewGVN/cond_br2.ll
index e511ff7ed514e7b83f5d111a8bf7b6076ed16e24..ff7a76d1469590ccf2ba7558d5bc1a6f2460187c 100644
--- a/test/Transforms/NewGVN/cond_br2.ll
+++ b/test/Transforms/NewGVN/cond_br2.ll
@@ -19,7 +19,7 @@ define void @_Z4testv() #0 personality i8* bitcast (i32 (...)* @__gxx_personalit
 entry:
   %sv = alloca %"class.llvm::SmallVector", align 16
   %0 = bitcast %"class.llvm::SmallVector"* %sv to i8*
-  call void @llvm.lifetime.start(i64 64, i8* %0) #1
+  call void @llvm.lifetime.start.p0i8(i64 64, i8* %0) #1
   %BeginX.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 0
   %FirstEl.i.i.i.i.i.i = getelementptr inbounds %"class.llvm::SmallVector", %"class.llvm::SmallVector"* %sv, i64 0, i32 0, i32 0, i32 0, i32 0, i32 3
   %1 = bitcast %"union.llvm::SmallVectorBase::U"* %FirstEl.i.i.i.i.i.i to i8*
@@ -95,7 +95,7 @@ if.then.i.i.i20:                                  ; preds = %invoke.cont3
   br label %_ZN4llvm11SmallVectorIiLj8EED1Ev.exit21
 
 _ZN4llvm11SmallVectorIiLj8EED1Ev.exit21:          ; preds = %invoke.cont3, %if.then.i.i.i20
-  call void @llvm.lifetime.end(i64 64, i8* %0) #1
+  call void @llvm.lifetime.end.p0i8(i64 64, i8* %0) #1
   ret void
 
 lpad:                                             ; preds = %if.end.i14, %if.end.i, %invoke.cont2
@@ -114,14 +114,14 @@ eh.resume:                                        ; preds = %if.then.i.i.i, %lpa
 }
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 
 declare i32 @__gxx_personality_v0(...)
 
 declare void @_Z1gRN4llvm11SmallVectorIiLj8EEE(%"class.llvm::SmallVector"*) #2
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 declare void @_ZN4llvm15SmallVectorBase8grow_podEmm(%"class.llvm::SmallVectorBase"*, i64, i64) #2
 
diff --git a/test/Transforms/NewGVN/condprop-xfail.ll b/test/Transforms/NewGVN/condprop-xfail.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5c049617f8751e4ccb0ad5784211c98c6f99b751
--- /dev/null
+++ b/test/Transforms/NewGVN/condprop-xfail.ll
@@ -0,0 +1,123 @@
+; XFAIL: *
+; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s
+
+@a = external global i32		; <i32*> [#uses=7]
+
+;; NewGVN takes two passes to get this, because we prune predicateinfo
+; CHECK-LABEL: @test1(
+define i32 @test1() nounwind {
+entry:
+	%0 = load i32, i32* @a, align 4
+	%1 = icmp eq i32 %0, 4
+	br i1 %1, label %bb, label %bb1
+
+bb:		; preds = %entry
+	br label %bb8
+
+bb1:		; preds = %entry
+	%2 = load i32, i32* @a, align 4
+	%3 = icmp eq i32 %2, 5
+	br i1 %3, label %bb2, label %bb3
+
+bb2:		; preds = %bb1
+	br label %bb8
+
+bb3:		; preds = %bb1
+	%4 = load i32, i32* @a, align 4
+	%5 = icmp eq i32 %4, 4
+; CHECK: br i1 false, label %bb4, label %bb5
+	br i1 %5, label %bb4, label %bb5
+
+bb4:		; preds = %bb3
+	%6 = load i32, i32* @a, align 4
+	%7 = add i32 %6, 5
+	br label %bb8
+
+bb5:		; preds = %bb3
+	%8 = load i32, i32* @a, align 4
+	%9 = icmp eq i32 %8, 5
+; CHECK: br i1 false, label %bb6, label %bb7
+	br i1 %9, label %bb6, label %bb7
+
+bb6:		; preds = %bb5
+	%10 = load i32, i32* @a, align 4
+	%11 = add i32 %10, 4
+	br label %bb8
+
+bb7:		; preds = %bb5
+	%12 = load i32, i32* @a, align 4
+	br label %bb8
+
+bb8:		; preds = %bb7, %bb6, %bb4, %bb2, %bb
+	%.0 = phi i32 [ %12, %bb7 ], [ %11, %bb6 ], [ %7, %bb4 ], [ 4, %bb2 ], [ 5, %bb ]
+	br label %return
+
+return:		; preds = %bb8
+	ret i32 %.0
+}
+;; NewGVN takes two passes to get the main part of test6/test8 and test6_fp/test8_fp.
+;; The icmp ne requires an equality table that inserts the inequalities for each
+;; discovered equality while processing.
+; CHECK-LABEL: @test6(
+define i1 @test6(i32 %x, i32 %y) {
+  %cmp2 = icmp ne i32 %x, %y
+  %cmp = icmp eq i32 %x, %y
+  %cmp3 = icmp eq i32 %x, %y
+  br i1 %cmp, label %same, label %different
+
+same:
+; CHECK: ret i1 false
+  ret i1 %cmp2
+
+different:
+; CHECK: ret i1 false
+  ret i1 %cmp3
+}
+
+; CHECK-LABEL: @test6_fp(
+define i1 @test6_fp(float %x, float %y) {
+  %cmp2 = fcmp une float %x, %y
+  %cmp = fcmp oeq float %x, %y
+  %cmp3 = fcmp oeq float  %x, %y
+  br i1 %cmp, label %same, label %different
+
+same:
+; CHECK: ret i1 false
+  ret i1 %cmp2
+
+different:
+; CHECK: ret i1 false
+  ret i1 %cmp3
+}
+; CHECK-LABEL: @test8(
+define i1 @test8(i32 %x, i32 %y) {
+  %cmp2 = icmp sle i32 %x, %y
+  %cmp = icmp sgt i32 %x, %y
+  %cmp3 = icmp sgt i32 %x, %y
+  br i1 %cmp, label %same, label %different
+
+same:
+; CHECK: ret i1 false
+  ret i1 %cmp2
+
+different:
+; CHECK: ret i1 false
+  ret i1 %cmp3
+}
+
+; CHECK-LABEL: @test8_fp(
+define i1 @test8_fp(float %x, float %y) {
+  %cmp2 = fcmp ule float %x, %y
+  %cmp = fcmp ogt float %x, %y
+  %cmp3 = fcmp ogt float %x, %y
+  br i1 %cmp, label %same, label %different
+
+same:
+; CHECK: ret i1 false
+  ret i1 %cmp2
+
+different:
+; CHECK: ret i1 false
+  ret i1 %cmp3
+}
+
diff --git a/test/Transforms/NewGVN/condprop.ll b/test/Transforms/NewGVN/condprop.ll
index 898690dec19930ab19c766fe8af6d84632e5e42f..6eb9bb6b26194f7b63ef95e0a31dca6594e33c61 100644
--- a/test/Transforms/NewGVN/condprop.ll
+++ b/test/Transforms/NewGVN/condprop.ll
@@ -1,266 +1,211 @@
-; XFAIL: *
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s
 
-@a = external global i32		; <i32*> [#uses=7]
-
-; CHECK-LABEL: @test1(
-define i32 @test1() nounwind {
-entry:
-	%0 = load i32, i32* @a, align 4
-	%1 = icmp eq i32 %0, 4
-	br i1 %1, label %bb, label %bb1
-
-bb:		; preds = %entry
-	br label %bb8
-
-bb1:		; preds = %entry
-	%2 = load i32, i32* @a, align 4
-	%3 = icmp eq i32 %2, 5
-	br i1 %3, label %bb2, label %bb3
-
-bb2:		; preds = %bb1
-	br label %bb8
-
-bb3:		; preds = %bb1
-	%4 = load i32, i32* @a, align 4
-	%5 = icmp eq i32 %4, 4
-; CHECK: br i1 false, label %bb4, label %bb5
-	br i1 %5, label %bb4, label %bb5
-
-bb4:		; preds = %bb3
-	%6 = load i32, i32* @a, align 4
-	%7 = add i32 %6, 5
-	br label %bb8
-
-bb5:		; preds = %bb3
-	%8 = load i32, i32* @a, align 4
-	%9 = icmp eq i32 %8, 5
-; CHECK: br i1 false, label %bb6, label %bb7
-	br i1 %9, label %bb6, label %bb7
-
-bb6:		; preds = %bb5
-	%10 = load i32, i32* @a, align 4
-	%11 = add i32 %10, 4
-	br label %bb8
-
-bb7:		; preds = %bb5
-	%12 = load i32, i32* @a, align 4
-	br label %bb8
-
-bb8:		; preds = %bb7, %bb6, %bb4, %bb2, %bb
-	%.0 = phi i32 [ %12, %bb7 ], [ %11, %bb6 ], [ %7, %bb4 ], [ 4, %bb2 ], [ 5, %bb ]
-	br label %return
-
-return:		; preds = %bb8
-	ret i32 %.0
-}
 
 declare void @foo(i1)
 declare void @bar(i32)
 
-; CHECK-LABEL: @test3(
 define void @test3(i32 %x, i32 %y) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
+; CHECK-NEXT:    [[Z:%.*]] = and i1 [[XZ]], [[YZ]]
+; CHECK-NEXT:    br i1 [[Z]], label [[BOTH_ZERO:%.*]], label [[NOPE:%.*]]
+; CHECK:       both_zero:
+; CHECK-NEXT:    call void @foo(i1 true)
+; CHECK-NEXT:    call void @foo(i1 true)
+; CHECK-NEXT:    call void @bar(i32 0)
+; CHECK-NEXT:    call void @bar(i32 0)
+; CHECK-NEXT:    ret void
+; CHECK:       nope:
+; CHECK-NEXT:    call void @foo(i1 false)
+; CHECK-NEXT:    ret void
+;
   %xz = icmp eq i32 %x, 0
   %yz = icmp eq i32 %y, 0
   %z = and i1 %xz, %yz
   br i1 %z, label %both_zero, label %nope
 both_zero:
   call void @foo(i1 %xz)
-; CHECK: call void @foo(i1 true)
   call void @foo(i1 %yz)
-; CHECK: call void @foo(i1 true)
   call void @bar(i32 %x)
-; CHECK: call void @bar(i32 0)
   call void @bar(i32 %y)
-; CHECK: call void @bar(i32 0)
   ret void
 nope:
   call void @foo(i1 %z)
-; CHECK: call void @foo(i1 false)
   ret void
 }
-
-; CHECK-LABEL: @test4(
 define void @test4(i1 %b, i32 %x) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    br i1 [[B:%.*]], label [[SW:%.*]], label [[CASE3:%.*]]
+; CHECK:       sw:
+; CHECK-NEXT:    switch i32 [[X:%.*]], label [[DEFAULT:%.*]] [
+; CHECK-NEXT:    i32 0, label [[CASE0:%.*]]
+; CHECK-NEXT:    i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:    i32 2, label [[CASE0]]
+; CHECK-NEXT:    i32 3, label [[CASE3]]
+; CHECK-NEXT:    i32 4, label [[DEFAULT]]
+; CHECK-NEXT:    ]
+; CHECK:       default:
+; CHECK-NEXT:    call void @bar(i32 [[X]])
+; CHECK-NEXT:    ret void
+; CHECK:       case0:
+; CHECK-NEXT:    call void @bar(i32 [[X]])
+; CHECK-NEXT:    ret void
+; CHECK:       case1:
+; CHECK-NEXT:    call void @bar(i32 1)
+; CHECK-NEXT:    ret void
+; CHECK:       case3:
+; CHECK-NEXT:    call void @bar(i32 [[X]])
+; CHECK-NEXT:    ret void
+;
   br i1 %b, label %sw, label %case3
 sw:
   switch i32 %x, label %default [
-    i32 0, label %case0
-    i32 1, label %case1
-    i32 2, label %case0
-    i32 3, label %case3
-    i32 4, label %default
+  i32 0, label %case0
+  i32 1, label %case1
+  i32 2, label %case0
+  i32 3, label %case3
+  i32 4, label %default
   ]
 default:
-; CHECK: default:
   call void @bar(i32 %x)
-; CHECK: call void @bar(i32 %x)
   ret void
 case0:
-; CHECK: case0:
   call void @bar(i32 %x)
-; CHECK: call void @bar(i32 %x)
   ret void
 case1:
-; CHECK: case1:
   call void @bar(i32 %x)
-; CHECK: call void @bar(i32 1)
   ret void
 case3:
-; CHECK: case3:
   call void @bar(i32 %x)
-; CHECK: call void @bar(i32 %x)
   ret void
 }
 
-; CHECK-LABEL: @test5(
 define i1 @test5(i32 %x, i32 %y) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK:       same:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       different:
+; CHECK-NEXT:    ret i1 false
+;
   %cmp = icmp eq i32 %x, %y
   br i1 %cmp, label %same, label %different
 
 same:
   %cmp2 = icmp ne i32 %x, %y
-; CHECK: ret i1 false
   ret i1 %cmp2
 
 different:
   %cmp3 = icmp eq i32 %x, %y
-; CHECK: ret i1 false
   ret i1 %cmp3
 }
 
-; CHECK-LABEL: @test6(
-define i1 @test6(i32 %x, i32 %y) {
-  %cmp2 = icmp ne i32 %x, %y
-  %cmp = icmp eq i32 %x, %y
-  %cmp3 = icmp eq i32 %x, %y
-  br i1 %cmp, label %same, label %different
-
-same:
-; CHECK: ret i1 false
-  ret i1 %cmp2
 
-different:
-; CHECK: ret i1 false
-  ret i1 %cmp3
-}
-
-; CHECK-LABEL: @test6_fp(
-define i1 @test6_fp(float %x, float %y) {
-  %cmp2 = fcmp une float %x, %y
-  %cmp = fcmp oeq float %x, %y
-  %cmp3 = fcmp oeq float  %x, %y
-  br i1 %cmp, label %same, label %different
-
-same:
-; CHECK: ret i1 false
-  ret i1 %cmp2
-
-different:
-; CHECK: ret i1 false
-  ret i1 %cmp3
-}
-
-; CHECK-LABEL: @test7(
 define i1 @test7(i32 %x, i32 %y) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK:       same:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       different:
+; CHECK-NEXT:    ret i1 false
+;
   %cmp = icmp sgt i32 %x, %y
   br i1 %cmp, label %same, label %different
 
 same:
   %cmp2 = icmp sle i32 %x, %y
-; CHECK: ret i1 false
   ret i1 %cmp2
 
 different:
   %cmp3 = icmp sgt i32 %x, %y
-; CHECK: ret i1 false
   ret i1 %cmp3
 }
 
-; CHECK-LABEL: @test7_fp(
 define i1 @test7_fp(float %x, float %y) {
+; CHECK-LABEL: @test7_fp(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK:       same:
+; CHECK-NEXT:    ret i1 false
+; CHECK:       different:
+; CHECK-NEXT:    ret i1 false
+;
   %cmp = fcmp ogt float %x, %y
   br i1 %cmp, label %same, label %different
 
 same:
   %cmp2 = fcmp ule float %x, %y
-; CHECK: ret i1 false
   ret i1 %cmp2
 
 different:
   %cmp3 = fcmp ogt float %x, %y
-; CHECK: ret i1 false
-  ret i1 %cmp3
-}
-
-; CHECK-LABEL: @test8(
-define i1 @test8(i32 %x, i32 %y) {
-  %cmp2 = icmp sle i32 %x, %y
-  %cmp = icmp sgt i32 %x, %y
-  %cmp3 = icmp sgt i32 %x, %y
-  br i1 %cmp, label %same, label %different
-
-same:
-; CHECK: ret i1 false
-  ret i1 %cmp2
-
-different:
-; CHECK: ret i1 false
-  ret i1 %cmp3
-}
-
-; CHECK-LABEL: @test8_fp(
-define i1 @test8_fp(float %x, float %y) {
-  %cmp2 = fcmp ule float %x, %y
-  %cmp = fcmp ogt float %x, %y
-  %cmp3 = fcmp ogt float %x, %y
-  br i1 %cmp, label %same, label %different
-
-same:
-; CHECK: ret i1 false
-  ret i1 %cmp2
-
-different:
-; CHECK: ret i1 false
   ret i1 %cmp3
 }
 
 ; PR1768
-; CHECK-LABEL: @test9(
 define i32 @test9(i32 %i, i32 %j) {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]]
+; CHECK:       cond_true:
+; CHECK-NEXT:    ret i32 0
+; CHECK:       ret:
+; CHECK-NEXT:    ret i32 5
+;
   %cmp = icmp eq i32 %i, %j
   br i1 %cmp, label %cond_true, label %ret
 
 cond_true:
   %diff = sub i32 %i, %j
   ret i32 %diff
-; CHECK: ret i32 0
 
 ret:
   ret i32 5
-; CHECK: ret i32 5
 }
 
 ; PR1768
-; CHECK-LABEL: @test10(
 define i32 @test10(i32 %j, i32 %i) {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]]
+; CHECK:       cond_true:
+; CHECK-NEXT:    ret i32 0
+; CHECK:       ret:
+; CHECK-NEXT:    ret i32 5
+;
   %cmp = icmp eq i32 %i, %j
   br i1 %cmp, label %cond_true, label %ret
 
 cond_true:
   %diff = sub i32 %i, %j
   ret i32 %diff
-; CHECK: ret i32 0
 
 ret:
   ret i32 5
-; CHECK: ret i32 5
 }
 
 declare i32 @yogibar()
 
-; CHECK-LABEL: @test11(
 define i32 @test11(i32 %x) {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:    [[V0:%.*]] = call i32 @yogibar()
+; CHECK-NEXT:    [[V1:%.*]] = call i32 @yogibar()
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[V0]], [[V1]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[NEXT:%.*]]
+; CHECK:       cond_true:
+; CHECK-NEXT:    ret i32 [[V0]]
+; CHECK:       next:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[X:%.*]], [[V0]]
+; CHECK-NEXT:    br i1 [[CMP2]], label [[COND_TRUE2:%.*]], label [[NEXT2:%.*]]
+; CHECK:       cond_true2:
+; CHECK-NEXT:    ret i32 [[X]]
+; CHECK:       next2:
+; CHECK-NEXT:    ret i32 0
+;
   %v0 = call i32 @yogibar()
   %v1 = call i32 @yogibar()
   %cmp = icmp eq i32 %v0, %v1
@@ -268,7 +213,6 @@ define i32 @test11(i32 %x) {
 
 cond_true:
   ret i32 %v1
-; CHECK: ret i32 %v0
 
 next:
   %cmp2 = icmp eq i32 %x, %v0
@@ -276,14 +220,23 @@ next:
 
 cond_true2:
   ret i32 %v0
-; CHECK: ret i32 %x
 
 next2:
   ret i32 0
 }
 
-; CHECK-LABEL: @test12(
 define i32 @test12(i32 %x) {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+; CHECK:       cond_true:
+; CHECK-NEXT:    br label [[RET:%.*]]
+; CHECK:       cond_false:
+; CHECK-NEXT:    br label [[RET]]
+; CHECK:       ret:
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ 0, [[COND_TRUE]] ], [ [[X]], [[COND_FALSE]] ]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
   %cmp = icmp eq i32 %x, 0
   br i1 %cmp, label %cond_true, label %cond_false
 
@@ -295,6 +248,5 @@ cond_false:
 
 ret:
   %res = phi i32 [ %x, %cond_true ], [ %x, %cond_false ]
-; CHECK: %res = phi i32 [ 0, %cond_true ], [ %x, %cond_false ]
   ret i32 %res
 }
diff --git a/test/Transforms/NewGVN/debugloc.ll b/test/Transforms/NewGVN/debugloc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..55597a078bbc062776e07f7eb6bab78e54b61bb6
--- /dev/null
+++ b/test/Transforms/NewGVN/debugloc.ll
@@ -0,0 +1,78 @@
+; XFAIL: *
+; RUN: opt < %s -newgvn -S | FileCheck %s
+; CHECK: {{^}}for.body:
+; CHECK-NEXT: [[VREG1:%[^ ]+]] = phi{{.*}}[[VREG2:%[^ ]+]],{{.*}}%.sink,
+; CHECK-NOT: !dbg
+; CHECK-SAME: {{$}}
+; CHECK: {{^}}for.inc:
+; CHECK-NEXT: [[VREG2]] = phi{{.*}}%inc,{{.*}}[[VREG1]]
+
+target triple = "x86_64-unknown-linux-gnu"
+
+@g = external local_unnamed_addr global i32, align 4
+
+; Function Attrs: nounwind uwtable
+define void @foo(i32 %x, i32 %y, i32 %z) local_unnamed_addr #0 !dbg !4 {
+entry:
+  %not.tobool = icmp eq i32 %x, 0, !dbg !8
+  %.sink = zext i1 %not.tobool to i32, !dbg !8
+  store i32 %.sink, i32* @g, align 4, !tbaa !9
+  %cmp8 = icmp sgt i32 %y, 0, !dbg !13
+  br i1 %cmp8, label %for.body.preheader, label %for.end, !dbg !17
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body, !dbg !19
+
+for.body:                                         ; preds = %for.body.preheader, %for.inc
+  %i.09 = phi i32 [ %inc4, %for.inc ], [ 0, %for.body.preheader ]
+  %cmp1 = icmp sgt i32 %i.09, %z, !dbg !19
+  br i1 %cmp1, label %if.then2, label %for.inc, !dbg !21
+
+if.then2:                                         ; preds = %for.body
+  %0 = load i32, i32* @g, align 4, !dbg !22, !tbaa !9
+  %inc = add nsw i32 %0, 1, !dbg !22
+  store i32 %inc, i32* @g, align 4, !dbg !22, !tbaa !9
+  br label %for.inc, !dbg !23
+
+for.inc:                                          ; preds = %for.body, %if.then2
+  %inc4 = add nuw nsw i32 %i.09, 1, !dbg !24
+  %exitcond = icmp ne i32 %inc4, %y, !dbg !13
+  br i1 %exitcond, label %for.body, label %for.end.loopexit, !dbg !17
+
+for.end.loopexit:                                 ; preds = %for.inc
+  br label %for.end, !dbg !26
+
+for.end:                                          ; preds = %for.end.loopexit, %entry
+  ret void, !dbg !26
+}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1)
+!1 = !DIFile(filename: "foo.c", directory: "b/")
+!2 = !{i32 2, !"Dwarf Version", i32 4}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 3, type: !5, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0)
+!5 = !DISubroutineType(types: !6)
+!6 = !{null, !7, !7, !7}
+!7 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!8 = !DILocation(line: 4, column: 7, scope: !4)
+!9 = !{!10, !10, i64 0}
+!10 = !{!"int", !11, i64 0}
+!11 = !{!"omnipotent char", !12, i64 0}
+!12 = !{!"Simple C/C++ TBAA"}
+!13 = !DILocation(line: 10, column: 13, scope: !14)
+!14 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 1)
+!15 = distinct !DILexicalBlock(scope: !16, file: !1, line: 10, column: 3)
+!16 = distinct !DILexicalBlock(scope: !4, file: !1, line: 10, column: 3)
+!17 = !DILocation(line: 10, column: 3, scope: !18)
+!18 = !DILexicalBlockFile(scope: !16, file: !1, discriminator: 1)
+!19 = !DILocation(line: 11, column: 11, scope: !20)
+!20 = distinct !DILexicalBlock(scope: !15, file: !1, line: 11, column: 9)
+!21 = !DILocation(line: 11, column: 9, scope: !15)
+!22 = !DILocation(line: 12, column: 8, scope: !20)
+!23 = !DILocation(line: 12, column: 7, scope: !20)
+!24 = !DILocation(line: 10, column: 20, scope: !25)
+!25 = !DILexicalBlockFile(scope: !15, file: !1, discriminator: 2)
+!26 = !DILocation(line: 13, column: 1, scope: !4)
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2, !3}
diff --git a/test/Transforms/NewGVN/edge.ll b/test/Transforms/NewGVN/edge.ll
index 2d453bda5a4ac23801d77604075170e58d33c20f..a8afc140e21869247876b548cf93521ccc84013a 100644
--- a/test/Transforms/NewGVN/edge.ll
+++ b/test/Transforms/NewGVN/edge.ll
@@ -1,4 +1,3 @@
-; XFAIL: *
 ; RUN: opt -newgvn -S < %s | FileCheck %s
 
 define i32 @f1(i32 %x) {
diff --git a/test/Transforms/NewGVN/flags.ll b/test/Transforms/NewGVN/flags.ll
index d03edd6776c9dcd28c0a4242f5da6bfa0a86817f..e849ae2afb647c3adb1b700dcfff87d782a833d7 100644
--- a/test/Transforms/NewGVN/flags.ll
+++ b/test/Transforms/NewGVN/flags.ll
@@ -1,4 +1,3 @@
-; XFAIL: *
 ; RUN: opt -newgvn -S < %s | FileCheck %s
 
 declare void @use(i1)
diff --git a/test/Transforms/NewGVN/fold-const-expr.ll b/test/Transforms/NewGVN/fold-const-expr.ll
index 20b74277b1acf5a46f7f50838fbef794ab663541..acd7c8df2530142aed3c21e0e16a7ec8d72bc2e8 100644
--- a/test/Transforms/NewGVN/fold-const-expr.ll
+++ b/test/Transforms/NewGVN/fold-const-expr.ll
@@ -1,11 +1,10 @@
-; XFAIL: *
 ; GVN failed to do constant expression folding and expanded
 ; them unfolded in many places, producing exponentially large const
 ; expressions. As a result, the compilation never fisished.
 ; This test checks that we are folding constant expression
 ; PR 28418
 ; RUN: opt -newgvn -S < %s | FileCheck %s
-
+;; NewGVN fails this due to not having load coercion
 %2 = type { i32, i32, i32, i32, i32 }
 define i32 @_Z16vector3util_mainv(i32 %x, i32 %y)  {
   %tmp1 = alloca %2, align 4
diff --git a/test/Transforms/NewGVN/lifetime-simple.ll b/test/Transforms/NewGVN/lifetime-simple.ll
index 63e361c49eb9bd73c0e879ae8c6314364101ecbb..382c7da2b3fba24c6346c932480a11eaf48b9e50 100644
--- a/test/Transforms/NewGVN/lifetime-simple.ll
+++ b/test/Transforms/NewGVN/lifetime-simple.ll
@@ -8,13 +8,13 @@ define i8 @test(i8* %P) nounwind {
 ; CHECK-NOT: load
 ; CHECK: lifetime.end
 entry:
-  call void @llvm.lifetime.start(i64 32, i8* %P)
+  call void @llvm.lifetime.start.p0i8(i64 32, i8* %P)
   %0 = load i8, i8* %P
   store i8 1, i8* %P
-  call void @llvm.lifetime.end(i64 32, i8* %P)
+  call void @llvm.lifetime.end.p0i8(i64 32, i8* %P)
   %1 = load i8, i8* %P
   ret i8 %1
 }
 
-declare void @llvm.lifetime.start(i64 %S, i8* nocapture %P) readonly
-declare void @llvm.lifetime.end(i64 %S, i8* nocapture %P)
+declare void @llvm.lifetime.start.p0i8(i64 %S, i8* nocapture %P) readonly
+declare void @llvm.lifetime.end.p0i8(i64 %S, i8* nocapture %P)
diff --git a/test/Transforms/NewGVN/load-constant-mem.ll b/test/Transforms/NewGVN/load-constant-mem.ll
index 215258b934c0df3a4c83998a6101eee1aae7e0ea..4c1624e09f600ce0e0c418f33c1309c199fcc727 100644
--- a/test/Transforms/NewGVN/load-constant-mem.ll
+++ b/test/Transforms/NewGVN/load-constant-mem.ll
@@ -1,19 +1,21 @@
-; RUN: opt < %s -basicaa -newgvn -instcombine -S | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s
 ; PR4189
 @G = external constant [4 x i32]
 
 define i32 @test(i8* %p, i32 %i) nounwind {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[P:%.*]] = getelementptr [4 x i32], [4 x i32]* @G, i32 0, i32 [[I:%.*]]
+; CHECK-NEXT:    store i8 4, i8* [[P:%.*]]
+; CHECK-NEXT:    ret i32 0
+;
 entry:
-	%P = getelementptr [4 x i32], [4 x i32]* @G, i32 0, i32 %i
-	%A = load i32, i32* %P
-	store i8 4, i8* %p
-	%B = load i32, i32* %P
-	%C = sub i32 %A, %B
-	ret i32 %C
+  %P = getelementptr [4 x i32], [4 x i32]* @G, i32 0, i32 %i
+  %A = load i32, i32* %P
+  store i8 4, i8* %p
+  %B = load i32, i32* %P
+  %C = sub i32 %A, %B
+  ret i32 %C
 }
 
-; CHECK: define i32 @test(i8* %p, i32 %i) #0 {
-; CHECK-NEXT: entry:
-; CHECK-NEXT:   store i8 4, i8* %p, align 1
-; CHECK-NEXT:   ret i32 0
-; CHECK-NEXT: }
diff --git a/test/Transforms/NewGVN/malloc-load-removal.ll b/test/Transforms/NewGVN/malloc-load-removal.ll
index c91b6e17f79df7eacc6276bbe1584f916235d99a..72f4839a5545dd4020902cc8a75bbf7b2538fdd6 100644
--- a/test/Transforms/NewGVN/malloc-load-removal.ll
+++ b/test/Transforms/NewGVN/malloc-load-removal.ll
@@ -1,4 +1,3 @@
-; XFAIL: *
 ; RUN: opt -S -basicaa -newgvn < %s | FileCheck %s
 ; RUN: opt -S -basicaa -newgvn -disable-simplify-libcalls < %s | FileCheck %s -check-prefix=CHECK_NO_LIBCALLS
 ; PR13694
diff --git a/test/Transforms/NewGVN/phi-edge-handling.ll b/test/Transforms/NewGVN/phi-edge-handling.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6451006a69490d242aec7c1679a97cd92d1ea2c5
--- /dev/null
+++ b/test/Transforms/NewGVN/phi-edge-handling.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -newgvn -S | FileCheck %s
+
+
+;; Block 6 is reachable, but edge 6->4 is not
+;; This means the phi value is undef, not 0
+; Function Attrs: ssp uwtable
+define i16 @hoge() local_unnamed_addr #0 align 2 {
+; CHECK-LABEL: @hoge(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    switch i8 undef, label [[BB7:%.*]] [
+; CHECK-NEXT:    i8 0, label [[BB1:%.*]]
+; CHECK-NEXT:    i8 12, label [[BB2:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB6:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    br label [[BB4:%.*]]
+; CHECK:       bb3:
+; CHECK-NEXT:    unreachable
+; CHECK:       bb4:
+; CHECK-NEXT:    ret i16 undef
+; CHECK:       bb6:
+; CHECK-NEXT:    br i1 true, label [[BB3:%.*]], label [[BB4]], !llvm.loop !1
+; CHECK:       bb7:
+; CHECK-NEXT:    unreachable
+;
+bb:
+  switch i8 undef, label %bb7 [
+  i8 0, label %bb1
+  i8 12, label %bb2
+  ]
+
+bb1:                                              ; preds = %bb
+  br label %bb6
+
+bb2:                                              ; preds = %bb
+  br label %bb4
+
+bb3:                                              ; preds = %bb6
+  unreachable
+
+bb4:                                              ; preds = %bb6, %bb2
+  %tmp = phi i16 [ 0, %bb6 ], [ undef, %bb2 ]
+  ret i16 %tmp
+
+bb6:                                              ; preds = %bb1
+  br i1 true, label %bb3, label %bb4, !llvm.loop !1
+
+bb7:                                              ; preds = %bb
+  unreachable
+}
+
+attributes #0 = { ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="penryn" "target-features"="+cx16,+fxsr,+mmx,+sse,+sse2,+sse3,+sse4.1,+ssse3,+x87" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 5.0.0 (http://llvm.org/git/clang.git a8b933d4d1d133594fdaed35ee5814514b738f6d) (/Users/dannyb/sources/llvm-clean fc630a9b5613f544c07a8f16abcc173793df62cf)"}
+!1 = distinct !{!1, !2}
+!2 = !{!"llvm.loop.unroll.disable"}
diff --git a/test/Transforms/NewGVN/pr10820.ll b/test/Transforms/NewGVN/pr10820.ll
index d7a02b570aa016055370abb82e797a3b77a8f57c..dbb1376874db67eb77536793ef4be65c86a97724 100644
--- a/test/Transforms/NewGVN/pr10820.ll
+++ b/test/Transforms/NewGVN/pr10820.ll
@@ -1,6 +1,6 @@
 ; XFAIL: *
 ; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s
-
+; NewGVN fails this due to missing load coercion
 target datalayout =
 "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-f128:128:128-n8:16:32:64"
 target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/Transforms/NewGVN/pr14166.ll b/test/Transforms/NewGVN/pr14166.ll
index daf27cdb7fd7f5b43fbd61afbd9791e921416c7a..c526c50bc75d5ca64f76f8043dc67c99b06fe96d 100644
--- a/test/Transforms/NewGVN/pr14166.ll
+++ b/test/Transforms/NewGVN/pr14166.ll
@@ -1,5 +1,6 @@
 ; XFAIL: *
 ; RUN: opt -disable-basicaa -newgvn -S < %s | FileCheck %s
+; NewGVN fails this due to missing load coercion
 target datalayout = "e-p:32:32:32"
 target triple = "i386-pc-linux-gnu"
 define <2 x i32> @test1() {
diff --git a/test/Transforms/NewGVN/pr17732.ll b/test/Transforms/NewGVN/pr17732.ll
index 4a194e6a08b505c12a27fe739b95577f94a441b4..6aee6ebeb06501f28e1971c125406a8ba4d7a23b 100644
--- a/test/Transforms/NewGVN/pr17732.ll
+++ b/test/Transforms/NewGVN/pr17732.ll
@@ -1,6 +1,4 @@
-; XFAIL: *
 ; RUN: opt -newgvn -S -o - < %s | FileCheck %s
-
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/test/Transforms/NewGVN/pr31613.ll b/test/Transforms/NewGVN/pr31613.ll
index d3a41830c789ef688fb8f449f317095eddfb02d0..d96ea18466ad4db5cdbaa93a23deb5e186e1eb7e 100644
--- a/test/Transforms/NewGVN/pr31613.ll
+++ b/test/Transforms/NewGVN/pr31613.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s
+; RUN: opt < %s -basicaa -newgvn -enable-store-refinement -S | FileCheck %s
 target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 
 ;; Both of these tests are tests of phi nodes that end up all equivalent to each other
@@ -78,21 +78,18 @@ define void @e() {
 ; CHECK-NEXT:    br label [[H:%.*]]
 ; CHECK:       h:
 ; CHECK-NEXT:    call void @c.d.p(i64 8, i8* undef)
-; CHECK-NEXT:    [[I:%.*]] = load i32, i32* [[F]]
 ; CHECK-NEXT:    [[J:%.*]] = load i32, i32* null
-; CHECK-NEXT:    [[K:%.*]] = icmp eq i32 [[I]], [[J]]
-; CHECK-NEXT:    br i1 [[K]], label [[L:%.*]], label [[Q:%.*]]
+; CHECK-NEXT:    br i1 true, label [[L:%.*]], label [[Q:%.*]]
 ; CHECK:       l:
 ; CHECK-NEXT:    br label [[R:%.*]]
 ; CHECK:       q:
-; CHECK-NEXT:    [[M:%.*]] = load %struct.a*, %struct.a** null
+; CHECK-NEXT:    store i8 undef, i8* null
 ; CHECK-NEXT:    br label [[R]]
 ; CHECK:       r:
 ; CHECK-NEXT:    switch i32 undef, label [[N:%.*]] [
 ; CHECK-NEXT:    i32 0, label [[S:%.*]]
 ; CHECK-NEXT:    ]
 ; CHECK:       s:
-; CHECK-NEXT:    store i32 undef, i32* [[F]], !g !0
 ; CHECK-NEXT:    br label [[H]]
 ; CHECK:       n:
 ; CHECK-NEXT:    [[O:%.*]] = load %struct.a*, %struct.a** null
diff --git a/test/Transforms/NewGVN/pr31682.ll b/test/Transforms/NewGVN/pr31682.ll
index 108e1e19afbd4c332e68ca7281652df2067b40f6..96103fad15c250cf4bf84f7aee7e01f551b00ef9 100644
--- a/test/Transforms/NewGVN/pr31682.ll
+++ b/test/Transforms/NewGVN/pr31682.ll
@@ -12,7 +12,6 @@ define void @bar() {
 ; CHECK-NEXT:    [[TMP:%.*]] = load %struct.foo*, %struct.foo** @global
 ; CHECK-NEXT:    br label [[BB2:%.*]]
 ; CHECK:       bb2:
-; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr [[STRUCT_FOO:%.*]], %struct.foo* [[TMP]], i64 0, i32 1
 ; CHECK-NEXT:    br i1 undef, label [[BB2]], label [[BB7:%.*]]
 ; CHECK:       bb7:
 ; CHECK-NEXT:    br label [[BB10:%.*]]
diff --git a/test/Transforms/NewGVN/pr31758.ll b/test/Transforms/NewGVN/pr31758.ll
index d55c17e5804c1451d11933f3f7e143b6fa6394ef..6052ca973aff070535be9da7a23e9f2f973af3f4 100644
--- a/test/Transforms/NewGVN/pr31758.ll
+++ b/test/Transforms/NewGVN/pr31758.ll
@@ -10,9 +10,6 @@ define void @tinkywinky() {
 ; CHECK-NEXT:  bb:
 ; CHECK-NEXT:    br label [[BB90:%.*]]
 ; CHECK:       bb90:
-; CHECK-NEXT:    [[TMP91:%.*]] = bitcast %struct.dipsy** undef to %struct.patatino**
-; CHECK-NEXT:    [[TMP92:%.*]] = load %struct.patatino*, %struct.patatino** [[TMP91]], align 8
-; CHECK-NEXT:    [[TMP136:%.*]] = load %struct.patatino*, %struct.patatino** [[TMP91]], align 8
 ; CHECK-NEXT:    br label [[BB90]]
 ; CHECK:       bb138:
 ; CHECK-NEXT:    store i8 undef, i8* null
diff --git a/test/Transforms/NewGVN/pr32403.ll b/test/Transforms/NewGVN/pr32403.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2552e0e66ab9eb29326d6f3ca71790260fe73dd4
--- /dev/null
+++ b/test/Transforms/NewGVN/pr32403.ll
@@ -0,0 +1,65 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -newgvn -S < %s | FileCheck %s
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12.0"
+
+; Function Attrs: nounwind ssp uwtable
+define void @reorder_ref_pic_list() local_unnamed_addr {
+; CHECK-LABEL: @reorder_ref_pic_list(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br i1 undef, label [[FOR_END:%.*]], label [[FOR_BODY_PREHEADER:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[REFIDXLX_0:%.*]] = phi i32 [ [[INC_I51:%.*]], [[IF_ELSE58:%.*]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    br i1 undef, label [[IF_THEN13:%.*]], label [[IF_ELSE58]]
+; CHECK:       if.then13:
+; CHECK-NEXT:    [[INC_I:%.*]] = add nsw i32 [[REFIDXLX_0]], 1
+; CHECK-NEXT:    br label [[FOR_BODY8_I:%.*]]
+; CHECK:       for.body8.i:
+; CHECK-NEXT:    br i1 undef, label [[FOR_INC24_I:%.*]], label [[IF_THEN17_I:%.*]]
+; CHECK:       if.then17.i:
+; CHECK-NEXT:    br label [[FOR_INC24_I]]
+; CHECK:       for.inc24.i:
+; CHECK-NEXT:    br label [[FOR_BODY8_I]]
+; CHECK:       if.else58:
+; CHECK-NEXT:    [[INC_I51]] = add nsw i32 [[REFIDXLX_0]], 1
+; CHECK-NEXT:    br label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br i1 undef, label %for.end, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.body:                                         ; preds = %if.else58, %for.body.preheader
+  %refIdxLX.0 = phi i32 [ %inc.i51, %if.else58 ], [ 0, %for.body.preheader ]
+  br i1 undef, label %if.then13, label %if.else58
+
+if.then13:                                        ; preds = %for.body
+  %inc.i = add nsw i32 %refIdxLX.0, 1
+  br label %for.body8.i
+
+for.body8.i:                                      ; preds = %for.inc24.i, %if.then13
+  %nIdx.052.i = phi i32 [ %inc.i, %if.then13 ], [ %nIdx.1.i, %for.inc24.i ]
+  br i1 undef, label %for.inc24.i, label %if.then17.i
+
+if.then17.i:                                      ; preds = %for.body8.i
+  br label %for.inc24.i
+
+for.inc24.i:                                      ; preds = %if.then17.i, %for.body8.i
+  %nIdx.1.i = phi i32 [ undef, %if.then17.i ], [ %nIdx.052.i, %for.body8.i ]
+  br label %for.body8.i
+
+if.else58:                                        ; preds = %for.body
+  %inc.i51 = add nsw i32 %refIdxLX.0, 1
+  br label %for.body
+
+for.end:                                          ; preds = %entry
+  ret void
+}
+
+
+
diff --git a/test/Transforms/NewGVN/predicates.ll b/test/Transforms/NewGVN/predicates.ll
new file mode 100644
index 0000000000000000000000000000000000000000..61b35c5e5c67c3a1d45107eb4bf59c6537ae48ee
--- /dev/null
+++ b/test/Transforms/NewGVN/predicates.ll
@@ -0,0 +1,111 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -basicaa -newgvn -S < %s | FileCheck %s
+
+; Function Attrs: noinline norecurse nounwind readonly ssp uwtable
+define i32 @mp_unsgn_cmp(i32 %n, i32* nocapture readonly %in1, i32* nocapture readonly %in2) local_unnamed_addr {
+; CHECK-LABEL: @mp_unsgn_cmp(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], -1
+; CHECK-NEXT:    br i1 [[CMP11]], label [[FOR_INC_PREHEADER:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       for.inc.preheader:
+; CHECK-NEXT:    br label [[FOR_INC:%.*]]
+; CHECK:       for.inc:
+; CHECK-NEXT:    [[STOREMERGE2:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC]] ], [ 0, [[FOR_INC_PREHEADER]] ]
+; CHECK-NEXT:    [[IDXPROM:%.*]] = sext i32 [[STOREMERGE2]] to i64
+; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[IN1:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; CHECK-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[IN2:%.*]], i64 [[IDXPROM]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* [[ARRAYIDX4]], align 4
+; CHECK-NEXT:    [[SUB:%.*]] = sub nsw i32 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    [[INC]] = add nsw i32 [[STOREMERGE2]], 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp slt i32 [[STOREMERGE2]], [[N]]
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[SUB]], 0
+; CHECK-NEXT:    [[OR_COND:%.*]] = and i1 [[CMP2]], [[CMP1]]
+; CHECK-NEXT:    br i1 [[OR_COND]], label [[FOR_INC]], label [[FOR_END:%.*]]
+; CHECK:       for.end:
+; CHECK-NEXT:    [[CMP5:%.*]] = icmp sgt i32 [[SUB]], 0
+; CHECK-NEXT:    br i1 [[CMP5]], label [[IF_END8:%.*]], label [[IF_ELSE]]
+; CHECK:       if.else:
+; CHECK-NEXT:    [[SUB1_LCSSA4:%.*]] = phi i32 [ [[SUB]], [[FOR_END]] ], [ 0, [[ENTRY:%.*]] ]
+; CHECK-NEXT:    [[CMP6:%.*]] = icmp slt i32 [[SUB1_LCSSA4]], 0
+; CHECK-NEXT:    [[DOTSUB1_LCSSA:%.*]] = select i1 [[CMP6]], i32 -1, i32 [[SUB1_LCSSA4]]
+; CHECK-NEXT:    ret i32 [[DOTSUB1_LCSSA]]
+; CHECK:       if.end8:
+; CHECK-NEXT:    ret i32 1
+;
+entry:
+  %cmp11 = icmp sgt i32 %n, -1
+  br i1 %cmp11, label %for.inc.preheader, label %if.else
+
+for.inc.preheader:                                ; preds = %entry
+  br label %for.inc
+
+for.inc:                                          ; preds = %for.inc.preheader, %for.inc
+  %storemerge2 = phi i32 [ %inc, %for.inc ], [ 0, %for.inc.preheader ]
+  %idxprom = sext i32 %storemerge2 to i64
+  %arrayidx = getelementptr inbounds i32, i32* %in1, i64 %idxprom
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx4 = getelementptr inbounds i32, i32* %in2, i64 %idxprom
+  %1 = load i32, i32* %arrayidx4, align 4
+  %sub = sub nsw i32 %0, %1
+  %inc = add nsw i32 %storemerge2, 1
+  %cmp1 = icmp slt i32 %storemerge2, %n
+  %cmp2 = icmp eq i32 %sub, 0
+  %or.cond = and i1 %cmp2, %cmp1
+;; This is a self-critical edge to for.inc. If we insert predicate info on it, we will insert
+;; predicateinfo at the end of this block, and think it dominates everything using only dfs
+;; numbers, instead of proper edge dominance.  We would then proceed to propagate the true value
+;; of sub == 0 everywhere, making this function only ever return 0.
+  br i1 %or.cond, label %for.inc, label %for.end
+
+for.end:                                          ; preds = %for.inc
+  %sub.lcssa = phi i32 [ %sub, %for.inc ]
+  %cmp5 = icmp sgt i32 %sub.lcssa, 0
+  br i1 %cmp5, label %if.end8, label %if.else
+
+if.else:                                          ; preds = %entry, %for.end
+  %sub1.lcssa4 = phi i32 [ %sub.lcssa, %for.end ], [ 0, %entry ]
+  %cmp6 = icmp slt i32 %sub1.lcssa4, 0
+  %.sub1.lcssa = select i1 %cmp6, i32 -1, i32 %sub1.lcssa4
+  ret i32 %.sub1.lcssa
+
+if.end8:                                          ; preds = %for.end
+  ret i32 1
+}
+
+
+;; This test will generate a copy of a copy of predicateinfo to the multiple uses
+;; of branch conditions below.  Make sure we don't try to extract operand info.
+; Function Attrs: uwtable
+define fastcc void @barney() {
+; CHECK-LABEL: @barney(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br label [[BB22:%.*]]
+; CHECK:       bb22:
+; CHECK-NEXT:    br i1 undef, label [[BB29:%.*]], label [[BB35:%.*]]
+; CHECK:       bb29:
+; CHECK-NEXT:    br i1 true, label [[BB33:%.*]], label [[BB35]]
+; CHECK:       bb33:
+; CHECK-NEXT:    br i1 true, label [[BB35]], label [[BB35]]
+; CHECK:       bb35:
+; CHECK-NEXT:    unreachable
+;
+bb:
+  br label %bb22
+bb22:                                             ; preds = %bb
+  %tmp23 = icmp eq i32 undef, 2
+  br i1 %tmp23, label %bb29, label %bb35
+
+
+bb29:                                             ; preds = %bb22
+  br i1 %tmp23, label %bb33, label %bb35
+
+
+bb33:                                             ; preds = %bb29
+  br i1 %tmp23, label %bb35, label %bb35
+
+
+bb35:                                             ; preds = %bb33, %bb29, %bb22
+  unreachable
+}
+
diff --git a/test/Transforms/NewGVN/readattrs.ll b/test/Transforms/NewGVN/readattrs.ll
index be5fbf5a806f6911c4196622cbda9936a49c7e94..29ddb97ca1bb1e71903e78ad1f6ab92c2e614a83 100644
--- a/test/Transforms/NewGVN/readattrs.ll
+++ b/test/Transforms/NewGVN/readattrs.ll
@@ -1,4 +1,3 @@
-; XFAIL: *
 ; RUN: opt -newgvn -S -o - < %s | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
diff --git a/test/Transforms/NewGVN/refine-stores.ll b/test/Transforms/NewGVN/refine-stores.ll
new file mode 100644
index 0000000000000000000000000000000000000000..a48f2fe7fdb6d94437d5f551f39a84065d506399
--- /dev/null
+++ b/test/Transforms/NewGVN/refine-stores.ll
@@ -0,0 +1,189 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s
+;; Now that we do store refinement, we have to verify that we add fake uses
+;; when we skip existing stores.
+;; We are also testing that the variations that cause stores to move between
+;; classes make the right class movement happen.
+;; Each of these tests results in a verification failure if it does not.
+%struct.eggs = type {}
+
+define void @spam(i32 *%a) {
+; CHECK-LABEL: @spam(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[FOO:%.*]] = bitcast i32* [[A:%.*]] to %struct.eggs**
+; CHECK-NEXT:    store %struct.eggs* null, %struct.eggs** [[FOO]]
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br i1 undef, label [[BB3:%.*]], label [[BB2:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    call void @baz()
+; CHECK-NEXT:    br label [[BB1]]
+; CHECK:       bb3:
+; CHECK-NEXT:    store i32 0, i32* undef
+; CHECK-NEXT:    store %struct.eggs* null, %struct.eggs** [[FOO]]
+; CHECK-NEXT:    unreachable
+;
+bb:
+  %foo = bitcast i32 *%a to %struct.eggs**
+  store %struct.eggs* null, %struct.eggs** %foo
+  br label %bb1
+
+bb1:                                              ; preds = %bb2, %bb
+  br i1 undef, label %bb3, label %bb2
+
+bb2:                                              ; preds = %bb1
+  call void @baz()
+  br label %bb1
+
+bb3:                                              ; preds = %bb1
+  store i32 0, i32* undef
+;; This store is defined by a memoryphi of the call and the first store
+;; At first, we will prove it equivalent to the first store above.
+;; Then the call will become reachable, and the equivalence will be removed
+;; Without it being a use of the first store, we will not update the store
+;; to reflect this.
+  store %struct.eggs* null, %struct.eggs** %foo
+  unreachable
+}
+
+declare void @baz()
+
+
+define void @a() {
+; CHECK-LABEL: @a(
+; CHECK-NEXT:  b:
+; CHECK-NEXT:    br label [[C:%.*]]
+; CHECK:       c:
+; CHECK-NEXT:    store i64 undef, i64* null
+; CHECK-NEXT:    br label [[E:%.*]]
+; CHECK:       e:
+; CHECK-NEXT:    [[G:%.*]] = load i64*, i64** null
+; CHECK-NEXT:    store i64* undef, i64** null
+; CHECK-NEXT:    br i1 undef, label [[C]], label [[E]]
+;
+b:
+  br label %c
+
+c:                                                ; preds = %e, %b
+  %d = phi i64* [ undef, %b ], [ null, %e ]
+  store i64 undef, i64* %d
+  br label %e
+
+e:                                                ; preds = %e, %c
+;; The memory for this load starts out equivalent to just the store in c. We later discover the store after us, and
+;; need to make sure the right set of values gets marked as changed after the memory leaders change.
+  %g = load i64*, i64** null
+  %0 = bitcast i64* %g to i64*
+  store i64* undef, i64** null
+  br i1 undef, label %c, label %e
+}
+
+; ModuleID = 'bugpoint-reduced-simplified.bc'
+source_filename = "bugpoint-output-daef094.bc"
+target triple = "x86_64-apple-darwin16.5.0"
+
+%struct.hoge = type {}
+
+define void @widget(%struct.hoge* %arg) {
+; CHECK-LABEL: @widget(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    br label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP:%.*]] = phi %struct.hoge* [ [[ARG:%.*]], [[BB:%.*]] ], [ null, [[BB1]] ]
+; CHECK-NEXT:    store %struct.hoge* [[TMP]], %struct.hoge** undef
+; CHECK-NEXT:    br i1 undef, label [[BB1]], label [[BB2:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[TMP3:%.*]] = phi i64 [ [[TMP8:%.*]], [[BB7:%.*]] ], [ 0, [[BB1]] ]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i64 [[TMP3]], 0
+; CHECK-NEXT:    br i1 [[TMP4]], label [[BB7]], label [[BB5:%.*]]
+; CHECK:       bb5:
+; CHECK-NEXT:    [[TMP6:%.*]] = load i64, i64* null
+; CHECK-NEXT:    call void @quux()
+; CHECK-NEXT:    store i64 [[TMP6]], i64* undef
+; CHECK-NEXT:    br label [[BB7]]
+; CHECK:       bb7:
+; CHECK-NEXT:    [[TMP8]] = add i64 [[TMP3]], 1
+; CHECK-NEXT:    br label [[BB2]]
+;
+bb:
+  br label %bb1
+
+bb1:                                              ; preds = %bb1, %bb
+  %tmp = phi %struct.hoge* [ %arg, %bb ], [ null, %bb1 ]
+  store %struct.hoge* %tmp, %struct.hoge** undef
+  br i1 undef, label %bb1, label %bb2
+
+bb2:                                              ; preds = %bb7, %bb1
+  %tmp3 = phi i64 [ %tmp8, %bb7 ], [ 0, %bb1 ]
+  %tmp4 = icmp eq i64 %tmp3, 0
+  br i1 %tmp4, label %bb7, label %bb5
+
+bb5:                                              ; preds = %bb2
+  ;; Originally thought equal to the store that comes after it until the phi edges
+  ;; are completely traversed
+  %tmp6 = load i64, i64* null
+  call void @quux()
+  store i64 %tmp6, i64* undef
+  br label %bb7
+
+bb7:                                              ; preds = %bb5, %bb2
+  %tmp8 = add i64 %tmp3, 1
+  br label %bb2
+}
+
+declare void @quux()
+; ModuleID = 'short.ll'
+source_filename = "short.ll"
+
+%struct.a = type {}
+
+define void @b() {
+; CHECK-LABEL: @b(
+; CHECK-NEXT:    [[C:%.*]] = alloca [[STRUCT_A:%.*]]
+; CHECK-NEXT:    br label [[D:%.*]]
+; CHECK:       m:
+; CHECK-NEXT:    unreachable
+; CHECK:       d:
+; CHECK-NEXT:    [[G:%.*]] = bitcast %struct.a* [[C]] to i8*
+; CHECK-NEXT:    [[F:%.*]] = bitcast i8* [[G]] to i32*
+; CHECK-NEXT:    [[E:%.*]] = load i32, i32* [[F]]
+; CHECK-NEXT:    br i1 undef, label [[I:%.*]], label [[J:%.*]]
+; CHECK:       i:
+; CHECK-NEXT:    br i1 undef, label [[K:%.*]], label [[M:%.*]]
+; CHECK:       k:
+; CHECK-NEXT:    br label [[L:%.*]]
+; CHECK:       l:
+; CHECK-NEXT:    unreachable
+; CHECK:       j:
+; CHECK-NEXT:    br label [[M]]
+;
+  %c = alloca %struct.a
+  br label %d
+
+m:                                                ; preds = %j, %i
+  store i32 %e, i32* %f
+  unreachable
+
+d:                                                ; preds = %0
+  %g = bitcast %struct.a* %c to i8*
+  %h = getelementptr i8, i8* %g
+  %f = bitcast i8* %h to i32*
+  %e = load i32, i32* %f
+  br i1 undef, label %i, label %j
+
+i:                                                ; preds = %d
+  br i1 undef, label %k, label %m
+
+k:                                                ; preds = %i
+  br label %l
+
+l:                                                ; preds = %k
+  %n = phi i32 [ %e, %k ]
+  ;; Becomes equal and then not equal to the other store, and
+  ;; along the way, the load.
+  store i32 %n, i32* %f
+  unreachable
+
+j:                                                ; preds = %d
+  br label %m
+}
diff --git a/test/Transforms/NewGVN/rle-nonlocal.ll b/test/Transforms/NewGVN/rle-nonlocal.ll
index 89f5a6affdec86783318b11f641336f28f0b3ce5..d318cd5240d81f3b60e20c5603627c75cb8b4d29 100644
--- a/test/Transforms/NewGVN/rle-nonlocal.ll
+++ b/test/Transforms/NewGVN/rle-nonlocal.ll
@@ -1,23 +1,37 @@
-; XFAIL: *
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -basicaa -newgvn -S | FileCheck %s
 
 define i32 @main(i32** %p, i32 %x, i32 %y) {
+; CHECK-LABEL: @main(
+; CHECK-NEXT:  block1:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[BLOCK2:%.*]], label [[BLOCK3:%.*]]
+; CHECK:       block2:
+; CHECK-NEXT:    [[A:%.*]] = load i32*, i32** [[P:%.*]]
+; CHECK-NEXT:    br label [[BLOCK4:%.*]]
+; CHECK:       block3:
+; CHECK-NEXT:    [[B:%.*]] = load i32*, i32** [[P]]
+; CHECK-NEXT:    br label [[BLOCK4]]
+; CHECK:       block4:
+; CHECK-NEXT:    [[EXISTINGPHI:%.*]] = phi i32* [ [[A]], [[BLOCK2]] ], [ [[B]], [[BLOCK3]] ]
+; CHECK-NEXT:    [[C:%.*]] = load i32, i32* [[EXISTINGPHI]]
+; CHECK-NEXT:    [[E:%.*]] = add i32 [[C]], [[C]]
+; CHECK-NEXT:    ret i32 [[E]]
+;
 block1:
-    %cmp = icmp eq i32 %x, %y
-	br i1 %cmp , label %block2, label %block3
+  %cmp = icmp eq i32 %x, %y
+  br i1 %cmp , label %block2, label %block3
 
 block2:
- %a = load i32*, i32** %p
- br label %block4
+  %a = load i32*, i32** %p
+  br label %block4
 
 block3:
   %b = load i32*, i32** %p
   br label %block4
 
 block4:
-; CHECK-NOT: %existingPHI = phi
-; CHECK: %DEAD = phi
-  %existingPHI = phi i32* [ %a, %block2 ], [ %b, %block3 ] 
+  %existingPHI = phi i32* [ %a, %block2 ], [ %b, %block3 ]
   %DEAD = load i32*, i32** %p
   %c = load i32, i32* %DEAD
   %d = load i32, i32* %existingPHI
diff --git a/test/Transforms/NewGVN/rle.ll b/test/Transforms/NewGVN/rle.ll
new file mode 100644
index 0000000000000000000000000000000000000000..902abe979ea8ab4e84258789f35ac31d3f08aa9d
--- /dev/null
+++ b/test/Transforms/NewGVN/rle.ll
@@ -0,0 +1,59 @@
+; RUN: opt < %s -data-layout="e-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-n8:16:32" -basicaa -newgvn -S -die | FileCheck %s
+; RUN: opt < %s -data-layout="E-p:32:32:32-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-n32"      -basicaa -newgvn -S -die | FileCheck %s
+; memset -> i16 forwarding.
+define signext i16 @memset_to_i16_local(i16* %A) nounwind ssp {
+entry:
+  %conv = bitcast i16* %A to i8*
+  tail call void @llvm.memset.p0i8.i64(i8* %conv, i8 1, i64 200, i32 1, i1 false)
+  %arrayidx = getelementptr inbounds i16, i16* %A, i64 42
+  %tmp2 = load i16, i16* %arrayidx
+  ret i16 %tmp2
+; CHECK-LABEL: @memset_to_i16_local(
+; CHECK-NOT: load
+; CHECK: ret i16 257
+}
+
+@GCst = constant {i32, float, i32 } { i32 42, float 14., i32 97 }
+@GCst_as1 = addrspace(1) constant {i32, float, i32 } { i32 42, float 14., i32 97 }
+
+; memcpy -> float forwarding.
+define float @memcpy_to_float_local(float* %A) nounwind ssp {
+entry:
+  %conv = bitcast float* %A to i8*                ; <i8*> [#uses=1]
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %conv, i8* bitcast ({i32, float, i32 }* @GCst to i8*), i64 12, i32 1, i1 false)
+  %arrayidx = getelementptr inbounds float, float* %A, i64 1 ; <float*> [#uses=1]
+  %tmp2 = load float, float* %arrayidx                   ; <float> [#uses=1]
+  ret float %tmp2
+; CHECK-LABEL: @memcpy_to_float_local(
+; CHECK-NOT: load
+; CHECK: ret float 1.400000e+01
+}
+; memcpy from address space 1
+define float @memcpy_to_float_local_as1(float* %A) nounwind ssp {
+entry:
+  %conv = bitcast float* %A to i8*                ; <i8*> [#uses=1]
+  tail call void @llvm.memcpy.p0i8.p1i8.i64(i8* %conv, i8 addrspace(1)* bitcast ({i32, float, i32 } addrspace(1)* @GCst_as1 to i8 addrspace(1)*), i64 12, i32 1, i1 false)
+  %arrayidx = getelementptr inbounds float, float* %A, i64 1 ; <float*> [#uses=1]
+  %tmp2 = load float, float* %arrayidx                   ; <float> [#uses=1]
+  ret float %tmp2
+; CHECK-LABEL: @memcpy_to_float_local_as1(
+; CHECK-NOT: load
+; CHECK: ret float 1.400000e+01
+}
+
+; PR6642
+define i32 @memset_to_load() nounwind readnone {
+entry:
+  %x = alloca [256 x i32], align 4                ; <[256 x i32]*> [#uses=2]
+  %tmp = bitcast [256 x i32]* %x to i8*           ; <i8*> [#uses=1]
+  call void @llvm.memset.p0i8.i64(i8* %tmp, i8 0, i64 1024, i32 4, i1 false)
+  %arraydecay = getelementptr inbounds [256 x i32], [256 x i32]* %x, i32 0, i32 0 ; <i32*>
+  %tmp1 = load i32, i32* %arraydecay                   ; <i32> [#uses=1]
+  ret i32 %tmp1
+; CHECK-LABEL: @memset_to_load(
+; CHECK: ret i32 0
+}
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
+declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture, i64, i32, i1) nounwind
diff --git a/test/Transforms/NewGVN/storeoverstore.ll b/test/Transforms/NewGVN/storeoverstore.ll
index 63f40c511e3cd013bcd8e7ca87bc13f0cd7ffc44..49b55d430dc753ceb7c678ab90711eae93863ca5 100644
--- a/test/Transforms/NewGVN/storeoverstore.ll
+++ b/test/Transforms/NewGVN/storeoverstore.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -newgvn -S < %s | FileCheck %s
 ; RUN: opt -passes=newgvn -S -o - %s | FileCheck %s
 
@@ -7,31 +8,35 @@ target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
 ;; stores of the same value do not change the memory state to eliminate them.
 
 define i32 @foo(i32*, i32)  {
-; CHECK-LABEL: @foo
+; CHECK-LABEL: @foo(
+; CHECK-NEXT:    store i32 5, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1:%.*]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP5:%.*]]
+; CHECK:         br label [[TMP5]]
+; CHECK:         [[DOT0:%.*]] = phi i32 [ 10, [[TMP4]] ], [ 5, [[TMP2:%.*]] ]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[TMP6:%.*]], label [[TMP8:%.*]]
+; CHECK:         [[TMP7:%.*]] = add nsw i32 [[DOT0]], 5
+; CHECK-NEXT:    br label [[TMP8]]
+; CHECK:         [[DOT1:%.*]] = phi i32 [ [[TMP7]], [[TMP6]] ], [ [[DOT0]], [[TMP5]] ]
+; CHECK-NEXT:    ret i32 [[DOT1]]
+;
   store i32 5, i32* %0, align 4
   %3 = icmp ne i32 %1, 0
   br i1 %3, label %4, label %7
 
 ; <label>:4:                                      ; preds = %2
-; CHECK-NOT: load
   %5 = load i32, i32* %0, align 4
-; CHECK-NOT: add
   %6 = add nsw i32 5, %5
   br label %7
 
 ; <label>:7:                                      ; preds = %4, %2
   %.0 = phi i32 [ %6, %4 ], [ 5, %2 ]
-; CHECK: phi i32 [ 10, %4 ], [ 5, %2 ]
   store i32 5, i32* %0, align 4
-; CHECK-NOT: icmp
   %8 = icmp ne i32 %1, 0
-; CHECK: br i1 %3
   br i1 %8, label %9, label %12
 
 ; <label>:9:                                      ; preds = %7
-; CHECK-NOT: load
   %10 = load i32, i32* %0, align 4
-; CHECK: add nsw i32 %.0, 5
   %11 = add nsw i32 %.0, %10
   br label %12
 
@@ -43,15 +48,25 @@ define i32 @foo(i32*, i32)  {
 ;; This is similar to the above, but it is a conditional store of the same value
 ;; which requires value numbering MemoryPhi properly to resolve.
 define i32 @foo2(i32*, i32)  {
-; CHECK-LABEL: @foo2
+; CHECK-LABEL: @foo2(
+; CHECK-NEXT:    store i32 5, i32* [[TMP0:%.*]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1:%.*]], 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[TMP4:%.*]], label [[TMP5:%.*]]
+; CHECK:         br label [[TMP6:%.*]]
+; CHECK:         br label [[TMP6]]
+; CHECK:         [[DOT0:%.*]] = phi i32 [ 10, [[TMP4]] ], [ 5, [[TMP5]] ]
+; CHECK-NEXT:    br i1 [[TMP3]], label [[TMP7:%.*]], label [[TMP9:%.*]]
+; CHECK:         [[TMP8:%.*]] = add nsw i32 [[DOT0]], 5
+; CHECK-NEXT:    br label [[TMP9]]
+; CHECK:         [[DOT1:%.*]] = phi i32 [ [[TMP8]], [[TMP7]] ], [ [[DOT0]], [[TMP6]] ]
+; CHECK-NEXT:    ret i32 [[DOT1]]
+;
   store i32 5, i32* %0, align 4
   %3 = icmp ne i32 %1, 0
   br i1 %3, label %4, label %7
 
 ; <label>:4:                                      ; preds = %2
-; CHECK-NOT: load
   %5 = load i32, i32* %0, align 4
-; CHECK-NOT: add
   %6 = add nsw i32 5, %5
   br label %8
 
@@ -60,17 +75,12 @@ define i32 @foo2(i32*, i32)  {
   br label %8
 
 ; <label>:8:                                      ; preds = %7, %4
-; CHECK: phi i32 [ 10, %4 ], [ 5, %5 ]
   %.0 = phi i32 [ %6, %4 ], [ 5, %7 ]
-; CHECK-NOT: icmp
   %9 = icmp ne i32 %1, 0
-; CHECK: br i1 %3
   br i1 %9, label %10, label %13
 
 ; <label>:10:                                     ; preds = %8
-; CHECK-NOT: load
   %11 = load i32, i32* %0, align 4
-; CHECK: add nsw i32 %.0, 5
   %12 = add nsw i32 %.0, %11
   br label %13
 
diff --git a/test/Transforms/NewGVN/tbaa.ll b/test/Transforms/NewGVN/tbaa.ll
index 47e20fae7f9c603803d84321daed970686f00e6a..3dcc4f8acc14aeac1d7937fc267526f6a8dcd919 100644
--- a/test/Transforms/NewGVN/tbaa.ll
+++ b/test/Transforms/NewGVN/tbaa.ll
@@ -1,4 +1,3 @@
-; XFAIL: *
 ; RUN: opt -tbaa -basicaa -newgvn -S < %s | FileCheck %s
 
 define i32 @test1(i8* %p, i8* %q) {
diff --git a/test/Transforms/NewGVN/volatile-nonvolatile.ll b/test/Transforms/NewGVN/volatile-nonvolatile.ll
index 8c74f8b28efbb69e5b52ff4619771ab18673b465..46d29bad0f4dd27bffe43465da1217299ce168f6 100644
--- a/test/Transforms/NewGVN/volatile-nonvolatile.ll
+++ b/test/Transforms/NewGVN/volatile-nonvolatile.ll
@@ -1,4 +1,3 @@
-; XFAIL: *
 ; RUN: opt -tbaa -newgvn -S < %s | FileCheck %s
 
 %struct.t = type { i32* }
diff --git a/test/Transforms/ObjCARC/contract-storestrong.ll b/test/Transforms/ObjCARC/contract-storestrong.ll
index 2b83bdb9bfbf9c7f03fa3887f6de54e5199b4500..a02f7b7019125549888aaee8582740cf169b9ee6 100644
--- a/test/Transforms/ObjCARC/contract-storestrong.ll
+++ b/test/Transforms/ObjCARC/contract-storestrong.ll
@@ -243,6 +243,19 @@ entry:
   ret void
 }
 
+; This used to crash.
+; CHECK-LABEL: define i8* @test13(
+; CHECK: tail call void @objc_storeStrong(i8** %{{.*}}, i8* %[[NEW:.*]])
+; CHECK-NEXT: ret i8* %[[NEW]]
+
+define i8* @test13(i8* %a0, i8* %a1, i8** %addr, i8* %new) {
+  %old = load i8*, i8** %addr, align 8
+  call void @objc_release(i8* %old)
+  %retained = call i8* @objc_retain(i8* %new)
+  store i8* %retained, i8** %addr, align 8
+  ret i8* %retained
+}
+
 !0 = !{}
 
 ; CHECK: attributes [[NUW]] = { nounwind }
diff --git a/test/Transforms/PGOProfile/Inputs/memop_size_annotation.proftext b/test/Transforms/PGOProfile/Inputs/memop_size_annotation.proftext
new file mode 100644
index 0000000000000000000000000000000000000000..400b29df303657db50db000b44877c34b71e4dd9
--- /dev/null
+++ b/test/Transforms/PGOProfile/Inputs/memop_size_annotation.proftext
@@ -0,0 +1,27 @@
+# IR level Instrumentation Flag
+:ir
+foo
+# Func Hash:
+53929068288
+# Num Counters:
+3
+# Counter Values:
+556
+20
+1
+# Num Value Kinds:
+1
+# ValueKind = IPVK_MemOPSize:
+1
+# NumValueSites:
+1
+9
+7:33
+2:88
+9:72
+4:66
+1:99
+5:55
+6:44
+3:77
+8:22
diff --git a/test/Transforms/PGOProfile/Inputs/thinlto_samplepgo_icp.ll b/test/Transforms/PGOProfile/Inputs/thinlto_samplepgo_icp.ll
new file mode 100644
index 0000000000000000000000000000000000000000..22860f52b5d3beb7c0fc4018d7f1e7b701a24549
--- /dev/null
+++ b/test/Transforms/PGOProfile/Inputs/thinlto_samplepgo_icp.ll
@@ -0,0 +1,27 @@
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@fptr = external local_unnamed_addr global void ()*, align 8
+
+; Function Attrs: norecurse nounwind uwtable
+define void @_Z6updatei(i32 %i) local_unnamed_addr #0 {
+entry:
+  store void ()* @_ZL3foov, void ()** @fptr, align 8
+  ret void
+}
+
+; Function Attrs: norecurse nounwind readnone uwtable
+define internal void @_ZL3foov() #1 {
+entry:
+  ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3}
+!llvm.ident = !{!31}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 297016)", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2)
+!1 = !DIFile(filename: "b.cc", directory: "/ssd/llvm/abc/small")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!31 = !{!"clang version 5.0.0 (trunk 297016)"}
diff --git a/test/Transforms/PGOProfile/comdat_internal.ll b/test/Transforms/PGOProfile/comdat_internal.ll
index 7df6f91fe72972c4f0ad0ef0bec42270a19dcf25..74630179105a4e2a13c3b3fe55586b34ea6640fc 100644
--- a/test/Transforms/PGOProfile/comdat_internal.ll
+++ b/test/Transforms/PGOProfile/comdat_internal.ll
@@ -12,11 +12,11 @@ $foo = comdat any
 @bar = global i32 ()* @foo, align 8
 
 ; CHECK: @__llvm_profile_raw_version = constant i64 {{[0-9]+}}, comdat
-; CHECK: @__profn__stdin__foo = private constant [11 x i8] c"<stdin>:foo"
+; CHECK-NOT: __profn__stdin__foo
 ; CHECK: @__profc__stdin__foo.[[FOO_HASH]] = private global [1 x i64] zeroinitializer, section "__llvm_prf_cnts", comdat($__profv__stdin__foo.[[FOO_HASH]]), align 8
-; CHECK: @__profd__stdin__foo.[[FOO_HASH]] = private global { i64, i64, i64*, i8*, i8*, i32, [1 x i16] } { i64 -5640069336071256030, i64 [[FOO_HASH]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc__stdin__foo.[[FOO_HASH]], i32 0, i32 0), i8* null
+; CHECK: @__profd__stdin__foo.[[FOO_HASH]] = private global { i64, i64, i64*, i8*, i8*, i32, [2 x i16] } { i64 -5640069336071256030, i64 [[FOO_HASH]], i64* getelementptr inbounds ([1 x i64], [1 x i64]* @__profc__stdin__foo.[[FOO_HASH]], i32 0, i32 0), i8* null
 ; CHECK-NOT: bitcast (i32 ()* @foo to i8*)
-; CHECK-SAME: , i8* null, i32 1, [1 x i16] zeroinitializer }, section "__llvm_prf_data", comdat($__profv__stdin__foo.[[FOO_HASH]]), align 8
+; CHECK-SAME: , i8* null, i32 1, [2 x i16] zeroinitializer }, section "__llvm_prf_data", comdat($__profv__stdin__foo.[[FOO_HASH]]), align 8
 ; CHECK: @__llvm_prf_nm
 ; CHECK: @llvm.used
 
diff --git a/test/Transforms/PGOProfile/indirect_call_promotion.ll b/test/Transforms/PGOProfile/indirect_call_promotion.ll
index c35166505eb925d9a30f17dccd096e01ba0cee11..b892c130152c072c61ec9ad804171f8716330e50 100644
--- a/test/Transforms/PGOProfile/indirect_call_promotion.ll
+++ b/test/Transforms/PGOProfile/indirect_call_promotion.ll
@@ -1,4 +1,6 @@
 ; RUN: opt < %s -pgo-icall-prom -S | FileCheck %s --check-prefix=ICALL-PROM
+; RUN: opt < %s -pgo-icall-prom -S -icp-samplepgo | FileCheck %s --check-prefix=ICALL-PROM
+; RUN: opt < %s -pgo-icall-prom -S -icp-samplepgo | FileCheck %s --check-prefix=ICALL-PROM-SAMPLEPGO
 ; RUN: opt < %s -passes=pgo-icall-prom -S | FileCheck %s --check-prefix=ICALL-PROM
 ; RUN: opt < %s -pgo-icall-prom -S -pass-remarks=pgo-icall-prom -icp-count-threshold=0 -icp-percent-threshold=0 -icp-max-prom=4 2>&1 | FileCheck %s --check-prefix=PASS-REMARK
 ; RUN: opt < %s -passes=pgo-icall-prom -S -pass-remarks=pgo-icall-prom -icp-count-threshold=0 -icp-percent-threshold=0 -icp-max-prom=4 2>&1 | FileCheck %s --check-prefix=PASS-REMARK
@@ -40,6 +42,7 @@ entry:
 ; ICALL-PROM:   br i1 [[CMP]], label %if.true.direct_targ, label %if.false.orig_indirect, !prof [[BRANCH_WEIGHT:![0-9]+]]
 ; ICALL-PROM: if.true.direct_targ:
 ; ICALL-PROM:   [[DIRCALL_RET:%[0-9]+]] = call i32 @func4()
+; ICALL-PROM-SAMPLEPGO: call i32 @func4(), !prof [[CALL_METADATA:![0-9]+]]
 ; ICALL-PROM:   br label %if.end.icp
   %call = call i32 %tmp(), !prof !1
 ; ICALL-PROM: if.false.orig_indirect:
@@ -54,3 +57,4 @@ entry:
 
 ; ICALL-PROM: [[BRANCH_WEIGHT]] = !{!"branch_weights", i32 1030, i32 570}
 ; ICALL-PROM: [[NEW_VP_METADATA]] = !{!"VP", i32 0, i64 570, i64 -4377547752858689819, i64 410}
+; ICALL-PROM-SAMPLEPGO: [[CALL_METADATA]] = !{!"branch_weights", i32 1030}
diff --git a/test/Transforms/PGOProfile/memcpy.ll b/test/Transforms/PGOProfile/memcpy.ll
new file mode 100644
index 0000000000000000000000000000000000000000..9db4a4a2dd4c71fc95b39274a4dee2a80b270636
--- /dev/null
+++ b/test/Transforms/PGOProfile/memcpy.ll
@@ -0,0 +1,35 @@
+; RUN: opt < %s -pgo-instr-gen -instrprof -S | FileCheck %s
+; RUN: opt <%s -passes=pgo-instr-gen,instrprof -S | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo(i8* %dst, i8* %src, i32* %a, i32 %n) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %i.0 = phi i32 [ 0, %entry ], [ %add, %for.cond1 ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.cond1, label %for.end6
+
+for.cond1:
+  %j.0 = phi i32 [ %inc, %for.body3 ], [ 0, %for.cond ]
+  %idx.ext = sext i32 %i.0 to i64
+  %add.ptr = getelementptr inbounds i32, i32* %a, i64 %idx.ext
+  %0 = load i32, i32* %add.ptr, align 4
+  %cmp2 = icmp slt i32 %j.0, %0
+  %add = add nsw i32 %i.0, 1
+  br i1 %cmp2, label %for.body3, label %for.cond
+
+for.body3:
+  %conv = sext i32 %add to i64
+; CHECK: call void @__llvm_profile_instrument_range(i64 %conv, i8* bitcast ({ i64, i64, i64*, i8*, i8*, i32, [2 x i16] }* @__profd_foo to i8*), i32 0, i64 0, i64 8, i64 8192)
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i32 1, i1 false)
+  %inc = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+for.end6:
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
diff --git a/test/Transforms/PGOProfile/memop_size_annotation.ll b/test/Transforms/PGOProfile/memop_size_annotation.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5481d12b1af15496b37248ef5ce4c76a1d906007
--- /dev/null
+++ b/test/Transforms/PGOProfile/memop_size_annotation.ll
@@ -0,0 +1,59 @@
+; RUN: llvm-profdata merge %S/Inputs/memop_size_annotation.proftext -o %t.profdata
+; RUN: opt < %s -pgo-instr-use -memop-max-annotations=9 -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefixes=MEMOP_ANNOTATION,MEMOP_ANNOTATION9
+; RUN: opt < %s -passes=pgo-instr-use -memop-max-annotations=9 -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefixes=MEMOP_ANNOTATION,MEMOP_ANNOTATION9
+; RUN: opt < %s -pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefixes=MEMOP_ANNOTATION,MEMOP_ANNOTATION4
+; RUN: opt < %s -passes=pgo-instr-use -pgo-test-profile-file=%t.profdata -S | FileCheck %s --check-prefixes=MEMOP_ANNOTATION,MEMOP_ANNOTATION4
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo(i8* %dst, i8* %src, i32* %a, i32 %n) {
+entry:
+  br label %for.cond
+
+for.cond:
+  %i.0 = phi i32 [ 0, %entry ], [ %inc5, %for.inc4 ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end6
+
+for.body:
+  br label %for.cond1
+
+for.cond1:
+  %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ]
+  %idx.ext = sext i32 %i.0 to i64
+  %add.ptr = getelementptr inbounds i32, i32* %a, i64 %idx.ext
+  %0 = load i32, i32* %add.ptr, align 4
+  %cmp2 = icmp slt i32 %j.0, %0
+  br i1 %cmp2, label %for.body3, label %for.end
+
+for.body3:
+  %add = add nsw i32 %i.0, 1
+  %conv = sext i32 %add to i64
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i32 1, i1 false)
+; MEMOP_ANNOTATION: call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i32 1, i1 false)
+; MEMOP_ANNOTATION-SAME: !prof ![[MEMOP_VALUESITE:[0-9]+]]
+; MEMOP_ANNOTATION9: ![[MEMOP_VALUESITE]] = !{!"VP", i32 1, i64 556, i64 1, i64 99, i64 2, i64 88, i64 3, i64 77, i64 9, i64 72, i64 4, i64 66, i64 5, i64 55, i64 6, i64 44, i64 7, i64 33, i64 8, i64 22}
+; MEMOP_ANNOTATION4: ![[MEMOP_VALUESITE]] = !{!"VP", i32 1, i64 556, i64 1, i64 99, i64 2, i64 88, i64 3, i64 77, i64 9, i64 72}
+  br label %for.inc
+
+for.inc:
+  %inc = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+for.end:
+  br label %for.inc4
+
+for.inc4:
+  %inc5 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end6:
+  ret void
+}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+
+declare void @llvm.lifetime.end(i64, i8* nocapture)
diff --git a/test/Transforms/PGOProfile/memop_size_opt.ll b/test/Transforms/PGOProfile/memop_size_opt.ll
new file mode 100644
index 0000000000000000000000000000000000000000..c7c42f3c1d33d830f6b45d7d855f7b1f898cd8e2
--- /dev/null
+++ b/test/Transforms/PGOProfile/memop_size_opt.ll
@@ -0,0 +1,100 @@
+; RUN: opt < %s -passes=pgo-memop-opt -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 -S | FileCheck %s --check-prefix=MEMOP_OPT
+; RUN: opt < %s -pgo-memop-opt -pgo-memop-count-threshold=90 -pgo-memop-percent-threshold=15 -S | FileCheck %s --check-prefix=MEMOP_OPT
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @foo(i8* %dst, i8* %src, i32* %a, i32 %n) !prof !27 {
+entry:
+  br label %for.cond
+
+for.cond:
+  %i.0 = phi i32 [ 0, %entry ], [ %inc5, %for.inc4 ]
+  %cmp = icmp slt i32 %i.0, %n
+  br i1 %cmp, label %for.body, label %for.end6, !prof !28
+
+for.body:
+  br label %for.cond1
+
+for.cond1:
+  %j.0 = phi i32 [ 0, %for.body ], [ %inc, %for.inc ]
+  %idx.ext = sext i32 %i.0 to i64
+  %add.ptr = getelementptr inbounds i32, i32* %a, i64 %idx.ext
+  %0 = load i32, i32* %add.ptr, align 4
+  %cmp2 = icmp slt i32 %j.0, %0
+  br i1 %cmp2, label %for.body3, label %for.end, !prof !29
+
+for.body3:
+  %add = add nsw i32 %i.0, 1
+  %conv = sext i32 %add to i64
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i32 1, i1 false), !prof !30
+  br label %for.inc
+
+; MEMOP_OPT:  switch i64 %conv, label %[[Default_LABEL:.*]] [
+; MEMOP_OPT:    i64 1, label %[[CASE_1_LABEL:.*]]
+; MEMOP_OPT:  ], !prof [[SWITCH_BW:![0-9]+]] 
+; MEMOP_OPT: [[CASE_1_LABEL]]:
+; MEMOP_OPT:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 1, i32 1, i1 false)
+; MEMOP_OPT:   br label %[[MERGE_LABEL:.*]]
+; MEMOP_OPT: [[Default_LABEL]]:
+; MEMOP_OPT:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i32 1, i1 false)
+; MEMOP_OPT-NOT:   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %conv, i32 1, i1 false), !prof
+; MEMOP_OPT:   br label %[[MERGE_LABEL]]
+; MEMOP_OPT: [[MERGE_LABEL]]:
+; MEMOP_OPT:   br label %for.inc
+; MEMOP_OPT: [[SWITCH_BW]] = !{!"branch_weights", i32 457, i32 99}
+
+for.inc:
+  %inc = add nsw i32 %j.0, 1
+  br label %for.cond1
+
+for.end:
+  br label %for.inc4
+
+for.inc4:
+  %inc5 = add nsw i32 %i.0, 1
+  br label %for.cond
+
+for.end6:
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+
+!0 = !{i32 1, !"ProfileSummary", !1}
+!1 = !{!2, !3, !4, !5, !6, !7, !8, !9}
+!2 = !{!"ProfileFormat", !"InstrProf"}
+!3 = !{!"TotalCount", i64 579}
+!4 = !{!"MaxCount", i64 556}
+!5 = !{!"MaxInternalCount", i64 20}
+!6 = !{!"MaxFunctionCount", i64 556}
+!7 = !{!"NumCounts", i64 6}
+!8 = !{!"NumFunctions", i64 3}
+!9 = !{!"DetailedSummary", !10}
+!10 = !{!11, !12, !13, !14, !15, !16, !16, !17, !17, !18, !19, !20, !21, !22, !23, !24, !25, !26}
+!11 = !{i32 10000, i64 556, i32 1}
+!12 = !{i32 100000, i64 556, i32 1}
+!13 = !{i32 200000, i64 556, i32 1}
+!14 = !{i32 300000, i64 556, i32 1}
+!15 = !{i32 400000, i64 556, i32 1}
+!16 = !{i32 500000, i64 556, i32 1}
+!17 = !{i32 600000, i64 556, i32 1}
+!18 = !{i32 700000, i64 556, i32 1}
+!19 = !{i32 800000, i64 556, i32 1}
+!20 = !{i32 900000, i64 556, i32 1}
+!21 = !{i32 950000, i64 556, i32 1}
+!22 = !{i32 990000, i64 20, i32 2}
+!23 = !{i32 999000, i64 1, i32 5}
+!24 = !{i32 999900, i64 1, i32 5}
+!25 = !{i32 999990, i64 1, i32 5}
+!26 = !{i32 999999, i64 1, i32 5}
+!27 = !{!"function_entry_count", i64 1}
+!28 = !{!"branch_weights", i32 20, i32 1}
+!29 = !{!"branch_weights", i32 556, i32 20}
+!30 = !{!"VP", i32 1, i64 556, i64 1, i64 99, i64 2, i64 88, i64 3, i64 77, i64 9, i64 72, i64 4, i64 66, i64 5, i64 55, i64 6, i64 44, i64 7, i64 33, i64 8, i64 22}
+
+declare void @llvm.lifetime.start(i64, i8* nocapture)
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+
+declare void @llvm.lifetime.end(i64, i8* nocapture)
diff --git a/test/Transforms/PGOProfile/statics_counter_naming.ll b/test/Transforms/PGOProfile/statics_counter_naming.ll
index c882406ffe54c098770dcab00fabe6a7c9195bef..c329ddba9300443c02a312382a193236c6443de6 100644
--- a/test/Transforms/PGOProfile/statics_counter_naming.ll
+++ b/test/Transforms/PGOProfile/statics_counter_naming.ll
@@ -1,9 +1,14 @@
-; RUN: opt %s -pgo-instr-gen -S | FileCheck %s --check-prefix=GEN
-; RUN: opt %s -passes=pgo-instr-gen -S | FileCheck %s --check-prefix=GEN
+; RUN: opt %s -pgo-instr-gen -static-func-full-module-prefix=false -S | FileCheck %s --check-prefix=NOPATH
+; RUN: opt %s -passes=pgo-instr-gen -static-func-full-module-prefix=false -S | FileCheck %s --check-prefix=NOPATH
+; RUN: opt %s --pgo-instr-gen -static-func-strip-dirname-prefix=1000 -S | FileCheck %s --check-prefix=NOPATH
+; RUN: opt %s -passes=pgo-instr-gen -static-func-strip-dirname-prefix=1000 -S | FileCheck %s --check-prefix=NOPATH
+; RUN: opt %s --pgo-instr-gen -static-func-strip-dirname-prefix=1 -S | FileCheck %s --check-prefix=HASPATH
+; RUN: opt %s -passes=pgo-instr-gen -static-func-strip-dirname-prefix=1 -S | FileCheck %s --check-prefix=HASPATH
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
-; GEN: @__profn_statics_counter_naming.ll_func = private constant [30 x i8] c"statics_counter_naming.ll:func"
+; NOPATH: @__profn_statics_counter_naming.ll_func = private constant [30 x i8] c"statics_counter_naming.ll:func"
+; HASPATH-NOT: @__profn_statics_counter_naming.ll_func = private constant [30 x i8] c"statics_counter_naming.ll:func"
 
 define internal i32 @func() {
 entry:
diff --git a/test/Transforms/PGOProfile/thinlto_samplepgo_icp.ll b/test/Transforms/PGOProfile/thinlto_samplepgo_icp.ll
new file mode 100644
index 0000000000000000000000000000000000000000..dfb6816db5f24195771ee970ecafa24e3fd7fc6e
--- /dev/null
+++ b/test/Transforms/PGOProfile/thinlto_samplepgo_icp.ll
@@ -0,0 +1,63 @@
+; Do setup work for all below tests: generate bitcode and combined index
+; RUN: opt -module-summary %s -o %t.bc
+; RUN: opt -module-summary %p/Inputs/thinlto_samplepgo_icp.ll -o %t2.bc
+; RUN: llvm-lto -thinlto -o %t3 %t.bc %t2.bc
+
+; Checks if calls to static target functions are properly imported and promoted
+; by ICP. Note that the GUID in the profile is from the original name.
+; RUN: opt -function-import -summary-file %t3.thinlto.bc %t.bc -o %t4.bc -print-imports 2>&1 | FileCheck %s --check-prefix=IMPORTS
+; IMPORTS: Import _ZL3foov.llvm.0
+; RUN: opt %t4.bc -icp-lto -pgo-icall-prom -S -icp-count-threshold=1 | FileCheck %s --check-prefix=ICALL-PROM
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@fptr = local_unnamed_addr global void ()* null, align 8
+
+; Function Attrs: norecurse uwtable
+define i32 @main() local_unnamed_addr #0 !prof !34 {
+entry:
+  %0 = load void ()*, void ()** @fptr, align 8
+; ICALL-PROM:   br i1 %{{[0-9]+}}, label %if.true.direct_targ, label %if.false.orig_indirect
+  tail call void %0(), !prof !40
+  ret i32 0
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3,!4}
+!llvm.ident = !{!31}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 (trunk 297016)", isOptimized: true, runtimeVersion: 0, emissionKind: NoDebug, enums: !2)
+!1 = !DIFile(filename: "main.cc", directory: ".")
+!2 = !{}
+!3 = !{i32 2, !"Debug Info Version", i32 3}
+!4 = !{i32 1, !"ProfileSummary", !5}
+!5 = !{!6, !7, !8, !9, !10, !11, !12, !13}
+!6 = !{!"ProfileFormat", !"SampleProfile"}
+!7 = !{!"TotalCount", i64 3003}
+!8 = !{!"MaxCount", i64 3000}
+!9 = !{!"MaxInternalCount", i64 0}
+!10 = !{!"MaxFunctionCount", i64 0}
+!11 = !{!"NumCounts", i64 3}
+!12 = !{!"NumFunctions", i64 1}
+!13 = !{!"DetailedSummary", !14}
+!14 = !{!15, !16, !17, !18, !19, !20, !20, !21, !21, !22, !23, !24, !25, !26, !27, !28, !29, !30}
+!15 = !{i32 10000, i64 3000, i32 1}
+!16 = !{i32 100000, i64 3000, i32 1}
+!17 = !{i32 200000, i64 3000, i32 1}
+!18 = !{i32 300000, i64 3000, i32 1}
+!19 = !{i32 400000, i64 3000, i32 1}
+!20 = !{i32 500000, i64 3000, i32 1}
+!21 = !{i32 600000, i64 3000, i32 1}
+!22 = !{i32 700000, i64 3000, i32 1}
+!23 = !{i32 800000, i64 3000, i32 1}
+!24 = !{i32 900000, i64 3000, i32 1}
+!25 = !{i32 950000, i64 3000, i32 1}
+!26 = !{i32 990000, i64 3000, i32 1}
+!27 = !{i32 999000, i64 3000, i32 1}
+!28 = !{i32 999900, i64 2, i32 2}
+!29 = !{i32 999990, i64 2, i32 2}
+!30 = !{i32 999999, i64 2, i32 2}
+!31 = !{!"clang version 5.0.0 (trunk 297016)"}
+!34 = !{!"function_entry_count", i64 1}
+!40 = !{!"VP", i32 0, i64 3000, i64 -8789629626369651636, i64 3000}
diff --git a/test/Transforms/Reassociate/basictest.ll b/test/Transforms/Reassociate/basictest.ll
index 11c67bea2cb0a2a1680806813f644383b0902494..4703fd7621b63e203dcaae95dfe277b6c91d8979 100644
--- a/test/Transforms/Reassociate/basictest.ll
+++ b/test/Transforms/Reassociate/basictest.ll
@@ -222,3 +222,23 @@ define i32 @test15(i32 %X1, i32 %X2, i32 %X3) {
 ; CHECK-LABEL: @test15
 ; CHECK: and i1 %A, %B
 }
+
+; PR30256 - previously this asserted.
+; CHECK-LABEL: @test16
+; CHECK: %[[FACTOR:.*]] = mul i64 %a, -4
+; CHECK-NEXT: %[[RES:.*]] = add i64 %[[FACTOR]], %b
+; CHECK-NEXT: ret i64 %[[RES]]
+define i64 @test16(i1 %cmp, i64 %a, i64 %b) {
+entry:
+  %shl = shl i64 %a, 1
+  %shl.neg = sub i64 0, %shl
+  br i1 %cmp, label %if.then, label %if.end
+
+if.then:                                          ; preds = %entry
+  %add1 = add i64 %shl.neg, %shl.neg
+  %add2 = add i64 %add1, %b
+  ret i64 %add2
+
+if.end:                                           ; preds = %entry
+  ret i64 0
+}
diff --git a/test/Transforms/RewriteStatepointsForGC/base-vector.ll b/test/Transforms/RewriteStatepointsForGC/base-vector.ll
index 9026275cf682c545342dbaf76208dd5e64b5e8c6..c34462f4516908bd187de0231a65dc09f900d3f5 100644
--- a/test/Transforms/RewriteStatepointsForGC/base-vector.ll
+++ b/test/Transforms/RewriteStatepointsForGC/base-vector.ll
@@ -88,6 +88,7 @@ entry:
 }
 
 declare void @use(i64 addrspace(1)*) "gc-leaf-function"
+declare void @use_vec(<4 x i64 addrspace(1)*>) "gc-leaf-function"
 
 define void @test5(i1 %cnd, i64 addrspace(1)* %obj) gc "statepoint-example" {
 ; CHECK-LABEL: @test5
@@ -245,3 +246,17 @@ next:
   ret i64 addrspace(1)* %bdv
 }
 declare void @do_safepoint()
+
+define void @test11(<4 x i64 addrspace(1)*> %vec1) gc "statepoint-example" {
+; CHECK-LABEL: @test11(
+; CHECK: @llvm.experimental.gc.statepoint.p0f_isVoidf{{.*}}<4 x i64 addrspace(1)*> %vec1)
+; CHECK: %vec1.relocated = call coldcc <4 x i8 addrspace(1)*> @llvm.experimental.gc.relocate.v4p1i8
+; CHECK: %vec1.relocated.casted = bitcast <4 x i8 addrspace(1)*> %vec1.relocated to <4 x i64 addrspace(1)*>
+; CHECK: %vec2.remat = getelementptr i64, <4 x i64 addrspace(1)*> %vec1.relocated.casted, i32 1024
+; CHECK: call void @use_vec(<4 x i64 addrspace(1)*> %vec2.remat)
+entry:
+  %vec2 = getelementptr i64, <4 x i64 addrspace(1)*> %vec1, i32 1024
+  call void @do_safepoint() [ "deopt"(i32 0, i32 -1, i32 0, i32 0, i32 0) ]
+  call void @use_vec(<4 x i64 addrspace(1) *> %vec2)
+  ret void
+}
diff --git a/test/Transforms/SCCP/indirectbr.ll b/test/Transforms/SCCP/indirectbr.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b977961ca49b412d16dc8dd475c03ef21396713c
--- /dev/null
+++ b/test/Transforms/SCCP/indirectbr.ll
@@ -0,0 +1,76 @@
+; RUN: opt -S -sccp < %s | FileCheck %s
+
+declare void @BB0_f()
+declare void @BB1_f()
+
+; Make sure we can eliminate what is in BB0 as we know that the indirectbr is going to BB1.
+;
+; CHECK-LABEL: define void @indbrtest1(
+; CHECK-NOT: call void @BB0_f()
+; CHECK: ret void
+define void @indbrtest1() {
+entry:
+  indirectbr i8* blockaddress(@indbrtest1, %BB1), [label %BB0, label %BB1]
+BB0:
+  call void @BB0_f()
+  br label %BB1
+BB1:
+  call void @BB1_f()
+  ret void
+}
+
+; Make sure we can eliminate what is in BB0 as we know that the indirectbr is going to BB1
+; by looking through the casts. The casts should be folded away when they are visited
+; before the indirectbr instruction.
+;
+; CHECK-LABEL: define void @indbrtest2(
+; CHECK-NOT: call void @BB0_f()
+; CHECK: ret void
+define void @indbrtest2() {
+entry:
+  %a = ptrtoint i8* blockaddress(@indbrtest2, %BB1) to i64 ; pointer -> integer
+  %b = inttoptr i64 %a to i8*                              ; integer -> pointer
+  %c = bitcast i8* %b to i8*                               ; same-type bitcast, last link of the cast chain
+  indirectbr i8* %c, [label %BB0, label %BB1]              ; branch on the end of the chain so every cast is exercised
+BB0:                                                       ; expected to be dead: the address above is BB1
+  call void @BB0_f()
+  br label %BB1
+BB1:
+  call void @BB1_f()
+  ret void
+}
+
+; Make sure we cannot eliminate BB0, as the target of the indirectbr is unknown (it is loaded from %Q).
+;
+; CHECK-LABEL: define void @indbrtest3(
+; CHECK: call void @BB0_f()
+; CHECK: ret void
+define void @indbrtest3(i8** %Q) {
+entry:
+  %t = load i8*, i8** %Q
+  indirectbr i8* %t, [label %BB0, label %BB1]
+BB0:
+  call void @BB0_f()
+  br label %BB1
+BB1:
+  call void @BB1_f()
+  ret void
+}
+
+; Make sure the call in BB0 is kept, as we pick the first successor (BB0) on undef; BB1 stays reachable through BB0.
+;
+; CHECK-LABEL: define void @indbrtest4(
+; CHECK: call void @BB0_f()
+; CHECK: ret void
+define void @indbrtest4(i8** %Q) {
+entry:
+  indirectbr i8* undef, [label %BB0, label %BB1]
+BB0:
+  call void @BB0_f()
+  br label %BB1
+BB1:
+  call void @BB1_f()
+  ret void
+}
+
+
diff --git a/test/Transforms/SCCP/loadtest.ll b/test/Transforms/SCCP/loadtest.ll
index b88b44b76040bc8ab8540103cd53b776a7394c16..89c7371625ad3eae427a14dc550669654b1b0654 100644
--- a/test/Transforms/SCCP/loadtest.ll
+++ b/test/Transforms/SCCP/loadtest.ll
@@ -1,7 +1,7 @@
 ; This test makes sure that these instructions are properly constant propagated.
 
-; RUN: opt < %s -default-data-layout="e-p:32:32" -sccp -S | FileCheck %s
-; RUN: opt < %s -default-data-layout="E-p:32:32" -sccp -S | FileCheck %s
+; RUN: opt < %s -data-layout="e-p:32:32" -sccp -S | FileCheck %s
+; RUN: opt < %s -data-layout="E-p:32:32" -sccp -S | FileCheck %s
 
 ; CHECK-NOT: load
 
diff --git a/test/Transforms/SLPVectorizer/AArch64/gather-root.ll b/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
index 2a9fc9e1c03e56e3de061e2d45a8029632886d83..b7fa5452f25182895446a882240830a66b9b0438 100644
--- a/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
+++ b/test/Transforms/SLPVectorizer/AArch64/gather-root.ll
@@ -1,3 +1,4 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -slp-vectorizer -S | FileCheck %s --check-prefix=DEFAULT
 ; RUN: opt < %s -slp-schedule-budget=0 -slp-min-tree-size=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=GATHER
 ; RUN: opt < %s -slp-schedule-budget=0 -slp-threshold=-30 -slp-vectorizer -S | FileCheck %s --check-prefix=MAX-COST
@@ -8,7 +9,7 @@ target triple = "aarch64--linux-gnu"
 @a = common global [80 x i8] zeroinitializer, align 16
 
 ; DEFAULT-LABEL: @PR28330(
-; DEFAULT: %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ]
+; DEFAULT: %tmp17 = phi i32 [ %bin.extra, %for.body ], [ 0, %entry ]
 ; DEFAULT: %[[S0:.+]] = select <8 x i1> %1, <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
 ; DEFAULT: %[[R0:.+]] = shufflevector <8 x i32> %[[S0]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
 ; DEFAULT: %[[R1:.+]] = add <8 x i32> %[[S0]], %[[R0]]
@@ -17,10 +18,10 @@ target triple = "aarch64--linux-gnu"
 ; DEFAULT: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; DEFAULT: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]]
 ; DEFAULT: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0
-; DEFAULT: %tmp34 = add i32 %[[R6]], %tmp17
+; DEFAULT: %bin.extra = add i32 %[[R6]], %tmp17
 ;
 ; GATHER-LABEL: @PR28330(
-; GATHER: %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ]
+; GATHER: %tmp17 = phi i32 [ %bin.extra, %for.body ], [ 0, %entry ]
 ; GATHER: %tmp19 = select i1 %tmp1, i32 -720, i32 -80
 ; GATHER: %tmp21 = select i1 %tmp3, i32 -720, i32 -80
 ; GATHER: %tmp23 = select i1 %tmp5, i32 -720, i32 -80
@@ -44,7 +45,7 @@ target triple = "aarch64--linux-gnu"
 ; GATHER: %[[R4:.+]] = shufflevector <8 x i32> %[[R3]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; GATHER: %[[R5:.+]] = add <8 x i32> %[[R3]], %[[R4]]
 ; GATHER: %[[R6:.+]] = extractelement <8 x i32> %[[R5]], i32 0
-; GATHER: %tmp34 = add i32 %[[R6]], %tmp17
+; GATHER: %bin.extra = add i32 %[[R6]], %tmp17
 ;
 ; MAX-COST-LABEL: @PR28330(
 ; MAX-COST-NOT: shufflevector
@@ -89,3 +90,126 @@ for.body:
   %tmp34 = add i32 %tmp32, %tmp33
   br label %for.body
 }
+
+define void @PR32038(i32 %n) {
+; DEFAULT-LABEL: @PR32038(
+; DEFAULT-NEXT:  entry:
+; DEFAULT-NEXT:    [[TMP0:%.*]] = load <8 x i8>, <8 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <8 x i8>*), align 1
+; DEFAULT-NEXT:    [[TMP1:%.*]] = icmp eq <8 x i8> [[TMP0]], zeroinitializer
+; DEFAULT-NEXT:    br label [[FOR_BODY:%.*]]
+; DEFAULT:       for.body:
+; DEFAULT-NEXT:    [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; DEFAULT-NEXT:    [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> <i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720, i32 -720>, <8 x i32> <i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80, i32 -80>
+; DEFAULT-NEXT:    [[TMP20:%.*]] = add i32 -5, undef
+; DEFAULT-NEXT:    [[TMP22:%.*]] = add i32 [[TMP20]], undef
+; DEFAULT-NEXT:    [[TMP24:%.*]] = add i32 [[TMP22]], undef
+; DEFAULT-NEXT:    [[TMP26:%.*]] = add i32 [[TMP24]], undef
+; DEFAULT-NEXT:    [[TMP28:%.*]] = add i32 [[TMP26]], undef
+; DEFAULT-NEXT:    [[TMP30:%.*]] = add i32 [[TMP28]], undef
+; DEFAULT-NEXT:    [[TMP32:%.*]] = add i32 [[TMP30]], undef
+; DEFAULT-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; DEFAULT-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP2]], [[RDX_SHUF]]
+; DEFAULT-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; DEFAULT-NEXT:    [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; DEFAULT-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; DEFAULT-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; DEFAULT-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; DEFAULT-NEXT:    [[BIN_EXTRA]] = add i32 [[TMP3]], -5
+; DEFAULT-NEXT:    [[TMP34:%.*]] = add i32 [[TMP32]], undef
+; DEFAULT-NEXT:    br label [[FOR_BODY]]
+;
+; GATHER-LABEL: @PR32038(
+; GATHER-NEXT:  entry:
+; GATHER-NEXT:    [[TMP0:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1
+; GATHER-NEXT:    [[TMP1:%.*]] = icmp eq i8 [[TMP0]], 0
+; GATHER-NEXT:    [[TMP2:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2
+; GATHER-NEXT:    [[TMP3:%.*]] = icmp eq i8 [[TMP2]], 0
+; GATHER-NEXT:    [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
+; GATHER-NEXT:    [[TMP5:%.*]] = icmp eq i8 [[TMP4]], 0
+; GATHER-NEXT:    [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
+; GATHER-NEXT:    [[TMP7:%.*]] = icmp eq i8 [[TMP6]], 0
+; GATHER-NEXT:    [[TMP8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
+; GATHER-NEXT:    [[TMP9:%.*]] = icmp eq i8 [[TMP8]], 0
+; GATHER-NEXT:    [[TMP10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
+; GATHER-NEXT:    [[TMP11:%.*]] = icmp eq i8 [[TMP10]], 0
+; GATHER-NEXT:    [[TMP12:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
+; GATHER-NEXT:    [[TMP13:%.*]] = icmp eq i8 [[TMP12]], 0
+; GATHER-NEXT:    [[TMP14:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
+; GATHER-NEXT:    [[TMP15:%.*]] = icmp eq i8 [[TMP14]], 0
+; GATHER-NEXT:    br label [[FOR_BODY:%.*]]
+; GATHER:       for.body:
+; GATHER-NEXT:    [[TMP17:%.*]] = phi i32 [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
+; GATHER-NEXT:    [[TMP19:%.*]] = select i1 [[TMP1]], i32 -720, i32 -80
+; GATHER-NEXT:    [[TMP20:%.*]] = add i32 -5, [[TMP19]]
+; GATHER-NEXT:    [[TMP21:%.*]] = select i1 [[TMP3]], i32 -720, i32 -80
+; GATHER-NEXT:    [[TMP22:%.*]] = add i32 [[TMP20]], [[TMP21]]
+; GATHER-NEXT:    [[TMP23:%.*]] = select i1 [[TMP5]], i32 -720, i32 -80
+; GATHER-NEXT:    [[TMP24:%.*]] = add i32 [[TMP22]], [[TMP23]]
+; GATHER-NEXT:    [[TMP25:%.*]] = select i1 [[TMP7]], i32 -720, i32 -80
+; GATHER-NEXT:    [[TMP26:%.*]] = add i32 [[TMP24]], [[TMP25]]
+; GATHER-NEXT:    [[TMP27:%.*]] = select i1 [[TMP9]], i32 -720, i32 -80
+; GATHER-NEXT:    [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]]
+; GATHER-NEXT:    [[TMP29:%.*]] = select i1 [[TMP11]], i32 -720, i32 -80
+; GATHER-NEXT:    [[TMP30:%.*]] = add i32 [[TMP28]], [[TMP29]]
+; GATHER-NEXT:    [[TMP31:%.*]] = select i1 [[TMP13]], i32 -720, i32 -80
+; GATHER-NEXT:    [[TMP32:%.*]] = add i32 [[TMP30]], [[TMP31]]
+; GATHER-NEXT:    [[TMP33:%.*]] = select i1 [[TMP15]], i32 -720, i32 -80
+; GATHER-NEXT:    [[TMP0:%.*]] = insertelement <8 x i32> undef, i32 [[TMP19]], i32 0
+; GATHER-NEXT:    [[TMP1:%.*]] = insertelement <8 x i32> [[TMP0]], i32 [[TMP21]], i32 1
+; GATHER-NEXT:    [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[TMP23]], i32 2
+; GATHER-NEXT:    [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[TMP25]], i32 3
+; GATHER-NEXT:    [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[TMP27]], i32 4
+; GATHER-NEXT:    [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[TMP29]], i32 5
+; GATHER-NEXT:    [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[TMP31]], i32 6
+; GATHER-NEXT:    [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[TMP33]], i32 7
+; GATHER-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP7]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; GATHER-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP7]], [[RDX_SHUF]]
+; GATHER-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GATHER-NEXT:    [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; GATHER-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; GATHER-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; GATHER-NEXT:    [[TMP8:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; GATHER-NEXT:    [[BIN_EXTRA]] = add i32 [[TMP8]], -5
+; GATHER-NEXT:    [[TMP34:%.*]] = add i32 [[TMP32]], [[TMP33]]
+; GATHER-NEXT:    br label [[FOR_BODY]]
+;
+; MAX-COST-LABEL: @PR32038(
+entry:
+  %tmp0 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1), align 1
+  %tmp1 = icmp eq i8 %tmp0, 0
+  %tmp2 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 2), align 2
+  %tmp3 = icmp eq i8 %tmp2, 0
+  %tmp4 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
+  %tmp5 = icmp eq i8 %tmp4, 0
+  %tmp6 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
+  %tmp7 = icmp eq i8 %tmp6, 0
+  %tmp8 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
+  %tmp9 = icmp eq i8 %tmp8, 0
+  %tmp10 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
+  %tmp11 = icmp eq i8 %tmp10, 0
+  %tmp12 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 7), align 1
+  %tmp13 = icmp eq i8 %tmp12, 0
+  %tmp14 = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 8), align 8
+  %tmp15 = icmp eq i8 %tmp14, 0
+  br label %for.body
+
+for.body:
+  %tmp17 = phi i32 [ %tmp34, %for.body ], [ 0, %entry ]
+  %tmp19 = select i1 %tmp1, i32 -720, i32 -80
+  %tmp20 = add i32 -5, %tmp19
+  %tmp21 = select i1 %tmp3, i32 -720, i32 -80
+  %tmp22 = add i32 %tmp20, %tmp21
+  %tmp23 = select i1 %tmp5, i32 -720, i32 -80
+  %tmp24 = add i32 %tmp22, %tmp23
+  %tmp25 = select i1 %tmp7, i32 -720, i32 -80
+  %tmp26 = add i32 %tmp24, %tmp25
+  %tmp27 = select i1 %tmp9, i32 -720, i32 -80
+  %tmp28 = add i32 %tmp26, %tmp27
+  %tmp29 = select i1 %tmp11, i32 -720, i32 -80
+  %tmp30 = add i32 %tmp28, %tmp29
+  %tmp31 = select i1 %tmp13, i32 -720, i32 -80
+  %tmp32 = add i32 %tmp30, %tmp31
+  %tmp33 = select i1 %tmp15, i32 -720, i32 -80
+  %tmp34 = add i32 %tmp32, %tmp33
+  br label %for.body
+}
diff --git a/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll b/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
index 35763953911b5faf3bd5c22eec8d83833e563fd9..63c6d77954d8294aa44fcf6db35af4641decfbcb 100644
--- a/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
+++ b/test/Transforms/SLPVectorizer/AMDGPU/simplebb.ll
@@ -9,7 +9,7 @@ target datalayout = "e-p:32:32:32-p3:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-
 
 
 ; Simple 3-pair chain with loads and stores
-define void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, double addrspace(3)* %c) {
+define amdgpu_kernel void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, double addrspace(3)* %c) {
 ; CHECK-LABEL: @test1_as_3_3_3(
 ; CHECK: load <2 x double>, <2 x double> addrspace(3)*
 ; CHECK: load <2 x double>, <2 x double> addrspace(3)*
@@ -29,7 +29,7 @@ define void @test1_as_3_3_3(double addrspace(3)* %a, double addrspace(3)* %b, do
   ret void
 }
 
-define void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) {
+define amdgpu_kernel void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) {
 ; CHECK-LABEL: @test1_as_3_0_0(
 ; CHECK: load <2 x double>, <2 x double> addrspace(3)*
 ; CHECK: load <2 x double>, <2 x double>*
@@ -49,7 +49,7 @@ define void @test1_as_3_0_0(double addrspace(3)* %a, double* %b, double* %c) {
   ret void
 }
 
-define void @test1_as_0_0_3(double* %a, double* %b, double addrspace(3)* %c) {
+define amdgpu_kernel void @test1_as_0_0_3(double* %a, double* %b, double addrspace(3)* %c) {
 ; CHECK-LABEL: @test1_as_0_0_3(
 ; CHECK: load <2 x double>, <2 x double>*
 ; CHECK: load <2 x double>, <2 x double>*
diff --git a/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll b/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1a32f659066304a3dfad870b3d9a95c18c474785
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/SystemZ/SLP-cmp-cost-query.ll
@@ -0,0 +1,36 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=systemz-unknown -mcpu=z13 -slp-vectorizer -debug-only=SLP \
+; RUN:   -S -disable-output < %s 2>&1 | FileCheck %s
+;
+; Check that SLP vectorizer gets the right cost difference for a compare
+; node.
+
+; Function Attrs: norecurse nounwind readonly
+define void @fun(i8* nocapture, i32 zeroext) local_unnamed_addr #0 {
+.lr.ph.preheader:
+  br label %.lr.ph
+
+.lr.ph:                                           ; preds = %.lr.ph.preheader, %.lr.ph
+  %2 = phi i32 [ %., %.lr.ph ], [ undef, %.lr.ph.preheader ]
+  %3 = phi i32 [ %.9, %.lr.ph ], [ undef, %.lr.ph.preheader ]
+  %4 = icmp ult i32 %2, %1
+  %5 = select i1 %4, i32 0, i32 %1
+  %. = sub i32 %2, %5
+  %6 = icmp ult i32 %3, %1
+  %7 = select i1 %6, i32 0, i32 %1
+  %.9 = sub i32 %3, %7
+  %8 = zext i32 %. to i64
+  %9 = getelementptr inbounds i8, i8* %0, i64 %8
+  %10 = load i8, i8* %9, align 1
+  %11 = zext i32 %.9 to i64
+  %12 = getelementptr inbounds i8, i8* %0, i64 %11
+  %13 = load i8, i8* %12, align 1
+  %14 = icmp eq i8 %10, %13
+  br i1 %14, label %.lr.ph, label %._crit_edge
+
+._crit_edge:                                      ; preds = %.lr.ph
+  ret void
+
+; CHECK: SLP: Adding cost -1 for bundle that starts with   %4 = icmp ult i32 %2, %1.
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/bitreverse.ll b/test/Transforms/SLPVectorizer/X86/bitreverse.ll
index c6d65bbe6840bf7aea028f7de274f2786caa8091..749e93b04134113429d0de0834f9bf5c1eefb4cb 100644
--- a/test/Transforms/SLPVectorizer/X86/bitreverse.ll
+++ b/test/Transforms/SLPVectorizer/X86/bitreverse.ll
@@ -22,29 +22,11 @@ declare i16 @llvm.bitreverse.i16(i16)
 declare  i8 @llvm.bitreverse.i8(i8)
 
 define void @bitreverse_2i64() #0 {
-; SSE-LABEL: @bitreverse_2i64(
-; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
-; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
-; SSE-NEXT:    [[BITREVERSE0:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD0]])
-; SSE-NEXT:    [[BITREVERSE1:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD1]])
-; SSE-NEXT:    store i64 [[BITREVERSE0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
-; SSE-NEXT:    store i64 [[BITREVERSE1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
-; SSE-NEXT:    ret void
-;
-; AVX-LABEL: @bitreverse_2i64(
-; AVX-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
-; AVX-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
-; AVX-NEXT:    [[BITREVERSE0:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD0]])
-; AVX-NEXT:    [[BITREVERSE1:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD1]])
-; AVX-NEXT:    store i64 [[BITREVERSE0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 0), align 8
-; AVX-NEXT:    store i64 [[BITREVERSE1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i32 0, i64 1), align 8
-; AVX-NEXT:    ret void
-;
-; XOP-LABEL: @bitreverse_2i64(
-; XOP-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8
-; XOP-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]])
-; XOP-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8
-; XOP-NEXT:    ret void
+; CHECK-LABEL: @bitreverse_2i64(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]])
+; CHECK-NEXT:    store <2 x i64> [[TMP2]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 8
+; CHECK-NEXT:    ret void
 ;
   %ld0 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 0), align 8
   %ld1 = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i32 0, i64 1), align 8
@@ -57,40 +39,19 @@ define void @bitreverse_2i64() #0 {
 
 define void @bitreverse_4i64() #0 {
 ; SSE-LABEL: @bitreverse_4i64(
-; SSE-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
-; SSE-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
-; SSE-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
-; SSE-NEXT:    [[BITREVERSE0:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD0]])
-; SSE-NEXT:    [[BITREVERSE1:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD1]])
-; SSE-NEXT:    [[BITREVERSE2:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD2]])
-; SSE-NEXT:    [[BITREVERSE3:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD3]])
-; SSE-NEXT:    store i64 [[BITREVERSE0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
-; SSE-NEXT:    store i64 [[BITREVERSE1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
-; SSE-NEXT:    store i64 [[BITREVERSE2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
-; SSE-NEXT:    store i64 [[BITREVERSE3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
+; SSE-NEXT:    [[TMP1:%.*]] = load <2 x i64>, <2 x i64>* bitcast ([4 x i64]* @src64 to <2 x i64>*), align 4
+; SSE-NEXT:    [[TMP2:%.*]] = load <2 x i64>, <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2) to <2 x i64>*), align 4
+; SSE-NEXT:    [[TMP3:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP1]])
+; SSE-NEXT:    [[TMP4:%.*]] = call <2 x i64> @llvm.bitreverse.v2i64(<2 x i64> [[TMP2]])
+; SSE-NEXT:    store <2 x i64> [[TMP3]], <2 x i64>* bitcast ([4 x i64]* @dst64 to <2 x i64>*), align 4
+; SSE-NEXT:    store <2 x i64> [[TMP4]], <2 x i64>* bitcast (i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2) to <2 x i64>*), align 4
 ; SSE-NEXT:    ret void
 ;
-; AVX1-LABEL: @bitreverse_4i64(
-; AVX1-NEXT:    [[LD0:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 0), align 4
-; AVX1-NEXT:    [[LD1:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 1), align 4
-; AVX1-NEXT:    [[LD2:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 2), align 4
-; AVX1-NEXT:    [[LD3:%.*]] = load i64, i64* getelementptr inbounds ([4 x i64], [4 x i64]* @src64, i64 0, i64 3), align 4
-; AVX1-NEXT:    [[BITREVERSE0:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD0]])
-; AVX1-NEXT:    [[BITREVERSE1:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD1]])
-; AVX1-NEXT:    [[BITREVERSE2:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD2]])
-; AVX1-NEXT:    [[BITREVERSE3:%.*]] = call i64 @llvm.bitreverse.i64(i64 [[LD3]])
-; AVX1-NEXT:    store i64 [[BITREVERSE0]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 0), align 4
-; AVX1-NEXT:    store i64 [[BITREVERSE1]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 1), align 4
-; AVX1-NEXT:    store i64 [[BITREVERSE2]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 2), align 4
-; AVX1-NEXT:    store i64 [[BITREVERSE3]], i64* getelementptr inbounds ([4 x i64], [4 x i64]* @dst64, i64 0, i64 3), align 4
-; AVX1-NEXT:    ret void
-;
-; AVX2-LABEL: @bitreverse_4i64(
-; AVX2-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
-; AVX2-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> [[TMP1]])
-; AVX2-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
-; AVX2-NEXT:    ret void
+; AVX-LABEL: @bitreverse_4i64(
+; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
+; AVX-NEXT:    [[TMP2:%.*]] = call <4 x i64> @llvm.bitreverse.v4i64(<4 x i64> [[TMP1]])
+; AVX-NEXT:    store <4 x i64> [[TMP2]], <4 x i64>* bitcast ([4 x i64]* @dst64 to <4 x i64>*), align 4
+; AVX-NEXT:    ret void
 ;
 ; XOP-LABEL: @bitreverse_4i64(
 ; XOP-NEXT:    [[TMP1:%.*]] = load <4 x i64>, <4 x i64>* bitcast ([4 x i64]* @src64 to <4 x i64>*), align 4
@@ -114,32 +75,11 @@ define void @bitreverse_4i64() #0 {
 }
 
 define void @bitreverse_4i32() #0 {
-; SSE-LABEL: @bitreverse_4i32(
-; SSE-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
-; SSE-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
-; SSE-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 4
-; SSE-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 4
-; SSE-NEXT:    [[BITREVERSE0:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD0]])
-; SSE-NEXT:    [[BITREVERSE1:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD1]])
-; SSE-NEXT:    [[BITREVERSE2:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD2]])
-; SSE-NEXT:    [[BITREVERSE3:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD3]])
-; SSE-NEXT:    store i32 [[BITREVERSE0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 4
-; SSE-NEXT:    store i32 [[BITREVERSE1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 4
-; SSE-NEXT:    store i32 [[BITREVERSE2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 4
-; SSE-NEXT:    store i32 [[BITREVERSE3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 4
-; SSE-NEXT:    ret void
-;
-; AVX-LABEL: @bitreverse_4i32(
-; AVX-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
-; AVX-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]])
-; AVX-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
-; AVX-NEXT:    ret void
-;
-; XOP-LABEL: @bitreverse_4i32(
-; XOP-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
-; XOP-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]])
-; XOP-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
-; XOP-NEXT:    ret void
+; CHECK-LABEL: @bitreverse_4i32(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]])
+; CHECK-NEXT:    store <4 x i32> [[TMP2]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 4
+; CHECK-NEXT:    ret void
 ;
   %ld0 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 4
   %ld1 = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 4
@@ -158,30 +98,12 @@ define void @bitreverse_4i32() #0 {
 
 define void @bitreverse_8i32() #0 {
 ; SSE-LABEL: @bitreverse_8i32(
-; SSE-NEXT:    [[LD0:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 0), align 2
-; SSE-NEXT:    [[LD1:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 2), align 2
-; SSE-NEXT:    [[LD3:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 3), align 2
-; SSE-NEXT:    [[LD4:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4), align 2
-; SSE-NEXT:    [[LD5:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 5), align 2
-; SSE-NEXT:    [[LD6:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 6), align 2
-; SSE-NEXT:    [[LD7:%.*]] = load i32, i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 7), align 2
-; SSE-NEXT:    [[BITREVERSE0:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD0]])
-; SSE-NEXT:    [[BITREVERSE1:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD1]])
-; SSE-NEXT:    [[BITREVERSE2:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD2]])
-; SSE-NEXT:    [[BITREVERSE3:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD3]])
-; SSE-NEXT:    [[BITREVERSE4:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD4]])
-; SSE-NEXT:    [[BITREVERSE5:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD5]])
-; SSE-NEXT:    [[BITREVERSE6:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD6]])
-; SSE-NEXT:    [[BITREVERSE7:%.*]] = call i32 @llvm.bitreverse.i32(i32 [[LD7]])
-; SSE-NEXT:    store i32 [[BITREVERSE0]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 0), align 2
-; SSE-NEXT:    store i32 [[BITREVERSE1]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 1), align 2
-; SSE-NEXT:    store i32 [[BITREVERSE2]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 2), align 2
-; SSE-NEXT:    store i32 [[BITREVERSE3]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 3), align 2
-; SSE-NEXT:    store i32 [[BITREVERSE4]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4), align 2
-; SSE-NEXT:    store i32 [[BITREVERSE5]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 5), align 2
-; SSE-NEXT:    store i32 [[BITREVERSE6]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 6), align 2
-; SSE-NEXT:    store i32 [[BITREVERSE7]], i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 7), align 2
+; SSE-NEXT:    [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([8 x i32]* @src32 to <4 x i32>*), align 2
+; SSE-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @src32, i32 0, i64 4) to <4 x i32>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP1]])
+; SSE-NEXT:    [[TMP4:%.*]] = call <4 x i32> @llvm.bitreverse.v4i32(<4 x i32> [[TMP2]])
+; SSE-NEXT:    store <4 x i32> [[TMP3]], <4 x i32>* bitcast ([8 x i32]* @dst32 to <4 x i32>*), align 2
+; SSE-NEXT:    store <4 x i32> [[TMP4]], <4 x i32>* bitcast (i32* getelementptr inbounds ([8 x i32], [8 x i32]* @dst32, i32 0, i64 4) to <4 x i32>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @bitreverse_8i32(
@@ -224,44 +146,11 @@ define void @bitreverse_8i32() #0 {
 }
 
 define void @bitreverse_8i16() #0 {
-; SSE-LABEL: @bitreverse_8i16(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
-; SSE-NEXT:    [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
-; SSE-NEXT:    [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
-; SSE-NEXT:    [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
-; SSE-NEXT:    [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
-; SSE-NEXT:    [[BITREVERSE0:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD0]])
-; SSE-NEXT:    [[BITREVERSE1:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD1]])
-; SSE-NEXT:    [[BITREVERSE2:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD2]])
-; SSE-NEXT:    [[BITREVERSE3:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD3]])
-; SSE-NEXT:    [[BITREVERSE4:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD4]])
-; SSE-NEXT:    [[BITREVERSE5:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD5]])
-; SSE-NEXT:    [[BITREVERSE6:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD6]])
-; SSE-NEXT:    [[BITREVERSE7:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD7]])
-; SSE-NEXT:    store i16 [[BITREVERSE0]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE1]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE2]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE3]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE4]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE5]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE6]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE7]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
-; SSE-NEXT:    ret void
-;
-; AVX-LABEL: @bitreverse_8i16(
-; AVX-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
-; AVX-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]])
-; AVX-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
-; AVX-NEXT:    ret void
-;
-; XOP-LABEL: @bitreverse_8i16(
-; XOP-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
-; XOP-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]])
-; XOP-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
-; XOP-NEXT:    ret void
+; CHECK-LABEL: @bitreverse_8i16(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]])
+; CHECK-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
+; CHECK-NEXT:    ret void
 ;
   %ld0 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
   %ld1 = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
@@ -292,54 +181,12 @@ define void @bitreverse_8i16() #0 {
 
 define void @bitreverse_16i16() #0 {
 ; SSE-LABEL: @bitreverse_16i16(
-; SSE-NEXT:    [[LD0:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 0), align 2
-; SSE-NEXT:    [[LD1:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 1), align 2
-; SSE-NEXT:    [[LD2:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 2), align 2
-; SSE-NEXT:    [[LD3:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 3), align 2
-; SSE-NEXT:    [[LD4:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 4), align 2
-; SSE-NEXT:    [[LD5:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 5), align 2
-; SSE-NEXT:    [[LD6:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 6), align 2
-; SSE-NEXT:    [[LD7:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 7), align 2
-; SSE-NEXT:    [[LD8:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8), align 2
-; SSE-NEXT:    [[LD9:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 9), align 2
-; SSE-NEXT:    [[LD10:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 10), align 2
-; SSE-NEXT:    [[LD11:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 11), align 2
-; SSE-NEXT:    [[LD12:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 12), align 2
-; SSE-NEXT:    [[LD13:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 13), align 2
-; SSE-NEXT:    [[LD14:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 14), align 2
-; SSE-NEXT:    [[LD15:%.*]] = load i16, i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 15), align 2
-; SSE-NEXT:    [[BITREVERSE0:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD0]])
-; SSE-NEXT:    [[BITREVERSE1:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD1]])
-; SSE-NEXT:    [[BITREVERSE2:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD2]])
-; SSE-NEXT:    [[BITREVERSE3:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD3]])
-; SSE-NEXT:    [[BITREVERSE4:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD4]])
-; SSE-NEXT:    [[BITREVERSE5:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD5]])
-; SSE-NEXT:    [[BITREVERSE6:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD6]])
-; SSE-NEXT:    [[BITREVERSE7:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD7]])
-; SSE-NEXT:    [[BITREVERSE8:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD8]])
-; SSE-NEXT:    [[BITREVERSE9:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD9]])
-; SSE-NEXT:    [[BITREVERSE10:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD10]])
-; SSE-NEXT:    [[BITREVERSE11:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD11]])
-; SSE-NEXT:    [[BITREVERSE12:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD12]])
-; SSE-NEXT:    [[BITREVERSE13:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD13]])
-; SSE-NEXT:    [[BITREVERSE14:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD14]])
-; SSE-NEXT:    [[BITREVERSE15:%.*]] = call i16 @llvm.bitreverse.i16(i16 [[LD15]])
-; SSE-NEXT:    store i16 [[BITREVERSE0]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 0), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE1]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 1), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE2]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 2), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE3]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 3), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE4]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 4), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE5]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 5), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE6]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 6), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE7]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 7), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE8]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE9]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 9), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE10]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 10), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE11]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 11), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE12]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 12), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE13]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 13), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE14]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 14), align 2
-; SSE-NEXT:    store i16 [[BITREVERSE15]], i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 15), align 2
+; SSE-NEXT:    [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* bitcast ([16 x i16]* @src16 to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP2:%.*]] = load <8 x i16>, <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @src16, i16 0, i64 8) to <8 x i16>*), align 2
+; SSE-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP1]])
+; SSE-NEXT:    [[TMP4:%.*]] = call <8 x i16> @llvm.bitreverse.v8i16(<8 x i16> [[TMP2]])
+; SSE-NEXT:    store <8 x i16> [[TMP3]], <8 x i16>* bitcast ([16 x i16]* @dst16 to <8 x i16>*), align 2
+; SSE-NEXT:    store <8 x i16> [[TMP4]], <8 x i16>* bitcast (i16* getelementptr inbounds ([16 x i16], [16 x i16]* @dst16, i16 0, i64 8) to <8 x i16>*), align 2
 ; SSE-NEXT:    ret void
 ;
 ; AVX-LABEL: @bitreverse_16i16(
@@ -406,68 +253,11 @@ define void @bitreverse_16i16() #0 {
 }
 
 define void @bitreverse_16i8() #0 {
-; SSE-LABEL: @bitreverse_16i8(
-; SSE-NEXT:    [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
-; SSE-NEXT:    [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
-; SSE-NEXT:    [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
-; SSE-NEXT:    [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
-; SSE-NEXT:    [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
-; SSE-NEXT:    [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
-; SSE-NEXT:    [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1
-; SSE-NEXT:    [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1
-; SSE-NEXT:    [[LD8:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1
-; SSE-NEXT:    [[LD9:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1
-; SSE-NEXT:    [[LD10:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
-; SSE-NEXT:    [[LD11:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
-; SSE-NEXT:    [[LD12:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
-; SSE-NEXT:    [[LD13:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
-; SSE-NEXT:    [[LD14:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
-; SSE-NEXT:    [[LD15:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
-; SSE-NEXT:    [[BITREVERSE0:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD0]])
-; SSE-NEXT:    [[BITREVERSE1:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD1]])
-; SSE-NEXT:    [[BITREVERSE2:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD2]])
-; SSE-NEXT:    [[BITREVERSE3:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD3]])
-; SSE-NEXT:    [[BITREVERSE4:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD4]])
-; SSE-NEXT:    [[BITREVERSE5:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD5]])
-; SSE-NEXT:    [[BITREVERSE6:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD6]])
-; SSE-NEXT:    [[BITREVERSE7:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD7]])
-; SSE-NEXT:    [[BITREVERSE8:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD8]])
-; SSE-NEXT:    [[BITREVERSE9:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD9]])
-; SSE-NEXT:    [[BITREVERSE10:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD10]])
-; SSE-NEXT:    [[BITREVERSE11:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD11]])
-; SSE-NEXT:    [[BITREVERSE12:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD12]])
-; SSE-NEXT:    [[BITREVERSE13:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD13]])
-; SSE-NEXT:    [[BITREVERSE14:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD14]])
-; SSE-NEXT:    [[BITREVERSE15:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD15]])
-; SSE-NEXT:    store i8 [[BITREVERSE0]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
-; SSE-NEXT:    ret void
-;
-; AVX-LABEL: @bitreverse_16i8(
-; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
-; AVX-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]])
-; AVX-NEXT:    store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
-; AVX-NEXT:    ret void
-;
-; XOP-LABEL: @bitreverse_16i8(
-; XOP-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
-; XOP-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]])
-; XOP-NEXT:    store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
-; XOP-NEXT:    ret void
+; CHECK-LABEL: @bitreverse_16i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]])
+; CHECK-NEXT:    store <16 x i8> [[TMP2]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
+; CHECK-NEXT:    ret void
 ;
   %ld0  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  0), align 1
   %ld1  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  1), align 1
@@ -521,122 +311,14 @@ define void @bitreverse_16i8() #0 {
 }
 
 define void @bitreverse_32i8() #0 {
-; SSE-LABEL: @bitreverse_32i8(
-; SSE-NEXT:    [[LD0:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 0), align 1
-; SSE-NEXT:    [[LD1:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 1), align 1
-; SSE-NEXT:    [[LD2:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 2), align 1
-; SSE-NEXT:    [[LD3:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 3), align 1
-; SSE-NEXT:    [[LD4:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 4), align 1
-; SSE-NEXT:    [[LD5:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 5), align 1
-; SSE-NEXT:    [[LD6:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 6), align 1
-; SSE-NEXT:    [[LD7:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 7), align 1
-; SSE-NEXT:    [[LD8:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 8), align 1
-; SSE-NEXT:    [[LD9:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 9), align 1
-; SSE-NEXT:    [[LD10:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 10), align 1
-; SSE-NEXT:    [[LD11:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 11), align 1
-; SSE-NEXT:    [[LD12:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 12), align 1
-; SSE-NEXT:    [[LD13:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 13), align 1
-; SSE-NEXT:    [[LD14:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 14), align 1
-; SSE-NEXT:    [[LD15:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 15), align 1
-; SSE-NEXT:    [[LD16:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16), align 1
-; SSE-NEXT:    [[LD17:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 17), align 1
-; SSE-NEXT:    [[LD18:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 18), align 1
-; SSE-NEXT:    [[LD19:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 19), align 1
-; SSE-NEXT:    [[LD20:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 20), align 1
-; SSE-NEXT:    [[LD21:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 21), align 1
-; SSE-NEXT:    [[LD22:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 22), align 1
-; SSE-NEXT:    [[LD23:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 23), align 1
-; SSE-NEXT:    [[LD24:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 24), align 1
-; SSE-NEXT:    [[LD25:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 25), align 1
-; SSE-NEXT:    [[LD26:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 26), align 1
-; SSE-NEXT:    [[LD27:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 27), align 1
-; SSE-NEXT:    [[LD28:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 28), align 1
-; SSE-NEXT:    [[LD29:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 29), align 1
-; SSE-NEXT:    [[LD30:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 30), align 1
-; SSE-NEXT:    [[LD31:%.*]] = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 31), align 1
-; SSE-NEXT:    [[BITREVERSE0:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD0]])
-; SSE-NEXT:    [[BITREVERSE1:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD1]])
-; SSE-NEXT:    [[BITREVERSE2:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD2]])
-; SSE-NEXT:    [[BITREVERSE3:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD3]])
-; SSE-NEXT:    [[BITREVERSE4:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD4]])
-; SSE-NEXT:    [[BITREVERSE5:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD5]])
-; SSE-NEXT:    [[BITREVERSE6:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD6]])
-; SSE-NEXT:    [[BITREVERSE7:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD7]])
-; SSE-NEXT:    [[BITREVERSE8:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD8]])
-; SSE-NEXT:    [[BITREVERSE9:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD9]])
-; SSE-NEXT:    [[BITREVERSE10:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD10]])
-; SSE-NEXT:    [[BITREVERSE11:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD11]])
-; SSE-NEXT:    [[BITREVERSE12:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD12]])
-; SSE-NEXT:    [[BITREVERSE13:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD13]])
-; SSE-NEXT:    [[BITREVERSE14:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD14]])
-; SSE-NEXT:    [[BITREVERSE15:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD15]])
-; SSE-NEXT:    [[BITREVERSE16:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD16]])
-; SSE-NEXT:    [[BITREVERSE17:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD17]])
-; SSE-NEXT:    [[BITREVERSE18:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD18]])
-; SSE-NEXT:    [[BITREVERSE19:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD19]])
-; SSE-NEXT:    [[BITREVERSE20:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD20]])
-; SSE-NEXT:    [[BITREVERSE21:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD21]])
-; SSE-NEXT:    [[BITREVERSE22:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD22]])
-; SSE-NEXT:    [[BITREVERSE23:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD23]])
-; SSE-NEXT:    [[BITREVERSE24:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD24]])
-; SSE-NEXT:    [[BITREVERSE25:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD25]])
-; SSE-NEXT:    [[BITREVERSE26:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD26]])
-; SSE-NEXT:    [[BITREVERSE27:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD27]])
-; SSE-NEXT:    [[BITREVERSE28:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD28]])
-; SSE-NEXT:    [[BITREVERSE29:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD29]])
-; SSE-NEXT:    [[BITREVERSE30:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD30]])
-; SSE-NEXT:    [[BITREVERSE31:%.*]] = call i8 @llvm.bitreverse.i8(i8 [[LD31]])
-; SSE-NEXT:    store i8 [[BITREVERSE0]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 0), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE1]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 1), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE2]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 2), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE3]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 3), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE4]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 4), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE5]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 5), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE6]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 6), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE7]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 7), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE8]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 8), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE9]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 9), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE10]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 10), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE11]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 11), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE12]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 12), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE13]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 13), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE14]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 14), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE15]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 15), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE16]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE17]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 17), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE18]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 18), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE19]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 19), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE20]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 20), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE21]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 21), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE22]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 22), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE23]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 23), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE24]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 24), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE25]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 25), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE26]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 26), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE27]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 27), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE28]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 28), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE29]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 29), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE30]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 30), align 1
-; SSE-NEXT:    store i8 [[BITREVERSE31]], i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 31), align 1
-; SSE-NEXT:    ret void
-;
-; AVX-LABEL: @bitreverse_32i8(
-; AVX-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
-; AVX-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
-; AVX-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]])
-; AVX-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP2]])
-; AVX-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
-; AVX-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
-; AVX-NEXT:    ret void
-;
-; XOP-LABEL: @bitreverse_32i8(
-; XOP-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
-; XOP-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
-; XOP-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]])
-; XOP-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP2]])
-; XOP-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
-; XOP-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
-; XOP-NEXT:    ret void
+; CHECK-LABEL: @bitreverse_32i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* bitcast ([32 x i8]* @src8 to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = load <16 x i8>, <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP1]])
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i8> @llvm.bitreverse.v16i8(<16 x i8> [[TMP2]])
+; CHECK-NEXT:    store <16 x i8> [[TMP3]], <16 x i8>* bitcast ([32 x i8]* @dst8 to <16 x i8>*), align 1
+; CHECK-NEXT:    store <16 x i8> [[TMP4]], <16 x i8>* bitcast (i8* getelementptr inbounds ([32 x i8], [32 x i8]* @dst8, i8 0, i64 16) to <16 x i8>*), align 1
+; CHECK-NEXT:    ret void
 ;
   %ld0  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  0), align 1
   %ld1  = load i8, i8* getelementptr inbounds ([32 x i8], [32 x i8]* @src8, i8 0, i64  1), align 1
diff --git a/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll b/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
new file mode 100644
index 0000000000000000000000000000000000000000..ba0059ed4e51ba0527c7d5b11eb9ea871ed4ae9d
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/blending-shuffle.ll
@@ -0,0 +1,167 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -o - -mtriple=x86-64-unknown-linux -mcpu=bdver2 -instcombine | FileCheck %s
+; Pipeline under test: slp-vectorizer followed by instcombine, tuned for bdver2. NOTE(review): the triple is spelled "x86-64" here but "x86_64" in the neighbouring SLP tests - confirm it is parsed as intended.
+define <2 x i8> @g(<2 x i8> %x, <2 x i8> %y) {
+; CHECK-LABEL: @g(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <2 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[Y1:%.*]] = extractelement <2 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT:    [[INS1:%.*]] = insertelement <2 x i8> undef, i8 [[X0X0]], i32 0
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <2 x i8> [[INS1]], i8 [[Y1Y1]], i32 1
+; CHECK-NEXT:    ret <2 x i8> [[INS2]]
+; Input IR: squares lane 0 of %x and lane 1 of %y and packs the two products into a <2 x i8>; the assertions above expect the same scalar instruction sequence to survive (no vector mul is formed).
+  %x0 = extractelement <2 x i8> %x, i32 0
+  %y1 = extractelement <2 x i8> %y, i32 1
+  %x0x0 = mul i8 %x0, %x0
+  %y1y1 = mul i8 %y1, %y1
+  %ins1 = insertelement <2 x i8> undef, i8 %x0x0, i32 0
+  %ins2 = insertelement <2 x i8> %ins1, i8 %y1y1, i32 1
+  ret <2 x i8> %ins2
+}
+
+define <4 x i8> @h(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @h(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
+; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT:    [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
+; CHECK-NEXT:    [[INS1:%.*]] = insertelement <4 x i8> undef, i8 [[X0X0]], i32 0
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x i8> [[INS1]], i8 [[X3X3]], i32 1
+; CHECK-NEXT:    [[INS3:%.*]] = insertelement <4 x i8> [[INS2]], i8 [[Y1Y1]], i32 2
+; CHECK-NEXT:    [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3
+; CHECK-NEXT:    ret <4 x i8> [[INS4]]
+; Input IR: four-lane variant - squares x[0], x[3], y[1], y[2] and inserts the products into result lanes 0..3; the assertions above expect the same scalar sequence to survive.
+  %x0 = extractelement <4 x i8> %x, i32 0
+  %x3 = extractelement <4 x i8> %x, i32 3
+  %y1 = extractelement <4 x i8> %y, i32 1
+  %y2 = extractelement <4 x i8> %y, i32 2
+  %x0x0 = mul i8 %x0, %x0
+  %x3x3 = mul i8 %x3, %x3
+  %y1y1 = mul i8 %y1, %y1
+  %y2y2 = mul i8 %y2, %y2
+  %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
+  %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
+  %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
+  %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
+  ret <4 x i8> %ins4
+}
+
+define <4 x i8> @h_undef(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @h_undef(
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 3
+; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
+; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT:    [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
+; CHECK-NEXT:    [[INS2:%.*]] = insertelement <4 x i8> undef, i8 [[X3X3]], i32 1
+; CHECK-NEXT:    [[INS3:%.*]] = insertelement <4 x i8> [[INS2]], i8 [[Y1Y1]], i32 2
+; CHECK-NEXT:    [[INS4:%.*]] = insertelement <4 x i8> [[INS3]], i8 [[Y2Y2]], i32 3
+; CHECK-NEXT:    ret <4 x i8> [[INS4]]
+; Input IR: same shape as @h, but %x0 is extracted from an undef vector; the assertions above expect the lane-0 extract/mul/insert to disappear, so the insert chain starts from undef at index 1.
+  %x0 = extractelement <4 x i8> undef, i32 0
+  %x3 = extractelement <4 x i8> %x, i32 3
+  %y1 = extractelement <4 x i8> %y, i32 1
+  %y2 = extractelement <4 x i8> %y, i32 2
+  %x0x0 = mul i8 %x0, %x0
+  %x3x3 = mul i8 %x3, %x3
+  %y1y1 = mul i8 %y1, %y1
+  %y2y2 = mul i8 %y2, %y2
+  %ins1 = insertelement <4 x i8> undef, i8 %x0x0, i32 0
+  %ins2 = insertelement <4 x i8> %ins1, i8 %x3x3, i32 1
+  %ins3 = insertelement <4 x i8> %ins2, i8 %y1y1, i32 2
+  %ins4 = insertelement <4 x i8> %ins3, i8 %y2y2, i32 3
+  ret <4 x i8> %ins4
+}
+
+define i8 @i(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @i(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
+; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT:    [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = add i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i8 [[TMP3]]
+; Input IR: squares x[0], x[3], y[1], y[2] and sums the four products into one i8 with a two-level add tree; the assertions above expect the same scalar sequence to survive.
+  %x0 = extractelement <4 x i8> %x, i32 0
+  %x3 = extractelement <4 x i8> %x, i32 3
+  %y1 = extractelement <4 x i8> %y, i32 1
+  %y2 = extractelement <4 x i8> %y, i32 2
+  %x0x0 = mul i8 %x0, %x0
+  %x3x3 = mul i8 %x3, %x3
+  %y1y1 = mul i8 %y1, %y1
+  %y2y2 = mul i8 %y2, %y2
+  %1 = add i8 %x0x0, %x3x3
+  %2 = add i8 %y1y1, %y2y2
+  %3 = add i8 %1, %2
+  ret i8 %3
+}
+
+define i8 @j(<4 x i8> %x, <4 x i8> %y) {
+; CHECK-LABEL: @j(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT:    [[Y1:%.*]] = extractelement <4 x i8> [[Y:%.*]], i32 1
+; CHECK-NEXT:    [[Y2:%.*]] = extractelement <4 x i8> [[Y]], i32 2
+; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT:    [[Y1Y1:%.*]] = mul i8 [[Y1]], [[Y1]]
+; CHECK-NEXT:    [[Y2Y2:%.*]] = mul i8 [[Y2]], [[Y2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[Y1Y1]], [[Y2Y2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i8 [[TMP3]]
+; Input IR: same squares and partial sums as @i, but the two partial sums are combined with sdiv instead of add; the assertions above expect the same scalar sequence to survive.
+  %x0 = extractelement <4 x i8> %x, i32 0
+  %x3 = extractelement <4 x i8> %x, i32 3
+  %y1 = extractelement <4 x i8> %y, i32 1
+  %y2 = extractelement <4 x i8> %y, i32 2
+  %x0x0 = mul i8 %x0, %x0
+  %x3x3 = mul i8 %x3, %x3
+  %y1y1 = mul i8 %y1, %y1
+  %y2y2 = mul i8 %y2, %y2
+  %1 = add i8 %x0x0, %x3x3
+  %2 = add i8 %y1y1, %y2y2
+  %3 = sdiv i8 %1, %2
+  ret i8 %3
+}
+
+define i8 @k(<4 x i8> %x) {
+; CHECK-LABEL: @k(
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <4 x i8> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[X3:%.*]] = extractelement <4 x i8> [[X]], i32 3
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <4 x i8> [[X]], i32 1
+; CHECK-NEXT:    [[X2:%.*]] = extractelement <4 x i8> [[X]], i32 2
+; CHECK-NEXT:    [[X0X0:%.*]] = mul i8 [[X0]], [[X0]]
+; CHECK-NEXT:    [[X3X3:%.*]] = mul i8 [[X3]], [[X3]]
+; CHECK-NEXT:    [[X1X1:%.*]] = mul i8 [[X1]], [[X1]]
+; CHECK-NEXT:    [[X2X2:%.*]] = mul i8 [[X2]], [[X2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = add i8 [[X0X0]], [[X3X3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = add i8 [[X1X1]], [[X2X2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = sdiv i8 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    ret i8 [[TMP3]]
+; Input IR: single-source variant of @j - all four lanes come from %x, extracted in the order 0, 3, 1, 2; the assertions above expect the same scalar sequence to survive.
+  %x0 = extractelement <4 x i8> %x, i32 0
+  %x3 = extractelement <4 x i8> %x, i32 3
+  %x1 = extractelement <4 x i8> %x, i32 1
+  %x2 = extractelement <4 x i8> %x, i32 2
+  %x0x0 = mul i8 %x0, %x0
+  %x3x3 = mul i8 %x3, %x3
+  %x1x1 = mul i8 %x1, %x1
+  %x2x2 = mul i8 %x2, %x2
+  %1 = add i8 %x0x0, %x3x3
+  %2 = add i8 %x1x1, %x2x2
+  %3 = sdiv i8 %1, %2
+  ret i8 %3
+}
diff --git a/test/Transforms/SLPVectorizer/X86/extractelement.ll b/test/Transforms/SLPVectorizer/X86/extractelement.ll
new file mode 100644
index 0000000000000000000000000000000000000000..10675f3be8a667ed3c2266685c566d7df9f2f774
--- /dev/null
+++ b/test/Transforms/SLPVectorizer/X86/extractelement.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-unknown-linux -march=core-avx2 | FileCheck %s
+; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-unknown-linux -march=core-avx2 -slp-threshold=-1 -slp-vectorize-hor-store | FileCheck %s --check-prefix=THRESH1
+; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-unknown-linux -march=core-avx2 -slp-threshold=-2 -slp-vectorize-hor-store | FileCheck %s --check-prefix=THRESH2
+; NOTE(review): the three invocations above pass a CPU name (core-avx2) through -march; presumably -mcpu was intended - confirm, and regenerate the assertions if the flag is changed.
+@a = global float 0.000000e+00, align 4
+
+define float @f(<2 x float> %x) {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:    [[TMP1:%.*]] = fmul <2 x float> [[X:%.*]], [[X]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <2 x float> [[TMP1]], i32 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP1]], i32 1
+; CHECK-NEXT:    [[ADD:%.*]] = fadd float [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    ret float [[ADD]]
+; Input IR: squares both lanes of %x and adds them; the assertions above (default prefix only) expect a single <2 x float> fmul of %x by itself, then two extracts feeding the scalar fadd.
+  %x0 = extractelement <2 x float> %x, i32 0
+  %x1 = extractelement <2 x float> %x, i32 1
+  %x0x0 = fmul float %x0, %x0
+  %x1x1 = fmul float %x1, %x1
+  %add = fadd float %x0x0, %x1x1
+  ret float %add
+}
+
+define float @f_used_out_of_tree(<2 x float> %x) {
+; THRESH2-LABEL: @f_used_out_of_tree(
+; THRESH2-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 0
+; THRESH2-NEXT:    [[TMP2:%.*]] = fmul <2 x float> [[X]], [[X]]
+; THRESH2-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; THRESH2-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; THRESH2-NEXT:    [[ADD:%.*]] = fadd float [[TMP3]], [[TMP4]]
+; THRESH2-NEXT:    store float [[ADD]], float* @a
+; THRESH2-NEXT:    ret float [[TMP1]]
+; Input IR: like @f, but the sum is stored to @a and the lane-0 extract %x0 is also returned; only the -slp-threshold=-2 invocation has assertions, expecting the vector fmul plus a separate lane-0 extract of %x for the return value.
+  %x0 = extractelement <2 x float> %x, i32 0
+  %x1 = extractelement <2 x float> %x, i32 1
+  %x0x0 = fmul float %x0, %x0
+  %x1x1 = fmul float %x1, %x1
+  %add = fadd float %x0x0, %x1x1
+  store float %add, float* @a
+  ret float %x0
+}
+
+define float @f_used_twice_in_tree(<2 x float> %x) {
+; THRESH1-LABEL: @f_used_twice_in_tree(
+; THRESH1-NEXT:    [[TMP1:%.*]] = extractelement <2 x float> [[X:%.*]], i32 1
+; THRESH1-NEXT:    [[TMP2:%.*]] = insertelement <2 x float> undef, float [[TMP1]], i32 0
+; THRESH1-NEXT:    [[TMP3:%.*]] = insertelement <2 x float> [[TMP2]], float [[TMP1]], i32 1
+; THRESH1-NEXT:    [[TMP4:%.*]] = fmul <2 x float> [[X]], [[TMP3]]
+; THRESH1-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP4]], i32 0
+; THRESH1-NEXT:    [[TMP6:%.*]] = extractelement <2 x float> [[TMP4]], i32 1
+; THRESH1-NEXT:    [[ADD:%.*]] = fadd float [[TMP5]], [[TMP6]]
+; THRESH1-NEXT:    ret float [[ADD]]
+; Input IR: %x1 feeds both multiplies (x0*x1 and x1*x1); only the -slp-threshold=-1 invocation has assertions, expecting lane 1 to be extracted once, placed in both lanes of a new <2 x float>, and multiplied with %x.
+  %x0 = extractelement <2 x float> %x, i32 0
+  %x1 = extractelement <2 x float> %x, i32 1
+  %x0x0 = fmul float %x0, %x1
+  %x1x1 = fmul float %x1, %x1
+  %add = fadd float %x0x0, %x1x1
+  ret float %add
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
index faef3eade08c99e7d5fb664d17d7f11cf286503b..73844037f12eaa8364ec898e058a7b5c339c3b06 100644
--- a/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
+++ b/test/Transforms/SLPVectorizer/X86/horizontal-list.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 | FileCheck %s
+; RUN: opt -slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=bdver2 -slp-threshold=-10 | FileCheck %s --check-prefix=THRESHOLD
 
 @n = external local_unnamed_addr global i32, align 4
 @arr = common local_unnamed_addr global [20 x float] zeroinitializer, align 16
@@ -34,6 +35,33 @@ define float @baz() {
 ; CHECK-NEXT:    store float [[ADD19_3]], float* @res, align 4
 ; CHECK-NEXT:    ret float [[ADD19_3]]
 ;
+; THRESHOLD-LABEL: @baz(
+; THRESHOLD-NEXT:  entry:
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
+; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
+; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
+; THRESHOLD-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
+; THRESHOLD-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
+; THRESHOLD-NEXT:    [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
+; THRESHOLD-NEXT:    [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
+; THRESHOLD-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
+; THRESHOLD-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]]
+; THRESHOLD-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
+; THRESHOLD-NEXT:    [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]]
+; THRESHOLD-NEXT:    [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV]]
+; THRESHOLD-NEXT:    [[ADD19:%.*]] = fadd fast float [[TMP4]], [[ADD7]]
+; THRESHOLD-NEXT:    [[ADD19_1:%.*]] = fadd fast float [[TMP5]], [[ADD19]]
+; THRESHOLD-NEXT:    [[ADD19_2:%.*]] = fadd fast float [[TMP9]], [[ADD19_1]]
+; THRESHOLD-NEXT:    [[ADD19_3:%.*]] = fadd fast float [[TMP10]], [[ADD19_2]]
+; THRESHOLD-NEXT:    store float [[ADD19_3]], float* @res, align 4
+; THRESHOLD-NEXT:    ret float [[ADD19_3]]
+;
 entry:
   %0 = load i32, i32* @n, align 4
   %mul = mul nsw i32 %0, 3
@@ -69,39 +97,62 @@ define float @bazz() {
 ; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
-; CHECK-NEXT:    [[TMP2:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
-; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <2 x float> [[TMP2]], [[TMP1]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP3]], i32 0
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP4]], [[CONV]]
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x float> [[TMP3]], i32 1
-; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP5]], [[ADD]]
-; CHECK-NEXT:    [[TMP6:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2) to <2 x float>*), align 8
-; CHECK-NEXT:    [[TMP7:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2) to <2 x float>*), align 8
-; CHECK-NEXT:    [[TMP8:%.*]] = fmul fast <2 x float> [[TMP7]], [[TMP6]]
-; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <2 x float> [[TMP8]], i32 0
-; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP9]], [[ADD_1]]
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <2 x float> [[TMP8]], i32 1
-; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float [[TMP10]], [[ADD_2]]
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
+; CHECK-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float undef, [[CONV]]
+; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
+; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
 ; CHECK-NEXT:    [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
 ; CHECK-NEXT:    [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
 ; CHECK-NEXT:    [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 4) to <2 x float>*), align 16
-; CHECK-NEXT:    [[TMP12:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 4) to <2 x float>*), align 16
-; CHECK-NEXT:    [[TMP13:%.*]] = fmul fast <2 x float> [[TMP12]], [[TMP11]]
-; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <2 x float> [[TMP13]], i32 0
-; CHECK-NEXT:    [[ADD19:%.*]] = fadd fast float [[TMP14]], [[ADD7]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <2 x float> [[TMP13]], i32 1
-; CHECK-NEXT:    [[ADD19_1:%.*]] = fadd fast float [[TMP15]], [[ADD19]]
-; CHECK-NEXT:    [[TMP16:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 6) to <2 x float>*), align 8
-; CHECK-NEXT:    [[TMP17:%.*]] = load <2 x float>, <2 x float>* bitcast (float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 6) to <2 x float>*), align 8
-; CHECK-NEXT:    [[TMP18:%.*]] = fmul fast <2 x float> [[TMP17]], [[TMP16]]
-; CHECK-NEXT:    [[TMP19:%.*]] = extractelement <2 x float> [[TMP18]], i32 0
-; CHECK-NEXT:    [[ADD19_2:%.*]] = fadd fast float [[TMP19]], [[ADD19_1]]
-; CHECK-NEXT:    [[TMP20:%.*]] = extractelement <2 x float> [[TMP18]], i32 1
-; CHECK-NEXT:    [[ADD19_3:%.*]] = fadd fast float [[TMP20]], [[ADD19_2]]
-; CHECK-NEXT:    store float [[ADD19_3]], float* @res, align 4
-; CHECK-NEXT:    ret float [[ADD19_3]]
+; CHECK-NEXT:    [[ADD19:%.*]] = fadd fast float undef, [[ADD7]]
+; CHECK-NEXT:    [[ADD19_1:%.*]] = fadd fast float undef, [[ADD19]]
+; CHECK-NEXT:    [[ADD19_2:%.*]] = fadd fast float undef, [[ADD19_1]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP3]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; CHECK-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; CHECK-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV6]]
+; CHECK-NEXT:    [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]]
+; CHECK-NEXT:    store float [[BIN_EXTRA5]], float* @res, align 4
+; CHECK-NEXT:    ret float [[BIN_EXTRA5]]
+;
+; THRESHOLD-LABEL: @bazz(
+; THRESHOLD-NEXT:  entry:
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
+; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP0]], 3
+; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr to <8 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* bitcast ([20 x float]* @arr1 to <8 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <8 x float> [[TMP2]], [[TMP1]]
+; THRESHOLD-NEXT:    [[ADD:%.*]] = fadd fast float undef, [[CONV]]
+; THRESHOLD-NEXT:    [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
+; THRESHOLD-NEXT:    [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; THRESHOLD-NEXT:    [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; THRESHOLD-NEXT:    [[MUL5:%.*]] = shl nsw i32 [[TMP0]], 2
+; THRESHOLD-NEXT:    [[CONV6:%.*]] = sitofp i32 [[MUL5]] to float
+; THRESHOLD-NEXT:    [[ADD7:%.*]] = fadd fast float [[ADD_3]], [[CONV6]]
+; THRESHOLD-NEXT:    [[ADD19:%.*]] = fadd fast float undef, [[ADD7]]
+; THRESHOLD-NEXT:    [[ADD19_1:%.*]] = fadd fast float undef, [[ADD19]]
+; THRESHOLD-NEXT:    [[ADD19_2:%.*]] = fadd fast float undef, [[ADD19_1]]
+; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP3]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP3]], [[RDX_SHUF]]
+; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; THRESHOLD-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP4]], [[CONV]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV6]]
+; THRESHOLD-NEXT:    [[ADD19_3:%.*]] = fadd fast float undef, [[ADD19_2]]
+; THRESHOLD-NEXT:    store float [[BIN_EXTRA5]], float* @res, align 4
+; THRESHOLD-NEXT:    ret float [[BIN_EXTRA5]]
 ;
 entry:
   %0 = load i32, i32* @n, align 4
@@ -166,6 +217,25 @@ define float @bazzz() {
 ; CHECK-NEXT:    store float [[TMP8]], float* @res, align 4
 ; CHECK-NEXT:    ret float [[TMP8]]
 ;
+; THRESHOLD-LABEL: @bazzz(
+; THRESHOLD-NEXT:  entry:
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
+; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = fadd fast float undef, undef
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = fadd fast float undef, [[TMP4]]
+; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
+; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; THRESHOLD-NEXT:    [[TMP7:%.*]] = fadd fast float undef, [[TMP5]]
+; THRESHOLD-NEXT:    [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]]
+; THRESHOLD-NEXT:    store float [[TMP8]], float* @res, align 4
+; THRESHOLD-NEXT:    ret float [[TMP8]]
+;
 entry:
   %0 = load i32, i32* @n, align 4
   %conv = sitofp i32 %0 to float
@@ -210,6 +280,26 @@ define i32 @foo() {
 ; CHECK-NEXT:    store i32 [[CONV4]], i32* @n, align 4
 ; CHECK-NEXT:    ret i32 [[CONV4]]
 ;
+; THRESHOLD-LABEL: @foo(
+; THRESHOLD-NEXT:  entry:
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = load i32, i32* @n, align 4
+; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[TMP0]] to float
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr to <4 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* bitcast ([20 x float]* @arr1 to <4 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = fmul fast <4 x float> [[TMP2]], [[TMP1]]
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = fadd fast float undef, undef
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = fadd fast float undef, [[TMP4]]
+; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF]]
+; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; THRESHOLD-NEXT:    [[TMP7:%.*]] = fadd fast float undef, [[TMP5]]
+; THRESHOLD-NEXT:    [[TMP8:%.*]] = fmul fast float [[CONV]], [[TMP6]]
+; THRESHOLD-NEXT:    [[CONV4:%.*]] = fptosi float [[TMP8]] to i32
+; THRESHOLD-NEXT:    store i32 [[CONV4]], i32* @n, align 4
+; THRESHOLD-NEXT:    ret i32 [[CONV4]]
+;
 entry:
   %0 = load i32, i32* @n, align 4
   %conv = sitofp i32 %0 to float
@@ -257,6 +347,28 @@ define float @bar() {
 ; CHECK-NEXT:    store float [[MAX_0_MUL3_2]], float* @res, align 4
 ; CHECK-NEXT:    ret float [[MAX_0_MUL3_2]]
 ;
+; THRESHOLD-LABEL: @bar(
+; THRESHOLD-NEXT:  entry:
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr to <2 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <2 x float>, <2 x float>* bitcast ([20 x float]* @arr1 to <2 x float>*), align 16
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = fmul fast <2 x float> [[TMP1]], [[TMP0]]
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
+; THRESHOLD-NEXT:    [[CMP4:%.*]] = fcmp fast ogt float [[TMP3]], [[TMP4]]
+; THRESHOLD-NEXT:    [[MAX_0_MUL3:%.*]] = select i1 [[CMP4]], float [[TMP3]], float [[TMP4]]
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 2), align 8
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 2), align 8
+; THRESHOLD-NEXT:    [[MUL3_1:%.*]] = fmul fast float [[TMP6]], [[TMP5]]
+; THRESHOLD-NEXT:    [[CMP4_1:%.*]] = fcmp fast ogt float [[MAX_0_MUL3]], [[MUL3_1]]
+; THRESHOLD-NEXT:    [[MAX_0_MUL3_1:%.*]] = select i1 [[CMP4_1]], float [[MAX_0_MUL3]], float [[MUL3_1]]
+; THRESHOLD-NEXT:    [[TMP7:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 3), align 4
+; THRESHOLD-NEXT:    [[TMP8:%.*]] = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 3), align 4
+; THRESHOLD-NEXT:    [[MUL3_2:%.*]] = fmul fast float [[TMP8]], [[TMP7]]
+; THRESHOLD-NEXT:    [[CMP4_2:%.*]] = fcmp fast ogt float [[MAX_0_MUL3_1]], [[MUL3_2]]
+; THRESHOLD-NEXT:    [[MAX_0_MUL3_2:%.*]] = select i1 [[CMP4_2]], float [[MAX_0_MUL3_1]], float [[MUL3_2]]
+; THRESHOLD-NEXT:    store float [[MAX_0_MUL3_2]], float* @res, align 4
+; THRESHOLD-NEXT:    ret float [[MAX_0_MUL3_2]]
+;
 entry:
   %0 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr, i64 0, i64 0), align 16
   %1 = load float, float* getelementptr inbounds ([20 x float], [20 x float]* @arr1, i64 0, i64 0), align 16
@@ -403,6 +515,129 @@ define float @f(float* nocapture readonly %x) {
 ; CHECK-NEXT:    [[BIN_RDX17:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
 ; CHECK-NEXT:    [[ADD_47:%.*]] = fadd fast float undef, [[ADD_46]]
 ; CHECK-NEXT:    ret float [[BIN_RDX17]]
+;
+; THRESHOLD-LABEL: @f(
+; THRESHOLD-NEXT:  entry:
+; THRESHOLD-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; THRESHOLD-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; THRESHOLD-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; THRESHOLD-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; THRESHOLD-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; THRESHOLD-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; THRESHOLD-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; THRESHOLD-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
+; THRESHOLD-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
+; THRESHOLD-NEXT:    [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
+; THRESHOLD-NEXT:    [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
+; THRESHOLD-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
+; THRESHOLD-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
+; THRESHOLD-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
+; THRESHOLD-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <16 x float>*
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <16 x float>, <16 x float>* [[TMP0]], align 4
+; THRESHOLD-NEXT:    [[ADD_1:%.*]] = fadd fast float undef, undef
+; THRESHOLD-NEXT:    [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; THRESHOLD-NEXT:    [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; THRESHOLD-NEXT:    [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; THRESHOLD-NEXT:    [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
+; THRESHOLD-NEXT:    [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; THRESHOLD-NEXT:    [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
+; THRESHOLD-NEXT:    [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
+; THRESHOLD-NEXT:    [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
+; THRESHOLD-NEXT:    [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
+; THRESHOLD-NEXT:    [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
+; THRESHOLD-NEXT:    [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
+; THRESHOLD-NEXT:    [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
+; THRESHOLD-NEXT:    [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
+; THRESHOLD-NEXT:    [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
+; THRESHOLD-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
+; THRESHOLD-NEXT:    [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
+; THRESHOLD-NEXT:    [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
+; THRESHOLD-NEXT:    [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
+; THRESHOLD-NEXT:    [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
+; THRESHOLD-NEXT:    [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
+; THRESHOLD-NEXT:    [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
+; THRESHOLD-NEXT:    [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
+; THRESHOLD-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
+; THRESHOLD-NEXT:    [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
+; THRESHOLD-NEXT:    [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
+; THRESHOLD-NEXT:    [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
+; THRESHOLD-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
+; THRESHOLD-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
+; THRESHOLD-NEXT:    [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
+; THRESHOLD-NEXT:    [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31
+; THRESHOLD-NEXT:    [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, float* [[X]], i64 32
+; THRESHOLD-NEXT:    [[ARRAYIDX_33:%.*]] = getelementptr inbounds float, float* [[X]], i64 33
+; THRESHOLD-NEXT:    [[ARRAYIDX_34:%.*]] = getelementptr inbounds float, float* [[X]], i64 34
+; THRESHOLD-NEXT:    [[ARRAYIDX_35:%.*]] = getelementptr inbounds float, float* [[X]], i64 35
+; THRESHOLD-NEXT:    [[ARRAYIDX_36:%.*]] = getelementptr inbounds float, float* [[X]], i64 36
+; THRESHOLD-NEXT:    [[ARRAYIDX_37:%.*]] = getelementptr inbounds float, float* [[X]], i64 37
+; THRESHOLD-NEXT:    [[ARRAYIDX_38:%.*]] = getelementptr inbounds float, float* [[X]], i64 38
+; THRESHOLD-NEXT:    [[ARRAYIDX_39:%.*]] = getelementptr inbounds float, float* [[X]], i64 39
+; THRESHOLD-NEXT:    [[ARRAYIDX_40:%.*]] = getelementptr inbounds float, float* [[X]], i64 40
+; THRESHOLD-NEXT:    [[ARRAYIDX_41:%.*]] = getelementptr inbounds float, float* [[X]], i64 41
+; THRESHOLD-NEXT:    [[ARRAYIDX_42:%.*]] = getelementptr inbounds float, float* [[X]], i64 42
+; THRESHOLD-NEXT:    [[ARRAYIDX_43:%.*]] = getelementptr inbounds float, float* [[X]], i64 43
+; THRESHOLD-NEXT:    [[ARRAYIDX_44:%.*]] = getelementptr inbounds float, float* [[X]], i64 44
+; THRESHOLD-NEXT:    [[ARRAYIDX_45:%.*]] = getelementptr inbounds float, float* [[X]], i64 45
+; THRESHOLD-NEXT:    [[ARRAYIDX_46:%.*]] = getelementptr inbounds float, float* [[X]], i64 46
+; THRESHOLD-NEXT:    [[ARRAYIDX_47:%.*]] = getelementptr inbounds float, float* [[X]], i64 47
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_16]] to <32 x float>*
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = load <32 x float>, <32 x float>* [[TMP2]], align 4
+; THRESHOLD-NEXT:    [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
+; THRESHOLD-NEXT:    [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
+; THRESHOLD-NEXT:    [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
+; THRESHOLD-NEXT:    [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]]
+; THRESHOLD-NEXT:    [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]]
+; THRESHOLD-NEXT:    [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]]
+; THRESHOLD-NEXT:    [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]]
+; THRESHOLD-NEXT:    [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]]
+; THRESHOLD-NEXT:    [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]]
+; THRESHOLD-NEXT:    [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]]
+; THRESHOLD-NEXT:    [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
+; THRESHOLD-NEXT:    [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
+; THRESHOLD-NEXT:    [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
+; THRESHOLD-NEXT:    [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
+; THRESHOLD-NEXT:    [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]]
+; THRESHOLD-NEXT:    [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]]
+; THRESHOLD-NEXT:    [[ADD_32:%.*]] = fadd fast float undef, [[ADD_31]]
+; THRESHOLD-NEXT:    [[ADD_33:%.*]] = fadd fast float undef, [[ADD_32]]
+; THRESHOLD-NEXT:    [[ADD_34:%.*]] = fadd fast float undef, [[ADD_33]]
+; THRESHOLD-NEXT:    [[ADD_35:%.*]] = fadd fast float undef, [[ADD_34]]
+; THRESHOLD-NEXT:    [[ADD_36:%.*]] = fadd fast float undef, [[ADD_35]]
+; THRESHOLD-NEXT:    [[ADD_37:%.*]] = fadd fast float undef, [[ADD_36]]
+; THRESHOLD-NEXT:    [[ADD_38:%.*]] = fadd fast float undef, [[ADD_37]]
+; THRESHOLD-NEXT:    [[ADD_39:%.*]] = fadd fast float undef, [[ADD_38]]
+; THRESHOLD-NEXT:    [[ADD_40:%.*]] = fadd fast float undef, [[ADD_39]]
+; THRESHOLD-NEXT:    [[ADD_41:%.*]] = fadd fast float undef, [[ADD_40]]
+; THRESHOLD-NEXT:    [[ADD_42:%.*]] = fadd fast float undef, [[ADD_41]]
+; THRESHOLD-NEXT:    [[ADD_43:%.*]] = fadd fast float undef, [[ADD_42]]
+; THRESHOLD-NEXT:    [[ADD_44:%.*]] = fadd fast float undef, [[ADD_43]]
+; THRESHOLD-NEXT:    [[ADD_45:%.*]] = fadd fast float undef, [[ADD_44]]
+; THRESHOLD-NEXT:    [[ADD_46:%.*]] = fadd fast float undef, [[ADD_45]]
+; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP3]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP3]], [[RDX_SHUF]]
+; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
+; THRESHOLD-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
+; THRESHOLD-NEXT:    [[RDX_SHUF9:%.*]] = shufflevector <16 x float> [[TMP1]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX10:%.*]] = fadd fast <16 x float> [[TMP1]], [[RDX_SHUF9]]
+; THRESHOLD-NEXT:    [[RDX_SHUF11:%.*]] = shufflevector <16 x float> [[BIN_RDX10]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX12:%.*]] = fadd fast <16 x float> [[BIN_RDX10]], [[RDX_SHUF11]]
+; THRESHOLD-NEXT:    [[RDX_SHUF13:%.*]] = shufflevector <16 x float> [[BIN_RDX12]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX14:%.*]] = fadd fast <16 x float> [[BIN_RDX12]], [[RDX_SHUF13]]
+; THRESHOLD-NEXT:    [[RDX_SHUF15:%.*]] = shufflevector <16 x float> [[BIN_RDX14]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX16:%.*]] = fadd fast <16 x float> [[BIN_RDX14]], [[RDX_SHUF15]]
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = extractelement <16 x float> [[BIN_RDX16]], i32 0
+; THRESHOLD-NEXT:    [[BIN_RDX17:%.*]] = fadd fast float [[TMP4]], [[TMP5]]
+; THRESHOLD-NEXT:    [[ADD_47:%.*]] = fadd fast float undef, [[ADD_46]]
+; THRESHOLD-NEXT:    ret float [[BIN_RDX17]]
 ;
   entry:
   %0 = load float, float* %x, align 4
@@ -555,102 +790,167 @@ define float @f1(float* nocapture readonly %x, i32 %a, i32 %b) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]]
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[REM]] to float
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[X:%.*]], align 4
-; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[TMP0]], [[CONV]]
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
-; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP1]], [[ADD]]
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX_2]], align 4
-; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float [[TMP2]], [[ADD_1]]
 ; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX_3]], align 4
-; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float [[TMP3]], [[ADD_2]]
 ; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[ARRAYIDX_4]], align 4
-; CHECK-NEXT:    [[ADD_4:%.*]] = fadd fast float [[TMP4]], [[ADD_3]]
 ; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX_5]], align 4
-; CHECK-NEXT:    [[ADD_5:%.*]] = fadd fast float [[TMP5]], [[ADD_4]]
 ; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX_6]], align 4
-; CHECK-NEXT:    [[ADD_6:%.*]] = fadd fast float [[TMP6]], [[ADD_5]]
 ; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX_7]], align 4
-; CHECK-NEXT:    [[ADD_7:%.*]] = fadd fast float [[TMP7]], [[ADD_6]]
 ; CHECK-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
-; CHECK-NEXT:    [[TMP8:%.*]] = load float, float* [[ARRAYIDX_8]], align 4
-; CHECK-NEXT:    [[ADD_8:%.*]] = fadd fast float [[TMP8]], [[ADD_7]]
 ; CHECK-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
-; CHECK-NEXT:    [[TMP9:%.*]] = load float, float* [[ARRAYIDX_9]], align 4
-; CHECK-NEXT:    [[ADD_9:%.*]] = fadd fast float [[TMP9]], [[ADD_8]]
 ; CHECK-NEXT:    [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
-; CHECK-NEXT:    [[TMP10:%.*]] = load float, float* [[ARRAYIDX_10]], align 4
-; CHECK-NEXT:    [[ADD_10:%.*]] = fadd fast float [[TMP10]], [[ADD_9]]
 ; CHECK-NEXT:    [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
-; CHECK-NEXT:    [[TMP11:%.*]] = load float, float* [[ARRAYIDX_11]], align 4
-; CHECK-NEXT:    [[ADD_11:%.*]] = fadd fast float [[TMP11]], [[ADD_10]]
 ; CHECK-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
-; CHECK-NEXT:    [[TMP12:%.*]] = load float, float* [[ARRAYIDX_12]], align 4
-; CHECK-NEXT:    [[ADD_12:%.*]] = fadd fast float [[TMP12]], [[ADD_11]]
 ; CHECK-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
-; CHECK-NEXT:    [[TMP13:%.*]] = load float, float* [[ARRAYIDX_13]], align 4
-; CHECK-NEXT:    [[ADD_13:%.*]] = fadd fast float [[TMP13]], [[ADD_12]]
 ; CHECK-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
-; CHECK-NEXT:    [[TMP14:%.*]] = load float, float* [[ARRAYIDX_14]], align 4
-; CHECK-NEXT:    [[ADD_14:%.*]] = fadd fast float [[TMP14]], [[ADD_13]]
 ; CHECK-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
-; CHECK-NEXT:    [[TMP15:%.*]] = load float, float* [[ARRAYIDX_15]], align 4
-; CHECK-NEXT:    [[ADD_15:%.*]] = fadd fast float [[TMP15]], [[ADD_14]]
 ; CHECK-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
-; CHECK-NEXT:    [[TMP16:%.*]] = load float, float* [[ARRAYIDX_16]], align 4
-; CHECK-NEXT:    [[ADD_16:%.*]] = fadd fast float [[TMP16]], [[ADD_15]]
 ; CHECK-NEXT:    [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
-; CHECK-NEXT:    [[TMP17:%.*]] = load float, float* [[ARRAYIDX_17]], align 4
-; CHECK-NEXT:    [[ADD_17:%.*]] = fadd fast float [[TMP17]], [[ADD_16]]
 ; CHECK-NEXT:    [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
-; CHECK-NEXT:    [[TMP18:%.*]] = load float, float* [[ARRAYIDX_18]], align 4
-; CHECK-NEXT:    [[ADD_18:%.*]] = fadd fast float [[TMP18]], [[ADD_17]]
 ; CHECK-NEXT:    [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
-; CHECK-NEXT:    [[TMP19:%.*]] = load float, float* [[ARRAYIDX_19]], align 4
-; CHECK-NEXT:    [[ADD_19:%.*]] = fadd fast float [[TMP19]], [[ADD_18]]
 ; CHECK-NEXT:    [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
-; CHECK-NEXT:    [[TMP20:%.*]] = load float, float* [[ARRAYIDX_20]], align 4
-; CHECK-NEXT:    [[ADD_20:%.*]] = fadd fast float [[TMP20]], [[ADD_19]]
 ; CHECK-NEXT:    [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
-; CHECK-NEXT:    [[TMP21:%.*]] = load float, float* [[ARRAYIDX_21]], align 4
-; CHECK-NEXT:    [[ADD_21:%.*]] = fadd fast float [[TMP21]], [[ADD_20]]
 ; CHECK-NEXT:    [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
-; CHECK-NEXT:    [[TMP22:%.*]] = load float, float* [[ARRAYIDX_22]], align 4
-; CHECK-NEXT:    [[ADD_22:%.*]] = fadd fast float [[TMP22]], [[ADD_21]]
 ; CHECK-NEXT:    [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
-; CHECK-NEXT:    [[TMP23:%.*]] = load float, float* [[ARRAYIDX_23]], align 4
-; CHECK-NEXT:    [[ADD_23:%.*]] = fadd fast float [[TMP23]], [[ADD_22]]
 ; CHECK-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
-; CHECK-NEXT:    [[TMP24:%.*]] = load float, float* [[ARRAYIDX_24]], align 4
-; CHECK-NEXT:    [[ADD_24:%.*]] = fadd fast float [[TMP24]], [[ADD_23]]
 ; CHECK-NEXT:    [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
-; CHECK-NEXT:    [[TMP25:%.*]] = load float, float* [[ARRAYIDX_25]], align 4
-; CHECK-NEXT:    [[ADD_25:%.*]] = fadd fast float [[TMP25]], [[ADD_24]]
 ; CHECK-NEXT:    [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
-; CHECK-NEXT:    [[TMP26:%.*]] = load float, float* [[ARRAYIDX_26]], align 4
-; CHECK-NEXT:    [[ADD_26:%.*]] = fadd fast float [[TMP26]], [[ADD_25]]
 ; CHECK-NEXT:    [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
-; CHECK-NEXT:    [[TMP27:%.*]] = load float, float* [[ARRAYIDX_27]], align 4
-; CHECK-NEXT:    [[ADD_27:%.*]] = fadd fast float [[TMP27]], [[ADD_26]]
 ; CHECK-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
-; CHECK-NEXT:    [[TMP28:%.*]] = load float, float* [[ARRAYIDX_28]], align 4
-; CHECK-NEXT:    [[ADD_28:%.*]] = fadd fast float [[TMP28]], [[ADD_27]]
 ; CHECK-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
-; CHECK-NEXT:    [[TMP29:%.*]] = load float, float* [[ARRAYIDX_29]], align 4
-; CHECK-NEXT:    [[ADD_29:%.*]] = fadd fast float [[TMP29]], [[ADD_28]]
 ; CHECK-NEXT:    [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
-; CHECK-NEXT:    [[TMP30:%.*]] = load float, float* [[ARRAYIDX_30]], align 4
-; CHECK-NEXT:    [[ADD_30:%.*]] = fadd fast float [[TMP30]], [[ADD_29]]
 ; CHECK-NEXT:    [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31
-; CHECK-NEXT:    [[TMP31:%.*]] = load float, float* [[ARRAYIDX_31]], align 4
-; CHECK-NEXT:    [[ADD_31:%.*]] = fadd fast float [[TMP31]], [[ADD_30]]
-; CHECK-NEXT:    ret float [[ADD_31]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float undef, [[CONV]]
+; CHECK-NEXT:    [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
+; CHECK-NEXT:    [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; CHECK-NEXT:    [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; CHECK-NEXT:    [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; CHECK-NEXT:    [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
+; CHECK-NEXT:    [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; CHECK-NEXT:    [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
+; CHECK-NEXT:    [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
+; CHECK-NEXT:    [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
+; CHECK-NEXT:    [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
+; CHECK-NEXT:    [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
+; CHECK-NEXT:    [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
+; CHECK-NEXT:    [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
+; CHECK-NEXT:    [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
+; CHECK-NEXT:    [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
+; CHECK-NEXT:    [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
+; CHECK-NEXT:    [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
+; CHECK-NEXT:    [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
+; CHECK-NEXT:    [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]]
+; CHECK-NEXT:    [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]]
+; CHECK-NEXT:    [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]]
+; CHECK-NEXT:    [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]]
+; CHECK-NEXT:    [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]]
+; CHECK-NEXT:    [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]]
+; CHECK-NEXT:    [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]]
+; CHECK-NEXT:    [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
+; CHECK-NEXT:    [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
+; CHECK-NEXT:    [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
+; CHECK-NEXT:    [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
+; CHECK-NEXT:    [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP1]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
+; CHECK-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
+; CHECK-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; CHECK-NEXT:    [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]]
+; CHECK-NEXT:    ret float [[BIN_EXTRA]]
+;
+; THRESHOLD-LABEL: @f1(
+; THRESHOLD-NEXT:  entry:
+; THRESHOLD-NEXT:    [[REM:%.*]] = srem i32 [[A:%.*]], [[B:%.*]]
+; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[REM]] to float
+; THRESHOLD-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; THRESHOLD-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; THRESHOLD-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; THRESHOLD-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; THRESHOLD-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; THRESHOLD-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; THRESHOLD-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; THRESHOLD-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
+; THRESHOLD-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
+; THRESHOLD-NEXT:    [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
+; THRESHOLD-NEXT:    [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
+; THRESHOLD-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
+; THRESHOLD-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
+; THRESHOLD-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
+; THRESHOLD-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
+; THRESHOLD-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
+; THRESHOLD-NEXT:    [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
+; THRESHOLD-NEXT:    [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
+; THRESHOLD-NEXT:    [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
+; THRESHOLD-NEXT:    [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
+; THRESHOLD-NEXT:    [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
+; THRESHOLD-NEXT:    [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
+; THRESHOLD-NEXT:    [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
+; THRESHOLD-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
+; THRESHOLD-NEXT:    [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
+; THRESHOLD-NEXT:    [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
+; THRESHOLD-NEXT:    [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
+; THRESHOLD-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
+; THRESHOLD-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
+; THRESHOLD-NEXT:    [[ARRAYIDX_30:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
+; THRESHOLD-NEXT:    [[ARRAYIDX_31:%.*]] = getelementptr inbounds float, float* [[X]], i64 31
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <32 x float>*
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <32 x float>, <32 x float>* [[TMP0]], align 4
+; THRESHOLD-NEXT:    [[ADD:%.*]] = fadd fast float undef, [[CONV]]
+; THRESHOLD-NEXT:    [[ADD_1:%.*]] = fadd fast float undef, [[ADD]]
+; THRESHOLD-NEXT:    [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; THRESHOLD-NEXT:    [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; THRESHOLD-NEXT:    [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; THRESHOLD-NEXT:    [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
+; THRESHOLD-NEXT:    [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; THRESHOLD-NEXT:    [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
+; THRESHOLD-NEXT:    [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
+; THRESHOLD-NEXT:    [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
+; THRESHOLD-NEXT:    [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
+; THRESHOLD-NEXT:    [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
+; THRESHOLD-NEXT:    [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
+; THRESHOLD-NEXT:    [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
+; THRESHOLD-NEXT:    [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
+; THRESHOLD-NEXT:    [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
+; THRESHOLD-NEXT:    [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
+; THRESHOLD-NEXT:    [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
+; THRESHOLD-NEXT:    [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
+; THRESHOLD-NEXT:    [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]]
+; THRESHOLD-NEXT:    [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]]
+; THRESHOLD-NEXT:    [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]]
+; THRESHOLD-NEXT:    [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]]
+; THRESHOLD-NEXT:    [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]]
+; THRESHOLD-NEXT:    [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]]
+; THRESHOLD-NEXT:    [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]]
+; THRESHOLD-NEXT:    [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
+; THRESHOLD-NEXT:    [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
+; THRESHOLD-NEXT:    [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
+; THRESHOLD-NEXT:    [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
+; THRESHOLD-NEXT:    [[ADD_30:%.*]] = fadd fast float undef, [[ADD_29]]
+; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <32 x float> [[TMP1]], <32 x float> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX:%.*]] = fadd fast <32 x float> [[TMP1]], [[RDX_SHUF]]
+; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x float> [[BIN_RDX]], <32 x float> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <32 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <32 x float> [[BIN_RDX2]], <32 x float> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <32 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <32 x float> [[BIN_RDX4]], <32 x float> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX6:%.*]] = fadd fast <32 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
+; THRESHOLD-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x float> [[BIN_RDX6]], <32 x float> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX8:%.*]] = fadd fast <32 x float> [[BIN_RDX6]], [[RDX_SHUF7]]
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = extractelement <32 x float> [[BIN_RDX8]], i32 0
+; THRESHOLD-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[CONV]]
+; THRESHOLD-NEXT:    [[ADD_31:%.*]] = fadd fast float undef, [[ADD_30]]
+; THRESHOLD-NEXT:    ret float [[BIN_EXTRA]]
 ;
   entry:
   %rem = srem i32 %a, %b
@@ -849,6 +1149,102 @@ define float @loadadd31(float* nocapture readonly %x) {
 ; CHECK-NEXT:    [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]]
 ; CHECK-NEXT:    [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
 ; CHECK-NEXT:    ret float [[TMP12]]
+;
+; THRESHOLD-LABEL: @loadadd31(
+; THRESHOLD-NEXT:  entry:
+; THRESHOLD-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = load float, float* [[ARRAYIDX]], align 4
+; THRESHOLD-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4
+; THRESHOLD-NEXT:    [[ADD_1:%.*]] = fadd fast float [[TMP1]], [[TMP0]]
+; THRESHOLD-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; THRESHOLD-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; THRESHOLD-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; THRESHOLD-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = bitcast float* [[ARRAYIDX_2]] to <4 x float>*
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = load <4 x float>, <4 x float>* [[TMP2]], align 4
+; THRESHOLD-NEXT:    [[ADD_2:%.*]] = fadd fast float undef, [[ADD_1]]
+; THRESHOLD-NEXT:    [[ADD_3:%.*]] = fadd fast float undef, [[ADD_2]]
+; THRESHOLD-NEXT:    [[ADD_4:%.*]] = fadd fast float undef, [[ADD_3]]
+; THRESHOLD-NEXT:    [[ADD_5:%.*]] = fadd fast float undef, [[ADD_4]]
+; THRESHOLD-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; THRESHOLD-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds float, float* [[X]], i64 8
+; THRESHOLD-NEXT:    [[ARRAYIDX_8:%.*]] = getelementptr inbounds float, float* [[X]], i64 9
+; THRESHOLD-NEXT:    [[ARRAYIDX_9:%.*]] = getelementptr inbounds float, float* [[X]], i64 10
+; THRESHOLD-NEXT:    [[ARRAYIDX_10:%.*]] = getelementptr inbounds float, float* [[X]], i64 11
+; THRESHOLD-NEXT:    [[ARRAYIDX_11:%.*]] = getelementptr inbounds float, float* [[X]], i64 12
+; THRESHOLD-NEXT:    [[ARRAYIDX_12:%.*]] = getelementptr inbounds float, float* [[X]], i64 13
+; THRESHOLD-NEXT:    [[ARRAYIDX_13:%.*]] = getelementptr inbounds float, float* [[X]], i64 14
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = bitcast float* [[ARRAYIDX_6]] to <8 x float>*
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = load <8 x float>, <8 x float>* [[TMP4]], align 4
+; THRESHOLD-NEXT:    [[ADD_6:%.*]] = fadd fast float undef, [[ADD_5]]
+; THRESHOLD-NEXT:    [[ADD_7:%.*]] = fadd fast float undef, [[ADD_6]]
+; THRESHOLD-NEXT:    [[ADD_8:%.*]] = fadd fast float undef, [[ADD_7]]
+; THRESHOLD-NEXT:    [[ADD_9:%.*]] = fadd fast float undef, [[ADD_8]]
+; THRESHOLD-NEXT:    [[ADD_10:%.*]] = fadd fast float undef, [[ADD_9]]
+; THRESHOLD-NEXT:    [[ADD_11:%.*]] = fadd fast float undef, [[ADD_10]]
+; THRESHOLD-NEXT:    [[ADD_12:%.*]] = fadd fast float undef, [[ADD_11]]
+; THRESHOLD-NEXT:    [[ADD_13:%.*]] = fadd fast float undef, [[ADD_12]]
+; THRESHOLD-NEXT:    [[ARRAYIDX_14:%.*]] = getelementptr inbounds float, float* [[X]], i64 15
+; THRESHOLD-NEXT:    [[ARRAYIDX_15:%.*]] = getelementptr inbounds float, float* [[X]], i64 16
+; THRESHOLD-NEXT:    [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, float* [[X]], i64 17
+; THRESHOLD-NEXT:    [[ARRAYIDX_17:%.*]] = getelementptr inbounds float, float* [[X]], i64 18
+; THRESHOLD-NEXT:    [[ARRAYIDX_18:%.*]] = getelementptr inbounds float, float* [[X]], i64 19
+; THRESHOLD-NEXT:    [[ARRAYIDX_19:%.*]] = getelementptr inbounds float, float* [[X]], i64 20
+; THRESHOLD-NEXT:    [[ARRAYIDX_20:%.*]] = getelementptr inbounds float, float* [[X]], i64 21
+; THRESHOLD-NEXT:    [[ARRAYIDX_21:%.*]] = getelementptr inbounds float, float* [[X]], i64 22
+; THRESHOLD-NEXT:    [[ARRAYIDX_22:%.*]] = getelementptr inbounds float, float* [[X]], i64 23
+; THRESHOLD-NEXT:    [[ARRAYIDX_23:%.*]] = getelementptr inbounds float, float* [[X]], i64 24
+; THRESHOLD-NEXT:    [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, float* [[X]], i64 25
+; THRESHOLD-NEXT:    [[ARRAYIDX_25:%.*]] = getelementptr inbounds float, float* [[X]], i64 26
+; THRESHOLD-NEXT:    [[ARRAYIDX_26:%.*]] = getelementptr inbounds float, float* [[X]], i64 27
+; THRESHOLD-NEXT:    [[ARRAYIDX_27:%.*]] = getelementptr inbounds float, float* [[X]], i64 28
+; THRESHOLD-NEXT:    [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, float* [[X]], i64 29
+; THRESHOLD-NEXT:    [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, float* [[X]], i64 30
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = bitcast float* [[ARRAYIDX_14]] to <16 x float>*
+; THRESHOLD-NEXT:    [[TMP7:%.*]] = load <16 x float>, <16 x float>* [[TMP6]], align 4
+; THRESHOLD-NEXT:    [[ADD_14:%.*]] = fadd fast float undef, [[ADD_13]]
+; THRESHOLD-NEXT:    [[ADD_15:%.*]] = fadd fast float undef, [[ADD_14]]
+; THRESHOLD-NEXT:    [[ADD_16:%.*]] = fadd fast float undef, [[ADD_15]]
+; THRESHOLD-NEXT:    [[ADD_17:%.*]] = fadd fast float undef, [[ADD_16]]
+; THRESHOLD-NEXT:    [[ADD_18:%.*]] = fadd fast float undef, [[ADD_17]]
+; THRESHOLD-NEXT:    [[ADD_19:%.*]] = fadd fast float undef, [[ADD_18]]
+; THRESHOLD-NEXT:    [[ADD_20:%.*]] = fadd fast float undef, [[ADD_19]]
+; THRESHOLD-NEXT:    [[ADD_21:%.*]] = fadd fast float undef, [[ADD_20]]
+; THRESHOLD-NEXT:    [[ADD_22:%.*]] = fadd fast float undef, [[ADD_21]]
+; THRESHOLD-NEXT:    [[ADD_23:%.*]] = fadd fast float undef, [[ADD_22]]
+; THRESHOLD-NEXT:    [[ADD_24:%.*]] = fadd fast float undef, [[ADD_23]]
+; THRESHOLD-NEXT:    [[ADD_25:%.*]] = fadd fast float undef, [[ADD_24]]
+; THRESHOLD-NEXT:    [[ADD_26:%.*]] = fadd fast float undef, [[ADD_25]]
+; THRESHOLD-NEXT:    [[ADD_27:%.*]] = fadd fast float undef, [[ADD_26]]
+; THRESHOLD-NEXT:    [[ADD_28:%.*]] = fadd fast float undef, [[ADD_27]]
+; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <16 x float> [[TMP7]], <16 x float> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX:%.*]] = fadd fast <16 x float> [[TMP7]], [[RDX_SHUF]]
+; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x float> [[BIN_RDX]], <16 x float> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <16 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <16 x float> [[BIN_RDX2]], <16 x float> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <16 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <16 x float> [[BIN_RDX4]], <16 x float> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX6:%.*]] = fadd fast <16 x float> [[BIN_RDX4]], [[RDX_SHUF5]]
+; THRESHOLD-NEXT:    [[TMP8:%.*]] = extractelement <16 x float> [[BIN_RDX6]], i32 0
+; THRESHOLD-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <8 x float> [[TMP5]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX8:%.*]] = fadd fast <8 x float> [[TMP5]], [[RDX_SHUF7]]
+; THRESHOLD-NEXT:    [[RDX_SHUF9:%.*]] = shufflevector <8 x float> [[BIN_RDX8]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX10:%.*]] = fadd fast <8 x float> [[BIN_RDX8]], [[RDX_SHUF9]]
+; THRESHOLD-NEXT:    [[RDX_SHUF11:%.*]] = shufflevector <8 x float> [[BIN_RDX10]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX12:%.*]] = fadd fast <8 x float> [[BIN_RDX10]], [[RDX_SHUF11]]
+; THRESHOLD-NEXT:    [[TMP9:%.*]] = extractelement <8 x float> [[BIN_RDX12]], i32 0
+; THRESHOLD-NEXT:    [[BIN_RDX13:%.*]] = fadd fast float [[TMP8]], [[TMP9]]
+; THRESHOLD-NEXT:    [[RDX_SHUF14:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX15:%.*]] = fadd fast <4 x float> [[TMP3]], [[RDX_SHUF14]]
+; THRESHOLD-NEXT:    [[RDX_SHUF16:%.*]] = shufflevector <4 x float> [[BIN_RDX15]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX17:%.*]] = fadd fast <4 x float> [[BIN_RDX15]], [[RDX_SHUF16]]
+; THRESHOLD-NEXT:    [[TMP10:%.*]] = extractelement <4 x float> [[BIN_RDX17]], i32 0
+; THRESHOLD-NEXT:    [[BIN_RDX18:%.*]] = fadd fast float [[BIN_RDX13]], [[TMP10]]
+; THRESHOLD-NEXT:    [[TMP11:%.*]] = fadd fast float [[BIN_RDX18]], [[TMP1]]
+; THRESHOLD-NEXT:    [[TMP12:%.*]] = fadd fast float [[TMP11]], [[TMP0]]
+; THRESHOLD-NEXT:    [[ADD_29:%.*]] = fadd fast float undef, [[ADD_28]]
+; THRESHOLD-NEXT:    ret float [[TMP12]]
 ;
   entry:
   %arrayidx = getelementptr inbounds float, float* %x, i64 1
@@ -948,32 +1344,69 @@ define float @extra_args(float* nocapture readonly %x, i32 %a, i32 %b) {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[X:%.*]], align 4
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
-; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast float [[TMP0]], [[ADD]]
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[ADD4:%.*]] = fadd fast float [[TMP1]], [[ADD1]]
-; CHECK-NEXT:    [[ADD5:%.*]] = fadd fast float [[ADD4]], [[CONV]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
-; CHECK-NEXT:    [[ADD4_1:%.*]] = fadd fast float [[TMP2]], [[ADD5]]
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4
-; CHECK-NEXT:    [[ADD4_2:%.*]] = fadd fast float [[TMP3]], [[ADD4_1]]
 ; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4
-; CHECK-NEXT:    [[ADD4_3:%.*]] = fadd fast float [[TMP4]], [[ADD4_2]]
 ; CHECK-NEXT:    [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX3_4]], align 4
-; CHECK-NEXT:    [[ADD4_4:%.*]] = fadd fast float [[TMP5]], [[ADD4_3]]
 ; CHECK-NEXT:    [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX3_5]], align 4
-; CHECK-NEXT:    [[ADD4_5:%.*]] = fadd fast float [[TMP6]], [[ADD4_4]]
 ; CHECK-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX3_6]], align 4
-; CHECK-NEXT:    [[ADD4_6:%.*]] = fadd fast float [[TMP7]], [[ADD4_5]]
-; CHECK-NEXT:    ret float [[ADD4_6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
+; CHECK-NEXT:    [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
+; CHECK-NEXT:    [[ADD5:%.*]] = fadd fast float [[ADD4]], [[CONV]]
+; CHECK-NEXT:    [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]]
+; CHECK-NEXT:    [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
+; CHECK-NEXT:    [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
+; CHECK-NEXT:    [[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]]
+; CHECK-NEXT:    [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; CHECK-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]]
+; CHECK-NEXT:    [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
+; CHECK-NEXT:    ret float [[BIN_EXTRA5]]
+;
+; THRESHOLD-LABEL: @extra_args(
+; THRESHOLD-NEXT:  entry:
+; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
+; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; THRESHOLD-NEXT:    [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
+; THRESHOLD-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; THRESHOLD-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; THRESHOLD-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; THRESHOLD-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; THRESHOLD-NEXT:    [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; THRESHOLD-NEXT:    [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; THRESHOLD-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; THRESHOLD-NEXT:    [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
+; THRESHOLD-NEXT:    [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
+; THRESHOLD-NEXT:    [[ADD5:%.*]] = fadd fast float [[ADD4]], [[CONV]]
+; THRESHOLD-NEXT:    [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]]
+; THRESHOLD-NEXT:    [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
+; THRESHOLD-NEXT:    [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
+; THRESHOLD-NEXT:    [[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]]
+; THRESHOLD-NEXT:    [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
+; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
+; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; THRESHOLD-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]]
+; THRESHOLD-NEXT:    [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
+; THRESHOLD-NEXT:    ret float [[BIN_EXTRA5]]
 ;
   entry:
   %mul = mul nsw i32 %b, %a
@@ -1006,39 +1439,188 @@ define float @extra_args(float* nocapture readonly %x, i32 %a, i32 %b) {
   ret float %add4.6
 }
 
+define float @extra_args_same_several_times(float* nocapture readonly %x, i32 %a, i32 %b) {
+; CHECK-LABEL: @extra_args_same_several_times(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; CHECK-NEXT:    [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; CHECK-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
+; CHECK-NEXT:    [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
+; CHECK-NEXT:    [[ADD41:%.*]] = fadd fast float [[ADD4]], 5.000000e+00
+; CHECK-NEXT:    [[ADD5:%.*]] = fadd fast float [[ADD41]], [[CONV]]
+; CHECK-NEXT:    [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]]
+; CHECK-NEXT:    [[ADD4_11:%.*]] = fadd fast float [[ADD4_1]], 5.000000e+00
+; CHECK-NEXT:    [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_11]]
+; CHECK-NEXT:    [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
+; CHECK-NEXT:    [[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]]
+; CHECK-NEXT:    [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; CHECK-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], 5.000000e+00
+; CHECK-NEXT:    [[BIN_EXTRA6:%.*]] = fadd fast float [[BIN_EXTRA5]], 5.000000e+00
+; CHECK-NEXT:    [[BIN_EXTRA7:%.*]] = fadd fast float [[BIN_EXTRA6]], [[CONV]]
+; CHECK-NEXT:    [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
+; CHECK-NEXT:    ret float [[BIN_EXTRA7]]
+;
+; THRESHOLD-LABEL: @extra_args_same_several_times(
+; THRESHOLD-NEXT:  entry:
+; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
+; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; THRESHOLD-NEXT:    [[ADD:%.*]] = fadd fast float [[CONV]], 3.000000e+00
+; THRESHOLD-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; THRESHOLD-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; THRESHOLD-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; THRESHOLD-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; THRESHOLD-NEXT:    [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; THRESHOLD-NEXT:    [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; THRESHOLD-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; THRESHOLD-NEXT:    [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
+; THRESHOLD-NEXT:    [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
+; THRESHOLD-NEXT:    [[ADD41:%.*]] = fadd fast float [[ADD4]], 5.000000e+00
+; THRESHOLD-NEXT:    [[ADD5:%.*]] = fadd fast float [[ADD41]], [[CONV]]
+; THRESHOLD-NEXT:    [[ADD4_1:%.*]] = fadd fast float undef, [[ADD5]]
+; THRESHOLD-NEXT:    [[ADD4_11:%.*]] = fadd fast float [[ADD4_1]], 5.000000e+00
+; THRESHOLD-NEXT:    [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_11]]
+; THRESHOLD-NEXT:    [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
+; THRESHOLD-NEXT:    [[ADD4_4:%.*]] = fadd fast float undef, [[ADD4_3]]
+; THRESHOLD-NEXT:    [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
+; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
+; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; THRESHOLD-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], 5.000000e+00
+; THRESHOLD-NEXT:    [[BIN_EXTRA6:%.*]] = fadd fast float [[BIN_EXTRA5]], 5.000000e+00
+; THRESHOLD-NEXT:    [[BIN_EXTRA7:%.*]] = fadd fast float [[BIN_EXTRA6]], [[CONV]]
+; THRESHOLD-NEXT:    [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
+; THRESHOLD-NEXT:    ret float [[BIN_EXTRA7]]
+;
+  entry: ; one scalar fast-math add chain over x[0]..x[7], with the constant 5.0 (twice) and %conv (again) mixed in as non-load addends
+  %mul = mul nsw i32 %b, %a ; integer product of the two i32 arguments
+  %conv = sitofp i32 %mul to float ; product as float; used both to seed the chain and as a later addend
+  %0 = load float, float* %x, align 4 ; x[0]
+  %add = fadd fast float %conv, 3.000000e+00 ; chain seed: %conv + 3.0
+  %add1 = fadd fast float %0, %add ; chain += x[0]
+  %arrayidx3 = getelementptr inbounds float, float* %x, i64 1
+  %1 = load float, float* %arrayidx3, align 4 ; x[1]
+  %add4 = fadd fast float %1, %add1 ; chain += x[1]
+  %add41 = fadd fast float %add4, 5.000000e+00 ; first use of the constant 5.0 as a non-load addend
+  %add5 = fadd fast float %add41, %conv ; %conv added into the chain a second time (it already feeds %add)
+  %arrayidx3.1 = getelementptr inbounds float, float* %x, i64 2
+  %2 = load float, float* %arrayidx3.1, align 4 ; x[2]
+  %add4.1 = fadd fast float %2, %add5 ; chain += x[2]
+  %add4.11 = fadd fast float %add4.1, 5.000000e+00 ; second use of the same constant 5.0; the expectations above add 5.0 twice after the vector reduction
+  %arrayidx3.2 = getelementptr inbounds float, float* %x, i64 3
+  %3 = load float, float* %arrayidx3.2, align 4 ; x[3]
+  %add4.2 = fadd fast float %3, %add4.11 ; chain += x[3]
+  %arrayidx3.3 = getelementptr inbounds float, float* %x, i64 4
+  %4 = load float, float* %arrayidx3.3, align 4 ; x[4]
+  %add4.3 = fadd fast float %4, %add4.2 ; chain += x[4]
+  %arrayidx3.4 = getelementptr inbounds float, float* %x, i64 5
+  %5 = load float, float* %arrayidx3.4, align 4 ; x[5]
+  %add4.4 = fadd fast float %5, %add4.3 ; chain += x[5]
+  %arrayidx3.5 = getelementptr inbounds float, float* %x, i64 6
+  %6 = load float, float* %arrayidx3.5, align 4 ; x[6]
+  %add4.5 = fadd fast float %6, %add4.4 ; chain += x[6]
+  %arrayidx3.6 = getelementptr inbounds float, float* %x, i64 7
+  %7 = load float, float* %arrayidx3.6, align 4 ; x[7]
+  %add4.6 = fadd fast float %7, %add4.5 ; chain += x[7]; this is the returned sum
+  ret float %add4.6
+}
+
 define float @extra_args_no_replace(float* nocapture readonly %x, i32 %a, i32 %b, i32 %c) {
 ; CHECK-LABEL: @extra_args_no_replace(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
 ; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[X:%.*]], align 4
 ; CHECK-NEXT:    [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
 ; CHECK-NEXT:    [[ADDC:%.*]] = fadd fast float [[CONVC]], 3.000000e+00
 ; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[CONV]], [[ADDC]]
-; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast float [[TMP0]], [[ADD]]
-; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX3]], align 4
-; CHECK-NEXT:    [[ADD4:%.*]] = fadd fast float [[TMP1]], [[ADD1]]
+; CHECK-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
 ; CHECK-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, float* [[ARRAYIDX3_1]], align 4
-; CHECK-NEXT:    [[ADD4_1:%.*]] = fadd fast float [[TMP2]], [[ADD4]]
 ; CHECK-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX3_2]], align 4
-; CHECK-NEXT:    [[ADD4_2:%.*]] = fadd fast float [[TMP3]], [[ADD4_1]]
 ; CHECK-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
-; CHECK-NEXT:    [[TMP4:%.*]] = load float, float* [[ARRAYIDX3_3]], align 4
-; CHECK-NEXT:    [[ADD4_3:%.*]] = fadd fast float [[TMP4]], [[ADD4_2]]
-; CHECK-NEXT:    [[ADD5:%.*]] = fadd fast float [[ADD4_3]], [[CONV]]
 ; CHECK-NEXT:    [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX3_4]], align 4
-; CHECK-NEXT:    [[ADD4_4:%.*]] = fadd fast float [[TMP5]], [[ADD5]]
 ; CHECK-NEXT:    [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
-; CHECK-NEXT:    [[TMP6:%.*]] = load float, float* [[ARRAYIDX3_5]], align 4
-; CHECK-NEXT:    [[ADD4_5:%.*]] = fadd fast float [[TMP6]], [[ADD4_4]]
 ; CHECK-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
-; CHECK-NEXT:    [[TMP7:%.*]] = load float, float* [[ARRAYIDX3_6]], align 4
-; CHECK-NEXT:    [[ADD4_6:%.*]] = fadd fast float [[TMP7]], [[ADD4_5]]
-; CHECK-NEXT:    ret float [[ADD4_6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; CHECK-NEXT:    [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
+; CHECK-NEXT:    [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
+; CHECK-NEXT:    [[ADD4_1:%.*]] = fadd fast float undef, [[ADD4]]
+; CHECK-NEXT:    [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
+; CHECK-NEXT:    [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
+; CHECK-NEXT:    [[ADD5:%.*]] = fadd fast float [[ADD4_3]], [[CONV]]
+; CHECK-NEXT:    [[ADD4_4:%.*]] = fadd fast float undef, [[ADD5]]
+; CHECK-NEXT:    [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; CHECK-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; CHECK-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]]
+; CHECK-NEXT:    [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
+; CHECK-NEXT:    ret float [[BIN_EXTRA5]]
+;
+; THRESHOLD-LABEL: @extra_args_no_replace(
+; THRESHOLD-NEXT:  entry:
+; THRESHOLD-NEXT:    [[MUL:%.*]] = mul nsw i32 [[B:%.*]], [[A:%.*]]
+; THRESHOLD-NEXT:    [[CONV:%.*]] = sitofp i32 [[MUL]] to float
+; THRESHOLD-NEXT:    [[CONVC:%.*]] = sitofp i32 [[C:%.*]] to float
+; THRESHOLD-NEXT:    [[ADDC:%.*]] = fadd fast float [[CONVC]], 3.000000e+00
+; THRESHOLD-NEXT:    [[ADD:%.*]] = fadd fast float [[CONV]], [[ADDC]]
+; THRESHOLD-NEXT:    [[ARRAYIDX3:%.*]] = getelementptr inbounds float, float* [[X:%.*]], i64 1
+; THRESHOLD-NEXT:    [[ARRAYIDX3_1:%.*]] = getelementptr inbounds float, float* [[X]], i64 2
+; THRESHOLD-NEXT:    [[ARRAYIDX3_2:%.*]] = getelementptr inbounds float, float* [[X]], i64 3
+; THRESHOLD-NEXT:    [[ARRAYIDX3_3:%.*]] = getelementptr inbounds float, float* [[X]], i64 4
+; THRESHOLD-NEXT:    [[ARRAYIDX3_4:%.*]] = getelementptr inbounds float, float* [[X]], i64 5
+; THRESHOLD-NEXT:    [[ARRAYIDX3_5:%.*]] = getelementptr inbounds float, float* [[X]], i64 6
+; THRESHOLD-NEXT:    [[ARRAYIDX3_6:%.*]] = getelementptr inbounds float, float* [[X]], i64 7
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = bitcast float* [[X]] to <8 x float>*
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = load <8 x float>, <8 x float>* [[TMP0]], align 4
+; THRESHOLD-NEXT:    [[ADD1:%.*]] = fadd fast float undef, [[ADD]]
+; THRESHOLD-NEXT:    [[ADD4:%.*]] = fadd fast float undef, [[ADD1]]
+; THRESHOLD-NEXT:    [[ADD4_1:%.*]] = fadd fast float undef, [[ADD4]]
+; THRESHOLD-NEXT:    [[ADD4_2:%.*]] = fadd fast float undef, [[ADD4_1]]
+; THRESHOLD-NEXT:    [[ADD4_3:%.*]] = fadd fast float undef, [[ADD4_2]]
+; THRESHOLD-NEXT:    [[ADD5:%.*]] = fadd fast float [[ADD4_3]], [[CONV]]
+; THRESHOLD-NEXT:    [[ADD4_4:%.*]] = fadd fast float undef, [[ADD5]]
+; THRESHOLD-NEXT:    [[ADD4_5:%.*]] = fadd fast float undef, [[ADD4_4]]
+; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX:%.*]] = fadd fast <8 x float> [[TMP1]], [[RDX_SHUF]]
+; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x float> [[BIN_RDX]], <8 x float> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = fadd fast <8 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x float> [[BIN_RDX2]], <8 x float> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX4:%.*]] = fadd fast <8 x float> [[BIN_RDX2]], [[RDX_SHUF3]]
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = extractelement <8 x float> [[BIN_RDX4]], i32 0
+; THRESHOLD-NEXT:    [[BIN_EXTRA:%.*]] = fadd fast float [[TMP2]], [[ADD]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA5:%.*]] = fadd fast float [[BIN_EXTRA]], [[CONV]]
+; THRESHOLD-NEXT:    [[ADD4_6:%.*]] = fadd fast float undef, [[ADD4_5]]
+; THRESHOLD-NEXT:    ret float [[BIN_EXTRA5]]
 ;
   entry:
   %mul = mul nsw i32 %b, %a
@@ -1073,3 +1655,81 @@ define float @extra_args_no_replace(float* nocapture readonly %x, i32 %a, i32 %b
   ret float %add4.6
 }
 
+define i32 @wobble(i32 %arg, i32 %bar) {
+; CHECK-LABEL: @wobble(
+; CHECK-NEXT:  bb:
+; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[ARG:%.*]], i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[ARG]], i32 1
+; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[ARG]], i32 2
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ARG]], i32 3
+; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[BAR:%.*]], i32 0
+; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[BAR]], i32 1
+; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[BAR]], i32 2
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[BAR]], i32 3
+; CHECK-NEXT:    [[TMP8:%.*]] = xor <4 x i32> [[TMP3]], [[TMP7]]
+; CHECK-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
+; CHECK-NEXT:    [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], zeroinitializer
+; CHECK-NEXT:    [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
+; CHECK-NEXT:    [[R1:%.*]] = add nuw i32 [[ARG]], undef
+; CHECK-NEXT:    [[R2:%.*]] = add nsw i32 [[R1]], undef
+; CHECK-NEXT:    [[R3:%.*]] = add nsw i32 [[R2]], undef
+; CHECK-NEXT:    [[R4:%.*]] = add nsw i32 [[R3]], undef
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; CHECK-NEXT:    [[BIN_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]]
+; CHECK-NEXT:    [[BIN_EXTRA3:%.*]] = add nsw i32 [[BIN_EXTRA]], [[TMP9]]
+; CHECK-NEXT:    [[R5:%.*]] = add nsw i32 [[R4]], undef
+; CHECK-NEXT:    ret i32 [[BIN_EXTRA3]]
+;
+; THRESHOLD-LABEL: @wobble(
+; THRESHOLD-NEXT:  bb:
+; THRESHOLD-NEXT:    [[TMP0:%.*]] = insertelement <4 x i32> undef, i32 [[ARG:%.*]], i32 0
+; THRESHOLD-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[ARG]], i32 1
+; THRESHOLD-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[ARG]], i32 2
+; THRESHOLD-NEXT:    [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[ARG]], i32 3
+; THRESHOLD-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> undef, i32 [[BAR:%.*]], i32 0
+; THRESHOLD-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[BAR]], i32 1
+; THRESHOLD-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[BAR]], i32 2
+; THRESHOLD-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[BAR]], i32 3
+; THRESHOLD-NEXT:    [[TMP8:%.*]] = xor <4 x i32> [[TMP3]], [[TMP7]]
+; THRESHOLD-NEXT:    [[TMP9:%.*]] = extractelement <4 x i32> [[TMP8]], i32 3
+; THRESHOLD-NEXT:    [[TMP10:%.*]] = icmp eq <4 x i32> [[TMP8]], zeroinitializer
+; THRESHOLD-NEXT:    [[TMP11:%.*]] = sext <4 x i1> [[TMP10]] to <4 x i32>
+; THRESHOLD-NEXT:    [[R1:%.*]] = add nuw i32 [[ARG]], undef
+; THRESHOLD-NEXT:    [[R2:%.*]] = add nsw i32 [[R1]], undef
+; THRESHOLD-NEXT:    [[R3:%.*]] = add nsw i32 [[R2]], undef
+; THRESHOLD-NEXT:    [[R4:%.*]] = add nsw i32 [[R3]], undef
+; THRESHOLD-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP11]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP11]], [[RDX_SHUF]]
+; THRESHOLD-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; THRESHOLD-NEXT:    [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; THRESHOLD-NEXT:    [[TMP12:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
+; THRESHOLD-NEXT:    [[BIN_EXTRA:%.*]] = add nuw i32 [[TMP12]], [[ARG]]
+; THRESHOLD-NEXT:    [[BIN_EXTRA3:%.*]] = add nsw i32 [[BIN_EXTRA]], [[TMP9]]
+; THRESHOLD-NEXT:    [[R5:%.*]] = add nsw i32 [[R4]], undef
+; THRESHOLD-NEXT:    ret i32 [[BIN_EXTRA3]]
+;
+  bb:
+  %x1 = xor i32 %arg, %bar
+  %i1 = icmp eq i32 %x1, 0
+  %s1 = sext i1 %i1 to i32
+  %x2 = xor i32 %arg, %bar
+  %i2 = icmp eq i32 %x2, 0
+  %s2 = sext i1 %i2 to i32
+  %x3 = xor i32 %arg, %bar
+  %i3 = icmp eq i32 %x3, 0
+  %s3 = sext i1 %i3 to i32
+  %x4 = xor i32 %arg, %bar
+  %i4 = icmp eq i32 %x4, 0
+  %s4 = sext i1 %i4 to i32
+  %r1 = add nuw i32 %arg, %s1
+  %r2 = add nsw i32 %r1, %s2
+  %r3 = add nsw i32 %r2, %s3
+  %r4 = add nsw i32 %r3, %s4
+  %r5 = add nsw i32 %r4, %x4
+  ret i32 %r5
+}
+
diff --git a/test/Transforms/SLPVectorizer/X86/horizontal.ll b/test/Transforms/SLPVectorizer/X86/horizontal.ll
index f6efd26a4c20f4db459b819e6b3da59546c1bfd7..080f850f91cffb7664e367c26473c5b15d6d1f92 100644
--- a/test/Transforms/SLPVectorizer/X86/horizontal.ll
+++ b/test/Transforms/SLPVectorizer/X86/horizontal.ll
@@ -624,9 +624,9 @@ define void @i32_red_example4(i32* %res) {
 ; STORE-LABEL: @i32_red_example4(
 ; STORE:         [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* bitcast ([32 x i32]* @arr_i32 to <4 x i32>*), align 16
 ; STORE:         [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; STORE-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT:    [[BIN_RDX:%.*]] = add nsw <4 x i32> [[TMP0]], [[RDX_SHUF]]
 ; STORE-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT:    [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT:    [[BIN_RDX2:%.*]] = add nsw <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; STORE-NEXT:    [[TMP1:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
 ; STORE:         store i32 [[TMP1]], i32* %res, align 16
 ; STORE-NEXT:    ret void
@@ -647,11 +647,11 @@ define void @i32_red_example8(i32* %res) {
 ; STORE-LABEL: @i32_red_example8(
 ; STORE:         [[TMP0:%.*]] = load <8 x i32>, <8 x i32>* bitcast ([32 x i32]* @arr_i32 to <8 x i32>*), align 16
 ; STORE:         [[RDX_SHUF:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT:    [[BIN_RDX:%.*]] = add <8 x i32> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT:    [[BIN_RDX:%.*]] = add nsw <8 x i32> [[TMP0]], [[RDX_SHUF]]
 ; STORE-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <8 x i32> [[BIN_RDX]], <8 x i32> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT:    [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT:    [[BIN_RDX2:%.*]] = add nsw <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; STORE-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; STORE-NEXT:    [[BIN_RDX4:%.*]] = add nsw <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; STORE-NEXT:    [[TMP1:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
 ; STORE:         store i32 [[TMP1]], i32* %res, align 16
 ; STORE-NEXT:    ret void
@@ -680,13 +680,13 @@ define void @i32_red_example16(i32* %res) {
 ; STORE-LABEL: @i32_red_example16(
 ; STORE:         [[TMP0:%.*]] = load <16 x i32>, <16 x i32>* bitcast ([32 x i32]* @arr_i32 to <16 x i32>*), align 16
 ; STORE:         [[RDX_SHUF:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> undef, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT:    [[BIN_RDX:%.*]] = add <16 x i32> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT:    [[BIN_RDX:%.*]] = add nsw <16 x i32> [[TMP0]], [[RDX_SHUF]]
 ; STORE-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <16 x i32> [[BIN_RDX]], <16 x i32> undef, <16 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT:    [[BIN_RDX2:%.*]] = add <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT:    [[BIN_RDX2:%.*]] = add nsw <16 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; STORE-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <16 x i32> [[BIN_RDX2]], <16 x i32> undef, <16 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT:    [[BIN_RDX4:%.*]] = add <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; STORE-NEXT:    [[BIN_RDX4:%.*]] = add nsw <16 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; STORE-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <16 x i32> [[BIN_RDX4]], <16 x i32> undef, <16 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT:    [[BIN_RDX6:%.*]] = add <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
+; STORE-NEXT:    [[BIN_RDX6:%.*]] = add nsw <16 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
 ; STORE-NEXT:    [[TMP1:%.*]] = extractelement <16 x i32> [[BIN_RDX6]], i32 0
 ; STORE:         store i32 [[TMP1]], i32* %res, align 16
 ; STORE-NEXT:    ret void
@@ -731,15 +731,15 @@ define void @i32_red_example32(i32* %res) {
 ; STORE-LABEL: @i32_red_example32(
 ; STORE:         [[TMP0:%.*]] = load <32 x i32>, <32 x i32>* bitcast ([32 x i32]* @arr_i32 to <32 x i32>*), align 16
 ; STORE:         [[RDX_SHUF:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> undef, <32 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT:    [[BIN_RDX:%.*]] = add <32 x i32> [[TMP0]], [[RDX_SHUF]]
+; STORE-NEXT:    [[BIN_RDX:%.*]] = add nsw <32 x i32> [[TMP0]], [[RDX_SHUF]]
 ; STORE-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <32 x i32> [[BIN_RDX]], <32 x i32> undef, <32 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT:    [[BIN_RDX2:%.*]] = add <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; STORE-NEXT:    [[BIN_RDX2:%.*]] = add nsw <32 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; STORE-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <32 x i32> [[BIN_RDX2]], <32 x i32> undef, <32 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT:    [[BIN_RDX4:%.*]] = add <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
+; STORE-NEXT:    [[BIN_RDX4:%.*]] = add nsw <32 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
 ; STORE-NEXT:    [[RDX_SHUF5:%.*]] = shufflevector <32 x i32> [[BIN_RDX4]], <32 x i32> undef, <32 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT:    [[BIN_RDX6:%.*]] = add <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
+; STORE-NEXT:    [[BIN_RDX6:%.*]] = add nsw <32 x i32> [[BIN_RDX4]], [[RDX_SHUF5]]
 ; STORE-NEXT:    [[RDX_SHUF7:%.*]] = shufflevector <32 x i32> [[BIN_RDX6]], <32 x i32> undef, <32 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; STORE-NEXT:    [[BIN_RDX8:%.*]] = add <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]]
+; STORE-NEXT:    [[BIN_RDX8:%.*]] = add nsw <32 x i32> [[BIN_RDX6]], [[RDX_SHUF7]]
 ; STORE-NEXT:    [[TMP1:%.*]] = extractelement <32 x i32> [[BIN_RDX8]], i32 0
 ; STORE:         store i32 [[TMP1]], i32* %res, align 16
 ; STORE-NEXT:    ret void
@@ -812,3 +812,98 @@ entry:
   ret void
 }
 
+declare i32 @foobar(i32)
+
+define void @i32_red_call(i32 %val) {
+; CHECK-LABEL: @i32_red_call(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
+; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
+; CHECK-NEXT:    [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
+; CHECK-NEXT:    [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
+; CHECK-NEXT:    [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
+; CHECK-NEXT:    [[RES:%.*]] = call i32 @foobar(i32 [[ADD_6]])
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+  %add = add nsw i32 %1, %0
+  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+  %add.1 = add nsw i32 %2, %add
+  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+  %add.2 = add nsw i32 %3, %add.1
+  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
+  %add.3 = add nsw i32 %4, %add.2
+  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
+  %add.4 = add nsw i32 %5, %add.3
+  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
+  %add.5 = add nsw i32 %6, %add.4
+  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
+  %add.6 = add nsw i32 %7, %add.5
+  %res = call i32 @foobar(i32 %add.6)
+  ret void
+}
+
+define void @i32_red_invoke(i32 %val) personality i32 (...)* @__gxx_personality_v0 {
+; CHECK-LABEL: @i32_red_invoke(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+; CHECK-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP1]], [[TMP0]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+; CHECK-NEXT:    [[ADD_1:%.*]] = add nsw i32 [[TMP2]], [[ADD]]
+; CHECK-NEXT:    [[TMP3:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+; CHECK-NEXT:    [[ADD_2:%.*]] = add nsw i32 [[TMP3]], [[ADD_1]]
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
+; CHECK-NEXT:    [[ADD_3:%.*]] = add nsw i32 [[TMP4]], [[ADD_2]]
+; CHECK-NEXT:    [[TMP5:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
+; CHECK-NEXT:    [[ADD_4:%.*]] = add nsw i32 [[TMP5]], [[ADD_3]]
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
+; CHECK-NEXT:    [[ADD_5:%.*]] = add nsw i32 [[TMP6]], [[ADD_4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
+; CHECK-NEXT:    [[ADD_6:%.*]] = add nsw i32 [[TMP7]], [[ADD_5]]
+; CHECK-NEXT:    [[RES:%.*]] = invoke i32 @foobar(i32 [[ADD_6]])
+; CHECK-NEXT:    to label [[NORMAL:%.*]] unwind label [[EXCEPTION:%.*]]
+; CHECK:       exception:
+; CHECK-NEXT:    [[CLEANUP:%.*]] = landingpad i8
+; CHECK-NEXT:    cleanup
+; CHECK-NEXT:    br label [[NORMAL]]
+; CHECK:       normal:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 0), align 16
+  %1 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 1), align 4
+  %add = add nsw i32 %1, %0
+  %2 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 2), align 8
+  %add.1 = add nsw i32 %2, %add
+  %3 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 3), align 4
+  %add.2 = add nsw i32 %3, %add.1
+  %4 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 4), align 16
+  %add.3 = add nsw i32 %4, %add.2
+  %5 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 5), align 4
+  %add.4 = add nsw i32 %5, %add.3
+  %6 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 6), align 8
+  %add.5 = add nsw i32 %6, %add.4
+  %7 = load i32, i32* getelementptr inbounds ([32 x i32], [32 x i32]* @arr_i32, i64 0, i64 7), align 4
+  %add.6 = add nsw i32 %7, %add.5
+  %res = invoke i32 @foobar(i32 %add.6) to label %normal unwind label %exception
+exception:
+  %cleanup = landingpad i8 cleanup
+  br label %normal
+normal:
+  ret void
+}
+
+declare i32 @__gxx_personality_v0(...)
diff --git a/test/Transforms/SLPVectorizer/X86/jumbled-load.ll b/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
index 1c70a291a866b8c85131869ea81d969c45ffb4ac..06e051a90b0d933519007a2431ba7609be426837 100644
--- a/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
+++ b/test/Transforms/SLPVectorizer/X86/jumbled-load.ll
@@ -1,31 +1,38 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-threshold=-10 -slp-vectorizer | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
 
 
 
 define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
 ; CHECK-LABEL: @jumbled-load(
 ; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* %in, i64 0
+; CHECK-NEXT:    [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
+; CHECK-NEXT:    [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
+; CHECK-NEXT:    [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 2, i32 0>
+; CHECK-NEXT:    [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
 ; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* %inn, i64 0
+; CHECK-NEXT:    [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
+; CHECK-NEXT:    [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
+; CHECK-NEXT:    [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 0, i32 1, i32 3, i32 2>
-; CHECK-NEXT:    [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[LOAD_3]], [[LOAD_5]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_8]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[LOAD_4]], [[LOAD_7]]
+; CHECK-NEXT:    [[MUL_4:%.*]] = mul i32 [[LOAD_1]], [[LOAD_6]]
 ; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* %out, i64 0
+; CHECK-NEXT:    store i32 [[MUL_1]], i32* [[GEP_7]], align 4
 ; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* %out, i64 1
+; CHECK-NEXT:    store i32 [[MUL_2]], i32* [[GEP_8]], align 4
 ; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* %out, i64 2
+; CHECK-NEXT:    store i32 [[MUL_3]], i32* [[GEP_9]], align 4
 ; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* %out, i64 3
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT:    store i32 [[MUL_4]], i32* [[GEP_10]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
   %in.addr = getelementptr inbounds i32, i32* %in, i64 0
diff --git a/test/Transforms/SLPVectorizer/X86/reduction_loads.ll b/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
index 1ffbe0d87bf236e6dd1a0685ac884084b8f776a0..47a6a44611d8905093ea7361868af3817726994e 100644
--- a/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
+++ b/test/Transforms/SLPVectorizer/X86/reduction_loads.ll
@@ -5,17 +5,17 @@
 define i32 @test(i32* nocapture readonly %p) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* %p, i64 1
-; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* %p, i64 2
-; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* %p, i64 3
-; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* %p, i64 4
-; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* %p, i64 5
-; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* %p, i64 6
-; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* %p, i64 7
-; CHECK-NEXT:    br label %for.body
+; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds i32, i32* [[P:%.*]], i64 1
+; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 2
+; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 3
+; CHECK-NEXT:    [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 4
+; CHECK-NEXT:    [[ARRAYIDX_5:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 5
+; CHECK-NEXT:    [[ARRAYIDX_6:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 6
+; CHECK-NEXT:    [[ARRAYIDX_7:%.*]] = getelementptr inbounds i32, i32* [[P]], i64 7
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
-; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, %entry ], [ %add.7, %for.body ]
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* %p to <8 x i32>*
+; CHECK-NEXT:    [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[BIN_EXTRA:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i32* [[P]] to <8 x i32>*
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP0]], align 4
 ; CHECK-NEXT:    [[TMP2:%.*]] = mul <8 x i32> <i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42, i32 42>, [[TMP1]]
 ; CHECK-NEXT:    [[ADD:%.*]] = add i32 undef, [[SUM]]
@@ -31,11 +31,12 @@ define i32 @test(i32* nocapture readonly %p) {
 ; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <8 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <8 x i32> [[BIN_RDX2]], <8 x i32> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
 ; CHECK-NEXT:    [[BIN_RDX4:%.*]] = add <8 x i32> [[BIN_RDX2]], [[RDX_SHUF3]]
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
-; CHECK-NEXT:    [[ADD_7:%.*]] = add i32 [[TMP4]], [[SUM]]
-; CHECK-NEXT:    br i1 true, label %for.end, label %for.body
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <8 x i32> [[BIN_RDX4]], i32 0
+; CHECK-NEXT:    [[BIN_EXTRA]] = add i32 [[TMP3]], [[SUM]]
+; CHECK-NEXT:    [[ADD_7:%.*]] = add i32 undef, [[ADD_6]]
+; CHECK-NEXT:    br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]]
 ; CHECK:       for.end:
-; CHECK-NEXT:    ret i32 [[ADD_7]]
+; CHECK-NEXT:    ret i32 [[BIN_EXTRA]]
 ;
 entry:
   %arrayidx.1 = getelementptr inbounds i32, i32* %p, i64 1
diff --git a/test/Transforms/SLPVectorizer/X86/scheduling.ll b/test/Transforms/SLPVectorizer/X86/scheduling.ll
index 5377ee82cf97ebf844d34b96e70159e2b53fdba6..c4f521c8963efb00dbb333c596497fa5e11152d7 100644
--- a/test/Transforms/SLPVectorizer/X86/scheduling.ll
+++ b/test/Transforms/SLPVectorizer/X86/scheduling.ll
@@ -8,11 +8,11 @@ define i32 @foo(i32* nocapture readonly %diff) #0 {
 ; CHECK: [[S1:%.+]] = add nsw <4 x i32>
 ; CHECK: store <4 x i32> [[S1]],
 ; CHECK:         [[RDX_SHUF:%.*]] = shufflevector <4 x i32> [[S1]], <4 x i32> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX:%.*]] = add <4 x i32> [[S1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[BIN_RDX:%.*]] = add nsw <4 x i32> [[S1]], [[RDX_SHUF]]
 ; CHECK-NEXT:    [[RDX_SHUF1:%.*]] = shufflevector <4 x i32> [[BIN_RDX]], <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
-; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT:    [[BIN_RDX2:%.*]] = add nsw <4 x i32> [[BIN_RDX]], [[RDX_SHUF1]]
 ; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <4 x i32> [[BIN_RDX2]], i32 0
-; CHECK-NEXT:    [[ADD52:%.*]] = add nsw i32 [[TMP15]],
+; CHECK:         [[ADD52:%.*]] = add nsw i32 [[TMP15]],
 ; CHECK:          ret i32 [[ADD52]]
 ;
 entry:
diff --git a/test/Transforms/SLPVectorizer/X86/store-jumbled.ll b/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
index 3f772c701455522081341d2bbef256d44ca7a71e..1b2c76384e0b42d503a7a27bc43728aa4917768a 100644
--- a/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
+++ b/test/Transforms/SLPVectorizer/X86/store-jumbled.ll
@@ -1,31 +1,38 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-threshold=-10 -slp-vectorizer | FileCheck %s
+; RUN: opt < %s -S -mtriple=x86_64-unknown -mattr=+avx -slp-vectorizer | FileCheck %s
 
 
 
 define i32 @jumbled-load(i32* noalias nocapture %in, i32* noalias nocapture %inn, i32* noalias nocapture %out) {
 ; CHECK-LABEL: @jumbled-load(
 ; CHECK-NEXT:    [[IN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[IN:%.*]], i64 0
+; CHECK-NEXT:    [[LOAD_1:%.*]] = load i32, i32* [[IN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_1:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 1
+; CHECK-NEXT:    [[LOAD_2:%.*]] = load i32, i32* [[GEP_1]], align 4
 ; CHECK-NEXT:    [[GEP_2:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 2
+; CHECK-NEXT:    [[LOAD_3:%.*]] = load i32, i32* [[GEP_2]], align 4
 ; CHECK-NEXT:    [[GEP_3:%.*]] = getelementptr inbounds i32, i32* [[IN_ADDR]], i64 3
-; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i32* [[IN_ADDR]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x i32>, <4 x i32>* [[TMP1]], align 4
-; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
+; CHECK-NEXT:    [[LOAD_4:%.*]] = load i32, i32* [[GEP_3]], align 4
 ; CHECK-NEXT:    [[INN_ADDR:%.*]] = getelementptr inbounds i32, i32* [[INN:%.*]], i64 0
+; CHECK-NEXT:    [[LOAD_5:%.*]] = load i32, i32* [[INN_ADDR]], align 4
 ; CHECK-NEXT:    [[GEP_4:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 1
+; CHECK-NEXT:    [[LOAD_6:%.*]] = load i32, i32* [[GEP_4]], align 4
 ; CHECK-NEXT:    [[GEP_5:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 2
+; CHECK-NEXT:    [[LOAD_7:%.*]] = load i32, i32* [[GEP_5]], align 4
 ; CHECK-NEXT:    [[GEP_6:%.*]] = getelementptr inbounds i32, i32* [[INN_ADDR]], i64 3
-; CHECK-NEXT:    [[TMP4:%.*]] = bitcast i32* [[INN_ADDR]] to <4 x i32>*
-; CHECK-NEXT:    [[TMP5:%.*]] = load <4 x i32>, <4 x i32>* [[TMP4]], align 4
-; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 0, i32 2>
-; CHECK-NEXT:    [[TMP7:%.*]] = mul <4 x i32> [[TMP3]], [[TMP6]]
+; CHECK-NEXT:    [[LOAD_8:%.*]] = load i32, i32* [[GEP_6]], align 4
+; CHECK-NEXT:    [[MUL_1:%.*]] = mul i32 [[LOAD_1]], [[LOAD_5]]
+; CHECK-NEXT:    [[MUL_2:%.*]] = mul i32 [[LOAD_2]], [[LOAD_6]]
+; CHECK-NEXT:    [[MUL_3:%.*]] = mul i32 [[LOAD_3]], [[LOAD_7]]
+; CHECK-NEXT:    [[MUL_4:%.*]] = mul i32 [[LOAD_4]], [[LOAD_8]]
 ; CHECK-NEXT:    [[GEP_7:%.*]] = getelementptr inbounds i32, i32* [[OUT:%.*]], i64 0
 ; CHECK-NEXT:    [[GEP_8:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 1
 ; CHECK-NEXT:    [[GEP_9:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 2
 ; CHECK-NEXT:    [[GEP_10:%.*]] = getelementptr inbounds i32, i32* [[OUT]], i64 3
-; CHECK-NEXT:    [[TMP8:%.*]] = bitcast i32* [[GEP_7]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP7]], <4 x i32>* [[TMP8]], align 4
+; CHECK-NEXT:    store i32 [[MUL_1]], i32* [[GEP_9]], align 4
+; CHECK-NEXT:    store i32 [[MUL_2]], i32* [[GEP_7]], align 4
+; CHECK-NEXT:    store i32 [[MUL_3]], i32* [[GEP_10]], align 4
+; CHECK-NEXT:    store i32 [[MUL_4]], i32* [[GEP_8]], align 4
 ; CHECK-NEXT:    ret i32 undef
 ;
   %in.addr = getelementptr inbounds i32, i32* %in, i64 0
diff --git a/test/Transforms/SLPVectorizer/X86/vector.ll b/test/Transforms/SLPVectorizer/X86/vector.ll
index 02a18979c659698ebcccc5f6bd254789afa17db2..e1f3fa50ccdbe389967e16d4733ef2eddb4536ec 100644
--- a/test/Transforms/SLPVectorizer/X86/vector.ll
+++ b/test/Transforms/SLPVectorizer/X86/vector.ll
@@ -1,14 +1,49 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt < %s -slp-vectorizer -S -mtriple=x86_64-apple-macosx10.8.0 -mcpu=corei7-avx | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-apple-macosx10.8.0"
 
 ; Make sure that we are not crashing or changing the code.
-;CHECK: test
-;CHECK: icmp
-;CHECK: ret
 define void @test(<4 x i32> %in, <4 x i32> %in2) {
+; CHECK-LABEL: @test(
+; CHECK-NEXT:    [[K:%.*]] = icmp eq <4 x i32> [[IN:%.*]], [[IN2:%.*]]
+; CHECK-NEXT:    ret void
+;
   %k = icmp eq <4 x i32> %in, %in2
   ret void
 }
 
+define i1 @cmpv2f32(<2 x i32> %x, <2 x i32> %y) {
+; CHECK-LABEL: @cmpv2f32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X0:%.*]] = extractelement <2 x i32> [[X:%.*]], i32 0
+; CHECK-NEXT:    [[Y0:%.*]] = extractelement <2 x i32> [[Y:%.*]], i32 0
+; CHECK-NEXT:    [[CMP0:%.*]] = icmp eq i32 [[X0]], [[Y0]]
+; CHECK-NEXT:    br i1 [[CMP0]], label [[IF:%.*]], label [[ENDIF:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[X1:%.*]] = extractelement <2 x i32> [[X]], i32 1
+; CHECK-NEXT:    [[Y1:%.*]] = extractelement <2 x i32> [[Y]], i32 1
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[X1]], [[Y1]]
+; CHECK-NEXT:    br label [[ENDIF]]
+; CHECK:       endif:
+; CHECK-NEXT:    [[AND_OF_CMPS:%.*]] = phi i1 [ false, [[ENTRY:%.*]] ], [ [[CMP1]], [[IF]] ]
+; CHECK-NEXT:    ret i1 [[AND_OF_CMPS]]
+;
+  entry:
+  %x0 = extractelement <2 x i32> %x, i32 0
+  %y0 = extractelement <2 x i32> %y, i32 0
+  %cmp0 = icmp eq i32 %x0, %y0
+  br i1 %cmp0, label %if, label %endif
+
+  if:
+  %x1 = extractelement <2 x i32> %x, i32 1
+  %y1 = extractelement <2 x i32> %y, i32 1
+  %cmp1 = icmp eq i32 %x1, %y1
+  br label %endif
+
+  endif:
+  %and_of_cmps = phi i1 [ false, %entry ], [ %cmp1, %if ]
+  ret i1 %and_of_cmps
+}
+
diff --git a/test/Transforms/SROA/alloca-address-space.ll b/test/Transforms/SROA/alloca-address-space.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6b3b3abbff5ff710f80984d040507d8c793d27c8
--- /dev/null
+++ b/test/Transforms/SROA/alloca-address-space.ll
@@ -0,0 +1,84 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64-A2"
+
+declare void @llvm.memcpy.p2i8.p2i8.i32(i8 addrspace(2)* nocapture, i8 addrspace(2)* nocapture readonly, i32, i32, i1)
+declare void @llvm.memcpy.p1i8.p2i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(2)* nocapture readonly, i32, i32, i1)
+declare void @llvm.memcpy.p2i8.p1i8.i32(i8 addrspace(2)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i32, i1)
+declare void @llvm.memcpy.p1i8.p1i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i32, i32, i1)
+
+
+
+; CHECK-LABEL: @test_address_space_1_1(
+; CHECK: load <2 x i64>, <2 x i64> addrspace(1)* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64> addrspace(1)* {{.*}}, align 2
+; CHECK: ret void
+define void @test_address_space_1_1(<2 x i64> addrspace(1)* %a, i16 addrspace(1)* %b) {
+  %aa = alloca <2 x i64>, align 16, addrspace(2)
+  %aptr = bitcast <2 x i64> addrspace(1)* %a to i8 addrspace(1)*
+  %aaptr = bitcast <2 x i64> addrspace(2)* %aa to i8 addrspace(2)*
+  call void @llvm.memcpy.p2i8.p1i8.i32(i8 addrspace(2)* %aaptr, i8 addrspace(1)* %aptr, i32 16, i32 2, i1 false)
+  %bptr = bitcast i16 addrspace(1)* %b to i8 addrspace(1)*
+  call void @llvm.memcpy.p1i8.p2i8.i32(i8 addrspace(1)* %bptr, i8 addrspace(2)* %aaptr, i32 16, i32 2, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @test_address_space_1_0(
+; CHECK: load <2 x i64>, <2 x i64> addrspace(1)* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64> addrspace(2)* {{.*}}, align 2
+; CHECK: ret void
+define void @test_address_space_1_0(<2 x i64> addrspace(1)* %a, i16 addrspace(2)* %b) {
+  %aa = alloca <2 x i64>, align 16, addrspace(2)
+  %aptr = bitcast <2 x i64> addrspace(1)* %a to i8 addrspace(1)*
+  %aaptr = bitcast <2 x i64> addrspace(2)* %aa to i8 addrspace(2)*
+  call void @llvm.memcpy.p2i8.p1i8.i32(i8 addrspace(2)* %aaptr, i8 addrspace(1)* %aptr, i32 16, i32 2, i1 false)
+  %bptr = bitcast i16 addrspace(2)* %b to i8 addrspace(2)*
+  call void @llvm.memcpy.p2i8.p2i8.i32(i8 addrspace(2)* %bptr, i8 addrspace(2)* %aaptr, i32 16, i32 2, i1 false)
+  ret void
+}
+
+; CHECK-LABEL: @test_address_space_0_1(
+; CHECK: load <2 x i64>, <2 x i64> addrspace(2)* %a, align 2
+; CHECK: store <2 x i64> {{.*}}, <2 x i64> addrspace(1)* {{.*}}, align 2
+; CHECK: ret void
+define void @test_address_space_0_1(<2 x i64> addrspace(2)* %a, i16 addrspace(1)* %b) {
+  %aa = alloca <2 x i64>, align 16, addrspace(2)
+  %aptr = bitcast <2 x i64> addrspace(2)* %a to i8 addrspace(2)*
+  %aaptr = bitcast <2 x i64> addrspace(2)* %aa to i8 addrspace(2)*
+  call void @llvm.memcpy.p2i8.p2i8.i32(i8 addrspace(2)* %aaptr, i8 addrspace(2)* %aptr, i32 16, i32 2, i1 false)
+  %bptr = bitcast i16 addrspace(1)* %b to i8 addrspace(1)*
+  call void @llvm.memcpy.p1i8.p2i8.i32(i8 addrspace(1)* %bptr, i8 addrspace(2)* %aaptr, i32 16, i32 2, i1 false)
+  ret void
+}
+
+%struct.struct_test_27.0.13 = type { i32, float, i64, i8, [4 x i32] }
+
+; CHECK-LABEL: @copy_struct(
+; CHECK-NOT: memcpy
+define void @copy_struct([5 x i64] %in.coerce) {
+for.end:
+  %in = alloca %struct.struct_test_27.0.13, align 8, addrspace(2)
+  %0 = bitcast %struct.struct_test_27.0.13 addrspace(2)* %in to [5 x i64] addrspace(2)*
+  store [5 x i64] %in.coerce, [5 x i64] addrspace(2)* %0, align 8
+  %scevgep9 = getelementptr %struct.struct_test_27.0.13, %struct.struct_test_27.0.13 addrspace(2)* %in, i32 0, i32 4, i32 0
+  %scevgep910 = bitcast i32 addrspace(2)* %scevgep9 to i8 addrspace(2)*
+  call void @llvm.memcpy.p1i8.p2i8.i32(i8 addrspace(1)* undef, i8 addrspace(2)* %scevgep910, i32 16, i32 4, i1 false)
+  ret void
+}
+
+%union.anon = type { i32* }
+
+@g = common global i32 0, align 4
+@l = common addrspace(3) global i32 0, align 4
+
+; Make sure an illegal bitcast isn't introduced
+; CHECK-LABEL: @pr27557(
+; CHECK: %[[CAST:.*]] = bitcast i32* addrspace(2)* {{.*}} to i32 addrspace(3)* addrspace(2)*
+; CHECK: store i32 addrspace(3)* @l, i32 addrspace(3)* addrspace(2)* %[[CAST]]
+define void @pr27557() {
+  %1 = alloca %union.anon, align 8, addrspace(2)
+  %2 = bitcast %union.anon addrspace(2)* %1 to i32* addrspace(2)*
+  store i32* @g, i32* addrspace(2)* %2, align 8
+  %3 = bitcast %union.anon addrspace(2)* %1 to i32 addrspace(3)* addrspace(2)*
+  store i32 addrspace(3)* @l, i32 addrspace(3)* addrspace(2)* %3, align 8
+  ret void
+}
diff --git a/test/Transforms/SROA/basictest.ll b/test/Transforms/SROA/basictest.ll
index 70096f37be054de60e49e8a675510981f0ac3546..aa00e89ea04f084aaaf60c5dfa5611d827fbf4a5 100644
--- a/test/Transforms/SROA/basictest.ll
+++ b/test/Transforms/SROA/basictest.ll
@@ -3,8 +3,8 @@
 
 target datalayout = "e-p:64:64:64-p1:16:16:16-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64"
 
-declare void @llvm.lifetime.start(i64, i8* nocapture)
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 
 define i32 @test0() {
 ; CHECK-LABEL: @test0(
@@ -16,22 +16,22 @@ entry:
   %a2 = alloca float
 
   %a1.i8 = bitcast i32* %a1 to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %a1.i8)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %a1.i8)
 
   store i32 0, i32* %a1
   %v1 = load i32, i32* %a1
 
-  call void @llvm.lifetime.end(i64 4, i8* %a1.i8)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %a1.i8)
 
   %a2.i8 = bitcast float* %a2 to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %a2.i8)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %a2.i8)
 
   store float 0.0, float* %a2
   %v2 = load float , float * %a2
   %v2.int = bitcast float %v2 to i32
   %sum1 = add i32 %v1, %v2.int
 
-  call void @llvm.lifetime.end(i64 4, i8* %a2.i8)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %a2.i8)
 
   ret i32 %sum1
 }
@@ -1057,7 +1057,7 @@ define void @PR14059.1(double* %d) {
 entry:
   %X.sroa.0.i = alloca double, align 8
   %0 = bitcast double* %X.sroa.0.i to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %0)
 
   ; Store to the low 32-bits...
   %X.sroa.0.0.cast2.i = bitcast double* %X.sroa.0.i to i32*
@@ -1084,7 +1084,7 @@ entry:
   %accum.real.i = load double, double* %d, align 8
   %add.r.i = fadd double %accum.real.i, %X.sroa.0.0.load1.i
   store double %add.r.i, double* %d, align 8
-  call void @llvm.lifetime.end(i64 -1, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %0)
   ret void
 }
 
@@ -1652,7 +1652,7 @@ define void @PR25873(%struct.STest* %outData) {
 entry:
   %tmpData = alloca %struct.STest, align 8
   %0 = bitcast %struct.STest* %tmpData to i8*
-  call void @llvm.lifetime.start(i64 16, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %0)
   %x = getelementptr inbounds %struct.STest, %struct.STest* %tmpData, i64 0, i32 0, i32 0
   store float 1.230000e+02, float* %x, align 8
   %y = getelementptr inbounds %struct.STest, %struct.STest* %tmpData, i64 0, i32 0, i32 1
@@ -1664,7 +1664,7 @@ entry:
   store i64 %3, i64* %2, align 8
   %4 = bitcast %struct.STest* %outData to i8*
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* %4, i8* %0, i64 16, i32 4, i1 false)
-  call void @llvm.lifetime.end(i64 16, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* %0)
   ret void
 }
 
@@ -1677,10 +1677,10 @@ define void @PR27999() unnamed_addr {
 entry-block:
   %0 = alloca [2 x i64], align 8
   %1 = bitcast [2 x i64]* %0 to i8*
-  call void @llvm.lifetime.start(i64 16, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %1)
   %2 = getelementptr inbounds [2 x i64], [2 x i64]* %0, i32 0, i32 1
   %3 = bitcast i64* %2 to i8*
-  call void @llvm.lifetime.end(i64 8, i8* %3)
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* %3)
   ret void
 }
 
@@ -1692,6 +1692,6 @@ bb1:
   %e.7.sroa.6.i = alloca i32, align 1
   %e.7.sroa.6.0.load81.i = load i32, i32* %e.7.sroa.6.i, align 1
   %0 = bitcast i32* %e.7.sroa.6.i to i8*
-  call void @llvm.lifetime.end(i64 2, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 2, i8* %0)
   ret void
 }
diff --git a/test/Transforms/SROA/pr26972.ll b/test/Transforms/SROA/pr26972.ll
index a71058c05b9855bd1d472501ff5198c9a60fab2a..3140a805fc4bc4941674eeb691d123e87f127f5d 100644
--- a/test/Transforms/SROA/pr26972.ll
+++ b/test/Transforms/SROA/pr26972.ll
@@ -10,8 +10,8 @@ target triple = "x86_64-pc-linux"
 define void @fn1() {
   %a = alloca [1073741825 x i32], align 16
   %t0 = bitcast [1073741825 x i32]* %a to i8*
-  call void @llvm.lifetime.end(i64 4294967300, i8* %t0)
+  call void @llvm.lifetime.end.p0i8(i64 4294967300, i8* %t0)
   ret void
 }
 
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
diff --git a/test/Transforms/SROA/preserve-nonnull.ll b/test/Transforms/SROA/preserve-nonnull.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fc5ce6a445fa035aed9fea756a35e24838cae6ef
--- /dev/null
+++ b/test/Transforms/SROA/preserve-nonnull.ll
@@ -0,0 +1,26 @@
+; RUN: opt < %s -sroa -S | FileCheck %s
+;
+; Make sure that SROA doesn't lose nonnull metadata
+; on loads from allocas that get optimized out.
+
+; CHECK-LABEL: define float* @yummy_nonnull
+; CHECK: [[RETURN:%(.*)]] = load float*, float** %arg, align 8
+; CHECK: [[ASSUME:%(.*)]] = icmp ne float* {{.*}}[[RETURN]], null
+; CHECK: call void @llvm.assume(i1 {{.*}}[[ASSUME]])
+; CHECK: ret float* {{.*}}[[RETURN]]
+
+define float* @yummy_nonnull(float** %arg) {
+entry-block:
+	%buf = alloca float*
+
+	%_arg_i8 = bitcast float** %arg to i8*
+	%_buf_i8 = bitcast float** %buf to i8*
+	call void @llvm.memcpy.p0i8.p0i8.i64(i8* %_buf_i8, i8* %_arg_i8, i64 8, i32 8, i1 false)
+
+	%ret = load float*, float** %buf, align 8, !nonnull !0
+	ret float* %ret
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+
+!0 = !{}
diff --git a/test/Transforms/SROA/vector-lifetime-intrinsic.ll b/test/Transforms/SROA/vector-lifetime-intrinsic.ll
index 37cf394382ac5ebf5dc558919a8157b582a3f6b6..abb5cb2ea33421de85b393543d77b2eb220a9054 100644
--- a/test/Transforms/SROA/vector-lifetime-intrinsic.ll
+++ b/test/Transforms/SROA/vector-lifetime-intrinsic.ll
@@ -3,10 +3,10 @@
 target datalayout = "e-p:64:32-i64:32-v32:32-n32-S64"
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #0
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #0
 
 ; Function Attrs: nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #0
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #0
 
 ; CHECK: @wombat
 ; CHECK-NOT: alloca
@@ -15,12 +15,12 @@ define void @wombat(<4 x float> %arg1) {
 bb:
   %tmp = alloca <4 x float>, align 16
   %tmp8 = bitcast <4 x float>* %tmp to i8*
-  call void @llvm.lifetime.start(i64 16, i8* %tmp8)
+  call void @llvm.lifetime.start.p0i8(i64 16, i8* %tmp8)
   store <4 x float> %arg1, <4 x float>* %tmp, align 16
   %tmp17 = bitcast <4 x float>* %tmp to <3 x float>*
   %tmp18 = load <3 x float>, <3 x float>* %tmp17
   %tmp20 = bitcast <4 x float>* %tmp to i8*
-  call void @llvm.lifetime.end(i64 16, i8* %tmp20)
+  call void @llvm.lifetime.end.p0i8(i64 16, i8* %tmp20)
   call void @wombat3(<3 x float> %tmp18)
   ret void
 }
diff --git a/test/Transforms/SafeStack/AArch64/abi_ssp.ll b/test/Transforms/SafeStack/AArch64/abi_ssp.ll
index 5d584d0a76b9d30ba9d631602be29c8a0ab4d41c..c78b20aaa01abce5a32efcf5da2ae605cb1e6e26 100644
--- a/test/Transforms/SafeStack/AArch64/abi_ssp.ll
+++ b/test/Transforms/SafeStack/AArch64/abi_ssp.ll
@@ -1,5 +1,5 @@
-; RUN: opt -safe-stack -S -mtriple=aarch64-linux-android < %s -o - | FileCheck --check-prefix=TLS %s
-
+; RUN: opt -safe-stack -S -mtriple=aarch64-linux-android < %s -o - | FileCheck --check-prefixes=TLS,ANDROID %s
+; RUN: opt -safe-stack -S -mtriple=aarch64-unknown-fuchsia < %s -o - | FileCheck --check-prefixes=TLS,FUCHSIA %s
 
 define void @foo() nounwind uwtable safestack sspreq {
 entry:
@@ -7,7 +7,8 @@ entry:
 ; TLS: call i8* @llvm.thread.pointer()
 
 ; TLS: %[[TP2:.*]] = call i8* @llvm.thread.pointer()
-; TLS: %[[B:.*]] = getelementptr i8, i8* %[[TP2]], i32 40
+; ANDROID: %[[B:.*]] = getelementptr i8, i8* %[[TP2]], i32 40
+; FUCHSIA: %[[B:.*]] = getelementptr i8, i8* %[[TP2]], i32 -16
 ; TLS: %[[C:.*]] = bitcast i8* %[[B]] to i8**
 ; TLS: %[[StackGuard:.*]] = load i8*, i8** %[[C]]
 ; TLS: store i8* %[[StackGuard]], i8** %[[StackGuardSlot:.*]]
diff --git a/test/Transforms/SafeStack/X86/abi_ssp.ll b/test/Transforms/SafeStack/X86/abi_ssp.ll
index ba4ced5b88204b4fec2f167ace5b9ec07b259de0..b489e07a88683f21ea323b312d766433a101341e 100644
--- a/test/Transforms/SafeStack/X86/abi_ssp.ll
+++ b/test/Transforms/SafeStack/X86/abi_ssp.ll
@@ -1,18 +1,25 @@
-; RUN: opt -safe-stack -S -mtriple=i686-pc-linux-gnu < %s -o - | FileCheck --check-prefix=TLS --check-prefix=TLS32 %s
-; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefix=TLS --check-prefix=TLS64 %s
-; RUN: opt -safe-stack -S -mtriple=i686-linux-android < %s -o - | FileCheck --check-prefix=TLS --check-prefix=TLS32 %s
-; RUN: opt -safe-stack -S -mtriple=x86_64-linux-android < %s -o - | FileCheck --check-prefix=TLS --check-prefix=TLS64 %s
+; RUN: opt -safe-stack -S -mtriple=i686-pc-linux-gnu < %s -o - | FileCheck --check-prefixes=COMMON,TLS32 %s
+; RUN: opt -safe-stack -S -mtriple=x86_64-pc-linux-gnu < %s -o - | FileCheck --check-prefixes=COMMON,TLS64 %s
+
+; RUN: opt -safe-stack -S -mtriple=i686-linux-android < %s -o - | FileCheck --check-prefixes=COMMON,GLOBAL32 %s
+; RUN: opt -safe-stack -S -mtriple=i686-linux-android24 < %s -o - | FileCheck --check-prefixes=COMMON,TLS32 %s
+
+; RUN: opt -safe-stack -S -mtriple=x86_64-linux-android < %s -o - | FileCheck --check-prefixes=COMMON,TLS64 %s
+
+; RUN: opt -safe-stack -S -mtriple=x86_64-unknown-fuchsia < %s -o - | FileCheck --check-prefixes=COMMON,FUCHSIA64 %s
 
 define void @foo() safestack sspreq {
 entry:
 ; TLS32: %[[StackGuard:.*]] = load i8*, i8* addrspace(256)* inttoptr (i32 20 to i8* addrspace(256)*)
 ; TLS64: %[[StackGuard:.*]] = load i8*, i8* addrspace(257)* inttoptr (i32 40 to i8* addrspace(257)*)
-; TLS:   store i8* %[[StackGuard]], i8** %[[StackGuardSlot:.*]]
+; FUCHSIA64: %[[StackGuard:.*]] = load i8*, i8* addrspace(257)* inttoptr (i32 16 to i8* addrspace(257)*)
+; GLOBAL32: %[[StackGuard:.*]] = load i8*, i8** @__stack_chk_guard
+; COMMON:   store i8* %[[StackGuard]], i8** %[[StackGuardSlot:.*]]
   %a = alloca i8, align 1
   call void @Capture(i8* %a)
 
-; TLS: %[[A:.*]] = load i8*, i8** %[[StackGuardSlot]]
-; TLS: icmp ne i8* %[[StackGuard]], %[[A]]
+; COMMON: %[[A:.*]] = load i8*, i8** %[[StackGuardSlot]]
+; COMMON: icmp ne i8* %[[StackGuard]], %[[A]]
   ret void
 }
 
diff --git a/test/Transforms/SafeStack/X86/call.ll b/test/Transforms/SafeStack/X86/call.ll
index cbac4ce1bb0d35e9e909a8fc2e2b5337d037bc42..2d78bb1a689882e0179a4ba0fb92471c4bbc59e9 100644
--- a/test/Transforms/SafeStack/X86/call.ll
+++ b/test/Transforms/SafeStack/X86/call.ll
@@ -159,8 +159,8 @@ define void @call_lifetime(i32* %p) {
 entry:
   %q = alloca [100 x i8], align 16
   %0 = bitcast [100 x i8]* %q to i8*
-  call void @llvm.lifetime.start(i64 100, i8* %0)
-  call void @llvm.lifetime.end(i64 100, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 100, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 100, i8* %0)
   ret void
 }
 
@@ -174,5 +174,5 @@ declare void @readnone0(i8* nocapture readnone, i8* nocapture)
 
 declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind argmemonly
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind argmemonly
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind argmemonly
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind argmemonly
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind argmemonly
diff --git a/test/Transforms/SafeStack/X86/coloring-ssp.ll b/test/Transforms/SafeStack/X86/coloring-ssp.ll
index d71babe200df86f272eaf0c641df594a7d560db4..3b04fdf13fbc6c25e0ce0cc726a301992ba65268 100644
--- a/test/Transforms/SafeStack/X86/coloring-ssp.ll
+++ b/test/Transforms/SafeStack/X86/coloring-ssp.ll
@@ -16,19 +16,19 @@ entry:
   %x0 = bitcast i64* %x to i8*
   %y0 = bitcast i64* %y to i8*
 
-  call void @llvm.lifetime.start(i64 -1, i8* %x0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %x0)
 ; CHECK:  getelementptr i8, i8* %[[USP]], i32 -16
   call void @capture64(i64* %x)
-  call void @llvm.lifetime.end(i64 -1, i8* %x0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
 
-  call void @llvm.lifetime.start(i64 -1, i8* %y0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %y0)
 ; CHECK:  getelementptr i8, i8* %[[USP]], i32 -16
   call void @capture64(i64* %y)
-  call void @llvm.lifetime.end(i64 -1, i8* %y0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %y0)
 
   ret void
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture)
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 declare void @capture64(i64*)
diff --git a/test/Transforms/SafeStack/X86/coloring.ll b/test/Transforms/SafeStack/X86/coloring.ll
index 3ed9ccb43f39ebaa1b70b231c0e2b5cae87a950c..76bdf37dbf4e84971ff37ce219ab110dc4eab183 100644
--- a/test/Transforms/SafeStack/X86/coloring.ll
+++ b/test/Transforms/SafeStack/X86/coloring.ll
@@ -10,35 +10,35 @@ entry:
   %x1 = alloca i32, align 4
   %x2 = alloca i32, align 4
   %0 = bitcast i32* %x to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %0)
 
 ; CHECK:  %[[A1:.*]] = getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK:  %[[A2:.*]] = bitcast i8* %[[A1]] to i32*
 ; CHECK:  call void @capture(i32* nonnull %[[A2]])
 
   call void @capture(i32* nonnull %x)
-  call void @llvm.lifetime.end(i64 4, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %0)
   %1 = bitcast i32* %x1 to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %1)
 
 ; CHECK:  %[[B1:.*]] = getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK:  %[[B2:.*]] = bitcast i8* %[[B1]] to i32*
 ; CHECK:  call void @capture(i32* nonnull %[[B2]])
 
   call void @capture(i32* nonnull %x1)
-  call void @llvm.lifetime.end(i64 4, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %1)
   %2 = bitcast i32* %x2 to i8*
-  call void @llvm.lifetime.start(i64 4, i8* %2)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %2)
 
 ; CHECK:  %[[C1:.*]] = getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK:  %[[C2:.*]] = bitcast i8* %[[C1]] to i32*
 ; CHECK:  call void @capture(i32* nonnull %[[C2]])
 
   call void @capture(i32* nonnull %x2)
-  call void @llvm.lifetime.end(i64 4, i8* %2)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %2)
   ret void
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture)
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 declare void @capture(i32*)
diff --git a/test/Transforms/SafeStack/X86/coloring2.ll b/test/Transforms/SafeStack/X86/coloring2.ll
index f3ac6d735c9dc4a904b996da04a64e449d7cf6e3..2a8f871945ffc3ad5995d3bb4117a0b106a5c769 100644
--- a/test/Transforms/SafeStack/X86/coloring2.ll
+++ b/test/Transforms/SafeStack/X86/coloring2.ll
@@ -15,21 +15,21 @@ entry:
   %y0 = bitcast i32* %y to i8*
   %z0 = bitcast i32* %z to i8*
 
-  call void @llvm.lifetime.start(i64 -1, i8* %z0)
-  call void @llvm.lifetime.start(i64 -1, i8* %x0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %z0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %x0)
 
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -4
   call void @capture32(i32* %x)
-  call void @llvm.lifetime.end(i64 -1, i8* %x0)
-  call void @llvm.lifetime.start(i64 -1, i8* %y0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %y0)
 
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -4
   call void @capture32(i32* %y)
-  call void @llvm.lifetime.end(i64 -1, i8* %y0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %y0)
 
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -8
   call void @capture32(i32* %z)
-  call void @llvm.lifetime.end(i64 -1, i8* %z0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %z0)
 
   ret void
 }
@@ -44,11 +44,11 @@ entry:
   %y = alloca i32, align 4
   %x0 = bitcast i32* %x to i8*
 
-  call void @llvm.lifetime.start(i64 -1, i8* %x0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %x0)
 
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -4
   call void @capture32(i32* %x)
-  call void @llvm.lifetime.end(i64 -1, i8* %x0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
 
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -8
   call void @capture32(i32* %y)
@@ -70,21 +70,21 @@ entry:
   %y0 = bitcast i32* %y to i8*
   %z0 = bitcast i64* %z to i8*
 
-  call void @llvm.lifetime.start(i64 -1, i8* %x0)
-  call void @llvm.lifetime.start(i64 -1, i8* %y0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %x0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %y0)
 
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -4
   call void @capture32(i32* %x)
-  call void @llvm.lifetime.end(i64 -1, i8* %x0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
 
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -8
   call void @capture32(i32* %y)
-  call void @llvm.lifetime.end(i64 -1, i8* %y0)
-  call void @llvm.lifetime.start(i64 -1, i8* %z0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %y0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %z0)
 
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -8
   call void @capture64(i64* %z)
-  call void @llvm.lifetime.end(i64 -1, i8* %z0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %z0)
 
   ret void
 }
@@ -103,9 +103,9 @@ entry:
   %y0 = bitcast i32* %y to i8*
   %z0 = bitcast i64* %z to i8*
 
-  call void @llvm.lifetime.start(i64 -1, i8* %x0)
-  call void @llvm.lifetime.start(i64 -1, i8* %y0)
-  call void @llvm.lifetime.start(i64 -1, i8* %z0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %x0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %y0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %z0)
 
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -16
   call void @capture32(i32* %x)
@@ -116,9 +116,9 @@ entry:
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -8
   call void @capture64(i64* %z)
 
-  call void @llvm.lifetime.end(i64 -1, i8* %x0)
-  call void @llvm.lifetime.end(i64 -1, i8* %y0)
-  call void @llvm.lifetime.end(i64 -1, i8* %z0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %y0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %z0)
 
   ret void
 }
@@ -156,9 +156,9 @@ entry:
   %z1 = alloca i64, align 8
   %z2 = alloca i64, align 8
   %0 = bitcast i64* %x1 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %0)
   %1 = bitcast i64* %x2 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %1)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -8
 ; CHECK:   call void @capture64(
   call void @capture64(i64* nonnull %x1)
@@ -169,7 +169,7 @@ entry:
 
 if.then:                                          ; preds = %entry
   %2 = bitcast i64* %y to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %2)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %2)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -24
 ; CHECK:   call void @capture64(
   call void @capture64(i64* nonnull %y)
@@ -177,29 +177,29 @@ if.then:                                          ; preds = %entry
 
 if.then3:                                         ; preds = %if.then
   %3 = bitcast i64* %y1 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %3)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %3)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -32
 ; CHECK:   call void @capture64(
   call void @capture64(i64* nonnull %y1)
-  call void @llvm.lifetime.end(i64 -1, i8* %3)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %3)
   br label %if.end
 
 if.else:                                          ; preds = %if.then
   %4 = bitcast i64* %y2 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %4)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %4)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -32
 ; CHECK:   call void @capture64(
   call void @capture64(i64* nonnull %y2)
-  call void @llvm.lifetime.end(i64 -1, i8* %4)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %4)
   br label %if.end
 
 if.end:                                           ; preds = %if.else, %if.then3
-  call void @llvm.lifetime.end(i64 -1, i8* %2)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %2)
   br label %if.end9
 
 if.else4:                                         ; preds = %entry
   %5 = bitcast i64* %z to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %5)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %5)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -24
 ; CHECK:   call void @capture64(
   call void @capture64(i64* nonnull %z)
@@ -207,29 +207,29 @@ if.else4:                                         ; preds = %entry
 
 if.then6:                                         ; preds = %if.else4
   %6 = bitcast i64* %z1 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %6)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %6)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -32
 ; CHECK:   call void @capture64(
   call void @capture64(i64* nonnull %z1)
-  call void @llvm.lifetime.end(i64 -1, i8* %6)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %6)
   br label %if.end8
 
 if.else7:                                         ; preds = %if.else4
   %7 = bitcast i64* %z2 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %7)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %7)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -32
 ; CHECK:   call void @capture64(
   call void @capture64(i64* nonnull %z2)
-  call void @llvm.lifetime.end(i64 -1, i8* %7)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %7)
   br label %if.end8
 
 if.end8:                                          ; preds = %if.else7, %if.then6
-  call void @llvm.lifetime.end(i64 -1, i8* %5)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %5)
   br label %if.end9
 
 if.end9:                                          ; preds = %if.end8, %if.end
-  call void @llvm.lifetime.end(i64 -1, i8* %1)
-  call void @llvm.lifetime.end(i64 -1, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %0)
   ret void
 }
 
@@ -243,21 +243,21 @@ entry:
   %y = alloca i32, align 4
   %x0 = bitcast i32* %x to i8*
   %y0 = bitcast i32* %y to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %x0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %x0)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK:   call void @capture32(
   call void @capture32(i32* %x)
   br i1 %d, label %bb2, label %bb3
 bb2:
-  call void @llvm.lifetime.start(i64 -1, i8* %y0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %y0)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -8
 ; CHECK:   call void @capture32(
   call void @capture32(i32* %y)
-  call void @llvm.lifetime.end(i64 -1, i8* %y0)
-  call void @llvm.lifetime.end(i64 -1, i8* %x0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %y0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
   ret void
 bb3:
-  call void @llvm.lifetime.end(i64 -1, i8* %x0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
   ret void
 }
 
@@ -270,18 +270,18 @@ entry:
   %y = alloca i32, align 4
   %x0 = bitcast i32* %x to i8*
   %y0 = bitcast i32* %y to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %x0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %x0)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK:   call void @capture32(
   call void @capture32(i32* %x)
-  call void @llvm.lifetime.end(i64 -1, i8* %x0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
   br i1 %d, label %bb2, label %bb3
 bb2:
-  call void @llvm.lifetime.start(i64 -1, i8* %y0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %y0)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK:   call void @capture32(
   call void @capture32(i32* %y)
-  call void @llvm.lifetime.end(i64 -1, i8* %y0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %y0)
   ret void
 bb3:
   ret void
@@ -297,14 +297,14 @@ entry:
   %y = alloca i32, align 4
   %x0 = bitcast i32* %x to i8*
   %y0 = bitcast i32* %y to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %x0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %x0)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK:   call void @capture32(
   call void @capture32(i32* %x)
-  call void @llvm.lifetime.end(i64 -1, i8* %x0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
   br i1 %d, label %bb2, label %bb3
 bb2:
-  call void @llvm.lifetime.start(i64 -1, i8* %y0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %y0)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK:   call void @capture32(
   call void @capture32(i32* %y)
@@ -323,14 +323,14 @@ entry:
   %y = alloca i32, align 4
   %x0 = bitcast i32* %x to i8*
   %y0 = bitcast i32* %y to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %x0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %x0)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK:   call void @capture32(
   call void @capture32(i32* %x)
   br i1 %d, label %bb2, label %bb3
 bb2:
-  call void @llvm.lifetime.end(i64 -1, i8* %x0)
-  call void @llvm.lifetime.start(i64 -1, i8* %y0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %y0)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK:   call void @capture32(
   call void @capture32(i32* %y)
@@ -352,10 +352,10 @@ entry:
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -4
 ; CHECK:   call void @capture32(
   call void @capture32(i32* %x)
-  call void @llvm.lifetime.end(i64 -1, i8* %x0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %x0)
   br i1 %d, label %bb2, label %bb3
 bb2:
-  call void @llvm.lifetime.start(i64 -1, i8* %y0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %y0)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -8
 ; CHECK:   call void @capture32(
   call void @capture32(i32* %y)
@@ -374,29 +374,29 @@ entry:
   %A.i = alloca [100 x i32], align 4
   %B.i = alloca [100 x i32], align 4
   %0 = bitcast [100 x i32]* %A.i to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %0)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %0)
   %1 = bitcast [100 x i32]* %B.i to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %1)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %1)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -400
 ; CHECK:   call void @capture100x32(
   call void @capture100x32([100 x i32]* %A.i)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -800
 ; CHECK:   call void @capture100x32(
   call void @capture100x32([100 x i32]* %B.i)
-  call void @llvm.lifetime.end(i64 -1, i8* %0)
-  call void @llvm.lifetime.end(i64 -1, i8* %1)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %0)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %1)
   %2 = bitcast [100 x i32]* %A.i1 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %2)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %2)
   %3 = bitcast [100 x i32]* %B.i2 to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %3)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %3)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -400
 ; CHECK:   call void @capture100x32(
   call void @capture100x32([100 x i32]* %A.i1)
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -800
 ; CHECK:   call void @capture100x32(
   call void @capture100x32([100 x i32]* %B.i2)
-  call void @llvm.lifetime.end(i64 -1, i8* %2)
-  call void @llvm.lifetime.end(i64 -1, i8* %3)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %2)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %3)
   ret void
 }
 
@@ -408,11 +408,11 @@ entry:
   %buf1 = alloca i8, i32 100000, align 16
   %buf2 = alloca i8, i32 100000, align 16
 
-  call void @llvm.lifetime.start(i64 -1, i8* %buf1)
-  call void @llvm.lifetime.end(i64 -1, i8* %buf1)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %buf1)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %buf1)
 
-  call void @llvm.lifetime.start(i64 -1, i8* %buf1)
-  call void @llvm.lifetime.start(i64 -1, i8* %buf2)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %buf1)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %buf2)
   call void @capture8(i8* %buf1)
   call void @capture8(i8* %buf2)
   ret void
@@ -435,13 +435,13 @@ entry:
   %A.i = alloca [100 x i32], align 4
   %B.i = alloca [100 x i32], align 4
   %0 = bitcast [100 x i32]* %A.i to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %0) nounwind
   %1 = bitcast [100 x i32]* %B.i to i8*
-  call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %1) nounwind
   call void @capture100x32([100 x i32]* %A.i)
   call void @capture100x32([100 x i32]* %B.i)
-  call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
-  call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %1) nounwind
   br label %block2
 
 block2:
@@ -464,13 +464,13 @@ entry:
   %b8 = bitcast [4 x %struct.Klass]* %b.i to i8*
   ; I am used outside the lifetime zone below:
   %z2 = getelementptr inbounds [4 x %struct.Klass], [4 x %struct.Klass]* %a.i, i64 0, i64 0, i32 0
-  call void @llvm.lifetime.start(i64 -1, i8* %a8)
-  call void @llvm.lifetime.start(i64 -1, i8* %b8)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %a8)
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %b8)
   call void @capture8(i8* %a8)
   call void @capture8(i8* %b8)
   %z3 = load i32, i32* %z2, align 16
-  call void @llvm.lifetime.end(i64 -1, i8* %a8)
-  call void @llvm.lifetime.end(i64 -1, i8* %b8)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %a8)
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %b8)
   ret i32 %z3
 }
 
@@ -480,12 +480,12 @@ entry:
 ; CHECK:        %[[USP:.*]] = load i8*, i8** @__safestack_unsafe_stack_ptr
 ; CHECK-NEXT:   getelementptr i8, i8* %[[USP]], i32 -16
   %x = alloca i8, align 4
-  call void @llvm.lifetime.start(i64 4, i8* %x) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %x) nounwind
   br label %l2
 
 l2:
   call void @capture8(i8* %x)
-  call void @llvm.lifetime.end(i64 4, i8* %x) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %x) nounwind
   br label %l2
 }
 
@@ -498,23 +498,23 @@ entry:
 ; CHECK-NEXT:   getelementptr i8, i8* %[[USP]], i32 -16
   %x = alloca i8, align 4
   %y = alloca i8, align 4
-  call void @llvm.lifetime.start(i64 4, i8* %x) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %x) nounwind
   br label %l2
 
 l2:
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -8
-  call void @llvm.lifetime.start(i64 4, i8* %y) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %y) nounwind
   call void @capture8(i8* %y)
-  call void @llvm.lifetime.end(i64 4, i8* %y) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %y) nounwind
 
 ; CHECK:   getelementptr i8, i8* %[[USP]], i32 -4
-  call void @llvm.lifetime.start(i64 4, i8* %x) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %x) nounwind
   call void @capture8(i8* %x)
   br label %l2
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture)
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 declare void @capture8(i8*)
 declare void @capture32(i32*)
 declare void @capture64(i64*)
diff --git a/test/Transforms/SafeStack/X86/debug-loc2.ll b/test/Transforms/SafeStack/X86/debug-loc2.ll
index 35e9b7711d2f7ba8a6bca01aebb46fa7349583e4..8059a722fd45c62a896d1b200c78017ba7d7bc86 100644
--- a/test/Transforms/SafeStack/X86/debug-loc2.ll
+++ b/test/Transforms/SafeStack/X86/debug-loc2.ll
@@ -40,12 +40,12 @@ entry:
 }
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 
 declare void @capture(i32*) #2
 
 ; Function Attrs: argmemonly nounwind
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #3
diff --git a/test/Transforms/SafeStack/X86/layout-frag.ll b/test/Transforms/SafeStack/X86/layout-frag.ll
index 125eb0f8be9ae8ae580bd7011f44241f98a22877..b127defc2c5d86d47346e6e94fa3118ceb4876c2 100644
--- a/test/Transforms/SafeStack/X86/layout-frag.ll
+++ b/test/Transforms/SafeStack/X86/layout-frag.ll
@@ -14,16 +14,16 @@ entry:
   %x0a = bitcast i64* %x0 to i8*
   %x2a = bitcast i64* %x2 to i8*
 
-  call void @llvm.lifetime.start(i64 4, i8* %x0a)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %x0a)
   call void @capture64(i64* %x0)
-  call void @llvm.lifetime.end(i64 4, i8* %x0a)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %x0a)
 
-  call void @llvm.lifetime.start(i64 4, i8* %x1)
-  call void @llvm.lifetime.start(i64 4, i8* %x2a)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %x1)
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %x2a)
   call void @capture8(i8* %x1)
   call void @capture64(i64* %x2)
-  call void @llvm.lifetime.end(i64 4, i8* %x1)
-  call void @llvm.lifetime.end(i64 4, i8* %x2a)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %x1)
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %x2a)
 
 ; Test that i64 allocas share space.
 ; CHECK: getelementptr i8, i8* %unsafe_stack_ptr, i32 -8
@@ -33,7 +33,7 @@ entry:
   ret void
 }
 
-declare void @llvm.lifetime.start(i64, i8* nocapture)
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
 declare void @capture8(i8*)
 declare void @capture64(i64*)
diff --git a/test/Transforms/SampleProfile/Inputs/import.prof b/test/Transforms/SampleProfile/Inputs/import.prof
new file mode 100644
index 0000000000000000000000000000000000000000..efadc0c5c9c6d803e3a0525915936e8d3e4d5c8b
--- /dev/null
+++ b/test/Transforms/SampleProfile/Inputs/import.prof
@@ -0,0 +1,4 @@
+main:10000:0
+ 3: foo:1000
+  3: bar:200
+   4: baz:10
diff --git a/test/Transforms/SampleProfile/Inputs/indirect-call.afdo b/test/Transforms/SampleProfile/Inputs/indirect-call.afdo
new file mode 100644
index 0000000000000000000000000000000000000000..2d5b345e960e75027e9a9d9e69e9d35366224554
Binary files /dev/null and b/test/Transforms/SampleProfile/Inputs/indirect-call.afdo differ
diff --git a/test/Transforms/SampleProfile/Inputs/indirect-call.prof b/test/Transforms/SampleProfile/Inputs/indirect-call.prof
index 534975e6270855e49c39b7e38ae08c611082c82f..ac32967bd546a7dd9b8bb20a0c641e5bcebf59bd 100644
--- a/test/Transforms/SampleProfile/Inputs/indirect-call.prof
+++ b/test/Transforms/SampleProfile/Inputs/indirect-call.prof
@@ -1,2 +1,11 @@
 test:63067:0
  4: 3345 _Z3barv:1398 _Z3foov:2059
+test_inline:3000:0
+ 5: foo_inline:3000
+  1: 3000
+test_noinline:3000:0
+ 5: foo_noinline:3000
+  1: 3000
+test_direct:3000:0
+ 5: foo_direct:3000
+  1: 3000
diff --git a/test/Transforms/SampleProfile/branch.ll b/test/Transforms/SampleProfile/branch.ll
index 2ef01a76b0f0856ea14b68736c876c1f7cb3a740..5a5160e6343aa829eebfce75671412cff254c629 100644
--- a/test/Transforms/SampleProfile/branch.ll
+++ b/test/Transforms/SampleProfile/branch.ll
@@ -87,7 +87,9 @@ for.cond:                                         ; preds = %for.inc, %if.then.2
   %6 = load i32, i32* %u, align 4, !dbg !46
   %7 = load i32, i32* %limit, align 4, !dbg !48
   %cmp5 = icmp slt i32 %6, %7, !dbg !49
-  br i1 %cmp5, label %for.body, label %for.end, !dbg !50
+  br i1 %cmp5, label %for.body, label %for.end, !dbg !50, !prof !80
+; CHECK: edge for.cond -> for.body probability is 0x73333333 / 0x80000000 = 90.00%
+; CHECK: edge for.cond -> for.end probability is 0x0ccccccd / 0x80000000 = 10.00%
 
 for.body:                                         ; preds = %for.cond
   call void @llvm.dbg.declare(metadata double* %x, metadata !51, metadata !17), !dbg !53
@@ -237,3 +239,4 @@ attributes #4 = { nounwind readonly }
 !77 = !DILocation(line: 20, column: 4, scope: !6)
 !78 = !DILocation(line: 21, column: 4, scope: !6)
 !79 = !DILocation(line: 22, column: 2, scope: !6)
+!80 = !{!"branch_weights", i32 90, i32 10}
diff --git a/test/Transforms/SampleProfile/calls.ll b/test/Transforms/SampleProfile/calls.ll
index 45909ddf3e5475d6e15bfcf388c902ceaaedf0d6..3539c771627a911ff905eaec9206405d84c82386 100644
--- a/test/Transforms/SampleProfile/calls.ll
+++ b/test/Transforms/SampleProfile/calls.ll
@@ -48,8 +48,8 @@ while.cond:                                       ; preds = %if.end, %entry
   store i32 %inc, i32* %i, align 4, !dbg !14
   %cmp = icmp slt i32 %0, 400000000, !dbg !14
   br i1 %cmp, label %while.body, label %while.end, !dbg !14
-; CHECK: edge while.cond -> while.body probability is 0x7d9eb367 / 0x80000000 = 98.14% [HOT edge]
-; CHECK: edge while.cond -> while.end probability is 0x02614c99 / 0x80000000 = 1.86%
+; CHECK: edge while.cond -> while.body probability is 0x77f2798d / 0x80000000 = 93.71% [HOT edge]
+; CHECK: edge while.cond -> while.end probability is 0x080d8673 / 0x80000000 = 6.29%
 
 while.body:                                       ; preds = %while.cond
   %1 = load i32, i32* %i, align 4, !dbg !16
@@ -59,8 +59,8 @@ while.body:                                       ; preds = %while.cond
 ; both branches out of while.body had the same weight. In reality,
 ; the edge while.body->if.then is taken most of the time.
 ;
-; CHECK: edge while.body -> if.else probability is 0x00059704 / 0x80000000 = 0.02%
-; CHECK: edge while.body -> if.then probability is 0x7ffa68fc / 0x80000000 = 99.98% [HOT edge]
+; CHECK: edge while.body -> if.else probability is 0x0005b1e0 / 0x80000000 = 0.02%
+; CHECK: edge while.body -> if.then probability is 0x7ffa4e20 / 0x80000000 = 99.98% [HOT edge]
 
 
 if.then:                                          ; preds = %while.body
@@ -103,14 +103,14 @@ declare i32 @printf(i8*, ...) #2
 !12 = !DILocation(line: 8, scope: !7)
 !13 = !DILocation(line: 9, scope: !7)
 !14 = !DILocation(line: 9, scope: !15)
-!15 = !DILexicalBlockFile(discriminator: 1, file: !1, scope: !7)
+!15 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !7)
 !16 = !DILocation(line: 10, scope: !17)
 !17 = distinct !DILexicalBlock(line: 10, column: 0, file: !1, scope: !7)
 !18 = !DILocation(line: 10, scope: !19)
-!19 = !DILexicalBlockFile(discriminator: 1, file: !1, scope: !17)
+!19 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !17)
 !20 = !DILocation(line: 10, scope: !21)
-!21 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !17)
+!21 = !DILexicalBlockFile(discriminator: 4, file: !1, scope: !17)
 !22 = !DILocation(line: 10, scope: !23)
-!23 = !DILexicalBlockFile(discriminator: 3, file: !1, scope: !17)
+!23 = !DILexicalBlockFile(discriminator: 6, file: !1, scope: !17)
 !24 = !DILocation(line: 11, scope: !7)
 !25 = !DILocation(line: 12, scope: !7)
diff --git a/test/Transforms/SampleProfile/cov-zero-samples.ll b/test/Transforms/SampleProfile/cov-zero-samples.ll
index 7ccaa3e7d756b9adfe3e528940636922ae8565e5..5239d74fdc6e7949ebd60dca6c70de3993f75572 100644
--- a/test/Transforms/SampleProfile/cov-zero-samples.ll
+++ b/test/Transforms/SampleProfile/cov-zero-samples.ll
@@ -106,7 +106,7 @@ attributes #0 = { nounwind readnone }
 !13 = !{!14, !14}
 !14 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed)
 !15 = !DILocation(line: 5, column: 27, scope: !16)
-!16 = !DILexicalBlockFile(scope: !11, file: !3, discriminator: 3)
+!16 = !DILexicalBlockFile(scope: !11, file: !3, discriminator: 6)
 !17 = distinct !DISubprogram(name: "main", scope: !3, file: !3, line: 7, type: !18, isLocal: false, isDefinition: true, scopeLine: 7, flags: DIFlagPrototyped, isOptimized: false, unit: !2, variables: !4)
 !18 = !DISubroutineType(types: !19)
 !19 = !{!14}
@@ -118,7 +118,7 @@ attributes #0 = { nounwind readnone }
 !25 = !DILocation(line: 9, column: 18, scope: !24)
 !26 = !DILocation(line: 9, column: 8, scope: !24)
 !27 = !DILocation(line: 9, column: 25, scope: !28)
-!28 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 1)
+!28 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2)
 !29 = distinct !DILexicalBlock(scope: !24, file: !3, line: 9, column: 3)
 !30 = !DILocation(line: 9, column: 29, scope: !28)
 !31 = !DILocation(line: 9, column: 27, scope: !28)
@@ -130,7 +130,7 @@ attributes #0 = { nounwind readnone }
 !37 = !DILocation(line: 10, column: 11, scope: !34)
 !38 = !DILocation(line: 10, column: 9, scope: !35)
 !39 = !DILocation(line: 10, column: 36, scope: !40)
-!40 = !DILexicalBlockFile(scope: !34, file: !3, discriminator: 1)
+!40 = !DILexicalBlockFile(scope: !34, file: !3, discriminator: 2)
 !41 = !DILocation(line: 10, column: 23, scope: !40)
 !42 = !DILocation(line: 10, column: 20, scope: !40)
 !43 = !DILocation(line: 10, column: 16, scope: !40)
@@ -139,7 +139,7 @@ attributes #0 = { nounwind readnone }
 !46 = !DILocation(line: 11, column: 9, scope: !35)
 !47 = !DILocation(line: 12, column: 3, scope: !35)
 !48 = !DILocation(line: 9, column: 33, scope: !49)
-!49 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 2)
+!49 = !DILexicalBlockFile(scope: !29, file: !3, discriminator: 4)
 !50 = !DILocation(line: 9, column: 3, scope: !49)
 !51 = !DILocation(line: 13, column: 25, scope: !17)
 !52 = !DILocation(line: 13, column: 3, scope: !17)
diff --git a/test/Transforms/SampleProfile/discriminator.ll b/test/Transforms/SampleProfile/discriminator.ll
index d0b96a9ea16e623d670f67f2752578e01b2bc83f..85f6cbe8fb4af10de69d08fe6c2c89c1047c1305 100644
--- a/test/Transforms/SampleProfile/discriminator.ll
+++ b/test/Transforms/SampleProfile/discriminator.ll
@@ -79,12 +79,12 @@ while.end:                                        ; preds = %while.cond
 !10 = !DILocation(line: 2, scope: !4)
 !11 = !DILocation(line: 3, scope: !4)
 !12 = !DILocation(line: 3, scope: !13)
-!13 = !DILexicalBlockFile(discriminator: 1, file: !1, scope: !4)
+!13 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !4)
 !14 = !DILocation(line: 4, scope: !15)
 !15 = distinct !DILexicalBlock(line: 4, column: 0, file: !1, scope: !16)
 !16 = distinct !DILexicalBlock(line: 3, column: 0, file: !1, scope: !4)
 !17 = !DILocation(line: 4, scope: !18)
-!18 = !DILexicalBlockFile(discriminator: 1, file: !1, scope: !15)
+!18 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !15)
 !19 = !DILocation(line: 5, scope: !16)
 !20 = !DILocation(line: 6, scope: !16)
 !21 = !DILocation(line: 7, scope: !4)
diff --git a/test/Transforms/SampleProfile/early-inline.ll b/test/Transforms/SampleProfile/early-inline.ll
index 780ff4751f40a6b9a29de1a20c2f6b4a0e180a0d..51e7d243c187d7880bcbac0d300e28c53621b2bb 100644
--- a/test/Transforms/SampleProfile/early-inline.ll
+++ b/test/Transforms/SampleProfile/early-inline.ll
@@ -41,8 +41,8 @@ declare i32 @__gxx_personality_v0(...)
 !1 = !DIFile(filename: "a", directory: "b/")
 !3 = !{i32 2, !"Dwarf Version", i32 4}
 !4 = !{i32 2, !"Debug Info Version", i32 3}
-!6 = distinct !DISubprogram(linkageName: "_Z3foov", scope: !1, line: 5, scopeLine: 5, unit: !0)
+!6 = distinct !DISubprogram(linkageName: "_Z3foov", scope: !1, file: !1, line: 5, scopeLine: 5, unit: !0)
 !9 = !DILocation(line: 6, column: 3, scope: !6)
 !10 = !DILocation(line: 8, column: 5, scope: !11)
 !11 = distinct !DILexicalBlock(scope: !6, file: !1, line: 7, column: 7)
-!12 = distinct !DISubprogram(linkageName: "_ZL3barv", scope: !1, line: 20, scopeLine: 20, unit: !0)
+!12 = distinct !DISubprogram(linkageName: "_ZL3barv", scope: !1, file: !1, line: 20, scopeLine: 20, unit: !0)
diff --git a/test/Transforms/SampleProfile/fnptr.ll b/test/Transforms/SampleProfile/fnptr.ll
index 0c671a7882f6a937c046a78dc8e3eb5d88dc869e..1b01d0c0c85721dbf2ec8fb65dc987526739a40a 100644
--- a/test/Transforms/SampleProfile/fnptr.ll
+++ b/test/Transforms/SampleProfile/fnptr.ll
@@ -8,10 +8,10 @@
 ; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/fnptr.prof | opt -analyze -branch-prob | FileCheck %s
 ; RUN: opt < %s -passes=sample-profile -sample-profile-file=%S/Inputs/fnptr.binprof | opt -analyze -branch-prob | FileCheck %s
 
-; CHECK:   edge for.body3 -> if.then probability is 0x19f584f3 / 0x80000000 = 20.28%
-; CHECK:   edge for.body3 -> if.else probability is 0x660a7b0d / 0x80000000 = 79.72%
-; CHECK:   edge for.inc -> for.inc12 probability is 0x000f92fb / 0x80000000 = 0.05%
-; CHECK:   edge for.inc -> for.body3 probability is 0x7ff06d05 / 0x80000000 = 99.95%
+; CHECK:   edge for.body3 -> if.then probability is 0x1a56a56a / 0x80000000 = 20.58%
+; CHECK:   edge for.body3 -> if.else probability is 0x65a95a96 / 0x80000000 = 79.42%
+; CHECK:   edge for.inc -> for.inc12 probability is 0x000fbd1c / 0x80000000 = 0.05%
+; CHECK:   edge for.inc -> for.body3 probability is 0x7ff042e4 / 0x80000000 = 99.95%
 ; CHECK:   edge for.inc12 -> for.end14 probability is 0x04000000 / 0x80000000 = 3.12%
 ; CHECK:   edge for.inc12 -> for.cond1.preheader probability is 0x7c000000 / 0x80000000 = 96.88%
 
diff --git a/test/Transforms/SampleProfile/import.ll b/test/Transforms/SampleProfile/import.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1ee45fb4fd3ef1291bfec6eb9a3821d506498dc6
--- /dev/null
+++ b/test/Transforms/SampleProfile/import.ll
@@ -0,0 +1,31 @@
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/import.prof -S | FileCheck %s
+
+; Tests whether the functions in the inline stack are added to the
+; function_entry_count metadata.
+
+declare void @foo()
+
+define void @main() !dbg !7 {
+  call void @foo(), !dbg !18
+  ret void
+}
+
+; GUIDs of foo and bar should be included in the metadata to make sure hot
+; inline stacks are imported.
+; CHECK: !{!"function_entry_count", i64 1, i64 6699318081062747564, i64 -2012135647395072713}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!8, !9}
+!llvm.ident = !{!10}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.5 ", isOptimized: false, emissionKind: NoDebug, file: !1, enums: !2, retainedTypes: !2, globals: !2, imports: !2)
+!1 = !DIFile(filename: "calls.cc", directory: ".")
+!2 = !{}
+!6 = !DISubroutineType(types: !2)
+!7 = distinct !DISubprogram(name: "main", line: 7, isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: false, unit: !0, scopeLine: 7, file: !1, scope: !1, type: !6, variables: !2)
+!8 = !{i32 2, !"Dwarf Version", i32 4}
+!9 = !{i32 1, !"Debug Info Version", i32 3}
+!10 = !{!"clang version 3.5 "}
+!15 = !DILexicalBlockFile(discriminator: 1, file: !1, scope: !7)
+!17 = distinct !DILexicalBlock(line: 10, column: 0, file: !1, scope: !7)
+!18 = !DILocation(line: 10, scope: !17)
diff --git a/test/Transforms/SampleProfile/indirect-call-gcc.ll b/test/Transforms/SampleProfile/indirect-call-gcc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..678c7931250eb889f9bc28362b5cb48e1b641d00
--- /dev/null
+++ b/test/Transforms/SampleProfile/indirect-call-gcc.ll
@@ -0,0 +1,26 @@
+; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/indirect-call.afdo -S | FileCheck %s
+
+; Checks if indirect call targets are read correctly when reading from gcc
+; format profile.
+; It is expected to fail on certain architectures as gcc profile reader does
+; not work.
+; XFAIL: powerpc64-, s390x, mips-, mips64-, sparc
+
+define void @test(void ()*) !dbg !3 {
+  %2 = alloca void ()*
+  store void ()* %0, void ()** %2
+  %3 = load void ()*, void ()** %2
+  ; CHECK: call {{.*}}, !prof ![[PROF:[0-9]+]]
+  call void %3(), !dbg !4
+  ret void
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!2}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1)
+!1 = !DIFile(filename: "test.cc", directory: "/")
+!2 = !{i32 2, !"Debug Info Version", i32 3}
+!3 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, unit: !0)
+!4 = !DILocation(line: 5, scope: !3)
+; CHECK: ![[PROF]] = !{!"VP", i32 0, i64 3457, i64 9191153033785521275, i64 2059, i64 -1069303473483922844, i64 1398}
diff --git a/test/Transforms/SampleProfile/indirect-call.ll b/test/Transforms/SampleProfile/indirect-call.ll
index 01192d80e94eb3b399e88811414544349fb3313b..e6e294fd6bfdf7c6f7767b35d5dd0881ae34bd2d 100644
--- a/test/Transforms/SampleProfile/indirect-call.ll
+++ b/test/Transforms/SampleProfile/indirect-call.ll
@@ -1,5 +1,6 @@
 ; RUN: opt < %s -sample-profile -sample-profile-file=%S/Inputs/indirect-call.prof -S | FileCheck %s
 
+; CHECK-LABEL: @test
 define void @test(void ()*) !dbg !3 {
   %2 = alloca void ()*
   store void ()* %0, void ()** %2
@@ -9,6 +10,58 @@ define void @test(void ()*) !dbg !3 {
   ret void
 }
 
+; CHECK-LABEL: @test_inline
+; If the indirect call is promoted and inlined in profile, we should promote and inline it.
+define void @test_inline(i64* (i32*)*, i32* %x) !dbg !3 {
+  %2 = alloca i64* (i32*)*
+  store i64* (i32*)* %0, i64* (i32*)** %2
+  %3 = load i64* (i32*)*, i64* (i32*)** %2
+; CHECK: icmp {{.*}} @foo_inline
+; CHECK: if.true.direct_targ:
+; CHECK-NOT: call
+; CHECK: if.false.orig_indirect:
+; CHECK: call
+  call i64* %3(i32* %x), !dbg !5
+  ret void
+}
+
+; CHECK-LABEL: @test_noinline
+; If the indirect call target is not available, we should not promote it.
+define void @test_noinline(void ()*) !dbg !3 {
+  %2 = alloca void ()*
+  store void ()* %0, void ()** %2
+  %3 = load void ()*, void ()** %2
+; CHECK-NOT: icmp
+; CHECK: call
+  call void %3(), !dbg !5
+  ret void
+}
+
+@x = global i32 0, align 4
+
+define i32* @foo_inline(i32* %x) !dbg !3 {
+  ret i32* %x
+}
+
+define i32 @foo_noinline(i32 %x) !dbg !3 {
+  ret i32 %x
+}
+
+define void @foo_direct() !dbg !3 {
+  ret void
+}
+
+; CHECK-LABEL: @test_direct
+; We should not promote a direct call.
+define void @test_direct() !dbg !3 {
+; CHECK-NOT: icmp
+; CHECK: call
+  call void @foo_alias(), !dbg !5
+  ret void
+}
+
+@foo_alias = alias void (), void ()* @foo_direct
+
 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!2}
 
@@ -17,4 +70,5 @@ define void @test(void ()*) !dbg !3 {
 !2 = !{i32 2, !"Debug Info Version", i32 3}
 !3 = distinct !DISubprogram(name: "test", scope: !1, file: !1, line: 1, unit: !0)
 !4 = !DILocation(line: 5, scope: !3)
+!5 = !DILocation(line: 6, scope: !3)
 ; CHECK: ![[PROF]] = !{!"VP", i32 0, i64 3457, i64 9191153033785521275, i64 2059, i64 -1069303473483922844, i64 1398}
diff --git a/test/Transforms/SampleProfile/inline-coverage.ll b/test/Transforms/SampleProfile/inline-coverage.ll
index c88e7f865fa2f35f9551897a14418c01d0faaa87..080876a464716c469ce82cf8306842e6c554ed3c 100644
--- a/test/Transforms/SampleProfile/inline-coverage.ll
+++ b/test/Transforms/SampleProfile/inline-coverage.ll
@@ -16,7 +16,7 @@
 ;    11      return sum > 0 ? 0 : 1;
 ;    12    }
 ;
-; CHECK: remark: coverage.cc:10:12: inlined hot callee '_Z3fool' with 172746 samples into 'main'
+; CHECK: remark: coverage.cc:10:12: inlined hot callee '_Z3fool' into 'main'
 ; CHECK: remark: coverage.cc:9:21: Applied 23478 samples from profile (offset: 2.1)
 ; CHECK: remark: coverage.cc:10:16: Applied 23478 samples from profile (offset: 3)
 ; CHECK: remark: coverage.cc:4:10: Applied 31878 samples from profile (offset: 1)
@@ -120,7 +120,7 @@ for.end:                                          ; preds = %for.cond
 !27 = !DILocation(line: 9, column: 12, scope: !26)
 !28 = !DILocation(line: 9, column: 8, scope: !26)
 !29 = !DILocation(line: 9, column: 19, scope: !30)
-!30 = !DILexicalBlockFile(scope: !31, file: !1, discriminator: 1)
+!30 = !DILexicalBlockFile(scope: !31, file: !1, discriminator: 2)
 !31 = distinct !DILexicalBlock(scope: !26, file: !1, line: 9, column: 3)
 !32 = !DILocation(line: 9, column: 21, scope: !30)
 !33 = !DILocation(line: 9, column: 3, scope: !30)
diff --git a/test/Transforms/SampleProfile/inline.ll b/test/Transforms/SampleProfile/inline.ll
index ed353834137bd39bff4fa9ecb583a229e2633cce..3ed8988968f6819ace417ded959ef699aa137e33 100644
--- a/test/Transforms/SampleProfile/inline.ll
+++ b/test/Transforms/SampleProfile/inline.ll
@@ -96,14 +96,14 @@ declare i32 @printf(i8*, ...) #2
 !12 = !DILocation(line: 8, scope: !7)
 !13 = !DILocation(line: 9, scope: !7)
 !14 = !DILocation(line: 9, scope: !15)
-!15 = !DILexicalBlockFile(discriminator: 1, file: !1, scope: !7)
+!15 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !7)
 !16 = !DILocation(line: 10, scope: !17)
 !17 = distinct !DILexicalBlock(line: 10, column: 0, file: !1, scope: !7)
 !18 = !DILocation(line: 10, scope: !19)
-!19 = !DILexicalBlockFile(discriminator: 1, file: !1, scope: !17)
+!19 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !17)
 !20 = !DILocation(line: 10, scope: !21)
-!21 = !DILexicalBlockFile(discriminator: 2, file: !1, scope: !17)
+!21 = !DILexicalBlockFile(discriminator: 4, file: !1, scope: !17)
 !22 = !DILocation(line: 10, scope: !23)
-!23 = !DILexicalBlockFile(discriminator: 3, file: !1, scope: !17)
+!23 = !DILexicalBlockFile(discriminator: 6, file: !1, scope: !17)
 !24 = !DILocation(line: 11, scope: !7)
 !25 = !DILocation(line: 12, scope: !7)
diff --git a/test/Transforms/SampleProfile/propagate.ll b/test/Transforms/SampleProfile/propagate.ll
index 45e3b8003ffc1444697d66d34b8639f0e8f54618..5a4922bde93586c33c94a86d8d8669ea532763a4 100644
--- a/test/Transforms/SampleProfile/propagate.ll
+++ b/test/Transforms/SampleProfile/propagate.ll
@@ -244,7 +244,7 @@ attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-
 !31 = !DILocation(line: 7, column: 15, scope: !29)
 !32 = !DILocation(line: 7, column: 10, scope: !29)
 !33 = !DILocation(line: 7, column: 22, scope: !34)
-!34 = !DILexicalBlockFile(scope: !35, file: !1, discriminator: 1)
+!34 = !DILexicalBlockFile(scope: !35, file: !1, discriminator: 2)
 !35 = distinct !DILexicalBlock(scope: !29, file: !1, line: 7, column: 5)
 !36 = !DILocation(line: 7, column: 26, scope: !34)
 !37 = !DILocation(line: 7, column: 24, scope: !34)
@@ -275,7 +275,7 @@ attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-
 !62 = !DILocation(line: 14, column: 24, scope: !59)
 !63 = !DILocation(line: 14, column: 14, scope: !59)
 !64 = !DILocation(line: 14, column: 31, scope: !65)
-!65 = !DILexicalBlockFile(scope: !66, file: !1, discriminator: 1)
+!65 = !DILexicalBlockFile(scope: !66, file: !1, discriminator: 2)
 !66 = distinct !DILexicalBlock(scope: !59, file: !1, line: 14, column: 9)
 !67 = !DILocation(line: 14, column: 33, scope: !65)
 !68 = !DILocation(line: 14, column: 9, scope: !65)
@@ -285,11 +285,11 @@ attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-
 !72 = !DILocation(line: 16, column: 13, scope: !70)
 !73 = !DILocation(line: 17, column: 9, scope: !70)
 !74 = !DILocation(line: 14, column: 41, scope: !75)
-!75 = !DILexicalBlockFile(scope: !66, file: !1, discriminator: 2)
+!75 = !DILexicalBlockFile(scope: !66, file: !1, discriminator: 4)
 !76 = !DILocation(line: 14, column: 9, scope: !75)
 !77 = !DILocation(line: 19, column: 5, scope: !41)
 !78 = !DILocation(line: 7, column: 30, scope: !79)
-!79 = !DILexicalBlockFile(scope: !35, file: !1, discriminator: 2)
+!79 = !DILexicalBlockFile(scope: !35, file: !1, discriminator: 4)
 !80 = !DILocation(line: 7, column: 5, scope: !79)
 !81 = !DILocation(line: 21, column: 10, scope: !6)
 !82 = !DILocation(line: 21, column: 14, scope: !6)
@@ -313,5 +313,5 @@ attributes #3 = { "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-
 !100 = !DILocation(line: 28, column: 57, scope: !86)
 !101 = !DILocation(line: 28, column: 47, scope: !86)
 !102 = !DILocation(line: 28, column: 3, scope: !103)
-!103 = !DILexicalBlockFile(scope: !86, file: !1, discriminator: 1)
+!103 = !DILexicalBlockFile(scope: !86, file: !1, discriminator: 2)
 !104 = !DILocation(line: 29, column: 3, scope: !86)
diff --git a/test/Transforms/SampleProfile/remarks.ll b/test/Transforms/SampleProfile/remarks.ll
index 908e4f8b10b4c78334dbecc943c02ddfedb5c823..dfb075ee00eafad51d24e22ce5de8b3ddd316053 100644
--- a/test/Transforms/SampleProfile/remarks.ll
+++ b/test/Transforms/SampleProfile/remarks.ll
@@ -19,7 +19,7 @@
 
 ; We are expecting foo() to be inlined in main() (almost all the cycles are
 ; spent inside foo).
-; CHECK: remark: remarks.cc:13:21: inlined hot callee '_Z3foov' with 623868 samples into 'main'
+; CHECK: remark: remarks.cc:13:21: inlined hot callee '_Z3foov' into 'main'
 
 ; The back edge for the loop is the hottest edge in the loop subgraph.
 ; CHECK: remark: remarks.cc:6:9: most popular destination for conditional branches at remarks.cc:5:3
@@ -33,11 +33,11 @@ entry:
   %sum = alloca i64, align 8
   %i = alloca i32, align 4
   %0 = bitcast i64* %sum to i8*, !dbg !19
-  call void @llvm.lifetime.start(i64 8, i8* %0) #4, !dbg !19
+  call void @llvm.lifetime.start.p0i8(i64 8, i8* %0) #4, !dbg !19
   call void @llvm.dbg.declare(metadata i64* %sum, metadata !9, metadata !20), !dbg !21
   store i64 0, i64* %sum, align 8, !dbg !21, !tbaa !22
   %1 = bitcast i32* %i to i8*, !dbg !26
-  call void @llvm.lifetime.start(i64 4, i8* %1) #4, !dbg !26
+  call void @llvm.lifetime.start.p0i8(i64 4, i8* %1) #4, !dbg !26
   call void @llvm.dbg.declare(metadata i32* %i, metadata !10, metadata !20), !dbg !27
   store i32 0, i32* %i, align 4, !dbg !27, !tbaa !28
   br label %for.cond, !dbg !26
@@ -49,7 +49,7 @@ for.cond:                                         ; preds = %for.inc, %entry
 
 for.cond.cleanup:                                 ; preds = %for.cond
   %3 = bitcast i32* %i to i8*, !dbg !36
-  call void @llvm.lifetime.end(i64 4, i8* %3) #4, !dbg !36
+  call void @llvm.lifetime.end.p0i8(i64 4, i8* %3) #4, !dbg !36
   br label %for.end
 
 for.body:                                         ; preds = %for.cond
@@ -88,12 +88,12 @@ for.inc:                                          ; preds = %if.end
 for.end:                                          ; preds = %for.cond.cleanup
   %10 = load i64, i64* %sum, align 8, !dbg !53, !tbaa !22
   %11 = bitcast i64* %sum to i8*, !dbg !54
-  call void @llvm.lifetime.end(i64 8, i8* %11) #4, !dbg !54
+  call void @llvm.lifetime.end.p0i8(i64 8, i8* %11) #4, !dbg !54
   ret i64 %10, !dbg !55
 }
 
 ; Function Attrs: nounwind argmemonly
-declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) #1
 
 ; Function Attrs: nounwind readnone
 declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
@@ -102,7 +102,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) #2
 declare i32 @rand() #3
 
 ; Function Attrs: nounwind argmemonly
-declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) #1
 
 ; Function Attrs: nounwind uwtable
 define i32 @main() #0 !dbg !13 {
diff --git a/test/Transforms/Scalarizer/vector-gep.ll b/test/Transforms/Scalarizer/vector-gep.ll
new file mode 100644
index 0000000000000000000000000000000000000000..eacddf136a324dfd780e2fa1bd7a03a48aefe000
--- /dev/null
+++ b/test/Transforms/Scalarizer/vector-gep.ll
@@ -0,0 +1,122 @@
+; RUN: opt -S -scalarizer %s | FileCheck %s
+
+; Check that the scalarizer can handle vector GEPs with scalar indices
+
+@vec = global <4 x i16*> <i16* null, i16* null, i16* null, i16* null>
+@index = global i16 1
+@ptr = global [4 x i16] [i16 1, i16 2, i16 3, i16 4]
+@ptrptr = global i16* null
+
+; constant index
+define void @test1() {
+bb:
+  %0 = load <4 x i16*>, <4 x i16*>* @vec
+  %1 = getelementptr i16, <4 x i16*> %0, i16 1
+
+  ret void
+}
+
+;CHECK-LABEL: @test1
+;CHECK: %[[I0:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 0
+;CHECK: getelementptr i16, i16* %[[I0]], i16 1
+;CHECK: %[[I1:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 1
+;CHECK: getelementptr i16, i16* %[[I1]], i16 1
+;CHECK: %[[I2:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 2
+;CHECK: getelementptr i16, i16* %[[I2]], i16 1
+;CHECK: %[[I3:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 3
+;CHECK: getelementptr i16, i16* %[[I3]], i16 1
+
+; non-constant index
+define void @test2() {
+bb:
+  %0 = load <4 x i16*>, <4 x i16*>* @vec
+  %index = load i16, i16* @index
+  %1 = getelementptr i16, <4 x i16*> %0, i16 %index
+
+  ret void
+}
+
+;CHECK-LABEL: @test2
+;CHECK: %0 = load <4 x i16*>, <4 x i16*>* @vec
+;CHECK: %[[I0:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 0
+;CHECK: %[[I1:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 1
+;CHECK: %[[I2:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 2
+;CHECK: %[[I3:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 3
+;CHECK: %index = load i16, i16* @index
+;CHECK: %.splatinsert = insertelement <4 x i16> undef, i16 %index, i32 0
+;CHECK: %.splat = shufflevector <4 x i16> %.splatinsert, <4 x i16> undef, <4 x i32> zeroinitializer
+;CHECK: %.splat[[I0]] = extractelement <4 x i16> %.splat, i32 0
+;CHECK: getelementptr i16, i16* %[[I0]], i16 %.splat[[I0]]
+;CHECK: %.splat[[I1]] = extractelement <4 x i16> %.splat, i32 1
+;CHECK: getelementptr i16, i16* %[[I1]], i16 %.splat[[I1]]
+;CHECK: %.splat[[I2]] = extractelement <4 x i16> %.splat, i32 2
+;CHECK: getelementptr i16, i16* %[[I2]], i16 %.splat[[I2]]
+;CHECK: %.splat[[I3]] = extractelement <4 x i16> %.splat, i32 3
+;CHECK: getelementptr i16, i16* %[[I3]], i16 %.splat[[I3]]
+
+
+; Check that the scalarizer can handle vector GEPs with scalar pointer
+
+; constant pointer
+define void @test3() {
+bb:
+  %0 = bitcast [4 x i16]* @ptr to i16*
+  %1 = getelementptr i16, i16* %0, <4 x i16> <i16 0, i16 1, i16 2, i16 3>
+
+  ret void
+}
+
+;CHECK-LABEL: @test3
+;CHECK: %0 = bitcast [4 x i16]* @ptr to i16*
+;CHECK: %.splatinsert = insertelement <4 x i16*> undef, i16* %0, i32 0
+;CHECK: %.splat = shufflevector <4 x i16*> %.splatinsert, <4 x i16*> undef, <4 x i32> zeroinitializer
+;CHECK: %.splat[[I0:.i[0-9]*]] = extractelement <4 x i16*> %.splat, i32 0
+;CHECK: getelementptr i16, i16* %.splat[[I0]], i16 0
+;CHECK: %.splat[[I1:.i[0-9]*]] = extractelement <4 x i16*> %.splat, i32 1
+;CHECK: getelementptr i16, i16* %.splat[[I1]], i16 1
+;CHECK: %.splat[[I2:.i[0-9]*]] = extractelement <4 x i16*> %.splat, i32 2
+;CHECK: getelementptr i16, i16* %.splat[[I2]], i16 2
+;CHECK: %.splat[[I3:.i[0-9]*]] = extractelement <4 x i16*> %.splat, i32 3
+;CHECK: getelementptr i16, i16* %.splat[[I3]], i16 3
+
+; non-constant pointer
+define void @test4() {
+bb:
+  %0 = load i16*, i16** @ptrptr
+  %1 = getelementptr i16, i16* %0, <4 x i16> <i16 0, i16 1, i16 2, i16 3>
+
+  ret void
+}
+
+;CHECK-LABEL: @test4
+;CHECK: %0 = load i16*, i16** @ptrptr
+;CHECK: %.splatinsert = insertelement <4 x i16*> undef, i16* %0, i32 0
+;CHECK: %.splat = shufflevector <4 x i16*> %.splatinsert, <4 x i16*> undef, <4 x i32> zeroinitializer
+;CHECK: %.splat[[I0:.i[0-9]*]] = extractelement <4 x i16*> %.splat, i32 0
+;CHECK: getelementptr i16, i16* %.splat[[I0]], i16 0
+;CHECK: %.splat[[I1:.i[0-9]*]] = extractelement <4 x i16*> %.splat, i32 1
+;CHECK: getelementptr i16, i16* %.splat[[I1]], i16 1
+;CHECK: %.splat[[I2:.i[0-9]*]] = extractelement <4 x i16*> %.splat, i32 2
+;CHECK: getelementptr i16, i16* %.splat[[I2]], i16 2
+;CHECK: %.splat[[I3:.i[0-9]*]] = extractelement <4 x i16*> %.splat, i32 3
+;CHECK: getelementptr i16, i16* %.splat[[I3]], i16 3
+
+; constant index, inbounds
+define void @test5() {
+bb:
+  %0 = load <4 x i16*>, <4 x i16*>* @vec
+  %1 = getelementptr inbounds i16, <4 x i16*> %0, i16 1
+
+  ret void
+}
+
+;CHECK-LABEL: @test5
+;CHECK: %[[I0:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 0
+;CHECK: getelementptr inbounds i16, i16* %[[I0]], i16 1
+;CHECK: %[[I1:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 1
+;CHECK: getelementptr inbounds i16, i16* %[[I1]], i16 1
+;CHECK: %[[I2:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 2
+;CHECK: getelementptr inbounds i16, i16* %[[I2]], i16 1
+;CHECK: %[[I3:.i[0-9]*]] = extractelement <4 x i16*> %0, i32 3
+;CHECK: getelementptr inbounds i16, i16* %[[I3]], i16 1
+
diff --git a/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll b/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
index 5815ae6273731cacb2e0c340cd299e3a4d5f0696..23ec0ca25544dd9676faff3355cbbc1d5d1cd407 100644
--- a/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
+++ b/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/split-gep-and-gvn-addrspace-addressing-modes.ll
@@ -9,7 +9,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
 ; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 1
 ; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 32
 ; IR: getelementptr inbounds float, float addrspace(2)* [[BASE_PTR]], i64 33
-define void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
+define amdgpu_kernel void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
   %tmp = sext i32 %y to i64
   %tmp1 = sext i32 %x to i64
   %tmp2 = getelementptr inbounds [4096 x [32 x float]], [4096 x [32 x float]] addrspace(2)* @array, i64 0, i64 %tmp1, i64 %tmp
@@ -42,7 +42,7 @@ define void @sum_of_array(i32 %x, i32 %y, float addrspace(1)* nocapture %output)
 ; IR: add i32 %x, 256
 ; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
 ; IR: getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %{{[a-zA-Z0-9]+}}, i64 %{{[a-zA-Z0-9]+}}
-define void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
+define amdgpu_kernel void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
   %tmp = sext i32 %y to i64
   %tmp1 = sext i32 %x to i64
   %tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(2)* @array2, i64 0, i64 %tmp1, i64 %tmp
@@ -74,7 +74,7 @@ define void @sum_of_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(
 ; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 255
 ; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 16128
 ; IR: getelementptr inbounds float, float addrspace(3)* [[BASE_PTR]], i32 16383
-define void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
+define amdgpu_kernel void @sum_of_lds_array_over_max_mubuf_offset(i32 %x, i32 %y, float addrspace(1)* nocapture %output) {
   %tmp2 = getelementptr inbounds [4096 x [4 x float]], [4096 x [4 x float]] addrspace(3)* @lds_array, i32 0, i32 %x, i32 %y
   %tmp4 = load float, float addrspace(3)* %tmp2, align 4
   %tmp5 = fadd float %tmp4, 0.000000e+00
diff --git a/test/Transforms/SimplifyCFG/ARM/switch-to-lookup-table.ll b/test/Transforms/SimplifyCFG/ARM/switch-to-lookup-table.ll
index 16f028d2e85ab2c22c5e31aa3f9bae9d3e09095c..90a9aa4d95b7dde979c0be159f0a0a1cbac7d7c2 100644
--- a/test/Transforms/SimplifyCFG/ARM/switch-to-lookup-table.ll
+++ b/test/Transforms/SimplifyCFG/ARM/switch-to-lookup-table.ll
@@ -1,8 +1,8 @@
-; RUN: opt -S -simplifycfg -mtriple=arm -relocation-model=static    < %s | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
-; RUN: opt -S -simplifycfg -mtriple=arm -relocation-model=pic       < %s | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
-; RUN: opt -S -simplifycfg -mtriple=arm -relocation-model=ropi      < %s | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
-; RUN: opt -S -simplifycfg -mtriple=arm -relocation-model=rwpi      < %s | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
-; RUN: opt -S -simplifycfg -mtriple=arm -relocation-model=ropi-rwpi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
+; RUN: opt -S -latesimplifycfg -mtriple=arm -relocation-model=static    < %s | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
+; RUN: opt -S -latesimplifycfg -mtriple=arm -relocation-model=pic       < %s | FileCheck %s --check-prefix=CHECK --check-prefix=ENABLE
+; RUN: opt -S -latesimplifycfg -mtriple=arm -relocation-model=ropi      < %s | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
+; RUN: opt -S -latesimplifycfg -mtriple=arm -relocation-model=rwpi      < %s | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
+; RUN: opt -S -latesimplifycfg -mtriple=arm -relocation-model=ropi-rwpi < %s | FileCheck %s --check-prefix=CHECK --check-prefix=DISABLE
 
 ; CHECK:       @{{.*}} = private unnamed_addr constant [3 x i32] [i32 1234, i32 5678, i32 15532]
 ; ENABLE:      @{{.*}} = private unnamed_addr constant [3 x i32*] [i32* @c1, i32* @c2, i32* @c3]
diff --git a/test/Transforms/SimplifyCFG/CoveredLookupTable.ll b/test/Transforms/SimplifyCFG/CoveredLookupTable.ll
index 8b45a590bb1fb40c60c1071441176033bfa0596c..a42349e3d874219f168af9e55fb5349c23fce504 100644
--- a/test/Transforms/SimplifyCFG/CoveredLookupTable.ll
+++ b/test/Transforms/SimplifyCFG/CoveredLookupTable.ll
@@ -1,4 +1,4 @@
-; RUN: opt -simplifycfg -S %s | FileCheck %s
+; RUN: opt -latesimplifycfg -S %s | FileCheck %s
 ; rdar://15268442
 
 target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
diff --git a/test/Transforms/SimplifyCFG/X86/switch-covered-bug.ll b/test/Transforms/SimplifyCFG/X86/switch-covered-bug.ll
index f3e5506ad93390dbf554b92512b99f2af42768bb..ae6ff6d10bcf07a7c5102f38972a4181d10461e4 100644
--- a/test/Transforms/SimplifyCFG/X86/switch-covered-bug.ll
+++ b/test/Transforms/SimplifyCFG/X86/switch-covered-bug.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -simplifycfg < %s -mtriple=x86_64-apple-darwin12.0.0 | FileCheck %s
+; RUN: opt -S -latesimplifycfg < %s -mtriple=x86_64-apple-darwin12.0.0 | FileCheck %s
 ; rdar://17887153
 target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin12.0.0"
diff --git a/test/Transforms/SimplifyCFG/X86/switch-table-bug.ll b/test/Transforms/SimplifyCFG/X86/switch-table-bug.ll
index 26008700f5be64770d5e2838bbe064dc85065942..734312bc7285e03fe88dfa34795dde1b6228d809 100644
--- a/test/Transforms/SimplifyCFG/X86/switch-table-bug.ll
+++ b/test/Transforms/SimplifyCFG/X86/switch-table-bug.ll
@@ -1,4 +1,4 @@
-; RUN: opt -S -simplifycfg < %s -mtriple=x86_64-apple-darwin12.0.0 | FileCheck %s
+; RUN: opt -S -latesimplifycfg < %s -mtriple=x86_64-apple-darwin12.0.0 | FileCheck %s
 ; rdar://17735071
 target datalayout = "e-p:64:64:64-S128-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f16:16:16-f32:32:32-f64:64:64-f128:128:128-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin12.0.0"
diff --git a/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
index 81c153483c66807956fde1c4eca8e741bde6a92b..4b9227b029ecaf0d681258ddc117506153cfac0e 100644
--- a/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
+++ b/test/Transforms/SimplifyCFG/X86/switch_to_lookup_table.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -simplifycfg -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+; RUN: opt < %s -latesimplifycfg -S -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -1178,8 +1178,9 @@ return:
   ret i32 %retval.0
 ; CHECK-LABEL: @reuse_cmp2(
 ; CHECK: entry:
-; CHECK-NEXT: %switch.tableidx = sub i32 %x, 0
-; CHECK-NEXT: [[C:%.+]] = icmp ult i32 %switch.tableidx, 4
+; CHECK-NEXT: %switch = icmp ult i32 %x, 4
+; CHECK-NEXT: %x. = select i1 %switch, i32 %x, i32 4
+; CHECK-NEXT: [[C:%.+]] = icmp ne i32 %x., 4
 ; CHECK:      [[R:%.+]] = select i1 [[C]], i32 {{.*}}, i32 100
 ; CHECK-NEXT: ret i32 [[R]]
 }
diff --git a/test/Transforms/SimplifyCFG/critedge-assume.ll b/test/Transforms/SimplifyCFG/critedge-assume.ll
new file mode 100644
index 0000000000000000000000000000000000000000..bfeb65769deb5f94c2c24a3a4a1d06b0c15e5915
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/critedge-assume.ll
@@ -0,0 +1,83 @@
+; RUN: opt -o %t %s -instcombine -simplifycfg -thinlto-bc -verify-assumption-cache
+; RUN: llvm-dis -o - %t | FileCheck %s
+
+; Test that the simplifycfg pass correctly updates the assumption cache
+; when it clones the llvm.assume call as part of creating a critical
+; edge. To do that, we set up a pass pipeline such that (1) an assumption
+; cache is created for foo before simplifycfg updates it, and (2) foo's
+; assumption cache is verified after simplifycfg has run. To satisfy 1, we
+; run the instcombine pass first in our pipeline. To satisfy 2, we use the
+; ThinLTOBitcodeWriter pass to write bitcode (that pass uses the assumption
+; cache). That ensures that the pass manager does not call releaseMemory()
+; on the AssumptionCacheTracker before the end of the pipeline, which would
+; wipe out the bad assumption cache before it is verified.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%class.F = type { i8 }
+%class.B = type { i8 }
+%class.A = type { %class.C }
+%class.C = type { i32 (...)** }
+
+define void @foo(%class.F* %this, %class.B* %out) {
+entry:
+  %call = tail call i32 @_ZNK1F5beginEv(%class.F* %this)
+  %call2 = tail call i32 @_ZNK1F3endEv(%class.F* %this)
+  %cmp.i22 = icmp eq i32 %call, %call2
+  br i1 %cmp.i22, label %while.end, label %while.body.preheader
+
+while.body.preheader:
+  br label %while.body
+
+while.body:
+  %frame_node.sroa.0.023 = phi i32 [ %inc.i, %_ZN10unique_ptrD2Ev.exit ], [ %call, %while.body.preheader ]
+  %call8 = tail call i8* @_Znwm(i64 8)
+  %inc.i = add nsw i32 %frame_node.sroa.0.023, 1
+  %cmp = icmp eq i32 %inc.i, %call2
+  br i1 %cmp, label %_ZN10unique_ptrD2Ev.exit, label %if.then
+
+if.then:
+  tail call void @_ZN1B6appendEv(%class.B* %out)
+  br label %_ZN10unique_ptrD2Ev.exit
+
+_ZN10unique_ptrD2Ev.exit:
+  %x1 = bitcast i8* %call8 to void (%class.A*)***
+  %vtable.i.i = load void (%class.A*)**, void (%class.A*)*** %x1, align 8
+  %x2 = bitcast void (%class.A*)** %vtable.i.i to i8*
+  %x3 = tail call i1 @llvm.type.test(i8* %x2, metadata !"foo")
+  ; CHECK: call void @llvm.assume
+  ; CHECK: call void @llvm.assume
+  tail call void @llvm.assume(i1 %x3) #5
+  br i1 %cmp, label %while.end.loopexit, label %while.body
+
+while.end.loopexit:
+  br label %while.end
+
+while.end:
+  ret void
+}
+
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+
+declare i32 @_ZNK1F5beginEv(%class.F*)
+
+declare i32 @_ZNK1F3endEv(%class.F*)
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1)
+
+declare noalias nonnull i8* @_Znwm(i64)
+
+declare void @_ZN1B6appendEv(%class.B*)
+
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
+
+declare i1 @llvm.type.test(i8*, metadata)
+
+declare void @llvm.assume(i1)
+
+!llvm.module.flags = !{!0}
+!llvm.ident = !{!1}
+
+!0 = !{i32 1, !"PIC Level", i32 2}
+!1 = !{!"clang version 5.0.0 "}
diff --git a/test/Transforms/SimplifyCFG/div-rem-pairs.ll b/test/Transforms/SimplifyCFG/div-rem-pairs.ll
new file mode 100644
index 0000000000000000000000000000000000000000..85ffe1f4e0f37e61146b35370ac1bfd85cec64e7
--- /dev/null
+++ b/test/Transforms/SimplifyCFG/div-rem-pairs.ll
@@ -0,0 +1,119 @@
+; RUN: opt -simplifycfg -S < %s | FileCheck %s
+
+; FIXME: Hoist the sdiv because it's safe and free.
+; PR31028 - https://bugs.llvm.org/show_bug.cgi?id=31028
+
+define i32 @hoist_sdiv(i32 %a, i32 %b) {
+; CHECK-LABEL: @hoist_sdiv(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[REM:%.*]] = srem i32 %a, %b
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[REM]], 42
+; CHECK-NEXT:    br i1 [[CMP]], label %if, label %end
+; CHECK:       if:
+; CHECK-NEXT:    [[DIV:%.*]] = sdiv i32 %a, %b
+; CHECK-NEXT:    br label %end
+; CHECK:       end:
+; CHECK-NEXT:    [[RET:%.*]] = phi i32 [ [[DIV]], %if ], [ 3, %entry ]
+; CHECK-NEXT:    ret i32 [[RET]]
+;
+entry:
+  %rem = srem i32 %a, %b
+  %cmp = icmp eq i32 %rem, 42
+  br i1 %cmp, label %if, label %end
+
+if:
+  %div = sdiv i32 %a, %b
+  br label %end
+
+end:
+  %ret = phi i32 [ %div, %if ], [ 3, %entry ]
+  ret i32 %ret
+}
+
+; FIXME: Hoist the udiv because it's safe and free.
+
+define i64 @hoist_udiv(i64 %a, i64 %b) {
+; CHECK-LABEL: @hoist_udiv(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[REM:%.*]] = urem i64 %a, %b
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[REM]], 42
+; CHECK-NEXT:    br i1 [[CMP]], label %if, label %end
+; CHECK:       if:
+; CHECK-NEXT:    [[DIV:%.*]] = udiv i64 %a, %b
+; CHECK-NEXT:    br label %end
+; CHECK:       end:
+; CHECK-NEXT:    [[RET:%.*]] = phi i64 [ [[DIV]], %if ], [ 3, %entry ]
+; CHECK-NEXT:    ret i64 [[RET]]
+;
+entry:
+  %rem = urem i64 %a, %b
+  %cmp = icmp eq i64 %rem, 42
+  br i1 %cmp, label %if, label %end
+
+if:
+  %div = udiv i64 %a, %b
+  br label %end
+
+end:
+  %ret = phi i64 [ %div, %if ], [ 3, %entry ]
+  ret i64 %ret
+}
+
+; FIXME: Hoist the srem because it's safe and likely free.
+
+define i16 @hoist_srem(i16 %a, i16 %b) {
+; CHECK-LABEL: @hoist_srem(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DIV:%.*]] = sdiv i16 %a, %b
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i16 [[DIV]], 42
+; CHECK-NEXT:    br i1 [[CMP]], label %if, label %end
+; CHECK:       if:
+; CHECK-NEXT:    [[REM:%.*]] = srem i16 %a, %b
+; CHECK-NEXT:    br label %end
+; CHECK:       end:
+; CHECK-NEXT:    [[RET:%.*]] = phi i16 [ [[REM]], %if ], [ 3, %entry ]
+; CHECK-NEXT:    ret i16 [[RET]]
+;
+entry:
+  %div = sdiv i16 %a, %b
+  %cmp = icmp eq i16 %div, 42
+  br i1 %cmp, label %if, label %end
+
+if:
+  %rem = srem i16 %a, %b
+  br label %end
+
+end:
+  %ret = phi i16 [ %rem, %if ], [ 3, %entry ]
+  ret i16 %ret
+}
+
+; FIXME: Hoist the urem because it's safe and likely free.
+
+define i8 @hoist_urem(i8 %a, i8 %b) {
+; CHECK-LABEL: @hoist_urem(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[DIV:%.*]] = udiv i8 %a, %b
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i8 [[DIV]], 42
+; CHECK-NEXT:    br i1 [[CMP]], label %if, label %end
+; CHECK:       if:
+; CHECK-NEXT:    [[REM:%.*]] = urem i8 %a, %b
+; CHECK-NEXT:    br label %end
+; CHECK:       end:
+; CHECK-NEXT:    [[RET:%.*]] = phi i8 [ [[REM]], %if ], [ 3, %entry ]
+; CHECK-NEXT:    ret i8 [[RET]]
+;
+entry:
+  %div = udiv i8 %a, %b
+  %cmp = icmp eq i8 %div, 42
+  br i1 %cmp, label %if, label %end
+
+if:
+  %rem = urem i8 %a, %b
+  br label %end
+
+end:
+  %ret = phi i8 [ %rem, %if ], [ 3, %entry ]
+  ret i8 %ret
+}
+
diff --git a/test/Transforms/SimplifyCFG/empty-cleanuppad.ll b/test/Transforms/SimplifyCFG/empty-cleanuppad.ll
index 9f657a81a05b9863aa3cdbab457e6068d4d56aae..f2e0114a2a35ef645bc9830bf661372a87ebbadb 100644
--- a/test/Transforms/SimplifyCFG/empty-cleanuppad.ll
+++ b/test/Transforms/SimplifyCFG/empty-cleanuppad.ll
@@ -413,14 +413,14 @@ return:                                           ; preds = %invoke.cont, %catch
 define i32 @f9() personality i32 (...)* @__CxxFrameHandler3 {
 entry:
   %s = alloca i8, align 1
-  call void @llvm.lifetime.start(i64 1, i8* nonnull %s)
+  call void @llvm.lifetime.start.p0i8(i64 1, i8* nonnull %s)
   %bc = bitcast i8* %s to %struct.S2*
   invoke void @"\01??1S2@@QEAA@XZ"(%struct.S2* %bc)
           to label %try.cont unwind label %ehcleanup
 
 ehcleanup:
   %cleanup.pad = cleanuppad within none []
-  call void @llvm.lifetime.end(i64 1, i8* nonnull %s)
+  call void @llvm.lifetime.end.p0i8(i64 1, i8* nonnull %s)
   cleanupret from %cleanup.pad unwind label %catch.dispatch
 
 catch.dispatch:
@@ -466,5 +466,5 @@ declare void @use_x(i32 %x)
 
 declare i32 @__CxxFrameHandler3(...)
 
-declare void @llvm.lifetime.start(i64, i8* nocapture)
-declare void @llvm.lifetime.end(i64, i8* nocapture)
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture)
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture)
diff --git a/test/Transforms/SimplifyCFG/lifetime.ll b/test/Transforms/SimplifyCFG/lifetime.ll
index 7c66be5295007daa64cae3715a31fe3514d3b8fd..270fe4d544228f93af41498f5c3d5d3edfbf79d5 100644
--- a/test/Transforms/SimplifyCFG/lifetime.ll
+++ b/test/Transforms/SimplifyCFG/lifetime.ll
@@ -10,11 +10,11 @@
 define void @foo(i1 %x) {
 entry:
   %a = alloca i8
-  call void @llvm.lifetime.start(i64 -1, i8* %a) nounwind
+  call void @llvm.lifetime.start.p0i8(i64 -1, i8* %a) nounwind
   br i1 %x, label %bb0, label %bb1
 
 bb0:
-  call void @llvm.lifetime.end(i64 -1, i8* %a) nounwind
+  call void @llvm.lifetime.end.p0i8(i64 -1, i8* %a) nounwind
   br label %bb1
 
 bb1:
@@ -24,6 +24,6 @@ bb1:
 
 declare void @f()
 
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.start.p0i8(i64, i8* nocapture) nounwind
 
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+declare void @llvm.lifetime.end.p0i8(i64, i8* nocapture) nounwind
diff --git a/test/Transforms/SimplifyCFG/merge-cond-stores.ll b/test/Transforms/SimplifyCFG/merge-cond-stores.ll
index 77e3158d9bbde0731a0ee738848d093490a0f3c0..d5d0224a4b24cde06a316419a0d6d960063df2ce 100644
--- a/test/Transforms/SimplifyCFG/merge-cond-stores.ll
+++ b/test/Transforms/SimplifyCFG/merge-cond-stores.ll
@@ -1,16 +1,21 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -simplifycfg -instcombine < %s -simplifycfg-merge-cond-stores=true -simplifycfg-merge-cond-stores-aggressively=false -phi-node-folding-threshold=2 -S | FileCheck %s
 
-; CHECK-LABEL: @test_simple
 ; This test should succeed and end up if-converted.
-; CHECK: icmp eq i32 %b, 0
-; CHECK-NEXT: icmp ne i32 %a, 0
-; CHECK-NEXT: xor i1 %x2, true
-; CHECK-NEXT: %[[x:.*]] = or i1 %{{.*}}, %{{.*}}
-; CHECK-NEXT: br i1 %[[x]]
-; CHECK: store
-; CHECK-NOT: store
-; CHECK: ret
 define void @test_simple(i32* %p, i32 %a, i32 %b) {
+; CHECK-LABEL: @test_simple(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ne i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[X2:%.*]] = icmp eq i32 [[B:%.*]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i1 [[X2]], true
+; CHECK-NEXT:    [[TMP2:%.*]] = or i1 [[TMP0]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[TMP2]], label [[TMP3:%.*]], label [[TMP4:%.*]]
+; CHECK:         [[NOT_X2:%.*]] = xor i1 [[X2]], true
+; CHECK-NEXT:    [[DOT:%.*]] = zext i1 [[NOT_X2]] to i32
+; CHECK-NEXT:    store i32 [[DOT]], i32* [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[TMP4]]
+; CHECK:         ret void
+;
 entry:
   %x1 = icmp eq i32 %a, 0
   br i1 %x1, label %fallthrough, label %yes1
@@ -31,12 +36,26 @@ end:
   ret void
 }
 
-; CHECK-LABEL: @test_recursive
 ; This test should entirely fold away, leaving one large basic block.
-; CHECK: store
-; CHECK-NOT: store
-; CHECK: ret
 define void @test_recursive(i32* %p, i32 %a, i32 %b, i32 %c, i32 %d) {
+; CHECK-LABEL: @test_recursive(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = or i32 [[B:%.*]], [[A:%.*]]
+; CHECK-NEXT:    [[X4:%.*]] = icmp eq i32 [[D:%.*]], 0
+; CHECK-NEXT:    [[TMP1:%.*]] = or i32 [[TMP0]], [[C:%.*]]
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp ne i32 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP2:%.*]] = xor i1 [[X4]], true
+; CHECK-NEXT:    [[TMP4:%.*]] = or i1 [[TMP3]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP6:%.*]]
+; CHECK:         [[X3:%.*]] = icmp eq i32 [[C]], 0
+; CHECK-NEXT:    [[NOT_X2:%.*]] = icmp ne i32 [[B]], 0
+; CHECK-NEXT:    [[DOT:%.*]] = zext i1 [[NOT_X2]] to i32
+; CHECK-NEXT:    [[DOT_:%.*]] = select i1 [[X3]], i32 [[DOT]], i32 2
+; CHECK-NEXT:    [[DOT__:%.*]] = select i1 [[X4]], i32 [[DOT_]], i32 3
+; CHECK-NEXT:    store i32 [[DOT__]], i32* [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[TMP6]]
+; CHECK:         ret void
+;
 entry:
   %x1 = icmp eq i32 %a, 0
   br i1 %x1, label %fallthrough, label %yes1
@@ -74,13 +93,31 @@ end:
   ret void
 }
 
-; CHECK-LABEL: @test_not_ifconverted
 ; The code in each diamond is too large - it won't be if-converted so our
 ; heuristics should say no.
-; CHECK: store
-; CHECK: store
-; CHECK: ret
 define void @test_not_ifconverted(i32* %p, i32 %a, i32 %b) {
+; CHECK-LABEL: @test_not_ifconverted(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X1:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[X1]], label [[FALLTHROUGH:%.*]], label [[YES1:%.*]]
+; CHECK:       yes1:
+; CHECK-NEXT:    [[Y1:%.*]] = or i32 [[B:%.*]], 55
+; CHECK-NEXT:    [[Y2:%.*]] = add i32 [[Y1]], 24
+; CHECK-NEXT:    [[Y3:%.*]] = and i32 [[Y2]], 67
+; CHECK-NEXT:    store i32 [[Y3]], i32* [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[FALLTHROUGH]]
+; CHECK:       fallthrough:
+; CHECK-NEXT:    [[X2:%.*]] = icmp eq i32 [[B]], 0
+; CHECK-NEXT:    br i1 [[X2]], label [[END:%.*]], label [[YES2:%.*]]
+; CHECK:       yes2:
+; CHECK-NEXT:    [[Z1:%.*]] = or i32 [[A]], 55
+; CHECK-NEXT:    [[Z2:%.*]] = add i32 [[Z1]], 24
+; CHECK-NEXT:    [[Z3:%.*]] = and i32 [[Z2]], 67
+; CHECK-NEXT:    store i32 [[Z3]], i32* [[P]], align 4
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
 entry:
   %x1 = icmp eq i32 %a, 0
   br i1 %x1, label %fallthrough, label %yes1
@@ -107,13 +144,26 @@ end:
   ret void
 }
 
-; CHECK-LABEL: @test_aliasing1
 ; The store to %p clobbers the previous store, so if-converting this would
 ; be illegal.
-; CHECK: store
-; CHECK: store
-; CHECK: ret
 define void @test_aliasing1(i32* %p, i32 %a, i32 %b) {
+; CHECK-LABEL: @test_aliasing1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X1:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[X1]], label [[FALLTHROUGH:%.*]], label [[YES1:%.*]]
+; CHECK:       yes1:
+; CHECK-NEXT:    store i32 0, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[FALLTHROUGH]]
+; CHECK:       fallthrough:
+; CHECK-NEXT:    [[Y1:%.*]] = load i32, i32* [[P]], align 4
+; CHECK-NEXT:    [[X2:%.*]] = icmp eq i32 [[Y1]], 0
+; CHECK-NEXT:    br i1 [[X2]], label [[END:%.*]], label [[YES2:%.*]]
+; CHECK:       yes2:
+; CHECK-NEXT:    store i32 1, i32* [[P]], align 4
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
 entry:
   %x1 = icmp eq i32 %a, 0
   br i1 %x1, label %fallthrough, label %yes1
@@ -135,12 +185,25 @@ end:
   ret void
 }
 
-; CHECK-LABEL: @test_aliasing2
 ; The load from %q aliases with %p, so if-converting this would be illegal.
-; CHECK: store
-; CHECK: store
-; CHECK: ret
 define void @test_aliasing2(i32* %p, i32* %q, i32 %a, i32 %b) {
+; CHECK-LABEL: @test_aliasing2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X1:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[X1]], label [[FALLTHROUGH:%.*]], label [[YES1:%.*]]
+; CHECK:       yes1:
+; CHECK-NEXT:    store i32 0, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[FALLTHROUGH]]
+; CHECK:       fallthrough:
+; CHECK-NEXT:    [[Y1:%.*]] = load i32, i32* [[Q:%.*]], align 4
+; CHECK-NEXT:    [[X2:%.*]] = icmp eq i32 [[Y1]], 0
+; CHECK-NEXT:    br i1 [[X2]], label [[END:%.*]], label [[YES2:%.*]]
+; CHECK:       yes2:
+; CHECK-NEXT:    store i32 1, i32* [[P]], align 4
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
 entry:
   %x1 = icmp eq i32 %a, 0
   br i1 %x1, label %fallthrough, label %yes1
@@ -164,12 +227,24 @@ end:
 
 declare void @f()
 
-; CHECK-LABEL: @test_diamond_simple
 ; This should get if-converted.
-; CHECK: store
-; CHECK-NOT: store
-; CHECK: ret
 define i32 @test_diamond_simple(i32* %p, i32* %q, i32 %a, i32 %b) {
+; CHECK-LABEL: @test_diamond_simple(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X1:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    [[Z1:%.*]] = add i32 [[A]], [[B:%.*]]
+; CHECK-NEXT:    [[Z2:%.*]] = select i1 [[X1]], i32 [[Z1]], i32 0
+; CHECK-NEXT:    [[X2:%.*]] = icmp eq i32 [[B]], 0
+; CHECK-NEXT:    [[Z3:%.*]] = sub i32 [[Z2]], [[B]]
+; CHECK-NEXT:    [[Z4:%.*]] = select i1 [[X2]], i32 [[Z3]], i32 3
+; CHECK-NEXT:    [[TMP0:%.*]] = or i32 [[A]], [[B]]
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label [[TMP3:%.*]], label [[TMP2:%.*]]
+; CHECK:         [[SIMPLIFYCFG_MERGE:%.*]] = select i1 [[X2]], i32 [[Z2]], i32 1
+; CHECK-NEXT:    store i32 [[SIMPLIFYCFG_MERGE]], i32* [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[TMP3]]
+; CHECK:         ret i32 [[Z4]]
+;
 entry:
   %x1 = icmp eq i32 %a, 0
   br i1 %x1, label %no1, label %yes1
@@ -200,14 +275,36 @@ end:
   ret i32 %z4
 }
 
-; CHECK-LABEL: @test_diamond_alias3
 ; Now there is a call to f() in the bottom branch. The store in the first
 ; branch would now be reordered with respect to the call if we if-converted,
 ; so we must not.
-; CHECK: store
-; CHECK: store
-; CHECK: ret
 define i32 @test_diamond_alias3(i32* %p, i32* %q, i32 %a, i32 %b) {
+; CHECK-LABEL: @test_diamond_alias3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[X1:%.*]] = icmp eq i32 [[A:%.*]], 0
+; CHECK-NEXT:    br i1 [[X1]], label [[NO1:%.*]], label [[YES1:%.*]]
+; CHECK:       yes1:
+; CHECK-NEXT:    store i32 0, i32* [[P:%.*]], align 4
+; CHECK-NEXT:    br label [[FALLTHROUGH:%.*]]
+; CHECK:       no1:
+; CHECK-NEXT:    call void @f()
+; CHECK-NEXT:    [[Z1:%.*]] = add i32 [[A]], [[B:%.*]]
+; CHECK-NEXT:    br label [[FALLTHROUGH]]
+; CHECK:       fallthrough:
+; CHECK-NEXT:    [[Z2:%.*]] = phi i32 [ [[Z1]], [[NO1]] ], [ 0, [[YES1]] ]
+; CHECK-NEXT:    [[X2:%.*]] = icmp eq i32 [[B]], 0
+; CHECK-NEXT:    br i1 [[X2]], label [[NO2:%.*]], label [[YES2:%.*]]
+; CHECK:       yes2:
+; CHECK-NEXT:    store i32 1, i32* [[P]], align 4
+; CHECK-NEXT:    br label [[END:%.*]]
+; CHECK:       no2:
+; CHECK-NEXT:    call void @f()
+; CHECK-NEXT:    [[Z3:%.*]] = sub i32 [[Z2]], [[B]]
+; CHECK-NEXT:    br label [[END]]
+; CHECK:       end:
+; CHECK-NEXT:    [[Z4:%.*]] = phi i32 [ [[Z3]], [[NO2]] ], [ 3, [[YES2]] ]
+; CHECK-NEXT:    ret i32 [[Z4]]
+;
 entry:
   %x1 = icmp eq i32 %a, 0
   br i1 %x1, label %no1, label %yes1
diff --git a/test/Transforms/SimplifyCFG/rangereduce.ll b/test/Transforms/SimplifyCFG/rangereduce.ll
index 36e932b37be54c1e1d8ca83caedcabd530b5b167..13bbdfe83d079ae0ea1e6a617f8f7f704ec2c27d 100644
--- a/test/Transforms/SimplifyCFG/rangereduce.ll
+++ b/test/Transforms/SimplifyCFG/rangereduce.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -simplifycfg -S | FileCheck %s
+; RUN: opt < %s -latesimplifycfg -S | FileCheck %s
 
 target datalayout = "e-n32"
 
diff --git a/test/Transforms/SimplifyCFG/switch_create.ll b/test/Transforms/SimplifyCFG/switch_create.ll
index 29d3a34a05e6272eb0863cf53330b118de1abab8..c752636ae83da6dd3860b253eabf27b066ab93bd 100644
--- a/test/Transforms/SimplifyCFG/switch_create.ll
+++ b/test/Transforms/SimplifyCFG/switch_create.ll
@@ -1,5 +1,5 @@
 ; RUN: opt -S -simplifycfg < %s | FileCheck %s
-; RUN: opt -S -default-data-layout="p:32:32-p1:16:16" -simplifycfg < %s | FileCheck -check-prefix=CHECK -check-prefix=DL %s
+; RUN: opt -S -data-layout="p:32:32-p1:16:16" -simplifycfg < %s | FileCheck -check-prefix=CHECK -check-prefix=DL %s
 
 declare void @foo1()
 
diff --git a/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll b/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
index f2853aca698f5212f3d5eef614f5070458a76227..9554ae690316cff4906c0cba3fa41d5233f8344d 100644
--- a/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
+++ b/test/Transforms/StraightLineStrengthReduce/AMDGPU/reassociate-geps-and-slsr-addrspace.ll
@@ -6,7 +6,7 @@ target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-p24:
 ; CHECK-LABEL: @slsr_after_reassociate_global_geps_mubuf_max_offset(
 ; CHECK: [[b1:%[0-9]+]] = getelementptr float, float addrspace(1)* %arr, i64 [[bump:%[0-9]+]]
 ; CHECK: [[b2:%[0-9]+]] = getelementptr float, float addrspace(1)* [[b1]], i64 [[bump]]
-define void @slsr_after_reassociate_global_geps_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_global_geps_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
 bb:
   %i2 = shl nsw i32 %i, 1
   %j1 = add nsw i32 %i, 1023
@@ -33,7 +33,7 @@ bb:
 ; CHECK: %tmp = sext i32 %j1 to i64
 ; CHECK: getelementptr inbounds float, float addrspace(1)* %arr, i64 %tmp
 ; CHECK: getelementptr inbounds float, float addrspace(1)* %arr, i64 %tmp5
-define void @slsr_after_reassociate_global_geps_over_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_global_geps_over_mubuf_max_offset(float addrspace(1)* %out, float addrspace(1)* noalias %arr, i32 %i) {
 bb:
   %i2 = shl nsw i32 %i, 1
   %j1 = add nsw i32 %i, 1024
@@ -61,7 +61,7 @@ bb:
 
 ; CHECK: [[B2:%[0-9]+]] = getelementptr float, float addrspace(3)* [[B1]], i32 %i
 ; CHECK: getelementptr inbounds float, float addrspace(3)* [[B2]], i32 16383
-define void @slsr_after_reassociate_lds_geps_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_lds_geps_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
 bb:
   %i2 = shl nsw i32 %i, 1
   %j1 = add nsw i32 %i, 16383
@@ -86,7 +86,7 @@ bb:
 ; CHECK: getelementptr inbounds float, float addrspace(3)* %arr, i32 %j1
 ; CHECK: %j2 = add i32 %j1, %i
 ; CHECK: getelementptr inbounds float, float addrspace(3)* %arr, i32 %j2
-define void @slsr_after_reassociate_lds_geps_over_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
+define amdgpu_kernel void @slsr_after_reassociate_lds_geps_over_ds_max_offset(float addrspace(1)* %out, float addrspace(3)* noalias %arr, i32 %i) {
 bb:
   %i2 = shl nsw i32 %i, 1
   %j1 = add nsw i32 %i, 16384
diff --git a/test/Transforms/StripSymbols/strip-dead-debug-info.ll b/test/Transforms/StripSymbols/strip-dead-debug-info.ll
index 0e252d70465e5ff2eb56ea5422a5f3845505d820..d18c07d54a90fab78a8175e540261af5389d9432 100644
--- a/test/Transforms/StripSymbols/strip-dead-debug-info.ll
+++ b/test/Transforms/StripSymbols/strip-dead-debug-info.ll
@@ -3,6 +3,9 @@
 ; CHECK: ModuleID = '{{.*}}'
 ; CHECK-NOT: "bar"
 ; CHECK-NOT: "abcd"
+; CHECK-NOT: "GCC"
+; CHECK: "Globals"
+; CHECK: "abcd2"
 
 source_filename = "test/Transforms/StripSymbols/strip-dead-debug-info.ll"
 
@@ -21,7 +24,7 @@ entry:
 define i32 @foo(i32 %i) #2 !dbg !15 {
 entry:
   tail call void @llvm.dbg.value(metadata i32 %i, i64 0, metadata !18, metadata !19), !dbg !20
-  %.0 = load i32, i32* @xyz, align 4
+  %.0 = load i32, i32* @xyz, align 4, !dbg !30
   ret i32 %.0, !dbg !21
 }
 
@@ -29,7 +32,7 @@ attributes #0 = { nounwind readnone }
 attributes #1 = { nounwind readnone ssp }
 attributes #2 = { nounwind readonly ssp }
 
-!llvm.dbg.cu = !{!4}
+!llvm.dbg.cu = !{!4, !23, !24, !28}
 !llvm.module.flags = !{!9}
 
 !0 = !DIGlobalVariableExpression(var: !1)
@@ -55,4 +58,11 @@ attributes #2 = { nounwind readonly ssp }
 !20 = !DILocation(line: 7, scope: !15)
 !21 = !DILocation(line: 10, scope: !22)
 !22 = distinct !DILexicalBlock(scope: !15, file: !2, line: 7)
-
+!23 = distinct !DICompileUnit(language: DW_LANG_C89, file: !2, producer: "GCC", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !5, retainedTypes: !5, globals: !5)
+!24 = distinct !DICompileUnit(language: DW_LANG_C89, file: !2, producer: "Globals", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !5, retainedTypes: !5, globals: !25)
+!25 = !{!26}
+!26 = !DIGlobalVariableExpression(var: !27, expr: !DIExpression(DW_OP_constu, 0, DW_OP_stack_value))
+!27 = !DIGlobalVariable(name: "abcd2", scope: !2, file: !2, line: 2, type: !3, isLocal: true, isDefinition: true)
+!28 = distinct !DICompileUnit(language: DW_LANG_C89, file: !2, producer: "InlineTest", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !5, retainedTypes: !5, globals: !5)
+!29 = distinct !DISubprogram(name: "inlinefunc", linkageName: "inlinefunc", scope: null, file: !2, line: 7, type: !16, isLocal: false, isDefinition: true, isOptimized: true, unit: !28)
+!30 = !DILocation(line: 100, scope: !29, inlinedAt: !21)
diff --git a/test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop.ll b/test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop.ll
index a635be10d465f30787771fe738843c8ab5173d96..9d3a84396cfc742386993a2c214634384e35648e 100644
--- a/test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop.ll
+++ b/test/Transforms/StructurizeCFG/rebuild-ssa-infinite-loop.ll
@@ -6,46 +6,51 @@
 
 target triple = "amdgcn--"
 
-declare <4 x float> @llvm.SI.vs.load.input(<16 x i8>, i32, i32) #0
-declare <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #1
-declare void @llvm.SI.export(i32, i32, i32, i32, i32, float, float, float, float) #2
-
-define amdgpu_vs void @wrapper(i32 inreg, i32) {
+define amdgpu_vs void @wrapper(i32 inreg %arg, i32 %arg1) {
 main_body:
-  %2 = add i32 %1, %0
-  %3 = call <4 x float> @llvm.SI.vs.load.input(<16 x i8> undef, i32 0, i32 %2)
-  %4 = extractelement <4 x float> %3, i32 1
-  %5 = fptosi float %4 to i32
-  %6 = insertelement <2 x i32> undef, i32 %5, i32 1
+  %tmp = add i32 %arg1, %arg
+  %tmp2 = call <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32> undef, i32 %tmp, i32 0, i1 false, i1 false)
+  %tmp3 = extractelement <4 x float> %tmp2, i32 1
+  %tmp4 = fptosi float %tmp3 to i32
+  %tmp5 = insertelement <2 x i32> undef, i32 %tmp4, i32 1
   br label %loop11.i
 
 loop11.i:                                         ; preds = %endif46.i, %main_body
-  %7 = phi i32 [ 0, %main_body ], [ %15, %endif46.i ]
-  %8 = icmp sgt i32 %7, 999
-  br i1 %8, label %main.exit, label %if16.i
+  %tmp6 = phi i32 [ 0, %main_body ], [ %tmp14, %endif46.i ]
+  %tmp7 = icmp sgt i32 %tmp6, 999
+  br i1 %tmp7, label %main.exit, label %if16.i
 
 if16.i:                                           ; preds = %loop11.i
-  %9 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %6, <8 x i32> undef, i32 15, i1 true, i1 false, i1 false, i1 false)
-  %10 = extractelement <4 x float> %9, i32 0
-  %11 = fcmp ult float 0.000000e+00, %10
-  br i1 %11, label %if28.i, label %endif46.i
+  %tmp8 = call <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32> %tmp5, <8 x i32> undef, i32 15, i1 true, i1 false, i1 false, i1 false)
+  %tmp9 = extractelement <4 x float> %tmp8, i32 0
+  %tmp10 = fcmp ult float 0.000000e+00, %tmp9
+  br i1 %tmp10, label %if28.i, label %endif46.i
 
 if28.i:                                           ; preds = %if16.i
-  %12 = bitcast float %10 to i32
-  %13 = shl i32 %12, 16
-  %14 = bitcast i32 %13 to float
+  %tmp11 = bitcast float %tmp9 to i32
+  %tmp12 = shl i32 %tmp11, 16
+  %tmp13 = bitcast i32 %tmp12 to float
   br label %main.exit
 
 endif46.i:                                        ; preds = %if16.i
-  %15 = add i32 %7, 1
+  %tmp14 = add i32 %tmp6, 1
   br label %loop11.i
 
 main.exit:                                        ; preds = %if28.i, %loop11.i
-  %16 = phi float [ %14, %if28.i ], [ 0x36F0800000000000, %loop11.i ]
-  call void @llvm.SI.export(i32 15, i32 0, i32 0, i32 32, i32 0, float %16, float 0.000000e+00, float 0.000000e+00, float 0x36A0000000000000)
+  %tmp15 = phi float [ %tmp13, %if28.i ], [ 0x36F0800000000000, %loop11.i ]
+  call void @llvm.amdgcn.exp.f32(i32 32, i32 15, float %tmp15, float 0.000000e+00, float 0.000000e+00, float 0x36A0000000000000, i1 false, i1 false) #0
   ret void
 }
 
-attributes #0 = { nounwind readnone }
-attributes #1 = { nounwind readonly }
-attributes #2 = { nounwind }
+; Function Attrs: nounwind
+declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
+
+; Function Attrs: nounwind readonly
+declare <4 x float> @llvm.amdgcn.buffer.load.format.v4f32(<4 x i32>, i32, i32, i1, i1) #2
+
+; Function Attrs: nounwind readonly
+declare <4 x float> @llvm.amdgcn.image.load.v4f32.v2i32.v8i32(<2 x i32>, <8 x i32>, i32, i1, i1, i1, i1) #2
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind readnone }
+attributes #2 = { nounwind readonly }
diff --git a/test/Transforms/ThinLTOBitcodeWriter/circular-reference.ll b/test/Transforms/ThinLTOBitcodeWriter/circular-reference.ll
new file mode 100644
index 0000000000000000000000000000000000000000..eeda79324497f08de8ad223144e62f592122373e
--- /dev/null
+++ b/test/Transforms/ThinLTOBitcodeWriter/circular-reference.ll
@@ -0,0 +1,9 @@
+; RUN: opt -thinlto-bc -o %t %s
+; RUN: llvm-modextract -b -n 0 -o - %t | llvm-dis | FileCheck --check-prefix=M0 %s
+; RUN: llvm-modextract -b -n 1 -o - %t | llvm-dis | FileCheck --check-prefix=M1 %s
+
+; M0: @g = external constant
+; M1: @g = constant
+@g = constant i8* bitcast (i8** @g to i8*), !type !0
+
+!0 = !{i32 0, !"typeid"}
diff --git a/test/Transforms/ThinLTOBitcodeWriter/comdat.ll b/test/Transforms/ThinLTOBitcodeWriter/comdat.ll
new file mode 100644
index 0000000000000000000000000000000000000000..caea48e0a5439a8724049f103e931516609810f2
--- /dev/null
+++ b/test/Transforms/ThinLTOBitcodeWriter/comdat.ll
@@ -0,0 +1,80 @@
+; RUN: opt -thinlto-bc -o %t %s
+; RUN: llvm-modextract -n 0 -o - %t | llvm-dis | FileCheck --check-prefix=THIN %s
+; RUN: llvm-modextract -n 1 -o - %t | llvm-dis | FileCheck --check-prefix=MERGED %s
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.0.24215"
+
+; Internal comdat leader with type metadata. All comdat members need to live
+; in the merged module, and the comdat needs to be renamed.
+; MERGED: ${{"?lwt[^ ]+}} = comdat any
+$lwt = comdat any
+
+; External comdat leader, type metadata on non-leader. All comdat
+; members need to live in the merged module, internal members need to
+; be renamed.
+; MERGED: $nlwt = comdat any
+$nlwt = comdat any
+
+; Comdat with two members without type metadata. All comdat members live in
+; the ThinLTO module and no renaming needs to take place.
+; THIN: $nt = comdat any
+$nt = comdat any
+
+; MERGED: @lwt_aliasee = private unnamed_addr global
+; MERGED-SAME: comdat(${{"?lwt[^ ]+}})
+@lwt_aliasee = private unnamed_addr global [1 x i8*] [i8* null], comdat($lwt), !type !0
+
+; MERGED: {{@"?lwt_nl[^ ]+}} = hidden unnamed_addr global
+; MERGED-SAME: comdat(${{"?lwt[^ ]+}})
+; THIN: {{@"?lwt_nl[^ ]+}} = external hidden
+@lwt_nl = internal unnamed_addr global i32 0, comdat($lwt)
+
+; MERGED: @nlwt_aliasee = private unnamed_addr global
+; MERGED-SAME: comdat($nlwt)
+@nlwt_aliasee = private unnamed_addr global [1 x i8*] [i8* null], comdat($nlwt), !type !0
+
+; MERGED: @nlwt = unnamed_addr global
+; MERGED-SAME: comdat
+; THIN: @nlwt = external
+@nlwt = unnamed_addr global i32 0, comdat
+
+; THIN: @nt = internal
+; THIN-SAME: comdat
+@nt = internal unnamed_addr global [1 x i8*] [i8* null], comdat
+
+; THIN: @nt_nl = internal
+; THIN-SAME: comdat($nt)
+@nt_nl = internal unnamed_addr global i32 0, comdat($nt)
+
+; MERGED: {{@"?lwt[^ ]+}} = hidden unnamed_addr alias
+; THIN: {{@"?lwt[^ ]+}} = external hidden
+@lwt = internal unnamed_addr alias [1 x i8*], [1 x i8*]* @lwt_aliasee
+
+; MERGED: {{@"?nlwt_nl[^ ]+}} = hidden unnamed_addr alias
+; THIN: {{@"?nlwt_nl[^ ]+}} = external hidden
+@nlwt_nl = internal unnamed_addr alias [1 x i8*], [1 x i8*]* @nlwt_aliasee
+
+; The functions below exist just to make sure the globals are used.
+define i8* @lwt_fun() {
+  %1 = load i32, i32* @lwt_nl
+  %2 = getelementptr inbounds [1 x i8*], [1 x i8*]* @lwt, i32 0, i32 %1
+  %3 = load i8*, i8** %2
+  ret i8* %3
+}
+
+define i8* @nlwt_fun() {
+  %1 = load i32, i32* @nlwt
+  %2 = getelementptr inbounds [1 x i8*], [1 x i8*]* @nlwt_nl, i32 0, i32 %1
+  %3 = load i8*, i8** %2
+  ret i8* %3
+}
+
+define i8* @nt_fun() {
+  %1 = load i32, i32* @nt_nl
+  %2 = getelementptr inbounds [1 x i8*], [1 x i8*]* @nt, i32 0, i32 %1
+  %3 = load i8*, i8** %2
+  ret i8* %3
+}
+
+!0 = !{i64 8, !"?AVA@@"}
diff --git a/test/Transforms/ThinLTOBitcodeWriter/filter-alias.ll b/test/Transforms/ThinLTOBitcodeWriter/filter-alias.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d555ab0c1f6de322321c83f8983d3a4df2f955b9
--- /dev/null
+++ b/test/Transforms/ThinLTOBitcodeWriter/filter-alias.ll
@@ -0,0 +1,16 @@
+; RUN: opt -thinlto-bc -o %t %s
+; RUN: llvm-modextract -n 0 -o - %t | llvm-dis | FileCheck --check-prefix=CHECK0 %s
+; RUN: llvm-modextract -n 1 -o - %t | llvm-dis | FileCheck --check-prefix=CHECK1 %s
+; CHECK0: @al = external global i8*
+; CHECK1: @al = unnamed_addr alias i8*,
+
+target datalayout = "e-m:w-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-pc-windows-msvc19.0.24215"
+
+$al = comdat any
+
+@anon = private unnamed_addr constant { [1 x i8*] } { [1 x i8*] [i8* null] }, comdat($al), !type !0
+
+@al = external unnamed_addr alias i8*, getelementptr inbounds ({ [1 x i8*] }, { [1 x i8*] }* @anon, i32 0, i32 0, i32 1)
+
+!0 = !{i64 8, !"?AVA@@"}
diff --git a/test/Transforms/ThinLTOBitcodeWriter/no-type-md.ll b/test/Transforms/ThinLTOBitcodeWriter/no-type-md.ll
index f1ada67abe504265a4745e974eabee3e17bc7c13..753e07a326b7d62a79b0165a7c7bb60df3b03dcd 100644
--- a/test/Transforms/ThinLTOBitcodeWriter/no-type-md.ll
+++ b/test/Transforms/ThinLTOBitcodeWriter/no-type-md.ll
@@ -1,6 +1,30 @@
-; RUN: opt -thinlto-bc -o %t %s
-; RUN: llvm-dis -o - %t | FileCheck %s
-; RUN: llvm-bcanalyzer -dump %t | FileCheck --check-prefix=BCA %s
+; Generate bitcode files with summary, as well as minimized bitcode without
+; the debug metadata for the thin link.
+; RUN: opt -thinlto-bc -thin-link-bitcode-file=%t.thinlink.bc -o %t.bc %s
+; RUN: llvm-dis -o - %t.bc | FileCheck %s
+; RUN: llvm-dis -o - %t.thinlink.bc | FileCheck --check-prefix=NODEBUG %s
+; RUN: llvm-bcanalyzer -dump %t.bc | FileCheck --check-prefix=BCA %s
+
+; Make sure the combined index files produced from both the normal and the
+; thin link bitcode files are identical.
+; RUN: llvm-lto -thinlto -o %t3 %t.bc
+; Copy the minimized bitcode to the regular bitcode path so the module
+; paths in the index are the same (save and restore the regular bitcode
+; for use again further down).
+; RUN: mv %t.bc %t.bc.sv
+; RUN: cp %t.thinlink.bc %t.bc
+; RUN: llvm-lto -thinlto -o %t4 %t.bc
+; RUN: mv %t.bc.sv %t.bc
+; RUN: diff %t3.thinlto.bc %t4.thinlto.bc
+
+; Try again using -thinlto-action to produce combined index
+; RUN: rm -f %t3.thinlto.bc %t4.thinlto.bc
+; RUN: llvm-lto -thinlto-action=thinlink -o %t3.thinlto.bc %t.bc
+; Copy the minimized bitcode to the regular bitcode path so the module
+; paths in the index are the same.
+; RUN: cp %t.thinlink.bc %t.bc
+; RUN: llvm-lto -thinlto-action=thinlink -o %t4.thinlto.bc %t.bc
+; RUN: diff %t3.thinlto.bc %t4.thinlto.bc
 
 ; BCA: <GLOBALVAL_SUMMARY_BLOCK
 
@@ -11,3 +35,10 @@
 define void @f() {
   ret void
 }
+
+; CHECK: !llvm.dbg.cu
+; NODEBUG-NOT: !llvm.dbg.cu
+!llvm.dbg.cu = !{}
+
+!1 = !{i32 2, !"Debug Info Version", i32 3}
+!llvm.module.flags = !{!1}
diff --git a/test/Transforms/ThinLTOBitcodeWriter/split-vfunc-internal.ll b/test/Transforms/ThinLTOBitcodeWriter/split-vfunc-internal.ll
new file mode 100644
index 0000000000000000000000000000000000000000..087796b5031c8b6fefdb115468407e2be3bb01a9
--- /dev/null
+++ b/test/Transforms/ThinLTOBitcodeWriter/split-vfunc-internal.ll
@@ -0,0 +1,21 @@
+; RUN: opt -thinlto-bc -o %t %s
+; RUN: llvm-modextract -b -n 0 -o - %t | llvm-dis | FileCheck --check-prefix=M0 %s
+; RUN: llvm-modextract -b -n 1 -o - %t | llvm-dis | FileCheck --check-prefix=M1 %s
+
+define [1 x i8*]* @source() {
+  ret [1 x i8*]* @g
+}
+
+; M0: @"g$84f59439b469192440047efc8de357fb" = external hidden constant [1 x i8*]{{$}}
+; M1: @"g$84f59439b469192440047efc8de357fb" = hidden constant [1 x i8*] [i8* bitcast (i64 (i8*)* @"ok$84f59439b469192440047efc8de357fb" to i8*)]
+@g = internal constant [1 x i8*] [
+  i8* bitcast (i64 (i8*)* @ok to i8*)
+], !type !0
+
+; M0: define hidden i64 @"ok$84f59439b469192440047efc8de357fb"
+; M1: define available_externally hidden i64 @"ok$84f59439b469192440047efc8de357fb"
+define internal i64 @ok(i8* %this) {
+  ret i64 42
+}
+
+!0 = !{i32 0, !"typeid"}
diff --git a/test/Transforms/ThinLTOBitcodeWriter/split-vfunc.ll b/test/Transforms/ThinLTOBitcodeWriter/split-vfunc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0793459af4147adfa251d25f501cc047d462f4b4
--- /dev/null
+++ b/test/Transforms/ThinLTOBitcodeWriter/split-vfunc.ll
@@ -0,0 +1,75 @@
+; RUN: opt -thinlto-bc -o %t %s
+; RUN: llvm-modextract -b -n 0 -o - %t | llvm-dis | FileCheck --check-prefix=M0 %s
+; RUN: llvm-modextract -b -n 1 -o - %t | llvm-dis | FileCheck --check-prefix=M1 %s
+
+; M0: @g = external constant [9 x i8*]{{$}}
+; M1: @g = constant [9 x i8*]
+@g = constant [9 x i8*] [
+  i8* bitcast (i64 (i8*)* @ok1 to i8*),
+  i8* bitcast (i64 (i8*, i64)* @ok2 to i8*),
+  i8* bitcast (void (i8*)* @wrongtype1 to i8*),
+  i8* bitcast (i128 (i8*)* @wrongtype2 to i8*),
+  i8* bitcast (i64 ()* @wrongtype3 to i8*),
+  i8* bitcast (i64 (i8*, i8*)* @wrongtype4 to i8*),
+  i8* bitcast (i64 (i8*, i128)* @wrongtype5 to i8*),
+  i8* bitcast (i64 (i8*)* @usesthis to i8*),
+  i8* bitcast (i8 (i8*)* @reads to i8*)
+], !type !0
+
+; M0: define i64 @ok1
+; M1: define available_externally i64 @ok1
+define i64 @ok1(i8* %this) {
+  ret i64 42
+}
+
+; M0: define i64 @ok2
+; M1: define available_externally i64 @ok2
+define i64 @ok2(i8* %this, i64 %arg) {
+  ret i64 %arg
+}
+
+; M0: define void @wrongtype1
+; M1: declare void @wrongtype1()
+define void @wrongtype1(i8*) {
+  ret void
+}
+
+; M0: define i128 @wrongtype2
+; M1: declare void @wrongtype2()
+define i128 @wrongtype2(i8*) {
+  ret i128 0
+}
+
+; M0: define i64 @wrongtype3
+; M1: declare void @wrongtype3()
+define i64 @wrongtype3() {
+  ret i64 0
+}
+
+; M0: define i64 @wrongtype4
+; M1: declare void @wrongtype4()
+define i64 @wrongtype4(i8*, i8*) {
+  ret i64 0
+}
+
+; M0: define i64 @wrongtype5
+; M1: declare void @wrongtype5()
+define i64 @wrongtype5(i8*, i128) {
+  ret i64 0
+}
+
+; M0: define i64 @usesthis
+; M1: declare void @usesthis()
+define i64 @usesthis(i8* %this) {
+  %i = ptrtoint i8* %this to i64
+  ret i64 %i
+}
+
+; M0: define i8 @reads
+; M1: declare void @reads()
+define i8 @reads(i8* %this) {
+  %l = load i8, i8* %this
+  ret i8 %l
+}
+
+!0 = !{i32 0, !"typeid"}
diff --git a/test/Transforms/ThinLTOBitcodeWriter/split.ll b/test/Transforms/ThinLTOBitcodeWriter/split.ll
index b86d7017c8bbe83988ad11d55371efc136fa9065..d37d10bd356014ca132a0c116099480a43eaf331 100644
--- a/test/Transforms/ThinLTOBitcodeWriter/split.ll
+++ b/test/Transforms/ThinLTOBitcodeWriter/split.ll
@@ -1,11 +1,26 @@
-; RUN: opt -thinlto-bc -o %t %s
-; RUN: llvm-modextract -b -n 0 -o %t0 %t
-; RUN: llvm-modextract -b -n 1 -o %t1 %t
+; Generate bitcode files with summary, as well as minimized bitcode without
+; the debug metadata for the thin link.
+; RUN: opt -thinlto-bc -thin-link-bitcode-file=%t2 -o %t %s
+; RUN: llvm-modextract -b -n 0 -o %t0.bc %t
+; RUN: llvm-modextract -b -n 1 -o %t1.bc %t
+; RUN: llvm-modextract -b -n 0 -o %t0.thinlink.bc %t2
+; RUN: llvm-modextract -b -n 1 -o %t1.thinlink.bc %t2
 ; RUN: not llvm-modextract -b -n 2 -o - %t 2>&1 | FileCheck --check-prefix=ERROR %s
-; RUN: llvm-dis -o - %t0 | FileCheck --check-prefix=M0 %s
-; RUN: llvm-dis -o - %t1 | FileCheck --check-prefix=M1 %s
-; RUN: llvm-bcanalyzer -dump %t0 | FileCheck --check-prefix=BCA0 %s
-; RUN: llvm-bcanalyzer -dump %t1 | FileCheck --check-prefix=BCA1 %s
+; RUN: llvm-dis -o - %t0.bc | FileCheck --check-prefix=M0 %s
+; RUN: llvm-dis -o - %t1.bc | FileCheck --check-prefix=M1 %s
+; RUN: llvm-dis -o - %t0.thinlink.bc | FileCheck --check-prefix=NODEBUG %s
+; RUN: llvm-dis -o - %t1.thinlink.bc | FileCheck --check-prefix=NODEBUG %s
+; RUN: llvm-bcanalyzer -dump %t0.bc | FileCheck --check-prefix=BCA0 %s
+; RUN: llvm-bcanalyzer -dump %t1.bc | FileCheck --check-prefix=BCA1 %s
+
+; Make sure the combined index files produced from both the normal and the
+; thin link bitcode files are identical.
+; RUN: llvm-lto -thinlto -o %t3 %t0.bc
+; Copy the minimized bitcode to the regular bitcode path so the module
+; paths in the index are the same.
+; RUN: cp %t0.thinlink.bc %t0.bc
+; RUN: llvm-lto -thinlto -o %t4 %t0.bc
+; RUN: diff %t3.thinlto.bc %t4.thinlto.bc
 
 ; ERROR: llvm-modextract: error: module index out of range; bitcode file contains 2 module(s)
 
@@ -26,3 +41,11 @@ define i8* @f() {
 
 ; M1: !0 = !{i32 0, !"typeid"}
 !0 = !{i32 0, !"typeid"}
+
+; M0: !llvm.dbg.cu
+; M1-NOT: !llvm.dbg.cu
+; NODEBUG-NOT: !llvm.dbg.cu
+!llvm.dbg.cu = !{}
+
+!1 = !{i32 2, !"Debug Info Version", i32 3}
+!llvm.module.flags = !{!1}
diff --git a/test/Transforms/ThinLTOBitcodeWriter/unsplittable.ll b/test/Transforms/ThinLTOBitcodeWriter/unsplittable.ll
index fbc97a00097126cbfe80515b9eb771490e7bc16a..718013e39b3eea1223f013101f56648c97e56a8d 100644
--- a/test/Transforms/ThinLTOBitcodeWriter/unsplittable.ll
+++ b/test/Transforms/ThinLTOBitcodeWriter/unsplittable.ll
@@ -1,6 +1,9 @@
-; RUN: opt -thinlto-bc -o %t %s
+; RUN: opt -thinlto-bc -thin-link-bitcode-file=%t2 -o %t %s
 ; RUN: llvm-dis -o - %t | FileCheck %s
 ; RUN: llvm-bcanalyzer -dump %t | FileCheck --check-prefix=BCA %s
+; When not splitting the module, the thin link bitcode file should simply be a
+; copy of the regular module.
+; RUN: diff %t %t2
 
 ; BCA-NOT: <GLOBALVAL_SUMMARY_BLOCK
 
diff --git a/test/Transforms/Util/PredicateInfo/condprop.ll b/test/Transforms/Util/PredicateInfo/condprop.ll
new file mode 100644
index 0000000000000000000000000000000000000000..79c76baa6f6191911675dc4876387d9c1a088713
--- /dev/null
+++ b/test/Transforms/Util/PredicateInfo/condprop.ll
@@ -0,0 +1,471 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -print-predicateinfo -analyze  < %s 2>&1 | FileCheck %s
+
+@a = external global i32		; <i32*> [#uses=7]
+
+define i32 @test1() nounwind {
+; CHECK-LABEL: @test1(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp eq i32 [[TMP0]], 4
+; CHECK-NEXT:    br i1 [[TMP1]], label [[BB:%.*]], label [[BB1:%.*]]
+; CHECK:       bb:
+; CHECK-NEXT:    br label [[BB8:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[TMP2:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[TMP2]], 5
+; CHECK-NEXT:    br i1 [[TMP3]], label [[BB2:%.*]], label [[BB3:%.*]]
+; CHECK:       bb2:
+; CHECK-NEXT:    br label [[BB8]]
+; CHECK:       bb3:
+; CHECK-NEXT:    [[TMP4:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT:    [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 4
+; CHECK-NEXT:    br i1 [[TMP5]], label [[BB4:%.*]], label [[BB5:%.*]]
+; CHECK:       bb4:
+; CHECK-NEXT:    [[TMP6:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = add i32 [[TMP6]], 5
+; CHECK-NEXT:    br label [[BB8]]
+; CHECK:       bb5:
+; CHECK-NEXT:    [[TMP8:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT:    [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 5
+; CHECK-NEXT:    br i1 [[TMP9]], label [[BB6:%.*]], label [[BB7:%.*]]
+; CHECK:       bb6:
+; CHECK-NEXT:    [[TMP10:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT:    [[TMP11:%.*]] = add i32 [[TMP10]], 4
+; CHECK-NEXT:    br label [[BB8]]
+; CHECK:       bb7:
+; CHECK-NEXT:    [[TMP12:%.*]] = load i32, i32* @a, align 4
+; CHECK-NEXT:    br label [[BB8]]
+; CHECK:       bb8:
+; CHECK-NEXT:    [[DOT0:%.*]] = phi i32 [ [[TMP12]], [[BB7]] ], [ [[TMP11]], [[BB6]] ], [ [[TMP7]], [[BB4]] ], [ 4, [[BB2]] ], [ 5, [[BB]] ]
+; CHECK-NEXT:    br label [[RETURN:%.*]]
+; CHECK:       return:
+; CHECK-NEXT:    ret i32 [[DOT0]]
+;
+entry:
+  %0 = load i32, i32* @a, align 4
+  %1 = icmp eq i32 %0, 4
+  br i1 %1, label %bb, label %bb1
+
+bb:		; preds = %entry
+  br label %bb8
+
+bb1:		; preds = %entry
+  %2 = load i32, i32* @a, align 4
+  %3 = icmp eq i32 %2, 5
+  br i1 %3, label %bb2, label %bb3
+
+bb2:		; preds = %bb1
+  br label %bb8
+
+bb3:		; preds = %bb1
+  %4 = load i32, i32* @a, align 4
+  %5 = icmp eq i32 %4, 4
+  br i1 %5, label %bb4, label %bb5
+
+bb4:		; preds = %bb3
+  %6 = load i32, i32* @a, align 4
+  %7 = add i32 %6, 5
+  br label %bb8
+
+bb5:		; preds = %bb3
+  %8 = load i32, i32* @a, align 4
+  %9 = icmp eq i32 %8, 5
+  br i1 %9, label %bb6, label %bb7
+
+bb6:		; preds = %bb5
+  %10 = load i32, i32* @a, align 4
+  %11 = add i32 %10, 4
+  br label %bb8
+
+bb7:		; preds = %bb5
+  %12 = load i32, i32* @a, align 4
+  br label %bb8
+
+bb8:		; preds = %bb7, %bb6, %bb4, %bb2, %bb
+  %.0 = phi i32 [ %12, %bb7 ], [ %11, %bb6 ], [ %7, %bb4 ], [ 4, %bb2 ], [ 5, %bb ]
+  br label %return
+
+return:		; preds = %bb8
+  ret i32 %.0
+}
+
+declare void @foo(i1)
+declare void @bar(i32)
+
+define void @test3(i32 %x, i32 %y) {
+; CHECK-LABEL: @test3(
+; CHECK-NEXT:    [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
+; CHECK-NEXT:    [[Z:%.*]] = and i1 [[XZ]], [[YZ]]
+; CHECK:         [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
+; CHECK:         [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK:         [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
+; CHECK:         [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK:         [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
+; CHECK-NEXT:    br i1 [[Z]], label [[BOTH_ZERO:%.*]], label [[NOPE:%.*]]
+; CHECK:       both_zero:
+; CHECK-NEXT:    call void @foo(i1 [[XZ_0]])
+; CHECK-NEXT:    call void @foo(i1 [[YZ_0]])
+; CHECK-NEXT:    call void @bar(i32 [[X_0]])
+; CHECK-NEXT:    call void @bar(i32 [[Y_0]])
+; CHECK-NEXT:    ret void
+; CHECK:       nope:
+; CHECK-NEXT:    call void @foo(i1 [[Z_0]])
+; CHECK-NEXT:    ret void
+;
+  %xz = icmp eq i32 %x, 0
+  %yz = icmp eq i32 %y, 0
+  %z = and i1 %xz, %yz
+  br i1 %z, label %both_zero, label %nope
+both_zero:
+  call void @foo(i1 %xz)
+  call void @foo(i1 %yz)
+  call void @bar(i32 %x)
+  call void @bar(i32 %y)
+  ret void
+nope:
+  call void @foo(i1 %z)
+  ret void
+}
+
+define void @test4(i1 %b, i32 %x) {
+; CHECK-LABEL: @test4(
+; CHECK-NEXT:    br i1 [[B:%.*]], label [[SW:%.*]], label [[CASE3:%.*]]
+; CHECK:       sw:
+; CHECK:         i32 0, label [[CASE0:%.*]]
+; CHECK-NEXT:    i32 1, label [[CASE1:%.*]]
+; CHECK-NEXT:    i32 2, label [[CASE0]]
+; CHECK-NEXT:    i32 3, label [[CASE3]]
+; CHECK-NEXT:    i32 4, label [[DEFAULT:%.*]]
+; CHECK-NEXT:    ] Edge: [label [[SW]],label %case1] }
+; CHECK-NEXT:    [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X:%.*]])
+; CHECK-NEXT:    switch i32 [[X]], label [[DEFAULT]] [
+; CHECK-NEXT:    i32 0, label [[CASE0]]
+; CHECK-NEXT:    i32 1, label [[CASE1]]
+; CHECK-NEXT:    i32 2, label [[CASE0]]
+; CHECK-NEXT:    i32 3, label [[CASE3]]
+; CHECK-NEXT:    i32 4, label [[DEFAULT]]
+; CHECK-NEXT:    ]
+; CHECK:       default:
+; CHECK-NEXT:    call void @bar(i32 [[X]])
+; CHECK-NEXT:    ret void
+; CHECK:       case0:
+; CHECK-NEXT:    call void @bar(i32 [[X]])
+; CHECK-NEXT:    ret void
+; CHECK:       case1:
+; CHECK-NEXT:    call void @bar(i32 [[X_0]])
+; CHECK-NEXT:    ret void
+; CHECK:       case3:
+; CHECK-NEXT:    call void @bar(i32 [[X]])
+; CHECK-NEXT:    ret void
+;
+  br i1 %b, label %sw, label %case3
+sw:
+  switch i32 %x, label %default [
+  i32 0, label %case0
+  i32 1, label %case1
+  i32 2, label %case0
+  i32 3, label %case3
+  i32 4, label %default
+  ]
+default:
+  call void @bar(i32 %x)
+  ret void
+case0:
+  call void @bar(i32 %x)
+  ret void
+case1:
+  call void @bar(i32 %x)
+  ret void
+case3:
+  call void @bar(i32 %x)
+  ret void
+}
+
+define i1 @test5(i32 %x, i32 %y) {
+; CHECK-LABEL: @test5(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], [[Y:%.*]]
+; CHECK:         [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK:         [[X_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK:         [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK:         [[Y_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK:       same:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[X_0]], [[Y_0]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+; CHECK:       different:
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[X_1]], [[Y_1]]
+; CHECK-NEXT:    ret i1 [[CMP3]]
+;
+  %cmp = icmp eq i32 %x, %y
+  br i1 %cmp, label %same, label %different
+
+same:
+  %cmp2 = icmp ne i32 %x, %y
+  ret i1 %cmp2
+
+different:
+  %cmp3 = icmp eq i32 %x, %y
+  ret i1 %cmp3
+}
+
+define i1 @test6(i32 %x, i32 %y) {
+; CHECK-LABEL: @test6(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ne i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp eq i32 [[X]], [[Y]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK:       same:
+; CHECK-NEXT:    ret i1 [[CMP2]]
+; CHECK:       different:
+; CHECK-NEXT:    ret i1 [[CMP3]]
+;
+  %cmp2 = icmp ne i32 %x, %y
+  %cmp = icmp eq i32 %x, %y
+  %cmp3 = icmp eq i32 %x, %y
+  br i1 %cmp, label %same, label %different
+
+same:
+  ret i1 %cmp2
+
+different:
+  ret i1 %cmp3
+}
+
+define i1 @test6_fp(float %x, float %y) {
+; CHECK-LABEL: @test6_fp(
+; CHECK-NEXT:    [[CMP2:%.*]] = fcmp une float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[X]], [[Y]]
+; CHECK-NEXT:    [[CMP3:%.*]] = fcmp oeq float [[X]], [[Y]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK:       same:
+; CHECK-NEXT:    ret i1 [[CMP2]]
+; CHECK:       different:
+; CHECK-NEXT:    ret i1 [[CMP3]]
+;
+  %cmp2 = fcmp une float %x, %y
+  %cmp = fcmp oeq float %x, %y
+  %cmp3 = fcmp oeq float  %x, %y
+  br i1 %cmp, label %same, label %different
+
+same:
+  ret i1 %cmp2
+
+different:
+  ret i1 %cmp3
+}
+
+define i1 @test7(i32 %x, i32 %y) {
+; CHECK-LABEL: @test7(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[X:%.*]], [[Y:%.*]]
+; CHECK:         [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK:         [[X_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK:         [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK:         [[Y_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK:       same:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[X_0]], [[Y_0]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+; CHECK:       different:
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp sgt i32 [[X_1]], [[Y_1]]
+; CHECK-NEXT:    ret i1 [[CMP3]]
+;
+  %cmp = icmp sgt i32 %x, %y
+  br i1 %cmp, label %same, label %different
+
+same:
+  %cmp2 = icmp sle i32 %x, %y
+  ret i1 %cmp2
+
+different:
+  %cmp3 = icmp sgt i32 %x, %y
+  ret i1 %cmp3
+}
+
+define i1 @test7_fp(float %x, float %y) {
+; CHECK-LABEL: @test7_fp(
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[X:%.*]], [[Y:%.*]]
+; CHECK:         [[X_0:%.*]] = call float @llvm.ssa.copy.f32(float [[X]])
+; CHECK:         [[X_1:%.*]] = call float @llvm.ssa.copy.f32(float [[X]])
+; CHECK:         [[Y_0:%.*]] = call float @llvm.ssa.copy.f32(float [[Y]])
+; CHECK:         [[Y_1:%.*]] = call float @llvm.ssa.copy.f32(float [[Y]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK:       same:
+; CHECK-NEXT:    [[CMP2:%.*]] = fcmp ule float [[X_0]], [[Y_0]]
+; CHECK-NEXT:    ret i1 [[CMP2]]
+; CHECK:       different:
+; CHECK-NEXT:    [[CMP3:%.*]] = fcmp ogt float [[X_1]], [[Y_1]]
+; CHECK-NEXT:    ret i1 [[CMP3]]
+;
+  %cmp = fcmp ogt float %x, %y
+  br i1 %cmp, label %same, label %different
+
+same:
+  %cmp2 = fcmp ule float %x, %y
+  ret i1 %cmp2
+
+different:
+  %cmp3 = fcmp ogt float %x, %y
+  ret i1 %cmp3
+}
+
+define i1 @test8(i32 %x, i32 %y) {
+; CHECK-LABEL: @test8(
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sle i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sgt i32 [[X]], [[Y]]
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp sgt i32 [[X]], [[Y]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK:       same:
+; CHECK-NEXT:    ret i1 [[CMP2]]
+; CHECK:       different:
+; CHECK-NEXT:    ret i1 [[CMP3]]
+;
+  %cmp2 = icmp sle i32 %x, %y
+  %cmp = icmp sgt i32 %x, %y
+  %cmp3 = icmp sgt i32 %x, %y
+  br i1 %cmp, label %same, label %different
+
+same:
+  ret i1 %cmp2
+
+different:
+  ret i1 %cmp3
+}
+
+define i1 @test8_fp(float %x, float %y) {
+; CHECK-LABEL: @test8_fp(
+; CHECK-NEXT:    [[CMP2:%.*]] = fcmp ule float [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp ogt float [[X]], [[Y]]
+; CHECK-NEXT:    [[CMP3:%.*]] = fcmp ogt float [[X]], [[Y]]
+; CHECK-NEXT:    br i1 [[CMP]], label [[SAME:%.*]], label [[DIFFERENT:%.*]]
+; CHECK:       same:
+; CHECK-NEXT:    ret i1 [[CMP2]]
+; CHECK:       different:
+; CHECK-NEXT:    ret i1 [[CMP3]]
+;
+  %cmp2 = fcmp ule float %x, %y
+  %cmp = fcmp ogt float %x, %y
+  %cmp3 = fcmp ogt float %x, %y
+  br i1 %cmp, label %same, label %different
+
+same:
+  ret i1 %cmp2
+
+different:
+  ret i1 %cmp3
+}
+
+define i32 @test9(i32 %i, i32 %j) {
+; CHECK-LABEL: @test9(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]]
+; CHECK:         [[I_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[I]])
+; CHECK:         [[J_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[J]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]]
+; CHECK:       cond_true:
+; CHECK-NEXT:    [[DIFF:%.*]] = sub i32 [[I_0]], [[J_0]]
+; CHECK-NEXT:    ret i32 [[DIFF]]
+; CHECK:       ret:
+; CHECK-NEXT:    ret i32 5
+;
+  %cmp = icmp eq i32 %i, %j
+  br i1 %cmp, label %cond_true, label %ret
+
+cond_true:
+  %diff = sub i32 %i, %j
+  ret i32 %diff
+
+ret:
+  ret i32 5
+}
+
+define i32 @test10(i32 %j, i32 %i) {
+; CHECK-LABEL: @test10(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[I:%.*]], [[J:%.*]]
+; CHECK:         [[I_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[I]])
+; CHECK:         [[J_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[J]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[RET:%.*]]
+; CHECK:       cond_true:
+; CHECK-NEXT:    [[DIFF:%.*]] = sub i32 [[I_0]], [[J_0]]
+; CHECK-NEXT:    ret i32 [[DIFF]]
+; CHECK:       ret:
+; CHECK-NEXT:    ret i32 5
+;
+  %cmp = icmp eq i32 %i, %j
+  br i1 %cmp, label %cond_true, label %ret
+
+cond_true:
+  %diff = sub i32 %i, %j
+  ret i32 %diff
+
+ret:
+  ret i32 5
+}
+
+declare i32 @yogibar()
+
+define i32 @test11(i32 %x) {
+; CHECK-LABEL: @test11(
+; CHECK-NEXT:    [[V0:%.*]] = call i32 @yogibar()
+; CHECK-NEXT:    [[V1:%.*]] = call i32 @yogibar()
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[V0]], [[V1]]
+; CHECK:         [[V0_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[V0]])
+; CHECK:         [[V1_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[V1]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[NEXT:%.*]]
+; CHECK:       cond_true:
+; CHECK-NEXT:    ret i32 [[V1_0]]
+; CHECK:       next:
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[X:%.*]], [[V0_0]]
+; CHECK:         [[V0_0_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[V0_0]])
+; CHECK-NEXT:    br i1 [[CMP2]], label [[COND_TRUE2:%.*]], label [[NEXT2:%.*]]
+; CHECK:       cond_true2:
+; CHECK-NEXT:    ret i32 [[V0_0_1]]
+; CHECK:       next2:
+; CHECK-NEXT:    ret i32 0
+;
+  %v0 = call i32 @yogibar()
+  %v1 = call i32 @yogibar()
+  %cmp = icmp eq i32 %v0, %v1
+  br i1 %cmp, label %cond_true, label %next
+
+cond_true:
+  ret i32 %v1
+
+next:
+  %cmp2 = icmp eq i32 %x, %v0
+  br i1 %cmp2, label %cond_true2, label %next2
+
+cond_true2:
+  ret i32 %v0
+
+next2:
+  ret i32 0
+}
+
+define i32 @test12(i32 %x) {
+; CHECK-LABEL: @test12(
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK:         [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK:         [[X_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]]
+; CHECK:       cond_true:
+; CHECK-NEXT:    br label [[RET:%.*]]
+; CHECK:       cond_false:
+; CHECK-NEXT:    br label [[RET]]
+; CHECK:       ret:
+; CHECK-NEXT:    [[RES:%.*]] = phi i32 [ [[X_0]], [[COND_TRUE]] ], [ [[X_1]], [[COND_FALSE]] ]
+; CHECK-NEXT:    ret i32 [[RES]]
+;
+  %cmp = icmp eq i32 %x, 0
+  br i1 %cmp, label %cond_true, label %cond_false
+
+cond_true:
+  br label %ret
+
+cond_false:
+  br label %ret
+
+ret:
+  %res = phi i32 [ %x, %cond_true ], [ %x, %cond_false ]
+  ret i32 %res
+}
diff --git a/test/Transforms/Util/PredicateInfo/diamond.ll b/test/Transforms/Util/PredicateInfo/diamond.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e3f56d88caf0e7af5518507a298575e86da45f4d
--- /dev/null
+++ b/test/Transforms/Util/PredicateInfo/diamond.ll
@@ -0,0 +1,68 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -print-predicateinfo < %s 2>&1 | FileCheck %s
+define i1 @f(i32 %x, i1 %y) {
+; CHECK-LABEL: @f(
+; CHECK-NEXT:    br i1 [[Y:%.*]], label [[BB0:%.*]], label [[BB1:%.*]]
+; CHECK:       bb0:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[X:%.*]], 0
+; CHECK:         [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[BB2:%.*]], label [[BB3:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[X2:%.*]] = add nuw nsw i32 [[X]], 1
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i32 [[X2]], 2
+; CHECK:         [[X2_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X2]])
+; CHECK-NEXT:    br i1 [[CMP2]], label [[BB2]], label [[BB3]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[X3:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ [[X2_0]], [[BB1]] ]
+; CHECK-NEXT:    br label [[BB3]]
+; CHECK:       bb3:
+; CHECK-NEXT:    ret i1 false
+;
+  br i1 %y, label %bb0, label %bb1
+  bb0:
+  %cmp = icmp sge i32 %x, 0  ; x >= 0
+  br i1 %cmp, label %bb2, label %bb3
+  bb1:
+  %x2 = add nsw nuw i32 %x, 1
+  %cmp2 = icmp sge i32 %x2, 2     ; x+1 >= 2 / x >= 1
+  br i1 %cmp2, label %bb2, label %bb3
+  bb2:
+  %x3 = phi i32 [ %x, %bb0 ], [ %x2, %bb1 ]
+  br label %bb3
+  bb3:
+  ret i1 0
+}
+
+define i1 @g(i32 %x, i1 %y) {
+; CHECK-LABEL: @g(
+; CHECK-NEXT:    br i1 [[Y:%.*]], label [[BB0:%.*]], label [[BB1:%.*]]
+; CHECK:       bb0:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp sge i32 [[X:%.*]], 0
+; CHECK:         [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[BB3:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    [[X2:%.*]] = add nuw nsw i32 [[X]], 1
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp sge i32 [[X2]], 2
+; CHECK:         [[X2_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X2]])
+; CHECK-NEXT:    br i1 [[CMP2]], label [[BB3]], label [[BB2]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[X3:%.*]] = phi i32 [ [[X_0]], [[BB0]] ], [ [[X2_0]], [[BB1]] ]
+; CHECK-NEXT:    br label [[BB3]]
+; CHECK:       bb3:
+; CHECK-NEXT:    ret i1 false
+;
+  br i1 %y, label %bb0, label %bb1
+  bb0:
+  %cmp = icmp sge i32 %x, 0  ; x >= 0
+  br i1 %cmp, label %bb3, label %bb2
+  bb1:
+  %x2 = add nsw nuw i32 %x, 1
+  %cmp2 = icmp sge i32 %x2, 2     ; x+1 >= 2 / x >= 1
+  br i1 %cmp2, label %bb3, label %bb2
+  bb2:
+  %x3 = phi i32 [ %x, %bb0 ], [ %x2, %bb1 ]
+  br label %bb3
+  bb3:
+  ret i1 0
+}
+
diff --git a/test/Transforms/Util/PredicateInfo/edge.ll b/test/Transforms/Util/PredicateInfo/edge.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6c58540e1050b84869113fd54598d638d7efd4b2
--- /dev/null
+++ b/test/Transforms/Util/PredicateInfo/edge.ll
@@ -0,0 +1,242 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -print-predicateinfo -analyze  < %s 2>&1 | FileCheck %s
+
+define i32 @f1(i32 %x) {
+; CHECK-LABEL: @f1(
+; CHECK-NEXT:  bb0:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK:         [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[BB2:%.*]], label [[BB1:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB2]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0:%.*]] ], [ 0, [[BB1]] ]
+; CHECK-NEXT:    [[FOO:%.*]] = add i32 [[COND]], [[X]]
+; CHECK-NEXT:    ret i32 [[FOO]]
+;
+bb0:
+  %cmp = icmp eq i32 %x, 0
+  br i1 %cmp, label %bb2, label %bb1
+bb1:
+  br label %bb2
+bb2:
+  %cond = phi i32 [ %x, %bb0 ], [ 0, %bb1 ]
+  %foo = add i32 %cond, %x
+  ret i32 %foo
+}
+
+define i32 @f2(i32 %x) {
+; CHECK-LABEL: @f2(
+; CHECK-NEXT:  bb0:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ne i32 [[X:%.*]], 0
+; CHECK:         [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[BB1:%.*]], label [[BB2:%.*]]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB2]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0:%.*]] ], [ 0, [[BB1]] ]
+; CHECK-NEXT:    [[FOO:%.*]] = add i32 [[COND]], [[X]]
+; CHECK-NEXT:    ret i32 [[FOO]]
+;
+bb0:
+  %cmp = icmp ne i32 %x, 0
+  br i1 %cmp, label %bb1, label %bb2
+bb1:
+  br label %bb2
+bb2:
+  %cond = phi i32 [ %x, %bb0 ], [ 0, %bb1 ]
+  %foo = add i32 %cond, %x
+  ret i32 %foo
+}
+
+define i32 @f3(i32 %x) {
+; CHECK-LABEL: @f3(
+; CHECK-NEXT:  bb0:
+; CHECK:         [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X:%.*]])
+; CHECK-NEXT:    switch i32 [[X]], label [[BB1:%.*]] [
+; CHECK-NEXT:    i32 0, label [[BB2:%.*]]
+; CHECK-NEXT:    ]
+; CHECK:       bb1:
+; CHECK-NEXT:    br label [[BB2]]
+; CHECK:       bb2:
+; CHECK-NEXT:    [[COND:%.*]] = phi i32 [ [[X_0]], [[BB0:%.*]] ], [ 0, [[BB1]] ]
+; CHECK-NEXT:    [[FOO:%.*]] = add i32 [[COND]], [[X]]
+; CHECK-NEXT:    ret i32 [[FOO]]
+;
+bb0:
+  switch i32 %x, label %bb1 [ i32 0, label %bb2]
+bb1:
+  br label %bb2
+bb2:
+  %cond = phi i32 [ %x, %bb0 ], [ 0, %bb1 ]
+  %foo = add i32 %cond, %x
+  ret i32 %foo
+}
+
+
+define double @fcmp_oeq_not_zero(double %x, double %y) {
+; CHECK-LABEL: @fcmp_oeq_not_zero(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], 2.000000e+00
+; CHECK:         [[Y_0:%.*]] = call double @llvm.ssa.copy.f64(double [[Y]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret double [[RETVAL]]
+;
+entry:
+  %cmp = fcmp oeq double %y, 2.0
+  br i1 %cmp, label %if, label %return
+
+if:
+  %div = fdiv double %x, %y
+  br label %return
+
+return:
+  %retval = phi double [ %div, %if ], [ %x, %entry ]
+  ret double %retval
+
+}
+
+define double @fcmp_une_not_zero(double %x, double %y) {
+; CHECK-LABEL: @fcmp_une_not_zero(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[Y:%.*]], 2.000000e+00
+; CHECK:         [[Y_0:%.*]] = call double @llvm.ssa.copy.f64(double [[Y]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret double [[RETVAL]]
+;
+entry:
+  %cmp = fcmp une double %y, 2.0
+  br i1 %cmp, label %return, label %else
+
+else:
+  %div = fdiv double %x, %y
+  br label %return
+
+return:
+  %retval = phi double [ %div, %else ], [ %x, %entry ]
+  ret double %retval
+
+}
+
+define double @fcmp_oeq_zero(double %x, double %y) {
+; CHECK-LABEL: @fcmp_oeq_zero(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], 0.000000e+00
+; CHECK:         [[Y_0:%.*]] = call double @llvm.ssa.copy.f64(double [[Y]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret double [[RETVAL]]
+;
+entry:
+  %cmp = fcmp oeq double %y, 0.0
+  br i1 %cmp, label %if, label %return
+
+if:
+  %div = fdiv double %x, %y
+  br label %return
+
+return:
+  %retval = phi double [ %div, %if ], [ %x, %entry ]
+  ret double %retval
+
+}
+
+define double @fcmp_une_zero(double %x, double %y) {
+; CHECK-LABEL: @fcmp_une_zero(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[Y:%.*]], -0.000000e+00
+; CHECK:         [[Y_0:%.*]] = call double @llvm.ssa.copy.f64(double [[Y]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X:%.*]], [[Y_0]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret double [[RETVAL]]
+;
+entry:
+  %cmp = fcmp une double %y, -0.0
+  br i1 %cmp, label %return, label %else
+
+else:
+  %div = fdiv double %x, %y
+  br label %return
+
+return:
+  %retval = phi double [ %div, %else ], [ %x, %entry ]
+  ret double %retval
+
+}
+
+
+define double @fcmp_oeq_maybe_zero(double %x, double %y, double %z1, double %z2) {
+; CHECK-LABEL: @fcmp_oeq_maybe_zero(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[Z:%.*]] = fadd double [[Z1:%.*]], [[Z2:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq double [[Y:%.*]], [[Z]]
+; CHECK:         [[Z_0:%.*]] = call double @llvm.ssa.copy.f64(double [[Z]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[IF:%.*]], label [[RETURN:%.*]]
+; CHECK:       if:
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X:%.*]], [[Z_0]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[IF]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret double [[RETVAL]]
+;
+entry:
+  %z = fadd double %z1, %z2
+  %cmp = fcmp oeq double %y, %z
+  br i1 %cmp, label %if, label %return
+
+if:
+  %div = fdiv double %x, %z
+  br label %return
+
+return:
+  %retval = phi double [ %div, %if ], [ %x, %entry ]
+  ret double %retval
+
+}
+
+define double @fcmp_une_maybe_zero(double %x, double %y, double %z1, double %z2) {
+; CHECK-LABEL: @fcmp_une_maybe_zero(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[Z:%.*]] = fadd double [[Z1:%.*]], [[Z2:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = fcmp une double [[Y:%.*]], [[Z]]
+; CHECK:         [[Z_0:%.*]] = call double @llvm.ssa.copy.f64(double [[Z]])
+; CHECK-NEXT:    br i1 [[CMP]], label [[RETURN:%.*]], label [[ELSE:%.*]]
+; CHECK:       else:
+; CHECK-NEXT:    [[DIV:%.*]] = fdiv double [[X:%.*]], [[Z_0]]
+; CHECK-NEXT:    br label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    [[RETVAL:%.*]] = phi double [ [[DIV]], [[ELSE]] ], [ [[X]], [[ENTRY:%.*]] ]
+; CHECK-NEXT:    ret double [[RETVAL]]
+;
+entry:
+  %z = fadd double %z1, %z2
+  %cmp = fcmp une double %y, %z
+  br i1 %cmp, label %return, label %else
+
+else:
+  %div = fdiv double %x, %z
+  br label %return
+
+return:
+  %retval = phi double [ %div, %else ], [ %x, %entry ]
+  ret double %retval
+
+}
diff --git a/test/Transforms/Util/PredicateInfo/testandor.ll b/test/Transforms/Util/PredicateInfo/testandor.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5942ed155318ceb71c9cc5930d166bee427fc486
--- /dev/null
+++ b/test/Transforms/Util/PredicateInfo/testandor.ll
@@ -0,0 +1,211 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -print-predicateinfo < %s 2>&1 | FileCheck %s
+
+declare void @foo(i1)
+declare void @bar(i32)
+declare void @llvm.assume(i1)
+
+define void @testor(i32 %x, i32 %y) {
+; CHECK-LABEL: @testor(
+; CHECK-NEXT:    [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
+; CHECK-NEXT:    [[Z:%.*]] = or i1 [[XZ]], [[YZ]]
+; CHECK:         [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
+; CHECK:         [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK:         [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
+; CHECK:         [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK:         [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
+; CHECK-NEXT:    br i1 [[Z]], label [[ONEOF:%.*]], label [[NEITHER:%.*]]
+; CHECK:       oneof:
+; CHECK-NEXT:    call void @foo(i1 [[XZ]])
+; CHECK-NEXT:    call void @foo(i1 [[YZ]])
+; CHECK-NEXT:    call void @bar(i32 [[X]])
+; CHECK-NEXT:    call void @bar(i32 [[Y]])
+; CHECK-NEXT:    ret void
+; CHECK:       neither:
+; CHECK-NEXT:    call void @foo(i1 [[XZ_0]])
+; CHECK-NEXT:    call void @foo(i1 [[YZ_0]])
+; CHECK-NEXT:    call void @bar(i32 [[X_0]])
+; CHECK-NEXT:    call void @bar(i32 [[Y_0]])
+; CHECK-NEXT:    call void @foo(i1 [[Z_0]])
+; CHECK-NEXT:    ret void
+;
+  %xz = icmp eq i32 %x, 0
+  %yz = icmp eq i32 %y, 0
+  %z = or i1 %xz, %yz
+  br i1 %z, label %oneof, label %neither
+oneof:
+;; Should not insert on the true edge for or
+  call void @foo(i1 %xz)
+  call void @foo(i1 %yz)
+  call void @bar(i32 %x)
+  call void @bar(i32 %y)
+  ret void
+neither:
+  call void @foo(i1 %xz)
+  call void @foo(i1 %yz)
+  call void @bar(i32 %x)
+  call void @bar(i32 %y)
+  call void @foo(i1 %z)
+  ret void
+}
+define void @testand(i32 %x, i32 %y) {
+; CHECK-LABEL: @testand(
+; CHECK-NEXT:    [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
+; CHECK-NEXT:    [[Z:%.*]] = and i1 [[XZ]], [[YZ]]
+; CHECK:         [[XZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
+; CHECK:         [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK:         [[YZ_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
+; CHECK:         [[Y_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK:         [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
+; CHECK-NEXT:    br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]]
+; CHECK:       both:
+; CHECK-NEXT:    call void @foo(i1 [[XZ_0]])
+; CHECK-NEXT:    call void @foo(i1 [[YZ_0]])
+; CHECK-NEXT:    call void @bar(i32 [[X_0]])
+; CHECK-NEXT:    call void @bar(i32 [[Y_0]])
+; CHECK-NEXT:    ret void
+; CHECK:       nope:
+; CHECK-NEXT:    call void @foo(i1 [[XZ]])
+; CHECK-NEXT:    call void @foo(i1 [[YZ]])
+; CHECK-NEXT:    call void @bar(i32 [[X]])
+; CHECK-NEXT:    call void @bar(i32 [[Y]])
+; CHECK-NEXT:    call void @foo(i1 [[Z_0]])
+; CHECK-NEXT:    ret void
+;
+  %xz = icmp eq i32 %x, 0
+  %yz = icmp eq i32 %y, 0
+  %z = and i1 %xz, %yz
+  br i1 %z, label %both, label %nope
+both:
+  call void @foo(i1 %xz)
+  call void @foo(i1 %yz)
+  call void @bar(i32 %x)
+  call void @bar(i32 %y)
+  ret void
+nope:
+;; Should not insert on the false edge for and
+  call void @foo(i1 %xz)
+  call void @foo(i1 %yz)
+  call void @bar(i32 %x)
+  call void @bar(i32 %y)
+  call void @foo(i1 %z)
+  ret void
+}
+define void @testandsame(i32 %x, i32 %y) {
+; CHECK-LABEL: @testandsame(
+; CHECK-NEXT:    [[XGT:%.*]] = icmp sgt i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[XLT:%.*]] = icmp slt i32 [[X]], 100
+; CHECK-NEXT:    [[Z:%.*]] = and i1 [[XGT]], [[XLT]]
+; CHECK:         [[XGT_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XGT]])
+; CHECK:         [[X_0:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK:         [[X_0_1:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X_0]])
+; CHECK:         [[XLT_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XLT]])
+; CHECK:         [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
+; CHECK-NEXT:    br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]]
+; CHECK:       both:
+; CHECK-NEXT:    call void @foo(i1 [[XGT_0]])
+; CHECK-NEXT:    call void @foo(i1 [[XLT_0]])
+; CHECK-NEXT:    call void @bar(i32 [[X_0_1]])
+; CHECK-NEXT:    ret void
+; CHECK:       nope:
+; CHECK-NEXT:    call void @foo(i1 [[XGT]])
+; CHECK-NEXT:    call void @foo(i1 [[XLT]])
+; CHECK-NEXT:    call void @foo(i1 [[Z_0]])
+; CHECK-NEXT:    ret void
+;
+  %xgt = icmp sgt i32 %x, 0
+  %xlt = icmp slt i32 %x, 100
+  %z = and i1 %xgt, %xlt
+  br i1 %z, label %both, label %nope
+both:
+  call void @foo(i1 %xgt)
+  call void @foo(i1 %xlt)
+  call void @bar(i32 %x)
+  ret void
+nope:
+  call void @foo(i1 %xgt)
+  call void @foo(i1 %xlt)
+  call void @foo(i1 %z)
+  ret void
+}
+
+define void @testandassume(i32 %x, i32 %y) {
+; CHECK-LABEL: @testandassume(
+; CHECK-NEXT:    [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
+; CHECK-NEXT:    [[Z:%.*]] = and i1 [[XZ]], [[YZ]]
+; CHECK:         [[TMP1:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[XZ]])
+; CHECK:         [[TMP2:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[X]])
+; CHECK:         [[TMP3:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[YZ]])
+; CHECK:         [[TMP4:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[Y]])
+; CHECK:         [[TMP5:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[TMP5]])
+; CHECK:         [[DOT0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP1]])
+; CHECK:         [[DOT01:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[TMP2]])
+; CHECK:         [[DOT02:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP3]])
+; CHECK:         [[DOT03:%.*]] = call i32 @llvm.ssa.copy.i32(i32 [[TMP4]])
+; CHECK:         [[DOT04:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[TMP5]])
+; CHECK-NEXT:    br i1 [[TMP5]], label [[BOTH:%.*]], label [[NOPE:%.*]]
+; CHECK:       both:
+; CHECK-NEXT:    call void @foo(i1 [[DOT0]])
+; CHECK-NEXT:    call void @foo(i1 [[DOT02]])
+; CHECK-NEXT:    call void @bar(i32 [[DOT01]])
+; CHECK-NEXT:    call void @bar(i32 [[DOT03]])
+; CHECK-NEXT:    ret void
+; CHECK:       nope:
+; CHECK-NEXT:    call void @foo(i1 [[DOT04]])
+; CHECK-NEXT:    ret void
+;
+  %xz = icmp eq i32 %x, 0
+  %yz = icmp eq i32 %y, 0
+  %z = and i1 %xz, %yz
+  call void @llvm.assume(i1 %z)
+  br i1 %z, label %both, label %nope
+both:
+  call void @foo(i1 %xz)
+  call void @foo(i1 %yz)
+  call void @bar(i32 %x)
+  call void @bar(i32 %y)
+  ret void
+nope:
+  call void @foo(i1 %z)
+  ret void
+}
+
+;; Unlike 'and'/'or' conditions on branches, an assume is *always* true, so we only match 'and' for it
+define void @testorassume(i32 %x, i32 %y) {
+;
+; CHECK-LABEL: @testorassume(
+; CHECK-NEXT:    [[XZ:%.*]] = icmp eq i32 [[X:%.*]], 0
+; CHECK-NEXT:    [[YZ:%.*]] = icmp eq i32 [[Y:%.*]], 0
+; CHECK-NEXT:    [[Z:%.*]] = or i1 [[XZ]], [[YZ]]
+; CHECK-NEXT:    call void @llvm.assume(i1 [[Z]])
+; CHECK:         [[Z_0:%.*]] = call i1 @llvm.ssa.copy.i1(i1 [[Z]])
+; CHECK-NEXT:    br i1 [[Z]], label [[BOTH:%.*]], label [[NOPE:%.*]]
+; CHECK:       both:
+; CHECK-NEXT:    call void @foo(i1 [[XZ]])
+; CHECK-NEXT:    call void @foo(i1 [[YZ]])
+; CHECK-NEXT:    call void @bar(i32 [[X]])
+; CHECK-NEXT:    call void @bar(i32 [[Y]])
+; CHECK-NEXT:    ret void
+; CHECK:       nope:
+; CHECK-NEXT:    call void @foo(i1 [[Z_0]])
+; CHECK-NEXT:    ret void
+;
+  %xz = icmp eq i32 %x, 0
+  %yz = icmp eq i32 %y, 0
+  %z = or i1 %xz, %yz
+  call void @llvm.assume(i1 %z)
+  br i1 %z, label %both, label %nope
+both:
+  call void @foo(i1 %xz)
+  call void @foo(i1 %yz)
+  call void @bar(i32 %x)
+  call void @bar(i32 %y)
+  ret void
+nope:
+  call void @foo(i1 %z)
+  ret void
+}
diff --git a/test/Transforms/Util/clone-dicompileunit.ll b/test/Transforms/Util/clone-dicompileunit.ll
new file mode 100644
index 0000000000000000000000000000000000000000..3f7b5981752d390a9bc20a87fb1e0507c979f9f8
--- /dev/null
+++ b/test/Transforms/Util/clone-dicompileunit.ll
@@ -0,0 +1,66 @@
+; RUN: opt -run-twice -verify -disable-debug-info-type-map -S -o - %s | FileCheck %s
+
+; Generated using:
+; $ cat p.cpp
+; void sink(void *);
+; class A {
+; public:
+;   template <typename> void m_fn2() { static int a; }
+;   virtual void m_fn1();
+; };
+; void foo() {
+;   class B : public A {
+;   public:
+;     B() { m_fn2<B>(); }
+;   };
+;   sink(new B);
+; }
+; $ clang++ -target x86_64-unknown-linux -fvisibility=hidden -O2 -g2 -flto -S p.cpp -o p.ll
+; # then manually removed function/gv definitions
+
+; Test that when the module is cloned it does not contain a reference to
+; the original DICompileUnit as a result of a collision between the cloned
+; DISubprogram for m_fn2<B> (which refers to the non-ODR entity B via
+; template parameters) and the original DISubprogram.
+
+; CHECK: DICompileUnit
+; CHECK-NOT: DICompileUnit
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux"
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!28, !29}
+!llvm.ident = !{!30}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 5.0.0 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, globals: !3)
+!1 = !DIFile(filename: "p.cpp", directory: "/usr/local/google/home/pcc/b682773-2-repro/small2")
+!2 = !{}
+!3 = !{!4}
+!4 = !DIGlobalVariableExpression(var: !5)
+!5 = distinct !DIGlobalVariable(name: "a", scope: !6, file: !1, line: 5, type: !27, isLocal: true, isDefinition: true)
+!6 = distinct !DISubprogram(name: "m_fn2<B>", linkageName: "_ZN1A5m_fn2IZ3foovE1BEEvv", scope: !7, file: !1, line: 5, type: !8, isLocal: true, isDefinition: true, scopeLine: 5, flags: DIFlagPrototyped, isOptimized: true, unit: !0, templateParams: !11, declaration: !23, variables: !24)
+!7 = !DICompositeType(tag: DW_TAG_class_type, name: "A", file: !1, line: 3, flags: DIFlagFwdDecl, identifier: "_ZTS1A")
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10}
+!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!11 = !{!12}
+!12 = !DITemplateTypeParameter(type: !13)
+!13 = distinct !DICompositeType(tag: DW_TAG_class_type, name: "B", scope: !14, file: !1, line: 10, size: 64, elements: !17, vtableHolder: !7)
+!14 = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov", scope: !1, file: !1, line: 9, type: !15, isLocal: false, isDefinition: true, scopeLine: 9, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !2)
+!15 = !DISubroutineType(types: !16)
+!16 = !{null}
+!17 = !{!18, !19}
+!18 = !DIDerivedType(tag: DW_TAG_inheritance, scope: !13, baseType: !7, flags: DIFlagPublic)
+!19 = !DISubprogram(name: "B", scope: !13, file: !1, line: 12, type: !20, isLocal: false, isDefinition: false, scopeLine: 12, flags: DIFlagPublic | DIFlagPrototyped, isOptimized: true)
+!20 = !DISubroutineType(types: !21)
+!21 = !{null, !22}
+!22 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !13, size: 64, flags: DIFlagArtificial | DIFlagObjectPointer)
+!23 = !DISubprogram(name: "m_fn2<B>", linkageName: "_ZN1A5m_fn2IZ3foovE1BEEvv", scope: !7, file: !1, line: 5, type: !8, isLocal: false, isDefinition: false, scopeLine: 5, flags: DIFlagPublic | DIFlagPrototyped, isOptimized: true, templateParams: !11)
+!24 = !{!25}
+!25 = !DILocalVariable(name: "this", arg: 1, scope: !6, type: !26, flags: DIFlagArtificial | DIFlagObjectPointer)
+!26 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64)
+!27 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!28 = !{i32 2, !"Dwarf Version", i32 4}
+!29 = !{i32 2, !"Debug Info Version", i32 3}
+!30 = !{!"clang version 5.0.0 "}
diff --git a/test/Transforms/Util/strip-nonlinetable-debuginfo-loops.ll b/test/Transforms/Util/strip-nonlinetable-debuginfo-loops.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5f88e31da9fc4c78d64165a0ecdfbce71da6f1e7
--- /dev/null
+++ b/test/Transforms/Util/strip-nonlinetable-debuginfo-loops.ll
@@ -0,0 +1,71 @@
+; RUN: opt -S -strip-nonlinetable-debuginfo %s -o %t
+; RUN: cat %t | FileCheck %s
+; RUN: cat %t | FileCheck %s --check-prefix=NEGATIVE
+; void f(volatile int *i) {
+;   while (--*i) {}
+; }
+source_filename = "/tmp/loop.c"
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12.0"
+
+define void @f(i32* %i) local_unnamed_addr #0 !dbg !7 {
+entry:
+  tail call void @llvm.dbg.value(metadata i32* %i, i64 0, metadata !14, metadata !15), !dbg !16
+  br label %while.cond, !dbg !17
+
+while.cond:                                       ; preds = %while.cond, %entry
+  %0 = load volatile i32, i32* %i, align 4, !dbg !18, !tbaa !19
+  %dec = add nsw i32 %0, -1, !dbg !18
+  store volatile i32 %dec, i32* %i, align 4, !dbg !18, !tbaa !19
+  %tobool = icmp eq i32 %dec, 0, !dbg !17
+  ; CHECK: !llvm.loop ![[LOOP:[0-9]+]]
+  br i1 %tobool, label %while.end, label %while.cond, !dbg !17, !llvm.loop !23
+
+while.end:                                        ; preds = %while.cond
+  ret void, !dbg !25
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #1
+
+attributes #0 = { nounwind ssp uwtable }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+; CHECK: ![[CU:.*]] = distinct !DICompileUnit(language: DW_LANG_C99,
+; CHECK-SAME:                                 emissionKind: LineTablesOnly
+; NEGATIVE-NOT: !DICompileUnit({{.*}} emissionKind: FullDebug
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (trunk 298880) (llvm/trunk 298875)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "/tmp/loop.c", directory: "/")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"PIC Level", i32 2}
+!6 = !{!"clang version 5.0.0 (trunk 298880) (llvm/trunk 298875)"}
+; CHECK: ![[F:[0-9]]] = distinct !DISubprogram(name: "f", scope: !1
+!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !13)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10}
+!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64)
+!11 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !12)
+; NEGATIVE-NOT: !DIBasicType(name: "int",
+!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!13 = !{!14}
+!14 = !DILocalVariable(name: "i", arg: 1, scope: !7, file: !1, line: 1, type: !10)
+!15 = !DIExpression()
+!16 = !DILocation(line: 1, column: 22, scope: !7)
+; CHECK: ![[BEGIN:[0-9]+]] = !DILocation(line: 2, column: 3, scope: ![[F]])
+!17 = !DILocation(line: 2, column: 3, scope: !7)
+!18 = !DILocation(line: 2, column: 10, scope: !7)
+!19 = !{!20, !20, i64 0}
+!20 = !{!"int", !21, i64 0}
+!21 = !{!"omnipotent char", !22, i64 0}
+!22 = !{!"Simple C/C++ TBAA"}
+; CHECK: ![[LOOP]] = distinct !{![[LOOP]], ![[BEGIN]], ![[END:[0-9]+]]}
+!23 = distinct !{!23, !17, !24}
+; CHECK: ![[END]] = !DILocation(line: 3, column: 3, scope: ![[F]])
+!24 = !DILocation(line: 3, column: 3, scope: !7)
+!25 = !DILocation(line: 4, column: 1, scope: !7)
diff --git a/test/Transforms/WholeProgramDevirt/Inputs/export.yaml b/test/Transforms/WholeProgramDevirt/Inputs/export.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..0f6f59de75224cc7f97b52e40ad83bbc248a0760
--- /dev/null
+++ b/test/Transforms/WholeProgramDevirt/Inputs/export.yaml
@@ -0,0 +1,20 @@
+---
+GlobalValueMap:
+  42:
+    - TypeTestAssumeVCalls:
+        - GUID: 14276520915468743435  # typeid1
+          Offset: 0
+      TypeCheckedLoadVCalls:
+        - GUID: 15427464259790519041  # typeid2
+          Offset: 0
+      TypeTestAssumeConstVCalls:
+        - VFunc:
+            GUID: 3515965990081467659  # typeid3
+            Offset: 0
+          Args: [12, 24]
+      TypeCheckedLoadConstVCalls:
+        - VFunc:
+            GUID: 17525413373118030901  # typeid4
+            Offset: 0
+          Args: [24, 12]
+...
diff --git a/test/Transforms/WholeProgramDevirt/Inputs/import-indir.yaml b/test/Transforms/WholeProgramDevirt/Inputs/import-indir.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..1cb3ad3f134c24879c3068fa49d77801d9f4ebd2
--- /dev/null
+++ b/test/Transforms/WholeProgramDevirt/Inputs/import-indir.yaml
@@ -0,0 +1,41 @@
+---
+GlobalValueMap:
+  42:
+    - TypeTestAssumeVCalls:
+        - GUID: 123
+          Offset: 0
+        - GUID: 456
+          Offset: 4
+      TypeCheckedLoadVCalls:
+        - GUID: 789
+          Offset: 8
+        - GUID: 1234
+          Offset: 16
+      TypeTestAssumeConstVCalls:
+        - VFunc:
+            GUID: 123
+            Offset: 4
+          Args: [12, 24]
+      TypeCheckedLoadConstVCalls:
+        - VFunc:
+            GUID: 456
+            Offset: 8
+          Args: [24, 12]
+TypeIdMap:
+  typeid1:
+    WPDRes:
+      0:
+        Kind: Indir
+      4:
+        Kind: Indir
+        ResByArg:
+          "":
+            Kind: UniformRetVal
+            Info: 12
+          12:
+            Kind: UniformRetVal
+            Info: 24
+          "12,24":
+            Kind: UniformRetVal
+            Info: 48
+...
diff --git a/test/Transforms/WholeProgramDevirt/Inputs/import-single-impl.yaml b/test/Transforms/WholeProgramDevirt/Inputs/import-single-impl.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..26764eb3b29c2f840902b486d51ff53088f87c32
--- /dev/null
+++ b/test/Transforms/WholeProgramDevirt/Inputs/import-single-impl.yaml
@@ -0,0 +1,13 @@
+---
+TypeIdMap:
+  typeid1:
+    WPDRes:
+      0:
+        Kind: SingleImpl
+        SingleImplName: singleimpl1
+  typeid2:
+    WPDRes:
+      8:
+        Kind: SingleImpl
+        SingleImplName: singleimpl2
+...
diff --git a/test/Transforms/WholeProgramDevirt/Inputs/import-uniform-ret-val.yaml b/test/Transforms/WholeProgramDevirt/Inputs/import-uniform-ret-val.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..f1daae63b678273fa9fd06c81a7fcb0cef780f43
--- /dev/null
+++ b/test/Transforms/WholeProgramDevirt/Inputs/import-uniform-ret-val.yaml
@@ -0,0 +1,19 @@
+---
+TypeIdMap:
+  typeid1:
+    WPDRes:
+      0:
+        Kind: Indir
+        ResByArg:
+          1:
+            Kind: UniformRetVal
+            Info: 42
+  typeid2:
+    WPDRes:
+      8:
+        Kind: Indir
+        ResByArg:
+          1:
+            Kind: UniformRetVal
+            Info: 42
+...
diff --git a/test/Transforms/WholeProgramDevirt/Inputs/import-unique-ret-val0.yaml b/test/Transforms/WholeProgramDevirt/Inputs/import-unique-ret-val0.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..597b17877767f2c7d1f7b579ef358626fc97fb2c
--- /dev/null
+++ b/test/Transforms/WholeProgramDevirt/Inputs/import-unique-ret-val0.yaml
@@ -0,0 +1,11 @@
+---
+TypeIdMap:
+  typeid2:
+    WPDRes:
+      8:
+        Kind: Indir
+        ResByArg:
+          3:
+            Kind: UniqueRetVal
+            Info: 0
+...
diff --git a/test/Transforms/WholeProgramDevirt/Inputs/import-unique-ret-val1.yaml b/test/Transforms/WholeProgramDevirt/Inputs/import-unique-ret-val1.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..737ef1173c3c8b37596c375fb0e5baf7a1948920
--- /dev/null
+++ b/test/Transforms/WholeProgramDevirt/Inputs/import-unique-ret-val1.yaml
@@ -0,0 +1,11 @@
+---
+TypeIdMap:
+  typeid2:
+    WPDRes:
+      8:
+        Kind: Indir
+        ResByArg:
+          3:
+            Kind: UniqueRetVal
+            Info: 1
+...
diff --git a/test/Transforms/WholeProgramDevirt/Inputs/import-vcp.yaml b/test/Transforms/WholeProgramDevirt/Inputs/import-vcp.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..4fbee126d0ea1bd3015d06c62ad535aa9f65dcd1
--- /dev/null
+++ b/test/Transforms/WholeProgramDevirt/Inputs/import-vcp.yaml
@@ -0,0 +1,19 @@
+---
+TypeIdMap:
+  typeid1:
+    WPDRes:
+      0:
+        Kind: Indir
+        ResByArg:
+          1:
+            Kind: VirtualConstProp
+            Info: 0
+  typeid2:
+    WPDRes:
+      8:
+        Kind: Indir
+        ResByArg:
+          3:
+            Kind: VirtualConstProp
+            Info: 0
+...
diff --git a/test/Transforms/WholeProgramDevirt/bad-read-from-vtable.ll b/test/Transforms/WholeProgramDevirt/bad-read-from-vtable.ll
index 4885be7775663d44f29b4accecc1d0b32b345e36..e5d0e74b22e2d8f8f5de6d9fee82612845d3263f 100644
--- a/test/Transforms/WholeProgramDevirt/bad-read-from-vtable.ll
+++ b/test/Transforms/WholeProgramDevirt/bad-read-from-vtable.ll
@@ -3,8 +3,8 @@
 target datalayout = "e-p:64:64"
 target triple = "x86_64-unknown-linux-gnu"
 
-@vt1 = global [2 x i8*] [i8* zeroinitializer, i8* bitcast (void (i8*)* @vf to i8*)], !type !0
-@vt2 = global i8* bitcast (void (i8*)* @vf to i8*), !type !1
+@vt1 = constant [2 x i8*] [i8* zeroinitializer, i8* bitcast (void (i8*)* @vf to i8*)], !type !0
+@vt2 = constant i8* bitcast (void (i8*)* @vf to i8*), !type !1
 
 define void @vf(i8* %this) {
   ret void
diff --git a/test/Transforms/WholeProgramDevirt/export-nothing.ll b/test/Transforms/WholeProgramDevirt/export-nothing.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e0814efbf9c0d984ff7521b2345c4bad02b09144
--- /dev/null
+++ b/test/Transforms/WholeProgramDevirt/export-nothing.ll
@@ -0,0 +1,7 @@
+; RUN: opt -wholeprogramdevirt -wholeprogramdevirt-summary-action=export -wholeprogramdevirt-write-summary=%t -o /dev/null %s
+; RUN: FileCheck %s < %t
+
+; CHECK: ---
+; CHECK-NEXT: GlobalValueMap:
+; CHECK-NEXT: TypeIdMap:
+; CHECK-NEXT: ...
diff --git a/test/Transforms/WholeProgramDevirt/export-single-impl.ll b/test/Transforms/WholeProgramDevirt/export-single-impl.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f4f3fd054c46f28db38376583fa063ea96e59b63
--- /dev/null
+++ b/test/Transforms/WholeProgramDevirt/export-single-impl.ll
@@ -0,0 +1,78 @@
+; RUN: opt -wholeprogramdevirt -wholeprogramdevirt-summary-action=export -wholeprogramdevirt-read-summary=%S/Inputs/export.yaml -wholeprogramdevirt-write-summary=%t -S -o - %s | FileCheck %s
+; RUN: FileCheck --check-prefix=SUMMARY %s < %t
+
+; SUMMARY:      TypeIdMap:
+; SUMMARY-NEXT:   typeid1:
+; SUMMARY-NEXT:     TTRes:
+; SUMMARY-NEXT:       Kind:            Unsat
+; SUMMARY-NEXT:       SizeM1BitWidth:  0
+; SUMMARY-NEXT:     WPDRes:
+; SUMMARY-NEXT:       0:
+; SUMMARY-NEXT:         Kind:            SingleImpl
+; SUMMARY-NEXT:         SingleImplName:  vf1
+; SUMMARY-NEXT:         ResByArg:
+; SUMMARY-NEXT:   typeid2:
+; SUMMARY-NEXT:     TTRes:
+; SUMMARY-NEXT:       Kind:            Unsat
+; SUMMARY-NEXT:       SizeM1BitWidth:  0
+; SUMMARY-NEXT:     WPDRes:
+; SUMMARY-NEXT:       0:
+; SUMMARY-NEXT:         Kind:            SingleImpl
+; SUMMARY-NEXT:         SingleImplName:  vf2
+; SUMMARY-NEXT:         ResByArg:
+; SUMMARY-NEXT:   typeid3:
+; SUMMARY-NEXT:     TTRes:
+; SUMMARY-NEXT:       Kind:            Unsat
+; SUMMARY-NEXT:       SizeM1BitWidth:  0
+; SUMMARY-NEXT:     WPDRes:
+; SUMMARY-NEXT:       0:
+; SUMMARY-NEXT:         Kind:            SingleImpl
+; SUMMARY-NEXT:         SingleImplName:  vf3
+; SUMMARY-NEXT:         ResByArg:
+; SUMMARY-NEXT:   typeid4:
+; SUMMARY-NEXT:     TTRes:
+; SUMMARY-NEXT:       Kind:            Unsat
+; SUMMARY-NEXT:       SizeM1BitWidth:  0
+; SUMMARY-NEXT:     WPDRes:
+; SUMMARY-NEXT:       0:
+; SUMMARY-NEXT:         Kind:            SingleImpl
+; SUMMARY-NEXT:         SingleImplName:  'vf4$merged'
+; SUMMARY-NEXT:         ResByArg:
+; SUMMARY-NEXT: ...
+
+; CHECK: @vt1 = constant void (i8*)* @vf1
+@vt1 = constant void (i8*)* @vf1, !type !0
+
+; CHECK: @vt2 = constant void (i8*)* @vf2
+@vt2 = constant void (i8*)* @vf2, !type !1
+
+@vt3 = constant void (i8*)* @vf3, !type !2
+
+; CHECK: @vt4 = constant void (i8*)* @"vf4$merged"
+@vt4 = constant void (i8*)* @vf4, !type !3
+
+@vt5 = constant void (i8*)* @vf5, !type !4
+
+; CHECK: declare void @vf1(i8*)
+declare void @vf1(i8*)
+
+; CHECK: define void @vf2(i8*)
+define void @vf2(i8*) {  ; externally visible definition; the check above expects the same name and linkage in the output
+  ret void
+}
+
+declare void @vf3(i8*)
+
+; CHECK: define hidden void @"vf4$merged"
+define internal void @vf4(i8*) {  ; internal here; the check above expects it to be emitted as hidden @"vf4$merged"
+  ret void
+}
+
+declare void @vf5(i8*)
+
+!0 = !{i32 0, !"typeid1"}
+!1 = !{i32 0, !"typeid2"}
+!2 = !{i32 0, !"typeid3"}
+!3 = !{i32 0, !"typeid4"}
+!4 = !{i32 0, !5}
+!5 = distinct !{}
diff --git a/test/Transforms/WholeProgramDevirt/export-uniform-ret-val.ll b/test/Transforms/WholeProgramDevirt/export-uniform-ret-val.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1d7030c41fd072fcf8f27a76430086958cba0324
--- /dev/null
+++ b/test/Transforms/WholeProgramDevirt/export-uniform-ret-val.ll
@@ -0,0 +1,36 @@
+; RUN: opt -wholeprogramdevirt -wholeprogramdevirt-summary-action=export -wholeprogramdevirt-read-summary=%S/Inputs/export.yaml -wholeprogramdevirt-write-summary=%t -S -o - %s | FileCheck %s
+; RUN: FileCheck --check-prefix=SUMMARY %s < %t
+
+; SUMMARY:     - TypeTests:
+; SUMMARY-NEXT:  TypeTestAssumeVCalls:
+
+; SUMMARY:      TypeIdMap:
+; SUMMARY-NEXT:   typeid4:
+; SUMMARY-NEXT:     TTRes:
+; SUMMARY-NEXT:       Kind:            Unsat
+; SUMMARY-NEXT:       SizeM1BitWidth:  0
+; SUMMARY-NEXT:     WPDRes:
+; SUMMARY-NEXT:       0:
+; SUMMARY-NEXT:         Kind:            Indir
+; SUMMARY-NEXT:         SingleImplName:  ''
+; SUMMARY-NEXT:         ResByArg:
+; SUMMARY-NEXT:           24,12:
+; SUMMARY-NEXT:             Kind:            UniformRetVal
+; SUMMARY-NEXT:             Info:            36
+
+; CHECK: @vt4a = constant i32 (i8*, i32, i32)* @vf4a
+@vt4a = constant i32 (i8*, i32, i32)* @vf4a, !type !0
+
+; CHECK: @vt4b = constant i32 (i8*, i32, i32)* @vf4b
+@vt4b = constant i32 (i8*, i32, i32)* @vf4b, !type !0
+
+define i32 @vf4a(i8*, i32 %x, i32 %y) {
+  %z = add i32 %x, %y  ; for the typeid4 Args [24, 12] listed in Inputs/export.yaml this is 24 + 12 = 36
+  ret i32 %z
+}
+
+define i32 @vf4b(i8*, i32 %x, i32 %y) {
+  ret i32 36  ; same value @vf4a yields for (24, 12); the summary is expected to record UniformRetVal with Info 36
+}
+
+!0 = !{i32 0, !"typeid4"}
diff --git a/test/Transforms/WholeProgramDevirt/export-unique-ret-val.ll b/test/Transforms/WholeProgramDevirt/export-unique-ret-val.ll
new file mode 100644
index 0000000000000000000000000000000000000000..174a573b5b0de9b6a82154b64ae0954d19bf7d17
--- /dev/null
+++ b/test/Transforms/WholeProgramDevirt/export-unique-ret-val.ll
@@ -0,0 +1,79 @@
+; RUN: opt -wholeprogramdevirt -wholeprogramdevirt-summary-action=export -wholeprogramdevirt-read-summary=%S/Inputs/export.yaml -wholeprogramdevirt-write-summary=%t -S -o - %s | FileCheck %s
+; RUN: FileCheck --check-prefix=SUMMARY %s < %t
+
+; SUMMARY:     - TypeTests:
+; SUMMARY-NEXT:  TypeTestAssumeVCalls:
+
+; SUMMARY:      TypeIdMap:
+; SUMMARY-NEXT:   typeid3:
+; SUMMARY-NEXT:     TTRes:
+; SUMMARY-NEXT:       Kind:            Unsat
+; SUMMARY-NEXT:       SizeM1BitWidth:  0
+; SUMMARY-NEXT:     WPDRes:
+; SUMMARY-NEXT:       0:
+; SUMMARY-NEXT:         Kind:            Indir
+; SUMMARY-NEXT:         SingleImplName:  ''
+; SUMMARY-NEXT:         ResByArg:
+; SUMMARY-NEXT:           12,24:
+; SUMMARY-NEXT:             Kind:            UniqueRetVal
+; SUMMARY-NEXT:             Info:            0
+; SUMMARY-NEXT:   typeid4:
+; SUMMARY-NEXT:     TTRes:
+; SUMMARY-NEXT:       Kind:            Unsat
+; SUMMARY-NEXT:       SizeM1BitWidth:  0
+; SUMMARY-NEXT:     WPDRes:
+; SUMMARY-NEXT:       0:
+; SUMMARY-NEXT:         Kind:            Indir
+; SUMMARY-NEXT:         SingleImplName:  ''
+; SUMMARY-NEXT:         ResByArg:
+; SUMMARY-NEXT:           24,12:
+; SUMMARY-NEXT:             Kind:            UniqueRetVal
+; SUMMARY-NEXT:             Info:            1
+
+; CHECK: @vt3a = constant i1 (i8*, i32, i32)* @vf3a
+@vt3a = constant i1 (i8*, i32, i32)* @vf3a, !type !0
+
+; CHECK: @vt3b = constant i1 (i8*, i32, i32)* @vf3b
+@vt3b = constant i1 (i8*, i32, i32)* @vf3b, !type !0
+
+; CHECK: @vt3c = constant i1 (i8*, i32, i32)* @vf3c
+@vt3c = constant i1 (i8*, i32, i32)* @vf3c, !type !0
+
+; CHECK: @vt4a = constant i1 (i8*, i32, i32)* @vf4a
+@vt4a = constant i1 (i8*, i32, i32)* @vf4a, !type !1
+
+; CHECK: @vt4b = constant i1 (i8*, i32, i32)* @vf4b
+@vt4b = constant i1 (i8*, i32, i32)* @vf4b, !type !1
+
+; CHECK: @vt4c = constant i1 (i8*, i32, i32)* @vf4c
+@vt4c = constant i1 (i8*, i32, i32)* @vf4c, !type !1
+
+; CHECK: @__typeid_typeid3_0_12_24_unique_member = hidden alias i8, bitcast (i1 (i8*, i32, i32)** @vt3b to i8*)
+; CHECK: @__typeid_typeid4_0_24_12_unique_member = hidden alias i8, bitcast (i1 (i8*, i32, i32)** @vt4b to i8*)
+
+define i1 @vf3a(i8*, i32, i32) {
+  ret i1 true  ; same value as @vf3c; among the typeid3 targets only @vf3b returns false
+}
+
+define i1 @vf3b(i8*, i32, i32) {
+  ret i1 false  ; the only false among the typeid3 targets: expected UniqueRetVal Info 0 with @vt3b as the unique member
+}
+
+define i1 @vf3c(i8*, i32, i32) {
+  ret i1 true  ; same value as @vf3a; among the typeid3 targets only @vf3b returns false
+}
+
+define i1 @vf4a(i8*, i32, i32) {
+  ret i1 false  ; same value as @vf4c; among the typeid4 targets only @vf4b returns true
+}
+
+define i1 @vf4b(i8*, i32, i32) {
+  ret i1 true  ; the only true among the typeid4 targets: expected UniqueRetVal Info 1 with @vt4b as the unique member
+}
+
+define i1 @vf4c(i8*, i32, i32) {
+  ret i1 false  ; same value as @vf4a; among the typeid4 targets only @vf4b returns true
+}
+
+!0 = !{i32 0, !"typeid3"}
+!1 = !{i32 0, !"typeid4"}
diff --git a/test/Transforms/WholeProgramDevirt/export-unsuccessful-checked.ll b/test/Transforms/WholeProgramDevirt/export-unsuccessful-checked.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0785ade28570b8eb9438bff2c6da1f46f0b765d7
--- /dev/null
+++ b/test/Transforms/WholeProgramDevirt/export-unsuccessful-checked.ll
@@ -0,0 +1,28 @@
+; RUN: opt -wholeprogramdevirt -wholeprogramdevirt-summary-action=export -wholeprogramdevirt-read-summary=%S/Inputs/export.yaml -wholeprogramdevirt-write-summary=%t -o /dev/null %s
+; RUN: FileCheck %s < %t
+
+; CHECK:     - TypeTests: [ 15427464259790519041, 17525413373118030901 ]
+; CHECK-NEXT:  TypeTestAssumeVCalls:
+
+@vt1a = constant void (i8*)* @vf1a, !type !0
+@vt1b = constant void (i8*)* @vf1b, !type !0
+@vt2a = constant void (i8*)* @vf2a, !type !1
+@vt2b = constant void (i8*)* @vf2b, !type !1
+@vt3a = constant void (i8*)* @vf3a, !type !2
+@vt3b = constant void (i8*)* @vf3b, !type !2
+@vt4a = constant void (i8*)* @vf4a, !type !3
+@vt4b = constant void (i8*)* @vf4b, !type !3
+
+declare void @vf1a(i8*)
+declare void @vf1b(i8*)
+declare void @vf2a(i8*)
+declare void @vf2b(i8*)
+declare void @vf3a(i8*)
+declare void @vf3b(i8*)
+declare void @vf4a(i8*)
+declare void @vf4b(i8*)
+
+!0 = !{i32 0, !"typeid1"}
+!1 = !{i32 0, !"typeid2"}
+!2 = !{i32 0, !"typeid3"}
+!3 = !{i32 0, !"typeid4"}
diff --git a/test/Transforms/WholeProgramDevirt/export-vcp.ll b/test/Transforms/WholeProgramDevirt/export-vcp.ll
new file mode 100644
index 0000000000000000000000000000000000000000..8e6e69b9bd439a5362f53d647f84fbe1d5d0bb12
--- /dev/null
+++ b/test/Transforms/WholeProgramDevirt/export-vcp.ll
@@ -0,0 +1,83 @@
+; RUN: opt -wholeprogramdevirt -wholeprogramdevirt-summary-action=export -wholeprogramdevirt-read-summary=%S/Inputs/export.yaml -wholeprogramdevirt-write-summary=%t -S -o - %s | FileCheck %s
+; RUN: FileCheck --check-prefix=SUMMARY %s < %t
+
+target datalayout = "e-p:64:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+; SUMMARY:      TypeIdMap:
+; SUMMARY-NEXT:   typeid3:
+; SUMMARY-NEXT:     TTRes:
+; SUMMARY-NEXT:       Kind:            Unsat
+; SUMMARY-NEXT:       SizeM1BitWidth:  0
+; SUMMARY-NEXT:     WPDRes:
+; SUMMARY-NEXT:       0:
+; SUMMARY-NEXT:         Kind:            Indir
+; SUMMARY-NEXT:         SingleImplName:  ''
+; SUMMARY-NEXT:         ResByArg:
+; SUMMARY-NEXT:           12,24:
+; SUMMARY-NEXT:             Kind:            VirtualConstProp
+; SUMMARY-NEXT:             Info:            0
+; SUMMARY-NEXT:   typeid4:
+; SUMMARY-NEXT:     TTRes:
+; SUMMARY-NEXT:       Kind:            Unsat
+; SUMMARY-NEXT:       SizeM1BitWidth:  0
+; SUMMARY-NEXT:     WPDRes:
+; SUMMARY-NEXT:       0:
+; SUMMARY-NEXT:         Kind:            Indir
+; SUMMARY-NEXT:         SingleImplName:  ''
+; SUMMARY-NEXT:         ResByArg:
+; SUMMARY-NEXT:           24,12:
+; SUMMARY-NEXT:             Kind:            VirtualConstProp
+; SUMMARY-NEXT:             Info:            0
+
+; CHECK: [[CVT3A:.*]] = private constant { [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] } { [8 x i8] zeroinitializer, i1 (i8*, i32, i32)* @vf0i1, [0 x i8] zeroinitializer }, !type !0
+@vt3a = constant i1 (i8*, i32, i32)* @vf0i1, !type !0
+
+; CHECK: [[CVT3B:.*]] = private constant { [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] } { [8 x i8] c"\00\00\00\00\00\00\00\01", i1 (i8*, i32, i32)* @vf1i1, [0 x i8] zeroinitializer }, !type !0
+@vt3b = constant i1 (i8*, i32, i32)* @vf1i1, !type !0
+
+; CHECK: [[CVT3C:.*]] = private constant { [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] } { [8 x i8] zeroinitializer, i1 (i8*, i32, i32)* @vf0i1, [0 x i8] zeroinitializer }, !type !0
+@vt3c = constant i1 (i8*, i32, i32)* @vf0i1, !type !0
+
+; CHECK: [[CVT3D:.*]] = private constant { [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] } { [8 x i8] c"\00\00\00\00\00\00\00\01", i1 (i8*, i32, i32)* @vf1i1, [0 x i8] zeroinitializer }, !type !0
+@vt3d = constant i1 (i8*, i32, i32)* @vf1i1, !type !0
+
+; CHECK: [[CVT4A:.*]] = private constant { [8 x i8], i32 (i8*, i32, i32)*, [0 x i8] } { [8 x i8] c"\00\00\00\00\01\00\00\00", i32 (i8*, i32, i32)* @vf1i32, [0 x i8] zeroinitializer }, !type !1
+@vt4a = constant i32 (i8*, i32, i32)* @vf1i32, !type !1
+
+; CHECK: [[CVT4B:.*]] = private constant { [8 x i8], i32 (i8*, i32, i32)*, [0 x i8] } { [8 x i8] c"\00\00\00\00\02\00\00\00", i32 (i8*, i32, i32)* @vf2i32, [0 x i8] zeroinitializer }, !type !1
+@vt4b = constant i32 (i8*, i32, i32)* @vf2i32, !type !1
+
+; CHECK: @__typeid_typeid3_0_12_24_byte = hidden alias i8, inttoptr (i32 -1 to i8*)
+; CHECK: @__typeid_typeid3_0_12_24_bit = hidden alias i8, inttoptr (i8 1 to i8*)
+; CHECK: @__typeid_typeid4_0_24_12_byte = hidden alias i8, inttoptr (i32 -4 to i8*)
+; CHECK: @__typeid_typeid4_0_24_12_bit = hidden alias i8, inttoptr (i8 1 to i8*)
+
+; CHECK: @vt3a = alias i1 (i8*, i32, i32)*, getelementptr inbounds ({ [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] }, { [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] }* [[CVT3A]], i32 0, i32 1)
+; CHECK: @vt3b = alias i1 (i8*, i32, i32)*, getelementptr inbounds ({ [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] }, { [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] }* [[CVT3B]], i32 0, i32 1)
+; CHECK: @vt3c = alias i1 (i8*, i32, i32)*, getelementptr inbounds ({ [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] }, { [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] }* [[CVT3C]], i32 0, i32 1)
+; CHECK: @vt3d = alias i1 (i8*, i32, i32)*, getelementptr inbounds ({ [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] }, { [8 x i8], i1 (i8*, i32, i32)*, [0 x i8] }* [[CVT3D]], i32 0, i32 1)
+; CHECK: @vt4a = alias i32 (i8*, i32, i32)*, getelementptr inbounds ({ [8 x i8], i32 (i8*, i32, i32)*, [0 x i8] }, { [8 x i8], i32 (i8*, i32, i32)*, [0 x i8] }* [[CVT4A]], i32 0, i32 1)
+; CHECK: @vt4b = alias i32 (i8*, i32, i32)*, getelementptr inbounds ({ [8 x i8], i32 (i8*, i32, i32)*, [0 x i8] }, { [8 x i8], i32 (i8*, i32, i32)*, [0 x i8] }* [[CVT4B]], i32 0, i32 1)
+
+define i1 @vf0i1(i8* %this, i32, i32) readnone {
+  ret i1 0  ; target of @vt3a and @vt3c; the expected rewritten vtables carry an all-zero 8-byte prefix for them
+}
+
+define i1 @vf1i1(i8* %this, i32, i32) readnone {
+  ret i1 1  ; target of @vt3b and @vt3d; their expected prefix ends in byte \01 (typeid3 byte alias -1, bit alias 1)
+}
+
+define i32 @vf1i32(i8* %this, i32, i32) readnone {
+  ret i32 1  ; target of @vt4a; the expected prefix holds \01\00\00\00 in the four bytes before the function pointer (typeid4 byte alias -4)
+}
+
+define i32 @vf2i32(i8* %this, i32, i32) readnone {
+  ret i32 2  ; target of @vt4b; the expected prefix holds \02\00\00\00 in the four bytes before the function pointer (typeid4 byte alias -4)
+}
+
+; CHECK: !0 = !{i32 8, !"typeid3"}
+; CHECK: !1 = !{i32 8, !"typeid4"}
+
+!0 = !{i32 0, !"typeid3"}
+!1 = !{i32 0, !"typeid4"}
diff --git a/test/Transforms/WholeProgramDevirt/import-indir.ll b/test/Transforms/WholeProgramDevirt/import-indir.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1de9352eeb22a65b141d5ca56cd4bc3b682a865d
--- /dev/null
+++ b/test/Transforms/WholeProgramDevirt/import-indir.ll
@@ -0,0 +1,95 @@
+; Test that we correctly import an indir resolution for type identifier "typeid1".
+; RUN: opt -S -wholeprogramdevirt -wholeprogramdevirt-summary-action=import -wholeprogramdevirt-read-summary=%S/Inputs/import-indir.yaml -wholeprogramdevirt-write-summary=%t < %s | FileCheck %s
+; RUN: FileCheck --check-prefix=SUMMARY %s < %t
+
+; SUMMARY:     GlobalValueMap:
+; SUMMARY-NEXT:  42:
+; SUMMARY-NEXT:    - TypeTests:
+; SUMMARY-NEXT:      TypeTestAssumeVCalls:
+; SUMMARY-NEXT:        - GUID:            123
+; SUMMARY-NEXT:          Offset:          0
+; SUMMARY-NEXT:        - GUID:            456
+; SUMMARY-NEXT:          Offset:          4
+; SUMMARY-NEXT:      TypeCheckedLoadVCalls:
+; SUMMARY-NEXT:        - GUID:            789
+; SUMMARY-NEXT:          Offset:          8
+; SUMMARY-NEXT:        - GUID:            1234
+; SUMMARY-NEXT:          Offset:          16
+; SUMMARY-NEXT:      TypeTestAssumeConstVCalls:
+; SUMMARY-NEXT:        - VFunc:
+; SUMMARY-NEXT:            GUID:            123
+; SUMMARY-NEXT:            Offset:          4
+; SUMMARY-NEXT:          Args: [ 12, 24 ]
+; SUMMARY-NEXT:      TypeCheckedLoadConstVCalls:
+; SUMMARY-NEXT:        - VFunc:
+; SUMMARY-NEXT:            GUID:            456
+; SUMMARY-NEXT:            Offset:          8
+; SUMMARY-NEXT:          Args: [ 24, 12 ]
+; SUMMARY-NEXT: TypeIdMap:
+; SUMMARY-NEXT:   typeid1:
+; SUMMARY-NEXT:     TTRes:
+; SUMMARY-NEXT:       Kind:            Unsat
+; SUMMARY-NEXT:       SizeM1BitWidth:  0
+; SUMMARY-NEXT:     WPDRes:
+; SUMMARY-NEXT:       0:
+; SUMMARY-NEXT:         Kind:            Indir
+; SUMMARY-NEXT:         SingleImplName:  ''
+; SUMMARY-NEXT:         ResByArg:
+; SUMMARY-NEXT:       4:
+; SUMMARY-NEXT:         Kind:            Indir
+; SUMMARY-NEXT:         SingleImplName:  ''
+; SUMMARY-NEXT:         ResByArg:
+; SUMMARY-NEXT:           :
+; SUMMARY-NEXT:             Kind:            UniformRetVal
+; SUMMARY-NEXT:             Info:            12
+; SUMMARY-NEXT:           12:
+; SUMMARY-NEXT:             Kind:            UniformRetVal
+; SUMMARY-NEXT:             Info:            24
+; SUMMARY-NEXT:           12,24:
+; SUMMARY-NEXT:             Kind:            UniformRetVal
+; SUMMARY-NEXT:             Info:            48
+
+target datalayout = "e-p:32:32"
+
+declare void @llvm.assume(i1)
+declare void @llvm.trap()
+declare {i8*, i1} @llvm.type.checked.load(i8*, i32, metadata)
+declare i1 @llvm.type.test(i8*, metadata)
+
+; CHECK: define i1 @f1
+define i1 @f1(i8* %obj) {
+  %vtableptr = bitcast i8* %obj to [1 x i8*]**
+  %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr
+  %vtablei8 = bitcast [1 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid1")
+  call void @llvm.assume(i1 %p)
+  %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 0  ; slot 0 = offset 0; Inputs/import-indir.yaml lists typeid1 offset 0 as Indir with no ResByArg
+  %fptr = load i8*, i8** %fptrptr
+  %fptr_casted = bitcast i8* %fptr to i1 (i8*, i32)*
+  ; CHECK: call i1 %
+  %result = call i1 %fptr_casted(i8* %obj, i32 5)  ; expected to stay an indirect call through a loaded pointer
+  ret i1 %result
+}
+
+; CHECK: define i1 @f2
+define i1 @f2(i8* %obj) {
+  %vtableptr = bitcast i8* %obj to [1 x i8*]**
+  %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr
+  %vtablei8 = bitcast [1 x i8*]* %vtable to i8*
+  %pair = call {i8*, i1} @llvm.type.checked.load(i8* %vtablei8, i32 4, metadata !"typeid1")  ; offset 4 of typeid1: Indir with three ResByArg entries in Inputs/import-indir.yaml
+  %fptr = extractvalue {i8*, i1} %pair, 0
+  %p = extractvalue {i8*, i1} %pair, 1
+  ; CHECK: [[P:%.*]] = call i1 @llvm.type.test
+  ; CHECK: br i1 [[P]]
+  br i1 %p, label %cont, label %trap  ; the directives above expect this condition to come from an llvm.type.test call in the output
+
+cont:
+  %fptr_casted = bitcast i8* %fptr to i1 (i8*, i32)*
+  ; CHECK: call i1 %
+  %result = call i1 %fptr_casted(i8* %obj, i32 undef)  ; undef argument; expected to stay an indirect call
+  ret i1 %result
+
+trap:
+  call void @llvm.trap()
+  unreachable
+}
diff --git a/test/Transforms/WholeProgramDevirt/import.ll b/test/Transforms/WholeProgramDevirt/import.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7f34b04ce1193658ad5d3631cafeec7c9f124db4
--- /dev/null
+++ b/test/Transforms/WholeProgramDevirt/import.ll
@@ -0,0 +1,108 @@
+; RUN: opt -S -wholeprogramdevirt -wholeprogramdevirt-summary-action=import -wholeprogramdevirt-read-summary=%S/Inputs/import-single-impl.yaml < %s | FileCheck --check-prefixes=CHECK,SINGLE-IMPL %s
+; RUN: opt -S -wholeprogramdevirt -wholeprogramdevirt-summary-action=import -wholeprogramdevirt-read-summary=%S/Inputs/import-uniform-ret-val.yaml < %s | FileCheck --check-prefixes=CHECK,UNIFORM-RET-VAL %s
+; RUN: opt -S -wholeprogramdevirt -wholeprogramdevirt-summary-action=import -wholeprogramdevirt-read-summary=%S/Inputs/import-unique-ret-val0.yaml < %s | FileCheck --check-prefixes=CHECK,UNIQUE-RET-VAL0 %s
+; RUN: opt -S -wholeprogramdevirt -wholeprogramdevirt-summary-action=import -wholeprogramdevirt-read-summary=%S/Inputs/import-unique-ret-val1.yaml < %s | FileCheck --check-prefixes=CHECK,UNIQUE-RET-VAL1 %s
+; RUN: opt -S -wholeprogramdevirt -wholeprogramdevirt-summary-action=import -wholeprogramdevirt-read-summary=%S/Inputs/import-vcp.yaml < %s | FileCheck --check-prefixes=CHECK,VCP,VCP64 %s
+; RUN: opt -S -wholeprogramdevirt -wholeprogramdevirt-summary-action=import -wholeprogramdevirt-read-summary=%S/Inputs/import-vcp.yaml -mtriple=i686-unknown-linux -data-layout=e-p:32:32 < %s | FileCheck --check-prefixes=CHECK,VCP,VCP32 %s
+
+target datalayout = "e-p:64:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+; VCP: @__typeid_typeid1_0_1_byte = external hidden global i8, !absolute_symbol !0
+; VCP: @__typeid_typeid1_0_1_bit = external hidden global i8, !absolute_symbol !1
+; VCP: @__typeid_typeid2_8_3_byte = external hidden global i8, !absolute_symbol !0
+; VCP: @__typeid_typeid2_8_3_bit = external hidden global i8, !absolute_symbol !1
+
+; Test cases where the argument values are known and we can apply virtual
+; constant propagation.
+
+; CHECK: define i32 @call1
+define i32 @call1(i8* %obj) {
+  %vtableptr = bitcast i8* %obj to [3 x i8*]**
+  %vtable = load [3 x i8*]*, [3 x i8*]** %vtableptr
+  %vtablei8 = bitcast [3 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid1")
+  call void @llvm.assume(i1 %p)
+  %fptrptr = getelementptr [3 x i8*], [3 x i8*]* %vtable, i32 0, i32 0  ; slot 0 = offset 0, the WPDRes key under typeid1 in the single-impl, uniform-ret-val and vcp inputs
+  %fptr = load i8*, i8** %fptrptr
+  %fptr_casted = bitcast i8* %fptr to i32 (i8*, i32)*
+  ; SINGLE-IMPL: call i32 bitcast (void ()* @singleimpl1 to i32 (i8*, i32)*)
+  %result = call i32 %fptr_casted(i8* %obj, i32 1)  ; constant argument 1 matches ResByArg key 1 (UniformRetVal Info 42 in import-uniform-ret-val.yaml)
+  ; UNIFORM-RET-VAL: ret i32 42
+  ; VCP: [[VT1:%.*]] = bitcast {{.*}} to i8*
+  ; VCP: [[GEP1:%.*]] = getelementptr i8, i8* [[VT1]], i32 ptrtoint (i8* @__typeid_typeid1_0_1_byte to i32)
+  ; VCP: [[BC1:%.*]] = bitcast i8* [[GEP1]] to i32*
+  ; VCP: [[LOAD1:%.*]] = load i32, i32* [[BC1]]
+  ; VCP: ret i32 [[LOAD1]]
+  ret i32 %result
+}
+
+; Test cases where the argument values are unknown, so we cannot apply virtual
+; constant propagation.
+
+; CHECK: define i1 @call2
+define i1 @call2(i8* %obj) {
+  %vtableptr = bitcast i8* %obj to [1 x i8*]**
+  %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr
+  %vtablei8 = bitcast [1 x i8*]* %vtable to i8*
+  %pair = call {i8*, i1} @llvm.type.checked.load(i8* %vtablei8, i32 8, metadata !"typeid2")  ; offset 8 is the WPDRes key listed under typeid2 in the input summaries
+  %fptr = extractvalue {i8*, i1} %pair, 0
+  %p = extractvalue {i8*, i1} %pair, 1
+  ; SINGLE-IMPL: br i1 true,
+  br i1 %p, label %cont, label %trap
+
+cont:
+  %fptr_casted = bitcast i8* %fptr to i1 (i8*, i32)*
+  ; SINGLE-IMPL: call i1 bitcast (void ()* @singleimpl2 to i1 (i8*, i32)*)
+  ; UNIFORM-RET-VAL: call i1 %
+  ; UNIQUE-RET-VAL0: call i1 %
+  ; UNIQUE-RET-VAL1: call i1 %
+  %result = call i1 %fptr_casted(i8* %obj, i32 undef)  ; undef argument: only the single-impl run is expected to devirtualize this call
+  ret i1 %result
+
+trap:
+  call void @llvm.trap()
+  unreachable
+}
+
+; CHECK: define i1 @call3
+define i1 @call3(i8* %obj) {
+  %vtableptr = bitcast i8* %obj to [1 x i8*]**
+  %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr
+  %vtablei8 = bitcast [1 x i8*]* %vtable to i8*
+  %pair = call {i8*, i1} @llvm.type.checked.load(i8* %vtablei8, i32 8, metadata !"typeid2")  ; offset 8 is the WPDRes key listed under typeid2 in the input summaries
+  %fptr = extractvalue {i8*, i1} %pair, 0
+  %p = extractvalue {i8*, i1} %pair, 1
+  br i1 %p, label %cont, label %trap
+
+cont:
+  %fptr_casted = bitcast i8* %fptr to i1 (i8*, i32)*
+  %result = call i1 %fptr_casted(i8* %obj, i32 3)  ; constant argument 3 matches ResByArg key 3 in the unique-ret-val and vcp inputs. NOTE(review): the section comment before @call2 speaks of unknown arguments, but this one is known
+  ; UNIQUE-RET-VAL0: icmp ne i8* %vtablei8, @__typeid_typeid2_8_3_unique_member
+  ; UNIQUE-RET-VAL1: icmp eq i8* %vtablei8, @__typeid_typeid2_8_3_unique_member
+  ; VCP: [[VT2:%.*]] = bitcast {{.*}} to i8*
+  ; VCP: [[GEP2:%.*]] = getelementptr i8, i8* [[VT2]], i32 ptrtoint (i8* @__typeid_typeid2_8_3_byte to i32)
+  ; VCP: [[LOAD2:%.*]] = load i8, i8* [[GEP2]]
+  ; VCP: [[AND2:%.*]] = and i8 [[LOAD2]], ptrtoint (i8* @__typeid_typeid2_8_3_bit to i8)
+  ; VCP: [[ICMP2:%.*]] = icmp ne i8 [[AND2]], 0
+  ; VCP: ret i1 [[ICMP2]]
+  ret i1 %result
+
+trap:
+  call void @llvm.trap()
+  unreachable
+}
+
+; SINGLE-IMPL-DAG: declare void @singleimpl1()
+; SINGLE-IMPL-DAG: declare void @singleimpl2()
+
+; VCP32: !0 = !{i32 -1, i32 -1}
+; VCP64: !0 = !{i64 0, i64 4294967296}
+
+; VCP32: !1 = !{i32 0, i32 256}
+; VCP64: !1 = !{i64 0, i64 256}
+
+declare void @llvm.assume(i1)
+declare void @llvm.trap()
+declare {i8*, i1} @llvm.type.checked.load(i8*, i32, metadata)
+declare i1 @llvm.type.test(i8*, metadata)
diff --git a/test/Transforms/WholeProgramDevirt/unique-retval.ll b/test/Transforms/WholeProgramDevirt/unique-retval.ll
index 50b938c43e4ad06bf1b340ca76ab7ee0e6fb9b5a..e9ae176fe8ac42ed51f928504de3b36bc565c7fa 100644
--- a/test/Transforms/WholeProgramDevirt/unique-retval.ll
+++ b/test/Transforms/WholeProgramDevirt/unique-retval.ll
@@ -33,8 +33,8 @@ define i1 @call1(i8* %obj) {
   ret i1 %result
 }
 
-; CHECK: define i1 @call2
-define i1 @call2(i8* %obj) {
+; CHECK: define i32 @call2
+define i32 @call2(i8* %obj) {
   %vtableptr = bitcast i8* %obj to [1 x i8*]**
   %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr
   ; CHECK: [[VT2:%[^ ]*]] = bitcast [1 x i8*]* {{.*}} to i8*
@@ -43,10 +43,13 @@ define i1 @call2(i8* %obj) {
   call void @llvm.assume(i1 %p)
   %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 0
   %fptr = load i8*, i8** %fptrptr
-  %fptr_casted = bitcast i8* %fptr to i1 (i8*)*
-  ; CHECK: [[RES1:%[^ ]*]] = icmp ne i8* [[VT1]], bitcast ([1 x i8*]* @vt2 to i8*)
-  %result = call i1 %fptr_casted(i8* %obj)
-  ret i1 %result
+  ; Intentional type mismatch (call site returns i32, the unique-retval compare is i1) to test zero extend.
+  %fptr_casted = bitcast i8* %fptr to i32 (i8*)*
+  ; CHECK: [[RES2:%[^ ]*]] = icmp ne i8* [[VT2]], bitcast ([1 x i8*]* @vt2 to i8*)
+  %result = call i32 %fptr_casted(i8* %obj)
+  ; CHECK: [[ZEXT2:%[^ ]*]] = zext i1 [[RES2]] to i32
+  ; CHECK: ret i32 [[ZEXT2]]
+  ret i32 %result
 }
 
 declare i1 @llvm.type.test(i8*, metadata)
diff --git a/test/Transforms/WholeProgramDevirt/vcp-accesses-memory.ll b/test/Transforms/WholeProgramDevirt/vcp-accesses-memory.ll
index b5d51f2d4637960267ee952a9be73f8e563a1f03..ca76383c49434c361a40da35c6011951d7d93d65 100644
--- a/test/Transforms/WholeProgramDevirt/vcp-accesses-memory.ll
+++ b/test/Transforms/WholeProgramDevirt/vcp-accesses-memory.ll
@@ -1,21 +1,37 @@
 ; RUN: opt -S -wholeprogramdevirt %s | FileCheck %s
+; RUN: opt -S -passes=wholeprogramdevirt %s | FileCheck %s
 
 target datalayout = "e-p:64:64"
 target triple = "x86_64-unknown-linux-gnu"
 
-@vt1 = global [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf1 to i8*)], !type !0
-@vt2 = global [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf2 to i8*)], !type !0
+@vt1 = constant [2 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf1a to i8*), i8* bitcast (i32 (i8*, i32)* @vf1b to i8*)], !type !0
+@vt2 = constant [2 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf2a to i8*), i8* bitcast (i32 (i8*, i32)* @vf2b to i8*)], !type !0
 
-define i32 @vf1(i8* %this, i32 %arg) {
+@sink = external global i32
+
+define i32 @vf1a(i8* %this, i32 %arg) {
+  store i32 %arg, i32* @sink  ; writes the external global @sink, so this body accesses memory
+  ret i32 %arg
+}
+
+define i32 @vf2a(i8* %this, i32 %arg) {
+  store i32 %arg, i32* @sink  ; writes the external global @sink, so this body accesses memory
+  ret i32 %arg
+}
+
+define i32 @vf1b(i8* %this, i32 %arg) {
   ret i32 %arg
 }
 
-define i32 @vf2(i8* %this, i32 %arg) {
+define i32 @vf2b(i8* %this, i32 %arg) {
   ret i32 %arg
 }
 
-; CHECK: define i32 @call
-define i32 @call(i8* %obj) {
+; Test that we don't apply VCP if the virtual function body accesses memory,
+; even if the function returns a constant.
+
+; CHECK: define i32 @call1
+define i32 @call1(i8* %obj) {
   %vtableptr = bitcast i8* %obj to [1 x i8*]**
   %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr
   %vtablei8 = bitcast [1 x i8*]* %vtable to i8*
@@ -29,6 +45,24 @@ define i32 @call(i8* %obj) {
   ret i32 %result
 }
 
+; Test that we can apply VCP regardless of the function attributes by analyzing
+; the function body itself.
+
+; CHECK: define i32 @call2
+define i32 @call2(i8* %obj) {
+  %vtableptr = bitcast i8* %obj to [1 x i8*]**
+  %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr
+  %vtablei8 = bitcast [1 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid")
+  call void @llvm.assume(i1 %p)
+  %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 1
+  %fptr = load i8*, i8** %fptrptr
+  %fptr_casted = bitcast i8* %fptr to i32 (i8*, i32)*
+  %result = call i32 %fptr_casted(i8* %obj, i32 1)
+  ; CHECK: ret i32 1
+  ret i32 %result
+}
+
 declare i1 @llvm.type.test(i8*, metadata)
 declare void @llvm.assume(i1)
 
diff --git a/test/Transforms/WholeProgramDevirt/vcp-decl.ll b/test/Transforms/WholeProgramDevirt/vcp-decl.ll
new file mode 100644
index 0000000000000000000000000000000000000000..1c4e2fbe97aa4cdb9581450a0134c90971e86351
--- /dev/null
+++ b/test/Transforms/WholeProgramDevirt/vcp-decl.ll
@@ -0,0 +1,32 @@
+; RUN: opt -S -wholeprogramdevirt %s | FileCheck %s
+
+target datalayout = "e-p:64:64"
+target triple = "x86_64-unknown-linux-gnu"
+
+@vt1 = constant [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf1 to i8*)], !type !0
+@vt2 = constant [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf2 to i8*)], !type !0
+
+declare i32 @vf1(i8* %this, i32 %arg) readnone
+
+define i32 @vf2(i8* %this, i32 %arg) readnone {
+  ret i32 %arg
+}
+
+; CHECK: define i32 @fn
+define i32 @fn(i8* %obj) {
+  %vtableptr = bitcast i8* %obj to [1 x i8*]**
+  %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr
+  %vtablei8 = bitcast [1 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid")
+  call void @llvm.assume(i1 %p)
+  %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 0
+  %fptr = load i8*, i8** %fptrptr
+  %fptr_casted = bitcast i8* %fptr to i32 (i8*, i32)*
+  ; CHECK: call i32 %
+  %result = call i32 %fptr_casted(i8* %obj, i32 1)
+  ret i32 %result
+}
+declare i1 @llvm.type.test(i8*, metadata)
+declare void @llvm.assume(i1)
+
+!0 = !{i32 0, !"typeid"}
diff --git a/test/Transforms/WholeProgramDevirt/vcp-no-this.ll b/test/Transforms/WholeProgramDevirt/vcp-no-this.ll
index c564665471cfc3e020651213d4953b8b0b11d0d0..ce76c8e6797e381e818fabcc6942f919e9aee2ee 100644
--- a/test/Transforms/WholeProgramDevirt/vcp-no-this.ll
+++ b/test/Transforms/WholeProgramDevirt/vcp-no-this.ll
@@ -3,8 +3,8 @@
 target datalayout = "e-p:64:64"
 target triple = "x86_64-unknown-linux-gnu"
 
-@vt1 = global [1 x i8*] [i8* bitcast (i32 ()* @vf1 to i8*)], !type !0
-@vt2 = global [1 x i8*] [i8* bitcast (i32 ()* @vf2 to i8*)], !type !0
+@vt1 = constant [1 x i8*] [i8* bitcast (i32 ()* @vf1 to i8*)], !type !0
+@vt2 = constant [1 x i8*] [i8* bitcast (i32 ()* @vf2 to i8*)], !type !0
 
 define i32 @vf1() readnone {
   ret i32 1
diff --git a/test/Transforms/WholeProgramDevirt/vcp-non-constant-arg.ll b/test/Transforms/WholeProgramDevirt/vcp-non-constant-arg.ll
index 197c923c3a1cb769c7f53754d3f9b9da502a9e56..cc2ff33296a9988f743732bb02789fc3decb136e 100644
--- a/test/Transforms/WholeProgramDevirt/vcp-non-constant-arg.ll
+++ b/test/Transforms/WholeProgramDevirt/vcp-non-constant-arg.ll
@@ -3,8 +3,8 @@
 target datalayout = "e-p:64:64"
 target triple = "x86_64-unknown-linux-gnu"
 
-@vt1 = global [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf1 to i8*)], !type !0
-@vt2 = global [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf2 to i8*)], !type !0
+@vt1 = constant [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf1 to i8*)], !type !0
+@vt2 = constant [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf2 to i8*)], !type !0
 
 define i32 @vf1(i8* %this, i32 %arg) readnone {
   ret i32 %arg
diff --git a/test/Transforms/WholeProgramDevirt/vcp-too-wide-ints.ll b/test/Transforms/WholeProgramDevirt/vcp-too-wide-ints.ll
index 93936d5e1d27aea1faab159627e763e74bf89037..c24c3b4be68323c2439f34ad435e8c36b3820ea6 100644
--- a/test/Transforms/WholeProgramDevirt/vcp-too-wide-ints.ll
+++ b/test/Transforms/WholeProgramDevirt/vcp-too-wide-ints.ll
@@ -3,33 +3,63 @@
 target datalayout = "e-p:64:64"
 target triple = "x86_64-unknown-linux-gnu"
 
-@vt1 = global [1 x i8*] [i8* bitcast (i128 (i8*, i128)* @vf1 to i8*)], !type !0
-@vt2 = global [1 x i8*] [i8* bitcast (i128 (i8*, i128)* @vf2 to i8*)], !type !0
+@vt1 = constant [1 x i8*] [i8* bitcast (i64 (i8*, i128)* @vf1 to i8*)], !type !0
+@vt2 = constant [1 x i8*] [i8* bitcast (i64 (i8*, i128)* @vf2 to i8*)], !type !0
+@vt3 = constant [1 x i8*] [i8* bitcast (i128 (i8*, i64)* @vf3 to i8*)], !type !1
+@vt4 = constant [1 x i8*] [i8* bitcast (i128 (i8*, i64)* @vf4 to i8*)], !type !1
 
-define i128 @vf1(i8* %this, i128 %arg) readnone {
-  ret i128 %arg
+define i64 @vf1(i8* %this, i128 %arg) readnone {
+  %argtrunc = trunc i128 %arg to i64
+  ret i64 %argtrunc
 }
 
-define i128 @vf2(i8* %this, i128 %arg) readnone {
-  ret i128 %arg
+define i64 @vf2(i8* %this, i128 %arg) readnone {
+  %argtrunc = trunc i128 %arg to i64
+  ret i64 %argtrunc
 }
 
-; CHECK: define i128 @call
-define i128 @call(i8* %obj) {
+define i128 @vf3(i8* %this, i64 %arg) readnone {
+  %argzext = zext i64 %arg to i128
+  ret i128 %argzext
+}
+
+define i128 @vf4(i8* %this, i64 %arg) readnone {
+  %argzext = zext i64 %arg to i128
+  ret i128 %argzext
+}
+
+; CHECK: define i64 @call1
+define i64 @call1(i8* %obj) {
+  %vtableptr = bitcast i8* %obj to [1 x i8*]**
+  %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr
+  %vtablei8 = bitcast [1 x i8*]* %vtable to i8*
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid1")
+  call void @llvm.assume(i1 %p)
+  %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 0
+  %fptr = load i8*, i8** %fptrptr
+  %fptr_casted = bitcast i8* %fptr to i64 (i8*, i128)*
+  ; CHECK: call i64 %
+  %result = call i64 %fptr_casted(i8* %obj, i128 1)
+  ret i64 %result
+}
+
+; CHECK: define i128 @call2
+define i128 @call2(i8* %obj) {
   %vtableptr = bitcast i8* %obj to [1 x i8*]**
   %vtable = load [1 x i8*]*, [1 x i8*]** %vtableptr
   %vtablei8 = bitcast [1 x i8*]* %vtable to i8*
-  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid")
+  %p = call i1 @llvm.type.test(i8* %vtablei8, metadata !"typeid2")
   call void @llvm.assume(i1 %p)
   %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 0
   %fptr = load i8*, i8** %fptrptr
-  %fptr_casted = bitcast i8* %fptr to i128 (i8*, i128)*
+  %fptr_casted = bitcast i8* %fptr to i128 (i8*, i64)*
   ; CHECK: call i128 %
-  %result = call i128 %fptr_casted(i8* %obj, i128 1)
+  %result = call i128 %fptr_casted(i8* %obj, i64 1)
   ret i128 %result
 }
 
 declare i1 @llvm.type.test(i8*, metadata)
 declare void @llvm.assume(i1)
 
-!0 = !{i32 0, !"typeid"}
+!0 = !{i32 0, !"typeid1"}
+!1 = !{i32 0, !"typeid2"}
diff --git a/test/Transforms/WholeProgramDevirt/vcp-type-mismatch.ll b/test/Transforms/WholeProgramDevirt/vcp-type-mismatch.ll
index 3124889a7070d7ab07d0a2f1439900a008490902..7016263f8f7ba2054756b327ddffae9a625ac177 100644
--- a/test/Transforms/WholeProgramDevirt/vcp-type-mismatch.ll
+++ b/test/Transforms/WholeProgramDevirt/vcp-type-mismatch.ll
@@ -1,10 +1,16 @@
 ; RUN: opt -S -wholeprogramdevirt %s | FileCheck %s
 
+; Test that we correctly handle function type mismatches in argument counts
+; and bitwidths. We handle an argument count mismatch by refusing
+; to optimize. For bitwidth mismatches, we allow the optimization in order
+; to simplify the implementation. This is legal because the bitwidth mismatch
+; gives the call undefined behavior.
+
 target datalayout = "e-p:64:64"
 target triple = "x86_64-unknown-linux-gnu"
 
-@vt1 = global [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf1 to i8*)], !type !0
-@vt2 = global [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf2 to i8*)], !type !0
+@vt1 = constant [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf1 to i8*)], !type !0
+@vt2 = constant [1 x i8*] [i8* bitcast (i32 (i8*, i32)* @vf2 to i8*)], !type !0
 
 define i32 @vf1(i8* %this, i32 %arg) readnone {
   ret i32 %arg
@@ -24,8 +30,8 @@ define i32 @bad_arg_type(i8* %obj) {
   %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 0
   %fptr = load i8*, i8** %fptrptr
   %fptr_casted = bitcast i8* %fptr to i32 (i8*, i64)*
-  ; CHECK: call i32 %
   %result = call i32 %fptr_casted(i8* %obj, i64 1)
+  ; CHECK: ret i32 1
   ret i32 %result
 }
 
@@ -54,8 +60,8 @@ define i64 @bad_return_type(i8* %obj) {
   %fptrptr = getelementptr [1 x i8*], [1 x i8*]* %vtable, i32 0, i32 0
   %fptr = load i8*, i8** %fptrptr
   %fptr_casted = bitcast i8* %fptr to i64 (i8*, i32)*
-  ; CHECK: call i64 %
   %result = call i64 %fptr_casted(i8* %obj, i32 1)
+  ; CHECK: ret i64 1
   ret i64 %result
 }
 
diff --git a/test/Transforms/WholeProgramDevirt/vcp-uses-this.ll b/test/Transforms/WholeProgramDevirt/vcp-uses-this.ll
index fc4dee37dba7567ed57102393bfef14158a77ee3..542402e1657727d554b7810bebab3d6a2c2cd750 100644
--- a/test/Transforms/WholeProgramDevirt/vcp-uses-this.ll
+++ b/test/Transforms/WholeProgramDevirt/vcp-uses-this.ll
@@ -3,8 +3,8 @@
 target datalayout = "e-p:64:64"
 target triple = "x86_64-unknown-linux-gnu"
 
-@vt1 = global [1 x i8*] [i8* bitcast (i32 (i8*)* @vf1 to i8*)], !type !0
-@vt2 = global [1 x i8*] [i8* bitcast (i32 (i8*)* @vf2 to i8*)], !type !0
+@vt1 = constant [1 x i8*] [i8* bitcast (i32 (i8*)* @vf1 to i8*)], !type !0
+@vt2 = constant [1 x i8*] [i8* bitcast (i32 (i8*)* @vf2 to i8*)], !type !0
 
 define i32 @vf1(i8* %this) readnone {
   %this_int = ptrtoint i8* %this to i32
diff --git a/test/Transforms/WholeProgramDevirt/virtual-const-prop-begin.ll b/test/Transforms/WholeProgramDevirt/virtual-const-prop-begin.ll
index 530fe8aa89d0ffc504371d0c1495af57cede5118..080ed6caac5ec6bbc1e27864f4e02bae34abd114 100644
--- a/test/Transforms/WholeProgramDevirt/virtual-const-prop-begin.ll
+++ b/test/Transforms/WholeProgramDevirt/virtual-const-prop-begin.ll
@@ -78,7 +78,7 @@ define i1 @call1(i8* %obj) {
   %fptrptr = getelementptr [3 x i8*], [3 x i8*]* %vtable, i32 0, i32 0
   %fptr = load i8*, i8** %fptrptr
   %fptr_casted = bitcast i8* %fptr to i1 (i8*)*
-  ; CHECK: [[VTGEP1:%[^ ]*]] = getelementptr i8, i8* [[VT1]], i64 -5
+  ; CHECK: [[VTGEP1:%[^ ]*]] = getelementptr i8, i8* [[VT1]], i32 -5
   ; CHECK: [[VTLOAD1:%[^ ]*]] = load i8, i8* [[VTGEP1]]
   ; CHECK: [[VTAND1:%[^ ]*]] = and i8 [[VTLOAD1]], 2
   ; CHECK: [[VTCMP1:%[^ ]*]] = icmp ne i8 [[VTAND1]], 0
@@ -98,7 +98,7 @@ define i1 @call2(i8* %obj) {
   %fptrptr = getelementptr [3 x i8*], [3 x i8*]* %vtable, i32 0, i32 1
   %fptr = load i8*, i8** %fptrptr
   %fptr_casted = bitcast i8* %fptr to i1 (i8*)*
-  ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, i8* [[VT2]], i64 -5
+  ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, i8* [[VT2]], i32 -5
   ; CHECK: [[VTLOAD2:%[^ ]*]] = load i8, i8* [[VTGEP2]]
   ; CHECK: [[VTAND2:%[^ ]*]] = and i8 [[VTLOAD2]], 1
   ; CHECK: [[VTCMP2:%[^ ]*]] = icmp ne i8 [[VTAND2]], 0
@@ -118,7 +118,7 @@ define i32 @call3(i8* %obj) {
   %fptrptr = getelementptr [3 x i8*], [3 x i8*]* %vtable, i32 0, i32 2
   %fptr = load i8*, i8** %fptrptr
   %fptr_casted = bitcast i8* %fptr to i32 (i8*)*
-  ; CHECK: [[VTGEP3:%[^ ]*]] = getelementptr i8, i8* [[VT3]], i64 -4
+  ; CHECK: [[VTGEP3:%[^ ]*]] = getelementptr i8, i8* [[VT3]], i32 -4
   ; CHECK: [[VTBC3:%[^ ]*]] = bitcast i8* [[VTGEP3]] to i32*
   ; CHECK: [[VTLOAD3:%[^ ]*]] = load i32, i32* [[VTBC3]]
   %result = call i32 %fptr_casted(i8* %obj)
diff --git a/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll b/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll
index fcf00d6d86c21c76c5c6374494e6a89c5b8548bb..3299f7bce65bca82c939e8afd43a6fbc2ec83d10 100644
--- a/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll
+++ b/test/Transforms/WholeProgramDevirt/virtual-const-prop-check.ll
@@ -87,7 +87,7 @@ define i1 @call1(i8* %obj) {
   %pair = call {i8*, i1} @llvm.type.checked.load(i8* %vtablei8, i32 0, metadata !"typeid")
   %fptr = extractvalue {i8*, i1} %pair, 0
   %fptr_casted = bitcast i8* %fptr to i1 (i8*)*
-  ; CHECK: [[VTGEP1:%[^ ]*]] = getelementptr i8, i8* [[VT1]], i64 -5
+  ; CHECK: [[VTGEP1:%[^ ]*]] = getelementptr i8, i8* [[VT1]], i32 -5
   ; CHECK: [[VTLOAD1:%[^ ]*]] = load i8, i8* [[VTGEP1]]
   ; CHECK: [[VTAND1:%[^ ]*]] = and i8 [[VTLOAD1]], 2
   ; CHECK: [[VTCMP1:%[^ ]*]] = icmp ne i8 [[VTAND1]], 0
@@ -108,7 +108,7 @@ define i1 @call2(i8* %obj) {
   %pair = call {i8*, i1} @llvm.type.checked.load(i8* %vtablei8, i32 8, metadata !"typeid")
   %fptr = extractvalue {i8*, i1} %pair, 0
   %fptr_casted = bitcast i8* %fptr to i1 (i8*)*
-  ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, i8* [[VT2]], i64 -5
+  ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, i8* [[VT2]], i32 -5
   ; CHECK: [[VTLOAD2:%[^ ]*]] = load i8, i8* [[VTGEP2]]
   ; CHECK: [[VTAND2:%[^ ]*]] = and i8 [[VTLOAD2]], 1
   ; CHECK: [[VTCMP2:%[^ ]*]] = icmp ne i8 [[VTAND2]], 0
@@ -129,7 +129,7 @@ define i32 @call3(i8* %obj) {
   %pair = call {i8*, i1} @llvm.type.checked.load(i8* %vtablei8, i32 16, metadata !"typeid")
   %fptr = extractvalue {i8*, i1} %pair, 0
   %fptr_casted = bitcast i8* %fptr to i32 (i8*)*
-  ; CHECK: [[VTGEP3:%[^ ]*]] = getelementptr i8, i8* [[VT3]], i64 -4
+  ; CHECK: [[VTGEP3:%[^ ]*]] = getelementptr i8, i8* [[VT3]], i32 -4
   ; CHECK: [[VTBC3:%[^ ]*]] = bitcast i8* [[VTGEP3]] to i32*
   ; CHECK: [[VTLOAD3:%[^ ]*]] = load i32, i32* [[VTBC3]]
   %result = call i32 %fptr_casted(i8* %obj)
diff --git a/test/Transforms/WholeProgramDevirt/virtual-const-prop-end.ll b/test/Transforms/WholeProgramDevirt/virtual-const-prop-end.ll
index 75ec6ba95ef1389a0033d4d3d919d25ab04c782f..14360c78d950228c691179d60f883ea5dad93f91 100644
--- a/test/Transforms/WholeProgramDevirt/virtual-const-prop-end.ll
+++ b/test/Transforms/WholeProgramDevirt/virtual-const-prop-end.ll
@@ -73,7 +73,7 @@ define i1 @call1(i8* %obj) {
   %fptrptr = getelementptr [3 x i8*], [3 x i8*]* %vtable, i32 0, i32 0
   %fptr = load i8*, i8** %fptrptr
   %fptr_casted = bitcast i8* %fptr to i1 (i8*)*
-  ; CHECK: [[VTGEP1:%[^ ]*]] = getelementptr i8, i8* [[VT1]], i64 28
+  ; CHECK: [[VTGEP1:%[^ ]*]] = getelementptr i8, i8* [[VT1]], i32 28
   ; CHECK: [[VTLOAD1:%[^ ]*]] = load i8, i8* [[VTGEP1]]
   ; CHECK: [[VTAND1:%[^ ]*]] = and i8 [[VTLOAD1]], 2
   ; CHECK: [[VTCMP1:%[^ ]*]] = icmp ne i8 [[VTAND1]], 0
@@ -93,7 +93,7 @@ define i1 @call2(i8* %obj) {
   %fptrptr = getelementptr [3 x i8*], [3 x i8*]* %vtable, i32 0, i32 1
   %fptr = load i8*, i8** %fptrptr
   %fptr_casted = bitcast i8* %fptr to i1 (i8*)*
-  ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, i8* [[VT2]], i64 28
+  ; CHECK: [[VTGEP2:%[^ ]*]] = getelementptr i8, i8* [[VT2]], i32 28
   ; CHECK: [[VTLOAD2:%[^ ]*]] = load i8, i8* [[VTGEP2]]
   ; CHECK: [[VTAND2:%[^ ]*]] = and i8 [[VTLOAD2]], 1
   ; CHECK: [[VTCMP2:%[^ ]*]] = icmp ne i8 [[VTAND2]], 0
@@ -113,7 +113,7 @@ define i32 @call3(i8* %obj) {
   %fptrptr = getelementptr [3 x i8*], [3 x i8*]* %vtable, i32 0, i32 2
   %fptr = load i8*, i8** %fptrptr
   %fptr_casted = bitcast i8* %fptr to i32 (i8*)*
-  ; CHECK: [[VTGEP3:%[^ ]*]] = getelementptr i8, i8* [[VT3]], i64 24
+  ; CHECK: [[VTGEP3:%[^ ]*]] = getelementptr i8, i8* [[VT3]], i32 24
   ; CHECK: [[VTBC3:%[^ ]*]] = bitcast i8* [[VTGEP3]] to i32*
   ; CHECK: [[VTLOAD3:%[^ ]*]] = load i32, i32* [[VTBC3]]
   %result = call i32 %fptr_casted(i8* %obj)
diff --git a/test/Verifier/amdgpu-cc.ll b/test/Verifier/amdgpu-cc.ll
new file mode 100644
index 0000000000000000000000000000000000000000..68c7f309b6e1ff081c2a797b3479f8455cb301f9
--- /dev/null
+++ b/test/Verifier/amdgpu-cc.ll
@@ -0,0 +1,55 @@
+; RUN: not llvm-as < %s 2>&1 | FileCheck %s
+
+; CHECK: Calling convention requires void return type
+; CHECK-NEXT: i32 ()* @nonvoid_cc_amdgpu_kernel
+define amdgpu_kernel i32 @nonvoid_cc_amdgpu_kernel() {
+  ret i32 0
+}
+
+; CHECK: Calling convention does not support varargs or perfect forwarding!
+; CHECK-NEXT: void (...)* @varargs_amdgpu_kernel
+define amdgpu_kernel void @varargs_amdgpu_kernel(...) {
+  ret void
+}
+
+; CHECK: Calling convention does not allow sret
+; CHECK-NEXT: void (i32*)* @sret_cc_amdgpu_kernel
+define amdgpu_kernel void @sret_cc_amdgpu_kernel(i32* sret %ptr) {
+  ret void
+}
+
+; CHECK: Calling convention does not support varargs or perfect forwarding!
+; CHECK-NEXT: void (...)* @varargs_amdgpu_vs
+define amdgpu_vs void @varargs_amdgpu_vs(...) {
+  ret void
+}
+
+; CHECK: Calling convention does not support varargs or perfect forwarding!
+; CHECK-NEXT: void (...)* @varargs_amdgpu_gs
+define amdgpu_gs void @varargs_amdgpu_gs(...) {
+  ret void
+}
+
+; CHECK: Calling convention does not support varargs or perfect forwarding!
+; CHECK-NEXT: void (...)* @varargs_amdgpu_ps
+define amdgpu_ps void @varargs_amdgpu_ps(...) {
+  ret void
+}
+
+; CHECK: Calling convention does not support varargs or perfect forwarding!
+; CHECK-NEXT: void (...)* @varargs_amdgpu_cs
+define amdgpu_cs void @varargs_amdgpu_cs(...) {
+  ret void
+}
+
+; CHECK: Calling convention requires void return type
+; CHECK-NEXT: i32 ()* @nonvoid_cc_spir_kernel
+define spir_kernel i32 @nonvoid_cc_spir_kernel() {
+  ret i32 0
+}
+
+; CHECK: Calling convention does not support varargs or perfect forwarding!
+; CHECK-NEXT: void (...)* @varargs_spir_kernel
+define spir_kernel void @varargs_spir_kernel(...) {
+  ret void
+}
diff --git a/test/Verifier/dbg-line-without-file.ll b/test/Verifier/dbg-line-without-file.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4d5725959ef1484cfd48d9e0d8a0c14de0473663
--- /dev/null
+++ b/test/Verifier/dbg-line-without-file.ll
@@ -0,0 +1,15 @@
+; RUN: not llvm-as -disable-output <%s 2>&1 | FileCheck %s
+; CHECK: assembly parsed, but does not verify
+; CHECK: line specified with no file
+
+define void @foo() !dbg !3 {
+  ret void
+}
+
+!llvm.module.flags = !{!0}
+!llvm.dbg.cu = !{!1}
+
+!0 = !{i32 2, !"Debug Info Version", i32 3}
+!1 = distinct !DICompileUnit(language: DW_LANG_C, file: !2)
+!2 = !DIFile(filename: "foo.c", directory: "")
+!3 = distinct !DISubprogram(name: "foo", scope: !1, line: 1, unit: !1)
diff --git a/test/Verifier/dbg-orphaned-compileunit.ll b/test/Verifier/dbg-orphaned-compileunit.ll
index 0be14a2fa66214f040609430545ab3da470632ce..9ab72824624df592eeb0bb26c74d9c1fdbc3c885 100644
--- a/test/Verifier/dbg-orphaned-compileunit.ll
+++ b/test/Verifier/dbg-orphaned-compileunit.ll
@@ -1,6 +1,7 @@
 ; RUN: not llvm-as -disable-output <%s 2>&1 | FileCheck %s
 ; CHECK:      assembly parsed, but does not verify
-; CHECK-NEXT: All DICompileUnits must be listed in llvm.dbg.cu
+; CHECK-NEXT: DICompileUnit not listed in llvm.dbg.cu
+; CHECK-NEXT: !0 = distinct !DICompileUnit(language: DW_LANG_Fortran77, file: !1, isOptimized: false, runtimeVersion: 0, emissionKind: NoDebug)
 
 !named = !{!1}
 !llvm.module.flags = !{!0}
diff --git a/test/Verifier/diderivedtype-address-space-atomic-type.ll b/test/Verifier/diderivedtype-address-space-atomic-type.ll
new file mode 100644
index 0000000000000000000000000000000000000000..f7926ed9494641392cf22651f00123d9acfda459
--- /dev/null
+++ b/test/Verifier/diderivedtype-address-space-atomic-type.ll
@@ -0,0 +1,6 @@
+; RUN: not opt -S < %s 2>&1 | FileCheck %s
+
+!named = !{!0, !1}
+!0 = !DIBasicType(tag: DW_TAG_base_type, name: "name", size: 1, align: 2, encoding: DW_ATE_unsigned_char)
+; CHECK: DWARF address space only applies to pointer or reference types
+!1 = !DIDerivedType(tag: DW_TAG_atomic_type, baseType: !0, size: 32, align: 32, dwarfAddressSpace: 1)
diff --git a/test/Verifier/diderivedtype-address-space-const-type.ll b/test/Verifier/diderivedtype-address-space-const-type.ll
new file mode 100644
index 0000000000000000000000000000000000000000..deba6394381677273e5cc4ad117bae5903dac2da
--- /dev/null
+++ b/test/Verifier/diderivedtype-address-space-const-type.ll
@@ -0,0 +1,6 @@
+; RUN: not opt -S < %s 2>&1 | FileCheck %s
+
+!named = !{!0, !1}
+!0 = !DIBasicType(tag: DW_TAG_base_type, name: "name", size: 1, align: 2, encoding: DW_ATE_unsigned_char)
+; CHECK: DWARF address space only applies to pointer or reference types
+!1 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !0, size: 32, align: 32, dwarfAddressSpace: 1)
diff --git a/test/Verifier/diderivedtype-address-space-friend.ll b/test/Verifier/diderivedtype-address-space-friend.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d3d3df47ed282541809981867976a5d7032a5b5d
--- /dev/null
+++ b/test/Verifier/diderivedtype-address-space-friend.ll
@@ -0,0 +1,6 @@
+; RUN: not opt -S < %s 2>&1 | FileCheck %s
+
+!named = !{!0, !1}
+!0 = !DIBasicType(tag: DW_TAG_base_type, name: "name", size: 1, align: 2, encoding: DW_ATE_unsigned_char)
+; CHECK: DWARF address space only applies to pointer or reference types
+!1 = !DIDerivedType(tag: DW_TAG_friend, baseType: !0, size: 32, align: 32, dwarfAddressSpace: 1)
diff --git a/test/Verifier/diderivedtype-address-space-inheritance.ll b/test/Verifier/diderivedtype-address-space-inheritance.ll
new file mode 100644
index 0000000000000000000000000000000000000000..2020f030d7e8759fccb3f23bced5ad11f7ea6f19
--- /dev/null
+++ b/test/Verifier/diderivedtype-address-space-inheritance.ll
@@ -0,0 +1,6 @@
+; RUN: not opt -S < %s 2>&1 | FileCheck %s
+
+!named = !{!0, !1}
+!0 = !DIBasicType(tag: DW_TAG_base_type, name: "name", size: 1, align: 2, encoding: DW_ATE_unsigned_char)
+; CHECK: DWARF address space only applies to pointer or reference types
+!1 = !DIDerivedType(tag: DW_TAG_inheritance, baseType: !0, size: 32, align: 32, dwarfAddressSpace: 1)
diff --git a/test/Verifier/diderivedtype-address-space-member.ll b/test/Verifier/diderivedtype-address-space-member.ll
new file mode 100644
index 0000000000000000000000000000000000000000..366bc4896bb24974f94f539bedea246599a0ea41
--- /dev/null
+++ b/test/Verifier/diderivedtype-address-space-member.ll
@@ -0,0 +1,6 @@
+; RUN: not opt -S < %s 2>&1 | FileCheck %s
+
+!named = !{!0, !1}
+!0 = !DIBasicType(tag: DW_TAG_base_type, name: "name", size: 1, align: 2, encoding: DW_ATE_unsigned_char)
+; CHECK: DWARF address space only applies to pointer or reference types
+!1 = !DIDerivedType(tag: DW_TAG_member, baseType: !0, size: 32, align: 32, dwarfAddressSpace: 1)
diff --git a/test/Verifier/diderivedtype-address-space-ptr-to-member-type.ll b/test/Verifier/diderivedtype-address-space-ptr-to-member-type.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0ae6539d3662260f23a8bb6088991f1264f0a914
--- /dev/null
+++ b/test/Verifier/diderivedtype-address-space-ptr-to-member-type.ll
@@ -0,0 +1,6 @@
+; RUN: not opt -S < %s 2>&1 | FileCheck %s
+
+!named = !{!0, !1}
+!0 = !DIBasicType(tag: DW_TAG_base_type, name: "name", size: 1, align: 2, encoding: DW_ATE_unsigned_char)
+; CHECK: DWARF address space only applies to pointer or reference types
+!1 = !DIDerivedType(tag: DW_TAG_ptr_to_member_type, baseType: !0, size: 32, align: 32, dwarfAddressSpace: 1)
diff --git a/test/Verifier/diderivedtype-address-space-restrict-type.ll b/test/Verifier/diderivedtype-address-space-restrict-type.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b140a9e28b40e4c435cb24dc1308707dbe4602ec
--- /dev/null
+++ b/test/Verifier/diderivedtype-address-space-restrict-type.ll
@@ -0,0 +1,6 @@
+; RUN: not opt -S < %s 2>&1 | FileCheck %s
+
+!named = !{!0, !1}
+!0 = !DIBasicType(tag: DW_TAG_base_type, name: "name", size: 1, align: 2, encoding: DW_ATE_unsigned_char)
+; CHECK: DWARF address space only applies to pointer or reference types
+!1 = !DIDerivedType(tag: DW_TAG_restrict_type, baseType: !0, size: 32, align: 32, dwarfAddressSpace: 1)
diff --git a/test/Verifier/diderivedtype-address-space-rvalue-reference-type.ll b/test/Verifier/diderivedtype-address-space-rvalue-reference-type.ll
new file mode 100644
index 0000000000000000000000000000000000000000..5bcdc3b8d527d1dd4b455005dbb8fe1507700088
--- /dev/null
+++ b/test/Verifier/diderivedtype-address-space-rvalue-reference-type.ll
@@ -0,0 +1,6 @@
+; RUN: not opt -S < %s 2>&1 | FileCheck %s
+
+!named = !{!0, !1}
+!0 = !DIBasicType(tag: DW_TAG_base_type, name: "name", size: 1, align: 2, encoding: DW_ATE_unsigned_char)
+; CHECK: DWARF address space only applies to pointer or reference types
+!1 = !DIDerivedType(tag: DW_TAG_rvalue_reference_type, baseType: !0, size: 32, align: 32, dwarfAddressSpace: 1)
diff --git a/test/Verifier/diderivedtype-address-space-typedef.ll b/test/Verifier/diderivedtype-address-space-typedef.ll
new file mode 100644
index 0000000000000000000000000000000000000000..03a5c6af88d3f0a60cb3d044ea6a9e176e72ddf3
--- /dev/null
+++ b/test/Verifier/diderivedtype-address-space-typedef.ll
@@ -0,0 +1,6 @@
+; RUN: not opt -S < %s 2>&1 | FileCheck %s
+
+!named = !{!0, !1}
+!0 = !DIBasicType(tag: DW_TAG_base_type, name: "name", size: 1, align: 2, encoding: DW_ATE_unsigned_char)
+; CHECK: DWARF address space only applies to pointer or reference types
+!1 = !DIDerivedType(tag: DW_TAG_typedef, baseType: !0, size: 32, align: 32, dwarfAddressSpace: 1)
diff --git a/test/Verifier/diderivedtype-address-space-volatile-type.ll b/test/Verifier/diderivedtype-address-space-volatile-type.ll
new file mode 100644
index 0000000000000000000000000000000000000000..e8e70bc7959acd0e3a1270c354b99f33dabb75d8
--- /dev/null
+++ b/test/Verifier/diderivedtype-address-space-volatile-type.ll
@@ -0,0 +1,6 @@
+; RUN: not opt -S < %s 2>&1 | FileCheck %s
+
+!named = !{!0, !1}
+!0 = !DIBasicType(tag: DW_TAG_base_type, name: "name", size: 1, align: 2, encoding: DW_ATE_unsigned_char)
+; CHECK: DWARF address space only applies to pointer or reference types
+!1 = !DIDerivedType(tag: DW_TAG_volatile_type, baseType: !0, size: 32, align: 32, dwarfAddressSpace: 1)
diff --git a/test/Verifier/diexpression-swap.ll b/test/Verifier/diexpression-swap.ll
new file mode 100644
index 0000000000000000000000000000000000000000..b227c54bfa8877713a95096c7d10b1809d7eff01
--- /dev/null
+++ b/test/Verifier/diexpression-swap.ll
@@ -0,0 +1,5 @@
+; RUN: not opt -S < %s 2>&1 | FileCheck %s
+
+!named = !{!0}
+; CHECK: invalid expression
+!0 = !DIExpression(DW_OP_swap)
diff --git a/test/Verifier/fnarg-debuginfo.ll b/test/Verifier/fnarg-debuginfo.ll
new file mode 100644
index 0000000000000000000000000000000000000000..7cbe9ce93b974de25f230ca13427a0c3eee285c3
--- /dev/null
+++ b/test/Verifier/fnarg-debuginfo.ll
@@ -0,0 +1,26 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+
+declare void @llvm.dbg.declare(metadata, metadata, metadata) 
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata)
+
+define void @foo() !dbg !2 {
+entry:
+  %a = alloca i32
+  ; CHECK: conflicting debug info for argument
+  call void @llvm.dbg.value(metadata i32 0, i64 0, metadata !3, metadata !DIExpression()), !dbg !6
+  call void @llvm.dbg.declare(metadata i32* %a, metadata !4, metadata !DIExpression()), !dbg !6
+  ret void, !dbg !6
+}
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!7, !8}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", emissionKind: FullDebug)
+!1 = !DIFile(filename: "x.c", directory: "/")
+!2 = distinct !DISubprogram(name: "foo", scope: !0, isDefinition: true, unit: !0)
+!3 = !DILocalVariable(name: "a", arg: 1, scope: !2, file: !1, line: 1, type: !5)
+!4 = !DILocalVariable(name: "b", arg: 1, scope: !2, file: !1, line: 1, type: !5)
+!5 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!6 = !DILocation(line: 1, scope: !2)
+!7 = !{i32 2, !"Dwarf Version", i32 4}
+!8 = !{i32 1, !"Debug Info Version", i32 3}
diff --git a/test/Verifier/fnarg-nodebug.ll b/test/Verifier/fnarg-nodebug.ll
new file mode 100644
index 0000000000000000000000000000000000000000..0c66f5d6d1a0a115c342f6e93c1882e39052ca30
--- /dev/null
+++ b/test/Verifier/fnarg-nodebug.ll
@@ -0,0 +1,59 @@
+; RUN: llvm-as < %s -o %t
+; RUN: llvm-dis < %t -o - | FileCheck %s
+; Created at -O1 from:
+; int sink(int);
+; __attribute__((always_inline)) int f(int i) { return sink(i); }
+; __attribute__((always_inline)) int g(int j) { return sink(j); }
+; __attribute__((nodebug)) int nodebug(int k) { return f(k)+g(k); }
+source_filename = "t.c"
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.12.0"
+
+declare i32 @sink(i32) local_unnamed_addr
+
+define i32 @nodebug(i32 %k) local_unnamed_addr #2 {
+entry:
+; This should not set off the FnArg Verifier. The two variables are in different scopes.
+  tail call void @llvm.dbg.value(metadata i32 %k, i64 0, metadata !12, metadata !13) #4, !dbg !14
+  %call.k = tail call i32 @sink(i32 %k) #4, !dbg !15
+  tail call void @llvm.dbg.value(metadata i32 %k, i64 0, metadata !19, metadata !13) #4, !dbg !20
+  %call.k3 = tail call i32 @sink(i32 %k) #4, !dbg !21
+  %add = add nsw i32 %call.k3, %call.k
+  ret i32 %add
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata, metadata) #3
+
+attributes #2 = { nounwind ssp uwtable }
+attributes #3 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0 (trunk 297153) (llvm/trunk 297155)", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "t.c", directory: "/tmp")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 1, !"PIC Level", i32 2}
+!6 = !{!"clang version 5.0.0 (trunk 297153) (llvm/trunk 297155)"}
+!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 2, type: !8, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !11)
+!8 = !DISubroutineType(types: !9)
+!9 = !{!10, !10}
+!10 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!11 = !{!12}
+; CHECK: !DILocalVariable(name: "i", arg: 1
+!12 = !DILocalVariable(name: "i", arg: 1, scope: !7, file: !1, line: 2, type: !10)
+!13 = !DIExpression()
+!14 = !DILocation(line: 2, column: 42, scope: !7)
+!15 = !DILocation(line: 2, column: 54, scope: !7)
+!16 = !DILocation(line: 2, column: 47, scope: !7)
+!17 = distinct !DISubprogram(name: "g", scope: !1, file: !1, line: 3, type: !8, isLocal: false, isDefinition: true, scopeLine: 3, flags: DIFlagPrototyped, isOptimized: true, unit: !0, variables: !18)
+!18 = !{!19}
+; CHECK: !DILocalVariable(name: "j", arg: 1
+!19 = !DILocalVariable(name: "j", arg: 1, scope: !17, file: !1, line: 3, type: !10)
+!20 = !DILocation(line: 3, column: 42, scope: !17)
+!21 = !DILocation(line: 3, column: 54, scope: !17)
+!22 = !DILocation(line: 3, column: 47, scope: !17)
diff --git a/test/Verifier/function-metadata-bad.ll b/test/Verifier/function-metadata-bad.ll
index 9e7ba225408e50a99775348d237f380b4340b14a..b3bd3c27c6d49253c27a76e947f377d67f882e67 100644
--- a/test/Verifier/function-metadata-bad.ll
+++ b/test/Verifier/function-metadata-bad.ll
@@ -14,7 +14,7 @@ define i32 @bad2() !prof !1 {
 }
 
 !1 = !{!"function_entry_count"}
-; CHECK-NEXT: !prof annotations should have exactly 2 operands
+; CHECK-NEXT: !prof annotations should have no less than 2 operands
 ; CHECK-NEXT: !1 = !{!"function_entry_count"}
 
 
diff --git a/test/Verifier/metadata-function-prof.ll b/test/Verifier/metadata-function-prof.ll
index d84a7fe544026eabc5a8a425e718487205d8d713..70548b1fa41f4bab7a0a88f462acb803f515ff41 100644
--- a/test/Verifier/metadata-function-prof.ll
+++ b/test/Verifier/metadata-function-prof.ll
@@ -12,4 +12,4 @@ define void @f3() !prof !0 !prof !0 {
   unreachable
 }
 
-!0 = !{}
+!0 = !{!"function_entry_count", i64 100}
diff --git a/test/tools/dsymutil/X86/generate-empty-CU.test b/test/tools/dsymutil/X86/generate-empty-CU.test
new file mode 100644
index 0000000000000000000000000000000000000000..233611460b62b62f0b321e850ea0c94b5508ae93
--- /dev/null
+++ b/test/tools/dsymutil/X86/generate-empty-CU.test
@@ -0,0 +1,33 @@
+# RUN: llvm-dsymutil -f -o - -oso-prepend-path=%p/.. -y %s | llvm-dwarfdump - | FileCheck %s
+
+# This test links the DWARF for an LTO binary and on purpose doesn't retain
+# any symbol in the second CU out of 3. This is the only case where dsymutil
+# will generate an empty CU and it requires special handling.
+
+---
+triple:          'x86_64-apple-darwin'
+objects:
+  - filename:        /Inputs/basic-lto.macho.x86_64.o
+    timestamp:       1417654896
+    symbols:
+      - { sym: _main, objAddr: 0x0000000000000000, binAddr: 0x0000000100000F40, size: 0x00000010 }
+      - { sym: _bar, objAddr: 0x0000000000000050, binAddr: 0x0000000100000F90, size: 0x00000024 }
+...
+
+.debug_info contents:
+CHECK: Compile Unit: length = 0x0000007d version = 0x0002 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000081)
+
+CHECK: DW_TAG_compile_unit
+CHECK:        DW_AT_name {{.*}} "basic1.c"
+CHECK:   DW_TAG_subprogram
+CHECK:          DW_AT_name {{.*}} "main"
+
+CHECK: Compile Unit: length = 0x00000007 version = 0x0002 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x0000008c)
+
+CHECK: Compile Unit: length = 0x00000089 version = 0x0002 abbr_offset = 0x0000 addr_size = 0x08 (next unit at 0x00000119)
+
+CHECK: DW_TAG_compile_unit
+CHECK:        DW_AT_name {{.*}} "basic3.c"
+
+CHECK:   DW_TAG_subprogram [7] *
+CHECK:          DW_AT_name {{.*}} = "bar"
diff --git a/test/tools/gold/X86/cache.ll b/test/tools/gold/X86/cache.ll
index cef983c4a1ac3a2ead6e65d6a904d2a892c5e08e..8d22a8606df319db3d8b6886c8f4ff1c82be08bb 100644
--- a/test/tools/gold/X86/cache.ll
+++ b/test/tools/gold/X86/cache.ll
@@ -2,7 +2,7 @@
 ; RUN: opt -module-summary %s -o %t.o
 ; RUN: opt -module-summary %p/Inputs/cache.ll -o %t2.o
 
-; RUN: rm -Rf %t.cache && mkdir %t.cache
+; RUN: rm -Rf %t.cache
 ; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold.so \
 ; RUN:     --plugin-opt=thinlto \
 ; RUN:     --plugin-opt=cache-dir=%t.cache \
@@ -16,7 +16,7 @@
 ; RUN: opt -module-hash -module-summary %s -o %t.o
 ; RUN: opt -module-hash -module-summary %p/Inputs/cache.ll -o %t2.o
 
-; RUN: rm -Rf %t.cache && mkdir %t.cache
+; RUN: rm -Rf %t.cache
 ; RUN: %gold -m elf_x86_64 -plugin %llvmshlibdir/LLVMgold.so \
 ; RUN:     --plugin-opt=thinlto \
 ; RUN:     --plugin-opt=cache-dir=%t.cache \
diff --git a/test/tools/gold/X86/stats.ll b/test/tools/gold/X86/stats.ll
index 15aa080d6fc0225c06935949fce94c7885ee3bf0..255a2bd90bcdc61ac98e2e57d9c7efc7588d3bbe 100644
--- a/test/tools/gold/X86/stats.ll
+++ b/test/tools/gold/X86/stats.ll
@@ -5,6 +5,13 @@
 ; RUN:    -m elf_x86_64 \
 ; RUN:    -plugin-opt=-stats %t.o -o %t2 2>&1 | FileCheck %s
 
+; RUN: llvm-as %s -o %t.o
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so  -shared \
+; RUN:    -m elf_x86_64 \
+; RUN:    -plugin-opt=thinlto \
+; RUN:    -plugin-opt=thinlto-index-only \
+; RUN:    -plugin-opt=-stats %t.o -o %t2 2>&1 | FileCheck %s
+
 ; CHECK: Statistics Collected
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
diff --git a/test/tools/gold/X86/thinlto_object_suffix_replace.ll b/test/tools/gold/X86/thinlto_object_suffix_replace.ll
new file mode 100644
index 0000000000000000000000000000000000000000..af4adad1655ec0e81b8f9f776fb131acb4e53f9f
--- /dev/null
+++ b/test/tools/gold/X86/thinlto_object_suffix_replace.ll
@@ -0,0 +1,41 @@
+; Test to make sure the thinlto-object-suffix-replace option is handled
+; correctly.
+
+; Generate bitcode file with summary, as well as a minimized bitcode without
+; the debug metadata for the thin link.
+; RUN: opt -thinlto-bc %s -thin-link-bitcode-file=%t1.thinlink.bc -o %t1.o
+
+; First perform the thin link on the normal bitcode file, and save the
+; resulting index.
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    -m elf_x86_64 \
+; RUN:    --plugin-opt=thinlto \
+; RUN:    --plugin-opt=thinlto-index-only \
+; RUN:    -shared %t1.o -o %t3
+; RUN: cp %t1.o.thinlto.bc %t1.o.thinlto.bc.orig
+
+; Next perform the thin link on the minimized bitcode file, and compare the
+; resulting index file to the one saved above to ensure they are identical.
+; RUN: rm -f %t1.o.thinlto.bc
+; Make sure it isn't inadvertently using the regular bitcode file.
+; RUN: rm -f %t1.o
+; RUN: %gold -plugin %llvmshlibdir/LLVMgold.so \
+; RUN:    -m elf_x86_64 \
+; RUN:    --plugin-opt=thinlto \
+; RUN:    --plugin-opt=thinlto-index-only \
+; RUN:    --plugin-opt=thinlto-object-suffix-replace=".thinlink.bc;.o" \
+; RUN:    -shared %t1.thinlink.bc -o %t3
+; RUN: diff %t1.o.thinlto.bc.orig %t1.o.thinlto.bc
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define void @f() {
+entry:
+  ret void
+}
+
+!llvm.dbg.cu = !{}
+
+!1 = !{i32 2, !"Debug Info Version", i32 3}
+!llvm.module.flags = !{!1}
diff --git a/test/tools/llvm-ar/Inputs/absolute-paths.lib b/test/tools/llvm-ar/Inputs/absolute-paths.lib
new file mode 100644
index 0000000000000000000000000000000000000000..a4d8147dfef585c9325d8a25b7b68a5e64c8cbb1
Binary files /dev/null and b/test/tools/llvm-ar/Inputs/absolute-paths.lib differ
diff --git a/test/tools/llvm-ar/absolute-paths.test b/test/tools/llvm-ar/absolute-paths.test
new file mode 100644
index 0000000000000000000000000000000000000000..0b42d7d2dcbd88e59c87e3a790e2e25bc3e6743b
--- /dev/null
+++ b/test/tools/llvm-ar/absolute-paths.test
@@ -0,0 +1,20 @@
+MSVC's lib.exe produces archives with absolute paths to the members. It's useful
+for llvm-ar to extract them to their basename in the CWD, since usually the
+directories in the path in the archive won't exist during archive extraction.
+
+Get a temp clean cwd to extract into.
+RUN: rm -rf %t && mkdir %t && cd %t
+
+RUN: llvm-ar t %S/Inputs/absolute-paths.lib | FileCheck %s --check-prefix=CHECK-LIST
+CHECK-LIST: C:/src/llvm-project/build/dne/b.o
+CHECK-LIST: C:/src/llvm-project/build/dne/a.o
+
+Check that a.o comes out and defines foo.
+RUN: llvm-ar x %S/Inputs/absolute-paths.lib 'C:/src/llvm-project/build/dne/a.o'
+RUN: llvm-nm a.o | FileCheck %s --check-prefix=CHECK-A
+CHECK-A: T foo
+
+Check that b.o comes out and defines bar.
+RUN: llvm-ar x %S/Inputs/absolute-paths.lib C:/src/llvm-project/build/dne/b.o
+RUN: llvm-nm b.o | FileCheck %s --check-prefix=CHECK-B
+CHECK-B: T bar
diff --git a/test/tools/llvm-config/paths.test b/test/tools/llvm-config/paths.test
new file mode 100644
index 0000000000000000000000000000000000000000..419f155ae1f830ce3e66dc93bb9ba2b3867ed671
--- /dev/null
+++ b/test/tools/llvm-config/paths.test
@@ -0,0 +1,21 @@
+# Check directory options for obvious issues.
+
+RUN: llvm-config --bindir 2>&1 | FileCheck --check-prefix=CHECK-BINDIR %s
+CHECK-BINDIR: {{.*}}{{/|\\}}bin
+CHECK-BINDIR-NOT: error:
+CHECK-BINDIR-NOT: warning
+
+RUN: llvm-config --includedir 2>&1 | FileCheck --check-prefix=CHECK-INCLUDEDIR %s
+CHECK-INCLUDEDIR: {{.*}}{{/|\\}}include
+CHECK-INCLUDEDIR-NOT: error:
+CHECK-INCLUDEDIR-NOT: warning
+
+RUN: llvm-config --libdir 2>&1 | FileCheck --check-prefix=CHECK-LIBDIR %s
+CHECK-LIBDIR: {{.*}}{{/|\\}}lib{{.*}}
+CHECK-LIBDIR-NOT: error:
+CHECK-LIBDIR-NOT: warning
+
+RUN: llvm-config --cmakedir 2>&1 | FileCheck --check-prefix=CHECK-CMAKEDIR %s
+CHECK-CMAKEDIR: {{.*}}{{/|\\}}cmake{{/|\\}}llvm
+CHECK-CMAKEDIR-NOT: error:
+CHECK-CMAKEDIR-NOT: warning
diff --git a/test/tools/llvm-cov/Inputs/multiple-files2.covmapping b/test/tools/llvm-cov/Inputs/multiple-files2.covmapping
new file mode 100644
index 0000000000000000000000000000000000000000..770817a53806e4f135a1ae4a79dad4ad0d5f135a
Binary files /dev/null and b/test/tools/llvm-cov/Inputs/multiple-files2.covmapping differ
diff --git a/test/tools/llvm-cov/demangle.test b/test/tools/llvm-cov/demangle.test
index 90a26b734067240a9ad8d64f293ec2ff7fe3e6f9..5ca113262e409700e60db2ac5d52345bd097f5af 100644
--- a/test/tools/llvm-cov/demangle.test
+++ b/test/tools/llvm-cov/demangle.test
@@ -4,5 +4,8 @@ RUN: llvm-cov show %S/Inputs/templateInstantiations.covmapping -instr-profile %S
 RUN: llvm-profdata merge %S/Inputs/hideUnexecutedSubviews.proftext -o %t.profdata
 RUN: llvm-cov show %S/Inputs/templateInstantiations.covmapping -instr-profile %t.profdata -Xdemangler sed -Xdemangler 's/_/X/g' -filename-equivalence %S/showTemplateInstantiations.cpp | FileCheck %s
 
+// Check that we demangle names when printing out function summaries.
+RUN: llvm-cov report -show-functions %S/Inputs/templateInstantiations.covmapping -instr-profile %S/Inputs/templateInstantiations.profdata -Xdemangler sed -Xdemangler 's/_/X/g' -filename-equivalence %S/showTemplateInstantiations.cpp | FileCheck %s
+
 CHECK-DAG: XZ4funcIbEiTX
 CHECK-DAG: XZ4funcIiEiTX
diff --git a/test/tools/llvm-cov/multiple-files.test b/test/tools/llvm-cov/multiple-files.test
index 0b3fb855fedcbddbacf318e89c71d3695737ea90..d0dbdd8c0fcfcb042f96d9d8a3bf3a1c0a3420bd 100644
--- a/test/tools/llvm-cov/multiple-files.test
+++ b/test/tools/llvm-cov/multiple-files.test
@@ -1,9 +1,15 @@
 // RUN: llvm-profdata merge %S/Inputs/multiple-files.proftext -o %t.profdata
-// RUN: llvm-cov report %S/Inputs/multiple-files.covmapping -instr-profile %t.profdata | FileCheck %s
+// RUN: llvm-cov report %S/Inputs/multiple-files.covmapping -instr-profile %t.profdata | FileCheck %s -check-prefix=MANY_COMPONENTS
+// RUN: llvm-cov report %S/Inputs/multiple-files2.covmapping -instr-profile %t.profdata | FileCheck %s -check-prefix=ONE_COMPONENT
 
-// CHECK: Filename
-// CHECK-NEXT: ---
-// CHECK-NEXT: {{^}}a{{[/\\]}}f2.c
-// CHECK-NEXT: {{^}}b{{[/\\]}}c{{[/\\]}}f4.c
-// CHECK-NEXT: {{^}}b{{[/\\]}}f3.c
-// CHECK-NEXT: {{^}}f1.c
+// MANY_COMPONENTS: Filename
+// MANY_COMPONENTS-NEXT: ---
+// MANY_COMPONENTS-NEXT: {{^}}a{{[/\\]}}f2.c
+// MANY_COMPONENTS-NEXT: {{^}}b{{[/\\]}}c{{[/\\]}}f4.c
+// MANY_COMPONENTS-NEXT: {{^}}b{{[/\\]}}f3.c
+// MANY_COMPONENTS-NEXT: {{^}}f1.c
+
+// ONE_COMPONENT: Filename
+// ONE_COMPONENT-NEXT: ---
+// ONE_COMPONENT-NEXT: {{^}}cov.c
+// ONE_COMPONENT-NEXT: {{^}}cov.h
diff --git a/test/tools/llvm-cov/report.cpp b/test/tools/llvm-cov/report.cpp
index c28dd7589408be46ffecbaf29401542a10571ef4..49425eb5f62415b525703d9f3591f6c53417bb3a 100644
--- a/test/tools/llvm-cov/report.cpp
+++ b/test/tools/llvm-cov/report.cpp
@@ -1,6 +1,6 @@
 // RUN: llvm-cov report %S/Inputs/report.covmapping -instr-profile %S/Inputs/report.profdata -filename-equivalence 2>&1 | FileCheck %s
-// RUN: llvm-cov report %S/Inputs/report.covmapping -instr-profile %S/Inputs/report.profdata -filename-equivalence report.cpp 2>&1 | FileCheck -check-prefix=FILT %s
-// RUN: llvm-cov report %S/Inputs/report.covmapping -instr-profile %S/Inputs/report.profdata -filename-equivalence report.cpp does-not-exist.cpp 2>&1 | FileCheck -check-prefix=FILT %s
+// RUN: llvm-cov report -show-functions %S/Inputs/report.covmapping -instr-profile %S/Inputs/report.profdata -filename-equivalence report.cpp 2>&1 | FileCheck -check-prefix=FILT %s
+// RUN: llvm-cov report -show-functions %S/Inputs/report.covmapping -instr-profile %S/Inputs/report.profdata -filename-equivalence report.cpp does-not-exist.cpp 2>&1 | FileCheck -check-prefix=FILT %s
 
 // CHECK: Regions    Missed Regions     Cover   Functions  Missed Functions  Executed  Instantiations   Missed Insts.  Executed       Lines      Missed Lines     Cover
 // CHECK-NEXT: ---
diff --git a/test/tools/llvm-cov/warnings.h b/test/tools/llvm-cov/warnings.h
index 0517b6a7c875f02a647025b0c2a70c3dcadb6cba..a06e02f92d56e3d674ed9ebfec5ff69506daebf7 100644
--- a/test/tools/llvm-cov/warnings.h
+++ b/test/tools/llvm-cov/warnings.h
@@ -1,5 +1,7 @@
 // RUN: llvm-cov show %S/Inputs/prevent_false_instantiations.covmapping -instr-profile %S/Inputs/elf_binary_comdat.profdata -filename-equivalence /dev/null | FileCheck %s -allow-empty -check-prefix=FAKE-FILE-STDOUT
 // RUN: llvm-cov show %S/Inputs/prevent_false_instantiations.covmapping -instr-profile %S/Inputs/elf_binary_comdat.profdata -filename-equivalence /dev/null 2>&1 | FileCheck %s -check-prefix=FAKE-FILE-STDERR
+// RUN: not llvm-cov report %S/Inputs/prevent_false_instantiations.covmapping -instr-profile %S/Inputs/elf_binary_comdat.profdata -format=html
+// RUN: not llvm-cov export %S/Inputs/prevent_false_instantiations.covmapping -instr-profile %S/Inputs/elf_binary_comdat.profdata -format=html
 
 // FAKE-FILE-STDOUT-NOT: warning: The file '{{.*}}' isn't covered.
 // FAKE-FILE-STDERR: warning: The file '{{.*}}' isn't covered.
diff --git a/test/tools/llvm-cxxfilt/coff-import.test b/test/tools/llvm-cxxfilt/coff-import.test
new file mode 100644
index 0000000000000000000000000000000000000000..35494d7a83269abae01bcc8c4fcf9b386d5258b1
--- /dev/null
+++ b/test/tools/llvm-cxxfilt/coff-import.test
@@ -0,0 +1,5 @@
+RUN: llvm-cxxfilt -_ ___imp__ZSt6futureIvE | FileCheck %s
+RUN: llvm-cxxfilt __imp__ZSt6futureIvE | FileCheck %s
+
+CHECK: import thunk for std::future<void>
+
diff --git a/test/tools/llvm-extract/recursive.ll b/test/tools/llvm-extract/recursive.ll
new file mode 100644
index 0000000000000000000000000000000000000000..54813dba796883c6035400e74278f959c6b5314e
--- /dev/null
+++ b/test/tools/llvm-extract/recursive.ll
@@ -0,0 +1,32 @@
+; RUN: llvm-extract -func=a --recursive %s -S | FileCheck --check-prefix=CHECK-AB %s
+; RUN: llvm-extract -func=a --recursive --delete %s -S | FileCheck --check-prefix=CHECK-CD %s
+; RUN: llvm-extract -func=d --recursive %s -S | FileCheck --check-prefix=CHECK-CD %s
+
+; CHECK-AB: define void @a
+; CHECK-AB: define void @b
+; CHECK-AB-NOT: define void @c
+; CHECK-AB-NOT: define void @d
+
+; CHECK-CD-NOT: define void @a
+; CHECK-CD-NOT: define void @b
+; CHECK-CD: define void @c
+; CHECK-CD: define void @d
+
+define void @a() {
+  call void @b()
+  ret void
+}
+
+define void @b() {
+  ret void
+}
+
+define void @c() {
+  call void @d()
+  ret void
+}
+
+define void @d() {
+  call void @c()
+  ret void
+}
diff --git a/test/tools/llvm-lto2/X86/nodatalayout.ll b/test/tools/llvm-lto2/X86/nodatalayout.ll
index ee5cfb0e47058f1d1f736c07af898cec2b12cd68..f5f44e3e4a9f62c7454be03dbf332d252cb63731 100644
--- a/test/tools/llvm-lto2/X86/nodatalayout.ll
+++ b/test/tools/llvm-lto2/X86/nodatalayout.ll
@@ -1,7 +1,7 @@
 ; RUN: llvm-as < %s > %t1.bc
 
 ; Reject input modules without a datalayout.
-; RUN: not llvm-lto2 %t1.bc -o %t.o \
+; RUN: not llvm-lto2 run %t1.bc -o %t.o \
 ; RUN:  -r %t1.bc,patatino,px 2>&1 | FileCheck %s
 
 ; CHECK: input module has no datalayout
diff --git a/test/tools/llvm-lto2/X86/pipeline.ll b/test/tools/llvm-lto2/X86/pipeline.ll
index e0d03bd932f3b03f0e91221c2a39945b873a82a0..dbec9ab225270b69055a7b70fff81387f026c25d 100644
--- a/test/tools/llvm-lto2/X86/pipeline.ll
+++ b/test/tools/llvm-lto2/X86/pipeline.ll
@@ -1,14 +1,14 @@
 ; RUN: llvm-as < %s > %t1.bc
 
 ; Try a custom pipeline
-; RUN: llvm-lto2 %t1.bc -o %t.o -save-temps \
+; RUN: llvm-lto2 run %t1.bc -o %t.o -save-temps \
 ; RUN:  -r %t1.bc,patatino,px -opt-pipeline loweratomic \
 ; RUN:  -aa-pipeline basic-aa
 ; RUN: llvm-dis < %t.o.0.4.opt.bc | FileCheck %s --check-prefix=CUSTOM
 
 ; Try the new pass manager LTO default pipeline (make sure the option
 ; is accepted).
-; RUN: llvm-lto2 %t1.bc -o %t.o -lto-use-new-pm -r %t1.bc,patatino,px
+; RUN: llvm-lto2 run %t1.bc -o %t.o -lto-use-new-pm -r %t1.bc,patatino,px
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -23,13 +23,13 @@ define void @patatino() {
 ; CUSTOM-NEXT: }
 
 ; Check that invalid pipelines are caught as errors.
-; RUN: not llvm-lto2 %t1.bc -o %t.o \
+; RUN: not llvm-lto2 run %t1.bc -o %t.o \
 ; RUN:  -r %t1.bc,patatino,px -opt-pipeline foogoo 2>&1 | \
 ; RUN:  FileCheck %s --check-prefix=ERR
 
 ; ERR: LLVM ERROR: unable to parse pass pipeline description: foogoo
 
-; RUN: not llvm-lto2 %t1.bc -o %t.o \
+; RUN: not llvm-lto2 run %t1.bc -o %t.o \
 ; RUN:  -r %t1.bc,patatino,px -aa-pipeline patatino \
 ; RUN:  -opt-pipeline loweratomic 2>&1 | \
 ; RUN:  FileCheck %s --check-prefix=AAERR
diff --git a/test/tools/llvm-lto2/errors.ll b/test/tools/llvm-lto2/errors.ll
index 25c05430c935c8d02d1287877aa8f13edb8275b5..aa12a67175997ab0e6c30678843ef2fdd77433eb 100644
--- a/test/tools/llvm-lto2/errors.ll
+++ b/test/tools/llvm-lto2/errors.ll
@@ -1,8 +1,8 @@
 ; RUN: llvm-as %s -o %t.bc
-; RUN: not llvm-lto2 -o %t2.o %t.bc 2>&1 | FileCheck --check-prefix=ERR1 %s
-; RUN: not llvm-lto2 -o %t2.o -r %t.bc,foo,p -r %t.bc,bar,p %t.bc 2>&1 | FileCheck --check-prefix=ERR2 %s
-; RUN: not llvm-lto2 -o %t2.o -r %t.bc,foo,q %t.bc 2>&1 | FileCheck --check-prefix=ERR3 %s
-; RUN: not llvm-lto2 -o %t2.o -r foo %t.bc 2>&1 | FileCheck --check-prefix=ERR4 %s
+; RUN: not llvm-lto2 run -o %t2.o %t.bc 2>&1 | FileCheck --check-prefix=ERR1 %s
+; RUN: not llvm-lto2 run -o %t2.o -r %t.bc,foo,p -r %t.bc,bar,p %t.bc 2>&1 | FileCheck --check-prefix=ERR2 %s
+; RUN: not llvm-lto2 run -o %t2.o -r %t.bc,foo,q %t.bc 2>&1 | FileCheck --check-prefix=ERR3 %s
+; RUN: not llvm-lto2 run -o %t2.o -r foo %t.bc 2>&1 | FileCheck --check-prefix=ERR4 %s
 
 ; ERR1: missing symbol resolution for {{.*}}.bc,foo
 ; ERR2: unused symbol resolution for {{.*}}.bc,bar
@@ -10,5 +10,6 @@
 ; ERR4: invalid resolution: foo
 
 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
 
 @foo = global i32 0
diff --git a/test/tools/llvm-objdump/AArch64/Inputs/print-armv8crypto.obj.macho-aarch64 b/test/tools/llvm-objdump/AArch64/Inputs/print-armv8crypto.obj.macho-aarch64
new file mode 100644
index 0000000000000000000000000000000000000000..b6ea824431c1cfa244707c59b311010eed342f80
Binary files /dev/null and b/test/tools/llvm-objdump/AArch64/Inputs/print-armv8crypto.obj.macho-aarch64 differ
diff --git a/test/tools/llvm-objdump/AArch64/mach-print-armv8crypto.test b/test/tools/llvm-objdump/AArch64/mach-print-armv8crypto.test
new file mode 100644
index 0000000000000000000000000000000000000000..e5ac0cb453e4ea78c435732f3db0fdf711dd0167
--- /dev/null
+++ b/test/tools/llvm-objdump/AArch64/mach-print-armv8crypto.test
@@ -0,0 +1,3 @@
+RUN: llvm-objdump -d -m -no-show-raw-insn %p/Inputs/print-armv8crypto.obj.macho-aarch64 | FileCheck %s
+
+CHECK: 0:	sha1su0.4s	v0, v1, v2
diff --git a/test/tools/llvm-objdump/AArch64/macho-print-mrs.test b/test/tools/llvm-objdump/AArch64/macho-print-mrs.test
index cc1d14faf8d33ead818aef93805a832b070c8fdb..c629a4de38768e51bb4c62bf2509629b01693b5d 100644
--- a/test/tools/llvm-objdump/AArch64/macho-print-mrs.test
+++ b/test/tools/llvm-objdump/AArch64/macho-print-mrs.test
@@ -1,3 +1,3 @@
 RUN: llvm-objdump -d -m -no-show-raw-insn %p/Inputs/print-mrs.obj.macho-aarch64 | FileCheck %s
 
-CHECK: 0:  mrs x0, S3_7_C15_C2_0
+CHECK: 0:  mrs x0, CPM_IOACC_CTL_EL3
diff --git a/test/tools/llvm-objdump/AMDGPU/Inputs/source-lines.cl b/test/tools/llvm-objdump/AMDGPU/Inputs/source-lines.cl
new file mode 100644
index 0000000000000000000000000000000000000000..9179056318bdc5f85847c49cf74a0ab10e52f152
--- /dev/null
+++ b/test/tools/llvm-objdump/AMDGPU/Inputs/source-lines.cl
@@ -0,0 +1,6 @@
+kernel void source_lines_test(global int *Out) {
+  int var0 = 0x777;
+  int var1 = 0x888;
+  int var2 = var0 + var1;
+  *Out = var2;
+}
diff --git a/test/tools/llvm-objdump/AMDGPU/lit.local.cfg b/test/tools/llvm-objdump/AMDGPU/lit.local.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..2a665f06be72e5515ca6e27018facb35daa201be
--- /dev/null
+++ b/test/tools/llvm-objdump/AMDGPU/lit.local.cfg
@@ -0,0 +1,2 @@
+if not 'AMDGPU' in config.root.targets:
+    config.unsupported = True
diff --git a/test/tools/llvm-objdump/AMDGPU/source-lines.ll b/test/tools/llvm-objdump/AMDGPU/source-lines.ll
new file mode 100644
index 0000000000000000000000000000000000000000..94c4952e338617b173d235ebdd609c3d08e27713
--- /dev/null
+++ b/test/tools/llvm-objdump/AMDGPU/source-lines.ll
@@ -0,0 +1,109 @@
+; RUN: sed -e "s,SRC_COMPDIR,%/p/Inputs,g" %s > %t.ll
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx800 -filetype=obj -O0 -o %t.o %t.ll
+; RUN: llvm-objdump -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -disassemble -line-numbers %t.o | FileCheck --check-prefix=LINE %t.ll
+; RUN: llvm-objdump -triple=amdgcn-amd-amdhsa -mcpu=gfx800 -disassemble -source %t.o | FileCheck --check-prefix=SOURCE %t.ll
+
+; Prologue.
+; LINE:      source_lines_test:
+; LINE-NEXT: ; {{.*}}source-lines.cl:1
+; Kernel.
+; LINE: ; {{.*}}source-lines.cl:2
+; LINE: v_mov_b32_e32 v{{[0-9]+}}, 0x777
+; LINE: ; {{.*}}source-lines.cl:3
+; LINE: v_mov_b32_e32 v{{[0-9]+}}, 0x888
+; LINE: ; {{.*}}source-lines.cl:4
+; LINE: v_add_i32_e32
+; LINE: ; {{.*}}source-lines.cl:5
+; LINE: flat_store_dword
+; Epilogue.
+; LINE:      ; {{.*}}source-lines.cl:6
+; LINE-NEXT: s_endpgm
+
+; Prologue.
+; SOURCE:      source_lines_test:
+; SOURCE-NEXT: ; kernel void source_lines_test(global int *Out) {
+; Kernel.
+; SOURCE: ; int var0 = 0x777;
+; SOURCE: v_mov_b32_e32 v{{[0-9]+}}, 0x777
+; SOURCE: ; int var1 = 0x888;
+; SOURCE: v_mov_b32_e32 v{{[0-9]+}}, 0x888
+; SOURCE: ; int var2 = var0 + var1;
+; SOURCE: v_add_i32_e32
+; SOURCE: ; *Out = var2;
+; SOURCE: flat_store_dword
+; Epilogue.
+; SOURCE:      ; }
+; SOURCE-NEXT: s_endpgm
+
+; ModuleID = 'source-lines.cl'
+source_filename = "source-lines.cl"
+target datalayout = "e-p:32:32-p1:64:64-p2:64:64-p3:32:32-p4:64:64-p5:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64"
+target triple = "amdgcn-amd-amdhsa"
+
+; Function Attrs: noinline nounwind
+define amdgpu_kernel void @source_lines_test(i32 addrspace(1)* %Out) #0 !dbg !7 !kernel_arg_addr_space !12 !kernel_arg_access_qual !13 !kernel_arg_type !14 !kernel_arg_base_type !14 !kernel_arg_type_qual !15 {
+entry:
+  %Out.addr = alloca i32 addrspace(1)*, align 4
+  %var0 = alloca i32, align 4
+  %var1 = alloca i32, align 4
+  %var2 = alloca i32, align 4
+  store i32 addrspace(1)* %Out, i32 addrspace(1)** %Out.addr, align 4
+  call void @llvm.dbg.declare(metadata i32 addrspace(1)** %Out.addr, metadata !16, metadata !17), !dbg !18
+  call void @llvm.dbg.declare(metadata i32* %var0, metadata !19, metadata !17), !dbg !20
+  store i32 1911, i32* %var0, align 4, !dbg !20
+  call void @llvm.dbg.declare(metadata i32* %var1, metadata !21, metadata !17), !dbg !22
+  store i32 2184, i32* %var1, align 4, !dbg !22
+  call void @llvm.dbg.declare(metadata i32* %var2, metadata !23, metadata !17), !dbg !24
+  %0 = load i32, i32* %var0, align 4, !dbg !25
+  %1 = load i32, i32* %var1, align 4, !dbg !26
+  %add = add nsw i32 %0, %1, !dbg !27
+  store i32 %add, i32* %var2, align 4, !dbg !24
+  %2 = load i32, i32* %var2, align 4, !dbg !28
+  %3 = load i32 addrspace(1)*, i32 addrspace(1)** %Out.addr, align 4, !dbg !29
+  store i32 %2, i32 addrspace(1)* %3, align 4, !dbg !30
+  ret void, !dbg !31
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+attributes #0 = { noinline nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="gfx700" "target-features"="+fp64-fp16-denormals,-fp32-denormals" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+
+!llvm.dbg.cu = !{!0}
+!opencl.ocl.version = !{!3}
+!llvm.module.flags = !{!4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 5.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "source-lines.cl", directory: "SRC_COMPDIR")
+!2 = !{}
+!3 = !{i32 1, i32 0}
+!4 = !{i32 2, !"Dwarf Version", i32 2}
+!5 = !{i32 2, !"Debug Info Version", i32 3}
+!6 = !{!"clang version 5.0.0"}
+!7 = distinct !DISubprogram(name: "source_lines_test", scope: !1, file: !1, line: 1, type: !8, isLocal: false, isDefinition: true, scopeLine: 1, flags: DIFlagPrototyped, isOptimized: false, unit: !0, variables: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10}
+!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64)
+!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
+!12 = !{i32 1}
+!13 = !{!"none"}
+!14 = !{!"int*"}
+!15 = !{!""}
+!16 = !DILocalVariable(name: "Out", arg: 1, scope: !7, file: !1, line: 1, type: !10)
+!17 = !DIExpression()
+!18 = !DILocation(line: 1, column: 43, scope: !7)
+!19 = !DILocalVariable(name: "var0", scope: !7, file: !1, line: 2, type: !11)
+!20 = !DILocation(line: 2, column: 7, scope: !7)
+!21 = !DILocalVariable(name: "var1", scope: !7, file: !1, line: 3, type: !11)
+!22 = !DILocation(line: 3, column: 7, scope: !7)
+!23 = !DILocalVariable(name: "var2", scope: !7, file: !1, line: 4, type: !11)
+!24 = !DILocation(line: 4, column: 7, scope: !7)
+!25 = !DILocation(line: 4, column: 14, scope: !7)
+!26 = !DILocation(line: 4, column: 21, scope: !7)
+!27 = !DILocation(line: 4, column: 19, scope: !7)
+!28 = !DILocation(line: 5, column: 10, scope: !7)
+!29 = !DILocation(line: 5, column: 4, scope: !7)
+!30 = !DILocation(line: 5, column: 8, scope: !7)
+!31 = !DILocation(line: 6, column: 1, scope: !7)
diff --git a/test/tools/llvm-objdump/Inputs/macho-bind-add-addr-imm-scaled b/test/tools/llvm-objdump/Inputs/macho-bind-add-addr-imm-scaled
new file mode 100755
index 0000000000000000000000000000000000000000..2180437408c94cc7aaae414196e8411aeb1e3b86
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-bind-add-addr-imm-scaled differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-bind-add_addr_uleb b/test/tools/llvm-objdump/Inputs/macho-bind-add_addr_uleb
new file mode 100755
index 0000000000000000000000000000000000000000..fc950db155a6ce16f9240b28df24ffdecf306e60
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-bind-add_addr_uleb differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-bind-bad-opcode-value b/test/tools/llvm-objdump/Inputs/macho-bind-bad-opcode-value
new file mode 100755
index 0000000000000000000000000000000000000000..c9195314c8e19ca3e87385c0662968f96474b34e
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-bind-bad-opcode-value differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-bind-bind-add-addr-uleb b/test/tools/llvm-objdump/Inputs/macho-bind-bind-add-addr-uleb
new file mode 100755
index 0000000000000000000000000000000000000000..11abd6246b8d9355a26f232b56f344cff0d9e6fe
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-bind-bind-add-addr-uleb differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-bind-do-bind-no-segIndex b/test/tools/llvm-objdump/Inputs/macho-bind-do-bind-no-segIndex
new file mode 100755
index 0000000000000000000000000000000000000000..cc4f09708c424f05a822b21641fe7f0469a0aa64
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-bind-do-bind-no-segIndex differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-bind-dylib-ordinal-uleb b/test/tools/llvm-objdump/Inputs/macho-bind-dylib-ordinal-uleb
new file mode 100755
index 0000000000000000000000000000000000000000..7769195d44c92c59455b0de8975a0cec1b1e519a
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-bind-dylib-ordinal-uleb differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-bind-dylib-ordinal-uleb-malformed-uleb128 b/test/tools/llvm-objdump/Inputs/macho-bind-dylib-ordinal-uleb-malformed-uleb128
new file mode 100755
index 0000000000000000000000000000000000000000..0d5410e976e654462beb2c85be1345e477f85eb9
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-bind-dylib-ordinal-uleb-malformed-uleb128 differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-bind-dylib-ordinal-uleb-too-big b/test/tools/llvm-objdump/Inputs/macho-bind-dylib-ordinal-uleb-too-big
new file mode 100755
index 0000000000000000000000000000000000000000..40564b5a262fbaf6c7296127cf85733b5e91b245
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-bind-dylib-ordinal-uleb-too-big differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-bind-dylib-special-imm b/test/tools/llvm-objdump/Inputs/macho-bind-dylib-special-imm
new file mode 100755
index 0000000000000000000000000000000000000000..09bf10ded89602b3af42264411730dc36d837bc1
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-bind-dylib-special-imm differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-bind-seg-too-big b/test/tools/llvm-objdump/Inputs/macho-bind-seg-too-big
new file mode 100755
index 0000000000000000000000000000000000000000..20be9957919e6e4e4a24fd007e7b1491531304dc
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-bind-seg-too-big differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-bind-segoff-too-big b/test/tools/llvm-objdump/Inputs/macho-bind-segoff-too-big
new file mode 100755
index 0000000000000000000000000000000000000000..3f8e5ee8384c0182115e39e23959b5f86a6a5f43
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-bind-segoff-too-big differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-bind-set-addend-sleb b/test/tools/llvm-objdump/Inputs/macho-bind-set-addend-sleb
new file mode 100755
index 0000000000000000000000000000000000000000..726b96d3de3059f6ddaf13453d4bf8e377b73fde
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-bind-set-addend-sleb differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-bind-set-symbol b/test/tools/llvm-objdump/Inputs/macho-bind-set-symbol
new file mode 100755
index 0000000000000000000000000000000000000000..b8201c3ad198586e3afdaf8ebe2ebe40db51ee3a
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-bind-set-symbol differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-bind-set-type-imm b/test/tools/llvm-objdump/Inputs/macho-bind-set-type-imm
new file mode 100755
index 0000000000000000000000000000000000000000..002057e6b86fbf1bb66f7ebd42b18944ffe4cbbd
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-bind-set-type-imm differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-bind-uleb-times-skipping-uleb b/test/tools/llvm-objdump/Inputs/macho-bind-uleb-times-skipping-uleb
new file mode 100755
index 0000000000000000000000000000000000000000..81ab8130f66ac658b46a3936c16e3fc26b4303f8
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-bind-uleb-times-skipping-uleb differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-do-bind-no-dylib-ordinal b/test/tools/llvm-objdump/Inputs/macho-do-bind-no-dylib-ordinal
new file mode 100755
index 0000000000000000000000000000000000000000..77daede78684f8df17f1ef5ba1f726c00a7d538b
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-do-bind-no-dylib-ordinal differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-do-bind-no-symbol b/test/tools/llvm-objdump/Inputs/macho-do-bind-no-symbol
new file mode 100755
index 0000000000000000000000000000000000000000..0592b9bfe407e78d5d9dfd357b5f61dfd1aefa8e
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-do-bind-no-symbol differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-lazy-do-bind-add-addr-imm-scaled b/test/tools/llvm-objdump/Inputs/macho-lazy-do-bind-add-addr-imm-scaled
new file mode 100755
index 0000000000000000000000000000000000000000..a7d5abeef743ec39b38df8fc08d3d0b008e459bf
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-lazy-do-bind-add-addr-imm-scaled differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-lazy-do-bind-uleb-times-skipping-uleb b/test/tools/llvm-objdump/Inputs/macho-lazy-do-bind-uleb-times-skipping-uleb
new file mode 100755
index 0000000000000000000000000000000000000000..1f0288342c48439f6322f64ce0ee13ea7f706e1a
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-lazy-do-bind-uleb-times-skipping-uleb differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-lazy-do_bind_add_addr_uleb b/test/tools/llvm-objdump/Inputs/macho-lazy-do_bind_add_addr_uleb
new file mode 100755
index 0000000000000000000000000000000000000000..63f034688ff293912cca95339e53bd792a3d1d89
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-lazy-do_bind_add_addr_uleb differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-rebase-add-addr-imm-scaled b/test/tools/llvm-objdump/Inputs/macho-rebase-add-addr-imm-scaled
new file mode 100755
index 0000000000000000000000000000000000000000..6b0c1bd4566d8d1352526b06bc251c45130400ae
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-rebase-add-addr-imm-scaled differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-rebase-add-addr-uleb b/test/tools/llvm-objdump/Inputs/macho-rebase-add-addr-uleb
new file mode 100755
index 0000000000000000000000000000000000000000..e409590dc2dc95565e4fd462a2e737f8bdf1c14e
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-rebase-add-addr-uleb differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-rebase-add-addr-uleb-too-big b/test/tools/llvm-objdump/Inputs/macho-rebase-add-addr-uleb-too-big
new file mode 100755
index 0000000000000000000000000000000000000000..68b72ec6a3b5156056c4a13caf112942c7f4b43b
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-rebase-add-addr-uleb-too-big differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-rebase-bad-opcode-value b/test/tools/llvm-objdump/Inputs/macho-rebase-bad-opcode-value
new file mode 100755
index 0000000000000000000000000000000000000000..59e0d4fe619b3e69d6a420ff99163402de0eb9c7
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-rebase-bad-opcode-value differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-rebase-imm-times b/test/tools/llvm-objdump/Inputs/macho-rebase-imm-times
new file mode 100755
index 0000000000000000000000000000000000000000..be2286baf6b3a27c6f0c32451e0fc5d22fd6aad6
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-rebase-imm-times differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-rebase-seg-too-big b/test/tools/llvm-objdump/Inputs/macho-rebase-seg-too-big
new file mode 100755
index 0000000000000000000000000000000000000000..12b52328a9684d8e89b8a8498cbcc16120c97e8a
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-rebase-seg-too-big differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-rebase-segoff-too-big b/test/tools/llvm-objdump/Inputs/macho-rebase-segoff-too-big
new file mode 100755
index 0000000000000000000000000000000000000000..4dfb19dea80f77972ad3c374e76ca623dae7f590
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-rebase-segoff-too-big differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-rebase-set-type-imm b/test/tools/llvm-objdump/Inputs/macho-rebase-set-type-imm
new file mode 100755
index 0000000000000000000000000000000000000000..947db0ee915f435659ff6b833bef998f0f6b3ee9
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-rebase-set-type-imm differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-rebase-uleb-malformed-uleb128 b/test/tools/llvm-objdump/Inputs/macho-rebase-uleb-malformed-uleb128
new file mode 100755
index 0000000000000000000000000000000000000000..045f425b2233f284215124a6ad0742e338c0b26d
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-rebase-uleb-malformed-uleb128 differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-rebase-uleb-times b/test/tools/llvm-objdump/Inputs/macho-rebase-uleb-times
new file mode 100755
index 0000000000000000000000000000000000000000..c12f256f66002f2d371bdefceaf971757441b300
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-rebase-uleb-times differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-rebase-uleb-times-skipping-uleb b/test/tools/llvm-objdump/Inputs/macho-rebase-uleb-times-skipping-uleb
new file mode 100755
index 0000000000000000000000000000000000000000..5bec8ca115788fde25c3ca10ac7aafdf7bd24980
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-rebase-uleb-times-skipping-uleb differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-weak-bind-set-dylib-ordinal-imm b/test/tools/llvm-objdump/Inputs/macho-weak-bind-set-dylib-ordinal-imm
new file mode 100755
index 0000000000000000000000000000000000000000..1d8785c55d0377eeeff7a77349ed427972d857e7
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-weak-bind-set-dylib-ordinal-imm differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-weak-bind-set-dylib-ordinal-uleb b/test/tools/llvm-objdump/Inputs/macho-weak-bind-set-dylib-ordinal-uleb
new file mode 100755
index 0000000000000000000000000000000000000000..bf7babc09e6fb14331716e4298db5e6922632e82
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-weak-bind-set-dylib-ordinal-uleb differ
diff --git a/test/tools/llvm-objdump/Inputs/macho-weak-bind-set-dylib-special-imm b/test/tools/llvm-objdump/Inputs/macho-weak-bind-set-dylib-special-imm
new file mode 100755
index 0000000000000000000000000000000000000000..d13f6ec981eba92f4f792438d88102503bdaab23
Binary files /dev/null and b/test/tools/llvm-objdump/Inputs/macho-weak-bind-set-dylib-special-imm differ
diff --git a/test/tools/llvm-objdump/Inputs/test.wasm b/test/tools/llvm-objdump/Inputs/test.wasm
index b24ac79c716374a77d9ffc06f183251720b1c438..d3906eeaf6f8618a9b148d56e0997714655bd847 100644
Binary files a/test/tools/llvm-objdump/Inputs/test.wasm and b/test/tools/llvm-objdump/Inputs/test.wasm differ
diff --git a/test/tools/llvm-objdump/Mips/disassemble-all.test b/test/tools/llvm-objdump/Mips/disassemble-all.test
new file mode 100644
index 0000000000000000000000000000000000000000..4554a0e030ae6539ff3276438f2a9ed093496356
--- /dev/null
+++ b/test/tools/llvm-objdump/Mips/disassemble-all.test
@@ -0,0 +1,16 @@
+# RUN: yaml2obj %s | llvm-objdump -D -
+
+# Test that -D does not crash when llvm-objdump encounters a section whose size
+# is not a multiple of the size of an instruction.
+
+--- !ELF
+FileHeader:
+  Class:           ELFCLASS64
+  Data:            ELFDATA2LSB
+  Type:            ET_EXEC
+  Machine:         EM_MIPS
+Sections:
+  - Name:            .note.llvm.crash
+    Type:            SHT_NOTE
+    Address:         0x0
+    Content:         002E746578
diff --git a/test/tools/llvm-objdump/Mips/lit.local.cfg b/test/tools/llvm-objdump/Mips/lit.local.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..a3183a25afaa9d7cda45a83cfc6f7adc91ef2eb7
--- /dev/null
+++ b/test/tools/llvm-objdump/Mips/lit.local.cfg
@@ -0,0 +1,3 @@
+if not 'Mips' in config.root.targets:
+    config.unsupported = True
+
diff --git a/test/tools/llvm-objdump/X86/Inputs/Objc2.64bit.obj.dylib-x86_64 b/test/tools/llvm-objdump/X86/Inputs/Objc2.64bit.obj.dylib-x86_64
new file mode 100755
index 0000000000000000000000000000000000000000..07d465bfb6338cf1e1503b29675c76f89127cd65
Binary files /dev/null and b/test/tools/llvm-objdump/X86/Inputs/Objc2.64bit.obj.dylib-x86_64 differ
diff --git a/test/tools/llvm-objdump/X86/Inputs/macho-invalid-bind-entry b/test/tools/llvm-objdump/X86/Inputs/macho-invalid-bind-entry
new file mode 100644
index 0000000000000000000000000000000000000000..afdd0838c911bdee331f518dc069d9e5bdffa73f
Binary files /dev/null and b/test/tools/llvm-objdump/X86/Inputs/macho-invalid-bind-entry differ
diff --git a/test/tools/llvm-objdump/X86/Inputs/nofirst-symbol.macho-x86_64 b/test/tools/llvm-objdump/X86/Inputs/nofirst-symbol.macho-x86_64
new file mode 100644
index 0000000000000000000000000000000000000000..4d1ef25e67695697b5cffb2015596a976b55a325
Binary files /dev/null and b/test/tools/llvm-objdump/X86/Inputs/nofirst-symbol.macho-x86_64 differ
diff --git a/test/tools/llvm-objdump/X86/Inputs/stripped-elf.so b/test/tools/llvm-objdump/X86/Inputs/stripped-elf.so
new file mode 100644
index 0000000000000000000000000000000000000000..b88b77501d9f8763fab93eab7ed9a929e39070c5
Binary files /dev/null and b/test/tools/llvm-objdump/X86/Inputs/stripped-elf.so differ
diff --git a/test/tools/llvm-objdump/X86/Inputs/stub-nosyms.macho-x86_64 b/test/tools/llvm-objdump/X86/Inputs/stub-nosyms.macho-x86_64
new file mode 100644
index 0000000000000000000000000000000000000000..a7f122b55084e84e62c79c96f286c2b3c7b233b3
Binary files /dev/null and b/test/tools/llvm-objdump/X86/Inputs/stub-nosyms.macho-x86_64 differ
diff --git a/test/tools/llvm-objdump/X86/macho-info-plist-nofollow.test b/test/tools/llvm-objdump/X86/macho-info-plist-nofollow.test
new file mode 100644
index 0000000000000000000000000000000000000000..12ad166c5a2f08c4a57c4d18a2c8a03a44d6b0eb
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-info-plist-nofollow.test
@@ -0,0 +1,10 @@
+# RUN: llvm-mc < %s -triple x86_64-apple-darwin -filetype=obj | llvm-objdump -m -info-plist - | FileCheck %s
+
+.section  __TEXT, __info_plist
+.ascii "This is the (__TEXT,__info_plist) section\n"
+.section __TEXT, __follow
+.asciz "This is the (__TEXT,__follow) section\n"
+
+# CHECK: Contents of (__TEXT,__info_plist) section
+# CHECK: This is the (__TEXT,__info_plist) section
+# CHECK-NOT: This is the (__TEXT,__follow) section
diff --git a/test/tools/llvm-objdump/X86/macho-nofirst-symbol-disassembly.test b/test/tools/llvm-objdump/X86/macho-nofirst-symbol-disassembly.test
new file mode 100644
index 0000000000000000000000000000000000000000..98964ac8047a41313ec789ccd857ce5ebe1d80b4
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-nofirst-symbol-disassembly.test
@@ -0,0 +1,8 @@
+// RUN: llvm-objdump -d -m %p/Inputs/nofirst-symbol.macho-x86_64 | FileCheck %s
+
+CHECK:        0:	90 	nop
+CHECK: _foo:
+CHECK:        1:	c3 	retq
+CHECK: _bar:
+CHECK:        2:	90 	nop
+CHECK:        3:	c3 	retq
diff --git a/test/tools/llvm-objdump/X86/macho-objc-meta-data.test b/test/tools/llvm-objdump/X86/macho-objc-meta-data.test
index f4abf6cdb49e74b65b650d6f8697c95adc599464..0bdb39cdff843112e3b1897d7915c5e79f94b784 100644
--- a/test/tools/llvm-objdump/X86/macho-objc-meta-data.test
+++ b/test/tools/llvm-objdump/X86/macho-objc-meta-data.test
@@ -5,6 +5,7 @@
 # RUN: llvm-objdump -m -objc-meta-data %p/Inputs/Objc1.32bit.exe.macho-i386 | FileCheck %s -check-prefix=OBJC1_32BIT_EXE
 # RUN: llvm-objdump -m -objc-meta-data %p/Inputs/Objc1.32bit.obj.macho-i386 | FileCheck %s -check-prefix=OBJC1_32BIT_OBJ
 # RUN: llvm-objdump -m -section __OBJC,__protocol %p/Inputs/Objc1.32bit.exe.macho-i386 | FileCheck %s -check-prefix=PROTOCOL
+# RUN: llvm-objdump -m -objc-meta-data %p/Inputs/Objc2.64bit.obj.dylib-x86_64 | FileCheck %s -check-prefix=OBJC2_64BIT_DYLIB
 
 OBJC2_64BIT_EXE: Contents of (__DATA,__objc_classlist) section
 OBJC2_64BIT_EXE: 0000000100002028 0x1000029f0
@@ -1037,3 +1038,64 @@ PROTOCOL:                 types 0x00002e04 @8@0:4
 PROTOCOL:         class_methods 0x00000000 (not in an __OBJC section)
 PROTOCOL:  instance_methods 0x00000000 (not in an __OBJC section)
 PROTOCOL:     class_methods 0x00000000 (not in an __OBJC section)
+
+OBJC2_64BIT_DYLIB: Contents of (__DATA_CONST,__objc_classlist) section
+OBJC2_64BIT_DYLIB: 000000000000c038 0x8030 _OBJC_CLASS_$_Test
+OBJC2_64BIT_DYLIB:            isa 0x8008 _OBJC_METACLASS_$_Test
+OBJC2_64BIT_DYLIB:     superclass 0x0
+OBJC2_64BIT_DYLIB:          cache 0x0
+OBJC2_64BIT_DYLIB:         vtable 0x0
+OBJC2_64BIT_DYLIB:           data 0xc120 (struct class_ro_t *)
+OBJC2_64BIT_DYLIB:                     flags 0x0
+OBJC2_64BIT_DYLIB:             instanceStart 8
+OBJC2_64BIT_DYLIB:              instanceSize 16
+OBJC2_64BIT_DYLIB:                  reserved 0x0
+OBJC2_64BIT_DYLIB:                ivarLayout 0x0
+OBJC2_64BIT_DYLIB:                      name 0x4f59 Test
+OBJC2_64BIT_DYLIB:               baseMethods 0xc090 (struct method_list_t *)
+OBJC2_64BIT_DYLIB: 		   entsize 24
+OBJC2_64BIT_DYLIB: 		     count 3
+OBJC2_64BIT_DYLIB: 		      name 0x4f5e testMethod
+OBJC2_64BIT_DYLIB: 		     types 0x4f89 v16@0:8
+OBJC2_64BIT_DYLIB: 		       imp -[Test testMethod]
+OBJC2_64BIT_DYLIB: 		      name 0x4f69 testProp
+OBJC2_64BIT_DYLIB: 		     types 0x4f91 Q16@0:8
+OBJC2_64BIT_DYLIB: 		       imp -[Test testProp]
+OBJC2_64BIT_DYLIB: 		      name 0x4f72 setTestProp:
+OBJC2_64BIT_DYLIB: 		     types 0x4f99 v24@0:8Q16
+OBJC2_64BIT_DYLIB: 		       imp -[Test setTestProp:]
+OBJC2_64BIT_DYLIB:             baseProtocols 0x0
+OBJC2_64BIT_DYLIB:                     ivars 0xc0e0
+OBJC2_64BIT_DYLIB:                     entsize 32
+OBJC2_64BIT_DYLIB:                       count 1
+OBJC2_64BIT_DYLIB: 			   offset 0x8000 8
+OBJC2_64BIT_DYLIB: 			     name 0x4f7f _testProp
+OBJC2_64BIT_DYLIB: 			     type 0x4fa4 Q
+OBJC2_64BIT_DYLIB: 			alignment 3
+OBJC2_64BIT_DYLIB: 			     size 8
+OBJC2_64BIT_DYLIB:            weakIvarLayout 0x0
+OBJC2_64BIT_DYLIB:            baseProperties 0xc108
+OBJC2_64BIT_DYLIB:                     entsize 16
+OBJC2_64BIT_DYLIB:                       count 1
+OBJC2_64BIT_DYLIB: 			     name 0x4f42 testProp
+OBJC2_64BIT_DYLIB: 			attributes 0x4f4b TQ,V_testProp
+OBJC2_64BIT_DYLIB: Meta Class
+OBJC2_64BIT_DYLIB:            isa 0x0
+OBJC2_64BIT_DYLIB:     superclass 0x0
+OBJC2_64BIT_DYLIB:          cache 0x0
+OBJC2_64BIT_DYLIB:         vtable 0x0
+OBJC2_64BIT_DYLIB:           data 0xc048 (struct class_ro_t *)
+OBJC2_64BIT_DYLIB:                     flags 0x1 RO_META
+OBJC2_64BIT_DYLIB:             instanceStart 40
+OBJC2_64BIT_DYLIB:              instanceSize 40
+OBJC2_64BIT_DYLIB:                  reserved 0x0
+OBJC2_64BIT_DYLIB:                ivarLayout 0x0
+OBJC2_64BIT_DYLIB:                      name 0x4f59 Test
+OBJC2_64BIT_DYLIB:               baseMethods 0x0 (struct method_list_t *)
+OBJC2_64BIT_DYLIB:             baseProtocols 0x0
+OBJC2_64BIT_DYLIB:                     ivars 0x0
+OBJC2_64BIT_DYLIB:            weakIvarLayout 0x0
+OBJC2_64BIT_DYLIB:            baseProperties 0x0
+OBJC2_64BIT_DYLIB: Contents of (__DATA_CONST,__objc_imageinfo) section
+OBJC2_64BIT_DYLIB:   version 0
+OBJC2_64BIT_DYLIB:     flags 0x40
diff --git a/test/tools/llvm-objdump/X86/macho-stub-nosyms-disassembly.test b/test/tools/llvm-objdump/X86/macho-stub-nosyms-disassembly.test
new file mode 100644
index 0000000000000000000000000000000000000000..af66b0e7a63f6b9f3bc80f40c9eb4d8392f31d05
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/macho-stub-nosyms-disassembly.test
@@ -0,0 +1,3 @@
+// RUN: llvm-objdump -d -m -no-show-raw-insn -full-leading-addr -print-imm-hex %p/Inputs/stub-nosyms.macho-x86_64 | FileCheck %s
+
+CHECK: 0000000000000001	callq	0x7 ## symbol stub for: _foo
diff --git a/test/tools/llvm-objdump/X86/malformed-machos.test b/test/tools/llvm-objdump/X86/malformed-machos.test
index 83ebfc4364e6870d6e7fe6fa35c692033fe1341f..292666a37254c99afb893b22ebef189ada741886 100644
--- a/test/tools/llvm-objdump/X86/malformed-machos.test
+++ b/test/tools/llvm-objdump/X86/malformed-machos.test
@@ -63,3 +63,6 @@ INVALID-SYMBOL-STRX-UNIVERSAL: macho-invalid-symbol-strx-universal' (for archite
 
 RUN: not llvm-objdump -macho -disassemble %p/Inputs/macho-invalid-symbol-lib_ordinal 2>&1 | FileCheck -check-prefix INVALID-SYMBOL-LIB_ORDINAL %s
 INVALID-SYMBOL-LIB_ORDINAL: macho-invalid-symbol-lib_ordinal': truncated or malformed object (bad library ordinal: 7 for symbol at index 2)
+
+RUN: not llvm-objdump -macho -objc-meta-data %p/Inputs/macho-invalid-bind-entry 2>&1 | FileCheck -check-prefix INVALID-BIND-ENTRY %s
+INVALID-BIND-ENTRY: macho-invalid-bind-entry': truncated or malformed object (for BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB bad library ordinal: 83 (max 0) for opcode at: 0x0)
diff --git a/test/tools/llvm-objdump/X86/stripped-shared.test b/test/tools/llvm-objdump/X86/stripped-shared.test
new file mode 100644
index 0000000000000000000000000000000000000000..c57155f4cd7bab59be7f7e52f318b44b3e625ebe
--- /dev/null
+++ b/test/tools/llvm-objdump/X86/stripped-shared.test
@@ -0,0 +1,10 @@
+// This test checks that dynamic symbols are used when disassembling elf files.
+// RUN: llvm-objdump -d %p/Inputs/stripped-elf.so | FileCheck %s
+
+# CHECK: .init
+# CHECK: .plt
+# CHECK: .text
+# CHECK: func0
+# CHECK: func1
+# CHECK: func2
+# CHECK: .fini
diff --git a/test/tools/llvm-objdump/macho-bad-bind.test b/test/tools/llvm-objdump/macho-bad-bind.test
new file mode 100644
index 0000000000000000000000000000000000000000..98fd08f4009e80671884fb0f19d5146fe5d9a988
--- /dev/null
+++ b/test/tools/llvm-objdump/macho-bad-bind.test
@@ -0,0 +1,101 @@
+RUN: not llvm-objdump -macho -bind %p/Inputs/macho-bind-dylib-ordinal-uleb 2>&1 | FileCheck -check-prefix DYLIB-ORDINAL-ULEB %s 
+DYLIB-ORDINAL-ULEB: macho-bind-dylib-ordinal-uleb': truncated or malformed object (for BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB bad library ordinal: 355 (max 1) for opcode at: 0x0)
+
+RUN: not llvm-objdump -macho -bind %p/Inputs/macho-bind-dylib-ordinal-uleb-malformed-uleb128 2>&1 | FileCheck -check-prefix DYLIB-ORDINAL-ULEB-MALFORMED-ULEB128 %s 
+DYLIB-ORDINAL-ULEB-MALFORMED-ULEB128: macho-bind-dylib-ordinal-uleb-malformed-uleb128': truncated or malformed object (for BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB malformed uleb128, extends past end for opcode at: 0x0)
+
+RUN: not llvm-objdump -macho -bind %p/Inputs/macho-bind-dylib-ordinal-uleb-too-big 2>&1 | FileCheck -check-prefix DYLIB-ORDINAL-ULEB-TOO-BIG %s 
+DYLIB-ORDINAL-ULEB-TOO-BIG: macho-bind-dylib-ordinal-uleb-too-big': truncated or malformed object (for BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB uleb128 too big for uint64 for opcode at: 0x0)
+
+RUN: not llvm-objdump -macho -bind %p/Inputs/macho-bind-dylib-special-imm 2>&1 | FileCheck -check-prefix DYLIB-SPECIAL-IMM %s 
+DYLIB-SPECIAL-IMM: macho-bind-dylib-special-imm': truncated or malformed object (for BIND_OPCODE_SET_DYLIB_SPECIAL_IMM unknown special ordinal: -5 for opcode at: 0x0)
+
+RUN: not llvm-objdump -macho -bind %p/Inputs/macho-bind-set-symbol 2>&1 | FileCheck -check-prefix BIND-SET-SYMBOL %s 
+BIND-SET-SYMBOL: macho-bind-set-symbol': truncated or malformed object (for BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM symbol name extends past opcodes for opcode at: 0x2)
+
+RUN: not llvm-objdump -macho -bind %p/Inputs/macho-bind-set-type-imm 2>&1 | FileCheck -check-prefix SET-TYPE-IMM %s 
+SET-TYPE-IMM: macho-bind-set-type-imm': truncated or malformed object (for BIND_OPCODE_SET_TYPE_IMM bad bind type: 5 for opcode at: 0x14)
+
+RUN: not llvm-objdump -macho -bind %p/Inputs/macho-bind-set-addend-sleb 2>&1 | FileCheck -check-prefix SET-ADDEND-SLEB %s 
+SET-ADDEND-SLEB: macho-bind-set-addend-sleb': truncated or malformed object (for BIND_OPCODE_SET_ADDEND_SLEB malformed sleb128, extends past end for opcode at: 0x14)
+
+RUN: not llvm-objdump -macho -bind %p/Inputs/macho-bind-seg-too-big 2>&1 | FileCheck -check-prefix SEG-TOO-BIG %s 
+SEG-TOO-BIG: macho-bind-seg-too-big': truncated or malformed object (for BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB bad segIndex (too large) for opcode at: 0x15)
+
+RUN: not llvm-objdump -macho -bind %p/Inputs/macho-bind-segoff-too-big 2>&1 | FileCheck -check-prefix SEGOFF-TOO-BIG %s 
+SEGOFF-TOO-BIG: macho-bind-segoff-too-big': truncated or malformed object (for BIND_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB bad segOffset, too large for opcode at: 0x15)
+
+RUN: not llvm-objdump -macho -bind %p/Inputs/macho-bind-add_addr_uleb 2>&1 | FileCheck -check-prefix ADD_ADDR_ULEB %s 
+ADD_ADDR_ULEB: macho-bind-add_addr_uleb': truncated or malformed object (for BIND_OPCODE_ADD_ADDR_ULEB bad segOffset, too large for opcode at: 0x17)
+
+RUN: not llvm-objdump -macho -bind %p/Inputs/macho-bind-do-bind-no-segIndex 2>&1 | FileCheck -check-prefix BIND-NO-SEGINDEX %s 
+BIND-NO-SEGINDEX: macho-bind-do-bind-no-segIndex': truncated or malformed object (for BIND_OPCODE_DO_BIND missing preceding *_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB for opcode at: 0x15)
+
+RUN: not llvm-objdump -macho -bind %p/Inputs/macho-bind-bind-add-addr-uleb 2>&1 | FileCheck -check-prefix ADD-ADDR-ULEB %s 
+ADD-ADDR-ULEB: macho-bind-bind-add-addr-uleb': truncated or malformed object (for BIND_OPCODE_ADD_ADDR_ULEB (after adding ULEB) bad segOffset, too large for opcode at: 0x18)
+
+RUN: not llvm-objdump -macho -bind %p/Inputs/macho-bind-add-addr-imm-scaled 2>&1 | FileCheck -check-prefix ADD-ADDR-IMM-SCALED %s 
+ADD-ADDR-IMM-SCALED: macho-bind-add-addr-imm-scaled': truncated or malformed object (for BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED  (after adding immediate times the pointer size) bad segOffset, too large for opcode at: 0x17)
+
+RUN: not llvm-objdump -macho -bind %p/Inputs/macho-bind-uleb-times-skipping-uleb 2>&1 | FileCheck -check-prefix ULEB-TIMES-SKIPPING-ULEB %s 
+ULEB-TIMES-SKIPPING-ULEB: macho-bind-uleb-times-skipping-uleb': truncated or malformed object (for BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB bad count and skip, too large for opcode at: 0x17)
+
+RUN: not llvm-objdump -macho -bind %p/Inputs/macho-do-bind-no-symbol 2>&1 | FileCheck -check-prefix DO-BIND-NO-SYMBOL %s 
+DO-BIND-NO-SYMBOL: macho-do-bind-no-symbol': truncated or malformed object (for BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB missing preceding BIND_OPCODE_SET_SYMBOL_TRAILING_FLAGS_IMM for opcode at: 0x5)
+
+RUN: not llvm-objdump -macho -bind %p/Inputs/macho-do-bind-no-dylib-ordinal 2>&1 | FileCheck -check-prefix DO-BIND-NO-DYLIB-ORDINAL %s 
+DO-BIND-NO-DYLIB-ORDINAL: macho-do-bind-no-dylib-ordinal': truncated or malformed object (for BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB missing preceding BIND_OPCODE_SET_DYLIB_ORDINAL_* for opcode at: 0x15)
+
+RUN: not llvm-objdump -macho -bind %p/Inputs/macho-bind-bad-opcode-value 2>&1 | FileCheck -check-prefix BAD-OPCODE-VALUE %s 
+BAD-OPCODE-VALUE: macho-bind-bad-opcode-value': truncated or malformed object (bad bind info (bad opcode value 0xD0 for opcode at: 0x18)
+
+RUN: not llvm-objdump -macho -lazy-bind %p/Inputs/macho-lazy-do_bind_add_addr_uleb 2>&1 | FileCheck -check-prefix LAZY_DO_BIND_ADD_ADDR_ULEB %s 
+LAZY_DO_BIND_ADD_ADDR_ULEB: macho-lazy-do_bind_add_addr_uleb': truncated or malformed object (BIND_OPCODE_DO_BIND_ADD_ADDR_ULEB not allowed in lazy bind table for opcode at: 0xC)
+
+RUN: not llvm-objdump -macho -lazy-bind %p/Inputs/macho-lazy-do-bind-add-addr-imm-scaled 2>&1 | FileCheck -check-prefix LAZY-DO-BIND-ADD-ADDR-IMM-SCALED %s 
+LAZY-DO-BIND-ADD-ADDR-IMM-SCALED: macho-lazy-do-bind-add-addr-imm-scaled': truncated or malformed object (BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED not allowed in lazy bind table for opcode at: 0xC)
+
+RUN: not llvm-objdump -macho -lazy-bind %p/Inputs/macho-lazy-do-bind-uleb-times-skipping-uleb 2>&1 | FileCheck -check-prefix LAZY-DO-BIND-ULEB-TIMES-SKIPPING-ULEB %s 
+LAZY-DO-BIND-ULEB-TIMES-SKIPPING-ULEB: macho-lazy-do-bind-uleb-times-skipping-uleb': truncated or malformed object (BIND_OPCODE_DO_BIND_ULEB_TIMES_SKIPPING_ULEB not allowed in lazy bind table for opcode at: 0xC)
+
+RUN: not llvm-objdump -macho -weak-bind %p/Inputs/macho-weak-bind-set-dylib-ordinal-imm 2>&1 | FileCheck -check-prefix WEAK-BIND-SET-DYLIB-ORDINAL-IMM %s 
+WEAK-BIND-SET-DYLIB-ORDINAL-IMM: macho-weak-bind-set-dylib-ordinal-imm': truncated or malformed object (BIND_OPCODE_SET_DYLIB_ORDINAL_IMM not allowed in weak bind table for opcode at: 0x2)
+
+RUN: not llvm-objdump -macho -weak-bind %p/Inputs/macho-weak-bind-set-dylib-ordinal-uleb 2>&1 | FileCheck -check-prefix WEAK-BIND-SET-DYLIB-ORDINAL-ULEB %s 
+WEAK-BIND-SET-DYLIB-ORDINAL-ULEB: macho-weak-bind-set-dylib-ordinal-uleb': truncated or malformed object (BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB not allowed in weak bind table for opcode at: 0x2)
+
+RUN: not llvm-objdump -macho -weak-bind %p/Inputs/macho-weak-bind-set-dylib-special-imm 2>&1 | FileCheck -check-prefix WEAK-BIND-SET-DYLIB-SPECIAL-IMM %s 
+WEAK-BIND-SET-DYLIB-SPECIAL-IMM: macho-weak-bind-set-dylib-special-imm': truncated or malformed object (BIND_OPCODE_SET_DYLIB_SPECIAL_IMM not allowed in weak bind table for opcode at: 0x2)
+
+RUN: not llvm-objdump -macho -rebase %p/Inputs/macho-rebase-set-type-imm 2>&1 | FileCheck -check-prefix REBASE-SET-TYPE-IMM %s 
+REBASE-SET-TYPE-IMM: macho-rebase-set-type-imm': truncated or malformed object (for REBASE_OPCODE_SET_TYPE_IMM bad bind type: 5 for opcode at: 0x0)
+
+RUN: not llvm-objdump -macho -rebase %p/Inputs/macho-rebase-uleb-malformed-uleb128 2>&1 | FileCheck -check-prefix REBASE-ULEB-MALFORMED-ULEB128 %s 
+REBASE-ULEB-MALFORMED-ULEB128: macho-rebase-uleb-malformed-uleb128': truncated or malformed object (for REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB malformed uleb128, extends past end for opcode at: 0x1)
+
+RUN: not llvm-objdump -macho -rebase %p/Inputs/macho-rebase-seg-too-big 2>&1 | FileCheck -check-prefix REBASE-SEG-TOO-BIG %s 
+REBASE-SEG-TOO-BIG: macho-rebase-seg-too-big': truncated or malformed object (for REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB bad segIndex (too large) for opcode at: 0x1)
+
+RUN: not llvm-objdump -macho -rebase %p/Inputs/macho-rebase-segoff-too-big 2>&1 | FileCheck -check-prefix REBASE-SEGOFF-TOO-BIG %s 
+REBASE-SEGOFF-TOO-BIG: macho-rebase-segoff-too-big': truncated or malformed object (for REBASE_OPCODE_SET_SEGMENT_AND_OFFSET_ULEB bad segOffset, too large for opcode at: 0x1)
+
+RUN: not llvm-objdump -macho -rebase %p/Inputs/macho-rebase-add-addr-uleb 2>&1 | FileCheck -check-prefix REBASE-ADD-ADDR-ULEB %s 
+REBASE-ADD-ADDR-ULEB: macho-rebase-add-addr-uleb': truncated or malformed object (for REBASE_OPCODE_ADD_ADDR_ULEB bad segOffset, too large for opcode at: 0x3)
+
+RUN: not llvm-objdump -macho -rebase %p/Inputs/macho-rebase-add-addr-imm-scaled 2>&1 | FileCheck -check-prefix REBASE-ADD-ADDR-IMM-SCALED %s 
+REBASE-ADD-ADDR-IMM-SCALED: macho-rebase-add-addr-imm-scaled': truncated or malformed object (for REBASE_OPCODE_ADD_ADDR_IMM_SCALED  (after adding immediate times the pointer size) bad segOffset, too large for opcode at: 0x3)
+
+RUN: not llvm-objdump -macho -rebase %p/Inputs/macho-rebase-imm-times 2>&1 | FileCheck -check-prefix REBASE-IMM-TIMES %s 
+REBASE-IMM-TIMES: macho-rebase-imm-times': truncated or malformed object (for REBASE_OPCODE_DO_REBASE_IMM_TIMES bad count and skip, too large for opcode at: 0x3)
+
+RUN: not llvm-objdump -macho -rebase %p/Inputs/macho-rebase-uleb-times 2>&1 | FileCheck -check-prefix REBASE-ULEB-TIMES %s 
+REBASE-ULEB-TIMES: macho-rebase-uleb-times': truncated or malformed object (for REBASE_OPCODE_DO_REBASE_ULEB_TIMES bad count and skip, too large for opcode at: 0x3)
+
+RUN: not llvm-objdump -macho -rebase %p/Inputs/macho-rebase-add-addr-uleb-too-big 2>&1 | FileCheck -check-prefix REBASE-ADD-ADDR-ULEB-TOO-BIG %s 
+REBASE-ADD-ADDR-ULEB-TOO-BIG: macho-rebase-add-addr-uleb-too-big': truncated or malformed object (for REBASE_OPCODE_DO_REBASE_ADD_ADDR_ULEB bad count and skip, too large for opcode at: 0x3)
+
+RUN: not llvm-objdump -macho -rebase %p/Inputs/macho-rebase-uleb-times-skipping-uleb 2>&1 | FileCheck -check-prefix REBASE-ULEB-TIMES-SKIPPING-ULEB %s 
+REBASE-ULEB-TIMES-SKIPPING-ULEB: macho-rebase-uleb-times-skipping-uleb': truncated or malformed object (for REBASE_OPCODE_DO_REBASE_ULEB_TIMES_SKIPPING_ULEB bad count and skip, too large for opcode at: 0x3)
+
+RUN: not llvm-objdump -macho -rebase %p/Inputs/macho-rebase-bad-opcode-value 2>&1 | FileCheck -check-prefix REBASE-BAD-OPCODE-VALUE %s 
+REBASE-BAD-OPCODE-VALUE: macho-rebase-bad-opcode-value': truncated or malformed object (bad rebase info (bad opcode value 0xD0 for opcode at: 0x4)
diff --git a/test/tools/llvm-objdump/macho-bad-ordinal.test b/test/tools/llvm-objdump/macho-bad-ordinal.test
index 16badcc878d8d6068a02c2a460c0b5e4b66656b9..fb49f77f0751db9da5fb93207fdbc8874fb17f6c 100644
--- a/test/tools/llvm-objdump/macho-bad-ordinal.test
+++ b/test/tools/llvm-objdump/macho-bad-ordinal.test
@@ -1,6 +1,4 @@
-# RUN: llvm-objdump -macho -bind -lazy-bind %p/Inputs/bad-ordinal.macho-x86_64 \
-# RUN:   | FileCheck %s 
+# RUN: not llvm-objdump -macho -lazy-bind %p/Inputs/bad-ordinal.macho-x86_64 \
+# RUN: 2>&1 | FileCheck %s 
 
-
-# CHECK: __DATA   __nl_symbol_ptr    0x100001000 pointer         0 <<bad library ordinal>> dyld_stub_binder
-# CHECK: __DATA   __la_symbol_ptr    0x100001010 <<bad library ordinal>> _printf
+# CHECK: bad-ordinal.macho-x86_64': truncated or malformed object (for BIND_OPCODE_SET_DYLIB_ORDINAL_ULEB bad library ordinal: 2 (max 1) for opcode at: 0x2)
diff --git a/test/tools/llvm-objdump/wasm.txt b/test/tools/llvm-objdump/wasm.txt
index ebda2249f8b8953006320e96ca9b4a7ca175d0fe..4aa40c6c9df8c916f7f6537d9d43a2fc3a887aee 100644
--- a/test/tools/llvm-objdump/wasm.txt
+++ b/test/tools/llvm-objdump/wasm.txt
@@ -9,12 +9,12 @@
 # CHECK:   4 EXPORT        0000000e 0000000000000000 
 # CHECK:   5 ELEM          00000007 0000000000000000 
 # CHECK:   6 CODE          0000002a 0000000000000000 TEXT 
-# CHECK:   7 name          0000002c 0000000000000000 
+# CHECK:   7 name          0000003c 0000000000000000 
 
 # RUN: llvm-objdump -p %p/Inputs/test.wasm | FileCheck %s -check-prefix CHECK-HEADER
 
 # CHECK-HEADER: Program Header:
-# CHECK-HEADER: Version: 0xd
+# CHECK-HEADER: Version: 0x1
 
 # RUN: llvm-objdump -s --section=CODE %p/Inputs/test.wasm | FileCheck %s -check-prefix CHECK-SECTIONS
 
diff --git a/test/tools/llvm-pdbdump/class-layout.test b/test/tools/llvm-pdbdump/class-layout.test
index d2e98de2a201308d7af8ac4427359a84f8359840..e2921d298fce2ab16618a8ef7d0e688b794f6ac6 100644
--- a/test/tools/llvm-pdbdump/class-layout.test
+++ b/test/tools/llvm-pdbdump/class-layout.test
@@ -17,9 +17,7 @@
 ; MEMBERS_TEST: class MembersTest::A {
 ; MEMBERS_TEST-DAG: typedef int NestedTypedef
 ; MEMBERS_TEST-DAG: enum NestedEnum
-; MEMBERS_TEST: public:
-; MEMBERS_TEST-NEXT: void MemberFunc()
-; MEMBERS_TEST-NEXT: private:
+; MEMBERS_TEST: void MemberFunc()
 ; MEMBERS_TEST-DAG: int IntMemberVar
 ; MEMBERS_TEST-DAG: double DoubleMemberVar
 ; MEMBERS_TEST: }
@@ -48,10 +46,9 @@
 
 ; BITFIELD_TEST: ---TYPES---
 ; BITFIELD_TEST: struct BitFieldTest::A {
-; BITFIELD_TEST-NEXT: public:
-; BITFIELD_TEST-NEXT: +0x00 int Bits1 : 1
-; BITFIELD_TEST-NEXT: +0x00 int Bits2 : 2
-; BITFIELD_TEST-NEXT: +0x00 int Bits3 : 3
-; BITFIELD_TEST-NEXT: +0x00 int Bits4 : 4
-; BITFIELD_TEST-NEXT: +0x00 int Bits22 : 22
-; BITFIELD_TEST-NEXT: +0x04 int Offset0x04
+; BITFIELD_TEST-NEXT: +0x00 [sizeof=4] int Bits1 : 1
+; BITFIELD_TEST-NEXT: +0x00 [sizeof=4] int Bits2 : 2
+; BITFIELD_TEST-NEXT: +0x00 [sizeof=4] int Bits3 : 3
+; BITFIELD_TEST-NEXT: +0x00 [sizeof=4] int Bits4 : 4
+; BITFIELD_TEST-NEXT: +0x00 [sizeof=4] int Bits22 : 22
+; BITFIELD_TEST-NEXT: +0x04 [sizeof=4] int Offset0x04
diff --git a/test/tools/llvm-pdbdump/regex-filter.test b/test/tools/llvm-pdbdump/regex-filter.test
index b845f5a28cffd8cd18cf16777b5d5e10f885185e..d2f500e88c33a787948e21bd62f6640d3e603048 100644
--- a/test/tools/llvm-pdbdump/regex-filter.test
+++ b/test/tools/llvm-pdbdump/regex-filter.test
@@ -1,9 +1,16 @@
 ; RUN: llvm-pdbdump pretty -symbols -globals -types %p/Inputs/FilterTest.pdb \
 ; RUN:    | FileCheck --check-prefix=NO_FILTER %s
+
 ; RUN: llvm-pdbdump pretty -types -exclude-types="GlobalTypedef|NestedTypedef" \
 ; RUN:    %p/Inputs/FilterTest.pdb | FileCheck --check-prefix=EXCLUDE_TYPEDEFS %s
+; RUN: llvm-pdbdump pretty -classes -enums %p/Inputs/FilterTest.pdb \
+; RUN:    | FileCheck --check-prefix=EXCLUDE_TYPEDEFS %s
+
 ; RUN: llvm-pdbdump pretty -types -exclude-types="GlobalEnum|NestedEnum" \
 ; RUN:    %p/Inputs/FilterTest.pdb | FileCheck --check-prefix=EXCLUDE_ENUMS %s
+; RUN: llvm-pdbdump pretty -classes -typedefs %p/Inputs/FilterTest.pdb \
+; RUN:    | FileCheck --check-prefix=EXCLUDE_ENUMS %s
+
 ; RUN: llvm-pdbdump pretty -types -symbols -globals -exclude-symbols="MemberVar|GlobalVar" \
 ; RUN:    %p/Inputs/FilterTest.pdb | FileCheck --check-prefix=EXCLUDE_VARS %s
 ; RUN: llvm-pdbdump pretty -types -exclude-types="FilterTestClass" \
@@ -36,31 +43,25 @@
 ; NO_FILTER-DAG: GlobalEnum GlobalEnumVar
 
 ; EXCLUDE_TYPEDEFS: ---TYPES---
-; EXCLUDE_TYPEDEFS: Enums:
-; EXCLUDE_TYPEDEFS: GlobalEnum
-; EXCLUDE_TYPEDEFS: Typedefs
 ; EXCLUDE_TYPEDEFS-NOT: GlobalTypedef
-; EXCLUDE_TYPEDEFS: Classes
-; EXCLUDE_TYPEDEFS: class FilterTestClass
 ; EXCLUDE_TYPEDEFS-NOT: NestedTypedef
-; EXCLUDE_TYPEDEFS: private:
+; EXCLUDE_TYPEDEFS-DAG: GlobalEnum
+; EXCLUDE_TYPEDEFS-DAG: NestedEnum
+; EXCLUDE_TYPEDEFS: class FilterTestClass
 
 ; EXCLUDE_ENUMS: ---TYPES---
-; EXCLUDE_ENUMS: Enums:
 ; EXCLUDE_ENUMS-NOT: GlobalEnum
-; EXCLUDE_ENUMS: Typedefs
+; EXCLUDE_ENUMS-NOT: NestedEnum
 ; EXCLUDE_ENUMS: GlobalTypedef
-; EXCLUDE_ENUMS: Classes
 ; EXCLUDE_ENUMS: class FilterTestClass
-; EXCLUDE_ENUMS-NOT: NestedEnum
-; EXCLUDE_ENUMS: private:
 
 ; EXCLUDE_VARS: ---TYPES---
-; EXCLUDE_VARS: Classes:
-; EXCLUDE_VARS: class FilterTestClass
-; EXCLUDE_VARS: private:
 ; EXCLUDE_VARS-NOT: IntMemberVar
 ; EXCLUDE_VARS-NOT: DoubleMemberVar
+; EXCLUDE_VARS-DAG: GlobalEnum
+; EXCLUDE_VARS-DAG: NestedEnum
+; EXCLUDE_VARS: GlobalTypedef
+; EXCLUDE_VARS: class FilterTestClass
 ; EXCLUDE_VARS: ---GLOBALS---
 ; EXCLUDE_VARS-NOT: DoubleGlobalVar
 ; EXCLUDE_VARS-NOT: IntGlobalVar
diff --git a/test/tools/llvm-profdata/memop-size-prof.proftext b/test/tools/llvm-profdata/memop-size-prof.proftext
new file mode 100644
index 0000000000000000000000000000000000000000..882fc1ecf2967486e415a05a2d74ed4b7d3620ec
--- /dev/null
+++ b/test/tools/llvm-profdata/memop-size-prof.proftext
@@ -0,0 +1,123 @@
+# RUN: llvm-profdata show -memop-sizes -ic-targets -function=foo %s | FileCheck %s --check-prefixes=MEMOP,MEMOP_SUM,ICALL,ICALL_SUM
+# RUN: llvm-profdata show -memop-sizes -ic-targets -counts -text -function=foo %s | FileCheck %s --check-prefixes=TEXT,MEMOP_TEXT,ICALL_TEXT
+# RUN: llvm-profdata merge -o %t.profdata  %s
+# RUN: llvm-profdata show -memop-sizes -ic-targets -function=foo %t.profdata | FileCheck %s --check-prefixes=MEMOP,MEMOP_SUM,ICALL,ICALL_SUM
+# RUN: llvm-profdata merge -o %t.proftext -text %s
+# RUN: llvm-profdata show -memop-sizes -ic-targets -function=foo %t.proftext| FileCheck %s --check-prefixes=MEMOP,MEMOP_SUM,ICALL,ICALL_SUM
+
+# IR level Instrumentation Flag
+:ir
+ic1
+# Func Hash:
+10
+# Num Counters:
+2
+# Counter Values:
+999000
+359800
+
+ic2
+# Func Hash:
+10
+# Num Counters:
+2
+# Counter Values:
+1001000
+360200
+
+foo
+# Func Hash:
+35277121310
+# Num Counters:
+3
+# Counter Values:
+20
+556
+1
+# Num Value Kinds:
+2
+# ValueKind = IPVK_IndirectCallTarget:
+0
+# NumValueSites:
+3
+# Values for each site
+0
+2
+ic2:1000
+ic1:100
+1
+ic2:20000
+#ICALL: Indirect Target Results:
+#ICALL-NEXT:  [ 1, ic2, 1000 ]
+#ICALL-NEXT:  [ 1, ic1, 100 ]
+#ICALL-NEXT:  [ 2, ic2, 20000 ]
+
+# ValueKind = IPVK_MemOPSize:
+1
+# NumValueSites:
+1
+9
+1:99
+2:88
+3:77
+9:72
+4:66
+5:55
+6:44
+7:33
+8:22
+
+#MEMOP: Memory Instrinsic Size Results:
+#MEMOP-NEXT:  [ 0, 1, 99 ]
+#MEMOP-NEXT:  [ 0, 2, 88 ]
+#MEMOP-NEXT:  [ 0, 3, 77 ]
+#MEMOP-NEXT:  [ 0, 9, 72 ]
+#MEMOP-NEXT:  [ 0, 4, 66 ]
+#MEMOP-NEXT:  [ 0, 5, 55 ]
+#MEMOP-NEXT:  [ 0, 6, 44 ]
+#MEMOP-NEXT:  [ 0, 7, 33 ]
+#MEMOP-NEXT:  [ 0, 8, 22 ]
+
+#ICALL_SUM: Statistics for indirect call sites profile:
+#ICALL_SUM: Total number of sites: 3
+#ICALL_SUM: Total number of sites with values: 2
+#ICALL_SUM: Total number of profiled values: 3
+#ICALL_SUM: Value sites histogram:
+#ICALL_SUM:     NumTargets, SiteCount
+#ICALL_SUM:         1, 1
+#ICALL_SUM:         2, 1
+
+#MEMOP_SUM: Statistics for memory intrinsic calls sizes profile:
+#MEMOP_SUM: Total number of sites: 1
+#MEMOP_SUM: Total number of sites with values: 1
+#MEMOP_SUM: Total number of profiled values: 9
+#MEMOP_SUM: Value sites histogram:
+#MEMOP_SUM:	NumTargets, SiteCount
+#MEMOP_SUM:	9, 1
+
+#TEXT: # Num Value Kinds:
+#TEXT: 2
+#ICALL_TEXT: # ValueKind = IPVK_IndirectCallTarget:
+#ICALL_TEXT: 0
+#ICALL_TEXT: # NumValueSites:
+#ICALL_TEXT: 3
+#ICALL_TEXT: 0
+#ICALL_TEXT: 2
+#ICALL_TEXT: ic2:1000
+#ICALL_TEXT: ic1:100
+#ICALL_TEXT: 1
+#ICALL_TEXT: ic2:20000
+#MEMOP_TEXT: # ValueKind = IPVK_MemOPSize:
+#MEMOP_TEXT: 1
+#MEMOP_TEXT: # NumValueSites:
+#MEMOP_TEXT: 1
+#MEMOP_TEXT: 9
+#MEMOP_TEXT: 1:99
+#MEMOP_TEXT: 2:88
+#MEMOP_TEXT: 3:77
+#MEMOP_TEXT: 9:72
+#MEMOP_TEXT: 4:66
+#MEMOP_TEXT: 5:55
+#MEMOP_TEXT: 6:44
+#MEMOP_TEXT: 7:33
+#MEMOP_TEXT: 8:22
diff --git a/test/tools/llvm-profdata/value-prof.proftext b/test/tools/llvm-profdata/value-prof.proftext
index b5979c842498c0dcc641c8837850ca6c373b39e9..31a7698895ddbc69d60f008f4d6b2b5a46182846 100644
--- a/test/tools/llvm-profdata/value-prof.proftext
+++ b/test/tools/llvm-profdata/value-prof.proftext
@@ -46,13 +46,13 @@ foo2:1000
 foo2:20000
 
 #ICTXT: Indirect Call Site Count: 3
-#ICTXT-NEXT:    Indirect Target Results: 
+#ICTXT-NEXT:    Indirect Target Results:
 #ICTXT-NEXT:	[ 1, foo, 100 ]
 #ICTXT-NEXT:	[ 1, foo2, 1000 ]
 #ICTXT-NEXT:	[ 2, foo2, 20000 ]
 
 #IC: Indirect Call Site Count: 3
-#IC-NEXT:    Indirect Target Results: 
+#IC-NEXT:    Indirect Target Results:
 #IC-NEXT:	[ 1, foo2, 1000 ]
 #IC-NEXT:	[ 1, foo, 100 ]
 #IC-NEXT:	[ 2, foo2, 20000 ]
@@ -63,10 +63,19 @@ foo2:20000
 #ICTEXT-NEXT: foo2:20000
 #
 
-#ICSUM: Total Number of Indirect Call Sites : 3
-#ICSUM: Total Number of Sites With Values : 2
-#ICSUM: Total Number of Profiled Values : 3
-#ICSUM:	NumTargets, SiteCount
-#ICSUM	1, 1
-#ICSUM	2, 1
+bar
+# Func Hash:
+10
+# Num Counters:
+2
+# Counter Values:
+999000
+359800
 
+#ICSUM: Statistics for indirect call sites profile:
+#ICSUM: Total number of sites: 3
+#ICSUM: Total number of sites with values: 2
+#ICSUM: Total number of profiled values: 3
+#ICSUM:	NumTargets, SiteCount
+#ICSUM:	  1, 1
+#ICSUM:	  2, 1
diff --git a/test/tools/llvm-readobj/Inputs/codeview-cycle.obj b/test/tools/llvm-readobj/Inputs/codeview-cycle.obj
new file mode 100644
index 0000000000000000000000000000000000000000..85c2d0e55fe4169be8a88f3ce70accf1623958f5
Binary files /dev/null and b/test/tools/llvm-readobj/Inputs/codeview-cycle.obj differ
diff --git a/test/tools/llvm-readobj/Inputs/codeview-label.obj b/test/tools/llvm-readobj/Inputs/codeview-label.obj
new file mode 100644
index 0000000000000000000000000000000000000000..ae49a061bb7ccf2c34ddc4e61220ccf54ea295d1
Binary files /dev/null and b/test/tools/llvm-readobj/Inputs/codeview-label.obj differ
diff --git a/test/tools/llvm-readobj/Inputs/codeview-merging-anon.obj b/test/tools/llvm-readobj/Inputs/codeview-merging-anon.obj
new file mode 100644
index 0000000000000000000000000000000000000000..3cb58fbd4d54f3efb289b54ab3eee0d9fd3851b5
Binary files /dev/null and b/test/tools/llvm-readobj/Inputs/codeview-merging-anon.obj differ
diff --git a/test/tools/llvm-readobj/Inputs/codeview-unsorted.obj b/test/tools/llvm-readobj/Inputs/codeview-unsorted.obj
new file mode 100644
index 0000000000000000000000000000000000000000..08a376de3b96617f386030e89a147d2b33be1739
Binary files /dev/null and b/test/tools/llvm-readobj/Inputs/codeview-unsorted.obj differ
diff --git a/test/tools/llvm-readobj/Inputs/trivial.obj.wasm b/test/tools/llvm-readobj/Inputs/trivial.obj.wasm
index b24ac79c716374a77d9ffc06f183251720b1c438..d3906eeaf6f8618a9b148d56e0997714655bd847 100644
Binary files a/test/tools/llvm-readobj/Inputs/trivial.obj.wasm and b/test/tools/llvm-readobj/Inputs/trivial.obj.wasm differ
diff --git a/test/tools/llvm-readobj/codeview-label.test b/test/tools/llvm-readobj/codeview-label.test
new file mode 100644
index 0000000000000000000000000000000000000000..3bf6debe0d7fced54b740eba2175bbd09a5faf5f
--- /dev/null
+++ b/test/tools/llvm-readobj/codeview-label.test
@@ -0,0 +1,16 @@
+; RUN: llvm-readobj -codeview %S/Inputs/codeview-label.obj | FileCheck %s
+
+; CHECK-LABEL:  Label (0x1000) {
+; CHECK-NEXT:     TypeLeafKind: LF_LABEL (0xE)
+; CHECK-NEXT:     Mode: Near (0x0)
+; CHECK-NEXT:   }
+
+; To reproduce codeview-label.obj:
+; $ cat codeview-label.asm
+;         .model flat, C
+;         .code
+;         public  foo
+; foo:
+;         ret
+; end
+; $ ml -c -Zi codeview-label.asm
diff --git a/test/tools/llvm-readobj/codeview-merging-anon.test b/test/tools/llvm-readobj/codeview-merging-anon.test
new file mode 100644
index 0000000000000000000000000000000000000000..cf0484074d025745dcb5b24bf3683701c5276375
--- /dev/null
+++ b/test/tools/llvm-readobj/codeview-merging-anon.test
@@ -0,0 +1,29 @@
+# Test what happens when the first type record (0x1000) is a LF_FIELDLIST
+# record.
+
+# Steps to regenerate input:
+# $ cat t.c
+# struct { int x; } o;
+# $ cl -Z7 t.c
+
+RUN: llvm-readobj -codeview %S/Inputs/codeview-merging-anon.obj | FileCheck %s
+RUN: llvm-readobj -codeview-merged-types %S/Inputs/codeview-merging-anon.obj | FileCheck %s
+
+CHECK-LABEL:  FieldList (0x1000) {
+CHECK-NEXT:     TypeLeafKind: LF_FIELDLIST (0x1203)
+CHECK-NEXT:     DataMember {
+CHECK-NEXT:       TypeLeafKind: LF_MEMBER (0x150D)
+CHECK-NEXT:       AccessSpecifier: Public (0x3)
+CHECK-NEXT:       Type: int (0x74)
+CHECK-NEXT:       FieldOffset: 0x0
+CHECK-NEXT:       Name: x
+CHECK-NEXT:     }
+CHECK-NEXT:   }
+CHECK-LABEL:  Struct (0x1001) {
+CHECK:          TypeLeafKind: LF_STRUCTURE (0x1505)
+CHECK:          MemberCount: 1
+CHECK:          FieldList: <field list> (0x1000)
+CHECK:          Name: <unnamed-tag>
+CHECK:          LinkageName: .?AU<unnamed-tag>@@
+CHECK:        }
+CHECK-LABEL:  StringId
diff --git a/test/tools/llvm-readobj/codeview-merging-cycle.test b/test/tools/llvm-readobj/codeview-merging-cycle.test
new file mode 100644
index 0000000000000000000000000000000000000000..3a96be9ca9855e9b5c76cb95828f1ea3272b98a9
--- /dev/null
+++ b/test/tools/llvm-readobj/codeview-merging-cycle.test
@@ -0,0 +1,19 @@
+; RUN: not llvm-readobj -codeview-merged-types %S/Inputs/codeview-cycle.obj 2>&1 | FileCheck %s
+
+; CHECK: Error{{.*}} input type graph contains cycles
+
+; To reproduce codeview-cycle.obj:
+; $ cat codeview-cycle.asm
+;       .model  flat, C
+;       .code
+; pfoo_list TYPEDEF PTR foo_list
+; foo_list STRUCT
+;       next pfoo_list ?
+;       data dd ?
+; foo_list ENDS
+;       public  foo
+; foo proc dst:ptr foo_list
+;       ret
+; foo   endp
+;       end
+; $ ml -c -Zi codeview-cycle.asm
diff --git a/test/tools/llvm-readobj/codeview-merging-unsorted.test b/test/tools/llvm-readobj/codeview-merging-unsorted.test
new file mode 100644
index 0000000000000000000000000000000000000000..6aaab3a891eeb6f0219ff684f8a7458d9c6882ea
--- /dev/null
+++ b/test/tools/llvm-readobj/codeview-merging-unsorted.test
@@ -0,0 +1,40 @@
+; RUN: llvm-readobj -codeview %S/Inputs/codeview-unsorted.obj | FileCheck %s
+; RUN: llvm-readobj -codeview-merged-types %S/Inputs/codeview-unsorted.obj | FileCheck %s --check-prefix=MERGED
+
+; The input type stream has records that refer to later type indices in the same
+; stream:
+
+; CHECK: Pointer (0x1000)
+; CHECK: Struct (0x1001)
+; CHECK:   FieldList: {{.*}} (0x1002)
+; CHECK: FieldList (0x1002)
+; CHECK: Pointer (0x1003)
+; CHECK: Procedure (0x1004)
+; CHECK:   ArgListType: {{.*}} (0x1005)
+; CHECK: ArgList (0x1005)
+
+; MERGED: Pointer (0x1000)
+; MERGED: FieldList (0x1001)
+; MERGED: Struct (0x1002)
+; MERGED:   FieldList: {{.*}} (0x1001)
+; MERGED: Pointer (0x1003)
+; MERGED: ArgList (0x1004)
+; MERGED: Procedure (0x1005)
+; MERGED:   ArgListType: {{.*}} (0x1004)
+
+
+; To reproduce codeview-unsorted.obj:
+; $ cat codeview-unsorted.asm
+;       .model  flat, C
+;       .code
+; PBYTE TYPEDEF PTR BYTE
+; foo_list STRUCT
+;       next PBYTE ?
+;       data dd ?
+; foo_list ENDS
+;       public  foo
+; foo proc dst:ptr foo_list
+;       ret
+; foo   endp
+;       end
+; $ ml -c -Zi codeview-unsorted.asm
diff --git a/test/tools/llvm-readobj/codeview-merging.test b/test/tools/llvm-readobj/codeview-merging.test
index 60894eff33eb1a817766511be5b9d9a583d490f9..4d453e5a1167a51521de9b26bc606e05b0f13d8e 100644
--- a/test/tools/llvm-readobj/codeview-merging.test
+++ b/test/tools/llvm-readobj/codeview-merging.test
@@ -21,6 +21,15 @@ RUN: llvm-readobj -codeview %S/Inputs/codeview-merging-1.obj | FileCheck %s --ch
 RUN: llvm-readobj -codeview %S/Inputs/codeview-merging-2.obj | FileCheck %s --check-prefix=OBJ2
 RUN: llvm-readobj -codeview-merged-types %S/Inputs/codeview-merging-1.obj %S/Inputs/codeview-merging-2.obj | FileCheck %s
 
+OBJ1:       Procedure ({{.*}}) {
+OBJ1-NEXT:    TypeLeafKind: LF_PROCEDURE (0x1008)
+OBJ1-NEXT:    ReturnType: int (0x74)
+OBJ1-NEXT:    CallingConvention: NearC (0x0)
+OBJ1-NEXT:    FunctionOptions [ (0x0)
+OBJ1-NEXT:    ]
+OBJ1-NEXT:    NumParameters: 1
+OBJ1-NEXT:    ArgListType: (A*) (0x1002)
+OBJ1-NEXT:  }
 OBJ1:       FuncId (0x100D) {
 OBJ1-NEXT:    TypeLeafKind: LF_FUNC_ID (0x1601)
 OBJ1-NEXT:    ParentScope: 0x0
@@ -50,16 +59,55 @@ OBJ2-NEXT:    Name: g
 OBJ2-NEXT:  }
 OBJ2-NOT: FuncId
 
-CHECK:       FuncId (0x100D) {
+CHECK: MergedTypeStream [
+CHECK:       Procedure ({{.*}}) {
+CHECK-NEXT:    TypeLeafKind: LF_PROCEDURE (0x1008)
+CHECK-NEXT:    ReturnType: int (0x74)
+CHECK-NEXT:    CallingConvention: NearC (0x0)
+CHECK-NEXT:    FunctionOptions [ (0x0)
+CHECK-NEXT:    ]
+CHECK-NEXT:    NumParameters: 1
+CHECK-NEXT:    ArgListType: (A*) (0x1002)
+CHECK-NEXT:  }
+CHECK:       Struct (0x1007) {
+CHECK-NEXT:    TypeLeafKind: LF_STRUCTURE (0x1505)
+CHECK-NEXT:    MemberCount: 1
+CHECK-NEXT:    Properties [ (0x200)
+CHECK-NEXT:      HasUniqueName (0x200)
+CHECK-NEXT:    ]
+CHECK-NEXT:    FieldList: <field list> (0x1006)
+CHECK-NEXT:    DerivedFrom: 0x0
+CHECK-NEXT:    VShape: 0x0
+CHECK-NEXT:    SizeOf: 8
+CHECK-NEXT:    Name: B
+CHECK-NEXT:    LinkageName: .?AUB@@
+CHECK-NEXT:  }
+CHECK: ]
+
+CHECK: MergedIDStream [
+CHECK-NEXT:  StringId (0x1000) {
+CHECK-NEXT:    TypeLeafKind: LF_STRING_ID (0x1605)
+CHECK-NEXT:    Id: 0x0
+CHECK-NEXT:    StringData: d:\src\llvm\build\t.cpp
+CHECK-NEXT:  }
+# Test that we contextually dump item ids and type ids from different databases.
+CHECK-NEXT:  UdtSourceLine (0x1001) {
+CHECK-NEXT:    TypeLeafKind: LF_UDT_SRC_LINE (0x1606)
+CHECK-NEXT:    UDT: B (0x1007)
+CHECK-NEXT:    SourceFile: d:\src\llvm\build\t.cpp (0x1000)
+CHECK-NEXT:    LineNumber: 3
+CHECK-NEXT:  }
+CHECK:       FuncId (0x1002) {
 CHECK-NEXT:    TypeLeafKind: LF_FUNC_ID (0x1601)
 CHECK-NEXT:    ParentScope: 0x0
-CHECK-NEXT:    FunctionType: int (B*) (0x100C)
+CHECK-NEXT:    FunctionType: int (B*)
 CHECK-NEXT:    Name: g
 CHECK-NEXT:  }
-CHECK-NEXT:  FuncId (0x100E) {
+CHECK-NEXT:  FuncId (0x1003) {
 CHECK-NEXT:    TypeLeafKind: LF_FUNC_ID (0x1601)
 CHECK-NEXT:    ParentScope: 0x0
-CHECK-NEXT:    FunctionType: int (A*) (0x1003)
+CHECK-NEXT:    FunctionType: int (A*)
 CHECK-NEXT:    Name: f
 CHECK-NEXT:  }
 CHECK-NOT: FuncId
+CHECK: ]
diff --git a/test/tools/llvm-readobj/file-headers.test b/test/tools/llvm-readobj/file-headers.test
index 4fcb2859d27a5c40b57719833013970ed61c3b28..52485f7a34a2e2f81884a080aace66c18acb0406 100644
--- a/test/tools/llvm-readobj/file-headers.test
+++ b/test/tools/llvm-readobj/file-headers.test
@@ -377,4 +377,4 @@ ELF-LANAI-NEXT: }
 WASM: Format: WASM
 WASM-NEXT: Arch: wasm32
 WASM-NEXT: AddressSize: 32bit
-WASM-NEXT: Version: 0xD
+WASM-NEXT: Version: 0x1
diff --git a/test/tools/llvm-readobj/sections.test b/test/tools/llvm-readobj/sections.test
index 26a72d85e49cf41c5b52c935cad3ce521744d42e..53705a7a696f06d9f486152d10eee4ddb86c1b25 100644
--- a/test/tools/llvm-readobj/sections.test
+++ b/test/tools/llvm-readobj/sections.test
@@ -531,7 +531,7 @@ WASM-NEXT:     Offset: 75
 WASM-NEXT:   }
 WASM-NEXT:   Section {
 WASM-NEXT:     Type: CUSTOM (0x0)
-WASM-NEXT:     Size: 44
+WASM-NEXT:     Size: 60
 WASM-NEXT:     Offset: 119
 WASM-NEXT:     Name: name
 WASM-NEXT:   }
diff --git a/test/tools/llvm-symbolizer/Inputs/discrim b/test/tools/llvm-symbolizer/Inputs/discrim
new file mode 100644
index 0000000000000000000000000000000000000000..ec61fe960bffb5efd7cc3cf7d0d96e58ed231c49
Binary files /dev/null and b/test/tools/llvm-symbolizer/Inputs/discrim differ
diff --git a/test/tools/llvm-symbolizer/Inputs/discrim.c b/test/tools/llvm-symbolizer/Inputs/discrim.c
new file mode 100644
index 0000000000000000000000000000000000000000..decbce8d454e6ec671a209403bb276c1a8564a5d
--- /dev/null
+++ b/test/tools/llvm-symbolizer/Inputs/discrim.c
@@ -0,0 +1,8 @@
+static volatile int do_mul;
+static volatile int do_inc;
+
+int main () {
+  int x = 1;
+  if (do_mul) x *= 2; else x /= 2;
+  return do_inc ? ++x : --x;
+}
diff --git a/test/tools/llvm-symbolizer/Inputs/discrim.inp b/test/tools/llvm-symbolizer/Inputs/discrim.inp
new file mode 100644
index 0000000000000000000000000000000000000000..f8ad6018d70929825644deb2c67a6a29136e372d
--- /dev/null
+++ b/test/tools/llvm-symbolizer/Inputs/discrim.inp
@@ -0,0 +1,5 @@
+some text
+0x4004f2
+0x400509
+0x40050d
+some more text
diff --git a/test/tools/llvm-symbolizer/sym-verbose.test b/test/tools/llvm-symbolizer/sym-verbose.test
new file mode 100644
index 0000000000000000000000000000000000000000..ef66db919faaee19d14ae21edc08af83b841c735
--- /dev/null
+++ b/test/tools/llvm-symbolizer/sym-verbose.test
@@ -0,0 +1,39 @@
+#static volatile int do_mul;
+#static volatile int do_inc;
+#
+#int main () {
+#  int x = 1;
+#  if (do_mul) x *= 2; else x /= 2;
+#  return do_inc ? ++x : --x;
+#}
+#Build as : clang -g -O2 discrim.c -o discrim
+
+RUN: llvm-symbolizer -verbose -print-address -obj=%p/Inputs/discrim < %p/Inputs/discrim.inp | FileCheck %s
+
+#CHECK: some text
+
+#CHECK: 0x4004f2
+#CHECK-NEXT: main
+#CHECK-NEXT: Filename: /tmp{{[\\/]}}discrim.c
+#CHECK-NEXT: Function start line: 4
+#CHECK-NEXT: Line: 6
+#CHECK-NEXT: Column: 7
+#CHECK-NOT: Discriminator: 0
+
+#CHECK: 0x400509
+#CHECK-NEXT: main
+#CHECK-NEXT: Filename: /tmp{{[\\/]}}discrim.c
+#CHECK-NEXT: Function start line: 4
+#CHECK-NEXT: Line: 7
+#CHECK-NEXT: Column: 3
+#CHECK-NEXT: Discriminator: 1
+
+#CHECK: 0x40050d
+#CHECK-NEXT: main
+#CHECK-NEXT: Filename: /tmp{{[\\/]}}discrim.c
+#CHECK-NEXT: Function start line: 4
+#CHECK-NEXT: Line: 7
+#CHECK-NEXT: Column: 3
+#CHECK-NEXT: Discriminator: 2
+
+#CHECK: some more text
diff --git a/test/tools/llvm-xray/X86/Inputs/fdr-log-version-1.xray b/test/tools/llvm-xray/X86/Inputs/fdr-log-version-1.xray
new file mode 100644
index 0000000000000000000000000000000000000000..628be9a5dc2065857028bf8da5ac974cba2db1a5
Binary files /dev/null and b/test/tools/llvm-xray/X86/Inputs/fdr-log-version-1.xray differ
diff --git a/test/tools/llvm-xray/X86/Inputs/simple-xray-instrmap.yaml b/test/tools/llvm-xray/X86/Inputs/simple-xray-instrmap.yaml
index 483d3e4f2c8fa332dbe8859c9dd07d197841bd10..9c2493392f038633bde32d19efdca4f43d82e497 100644
--- a/test/tools/llvm-xray/X86/Inputs/simple-xray-instrmap.yaml
+++ b/test/tools/llvm-xray/X86/Inputs/simple-xray-instrmap.yaml
@@ -1,14 +1,8 @@
 ---
-- { id: 1, address: 0x000000000041CA40, function: 0x000000000041CA40, kind: function-enter, 
-    always-instrument: true }
-- { id: 1, address: 0x000000000041CA50, function: 0x000000000041CA40, kind: tail-exit, 
-    always-instrument: true }
-- { id: 2, address: 0x000000000041CA70, function: 0x000000000041CA70, kind: function-enter, 
-    always-instrument: true }
-- { id: 2, address: 0x000000000041CA7C, function: 0x000000000041CA70, kind: tail-exit, 
-    always-instrument: true }
-- { id: 3, address: 0x000000000041CAA0, function: 0x000000000041CAA0, kind: function-enter, 
-    always-instrument: true }
-- { id: 3, address: 0x000000000041CAB4, function: 0x000000000041CAA0, kind: function-exit, 
-    always-instrument: true }
+- { id: 1, address: 0x000000000041CA40, function: 0x000000000041CA40, kind: function-enter, always-instrument: true }
+- { id: 1, address: 0x000000000041CA50, function: 0x000000000041CA40, kind: tail-exit, always-instrument: true }
+- { id: 2, address: 0x000000000041CA70, function: 0x000000000041CA70, kind: function-enter, always-instrument: true }
+- { id: 2, address: 0x000000000041CA7C, function: 0x000000000041CA70, kind: tail-exit, always-instrument: true }
+- { id: 3, address: 0x000000000041CAA0, function: 0x000000000041CAA0, kind: function-enter, always-instrument: true }
+- { id: 3, address: 0x000000000041CAB4, function: 0x000000000041CAA0, kind: function-exit, always-instrument: true }
 ...
diff --git a/test/tools/llvm-xray/X86/account-deduce-tail-call.yaml b/test/tools/llvm-xray/X86/account-deduce-tail-call.yaml
index 6e926974141f378a9f387169dc052b17bcf75d32..e8b46cbf1766b244442867613dc141a9d4317624 100644
--- a/test/tools/llvm-xray/X86/account-deduce-tail-call.yaml
+++ b/test/tools/llvm-xray/X86/account-deduce-tail-call.yaml
@@ -1,4 +1,4 @@
-#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -d | FileCheck %s
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -d | FileCheck %s
 ---
 header:
   version: 1
diff --git a/test/tools/llvm-xray/X86/account-keep-going.yaml b/test/tools/llvm-xray/X86/account-keep-going.yaml
index 1b234c0d7e8e4adc6d2d2f45a01b3c12c1db3a7f..76011ee8e6e5e66fa43cc19da0e06cc071972a4f 100644
--- a/test/tools/llvm-xray/X86/account-keep-going.yaml
+++ b/test/tools/llvm-xray/X86/account-keep-going.yaml
@@ -1,4 +1,4 @@
-#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -k | FileCheck %s
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -k | FileCheck %s
 ---
 header:
   version: 1
diff --git a/test/tools/llvm-xray/X86/account-simple-case.yaml b/test/tools/llvm-xray/X86/account-simple-case.yaml
index 82d83aae033ef4792f10d1554d80d0db012576c4..c995a7a77dfed3ed4bf5ca1c284ae7bec9b84777 100644
--- a/test/tools/llvm-xray/X86/account-simple-case.yaml
+++ b/test/tools/llvm-xray/X86/account-simple-case.yaml
@@ -1,4 +1,4 @@
-#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml | FileCheck %s
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml | FileCheck %s
 ---
 header:
   version: 1
@@ -7,10 +7,8 @@ header:
   nonstop-tsc: true
   cycle-frequency: 2601000000
 records:
-  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter,
-    tsc: 10001 }
-  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit,
-    tsc: 10100 }
+  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 10001 }
+  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, tsc: 10100 }
 ...
 
 #CHECK:       Functions with latencies: 1
diff --git a/test/tools/llvm-xray/X86/account-simple-sorting.yaml b/test/tools/llvm-xray/X86/account-simple-sorting.yaml
index d25aef24a272110d16a063c270f7fba87eb11e54..e0f32696caf0fd6d73f9c98e2f7422bf341628b2 100644
--- a/test/tools/llvm-xray/X86/account-simple-sorting.yaml
+++ b/test/tools/llvm-xray/X86/account-simple-sorting.yaml
@@ -1,13 +1,13 @@
-#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml | FileCheck --check-prefix DEFAULT %s
-#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s count | FileCheck --check-prefix COUNT-ASC %s
-#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s min | FileCheck --check-prefix MIN-ASC %s
-#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s max | FileCheck --check-prefix MAX-ASC %s
-#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s sum | FileCheck --check-prefix SUM-ASC %s
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml | FileCheck --check-prefix DEFAULT %s
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -s count | FileCheck --check-prefix COUNT-ASC %s
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -s min | FileCheck --check-prefix MIN-ASC %s
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -s max | FileCheck --check-prefix MAX-ASC %s
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -s sum | FileCheck --check-prefix SUM-ASC %s
 
-#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s count -r dsc | FileCheck --check-prefix COUNT-DSC %s
-#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s min -r dsc | FileCheck --check-prefix MIN-DSC %s
-#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s max -r dsc | FileCheck --check-prefix MAX-DSC %s
-#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -s sum -r dsc | FileCheck --check-prefix SUM-DSC %s
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -s count -r dsc | FileCheck --check-prefix COUNT-DSC %s
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -s min -r dsc | FileCheck --check-prefix MIN-DSC %s
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -s max -r dsc | FileCheck --check-prefix MAX-DSC %s
+#RUN: llvm-xray account %s -o - -m %S/Inputs/simple-instrmap.yaml -s sum -r dsc | FileCheck --check-prefix SUM-DSC %s
 ---
 header:
   version: 1
@@ -17,27 +17,17 @@ header:
   cycle-frequency: 1
 records:
   # Function id: 1
-  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter,
-    tsc: 10001 }
-  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit,
-    tsc: 10100 }
-  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter,
-    tsc: 10101 }
-  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit,
-    tsc: 10200 }
-  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter,
-    tsc: 10201 }
-  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit,
-    tsc: 10300 }
+  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 10001 }
+  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, tsc: 10100 }
+  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 10101 }
+  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, tsc: 10200 }
+  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 10201 }
+  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, tsc: 10300 }
   # Function id: 2
-  - { type: 0, func-id: 2, cpu: 1, thread: 222, kind: function-enter,
-    tsc: 10001 }
-  - { type: 0, func-id: 2, cpu: 1, thread: 222, kind: function-exit,
-    tsc: 10002 }
-  - { type: 0, func-id: 2, cpu: 1, thread: 222, kind: function-enter,
-    tsc: 10101 }
-  - { type: 0, func-id: 2, cpu: 1, thread: 222, kind: function-exit,
-    tsc: 10102 }
+  - { type: 0, func-id: 2, cpu: 1, thread: 222, kind: function-enter, tsc: 10001 }
+  - { type: 0, func-id: 2, cpu: 1, thread: 222, kind: function-exit, tsc: 10002 }
+  - { type: 0, func-id: 2, cpu: 1, thread: 222, kind: function-enter, tsc: 10101 }
+  - { type: 0, func-id: 2, cpu: 1, thread: 222, kind: function-exit, tsc: 10102 }
 
 #DEFAULT:       Functions with latencies: 2
 #DEFAULT-NEXT:  funcid  count  [ min, med, 90p, 99p, max] sum function
diff --git a/test/tools/llvm-xray/X86/convert-fdr-to-yaml.txt b/test/tools/llvm-xray/X86/convert-fdr-to-yaml.txt
new file mode 100644
index 0000000000000000000000000000000000000000..5a5852e7201890b84074af0fc118dd651bed51ef
--- /dev/null
+++ b/test/tools/llvm-xray/X86/convert-fdr-to-yaml.txt
@@ -0,0 +1,24 @@
+; RUN: llvm-xray convert %S/Inputs/fdr-log-version-1.xray -f=yaml -o - | FileCheck %s
+
+; CHECK:      ---
+; CHECK-NEXT: header:
+; CHECK-NEXT:   version:         1
+; CHECK-NEXT:   type:            1
+; CHECK-NEXT:   constant-tsc:    true
+; CHECK-NEXT:   nonstop-tsc:     true
+; CHECK-NEXT:   cycle-frequency: 5678
+; CHECK-NEXT: records:
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407340 }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407346 }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407347 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407387 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407437 }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407467 }
+; CHECK-NEXT:   - { type: 0, func-id: 4, function: '4', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407492 }
+; CHECK-NEXT:   - { type: 0, func-id: 5, function: '5', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407517 }
+; CHECK-NEXT:   - { type: 0, func-id: 5, function: '5', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407542 }
+; CHECK-NEXT:   - { type: 0, func-id: 268435455, function: '268435455', cpu: 5, thread: 5, kind: function-enter, tsc: 7238225556407552 }
+; CHECK-NEXT:   - { type: 0, func-id: 268435455, function: '268435455', cpu: 5, thread: 5, kind: function-exit, tsc: 7238225556407562 }
+; CHECK-NEXT:   - { type: 0, func-id: 6, function: '6', cpu: 6, thread: 5, kind: function-enter, tsc: 7238225556407682 }
+; CHECK-NEXT:   - { type: 0, func-id: 6, function: '6', cpu: 6, thread: 5, kind: function-exit, tsc: 7238225556407755 }
+; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-roundtrip.yaml b/test/tools/llvm-xray/X86/convert-roundtrip.yaml
index 8444262842643f54350f1efe6722678316284e6a..4c5dfd181488deff85fd448d8fbe4689a1b68e77 100644
--- a/test/tools/llvm-xray/X86/convert-roundtrip.yaml
+++ b/test/tools/llvm-xray/X86/convert-roundtrip.yaml
@@ -7,10 +7,8 @@ header:
   nonstop-tsc: true
   cycle-frequency: 2601000000
 records:
-  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter,
-    tsc: 10001 }
-  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit,
-    tsc: 10100 }
+  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 10001 }
+  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, tsc: 10100 }
 ...
 
 #CHECK:       ---
@@ -21,8 +19,6 @@ records:
 #CHECK-NEXT:    nonstop-tsc: true
 #CHECK-NEXT:    cycle-frequency: 2601000000
 #CHECK-NEXT:  records:
-#CHECK-NEXT:    - { type: 0, func-id: 1, function: '1', cpu: 1, thread: 111, kind: function-enter,
-#CHECK-NEXT:      tsc: 10001 }
-#CHECK-NEXT:    - { type: 0, func-id: 1, function: '1', cpu: 1, thread: 111, kind: function-exit,
-#CHECK-NEXT:      tsc: 10100 }
+#CHECK-NEXT:    - { type: 0, func-id: 1, function: '1', cpu: 1, thread: 111, kind: function-enter, tsc: 10001 }
+#CHECK-NEXT:    - { type: 0, func-id: 1, function: '1', cpu: 1, thread: 111, kind: function-exit, tsc: 10100 }
 #CHECK-NEXT:  ...
diff --git a/test/tools/llvm-xray/X86/convert-to-yaml.txt b/test/tools/llvm-xray/X86/convert-to-yaml.txt
index c402bc18d83dcfb63ff2e5936c5a867a15ed26d0..66a5618e12f6eaa011990a6979288d3b679bc38a 100644
--- a/test/tools/llvm-xray/X86/convert-to-yaml.txt
+++ b/test/tools/llvm-xray/X86/convert-to-yaml.txt
@@ -8,16 +8,10 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 2601000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-enter,
-; CHECK-NEXT:       tsc: 3315356841453914 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-enter,
-; CHECK-NEXT:       tsc: 3315356841454542 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-exit,
-; CHECK-NEXT:       tsc: 3315356841454670 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-enter,
-; CHECK-NEXT:       tsc: 3315356841454762 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-exit,
-; CHECK-NEXT:       tsc: 3315356841454802 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-exit,
-; CHECK-NEXT:       tsc: 3315356841494828 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914 }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542 }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670 }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762 }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828 }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-with-debug-syms.txt b/test/tools/llvm-xray/X86/convert-with-debug-syms.txt
index ddb8b6bdb1cc8f91fdc35d05ac729cde35d98307..76cee99d4b51ed52771783c72286cf54bee8c0ce 100644
--- a/test/tools/llvm-xray/X86/convert-with-debug-syms.txt
+++ b/test/tools/llvm-xray/X86/convert-with-debug-syms.txt
@@ -8,16 +8,10 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 2601000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: main, cpu: 37, thread: 84697, kind: function-enter,
-; CHECK-NEXT:       tsc: 3315356841453914 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: {{.*foo.*}}, cpu: 37, thread: 84697, kind: function-enter,
-; CHECK-NEXT:       tsc: 3315356841454542 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: {{.*foo.*}}, cpu: 37, thread: 84697, kind: function-exit,
-; CHECK-NEXT:       tsc: 3315356841454670 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: {{.*bar.*}}, cpu: 37, thread: 84697, kind: function-enter,
-; CHECK-NEXT:       tsc: 3315356841454762 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: {{.*bar.*}}, cpu: 37, thread: 84697, kind: function-exit,
-; CHECK-NEXT:       tsc: 3315356841454802 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: main, cpu: 37, thread: 84697, kind: function-exit,
-; CHECK-NEXT:       tsc: 3315356841494828 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: main, cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914 }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: {{.*foo.*}}, cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542 }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: {{.*foo.*}}, cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670 }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: {{.*bar.*}}, cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762 }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: {{.*bar.*}}, cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: main, cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828 }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt b/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt
index 71c17280df402c89afcbe4d098bb72a4f6cd01a6..700fa38ed38c61037a2f16b967cac82105a08192 100644
--- a/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt
+++ b/test/tools/llvm-xray/X86/convert-with-standalone-instrmap.txt
@@ -8,16 +8,10 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 2601000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '@(41caa0)', cpu: 37, thread: 84697,
-; CHECK-NEXT:       kind: function-enter, tsc: 3315356841453914 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '@(41ca70)', cpu: 37, thread: 84697,
-; CHECK-NEXT:       kind: function-enter, tsc: 3315356841454542 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '@(41ca70)', cpu: 37, thread: 84697,
-; CHECK-NEXT:       kind: function-exit, tsc: 3315356841454670 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '@(41ca40)', cpu: 37, thread: 84697,
-; CHECK-NEXT:       kind: function-enter, tsc: 3315356841454762 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '@(41ca40)', cpu: 37, thread: 84697,
-; CHECK-NEXT:       kind: function-exit, tsc: 3315356841454802 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '@(41caa0)', cpu: 37, thread: 84697,
-; CHECK-NEXT:       kind: function-exit, tsc: 3315356841494828 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '@(41caa0)', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914 }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '@(41ca70)', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542 }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '@(41ca70)', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670 }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '@(41ca40)', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762 }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '@(41ca40)', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '@(41caa0)', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828 }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt b/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt
index 01191c9c2a31eba71fec194348bc1f3d426c359f..6837072a1fc5f692b042e58934ffdb2ec0c76cb5 100644
--- a/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt
+++ b/test/tools/llvm-xray/X86/convert-with-yaml-instrmap.txt
@@ -1,4 +1,4 @@
-; RUN: llvm-xray convert -m %S/Inputs/simple-xray-instrmap.yaml -t yaml %S/Inputs/naive-log-simple.xray -f=yaml -o - | FileCheck %s
+; RUN: llvm-xray convert -m %S/Inputs/simple-xray-instrmap.yaml %S/Inputs/naive-log-simple.xray -f=yaml -o - | FileCheck %s
 
 ; CHECK:      ---
 ; CHECK-NEXT: header:
@@ -8,16 +8,10 @@
 ; CHECK-NEXT:   nonstop-tsc:     true
 ; CHECK-NEXT:   cycle-frequency: 2601000000
 ; CHECK-NEXT: records:
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-enter,
-; CHECK-NEXT:       tsc: 3315356841453914 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-enter,
-; CHECK-NEXT:       tsc: 3315356841454542 }
-; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-exit,
-; CHECK-NEXT:       tsc: 3315356841454670 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-enter,
-; CHECK-NEXT:       tsc: 3315356841454762 }
-; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-exit,
-; CHECK-NEXT:       tsc: 3315356841454802 }
-; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-exit,
-; CHECK-NEXT:       tsc: 3315356841494828 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841453914 }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454542 }
+; CHECK-NEXT:   - { type: 0, func-id: 2, function: '2', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454670 }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-enter, tsc: 3315356841454762 }
+; CHECK-NEXT:   - { type: 0, func-id: 1, function: '1', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841454802 }
+; CHECK-NEXT:   - { type: 0, func-id: 3, function: '3', cpu: 37, thread: 84697, kind: function-exit, tsc: 3315356841494828 }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/extract-instrmap.ll b/test/tools/llvm-xray/X86/extract-instrmap.ll
index 8155d57f814b4f618058785aac67cb4b46a459d9..7447aec68114485bf67aa823426f3f1f1c708c22 100644
--- a/test/tools/llvm-xray/X86/extract-instrmap.ll
+++ b/test/tools/llvm-xray/X86/extract-instrmap.ll
@@ -4,12 +4,8 @@
 ; RUN: llvm-xray extract %S/Inputs/elf64-example.bin | FileCheck %s
 
 ; CHECK:      ---
-; CHECK-NEXT: - { id: 1, address: 0x000000000041C900, function: 0x000000000041C900, kind: function-enter,
-; CHECK-NEXT:     always-instrument: true }
-; CHECK-NEXT: - { id: 1, address: 0x000000000041C912, function: 0x000000000041C900, kind: function-exit,
-; CHECK-NEXT:     always-instrument: true }
-; CHECK-NEXT: - { id: 2, address: 0x000000000041C930, function: 0x000000000041C930, kind: function-enter,
-; CHECK-NEXT:    always-instrument: true }
-; CHECK-NEXT: - { id: 2, address: 0x000000000041C946, function: 0x000000000041C930, kind: function-exit,
-; CHECK-NEXT:     always-instrument: true }
+; CHECK-NEXT: - { id: 1, address: 0x000000000041C900, function: 0x000000000041C900, kind: function-enter, always-instrument: true }
+; CHECK-NEXT: - { id: 1, address: 0x000000000041C912, function: 0x000000000041C900, kind: function-exit, always-instrument: true }
+; CHECK-NEXT: - { id: 2, address: 0x000000000041C930, function: 0x000000000041C930, kind: function-enter, always-instrument: true }
+; CHECK-NEXT: - { id: 2, address: 0x000000000041C946, function: 0x000000000041C930, kind: function-exit, always-instrument: true }
 ; CHECK-NEXT: ...
diff --git a/test/tools/llvm-xray/X86/graph-color-simple-case.yaml b/test/tools/llvm-xray/X86/graph-color-simple-case.yaml
index e1d0d9ad52c91e824e9956d0fc389ba89b07ba2c..3950c8c99962e4e09fd069641574e3cc11d0026e 100644
--- a/test/tools/llvm-xray/X86/graph-color-simple-case.yaml
+++ b/test/tools/llvm-xray/X86/graph-color-simple-case.yaml
@@ -1,6 +1,6 @@
-#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -e sum -c sum \
+#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -e sum -c sum \
 #RUN:    | FileCheck %s -check-prefix=EDGE
-#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -v sum -b sum \
+#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -v sum -b sum \
 #RUN:    | FileCheck %s -check-prefix=VERTEX
 ---
 header:
@@ -32,15 +32,15 @@ records:
 
 
 #EDGE:     digraph xray {
-#EDGE-DAG: F0 -> F7 [label="7.{{[0-9]*}}e+01" color="#B00100"];
-#EDGE-DAG: F0 -> F2 [label="2.{{[0-9]*}}e+01" color="#FD9965"];
+#EDGE-DAG: F0 -> F7 [label="7.{{[0-9]*}}e+01" color="#B00000"];
+#EDGE-DAG: F0 -> F2 [label="2.{{[0-9]*}}e+01" color="#FC9963"];
 #EDGE-DAG: F0 -> F9 [label="9.{{[0-9]*}}e+01" color="#7F0000"];
-#EDGE-DAG: F0 -> F4 [label="4.{{[0-9]*}}e+01" color="#E8543b"];
-#EDGE-DAG: F0 -> F6 [label="6.{{[0-9]*}}e+01" color="#C5140a"];
-#EDGE-DAG: F0 -> F1 [label="1.{{[0-9]*}}e+01" color="#FDC58c"];
-#EDGE-DAG: F0 -> F8 [label="8.{{[0-9]*}}e+01" color="#990101"];
-#EDGE-DAG: F0 -> F3 [label="3.{{[0-9]*}}e+01" color="#F5744d"];
-#EDGE-DAG: F0 -> F5 [label="5.{{[0-9]*}}e+01" color="#D83323"];
+#EDGE-DAG: F0 -> F4 [label="4.{{[0-9]*}}e+01" color="#E75339"];
+#EDGE-DAG: F0 -> F6 [label="6.{{[0-9]*}}e+01" color="#C4150D"];
+#EDGE-DAG: F0 -> F1 [label="1.{{[0-9]*}}e+01" color="#FDC48D"];
+#EDGE-DAG: F0 -> F8 [label="8.{{[0-9]*}}e+01" color="#970000"];
+#EDGE-DAG: F0 -> F3 [label="3.{{[0-9]*}}e+01" color="#F4744E"];
+#EDGE-DAG: F0 -> F5 [label="5.{{[0-9]*}}e+01" color="#D83220"];
 #EDGE-DAG: F7 [label="@(7)"];
 #EDGE-DAG: F2 [label="@(2)"];
 #EDGE-DAG: F9 [label="@(9)"];
@@ -63,13 +63,13 @@ records:
 #VERTEX-DAG: F0 -> F8 [label=""];
 #VERTEX-DAG: F0 -> F3 [label=""];
 #VERTEX-DAG: F0 -> F5 [label=""];
-#VERTEX-DAG: F7 [label="{@(7)|7.{{[0-9]*}}e+01}" color="#B00100"];
-#VERTEX-DAG: F2 [label="{@(2)|2.{{[0-9]*}}e+01}" color="#FD9965"];
+#VERTEX-DAG: F7 [label="{@(7)|7.{{[0-9]*}}e+01}" color="#B00000"];
+#VERTEX-DAG: F2 [label="{@(2)|2.{{[0-9]*}}e+01}" color="#FC9963"];
 #VERTEX-DAG: F9 [label="{@(9)|9.{{[0-9]*}}e+01}" color="#7F0000"];
-#VERTEX-DAG: F4 [label="{@(4)|4.{{[0-9]*}}e+01}" color="#E8543b"];
-#VERTEX-DAG: F6 [label="{@(6)|6.{{[0-9]*}}e+01}" color="#C5140a"];
-#VERTEX-DAG: F1 [label="{@(1)|1.{{[0-9]*}}e+01}" color="#FDC58c"];
-#VERTEX-DAG: F8 [label="{@(8)|8.{{[0-9]*}}e+01}" color="#990101"];
-#VERTEX-DAG: F3 [label="{@(3)|3.{{[0-9]*}}e+01}" color="#F5744d"];
-#VERTEX-DAG: F5 [label="{@(5)|5.{{[0-9]*}}e+01}" color="#D83323"];
+#VERTEX-DAG: F4 [label="{@(4)|4.{{[0-9]*}}e+01}" color="#E75339"];
+#VERTEX-DAG: F6 [label="{@(6)|6.{{[0-9]*}}e+01}" color="#C4150D"];
+#VERTEX-DAG: F1 [label="{@(1)|1.{{[0-9]*}}e+01}" color="#FDC48D"];
+#VERTEX-DAG: F8 [label="{@(8)|8.{{[0-9]*}}e+01}" color="#970000"];
+#VERTEX-DAG: F3 [label="{@(3)|3.{{[0-9]*}}e+01}" color="#F4744E"];
+#VERTEX-DAG: F5 [label="{@(5)|5.{{[0-9]*}}e+01}" color="#D83220"];
 #VERTEX-NEXT: }
diff --git a/test/tools/llvm-xray/X86/graph-deduce-tail-call.yaml b/test/tools/llvm-xray/X86/graph-deduce-tail-call.yaml
index 1654f672110f16cd11316f91d6b307ce1dd4e52f..6f756bf018f904a99ba79964e414ea0d8b614406 100644
--- a/test/tools/llvm-xray/X86/graph-deduce-tail-call.yaml
+++ b/test/tools/llvm-xray/X86/graph-deduce-tail-call.yaml
@@ -1,19 +1,19 @@
-#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -d \
+#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -d \
 #RUN:    | FileCheck %s -check-prefix=EMPTY
-#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -d -e count \
+#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -d -e count \
 #RUN:    | FileCheck %s -check-prefix=COUNT
 #
-#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -d -e min \
+#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -d -e min \
 #RUN:    | FileCheck %s -check-prefix=TIME
-#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -d -e med \
+#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -d -e med \
 #RUN:    | FileCheck %s -check-prefix=TIME
-#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -d -e 90p \
+#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -d -e 90p \
 #RUN:    | FileCheck %s -check-prefix=TIME
-#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -d -e 99p \
+#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -d -e 99p \
 #RUN:    | FileCheck %s -check-prefix=TIME
-#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -d -e max \
+#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -d -e max \
 #RUN:    | FileCheck %s -check-prefix=TIME
-#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -d -e sum \
+#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -d -e sum \
 #RUN:    | FileCheck %s -check-prefix=TIME
 #
 ---
diff --git a/test/tools/llvm-xray/X86/graph-simple-case.yaml b/test/tools/llvm-xray/X86/graph-simple-case.yaml
index 0b465d091345e19daaae77259af95990f9a73f93..b0d6dcf2fb4cb753c7d4342d9656783f5e98bfc6 100644
--- a/test/tools/llvm-xray/X86/graph-simple-case.yaml
+++ b/test/tools/llvm-xray/X86/graph-simple-case.yaml
@@ -1,19 +1,19 @@
-#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml \
+#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml \
 #RUN:    | FileCheck %s -check-prefix=EMPTY
-#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -e count \
+#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -e count \
 #RUN:    | FileCheck %s -check-prefix=COUNT
 #
-#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -e min \
+#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -e min \
 #RUN:    | FileCheck %s -check-prefix=TIME
-#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -e med \
+#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -e med \
 #RUN:    | FileCheck %s -check-prefix=TIME
-#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -e 90p \
+#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -e 90p \
 #RUN:    | FileCheck %s -check-prefix=TIME
-#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -e 99p \
+#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -e 99p \
 #RUN:    | FileCheck %s -check-prefix=TIME
-#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -e max \
+#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -e max \
 #RUN:    | FileCheck %s -check-prefix=TIME
-#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -t yaml -e sum \
+#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml -e sum \
 #RUN:    | FileCheck %s -check-prefix=TIME
 ---
 header:
@@ -23,10 +23,8 @@ header:
   nonstop-tsc: true
   cycle-frequency: 2601000000
 records:
-  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter,
-    tsc: 10001 }
-  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit,
-    tsc: 10100 }
+  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 10001 }
+  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, tsc: 10100 }
 ...
 
 
diff --git a/test/tools/llvm-xray/X86/graph-zero-latency-calls.yaml b/test/tools/llvm-xray/X86/graph-zero-latency-calls.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..602f209072afc7f53bf81dc1c18967805064a120
--- /dev/null
+++ b/test/tools/llvm-xray/X86/graph-zero-latency-calls.yaml
@@ -0,0 +1,20 @@
+#RUN: llvm-xray graph %s -o - -m %S/Inputs/simple-instrmap.yaml | FileCheck %s
+
+---
+header:
+  version: 1
+  type: 0
+  constant-tsc: true
+  nonstop-tsc: true
+  cycle-frequency: 2601000000
+records:
+  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-enter, tsc: 10001 }
+  - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-enter, tsc: 10002 }
+  - { type: 0, func-id: 2, cpu: 1, thread: 111, kind: function-exit, tsc: 10002 }
+  - { type: 0, func-id: 1, cpu: 1, thread: 111, kind: function-exit, tsc: 10100 }
+...
+
+#CHECK:     digraph xray {
+#CHECK-DAG:   F0 -> F1 [{{.*}}];
+#CHECK-DAG:   F1 -> F2 [{{.*}}];
+#CHECK-DAG: }
diff --git a/test/tools/llvm-xray/X86/no-subcommand-noassert.txt b/test/tools/llvm-xray/X86/no-subcommand-noassert.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3de328d73a433057a1636c6bfa03b9a1fff71bec
--- /dev/null
+++ b/test/tools/llvm-xray/X86/no-subcommand-noassert.txt
@@ -0,0 +1,3 @@
+; RUN: llvm-xray | FileCheck %s
+
+; CHECK:  OVERVIEW: XRay Tools
diff --git a/test/tools/sancov/AArch64/print_coverage_pcs.test b/test/tools/sancov/AArch64/print_coverage_pcs.test
index b71eabb8a357072b13abd600b4beb94e4dd63b14..d5fff4db1320db550c118dccc5c849a00da93594 100644
--- a/test/tools/sancov/AArch64/print_coverage_pcs.test
+++ b/test/tools/sancov/AArch64/print_coverage_pcs.test
@@ -1,4 +1,4 @@
 REQUIRES: aarch64-registered-target
 RUN: not sancov -print-coverage-pcs %p/../Inputs/test-linux_android_aarch64 2>&1 | FileCheck %s --check-prefix=AARCH64
 
-AARCH64: Error: __sanitizer_cov* functions not found
+AARCH64: ERROR: __sanitizer_cov* functions not found
diff --git a/test/tools/sancov/validation.test b/test/tools/sancov/validation.test
new file mode 100644
index 0000000000000000000000000000000000000000..437870cf597bfba16ac7538e062dafde21a2d9fb
--- /dev/null
+++ b/test/tools/sancov/validation.test
@@ -0,0 +1,6 @@
+REQUIRES: x86_64-linux
+RUN: not sancov -covered-functions %p/Inputs/test-linux_x86_64 2>&1 | FileCheck --check-prefix=NOCFILE %s
+
+NOCFILE: WARNING: No coverage file for {{.*}}test-linux_x86_64
+NOCFILE: ERROR: No valid coverage files given.
+
diff --git a/test/tools/yaml2obj/invalid_output_file.test b/test/tools/yaml2obj/invalid_output_file.test
new file mode 100644
index 0000000000000000000000000000000000000000..3045a0b21f56f161d5dc8711606aa8f8c5027095
--- /dev/null
+++ b/test/tools/yaml2obj/invalid_output_file.test
@@ -0,0 +1,4 @@
+# RUN: not yaml2obj -o %p/path/does/not/exist 2>&1 | FileCheck %s
+
+# Don't check the OS-dependent message "No such file or directory".
+# CHECK: yaml2obj: Error opening '{{.*}}/path/does/not/exist': {{.*}}
diff --git a/test/tools/yaml2obj/lit.local.cfg b/test/tools/yaml2obj/lit.local.cfg
new file mode 100644
index 0000000000000000000000000000000000000000..8169b9f95e118cf2bb7e20d91d4fe1e3c1fb67d3
--- /dev/null
+++ b/test/tools/yaml2obj/lit.local.cfg
@@ -0,0 +1 @@
+config.suffixes = ['.yaml']
diff --git a/test/tools/yaml2obj/missing_document_tag.yaml b/test/tools/yaml2obj/missing_document_tag.yaml
index 3cec172d580136eea3043bc2ab0372e8020e9c71..8cfd9a16700a30ab95ecac5014ac3f97ced8f172 100644
--- a/test/tools/yaml2obj/missing_document_tag.yaml
+++ b/test/tools/yaml2obj/missing_document_tag.yaml
@@ -4,3 +4,6 @@
 DummyData:
   foo:           0
 ...
+
+# CHECK: YAML:4:1: error: YAML Object File missing document type tag!
+# CHECK: yaml2obj: Failed to parse YAML file!
diff --git a/test/tools/yaml2obj/unsupported_document_tag.yaml b/test/tools/yaml2obj/unsupported_document_tag.yaml
index e73d450a9bb817a6deafbe65afef0360d1fdf3ab..b25b08096cfbd69c8d179dbef4dbf9cf07bdec48 100644
--- a/test/tools/yaml2obj/unsupported_document_tag.yaml
+++ b/test/tools/yaml2obj/unsupported_document_tag.yaml
@@ -5,4 +5,4 @@ DummyData:
   foo:           0
 ...
 
-#check error: YAML Object File unsupported document type tag '!unsupported-tag'!
+# CHECK: error: YAML Object File unsupported document type tag '!unsupported-tag'!
diff --git a/tools/bugpoint/CrashDebugger.cpp b/tools/bugpoint/CrashDebugger.cpp
index 0cae0669477f2f5be640ca44cd384c771d9eb516..c076309b22bb9a73480dd327631d1ee2337e313a 100644
--- a/tools/bugpoint/CrashDebugger.cpp
+++ b/tools/bugpoint/CrashDebugger.cpp
@@ -731,7 +731,8 @@ bool ReduceCrashingInstructions::TestInsts(
       for (BasicBlock::iterator I = FI->begin(), E = FI->end(); I != E;) {
         Instruction *Inst = &*I++;
         if (!Instructions.count(Inst) && !isa<TerminatorInst>(Inst) &&
-            !Inst->isEHPad() && !Inst->getType()->isTokenTy()) {
+            !Inst->isEHPad() && !Inst->getType()->isTokenTy() &&
+            !Inst->isSwiftError()) {
           if (!Inst->getType()->isVoidTy())
             Inst->replaceAllUsesWith(UndefValue::get(Inst->getType()));
           Inst->eraseFromParent();
@@ -1015,7 +1016,8 @@ static Error ReduceInsts(BugDriver &BD,
                 // TODO: Should this be some kind of interrupted error?
                 return Error::success();
 
-              if (I->isEHPad() || I->getType()->isTokenTy())
+              if (I->isEHPad() || I->getType()->isTokenTy() ||
+                  I->isSwiftError())
                 continue;
 
               outs() << "Checking instruction: " << *I;
diff --git a/tools/bugpoint/ExtractFunction.cpp b/tools/bugpoint/ExtractFunction.cpp
index d57613ec5e37688f6a744e4ec3e5be2b9f651220..82c61b6e1be7aa52a76750c82b4cbbc9ea378e1d 100644
--- a/tools/bugpoint/ExtractFunction.cpp
+++ b/tools/bugpoint/ExtractFunction.cpp
@@ -209,6 +209,7 @@ static void eliminateAliases(GlobalValue *GV) {
 void llvm::DeleteGlobalInitializer(GlobalVariable *GV) {
   eliminateAliases(GV);
   GV->setInitializer(nullptr);
+  GV->setComdat(nullptr);
 }
 
 // DeleteFunctionBody - "Remove" the function by deleting all of its basic
diff --git a/tools/bugpoint/FindBugs.cpp b/tools/bugpoint/FindBugs.cpp
index 156f4d0d78fe1ed0c25cdc657fc5d033a7c2231b..3093169ba8b0010b637cbc39e02e8c6b075130f0 100644
--- a/tools/bugpoint/FindBugs.cpp
+++ b/tools/bugpoint/FindBugs.cpp
@@ -21,6 +21,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <ctime>
+#include <random>
 using namespace llvm;
 
 Error
@@ -39,14 +40,13 @@ BugDriver::runManyPasses(const std::vector<std::string> &AllPasses) {
       return E;
   }
 
-  srand(time(nullptr));
-
+  std::mt19937 randomness(std::random_device{}());
   unsigned num = 1;
   while (1) {
     //
     // Step 1: Randomize the order of the optimizer passes.
     //
-    std::random_shuffle(PassesToRun.begin(), PassesToRun.end());
+    std::shuffle(PassesToRun.begin(), PassesToRun.end(), randomness);
 
     //
     // Step 2: Run optimizer passes on the program and check for success.
diff --git a/tools/bugpoint/ListReducer.h b/tools/bugpoint/ListReducer.h
index dcfa11d06927f42db05aa1f5838c106a5ca93fe8..0f9db022d555e34ae6ba536648015e47efbe7dc8 100644
--- a/tools/bugpoint/ListReducer.h
+++ b/tools/bugpoint/ListReducer.h
@@ -19,6 +19,7 @@
 #include "llvm/Support/raw_ostream.h"
 #include <algorithm>
 #include <cstdlib>
+#include <random>
 #include <vector>
 
 namespace llvm {
@@ -46,7 +47,7 @@ template <typename ElTy> struct ListReducer {
   /// that bugpoint does.
   Expected<bool> reduceList(std::vector<ElTy> &TheList) {
     std::vector<ElTy> empty;
-    std::srand(0x6e5ea738); // Seed the random number generator
+    std::mt19937 randomness(0x6e5ea738);  // Seed the random number generator
     Expected<TestResult> Result = doTest(TheList, empty);
     if (Error E = Result.takeError())
       return std::move(E);
@@ -92,7 +93,7 @@ template <typename ElTy> struct ListReducer {
       // distribution (improving the speed of convergence).
       if (ShufflingEnabled && NumOfIterationsWithoutProgress > MaxIterations) {
         std::vector<ElTy> ShuffledList(TheList);
-        std::random_shuffle(ShuffledList.begin(), ShuffledList.end());
+        std::shuffle(ShuffledList.begin(), ShuffledList.end(), randomness);
         errs() << "\n\n*** Testing shuffled set...\n\n";
         // Check that random shuffle doesn't lose the bug
         Expected<TestResult> Result = doTest(ShuffledList, empty);
diff --git a/tools/bugpoint/Miscompilation.cpp b/tools/bugpoint/Miscompilation.cpp
index 792fab07bf11ebe5fe72c9d4d03d54d24dbaa062..80f4cea234815c4210511147f1a00a09b4b68ba0 100644
--- a/tools/bugpoint/Miscompilation.cpp
+++ b/tools/bugpoint/Miscompilation.cpp
@@ -225,19 +225,22 @@ public:
 /// output is different. If the DeleteInputs argument is set to true then this
 /// function deletes both input modules before it returns.
 ///
-static Expected<std::unique_ptr<Module>>
-testMergedProgram(const BugDriver &BD, std::unique_ptr<Module> M1,
-                  std::unique_ptr<Module> M2, bool &Broken) {
-  if (Linker::linkModules(*M1, std::move(M2)))
+static Expected<std::unique_ptr<Module>> testMergedProgram(const BugDriver &BD,
+                                                           const Module &M1,
+                                                           const Module &M2,
+                                                           bool &Broken) {
+  // Resulting merge of M1 and M2.
+  auto Merged = CloneModule(&M1);
+  if (Linker::linkModules(*Merged, CloneModule(&M2)))
     // TODO: Shouldn't we thread the error up instead of exiting?
     exit(1);
 
   // Execute the program.
-  Expected<bool> Diff = BD.diffProgram(M1.get(), "", "", false);
+  Expected<bool> Diff = BD.diffProgram(Merged.get(), "", "", false);
   if (Error E = Diff.takeError())
     return std::move(E);
   Broken = *Diff;
-  return std::move(M1);
+  return std::move(Merged);
 }
 
 /// TestFuncs - split functions in a Module into two groups: those that are
@@ -335,9 +338,8 @@ ExtractLoops(BugDriver &BD,
     // extraction.
     AbstractInterpreter *AI = BD.switchToSafeInterpreter();
     bool Failure;
-    Expected<std::unique_ptr<Module>> New =
-        testMergedProgram(BD, std::move(ToOptimizeLoopExtracted),
-                          std::move(ToNotOptimize), Failure);
+    Expected<std::unique_ptr<Module>> New = testMergedProgram(
+        BD, *ToOptimizeLoopExtracted, *ToNotOptimize, Failure);
     if (Error E = New.takeError())
       return std::move(E);
     if (!*New)
@@ -726,8 +728,7 @@ static Expected<bool> TestOptimizer(BugDriver &BD, std::unique_ptr<Module> Test,
 
   outs() << "  Checking to see if the merged program executes correctly: ";
   bool Broken;
-  auto Result =
-      testMergedProgram(BD, std::move(Optimized), std::move(Safe), Broken);
+  auto Result = testMergedProgram(BD, *Optimized, *Safe, Broken);
   if (Error E = Result.takeError())
     return std::move(E);
   if (auto New = std::move(*Result)) {
@@ -840,7 +841,7 @@ static void CleanupAndPrepareModules(BugDriver &BD,
   // Prototype: void *getPointerToNamedFunction(const char* Name)
   Constant *resolverFunc = Safe->getOrInsertFunction(
       "getPointerToNamedFunction", Type::getInt8PtrTy(Safe->getContext()),
-      Type::getInt8PtrTy(Safe->getContext()), (Type *)nullptr);
+      Type::getInt8PtrTy(Safe->getContext()));
 
   // Use the function we just added to get addresses of functions we need.
   for (Module::iterator F = Safe->begin(), E = Safe->end(); F != E; ++F) {
diff --git a/tools/bugpoint/ToolRunner.cpp b/tools/bugpoint/ToolRunner.cpp
index 4633d64373368d591119a45849a7a45bc89335de..10532ef8395b883e3676de077509c3aa21a6121b 100644
--- a/tools/bugpoint/ToolRunner.cpp
+++ b/tools/bugpoint/ToolRunner.cpp
@@ -355,37 +355,62 @@ Expected<int> CustomExecutor::ExecuteProgram(
 // Tokenize the CommandLine to the command and the args to allow
 // defining a full command line as the command instead of just the
 // executed program. We cannot just pass the whole string after the command
-// as a single argument because then program sees only a single
+// as a single argument because then the program sees only a single
 // command line argument (with spaces in it: "foo bar" instead
 // of "foo" and "bar").
 //
-// code borrowed from:
-// http://oopweb.com/CPP/Documents/CPPHOWTO/Volume/C++Programming-HOWTO-7.html
+// Spaces are used as a delimiter; however, repeated, leading, and trailing
+// whitespace is ignored. Simple escaping is allowed via the '\'
+// character, as seen below:
+//
+// Two consecutive '\' evaluate to a single '\'.
+// A space after a '\' evaluates to a space that is not interpreted as a
+// delimiter.
+// Any other instances of the '\' character are removed.
+//
+// Example:
+// '\\' -> '\'
+// '\ ' -> ' '
+// 'exa\mple' -> 'example'
+//
 static void lexCommand(std::string &Message, const std::string &CommandLine,
                        std::string &CmdPath, std::vector<std::string> &Args) {
 
-  std::string Command = "";
-  std::string delimiters = " ";
-
-  std::string::size_type lastPos = CommandLine.find_first_not_of(delimiters, 0);
-  std::string::size_type pos = CommandLine.find_first_of(delimiters, lastPos);
-
-  while (std::string::npos != pos || std::string::npos != lastPos) {
-    std::string token = CommandLine.substr(lastPos, pos - lastPos);
-    if (Command == "")
-      Command = token;
-    else
-      Args.push_back(token);
-    // Skip delimiters.  Note the "not_of"
-    lastPos = CommandLine.find_first_not_of(delimiters, pos);
-    // Find next "non-delimiter"
-    pos = CommandLine.find_first_of(delimiters, lastPos);
+  std::string Token;
+  std::string Command;
+  bool FoundPath = false;
+
+  // The first token is the command (looked up in PATH below); later tokens
+  // are its arguments. Repeated, leading, and trailing spaces are skipped.
+  for (std::size_t Pos = 0u; Pos <= CommandLine.size(); ++Pos) {
+    if ('\\' == CommandLine[Pos]) {
+      if (Pos + 1 < CommandLine.size())
+        Token.push_back(CommandLine[++Pos]);
+
+      continue;
+    }
+    if (' ' == CommandLine[Pos] || CommandLine.size() == Pos) {
+      if (Token.empty())
+        continue;
+
+      if (!FoundPath) {
+        Command = Token;
+        FoundPath = true;
+        Token.clear();
+        continue;
+      }
+
+      Args.push_back(Token);
+      Token.clear();
+      continue;
+    }
+    Token.push_back(CommandLine[Pos]);
   }
 
   auto Path = sys::findProgramByName(Command);
   if (!Path) {
-    Message = std::string("Cannot find '") + Command + "' in PATH: " +
-              Path.getError().message() + "\n";
+    Message = std::string("Cannot find '") + Command +
+              "' in PATH: " + Path.getError().message() + "\n";
     return;
   }
   CmdPath = *Path;
diff --git a/tools/bugpoint/bugpoint.cpp b/tools/bugpoint/bugpoint.cpp
index a5de953b2b75bdaff7a98992ed8e0209f853274c..85c1ddd8277d9017331ef771f44c207354e1fc48 100644
--- a/tools/bugpoint/bugpoint.cpp
+++ b/tools/bugpoint/bugpoint.cpp
@@ -181,7 +181,8 @@ int main(int argc, char **argv) {
     if (OptLevelO1)
       Builder.Inliner = createAlwaysInlinerLegacyPass();
     else if (OptLevelOs || OptLevelO2)
-      Builder.Inliner = createFunctionInliningPass(2, OptLevelOs ? 1 : 0);
+      Builder.Inliner = createFunctionInliningPass(
+          2, OptLevelOs ? 1 : 0, false);
     else
       Builder.Inliner = createFunctionInliningPass(275);
     Builder.populateFunctionPassManager(PM);
diff --git a/tools/dsymutil/DwarfLinker.cpp b/tools/dsymutil/DwarfLinker.cpp
index 2fe4f29d40858e0579d62d9b23d64a4552239bdd..25f1a0f271223d46e7dc1a81ff3c097edf3c3a91 100644
--- a/tools/dsymutil/DwarfLinker.cpp
+++ b/tools/dsymutil/DwarfLinker.cpp
@@ -197,11 +197,8 @@ public:
 
   CompileUnit(DWARFUnit &OrigUnit, unsigned ID, bool CanUseODR,
               StringRef ClangModuleName)
-      : OrigUnit(OrigUnit), ID(ID), NewUnit(OrigUnit.getVersion(),
-                                            OrigUnit.getAddressByteSize(),
-                                            OrigUnit.getUnitDIE().getTag()),
-          LowPc(UINT64_MAX), HighPc(0), RangeAlloc(), Ranges(RangeAlloc),
-          ClangModuleName(ClangModuleName) {
+      : OrigUnit(OrigUnit), ID(ID), LowPc(UINT64_MAX), HighPc(0), RangeAlloc(),
+        Ranges(RangeAlloc), ClangModuleName(ClangModuleName) {
     Info.resize(OrigUnit.getNumDIEs());
 
     auto CUDie = OrigUnit.getUnitDIE(false);
@@ -219,8 +216,15 @@ public:
 
   unsigned getUniqueID() const { return ID; }
 
+  void createOutputDIE() {
+    NewUnit.emplace(OrigUnit.getVersion(), OrigUnit.getAddressByteSize(),
+                    OrigUnit.getUnitDIE().getTag());
+  }
+
   DIE *getOutputUnitDIE() const {
-    return &const_cast<DIEUnit &>(NewUnit).getUnitDie();
+    if (NewUnit)
+      return &const_cast<DIEUnit &>(*NewUnit).getUnitDie();
+    return nullptr;
   }
 
   bool hasODR() const { return HasODR; }
@@ -329,7 +333,7 @@ private:
   DWARFUnit &OrigUnit;
   unsigned ID;
   std::vector<DIEInfo> Info; ///< DIE info indexed by DIE index.
-  DIEUnit NewUnit;
+  Optional<DIEUnit> NewUnit;
 
   uint64_t StartOffset;
   uint64_t NextUnitOffset;
@@ -359,7 +363,7 @@ private:
   Optional<PatchLocation> UnitRangeAttribute;
   /// @}
 
-  /// \brief Location attributes that need to be transfered from th
+  /// \brief Location attributes that need to be transferred from the
   /// original debug_loc section to the liked one. They are stored
   /// along with the PC offset that is to be applied to their
   /// function's address.
@@ -397,7 +401,8 @@ uint64_t CompileUnit::computeNextUnitOffset() {
   // The root DIE might be null, meaning that the Unit had nothing to
   // contribute to the linked output. In that case, we will emit the
   // unit header without any actual DIE.
-  NextUnitOffset += NewUnit.getUnitDie().getSize();
+  if (NewUnit)
+    NextUnitOffset += NewUnit->getUnitDie().getSize();
   return NextUnitOffset;
 }
 
@@ -1079,7 +1084,7 @@ void DwarfStreamer::emitCIE(StringRef CIEBytes) {
 
 /// \brief Emit a FDE into the debug_frame section. \p FDEBytes
 /// contains the FDE data without the length, CIE offset and address
-/// which will be replaced with the paramter values.
+/// which will be replaced with the parameter values.
 void DwarfStreamer::emitFDE(uint32_t CIEOffset, uint32_t AddrSize,
                             uint32_t Address, StringRef FDEBytes) {
   MS->SwitchSection(MC->getObjectFileInfo()->getDwarfFrameSection());
@@ -3066,7 +3071,7 @@ void DwarfLinker::patchLineTableForUnit(CompileUnit &Unit,
   if (LineTable.Prologue.Version != 2 ||
       LineTable.Prologue.DefaultIsStmt != DWARF2_LINE_DEFAULT_IS_STMT ||
       LineTable.Prologue.OpcodeBase > 13)
-    reportWarning("line table paramters mismatch. Cannot emit.");
+    reportWarning("line table parameters mismatch. Cannot emit.");
   else {
     MCDwarfLineTableParams Params;
     Params.DWARF2LineOpcodeBase = LineTable.Prologue.OpcodeBase;
@@ -3357,12 +3362,13 @@ void DwarfLinker::DIECloner::cloneAllCompileUnits(
   for (auto &CurrentUnit : CompileUnits) {
     auto InputDIE = CurrentUnit->getOrigUnit().getUnitDIE();
     CurrentUnit->setStartOffset(Linker.OutputDebugInfoSize);
-    // Clonse the InputDIE into your Unit DIE in our compile unit since it
-    // already has a DIE inside of it.
-    if (!cloneDIE(InputDIE, *CurrentUnit, 0 /* PC offset */,
-                  11 /* Unit Header size */, 0,
-                  CurrentUnit->getOutputUnitDIE()))
-      continue;
+    if (CurrentUnit->getInfo(0).Keep) {
+      // Clone the InputDIE into your Unit DIE in our compile unit since it
+      // already has a DIE inside of it.
+      CurrentUnit->createOutputDIE();
+      cloneDIE(InputDIE, *CurrentUnit, 0 /* PC offset */,
+               11 /* Unit Header size */, 0, CurrentUnit->getOutputUnitDIE());
+    }
     Linker.OutputDebugInfoSize = CurrentUnit->computeNextUnitOffset();
     if (Linker.Options.NoOutput)
       continue;
diff --git a/tools/dsymutil/MachOUtils.cpp b/tools/dsymutil/MachOUtils.cpp
index 8a730a1d0c8a2e36989071661a889690f2e981ba..ea6f113e4fae61fd9de97ead27aa59215120af94 100644
--- a/tools/dsymutil/MachOUtils.cpp
+++ b/tools/dsymutil/MachOUtils.cpp
@@ -220,7 +220,7 @@ getSection(const object::MachOObjectFile &Obj,
 // The function also tries to find a hole in the address map to fit the __DWARF
 // segment of \a DwarfSegmentSize size. \a EndAddress is updated to point at the
 // highest segment address.
-// When the __LINKEDIT segment is transfered, its offset and size are set resp.
+// When the __LINKEDIT segment is transferred, its offset and size are set resp.
 // to \a LinkeditOffset and \a LinkeditSize.
 template <typename SegmentTy>
 static void transferSegmentAndSections(
@@ -236,6 +236,8 @@ static void transferSegmentAndSections(
   if (StringRef("__LINKEDIT") == Segment.segname) {
     Segment.fileoff = LinkeditOffset;
     Segment.filesize = LinkeditSize;
+    // Resize vmsize by rounding to the page size.
+    Segment.vmsize = alignTo(LinkeditSize, 0x1000);
   }
 
   // Check if the end address of the last segment and our current
diff --git a/tools/gold/gold-plugin.cpp b/tools/gold/gold-plugin.cpp
index f8e708076131ffe69fd2afb0ffd2c2a012f363b7..9b783d19a2834917a83fab2881da98f1a10e5240 100644
--- a/tools/gold/gold-plugin.cpp
+++ b/tools/gold/gold-plugin.cpp
@@ -12,6 +12,7 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "llvm/ADT/Statistic.h"
 #include "llvm/Bitcode/BitcodeReader.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/CodeGen/CommandFlags.h"
@@ -20,7 +21,9 @@
 #include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/LTO/Caching.h"
 #include "llvm/LTO/LTO.h"
+#include "llvm/Object/Error.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/ManagedStatic.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/Path.h"
@@ -105,7 +108,6 @@ static std::list<claimed_file> Modules;
 static DenseMap<int, void *> FDToLeaderHandle;
 static StringMap<ResolutionInfo> ResInfo;
 static std::vector<std::string> Cleanup;
-static llvm::TargetOptions TargetOpts;
 
 namespace options {
   enum OutputType {
@@ -164,6 +166,12 @@ namespace options {
   // corresponding bitcode file, will use a path formed by replacing the
   // bitcode file's path prefix matching oldprefix with newprefix.
   static std::string thinlto_prefix_replace;
+  // Option to control the name of modules encoded in the individual index
+  // files for a distributed backend. This enables the use of minimized
+  // bitcode files for the thin link, assuming the name of the full bitcode
+  // file used in the backend differs just in some part of the file suffix.
+  // If specified, expects a string of the form "oldsuffix;newsuffix".
+  static std::string thinlto_object_suffix_replace;
   // Optional path to a directory for caching ThinLTO objects.
   static std::string cache_dir;
   // Additional options to pass into the code generator.
@@ -206,6 +214,12 @@ namespace options {
       thinlto_prefix_replace = opt.substr(strlen("thinlto-prefix-replace="));
       if (thinlto_prefix_replace.find(';') == std::string::npos)
         message(LDPL_FATAL, "thinlto-prefix-replace expects 'old;new' format");
+    } else if (opt.startswith("thinlto-object-suffix-replace=")) {
+      thinlto_object_suffix_replace =
+          opt.substr(strlen("thinlto-object-suffix-replace="));
+      if (thinlto_object_suffix_replace.find(';') == std::string::npos)
+        message(LDPL_FATAL,
+                "thinlto-object-suffix-replace expects 'old;new' format");
     } else if (opt.startswith("cache-dir=")) {
       cache_dir = opt.substr(strlen("cache-dir="));
     } else if (opt.size() == 2 && opt[0] == 'O') {
@@ -451,7 +465,7 @@ static ld_plugin_status claim_file_hook(const ld_plugin_input_file *file,
           EC == object::object_error::bitcode_section_not_found)
         *claimed = 0;
       else
-        message(LDPL_ERROR,
+        message(LDPL_FATAL,
                 "LLVM gold plugin has failed to create LTO module: %s",
                 EI.message().c_str());
     });
@@ -484,8 +498,6 @@ static ld_plugin_status claim_file_hook(const ld_plugin_input_file *file,
                sys::path::filename(Obj->getSourceFileName()).str();
 
   for (auto &Sym : Obj->symbols()) {
-    uint32_t Symflags = Sym.getFlags();
-
     cf.syms.push_back(ld_plugin_symbol());
     ld_plugin_symbol &sym = cf.syms.back();
     sym.version = nullptr;
@@ -511,20 +523,20 @@ static ld_plugin_status claim_file_hook(const ld_plugin_input_file *file,
       break;
     }
 
-    if (Symflags & object::BasicSymbolRef::SF_Undefined) {
+    if (Sym.isUndefined()) {
       sym.def = LDPK_UNDEF;
-      if (Symflags & object::BasicSymbolRef::SF_Weak)
+      if (Sym.isWeak())
         sym.def = LDPK_WEAKUNDEF;
-    } else if (Symflags & object::BasicSymbolRef::SF_Common)
+    } else if (Sym.isCommon())
       sym.def = LDPK_COMMON;
-    else if (Symflags & object::BasicSymbolRef::SF_Weak)
+    else if (Sym.isWeak())
       sym.def = LDPK_WEAKDEF;
     else
       sym.def = LDPK_DEF;
 
     sym.size = 0;
     sym.comdat_key = nullptr;
-    int CI = check(Sym.getComdatIndex());
+    int CI = Sym.getComdatIndex();
     if (CI != -1) {
       StringRef C = Obj->getComdatTable()[CI];
       sym.comdat_key = strdup(C.str().c_str());
@@ -566,8 +578,35 @@ static const void *getSymbolsAndView(claimed_file &F) {
   return View;
 }
 
-static void addModule(LTO &Lto, claimed_file &F, const void *View) {
-  MemoryBufferRef BufferRef(StringRef((const char *)View, F.filesize), F.name);
+/// Split the thinlto-object-suffix-replace option at its ';' separator into
+/// \p OldSuffix and \p NewSuffix. Both end up empty if the option is unset.
+static void getThinLTOOldAndNewSuffix(std::string &OldSuffix,
+                                      std::string &NewSuffix) {
+  StringRef SuffixReplace = options::thinlto_object_suffix_replace;
+  // Option parsing has already rejected non-empty values that lack a ';'.
+  assert(SuffixReplace.empty() || SuffixReplace.find(";") != StringRef::npos);
+  std::pair<StringRef, StringRef> Split = SuffixReplace.split(";");
+  OldSuffix = Split.first.str();
+  NewSuffix = Split.second.str();
+}
+
+/// Return \p Path with a trailing \p OldSuffix (if present) dropped and
+/// \p NewSuffix appended; \p Path is returned as-is if both are empty.
+static std::string getThinLTOObjectFileName(const std::string &Path,
+                                            const std::string &OldSuffix,
+                                            const std::string &NewSuffix) {
+  if (OldSuffix.empty() && NewSuffix.empty())
+    return Path;
+  StringRef NewPath = Path;
+  // Strip OldSuffix only when Path actually ends with it; otherwise no-op.
+  NewPath.consume_back(OldSuffix);
+  return NewPath.str() + NewSuffix;
+}
+
+static void addModule(LTO &Lto, claimed_file &F, const void *View,
+                      StringRef Filename) {
+  MemoryBufferRef BufferRef(StringRef((const char *)View, F.filesize),
+                            Filename);
   Expected<std::unique_ptr<InputFile>> ObjOrErr = InputFile::create(BufferRef);
 
   if (!ObjOrErr)
@@ -789,19 +828,31 @@ static ld_plugin_status allSymbolsReadHook() {
   if (options::thinlto_index_only)
     getThinLTOOldAndNewPrefix(OldPrefix, NewPrefix);
 
+  std::string OldSuffix, NewSuffix;
+  getThinLTOOldAndNewSuffix(OldSuffix, NewSuffix);
+  // Set for owning string objects used as buffer identifiers.
+  StringSet<> ObjectFilenames;
+
   for (claimed_file &F : Modules) {
     if (options::thinlto && !HandleToInputFile.count(F.leader_handle))
       HandleToInputFile.insert(std::make_pair(
           F.leader_handle, llvm::make_unique<PluginInputFile>(F.handle)));
     const void *View = getSymbolsAndView(F);
+    // In case we are thin linking with a minimized bitcode file, ensure
+    // the module paths encoded in the index reflect where the backends
+    // will locate the full bitcode files for compiling/importing.
+    std::string Identifier =
+        getThinLTOObjectFileName(F.name, OldSuffix, NewSuffix);
+    auto ObjFilename = ObjectFilenames.insert(Identifier);
+    assert(ObjFilename.second);
     if (!View) {
       if (options::thinlto_index_only)
         // Write empty output files that may be expected by the distributed
         // build system.
-        writeEmptyDistributedBuildOutputs(F.name, OldPrefix, NewPrefix);
+        writeEmptyDistributedBuildOutputs(Identifier, OldPrefix, NewPrefix);
       continue;
     }
-    addModule(*Lto, F, View);
+    addModule(*Lto, F, View, ObjFilename.first->first());
   }
 
   SmallString<128> Filename;
@@ -831,11 +882,15 @@ static ld_plugin_status allSymbolsReadHook() {
         llvm::make_unique<llvm::raw_fd_ostream>(FD, true));
   };
 
-  auto AddFile = [&](size_t Task, StringRef Path) { Filenames[Task] = Path; };
+  auto AddBuffer = [&](size_t Task, std::unique_ptr<MemoryBuffer> MB) {
+    // Note that this requires that the memory buffers provided to AddBuffer are
+    // backed by a file.
+    Filenames[Task] = MB->getBufferIdentifier();
+  };
 
   NativeObjectCache Cache;
   if (!options::cache_dir.empty())
-    Cache = localCache(options::cache_dir, AddFile);
+    Cache = check(localCache(options::cache_dir, AddBuffer));
 
   check(Lto->run(AddStream, Cache));
 
@@ -844,6 +899,8 @@ static ld_plugin_status allSymbolsReadHook() {
     return LDPS_OK;
 
   if (options::thinlto_index_only) {
+    if (llvm::AreStatisticsEnabled())
+      llvm::PrintStatistics();
     cleanup_hook();
     exit(0);
   }
diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp
index 597504c08fb49f59f8db5de687d19dae81627f73..43f97f112f6bc99d8628041f6f2f156d4484edaf 100644
--- a/tools/llc/llc.cpp
+++ b/tools/llc/llc.cpp
@@ -253,6 +253,19 @@ static void DiagnosticHandler(const DiagnosticInfo &DI, void *Context) {
   errs() << "\n";
 }
 
+// InlineAsmDiagHandler - Print an inline-asm diagnostic to stderr and, for
+// errors, set the bool that Context points to so the caller can see it.
+static void InlineAsmDiagHandler(const SMDiagnostic &SMD, void *Context,
+                                 unsigned LocCookie) {
+  if (SMD.getKind() == SourceMgr::DK_Error)
+    *static_cast<bool *>(Context) = true;
+  SMD.print(nullptr, errs());
+
+  // For testing purposes, we print the LocCookie here.
+  if (LocCookie != 0)
+    errs() << "note: !srcloc = " << LocCookie << "\n";
+}
+
 // main - Entry point for the llc compiler.
 //
 int main(int argc, char **argv) {
@@ -294,6 +307,8 @@ int main(int argc, char **argv) {
   // Set a diagnostic handler that doesn't exit on the first error
   bool HasError = false;
   Context.setDiagnosticHandler(DiagnosticHandler, &HasError);
+  Context.setInlineAsmDiagnosticHandler(InlineAsmDiagHandler, &HasError);
+
   if (PassRemarksWithHotness)
     Context.setDiagnosticHotnessRequested(true);
 
diff --git a/tools/lli/OrcLazyJIT.h b/tools/lli/OrcLazyJIT.h
index 05319c345484feaec63b85e85dc386806afa40a2..56e7d36d05fb47e9a33bef869f53fe7f87f85058 100644
--- a/tools/lli/OrcLazyJIT.h
+++ b/tools/lli/OrcLazyJIT.h
@@ -21,7 +21,7 @@
 #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "llvm/ExecutionEngine/Orc/IRTransformLayer.h"
-#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
 
 namespace llvm {
@@ -30,7 +30,7 @@ class OrcLazyJIT {
 public:
 
   typedef orc::JITCompileCallbackManager CompileCallbackMgr;
-  typedef orc::ObjectLinkingLayer<> ObjLayerT;
+  typedef orc::RTDyldObjectLinkingLayer<> ObjLayerT;
   typedef orc::IRCompileLayer<ObjLayerT> CompileLayerT;
   typedef std::function<std::unique_ptr<Module>(std::unique_ptr<Module>)>
     TransformFtor;
diff --git a/tools/lli/lli.cpp b/tools/lli/lli.cpp
index 0823ff469de6d97ae0de1874e3e2f16ab4a10cca..f228a36194573bc0971d70e609acabaa48ae1dec 100644
--- a/tools/lli/lli.cpp
+++ b/tools/lli/lli.cpp
@@ -606,8 +606,7 @@ int main(int argc, char **argv, char * const *envp) {
     // If the program doesn't explicitly call exit, we will need the Exit
     // function later on to make an explicit call, so get the function now.
     Constant *Exit = Mod->getOrInsertFunction("exit", Type::getVoidTy(Context),
-                                                      Type::getInt32Ty(Context),
-                                                      nullptr);
+                                                      Type::getInt32Ty(Context));
 
     // Run static constructors.
     if (!ForceInterpreter) {
diff --git a/tools/llvm-ar/llvm-ar.cpp b/tools/llvm-ar/llvm-ar.cpp
index b99a396da62ac7451280cb5c0d91d141a6569b0e..1519464521dd0702280784738e5173fe7feab455 100644
--- a/tools/llvm-ar/llvm-ar.cpp
+++ b/tools/llvm-ar/llvm-ar.cpp
@@ -52,7 +52,7 @@ static StringRef ToolName;
 
 // Show the error message and exit.
 LLVM_ATTRIBUTE_NORETURN static void fail(Twine Error) {
-  outs() << ToolName << ": " << Error << ".\n";
+  errs() << ToolName << ": " << Error << ".\n";
   exit(1);
 }
 
@@ -87,13 +87,14 @@ static cl::opt<bool> MRI("M", cl::desc(""));
 static cl::opt<std::string> Plugin("plugin", cl::desc("plugin (ignored for compatibility"));
 
 namespace {
-enum Format { Default, GNU, BSD };
+enum Format { Default, GNU, BSD, DARWIN };
 }
 
 static cl::opt<Format>
     FormatOpt("format", cl::desc("Archive format to create"),
               cl::values(clEnumValN(Default, "default", "default"),
                          clEnumValN(GNU, "gnu", "gnu"),
+                         clEnumValN(DARWIN, "darwin", "darwin"),
                          clEnumValN(BSD, "bsd", "bsd")));
 
 static std::string Options;
@@ -167,7 +168,7 @@ LLVM_ATTRIBUTE_NORETURN static void
 show_help(const std::string &msg) {
   errs() << ToolName << ": " << msg << "\n\n";
   cl::PrintHelpMessage();
-  std::exit(1);
+  exit(1);
 }
 
 // Extract the member filename from the command line for the [relpos] argument
@@ -376,7 +377,9 @@ static void doExtract(StringRef Name, const object::Archive::Child &C) {
   sys::fs::perms Mode = ModeOrErr.get();
 
   int FD;
-  failIfError(sys::fs::openFileForWrite(Name, FD, sys::fs::F_None, Mode), Name);
+  failIfError(sys::fs::openFileForWrite(sys::path::filename(Name), FD,
+                                        sys::fs::F_None, Mode),
+              Name);
 
   {
     raw_fd_ostream file(FD, false);
@@ -462,7 +465,7 @@ static void performReadOperation(ArchiveOperation Operation,
     return;
   for (StringRef Name : Members)
     errs() << Name << " was not found\n";
-  std::exit(1);
+  exit(1);
 }
 
 static void addMember(std::vector<NewArchiveMember> &Members,
@@ -623,8 +626,9 @@ computeNewArchiveMembers(ArchiveOperation Operation,
 }
 
 static object::Archive::Kind getDefaultForHost() {
-  return Triple(sys::getProcessTriple()).isOSDarwin() ? object::Archive::K_BSD
-                                                      : object::Archive::K_GNU;
+  return Triple(sys::getProcessTriple()).isOSDarwin()
+             ? object::Archive::K_DARWIN
+             : object::Archive::K_GNU;
 }
 
 static object::Archive::Kind getKindFromMember(const NewArchiveMember &Member) {
@@ -633,7 +637,7 @@ static object::Archive::Kind getKindFromMember(const NewArchiveMember &Member) {
 
   if (OptionalObject)
     return isa<object::MachOObjectFile>(**OptionalObject)
-               ? object::Archive::K_BSD
+               ? object::Archive::K_DARWIN
                : object::Archive::K_GNU;
 
   // squelch the error in case we had a non-object file
@@ -672,6 +676,11 @@ performWriteOperation(ArchiveOperation Operation,
       fail("Only the gnu format has a thin mode");
     Kind = object::Archive::K_BSD;
     break;
+  case DARWIN:
+    if (Thin)
+      fail("Only the gnu format has a thin mode");
+    Kind = object::Archive::K_DARWIN;
+    break;
   }
 
   std::pair<StringRef, std::error_code> Result =
diff --git a/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp b/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
index b84c4a83dee459fbe6a703f7365412c4843c2c45..abc6fa27a0e05d1899d5f138d9c11c7764f538ca 100644
--- a/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
+++ b/tools/llvm-bcanalyzer/llvm-bcanalyzer.cpp
@@ -171,7 +171,6 @@ static const char *GetCodeName(unsigned CodeID, unsigned BlockID,
       STRINGIFY_CODE(MODULE_CODE, GLOBALVAR)
       STRINGIFY_CODE(MODULE_CODE, FUNCTION)
       STRINGIFY_CODE(MODULE_CODE, ALIAS)
-      STRINGIFY_CODE(MODULE_CODE, PURGEVALS)
       STRINGIFY_CODE(MODULE_CODE, GCNAME)
       STRINGIFY_CODE(MODULE_CODE, VSTOFFSET)
       STRINGIFY_CODE(MODULE_CODE, METADATA_VALUES_UNUSED)
@@ -312,6 +311,10 @@ static const char *GetCodeName(unsigned CodeID, unsigned BlockID,
       STRINGIFY_CODE(FS, COMBINED_ORIGINAL_NAME)
       STRINGIFY_CODE(FS, VERSION)
       STRINGIFY_CODE(FS, TYPE_TESTS)
+      STRINGIFY_CODE(FS, TYPE_TEST_ASSUME_VCALLS)
+      STRINGIFY_CODE(FS, TYPE_CHECKED_LOAD_VCALLS)
+      STRINGIFY_CODE(FS, TYPE_TEST_ASSUME_CONST_VCALL)
+      STRINGIFY_CODE(FS, TYPE_CHECKED_LOAD_CONST_VCALL)
     }
   case bitc::METADATA_ATTACHMENT_ID:
     switch(CodeID) {
diff --git a/tools/llvm-c-test/echo.cpp b/tools/llvm-c-test/echo.cpp
index 72ff138c74e325d16ce648909f519ea3a9ca19b0..52ce85c577821f1c2c5b0900f4468b7939711842 100644
--- a/tools/llvm-c-test/echo.cpp
+++ b/tools/llvm-c-test/echo.cpp
@@ -591,7 +591,7 @@ struct FunCloner {
         break;
       }
       case LLVMPHI: {
-        // We need to agressively set things here because of loops.
+        // We need to aggressively set things here because of loops.
         VMap[Src] = Dst = LLVMBuildPhi(Builder, CloneType(Src), Name);
 
         SmallVector<LLVMValueRef, 8> Values;
diff --git a/tools/llvm-cat/llvm-cat.cpp b/tools/llvm-cat/llvm-cat.cpp
index d884970309b44d9a4386aad9a0b573d8e8fc91cb..4d62099094bb8f847f414a180e622ce6d9bcff40 100644
--- a/tools/llvm-cat/llvm-cat.cpp
+++ b/tools/llvm-cat/llvm-cat.cpp
@@ -40,7 +40,7 @@ int main(int argc, char **argv) {
   SmallVector<char, 0> Buffer;
   BitcodeWriter Writer(Buffer);
   if (BinaryCat) {
-    for (std::string InputFilename : InputFilenames) {
+    for (const auto &InputFilename : InputFilenames) {
       std::unique_ptr<MemoryBuffer> MB = ExitOnErr(
           errorOrToExpected(MemoryBuffer::getFileOrSTDIN(InputFilename)));
       std::vector<BitcodeModule> Mods = ExitOnErr(getBitcodeModuleList(*MB));
@@ -49,7 +49,7 @@ int main(int argc, char **argv) {
                       BitcodeMod.getBuffer().end());
     }
   } else {
-    for (std::string InputFilename : InputFilenames) {
+    for (const auto &InputFilename : InputFilenames) {
       SMDiagnostic Err;
       std::unique_ptr<Module> M = parseIRFile(InputFilename, Err, Context);
       if (!M) {
diff --git a/tools/llvm-cov/CodeCoverage.cpp b/tools/llvm-cov/CodeCoverage.cpp
index 0a9807ab00334c467b4cee5c0ae726849d632072..6179c760d5b20b40496e8b0dffdb5cea92a6c754 100644
--- a/tools/llvm-cov/CodeCoverage.cpp
+++ b/tools/llvm-cov/CodeCoverage.cpp
@@ -15,6 +15,7 @@
 
 #include "CoverageFilters.h"
 #include "CoverageReport.h"
+#include "CoverageSummaryInfo.h"
 #include "CoverageViewOptions.h"
 #include "RenderingSupport.h"
 #include "SourceCoverageView.h"
@@ -98,9 +99,6 @@ private:
   /// \brief If a demangler is available, demangle all symbol names.
   void demangleSymbols(const CoverageMapping &Coverage);
 
-  /// \brief Demangle \p Sym if possible. Otherwise, just return \p Sym.
-  StringRef getSymbolForHumans(StringRef Sym) const;
-
   /// \brief Write out a source file view to the filesystem.
   void writeSourceFileView(StringRef SourceFile, CoverageMapping *Coverage,
                            CoveragePrinter *Printer, bool ShowFilenames);
@@ -136,10 +134,10 @@ private:
   /// The architecture the coverage mapping data targets.
   std::string CoverageArch;
 
-  /// A cache for demangled symbol names.
-  StringMap<std::string> DemangledNames;
+  /// A cache for demangled symbols.
+  DemangleCache DC;
 
-  /// Errors and warnings which have not been printed.
+  /// A lock which guards printing to stderr.
   std::mutex ErrsLock;
 
   /// A container for input source file buffers.
@@ -267,7 +265,7 @@ CodeCoverageTool::createFunctionView(const FunctionRecord &Function,
     return nullptr;
 
   auto Expansions = FunctionCoverage.getExpansions();
-  auto View = SourceCoverageView::create(getSymbolForHumans(Function.Name),
+  auto View = SourceCoverageView::create(DC.demangle(Function.Name),
                                          SourceBuffer.get(), ViewOpts,
                                          std::move(FunctionCoverage));
   attachExpansionSubViews(*View, Expansions, Coverage);
@@ -293,7 +291,7 @@ CodeCoverageTool::createSourceFileView(StringRef SourceFile,
   for (const auto *Function : Coverage.getInstantiations(SourceFile)) {
     std::unique_ptr<SourceCoverageView> SubView{nullptr};
 
-    StringRef Funcname = getSymbolForHumans(Function->Name);
+    StringRef Funcname = DC.demangle(Function->Name);
 
     if (Function->ExecutionCount > 0) {
       auto SubViewCoverage = Coverage.getCoverageForFunction(*Function);
@@ -453,14 +451,9 @@ void CodeCoverageTool::demangleSymbols(const CoverageMapping &Coverage) {
   // Cache the demangled names.
   unsigned I = 0;
   for (const auto &Function : Coverage.getCoveredFunctions())
-    DemangledNames[Function.Name] = Symbols[I++];
-}
-
-StringRef CodeCoverageTool::getSymbolForHumans(StringRef Sym) const {
-  const auto DemangledName = DemangledNames.find(Sym);
-  if (DemangledName == DemangledNames.end())
-    return Sym;
-  return DemangledName->getValue();
+    // On Windows, lines in the demangler's output file end with "\r\n".
+    // Splitting by '\n' keeps '\r's, so cut them now.
+    DC.DemangledNames[Function.Name] = Symbols[I++].rtrim();
 }
 
 void CodeCoverageTool::writeSourceFileView(StringRef SourceFile,
@@ -817,22 +810,28 @@ int CodeCoverageTool::show(int argc, const char **argv,
 
 int CodeCoverageTool::report(int argc, const char **argv,
                              CommandLineParserType commandLineParser) {
+  cl::opt<bool> ShowFunctionSummaries(
+      "show-functions", cl::Optional, cl::init(false),
+      cl::desc("Show coverage summaries for each function"));
+
   auto Err = commandLineParser(argc, argv);
   if (Err)
     return Err;
 
-  if (ViewOpts.Format == CoverageViewOptions::OutputFormat::HTML)
+  if (ViewOpts.Format == CoverageViewOptions::OutputFormat::HTML) {
     error("HTML output for summary reports is not yet supported.");
+    return 1;
+  }
 
   auto Coverage = load();
   if (!Coverage)
     return 1;
 
   CoverageReport Report(ViewOpts, *Coverage.get());
-  if (SourceFiles.empty())
+  if (!ShowFunctionSummaries)
     Report.renderFileReports(llvm::outs());
   else
-    Report.renderFunctionReports(SourceFiles, llvm::outs());
+    Report.renderFunctionReports(SourceFiles, DC, llvm::outs());
   return 0;
 }
 
@@ -843,6 +842,11 @@ int CodeCoverageTool::export_(int argc, const char **argv,
   if (Err)
     return Err;
 
+  if (ViewOpts.Format != CoverageViewOptions::OutputFormat::Text) {
+    error("Coverage data can only be exported as textual JSON.");
+    return 1;
+  }
+
   auto Coverage = load();
   if (!Coverage) {
     error("Could not load coverage information");
diff --git a/tools/llvm-cov/CoverageReport.cpp b/tools/llvm-cov/CoverageReport.cpp
index e88cb186acd667b72f75ec2f7cca2a6a8748082f..c68bb9048df1b18bf39fbeeb1f4cd8ee93022d3f 100644
--- a/tools/llvm-cov/CoverageReport.cpp
+++ b/tools/llvm-cov/CoverageReport.cpp
@@ -118,19 +118,51 @@ raw_ostream::Colors determineCoveragePercentageColor(const T &Info) {
                                           : raw_ostream::RED;
 }
 
-/// \brief Determine the length of the longest common prefix of the strings in
-/// \p Strings.
-unsigned getLongestCommonPrefixLen(ArrayRef<std::string> Strings) {
-  unsigned LCP = Strings[0].size();
-  for (unsigned I = 1, E = Strings.size(); LCP > 0 && I < E; ++I) {
-    unsigned Cursor;
-    StringRef S = Strings[I];
-    for (Cursor = 0; Cursor < LCP && Cursor < S.size(); ++Cursor)
-      if (Strings[0][Cursor] != S[Cursor])
+/// \brief Get the number of redundant path components in each path in \p Paths.
+unsigned getNumRedundantPathComponents(ArrayRef<std::string> Paths) {
+  // To start, set the number of redundant path components to the maximum
+  // possible value.
+  SmallVector<StringRef, 8> FirstPathComponents{sys::path::begin(Paths[0]),
+                                                sys::path::end(Paths[0])};
+  unsigned NumRedundant = FirstPathComponents.size();
+
+  for (unsigned I = 1, E = Paths.size(); NumRedundant > 0 && I < E; ++I) {
+    StringRef Path = Paths[I];
+    for (const auto &Component :
+         enumerate(make_range(sys::path::begin(Path), sys::path::end(Path)))) {
+      // Do not increase the number of redundant components: that would remove
+      // useful parts of already-visited paths.
+      if (Component.index() >= NumRedundant)
         break;
-    LCP = std::min(LCP, Cursor);
+
+      // Lower the number of redundant components when there's a mismatch
+      // between the first path, and the path under consideration.
+      if (FirstPathComponents[Component.index()] != Component.value()) {
+        NumRedundant = Component.index();
+        break;
+      }
+    }
+  }
+
+  return NumRedundant;
+}
+
+/// \brief Determine the length of the longest redundant prefix of the paths in
+/// \p Paths.
+unsigned getRedundantPrefixLen(ArrayRef<std::string> Paths) {
+  // If there's at most one path, no path components are redundant.
+  if (Paths.size() <= 1)
+    return 0;
+
+  unsigned PrefixLen = 0;
+  unsigned NumRedundant = getNumRedundantPathComponents(Paths);
+  auto Component = sys::path::begin(Paths[0]);
+  for (unsigned I = 0; I < NumRedundant; ++I) {
+    auto LastComponent = Component;
+    ++Component;
+    PrefixLen += Component - LastComponent;
   }
-  return LCP;
+  return PrefixLen;
 }
 
 } // end anonymous namespace
@@ -200,12 +232,14 @@ void CoverageReport::render(const FileCoverageSummary &File,
 }
 
 void CoverageReport::render(const FunctionCoverageSummary &Function,
+                            const DemangleCache &DC,
                             raw_ostream &OS) const {
   auto FuncCoverageColor =
       determineCoveragePercentageColor(Function.RegionCoverage);
   auto LineCoverageColor =
       determineCoveragePercentageColor(Function.LineCoverage);
-  OS << column(Function.Name, FunctionReportColumns[0], Column::RightTrim)
+  OS << column(DC.demangle(Function.Name), FunctionReportColumns[0],
+               Column::RightTrim)
      << format("%*u", FunctionReportColumns[1],
                (unsigned)Function.RegionCoverage.NumRegions);
   Options.colored_ostream(OS, FuncCoverageColor)
@@ -230,6 +264,7 @@ void CoverageReport::render(const FunctionCoverageSummary &Function,
 }
 
 void CoverageReport::renderFunctionReports(ArrayRef<std::string> Files,
+                                           const DemangleCache &DC,
                                            raw_ostream &OS) {
   bool isFirst = true;
   for (StringRef Filename : Files) {
@@ -242,7 +277,7 @@ void CoverageReport::renderFunctionReports(ArrayRef<std::string> Files,
 
     std::vector<StringRef> Funcnames;
     for (const auto &F : Functions)
-      Funcnames.emplace_back(F.Name);
+      Funcnames.emplace_back(DC.demangle(F.Name));
     adjustColumnWidths({}, Funcnames);
 
     OS << "File '" << Filename << "':\n";
@@ -262,12 +297,12 @@ void CoverageReport::renderFunctionReports(ArrayRef<std::string> Files,
       ++Totals.ExecutionCount;
       Totals.RegionCoverage += Function.RegionCoverage;
       Totals.LineCoverage += Function.LineCoverage;
-      render(Function, OS);
+      render(Function, DC, OS);
     }
     if (Totals.ExecutionCount) {
       renderDivider(FunctionReportColumns, OS);
       OS << "\n";
-      render(Totals, OS);
+      render(Totals, DC, OS);
     }
   }
 }
@@ -277,9 +312,7 @@ CoverageReport::prepareFileReports(const coverage::CoverageMapping &Coverage,
                                    FileCoverageSummary &Totals,
                                    ArrayRef<std::string> Files) {
   std::vector<FileCoverageSummary> FileReports;
-  unsigned LCP = 0;
-  if (Files.size() > 1)
-    LCP = getLongestCommonPrefixLen(Files);
+  unsigned LCP = getRedundantPrefixLen(Files);
 
   for (StringRef Filename : Files) {
     FileCoverageSummary Summary(Filename.drop_front(LCP));
diff --git a/tools/llvm-cov/CoverageReport.h b/tools/llvm-cov/CoverageReport.h
index 7a416497e258eda35296619299ebf62a54815de4..071be2e21594c6f65f783df571e91596c31944f7 100644
--- a/tools/llvm-cov/CoverageReport.h
+++ b/tools/llvm-cov/CoverageReport.h
@@ -25,14 +25,16 @@ class CoverageReport {
   const coverage::CoverageMapping &Coverage;
 
   void render(const FileCoverageSummary &File, raw_ostream &OS) const;
-  void render(const FunctionCoverageSummary &Function, raw_ostream &OS) const;
+  void render(const FunctionCoverageSummary &Function, const DemangleCache &DC,
+              raw_ostream &OS) const;
 
 public:
   CoverageReport(const CoverageViewOptions &Options,
                  const coverage::CoverageMapping &Coverage)
       : Options(Options), Coverage(Coverage) {}
 
-  void renderFunctionReports(ArrayRef<std::string> Files, raw_ostream &OS);
+  void renderFunctionReports(ArrayRef<std::string> Files,
+                             const DemangleCache &DC, raw_ostream &OS);
 
   /// Prepare file reports for the files specified in \p Files.
   static std::vector<FileCoverageSummary>
diff --git a/tools/llvm-cov/CoverageSummaryInfo.h b/tools/llvm-cov/CoverageSummaryInfo.h
index c04a4d42ccd7417df48a59bb2d10297bdeeecafc..680fc3757686f6c0c5f16d184c3029a413ce57b4 100644
--- a/tools/llvm-cov/CoverageSummaryInfo.h
+++ b/tools/llvm-cov/CoverageSummaryInfo.h
@@ -160,6 +160,19 @@ struct FileCoverageSummary {
   }
 };
 
+/// \brief Maps mangled symbol names to their demangled spellings.
+struct DemangleCache {
+  StringMap<std::string> DemangledNames;
+
+  /// \brief Look up \p Sym in the cache and return its demangled form; a
+  /// symbol with no entry is handed back unchanged.
+  StringRef demangle(StringRef Sym) const {
+    const auto Entry = DemangledNames.find(Sym);
+    const bool Cached = Entry != DemangledNames.end();
+    return Cached ? StringRef(Entry->getValue()) : Sym;
+  }
+};
+
 } // namespace llvm
 
 #endif // LLVM_COV_COVERAGESUMMARYINFO_H
diff --git a/tools/llvm-cxxfilt/llvm-cxxfilt.cpp b/tools/llvm-cxxfilt/llvm-cxxfilt.cpp
index 8be2a6d3b456e5a59658ead325d536d25ed9a5d1..13024fbeaeaa09dae3cfd2c163ff73404f89f60b 100644
--- a/tools/llvm-cxxfilt/llvm-cxxfilt.cpp
+++ b/tools/llvm-cxxfilt/llvm-cxxfilt.cpp
@@ -68,6 +68,12 @@ static void demangle(llvm::raw_ostream &OS, const std::string &Mangled) {
                 (DecoratedLength >= 4 && strncmp(Decorated, "___Z", 4) == 0)))
     Undecorated = itaniumDemangle(Decorated, nullptr, nullptr, &Status);
 
+  if (!Undecorated &&
+      (DecoratedLength > 6 && strncmp(Decorated, "__imp_", 6) == 0)) {
+    OS << "import thunk for ";
+    Undecorated = itaniumDemangle(Decorated + 6, nullptr, nullptr, &Status);
+  }
+
   OS << (Undecorated ? Undecorated : Mangled) << '\n';
 
   free(Undecorated);
diff --git a/tools/llvm-diff/DifferenceEngine.cpp b/tools/llvm-diff/DifferenceEngine.cpp
index df208a26ab7dfdc8a3854b275b7250b6aafd03d2..95a63d7f9c835c540088b5cde5d29f657f46bf2c 100644
--- a/tools/llvm-diff/DifferenceEngine.cpp
+++ b/tools/llvm-diff/DifferenceEngine.cpp
@@ -315,17 +315,15 @@ class FunctionDifferenceEngine {
       bool Difference = false;
 
       DenseMap<ConstantInt*,BasicBlock*> LCases;
-      
-      for (SwitchInst::CaseIt I = LI->case_begin(), E = LI->case_end();
-           I != E; ++I)
-        LCases[I.getCaseValue()] = I.getCaseSuccessor();
-        
-      for (SwitchInst::CaseIt I = RI->case_begin(), E = RI->case_end();
-           I != E; ++I) {
-        ConstantInt *CaseValue = I.getCaseValue();
+      for (auto Case : LI->cases())
+        LCases[Case.getCaseValue()] = Case.getCaseSuccessor();
+
+      for (auto Case : RI->cases()) {
+        ConstantInt *CaseValue = Case.getCaseValue();
         BasicBlock *LCase = LCases[CaseValue];
         if (LCase) {
-          if (TryUnify) tryUnify(LCase, I.getCaseSuccessor());
+          if (TryUnify)
+            tryUnify(LCase, Case.getCaseSuccessor());
           LCases.erase(CaseValue);
         } else if (Complain || !Difference) {
           if (Complain)
diff --git a/tools/llvm-extract/llvm-extract.cpp b/tools/llvm-extract/llvm-extract.cpp
index aa1eda2f094a63af8b7a8fd6dd3010be7b1f9710..d868db7f78ad16c1aefa9a74baef0a58ef524819 100644
--- a/tools/llvm-extract/llvm-extract.cpp
+++ b/tools/llvm-extract/llvm-extract.cpp
@@ -17,10 +17,11 @@
 #include "llvm/Bitcode/BitcodeWriterPass.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/IR/Module.h"
 #include "llvm/IRReader/IRReader.h"
-#include "llvm/IR/LegacyPassManager.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
@@ -50,6 +51,10 @@ Force("f", cl::desc("Enable binary output on terminals"));
 static cl::opt<bool>
 DeleteFn("delete", cl::desc("Delete specified Globals from Module"));
 
+static cl::opt<bool>
+    Recursive("recursive",
+              cl::desc("Recursively extract all called functions"));
+
 // ExtractFuncs - The functions to extract from the module.
 static cl::list<std::string>
 ExtractFuncs("func", cl::desc("Specify function to extract"),
@@ -226,6 +231,34 @@ int main(int argc, char **argv) {
   // Use *argv instead of argv[0] to work around a wrong GCC warning.
   ExitOnError ExitOnErr(std::string(*argv) + ": error reading input: ");
 
+  if (Recursive) {
+    std::vector<llvm::Function *> Workqueue;
+    for (GlobalValue *GV : GVs) {
+      if (auto *F = dyn_cast<Function>(GV)) {
+        Workqueue.push_back(F);
+      }
+    }
+    while (!Workqueue.empty()) {
+      Function *F = &*Workqueue.back();
+      Workqueue.pop_back();
+      ExitOnErr(F->materialize());
+      for (auto &BB : *F) {
+        for (auto &I : BB) {
+          auto *CI = dyn_cast<CallInst>(&I);
+          if (!CI)
+            continue;
+          Function *CF = CI->getCalledFunction();
+          if (!CF)
+            continue;
+          if (CF->isDeclaration() || GVs.count(CF))
+            continue;
+          GVs.insert(CF);
+          Workqueue.push_back(CF);
+        }
+      }
+    }
+  }
+
   auto Materialize = [&](GlobalValue &GV) { ExitOnErr(GV.materialize()); };
 
   // Materialize requisite global values.
diff --git a/tools/llvm-link/llvm-link.cpp b/tools/llvm-link/llvm-link.cpp
index e89696e7e7c245176ddc2d4c306e9013997190e7..a024b6926d5dd805d7638bb8809e00747649c2b7 100644
--- a/tools/llvm-link/llvm-link.cpp
+++ b/tools/llvm-link/llvm-link.cpp
@@ -34,6 +34,7 @@
 #include "llvm/Support/SystemUtils.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include "llvm/Transforms/IPO/FunctionImport.h"
+#include "llvm/Transforms/IPO/Internalize.h"
 #include "llvm/Transforms/Utils/FunctionImportUtils.h"
 
 #include <memory>
@@ -272,6 +273,8 @@ static bool linkFiles(const char *argv0, LLVMContext &Context, Linker &L,
                       unsigned Flags) {
   // Filter out flags that don't apply to the first file we load.
   unsigned ApplicableFlags = Flags & Linker::Flags::OverrideFromSrc;
+  // Similar to some flags, internalization doesn't apply to the first file.
+  bool InternalizeLinkedSymbols = false;
   for (const auto &File : Files) {
     std::unique_ptr<Module> M = loadFile(argv0, File, Context);
     if (!M.get()) {
@@ -311,8 +314,24 @@ static bool linkFiles(const char *argv0, LLVMContext &Context, Linker &L,
     if (Verbose)
       errs() << "Linking in '" << File << "'\n";
 
-    if (L.linkInModule(std::move(M), ApplicableFlags))
+    bool Err = false;
+    if (InternalizeLinkedSymbols) {
+      Err = L.linkInModule(
+          std::move(M), ApplicableFlags, [](Module &M, const StringSet<> &GVS) {
+            internalizeModule(M, [&GVS](const GlobalValue &GV) {
+              return !GV.hasName() || (GVS.count(GV.getName()) == 0);
+            });
+          });
+    } else {
+      Err = L.linkInModule(std::move(M), ApplicableFlags);
+    }
+
+    if (Err)
       return false;
+
+    // Internalization applies to linking of subsequent files.
+    InternalizeLinkedSymbols = Internalize;
+
     // All linker flags apply to linking of subsequent files.
     ApplicableFlags = Flags;
   }
@@ -340,8 +359,6 @@ int main(int argc, char **argv) {
   Linker L(*Composite);
 
   unsigned Flags = Linker::Flags::None;
-  if (Internalize)
-    Flags |= Linker::Flags::InternalizeLinkedSymbols;
   if (OnlyNeeded)
     Flags |= Linker::Flags::LinkOnlyNeeded;
 
diff --git a/tools/llvm-lto/llvm-lto.cpp b/tools/llvm-lto/llvm-lto.cpp
index 475350c8ecf125d19619ad1de2a13e1f4a371220..2f005412a3b928e12664494ec0b2b947de6324b0 100644
--- a/tools/llvm-lto/llvm-lto.cpp
+++ b/tools/llvm-lto/llvm-lto.cpp
@@ -63,6 +63,10 @@ static cl::opt<bool> DisableLTOVectorization(
     "disable-lto-vectorization", cl::init(false),
     cl::desc("Do not run loop or slp vectorization during LTO"));
 
+static cl::opt<bool> EnableFreestanding(
+    "lto-freestanding", cl::init(false),
+    cl::desc("Enable Freestanding (disable builtins / TLI) during LTO"));
+
 static cl::opt<bool> UseDiagnosticHandler(
     "use-diagnostic-handler", cl::init(false),
     cl::desc("Use a diagnostic handler to test the handler interface"));
@@ -433,6 +437,7 @@ public:
     ThinGenerator.setCodePICModel(getRelocModel());
     ThinGenerator.setTargetOptions(Options);
     ThinGenerator.setCacheDir(ThinLTOCacheDir);
+    ThinGenerator.setFreestanding(EnableFreestanding);
 
     // Add all the exported symbols to the table of symbols to preserve.
     for (unsigned i = 0; i < ExportedSymbols.size(); ++i)
@@ -809,6 +814,7 @@ int main(int argc, char **argv) {
     CodeGen.setDiagnosticHandler(handleDiagnostics, nullptr);
 
   CodeGen.setCodePICModel(getRelocModel());
+  CodeGen.setFreestanding(EnableFreestanding);
 
   CodeGen.setDebugInfo(LTO_DEBUG_MODEL_DWARF);
   CodeGen.setTargetOptions(Options);
diff --git a/tools/llvm-lto2/llvm-lto2.cpp b/tools/llvm-lto2/llvm-lto2.cpp
index c09311a05b90dcbe5f105bc66a448033f05f385d..faa658d93a3ea518d2af3fc07900d5c61793e63a 100644
--- a/tools/llvm-lto2/llvm-lto2.cpp
+++ b/tools/llvm-lto2/llvm-lto2.cpp
@@ -21,12 +21,12 @@
 #include "llvm/IR/DiagnosticPrinter.h"
 #include "llvm/LTO/LTO.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/TargetSelect.h"
 #include "llvm/Support/Threading.h"
 
 using namespace llvm;
 using namespace lto;
-using namespace object;
 
 static cl::opt<char>
     OptLevel("O", cl::desc("Optimization level. [-O0, -O1, -O2, or -O3] "
@@ -90,11 +90,20 @@ static cl::opt<std::string> DefaultTriple(
     cl::desc(
         "Replace unspecified target triples in input files with this triple"));
 
+static cl::opt<std::string>
+    OptRemarksOutput("pass-remarks-output",
+                     cl::desc("YAML output file for optimization remarks"));
+
+static cl::opt<bool> OptRemarksWithHotness(
+    "pass-remarks-with-hotness",
+    cl::desc("Whether to include hotness informations in the remarks.\n"
+             "Has effect only if -pass-remarks-output is specified."));
+
 static void check(Error E, std::string Msg) {
   if (!E)
     return;
   handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) {
-    errs() << "llvm-lto: " << Msg << ": " << EIB.message().c_str() << '\n';
+    errs() << "llvm-lto2: " << Msg << ": " << EIB.message().c_str() << '\n';
   });
   exit(1);
 }
@@ -117,12 +126,12 @@ template <typename T> static T check(ErrorOr<T> E, std::string Msg) {
   return T();
 }
 
-int main(int argc, char **argv) {
-  InitializeAllTargets();
-  InitializeAllTargetMCs();
-  InitializeAllAsmPrinters();
-  InitializeAllAsmParsers();
+static int usage() {
+  errs() << "Available subcommands: run\n";
+  return 1;
+}
 
+static int run(int argc, char **argv) {
   cl::ParseCommandLineOptions(argc, argv, "Resolution-based LTO test harness");
 
   // FIXME: Workaround PR30396 which means that a symbol can appear
@@ -148,9 +157,11 @@ int main(int argc, char **argv) {
         Res.FinalDefinitionInLinkageUnit = true;
       else if (C == 'x')
         Res.VisibleToRegularObj = true;
-      else
+      else {
         llvm::errs() << "invalid character " << C << " in resolution: " << R
                      << '\n';
+        return 1;
+      }
     }
     CommandLineResolutions[{FileName, SymbolName}].push_back(Res);
   }
@@ -176,6 +187,10 @@ int main(int argc, char **argv) {
     check(Conf.addSaveTemps(OutputFilename + "."),
           "Config::addSaveTemps failed");
 
+  // Optimization remarks.
+  Conf.RemarksFilename = OptRemarksOutput;
+  Conf.RemarksWithHotness = OptRemarksWithHotness;
+
   // Run a custom pipeline, if asked for.
   Conf.OptPipeline = OptPipeline;
   Conf.AAPipeline = AAPipeline;
@@ -199,6 +214,9 @@ int main(int argc, char **argv) {
     return 1;
   }
 
+  if (FileType.getNumOccurrences())
+    Conf.CGFileType = FileType;
+
   Conf.OverrideTriple = OverrideTriple;
   Conf.DefaultTriple = DefaultTriple;
 
@@ -257,18 +275,34 @@ int main(int argc, char **argv) {
     return llvm::make_unique<lto::NativeObjectStream>(std::move(S));
   };
 
-  auto AddFile = [&](size_t Task, StringRef Path) {
-    auto ReloadedBufferOrErr = MemoryBuffer::getFile(Path);
-    if (auto EC = ReloadedBufferOrErr.getError())
-      report_fatal_error(Twine("Can't reload cached file '") + Path + "': " +
-                         EC.message() + "\n");
-
-    *AddStream(Task)->OS << (*ReloadedBufferOrErr)->getBuffer();
+  auto AddBuffer = [&](size_t Task, std::unique_ptr<MemoryBuffer> MB) {
+    *AddStream(Task)->OS << MB->getBuffer();
   };
 
   NativeObjectCache Cache;
   if (!CacheDir.empty())
-    Cache = localCache(CacheDir, AddFile);
+    Cache = check(localCache(CacheDir, AddBuffer), "failed to create cache");
 
   check(Lto.run(AddStream, Cache), "LTO::run failed");
+  return 0;
+}
+
+int main(int argc, char **argv) {
+  InitializeAllTargets();
+  InitializeAllTargetMCs();
+  InitializeAllAsmPrinters();
+  InitializeAllAsmParsers();
+
+  // FIXME: This should use llvm::cl subcommands, but it isn't currently
+  // possible to pass an argument not associated with a subcommand to a
+  // subcommand (e.g. -lto-use-new-pm).
+  if (argc < 2)
+    return usage();
+
+  StringRef Subcommand = argv[1];
+  // Ensure that argv[0] is correct after adjusting argv/argc.
+  argv[1] = argv[0];
+  if (Subcommand == "run")
+    return run(argc - 1, argv + 1);
+  return usage();
 }
diff --git a/tools/llvm-mc-assemble-fuzzer/CMakeLists.txt b/tools/llvm-mc-assemble-fuzzer/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c5fb62166cfd4fc99f1719cece43e32cb187fa8b
--- /dev/null
+++ b/tools/llvm-mc-assemble-fuzzer/CMakeLists.txt
@@ -0,0 +1,19 @@
+# MC assembler fuzzer; lib/Fuzzer is put on the include path for the sources.
+if( LLVM_USE_SANITIZE_COVERAGE )
+  include_directories(BEFORE
+    ${CMAKE_CURRENT_SOURCE_DIR}/../../lib/Fuzzer)
+
+  set(LLVM_LINK_COMPONENTS
+      AllTargetsAsmPrinters
+      AllTargetsAsmParsers
+      AllTargetsDescs
+      AllTargetsInfos
+      MC
+      MCParser
+      Support
+      )
+  add_llvm_tool(llvm-mc-assemble-fuzzer
+                llvm-mc-assemble-fuzzer.cpp)
+  target_link_libraries(llvm-mc-assemble-fuzzer
+                        LLVMFuzzer)
+endif()
diff --git a/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp b/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..0344d8cd8c9a286f7ea3fc3e7d812c2087111404
--- /dev/null
+++ b/tools/llvm-mc-assemble-fuzzer/llvm-mc-assemble-fuzzer.cpp
@@ -0,0 +1,313 @@
+//===--- llvm-mc-assemble-fuzzer.cpp - Fuzzer for the MC layer ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+//===----------------------------------------------------------------------===//
+
+#include "FuzzerInterface.h"
+#include "llvm-c/Target.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/AsmLexer.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionMachO.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCTargetOptionsCommandFlags.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/ToolOutputFile.h"
+
+using namespace llvm;
+
+static cl::opt<std::string>
+    TripleName("triple", cl::desc("Target triple to assemble for, "
+                                  "see -version for available targets"));
+
+static cl::opt<std::string>
+    MCPU("mcpu",
+         cl::desc("Target a specific cpu type (-mcpu=help for details)"),
+         cl::value_desc("cpu-name"), cl::init(""));
+
+// Useful for variable-length ISAs. NOTE(review): never read in this file.
+static cl::opt<unsigned> InsnLimit(
+    "insn-limit",
+    cl::desc("Limit the number of instructions to process (0 for no limit)"),
+    cl::value_desc("count"), cl::init(0));
+
+static cl::list<std::string>
+    MAttrs("mattr", cl::CommaSeparated,
+           cl::desc("Target specific attributes (-mattr=help for details)"),
+           cl::value_desc("a1,+a2,-a3,..."));
+// The feature string derived from -mattr's values.
+std::string FeaturesStr;
+
+static cl::list<std::string>
+    FuzzerArgs("fuzzer-args", cl::Positional,
+               cl::desc("Options to pass to the fuzzer"), cl::ZeroOrMore,
+               cl::PositionalEatsArgs);
+static std::vector<char *> ModifiedArgv;
+
+enum OutputFileType {
+  OFT_Null,
+  OFT_AssemblyFile,
+  OFT_ObjectFile
+};
+static cl::opt<OutputFileType>
+FileType("filetype", cl::init(OFT_AssemblyFile),
+  cl::desc("Choose an output file type:"),
+  cl::values(
+       clEnumValN(OFT_AssemblyFile, "asm",
+                  "Emit an assembly ('.s') file"),
+       clEnumValN(OFT_Null, "null",
+                  "Don't emit anything (for timing purposes)"),
+       clEnumValN(OFT_ObjectFile, "obj",
+                  "Emit a native object ('.o') file")));
+
+
+/// Non-owning MemoryBuffer over the bytes passed to LLVMFuzzerTestOneInput.
+class LLVMFuzzerInputBuffer : public MemoryBuffer
+{
+  public:
+    LLVMFuzzerInputBuffer(const uint8_t *data_, size_t size_)
+      : Data(reinterpret_cast<const char *>(data_)),
+        Size(size_) {
+        init(Data, Data+Size, false); // false: no NUL terminator required
+      }
+
+    BufferKind getBufferKind() const override {
+      return MemoryBuffer_Malloc; // it's not disk-backed so I think that's
+                                  // the intent ... though AFAIK it
+                                  // probably came from an mmap or sbrk
+    }
+
+  private:
+    const char *Data;
+    size_t Size;
+};
+
+/// Parse everything in \p SrcMgr with a generic MCAsmParser plus the target's
+/// own asm parser; aborts if \p TheTarget cannot create one.
+static int AssembleInput(const char *ProgName, const Target *TheTarget,
+                         SourceMgr &SrcMgr, MCContext &Ctx, MCStreamer &Str,
+                         MCAsmInfo &MAI, MCSubtargetInfo &STI,
+                         MCInstrInfo &MCII, MCTargetOptions &MCOptions) {
+  static const bool NoInitialTextSection = false;
+
+  std::unique_ptr<MCAsmParser> Parser(
+    createMCAsmParser(SrcMgr, Ctx, Str, MAI));
+  std::unique_ptr<MCTargetAsmParser> TAP(
+    TheTarget->createMCAsmParser(STI, *Parser, MCII, MCOptions));
+
+  if (!TAP) {
+    errs() << ProgName
+           << ": error: this target '" << TripleName
+           << "', does not support assembly parsing.\n";
+    abort();
+  }
+
+  Parser->setTargetParser(*TAP);
+  return Parser->Run(NoInitialTextSection);
+}
+
+
+/// Assemble the \p Size bytes at \p Data as a single source buffer for the
+/// target chosen by -triple/-mcpu/-mattr, emitting according to -filetype.
+/// Aborts if the target cannot be set up; otherwise always returns 0.
+int AssembleOneInput(const uint8_t *Data, size_t Size) {
+  const bool ShowInst = false;
+  const bool AsmVerbose = false;
+  const bool UseDwarfDirectory = true;
+
+  Triple TheTriple(Triple::normalize(TripleName));
+  SourceMgr SrcMgr;
+  std::unique_ptr<MemoryBuffer> BufferPtr(new LLVMFuzzerInputBuffer(Data, Size));
+
+  // Tell SrcMgr about this buffer, which is what the parser will pick up.
+  SrcMgr.AddNewSourceBuffer(std::move(BufferPtr), SMLoc());
+
+  static const std::vector<std::string> NoIncludeDirs;
+  SrcMgr.setIncludeDirs(NoIncludeDirs);
+
+  static std::string ArchName;
+  std::string Error;
+  const Target *TheTarget = TargetRegistry::lookupTarget(ArchName, TheTriple,
+      Error);
+  if (!TheTarget) {
+    errs() << "error: this target '" << TheTriple.normalize()
+      << "/" << ArchName << "', was not found: '" << Error << "'\n";
+    abort();
+  }
+
+  std::unique_ptr<MCRegisterInfo> MRI(TheTarget->createMCRegInfo(TripleName));
+  if (!MRI) {
+    errs() << "Unable to create target register info!";
+    abort();
+  }
+
+  std::unique_ptr<MCAsmInfo> MAI(TheTarget->createMCAsmInfo(*MRI, TripleName));
+  if (!MAI) {
+    errs() << "Unable to create target asm info!";
+    abort();
+  }
+
+  MCObjectFileInfo MOFI;
+  MCContext Ctx(MAI.get(), MRI.get(), &MOFI, &SrcMgr);
+
+  static const bool UsePIC = false;
+  static const CodeModel::Model CMModel = CodeModel::Default;
+  MOFI.InitMCObjectFileInfo(TheTriple, UsePIC, CMModel, Ctx);
+
+  const unsigned OutputAsmVariant = 0;
+  std::unique_ptr<MCInstrInfo> MCII(TheTarget->createMCInstrInfo());
+  // Owned here until it is handed to the asm streamer, so the other output
+  // kinds no longer leak it on every fuzzer iteration.
+  std::unique_ptr<MCInstPrinter> IP(TheTarget->createMCInstPrinter(
+      Triple(TripleName), OutputAsmVariant, *MAI, *MCII, *MRI));
+  if (!IP) {
+    errs()
+      << "error: unable to create instruction printer for target triple '"
+      << TheTriple.normalize() << "' with assembly variant "
+      << OutputAsmVariant << ".\n";
+    abort();
+  }
+
+  const char *ProgName = "llvm-mc-fuzzer";
+  std::unique_ptr<MCSubtargetInfo> STI(
+      TheTarget->createMCSubtargetInfo(TripleName, MCPU, FeaturesStr));
+  MCTargetOptions MCOptions = InitMCTargetOptionsFromFlags();
+
+  std::string OutputString;
+  raw_string_ostream Out(OutputString);
+  auto FOut = llvm::make_unique<formatted_raw_ostream>(Out);
+
+  // The object streamer keeps writing to these while parsing, so they are
+  // declared before (and therefore destroyed after) the streamer itself.
+  std::unique_ptr<tool_output_file> ObjOut;
+  std::unique_ptr<buffer_ostream> BOS;
+  std::unique_ptr<MCStreamer> Str;
+
+  if (FileType == OFT_AssemblyFile) {
+    Str.reset(TheTarget->createAsmStreamer(
+        Ctx, std::move(FOut), AsmVerbose, UseDwarfDirectory, IP.release(),
+        /*CE=*/nullptr, /*MAB=*/nullptr, ShowInst));
+  } else if (FileType == OFT_Null) {
+    Str.reset(TheTarget->createNullStreamer(Ctx));
+  } else {
+    assert(FileType == OFT_ObjectFile && "Invalid file type!");
+
+    std::error_code EC;
+    const std::string OutputFilename = "-";
+    ObjOut = llvm::make_unique<tool_output_file>(OutputFilename, EC,
+                                                 sys::fs::F_None);
+    if (EC) {
+      errs() << EC.message() << '\n';
+      abort();
+    }
+
+    // Don't waste memory on names of temp labels.
+    Ctx.setUseNamesOnTempLabels(false);
+
+    raw_pwrite_stream *OS = &ObjOut->os();
+    if (!ObjOut->os().supportsSeeking()) {
+      BOS = make_unique<buffer_ostream>(ObjOut->os());
+      OS = BOS.get();
+    }
+
+    MCCodeEmitter *CE = TheTarget->createMCCodeEmitter(*MCII, *MRI, Ctx);
+    MCAsmBackend *MAB = TheTarget->createMCAsmBackend(*MRI, TripleName, MCPU,
+                                                      MCOptions);
+    Str.reset(TheTarget->createMCObjectStreamer(
+        TheTriple, Ctx, *MAB, *OS, CE, *STI, MCOptions.MCRelaxAll,
+        MCOptions.MCIncrementalLinkerCompatible,
+        /*DWARFMustBeAtTheEnd*/ false));
+  }
+  // The parser's result is deliberately ignored; this always reports 0.
+  (void)AssembleInput(ProgName, TheTarget, SrcMgr, Ctx, *Str, *MAI, *STI,
+      *MCII, MCOptions);
+  return 0;
+}
+
+int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
+  return AssembleOneInput(Data, Size); // forwards the raw input unchanged
+}
+
+int LLVMFuzzerInitialize(int *argc, char ***argv) {
+  // Parses this tool's options and leaves only the fuzzer's in argc/argv.
+  // The command line is unusual compared to other fuzzers due to the need to
+  // specify the target: -triple, -mcpu, and -mattr work like their llvm-mc
+  // counterparts, while -fuzzer-args collects options for the fuzzer itself.
+  //
+  // Examples:
+  //
+  // Fuzz the big-endian MIPS32R6 assembler using 100,000 inputs of up to
+  // 4-bytes each and use the contents of ./corpus as the test corpus:
+  //   llvm-mc-assemble-fuzzer -triple mips-linux-gnu -mcpu=mips32r6 \
+  //       -fuzzer-args -max_len=4 -runs=100000 ./corpus
+  //
+  // Infinitely fuzz the little-endian MIPS64R2 assembler with the MSA
+  // feature enabled using up to 64-byte inputs:
+  //   llvm-mc-assemble-fuzzer -triple mipsel-linux-gnu -mcpu=mips64r2 \
+  //       -mattr=msa -fuzzer-args ./corpus
+  //
+  // If your aim is to find instructions that are not tested, then it is
+  // advisable to constrain the maximum input size to a single instruction
+  // using -max_len as in the first example. This results in a test corpus of
+  // individual instructions that test unique paths. Without this constraint,
+  // there will be considerable redundancy in the corpus.
+
+  char **OriginalArgv = *argv;
+
+  LLVMInitializeAllTargetInfos();
+  LLVMInitializeAllTargetMCs();
+  LLVMInitializeAllAsmParsers();
+
+  cl::ParseCommandLineOptions(*argc, OriginalArgv);
+
+  // Rebuild the argv without the arguments this tool consumed so that
+  // the driver can parse its arguments.
+  //
+  // FuzzerArgs cannot provide the non-const pointer that OriginalArgv needs.
+  // Re-use the strings from OriginalArgv instead of copying FuzzerArg to a
+  // non-const buffer to avoid the need to clean up when the fuzzer terminates.
+  ModifiedArgv.push_back(OriginalArgv[0]);
+  for (const auto &FuzzerArg : FuzzerArgs) {
+    for (int i = 1; i < *argc; ++i) {
+      if (FuzzerArg == OriginalArgv[i])
+        ModifiedArgv.push_back(OriginalArgv[i]);
+    }
+  }
+  *argc = ModifiedArgv.size();
+  *argv = ModifiedArgv.data();
+
+  // Package up features to be passed to target/subtarget
+  // We have to pass it via a global since the callback doesn't
+  // permit any user data.
+  if (MAttrs.size()) {
+    SubtargetFeatures Features;
+    for (unsigned i = 0; i != MAttrs.size(); ++i)
+      Features.AddFeature(MAttrs[i]);
+    FeaturesStr = Features.getString();
+  }
+
+  if (TripleName.empty())
+    TripleName = sys::getDefaultTargetTriple();
+
+  return 0;
+}
diff --git a/tools/llvm-mc-fuzzer/CMakeLists.txt b/tools/llvm-mc-disassemble-fuzzer/CMakeLists.txt
similarity index 64%
rename from tools/llvm-mc-fuzzer/CMakeLists.txt
rename to tools/llvm-mc-disassemble-fuzzer/CMakeLists.txt
index b42b3eee3c981decd36d2237a5825f079e66854e..c539f823e57f460135ee0b5a6520a237327d5746 100644
--- a/tools/llvm-mc-fuzzer/CMakeLists.txt
+++ b/tools/llvm-mc-disassemble-fuzzer/CMakeLists.txt
@@ -3,16 +3,19 @@ if( LLVM_USE_SANITIZE_COVERAGE )
     ${CMAKE_CURRENT_SOURCE_DIR}/../../lib/Fuzzer)
 
   set(LLVM_LINK_COMPONENTS
+      AllTargetsAsmPrinters
       AllTargetsDescs
       AllTargetsDisassemblers
       AllTargetsInfos
       MC
       MCDisassembler
+      MCParser
       Support
       )
-  add_llvm_tool(llvm-mc-fuzzer 
-                llvm-mc-fuzzer.cpp)
-  target_link_libraries(llvm-mc-fuzzer
+  add_llvm_tool(llvm-mc-disassemble-fuzzer 
+                llvm-mc-disassemble-fuzzer.cpp)
+
+  target_link_libraries(llvm-mc-disassemble-fuzzer
                         LLVMFuzzer
                         )
 endif()
diff --git a/tools/llvm-mc-fuzzer/llvm-mc-fuzzer.cpp b/tools/llvm-mc-disassemble-fuzzer/llvm-mc-disassemble-fuzzer.cpp
similarity index 89%
rename from tools/llvm-mc-fuzzer/llvm-mc-fuzzer.cpp
rename to tools/llvm-mc-disassemble-fuzzer/llvm-mc-disassemble-fuzzer.cpp
index e31ea762add53defd32dd4c4e29bc3d4e6894b82..643afe64073e6eb23afc2d0f1dd2b5f9adc7de10 100644
--- a/tools/llvm-mc-fuzzer/llvm-mc-fuzzer.cpp
+++ b/tools/llvm-mc-disassemble-fuzzer/llvm-mc-disassemble-fuzzer.cpp
@@ -20,19 +20,6 @@ using namespace llvm;
 
 const unsigned AssemblyTextBufSize = 80;
 
-enum ActionType {
-  AC_Assemble,
-  AC_Disassemble
-};
-
-static cl::opt<ActionType>
-Action(cl::desc("Action to perform:"),
-       cl::init(AC_Assemble),
-       cl::values(clEnumValN(AC_Assemble, "assemble",
-                             "Assemble a .s file (default)"),
-                  clEnumValN(AC_Disassemble, "disassemble",
-                             "Disassemble strings of hex bytes")));
-
 static cl::opt<std::string>
     TripleName("triple", cl::desc("Target triple to assemble for, "
                                   "see -version for available targets"));
@@ -88,13 +75,7 @@ int DisassembleOneInput(const uint8_t *Data, size_t Size) {
 }
 
 int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) {
-  if (Action == AC_Assemble)
-    errs() << "error: -assemble is not implemented\n";
-  else if (Action == AC_Disassemble)
-    return DisassembleOneInput(Data, Size);
-
-  llvm_unreachable("Unknown action");
-  return 0;
+  return DisassembleOneInput(Data, Size);
 }
 
 int LLVMFuzzerInitialize(int *argc, char ***argv) {
@@ -155,5 +136,8 @@ int LLVMFuzzerInitialize(int *argc, char ***argv) {
     FeaturesStr = Features.getString();
   }
 
+  if (TripleName.empty())
+    TripleName = sys::getDefaultTargetTriple();
+
   return 0;
 }
diff --git a/tools/llvm-mc/llvm-mc.cpp b/tools/llvm-mc/llvm-mc.cpp
index 497fb1987764e4a36f8ef2025b9769a55030923b..87efac2d33cf72767e408a7ef04b801c5f27eb9f 100644
--- a/tools/llvm-mc/llvm-mc.cpp
+++ b/tools/llvm-mc/llvm-mc.cpp
@@ -516,7 +516,7 @@ int main(int argc, char **argv) {
   Ctx.setGenDwarfForAssembly(GenDwarfForAssembly);
   // Default to 4 for dwarf version.
   unsigned DwarfVersion = MCOptions.DwarfVersion ? MCOptions.DwarfVersion : 4;
-  if (DwarfVersion < 2 || DwarfVersion > 4) {
+  if (DwarfVersion < 2 || DwarfVersion > 5) {
     errs() << ProgName << ": Dwarf version " << DwarfVersion
            << " is not supported." << '\n';
     return 1;
diff --git a/tools/llvm-objdump/MachODump.cpp b/tools/llvm-objdump/MachODump.cpp
index 80cbe4bc6575b735c25ebbbd1a7bde9947922ec9..9e02951a4a93f63c75fff1be0f191916c7a4e033 100644
--- a/tools/llvm-objdump/MachODump.cpp
+++ b/tools/llvm-objdump/MachODump.cpp
@@ -68,9 +68,6 @@ static cl::opt<std::string> DSYMFile("dsym",
 static cl::opt<bool> FullLeadingAddr("full-leading-addr",
                                      cl::desc("Print full leading address"));
 
-static cl::opt<bool> NoLeadingAddr("no-leading-addr",
-                                   cl::desc("Print no leading address"));
-
 static cl::opt<bool> NoLeadingHeaders("no-leading-headers",
                                       cl::desc("Print no leading headers"));
 
@@ -1142,7 +1139,7 @@ static void DumpInfoPlistSectionContents(StringRef Filename,
       StringRef BytesStr;
       Section.getContents(BytesStr);
       const char *sect = reinterpret_cast<const char *>(BytesStr.data());
-      outs() << sect;
+      outs() << format("%.*s", BytesStr.size(), sect) << "\n";
       return;
     }
   }
@@ -5274,42 +5271,70 @@ static void printObjc2_64bit_MetaData(MachOObjectFile *O, bool verbose) {
   SectionRef CL = get_section(O, "__OBJC2", "__class_list");
   if (CL == SectionRef())
     CL = get_section(O, "__DATA", "__objc_classlist");
+  if (CL == SectionRef())
+    CL = get_section(O, "__DATA_CONST", "__objc_classlist");
+  if (CL == SectionRef())
+    CL = get_section(O, "__DATA_DIRTY", "__objc_classlist");
   info.S = CL;
   walk_pointer_list_64("class", CL, O, &info, print_class64_t);
 
   SectionRef CR = get_section(O, "__OBJC2", "__class_refs");
   if (CR == SectionRef())
     CR = get_section(O, "__DATA", "__objc_classrefs");
+  if (CR == SectionRef())
+    CR = get_section(O, "__DATA_CONST", "__objc_classrefs");
+  if (CR == SectionRef())
+    CR = get_section(O, "__DATA_DIRTY", "__objc_classrefs");
   info.S = CR;
   walk_pointer_list_64("class refs", CR, O, &info, nullptr);
 
   SectionRef SR = get_section(O, "__OBJC2", "__super_refs");
   if (SR == SectionRef())
     SR = get_section(O, "__DATA", "__objc_superrefs");
+  if (SR == SectionRef())
+    SR = get_section(O, "__DATA_CONST", "__objc_superrefs");
+  if (SR == SectionRef())
+    SR = get_section(O, "__DATA_DIRTY", "__objc_superrefs");
   info.S = SR;
   walk_pointer_list_64("super refs", SR, O, &info, nullptr);
 
   SectionRef CA = get_section(O, "__OBJC2", "__category_list");
   if (CA == SectionRef())
     CA = get_section(O, "__DATA", "__objc_catlist");
+  if (CA == SectionRef())
+    CA = get_section(O, "__DATA_CONST", "__objc_catlist");
+  if (CA == SectionRef())
+    CA = get_section(O, "__DATA_DIRTY", "__objc_catlist");
   info.S = CA;
   walk_pointer_list_64("category", CA, O, &info, print_category64_t);
 
   SectionRef PL = get_section(O, "__OBJC2", "__protocol_list");
   if (PL == SectionRef())
     PL = get_section(O, "__DATA", "__objc_protolist");
+  if (PL == SectionRef())
+    PL = get_section(O, "__DATA_CONST", "__objc_protolist");
+  if (PL == SectionRef())
+    PL = get_section(O, "__DATA_DIRTY", "__objc_protolist");
   info.S = PL;
   walk_pointer_list_64("protocol", PL, O, &info, nullptr);
 
   SectionRef MR = get_section(O, "__OBJC2", "__message_refs");
   if (MR == SectionRef())
     MR = get_section(O, "__DATA", "__objc_msgrefs");
+  if (MR == SectionRef())
+    MR = get_section(O, "__DATA_CONST", "__objc_msgrefs");
+  if (MR == SectionRef())
+    MR = get_section(O, "__DATA_DIRTY", "__objc_msgrefs");
   info.S = MR;
   print_message_refs64(MR, &info);
 
   SectionRef II = get_section(O, "__OBJC2", "__image_info");
   if (II == SectionRef())
     II = get_section(O, "__DATA", "__objc_imageinfo");
+  if (II == SectionRef())
+    II = get_section(O, "__DATA_CONST", "__objc_imageinfo");
+  if (II == SectionRef())
+    II = get_section(O, "__DATA_DIRTY", "__objc_imageinfo");
   info.S = II;
   print_image_info64(II, &info);
 }
@@ -5340,75 +5365,75 @@ static void printObjc2_32bit_MetaData(MachOObjectFile *O, bool verbose) {
   info.adrp_addr = 0;
   info.adrp_inst = 0;
 
-  const SectionRef CL = get_section(O, "__OBJC2", "__class_list");
-  if (CL != SectionRef()) {
-    info.S = CL;
-    walk_pointer_list_32("class", CL, O, &info, print_class32_t);
-  } else {
-    const SectionRef CL = get_section(O, "__DATA", "__objc_classlist");
-    info.S = CL;
-    walk_pointer_list_32("class", CL, O, &info, print_class32_t);
-  }
+  SectionRef CL = get_section(O, "__OBJC2", "__class_list");
+  if (CL == SectionRef())
+    CL = get_section(O, "__DATA", "__objc_classlist");
+  if (CL == SectionRef())
+    CL = get_section(O, "__DATA_CONST", "__objc_classlist");
+  if (CL == SectionRef())
+    CL = get_section(O, "__DATA_DIRTY", "__objc_classlist");
+  info.S = CL;
+  walk_pointer_list_32("class", CL, O, &info, print_class32_t);
 
-  const SectionRef CR = get_section(O, "__OBJC2", "__class_refs");
-  if (CR != SectionRef()) {
-    info.S = CR;
-    walk_pointer_list_32("class refs", CR, O, &info, nullptr);
-  } else {
-    const SectionRef CR = get_section(O, "__DATA", "__objc_classrefs");
-    info.S = CR;
-    walk_pointer_list_32("class refs", CR, O, &info, nullptr);
-  }
+  SectionRef CR = get_section(O, "__OBJC2", "__class_refs");
+  if (CR == SectionRef())
+    CR = get_section(O, "__DATA", "__objc_classrefs");
+  if (CR == SectionRef())
+    CR = get_section(O, "__DATA_CONST", "__objc_classrefs");
+  if (CR == SectionRef())
+    CR = get_section(O, "__DATA_DIRTY", "__objc_classrefs");
+  info.S = CR;
+  walk_pointer_list_32("class refs", CR, O, &info, nullptr);
 
-  const SectionRef SR = get_section(O, "__OBJC2", "__super_refs");
-  if (SR != SectionRef()) {
-    info.S = SR;
-    walk_pointer_list_32("super refs", SR, O, &info, nullptr);
-  } else {
-    const SectionRef SR = get_section(O, "__DATA", "__objc_superrefs");
-    info.S = SR;
-    walk_pointer_list_32("super refs", SR, O, &info, nullptr);
-  }
+  SectionRef SR = get_section(O, "__OBJC2", "__super_refs");
+  if (SR == SectionRef())
+    SR = get_section(O, "__DATA", "__objc_superrefs");
+  if (SR == SectionRef())
+    SR = get_section(O, "__DATA_CONST", "__objc_superrefs");
+  if (SR == SectionRef())
+    SR = get_section(O, "__DATA_DIRTY", "__objc_superrefs");
+  info.S = SR;
+  walk_pointer_list_32("super refs", SR, O, &info, nullptr);
 
-  const SectionRef CA = get_section(O, "__OBJC2", "__category_list");
-  if (CA != SectionRef()) {
-    info.S = CA;
-    walk_pointer_list_32("category", CA, O, &info, print_category32_t);
-  } else {
-    const SectionRef CA = get_section(O, "__DATA", "__objc_catlist");
-    info.S = CA;
-    walk_pointer_list_32("category", CA, O, &info, print_category32_t);
-  }
+  SectionRef CA = get_section(O, "__OBJC2", "__category_list");
+  if (CA == SectionRef())
+    CA = get_section(O, "__DATA", "__objc_catlist");
+  if (CA == SectionRef())
+    CA = get_section(O, "__DATA_CONST", "__objc_catlist");
+  if (CA == SectionRef())
+    CA = get_section(O, "__DATA_DIRTY", "__objc_catlist");
+  info.S = CA;
+  walk_pointer_list_32("category", CA, O, &info, print_category32_t);
 
-  const SectionRef PL = get_section(O, "__OBJC2", "__protocol_list");
-  if (PL != SectionRef()) {
-    info.S = PL;
-    walk_pointer_list_32("protocol", PL, O, &info, nullptr);
-  } else {
-    const SectionRef PL = get_section(O, "__DATA", "__objc_protolist");
-    info.S = PL;
-    walk_pointer_list_32("protocol", PL, O, &info, nullptr);
-  }
+  SectionRef PL = get_section(O, "__OBJC2", "__protocol_list");
+  if (PL == SectionRef())
+    PL = get_section(O, "__DATA", "__objc_protolist");
+  if (PL == SectionRef())
+    PL = get_section(O, "__DATA_CONST", "__objc_protolist");
+  if (PL == SectionRef())
+    PL = get_section(O, "__DATA_DIRTY", "__objc_protolist");
+  info.S = PL;
+  walk_pointer_list_32("protocol", PL, O, &info, nullptr);
 
-  const SectionRef MR = get_section(O, "__OBJC2", "__message_refs");
-  if (MR != SectionRef()) {
-    info.S = MR;
-    print_message_refs32(MR, &info);
-  } else {
-    const SectionRef MR = get_section(O, "__DATA", "__objc_msgrefs");
-    info.S = MR;
-    print_message_refs32(MR, &info);
-  }
+  SectionRef MR = get_section(O, "__OBJC2", "__message_refs");
+  if (MR == SectionRef())
+    MR = get_section(O, "__DATA", "__objc_msgrefs");
+  if (MR == SectionRef())
+    MR = get_section(O, "__DATA_CONST", "__objc_msgrefs");
+  if (MR == SectionRef())
+    MR = get_section(O, "__DATA_DIRTY", "__objc_msgrefs");
+  info.S = MR;
+  print_message_refs32(MR, &info);
 
-  const SectionRef II = get_section(O, "__OBJC2", "__image_info");
-  if (II != SectionRef()) {
-    info.S = II;
-    print_image_info32(II, &info);
-  } else {
-    const SectionRef II = get_section(O, "__DATA", "__objc_imageinfo");
-    info.S = II;
-    print_image_info32(II, &info);
-  }
+  SectionRef II = get_section(O, "__OBJC2", "__image_info");
+  if (II == SectionRef())
+    II = get_section(O, "__DATA", "__objc_imageinfo");
+  if (II == SectionRef())
+    II = get_section(O, "__DATA_CONST", "__objc_imageinfo");
+  if (II == SectionRef())
+    II = get_section(O, "__DATA_DIRTY", "__objc_imageinfo");
+  info.S = II;
+  print_image_info32(II, &info);
 }
 
 static bool printObjc1_32bit_MetaData(MachOObjectFile *O, bool verbose) {
@@ -6602,6 +6627,12 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
     if (Bytes.size() == 0)
       return;
 
+    // If the section has symbols but no symbol at the start of the section
+    // these are used to make sure the bytes before the first symbol are
+    // disassembled.
+    bool FirstSymbol = true;
+    bool FirstSymbolAtSectionStart = true;
+
     // Disassemble symbol by symbol.
     for (unsigned SymIdx = 0; SymIdx != Symbols.size(); SymIdx++) {
       Expected<StringRef> SymNameOrErr = Symbols[SymIdx].getName();
@@ -6691,11 +6722,29 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
       // (i.e. we're not targeting M-class) and the function is Thumb.
       bool UseThumbTarget = IsThumb && ThumbTarget;
 
-      outs() << SymName << ":\n";
+      // If we are not specifying a symbol to start disassembly with and this
+      // is the first symbol in the section but not at the start of the section
+      // then move the disassembly index to the start of the section and
+      // don't print the symbol name just yet.  This is so the bytes before the
+      // first symbol are disassembled.
+      uint64_t SymbolStart = Start;
+      if (DisSymName.empty() && FirstSymbol && Start != 0) {
+        FirstSymbolAtSectionStart = false;
+        Start = 0;
+      }
+      else
+        outs() << SymName << ":\n";
+
       DILineInfo lastLine;
       for (uint64_t Index = Start; Index < End; Index += Size) {
         MCInst Inst;
 
+        // If this is the first symbol in the section and it was not at the
+        // start of the section, see if we are at its Index now and if so print
+        // the symbol name.
+        if (FirstSymbol && !FirstSymbolAtSectionStart && Index == SymbolStart)
+          outs() << SymName << ":\n";
+
         uint64_t PC = SectAddress + Index;
         if (!NoLeadingAddr) {
           if (FullLeadingAddr) {
@@ -6788,6 +6837,9 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
           }
         }
       }
+      // Now that we are done disassembling the first symbol, clear the flag
+      // that enabled this special handling.
+      FirstSymbol = false;
     }
     if (!symbolTableWorked) {
       // Reading the symbol table didn't work, disassemble the whole section.
@@ -6798,8 +6850,10 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
         MCInst Inst;
 
         uint64_t PC = SectAddress + Index;
+        SmallVector<char, 64> AnnotationsBytes;
+        raw_svector_ostream Annotations(AnnotationsBytes);
         if (DisAsm->getInstruction(Inst, InstSize, Bytes.slice(Index), PC,
-                                   DebugOut, nulls())) {
+                                   DebugOut, Annotations)) {
           if (!NoLeadingAddr) {
             if (FullLeadingAddr) {
               if (MachOOF->is64Bit())
@@ -6814,7 +6868,8 @@ static void DisassembleMachO(StringRef Filename, MachOObjectFile *MachOOF,
             outs() << "\t";
             dumpBytes(makeArrayRef(Bytes.data() + Index, InstSize), outs());
           }
-          IP->printInst(&Inst, outs(), "", *STI);
+          StringRef AnnotationsStr = Annotations.str();
+          IP->printInst(&Inst, outs(), AnnotationsStr, *STI);
           outs() << "\n";
         } else {
           unsigned int Arch = MachOOF->getArch();
@@ -9336,117 +9391,21 @@ void llvm::printMachOExportsTrie(const object::MachOObjectFile *Obj) {
 // rebase table dumping
 //===----------------------------------------------------------------------===//
 
-namespace {
-class SegInfo {
-public:
-  SegInfo(const object::MachOObjectFile *Obj);
-
-  StringRef segmentName(uint32_t SegIndex);
-  StringRef sectionName(uint32_t SegIndex, uint64_t SegOffset);
-  uint64_t address(uint32_t SegIndex, uint64_t SegOffset);
-  bool isValidSegIndexAndOffset(uint32_t SegIndex, uint64_t SegOffset);
-
-private:
-  struct SectionInfo {
-    uint64_t Address;
-    uint64_t Size;
-    StringRef SectionName;
-    StringRef SegmentName;
-    uint64_t OffsetInSegment;
-    uint64_t SegmentStartAddress;
-    uint32_t SegmentIndex;
-  };
-  const SectionInfo &findSection(uint32_t SegIndex, uint64_t SegOffset);
-  SmallVector<SectionInfo, 32> Sections;
-};
-}
-
-SegInfo::SegInfo(const object::MachOObjectFile *Obj) {
-  // Build table of sections so segIndex/offset pairs can be translated.
-  uint32_t CurSegIndex = Obj->hasPageZeroSegment() ? 1 : 0;
-  StringRef CurSegName;
-  uint64_t CurSegAddress;
-  for (const SectionRef &Section : Obj->sections()) {
-    SectionInfo Info;
-    error(Section.getName(Info.SectionName));
-    Info.Address = Section.getAddress();
-    Info.Size = Section.getSize();
-    Info.SegmentName =
-        Obj->getSectionFinalSegmentName(Section.getRawDataRefImpl());
-    if (!Info.SegmentName.equals(CurSegName)) {
-      ++CurSegIndex;
-      CurSegName = Info.SegmentName;
-      CurSegAddress = Info.Address;
-    }
-    Info.SegmentIndex = CurSegIndex - 1;
-    Info.OffsetInSegment = Info.Address - CurSegAddress;
-    Info.SegmentStartAddress = CurSegAddress;
-    Sections.push_back(Info);
-  }
-}
-
-StringRef SegInfo::segmentName(uint32_t SegIndex) {
-  for (const SectionInfo &SI : Sections) {
-    if (SI.SegmentIndex == SegIndex)
-      return SI.SegmentName;
-  }
-  llvm_unreachable("invalid segIndex");
-}
-
-bool SegInfo::isValidSegIndexAndOffset(uint32_t SegIndex,
-                                       uint64_t OffsetInSeg) {
-  for (const SectionInfo &SI : Sections) {
-    if (SI.SegmentIndex != SegIndex)
-      continue;
-    if (SI.OffsetInSegment > OffsetInSeg)
-      continue;
-    if (OffsetInSeg >= (SI.OffsetInSegment + SI.Size))
-      continue;
-    return true;
-  }
-  return false;
-}
-
-const SegInfo::SectionInfo &SegInfo::findSection(uint32_t SegIndex,
-                                                 uint64_t OffsetInSeg) {
-  for (const SectionInfo &SI : Sections) {
-    if (SI.SegmentIndex != SegIndex)
-      continue;
-    if (SI.OffsetInSegment > OffsetInSeg)
-      continue;
-    if (OffsetInSeg >= (SI.OffsetInSegment + SI.Size))
-      continue;
-    return SI;
-  }
-  llvm_unreachable("segIndex and offset not in any section");
-}
-
-StringRef SegInfo::sectionName(uint32_t SegIndex, uint64_t OffsetInSeg) {
-  return findSection(SegIndex, OffsetInSeg).SectionName;
-}
-
-uint64_t SegInfo::address(uint32_t SegIndex, uint64_t OffsetInSeg) {
-  const SectionInfo &SI = findSection(SegIndex, OffsetInSeg);
-  return SI.SegmentStartAddress + OffsetInSeg;
-}
-
-void llvm::printMachORebaseTable(const object::MachOObjectFile *Obj) {
-  // Build table of sections so names can used in final output.
-  SegInfo sectionTable(Obj);
-
+void llvm::printMachORebaseTable(object::MachOObjectFile *Obj) {
   outs() << "segment  section            address     type\n";
-  for (const llvm::object::MachORebaseEntry &Entry : Obj->rebaseTable()) {
-    uint32_t SegIndex = Entry.segmentIndex();
-    uint64_t OffsetInSeg = Entry.segmentOffset();
-    StringRef SegmentName = sectionTable.segmentName(SegIndex);
-    StringRef SectionName = sectionTable.sectionName(SegIndex, OffsetInSeg);
-    uint64_t Address = sectionTable.address(SegIndex, OffsetInSeg);
+  Error Err = Error::success();
+  for (const llvm::object::MachORebaseEntry &Entry : Obj->rebaseTable(Err)) {
+    StringRef SegmentName = Entry.segmentName();
+    StringRef SectionName = Entry.sectionName();
+    uint64_t Address = Entry.address();
 
     // Table lines look like: __DATA  __nl_symbol_ptr  0x0000F00C  pointer
     outs() << format("%-8s %-18s 0x%08" PRIX64 "  %s\n",
                      SegmentName.str().c_str(), SectionName.str().c_str(),
                      Address, Entry.typeName().str().c_str());
   }
+  if (Err)
+    report_error(Obj->getFileName(), std::move(Err));
 }
 
 static StringRef ordinalName(const object::MachOObjectFile *Obj, int Ordinal) {
@@ -9474,18 +9433,15 @@ static StringRef ordinalName(const object::MachOObjectFile *Obj, int Ordinal) {
 // bind table dumping
 //===----------------------------------------------------------------------===//
 
-void llvm::printMachOBindTable(const object::MachOObjectFile *Obj) {
+void llvm::printMachOBindTable(object::MachOObjectFile *Obj) {
   // Build table of sections so names can used in final output.
-  SegInfo sectionTable(Obj);
-
   outs() << "segment  section            address    type       "
             "addend dylib            symbol\n";
-  for (const llvm::object::MachOBindEntry &Entry : Obj->bindTable()) {
-    uint32_t SegIndex = Entry.segmentIndex();
-    uint64_t OffsetInSeg = Entry.segmentOffset();
-    StringRef SegmentName = sectionTable.segmentName(SegIndex);
-    StringRef SectionName = sectionTable.sectionName(SegIndex, OffsetInSeg);
-    uint64_t Address = sectionTable.address(SegIndex, OffsetInSeg);
+  Error Err = Error::success();
+  for (const llvm::object::MachOBindEntry &Entry : Obj->bindTable(Err)) {
+    StringRef SegmentName = Entry.segmentName();
+    StringRef SectionName = Entry.sectionName();
+    uint64_t Address = Entry.address();
 
     // Table lines look like:
     //  __DATA  __got  0x00012010    pointer   0 libSystem ___stack_chk_guard
@@ -9500,24 +9456,22 @@ void llvm::printMachOBindTable(const object::MachOObjectFile *Obj) {
            << left_justify(ordinalName(Obj, Entry.ordinal()), 16) << " "
            << Entry.symbolName() << Attr << "\n";
   }
+  if (Err)
+    report_error(Obj->getFileName(), std::move(Err));
 }
 
 //===----------------------------------------------------------------------===//
 // lazy bind table dumping
 //===----------------------------------------------------------------------===//
 
-void llvm::printMachOLazyBindTable(const object::MachOObjectFile *Obj) {
-  // Build table of sections so names can used in final output.
-  SegInfo sectionTable(Obj);
-
+void llvm::printMachOLazyBindTable(object::MachOObjectFile *Obj) {
   outs() << "segment  section            address     "
             "dylib            symbol\n";
-  for (const llvm::object::MachOBindEntry &Entry : Obj->lazyBindTable()) {
-    uint32_t SegIndex = Entry.segmentIndex();
-    uint64_t OffsetInSeg = Entry.segmentOffset();
-    StringRef SegmentName = sectionTable.segmentName(SegIndex);
-    StringRef SectionName = sectionTable.sectionName(SegIndex, OffsetInSeg);
-    uint64_t Address = sectionTable.address(SegIndex, OffsetInSeg);
+  Error Err = Error::success();
+  for (const llvm::object::MachOBindEntry &Entry : Obj->lazyBindTable(Err)) {
+    StringRef SegmentName = Entry.segmentName();
+    StringRef SectionName = Entry.sectionName();
+    uint64_t Address = Entry.address();
 
     // Table lines look like:
     //  __DATA  __got  0x00012010 libSystem ___stack_chk_guard
@@ -9527,30 +9481,28 @@ void llvm::printMachOLazyBindTable(const object::MachOObjectFile *Obj) {
            << left_justify(ordinalName(Obj, Entry.ordinal()), 16) << " "
            << Entry.symbolName() << "\n";
   }
+  if (Err)
+    report_error(Obj->getFileName(), std::move(Err));
 }
 
 //===----------------------------------------------------------------------===//
 // weak bind table dumping
 //===----------------------------------------------------------------------===//
 
-void llvm::printMachOWeakBindTable(const object::MachOObjectFile *Obj) {
-  // Build table of sections so names can used in final output.
-  SegInfo sectionTable(Obj);
-
+void llvm::printMachOWeakBindTable(object::MachOObjectFile *Obj) {
   outs() << "segment  section            address     "
             "type       addend   symbol\n";
-  for (const llvm::object::MachOBindEntry &Entry : Obj->weakBindTable()) {
+  Error Err = Error::success();
+  for (const llvm::object::MachOBindEntry &Entry : Obj->weakBindTable(Err)) {
     // Strong symbols don't have a location to update.
     if (Entry.flags() & MachO::BIND_SYMBOL_FLAGS_NON_WEAK_DEFINITION) {
       outs() << "                                        strong              "
              << Entry.symbolName() << "\n";
       continue;
     }
-    uint32_t SegIndex = Entry.segmentIndex();
-    uint64_t OffsetInSeg = Entry.segmentOffset();
-    StringRef SegmentName = sectionTable.segmentName(SegIndex);
-    StringRef SectionName = sectionTable.sectionName(SegIndex, OffsetInSeg);
-    uint64_t Address = sectionTable.address(SegIndex, OffsetInSeg);
+    StringRef SegmentName = Entry.segmentName();
+    StringRef SectionName = Entry.sectionName();
+    uint64_t Address = Entry.address();
 
     // Table lines look like:
     // __DATA  __data  0x00001000  pointer    0   _foo
@@ -9561,6 +9513,8 @@ void llvm::printMachOWeakBindTable(const object::MachOObjectFile *Obj) {
            << format_decimal(Entry.addend(), 8) << "   " << Entry.symbolName()
            << "\n";
   }
+  if (Err)
+    report_error(Obj->getFileName(), std::move(Err));
 }
 
 // get_dyld_bind_info_symbolname() is used for disassembly and passed an
@@ -9571,17 +9525,15 @@ static const char *get_dyld_bind_info_symbolname(uint64_t ReferenceValue,
                                                  struct DisassembleInfo *info) {
   if (info->bindtable == nullptr) {
     info->bindtable = llvm::make_unique<SymbolAddressMap>();
-    SegInfo sectionTable(info->O);
-    for (const llvm::object::MachOBindEntry &Entry : info->O->bindTable()) {
-      uint32_t SegIndex = Entry.segmentIndex();
-      uint64_t OffsetInSeg = Entry.segmentOffset();
-      if (!sectionTable.isValidSegIndexAndOffset(SegIndex, OffsetInSeg))
-        continue;
-      uint64_t Address = sectionTable.address(SegIndex, OffsetInSeg);
+    Error Err = Error::success();
+    for (const llvm::object::MachOBindEntry &Entry : info->O->bindTable(Err)) {
+      uint64_t Address = Entry.address();
       StringRef name = Entry.symbolName();
       if (!name.empty())
         (*info->bindtable)[Address] = name;
     }
+    if (Err)
+      report_error(info->O->getFileName(), std::move(Err));
   }
   auto name = info->bindtable->lookup(ReferenceValue);
   return !name.empty() ? name.data() : nullptr;
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index 6b8a43ea1bdd35d03b673e9a152d6775bb65c33d..613d0643b43357e0ea8c823b9b52d05cacdb1e9f 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -158,6 +158,8 @@ cl::opt<bool>
 llvm::NoShowRawInsn("no-show-raw-insn", cl::desc("When disassembling "
                                                  "instructions, do not print "
                                                  "the instruction bytes."));
+cl::opt<bool>
+llvm::NoLeadingAddr("no-leading-addr", cl::desc("Print no leading address"));
 
 cl::opt<bool>
 llvm::UnwindInfo("unwind-info", cl::desc("Display unwind information"));
@@ -213,6 +215,8 @@ cl::opt<unsigned long long>
                 cl::value_desc("address"), cl::init(UINT64_MAX));
 static StringRef ToolName;
 
+typedef std::vector<std::tuple<uint64_t, StringRef, uint8_t>> SectionSymbolsTy;
+
 namespace {
 typedef std::function<bool(llvm::object::SectionRef const &)> FilterPredicate;
 
@@ -508,7 +512,8 @@ public:
                          MCSubtargetInfo const &STI, SourcePrinter *SP) {
     if (SP && (PrintSource || PrintLines))
       SP->printSourceLine(OS, Address);
-    OS << format("%8" PRIx64 ":", Address);
+    if (!NoLeadingAddr)
+      OS << format("%8" PRIx64 ":", Address);
     if (!NoShowRawInsn) {
       OS << "\t";
       dumpBytes(Bytes, OS);
@@ -526,7 +531,8 @@ public:
                  raw_ostream &OS) {
     uint32_t opcode =
       (Bytes[3] << 24) | (Bytes[2] << 16) | (Bytes[1] << 8) | Bytes[0];
-    OS << format("%8" PRIx64 ":", Address);
+    if (!NoLeadingAddr)
+      OS << format("%8" PRIx64 ":", Address);
     if (!NoShowRawInsn) {
       OS << "\t";
       dumpBytes(Bytes.slice(0, 4), OS);
@@ -587,6 +593,9 @@ public:
   void printInst(MCInstPrinter &IP, const MCInst *MI, ArrayRef<uint8_t> Bytes,
                  uint64_t Address, raw_ostream &OS, StringRef Annot,
                  MCSubtargetInfo const &STI, SourcePrinter *SP) override {
+    if (SP && (PrintSource || PrintLines))
+      SP->printSourceLine(OS, Address);
+
     if (!MI) {
       OS << " <unknown>";
       return;
@@ -618,7 +627,8 @@ public:
                  MCSubtargetInfo const &STI, SourcePrinter *SP) override {
     if (SP && (PrintSource || PrintLines))
       SP->printSourceLine(OS, Address);
-    OS << format("%8" PRId64 ":", Address / 8);
+    if (!NoLeadingAddr)
+      OS << format("%8" PRId64 ":", Address / 8);
     if (!NoShowRawInsn) {
       OS << "\t";
       dumpBytes(Bytes, OS);
@@ -1108,6 +1118,52 @@ static uint8_t getElfSymbolType(const ObjectFile *Obj, const SymbolRef &Sym) {
   llvm_unreachable("Unsupported binary format");
 }
 
+// Adds each sized function (STT_FUNC) symbol found in Obj's dynamic symbol
+// table to AllSymbols, grouped under the section the symbol belongs to.
+template <class ELFT>
+static void
+addDynamicElfSymbols(const ELFObjectFile<ELFT> *Obj,
+                     std::map<SectionRef, SectionSymbolsTy> &AllSymbols) {
+  for (auto DynSym : Obj->getDynamicSymbolIterators()) {
+    // Skip anything that is not a function with a non-zero size.
+    const uint8_t Kind = DynSym.getELFType();
+    if (!(Kind == ELF::STT_FUNC && DynSym.getSize() != 0))
+      continue;
+
+    Expected<uint64_t> Addr = DynSym.getAddress();
+    if (!Addr)
+      report_error(Obj->getFileName(), Addr.takeError());
+
+    Expected<StringRef> SymName = DynSym.getName();
+    if (!SymName)
+      report_error(Obj->getFileName(), SymName.takeError());
+    if (SymName->empty())
+      continue;
+
+    Expected<section_iterator> Sec = DynSym.getSection();
+    if (!Sec)
+      report_error(Obj->getFileName(), Sec.takeError());
+    if (*Sec != Obj->section_end())
+      AllSymbols[**Sec].emplace_back(*Addr, *SymName, Kind);
+  }
+}
+
+// Forwards to the addDynamicElfSymbols overload for Obj's concrete ELF class.
+static void addDynamicElfSymbols(
+    const ObjectFile *Obj, std::map<SectionRef, SectionSymbolsTy> &AllSymbols) {
+  assert(Obj->isELF());
+  if (auto *Elf32LEObj = dyn_cast<ELF32LEObjectFile>(Obj))
+    addDynamicElfSymbols(Elf32LEObj, AllSymbols);
+  else if (auto *Elf64LEObj = dyn_cast<ELF64LEObjectFile>(Obj))
+    addDynamicElfSymbols(Elf64LEObj, AllSymbols);
+  else if (auto *Elf32BEObj = dyn_cast<ELF32BEObjectFile>(Obj))
+    addDynamicElfSymbols(Elf32BEObj, AllSymbols);
+  else if (auto *Elf64BEObj = dyn_cast<ELF64BEObjectFile>(Obj))
+    addDynamicElfSymbols(Elf64BEObj, AllSymbols);
+  else
+    llvm_unreachable("Unsupported binary format");
+}
+
 static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
   if (StartAddress > StopAddress)
     error("Start address should be less than stop address");
@@ -1182,7 +1238,6 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
 
   // Create a mapping from virtual address to symbol name.  This is used to
   // pretty print the symbols while disassembling.
-  typedef std::vector<std::tuple<uint64_t, StringRef, uint8_t>> SectionSymbolsTy;
   std::map<SectionRef, SectionSymbolsTy> AllSymbols;
   for (const SymbolRef &Symbol : Obj->symbols()) {
     Expected<uint64_t> AddressOrErr = Symbol.getAddress();
@@ -1210,6 +1265,8 @@ static void DisassembleObject(const ObjectFile *Obj, bool InlineRelocs) {
     AllSymbols[*SecI].emplace_back(Address, *Name, SymbolType);
 
   }
+  if (AllSymbols.empty() && Obj->isELF())
+    addDynamicElfSymbols(Obj, AllSymbols);
 
   // Create a mapping from virtual address to section.
   std::vector<std::pair<uint64_t, SectionRef>> SectionAddresses;
@@ -1828,9 +1885,9 @@ void llvm::printExportsTrie(const ObjectFile *o) {
   }
 }
 
-void llvm::printRebaseTable(const ObjectFile *o) {
+void llvm::printRebaseTable(ObjectFile *o) {
   outs() << "Rebase table:\n";
-  if (const MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
+  if (MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
     printMachORebaseTable(MachO);
   else {
     errs() << "This operation is only currently supported "
@@ -1839,9 +1896,9 @@ void llvm::printRebaseTable(const ObjectFile *o) {
   }
 }
 
-void llvm::printBindTable(const ObjectFile *o) {
+void llvm::printBindTable(ObjectFile *o) {
   outs() << "Bind table:\n";
-  if (const MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
+  if (MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
     printMachOBindTable(MachO);
   else {
     errs() << "This operation is only currently supported "
@@ -1850,9 +1907,9 @@ void llvm::printBindTable(const ObjectFile *o) {
   }
 }
 
-void llvm::printLazyBindTable(const ObjectFile *o) {
+void llvm::printLazyBindTable(ObjectFile *o) {
   outs() << "Lazy bind table:\n";
-  if (const MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
+  if (MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
     printMachOLazyBindTable(MachO);
   else {
     errs() << "This operation is only currently supported "
@@ -1861,9 +1918,9 @@ void llvm::printLazyBindTable(const ObjectFile *o) {
   }
 }
 
-void llvm::printWeakBindTable(const ObjectFile *o) {
+void llvm::printWeakBindTable(ObjectFile *o) {
   outs() << "Weak bind table:\n";
-  if (const MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
+  if (MachOObjectFile *MachO = dyn_cast<MachOObjectFile>(o))
     printMachOWeakBindTable(MachO);
   else {
     errs() << "This operation is only currently supported "
@@ -1961,7 +2018,7 @@ static void printPrivateFileHeaders(const ObjectFile *o, bool onlyFirst) {
   report_error(o->getFileName(), "Invalid/Unsupported object file format");
 }
 
-static void DumpObject(const ObjectFile *o, const Archive *a = nullptr) {
+static void DumpObject(ObjectFile *o, const Archive *a = nullptr) {
   StringRef ArchiveName = a != nullptr ? a->getFileName() : "";
   // Avoid other output when using a raw option.
   if (!RawClangAST) {
diff --git a/tools/llvm-objdump/llvm-objdump.h b/tools/llvm-objdump/llvm-objdump.h
index dace82af3d07d5496e18d4f6bf1aca4d41b9e66d..2fcd506884b18275650a8b6011124cfde6b57608 100644
--- a/tools/llvm-objdump/llvm-objdump.h
+++ b/tools/llvm-objdump/llvm-objdump.h
@@ -35,6 +35,7 @@ extern cl::list<std::string> FilterSections;
 extern cl::opt<bool> Disassemble;
 extern cl::opt<bool> DisassembleAll;
 extern cl::opt<bool> NoShowRawInsn;
+extern cl::opt<bool> NoLeadingAddr;
 extern cl::opt<bool> PrivateHeaders;
 extern cl::opt<bool> FirstPrivateHeader;
 extern cl::opt<bool> ExportsTrie;
@@ -69,10 +70,10 @@ void ParseInputMachO(StringRef Filename);
 void printCOFFUnwindInfo(const object::COFFObjectFile* o);
 void printMachOUnwindInfo(const object::MachOObjectFile* o);
 void printMachOExportsTrie(const object::MachOObjectFile* o);
-void printMachORebaseTable(const object::MachOObjectFile* o);
-void printMachOBindTable(const object::MachOObjectFile* o);
-void printMachOLazyBindTable(const object::MachOObjectFile* o);
-void printMachOWeakBindTable(const object::MachOObjectFile* o);
+void printMachORebaseTable(object::MachOObjectFile* o);
+void printMachOBindTable(object::MachOObjectFile* o);
+void printMachOLazyBindTable(object::MachOObjectFile* o);
+void printMachOWeakBindTable(object::MachOObjectFile* o);
 void printELFFileHeader(const object::ObjectFile *o);
 void printCOFFFileHeader(const object::ObjectFile *o);
 void printCOFFSymbolTable(const object::COFFImportFile *i);
@@ -81,10 +82,10 @@ void printMachOFileHeader(const object::ObjectFile *o);
 void printMachOLoadCommands(const object::ObjectFile *o);
 void printWasmFileHeader(const object::ObjectFile *o);
 void printExportsTrie(const object::ObjectFile *o);
-void printRebaseTable(const object::ObjectFile *o);
-void printBindTable(const object::ObjectFile *o);
-void printLazyBindTable(const object::ObjectFile *o);
-void printWeakBindTable(const object::ObjectFile *o);
+void printRebaseTable(object::ObjectFile *o);
+void printBindTable(object::ObjectFile *o);
+void printLazyBindTable(object::ObjectFile *o);
+void printWeakBindTable(object::ObjectFile *o);
 void printRawClangAST(const object::ObjectFile *o);
 void PrintRelocations(const object::ObjectFile *o);
 void PrintSectionHeaders(const object::ObjectFile *o);
diff --git a/tools/llvm-pdbdump/Analyze.cpp b/tools/llvm-pdbdump/Analyze.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b65dd40d25ff1b2e7d154c906cb03217fee1c78a
--- /dev/null
+++ b/tools/llvm-pdbdump/Analyze.cpp
@@ -0,0 +1,164 @@
+//===- Analyze.cpp - PDB analysis functions ---------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Analyze.h"
+
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
+#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
+#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/RawError.h"
+#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
+
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <list>
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::pdb;
+
+static StringRef getLeafTypeName(TypeLeafKind LT) {
+  switch (LT) {
+#define TYPE_RECORD(ename, value, name)                                        \
+  case ename:                                                                  \
+    return #name;
+#include "llvm/DebugInfo/CodeView/TypeRecords.def"
+  default:
+    break;
+  }
+  return "UnknownLeaf";
+}
+
+namespace {
+struct HashLookupVisitor : public TypeVisitorCallbacks {
+  struct Entry {
+    TypeIndex TI;
+    CVType Record;
+  };
+
+  explicit HashLookupVisitor(TpiStream &Tpi) : Tpi(Tpi) {}
+
+  Error visitTypeBegin(CVType &Record) override {
+    uint32_t H = Tpi.getHashValues()[I];
+    Record.Hash = H;
+    TypeIndex TI(I + TypeIndex::FirstNonSimpleIndex);
+    Lookup[H].push_back(Entry{TI, Record});
+    ++I;
+    return Error::success();
+  }
+
+  uint32_t I = 0;
+  DenseMap<uint32_t, std::list<Entry>> Lookup;
+  TpiStream &Tpi;
+};
+}
+
+AnalysisStyle::AnalysisStyle(PDBFile &File) : File(File) {}
+
+Error AnalysisStyle::dump() {
+  auto Tpi = File.getPDBTpiStream();
+  if (!Tpi)
+    return Tpi.takeError();
+
+  TypeDatabase TypeDB;
+  TypeDatabaseVisitor DBV(TypeDB);
+  TypeDeserializer Deserializer;
+  TypeVisitorCallbackPipeline Pipeline;
+  HashLookupVisitor Hasher(*Tpi);
+  // Deserialize the types
+  Pipeline.addCallbackToPipeline(Deserializer);
+  // Add them to the database
+  Pipeline.addCallbackToPipeline(DBV);
+  // Store their hash values
+  Pipeline.addCallbackToPipeline(Hasher);
+
+  CVTypeVisitor Visitor(Pipeline);
+
+  bool Error = false;
+  for (auto Item : Tpi->types(&Error)) {
+    if (auto EC = Visitor.visitTypeRecord(Item))
+      return EC;
+  }
+  if (Error)
+    return make_error<RawError>(raw_error_code::corrupt_file,
+                                "TPI stream contained corrupt record");
+
+  auto &Adjusters = Tpi->getHashAdjusters();
+  DenseSet<uint32_t> AdjusterSet;
+  for (const auto &Adj : Adjusters) {
+    assert(AdjusterSet.find(Adj.second) == AdjusterSet.end());
+    AdjusterSet.insert(Adj.second);
+  }
+
+  uint32_t Count = 0;
+  outs() << "Searching for hash collisions\n";
+  for (const auto &H : Hasher.Lookup) {
+    if (H.second.size() <= 1)
+      continue;
+    ++Count;
+    outs() << formatv("Hash: {0}, Count: {1} records\n", H.first,
+                      H.second.size());
+    for (const auto &R : H.second) {
+      auto Iter = AdjusterSet.find(R.TI.getIndex());
+      StringRef Prefix;
+      if (Iter != AdjusterSet.end()) {
+        Prefix = "[HEAD]";
+        AdjusterSet.erase(Iter);
+      }
+      StringRef LeafName = getLeafTypeName(R.Record.Type);
+      uint32_t TI = R.TI.getIndex();
+      StringRef TypeName = TypeDB.getTypeName(R.TI);
+      outs() << formatv("{0,-6} {1} ({2:x}) {3}\n", Prefix, LeafName, TI,
+                        TypeName);
+    }
+  }
+
+  outs() << "\n";
+  outs() << "Dumping hash adjustment chains\n";
+  for (const auto &A : Tpi->getHashAdjusters()) {
+    TypeIndex TI(A.second);
+    StringRef TypeName = TypeDB.getTypeName(TI);
+    const CVType &HeadRecord = TypeDB.getTypeRecord(TI);
+    assert(HeadRecord.Hash.hasValue());
+
+    auto CollisionsIter = Hasher.Lookup.find(*HeadRecord.Hash);
+    if (CollisionsIter == Hasher.Lookup.end())
+      continue;
+
+    const auto &Collisions = CollisionsIter->second;
+    outs() << TypeName << "\n";
+    outs() << formatv("    [HEAD] {0:x} {1} {2}\n", A.second,
+                      getLeafTypeName(HeadRecord.Type), TypeName);
+    for (const auto &Chain : Collisions) {
+      if (Chain.TI == TI)
+        continue;
+      const CVType &TailRecord = TypeDB.getTypeRecord(Chain.TI);
+      outs() << formatv("           {0:x} {1} {2}\n", Chain.TI.getIndex(),
+                        getLeafTypeName(TailRecord.Type),
+                        TypeDB.getTypeName(Chain.TI));
+    }
+  }
+  outs() << formatv("There are {0} orphaned hash adjusters\n",
+                    AdjusterSet.size());
+  for (const auto &Adj : AdjusterSet) {
+    outs() << formatv("    {0}\n", Adj);
+  }
+
+  uint32_t DistinctHashValues = Hasher.Lookup.size();
+  outs() << formatv("{0}/{1} hash collisions", Count, DistinctHashValues);
+  return Error::success();
+}
diff --git a/tools/llvm-pdbdump/Analyze.h b/tools/llvm-pdbdump/Analyze.h
new file mode 100644
index 0000000000000000000000000000000000000000..7230ae45b0c8cef3d06a41494b8e49e70be69eb8
--- /dev/null
+++ b/tools/llvm-pdbdump/Analyze.h
@@ -0,0 +1,30 @@
+//===- Analyze.h - PDB analysis functions -----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVMPDBDUMP_ANALYSIS_H
+#define LLVM_TOOLS_LLVMPDBDUMP_ANALYSIS_H
+
+#include "OutputStyle.h"
+
+namespace llvm {
+namespace pdb {
+class PDBFile;
+class AnalysisStyle : public OutputStyle {
+public:
+  explicit AnalysisStyle(PDBFile &File);
+
+  Error dump() override;
+
+private:
+  PDBFile &File;
+};
+}
+}
+
+#endif
diff --git a/tools/llvm-pdbdump/CMakeLists.txt b/tools/llvm-pdbdump/CMakeLists.txt
index cb6abb1326de73d19aa12ff7ee02ad80438c1e13..900508a02131f040105b738a2fe081d8e75510a7 100644
--- a/tools/llvm-pdbdump/CMakeLists.txt
+++ b/tools/llvm-pdbdump/CMakeLists.txt
@@ -7,6 +7,8 @@ set(LLVM_LINK_COMPONENTS
   )
 
 add_llvm_tool(llvm-pdbdump
+  Analyze.cpp
+  Diff.cpp
   CompactTypeDumpVisitor.cpp
   llvm-pdbdump.cpp
   YamlSymbolDumper.cpp
@@ -23,6 +25,7 @@ add_llvm_tool(llvm-pdbdump
   PrettyTypeDumper.cpp
   PrettyTypedefDumper.cpp
   PrettyVariableDumper.cpp
+  StreamUtil.cpp
   YAMLOutputStyle.cpp
   )
 
diff --git a/tools/llvm-pdbdump/Diff.cpp b/tools/llvm-pdbdump/Diff.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..8c02d36044d82b038e7a33ac1248b83ba911ac7b
--- /dev/null
+++ b/tools/llvm-pdbdump/Diff.cpp
@@ -0,0 +1,523 @@
+//===- Diff.cpp - PDB diff utility ------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Diff.h"
+
+#include "StreamUtil.h"
+#include "llvm-pdbdump.h"
+
+#include "llvm/DebugInfo/PDB/Native/Formatters.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/RawConstants.h"
+#include "llvm/DebugInfo/PDB/Native/StringTable.h"
+
+#include "llvm/Support/FormatAdapters.h"
+#include "llvm/Support/FormatProviders.h"
+#include "llvm/Support/FormatVariadic.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+namespace llvm {
+template <> struct format_provider<PdbRaw_FeatureSig> {
+  static void format(const PdbRaw_FeatureSig &Sig, raw_ostream &Stream,
+                     StringRef Style) {
+    switch (Sig) {
+    case PdbRaw_FeatureSig::MinimalDebugInfo:
+      Stream << "MinimalDebugInfo";
+      break;
+    case PdbRaw_FeatureSig::NoTypeMerge:
+      Stream << "NoTypeMerge";
+      break;
+    case PdbRaw_FeatureSig::VC110:
+      Stream << "VC110";
+      break;
+    case PdbRaw_FeatureSig::VC140:
+      Stream << "VC140";
+      break;
+    }
+  }
+};
+}
+
+template <typename R> using ValueOfRange = llvm::detail::ValueOfRange<R>;
+
+template <typename Range, typename Comp>
+static void set_differences(Range &&R1, Range &&R2,
+                            SmallVectorImpl<ValueOfRange<Range>> *OnlyLeft,
+                            SmallVectorImpl<ValueOfRange<Range>> *OnlyRight,
+                            SmallVectorImpl<ValueOfRange<Range>> *Intersection,
+                            Comp Comparator) {
+  // Sorts both inputs in place, then fills whichever outputs are non-null.
+  std::sort(R1.begin(), R1.end(), Comparator);
+  std::sort(R2.begin(), R2.end(), Comparator);
+
+  if (OnlyLeft) {
+    OnlyLeft->reserve(R1.size());
+    auto End = std::set_difference(R1.begin(), R1.end(), R2.begin(), R2.end(),
+                                   OnlyLeft->begin(), Comparator);
+    OnlyLeft->set_size(std::distance(OnlyLeft->begin(), End));
+  }
+  if (OnlyRight) {
+    OnlyRight->reserve(R2.size());
+    auto End = std::set_difference(R2.begin(), R2.end(), R1.begin(), R1.end(),
+                                   OnlyRight->begin(), Comparator);
+    OnlyRight->set_size(std::distance(OnlyRight->begin(), End));
+  }
+  if (Intersection) {
+    Intersection->reserve(std::min(R1.size(), R2.size()));
+    auto End = std::set_intersection(R1.begin(), R1.end(), R2.begin(), R2.end(),
+                                     Intersection->begin(), Comparator);
+    Intersection->set_size(std::distance(Intersection->begin(), End));
+  }
+}
+
+template <typename Range>
+static void
+set_differences(Range &&R1, Range &&R2,
+                SmallVectorImpl<ValueOfRange<Range>> *OnlyLeft,
+                SmallVectorImpl<ValueOfRange<Range>> *OnlyRight,
+                SmallVectorImpl<ValueOfRange<Range>> *Intersection = nullptr) {
+  std::less<ValueOfRange<Range>> Comp;
+  set_differences(std::forward<Range>(R1), std::forward<Range>(R2), OnlyLeft,
+                  OnlyRight, Intersection, Comp);
+}
+
+DiffStyle::DiffStyle(PDBFile &File1, PDBFile &File2)
+    : File1(File1), File2(File2) {}
+
+Error DiffStyle::dump() {
+  if (auto EC = diffSuperBlock())
+    return EC;
+
+  if (auto EC = diffFreePageMap())
+    return EC;
+
+  if (auto EC = diffStreamDirectory())
+    return EC;
+
+  if (auto EC = diffStringTable())
+    return EC;
+
+  if (auto EC = diffInfoStream())
+    return EC;
+
+  if (auto EC = diffDbiStream())
+    return EC;
+
+  if (auto EC = diffSectionContribs())
+    return EC;
+
+  if (auto EC = diffSectionMap())
+    return EC;
+
+  if (auto EC = diffFpoStream())
+    return EC;
+
+  if (auto EC = diffTpiStream(StreamTPI))
+    return EC;
+
+  if (auto EC = diffTpiStream(StreamIPI))
+    return EC;
+
+  if (auto EC = diffPublics())
+    return EC;
+
+  if (auto EC = diffGlobals())
+    return EC;
+
+  return Error::success();
+}
+
+template <typename T>
+static bool diffAndPrint(StringRef Label, PDBFile &File1, PDBFile &File2, T V1,
+                         T V2) {
+  if (V1 == V2) {
+    outs() << formatv("  {0}: No differences detected!\n", Label);
+    return false;
+  }
+
+  outs().indent(2) << Label << "\n";
+  outs().indent(4) << formatv("{0}: {1}\n", File1.getFilePath(), V1);
+  outs().indent(4) << formatv("{0}: {1}\n", File2.getFilePath(), V2);
+  return true;
+}
+
+template <typename T>
+static bool diffAndPrint(StringRef Label, PDBFile &File1, PDBFile &File2,
+                         ArrayRef<T> V1, ArrayRef<T> V2) {
+  if (V1 == V2) {
+    outs() << formatv("  {0}: No differences detected!\n", Label);
+    return false;
+  }
+
+  outs().indent(2) << Label << "\n";
+  outs().indent(4) << formatv("{0}: {1}\n", File1.getFilePath(),
+                              make_range(V1.begin(), V1.end()));
+  outs().indent(4) << formatv("{0}: {1}\n", File2.getFilePath(),
+                              make_range(V2.begin(), V2.end()));
+  return true;
+}
+
+// Prints the items that appear in only one of the two files, each group under
+// a header naming the file it came from.  Returns true if either range is
+// non-empty.
+template <typename T>
+static bool printSymmetricDifferences(PDBFile &File1, PDBFile &File2,
+                                      T &&OnlyRange1, T &&OnlyRange2,
+                                      StringRef Label) {
+  if (!OnlyRange1.empty()) {
+    outs() << formatv("  {0} {1}(s) only in ({2})\n", OnlyRange1.size(), Label,
+                      File1.getFilePath());
+    for (const auto &Item : OnlyRange1)
+      outs() << formatv("    {0}\n", Item);
+  }
+  if (!OnlyRange2.empty()) {
+    outs() << formatv("  {0} {1}(s) only in ({2})\n", OnlyRange2.size(), Label,
+                      File2.getFilePath());
+    for (const auto &Item : OnlyRange2)
+      outs() << formatv("    {0}\n", Item);
+  }
+  return !OnlyRange1.empty() || !OnlyRange2.empty();
+}
+
+Error DiffStyle::diffSuperBlock() {
+  outs() << "MSF Super Block: Searching for differences...\n";
+  bool Diffs = false;
+
+  Diffs |= diffAndPrint("Block Size", File1, File2, File1.getBlockSize(),
+                        File2.getBlockSize());
+  Diffs |= diffAndPrint("Block Count", File1, File2, File1.getBlockCount(),
+                        File2.getBlockCount());
+  Diffs |= diffAndPrint("Unknown 1", File1, File2, File1.getUnknown1(),
+                        File2.getUnknown1());
+
+  if (opts::diff::Pedantic) {
+    Diffs |= diffAndPrint("Free Block Map", File1, File2,
+                          File1.getFreeBlockMapBlock(),
+                          File2.getFreeBlockMapBlock());
+    Diffs |= diffAndPrint("Directory Size", File1, File2,
+                          File1.getNumDirectoryBytes(),
+                          File2.getNumDirectoryBytes());
+    Diffs |= diffAndPrint("Block Map Addr", File1, File2,
+                          File1.getBlockMapOffset(), File2.getBlockMapOffset());
+  }
+  if (!Diffs)
+    outs() << "MSF Super Block: No differences detected...\n";
+  return Error::success();
+}
+
+// Compares the stream directories of the two files: index-by-index in pedantic
+// mode, otherwise by stream purpose name regardless of stream index.
+Error DiffStyle::diffStreamDirectory() {
+  SmallVector<std::string, 32> P;
+  SmallVector<std::string, 32> Q;
+  discoverStreamPurposes(File1, P);
+  discoverStreamPurposes(File2, Q);
+  outs() << "Stream Directory: Searching for differences...\n";
+
+  bool HasDifferences = false;
+  if (opts::diff::Pedantic) {
+    size_t Min = std::min(P.size(), Q.size());
+    for (size_t I = 0; I < Min; ++I) {
+      StringRef Names[] = {P[I], Q[I]};
+      uint32_t Sizes[] = {File1.getStreamByteSize(I),
+                          File2.getStreamByteSize(I)};
+      bool NamesDiffer = Names[0] != Names[1];
+      bool SizesDiffer = Sizes[0] != Sizes[1];
+      if (NamesDiffer) {
+        HasDifferences = true;
+        outs().indent(2) << formatv("Stream {0} - {1}: {2}, {3}: {4}\n", I,
+                                    File1.getFilePath(), Names[0],
+                                    File2.getFilePath(), Names[1]);
+        continue;
+      }
+      if (SizesDiffer) {
+        HasDifferences = true;
+        outs().indent(2) << formatv(
+            "Stream {0} ({1}): {2}: {3} bytes, {4}: {5} bytes\n", I, Names[0],
+            File1.getFilePath(), Sizes[0], File2.getFilePath(), Sizes[1]);
+        continue;
+      }
+    }
+
+    ArrayRef<std::string> MaxNames = (P.size() > Q.size() ? P : Q);
+    size_t Max = std::max(P.size(), Q.size());
+    PDBFile &MaxFile = (P.size() > Q.size() ? File1 : File2);
+    StringRef MinFileName =
+        (P.size() < Q.size() ? File1.getFilePath() : File2.getFilePath());
+    for (size_t I = Min; I < Max; ++I) {
+      HasDifferences = true;
+      StringRef StreamName = MaxNames[I];
+
+      outs().indent(2) << formatv(
+          "Stream {0} - {1}: <not present>, {2}: Index {3}, {4} bytes\n",
+          StreamName, MinFileName, MaxFile.getFilePath(), I,
+          MaxFile.getStreamByteSize(I));
+    }
+    if (!HasDifferences)
+      outs() << "Stream Directory: No differences detected...\n";
+  } else {
+    auto PI = to_vector<32>(enumerate(P));
+    auto QI = to_vector<32>(enumerate(Q));
+
+    typedef decltype(PI) ContainerType;
+    typedef typename ContainerType::value_type value_type;
+
+    auto Comparator = [](const value_type &I1, const value_type &I2) {
+      return I1.value() < I2.value();
+    };
+
+    ContainerType OnlyP, OnlyQ, Common;
+
+    set_differences(PI, QI, &OnlyP, &OnlyQ, &Common, Comparator);
+
+    if (!OnlyP.empty()) {
+      HasDifferences = true;
+      outs().indent(2) << formatv("{0} Stream(s) only in ({1})\n", OnlyP.size(),
+                                  File1.getFilePath());
+      for (auto &Item : OnlyP) {
+        outs().indent(4) << formatv("Stream {0} - {1}\n", Item.index(),
+                                    Item.value());
+      }
+    }
+
+    if (!OnlyQ.empty()) {
+      HasDifferences = true;
+      outs().indent(2) << formatv("{0} Stream(s) only in ({1})\n", OnlyQ.size(),
+                                  File2.getFilePath());
+      for (auto &Item : OnlyQ) {
+        outs().indent(4) << formatv("Stream {0} - {1}\n", Item.index(),
+                                    Item.value());
+      }
+    }
+    if (!Common.empty()) {
+      outs().indent(2) << formatv("Found {0} common streams.  Searching for "
+                                  "intra-stream differences.\n",
+                                  Common.size());
+      bool HasCommonDifferences = false;
+      for (const auto &Left : Common) {
+        // Left was copied from the first range so its index refers to a stream
+        // index in the first file.  Find the corresponding stream index in the
+        // second file.
+        auto Range =
+            std::equal_range(QI.begin(), QI.end(), Left,
+                             [](const value_type &L, const value_type &R) {
+                               return L.value() < R.value();
+                             });
+        const auto &Right = *Range.first;
+        assert(Left.value() == Right.value());
+        uint32_t LeftSize = File1.getStreamByteSize(Left.index());
+        uint32_t RightSize = File2.getStreamByteSize(Right.index());
+        if (LeftSize != RightSize) {
+          HasDifferences = true;
+          HasCommonDifferences = true;
+          outs().indent(4) << formatv("{0} ({1}: {2} bytes, {3}: {4} bytes)\n",
+                                      Left.value(), File1.getFilePath(),
+                                      LeftSize, File2.getFilePath(), RightSize);
+        }
+      }
+      if (!HasCommonDifferences)
+        outs().indent(2) << "Common Streams:  No differences detected!\n";
+    }
+    if (!HasDifferences)
+      outs() << "Stream Directory: No differences detected!\n";
+  }
+
+  return Error::success();
+}
+
+// Compares the string tables of the two files and prints what differs.
+Error DiffStyle::diffStringTable() {
+  auto ExpectedST1 = File1.getStringTable();
+  auto ExpectedST2 = File2.getStringTable();
+  outs() << "String Table: Searching for differences...\n";
+  bool Has1 = !!ExpectedST1;
+  bool Has2 = !!ExpectedST2;
+  if (!(Has1 && Has2)) {
+    // If one has a string table and the other doesn't, we can print less
+    // output.
+    if (Has1 != Has2) {
+      if (Has1) {
+        outs() << formatv("  {0}: ({1} strings)\n", File1.getFilePath(),
+                          ExpectedST1->getNameCount());
+        outs() << formatv("  {0}: (string table not present)\n",
+                          File2.getFilePath());
+      } else {
+        outs() << formatv("  {0}: (string table not present)\n",
+                          File1.getFilePath());
+        outs() << formatv("  {0}: ({1} strings)\n", File2.getFilePath(),
+                          ExpectedST2->getNameCount());
+      }
+    }
+    consumeError(ExpectedST1.takeError());
+    consumeError(ExpectedST2.takeError());
+    return Error::success();
+  }
+
+  bool HasDiff = false;
+  auto &ST1 = *ExpectedST1;
+  auto &ST2 = *ExpectedST2;
+
+  if (ST1.getByteSize() != ST2.getByteSize()) {
+    outs() << "  Stream Size\n";
+    outs() << formatv("    {0} - {1} byte(s)\n", File1.getFilePath(),
+                      ST1.getByteSize());
+    outs() << formatv("    {0} - {1} byte(s)\n", File2.getFilePath(),
+                      ST2.getByteSize());
+    outs() << formatv("    Difference: {0} bytes\n",
+                      AbsoluteDifference(ST1.getByteSize(), ST2.getByteSize()));
+    HasDiff = true;
+  }
+  HasDiff |= diffAndPrint("Hash Version", File1, File2, ST1.getHashVersion(),
+                          ST2.getHashVersion());
+  HasDiff |= diffAndPrint("Signature", File1, File2, ST1.getSignature(),
+                          ST2.getSignature());
+
+  // Both have a valid string table, dive in and compare individual strings.
+
+  auto IdList1 = ST1.name_ids();
+  auto IdList2 = ST2.name_ids();
+  if (opts::diff::Pedantic) {
+    // In pedantic mode, we compare index by index (i.e. the strings are in the
+    // same order in both tables).
+    uint32_t Max = std::max(IdList1.size(), IdList2.size());
+    for (uint32_t I = 0; I < Max; ++I) {
+      Optional<uint32_t> Id1, Id2;
+      StringRef S1, S2;
+      if (I < IdList1.size()) {
+        Id1 = IdList1[I];
+        S1 = ST1.getStringForID(*Id1);
+      }
+      if (I < IdList2.size()) {
+        Id2 = IdList2[I];
+        S2 = ST2.getStringForID(*Id2);
+      }
+      if (Id1 == Id2 && S1 == S2)
+        continue;
+
+      std::string OutId1 =
+          Id1 ? formatv("{0}", *Id1).str() : "(index not present)";
+      std::string OutId2 =
+          Id2 ? formatv("{0}", *Id2).str() : "(index not present)";
+      outs() << formatv("  String {0}\n", I);
+      outs() << formatv("    {0}: Hash - {1}, Value - {2}\n",
+                        File1.getFilePath(), OutId1, S1);
+      outs() << formatv("    {0}: Hash - {1}, Value - {2}\n",
+                        File2.getFilePath(), OutId2, S2);
+      HasDiff = true;
+    }
+  } else {
+    std::vector<StringRef> Strings1, Strings2;
+    Strings1.reserve(IdList1.size());
+    Strings2.reserve(IdList2.size());
+    for (auto ID : IdList1)
+      Strings1.push_back(ST1.getStringForID(ID));
+    for (auto ID : IdList2)
+      Strings2.push_back(ST2.getStringForID(ID));
+
+    SmallVector<StringRef, 64> OnlyP, OnlyQ;
+    auto End1 = std::remove(Strings1.begin(), Strings1.end(), "");
+    auto End2 = std::remove(Strings2.begin(), Strings2.end(), "");
+    uint32_t Empty1 = std::distance(End1, Strings1.end());
+    uint32_t Empty2 = std::distance(End2, Strings2.end());
+    Strings1.erase(End1, Strings1.end());
+    Strings2.erase(End2, Strings2.end());
+    set_differences(Strings1, Strings2, &OnlyP, &OnlyQ);
+    HasDiff |= printSymmetricDifferences(File1, File2, OnlyP, OnlyQ, "String");
+
+    if (Empty1 != Empty2) {
+      HasDiff = true;
+      PDBFile &MoreF = (Empty1 > Empty2) ? File1 : File2;
+      PDBFile &LessF = (Empty1 < Empty2) ? File1 : File2;
+      uint32_t Difference = AbsoluteDifference(Empty1, Empty2);
+      outs() << formatv("  {0} had {1} more empty strings than {2}\n",
+                        MoreF.getFilePath(), Difference, LessF.getFilePath());
+    }
+  }
+  if (!HasDiff)
+    outs() << "String Table: No differences detected!\n";
+  return Error::success();
+}
+
+Error DiffStyle::diffFreePageMap() { return Error::success(); }
+
+// Compares the PDB info streams of the two files and prints what differs.
+Error DiffStyle::diffInfoStream() {
+  auto ExpectedInfo1 = File1.getPDBInfoStream();
+  auto ExpectedInfo2 = File2.getPDBInfoStream();
+
+  outs() << "PDB Stream: Searching for differences...\n";
+  bool Has1 = !!ExpectedInfo1;
+  bool Has2 = !!ExpectedInfo2;
+  if (!(Has1 && Has2)) {
+    if (Has1 != Has2)
+      outs() << formatv("{0} does not have a PDB Stream!\n",
+                        Has1 ? File2.getFilePath() : File1.getFilePath());
+    consumeError(ExpectedInfo1.takeError());
+    consumeError(ExpectedInfo2.takeError());
+    return Error::success();
+  }
+
+  bool HasDiff = false;
+  auto &IS1 = *ExpectedInfo1;
+  auto &IS2 = *ExpectedInfo2;
+  if (IS1.getStreamSize() != IS2.getStreamSize()) {
+    outs() << "  Stream Size\n";
+    outs() << formatv("    {0} - {1} byte(s)\n", File1.getFilePath(),
+                      IS1.getStreamSize());
+    outs() << formatv("    {0} - {1} byte(s)\n", File2.getFilePath(),
+                      IS2.getStreamSize());
+    outs() << formatv(
+        "    Difference: {0} bytes\n",
+        AbsoluteDifference(IS1.getStreamSize(), IS2.getStreamSize()));
+    HasDiff = true;
+  }
+  HasDiff |= diffAndPrint("Age", File1, File2, IS1.getAge(), IS2.getAge());
+  HasDiff |= diffAndPrint("Guid", File1, File2, IS1.getGuid(), IS2.getGuid());
+  HasDiff |= diffAndPrint("Signature", File1, File2, IS1.getSignature(),
+                          IS2.getSignature());
+  HasDiff |=
+      diffAndPrint("Version", File1, File2, IS1.getVersion(), IS2.getVersion());
+  HasDiff |= diffAndPrint("Features", File1, File2, IS1.getFeatureSignatures(),
+                          IS2.getFeatureSignatures());
+  HasDiff |= diffAndPrint("Named Stream Byte Size", File1, File2,
+                          IS1.getNamedStreamMapByteSize(),
+                          IS2.getNamedStreamMapByteSize());
+  SmallVector<StringRef, 4> NS1, NS2;
+  for (const auto &X : IS1.getNamedStreams().entries())
+    NS1.push_back(X.getKey());
+  for (const auto &X : IS2.getNamedStreams().entries())
+    NS2.push_back(X.getKey());
+  SmallVector<StringRef, 4> OnlyP, OnlyQ;
+  set_differences(NS1, NS2, &OnlyP, &OnlyQ);
+  HasDiff |=
+      printSymmetricDifferences(File1, File2, OnlyP, OnlyQ, "Named Streams");
+  if (!HasDiff)
+    outs() << "PDB Stream: No differences detected!\n";
+
+  return Error::success();
+}
+
+Error DiffStyle::diffDbiStream() { return Error::success(); }
+
+Error DiffStyle::diffSectionContribs() { return Error::success(); }
+
+Error DiffStyle::diffSectionMap() { return Error::success(); }
+
+Error DiffStyle::diffFpoStream() { return Error::success(); }
+
+Error DiffStyle::diffTpiStream(int Index) { return Error::success(); }
+
+Error DiffStyle::diffModuleInfoStream(int Index) { return Error::success(); }
+
+Error DiffStyle::diffPublics() { return Error::success(); }
+
+Error DiffStyle::diffGlobals() { return Error::success(); }
diff --git a/tools/llvm-pdbdump/Diff.h b/tools/llvm-pdbdump/Diff.h
new file mode 100644
index 0000000000000000000000000000000000000000..6037576e21bb66912f0f4f658ab16acd4e17a264
--- /dev/null
+++ b/tools/llvm-pdbdump/Diff.h
@@ -0,0 +1,45 @@
+//===- Diff.h - PDB diff utility --------------------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVMPDBDUMP_DIFF_H
+#define LLVM_TOOLS_LLVMPDBDUMP_DIFF_H
+
+#include "OutputStyle.h"
+
+namespace llvm {
+namespace pdb {
+class PDBFile;
+class DiffStyle : public OutputStyle {
+public:
+  explicit DiffStyle(PDBFile &File1, PDBFile &File2);
+
+  Error dump() override;
+
+private:
+  Error diffSuperBlock();
+  Error diffStreamDirectory();
+  Error diffStringTable();
+  Error diffFreePageMap();
+  Error diffInfoStream();
+  Error diffDbiStream();
+  Error diffSectionContribs();
+  Error diffSectionMap();
+  Error diffFpoStream();
+  Error diffTpiStream(int Index);
+  Error diffModuleInfoStream(int Index);
+  Error diffPublics();
+  Error diffGlobals();
+
+  PDBFile &File1;
+  PDBFile &File2;
+};
+}
+}
+
+#endif
diff --git a/tools/llvm-pdbdump/LLVMOutputStyle.cpp b/tools/llvm-pdbdump/LLVMOutputStyle.cpp
index 2171bd48bced5c91d4ba6d8c0c74fb07ac667744..8348751703f14bdfd3b350c557c00f4dbdbb9ef1 100644
--- a/tools/llvm-pdbdump/LLVMOutputStyle.cpp
+++ b/tools/llvm-pdbdump/LLVMOutputStyle.cpp
@@ -10,6 +10,7 @@
 #include "LLVMOutputStyle.h"
 
 #include "CompactTypeDumpVisitor.h"
+#include "StreamUtil.h"
 #include "llvm-pdbdump.h"
 
 #include "llvm/DebugInfo/CodeView/CVTypeDumper.h"
@@ -22,7 +23,6 @@
 #include "llvm/DebugInfo/CodeView/TypeDumpVisitor.h"
 #include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
 #include "llvm/DebugInfo/MSF/MappedBlockStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
 #include "llvm/DebugInfo/PDB/Native/DbiStream.h"
 #include "llvm/DebugInfo/PDB/Native/EnumTables.h"
 #include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
@@ -36,6 +36,7 @@
 #include "llvm/DebugInfo/PDB/Native/TpiStream.h"
 #include "llvm/DebugInfo/PDB/PDBExtras.h"
 #include "llvm/Object/COFF.h"
+#include "llvm/Support/BinaryStreamReader.h"
 #include "llvm/Support/FormatVariadic.h"
 
 #include <unordered_map>
@@ -172,124 +173,12 @@ Error LLVMOutputStyle::dumpFileHeaders() {
   return Error::success();
 }
 
-void LLVMOutputStyle::discoverStreamPurposes() {
-  if (!StreamPurposes.empty())
-    return;
-
-  // It's OK if we fail to load some of these streams, we still attempt to print
-  // what we can.
-  auto Dbi = File.getPDBDbiStream();
-  auto Tpi = File.getPDBTpiStream();
-  auto Ipi = File.getPDBIpiStream();
-  auto Info = File.getPDBInfoStream();
-
-  uint32_t StreamCount = File.getNumStreams();
-  std::unordered_map<uint16_t, const ModuleInfoEx *> ModStreams;
-  std::unordered_map<uint16_t, std::string> NamedStreams;
-
-  if (Dbi) {
-    for (auto &ModI : Dbi->modules()) {
-      uint16_t SN = ModI.Info.getModuleStreamIndex();
-      ModStreams[SN] = &ModI;
-    }
-  }
-  if (Info) {
-    for (auto &NSE : Info->named_streams()) {
-      NamedStreams[NSE.second] = NSE.first();
-    }
-  }
-
-  StreamPurposes.resize(StreamCount);
-  for (uint16_t StreamIdx = 0; StreamIdx < StreamCount; ++StreamIdx) {
-    std::string Value;
-    if (StreamIdx == OldMSFDirectory)
-      Value = "Old MSF Directory";
-    else if (StreamIdx == StreamPDB)
-      Value = "PDB Stream";
-    else if (StreamIdx == StreamDBI)
-      Value = "DBI Stream";
-    else if (StreamIdx == StreamTPI)
-      Value = "TPI Stream";
-    else if (StreamIdx == StreamIPI)
-      Value = "IPI Stream";
-    else if (Dbi && StreamIdx == Dbi->getGlobalSymbolStreamIndex())
-      Value = "Global Symbol Hash";
-    else if (Dbi && StreamIdx == Dbi->getPublicSymbolStreamIndex())
-      Value = "Public Symbol Hash";
-    else if (Dbi && StreamIdx == Dbi->getSymRecordStreamIndex())
-      Value = "Public Symbol Records";
-    else if (Tpi && StreamIdx == Tpi->getTypeHashStreamIndex())
-      Value = "TPI Hash";
-    else if (Tpi && StreamIdx == Tpi->getTypeHashStreamAuxIndex())
-      Value = "TPI Aux Hash";
-    else if (Ipi && StreamIdx == Ipi->getTypeHashStreamIndex())
-      Value = "IPI Hash";
-    else if (Ipi && StreamIdx == Ipi->getTypeHashStreamAuxIndex())
-      Value = "IPI Aux Hash";
-    else if (Dbi &&
-             StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::Exception))
-      Value = "Exception Data";
-    else if (Dbi && StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::Fixup))
-      Value = "Fixup Data";
-    else if (Dbi && StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::FPO))
-      Value = "FPO Data";
-    else if (Dbi &&
-             StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::NewFPO))
-      Value = "New FPO Data";
-    else if (Dbi &&
-             StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::OmapFromSrc))
-      Value = "Omap From Source Data";
-    else if (Dbi &&
-             StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::OmapToSrc))
-      Value = "Omap To Source Data";
-    else if (Dbi && StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::Pdata))
-      Value = "Pdata";
-    else if (Dbi &&
-             StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::SectionHdr))
-      Value = "Section Header Data";
-    else if (Dbi &&
-             StreamIdx ==
-                 Dbi->getDebugStreamIndex(DbgHeaderType::SectionHdrOrig))
-      Value = "Section Header Original Data";
-    else if (Dbi &&
-             StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::TokenRidMap))
-      Value = "Token Rid Data";
-    else if (Dbi && StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::Xdata))
-      Value = "Xdata";
-    else {
-      auto ModIter = ModStreams.find(StreamIdx);
-      auto NSIter = NamedStreams.find(StreamIdx);
-      if (ModIter != ModStreams.end()) {
-        Value = "Module \"";
-        Value += ModIter->second->Info.getModuleName().str();
-        Value += "\"";
-      } else if (NSIter != NamedStreams.end()) {
-        Value = "Named Stream \"";
-        Value += NSIter->second;
-        Value += "\"";
-      } else {
-        Value = "???";
-      }
-    }
-    StreamPurposes[StreamIdx] = Value;
-  }
-
-  // Consume errors from missing streams.
-  if (!Dbi)
-    consumeError(Dbi.takeError());
-  if (!Tpi)
-    consumeError(Tpi.takeError());
-  if (!Ipi)
-    consumeError(Ipi.takeError());
-  if (!Info)
-    consumeError(Info.takeError());
-}
-
 Error LLVMOutputStyle::dumpStreamSummary() {
   if (!opts::raw::DumpStreamSummary)
     return Error::success();
 
-  discoverStreamPurposes();
+  if (StreamPurposes.empty())
+    discoverStreamPurposes(File, StreamPurposes);
 
   uint32_t StreamCount = File.getNumStreams();
 
@@ -431,7 +320,8 @@ Error LLVMOutputStyle::dumpStreamBytes() {
   if (opts::raw::DumpStreamData.empty())
     return Error::success();
 
-  discoverStreamPurposes();
+  if (StreamPurposes.empty())
+    discoverStreamPurposes(File, StreamPurposes);
 
   DictScope D(P, "Stream Data");
   for (uint32_t SI : opts::raw::DumpStreamData) {
@@ -450,7 +340,7 @@ Error LLVMOutputStyle::dumpStreamBytes() {
     auto Blocks = File.getMsfLayout().StreamMap[SI];
     P.printList("Blocks", Blocks);
 
-    StreamReader R(*S);
+    BinaryStreamReader R(*S);
     ArrayRef<uint8_t> StreamData;
     if (auto EC = R.readBytes(StreamData, S->getLength()))
       return EC;
@@ -497,6 +387,7 @@ Error LLVMOutputStyle::dumpInfoStream() {
   P.printHex("Signature", IS->getSignature());
   P.printNumber("Age", IS->getAge());
   P.printObject("Guid", IS->getGuid());
+  P.printHex("Features", IS->getFeatures());
   {
     DictScope DD(P, "Named Streams");
     for (const auto &S : IS->getNamedStreams().entries())
@@ -568,9 +459,13 @@ Error LLVMOutputStyle::dumpTpiStream(uint32_t StreamIdx) {
     P.printNumber("Record count", Tpi->NumTypeRecords());
   }
 
-  TypeDatabaseVisitor DBV(TypeDB);
-  CompactTypeDumpVisitor CTDV(TypeDB, &P);
+  TypeDatabase &StreamDB = (StreamIdx == StreamTPI) ? TypeDB : ItemDB;
+
+  TypeDatabaseVisitor DBV(StreamDB);
+  CompactTypeDumpVisitor CTDV(StreamDB, &P);
   TypeDumpVisitor TDV(TypeDB, &P, false);
+  if (StreamIdx == StreamIPI)
+    TDV.setItemDB(ItemDB);
   RecordBytesVisitor RBV(P);
   TypeDeserializer Deserializer;
 
@@ -745,10 +640,10 @@ Error LLVMOutputStyle::dumpDbiStream() {
           public:
             RecordVisitor(ScopedPrinter &P, PDBFile &F) : P(P), F(F) {}
             Error visitUnknown(ModuleSubstreamKind Kind,
-                               ReadableStreamRef Stream) override {
+                               BinaryStreamRef Stream) override {
               DictScope DD(P, "Unknown");
               ArrayRef<uint8_t> Data;
-              StreamReader R(Stream);
+              BinaryStreamReader R(Stream);
               if (auto EC = R.readBytes(Data, R.bytesRemaining())) {
                 return make_error<RawError>(
                     raw_error_code::corrupt_file,
@@ -758,7 +653,7 @@ Error LLVMOutputStyle::dumpDbiStream() {
               return Error::success();
             }
             Error
-            visitFileChecksums(ReadableStreamRef Data,
+            visitFileChecksums(BinaryStreamRef Data,
                                const FileChecksumArray &Checksums) override {
               DictScope DD(P, "FileChecksums");
               for (const auto &C : Checksums) {
@@ -774,7 +669,7 @@ Error LLVMOutputStyle::dumpDbiStream() {
               return Error::success();
             }
 
-            Error visitLines(ReadableStreamRef Data,
+            Error visitLines(BinaryStreamRef Data,
                              const LineSubstreamHeader *Header,
                              const LineInfoArray &Lines) override {
               DictScope DD(P, "Lines");
diff --git a/tools/llvm-pdbdump/LLVMOutputStyle.h b/tools/llvm-pdbdump/LLVMOutputStyle.h
index 4aef78d3253f8725c83d80d108510d40c8638154..bfff3b8308db93abf2b919cc2ab965d930b15281 100644
--- a/tools/llvm-pdbdump/LLVMOutputStyle.h
+++ b/tools/llvm-pdbdump/LLVMOutputStyle.h
@@ -12,9 +12,12 @@
 
 #include "OutputStyle.h"
 
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/DebugInfo/CodeView/TypeDatabase.h"
 #include "llvm/Support/ScopedPrinter.h"
 
+#include <string>
+
 namespace llvm {
 class BitVector;
 namespace pdb {
@@ -25,8 +28,6 @@ public:
   Error dump() override;
 
 private:
-  void discoverStreamPurposes();
-
   Error dumpFileHeaders();
   Error dumpStreamSummary();
   Error dumpFreePageMap();
@@ -51,7 +52,8 @@ private:
   PDBFile &File;
   ScopedPrinter P;
   codeview::TypeDatabase TypeDB;
-  std::vector<std::string> StreamPurposes;
+  codeview::TypeDatabase ItemDB;
+  SmallVector<std::string, 32> StreamPurposes;
 };
 }
 }
diff --git a/tools/llvm-pdbdump/LinePrinter.cpp b/tools/llvm-pdbdump/LinePrinter.cpp
index 47c7d3e3c0e74ff24ebb7eb25ee9a554bf63a032..e5dd66fd9aa649673e7dfb5871ee6a90dcff9c93 100644
--- a/tools/llvm-pdbdump/LinePrinter.cpp
+++ b/tools/llvm-pdbdump/LinePrinter.cpp
@@ -42,8 +42,8 @@ bool IsItemExcluded(llvm::StringRef Item,
 
 using namespace llvm;
 
-LinePrinter::LinePrinter(int Indent, llvm::raw_ostream &Stream)
-    : OS(Stream), IndentSpaces(Indent), CurrentIndent(0) {
+LinePrinter::LinePrinter(int Indent, bool UseColor, llvm::raw_ostream &Stream)
+    : OS(Stream), IndentSpaces(Indent), CurrentIndent(0), UseColor(UseColor) {
   SetFilters(ExcludeTypeFilters, opts::pretty::ExcludeTypes.begin(),
              opts::pretty::ExcludeTypes.end());
   SetFilters(ExcludeSymbolFilters, opts::pretty::ExcludeSymbols.begin(),
@@ -83,17 +83,25 @@ bool LinePrinter::IsCompilandExcluded(llvm::StringRef CompilandName) {
                         ExcludeCompilandFilters);
 }
 
-WithColor::WithColor(LinePrinter &P, PDB_ColorItem C) : OS(P.OS) {
-  applyColor(C);
+WithColor::WithColor(LinePrinter &P, PDB_ColorItem C)
+    : OS(P.OS), UseColor(P.hasColor()) {
+  if (UseColor)
+    applyColor(C);
 }
 
-WithColor::~WithColor() { OS.resetColor(); }
+WithColor::~WithColor() {
+  if (UseColor)
+    OS.resetColor();
+}
 
 void WithColor::applyColor(PDB_ColorItem C) {
   switch (C) {
   case PDB_ColorItem::None:
     OS.resetColor();
     return;
+  case PDB_ColorItem::Comment:
+    OS.changeColor(raw_ostream::GREEN, false);
+    return;
   case PDB_ColorItem::Address:
     OS.changeColor(raw_ostream::YELLOW, /*bold=*/true);
     return;
@@ -113,6 +121,7 @@ void WithColor::applyColor(PDB_ColorItem C) {
   case PDB_ColorItem::Path:
     OS.changeColor(raw_ostream::CYAN, false);
     return;
+  case PDB_ColorItem::Padding:
   case PDB_ColorItem::SectionHeader:
     OS.changeColor(raw_ostream::RED, true);
     return;
diff --git a/tools/llvm-pdbdump/LinePrinter.h b/tools/llvm-pdbdump/LinePrinter.h
index a4401f8af9552c9947335b9a5166b383c2316067..8b3d8755ad8c0db400d655a4ae0106203e54ec5a 100644
--- a/tools/llvm-pdbdump/LinePrinter.h
+++ b/tools/llvm-pdbdump/LinePrinter.h
@@ -24,12 +24,13 @@ class LinePrinter {
   friend class WithColor;
 
 public:
-  LinePrinter(int Indent, raw_ostream &Stream);
+  LinePrinter(int Indent, bool UseColor, raw_ostream &Stream);
 
   void Indent();
   void Unindent();
   void NewLine();
 
+  bool hasColor() const { return UseColor; }
   raw_ostream &getStream() { return OS; }
   int getIndentLevel() const { return CurrentIndent; }
 
@@ -48,6 +49,7 @@ private:
   raw_ostream &OS;
   int IndentSpaces;
   int CurrentIndent;
+  bool UseColor;
 
   std::list<Regex> ExcludeCompilandFilters;
   std::list<Regex> ExcludeTypeFilters;
@@ -68,6 +70,8 @@ enum class PDB_ColorItem {
   None,
   Address,
   Type,
+  Comment,
+  Padding,
   Keyword,
   Offset,
   Identifier,
@@ -87,6 +91,7 @@ public:
 private:
   void applyColor(PDB_ColorItem C);
   raw_ostream &OS;
+  bool UseColor;
 };
 }
 }
diff --git a/tools/llvm-pdbdump/PdbYaml.cpp b/tools/llvm-pdbdump/PdbYaml.cpp
index ba2acf7e7f1238556652d6b2bcc3d4526a8e0d3d..e2c4ee967ed36e12bbc5a5804dda6e7287fbb032 100644
--- a/tools/llvm-pdbdump/PdbYaml.cpp
+++ b/tools/llvm-pdbdump/PdbYaml.cpp
@@ -16,6 +16,7 @@
 #include "llvm/DebugInfo/CodeView/CVSymbolVisitor.h"
 #include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
 #include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
+#include "llvm/DebugInfo/CodeView/SymbolSerializer.h"
 #include "llvm/DebugInfo/CodeView/SymbolVisitorCallbackPipeline.h"
 #include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
 #include "llvm/DebugInfo/CodeView/TypeSerializer.h"
@@ -37,6 +38,7 @@ LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::pdb::yaml::PdbDbiModuleInfo)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::pdb::yaml::PdbSymbolRecord)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::pdb::yaml::PdbTpiRecord)
 LLVM_YAML_IS_SEQUENCE_VECTOR(llvm::pdb::yaml::StreamBlockList)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(llvm::pdb::PdbRaw_FeatureSig)
 
 namespace llvm {
 namespace yaml {
@@ -133,26 +135,45 @@ template <> struct ScalarEnumerationTraits<llvm::pdb::PdbRaw_TpiVer> {
     io.enumCase(Value, "VC80", llvm::pdb::PdbRaw_TpiVer::PdbTpiV80);
   }
 };
+
+template <> struct ScalarEnumerationTraits<llvm::pdb::PdbRaw_FeatureSig> {
+  static void enumeration(IO &io, PdbRaw_FeatureSig &Features) {
+    io.enumCase(Features, "MinimalDebugInfo",
+                PdbRaw_FeatureSig::MinimalDebugInfo);
+    io.enumCase(Features, "NoTypeMerge", PdbRaw_FeatureSig::NoTypeMerge);
+    io.enumCase(Features, "VC110", PdbRaw_FeatureSig::VC110);
+    io.enumCase(Features, "VC140", PdbRaw_FeatureSig::VC140);
+  }
+};
 }
 }
 
 void MappingTraits<PdbObject>::mapping(IO &IO, PdbObject &Obj) {
+  // Create a single serialization context that is shared by the DBI, TPI and
+  // IPI stream mappings below.  This is especially important when we are
+  // going from Pdb -> Yaml because we need to maintain state in a
+  // TypeTableBuilder across mappings, and at the end of the entire process,
+  // we need to have one TypeTableBuilder that has every record.  The context
+  // also carries Obj.Allocator, which the symbol record mapping uses.
+  pdb::yaml::SerializationContext Context(IO, Obj.Allocator);
+
+
   IO.mapOptional("MSF", Obj.Headers);
   IO.mapOptional("StreamSizes", Obj.StreamSizes);
   IO.mapOptional("StreamMap", Obj.StreamMap);
   IO.mapOptional("StringTable", Obj.StringTable);
   IO.mapOptional("PdbStream", Obj.PdbStream);
-  IO.mapOptional("DbiStream", Obj.DbiStream);
-  IO.mapOptionalWithContext("TpiStream", Obj.TpiStream, Obj.Allocator);
-  IO.mapOptionalWithContext("IpiStream", Obj.IpiStream, Obj.Allocator);
+  IO.mapOptionalWithContext("DbiStream", Obj.DbiStream, Context);
+  IO.mapOptionalWithContext("TpiStream", Obj.TpiStream, Context);
+  IO.mapOptionalWithContext("IpiStream", Obj.IpiStream, Context);
 }
 
 void MappingTraits<MSFHeaders>::mapping(IO &IO, MSFHeaders &Obj) {
-  IO.mapRequired("SuperBlock", Obj.SuperBlock);
-  IO.mapRequired("NumDirectoryBlocks", Obj.NumDirectoryBlocks);
-  IO.mapRequired("DirectoryBlocks", Obj.DirectoryBlocks);
-  IO.mapRequired("NumStreams", Obj.NumStreams);
-  IO.mapRequired("FileSize", Obj.FileSize);
+  IO.mapOptional("SuperBlock", Obj.SuperBlock);
+  IO.mapOptional("NumDirectoryBlocks", Obj.NumDirectoryBlocks);
+  IO.mapOptional("DirectoryBlocks", Obj.DirectoryBlocks);
+  IO.mapOptional("NumStreams", Obj.NumStreams);
+  IO.mapOptional("FileSize", Obj.FileSize);
 }
 
 void MappingTraits<msf::SuperBlock>::mapping(IO &IO, msf::SuperBlock &SB) {
@@ -160,12 +181,13 @@ void MappingTraits<msf::SuperBlock>::mapping(IO &IO, msf::SuperBlock &SB) {
     ::memcpy(SB.MagicBytes, msf::Magic, sizeof(msf::Magic));
   }
 
-  IO.mapRequired("BlockSize", SB.BlockSize);
-  IO.mapRequired("FreeBlockMap", SB.FreeBlockMapBlock);
-  IO.mapRequired("NumBlocks", SB.NumBlocks);
-  IO.mapRequired("NumDirectoryBytes", SB.NumDirectoryBytes);
-  IO.mapRequired("Unknown1", SB.Unknown1);
-  IO.mapRequired("BlockMapAddr", SB.BlockMapAddr);
+  using u32 = support::ulittle32_t;
+  IO.mapOptional("BlockSize", SB.BlockSize, u32(4096U));
+  IO.mapOptional("FreeBlockMap", SB.FreeBlockMapBlock, u32(0U));
+  IO.mapOptional("NumBlocks", SB.NumBlocks, u32(0U));
+  IO.mapOptional("NumDirectoryBytes", SB.NumDirectoryBytes, u32(0U));
+  IO.mapOptional("Unknown1", SB.Unknown1, u32(0U));
+  IO.mapOptional("BlockMapAddr", SB.BlockMapAddr, u32(0U));
 }
 
 void MappingTraits<StreamBlockList>::mapping(IO &IO, StreamBlockList &SB) {
@@ -173,34 +195,27 @@ void MappingTraits<StreamBlockList>::mapping(IO &IO, StreamBlockList &SB) {
 }
 
 void MappingTraits<PdbInfoStream>::mapping(IO &IO, PdbInfoStream &Obj) {
-  IO.mapRequired("Age", Obj.Age);
-  IO.mapRequired("Guid", Obj.Guid);
-  IO.mapRequired("Signature", Obj.Signature);
-  IO.mapRequired("Version", Obj.Version);
+  IO.mapOptional("Age", Obj.Age, 1U);
+  IO.mapOptional("Guid", Obj.Guid);
+  IO.mapOptional("Signature", Obj.Signature, 0U);
+  IO.mapOptional("Features", Obj.Features);
+  IO.mapOptional("Version", Obj.Version, PdbImplVC70);
 }
 
-void MappingTraits<PdbDbiStream>::mapping(IO &IO, PdbDbiStream &Obj) {
-  IO.mapRequired("VerHeader", Obj.VerHeader);
-  IO.mapRequired("Age", Obj.Age);
-  IO.mapRequired("BuildNumber", Obj.BuildNumber);
-  IO.mapRequired("PdbDllVersion", Obj.PdbDllVersion);
-  IO.mapRequired("PdbDllRbld", Obj.PdbDllRbld);
-  IO.mapRequired("Flags", Obj.Flags);
-  IO.mapRequired("MachineType", Obj.MachineType);
-  IO.mapOptional("Modules", Obj.ModInfos);
+void MappingContextTraits<PdbDbiStream, pdb::yaml::SerializationContext>::mapping(IO &IO, PdbDbiStream &Obj, pdb::yaml::SerializationContext &Context) {
+  IO.mapOptional("VerHeader", Obj.VerHeader, PdbDbiV70);
+  IO.mapOptional("Age", Obj.Age, 1U);
+  IO.mapOptional("BuildNumber", Obj.BuildNumber, uint16_t(0U));
+  IO.mapOptional("PdbDllVersion", Obj.PdbDllVersion, 0U);
+  IO.mapOptional("PdbDllRbld", Obj.PdbDllRbld, uint16_t(0U));
+  IO.mapOptional("Flags", Obj.Flags, uint16_t(1U));
+  IO.mapOptional("MachineType", Obj.MachineType, PDB_Machine::x86);
+  IO.mapOptionalWithContext("Modules", Obj.ModInfos, Context);
 }
 
-void MappingContextTraits<PdbTpiStream, BumpPtrAllocator>::mapping(
-    IO &IO, pdb::yaml::PdbTpiStream &Obj, BumpPtrAllocator &Allocator) {
-  // Create a single serialization context that will be passed through the
-  // entire process of serializing / deserializing a Tpi Stream.  This is
-  // especially important when we are going from Pdb -> Yaml because we need
-  // to maintain state in a TypeTableBuilder across mappings, and at the end of
-  // the entire process, we need to have one TypeTableBuilder that has every
-  // record.
-  pdb::yaml::SerializationContext Context(IO, Allocator);
-
-  IO.mapRequired("Version", Obj.Version);
+void MappingContextTraits<PdbTpiStream, pdb::yaml::SerializationContext>::mapping(
+    IO &IO, pdb::yaml::PdbTpiStream &Obj, pdb::yaml::SerializationContext &Context) {
+  IO.mapOptional("Version", Obj.Version, PdbTpiV80);
   IO.mapRequired("Records", Obj.Records, Context);
 }
 
@@ -210,8 +225,9 @@ void MappingTraits<NamedStreamMapping>::mapping(IO &IO,
   IO.mapRequired("StreamNum", Obj.StreamNumber);
 }
 
-void MappingTraits<PdbSymbolRecord>::mapping(IO &IO, PdbSymbolRecord &Obj) {
+void MappingContextTraits<PdbSymbolRecord, pdb::yaml::SerializationContext>::mapping(IO &IO, PdbSymbolRecord &Obj, pdb::yaml::SerializationContext &Context) {
   codeview::SymbolVisitorCallbackPipeline Pipeline;
+  codeview::SymbolSerializer Serializer(Context.Allocator);
   codeview::SymbolDeserializer Deserializer(nullptr);
   codeview::yaml::YamlSymbolDumper Dumper(IO);
 
@@ -220,23 +236,26 @@ void MappingTraits<PdbSymbolRecord>::mapping(IO &IO, PdbSymbolRecord &Obj) {
     Pipeline.addCallbackToPipeline(Deserializer);
     Pipeline.addCallbackToPipeline(Dumper);
   } else {
-    return;
+    // For the other way around, dump it into a concrete structure, and then
+    // serialize it into the CVRecord.
+    Pipeline.addCallbackToPipeline(Dumper);
+    Pipeline.addCallbackToPipeline(Serializer);
   }
 
   codeview::CVSymbolVisitor Visitor(Pipeline);
   consumeError(Visitor.visitSymbolRecord(Obj.Record));
 }
 
-void MappingTraits<PdbModiStream>::mapping(IO &IO, PdbModiStream &Obj) {
-  IO.mapRequired("Signature", Obj.Signature);
-  IO.mapRequired("Records", Obj.Symbols);
+void MappingContextTraits<PdbModiStream, pdb::yaml::SerializationContext>::mapping(IO &IO, PdbModiStream &Obj, pdb::yaml::SerializationContext &Context) {
+  IO.mapOptional("Signature", Obj.Signature, 4U);
+  IO.mapRequired("Records", Obj.Symbols, Context);
 }
 
-void MappingTraits<PdbDbiModuleInfo>::mapping(IO &IO, PdbDbiModuleInfo &Obj) {
+void MappingContextTraits<PdbDbiModuleInfo, pdb::yaml::SerializationContext>::mapping(IO &IO, PdbDbiModuleInfo &Obj, pdb::yaml::SerializationContext &Context) {
   IO.mapRequired("Module", Obj.Mod);
-  IO.mapRequired("ObjFile", Obj.Obj);
+  IO.mapOptional("ObjFile", Obj.Obj, Obj.Mod);
   IO.mapOptional("SourceFiles", Obj.SourceFiles);
-  IO.mapOptional("Modi", Obj.Modi);
+  IO.mapOptionalWithContext("Modi", Obj.Modi, Context);
 }
 
 void MappingContextTraits<PdbTpiRecord, pdb::yaml::SerializationContext>::
diff --git a/tools/llvm-pdbdump/PdbYaml.h b/tools/llvm-pdbdump/PdbYaml.h
index c5b49522b454f51e1ca0f1ceb0956a03736cda93..2c4cd237f8d7f69187dae24a0741954ee6f9ee35 100644
--- a/tools/llvm-pdbdump/PdbYaml.h
+++ b/tools/llvm-pdbdump/PdbYaml.h
@@ -32,10 +32,10 @@ struct SerializationContext;
 
 struct MSFHeaders {
   msf::SuperBlock SuperBlock;
-  uint32_t NumDirectoryBlocks;
+  uint32_t NumDirectoryBlocks = 0;
   std::vector<uint32_t> DirectoryBlocks;
-  uint32_t NumStreams;
-  uint32_t FileSize;
+  uint32_t NumStreams = 0;
+  uint32_t FileSize = 0;
 };
 
 struct StreamBlockList {
@@ -48,10 +48,11 @@ struct NamedStreamMapping {
 };
 
 struct PdbInfoStream {
-  PdbRaw_ImplVer Version;
-  uint32_t Signature;
-  uint32_t Age;
+  PdbRaw_ImplVer Version = PdbImplVC70;
+  uint32_t Signature = 0;
+  uint32_t Age = 1;
   PDB_UniqueId Guid;
+  std::vector<PdbRaw_FeatureSig> Features;
   std::vector<NamedStreamMapping> NamedStreams;
 };
 
@@ -72,13 +73,13 @@ struct PdbDbiModuleInfo {
 };
 
 struct PdbDbiStream {
-  PdbRaw_DbiVer VerHeader;
-  uint32_t Age;
-  uint16_t BuildNumber;
-  uint32_t PdbDllVersion;
-  uint16_t PdbDllRbld;
-  uint16_t Flags;
-  PDB_Machine MachineType;
+  PdbRaw_DbiVer VerHeader = PdbDbiV70;
+  uint32_t Age = 1;
+  uint16_t BuildNumber = 0;
+  uint32_t PdbDllVersion = 0;
+  uint16_t PdbDllRbld = 0;
+  uint16_t Flags = 1;
+  PDB_Machine MachineType = PDB_Machine::x86;
 
   std::vector<PdbDbiModuleInfo> ModInfos;
 };
@@ -92,7 +93,7 @@ struct PdbTpiFieldListRecord {
 };
 
 struct PdbTpiStream {
-  PdbRaw_TpiVer Version;
+  PdbRaw_TpiVer Version = PdbTpiV80;
   std::vector<PdbTpiRecord> Records;
 };
 
@@ -138,30 +139,30 @@ template <> struct MappingTraits<pdb::yaml::PdbInfoStream> {
   static void mapping(IO &IO, pdb::yaml::PdbInfoStream &Obj);
 };
 
-template <> struct MappingTraits<pdb::yaml::PdbDbiStream> {
-  static void mapping(IO &IO, pdb::yaml::PdbDbiStream &Obj);
+template <> struct MappingContextTraits<pdb::yaml::PdbDbiStream, pdb::yaml::SerializationContext> {
+  static void mapping(IO &IO, pdb::yaml::PdbDbiStream &Obj, pdb::yaml::SerializationContext &Context);
 };
 
 template <>
-struct MappingContextTraits<pdb::yaml::PdbTpiStream, llvm::BumpPtrAllocator> {
+struct MappingContextTraits<pdb::yaml::PdbTpiStream, pdb::yaml::SerializationContext> {
   static void mapping(IO &IO, pdb::yaml::PdbTpiStream &Obj,
-                      llvm::BumpPtrAllocator &Allocator);
+    pdb::yaml::SerializationContext &Context);
 };
 
 template <> struct MappingTraits<pdb::yaml::NamedStreamMapping> {
   static void mapping(IO &IO, pdb::yaml::NamedStreamMapping &Obj);
 };
 
-template <> struct MappingTraits<pdb::yaml::PdbSymbolRecord> {
-  static void mapping(IO &IO, pdb::yaml::PdbSymbolRecord &Obj);
+template <> struct MappingContextTraits<pdb::yaml::PdbSymbolRecord, pdb::yaml::SerializationContext> {
+  static void mapping(IO &IO, pdb::yaml::PdbSymbolRecord &Obj, pdb::yaml::SerializationContext &Context);
 };
 
-template <> struct MappingTraits<pdb::yaml::PdbModiStream> {
-  static void mapping(IO &IO, pdb::yaml::PdbModiStream &Obj);
+template <> struct MappingContextTraits<pdb::yaml::PdbModiStream, pdb::yaml::SerializationContext> {
+  static void mapping(IO &IO, pdb::yaml::PdbModiStream &Obj, pdb::yaml::SerializationContext &Context);
 };
 
-template <> struct MappingTraits<pdb::yaml::PdbDbiModuleInfo> {
-  static void mapping(IO &IO, pdb::yaml::PdbDbiModuleInfo &Obj);
+template <> struct MappingContextTraits<pdb::yaml::PdbDbiModuleInfo, pdb::yaml::SerializationContext> {
+  static void mapping(IO &IO, pdb::yaml::PdbDbiModuleInfo &Obj, pdb::yaml::SerializationContext &Context);
 };
 
 template <>
diff --git a/tools/llvm-pdbdump/PrettyBuiltinDumper.cpp b/tools/llvm-pdbdump/PrettyBuiltinDumper.cpp
index f866132aa8866a4933ace6c1e916162d5a4e5988..591d5e70cfd6ed7927774ed3438bb6e75cb63b5c 100644
--- a/tools/llvm-pdbdump/PrettyBuiltinDumper.cpp
+++ b/tools/llvm-pdbdump/PrettyBuiltinDumper.cpp
@@ -20,6 +20,10 @@ BuiltinDumper::BuiltinDumper(LinePrinter &P)
     : PDBSymDumper(false), Printer(P) {}
 
 void BuiltinDumper::start(const PDBSymbolTypeBuiltin &Symbol) {
+  if (Symbol.isConstType())
+    WithColor(Printer, PDB_ColorItem::Keyword).get() << "const ";
+  if (Symbol.isVolatileType())
+    WithColor(Printer, PDB_ColorItem::Keyword).get() << "volatile ";
   WithColor(Printer, PDB_ColorItem::Type).get() << getTypeName(Symbol);
 }
 
diff --git a/tools/llvm-pdbdump/PrettyClassDefinitionDumper.cpp b/tools/llvm-pdbdump/PrettyClassDefinitionDumper.cpp
index b0c534f7c5b17d8e4e6c51ae356a24e3db743752..b48ed23c1c714e8cc281fa4c41f39495c6143db8 100644
--- a/tools/llvm-pdbdump/PrettyClassDefinitionDumper.cpp
+++ b/tools/llvm-pdbdump/PrettyClassDefinitionDumper.cpp
@@ -16,6 +16,8 @@
 #include "PrettyVariableDumper.h"
 #include "llvm-pdbdump.h"
 
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/SmallString.h"
 #include "llvm/DebugInfo/PDB/IPDBSession.h"
 #include "llvm/DebugInfo/PDB/PDBExtras.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolData.h"
@@ -26,6 +28,7 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeVTable.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Format.h"
 
 using namespace llvm;
@@ -34,8 +37,56 @@ using namespace llvm::pdb;
 ClassDefinitionDumper::ClassDefinitionDumper(LinePrinter &P)
     : PDBSymDumper(true), Printer(P) {}
 
+static void analyzePadding(const PDBSymbolTypeUDT &Class, BitVector &Padding,
+                           uint32_t &FirstFieldOffset) {
+  Padding.resize(Class.getLength(), true); // New bits are created set.
+  auto Children = Class.findAllChildren<PDBSymbolData>();
+  bool IsFirst = true;
+  FirstFieldOffset = Class.getLength(); // Kept if no field is found below.
+
+  while (auto Data = Children->getNext()) {
+    // Ignore data members which are not located relative to `this`.  Usually
+    // these are static data members or constexpr and occupy no space.  We
+    // also need to handle BitFields since the PDB doesn't consider them
+    // ThisRel, but they still occupy space in the record layout.
+    auto LocType = Data->getLocationType();
+    if (LocType != PDB_LocType::ThisRel && LocType != PDB_LocType::BitField)
+      continue;
+
+    uint64_t Start = Data->getOffset();
+    if (IsFirst) { // Only the first accepted member sets FirstFieldOffset.
+      FirstFieldOffset = Start;
+      IsFirst = false;
+    }
+
+    auto VarType = Data->getType();
+    uint64_t Size = VarType->getRawSymbol().getLength();
+    Padding.reset(Start, Start + Size); // Range covered by this member.
+  }
+
+  // Unmark anything that comes before the first field so it doesn't get
+  // counted as padding.  In reality this is going to be vptrs or base class
+  // members, but we don't correctly handle that yet.
+  // FIXME: Handle it.
+  Padding.reset(0, FirstFieldOffset);
+}
+
 void ClassDefinitionDumper::start(const PDBSymbolTypeUDT &Class) {
-  std::string Name = Class.getName();
+  assert(opts::pretty::ClassFormat !=
+         opts::pretty::ClassDefinitionFormat::None);
+
+  uint32_t Size = Class.getLength();
+  uint32_t FirstFieldOffset = 0;
+  BitVector Padding;
+  analyzePadding(Class, Padding, FirstFieldOffset);
+
+  if (opts::pretty::OnlyPaddingClasses && (Padding.count() == 0))
+    return;
+
+  Printer.NewLine();
+  WithColor(Printer, PDB_ColorItem::Comment).get() << "// sizeof = " << Size;
+  Printer.NewLine();
+
   WithColor(Printer, PDB_ColorItem::Keyword).get() << Class.getUdtKind() << " ";
   WithColor(Printer, PDB_ColorItem::Type).get() << Class.getName();
 
@@ -61,96 +112,62 @@ void ClassDefinitionDumper::start(const PDBSymbolTypeUDT &Class) {
 
   Printer << " {";
   auto Children = Class.findAllChildren();
-  if (Children->getChildCount() == 0) {
-    Printer << "}";
-    return;
-  }
-
-  // Try to dump symbols organized by member access level.  Public members
-  // first, then protected, then private.  This might be slow, so it's worth
-  // reconsidering the value of this if performance of large PDBs is a problem.
-  // NOTE: Access level of nested types is not recorded in the PDB, so we have
-  // a special case for them.
-  SymbolGroupByAccess Groups;
-  Groups.insert(std::make_pair(0, SymbolGroup()));
-  Groups.insert(std::make_pair((int)PDB_MemberAccess::Private, SymbolGroup()));
-  Groups.insert(
-      std::make_pair((int)PDB_MemberAccess::Protected, SymbolGroup()));
-  Groups.insert(std::make_pair((int)PDB_MemberAccess::Public, SymbolGroup()));
+  Printer.Indent();
+  int DumpedCount = 0;
 
+  int NextPaddingByte = Padding.find_first();
   while (auto Child = Children->getNext()) {
-    PDB_MemberAccess Access = Child->getRawSymbol().getAccess();
-    if (isa<PDBSymbolTypeBaseClass>(*Child))
-      continue;
-
-    auto &AccessGroup = Groups.find((int)Access)->second;
+    if (auto Data = llvm::dyn_cast<PDBSymbolData>(Child.get())) {
+      if (Data->getDataKind() == PDB_DataKind::Member && NextPaddingByte >= 0) {
+        // If there are padding bytes remaining, see if this field is the first
+        // to cross a padding boundary, and print a padding field indicator if
+        // so.
+        int Off = Data->getOffset();
+        if (Off > NextPaddingByte) {
+          uint32_t Amount = Off - NextPaddingByte;
+          Printer.NewLine();
+          WithColor(Printer, PDB_ColorItem::Padding).get()
+              << "<padding> (" << Amount << " bytes)";
+          assert(Padding.find_next_unset(NextPaddingByte) == Off);
+          NextPaddingByte = Padding.find_next(Off);
+        }
+      }
+    }
 
-    if (auto Func = dyn_cast<PDBSymbolFunc>(Child.get())) {
+    if (auto Func = Child->cast<PDBSymbolFunc>()) {
       if (Func->isCompilerGenerated() && opts::pretty::ExcludeCompilerGenerated)
         continue;
+
       if (Func->getLength() == 0 && !Func->isPureVirtual() &&
           !Func->isIntroVirtualFunction())
         continue;
-      Child.release();
-      AccessGroup.Functions.push_back(std::unique_ptr<PDBSymbolFunc>(Func));
-    } else if (auto Data = dyn_cast<PDBSymbolData>(Child.get())) {
-      Child.release();
-      AccessGroup.Data.push_back(std::unique_ptr<PDBSymbolData>(Data));
-    } else {
-      AccessGroup.Unknown.push_back(std::move(Child));
     }
-  }
 
-  int Count = 0;
-  Count += dumpAccessGroup((PDB_MemberAccess)0, Groups[0]);
-  Count += dumpAccessGroup(PDB_MemberAccess::Public,
-                           Groups[(int)PDB_MemberAccess::Public]);
-  Count += dumpAccessGroup(PDB_MemberAccess::Protected,
-                           Groups[(int)PDB_MemberAccess::Protected]);
-  Count += dumpAccessGroup(PDB_MemberAccess::Private,
-                           Groups[(int)PDB_MemberAccess::Private]);
-  if (Count > 0)
-    Printer.NewLine();
-  Printer << "}";
-}
-
-int ClassDefinitionDumper::dumpAccessGroup(PDB_MemberAccess Access,
-                                           const SymbolGroup &Group) {
-  if (Group.Functions.empty() && Group.Data.empty() && Group.Unknown.empty())
-    return 0;
+    ++DumpedCount;
+    Child->dump(*this);
+  }
 
-  int Count = 0;
-  if (Access == PDB_MemberAccess::Private) {
+  if (NextPaddingByte >= 0) {
+    uint32_t Amount = Size - NextPaddingByte;
     Printer.NewLine();
-    WithColor(Printer, PDB_ColorItem::Keyword).get() << "private";
-    Printer << ":";
-  } else if (Access == PDB_MemberAccess::Protected) {
+    WithColor(Printer, PDB_ColorItem::Padding).get() << "<padding> (" << Amount
+                                                     << " bytes)";
+  }
+  Printer.Unindent();
+  if (DumpedCount > 0)
     Printer.NewLine();
-    WithColor(Printer, PDB_ColorItem::Keyword).get() << "protected";
-    Printer << ":";
-  } else if (Access == PDB_MemberAccess::Public) {
+  Printer << "}";
+  Printer.NewLine();
+  if (Padding.count() > 0) {
+    APFloat Pct(100.0 * (double)Padding.count() /
+                (double)(Size - FirstFieldOffset));
+    SmallString<8> PctStr;
+    Pct.toString(PctStr, 4);
+    WithColor(Printer, PDB_ColorItem::Padding).get()
+        << "Total padding " << Padding.count() << " bytes (" << PctStr
+        << "% of class size)";
     Printer.NewLine();
-    WithColor(Printer, PDB_ColorItem::Keyword).get() << "public";
-    Printer << ":";
   }
-  Printer.Indent();
-  for (auto iter = Group.Functions.begin(), end = Group.Functions.end();
-       iter != end; ++iter) {
-    ++Count;
-    (*iter)->dump(*this);
-  }
-  for (auto iter = Group.Data.begin(), end = Group.Data.end(); iter != end;
-       ++iter) {
-    ++Count;
-    (*iter)->dump(*this);
-  }
-  for (auto iter = Group.Unknown.begin(), end = Group.Unknown.end();
-       iter != end; ++iter) {
-    ++Count;
-    (*iter)->dump(*this);
-  }
-  Printer.Unindent();
-  return Count;
 }
 
 void ClassDefinitionDumper::dump(const PDBSymbolTypeBaseClass &Symbol) {}
diff --git a/tools/llvm-pdbdump/PrettyClassDefinitionDumper.h b/tools/llvm-pdbdump/PrettyClassDefinitionDumper.h
index 0831f47557ed6c0b3ced3ee41542537cef7f9427..8f0c35cba810f7558fca868d4e66e7f37be347f7 100644
--- a/tools/llvm-pdbdump/PrettyClassDefinitionDumper.h
+++ b/tools/llvm-pdbdump/PrettyClassDefinitionDumper.h
@@ -10,6 +10,8 @@
 #ifndef LLVM_TOOLS_LLVMPDBDUMP_PRETTYCLASSDEFINITIONDUMPER_H
 #define LLVM_TOOLS_LLVMPDBDUMP_PRETTYCLASSDEFINITIONDUMPER_H
 
+#include "llvm/ADT/BitVector.h"
+
 #include "llvm/DebugInfo/PDB/PDBSymDumper.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolData.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
@@ -39,24 +41,6 @@ public:
 
 private:
   LinePrinter &Printer;
-
-  struct SymbolGroup {
-    SymbolGroup() {}
-    SymbolGroup(SymbolGroup &&Other) {
-      Functions = std::move(Other.Functions);
-      Data = std::move(Other.Data);
-      Unknown = std::move(Other.Unknown);
-    }
-
-    std::list<std::unique_ptr<PDBSymbolFunc>> Functions;
-    std::list<std::unique_ptr<PDBSymbolData>> Data;
-    std::list<std::unique_ptr<PDBSymbol>> Unknown;
-    SymbolGroup(const SymbolGroup &other) = delete;
-    SymbolGroup &operator=(const SymbolGroup &other) = delete;
-  };
-  typedef std::unordered_map<int, SymbolGroup> SymbolGroupByAccess;
-
-  int dumpAccessGroup(PDB_MemberAccess Access, const SymbolGroup &Group);
 };
 }
 }
diff --git a/tools/llvm-pdbdump/PrettyFunctionDumper.cpp b/tools/llvm-pdbdump/PrettyFunctionDumper.cpp
index 2f6ca894fadf30e82d7e90cec4eec7a426348503..0e0da026e567814447c2540dc0cc98218787f794 100644
--- a/tools/llvm-pdbdump/PrettyFunctionDumper.cpp
+++ b/tools/llvm-pdbdump/PrettyFunctionDumper.cpp
@@ -195,10 +195,7 @@ void FunctionDumper::start(const PDBSymbolFunc &Symbol, PointerType Pointer) {
 }
 
 void FunctionDumper::dump(const PDBSymbolTypeArray &Symbol) {
-  uint32_t ElementTypeId = Symbol.getTypeId();
-  auto ElementType = Symbol.getSession().getSymbolById(ElementTypeId);
-  if (!ElementType)
-    return;
+  auto ElementType = Symbol.getElementType();
 
   ElementType->dump(*this);
   Printer << "[";
@@ -232,12 +229,11 @@ void FunctionDumper::dump(const PDBSymbolTypeTypedef &Symbol) {
 }
 
 void FunctionDumper::dump(const PDBSymbolTypePointer &Symbol) {
-  uint32_t PointeeId = Symbol.getTypeId();
-  auto PointeeType = Symbol.getSession().getSymbolById(PointeeId);
+  auto PointeeType = Symbol.getPointeeType();
   if (!PointeeType)
     return;
 
-  if (auto FuncSig = dyn_cast<PDBSymbolTypeFunctionSig>(PointeeType.get())) {
+  if (auto FuncSig = PointeeType->cast<PDBSymbolTypeFunctionSig>()) {
     FunctionDumper NestedDumper(Printer);
     PointerType Pointer =
         Symbol.isReference() ? PointerType::Reference : PointerType::Pointer;
diff --git a/tools/llvm-pdbdump/PrettyTypeDumper.cpp b/tools/llvm-pdbdump/PrettyTypeDumper.cpp
index 4f70c8047337a606bd400e9f068f8241e8272cda..12a47d24f79292d34713750d056a2be99db0630f 100644
--- a/tools/llvm-pdbdump/PrettyTypeDumper.cpp
+++ b/tools/llvm-pdbdump/PrettyTypeDumper.cpp
@@ -29,37 +29,43 @@ using namespace llvm::pdb;
 TypeDumper::TypeDumper(LinePrinter &P) : PDBSymDumper(true), Printer(P) {}
 
 void TypeDumper::start(const PDBSymbolExe &Exe) {
-  auto Enums = Exe.findAllChildren<PDBSymbolTypeEnum>();
-  Printer.NewLine();
-  WithColor(Printer, PDB_ColorItem::Identifier).get() << "Enums";
-  Printer << ": (" << Enums->getChildCount() << " items)";
-  Printer.Indent();
-  while (auto Enum = Enums->getNext())
-    Enum->dump(*this);
-  Printer.Unindent();
-
-  auto Typedefs = Exe.findAllChildren<PDBSymbolTypeTypedef>();
-  Printer.NewLine();
-  WithColor(Printer, PDB_ColorItem::Identifier).get() << "Typedefs";
-  Printer << ": (" << Typedefs->getChildCount() << " items)";
-  Printer.Indent();
-  while (auto Typedef = Typedefs->getNext())
-    Typedef->dump(*this);
-  Printer.Unindent();
-
-  auto Classes = Exe.findAllChildren<PDBSymbolTypeUDT>();
-  Printer.NewLine();
-  WithColor(Printer, PDB_ColorItem::Identifier).get() << "Classes";
-  Printer << ": (" << Classes->getChildCount() << " items)";
-  Printer.Indent();
-  while (auto Class = Classes->getNext())
-    Class->dump(*this);
-  Printer.Unindent();
+  if (opts::pretty::Enums) {
+    auto Enums = Exe.findAllChildren<PDBSymbolTypeEnum>();
+    Printer.NewLine();
+    WithColor(Printer, PDB_ColorItem::Identifier).get() << "Enums";
+    Printer << ": (" << Enums->getChildCount() << " items)";
+    Printer.Indent();
+    while (auto Enum = Enums->getNext())
+      Enum->dump(*this);
+    Printer.Unindent();
+  }
+
+  if (opts::pretty::Typedefs) {
+    auto Typedefs = Exe.findAllChildren<PDBSymbolTypeTypedef>();
+    Printer.NewLine();
+    WithColor(Printer, PDB_ColorItem::Identifier).get() << "Typedefs";
+    Printer << ": (" << Typedefs->getChildCount() << " items)";
+    Printer.Indent();
+    while (auto Typedef = Typedefs->getNext())
+      Typedef->dump(*this);
+    Printer.Unindent();
+  }
+
+  if (opts::pretty::Classes) {
+    auto Classes = Exe.findAllChildren<PDBSymbolTypeUDT>();
+    Printer.NewLine();
+    WithColor(Printer, PDB_ColorItem::Identifier).get() << "Classes";
+    Printer << ": (" << Classes->getChildCount() << " items)";
+    Printer.Indent();
+    while (auto Class = Classes->getNext())
+      Class->dump(*this);
+    Printer.Unindent();
+  }
 }
 
 void TypeDumper::dump(const PDBSymbolTypeEnum &Symbol) {
-  if (Symbol.getUnmodifiedTypeId() != 0)
-    return;
+  assert(opts::pretty::Enums);
+
   if (Printer.IsTypeExcluded(Symbol.getName()))
     return;
   // Dump member enums when dumping their class definition.
@@ -72,6 +78,8 @@ void TypeDumper::dump(const PDBSymbolTypeEnum &Symbol) {
 }
 
 void TypeDumper::dump(const PDBSymbolTypeTypedef &Symbol) {
+  assert(opts::pretty::Typedefs);
+
   if (Printer.IsTypeExcluded(Symbol.getName()))
     return;
 
@@ -81,14 +89,15 @@ void TypeDumper::dump(const PDBSymbolTypeTypedef &Symbol) {
 }
 
 void TypeDumper::dump(const PDBSymbolTypeUDT &Symbol) {
+  assert(opts::pretty::Classes);
+
   if (Symbol.getUnmodifiedTypeId() != 0)
     return;
   if (Printer.IsTypeExcluded(Symbol.getName()))
     return;
 
-  Printer.NewLine();
-
-  if (opts::pretty::NoClassDefs) {
+  if (opts::pretty::ClassFormat == opts::pretty::ClassDefinitionFormat::None) {
+    Printer.NewLine();
     WithColor(Printer, PDB_ColorItem::Keyword).get() << "class ";
     WithColor(Printer, PDB_ColorItem::Identifier).get() << Symbol.getName();
   } else {
diff --git a/tools/llvm-pdbdump/PrettyTypedefDumper.cpp b/tools/llvm-pdbdump/PrettyTypedefDumper.cpp
index c458755cb7806a875bba704914dbc70889e9ef89..861f7e28b3858741a6398b916b878daa5c33cb99 100644
--- a/tools/llvm-pdbdump/PrettyTypedefDumper.cpp
+++ b/tools/llvm-pdbdump/PrettyTypedefDumper.cpp
@@ -53,11 +53,8 @@ void TypedefDumper::dump(const PDBSymbolTypePointer &Symbol) {
     WithColor(Printer, PDB_ColorItem::Keyword).get() << "const ";
   if (Symbol.isVolatileType())
     WithColor(Printer, PDB_ColorItem::Keyword).get() << "volatile ";
-  uint32_t PointeeId = Symbol.getTypeId();
-  auto PointeeType = Symbol.getSession().getSymbolById(PointeeId);
-  if (!PointeeType)
-    return;
-  if (auto FuncSig = dyn_cast<PDBSymbolTypeFunctionSig>(PointeeType.get())) {
+  auto PointeeType = Symbol.getPointeeType();
+  if (auto FuncSig = PointeeType->cast<PDBSymbolTypeFunctionSig>()) {
     FunctionDumper::PointerType Pointer = FunctionDumper::PointerType::Pointer;
     if (Symbol.isReference())
       Pointer = FunctionDumper::PointerType::Reference;
diff --git a/tools/llvm-pdbdump/PrettyVariableDumper.cpp b/tools/llvm-pdbdump/PrettyVariableDumper.cpp
index e1469186ad8b541eba61b4d1960487c6b675b8a3..65f0139dfbc5d9c3e907bb9bfe175342e62aeac8 100644
--- a/tools/llvm-pdbdump/PrettyVariableDumper.cpp
+++ b/tools/llvm-pdbdump/PrettyVariableDumper.cpp
@@ -14,6 +14,7 @@
 #include "PrettyFunctionDumper.h"
 #include "llvm-pdbdump.h"
 
+#include "llvm/DebugInfo/PDB/IPDBSession.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolData.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeArray.h"
@@ -23,10 +24,12 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolTypePointer.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeTypedef.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolTypeUDT.h"
+#include "llvm/DebugInfo/PDB/PDBTypes.h"
 
 #include "llvm/Support/Format.h"
 
 using namespace llvm;
+using namespace llvm::codeview;
 using namespace llvm::pdb;
 
 VariableDumper::VariableDumper(LinePrinter &P)
@@ -40,13 +43,15 @@ void VariableDumper::start(const PDBSymbolData &Var) {
 
   auto VarType = Var.getType();
 
+  uint64_t Length = VarType->getRawSymbol().getLength();
+
   switch (auto LocType = Var.getLocationType()) {
   case PDB_LocType::Static:
     Printer.NewLine();
     Printer << "data [";
     WithColor(Printer, PDB_ColorItem::Address).get()
         << format_hex(Var.getVirtualAddress(), 10);
-    Printer << "] ";
+    Printer << ", sizeof=" << Length << "] ";
     WithColor(Printer, PDB_ColorItem::Keyword).get() << "static ";
     dumpSymbolTypeAndName(*VarType, Var.getName());
     break;
@@ -54,8 +59,7 @@ void VariableDumper::start(const PDBSymbolData &Var) {
     if (isa<PDBSymbolTypeEnum>(*VarType))
       break;
     Printer.NewLine();
-    Printer << "data ";
-    WithColor(Printer, PDB_ColorItem::Keyword).get() << "const ";
+    Printer << "data [sizeof=" << Length << "] ";
     dumpSymbolTypeAndName(*VarType, Var.getName());
     Printer << " = ";
     WithColor(Printer, PDB_ColorItem::LiteralValue).get() << Var.getValue();
@@ -64,27 +68,46 @@ void VariableDumper::start(const PDBSymbolData &Var) {
     Printer.NewLine();
     Printer << "data ";
     WithColor(Printer, PDB_ColorItem::Offset).get()
-        << "+" << format_hex(Var.getOffset(), 4) << " ";
+        << "+" << format_hex(Var.getOffset(), 4) << " [sizeof=" << Length
+        << "] ";
     dumpSymbolTypeAndName(*VarType, Var.getName());
     break;
   case PDB_LocType::BitField:
     Printer.NewLine();
     Printer << "data ";
     WithColor(Printer, PDB_ColorItem::Offset).get()
-        << "+" << format_hex(Var.getOffset(), 4) << " ";
+        << "+" << format_hex(Var.getOffset(), 4) << " [sizeof=" << Length
+        << "] ";
     dumpSymbolTypeAndName(*VarType, Var.getName());
     Printer << " : ";
     WithColor(Printer, PDB_ColorItem::LiteralValue).get() << Var.getLength();
     break;
   default:
     Printer.NewLine();
-    Printer << "data ";
+    Printer << "data [sizeof=" << Length << "] ";
     Printer << "unknown(" << LocType << ") ";
     WithColor(Printer, PDB_ColorItem::Identifier).get() << Var.getName();
     break;
   }
 }
 
+void VariableDumper::dump(const PDBSymbolTypeArray &Symbol) {
+  // Left-hand side of an array declarator: just the element type.
+  auto Elem = Symbol.getElementType();
+  assert(Elem);
+  if (Elem)
+    Elem->dump(*this);
+}
+
+void VariableDumper::dumpRight(const PDBSymbolTypeArray &Symbol) {
+  auto Elem = Symbol.getElementType();
+  assert(Elem);
+  if (Elem) { // "[count]" first, then the element type's own suffix
+    Printer << '[' << Symbol.getCount() << ']';
+    Elem->dumpRight(*this);
+  }
+}
+
 void VariableDumper::dump(const PDBSymbolTypeBuiltin &Symbol) {
   BuiltinDumper Dumper(Printer);
   Dumper.start(Symbol);
@@ -94,27 +117,71 @@ void VariableDumper::dump(const PDBSymbolTypeEnum &Symbol) {
   WithColor(Printer, PDB_ColorItem::Type).get() << Symbol.getName();
 }
 
-void VariableDumper::dump(const PDBSymbolTypeFunctionSig &Symbol) {}
+void VariableDumper::dump(const PDBSymbolTypeFunctionSig &Symbol) {
+  // Emits the return type (skipped if it cannot be resolved), a space, and
+  // "Class::" when the signature has a class parent.
+  if (auto ReturnType = Symbol.getReturnType())
+    ReturnType->dump(*this);
+  Printer << " ";
+
+  auto ClassParent =
+      Symbol.getSession().getConcreteSymbolById<PDBSymbolTypeUDT>(
+          Symbol.getClassParentId());
+  if (ClassParent) {
+    WithColor(Printer, PDB_ColorItem::Identifier).get()
+        << ClassParent->getName();
+    Printer << "::";
+  }
+}
+
+void VariableDumper::dumpRight(const PDBSymbolTypeFunctionSig &Symbol) {
+  // Right-hand side: "(arg, arg, ...)" followed by any cv-qualifiers.
+  Printer << "(";
+  if (auto Args = Symbol.getArguments()) {
+    for (uint32_t Seen = 0; auto Param = Args->getNext();) {
+      Param->dump(*this);
+      if (++Seen < Args->getChildCount())
+        Printer << ", ";
+    }
+  }
+  Printer << ")";
+
+  if (Symbol.isConstType())
+    WithColor(Printer, PDB_ColorItem::Keyword).get() << " const";
+  if (Symbol.isVolatileType())
+    WithColor(Printer, PDB_ColorItem::Keyword).get() << " volatile";
+}
 
 void VariableDumper::dump(const PDBSymbolTypePointer &Symbol) {
   auto PointeeType = Symbol.getPointeeType();
   if (!PointeeType)
     return;
+  PointeeType->dump(*this);
+  if (auto Func = PointeeType->cast<PDBSymbolTypeFunctionSig>()) {
+    // Hack: emit " (" first so the calling convention lands inside the parens.
+    Printer << " (";
+    PDB_CallingConv CC = Func->getCallingConvention();
+    WithColor(Printer, PDB_ColorItem::Keyword).get() << CC << " ";
+  } else if (isa<PDBSymbolTypeArray>(PointeeType.get())) {
+    Printer << " (";
+  }
+  Printer << (Symbol.isReference() ? "&" : "*");
+  if (Symbol.isConstType())
+    WithColor(Printer, PDB_ColorItem::Keyword).get() << " const ";
+  if (Symbol.isVolatileType())
+    WithColor(Printer, PDB_ColorItem::Keyword).get() << " volatile ";
+}
 
-  if (auto Func = dyn_cast<PDBSymbolFunc>(PointeeType.get())) {
-    FunctionDumper NestedDumper(Printer);
-    FunctionDumper::PointerType Pointer =
-        Symbol.isReference() ? FunctionDumper::PointerType::Reference
-                             : FunctionDumper::PointerType::Pointer;
-    NestedDumper.start(*Func, Pointer);
-  } else {
-    if (Symbol.isConstType())
-      WithColor(Printer, PDB_ColorItem::Keyword).get() << "const ";
-    if (Symbol.isVolatileType())
-      WithColor(Printer, PDB_ColorItem::Keyword).get() << "volatile ";
-    PointeeType->dump(*this);
-    Printer << (Symbol.isReference() ? "&" : "*");
+void VariableDumper::dumpRight(const PDBSymbolTypePointer &Symbol) {
+  auto Pointee = Symbol.getPointeeType();
+  assert(Pointee);
+  if (!Pointee)
+    return;
+  auto *Raw = Pointee.get();
+  if (isa<PDBSymbolTypeFunctionSig>(Raw) || isa<PDBSymbolTypeArray>(Raw)) {
+    Printer << ")"; // closes the " (" that dump() opened for this pointee
   }
+  Pointee->dumpRight(*this);
 }
 
 void VariableDumper::dump(const PDBSymbolTypeTypedef &Symbol) {
@@ -128,44 +195,7 @@ void VariableDumper::dump(const PDBSymbolTypeUDT &Symbol) {
 
 void VariableDumper::dumpSymbolTypeAndName(const PDBSymbol &Type,
                                            StringRef Name) {
-  if (auto *ArrayType = dyn_cast<PDBSymbolTypeArray>(&Type)) {
-    std::string IndexSpec;
-    raw_string_ostream IndexStream(IndexSpec);
-    std::unique_ptr<PDBSymbol> ElementType = ArrayType->getElementType();
-    while (auto NestedArray = dyn_cast<PDBSymbolTypeArray>(ElementType.get())) {
-      IndexStream << "[";
-      IndexStream << NestedArray->getCount();
-      IndexStream << "]";
-      ElementType = NestedArray->getElementType();
-    }
-    IndexStream << "[" << ArrayType->getCount() << "]";
-    ElementType->dump(*this);
-    WithColor(Printer, PDB_ColorItem::Identifier).get() << " " << Name;
-    Printer << IndexStream.str();
-  } else {
-    if (!tryDumpFunctionPointer(Type, Name)) {
-      Type.dump(*this);
-      WithColor(Printer, PDB_ColorItem::Identifier).get() << " " << Name;
-    }
-  }
-}
-
-bool VariableDumper::tryDumpFunctionPointer(const PDBSymbol &Type,
-                                            StringRef Name) {
-  // Function pointers come across as pointers to function signatures.  But the
-  // signature carries no name, so we have to handle this case separately.
-  if (auto *PointerType = dyn_cast<PDBSymbolTypePointer>(&Type)) {
-    auto PointeeType = PointerType->getPointeeType();
-    if (auto *FunctionSig =
-            dyn_cast<PDBSymbolTypeFunctionSig>(PointeeType.get())) {
-      FunctionDumper Dumper(Printer);
-      FunctionDumper::PointerType PT = FunctionDumper::PointerType::Pointer;
-      if (PointerType->isReference())
-        PT = FunctionDumper::PointerType::Reference;
-      std::string NameStr(Name.begin(), Name.end());
-      Dumper.start(*FunctionSig, NameStr.c_str(), PT);
-      return true;
-    }
-  }
-  return false;
+  Type.dump(*this);
+  WithColor(Printer, PDB_ColorItem::Identifier).get() << " " << Name;
+  Type.dumpRight(*this);
 }
diff --git a/tools/llvm-pdbdump/PrettyVariableDumper.h b/tools/llvm-pdbdump/PrettyVariableDumper.h
index a122bb86058cfd0f2bd9b1c75a842ec160641b80..eec389b170777205c74d8300674e7ecc64cf0649 100644
--- a/tools/llvm-pdbdump/PrettyVariableDumper.h
+++ b/tools/llvm-pdbdump/PrettyVariableDumper.h
@@ -26,6 +26,7 @@ public:
 
   void start(const PDBSymbolData &Var);
 
+  void dump(const PDBSymbolTypeArray &Symbol) override;
   void dump(const PDBSymbolTypeBuiltin &Symbol) override;
   void dump(const PDBSymbolTypeEnum &Symbol) override;
   void dump(const PDBSymbolTypeFunctionSig &Symbol) override;
@@ -33,9 +34,12 @@ public:
   void dump(const PDBSymbolTypeTypedef &Symbol) override;
   void dump(const PDBSymbolTypeUDT &Symbol) override;
 
+  void dumpRight(const PDBSymbolTypeArray &Symbol) override;
+  void dumpRight(const PDBSymbolTypeFunctionSig &Symbol) override;
+  void dumpRight(const PDBSymbolTypePointer &Symbol) override;
+
 private:
   void dumpSymbolTypeAndName(const PDBSymbol &Type, StringRef Name);
-  bool tryDumpFunctionPointer(const PDBSymbol &Type, StringRef Name);
 
   LinePrinter &Printer;
 };
diff --git a/tools/llvm-pdbdump/StreamUtil.cpp b/tools/llvm-pdbdump/StreamUtil.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..db1e01aa015435c64187ea60de8129e062d3045d
--- /dev/null
+++ b/tools/llvm-pdbdump/StreamUtil.cpp
@@ -0,0 +1,136 @@
+//===- StreamUtil.cpp - PDB stream utilities --------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "StreamUtil.h"
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseMapInfo.h"
+#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
+#include "llvm/DebugInfo/PDB/Native/InfoStream.h"
+#include "llvm/DebugInfo/PDB/Native/ModInfo.h"
+#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
+
+namespace llvm {
+namespace pdb {
+void discoverStreamPurposes(PDBFile &File,
+                            SmallVectorImpl<std::string> &Purposes) {
+  // Labels each stream index of File with its purpose (Purposes is resized).
+  // It's OK if we fail to load some of these streams; we still attempt to
+  // print what we can.
+  auto Dbi = File.getPDBDbiStream();
+  auto Tpi = File.getPDBTpiStream();
+  auto Ipi = File.getPDBIpiStream();
+  auto Info = File.getPDBInfoStream();
+
+  uint32_t StreamCount = File.getNumStreams();
+  // Keyed by 32-bit index so lookups with the loop index below never truncate.
+  DenseMap<uint32_t, const ModuleInfoEx *> ModStreams;
+  DenseMap<uint32_t, std::string> NamedStreams;
+  if (Dbi) {
+    for (auto &ModI : Dbi->modules()) {
+      uint16_t SN = ModI.Info.getModuleStreamIndex();
+      if (SN != kInvalidStreamIndex)
+        ModStreams[SN] = &ModI;
+    }
+  }
+  if (Info) {
+    for (auto &NSE : Info->named_streams()) {
+      if (NSE.second != kInvalidStreamIndex)
+        NamedStreams[NSE.second] = NSE.first();
+    }
+  }
+  Purposes.resize(StreamCount);
+  // Use a 32-bit index; a 16-bit one wraps forever once StreamCount > 65535.
+  for (uint32_t StreamIdx = 0; StreamIdx < StreamCount; ++StreamIdx) {
+    std::string Value;
+    if (StreamIdx == OldMSFDirectory)
+      Value = "Old MSF Directory";
+    else if (StreamIdx == StreamPDB)
+      Value = "PDB Stream";
+    else if (StreamIdx == StreamDBI)
+      Value = "DBI Stream";
+    else if (StreamIdx == StreamTPI)
+      Value = "TPI Stream";
+    else if (StreamIdx == StreamIPI)
+      Value = "IPI Stream";
+    else if (Dbi && StreamIdx == Dbi->getGlobalSymbolStreamIndex())
+      Value = "Global Symbol Hash";
+    else if (Dbi && StreamIdx == Dbi->getPublicSymbolStreamIndex())
+      Value = "Public Symbol Hash";
+    else if (Dbi && StreamIdx == Dbi->getSymRecordStreamIndex())
+      Value = "Public Symbol Records";
+    else if (Tpi && StreamIdx == Tpi->getTypeHashStreamIndex())
+      Value = "TPI Hash";
+    else if (Tpi && StreamIdx == Tpi->getTypeHashStreamAuxIndex())
+      Value = "TPI Aux Hash";
+    else if (Ipi && StreamIdx == Ipi->getTypeHashStreamIndex())
+      Value = "IPI Hash";
+    else if (Ipi && StreamIdx == Ipi->getTypeHashStreamAuxIndex())
+      Value = "IPI Aux Hash";
+    else if (Dbi &&
+             StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::Exception))
+      Value = "Exception Data";
+    else if (Dbi && StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::Fixup))
+      Value = "Fixup Data";
+    else if (Dbi && StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::FPO))
+      Value = "FPO Data";
+    else if (Dbi &&
+             StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::NewFPO))
+      Value = "New FPO Data";
+    else if (Dbi &&
+             StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::OmapFromSrc))
+      Value = "Omap From Source Data";
+    else if (Dbi &&
+             StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::OmapToSrc))
+      Value = "Omap To Source Data";
+    else if (Dbi && StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::Pdata))
+      Value = "Pdata";
+    else if (Dbi &&
+             StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::SectionHdr))
+      Value = "Section Header Data";
+    else if (Dbi &&
+             StreamIdx ==
+                 Dbi->getDebugStreamIndex(DbgHeaderType::SectionHdrOrig))
+      Value = "Section Header Original Data";
+    else if (Dbi &&
+             StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::TokenRidMap))
+      Value = "Token Rid Data";
+    else if (Dbi && StreamIdx == Dbi->getDebugStreamIndex(DbgHeaderType::Xdata))
+      Value = "Xdata";
+    else {
+      auto ModIter = ModStreams.find(StreamIdx);
+      auto NSIter = NamedStreams.find(StreamIdx);
+      if (ModIter != ModStreams.end()) {
+        Value = "Module \"";
+        Value += ModIter->second->Info.getModuleName().str();
+        Value += "\"";
+      } else if (NSIter != NamedStreams.end()) {
+        Value = "Named Stream \"";
+        Value += NSIter->second;
+        Value += "\"";
+      } else {
+        Value = "???";
+      }
+    }
+    Purposes[StreamIdx] = Value;
+  }
+
+  // Consume errors from missing streams.
+  if (!Dbi)
+    consumeError(Dbi.takeError());
+  if (!Tpi)
+    consumeError(Tpi.takeError());
+  if (!Ipi)
+    consumeError(Ipi.takeError());
+  if (!Info)
+    consumeError(Info.takeError());
+}
+}
+}
diff --git a/tools/llvm-pdbdump/StreamUtil.h b/tools/llvm-pdbdump/StreamUtil.h
new file mode 100644
index 0000000000000000000000000000000000000000..b5c0beba44fed63a482c195a9ab7cacfc284e6d2
--- /dev/null
+++ b/tools/llvm-pdbdump/StreamUtil.h
@@ -0,0 +1,25 @@
+//===- StreamUtil.h - PDB stream utilities ----------------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_TOOLS_LLVMPDBDUMP_STREAMUTIL_H
+#define LLVM_TOOLS_LLVMPDBDUMP_STREAMUTIL_H
+
+#include "llvm/ADT/SmallVector.h"
+
+#include <string>
+
+namespace llvm {
+namespace pdb {
+class PDBFile;
+void discoverStreamPurposes(PDBFile &File,
+                            SmallVectorImpl<std::string> &Purposes);
+}
+}
+
+#endif
diff --git a/tools/llvm-pdbdump/YAMLOutputStyle.cpp b/tools/llvm-pdbdump/YAMLOutputStyle.cpp
index 065a60d32a8cc23d6c3be1b52444661580f29ab6..5b53d2137166a02c288e2e0b418c74288b3f947f 100644
--- a/tools/llvm-pdbdump/YAMLOutputStyle.cpp
+++ b/tools/llvm-pdbdump/YAMLOutputStyle.cpp
@@ -24,7 +24,9 @@ using namespace llvm;
 using namespace llvm::pdb;
 
 YAMLOutputStyle::YAMLOutputStyle(PDBFile &File)
-    : File(File), Out(outs()), Obj(File.getAllocator()) {}
+    : File(File), Out(outs()), Obj(File.getAllocator()) {
+  Out.setWriteDefaultValues(!opts::pdb2yaml::Minimal);
+}
 
 Error YAMLOutputStyle::dump() {
   if (opts::pdb2yaml::StreamDirectory)
@@ -143,6 +145,7 @@ Error YAMLOutputStyle::dumpPDBStream() {
   Obj.PdbStream->Guid = InfoS.getGuid();
   Obj.PdbStream->Signature = InfoS.getSignature();
   Obj.PdbStream->Version = InfoS.getVersion();
+  Obj.PdbStream->Features = InfoS.getFeatureSignatures();
 
   return Error::success();
 }
diff --git a/tools/llvm-pdbdump/YamlSymbolDumper.cpp b/tools/llvm-pdbdump/YamlSymbolDumper.cpp
index 210260a03b5fc850d80b3fc75f66744a815db9a7..431bf404fb04016e943bc4cfb970277775bc5133 100644
--- a/tools/llvm-pdbdump/YamlSymbolDumper.cpp
+++ b/tools/llvm-pdbdump/YamlSymbolDumper.cpp
@@ -113,6 +113,7 @@ template <> struct ScalarEnumerationTraits<RegisterId> {
     for (const auto &E : RegNames) {
       io.enumCase(Reg, E.Name.str().c_str(), static_cast<RegisterId>(E.Value));
     }
+    io.enumFallback<Hex16>(Reg);
   }
 };
 
diff --git a/tools/llvm-pdbdump/YamlTypeDumper.cpp b/tools/llvm-pdbdump/YamlTypeDumper.cpp
index 80bf9349b0cb7b16401bcbdc0cb31ef47d4cf342..b4eb197e866a4631aa8eaaffb15faf625814ead8 100644
--- a/tools/llvm-pdbdump/YamlTypeDumper.cpp
+++ b/tools/llvm-pdbdump/YamlTypeDumper.cpp
@@ -194,6 +194,13 @@ template <> struct ScalarEnumerationTraits<WindowsRTClassKind> {
   }
 };
 
+template <> struct ScalarEnumerationTraits<LabelType> {
+  static void enumeration(IO &IO, LabelType &Value) {
+    IO.enumCase(Value, "Near", LabelType::Near);
+    IO.enumCase(Value, "Far", LabelType::Far);
+  }
+};
+
 template <> struct ScalarBitSetTraits<PointerOptions> {
   static void bitset(IO &IO, PointerOptions &Options) {
     IO.bitSetCase(Options, "None", PointerOptions::None);
@@ -291,7 +298,11 @@ void MappingTraits<StringIdRecord>::mapping(IO &IO, StringIdRecord &String) {
 }
 
 void MappingTraits<ArgListRecord>::mapping(IO &IO, ArgListRecord &Args) {
-  IO.mapRequired("ArgIndices", Args.StringIndices);
+  IO.mapRequired("ArgIndices", Args.ArgIndices);
+}
+
+void MappingTraits<StringListRecord>::mapping(IO &IO, StringListRecord &Strings) {
+  IO.mapRequired("StringIndices", Strings.StringIndices);
 }
 
 void MappingTraits<ClassRecord>::mapping(IO &IO, ClassRecord &Class) {
@@ -427,6 +438,10 @@ void MappingTraits<BuildInfoRecord>::mapping(IO &IO, BuildInfoRecord &Args) {
   IO.mapRequired("ArgIndices", Args.ArgIndices);
 }
 
+void MappingTraits<LabelRecord>::mapping(IO &IO, LabelRecord &R) {
+  IO.mapRequired("Mode", R.Mode);
+}
+
 void MappingTraits<NestedTypeRecord>::mapping(IO &IO,
                                               NestedTypeRecord &Nested) {
   IO.mapRequired("Type", Nested.Type);
@@ -573,8 +588,8 @@ struct MappingContextTraits<pdb::yaml::PdbTpiFieldListRecord,
     assert(IO.outputting());
     codeview::TypeVisitorCallbackPipeline Pipeline;
 
-    msf::ByteStream Data(Obj.Record.Data);
-    msf::StreamReader FieldReader(Data);
+    BinaryByteStream Data(Obj.Record.Data, llvm::support::little);
+    BinaryStreamReader FieldReader(Data);
     codeview::FieldListDeserializer Deserializer(FieldReader);
 
     // For PDB to Yaml, deserialize into a high level record type, then dump
diff --git a/tools/llvm-pdbdump/fuzzer/llvm-pdbdump-fuzzer.cpp b/tools/llvm-pdbdump/fuzzer/llvm-pdbdump-fuzzer.cpp
index e818dda32fc027320f7d5e0a3f6d0bc74407f290..38eaf16c65b05dae4ccbca9f3fe76f40e0ee65e9 100644
--- a/tools/llvm-pdbdump/fuzzer/llvm-pdbdump-fuzzer.cpp
+++ b/tools/llvm-pdbdump/fuzzer/llvm-pdbdump-fuzzer.cpp
@@ -13,7 +13,7 @@
 ///
 //===----------------------------------------------------------------------===//
 #include "llvm/ADT/STLExtras.h"
-#include "llvm/DebugInfo/CodeView/ByteStream.h"
+#include "llvm/DebugInfo/CodeView/BinaryByteStream.h"
 #include "llvm/DebugInfo/CodeView/SymbolDumper.h"
 #include "llvm/DebugInfo/CodeView/TypeDumper.h"
 #include "llvm/DebugInfo/PDB/Raw/DbiStream.h"
@@ -28,14 +28,15 @@
 using namespace llvm;
 
 namespace {
-// We need a class which behaves like an immutable ByteStream, but whose data
+// We need a class which behaves like an immutable BinaryByteStream, but whose
+// data
 // is backed by an llvm::MemoryBuffer.  It also needs to own the underlying
 // MemoryBuffer, so this simple adapter is a good way to achieve that.
-class InputByteStream : public codeview::ByteStream<false> {
+class InputByteStream : public codeview::BinaryByteStream<false> {
 public:
   explicit InputByteStream(std::unique_ptr<MemoryBuffer> Buffer)
-      : ByteStream(ArrayRef<uint8_t>(Buffer->getBuffer().bytes_begin(),
-                                     Buffer->getBuffer().bytes_end())),
+      : BinaryByteStream(ArrayRef<uint8_t>(Buffer->getBuffer().bytes_begin(),
+                                           Buffer->getBuffer().bytes_end())),
         MemBuffer(std::move(Buffer)) {}
 
   std::unique_ptr<MemoryBuffer> MemBuffer;
diff --git a/tools/llvm-pdbdump/llvm-pdbdump.cpp b/tools/llvm-pdbdump/llvm-pdbdump.cpp
index d076c445c2e9e73b255cfbc4700ec1f4a8983a4d..8a749bab98e8275b03f92def1ccfe000f728f141 100644
--- a/tools/llvm-pdbdump/llvm-pdbdump.cpp
+++ b/tools/llvm-pdbdump/llvm-pdbdump.cpp
@@ -7,13 +7,14 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// Dumps debug information present in PDB files.  This utility makes use of
-// the Microsoft Windows SDK, so will not compile or run on non-Windows
-// platforms.
+// Dumps debug information present in PDB files.
 //
 //===----------------------------------------------------------------------===//
 
 #include "llvm-pdbdump.h"
+
+#include "Analyze.h"
+#include "Diff.h"
 #include "LLVMOutputStyle.h"
 #include "LinePrinter.h"
 #include "OutputStyle.h"
@@ -29,7 +30,6 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/StringExtras.h"
 #include "llvm/Config/config.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
 #include "llvm/DebugInfo/MSF/MSFBuilder.h"
 #include "llvm/DebugInfo/PDB/GenericError.h"
 #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
@@ -39,6 +39,7 @@
 #include "llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h"
 #include "llvm/DebugInfo/PDB/Native/InfoStream.h"
 #include "llvm/DebugInfo/PDB/Native/InfoStreamBuilder.h"
+#include "llvm/DebugInfo/PDB/Native/ModInfoBuilder.h"
 #include "llvm/DebugInfo/PDB/Native/NativeSession.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFile.h"
 #include "llvm/DebugInfo/PDB/Native/PDBFileBuilder.h"
@@ -53,6 +54,7 @@
 #include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolFunc.h"
 #include "llvm/DebugInfo/PDB/PDBSymbolThunk.h"
+#include "llvm/Support/BinaryByteStream.h"
 #include "llvm/Support/COM.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/ConvertUTF.h"
@@ -79,6 +81,9 @@ cl::SubCommand RawSubcommand("raw", "Dump raw structure of the PDB file");
 cl::SubCommand
     PrettySubcommand("pretty",
                      "Dump semantic information about types and symbols");
+
+cl::SubCommand DiffSubcommand("diff", "Diff the contents of 2 PDB files");
+
 cl::SubCommand
     YamlToPdbSubcommand("yaml2pdb",
                         "Generate a PDB file from a YAML description");
@@ -86,6 +91,10 @@ cl::SubCommand
     PdbToYamlSubcommand("pdb2yaml",
                         "Generate a detailed YAML description of a PDB File");
 
+cl::SubCommand
+    AnalyzeSubcommand("analyze",
+                      "Analyze various aspects of a PDB's structure");
+
 cl::OptionCategory TypeCategory("Symbol Type Options");
 cl::OptionCategory FilterCategory("Filtering Options");
 cl::OptionCategory OtherOptions("Other Options");
@@ -103,8 +112,25 @@ cl::opt<bool> Globals("globals", cl::desc("Dump global symbols"),
                       cl::cat(TypeCategory), cl::sub(PrettySubcommand));
 cl::opt<bool> Externals("externals", cl::desc("Dump external symbols"),
                         cl::cat(TypeCategory), cl::sub(PrettySubcommand));
-cl::opt<bool> Types("types", cl::desc("Display types"), cl::cat(TypeCategory),
-                    cl::sub(PrettySubcommand));
+cl::opt<bool>
+    Types("types",
+          cl::desc("Display all types (implies -classes, -enums, -typedefs)"),
+          cl::cat(TypeCategory), cl::sub(PrettySubcommand));
+cl::opt<bool> Classes("classes", cl::desc("Display class types"),
+                      cl::cat(TypeCategory), cl::sub(PrettySubcommand));
+cl::opt<bool> Enums("enums", cl::desc("Display enum types"),
+                    cl::cat(TypeCategory), cl::sub(PrettySubcommand));
+cl::opt<bool> Typedefs("typedefs", cl::desc("Display typedef types"),
+                       cl::cat(TypeCategory), cl::sub(PrettySubcommand));
+cl::opt<ClassDefinitionFormat>
+    ClassFormat("class-definitions", cl::desc("Class definition format"),
+                cl::init(ClassDefinitionFormat::Standard),
+                cl::values(clEnumValN(ClassDefinitionFormat::Standard, "full",
+                                      "Display complete class definition"),
+                           clEnumValN(ClassDefinitionFormat::None, "none",
+                                      "Don't display class definitions")),
+                cl::cat(TypeCategory), cl::sub(PrettySubcommand));
+
 cl::opt<bool> Lines("lines", cl::desc("Line tables"), cl::cat(TypeCategory),
                     cl::sub(PrettySubcommand));
 cl::opt<bool>
@@ -115,6 +141,12 @@ cl::opt<uint64_t> LoadAddress(
     "load-address",
     cl::desc("Assume the module is loaded at the specified address"),
     cl::cat(OtherOptions), cl::sub(PrettySubcommand));
+cl::opt<bool> Native("native", cl::desc("Use native PDB reader instead of DIA"),
+                     cl::cat(OtherOptions), cl::sub(PrettySubcommand));
+cl::opt<cl::boolOrDefault>
+    ColorOutput("color-output",
+                cl::desc("Override use of color (default = isatty)"),
+                cl::cat(OtherOptions), cl::sub(PrettySubcommand));
 cl::list<std::string> ExcludeTypes(
     "exclude-types", cl::desc("Exclude types by regular expression"),
     cl::ZeroOrMore, cl::cat(FilterCategory), cl::sub(PrettySubcommand));
@@ -137,6 +169,10 @@ cl::list<std::string> IncludeCompilands(
     "include-compilands",
     cl::desc("Include only compilands those which match a regular expression"),
     cl::ZeroOrMore, cl::cat(FilterCategory), cl::sub(PrettySubcommand));
+cl::opt<bool> OnlyPaddingClasses(
+    "only-padding-classes", cl::desc("When dumping classes, only display those "
+                                     "with non-zero amounts of padding bytes"),
+    cl::ZeroOrMore, cl::cat(FilterCategory), cl::sub(PrettySubcommand));
 
 cl::opt<bool> ExcludeCompilerGenerated(
     "no-compiler-generated",
@@ -146,14 +182,23 @@ cl::opt<bool>
     ExcludeSystemLibraries("no-system-libs",
                            cl::desc("Don't show symbols from system libraries"),
                            cl::cat(FilterCategory), cl::sub(PrettySubcommand));
-cl::opt<bool> NoClassDefs("no-class-definitions",
-                          cl::desc("Don't display full class definitions"),
-                          cl::cat(FilterCategory), cl::sub(PrettySubcommand));
+
 cl::opt<bool> NoEnumDefs("no-enum-definitions",
                          cl::desc("Don't display full enum definitions"),
                          cl::cat(FilterCategory), cl::sub(PrettySubcommand));
 }
 
+namespace diff {
+cl::opt<bool> Pedantic(
+    "pedantic", cl::desc("Finds all differences (even structural ones "
+                         "that produce otherwise identical PDBs)"),
+    cl::sub(DiffSubcommand));
+// Positional inputs; main() exits unless exactly two files are given.
+cl::list<std::string> InputFilenames(
+    cl::Positional, cl::desc("<first> <second>"), cl::OneOrMore,
+    cl::sub(DiffSubcommand));
+} // namespace diff
+
 namespace raw {
 
 cl::OptionCategory MsfOptions("MSF Container Options");
@@ -271,6 +316,9 @@ cl::opt<bool>
                   cl::desc("Do not dump MSF file headers (you will not be able "
                            "to generate a fresh PDB from the resulting YAML)"),
                   cl::sub(PdbToYamlSubcommand), cl::init(false));
+cl::opt<bool> Minimal("minimal",
+                      cl::desc("Don't write fields with default values"),
+                      cl::sub(PdbToYamlSubcommand), cl::init(false));
 
 cl::opt<bool> StreamMetadata(
     "stream-metadata",
@@ -318,6 +366,14 @@ cl::list<std::string> InputFilename(cl::Positional,
                                     cl::desc("<input PDB file>"), cl::Required,
                                     cl::sub(PdbToYamlSubcommand));
 }
+
+namespace analyze {
+cl::opt<bool> StringTable("hash-collisions", cl::desc("Find hash collisions"),
+                          cl::sub(AnalyzeSubcommand), cl::init(false));
+cl::list<std::string> InputFilename(cl::Positional,
+                                    cl::desc("<input PDB file>"), cl::Required,
+                                    cl::sub(AnalyzeSubcommand));
+} // namespace analyze
 }
 
 static ExitOnError ExitOnErr;
@@ -337,13 +393,13 @@ static void yamlToPdb(StringRef Path) {
   llvm::yaml::Input In(Buffer->getBuffer());
   pdb::yaml::PdbObject YamlObj(Allocator);
   In >> YamlObj;
-  if (!YamlObj.Headers.hasValue())
-    ExitOnErr(make_error<GenericError>(generic_error_code::unspecified,
-                                       "Yaml does not contain MSF headers"));
 
   PDBFileBuilder Builder(Allocator);
 
-  ExitOnErr(Builder.initialize(YamlObj.Headers->SuperBlock.BlockSize));
+  uint32_t BlockSize = 4096;
+  if (YamlObj.Headers.hasValue())
+    BlockSize = YamlObj.Headers->SuperBlock.BlockSize;
+  ExitOnErr(Builder.initialize(BlockSize));
   // Add each of the reserved streams.  We ignore stream metadata in the
   // yaml, because we will reconstruct our own view of the streams.  For
   // example, the YAML may say that there were 20 streams in the original
@@ -359,53 +415,68 @@ static void yamlToPdb(StringRef Path) {
       Strings.insert(S);
   }
 
-  if (YamlObj.PdbStream.hasValue()) {
-    auto &InfoBuilder = Builder.getInfoBuilder();
-    InfoBuilder.setAge(YamlObj.PdbStream->Age);
-    InfoBuilder.setGuid(YamlObj.PdbStream->Guid);
-    InfoBuilder.setSignature(YamlObj.PdbStream->Signature);
-    InfoBuilder.setVersion(YamlObj.PdbStream->Version);
-  }
-
-  if (YamlObj.DbiStream.hasValue()) {
-    auto &DbiBuilder = Builder.getDbiBuilder();
-    DbiBuilder.setAge(YamlObj.DbiStream->Age);
-    DbiBuilder.setBuildNumber(YamlObj.DbiStream->BuildNumber);
-    DbiBuilder.setFlags(YamlObj.DbiStream->Flags);
-    DbiBuilder.setMachineType(YamlObj.DbiStream->MachineType);
-    DbiBuilder.setPdbDllRbld(YamlObj.DbiStream->PdbDllRbld);
-    DbiBuilder.setPdbDllVersion(YamlObj.DbiStream->PdbDllVersion);
-    DbiBuilder.setVersionHeader(YamlObj.DbiStream->VerHeader);
-    for (const auto &MI : YamlObj.DbiStream->ModInfos) {
-      ExitOnErr(DbiBuilder.addModuleInfo(MI.Obj, MI.Mod));
-      for (auto S : MI.SourceFiles)
-        ExitOnErr(DbiBuilder.addModuleSourceFile(MI.Mod, S));
+  pdb::yaml::PdbInfoStream DefaultInfoStream;
+  pdb::yaml::PdbDbiStream DefaultDbiStream;
+  pdb::yaml::PdbTpiStream DefaultTpiStream;
+
+  const auto &Info = YamlObj.PdbStream.getValueOr(DefaultInfoStream);
+
+  auto &InfoBuilder = Builder.getInfoBuilder();
+  InfoBuilder.setAge(Info.Age);
+  InfoBuilder.setGuid(Info.Guid);
+  InfoBuilder.setSignature(Info.Signature);
+  InfoBuilder.setVersion(Info.Version);
+  for (auto F : Info.Features)
+    InfoBuilder.addFeature(F);
+
+  const auto &Dbi = YamlObj.DbiStream.getValueOr(DefaultDbiStream);
+  auto &DbiBuilder = Builder.getDbiBuilder();
+  DbiBuilder.setAge(Dbi.Age);
+  DbiBuilder.setBuildNumber(Dbi.BuildNumber);
+  DbiBuilder.setFlags(Dbi.Flags);
+  DbiBuilder.setMachineType(Dbi.MachineType);
+  DbiBuilder.setPdbDllRbld(Dbi.PdbDllRbld);
+  DbiBuilder.setPdbDllVersion(Dbi.PdbDllVersion);
+  DbiBuilder.setVersionHeader(Dbi.VerHeader);
+  for (const auto &MI : Dbi.ModInfos) {
+    auto &ModiBuilder = ExitOnErr(DbiBuilder.addModuleInfo(MI.Mod));
+
+    for (auto S : MI.SourceFiles)
+      ExitOnErr(DbiBuilder.addModuleSourceFile(MI.Mod, S));
+    if (MI.Modi.hasValue()) {
+      const auto &ModiStream = *MI.Modi;
+      ModiBuilder.setObjFileName(MI.Obj);
+      for (auto Symbol : ModiStream.Symbols)
+        ModiBuilder.addSymbol(Symbol.Record);
     }
   }
 
-  if (YamlObj.TpiStream.hasValue()) {
-    auto &TpiBuilder = Builder.getTpiBuilder();
-    TpiBuilder.setVersionHeader(YamlObj.TpiStream->Version);
-    for (const auto &R : YamlObj.TpiStream->Records)
-      TpiBuilder.addTypeRecord(R.Record);
-  }
+  auto &TpiBuilder = Builder.getTpiBuilder();
+  const auto &Tpi = YamlObj.TpiStream.getValueOr(DefaultTpiStream);
+  TpiBuilder.setVersionHeader(Tpi.Version);
+  for (const auto &R : Tpi.Records)
+    TpiBuilder.addTypeRecord(R.Record.data(), R.Record.Hash);
 
-  if (YamlObj.IpiStream.hasValue()) {
-    auto &IpiBuilder = Builder.getIpiBuilder();
-    IpiBuilder.setVersionHeader(YamlObj.IpiStream->Version);
-    for (const auto &R : YamlObj.IpiStream->Records)
-      IpiBuilder.addTypeRecord(R.Record);
-  }
+  const auto &Ipi = YamlObj.IpiStream.getValueOr(DefaultTpiStream);
+  auto &IpiBuilder = Builder.getIpiBuilder();
+  IpiBuilder.setVersionHeader(Ipi.Version);
+  for (const auto &R : Ipi.Records) // IPI records go to the IPI builder.
+    IpiBuilder.addTypeRecord(R.Record.data(), R.Record.Hash);
 
   ExitOnErr(Builder.commit(opts::yaml2pdb::YamlPdbOutputFile));
 }
 
+// Opens Path with the native PDB reader (ExitOnErr aborts on failure) and
+// returns the PDBFile obtained from the session stored in Session.
+static PDBFile &loadPDB(StringRef Path, std::unique_ptr<IPDBSession> &Session) {
+  ExitOnErr(loadDataForPDB(PDB_ReaderType::Native, Path, Session));
+  return static_cast<NativeSession *>(Session.get())->getPDBFile();
+}
+
 static void pdb2Yaml(StringRef Path) {
   std::unique_ptr<IPDBSession> Session;
-  ExitOnErr(loadDataForPDB(PDB_ReaderType::Native, Path, Session));
+  auto &File = loadPDB(Path, Session);
 
-  NativeSession *RS = static_cast<NativeSession *>(Session.get());
-  PDBFile &File = RS->getPDBFile();
   auto O = llvm::make_unique<YAMLOutputStyle>(File);
   O = llvm::make_unique<YAMLOutputStyle>(File);
 
@@ -414,24 +485,48 @@ static void pdb2Yaml(StringRef Path) {
 
 static void dumpRaw(StringRef Path) {
   std::unique_ptr<IPDBSession> Session;
-  ExitOnErr(loadDataForPDB(PDB_ReaderType::Native, Path, Session));
+  auto &File = loadPDB(Path, Session);
 
-  NativeSession *RS = static_cast<NativeSession *>(Session.get());
-  PDBFile &File = RS->getPDBFile();
   auto O = llvm::make_unique<LLVMOutputStyle>(File);
 
   ExitOnErr(O->dump());
 }
 
+// Implements the `analyze` subcommand: loads the PDB at Path through
+// loadPDB() and runs AnalysisStyle::dump() on it via ExitOnErr.
+static void dumpAnalysis(StringRef Path) {
+  std::unique_ptr<IPDBSession> Session;
+  AnalysisStyle Style(loadPDB(Path, Session));
+  ExitOnErr(Style.dump());
+}
+
+// Loads both PDBs through loadPDB() and runs DiffStyle::dump() on the pair.
+static void diff(StringRef Path1, StringRef Path2) {
+  std::unique_ptr<IPDBSession> Session1;
+  PDBFile &File1 = loadPDB(Path1, Session1);
+
+  std::unique_ptr<IPDBSession> Session2;
+  PDBFile &File2 = loadPDB(Path2, Session2);
+
+  DiffStyle Style(File1, File2);
+  ExitOnErr(Style.dump());
+}
+
 static void dumpPretty(StringRef Path) {
   std::unique_ptr<IPDBSession> Session;
 
-  ExitOnErr(loadDataForPDB(PDB_ReaderType::DIA, Path, Session));
+  const auto ReaderType =
+      opts::pretty::Native ? PDB_ReaderType::Native : PDB_ReaderType::DIA;
+  ExitOnErr(loadDataForPDB(ReaderType, Path, Session));
 
   if (opts::pretty::LoadAddress)
     Session->setLoadAddress(opts::pretty::LoadAddress);
 
-  LinePrinter Printer(2, outs());
+  auto &Stream = outs();
+  const bool UseColor = opts::pretty::ColorOutput == cl::BOU_UNSET
+                            ? Stream.has_colors()
+                            : opts::pretty::ColorOutput == cl::BOU_TRUE;
+  LinePrinter Printer(2, UseColor, Stream);
 
   auto GlobalScope(Session->getGlobalScope());
   std::string FileName(GlobalScope->getSymbolsFileName());
@@ -481,7 +576,7 @@ static void dumpPretty(StringRef Path) {
     Printer.Unindent();
   }
 
-  if (opts::pretty::Types) {
+  if (opts::pretty::Classes || opts::pretty::Enums || opts::pretty::Typedefs) {
     Printer.NewLine();
     WithColor(Printer, PDB_ColorItem::SectionHeader).get() << "---TYPES---";
     Printer.Indent();
@@ -608,6 +703,8 @@ int main(int argc_, const char *argv_[]) {
     pdb2Yaml(opts::pdb2yaml::InputFilename.front());
   } else if (opts::YamlToPdbSubcommand) {
     yamlToPdb(opts::yaml2pdb::InputFilename.front());
+  } else if (opts::AnalyzeSubcommand) {
+    dumpAnalysis(opts::analyze::InputFilename.front());
   } else if (opts::PrettySubcommand) {
     if (opts::pretty::Lines)
       opts::pretty::Compilands = true;
@@ -621,6 +718,12 @@ int main(int argc_, const char *argv_[]) {
       opts::pretty::Lines = true;
     }
 
+    if (opts::pretty::Types) {
+      opts::pretty::Classes = true;
+      opts::pretty::Typedefs = true;
+      opts::pretty::Enums = true;
+    }
+
     // When adding filters for excluded compilands and types, we need to
     // remember that these are regexes.  So special characters such as * and \
     // need to be escaped in the regex.  In the case of a literal \, this means
@@ -642,6 +745,12 @@ int main(int argc_, const char *argv_[]) {
   } else if (opts::RawSubcommand) {
     std::for_each(opts::raw::InputFilenames.begin(),
                   opts::raw::InputFilenames.end(), dumpRaw);
+  } else if (opts::DiffSubcommand) {
+    if (opts::diff::InputFilenames.size() != 2) {
+      errs() << "diff subcommand expects exactly 2 arguments.\n";
+      exit(1);
+    }
+    diff(opts::diff::InputFilenames[0], opts::diff::InputFilenames[1]);
   }
 
   outs().flush();
diff --git a/tools/llvm-pdbdump/llvm-pdbdump.h b/tools/llvm-pdbdump/llvm-pdbdump.h
index d4f082cae7c4596c5d144b1cfda083e1c391a3c1..a335d30c4cf289e52d62b6dc4ee015ee186b3720 100644
--- a/tools/llvm-pdbdump/llvm-pdbdump.h
+++ b/tools/llvm-pdbdump/llvm-pdbdump.h
@@ -17,14 +17,18 @@
 namespace opts {
 
 namespace pretty {
+
+enum class ClassDefinitionFormat { None, Standard };
+
 extern llvm::cl::opt<bool> Compilands;
 extern llvm::cl::opt<bool> Symbols;
 extern llvm::cl::opt<bool> Globals;
-extern llvm::cl::opt<bool> Types;
+extern llvm::cl::opt<bool> Classes;
+extern llvm::cl::opt<bool> Enums;
+extern llvm::cl::opt<bool> Typedefs;
 extern llvm::cl::opt<bool> All;
 extern llvm::cl::opt<bool> ExcludeCompilerGenerated;
 
-extern llvm::cl::opt<bool> NoClassDefs;
 extern llvm::cl::opt<bool> NoEnumDefs;
 extern llvm::cl::list<std::string> ExcludeTypes;
 extern llvm::cl::list<std::string> ExcludeSymbols;
@@ -32,6 +36,8 @@ extern llvm::cl::list<std::string> ExcludeCompilands;
 extern llvm::cl::list<std::string> IncludeTypes;
 extern llvm::cl::list<std::string> IncludeSymbols;
 extern llvm::cl::list<std::string> IncludeCompilands;
+extern llvm::cl::opt<bool> OnlyPaddingClasses;
+extern llvm::cl::opt<ClassDefinitionFormat> ClassFormat;
 }
 
 namespace raw {
@@ -67,8 +73,13 @@ extern llvm::cl::opt<bool> DumpFpo;
 extern llvm::cl::opt<bool> DumpStringTable;
 }
 
+namespace diff {
+extern llvm::cl::opt<bool> Pedantic;
+}
+
 namespace pdb2yaml {
 extern llvm::cl::opt<bool> NoFileHeaders;
+extern llvm::cl::opt<bool> Minimal;
 extern llvm::cl::opt<bool> StreamMetadata;
 extern llvm::cl::opt<bool> StreamDirectory;
 extern llvm::cl::opt<bool> StringTable;
diff --git a/tools/llvm-profdata/llvm-profdata.cpp b/tools/llvm-profdata/llvm-profdata.cpp
index 6715566a166c2915443db8821f3172a8945c730e..a257910ecf7766a80b2a1d6e9652519ffe645cce 100644
--- a/tools/llvm-profdata/llvm-profdata.cpp
+++ b/tools/llvm-profdata/llvm-profdata.cpp
@@ -446,8 +446,58 @@ static int merge_main(int argc, const char *argv[]) {
   return 0;
 }
 
+// Running totals for one value-profile kind, accumulated across functions.
+struct ValueSitesStats {
+  uint64_t TotalNumValueSites = 0;
+  // Counts only the sites that recorded at least one value.
+  uint64_t TotalNumValueSitesWithValueProfile = 0;
+  uint64_t TotalNumValues = 0;
+  // Entry K - 1 is the number of sites that recorded exactly K values.
+  std::vector<unsigned> ValueSitesHistogram;
+};
+
+static void traverseAllValueSites(const InstrProfRecord &Func, uint32_t VK,
+                                  ValueSitesStats &Stats, raw_fd_ostream &OS,
+                                  InstrProfSymtab *Symtab) {
+  const uint32_t NumSites = Func.getNumValueSites(VK);
+  Stats.TotalNumValueSites += NumSites;
+  for (size_t Site = 0; Site != NumSites; ++Site) {
+    const uint32_t NumValues = Func.getNumValueDataForSite(VK, Site);
+    auto Values = Func.getValueForSite(VK, Site);
+    Stats.TotalNumValues += NumValues;
+    if (NumValues != 0) { // Only sites with data feed the histogram.
+      ++Stats.TotalNumValueSitesWithValueProfile;
+      if (Stats.ValueSitesHistogram.size() < NumValues)
+        Stats.ValueSitesHistogram.resize(NumValues, 0);
+      ++Stats.ValueSitesHistogram[NumValues - 1]; // Bucket K-1: K values.
+    }
+    for (uint32_t V = 0; V != NumValues; ++V) {
+      OS << "\t[ " << Site << ", ";
+      if (Symtab) // Resolve through the symtab when one is given.
+        OS << Symtab->getFuncName(Values[V].Value);
+      else
+        OS << Values[V].Value;
+      OS << ", " << Values[V].Count << " ]\n";
+    }
+  }
+}
+
+// Prints Stats' totals and non-empty histogram buckets; VK is unused here.
+static void showValueSitesStats(raw_fd_ostream &OS, uint32_t VK,
+                                ValueSitesStats &Stats) {
+  OS << "  Total number of sites: " << Stats.TotalNumValueSites << "\n"
+     << "  Total number of sites with values: "
+     << Stats.TotalNumValueSitesWithValueProfile << "\n"
+     << "  Total number of profiled values: " << Stats.TotalNumValues << "\n"
+     << "  Value sites histogram:\n\tNumTargets, SiteCount\n";
+  const auto &Histogram = Stats.ValueSitesHistogram;
+  for (unsigned I = 0; I < Histogram.size(); ++I)
+    if (Histogram[I] != 0)
+      OS << "\t" << I + 1 << ", " << Histogram[I] << "\n";
+}
+
 static int showInstrProfile(const std::string &Filename, bool ShowCounts,
-                            bool ShowIndirectCallTargets,
+                            bool ShowIndirectCallTargets, bool ShowMemOPSizes,
                             bool ShowDetailedSummary,
                             std::vector<uint32_t> DetailedSummaryCutoffs,
                             bool ShowAllFunctions,
@@ -465,10 +515,8 @@ static int showInstrProfile(const std::string &Filename, bool ShowCounts,
   auto Reader = std::move(ReaderOrErr.get());
   bool IsIRInstr = Reader->isIRLevelProfile();
   size_t ShownFunctions = 0;
-  uint64_t TotalNumValueSites = 0;
-  uint64_t TotalNumValueSitesWithValueProfile = 0;
-  uint64_t TotalNumValues = 0;
-  std::vector<unsigned> ICHistogram;
+  int NumVPKind = IPVK_Last - IPVK_First + 1;
+  std::vector<ValueSitesStats> VPStats(NumVPKind);
   for (const auto &Func : *Reader) {
     bool Show =
         ShowAllFunctions || (!ShowFunction.empty() &&
@@ -502,6 +550,11 @@ static int showInstrProfile(const std::string &Filename, bool ShowCounts,
         OS << "    Indirect Call Site Count: "
            << Func.getNumValueSites(IPVK_IndirectCallTarget) << "\n";
 
+      uint32_t NumMemOPCalls = Func.getNumValueSites(IPVK_MemOPSize);
+      if (ShowMemOPSizes && NumMemOPCalls > 0)
+        OS << "    Number of Memory Intrinsics Calls: " << NumMemOPCalls
+           << "\n";
+
       if (ShowCounts) {
         OS << "    Block counts: [";
         size_t Start = (IsIRInstr ? 0 : 1);
@@ -512,27 +565,16 @@ static int showInstrProfile(const std::string &Filename, bool ShowCounts,
       }
 
       if (ShowIndirectCallTargets) {
-        InstrProfSymtab &Symtab = Reader->getSymtab();
-        uint32_t NS = Func.getNumValueSites(IPVK_IndirectCallTarget);
-        OS << "    Indirect Target Results: \n";
-        TotalNumValueSites += NS;
-        for (size_t I = 0; I < NS; ++I) {
-          uint32_t NV = Func.getNumValueDataForSite(IPVK_IndirectCallTarget, I);
-          std::unique_ptr<InstrProfValueData[]> VD =
-              Func.getValueForSite(IPVK_IndirectCallTarget, I);
-          TotalNumValues += NV;
-          if (NV) {
-            TotalNumValueSitesWithValueProfile++;
-            if (NV > ICHistogram.size())
-              ICHistogram.resize(NV, 0);
-            ICHistogram[NV - 1]++;
-          }
-          for (uint32_t V = 0; V < NV; V++) {
-            OS << "\t[ " << I << ", ";
-            OS << Symtab.getFuncName(VD[V].Value) << ", " << VD[V].Count
-               << " ]\n";
-          }
-        }
+        OS << "    Indirect Target Results:\n";
+        traverseAllValueSites(Func, IPVK_IndirectCallTarget,
+                              VPStats[IPVK_IndirectCallTarget], OS,
+                              &(Reader->getSymtab()));
+      }
+
+      if (ShowMemOPSizes && NumMemOPCalls > 0) {
+        OS << "    Memory Instrinsic Size Results:\n";
+        traverseAllValueSites(Func, IPVK_MemOPSize, VPStats[IPVK_MemOPSize], OS,
+                              nullptr);
       }
     }
   }
@@ -547,17 +589,16 @@ static int showInstrProfile(const std::string &Filename, bool ShowCounts,
   OS << "Total functions: " << PS->getNumFunctions() << "\n";
   OS << "Maximum function count: " << PS->getMaxFunctionCount() << "\n";
   OS << "Maximum internal block count: " << PS->getMaxInternalCount() << "\n";
+
   if (ShownFunctions && ShowIndirectCallTargets) {
-    OS << "Total Number of Indirect Call Sites : " << TotalNumValueSites
-       << "\n";
-    OS << "Total Number of Sites With Values : "
-       << TotalNumValueSitesWithValueProfile << "\n";
-    OS << "Total Number of Profiled Values : " << TotalNumValues << "\n";
-
-    OS << "IC Value histogram : \n\tNumTargets, SiteCount\n";
-    for (unsigned I = 0; I < ICHistogram.size(); I++) {
-      OS << "\t" << I + 1 << ", " << ICHistogram[I] << "\n";
-    }
+    OS << "Statistics for indirect call sites profile:\n";
+    showValueSitesStats(OS, IPVK_IndirectCallTarget,
+                        VPStats[IPVK_IndirectCallTarget]);
+  }
+
+  if (ShownFunctions && ShowMemOPSizes) {
+    OS << "Statistics for memory intrinsic calls sizes profile:\n";
+    showValueSitesStats(OS, IPVK_MemOPSize, VPStats[IPVK_MemOPSize]);
   }
 
   if (ShowDetailedSummary) {
@@ -608,6 +649,10 @@ static int show_main(int argc, const char *argv[]) {
   cl::opt<bool> ShowIndirectCallTargets(
       "ic-targets", cl::init(false),
       cl::desc("Show indirect call site target values for shown functions"));
+  cl::opt<bool> ShowMemOPSizes(
+      "memop-sizes", cl::init(false),
+      cl::desc("Show the profiled sizes of the memory intrinsic calls "
+               "for shown functions"));
   cl::opt<bool> ShowDetailedSummary("detailed-summary", cl::init(false),
                                     cl::desc("Show detailed profile summary"));
   cl::list<uint32_t> DetailedSummaryCutoffs(
@@ -646,8 +691,9 @@ static int show_main(int argc, const char *argv[]) {
                                 DetailedSummaryCutoffs.end());
   if (ProfileKind == instr)
     return showInstrProfile(Filename, ShowCounts, ShowIndirectCallTargets,
-                            ShowDetailedSummary, DetailedSummaryCutoffs,
-                            ShowAllFunctions, ShowFunction, TextFormat, OS);
+                            ShowMemOPSizes, ShowDetailedSummary,
+                            DetailedSummaryCutoffs, ShowAllFunctions,
+                            ShowFunction, TextFormat, OS);
   else
     return showSampleProfile(Filename, ShowCounts, ShowAllFunctions,
                              ShowFunction, OS);
diff --git a/tools/llvm-readobj/CMakeLists.txt b/tools/llvm-readobj/CMakeLists.txt
index 5fd45a8cff688a4555a649b10aa85262d0e8a93b..0ad149538f635ce5c4d65811ea6eb021673bd020 100644
--- a/tools/llvm-readobj/CMakeLists.txt
+++ b/tools/llvm-readobj/CMakeLists.txt
@@ -4,6 +4,7 @@ set(LLVM_LINK_COMPONENTS
   Support
   DebugInfoCodeView
   DebugInfoMSF
+  DebugInfoPDB
   )
 
 add_llvm_tool(llvm-readobj
diff --git a/tools/llvm-readobj/COFFDumper.cpp b/tools/llvm-readobj/COFFDumper.cpp
index c83655fe4d223ea9cf1a824df29643e410a398d5..9836c137ed2ca862240315c738538e869a6501cf 100644
--- a/tools/llvm-readobj/COFFDumper.cpp
+++ b/tools/llvm-readobj/COFFDumper.cpp
@@ -35,14 +35,15 @@
 #include "llvm/DebugInfo/CodeView/TypeRecord.h"
 #include "llvm/DebugInfo/CodeView/TypeStreamMerger.h"
 #include "llvm/DebugInfo/CodeView/TypeTableBuilder.h"
-#include "llvm/DebugInfo/MSF/ByteStream.h"
 #include "llvm/Object/COFF.h"
 #include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/BinaryByteStream.h"
 #include "llvm/Support/COFF.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Compiler.h"
 #include "llvm/Support/DataExtractor.h"
 #include "llvm/Support/Format.h"
+#include "llvm/Support/Path.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/Win64EH.h"
@@ -55,7 +56,6 @@
 using namespace llvm;
 using namespace llvm::object;
 using namespace llvm::codeview;
-using namespace llvm::msf;
 using namespace llvm::support;
 using namespace llvm::Win64EH;
 
@@ -79,7 +79,8 @@ public:
   void printCOFFBaseReloc() override;
   void printCOFFDebugDirectory() override;
   void printCodeViewDebugInfo() override;
-  void mergeCodeViewTypes(llvm::codeview::TypeTableBuilder &CVTypes) override;
+  void mergeCodeViewTypes(llvm::codeview::TypeTableBuilder &CVIDs,
+                          llvm::codeview::TypeTableBuilder &CVTypes) override;
   void printStackMap() const override;
 private:
   void printSymbol(const SymbolRef &Sym);
@@ -154,7 +155,7 @@ public:
     Sec = Obj->getCOFFSection(SR);
   }
 
-  uint32_t getRecordOffset(msf::StreamReader Reader) override {
+  uint32_t getRecordOffset(BinaryStreamReader Reader) override {
     ArrayRef<uint8_t> Data;
     if (auto EC = Reader.readLongestContiguousChunk(Data)) {
       llvm::consumeError(std::move(EC));
@@ -840,8 +841,8 @@ void COFFDumper::printCodeViewSymbolSection(StringRef SectionName,
     }
     case ModuleSubstreamKind::FrameData: {
       // First four bytes is a relocation against the function.
-      msf::ByteStream S(Contents);
-      msf::StreamReader SR(S);
+      BinaryByteStream S(Contents, llvm::support::little);
+      BinaryStreamReader SR(S);
       const uint32_t *CodePtr;
       error(SR.readObject(CodePtr));
       StringRef LinkageName;
@@ -965,9 +966,9 @@ void COFFDumper::printCodeViewSymbolsSubsection(StringRef Subsection,
 
   CVSymbolDumper CVSD(W, TypeDB, std::move(CODD),
                       opts::CodeViewSubsectionBytes);
-  ByteStream Stream(BinaryData);
+  BinaryByteStream Stream(BinaryData, llvm::support::little);
   CVSymbolArray Symbols;
-  StreamReader Reader(Stream);
+  BinaryStreamReader Reader(Stream);
   if (auto EC = Reader.readArray(Symbols, Reader.getLength())) {
     consumeError(std::move(EC));
     W.flush();
@@ -982,8 +983,8 @@ void COFFDumper::printCodeViewSymbolsSubsection(StringRef Subsection,
 }
 
 void COFFDumper::printCodeViewFileChecksums(StringRef Subsection) {
-  msf::ByteStream S(Subsection);
-  msf::StreamReader SR(S);
+  BinaryByteStream S(Subsection, llvm::support::little);
+  BinaryStreamReader SR(S);
   while (!SR.empty()) {
     DictScope S(W, "FileChecksum");
     const FileChecksum *FC;
@@ -1011,8 +1012,8 @@ void COFFDumper::printCodeViewFileChecksums(StringRef Subsection) {
 }
 
 void COFFDumper::printCodeViewInlineeLines(StringRef Subsection) {
-  msf::ByteStream S(Subsection);
-  msf::StreamReader SR(S);
+  BinaryByteStream S(Subsection, llvm::support::little);
+  BinaryStreamReader SR(S);
   uint32_t Signature;
   error(SR.readInteger(Signature));
   bool HasExtraFiles = Signature == unsigned(InlineeLinesSignature::ExtraFiles);
@@ -1064,7 +1065,8 @@ void COFFDumper::printFileNameForOffset(StringRef Label, uint32_t FileOffset) {
   W.printHex(Label, getFileNameForFileOffset(FileOffset), FileOffset);
 }
 
-void COFFDumper::mergeCodeViewTypes(TypeTableBuilder &CVTypes) {
+void COFFDumper::mergeCodeViewTypes(TypeTableBuilder &CVIDs,
+                                    TypeTableBuilder &CVTypes) {
   for (const SectionRef &S : Obj->sections()) {
     StringRef SectionName;
     error(S.getName(SectionName));
@@ -1077,17 +1079,17 @@ void COFFDumper::mergeCodeViewTypes(TypeTableBuilder &CVTypes) {
         error(object_error::parse_failed);
       ArrayRef<uint8_t> Bytes(reinterpret_cast<const uint8_t *>(Data.data()),
                               Data.size());
-      ByteStream Stream(Bytes);
+      BinaryByteStream Stream(Bytes, llvm::support::little);
       CVTypeArray Types;
-      StreamReader Reader(Stream);
+      BinaryStreamReader Reader(Stream);
       if (auto EC = Reader.readArray(Types, Reader.getLength())) {
         consumeError(std::move(EC));
         W.flush();
         error(object_error::parse_failed);
       }
 
-      if (!mergeTypeStreams(CVTypes, Types))
-        return error(object_error::parse_failed);
+      if (auto EC = mergeTypeStreams(CVIDs, CVTypes, nullptr, Types))
+        return error(std::move(EC));
     }
   }
 }
@@ -1435,12 +1437,18 @@ void COFFDumper::printCOFFImports() {
     StringRef Name;
     error(I.getName(Name));
     W.printString("Name", Name);
-    uint32_t Addr;
-    error(I.getImportLookupTableRVA(Addr));
-    W.printHex("ImportLookupTableRVA", Addr);
-    error(I.getImportAddressTableRVA(Addr));
-    W.printHex("ImportAddressTableRVA", Addr);
-    printImportedSymbols(I.imported_symbols());
+    uint32_t ILTAddr;
+    error(I.getImportLookupTableRVA(ILTAddr));
+    W.printHex("ImportLookupTableRVA", ILTAddr);
+    uint32_t IATAddr;
+    error(I.getImportAddressTableRVA(IATAddr));
+    W.printHex("ImportAddressTableRVA", IATAddr);
+    // The import lookup table can be missing with certain older linkers, so
+    // fall back to the import address table in that case.
+    if (ILTAddr)
+      printImportedSymbols(I.lookup_table_symbols());
+    else
+      printImportedSymbols(I.imported_symbols());
   }
 
   // Delay imports
@@ -1549,20 +1557,43 @@ void COFFDumper::printStackMap() const {
 }
 
 void llvm::dumpCodeViewMergedTypes(ScopedPrinter &Writer,
+                                   llvm::codeview::TypeTableBuilder &IDTable,
                                    llvm::codeview::TypeTableBuilder &CVTypes) {
   // Flatten it first, then run our dumper on it.
-  ListScope S(Writer, "MergedTypeStream");
-  SmallString<0> Buf;
+  SmallString<0> TypeBuf;
   CVTypes.ForEachRecord([&](TypeIndex TI, ArrayRef<uint8_t> Record) {
-    Buf.append(Record.begin(), Record.end());
+    TypeBuf.append(Record.begin(), Record.end());
   });
 
   TypeDatabase TypeDB;
-  CVTypeDumper CVTD(TypeDB);
-  TypeDumpVisitor TDV(TypeDB, &Writer, opts::CodeViewSubsectionBytes);
-  if (auto EC =
-          CVTD.dump({Buf.str().bytes_begin(), Buf.str().bytes_end()}, TDV)) {
-    Writer.flush();
-    error(llvm::errorToErrorCode(std::move(EC)));
+  {
+    ListScope S(Writer, "MergedTypeStream");
+    CVTypeDumper CVTD(TypeDB);
+    TypeDumpVisitor TDV(TypeDB, &Writer, opts::CodeViewSubsectionBytes);
+    if (auto EC = CVTD.dump(
+            {TypeBuf.str().bytes_begin(), TypeBuf.str().bytes_end()}, TDV)) {
+      Writer.flush();
+      error(std::move(EC));
+    }
+  }
+
+  // Flatten the id stream and print it next. The ID stream refers to names from
+  // the type stream.
+  SmallString<0> IDBuf;
+  IDTable.ForEachRecord([&](TypeIndex TI, ArrayRef<uint8_t> Record) {
+    IDBuf.append(Record.begin(), Record.end());
+  });
+
+  {
+    ListScope S(Writer, "MergedIDStream");
+    TypeDatabase IDDB;
+    CVTypeDumper CVTD(IDDB);
+    TypeDumpVisitor TDV(TypeDB, &Writer, opts::CodeViewSubsectionBytes);
+    TDV.setItemDB(IDDB);
+    if (auto EC = CVTD.dump(
+            {IDBuf.str().bytes_begin(), IDBuf.str().bytes_end()}, TDV)) {
+      Writer.flush();
+      error(std::move(EC));
+    }
   }
 }
diff --git a/tools/llvm-readobj/ELFDumper.cpp b/tools/llvm-readobj/ELFDumper.cpp
index eda3390b0f58a0f4cf51bcfc0575eb56da313253..7893eea5d2209e9c43416dfe3ba67c5292fc7221 100644
--- a/tools/llvm-readobj/ELFDumper.cpp
+++ b/tools/llvm-readobj/ELFDumper.cpp
@@ -129,7 +129,7 @@ public:
   void printMipsReginfo() override;
   void printMipsOptions() override;
 
-  void printAMDGPURuntimeMD() override;
+  void printAMDGPUCodeObjectMetadata() override;
 
   void printStackMap() const override;
 
@@ -1003,6 +1003,7 @@ static const char *getElfSectionType(unsigned Arch, unsigned Type) {
     LLVM_READOBJ_ENUM_CASE(ELF, SHT_MIPS_REGINFO);
     LLVM_READOBJ_ENUM_CASE(ELF, SHT_MIPS_OPTIONS);
     LLVM_READOBJ_ENUM_CASE(ELF, SHT_MIPS_ABIFLAGS);
+    LLVM_READOBJ_ENUM_CASE(ELF, SHT_MIPS_DWARF);
     }
   }
 
@@ -2356,7 +2357,7 @@ template <class ELFT> void ELFDumper<ELFT>::printMipsOptions() {
   }
 }
 
-template <class ELFT> void ELFDumper<ELFT>::printAMDGPURuntimeMD() {
+template <class ELFT> void ELFDumper<ELFT>::printAMDGPUCodeObjectMetadata() {
   const Elf_Shdr *Shdr = findSectionByName(*Obj, ".note");
   if (!Shdr) {
     W.startLine() << "There is no .note section in the file.\n";
@@ -2364,7 +2365,7 @@ template <class ELFT> void ELFDumper<ELFT>::printAMDGPURuntimeMD() {
   }
   ArrayRef<uint8_t> Sec = unwrapOrError(Obj->getSectionContents(Shdr));
 
-  const uint32_t RuntimeMDNoteType = 8;
+  const uint32_t CodeObjectMetadataNoteType = 10;
   for (auto I = reinterpret_cast<const Elf_Word *>(&Sec[0]),
        E = I + Sec.size()/4; I != E;) {
     uint32_t NameSZ = I[0];
@@ -2378,7 +2379,7 @@ template <class ELFT> void ELFDumper<ELFT>::printAMDGPURuntimeMD() {
       I += alignTo<4>(NameSZ)/4;
     }
 
-    if (Name == "AMD" && Type == RuntimeMDNoteType) {
+    if (Name == "AMD" && Type == CodeObjectMetadataNoteType) {
       StringRef Desc(reinterpret_cast<const char *>(I), DescSZ);
       W.printString(Desc);
     }
@@ -2627,6 +2628,8 @@ std::string getSectionTypeString(unsigned Arch, unsigned Type) {
       return "MIPS_OPTIONS";
     case SHT_MIPS_ABIFLAGS:
       return "MIPS_ABIFLAGS";
+    case SHT_MIPS_DWARF:
+      return "SHT_MIPS_DWARF";
     }
   }
   switch (Type) {
@@ -3337,9 +3340,38 @@ static std::string getGNUNoteTypeName(const uint32_t NT) {
   return string;
 }
 
+static std::string getFreeBSDNoteTypeName(const uint32_t NT) {
+  static const struct {
+    uint32_t ID;
+    const char *Name;
+  } Notes[] = {
+      {ELF::NT_FREEBSD_THRMISC, "NT_THRMISC (thrmisc structure)"},
+      {ELF::NT_FREEBSD_PROCSTAT_PROC, "NT_PROCSTAT_PROC (proc data)"},
+      {ELF::NT_FREEBSD_PROCSTAT_FILES, "NT_PROCSTAT_FILES (files data)"},
+      {ELF::NT_FREEBSD_PROCSTAT_VMMAP, "NT_PROCSTAT_VMMAP (vmmap data)"},
+      {ELF::NT_FREEBSD_PROCSTAT_GROUPS, "NT_PROCSTAT_GROUPS (groups data)"},
+      {ELF::NT_FREEBSD_PROCSTAT_UMASK, "NT_PROCSTAT_UMASK (umask data)"},
+      {ELF::NT_FREEBSD_PROCSTAT_RLIMIT, "NT_PROCSTAT_RLIMIT (rlimit data)"},
+      {ELF::NT_FREEBSD_PROCSTAT_OSREL, "NT_PROCSTAT_OSREL (osreldate data)"},
+      {ELF::NT_FREEBSD_PROCSTAT_PSSTRINGS,
+       "NT_PROCSTAT_PSSTRINGS (ps_strings data)"},
+      {ELF::NT_FREEBSD_PROCSTAT_AUXV, "NT_PROCSTAT_AUXV (auxv data)"},
+  };
+
+  for (const auto &Note : Notes)
+    if (Note.ID == NT)
+      return std::string(Note.Name);
+
+  std::string string;
+  raw_string_ostream OS(string);
+  OS << format("Unknown note type (0x%08x)", NT);
+  return string;
+}
+
 template <typename ELFT>
 static void printGNUNote(raw_ostream &OS, uint32_t NoteType,
-                         ArrayRef<typename ELFFile<ELFT>::Elf_Word> Words) {
+                         ArrayRef<typename ELFFile<ELFT>::Elf_Word> Words,
+                         size_t Size) {
   switch (NoteType) {
   default:
     return;
@@ -3362,16 +3394,14 @@ static void printGNUNote(raw_ostream &OS, uint32_t NoteType,
   }
   case ELF::NT_GNU_BUILD_ID: {
     OS << "    Build ID: ";
-    ArrayRef<uint8_t> ID(reinterpret_cast<const uint8_t *>(Words.data()),
-                         Words.size() * 4);
+    ArrayRef<uint8_t> ID(reinterpret_cast<const uint8_t *>(Words.data()), Size);
     for (const auto &B : ID)
       OS << format_hex_no_prefix(B, 2);
     break;
   }
   case ELF::NT_GNU_GOLD_VERSION:
     OS << "    Version: "
-       << StringRef(reinterpret_cast<const char *>(Words.data()),
-                    Words.size() * 4);
+       << StringRef(reinterpret_cast<const char *>(Words.data()), Size);
     break;
   }
 
@@ -3415,11 +3445,15 @@ void GNUStyle<ELFT>::printNotes(const ELFFile<ELFT> *Obj) {
 
       if (Name == "GNU") {
         OS << getGNUNoteTypeName(Type) << '\n';
-        printGNUNote<ELFT>(OS, Type, Descriptor);
+        printGNUNote<ELFT>(OS, Type, Descriptor, DescriptorSize);
+      } else if (Name == "FreeBSD") {
+        OS << getFreeBSDNoteTypeName(Type) << '\n';
+      } else {
+        OS << "Unknown note type: (" << format_hex(Type, 10) << ')';
       }
       OS << '\n';
 
-      P = P + 3 * sizeof(Elf_Word) * alignTo<4>(NameSize) +
+      P = P + 3 * sizeof(Elf_Word) + alignTo<4>(NameSize) +
           alignTo<4>(DescriptorSize);
     }
   };
diff --git a/tools/llvm-readobj/LLVMBuild.txt b/tools/llvm-readobj/LLVMBuild.txt
index 76dd436e21d6cdaa6902d8e8e6a746426d48d573..c0ed38e18d0c08bbf13395fedd8e0ef3831bb2ae 100644
--- a/tools/llvm-readobj/LLVMBuild.txt
+++ b/tools/llvm-readobj/LLVMBuild.txt
@@ -19,4 +19,4 @@
 type = Tool
 name = llvm-readobj
 parent = Tools
-required_libraries = all-targets BitReader Object DebugInfoCodeView DebugInfoMSF
+required_libraries = all-targets BitReader Object DebugInfoCodeView DebugInfoPDB DebugInfoMSF
diff --git a/tools/llvm-readobj/ObjDumper.h b/tools/llvm-readobj/ObjDumper.h
index 75090312b214f4b4d3462cacac0c75ef2b703ad8..ff780dae57843b2589ebaa5bd6f2c85d83e057bb 100644
--- a/tools/llvm-readobj/ObjDumper.h
+++ b/tools/llvm-readobj/ObjDumper.h
@@ -59,7 +59,7 @@ public:
   virtual void printMipsOptions() { }
 
   // Only implemented for AMDGPU ELF at this time.
-  virtual void printAMDGPURuntimeMD() {}
+  virtual void printAMDGPUCodeObjectMetadata() {}
 
   // Only implemented for PE/COFF.
   virtual void printCOFFImports() { }
@@ -68,7 +68,8 @@ public:
   virtual void printCOFFBaseReloc() { }
   virtual void printCOFFDebugDirectory() { }
   virtual void printCodeViewDebugInfo() { }
-  virtual void mergeCodeViewTypes(llvm::codeview::TypeTableBuilder &CVTypes) {}
+  virtual void mergeCodeViewTypes(llvm::codeview::TypeTableBuilder &CVIDs,
+                                  llvm::codeview::TypeTableBuilder &CVTypes) {}
 
   // Only implemented for MachO.
   virtual void printMachODataInCode() { }
@@ -103,7 +104,8 @@ std::error_code createWasmDumper(const object::ObjectFile *Obj,
 void dumpCOFFImportFile(const object::COFFImportFile *File);
 
 void dumpCodeViewMergedTypes(ScopedPrinter &Writer,
-                             llvm::codeview::TypeTableBuilder &CVTypes);
+                             llvm::codeview::TypeTableBuilder &IDTable,
+                             llvm::codeview::TypeTableBuilder &TypeTable);
 
 } // namespace llvm
 
diff --git a/tools/llvm-readobj/WasmDumper.cpp b/tools/llvm-readobj/WasmDumper.cpp
index 020a014220a816eaca9d3bf4b91fdf3f1052c667..5be090eb18cee6479198c0dc8585dfcedb4f5f57 100644
--- a/tools/llvm-readobj/WasmDumper.cpp
+++ b/tools/llvm-readobj/WasmDumper.cpp
@@ -55,14 +55,14 @@ public:
   void printSections() override {
     ListScope Group(W, "Sections");
     for (const SectionRef &Section : Obj->sections()) {
-      const wasm::WasmSection *WasmSec = Obj->getWasmSection(Section);
+      const WasmSection &WasmSec = Obj->getWasmSection(Section);
       DictScope SectionD(W, "Section");
-      const char *Type = wasmSectionTypeToString(WasmSec->Type);
-      W.printHex("Type", Type, WasmSec->Type);
-      W.printNumber("Size", (uint64_t)WasmSec->Content.size());
-      W.printNumber("Offset", WasmSec->Offset);
-      if (WasmSec->Type == wasm::WASM_SEC_CUSTOM) {
-        W.printString("Name", WasmSec->Name);
+      const char *Type = wasmSectionTypeToString(WasmSec.Type);
+      W.printHex("Type", Type, WasmSec.Type);
+      W.printNumber("Size", (uint64_t)WasmSec.Content.size());
+      W.printNumber("Offset", WasmSec.Offset);
+      if (WasmSec.Type == wasm::WASM_SEC_CUSTOM) {
+        W.printString("Name", WasmSec.Name);
       }
     }
   }
diff --git a/tools/llvm-readobj/llvm-readobj.cpp b/tools/llvm-readobj/llvm-readobj.cpp
index fffcd1790e3f635acb751eefd88757ccd8ec5d27..bc2a62e799ab00c18d4c3ce024dec3a70e1feecf 100644
--- a/tools/llvm-readobj/llvm-readobj.cpp
+++ b/tools/llvm-readobj/llvm-readobj.cpp
@@ -186,9 +186,10 @@ namespace opts {
   cl::opt<bool> MipsOptions("mips-options",
                             cl::desc("Display the MIPS .MIPS.options section"));
 
-  // -amdgpu-runtime-metadata
-  cl::opt<bool> AMDGPURuntimeMD("amdgpu-runtime-metadata",
-                                cl::desc("Display AMDGPU runtime metadata"));
+  // -amdgpu-code-object-metadata
+  cl::opt<bool> AMDGPUCodeObjectMetadata(
+      "amdgpu-code-object-metadata",
+      cl::desc("Display AMDGPU code object metadata"));
 
   // -coff-imports
   cl::opt<bool>
@@ -337,10 +338,12 @@ static bool isMipsArch(unsigned Arch) {
 }
 namespace {
 struct ReadObjTypeTableBuilder {
-  ReadObjTypeTableBuilder() : Allocator(), Builder(Allocator) {}
+  ReadObjTypeTableBuilder()
+      : Allocator(), IDTable(Allocator), TypeTable(Allocator) {}
 
   llvm::BumpPtrAllocator Allocator;
-  llvm::codeview::TypeTableBuilder Builder;
+  llvm::codeview::TypeTableBuilder IDTable;
+  llvm::codeview::TypeTableBuilder TypeTable;
 };
 }
 static ReadObjTypeTableBuilder CVTypes;
@@ -422,8 +425,8 @@ static void dumpObject(const ObjectFile *Obj) {
         Dumper->printMipsOptions();
     }
     if (Obj->getArch() == llvm::Triple::amdgcn)
-      if (opts::AMDGPURuntimeMD)
-        Dumper->printAMDGPURuntimeMD();
+      if (opts::AMDGPUCodeObjectMetadata)
+        Dumper->printAMDGPUCodeObjectMetadata();
     if (opts::SectionGroups)
       Dumper->printGroupSections();
     if (opts::HashHistogram)
@@ -445,7 +448,7 @@ static void dumpObject(const ObjectFile *Obj) {
     if (opts::CodeView)
       Dumper->printCodeViewDebugInfo();
     if (opts::CodeViewMergedTypes)
-      Dumper->mergeCodeViewTypes(CVTypes.Builder);
+      Dumper->mergeCodeViewTypes(CVTypes.IDTable, CVTypes.TypeTable);
   }
   if (Obj->isMachO()) {
     if (opts::MachODataInCode)
@@ -550,7 +553,7 @@ int main(int argc, const char *argv[]) {
 
   if (opts::CodeViewMergedTypes) {
     ScopedPrinter W(outs());
-    dumpCodeViewMergedTypes(W, CVTypes.Builder);
+    dumpCodeViewMergedTypes(W, CVTypes.IDTable, CVTypes.TypeTable);
   }
 
   return 0;
diff --git a/tools/llvm-shlib/CMakeLists.txt b/tools/llvm-shlib/CMakeLists.txt
index edadb82c3b435435878d509360c499adf8cbb650..c68a2b0e60eae7a019543ab51da4df1cafc2e828 100644
--- a/tools/llvm-shlib/CMakeLists.txt
+++ b/tools/llvm-shlib/CMakeLists.txt
@@ -37,7 +37,7 @@ endif()
 add_llvm_library(LLVM SHARED DISABLE_LLVM_LINK_LLVM_DYLIB SONAME ${SOURCES})
 
 list(REMOVE_DUPLICATES LIB_NAMES)
-if("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux" OR MINGW) # FIXME: It should be "GNU ld for elf"
+if(("${CMAKE_SYSTEM_NAME}" STREQUAL "Linux") OR (MINGW) OR ("${CMAKE_SYSTEM_NAME}" STREQUAL "FreeBSD") OR ("${CMAKE_SYSTEM_NAME}" STREQUAL "DragonFly")) # FIXME: It should be "GNU ld for elf"
   # GNU ld doesn't resolve symbols in the version script.
   set(LIB_NAMES -Wl,--whole-archive ${LIB_NAMES} -Wl,--no-whole-archive)
 elseif("${CMAKE_SYSTEM_NAME}" STREQUAL "Darwin")
diff --git a/tools/llvm-stress/llvm-stress.cpp b/tools/llvm-stress/llvm-stress.cpp
index 731a24d0ac2d2b58e895e30585313d0a314a93e8..74b7735f8cd15f568f01fdb848a6c0c36b5e03c2 100644
--- a/tools/llvm-stress/llvm-stress.cpp
+++ b/tools/llvm-stress/llvm-stress.cpp
@@ -28,6 +28,7 @@
 #include "llvm/Support/PrettyStackTrace.h"
 #include "llvm/Support/ToolOutputFile.h"
 #include <algorithm>
+#include <random>
 #include <vector>
 
 namespace llvm {
@@ -113,6 +114,12 @@ public:
     return  Rand64() % y;
   }
 
+  /// Make this like a C++11 random device
+  typedef uint32_t result_type;
+  uint32_t operator()() { return Rand32(); }
+  static constexpr result_type min() { return 0; }
+  static constexpr result_type max() { return 0x7ffff; }
+  
 private:
   unsigned Seed;
 };
@@ -417,7 +424,9 @@ struct AllocaModifier: public Modifier {
 
   void Act() override {
     Type *Tp = pickType();
-    PT->push_back(new AllocaInst(Tp, "A", BB->getFirstNonPHI()));
+    const DataLayout &DL = BB->getModule()->getDataLayout();
+    PT->push_back(new AllocaInst(Tp, DL.getAllocaAddrSpace(),
+                                 "A", BB->getFirstNonPHI()));
   }
 };
 
@@ -662,7 +671,7 @@ static void IntroduceControlFlow(Function *F, Random &R) {
       BoolInst.push_back(&Instr);
   }
 
-  std::random_shuffle(BoolInst.begin(), BoolInst.end(), R);
+  std::shuffle(BoolInst.begin(), BoolInst.end(), R);
 
   for (auto *Instr : BoolInst) {
     BasicBlock *Curr = Instr->getParent();
diff --git a/tools/llvm-symbolizer/llvm-symbolizer.cpp b/tools/llvm-symbolizer/llvm-symbolizer.cpp
index fc37dea4c484d1583f2cc9f6ee6aa80d5a8c09fc..c9e0cc2b3b05cf99fe1cb553b19857ab566c5d5e 100644
--- a/tools/llvm-symbolizer/llvm-symbolizer.cpp
+++ b/tools/llvm-symbolizer/llvm-symbolizer.cpp
@@ -85,6 +85,9 @@ static cl::opt<int> ClPrintSourceContextLines(
     "print-source-context-lines", cl::init(0),
     cl::desc("Print N number of source file context"));
 
+static cl::opt<bool> ClVerbose("verbose", cl::init(false),
+                               cl::desc("Print verbose line info"));
+
 template<typename T>
 static bool error(Expected<T> &ResOrErr) {
   if (ResOrErr)
@@ -160,7 +163,7 @@ int main(int argc, char **argv) {
   LLVMSymbolizer Symbolizer(Opts);
 
   DIPrinter Printer(outs(), ClPrintFunctions != FunctionNameKind::None,
-                    ClPrettyPrint, ClPrintSourceContextLines);
+                    ClPrettyPrint, ClPrintSourceContextLines, ClVerbose);
 
   const int kMaxInputStringLength = 1024;
   char InputString[kMaxInputStringLength];
diff --git a/tools/llvm-xray/CMakeLists.txt b/tools/llvm-xray/CMakeLists.txt
index 5f245f081d1b05625bed53b5656c266bc64ca1ab..3baf4e64e81cb81033f9c61299dbb81748d7289d 100644
--- a/tools/llvm-xray/CMakeLists.txt
+++ b/tools/llvm-xray/CMakeLists.txt
@@ -9,6 +9,7 @@ set(LLVM_LINK_COMPONENTS
 set(LLVM_XRAY_TOOLS
   func-id-helper.cc
   xray-account.cc
+  xray-color-helper.cc
   xray-converter.cc
   xray-extract.cc
   xray-extract.cc
diff --git a/tools/llvm-xray/llvm-xray.cc b/tools/llvm-xray/llvm-xray.cc
index ac5faaa408b504d504551eddf9358ebaa6b8e9c4..98303e7be15c0bc286da9f3b85f5c6d68145776b 100644
--- a/tools/llvm-xray/llvm-xray.cc
+++ b/tools/llvm-xray/llvm-xray.cc
@@ -30,12 +30,20 @@ int main(int argc, char *argv[]) {
                               "  This program consolidates multiple XRay trace "
                               "processing tools for convenient access.\n");
   for (auto *SC : cl::getRegisteredSubcommands()) {
-    if (*SC)
+    if (*SC) {
+      // If no subcommand was provided, we need to explicitly check if this is
+      // the top-level subcommand.
+      if (SC == &*cl::TopLevelSubCommand) {
+        cl::PrintHelpMessage(false, true);
+        return 0;
+      }
       if (auto C = dispatch(SC)) {
         ExitOnError("llvm-xray: ")(C());
         return 0;
       }
+    }
   }
 
+  // If all else fails, we still print the usage message.
   cl::PrintHelpMessage(false, true);
 }
diff --git a/tools/llvm-xray/xray-account.cc b/tools/llvm-xray/xray-account.cc
index 671a5a073eeccc4ba1cb01ae5f8cf16d733b25e9..13654c3911f7756d4b3da98003f9d03109a4032f 100644
--- a/tools/llvm-xray/xray-account.cc
+++ b/tools/llvm-xray/xray-account.cc
@@ -18,10 +18,10 @@
 #include <utility>
 
 #include "xray-account.h"
-#include "xray-extract.h"
 #include "xray-registry.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormatVariadic.h"
+#include "llvm/XRay/InstrumentationMap.h"
 #include "llvm/XRay/Trace.h"
 
 using namespace llvm;
@@ -120,16 +120,6 @@ static cl::opt<std::string>
 static cl::alias AccountInstrMap2("m", cl::aliasopt(AccountInstrMap),
                                   cl::desc("Alias for -instr_map"),
                                   cl::sub(Account));
-static cl::opt<InstrumentationMapExtractor::InputFormats> InstrMapFormat(
-    "instr-map-format", cl::desc("format of instrumentation map"),
-    cl::values(clEnumValN(InstrumentationMapExtractor::InputFormats::ELF, "elf",
-                          "instrumentation map in an ELF header"),
-               clEnumValN(InstrumentationMapExtractor::InputFormats::YAML,
-                          "yaml", "instrumentation map in YAML")),
-    cl::sub(Account), cl::init(InstrumentationMapExtractor::InputFormats::ELF));
-static cl::alias InstrMapFormat2("t", cl::aliasopt(InstrMapFormat),
-                                 cl::desc("Alias for -instr-map-format"),
-                                 cl::sub(Account));
 
 namespace {
 
@@ -418,67 +408,63 @@ void LatencyAccountant::exportStatsAsCSV(raw_ostream &OS,
 using namespace llvm::xray;
 
 static CommandRegistration Unused(&Account, []() -> Error {
-  int Fd;
-  auto EC = sys::fs::openFileForRead(AccountInput, Fd);
-  if (EC)
-    return make_error<StringError>(
-        Twine("Cannot open file '") + AccountInput + "'", EC);
-
-  Error Err = Error::success();
-  xray::InstrumentationMapExtractor Extractor(AccountInstrMap, InstrMapFormat,
-                                              Err);
-  if (auto E = handleErrors(
-        std::move(Err), [&](std::unique_ptr<StringError> SE) -> Error {
-          if (SE->convertToErrorCode() == std::errc::no_such_file_or_directory)
-            return Error::success();
-          return Error(std::move(SE));
-        }))
-    return E;
+  InstrumentationMap Map;
+  if (!AccountInstrMap.empty()) {
+    auto InstrumentationMapOrError = loadInstrumentationMap(AccountInstrMap);
+    if (!InstrumentationMapOrError)
+      return joinErrors(make_error<StringError>(
+                            Twine("Cannot open instrumentation map '") +
+                                AccountInstrMap + "'",
+                            std::make_error_code(std::errc::invalid_argument)),
+                        InstrumentationMapOrError.takeError());
+    Map = std::move(*InstrumentationMapOrError);
+  }
 
+  std::error_code EC;
   raw_fd_ostream OS(AccountOutput, EC, sys::fs::OpenFlags::F_Text);
   if (EC)
     return make_error<StringError>(
         Twine("Cannot open file '") + AccountOutput + "' for writing.", EC);
 
-  const auto &FunctionAddresses = Extractor.getFunctionAddresses();
+  const auto &FunctionAddresses = Map.getFunctionAddresses();
   symbolize::LLVMSymbolizer::Options Opts(
       symbolize::FunctionNameKind::LinkageName, true, true, false, "");
   symbolize::LLVMSymbolizer Symbolizer(Opts);
   llvm::xray::FuncIdConversionHelper FuncIdHelper(AccountInstrMap, Symbolizer,
                                                   FunctionAddresses);
   xray::LatencyAccountant FCA(FuncIdHelper, AccountDeduceSiblingCalls);
-  if (auto TraceOrErr = loadTraceFile(AccountInput)) {
-    auto &T = *TraceOrErr;
-    for (const auto &Record : T) {
-      if (FCA.accountRecord(Record))
-        continue;
-      for (const auto &ThreadStack : FCA.getPerThreadFunctionStack()) {
-        errs() << "Thread ID: " << ThreadStack.first << "\n";
-        auto Level = ThreadStack.second.size();
-        for (const auto &Entry : llvm::reverse(ThreadStack.second))
-          errs() << "#" << Level-- << "\t"
-                 << FuncIdHelper.SymbolOrNumber(Entry.first) << '\n';
-      }
-      if (!AccountKeepGoing)
-        return make_error<StringError>(
-            Twine("Failed accounting function calls in file '") + AccountInput +
-                "'.",
-            std::make_error_code(std::errc::executable_format_error));
-    }
-    switch (AccountOutputFormat) {
-    case AccountOutputFormats::TEXT:
-      FCA.exportStatsAsText(OS, T.getFileHeader());
-      break;
-    case AccountOutputFormats::CSV:
-      FCA.exportStatsAsCSV(OS, T.getFileHeader());
-      break;
-    }
-  } else {
+  auto TraceOrErr = loadTraceFile(AccountInput);
+  if (!TraceOrErr)
     return joinErrors(
         make_error<StringError>(
             Twine("Failed loading input file '") + AccountInput + "'",
             std::make_error_code(std::errc::executable_format_error)),
         TraceOrErr.takeError());
+
+  auto &T = *TraceOrErr;
+  for (const auto &Record : T) {
+    if (FCA.accountRecord(Record))
+      continue;
+    for (const auto &ThreadStack : FCA.getPerThreadFunctionStack()) {
+      errs() << "Thread ID: " << ThreadStack.first << "\n";
+      auto Level = ThreadStack.second.size();
+      for (const auto &Entry : llvm::reverse(ThreadStack.second))
+        errs() << "#" << Level-- << "\t"
+               << FuncIdHelper.SymbolOrNumber(Entry.first) << '\n';
+    }
+    if (!AccountKeepGoing)
+      return make_error<StringError>(
+          Twine("Failed accounting function calls in file '") + AccountInput +
+              "'.",
+          std::make_error_code(std::errc::executable_format_error));
+  }
+  switch (AccountOutputFormat) {
+  case AccountOutputFormats::TEXT:
+    FCA.exportStatsAsText(OS, T.getFileHeader());
+    break;
+  case AccountOutputFormats::CSV:
+    FCA.exportStatsAsCSV(OS, T.getFileHeader());
+    break;
   }
 
   return Error::success();
diff --git a/tools/llvm-xray/xray-color-helper.cc b/tools/llvm-xray/xray-color-helper.cc
new file mode 100644
index 0000000000000000000000000000000000000000..925bb7483d8f0227347df1ad5846e9fabb5c0da9
--- /dev/null
+++ b/tools/llvm-xray/xray-color-helper.cc
@@ -0,0 +1,198 @@
+//===-- xray-color-helper.cc - XRay Color Helper --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A class to get a color from a specified gradient.
+//
+//===----------------------------------------------------------------------===//
+#include <algorithm>
+
+#include "xray-color-helper.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+using namespace xray;
+
+//  Sequential ColorMaps, which are used to represent information
+//  from some minimum to some maximum.
+
+static const std::tuple<uint8_t, uint8_t, uint8_t> SequentialMaps[][9] = {
+    {// The greys color scheme from http://colorbrewer2.org/
+     std::make_tuple(255, 255, 255), std::make_tuple(240, 240, 240),
+     std::make_tuple(217, 217, 217), std::make_tuple(189, 189, 189),
+     std::make_tuple(150, 150, 150), std::make_tuple(115, 115, 115),
+     std::make_tuple(82, 82, 82), std::make_tuple(37, 37, 37),
+     std::make_tuple(0, 0, 0)},
+    {// The OrRd color scheme from http://colorbrewer2.org/
+     std::make_tuple(255, 247, 236), std::make_tuple(254, 232, 200),
+     std::make_tuple(253, 212, 158), std::make_tuple(253, 187, 132),
+     std::make_tuple(252, 141, 89), std::make_tuple(239, 101, 72),
+     std::make_tuple(215, 48, 31), std::make_tuple(179, 0, 0),
+     std::make_tuple(127, 0, 0)},
+    {// The PuBu color scheme from http://colorbrewer2.org/
+     std::make_tuple(255, 247, 251), std::make_tuple(236, 231, 242),
+     std::make_tuple(208, 209, 230), std::make_tuple(166, 189, 219),
+     std::make_tuple(116, 169, 207), std::make_tuple(54, 144, 192),
+     std::make_tuple(5, 112, 176), std::make_tuple(4, 90, 141),
+     std::make_tuple(2, 56, 88)}};
+
+ColorHelper::ColorHelper(ColorHelper::SequentialScheme S)
+    : MinIn(0.0), MaxIn(1.0), ColorMap(SequentialMaps[static_cast<int>(S)]) {}
+
+// Diverging ColorMaps, which are used to represent information
+// representing differences, or a range that goes from negative to positive.
+// These take an input in the range [-1,1].
+
+static const std::tuple<uint8_t, uint8_t, uint8_t> DivergingCoeffs[][11] = {
+    {// The PiYG color scheme from http://colorbrewer2.org/
+     std::make_tuple(142, 1, 82), std::make_tuple(197, 27, 125),
+     std::make_tuple(222, 119, 174), std::make_tuple(241, 182, 218),
+     std::make_tuple(253, 224, 239), std::make_tuple(247, 247, 247),
+     std::make_tuple(230, 245, 208), std::make_tuple(184, 225, 134),
+     std::make_tuple(127, 188, 65), std::make_tuple(77, 146, 33),
+     std::make_tuple(39, 100, 25)}};
+
+ColorHelper::ColorHelper(ColorHelper::DivergingScheme S)
+    : MinIn(-1.0), MaxIn(1.0), ColorMap(DivergingCoeffs[static_cast<int>(S)]) {}
+
+// Takes a tuple of uint8_ts representing a color in RGB and converts it to
+// HSV (hue in degrees, saturation and value in [0,1]) as a tuple of doubles.
+static std::tuple<double, double, double>
+convertToHSV(const std::tuple<uint8_t, uint8_t, uint8_t> &Color) {
+  double Scaled[3] = {std::get<0>(Color) / 255.0, std::get<1>(Color) / 255.0,
+                      std::get<2>(Color) / 255.0};
+  int Min = 0;
+  int Max = 0;
+  for (int i = 1; i < 3; ++i) {
+    if (Scaled[i] < Scaled[Min])
+      Min = i;
+    if (Scaled[i] > Scaled[Max])
+      Max = i;
+  }
+
+  double C = Scaled[Max] - Scaled[Min];
+  // Greys have zero chroma; use 0 for the hue term instead of computing 0 / 0.
+  double HPrime =
+      (C == 0) ? 0 : (Scaled[(Max + 1) % 3] - Scaled[(Max + 2) % 3]) / C;
+  HPrime = HPrime + 2.0 * Max;
+  double H = (HPrime < 0) ? (HPrime + 6.0) * 60
+                          : HPrime * 60; // Scale to between 0 and 360
+
+  double V = Scaled[Max];
+
+  double S = (V == 0.0) ? 0.0 : C / V;
+
+  return std::make_tuple(H, S, V);
+}
+
+// Takes a double precision number, clips it between 0 and 1 and then converts
+// that to an integer between 0x00 and 0xFF with proper rounding.
+static uint8_t unitIntervalTo8BitChar(double B) {
+  double n = std::max(std::min(B, 1.0), 0.0);
+  return static_cast<uint8_t>(255 * n + 0.5);
+}
+
+// Takes a tuple of doubles representing a color in HSV and converts them to
+// RGB represented as a tuple of uint8_ts
+static std::tuple<uint8_t, uint8_t, uint8_t>
+convertToRGB(const std::tuple<double, double, double> &Color) {
+  const double &H = std::get<0>(Color);
+  const double &S = std::get<1>(Color);
+  const double &V = std::get<2>(Color);
+
+  double C = V * S;
+
+  double HPrime = H / 60;
+  double X = C * (1 - std::abs(std::fmod(HPrime, 2.0) - 1));
+
+  double RGB1[3];
+  int HPrimeInt = static_cast<int>(HPrime);
+  if (HPrimeInt % 2 == 0) {
+    RGB1[(HPrimeInt / 2) % 3] = C;
+    RGB1[(HPrimeInt / 2 + 1) % 3] = X;
+    RGB1[(HPrimeInt / 2 + 2) % 3] = 0.0;
+  } else {
+    RGB1[(HPrimeInt / 2) % 3] = X;
+    RGB1[(HPrimeInt / 2 + 1) % 3] = C;
+    RGB1[(HPrimeInt / 2 + 2) % 3] = 0.0;
+  }
+
+  double Min = V - C;
+  double RGB2[3] = {RGB1[0] + Min, RGB1[1] + Min, RGB1[2] + Min};
+
+  return std::make_tuple(unitIntervalTo8BitChar(RGB2[0]),
+                         unitIntervalTo8BitChar(RGB2[1]),
+                         unitIntervalTo8BitChar(RGB2[2]));
+}
+
+// The Hue component of the HSV interpolation Routine
+static double interpolateHue(double H0, double H1, double T) {
+  double D = H1 - H0;
+  if (H0 > H1) {
+    std::swap(H0, H1);
+
+    D = -D;
+    T = 1 - T;
+  }
+
+  if (D <= 180) {
+    return H0 + T * (H1 - H0);
+  } else {
+    H0 = H0 + 360;
+    return std::fmod(H0 + T * (H1 - H0) + 720, 360);
+  }
+}
+
+// Interpolates between two HSV Colors both represented as a tuple of doubles
+// Returns an HSV Color represented as a tuple of doubles
+static std::tuple<double, double, double>
+interpolateHSV(const std::tuple<double, double, double> &C0,
+               const std::tuple<double, double, double> &C1, double T) {
+  double H = interpolateHue(std::get<0>(C0), std::get<0>(C1), T);
+  double S = std::get<1>(C0) + T * (std::get<1>(C1) - std::get<1>(C0));
+  double V = std::get<2>(C0) + T * (std::get<2>(C1) - std::get<2>(C0));
+  return std::make_tuple(H, S, V);
+}
+
+// Get the Color as a tuple of uint8_ts
+std::tuple<uint8_t, uint8_t, uint8_t>
+ColorHelper::getColorTuple(double Point) const {
+  assert(!ColorMap.empty() && "ColorMap must not be empty!");
+  size_t MaxIndex = ColorMap.size() - 1;
+  double IntervalWidth = MaxIn - MinIn;
+  double OffsetP = Point - MinIn;
+  double SectionWidth = IntervalWidth / static_cast<double>(MaxIndex);
+  size_t SectionNo = std::floor(OffsetP / SectionWidth);
+  double T = (OffsetP - SectionNo * SectionWidth) / SectionWidth;
+
+  auto &RGBColor0 = ColorMap[SectionNo];
+  auto &RGBColor1 = ColorMap[std::min(SectionNo + 1, MaxIndex)];
+
+  auto HSVColor0 = convertToHSV(RGBColor0);
+  auto HSVColor1 = convertToHSV(RGBColor1);
+
+  auto InterpolatedHSVColor = interpolateHSV(HSVColor0, HSVColor1, T);
+  return convertToRGB(InterpolatedHSVColor);
+}
+
+// A helper method to convert a color represented as tuple of uint8s to a hex
+// string.
+std::string
+ColorHelper::getColorString(std::tuple<uint8_t, uint8_t, uint8_t> t) {
+  return llvm::formatv("#{0:X-2}{1:X-2}{2:X-2}", std::get<0>(t), std::get<1>(t),
+                       std::get<2>(t));
+}
+
+// Gets a color in a gradient given a number in the input interval of the
+// chosen scheme. It does this by interpolating in HSV space between the two
+// nearest entries of the color map, and then formats the resulting 24 bit RGB
+// color as a hex string.
+std::string ColorHelper::getColorString(double Point) const {
+  return getColorString(getColorTuple(Point));
+}
diff --git a/tools/llvm-xray/xray-color-helper.h b/tools/llvm-xray/xray-color-helper.h
new file mode 100644
index 0000000000000000000000000000000000000000..d3c77de03cb2999ec9f6a4b764a2fa2cd644a3f8
--- /dev/null
+++ b/tools/llvm-xray/xray-color-helper.h
@@ -0,0 +1,81 @@
+//===-- xray-color-helper.h - XRay Color Helper -----------------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// A class to get a color from a specified gradient.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef XRAY_COLOR_HELPER_H
+#define XRAY_COLOR_HELPER_H
+
+#include <tuple>
+
+#include "llvm/ADT/ArrayRef.h"
+
+namespace llvm {
+namespace xray {
+
+/// The color helper class is a helper class which allows you to easily get a
+/// color in a gradient. This is used to color-code edges in XRay-Graph tools.
+///
+/// There are two types of color schemes in this class:
+///   - Sequential schemes, which are used to represent information from some
+///     minimum to some maximum. These take an input in the range [0,1]
+///   - Diverging schemes, which are used to represent information representing
+///     differences, or a range that goes from negative to positive. These take
+///     an input in the range [-1,1].
+/// Usage:
+/// ColorHelper S(ColorHelper::SequentialScheme::OrRd); // Choose a scheme.
+/// for (double p = 0.0; p <= 1; p += 0.1){
+///   outs() << S.getColorString(p) << " \n"; // Sample at 0.1 intervals
+/// }
+///
+/// ColorHelper D(ColorHelper::DivergingScheme::PiYG); // Choose a color
+///                                                    // scheme.
+/// for (double p= -1; p <= 1 ; p += 0.1){
+///   outs() << D.getColorString(p) << " \n"; // Sample at 0.1 intervals
+/// }
+class ColorHelper {
+  double MinIn;
+  double MaxIn;
+
+  ArrayRef<std::tuple<uint8_t, uint8_t, uint8_t>> ColorMap;
+
+public:
+  /// Enum of the available Sequential Color Schemes
+  enum class SequentialScheme {
+    // Schemes based on the ColorBrewer Color schemes of the same name from
+    // http://www.colorbrewer.org/ by Cynthia A. Brewer, Penn State University.
+    Greys,
+    OrRd,
+    PuBu
+  };
+
+  ColorHelper(SequentialScheme S);
+
+  /// Enum of the available Diverging Color Schemes
+  enum class DivergingScheme {
+    // Schemes based on the ColorBrewer Color schemes of the same name from
+    // http://www.colorbrewer.org/ by Cynthia A. Brewer, Penn State University.
+    PiYG
+  };
+
+  ColorHelper(DivergingScheme S);
+
+  // Sample the gradient at the input point.
+  std::tuple<uint8_t, uint8_t, uint8_t> getColorTuple(double Point) const;
+
+  std::string getColorString(double Point) const;
+
+  // Convert a tuple to a string
+  static std::string getColorString(std::tuple<uint8_t, uint8_t, uint8_t> t);
+};
+}
+}
+#endif
diff --git a/tools/llvm-xray/xray-converter.cc b/tools/llvm-xray/xray-converter.cc
index b1fbc16d205d99a96a19a96394ee3204e0dabcbc..2583ec951495bd01b40fcfa4df65b98f9e8d6e6f 100644
--- a/tools/llvm-xray/xray-converter.cc
+++ b/tools/llvm-xray/xray-converter.cc
@@ -12,7 +12,6 @@
 //===----------------------------------------------------------------------===//
 #include "xray-converter.h"
 
-#include "xray-extract.h"
 #include "xray-registry.h"
 #include "llvm/DebugInfo/Symbolize/Symbolize.h"
 #include "llvm/Support/EndianStream.h"
@@ -20,6 +19,7 @@
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/YAMLTraits.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/XRay/InstrumentationMap.h"
 #include "llvm/XRay/Trace.h"
 #include "llvm/XRay/YAMLXRayRecord.h"
 
@@ -73,16 +73,6 @@ static cl::opt<bool> ConvertSortInput(
 static cl::alias ConvertSortInput2("s", cl::aliasopt(ConvertSortInput),
                                    cl::desc("Alias for -sort"),
                                    cl::sub(Convert));
-static cl::opt<InstrumentationMapExtractor::InputFormats> InstrMapFormat(
-    "instr-map-format", cl::desc("format of instrumentation map"),
-    cl::values(clEnumValN(InstrumentationMapExtractor::InputFormats::ELF, "elf",
-                          "instrumentation map in an ELF header"),
-               clEnumValN(InstrumentationMapExtractor::InputFormats::YAML,
-                          "yaml", "instrumentation map in YAML")),
-    cl::sub(Convert), cl::init(InstrumentationMapExtractor::InputFormats::ELF));
-static cl::alias InstrMapFormat2("t", cl::aliasopt(InstrMapFormat),
-                                 cl::desc("Alias for -instr-map-format"),
-                                 cl::sub(Convert));
 
 using llvm::yaml::Output;
 
@@ -98,7 +88,7 @@ void TraceConverter::exportAsYAML(const Trace &Records, raw_ostream &OS) {
                                        : llvm::to_string(R.FuncId),
                              R.TSC, R.TId});
   }
-  Output Out(OS);
+  Output Out(OS, nullptr, 0);
   Out << Trace;
 }
 
@@ -128,7 +118,9 @@ void TraceConverter::exportAsRAWv1(const Trace &Records, raw_ostream &OS) {
   // format.
   for (const auto &R : Records) {
     Writer.write(R.RecordType);
-    Writer.write(R.CPU);
+    // The on disk naive raw format uses 8 bit CPUs, but the record has 16.
+    // There's no choice but truncation.
+    Writer.write(static_cast<uint8_t>(R.CPU));
     switch (R.Type) {
     case RecordTypes::ENTER:
       Writer.write(uint8_t{0});
@@ -151,25 +143,26 @@ namespace xray {
 
 static CommandRegistration Unused(&Convert, []() -> Error {
   // FIXME: Support conversion to BINARY when upgrading XRay trace versions.
-  int Fd;
-  auto EC = sys::fs::openFileForRead(ConvertInput, Fd);
-  if (EC)
-    return make_error<StringError>(
-        Twine("Cannot open file '") + ConvertInput + "'", EC);
-
-  Error Err = Error::success();
-  xray::InstrumentationMapExtractor Extractor(ConvertInstrMap, InstrMapFormat,
-                                              Err);
-  handleAllErrors(std::move(Err),
-                  [&](const ErrorInfoBase &E) { E.log(errs()); });
+  InstrumentationMap Map;
+  if (!ConvertInstrMap.empty()) {
+    auto InstrumentationMapOrError = loadInstrumentationMap(ConvertInstrMap);
+    if (!InstrumentationMapOrError)
+      return joinErrors(make_error<StringError>(
+                            Twine("Cannot open instrumentation map '") +
+                                ConvertInstrMap + "'",
+                            std::make_error_code(std::errc::invalid_argument)),
+                        InstrumentationMapOrError.takeError());
+    Map = std::move(*InstrumentationMapOrError);
+  }
 
-  const auto &FunctionAddresses = Extractor.getFunctionAddresses();
+  const auto &FunctionAddresses = Map.getFunctionAddresses();
   symbolize::LLVMSymbolizer::Options Opts(
       symbolize::FunctionNameKind::LinkageName, true, true, false, "");
   symbolize::LLVMSymbolizer Symbolizer(Opts);
   llvm::xray::FuncIdConversionHelper FuncIdHelper(ConvertInstrMap, Symbolizer,
                                                   FunctionAddresses);
   llvm::xray::TraceConverter TC(FuncIdHelper, ConvertSymbolize);
+  std::error_code EC;
   raw_fd_ostream OS(ConvertOutput, EC,
                     ConvertOutputFormat == ConvertFormats::BINARY
                         ? sys::fs::OpenFlags::F_None
@@ -178,22 +171,22 @@ static CommandRegistration Unused(&Convert, []() -> Error {
     return make_error<StringError>(
         Twine("Cannot open file '") + ConvertOutput + "' for writing.", EC);
 
-  if (auto TraceOrErr = loadTraceFile(ConvertInput, ConvertSortInput)) {
-    auto &T = *TraceOrErr;
-    switch (ConvertOutputFormat) {
-    case ConvertFormats::YAML:
-      TC.exportAsYAML(T, OS);
-      break;
-    case ConvertFormats::BINARY:
-      TC.exportAsRAWv1(T, OS);
-      break;
-    }
-  } else {
+  auto TraceOrErr = loadTraceFile(ConvertInput, ConvertSortInput);
+  if (!TraceOrErr)
     return joinErrors(
         make_error<StringError>(
             Twine("Failed loading input file '") + ConvertInput + "'.",
             std::make_error_code(std::errc::executable_format_error)),
         TraceOrErr.takeError());
+
+  auto &T = *TraceOrErr;
+  switch (ConvertOutputFormat) {
+  case ConvertFormats::YAML:
+    TC.exportAsYAML(T, OS);
+    break;
+  case ConvertFormats::BINARY:
+    TC.exportAsRAWv1(T, OS);
+    break;
   }
   return Error::success();
 });
diff --git a/tools/llvm-xray/xray-extract.cc b/tools/llvm-xray/xray-extract.cc
index 49ecd742113743b42c72b720f709396becaa9668..26e461869a083c8c1d6eba8349ba5f3d990e1924 100644
--- a/tools/llvm-xray/xray-extract.cc
+++ b/tools/llvm-xray/xray-extract.cc
@@ -16,10 +16,7 @@
 #include <type_traits>
 #include <utility>
 
-#include "xray-extract.h"
-
 #include "xray-registry.h"
-#include "xray-sleds.h"
 #include "llvm/Object/ELF.h"
 #include "llvm/Object/ObjectFile.h"
 #include "llvm/Support/CommandLine.h"
@@ -28,8 +25,8 @@
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Format.h"
-#include "llvm/Support/YAMLTraits.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/XRay/InstrumentationMap.h"
 
 using namespace llvm;
 using namespace llvm::xray;
@@ -49,243 +46,40 @@ static cl::alias ExtractOutput2("o", cl::aliasopt(ExtractOutput),
                                 cl::desc("Alias for -output"),
                                 cl::sub(Extract));
 
-struct YAMLXRaySledEntry {
-  int32_t FuncId;
-  Hex64 Address;
-  Hex64 Function;
-  SledEntry::FunctionKinds Kind;
-  bool AlwaysInstrument;
-};
-
-namespace llvm {
-namespace yaml {
-
-template <> struct ScalarEnumerationTraits<SledEntry::FunctionKinds> {
-  static void enumeration(IO &IO, SledEntry::FunctionKinds &Kind) {
-    IO.enumCase(Kind, "function-enter", SledEntry::FunctionKinds::ENTRY);
-    IO.enumCase(Kind, "function-exit", SledEntry::FunctionKinds::EXIT);
-    IO.enumCase(Kind, "tail-exit", SledEntry::FunctionKinds::TAIL);
-  }
-};
-
-template <> struct MappingTraits<YAMLXRaySledEntry> {
-  static void mapping(IO &IO, YAMLXRaySledEntry &Entry) {
-    IO.mapRequired("id", Entry.FuncId);
-    IO.mapRequired("address", Entry.Address);
-    IO.mapRequired("function", Entry.Function);
-    IO.mapRequired("kind", Entry.Kind);
-    IO.mapRequired("always-instrument", Entry.AlwaysInstrument);
-  }
-
-  static constexpr bool flow = true;
-};
-}
-}
-
-LLVM_YAML_IS_SEQUENCE_VECTOR(YAMLXRaySledEntry)
-
 namespace {
 
-llvm::Error LoadBinaryInstrELF(
-    StringRef Filename, std::deque<SledEntry> &OutputSleds,
-    InstrumentationMapExtractor::FunctionAddressMap &InstrMap,
-    InstrumentationMapExtractor::FunctionAddressReverseMap &FunctionIds) {
-  auto ObjectFile = object::ObjectFile::createObjectFile(Filename);
-
-  if (!ObjectFile)
-    return ObjectFile.takeError();
-
-  // FIXME: Maybe support other ELF formats. For now, 64-bit Little Endian only.
-  if (!ObjectFile->getBinary()->isELF())
-    return make_error<StringError>(
-        "File format not supported (only does ELF).",
-        std::make_error_code(std::errc::not_supported));
-  if (ObjectFile->getBinary()->getArch() != Triple::x86_64)
-    return make_error<StringError>(
-        "File format not supported (only does ELF little endian 64-bit).",
-        std::make_error_code(std::errc::not_supported));
-
-  // Find the section named "xray_instr_map".
-  StringRef Contents = "";
-  const auto &Sections = ObjectFile->getBinary()->sections();
-  auto I = find_if(Sections, [&](object::SectionRef Section) {
-    StringRef Name = "";
-    if (Section.getName(Name))
-      return false;
-    return Name == "xray_instr_map";
-  });
-  if (I == Sections.end())
-    return make_error<StringError>(
-        "Failed to find XRay instrumentation map.",
-        std::make_error_code(std::errc::not_supported));
-  if (I->getContents(Contents))
-    return make_error<StringError>(
-        "Failed to get contents of 'xray_instr_map' section.",
-        std::make_error_code(std::errc::executable_format_error));
-
-  // Copy the instrumentation map data into the Sleds data structure.
-  auto C = Contents.bytes_begin();
-  static constexpr size_t ELF64SledEntrySize = 32;
-
-  if ((C - Contents.bytes_end()) % ELF64SledEntrySize != 0)
-    return make_error<StringError>(
-        "Instrumentation map entries not evenly divisible by size of an XRay "
-        "sled entry in ELF64.",
-        std::make_error_code(std::errc::executable_format_error));
-
-  int32_t FuncId = 1;
-  uint64_t CurFn = 0;
-  std::deque<SledEntry> Sleds;
-  for (; C != Contents.bytes_end(); C += ELF64SledEntrySize) {
-    DataExtractor Extractor(
-        StringRef(reinterpret_cast<const char *>(C), ELF64SledEntrySize), true,
-        8);
-    Sleds.push_back({});
-    auto &Entry = Sleds.back();
-    uint32_t OffsetPtr = 0;
-    Entry.Address = Extractor.getU64(&OffsetPtr);
-    Entry.Function = Extractor.getU64(&OffsetPtr);
-    auto Kind = Extractor.getU8(&OffsetPtr);
-    switch (Kind) {
-    case 0: // ENTRY
-      Entry.Kind = SledEntry::FunctionKinds::ENTRY;
-      break;
-    case 1: // EXIT
-      Entry.Kind = SledEntry::FunctionKinds::EXIT;
-      break;
-    case 2: // TAIL
-      Entry.Kind = SledEntry::FunctionKinds::TAIL;
-      break;
-    default:
-      return make_error<StringError>(
-          Twine("Encountered unknown sled type ") + "'" + Twine(int32_t{Kind}) +
-              "'.",
-          std::make_error_code(std::errc::executable_format_error));
-    }
-    Entry.AlwaysInstrument = Extractor.getU8(&OffsetPtr) != 0;
-
-    // We replicate the function id generation scheme implemented in the runtime
-    // here. Ideally we should be able to break it out, or output this map from
-    // the runtime, but that's a design point we can discuss later on. For now,
-    // we replicate the logic and move on.
-    if (CurFn == 0) {
-      CurFn = Entry.Function;
-      InstrMap[FuncId] = Entry.Function;
-      FunctionIds[Entry.Function] = FuncId;
-    }
-    if (Entry.Function != CurFn) {
-      ++FuncId;
-      CurFn = Entry.Function;
-      InstrMap[FuncId] = Entry.Function;
-      FunctionIds[Entry.Function] = FuncId;
-    }
-  }
-  OutputSleds = std::move(Sleds);
-  return llvm::Error::success();
-}
-
-Error LoadYAMLInstrMap(
-    StringRef Filename, std::deque<SledEntry> &Sleds,
-    InstrumentationMapExtractor::FunctionAddressMap &InstrMap,
-    InstrumentationMapExtractor::FunctionAddressReverseMap &FunctionIds) {
-  int Fd;
-  if (auto EC = sys::fs::openFileForRead(Filename, Fd))
-    return make_error<StringError>(
-        Twine("Failed opening file '") + Filename + "' for reading.", EC);
-
-  uint64_t FileSize;
-  if (auto EC = sys::fs::file_size(Filename, FileSize))
-    return make_error<StringError>(
-        Twine("Failed getting size of file '") + Filename + "'.", EC);
-
-  std::error_code EC;
-  sys::fs::mapped_file_region MappedFile(
-      Fd, sys::fs::mapped_file_region::mapmode::readonly, FileSize, 0, EC);
-  if (EC)
-    return make_error<StringError>(
-        Twine("Failed memory-mapping file '") + Filename + "'.", EC);
-
-  std::vector<YAMLXRaySledEntry> YAMLSleds;
-  Input In(StringRef(MappedFile.data(), MappedFile.size()));
-  In >> YAMLSleds;
-  if (In.error())
-    return make_error<StringError>(
-        Twine("Failed loading YAML document from '") + Filename + "'.",
-        In.error());
-
-  for (const auto &Y : YAMLSleds) {
-    InstrMap[Y.FuncId] = Y.Function;
-    FunctionIds[Y.Function] = Y.FuncId;
-    Sleds.push_back(
-        SledEntry{Y.Address, Y.Function, Y.Kind, Y.AlwaysInstrument});
-  }
-  return Error::success();
-}
-
-} // namespace
-
-InstrumentationMapExtractor::InstrumentationMapExtractor(std::string Filename,
-                                                         InputFormats Format,
-                                                         Error &EC) {
-  ErrorAsOutParameter ErrAsOutputParam(&EC);
-  if (Filename.empty()) {
-    EC = Error::success();
-    return;
-  }
-  switch (Format) {
-  case InputFormats::ELF: {
-    EC = handleErrors(
-        LoadBinaryInstrELF(Filename, Sleds, FunctionAddresses, FunctionIds),
-        [&](std::unique_ptr<ErrorInfoBase> E) {
-          return joinErrors(
-              make_error<StringError>(
-                  Twine("Cannot extract instrumentation map from '") +
-                      Filename + "'.",
-                  std::make_error_code(std::errc::executable_format_error)),
-              std::move(E));
-        });
-    break;
-  }
-  case InputFormats::YAML: {
-    EC = handleErrors(
-        LoadYAMLInstrMap(Filename, Sleds, FunctionAddresses, FunctionIds),
-        [&](std::unique_ptr<ErrorInfoBase> E) {
-          return joinErrors(
-              make_error<StringError>(
-                  Twine("Cannot load YAML instrumentation map from '") +
-                      Filename + "'.",
-                  std::make_error_code(std::errc::executable_format_error)),
-              std::move(E));
-        });
-    break;
-  }
-  }
-}
-
-void InstrumentationMapExtractor::exportAsYAML(raw_ostream &OS) {
+void exportAsYAML(const InstrumentationMap &Map, raw_ostream &OS) {
   // First we translate the sleds into the YAMLXRaySledEntry objects in a deque.
   std::vector<YAMLXRaySledEntry> YAMLSleds;
-  YAMLSleds.reserve(Sleds.size());
+  auto Sleds = Map.sleds();
+  YAMLSleds.reserve(std::distance(Sleds.begin(), Sleds.end()));
   for (const auto &Sled : Sleds) {
-    YAMLSleds.push_back({FunctionIds[Sled.Function], Sled.Address,
-                         Sled.Function, Sled.Kind, Sled.AlwaysInstrument});
+    auto FuncId = Map.getFunctionId(Sled.Function);
+    if (!FuncId)
+      return;
+    YAMLSleds.push_back({*FuncId, Sled.Address, Sled.Function, Sled.Kind,
+                         Sled.AlwaysInstrument});
   }
-  Output Out(OS);
+  Output Out(OS, nullptr, 0);
   Out << YAMLSleds;
 }
 
+} // namespace
+
 static CommandRegistration Unused(&Extract, []() -> Error {
-  Error Err = Error::success();
-  xray::InstrumentationMapExtractor Extractor(
-      ExtractInput, InstrumentationMapExtractor::InputFormats::ELF, Err);
-  if (Err)
-    return Err;
+  auto InstrumentationMapOrError = loadInstrumentationMap(ExtractInput);
+  if (!InstrumentationMapOrError)
+    return joinErrors(make_error<StringError>(
+                          Twine("Cannot extract instrumentation map from '") +
+                              ExtractInput + "'.",
+                          std::make_error_code(std::errc::invalid_argument)),
+                      InstrumentationMapOrError.takeError());
 
   std::error_code EC;
   raw_fd_ostream OS(ExtractOutput, EC, sys::fs::OpenFlags::F_Text);
   if (EC)
     return make_error<StringError>(
         Twine("Cannot open file '") + ExtractOutput + "' for writing.", EC);
-  Extractor.exportAsYAML(OS);
+  exportAsYAML(*InstrumentationMapOrError, OS);
   return Error::success();
 });
diff --git a/tools/llvm-xray/xray-extract.h b/tools/llvm-xray/xray-extract.h
deleted file mode 100644
index 91e4db36805fe6c40f3e9f7d9041ed81c02c1272..0000000000000000000000000000000000000000
--- a/tools/llvm-xray/xray-extract.h
+++ /dev/null
@@ -1,58 +0,0 @@
-//===- xray-extract.h - XRay Instrumentation Map Extraction ---------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Defines the interface for extracting the instrumentation map from an
-// XRay-instrumented binary.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TOOLS_XRAY_EXTRACT_H
-#define LLVM_TOOLS_XRAY_EXTRACT_H
-
-#include <deque>
-#include <map>
-#include <string>
-#include <unordered_map>
-
-#include "xray-sleds.h"
-#include "llvm/Support/Error.h"
-#include "llvm/Support/raw_ostream.h"
-
-namespace llvm {
-namespace xray {
-
-class InstrumentationMapExtractor {
-public:
-  typedef std::unordered_map<int32_t, uint64_t> FunctionAddressMap;
-  typedef std::unordered_map<uint64_t, int32_t> FunctionAddressReverseMap;
-
-  enum class InputFormats { ELF, YAML };
-
-private:
-  std::deque<SledEntry> Sleds;
-  FunctionAddressMap FunctionAddresses;
-  FunctionAddressReverseMap FunctionIds;
-
-public:
-  /// Loads the instrumentation map from |Filename|. Updates |EC| in case there
-  /// were errors encountered opening the file. |Format| defines what the input
-  /// instrumentation map is in.
-  InstrumentationMapExtractor(std::string Filename, InputFormats Format,
-                              Error &EC);
-
-  const FunctionAddressMap &getFunctionAddresses() { return FunctionAddresses; }
-
-  /// Exports the loaded function address map as YAML through |OS|.
-  void exportAsYAML(raw_ostream &OS);
-};
-
-} // namespace xray
-} // namespace llvm
-
-#endif // LLVM_TOOLS_XRAY_EXTRACT_H
diff --git a/tools/llvm-xray/xray-graph.cc b/tools/llvm-xray/xray-graph.cc
index e6ec7aad96421cd151ca9943c8f97abfcf217c6c..9be0b70c2cdd8324991e4b30421594081b820eda 100644
--- a/tools/llvm-xray/xray-graph.cc
+++ b/tools/llvm-xray/xray-graph.cc
@@ -1,4 +1,4 @@
-//===-- xray-graph.c - XRay Function Call Graph Renderer ------------------===//
+//===-- xray-graph.cc - XRay Function Call Graph Renderer -----------------===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -17,71 +17,60 @@
 #include <system_error>
 #include <utility>
 
-#include "xray-extract.h"
 #include "xray-graph.h"
 #include "xray-registry.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FormatVariadic.h"
+#include "llvm/XRay/InstrumentationMap.h"
 #include "llvm/XRay/Trace.h"
 #include "llvm/XRay/YAMLXRayRecord.h"
 
 using namespace llvm;
-using namespace xray;
+using namespace llvm::xray;
 
 // Setup llvm-xray graph subcommand and its options.
-static cl::SubCommand Graph("graph", "Generate function-call graph");
+static cl::SubCommand GraphC("graph", "Generate function-call graph");
 static cl::opt<std::string> GraphInput(cl::Positional,
                                        cl::desc("<xray log file>"),
-                                       cl::Required, cl::sub(Graph));
+                                       cl::Required, cl::sub(GraphC));
 
 static cl::opt<bool>
     GraphKeepGoing("keep-going", cl::desc("Keep going on errors encountered"),
-                   cl::sub(Graph), cl::init(false));
+                   cl::sub(GraphC), cl::init(false));
 static cl::alias GraphKeepGoing2("k", cl::aliasopt(GraphKeepGoing),
                                  cl::desc("Alias for -keep-going"),
-                                 cl::sub(Graph));
+                                 cl::sub(GraphC));
 
 static cl::opt<std::string>
     GraphOutput("output", cl::value_desc("Output file"), cl::init("-"),
-                cl::desc("output file; use '-' for stdout"), cl::sub(Graph));
+                cl::desc("output file; use '-' for stdout"), cl::sub(GraphC));
 static cl::alias GraphOutput2("o", cl::aliasopt(GraphOutput),
-                              cl::desc("Alias for -output"), cl::sub(Graph));
+                              cl::desc("Alias for -output"), cl::sub(GraphC));
 
 static cl::opt<std::string>
     GraphInstrMap("instr_map",
                   cl::desc("binary with the instrumrntation map, or "
                            "a separate instrumentation map"),
-                  cl::value_desc("binary with xray_instr_map"), cl::sub(Graph),
+                  cl::value_desc("binary with xray_instr_map"), cl::sub(GraphC),
                   cl::init(""));
 static cl::alias GraphInstrMap2("m", cl::aliasopt(GraphInstrMap),
                                 cl::desc("alias for -instr_map"),
-                                cl::sub(Graph));
-
-static cl::opt<InstrumentationMapExtractor::InputFormats> InstrMapFormat(
-    "instr-map-format", cl::desc("format of instrumentation map"),
-    cl::values(clEnumValN(InstrumentationMapExtractor::InputFormats::ELF, "elf",
-                          "instrumentation map in an ELF header"),
-               clEnumValN(InstrumentationMapExtractor::InputFormats::YAML,
-                          "yaml", "instrumentation map in YAML")),
-    cl::sub(Graph), cl::init(InstrumentationMapExtractor::InputFormats::ELF));
-static cl::alias InstrMapFormat2("t", cl::aliasopt(InstrMapFormat),
-                                 cl::desc("Alias for -instr-map-format"),
-                                 cl::sub(Graph));
+                                cl::sub(GraphC));
 
 static cl::opt<bool> GraphDeduceSiblingCalls(
     "deduce-sibling-calls",
     cl::desc("Deduce sibling calls when unrolling function call stacks"),
-    cl::sub(Graph), cl::init(false));
+    cl::sub(GraphC), cl::init(false));
 static cl::alias
     GraphDeduceSiblingCalls2("d", cl::aliasopt(GraphDeduceSiblingCalls),
                              cl::desc("Alias for -deduce-sibling-calls"),
-                             cl::sub(Graph));
+                             cl::sub(GraphC));
 
 static cl::opt<GraphRenderer::StatType>
     GraphEdgeLabel("edge-label",
                    cl::desc("Output graphs with edges labeled with this field"),
-                   cl::value_desc("field"), cl::sub(Graph),
+                   cl::value_desc("field"), cl::sub(GraphC),
                    cl::init(GraphRenderer::StatType::NONE),
                    cl::values(clEnumValN(GraphRenderer::StatType::NONE, "none",
                                          "Do not label Edges"),
@@ -101,12 +90,12 @@ static cl::opt<GraphRenderer::StatType>
                                          "sum of call durations")));
 static cl::alias GraphEdgeLabel2("e", cl::aliasopt(GraphEdgeLabel),
                                  cl::desc("Alias for -edge-label"),
-                                 cl::sub(Graph));
+                                 cl::sub(GraphC));
 
 static cl::opt<GraphRenderer::StatType> GraphVertexLabel(
     "vertex-label",
     cl::desc("Output graphs with vertices labeled with this field"),
-    cl::value_desc("field"), cl::sub(Graph),
+    cl::value_desc("field"), cl::sub(GraphC),
     cl::init(GraphRenderer::StatType::NONE),
     cl::values(clEnumValN(GraphRenderer::StatType::NONE, "none",
                           "Do not label Edges"),
@@ -126,12 +115,12 @@ static cl::opt<GraphRenderer::StatType> GraphVertexLabel(
                           "sum of call durations")));
 static cl::alias GraphVertexLabel2("v", cl::aliasopt(GraphVertexLabel),
                                    cl::desc("Alias for -edge-label"),
-                                   cl::sub(Graph));
+                                   cl::sub(GraphC));
 
 static cl::opt<GraphRenderer::StatType> GraphEdgeColorType(
     "color-edges",
     cl::desc("Output graphs with edge colors determined by this field"),
-    cl::value_desc("field"), cl::sub(Graph),
+    cl::value_desc("field"), cl::sub(GraphC),
     cl::init(GraphRenderer::StatType::NONE),
     cl::values(clEnumValN(GraphRenderer::StatType::NONE, "none",
                           "Do not label Edges"),
@@ -151,12 +140,12 @@ static cl::opt<GraphRenderer::StatType> GraphEdgeColorType(
                           "sum of call durations")));
 static cl::alias GraphEdgeColorType2("c", cl::aliasopt(GraphEdgeColorType),
                                      cl::desc("Alias for -color-edges"),
-                                     cl::sub(Graph));
+                                     cl::sub(GraphC));
 
 static cl::opt<GraphRenderer::StatType> GraphVertexColorType(
     "color-vertices",
     cl::desc("Output graphs with vertex colors determined by this field"),
-    cl::value_desc("field"), cl::sub(Graph),
+    cl::value_desc("field"), cl::sub(GraphC),
     cl::init(GraphRenderer::StatType::NONE),
     cl::values(clEnumValN(GraphRenderer::StatType::NONE, "none",
                           "Do not label Edges"),
@@ -176,7 +165,7 @@ static cl::opt<GraphRenderer::StatType> GraphVertexColorType(
                           "sum of call durations")));
 static cl::alias GraphVertexColorType2("b", cl::aliasopt(GraphVertexColorType),
                                        cl::desc("Alias for -edge-label"),
-                                       cl::sub(Graph));
+                                       cl::sub(GraphC));
 
 template <class T> T diff(T L, T R) { return std::max(L, R) - std::min(L, R); }
 
@@ -221,14 +210,13 @@ Error GraphRenderer::accountRecord(const XRayRecord &Record) {
   auto &ThreadStack = PerThreadFunctionStack[Record.TId];
   switch (Record.Type) {
   case RecordTypes::ENTER: {
-    if (VertexAttrs.count(Record.FuncId) == 0)
-      VertexAttrs[Record.FuncId].SymbolName =
-          FuncIdHelper.SymbolOrNumber(Record.FuncId);
+    if (G.count(Record.FuncId) == 0)
+      G[Record.FuncId].SymbolName = FuncIdHelper.SymbolOrNumber(Record.FuncId);
     ThreadStack.push_back({Record.FuncId, Record.TSC});
     break;
   }
   case RecordTypes::EXIT: {
-    // FIXME: Refactor this and the account subcommand to reducr code
+    // FIXME: Refactor this and the account subcommand to reduce code
     // duplication
     if (ThreadStack.size() == 0 || ThreadStack.back().FuncId != Record.FuncId) {
       if (!DeduceSiblingCalls)
@@ -243,23 +231,25 @@ Error GraphRenderer::accountRecord(const XRayRecord &Record) {
             make_error_code(errc::invalid_argument)); // There is no matching
                                                       // Function for this exit.
       while (ThreadStack.back().FuncId != Record.FuncId) {
-        uint64_t D = diff(ThreadStack.back().TSC, Record.TSC);
-        int32_t TopFuncId = ThreadStack.back().FuncId;
+        TimestampT D = diff(ThreadStack.back().TSC, Record.TSC);
+        VertexIdentifier TopFuncId = ThreadStack.back().FuncId;
         ThreadStack.pop_back();
         assert(ThreadStack.size() != 0);
-        auto &EA = Graph[ThreadStack.back().FuncId][TopFuncId];
+        EdgeIdentifier EI(ThreadStack.back().FuncId, TopFuncId);
+        auto &EA = G[EI];
         EA.Timings.push_back(D);
         updateStat(EA.S, D);
-        updateStat(VertexAttrs[TopFuncId].S, D);
+        updateStat(G[TopFuncId].S, D);
       }
     }
     uint64_t D = diff(ThreadStack.back().TSC, Record.TSC);
     ThreadStack.pop_back();
-    auto &V = Graph[ThreadStack.empty() ? 0 : ThreadStack.back().FuncId];
-    auto &EA = V[Record.FuncId];
+    VertexIdentifier VI = ThreadStack.empty() ? 0 : ThreadStack.back().FuncId;
+    EdgeIdentifier EI(VI, Record.FuncId);
+    auto &EA = G[EI];
     EA.Timings.push_back(D);
     updateStat(EA.S, D);
-    updateStat(VertexAttrs[Record.FuncId].S, D);
+    updateStat(G[Record.FuncId].S, D);
     break;
   }
   }
@@ -269,7 +259,7 @@ Error GraphRenderer::accountRecord(const XRayRecord &Record) {
 
 template <typename U>
 void GraphRenderer::getStats(U begin, U end, GraphRenderer::TimeStat &S) {
-  assert(begin != end);
+  if (begin == end) return;
   std::ptrdiff_t MedianOff = S.Count / 2;
   std::nth_element(begin, begin + MedianOff, end);
   S.Median = *(begin + MedianOff);
@@ -293,38 +283,28 @@ void GraphRenderer::updateMaxStats(const GraphRenderer::TimeStat &S,
 }
 
 void GraphRenderer::calculateEdgeStatistics() {
-  for (auto &V : Graph) {
-    for (auto &E : V.second) {
-      auto &A = E.second;
-      getStats(A.Timings.begin(), A.Timings.end(), A.S);
-      updateMaxStats(A.S, GraphEdgeMax);
-    }
+  assert(!G.edges().empty());
+  for (auto &E : G.edges()) {
+    auto &A = E.second;
+    assert(!A.Timings.empty());
+    getStats(A.Timings.begin(), A.Timings.end(), A.S);
+    updateMaxStats(A.S, G.GraphEdgeMax);
   }
 }
 
 void GraphRenderer::calculateVertexStatistics() {
-  DenseMap<int32_t, std::pair<uint64_t, SmallVector<EdgeAttribute *, 4>>>
-      IncommingEdges;
-  uint64_t MaxCount = 0;
-  for (auto &V : Graph) {
-    for (auto &E : V.second) {
-      auto &IEV = IncommingEdges[E.first];
-      IEV.second.push_back(&E.second);
-      IEV.first += E.second.S.Count;
-      if (IEV.first > MaxCount)
-        MaxCount = IEV.first;
-    }
-  }
   std::vector<uint64_t> TempTimings;
-  TempTimings.reserve(MaxCount);
-  for (auto &V : IncommingEdges) {
-    for (auto &P : V.second.second) {
-      TempTimings.insert(TempTimings.end(), P->Timings.begin(),
-                         P->Timings.end());
+  for (auto &V : G.vertices()) {
+    if (V.first != 0) {
+      for (auto &E : G.inEdges(V.first)) {
+        auto &A = E.second;
+        TempTimings.insert(TempTimings.end(), A.Timings.begin(),
+                           A.Timings.end());
+      }
+      getStats(TempTimings.begin(), TempTimings.end(), G[V.first].S);
+      updateMaxStats(G[V.first].S, G.GraphVertexMax);
+      TempTimings.clear();
     }
-    getStats(TempTimings.begin(), TempTimings.end(), VertexAttrs[V.first].S);
-    updateMaxStats(VertexAttrs[V.first].S, GraphVertexMax);
-    TempTimings.clear();
   }
 }
 
@@ -342,19 +322,17 @@ static void normalizeTimeStat(GraphRenderer::TimeStat &S,
 
 // Normalises the statistics in the graph for a given TSC frequency.
 void GraphRenderer::normalizeStatistics(double CycleFrequency) {
-  for (auto &V : Graph) {
-    for (auto &E : V.second) {
-      auto &S = E.second.S;
-      normalizeTimeStat(S, CycleFrequency);
-    }
+  for (auto &E : G.edges()) {
+    auto &S = E.second.S;
+    normalizeTimeStat(S, CycleFrequency);
   }
-  for (auto &V : VertexAttrs) {
+  for (auto &V : G.vertices()) {
     auto &S = V.second.S;
     normalizeTimeStat(S, CycleFrequency);
   }
 
-  normalizeTimeStat(GraphEdgeMax, CycleFrequency);
-  normalizeTimeStat(GraphVertexMax, CycleFrequency);
+  normalizeTimeStat(G.GraphEdgeMax, CycleFrequency);
+  normalizeTimeStat(G.GraphVertexMax, CycleFrequency);
 }
 
 // Returns a string containing the value of statistic field T
@@ -390,61 +368,6 @@ GraphRenderer::TimeStat::getAsString(GraphRenderer::StatType T) const {
   return S.str();
 }
 
-// Evaluates a polynomial given the coefficints provided in an ArrayRef
-// evaluating:
-//
-//    p(x) = a[n-0]*x^0 + a[n-1]*x^1 + ... a[n-n]*x^n
-//
-// at x_0 using Horner's Method for both performance and stability reasons.
-static double polyEval(ArrayRef<double> a, double x_0) {
-  double B = 0;
-  for (const auto &c : a) {
-    B = c + B * x_0;
-  }
-  return B;
-}
-
-// Takes a double precision number, clips it between 0 and 1 and then converts
-// that to an integer between 0x00 and 0xFF with proxpper rounding.
-static uint8_t uintIntervalTo8bitChar(double B) {
-  double n = std::max(std::min(B, 1.0), 0.0);
-  return static_cast<uint8_t>(255 * n + 0.5);
-}
-
-// Gets a color in a gradient given a number in the interval [0,1], it does this
-// by evaluating a polynomial which maps [0, 1] -> [0, 1] for each of the R G
-// and B values in the color. It then converts this [0,1] colors to a 24 bit
-// color.
-//
-// In order to calculate these polynomials,
-//   1. Convert the OrRed9 color scheme from http://colorbrewer2.org/ from sRGB
-//      to LAB color space.
-//   2. Interpolate between the descrete colors in LAB space using a cubic
-//      spline interpolation.
-//   3. Sample this interpolation at 100 points and convert to sRGB.
-//   4. Calculate a polynomial fit for these 100 points for each of R G and B.
-//      We used a polynomial of degree 9 arbitrarily based on a fuzzy goodness
-//      of fit metric (using human judgement);
-//   5. Extract these polynomial coefficients from matlab as a set of constants.
-static std::string getColor(double point) {
-  assert(point >= 0.0 && point <= 1);
-  const static double RedPoly[] = {-38.4295,  239.239, -600.108, 790.544,
-                                   -591.26,   251.304, -58.0983, 6.62999,
-                                   -0.325899, 1.00173};
-  const static double GreenPoly[] = {-603.634,   2338.15, -3606.74, 2786.16,
-                                     -1085.19,   165.15,  11.2584,  -6.11338,
-                                     -0.0091078, 0.965469};
-  const static double BluePoly[] = {-325.686, 947.415,  -699.079, -513.75,
-                                    1127.78,  -732.617, 228.092,  -33.8202,
-                                    0.732108, 0.913916};
-
-  uint8_t r = uintIntervalTo8bitChar(polyEval(RedPoly, point));
-  uint8_t g = uintIntervalTo8bitChar(polyEval(GreenPoly, point));
-  uint8_t b = uintIntervalTo8bitChar(polyEval(BluePoly, point));
-
-  return llvm::formatv("#{0:X-2}{1:X-2}{2:x-2}", r, g, b);
-}
-
 // Returns the quotient between the property T of this and another TimeStat as
 // a double
 double GraphRenderer::TimeStat::compare(StatType T, const TimeStat &O) const {
@@ -490,8 +413,11 @@ double GraphRenderer::TimeStat::compare(StatType T, const TimeStat &O) const {
 void GraphRenderer::exportGraphAsDOT(raw_ostream &OS, const XRayFileHeader &H,
                                      StatType ET, StatType EC, StatType VT,
                                      StatType VC) {
+  G.GraphEdgeMax = {};
+  G.GraphVertexMax = {};
   calculateEdgeStatistics();
   calculateVertexStatistics();
+
   if (H.CycleFrequency)
     normalizeStatistics(H.CycleFrequency);
 
@@ -500,18 +426,20 @@ void GraphRenderer::exportGraphAsDOT(raw_ostream &OS, const XRayFileHeader &H,
   if (VT != StatType::NONE)
     OS << "node [shape=record];\n";
 
-  for (const auto &V : Graph)
-    for (const auto &E : V.second) {
-      const auto &S = E.second.S;
-      OS << "F" << V.first << " -> "
-         << "F" << E.first << " [label=\"" << S.getAsString(ET) << "\"";
-      if (EC != StatType::NONE)
-        OS << " color=\"" << getColor(S.compare(EC, GraphEdgeMax)) << "\"";
-      OS << "];\n";
-    }
+  for (const auto &E : G.edges()) {
+    const auto &S = E.second.S;
+    OS << "F" << E.first.first << " -> "
+       << "F" << E.first.second << " [label=\"" << S.getAsString(ET) << "\"";
+    if (EC != StatType::NONE)
+      OS << " color=\"" << CHelper.getColorString(S.compare(EC, G.GraphEdgeMax))
+         << "\"";
+    OS << "];\n";
+  }
 
-  for (const auto &V : VertexAttrs) {
+  for (const auto &V : G.vertices()) {
     const auto &VA = V.second;
+    if (V.first == 0)
+      continue;
     OS << "F" << V.first << " [label=\"" << (VT != StatType::NONE ? "{" : "")
        << (VA.SymbolName.size() > 40 ? VA.SymbolName.substr(0, 40) + "..."
                                      : VA.SymbolName);
@@ -520,7 +448,8 @@ void GraphRenderer::exportGraphAsDOT(raw_ostream &OS, const XRayFileHeader &H,
     else
       OS << "\"";
     if (VC != StatType::NONE)
-      OS << " color=\"" << getColor(VA.S.compare(VC, GraphVertexMax)) << "\"";
+      OS << " color=\"" << CHelper.getColorString(VA.S.compare(VC, G.GraphVertexMax))
+         << "\"";
     OS << "];\n";
   }
   OS << "}\n";
@@ -534,51 +463,45 @@ void GraphRenderer::exportGraphAsDOT(raw_ostream &OS, const XRayFileHeader &H,
 //
 // FIXME: include additional filtering and annalysis passes to provide more
 // specific useful information.
-static CommandRegistration Unused(&Graph, []() -> Error {
-  int Fd;
-  auto EC = sys::fs::openFileForRead(GraphInput, Fd);
-  if (EC)
-    return make_error<StringError>(
-        Twine("Cannot open file '") + GraphInput + "'", EC);
-
-  Error Err = Error::success();
-  xray::InstrumentationMapExtractor Extractor(GraphInstrMap, InstrMapFormat,
-                                              Err);
-  handleAllErrors(std::move(Err),
-                  [&](const ErrorInfoBase &E) { E.log(errs()); });
-
-  const auto &FunctionAddresses = Extractor.getFunctionAddresses();
+static CommandRegistration Unused(&GraphC, []() -> Error {
+  InstrumentationMap Map;
+  if (!GraphInstrMap.empty()) {
+    auto InstrumentationMapOrError = loadInstrumentationMap(GraphInstrMap);
+    if (!InstrumentationMapOrError)
+      return joinErrors(
+          make_error<StringError>(
+              Twine("Cannot open instrumentation map '") + GraphInstrMap + "'",
+              std::make_error_code(std::errc::invalid_argument)),
+          InstrumentationMapOrError.takeError());
+    Map = std::move(*InstrumentationMapOrError);
+  }
 
+  const auto &FunctionAddresses = Map.getFunctionAddresses();
   symbolize::LLVMSymbolizer::Options Opts(
       symbolize::FunctionNameKind::LinkageName, true, true, false, "");
-
   symbolize::LLVMSymbolizer Symbolizer(Opts);
-
   llvm::xray::FuncIdConversionHelper FuncIdHelper(GraphInstrMap, Symbolizer,
                                                   FunctionAddresses);
-
   xray::GraphRenderer GR(FuncIdHelper, GraphDeduceSiblingCalls);
-
+  std::error_code EC;
   raw_fd_ostream OS(GraphOutput, EC, sys::fs::OpenFlags::F_Text);
-
   if (EC)
     return make_error<StringError>(
         Twine("Cannot open file '") + GraphOutput + "' for writing.", EC);
 
   auto TraceOrErr = loadTraceFile(GraphInput, true);
-
-  if (!TraceOrErr) {
+  if (!TraceOrErr)
     return joinErrors(
         make_error<StringError>(Twine("Failed loading input file '") +
                                     GraphInput + "'",
                                 make_error_code(llvm::errc::invalid_argument)),
-        std::move(Err));
-  }
+        TraceOrErr.takeError());
 
   auto &Trace = *TraceOrErr;
   const auto &Header = Trace.getFileHeader();
+
+  // Here we generate the call graph from entries we find in the trace.
   for (const auto &Record : Trace) {
-    // Generate graph.
     auto E = GR.accountRecord(Record);
     if (!E)
       continue;
@@ -592,12 +515,15 @@ static CommandRegistration Unused(&Graph, []() -> Error {
     }
 
     if (!GraphKeepGoing)
-      return joinErrors(std::move(E), std::move(Err));
+      return joinErrors(make_error<StringError>(
+                            "Error encountered generating the call graph.",
+                            std::make_error_code(std::errc::invalid_argument)),
+                        std::move(E));
+
     handleAllErrors(std::move(E),
                     [&](const ErrorInfoBase &E) { E.log(errs()); });
   }
-
   GR.exportGraphAsDOT(OS, Header, GraphEdgeLabel, GraphEdgeColorType,
                       GraphVertexLabel, GraphVertexColorType);
-  return Err;
+  return Error::success();
 });
diff --git a/tools/llvm-xray/xray-graph.h b/tools/llvm-xray/xray-graph.h
index 8b0e20825204ff4512bb878eca38797c7d603e65..1c7a3c0ef454b7e8dc8834b39632a49fa9b6d570 100644
--- a/tools/llvm-xray/xray-graph.h
+++ b/tools/llvm-xray/xray-graph.h
@@ -19,11 +19,13 @@
 #include <vector>
 
 #include "func-id-helper.h"
+#include "xray-color-helper.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/Program.h"
 #include "llvm/Support/raw_ostream.h"
+#include "llvm/XRay/Graph.h"
 #include "llvm/XRay/Trace.h"
 #include "llvm/XRay/XRayRecord.h"
 
@@ -49,21 +51,22 @@ public:
     std::string getAsString(StatType T) const;
     double compare(StatType T, const TimeStat &Other) const;
   };
+  typedef uint64_t TimestampT;
 
   /// An inner struct for storing edge attributes for our graph. Here the
   /// attributes are mainly function call statistics.
   ///
   /// FIXME: expand to contain more information eg call latencies.
-  struct EdgeAttribute {
+  struct CallStats {
     TimeStat S;
-    std::vector<uint64_t> Timings;
+    std::vector<TimestampT> Timings;
   };
 
   /// An Inner Struct for storing vertex attributes, at the moment just
   /// SymbolNames, however in future we could store bulk function statistics.
   ///
   /// FIXME: Store more attributes based on instrumentation map.
-  struct VertexAttribute {
+  struct FunctionStats {
     std::string SymbolName;
     TimeStat S;
   };
@@ -78,17 +81,15 @@ public:
   typedef DenseMap<llvm::sys::ProcessInfo::ProcessId, FunctionStack>
       PerThreadFunctionStackMap;
 
-private:
-  /// The Graph stored in an edge-list like format, with the edges also having
-  /// An attached set of attributes.
-  DenseMap<int32_t, DenseMap<int32_t, EdgeAttribute>> Graph;
-
-  /// Graph Vertex Attributes. These are presently stored seperate from the
-  /// main graph.
-  DenseMap<int32_t, VertexAttribute> VertexAttrs;
+  class GraphT : public Graph<FunctionStats, CallStats, int32_t> {
+  public:
+    TimeStat GraphEdgeMax = {};
+    TimeStat GraphVertexMax = {};
+  };
 
-  TimeStat GraphEdgeMax;
-  TimeStat GraphVertexMax;
+  GraphT G;
+  typedef typename decltype(G)::VertexIdentifier VertexIdentifier;
+  typedef typename decltype(G)::EdgeIdentifier EdgeIdentifier;
 
   /// Use a Map to store the Function stack for each thread whilst building the
   /// graph.
@@ -97,9 +98,9 @@ private:
   PerThreadFunctionStackMap PerThreadFunctionStack;
 
   /// Usefull object for getting human readable Symbol Names.
-  FuncIdConversionHelper &FuncIdHelper;
+  const FuncIdConversionHelper &FuncIdHelper;
   bool DeduceSiblingCalls = false;
-  uint64_t CurrentMaxTSC = 0;
+  TimestampT CurrentMaxTSC = 0;
 
   /// A private function to help implement the statistic generation functions;
   template <typename U>
@@ -117,11 +118,17 @@ private:
   /// Normalises latency statistics for each edge and vertex by CycleFrequency;
   void normalizeStatistics(double CycleFrequency);
 
+  /// Helper that turns a statistic ratio into a gradient color string.
+  ColorHelper CHelper;
+
 public:
   /// Takes in a reference to a FuncIdHelper in order to have ready access to
   /// Symbol names.
-  explicit GraphRenderer(FuncIdConversionHelper &FuncIdHelper, bool DSC)
-      : FuncIdHelper(FuncIdHelper), DeduceSiblingCalls(DSC) {}
+  explicit GraphRenderer(const FuncIdConversionHelper &FuncIdHelper, bool DSC)
+      : FuncIdHelper(FuncIdHelper), DeduceSiblingCalls(DSC),
+        CHelper(ColorHelper::SequentialScheme::OrRd) {
+    G[0] = {};
+  }
 
   /// Process an Xray record and expand the graph.
   ///
@@ -132,7 +139,7 @@ public:
   /// FIXME: Make this more robust against small irregularities.
   Error accountRecord(const XRayRecord &Record);
 
-  const PerThreadFunctionStackMap getPerThreadFunctionStack() const {
+  const PerThreadFunctionStackMap &getPerThreadFunctionStack() const {
     return PerThreadFunctionStack;
   }
 
@@ -143,6 +150,13 @@ public:
                         StatType EdgeColor = StatType::NONE,
                         StatType VertexLabel = StatType::NONE,
                         StatType VertexColor = StatType::NONE);
+
+  /// Recalculate edge and vertex statistics, then return the internal graph.
+  const GraphT &getGraph() {
+    calculateEdgeStatistics();
+    calculateVertexStatistics();
+    return G;
+  }
 };
 }
 }
diff --git a/tools/llvm-xray/xray-sleds.h b/tools/llvm-xray/xray-sleds.h
deleted file mode 100644
index 99279579ed471f2274924b9f3f42803670486f65..0000000000000000000000000000000000000000
--- a/tools/llvm-xray/xray-sleds.h
+++ /dev/null
@@ -1,32 +0,0 @@
-//===- xray-sleds.h - XRay Sleds Data Structure ---------------------------===//
-//
-//                     The LLVM Compiler Infrastructure
-//
-// This file is distributed under the University of Illinois Open Source
-// License. See LICENSE.TXT for details.
-//
-//===----------------------------------------------------------------------===//
-//
-// Defines the structure used to represent XRay instrumentation map entries.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_TOOLS_LLVM_XRAY_XRAY_SLEDS_H
-#define LLVM_TOOLS_LLVM_XRAY_XRAY_SLEDS_H
-
-namespace llvm {
-namespace xray {
-
-struct SledEntry {
-  enum class FunctionKinds { ENTRY, EXIT, TAIL };
-
-  uint64_t Address;
-  uint64_t Function;
-  FunctionKinds Kind;
-  bool AlwaysInstrument;
-};
-
-} // namespace xray
-} // namespace llvm
-
-#endif // LLVM_TOOLS_LLVM_XRAY_XRAY_SLEDS_H
diff --git a/tools/lto/lto.cpp b/tools/lto/lto.cpp
index a1dd4ebbccba5887d87f3e9335d6af0ec586f470..1b218a64cbf5695b8753079e2bb1ea95f5b7e716 100644
--- a/tools/lto/lto.cpp
+++ b/tools/lto/lto.cpp
@@ -44,9 +44,13 @@ static cl::opt<bool>
 DisableGVNLoadPRE("disable-gvn-loadpre", cl::init(false),
   cl::desc("Do not run the GVN load PRE pass"));
 
-static cl::opt<bool>
-DisableLTOVectorization("disable-lto-vectorization", cl::init(false),
-  cl::desc("Do not run loop or slp vectorization during LTO"));
+static cl::opt<bool> DisableLTOVectorization(
+    "disable-lto-vectorization", cl::init(false),
+    cl::desc("Do not run loop or slp vectorization during LTO"));
+
+static cl::opt<bool> EnableFreestanding(
+    "lto-freestanding", cl::init(false),
+    cl::desc("Enable Freestanding (disable builtins / TLI) during LTO"));
 
 #ifdef NDEBUG
 static bool VerifyByDefault = false;
@@ -159,6 +163,7 @@ static void lto_add_attrs(lto_code_gen_t cg) {
   if (OptLevel < '0' || OptLevel > '3')
     report_fatal_error("Optimization level must be between 0 and 3");
   CG->setOptLevel(OptLevel - '0');
+  CG->setFreestanding(EnableFreestanding);
 }
 
 extern const char* lto_get_version() {
@@ -267,7 +272,7 @@ lto_module_t lto_module_create_in_local_context(const void *mem, size_t length,
   lto_initialize();
   llvm::TargetOptions Options = InitTargetOptionsFromCodeGenFlags();
 
-  // Create a local context. Ownership will be transfered to LTOModule.
+  // Create a local context. Ownership will be transferred to LTOModule.
   std::unique_ptr<LLVMContext> Context = llvm::make_unique<LLVMContext>();
   Context->setDiagnosticHandler(diagnosticHandler, nullptr, true);
 
@@ -464,6 +469,7 @@ thinlto_code_gen_t thinlto_create_codegen(void) {
   lto_initialize();
   ThinLTOCodeGenerator *CodeGen = new ThinLTOCodeGenerator();
   CodeGen->setTargetOptions(InitTargetOptionsFromCodeGenFlags());
+  CodeGen->setFreestanding(EnableFreestanding);
 
   if (OptLevel.getNumOccurrences()) {
     if (OptLevel < '0' || OptLevel > '3')
diff --git a/tools/msbuild/CMakeLists.txt b/tools/msbuild/CMakeLists.txt
index 4f471e5408ba42410acb9a48fdadc2df69e7df6a..9d132ea58d5d9dc48822c4b0cc1f0c278522ff4b 100644
--- a/tools/msbuild/CMakeLists.txt
+++ b/tools/msbuild/CMakeLists.txt
@@ -1,4 +1,4 @@
-if (WIN32)
+if (MSVC)
   # CPack will install a registry key in this format that we wish to reference.
   set(REG_KEY "${CPACK_PACKAGE_INSTALL_REGISTRY_KEY}")
   set(LIB_PATH_VERSION "${LLVM_VERSION_MAJOR}.${LLVM_VERSION_MINOR}")
diff --git a/tools/obj2yaml/CMakeLists.txt b/tools/obj2yaml/CMakeLists.txt
index 0fab6d104dce9da73a5698682c0b16b7090fde5a..ecd958d75b37b81c01bf84232eb35387160b4995 100644
--- a/tools/obj2yaml/CMakeLists.txt
+++ b/tools/obj2yaml/CMakeLists.txt
@@ -11,5 +11,6 @@ add_llvm_tool(obj2yaml
   dwarf2yaml.cpp
   elf2yaml.cpp
   macho2yaml.cpp
+  wasm2yaml.cpp
   Error.cpp
   )
diff --git a/tools/obj2yaml/dwarf2yaml.cpp b/tools/obj2yaml/dwarf2yaml.cpp
index 4e320cff441a0cdc4d242d7e6a93522385c144a3..d41b44c0681080a2149721dc3cd93e02a42a0b9d 100644
--- a/tools/obj2yaml/dwarf2yaml.cpp
+++ b/tools/obj2yaml/dwarf2yaml.cpp
@@ -17,6 +17,13 @@
 
 using namespace llvm;
 
+void dumpInitialLength(DataExtractor &Data, uint32_t &Offset,
+                       DWARFYAML::InitialLength &InitialLength) {
+  InitialLength.TotalLength = Data.getU32(&Offset);
+  if (InitialLength.isDWARF64())
+    InitialLength.TotalLength64 = Data.getU64(&Offset);
+}
+
 void dumpDebugAbbrev(DWARFContextInMemory &DCtx, DWARFYAML::Data &Y) {
   auto AbbrevSetPtr = DCtx.getDebugAbbrev();
   if (AbbrevSetPtr) {
@@ -31,6 +38,8 @@ void dumpDebugAbbrev(DWARFContextInMemory &DCtx, DWARFYAML::Data &Y) {
           DWARFYAML::AttributeAbbrev AttAbrv;
           AttAbrv.Attribute = Attribute.Attr;
           AttAbrv.Form = Attribute.Form;
+          if (AttAbrv.Form == dwarf::DW_FORM_implicit_const)
+            AttAbrv.Value = *Attribute.ByteSizeOrValue;
           Abbrv.Attributes.push_back(AttAbrv);
         }
         Y.AbbrevDecls.push_back(Abbrv);
@@ -55,7 +64,7 @@ void dumpDebugARanges(DWARFContextInMemory &DCtx, DWARFYAML::Data &Y) {
 
   while (Set.extract(ArangesData, &Offset)) {
     DWARFYAML::ARange Range;
-    Range.Length = Set.getHeader().Length;
+    Range.Length.setLength(Set.getHeader().Length);
     Range.Version = Set.getHeader().Version;
     Range.CuOffset = Set.getHeader().CuOffset;
     Range.AddrSize = Set.getHeader().AddrSize;
@@ -74,11 +83,11 @@ void dumpPubSection(DWARFContextInMemory &DCtx, DWARFYAML::PubSection &Y,
                     StringRef Section) {
   DataExtractor PubSectionData(Section, DCtx.isLittleEndian(), 0);
   uint32_t Offset = 0;
-  Y.Length = PubSectionData.getU32(&Offset);
+  dumpInitialLength(PubSectionData, Offset, Y.Length);
   Y.Version = PubSectionData.getU16(&Offset);
   Y.UnitOffset = PubSectionData.getU32(&Offset);
   Y.UnitSize = PubSectionData.getU32(&Offset);
-  while (Offset < Y.Length) {
+  while (Offset < Y.Length.getLength()) {
     DWARFYAML::PubEntry NewEntry;
     NewEntry.DieOffset = PubSectionData.getU32(&Offset);
     if (Y.IsGNUStyle)
@@ -105,8 +114,10 @@ void dumpDebugPubSections(DWARFContextInMemory &DCtx, DWARFYAML::Data &Y) {
 void dumpDebugInfo(DWARFContextInMemory &DCtx, DWARFYAML::Data &Y) {
   for (const auto &CU : DCtx.compile_units()) {
     DWARFYAML::Unit NewUnit;
-    NewUnit.Length = CU->getLength();
+    NewUnit.Length.setLength(CU->getLength());
     NewUnit.Version = CU->getVersion();
+    if(NewUnit.Version >= 5)
+      NewUnit.Type = (dwarf::UnitType)CU->getUnitType();
     NewUnit.AbbrOffset = CU->getAbbreviations()->getOffset();
     NewUnit.AddrSize = CU->getAddressByteSize();
     for (auto DIE : CU->dies()) {
@@ -168,6 +179,8 @@ void dumpDebugInfo(DWARFContextInMemory &DCtx, DWARFYAML::Data &Y) {
             case dwarf::DW_FORM_data8:
             case dwarf::DW_FORM_sdata:
             case dwarf::DW_FORM_udata:
+            case dwarf::DW_FORM_ref_sup4:
+            case dwarf::DW_FORM_ref_sup8:
               if (auto Val = FormValue.getValue().getAsUnsignedConstant())
                 NewValue.Value = Val.getValue();
               break;
@@ -189,7 +202,6 @@ void dumpDebugInfo(DWARFContextInMemory &DCtx, DWARFYAML::Data &Y) {
             case dwarf::DW_FORM_GNU_strp_alt:
             case dwarf::DW_FORM_line_strp:
             case dwarf::DW_FORM_strp_sup:
-            case dwarf::DW_FORM_ref_sup:
             case dwarf::DW_FORM_GNU_str_index:
               if (auto Val = FormValue.getValue().getAsCStringOffset())
                 NewValue.Value = Val.getValue();
@@ -233,14 +245,9 @@ void dumpDebugLines(DWARFContextInMemory &DCtx, DWARFYAML::Data &Y) {
       DataExtractor LineData(DCtx.getLineSection().Data, DCtx.isLittleEndian(),
                              CU->getAddressByteSize());
       uint32_t Offset = *StmtOffset;
-      uint64_t SizeOfPrologueLength = 4;
-      DebugLines.TotalLength = LineData.getU32(&Offset);
-      uint64_t LineTableLength = DebugLines.TotalLength;
-      if (DebugLines.TotalLength == UINT32_MAX) {
-        DebugLines.TotalLength64 = LineData.getU64(&Offset);
-        LineTableLength = DebugLines.TotalLength64;
-        SizeOfPrologueLength = 8;
-      }
+      dumpInitialLength(LineData, Offset, DebugLines.Length);
+      uint64_t LineTableLength = DebugLines.Length.getLength();
+      uint64_t SizeOfPrologueLength = DebugLines.Length.isDWARF64() ? 8 : 4;
       DebugLines.Version = LineData.getU16(&Offset);
       DebugLines.PrologueLength =
           LineData.getUnsigned(&Offset, SizeOfPrologueLength);
diff --git a/tools/obj2yaml/obj2yaml.cpp b/tools/obj2yaml/obj2yaml.cpp
index 3f9373ee17e38e9881b2f024b379cd041094eb7e..31712af263627fe9f963ea67b6bc0821312c3b19 100644
--- a/tools/obj2yaml/obj2yaml.cpp
+++ b/tools/obj2yaml/obj2yaml.cpp
@@ -24,6 +24,8 @@ static std::error_code dumpObject(const ObjectFile &Obj) {
     return coff2yaml(outs(), cast<COFFObjectFile>(Obj));
   if (Obj.isELF())
     return elf2yaml(outs(), Obj);
+  if (Obj.isWasm())
+    return wasm2yaml(outs(), cast<WasmObjectFile>(Obj));
 
   return obj2yaml_error::unsupported_obj_file_format;
 }
diff --git a/tools/obj2yaml/obj2yaml.h b/tools/obj2yaml/obj2yaml.h
index 70d4ebdd3d14ea87c01e3f1f4a7da401a2a698e2..69c753296efda97de979b8b13c6082a919f4153f 100644
--- a/tools/obj2yaml/obj2yaml.h
+++ b/tools/obj2yaml/obj2yaml.h
@@ -14,6 +14,7 @@
 #define LLVM_TOOLS_OBJ2YAML_OBJ2YAML_H
 
 #include "llvm/Object/COFF.h"
+#include "llvm/Object/Wasm.h"
 #include "llvm/Support/raw_ostream.h"
 #include <system_error>
 
@@ -23,6 +24,8 @@ std::error_code elf2yaml(llvm::raw_ostream &Out,
                          const llvm::object::ObjectFile &Obj);
 std::error_code macho2yaml(llvm::raw_ostream &Out,
                            const llvm::object::Binary &Obj);
+std::error_code wasm2yaml(llvm::raw_ostream &Out,
+                          const llvm::object::WasmObjectFile &Obj);
 
 // Forward decls for dwarf2yaml
 namespace llvm {
diff --git a/tools/obj2yaml/wasm2yaml.cpp b/tools/obj2yaml/wasm2yaml.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f6b530c41969da66d491bf90e00d53424cde2e20
--- /dev/null
+++ b/tools/obj2yaml/wasm2yaml.cpp
@@ -0,0 +1,219 @@
+//===------ utils/wasm2yaml.cpp - obj2yaml conversion tool ------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "obj2yaml.h"
+#include "llvm/Object/COFF.h"
+#include "llvm/ObjectYAML/WasmYAML.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/YAMLTraits.h"
+
+using namespace llvm;
+
+namespace {
+
+class WasmDumper {
+  const object::WasmObjectFile &Obj;
+
+public:
+  WasmDumper(const object::WasmObjectFile &O) : Obj(O) {}
+  ErrorOr<WasmYAML::Object *> dump();
+};
+
+ErrorOr<WasmYAML::Object *> WasmDumper::dump() {
+  auto Y = make_unique<WasmYAML::Object>();
+
+  // Dump header
+  Y->Header.Version = Obj.getHeader().Version;
+
+  // Dump sections
+  for (const auto &Sec : Obj.sections()) {
+    const object::WasmSection &WasmSec = Obj.getWasmSection(Sec);
+    std::unique_ptr<WasmYAML::Section> S;
+    switch (WasmSec.Type) {
+    case wasm::WASM_SEC_CUSTOM: {
+      if (WasmSec.Name.startswith("reloc.")) {
+        // Relocations are attached to the sections they apply to rather than
+        // being represented as a custom section in the YAML output.
+        continue;
+      }
+      auto CustomSec = make_unique<WasmYAML::CustomSection>();
+      CustomSec->Name = WasmSec.Name;
+      CustomSec->Payload = yaml::BinaryRef(WasmSec.Content);
+      S = std::move(CustomSec);
+      break;
+    }
+    case wasm::WASM_SEC_TYPE: {
+      auto TypeSec = make_unique<WasmYAML::TypeSection>();
+      uint32_t Index = 0;
+      for (const auto &FunctionSig : Obj.types()) {
+        WasmYAML::Signature Sig;
+        Sig.Index = Index++;
+        Sig.ReturnType = FunctionSig.ReturnType;
+        for (const auto &ParamType : FunctionSig.ParamTypes)
+          Sig.ParamTypes.push_back(ParamType);
+        TypeSec->Signatures.push_back(Sig);
+      }
+      S = std::move(TypeSec);
+      break;
+    }
+    case wasm::WASM_SEC_IMPORT: {
+      auto ImportSec = make_unique<WasmYAML::ImportSection>();
+      for (auto &Import : Obj.imports()) {
+        WasmYAML::Import Ex;
+        Ex.Module = Import.Module;
+        Ex.Field = Import.Field;
+        Ex.Kind = Import.Kind;
+        if (Ex.Kind == wasm::WASM_EXTERNAL_FUNCTION) {
+          Ex.SigIndex = Import.SigIndex;
+        } else if (Ex.Kind == wasm::WASM_EXTERNAL_GLOBAL) {
+          Ex.GlobalType = Import.GlobalType;
+          Ex.GlobalMutable = Import.GlobalMutable;
+        }
+        ImportSec->Imports.push_back(Ex);
+      }
+      S = std::move(ImportSec);
+      break;
+    }
+    case wasm::WASM_SEC_FUNCTION: {
+      auto FuncSec = make_unique<WasmYAML::FunctionSection>();
+      for (const auto &Func : Obj.functionTypes()) {
+        FuncSec->FunctionTypes.push_back(Func);
+      }
+      S = std::move(FuncSec);
+      break;
+    }
+    case wasm::WASM_SEC_TABLE: {
+      auto TableSec = make_unique<WasmYAML::TableSection>();
+      for (auto &Table : Obj.tables()) {
+        WasmYAML::Table T;
+        T.ElemType = Table.ElemType;
+        T.TableLimits.Flags = Table.Limits.Flags;
+        T.TableLimits.Initial = Table.Limits.Initial;
+        T.TableLimits.Maximum = Table.Limits.Maximum;
+        TableSec->Tables.push_back(T);
+      }
+      S = std::move(TableSec);
+      break;
+    }
+    case wasm::WASM_SEC_MEMORY: {
+      auto MemorySec = make_unique<WasmYAML::MemorySection>();
+      for (auto &Memory : Obj.memories()) {
+        WasmYAML::Limits L;
+        L.Flags = Memory.Flags;
+        L.Initial = Memory.Initial;
+        L.Maximum = Memory.Maximum;
+        MemorySec->Memories.push_back(L);
+      }
+      S = std::move(MemorySec);
+      break;
+    }
+    case wasm::WASM_SEC_GLOBAL: {
+      auto GlobalSec = make_unique<WasmYAML::GlobalSection>();
+      for (auto &Global : Obj.globals()) {
+        WasmYAML::Global G;
+        G.Type = Global.Type;
+        G.Mutable = Global.Mutable;
+        G.InitExpr = Global.InitExpr;
+        GlobalSec->Globals.push_back(G);
+      }
+      S = std::move(GlobalSec);
+      break;
+    }
+    case wasm::WASM_SEC_START: {
+      auto StartSec = make_unique<WasmYAML::StartSection>();
+      StartSec->StartFunction = Obj.startFunction();
+      S = std::move(StartSec);
+      break;
+    }
+    case wasm::WASM_SEC_EXPORT: {
+      auto ExportSec = make_unique<WasmYAML::ExportSection>();
+      for (auto &Export : Obj.exports()) {
+        WasmYAML::Export Ex;
+        Ex.Name = Export.Name;
+        Ex.Kind = Export.Kind;
+        Ex.Index = Export.Index;
+        ExportSec->Exports.push_back(Ex);
+      }
+      S = std::move(ExportSec);
+      break;
+    }
+    case wasm::WASM_SEC_ELEM: {
+      auto ElemSec = make_unique<WasmYAML::ElemSection>();
+      for (auto &Segment : Obj.elements()) {
+        WasmYAML::ElemSegment Seg;
+        Seg.TableIndex = Segment.TableIndex;
+        Seg.Offset = Segment.Offset;
+        for (auto &Func : Segment.Functions) {
+          Seg.Functions.push_back(Func);
+        }
+        ElemSec->Segments.push_back(Seg);
+      }
+      S = std::move(ElemSec);
+      break;
+    }
+    case wasm::WASM_SEC_CODE: {
+      auto CodeSec = make_unique<WasmYAML::CodeSection>();
+      for (auto &Func : Obj.functions()) {
+        WasmYAML::Function Function;
+        for (auto &Local : Func.Locals) {
+          WasmYAML::LocalDecl LocalDecl;
+          LocalDecl.Type = Local.Type;
+          LocalDecl.Count = Local.Count;
+          Function.Locals.push_back(LocalDecl);
+        }
+        Function.Body = yaml::BinaryRef(Func.Body);
+        CodeSec->Functions.push_back(Function);
+      }
+      S = std::move(CodeSec);
+      break;
+    }
+    case wasm::WASM_SEC_DATA: {
+      auto DataSec = make_unique<WasmYAML::DataSection>();
+      for (auto &Segment : Obj.dataSegments()) {
+        WasmYAML::DataSegment Seg;
+        Seg.Index = Segment.Index;
+        Seg.Offset = Segment.Offset;
+        Seg.Content = yaml::BinaryRef(Segment.Content);
+        DataSec->Segments.push_back(Seg);
+      }
+      S = std::move(DataSec);
+      break;
+    }
+    default:
+      llvm_unreachable("Unknown section type");
+      break;
+    }
+    for (const wasm::WasmRelocation &Reloc: WasmSec.Relocations) {
+      WasmYAML::Relocation R;
+      R.Type = Reloc.Type;
+      R.Index = Reloc.Index;
+      R.Offset = Reloc.Offset;
+      R.Addend = Reloc.Addend;
+      S->Relocations.push_back(R);
+    }
+    Y->Sections.push_back(std::move(S));
+  }
+
+  return Y.release();
+}
+
+} // namespace
+
+std::error_code wasm2yaml(raw_ostream &Out, const object::WasmObjectFile &Obj) {
+  WasmDumper Dumper(Obj);
+  ErrorOr<WasmYAML::Object *> YAMLOrErr = Dumper.dump();
+  if (std::error_code EC = YAMLOrErr.getError())
+    return EC;
+
+  std::unique_ptr<WasmYAML::Object> YAML(YAMLOrErr.get());
+  yaml::Output Yout(Out);
+  Yout << *YAML;
+
+  return std::error_code();
+}
diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp
index d5c74eef700f95f611aaf1c04aad336fbe43dea6..40459e559986b99b4a952143f8aa32d496c580f7 100644
--- a/tools/opt/opt.cpp
+++ b/tools/opt/opt.cpp
@@ -102,6 +102,11 @@ static cl::opt<bool>
     OutputThinLTOBC("thinlto-bc",
                     cl::desc("Write output as ThinLTO-ready bitcode"));
 
+static cl::opt<std::string> ThinLinkBitcodeFile(
+    "thin-link-bitcode-file", cl::value_desc("filename"),
+    cl::desc(
+        "A file in which to write minimized bitcode for the thin link only"));
+
 static cl::opt<bool>
 NoVerify("disable-verify", cl::desc("Do not run the verifier"), cl::Hidden);
 
@@ -201,10 +206,10 @@ static cl::opt<bool>
 PrintBreakpoints("print-breakpoints-for-testing",
                  cl::desc("Print select breakpoints location for testing"));
 
-static cl::opt<std::string>
-DefaultDataLayout("default-data-layout",
-          cl::desc("data layout string to use if not specified by module"),
-          cl::value_desc("layout-string"), cl::init(""));
+static cl::opt<std::string> ClDataLayout("data-layout",
+                                         cl::desc("data layout string to use"),
+                                         cl::value_desc("layout-string"),
+                                         cl::init(""));
 
 static cl::opt<bool> PreserveBitcodeUseListOrder(
     "preserve-bc-uselistorder",
@@ -268,7 +273,7 @@ static void AddOptimizationPasses(legacy::PassManagerBase &MPM,
   if (DisableInline) {
     // No inlining pass
   } else if (OptLevel > 1) {
-    Builder.Inliner = createFunctionInliningPass(OptLevel, SizeLevel);
+    Builder.Inliner = createFunctionInliningPass(OptLevel, SizeLevel, false);
   } else {
     Builder.Inliner = createAlwaysInlinerLegacyPass();
   }
@@ -448,12 +453,15 @@ int main(int argc, char **argv) {
     return 1;
   }
 
-  // If we are supposed to override the target triple, do so now.
+  // If we are supposed to override the target triple or data layout, do so now.
   if (!TargetTriple.empty())
     M->setTargetTriple(Triple::normalize(TargetTriple));
+  if (!ClDataLayout.empty())
+    M->setDataLayout(ClDataLayout);
 
   // Figure out what stream we are supposed to write to...
   std::unique_ptr<tool_output_file> Out;
+  std::unique_ptr<tool_output_file> ThinLinkOut;
   if (NoOutput) {
     if (!OutputFilename.empty())
       errs() << "WARNING: The -o (output filename) option is ignored when\n"
@@ -469,6 +477,15 @@ int main(int argc, char **argv) {
       errs() << EC.message() << '\n';
       return 1;
     }
+
+    if (!ThinLinkBitcodeFile.empty()) {
+      ThinLinkOut.reset(
+          new tool_output_file(ThinLinkBitcodeFile, EC, sys::fs::F_None));
+      if (EC) {
+        errs() << EC.message() << '\n';
+        return 1;
+      }
+    }
   }
 
   Triple ModuleTriple(M->getTargetTriple());
@@ -530,12 +547,6 @@ int main(int argc, char **argv) {
     TLII.disableAllFunctions();
   Passes.add(new TargetLibraryInfoWrapperPass(TLII));
 
-  // Add an appropriate DataLayout instance for this module.
-  const DataLayout &DL = M->getDataLayout();
-  if (DL.isDefault() && !DefaultDataLayout.empty()) {
-    M->setDataLayout(DefaultDataLayout);
-  }
-
   // Add internal analysis passes from the target machine.
   Passes.add(createTargetTransformInfoWrapperPass(TM ? TM->getTargetIRAnalysis()
                                                      : TargetIRAnalysis()));
@@ -704,7 +715,8 @@ int main(int argc, char **argv) {
         report_fatal_error("Text output is incompatible with -module-hash");
       Passes.add(createPrintModulePass(*OS, "", PreserveAssemblyUseListOrder));
     } else if (OutputThinLTOBC)
-      Passes.add(createWriteThinLTOBitcodePass(*OS));
+      Passes.add(createWriteThinLTOBitcodePass(
+          *OS, ThinLinkOut ? &ThinLinkOut->os() : nullptr));
     else
       Passes.add(createBitcodeWriterPass(*OS, PreserveBitcodeUseListOrder,
                                          EmitSummaryIndex, EmitModuleHash));
@@ -751,5 +763,8 @@ int main(int argc, char **argv) {
   if (YamlFile)
     YamlFile->keep();
 
+  if (ThinLinkOut)
+    ThinLinkOut->keep();
+
   return 0;
 }
diff --git a/tools/sancov/coverage-report-server.py b/tools/sancov/coverage-report-server.py
index ac3206cba393b19ac6960b15a24453fb3f4d441d..428276f95d3b398c90e11a3b067b9c0e2f63866a 100755
--- a/tools/sancov/coverage-report-server.py
+++ b/tools/sancov/coverage-report-server.py
@@ -138,7 +138,7 @@ class ServerHandler(http.server.BaseHTTPRequestHandler):
                 if not file_coverage:
                     continue
                 filelist.append(
-                        "<tr><td><a href=\"/{name}\">{name}</a></td>"
+                        "<tr><td><a href=\"./{name}\">{name}</a></td>"
                         "<td>{coverage}%</td></tr>".format(
                             name=html.escape(filename, quote=True), 
                             coverage=format_pct(file_coverage)))
@@ -165,7 +165,7 @@ class ServerHandler(http.server.BaseHTTPRequestHandler):
                         ["<span class='{cls}'>{line}&nbsp;</span>".format(
                             line=html.escape(line.rstrip()), 
                             cls=linemap.get(line_no, ""))
-                            for line_no, line in enumerate(f)])
+                            for line_no, line in enumerate(f, start=1)])
 
             response = string.Template(CONTENT_PAGE_TMPL).safe_substitute(
                 path=self.path[1:],
diff --git a/tools/sancov/sancov.cc b/tools/sancov/sancov.cc
index c56ea67b12cd19041438334c9f61258aad3089d6..7f103ebb904b6c1250b47f609d0c0d999c494dee 100644
--- a/tools/sancov/sancov.cc
+++ b/tools/sancov/sancov.cc
@@ -96,7 +96,8 @@ cl::opt<ActionType> Action(
 
 static cl::list<std::string>
     ClInputFiles(cl::Positional, cl::OneOrMore,
-                 cl::desc("(<binary file>|<.sancov file>)..."));
+                 cl::desc("<action> <binary files...> <.sancov files...> "
+                          "<.symcov files...>"));
 
 static cl::opt<bool> ClDemangle("demangle", cl::init(true),
                                 cl::desc("Print demangled function name."));
@@ -179,7 +180,7 @@ struct CoverageStats {
 // --------- ERROR HANDLING ---------
 
 static void fail(const llvm::Twine &E) {
-  errs() << "Error: " << E << "\n";
+  errs() << "ERROR: " << E << "\n";
   exit(1);
 }
 
@@ -191,7 +192,7 @@ static void failIf(bool B, const llvm::Twine &E) {
 static void failIfError(std::error_code Error) {
   if (!Error)
     return;
-  errs() << "Error: " << Error.message() << "(" << Error.value() << ")\n";
+  errs() << "ERROR: " << Error.message() << "(" << Error.value() << ")\n";
   exit(1);
 }
 
@@ -201,7 +202,7 @@ template <typename T> static void failIfError(const ErrorOr<T> &E) {
 
 static void failIfError(Error Err) {
   if (Err) {
-    logAllUnhandledErrors(std::move(Err), errs(), "Error: ");
+    logAllUnhandledErrors(std::move(Err), errs(), "ERROR: ");
     exit(1);
   }
 }
@@ -1085,6 +1086,9 @@ static void readAndPrintRawCoverage(const std::vector<std::string> &FileNames,
 
 static std::unique_ptr<SymbolizedCoverage>
 merge(const std::vector<std::unique_ptr<SymbolizedCoverage>> &Coverages) {
+  if (Coverages.empty())
+    return nullptr;
+
   auto Result = make_unique<SymbolizedCoverage>();
 
   for (size_t I = 0; I < Coverages.size(); ++I) {
@@ -1167,11 +1171,17 @@ readSymbolizeAndMergeCmdArguments(std::vector<std::string> FileNames) {
       CoverageByObjFile[Iter->second].push_back(FileName);
     };
 
+    for (const auto &Pair : ObjFiles) {
+      auto FileName = Pair.second;
+      if (CoverageByObjFile.find(FileName) == CoverageByObjFile.end())
+        errs() << "WARNING: No coverage file for " << FileName << "\n";
+    }
+
     // Read raw coverage and symbolize it.
     for (const auto &Pair : CoverageByObjFile) {
       if (findSanitizerCovFunctions(Pair.first).empty()) {
         errs()
-            << "Ignoring " << Pair.first
+            << "WARNING: Ignoring " << Pair.first
             << " and its coverage because  __sanitizer_cov* functions were not "
                "found.\n";
         continue;
@@ -1200,7 +1210,17 @@ int main(int Argc, char **Argv) {
   llvm::InitializeAllTargetMCs();
   llvm::InitializeAllDisassemblers();
 
-  cl::ParseCommandLineOptions(Argc, Argv, "Sanitizer Coverage Processing Tool");
+  cl::ParseCommandLineOptions(Argc, Argv, 
+      "Sanitizer Coverage Processing Tool (sancov)\n\n"
+      "  This tool can extract various coverage-related information from: \n"
+      "  coverage-instrumented binary files, raw .sancov files and their "
+      "symbolized .symcov version.\n"
+      "  Depending on chosen action the tool expects different input files:\n"
+      "    -print-coverage-pcs     - coverage-instrumented binary files\n"
+      "    -print-coverage         - .sancov files\n"
+      "    <other actions>         - .sancov files & corresponding binary "
+      "files, .symcov files\n"
+      );
 
   // -print doesn't need object files.
   if (Action == PrintAction) {
diff --git a/tools/yaml2obj/CMakeLists.txt b/tools/yaml2obj/CMakeLists.txt
index 885a69f5d3c36b810185f45daee7f630a0b52149..a885547598d852ad5faae899034263b68f0142ba 100644
--- a/tools/yaml2obj/CMakeLists.txt
+++ b/tools/yaml2obj/CMakeLists.txt
@@ -10,4 +10,5 @@ add_llvm_tool(yaml2obj
   yaml2coff.cpp
   yaml2elf.cpp
   yaml2macho.cpp
+  yaml2wasm.cpp
   )
diff --git a/tools/yaml2obj/yaml2obj.cpp b/tools/yaml2obj/yaml2obj.cpp
index f746d84a3898570a17fe377270d327327cae9cf4..e64e3dc1d17987575b68e188a22b1df88278b143 100644
--- a/tools/yaml2obj/yaml2obj.cpp
+++ b/tools/yaml2obj/yaml2obj.cpp
@@ -40,31 +40,33 @@ DocNum("docnum", cl::init(1),
 static cl::opt<std::string> OutputFilename("o", cl::desc("Output filename"),
                                            cl::value_desc("filename"));
 
+LLVM_ATTRIBUTE_NORETURN static void error(Twine Message) {
+  errs() << Message << "\n";
+  exit(1);
+}
+
 static int convertYAML(yaml::Input &YIn, raw_ostream &Out) {
   unsigned CurDocNum = 0;
   do {
     if (++CurDocNum == DocNum) {
       yaml::YamlObjectFile Doc;
       YIn >> Doc;
-      if (YIn.error()) {
-        errs() << "yaml2obj: Failed to parse YAML file!\n";
-        return 1;
-      }
-
+      if (YIn.error())
+        error("yaml2obj: Failed to parse YAML file!");
       if (Doc.Elf)
         return yaml2elf(*Doc.Elf, Out);
       if (Doc.Coff)
         return yaml2coff(*Doc.Coff, Out);
       if (Doc.MachO || Doc.FatMachO)
         return yaml2macho(Doc, Out);
-      errs() << "yaml2obj: Unknown document type!\n";
-      return 1;
+      if (Doc.Wasm)
+        return yaml2wasm(*Doc.Wasm, Out);
+      error("yaml2obj: Unknown document type!");
     }
   } while (YIn.nextDocument());
 
-  errs() << "yaml2obj: Cannot find the " << DocNum
-         << llvm::getOrdinalSuffix(DocNum) << " document\n";
-  return 1;
+  error("yaml2obj: Cannot find the " + utostr(DocNum) +
+        llvm::getOrdinalSuffix(DocNum) + " document");
 }
 
 int main(int argc, char **argv) {
@@ -79,10 +81,8 @@ int main(int argc, char **argv) {
   std::error_code EC;
   std::unique_ptr<tool_output_file> Out(
       new tool_output_file(OutputFilename, EC, sys::fs::F_None));
-  if (EC) {
-    errs() << EC.message() << '\n';
-    return 1;
-  }
+  if (EC)
+    error("yaml2obj: Error opening '" + OutputFilename + "': " + EC.message());
 
   ErrorOr<std::unique_ptr<MemoryBuffer>> Buf =
       MemoryBuffer::getFileOrSTDIN(Input);
diff --git a/tools/yaml2obj/yaml2obj.h b/tools/yaml2obj/yaml2obj.h
index b5025e860bd751281be43b9daf83b5e6af8b8bba..cb8f11904916cbaf706c4e66f83c89b376876aba 100644
--- a/tools/yaml2obj/yaml2obj.h
+++ b/tools/yaml2obj/yaml2obj.h
@@ -23,6 +23,10 @@ namespace ELFYAML {
 struct Object;
 }
 
+namespace WasmYAML {
+struct Object;
+}
+
 namespace yaml {
 class Input;
 struct YamlObjectFile;
@@ -32,5 +36,6 @@ struct YamlObjectFile;
 int yaml2coff(llvm::COFFYAML::Object &Doc, llvm::raw_ostream &Out);
 int yaml2elf(llvm::ELFYAML::Object &Doc, llvm::raw_ostream &Out);
 int yaml2macho(llvm::yaml::YamlObjectFile &Doc, llvm::raw_ostream &Out);
+int yaml2wasm(llvm::WasmYAML::Object &Doc, llvm::raw_ostream &Out);
 
 #endif
diff --git a/tools/yaml2obj/yaml2wasm.cpp b/tools/yaml2obj/yaml2wasm.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..55267ce0392d9868d98904a0142623f188f41b7d
--- /dev/null
+++ b/tools/yaml2obj/yaml2wasm.cpp
@@ -0,0 +1,377 @@
+//===- yaml2wasm - Convert YAML to a Wasm object file --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// \brief The Wasm component of yaml2obj.
+///
+//===----------------------------------------------------------------------===//
+//
+#include "yaml2obj.h"
+#include "llvm/Object/Wasm.h"
+#include "llvm/ObjectYAML/ObjectYAML.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/LEB128.h"
+
+using namespace llvm;
+
+/// This parses a yaml stream that represents a Wasm object file.
+/// See docs/yaml2obj for the yaml schema.
+class WasmWriter {
+public:
+  WasmWriter(WasmYAML::Object &Obj) : Obj(Obj) {}
+  int writeWasm(raw_ostream &OS);
+  int writeRelocSection(raw_ostream &OS, WasmYAML::Section &Sec);
+  int writeSectionContent(raw_ostream &OS, WasmYAML::CustomSection &Section);
+  int writeSectionContent(raw_ostream &OS, WasmYAML::TypeSection &Section);
+  int writeSectionContent(raw_ostream &OS, WasmYAML::ImportSection &Section);
+  int writeSectionContent(raw_ostream &OS, WasmYAML::FunctionSection &Section);
+  int writeSectionContent(raw_ostream &OS, WasmYAML::TableSection &Section);
+  int writeSectionContent(raw_ostream &OS, WasmYAML::MemorySection &Section);
+  int writeSectionContent(raw_ostream &OS, WasmYAML::GlobalSection &Section);
+  int writeSectionContent(raw_ostream &OS, WasmYAML::ExportSection &Section);
+  int writeSectionContent(raw_ostream &OS, WasmYAML::StartSection &Section);
+  int writeSectionContent(raw_ostream &OS, WasmYAML::ElemSection &Section);
+  int writeSectionContent(raw_ostream &OS, WasmYAML::CodeSection &Section);
+  int writeSectionContent(raw_ostream &OS, WasmYAML::DataSection &Section);
+
+private:
+  WasmYAML::Object &Obj;
+};
+
+static int writeUint64(raw_ostream &OS, uint64_t Value) {
+  char Data[sizeof(Value)];
+  support::endian::write64le(Data, Value);
+  OS.write(Data, sizeof(Data));
+  return 0;
+}
+
+static int writeUint32(raw_ostream &OS, uint32_t Value) {
+  char Data[sizeof(Value)];
+  support::endian::write32le(Data, Value);
+  OS.write(Data, sizeof(Data));
+  return 0;
+}
+
+static int writeUint8(raw_ostream &OS, uint8_t Value) {
+  char Data[sizeof(Value)];
+  memcpy(Data, &Value, sizeof(Data));
+  OS.write(Data, sizeof(Data));
+  return 0;
+}
+
+static int writeStringRef(StringRef &Str, raw_ostream &OS) {
+  encodeULEB128(Str.size(), OS);
+  OS << Str;
+  return 0;
+}
+
+static int writeLimits(WasmYAML::Limits Lim, raw_ostream &OS) {
+  encodeULEB128(Lim.Flags, OS);
+  encodeULEB128(Lim.Initial, OS);
+  if (Lim.Flags & wasm::WASM_LIMITS_FLAG_HAS_MAX)
+    encodeULEB128(Lim.Maximum, OS);
+  return 0;
+}
+
+static int writeInitExpr(wasm::WasmInitExpr InitExpr, raw_ostream &OS) {
+  writeUint8(OS, InitExpr.Opcode); // opcode byte precedes its immediate
+  switch (InitExpr.Opcode) {
+  case wasm::WASM_OPCODE_I32_CONST:
+    encodeSLEB128(InitExpr.Value.Int32, OS);
+    break;
+  case wasm::WASM_OPCODE_I64_CONST:
+    encodeSLEB128(InitExpr.Value.Int64, OS);
+    break;
+  case wasm::WASM_OPCODE_F32_CONST:
+    writeUint32(OS, InitExpr.Value.Float32);
+    break;
+  case wasm::WASM_OPCODE_F64_CONST:
+    writeUint64(OS, InitExpr.Value.Float64);
+    break;
+  case wasm::WASM_OPCODE_GET_GLOBAL:
+    encodeULEB128(InitExpr.Value.Global, OS);
+    break;
+  default:
+    errs() << "Unknown opcode in init_expr: " << InitExpr.Opcode << "\n";
+    return 1; // unknown opcode: report failure, END is not emitted
+  }
+  writeUint8(OS, wasm::WASM_OPCODE_END); // every handled expr ends with END
+  return 0;
+}
+
+int WasmWriter::writeSectionContent(raw_ostream &OS,
+                                    WasmYAML::CustomSection &Section) {
+  // writeStringRef(Section.Name, OS);
+  // encodeULEB128(Section.Payload.binary_size(), OS);
+  Section.Payload.writeAsBinary(OS);
+  return 0;
+}
+
+int WasmWriter::writeSectionContent(raw_ostream &OS,
+                                    WasmYAML::TypeSection &Section) {
+  encodeULEB128(Section.Signatures.size(), OS);
+  for (auto &Sig : Section.Signatures) {
+    encodeSLEB128(Sig.Form, OS);
+    encodeULEB128(Sig.ParamTypes.size(), OS);
+    for (auto ParamType : Sig.ParamTypes)
+      encodeSLEB128(ParamType, OS);
+    if (Sig.ReturnType == wasm::WASM_TYPE_NORESULT) {
+      encodeSLEB128(0, OS);
+    } else {
+      encodeULEB128(1, OS);
+      encodeSLEB128(Sig.ReturnType, OS);
+    }
+  }
+  return 0;
+}
+
+int WasmWriter::writeSectionContent(raw_ostream &OS,
+                                    WasmYAML::ImportSection &Section) {
+  encodeULEB128(Section.Imports.size(), OS); // entry count comes first
+  for (auto &Import : Section.Imports) {
+    writeStringRef(Import.Module, OS);
+    writeStringRef(Import.Field, OS);
+    encodeULEB128(Import.Kind, OS);
+    switch (Import.Kind) {
+    case wasm::WASM_EXTERNAL_FUNCTION:
+      encodeULEB128(Import.SigIndex, OS);
+      break;
+    case wasm::WASM_EXTERNAL_GLOBAL:
+      encodeSLEB128(Import.GlobalType, OS);
+      writeUint8(OS, Import.GlobalMutable);
+      break;
+    default:
+      errs() << "Unknown import type: " << Import.Kind << "\n";
+      return 1; // only function and global imports are handled here
+    }
+  }
+  return 0;
+}
+
+int WasmWriter::writeSectionContent(raw_ostream &OS,
+                                    WasmYAML::FunctionSection &Section) {
+  encodeULEB128(Section.FunctionTypes.size(), OS);
+  for (uint32_t FuncType : Section.FunctionTypes) {
+    encodeULEB128(FuncType, OS);
+  }
+  return 0;
+}
+
+int WasmWriter::writeSectionContent(raw_ostream &OS,
+                                    WasmYAML::ExportSection &Section) {
+  encodeULEB128(Section.Exports.size(), OS);
+  for (auto &Export : Section.Exports) {
+    writeStringRef(Export.Name, OS);
+    encodeULEB128(Export.Kind, OS);
+    encodeULEB128(Export.Index, OS);
+  }
+  return 0;
+}
+
+int WasmWriter::writeSectionContent(raw_ostream &OS,
+                                    WasmYAML::StartSection &Section) {
+  encodeULEB128(Section.StartFunction, OS);
+  return 0;
+}
+
+int WasmWriter::writeSectionContent(raw_ostream &OS,
+                                    WasmYAML::TableSection &Section) {
+  encodeULEB128(Section.Tables.size(), OS);
+  for (auto &Table : Section.Tables) {
+    encodeSLEB128(Table.ElemType, OS);
+    writeLimits(Table.TableLimits, OS);
+  }
+  return 0;
+}
+
+int WasmWriter::writeSectionContent(raw_ostream &OS,
+                                    WasmYAML::MemorySection &Section) {
+  encodeULEB128(Section.Memories.size(), OS);
+  for (auto &Mem : Section.Memories) {
+    writeLimits(Mem, OS);
+  }
+  return 0;
+}
+
+int WasmWriter::writeSectionContent(raw_ostream &OS,
+                                    WasmYAML::GlobalSection &Section) {
+  encodeULEB128(Section.Globals.size(), OS);
+  for (auto &Global : Section.Globals) {
+    encodeSLEB128(Global.Type, OS);
+    writeUint8(OS, Global.Mutable);
+    writeInitExpr(Global.InitExpr, OS);
+  }
+  return 0;
+}
+
+int WasmWriter::writeSectionContent(raw_ostream &OS,
+                                    WasmYAML::ElemSection &Section) {
+  encodeULEB128(Section.Segments.size(), OS);
+  for (auto &Segment : Section.Segments) {
+    encodeULEB128(Segment.TableIndex, OS);
+    writeInitExpr(Segment.Offset, OS);
+
+    encodeULEB128(Segment.Functions.size(), OS);
+    for (auto &Function : Segment.Functions) {
+      encodeULEB128(Function, OS);
+    }
+  }
+  return 0;
+}
+
+int WasmWriter::writeSectionContent(raw_ostream &OS,
+                                    WasmYAML::CodeSection &Section) {
+  encodeULEB128(Section.Functions.size(), OS);
+  for (auto &Func : Section.Functions) {
+    std::string OutString;
+    raw_string_ostream StringStream(OutString);
+
+    encodeULEB128(Func.Locals.size(), StringStream);
+    for (auto &LocalDecl : Func.Locals) {
+      encodeULEB128(LocalDecl.Count, StringStream);
+      encodeSLEB128(LocalDecl.Type, StringStream);
+    }
+
+    Func.Body.writeAsBinary(StringStream);
+
+    // Write the section size followed by the content
+    StringStream.flush();
+    encodeULEB128(OutString.size(), OS);
+    OS << OutString;
+  }
+  return 0;
+}
+
+int WasmWriter::writeSectionContent(raw_ostream &OS,
+                                    WasmYAML::DataSection &Section) {
+  encodeULEB128(Section.Segments.size(), OS);
+  for (auto &Segment : Section.Segments) {
+    encodeULEB128(Segment.Index, OS);
+    writeInitExpr(Segment.Offset, OS);
+    encodeULEB128(Segment.Content.binary_size(), OS);
+    Segment.Content.writeAsBinary(OS);
+  }
+  return 0;
+}
+
+int WasmWriter::writeRelocSection(raw_ostream &OS,
+                                  WasmYAML::Section &Sec) {
+  StringRef Name;
+  switch (Sec.Type) {
+    case wasm::WASM_SEC_CODE:
+      Name = "reloc.CODE";
+      break;
+    case wasm::WASM_SEC_DATA:
+      Name = "reloc.DATA";
+      break;
+    default:
+      llvm_unreachable("not yet implemented");
+      return 1;
+  }
+
+  writeStringRef(Name, OS);
+  encodeULEB128(Sec.Type, OS);
+  encodeULEB128(Sec.Relocations.size(), OS);
+
+  for (auto Reloc: Sec.Relocations) {
+    encodeULEB128(Reloc.Type, OS);
+    encodeULEB128(Reloc.Offset, OS);
+    encodeULEB128(Reloc.Index, OS);
+    switch (Reloc.Type) {
+      case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_LEB:
+      case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_SLEB:
+      case wasm::R_WEBASSEMBLY_GLOBAL_ADDR_I32:
+        encodeULEB128(Reloc.Addend, OS);
+    }
+  }
+  return 0;
+}
+
+
+int WasmWriter::writeWasm(raw_ostream &OS) {
+  // Write headers
+  OS.write(wasm::WasmMagic, sizeof(wasm::WasmMagic));
+  writeUint32(OS, Obj.Header.Version);
+
+  // Write each section
+  for (const std::unique_ptr<WasmYAML::Section> &Sec : Obj.Sections) {
+    encodeULEB128(Sec->Type, OS);
+
+    std::string OutString;
+    raw_string_ostream StringStream(OutString);
+    if (auto S = dyn_cast<WasmYAML::CustomSection>(Sec.get())) {
+      if (auto Err = writeSectionContent(StringStream, *S))
+        return Err;
+    } else if (auto S = dyn_cast<WasmYAML::TypeSection>(Sec.get())) {
+      if (auto Err = writeSectionContent(StringStream, *S))
+        return Err;
+    } else if (auto S = dyn_cast<WasmYAML::ImportSection>(Sec.get())) {
+      if (auto Err = writeSectionContent(StringStream, *S))
+        return Err;
+    } else if (auto S = dyn_cast<WasmYAML::FunctionSection>(Sec.get())) {
+      if (auto Err = writeSectionContent(StringStream, *S))
+        return Err;
+    } else if (auto S = dyn_cast<WasmYAML::TableSection>(Sec.get())) {
+      if (auto Err = writeSectionContent(StringStream, *S))
+        return Err;
+    } else if (auto S = dyn_cast<WasmYAML::MemorySection>(Sec.get())) {
+      if (auto Err = writeSectionContent(StringStream, *S))
+        return Err;
+    } else if (auto S = dyn_cast<WasmYAML::GlobalSection>(Sec.get())) {
+      if (auto Err = writeSectionContent(StringStream, *S))
+        return Err;
+    } else if (auto S = dyn_cast<WasmYAML::ExportSection>(Sec.get())) {
+      if (auto Err = writeSectionContent(StringStream, *S))
+        return Err;
+    } else if (auto S = dyn_cast<WasmYAML::StartSection>(Sec.get())) {
+      if (auto Err = writeSectionContent(StringStream, *S))
+        return Err;
+    } else if (auto S = dyn_cast<WasmYAML::ElemSection>(Sec.get())) {
+      if (auto Err = writeSectionContent(StringStream, *S))
+        return Err;
+    } else if (auto S = dyn_cast<WasmYAML::CodeSection>(Sec.get())) {
+      if (auto Err = writeSectionContent(StringStream, *S))
+        return Err;
+    } else if (auto S = dyn_cast<WasmYAML::DataSection>(Sec.get())) {
+      if (auto Err = writeSectionContent(StringStream, *S))
+        return Err;
+    } else {
+      errs() << "Unknown section type: " << Sec->Type << "\n";
+      return 1;
+    }
+    StringStream.flush();
+
+    // Write the section size followed by the content
+    encodeULEB128(OutString.size(), OS);
+    OS << OutString;
+  }
+
+  // Write reloc sections for any section that has relocations
+  for (const std::unique_ptr<WasmYAML::Section> &Sec : Obj.Sections) {
+    if (Sec->Relocations.empty())
+      continue;
+
+    encodeULEB128(wasm::WASM_SEC_CUSTOM, OS);
+    std::string OutString;
+    raw_string_ostream StringStream(OutString);
+    writeRelocSection(StringStream, *Sec);
+    StringStream.flush();
+
+    encodeULEB128(OutString.size(), OS);
+    OS << OutString;
+  }
+
+  return 0;
+}
+
+int yaml2wasm(llvm::WasmYAML::Object &Doc, raw_ostream &Out) {
+  WasmWriter Writer(Doc);
+
+  return Writer.writeWasm(Out);
+}
diff --git a/unittests/ADT/APFloatTest.cpp b/unittests/ADT/APFloatTest.cpp
index 1cd98478fad0d56f37778aeedd443154700888fb..378c48d7e0a6b27a8fd018cae5240fc1263c5ea6 100644
--- a/unittests/ADT/APFloatTest.cpp
+++ b/unittests/ADT/APFloatTest.cpp
@@ -3192,6 +3192,70 @@ TEST(APFloatTest, frexp) {
   EXPECT_TRUE(APFloat(APFloat::IEEEdouble(), "0x1.c60f120d9f87cp-1").bitwiseIsEqual(Frac));
 }
 
+TEST(APFloatTest, mod) {
+  {
+    APFloat f1(APFloat::IEEEdouble(), "1.5");
+    APFloat f2(APFloat::IEEEdouble(), "1.0");
+    APFloat expected(APFloat::IEEEdouble(), "0.5");
+    EXPECT_EQ(f1.mod(f2), APFloat::opOK);
+    EXPECT_TRUE(f1.bitwiseIsEqual(expected));
+  }
+  {
+    APFloat f1(APFloat::IEEEdouble(), "0.5");
+    APFloat f2(APFloat::IEEEdouble(), "1.0");
+    APFloat expected(APFloat::IEEEdouble(), "0.5");
+    EXPECT_EQ(f1.mod(f2), APFloat::opOK);
+    EXPECT_TRUE(f1.bitwiseIsEqual(expected));
+  }
+  {
+    APFloat f1(APFloat::IEEEdouble(), "0x1.3333333333333p-2"); // 0.3
+    APFloat f2(APFloat::IEEEdouble(), "0x1.47ae147ae147bp-7"); // 0.01
+    APFloat expected(APFloat::IEEEdouble(),
+                     "0x1.47ae147ae1471p-7"); // 0.009999999999999983
+    EXPECT_EQ(f1.mod(f2), APFloat::opOK);
+    EXPECT_TRUE(f1.bitwiseIsEqual(expected));
+  }
+  {
+    APFloat f1(APFloat::IEEEdouble(), "0x1p64"); // 1.8446744073709552e19
+    APFloat f2(APFloat::IEEEdouble(), "1.5");
+    APFloat expected(APFloat::IEEEdouble(), "1.0");
+    EXPECT_EQ(f1.mod(f2), APFloat::opOK);
+    EXPECT_TRUE(f1.bitwiseIsEqual(expected));
+  }
+  {
+    APFloat f1(APFloat::IEEEdouble(), "0x1p1000");
+    APFloat f2(APFloat::IEEEdouble(), "0x1p-1000");
+    APFloat expected(APFloat::IEEEdouble(), "0.0");
+    EXPECT_EQ(f1.mod(f2), APFloat::opOK);
+    EXPECT_TRUE(f1.bitwiseIsEqual(expected));
+  }
+  {
+    APFloat f1(APFloat::IEEEdouble(), "0.0");
+    APFloat f2(APFloat::IEEEdouble(), "1.0");
+    APFloat expected(APFloat::IEEEdouble(), "0.0");
+    EXPECT_EQ(f1.mod(f2), APFloat::opOK);
+    EXPECT_TRUE(f1.bitwiseIsEqual(expected));
+  }
+  {
+    APFloat f1(APFloat::IEEEdouble(), "1.0");
+    APFloat f2(APFloat::IEEEdouble(), "0.0");
+    EXPECT_EQ(f1.mod(f2), APFloat::opInvalidOp);
+    EXPECT_TRUE(f1.isNaN());
+  }
+  {
+    APFloat f1(APFloat::IEEEdouble(), "0.0");
+    APFloat f2(APFloat::IEEEdouble(), "0.0");
+    EXPECT_EQ(f1.mod(f2), APFloat::opInvalidOp);
+    EXPECT_TRUE(f1.isNaN());
+  }
+  {
+    APFloat f1 = APFloat::getInf(APFloat::IEEEdouble(), false);
+    APFloat f2(APFloat::IEEEdouble(), "1.0");
+    EXPECT_EQ(f1.mod(f2), APFloat::opInvalidOp);
+    EXPECT_TRUE(f1.isNaN());
+  }
+}
+
 TEST(APFloatTest, PPCDoubleDoubleAddSpecial) {
   using DataType = std::tuple<uint64_t, uint64_t, uint64_t, uint64_t,
                               APFloat::fltCategory, APFloat::roundingMode>;
diff --git a/unittests/ADT/APIntTest.cpp b/unittests/ADT/APIntTest.cpp
index ac6d9ba589128d20b0b6b246d6e9a11e12340951..9962cc9fa788547c2fae1599de7eab95457c90cd 100644
--- a/unittests/ADT/APIntTest.cpp
+++ b/unittests/ADT/APIntTest.cpp
@@ -63,6 +63,26 @@ TEST(APIntTest, i33_Count) {
   EXPECT_EQ(((uint64_t)-2)&((1ull<<33) -1), i33minus2.getZExtValue());
 }
 
+TEST(APIntTest, i61_Count) {
+  APInt i61(61, 1 << 15);
+  EXPECT_EQ(45u, i61.countLeadingZeros());
+  EXPECT_EQ(0u, i61.countLeadingOnes());
+  EXPECT_EQ(16u, i61.getActiveBits());
+  EXPECT_EQ(15u, i61.countTrailingZeros());
+  EXPECT_EQ(1u, i61.countPopulation());
+  EXPECT_EQ(static_cast<int64_t>(1 << 15), i61.getSExtValue());
+  EXPECT_EQ(static_cast<uint64_t>(1 << 15), i61.getZExtValue());
+
+  i61.setBits(8, 19);
+  EXPECT_EQ(42u, i61.countLeadingZeros());
+  EXPECT_EQ(0u, i61.countLeadingOnes());
+  EXPECT_EQ(19u, i61.getActiveBits());
+  EXPECT_EQ(8u, i61.countTrailingZeros());
+  EXPECT_EQ(11u, i61.countPopulation());
+  EXPECT_EQ(static_cast<int64_t>((1 << 19) - (1 << 8)), i61.getSExtValue());
+  EXPECT_EQ(static_cast<uint64_t>((1 << 19) - (1 << 8)), i61.getZExtValue());
+}
+
 TEST(APIntTest, i65_Count) {
   APInt i65(65, 0, true);
   EXPECT_EQ(65u, i65.countLeadingZeros());
@@ -118,6 +138,80 @@ TEST(APIntTest, i128_PositiveCount) {
   EXPECT_EQ(1u, one.countPopulation());
   EXPECT_EQ(1, one.getSExtValue());
   EXPECT_EQ(1u, one.getZExtValue());
+
+  APInt s128(128, 2, true);
+  EXPECT_EQ(126u, s128.countLeadingZeros());
+  EXPECT_EQ(0u, s128.countLeadingOnes());
+  EXPECT_EQ(2u, s128.getActiveBits());
+  EXPECT_EQ(1u, s128.countTrailingZeros());
+  EXPECT_EQ(0u, s128.countTrailingOnes());
+  EXPECT_EQ(1u, s128.countPopulation());
+  EXPECT_EQ(2, s128.getSExtValue());
+  EXPECT_EQ(2u, s128.getZExtValue());
+
+  // NOP Test
+  s128.setBits(42, 42);
+  EXPECT_EQ(126u, s128.countLeadingZeros());
+  EXPECT_EQ(0u, s128.countLeadingOnes());
+  EXPECT_EQ(2u, s128.getActiveBits());
+  EXPECT_EQ(1u, s128.countTrailingZeros());
+  EXPECT_EQ(0u, s128.countTrailingOnes());
+  EXPECT_EQ(1u, s128.countPopulation());
+  EXPECT_EQ(2, s128.getSExtValue());
+  EXPECT_EQ(2u, s128.getZExtValue());
+
+  s128.setBits(3, 32);
+  EXPECT_EQ(96u, s128.countLeadingZeros());
+  EXPECT_EQ(0u, s128.countLeadingOnes());
+  EXPECT_EQ(32u, s128.getActiveBits());
+  EXPECT_EQ(33u, s128.getMinSignedBits());
+  EXPECT_EQ(1u, s128.countTrailingZeros());
+  EXPECT_EQ(0u, s128.countTrailingOnes());
+  EXPECT_EQ(30u, s128.countPopulation());
+  EXPECT_EQ(static_cast<uint32_t>((~0u << 3) | 2), s128.getZExtValue());
+
+  s128.setBits(62, 128);
+  EXPECT_EQ(0u, s128.countLeadingZeros());
+  EXPECT_EQ(66u, s128.countLeadingOnes());
+  EXPECT_EQ(128u, s128.getActiveBits());
+  EXPECT_EQ(63u, s128.getMinSignedBits());
+  EXPECT_EQ(1u, s128.countTrailingZeros());
+  EXPECT_EQ(0u, s128.countTrailingOnes());
+  EXPECT_EQ(96u, s128.countPopulation());
+  EXPECT_EQ(static_cast<int64_t>((3ull << 62) |
+                                 static_cast<uint32_t>((~0u << 3) | 2)),
+            s128.getSExtValue());
+}
+
+TEST(APIntTest, i256) {
+  APInt s256(256, 15, true);
+  EXPECT_EQ(252u, s256.countLeadingZeros());
+  EXPECT_EQ(0u, s256.countLeadingOnes());
+  EXPECT_EQ(4u, s256.getActiveBits());
+  EXPECT_EQ(0u, s256.countTrailingZeros());
+  EXPECT_EQ(4u, s256.countTrailingOnes());
+  EXPECT_EQ(4u, s256.countPopulation());
+  EXPECT_EQ(15, s256.getSExtValue());
+  EXPECT_EQ(15u, s256.getZExtValue());
+
+  s256.setBits(62, 66);
+  EXPECT_EQ(190u, s256.countLeadingZeros());
+  EXPECT_EQ(0u, s256.countLeadingOnes());
+  EXPECT_EQ(66u, s256.getActiveBits());
+  EXPECT_EQ(67u, s256.getMinSignedBits());
+  EXPECT_EQ(0u, s256.countTrailingZeros());
+  EXPECT_EQ(4u, s256.countTrailingOnes());
+  EXPECT_EQ(8u, s256.countPopulation());
+
+  s256.setBits(60, 256);
+  EXPECT_EQ(0u, s256.countLeadingZeros());
+  EXPECT_EQ(196u, s256.countLeadingOnes());
+  EXPECT_EQ(256u, s256.getActiveBits());
+  EXPECT_EQ(61u, s256.getMinSignedBits());
+  EXPECT_EQ(0u, s256.countTrailingZeros());
+  EXPECT_EQ(4u, s256.countTrailingOnes());
+  EXPECT_EQ(200u, s256.countPopulation());
+  EXPECT_EQ(static_cast<int64_t>((~0ull << 60) | 15), s256.getSExtValue());
 }
 
 TEST(APIntTest, i1) {
@@ -446,6 +540,58 @@ TEST(APIntTest, compareLargeIntegers) {
   EXPECT_TRUE(!MinusTwo.slt(MinusTwo));
 }
 
+TEST(APIntTest, binaryOpsWithRawIntegers) {
+  // Single word check.
+  uint64_t E1 = 0x2CA7F46BF6569915ULL;
+  APInt A1(64, E1);
+
+  EXPECT_EQ(A1 & E1, E1);
+  EXPECT_EQ(A1 & 0, 0);
+  EXPECT_EQ(A1 & 1, 1);
+  EXPECT_EQ(A1 & 5, 5);
+  EXPECT_EQ(A1 & UINT64_MAX, E1);
+
+  EXPECT_EQ(A1 | E1, E1);
+  EXPECT_EQ(A1 | 0, E1);
+  EXPECT_EQ(A1 | 1, E1);
+  EXPECT_EQ(A1 | 2, E1 | 2);
+  EXPECT_EQ(A1 | UINT64_MAX, UINT64_MAX);
+
+  EXPECT_EQ(A1 ^ E1, 0);
+  EXPECT_EQ(A1 ^ 0, E1);
+  EXPECT_EQ(A1 ^ 1, E1 ^ 1);
+  EXPECT_EQ(A1 ^ 7, E1 ^ 7);
+  EXPECT_EQ(A1 ^ UINT64_MAX, ~E1);
+
+  // Multiword check.
+  uint64_t N = 0xEB6EB136591CBA21ULL;
+  APInt::WordType E2[4] = {
+    N,
+    0x7B9358BD6A33F10AULL,
+    0x7E7FFA5EADD8846ULL,
+    0x305F341CA00B613DULL
+  };
+  APInt A2(APInt::APINT_BITS_PER_WORD*4, E2);
+
+  EXPECT_EQ(A2 & N, N);
+  EXPECT_EQ(A2 & 0, 0);
+  EXPECT_EQ(A2 & 1, 1);
+  EXPECT_EQ(A2 & 5, 1);
+  EXPECT_EQ(A2 & UINT64_MAX, N);
+
+  EXPECT_EQ(A2 | N, A2);
+  EXPECT_EQ(A2 | 0, A2);
+  EXPECT_EQ(A2 | 1, A2);
+  EXPECT_EQ(A2 | 2, A2 + 2);
+  EXPECT_EQ(A2 | UINT64_MAX, A2 - N + UINT64_MAX);
+
+  EXPECT_EQ(A2 ^ N, A2 - N);
+  EXPECT_EQ(A2 ^ 0, A2);
+  EXPECT_EQ(A2 ^ 1, A2 - 1);
+  EXPECT_EQ(A2 ^ 7, A2 + 5);
+  EXPECT_EQ(A2 ^ UINT64_MAX, A2 - N + ~N);
+}
+
 TEST(APIntTest, rvalue_arithmetic) {
   // Test all combinations of lvalue/rvalue lhs/rhs of add/sub
 
@@ -615,6 +761,150 @@ TEST(APIntTest, rvalue_arithmetic) {
   }
 }
 
+TEST(APIntTest, rvalue_bitwise) {
+  // Test all combinations of lvalue/rvalue lhs/rhs of and/or/xor
+
+  // Lambda to return an APInt by value, but also provide the raw value of the
+  // allocated data.
+  auto getRValue = [](const char *HexString, uint64_t const *&RawData) {
+    APInt V(129, HexString, 16);
+    RawData = V.getRawData();
+    return V;
+  };
+
+  APInt Ten(129, "A", 16);
+  APInt Twelve(129, "C", 16);
+
+  const uint64_t *RawDataL = nullptr;
+  const uint64_t *RawDataR = nullptr;
+
+  {
+    // 12 & 10 = 8
+    APInt AndLL = Ten & Twelve;
+    EXPECT_EQ(AndLL, 0x8);
+
+    APInt AndLR = Ten & getRValue("C", RawDataR);
+    EXPECT_EQ(AndLR, 0x8);
+    EXPECT_EQ(AndLR.getRawData(), RawDataR);
+
+    APInt AndRL = getRValue("A", RawDataL) & Twelve;
+    EXPECT_EQ(AndRL, 0x8);
+    EXPECT_EQ(AndRL.getRawData(), RawDataL);
+
+    APInt AndRR = getRValue("A", RawDataL) & getRValue("C", RawDataR);
+    EXPECT_EQ(AndRR, 0x8);
+    EXPECT_EQ(AndRR.getRawData(), RawDataR);
+
+    // LValue's and constants
+    APInt AndLK = Ten & 0xc;
+    EXPECT_EQ(AndLK, 0x8);
+
+    APInt AndKL = 0xa & Twelve;
+    EXPECT_EQ(AndKL, 0x8);
+
+    // RValue's and constants
+    APInt AndRK = getRValue("A", RawDataL) & 0xc;
+    EXPECT_EQ(AndRK, 0x8);
+    EXPECT_EQ(AndRK.getRawData(), RawDataL);
+
+    APInt AndKR = 0xa & getRValue("C", RawDataR);
+    EXPECT_EQ(AndKR, 0x8);
+    EXPECT_EQ(AndKR.getRawData(), RawDataR);
+  }
+
+  {
+    // 12 | 10 = 14
+    APInt OrLL = Ten | Twelve;
+    EXPECT_EQ(OrLL, 0xe);
+
+    APInt OrLR = Ten | getRValue("C", RawDataR);
+    EXPECT_EQ(OrLR, 0xe);
+    EXPECT_EQ(OrLR.getRawData(), RawDataR);
+
+    APInt OrRL = getRValue("A", RawDataL) | Twelve;
+    EXPECT_EQ(OrRL, 0xe);
+    EXPECT_EQ(OrRL.getRawData(), RawDataL);
+
+    APInt OrRR = getRValue("A", RawDataL) | getRValue("C", RawDataR);
+    EXPECT_EQ(OrRR, 0xe);
+    EXPECT_EQ(OrRR.getRawData(), RawDataR);
+
+    // LValue's and constants
+    APInt OrLK = Ten | 0xc;
+    EXPECT_EQ(OrLK, 0xe);
+
+    APInt OrKL = 0xa | Twelve;
+    EXPECT_EQ(OrKL, 0xe);
+
+    // RValue's and constants
+    APInt OrRK = getRValue("A", RawDataL) | 0xc;
+    EXPECT_EQ(OrRK, 0xe);
+    EXPECT_EQ(OrRK.getRawData(), RawDataL);
+
+    APInt OrKR = 0xa | getRValue("C", RawDataR);
+    EXPECT_EQ(OrKR, 0xe);
+    EXPECT_EQ(OrKR.getRawData(), RawDataR);
+  }
+
+  {
+    // 12 ^ 10 = 6
+    APInt XorLL = Ten ^ Twelve;
+    EXPECT_EQ(XorLL, 0x6);
+
+    APInt XorLR = Ten ^ getRValue("C", RawDataR);
+    EXPECT_EQ(XorLR, 0x6);
+    EXPECT_EQ(XorLR.getRawData(), RawDataR);
+
+    APInt XorRL = getRValue("A", RawDataL) ^ Twelve;
+    EXPECT_EQ(XorRL, 0x6);
+    EXPECT_EQ(XorRL.getRawData(), RawDataL);
+
+    APInt XorRR = getRValue("A", RawDataL) ^ getRValue("C", RawDataR);
+    EXPECT_EQ(XorRR, 0x6);
+    EXPECT_EQ(XorRR.getRawData(), RawDataR);
+
+    // LValue's and constants
+    APInt XorLK = Ten ^ 0xc;
+    EXPECT_EQ(XorLK, 0x6);
+
+    APInt XorKL = 0xa ^ Twelve;
+    EXPECT_EQ(XorKL, 0x6);
+
+    // RValue's and constants
+    APInt XorRK = getRValue("A", RawDataL) ^ 0xc;
+    EXPECT_EQ(XorRK, 0x6);
+    EXPECT_EQ(XorRK.getRawData(), RawDataL);
+
+    APInt XorKR = 0xa ^ getRValue("C", RawDataR);
+    EXPECT_EQ(XorKR, 0x6);
+    EXPECT_EQ(XorKR.getRawData(), RawDataR);
+  }
+}
+
+TEST(APIntTest, rvalue_invert) {
+  // Lambda to return an APInt by value, but also provide the raw value of the
+  // allocated data.
+  auto getRValue = [](const char *HexString, uint64_t const *&RawData) {
+    APInt V(129, HexString, 16);
+    RawData = V.getRawData();
+    return V;
+  };
+
+  APInt One(129, 1);
+  APInt NegativeTwo(129, -2ULL, true);
+
+  const uint64_t *RawData = nullptr;
+
+  {
+    // ~1 = -2
+    APInt NegL = ~One;
+    EXPECT_EQ(NegL, NegativeTwo);
+
+    APInt NegR = ~getRValue("1", RawData);
+    EXPECT_EQ(NegR, NegativeTwo);
+    EXPECT_EQ(NegR.getRawData(), RawData);
+  }
+}
 
 // Tests different div/rem varaints using scheme (a * b + c) / a
 void testDiv(APInt a, APInt b, APInt c) {
@@ -731,7 +1021,6 @@ TEST(APIntTest, fromString) {
   EXPECT_EQ(APInt(32, uint64_t(-3LL)), APInt(32,  "-11", 2));
   EXPECT_EQ(APInt(32, uint64_t(-4LL)), APInt(32, "-100", 2));
 
-
   EXPECT_EQ(APInt(32,  0), APInt(32,  "0",  8));
   EXPECT_EQ(APInt(32,  1), APInt(32,  "1",  8));
   EXPECT_EQ(APInt(32,  7), APInt(32,  "7",  8));
@@ -753,7 +1042,6 @@ TEST(APIntTest, fromString) {
   EXPECT_EQ(APInt(32, uint64_t(-15LL)), APInt(32,  "-17", 8));
   EXPECT_EQ(APInt(32, uint64_t(-16LL)), APInt(32,  "-20", 8));
 
-
   EXPECT_EQ(APInt(32,  0), APInt(32,  "0", 10));
   EXPECT_EQ(APInt(32,  1), APInt(32,  "1", 10));
   EXPECT_EQ(APInt(32,  9), APInt(32,  "9", 10));
@@ -768,7 +1056,6 @@ TEST(APIntTest, fromString) {
   EXPECT_EQ(APInt(32, uint64_t(-19LL)), APInt(32, "-19", 10));
   EXPECT_EQ(APInt(32, uint64_t(-20LL)), APInt(32, "-20", 10));
 
-
   EXPECT_EQ(APInt(32,  0), APInt(32,  "0", 16));
   EXPECT_EQ(APInt(32,  1), APInt(32,  "1", 16));
   EXPECT_EQ(APInt(32, 15), APInt(32,  "F", 16));
@@ -789,7 +1076,7 @@ TEST(APIntTest, fromString) {
   EXPECT_EQ(APInt(32, 36), APInt(32, "10", 36));
   EXPECT_EQ(APInt(32, 71), APInt(32, "1Z", 36));
   EXPECT_EQ(APInt(32, 72), APInt(32, "20", 36));
-  
+
   EXPECT_EQ(APInt(32,  uint64_t(-0LL)), APInt(32,  "-0", 36));
   EXPECT_EQ(APInt(32,  uint64_t(-1LL)), APInt(32,  "-1", 36));
   EXPECT_EQ(APInt(32, uint64_t(-35LL)), APInt(32,  "-Z", 36));
@@ -1008,6 +1295,29 @@ TEST(APIntTest, Rotate) {
   EXPECT_EQ(APInt(8, 1),  APInt(8, 16).rotl(4));
   EXPECT_EQ(APInt(8, 16), APInt(8, 16).rotl(8));
 
+  EXPECT_EQ(APInt(32, 2), APInt(32, 1).rotl(33));
+  EXPECT_EQ(APInt(32, 2), APInt(32, 1).rotl(APInt(32, 33)));
+
+  EXPECT_EQ(APInt(32, 2), APInt(32, 1).rotl(33));
+  EXPECT_EQ(APInt(32, 2), APInt(32, 1).rotl(APInt(32, 33)));
+  EXPECT_EQ(APInt(32, 2), APInt(32, 1).rotl(APInt(33, 33)));
+  EXPECT_EQ(APInt(32, (1 << 8)), APInt(32, 1).rotl(APInt(32, 40)));
+  EXPECT_EQ(APInt(32, (1 << 30)), APInt(32, 1).rotl(APInt(31, 30)));
+  EXPECT_EQ(APInt(32, (1 << 31)), APInt(32, 1).rotl(APInt(31, 31)));
+
+  EXPECT_EQ(APInt(32, 1), APInt(32, 1).rotl(APInt(1, 0)));
+  EXPECT_EQ(APInt(32, 2), APInt(32, 1).rotl(APInt(1, 1)));
+
+  EXPECT_EQ(APInt(32, 16), APInt(32, 1).rotl(APInt(3, 4)));
+
+  EXPECT_EQ(APInt(32, 1), APInt(32, 1).rotl(APInt(64, 64)));
+  EXPECT_EQ(APInt(32, 2), APInt(32, 1).rotl(APInt(64, 65)));
+
+  EXPECT_EQ(APInt(7, 24), APInt(7, 3).rotl(APInt(7, 3)));
+  EXPECT_EQ(APInt(7, 24), APInt(7, 3).rotl(APInt(7, 10)));
+  EXPECT_EQ(APInt(7, 24), APInt(7, 3).rotl(APInt(5, 10)));
+  EXPECT_EQ(APInt(7, 6), APInt(7, 3).rotl(APInt(12, 120)));
+
   EXPECT_EQ(APInt(8, 16), APInt(8, 16).rotr(0));
   EXPECT_EQ(APInt(8, 8),  APInt(8, 16).rotr(1));
   EXPECT_EQ(APInt(8, 4),  APInt(8, 16).rotr(2));
@@ -1020,9 +1330,36 @@ TEST(APIntTest, Rotate) {
   EXPECT_EQ(APInt(8, 16),  APInt(8, 1).rotr(4));
   EXPECT_EQ(APInt(8, 1),   APInt(8, 1).rotr(8));
 
-  APInt Big(256, "00004000800000000000000000003fff8000000000000000", 16);
-  APInt Rot(256, "3fff80000000000000000000000000000000000040008000", 16);
+  EXPECT_EQ(APInt(32, (1 << 31)), APInt(32, 1).rotr(33));
+  EXPECT_EQ(APInt(32, (1 << 31)), APInt(32, 1).rotr(APInt(32, 33)));
+
+  EXPECT_EQ(APInt(32, (1 << 31)), APInt(32, 1).rotr(33));
+  EXPECT_EQ(APInt(32, (1 << 31)), APInt(32, 1).rotr(APInt(32, 33)));
+  EXPECT_EQ(APInt(32, (1 << 31)), APInt(32, 1).rotr(APInt(33, 33)));
+  EXPECT_EQ(APInt(32, (1 << 24)), APInt(32, 1).rotr(APInt(32, 40)));
+
+  EXPECT_EQ(APInt(32, (1 << 2)), APInt(32, 1).rotr(APInt(31, 30)));
+  EXPECT_EQ(APInt(32, (1 << 1)), APInt(32, 1).rotr(APInt(31, 31)));
+
+  EXPECT_EQ(APInt(32, 1), APInt(32, 1).rotr(APInt(1, 0)));
+  EXPECT_EQ(APInt(32, (1 << 31)), APInt(32, 1).rotr(APInt(1, 1)));
+
+  EXPECT_EQ(APInt(32, (1 << 28)), APInt(32, 1).rotr(APInt(3, 4)));
+
+  EXPECT_EQ(APInt(32, 1), APInt(32, 1).rotr(APInt(64, 64)));
+  EXPECT_EQ(APInt(32, (1 << 31)), APInt(32, 1).rotr(APInt(64, 65)));
+
+  EXPECT_EQ(APInt(7, 48), APInt(7, 3).rotr(APInt(7, 3)));
+  EXPECT_EQ(APInt(7, 48), APInt(7, 3).rotr(APInt(7, 10)));
+  EXPECT_EQ(APInt(7, 48), APInt(7, 3).rotr(APInt(5, 10)));
+  EXPECT_EQ(APInt(7, 65), APInt(7, 3).rotr(APInt(12, 120)));
+
+  APInt Big(256, "00004000800000000000000000003fff8000000000000003", 16);
+  APInt Rot(256, "3fff80000000000000030000000000000000000040008000", 16);
   EXPECT_EQ(Rot, Big.rotr(144));
+
+  EXPECT_EQ(APInt(32, 8), APInt(32, 1).rotl(Big));
+  EXPECT_EQ(APInt(32, (1 << 29)), APInt(32, 1).rotr(Big));
 }
 
 TEST(APIntTest, Splat) {
@@ -1040,63 +1377,63 @@ TEST(APIntTest, tcDecrement) {
 
   // No out borrow.
   {
-    integerPart singleWord = ~integerPart(0) << (integerPartWidth - 1);
-    integerPart carry = APInt::tcDecrement(&singleWord, 1);
-    EXPECT_EQ(carry, integerPart(0));
-    EXPECT_EQ(singleWord, ~integerPart(0) >> 1);
+    APInt::WordType singleWord = ~APInt::WordType(0) << (APInt::APINT_BITS_PER_WORD - 1);
+    APInt::WordType carry = APInt::tcDecrement(&singleWord, 1);
+    EXPECT_EQ(carry, APInt::WordType(0));
+    EXPECT_EQ(singleWord, ~APInt::WordType(0) >> 1);
   }
 
   // With out borrow.
   {
-    integerPart singleWord = 0;
-    integerPart carry = APInt::tcDecrement(&singleWord, 1);
-    EXPECT_EQ(carry, integerPart(1));
-    EXPECT_EQ(singleWord, ~integerPart(0));
+    APInt::WordType singleWord = 0;
+    APInt::WordType carry = APInt::tcDecrement(&singleWord, 1);
+    EXPECT_EQ(carry, APInt::WordType(1));
+    EXPECT_EQ(singleWord, ~APInt::WordType(0));
   }
 
   // Test multiword decrement.
 
   // No across word borrow, no out borrow.
   {
-    integerPart test[4] = {0x1, 0x1, 0x1, 0x1};
-    integerPart expected[4] = {0x0, 0x1, 0x1, 0x1};
+    APInt::WordType test[4] = {0x1, 0x1, 0x1, 0x1};
+    APInt::WordType expected[4] = {0x0, 0x1, 0x1, 0x1};
     APInt::tcDecrement(test, 4);
     EXPECT_EQ(APInt::tcCompare(test, expected, 4), 0);
   }
 
   // 1 across word borrow, no out borrow.
   {
-    integerPart test[4] = {0x0, 0xF, 0x1, 0x1};
-    integerPart expected[4] = {~integerPart(0), 0xE, 0x1, 0x1};
-    integerPart carry = APInt::tcDecrement(test, 4);
-    EXPECT_EQ(carry, integerPart(0));
+    APInt::WordType test[4] = {0x0, 0xF, 0x1, 0x1};
+    APInt::WordType expected[4] = {~APInt::WordType(0), 0xE, 0x1, 0x1};
+    APInt::WordType carry = APInt::tcDecrement(test, 4);
+    EXPECT_EQ(carry, APInt::WordType(0));
     EXPECT_EQ(APInt::tcCompare(test, expected, 4), 0);
   }
 
   // 2 across word borrow, no out borrow.
   {
-    integerPart test[4] = {0x0, 0x0, 0xC, 0x1};
-    integerPart expected[4] = {~integerPart(0), ~integerPart(0), 0xB, 0x1};
-    integerPart carry = APInt::tcDecrement(test, 4);
-    EXPECT_EQ(carry, integerPart(0));
+    APInt::WordType test[4] = {0x0, 0x0, 0xC, 0x1};
+    APInt::WordType expected[4] = {~APInt::WordType(0), ~APInt::WordType(0), 0xB, 0x1};
+    APInt::WordType carry = APInt::tcDecrement(test, 4);
+    EXPECT_EQ(carry, APInt::WordType(0));
     EXPECT_EQ(APInt::tcCompare(test, expected, 4), 0);
   }
 
   // 3 across word borrow, no out borrow.
   {
-    integerPart test[4] = {0x0, 0x0, 0x0, 0x1};
-    integerPart expected[4] = {~integerPart(0), ~integerPart(0), ~integerPart(0), 0x0};
-    integerPart carry = APInt::tcDecrement(test, 4);
-    EXPECT_EQ(carry, integerPart(0));
+    APInt::WordType test[4] = {0x0, 0x0, 0x0, 0x1};
+    APInt::WordType expected[4] = {~APInt::WordType(0), ~APInt::WordType(0), ~APInt::WordType(0), 0x0};
+    APInt::WordType carry = APInt::tcDecrement(test, 4);
+    EXPECT_EQ(carry, APInt::WordType(0));
     EXPECT_EQ(APInt::tcCompare(test, expected, 4), 0);
   }
 
   // 3 across word borrow, with out borrow.
   {
-    integerPart test[4] = {0x0, 0x0, 0x0, 0x0};
-    integerPart expected[4] = {~integerPart(0), ~integerPart(0), ~integerPart(0), ~integerPart(0)};
-    integerPart carry = APInt::tcDecrement(test, 4);
-    EXPECT_EQ(carry, integerPart(1));
+    APInt::WordType test[4] = {0x0, 0x0, 0x0, 0x0};
+    APInt::WordType expected[4] = {~APInt::WordType(0), ~APInt::WordType(0), ~APInt::WordType(0), ~APInt::WordType(0)};
+    APInt::WordType carry = APInt::tcDecrement(test, 4);
+    EXPECT_EQ(carry, APInt::WordType(1));
     EXPECT_EQ(APInt::tcCompare(test, expected, 4), 0);
   }
 }
@@ -1111,17 +1448,17 @@ TEST(APIntTest, arrayAccess) {
   }
 
   // Multiword check.
-  integerPart E2[4] = {
+  APInt::WordType E2[4] = {
     0xEB6EB136591CBA21ULL,
     0x7B9358BD6A33F10AULL,
     0x7E7FFA5EADD8846ULL,
     0x305F341CA00B613DULL
   };
-  APInt A2(integerPartWidth*4, E2);
+  APInt A2(APInt::APINT_BITS_PER_WORD*4, E2);
   for (unsigned i = 0; i < 4; ++i) {
-    for (unsigned j = 0; j < integerPartWidth; ++j) {
+    for (unsigned j = 0; j < APInt::APINT_BITS_PER_WORD; ++j) {
       EXPECT_EQ(bool(E2[i] & (1ULL << j)),
-                A2[i*integerPartWidth + j]);
+                A2[i*APInt::APINT_BITS_PER_WORD + j]);
     }
   }
 }
@@ -1155,18 +1492,18 @@ TEST(APIntTest, nearestLogBase2) {
   // Multiple word check.
 
   // Test round up.
-  integerPart I4[4] = {0x0, 0xF, 0x18, 0x0};
-  APInt A4(integerPartWidth*4, I4);
+  APInt::WordType I4[4] = {0x0, 0xF, 0x18, 0x0};
+  APInt A4(APInt::APINT_BITS_PER_WORD*4, I4);
   EXPECT_EQ(A4.nearestLogBase2(), A4.ceilLogBase2());
 
   // Test round down.
-  integerPart I5[4] = {0x0, 0xF, 0x10, 0x0};
-  APInt A5(integerPartWidth*4, I5);
+  APInt::WordType I5[4] = {0x0, 0xF, 0x10, 0x0};
+  APInt A5(APInt::APINT_BITS_PER_WORD*4, I5);
   EXPECT_EQ(A5.nearestLogBase2(), A5.logBase2());
 
   // Test ties round up.
   uint64_t I6[4] = {0x0, 0x0, 0x0, 0x18};
-  APInt A6(integerPartWidth*4, I6);
+  APInt A6(APInt::APINT_BITS_PER_WORD*4, I6);
   EXPECT_EQ(A6.nearestLogBase2(), A6.ceilLogBase2());
 
   // Test BitWidth == 1 special cases.
@@ -1222,18 +1559,44 @@ TEST(APIntTest, IsSplat) {
 }
 
 TEST(APIntTest, isMask) {
-  EXPECT_FALSE(APIntOps::isMask(APInt(32, 0x01010101)));
-  EXPECT_FALSE(APIntOps::isMask(APInt(32, 0xf0000000)));
-  EXPECT_FALSE(APIntOps::isMask(APInt(32, 0xffff0000)));
-  EXPECT_FALSE(APIntOps::isMask(APInt(32, 0xff << 1)));
+  EXPECT_FALSE(APInt(32, 0x01010101).isMask());
+  EXPECT_FALSE(APInt(32, 0xf0000000).isMask());
+  EXPECT_FALSE(APInt(32, 0xffff0000).isMask());
+  EXPECT_FALSE(APInt(32, 0xff << 1).isMask());
 
   for (int N : { 1, 2, 3, 4, 7, 8, 16, 32, 64, 127, 128, 129, 256 }) {
-    EXPECT_FALSE(APIntOps::isMask(APInt(N, 0)));
+    EXPECT_FALSE(APInt(N, 0).isMask());
 
     APInt One(N, 1);
     for (int I = 1; I <= N; ++I) {
       APInt MaskVal = One.shl(I) - 1;
-      EXPECT_TRUE(APIntOps::isMask(MaskVal));
+      EXPECT_TRUE(MaskVal.isMask());
+      EXPECT_TRUE(MaskVal.isMask(I));
+    }
+  }
+}
+
+TEST(APIntTest, isShiftedMask) {
+  EXPECT_FALSE(APInt(32, 0x01010101).isShiftedMask());
+  EXPECT_TRUE(APInt(32, 0xf0000000).isShiftedMask());
+  EXPECT_TRUE(APInt(32, 0xffff0000).isShiftedMask());
+  EXPECT_TRUE(APInt(32, 0xff << 1).isShiftedMask());
+
+  for (int N : { 1, 2, 3, 4, 7, 8, 16, 32, 64, 127, 128, 129, 256 }) {
+    EXPECT_FALSE(APInt(N, 0).isShiftedMask());
+
+    APInt One(N, 1);
+    for (int I = 1; I < N; ++I) {
+      APInt MaskVal = One.shl(I) - 1;
+      EXPECT_TRUE(MaskVal.isShiftedMask());
+    }
+    for (int I = 1; I < N - 1; ++I) {
+      APInt MaskVal = One.shl(I);
+      EXPECT_TRUE(MaskVal.isShiftedMask());
+    }
+    for (int I = 1; I < N; ++I) {
+      APInt MaskVal = APInt::getHighBitsSet(N, I);
+      EXPECT_TRUE(MaskVal.isShiftedMask());
     }
   }
 }
@@ -1309,3 +1672,308 @@ TEST(APIntTest, reverseBits) {
     }
   }
 }
+
+TEST(APIntTest, insertBits) {
+  APInt iSrc(31, 0x00123456);
+
+  // Direct copy.
+  APInt i31(31, 0x76543210ull);
+  i31.insertBits(iSrc, 0);
+  EXPECT_EQ(static_cast<int64_t>(0x00123456ull), i31.getSExtValue());
+
+  // Single word src/dst insertion.
+  APInt i63(63, 0x01234567FFFFFFFFull);
+  i63.insertBits(iSrc, 4);
+  EXPECT_EQ(static_cast<int64_t>(0x012345600123456Full), i63.getSExtValue());
+
+  // Insert single word src into one word of dst.
+  APInt i120(120, UINT64_MAX, true);
+  i120.insertBits(iSrc, 8);
+  EXPECT_EQ(static_cast<int64_t>(0xFFFFFF80123456FFull), i120.getSExtValue());
+
+  // Insert single word src into two words of dst.
+  APInt i127(127, UINT64_MAX, true);
+  i127.insertBits(iSrc, 48);
+  EXPECT_EQ(i127.extractBits(64, 0).getZExtValue(), 0x3456FFFFFFFFFFFFull);
+  EXPECT_EQ(i127.extractBits(63, 64).getZExtValue(), 0x7FFFFFFFFFFF8012ull);
+
+  // Insert on word boundaries.
+  APInt i128(128, 0);
+  i128.insertBits(APInt(64, UINT64_MAX, true), 0);
+  i128.insertBits(APInt(64, UINT64_MAX, true), 64);
+  EXPECT_EQ(-1, i128.getSExtValue());
+
+  APInt i256(256, UINT64_MAX, true);
+  i256.insertBits(APInt(65, 0), 0);
+  i256.insertBits(APInt(69, 0), 64);
+  i256.insertBits(APInt(128, 0), 128);
+  EXPECT_EQ(0u, i256.getSExtValue());
+
+  APInt i257(257, 0);
+  i257.insertBits(APInt(96, UINT64_MAX, true), 64);
+  EXPECT_EQ(i257.extractBits(64, 0).getZExtValue(), 0x0000000000000000ull);
+  EXPECT_EQ(i257.extractBits(64, 64).getZExtValue(), 0xFFFFFFFFFFFFFFFFull);
+  EXPECT_EQ(i257.extractBits(64, 128).getZExtValue(), 0x00000000FFFFFFFFull);
+  EXPECT_EQ(i257.extractBits(65, 192).getZExtValue(), 0x0000000000000000ull);
+
+  // General insertion.
+  APInt i260(260, UINT64_MAX, true);
+  i260.insertBits(APInt(129, 1ull << 48), 15);
+  EXPECT_EQ(i260.extractBits(64, 0).getZExtValue(), 0x8000000000007FFFull);
+  EXPECT_EQ(i260.extractBits(64, 64).getZExtValue(), 0x0000000000000000ull);
+  EXPECT_EQ(i260.extractBits(64, 128).getZExtValue(), 0xFFFFFFFFFFFF0000ull);
+  EXPECT_EQ(i260.extractBits(64, 192).getZExtValue(), 0xFFFFFFFFFFFFFFFFull);
+  EXPECT_EQ(i260.extractBits(4, 256).getZExtValue(), 0x000000000000000Full);
+}
+
+TEST(APIntTest, extractBits) {
+  APInt i32(32, 0x1234567);
+  EXPECT_EQ(0x3456, i32.extractBits(16, 4));
+
+  APInt i257(257, 0xFFFFFFFFFF0000FFull, true);
+  EXPECT_EQ(0xFFu, i257.extractBits(16, 0));
+  EXPECT_EQ((0xFFu >> 1), i257.extractBits(16, 1));
+  EXPECT_EQ(-1, i257.extractBits(32, 64).getSExtValue());
+  EXPECT_EQ(-1, i257.extractBits(128, 128).getSExtValue());
+  EXPECT_EQ(-1, i257.extractBits(66, 191).getSExtValue());
+  EXPECT_EQ(static_cast<int64_t>(0xFFFFFFFFFF80007Full),
+            i257.extractBits(128, 1).getSExtValue());
+  EXPECT_EQ(static_cast<int64_t>(0xFFFFFFFFFF80007Full),
+            i257.extractBits(129, 1).getSExtValue());
+}
+
+TEST(APIntTest, getLowBitsSet) {
+  APInt i128lo64 = APInt::getLowBitsSet(128, 64);
+  EXPECT_EQ(0u, i128lo64.countLeadingOnes());
+  EXPECT_EQ(64u, i128lo64.countLeadingZeros());
+  EXPECT_EQ(64u, i128lo64.getActiveBits());
+  EXPECT_EQ(0u, i128lo64.countTrailingZeros());
+  EXPECT_EQ(64u, i128lo64.countTrailingOnes());
+  EXPECT_EQ(64u, i128lo64.countPopulation());
+}
+
+TEST(APIntTest, getBitsSet) {
+  APInt i64hi1lo1 = APInt::getBitsSet(64, 63, 1);
+  EXPECT_EQ(1u, i64hi1lo1.countLeadingOnes());
+  EXPECT_EQ(0u, i64hi1lo1.countLeadingZeros());
+  EXPECT_EQ(64u, i64hi1lo1.getActiveBits());
+  EXPECT_EQ(0u, i64hi1lo1.countTrailingZeros());
+  EXPECT_EQ(1u, i64hi1lo1.countTrailingOnes());
+  EXPECT_EQ(2u, i64hi1lo1.countPopulation());
+
+  APInt i127hi1lo1 = APInt::getBitsSet(127, 126, 1);
+  EXPECT_EQ(1u, i127hi1lo1.countLeadingOnes());
+  EXPECT_EQ(0u, i127hi1lo1.countLeadingZeros());
+  EXPECT_EQ(127u, i127hi1lo1.getActiveBits());
+  EXPECT_EQ(0u, i127hi1lo1.countTrailingZeros());
+  EXPECT_EQ(1u, i127hi1lo1.countTrailingOnes());
+  EXPECT_EQ(2u, i127hi1lo1.countPopulation());
+}
+
+TEST(APIntTest, getHighBitsSet) {
+  APInt i64hi32 = APInt::getHighBitsSet(64, 32);
+  EXPECT_EQ(32u, i64hi32.countLeadingOnes());
+  EXPECT_EQ(0u, i64hi32.countLeadingZeros());
+  EXPECT_EQ(64u, i64hi32.getActiveBits());
+  EXPECT_EQ(32u, i64hi32.countTrailingZeros());
+  EXPECT_EQ(0u, i64hi32.countTrailingOnes());
+  EXPECT_EQ(32u, i64hi32.countPopulation());
+}
+
+TEST(APIntTest, getBitsSetFrom) {
+  APInt i64hi31 = APInt::getBitsSetFrom(64, 33);
+  EXPECT_EQ(31u, i64hi31.countLeadingOnes());
+  EXPECT_EQ(0u, i64hi31.countLeadingZeros());
+  EXPECT_EQ(64u, i64hi31.getActiveBits());
+  EXPECT_EQ(33u, i64hi31.countTrailingZeros());
+  EXPECT_EQ(0u, i64hi31.countTrailingOnes());
+  EXPECT_EQ(31u, i64hi31.countPopulation());
+}
+
+TEST(APIntTest, setLowBits) {
+  APInt i64lo32(64, 0);
+  i64lo32.setLowBits(32);
+  EXPECT_EQ(0u, i64lo32.countLeadingOnes());
+  EXPECT_EQ(32u, i64lo32.countLeadingZeros());
+  EXPECT_EQ(32u, i64lo32.getActiveBits());
+  EXPECT_EQ(0u, i64lo32.countTrailingZeros());
+  EXPECT_EQ(32u, i64lo32.countTrailingOnes());
+  EXPECT_EQ(32u, i64lo32.countPopulation());
+
+  APInt i128lo64(128, 0);
+  i128lo64.setLowBits(64);
+  EXPECT_EQ(0u, i128lo64.countLeadingOnes());
+  EXPECT_EQ(64u, i128lo64.countLeadingZeros());
+  EXPECT_EQ(64u, i128lo64.getActiveBits());
+  EXPECT_EQ(0u, i128lo64.countTrailingZeros());
+  EXPECT_EQ(64u, i128lo64.countTrailingOnes());
+  EXPECT_EQ(64u, i128lo64.countPopulation());
+
+  APInt i128lo24(128, 0);
+  i128lo24.setLowBits(24);
+  EXPECT_EQ(0u, i128lo24.countLeadingOnes());
+  EXPECT_EQ(104u, i128lo24.countLeadingZeros());
+  EXPECT_EQ(24u, i128lo24.getActiveBits());
+  EXPECT_EQ(0u, i128lo24.countTrailingZeros());
+  EXPECT_EQ(24u, i128lo24.countTrailingOnes());
+  EXPECT_EQ(24u, i128lo24.countPopulation());
+
+  APInt i128lo104(128, 0);
+  i128lo104.setLowBits(104);
+  EXPECT_EQ(0u, i128lo104.countLeadingOnes());
+  EXPECT_EQ(24u, i128lo104.countLeadingZeros());
+  EXPECT_EQ(104u, i128lo104.getActiveBits());
+  EXPECT_EQ(0u, i128lo104.countTrailingZeros());
+  EXPECT_EQ(104u, i128lo104.countTrailingOnes());
+  EXPECT_EQ(104u, i128lo104.countPopulation());
+
+  APInt i128lo0(128, 0);
+  i128lo0.setLowBits(0);
+  EXPECT_EQ(0u, i128lo0.countLeadingOnes());
+  EXPECT_EQ(128u, i128lo0.countLeadingZeros());
+  EXPECT_EQ(0u, i128lo0.getActiveBits());
+  EXPECT_EQ(128u, i128lo0.countTrailingZeros());
+  EXPECT_EQ(0u, i128lo0.countTrailingOnes());
+  EXPECT_EQ(0u, i128lo0.countPopulation());
+
+  APInt i80lo79(80, 0);
+  i80lo79.setLowBits(79);
+  EXPECT_EQ(0u, i80lo79.countLeadingOnes());
+  EXPECT_EQ(1u, i80lo79.countLeadingZeros());
+  EXPECT_EQ(79u, i80lo79.getActiveBits());
+  EXPECT_EQ(0u, i80lo79.countTrailingZeros());
+  EXPECT_EQ(79u, i80lo79.countTrailingOnes());
+  EXPECT_EQ(79u, i80lo79.countPopulation());
+}
+
+TEST(APIntTest, setHighBits) {
+  APInt i64hi32(64, 0);
+  i64hi32.setHighBits(32);
+  EXPECT_EQ(32u, i64hi32.countLeadingOnes());
+  EXPECT_EQ(0u, i64hi32.countLeadingZeros());
+  EXPECT_EQ(64u, i64hi32.getActiveBits());
+  EXPECT_EQ(32u, i64hi32.countTrailingZeros());
+  EXPECT_EQ(0u, i64hi32.countTrailingOnes());
+  EXPECT_EQ(32u, i64hi32.countPopulation());
+
+  APInt i128hi64(128, 0);
+  i128hi64.setHighBits(64);
+  EXPECT_EQ(64u, i128hi64.countLeadingOnes());
+  EXPECT_EQ(0u, i128hi64.countLeadingZeros());
+  EXPECT_EQ(128u, i128hi64.getActiveBits());
+  EXPECT_EQ(64u, i128hi64.countTrailingZeros());
+  EXPECT_EQ(0u, i128hi64.countTrailingOnes());
+  EXPECT_EQ(64u, i128hi64.countPopulation());
+
+  APInt i128hi24(128, 0);
+  i128hi24.setHighBits(24);
+  EXPECT_EQ(24u, i128hi24.countLeadingOnes());
+  EXPECT_EQ(0u, i128hi24.countLeadingZeros());
+  EXPECT_EQ(128u, i128hi24.getActiveBits());
+  EXPECT_EQ(104u, i128hi24.countTrailingZeros());
+  EXPECT_EQ(0u, i128hi24.countTrailingOnes());
+  EXPECT_EQ(24u, i128hi24.countPopulation());
+
+  APInt i128hi104(128, 0);
+  i128hi104.setHighBits(104);
+  EXPECT_EQ(104u, i128hi104.countLeadingOnes());
+  EXPECT_EQ(0u, i128hi104.countLeadingZeros());
+  EXPECT_EQ(128u, i128hi104.getActiveBits());
+  EXPECT_EQ(24u, i128hi104.countTrailingZeros());
+  EXPECT_EQ(0u, i128hi104.countTrailingOnes());
+  EXPECT_EQ(104u, i128hi104.countPopulation());
+
+  APInt i128hi0(128, 0);
+  i128hi0.setHighBits(0);
+  EXPECT_EQ(0u, i128hi0.countLeadingOnes());
+  EXPECT_EQ(128u, i128hi0.countLeadingZeros());
+  EXPECT_EQ(0u, i128hi0.getActiveBits());
+  EXPECT_EQ(128u, i128hi0.countTrailingZeros());
+  EXPECT_EQ(0u, i128hi0.countTrailingOnes());
+  EXPECT_EQ(0u, i128hi0.countPopulation());
+
+  APInt i80hi1(80, 0);
+  i80hi1.setHighBits(1);
+  EXPECT_EQ(1u, i80hi1.countLeadingOnes());
+  EXPECT_EQ(0u, i80hi1.countLeadingZeros());
+  EXPECT_EQ(80u, i80hi1.getActiveBits());
+  EXPECT_EQ(79u, i80hi1.countTrailingZeros());
+  EXPECT_EQ(0u, i80hi1.countTrailingOnes());
+  EXPECT_EQ(1u, i80hi1.countPopulation());
+
+  APInt i32hi16(32, 0);
+  i32hi16.setHighBits(16);
+  EXPECT_EQ(16u, i32hi16.countLeadingOnes());
+  EXPECT_EQ(0u, i32hi16.countLeadingZeros());
+  EXPECT_EQ(32u, i32hi16.getActiveBits());
+  EXPECT_EQ(16u, i32hi16.countTrailingZeros());
+  EXPECT_EQ(0u, i32hi16.countTrailingOnes());
+  EXPECT_EQ(16u, i32hi16.countPopulation());
+}
+
+TEST(APIntTest, setBitsFrom) {
+  APInt i64from63(64, 0);
+  i64from63.setBitsFrom(63);
+  EXPECT_EQ(1u, i64from63.countLeadingOnes());
+  EXPECT_EQ(0u, i64from63.countLeadingZeros());
+  EXPECT_EQ(64u, i64from63.getActiveBits());
+  EXPECT_EQ(63u, i64from63.countTrailingZeros());
+  EXPECT_EQ(0u, i64from63.countTrailingOnes());
+  EXPECT_EQ(1u, i64from63.countPopulation());
+}
+
+TEST(APIntTest, setAllBits) {
+  APInt i32(32, 0);
+  i32.setAllBits();
+  EXPECT_EQ(32u, i32.countLeadingOnes());
+  EXPECT_EQ(0u, i32.countLeadingZeros());
+  EXPECT_EQ(32u, i32.getActiveBits());
+  EXPECT_EQ(0u, i32.countTrailingZeros());
+  EXPECT_EQ(32u, i32.countTrailingOnes());
+  EXPECT_EQ(32u, i32.countPopulation());
+
+  APInt i64(64, 0);
+  i64.setAllBits();
+  EXPECT_EQ(64u, i64.countLeadingOnes());
+  EXPECT_EQ(0u, i64.countLeadingZeros());
+  EXPECT_EQ(64u, i64.getActiveBits());
+  EXPECT_EQ(0u, i64.countTrailingZeros());
+  EXPECT_EQ(64u, i64.countTrailingOnes());
+  EXPECT_EQ(64u, i64.countPopulation());
+
+  APInt i96(96, 0);
+  i96.setAllBits();
+  EXPECT_EQ(96u, i96.countLeadingOnes());
+  EXPECT_EQ(0u, i96.countLeadingZeros());
+  EXPECT_EQ(96u, i96.getActiveBits());
+  EXPECT_EQ(0u, i96.countTrailingZeros());
+  EXPECT_EQ(96u, i96.countTrailingOnes());
+  EXPECT_EQ(96u, i96.countPopulation());
+
+  APInt i128(128, 0);
+  i128.setAllBits();
+  EXPECT_EQ(128u, i128.countLeadingOnes());
+  EXPECT_EQ(0u, i128.countLeadingZeros());
+  EXPECT_EQ(128u, i128.getActiveBits());
+  EXPECT_EQ(0u, i128.countTrailingZeros());
+  EXPECT_EQ(128u, i128.countTrailingOnes());
+  EXPECT_EQ(128u, i128.countPopulation());
+}
+
+TEST(APIntTest, getLoBits) {
+  APInt i32(32, 0xfa);
+  i32.setHighBits(1);
+  EXPECT_EQ(0xa, i32.getLoBits(4));
+  APInt i128(128, 0xfa);
+  i128.setHighBits(1);
+  EXPECT_EQ(0xa, i128.getLoBits(4));
+}
+
+TEST(APIntTest, getHiBits) {
+  APInt i32(32, 0xfa);
+  i32.setHighBits(2);
+  EXPECT_EQ(0xc, i32.getHiBits(4));
+  APInt i128(128, 0xfa);
+  i128.setHighBits(2);
+  EXPECT_EQ(0xc, i128.getHiBits(4));
+}
diff --git a/unittests/ADT/BitVectorTest.cpp b/unittests/ADT/BitVectorTest.cpp
index 76e796be9eb92d2f337aed5f28e62a2aa28485ad..98ef66735ad2329904dc67afdd89b87cd246009d 100644
--- a/unittests/ADT/BitVectorTest.cpp
+++ b/unittests/ADT/BitVectorTest.cpp
@@ -182,6 +182,45 @@ TYPED_TEST(BitVectorTest, TrivialOperation) {
   EXPECT_TRUE(Vec.empty());
 }
 
+TYPED_TEST(BitVectorTest, FindOperations) {
+  // Test finding in an empty BitVector.
+  TypeParam A;
+  EXPECT_EQ(-1, A.find_first());
+  EXPECT_EQ(-1, A.find_first_unset());
+  EXPECT_EQ(-1, A.find_next(0));
+  EXPECT_EQ(-1, A.find_next_unset(0));
+
+  // Test finding next set and unset bits in a BitVector with multiple words
+  A.resize(100);
+  A.set(12);
+  A.set(13);
+  A.set(75);
+
+  EXPECT_EQ(12, A.find_first());
+  EXPECT_EQ(13, A.find_next(12));
+  EXPECT_EQ(75, A.find_next(13));
+  EXPECT_EQ(-1, A.find_next(75));
+
+  EXPECT_EQ(0, A.find_first_unset());
+  EXPECT_EQ(14, A.find_next_unset(11));
+  EXPECT_EQ(14, A.find_next_unset(12));
+  EXPECT_EQ(14, A.find_next_unset(13));
+  EXPECT_EQ(16, A.find_next_unset(15));
+  EXPECT_EQ(76, A.find_next_unset(74));
+  EXPECT_EQ(76, A.find_next_unset(75));
+  EXPECT_EQ(-1, A.find_next_unset(99));
+
+  A.set(0, 100);
+  EXPECT_EQ(100U, A.count());
+  EXPECT_EQ(0, A.find_first());
+  EXPECT_EQ(-1, A.find_first_unset());
+
+  A.reset(0, 100);
+  EXPECT_EQ(0U, A.count());
+  EXPECT_EQ(-1, A.find_first());
+  EXPECT_EQ(0, A.find_first_unset());
+}
+
 TYPED_TEST(BitVectorTest, CompoundAssignment) {
   TypeParam A;
   A.resize(10);
diff --git a/unittests/ADT/BreadthFirstIteratorTest.cpp b/unittests/ADT/BreadthFirstIteratorTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..42a07bbe930ba380170a613fc3cd6e0a4573bcf2
--- /dev/null
+++ b/unittests/ADT/BreadthFirstIteratorTest.cpp
@@ -0,0 +1,74 @@
+//=== llvm/unittest/ADT/BreadthFirstIteratorTest.cpp - BFS iterator tests -===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/BreadthFirstIterator.h"
+#include "TestGraph.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+namespace llvm {
+
+TEST(BreadthFristIteratorTest, Basic) {
+  typedef bf_iterator<Graph<4>> BFIter;
+
+  Graph<4> G;
+  G.AddEdge(0, 1);
+  G.AddEdge(0, 2);
+  G.AddEdge(1, 3);
+
+  auto It = BFIter::begin(G);
+  auto End = BFIter::end(G);
+  EXPECT_EQ(It.getLevel(), 0U);
+  EXPECT_EQ(*It, G.AccessNode(0));
+  ++It;
+  EXPECT_EQ(It.getLevel(), 1U);
+  EXPECT_EQ(*It, G.AccessNode(1));
+  ++It;
+  EXPECT_EQ(It.getLevel(), 1U);
+  EXPECT_EQ(*It, G.AccessNode(2));
+  ++It;
+  EXPECT_EQ(It.getLevel(), 2U);
+  EXPECT_EQ(*It, G.AccessNode(3));
+  ++It;
+  EXPECT_EQ(It, End);
+}
+
+TEST(BreadthFristIteratorTest, Cycle) {
+  typedef bf_iterator<Graph<4>> BFIter;
+
+  Graph<4> G;
+  G.AddEdge(0, 1);
+  G.AddEdge(1, 0);
+  G.AddEdge(1, 2);
+  G.AddEdge(2, 1);
+  G.AddEdge(2, 1);
+  G.AddEdge(2, 3);
+  G.AddEdge(3, 2);
+  G.AddEdge(3, 1);
+  G.AddEdge(3, 0);
+
+  auto It = BFIter::begin(G);
+  auto End = BFIter::end(G);
+  EXPECT_EQ(It.getLevel(), 0U);
+  EXPECT_EQ(*It, G.AccessNode(0));
+  ++It;
+  EXPECT_EQ(It.getLevel(), 1U);
+  EXPECT_EQ(*It, G.AccessNode(1));
+  ++It;
+  EXPECT_EQ(It.getLevel(), 2U);
+  EXPECT_EQ(*It, G.AccessNode(2));
+  ++It;
+  EXPECT_EQ(It.getLevel(), 3U);
+  EXPECT_EQ(*It, G.AccessNode(3));
+  ++It;
+  EXPECT_EQ(It, End);
+}
+
+} // end namespace llvm
diff --git a/unittests/ADT/CMakeLists.txt b/unittests/ADT/CMakeLists.txt
index 738f6efe92d63e123dbc7b679812179bdbcb2098..fa977ac5d3f5fb1f5796fee1b1a1afb0b66bc758 100644
--- a/unittests/ADT/CMakeLists.txt
+++ b/unittests/ADT/CMakeLists.txt
@@ -9,6 +9,7 @@ set(ADTSources
   ArrayRefTest.cpp
   BitmaskEnumTest.cpp
   BitVectorTest.cpp
+  BreadthFirstIteratorTest.cpp
   BumpPtrListTest.cpp
   DAGDeltaAlgorithmTest.cpp
   DeltaAlgorithmTest.cpp
diff --git a/unittests/ADT/DenseMapTest.cpp b/unittests/ADT/DenseMapTest.cpp
index 80f0462bc8fb605b2b667e4b8603037179792896..273f4da021c4aad9db5827340a294c5384395215 100644
--- a/unittests/ADT/DenseMapTest.cpp
+++ b/unittests/ADT/DenseMapTest.cpp
@@ -580,4 +580,18 @@ TEST(DenseMapCustomTest, TryEmplaceTest) {
   EXPECT_EQ(Try1.first, Try2.first);
   EXPECT_NE(nullptr, P);
 }
+
+TEST(DenseMapCustomTest, ConstTest) {
+  // Test that const pointers work okay for count and find, even when the
+  // map's key type is a pointer to non-const.
+  DenseMap<int *, int> Map;
+  int A;
+  int *B = &A;
+  const int *C = &A;
+  Map.insert({B, 0});
+  EXPECT_EQ(Map.count(B), 1u);
+  EXPECT_EQ(Map.count(C), 1u);
+  EXPECT_NE(Map.find(B), Map.end());
+  EXPECT_NE(Map.find(C), Map.end());
+}
 }
diff --git a/unittests/ADT/DenseSetTest.cpp b/unittests/ADT/DenseSetTest.cpp
index 19feb415a9dd564f0c425a6ac3db18a731a4d9a6..a09537a3e99046d97ec35b2391c6f49f27beef92 100644
--- a/unittests/ADT/DenseSetTest.cpp
+++ b/unittests/ADT/DenseSetTest.cpp
@@ -185,4 +185,17 @@ TEST(DenseSetCustomTest, ReserveTest) {
     EXPECT_EQ(0, CountCopyAndMove::Copy);
   }
 }
+TEST(DenseSetCustomTest, ConstTest) {
+  // Test that const pointers work okay for count and find, even when the
+  // set's element type is a pointer to non-const.
+  DenseSet<int *> Map;
+  int A;
+  int *B = &A;
+  const int *C = &A;
+  Map.insert(B);
+  EXPECT_EQ(Map.count(B), 1u);
+  EXPECT_EQ(Map.count(C), 1u);
+  EXPECT_NE(Map.find(B), Map.end());
+  EXPECT_NE(Map.find(C), Map.end());
+}
 }
diff --git a/unittests/ADT/IListIteratorTest.cpp b/unittests/ADT/IListIteratorTest.cpp
index ddcab781b9badffbd7451a630a9637ccdb2d9d7b..8b2aa62f1f8b7d7e34c6d90aa03d0397761862c5 100644
--- a/unittests/ADT/IListIteratorTest.cpp
+++ b/unittests/ADT/IListIteratorTest.cpp
@@ -131,4 +131,44 @@ TEST(IListIteratorTest, CheckEraseReverse) {
   EXPECT_EQ(L.rend(), RI);
 }
 
+TEST(IListIteratorTest, ReverseConstructor) {
+  simple_ilist<Node> L;
+  const simple_ilist<Node> &CL = L;
+  Node A, B;
+  L.insert(L.end(), A);
+  L.insert(L.end(), B);
+
+  // Save typing.
+  typedef simple_ilist<Node>::iterator iterator;
+  typedef simple_ilist<Node>::reverse_iterator reverse_iterator;
+  typedef simple_ilist<Node>::const_iterator const_iterator;
+  typedef simple_ilist<Node>::const_reverse_iterator const_reverse_iterator;
+
+  // Check conversion values.
+  EXPECT_EQ(L.begin(), iterator(L.rend()));
+  EXPECT_EQ(++L.begin(), iterator(++L.rbegin()));
+  EXPECT_EQ(L.end(), iterator(L.rbegin()));
+  EXPECT_EQ(L.rbegin(), reverse_iterator(L.end()));
+  EXPECT_EQ(++L.rbegin(), reverse_iterator(++L.begin()));
+  EXPECT_EQ(L.rend(), reverse_iterator(L.begin()));
+
+  // Check const iterator constructors.
+  EXPECT_EQ(CL.begin(), const_iterator(L.rend()));
+  EXPECT_EQ(CL.begin(), const_iterator(CL.rend()));
+  EXPECT_EQ(CL.rbegin(), const_reverse_iterator(L.end()));
+  EXPECT_EQ(CL.rbegin(), const_reverse_iterator(CL.end()));
+
+  // Confirm lack of implicit conversions.
+  static_assert(!std::is_convertible<iterator, reverse_iterator>::value,
+                "unexpected implicit conversion");
+  static_assert(!std::is_convertible<reverse_iterator, iterator>::value,
+                "unexpected implicit conversion");
+  static_assert(
+      !std::is_convertible<const_iterator, const_reverse_iterator>::value,
+      "unexpected implicit conversion");
+  static_assert(
+      !std::is_convertible<const_reverse_iterator, const_iterator>::value,
+      "unexpected implicit conversion");
+}
+
 } // end namespace
diff --git a/unittests/ADT/IteratorTest.cpp b/unittests/ADT/IteratorTest.cpp
index a8d5b33a0b4975f3567ffd81d5db1ebfd1f074ab..7f261824b499c9464d9c2ac618a3b12406280fe7 100644
--- a/unittests/ADT/IteratorTest.cpp
+++ b/unittests/ADT/IteratorTest.cpp
@@ -7,9 +7,9 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ADT/iterator.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/iterator.h"
 #include "gtest/gtest.h"
 
 using namespace llvm;
@@ -35,14 +35,15 @@ static_assert(std::is_same<typename AdaptedIter::reference, Shadow<3>>::value,
               "");
 
 TEST(PointeeIteratorTest, Basic) {
-  int arr[4] = { 1, 2, 3, 4 };
+  int arr[4] = {1, 2, 3, 4};
   SmallVector<int *, 4> V;
   V.push_back(&arr[0]);
   V.push_back(&arr[1]);
   V.push_back(&arr[2]);
   V.push_back(&arr[3]);
 
-  typedef pointee_iterator<SmallVectorImpl<int *>::const_iterator> test_iterator;
+  typedef pointee_iterator<SmallVectorImpl<int *>::const_iterator>
+      test_iterator;
 
   test_iterator Begin, End;
   Begin = V.begin();
@@ -83,7 +84,8 @@ TEST(PointeeIteratorTest, SmartPointer) {
   V.push_back(make_unique<int>(4));
 
   typedef pointee_iterator<
-      SmallVectorImpl<std::unique_ptr<int>>::const_iterator> test_iterator;
+      SmallVectorImpl<std::unique_ptr<int>>::const_iterator>
+      test_iterator;
 
   test_iterator Begin, End;
   Begin = V.begin();
@@ -116,6 +118,15 @@ TEST(PointeeIteratorTest, SmartPointer) {
   EXPECT_EQ(End, I);
 }
 
+TEST(PointeeIteratorTest, Range) {
+  int A[] = {1, 2, 3, 4};
+  SmallVector<int *, 4> V{&A[0], &A[1], &A[2], &A[3]};
+
+  int I = 0;
+  for (int II : make_pointee_range(V))
+    EXPECT_EQ(A[I++], II);
+}
+
 TEST(FilterIteratorTest, Lambda) {
   auto IsOdd = [](int N) { return N % 2 == 1; };
   int A[] = {0, 1, 2, 3, 4, 5, 6};
@@ -209,6 +220,13 @@ TEST(PointerIterator, Const) {
   EXPECT_EQ(A + 4, std::next(*Begin, 4));
 }
 
+TEST(PointerIterator, Range) {
+  int A[] = {1, 2, 3, 4};
+  int *Expected = A; // address the next yielded pointer has to match
+  for (int *P : make_pointer_range(A))
+    EXPECT_EQ(Expected++, P);
+}
+
 TEST(ZipIteratorTest, Basic) {
   using namespace std;
   const SmallVector<unsigned, 6> pi{3, 1, 4, 1, 5, 9};
@@ -272,4 +290,51 @@ TEST(ZipIteratorTest, ZipFirstMutability) {
   }
 }
 
+TEST(ZipIteratorTest, Filter) {
+  using namespace std;
+  vector<unsigned> digits{3, 1, 4, 1, 5, 9};
+
+  // digits has 6 entries while the flag vector zipped against it has 7;
+  // zip_first is presumably bounded by its first range (digits).
+  auto pairs = zip_first(digits, vector<bool>{1, 1, 0, 1, 1, 1, 0});
+  auto flagged = [](decltype(pairs)::value_type p) { return get<1>(p); };
+  unsigned visited = 0;
+  for (auto elt : make_filter_range(pairs, flagged)) {
+    EXPECT_EQ(get<0>(elt) & 0x01, get<1>(elt));
+    ++get<0>(elt);
+    ++visited;
+  }
+
+  // Only the entry paired with a false flag (digits[2]) is filtered out.
+  EXPECT_EQ(visited, 5u);
+  // Each visited digit was odd and was bumped in place, so all are even now.
+  EXPECT_TRUE(all_of(digits, [](unsigned n) { return (n & 0x01) == 0; }));
+}
+
+TEST(ZipIteratorTest, Reverse) {
+  using namespace std;
+  vector<unsigned> ascending{0, 1, 2, 3, 4, 5};
+
+  auto pairs = zip_first(ascending, vector<bool>{0, 1, 0, 1, 0, 1});
+  // Start above every element so the very first comparison also holds.
+  unsigned prev = 6;
+  for (auto elt : reverse(pairs)) {
+    EXPECT_LT(get<0>(elt), prev); // strictly decreasing => reversed order
+    prev = get<0>(elt);
+    EXPECT_EQ(get<0>(elt) & 0x01, get<1>(elt));
+  }
+
+  auto isFlagged = [](decltype(pairs)::value_type p) { return get<1>(p); };
+  prev = 6;
+  for (auto elt : make_filter_range(reverse(pairs), isFlagged)) {
+    EXPECT_LT(get<0>(elt), prev);
+    prev = get<0>(elt);
+    EXPECT_TRUE(get<0>(elt) & 0x01);
+    ++get<0>(elt); // the check below expects this to land in `ascending`
+  }
+
+  // The odd entries were incremented in place, so only even values remain.
+  EXPECT_TRUE(all_of(ascending, [](unsigned n) { return (n & 0x01) == 0; }));
+}
+
 } // anonymous namespace
diff --git a/unittests/ADT/STLExtrasTest.cpp b/unittests/ADT/STLExtrasTest.cpp
index f17d24f36b2385a5ca7eea5fff25cbeb0cc0e9b8..2e6eb6f413f6cb05cd2944bca69665af27ee5699 100644
--- a/unittests/ADT/STLExtrasTest.cpp
+++ b/unittests/ADT/STLExtrasTest.cpp
@@ -48,7 +48,7 @@ TEST(STLExtrasTest, EnumerateLValue) {
   std::vector<CharPairType> CharResults;
 
   for (auto X : llvm::enumerate(foo)) {
-    CharResults.emplace_back(X.Index, X.Value);
+    CharResults.emplace_back(X.index(), X.value());
   }
   ASSERT_EQ(3u, CharResults.size());
   EXPECT_EQ(CharPairType(0u, 'a'), CharResults[0]);
@@ -60,7 +60,7 @@ TEST(STLExtrasTest, EnumerateLValue) {
   std::vector<IntPairType> IntResults;
   const std::vector<int> bar = {1, 2, 3};
   for (auto X : llvm::enumerate(bar)) {
-    IntResults.emplace_back(X.Index, X.Value);
+    IntResults.emplace_back(X.index(), X.value());
   }
   ASSERT_EQ(3u, IntResults.size());
   EXPECT_EQ(IntPairType(0u, 1), IntResults[0]);
@@ -69,9 +69,9 @@ TEST(STLExtrasTest, EnumerateLValue) {
 
   // Test an empty range.
   IntResults.clear();
-  const std::vector<int> baz;
+  const std::vector<int> baz{};
   for (auto X : llvm::enumerate(baz)) {
-    IntResults.emplace_back(X.Index, X.Value);
+    IntResults.emplace_back(X.index(), X.value());
   }
   EXPECT_TRUE(IntResults.empty());
 }
@@ -82,7 +82,7 @@ TEST(STLExtrasTest, EnumerateModifyLValue) {
   std::vector<char> foo = {'a', 'b', 'c'};
 
   for (auto X : llvm::enumerate(foo)) {
-    ++X.Value;
+    ++X.value();
   }
   EXPECT_EQ('b', foo[0]);
   EXPECT_EQ('c', foo[1]);
@@ -97,7 +97,7 @@ TEST(STLExtrasTest, EnumerateRValueRef) {
   auto Enumerator = llvm::enumerate(std::vector<int>{1, 2, 3});
 
   for (auto X : llvm::enumerate(std::vector<int>{1, 2, 3})) {
-    Results.emplace_back(X.Index, X.Value);
+    Results.emplace_back(X.index(), X.value());
   }
 
   ASSERT_EQ(3u, Results.size());
@@ -114,8 +114,8 @@ TEST(STLExtrasTest, EnumerateModifyRValue) {
   std::vector<PairType> Results;
 
   for (auto X : llvm::enumerate(std::vector<char>{'1', '2', '3'})) {
-    ++X.Value;
-    Results.emplace_back(X.Index, X.Value);
+    ++X.value();
+    Results.emplace_back(X.index(), X.value());
   }
 
   ASSERT_EQ(3u, Results.size());
@@ -255,6 +255,16 @@ TEST(STLExtrasTest, CountAdaptor) {
   EXPECT_EQ(1, count(v, 4));
 }
 
+TEST(STLExtrasTest, ToVector) {
+  std::vector<char> Chars = {'a', 'b', 'c'};
+  auto Items = to_vector<4>(enumerate(Chars)); // materialize the enumeration
+  ASSERT_EQ(3u, Items.size()); // stop here rather than index out of range
+  for (size_t Pos = 0, End = Chars.size(); Pos != End; ++Pos) {
+    EXPECT_EQ(Pos, Items[Pos].index());
+    EXPECT_EQ(Chars[Pos], Items[Pos].value());
+  }
+}
+
 TEST(STLExtrasTest, ConcatRange) {
   std::vector<int> Expected = {1, 2, 3, 4, 5, 6, 7, 8};
   std::vector<int> Test;
diff --git a/unittests/ADT/SmallPtrSetTest.cpp b/unittests/ADT/SmallPtrSetTest.cpp
index d4d963fdc5bdc18ef7b5b8945ff84367e5736614..fc14c684d67f387ee910b5f6dd358ea2358e4f8a 100644
--- a/unittests/ADT/SmallPtrSetTest.cpp
+++ b/unittests/ADT/SmallPtrSetTest.cpp
@@ -12,7 +12,9 @@
 //===----------------------------------------------------------------------===//
 
 #include "gtest/gtest.h"
+#include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/Support/PointerLikeTypeTraits.h"
 
 using namespace llvm;
 
@@ -279,3 +281,52 @@ TEST(SmallPtrSetTest, EraseTest) {
   SmallPtrSet<int *, 2> A;
   checkEraseAndIterators(A);
 }
+
+// Check that set iterators can be dereferenced and advanced up to end().
+TEST(SmallPtrSetTest, dereferenceAndIterate) {
+  int Ints[] = {0, 1, 2, 3, 4, 5, 6, 7};
+  SmallPtrSet<const int *, 4> Ptrs;
+  for (int &Elt : Ints) {
+    EXPECT_EQ(&Elt, *Ptrs.insert(&Elt).first);
+    EXPECT_EQ(&Elt, *Ptrs.find(&Elt));
+  }
+
+  // For each element, walk from its position to end() and tally every visit.
+  int Visits[sizeof(Ints)/sizeof(int)] = {0};
+  for (int &Elt : Ints)
+    for (auto It = Ptrs.find(&Elt); It != Ptrs.end(); ++It)
+      ++Visits[*It - Ints];
+
+  // After sorting, the tallies must read 1, 2, ..., N: the first element in
+  // iteration order is reached once and the last one from every start.
+  std::sort(std::begin(Visits), std::end(Visits));
+  for (int *Tally = std::begin(Visits); Tally != std::end(Visits); ++Tally)
+    EXPECT_EQ(Tally - Visits + 1, *Tally);
+}
+
+// Check that count() and find() accept a pointer-to-const key even though the
+// set's element type is a pointer to non-const.
+TEST(SmallPtrSetTest, ConstTest) {
+  SmallPtrSet<int *, 8> IntSet;
+  int Value;
+  int *Mutable = &Value;
+  const int *ReadOnly = Mutable;
+  IntSet.insert(Mutable);
+  EXPECT_EQ(IntSet.count(Mutable), 1u);
+  EXPECT_EQ(IntSet.count(ReadOnly), 1u);
+  EXPECT_NE(IntSet.find(Mutable), IntSet.end());
+  EXPECT_NE(IntSet.find(ReadOnly), IntSet.end());
+}
+
+// Check that the const flavor of PointerLikeTypeTraits is supplied
+// automatically, even when the element is not a raw pointer type.
+using TestPair = PointerIntPair<int *, 1>;
+
+TEST(SmallPtrSetTest, ConstNonPtrTest) {
+  SmallPtrSet<TestPair, 8> IntSet;
+  int Storage[1];
+  TestPair Elem(Storage, 1); // the array decays to a pointer to its element
+  IntSet.insert(Elem);
+  EXPECT_EQ(IntSet.count(Elem), 1u);
+  EXPECT_NE(IntSet.find(Elem), IntSet.end());
+}
diff --git a/unittests/ADT/StringMapTest.cpp b/unittests/ADT/StringMapTest.cpp
index 911c72d7496192833894c7b942d221fe16a36880..b5c63695ff35cae50bdc1a6a5588e821532d44f9 100644
--- a/unittests/ADT/StringMapTest.cpp
+++ b/unittests/ADT/StringMapTest.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringSet.h"
 #include "llvm/ADT/Twine.h"
 #include "llvm/Support/DataTypes.h"
 #include "gtest/gtest.h"
@@ -269,6 +270,34 @@ TEST_F(StringMapTest, InsertRehashingPairTest) {
   EXPECT_EQ(42u, It->second);
 }
 
+TEST_F(StringMapTest, IterMapKeys) {
+  StringMap<int> Counts;
+  Counts["A"] = 1;
+  Counts["B"] = 2;
+  Counts["C"] = 3;
+  Counts["D"] = 3; // the stored values are never inspected, only the keys
+
+  // Sort so the comparison does not depend on the map's iteration order.
+  auto Sorted = to_vector<4>(Counts.keys());
+  std::sort(Sorted.begin(), Sorted.end());
+  SmallVector<StringRef, 4> Expected = {"A", "B", "C", "D"};
+  EXPECT_EQ(Expected, Sorted);
+}
+
+TEST_F(StringMapTest, IterSetKeys) {
+  StringSet<> Letters;
+  for (const char *Key : {"A", "B", "C", "D"})
+    Letters.insert(Key);
+
+  // Collect the keys and sort them so the comparison below does not depend
+  // on the set's iteration order.
+  auto Sorted = to_vector<4>(Letters.keys());
+  std::sort(Sorted.begin(), Sorted.end());
+
+  SmallVector<StringRef, 4> Expected = {"A", "B", "C", "D"};
+  EXPECT_EQ(Expected, Sorted);
+}
+
 // Create a non-default constructable value
 struct StringMapTestStruct {
   StringMapTestStruct(int i) : i(i) {}
@@ -425,7 +454,7 @@ TEST(StringMapCustomTest, InitialSizeTest) {
       Map.insert(std::pair<std::string, CountCtorCopyAndMove>(
           std::piecewise_construct, std::forward_as_tuple(Twine(i).str()),
           std::forward_as_tuple(i)));
-    // After the inital move, the map will move the Elts in the Entry.
+    // After the initial move, the map will move the Elts in the Entry.
     EXPECT_EQ((unsigned)Size * 2, CountCtorCopyAndMove::Move);
     // We copy once the pair from the Elts vector
     EXPECT_EQ(0u, CountCtorCopyAndMove::Copy);
diff --git a/unittests/ADT/StringRefTest.cpp b/unittests/ADT/StringRefTest.cpp
index 5b6822ed757df736d92c421db934f516e761c8b7..bd9387837df40db42d2df27ba7d02ae787706d61 100644
--- a/unittests/ADT/StringRefTest.cpp
+++ b/unittests/ADT/StringRefTest.cpp
@@ -852,6 +852,27 @@ TEST(StringRefTest, consumeIntegerSigned) {
   }
 }
 
+struct GetDoubleStrings {
+  const char *Str;   // text wrapped in a StringRef and handed to getAsDouble
+  bool AllowInexact; // passed through as getAsDouble's second argument
+  bool ShouldFail;   // value getAsDouble is expected to return
+  double D;          // expected result; only compared when !ShouldFail
+} DoubleStrings[] = {{"0", false, false, 0.0},
+                     {"0.0", false, false, 0.0},
+                     {"-0.0", false, false, -0.0},
+                     {"123.45", false, true, 123.45},
+                     {"123.45", true, false, 123.45}};
+
+TEST(StringRefTest, getAsDouble) {
+  for (const GetDoubleStrings &Case : DoubleStrings) {
+    double Parsed;
+    bool Failed = StringRef(Case.Str).getAsDouble(Parsed, Case.AllowInexact);
+    EXPECT_EQ(Case.ShouldFail, Failed);
+    if (!Case.ShouldFail) // Parsed is only compared for rows meant to succeed
+      EXPECT_EQ(Parsed, Case.D);
+  }
+}
+
 static const char *join_input[] = { "a", "b", "c" };
 static const char join_result1[] = "a";
 static const char join_result2[] = "a:b:c";
@@ -878,6 +899,8 @@ TEST(StringRefTest, joinStrings) {
   EXPECT_TRUE(v2_join2);
   bool v2_join3 = join(v2.begin(), v2.end(), "::") == join_result3;
   EXPECT_TRUE(v2_join3);
+  v2_join3 = join(v2, "::") == join_result3;
+  EXPECT_TRUE(v2_join3);
 }
 
 
diff --git a/unittests/ADT/TinyPtrVectorTest.cpp b/unittests/ADT/TinyPtrVectorTest.cpp
index 26189b76394fc9460023d38d1cc828d761a5755c..8d5fa4060913ba3081b6871b8265a25593b29807 100644
--- a/unittests/ADT/TinyPtrVectorTest.cpp
+++ b/unittests/ADT/TinyPtrVectorTest.cpp
@@ -17,19 +17,13 @@
 #include "llvm/Support/type_traits.h"
 #include "gtest/gtest.h"
 #include <algorithm>
+#include <random>
 #include <vector>
 
 using namespace llvm;
 
 namespace {
 
-// The world's worst RNG, but it is deterministic and makes it easy to get
-// *some* shuffling of elements.
-static ptrdiff_t test_shuffle_rng(ptrdiff_t i) {
-  return (i + i * 33) % i;
-}
-static ptrdiff_t (*test_shuffle_rng_p)(ptrdiff_t) = &test_shuffle_rng;
-
 template <typename VectorT>
 class TinyPtrVectorTest : public testing::Test {
 protected:
@@ -46,7 +40,7 @@ protected:
     for (size_t i = 0, e = array_lengthof(TestValues); i != e; ++i)
       TestPtrs.push_back(&TestValues[i]);
 
-    std::random_shuffle(TestPtrs.begin(), TestPtrs.end(), test_shuffle_rng_p);
+    std::shuffle(TestPtrs.begin(), TestPtrs.end(), std::mt19937{});
   }
 
   ArrayRef<PtrT> testArray(size_t N) {
diff --git a/unittests/ADT/TripleTest.cpp b/unittests/ADT/TripleTest.cpp
index d5605cbb34d32462ba622de49f7775159d4d845f..78616d36e4f8a1e53928918000128b8b7898c36b 100644
--- a/unittests/ADT/TripleTest.cpp
+++ b/unittests/ADT/TripleTest.cpp
@@ -685,6 +685,54 @@ TEST(TripleTest, BitWidthArchVariants) {
   T.setArch(Triple::riscv64);
   EXPECT_EQ(Triple::riscv32, T.get32BitArchVariant().getArch());
   EXPECT_EQ(Triple::riscv64, T.get64BitArchVariant().getArch());
+
+  T.setArch(Triple::thumbeb);
+  EXPECT_EQ(Triple::thumbeb, T.get32BitArchVariant().getArch());
+  EXPECT_EQ(Triple::aarch64_be, T.get64BitArchVariant().getArch());
+
+  T.setArch(Triple::thumb);
+  EXPECT_EQ(Triple::thumb, T.get32BitArchVariant().getArch());
+  EXPECT_EQ(Triple::aarch64, T.get64BitArchVariant().getArch());
+
+  T.setArch(Triple::aarch64);
+  EXPECT_EQ(Triple::arm, T.get32BitArchVariant().getArch());
+  EXPECT_EQ(Triple::aarch64, T.get64BitArchVariant().getArch());
+
+  T.setArch(Triple::aarch64_be);
+  EXPECT_EQ(Triple::armeb, T.get32BitArchVariant().getArch());
+  EXPECT_EQ(Triple::aarch64_be, T.get64BitArchVariant().getArch());
+
+  T.setArch(Triple::renderscript32);
+  EXPECT_EQ(Triple::renderscript32, T.get32BitArchVariant().getArch());
+  EXPECT_EQ(Triple::renderscript64, T.get64BitArchVariant().getArch());
+
+  T.setArch(Triple::renderscript64);
+  EXPECT_EQ(Triple::renderscript32, T.get32BitArchVariant().getArch());
+  EXPECT_EQ(Triple::renderscript64, T.get64BitArchVariant().getArch());
+
+  T.setArch(Triple::le32);
+  EXPECT_EQ(Triple::le32, T.get32BitArchVariant().getArch());
+  EXPECT_EQ(Triple::le64, T.get64BitArchVariant().getArch());
+
+  T.setArch(Triple::le64);
+  EXPECT_EQ(Triple::le32, T.get32BitArchVariant().getArch());
+  EXPECT_EQ(Triple::le64, T.get64BitArchVariant().getArch());
+
+  T.setArch(Triple::armeb);
+  EXPECT_EQ(Triple::armeb, T.get32BitArchVariant().getArch());
+  EXPECT_EQ(Triple::aarch64_be, T.get64BitArchVariant().getArch());
+
+  T.setArch(Triple::arm);
+  EXPECT_EQ(Triple::arm, T.get32BitArchVariant().getArch());
+  EXPECT_EQ(Triple::aarch64, T.get64BitArchVariant().getArch());
+
+  T.setArch(Triple::systemz);
+  EXPECT_EQ(Triple::UnknownArch, T.get32BitArchVariant().getArch());
+  EXPECT_EQ(Triple::systemz, T.get64BitArchVariant().getArch());
+
+  T.setArch(Triple::xcore);
+  EXPECT_EQ(Triple::xcore, T.get32BitArchVariant().getArch());
+  EXPECT_EQ(Triple::UnknownArch, T.get64BitArchVariant().getArch());
 }
 
 TEST(TripleTest, EndianArchVariants) {
@@ -775,6 +823,22 @@ TEST(TripleTest, EndianArchVariants) {
   T.setArch(Triple::lanai);
   EXPECT_EQ(Triple::lanai, T.getBigEndianArchVariant().getArch());
   EXPECT_EQ(Triple::UnknownArch, T.getLittleEndianArchVariant().getArch());
+
+  T.setArch(Triple::tcele);
+  EXPECT_EQ(Triple::tce, T.getBigEndianArchVariant().getArch());
+  EXPECT_EQ(Triple::tcele, T.getLittleEndianArchVariant().getArch());
+
+  T.setArch(Triple::tce);
+  EXPECT_EQ(Triple::tce, T.getBigEndianArchVariant().getArch());
+  EXPECT_EQ(Triple::tcele, T.getLittleEndianArchVariant().getArch());
+
+  T.setArch(Triple::le32);
+  EXPECT_EQ(Triple::UnknownArch, T.getBigEndianArchVariant().getArch());
+  EXPECT_EQ(Triple::le32, T.getLittleEndianArchVariant().getArch());
+
+  T.setArch(Triple::le64);
+  EXPECT_EQ(Triple::UnknownArch, T.getBigEndianArchVariant().getArch());
+  EXPECT_EQ(Triple::le64, T.getLittleEndianArchVariant().getArch());
 }
 
 TEST(TripleTest, getOSVersion) {
@@ -906,6 +970,9 @@ TEST(TripleTest, FileFormat) {
   Triple T = Triple("");
   T.setObjectFormat(Triple::ELF);
   EXPECT_EQ(Triple::ELF, T.getObjectFormat());
+
+  T.setObjectFormat(Triple::MachO);
+  EXPECT_EQ(Triple::MachO, T.getObjectFormat());
 }
 
 TEST(TripleTest, NormalizeWindows) {
@@ -950,6 +1017,10 @@ TEST(TripleTest, getARMCPUForArch) {
     llvm::Triple Triple("arm--nacl");
     EXPECT_EQ("cortex-a8", Triple.getARMCPUForArch());
   }
+  {
+    llvm::Triple Triple("arm--openbsd");
+    EXPECT_EQ("cortex-a8", Triple.getARMCPUForArch());
+  }
   {
     llvm::Triple Triple("armv6-unknown-freebsd");
     EXPECT_EQ("arm1176jzf-s", Triple.getARMCPUForArch());
diff --git a/unittests/Analysis/CMakeLists.txt b/unittests/Analysis/CMakeLists.txt
index 625816ffb0887b0c3422386a3bc22ffd97a71a5d..40d5ea5f5ad783f0bc4cd6287cb88aeb29b1443c 100644
--- a/unittests/Analysis/CMakeLists.txt
+++ b/unittests/Analysis/CMakeLists.txt
@@ -15,6 +15,7 @@ add_llvm_unittest(AnalysisTests
   LazyCallGraphTest.cpp
   LoopInfoTest.cpp
   MemoryBuiltinsTest.cpp
+  MemorySSA.cpp
   ProfileSummaryInfoTest.cpp
   ScalarEvolutionTest.cpp
   TBAATest.cpp
diff --git a/unittests/Analysis/LazyCallGraphTest.cpp b/unittests/Analysis/LazyCallGraphTest.cpp
index 5bb9dec3449fe8ba3d64553dec7326ca773c2aa2..6955beb37109d583c495c788fd83c1dcc78f5a0f 100644
--- a/unittests/Analysis/LazyCallGraphTest.cpp
+++ b/unittests/Analysis/LazyCallGraphTest.cpp
@@ -225,29 +225,29 @@ TEST(LazyCallGraphTest, BasicGraphFormation) {
   // the IR, and everything in our module is an entry node, so just directly
   // build variables for each node.
   auto I = CG.begin();
-  LazyCallGraph::Node &A1 = (I++)->getNode(CG);
+  LazyCallGraph::Node &A1 = (I++)->getNode();
   EXPECT_EQ("a1", A1.getFunction().getName());
-  LazyCallGraph::Node &A2 = (I++)->getNode(CG);
+  LazyCallGraph::Node &A2 = (I++)->getNode();
   EXPECT_EQ("a2", A2.getFunction().getName());
-  LazyCallGraph::Node &A3 = (I++)->getNode(CG);
+  LazyCallGraph::Node &A3 = (I++)->getNode();
   EXPECT_EQ("a3", A3.getFunction().getName());
-  LazyCallGraph::Node &B1 = (I++)->getNode(CG);
+  LazyCallGraph::Node &B1 = (I++)->getNode();
   EXPECT_EQ("b1", B1.getFunction().getName());
-  LazyCallGraph::Node &B2 = (I++)->getNode(CG);
+  LazyCallGraph::Node &B2 = (I++)->getNode();
   EXPECT_EQ("b2", B2.getFunction().getName());
-  LazyCallGraph::Node &B3 = (I++)->getNode(CG);
+  LazyCallGraph::Node &B3 = (I++)->getNode();
   EXPECT_EQ("b3", B3.getFunction().getName());
-  LazyCallGraph::Node &C1 = (I++)->getNode(CG);
+  LazyCallGraph::Node &C1 = (I++)->getNode();
   EXPECT_EQ("c1", C1.getFunction().getName());
-  LazyCallGraph::Node &C2 = (I++)->getNode(CG);
+  LazyCallGraph::Node &C2 = (I++)->getNode();
   EXPECT_EQ("c2", C2.getFunction().getName());
-  LazyCallGraph::Node &C3 = (I++)->getNode(CG);
+  LazyCallGraph::Node &C3 = (I++)->getNode();
   EXPECT_EQ("c3", C3.getFunction().getName());
-  LazyCallGraph::Node &D1 = (I++)->getNode(CG);
+  LazyCallGraph::Node &D1 = (I++)->getNode();
   EXPECT_EQ("d1", D1.getFunction().getName());
-  LazyCallGraph::Node &D2 = (I++)->getNode(CG);
+  LazyCallGraph::Node &D2 = (I++)->getNode();
   EXPECT_EQ("d2", D2.getFunction().getName());
-  LazyCallGraph::Node &D3 = (I++)->getNode(CG);
+  LazyCallGraph::Node &D3 = (I++)->getNode();
   EXPECT_EQ("d3", D3.getFunction().getName());
   EXPECT_EQ(CG.end(), I);
 
@@ -255,7 +255,7 @@ TEST(LazyCallGraphTest, BasicGraphFormation) {
   // independent of order.
   std::vector<std::string> Nodes;
 
-  for (LazyCallGraph::Edge &E : A1)
+  for (LazyCallGraph::Edge &E : A1.populate())
     Nodes.push_back(E.getFunction().getName());
   std::sort(Nodes.begin(), Nodes.end());
   EXPECT_EQ("a2", Nodes[0]);
@@ -263,43 +263,53 @@ TEST(LazyCallGraphTest, BasicGraphFormation) {
   EXPECT_EQ("c3", Nodes[2]);
   Nodes.clear();
 
-  EXPECT_EQ(A2.end(), std::next(A2.begin()));
-  EXPECT_EQ("a3", A2.begin()->getFunction().getName());
-  EXPECT_EQ(A3.end(), std::next(A3.begin()));
-  EXPECT_EQ("a1", A3.begin()->getFunction().getName());
+  A2.populate();
+  EXPECT_EQ(A2->end(), std::next(A2->begin()));
+  EXPECT_EQ("a3", A2->begin()->getFunction().getName());
+  A3.populate();
+  EXPECT_EQ(A3->end(), std::next(A3->begin()));
+  EXPECT_EQ("a1", A3->begin()->getFunction().getName());
 
-  for (LazyCallGraph::Edge &E : B1)
+  for (LazyCallGraph::Edge &E : B1.populate())
     Nodes.push_back(E.getFunction().getName());
   std::sort(Nodes.begin(), Nodes.end());
   EXPECT_EQ("b2", Nodes[0]);
   EXPECT_EQ("d3", Nodes[1]);
   Nodes.clear();
 
-  EXPECT_EQ(B2.end(), std::next(B2.begin()));
-  EXPECT_EQ("b3", B2.begin()->getFunction().getName());
-  EXPECT_EQ(B3.end(), std::next(B3.begin()));
-  EXPECT_EQ("b1", B3.begin()->getFunction().getName());
+  B2.populate();
+  EXPECT_EQ(B2->end(), std::next(B2->begin()));
+  EXPECT_EQ("b3", B2->begin()->getFunction().getName());
+  B3.populate();
+  EXPECT_EQ(B3->end(), std::next(B3->begin()));
+  EXPECT_EQ("b1", B3->begin()->getFunction().getName());
 
-  for (LazyCallGraph::Edge &E : C1)
+  for (LazyCallGraph::Edge &E : C1.populate())
     Nodes.push_back(E.getFunction().getName());
   std::sort(Nodes.begin(), Nodes.end());
   EXPECT_EQ("c2", Nodes[0]);
   EXPECT_EQ("d2", Nodes[1]);
   Nodes.clear();
 
-  EXPECT_EQ(C2.end(), std::next(C2.begin()));
-  EXPECT_EQ("c3", C2.begin()->getFunction().getName());
-  EXPECT_EQ(C3.end(), std::next(C3.begin()));
-  EXPECT_EQ("c1", C3.begin()->getFunction().getName());
-
-  EXPECT_EQ(D1.end(), std::next(D1.begin()));
-  EXPECT_EQ("d2", D1.begin()->getFunction().getName());
-  EXPECT_EQ(D2.end(), std::next(D2.begin()));
-  EXPECT_EQ("d3", D2.begin()->getFunction().getName());
-  EXPECT_EQ(D3.end(), std::next(D3.begin()));
-  EXPECT_EQ("d1", D3.begin()->getFunction().getName());
+  C2.populate();
+  EXPECT_EQ(C2->end(), std::next(C2->begin()));
+  EXPECT_EQ("c3", C2->begin()->getFunction().getName());
+  C3.populate();
+  EXPECT_EQ(C3->end(), std::next(C3->begin()));
+  EXPECT_EQ("c1", C3->begin()->getFunction().getName());
+
+  D1.populate();
+  EXPECT_EQ(D1->end(), std::next(D1->begin()));
+  EXPECT_EQ("d2", D1->begin()->getFunction().getName());
+  D2.populate();
+  EXPECT_EQ(D2->end(), std::next(D2->begin()));
+  EXPECT_EQ("d3", D2->begin()->getFunction().getName());
+  D3.populate();
+  EXPECT_EQ(D3->end(), std::next(D3->begin()));
+  EXPECT_EQ("d1", D3->begin()->getFunction().getName());
 
   // Now lets look at the RefSCCs and SCCs.
+  CG.buildRefSCCs();
   auto J = CG.postorder_ref_scc_begin();
 
   LazyCallGraph::RefSCC &D = *J++;
@@ -401,32 +411,35 @@ TEST(LazyCallGraphTest, BasicGraphMutation) {
 
   LazyCallGraph::Node &A = CG.get(lookupFunction(*M, "a"));
   LazyCallGraph::Node &B = CG.get(lookupFunction(*M, "b"));
-  EXPECT_EQ(2, std::distance(A.begin(), A.end()));
-  EXPECT_EQ(0, std::distance(B.begin(), B.end()));
-
-  CG.insertEdge(B, lookupFunction(*M, "c"), LazyCallGraph::Edge::Call);
-  EXPECT_EQ(1, std::distance(B.begin(), B.end()));
-  LazyCallGraph::Node &C = B.begin()->getNode(CG);
-  EXPECT_EQ(0, std::distance(C.begin(), C.end()));
-
-  CG.insertEdge(C, B.getFunction(), LazyCallGraph::Edge::Call);
-  EXPECT_EQ(1, std::distance(C.begin(), C.end()));
-  EXPECT_EQ(&B, C.begin()->getNode());
-
-  CG.insertEdge(C, C.getFunction(), LazyCallGraph::Edge::Call);
-  EXPECT_EQ(2, std::distance(C.begin(), C.end()));
-  EXPECT_EQ(&B, C.begin()->getNode());
-  EXPECT_EQ(&C, std::next(C.begin())->getNode());
-
-  CG.removeEdge(C, B.getFunction());
-  EXPECT_EQ(1, std::distance(C.begin(), C.end()));
-  EXPECT_EQ(&C, C.begin()->getNode());
-
-  CG.removeEdge(C, C.getFunction());
-  EXPECT_EQ(0, std::distance(C.begin(), C.end()));
-
-  CG.removeEdge(B, C.getFunction());
-  EXPECT_EQ(0, std::distance(B.begin(), B.end()));
+  A.populate();
+  EXPECT_EQ(2, std::distance(A->begin(), A->end()));
+  B.populate();
+  EXPECT_EQ(0, std::distance(B->begin(), B->end()));
+
+  LazyCallGraph::Node &C = CG.get(lookupFunction(*M, "c"));
+  C.populate();
+  CG.insertEdge(B, C, LazyCallGraph::Edge::Call);
+  EXPECT_EQ(1, std::distance(B->begin(), B->end()));
+  EXPECT_EQ(0, std::distance(C->begin(), C->end()));
+
+  CG.insertEdge(C, B, LazyCallGraph::Edge::Call);
+  EXPECT_EQ(1, std::distance(C->begin(), C->end()));
+  EXPECT_EQ(&B, &C->begin()->getNode());
+
+  CG.insertEdge(C, C, LazyCallGraph::Edge::Call);
+  EXPECT_EQ(2, std::distance(C->begin(), C->end()));
+  EXPECT_EQ(&B, &C->begin()->getNode());
+  EXPECT_EQ(&C, &std::next(C->begin())->getNode());
+
+  CG.removeEdge(C, B);
+  EXPECT_EQ(1, std::distance(C->begin(), C->end()));
+  EXPECT_EQ(&C, &C->begin()->getNode());
+
+  CG.removeEdge(C, C);
+  EXPECT_EQ(0, std::distance(C->begin(), C->end()));
+
+  CG.removeEdge(B, C);
+  EXPECT_EQ(0, std::distance(B->begin(), B->end()));
 }
 
 TEST(LazyCallGraphTest, InnerSCCFormation) {
@@ -436,14 +449,18 @@ TEST(LazyCallGraphTest, InnerSCCFormation) {
 
   // Now mutate the graph to connect every node into a single RefSCC to ensure
   // that our inner SCC formation handles the rest.
-  CG.insertEdge(lookupFunction(*M, "d1"), lookupFunction(*M, "a1"),
-                LazyCallGraph::Edge::Ref);
+  LazyCallGraph::Node &D1 = CG.get(lookupFunction(*M, "d1"));
+  LazyCallGraph::Node &A1 = CG.get(lookupFunction(*M, "a1"));
+  A1.populate();
+  D1.populate();
+  CG.insertEdge(D1, A1, LazyCallGraph::Edge::Ref);
 
   // Build vectors and sort them for the rest of the assertions to make them
   // independent of order.
   std::vector<std::string> Nodes;
 
   // We should build a single RefSCC for the entire graph.
+  CG.buildRefSCCs();
   auto I = CG.postorder_ref_scc_begin();
   LazyCallGraph::RefSCC &RC = *I++;
   EXPECT_EQ(CG.postorder_ref_scc_end(), I);
@@ -528,6 +545,7 @@ TEST(LazyCallGraphTest, MultiArmSCC) {
   LazyCallGraph CG(*M);
 
   // Force the graph to be fully expanded.
+  CG.buildRefSCCs();
   auto I = CG.postorder_ref_scc_begin();
   LazyCallGraph::RefSCC &RC = *I++;
   EXPECT_EQ(CG.postorder_ref_scc_end(), I);
@@ -578,6 +596,7 @@ TEST(LazyCallGraphTest, OutgoingEdgeMutation) {
   LazyCallGraph CG(*M);
 
   // Force the graph to be fully expanded.
+  CG.buildRefSCCs();
   for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs())
     dbgs() << "Formed RefSCC: " << RC << "\n";
 
@@ -610,13 +629,13 @@ TEST(LazyCallGraphTest, OutgoingEdgeMutation) {
   EXPECT_TRUE(DRC.isChildOf(CRC));
   EXPECT_TRUE(DC.isChildOf(CC));
 
-  EXPECT_EQ(2, std::distance(A.begin(), A.end()));
+  EXPECT_EQ(2, std::distance(A->begin(), A->end()));
   ARC.insertOutgoingEdge(A, D, LazyCallGraph::Edge::Call);
-  EXPECT_EQ(3, std::distance(A.begin(), A.end()));
-  const LazyCallGraph::Edge &NewE = A[D];
+  EXPECT_EQ(3, std::distance(A->begin(), A->end()));
+  const LazyCallGraph::Edge &NewE = (*A)[D];
   EXPECT_TRUE(NewE);
   EXPECT_TRUE(NewE.isCall());
-  EXPECT_EQ(&D, NewE.getNode());
+  EXPECT_EQ(&D, &NewE.getNode());
 
   // Only the parent and child tests sholud have changed. The rest of the graph
   // remains the same.
@@ -680,7 +699,7 @@ TEST(LazyCallGraphTest, OutgoingEdgeMutation) {
   EXPECT_EQ(&DRC, CG.lookupRefSCC(D));
 
   ARC.removeOutgoingEdge(A, D);
-  EXPECT_EQ(2, std::distance(A.begin(), A.end()));
+  EXPECT_EQ(2, std::distance(A->begin(), A->end()));
 
   // Now the parent and child tests fail again but the rest remains the same.
   EXPECT_FALSE(ARC.isParentOf(DRC));
@@ -723,6 +742,7 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertion) {
   LazyCallGraph CG(*M);
 
   // Force the graph to be fully expanded.
+  CG.buildRefSCCs();
   for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs())
     dbgs() << "Formed RefSCC: " << RC << "\n";
 
@@ -750,7 +770,7 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertion) {
   ASSERT_EQ(&CRC, CG.lookupRefSCC(C3));
   ASSERT_EQ(&DRC, CG.lookupRefSCC(D2));
   ASSERT_EQ(&DRC, CG.lookupRefSCC(D3));
-  ASSERT_EQ(1, std::distance(D2.begin(), D2.end()));
+  ASSERT_EQ(1, std::distance(D2->begin(), D2->end()));
 
   // Add an edge to make the graph:
   //
@@ -767,10 +787,10 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertion) {
   //      a3--a2        |
   auto MergedRCs = CRC.insertIncomingRefEdge(D2, C2);
   // Make sure we connected the nodes.
-  for (LazyCallGraph::Edge E : D2) {
-    if (E.getNode() == &D3)
+  for (LazyCallGraph::Edge E : *D2) {
+    if (&E.getNode() == &D3)
       continue;
-    EXPECT_EQ(&C2, E.getNode());
+    EXPECT_EQ(&C2, &E.getNode());
   }
   // And marked the D ref-SCC as no longer valid.
   EXPECT_EQ(1u, MergedRCs.size());
@@ -805,102 +825,6 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertion) {
   EXPECT_EQ(++I, E);
 }
 
-TEST(LazyCallGraphTest, IncomingEdgeInsertionMidTraversal) {
-  LLVMContext Context;
-  // This is the same fundamental test as the previous, but we perform it
-  // having only partially walked the RefSCCs of the graph.
-  std::unique_ptr<Module> M = parseAssembly(Context, DiamondOfTriangles);
-  LazyCallGraph CG(*M);
-
-  // Walk the RefSCCs until we find the one containing 'c1'.
-  auto I = CG.postorder_ref_scc_begin(), E = CG.postorder_ref_scc_end();
-  ASSERT_NE(I, E);
-  LazyCallGraph::RefSCC &DRC = *I;
-  ASSERT_NE(&DRC, nullptr);
-  ++I;
-  ASSERT_NE(I, E);
-  LazyCallGraph::RefSCC &CRC = *I;
-  ASSERT_NE(&CRC, nullptr);
-
-  ASSERT_EQ(nullptr, CG.lookup(lookupFunction(*M, "a1")));
-  ASSERT_EQ(nullptr, CG.lookup(lookupFunction(*M, "a2")));
-  ASSERT_EQ(nullptr, CG.lookup(lookupFunction(*M, "a3")));
-  ASSERT_EQ(nullptr, CG.lookup(lookupFunction(*M, "b1")));
-  ASSERT_EQ(nullptr, CG.lookup(lookupFunction(*M, "b2")));
-  ASSERT_EQ(nullptr, CG.lookup(lookupFunction(*M, "b3")));
-  LazyCallGraph::Node &C1 = *CG.lookup(lookupFunction(*M, "c1"));
-  LazyCallGraph::Node &C2 = *CG.lookup(lookupFunction(*M, "c2"));
-  LazyCallGraph::Node &C3 = *CG.lookup(lookupFunction(*M, "c3"));
-  LazyCallGraph::Node &D1 = *CG.lookup(lookupFunction(*M, "d1"));
-  LazyCallGraph::Node &D2 = *CG.lookup(lookupFunction(*M, "d2"));
-  LazyCallGraph::Node &D3 = *CG.lookup(lookupFunction(*M, "d3"));
-  ASSERT_EQ(&CRC, CG.lookupRefSCC(C1));
-  ASSERT_EQ(&CRC, CG.lookupRefSCC(C2));
-  ASSERT_EQ(&CRC, CG.lookupRefSCC(C3));
-  ASSERT_EQ(&DRC, CG.lookupRefSCC(D1));
-  ASSERT_EQ(&DRC, CG.lookupRefSCC(D2));
-  ASSERT_EQ(&DRC, CG.lookupRefSCC(D3));
-  ASSERT_EQ(1, std::distance(D2.begin(), D2.end()));
-
-  auto MergedRCs = CRC.insertIncomingRefEdge(D2, C2);
-  // Make sure we connected the nodes.
-  for (LazyCallGraph::Edge E : D2) {
-    if (E.getNode() == &D3)
-      continue;
-    EXPECT_EQ(&C2, E.getNode());
-  }
-  // And marked the D ref-SCC as no longer valid.
-  EXPECT_EQ(1u, MergedRCs.size());
-  EXPECT_EQ(&DRC, MergedRCs[0]);
-
-  // Make sure we have the correct nodes in the RefSCCs.
-  EXPECT_EQ(&CRC, CG.lookupRefSCC(C1));
-  EXPECT_EQ(&CRC, CG.lookupRefSCC(C2));
-  EXPECT_EQ(&CRC, CG.lookupRefSCC(C3));
-  EXPECT_EQ(&CRC, CG.lookupRefSCC(D1));
-  EXPECT_EQ(&CRC, CG.lookupRefSCC(D2));
-  EXPECT_EQ(&CRC, CG.lookupRefSCC(D3));
-
-  // Verify that the post-order walk reflects the updated but still incomplete
-  // structure.
-  auto J = CG.postorder_ref_scc_begin();
-  EXPECT_NE(J, E);
-  EXPECT_EQ(&CRC, &*J) << "Actual RefSCC: " << *J;
-  EXPECT_EQ(I, J);
-
-  // Check that we can form the last two RefSCCs now, and even that we can do
-  // it with alternating iterators.
-  ++J;
-  EXPECT_NE(J, E);
-  LazyCallGraph::RefSCC &BRC = *J;
-  EXPECT_NE(&BRC, nullptr);
-  EXPECT_EQ(&BRC, CG.lookupRefSCC(*CG.lookup(lookupFunction(*M, "b1"))));
-  EXPECT_EQ(&BRC, CG.lookupRefSCC(*CG.lookup(lookupFunction(*M, "b2"))));
-  EXPECT_EQ(&BRC, CG.lookupRefSCC(*CG.lookup(lookupFunction(*M, "b3"))));
-  EXPECT_TRUE(BRC.isParentOf(CRC));
-  ++I;
-  EXPECT_EQ(J, I);
-  EXPECT_EQ(&BRC, &*I) << "Actual RefSCC: " << *I;
-
-  // Increment I this time to form the new RefSCC, flopping back to the first
-  // iterator.
-  ++I;
-  EXPECT_NE(I, E);
-  LazyCallGraph::RefSCC &ARC = *I;
-  EXPECT_NE(&ARC, nullptr);
-  EXPECT_EQ(&ARC, CG.lookupRefSCC(*CG.lookup(lookupFunction(*M, "a1"))));
-  EXPECT_EQ(&ARC, CG.lookupRefSCC(*CG.lookup(lookupFunction(*M, "a2"))));
-  EXPECT_EQ(&ARC, CG.lookupRefSCC(*CG.lookup(lookupFunction(*M, "a3"))));
-  EXPECT_TRUE(ARC.isParentOf(CRC));
-  ++J;
-  EXPECT_EQ(I, J);
-  EXPECT_EQ(&ARC, &*J) << "Actual RefSCC: " << *J;
-  ++I;
-  EXPECT_EQ(E, I);
-  ++J;
-  EXPECT_EQ(E, J);
-}
-
 TEST(LazyCallGraphTest, IncomingEdgeInsertionRefGraph) {
   LLVMContext Context;
   // Another variation of the above test but with all the edges switched to
@@ -910,6 +834,7 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertionRefGraph) {
   LazyCallGraph CG(*M);
 
   // Force the graph to be fully expanded.
+  CG.buildRefSCCs();
   for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs())
     dbgs() << "Formed RefSCC: " << RC << "\n";
 
@@ -937,7 +862,7 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertionRefGraph) {
   ASSERT_EQ(&CRC, CG.lookupRefSCC(C3));
   ASSERT_EQ(&DRC, CG.lookupRefSCC(D2));
   ASSERT_EQ(&DRC, CG.lookupRefSCC(D3));
-  ASSERT_EQ(1, std::distance(D2.begin(), D2.end()));
+  ASSERT_EQ(1, std::distance(D2->begin(), D2->end()));
 
   // Add an edge to make the graph:
   //
@@ -954,10 +879,10 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertionRefGraph) {
   //      a3--a2        |
   auto MergedRCs = CRC.insertIncomingRefEdge(D2, C2);
   // Make sure we connected the nodes.
-  for (LazyCallGraph::Edge E : D2) {
-    if (E.getNode() == &D3)
+  for (LazyCallGraph::Edge E : *D2) {
+    if (&E.getNode() == &D3)
       continue;
-    EXPECT_EQ(&C2, E.getNode());
+    EXPECT_EQ(&C2, &E.getNode());
   }
   // And marked the D ref-SCC as no longer valid.
   EXPECT_EQ(1u, MergedRCs.size());
@@ -1016,6 +941,7 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertionLargeCallCycle) {
   LazyCallGraph CG(*M);
 
   // Force the graph to be fully expanded.
+  CG.buildRefSCCs();
   for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs())
     dbgs() << "Formed RefSCC: " << RC << "\n";
 
@@ -1035,8 +961,8 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertionLargeCallCycle) {
   // Connect the top to the bottom forming a large RefSCC made up mostly of calls.
   auto MergedRCs = ARC.insertIncomingRefEdge(D, A);
   // Make sure we connected the nodes.
-  EXPECT_NE(D.begin(), D.end());
-  EXPECT_EQ(&A, D.begin()->getNode());
+  EXPECT_NE(D->begin(), D->end());
+  EXPECT_EQ(&A, &D->begin()->getNode());
 
   // Check that we have the dead RCs, but ignore the order.
   EXPECT_EQ(3u, MergedRCs.size());
@@ -1092,6 +1018,7 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertionLargeRefCycle) {
   LazyCallGraph CG(*M);
 
   // Force the graph to be fully expanded.
+  CG.buildRefSCCs();
   for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs())
     dbgs() << "Formed RefSCC: " << RC << "\n";
 
@@ -1108,8 +1035,8 @@ TEST(LazyCallGraphTest, IncomingEdgeInsertionLargeRefCycle) {
   // references.
   auto MergedRCs = ARC.insertIncomingRefEdge(D, A);
   // Make sure we connected the nodes.
-  EXPECT_NE(D.begin(), D.end());
-  EXPECT_EQ(&A, D.begin()->getNode());
+  EXPECT_NE(D->begin(), D->end());
+  EXPECT_EQ(&A, &D->begin()->getNode());
 
   // Check that we have the dead RCs, but ignore the order.
   EXPECT_EQ(3u, MergedRCs.size());
@@ -1153,6 +1080,7 @@ TEST(LazyCallGraphTest, InlineAndDeleteFunction) {
   LazyCallGraph CG(*M);
 
   // Force the graph to be fully expanded.
+  CG.buildRefSCCs();
   for (LazyCallGraph::RefSCC &RC : CG.postorder_ref_sccs())
     dbgs() << "Formed RefSCC: " << RC << "\n";
 
@@ -1180,7 +1108,7 @@ TEST(LazyCallGraphTest, InlineAndDeleteFunction) {
   ASSERT_EQ(&CRC, CG.lookupRefSCC(C3));
   ASSERT_EQ(&DRC, CG.lookupRefSCC(D2));
   ASSERT_EQ(&DRC, CG.lookupRefSCC(D3));
-  ASSERT_EQ(1, std::distance(D2.begin(), D2.end()));
+  ASSERT_EQ(1, std::distance(D2->begin(), D2->end()));
 
   // Delete d2 from the graph, as if it had been inlined.
   //
@@ -1276,177 +1204,6 @@ TEST(LazyCallGraphTest, InlineAndDeleteFunction) {
   EXPECT_EQ(++I, E);
 }
 
-TEST(LazyCallGraphTest, InlineAndDeleteFunctionMidTraversal) {
-  LLVMContext Context;
-  // This is the same fundamental test as the previous, but we perform it
-  // having only partially walked the RefSCCs of the graph.
-  //
-  // The ascii diagram is repeated here for easy reference.
-  //
-  //         d1       |
-  //        /  \      |
-  //       d3--d2     |
-  //      /     \     |
-  //     b1     c1    |
-  //   /  \    /  \   |
-  //  b3--b2  c3--c2  |
-  //       \  /       |
-  //        a1        |
-  //       /  \       |
-  //      a3--a2      |
-  //
-  std::unique_ptr<Module> M = parseAssembly(Context, DiamondOfTriangles);
-  LazyCallGraph CG(*M);
-
-  // Walk the RefSCCs until we find the one containing 'c1'.
-  auto I = CG.postorder_ref_scc_begin(), E = CG.postorder_ref_scc_end();
-  ASSERT_NE(I, E);
-  LazyCallGraph::RefSCC &DRC = *I;
-  ASSERT_NE(&DRC, nullptr);
-  ++I;
-  ASSERT_NE(I, E);
-  LazyCallGraph::RefSCC &CRC = *I;
-  ASSERT_NE(&CRC, nullptr);
-
-  ASSERT_EQ(nullptr, CG.lookup(lookupFunction(*M, "a1")));
-  ASSERT_EQ(nullptr, CG.lookup(lookupFunction(*M, "a2")));
-  ASSERT_EQ(nullptr, CG.lookup(lookupFunction(*M, "a3")));
-  ASSERT_EQ(nullptr, CG.lookup(lookupFunction(*M, "b1")));
-  ASSERT_EQ(nullptr, CG.lookup(lookupFunction(*M, "b2")));
-  ASSERT_EQ(nullptr, CG.lookup(lookupFunction(*M, "b3")));
-  LazyCallGraph::Node &C1 = *CG.lookup(lookupFunction(*M, "c1"));
-  LazyCallGraph::Node &C2 = *CG.lookup(lookupFunction(*M, "c2"));
-  LazyCallGraph::Node &C3 = *CG.lookup(lookupFunction(*M, "c3"));
-  LazyCallGraph::Node &D1 = *CG.lookup(lookupFunction(*M, "d1"));
-  LazyCallGraph::Node &D2 = *CG.lookup(lookupFunction(*M, "d2"));
-  LazyCallGraph::Node &D3 = *CG.lookup(lookupFunction(*M, "d3"));
-  ASSERT_EQ(&CRC, CG.lookupRefSCC(C1));
-  ASSERT_EQ(&CRC, CG.lookupRefSCC(C2));
-  ASSERT_EQ(&CRC, CG.lookupRefSCC(C3));
-  ASSERT_EQ(&DRC, CG.lookupRefSCC(D1));
-  ASSERT_EQ(&DRC, CG.lookupRefSCC(D2));
-  ASSERT_EQ(&DRC, CG.lookupRefSCC(D3));
-  ASSERT_EQ(1, std::distance(D2.begin(), D2.end()));
-
-  // Delete d2 from the graph, as if it had been inlined.
-  //
-  //         d1         |
-  //        / /         |
-  //       d3--.        |
-  //      /     \       |
-  //     b1     c1      |
-  //   /  \    /  \     |
-  //  b3--b2  c3--c2    |
-  //       \  /         |
-  //        a1          |
-  //       /  \         |
-  //      a3--a2        |
-
-  Function &D2F = D2.getFunction();
-  CallInst *C1Call = nullptr, *D1Call = nullptr;
-  for (User *U : D2F.users()) {
-    CallInst *CI = dyn_cast<CallInst>(U);
-    ASSERT_TRUE(CI) << "Expected a call: " << *U;
-    if (CI->getParent()->getParent() == &C1.getFunction()) {
-      ASSERT_EQ(nullptr, C1Call) << "Found too many C1 calls: " << *CI;
-      C1Call = CI;
-    } else if (CI->getParent()->getParent() == &D1.getFunction()) {
-      ASSERT_EQ(nullptr, D1Call) << "Found too many D1 calls: " << *CI;
-      D1Call = CI;
-    } else {
-      FAIL() << "Found an unexpected call instruction: " << *CI;
-    }
-  }
-  ASSERT_NE(C1Call, nullptr);
-  ASSERT_NE(D1Call, nullptr);
-  ASSERT_EQ(&D2F, C1Call->getCalledFunction());
-  ASSERT_EQ(&D2F, D1Call->getCalledFunction());
-  C1Call->setCalledFunction(&D3.getFunction());
-  D1Call->setCalledFunction(&D3.getFunction());
-  ASSERT_EQ(0u, D2F.getNumUses());
-
-  // Insert new edges first.
-  CRC.insertTrivialCallEdge(C1, D3);
-  DRC.insertTrivialCallEdge(D1, D3);
-
-  // Then remove the old ones.
-  LazyCallGraph::SCC &DC = *CG.lookupSCC(D2);
-  auto NewCs = DRC.switchInternalEdgeToRef(D1, D2);
-  EXPECT_EQ(&DC, CG.lookupSCC(D2));
-  EXPECT_EQ(NewCs.end(), std::next(NewCs.begin()));
-  LazyCallGraph::SCC &NewDC = *NewCs.begin();
-  EXPECT_EQ(&NewDC, CG.lookupSCC(D1));
-  EXPECT_EQ(&NewDC, CG.lookupSCC(D3));
-  auto NewRCs = DRC.removeInternalRefEdge(D1, D2);
-  EXPECT_EQ(&DRC, CG.lookupRefSCC(D2));
-  EXPECT_EQ(NewRCs.end(), std::next(NewRCs.begin()));
-  LazyCallGraph::RefSCC &NewDRC = **NewRCs.begin();
-  EXPECT_EQ(&NewDRC, CG.lookupRefSCC(D1));
-  EXPECT_EQ(&NewDRC, CG.lookupRefSCC(D3));
-  EXPECT_FALSE(NewDRC.isParentOf(DRC));
-  EXPECT_TRUE(CRC.isParentOf(DRC));
-  EXPECT_TRUE(CRC.isParentOf(NewDRC));
-  EXPECT_TRUE(DRC.isParentOf(NewDRC));
-  CRC.removeOutgoingEdge(C1, D2);
-  EXPECT_FALSE(CRC.isParentOf(DRC));
-  EXPECT_TRUE(CRC.isParentOf(NewDRC));
-  EXPECT_TRUE(DRC.isParentOf(NewDRC));
-
-  // Now that we've updated the call graph, D2 is dead, so remove it.
-  CG.removeDeadFunction(D2F);
-
-  // Check that the graph still looks the same.
-  EXPECT_EQ(&CRC, CG.lookupRefSCC(C1));
-  EXPECT_EQ(&CRC, CG.lookupRefSCC(C2));
-  EXPECT_EQ(&CRC, CG.lookupRefSCC(C3));
-  EXPECT_EQ(&NewDRC, CG.lookupRefSCC(D1));
-  EXPECT_EQ(&NewDRC, CG.lookupRefSCC(D3));
-  EXPECT_TRUE(CRC.isParentOf(NewDRC));
-
-  // Verify that the post-order walk reflects the updated but still incomplete
-  // structure.
-  auto J = CG.postorder_ref_scc_begin();
-  EXPECT_NE(J, E);
-  EXPECT_EQ(&NewDRC, &*J) << "Actual RefSCC: " << *J;
-  ++J;
-  EXPECT_NE(J, E);
-  EXPECT_EQ(&CRC, &*J) << "Actual RefSCC: " << *J;
-  EXPECT_EQ(I, J);
-
-  // Check that we can form the last two RefSCCs now, and even that we can do
-  // it with alternating iterators.
-  ++J;
-  EXPECT_NE(J, E);
-  LazyCallGraph::RefSCC &BRC = *J;
-  EXPECT_NE(&BRC, nullptr);
-  EXPECT_EQ(&BRC, CG.lookupRefSCC(*CG.lookup(lookupFunction(*M, "b1"))));
-  EXPECT_EQ(&BRC, CG.lookupRefSCC(*CG.lookup(lookupFunction(*M, "b2"))));
-  EXPECT_EQ(&BRC, CG.lookupRefSCC(*CG.lookup(lookupFunction(*M, "b3"))));
-  EXPECT_TRUE(BRC.isParentOf(NewDRC));
-  ++I;
-  EXPECT_EQ(J, I);
-  EXPECT_EQ(&BRC, &*I) << "Actual RefSCC: " << *I;
-
-  // Increment I this time to form the new RefSCC, flopping back to the first
-  // iterator.
-  ++I;
-  EXPECT_NE(I, E);
-  LazyCallGraph::RefSCC &ARC = *I;
-  EXPECT_NE(&ARC, nullptr);
-  EXPECT_EQ(&ARC, CG.lookupRefSCC(*CG.lookup(lookupFunction(*M, "a1"))));
-  EXPECT_EQ(&ARC, CG.lookupRefSCC(*CG.lookup(lookupFunction(*M, "a2"))));
-  EXPECT_EQ(&ARC, CG.lookupRefSCC(*CG.lookup(lookupFunction(*M, "a3"))));
-  EXPECT_TRUE(ARC.isParentOf(BRC));
-  EXPECT_TRUE(ARC.isParentOf(CRC));
-  ++J;
-  EXPECT_EQ(I, J);
-  EXPECT_EQ(&ARC, &*J) << "Actual RefSCC: " << *J;
-  ++I;
-  EXPECT_EQ(E, I);
-  ++J;
-  EXPECT_EQ(E, J);
-}
-
 TEST(LazyCallGraphTest, InternalEdgeMutation) {
   LLVMContext Context;
   std::unique_ptr<Module> M = parseAssembly(Context, "define void @a() {\n"
@@ -1467,6 +1224,7 @@ TEST(LazyCallGraphTest, InternalEdgeMutation) {
   LazyCallGraph CG(*M);
 
   // Force the graph to be fully expanded.
+  CG.buildRefSCCs();
   auto I = CG.postorder_ref_scc_begin();
   LazyCallGraph::RefSCC &RC = *I++;
   EXPECT_EQ(CG.postorder_ref_scc_end(), I);
@@ -1484,7 +1242,7 @@ TEST(LazyCallGraphTest, InternalEdgeMutation) {
 
   // Insert an edge from 'a' to 'c'. Nothing changes about the graph.
   RC.insertInternalRefEdge(A, C);
-  EXPECT_EQ(2, std::distance(A.begin(), A.end()));
+  EXPECT_EQ(2, std::distance(A->begin(), A->end()));
   EXPECT_EQ(&RC, CG.lookupRefSCC(A));
   EXPECT_EQ(&RC, CG.lookupRefSCC(B));
   EXPECT_EQ(&RC, CG.lookupRefSCC(C));
@@ -1559,6 +1317,7 @@ TEST(LazyCallGraphTest, InternalEdgeRemoval) {
   LazyCallGraph CG(*M);
 
   // Force the graph to be fully expanded.
+  CG.buildRefSCCs();
   auto I = CG.postorder_ref_scc_begin(), E = CG.postorder_ref_scc_end();
   LazyCallGraph::RefSCC &RC = *I;
   EXPECT_EQ(E, std::next(I));
@@ -1633,6 +1392,7 @@ TEST(LazyCallGraphTest, InternalNoOpEdgeRemoval) {
   LazyCallGraph CG(*M);
 
   // Force the graph to be fully expanded.
+  CG.buildRefSCCs();
   auto I = CG.postorder_ref_scc_begin(), E = CG.postorder_ref_scc_end();
   LazyCallGraph::RefSCC &RC = *I;
   EXPECT_EQ(E, std::next(I));
@@ -1709,6 +1469,7 @@ TEST(LazyCallGraphTest, InternalCallEdgeToRef) {
   LazyCallGraph CG(*M);
 
   // Force the graph to be fully expanded.
+  CG.buildRefSCCs();
   auto I = CG.postorder_ref_scc_begin();
   LazyCallGraph::RefSCC &RC = *I++;
   EXPECT_EQ(CG.postorder_ref_scc_end(), I);
@@ -1801,6 +1562,7 @@ TEST(LazyCallGraphTest, InternalRefEdgeToCall) {
   LazyCallGraph CG(*M);
 
   // Force the graph to be fully expanded.
+  CG.buildRefSCCs();
   auto I = CG.postorder_ref_scc_begin();
   LazyCallGraph::RefSCC &RC = *I++;
   EXPECT_EQ(CG.postorder_ref_scc_end(), I);
@@ -1913,6 +1675,7 @@ TEST(LazyCallGraphTest, InternalRefEdgeToCallNoCycleInterleaved) {
   LazyCallGraph CG(*M);
 
   // Force the graph to be fully expanded.
+  CG.buildRefSCCs();
   auto I = CG.postorder_ref_scc_begin();
   LazyCallGraph::RefSCC &RC = *I++;
   EXPECT_EQ(CG.postorder_ref_scc_end(), I);
@@ -2043,6 +1806,7 @@ TEST(LazyCallGraphTest, InternalRefEdgeToCallBothPartitionAndMerge) {
   LazyCallGraph CG(*M);
 
   // Force the graph to be fully expanded.
+  CG.buildRefSCCs();
   auto I = CG.postorder_ref_scc_begin();
   LazyCallGraph::RefSCC &RC = *I++;
   EXPECT_EQ(CG.postorder_ref_scc_end(), I);
@@ -2122,6 +1886,7 @@ TEST(LazyCallGraphTest, HandleBlockAddress) {
                              "}\n");
   LazyCallGraph CG(*M);
 
+  CG.buildRefSCCs();
   auto I = CG.postorder_ref_scc_begin();
   LazyCallGraph::RefSCC &FRC = *I++;
   LazyCallGraph::RefSCC &GRC = *I++;
@@ -2134,4 +1899,165 @@ TEST(LazyCallGraphTest, HandleBlockAddress) {
   EXPECT_TRUE(GRC.isParentOf(FRC));
 }
 
+TEST(LazyCallGraphTest, ReplaceNodeFunction) {
+  LLVMContext Context;
+  // A graph with several different kinds of edges pointing at a particular
+  // function.
+  std::unique_ptr<Module> M =
+      parseAssembly(Context,
+                    "define void @a(i8** %ptr) {\n"
+                    "entry:\n"
+                    "  store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n"
+                    "  ret void\n"
+                    "}\n"
+                    "define void @b(i8** %ptr) {\n"
+                    "entry:\n"
+                    "  store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n"
+                    "  store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n"
+                    "  call void @d(i8** %ptr)"
+                    "  ret void\n"
+                    "}\n"
+                    "define void @c(i8** %ptr) {\n"
+                    "entry:\n"
+                    "  call void @d(i8** %ptr)"
+                    "  call void @d(i8** %ptr)"
+                    "  store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n"
+                    "  ret void\n"
+                    "}\n"
+                    "define void @d(i8** %ptr) {\n"
+                    "entry:\n"
+                    "  store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n"
+                    "  call void @c(i8** %ptr)"
+                    "  call void @d(i8** %ptr)"
+                    "  store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n"
+                    "  ret void\n"
+                    "}\n");
+  LazyCallGraph CG(*M);
+
+  // Force the graph to be fully expanded.
+  CG.buildRefSCCs();
+  auto I = CG.postorder_ref_scc_begin();
+  LazyCallGraph::RefSCC &RC1 = *I++;
+  LazyCallGraph::RefSCC &RC2 = *I++;
+  EXPECT_EQ(CG.postorder_ref_scc_end(), I);
+
+  ASSERT_EQ(2, RC1.size());
+  LazyCallGraph::SCC &C1 = RC1[0];
+  LazyCallGraph::SCC &C2 = RC1[1];
+
+  LazyCallGraph::Node &AN = *CG.lookup(lookupFunction(*M, "a"));
+  LazyCallGraph::Node &BN = *CG.lookup(lookupFunction(*M, "b"));
+  LazyCallGraph::Node &CN = *CG.lookup(lookupFunction(*M, "c"));
+  LazyCallGraph::Node &DN = *CG.lookup(lookupFunction(*M, "d"));
+  EXPECT_EQ(&C1, CG.lookupSCC(DN));
+  EXPECT_EQ(&C1, CG.lookupSCC(CN));
+  EXPECT_EQ(&C2, CG.lookupSCC(BN));
+  EXPECT_EQ(&RC1, CG.lookupRefSCC(DN));
+  EXPECT_EQ(&RC1, CG.lookupRefSCC(CN));
+  EXPECT_EQ(&RC1, CG.lookupRefSCC(BN));
+  EXPECT_EQ(&RC2, CG.lookupRefSCC(AN));
+
+  // Now we need to build a new function 'e' with the same signature as 'd'.
+  Function &D = DN.getFunction();
+  Function &E = *Function::Create(D.getFunctionType(), D.getLinkage(), "e");
+  D.getParent()->getFunctionList().insert(D.getIterator(), &E);
+
+  // Change each use of 'd' to use 'e'. This is particularly easy as they have
+  // the same type.
+  D.replaceAllUsesWith(&E);
+
+  // Splice the body of the old function into the new one.
+  E.getBasicBlockList().splice(E.begin(), D.getBasicBlockList());
+  // And fix up the one argument.
+  D.arg_begin()->replaceAllUsesWith(&*E.arg_begin());
+  E.arg_begin()->takeName(&*D.arg_begin());
+
+  // Now replace the function in the graph.
+  RC1.replaceNodeFunction(DN, E);
+
+  EXPECT_EQ(&E, &DN.getFunction());
+  EXPECT_EQ(&DN, &(*CN)[DN].getNode());
+  EXPECT_EQ(&DN, &(*BN)[DN].getNode());
+}
+
+TEST(LazyCallGraphTest, RemoveFunctionWithSpurriousRef) {
+  LLVMContext Context;
+  // A graph with a couple of RefSCCs.
+  std::unique_ptr<Module> M =
+      parseAssembly(Context,
+                    "define void @a(i8** %ptr) {\n"
+                    "entry:\n"
+                    "  store i8* bitcast (void(i8**)* @d to i8*), i8** %ptr\n"
+                    "  ret void\n"
+                    "}\n"
+                    "define void @b(i8** %ptr) {\n"
+                    "entry:\n"
+                    "  store i8* bitcast (void(i8**)* @c to i8*), i8** %ptr\n"
+                    "  ret void\n"
+                    "}\n"
+                    "define void @c(i8** %ptr) {\n"
+                    "entry:\n"
+                    "  call void @d(i8** %ptr)"
+                    "  ret void\n"
+                    "}\n"
+                    "define void @d(i8** %ptr) {\n"
+                    "entry:\n"
+                    "  call void @c(i8** %ptr)"
+                    "  store i8* bitcast (void(i8**)* @b to i8*), i8** %ptr\n"
+                    "  ret void\n"
+                    "}\n"
+                    "define void @dead() {\n"
+                    "entry:\n"
+                    "  ret void\n"
+                    "}\n");
+  LazyCallGraph CG(*M);
+
+  // Insert spurious ref edges.
+  LazyCallGraph::Node &AN = CG.get(lookupFunction(*M, "a"));
+  LazyCallGraph::Node &BN = CG.get(lookupFunction(*M, "b"));
+  LazyCallGraph::Node &CN = CG.get(lookupFunction(*M, "c"));
+  LazyCallGraph::Node &DN = CG.get(lookupFunction(*M, "d"));
+  LazyCallGraph::Node &DeadN = CG.get(lookupFunction(*M, "dead"));
+  AN.populate();
+  BN.populate();
+  CN.populate();
+  DN.populate();
+  DeadN.populate();
+  CG.insertEdge(AN, DeadN, LazyCallGraph::Edge::Ref);
+  CG.insertEdge(BN, DeadN, LazyCallGraph::Edge::Ref);
+  CG.insertEdge(CN, DeadN, LazyCallGraph::Edge::Ref);
+  CG.insertEdge(DN, DeadN, LazyCallGraph::Edge::Ref);
+
+  // Force the graph to be fully expanded.
+  CG.buildRefSCCs();
+  auto I = CG.postorder_ref_scc_begin();
+  LazyCallGraph::RefSCC &DeadRC = *I++;
+  LazyCallGraph::RefSCC &RC1 = *I++;
+  LazyCallGraph::RefSCC &RC2 = *I++;
+  EXPECT_EQ(CG.postorder_ref_scc_end(), I);
+
+  ASSERT_EQ(2, RC1.size());
+  LazyCallGraph::SCC &C1 = RC1[0];
+  LazyCallGraph::SCC &C2 = RC1[1];
+
+  EXPECT_EQ(&DeadRC, CG.lookupRefSCC(DeadN));
+  EXPECT_EQ(&C1, CG.lookupSCC(DN));
+  EXPECT_EQ(&C1, CG.lookupSCC(CN));
+  EXPECT_EQ(&C2, CG.lookupSCC(BN));
+  EXPECT_EQ(&RC1, CG.lookupRefSCC(DN));
+  EXPECT_EQ(&RC1, CG.lookupRefSCC(CN));
+  EXPECT_EQ(&RC1, CG.lookupRefSCC(BN));
+  EXPECT_EQ(&RC2, CG.lookupRefSCC(AN));
+
+  // Now delete 'dead'. There are no uses of this function but there are
+  // spurious references.
+  CG.removeDeadFunction(DeadN.getFunction());
+
+  // The only observable change should be that the RefSCC is gone from the
+  // postorder sequence.
+  I = CG.postorder_ref_scc_begin();
+  EXPECT_EQ(&RC1, &*I++);
+  EXPECT_EQ(&RC2, &*I++);
+  EXPECT_EQ(CG.postorder_ref_scc_end(), I);
+}
 }
diff --git a/unittests/Transforms/Utils/MemorySSA.cpp b/unittests/Analysis/MemorySSA.cpp
similarity index 94%
rename from unittests/Transforms/Utils/MemorySSA.cpp
rename to unittests/Analysis/MemorySSA.cpp
index 0df476bc28c4ce7160b6d1cdc91d30dd0452f9a8..08b0e830a9b2e02e6e91d04145511d9db1f4415c 100644
--- a/unittests/Transforms/Utils/MemorySSA.cpp
+++ b/unittests/Analysis/MemorySSA.cpp
@@ -6,16 +6,16 @@
 // License. See LICENSE.TXT for details.
 //
 //===----------------------------------------------------------------------===//
-#include "llvm/Transforms/Utils/MemorySSA.h"
 #include "llvm/Analysis/AliasAnalysis.h"
 #include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/LLVMContext.h"
-#include "llvm/Transforms/Utils/MemorySSAUpdater.h"
 #include "gtest/gtest.h"
 
 using namespace llvm;
@@ -90,6 +90,7 @@ TEST_F(MemorySSATest, CreateALoad) {
 
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
+  MemorySSAUpdater Updater(&MSSA);
   // Add the load
   B.SetInsertPoint(Merge);
   LoadInst *LoadInst = B.CreateLoad(PointerArg);
@@ -99,8 +100,8 @@ TEST_F(MemorySSATest, CreateALoad) {
   EXPECT_NE(MP, nullptr);
 
   // Create the load memory acccess
-  MemoryUse *LoadAccess = cast<MemoryUse>(
-      MSSA.createMemoryAccessInBB(LoadInst, MP, Merge, MemorySSA::Beginning));
+  MemoryUse *LoadAccess = cast<MemoryUse>(Updater.createMemoryAccessInBB(
+      LoadInst, MP, Merge, MemorySSA::Beginning));
   MemoryAccess *DefiningAccess = LoadAccess->getDefiningAccess();
   EXPECT_TRUE(isa<MemoryPhi>(DefiningAccess));
   MSSA.verifyMemorySSA();
@@ -132,7 +133,7 @@ TEST_F(MemorySSATest, CreateLoadsAndStoreUpdater) {
   // Add the store
   B.SetInsertPoint(Entry, Entry->begin());
   StoreInst *EntryStore = B.CreateStore(B.getInt8(16), PointerArg);
-  MemoryAccess *EntryStoreAccess = MSSA.createMemoryAccessInBB(
+  MemoryAccess *EntryStoreAccess = Updater.createMemoryAccessInBB(
       EntryStore, nullptr, Entry, MemorySSA::Beginning);
   Updater.insertDef(cast<MemoryDef>(EntryStoreAccess));
 
@@ -145,7 +146,7 @@ TEST_F(MemorySSATest, CreateLoadsAndStoreUpdater) {
   EXPECT_EQ(MP, nullptr);
 
   // Create the load memory access
-  MemoryUse *FirstLoadAccess = cast<MemoryUse>(MSSA.createMemoryAccessInBB(
+  MemoryUse *FirstLoadAccess = cast<MemoryUse>(Updater.createMemoryAccessInBB(
       FirstLoad, nullptr, Merge, MemorySSA::Beginning));
   Updater.insertUse(FirstLoadAccess);
   // Should just have a load using the entry access, because it should discover
@@ -156,9 +157,9 @@ TEST_F(MemorySSATest, CreateLoadsAndStoreUpdater) {
   // Add the store
   B.SetInsertPoint(Left, Left->begin());
   StoreInst *LeftStore = B.CreateStore(B.getInt8(16), PointerArg);
-  MemoryAccess *LeftStoreAccess = MSSA.createMemoryAccessInBB(
+  MemoryAccess *LeftStoreAccess = Updater.createMemoryAccessInBB(
       LeftStore, nullptr, Left, MemorySSA::Beginning);
-  Updater.insertDef(cast<MemoryDef>(LeftStoreAccess));
+  Updater.insertDef(cast<MemoryDef>(LeftStoreAccess), false);
   // We don't touch existing loads, so we need to create a new one to get a phi
   // Add the second load
   B.SetInsertPoint(Merge, Merge->begin());
@@ -169,7 +170,7 @@ TEST_F(MemorySSATest, CreateLoadsAndStoreUpdater) {
   EXPECT_EQ(MP, nullptr);
 
   // Create the load memory access
-  MemoryUse *SecondLoadAccess = cast<MemoryUse>(MSSA.createMemoryAccessInBB(
+  MemoryUse *SecondLoadAccess = cast<MemoryUse>(Updater.createMemoryAccessInBB(
       SecondLoad, nullptr, Merge, MemorySSA::Beginning));
   Updater.insertUse(SecondLoadAccess);
   // Now the load should be a phi of the entry store and the left store
@@ -181,9 +182,13 @@ TEST_F(MemorySSATest, CreateLoadsAndStoreUpdater) {
   // Now create a store below the existing one in the entry
   B.SetInsertPoint(Entry, --Entry->end());
   StoreInst *SecondEntryStore = B.CreateStore(B.getInt8(16), PointerArg);
-  MemoryAccess *SecondEntryStoreAccess = MSSA.createMemoryAccessInBB(
+  MemoryAccess *SecondEntryStoreAccess = Updater.createMemoryAccessInBB(
       SecondEntryStore, nullptr, Entry, MemorySSA::End);
-  Updater.insertDef(cast<MemoryDef>(SecondEntryStoreAccess));
+  // Insert it twice just to test renaming
+  Updater.insertDef(cast<MemoryDef>(SecondEntryStoreAccess), false);
+  EXPECT_NE(FirstLoadAccess->getDefiningAccess(), MergePhi);
+  Updater.insertDef(cast<MemoryDef>(SecondEntryStoreAccess), true);
+  EXPECT_EQ(FirstLoadAccess->getDefiningAccess(), MergePhi);
   // and make sure the phi below it got updated, despite being blocks away
   MergePhi = dyn_cast<MemoryPhi>(SecondLoadAccess->getDefiningAccess());
   EXPECT_NE(MergePhi, nullptr);
@@ -219,7 +224,7 @@ TEST_F(MemorySSATest, CreateALoadUpdater) {
   // Add the store
   StoreInst *SI = B.CreateStore(B.getInt8(16), PointerArg);
   MemoryAccess *StoreAccess =
-      MSSA.createMemoryAccessInBB(SI, nullptr, Left, MemorySSA::Beginning);
+      Updater.createMemoryAccessInBB(SI, nullptr, Left, MemorySSA::Beginning);
   Updater.insertDef(cast<MemoryDef>(StoreAccess));
 
   // Add the load
@@ -231,7 +236,7 @@ TEST_F(MemorySSATest, CreateALoadUpdater) {
   EXPECT_EQ(MP, nullptr);
 
   // Create the load memory acccess
-  MemoryUse *LoadAccess = cast<MemoryUse>(MSSA.createMemoryAccessInBB(
+  MemoryUse *LoadAccess = cast<MemoryUse>(Updater.createMemoryAccessInBB(
       LoadInst, nullptr, Merge, MemorySSA::Beginning));
   Updater.insertUse(LoadAccess);
   MemoryAccess *DefiningAccess = LoadAccess->getDefiningAccess();
@@ -263,15 +268,15 @@ TEST_F(MemorySSATest, MoveAStore) {
   B.CreateLoad(PointerArg);
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
-
+  MemorySSAUpdater Updater(&MSSA);
   // Move the store
   SideStore->moveBefore(Entry->getTerminator());
   MemoryAccess *EntryStoreAccess = MSSA.getMemoryAccess(EntryStore);
   MemoryAccess *SideStoreAccess = MSSA.getMemoryAccess(SideStore);
-  MemoryAccess *NewStoreAccess = MSSA.createMemoryAccessAfter(
+  MemoryAccess *NewStoreAccess = Updater.createMemoryAccessAfter(
       SideStore, EntryStoreAccess, EntryStoreAccess);
   EntryStoreAccess->replaceAllUsesWith(NewStoreAccess);
-  MSSA.removeMemoryAccess(SideStoreAccess);
+  Updater.removeMemoryAccess(SideStoreAccess);
   MSSA.verifyMemorySSA();
 }
 
@@ -305,7 +310,7 @@ TEST_F(MemorySSATest, MoveAStoreUpdater) {
   SideStore->moveBefore(Entry->getTerminator());
   auto *EntryStoreAccess = MSSA.getMemoryAccess(EntryStore);
   auto *SideStoreAccess = MSSA.getMemoryAccess(SideStore);
-  auto *NewStoreAccess = MSSA.createMemoryAccessAfter(
+  auto *NewStoreAccess = Updater.createMemoryAccessAfter(
       SideStore, EntryStoreAccess, EntryStoreAccess);
   // Before, the load will point to a phi of the EntryStore and SideStore.
   auto *LoadAccess = cast<MemoryUse>(MSSA.getMemoryAccess(MergeLoad));
@@ -313,7 +318,7 @@ TEST_F(MemorySSATest, MoveAStoreUpdater) {
   MemoryPhi *MergePhi = cast<MemoryPhi>(LoadAccess->getDefiningAccess());
   EXPECT_EQ(MergePhi->getIncomingValue(1), EntryStoreAccess);
   EXPECT_EQ(MergePhi->getIncomingValue(0), SideStoreAccess);
-  MSSA.removeMemoryAccess(SideStoreAccess);
+  Updater.removeMemoryAccess(SideStoreAccess);
   Updater.insertDef(cast<MemoryDef>(NewStoreAccess));
   // After it's a phi of the new side store access.
   EXPECT_EQ(MergePhi->getIncomingValue(0), NewStoreAccess);
@@ -444,13 +449,15 @@ TEST_F(MemorySSATest, RemoveAPhi) {
 
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
+  MemorySSAUpdater Updater(&MSSA);
+
   // Before, the load will be a use of a phi<store, liveonentry>.
   MemoryUse *LoadAccess = cast<MemoryUse>(MSSA.getMemoryAccess(LoadInst));
   MemoryDef *StoreAccess = cast<MemoryDef>(MSSA.getMemoryAccess(StoreInst));
   MemoryAccess *DefiningAccess = LoadAccess->getDefiningAccess();
   EXPECT_TRUE(isa<MemoryPhi>(DefiningAccess));
   // Kill the store
-  MSSA.removeMemoryAccess(StoreAccess);
+  Updater.removeMemoryAccess(StoreAccess);
   MemoryPhi *MP = cast<MemoryPhi>(DefiningAccess);
   // Verify the phi ended up as liveonentry, liveonentry
   for (auto &Op : MP->incoming_values())
@@ -460,7 +467,7 @@ TEST_F(MemorySSATest, RemoveAPhi) {
   // Verify the load is now defined by liveOnEntryDef
   EXPECT_TRUE(MSSA.isLiveOnEntryDef(LoadAccess->getDefiningAccess()));
   // Remove the PHI
-  MSSA.removeMemoryAccess(MP);
+  Updater.removeMemoryAccess(MP);
   MSSA.verifyMemorySSA();
 }
 
@@ -488,6 +495,7 @@ TEST_F(MemorySSATest, RemoveMemoryAccess) {
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAWalker *Walker = Analyses->Walker;
+  MemorySSAUpdater Updater(&MSSA);
 
   // Before, the load will be a use of a phi<store, liveonentry>. It should be
   // the same after.
@@ -498,7 +506,7 @@ TEST_F(MemorySSATest, RemoveMemoryAccess) {
   // The load is currently clobbered by one of the phi arguments, so the walker
   // should determine the clobbering access as the phi.
   EXPECT_EQ(DefiningAccess, Walker->getClobberingMemoryAccess(LoadInst));
-  MSSA.removeMemoryAccess(StoreAccess);
+  Updater.removeMemoryAccess(StoreAccess);
   MSSA.verifyMemorySSA();
   // After the removeaccess, let's see if we got the right accesses
   // The load should still point to the phi ...
@@ -522,7 +530,7 @@ TEST_F(MemorySSATest, RemoveMemoryAccess) {
   }
 
   // Now we try to remove the single valued phi
-  MSSA.removeMemoryAccess(DefiningAccess);
+  Updater.removeMemoryAccess(DefiningAccess);
   MSSA.verifyMemorySSA();
   // Now the load should be a load of live on entry.
   EXPECT_TRUE(MSSA.isLiveOnEntryDef(LoadAccess->getDefiningAccess()));
@@ -676,10 +684,11 @@ TEST_F(MemorySSATest, PartialWalkerCacheWithPhis) {
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAWalker *Walker = Analyses->Walker;
+  MemorySSAUpdater Updater(&MSSA);
 
   // Kill `KillStore`; it exists solely so that the load after it won't be
   // optimized to FirstStore.
-  MSSA.removeMemoryAccess(MSSA.getMemoryAccess(KillStore));
+  Updater.removeMemoryAccess(MSSA.getMemoryAccess(KillStore));
   KillStore->eraseFromParent();
   auto *ALoadMA = cast<MemoryUse>(MSSA.getMemoryAccess(ALoad));
   EXPECT_EQ(ALoadMA->getDefiningAccess(), MSSA.getMemoryAccess(BStore));
@@ -751,15 +760,16 @@ TEST_F(MemorySSATest, WalkerReopt) {
   setupAnalyses();
   MemorySSA &MSSA = *Analyses->MSSA;
   MemorySSAWalker *Walker = Analyses->Walker;
+  MemorySSAUpdater Updater(&MSSA);
 
   MemoryAccess *LoadClobber = Walker->getClobberingMemoryAccess(LIA);
   MemoryUse *LoadAccess = cast<MemoryUse>(MSSA.getMemoryAccess(LIA));
   EXPECT_EQ(LoadClobber, MSSA.getMemoryAccess(SIA));
   EXPECT_TRUE(MSSA.isLiveOnEntryDef(Walker->getClobberingMemoryAccess(SIA)));
-  MSSA.removeMemoryAccess(LoadAccess);
+  Updater.removeMemoryAccess(LoadAccess);
 
   // Create the load memory access pointing to an unoptimized place.
-  MemoryUse *NewLoadAccess = cast<MemoryUse>(MSSA.createMemoryAccessInBB(
+  MemoryUse *NewLoadAccess = cast<MemoryUse>(Updater.createMemoryAccessInBB(
       LIA, MSSA.getMemoryAccess(SIB), LIA->getParent(), MemorySSA::End));
   // This should it cause it to be optimized
   EXPECT_EQ(Walker->getClobberingMemoryAccess(NewLoadAccess), LoadClobber);
@@ -848,7 +858,7 @@ TEST_F(MemorySSATest, Irreducible) {
   MemorySSAUpdater Updater(&MSSA);
   // Create the load memory acccess
   LoadInst *LoadInst = B.CreateLoad(FirstArg);
-  MemoryUse *LoadAccess = cast<MemoryUse>(MSSA.createMemoryAccessInBB(
+  MemoryUse *LoadAccess = cast<MemoryUse>(Updater.createMemoryAccessInBB(
       LoadInst, nullptr, AfterLoopBB, MemorySSA::Beginning));
   Updater.insertUse(LoadAccess);
   MSSA.verifyMemorySSA();
diff --git a/unittests/Analysis/ProfileSummaryInfoTest.cpp b/unittests/Analysis/ProfileSummaryInfoTest.cpp
index 56a8c2ec14db713fec58a7ea754f5dfabcfbae81..0b4b1de28053bc23278fc901d608416248beb65c 100644
--- a/unittests/Analysis/ProfileSummaryInfoTest.cpp
+++ b/unittests/Analysis/ProfileSummaryInfoTest.cpp
@@ -162,12 +162,6 @@ TEST_F(ProfileSummaryInfoTest, InstrProf) {
 
   EXPECT_TRUE(PSI.isHotCallSite(CS1, &BFI));
   EXPECT_FALSE(PSI.isHotCallSite(CS2, &BFI));
-
-  // Test that adding an MD_prof metadata with a hot count on CS2 does not
-  // change itas hotness as it has no effect in instrumented profiling.
-  MDBuilder MDB(M->getContext());
-  CI2->setMetadata(llvm::LLVMContext::MD_prof, MDB.createBranchWeights({400}));
-  EXPECT_FALSE(PSI.isHotCallSite(CS2, &BFI));
 }
 
 TEST_F(ProfileSummaryInfoTest, SampleProf) {
diff --git a/unittests/Analysis/ScalarEvolutionTest.cpp b/unittests/Analysis/ScalarEvolutionTest.cpp
index 752cc8128248e887d1349326e2fd96cf003816e0..870a27342c1ab80f2b1cc618e7eb6800b642d6a4 100644
--- a/unittests/Analysis/ScalarEvolutionTest.cpp
+++ b/unittests/Analysis/ScalarEvolutionTest.cpp
@@ -306,9 +306,11 @@ TEST_F(ScalarEvolutionsTest, ExpandPtrTypeSCEV) {
   //   %bitcast2 = bitcast i8* %select to i32*
   //   br i1 undef, label %loop, label %exit
 
+  const DataLayout &DL = F->getParent()->getDataLayout();
   BranchInst *Br = BranchInst::Create(
       LoopBB, ExitBB, UndefValue::get(Type::getInt1Ty(Context)), LoopBB);
-  AllocaInst *Alloca = new AllocaInst(I32Ty, "alloca", Br);
+  AllocaInst *Alloca = new AllocaInst(I32Ty, DL.getAllocaAddrSpace(),
+                                      "alloca", Br);
   ConstantInt *Ci32 = ConstantInt::get(Context, APInt(32, 1));
   GetElementPtrInst *Gep0 =
       GetElementPtrInst::Create(I32Ty, Alloca, Ci32, "gep0", Br);
@@ -465,7 +467,7 @@ TEST_F(ScalarEvolutionsTest, CommutativeExprOperandOrder) {
     });
 }
 
-TEST_F(ScalarEvolutionsTest, SCEVCompareComplexity) {
+TEST_F(ScalarEvolutionsTest, CompareSCEVComplexity) {
   FunctionType *FTy =
       FunctionType::get(Type::getVoidTy(Context), std::vector<Type *>(), false);
   Function *F = cast<Function>(M.getOrInsertFunction("f", FTy));
@@ -532,5 +534,73 @@ TEST_F(ScalarEvolutionsTest, SCEVCompareComplexity) {
   EXPECT_NE(nullptr, SE.getSCEV(Acc[0]));
 }
 
+TEST_F(ScalarEvolutionsTest, CompareValueComplexity) {
+  IntegerType *IntPtrTy = M.getDataLayout().getIntPtrType(Context);
+  PointerType *IntPtrPtrTy = IntPtrTy->getPointerTo();
+  // Create void f(intptr, intptr) with a single block named "entry".
+  FunctionType *FTy =
+      FunctionType::get(Type::getVoidTy(Context), {IntPtrTy, IntPtrTy}, false);
+  Function *F = cast<Function>(M.getOrInsertFunction("f", FTy));
+  BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", F);
+
+  Value *X = &*F->arg_begin(); // first argument
+  Value *Y = &*std::next(F->arg_begin()); // second argument
+  // Grow X and Y into ValueDepth-long chains of inttoptr followed by load.
+  const int ValueDepth = 10;
+  for (int i = 0; i < ValueDepth; i++) {
+    X = new LoadInst(new IntToPtrInst(X, IntPtrPtrTy, "", EntryBB), "",
+                     /*isVolatile*/ false, EntryBB);
+    Y = new LoadInst(new IntToPtrInst(Y, IntPtrPtrTy, "", EntryBB), "",
+                     /*isVolatile*/ false, EntryBB);
+  }
+  // Multiply the two chain ends in both operand orders.
+  auto *MulA = BinaryOperator::CreateMul(X, Y, "", EntryBB);
+  auto *MulB = BinaryOperator::CreateMul(Y, X, "", EntryBB);
+  ReturnInst::Create(Context, nullptr, EntryBB);
+
+  // This test does not check correctness.  Today, making A and B resolve to
+  // the same SCEV would require a deeper search in CompareValueComplexity,
+  // which would slow down compilation.  The test can therefore start failing
+  // (with LLVM's behavior still being correct) if CompareValueComplexity
+  // ever becomes smarter while staying fast.
+
+  ScalarEvolution SE = buildSE(*F);
+  auto *A = SE.getSCEV(MulA);
+  auto *B = SE.getSCEV(MulB);
+  EXPECT_NE(A, B); // distinct SCEV objects for X*Y and Y*X
+}
+
+TEST_F(ScalarEvolutionsTest, SCEVAddExpr) {
+  Type *Ty32 = Type::getInt32Ty(Context);
+  Type *ArgTys[] = {Type::getInt64Ty(Context), Ty32};
+  // Create void f(i64, i32).
+  FunctionType *FTy =
+      FunctionType::get(Type::getVoidTy(Context), ArgTys, false);
+  Function *F = cast<Function>(M.getOrInsertFunction("f", FTy));
+
+  Argument *A1 = &*F->arg_begin(); // the i64 argument
+  Argument *A2 = &*(std::next(F->arg_begin())); // the i32 argument
+  BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", F);
+  // Seed the chain: alternate mul and add over the truncated first argument.
+  Instruction *Trunc = CastInst::CreateTruncOrBitCast(A1, Ty32, "", EntryBB);
+  Instruction *Mul1 = BinaryOperator::CreateMul(Trunc, A2, "", EntryBB);
+  Instruction *Add1 = BinaryOperator::CreateAdd(Mul1, Trunc, "", EntryBB);
+  Mul1 = BinaryOperator::CreateMul(Add1, Trunc, "", EntryBB);
+  Instruction *Add2 = BinaryOperator::CreateAdd(Mul1, Add1, "", EntryBB);
+  // FIXME: The size of this is arbitrary and doesn't seem to change the
+  // result, but SCEV will do quadratic work for these so a large number here
+  // will be extremely slow. We should revisit what and how this is testing
+  // SCEV.
+  for (int i = 0; i < 10; i++) {
+    Mul1 = BinaryOperator::CreateMul(Add2, Add1, "", EntryBB);
+    Add1 = Add2; // slide the window: Add1 now names the previous sum
+    Add2 = BinaryOperator::CreateAdd(Mul1, Add1, "", EntryBB);
+  }
+
+  ReturnInst::Create(Context, nullptr, EntryBB);
+  ScalarEvolution SE = buildSE(*F);
+  EXPECT_NE(nullptr, SE.getSCEV(Mul1)); // Mul1 is the last mul built above
+}
+
 }  // end anonymous namespace
 }  // end namespace llvm
diff --git a/unittests/Analysis/ValueTrackingTest.cpp b/unittests/Analysis/ValueTrackingTest.cpp
index ba0d30d59b6622bad6a15504c187e78797ac3084..a1d3695e969119d7fc90bd527ba74fab2fb100c3 100644
--- a/unittests/Analysis/ValueTrackingTest.cpp
+++ b/unittests/Analysis/ValueTrackingTest.cpp
@@ -239,3 +239,22 @@ TEST(ValueTracking, GuaranteedToTransferExecutionToSuccessor) {
     Index++;
   }
 }
+
+TEST(ValueTracking, ComputeNumSignBits_PR32045) {
+  StringRef Assembly = "define i32 @f(i32 %a) { "
+                       "  %val = ashr i32 %a, -1 " // NOTE(review): presumably the PR32045 trigger
+                       "  ret i32 %val "
+                       "} ";
+  // Parse the IR; the asserts below only guard the test setup itself.
+  LLVMContext Context;
+  SMDiagnostic Error;
+  auto M = parseAssemblyString(Assembly, Error, Context);
+  assert(M && "Bad assembly?");
+
+  auto *F = M->getFunction("f");
+  assert(F && "Bad assembly?");
+  // Take operand 0 of the entry block's ret (%val) and expect 1 sign bit.
+  auto *RVal =
+      cast<ReturnInst>(F->getEntryBlock().getTerminator())->getOperand(0);
+  EXPECT_EQ(ComputeNumSignBits(RVal, M->getDataLayout()), 1u);
+}
diff --git a/unittests/CMakeLists.txt b/unittests/CMakeLists.txt
index 8dbca211d0268dec2141cd2e3f49f9928904a9d1..8e40f141463bac37063c63254db8d2b661dbbb25 100644
--- a/unittests/CMakeLists.txt
+++ b/unittests/CMakeLists.txt
@@ -24,3 +24,4 @@ add_subdirectory(ProfileData)
 add_subdirectory(Support)
 add_subdirectory(Target)
 add_subdirectory(Transforms)
+add_subdirectory(XRay)
diff --git a/unittests/CodeGen/LowLevelTypeTest.cpp b/unittests/CodeGen/LowLevelTypeTest.cpp
index 4ea181c1c9d3cd4328d8d80ffc50374cd86db484..67113005a46a1a6ffcee459f0db752b0d493d855 100644
--- a/unittests/CodeGen/LowLevelTypeTest.cpp
+++ b/unittests/CodeGen/LowLevelTypeTest.cpp
@@ -68,7 +68,7 @@ TEST(LowLevelTypeTest, Scalar) {
 
     // Test Type->LLT conversion.
     Type *IRTy = IntegerType::get(C, S);
-    EXPECT_EQ(Ty, LLT(*IRTy, DL));
+    EXPECT_EQ(Ty, getLLTForType(*IRTy, DL));
   }
 }
 
@@ -160,7 +160,7 @@ TEST(LowLevelTypeTest, Vector) {
       // Test Type->LLT conversion.
       Type *IRSTy = IntegerType::get(C, S);
       Type *IRTy = VectorType::get(IRSTy, Elts);
-      EXPECT_EQ(VTy, LLT(*IRTy, DL));
+      EXPECT_EQ(VTy, getLLTForType(*IRTy, DL));
     }
   }
 }
@@ -188,7 +188,7 @@ TEST(LowLevelTypeTest, Pointer) {
 
     // Test Type->LLT conversion.
     Type *IRTy = PointerType::get(IntegerType::get(C, 8), AS);
-    EXPECT_EQ(Ty, LLT(*IRTy, DL));
+    EXPECT_EQ(Ty, getLLTForType(*IRTy, DL));
   }
 }
 
diff --git a/unittests/CodeGen/MachineInstrBundleIteratorTest.cpp b/unittests/CodeGen/MachineInstrBundleIteratorTest.cpp
index 416f5774f4c68cee0723b083a1a1b52ce0977e74..8f15fbf3941dd9fa6af38ba62a59c1bee8cb63d6 100644
--- a/unittests/CodeGen/MachineInstrBundleIteratorTest.cpp
+++ b/unittests/CodeGen/MachineInstrBundleIteratorTest.cpp
@@ -130,4 +130,68 @@ TEST(MachineInstrBundleIteratorTest, CompareToBundledMI) {
   ASSERT_TRUE(CI != CMBI.getIterator());
 }
 
+struct MyUnbundledInstr // minimal list node that always reports "not bundled"
+    : ilist_node<MyUnbundledInstr, ilist_sentinel_tracking<true>> {
+  bool isBundledWithPred() const { return false; } // never bundled
+  bool isBundledWithSucc() const { return false; } // never bundled
+};
+typedef MachineInstrBundleIterator<MyUnbundledInstr> unbundled_iterator;
+typedef MachineInstrBundleIterator<const MyUnbundledInstr>
+    const_unbundled_iterator;
+typedef MachineInstrBundleIterator<MyUnbundledInstr, true>
+    reverse_unbundled_iterator;
+typedef MachineInstrBundleIterator<const MyUnbundledInstr, true>
+    const_reverse_unbundled_iterator; // NOTE(review): typedefs unused below?
+
+TEST(MachineInstrBundleIteratorTest, ReverseConstructor) {
+  simple_ilist<MyUnbundledInstr, ilist_sentinel_tracking<true>> L;
+  const auto &CL = L; // const view of the same list
+  MyUnbundledInstr A, B;
+  L.insert(L.end(), A); // L: A
+  L.insert(L.end(), B); // L: A, B
+
+  // Save typing.
+  typedef MachineInstrBundleIterator<MyUnbundledInstr> iterator;
+  typedef MachineInstrBundleIterator<MyUnbundledInstr, true> reverse_iterator;
+  typedef MachineInstrBundleIterator<const MyUnbundledInstr> const_iterator;
+  typedef MachineInstrBundleIterator<const MyUnbundledInstr, true>
+      const_reverse_iterator;
+
+  // Convert to bundle iterators.
+  auto begin = [&]() -> iterator { return L.begin(); };
+  auto end = [&]() -> iterator { return L.end(); };
+  auto rbegin = [&]() -> reverse_iterator { return L.rbegin(); };
+  auto rend = [&]() -> reverse_iterator { return L.rend(); };
+  auto cbegin = [&]() -> const_iterator { return CL.begin(); };
+  auto cend = [&]() -> const_iterator { return CL.end(); };
+  auto crbegin = [&]() -> const_reverse_iterator { return CL.rbegin(); };
+  auto crend = [&]() -> const_reverse_iterator { return CL.rend(); };
+  // Check conversion values. As the expectations below spell out, converting
+  // lands one position over, in the style of std::reverse_iterator::base().
+  EXPECT_EQ(begin(), iterator(rend()));
+  EXPECT_EQ(++begin(), iterator(++rbegin()));
+  EXPECT_EQ(end(), iterator(rbegin()));
+  EXPECT_EQ(rbegin(), reverse_iterator(end()));
+  EXPECT_EQ(++rbegin(), reverse_iterator(++begin()));
+  EXPECT_EQ(rend(), reverse_iterator(begin()));
+
+  // Check const iterator constructors.
+  EXPECT_EQ(cbegin(), const_iterator(rend()));
+  EXPECT_EQ(cbegin(), const_iterator(crend()));
+  EXPECT_EQ(crbegin(), const_reverse_iterator(end()));
+  EXPECT_EQ(crbegin(), const_reverse_iterator(cend()));
+
+  // Confirm lack of implicit conversions.
+  static_assert(!std::is_convertible<iterator, reverse_iterator>::value,
+                "unexpected implicit conversion");
+  static_assert(!std::is_convertible<reverse_iterator, iterator>::value,
+                "unexpected implicit conversion");
+  static_assert(
+      !std::is_convertible<const_iterator, const_reverse_iterator>::value,
+      "unexpected implicit conversion");
+  static_assert(
+      !std::is_convertible<const_reverse_iterator, const_iterator>::value,
+      "unexpected implicit conversion");
+}
+
 } // end namespace
diff --git a/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp b/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp
index 1adc0aafdc24d1161541f1693c48636250dc414f..a6c5b3a34ccb761238cda09930a94ee1019cefc5 100644
--- a/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp
+++ b/unittests/DebugInfo/DWARF/DWARFDebugInfoTest.cpp
@@ -8,18 +8,27 @@
 //===----------------------------------------------------------------------===//
 
 #include "DwarfGenerator.h"
-#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/DebugInfo/DWARF/DWARFCompileUnit.h"
 #include "llvm/DebugInfo/DWARF/DWARFContext.h"
 #include "llvm/DebugInfo/DWARF/DWARFDie.h"
 #include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
-#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/Object/ObjectFile.h"
 #include "llvm/ObjectYAML/DWARFYAML.h"
 #include "llvm/ObjectYAML/DWARFEmitter.h"
 #include "llvm/Support/Dwarf.h"
-#include "llvm/Support/Host.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/TargetSelect.h"
 #include "gtest/gtest.h"
 #include <climits>
+#include <cstdint>
+#include <cstring>
+#include <string>
 
 using namespace llvm;
 using namespace dwarf;
@@ -54,7 +63,7 @@ Triple getHostTripleForAddrSize(uint8_t AddrSize) {
 template <typename T>
 static bool HandleExpectedError(T &Expected) {
   std::string ErrorMsg;
-  handleAllErrors(Expected.takeError(), [&](const llvm::ErrorInfoBase &EI) {
+  handleAllErrors(Expected.takeError(), [&](const ErrorInfoBase &EI) {
     ErrorMsg = EI.message();
   });
   if (!ErrorMsg.empty()) {
@@ -853,8 +862,7 @@ template <uint16_t Version, class AddrType> void TestAddresses() {
   OptU64 = SubprogramDieNoPC.getHighPC(ActualLowPC);
   EXPECT_FALSE((bool)OptU64);
   EXPECT_FALSE(SubprogramDieNoPC.getLowAndHighPC(LowPC, HighPC));
-  
-  
+ 
   // Verify the that our subprogram with only a low PC value succeeds when
   // we ask for the Low PC, but fails appropriately when asked for the high PC
   // or both low and high PC values.
@@ -872,7 +880,6 @@ template <uint16_t Version, class AddrType> void TestAddresses() {
   EXPECT_FALSE((bool)OptU64);
   EXPECT_FALSE(SubprogramDieLowPC.getLowAndHighPC(LowPC, HighPC));
 
-  
   // Verify the that our subprogram with only a low PC value succeeds when
   // we ask for the Low PC, but fails appropriately when asked for the high PC
   // or both low and high PC values.
@@ -1075,7 +1082,6 @@ TEST(DWARFDebugInfo, TestRelations) {
 }
 
 TEST(DWARFDebugInfo, TestDWARFDie) {
-
   // Make sure a default constructed DWARFDie doesn't have any parent, sibling
   // or child;
   DWARFDie DefaultDie;
@@ -1166,7 +1172,8 @@ TEST(DWARFDebugInfo, TestEmptyChildren) {
                          "    Children:        DW_CHILDREN_yes\n"
                          "    Attributes:\n"
                          "debug_info:\n"
-                         "  - Length:          9\n"
+                         "  - Length:\n"
+                         "      TotalLength:          9\n"
                          "    Version:         4\n"
                          "    AbbrOffset:      0\n"
                          "    AddrSize:        8\n"
@@ -1272,20 +1279,21 @@ TEST(DWARFDebugInfo, TestFindRecurse) {
   dwarfgen::Generator *DG = ExpectedDG.get().get();
   dwarfgen::CompileUnit &CU = DG->addCompileUnit();
   
-  StringRef SpecDieName("spec");
-  StringRef AbsDieName("abs");
+  StringRef SpecDieName = "spec";
+  StringRef SpecLinkageName = "spec_linkage";
+  StringRef AbsDieName = "abs";
   // Scope to allow us to re-use the same DIE names
   {
-    // Create a compile unit DIE that has an abbreviation that says it has
-    // children, but doesn't have any actual attributes. This helps us test
-    // a DIE that has only one child: a NULL DIE.
     auto CUDie = CU.getUnitDIE();
     auto FuncSpecDie = CUDie.addChild(DW_TAG_subprogram);
+    auto FuncAbsDie = CUDie.addChild(DW_TAG_subprogram);
     auto FuncDie = CUDie.addChild(DW_TAG_subprogram);
     auto VarAbsDie = CUDie.addChild(DW_TAG_variable);
     auto VarDie = CUDie.addChild(DW_TAG_variable);
     FuncSpecDie.addAttribute(DW_AT_name, DW_FORM_strp, SpecDieName);
-    FuncDie.addAttribute(DW_AT_specification, DW_FORM_ref4, FuncSpecDie);
+    FuncAbsDie.addAttribute(DW_AT_linkage_name, DW_FORM_strp, SpecLinkageName);
+    FuncAbsDie.addAttribute(DW_AT_specification, DW_FORM_ref4, FuncSpecDie);
+    FuncDie.addAttribute(DW_AT_abstract_origin, DW_FORM_ref4, FuncAbsDie);
     VarAbsDie.addAttribute(DW_AT_name, DW_FORM_strp, AbsDieName);
     VarDie.addAttribute(DW_AT_abstract_origin, DW_FORM_ref4, VarAbsDie);
   }
@@ -1305,41 +1313,43 @@ TEST(DWARFDebugInfo, TestFindRecurse) {
   EXPECT_TRUE(CUDie.isValid());
   
   auto FuncSpecDie = CUDie.getFirstChild();
-  auto FuncDie = FuncSpecDie.getSibling();
+  auto FuncAbsDie = FuncSpecDie.getSibling();
+  auto FuncDie = FuncAbsDie.getSibling();
   auto VarAbsDie = FuncDie.getSibling();
   auto VarDie = VarAbsDie.getSibling();
 
   // Make sure we can't extract the name from the specification die when using
   // DWARFDie::find() since it won't check the DW_AT_specification DIE.
-  EXPECT_FALSE(FuncDie.find(DW_AT_name).hasValue());
+  EXPECT_FALSE(FuncDie.find(DW_AT_name));
 
   // Make sure we can extract the name from the specification die when using
   // DWARFDie::findRecursively() since it should recurse through the
   // DW_AT_specification DIE.
   auto NameOpt = FuncDie.findRecursively(DW_AT_name);
-  EXPECT_TRUE(NameOpt.hasValue());
+  EXPECT_TRUE(NameOpt);
   // Test the dwarf::toString() helper function.
   auto StringOpt = toString(NameOpt);
-  EXPECT_TRUE(StringOpt.hasValue());
+  EXPECT_TRUE(StringOpt);
   EXPECT_EQ(SpecDieName, StringOpt.getValueOr(nullptr));
   // Test the dwarf::toString() helper function with a default value specified.
   EXPECT_EQ(SpecDieName, toString(NameOpt, nullptr));
+
+  auto LinkageNameOpt = FuncDie.findRecursively(DW_AT_linkage_name);
+  EXPECT_EQ(SpecLinkageName, toString(LinkageNameOpt).getValueOr(nullptr));
   
   // Make sure we can't extract the name from the abstract origin die when using
   // DWARFDie::find() since it won't check the DW_AT_abstract_origin DIE.
-  EXPECT_FALSE(VarDie.find(DW_AT_name).hasValue());
+  EXPECT_FALSE(VarDie.find(DW_AT_name));
   
   // Make sure we can extract the name from the abstract origin die when using
   // DWARFDie::findRecursively() since it should recurse through the
   // DW_AT_abstract_origin DIE.
   NameOpt = VarDie.findRecursively(DW_AT_name);
-  EXPECT_TRUE(NameOpt.hasValue());
+  EXPECT_TRUE(NameOpt);
   // Test the dwarf::toString() helper function.
   StringOpt = toString(NameOpt);
-  EXPECT_TRUE(StringOpt.hasValue());
+  EXPECT_TRUE(StringOpt);
   EXPECT_EQ(AbsDieName, StringOpt.getValueOr(nullptr));
-  // Test the dwarf::toString() helper function with a default value specified.
-  EXPECT_EQ(AbsDieName, toString(NameOpt, nullptr));
 }
 
 TEST(DWARFDebugInfo, TestDwarfToFunctions) {
@@ -1365,7 +1375,6 @@ TEST(DWARFDebugInfo, TestDwarfToFunctions) {
   EXPECT_EQ(InvalidU64, toSectionOffset(FormValOpt, InvalidU64));
   EXPECT_EQ(InvalidS64, toSigned(FormValOpt, InvalidS64));
 
-  
   // Test successful and unsuccessful address decoding.
   uint64_t Address = 0x100000000ULL;
   FormVal.setForm(DW_FORM_addr);
@@ -1489,9 +1498,6 @@ TEST(DWARFDebugInfo, TestFindAttrs) {
   StringRef DieMangled("_Z3fooi");
   // Scope to allow us to re-use the same DIE names
   {
-    // Create a compile unit DIE that has an abbreviation that says it has
-    // children, but doesn't have any actual attributes. This helps us test
-    // a DIE that has only one child: a NULL DIE.
     auto CUDie = CU.getUnitDIE();
     auto FuncSpecDie = CUDie.addChild(DW_TAG_subprogram);
     auto FuncDie = CUDie.addChild(DW_TAG_subprogram);
@@ -1536,7 +1542,123 @@ TEST(DWARFDebugInfo, TestFindAttrs) {
   auto NameOpt = FuncDie.findRecursively(Attrs);
   EXPECT_TRUE(NameOpt.hasValue());
   EXPECT_EQ(DieMangled, toString(NameOpt, ""));
-  
+}
+
+TEST(DWARFDebugInfo, TestImplicitConstAbbrevs) {
+  uint16_t Version = 5; // unit version handed to the generator
+
+  const uint8_t AddrSize = sizeof(void *);
+  initLLVMIfNeeded();
+  Triple Triple = getHostTripleForAddrSize(AddrSize);
+  auto ExpectedDG = dwarfgen::Generator::create(Triple, Version);
+  if (HandleExpectedError(ExpectedDG))
+    return;
+  dwarfgen::Generator *DG = ExpectedDG.get().get();
+  dwarfgen::CompileUnit &CU = DG->addCompileUnit();
+  dwarfgen::DIE CUDie = CU.getUnitDIE();
+  const dwarf::Attribute Attr = DW_AT_lo_user;
+  const int64_t Val1 = 42;
+  const int64_t Val2 = 43;
+  // Three class_type children: two share Val1, the third carries Val2.
+  auto FirstVal1DIE = CUDie.addChild(DW_TAG_class_type);
+  FirstVal1DIE.addAttribute(Attr, DW_FORM_implicit_const, Val1);
+
+  auto SecondVal1DIE = CUDie.addChild(DW_TAG_class_type);
+  SecondVal1DIE.addAttribute(Attr, DW_FORM_implicit_const, Val1);
+
+  auto Val2DIE = CUDie.addChild(DW_TAG_class_type);
+  Val2DIE.addAttribute(Attr, DW_FORM_implicit_const, Val2);
+
+  MemoryBufferRef FileBuffer(DG->generate(), "dwarf");
+  auto Obj = object::ObjectFile::createObjectFile(FileBuffer);
+  EXPECT_TRUE((bool)Obj);
+  DWARFContextInMemory DwarfContext(*Obj.get());
+  DWARFCompileUnit *U = DwarfContext.getCompileUnitAtIndex(0);
+  ASSERT_TRUE((bool)U); // fatal: U is dereferenced below
+
+  const auto *Abbrevs = U->getAbbreviations();
+  ASSERT_TRUE((bool)Abbrevs); // fatal: Abbrevs is dereferenced below
+
+  // Find the implicit_const abbreviations and verify that there are exactly
+  // two of them (one per distinct value) and that each of them can be
+  // dumped correctly.
+  typedef decltype(Abbrevs->begin()) AbbrevIt;
+  AbbrevIt Val1Abbrev = Abbrevs->end();
+  AbbrevIt Val2Abbrev = Abbrevs->end();
+  for (auto it = Abbrevs->begin(); it != Abbrevs->end(); ++it) {
+    if (it->getNumAttributes() == 0)
+      continue; // root abbrev for DW_TAG_compile_unit
+
+    auto A = it->getAttrByIndex(0);
+    EXPECT_EQ(A, Attr);
+
+    auto FormValue = it->getAttributeValue(/* offset */ 0, A, *U);
+    ASSERT_TRUE((bool)FormValue); // fatal: dereferenced on the next line
+    EXPECT_EQ(FormValue->getForm(), dwarf::DW_FORM_implicit_const);
+
+    const auto V = FormValue->getAsSignedConstant();
+    ASSERT_TRUE((bool)V); // fatal: *V is read by the lambda and the switch
+    // Dump the abbrev and check the number printed after the form name is *V.
+    auto VerifyAbbrevDump = [&V](AbbrevIt it) {
+      std::string S;
+      llvm::raw_string_ostream OS(S);
+      it->dump(OS);
+      auto FormPos = OS.str().find("DW_FORM_implicit_const");
+      EXPECT_NE(FormPos, std::string::npos);
+      auto ValPos = S.find_first_of("-0123456789", FormPos);
+      ASSERT_NE(ValPos, std::string::npos); // fatal: substr(npos) is invalid
+      int64_t Val = std::atoll(S.substr(ValPos).c_str());
+      EXPECT_EQ(Val, *V);
+    };
+
+    switch (*V) {
+    case Val1:
+      EXPECT_EQ(Val1Abbrev, Abbrevs->end());
+      Val1Abbrev = it;
+      VerifyAbbrevDump(it);
+      break;
+    case Val2:
+      EXPECT_EQ(Val2Abbrev, Abbrevs->end());
+      Val2Abbrev = it;
+      VerifyAbbrevDump(it);
+      break;
+    default:
+      FAIL() << "Unexpected attribute value: " << *V;
+    }
+  }
+
+  // Now let's make sure that two Val1-DIEs refer to the same abbrev,
+  // and Val2-DIE refers to another one.
+  auto DieDG = U->getUnitDIE(false);
+  auto it = DieDG.begin();
+  std::multimap<int64_t, decltype(it->getAbbreviationDeclarationPtr())> DIEs;
+  const DWARFAbbreviationDeclaration *AbbrevPtrVal1 = nullptr;
+  const DWARFAbbreviationDeclaration *AbbrevPtrVal2 = nullptr;
+  for (; it != DieDG.end(); ++it) {
+    const auto *AbbrevPtr = it->getAbbreviationDeclarationPtr();
+    EXPECT_TRUE((bool)AbbrevPtr);
+    auto FormValue = it->find(Attr);
+    ASSERT_TRUE((bool)FormValue); // fatal: dereferenced on the next line
+    const auto V = FormValue->getAsSignedConstant();
+    ASSERT_TRUE((bool)V); // fatal: *V is read by the switch below
+    switch (*V) {
+    case Val1:
+      AbbrevPtrVal1 = AbbrevPtr;
+      break;
+    case Val2:
+      AbbrevPtrVal2 = AbbrevPtr;
+      break;
+    default:
+      FAIL() << "Unexpected attribute value: " << *V;
+    }
+    DIEs.insert(std::make_pair(*V, AbbrevPtr));
+  }
+  EXPECT_EQ(DIEs.count(Val1), 2u);
+  ASSERT_EQ(DIEs.count(Val2), 1u); // fatal: find(Val2) is dereferenced below
+  auto Val1Range = DIEs.equal_range(Val1);
+  for (auto it = Val1Range.first; it != Val1Range.second; ++it)
+    EXPECT_EQ(it->second, AbbrevPtrVal1);
+  EXPECT_EQ(DIEs.find(Val2)->second, AbbrevPtrVal2);
 }
 
 } // end anonymous namespace
diff --git a/unittests/DebugInfo/DWARF/DwarfGenerator.cpp b/unittests/DebugInfo/DWARF/DwarfGenerator.cpp
index 9ec43cab4dc093b53754918fe0e757529b6b8cb4..ac63bbaf0a11ba444321f8fe2f35d6abd92b0947 100644
--- a/unittests/DebugInfo/DWARF/DwarfGenerator.cpp
+++ b/unittests/DebugInfo/DWARF/DwarfGenerator.cpp
@@ -108,10 +108,6 @@ dwarfgen::DIE dwarfgen::CompileUnit::getUnitDIE() {
   return dwarfgen::DIE(this, &DU.getUnitDie());
 }
 
-void dwarfgen::DIE::setForceChildren() {
-  Die->setForceChildren(true);
-}
-
 //===----------------------------------------------------------------------===//
 /// dwarfgen::Generator implementation.
 //===----------------------------------------------------------------------===//
@@ -240,8 +236,14 @@ StringRef dwarfgen::Generator::generate() {
     assert(Length != -1U);
     Asm->EmitInt32(Length);
     Asm->EmitInt16(Version);
-    Asm->EmitInt32(0);
-    Asm->EmitInt8(CU->getAddressSize());
+    if (Version <= 4) {
+      Asm->EmitInt32(0);
+      Asm->EmitInt8(CU->getAddressSize());
+    } else {
+      Asm->EmitInt8(dwarf::DW_UT_compile);
+      Asm->EmitInt8(CU->getAddressSize());
+      Asm->EmitInt32(0);
+    }
     Asm->emitDwarfDIE(*CU->getUnitDIE().Die);
   }
 
diff --git a/unittests/DebugInfo/DWARF/DwarfGenerator.h b/unittests/DebugInfo/DWARF/DwarfGenerator.h
index 2978d1ca0021e5ede6ce3d054df59114ded6e9b1..966725b4fa4e77d271c20ca11a5bf1bd3e9a918a 100644
--- a/unittests/DebugInfo/DWARF/DwarfGenerator.h
+++ b/unittests/DebugInfo/DWARF/DwarfGenerator.h
@@ -129,9 +129,6 @@ public:
   /// \returns the newly created DIE object that is now a child owned by this
   /// object.
   dwarfgen::DIE addChild(dwarf::Tag Tag);
-  
-  /// Force a DIE to say it has children even when it doesn't.
-  void setForceChildren();
 };
 
 /// A DWARF compile unit used to generate DWARF compile/type units.
diff --git a/unittests/DebugInfo/PDB/CMakeLists.txt b/unittests/DebugInfo/PDB/CMakeLists.txt
index 3ba3196ed1c7f72f1f0957e321c4ce41df212360..cbbbd81774837dd77bfd7869735dc962d19a8e92 100644
--- a/unittests/DebugInfo/PDB/CMakeLists.txt
+++ b/unittests/DebugInfo/PDB/CMakeLists.txt
@@ -10,6 +10,7 @@ set(DebugInfoPDBSources
   StringTableBuilderTest.cpp
   MSFBuilderTest.cpp
   PDBApiTest.cpp
+  TypeServerHandlerTest.cpp
   )
 
 add_llvm_unittest(DebugInfoPDBTests
diff --git a/unittests/DebugInfo/PDB/HashTableTest.cpp b/unittests/DebugInfo/PDB/HashTableTest.cpp
index 3f5875534b7b5db3b5d12b5f8032323accc96a93..94c9ee86c4a63c08a2f203870c001ea4c16df85e 100644
--- a/unittests/DebugInfo/PDB/HashTableTest.cpp
+++ b/unittests/DebugInfo/PDB/HashTableTest.cpp
@@ -10,15 +10,16 @@
 #include "ErrorChecking.h"
 #include "gtest/gtest.h"
 
-#include "llvm/DebugInfo/MSF/ByteStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
 #include "llvm/DebugInfo/PDB/Native/HashTable.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
 
 #include <vector>
 
 using namespace llvm;
 using namespace llvm::pdb;
+using namespace llvm::support;
 
 namespace {
 class HashTableInternals : public HashTable {
@@ -147,14 +148,14 @@ TEST(HashTableTest, Serialization) {
   }
 
   std::vector<uint8_t> Buffer(Table.calculateSerializedLength());
-  msf::MutableByteStream Stream(Buffer);
-  msf::StreamWriter Writer(Stream);
+  MutableBinaryByteStream Stream(Buffer, little);
+  BinaryStreamWriter Writer(Stream);
   EXPECT_NO_ERROR(Table.commit(Writer));
   // We should have written precisely the number of bytes we calculated earlier.
   EXPECT_EQ(Buffer.size(), Writer.getOffset());
 
   HashTableInternals Table2;
-  msf::StreamReader Reader(Stream);
+  BinaryStreamReader Reader(Stream);
   EXPECT_NO_ERROR(Table2.load(Reader));
   // We should have read precisely the number of bytes we calculated earlier.
   EXPECT_EQ(Buffer.size(), Reader.getOffset());
diff --git a/unittests/DebugInfo/PDB/MappedBlockStreamTest.cpp b/unittests/DebugInfo/PDB/MappedBlockStreamTest.cpp
index 07591ca69d308d0ea5f543638137704f37c5b693..9f8940b77f28d769ad84191b25cb3c6865678c80 100644
--- a/unittests/DebugInfo/PDB/MappedBlockStreamTest.cpp
+++ b/unittests/DebugInfo/PDB/MappedBlockStreamTest.cpp
@@ -9,26 +9,28 @@
 
 #include "ErrorChecking.h"
 
-#include "llvm/DebugInfo/MSF/ByteStream.h"
 #include "llvm/DebugInfo/MSF/IMSFFile.h"
-#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
+#include "llvm/DebugInfo/MSF/MSFError.h"
 #include "llvm/DebugInfo/MSF/MSFStreamLayout.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamRef.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
+#include "llvm/DebugInfo/MSF/MappedBlockStream.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamRef.h"
+#include "llvm/Support/BinaryStreamWriter.h"
 #include "gtest/gtest.h"
 
 #include <unordered_map>
 
 using namespace llvm;
 using namespace llvm::msf;
+using namespace llvm::support;
 
 namespace {
 
 static const uint32_t BlocksAry[] = {0, 1, 2, 5, 4, 3, 6, 7, 8, 9};
 static uint8_t DataAry[] = {'A', 'B', 'C', 'F', 'E', 'D', 'G', 'H', 'I', 'J'};
 
-class DiscontiguousStream : public WritableStream {
+class DiscontiguousStream : public WritableBinaryStream {
 public:
   DiscontiguousStream(ArrayRef<uint32_t> Blocks, MutableArrayRef<uint8_t> Data)
       : Blocks(Blocks.begin(), Blocks.end()), Data(Data.begin(), Data.end()) {}
@@ -36,31 +38,33 @@ public:
   uint32_t block_size() const { return 1; }
   uint32_t block_count() const { return Blocks.size(); }
 
+  endianness getEndian() const override { return little; }
+
   Error readBytes(uint32_t Offset, uint32_t Size,
-                  ArrayRef<uint8_t> &Buffer) const override {
-    if (Offset + Size > Data.size())
-      return make_error<MSFError>(msf_error_code::insufficient_buffer);
+                  ArrayRef<uint8_t> &Buffer) override {
+    if (auto EC = checkOffset(Offset, Size))
+      return EC;
     Buffer = Data.slice(Offset, Size);
     return Error::success();
   }
 
   Error readLongestContiguousChunk(uint32_t Offset,
-                                   ArrayRef<uint8_t> &Buffer) const override {
-    if (Offset >= Data.size())
-      return make_error<MSFError>(msf_error_code::insufficient_buffer);
+                                   ArrayRef<uint8_t> &Buffer) override {
+    if (auto EC = checkOffset(Offset, 1))
+      return EC;
     Buffer = Data.drop_front(Offset);
     return Error::success();
   }
 
-  uint32_t getLength() const override { return Data.size(); }
+  uint32_t getLength() override { return Data.size(); }
 
-  Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> SrcData) const override {
-    if (Offset + SrcData.size() > Data.size())
-      return make_error<MSFError>(msf_error_code::insufficient_buffer);
+  Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> SrcData) override {
+    if (auto EC = checkOffset(Offset, SrcData.size()))
+      return EC;
     ::memcpy(&Data[Offset], SrcData.data(), SrcData.size());
     return Error::success();
   }
-  Error commit() const override { return Error::success(); }
+  Error commit() override { return Error::success(); }
 
   MSFStreamLayout layout() const {
     return MSFStreamLayout{static_cast<uint32_t>(Data.size()), Blocks};
@@ -78,8 +82,8 @@ TEST(MappedBlockStreamTest, ReadBeyondEndOfStreamRef) {
   auto S = MappedBlockStream::createStream(F.block_size(), F.block_count(),
                                            F.layout(), F);
 
-  StreamReader R(*S);
-  ReadableStreamRef SR;
+  BinaryStreamReader R(*S);
+  BinaryStreamRef SR;
   EXPECT_NO_ERROR(R.readStreamRef(SR, 0U));
   ArrayRef<uint8_t> Buffer;
   EXPECT_ERROR(SR.readBytes(0U, 1U, Buffer));
@@ -94,7 +98,7 @@ TEST(MappedBlockStreamTest, ReadOntoNonEmptyBuffer) {
   auto S = MappedBlockStream::createStream(F.block_size(), F.block_count(),
                                            F.layout(), F);
 
-  StreamReader R(*S);
+  BinaryStreamReader R(*S);
   StringRef Str = "ZYXWVUTSRQPONMLKJIHGFEDCBA";
   EXPECT_NO_ERROR(R.readFixedString(Str, 1));
   EXPECT_EQ(Str, StringRef("A"));
@@ -108,7 +112,7 @@ TEST(MappedBlockStreamTest, ZeroCopyReadContiguousBreak) {
   DiscontiguousStream F(BlocksAry, DataAry);
   auto S = MappedBlockStream::createStream(F.block_size(), F.block_count(),
                                            F.layout(), F);
-  StreamReader R(*S);
+  BinaryStreamReader R(*S);
   StringRef Str;
   EXPECT_NO_ERROR(R.readFixedString(Str, 2));
   EXPECT_EQ(Str, StringRef("AB"));
@@ -127,7 +131,7 @@ TEST(MappedBlockStreamTest, CopyReadNonContiguousBreak) {
   DiscontiguousStream F(BlocksAry, DataAry);
   auto S = MappedBlockStream::createStream(F.block_size(), F.block_count(),
                                            F.layout(), F);
-  StreamReader R(*S);
+  BinaryStreamReader R(*S);
   StringRef Str;
   EXPECT_NO_ERROR(R.readFixedString(Str, 10));
   EXPECT_EQ(Str, StringRef("ABCDEFGHIJ"));
@@ -140,7 +144,7 @@ TEST(MappedBlockStreamTest, InvalidReadSizeNoBreak) {
   DiscontiguousStream F(BlocksAry, DataAry);
   auto S = MappedBlockStream::createStream(F.block_size(), F.block_count(),
                                            F.layout(), F);
-  StreamReader R(*S);
+  BinaryStreamReader R(*S);
   StringRef Str;
 
   R.setOffset(10);
@@ -154,7 +158,7 @@ TEST(MappedBlockStreamTest, InvalidReadSizeContiguousBreak) {
   DiscontiguousStream F(BlocksAry, DataAry);
   auto S = MappedBlockStream::createStream(F.block_size(), F.block_count(),
                                            F.layout(), F);
-  StreamReader R(*S);
+  BinaryStreamReader R(*S);
   StringRef Str;
 
   R.setOffset(6);
@@ -168,7 +172,7 @@ TEST(MappedBlockStreamTest, InvalidReadSizeNonContiguousBreak) {
   DiscontiguousStream F(BlocksAry, DataAry);
   auto S = MappedBlockStream::createStream(F.block_size(), F.block_count(),
                                            F.layout(), F);
-  StreamReader R(*S);
+  BinaryStreamReader R(*S);
   StringRef Str;
 
   EXPECT_ERROR(R.readFixedString(Str, 11));
@@ -181,7 +185,7 @@ TEST(MappedBlockStreamTest, ZeroCopyReadNoBreak) {
   DiscontiguousStream F(BlocksAry, DataAry);
   auto S = MappedBlockStream::createStream(F.block_size(), F.block_count(),
                                            F.layout(), F);
-  StreamReader R(*S);
+  BinaryStreamReader R(*S);
   StringRef Str;
   EXPECT_NO_ERROR(R.readFixedString(Str, 1));
   EXPECT_EQ(Str, StringRef("A"));
@@ -195,7 +199,7 @@ TEST(MappedBlockStreamTest, UnalignedOverlappingRead) {
   DiscontiguousStream F(BlocksAry, DataAry);
   auto S = MappedBlockStream::createStream(F.block_size(), F.block_count(),
                                            F.layout(), F);
-  StreamReader R(*S);
+  BinaryStreamReader R(*S);
   StringRef Str1;
   StringRef Str2;
   EXPECT_NO_ERROR(R.readFixedString(Str1, 7));
@@ -216,7 +220,7 @@ TEST(MappedBlockStreamTest, UnalignedOverlappingReadFail) {
   DiscontiguousStream F(BlocksAry, DataAry);
   auto S = MappedBlockStream::createStream(F.block_size(), F.block_count(),
                                            F.layout(), F);
-  StreamReader R(*S);
+  BinaryStreamReader R(*S);
   StringRef Str1;
   StringRef Str2;
   EXPECT_NO_ERROR(R.readFixedString(Str1, 6));
@@ -323,8 +327,8 @@ TEST(MappedBlockStreamTest, TestWriteThenRead) {
   uint32_t intArr1[] = {890723408, 29082234};
   ArrayRef<uint32_t> intArray[] = {intArr0, intArr1};
 
-  StreamReader Reader(*S);
-  StreamWriter Writer(*S);
+  BinaryStreamReader Reader(*S);
+  BinaryStreamWriter Writer(*S);
   EXPECT_NO_ERROR(Writer.writeInteger(u16[0]));
   EXPECT_NO_ERROR(Reader.readInteger(u16[1]));
   EXPECT_EQ(u16[0], u16[1]);
@@ -352,8 +356,8 @@ TEST(MappedBlockStreamTest, TestWriteThenRead) {
   Reader.setOffset(0);
   Writer.setOffset(0);
   ::memset(DataBytes.data(), 0, 10);
-  EXPECT_NO_ERROR(Writer.writeZeroString(ZStr[0]));
-  EXPECT_NO_ERROR(Reader.readZeroString(ZStr[1]));
+  EXPECT_NO_ERROR(Writer.writeCString(ZStr[0]));
+  EXPECT_NO_ERROR(Reader.readCString(ZStr[1]));
   EXPECT_EQ(ZStr[0], ZStr[1]);
   EXPECT_EQ(
       std::vector<uint8_t>({'r', 'e', 'Z', ' ', 'S', 't', 'o', 'r', 0, 0}),
@@ -399,22 +403,22 @@ TEST(MappedBlockStreamTest, TestWriteContiguousStreamRef) {
       F.block_size(), F.block_count(), F.layout(), F);
 
   // First write "Test Str" into the source stream.
-  MutableByteStream SourceStream(SrcData);
-  StreamWriter SourceWriter(SourceStream);
-  EXPECT_NO_ERROR(SourceWriter.writeZeroString("Test Str"));
+  MutableBinaryByteStream SourceStream(SrcData, little);
+  BinaryStreamWriter SourceWriter(SourceStream);
+  EXPECT_NO_ERROR(SourceWriter.writeCString("Test Str"));
   EXPECT_EQ(SrcDataBytes, std::vector<uint8_t>(
                               {'T', 'e', 's', 't', ' ', 'S', 't', 'r', 0, 0}));
 
   // Then write the source stream into the dest stream.
-  StreamWriter DestWriter(*DestStream);
+  BinaryStreamWriter DestWriter(*DestStream);
   EXPECT_NO_ERROR(DestWriter.writeStreamRef(SourceStream));
   EXPECT_EQ(DestDataBytes, std::vector<uint8_t>(
                                {'s', 'e', 'T', ' ', 'S', 't', 't', 'r', 0, 0}));
 
   // Then read the string back out of the dest stream.
   StringRef Result;
-  StreamReader DestReader(*DestStream);
-  EXPECT_NO_ERROR(DestReader.readZeroString(Result));
+  BinaryStreamReader DestReader(*DestStream);
+  EXPECT_NO_ERROR(DestReader.readCString(Result));
   EXPECT_EQ(Result, "Test Str");
 }
 
@@ -436,21 +440,21 @@ TEST(MappedBlockStreamTest, TestWriteDiscontiguousStreamRef) {
       SrcF.block_size(), SrcF.block_count(), SrcF.layout(), SrcF);
 
   // First write "Test Str" into the source stream.
-  StreamWriter SourceWriter(*Src);
-  EXPECT_NO_ERROR(SourceWriter.writeZeroString("Test Str"));
+  BinaryStreamWriter SourceWriter(*Src);
+  EXPECT_NO_ERROR(SourceWriter.writeCString("Test Str"));
   EXPECT_EQ(SrcDataBytes, std::vector<uint8_t>(
                               {'e', 'T', 't', 't', ' ', 'S', 's', 'r', 0, 0}));
 
   // Then write the source stream into the dest stream.
-  StreamWriter DestWriter(*Dest);
+  BinaryStreamWriter DestWriter(*Dest);
   EXPECT_NO_ERROR(DestWriter.writeStreamRef(*Src));
   EXPECT_EQ(DestDataBytes, std::vector<uint8_t>(
                                {'s', 'e', 'T', ' ', 'S', 't', 't', 'r', 0, 0}));
 
   // Then read the string back out of the dest stream.
   StringRef Result;
-  StreamReader DestReader(*Dest);
-  EXPECT_NO_ERROR(DestReader.readZeroString(Result));
+  BinaryStreamReader DestReader(*Dest);
+  EXPECT_NO_ERROR(DestReader.readCString(Result));
   EXPECT_EQ(Result, "Test Str");
 }
 
diff --git a/unittests/DebugInfo/PDB/PDBApiTest.cpp b/unittests/DebugInfo/PDB/PDBApiTest.cpp
index cd0f928a08ab55e88edc69801f5963ce8b7c9169..ba09a8e28424a66414ca825e0a42796a641cb668 100644
--- a/unittests/DebugInfo/PDB/PDBApiTest.cpp
+++ b/unittests/DebugInfo/PDB/PDBApiTest.cpp
@@ -63,7 +63,7 @@ namespace {
 class MockSession : public IPDBSession {
   uint64_t getLoadAddress() const override { return 0; }
   void setLoadAddress(uint64_t Address) override {}
-  std::unique_ptr<PDBSymbolExe> getGlobalScope() const override {
+  std::unique_ptr<PDBSymbolExe> getGlobalScope() override {
     return nullptr;
   }
   std::unique_ptr<PDBSymbol> getSymbolById(uint32_t SymbolId) const override {
diff --git a/unittests/DebugInfo/PDB/StringTableBuilderTest.cpp b/unittests/DebugInfo/PDB/StringTableBuilderTest.cpp
index ebd6073f4a8d50ed4a113d6d8a4a06345332730b..7c4838778e43b5d7c1b6300601f35b0738aebad4 100644
--- a/unittests/DebugInfo/PDB/StringTableBuilderTest.cpp
+++ b/unittests/DebugInfo/PDB/StringTableBuilderTest.cpp
@@ -9,16 +9,17 @@
 
 #include "ErrorChecking.h"
 
-#include "llvm/DebugInfo/MSF/ByteStream.h"
-#include "llvm/DebugInfo/MSF/StreamReader.h"
-#include "llvm/DebugInfo/MSF/StreamWriter.h"
 #include "llvm/DebugInfo/PDB/Native/StringTable.h"
 #include "llvm/DebugInfo/PDB/Native/StringTableBuilder.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamWriter.h"
 
 #include "gtest/gtest.h"
 
 using namespace llvm;
 using namespace llvm::pdb;
+using namespace llvm::support;
 
 namespace {
 class StringTableBuilderTest : public ::testing::Test {};
@@ -33,13 +34,13 @@ TEST_F(StringTableBuilderTest, Simple) {
   EXPECT_EQ(9U, Builder.insert("baz"));
 
   std::vector<uint8_t> Buffer(Builder.finalize());
-  msf::MutableByteStream OutStream(Buffer);
-  msf::StreamWriter Writer(OutStream);
+  MutableBinaryByteStream OutStream(Buffer, little);
+  BinaryStreamWriter Writer(OutStream);
   EXPECT_NO_ERROR(Builder.commit(Writer));
 
   // Reads the contents back.
-  msf::ByteStream InStream(Buffer);
-  msf::StreamReader Reader(InStream);
+  BinaryByteStream InStream(Buffer, little);
+  BinaryStreamReader Reader(InStream);
   StringTable Table;
   EXPECT_NO_ERROR(Table.load(Reader));
 
diff --git a/unittests/DebugInfo/PDB/TypeServerHandlerTest.cpp b/unittests/DebugInfo/PDB/TypeServerHandlerTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..6995e8f9dded2fa4b58ae1486529e2fabc522bb2
--- /dev/null
+++ b/unittests/DebugInfo/PDB/TypeServerHandlerTest.cpp
@@ -0,0 +1,175 @@
+//===- llvm/unittest/DebugInfo/PDB/TypeServerHandlerTest.cpp --------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "ErrorChecking.h"
+
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeRecordMapping.h"
+#include "llvm/DebugInfo/CodeView/TypeSerializer.h"
+#include "llvm/DebugInfo/CodeView/TypeServerHandler.h"
+#include "llvm/DebugInfo/CodeView/TypeTableBuilder.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h"
+#include "llvm/DebugInfo/PDB/Native/RawTypes.h"
+#include "llvm/Support/Allocator.h"
+#include "llvm/Support/Error.h"
+
+#include "gtest/gtest.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::pdb;
+
+namespace {
+
+constexpr uint8_t Guid[] = {0x2a, 0x2c, 0x1c, 0x2a, 0xcb, 0x9e, 0x48, 0x18,
+                            0x82, 0x82, 0x7a, 0x87, 0xc3, 0xfe, 0x16, 0xe8};
+StringRef GuidStr(reinterpret_cast<const char *>(Guid),
+                  llvm::array_lengthof(Guid));
+
+constexpr const char *Name = "Test Name";
+constexpr int Age = 1;
+
+// Type server handler that claims the first matching record it is offered
+// and, only when HandleAlways is set, every later one as well.
+class MockTypeServerHandler : public TypeServerHandler {
+public:
+  explicit MockTypeServerHandler(bool HandleAlways)
+      : HandleAlways(HandleAlways) {}
+
+  Expected<bool> handle(TypeServer2Record &TS,
+                        TypeVisitorCallbacks &Callbacks) override {
+    const bool Matches = TS.Age == Age && TS.Guid == GuidStr && TS.Name == Name;
+    if (!Matches)
+      return make_error<CodeViewError>(cv_error_code::corrupt_record,
+                                       "Invalid TypeServer record!");
+    const bool Claim = HandleAlways || !Handled;
+    if (Claim)
+      Handled = true;
+    return Claim;
+  }
+  bool Handled = false; // True once at least one record has been claimed.
+  bool HandleAlways;    // Keep claiming records after the first one.
+};
+
+// Visitor callbacks that record how far a visit got. The three callbacks
+// must arrive in order (begin, known record, end), each exactly once;
+// anything else is reported as an error and leaves the state untouched.
+class MockTypeVisitorCallbacks : public TypeVisitorCallbacks {
+public:
+  enum class State {
+    Ready,
+    VisitTypeBegin,
+    VisitKnownRecord,
+    VisitTypeEnd,
+  };
+
+  Error visitTypeBegin(CVType &CVT) override {
+    return advance(State::Ready, State::VisitTypeBegin);
+  }
+
+  Error visitKnownRecord(CVType &CVT, TypeServer2Record &TS) override {
+    return advance(State::VisitTypeBegin, State::VisitKnownRecord);
+  }
+
+  Error visitTypeEnd(CVType &CVT) override {
+    return advance(State::VisitKnownRecord, State::VisitTypeEnd);
+  }
+
+  // Progress of the visit so far; tests inspect this directly.
+  State S = State::Ready;
+
+private:
+  // Moves from state From to state To, or fails if S is not currently From.
+  Error advance(State From, State To) {
+    if (S != From)
+      return make_error<CodeViewError>(cv_error_code::unspecified,
+                                       "Invalid visitor state!");
+    S = To;
+    return Error::success();
+  }
+};
+
+class TypeServerHandlerTest : public testing::Test {
+public:
+  void SetUp() override {
+    TypeServer2Record R(TypeRecordKind::TypeServer2);
+    R.Age = Age;
+    R.Guid = GuidStr;
+    R.Name = Name;
+    // Serialize R and keep the first record produced as the record under test.
+    TypeTableBuilder Builder(Allocator);
+    Builder.writeKnownType(R);
+    TypeServerRecord.RecordData = Builder.records().front();
+    TypeServerRecord.Type = TypeLeafKind::LF_TYPESERVER2;
+  }
+
+protected:
+  BumpPtrAllocator Allocator; // Passed to the TypeTableBuilder in SetUp().
+  CVType TypeServerRecord;    // TypeServer2 record that every test visits.
+};
+
+// Test that when no type server handler is registered, the TypeServer2
+// record is not intercepted and instead reaches every callback in the
+// pipeline.
+TEST_F(TypeServerHandlerTest, VisitRecordNoTypeServer) {
+  MockTypeVisitorCallbacks First;
+  MockTypeVisitorCallbacks Second;
+  TypeVisitorCallbackPipeline Pipeline;
+  Pipeline.addCallbackToPipeline(First);
+  Pipeline.addCallbackToPipeline(Second);
+
+  CVTypeVisitor Visitor(Pipeline);
+  EXPECT_NO_ERROR(Visitor.visitTypeRecord(TypeServerRecord));
+
+  EXPECT_EQ(MockTypeVisitorCallbacks::State::VisitTypeEnd, First.S);
+  EXPECT_EQ(MockTypeVisitorCallbacks::State::VisitTypeEnd, Second.S);
+}
+
+// Test that when a TypeServerHandler is registered, the record is consumed
+// by the handler if and only if the handler returns true.
+TEST_F(TypeServerHandlerTest, VisitRecordWithTypeServerOnce) {
+  using State = MockTypeVisitorCallbacks::State;
+  MockTypeServerHandler Handler(/*HandleAlways=*/false);
+  MockTypeVisitorCallbacks Callbacks;
+  CVTypeVisitor Visitor(Callbacks);
+  Visitor.addTypeServerHandler(Handler);
+
+  // First visit: the mock handler returns true, so the callbacks stay idle.
+  EXPECT_NO_ERROR(Visitor.visitTypeRecord(TypeServerRecord));
+  EXPECT_TRUE(Handler.Handled);
+  EXPECT_EQ(State::Ready, Callbacks.S);
+
+  // Second visit: the handler returns false, so the callbacks see the record.
+  EXPECT_NO_ERROR(Visitor.visitTypeRecord(TypeServerRecord));
+  EXPECT_TRUE(Handler.Handled);
+  EXPECT_EQ(State::VisitTypeEnd, Callbacks.S);
+}
+
+// Test that a registered type server handler that keeps returning true
+// keeps consuming the record, so it never reaches the visitor callbacks.
+TEST_F(TypeServerHandlerTest, VisitRecordWithTypeServerAlways) {
+  using State = MockTypeVisitorCallbacks::State;
+  MockTypeServerHandler Handler(/*HandleAlways=*/true);
+  MockTypeVisitorCallbacks Callbacks;
+  CVTypeVisitor Visitor(Callbacks);
+  Visitor.addTypeServerHandler(Handler);
+
+  // Both visits are expected to be claimed by the handler.
+  EXPECT_NO_ERROR(Visitor.visitTypeRecord(TypeServerRecord));
+  EXPECT_TRUE(Handler.Handled);
+  EXPECT_EQ(State::Ready, Callbacks.S);
+
+  EXPECT_NO_ERROR(Visitor.visitTypeRecord(TypeServerRecord));
+  EXPECT_TRUE(Handler.Handled);
+  EXPECT_EQ(State::Ready, Callbacks.S);
+}
+
+} // end anonymous namespace
diff --git a/unittests/ExecutionEngine/Orc/CMakeLists.txt b/unittests/ExecutionEngine/Orc/CMakeLists.txt
index 68f6d0c28d7caab11fc85870de031dc2a6c6b13d..db40c4213bd70c51e3a3e7f70161c5557fd8934c 100644
--- a/unittests/ExecutionEngine/Orc/CMakeLists.txt
+++ b/unittests/ExecutionEngine/Orc/CMakeLists.txt
@@ -14,11 +14,12 @@ add_llvm_unittest(OrcJITTests
   IndirectionUtilsTest.cpp
   GlobalMappingLayerTest.cpp
   LazyEmittingLayerTest.cpp
-  ObjectLinkingLayerTest.cpp
   ObjectTransformLayerTest.cpp
   OrcCAPITest.cpp
   OrcTestCommon.cpp
+  QueueChannel.cpp
   RPCUtilsTest.cpp
+  RTDyldObjectLinkingLayerTest.cpp
   )
 
-target_link_libraries(OrcJITTests ${PTHREAD_LIB})
+target_link_libraries(OrcJITTests ${LLVM_PTHREAD_LIB})
diff --git a/unittests/ExecutionEngine/Orc/IndirectionUtilsTest.cpp b/unittests/ExecutionEngine/Orc/IndirectionUtilsTest.cpp
index ac847039d9fb2d7379a709f885e9b007372eeff3..48c9f7e6094334e3b9f07d3b6784a3a432686cdc 100644
--- a/unittests/ExecutionEngine/Orc/IndirectionUtilsTest.cpp
+++ b/unittests/ExecutionEngine/Orc/IndirectionUtilsTest.cpp
@@ -20,17 +20,17 @@ TEST(IndirectionUtilsTest, MakeStub) {
   LLVMContext Context;
   ModuleBuilder MB(Context, "x86_64-apple-macosx10.10", "");
   Function *F = MB.createFunctionDecl<void(DummyStruct, DummyStruct)>("");
-  SmallVector<AttributeSet, 4> Attrs;
+  SmallVector<AttributeList, 4> Attrs;
   Attrs.push_back(
-    AttributeSet::get(MB.getModule()->getContext(), 1U,
-                      AttrBuilder().addAttribute(Attribute::StructRet)));
+      AttributeList::get(MB.getModule()->getContext(), 1U,
+                         AttrBuilder().addAttribute(Attribute::StructRet)));
   Attrs.push_back(
-    AttributeSet::get(MB.getModule()->getContext(), 2U,
-                      AttrBuilder().addAttribute(Attribute::ByVal)));
+      AttributeList::get(MB.getModule()->getContext(), 2U,
+                         AttrBuilder().addAttribute(Attribute::ByVal)));
   Attrs.push_back(
-    AttributeSet::get(MB.getModule()->getContext(), ~0U,
-                      AttrBuilder().addAttribute(Attribute::NoUnwind)));
-  F->setAttributes(AttributeSet::get(MB.getModule()->getContext(), Attrs));
+      AttributeList::get(MB.getModule()->getContext(), ~0U,
+                         AttrBuilder().addAttribute(Attribute::NoUnwind)));
+  F->setAttributes(AttributeList::get(MB.getModule()->getContext(), Attrs));
 
   auto ImplPtr = orc::createImplPointer(*F->getType(), *MB.getModule(), "", nullptr);
   orc::makeStub(*F, *ImplPtr);
diff --git a/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp b/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp
index 63b85dc82ca84afed076514c41b8b1dd51d78c63..96214a368dce0ddf8b2068ee6f9517680e0f8d91 100644
--- a/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp
+++ b/unittests/ExecutionEngine/Orc/ObjectTransformLayerTest.cpp
@@ -12,7 +12,7 @@
 #include "llvm/ExecutionEngine/Orc/CompileUtils.h"
 #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
 #include "llvm/ExecutionEngine/Orc/NullResolver.h"
-#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "llvm/ExecutionEngine/Orc/ObjectTransformLayer.h"
 #include "llvm/Object/ObjectFile.h"
 #include "gtest/gtest.h"
@@ -309,7 +309,7 @@ TEST(ObjectTransformLayerTest, Main) {
   };
 
   // Construct the jit layers.
-  ObjectLinkingLayer<> BaseLayer;
+  RTDyldObjectLinkingLayer<> BaseLayer;
   auto IdentityTransform = [](
       std::unique_ptr<llvm::object::OwningBinary<llvm::object::ObjectFile>>
           Obj) { return Obj; };
diff --git a/unittests/ExecutionEngine/Orc/OrcTestCommon.cpp b/unittests/ExecutionEngine/Orc/OrcTestCommon.cpp
index 17d1e9c9276e76ce7b086e13e710a6779de334db..ccd2fc0fb189253d66fe8b042c4d44054b6ef9d2 100644
--- a/unittests/ExecutionEngine/Orc/OrcTestCommon.cpp
+++ b/unittests/ExecutionEngine/Orc/OrcTestCommon.cpp
@@ -15,7 +15,7 @@
 
 using namespace llvm;
 
-bool OrcExecutionTest::NativeTargetInitialized = false;
+bool OrcNativeTarget::NativeTargetInitialized = false;
 
 ModuleBuilder::ModuleBuilder(LLVMContext &Context, StringRef Triple,
                              StringRef Name)
diff --git a/unittests/ExecutionEngine/Orc/OrcTestCommon.h b/unittests/ExecutionEngine/Orc/OrcTestCommon.h
index f3972a3084e51c941f3977d5c6bebba3248c3163..7fb26634c7a7a133c473822cb19c4361aff6ca4a 100644
--- a/unittests/ExecutionEngine/Orc/OrcTestCommon.h
+++ b/unittests/ExecutionEngine/Orc/OrcTestCommon.h
@@ -28,17 +28,29 @@
 
 namespace llvm {
 
-// Base class for Orc tests that will execute code.
-class OrcExecutionTest {
+class OrcNativeTarget {
 public:
-
-  OrcExecutionTest() {
+  static void initialize() {
     if (!NativeTargetInitialized) {
       InitializeNativeTarget();
       InitializeNativeTargetAsmParser();
       InitializeNativeTargetAsmPrinter();
       NativeTargetInitialized = true;
     }
+  }
+
+private:
+  static bool NativeTargetInitialized;
+};
+
+// Base class for Orc tests that will execute code.
+class OrcExecutionTest {
+public:
+
+  OrcExecutionTest() {
+
+    // Initialize the native target if it hasn't been done already.
+    OrcNativeTarget::initialize();
 
     // Try to select a TargetMachine for the host.
     TM.reset(EngineBuilder().selectTarget());
@@ -56,8 +68,6 @@ public:
 protected:
   LLVMContext Context;
   std::unique_ptr<TargetMachine> TM;
-private:
-  static bool NativeTargetInitialized;
 };
 
 class ModuleBuilder {
diff --git a/unittests/ExecutionEngine/Orc/QueueChannel.cpp b/unittests/ExecutionEngine/Orc/QueueChannel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e309a7e428c0404ab682f0160af84ccf25735ce6
--- /dev/null
+++ b/unittests/ExecutionEngine/Orc/QueueChannel.cpp
@@ -0,0 +1,14 @@
+//===----------- QueueChannel.cpp - QueueChannel error type IDs -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "QueueChannel.h"
+
+char llvm::QueueChannelError::ID;
+char llvm::QueueChannelClosedError::ID;
+
diff --git a/unittests/ExecutionEngine/Orc/QueueChannel.h b/unittests/ExecutionEngine/Orc/QueueChannel.h
new file mode 100644
index 0000000000000000000000000000000000000000..3d1058a83ebc13c0f7aa35fcb25f068913aa914b
--- /dev/null
+++ b/unittests/ExecutionEngine/Orc/QueueChannel.h
@@ -0,0 +1,146 @@
+//===----------------- QueueChannel.h - RPC Queue -----------------*-c++-*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_UNITTESTS_EXECUTIONENGINE_ORC_QUEUECHANNEL_H
+#define LLVM_UNITTESTS_EXECUTIONENGINE_ORC_QUEUECHANNEL_H
+
+#include "llvm/ExecutionEngine/Orc/RawByteChannel.h"
+#include "llvm/Support/Error.h"
+
+#include <queue>
+#include <condition_variable>
+
+namespace llvm {
+
+class QueueChannelError : public ErrorInfo<QueueChannelError> {
+public:
+  static char ID; // Base error class; ID is defined in QueueChannel.cpp.
+};
+
+// Error reported by a queue once its channel has been closed.
+class QueueChannelClosedError
+    : public ErrorInfo<QueueChannelClosedError, QueueChannelError> {
+public:
+  static char ID;
+
+  // This error has no std::error_code equivalent.
+  std::error_code convertToErrorCode() const override {
+    return inconvertibleErrorCode();
+  }
+  void log(raw_ostream &OS) const override { OS << "Queue closed"; }
+};
+
+// Byte queue shared by the two ends of a QueueChannel, bundled with the mutex
+// and condition variable guarding it and with hooks for injecting I/O errors.
+class Queue : public std::queue<char> {
+public:
+  using ErrorInjector = std::function<Error()>;
+
+  Queue() : ReadError(noError), WriteError(noError) {}
+  Queue(const Queue &) = delete;
+  Queue &operator=(const Queue &) = delete;
+  Queue(Queue &&) = delete;
+  Queue &operator=(Queue &&) = delete;
+
+  std::mutex &getMutex() { return M; }
+  std::condition_variable &getCondVar() { return CV; }
+  Error checkReadError() { return ReadError(); }
+  Error checkWriteError() { return WriteError(); }
+  // Swaps in a new read hook, then wakes one waiter so it re-checks the hook.
+  void setReadError(ErrorInjector NewReadError) {
+    std::unique_lock<std::mutex> Lock(M);
+    ReadError = std::move(NewReadError);
+    Lock.unlock();
+    CV.notify_one();
+  }
+  void setWriteError(ErrorInjector NewWriteError) {
+    std::lock_guard<std::mutex> Lock(M);
+    WriteError = std::move(NewWriteError);
+  }
+private:
+  static Error noError() { return Error::success(); }
+  std::mutex M;
+  std::condition_variable CV;
+  ErrorInjector ReadError, WriteError;
+};
+
+class QueueChannel : public orc::rpc::RawByteChannel {
+public:
+  QueueChannel(std::shared_ptr<Queue> InQueue,
+               std::shared_ptr<Queue> OutQueue)
+      : InQueue(InQueue), OutQueue(OutQueue) {}
+
+  QueueChannel(const QueueChannel&) = delete;
+  QueueChannel& operator=(const QueueChannel&) = delete;
+  QueueChannel(QueueChannel&&) = delete;
+  QueueChannel& operator=(QueueChannel&&) = delete;
+  // Blocks until Size bytes are read or checkReadError() reports an error.
+  Error readBytes(char *Dst, unsigned Size) override {
+    std::unique_lock<std::mutex> Lock(InQueue->getMutex());
+    while (Size) {
+      {
+        Error Err = InQueue->checkReadError();
+        while (!Err && InQueue->empty()) {
+          InQueue->getCondVar().wait(Lock); // Lock is released while waiting.
+          Err = InQueue->checkReadError();
+        }
+        if (Err)
+          return Err;
+      }
+      *Dst++ = InQueue->front();
+      --Size;
+      ++NumRead;
+      InQueue->pop();
+    }
+    return Error::success();
+  }
+  // Pushes Size bytes onto OutQueue, checking the write hook before each one.
+  Error appendBytes(const char *Src, unsigned Size) override {
+    std::unique_lock<std::mutex> Lock(OutQueue->getMutex());
+    while (Size--) {
+      if (Error Err = OutQueue->checkWriteError())
+        return Err;
+      OutQueue->push(*Src++);
+      ++NumWritten;
+    }
+    OutQueue->getCondVar().notify_one(); // Wake one waiter on OutQueue, if any.
+    return Error::success();
+  }
+
+  Error send() override { return Error::success(); }
+  // Installs hooks on both queues that report QueueChannelClosedError.
+  void close() {
+    auto ChannelClosed = []() { return make_error<QueueChannelClosedError>(); };
+    InQueue->setReadError(ChannelClosed);
+    InQueue->setWriteError(ChannelClosed);
+    OutQueue->setReadError(ChannelClosed);
+    OutQueue->setWriteError(ChannelClosed);
+  }
+
+  uint64_t NumWritten = 0; // Bytes pushed by appendBytes().
+  uint64_t NumRead = 0;    // Bytes popped by readBytes().
+
+private:
+  // readBytes() drains InQueue; appendBytes() fills OutQueue.
+  std::shared_ptr<Queue> InQueue;
+  std::shared_ptr<Queue> OutQueue;
+};
+
+inline std::pair<std::unique_ptr<QueueChannel>, std::unique_ptr<QueueChannel>>
+createPairedQueueChannels() {
+  // One byte queue per direction: each endpoint reads what its peer writes.
+  auto AToB = std::make_shared<Queue>();
+  auto BToA = std::make_shared<Queue>();
+  return std::make_pair(llvm::make_unique<QueueChannel>(BToA, AToB),
+                        llvm::make_unique<QueueChannel>(AToB, BToA));
+}
+
+}
+
+#endif
diff --git a/unittests/ExecutionEngine/Orc/RPCUtilsTest.cpp b/unittests/ExecutionEngine/Orc/RPCUtilsTest.cpp
index d21a4acc08dc5289de2a2716b4347352d0933e89..3d46ef88f7c5173bf8bbb21e7d1201259b00e464 100644
--- a/unittests/ExecutionEngine/Orc/RPCUtilsTest.cpp
+++ b/unittests/ExecutionEngine/Orc/RPCUtilsTest.cpp
@@ -7,8 +7,8 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "llvm/ExecutionEngine/Orc/RawByteChannel.h"
 #include "llvm/ExecutionEngine/Orc/RPCUtils.h"
+#include "QueueChannel.h"
 #include "gtest/gtest.h"
 
 #include <queue>
@@ -17,47 +17,6 @@ using namespace llvm;
 using namespace llvm::orc;
 using namespace llvm::orc::rpc;
 
-class Queue : public std::queue<char> {
-public:
-  std::mutex &getMutex() { return M; }
-  std::condition_variable &getCondVar() { return CV; }
-private:
-  std::mutex M;
-  std::condition_variable CV;
-};
-
-class QueueChannel : public RawByteChannel {
-public:
-  QueueChannel(Queue &InQueue, Queue &OutQueue)
-      : InQueue(InQueue), OutQueue(OutQueue) {}
-
-  Error readBytes(char *Dst, unsigned Size) override {
-    std::unique_lock<std::mutex> Lock(InQueue.getMutex());
-    while (Size) {
-      while (InQueue.empty())
-        InQueue.getCondVar().wait(Lock);
-      *Dst++ = InQueue.front();
-      --Size;
-      InQueue.pop();
-    }
-    return Error::success();
-  }
-
-  Error appendBytes(const char *Src, unsigned Size) override {
-    std::unique_lock<std::mutex> Lock(OutQueue.getMutex());
-    while (Size--)
-      OutQueue.push(*Src++);
-    OutQueue.getCondVar().notify_one();
-    return Error::success();
-  }
-
-  Error send() override { return Error::success(); }
-
-private:
-  Queue &InQueue;
-  Queue &OutQueue;
-};
-
 class RPCFoo {};
 
 namespace llvm {
@@ -120,6 +79,11 @@ namespace DummyRPCAPI {
     static const char* getName() { return "IntInt"; }
   };
 
+  class VoidString : public Function<VoidString, void(std::string)> {
+  public:
+    static const char* getName() { return "VoidString"; }
+  };
+
   class AllTheTypes
     : public Function<AllTheTypes,
                       void(int8_t, uint8_t, int16_t, uint16_t, int32_t,
@@ -138,17 +102,24 @@ namespace DummyRPCAPI {
 
 class DummyRPCEndpoint : public SingleThreadedRPCEndpoint<QueueChannel> {
 public:
-  DummyRPCEndpoint(Queue &Q1, Queue &Q2)
-      : SingleThreadedRPCEndpoint(C, true), C(Q1, Q2) {}
-private:
-  QueueChannel C;
+  DummyRPCEndpoint(QueueChannel &C)
+      : SingleThreadedRPCEndpoint(C, true) {}
 };
 
 
-TEST(DummyRPC, TestAsyncVoidBool) {
-  Queue Q1, Q2;
-  DummyRPCEndpoint Client(Q1, Q2);
-  DummyRPCEndpoint Server(Q2, Q1);
+void freeVoidBool(bool B) {}
+
+// Register a plain free function as a handler; no call is ever made.
+TEST(DummyRPC, TestFreeFunctionHandler) {
+  auto ChannelPair = createPairedQueueChannels();
+  DummyRPCEndpoint Server(*ChannelPair.first);
+  Server.addHandler<DummyRPCAPI::VoidBool>(freeVoidBool);
+}
+
+TEST(DummyRPC, TestCallAsyncVoidBool) {
+  auto Channels = createPairedQueueChannels();
+  DummyRPCEndpoint Client(*Channels.first);
+  DummyRPCEndpoint Server(*Channels.second);
 
   std::thread ServerThread([&]() {
       Server.addHandler<DummyRPCAPI::VoidBool>(
@@ -189,10 +160,10 @@ TEST(DummyRPC, TestAsyncVoidBool) {
   ServerThread.join();
 }
 
-TEST(DummyRPC, TestAsyncIntInt) {
-  Queue Q1, Q2;
-  DummyRPCEndpoint Client(Q1, Q2);
-  DummyRPCEndpoint Server(Q2, Q1);
+TEST(DummyRPC, TestCallAsyncIntInt) {
+  auto Channels = createPairedQueueChannels();
+  DummyRPCEndpoint Client(*Channels.first);
+  DummyRPCEndpoint Server(*Channels.second);
 
   std::thread ServerThread([&]() {
       Server.addHandler<DummyRPCAPI::IntInt>(
@@ -234,10 +205,147 @@ TEST(DummyRPC, TestAsyncIntInt) {
   ServerThread.join();
 }
 
+// Test an async handler that replies through its SendResult callback.
+TEST(DummyRPC, TestAsyncIntIntHandler) {
+  auto Channels = createPairedQueueChannels();
+  DummyRPCEndpoint Client(*Channels.first);
+  DummyRPCEndpoint Server(*Channels.second);
+
+  std::thread ServerThread([&]() {
+      Server.addAsyncHandler<DummyRPCAPI::IntInt>(
+          [](std::function<Error(Expected<int32_t>)> SendResult,
+             int32_t X) {
+            EXPECT_EQ(X, 21) << "Server int(int) received unexpected argument";
+            return SendResult(2 * X);
+          });
+      {
+        // Poke the server to handle the negotiate call.
+        auto Err = Server.handleOne();
+        EXPECT_FALSE(!!Err) << "Server failed to handle call to negotiate";
+      }
+
+      {
+        // Poke the server to handle the IntInt call.
+        auto Err = Server.handleOne();
+        EXPECT_FALSE(!!Err) << "Server failed to handle call to int(int)";
+      }
+  });
+
+  {
+    auto Err = Client.callAsync<DummyRPCAPI::IntInt>(
+        [](Expected<int> Result) {
+          EXPECT_TRUE(!!Result) << "Async int(int) response handler failed";
+          EXPECT_EQ(*Result, 42)
+            << "Async int(int) response handler received incorrect result";
+          return Error::success();
+        }, 21);
+    EXPECT_FALSE(!!Err) << "Client.callAsync failed for int(int)";
+  }
+
+  {
+    // Poke the client to process the result.
+    auto Err = Client.handleOne();
+    EXPECT_FALSE(!!Err) << "Client failed to handle response from int(int)";
+  }
+
+  ServerThread.join();
+}
+
+// Test registering a member function as an async handler.
+TEST(DummyRPC, TestAsyncIntIntHandlerMethod) {
+  auto Channels = createPairedQueueChannels();
+  DummyRPCEndpoint Client(*Channels.first);
+  DummyRPCEndpoint Server(*Channels.second);
+
+  class Dummy {
+  public:
+    Error handler(std::function<Error(Expected<int32_t>)> SendResult,
+                  int32_t X) {
+      EXPECT_EQ(X, 21) << "Server int(int) received unexpected argument";
+      return SendResult(2 * X);
+    }
+  };
+
+  std::thread ServerThread([&]() {
+      Dummy D;
+      Server.addAsyncHandler<DummyRPCAPI::IntInt>(D, &Dummy::handler);
+      {
+        // Poke the server to handle the negotiate call.
+        auto Err = Server.handleOne();
+        EXPECT_FALSE(!!Err) << "Server failed to handle call to negotiate";
+      }
+
+      {
+        // Poke the server to handle the IntInt call.
+        auto Err = Server.handleOne();
+        EXPECT_FALSE(!!Err) << "Server failed to handle call to int(int)";
+      }
+  });
+
+  {
+    auto Err = Client.callAsync<DummyRPCAPI::IntInt>(
+        [](Expected<int> Result) {
+          EXPECT_TRUE(!!Result) << "Async int(int) response handler failed";
+          EXPECT_EQ(*Result, 42)
+            << "Async int(int) response handler received incorrect result";
+          return Error::success();
+        }, 21);
+    EXPECT_FALSE(!!Err) << "Client.callAsync failed for int(int)";
+  }
+
+  {
+    // Poke the client to process the result.
+    auto Err = Client.handleOne();
+    EXPECT_FALSE(!!Err) << "Client failed to handle response from int(int)";
+  }
+
+  ServerThread.join();
+}
+
+// Test callB with a std::string parameter supplied in three different forms.
+TEST(DummyRPC, TestCallAsyncVoidString) {
+  auto Channels = createPairedQueueChannels();
+  DummyRPCEndpoint Client(*Channels.first);
+  DummyRPCEndpoint Server(*Channels.second);
+
+  std::thread ServerThread([&]() {
+      Server.addHandler<DummyRPCAPI::VoidString>(
+          [](const std::string &S) {
+            EXPECT_EQ(S, "hello")
+              << "Server void(std::string) received unexpected result";
+          });
+      // Handle four messages (presumably negotiate plus the three calls).
+      for (int I = 0; I < 4; ++I) {
+        auto Err = Server.handleOne();
+        EXPECT_FALSE(!!Err) << "Server failed to handle call";
+      }
+  });
+
+  {
+    // Make a call using a std::string.
+    auto Err = Client.callB<DummyRPCAPI::VoidString>(std::string("hello"));
+    EXPECT_FALSE(!!Err) << "Client.callB failed for void(std::string)";
+  }
+
+  {
+    // Make a call using a StringRef.
+    auto Err = Client.callB<DummyRPCAPI::VoidString>(StringRef("hello"));
+    EXPECT_FALSE(!!Err) << "Client.callB failed for void(std::string)";
+  }
+
+  {
+    // Make a call using a string literal.
+    auto Err = Client.callB<DummyRPCAPI::VoidString>("hello");
+    EXPECT_FALSE(!!Err) << "Client.callB failed for void(std::string)";
+  }
+
+  ServerThread.join();
+}
+
 TEST(DummyRPC, TestSerialization) {
-  Queue Q1, Q2;
-  DummyRPCEndpoint Client(Q1, Q2);
-  DummyRPCEndpoint Server(Q2, Q1);
+  auto Channels = createPairedQueueChannels();
+  DummyRPCEndpoint Client(*Channels.first);
+  DummyRPCEndpoint Server(*Channels.second);
 
   std::thread ServerThread([&]() {
       Server.addHandler<DummyRPCAPI::AllTheTypes>(
@@ -300,9 +408,9 @@ TEST(DummyRPC, TestSerialization) {
 }
 
 TEST(DummyRPC, TestCustomType) {
-  Queue Q1, Q2;
-  DummyRPCEndpoint Client(Q1, Q2);
-  DummyRPCEndpoint Server(Q2, Q1);
+  auto Channels = createPairedQueueChannels();
+  DummyRPCEndpoint Client(*Channels.first);
+  DummyRPCEndpoint Server(*Channels.second);
 
   std::thread ServerThread([&]() {
       Server.addHandler<DummyRPCAPI::CustomType>(
@@ -343,9 +451,9 @@ TEST(DummyRPC, TestCustomType) {
 }
 
 TEST(DummyRPC, TestWithAltCustomType) {
-  Queue Q1, Q2;
-  DummyRPCEndpoint Client(Q1, Q2);
-  DummyRPCEndpoint Server(Q2, Q1);
+  auto Channels = createPairedQueueChannels();
+  DummyRPCEndpoint Client(*Channels.first);
+  DummyRPCEndpoint Server(*Channels.second);
 
   std::thread ServerThread([&]() {
       Server.addHandler<DummyRPCAPI::CustomType>(
@@ -386,9 +494,9 @@ TEST(DummyRPC, TestWithAltCustomType) {
 }
 
 TEST(DummyRPC, TestParallelCallGroup) {
-  Queue Q1, Q2;
-  DummyRPCEndpoint Client(Q1, Q2);
-  DummyRPCEndpoint Server(Q2, Q1);
+  auto Channels = createPairedQueueChannels();
+  DummyRPCEndpoint Client(*Channels.first);
+  DummyRPCEndpoint Server(*Channels.second);
 
   std::thread ServerThread([&]() {
       Server.addHandler<DummyRPCAPI::IntInt>(
@@ -468,9 +576,9 @@ TEST(DummyRPC, TestAPICalls) {
   static_assert(!DummyCalls1::Contains<DummyRPCAPI::CustomType>::value,
                 "Contains<Func> template should return false here");
 
-  Queue Q1, Q2;
-  DummyRPCEndpoint Client(Q1, Q2);
-  DummyRPCEndpoint Server(Q2, Q1);
+  auto Channels = createPairedQueueChannels();
+  DummyRPCEndpoint Client(*Channels.first);
+  DummyRPCEndpoint Server(*Channels.second);
 
   std::thread ServerThread(
     [&]() {
@@ -506,8 +614,8 @@ TEST(DummyRPC, TestAPICalls) {
 }
 
 TEST(DummyRPC, TestRemoveHandler) {
-  Queue Q1, Q2;
-  DummyRPCEndpoint Server(Q1, Q2);
+  auto Channels = createPairedQueueChannels();
+  DummyRPCEndpoint Server(*Channels.second);
 
   Server.addHandler<DummyRPCAPI::VoidBool>(
     [](bool B) {
@@ -519,8 +627,8 @@ TEST(DummyRPC, TestRemoveHandler) {
 }
 
 TEST(DummyRPC, TestClearHandlers) {
-  Queue Q1, Q2;
-  DummyRPCEndpoint Server(Q1, Q2);
+  auto Channels = createPairedQueueChannels();
+  DummyRPCEndpoint Server(*Channels.second);
 
   Server.addHandler<DummyRPCAPI::VoidBool>(
     [](bool B) {
diff --git a/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
similarity index 90%
rename from unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp
rename to unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
index 44b44f6041590d0e349c5b113d0fb776a7aa0324..de99c022fb9dc66f5f277730b05edfac1b9d88fb 100644
--- a/unittests/ExecutionEngine/Orc/ObjectLinkingLayerTest.cpp
+++ b/unittests/ExecutionEngine/Orc/RTDyldObjectLinkingLayerTest.cpp
@@ -1,4 +1,4 @@
-//===-- ObjectLinkingLayerTest.cpp - Unit tests for object linking layer --===//
+//===- RTDyldObjectLinkingLayerTest.cpp - RTDyld linking layer unit tests -===//
 //
 //                     The LLVM Compiler Infrastructure
 //
@@ -13,7 +13,7 @@
 #include "llvm/ExecutionEngine/Orc/CompileUtils.h"
 #include "llvm/ExecutionEngine/Orc/LambdaResolver.h"
 #include "llvm/ExecutionEngine/Orc/NullResolver.h"
-#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
+#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/LLVMContext.h"
 #include "gtest/gtest.h"
@@ -23,8 +23,8 @@ using namespace llvm::orc;
 
 namespace {
 
-class ObjectLinkingLayerExecutionTest : public testing::Test,
-                                        public OrcExecutionTest {
+class RTDyldObjectLinkingLayerExecutionTest : public testing::Test,
+                                              public OrcExecutionTest {
 
 };
 
@@ -44,7 +44,7 @@ public:
   }
 };
 
-TEST(ObjectLinkingLayerTest, TestSetProcessAllSections) {
+TEST(RTDyldObjectLinkingLayerTest, TestSetProcessAllSections) {
   class SectionMemoryManagerWrapper : public SectionMemoryManager {
   public:
     SectionMemoryManagerWrapper(bool &DebugSeen) : DebugSeen(DebugSeen) {}
@@ -60,10 +60,10 @@ TEST(ObjectLinkingLayerTest, TestSetProcessAllSections) {
                                                          IsReadOnly);
     }
   private:
-    bool DebugSeen;
+    bool &DebugSeen;
   };
 
-  ObjectLinkingLayer<> ObjLayer;
+  RTDyldObjectLinkingLayer<> ObjLayer;
 
   LLVMContext Context;
   auto M = llvm::make_unique<Module>("", Context);
@@ -75,6 +75,10 @@ TEST(ObjectLinkingLayerTest, TestSetProcessAllSections) {
 
   GV->setSection(".debug_str");
 
+
+  // Initialize the native target in case this is the first unit test
+  // to try to build a TM.
+  OrcNativeTarget::initialize();
   std::unique_ptr<TargetMachine> TM(
     EngineBuilder().selectTarget(Triple(M->getTargetTriple()), "", "",
                                  SmallVector<std::string, 1>()));
@@ -99,6 +103,7 @@ TEST(ObjectLinkingLayerTest, TestSetProcessAllSections) {
   {
     // Test with ProcessAllSections = false (the default).
     auto H = ObjLayer.addObjectSet(Objs, &SMMW, &*Resolver);
+    ObjLayer.emitAndFinalize(H);
     EXPECT_EQ(DebugSectionSeen, false)
       << "Unexpected debug info section";
     ObjLayer.removeObjectSet(H);
@@ -108,17 +113,18 @@ TEST(ObjectLinkingLayerTest, TestSetProcessAllSections) {
     // Test with ProcessAllSections = true.
     ObjLayer.setProcessAllSections(true);
     auto H = ObjLayer.addObjectSet(Objs, &SMMW, &*Resolver);
+    ObjLayer.emitAndFinalize(H);
     EXPECT_EQ(DebugSectionSeen, true)
       << "Expected debug info section not seen";
     ObjLayer.removeObjectSet(H);
   }
 }
 
-TEST_F(ObjectLinkingLayerExecutionTest, NoDuplicateFinalization) {
+TEST_F(RTDyldObjectLinkingLayerExecutionTest, NoDuplicateFinalization) {
   if (!TM)
     return;
 
-  ObjectLinkingLayer<> ObjLayer;
+  RTDyldObjectLinkingLayer<> ObjLayer;
   SimpleCompiler Compile(*TM);
 
   // Create a pair of modules that will trigger recursive finalization:
@@ -183,11 +189,11 @@ TEST_F(ObjectLinkingLayerExecutionTest, NoDuplicateFinalization) {
       << "Extra call to finalize";
 }
 
-TEST_F(ObjectLinkingLayerExecutionTest, NoPrematureAllocation) {
+TEST_F(RTDyldObjectLinkingLayerExecutionTest, NoPrematureAllocation) {
   if (!TM)
     return;
 
-  ObjectLinkingLayer<> ObjLayer;
+  RTDyldObjectLinkingLayer<> ObjLayer;
   SimpleCompiler Compile(*TM);
 
   // Create a pair of unrelated modules:
diff --git a/unittests/IR/AttributesTest.cpp b/unittests/IR/AttributesTest.cpp
index 9f8013ff181cd61d02c138c2a02a378616547d12..b5b221c63a173d20795e07b3977939e64fe09019 100644
--- a/unittests/IR/AttributesTest.cpp
+++ b/unittests/IR/AttributesTest.cpp
@@ -21,13 +21,11 @@ TEST(Attributes, Uniquing) {
   Attribute AttrB = Attribute::get(C, Attribute::AlwaysInline);
   EXPECT_EQ(AttrA, AttrB);
 
-  AttributeSet ASs[] = {
-    AttributeSet::get(C, 1, Attribute::ZExt),
-    AttributeSet::get(C, 2, Attribute::SExt)
-  };
+  AttributeList ASs[] = {AttributeList::get(C, 1, Attribute::ZExt),
+                         AttributeList::get(C, 2, Attribute::SExt)};
 
-  AttributeSet SetA = AttributeSet::get(C, ASs);
-  AttributeSet SetB = AttributeSet::get(C, ASs);
+  AttributeList SetA = AttributeList::get(C, ASs);
+  AttributeList SetB = AttributeList::get(C, ASs);
   EXPECT_EQ(SetA, SetB);
 }
 
@@ -43,13 +41,11 @@ TEST(Attributes, Ordering) {
   EXPECT_TRUE(Align4 < Deref5);
   EXPECT_TRUE(Align5 < Deref4);
 
-  AttributeSet ASs[] = {
-    AttributeSet::get(C, 2, Attribute::ZExt),
-    AttributeSet::get(C, 1, Attribute::SExt)
-  };
+  AttributeList ASs[] = {AttributeList::get(C, 2, Attribute::ZExt),
+                         AttributeList::get(C, 1, Attribute::SExt)};
 
-  AttributeSet SetA = AttributeSet::get(C, ASs);
-  AttributeSet SetB = SetA.removeAttributes(C, 1, ASs[1]);
+  AttributeList SetA = AttributeList::get(C, ASs);
+  AttributeList SetB = SetA.removeAttributes(C, 1, ASs[1]);
   EXPECT_NE(SetA, SetB);
 }
 
diff --git a/unittests/IR/FunctionTest.cpp b/unittests/IR/FunctionTest.cpp
index fb458597c37a45af4cdee5c5834e9ae9576e5dad..6838d7e2527ff0c608c81d75b2e9da4ef91bfad8 100644
--- a/unittests/IR/FunctionTest.cpp
+++ b/unittests/IR/FunctionTest.cpp
@@ -8,6 +8,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
 #include "gtest/gtest.h"
 using namespace llvm;
 
@@ -109,4 +110,24 @@ TEST(FunctionTest, stealArgumentListFrom) {
   EXPECT_TRUE(F2->hasLazyArguments());
 }
 
+// Test setting and removing section information
+TEST(FunctionTest, setSection) {
+  LLVMContext C;
+  Module M("test", C);
+
+  llvm::Function *F =
+      Function::Create(llvm::FunctionType::get(llvm::Type::getVoidTy(C), false),
+                       llvm::GlobalValue::ExternalLinkage, "F", &M);
+
+  F->setSection(".text.test");
+  EXPECT_TRUE(F->getSection() == ".text.test");
+  EXPECT_TRUE(F->hasSection());
+  F->setSection("");
+  EXPECT_FALSE(F->hasSection());
+  F->setSection(".text.test");
+  F->setSection(".text.test2");
+  EXPECT_TRUE(F->getSection() == ".text.test2");
+  EXPECT_TRUE(F->hasSection());
+}
+
 } // end namespace
diff --git a/unittests/IR/IRBuilderTest.cpp b/unittests/IR/IRBuilderTest.cpp
index 1812cd39d135c642a81f7d80ceb9a1a11eee008b..830ae9587691c7bde655b7038e12bb346bb86067 100644
--- a/unittests/IR/IRBuilderTest.cpp
+++ b/unittests/IR/IRBuilderTest.cpp
@@ -207,7 +207,26 @@ TEST_F(IRBuilderTest, FastMathFlags) {
   EXPECT_TRUE(FCmp->hasAllowReciprocal());
 
   Builder.clearFastMathFlags();
- 
+
+  // Test FP-contract
+  FC = Builder.CreateFAdd(F, F);
+  ASSERT_TRUE(isa<Instruction>(FC));
+  FAdd = cast<Instruction>(FC);
+  EXPECT_FALSE(FAdd->hasAllowContract());
+
+  FMF.clear();
+  FMF.setAllowContract(true);
+  Builder.setFastMathFlags(FMF);
+
+  FC = Builder.CreateFAdd(F, F);
+  EXPECT_TRUE(Builder.getFastMathFlags().any());
+  EXPECT_TRUE(Builder.getFastMathFlags().AllowContract);
+  ASSERT_TRUE(isa<Instruction>(FC));
+  FAdd = cast<Instruction>(FC);
+  EXPECT_TRUE(FAdd->hasAllowContract());
+
+  Builder.clearFastMathFlags();
+
   // Test a call with FMF.
   auto CalleeTy = FunctionType::get(Type::getFloatTy(Ctx),
                                     /*isVarArg=*/false);
@@ -245,6 +264,7 @@ TEST_F(IRBuilderTest, FastMathFlags) {
   EXPECT_FALSE(FDiv->getFastMathFlags().any());
   FDiv->setHasAllowReciprocal(true);
   FAdd->setHasAllowReciprocal(false);
+  FAdd->setHasNoNaNs(true);
   FDiv->copyFastMathFlags(FAdd);
   EXPECT_TRUE(FDiv->hasNoNaNs());
   EXPECT_FALSE(FDiv->hasAllowReciprocal());
diff --git a/unittests/IR/InstructionsTest.cpp b/unittests/IR/InstructionsTest.cpp
index 0dac7c1bcfb11064033c0b7a92ec05d0497a0afe..7c75aaec1753982f458d374db94367bbff242b12 100644
--- a/unittests/IR/InstructionsTest.cpp
+++ b/unittests/IR/InstructionsTest.cpp
@@ -19,6 +19,7 @@
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/MDBuilder.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
 #include "llvm/IR/Operator.h"
 #include "gtest/gtest.h"
 #include <memory>
@@ -516,7 +517,8 @@ TEST(InstructionsTest, CloneCall) {
   {
     AttrBuilder AB;
     AB.addAttribute(Attribute::ReadOnly);
-    Call->setAttributes(AttributeSet::get(C, AttributeSet::FunctionIndex, AB));
+    Call->setAttributes(
+        AttributeList::get(C, AttributeList::FunctionIndex, AB));
     std::unique_ptr<CallInst> Clone(cast<CallInst>(Call->clone()));
     EXPECT_TRUE(Clone->onlyReadsMemory());
   }
@@ -534,7 +536,7 @@ TEST(InstructionsTest, AlterCallBundles) {
   Call->setTailCallKind(CallInst::TailCallKind::TCK_NoTail);
   AttrBuilder AB;
   AB.addAttribute(Attribute::Cold);
-  Call->setAttributes(AttributeSet::get(C, AttributeSet::FunctionIndex, AB));
+  Call->setAttributes(AttributeList::get(C, AttributeList::FunctionIndex, AB));
   Call->setDebugLoc(DebugLoc(MDNode::get(C, None)));
 
   OperandBundleDef NewBundle("after", ConstantInt::get(Int32Ty, 7));
@@ -562,7 +564,8 @@ TEST(InstructionsTest, AlterInvokeBundles) {
       Callee, NormalDest.get(), UnwindDest.get(), Args, OldBundle, "result"));
   AttrBuilder AB;
   AB.addAttribute(Attribute::Cold);
-  Invoke->setAttributes(AttributeSet::get(C, AttributeSet::FunctionIndex, AB));
+  Invoke->setAttributes(
+      AttributeList::get(C, AttributeList::FunctionIndex, AB));
   Invoke->setDebugLoc(DebugLoc(MDNode::get(C, None)));
 
   OperandBundleDef NewBundle("after", ConstantInt::get(Int32Ty, 7));
@@ -579,5 +582,163 @@ TEST(InstructionsTest, AlterInvokeBundles) {
   EXPECT_TRUE(Clone->getOperandBundle("after").hasValue());
 }
 
+TEST_F(ModuleWithFunctionTest, DropPoisonGeneratingFlags) {
+  auto *OnlyBB = BasicBlock::Create(Ctx, "bb", F);
+  auto *Arg0 = &*F->arg_begin();
+
+  IRBuilder<NoFolder> B(Ctx);
+  B.SetInsertPoint(OnlyBB);
+
+  {
+    auto *UI =
+        cast<Instruction>(B.CreateUDiv(Arg0, Arg0, "", /*isExact*/ true));
+    ASSERT_TRUE(UI->isExact());
+    UI->dropPoisonGeneratingFlags();
+    ASSERT_FALSE(UI->isExact());
+  }
+
+  {
+    auto *ShrI =
+        cast<Instruction>(B.CreateLShr(Arg0, Arg0, "", /*isExact*/ true));
+    ASSERT_TRUE(ShrI->isExact());
+    ShrI->dropPoisonGeneratingFlags();
+    ASSERT_FALSE(ShrI->isExact());
+  }
+
+  {
+    auto *AI = cast<Instruction>(
+        B.CreateAdd(Arg0, Arg0, "", /*HasNUW*/ true, /*HasNSW*/ false));
+    ASSERT_TRUE(AI->hasNoUnsignedWrap());
+    AI->dropPoisonGeneratingFlags();
+    ASSERT_FALSE(AI->hasNoUnsignedWrap());
+    ASSERT_FALSE(AI->hasNoSignedWrap());
+  }
+
+  {
+    auto *SI = cast<Instruction>(
+        B.CreateAdd(Arg0, Arg0, "", /*HasNUW*/ false, /*HasNSW*/ true));
+    ASSERT_TRUE(SI->hasNoSignedWrap());
+    SI->dropPoisonGeneratingFlags();
+    ASSERT_FALSE(SI->hasNoUnsignedWrap());
+    ASSERT_FALSE(SI->hasNoSignedWrap());
+  }
+
+  {
+    auto *ShlI = cast<Instruction>(
+        B.CreateShl(Arg0, Arg0, "", /*HasNUW*/ true, /*HasNSW*/ true));
+    ASSERT_TRUE(ShlI->hasNoSignedWrap());
+    ASSERT_TRUE(ShlI->hasNoUnsignedWrap());
+    ShlI->dropPoisonGeneratingFlags();
+    ASSERT_FALSE(ShlI->hasNoUnsignedWrap());
+    ASSERT_FALSE(ShlI->hasNoSignedWrap());
+  }
+
+  {
+    Value *GEPBase = Constant::getNullValue(B.getInt8PtrTy());
+    auto *GI = cast<GetElementPtrInst>(B.CreateInBoundsGEP(GEPBase, {Arg0}));
+    ASSERT_TRUE(GI->isInBounds());
+    GI->dropPoisonGeneratingFlags();
+    ASSERT_FALSE(GI->isInBounds());
+  }
+}
+
+TEST(InstructionsTest, GEPIndices) {
+  LLVMContext Context;
+  IRBuilder<NoFolder> Builder(Context);
+  Type *ElementTy = Builder.getInt8Ty();
+  Type *ArrTy = ArrayType::get(ArrayType::get(ElementTy, 64), 64);
+  Value *Indices[] = {
+    Builder.getInt32(0),
+    Builder.getInt32(13),
+    Builder.getInt32(42) };
+
+  Value *V = Builder.CreateGEP(ArrTy, UndefValue::get(PointerType::getUnqual(ArrTy)),
+                               Indices);
+  ASSERT_TRUE(isa<GetElementPtrInst>(V));
+
+  auto *GEPI = cast<GetElementPtrInst>(V);
+  ASSERT_NE(GEPI->idx_begin(), GEPI->idx_end());
+  ASSERT_EQ(GEPI->idx_end(), std::next(GEPI->idx_begin(), 3));
+  EXPECT_EQ(Indices[0], GEPI->idx_begin()[0]);
+  EXPECT_EQ(Indices[1], GEPI->idx_begin()[1]);
+  EXPECT_EQ(Indices[2], GEPI->idx_begin()[2]);
+  EXPECT_EQ(GEPI->idx_begin(), GEPI->indices().begin());
+  EXPECT_EQ(GEPI->idx_end(), GEPI->indices().end());
+
+  const auto *CGEPI = GEPI;
+  ASSERT_NE(CGEPI->idx_begin(), CGEPI->idx_end());
+  ASSERT_EQ(CGEPI->idx_end(), std::next(CGEPI->idx_begin(), 3));
+  EXPECT_EQ(Indices[0], CGEPI->idx_begin()[0]);
+  EXPECT_EQ(Indices[1], CGEPI->idx_begin()[1]);
+  EXPECT_EQ(Indices[2], CGEPI->idx_begin()[2]);
+  EXPECT_EQ(CGEPI->idx_begin(), CGEPI->indices().begin());
+  EXPECT_EQ(CGEPI->idx_end(), CGEPI->indices().end());
+
+  delete GEPI;
+}
+
+TEST(InstructionsTest, SwitchInst) {
+  LLVMContext C;
+
+  std::unique_ptr<BasicBlock> BB1, BB2, BB3;
+  BB1.reset(BasicBlock::Create(C));
+  BB2.reset(BasicBlock::Create(C));
+  BB3.reset(BasicBlock::Create(C));
+
+  // We create block 0 after the others so that it gets destroyed first and
+  // clears the uses of the other basic blocks.
+  std::unique_ptr<BasicBlock> BB0(BasicBlock::Create(C));
+
+  auto *Int32Ty = Type::getInt32Ty(C);
+
+  SwitchInst *SI =
+      SwitchInst::Create(UndefValue::get(Int32Ty), BB0.get(), 3, BB0.get());
+  SI->addCase(ConstantInt::get(Int32Ty, 1), BB1.get());
+  SI->addCase(ConstantInt::get(Int32Ty, 2), BB2.get());
+  SI->addCase(ConstantInt::get(Int32Ty, 3), BB3.get());
+
+  auto CI = SI->case_begin();
+  ASSERT_NE(CI, SI->case_end());
+  EXPECT_EQ(1, CI->getCaseValue()->getSExtValue());
+  EXPECT_EQ(BB1.get(), CI->getCaseSuccessor());
+  EXPECT_EQ(2, (CI + 1)->getCaseValue()->getSExtValue());
+  EXPECT_EQ(BB2.get(), (CI + 1)->getCaseSuccessor());
+  EXPECT_EQ(3, (CI + 2)->getCaseValue()->getSExtValue());
+  EXPECT_EQ(BB3.get(), (CI + 2)->getCaseSuccessor());
+  EXPECT_EQ(CI + 1, std::next(CI));
+  EXPECT_EQ(CI + 2, std::next(CI, 2));
+  EXPECT_EQ(CI + 3, std::next(CI, 3));
+  EXPECT_EQ(SI->case_end(), CI + 3);
+  EXPECT_EQ(0, CI - CI);
+  EXPECT_EQ(1, (CI + 1) - CI);
+  EXPECT_EQ(2, (CI + 2) - CI);
+  EXPECT_EQ(3, SI->case_end() - CI);
+  EXPECT_EQ(3, std::distance(CI, SI->case_end()));
+
+  auto CCI = const_cast<const SwitchInst *>(SI)->case_begin();
+  SwitchInst::ConstCaseIt CCE = SI->case_end();
+  ASSERT_NE(CCI, SI->case_end());
+  EXPECT_EQ(1, CCI->getCaseValue()->getSExtValue());
+  EXPECT_EQ(BB1.get(), CCI->getCaseSuccessor());
+  EXPECT_EQ(2, (CCI + 1)->getCaseValue()->getSExtValue());
+  EXPECT_EQ(BB2.get(), (CCI + 1)->getCaseSuccessor());
+  EXPECT_EQ(3, (CCI + 2)->getCaseValue()->getSExtValue());
+  EXPECT_EQ(BB3.get(), (CCI + 2)->getCaseSuccessor());
+  EXPECT_EQ(CCI + 1, std::next(CCI));
+  EXPECT_EQ(CCI + 2, std::next(CCI, 2));
+  EXPECT_EQ(CCI + 3, std::next(CCI, 3));
+  EXPECT_EQ(CCE, CCI + 3);
+  EXPECT_EQ(0, CCI - CCI);
+  EXPECT_EQ(1, (CCI + 1) - CCI);
+  EXPECT_EQ(2, (CCI + 2) - CCI);
+  EXPECT_EQ(3, CCE - CCI);
+  EXPECT_EQ(3, std::distance(CCI, CCE));
+
+  // Make sure that the const iterator is compatible with a const auto ref.
+  const auto &Handle = *CCI;
+  EXPECT_EQ(1, Handle.getCaseValue()->getSExtValue());
+  EXPECT_EQ(BB1.get(), Handle.getCaseSuccessor());
+}
+
 } // end anonymous namespace
 } // end namespace llvm
diff --git a/unittests/IR/LegacyPassManagerTest.cpp b/unittests/IR/LegacyPassManagerTest.cpp
index 9dceb976c9375eb21d38511b8460b4004a5801de..0f67d3fb5ac9e71e6a5777ff86d12e983b51683a 100644
--- a/unittests/IR/LegacyPassManagerTest.cpp
+++ b/unittests/IR/LegacyPassManagerTest.cpp
@@ -429,7 +429,7 @@ namespace llvm {
         /*Linkage=*/GlobalValue::ExternalLinkage,
         /*Name=*/"test1", mod);
       func_test1->setCallingConv(CallingConv::C);
-      AttributeSet func_test1_PAL;
+      AttributeList func_test1_PAL;
       func_test1->setAttributes(func_test1_PAL);
 
       Function* func_test2 = Function::Create(
@@ -437,7 +437,7 @@ namespace llvm {
         /*Linkage=*/GlobalValue::ExternalLinkage,
         /*Name=*/"test2", mod);
       func_test2->setCallingConv(CallingConv::C);
-      AttributeSet func_test2_PAL;
+      AttributeList func_test2_PAL;
       func_test2->setAttributes(func_test2_PAL);
 
       Function* func_test3 = Function::Create(
@@ -445,7 +445,7 @@ namespace llvm {
         /*Linkage=*/GlobalValue::ExternalLinkage,
         /*Name=*/"test3", mod);
       func_test3->setCallingConv(CallingConv::C);
-      AttributeSet func_test3_PAL;
+      AttributeList func_test3_PAL;
       func_test3->setAttributes(func_test3_PAL);
 
       Function* func_test4 = Function::Create(
@@ -453,7 +453,7 @@ namespace llvm {
         /*Linkage=*/GlobalValue::ExternalLinkage,
         /*Name=*/"test4", mod);
       func_test4->setCallingConv(CallingConv::C);
-      AttributeSet func_test4_PAL;
+      AttributeList func_test4_PAL;
       func_test4->setAttributes(func_test4_PAL);
 
       // Global Variable Declarations
@@ -474,7 +474,8 @@ namespace llvm {
         // Block entry (label_entry)
         CallInst* int32_3 = CallInst::Create(func_test2, "", label_entry);
         int32_3->setCallingConv(CallingConv::C);
-        int32_3->setTailCall(false);AttributeSet int32_3_PAL;
+        int32_3->setTailCall(false);
+        AttributeList int32_3_PAL;
         int32_3->setAttributes(int32_3_PAL);
 
         ReturnInst::Create(Context, int32_3, label_entry);
@@ -489,7 +490,8 @@ namespace llvm {
         // Block entry (label_entry_5)
         CallInst* int32_6 = CallInst::Create(func_test3, "", label_entry_5);
         int32_6->setCallingConv(CallingConv::C);
-        int32_6->setTailCall(false);AttributeSet int32_6_PAL;
+        int32_6->setTailCall(false);
+        AttributeList int32_6_PAL;
         int32_6->setAttributes(int32_6_PAL);
 
         ReturnInst::Create(Context, int32_6, label_entry_5);
@@ -504,7 +506,8 @@ namespace llvm {
         // Block entry (label_entry_8)
         CallInst* int32_9 = CallInst::Create(func_test1, "", label_entry_8);
         int32_9->setCallingConv(CallingConv::C);
-        int32_9->setTailCall(false);AttributeSet int32_9_PAL;
+        int32_9->setTailCall(false);
+        AttributeList int32_9_PAL;
         int32_9->setAttributes(int32_9_PAL);
 
         ReturnInst::Create(Context, int32_9, label_entry_8);
diff --git a/unittests/IR/MetadataTest.cpp b/unittests/IR/MetadataTest.cpp
index 7bb8d4010d38a79958ccb74ea5c720f313821f49..103ba4c92ddf1c254944681881b26ed53522dca4 100644
--- a/unittests/IR/MetadataTest.cpp
+++ b/unittests/IR/MetadataTest.cpp
@@ -95,7 +95,7 @@ protected:
     return DICompileUnit::getDistinct(Context, 1, getFile(), "clang", false,
                                       "-g", 2, "", DICompileUnit::FullDebug,
                                       getTuple(), getTuple(), getTuple(),
-                                      getTuple(), getTuple(), 0, true);
+                                      getTuple(), getTuple(), 0, true, false);
   }
   DIType *getBasicType(StringRef Name) {
     return DIBasicType::get(Context, dwarf::DW_TAG_unspecified_type, Name);
@@ -103,7 +103,7 @@ protected:
   DIType *getDerivedType() {
     return DIDerivedType::getDistinct(
         Context, dwarf::DW_TAG_pointer_type, "", nullptr, 0, nullptr,
-        getBasicType("basictype"), 1, 2, 0, DINode::FlagZero);
+        getBasicType("basictype"), 1, 2, 0, None, DINode::FlagZero);
   }
   Constant *getConstant() {
     return ConstantInt::get(Type::getInt32Ty(Context), Counter++);
@@ -1053,12 +1053,14 @@ TEST_F(DIDerivedTypeTest, get) {
   DIScope *Scope = getSubprogram();
   DIType *BaseType = getBasicType("basic");
   MDTuple *ExtraData = getTuple();
+  unsigned DWARFAddressSpace = 8;
   DINode::DIFlags Flags5 = static_cast<DINode::DIFlags>(5);
   DINode::DIFlags Flags4 = static_cast<DINode::DIFlags>(4);
 
   auto *N =
       DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type, "something", File,
-                         1, Scope, BaseType, 2, 3, 4, Flags5, ExtraData);
+                         1, Scope, BaseType, 2, 3, 4, DWARFAddressSpace, Flags5,
+                         ExtraData);
   EXPECT_EQ(dwarf::DW_TAG_pointer_type, N->getTag());
   EXPECT_EQ("something", N->getName());
   EXPECT_EQ(File, N->getFile());
@@ -1068,45 +1070,51 @@ TEST_F(DIDerivedTypeTest, get) {
   EXPECT_EQ(2u, N->getSizeInBits());
   EXPECT_EQ(3u, N->getAlignInBits());
   EXPECT_EQ(4u, N->getOffsetInBits());
+  EXPECT_EQ(DWARFAddressSpace, N->getDWARFAddressSpace().getValue());
   EXPECT_EQ(5u, N->getFlags());
   EXPECT_EQ(ExtraData, N->getExtraData());
   EXPECT_EQ(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 2, 3,
-                                  4, Flags5, ExtraData));
+                                  4, DWARFAddressSpace, Flags5, ExtraData));
 
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_reference_type,
                                   "something", File, 1, Scope, BaseType, 2, 3,
-                                  4, Flags5, ExtraData));
+                                  4, DWARFAddressSpace, Flags5, ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type, "else",
-                                  File, 1, Scope, BaseType, 2, 3, 4, Flags5,
-                                  ExtraData));
+                                  File, 1, Scope, BaseType, 2, 3,
+                                  4, DWARFAddressSpace, Flags5, ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", getFile(), 1, Scope, BaseType, 2,
-                                  3, 4, Flags5, ExtraData));
+                                  3, 4, DWARFAddressSpace, Flags5, ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 2, Scope, BaseType, 2, 3,
-                                  4, Flags5, ExtraData));
+                                  4, DWARFAddressSpace, Flags5, ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, getSubprogram(),
-                                  BaseType, 2, 3, 4, Flags5, ExtraData));
+                                  BaseType, 2, 3, 4, DWARFAddressSpace, Flags5,
+                                  ExtraData));
   EXPECT_NE(N, DIDerivedType::get(
                    Context, dwarf::DW_TAG_pointer_type, "something", File, 1,
-                   Scope, getBasicType("basic2"), 2, 3, 4, Flags5, ExtraData));
+                   Scope, getBasicType("basic2"), 2, 3, 4, DWARFAddressSpace,
+                   Flags5, ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 3, 3,
-                                  4, Flags5, ExtraData));
+                                  4, DWARFAddressSpace, Flags5, ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 2, 2,
-                                  4, Flags5, ExtraData));
+                                  4, DWARFAddressSpace, Flags5, ExtraData));
+  EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
+                                  "something", File, 1, Scope, BaseType, 2, 3,
+                                  5, DWARFAddressSpace, Flags5, ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 2, 3,
-                                  5, Flags5, ExtraData));
+                                  4, DWARFAddressSpace + 1, Flags5, ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 2, 3,
-                                  4, Flags4, ExtraData));
+                                  4, DWARFAddressSpace, Flags4, ExtraData));
   EXPECT_NE(N, DIDerivedType::get(Context, dwarf::DW_TAG_pointer_type,
                                   "something", File, 1, Scope, BaseType, 2, 3,
-                                  4, Flags5, getTuple()));
+                                  4, DWARFAddressSpace, Flags5, getTuple()));
 
   TempDIDerivedType Temp = N->clone();
   EXPECT_EQ(N, MDNode::replaceWithUniqued(std::move(Temp)));
@@ -1121,10 +1129,12 @@ TEST_F(DIDerivedTypeTest, getWithLargeValues) {
 
   auto *N = DIDerivedType::get(
       Context, dwarf::DW_TAG_pointer_type, "something", File, 1, Scope,
-      BaseType, UINT64_MAX, UINT32_MAX - 1, UINT64_MAX - 2, Flags, ExtraData);
+      BaseType, UINT64_MAX, UINT32_MAX - 1, UINT64_MAX - 2, UINT32_MAX - 3,
+      Flags, ExtraData);
   EXPECT_EQ(UINT64_MAX, N->getSizeInBits());
   EXPECT_EQ(UINT32_MAX - 1, N->getAlignInBits());
   EXPECT_EQ(UINT64_MAX - 2, N->getOffsetInBits());
+  EXPECT_EQ(UINT32_MAX - 3, N->getDWARFAddressSpace().getValue());
 }
 
 typedef MetadataTest DICompositeTypeTest;
@@ -1406,7 +1416,8 @@ TEST_F(DICompileUnitTest, get) {
   auto *N = DICompileUnit::getDistinct(
       Context, SourceLanguage, File, Producer, IsOptimized, Flags,
       RuntimeVersion, SplitDebugFilename, EmissionKind, EnumTypes,
-      RetainedTypes, GlobalVariables, ImportedEntities, Macros, DWOId, true);
+      RetainedTypes, GlobalVariables, ImportedEntities, Macros, DWOId, true,
+      false);
 
   EXPECT_EQ(dwarf::DW_TAG_compile_unit, N->getTag());
   EXPECT_EQ(SourceLanguage, N->getSourceLanguage());
@@ -1463,7 +1474,7 @@ TEST_F(DICompileUnitTest, replaceArrays) {
   auto *N = DICompileUnit::getDistinct(
       Context, SourceLanguage, File, Producer, IsOptimized, Flags,
       RuntimeVersion, SplitDebugFilename, EmissionKind, EnumTypes,
-      RetainedTypes, nullptr, ImportedEntities, nullptr, DWOId, true);
+      RetainedTypes, nullptr, ImportedEntities, nullptr, DWOId, true, false);
 
   auto *GlobalVariables = MDTuple::getDistinct(Context, None);
   EXPECT_EQ(nullptr, N->getGlobalVariables().get());
diff --git a/unittests/IR/ValueTest.cpp b/unittests/IR/ValueTest.cpp
index 607b7a1bd2c9b93fc6c4b423be25aa1465c07f8b..142444a809c6d92149ed74a7c742a426f95cdcb7 100644
--- a/unittests/IR/ValueTest.cpp
+++ b/unittests/IR/ValueTest.cpp
@@ -40,7 +40,7 @@ TEST(ValueTest, UsedInBasicBlock) {
   Function *F = M->getFunction("f");
 
   EXPECT_FALSE(F->isUsedInBasicBlock(&F->front()));
-  EXPECT_TRUE((++F->arg_begin())->isUsedInBasicBlock(&F->front()));
+  EXPECT_TRUE(std::next(F->arg_begin())->isUsedInBasicBlock(&F->front()));
   EXPECT_TRUE(F->arg_begin()->isUsedInBasicBlock(&F->front()));
 }
 
diff --git a/unittests/IR/VerifierTest.cpp b/unittests/IR/VerifierTest.cpp
index ad6940afd05ef3241f0efa9eb9b0ff0e1c9f8477..188509aadf77a95fe88cabe1195b6a30b20c0ea8 100644
--- a/unittests/IR/VerifierTest.cpp
+++ b/unittests/IR/VerifierTest.cpp
@@ -52,9 +52,9 @@ TEST(VerifierTest, InvalidRetAttribute) {
   Module M("M", C);
   FunctionType *FTy = FunctionType::get(Type::getInt32Ty(C), /*isVarArg=*/false);
   Function *F = cast<Function>(M.getOrInsertFunction("foo", FTy));
-  AttributeSet AS = F->getAttributes();
-  F->setAttributes(AS.addAttribute(C, AttributeSet::ReturnIndex,
-                                   Attribute::UWTable));
+  AttributeList AS = F->getAttributes();
+  F->setAttributes(
+      AS.addAttribute(C, AttributeList::ReturnIndex, Attribute::UWTable));
 
   std::string Error;
   raw_string_ostream ErrorOS(Error);
diff --git a/unittests/Linker/LinkModulesTest.cpp b/unittests/Linker/LinkModulesTest.cpp
index 92c483278be9d88dd14a2695f333aa10eaff05a3..f31409c501211e051bb40f312fdc8fa2b5f11aaf 100644
--- a/unittests/Linker/LinkModulesTest.cpp
+++ b/unittests/Linker/LinkModulesTest.cpp
@@ -317,34 +317,34 @@ TEST_F(LinkModuleTest, RemangleIntrinsics) {
   const char *FooStr =
     "%struct.rtx_def = type { i16 }\n"
     "define void @foo(%struct.rtx_def* %a, i8 %b, i32 %c) {\n"
-    "  call void  @llvm.memset.p0struct.rtx_def.i32(%struct.rtx_def* %a, i8 %b, i32 %c, i32 4, i1 true)\n"
+    "  call void  @llvm.memset.p0s_struct.rtx_defs.i32(%struct.rtx_def* %a, i8 %b, i32 %c, i32 4, i1 true)\n"
     "  ret void\n"
     "}\n"
-    "declare void @llvm.memset.p0struct.rtx_def.i32(%struct.rtx_def*, i8, i32, i32, i1)\n";
+    "declare void @llvm.memset.p0s_struct.rtx_defs.i32(%struct.rtx_def*, i8, i32, i32, i1)\n";
 
   const char *BarStr =
     "%struct.rtx_def = type { i16 }\n"
     "define void @bar(%struct.rtx_def* %a, i8 %b, i32 %c) {\n"
-    "  call void  @llvm.memset.p0struct.rtx_def.i32(%struct.rtx_def* %a, i8 %b, i32 %c, i32 4, i1 true)\n"
+    "  call void  @llvm.memset.p0s_struct.rtx_defs.i32(%struct.rtx_def* %a, i8 %b, i32 %c, i32 4, i1 true)\n"
     "  ret void\n"
     "}\n"
-    "declare void @llvm.memset.p0struct.rtx_def.i32(%struct.rtx_def*, i8, i32, i32, i1)\n";
+    "declare void @llvm.memset.p0s_struct.rtx_defs.i32(%struct.rtx_def*, i8, i32, i32, i1)\n";
 
   std::unique_ptr<Module> Foo = parseAssemblyString(FooStr, Err, C);
   assert(Foo);
   ASSERT_TRUE(Foo.get());
   // Foo is loaded first, so the type and the intrinsic have theis original
   // names.
-  ASSERT_TRUE(Foo->getFunction("llvm.memset.p0struct.rtx_def.i32"));
-  ASSERT_FALSE(Foo->getFunction("llvm.memset.p0struct.rtx_def.0.i32"));
+  ASSERT_TRUE(Foo->getFunction("llvm.memset.p0s_struct.rtx_defs.i32"));
+  ASSERT_FALSE(Foo->getFunction("llvm.memset.p0s_struct.rtx_defs.0.i32"));
 
   std::unique_ptr<Module> Bar = parseAssemblyString(BarStr, Err, C);
   assert(Bar);
   ASSERT_TRUE(Bar.get());
   // Bar is loaded after Foo, so the type is renamed to struct.rtx_def.0. Check
   // that the intrinsic is also renamed.
-  ASSERT_FALSE(Bar->getFunction("llvm.memset.p0struct.rtx_def.i32"));
-  ASSERT_TRUE(Bar->getFunction("llvm.memset.p0struct.rtx_def.0.i32"));
+  ASSERT_FALSE(Bar->getFunction("llvm.memset.p0s_struct.rtx_defs.i32"));
+  ASSERT_TRUE(Bar->getFunction("llvm.memset.p0s_struct.rtx_def.0s.i32"));
 
   // Link two modules together.
   auto Dst = llvm::make_unique<Module>("Linked", C);
@@ -356,7 +356,7 @@ TEST_F(LinkModuleTest, RemangleIntrinsics) {
   // "struct.rtx_def" from Foo and "struct.rtx_def.0" from Bar are isomorphic
   // types, so they must be uniquified by linker. Check that they use the same
   // intrinsic definition.
-  Function *F = Foo->getFunction("llvm.memset.p0struct.rtx_def.i32");
+  Function *F = Foo->getFunction("llvm.memset.p0s_struct.rtx_defs.i32");
   ASSERT_EQ(F->getNumUses(), (unsigned)2);
 }
 
diff --git a/unittests/MI/LiveIntervalTest.cpp b/unittests/MI/LiveIntervalTest.cpp
index 1d6df97a32007c20bac1f6bb9619e42cb5199bd4..026fb42d345f75300190f9d4baf2ab8d2280e203 100644
--- a/unittests/MI/LiveIntervalTest.cpp
+++ b/unittests/MI/LiveIntervalTest.cpp
@@ -142,15 +142,15 @@ static void liveIntervalTest(StringRef MIRFunc, LiveIntervalTest T) {
   legacy::PassManager PM;
 
   SmallString<160> S;
-  StringRef MIRString = (Twine(
-"---\n"
-"...\n"
-"name: func\n"
-"registers:\n"
-"  - { id: 0, class: sreg_64 }\n"
-"body: |\n"
-"  bb.0:\n"
-  ) + Twine(MIRFunc) + Twine("...\n")).toNullTerminatedStringRef(S);
+  StringRef MIRString = (Twine(R"MIR(
+---
+...
+name: func
+registers:
+  - { id: 0, class: sreg_64 }
+body: |
+  bb.0:
+)MIR") + Twine(MIRFunc) + Twine("...\n")).toNullTerminatedStringRef(S);
   std::unique_ptr<MIRParser> MIR;
   std::unique_ptr<Module> M = parseMIR(Context, PM, MIR, *TM, MIRString,
                                        "func");
@@ -167,66 +167,66 @@ INITIALIZE_PASS(TestPass, "testpass", "testpass", false, false)
 
 TEST(LiveIntervalTest, MoveUpDef) {
   // Value defined.
-  liveIntervalTest(
-"    S_NOP 0\n"
-"    S_NOP 0\n"
-"    early-clobber %0 = IMPLICIT_DEF\n"
-"    S_NOP 0, implicit %0\n",
-  [](MachineFunction &MF, LiveIntervals &LIS) {
+  liveIntervalTest(R"MIR(
+    S_NOP 0
+    S_NOP 0
+    early-clobber %0 = IMPLICIT_DEF
+    S_NOP 0, implicit %0
+)MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
     testHandleMove(MF, LIS, 2, 1);
   });
 }
 
 TEST(LiveIntervalTest, MoveUpRedef) {
-  liveIntervalTest(
-"    %0 = IMPLICIT_DEF\n"
-"    S_NOP 0\n"
-"    %0 = IMPLICIT_DEF implicit %0(tied-def 0)\n"
-"    S_NOP 0, implicit %0\n",
-  [](MachineFunction &MF, LiveIntervals &LIS) {
+  liveIntervalTest(R"MIR(
+    %0 = IMPLICIT_DEF
+    S_NOP 0
+    %0 = IMPLICIT_DEF implicit %0(tied-def 0)
+    S_NOP 0, implicit %0
+)MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
     testHandleMove(MF, LIS, 2, 1);
   });
 }
 
 TEST(LiveIntervalTest, MoveUpEarlyDef) {
-  liveIntervalTest(
-"    S_NOP 0\n"
-"    S_NOP 0\n"
-"    early-clobber %0 = IMPLICIT_DEF\n"
-"    S_NOP 0, implicit %0\n",
-  [](MachineFunction &MF, LiveIntervals &LIS) {
+  liveIntervalTest(R"MIR(
+    S_NOP 0
+    S_NOP 0
+    early-clobber %0 = IMPLICIT_DEF
+    S_NOP 0, implicit %0
+)MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
     testHandleMove(MF, LIS, 2, 1);
   });
 }
 
 TEST(LiveIntervalTest, MoveUpEarlyRedef) {
-  liveIntervalTest(
-"    %0 = IMPLICIT_DEF\n"
-"    S_NOP 0\n"
-"    early-clobber %0 = IMPLICIT_DEF implicit %0(tied-def 0)\n"
-"    S_NOP 0, implicit %0\n",
-  [](MachineFunction &MF, LiveIntervals &LIS) {
+  liveIntervalTest(R"MIR(
+    %0 = IMPLICIT_DEF
+    S_NOP 0
+    early-clobber %0 = IMPLICIT_DEF implicit %0(tied-def 0)
+    S_NOP 0, implicit %0
+)MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
     testHandleMove(MF, LIS, 2, 1);
   });
 }
 
 TEST(LiveIntervalTest, MoveUpKill) {
-  liveIntervalTest(
-"    %0 = IMPLICIT_DEF\n"
-"    S_NOP 0\n"
-"    S_NOP 0, implicit %0\n",
-  [](MachineFunction &MF, LiveIntervals &LIS) {
+  liveIntervalTest(R"MIR(
+    %0 = IMPLICIT_DEF
+    S_NOP 0
+    S_NOP 0, implicit %0
+)MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
     testHandleMove(MF, LIS, 2, 1);
   });
 }
 
 TEST(LiveIntervalTest, MoveUpKillFollowing) {
-  liveIntervalTest(
-"    %0 = IMPLICIT_DEF\n"
-"    S_NOP 0\n"
-"    S_NOP 0, implicit %0\n"
-"    S_NOP 0, implicit %0\n",
-  [](MachineFunction &MF, LiveIntervals &LIS) {
+  liveIntervalTest(R"MIR(
+    %0 = IMPLICIT_DEF
+    S_NOP 0
+    S_NOP 0, implicit %0
+    S_NOP 0, implicit %0
+)MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
     testHandleMove(MF, LIS, 2, 1);
   });
 }
@@ -236,77 +236,77 @@ TEST(LiveIntervalTest, MoveUpKillFollowing) {
 
 TEST(LiveIntervalTest, MoveDownDef) {
   // Value defined.
-  liveIntervalTest(
-"    S_NOP 0\n"
-"    early-clobber %0 = IMPLICIT_DEF\n"
-"    S_NOP 0\n"
-"    S_NOP 0, implicit %0\n",
-  [](MachineFunction &MF, LiveIntervals &LIS) {
+  liveIntervalTest(R"MIR(
+    S_NOP 0
+    early-clobber %0 = IMPLICIT_DEF
+    S_NOP 0
+    S_NOP 0, implicit %0
+)MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
     testHandleMove(MF, LIS, 1, 2);
   });
 }
 
 TEST(LiveIntervalTest, MoveDownRedef) {
-  liveIntervalTest(
-"    %0 = IMPLICIT_DEF\n"
-"    %0 = IMPLICIT_DEF implicit %0(tied-def 0)\n"
-"    S_NOP 0\n"
-"    S_NOP 0, implicit %0\n",
-  [](MachineFunction &MF, LiveIntervals &LIS) {
+  liveIntervalTest(R"MIR(
+    %0 = IMPLICIT_DEF
+    %0 = IMPLICIT_DEF implicit %0(tied-def 0)
+    S_NOP 0
+    S_NOP 0, implicit %0
+)MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
     testHandleMove(MF, LIS, 1, 2);
   });
 }
 
 TEST(LiveIntervalTest, MoveDownEarlyDef) {
-  liveIntervalTest(
-"    S_NOP 0\n"
-"    early-clobber %0 = IMPLICIT_DEF\n"
-"    S_NOP 0\n"
-"    S_NOP 0, implicit %0\n",
-  [](MachineFunction &MF, LiveIntervals &LIS) {
+  liveIntervalTest(R"MIR(
+    S_NOP 0
+    early-clobber %0 = IMPLICIT_DEF
+    S_NOP 0
+    S_NOP 0, implicit %0
+)MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
     testHandleMove(MF, LIS, 1, 2);
   });
 }
 
 TEST(LiveIntervalTest, MoveDownEarlyRedef) {
-  liveIntervalTest(
-"    %0 = IMPLICIT_DEF\n"
-"    early-clobber %0 = IMPLICIT_DEF implicit %0(tied-def 0)\n"
-"    S_NOP 0\n"
-"    S_NOP 0, implicit %0\n",
-  [](MachineFunction &MF, LiveIntervals &LIS) {
+  liveIntervalTest(R"MIR(
+    %0 = IMPLICIT_DEF
+    early-clobber %0 = IMPLICIT_DEF implicit %0(tied-def 0)
+    S_NOP 0
+    S_NOP 0, implicit %0
+)MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
     testHandleMove(MF, LIS, 1, 2);
   });
 }
 
 TEST(LiveIntervalTest, MoveDownKill) {
-  liveIntervalTest(
-"    %0 = IMPLICIT_DEF\n"
-"    S_NOP 0, implicit %0\n"
-"    S_NOP 0\n",
-  [](MachineFunction &MF, LiveIntervals &LIS) {
+  liveIntervalTest(R"MIR(
+    %0 = IMPLICIT_DEF
+    S_NOP 0, implicit %0
+    S_NOP 0
+)MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
     testHandleMove(MF, LIS, 1, 2);
   });
 }
 
 TEST(LiveIntervalTest, MoveDownKillFollowing) {
-  liveIntervalTest(
-"    %0 = IMPLICIT_DEF\n"
-"    S_NOP 0\n"
-"    S_NOP 0, implicit %0\n"
-"    S_NOP 0, implicit %0\n",
-  [](MachineFunction &MF, LiveIntervals &LIS) {
+  liveIntervalTest(R"MIR(
+    %0 = IMPLICIT_DEF
+    S_NOP 0
+    S_NOP 0, implicit %0
+    S_NOP 0, implicit %0
+)MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
     testHandleMove(MF, LIS, 1, 2);
   });
 }
 
 TEST(LiveIntervalTest, MoveUndefUse) {
-  liveIntervalTest(
-"    %0 = IMPLICIT_DEF\n"
-"    S_NOP 0, implicit undef %0\n"
-"    S_NOP 0, implicit %0\n"
-"    S_NOP 0\n",
-  [](MachineFunction &MF, LiveIntervals &LIS) {
+  liveIntervalTest(R"MIR(
+    %0 = IMPLICIT_DEF
+    S_NOP 0, implicit undef %0
+    S_NOP 0, implicit %0
+    S_NOP 0
+)MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
     testHandleMove(MF, LIS, 1, 3);
   });
 }
@@ -315,44 +315,44 @@ TEST(LiveIntervalTest, MoveUpValNos) {
   // handleMoveUp() had a bug where it would reuse the value number of the
   // destination segment, even though we have no guarntee that this valno wasn't
   // used in other segments.
-  liveIntervalTest(
-"    successors: %bb.1, %bb.2\n"
-"    %0 = IMPLICIT_DEF\n"
-"    S_CBRANCH_VCCNZ %bb.2, implicit undef %vcc\n"
-"    S_BRANCH %bb.1\n"
-"  bb.2:\n"
-"    S_NOP 0, implicit %0\n"
-"  bb.1:\n"
-"    successors: %bb.2\n"
-"    %0 = IMPLICIT_DEF implicit %0(tied-def 0)\n"
-"    %0 = IMPLICIT_DEF implicit %0(tied-def 0)\n"
-"    %0 = IMPLICIT_DEF implicit %0(tied-def 0)\n"
-"    S_BRANCH %bb.2\n",
-  [](MachineFunction &MF, LiveIntervals &LIS) {
+  liveIntervalTest(R"MIR(
+    successors: %bb.1, %bb.2
+    %0 = IMPLICIT_DEF
+    S_CBRANCH_VCCNZ %bb.2, implicit undef %vcc
+    S_BRANCH %bb.1
+  bb.2:
+    S_NOP 0, implicit %0
+  bb.1:
+    successors: %bb.2
+    %0 = IMPLICIT_DEF implicit %0(tied-def 0)
+    %0 = IMPLICIT_DEF implicit %0(tied-def 0)
+    %0 = IMPLICIT_DEF implicit %0(tied-def 0)
+    S_BRANCH %bb.2
+)MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
     testHandleMove(MF, LIS, 2, 0, 2);
   });
 }
 
 TEST(LiveIntervalTest, MoveOverUndefUse0) {
   // findLastUseBefore() used by handleMoveUp() must ignore undef operands.
-  liveIntervalTest(
-"    %0 = IMPLICIT_DEF\n"
-"    S_NOP 0\n"
-"    S_NOP 0, implicit undef %0\n"
-"    %0 = IMPLICIT_DEF implicit %0(tied-def 0)\n",
-  [](MachineFunction &MF, LiveIntervals &LIS) {
+  liveIntervalTest(R"MIR(
+    %0 = IMPLICIT_DEF
+    S_NOP 0
+    S_NOP 0, implicit undef %0
+    %0 = IMPLICIT_DEF implicit %0(tied-def 0)
+)MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
     testHandleMove(MF, LIS, 3, 1);
   });
 }
 
 TEST(LiveIntervalTest, MoveOverUndefUse1) {
   // findLastUseBefore() used by handleMoveUp() must ignore undef operands.
-  liveIntervalTest(
-"    %sgpr0 = IMPLICIT_DEF\n"
-"    S_NOP 0\n"
-"    S_NOP 0, implicit undef %sgpr0\n"
-"    %sgpr0 = IMPLICIT_DEF implicit %sgpr0(tied-def 0)\n",
-  [](MachineFunction &MF, LiveIntervals &LIS) {
+  liveIntervalTest(R"MIR(
+    %sgpr0 = IMPLICIT_DEF
+    S_NOP 0
+    S_NOP 0, implicit undef %sgpr0
+    %sgpr0 = IMPLICIT_DEF implicit %sgpr0(tied-def 0)
+)MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
     testHandleMove(MF, LIS, 3, 1);
   });
 }
@@ -360,21 +360,21 @@ TEST(LiveIntervalTest, MoveOverUndefUse1) {
 TEST(LiveIntervalTest, SubRegMoveDown) {
   // Subregister ranges can have holes inside a basic block. Check for a
   // movement of the form 32->150 in a liverange [16, 32) [100,200).
-  liveIntervalTest(
-"    successors: %bb.1, %bb.2\n"
-"    %0 = IMPLICIT_DEF\n"
-"    S_CBRANCH_VCCNZ %bb.2, implicit undef %vcc\n"
-"    S_BRANCH %bb.1\n"
-"  bb.2:\n"
-"    successors: %bb.1\n"
-"    S_NOP 0, implicit %0.sub0\n"
-"    S_NOP 0, implicit %0.sub1\n"
-"    S_NOP 0\n"
-"    undef %0.sub0 = IMPLICIT_DEF\n"
-"    %0.sub1 = IMPLICIT_DEF\n"
-"  bb.1:\n"
-"    S_NOP 0, implicit %0\n",
-  [](MachineFunction &MF, LiveIntervals &LIS) {
+  liveIntervalTest(R"MIR(
+    successors: %bb.1, %bb.2
+    %0 = IMPLICIT_DEF
+    S_CBRANCH_VCCNZ %bb.2, implicit undef %vcc
+    S_BRANCH %bb.1
+  bb.2:
+    successors: %bb.1
+    S_NOP 0, implicit %0.sub0
+    S_NOP 0, implicit %0.sub1
+    S_NOP 0
+    undef %0.sub0 = IMPLICIT_DEF
+    %0.sub1 = IMPLICIT_DEF
+  bb.1:
+    S_NOP 0, implicit %0
+)MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
     // Scheduler behaviour: Clear def,read-undef flag and move.
     MachineInstr &MI = getMI(MF, 3, /*BlockNum=*/1);
     MI.getOperand(0).setIsUndef(false);
@@ -382,6 +382,24 @@ TEST(LiveIntervalTest, SubRegMoveDown) {
   });
 }
 
+TEST(LiveIntervalTest, SubRegMoveUp) {
+  // handleMoveUp had a bug not updating valno of segment incoming to bb.2
+  // after swapping subreg definitions.
+  liveIntervalTest(R"MIR(
+    successors: %bb.1, %bb.2
+    undef %0.sub0 = IMPLICIT_DEF
+    %0.sub1 = IMPLICIT_DEF
+    S_CBRANCH_VCCNZ %bb.2, implicit undef %vcc
+    S_BRANCH %bb.1
+  bb.1:
+    S_NOP 0, implicit %0.sub1
+  bb.2:
+    S_NOP 0, implicit %0.sub1
+)MIR", [](MachineFunction &MF, LiveIntervals &LIS) {
+    testHandleMove(MF, LIS, 1, 0);
+  });
+}
+
 int main(int argc, char **argv) {
   ::testing::InitGoogleTest(&argc, argv);
   initLLVM();
diff --git a/unittests/Object/CMakeLists.txt b/unittests/Object/CMakeLists.txt
index 7a63c167a30b9aabb1b0700272d8f13b5fb5b654..e1376bffbc0f59a760a5cc568daf22c50f70c8c2 100644
--- a/unittests/Object/CMakeLists.txt
+++ b/unittests/Object/CMakeLists.txt
@@ -4,5 +4,6 @@ set(LLVM_LINK_COMPONENTS
 
 add_llvm_unittest(ObjectTests
   SymbolSizeTest.cpp
+  SymbolicFileTest.cpp
   )
 
diff --git a/unittests/Object/SymbolicFileTest.cpp b/unittests/Object/SymbolicFileTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..ec954e5e67d08abb90eaf720a83a84cb146c6ea4
--- /dev/null
+++ b/unittests/Object/SymbolicFileTest.cpp
@@ -0,0 +1,42 @@
+//===- SymbolicFileTest.cpp - Tests for SymbolicFile.cpp ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Object/SymbolicFile.h"
+#include "llvm/Support/Host.h"
+#include "llvm/Support/raw_ostream.h"
+#include "gtest/gtest.h"
+#include <sstream>
+
+TEST(Object, DataRefImplOstream) {
+  std::string s;
+  llvm::raw_string_ostream OS(s);
+  llvm::object::DataRefImpl Data;
+  Data.d.a = 0xeeee0000;
+  Data.d.b = 0x0000ffff;
+
+  static_assert(sizeof Data.p == sizeof(uint64_t) ||
+                    sizeof Data.p == sizeof(uint32_t),
+                "Test expected pointer type to be 32 or 64-bit.");
+
+  char const *Expected;
+
+  if (sizeof Data.p == sizeof(uint64_t)) {
+    Expected = llvm::sys::IsLittleEndianHost
+                             ? "(0xffffeeee0000 (0xeeee0000, 0x0000ffff))"
+                             : "(0xeeee00000000ffff (0xeeee0000, 0x0000ffff))";
+  }
+  else {
+    Expected = "(0xeeee0000 (0xeeee0000, 0x0000ffff))";
+  }
+
+  OS << Data;
+  OS.flush();
+
+  EXPECT_EQ(Expected, s);
+}
diff --git a/unittests/ProfileData/CoverageMappingTest.cpp b/unittests/ProfileData/CoverageMappingTest.cpp
index 49eab4ad7887a59db9e16db60e035c7394eaa06b..0783a23a67b06689946121a8c96acfe1739b8b95 100644
--- a/unittests/ProfileData/CoverageMappingTest.cpp
+++ b/unittests/ProfileData/CoverageMappingTest.cpp
@@ -328,7 +328,7 @@ TEST_P(CoverageMappingTest, load_coverage_for_several_functions) {
   loadCoverageMapping();
 
   const auto FunctionRecords = LoadedCoverage->getCoveredFunctions();
-  EXPECT_EQ(2U, std::distance(FunctionRecords.begin(), FunctionRecords.end()));
+  EXPECT_EQ(2, std::distance(FunctionRecords.begin(), FunctionRecords.end()));
   for (const auto &FunctionRecord : FunctionRecords) {
     CoverageData Data = LoadedCoverage->getCoverageForFunction(FunctionRecord);
     std::vector<CoverageSegment> Segments(Data.begin(), Data.end());
diff --git a/unittests/Support/ARMAttributeParser.cpp b/unittests/Support/ARMAttributeParser.cpp
index 82cbcd1fc153d26a4cb5a149766d38fc2e17463f..c2df6537ff63d936fb49327ebcc2155c24497f7c 100644
--- a/unittests/Support/ARMAttributeParser.cpp
+++ b/unittests/Support/ARMAttributeParser.cpp
@@ -26,12 +26,13 @@ struct AttributeSection {
 };
 
 bool testBuildAttr(unsigned Tag, unsigned Value,
-                      unsigned ExpectedTag, unsigned ExpectedValue) {
+                   unsigned ExpectedTag, unsigned ExpectedValue) {
   std::string buffer;
   raw_string_ostream OS(buffer);
   AttributeSection Section(Tag, Value);
   Section.write(OS);
-  ArrayRef<uint8_t> Bytes((uint8_t*)OS.str().c_str(), OS.str().size());
+  ArrayRef<uint8_t> Bytes(
+    reinterpret_cast<const uint8_t*>(OS.str().c_str()), OS.str().size());
 
   ARMAttributeParser Parser;
   Parser.Parse(Bytes, true);
diff --git a/unittests/Support/AllocatorTest.cpp b/unittests/Support/AllocatorTest.cpp
index 4b544641e9bffa2fa4035701bb4dbe149f5e1026..4897c47eb28bacec0375d7ef5ed74c8f221a4210 100644
--- a/unittests/Support/AllocatorTest.cpp
+++ b/unittests/Support/AllocatorTest.cpp
@@ -17,9 +17,9 @@ namespace {
 
 TEST(AllocatorTest, Basics) {
   BumpPtrAllocator Alloc;
-  int *a = (int*)Alloc.Allocate(sizeof(int), 1);
-  int *b = (int*)Alloc.Allocate(sizeof(int) * 10, 1);
-  int *c = (int*)Alloc.Allocate(sizeof(int), 1);
+  int *a = (int*)Alloc.Allocate(sizeof(int), alignof(int));
+  int *b = (int*)Alloc.Allocate(sizeof(int) * 10, alignof(int));
+  int *c = (int*)Alloc.Allocate(sizeof(int), alignof(int));
   *a = 1;
   b[0] = 2;
   b[9] = 2;
diff --git a/unittests/Support/BinaryStreamTest.cpp b/unittests/Support/BinaryStreamTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..1e646a6cf90013050e0cc47f3b29cd97c2c13316
--- /dev/null
+++ b/unittests/Support/BinaryStreamTest.cpp
@@ -0,0 +1,711 @@
+//===- llvm/unittest/Support/BinaryStreamTest.cpp -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/BinaryItemStream.h"
+#include "llvm/Support/BinaryStreamArray.h"
+#include "llvm/Support/BinaryStreamReader.h"
+#include "llvm/Support/BinaryStreamRef.h"
+#include "llvm/Support/BinaryStreamWriter.h"
+#include "gtest/gtest.h"
+
+#include <unordered_map>
+
+using namespace llvm;
+using namespace llvm::support;
+
+#define EXPECT_NO_ERROR(Err)                                                   \
+  {                                                                            \
+    auto E = Err;                                                              \
+    EXPECT_FALSE(static_cast<bool>(E));                                        \
+    if (E)                                                                     \
+      consumeError(std::move(E));                                              \
+  }
+
+#define ASSERT_NO_ERROR(Err)                                                   \
+  {                                                                            \
+    auto E = Err;                                                              \
+    ASSERT_FALSE(static_cast<bool>(E));                                        \
+    if (E)                                                                     \
+      consumeError(std::move(E));                                              \
+  }
+
+#define EXPECT_ERROR(Err)                                                      \
+  {                                                                            \
+    auto E = Err;                                                              \
+    EXPECT_TRUE(static_cast<bool>(E));                                         \
+    if (E)                                                                     \
+      consumeError(std::move(E));                                              \
+  }
+
+namespace {
+
+class BrokenStream : public WritableBinaryStream {
+public:
+  BrokenStream(MutableArrayRef<uint8_t> Data, endianness Endian,
+                      uint32_t Align)
+      : Data(Data), PartitionIndex(alignDown(Data.size() / 2, Align)),
+        Endian(Endian) {}
+
+  endianness getEndian() const override { return Endian; }
+
+  Error readBytes(uint32_t Offset, uint32_t Size,
+                  ArrayRef<uint8_t> &Buffer) override {
+    if (auto EC = checkOffset(Offset, Size))
+      return EC;
+    uint32_t S = startIndex(Offset);
+    auto Ref = Data.drop_front(S);
+    if (Ref.size() >= Size) {
+      Buffer = Ref.take_front(Size);
+      return Error::success();
+    }
+
+    uint32_t BytesLeft = Size - Ref.size();
+    uint8_t *Ptr = Allocator.Allocate<uint8_t>(Size);
+    ::memcpy(Ptr, Ref.data(), Ref.size());
+    ::memcpy(Ptr + Ref.size(), Data.data(), BytesLeft);
+    Buffer = makeArrayRef<uint8_t>(Ptr, Size);
+    return Error::success();
+  }
+
+  Error readLongestContiguousChunk(uint32_t Offset,
+                                   ArrayRef<uint8_t> &Buffer) override {
+    if (auto EC = checkOffset(Offset, 1))
+      return EC;
+    uint32_t S = startIndex(Offset);
+    Buffer = Data.drop_front(S);
+    return Error::success();
+  }
+
+  uint32_t getLength() override { return Data.size(); }
+
+  Error writeBytes(uint32_t Offset, ArrayRef<uint8_t> SrcData) override {
+    if (auto EC = checkOffset(Offset, SrcData.size()))
+      return EC;
+    if (SrcData.empty())
+      return Error::success();
+
+    uint32_t S = startIndex(Offset);
+    MutableArrayRef<uint8_t> Ref(Data);
+    Ref = Ref.drop_front(S);
+    if (Ref.size() >= SrcData.size()) {
+      ::memcpy(Ref.data(), SrcData.data(), SrcData.size());
+      return Error::success();
+    }
+
+    uint32_t BytesLeft = SrcData.size() - Ref.size();
+    ::memcpy(Ref.data(), SrcData.data(), Ref.size());
+    ::memcpy(&Data[0], SrcData.data() + Ref.size(), BytesLeft);
+    return Error::success();
+  }
+  Error commit() override { return Error::success(); }
+
+private:
+  uint32_t startIndex(uint32_t Offset) const {
+    return (Offset + PartitionIndex) % Data.size();
+  }
+
+  uint32_t endIndex(uint32_t Offset, uint32_t Size) const {
+    return (startIndex(Offset) + Size - 1) % Data.size();
+  }
+
+  // Buffer is organized like this:
+  // -------------------------------------------------
+  // | N/2 | N/2+1 | ... | N-1 | 0 | 1 | ... | N/2-1 |
+  // -------------------------------------------------
+  // So reads from the beginning actually come from the middle.
+  MutableArrayRef<uint8_t> Data;
+  uint32_t PartitionIndex = 0;
+  endianness Endian;
+  BumpPtrAllocator Allocator;
+};
+
+constexpr endianness Endians[] = {big, little, native};
+constexpr uint32_t NumEndians = llvm::array_lengthof(Endians);
+constexpr uint32_t NumStreams = 2 * NumEndians;
+
+class BinaryStreamTest : public testing::Test {
+
+public:
+  BinaryStreamTest() {}
+
+  void SetUp() override {
+    Streams.clear();
+    Streams.resize(NumStreams);
+    for (uint32_t I = 0; I < NumStreams; ++I)
+      Streams[I].IsContiguous = (I % 2 == 0);
+
+    InputData.clear();
+    OutputData.clear();
+  }
+
+protected:
+  struct StreamPair {
+    bool IsContiguous;
+    std::unique_ptr<BinaryStream> Input;
+    std::unique_ptr<WritableBinaryStream> Output;
+  };
+
+  void initializeInput(ArrayRef<uint8_t> Input, uint32_t Align) {
+    InputData = Input;
+
+    BrokenInputData.resize(InputData.size());
+    if (!Input.empty()) {
+      uint32_t PartitionIndex = alignDown(InputData.size() / 2, Align);
+      uint32_t RightBytes = InputData.size() - PartitionIndex;
+      uint32_t LeftBytes = PartitionIndex;
+      if (RightBytes > 0)
+        ::memcpy(&BrokenInputData[PartitionIndex], Input.data(), RightBytes);
+      if (LeftBytes > 0)
+        ::memcpy(&BrokenInputData[0], Input.data() + RightBytes, LeftBytes);
+    }
+
+    for (uint32_t I = 0; I < NumEndians; ++I) {
+      auto InByteStream =
+          llvm::make_unique<BinaryByteStream>(InputData, Endians[I]);
+      auto InBrokenStream = llvm::make_unique<BrokenStream>(
+          BrokenInputData, Endians[I], Align);
+
+      Streams[I * 2].Input = std::move(InByteStream);
+      Streams[I * 2 + 1].Input = std::move(InBrokenStream);
+    }
+  }
+
+  void initializeOutput(uint32_t Size, uint32_t Align) {
+    OutputData.resize(Size);
+    BrokenOutputData.resize(Size);
+
+    for (uint32_t I = 0; I < NumEndians; ++I) {
+      Streams[I * 2].Output =
+          llvm::make_unique<MutableBinaryByteStream>(OutputData, Endians[I]);
+      Streams[I * 2 + 1].Output = llvm::make_unique<BrokenStream>(
+          BrokenOutputData, Endians[I], Align);
+    }
+  }
+
+  void initializeOutputFromInput(uint32_t Align) {
+    for (uint32_t I = 0; I < NumEndians; ++I) {
+      Streams[I * 2].Output =
+          llvm::make_unique<MutableBinaryByteStream>(InputData, Endians[I]);
+      Streams[I * 2 + 1].Output = llvm::make_unique<BrokenStream>(
+          BrokenInputData, Endians[I], Align);
+    }
+  }
+
+  void initializeInputFromOutput(uint32_t Align) {
+    for (uint32_t I = 0; I < NumEndians; ++I) {
+      Streams[I * 2].Input =
+          llvm::make_unique<BinaryByteStream>(OutputData, Endians[I]);
+      Streams[I * 2 + 1].Input = llvm::make_unique<BrokenStream>(
+          BrokenOutputData, Endians[I], Align);
+    }
+  }
+
+  std::vector<uint8_t> InputData;
+  std::vector<uint8_t> BrokenInputData;
+
+  std::vector<uint8_t> OutputData;
+  std::vector<uint8_t> BrokenOutputData;
+
+  std::vector<StreamPair> Streams;
+};
+
+// Tests that we can read from a BinaryByteStream without a StreamReader.
+TEST_F(BinaryStreamTest, BinaryByteStreamBounds) {
+  std::vector<uint8_t> InputData = {1, 2, 3, 4, 5};
+  initializeInput(InputData, 1);
+
+  for (auto &Stream : Streams) {
+    ArrayRef<uint8_t> Buffer;
+
+    // 1. If the read fits it should work.
+    ASSERT_EQ(InputData.size(), Stream.Input->getLength());
+    ASSERT_NO_ERROR(Stream.Input->readBytes(2, 1, Buffer));
+    EXPECT_EQ(makeArrayRef(InputData).slice(2, 1), Buffer);
+    ASSERT_NO_ERROR(Stream.Input->readBytes(0, 4, Buffer));
+    EXPECT_EQ(makeArrayRef(InputData).slice(0, 4), Buffer);
+
+    // 2. Reading past the bounds of the input should fail.
+    EXPECT_ERROR(Stream.Input->readBytes(4, 2, Buffer));
+  }
+}
+
+TEST_F(BinaryStreamTest, StreamRefBounds) {
+  std::vector<uint8_t> InputData = {1, 2, 3, 4, 5};
+  initializeInput(InputData, 1);
+
+  for (const auto &Stream : Streams) {
+    ArrayRef<uint8_t> Buffer;
+    BinaryStreamRef Ref(*Stream.Input);
+
+    // Read 1 byte from offset 2 should work
+    ASSERT_EQ(InputData.size(), Ref.getLength());
+    ASSERT_NO_ERROR(Ref.readBytes(2, 1, Buffer));
+    EXPECT_EQ(makeArrayRef(InputData).slice(2, 1), Buffer);
+
+    // Reading everything from offset 2 on.
+    ASSERT_NO_ERROR(Ref.readLongestContiguousChunk(2, Buffer));
+    if (Stream.IsContiguous)
+      EXPECT_EQ(makeArrayRef(InputData).slice(2), Buffer);
+    else
+      EXPECT_FALSE(Buffer.empty());
+
+    // Reading 6 bytes from offset 0 is too big.
+    EXPECT_ERROR(Ref.readBytes(0, 6, Buffer));
+    EXPECT_ERROR(Ref.readLongestContiguousChunk(6, Buffer));
+
+    // Reading 1 byte from offset 2 after dropping 1 byte is the same as reading
+    // 1 byte from offset 3.
+    Ref = Ref.drop_front(1);
+    ASSERT_NO_ERROR(Ref.readBytes(2, 1, Buffer));
+    if (Stream.IsContiguous)
+      EXPECT_EQ(makeArrayRef(InputData).slice(3, 1), Buffer);
+    else
+      EXPECT_FALSE(Buffer.empty());
+
+    // Reading everything from offset 2 on after dropping 1 byte.
+    ASSERT_NO_ERROR(Ref.readLongestContiguousChunk(2, Buffer));
+    if (Stream.IsContiguous)
+      EXPECT_EQ(makeArrayRef(InputData).slice(3), Buffer);
+    else
+      EXPECT_FALSE(Buffer.empty());
+
+    // Reading 2 bytes from offset 2 after dropping 2 bytes is the same as
+    // reading 2 bytes from offset 4, and should fail.
+    Ref = Ref.drop_front(1);
+    EXPECT_ERROR(Ref.readBytes(2, 2, Buffer));
+
+    // But if we read the longest contiguous chunk instead, we should still
+    // get the 1 byte at the end.
+    ASSERT_NO_ERROR(Ref.readLongestContiguousChunk(2, Buffer));
+    EXPECT_EQ(makeArrayRef(InputData).take_back(), Buffer);
+  }
+}
+
+// Test that we can write to a BinaryStream without a StreamWriter.
+TEST_F(BinaryStreamTest, MutableBinaryByteStreamBounds) {
+  std::vector<uint8_t> InputData = {'T', 'e', 's', 't', '\0'};
+  initializeInput(InputData, 1);
+  initializeOutput(InputData.size(), 1);
+
+  // For every input/output stream pair.
+  for (auto &Stream : Streams) {
+    MutableArrayRef<uint8_t> Buffer;
+    ASSERT_EQ(InputData.size(), Stream.Input->getLength());
+
+    // 1. Try two reads that are supposed to work.  One from offset 0, and one
+    // from the middle.
+    uint32_t Offsets[] = {0, 3};
+    for (auto Offset : Offsets) {
+      uint32_t ExpectedSize = Stream.Input->getLength() - Offset;
+
+      // Read everything from Offset until the end of the input data.
+      ArrayRef<uint8_t> Data;
+      ASSERT_NO_ERROR(Stream.Input->readBytes(Offset, ExpectedSize, Data));
+      ASSERT_EQ(ExpectedSize, Data.size());
+
+      // Then write it to the destination at the same offset it was read from.
+      ASSERT_NO_ERROR(Stream.Output->writeBytes(Offset, Data));
+
+      // Then we read back what we just wrote; it should match the
+      // corresponding slice of the original input data.
+      ArrayRef<uint8_t> Data2;
+      ASSERT_NO_ERROR(Stream.Output->readBytes(Offset, ExpectedSize, Data2));
+      EXPECT_EQ(makeArrayRef(InputData).drop_front(Offset), Data2);
+    }
+
+    std::vector<uint8_t> BigData = {0, 1, 2, 3, 4};
+    // 2. If the write is too big, it should fail.
+    EXPECT_ERROR(Stream.Output->writeBytes(3, BigData));
+  }
+}
+
+// Test that FixedStreamArray works correctly.
+TEST_F(BinaryStreamTest, FixedStreamArray) {
+  std::vector<uint32_t> Ints = {90823, 12908, 109823, 209823};
+  ArrayRef<uint8_t> IntBytes(reinterpret_cast<uint8_t *>(Ints.data()),
+                             Ints.size() * sizeof(uint32_t));
+
+  initializeInput(IntBytes, alignof(uint32_t));
+
+  for (auto &Stream : Streams) {
+    MutableArrayRef<uint8_t> Buffer;
+    ASSERT_EQ(InputData.size(), Stream.Input->getLength());
+
+    FixedStreamArray<uint32_t> Array(*Stream.Input);
+    auto Iter = Array.begin();
+    ASSERT_EQ(Ints[0], *Iter++);
+    ASSERT_EQ(Ints[1], *Iter++);
+    ASSERT_EQ(Ints[2], *Iter++);
+    ASSERT_EQ(Ints[3], *Iter++);
+    ASSERT_EQ(Array.end(), Iter);
+  }
+}
+
+// Test that VarStreamArray works correctly.
+TEST_F(BinaryStreamTest, VarStreamArray) {
+  StringLiteral Strings("1. Test2. Longer Test3. Really Long Test4. Super "
+                        "Extra Longest Test Of All");
+  ArrayRef<uint8_t> StringBytes(
+      reinterpret_cast<const uint8_t *>(Strings.data()), Strings.size());
+  initializeInput(StringBytes, 1);
+
+  struct StringExtractor {
+  public:
+    Error operator()(BinaryStreamRef Stream, uint32_t &Len, StringRef &Item) {
+      if (Index == 0)
+        Len = strlen("1. Test");
+      else if (Index == 1)
+        Len = strlen("2. Longer Test");
+      else if (Index == 2)
+        Len = strlen("3. Really Long Test");
+      else
+        Len = strlen("4. Super Extra Longest Test Of All");
+      ArrayRef<uint8_t> Bytes;
+      if (auto EC = Stream.readBytes(0, Len, Bytes))
+        return EC;
+      Item =
+          StringRef(reinterpret_cast<const char *>(Bytes.data()), Bytes.size());
+      ++Index;
+      return Error::success();
+    }
+
+  private:
+    uint32_t Index = 0;
+  };
+
+  for (auto &Stream : Streams) {
+    VarStreamArray<StringRef, StringExtractor> Array(*Stream.Input);
+    auto Iter = Array.begin();
+    ASSERT_EQ("1. Test", *Iter++);
+    ASSERT_EQ("2. Longer Test", *Iter++);
+    ASSERT_EQ("3. Really Long Test", *Iter++);
+    ASSERT_EQ("4. Super Extra Longest Test Of All", *Iter++);
+    ASSERT_EQ(Array.end(), Iter);
+  }
+}
+
+TEST_F(BinaryStreamTest, StreamReaderBounds) {
+  std::vector<uint8_t> Bytes;
+
+  initializeInput(Bytes, 1);
+  for (auto &Stream : Streams) {
+    StringRef S;
+    BinaryStreamReader Reader(*Stream.Input);
+    EXPECT_EQ(0U, Reader.bytesRemaining());
+    EXPECT_ERROR(Reader.readFixedString(S, 1));
+  }
+
+  Bytes.resize(5);
+  initializeInput(Bytes, 1);
+  for (auto &Stream : Streams) {
+    StringRef S;
+    BinaryStreamReader Reader(*Stream.Input);
+    EXPECT_EQ(Bytes.size(), Reader.bytesRemaining());
+    EXPECT_NO_ERROR(Reader.readFixedString(S, 5));
+    EXPECT_ERROR(Reader.readFixedString(S, 6));
+  }
+}
+
+TEST_F(BinaryStreamTest, StreamReaderIntegers) {
+  support::ulittle64_t Little{908234};
+  support::ubig32_t Big{28907823};
+  short NS = 2897;
+  int NI = -89723;
+  unsigned long NUL = 902309023UL;
+  constexpr uint32_t Size =
+      sizeof(Little) + sizeof(Big) + sizeof(NS) + sizeof(NI) + sizeof(NUL);
+
+  initializeOutput(Size, alignof(support::ulittle64_t));
+  initializeInputFromOutput(alignof(support::ulittle64_t));
+
+  for (auto &Stream : Streams) {
+    BinaryStreamWriter Writer(*Stream.Output);
+    ASSERT_NO_ERROR(Writer.writeObject(Little));
+    ASSERT_NO_ERROR(Writer.writeObject(Big));
+    ASSERT_NO_ERROR(Writer.writeInteger(NS));
+    ASSERT_NO_ERROR(Writer.writeInteger(NI));
+    ASSERT_NO_ERROR(Writer.writeInteger(NUL));
+
+    const support::ulittle64_t *Little2;
+    const support::ubig32_t *Big2;
+    short NS2;
+    int NI2;
+    unsigned long NUL2;
+
+    // 1. Reading fields individually.
+    BinaryStreamReader Reader(*Stream.Input);
+    ASSERT_NO_ERROR(Reader.readObject(Little2));
+    ASSERT_NO_ERROR(Reader.readObject(Big2));
+    ASSERT_NO_ERROR(Reader.readInteger(NS2));
+    ASSERT_NO_ERROR(Reader.readInteger(NI2));
+    ASSERT_NO_ERROR(Reader.readInteger(NUL2));
+    ASSERT_EQ(0U, Reader.bytesRemaining());
+
+    EXPECT_EQ(Little, *Little2);
+    EXPECT_EQ(Big, *Big2);
+    EXPECT_EQ(NS, NS2);
+    EXPECT_EQ(NI, NI2);
+    EXPECT_EQ(NUL, NUL2);
+  }
+}
+
+TEST_F(BinaryStreamTest, StreamReaderIntegerArray) {
+  // 1. Arrays of integers
+  std::vector<int> Ints = {1, 2, 3, 4, 5};
+  ArrayRef<uint8_t> IntBytes(reinterpret_cast<uint8_t *>(&Ints[0]),
+                             Ints.size() * sizeof(int));
+
+  initializeInput(IntBytes, alignof(int));
+  for (auto &Stream : Streams) {
+    BinaryStreamReader Reader(*Stream.Input);
+    ArrayRef<int> IntsRef;
+    ASSERT_NO_ERROR(Reader.readArray(IntsRef, Ints.size()));
+    ASSERT_EQ(0U, Reader.bytesRemaining());
+    EXPECT_EQ(makeArrayRef(Ints), IntsRef);
+
+    Reader.setOffset(0);
+    FixedStreamArray<int> FixedIntsRef;
+    ASSERT_NO_ERROR(Reader.readArray(FixedIntsRef, Ints.size()));
+    ASSERT_EQ(0U, Reader.bytesRemaining());
+    ASSERT_EQ(Ints, std::vector<int>(FixedIntsRef.begin(), FixedIntsRef.end()));
+  }
+}
+
+TEST_F(BinaryStreamTest, StreamReaderEnum) {
+  // Round-trips enum values: each one is written with writeEnum, then read
+  // back with readEnum and compared against the value that was written.
+  enum class MyEnum : int64_t { Foo = -10, Bar = 0, Baz = 10 };
+
+  std::vector<MyEnum> Enums = {MyEnum::Bar, MyEnum::Baz, MyEnum::Foo};
+
+  initializeOutput(Enums.size() * sizeof(MyEnum), alignof(MyEnum));
+  initializeInputFromOutput(alignof(MyEnum));
+  for (auto &Stream : Streams) {
+    BinaryStreamWriter Writer(*Stream.Output);
+    for (auto Value : Enums)
+      ASSERT_NO_ERROR(Writer.writeEnum(Value));
+
+    BinaryStreamReader Reader(*Stream.Input);
+
+    for (size_t I = 0; I < Enums.size(); ++I) {
+      MyEnum Value;
+      ASSERT_NO_ERROR(Reader.readEnum(Value));
+      EXPECT_EQ(Enums[I], Value);
+    }
+    // The reader must have no bytes left once every value has been read.
+    ASSERT_EQ(0U, Reader.bytesRemaining());
+  }
+}
+
+TEST_F(BinaryStreamTest, StreamReaderObject) {
+  struct Foo {
+    int X;
+    double Y;
+    char Z;
+
+    bool operator==(const Foo &Other) const {
+      return X == Other.X && Y == Other.Y && Z == Other.Z;
+    }
+  };
+
+  std::vector<Foo> Foos;
+  Foos.push_back({-42, 42.42, 42});
+  Foos.push_back({100, 3.1415, static_cast<char>(-89)});
+  Foos.push_back({200, 2.718, static_cast<char>(-12) });
+
+  const uint8_t *Bytes = reinterpret_cast<const uint8_t *>(&Foos[0]);
+
+  initializeInput(makeArrayRef(Bytes, 3 * sizeof(Foo)), alignof(Foo));
+
+  for (auto &Stream : Streams) {
+    // 1. Reading object pointers.
+    BinaryStreamReader Reader(*Stream.Input);
+    const Foo *FPtrOut = nullptr;
+    const Foo *GPtrOut = nullptr;
+    const Foo *HPtrOut = nullptr;
+    ASSERT_NO_ERROR(Reader.readObject(FPtrOut));
+    ASSERT_NO_ERROR(Reader.readObject(GPtrOut));
+    ASSERT_NO_ERROR(Reader.readObject(HPtrOut));
+    EXPECT_EQ(0U, Reader.bytesRemaining());
+    EXPECT_EQ(Foos[0], *FPtrOut);
+    EXPECT_EQ(Foos[1], *GPtrOut);
+    EXPECT_EQ(Foos[2], *HPtrOut);
+  }
+}
+
+TEST_F(BinaryStreamTest, StreamReaderStrings) {
+  std::vector<uint8_t> Bytes = {'O',  'n', 'e', '\0', 'T', 'w', 'o',
+                                '\0', 'T', 'h', 'r',  'e', 'e', '\0',
+                                'F',  'o', 'u', 'r',  '\0'};
+  initializeInput(Bytes, 1);
+
+  for (auto &Stream : Streams) {
+    BinaryStreamReader Reader(*Stream.Input);
+
+    StringRef S1;
+    StringRef S2;
+    StringRef S3;
+    StringRef S4;
+    ASSERT_NO_ERROR(Reader.readCString(S1));
+    ASSERT_NO_ERROR(Reader.readCString(S2));
+    ASSERT_NO_ERROR(Reader.readCString(S3));
+    ASSERT_NO_ERROR(Reader.readCString(S4));
+    ASSERT_EQ(0U, Reader.bytesRemaining());
+
+    EXPECT_EQ("One", S1);
+    EXPECT_EQ("Two", S2);
+    EXPECT_EQ("Three", S3);
+    EXPECT_EQ("Four", S4);
+
+    S1 = S2 = S3 = S4 = "";
+    Reader.setOffset(0);
+    ASSERT_NO_ERROR(Reader.readFixedString(S1, 3));
+    ASSERT_NO_ERROR(Reader.skip(1));
+    ASSERT_NO_ERROR(Reader.readFixedString(S2, 3));
+    ASSERT_NO_ERROR(Reader.skip(1));
+    ASSERT_NO_ERROR(Reader.readFixedString(S3, 5));
+    ASSERT_NO_ERROR(Reader.skip(1));
+    ASSERT_NO_ERROR(Reader.readFixedString(S4, 4));
+    ASSERT_NO_ERROR(Reader.skip(1));
+    ASSERT_EQ(0U, Reader.bytesRemaining());
+
+    EXPECT_EQ("One", S1);
+    EXPECT_EQ("Two", S2);
+    EXPECT_EQ("Three", S3);
+    EXPECT_EQ("Four", S4);
+  }
+}
+
+TEST_F(BinaryStreamTest, StreamWriterBounds) {
+  initializeOutput(5, 1);
+
+  for (auto &Stream : Streams) {
+    BinaryStreamWriter Writer(*Stream.Output);
+
+    // 1. Can write a string that exactly fills the buffer.
+    EXPECT_EQ(5U, Writer.bytesRemaining());
+    EXPECT_NO_ERROR(Writer.writeFixedString("abcde"));
+    EXPECT_EQ(0U, Writer.bytesRemaining());
+
+    // 2. Can write an empty string even when you're full
+    EXPECT_NO_ERROR(Writer.writeFixedString(""));
+    EXPECT_ERROR(Writer.writeFixedString("a"));
+
+    // 3. Can't write a string that is one character too long.
+    Writer.setOffset(0);
+    EXPECT_ERROR(Writer.writeFixedString("abcdef"));
+  }
+}
+
+TEST_F(BinaryStreamTest, StreamWriterIntegerArrays) {
+  // 1. Arrays of integers
+  std::vector<int> SourceInts = {1, 2, 3, 4, 5};
+  ArrayRef<uint8_t> SourceBytes(reinterpret_cast<uint8_t *>(&SourceInts[0]),
+                                SourceInts.size() * sizeof(int));
+
+  initializeInput(SourceBytes, alignof(int));
+  initializeOutputFromInput(alignof(int));
+
+  for (auto &Stream : Streams) {
+    BinaryStreamReader Reader(*Stream.Input);
+    BinaryStreamWriter Writer(*Stream.Output);
+    ArrayRef<int> Ints;
+    ArrayRef<int> Ints2;
+    // First read them, then write them, then read them back.
+    ASSERT_NO_ERROR(Reader.readArray(Ints, SourceInts.size()));
+    ASSERT_NO_ERROR(Writer.writeArray(Ints));
+
+    BinaryStreamReader ReaderBacker(*Stream.Output);
+    ASSERT_NO_ERROR(ReaderBacker.readArray(Ints2, SourceInts.size()));
+
+    EXPECT_EQ(makeArrayRef(SourceInts), Ints2);
+  }
+}
+
+TEST_F(BinaryStreamTest, StringWriterStrings) {
+  StringRef Strings[] = {"First", "Second", "Third", "Fourth"};
+
+  size_t Length = 0;
+  for (auto S : Strings)
+    Length += S.size() + 1;
+  initializeOutput(Length, 1);
+  initializeInputFromOutput(1);
+
+  for (auto &Stream : Streams) {
+    BinaryStreamWriter Writer(*Stream.Output);
+    for (auto S : Strings)
+      ASSERT_NO_ERROR(Writer.writeCString(S));
+    std::vector<StringRef> InStrings;
+    BinaryStreamReader Reader(*Stream.Input);
+    while (!Reader.empty()) {
+      StringRef S;
+      ASSERT_NO_ERROR(Reader.readCString(S));
+      InStrings.push_back(S);
+    }
+    EXPECT_EQ(makeArrayRef(Strings), makeArrayRef(InStrings));
+  }
+}
+}
+
+namespace {
+struct BinaryItemStreamObject {
+  explicit BinaryItemStreamObject(ArrayRef<uint8_t> Bytes) : Bytes(Bytes) {}
+
+  ArrayRef<uint8_t> Bytes;
+};
+}
+
+namespace llvm {
+template <> struct BinaryItemTraits<BinaryItemStreamObject> {
+  static size_t length(const BinaryItemStreamObject &Item) {
+    return Item.Bytes.size();
+  }
+
+  static ArrayRef<uint8_t> bytes(const BinaryItemStreamObject &Item) {
+    return Item.Bytes;
+  }
+};
+}
+
+namespace {
+
+TEST_F(BinaryStreamTest, BinaryItemStream) {
+  std::vector<BinaryItemStreamObject> Objects;
+
+  struct Foo {
+    int X;
+    double Y;
+  };
+  std::vector<Foo> Foos = {{1, 1.0}, {2, 2.0}, {3, 3.0}};
+  BumpPtrAllocator Allocator;
+  for (const auto &F : Foos) {
+    uint8_t *Ptr = static_cast<uint8_t *>(Allocator.Allocate(sizeof(Foo), 
+                                                             alignof(Foo)));
+    MutableArrayRef<uint8_t> Buffer(Ptr, sizeof(Foo));
+    MutableBinaryByteStream Stream(Buffer, llvm::support::big);
+    BinaryStreamWriter Writer(Stream);
+    ASSERT_NO_ERROR(Writer.writeObject(F));
+    Objects.push_back(BinaryItemStreamObject(Buffer));
+  }
+
+  BinaryItemStream<BinaryItemStreamObject> ItemStream(big);
+  ItemStream.setItems(Objects);
+  BinaryStreamReader Reader(ItemStream);
+
+  for (const auto &F : Foos) {
+    const Foo *F2;
+    ASSERT_NO_ERROR(Reader.readObject(F2));
+
+    EXPECT_EQ(F.X, F2->X);
+    EXPECT_DOUBLE_EQ(F.Y, F2->Y);
+  }
+}
+
+} // end anonymous namespace
diff --git a/unittests/Support/CMakeLists.txt b/unittests/Support/CMakeLists.txt
index 4c9bb5eea38587f4d5dd9a9be34ca4cf19f4d1f3..a7be18b6a3c538b67a256c3626d1474cb8702286 100644
--- a/unittests/Support/CMakeLists.txt
+++ b/unittests/Support/CMakeLists.txt
@@ -7,8 +7,10 @@ add_llvm_unittest(SupportTests
   AllocatorTest.cpp
   ARMAttributeParser.cpp
   ArrayRecyclerTest.cpp
+  BinaryStreamTest.cpp
   BlockFrequencyTest.cpp
   BranchProbabilityTest.cpp
+  CachePruningTest.cpp
   Casting.cpp
   Chrono.cpp
   CommandLineTest.cpp
@@ -64,4 +66,4 @@ add_llvm_unittest(SupportTests
   )
 
 # ManagedStatic.cpp uses <pthread>.
-target_link_libraries(SupportTests ${PTHREAD_LIB})
+target_link_libraries(SupportTests ${LLVM_PTHREAD_LIB})
diff --git a/unittests/Support/CachePruningTest.cpp b/unittests/Support/CachePruningTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..04ac0d09b49356ae5d9a3878b6f9eae05eac76bd
--- /dev/null
+++ b/unittests/Support/CachePruningTest.cpp
@@ -0,0 +1,71 @@
+//===- CachePruningTest.cpp -----------------------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/CachePruning.h"
+#include "llvm/Support/Error.h"
+#include "gtest/gtest.h"
+
+using namespace llvm;
+
+TEST(CachePruningPolicyParser, Empty) {
+  auto P = parseCachePruningPolicy("");
+  ASSERT_TRUE(bool(P));
+  EXPECT_EQ(std::chrono::seconds(1200), P->Interval);
+  EXPECT_EQ(std::chrono::hours(7 * 24), P->Expiration);
+  EXPECT_EQ(75u, P->PercentageOfAvailableSpace);
+}
+
+TEST(CachePruningPolicyParser, Interval) {
+  auto P = parseCachePruningPolicy("prune_interval=1s");
+  ASSERT_TRUE(bool(P));
+  EXPECT_EQ(std::chrono::seconds(1), P->Interval);
+  P = parseCachePruningPolicy("prune_interval=2m");
+  ASSERT_TRUE(bool(P));
+  EXPECT_EQ(std::chrono::minutes(2), P->Interval);
+  P = parseCachePruningPolicy("prune_interval=3h");
+  ASSERT_TRUE(bool(P));
+  EXPECT_EQ(std::chrono::hours(3), P->Interval);
+}
+
+TEST(CachePruningPolicyParser, Expiration) {
+  auto P = parseCachePruningPolicy("prune_after=1s");
+  ASSERT_TRUE(bool(P));
+  EXPECT_EQ(std::chrono::seconds(1), P->Expiration);
+}
+
+TEST(CachePruningPolicyParser, PercentageOfAvailableSpace) {
+  auto P = parseCachePruningPolicy("cache_size=100%");
+  ASSERT_TRUE(bool(P));
+  EXPECT_EQ(100u, P->PercentageOfAvailableSpace);
+}
+
+TEST(CachePruningPolicyParser, Multiple) {
+  auto P = parseCachePruningPolicy("prune_after=1s:cache_size=50%");
+  ASSERT_TRUE(bool(P));
+  EXPECT_EQ(std::chrono::seconds(1200), P->Interval);
+  EXPECT_EQ(std::chrono::seconds(1), P->Expiration);
+  EXPECT_EQ(50u, P->PercentageOfAvailableSpace);
+}
+
+TEST(CachePruningPolicyParser, Errors) {
+  EXPECT_EQ("Duration must not be empty",
+            toString(parseCachePruningPolicy("prune_interval=").takeError()));
+  EXPECT_EQ("'foo' not an integer",
+            toString(parseCachePruningPolicy("prune_interval=foos").takeError()));
+  EXPECT_EQ("'24x' must end with one of 's', 'm' or 'h'",
+            toString(parseCachePruningPolicy("prune_interval=24x").takeError()));
+  EXPECT_EQ("'foo' must be a percentage",
+            toString(parseCachePruningPolicy("cache_size=foo").takeError()));
+  EXPECT_EQ("'foo' not an integer",
+            toString(parseCachePruningPolicy("cache_size=foo%").takeError()));
+  EXPECT_EQ("'101' must be between 0 and 100",
+            toString(parseCachePruningPolicy("cache_size=101%").takeError()));
+  EXPECT_EQ("Unknown key: 'foo'",
+            toString(parseCachePruningPolicy("foo=bar").takeError()));
+}
diff --git a/unittests/Support/Chrono.cpp b/unittests/Support/Chrono.cpp
index 3d5787807563e1b42676bff1e293be620bab3fd6..1410baf848bb8380190cbac3efd2f09ad1f5eb32 100644
--- a/unittests/Support/Chrono.cpp
+++ b/unittests/Support/Chrono.cpp
@@ -9,6 +9,7 @@
 
 #include "llvm/Support/Chrono.h"
 #include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/FormatVariadic.h"
 #include "gtest/gtest.h"
 
 using namespace llvm;
@@ -76,4 +77,34 @@ TEST(Chrono, ImplicitConversions) {
   EXPECT_EQ(TimeT, toTimeT(Nano));
 }
 
+TEST(Chrono, DurationFormat) {
+  EXPECT_EQ("1 h", formatv("{0}", hours(1)).str());
+  EXPECT_EQ("1 m", formatv("{0}", minutes(1)).str());
+  EXPECT_EQ("1 s", formatv("{0}", seconds(1)).str());
+  EXPECT_EQ("1 ms", formatv("{0}", milliseconds(1)).str());
+  EXPECT_EQ("1 us", formatv("{0}", microseconds(1)).str());
+  EXPECT_EQ("1 ns", formatv("{0}", nanoseconds(1)).str());
+
+  EXPECT_EQ("1 s", formatv("{0:+}", seconds(1)).str());
+  EXPECT_EQ("1", formatv("{0:-}", seconds(1)).str());
+
+  EXPECT_EQ("1000 ms", formatv("{0:ms}", seconds(1)).str());
+  EXPECT_EQ("1000000 us", formatv("{0:us}", seconds(1)).str());
+  EXPECT_EQ("1000", formatv("{0:ms-}", seconds(1)).str());
+
+  EXPECT_EQ("1,000 ms", formatv("{0:+n}", milliseconds(1000)).str());
+  EXPECT_EQ("0x3e8", formatv("{0:-x}", milliseconds(1000)).str());
+  EXPECT_EQ("010", formatv("{0:-3}", milliseconds(10)).str());
+  EXPECT_EQ("10,000", formatv("{0:ms-n}", seconds(10)).str());
+
+  EXPECT_EQ("1.00 s", formatv("{0}", duration<float>(1)).str());
+  EXPECT_EQ("0.123 s", formatv("{0:+3}", duration<float>(0.123f)).str());
+  EXPECT_EQ("1.230e-01 s", formatv("{0:+e3}", duration<float>(0.123f)).str());
+
+  typedef duration<float, std::ratio<60 * 60 * 24 * 14, 1000000>>
+      microfortnights;
+  EXPECT_EQ("1.00", formatv("{0:-}", microfortnights(1)).str());
+  EXPECT_EQ("1209.60 ms", formatv("{0:ms}", microfortnights(1)).str());
+}
+
 } // anonymous namespace
diff --git a/unittests/Support/CommandLineTest.cpp b/unittests/Support/CommandLineTest.cpp
index 945eb1d4e1cfeb3416b40d88bf4c77ab0b0cbdf0..33573c4e696077cf354bf74991319d7d14ed19c4 100644
--- a/unittests/Support/CommandLineTest.cpp
+++ b/unittests/Support/CommandLineTest.cpp
@@ -303,7 +303,8 @@ TEST(CommandLineTest, SetValueInSubcategories) {
   EXPECT_FALSE(SC1Opt);
   EXPECT_FALSE(SC2Opt);
   const char *args[] = {"prog", "-top-level"};
-  EXPECT_TRUE(cl::ParseCommandLineOptions(2, args, StringRef(), true));
+  EXPECT_TRUE(
+      cl::ParseCommandLineOptions(2, args, StringRef(), &llvm::nulls()));
   EXPECT_TRUE(TopLevelOpt);
   EXPECT_FALSE(SC1Opt);
   EXPECT_FALSE(SC2Opt);
@@ -315,7 +316,8 @@ TEST(CommandLineTest, SetValueInSubcategories) {
   EXPECT_FALSE(SC1Opt);
   EXPECT_FALSE(SC2Opt);
   const char *args2[] = {"prog", "sc1", "-sc1"};
-  EXPECT_TRUE(cl::ParseCommandLineOptions(3, args2, StringRef(), true));
+  EXPECT_TRUE(
+      cl::ParseCommandLineOptions(3, args2, StringRef(), &llvm::nulls()));
   EXPECT_FALSE(TopLevelOpt);
   EXPECT_TRUE(SC1Opt);
   EXPECT_FALSE(SC2Opt);
@@ -327,7 +329,8 @@ TEST(CommandLineTest, SetValueInSubcategories) {
   EXPECT_FALSE(SC1Opt);
   EXPECT_FALSE(SC2Opt);
   const char *args3[] = {"prog", "sc2", "-sc2"};
-  EXPECT_TRUE(cl::ParseCommandLineOptions(3, args3, StringRef(), true));
+  EXPECT_TRUE(
+      cl::ParseCommandLineOptions(3, args3, StringRef(), &llvm::nulls()));
   EXPECT_FALSE(TopLevelOpt);
   EXPECT_FALSE(SC1Opt);
   EXPECT_TRUE(SC2Opt);
@@ -342,8 +345,13 @@ TEST(CommandLineTest, LookupFailsInWrongSubCommand) {
   StackOption<bool> SC1Opt("sc1", cl::sub(SC1), cl::init(false));
   StackOption<bool> SC2Opt("sc2", cl::sub(SC2), cl::init(false));
 
+  std::string Errs;
+  raw_string_ostream OS(Errs);
+
   const char *args[] = {"prog", "sc1", "-sc2"};
-  EXPECT_FALSE(cl::ParseCommandLineOptions(3, args, StringRef(), true));
+  EXPECT_FALSE(cl::ParseCommandLineOptions(3, args, StringRef(), &OS));
+  OS.flush();
+  EXPECT_FALSE(Errs.empty());
 }
 
 TEST(CommandLineTest, AddToAllSubCommands) {
@@ -358,23 +366,30 @@ TEST(CommandLineTest, AddToAllSubCommands) {
   const char *args2[] = {"prog", "sc1", "-everywhere"};
   const char *args3[] = {"prog", "sc2", "-everywhere"};
 
+  std::string Errs;
+  raw_string_ostream OS(Errs);
+
   EXPECT_FALSE(AllOpt);
-  EXPECT_TRUE(cl::ParseCommandLineOptions(2, args, StringRef(), true));
+  EXPECT_TRUE(cl::ParseCommandLineOptions(2, args, StringRef(), &OS));
   EXPECT_TRUE(AllOpt);
 
   AllOpt = false;
 
   cl::ResetAllOptionOccurrences();
   EXPECT_FALSE(AllOpt);
-  EXPECT_TRUE(cl::ParseCommandLineOptions(3, args2, StringRef(), true));
+  EXPECT_TRUE(cl::ParseCommandLineOptions(3, args2, StringRef(), &OS));
   EXPECT_TRUE(AllOpt);
 
   AllOpt = false;
 
   cl::ResetAllOptionOccurrences();
   EXPECT_FALSE(AllOpt);
-  EXPECT_TRUE(cl::ParseCommandLineOptions(3, args3, StringRef(), true));
+  EXPECT_TRUE(cl::ParseCommandLineOptions(3, args3, StringRef(), &OS));
   EXPECT_TRUE(AllOpt);
+
+  // Since all parsing succeeded, the error message should be empty.
+  OS.flush();
+  EXPECT_TRUE(Errs.empty());
 }
 
 TEST(CommandLineTest, ReparseCommandLineOptions) {
@@ -386,14 +401,16 @@ TEST(CommandLineTest, ReparseCommandLineOptions) {
   const char *args[] = {"prog", "-top-level"};
 
   EXPECT_FALSE(TopLevelOpt);
-  EXPECT_TRUE(cl::ParseCommandLineOptions(2, args, StringRef(), true));
+  EXPECT_TRUE(
+      cl::ParseCommandLineOptions(2, args, StringRef(), &llvm::nulls()));
   EXPECT_TRUE(TopLevelOpt);
 
   TopLevelOpt = false;
 
   cl::ResetAllOptionOccurrences();
   EXPECT_FALSE(TopLevelOpt);
-  EXPECT_TRUE(cl::ParseCommandLineOptions(2, args, StringRef(), true));
+  EXPECT_TRUE(
+      cl::ParseCommandLineOptions(2, args, StringRef(), &llvm::nulls()));
   EXPECT_TRUE(TopLevelOpt);
 }
 
@@ -406,14 +423,21 @@ TEST(CommandLineTest, RemoveFromRegularSubCommand) {
 
   const char *args[] = {"prog", "sc", "-remove-option"};
 
+  std::string Errs;
+  raw_string_ostream OS(Errs);
+
   EXPECT_FALSE(RemoveOption);
-  EXPECT_TRUE(cl::ParseCommandLineOptions(3, args, StringRef(), true));
+  EXPECT_TRUE(cl::ParseCommandLineOptions(3, args, StringRef(), &OS));
   EXPECT_TRUE(RemoveOption);
+  OS.flush();
+  EXPECT_TRUE(Errs.empty());
 
   RemoveOption.removeArgument();
 
   cl::ResetAllOptionOccurrences();
-  EXPECT_FALSE(cl::ParseCommandLineOptions(3, args, StringRef(), true));
+  EXPECT_FALSE(cl::ParseCommandLineOptions(3, args, StringRef(), &OS));
+  OS.flush();
+  EXPECT_FALSE(Errs.empty());
 }
 
 TEST(CommandLineTest, RemoveFromTopLevelSubCommand) {
@@ -427,13 +451,15 @@ TEST(CommandLineTest, RemoveFromTopLevelSubCommand) {
   const char *args[] = {"prog", "-top-level-remove"};
 
   EXPECT_FALSE(TopLevelRemove);
-  EXPECT_TRUE(cl::ParseCommandLineOptions(2, args, StringRef(), true));
+  EXPECT_TRUE(
+      cl::ParseCommandLineOptions(2, args, StringRef(), &llvm::nulls()));
   EXPECT_TRUE(TopLevelRemove);
 
   TopLevelRemove.removeArgument();
 
   cl::ResetAllOptionOccurrences();
-  EXPECT_FALSE(cl::ParseCommandLineOptions(2, args, StringRef(), true));
+  EXPECT_FALSE(
+      cl::ParseCommandLineOptions(2, args, StringRef(), &llvm::nulls()));
 }
 
 TEST(CommandLineTest, RemoveFromAllSubCommands) {
@@ -452,32 +478,38 @@ TEST(CommandLineTest, RemoveFromAllSubCommands) {
 
   // It should work for all subcommands including the top-level.
   EXPECT_FALSE(RemoveOption);
-  EXPECT_TRUE(cl::ParseCommandLineOptions(2, args0, StringRef(), true));
+  EXPECT_TRUE(
+      cl::ParseCommandLineOptions(2, args0, StringRef(), &llvm::nulls()));
   EXPECT_TRUE(RemoveOption);
 
   RemoveOption = false;
 
   cl::ResetAllOptionOccurrences();
   EXPECT_FALSE(RemoveOption);
-  EXPECT_TRUE(cl::ParseCommandLineOptions(3, args1, StringRef(), true));
+  EXPECT_TRUE(
+      cl::ParseCommandLineOptions(3, args1, StringRef(), &llvm::nulls()));
   EXPECT_TRUE(RemoveOption);
 
   RemoveOption = false;
 
   cl::ResetAllOptionOccurrences();
   EXPECT_FALSE(RemoveOption);
-  EXPECT_TRUE(cl::ParseCommandLineOptions(3, args2, StringRef(), true));
+  EXPECT_TRUE(
+      cl::ParseCommandLineOptions(3, args2, StringRef(), &llvm::nulls()));
   EXPECT_TRUE(RemoveOption);
 
   RemoveOption.removeArgument();
 
   // It should not work for any subcommands including the top-level.
   cl::ResetAllOptionOccurrences();
-  EXPECT_FALSE(cl::ParseCommandLineOptions(2, args0, StringRef(), true));
+  EXPECT_FALSE(
+      cl::ParseCommandLineOptions(2, args0, StringRef(), &llvm::nulls()));
   cl::ResetAllOptionOccurrences();
-  EXPECT_FALSE(cl::ParseCommandLineOptions(3, args1, StringRef(), true));
+  EXPECT_FALSE(
+      cl::ParseCommandLineOptions(3, args1, StringRef(), &llvm::nulls()));
   cl::ResetAllOptionOccurrences();
-  EXPECT_FALSE(cl::ParseCommandLineOptions(3, args2, StringRef(), true));
+  EXPECT_FALSE(
+      cl::ParseCommandLineOptions(3, args2, StringRef(), &llvm::nulls()));
 }
 
 TEST(CommandLineTest, GetRegisteredSubcommands) {
@@ -491,7 +523,8 @@ TEST(CommandLineTest, GetRegisteredSubcommands) {
   const char *args0[] = {"prog", "sc1"};
   const char *args1[] = {"prog", "sc2"};
 
-  EXPECT_TRUE(cl::ParseCommandLineOptions(2, args0, StringRef(), true));
+  EXPECT_TRUE(
+      cl::ParseCommandLineOptions(2, args0, StringRef(), &llvm::nulls()));
   EXPECT_FALSE(Opt1);
   EXPECT_FALSE(Opt2);
   for (auto *S : cl::getRegisteredSubcommands()) {
@@ -500,7 +533,8 @@ TEST(CommandLineTest, GetRegisteredSubcommands) {
   }
 
   cl::ResetAllOptionOccurrences();
-  EXPECT_TRUE(cl::ParseCommandLineOptions(2, args1, StringRef(), true));
+  EXPECT_TRUE(
+      cl::ParseCommandLineOptions(2, args1, StringRef(), &llvm::nulls()));
   EXPECT_FALSE(Opt1);
   EXPECT_FALSE(Opt2);
   for (auto *S : cl::getRegisteredSubcommands()) {
diff --git a/unittests/Support/ErrorTest.cpp b/unittests/Support/ErrorTest.cpp
index 29a173a058b61a01c00552bf96fb5b1bf570f5a9..382346cd231acc57d660e01cdf7722e5831b2960 100644
--- a/unittests/Support/ErrorTest.cpp
+++ b/unittests/Support/ErrorTest.cpp
@@ -469,6 +469,34 @@ TEST(Error, ExitOnError) {
       << "exitOnError returned an unexpected error result";
 }
 
+// Test that the cantFail utility works as expected for success values.
+TEST(Error, CantFailSuccess) {
+  cantFail(Error::success());
+
+  int X = cantFail(Expected<int>(42));
+  EXPECT_EQ(X, 42) << "Expected value modified by cantFail";
+}
+
+// Test that cantFail results in a crash if you pass it a failure value.
+#if LLVM_ENABLE_ABI_BREAKING_CHECKS
+TEST(Error, CantFailDeath) {
+  EXPECT_DEATH(
+      cantFail(make_error<StringError>("foo", inconvertibleErrorCode())),
+      "Failure value returned from cantFail wrapped call")
+    << "cantFail(Error) did not cause an abort for failure value";
+
+  EXPECT_DEATH(
+      {
+        auto IEC = inconvertibleErrorCode();
+        int X = cantFail(Expected<int>(make_error<StringError>("foo", IEC)));
+        (void)X;
+      },
+      "Failure value returned from cantFail wrapped call")
+    << "cantFail(Expected<int>) did not cause an abort for failure value";
+}
+#endif
+
+
 // Test Checked Expected<T> in success mode.
 TEST(Error, CheckedExpectedInSuccessMode) {
   Expected<int> A = 7;
diff --git a/unittests/Support/FormatVariadicTest.cpp b/unittests/Support/FormatVariadicTest.cpp
index 9307c6d8e09b53c81cddcc8b330eabf7fe2647b1..b0c843870afc24a486d76952cbabd60f375c5c17 100644
--- a/unittests/Support/FormatVariadicTest.cpp
+++ b/unittests/Support/FormatVariadicTest.cpp
@@ -324,11 +324,13 @@ TEST(FormatVariadicTest, StringFormatting) {
   const char FooArray[] = "FooArray";
   const char *FooPtr = "FooPtr";
   llvm::StringRef FooRef("FooRef");
+  constexpr StringLiteral FooLiteral("FooLiteral");
   std::string FooString("FooString");
   // 1. Test that we can print various types of strings.
   EXPECT_EQ(FooArray, formatv("{0}", FooArray).str());
   EXPECT_EQ(FooPtr, formatv("{0}", FooPtr).str());
   EXPECT_EQ(FooRef, formatv("{0}", FooRef).str());
+  EXPECT_EQ(FooLiteral, formatv("{0}", FooLiteral).str());
   EXPECT_EQ(FooString, formatv("{0}", FooString).str());
 
   // 2. Test that the precision specifier prints the correct number of
diff --git a/unittests/Support/Host.cpp b/unittests/Support/Host.cpp
index 934a60495427291241062b3f75babc519e631e75..fd53697793c7e8132b58c943fb86e23e79e0217b 100644
--- a/unittests/Support/Host.cpp
+++ b/unittests/Support/Host.cpp
@@ -17,25 +17,17 @@ using namespace llvm;
 
 class HostTest : public testing::Test {
   Triple Host;
-  SmallVector<std::pair<Triple::ArchType, Triple::OSType>, 4> SupportedArchAndOSs;
 
 protected:
   bool isSupportedArchAndOS() {
-    if (is_contained(SupportedArchAndOSs, std::make_pair(Host.getArch(), Host.getOS())))
-      return true;
-
-    return false;
-  }
-
-  HostTest() {
-    Host.setTriple(Triple::normalize(sys::getProcessTriple()));
-
     // Initially this is only testing detection of the number of
     // physical cores, which is currently only supported/tested for
     // x86_64 Linux and Darwin.
-    SupportedArchAndOSs.push_back(std::make_pair(Triple::x86_64, Triple::Linux));
-    SupportedArchAndOSs.push_back(std::make_pair(Triple::x86_64, Triple::Darwin));
+    return (Host.getArch() == Triple::x86_64 &&
+            (Host.isOSDarwin() || Host.getOS() == Triple::Linux));
   }
+
+  HostTest() : Host(Triple::normalize(sys::getProcessTriple())) {}
 };
 
 TEST_F(HostTest, NumPhysicalCores) {
@@ -46,3 +38,79 @@ TEST_F(HostTest, NumPhysicalCores) {
   else
     ASSERT_EQ(Num, -1);
 }
+
+TEST(getLinuxHostCPUName, ARM) {
+  StringRef CortexA9ProcCpuinfo = R"(
+processor       : 0
+model name      : ARMv7 Processor rev 10 (v7l)
+BogoMIPS        : 1393.66
+Features        : half thumb fastmult vfp edsp thumbee neon vfpv3 tls vfpd32
+CPU implementer : 0x41
+CPU architecture: 7
+CPU variant     : 0x2
+CPU part        : 0xc09
+CPU revision    : 10
+
+processor       : 1
+model name      : ARMv7 Processor rev 10 (v7l)
+BogoMIPS        : 1393.66
+Features        : half thumb fastmult vfp edsp thumbee neon vfpv3 tls vfpd32
+CPU implementer : 0x41
+CPU architecture: 7
+CPU variant     : 0x2
+CPU part        : 0xc09
+CPU revision    : 10
+
+Hardware        : Generic OMAP4 (Flattened Device Tree)
+Revision        : 0000
+Serial          : 0000000000000000
+)";
+
+  EXPECT_EQ(sys::detail::getHostCPUNameForARM(CortexA9ProcCpuinfo),
+            "cortex-a9");
+  EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x41\n"
+                                              "CPU part        : 0xc0f"),
+            "cortex-a15");
+  // Verify that both CPU implementer and CPU part are checked:
+  EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x40\n"
+                                              "CPU part        : 0xc0f"),
+            "generic");
+  EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x51\n"
+                                              "CPU part        : 0x06f"),
+            "krait");
+}
+
+TEST(getLinuxHostCPUName, AArch64) {
+  EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x41\n"
+                                              "CPU part        : 0xd03"),
+            "cortex-a53");
+  // Verify that both CPU implementer and CPU part are checked:
+  EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x40\n"
+                                              "CPU part        : 0xd03"),
+            "generic");
+  EXPECT_EQ(sys::detail::getHostCPUNameForARM("CPU implementer : 0x51\n"
+                                              "CPU part        : 0x201"),
+            "kryo");
+
+  // MSM8992/4 weirdness
+  StringRef MSM8992ProcCpuInfo = R"(
+Processor       : AArch64 Processor rev 3 (aarch64)
+processor       : 0
+processor       : 1
+processor       : 2
+processor       : 3
+processor       : 4
+processor       : 5
+Features        : fp asimd evtstrm aes pmull sha1 sha2 crc32
+CPU implementer : 0x41
+CPU architecture: 8
+CPU variant     : 0x0
+CPU part        : 0xd03
+CPU revision    : 3
+
+Hardware        : Qualcomm Technologies, Inc MSM8992
+)";
+
+  EXPECT_EQ(sys::detail::getHostCPUNameForARM(MSM8992ProcCpuInfo),
+            "cortex-a53");
+}
diff --git a/unittests/Support/LEB128Test.cpp b/unittests/Support/LEB128Test.cpp
index 76b63e5a8381777e40b0beff961a1a77b5131bf7..061936df1d19d3f552cea1c231c43aeab122857f 100644
--- a/unittests/Support/LEB128Test.cpp
+++ b/unittests/Support/LEB128Test.cpp
@@ -17,26 +17,45 @@ using namespace llvm;
 namespace {
 
 TEST(LEB128Test, EncodeSLEB128) {
-#define EXPECT_SLEB128_EQ(EXPECTED, VALUE) \
+#define EXPECT_SLEB128_EQ(EXPECTED, VALUE, PAD) \
   do { \
-    /* encodeSLEB128(uint64_t, raw_ostream &) */ \
     std::string Expected(EXPECTED, sizeof(EXPECTED) - 1); \
-    std::string Actual; \
-    raw_string_ostream Stream(Actual); \
-    encodeSLEB128(VALUE, Stream); \
+    \
+    /* encodeSLEB128(int64_t, raw_ostream &, unsigned) */ \
+    std::string Actual1; \
+    raw_string_ostream Stream(Actual1); \
+    encodeSLEB128(VALUE, Stream, PAD); \
     Stream.flush(); \
-    EXPECT_EQ(Expected, Actual); \
+    EXPECT_EQ(Expected, Actual1); \
+    \
+    /* encodeSLEB128(int64_t, uint8_t *, unsigned) */ \
+    uint8_t Buffer[32]; \
+    unsigned Size = encodeSLEB128(VALUE, Buffer, PAD); \
+    std::string Actual2(reinterpret_cast<const char *>(Buffer), Size); \
+    EXPECT_EQ(Expected, Actual2); \
   } while (0)
 
   // Encode SLEB128
-  EXPECT_SLEB128_EQ("\x00", 0);
-  EXPECT_SLEB128_EQ("\x01", 1);
-  EXPECT_SLEB128_EQ("\x7f", -1);
-  EXPECT_SLEB128_EQ("\x3f", 63);
-  EXPECT_SLEB128_EQ("\x41", -63);
-  EXPECT_SLEB128_EQ("\x40", -64);
-  EXPECT_SLEB128_EQ("\xbf\x7f", -65);
-  EXPECT_SLEB128_EQ("\xc0\x00", 64);
+  EXPECT_SLEB128_EQ("\x00", 0, 0);
+  EXPECT_SLEB128_EQ("\x01", 1, 0);
+  EXPECT_SLEB128_EQ("\x7f", -1, 0);
+  EXPECT_SLEB128_EQ("\x3f", 63, 0);
+  EXPECT_SLEB128_EQ("\x41", -63, 0);
+  EXPECT_SLEB128_EQ("\x40", -64, 0);
+  EXPECT_SLEB128_EQ("\xbf\x7f", -65, 0);
+  EXPECT_SLEB128_EQ("\xc0\x00", 64, 0);
+
+  // Encode SLEB128 with some extra padding bytes
+  EXPECT_SLEB128_EQ("\x80\x00", 0, 1);
+  EXPECT_SLEB128_EQ("\x80\x80\x00", 0, 2);
+  EXPECT_SLEB128_EQ("\xff\x80\x00", 0x7f, 1);
+  EXPECT_SLEB128_EQ("\xff\x80\x80\x00", 0x7f, 2);
+  EXPECT_SLEB128_EQ("\x80\x81\x00", 0x80, 1);
+  EXPECT_SLEB128_EQ("\x80\x81\x80\x00", 0x80, 2);
+  EXPECT_SLEB128_EQ("\xc0\x7f", -0x40, 1);
+  EXPECT_SLEB128_EQ("\xc0\xff\x7f", -0x40, 2);
+  EXPECT_SLEB128_EQ("\x80\xff\x7f", -0x80, 1);
+  EXPECT_SLEB128_EQ("\x80\xff\xff\x7f", -0x80, 2);
 
 #undef EXPECT_SLEB128_EQ
 }
diff --git a/unittests/Support/MD5Test.cpp b/unittests/Support/MD5Test.cpp
index 4d790254503e06706c1b6fa89cf532e8d738f413..fa9372fde33fa317928dc7db8243aad1cd64f442 100644
--- a/unittests/Support/MD5Test.cpp
+++ b/unittests/Support/MD5Test.cpp
@@ -63,8 +63,10 @@ TEST(MD5HashTest, MD5) {
   std::array<uint8_t, 16> Vec = MD5::hash(Input);
   MD5::MD5Result MD5Res;
   SmallString<32> Res;
-  memcpy(MD5Res, Vec.data(), Vec.size());
+  memcpy(MD5Res.Bytes.data(), Vec.data(), Vec.size());
   MD5::stringifyResult(MD5Res, Res);
   EXPECT_EQ(Res, "c3fcd3d76192e4007dfb496cca67e13b");
+  EXPECT_EQ(0x3be167ca6c49fb7dULL, MD5Res.high());
+  EXPECT_EQ(0x00e49261d7d3fcc3ULL, MD5Res.low());
 }
 }
diff --git a/unittests/Support/Path.cpp b/unittests/Support/Path.cpp
index bd57ba98f31be00b11c00bcfbaa7201754ffcda4..86ad57f3f3ffc511aecb6facd4485898fb8d564a 100644
--- a/unittests/Support/Path.cpp
+++ b/unittests/Support/Path.cpp
@@ -8,12 +8,15 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Support/Path.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Triple.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/Errc.h"
 #include "llvm/Support/ErrorHandling.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/FileUtilities.h"
+#include "llvm/Support/Host.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/Support/raw_ostream.h"
 #include "gtest/gtest.h"
@@ -25,6 +28,7 @@
 #endif
 
 #ifdef LLVM_ON_UNIX
+#include <pwd.h>
 #include <sys/stat.h>
 #endif
 
@@ -50,6 +54,9 @@ TEST(is_separator, Works) {
   EXPECT_FALSE(path::is_separator('-'));
   EXPECT_FALSE(path::is_separator(' '));
 
+  EXPECT_TRUE(path::is_separator('\\', path::Style::windows));
+  EXPECT_FALSE(path::is_separator('\\', path::Style::posix));
+
 #ifdef LLVM_ON_WIN32
   EXPECT_TRUE(path::is_separator('\\'));
 #else
@@ -249,7 +256,6 @@ TEST(Support, AbsolutePathDotIterator) {
   }
 }
 
-#ifdef LLVM_ON_WIN32
 TEST(Support, AbsolutePathIteratorWin32) {
   SmallString<64> Path(StringRef("c:\\c\\e\\foo.txt"));
   typedef SmallVector<StringRef, 4> PathComponents;
@@ -262,8 +268,9 @@ TEST(Support, AbsolutePathIteratorWin32) {
   // when iterating.
   ExpectedPathComponents.insert(ExpectedPathComponents.begin()+1, "\\");
 
-  for (path::const_iterator I = path::begin(Path), E = path::end(Path); I != E;
-       ++I) {
+  for (path::const_iterator I = path::begin(Path, path::Style::windows),
+                            E = path::end(Path);
+       I != E; ++I) {
     ActualPathComponents.push_back(*I);
   }
 
@@ -273,34 +280,29 @@ TEST(Support, AbsolutePathIteratorWin32) {
     EXPECT_EQ(ExpectedPathComponents[i].str(), ActualPathComponents[i].str());
   }
 }
-#endif // LLVM_ON_WIN32
 
 TEST(Support, AbsolutePathIteratorEnd) {
   // Trailing slashes are converted to '.' unless they are part of the root path.
-  SmallVector<StringRef, 4> Paths;
-  Paths.push_back("/foo/");
-  Paths.push_back("/foo//");
-  Paths.push_back("//net//");
-#ifdef LLVM_ON_WIN32
-  Paths.push_back("c:\\\\");
-#endif
-
-  for (StringRef Path : Paths) {
-    StringRef LastComponent = *path::rbegin(Path);
+  SmallVector<std::pair<StringRef, path::Style>, 4> Paths;
+  Paths.emplace_back("/foo/", path::Style::native);
+  Paths.emplace_back("/foo//", path::Style::native);
+  Paths.emplace_back("//net//", path::Style::native);
+  Paths.emplace_back("c:\\\\", path::Style::windows);
+  // Each entry carries the style its path is iterated with.
+  for (auto &Path : Paths) {
+    StringRef LastComponent = *path::rbegin(Path.first, Path.second);
     EXPECT_EQ(".", LastComponent);
   }
 
-  SmallVector<StringRef, 3> RootPaths;
-  RootPaths.push_back("/");
-  RootPaths.push_back("//net/");
-#ifdef LLVM_ON_WIN32
-  RootPaths.push_back("c:\\");
-#endif
+  SmallVector<std::pair<StringRef, path::Style>, 3> RootPaths;
+  RootPaths.emplace_back("/", path::Style::native);
+  RootPaths.emplace_back("//net/", path::Style::native);
+  RootPaths.emplace_back("c:\\", path::Style::windows);
 
-  for (StringRef Path : RootPaths) {
-    StringRef LastComponent = *path::rbegin(Path);
+  for (auto &Path : RootPaths) {
+    StringRef LastComponent = *path::rbegin(Path.first, Path.second);
     EXPECT_EQ(1u, LastComponent.size());
-    EXPECT_TRUE(path::is_separator(LastComponent[0]));
+    EXPECT_TRUE(path::is_separator(LastComponent[0], Path.second));
   }
 }
 
@@ -327,6 +329,36 @@ TEST(Support, HomeDirectory) {
   }
 }
 
+#ifdef LLVM_ON_UNIX
+// Checks path::home_directory() against the passwd entry once HOME is unset.
+TEST(Support, HomeDirectoryWithNoEnv) {
+  std::string OriginalStorage;
+  char const *OriginalEnv = ::getenv("HOME");
+  if (OriginalEnv) {
+    // We're going to unset it, so make a copy and save a pointer to the copy
+    // so that we can reset it at the end of the test.
+    OriginalStorage = OriginalEnv;
+    OriginalEnv = OriginalStorage.c_str();
+  }
+  // Don't run the test if we have nothing to compare against.
+  struct passwd *pw = getpwuid(getuid());
+  if (!pw || !pw->pw_dir) return;
+
+  ::unsetenv("HOME");
+  EXPECT_EQ(nullptr, ::getenv("HOME"));
+  std::string PwDir = pw->pw_dir;
+
+  SmallString<128> HomeDir;
+  auto status = path::home_directory(HomeDir);
+  EXPECT_TRUE(status);
+  EXPECT_EQ(PwDir, HomeDir);
+
+  // Now put the environment back to its original state (meaning that if it was
+  // unset before, we don't reset it).
+  if (OriginalEnv) ::setenv("HOME", OriginalEnv, 1);
+}
+#endif
+
 TEST(Support, UserCacheDirectory) {
   SmallString<13> CacheDir;
   SmallString<20> CacheDir2;
@@ -496,6 +528,41 @@ TEST_F(FileSystemTest, Unique) {
   ASSERT_NO_ERROR(fs::remove(TempPath));
 }
 
+// Checks fs::real_path() on a path with "." and ".." components and on "~".
+TEST_F(FileSystemTest, RealPath) {
+  ASSERT_NO_ERROR(
+      fs::create_directories(Twine(TestDirectory) + "/test1/test2/test3"));
+  ASSERT_TRUE(fs::exists(Twine(TestDirectory) + "/test1/test2/test3"));
+
+  SmallString<64> RealBase;
+  SmallString<64> Expected;
+  SmallString<64> Actual;
+
+  // TestDirectory itself might be under a symlink or have been specified with
+  // a different case than the existing temp directory.  In such cases real_path
+  // on the concatenated path will differ in the TestDirectory portion from
+  // how we specified it.  Make sure to compare against the real_path of the
+  // TestDirectory, and not just the value of TestDirectory.
+  ASSERT_NO_ERROR(fs::real_path(TestDirectory, RealBase));
+  path::native(Twine(RealBase) + "/test1/test2", Expected);
+
+  ASSERT_NO_ERROR(fs::real_path(
+      Twine(TestDirectory) + "/././test1/../test1/test2/./test3/..", Actual));
+  EXPECT_EQ(Expected, Actual);
+  // "~" and "~/" must resolve to the same place as the home directory itself.
+  SmallString<64> HomeDir;
+  bool Result = llvm::sys::path::home_directory(HomeDir);
+  if (Result) {
+    ASSERT_NO_ERROR(fs::real_path(HomeDir, Expected));
+    ASSERT_NO_ERROR(fs::real_path("~", Actual, true));
+    EXPECT_EQ(Expected, Actual);
+    ASSERT_NO_ERROR(fs::real_path("~/", Actual, true));
+    EXPECT_EQ(Expected, Actual);
+  }
+
+  ASSERT_NO_ERROR(fs::remove_directories(Twine(TestDirectory) + "/test1"));
+}
+
 TEST_F(FileSystemTest, TempFiles) {
   // Create a temp file.
   int FileDescriptor;
@@ -740,6 +807,118 @@ TEST_F(FileSystemTest, DirectoryIteration) {
   ASSERT_NO_ERROR(fs::remove(Twine(TestDirectory) + "/reclevel"));
 }
 
+#ifdef LLVM_ON_UNIX
+// Directory iterators must cope with a tree that contains dangling symlinks.
+TEST_F(FileSystemTest, BrokenSymlinkDirectoryIteration) {
+  // Create a known hierarchy to recurse over.
+  ASSERT_NO_ERROR(fs::create_directories(Twine(TestDirectory) + "/symlink"));
+  ASSERT_NO_ERROR(
+      fs::create_link("no_such_file", Twine(TestDirectory) + "/symlink/a"));
+  ASSERT_NO_ERROR(
+      fs::create_directories(Twine(TestDirectory) + "/symlink/b/bb"));
+  ASSERT_NO_ERROR(
+      fs::create_link("no_such_file", Twine(TestDirectory) + "/symlink/b/ba"));
+  ASSERT_NO_ERROR(
+      fs::create_link("no_such_file", Twine(TestDirectory) + "/symlink/b/bc"));
+  ASSERT_NO_ERROR(
+      fs::create_link("no_such_file", Twine(TestDirectory) + "/symlink/c"));
+  ASSERT_NO_ERROR(
+      fs::create_directories(Twine(TestDirectory) + "/symlink/d/dd/ddd"));
+  ASSERT_NO_ERROR(fs::create_link(Twine(TestDirectory) + "/symlink/d/dd",
+                                  Twine(TestDirectory) + "/symlink/d/da"));
+  ASSERT_NO_ERROR(
+      fs::create_link("no_such_file", Twine(TestDirectory) + "/symlink/e"));
+
+  typedef std::vector<std::string> v_t;
+  v_t visited;
+  // The directory iterator doesn't stat the file, so we should be able to
+  // iterate over the whole directory.
+  std::error_code ec;
+  for (fs::directory_iterator i(Twine(TestDirectory) + "/symlink", ec), e;
+       i != e; i.increment(ec)) {
+    ASSERT_NO_ERROR(ec);
+    visited.push_back(path::filename(i->path()));
+  }
+  std::sort(visited.begin(), visited.end());
+  v_t expected = {"a", "b", "c", "d", "e"};
+  ASSERT_EQ(expected.size(), visited.size());
+  ASSERT_EQ(expected, visited);
+  visited.clear();
+
+  // The recursive directory iterator has to stat the file, so we need to skip
+  // the broken symlinks.
+  for (fs::recursive_directory_iterator
+           i(Twine(TestDirectory) + "/symlink", ec),
+       e;
+       i != e; i.increment(ec)) {
+    ASSERT_NO_ERROR(ec);
+
+    fs::file_status status;
+    if (i->status(status) ==
+        std::make_error_code(std::errc::no_such_file_or_directory)) {
+      i.no_push();
+      continue;
+    }
+
+    visited.push_back(path::filename(i->path()));
+  }
+  std::sort(visited.begin(), visited.end());
+  expected = {"b", "bb", "d", "da", "dd", "ddd", "ddd"};
+  ASSERT_EQ(expected.size(), visited.size());
+  ASSERT_EQ(expected, visited);
+  visited.clear();
+
+  // This recursive directory iterator doesn't follow symlinks, so we don't need
+  // to skip them.
+  for (fs::recursive_directory_iterator
+           i(Twine(TestDirectory) + "/symlink", ec, /*follow_symlinks=*/false),
+       e;
+       i != e; i.increment(ec)) {
+    ASSERT_NO_ERROR(ec);
+    visited.push_back(path::filename(i->path()));
+  }
+  std::sort(visited.begin(), visited.end());
+  expected = {"a", "b", "ba", "bb", "bc", "c", "d", "da", "dd", "ddd", "e"};
+  ASSERT_EQ(expected.size(), visited.size());
+  ASSERT_EQ(expected, visited);
+
+  ASSERT_NO_ERROR(fs::remove_directories(Twine(TestDirectory) + "/symlink"));
+}
+#endif
+
+TEST_F(FileSystemTest, Remove) {
+  SmallString<64> BaseDir;
+  SmallString<64> Paths[4];
+  int fds[4];
+  ASSERT_NO_ERROR(fs::createUniqueDirectory("fs_remove", BaseDir));
+
+  ASSERT_NO_ERROR(fs::create_directories(Twine(BaseDir) + "/foo/bar/baz"));
+  ASSERT_NO_ERROR(fs::create_directories(Twine(BaseDir) + "/foo/bar/buzz"));
+  ASSERT_NO_ERROR(fs::createUniqueFile(
+      Twine(BaseDir) + "/foo/bar/baz/%%%%%%.tmp", fds[0], Paths[0]));
+  ASSERT_NO_ERROR(fs::createUniqueFile(
+      Twine(BaseDir) + "/foo/bar/baz/%%%%%%.tmp", fds[1], Paths[1]));
+  ASSERT_NO_ERROR(fs::createUniqueFile(
+      Twine(BaseDir) + "/foo/bar/buzz/%%%%%%.tmp", fds[2], Paths[2]));
+  ASSERT_NO_ERROR(fs::createUniqueFile(
+      Twine(BaseDir) + "/foo/bar/buzz/%%%%%%.tmp", fds[3], Paths[3]));
+
+  for (int fd : fds)
+    ::close(fd);
+
+  EXPECT_TRUE(fs::exists(Twine(BaseDir) + "/foo/bar/baz"));
+  EXPECT_TRUE(fs::exists(Twine(BaseDir) + "/foo/bar/buzz"));
+  EXPECT_TRUE(fs::exists(Paths[0]));
+  EXPECT_TRUE(fs::exists(Paths[1]));
+  EXPECT_TRUE(fs::exists(Paths[2]));
+  EXPECT_TRUE(fs::exists(Paths[3]));
+
+  // Only the tree built above is removed: BaseDir must disappear together
+  // with the nested directories and the four files created inside them.
+  ASSERT_NO_ERROR(fs::remove_directories(BaseDir));
+  ASSERT_FALSE(fs::exists(BaseDir));
+}
+
 const char archive[] = "!<arch>\x0A";
 const char bitcode[] = "\xde\xc0\x17\x0b";
 const char coff_object[] = "\x00\x00......";
@@ -863,6 +1042,20 @@ TEST_F(FileSystemTest, Resize) {
   ASSERT_NO_ERROR(fs::remove(TempPath));
 }
 
+TEST_F(FileSystemTest, MD5) {
+  int FD;
+  SmallString<64> TempPath;
+  ASSERT_NO_ERROR(fs::createTemporaryFile("prefix", "temp", FD, TempPath));
+  FileRemover Cleanup(TempPath); // Delete the temp file when the test exits.
+  StringRef Data("abcdefghijklmnopqrstuvwxyz");
+  ASSERT_EQ(static_cast<int>(Data.size()), write(FD, Data.data(), Data.size()));
+  lseek(FD, 0, SEEK_SET); // Rewind to the start of the file before hashing.
+  auto Hash = fs::md5_contents(FD);
+  ::close(FD);
+  ASSERT_NO_ERROR(Hash.getError());
+  EXPECT_STREQ("c3fcd3d76192e4007dfb496cca67e13b", Hash->digest().c_str());
+}
+
 TEST_F(FileSystemTest, FileMapping) {
   // Create a temp file.
   int FileDescriptor;
@@ -906,40 +1099,50 @@ TEST_F(FileSystemTest, FileMapping) {
 }
 
 TEST(Support, NormalizePath) {
+  using TestTuple = std::tuple<const char *, const char *, const char *>;
+  std::vector<TestTuple> Tests;
+  Tests.emplace_back("a", "a", "a");
+  Tests.emplace_back("a/b", "a\\b", "a/b");
+  Tests.emplace_back("a\\b", "a\\b", "a/b");
+  Tests.emplace_back("a\\\\b", "a\\\\b", "a\\\\b");
+  Tests.emplace_back("\\a", "\\a", "/a");
+  Tests.emplace_back("a\\", "a\\", "a/");
+  // Each tuple is (input, expected Windows form, expected POSIX form).
+  for (auto &T : Tests) {
+    SmallString<64> Win(std::get<0>(T));
+    SmallString<64> Posix(Win);
+    path::native(Win, path::Style::windows);
+    path::native(Posix, path::Style::posix);
+    EXPECT_EQ(std::get<1>(T), Win);
+    EXPECT_EQ(std::get<2>(T), Posix);
+  }
+
 #if defined(LLVM_ON_WIN32)
-#define EXPECT_PATH_IS(path__, windows__, not_windows__)                        \
-  EXPECT_EQ(path__, windows__);
-#else
-#define EXPECT_PATH_IS(path__, windows__, not_windows__)                        \
-  EXPECT_EQ(path__, not_windows__);
+  SmallString<64> PathHome;
+  path::home_directory(PathHome);
+  // A leading "~" component is expected to expand to the home directory.
+  const char *Path7a = "~/aaa";
+  SmallString<64> Path7(Path7a);
+  path::native(Path7);
+  EXPECT_TRUE(Path7.endswith("\\aaa"));
+  EXPECT_TRUE(Path7.startswith(PathHome));
+  EXPECT_EQ(Path7.size(), PathHome.size() + strlen(Path7a + 1));
+
+  const char *Path8a = "~";
+  SmallString<64> Path8(Path8a);
+  path::native(Path8);
+  EXPECT_EQ(Path8, PathHome);
+
+  const char *Path9a = "~aaa";
+  SmallString<64> Path9(Path9a);
+  path::native(Path9);
+  EXPECT_EQ(Path9, "~aaa");
+
+  const char *Path10a = "aaa/~/b";
+  SmallString<64> Path10(Path10a);
+  path::native(Path10);
+  EXPECT_EQ(Path10, "aaa\\~\\b");
 #endif
-
-  SmallString<64> Path1("a");
-  SmallString<64> Path2("a/b");
-  SmallString<64> Path3("a\\b");
-  SmallString<64> Path4("a\\\\b");
-  SmallString<64> Path5("\\a");
-  SmallString<64> Path6("a\\");
-
-  path::native(Path1);
-  EXPECT_PATH_IS(Path1, "a", "a");
-
-  path::native(Path2);
-  EXPECT_PATH_IS(Path2, "a\\b", "a/b");
-
-  path::native(Path3);
-  EXPECT_PATH_IS(Path3, "a\\b", "a/b");
-
-  path::native(Path4);
-  EXPECT_PATH_IS(Path4, "a\\\\b", "a\\\\b");
-
-  path::native(Path5);
-  EXPECT_PATH_IS(Path5, "\\a", "/a");
-
-  path::native(Path6);
-  EXPECT_PATH_IS(Path6, "a\\", "a/");
-
-#undef EXPECT_PATH_IS
 }
 
 TEST(Support, RemoveLeadingDotSlash) {
@@ -952,43 +1155,48 @@ TEST(Support, RemoveLeadingDotSlash) {
   EXPECT_EQ(Path2, "");
 }
 
-static std::string remove_dots(StringRef path,
-    bool remove_dot_dot) {
+static std::string remove_dots(StringRef path, bool remove_dot_dot,
+                               path::Style style) {
   SmallString<256> buffer(path);
-  path::remove_dots(buffer, remove_dot_dot);
+  path::remove_dots(buffer, remove_dot_dot, style);
   return buffer.str();
 }
 
 TEST(Support, RemoveDots) {
-#if defined(LLVM_ON_WIN32)
-  EXPECT_EQ("foolz\\wat", remove_dots(".\\.\\\\foolz\\wat", false));
-  EXPECT_EQ("", remove_dots(".\\\\\\\\\\", false));
-
-  EXPECT_EQ("a\\..\\b\\c", remove_dots(".\\a\\..\\b\\c", false));
-  EXPECT_EQ("b\\c", remove_dots(".\\a\\..\\b\\c", true));
-  EXPECT_EQ("c", remove_dots(".\\.\\c", true));
-  EXPECT_EQ("..\\a\\c", remove_dots("..\\a\\b\\..\\c", true));
-  EXPECT_EQ("..\\..\\a\\c", remove_dots("..\\..\\a\\b\\..\\c", true));
+  EXPECT_EQ("foolz\\wat",
+            remove_dots(".\\.\\\\foolz\\wat", false, path::Style::windows));
+  EXPECT_EQ("", remove_dots(".\\\\\\\\\\", false, path::Style::windows));
+  // ".." is only collapsed when remove_dot_dot is true.
+  EXPECT_EQ("a\\..\\b\\c",
+            remove_dots(".\\a\\..\\b\\c", false, path::Style::windows));
+  EXPECT_EQ("b\\c", remove_dots(".\\a\\..\\b\\c", true, path::Style::windows));
+  EXPECT_EQ("c", remove_dots(".\\.\\c", true, path::Style::windows));
+  EXPECT_EQ("..\\a\\c",
+            remove_dots("..\\a\\b\\..\\c", true, path::Style::windows));
+  EXPECT_EQ("..\\..\\a\\c",
+            remove_dots("..\\..\\a\\b\\..\\c", true, path::Style::windows));
 
   SmallString<64> Path1(".\\.\\c");
-  EXPECT_TRUE(path::remove_dots(Path1, true));
-  EXPECT_EQ("c", Path1);
-#else
-  EXPECT_EQ("foolz/wat", remove_dots("././/foolz/wat", false));
-  EXPECT_EQ("", remove_dots("./////", false));
-
-  EXPECT_EQ("a/../b/c", remove_dots("./a/../b/c", false));
-  EXPECT_EQ("b/c", remove_dots("./a/../b/c", true));
-  EXPECT_EQ("c", remove_dots("././c", true));
-  EXPECT_EQ("../a/c", remove_dots("../a/b/../c", true));
-  EXPECT_EQ("../../a/c", remove_dots("../../a/b/../c", true));
-  EXPECT_EQ("/a/c", remove_dots("/../../a/c", true));
-  EXPECT_EQ("/a/c", remove_dots("/../a/b//../././/c", true));
-
-  SmallString<64> Path1("././c");
-  EXPECT_TRUE(path::remove_dots(Path1, true));
+  EXPECT_TRUE(path::remove_dots(Path1, true, path::Style::windows));
   EXPECT_EQ("c", Path1);
-#endif
+  // The same checks with POSIX separators, plus two paths starting at "/".
+  EXPECT_EQ("foolz/wat",
+            remove_dots("././/foolz/wat", false, path::Style::posix));
+  EXPECT_EQ("", remove_dots("./////", false, path::Style::posix));
+
+  EXPECT_EQ("a/../b/c", remove_dots("./a/../b/c", false, path::Style::posix));
+  EXPECT_EQ("b/c", remove_dots("./a/../b/c", true, path::Style::posix));
+  EXPECT_EQ("c", remove_dots("././c", true, path::Style::posix));
+  EXPECT_EQ("../a/c", remove_dots("../a/b/../c", true, path::Style::posix));
+  EXPECT_EQ("../../a/c",
+            remove_dots("../../a/b/../c", true, path::Style::posix));
+  EXPECT_EQ("/a/c", remove_dots("/../../a/c", true, path::Style::posix));
+  EXPECT_EQ("/a/c",
+            remove_dots("/../a/b//../././/c", true, path::Style::posix));
+
+  SmallString<64> Path2("././c");
+  EXPECT_TRUE(path::remove_dots(Path2, true, path::Style::posix));
+  EXPECT_EQ("c", Path2);
 }
 
 TEST(Support, ReplacePathPrefix) {
@@ -1158,4 +1366,175 @@ TEST_F(FileSystemTest, set_current_path) {
   ASSERT_EQ(D1, D2) << "D1: " << TestDirectory << "\nD2: " << path;
 }
 
+TEST_F(FileSystemTest, permissions) {
+  int FD;
+  SmallString<64> TempPath;
+  ASSERT_NO_ERROR(fs::createTemporaryFile("prefix", "temp", FD, TempPath));
+  FileRemover Cleanup(TempPath);
+  // Only the path is used below; close the descriptor so it is not leaked.
+  ::close(FD);
+  ASSERT_TRUE(fs::exists(Twine(TempPath)));
+  // True when the file's current permissions are exactly Expected.
+  auto CheckPermissions = [&](fs::perms Expected) {
+    ErrorOr<fs::perms> Actual = fs::getPermissions(TempPath);
+    return Actual && *Actual == Expected;
+  };
+
+  std::error_code NoError;
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::all_all), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::all_all));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::all_read | fs::all_exe), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::all_read | fs::all_exe));
+  // On Windows every request is expected to read back as ReadOnly or all_all.
+#if defined(LLVM_ON_WIN32)
+  fs::perms ReadOnly = fs::all_read | fs::all_exe;
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::no_perms), NoError);
+  EXPECT_TRUE(CheckPermissions(ReadOnly));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::owner_read), NoError);
+  EXPECT_TRUE(CheckPermissions(ReadOnly));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::owner_write), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::all_all));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::owner_exe), NoError);
+  EXPECT_TRUE(CheckPermissions(ReadOnly));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::owner_all), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::all_all));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::group_read), NoError);
+  EXPECT_TRUE(CheckPermissions(ReadOnly));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::group_write), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::all_all));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::group_exe), NoError);
+  EXPECT_TRUE(CheckPermissions(ReadOnly));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::group_all), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::all_all));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::others_read), NoError);
+  EXPECT_TRUE(CheckPermissions(ReadOnly));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::others_write), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::all_all));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::others_exe), NoError);
+  EXPECT_TRUE(CheckPermissions(ReadOnly));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::others_all), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::all_all));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::all_read), NoError);
+  EXPECT_TRUE(CheckPermissions(ReadOnly));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::all_write), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::all_all));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::all_exe), NoError);
+  EXPECT_TRUE(CheckPermissions(ReadOnly));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::set_uid_on_exe), NoError);
+  EXPECT_TRUE(CheckPermissions(ReadOnly));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::set_gid_on_exe), NoError);
+  EXPECT_TRUE(CheckPermissions(ReadOnly));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::sticky_bit), NoError);
+  EXPECT_TRUE(CheckPermissions(ReadOnly));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::set_uid_on_exe |
+                                             fs::set_gid_on_exe |
+                                             fs::sticky_bit),
+            NoError);
+  EXPECT_TRUE(CheckPermissions(ReadOnly));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, ReadOnly | fs::set_uid_on_exe |
+                                             fs::set_gid_on_exe |
+                                             fs::sticky_bit),
+            NoError);
+  EXPECT_TRUE(CheckPermissions(ReadOnly));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::all_perms), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::all_all));
+#else
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::no_perms), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::no_perms));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::owner_read), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::owner_read));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::owner_write), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::owner_write));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::owner_exe), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::owner_exe));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::owner_all), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::owner_all));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::group_read), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::group_read));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::group_write), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::group_write));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::group_exe), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::group_exe));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::group_all), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::group_all));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::others_read), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::others_read));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::others_write), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::others_write));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::others_exe), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::others_exe));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::others_all), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::others_all));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::all_read), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::all_read));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::all_write), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::all_write));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::all_exe), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::all_exe));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::set_uid_on_exe), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::set_uid_on_exe));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::set_gid_on_exe), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::set_gid_on_exe));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::sticky_bit), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::sticky_bit));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::set_uid_on_exe |
+                                             fs::set_gid_on_exe |
+                                             fs::sticky_bit),
+            NoError);
+  EXPECT_TRUE(CheckPermissions(fs::set_uid_on_exe | fs::set_gid_on_exe |
+                               fs::sticky_bit));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::all_read | fs::set_uid_on_exe |
+                                             fs::set_gid_on_exe |
+                                             fs::sticky_bit),
+            NoError);
+  EXPECT_TRUE(CheckPermissions(fs::all_read | fs::set_uid_on_exe |
+                               fs::set_gid_on_exe | fs::sticky_bit));
+
+  EXPECT_EQ(fs::setPermissions(TempPath, fs::all_perms), NoError);
+  EXPECT_TRUE(CheckPermissions(fs::all_perms));
+#endif
+}
+
 } // anonymous namespace
diff --git a/unittests/Support/TargetParserTest.cpp b/unittests/Support/TargetParserTest.cpp
index a3d806f76fb514d84882328314dfc7949e9b3f95..f0bfe7dbde96482f46a7571d409914e64489e69a 100644
--- a/unittests/Support/TargetParserTest.cpp
+++ b/unittests/Support/TargetParserTest.cpp
@@ -17,17 +17,17 @@ using namespace llvm;
 
 namespace {
 const char *ARMArch[] = {
-    "armv2",        "armv2a",      "armv3",        "armv3m",      "armv4",
-    "armv4t",       "armv5",       "armv5t",       "armv5e",      "armv5te",
-    "armv5tej",     "armv6",       "armv6j",       "armv6k",      "armv6hl",
-    "armv6t2",      "armv6kz",     "armv6z",       "armv6zk",     "armv6-m",
-    "armv6m",       "armv6sm",     "armv6s-m",     "armv7-a",     "armv7",
-    "armv7a",       "armv7hl",     "armv7l",       "armv7-r",     "armv7r",
-    "armv7-m",      "armv7m",      "armv7k",       "armv7s",      "armv7e-m",
-    "armv7em",      "armv8-a",     "armv8",        "armv8a",      "armv8.1-a",
-    "armv8.1a",     "armv8.2-a",   "armv8.2a",     "armv8-r",     "armv8r",
-    "armv8-m.base", "armv8m.base", "armv8-m.main", "armv8m.main", "iwmmxt",
-    "iwmmxt2",      "xscale"};
+    "armv2",     "armv2a",       "armv3",       "armv3m",       "armv4",
+    "armv4t",    "armv5",        "armv5t",      "armv5e",       "armv5te",
+    "armv5tej",  "armv6",        "armv6j",      "armv6k",       "armv6hl",
+    "armv6t2",   "armv6kz",      "armv6z",      "armv6zk",      "armv6-m",
+    "armv6m",    "armv6sm",      "armv6s-m",    "armv7-a",      "armv7",
+    "armv7a",    "armv7ve",      "armv7hl",     "armv7l",       "armv7-r",
+    "armv7r",    "armv7-m",      "armv7m",      "armv7k",       "armv7s",
+    "armv7e-m",  "armv7em",      "armv8-a",     "armv8",        "armv8a",
+    "armv8.1-a", "armv8.1a",     "armv8.2-a",   "armv8.2a",     "armv8-r",
+    "armv8r",    "armv8-m.base", "armv8m.base", "armv8-m.main", "armv8m.main",
+    "iwmmxt",    "iwmmxt2",      "xscale"};
 
 bool testARMCPU(StringRef CPUName, StringRef ExpectedArch,
                 StringRef ExpectedFPU, unsigned ExpectedFlags,
@@ -246,6 +246,10 @@ TEST(TargetParserTest, testARMCPU) {
                          ARM::AEK_VIRT | ARM::AEK_HWDIVARM |
                          ARM::AEK_HWDIV | ARM::AEK_DSP,
                          "8-A"));
+  EXPECT_TRUE(testARMCPU("cortex-m23", "armv8-m.base", "none",
+                         ARM::AEK_HWDIV, "8-M.Baseline"));
+  EXPECT_TRUE(testARMCPU("cortex-m33", "armv8-m.main", "fpv5-sp-d16",
+                         ARM::AEK_HWDIV | ARM::AEK_DSP, "8-M.Mainline"));
   EXPECT_TRUE(testARMCPU("iwmmxt", "iwmmxt", "none",
                          ARM::AEK_NONE, "iwmmxt"));
   EXPECT_TRUE(testARMCPU("xscale", "xscale", "none",
@@ -310,6 +314,9 @@ TEST(TargetParserTest, testARMArch) {
   EXPECT_TRUE(
       testARMArch("armv7-a", "cortex-a8", "v7",
                           ARMBuildAttrs::CPUArch::v7));
+  EXPECT_TRUE(
+      testARMArch("armv7ve", "generic", "v7ve",
+                          ARMBuildAttrs::CPUArch::v7));
   EXPECT_TRUE(
       testARMArch("armv7-r", "cortex-r4", "v7r",
                           ARMBuildAttrs::CPUArch::v7));
@@ -498,12 +505,12 @@ TEST(TargetParserTest, ARMparseHWDiv) {
 
 TEST(TargetParserTest, ARMparseArchEndianAndISA) {
   const char *Arch[] = {
-      "v2",    "v2a",    "v3",    "v3m",  "v4",   "v4t",  "v5",    "v5t",
-      "v5e",   "v5te",   "v5tej", "v6",   "v6j",  "v6k",  "v6hl",  "v6t2",
-      "v6kz",  "v6z",    "v6zk",  "v6-m", "v6m",  "v6sm", "v6s-m", "v7-a",
-      "v7",    "v7a",    "v7hl",  "v7l",  "v7-r", "v7r",  "v7-m",  "v7m",
-      "v7k",   "v7s",    "v7e-m", "v7em", "v8-a", "v8",   "v8a",   "v8.1-a",
-      "v8.1a", "v8.2-a", "v8.2a", "v8-r"};
+      "v2",     "v2a",   "v3",     "v3m",   "v4",   "v4t",  "v5",    "v5t",
+      "v5e",    "v5te",  "v5tej",  "v6",    "v6j",  "v6k",  "v6hl",  "v6t2",
+      "v6kz",   "v6z",   "v6zk",   "v6-m",  "v6m",  "v6sm", "v6s-m", "v7-a",
+      "v7",     "v7a",   "v7ve",   "v7hl",  "v7l",  "v7-r", "v7r",   "v7-m",
+      "v7m",    "v7k",   "v7s",    "v7e-m", "v7em", "v8-a", "v8",    "v8a",
+      "v8.1-a", "v8.1a", "v8.2-a", "v8.2a", "v8-r"};
 
   for (unsigned i = 0; i < array_lengthof(Arch); i++) {
     std::string arm_1 = "armeb" + (std::string)(Arch[i]);
@@ -555,6 +562,7 @@ TEST(TargetParserTest, ARMparseArchProfile) {
       EXPECT_EQ(ARM::PK_R, ARM::parseArchProfile(ARMArch[i]));
       continue;
     case ARM::AK_ARMV7A:
+    case ARM::AK_ARMV7VE:
     case ARM::AK_ARMV7K:
     case ARM::AK_ARMV8A:
     case ARM::AK_ARMV8_1A:
@@ -635,8 +643,29 @@ TEST(TargetParserTest, testAArch64CPU) {
       "kryo", "armv8-a", "crypto-neon-fp-armv8",
       AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_SIMD, "8-A"));
   EXPECT_TRUE(testAArch64CPU(
-      "vulcan", "armv8.1-a", "crypto-neon-fp-armv8",
-      AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_SIMD, "8.1-A"));
+      "thunderx2t99", "armv8.1-a", "crypto-neon-fp-armv8",
+      AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_LSE |
+      AArch64::AEK_SIMD, "8.1-A"));
+  EXPECT_TRUE(testAArch64CPU(
+      "thunderx", "armv8-a", "crypto-neon-fp-armv8",
+      AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_SIMD |
+      AArch64::AEK_FP | AArch64::AEK_PROFILE,
+      "8-A"));
+  EXPECT_TRUE(testAArch64CPU(
+      "thunderxt81", "armv8-a", "crypto-neon-fp-armv8",
+      AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_SIMD |
+      AArch64::AEK_FP | AArch64::AEK_PROFILE,
+      "8-A"));
+  EXPECT_TRUE(testAArch64CPU(
+      "thunderxt83", "armv8-a", "crypto-neon-fp-armv8",
+      AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_SIMD |
+      AArch64::AEK_FP | AArch64::AEK_PROFILE,
+      "8-A"));
+  EXPECT_TRUE(testAArch64CPU(
+      "thunderxt88", "armv8-a", "crypto-neon-fp-armv8",
+      AArch64::AEK_CRC | AArch64::AEK_CRYPTO | AArch64::AEK_SIMD |
+      AArch64::AEK_FP | AArch64::AEK_PROFILE,
+      "8-A"));
 }
 
 bool testAArch64Arch(StringRef Arch, StringRef DefaultCPU, StringRef SubArch,
@@ -672,7 +701,11 @@ TEST(TargetParserTest, testAArch64Extension) {
   EXPECT_FALSE(testAArch64Extension("cyclone", 0, "ras"));
   EXPECT_FALSE(testAArch64Extension("exynos-m1", 0, "ras"));
   EXPECT_FALSE(testAArch64Extension("kryo", 0, "ras"));
-  EXPECT_FALSE(testAArch64Extension("vulcan", 0, "ras"));
+  EXPECT_FALSE(testAArch64Extension("thunderx2t99", 0, "ras"));
+  EXPECT_FALSE(testAArch64Extension("thunderx", 0, "lse"));
+  EXPECT_FALSE(testAArch64Extension("thunderxt81", 0, "lse"));
+  EXPECT_FALSE(testAArch64Extension("thunderxt83", 0, "lse"));
+  EXPECT_FALSE(testAArch64Extension("thunderxt88", 0, "lse"));
 
   EXPECT_FALSE(testAArch64Extension(
       "generic", static_cast<unsigned>(AArch64::ArchKind::AK_ARMV8A), "ras"));
diff --git a/unittests/Support/TrailingObjectsTest.cpp b/unittests/Support/TrailingObjectsTest.cpp
index cb5c47d1b25be444b7117912a102ddfb204ddf25..23acc54d23761149e0a7af1bd8e7d38fb30e70d7 100644
--- a/unittests/Support/TrailingObjectsTest.cpp
+++ b/unittests/Support/TrailingObjectsTest.cpp
@@ -236,3 +236,24 @@ TEST(TrailingObjects, Realignment) {
                 reinterpret_cast<char *>(C + 1) + 1, alignof(long))));
 }
 }
+
+// Test the use of TrailingObjects with a template class. This previously failed
+// to compile due to an MSVC bug in access control/lookup for OverloadToken.
+template <typename Derived>
+class Class5Tmpl : private llvm::TrailingObjects<Derived, float, int> {
+  // The base is dependent, so spell it out; it must match the base list above.
+  using TrailingObjects = typename llvm::TrailingObjects<Derived, float, int>;
+  friend TrailingObjects;
+
+  size_t numTrailingObjects(
+      typename TrailingObjects::template OverloadToken<float>) const {
+    return 1;
+  }
+
+  size_t numTrailingObjects(
+      typename TrailingObjects::template OverloadToken<int>) const {
+    return 2;
+  }
+};
+
+class Class5 : public Class5Tmpl<Class5> {};
diff --git a/unittests/Support/YAMLIOTest.cpp b/unittests/Support/YAMLIOTest.cpp
index dc7c5d47cba96820d2646362ba83e33ce2c0b35d..5a0280c8ca5ba41e66c1f653895c1b6ea53cc6f7 100644
--- a/unittests/Support/YAMLIOTest.cpp
+++ b/unittests/Support/YAMLIOTest.cpp
@@ -1740,7 +1740,7 @@ TEST(YAMLIO, TestFlagsReadError) {
 //
 // Test error handling reading built-in uint8_t type
 //
-LLVM_YAML_IS_SEQUENCE_VECTOR(uint8_t)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint8_t)
 TEST(YAMLIO, TestReadBuiltInTypesUint8Error) {
   std::vector<uint8_t> seq;
   Input yin("---\n"
@@ -1759,7 +1759,7 @@ TEST(YAMLIO, TestReadBuiltInTypesUint8Error) {
 //
 // Test error handling reading built-in uint16_t type
 //
-LLVM_YAML_IS_SEQUENCE_VECTOR(uint16_t)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint16_t)
 TEST(YAMLIO, TestReadBuiltInTypesUint16Error) {
   std::vector<uint16_t> seq;
   Input yin("---\n"
@@ -1778,7 +1778,7 @@ TEST(YAMLIO, TestReadBuiltInTypesUint16Error) {
 //
 // Test error handling reading built-in uint32_t type
 //
-LLVM_YAML_IS_SEQUENCE_VECTOR(uint32_t)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint32_t)
 TEST(YAMLIO, TestReadBuiltInTypesUint32Error) {
   std::vector<uint32_t> seq;
   Input yin("---\n"
@@ -1797,7 +1797,7 @@ TEST(YAMLIO, TestReadBuiltInTypesUint32Error) {
 //
 // Test error handling reading built-in uint64_t type
 //
-LLVM_YAML_IS_SEQUENCE_VECTOR(uint64_t)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(uint64_t)
 TEST(YAMLIO, TestReadBuiltInTypesUint64Error) {
   std::vector<uint64_t> seq;
   Input yin("---\n"
@@ -1816,7 +1816,7 @@ TEST(YAMLIO, TestReadBuiltInTypesUint64Error) {
 //
 // Test error handling reading built-in int8_t type
 //
-LLVM_YAML_IS_SEQUENCE_VECTOR(int8_t)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(int8_t)
 TEST(YAMLIO, TestReadBuiltInTypesint8OverError) {
   std::vector<int8_t> seq;
   Input yin("---\n"
@@ -1854,7 +1854,7 @@ TEST(YAMLIO, TestReadBuiltInTypesint8UnderError) {
 //
 // Test error handling reading built-in int16_t type
 //
-LLVM_YAML_IS_SEQUENCE_VECTOR(int16_t)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(int16_t)
 TEST(YAMLIO, TestReadBuiltInTypesint16UnderError) {
   std::vector<int16_t> seq;
   Input yin("---\n"
@@ -1893,7 +1893,7 @@ TEST(YAMLIO, TestReadBuiltInTypesint16OverError) {
 //
 // Test error handling reading built-in int32_t type
 //
-LLVM_YAML_IS_SEQUENCE_VECTOR(int32_t)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(int32_t)
 TEST(YAMLIO, TestReadBuiltInTypesint32UnderError) {
   std::vector<int32_t> seq;
   Input yin("---\n"
@@ -1931,7 +1931,7 @@ TEST(YAMLIO, TestReadBuiltInTypesint32OverError) {
 //
 // Test error handling reading built-in int64_t type
 //
-LLVM_YAML_IS_SEQUENCE_VECTOR(int64_t)
+LLVM_YAML_IS_FLOW_SEQUENCE_VECTOR(int64_t)
 TEST(YAMLIO, TestReadBuiltInTypesint64UnderError) {
   std::vector<int64_t> seq;
   Input yin("---\n"
diff --git a/unittests/Support/raw_ostream_test.cpp b/unittests/Support/raw_ostream_test.cpp
index f87d2f60d169e855a4c6cd73420306dfbcf6ba23..777e555949eedb2047c7509903f18a79b6e6c5fe 100644
--- a/unittests/Support/raw_ostream_test.cpp
+++ b/unittests/Support/raw_ostream_test.cpp
@@ -9,6 +9,7 @@
 
 #include "gtest/gtest.h"
 #include "llvm/ADT/SmallString.h"
+#include "llvm/Support/FileSystem.h"
 #include "llvm/Support/Format.h"
 #include "llvm/Support/raw_ostream.h"
 
@@ -330,4 +331,11 @@ TEST(raw_ostreamTest, FormattedHexBytes) {
             "0007: 68 69 6a 6b 6c        |hijkl|",
             format_bytes_with_ascii_str(B.take_front(12), 0, 7, 1));
 }
+
+TEST(raw_fd_ostreamTest, multiple_raw_fd_ostream_to_stdout) {
+  std::error_code EC;
+
+  { raw_fd_ostream("-", EC, sys::fs::OpenFlags::F_None); }
+  { raw_fd_ostream("-", EC, sys::fs::OpenFlags::F_None); }
+}
 }
diff --git a/unittests/Transforms/Scalar/LoopPassManagerTest.cpp b/unittests/Transforms/Scalar/LoopPassManagerTest.cpp
index 0209927b9f4a32564ca530e43f1b080a7b2c4c38..227060f0a46e1e9112c564a72cd460f70a832649 100644
--- a/unittests/Transforms/Scalar/LoopPassManagerTest.cpp
+++ b/unittests/Transforms/Scalar/LoopPassManagerTest.cpp
@@ -46,7 +46,10 @@ public:
 
     DerivedT *Handle;
 
-    Analysis(DerivedT &Handle) : Handle(&Handle) {}
+    Analysis(DerivedT &Handle) : Handle(&Handle) {
+      static_assert(std::is_base_of<MockAnalysisHandleBase, DerivedT>::value,
+                    "Must pass the derived type to this template!");
+    }
 
   public:
     class Result {
@@ -152,7 +155,10 @@ public:
 
     DerivedT *Handle;
 
-    Pass(DerivedT &Handle) : Handle(&Handle) {}
+    Pass(DerivedT &Handle) : Handle(&Handle) {
+      static_assert(std::is_base_of<MockPassHandleBase, DerivedT>::value,
+                    "Must pass the derived type to this template!");
+    }
 
   public:
     PreservedAnalyses run(IRUnitT &IR, AnalysisManagerT &AM,
diff --git a/unittests/Transforms/Utils/CMakeLists.txt b/unittests/Transforms/Utils/CMakeLists.txt
index c0f37418e492850a1ca36743d32548599afbc6ae..0fc19ef09fb01ffa5d97a1f443a12e2e9a3abc20 100644
--- a/unittests/Transforms/Utils/CMakeLists.txt
+++ b/unittests/Transforms/Utils/CMakeLists.txt
@@ -11,6 +11,5 @@ add_llvm_unittest(UtilsTests
   FunctionComparator.cpp
   IntegerDivision.cpp
   Local.cpp
-  MemorySSA.cpp
   ValueMapperTest.cpp
   )
diff --git a/unittests/Transforms/Utils/Cloning.cpp b/unittests/Transforms/Utils/Cloning.cpp
index 634aa9e7e65f8142eee76bb7a530ab4469d157f1..403c9c06c18a2171f01c3c95f1e3bdc30008ea4e 100644
--- a/unittests/Transforms/Utils/Cloning.cpp
+++ b/unittests/Transforms/Utils/Cloning.cpp
@@ -163,7 +163,7 @@ TEST_F(CloneInstruction, Attributes) {
   Function *F2 = Function::Create(FT1, Function::ExternalLinkage);
 
   Attribute::AttrKind AK[] = { Attribute::NoCapture };
-  AttributeSet AS = AttributeSet::get(context, 0, AK);
+  AttributeList AS = AttributeList::get(context, 0, AK);
   Argument *A = &*F1->arg_begin();
   A->addAttr(AS);
 
@@ -201,6 +201,53 @@ TEST_F(CloneInstruction, CallingConvention) {
   delete F2;
 }
 
+TEST_F(CloneInstruction, DuplicateInstructionsToSplit) {
+  Type *ArgTy1[] = {Type::getInt32PtrTy(context)};
+  FunctionType *FT = FunctionType::get(Type::getVoidTy(context), ArgTy1, false);
+  V = new Argument(Type::getInt32Ty(context));
+
+  Function *F = Function::Create(FT, Function::ExternalLinkage);
+
+  BasicBlock *BB1 = BasicBlock::Create(context, "", F);
+  IRBuilder<> Builder1(BB1);
+
+  BasicBlock *BB2 = BasicBlock::Create(context, "", F);
+  IRBuilder<> Builder2(BB2);
+
+  Builder1.CreateBr(BB2);
+
+  Instruction *AddInst = cast<Instruction>(Builder2.CreateAdd(V, V));
+  Instruction *MulInst = cast<Instruction>(Builder2.CreateMul(AddInst, V));
+  Instruction *SubInst = cast<Instruction>(Builder2.CreateSub(MulInst, V));
+  Builder2.CreateRetVoid();
+
+  ValueToValueMapTy Mapping;
+
+  auto Split = DuplicateInstructionsInSplitBetween(BB2, BB1, SubInst, Mapping);
+
+  EXPECT_TRUE(Split);
+  EXPECT_EQ(Mapping.size(), 2u);
+  EXPECT_TRUE(Mapping.find(AddInst) != Mapping.end());
+  EXPECT_TRUE(Mapping.find(MulInst) != Mapping.end());
+
+  auto AddSplit = dyn_cast<Instruction>(Mapping[AddInst]);
+  EXPECT_TRUE(AddSplit);
+  EXPECT_EQ(AddSplit->getOperand(0), V);
+  EXPECT_EQ(AddSplit->getOperand(1), V);
+  EXPECT_EQ(AddSplit->getParent(), Split);
+
+  auto MulSplit = dyn_cast<Instruction>(Mapping[MulInst]);
+  EXPECT_TRUE(MulSplit);
+  EXPECT_EQ(MulSplit->getOperand(0), AddSplit);
+  EXPECT_EQ(MulSplit->getOperand(1), V);
+  EXPECT_EQ(MulSplit->getParent(), Split);
+
+  EXPECT_EQ(AddSplit->getNextNode(), MulSplit);
+  EXPECT_EQ(MulSplit->getNextNode(), Split->getTerminator());
+
+  delete F;
+}
+
 class CloneFunc : public ::testing::Test {
 protected:
   void SetUp() override {
diff --git a/unittests/Transforms/Utils/IntegerDivision.cpp b/unittests/Transforms/Utils/IntegerDivision.cpp
index b6b1b1665ab1f2b7be25bea1c111850469fbf890..e337b9f547a89412cf6ae82f0f5edaf3cf9fd8ca 100644
--- a/unittests/Transforms/Utils/IntegerDivision.cpp
+++ b/unittests/Transforms/Utils/IntegerDivision.cpp
@@ -29,7 +29,7 @@ TEST(IntegerDivision, SDiv) {
   Function *F = Function::Create(FunctionType::get(Builder.getInt32Ty(),
                                                    ArgTys, false),
                                  GlobalValue::ExternalLinkage, "F", &M);
-  assert(F->getArgumentList().size() == 2);
+  assert(F->arg_size() == 2);
 
   BasicBlock *BB = BasicBlock::Create(C, "", F);
   Builder.SetInsertPoint(BB);
@@ -59,7 +59,7 @@ TEST(IntegerDivision, UDiv) {
   Function *F = Function::Create(FunctionType::get(Builder.getInt32Ty(),
                                                    ArgTys, false),
                                  GlobalValue::ExternalLinkage, "F", &M);
-  assert(F->getArgumentList().size() == 2);
+  assert(F->arg_size() == 2);
 
   BasicBlock *BB = BasicBlock::Create(C, "", F);
   Builder.SetInsertPoint(BB);
@@ -89,7 +89,7 @@ TEST(IntegerDivision, SRem) {
   Function *F = Function::Create(FunctionType::get(Builder.getInt32Ty(),
                                                    ArgTys, false),
                                  GlobalValue::ExternalLinkage, "F", &M);
-  assert(F->getArgumentList().size() == 2);
+  assert(F->arg_size() == 2);
 
   BasicBlock *BB = BasicBlock::Create(C, "", F);
   Builder.SetInsertPoint(BB);
@@ -119,7 +119,7 @@ TEST(IntegerDivision, URem) {
   Function *F = Function::Create(FunctionType::get(Builder.getInt32Ty(),
                                                    ArgTys, false),
                                  GlobalValue::ExternalLinkage, "F", &M);
-  assert(F->getArgumentList().size() == 2);
+  assert(F->arg_size() == 2);
 
   BasicBlock *BB = BasicBlock::Create(C, "", F);
   Builder.SetInsertPoint(BB);
@@ -150,7 +150,7 @@ TEST(IntegerDivision, SDiv64) {
   Function *F = Function::Create(FunctionType::get(Builder.getInt64Ty(),
                                                    ArgTys, false),
                                  GlobalValue::ExternalLinkage, "F", &M);
-  assert(F->getArgumentList().size() == 2);
+  assert(F->arg_size() == 2);
 
   BasicBlock *BB = BasicBlock::Create(C, "", F);
   Builder.SetInsertPoint(BB);
@@ -180,7 +180,7 @@ TEST(IntegerDivision, UDiv64) {
   Function *F = Function::Create(FunctionType::get(Builder.getInt64Ty(),
                                                    ArgTys, false),
                                  GlobalValue::ExternalLinkage, "F", &M);
-  assert(F->getArgumentList().size() == 2);
+  assert(F->arg_size() == 2);
 
   BasicBlock *BB = BasicBlock::Create(C, "", F);
   Builder.SetInsertPoint(BB);
@@ -210,7 +210,7 @@ TEST(IntegerDivision, SRem64) {
   Function *F = Function::Create(FunctionType::get(Builder.getInt64Ty(),
                                                    ArgTys, false),
                                  GlobalValue::ExternalLinkage, "F", &M);
-  assert(F->getArgumentList().size() == 2);
+  assert(F->arg_size() == 2);
 
   BasicBlock *BB = BasicBlock::Create(C, "", F);
   Builder.SetInsertPoint(BB);
@@ -240,7 +240,7 @@ TEST(IntegerDivision, URem64) {
   Function *F = Function::Create(FunctionType::get(Builder.getInt64Ty(),
                                                    ArgTys, false),
                                  GlobalValue::ExternalLinkage, "F", &M);
-  assert(F->getArgumentList().size() == 2);
+  assert(F->arg_size() == 2);
 
   BasicBlock *BB = BasicBlock::Create(C, "", F);
   Builder.SetInsertPoint(BB);
diff --git a/unittests/XRay/CMakeLists.txt b/unittests/XRay/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..30bccd1bbe6264d4d92e34065bab2f9f748ffd38
--- /dev/null
+++ b/unittests/XRay/CMakeLists.txt
@@ -0,0 +1,13 @@
+# Unit tests for the XRay library.
+set(LLVM_LINK_COMPONENTS
+  Support
+  )
+
+# Sources are listed inline; the list is only consumed by this one target.
+add_llvm_unittest(XRayTests
+  GraphTest.cpp
+  )
+
+# Build intrinsics_gen before XRayTests.
+# NOTE(review): confirm GraphTest.cpp really needs the generated headers.
+add_dependencies(XRayTests intrinsics_gen)
diff --git a/unittests/XRay/GraphTest.cpp b/unittests/XRay/GraphTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..b17858f0c1c0b43b6fb8e80b6ba8c0812cdbef69
--- /dev/null
+++ b/unittests/XRay/GraphTest.cpp
@@ -0,0 +1,261 @@
+//===- llvm/unittest/XRay/GraphTest.cpp - XRay Graph unit tests -*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/XRay/Graph.h"
+#include "gtest/gtest.h"
+#include <iostream>
+#include <set>
+#include <type_traits>
+
+using namespace llvm;
+using namespace xray;
+
+namespace {
+struct VAttr {
+  unsigned VA;
+};
+struct EAttr {
+  unsigned EA;
+};
+typedef Graph<VAttr, EAttr, unsigned> GraphT;
+typedef typename GraphT::VertexIdentifier VI;
+typedef typename GraphT::EdgeIdentifier EI;
+
+// Test Fixture
+template <typename T> class GraphTest : public testing::Test {
+protected:
+  T Graph = getTestGraph();
+
+private:
+  static T getTestGraph() {
+    using std::make_pair;
+    typename std::remove_const<T>::type G;
+    G.insert(make_pair(1u, VAttr({3u})));
+    G.insert(make_pair(2u, VAttr({5u})));
+    G.insert(make_pair(3u, VAttr({7u})));
+    G.insert(make_pair(4u, VAttr({11u})));
+    G.insert(make_pair(5u, VAttr({13u})));
+    G.insert(make_pair(6u, VAttr({17u})));
+
+    G.insert(std::make_pair(EI(1u, 2u), EAttr({3u * 5u})));
+    G.insert(std::make_pair(EI(2u, 3u), EAttr({5u * 7u})));
+    G.insert(std::make_pair(EI(6u, 3u), EAttr({2u * 7u * 17u})));
+    G.insert(std::make_pair(EI(4u, 6u), EAttr({11u * 17u})));
+    G.insert(std::make_pair(EI(2u, 4u), EAttr({5u * 11u})));
+    G.insert(std::make_pair(EI(2u, 5u), EAttr({5u * 13u})));
+    G.insert(std::make_pair(EI(4u, 5u), EAttr({11u * 13u})));
+
+    return G;
+  }
+};
+
+typedef ::testing::Types<GraphT, const GraphT> GraphTestTypes;
+
+using VVT = typename GraphT::VertexValueType;
+using EVT = typename GraphT::EdgeValueType;
+
+TYPED_TEST_CASE(GraphTest, GraphTestTypes);
+
+// Checks that G holds exactly the vertices 1..6 with the expected attributes.
+template <typename T> void graphVertexTester(T &G) {
+  std::set<unsigned> V({1u, 2u, 3u, 4u, 5u, 6u});
+  // VA[id] is the attribute expected for vertex id; slot 0 is a placeholder.
+  std::vector<unsigned> VA({0u, 3u, 5u, 7u, 11u, 13u, 17u});
+  EXPECT_EQ(V.size(), G.vertices().size());
+  EXPECT_FALSE(G.vertices().empty());
+  for (unsigned u : V) {
+    auto EVV = G.at(u);
+    ASSERT_TRUE(!!EVV);
+    EXPECT_EQ(1u, G.count(u));
+    EXPECT_EQ(VA[u], EVV->VA);
+    EXPECT_NE(G.vertices().end(),
+              std::find_if(G.vertices().begin(), G.vertices().end(),
+                           [&](const VVT &VV) { return VV.first == u; }));
+    consumeError(EVV.takeError());
+  }
+  for (auto &Vertex : G.vertices()) {
+    EXPECT_EQ(1u, V.count(Vertex.first));
+    EXPECT_EQ(VA[Vertex.first], Vertex.second.VA);
+  }
+}
+
+template <typename T> void graphEdgeTester(T &G) {
+  std::set<unsigned> V({1u, 2u, 3u, 4u, 5u, 6u});
+
+  std::set<std::pair<unsigned, unsigned>> E(
+      {{1u, 2u}, {2u, 3u}, {6u, 3u}, {4u, 6u}, {2u, 4u}, {2u, 5u}, {4u, 5u}});
+  std::vector<unsigned> VA({0u, 3u, 5u, 7u, 11u, 13u, 17u});
+
+  EXPECT_EQ(E.size(), G.edges().size());
+  EXPECT_FALSE(G.edges().empty());
+  for (std::pair<unsigned, unsigned> u : E) {
+    auto EEV = G.at(u);
+    ASSERT_TRUE(!!EEV);
+    EXPECT_EQ(1u, G.count(u));
+    EXPECT_EQ(VA[u.first] * VA[u.second] * ((u.first > u.second) ? 2 : 1),
+              EEV->EA);
+    auto Pred = [&](const EVT &EV) { return EV.first == u; };
+    EXPECT_NE(G.edges().end(),
+              std::find_if(G.edges().begin(), G.edges().end(), Pred));
+    consumeError(EEV.takeError());
+  }
+
+  for (auto &EV : G.edges()) {
+    EXPECT_EQ(1u, E.count(EV.first));
+    EXPECT_EQ(VA[EV.first.first] * VA[EV.first.second] *
+                  ((EV.first.first > EV.first.second) ? 2 : 1),
+              EV.second.EA);
+    const auto &IE = G.inEdges(EV.first.second);
+    const auto &OE = G.outEdges(EV.first.first);
+    EXPECT_NE(IE.size(), 0u);
+    EXPECT_NE(OE.size(), 0u);
+    EXPECT_NE(IE.begin(), IE.end());
+    EXPECT_NE(OE.begin(), OE.end());
+    {
+      auto It = std::find_if(
+          G.inEdges(EV.first.second).begin(), G.inEdges(EV.first.second).end(),
+          [&](const EVT &EVI) { return EVI.first == EV.first; });
+      EXPECT_NE(G.inEdges(EV.first.second).end(), It);
+    }
+    {
+      auto It = std::find_if(
+          G.inEdges(EV.first.first).begin(), G.inEdges(EV.first.first).end(),
+          [&](const EVT &EVI) { return EVI.first == EV.first; });
+      EXPECT_EQ(G.inEdges(EV.first.first).end(), It);
+    }
+    {
+      auto It =
+          std::find_if(G.outEdges(EV.first.second).begin(),
+                       G.outEdges(EV.first.second).end(),
+                       [&](const EVT &EVI) { return EVI.first == EV.first; });
+      EXPECT_EQ(G.outEdges(EV.first.second).end(), It);
+    }
+    {
+      auto It = std::find_if(
+          G.outEdges(EV.first.first).begin(), G.outEdges(EV.first.first).end(),
+          [&](const EVT &EVI) { return EVI.first == EV.first; });
+      EXPECT_NE(G.outEdges(EV.first.first).end(), It);
+    }
+  }
+}
+
+TYPED_TEST(GraphTest, TestGraphEdge) {
+  auto &G = this->Graph;
+
+  graphEdgeTester(G);
+}
+
+TYPED_TEST(GraphTest, TestGraphVertex) {
+  auto &G = this->Graph;
+
+  graphVertexTester(G);
+}
+
+TYPED_TEST(GraphTest, TestCopyConstructor) {
+  TypeParam G(this->Graph);
+
+  graphEdgeTester(G);
+  graphVertexTester(G);
+}
+
+TYPED_TEST(GraphTest, TestCopyAssign) {
+  TypeParam G = this->Graph;
+
+  graphEdgeTester(G);
+  graphVertexTester(G);
+}
+
+TYPED_TEST(GraphTest, TestMoveConstructor) {
+  TypeParam G(std::move(this->Graph));
+
+  graphEdgeTester(G);
+  graphVertexTester(G);
+}
+
+// Tests the incremental Construction of a graph
+TEST(GraphTest, TestConstruction) {
+  GraphT MG;
+  const GraphT &G = MG;
+  EXPECT_EQ(0u, G.count(0u));
+  EXPECT_EQ(0u, G.count({0u, 1u}));
+  auto VE = G.at(0);
+  auto EE = G.at({0, 0});
+  EXPECT_FALSE(VE); // G.at[0] returns an error
+  EXPECT_FALSE(EE); // G.at[{0,0}] returns an error
+  consumeError(VE.takeError());
+  consumeError(EE.takeError());
+  EXPECT_TRUE(G.vertices().empty());
+  EXPECT_TRUE(G.edges().empty());
+  EXPECT_EQ(G.vertices().begin(), G.vertices().end());
+  EXPECT_EQ(G.edges().begin(), G.edges().end());
+}
+
+TEST(GraphTest, TestiVertexAccessOperator) {
+  GraphT MG;
+  const GraphT &G = MG;
+
+  MG[0u] = {1u};
+  EXPECT_EQ(1u, MG[0u].VA);
+  EXPECT_EQ(1u, G.count(0u));
+  EXPECT_EQ(0u, G.count(1u));
+  EXPECT_EQ(1u, MG[0u].VA);
+  auto T = G.at(0u);
+  EXPECT_TRUE(!!T);
+  EXPECT_EQ(1u, T->VA);
+
+  EXPECT_EQ(1u, G.vertices().size());
+  EXPECT_EQ(0u, G.edges().size());
+  EXPECT_FALSE(G.vertices().empty());
+  EXPECT_TRUE(G.edges().empty());
+  EXPECT_NE(G.vertices().begin(), G.vertices().end());
+  EXPECT_EQ(G.edges().begin(), G.edges().end());
+  EXPECT_EQ(1u, G.vertices().begin()->second.VA);
+  EXPECT_EQ(0u, G.vertices().begin()->first);
+  EXPECT_EQ(0u, G.outEdges(0u).size());
+  EXPECT_TRUE(G.outEdges(0u).empty());
+  EXPECT_EQ(G.outEdges(0u).begin(), G.outEdges(0u).end());
+  EXPECT_EQ(0u, G.inEdges(0u).size());
+  EXPECT_TRUE(G.inEdges(0u).empty());
+  EXPECT_EQ(G.inEdges(0u).begin(), G.inEdges(0u).end());
+}
+
+// Creating edge {0, 0} via operator[] must yield one edge and one vertex.
+TEST(GraphTest, TestEdgeAccessOperator) {
+  GraphT MG;
+  const GraphT &G = MG;
+  MG[{0u, 0u}] = {2u};
+  EI EdgeIdent({0u, 0u});
+  EXPECT_EQ(2u, MG[EdgeIdent].EA);
+  EXPECT_EQ(1u, G.count({0u, 0u}));
+  EXPECT_EQ(0u, G.count({0u, 1u}));
+  EXPECT_EQ(1u, G.count(0u));
+  EXPECT_EQ(0u, G.count(1u)); // Vertex 1 was never inserted.
+  auto T = G.at({0u, 0u});
+  EXPECT_TRUE(T && T->EA == 2u);
+  EXPECT_EQ(1u, G.edges().size());
+  EXPECT_EQ(1u, G.vertices().size());
+  EXPECT_FALSE(G.edges().empty());
+  EXPECT_FALSE(G.vertices().empty());
+  EXPECT_NE(G.edges().begin(), G.edges().end());
+  EXPECT_EQ(EI(0u, 0u), G.edges().begin()->first);
+  EXPECT_EQ(2u, G.edges().begin()->second.EA);
+  EXPECT_EQ(1u, G.outEdges(0u).size());
+  EXPECT_FALSE(G.outEdges(0u).empty());
+  EXPECT_NE(G.outEdges(0u).begin(), G.outEdges(0u).end());
+  EXPECT_EQ(EI(0u, 0u), G.outEdges(0u).begin()->first);
+  EXPECT_EQ(2u, G.outEdges(0u).begin()->second.EA);
+  EXPECT_EQ(++(G.outEdges(0u).begin()), G.outEdges(0u).end());
+  EXPECT_EQ(1u, G.inEdges(0u).size());
+  EXPECT_FALSE(G.inEdges(0u).empty());
+  EXPECT_NE(G.inEdges(0u).begin(), G.inEdges(0u).end());
+  EXPECT_EQ(EI(0u, 0u), G.inEdges(0u).begin()->first);
+  EXPECT_EQ(2u, G.inEdges(0u).begin()->second.EA);
+  EXPECT_EQ(++(G.inEdges(0u).begin()), G.inEdges(0u).end());
+}
+}
diff --git a/utils/FileCheck/FileCheck.cpp b/utils/FileCheck/FileCheck.cpp
index 9e177574625f6ea163e2617e1ecd004a69ee7410..f563cadc92c3dfba764ea8195c796399fb8c4e48 100644
--- a/utils/FileCheck/FileCheck.cpp
+++ b/utils/FileCheck/FileCheck.cpp
@@ -10,7 +10,7 @@
 // FileCheck does a line-by line check of a file that validates whether it
 // contains the expected content.  This is useful for regression tests etc.
 //
-// This program exits with an error status of 2 on error, exit status of 0 if
+// This program exits with an exit status of 2 on error, exit status of 0 if
 // the file matched the expected contents, and exit status of 1 if it did not
 // contain the expected contents.
 //
@@ -73,6 +73,12 @@ static cl::opt<bool> MatchFullLines(
              "Allows leading and trailing whitespace if --strict-whitespace\n"
              "is not also passed."));
 
+static cl::opt<bool> EnableVarScope(
+    "enable-var-scope", cl::init(false),
+    cl::desc("Enables scope for regex variables. Variables with names that\n"
+             "do not start with '$' will be reset at the beginning of\n"
+             "each CHECK-LABEL block."));
+
 typedef cl::list<std::string>::const_iterator prefix_iterator;
 
 //===----------------------------------------------------------------------===//
@@ -263,15 +269,19 @@ bool Pattern::ParsePattern(StringRef PatternStr, StringRef Prefix,
       // is relaxed, more strict check is performed in \c EvaluateExpression.
       bool IsExpression = false;
       for (unsigned i = 0, e = Name.size(); i != e; ++i) {
-        if (i == 0 && Name[i] == '@') {
-          if (NameEnd != StringRef::npos) {
-            SM.PrintMessage(SMLoc::getFromPointer(Name.data()),
-                            SourceMgr::DK_Error,
-                            "invalid name in named regex definition");
-            return true;
+        if (i == 0) {
+          if (Name[i] == '$')  // Global vars start with '$'
+            continue;
+          if (Name[i] == '@') {
+            if (NameEnd != StringRef::npos) {
+              SM.PrintMessage(SMLoc::getFromPointer(Name.data()),
+                              SourceMgr::DK_Error,
+                              "invalid name in named regex definition");
+              return true;
+            }
+            IsExpression = true;
+            continue;
           }
-          IsExpression = true;
-          continue;
         }
         if (Name[i] != '_' && !isalnum(Name[i]) &&
             (!IsExpression || (Name[i] != '+' && Name[i] != '-'))) {
@@ -1262,6 +1272,18 @@ static void DumpCommandLine(int argc, char **argv) {
   errs() << "\n";
 }
 
+// Remove local variables from \p VariableTable. Global variables (names
+// starting with '$') are preserved.
+static void ClearLocalVars(StringMap<StringRef> &VariableTable) {
+  // Collect the keys first, then erase them in a second pass.
+  SmallVector<StringRef, 16> LocalVars;
+  for (const auto &Var : VariableTable)
+    if (!Var.first().startswith("$")) // No indexing, so an empty key is safe.
+      LocalVars.push_back(Var.first());
+  for (const auto &Var : LocalVars)
+    VariableTable.erase(Var);
+}
+
 /// Check the input to FileCheck provided in the \p Buffer against the \p
 /// CheckStrings read from the check file.
 ///
@@ -1298,6 +1320,9 @@ bool CheckInput(SourceMgr &SM, StringRef Buffer,
       ++j;
     }
 
+    if (EnableVarScope)
+      ClearLocalVars(VariableTable);
+
     for (; i != j; ++i) {
       const CheckString &CheckStr = CheckStrings[i];
 
diff --git a/utils/TableGen/AsmMatcherEmitter.cpp b/utils/TableGen/AsmMatcherEmitter.cpp
index 1272d2b9f2872290f857f35f433d74ae4a363a2e..3947d0220ed5e46bf8024018a2285410c6c9a0a8 100644
--- a/utils/TableGen/AsmMatcherEmitter.cpp
+++ b/utils/TableGen/AsmMatcherEmitter.cpp
@@ -123,9 +123,12 @@ using namespace llvm;
 
 #define DEBUG_TYPE "asm-matcher-emitter"
 
+cl::OptionCategory AsmMatcherEmitterCat("Options for -gen-asm-matcher");
+
 static cl::opt<std::string>
-MatchPrefix("match-prefix", cl::init(""),
-            cl::desc("Only match instructions with the given prefix"));
+    MatchPrefix("match-prefix", cl::init(""),
+                cl::desc("Only match instructions with the given prefix"),
+                cl::cat(AsmMatcherEmitterCat));
 
 namespace {
 class AsmMatcherInfo;
@@ -2784,8 +2787,6 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
   }
   OS << "  void convertToMapAndConstraints(unsigned Kind,\n                ";
   OS << "           const OperandVector &Operands) override;\n";
-  if (HasMnemonicFirst)
-    OS << "  bool mnemonicIsValid(StringRef Mnemonic, unsigned VariantID);\n";
   OS << "  unsigned MatchInstructionImpl(const OperandVector &Operands,\n"
      << "                                MCInst &Inst,\n"
      << "                                uint64_t &ErrorInfo,"
@@ -2883,7 +2884,7 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
   StringTable.EmitString(OS);
   OS << ";\n\n";
 
-  // Emit the static match table; unused classes get initalized to 0 which is
+  // Emit the static match table; unused classes get initialized to 0 which is
   // guaranteed to be InvalidMatchClass.
   //
   // FIXME: We can reduce the size of this table very easily. First, we change
@@ -2967,28 +2968,6 @@ void AsmMatcherEmitter::run(raw_ostream &OS) {
     OS << "};\n\n";
   }
 
-  // A method to determine if a mnemonic is in the list.
-  if (HasMnemonicFirst) {
-    OS << "bool " << Target.getName() << ClassName << "::\n"
-       << "mnemonicIsValid(StringRef Mnemonic, unsigned VariantID) {\n";
-    OS << "  // Find the appropriate table for this asm variant.\n";
-    OS << "  const MatchEntry *Start, *End;\n";
-    OS << "  switch (VariantID) {\n";
-    OS << "  default: llvm_unreachable(\"invalid variant!\");\n";
-    for (unsigned VC = 0; VC != VariantCount; ++VC) {
-      Record *AsmVariant = Target.getAsmParserVariant(VC);
-      int AsmVariantNo = AsmVariant->getValueAsInt("Variant");
-      OS << "  case " << AsmVariantNo << ": Start = std::begin(MatchTable" << VC
-         << "); End = std::end(MatchTable" << VC << "); break;\n";
-    }
-    OS << "  }\n";
-    OS << "  // Search the table.\n";
-    OS << "  auto MnemonicRange = ";
-    OS << "std::equal_range(Start, End, Mnemonic, LessOpcode());\n";
-    OS << "  return MnemonicRange.first != MnemonicRange.second;\n";
-    OS << "}\n\n";
-  }
-
   // Finally, build the match function.
   OS << "unsigned " << Target.getName() << ClassName << "::\n"
      << "MatchInstructionImpl(const OperandVector &Operands,\n";
diff --git a/utils/TableGen/AsmWriterEmitter.cpp b/utils/TableGen/AsmWriterEmitter.cpp
index a7c6104aaa214500a2867b8441246bf339e43e9b..40b7857ab994ba3a08df31ee4835da8fb0b3b604 100644
--- a/utils/TableGen/AsmWriterEmitter.cpp
+++ b/utils/TableGen/AsmWriterEmitter.cpp
@@ -741,7 +741,7 @@ struct AliasPriorityComparator {
     if (LHS.second ==  RHS.second) {
       // We don't actually care about the order, but for consistency it
       // shouldn't depend on pointer comparisons.
-      return LHS.first.TheDef->getName() < RHS.first.TheDef->getName();
+      return LessRecordByID()(LHS.first.TheDef, RHS.first.TheDef);
     }
 
     // Aliases with larger priorities should be considered first.
@@ -813,10 +813,9 @@ void AsmWriterEmitter::EmitPrintAliasInstruction(raw_ostream &O) {
         // We only consider ReqFeatures predicates if PassSubtarget
         std::vector<Record *> RF =
             CGA.TheDef->getValueAsListOfDefs("Predicates");
-        std::copy_if(RF.begin(), RF.end(), std::back_inserter(ReqFeatures),
-                     [](Record *R) {
-                       return R->getValueAsBit("AssemblerMatcherPredicate");
-                     });
+        copy_if(RF, std::back_inserter(ReqFeatures), [](Record *R) {
+          return R->getValueAsBit("AssemblerMatcherPredicate");
+        });
       }
 
       unsigned NumMIOps = 0;
diff --git a/utils/TableGen/CMakeLists.txt b/utils/TableGen/CMakeLists.txt
index 62fb5dd8ffff40bd563fe5e91df9ae21d8ba3277..b2913afae12a4c49d3474d8855156a4232027fff 100644
--- a/utils/TableGen/CMakeLists.txt
+++ b/utils/TableGen/CMakeLists.txt
@@ -35,6 +35,7 @@ add_tablegen(llvm-tblgen LLVM
   TableGen.cpp
   Types.cpp
   X86DisassemblerTables.cpp
+  X86EVEX2VEXTablesEmitter.cpp
   X86ModRMFilters.cpp
   X86RecognizableInstr.cpp
   CTagsEmitter.cpp
diff --git a/utils/TableGen/CodeGenDAGPatterns.cpp b/utils/TableGen/CodeGenDAGPatterns.cpp
index 4eec2e6aa3f08e10cb795f56138e76fbb088c0a8..972eb9cd3403ede4e3002b056e8e9831d14b9684 100644
--- a/utils/TableGen/CodeGenDAGPatterns.cpp
+++ b/utils/TableGen/CodeGenDAGPatterns.cpp
@@ -580,56 +580,74 @@ bool EEVT::TypeSet::EnforceVectorSubVectorTypeIs(EEVT::TypeSet &VTOperand,
   return MadeChange;
 }
 
-/// EnforceVectorSameNumElts - 'this' is now constrained to
-/// be a vector with same num elements as VTOperand.
-bool EEVT::TypeSet::EnforceVectorSameNumElts(EEVT::TypeSet &VTOperand,
-                                             TreePattern &TP) {
+/// EnforceSameNumElts - If VTOperand is a scalar, then 'this' is a scalar. If
+/// VTOperand is a vector, then 'this' must have the same number of elements.
+bool EEVT::TypeSet::EnforceSameNumElts(EEVT::TypeSet &VTOperand,
+                                       TreePattern &TP) {
   if (TP.hasError())
     return false;
 
-  // "This" must be a vector and "VTOperand" must be a vector.
   bool MadeChange = false;
-  MadeChange |= EnforceVector(TP);
-  MadeChange |= VTOperand.EnforceVector(TP);
 
-  // If we know one of the vector types, it forces the other type to agree.
+  if (isCompletelyUnknown())
+    MadeChange = FillWithPossibleTypes(TP);
+
+  if (VTOperand.isCompletelyUnknown())
+    MadeChange = VTOperand.FillWithPossibleTypes(TP);
+
+  // If one contains vectors but the other doesn't, pull vectors out.
+  if (!hasVectorTypes())
+    MadeChange |= VTOperand.EnforceScalar(TP);
+  else if (!hasScalarTypes())
+    MadeChange |= VTOperand.EnforceVector(TP);
+  if (!VTOperand.hasVectorTypes())
+    MadeChange |= EnforceScalar(TP);
+  else if (!VTOperand.hasScalarTypes())
+    MadeChange |= EnforceVector(TP);
+
+  // If one type is a vector, make sure the other has the same element count.
+  // If this is a scalar, then we are already done with the above.
   if (isConcrete()) {
     MVT IVT = getConcrete();
-    unsigned NumElems = IVT.getVectorNumElements();
+    if (IVT.isVector()) {
+      unsigned NumElems = IVT.getVectorNumElements();
 
-    // Only keep types that have same elements as 'this'.
-    TypeSet InputSet(VTOperand);
+      // Only keep types that have same elements as 'this'.
+      TypeSet InputSet(VTOperand);
 
-    auto I = remove_if(VTOperand.TypeVec, [NumElems](MVT VVT) {
-      return VVT.getVectorNumElements() != NumElems;
-    });
-    MadeChange |= I != VTOperand.TypeVec.end();
-    VTOperand.TypeVec.erase(I, VTOperand.TypeVec.end());
+      auto I = remove_if(VTOperand.TypeVec, [NumElems](MVT VVT) {
+        return VVT.getVectorNumElements() != NumElems;
+      });
+      MadeChange |= I != VTOperand.TypeVec.end();
+      VTOperand.TypeVec.erase(I, VTOperand.TypeVec.end());
 
-    if (VTOperand.TypeVec.empty()) {  // FIXME: Really want an SMLoc here!
-      TP.error("Type inference contradiction found, forcing '" +
-               InputSet.getName() + "' to have same number elements as '" +
-               getName() + "'");
-      return false;
+      if (VTOperand.TypeVec.empty()) {  // FIXME: Really want an SMLoc here!
+        TP.error("Type inference contradiction found, forcing '" +
+                 InputSet.getName() + "' to have same number elements as '" +
+                 getName() + "'");
+        return false;
+      }
     }
   } else if (VTOperand.isConcrete()) {
     MVT IVT = VTOperand.getConcrete();
-    unsigned NumElems = IVT.getVectorNumElements();
+    if (IVT.isVector()) {
+      unsigned NumElems = IVT.getVectorNumElements();
 
-    // Only keep types that have same elements as VTOperand.
-    TypeSet InputSet(*this);
+      // Only keep types that have same elements as VTOperand.
+      TypeSet InputSet(*this);
 
-    auto I = remove_if(TypeVec, [NumElems](MVT VVT) {
-      return VVT.getVectorNumElements() != NumElems;
-    });
-    MadeChange |= I != TypeVec.end();
-    TypeVec.erase(I, TypeVec.end());
+      auto I = remove_if(TypeVec, [NumElems](MVT VVT) {
+        return VVT.getVectorNumElements() != NumElems;
+      });
+      MadeChange |= I != TypeVec.end();
+      TypeVec.erase(I, TypeVec.end());
 
-    if (TypeVec.empty()) {  // FIXME: Really want an SMLoc here!
-      TP.error("Type inference contradiction found, forcing '" +
-               InputSet.getName() + "' to have same number elements than '" +
-               VTOperand.getName() + "'");
-      return false;
+      if (TypeVec.empty()) {  // FIXME: Really want an SMLoc here!
+        TP.error("Type inference contradiction found, forcing '" +
+                 InputSet.getName() + "' to have same number elements than '" +
+                 VTOperand.getName() + "'");
+        return false;
+      }
     }
   }
 
@@ -644,6 +662,12 @@ bool EEVT::TypeSet::EnforceSameSize(EEVT::TypeSet &VTOperand,
 
   bool MadeChange = false;
 
+  if (isCompletelyUnknown())
+    MadeChange = FillWithPossibleTypes(TP);
+
+  if (VTOperand.isCompletelyUnknown())
+    MadeChange = VTOperand.FillWithPossibleTypes(TP);
+
   // If we know one of the types, it forces the other type agree.
   if (isConcrete()) {
     MVT IVT = getConcrete();
@@ -1058,7 +1082,7 @@ bool SDTypeConstraint::ApplyTypeConstraint(TreePatternNode *N,
       getOperandNum(x.SDTCisSameNumEltsAs_Info.OtherOperandNum,
                     N, NodeInfo, OResNo);
     return OtherNode->getExtType(OResNo).
-      EnforceVectorSameNumElts(NodeToApply->getExtType(ResNo), TP);
+      EnforceSameNumElts(NodeToApply->getExtType(ResNo), TP);
   }
   case SDTCisSameSizeAs: {
     unsigned OResNo = 0;
diff --git a/utils/TableGen/CodeGenDAGPatterns.h b/utils/TableGen/CodeGenDAGPatterns.h
index 97401cd81713effcc6a2fb2f46f859c67c1ad43b..189d6e382ee7c3b41450fd9ee8a00354421d4691 100644
--- a/utils/TableGen/CodeGenDAGPatterns.h
+++ b/utils/TableGen/CodeGenDAGPatterns.h
@@ -144,9 +144,10 @@ namespace EEVT {
     /// be a vector type VT.
     bool EnforceVectorSubVectorTypeIs(EEVT::TypeSet &VT, TreePattern &TP);
 
-    /// EnforceVectorSameNumElts - 'this' is now constrained to
-    /// be a vector with same num elements as VT.
-    bool EnforceVectorSameNumElts(EEVT::TypeSet &VT, TreePattern &TP);
+    /// EnforceSameNumElts - If VTOperand is a scalar, then 'this' is a scalar.
+    /// If VTOperand is a vector, then 'this' must have the same number of
+    /// elements.
+    bool EnforceSameNumElts(EEVT::TypeSet &VT, TreePattern &TP);
 
     /// EnforceSameSize - 'this' is now constrained to be the same size as VT.
     bool EnforceSameSize(EEVT::TypeSet &VT, TreePattern &TP);
diff --git a/utils/TableGen/CodeGenMapTable.cpp b/utils/TableGen/CodeGenMapTable.cpp
index 8032d7b3ee95af897ceceb6ab9c2083915c26880..60db6c267ad7302bfef25bf9968061bf6f2ad874 100644
--- a/utils/TableGen/CodeGenMapTable.cpp
+++ b/utils/TableGen/CodeGenMapTable.cpp
@@ -367,7 +367,7 @@ unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) {
 
   ArrayRef<const CodeGenInstruction*> NumberedInstructions =
                                             Target.getInstructionsByEnumValue();
-  std::string TargetName = Target.getName();
+  std::string Namespace = Target.getInstNamespace();
   const std::vector<ListInit*> &ValueCols = InstrMapDesc.getValueCols();
   unsigned NumCol = ValueCols.size();
   unsigned TotalNumInstr = NumberedInstructions.size();
@@ -387,22 +387,22 @@ unsigned MapTableEmitter::emitBinSearchTable(raw_ostream &OS) {
         if (ColInstrs[j] != nullptr) {
           RelExists = 1;
           OutStr += ", ";
-          OutStr += TargetName;
+          OutStr += Namespace;
           OutStr += "::";
           OutStr += ColInstrs[j]->getName();
         } else { OutStr += ", (uint16_t)-1U";}
       }
 
       if (RelExists) {
-        OS << "  { " << TargetName << "::" << CurInstr->getName();
+        OS << "  { " << Namespace << "::" << CurInstr->getName();
         OS << OutStr <<" },\n";
         TableSize++;
       }
     }
   }
   if (!TableSize) {
-    OS << "  { " << TargetName << "::" << "INSTRUCTION_LIST_END, ";
-    OS << TargetName << "::" << "INSTRUCTION_LIST_END }";
+    OS << "  { " << Namespace << "::" << "INSTRUCTION_LIST_END, ";
+    OS << Namespace << "::" << "INSTRUCTION_LIST_END }";
   }
   OS << "}; // End of " << InstrMapDesc.getName() << "Table\n\n";
   return TableSize;
@@ -567,7 +567,7 @@ namespace llvm {
 //===----------------------------------------------------------------------===//
 void EmitMapTable(RecordKeeper &Records, raw_ostream &OS) {
   CodeGenTarget Target(Records);
-  std::string TargetName = Target.getName();
+  std::string NameSpace = Target.getInstNamespace();
   std::vector<Record*> InstrMapVec;
   InstrMapVec = Records.getAllDerivedDefinitions("InstrMapping");
 
@@ -577,7 +577,7 @@ void EmitMapTable(RecordKeeper &Records, raw_ostream &OS) {
   OS << "#ifdef GET_INSTRMAP_INFO\n";
   OS << "#undef GET_INSTRMAP_INFO\n";
   OS << "namespace llvm {\n\n";
-  OS << "namespace " << TargetName << " {\n\n";
+  OS << "namespace " << NameSpace << " {\n\n";
 
   // Emit coulumn field names and their values as enums.
   emitEnums(OS, Records);
@@ -600,7 +600,7 @@ void EmitMapTable(RecordKeeper &Records, raw_ostream &OS) {
     // Emit map tables and the functions to query them.
     IMap.emitTablesWithFunc(OS);
   }
-  OS << "} // End " << TargetName << " namespace\n";
+  OS << "} // End " << NameSpace << " namespace\n";
   OS << "} // End llvm namespace\n";
   OS << "#endif // GET_INSTRMAP_INFO\n\n";
 }
diff --git a/utils/TableGen/CodeGenRegisters.cpp b/utils/TableGen/CodeGenRegisters.cpp
index c03e0d1fcf6b12e92d2190044079bf78ff57d0c4..627614d991d52d13d0615053856404edb41c2025 100644
--- a/utils/TableGen/CodeGenRegisters.cpp
+++ b/utils/TableGen/CodeGenRegisters.cpp
@@ -1668,7 +1668,7 @@ void CodeGenRegBank::computeRegUnitSets() {
           dbgs() << "UnitSet " << USIdx << " " << RegUnitSets[USIdx].Name
                  << ":";
           for (auto &U : RegUnitSets[USIdx].Units)
-            dbgs() << " " << RegUnits[U].Roots[0]->getName();
+            printRegUnitName(U);
           dbgs() << "\n";
         });
 
@@ -1681,7 +1681,7 @@ void CodeGenRegBank::computeRegUnitSets() {
           dbgs() << "UnitSet " << USIdx << " " << RegUnitSets[USIdx].Name
                  << ":";
           for (auto &U : RegUnitSets[USIdx].Units)
-            dbgs() << " " << RegUnits[U].Roots[0]->getName();
+            printRegUnitName(U);
           dbgs() << "\n";
         }
         dbgs() << "\nUnion sets:\n");
@@ -1727,7 +1727,7 @@ void CodeGenRegBank::computeRegUnitSets() {
         DEBUG(dbgs() << "UnitSet " << RegUnitSets.size()-1
               << " " << RegUnitSets.back().Name << ":";
               for (auto &U : RegUnitSets.back().Units)
-                dbgs() << " " << RegUnits[U].Roots[0]->getName();
+                printRegUnitName(U);
               dbgs() << "\n";);
       }
     }
@@ -1742,7 +1742,7 @@ void CodeGenRegBank::computeRegUnitSets() {
           dbgs() << "UnitSet " << USIdx << " " << RegUnitSets[USIdx].Name
                  << ":";
           for (auto &U : RegUnitSets[USIdx].Units)
-            dbgs() << " " << RegUnits[U].Roots[0]->getName();
+            printRegUnitName(U);
           dbgs() << "\n";
         });
 
@@ -1763,8 +1763,8 @@ void CodeGenRegBank::computeRegUnitSets() {
       continue;
 
     DEBUG(dbgs() << "RC " << RC.getName() << " Units: \n";
-          for (auto &U : RCRegUnits)
-            dbgs() << RegUnits[U].getRoots()[0]->getName() << " ";
+          for (auto U : RCRegUnits)
+            printRegUnitName(U);
           dbgs() << "\n  UnitSetIDs:");
 
     // Find all supersets.
@@ -2170,3 +2170,10 @@ BitVector CodeGenRegBank::computeCoveredRegisters(ArrayRef<Record*> Regs) {
     BV.set(Set[i]->EnumValue);
   return BV;
 }
+
+void CodeGenRegBank::printRegUnitName(unsigned Unit) const {
+  if (Unit >= NumNativeRegUnits) // Artificial (non-native) unit: print index.
+    dbgs() << " #" << Unit;
+  else // Native unit: print the name of its first root register.
+    dbgs() << ' ' << RegUnits[Unit].Roots[0]->getName();
+}
diff --git a/utils/TableGen/CodeGenRegisters.h b/utils/TableGen/CodeGenRegisters.h
index 3ed26fa401a13f6812e46d516ca6d889dc14ae58..9366838c77cd0f766c7b476ed293dfd2137f6f36 100644
--- a/utils/TableGen/CodeGenRegisters.h
+++ b/utils/TableGen/CodeGenRegisters.h
@@ -735,6 +735,10 @@ namespace llvm {
     // LaneMask is contained in CoveringLanes will be completely covered by
     // another sub-register with the same or larger lane mask.
     LaneBitmask CoveringLanes;
+
+    // Helper function for printing debug information. Handles artificial
+    // (non-native) reg units.
+    void printRegUnitName(unsigned Unit) const;
   };
 
 } // end namespace llvm
diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp
index 6503d5af2d48d9a386d1394792670376f372742e..d93511b0d87339fef6881d245df67c2ea00b1680 100644
--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp
@@ -25,13 +25,18 @@
 #include <algorithm>
 using namespace llvm;
 
+cl::OptionCategory AsmParserCat("Options for -gen-asm-parser");
+cl::OptionCategory AsmWriterCat("Options for -gen-asm-writer");
+
 static cl::opt<unsigned>
-AsmParserNum("asmparsernum", cl::init(0),
-             cl::desc("Make -gen-asm-parser emit assembly parser #N"));
+    AsmParserNum("asmparsernum", cl::init(0),
+                 cl::desc("Make -gen-asm-parser emit assembly parser #N"),
+                 cl::cat(AsmParserCat));
 
 static cl::opt<unsigned>
-AsmWriterNum("asmwriternum", cl::init(0),
-             cl::desc("Make -gen-asm-writer emit assembly writer #N"));
+    AsmWriterNum("asmwriternum", cl::init(0),
+                 cl::desc("Make -gen-asm-writer emit assembly writer #N"),
+                 cl::cat(AsmWriterCat));
 
 /// getValueType - Return the MVT::SimpleValueType that the specified TableGen
 /// record corresponds to.
diff --git a/utils/TableGen/DAGISelMatcherEmitter.cpp b/utils/TableGen/DAGISelMatcherEmitter.cpp
index d30fc5131cbaf2fdf808ec42a1131d5466bd400f..67e8f15b248e77d38c218b8e9ab3136541905fb2 100644
--- a/utils/TableGen/DAGISelMatcherEmitter.cpp
+++ b/utils/TableGen/DAGISelMatcherEmitter.cpp
@@ -11,14 +11,18 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "DAGISelMatcher.h"
 #include "CodeGenDAGPatterns.h"
+#include "DAGISelMatcher.h"
 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/SmallString.h"
 #include "llvm/ADT/StringMap.h"
 #include "llvm/ADT/TinyPtrVector.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
 using namespace llvm;
 
@@ -26,10 +30,17 @@ enum {
   CommentIndent = 30
 };
 
+cl::OptionCategory DAGISelCat("Options for -gen-dag-isel");
+
 // To reduce generated source code size.
-static cl::opt<bool>
-OmitComments("omit-comments", cl::desc("Do not generate comments"),
-             cl::init(false));
+static cl::opt<bool> OmitComments("omit-comments",
+                                  cl::desc("Do not generate comments"),
+                                  cl::init(false), cl::cat(DAGISelCat));
+
+static cl::opt<bool> InstrumentCoverage(
+    "instrument-coverage",
+    cl::desc("Generates tables to help identify patterns matched"),
+    cl::init(false), cl::cat(DAGISelCat));
 
 namespace {
 class MatcherTableEmitter {
@@ -52,6 +63,19 @@ class MatcherTableEmitter {
   DenseMap<Record*, unsigned> NodeXFormMap;
   std::vector<Record*> NodeXForms;
 
+  std::vector<std::string> VecIncludeStrings;
+  MapVector<std::string, unsigned, StringMap<unsigned> > VecPatterns;
+
+  // Index of pattern P in the table; P and its location are appended if new.
+  unsigned getPatternIdxFromTable(std::string &&P, std::string &&include_loc) {
+    const auto Found = VecPatterns.find(P);
+    if (Found != VecPatterns.end())
+      return Found->second;
+    VecPatterns.insert(make_pair(std::move(P), VecPatterns.size()));
+    VecIncludeStrings.push_back(std::move(include_loc));
+    return VecIncludeStrings.size() - 1;
+  }
+
 public:
   MatcherTableEmitter(const CodeGenDAGPatterns &cgp)
     : CGP(cgp) {}
@@ -62,6 +86,9 @@ public:
   void EmitPredicateFunctions(formatted_raw_ostream &OS);
 
   void EmitHistogram(const Matcher *N, formatted_raw_ostream &OS);
+
+  void EmitPatternMatchTable(raw_ostream &OS);
+
 private:
   unsigned EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx,
                        formatted_raw_ostream &OS);
@@ -117,6 +144,14 @@ private:
 };
 } // end anonymous namespace.
 
+// Render the pattern tree rooted at N to a string using its operator<<.
+static std::string GetPatFromTreePatternNode(const TreePatternNode *N) {
+  std::string Text;
+  raw_string_ostream Printer(Text);
+  Printer << *N;
+  return Printer.str(); // str() flushes the stream into Text first.
+}
+
 static unsigned GetVBRSize(unsigned Val) {
   if (Val <= 127) return 1;
 
@@ -150,6 +185,56 @@ static uint64_t EmitVBRValue(uint64_t Val, raw_ostream &OS) {
   return NumBytes+1;
 }
 
+// Build "<buffer identifier>:<line number>" for record R's source location.
+// This is expensive and slow.
+static std::string getIncludePath(const Record *R) {
+  auto Locs = R->getLoc();
+  // Prefer the site where the pattern prototype was instantiated (Locs[1]).
+  SMLoc Where;
+  if (Locs.size() > 1)
+    Where = Locs[1];
+  else if (Locs.size() == 1)
+    Where = Locs[0];
+
+  unsigned BufID = SrcMgr.FindBufferContainingLoc(Where);
+  assert(BufID && "Invalid or unspecified location!");
+
+  std::string Result;
+  raw_string_ostream Out(Result);
+  Out << SrcMgr.getBufferInfo(BufID).Buffer->getBufferIdentifier() << ":"
+      << SrcMgr.FindLineNumber(Where, BufID);
+  return Out.str();
+}
+
+/// Emit getPatternForIndex()/getIncludePathForIndex() and their string tables.
+void MatcherTableEmitter::EmitPatternMatchTable(raw_ostream &OS) {
+  assert(isUInt<16>(VecPatterns.size()) &&
+         "Using only 16 bits to encode offset into Pattern Table");
+  assert(VecPatterns.size() == VecIncludeStrings.size() &&
+         "The sizes of Pattern and include vectors should be the same");
+  OS << "StringRef getPatternForIndex(unsigned Index) override {\n";
+  OS << "static const char * PATTERN_MATCH_TABLE[] = {\n";
+
+  // Entries are emitted inside C++ string literals, so escape quotes and
+  // backslashes (e.g. Windows include paths) to keep the output compilable.
+  for (const auto &It : VecPatterns)
+    (OS << "\"").write_escaped(It.first) << "\",\n";
+
+  OS << "\n};";
+  OS << "\nreturn StringRef(PATTERN_MATCH_TABLE[Index]);";
+  OS << "\n}";
+
+  OS << "\nStringRef getIncludePathForIndex(unsigned Index) override {\n";
+  OS << "static const char * INCLUDE_PATH_TABLE[] = {\n";
+
+  for (const auto &It : VecIncludeStrings)
+    (OS << "\"").write_escaped(It) << "\",\n";
+
+  OS << "\n};";
+  OS << "\nreturn StringRef(INCLUDE_PATH_TABLE[Index]);";
+  OS << "\n}";
+}
+
 /// EmitMatcher - Emit bytes for the specified matcher and return
 /// the number of bytes emitted.
 unsigned MatcherTableEmitter::
@@ -537,6 +622,23 @@ EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx,
 
   case Matcher::EmitNode:
   case Matcher::MorphNodeTo: {
+    auto NumCoveredBytes = 0;
+    if (InstrumentCoverage) {
+      if (const MorphNodeToMatcher *SNT = dyn_cast<MorphNodeToMatcher>(N)) {
+        NumCoveredBytes = 3;
+        OS << "OPC_Coverage, ";
+        std::string src =
+            GetPatFromTreePatternNode(SNT->getPattern().getSrcPattern());
+        std::string dst =
+            GetPatFromTreePatternNode(SNT->getPattern().getDstPattern());
+        Record *PatRecord = SNT->getPattern().getSrcRecord();
+        std::string include_src = getIncludePath(PatRecord);
+        unsigned Offset =
+            getPatternIdxFromTable(src + " -> " + dst, std::move(include_src));
+        OS << "TARGET_VAL(" << Offset << "),\n";
+        OS.PadToColumn(Indent * 2);
+      }
+    }
     const EmitNodeMatcherCommon *EN = cast<EmitNodeMatcherCommon>(N);
     OS << (isa<EmitNodeMatcher>(EN) ? "OPC_EmitNode" : "OPC_MorphNodeTo");
     bool CompressVTs = EN->getNumVTs() < 3;
@@ -593,10 +695,26 @@ EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx,
     } else
       OS << '\n';
 
-    return 5 + !CompressVTs + EN->getNumVTs() + NumOperandBytes;
+    return 5 + !CompressVTs + EN->getNumVTs() + NumOperandBytes +
+           NumCoveredBytes;
   }
   case Matcher::CompleteMatch: {
     const CompleteMatchMatcher *CM = cast<CompleteMatchMatcher>(N);
+    auto NumCoveredBytes = 0;
+    if (InstrumentCoverage) {
+      NumCoveredBytes = 3;
+      OS << "OPC_Coverage, ";
+      std::string src =
+          GetPatFromTreePatternNode(CM->getPattern().getSrcPattern());
+      std::string dst =
+          GetPatFromTreePatternNode(CM->getPattern().getDstPattern());
+      Record *PatRecord = CM->getPattern().getSrcRecord();
+      std::string include_src = getIncludePath(PatRecord);
+      unsigned Offset =
+          getPatternIdxFromTable(src + " -> " + dst, std::move(include_src));
+      OS << "TARGET_VAL(" << Offset << "),\n";
+      OS.PadToColumn(Indent * 2);
+    }
     OS << "OPC_CompleteMatch, " << CM->getNumResults() << ", ";
     unsigned NumResultBytes = 0;
     for (unsigned i = 0, e = CM->getNumResults(); i != e; ++i)
@@ -610,7 +728,7 @@ EmitMatcher(const Matcher *N, unsigned Indent, unsigned CurrentIdx,
         << *CM->getPattern().getDstPattern();
     }
     OS << '\n';
-    return 2 + NumResultBytes;
+    return 2 + NumResultBytes + NumCoveredBytes;
   }
   }
   llvm_unreachable("Unreachable");
@@ -686,8 +804,13 @@ void MatcherTableEmitter::EmitPredicateFunctions(formatted_raw_ostream &OS) {
         ++NumOps;  // Get the chained node too.
 
       OS << "  case " << i << ":\n";
+      if (InstrumentCoverage)
+        OS << "  {\n";
       OS << "    Result.resize(NextRes+" << NumOps << ");\n";
-      OS << "    return "  << P.getSelectFunc();
+      if (InstrumentCoverage)
+        OS << "    bool Succeeded = " << P.getSelectFunc();
+      else
+        OS << "  return " << P.getSelectFunc();
 
       OS << "(";
       // If the complex pattern wants the root of the match, pass it in as the
@@ -704,6 +827,13 @@ void MatcherTableEmitter::EmitPredicateFunctions(formatted_raw_ostream &OS) {
       for (unsigned i = 0; i != NumOps; ++i)
         OS << ", Result[NextRes+" << i << "].first";
       OS << ");\n";
+      if (InstrumentCoverage) {
+        OS << "    if (Succeeded)\n";
+        OS << "       dbgs() << \"\\nCOMPLEX_PATTERN: " << P.getSelectFunc()
+           << "\\n\" ;\n";
+        OS << "    return Succeeded;\n";
+        OS << "    }\n";
+      }
     }
     OS << "  }\n";
     OS << "}\n\n";
@@ -827,7 +957,7 @@ void llvm::EmitMatcherTable(const Matcher *TheMatcher,
   formatted_raw_ostream OS(O);
 
   OS << "// The main instruction selector code.\n";
-  OS << "SDNode *SelectCode(SDNode *N) {\n";
+  OS << "void SelectCode(SDNode *N) {\n";
 
   MatcherTableEmitter MatcherEmitter(CGP);
 
@@ -842,9 +972,11 @@ void llvm::EmitMatcherTable(const Matcher *TheMatcher,
 
   OS << "  #undef TARGET_VAL\n";
   OS << "  SelectCodeCommon(N, MatcherTable,sizeof(MatcherTable));\n";
-  OS << "  return nullptr;\n";
   OS << "}\n";
 
   // Next up, emit the function for node and pattern predicates:
   MatcherEmitter.EmitPredicateFunctions(OS);
+
+  if (InstrumentCoverage)
+    MatcherEmitter.EmitPatternMatchTable(OS);
 }
diff --git a/utils/TableGen/DAGISelMatcherOpt.cpp b/utils/TableGen/DAGISelMatcherOpt.cpp
index 783b35e745f8f970cab9a491bb140d4a7ecfa856..0bb656826fbdf9f69fcda5ac3a6dc3d55103f42b 100644
--- a/utils/TableGen/DAGISelMatcherOpt.cpp
+++ b/utils/TableGen/DAGISelMatcherOpt.cpp
@@ -181,15 +181,21 @@ static Matcher *FindNodeWithKind(Matcher *M, Matcher::KindTy Kind) {
 ///       ABC
 ///       XYZ
 ///
-static void FactorNodes(std::unique_ptr<Matcher> &MatcherPtr) {
-  // If we reached the end of the chain, we're done.
-  Matcher *N = MatcherPtr.get();
-  if (!N) return;
-  
-  // If this is not a push node, just scan for one.
-  ScopeMatcher *Scope = dyn_cast<ScopeMatcher>(N);
-  if (!Scope)
-    return FactorNodes(N->getNextPtr());
+static void FactorNodes(std::unique_ptr<Matcher> &InputMatcherPtr) {
+  // Look for a push node. Iterates instead of recurses to reduce stack usage.
+  ScopeMatcher *Scope = nullptr;
+  std::unique_ptr<Matcher> *RebindableMatcherPtr = &InputMatcherPtr;
+  while (!Scope) {
+    // If we reached the end of the chain, we're done.
+    Matcher *N = RebindableMatcherPtr->get();
+    if (!N) return;
+
+    // If this is not a push node, just scan for one.
+    Scope = dyn_cast<ScopeMatcher>(N);
+    if (!Scope)
+      RebindableMatcherPtr = &(N->getNextPtr());
+  }
+  std::unique_ptr<Matcher> &MatcherPtr = *RebindableMatcherPtr;
   
   // Okay, pull together the children of the scope node into a vector so we can
   // inspect it more easily.
diff --git a/utils/TableGen/GlobalISelEmitter.cpp b/utils/TableGen/GlobalISelEmitter.cpp
index 2682e26fc8c9b82350fffb43dc2aea0df25210df..bd9d9910d4405e5fce4f1666b758ac553f95f174 100644
--- a/utils/TableGen/GlobalISelEmitter.cpp
+++ b/utils/TableGen/GlobalISelEmitter.cpp
@@ -32,76 +32,119 @@
 
 #include "CodeGenDAGPatterns.h"
 #include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallSet.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineValueType.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
+#include "llvm/Support/ScopedPrinter.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
 #include "llvm/TableGen/TableGenBackend.h"
 #include <string>
+#include <numeric>
 using namespace llvm;
 
 #define DEBUG_TYPE "gisel-emitter"
 
 STATISTIC(NumPatternTotal, "Total number of patterns");
-STATISTIC(NumPatternSkipped, "Number of patterns skipped");
+STATISTIC(NumPatternImported, "Number of patterns imported from SelectionDAG");
+STATISTIC(NumPatternImportsSkipped, "Number of SelectionDAG imports skipped");
 STATISTIC(NumPatternEmitted, "Number of patterns emitted");
 
+cl::OptionCategory GlobalISelEmitterCat("Options for -gen-global-isel");
+
 static cl::opt<bool> WarnOnSkippedPatterns(
     "warn-on-skipped-patterns",
     cl::desc("Explain why a pattern was skipped for inclusion "
              "in the GlobalISel selector"),
-    cl::init(false));
+    cl::init(false), cl::cat(GlobalISelEmitterCat));
 
 namespace {
+//===- Helper functions ---------------------------------------------------===//
+
+/// This class stands in for LLT wherever we want to tablegen-erate an
+/// equivalent at compiler run-time.
+class LLTCodeGen {
+private:
+  LLT Ty;
 
-class GlobalISelEmitter {
 public:
-  explicit GlobalISelEmitter(RecordKeeper &RK);
-  void run(raw_ostream &OS);
+  LLTCodeGen(const LLT &Ty) : Ty(Ty) {}
 
-private:
-  const RecordKeeper &RK;
-  const CodeGenDAGPatterns CGP;
-  const CodeGenTarget &Target;
+  /// Emit a C++ expression that constructs this LLT at compiler run-time.
+  void emitCxxConstructorCall(raw_ostream &OS) const {
+    if (Ty.isScalar()) {
+      OS << "LLT::scalar(" << Ty.getSizeInBits() << ")";
+      return;
+    }
+    if (!Ty.isVector())
+      llvm_unreachable("Unhandled LLT");
+    // LLT::vector() wants the element size, not the whole-vector size.
+    OS << "LLT::vector(" << Ty.getNumElements() << ", "
+       << Ty.getScalarSizeInBits() << ")";
+  }
 
-  /// Keep track of the equivalence between SDNodes and Instruction.
-  /// This is defined using 'GINodeEquiv' in the target description.
-  DenseMap<Record *, const CodeGenInstruction *> NodeEquivs;
+  const LLT &get() const { return Ty; }
+};
 
-  void gatherNodeEquivs();
-  const CodeGenInstruction *findNodeEquiv(Record *N);
+class InstructionMatcher;
+class OperandPlaceholder {
+private:
+  enum PlaceholderKind {
+    OP_MatchReference,
+    OP_Temporary,
+  } Kind;
+
+  struct MatchReferenceData {
+    InstructionMatcher *InsnMatcher;
+    StringRef InsnVarName;
+    StringRef SymbolicName;
+  };
 
-  struct SkipReason {
-    std::string Reason;
+  struct TemporaryData {
+    unsigned OpIdx;
   };
 
-  /// Analyze pattern \p P, possibly emitting matching code for it to \p OS.
-  /// Otherwise, return a reason why this pattern was skipped for emission.
-  Optional<SkipReason> runOnPattern(const PatternToMatch &P,
-                                    raw_ostream &OS);
-};
+  union {
+    struct MatchReferenceData MatchReference;
+    struct TemporaryData Temporary;
+  };
 
-} // end anonymous namespace
+  OperandPlaceholder(PlaceholderKind Kind) : Kind(Kind) {}
 
-//===- Helper functions ---------------------------------------------------===//
+public:
+  ~OperandPlaceholder() {}
+
+  static OperandPlaceholder
+  CreateMatchReference(InstructionMatcher *InsnMatcher,
+                       StringRef InsnVarName, StringRef SymbolicName) {
+    OperandPlaceholder Result(OP_MatchReference);
+    Result.MatchReference.InsnMatcher = InsnMatcher;
+    Result.MatchReference.InsnVarName = InsnVarName;
+    Result.MatchReference.SymbolicName = SymbolicName;
+    return Result;
+  }
+
+  static OperandPlaceholder CreateTemporary(unsigned OpIdx) {
+    OperandPlaceholder Result(OP_Temporary);
+    Result.Temporary.OpIdx = OpIdx;
+    return Result;
+  }
+
+  void emitCxxValueExpr(raw_ostream &OS) const;
+};
 
 /// Convert an MVT to an equivalent LLT if possible, or the invalid LLT() for
 /// MVTs that don't map cleanly to an LLT (e.g., iPTR, *any, ...).
-static Optional<std::string> MVTToLLT(MVT::SimpleValueType SVT) {
-  std::string TyStr;
-  raw_string_ostream OS(TyStr);
+static Optional<LLTCodeGen> MVTToLLT(MVT::SimpleValueType SVT) {
   MVT VT(SVT);
-  if (VT.isVector() && VT.getVectorNumElements() != 1) {
-    OS << "LLT::vector(" << VT.getVectorNumElements() << ", "
-       << VT.getScalarSizeInBits() << ")";
-  } else if (VT.isInteger() || VT.isFloatingPoint()) {
-    OS << "LLT::scalar(" << VT.getSizeInBits() << ")";
-  } else {
-    return None;
-  }
-  OS.flush();
-  return TyStr;
+  if (VT.isVector() && VT.getVectorNumElements() != 1)
+    return LLTCodeGen(LLT::vector(VT.getVectorNumElements(), VT.getScalarSizeInBits()));
+  if (VT.isInteger() || VT.isFloatingPoint())
+    return LLTCodeGen(LLT::scalar(VT.getSizeInBits()));
+  return None;
 }
 
 static bool isTrivialOperatorNode(const TreePatternNode *N) {
@@ -110,15 +153,56 @@ static bool isTrivialOperatorNode(const TreePatternNode *N) {
 
 //===- Matchers -----------------------------------------------------------===//
 
-struct MatchAction {
-  virtual ~MatchAction() {}
-  virtual void emit(raw_ostream &OS) const = 0;
-};
+class OperandMatcher;
+class MatchAction;
 
-raw_ostream &operator<<(raw_ostream &S, const MatchAction &A) {
-  A.emit(S);
-  return S;
-}
+/// Generates code to check that a match rule matches.
+class RuleMatcher {
+  /// A list of matchers that all need to succeed for the current rule to match.
+  /// FIXME: This currently supports a single match position but could be
+  /// extended to support multiple positions to support div/rem fusion or
+  /// load-multiple instructions.
+  std::vector<std::unique_ptr<InstructionMatcher>> Matchers;
+
+  /// A list of actions that need to be taken when all predicates in this rule
+  /// have succeeded.
+  std::vector<std::unique_ptr<MatchAction>> Actions;
+
+  /// A map of instruction matchers to the local variables created by
+  /// emitCxxCaptureStmts().
+  std::map<const InstructionMatcher *, std::string> InsnVariableNames;
+
+  /// ID for the next instruction variable defined with defineInsnVar().
+  unsigned NextInsnVarID;
+
+public:
+  RuleMatcher()
+      : Matchers(), Actions(), InsnVariableNames(), NextInsnVarID(0) {}
+  RuleMatcher(RuleMatcher &&Other) = default; // Move-only (owns unique_ptrs).
+  RuleMatcher &operator=(RuleMatcher &&Other) = default;
+
+  InstructionMatcher &addInstructionMatcher();
+
+  template <class Kind, class... Args> Kind &addAction(Args &&... args);
+
+  std::string defineInsnVar(raw_ostream &OS, const InstructionMatcher &Matcher,
+                            StringRef Value);
+  StringRef getInsnVarName(const InstructionMatcher &InsnMatcher) const;
+
+  void emitCxxCapturedInsnList(raw_ostream &OS);
+  void emitCxxCaptureStmts(raw_ostream &OS, StringRef Expr);
+
+  void emit(raw_ostream &OS);
+
+  /// Compare the priority of this object and B.
+  ///
+  /// Returns true if this object is more important than B.
+  bool isHigherPriorityThan(const RuleMatcher &B) const;
+
+  /// Report the maximum number of temporary operands needed by the rule
+  /// matcher.
+  unsigned countTemporaryOperands() const;
+};
 
 template <class PredicateTy> class PredicateListMatcher {
 private:
@@ -139,6 +223,7 @@ public:
   iterator_range<typename PredicateVec::const_iterator> predicates() const {
     return make_range(predicates_begin(), predicates_end());
   }
+  typename PredicateVec::size_type predicates_size() const { return Predicates.size(); }
 
   /// Emit a C++ expression that tests whether all the predicates are met.
   template <class... Args>
@@ -153,7 +238,7 @@ public:
       OS << Separator << "(";
       Predicate->emitCxxPredicateExpr(OS, std::forward<Args>(args)...);
       OS << ")";
-      Separator = " && ";
+      Separator = " &&\n";
     }
   }
 };
@@ -166,27 +251,120 @@ public:
 /// * Operand is an MBB.
 class OperandPredicateMatcher {
 public:
+  /// This enum is used for RTTI and also defines the priority that is given to
+  /// the predicate when generating the matcher code. Kinds with higher priority
+  /// must be tested first.
+  ///
+  /// The relative priority of OPM_LLT, OPM_RegBank, and OPM_MBB do not matter
+  /// but OPM_Int must have priority over OPM_RegBank since constant integers
+  /// are represented by a virtual register defined by a G_CONSTANT instruction.
+  enum PredicateKind {
+    OPM_ComplexPattern,
+    OPM_Instruction,
+    OPM_Int,
+    OPM_LLT,
+    OPM_RegBank,
+    OPM_MBB,
+  };
+
+protected:
+  PredicateKind Kind;
+
+public:
+  OperandPredicateMatcher(PredicateKind Kind) : Kind(Kind) {}
   virtual ~OperandPredicateMatcher() {}
 
-  /// Emit a C++ expression that checks the predicate for the OpIdx operand of
-  /// the instruction given in InsnVarName.
-  virtual void emitCxxPredicateExpr(raw_ostream &OS,
-                                    const StringRef InsnVarName,
-                                    unsigned OpIdx) const = 0;
+  PredicateKind getKind() const { return Kind; }
+
+  /// Return the OperandMatcher for the specified operand or nullptr if there
+  /// isn't one by that name in this operand predicate matcher.
+  ///
+  /// InstructionOperandMatcher is the only subclass that can return non-null
+  /// for this.
+  virtual Optional<const OperandMatcher *>
+  getOptionalOperand(StringRef SymbolicName) const {
+    assert(!SymbolicName.empty() && "Cannot lookup unnamed operand");
+    return None;
+  }
+
+  /// Emit C++ statements to capture instructions into local variables.
+  ///
+  /// Only InstructionOperandMatcher needs to do anything for this method.
+  virtual void emitCxxCaptureStmts(raw_ostream &OS, RuleMatcher &Rule,
+                                   StringRef Expr) const {}
+
+  /// Emit a C++ expression that checks the predicate for the given operand.
+  virtual void emitCxxPredicateExpr(raw_ostream &OS, RuleMatcher &Rule,
+                                    StringRef OperandExpr) const = 0;
+
+  /// Compare the priority of this object and B.
+  ///
+  /// Returns true if this object is more important than B.
+  virtual bool isHigherPriorityThan(const OperandPredicateMatcher &B) const {
+    return Kind < B.Kind;
+  };
+
+  /// Report the maximum number of temporary operands needed by the predicate
+  /// matcher.
+  virtual unsigned countTemporaryOperands() const { return 0; }
 };
 
 /// Generates code to check that an operand is a particular LLT.
 class LLTOperandMatcher : public OperandPredicateMatcher {
 protected:
-  std::string Ty;
+  LLTCodeGen Ty;
 
 public:
-  LLTOperandMatcher(std::string Ty) : Ty(Ty) {}
+  LLTOperandMatcher(const LLTCodeGen &Ty)
+      : OperandPredicateMatcher(OPM_LLT), Ty(Ty) {}
 
-  void emitCxxPredicateExpr(raw_ostream &OS, const StringRef InsnVarName,
-                            unsigned OpIdx) const override {
-    OS << "MRI.getType(" << InsnVarName << ".getOperand(" << OpIdx
-       << ").getReg()) == (" << Ty << ")";
+  static bool classof(const OperandPredicateMatcher *P) {
+    return P->getKind() == OPM_LLT;
+  }
+
+  void emitCxxPredicateExpr(raw_ostream &OS, RuleMatcher &Rule,
+                            StringRef OperandExpr) const override {
+    OS << "MRI.getType(" << OperandExpr << ".getReg()) == (";
+    Ty.emitCxxConstructorCall(OS);
+    OS << ")";
+  }
+};
+
+/// Generates code to check an operand by calling the MatcherFn named in TheDef.
+class ComplexPatternOperandMatcher : public OperandPredicateMatcher {
+protected:
+  const OperandMatcher &Operand;
+  const Record &TheDef;
+
+  unsigned getNumOperands() const { // Argument count of TheDef's "Operands".
+    return TheDef.getValueAsDag("Operands")->getNumArgs();
+  }
+
+  unsigned getAllocatedTemporariesBaseID() const;
+
+public:
+  ComplexPatternOperandMatcher(const OperandMatcher &Operand,
+                               const Record &TheDef)
+      : OperandPredicateMatcher(OPM_ComplexPattern), Operand(Operand),
+        TheDef(TheDef) {}
+
+  static bool classof(const OperandPredicateMatcher *P) {
+    return P->getKind() == OPM_ComplexPattern;
+  }
+
+  void emitCxxPredicateExpr(raw_ostream &OS, RuleMatcher &Rule,
+                            StringRef OperandExpr) const override {
+    OS << TheDef.getValueAsString("MatcherFn") << "(" << OperandExpr;
+    for (unsigned I = 0; I < getNumOperands(); ++I) {
+      OS << ", "; // One temporary operand per argument of "Operands".
+      OperandPlaceholder::CreateTemporary(getAllocatedTemporariesBaseID() + I)
+          .emitCxxValueExpr(OS);
+    }
+    OS << ")";
+  }
+
+  unsigned countTemporaryOperands() const override {
+    return getNumOperands();
+  }
 };
 
@@ -196,22 +374,52 @@ protected:
   const CodeGenRegisterClass &RC;
 
 public:
-  RegisterBankOperandMatcher(const CodeGenRegisterClass &RC) : RC(RC) {}
+  RegisterBankOperandMatcher(const CodeGenRegisterClass &RC)
+      : OperandPredicateMatcher(OPM_RegBank), RC(RC) {}
 
-  void emitCxxPredicateExpr(raw_ostream &OS, const StringRef InsnVarName,
-                            unsigned OpIdx) const override {
+  static bool classof(const OperandPredicateMatcher *P) {
+    return P->getKind() == OPM_RegBank;
+  }
+
+  void emitCxxPredicateExpr(raw_ostream &OS, RuleMatcher &Rule,
+                            StringRef OperandExpr) const override {
     OS << "(&RBI.getRegBankFromRegClass(" << RC.getQualifiedName()
-       << "RegClass) == RBI.getRegBank(" << InsnVarName << ".getOperand("
-       << OpIdx << ").getReg(), MRI, TRI))";
+       << "RegClass) == RBI.getRegBank(" << OperandExpr
+       << ".getReg(), MRI, TRI))";
   }
 };
 
 /// Generates code to check that an operand is a basic block.
 class MBBOperandMatcher : public OperandPredicateMatcher {
 public:
-  void emitCxxPredicateExpr(raw_ostream &OS, const StringRef InsnVarName,
-                            unsigned OpIdx) const override {
-    OS << InsnVarName << ".getOperand(" << OpIdx << ").isMBB()";
+  MBBOperandMatcher() : OperandPredicateMatcher(OPM_MBB) {}
+
+  static bool classof(const OperandPredicateMatcher *P) {
+    return P->getKind() == OPM_MBB;
+  }
+
+  void emitCxxPredicateExpr(raw_ostream &OS, RuleMatcher &Rule,
+                            StringRef OperandExpr) const override {
+    OS << OperandExpr << ".isMBB()";
+  }
+};
+
+/// Generates code to check that an operand is a particular int.
+class IntOperandMatcher : public OperandPredicateMatcher {
+protected:
+  int64_t Value;
+
+public:
+  IntOperandMatcher(int64_t Value)
+      : OperandPredicateMatcher(OPM_Int), Value(Value) {}
+
+  static bool classof(const OperandPredicateMatcher *P) {
+    return P->getKind() == OPM_Int;
+  }
+
+  void emitCxxPredicateExpr(raw_ostream &OS, RuleMatcher &Rule,
+                            StringRef OperandExpr) const override {
+    OS << "isOperandImmEqual(" << OperandExpr << ", " << Value << ", MRI)";
   }
 };
 
@@ -219,33 +427,148 @@ public:
 /// operand.
 class OperandMatcher : public PredicateListMatcher<OperandPredicateMatcher> {
 protected:
+  InstructionMatcher &Insn;
   unsigned OpIdx;
+  std::string SymbolicName;
+
+  /// The index of the first temporary variable allocated to this operand. The
+  /// number of allocated temporaries can be found with
+  /// countTemporaryOperands().
+  unsigned AllocatedTemporariesBaseID;
 
 public:
-  OperandMatcher(unsigned OpIdx) : OpIdx(OpIdx) {}
+  OperandMatcher(InstructionMatcher &Insn, unsigned OpIdx,
+                 const std::string &SymbolicName,
+                 unsigned AllocatedTemporariesBaseID)
+      : Insn(Insn), OpIdx(OpIdx), SymbolicName(SymbolicName),
+        AllocatedTemporariesBaseID(AllocatedTemporariesBaseID) {}
+
+  bool hasSymbolicName() const { return !SymbolicName.empty(); }
+  const StringRef getSymbolicName() const { return SymbolicName; }
+  void setSymbolicName(StringRef Name) {
+    assert(SymbolicName.empty() && "Operand already has a symbolic name");
+    SymbolicName = Name;
+  }
+  unsigned getOperandIndex() const { return OpIdx; }
+
+  std::string getOperandExpr(StringRef InsnVarName) const {
+    return (InsnVarName + ".getOperand(" + llvm::to_string(OpIdx) + ")").str();
+  }
+
+  Optional<const OperandMatcher *>
+  getOptionalOperand(StringRef DesiredSymbolicName) const {
+    assert(!DesiredSymbolicName.empty() && "Cannot lookup unnamed operand");
+    if (DesiredSymbolicName == SymbolicName)
+      return this;
+    for (const auto &OP : predicates()) {
+      const auto &MaybeOperand = OP->getOptionalOperand(DesiredSymbolicName);
+      if (MaybeOperand.hasValue())
+        return MaybeOperand.getValue();
+    }
+    return None;
+  }
+
+  InstructionMatcher &getInstructionMatcher() const { return Insn; }
+
+  /// Emit C++ statements to capture instructions into local variables.
+  void emitCxxCaptureStmts(raw_ostream &OS, RuleMatcher &Rule,
+                           StringRef OperandExpr) const {
+    for (const auto &Predicate : predicates())
+      Predicate->emitCxxCaptureStmts(OS, Rule, OperandExpr);
+  }
 
   /// Emit a C++ expression that tests whether the instruction named in
   /// InsnVarName matches all the predicate and all the operands.
-  void emitCxxPredicateExpr(raw_ostream &OS, const StringRef InsnVarName) const {
-    OS << "(";
-    emitCxxPredicateListExpr(OS, InsnVarName, OpIdx);
+  void emitCxxPredicateExpr(raw_ostream &OS, RuleMatcher &Rule,
+                            StringRef InsnVarName) const {
+    OS << "(/* ";
+    if (SymbolicName.empty())
+      OS << "Operand " << OpIdx;
+    else
+      OS << SymbolicName;
+    OS << " */ ";
+    emitCxxPredicateListExpr(OS, Rule, getOperandExpr(InsnVarName));
     OS << ")";
   }
+
+  /// Compare the priority of this object and B.
+  ///
+  /// Returns true if this object is more important than B.
+  bool isHigherPriorityThan(const OperandMatcher &B) const {
+    // Operand matchers involving more predicates have higher priority.
+    if (predicates_size() > B.predicates_size())
+      return true;
+    if (predicates_size() < B.predicates_size())
+      return false;
+
+    // This assumes that predicates are added in a consistent order.
+    for (const auto &Predicate : zip(predicates(), B.predicates())) {
+      if (std::get<0>(Predicate)->isHigherPriorityThan(*std::get<1>(Predicate)))
+        return true;
+      if (std::get<1>(Predicate)->isHigherPriorityThan(*std::get<0>(Predicate)))
+        return false;
+    }
+
+    return false;
+  };
+
+  /// Report the maximum number of temporary operands needed by the operand
+  /// matcher.
+  unsigned countTemporaryOperands() const {
+    return std::accumulate(
+        predicates().begin(), predicates().end(), 0,
+        [](unsigned A,
+           const std::unique_ptr<OperandPredicateMatcher> &Predicate) {
+          return A + Predicate->countTemporaryOperands();
+        });
+  }
+
+  unsigned getAllocatedTemporariesBaseID() const {
+    return AllocatedTemporariesBaseID;
+  }
 };
 
+unsigned ComplexPatternOperandMatcher::getAllocatedTemporariesBaseID() const {
+  return Operand.getAllocatedTemporariesBaseID();
+}
+
 /// Generates code to check a predicate on an instruction.
 ///
 /// Typical predicates include:
 /// * The opcode of the instruction is a particular value.
 /// * The nsw/nuw flag is/isn't set.
 class InstructionPredicateMatcher {
+protected:
+  /// This enum is used for RTTI and also defines the priority that is given to
+  /// the predicate when generating the matcher code. Kinds with higher priority
+  /// must be tested first.
+  enum PredicateKind {
+    IPM_Opcode,
+  };
+
+  PredicateKind Kind;
+
 public:
+  InstructionPredicateMatcher(PredicateKind Kind) : Kind(Kind) {}
   virtual ~InstructionPredicateMatcher() {}
 
+  PredicateKind getKind() const { return Kind; }
+
   /// Emit a C++ expression that tests whether the instruction named in
   /// InsnVarName matches the predicate.
-  virtual void emitCxxPredicateExpr(raw_ostream &OS,
-                                    const StringRef InsnVarName) const = 0;
+  virtual void emitCxxPredicateExpr(raw_ostream &OS, RuleMatcher &Rule,
+                                    StringRef InsnVarName) const = 0;
+
+  /// Compare the priority of this object and B.
+  ///
+  /// Returns true if this object is more important than B.
+  virtual bool isHigherPriorityThan(const InstructionPredicateMatcher &B) const {
+    return Kind < B.Kind;
+  };
+
+  /// Report the maximum number of temporary operands needed by the predicate
+  /// matcher.
+  virtual unsigned countTemporaryOperands() const { return 0; }
 };
 
 /// Generates code to check the opcode of an instruction.
@@ -254,13 +577,37 @@ protected:
   const CodeGenInstruction *I;
 
 public:
-  InstructionOpcodeMatcher(const CodeGenInstruction *I) : I(I) {}
+  InstructionOpcodeMatcher(const CodeGenInstruction *I)
+      : InstructionPredicateMatcher(IPM_Opcode), I(I) {}
+
+  static bool classof(const InstructionPredicateMatcher *P) {
+    return P->getKind() == IPM_Opcode;
+  }
 
-  void emitCxxPredicateExpr(raw_ostream &OS,
-                            const StringRef InsnVarName) const override {
+  void emitCxxPredicateExpr(raw_ostream &OS, RuleMatcher &Rule,
+                            StringRef InsnVarName) const override {
     OS << InsnVarName << ".getOpcode() == " << I->Namespace
        << "::" << I->TheDef->getName();
   }
+
+  /// Compare the priority of this object and B.
+  ///
+  /// Returns true if this object is more important than B.
+  bool isHigherPriorityThan(const InstructionPredicateMatcher &B) const override {
+    if (InstructionPredicateMatcher::isHigherPriorityThan(B))
+      return true;
+    if (B.InstructionPredicateMatcher::isHigherPriorityThan(*this))
+      return false;
+
+    // Prioritize opcodes for cosmetic reasons in the generated source. Although
+    // this is cosmetic at the moment, we may want to drive a similar ordering
+    // using instruction frequency information to improve compile time.
+    if (const InstructionOpcodeMatcher *BO =
+            dyn_cast<InstructionOpcodeMatcher>(&B))
+      return I->TheDef->getName() < BO->I->TheDef->getName();
+
+    return false;
+  };
 };
 
 /// Generates code to check that a set of predicates and operands match for a
@@ -272,97 +619,629 @@ public:
 class InstructionMatcher
     : public PredicateListMatcher<InstructionPredicateMatcher> {
 protected:
-  std::vector<OperandMatcher> Operands;
+  typedef std::vector<std::unique_ptr<OperandMatcher>> OperandVec;
+
+  /// The operands to match. All rendered operands must be present even if the
+  /// condition is always true.
+  OperandVec Operands;
 
 public:
   /// Add an operand to the matcher.
-  OperandMatcher &addOperand(unsigned OpIdx) {
-    Operands.emplace_back(OpIdx);
-    return Operands.back();
+  OperandMatcher &addOperand(unsigned OpIdx, const std::string &SymbolicName,
+                             unsigned AllocatedTemporariesBaseID) {
+    Operands.emplace_back(new OperandMatcher(*this, OpIdx, SymbolicName,
+                                             AllocatedTemporariesBaseID));
+    return *Operands.back();
+  }
+
+  OperandMatcher &getOperand(unsigned OpIdx) {
+    auto I = std::find_if(Operands.begin(), Operands.end(),
+                          [&OpIdx](const std::unique_ptr<OperandMatcher> &X) {
+                            return X->getOperandIndex() == OpIdx;
+                          });
+    if (I != Operands.end())
+      return **I;
+    llvm_unreachable("Failed to lookup operand");
+  }
+
+  Optional<const OperandMatcher *>
+  getOptionalOperand(StringRef SymbolicName) const {
+    assert(!SymbolicName.empty() && "Cannot lookup unnamed operand");
+    for (const auto &Operand : Operands) {
+      const auto &OM = Operand->getOptionalOperand(SymbolicName);
+      if (OM.hasValue())
+        return OM.getValue();
+    }
+    return None;
+  }
+
+  const OperandMatcher &getOperand(StringRef SymbolicName) const {
+    Optional<const OperandMatcher *>OM = getOptionalOperand(SymbolicName);
+    if (OM.hasValue())
+      return *OM.getValue();
+    llvm_unreachable("Failed to lookup operand");
+  }
+
+  unsigned getNumOperands() const { return Operands.size(); }
+  OperandVec::iterator operands_begin() { return Operands.begin(); }
+  OperandVec::iterator operands_end() { return Operands.end(); }
+  iterator_range<OperandVec::iterator> operands() {
+    return make_range(operands_begin(), operands_end());
+  }
+  OperandVec::const_iterator operands_begin() const { return Operands.begin(); }
+  OperandVec::const_iterator operands_end() const { return Operands.end(); }
+  iterator_range<OperandVec::const_iterator> operands() const {
+    return make_range(operands_begin(), operands_end());
+  }
+
+  /// Emit C++ statements to check the shape of the match and capture
+  /// instructions into local variables.
+  void emitCxxCaptureStmts(raw_ostream &OS, RuleMatcher &Rule, StringRef Expr) {
+    OS << "if (" << Expr << ".getNumOperands() < " << getNumOperands() << ")\n"
+       << "  return false;\n";
+    for (const auto &Operand : Operands) {
+      Operand->emitCxxCaptureStmts(OS, Rule, Operand->getOperandExpr(Expr));
+    }
   }
 
   /// Emit a C++ expression that tests whether the instruction named in
   /// InsnVarName matches all the predicates and all the operands.
-  void emitCxxPredicateExpr(raw_ostream &OS, const StringRef InsnVarName) const {
-    emitCxxPredicateListExpr(OS, InsnVarName);
+  void emitCxxPredicateExpr(raw_ostream &OS, RuleMatcher &Rule,
+                            StringRef InsnVarName) const {
+    emitCxxPredicateListExpr(OS, Rule, InsnVarName);
     for (const auto &Operand : Operands) {
-      OS << " && (";
-      Operand.emitCxxPredicateExpr(OS, InsnVarName);
+      OS << " &&\n(";
+      Operand->emitCxxPredicateExpr(OS, Rule, InsnVarName);
       OS << ")";
     }
   }
+
+  /// Compare the priority of this object and B.
+  ///
+  /// Returns true if this object is more important than B.
+  bool isHigherPriorityThan(const InstructionMatcher &B) const {
+    // Instruction matchers involving more operands have higher priority.
+    if (Operands.size() > B.Operands.size())
+      return true;
+    if (Operands.size() < B.Operands.size())
+      return false;
+
+    for (const auto &Predicate : zip(predicates(), B.predicates())) {
+      if (std::get<0>(Predicate)->isHigherPriorityThan(*std::get<1>(Predicate)))
+        return true;
+      if (std::get<1>(Predicate)->isHigherPriorityThan(*std::get<0>(Predicate)))
+        return false;
+    }
+
+    for (const auto &Operand : zip(Operands, B.Operands)) {
+      if (std::get<0>(Operand)->isHigherPriorityThan(*std::get<1>(Operand)))
+        return true;
+      if (std::get<1>(Operand)->isHigherPriorityThan(*std::get<0>(Operand)))
+        return false;
+    }
+
+    return false;
+  };
+
+  /// Report the maximum number of temporary operands needed by the instruction
+  /// matcher.
+  unsigned countTemporaryOperands() const {
+    return std::accumulate(predicates().begin(), predicates().end(), 0,
+                           [](unsigned A,
+                              const std::unique_ptr<InstructionPredicateMatcher>
+                                  &Predicate) {
+                             return A + Predicate->countTemporaryOperands();
+                           }) +
+           std::accumulate(
+               Operands.begin(), Operands.end(), 0,
+               [](unsigned A, const std::unique_ptr<OperandMatcher> &Operand) {
+                 return A + Operand->countTemporaryOperands();
+               });
+  }
 };
 
-struct MutateOpcode : public MatchAction {
-  MutateOpcode(const CodeGenInstruction *I) : I(I) {}
-  const CodeGenInstruction *I;
+/// Generates code to check that the operand is a register defined by an
+/// instruction that matches the given instruction matcher.
+///
+/// For example, the pattern:
+///   (set $dst, (G_MUL (G_ADD $src1, $src2), $src3))
+/// would use an InstructionOperandMatcher for operand 1 of the G_MUL to match
+/// the:
+///   (G_ADD $src1, $src2)
+/// subpattern.
+class InstructionOperandMatcher : public OperandPredicateMatcher {
+protected:
+  std::unique_ptr<InstructionMatcher> InsnMatcher;
+
+public:
+  InstructionOperandMatcher()
+      : OperandPredicateMatcher(OPM_Instruction),
+        InsnMatcher(new InstructionMatcher()) {}
+
+  static bool classof(const OperandPredicateMatcher *P) {
+    return P->getKind() == OPM_Instruction;
+  }
+
+  InstructionMatcher &getInsnMatcher() const { return *InsnMatcher; }
 
-  virtual void emit(raw_ostream &OS) const {
-    OS << "I.setDesc(TII.get(" << I->Namespace << "::" << I->TheDef->getName()
-       << "));";
+  Optional<const OperandMatcher *>
+  getOptionalOperand(StringRef SymbolicName) const override {
+    assert(!SymbolicName.empty() && "Cannot lookup unnamed operand");
+    return InsnMatcher->getOptionalOperand(SymbolicName);
+  }
+
+  void emitCxxCaptureStmts(raw_ostream &OS, RuleMatcher &Rule,
+                           StringRef OperandExpr) const override {
+    OS << "if (!" << OperandExpr + ".isReg())\n"
+       << "  return false;\n";
+    std::string InsnVarName = Rule.defineInsnVar(
+        OS, *InsnMatcher,
+        ("*MRI.getVRegDef(" + OperandExpr + ".getReg())").str());
+    InsnMatcher->emitCxxCaptureStmts(OS, Rule, InsnVarName);
+  }
+
+  void emitCxxPredicateExpr(raw_ostream &OS, RuleMatcher &Rule,
+                            StringRef OperandExpr) const override {
+    OperandExpr = Rule.getInsnVarName(*InsnMatcher);
+    OS << "(";
+    InsnMatcher->emitCxxPredicateExpr(OS, Rule, OperandExpr);
+    OS << ")\n";
   }
 };
 
-/// Generates code to check that a match rule matches.
+//===- Actions ------------------------------------------------------------===//
+void OperandPlaceholder::emitCxxValueExpr(raw_ostream &OS) const {
+  switch (Kind) {
+  case OP_MatchReference:
+    OS << MatchReference.InsnMatcher->getOperand(MatchReference.SymbolicName)
+              .getOperandExpr(MatchReference.InsnVarName);
+    break;
+  case OP_Temporary:
+    OS << "TempOp" << Temporary.OpIdx;
+    break;
+  }
+}
+
+class OperandRenderer {
+public:
+  enum RendererKind { OR_Copy, OR_Imm, OR_Register, OR_ComplexPattern };
+
+protected:
+  RendererKind Kind;
+
+public:
+  OperandRenderer(RendererKind Kind) : Kind(Kind) {}
+  virtual ~OperandRenderer() {}
+
+  RendererKind getKind() const { return Kind; }
+
+  virtual void emitCxxRenderStmts(raw_ostream &OS, RuleMatcher &Rule) const = 0;
+};
+
+/// A CopyRenderer emits code to copy a single operand from an existing
+/// instruction to the one being built.
+class CopyRenderer : public OperandRenderer {
+protected:
+  /// The matcher for the instruction that this operand is copied from.
+  /// This provides the facility for looking up an operand by its name so
+  /// that it can be used as a source for the instruction being built.
+  const InstructionMatcher &Matched;
+  /// The name of the operand.
+  const StringRef SymbolicName;
+
+public:
+  CopyRenderer(const InstructionMatcher &Matched, StringRef SymbolicName)
+      : OperandRenderer(OR_Copy), Matched(Matched), SymbolicName(SymbolicName) {
+  }
+
+  static bool classof(const OperandRenderer *R) {
+    return R->getKind() == OR_Copy;
+  }
+
+  const StringRef getSymbolicName() const { return SymbolicName; }
+
+  void emitCxxRenderStmts(raw_ostream &OS, RuleMatcher &Rule) const override {
+    const OperandMatcher &Operand = Matched.getOperand(SymbolicName);
+    StringRef InsnVarName =
+        Rule.getInsnVarName(Operand.getInstructionMatcher());
+    std::string OperandExpr = Operand.getOperandExpr(InsnVarName);
+    OS << "    MIB.add(" << OperandExpr << "/*" << SymbolicName << "*/);\n";
+  }
+};
+
+/// Adds a specific physical register to the instruction being built.
+/// This is typically useful for WZR/XZR on AArch64.
+class AddRegisterRenderer : public OperandRenderer {
+protected:
+  const Record *RegisterDef;
+
+public:
+  AddRegisterRenderer(const Record *RegisterDef)
+      : OperandRenderer(OR_Register), RegisterDef(RegisterDef) {}
+
+  static bool classof(const OperandRenderer *R) {
+    return R->getKind() == OR_Register;
+  }
+
+  void emitCxxRenderStmts(raw_ostream &OS, RuleMatcher &Rule) const override {
+    OS << "    MIB.addReg(" << RegisterDef->getValueAsString("Namespace")
+       << "::" << RegisterDef->getName() << ");\n";
+  }
+};
+
+/// Adds a specific immediate to the instruction being built.
+class ImmRenderer : public OperandRenderer {
+protected:
+  int64_t Imm;
+
+public:
+  ImmRenderer(int64_t Imm)
+      : OperandRenderer(OR_Imm), Imm(Imm) {}
+
+  static bool classof(const OperandRenderer *R) {
+    return R->getKind() == OR_Imm;
+  }
+
+  void emitCxxRenderStmts(raw_ostream &OS, RuleMatcher &Rule) const override {
+    OS << "    MIB.addImm(" << Imm << ");\n";
+  }
+};
+
+class RenderComplexPatternOperand : public OperandRenderer {
+private:
+  const Record &TheDef;
+  std::vector<OperandPlaceholder> Sources;
+
+  unsigned getNumOperands() const {
+    return TheDef.getValueAsDag("Operands")->getNumArgs();
+  }
+
+public:
+  RenderComplexPatternOperand(const Record &TheDef,
+                              const ArrayRef<OperandPlaceholder> Sources)
+      : OperandRenderer(OR_ComplexPattern), TheDef(TheDef), Sources(Sources) {}
+
+  static bool classof(const OperandRenderer *R) {
+    return R->getKind() == OR_ComplexPattern;
+  }
+
+  void emitCxxRenderStmts(raw_ostream &OS, RuleMatcher &Rule) const override {
+    assert(Sources.size() == getNumOperands() && "Inconsistent number of operands");
+    for (const auto &Source : Sources) {
+      OS << "MIB.add(";
+      Source.emitCxxValueExpr(OS);
+      OS << ");\n";
+    }
+  }
+};
+
+/// An action taken when all Matcher predicates succeeded for a parent rule.
 ///
-/// This currently supports a single match position but could be extended to
-/// support multiple positions to support div/rem fusion or load-multiple
-/// instructions.
-class RuleMatcher {
-  const PatternToMatch &P;
+/// Typical actions include:
+/// * Changing the opcode of an instruction.
+/// * Adding an operand to an instruction.
+class MatchAction {
+public:
+  virtual ~MatchAction() {}
 
-  std::vector<std::unique_ptr<InstructionMatcher>> Matchers;
+  /// Emit the C++ statements to implement the action.
+  ///
+  /// \param RecycleVarName If given, it's an instruction to recycle. The
+  ///                       requirements on the instruction vary from action to
+  ///                       action.
+  virtual void emitCxxActionStmts(raw_ostream &OS, RuleMatcher &Rule,
+                                  StringRef RecycleVarName) const = 0;
+};
+
+/// Generates a comment describing the matched rule being acted upon.
+class DebugCommentAction : public MatchAction {
+private:
+  const PatternToMatch &P;
 
 public:
-  std::vector<std::unique_ptr<MatchAction>> Actions;
+  DebugCommentAction(const PatternToMatch &P) : P(P) {}
 
-  RuleMatcher(const PatternToMatch &P) : P(P) {}
+  void emitCxxActionStmts(raw_ostream &OS, RuleMatcher &Rule,
+                          StringRef RecycleVarName) const override {
+    OS << "// " << *P.getSrcPattern() << "  =>  " << *P.getDstPattern() << "\n";
+  }
+};
 
-  InstructionMatcher &addInstructionMatcher() {
-    Matchers.emplace_back(new InstructionMatcher());
-    return *Matchers.back();
+/// Generates code to build an instruction or mutate an existing instruction
+/// into the desired instruction when this is possible.
+class BuildMIAction : public MatchAction {
+private:
+  const CodeGenInstruction *I;
+  const InstructionMatcher &Matched;
+  std::vector<std::unique_ptr<OperandRenderer>> OperandRenderers;
+
+  /// True if the instruction can be built solely by mutating the opcode.
+  bool canMutate() const {
+    for (const auto &Renderer : enumerate(OperandRenderers)) {
+      if (const auto *Copy = dyn_cast<CopyRenderer>(&*Renderer.value())) {
+        if (Matched.getOperand(Copy->getSymbolicName()).getOperandIndex() !=
+            Renderer.index())
+          return false;
+      } else
+        return false;
+    }
+
+    return true;
   }
 
-  void emit(raw_ostream &OS) {
-    if (Matchers.empty())
-      llvm_unreachable("Unexpected empty matcher!");
+public:
+  BuildMIAction(const CodeGenInstruction *I, const InstructionMatcher &Matched)
+      : I(I), Matched(Matched) {}
 
-    OS << "  // Src: " << *P.getSrcPattern() << "\n"
-       << "  // Dst: " << *P.getDstPattern() << "\n";
+  template <class Kind, class... Args>
+  Kind &addRenderer(Args&&... args) {
+    OperandRenderers.emplace_back(
+        llvm::make_unique<Kind>(std::forward<Args>(args)...));
+    return *static_cast<Kind *>(OperandRenderers.back().get());
+  }
 
-    // The representation supports rules that require multiple roots such as:
-    //    %ptr(p0) = ...
-    //    %elt0(s32) = G_LOAD %ptr
-    //    %1(p0) = G_ADD %ptr, 4
-    //    %elt1(s32) = G_LOAD p0 %1
-    // which could be usefully folded into:
-    //    %ptr(p0) = ...
-    //    %elt0(s32), %elt1(s32) = TGT_LOAD_PAIR %ptr
-    // on some targets but we don't need to make use of that yet.
-    assert(Matchers.size() == 1 && "Cannot handle multi-root matchers yet");
-    OS << "  if (";
-    Matchers.front()->emitCxxPredicateExpr(OS, "I");
-    OS << ") {\n";
+  void emitCxxActionStmts(raw_ostream &OS, RuleMatcher &Rule,
+                          StringRef RecycleVarName) const override {
+    if (canMutate()) {
+      OS << "    " << RecycleVarName << ".setDesc(TII.get(" << I->Namespace
+         << "::" << I->TheDef->getName() << "));\n";
 
-    for (auto &MA : Actions)
-      OS << "    " << *MA << "\n";
+      if (!I->ImplicitDefs.empty() || !I->ImplicitUses.empty()) {
+        OS << "    auto MIB = MachineInstrBuilder(MF, &" << RecycleVarName
+           << ");\n";
+
+        for (auto Def : I->ImplicitDefs) {
+          auto Namespace = Def->getValueAsString("Namespace");
+          OS << "    MIB.addDef(" << Namespace << "::" << Def->getName()
+             << ", RegState::Implicit);\n";
+        }
+        for (auto Use : I->ImplicitUses) {
+          auto Namespace = Use->getValueAsString("Namespace");
+          OS << "    MIB.addUse(" << Namespace << "::" << Use->getName()
+             << ", RegState::Implicit);\n";
+        }
+      }
+
+      OS << "    MachineInstr &NewI = " << RecycleVarName << ";\n";
+      return;
+    }
 
-    OS << "    constrainSelectedInstRegOperands(I, TII, TRI, RBI);\n";
-    OS << "    return true;\n";
-    OS << "  }\n";
+    // TODO: Simple permutation looks like it could be almost as common as
+    //       mutation due to commutative operations.
+
+    OS << "MachineInstrBuilder MIB = BuildMI(*I.getParent(), I, "
+          "I.getDebugLoc(), TII.get("
+       << I->Namespace << "::" << I->TheDef->getName() << "));\n";
+    for (const auto &Renderer : OperandRenderers)
+      Renderer->emitCxxRenderStmts(OS, Rule);
+    OS << "    for (const auto *FromMI : ";
+    Rule.emitCxxCapturedInsnList(OS);
+    OS << ")\n";
+    OS << "      for (const auto &MMO : FromMI->memoperands())\n";
+    OS << "        MIB.addMemOperand(MMO);\n";
+    OS << "    " << RecycleVarName << ".eraseFromParent();\n";
+    OS << "    MachineInstr &NewI = *MIB;\n";
   }
 };
 
+InstructionMatcher &RuleMatcher::addInstructionMatcher() {
+  Matchers.emplace_back(new InstructionMatcher());
+  return *Matchers.back();
+}
+
+template <class Kind, class... Args>
+Kind &RuleMatcher::addAction(Args &&... args) {
+  Actions.emplace_back(llvm::make_unique<Kind>(std::forward<Args>(args)...));
+  return *static_cast<Kind *>(Actions.back().get());
+}
+
+std::string RuleMatcher::defineInsnVar(raw_ostream &OS,
+                                       const InstructionMatcher &Matcher,
+                                       StringRef Value) {
+  std::string InsnVarName = "MI" + llvm::to_string(NextInsnVarID++);
+  OS << "MachineInstr &" << InsnVarName << " = " << Value << ";\n";
+  InsnVariableNames[&Matcher] = InsnVarName;
+  return InsnVarName;
+}
+
+StringRef RuleMatcher::getInsnVarName(const InstructionMatcher &InsnMatcher) const {
+  const auto &I = InsnVariableNames.find(&InsnMatcher);
+  if (I != InsnVariableNames.end())
+    return I->second;
+  llvm_unreachable("Matched Insn was not captured in a local variable");
+}
+
+/// Emit a C++ initializer_list of pointers to every matched instruction.
+void RuleMatcher::emitCxxCapturedInsnList(raw_ostream &OS) {
+  SmallVector<StringRef, 2> Names;
+  for (const auto &Pair : InsnVariableNames)
+    Names.push_back(Pair.second);
+  std::sort(Names.begin(), Names.end());
+
+  OS << "{";
+  for (const auto &Name : Names)
+    OS << "&" << Name << ", ";
+  OS << "}";
+}
+
+/// Emit C++ statements to check the shape of the match and capture
+/// instructions into local variables.
+void RuleMatcher::emitCxxCaptureStmts(raw_ostream &OS, StringRef Expr) {
+  assert(Matchers.size() == 1 && "Cannot handle multi-root matchers yet");
+  std::string InsnVarName = defineInsnVar(OS, *Matchers.front(), Expr);
+  Matchers.front()->emitCxxCaptureStmts(OS, *this, InsnVarName);
+}
+
+void RuleMatcher::emit(raw_ostream &OS) {
+  if (Matchers.empty())
+    llvm_unreachable("Unexpected empty matcher!");
+
+  // The representation supports rules that require multiple roots such as:
+  //    %ptr(p0) = ...
+  //    %elt0(s32) = G_LOAD %ptr
+  //    %1(p0) = G_ADD %ptr, 4
+  //    %elt1(s32) = G_LOAD p0 %1
+  // which could be usefully folded into:
+  //    %ptr(p0) = ...
+  //    %elt0(s32), %elt1(s32) = TGT_LOAD_PAIR %ptr
+  // on some targets but we don't need to make use of that yet.
+  assert(Matchers.size() == 1 && "Cannot handle multi-root matchers yet");
+  OS << "if ([&]() {\n";
+
+  emitCxxCaptureStmts(OS, "I");
+
+  OS << "    if (";
+  Matchers.front()->emitCxxPredicateExpr(OS, *this,
+                                         getInsnVarName(*Matchers.front()));
+  OS << ") {\n";
+
+  // We must also check if it's safe to fold the matched instructions.
+  if (InsnVariableNames.size() >= 2) {
+    for (const auto &Pair : InsnVariableNames) {
+      // Skip the root node since it isn't moving anywhere. Everything else is
+      // sinking to meet it.
+      if (Pair.first == Matchers.front().get())
+        continue;
+
+      // Reject the difficult cases until we have a more accurate check.
+      OS << "      if (!isObviouslySafeToFold(" << Pair.second
+         << ")) return false;\n";
+
+      // FIXME: Emit checks to determine it's _actually_ safe to fold and/or
+      //        account for unsafe cases.
+      //
+      //        Example:
+      //          MI1--> %0 = ...
+      //                 %1 = ... %0
+      //          MI0--> %2 = ... %0
+      //          It's not safe to erase MI1. We currently handle this by not
+      //          erasing %0 (even when it's dead).
+      //
+      //        Example:
+      //          MI1--> %0 = load volatile @a
+      //                 %1 = load volatile @a
+      //          MI0--> %2 = ... %0
+      //          It's not safe to sink %0's def past %1. We currently handle
+      //          this by rejecting all loads.
+      //
+      //        Example:
+      //          MI1--> %0 = load @a
+      //                 %1 = store @a
+      //          MI0--> %2 = ... %0
+      //          It's not safe to sink %0's def past %1. We currently handle
+      //          this by rejecting all loads.
+      //
+      //        Example:
+      //                   G_CONDBR %cond, @BB1
+      //                 BB0:
+      //          MI1-->   %0 = load @a
+      //                   G_BR @BB1
+      //                 BB1:
+      //          MI0-->   %2 = ... %0
+      //          It's not always safe to sink %0 across control flow. In this
+      //          case it may introduce a memory fault. We currently handle this
+      //          by rejecting all loads.
+    }
+  }
+
+  for (const auto &MA : Actions) {
+    MA->emitCxxActionStmts(OS, *this, "I");
+  }
+
+  OS << "      constrainSelectedInstRegOperands(NewI, TII, TRI, RBI);\n";
+  OS << "      return true;\n";
+  OS << "    }\n";
+  OS << "    return false;\n";
+  OS << "  }()) { return true; }\n\n";
+}
+
+bool RuleMatcher::isHigherPriorityThan(const RuleMatcher &B) const {
+  // Rules involving more match roots have higher priority.
+  if (Matchers.size() > B.Matchers.size())
+    return true;
+  if (Matchers.size() < B.Matchers.size())
+    return false;
+
+  for (const auto &Matcher : zip(Matchers, B.Matchers)) {
+    if (std::get<0>(Matcher)->isHigherPriorityThan(*std::get<1>(Matcher)))
+      return true;
+    if (std::get<1>(Matcher)->isHigherPriorityThan(*std::get<0>(Matcher)))
+      return false;
+  }
+
+  return false;
+}
+
+unsigned RuleMatcher::countTemporaryOperands() const {
+  return std::accumulate(
+      Matchers.begin(), Matchers.end(), 0,
+      [](unsigned A, const std::unique_ptr<InstructionMatcher> &Matcher) {
+        return A + Matcher->countTemporaryOperands();
+      });
+}
+
 //===- GlobalISelEmitter class --------------------------------------------===//
 
+class GlobalISelEmitter {
+public:
+  explicit GlobalISelEmitter(RecordKeeper &RK);
+  void run(raw_ostream &OS);
+
+private:
+  const RecordKeeper &RK;
+  const CodeGenDAGPatterns CGP;
+  const CodeGenTarget &Target;
+
+  /// Keep track of the equivalence between SDNodes and Instruction.
+  /// This is defined using 'GINodeEquiv' in the target description.
+  DenseMap<Record *, const CodeGenInstruction *> NodeEquivs;
+
+  /// Keep track of the equivalence between ComplexPattern's and
+  /// GIComplexOperandMatcher. Map entries are specified by subclassing
+  /// GIComplexPatternEquiv.
+  DenseMap<const Record *, const Record *> ComplexPatternEquivs;
+
+  void gatherNodeEquivs();
+  const CodeGenInstruction *findNodeEquiv(Record *N) const;
+
+  Error importRulePredicates(RuleMatcher &M, ArrayRef<Init *> Predicates) const;
+  Expected<InstructionMatcher &>
+  createAndImportSelDAGMatcher(InstructionMatcher &InsnMatcher,
+                               const TreePatternNode *Src) const;
+  Error importChildMatcher(InstructionMatcher &InsnMatcher,
+                           TreePatternNode *SrcChild, unsigned OpIdx,
+                           unsigned &TempOpIdx) const;
+  Expected<BuildMIAction &> createAndImportInstructionRenderer(
+      RuleMatcher &M, const TreePatternNode *Dst,
+      const InstructionMatcher &InsnMatcher) const;
+  Error importExplicitUseRenderer(BuildMIAction &DstMIBuilder,
+                                  TreePatternNode *DstChild,
+                                  const InstructionMatcher &InsnMatcher) const;
+  Error
+  importImplicitDefRenderers(BuildMIAction &DstMIBuilder,
+                             const std::vector<Record *> &ImplicitDefs) const;
+
+  /// Analyze pattern \p P, returning a matcher for it if possible.
+  /// Otherwise, return an Error explaining why we don't support it.
+  Expected<RuleMatcher> runOnPattern(const PatternToMatch &P);
+};
+
 void GlobalISelEmitter::gatherNodeEquivs() {
   assert(NodeEquivs.empty());
   for (Record *Equiv : RK.getAllDerivedDefinitions("GINodeEquiv"))
     NodeEquivs[Equiv->getValueAsDef("Node")] =
         &Target.getInstruction(Equiv->getValueAsDef("I"));
+
+  assert(ComplexPatternEquivs.empty());
+  for (Record *Equiv : RK.getAllDerivedDefinitions("GIComplexPatternEquiv")) {
+    Record *SelDAGEquiv = Equiv->getValueAsDef("SelDAGEquivalent");
+    if (!SelDAGEquiv)
+      continue;
+    ComplexPatternEquivs[SelDAGEquiv] = Equiv;
+ }
 }
 
-const CodeGenInstruction *GlobalISelEmitter::findNodeEquiv(Record *N) {
+const CodeGenInstruction *GlobalISelEmitter::findNodeEquiv(Record *N) const {
   return NodeEquivs.lookup(N);
 }
 
@@ -371,129 +1250,357 @@ GlobalISelEmitter::GlobalISelEmitter(RecordKeeper &RK)
 
 //===- Emitter ------------------------------------------------------------===//
 
-Optional<GlobalISelEmitter::SkipReason>
-GlobalISelEmitter::runOnPattern(const PatternToMatch &P, raw_ostream &OS) {
-
-  // Keep track of the matchers and actions to emit.
-  RuleMatcher M(P);
-
-  // First, analyze the whole pattern.
-  // If the entire pattern has a predicate (e.g., target features), ignore it.
-  if (!P.getPredicates()->getValues().empty())
-    return SkipReason{"Pattern has a predicate"};
-
-  // Physreg imp-defs require additional logic.  Ignore the pattern.
-  if (!P.getDstRegs().empty())
-    return SkipReason{"Pattern defines a physical register"};
-
-  // Next, analyze the pattern operators.
-  TreePatternNode *Src = P.getSrcPattern();
-  TreePatternNode *Dst = P.getDstPattern();
-
-  // If the root of either pattern isn't a simple operator, ignore it.
-  if (!isTrivialOperatorNode(Dst))
-    return SkipReason{"Dst pattern root isn't a trivial operator"};
-  if (!isTrivialOperatorNode(Src))
-    return SkipReason{"Src pattern root isn't a trivial operator"};
+/// Helper function to let the emitter report skip reason error messages.
+static Error failedImport(const Twine &Reason) {
+  return make_error<StringError>(Reason, inconvertibleErrorCode());
+}
 
-  Record *DstOp = Dst->getOperator();
-  if (!DstOp->isSubClassOf("Instruction"))
-    return SkipReason{"Pattern operator isn't an instruction"};
+Error
+GlobalISelEmitter::importRulePredicates(RuleMatcher &M,
+                                        ArrayRef<Init *> Predicates) const {
+  if (!Predicates.empty())
+    return failedImport("Pattern has a predicate");
+  return Error::success();
+}
 
-  auto &DstI = Target.getInstruction(DstOp);
+Expected<InstructionMatcher &> GlobalISelEmitter::createAndImportSelDAGMatcher(
+    InstructionMatcher &InsnMatcher, const TreePatternNode *Src) const {
+  // Start with the defined operands (i.e., the results of the root operator).
+  if (Src->getExtTypes().size() > 1)
+    return failedImport("Src pattern has multiple results");
 
   auto SrcGIOrNull = findNodeEquiv(Src->getOperator());
   if (!SrcGIOrNull)
-    return SkipReason{"Pattern operator lacks an equivalent Instruction"};
+    return failedImport("Pattern operator lacks an equivalent Instruction");
   auto &SrcGI = *SrcGIOrNull;
 
   // The operators look good: match the opcode and mutate it to the new one.
-  InstructionMatcher &InsnMatcher = M.addInstructionMatcher();
   InsnMatcher.addPredicate<InstructionOpcodeMatcher>(&SrcGI);
-  M.Actions.emplace_back(new MutateOpcode(&DstI));
-
-  // Next, analyze the children, only accepting patterns that don't require
-  // any change to operands.
-  if (Src->getNumChildren() != Dst->getNumChildren())
-    return SkipReason{"Src/dst patterns have a different # of children"};
 
   unsigned OpIdx = 0;
-
-  // Start with the defined operands (i.e., the results of the root operator).
-  if (DstI.Operands.NumDefs != Src->getExtTypes().size())
-    return SkipReason{"Src pattern results and dst MI defs are different"};
-
+  unsigned TempOpIdx = 0;
   for (const EEVT::TypeSet &Ty : Src->getExtTypes()) {
-    Record *DstIOpRec = DstI.Operands[OpIdx].Rec;
-    if (!DstIOpRec->isSubClassOf("RegisterClass"))
-      return SkipReason{"Dst MI def isn't a register class"};
-
     auto OpTyOrNone = MVTToLLT(Ty.getConcrete());
+
     if (!OpTyOrNone)
-      return SkipReason{"Dst operand has an unsupported type"};
+      return failedImport(
+          "Result of Src pattern operator has an unsupported type");
 
-    OperandMatcher &OM = InsnMatcher.addOperand(OpIdx);
+    // Results don't have a name unless they are the root node. The caller will
+    // set the name if appropriate.
+    OperandMatcher &OM = InsnMatcher.addOperand(OpIdx++, "", TempOpIdx);
     OM.addPredicate<LLTOperandMatcher>(*OpTyOrNone);
-    OM.addPredicate<RegisterBankOperandMatcher>(
-        Target.getRegisterClass(DstIOpRec));
-    ++OpIdx;
   }
 
-  // Finally match the used operands (i.e., the children of the root operator).
+  // Match the used operands (i.e. the children of the operator).
   for (unsigned i = 0, e = Src->getNumChildren(); i != e; ++i) {
-    auto *SrcChild = Src->getChild(i);
-    auto *DstChild = Dst->getChild(i);
-
-    // Patterns can reorder operands.  Ignore those for now.
-    if (SrcChild->getName() != DstChild->getName())
-      return SkipReason{"Src/dst pattern children not in same order"};
-
-    // The only non-leaf child we accept is 'bb': it's an operator because
-    // BasicBlockSDNode isn't inline, but in MI it's just another operand.
-    if (!SrcChild->isLeaf()) {
-      if (DstChild->isLeaf() ||
-          SrcChild->getOperator() != DstChild->getOperator())
-        return SkipReason{"Src/dst pattern child operator mismatch"};
-
-      if (SrcChild->getOperator()->isSubClassOf("SDNode")) {
-        auto &ChildSDNI = CGP.getSDNodeInfo(SrcChild->getOperator());
-        if (ChildSDNI.getSDClassName() == "BasicBlockSDNode") {
-          InsnMatcher.addOperand(OpIdx++).addPredicate<MBBOperandMatcher>();
-          continue;
-        }
+    if (auto Error = importChildMatcher(InsnMatcher, Src->getChild(i), OpIdx++,
+                                        TempOpIdx))
+      return std::move(Error);
+  }
+
+  return InsnMatcher;
+}
+
+Error GlobalISelEmitter::importChildMatcher(InstructionMatcher &InsnMatcher,
+                                            TreePatternNode *SrcChild,
+                                            unsigned OpIdx,
+                                            unsigned &TempOpIdx) const {
+  OperandMatcher &OM =
+      InsnMatcher.addOperand(OpIdx, SrcChild->getName(), TempOpIdx);
+
+  if (SrcChild->hasAnyPredicate())
+    return failedImport("Src pattern child has predicate");
+
+  ArrayRef<EEVT::TypeSet> ChildTypes = SrcChild->getExtTypes();
+  if (ChildTypes.size() != 1)
+    return failedImport("Src pattern child has multiple results");
+
+  // Check for MBBs before the type check since they are not a known type.
+  if (!SrcChild->isLeaf()) {
+    if (SrcChild->getOperator()->isSubClassOf("SDNode")) {
+      auto &ChildSDNI = CGP.getSDNodeInfo(SrcChild->getOperator());
+      if (ChildSDNI.getSDClassName() == "BasicBlockSDNode") {
+        OM.addPredicate<MBBOperandMatcher>();
+        return Error::success();
       }
-      return SkipReason{"Src pattern child isn't a leaf node"};
     }
+  }
+
+  auto OpTyOrNone = MVTToLLT(ChildTypes.front().getConcrete());
+  if (!OpTyOrNone)
+    return failedImport("Src operand has an unsupported type");
+  OM.addPredicate<LLTOperandMatcher>(*OpTyOrNone);
+
+  // Check for nested instructions.
+  if (!SrcChild->isLeaf()) {
+    // Map the node to a gMIR instruction.
+    InstructionOperandMatcher &InsnOperand =
+        OM.addPredicate<InstructionOperandMatcher>();
+    auto InsnMatcherOrError =
+        createAndImportSelDAGMatcher(InsnOperand.getInsnMatcher(), SrcChild);
+    if (auto Error = InsnMatcherOrError.takeError())
+      return Error;
+
+    return Error::success();
+  }
 
-    if (SrcChild->getLeafValue() != DstChild->getLeafValue())
-      return SkipReason{"Src/dst pattern child leaf mismatch"};
+  // Check for constant immediates.
+  if (auto *ChildInt = dyn_cast<IntInit>(SrcChild->getLeafValue())) {
+    OM.addPredicate<IntOperandMatcher>(ChildInt->getValue());
+    return Error::success();
+  }
 
-    // Otherwise, we're looking for a bog-standard RegisterClass operand.
-    if (SrcChild->hasAnyPredicate())
-      return SkipReason{"Src pattern child has predicate"};
-    auto *ChildRec = cast<DefInit>(SrcChild->getLeafValue())->getDef();
-    if (!ChildRec->isSubClassOf("RegisterClass"))
-      return SkipReason{"Src pattern child isn't a RegisterClass"};
+  // Check for defs such as register classes or ComplexPatterns.
+  if (auto *ChildDefInit = dyn_cast<DefInit>(SrcChild->getLeafValue())) {
+    auto *ChildRec = ChildDefInit->getDef();
 
-    ArrayRef<EEVT::TypeSet> ChildTypes = SrcChild->getExtTypes();
+    // Check for register classes.
+    if (ChildRec->isSubClassOf("RegisterClass")) {
+      OM.addPredicate<RegisterBankOperandMatcher>(
+          Target.getRegisterClass(ChildRec));
+      return Error::success();
+    }
+
+    // Check for ComplexPatterns.
+    if (ChildRec->isSubClassOf("ComplexPattern")) {
+      const auto &ComplexPattern = ComplexPatternEquivs.find(ChildRec);
+      if (ComplexPattern == ComplexPatternEquivs.end())
+        return failedImport(
+            "SelectionDAG ComplexPattern not mapped to GlobalISel");
+
+      const auto &Predicate = OM.addPredicate<ComplexPatternOperandMatcher>(
+          OM, *ComplexPattern->second);
+      TempOpIdx += Predicate.countTemporaryOperands();
+      return Error::success();
+    }
+
+    return failedImport(
+        "Src pattern child def is an unsupported tablegen class");
+  }
+
+  return failedImport("Src pattern child is an unsupported kind");
+}
+
+Error GlobalISelEmitter::importExplicitUseRenderer(
+    BuildMIAction &DstMIBuilder, TreePatternNode *DstChild,
+    const InstructionMatcher &InsnMatcher) const {
+  // The only non-leaf child we accept is 'bb': it's an operator because
+  // BasicBlockSDNode isn't inline, but in MI it's just another operand.
+  if (!DstChild->isLeaf()) {
+    if (DstChild->getOperator()->isSubClassOf("SDNode")) {
+      auto &ChildSDNI = CGP.getSDNodeInfo(DstChild->getOperator());
+      if (ChildSDNI.getSDClassName() == "BasicBlockSDNode") {
+        DstMIBuilder.addRenderer<CopyRenderer>(InsnMatcher,
+                                               DstChild->getName());
+        return Error::success();
+      }
+    }
+    return failedImport("Dst pattern child isn't a leaf node or an MBB");
+  }
+
+  // Otherwise, the child must be a Register, RegisterClass or ComplexPattern.
+  if (DstChild->hasAnyPredicate())
+    return failedImport("Dst pattern child has predicate");
+
+  if (auto *ChildDefInit = dyn_cast<DefInit>(DstChild->getLeafValue())) {
+    auto *ChildRec = ChildDefInit->getDef();
+
+    ArrayRef<EEVT::TypeSet> ChildTypes = DstChild->getExtTypes();
     if (ChildTypes.size() != 1)
-      return SkipReason{"Src pattern child has multiple results"};
+      return failedImport("Dst pattern child has multiple results");
 
     auto OpTyOrNone = MVTToLLT(ChildTypes.front().getConcrete());
     if (!OpTyOrNone)
-      return SkipReason{"Src operand has an unsupported type"};
+      return failedImport("Dst operand has an unsupported type");
 
-    OperandMatcher &OM = InsnMatcher.addOperand(OpIdx);
-    OM.addPredicate<LLTOperandMatcher>(*OpTyOrNone);
+    if (ChildRec->isSubClassOf("Register")) {
+      DstMIBuilder.addRenderer<AddRegisterRenderer>(ChildRec);
+      return Error::success();
+    }
+
+    if (ChildRec->isSubClassOf("RegisterClass")) {
+      DstMIBuilder.addRenderer<CopyRenderer>(InsnMatcher, DstChild->getName());
+      return Error::success();
+    }
+
+    if (ChildRec->isSubClassOf("ComplexPattern")) {
+      const auto &ComplexPattern = ComplexPatternEquivs.find(ChildRec);
+      if (ComplexPattern == ComplexPatternEquivs.end())
+        return failedImport(
+            "SelectionDAG ComplexPattern not mapped to GlobalISel");
+
+      SmallVector<OperandPlaceholder, 2> RenderedOperands;
+      const OperandMatcher &OM = InsnMatcher.getOperand(DstChild->getName());
+      for (unsigned I = 0; I < OM.countTemporaryOperands(); ++I)
+        RenderedOperands.push_back(OperandPlaceholder::CreateTemporary(
+            OM.getAllocatedTemporariesBaseID() + I));
+      DstMIBuilder.addRenderer<RenderComplexPatternOperand>(
+          *ComplexPattern->second, RenderedOperands);
+      return Error::success();
+    }
+
+    return failedImport(
+        "Dst pattern child def is an unsupported tablegen class");
+  }
+
+  return failedImport("Dst pattern child is an unsupported kind");
+}
+
+Expected<BuildMIAction &> GlobalISelEmitter::createAndImportInstructionRenderer(
+    RuleMatcher &M, const TreePatternNode *Dst,
+    const InstructionMatcher &InsnMatcher) const {
+  Record *DstOp = Dst->getOperator();
+  if (!DstOp->isSubClassOf("Instruction"))
+    return failedImport("Pattern operator isn't an instruction");
+  auto &DstI = Target.getInstruction(DstOp);
+
+  auto &DstMIBuilder = M.addAction<BuildMIAction>(&DstI, InsnMatcher);
+
+  // Render the explicit defs.
+  for (unsigned I = 0; I < DstI.Operands.NumDefs; ++I) {
+    const auto &DstIOperand = DstI.Operands[I];
+    DstMIBuilder.addRenderer<CopyRenderer>(InsnMatcher, DstIOperand.Name);
+  }
+
+  // Figure out which operands need defaults inserted. Operands that subclass
+  // OperandWithDefaultOps are considered from left to right until we have
+  // enough operands to render the instruction.
+  SmallSet<unsigned, 2> DefaultOperands;
+  unsigned DstINumUses = DstI.Operands.size() - DstI.Operands.NumDefs;
+  unsigned NumDefaultOperands = 0;
+  for (unsigned I = 0; I < DstINumUses &&
+                       DstINumUses > Dst->getNumChildren() + NumDefaultOperands;
+       ++I) {
+    const auto &DstIOperand = DstI.Operands[DstI.Operands.NumDefs + I];
+    if (DstIOperand.Rec->isSubClassOf("OperandWithDefaultOps")) {
+      DefaultOperands.insert(I);
+      NumDefaultOperands +=
+          DstIOperand.Rec->getValueAsDag("DefaultOps")->getNumArgs();
+    }
+  }
+  if (DstINumUses > Dst->getNumChildren() + DefaultOperands.size())
+    return failedImport("Insufficient operands supplied and default ops "
+                        "couldn't make up the shortfall");
+  if (DstINumUses < Dst->getNumChildren() + DefaultOperands.size())
+    return failedImport("Too many operands supplied");
+
+  // Render the explicit uses.
+  unsigned Child = 0;
+  for (unsigned I = 0; I != DstINumUses; ++I) {
+    // If we need to insert default ops here, then do so.
+    if (DefaultOperands.count(I)) {
+      const auto &DstIOperand = DstI.Operands[DstI.Operands.NumDefs + I];
+
+      DagInit *DefaultOps = DstIOperand.Rec->getValueAsDag("DefaultOps");
+      for (const auto *DefaultOp : DefaultOps->args()) {
+        // Look through ValueType operators.
+        if (const DagInit *DefaultDagOp = dyn_cast<DagInit>(DefaultOp)) {
+          if (const DefInit *DefaultDagOperator =
+                  dyn_cast<DefInit>(DefaultDagOp->getOperator())) {
+            if (DefaultDagOperator->getDef()->isSubClassOf("ValueType"))
+              DefaultOp = DefaultDagOp->getArg(0);
+          }
+        }
+
+        if (const DefInit *DefaultDefOp = dyn_cast<DefInit>(DefaultOp)) {
+          DstMIBuilder.addRenderer<AddRegisterRenderer>(DefaultDefOp->getDef());
+          continue;
+        }
+
+        if (const IntInit *DefaultIntOp = dyn_cast<IntInit>(DefaultOp)) {
+          DstMIBuilder.addRenderer<ImmRenderer>(DefaultIntOp->getValue());
+          continue;
+        }
+
+        return failedImport("Could not add default op");
+      }
+
+      continue;
+    }
+
+    if (auto Error = importExplicitUseRenderer(
+            DstMIBuilder, Dst->getChild(Child), InsnMatcher))
+      return std::move(Error);
+    ++Child;
+  }
+
+  return DstMIBuilder;
+}
+
+Error GlobalISelEmitter::importImplicitDefRenderers(
+    BuildMIAction &DstMIBuilder,
+    const std::vector<Record *> &ImplicitDefs) const {
+  if (!ImplicitDefs.empty())
+    return failedImport("Pattern defines a physical register");
+  return Error::success();
+}
+
+Expected<RuleMatcher> GlobalISelEmitter::runOnPattern(const PatternToMatch &P) {
+  // Keep track of the matchers and actions to emit.
+  RuleMatcher M;
+  M.addAction<DebugCommentAction>(P);
+
+  if (auto Error = importRulePredicates(M, P.getPredicates()->getValues()))
+    return std::move(Error);
+
+  // Next, analyze the pattern operators.
+  TreePatternNode *Src = P.getSrcPattern();
+  TreePatternNode *Dst = P.getDstPattern();
+
+  // If the root of either pattern isn't a simple operator, ignore it.
+  if (!isTrivialOperatorNode(Dst))
+    return failedImport("Dst pattern root isn't a trivial operator");
+  if (!isTrivialOperatorNode(Src))
+    return failedImport("Src pattern root isn't a trivial operator");
+
+  // The Dst root must be an Instruction with one def per Src pattern result.
+  Record *DstOp = Dst->getOperator();
+  if (!DstOp->isSubClassOf("Instruction"))
+    return failedImport("Pattern operator isn't an instruction");
+
+  auto &DstI = Target.getInstruction(DstOp);
+  if (DstI.Operands.NumDefs != Src->getExtTypes().size())
+    return failedImport("Src pattern results and dst MI defs are different");
+
+  InstructionMatcher &InsnMatcherTemp = M.addInstructionMatcher();
+  auto InsnMatcherOrError = createAndImportSelDAGMatcher(InsnMatcherTemp, Src);
+  if (auto Error = InsnMatcherOrError.takeError())
+    return std::move(Error);
+  InstructionMatcher &InsnMatcher = InsnMatcherOrError.get();
+
+  // The root of the match also has constraints on the register bank so that it
+  // matches the result instruction.
+  unsigned OpIdx = 0;
+  for (const EEVT::TypeSet &Ty : Src->getExtTypes()) {
+    (void)Ty;
+
+    const auto &DstIOperand = DstI.Operands[OpIdx];
+    Record *DstIOpRec = DstIOperand.Rec;
+    if (!DstIOpRec->isSubClassOf("RegisterClass"))
+      return failedImport("Dst MI def isn't a register class");
+
+    OperandMatcher &OM = InsnMatcher.getOperand(OpIdx);
+    OM.setSymbolicName(DstIOperand.Name);
     OM.addPredicate<RegisterBankOperandMatcher>(
-        Target.getRegisterClass(ChildRec));
+        Target.getRegisterClass(DstIOpRec));
     ++OpIdx;
   }
 
-  // We're done with this pattern!  Emit the processed result.
-  M.emit(OS);
-  ++NumPatternEmitted;
-  return None;
+  auto DstMIBuilderOrError =
+      createAndImportInstructionRenderer(M, Dst, InsnMatcher);
+  if (auto Error = DstMIBuilderOrError.takeError())
+    return std::move(Error);
+  BuildMIAction &DstMIBuilder = DstMIBuilderOrError.get();
+
+  // Render the implicit defs.
+  // These are only added to the root of the result.
+  if (auto Error = importImplicitDefRenderers(DstMIBuilder, P.getDstRegs()))
+    return std::move(Error);
+
+  // We're done with this pattern!  It's eligible for GISel emission; return it.
+  ++NumPatternImported;
+  return std::move(M);
 }
 
 void GlobalISelEmitter::run(raw_ostream &OS) {
@@ -502,26 +1609,71 @@ void GlobalISelEmitter::run(raw_ostream &OS) {
 
   emitSourceFileHeader(("Global Instruction Selector for the " +
                        Target.getName() + " target").str(), OS);
-  OS << "bool " << Target.getName()
-     << "InstructionSelector::selectImpl"
-        "(MachineInstr &I) const {\n  const MachineRegisterInfo &MRI = "
-        "I.getParent()->getParent()->getRegInfo();\n";
-
+  std::vector<RuleMatcher> Rules;
   // Look through the SelectionDAG patterns we found, possibly emitting some.
   for (const PatternToMatch &Pat : CGP.ptms()) {
     ++NumPatternTotal;
-    if (auto SkipReason = runOnPattern(Pat, OS)) {
+    auto MatcherOrErr = runOnPattern(Pat);
+
+    // The pattern analysis can fail, indicating an unsupported pattern.
+    // Report that if we've been asked to do so.
+    if (auto Err = MatcherOrErr.takeError()) {
       if (WarnOnSkippedPatterns) {
         PrintWarning(Pat.getSrcRecord()->getLoc(),
-                     "Skipped pattern: " + SkipReason->Reason);
+                     "Skipped pattern: " + toString(std::move(Err)));
+      } else {
+        consumeError(std::move(Err));
       }
-      ++NumPatternSkipped;
+      ++NumPatternImportsSkipped;
+      continue;
     }
+
+    Rules.push_back(std::move(MatcherOrErr.get()));
+  }
+
+  std::stable_sort(Rules.begin(), Rules.end(),
+            [&](const RuleMatcher &A, const RuleMatcher &B) {
+              if (A.isHigherPriorityThan(B)) {
+                assert(!B.isHigherPriorityThan(A) && "Cannot be more important "
+                                                     "and less important at "
+                                                     "the same time");
+                return true;
+              }
+              return false;
+            });
+
+  unsigned MaxTemporaries = 0;
+  for (const auto &Rule : Rules)
+    MaxTemporaries = std::max(MaxTemporaries, Rule.countTemporaryOperands());
+
+  OS << "#ifdef GET_GLOBALISEL_TEMPORARIES_DECL\n";
+  for (unsigned I = 0; I < MaxTemporaries; ++I)
+    OS << "  mutable MachineOperand TempOp" << I << ";\n";
+  OS << "#endif // ifdef GET_GLOBALISEL_TEMPORARIES_DECL\n\n";
+
+  OS << "#ifdef GET_GLOBALISEL_TEMPORARIES_INIT\n";
+  for (unsigned I = 0; I < MaxTemporaries; ++I)
+    OS << ", TempOp" << I << "(MachineOperand::CreatePlaceholder())\n";
+  OS << "#endif // ifdef GET_GLOBALISEL_TEMPORARIES_INIT\n\n";
+
+  OS << "#ifdef GET_GLOBALISEL_IMPL\n"
+     << "bool " << Target.getName()
+     << "InstructionSelector::selectImpl(MachineInstr &I) const {\n"
+     << "  MachineFunction &MF = *I.getParent()->getParent();\n"
+     << "  const MachineRegisterInfo &MRI = MF.getRegInfo();\n";
+
+  for (auto &Rule : Rules) {
+    Rule.emit(OS);
+    ++NumPatternEmitted;
   }
 
-  OS << "  return false;\n}\n";
+  OS << "  return false;\n"
+     << "}\n"
+     << "#endif // ifdef GET_GLOBALISEL_IMPL\n";
 }
 
+} // end anonymous namespace
+
 //===----------------------------------------------------------------------===//
 
 namespace llvm {
diff --git a/utils/TableGen/IntrinsicEmitter.cpp b/utils/TableGen/IntrinsicEmitter.cpp
index eb30ddcb28025b441b83ca9ba931bda3faa1d33e..e9dd2fa0aca00f886da5530d612bf14cd2e3ed0d 100644
--- a/utils/TableGen/IntrinsicEmitter.cpp
+++ b/utils/TableGen/IntrinsicEmitter.cpp
@@ -497,10 +497,10 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
   OS << "// Add parameter attributes that are not common to all intrinsics.\n";
   OS << "#ifdef GET_INTRINSIC_ATTRIBUTES\n";
   if (TargetOnly)
-    OS << "static AttributeSet getAttributes(LLVMContext &C, " << TargetPrefix
+    OS << "static AttributeList getAttributes(LLVMContext &C, " << TargetPrefix
        << "Intrinsic::ID id) {\n";
   else
-    OS << "AttributeSet Intrinsic::getAttributes(LLVMContext &C, ID id) {\n";
+    OS << "AttributeList Intrinsic::getAttributes(LLVMContext &C, ID id) {\n";
 
   // Compute the maximum number of attribute arguments and the map
   typedef std::map<const CodeGenIntrinsic*, unsigned,
@@ -518,7 +518,7 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
     N = ++AttrNum;
   }
 
-  // Emit an array of AttributeSet.  Most intrinsics will have at least one
+  // Emit an array of AttributeList.  Most intrinsics will have at least one
   // entry, for the function itself (index ~1), which is usually nounwind.
   OS << "  static const uint8_t IntrinsicsToAttributesMap[] = {\n";
 
@@ -530,7 +530,7 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
   }
   OS << "  };\n\n";
 
-  OS << "  AttributeSet AS[" << maxArgAttrs+1 << "];\n";
+  OS << "  AttributeList AS[" << maxArgAttrs + 1 << "];\n";
   OS << "  unsigned NumAttrs = 0;\n";
   OS << "  if (id != 0) {\n";
   OS << "    switch(IntrinsicsToAttributesMap[id - ";
@@ -595,8 +595,8 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
           ++ai;
         } while (ai != ae && intrinsic.ArgumentAttributes[ai].first == argNo);
         OS << "};\n";
-        OS << "      AS[" << numAttrs++ << "] = AttributeSet::get(C, "
-           << argNo+1 << ", AttrParam" << argNo +1 << ");\n";
+        OS << "      AS[" << numAttrs++ << "] = AttributeList::get(C, "
+           << argNo + 1 << ", AttrParam" << argNo + 1 << ");\n";
       }
     }
 
@@ -699,8 +699,8 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
         break;
       }
       OS << "};\n";
-      OS << "      AS[" << numAttrs++ << "] = AttributeSet::get(C, "
-         << "AttributeSet::FunctionIndex, Atts);\n";
+      OS << "      AS[" << numAttrs++ << "] = AttributeList::get(C, "
+         << "AttributeList::FunctionIndex, Atts);\n";
     }
 
     if (numAttrs) {
@@ -708,14 +708,14 @@ void IntrinsicEmitter::EmitAttributes(const CodeGenIntrinsicTable &Ints,
       OS << "      break;\n";
       OS << "      }\n";
     } else {
-      OS << "      return AttributeSet();\n";
+      OS << "      return AttributeList();\n";
       OS << "      }\n";
     }
   }
 
   OS << "    }\n";
   OS << "  }\n";
-  OS << "  return AttributeSet::get(C, makeArrayRef(AS, NumAttrs));\n";
+  OS << "  return AttributeList::get(C, makeArrayRef(AS, NumAttrs));\n";
   OS << "}\n";
   OS << "#endif // GET_INTRINSIC_ATTRIBUTES\n\n";
 }
diff --git a/utils/TableGen/RegisterInfoEmitter.cpp b/utils/TableGen/RegisterInfoEmitter.cpp
index b75be13c0480e62e8d51941c3003a62459a4360b..5b56578a64b3bd720087a76a489ed949ba8085a8 100644
--- a/utils/TableGen/RegisterInfoEmitter.cpp
+++ b/utils/TableGen/RegisterInfoEmitter.cpp
@@ -1023,18 +1023,14 @@ RegisterInfoEmitter::runMCDesc(raw_ostream &OS, CodeGenTarget &Target,
      << "MCRegisterClasses[] = {\n";
 
   for (const auto &RC : RegisterClasses) {
-    // Asserts to make sure values will fit in table assuming types from
-    // MCRegisterInfo.h
-    assert((RC.SpillSize/8) <= 0xffff && "SpillSize too large.");
-    assert((RC.SpillAlignment/8) <= 0xffff && "SpillAlignment too large.");
-    assert(RC.CopyCost >= -128 && RC.CopyCost <= 127 && "Copy cost too large.");
-
+    assert(isInt<8>(RC.CopyCost) && "Copy cost too large.");
+    // Register size and spill size will become independent, but are not at
+    // the moment. For now use SpillSize as the register size.
     OS << "  { " << RC.getName() << ", " << RC.getName() << "Bits, "
        << RegClassStrings.get(RC.getName()) << ", "
        << RC.getOrder().size() << ", sizeof(" << RC.getName() << "Bits), "
        << RC.getQualifiedName() + "RegClassID" << ", "
        << RC.SpillSize/8 << ", "
-       << RC.SpillAlignment/8 << ", "
        << RC.CopyCost << ", "
        << ( RC.Allocatable ? "true" : "false" ) << " },\n";
   }
@@ -1316,9 +1312,13 @@ RegisterInfoEmitter::runTargetDesc(raw_ostream &OS, CodeGenTarget &Target,
        << " {   // Register class instances\n";
 
     for (const auto &RC : RegisterClasses) {
+      assert(isUInt<16>(RC.SpillSize/8) && "SpillSize too large.");
+      assert(isUInt<16>(RC.SpillAlignment/8) && "SpillAlignment too large.");
       OS << "  extern const TargetRegisterClass " << RC.getName()
          << "RegClass = {\n    " << '&' << Target.getName()
          << "MCRegisterClasses[" << RC.getName() << "RegClassID],\n    "
+         << RC.SpillSize/8 << ", /* SpillSize */\n    "
+         << RC.SpillAlignment/8 << ", /* SpillAlignment */\n    "
          << "VTLists + " << VTSeqs.get(RC.VTs) << ",\n    " << RC.getName()
          << "SubClassMask,\n    SuperRegIdxSeqs + "
          << SuperRegIdxSeqs.get(SuperRegIdxLists[RC.EnumValue]) << ",\n    ";
diff --git a/utils/TableGen/SubtargetEmitter.cpp b/utils/TableGen/SubtargetEmitter.cpp
index bf7b392b15e58238f293392e2621233350b8e53f..30516ef5d10de648af03a2a0c49f3491895a0b2a 100644
--- a/utils/TableGen/SubtargetEmitter.cpp
+++ b/utils/TableGen/SubtargetEmitter.cpp
@@ -917,6 +917,8 @@ void SubtargetEmitter::GenSchedClassTables(const CodeGenProcModel &ProcModel,
         SCDesc.NumMicroOps += WriteRes->getValueAsInt("NumMicroOps");
         SCDesc.BeginGroup |= WriteRes->getValueAsBit("BeginGroup");
         SCDesc.EndGroup |= WriteRes->getValueAsBit("EndGroup");
+        SCDesc.BeginGroup |= WriteRes->getValueAsBit("SingleIssue");
+        SCDesc.EndGroup |= WriteRes->getValueAsBit("SingleIssue");
 
         // Create an entry for each ProcResource listed in WriteRes.
         RecVec PRVec = WriteRes->getValueAsListOfDefs("ProcResources");
diff --git a/utils/TableGen/SubtargetFeatureInfo.cpp b/utils/TableGen/SubtargetFeatureInfo.cpp
index 6c2e8b53c48cdc4e6d45f42497b3bf7c8991ad46..72a556182b1dcdbf9575acbfef656b3c361d42d6 100644
--- a/utils/TableGen/SubtargetFeatureInfo.cpp
+++ b/utils/TableGen/SubtargetFeatureInfo.cpp
@@ -62,11 +62,24 @@ void SubtargetFeatureInfo::emitSubtargetFeatureFlagEnumeration(
 void SubtargetFeatureInfo::emitNameTable(
     std::map<Record *, SubtargetFeatureInfo, LessRecordByID> &SubtargetFeatures,
     raw_ostream &OS) {
+  // Need to sort the name table so that lookup by the log of the enum value
+  // gives the proper name. More specifically, for a feature of value 1<<n,
+  // SubtargetFeatureNames[n] should be the name of the feature.
+  uint64_t IndexUB = 0;
+  for (const auto &SF : SubtargetFeatures)
+    if (IndexUB <= SF.second.Index)
+      IndexUB = SF.second.Index+1;
+
+  std::vector<std::string> Names;
+  if (IndexUB > 0)
+    Names.resize(IndexUB);
+  for (const auto &SF : SubtargetFeatures)
+    Names[SF.second.Index] = SF.second.getEnumName();
+
   OS << "static const char *SubtargetFeatureNames[] = {\n";
-  for (const auto &SF : SubtargetFeatures) {
-    const SubtargetFeatureInfo &SFI = SF.second;
-    OS << "  \"" << SFI.getEnumName() << "\",\n";
-  }
+  for (uint64_t I = 0; I < IndexUB; ++I)
+    OS << "  \"" << Names[I] << "\",\n";
+
   // A small number of targets have no predicates. Null terminate the array to
   // avoid a zero-length array.
   OS << "  nullptr\n"
diff --git a/utils/TableGen/TableGen.cpp b/utils/TableGen/TableGen.cpp
index c9a818e702a48ee953a473a6abf29ba67aab1b31..00d20f1df6c2740148dc940a8fc40e178c10d443 100644
--- a/utils/TableGen/TableGen.cpp
+++ b/utils/TableGen/TableGen.cpp
@@ -46,6 +46,7 @@ enum ActionType {
   GenAttributes,
   GenSearchableTables,
   GenGlobalISel,
+  GenX86EVEX2VEXTables,
   GenRegisterBank,
 };
 
@@ -96,12 +97,15 @@ namespace {
                                "Generate generic binary-searchable table"),
                     clEnumValN(GenGlobalISel, "gen-global-isel",
                                "Generate GlobalISel selector"),
+                    clEnumValN(GenX86EVEX2VEXTables, "gen-x86-EVEX2VEX-tables",
+                               "Generate X86 EVEX to VEX compress tables"),
                     clEnumValN(GenRegisterBank, "gen-register-bank",
                                "Generate registers bank descriptions")));
 
+  cl::OptionCategory PrintEnumsCat("Options for -print-enums");
   cl::opt<std::string>
   Class("class", cl::desc("Print Enum list for this class"),
-          cl::value_desc("class name"));
+        cl::value_desc("class name"), cl::cat(PrintEnumsCat));
 
 bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
   switch (Action) {
@@ -185,9 +189,13 @@ bool LLVMTableGenMain(raw_ostream &OS, RecordKeeper &Records) {
     break;
   case GenGlobalISel:
     EmitGlobalISel(Records, OS);
+    break;
   case GenRegisterBank:
     EmitRegisterBank(Records, OS);
     break;
+  case GenX86EVEX2VEXTables:
+    EmitX86EVEX2VEXTables(Records, OS);
+    break;
   }
 
   return false;
diff --git a/utils/TableGen/TableGenBackends.h b/utils/TableGen/TableGenBackends.h
index 066d26a08c701848f297f937d67d53e7b663c6af..2512997e27f93a6c73772f928fcc572a2f238eb8 100644
--- a/utils/TableGen/TableGenBackends.h
+++ b/utils/TableGen/TableGenBackends.h
@@ -81,6 +81,7 @@ void EmitCTags(RecordKeeper &RK, raw_ostream &OS);
 void EmitAttributes(RecordKeeper &RK, raw_ostream &OS);
 void EmitSearchableTables(RecordKeeper &RK, raw_ostream &OS);
 void EmitGlobalISel(RecordKeeper &RK, raw_ostream &OS);
+void EmitX86EVEX2VEXTables(RecordKeeper &RK, raw_ostream &OS);
 void EmitRegisterBank(RecordKeeper &RK, raw_ostream &OS);
 
 } // End llvm namespace
diff --git a/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp b/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..07b96b03b01cc627edf2dc942c6e0b077a382586
--- /dev/null
+++ b/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp
@@ -0,0 +1,339 @@
+//===- utils/TableGen/X86EVEX2VEXTablesEmitter.cpp - X86 backend-*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// This tablegen backend is responsible for emitting the X86 backend EVEX2VEX
+/// compression tables.
+///
+//===----------------------------------------------------------------------===//
+
+#include "CodeGenDAGPatterns.h"
+#include "CodeGenTarget.h"
+#include "llvm/TableGen/Error.h"
+#include "llvm/TableGen/TableGenBackend.h"
+
+using namespace llvm;
+
+namespace {
+
+class X86EVEX2VEXTablesEmitter {
+  // One table row: an EVEX instruction paired with its matching VEX one.
+  using Entry =
+      std::pair<const CodeGenInstruction *, const CodeGenInstruction *>;
+
+  // A table row that is added by hand, identified by instruction names.
+  struct ManualEntry {
+    const char *EVEXInstStr;
+    const char *VEXInstStr;
+    bool Is128Bit;
+  };
+
+  CodeGenTarget Target;
+
+  // All EVEX encoded instructions that are neither masked nor broadcasted.
+  std::vector<const CodeGenInstruction *> EVEXInsts;
+  // All VEX encoded instructions, grouped by opcode so that the search for a
+  // match only has to scan instructions sharing the EVEX instruction's opcode.
+  std::map<uint64_t, std::vector<const CodeGenInstruction *>> VEXInsts;
+
+  // The two compress tables that get emitted.
+  std::vector<Entry> EVEX2VEX128;
+  std::vector<Entry> EVEX2VEX256;
+
+public:
+  X86EVEX2VEXTablesEmitter(RecordKeeper &R) : Target(R) {}
+
+  // Emit both X86 EVEX2VEX tables to OS.
+  void run(raw_ostream &OS);
+
+private:
+  // Emit Table as a C++ array of X86EvexToVexCompressTableEntry.
+  void printTable(const std::vector<Entry> &Table, raw_ostream &OS);
+
+  // Return true if the name of Inst begins with an entry of the list below.
+  bool inExceptionList(const CodeGenInstruction *Inst) {
+    // EVEX instructions whose encoding matches a VEX instruction although the
+    // two do not perform the same operation.
+    static constexpr const char *ExceptionList[] = {
+        "VCVTQQ2PD",
+        "VCVTQQ2PS",
+        "VPMAXSQ",
+        "VPMAXUQ",
+        "VPMINSQ",
+        "VPMINUQ",
+        "VPMULLQ",
+        "VPSRAQ",
+        "VDBPSADBW",
+        "VRNDSCALE",
+        "VSCALEFPS"
+    };
+    StringRef Name = Inst->TheDef->getName();
+    for (StringRef Prefix : ExceptionList)
+      if (Name.startswith(Prefix))
+        return true;
+    return false;
+  }
+};
+
+// Emit Table as the C++ array X86EvexToVex<Size>CompressTable, followed by the
+// manually added entries of the same vector size.
+void X86EVEX2VEXTablesEmitter::printTable(const std::vector<Entry> &Table,
+                                          raw_ostream &OS) {
+  // Compare addresses, not contents: equal (e.g. empty) tables stay distinct.
+  const bool Is128 = &Table == &EVEX2VEX128;
+  StringRef Size = Is128 ? "128" : "256";
+
+  OS << "// X86 EVEX encoded instructions that have a VEX " << Size
+     << " encoding\n"
+     << "// (table format: <EVEX opcode, VEX-" << Size << " opcode>).\n"
+     << "static const X86EvexToVexCompressTableEntry X86EvexToVex" << Size
+     << "CompressTable[] = {\n"
+     << "  // EVEX scalar with corresponding VEX.\n";
+
+  // Print all entries added to the table
+  for (const Entry &Pair : Table)
+    OS << "  { X86::" << Pair.first->TheDef->getName()
+       << ", X86::" << Pair.second->TheDef->getName() << " },\n";
+
+  // Some VEX instructions were duplicated to multiple EVEX versions due to the
+  // introduction of mask variants, and thus some of the EVEX versions have
+  // different encoding than the VEX instruction. In order to maximize the
+  // compression we add these entries manually.
+  static constexpr ManualEntry ManuallyAddedEntries[] = {
+      // EVEX-Inst            VEX-Inst           Is128-bit
+      {"VMOVDQU8Z128mr",      "VMOVDQUmr",       true},
+      {"VMOVDQU8Z128rm",      "VMOVDQUrm",       true},
+      {"VMOVDQU8Z128rr",      "VMOVDQUrr",       true},
+      {"VMOVDQU8Z128rr_REV",  "VMOVDQUrr_REV",   true},
+      {"VMOVDQU16Z128mr",     "VMOVDQUmr",       true},
+      {"VMOVDQU16Z128rm",     "VMOVDQUrm",       true},
+      {"VMOVDQU16Z128rr",     "VMOVDQUrr",       true},
+      {"VMOVDQU16Z128rr_REV", "VMOVDQUrr_REV",   true},
+      {"VMOVDQU8Z256mr",      "VMOVDQUYmr",      false},
+      {"VMOVDQU8Z256rm",      "VMOVDQUYrm",      false},
+      {"VMOVDQU8Z256rr",      "VMOVDQUYrr",      false},
+      {"VMOVDQU8Z256rr_REV",  "VMOVDQUYrr_REV",  false},
+      {"VMOVDQU16Z256mr",     "VMOVDQUYmr",      false},
+      {"VMOVDQU16Z256rm",     "VMOVDQUYrm",      false},
+      {"VMOVDQU16Z256rr",     "VMOVDQUYrr",      false},
+      {"VMOVDQU16Z256rr_REV", "VMOVDQUYrr_REV",  false},
+
+      {"VPERMILPDZ128mi",     "VPERMILPDmi",     true},
+      {"VPERMILPDZ128ri",     "VPERMILPDri",     true},
+      {"VPERMILPDZ128rm",     "VPERMILPDrm",     true},
+      {"VPERMILPDZ128rr",     "VPERMILPDrr",     true},
+      {"VPERMILPDZ256mi",     "VPERMILPDYmi",    false},
+      {"VPERMILPDZ256ri",     "VPERMILPDYri",    false},
+      {"VPERMILPDZ256rm",     "VPERMILPDYrm",    false},
+      {"VPERMILPDZ256rr",     "VPERMILPDYrr",    false},
+
+      {"VPBROADCASTQZ128m",   "VPBROADCASTQrm",  true},
+      {"VPBROADCASTQZ128r",   "VPBROADCASTQrr",  true},
+      {"VPBROADCASTQZ256m",   "VPBROADCASTQYrm", false},
+      {"VPBROADCASTQZ256r",   "VPBROADCASTQYrr", false},
+
+      {"VBROADCASTSDZ256m",   "VBROADCASTSDYrm", false},
+      {"VBROADCASTSDZ256r",   "VBROADCASTSDYrr", false},
+
+      {"VEXTRACTF64x2Z256mr", "VEXTRACTF128mr",  false},
+      {"VEXTRACTF64x2Z256rr", "VEXTRACTF128rr",  false},
+      {"VEXTRACTI64x2Z256mr", "VEXTRACTI128mr",  false},
+      {"VEXTRACTI64x2Z256rr", "VEXTRACTI128rr",  false},
+
+      {"VINSERTF64x2Z256rm",  "VINSERTF128rm",   false},
+      {"VINSERTF64x2Z256rr",  "VINSERTF128rr",   false},
+      {"VINSERTI64x2Z256rm",  "VINSERTI128rm",   false},
+      {"VINSERTI64x2Z256rr",  "VINSERTI128rr",   false}
+  };
+
+  // Print the manually added entries that belong to this table's vector size
+  for (const ManualEntry &Manual : ManuallyAddedEntries)
+    if (Manual.Is128Bit == Is128)
+      OS << "  { X86::" << Manual.EVEXInstStr << ", X86::" << Manual.VEXInstStr
+         << " },\n";
+
+  OS << "};\n\n";
+}
+
+// Compare two BitsInits bit by bit. It is a fatal error if their sizes differ
+// or if one of the compared bits is not a BitInit.
+static inline bool equalBitsInits(const BitsInit *B1, const BitsInit *B2) {
+  const unsigned NumBits = B1->getNumBits();
+  if (NumBits != B2->getNumBits())
+    PrintFatalError("Comparing two BitsInits with different sizes!");
+
+  for (unsigned Idx = 0; Idx != NumBits; ++Idx) {
+    BitInit *LHS = dyn_cast<BitInit>(B1->getBit(Idx));
+    BitInit *RHS = LHS ? dyn_cast<BitInit>(B2->getBit(Idx)) : nullptr;
+    if (!LHS || !RHS)
+      PrintFatalError("Invalid BitsInit bit");
+    if (LHS->getValue() != RHS->getValue())
+      return false;
+  }
+  return true;
+}
+
+// Pack the bits of B into an integer: bit i of B becomes bit i of the result.
+// It is a fatal error if one of the bits is not a BitInit.
+static inline uint64_t getValueFromBitsInit(const BitsInit *B) {
+  uint64_t Result = 0;
+  for (unsigned Pos = 0, End = B->getNumBits(); Pos != End; ++Pos) {
+    BitInit *Bit = dyn_cast<BitInit>(B->getBit(Pos));
+    if (!Bit)
+      PrintFatalError("Invalid VectSize bit");
+    Result |= static_cast<uint64_t>(Bit->getValue()) << Pos;
+  }
+  return Result;
+}
+
+// Function object - Operator() returns true if the given VEX instruction
+// matches the EVEX instruction this object was constructed with.
+class IsMatch {
+  const CodeGenInstruction *Inst;
+
+public:
+  IsMatch(const CodeGenInstruction *Inst) : Inst(Inst) {}
+
+  bool operator()(const CodeGenInstruction *Inst2) {
+    Record *Rec1 = Inst->TheDef;
+    Record *Rec2 = Inst2->TheDef;
+    uint64_t Rec1WVEX =
+        getValueFromBitsInit(Rec1->getValueAsBitsInit("VEX_WPrefix"));
+    uint64_t Rec2WVEX =
+        getValueFromBitsInit(Rec2->getValueAsBitsInit("VEX_WPrefix"));
+
+    if (Rec2->getValueAsDef("OpEnc")->getName().str() != "EncVEX" ||
+        // VEX/EVEX fields
+        Rec2->getValueAsDef("OpPrefix") != Rec1->getValueAsDef("OpPrefix") ||
+        Rec2->getValueAsDef("OpMap") != Rec1->getValueAsDef("OpMap") ||
+        Rec2->getValueAsBit("hasVEX_4V") != Rec1->getValueAsBit("hasVEX_4V") ||
+        !equalBitsInits(Rec2->getValueAsBitsInit("EVEX_LL"),
+                        Rec1->getValueAsBitsInit("EVEX_LL")) ||
+        (Rec1WVEX != 2 && Rec2WVEX != 2 && Rec1WVEX != Rec2WVEX) ||
+        // Instruction's format
+        Rec2->getValueAsDef("Form") != Rec1->getValueAsDef("Form") ||
+        Rec2->getValueAsBit("isAsmParserOnly") !=
+            Rec1->getValueAsBit("isAsmParserOnly"))
+      return false;
+
+    // Different operand counts never match; this also keeps Operands[i] valid.
+    if (Inst->Operands.size() != Inst2->Operands.size())
+      return false;
+
+    // Compare the operands pairwise. This is needed for instructions with an
+    // intrinsic version (_Int), where the only difference is the size of the
+    // operands (for example VUCOMISDZrm and Int_VUCOMISDrm), and for
+    // instructions whose EVEX version was upgraded to work with k-registers
+    // (for example VPCMPEQBrm writes an xmm register, VPCMPEQBZ128rm a k one).
+    for (unsigned i = 0; i < Inst->Operands.size(); i++) {
+      Record *OpRec1 = Inst->Operands[i].Rec;
+      Record *OpRec2 = Inst2->Operands[i].Rec;
+      if (OpRec1 == OpRec2)
+        continue;
+      if (isRegisterOperand(OpRec1) && isRegisterOperand(OpRec2)) {
+        if (getRegOperandSize(OpRec1) != getRegOperandSize(OpRec2))
+          return false;
+      } else if (isMemoryOperand(OpRec1) && isMemoryOperand(OpRec2)) {
+        return false;
+      } else if (isImmediateOperand(OpRec1) && isImmediateOperand(OpRec2)) {
+        if (OpRec1->getValueAsDef("Type") != OpRec2->getValueAsDef("Type"))
+          return false;
+      } else
+        return false;
+    }
+    return true;
+  }
+
+private:
+  static inline bool isRegisterOperand(const Record *Rec) {
+    return Rec->isSubClassOf("RegisterClass") ||
+           Rec->isSubClassOf("RegisterOperand");
+  }
+
+  static inline bool isMemoryOperand(const Record *Rec) {
+    return Rec->isSubClassOf("Operand") &&
+           Rec->getValueAsString("OperandType") == "OPERAND_MEMORY";
+  }
+
+  static inline bool isImmediateOperand(const Record *Rec) {
+    return Rec->isSubClassOf("Operand") &&
+           Rec->getValueAsString("OperandType") == "OPERAND_IMMEDIATE";
+  }
+
+  static inline unsigned int getRegOperandSize(const Record *RegRec) {
+    if (RegRec->isSubClassOf("RegisterClass"))
+      return RegRec->getValueAsInt("Alignment");
+    if (RegRec->isSubClassOf("RegisterOperand"))
+      return RegRec->getValueAsDef("RegClass")->getValueAsInt("Alignment");
+
+    llvm_unreachable("Register operand's size not known!");
+  }
+};
+
+// Collect the VEX and the compressible EVEX instructions, pair each EVEX
+// instruction with its first matching VEX instruction, and emit both tables.
+void X86EVEX2VEXTablesEmitter::run(raw_ostream &OS) {
+  emitSourceFileHeader("X86 EVEX2VEX tables", OS);
+
+  for (const CodeGenInstruction *Inst : Target.getInstructionsByEnumValue()) {
+    Record *Def = Inst->TheDef;
+    // Skip everything that is not an X86 instruction.
+    if (!Def->isSubClassOf("X86Inst"))
+      continue;
+
+    StringRef Encoding = Def->getValueAsDef("OpEnc")->getName();
+    if (Encoding == "EncVEX") {
+      // VEX encoded instructions are grouped by their opcode.
+      VEXInsts[getValueFromBitsInit(Def->getValueAsBitsInit("Opcode"))]
+          .push_back(Inst);
+      continue;
+    }
+
+    // Keep an EVEX encoded instruction only if it has no mask, no broadcast,
+    // an EVEX_LL value other than 2 and is not in the exception list.
+    if (Encoding != "EncEVEX" || Def->getValueAsBit("hasEVEX_K") ||
+        Def->getValueAsBit("hasEVEX_B") ||
+        getValueFromBitsInit(Def->getValueAsBitsInit("EVEX_LL")) == 2 ||
+        inExceptionList(Inst))
+      continue;
+    EVEXInsts.push_back(Inst);
+  }
+
+  for (const CodeGenInstruction *EVEXInst : EVEXInsts) {
+    Record *EVEXDef = EVEXInst->TheDef;
+    // The candidates are the VEX instructions that share the opcode.
+    auto &Candidates =
+        VEXInsts[getValueFromBitsInit(EVEXDef->getValueAsBitsInit("Opcode"))];
+    auto Match = llvm::find_if(Candidates, IsMatch(EVEXInst));
+    if (Match == Candidates.end())
+      continue;
+
+    // A match was found: add the pair to the table selected by EVEX_LL.
+    const Entry Pair(EVEXInst, *Match);
+    switch (getValueFromBitsInit(EVEXDef->getValueAsBitsInit("EVEX_LL"))) {
+    case 0:
+      EVEX2VEX128.push_back(Pair); // {0,0}
+      break;
+    case 1:
+      EVEX2VEX256.push_back(Pair); // {0,1}
+      break;
+    default:
+      llvm_unreachable("Instruction's size not fit for the mapping!");
+    }
+  }
+
+  // Print both tables
+  printTable(EVEX2VEX128, OS);
+  printTable(EVEX2VEX256, OS);
+}
+}
+
+namespace llvm {
+void EmitX86EVEX2VEXTables(RecordKeeper &RK, raw_ostream &OS) {
+  X86EVEX2VEXTablesEmitter(RK).run(OS); // Build the emitter for RK, write to OS.
+}
+} // End llvm namespace
diff --git a/utils/TableGen/X86RecognizableInstr.cpp b/utils/TableGen/X86RecognizableInstr.cpp
index b244750a09ea58b1bbad51f5925b333a4c5e6341..e703bbfc4496fb4076ed2518fc3e8c176fe7dfde 100644
--- a/utils/TableGen/X86RecognizableInstr.cpp
+++ b/utils/TableGen/X86RecognizableInstr.cpp
@@ -138,6 +138,10 @@ namespace X86Local {
   enum {
     AdSize16 = 1, AdSize32 = 2, AdSize64 = 3
   };
+
+  enum {
+    VEX_W0 = 0, VEX_W1 = 1, VEX_WIG = 2
+  };
 }
 
 using namespace X86Disassembler;
@@ -203,7 +207,7 @@ RecognizableInstr::RecognizableInstr(DisassemblerTables &tables,
   AdSize           = byteFromRec(Rec, "AdSizeBits");
   HasREX_WPrefix   = Rec->getValueAsBit("hasREX_WPrefix");
   HasVEX_4V        = Rec->getValueAsBit("hasVEX_4V");
-  HasVEX_WPrefix   = Rec->getValueAsBit("hasVEX_WPrefix");
+  VEX_WPrefix      = byteFromRec(Rec,"VEX_WPrefix");
   IgnoresVEX_L     = Rec->getValueAsBit("ignoresVEX_L");
   HasEVEX_L2Prefix = Rec->getValueAsBit("hasEVEX_L2");
   HasEVEX_K        = Rec->getValueAsBit("hasEVEX_K");
@@ -280,7 +284,7 @@ InstructionContext RecognizableInstr::insnContext() const {
       llvm_unreachable("Don't support VEX.L if EVEX_L2 is enabled");
     }
     // VEX_L & VEX_W
-    if (HasVEX_LPrefix && HasVEX_WPrefix) {
+    if (HasVEX_LPrefix && VEX_WPrefix == X86Local::VEX_W1) {
       if (OpPrefix == X86Local::PD)
         insnContext = EVEX_KB(IC_EVEX_L_W_OPSIZE);
       else if (OpPrefix == X86Local::XS)
@@ -308,7 +312,7 @@ InstructionContext RecognizableInstr::insnContext() const {
         llvm_unreachable("Invalid prefix");
       }
     }
-    else if (HasEVEX_L2Prefix && HasVEX_WPrefix) {
+    else if (HasEVEX_L2Prefix && VEX_WPrefix == X86Local::VEX_W1) {
       // EVEX_L2 & VEX_W
       if (OpPrefix == X86Local::PD)
         insnContext = EVEX_KB(IC_EVEX_L2_W_OPSIZE);
@@ -337,7 +341,7 @@ InstructionContext RecognizableInstr::insnContext() const {
         llvm_unreachable("Invalid prefix");
       }
     }
-    else if (HasVEX_WPrefix) {
+    else if (VEX_WPrefix == X86Local::VEX_W1) {
       // VEX_W
       if (OpPrefix == X86Local::PD)
         insnContext = EVEX_KB(IC_EVEX_W_OPSIZE);
@@ -363,7 +367,7 @@ InstructionContext RecognizableInstr::insnContext() const {
       insnContext = EVEX_KB(IC_EVEX);
     /// eof EVEX
   } else if (Encoding == X86Local::VEX || Encoding == X86Local::XOP) {
-    if (HasVEX_LPrefix && HasVEX_WPrefix) {
+    if (HasVEX_LPrefix && VEX_WPrefix == X86Local::VEX_W1) {
       if (OpPrefix == X86Local::PD)
         insnContext = IC_VEX_L_W_OPSIZE;
       else if (OpPrefix == X86Local::XS)
@@ -378,7 +382,7 @@ InstructionContext RecognizableInstr::insnContext() const {
       }
     } else if (OpPrefix == X86Local::PD && HasVEX_LPrefix)
       insnContext = IC_VEX_L_OPSIZE;
-    else if (OpPrefix == X86Local::PD && HasVEX_WPrefix)
+    else if (OpPrefix == X86Local::PD && VEX_WPrefix == X86Local::VEX_W1)
       insnContext = IC_VEX_W_OPSIZE;
     else if (OpPrefix == X86Local::PD)
       insnContext = IC_VEX_OPSIZE;
@@ -386,11 +390,11 @@ InstructionContext RecognizableInstr::insnContext() const {
       insnContext = IC_VEX_L_XS;
     else if (HasVEX_LPrefix && OpPrefix == X86Local::XD)
       insnContext = IC_VEX_L_XD;
-    else if (HasVEX_WPrefix && OpPrefix == X86Local::XS)
+    else if (VEX_WPrefix == X86Local::VEX_W1 && OpPrefix == X86Local::XS)
       insnContext = IC_VEX_W_XS;
-    else if (HasVEX_WPrefix && OpPrefix == X86Local::XD)
+    else if (VEX_WPrefix == X86Local::VEX_W1 && OpPrefix == X86Local::XD)
       insnContext = IC_VEX_W_XD;
-    else if (HasVEX_WPrefix && OpPrefix == X86Local::PS)
+    else if (VEX_WPrefix == X86Local::VEX_W1 && OpPrefix == X86Local::PS)
       insnContext = IC_VEX_W;
     else if (HasVEX_LPrefix && OpPrefix == X86Local::PS)
       insnContext = IC_VEX_L;
diff --git a/utils/TableGen/X86RecognizableInstr.h b/utils/TableGen/X86RecognizableInstr.h
index 2e611587cc316968d311307bce4ccc53bcc64be0..91ed928540c369e731d12069c02139148bd5bf62 100644
--- a/utils/TableGen/X86RecognizableInstr.h
+++ b/utils/TableGen/X86RecognizableInstr.h
@@ -55,8 +55,8 @@ private:
   bool HasREX_WPrefix;
   /// The hasVEX_4V field from the record
   bool HasVEX_4V;
-  /// The hasVEX_WPrefix field from the record
-  bool HasVEX_WPrefix;
+  /// The VEX_WPrefix field from the record
+  uint8_t VEX_WPrefix;
   /// Inferred from the operands; indicates whether the L bit in the VEX prefix is set
   bool HasVEX_LPrefix;
   /// The ignoreVEX_L field from the record
diff --git a/utils/bisect-skip-count b/utils/bisect-skip-count
new file mode 100755
index 0000000000000000000000000000000000000000..b18b4f41481b6dcdcac6b5a522d601863669b759
--- /dev/null
+++ b/utils/bisect-skip-count
@@ -0,0 +1,75 @@
+#!/usr/bin/env python
+# This script is used to bisect skip and count arguments for --debug-counter.
+# It is similar to bisect, except it understands how to increase skip and decrease count
+import os
+import sys
+import argparse
+# This is for timeout support. Use the recommended way of import.
+# We do timeouts because, when doing execution testing, we have a habit
+# of finding variants that loop forever.
+if os.name == 'posix' and sys.version_info[0] < 3:
+  import subprocess32 as subprocess
+else:
+  import subprocess
+parser = argparse.ArgumentParser()
+
+parser.add_argument('--skipstart', type=int, default=0)
+parser.add_argument('--skipend', type=int, default=(1 << 32))
+parser.add_argument('--countstart', type=int, default=0)
+parser.add_argument('--countend', type=int, default=(1 << 32))
+parser.add_argument('--timeout', type=int, default=None)
+# Use shell support if you need to use complex shell expressions in your command.
+# type=bool would turn every non-empty value (even "False") into True.
+parser.add_argument('--shell', default=False,
+                    type=lambda v: v.lower() in ('1', 'true', 'yes', 'on'))
+# Each command word may use %(skip)d and %(count)d, which are substituted below.
+parser.add_argument('command', nargs='+')
+
+args = parser.parse_args()
+
+start, end = args.skipstart, args.skipend
+
+print("Bisect of Skip starting!")
+print("Start: %d" % start)
+print("End: %d" % end)
+
+while start != end and start != end-1:
+    count = start + (end - start)//2
+    print("Visiting Skip: %d with (Start, End) = (%d,%d)" % (count, start, end))
+    cmd = [x % {'skip':count, 'count':-1} for x in args.command]
+    print(cmd)
+    try:
+        result = subprocess.call(cmd, shell=args.shell, timeout=args.timeout)
+        if result == 0:
+            print("    PASSES! Setting end to count")
+            end = count
+        else:
+            print("    FAILS! Setting start to count")
+            start = count
+    except subprocess.TimeoutExpired:
+        print(" TIMEOUT, setting end to count")
+        end = count
+firstcount = start
+print("Last good skip: %d" % start)
+start, end = args.countstart, args.countend
+print("Bisect of Count starting!")
+print("Start: %d" % start)
+print("End: %d" % end)
+while start != end and start != end-1:
+    count = start + (end - start)//2
+    print("Visiting Count: %d with (Start, End) = (%d,%d)" % (count, start, end))
+    cmd = [x % {'count':count, 'skip':firstcount } for x in args.command]
+    print(cmd)
+    try:
+        result = subprocess.call(cmd, shell=args.shell, timeout=args.timeout)
+        if result == 0:
+            print("    PASSES! Setting start to count")
+            start = count
+        else:
+            print("    FAILS! Setting end to count")
+            end = count
+    except subprocess.TimeoutExpired:
+        print(" TIMEOUT, setting start to count")
+        start = count
+
+print("Last good count: %d" % start)
diff --git a/utils/gdb-scripts/prettyprinters.py b/utils/gdb-scripts/prettyprinters.py
index 5385e16246b3f62281b75fd5eac824cef0319155..be21b7083f32da5ce21b2dffd60125dfdb494f64 100644
--- a/utils/gdb-scripts/prettyprinters.py
+++ b/utils/gdb-scripts/prettyprinters.py
@@ -193,6 +193,113 @@ class DenseMapPrinter:
   def display_hint(self):
     return 'map'
 
+class TwinePrinter:
+  "Print an llvm::Twine as the concatenation of the strings of its two children"
+
+  def __init__(self, val):
+    self._val = val
+
+  def display_hint(self):
+    return 'string'
+
+  def string_from_pretty_printer_lookup(self, val):
+    '''Lookup the default pretty-printer for val and use it.
+
+    If no pretty-printer is defined for the type of val, print an error and
+    return a placeholder string.'''
+
+    pp = gdb.default_visualizer(val)
+    if pp:
+      s = pp.to_string()
+
+      # The pretty-printer may return a LazyString instead of an actual Python
+      # string.  Convert it to a Python string.  However, GDB doesn't seem to
+      # register the LazyString type, so we can't check
+      # "type(s) == gdb.LazyString".
+      if 'LazyString' in type(s).__name__:
+        s = s.value().address.string()
+
+    else:
+      print(('No pretty printer for {} found. The resulting Twine ' +
+             'representation will be incomplete.').format(val.type.name))
+      s = '(missing {})'.format(val.type.name)
+
+    return s
+
+  def string_from_child(self, child, kind):
+    '''Return the string for the Twine::Child child; kind is the name of its NodeKind.'''
+
+    if kind in ('llvm::Twine::EmptyKind', 'llvm::Twine::NullKind'):
+      return ''
+
+    if kind == 'llvm::Twine::TwineKind':  # nested Twine: recurse into it
+      return self.string_from_twine_object(child['twine'].dereference())
+
+    if kind == 'llvm::Twine::CStringKind':
+      return child['cString'].string()
+
+    if kind == 'llvm::Twine::StdStringKind':
+      val = child['stdString'].dereference()
+      return self.string_from_pretty_printer_lookup(val)
+
+    if kind == 'llvm::Twine::StringRefKind':
+      val = child['stringRef'].dereference()
+      pp = StringRefPrinter(val)
+      return pp.to_string()
+
+    if kind == 'llvm::Twine::SmallStringKind':
+      val = child['smallString'].dereference()
+      pp = SmallStringPrinter(val)
+      return pp.to_string()
+
+    if kind == 'llvm::Twine::CharKind':
+      return chr(child['character'])
+
+    if kind == 'llvm::Twine::DecUIKind':
+      return str(child['decUI'])
+
+    if kind == 'llvm::Twine::DecIKind':
+      return str(child['decI'])
+
+    if kind == 'llvm::Twine::DecULKind':
+      return str(child['decUL'].dereference())
+
+    if kind == 'llvm::Twine::DecLKind':
+      return str(child['decL'].dereference())
+
+    if kind == 'llvm::Twine::DecULLKind':
+      return str(child['decULL'].dereference())
+
+    if kind == 'llvm::Twine::DecLLKind':
+      return str(child['decLL'].dereference())
+
+    if kind == 'llvm::Twine::UHexKind':
+      val = child['uHex'].dereference()
+      return hex(int(val))
+
+    print(('Unhandled NodeKind {} in Twine pretty-printer. The result will be '
+           'incomplete.').format(kind))
+
+    return '(unhandled {})'.format(kind)
+
+  def string_from_twine_object(self, twine):
+    '''Return the LHS string followed by the RHS string of the Twine object twine.'''
+
+    lhs_str = ''
+    rhs_str = ''
+
+    lhs = twine['LHS']
+    rhs = twine['RHS']
+    lhs_kind = str(twine['LHSKind'])  # compared against names like 'llvm::Twine::CStringKind'
+    rhs_kind = str(twine['RHSKind'])
+
+    lhs_str = self.string_from_child(lhs, lhs_kind)
+    rhs_str = self.string_from_child(rhs, rhs_kind)
+
+    return lhs_str + rhs_str
+
+  def to_string(self):
+    return self.string_from_twine_object(self._val)
 
 pp = gdb.printing.RegexpCollectionPrettyPrinter("LLVMSupport")
 pp.add_printer('llvm::SmallString', '^llvm::SmallString<.*>$', SmallStringPrinter)
@@ -201,4 +308,5 @@ pp.add_printer('llvm::SmallVectorImpl', '^llvm::SmallVector(Impl)?<.*>$', SmallV
 pp.add_printer('llvm::ArrayRef', '^llvm::(Const)?ArrayRef<.*>$', ArrayRefPrinter)
 pp.add_printer('llvm::Optional', '^llvm::Optional<.*>$', OptionalPrinter)
 pp.add_printer('llvm::DenseMap', '^llvm::DenseMap<.*>$', DenseMapPrinter)
+pp.add_printer('llvm::Twine', '^llvm::Twine$', TwinePrinter)
 gdb.printing.register_pretty_printer(gdb.current_objfile(), pp)
diff --git a/utils/lit/lit/ShCommands.py b/utils/lit/lit/ShCommands.py
index 9ca9e8c91c0d404174c8aae1c468defe1691d352..01e91c55da9896621fba9e1d655a471e6c5a9c65 100644
--- a/utils/lit/lit/ShCommands.py
+++ b/utils/lit/lit/ShCommands.py
@@ -35,6 +35,29 @@ class Command:
             else:
                 file.write("%s%s '%s'" % (r[0][1], r[0][0], r[1]))
 
+class GlobItem:
+    """Wraps a glob pattern string; resolve() expands it to matching paths."""
+    def __init__(self, pattern):
+        self.pattern = pattern
+
+    def __repr__(self):
+        return self.pattern
+
+    def __eq__(self, other):
+        # Equal only to another GlobItem that holds the same pattern.
+        return isinstance(other, GlobItem) and self.pattern == other.pattern
+
+    def resolve(self, cwd):
+        """Return the list of paths matching the pattern (taken relative to cwd
+        unless it is absolute), or [pattern] itself when nothing matches."""
+        import glob
+        import os
+        abspath = self.pattern
+        if not os.path.isabs(abspath):
+            abspath = os.path.join(cwd, abspath)
+        results = glob.glob(abspath)
+        return [self.pattern] if len(results) == 0 else results
+
 class Pipeline:
     def __init__(self, commands, negate=False, pipe_err=False):
         self.commands = commands
diff --git a/utils/lit/lit/ShUtil.py b/utils/lit/lit/ShUtil.py
index 0b3e0f58c97731d671c67b3e1069fc0cf8071d91..00ec8ab004936ef773111bfa7f65b62694c88fda 100644
--- a/utils/lit/lit/ShUtil.py
+++ b/utils/lit/lit/ShUtil.py
@@ -2,7 +2,7 @@ from __future__ import absolute_import
 import itertools
 
 import lit.util
-from lit.ShCommands import Command, Pipeline, Seq
+from lit.ShCommands import Command, GlobItem, Pipeline, Seq
 
 class ShLexer:
     def __init__(self, data, win32Escapes = False):
@@ -40,13 +40,15 @@ class ShLexer:
             return None
         
         self.pos = self.pos - 1 + len(chunk)
-        return chunk
+        return GlobItem(chunk) if '*' in chunk or '?' in chunk else chunk
         
     def lex_arg_slow(self, c):
         if c in "'\"":
             str = self.lex_arg_quoted(c)
         else:
             str = c
+        unquoted_glob_char = False
+        quoted_glob_char = False
         while self.pos != self.end:
             c = self.look()
             if c.isspace() or c in "|&;":
@@ -65,12 +67,12 @@ class ShLexer:
                 tok = self.lex_one_token()
                 assert isinstance(tok, tuple) and len(tok) == 1
                 return (tok[0], num)                    
-            elif c == '"':
+            elif c == '"' or c == "'":
                 self.eat()
-                str += self.lex_arg_quoted('"')
-            elif c == "'":
-                self.eat()
-                str += self.lex_arg_quoted("'")
+                quoted_arg = self.lex_arg_quoted(c)
+                if '*' in quoted_arg or '?' in quoted_arg:
+                    quoted_glob_char = True
+                str += quoted_arg
             elif not self.win32Escapes and c == '\\':
                 # Outside of a string, '\\' escapes everything.
                 self.eat()
@@ -79,9 +81,25 @@ class ShLexer:
                         "escape at end of quoted argument in: %r" % self.data)
                     return str
                 str += self.eat()
+            elif c in '*?':
+                unquoted_glob_char = True
+                str += self.eat()
             else:
                 str += self.eat()
-        return str
+        # If a quote character is present, lex_arg_quoted will remove the quotes
+        # and append the argument directly.  This causes a problem when the
+        # quoted portion contains a glob character, as the character will no
+        # longer be treated literally.  If glob characters occur *only* inside
+        # of quotes, then we can handle this by not globbing at all, and if
+        # glob characters occur *only* outside of quotes, we can still glob just
+        # fine.  But if a glob character occurs both inside and outside of
+        # quotes this presents a problem.  In practice this is such an obscure
+        # edge case that it doesn't seem worth the added complexity to support.
+        # By adding an assertion, it means some bot somewhere will catch this
+        # and flag the user of a non-portable test (which could almost certainly
+        # be re-written to work correctly without triggering this).
+        assert not (quoted_glob_char and unquoted_glob_char)
+        return GlobItem(str) if unquoted_glob_char else str
 
     def lex_arg_quoted(self, delim):
         str = ''
@@ -202,7 +220,7 @@ class ShParser:
                 break
 
             # If this is an argument, just add it to the current command.
-            if isinstance(tok, str):
+            if isinstance(tok, (str, GlobItem)):
                 args.append(self.lex())
                 continue
 
diff --git a/utils/lit/lit/TestRunner.py b/utils/lit/lit/TestRunner.py
index 9f9ff199f9ae9fb1478003eb0278d2c9b38c98cd..3fb9def26ee88fc4755f7e74dca1d541f6dd6ed1 100644
--- a/utils/lit/lit/TestRunner.py
+++ b/utils/lit/lit/TestRunner.py
@@ -5,6 +5,7 @@ import platform
 import tempfile
 import threading
 
+from lit.ShCommands import GlobItem
 import lit.ShUtil as ShUtil
 import lit.Test as Test
 import lit.util
@@ -141,6 +142,17 @@ def executeShCmd(cmd, shenv, results, timeout=0):
 
     return (finalExitCode, timeoutInfo)
 
+def expand_glob(arg, cwd):
+    """Return arg as a list: its expansion under cwd for a GlobItem, else [arg]."""
+    is_glob = isinstance(arg, GlobItem)
+    return arg.resolve(cwd) if is_glob else [arg]
+
+def expand_glob_expressions(args, cwd):
+    """Keep args[0] as it is and replace every later argument by its expansion."""
+    head, tail = args[0], args[1:]
+    expansions = [item for arg in tail for item in expand_glob(arg, cwd)]
+    return [head] + expansions
+
 def quote_windows_command(seq):
     """
     Reimplement Python's private subprocess.list2cmdline for MSys compatibility
@@ -197,6 +209,18 @@ def quote_windows_command(seq):
 
     return ''.join(result)
 
+# cmd is export or env: store its leading KEY=VALUE arguments in env.env.
+def updateEnv(env, cmd):
+    arg_idx = 1  # Only used if the loop below does not run (no arguments).
+    for arg_idx, arg in enumerate(cmd.args[1:]):
+        # Partition the string into KEY=VALUE.
+        key, eq, val = arg.partition('=')
+        # Stop at the first argument without an equals sign.
+        if eq == '':
+            break
+        env.env[key] = val
+    cmd.args = cmd.args[arg_idx+1:]  # NOTE(review): if every argument is an assignment, the last one stays in cmd.args.
+
 def _executeShCmd(cmd, shenv, results, timeoutHelper):
     if timeoutHelper.timeoutReached():
         # Prevent further recursion if the timeout has been hit
@@ -240,11 +264,19 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper):
         if os.path.isabs(newdir):
             shenv.cwd = newdir
         else:
-            shenv.cwd = os.path.join(shenv.cwd, newdir)
+            shenv.cwd = os.path.realpath(os.path.join(shenv.cwd, newdir))
         # The cd builtin always succeeds. If the directory does not exist, the
         # following Popen calls will fail instead.
         return 0
 
+    if cmd.commands[0].args[0] == 'export':
+        if len(cmd.commands) != 1:
+            raise ValueError("'export' cannot be part of a pipeline")
+        if len(cmd.commands[0].args) != 2:
+            raise ValueError("'export' supports only one argument")
+        updateEnv(shenv, cmd.commands[0])
+        return 0
+
     procs = []
     input = subprocess.PIPE
     stderrTempFiles = []
@@ -261,15 +293,7 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper):
             # command. There might be multiple envs in a pipeline:
             #   env FOO=1 llc < %s | env BAR=2 llvm-mc | FileCheck %s
             cmd_shenv = ShellEnvironment(shenv.cwd, shenv.env)
-            arg_idx = 1
-            for arg_idx, arg in enumerate(j.args[1:]):
-                # Partition the string into KEY=VALUE.
-                key, eq, val = arg.partition('=')
-                # Stop if there was no equals.
-                if eq == '':
-                    break
-                cmd_shenv.env[key] = val
-            j.args = j.args[arg_idx+1:]
+            updateEnv(cmd_shenv, j)
 
         # Apply the redirections, we use (N,) as a sentinel to indicate stdin,
         # stdout, stderr for N equal to 0, 1, or 2 respectively. Redirects to or
@@ -313,15 +337,19 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper):
             else:
                 if r[2] is None:
                     redir_filename = None
-                    if kAvoidDevNull and r[0] == '/dev/null':
+                    name = expand_glob(r[0], cmd_shenv.cwd)
+                    if len(name) != 1:
+                       raise InternalShellError(j,"Unsupported: glob in redirect expanded to multiple files")
+                    name = name[0]
+                    if kAvoidDevNull and name == '/dev/null':
                         r[2] = tempfile.TemporaryFile(mode=r[1])
-                    elif kIsWindows and r[0] == '/dev/tty':
+                    elif kIsWindows and name == '/dev/tty':
                         # Simulate /dev/tty on Windows.
                         # "CON" is a special filename for the console.
                         r[2] = open("CON", r[1])
                     else:
                         # Make sure relative paths are relative to the cwd.
-                        redir_filename = os.path.join(cmd_shenv.cwd, r[0])
+                        redir_filename = os.path.join(cmd_shenv.cwd, name)
                         r[2] = open(redir_filename, r[1])
                     # Workaround a Win32 and/or subprocess bug when appending.
                     #
@@ -372,6 +400,9 @@ def _executeShCmd(cmd, shenv, results, timeoutHelper):
                     named_temp_files.append(f.name)
                     args[i] = f.name
 
+        # Expand all glob expressions
+        args = expand_glob_expressions(args, cmd_shenv.cwd)
+
         # On Windows, do our own command line quoting for better compatibility
         # with some core utility distributions.
         if kIsWindows:
@@ -687,11 +718,14 @@ def getDefaultSubstitutions(test, tmpDir, tmpBase, normalize_slashes=False):
     substitutions = []
     substitutions.extend([('%%', '#_MARKER_#')])
     substitutions.extend(test.config.substitutions)
+    tmpName = tmpBase + '.tmp'
+    baseName = os.path.basename(tmpBase)
     substitutions.extend([('%s', sourcepath),
                           ('%S', sourcedir),
                           ('%p', sourcedir),
                           ('%{pathsep}', os.pathsep),
-                          ('%t', tmpBase + '.tmp'),
+                          ('%t', tmpName),
+                          ('%basename_t', baseName),
                           ('%T', tmpDir),
                           ('#_MARKER_#', '%')])
 
diff --git a/utils/lit/lit/main.py b/utils/lit/lit/main.py
index 95032c6931acd777add7180354dd4d4562106c4c..689a2d55bcea71efa7315d96141a612eedc9148e 100755
--- a/utils/lit/lit/main.py
+++ b/utils/lit/lit/main.py
@@ -278,12 +278,15 @@ def main_with_tmp(builtinParameters):
     debug_group.add_argument("--show-tests", dest="showTests",
                       help="Show all discovered tests",
                       action="store_true", default=False)
-    debug_group.add_argument("--use-processes", dest="useProcesses",
+    debug_group.add_argument("--use-process-pool", dest="executionStrategy",
+                      help="Run tests in parallel with a process pool",
+                      action="store_const", const="PROCESS_POOL")
+    debug_group.add_argument("--use-processes", dest="executionStrategy",
                       help="Run tests in parallel with processes (not threads)",
-                      action="store_true", default=True)
-    debug_group.add_argument("--use-threads", dest="useProcesses",
+                      action="store_const", const="PROCESSES")
+    debug_group.add_argument("--use-threads", dest="executionStrategy",
                       help="Run tests in parallel with threads (not processes)",
-                      action="store_false", default=True)
+                      action="store_const", const="THREADS")
 
     opts = parser.parse_args()
     args = opts.test_paths
@@ -298,6 +301,9 @@ def main_with_tmp(builtinParameters):
     if opts.numThreads is None:
         opts.numThreads = lit.util.detectCPUs()
 
+    if opts.executionStrategy is None:
+        opts.executionStrategy = 'PROCESS_POOL'
+
     if opts.maxFailures == 0:
         parser.error("Setting --max-failures to 0 does not have any effect.")
 
@@ -481,7 +487,7 @@ def main_with_tmp(builtinParameters):
     display = TestingProgressDisplay(opts, len(run.tests), progressBar)
     try:
         run.execute_tests(display, opts.numThreads, opts.maxTime,
-                          opts.useProcesses)
+                          opts.executionStrategy)
     except KeyboardInterrupt:
         sys.exit(2)
     display.finish()
diff --git a/utils/lit/lit/run.py b/utils/lit/lit/run.py
index 2be8a1133b9428a4f8674077a9ec591cce169e2f..14d8ec98490e8db7b5859592cadb4ef4de734e04 100644
--- a/utils/lit/lit/run.py
+++ b/utils/lit/lit/run.py
@@ -1,4 +1,5 @@
 import os
+import sys
 import threading
 import time
 import traceback
@@ -84,11 +85,13 @@ class Tester(object):
     def run_test(self, test_index):
         test = self.run_instance.tests[test_index]
         try:
-            self.run_instance.execute_test(test)
+            execute_test(test, self.run_instance.lit_config,
+                         self.run_instance.parallelism_semaphores)
         except KeyboardInterrupt:
             # This is a sad hack. Unfortunately subprocess goes
             # bonkers with ctrl-c and we start forking merrily.
             print('\nCtrl-C detected, goodbye.')
+            sys.stdout.flush()
             os.kill(0,9)
         self.consumer.update(test_index, test)
 
@@ -167,6 +170,44 @@ class _Display(object):
 def handleFailures(provider, consumer, maxFailures):
     consumer.display = _Display(consumer.display, provider, maxFailures)
 
+def execute_test(test, lit_config, parallelism_semaphores):
+    """Run one test via its config's test_format and store the Result on it."""
+    pg = test.config.parallelism_group
+    if callable(pg):
+        pg = pg(test)  # the group may be computed per test
+
+    result = None
+    semaphore = None
+    try:
+        if pg:
+            semaphore = parallelism_semaphores[pg]
+        if semaphore:
+            semaphore.acquire()  # released in the finally clause below
+        start_time = time.time()
+        result = test.config.test_format.execute(test, lit_config)
+        # Support deprecated result from execute() which returned the result
+        # code and additional output as a tuple.
+        if isinstance(result, tuple):
+            code, output = result
+            result = lit.Test.Result(code, output)
+        elif not isinstance(result, lit.Test.Result):
+            raise ValueError("unexpected result from test execution")
+        result.elapsed = time.time() - start_time
+    except KeyboardInterrupt:
+        raise  # never converted into a test result
+    except:
+        if lit_config.debug:
+            raise  # debug mode: let the original exception propagate
+        output = 'Exception during script execution:\n'  # reported as UNRESOLVED
+        output += traceback.format_exc()
+        output += '\n'
+        result = lit.Test.Result(lit.Test.UNRESOLVED, output)
+    finally:
+        if semaphore:
+            semaphore.release()
+
+    test.setResult(result)
+
 class Run(object):
     """
     This class represents a concrete, configured testing run.
@@ -177,42 +218,10 @@ class Run(object):
         self.tests = tests
 
     def execute_test(self, test):
-        pg = test.config.parallelism_group
-        if callable(pg): pg = pg(test)
-
-        result = None
-        semaphore = None
-        try:
-            if pg: semaphore = self.parallelism_semaphores[pg]
-            if semaphore: semaphore.acquire()
-            start_time = time.time()
-            result = test.config.test_format.execute(test, self.lit_config)
-
-            # Support deprecated result from execute() which returned the result
-            # code and additional output as a tuple.
-            if isinstance(result, tuple):
-                code, output = result
-                result = lit.Test.Result(code, output)
-            elif not isinstance(result, lit.Test.Result):
-                raise ValueError("unexpected result from test execution")
-
-            result.elapsed = time.time() - start_time
-        except KeyboardInterrupt:
-            raise
-        except:
-            if self.lit_config.debug:
-                raise
-            output = 'Exception during script execution:\n'
-            output += traceback.format_exc()
-            output += '\n'
-            result = lit.Test.Result(lit.Test.UNRESOLVED, output)
-        finally:
-            if semaphore: semaphore.release()
-
-        test.setResult(result)
+        return execute_test(test, self.lit_config, self.parallelism_semaphores)
 
     def execute_tests(self, display, jobs, max_time=None,
-                      use_processes=False):
+                      execution_strategy=None):
         """
         execute_tests(display, jobs, [max_time])
 
@@ -234,6 +243,14 @@ class Run(object):
         be given an UNRESOLVED result.
         """
 
+        if execution_strategy == 'PROCESS_POOL':
+            self.execute_tests_with_mp_pool(display, jobs, max_time)
+            return
+        # FIXME: Standardize on the PROCESS_POOL execution strategy and remove
+        # the other two strategies.
+
+        use_processes = execution_strategy == 'PROCESSES'
+
         # Choose the appropriate parallel execution implementation.
         consumer = None
         if jobs != 1 and use_processes and multiprocessing:
@@ -263,8 +280,8 @@ class Run(object):
         provider = TestProvider(queue_impl, canceled_flag)
         handleFailures(provider, consumer, self.lit_config.maxFailures)
 
-        # Queue the tests outside the main thread because we can't guarantee
-        # that we can put() all the tests without blocking:
+        # Putting tasks into the threading or multiprocessing Queue may block,
+        # so do it in a separate thread.
         # https://docs.python.org/2/library/multiprocessing.html
         # e.g: On Mac OS X, we will hang if we put 2^15 elements in the queue
         # without taking any out.
@@ -317,3 +334,140 @@ class Run(object):
         # Wait for all the tasks to complete.
         for t in tasks:
             t.join()
+
+    def execute_tests_with_mp_pool(self, display, jobs, max_time=None):
+        """Run self.tests on a multiprocessing.Pool of `jobs` workers."""
+        if not self.tests or jobs == 0:  # nothing to run
+            return
+
+        # Set up semaphores to limit parallelism of certain classes of tests.
+        # For example, some ASan tests require lots of virtual memory and run
+        # faster with less parallelism on OS X.
+        self.parallelism_semaphores = \
+                {k: multiprocessing.Semaphore(v) for k, v in
+                 self.lit_config.parallelism_groups.items()}
+
+        # Install a console-control signal handler on Windows.
+        if win32api is not None:
+            def console_ctrl_handler(type):
+                print('\nCtrl-C detected, terminating.')
+                pool.terminate()
+                pool.join()
+                os.kill(0,9)
+                return True
+            win32api.SetConsoleCtrlHandler(console_ctrl_handler, True)
+
+        # Kept on the runner so the task completion callback can update it.
+        self.display = display
+
+        # We need to issue many wait calls, so compute the final deadline and
+        # subtract time.time() from that as we go along.
+        deadline = None
+        if max_time:
+            deadline = time.time() + max_time
+
+        # Start a process pool. Copy over the data shared between all test runs.
+        pool = multiprocessing.Pool(jobs, worker_initializer,
+                                    (self.lit_config,
+                                     self.parallelism_semaphores))
+
+        try:
+            self.failure_count = 0
+            self.hit_max_failures = False
+            async_results = [pool.apply_async(worker_run_one_test,
+                                              args=(test_index, test),
+                                              callback=self.consume_test_result)
+                             for test_index, test in enumerate(self.tests)]
+
+            # Wait for all results to come in. The callback that runs in the
+            # parent process will update the display.
+            for a in async_results:
+                if deadline:
+                    a.wait(deadline - time.time())
+                    if not a.ready():
+                        break  # max_time expired; leftovers go UNRESOLVED below
+                else:
+                    # Condition variables are only interruptible when given a
+                    # timeout, so poll to stay responsive to KeyboardInterrupt.
+                    while not a.ready():
+                        a.wait(1)
+                if not a.successful():
+                    a.get() # Exceptions raised here come from the worker.
+                if self.hit_max_failures:
+                    break
+        finally:
+            # Stop the workers and wait for any straggling results to come in
+            # if we exited without waiting on every async result.
+            pool.terminate()
+            pool.join()
+
+        # Mark any tests that weren't run as UNRESOLVED.
+        for test in self.tests:
+            if test.result is None:
+                test.setResult(lit.Test.Result(lit.Test.UNRESOLVED, '', 0.0))
+
+    def consume_test_result(self, pool_result):
+        """Test completion callback for worker_run_one_test
+
+        Updates the parent process copy of a finished test. pool_result is the
+        (test_index, test) pair returned by the worker; the index selects the
+        slot in self.tests that is replaced by the returned test object. Also
+        updates the display as tasks complete.
+        """
+        # Don't add any more test results after we've hit the maximum failure
+        # count.  Otherwise we're racing with the main thread, which is going
+        # to terminate the process pool soon.
+        if self.hit_max_failures:
+            return
+
+        (test_index, test_with_result) = pool_result
+        # Update the parent process copy of the test. This includes the result,
+        # XFAILS, REQUIRES, and UNSUPPORTED statuses.
+        assert self.tests[test_index].file_path == test_with_result.file_path, \
+                "parent and child disagree on test path"
+        self.tests[test_index] = test_with_result
+        self.display.update(test_with_result)
+
+        # Count FAIL results; once maxFailures (if set) is reached, raise the
+        # flag that the main thread checks to stop waiting on further results.
+        self.failure_count += (test_with_result.result.code == lit.Test.FAIL)
+        if self.lit_config.maxFailures and \
+                self.failure_count == self.lit_config.maxFailures:
+            self.hit_max_failures = True
+
+child_lit_config = None
+child_parallelism_semaphores = None
+
+def worker_initializer(lit_config, parallelism_semaphores):
+    """Pool initializer: stash the shared config and semaphores in the
+    child_* module globals so individual tasks need not carry them."""
+    global child_lit_config, child_parallelism_semaphores
+    child_lit_config = lit_config
+    child_parallelism_semaphores = parallelism_semaphores
+
+def worker_run_one_test(test_index, test):
+    """Run one test in a multiprocessing.Pool
+
+    Side effects in this function and functions it calls are not visible in the
+    main lit process.
+
+    Arguments and results of this function are pickled, so they should be cheap
+    to copy. For efficiency, we copy all data needed to execute all tests into
+    each worker and store it in the child_* global variables. This reduces the
+    cost of each task.
+
+    Returns (test_index, test): the index and the executed test object, which
+    the parent process uses to update its copy of the test and the display.
+    """
+    try:
+        execute_test(test, child_lit_config, child_parallelism_semaphores)
+        return (test_index, test)
+    except KeyboardInterrupt as e:
+        # This is a sad hack. Unfortunately subprocess goes
+        # bonkers with ctrl-c and we start forking merrily.
+        print('\nCtrl-C detected, goodbye.')
+        traceback.print_exc()
+        sys.stdout.flush()
+        os.kill(0,9)
+    except:
+        traceback.print_exc()  # falls through: this task's result is None
diff --git a/utils/lit/tests/Inputs/test-data/dummy_format.py b/utils/lit/tests/Inputs/test-data/dummy_format.py
new file mode 100644
index 0000000000000000000000000000000000000000..93e48eeb83960a2e48478f8ac6e400fd5a75515d
--- /dev/null
+++ b/utils/lit/tests/Inputs/test-data/dummy_format.py
@@ -0,0 +1,38 @@
+import os
+try:
+    import ConfigParser
+except ImportError:
+    import configparser as ConfigParser
+
+import lit.formats
+import lit.Test
+
+class DummyFormat(lit.formats.FileBasedTest):
+    def execute(self, test, lit_config):
+        # In this dummy format, expect that each test file is actually just a
+        # .ini format dump of the results to report.
+
+        source_path = test.getSourcePath()
+
+        cfg = ConfigParser.ConfigParser()
+        cfg.read(source_path)
+
+        # Create the basic test result from the [global] section; result_code
+        # names an attribute of lit.Test.
+        result_code = cfg.get('global', 'result_code')
+        result_output = cfg.get('global', 'result_output')
+        result = lit.Test.Result(getattr(lit.Test, result_code),
+                                 result_output)
+
+        # Load additional metrics from [results]; each value must eval to int or float.
+        for key,value_str in cfg.items('results'):
+            value = eval(value_str)  # NOTE(review): eval of file content; trusted test inputs only
+            if isinstance(value, int):
+                metric = lit.Test.IntMetricValue(value)
+            elif isinstance(value, float):
+                metric = lit.Test.RealMetricValue(value)
+            else:
+                raise RuntimeError("unsupported result type")
+            result.addMetric(key, metric)
+
+        return result
+
diff --git a/utils/lit/tests/Inputs/test-data/lit.cfg b/utils/lit/tests/Inputs/test-data/lit.cfg
index f5aba7b217748838d9ceebde9b35f721882bfac5..0191cc21888436168e473c890e22bf0b5eb96a2f 100644
--- a/utils/lit/tests/Inputs/test-data/lit.cfg
+++ b/utils/lit/tests/Inputs/test-data/lit.cfg
@@ -1,44 +1,10 @@
-import os
-try:
-    import ConfigParser
-except ImportError:
-    import configparser as ConfigParser
-
-import lit.formats
-import lit.Test
-
-class DummyFormat(lit.formats.FileBasedTest):
-    def execute(self, test, lit_config):
-        # In this dummy format, expect that each test file is actually just a
-        # .ini format dump of the results to report.
-
-        source_path = test.getSourcePath()
-
-        cfg = ConfigParser.ConfigParser()
-        cfg.read(source_path)
-
-        # Create the basic test result.
-        result_code = cfg.get('global', 'result_code')
-        result_output = cfg.get('global', 'result_output')
-        result = lit.Test.Result(getattr(lit.Test, result_code),
-                                 result_output)
-
-        # Load additional metrics.
-        for key,value_str in cfg.items('results'):
-            value = eval(value_str)
-            if isinstance(value, int):
-                metric = lit.Test.IntMetricValue(value)
-            elif isinstance(value, float):
-                metric = lit.Test.RealMetricValue(value)
-            else:
-                raise RuntimeError("unsupported result type")
-            result.addMetric(key, metric)
-
-        return result
+import site
+site.addsitedir(os.path.dirname(__file__))
+import dummy_format
 
 config.name = 'test-data'
 config.suffixes = ['.ini']
-config.test_format = DummyFormat()
+config.test_format = dummy_format.DummyFormat()
 config.test_source_root = None
 config.test_exec_root = None
 config.target_triple = None
diff --git a/utils/lit/tests/unit/TestRunner.py b/utils/lit/tests/unit/TestRunner.py
index ed0affa28321fd2a0db11f4b9a14f324b81608aa..79cc10f7e14d6e46489a91a0ae70ee95e00d8bd6 100644
--- a/utils/lit/tests/unit/TestRunner.py
+++ b/utils/lit/tests/unit/TestRunner.py
@@ -89,7 +89,7 @@ class TestIntegratedTestKeywordParser(unittest.TestCase):
         parsers = self.make_parsers()
         self.parse_test(parsers)
         list_parser = self.get_parser(parsers, 'MY_LIST:')
-        self.assertItemsEqual(list_parser.getValue(),
+        self.assertEqual(list_parser.getValue(),
                               ['one', 'two', 'three', 'four'])
 
     def test_commands(self):
@@ -106,7 +106,7 @@ class TestIntegratedTestKeywordParser(unittest.TestCase):
         self.parse_test(parsers)
         custom_parser = self.get_parser(parsers, 'MY_CUSTOM:')
         value = custom_parser.getValue()
-        self.assertItemsEqual(value, ['a', 'b', 'c'])
+        self.assertEqual(value, ['a', 'b', 'c'])
 
     def test_bad_keywords(self):
         def custom_parse(line_number, line, output):
diff --git a/utils/opt-viewer/opt-diff.py b/utils/opt-viewer/opt-diff.py
new file mode 100755
index 0000000000000000000000000000000000000000..8c377860653e0646fd9cfcb15eb170587a3e3936
--- /dev/null
+++ b/utils/opt-viewer/opt-diff.py
@@ -0,0 +1,70 @@
+#!/usr/bin/env python2.7
+
+from __future__ import print_function
+
+desc = '''Generate the difference of two YAML files into a new YAML file (works on
+pair of directories too).  A new attribute 'Added' is set to True or False
+depending whether the entry is added or removed from the first input to the
+next.
+
+The tools requires PyYAML.'''
+
+import yaml
+# Try to use the C parser.
+try:
+    from yaml import CLoader as Loader
+except ImportError:
+    from yaml import Loader
+
+import optrecord
+import argparse
+from collections import defaultdict
+from multiprocessing import cpu_count, Pool
+import os, os.path
+import fnmatch
+
+def find_files(dir_or_file):
+    """Return [dir_or_file] if it is a file, else every *.opt.yaml below it."""
+    if os.path.isfile(dir_or_file):
+        return [dir_or_file]
+
+    # Recursive search; results keep os.walk's traversal order.
+    return [os.path.join(root, name)
+            for root, _subdirs, names in os.walk(dir_or_file)
+            for name in names
+            if fnmatch.fnmatch(name, "*.opt.yaml")]
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description=desc)
+    parser.add_argument('yaml_dir_or_file_1')
+    parser.add_argument('yaml_dir_or_file_2')
+    parser.add_argument(
+        '--jobs',
+        '-j',
+        default=cpu_count(),
+        type=int,
+        help='Max job count (defaults to current CPU count)')
+    parser.add_argument('--output', '-o', default='diff.opt.yaml')
+    args = parser.parse_args()
+    # With a single job use the builtin map; otherwise fan out over a Pool.
+    if args.jobs == 1:
+        pmap = map
+    else:
+        pool = Pool(processes=args.jobs)
+        pmap = pool.map
+
+    files1 = find_files(args.yaml_dir_or_file_1)
+    files2 = find_files(args.yaml_dir_or_file_2)
+
+    all_remarks1, _, _ = optrecord.gather_results(pmap, files1)
+    all_remarks2, _, _ = optrecord.gather_results(pmap, files2)
+    # Set difference in both directions: only-in-second and only-in-first.
+    added = set(all_remarks2.values()) - set(all_remarks1.values())
+    removed = set(all_remarks1.values()) - set(all_remarks2.values())
+    # Tag each remark with the direction of the change before dumping it.
+    for r in added:
+        r.Added = True
+    for r in removed:
+        r.Added = False
+    with open(args.output, 'w') as stream:  # closed (and flushed) on exit
+        yaml.dump_all(added | removed, stream)
diff --git a/utils/opt-viewer/opt-stats.py b/utils/opt-viewer/opt-stats.py
new file mode 100755
index 0000000000000000000000000000000000000000..b22a052a737a1f951dbd776f3e5107a5f7d77466
--- /dev/null
+++ b/utils/opt-viewer/opt-stats.py
@@ -0,0 +1,56 @@
+#!/usr/bin/env python2.7
+
+from __future__ import print_function
+
+desc = '''Generate statistics about optimization records from the YAML files
+generated with -fsave-optimization-record and -fdiagnostics-show-hotness.
+
+The tools requires PyYAML and Pygments Python packages.'''
+
+import optrecord
+import argparse
+import operator
+from collections import defaultdict
+from multiprocessing import cpu_count, Pool
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description=desc)
+    parser.add_argument('yaml_files', nargs='+')
+    parser.add_argument(
+        '--jobs',
+        '-j',
+        default=cpu_count(),
+        type=int,
+        help='Max job count (defaults to current CPU count)')
+    args = parser.parse_args()
+
+    if len(args.yaml_files) == 0:
+        parser.print_help()
+        raise SystemExit(1)  # 'sys' is not imported by this script
+    # With a single job use the builtin map; otherwise fan out over a Pool.
+    if args.jobs == 1:
+        pmap = map
+    else:
+        pool = Pool(processes=args.jobs)
+        pmap = pool.map
+
+    all_remarks, file_remarks, _ = optrecord.gather_results(pmap, args.yaml_files)
+    # Tally remarks per pass and per "pass/name" pair.
+    bypass = defaultdict(int)
+    byname = defaultdict(int)
+    for r in all_remarks.values():
+        bypass[r.Pass] += 1
+        byname[r.Pass + "/" + r.Name] += 1
+
+    total = len(all_remarks)
+    print("{:24s} {:10d}\n".format("Total number of remarks", total))
+
+    print("Top 10 remarks by pass:")
+    for (passname, count) in sorted(bypass.items(), key=operator.itemgetter(1),
+                                    reverse=True)[:10]:
+        print("  {:30s} {:2.0f}%". format(passname, count * 100. / total))
+
+    print("\nTop 10 remarks:")
+    for (name, count) in sorted(byname.items(), key=operator.itemgetter(1),
+                                reverse=True)[:10]:
+        print("  {:30s} {:2.0f}%". format(name, count * 100. / total))
diff --git a/utils/opt-viewer/opt-viewer.py b/utils/opt-viewer/opt-viewer.py
index 570fd22047f6e9aa348c9950e8d3b5267d630fda..a14aee5f298df2dd0d48b0c94458fc0b69379bce 100755
--- a/utils/opt-viewer/opt-viewer.py
+++ b/utils/opt-viewer/opt-viewer.py
@@ -5,162 +5,30 @@ from __future__ import print_function
 desc = '''Generate HTML output to visualize optimization records from the YAML files
 generated with -fsave-optimization-record and -fdiagnostics-show-hotness.
 
-The tools requires PyYAML and Pygments Python packages.
-
-For faster parsing, you may want to use libYAML with PyYAML.'''
-
-import yaml
-# Try to use the C parser.
-try:
-    from yaml import CLoader as Loader
-except ImportError:
-    from yaml import Loader
+The tools requires PyYAML and Pygments Python packages.'''
 
+import optrecord
 import functools
-from collections import defaultdict
-import itertools
 from multiprocessing import Pool
 from multiprocessing import Lock, cpu_count
 import errno
 import argparse
 import os.path
 import re
-import subprocess
 import shutil
 from pygments import highlight
 from pygments.lexers.c_cpp import CppLexer
 from pygments.formatters import HtmlFormatter
-
-p = subprocess.Popen(['c++filt', '-n'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
-p_lock = Lock()
-
-
-def demangle(name):
-    with p_lock:
-        p.stdin.write(name + '\n')
-        return p.stdout.readline().rstrip()
+import cgi
 
 # This allows passing the global context to the child processes.
 class Context:
-    def __init__(self, max_hotness = 0, caller_loc = dict()):
-       self.max_hotness = max_hotness
-
+    def __init__(self, caller_loc = dict()):
        # Map function names to their source location for function where inlining happened
        self.caller_loc = caller_loc
 
-    def should_display_hotness(self):
-        # If max_hotness is 0 at the end, we assume hotness information is
-        # missing and no relative hotness information is displayed
-        return self.max_hotness != 0
-
 context = Context()
 
-class Remark(yaml.YAMLObject):
-    # Work-around for http://pyyaml.org/ticket/154.
-    yaml_loader = Loader
-
-    def __getattr__(self, name):
-        # If hotness is missing, assume 0
-        if name == 'Hotness':
-            return 0
-        raise AttributeError
-
-    @property
-    def File(self):
-        return self.DebugLoc['File']
-
-    @property
-    def Line(self):
-        return int(self.DebugLoc['Line'])
-
-    @property
-    def Column(self):
-        return self.DebugLoc['Column']
-
-    @property
-    def DebugLocString(self):
-        return "{}:{}:{}".format(self.File, self.Line, self.Column)
-
-    @property
-    def DemangledFunctionName(self):
-        return demangle(self.Function)
-
-    @classmethod
-    def make_link(cls, File, Line):
-        return "{}#L{}".format(SourceFileRenderer.html_file_name(File), Line)
-
-    @property
-    def Link(self):
-        return Remark.make_link(self.File, self.Line)
-
-    def getArgString(self, mapping):
-        mapping = mapping.copy()
-        dl = mapping.get('DebugLoc')
-        if dl:
-            del mapping['DebugLoc']
-
-        assert(len(mapping) == 1)
-        (key, value) = mapping.items()[0]
-
-        if key == 'Caller' or key == 'Callee':
-            value = demangle(value)
-
-        if dl and key != 'Caller':
-            return "<a href={}>{}</a>".format(
-                Remark.make_link(dl['File'], dl['Line']), value)
-        else:
-            return value
-
-    @property
-    def message(self):
-        # Args is a list of mappings (dictionaries)
-        values = [self.getArgString(mapping) for mapping in self.Args]
-        return "".join(values)
-
-    @property
-    def RelativeHotness(self):
-        if context.should_display_hotness():
-            return "{}%".format(int(round(self.Hotness * 100 / context.max_hotness)))
-        else:
-            return ''
-
-    @property
-    def key(self):
-        return (self.__class__, self.Pass, self.Name, self.File, self.Line, self.Column, self.Function)
-
-
-class Analysis(Remark):
-    yaml_tag = '!Analysis'
-
-    @property
-    def color(self):
-        return "white"
-
-
-class AnalysisFPCommute(Analysis):
-    yaml_tag = '!AnalysisFPCommute'
-
-
-class AnalysisAliasing(Analysis):
-    yaml_tag = '!AnalysisAliasing'
-
-
-class Passed(Remark):
-    yaml_tag = '!Passed'
-
-    @property
-    def color(self):
-        return "green"
-
-
-class Missed(Remark):
-    yaml_tag = '!Missed'
-
-    @property
-    def color(self):
-        return "red"
-
-
 class SourceFileRenderer:
     def __init__(self, source_dir, output_dir, filename):
         existing_filename = None
@@ -171,7 +39,7 @@ class SourceFileRenderer:
             if os.path.exists(fn):
                 existing_filename = fn
 
-        self.stream = open(os.path.join(output_dir, SourceFileRenderer.html_file_name(filename)), 'w')
+        self.stream = open(os.path.join(output_dir, optrecord.html_file_name(filename)), 'w')
         if existing_filename:
             self.source_stream = open(existing_filename)
         else:
@@ -182,36 +50,47 @@ class SourceFileRenderer:
 </html>
             '''.format(filename), file=self.stream)
 
-        self.html_formatter = HtmlFormatter()
-        self.cpp_lexer = CppLexer()
+        self.html_formatter = HtmlFormatter(encoding='utf-8')
+        self.cpp_lexer = CppLexer(stripnl=False)
 
-    def render_source_line(self, linenum, line):
-        html_line = highlight(line, self.cpp_lexer, self.html_formatter)
-        print('''
+    def render_source_lines(self, stream, line_remarks):
+        file_text = stream.read()
+        html_highlighted = highlight(file_text, self.cpp_lexer, self.html_formatter)
+
+        # Take off the header and footer, these must be
+        #   reapplied line-wise, within the page structure
+        html_highlighted = html_highlighted.replace('<div class="highlight"><pre>', '')
+        html_highlighted = html_highlighted.replace('</pre></div>', '')
+
+        for (linenum, html_line) in enumerate(html_highlighted.split('\n'), start=1):
+            print('''
 <tr>
 <td><a name=\"L{linenum}\">{linenum}</a></td>
 <td></td>
 <td></td>
-<td>{html_line}</td>
+<td><div class="highlight"><pre>{html_line}</pre></div></td>
 </tr>'''.format(**locals()), file=self.stream)
 
+            for remark in line_remarks.get(linenum, []):
+                self.render_inline_remarks(remark, html_line)
+
     def render_inline_remarks(self, r, line):
         inlining_context = r.DemangledFunctionName
-        print
         dl = context.caller_loc.get(r.Function)
         if dl:
-            link = Remark.make_link(dl['File'], dl['Line'] - 2)
+            link = optrecord.make_link(dl['File'], dl['Line'] - 2)
             inlining_context = "<a href={link}>{r.DemangledFunctionName}</a>".format(**locals())
 
         # Column is the number of characters *including* tabs, keep those and
         # replace everything else with spaces.
-        indent = line[:r.Column - 1]
+        indent = line[:max(r.Column, 1) - 1]
         indent = re.sub('\S', ' ', indent)
+
         print('''
 <tr>
 <td></td>
 <td>{r.RelativeHotness}</td>
-<td class=\"column-entry-{r.color}\">{r.Pass}</td>
+<td class=\"column-entry-{r.color}\">{r.PassWithDiffPrefix}</td>
 <td><pre style="display:inline">{indent}</pre><span class=\"column-entry-yellow\"> {r.message}&nbsp;</span></td>
 <td class=\"column-entry-yellow\">{inlining_context}</td>
 </tr>'''.format(**locals()), file=self.stream)
@@ -235,31 +114,26 @@ class SourceFileRenderer:
 <td>Source</td>
 <td>Inline Context</td>
 </tr>''', file=self.stream)
-        for (linenum, line) in enumerate(self.source_stream.readlines(), start=1):
-            self.render_source_line(linenum, line)
-            for remark in line_remarks.get(linenum, []):
-                self.render_inline_remarks(remark, line)
+        self.render_source_lines(self.source_stream, line_remarks)
+
         print('''
 </table>
 </body>
 </html>''', file=self.stream)
 
-    @classmethod
-    def html_file_name(cls, filename):
-        return filename.replace('/', '_') + ".html"
-
 
 class IndexRenderer:
     def __init__(self, output_dir):
         self.stream = open(os.path.join(output_dir, 'index.html'), 'w')
 
-    def render_entry(self, r):
+    def render_entry(self, r, odd):
+        escaped_name = cgi.escape(r.DemangledFunctionName)
         print('''
 <tr>
-<td><a href={r.Link}>{r.DebugLocString}</a></td>
-<td>{r.RelativeHotness}</td>
-<td>{r.DemangledFunctionName}</td>
-<td class=\"column-entry-{r.color}\">{r.Pass}</td>
+<td class=\"column-entry-{odd}\"><a href={r.Link}>{r.DebugLocString}</a></td>
+<td class=\"column-entry-{odd}\">{r.RelativeHotness}</td>
+<td class=\"column-entry-{odd}\">{escaped_name}</td>
+<td class=\"column-entry-{r.color}\">{r.PassWithDiffPrefix}</td>
 </tr>'''.format(**locals()), file=self.stream)
 
     def render(self, all_remarks):
@@ -277,35 +151,14 @@ class IndexRenderer:
 <td>Function</td>
 <td>Pass</td>
 </tr>''', file=self.stream)
-        for remark in all_remarks:
-            self.render_entry(remark)
+        for i, remark in enumerate(all_remarks):
+            self.render_entry(remark, i % 2)
         print('''
 </table>
 </body>
 </html>''', file=self.stream)
 
 
-def get_remarks(input_file):
-    max_hotness = 0
-    all_remarks = dict()
-    file_remarks = defaultdict(functools.partial(defaultdict, list))
-
-    with open(input_file) as f:
-        docs = yaml.load_all(f, Loader=Loader)
-
-        for remark in docs:
-            # Avoid remarks withoug debug location or if they are duplicated
-            if not hasattr(remark, 'DebugLoc') or remark.key in all_remarks:
-                continue
-            all_remarks[remark.key] = remark
-
-            file_remarks[remark.File][remark.Line].append(remark)
-
-            max_hotness = max(max_hotness, remark.Hotness)
-
-    return max_hotness, all_remarks, file_remarks
-
-
 def _render_file(source_dir, output_dir, ctx, entry):
     global context
     context = ctx
@@ -313,39 +166,18 @@ def _render_file(source_dir, output_dir, ctx, entry):
     SourceFileRenderer(source_dir, output_dir, filename).render(remarks)
 
 
-def gather_results(pool, filenames):
-    remarks = pool.map(get_remarks, filenames)
-
-    def merge_file_remarks(file_remarks_job, all_remarks, merged):
-        for filename, d in file_remarks_job.iteritems():
-            for line, remarks in d.iteritems():
-                for remark in remarks:
-                    if remark.key not in all_remarks:
-                        merged[filename][line].append(remark)
-
-    all_remarks = dict()
-    file_remarks = defaultdict(functools.partial(defaultdict, list))
-    for _, all_remarks_job, file_remarks_job in remarks:
-        merge_file_remarks(file_remarks_job, all_remarks, file_remarks)
-        all_remarks.update(all_remarks_job)
-
-    context.max_hotness = max(entry[0] for entry in remarks)
-
-    return all_remarks, file_remarks
-
-
 def map_remarks(all_remarks):
     # Set up a map between function names and their source location for
     # function where inlining happened
     for remark in all_remarks.itervalues():
-        if isinstance(remark, Passed) and remark.Pass == "inline" and remark.Name == "Inlined":
+        if isinstance(remark, optrecord.Passed) and remark.Pass == "inline" and remark.Name == "Inlined":
             for arg in remark.Args:
                 caller = arg.get('Caller')
                 if caller:
                     context.caller_loc[caller] = arg['DebugLoc']
 
 
-def generate_report(pool, all_remarks, file_remarks, source_dir, output_dir):
+def generate_report(pmap, all_remarks, file_remarks, source_dir, output_dir, should_display_hotness):
     try:
         os.makedirs(output_dir)
     except OSError as e:
@@ -355,10 +187,10 @@ def generate_report(pool, all_remarks, file_remarks, source_dir, output_dir):
             raise
 
     _render_file_bound = functools.partial(_render_file, source_dir, output_dir, context)
-    pool.map(_render_file_bound, file_remarks.items())
+    pmap(_render_file_bound, file_remarks.items())
 
-    if context.should_display_hotness():
-        sorted_remarks = sorted(all_remarks.itervalues(), key=lambda r: (r.Hotness, r.__dict__), reverse=True)
+    if should_display_hotness:
+        sorted_remarks = sorted(all_remarks.itervalues(), key=lambda r: (r.Hotness, r.File, r.Line, r.Column, r.__dict__), reverse=True)
     else:
         sorted_remarks = sorted(all_remarks.itervalues(), key=lambda r: (r.File, r.Line, r.Column, r.__dict__))
     IndexRenderer(args.output_dir).render(sorted_remarks)
@@ -388,9 +220,14 @@ if __name__ == '__main__':
         parser.print_help()
         sys.exit(1)
 
-    pool = Pool(processes=args.jobs)
-    all_remarks, file_remarks = gather_results(pool, args.yaml_files)
+    if args.jobs == 1:
+        pmap = map
+    else:
+        pool = Pool(processes=args.jobs)
+        pmap = pool.map
+
+    all_remarks, file_remarks, should_display_hotness = optrecord.gather_results(pmap, args.yaml_files)
 
     map_remarks(all_remarks)
 
-    generate_report(pool, all_remarks, file_remarks, args.source_dir, args.output_dir)
+    generate_report(pmap, all_remarks, file_remarks, args.source_dir, args.output_dir, should_display_hotness)
diff --git a/utils/opt-viewer/optrecord.py b/utils/opt-viewer/optrecord.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dc77e9db01993f47dc102000bbfa565bcf58cae
--- /dev/null
+++ b/utils/opt-viewer/optrecord.py
@@ -0,0 +1,214 @@
+#!/usr/bin/env python2.7
+
+from __future__ import print_function
+
+import yaml
+# Try to use the C parser.
+try:
+    from yaml import CLoader as Loader
+except ImportError:
+    print("For faster parsing, you may want to install libYAML for PyYAML")
+    from yaml import Loader
+
+import functools
+from collections import defaultdict
+import itertools
+from multiprocessing import Pool
+from multiprocessing import Lock, cpu_count
+import cgi
+import subprocess
+
+import traceback
+
+p = subprocess.Popen(['c++filt', '-n'], stdin=subprocess.PIPE, stdout=subprocess.PIPE)
+p_lock = Lock()
+
+
+def demangle(name):
+    with p_lock:
+        p.stdin.write(name + '\n')
+        return p.stdout.readline().rstrip()
+
+def html_file_name(filename):
+    return filename.replace('/', '_') + ".html"
+
+def make_link(File, Line):
+    return "{}#L{}".format(html_file_name(File), Line)
+
+
+class Remark(yaml.YAMLObject):
+    # Work-around for http://pyyaml.org/ticket/154.
+    yaml_loader = Loader
+
+    def initmissing(self):
+        if not hasattr(self, 'Hotness'):
+            self.Hotness = 0
+        if not hasattr(self, 'Args'):
+            self.Args = []
+
+    @property
+    def File(self):
+        return self.DebugLoc['File']
+
+    @property
+    def Line(self):
+        return int(self.DebugLoc['Line'])
+
+    @property
+    def Column(self):
+        return self.DebugLoc['Column']
+
+    @property
+    def DebugLocString(self):
+        return "{}:{}:{}".format(self.File, self.Line, self.Column)
+
+    @property
+    def DemangledFunctionName(self):
+        return demangle(self.Function)
+
+    @property
+    def Link(self):
+        return make_link(self.File, self.Line)
+
+    def getArgString(self, mapping):
+        mapping = mapping.copy()
+        dl = mapping.get('DebugLoc')
+        if dl:
+            del mapping['DebugLoc']
+
+        assert(len(mapping) == 1)
+        (key, value) = mapping.items()[0]
+
+        if key == 'Caller' or key == 'Callee':
+            value = cgi.escape(demangle(value))
+
+        if dl and key != 'Caller':
+            return "<a href={}>{}</a>".format(
+                make_link(dl['File'], dl['Line']), value)
+        else:
+            return value
+
+    def getDiffPrefix(self):
+        if hasattr(self, 'Added'):
+            if self.Added:
+                return '+'
+            else:
+                return '-'
+        return ''
+
+    @property
+    def PassWithDiffPrefix(self):
+        return self.getDiffPrefix() + self.Pass
+
+    @property
+    def message(self):
+        # Args is a list of mappings (dictionaries)
+        values = [self.getArgString(mapping) for mapping in self.Args]
+        return "".join(values)
+
+    @property
+    def RelativeHotness(self):
+        if self.max_hotness:
+            return "{}%".format(int(round(self.Hotness * 100 / self.max_hotness)))
+        else:
+            return ''
+
+    @property
+    def key(self):
+        k = (self.__class__, self.PassWithDiffPrefix, self.Name, self.File, self.Line, self.Column, self.Function)
+        for arg in self.Args:
+            for (key, value) in arg.iteritems():
+                if type(value) is dict:
+                    value = tuple(value.items())
+                k += (key, value)
+        return k
+
+    def __hash__(self):
+        return hash(self.key)
+
+    def __eq__(self, other):
+        return self.key == other.key
+
+    def __repr__(self):
+        return str(self.key)
+
+
+class Analysis(Remark):
+    yaml_tag = '!Analysis'
+
+    @property
+    def color(self):
+        return "white"
+
+
+class AnalysisFPCommute(Analysis):
+    yaml_tag = '!AnalysisFPCommute'
+
+
+class AnalysisAliasing(Analysis):
+    yaml_tag = '!AnalysisAliasing'
+
+
+class Passed(Remark):
+    yaml_tag = '!Passed'
+
+    @property
+    def color(self):
+        return "green"
+
+
+class Missed(Remark):
+    yaml_tag = '!Missed'
+
+    @property
+    def color(self):
+        return "red"
+
+
+def get_remarks(input_file):
+    max_hotness = 0
+    all_remarks = dict()
+    file_remarks = defaultdict(functools.partial(defaultdict, list))
+
+    with open(input_file) as f:
+        docs = yaml.load_all(f, Loader=Loader)
+        for remark in docs:
+            remark.initmissing()
+            # Avoid remarks without debug location or if they are duplicated
+            if not hasattr(remark, 'DebugLoc') or remark.key in all_remarks:
+                continue
+            all_remarks[remark.key] = remark
+
+            file_remarks[remark.File][remark.Line].append(remark)
+
+            # If we're reading back a diff yaml file, max_hotness is already
+            # captured which may actually be less than the max hotness found
+            # in the file.
+            if hasattr(remark, 'max_hotness'):
+                max_hotness = remark.max_hotness
+            max_hotness = max(max_hotness, remark.Hotness)
+
+    return max_hotness, all_remarks, file_remarks
+
+
+def gather_results(pmap, filenames):
+    remarks = pmap(get_remarks, filenames)
+    max_hotness = max(entry[0] for entry in remarks)
+
+    def merge_file_remarks(file_remarks_job, all_remarks, merged):
+        for filename, d in file_remarks_job.iteritems():
+            for line, remarks in d.iteritems():
+                for remark in remarks:
+                    # Bring max_hotness into the remarks so that
+                    # RelativeHotness does not depend on an external global.
+                    remark.max_hotness = max_hotness
+                    if remark.key not in all_remarks:
+                        merged[filename][line].append(remark)
+
+    all_remarks = dict()
+    file_remarks = defaultdict(functools.partial(defaultdict, list))
+    for _, all_remarks_job, file_remarks_job in remarks:
+        merge_file_remarks(file_remarks_job, all_remarks, file_remarks)
+        all_remarks.update(all_remarks_job)
+
+    return all_remarks, file_remarks, max_hotness != 0
diff --git a/utils/opt-viewer/style.css b/utils/opt-viewer/style.css
index 2ef244a157188cf9583ed4f26fdeb0a26492be5f..595c3e46847dd8b85d978a968913fff4e749fea4 100644
--- a/utils/opt-viewer/style.css
+++ b/utils/opt-viewer/style.css
@@ -62,6 +62,12 @@ table {
   text-align: left;
   background-color: #ffe1a6;
 }
+.column-entry-0 {
+  background-color: #ffffff;
+}
+.column-entry-1 {
+  background-color: #eeeeee;
+}
 .line-number {
   text-align: right;
   color: #aaa;
diff --git a/utils/prepare-code-coverage-artifact.py b/utils/prepare-code-coverage-artifact.py
index 726375e899cd741b6e4ce6e131b5802010d689b1..883cdd78049bec04f604a0f44b0f94190d13bd35 100644
--- a/utils/prepare-code-coverage-artifact.py
+++ b/utils/prepare-code-coverage-artifact.py
@@ -6,6 +6,9 @@ from __future__ import print_function
 
 - Collate raw profiles into one indexed profile.
 - Generate html reports for the given binaries.
+
+Caution: The positional arguments to this script must be specified before any 
+optional arguments, such as --restrict.
 '''
 
 import argparse
@@ -84,7 +87,8 @@ if __name__ == '__main__':
                        help='Emit a unified report for all binaries')
     parser.add_argument('--restrict', metavar='R', type=str, nargs='*',
                        default=[],
-                       help='Restrict the reporting to the given source paths')
+                       help='Restrict the reporting to the given source paths'
+                   ' (must be specified after all other positional arguments)')
     args = parser.parse_args()
 
     if args.use_existing_profdata and args.only_merge:
diff --git a/utils/release/build_llvm_package.bat b/utils/release/build_llvm_package.bat
index da968c84a8ba970da66667ff1564696bac05aec0..5e3f2ae6e5479e2986c15a279d2df65f9fe44427 100755
--- a/utils/release/build_llvm_package.bat
+++ b/utils/release/build_llvm_package.bat
@@ -70,7 +70,7 @@ mkdir build32
 cd build32
 set CC=..\build32_stage0\bin\clang-cl
 set CXX=..\build32_stage0\bin\clang-cl
-cmake -GNinja %cmake_flags% -DBUILD_CLANG_FORMAT_VS_PLUGIN=ON -DPYTHON_HOME=%python32_dir% ..\llvm || exit /b
+cmake -GNinja %cmake_flags% -DPYTHON_HOME=%python32_dir% ..\llvm || exit /b
 ninja all || exit /b
 ninja check || ninja check || ninja check || exit /b
 ninja check-clang || ninja check-clang || ninja check-clang ||  exit /b
@@ -78,6 +78,16 @@ copy ..\llvm\tools\clang\tools\clang-format-vs\ClangFormat\bin\Release\ClangForm
 ninja package || exit /b
 cd ..
 
+REM The plug-in is built separately as it uses a statically linked clang-cl.exe.
+mkdir build_vsix
+cd build_vsix
+set CC=..\build32_stage0\bin\clang-cl
+set CXX=..\build32_stage0\bin\clang-cl
+cmake -GNinja %cmake_flags% -DLLVM_USE_CRT_RELEASE=MT -DBUILD_CLANG_FORMAT_VS_PLUGIN=ON -DPYTHON_HOME=%python32_dir% ..\llvm || exit /b
+ninja clang_format_vsix || exit /b
+copy ..\llvm\tools\clang\tools\clang-format-vs\ClangFormat\bin\Release\ClangFormat.vsix ClangFormat-r%revision%.vsix
+cd ..
+
 
 call "%vcdir%/vcvarsall.bat" amd64
 set CC=
diff --git a/utils/release/merge-request.sh b/utils/release/merge-request.sh
new file mode 100755
index 0000000000000000000000000000000000000000..3345d2ad85c53caefef7bf9fff6b11bfdcecbe66
--- /dev/null
+++ b/utils/release/merge-request.sh
@@ -0,0 +1,198 @@
+#!/bin/bash
+#===-- merge-request.sh  ---------------------------------------------------===#
+#
+#                     The LLVM Compiler Infrastructure
+#
+# This file is distributed under the University of Illinois Open Source
+# License.
+#
+#===------------------------------------------------------------------------===#
+#
+# Submit a merge request to bugzilla.
+#
+#===------------------------------------------------------------------------===#
+
+dryrun=""
+stable_version=""
+revision=""
+BUGZILLA_BIN=""
+BUGZILLA_CMD=""
+release_metabug=""
+bugzilla_product="new-bugs"
+bugzilla_component="new bugs"
+bugzilla_assigned_to=""
+bugzilla_user=""
+bugzilla_version=""
+bugzilla_url="http://bugs.llvm.org/xmlrpc.cgi"
+
+function usage() {
+  echo "usage: `basename $0` -user EMAIL -stable-version X.Y -r NUM"
+  echo ""
+  echo " -user EMAIL             Your email address for logging into bugzilla."
+  echo " -stable-version X.Y     The stable release version (e.g. 4.0, 5.0)."
+  echo " -r NUM                  Revision number to merge (e.g. 1234567)."
+  echo " -bugzilla-bin PATH      Path to bugzilla binary (optional)."
+  echo " -assign-to EMAIL        Assign bug to user with EMAIL (optional)."
+  echo " -dry-run                Print commands instead of executing them."
+}
+
+while [ $# -gt 0 ]; do
+  case $1 in
+    -user)
+      shift
+      bugzilla_user="$1"
+      ;;
+    -stable-version)
+      shift
+      stable_version="$1"
+      ;;
+    -r)
+      shift
+      revision="$1"
+      ;;
+    -project)
+      shift
+      project="$1"
+      ;;
+    -component)
+      shift
+      bugzilla_component="$1"
+      ;;
+    -bugzilla-bin)
+      shift
+      BUGZILLA_BIN="$1"
+      ;;
+    -assign-to)
+      shift
+      bugzilla_assigned_to="--assigned_to=$1"
+      ;;
+    -dry-run)
+      dryrun="echo"
+      ;;
+    -help | --help | -h | --h | -\? )
+      usage
+      exit 0
+      ;;
+    * )
+      echo "unknown option: $1"
+      usage
+      exit 1
+      ;;
+  esac
+  shift
+done
+
+if [ -z "$stable_version" ]; then
+  echo "error: no stable version specified"
+  exit 1
+fi
+
+case $stable_version in
+  4.0)
+    release_metabug="32061"
+    ;;
+  *)
+    echo "error: invalid stable version"
+    exit 1
+esac
+bugzilla_version=$stable_version
+
+if [ -z "$revision" ]; then
+  echo "error: revision not specified"
+  exit 1
+fi
+
+if [ -z "$bugzilla_user" ]; then
+  echo "error: bugzilla username not specified."
+  exit 1
+fi
+
+if [ -z "$BUGZILLA_BIN" ]; then
+  BUGZILLA_BIN=`which bugzilla`
+  if [ $? -ne 0 ]; then
+    echo "error: could not find bugzilla executable."
+    echo "Make sure the bugzilla cli tool is installed on your system: "
+    echo "pip install python-bugzilla (recommended)"
+    echo ""
+    echo "Fedora: dnf install python-bugzilla"
+    echo "Ubuntu/Debian: apt-get install bugzilla-cli"
+    exit 1
+  fi
+fi
+
+BUGZILLA_MAJOR_VERSION=`$BUGZILLA_BIN --version 2>&1 | cut -d . -f 1`
+
+if [ $BUGZILLA_MAJOR_VERSION -eq 1 ]; then
+
+  echo "***************************** Warning *******************************"
+  echo "You are using an older version of the bugzilla cli tool.  You will be "
+  echo "able to create bugs, but this script will crash with the following "
+  echo "error when trying to read back information about the bug you created:"
+  echo ""
+  echo "KeyError: 'internals'"
+  echo ""
+  echo "To avoid this error, use version 2.0.0 or higher"
+  echo "https://pypi.python.org/pypi/python-bugzilla"
+  echo "*********************************************************************"
+fi
+
+BUGZILLA_CMD="$BUGZILLA_BIN --bugzilla=$bugzilla_url"
+
+bug_url="https://reviews.llvm.org/rL$revision"
+
+echo "Checking for duplicate bugs..."
+
+check_duplicates=`$BUGZILLA_CMD query --url $bug_url`
+
+if [ -n "$check_duplicates" ]; then
+  echo "Duplicate bug found:"
+  echo $check_duplicates
+  exit 1
+fi
+
+echo "Done"
+
+# Get short commit summary
+commit_summary=''
+commit_msg=`svn log -r $revision https://llvm.org/svn/llvm-project/`
+if [ $? -ne 0 ]; then
+  echo "warning: failed to get commit message."
+  commit_msg=""
+fi
+
+if [ -n "$commit_msg" ]; then
+  commit_summary=`echo "$commit_msg" | sed '4q;d' | cut -c1-80`
+  commit_summary=" : ${commit_summary}"
+fi
+
+bug_summary="Merge r$revision into the $stable_version branch${commit_summary}"
+
+if [ -z "$dryrun" ]; then
+  set -x
+fi
+
+${dryrun} $BUGZILLA_CMD --login --user=$bugzilla_user new \
+  -p "$bugzilla_product" \
+  -c "$bugzilla_component" -u $bug_url --blocked=$release_metabug \
+  -o All --priority=P --arch All -v $bugzilla_version \
+  --summary "${bug_summary}" \
+  -l "Is this patch OK to merge to the $stable_version branch?" \
+  $bugzilla_assigned_to \
+  --oneline
+
+set +x
+
+if [ -n "$dryrun" ]; then
+  exit 0
+fi
+
+if [ $BUGZILLA_MAJOR_VERSION -eq 1 ]; then
+  success=`$BUGZILLA_CMD query --url $bug_url`
+  if [ -z "$success" ]; then
+    echo "Failed to create bug."
+    exit 1
+  fi
+
+  echo " Created new bug:"
+  echo $success
+fi
diff --git a/utils/release/test-release.sh b/utils/release/test-release.sh
index 642bb670e7e2164b7f97e292f73e4fcce00e86a5..b0c771579802fefccab709e7c0b1cc6b5f80d734 100755
--- a/utils/release/test-release.sh
+++ b/utils/release/test-release.sh
@@ -36,6 +36,7 @@ do_libs="yes"
 do_libunwind="yes"
 do_test_suite="yes"
 do_openmp="yes"
+do_lld="yes"
 do_lldb="no"
 do_polly="no"
 BuildDir="`pwd`"
@@ -64,6 +65,7 @@ function usage() {
     echo " -no-libunwind        Disable check-out & build libunwind"
     echo " -no-test-suite       Disable check-out & build test-suite"
     echo " -no-openmp           Disable check-out & build libomp"
+    echo " -no-lld              Disable check-out & build lld"
     echo " -lldb                Enable check-out & build lldb"
     echo " -no-lldb             Disable check-out & build lldb (default)"
     echo " -polly               Enable check-out & build Polly"
@@ -143,6 +145,9 @@ while [ $# -gt 0 ]; do
         -no-openmp )
             do_openmp="no"
             ;;
+        -no-lld )
+            do_lld="no"
+            ;;
         -lldb )
             do_lldb="yes"
             ;;
@@ -225,6 +230,9 @@ esac
 if [ $do_openmp = "yes" ]; then
   projects="$projects openmp"
 fi
+if [ $do_lld = "yes" ]; then
+  projects="$projects lld"
+fi
 if [ $do_lldb = "yes" ]; then
   projects="$projects lldb"
 fi
@@ -297,7 +305,7 @@ function export_sources() {
         cfe)
             projsrc=llvm.src/tools/clang
             ;;
-        lldb|polly)
+        lld|lldb|polly)
             projsrc=llvm.src/tools/$proj
             ;;
         clang-tools-extra)
diff --git a/utils/unittest/CMakeLists.txt b/utils/unittest/CMakeLists.txt
index a50733af9aae19064ec1d0794ba61d025cb1a630..b42ac834e3a77d5f661e92d655bb71ab4256de08 100644
--- a/utils/unittest/CMakeLists.txt
+++ b/utils/unittest/CMakeLists.txt
@@ -40,8 +40,8 @@ if (NOT LLVM_ENABLE_THREADS)
   add_definitions( -DGTEST_HAS_PTHREAD=0 )
 endif()
 
-find_library(PTHREAD_LIBRARY_PATH pthread)
-if (PTHREAD_LIBRARY_PATH)
+find_library(LLVM_PTHREAD_LIBRARY_PATH pthread)
+if (LLVM_PTHREAD_LIBRARY_PATH)
   list(APPEND LIBS pthread)
 endif()
 
diff --git a/utils/update_llc_test_checks.py b/utils/update_llc_test_checks.py
index 6d489a4f3b365950822e93f237f33acf54f74567..3b3ff74d8633de2f799ee61e1c94ec8b6d6aec00 100755
--- a/utils/update_llc_test_checks.py
+++ b/utils/update_llc_test_checks.py
@@ -29,6 +29,8 @@ def llc(args, cmd_args, ir):
 SCRUB_WHITESPACE_RE = re.compile(r'(?!^(|  \w))[ \t]+', flags=re.M)
 SCRUB_TRAILING_WHITESPACE_RE = re.compile(r'[ \t]+$', flags=re.M)
 SCRUB_KILL_COMMENT_RE = re.compile(r'^ *#+ +kill:.*\n')
+SCRUB_LOOP_COMMENT_RE = re.compile(
+    r'# =>This Inner Loop Header:.*|# in Loop:.*', flags=re.M)
 
 ASM_FUNCTION_X86_RE = re.compile(
     r'^_?(?P<func>[^:]+):[ \t]*#+[ \t]*@(?P=func)\n[^:]*?'
@@ -68,6 +70,13 @@ ASM_FUNCTION_PPC_RE = re.compile(
     r'.Lfunc_end[0-9]+:\n',
     flags=(re.M | re.S))
 
+ASM_FUNCTION_SYSTEMZ_RE = re.compile(
+    r'^_?(?P<func>[^:]+):[ \t]*#+[ \t]*@(?P=func)\n'
+    r'[ \t]+.cfi_startproc\n'
+    r'(?P<body>.*?)\n'
+    r'.Lfunc_end[0-9]+:\n',
+    flags=(re.M | re.S))
+
 
 def scrub_asm_x86(asm):
   # Scrub runs of whitespace out of the assembly, but leave the leading
@@ -102,6 +111,18 @@ def scrub_asm_arm_eabi(asm):
   return asm
 
 def scrub_asm_powerpc64le(asm):
+  # Scrub runs of whitespace out of the assembly, but leave the leading
+  # whitespace in place.
+  asm = SCRUB_WHITESPACE_RE.sub(r' ', asm)
+  # Expand the tabs used for indentation.
+  asm = string.expandtabs(asm, 2)
+  # Strip unimportant comments
+  asm = SCRUB_LOOP_COMMENT_RE.sub(r'', asm)
+  # Strip trailing whitespace.
+  asm = SCRUB_TRAILING_WHITESPACE_RE.sub(r'', asm)
+  return asm
+
+def scrub_asm_systemz(asm):
   # Scrub runs of whitespace out of the assembly, but leave the leading
   # whitespace in place.
   asm = SCRUB_WHITESPACE_RE.sub(r' ', asm)
@@ -121,7 +142,11 @@ def build_function_body_dictionary(raw_tool_output, triple, prefixes, func_dict,
       'x86': (scrub_asm_x86, ASM_FUNCTION_X86_RE),
       'i386': (scrub_asm_x86, ASM_FUNCTION_X86_RE),
       'arm-eabi': (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_RE),
+      'thumb-eabi': (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_RE),
+      'thumbv8-eabi': (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_RE),
+      'armeb-eabi': (scrub_asm_arm_eabi, ASM_FUNCTION_ARM_RE),
       'powerpc64le': (scrub_asm_powerpc64le, ASM_FUNCTION_PPC_RE),
+      's390x': (scrub_asm_systemz, ASM_FUNCTION_SYSTEMZ_RE),
   }
   handlers = None
   for prefix, s in target_handlers.items():
diff --git a/utils/vim/syntax/llvm.vim b/utils/vim/syntax/llvm.vim
index 3a2b8285350cbdd5727bbc466476a714d22c0af7..ee768206aae6f9e873a6a6b348cdd293d94fb157 100644
--- a/utils/vim/syntax/llvm.vim
+++ b/utils/vim/syntax/llvm.vim
@@ -123,6 +123,7 @@ syn keyword llvmKeyword
       \ readnone
       \ readonly
       \ release
+      \ returned
       \ returns_twice
       \ sanitize_address
       \ sanitize_memory
diff --git a/utils/vim/vimrc b/utils/vim/vimrc
index fd87d767d6f414adde0356b4dd77611af894b7ce..2cc8ae9cfa2947b411b8191f41220a8cc4c02228 100644
--- a/utils/vim/vimrc
+++ b/utils/vim/vimrc
@@ -74,13 +74,13 @@ command! DeleteTrailingWs :%s/\s\+$//
 command! Untab :%s/\t/  /g
 
 " Enable syntax highlighting for LLVM files. To use, copy
-" utils/vim/llvm.vim to ~/.vim/syntax .
+" utils/vim/syntax/llvm.vim to ~/.vim/syntax .
 augroup filetype
   au! BufRead,BufNewFile *.ll     set filetype=llvm
 augroup END
 
 " Enable syntax highlighting for tablegen files. To use, copy
-" utils/vim/tablegen.vim to ~/.vim/syntax .
+" utils/vim/syntax/tablegen.vim to ~/.vim/syntax .
 augroup filetype
   au! BufRead,BufNewFile *.td     set filetype=tablegen
 augroup END